diff --git a/.gitattributes b/.gitattributes index 1815c810bcab07646132a6dc7dd1e391dd96727a..267d0e5d84cba76ed182c682bec3156e7d39352b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -86,3 +86,18 @@ MLPY/Lib/site-packages/PIL/_imaging.cp39-win_amd64.pyd filter=lfs diff=lfs merge MLPY/Lib/site-packages/PIL/_imagingft.cp39-win_amd64.pyd filter=lfs diff=lfs merge=lfs -text MLPY/Lib/site-packages/pythonwin/mfc140u.dll filter=lfs diff=lfs merge=lfs -text MLPY/Lib/site-packages/pythonwin/win32ui.pyd filter=lfs diff=lfs merge=lfs -text +MLPY/Lib/site-packages/torch/bin/fbgemm.dll filter=lfs diff=lfs merge=lfs -text +MLPY/Lib/site-packages/torch/bin/protoc.exe filter=lfs diff=lfs merge=lfs -text +MLPY/Lib/site-packages/torch/lib/dnnl.lib filter=lfs diff=lfs merge=lfs -text +MLPY/Lib/site-packages/torch/lib/fbgemm.dll filter=lfs diff=lfs merge=lfs -text +MLPY/Lib/site-packages/torch/lib/fbgemm.lib filter=lfs diff=lfs merge=lfs -text +MLPY/Lib/site-packages/torch/lib/fmt.lib filter=lfs diff=lfs merge=lfs -text +MLPY/Lib/site-packages/torch/lib/kineto.lib filter=lfs diff=lfs merge=lfs -text +MLPY/Lib/site-packages/torch/lib/libiomp5md.dll filter=lfs diff=lfs merge=lfs -text +MLPY/Lib/site-packages/torch/lib/libprotobuf-lite.lib filter=lfs diff=lfs merge=lfs -text +MLPY/Lib/site-packages/torch/lib/libprotobuf.lib filter=lfs diff=lfs merge=lfs -text +MLPY/Lib/site-packages/torch/lib/libprotoc.lib filter=lfs diff=lfs merge=lfs -text +MLPY/Lib/site-packages/torch/lib/torch_cpu.dll filter=lfs diff=lfs merge=lfs -text +MLPY/Lib/site-packages/torch/lib/torch_cpu.lib filter=lfs diff=lfs merge=lfs -text +MLPY/Lib/site-packages/torch/lib/torch_python.dll filter=lfs diff=lfs merge=lfs -text +MLPY/Lib/site-packages/torch/lib/XNNPACK.lib filter=lfs diff=lfs merge=lfs -text diff --git a/MLPY/Lib/site-packages/torch/_C.cp39-win_amd64.pyd b/MLPY/Lib/site-packages/torch/_C.cp39-win_amd64.pyd new file mode 100644 index 0000000000000000000000000000000000000000..46a4a4ab50e1e50d15f4b62676d0be4e47217ebd Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_C.cp39-win_amd64.pyd differ diff --git a/MLPY/Lib/site-packages/torch/_C/_VariableFunctions.pyi b/MLPY/Lib/site-packages/torch/_C/_VariableFunctions.pyi new file mode 100644 index 0000000000000000000000000000000000000000..e7bc45da38b2228706f8e353adb5af335d22eae3 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_C/_VariableFunctions.pyi @@ -0,0 +1,25648 @@ +# @generated from torch/_C/_VariableFunctions.pyi.in +# mypy: disable-error-code="type-arg" + +import builtins +from typing import ( + Any, + Callable, + ContextManager, + Iterator, + List, + Literal, + NamedTuple, + Optional, + overload, + Sequence, + Tuple, + TypeVar, + Union, +) + +import torch +from torch import contiguous_format, Generator, inf, memory_format, strided, SymInt, Tensor +from torch.types import ( + _bool, + _complex, + _device, + _dtype, + _float, + _int, + _layout, + _qscheme, + _size, + Device, + Number, +) + +from torch._prims_common import DeviceLikeType + +@overload +def __and__(input: Tensor, other: Tensor) -> Tensor: ... +@overload +def __and__(input: Tensor, other: Union[Number, _complex]) -> Tensor: ... +@overload +def __lshift__(input: Tensor, other: Tensor) -> Tensor: ... +@overload +def __lshift__(input: Tensor, other: Union[Number, _complex]) -> Tensor: ... +@overload +def __or__(input: Tensor, other: Tensor) -> Tensor: ... +@overload +def __or__(input: Tensor, other: Union[Number, _complex]) -> Tensor: ... +@overload +def __rshift__(input: Tensor, other: Tensor) -> Tensor: ... 
+@overload +def __rshift__(input: Tensor, other: Union[Number, _complex]) -> Tensor: ... +@overload +def __xor__(input: Tensor, other: Tensor) -> Tensor: ... +@overload +def __xor__(input: Tensor, other: Union[Number, _complex]) -> Tensor: ... +def _adaptive_avg_pool2d(input: Tensor, output_size: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]]) -> Tensor: ... +def _adaptive_avg_pool3d(input: Tensor, output_size: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]]) -> Tensor: ... +def _add_batch_dim(input: Tensor, batch_dim: _int, level: _int) -> Tensor: ... +@overload +def _add_relu(input: Tensor, other: Tensor, *, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: ... +@overload +def _add_relu(input: Tensor, other: Union[Number, _complex], alpha: Union[Number, _complex] = 1) -> Tensor: ... +@overload +def _add_relu_(input: Tensor, other: Tensor, *, alpha: Union[Number, _complex] = 1) -> Tensor: ... +@overload +def _add_relu_(input: Tensor, other: Union[Number, _complex], alpha: Union[Number, _complex] = 1) -> Tensor: ... +def _addmm_activation(input: Tensor, mat1: Tensor, mat2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1, use_gelu: _bool = False, out: Optional[Tensor] = None) -> Tensor: ... +@overload +def _aminmax(input: Tensor) -> Tuple[Tensor, Tensor]: ... +@overload +def _aminmax(input: Tensor, dim: _int, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: ... +def _amp_foreach_non_finite_check_and_unscale_(self: Union[Tuple[Tensor, ...], List[Tensor]], found_inf: Tensor, inv_scale: Tensor) -> None: ... +def _amp_update_scale_(input: Tensor, growth_tracker: Tensor, found_inf: Tensor, scale_growth_factor: _float, scale_backoff_factor: _float, growth_interval: _int) -> Tensor: ... +@overload +def _assert_async(input: Tensor) -> None: + r""" + _assert_async(tensor) -> void + + Asynchronously assert that the contents of tensor are nonzero. For CPU tensors, + this is equivalent to ``assert tensor`` or ``assert tensor.is_nonzero()``; for + CUDA tensors, we DO NOT synchronize and you may only find out the assertion + failed at a later CUDA kernel launch. Asynchronous assertion can be helpful for + testing invariants in CUDA tensors without giving up performance. This function + is NOT intended to be used for regular error checking, as it will trash your CUDA + context if the assert fails (forcing you to restart your PyTorch process.) + + Args: + tensor (Tensor): a one element tensor to test to see if it is nonzero. Zero + elements (including False for boolean tensors) cause an assertion failure + to be raised. + """ + ... +@overload +def _assert_async(input: Tensor, assert_msg: str) -> None: + r""" + _assert_async(tensor) -> void + + Asynchronously assert that the contents of tensor are nonzero. For CPU tensors, + this is equivalent to ``assert tensor`` or ``assert tensor.is_nonzero()``; for + CUDA tensors, we DO NOT synchronize and you may only find out the assertion + failed at a later CUDA kernel launch. Asynchronous assertion can be helpful for + testing invariants in CUDA tensors without giving up performance. This function + is NOT intended to be used for regular error checking, as it will trash your CUDA + context if the assert fails (forcing you to restart your PyTorch process.) + + Args: + tensor (Tensor): a one element tensor to test to see if it is nonzero. Zero + elements (including False for boolean tensors) cause an assertion failure + to be raised. + """ + ... 
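# A minimal usage sketch for the `_assert_async` stubs declared above, assuming a
# plain CPU run; the tensor values below are illustrative and are not part of the
# generated stub file.
import torch

flag = torch.tensor(1)            # any one-element, nonzero tensor passes
torch._assert_async(flag)         # on CPU this checks immediately, like `assert flag`

# torch._assert_async(torch.tensor(0))   # would fail; on CUDA, per the docstring,
#                                        # the failure may only surface at a later
#                                        # kernel launch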
+def _assert_scalar(self: Union[Number, _complex], assert_msg: str) -> None: ... +def _assert_tensor_metadata(a: Tensor, size: Optional[Sequence[Union[_int, SymInt]]] = None, stride: Optional[Sequence[Union[_int, SymInt]]] = None, dtype: Optional[_dtype] = None) -> None: ... +def _batch_norm_impl_index(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], running_mean: Optional[Tensor], running_var: Optional[Tensor], training: _bool, momentum: _float, eps: _float, cudnn_enabled: _bool) -> Tuple[Tensor, Tensor, Tensor, Tensor, _int]: ... +def _cast_Byte(input: Tensor, non_blocking: _bool = False) -> Tensor: ... +def _cast_Char(input: Tensor, non_blocking: _bool = False) -> Tensor: ... +def _cast_Double(input: Tensor, non_blocking: _bool = False) -> Tensor: ... +def _cast_Float(input: Tensor, non_blocking: _bool = False) -> Tensor: ... +def _cast_Half(input: Tensor, non_blocking: _bool = False) -> Tensor: ... +def _cast_Int(input: Tensor, non_blocking: _bool = False) -> Tensor: ... +def _cast_Long(input: Tensor, non_blocking: _bool = False) -> Tensor: ... +def _cast_Short(input: Tensor, non_blocking: _bool = False) -> Tensor: ... +def _choose_qparams_per_tensor(input: Tensor, reduce_range: _bool = False) -> Tuple[_float, _int]: ... +def _chunk_cat(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: _int, num_chunks: _int, *, out: Optional[Tensor] = None) -> Tensor: ... +def _coalesce(input: Tensor) -> Tensor: ... +def _compute_linear_combination(input: Tensor, coefficients: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +def _conj(input: Tensor) -> Tensor: ... +def _conj_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +def _conj_physical(input: Tensor) -> Tensor: ... +def _convert_indices_from_coo_to_csr(input: Tensor, size: _int, *, out_int32: _bool = False, out: Optional[Tensor] = None) -> Tensor: ... +def _convert_indices_from_csr_to_coo(crow_indices: Tensor, col_indices: Tensor, *, out_int32: _bool = False, transpose: _bool = False, out: Optional[Tensor] = None) -> Tensor: ... +def _convert_weight_to_int4pack(input: Tensor, innerKTiles: _int) -> Tensor: ... +@overload +def _convolution(input: Tensor, weight: Tensor, bias: Optional[Tensor], stride: Sequence[Union[_int, SymInt]], padding: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], transposed: _bool, output_padding: _size, groups: Union[_int, SymInt], benchmark: _bool, deterministic: _bool, cudnn_enabled: _bool) -> Tensor: ... +@overload +def _convolution(input: Tensor, weight: Tensor, bias: Optional[Tensor], stride: Sequence[Union[_int, SymInt]], padding: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], transposed: _bool, output_padding: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt], benchmark: _bool, deterministic: _bool, cudnn_enabled: _bool, allow_tf32: _bool) -> Tensor: ... +def _convolution_mode(input: Tensor, weight: Tensor, bias: Optional[Tensor], stride: Sequence[Union[_int, SymInt]], padding: str, dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ... +def _copy_from(input: Tensor, dst: Tensor, non_blocking: _bool = False) -> Tensor: ... +def _copy_from_and_resize(input: Tensor, dst: Tensor) -> Tensor: ... +def _cslt_compress(input: Tensor) -> Tensor: ... +def _cslt_sparse_mm(compressed_A: Tensor, dense_B: Tensor, bias: Optional[Tensor] = None, alpha: Optional[Tensor] = None, out_dtype: Optional[_dtype] = None, transpose_result: _bool = False, alg_id: _int = 0) -> Tensor: ... 
+def _cslt_sparse_mm_search(compressed_A: Tensor, dense_B: Tensor, bias: Optional[Tensor] = None, alpha: Optional[Tensor] = None, out_dtype: Optional[_dtype] = None, transpose_result: _bool = False) -> _int: ... +@overload +def _ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: _size, target_lengths: _size, blank: _int = 0, zero_infinity: _bool = False) -> Tuple[Tensor, Tensor]: ... +@overload +def _ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: Tensor, target_lengths: Tensor, blank: _int = 0, zero_infinity: _bool = False) -> Tuple[Tensor, Tensor]: ... +@overload +def _cudnn_ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: _size, target_lengths: _size, blank: _int, deterministic: _bool, zero_infinity: _bool) -> Tuple[Tensor, Tensor]: ... +@overload +def _cudnn_ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: Tensor, target_lengths: Tensor, blank: _int, deterministic: _bool, zero_infinity: _bool) -> Tuple[Tensor, Tensor]: ... +def _cudnn_init_dropout_state(dropout: _float, train: _bool, dropout_seed: _int, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ... +def _cudnn_rnn(input: Tensor, weight: Union[Tuple[Tensor, ...], List[Tensor]], weight_stride0: _int, weight_buf: Optional[Tensor], hx: Tensor, cx: Optional[Tensor], mode: _int, hidden_size: Union[_int, SymInt], proj_size: Union[_int, SymInt], num_layers: _int, batch_first: _bool, dropout: _float, train: _bool, bidirectional: _bool, batch_sizes: Sequence[Union[_int, SymInt]], dropout_state: Optional[Tensor]) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]: ... +def _cudnn_rnn_flatten_weight(weight_arr: Union[Tuple[Tensor, ...], List[Tensor]], weight_stride0: _int, input_size: Union[_int, SymInt], mode: _int, hidden_size: Union[_int, SymInt], proj_size: Union[_int, SymInt], num_layers: _int, batch_first: _bool, bidirectional: _bool) -> Tensor: ... +def _cufft_clear_plan_cache(device_index: _int) -> None: ... +def _cufft_get_plan_cache_max_size(device_index: _int) -> _int: ... +def _cufft_get_plan_cache_size(device_index: _int) -> _int: ... +def _cufft_set_plan_cache_max_size(device_index: _int, max_size: _int) -> None: ... +def _cummax_helper(input: Tensor, values: Tensor, indices: Tensor, dim: _int) -> None: ... +def _cummin_helper(input: Tensor, values: Tensor, indices: Tensor, dim: _int) -> None: ... +def _debug_has_internal_overlap(input: Tensor) -> _int: ... +def _dim_arange(like: Tensor, dim: _int) -> Tensor: ... +def _dirichlet_grad(x: Tensor, alpha: Tensor, total: Tensor) -> Tensor: ... +def _disable_functionalization(): ... +@overload +def _efficientzerotensor(size: Sequence[Union[_int, SymInt]], *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ... +@overload +def _efficientzerotensor(*size: _int, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ... 
+def _embedding_bag(weight: Tensor, indices: Tensor, offsets: Tensor, scale_grad_by_freq: _bool = False, mode: _int = 0, sparse: _bool = False, per_sample_weights: Optional[Tensor] = None, include_last_offset: _bool = False, padding_idx: _int = -1) -> Tuple[Tensor, Tensor, Tensor, Tensor]: ... +def _embedding_bag_forward_only(weight: Tensor, indices: Tensor, offsets: Tensor, scale_grad_by_freq: _bool = False, mode: _int = 0, sparse: _bool = False, per_sample_weights: Optional[Tensor] = None, include_last_offset: _bool = False, padding_idx: _int = -1) -> Tuple[Tensor, Tensor, Tensor, Tensor]: ... +@overload +def _empty_affine_quantized(size: Sequence[Union[_int, SymInt]], *, scale: _float = 1, zero_point: _int = 0, memory_format: Optional[memory_format] = contiguous_format, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ... +@overload +def _empty_affine_quantized(*size: _int, scale: _float = 1, zero_point: _int = 0, memory_format: Optional[memory_format] = contiguous_format, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ... +@overload +def _empty_per_channel_affine_quantized(size: Sequence[Union[_int, SymInt]], *, scales: Tensor, zero_points: Tensor, axis: _int, memory_format: Optional[memory_format] = contiguous_format, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ... +@overload +def _empty_per_channel_affine_quantized(*size: _int, scales: Tensor, zero_points: Tensor, axis: _int, memory_format: Optional[memory_format] = contiguous_format, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ... +def _enable_functionalization(*, reapply_views: _bool = False): ... +def _euclidean_dist(x1: Tensor, x2: Tensor) -> Tensor: ... +def _fake_quantize_learnable_per_channel_affine(input: Tensor, scale: Tensor, zero_point: Tensor, axis: _int, quant_min: _int, quant_max: _int, grad_factor: _float = 1.0) -> Tensor: ... +def _fake_quantize_learnable_per_tensor_affine(input: Tensor, scale: Tensor, zero_point: Tensor, quant_min: _int, quant_max: _int, grad_factor: _float = 1.0) -> Tensor: ... +def _fake_quantize_per_tensor_affine_cachemask_tensor_qparams(input: Tensor, scale: Tensor, zero_point: Tensor, fake_quant_enabled: Tensor, quant_min: _int, quant_max: _int) -> torch.return_types._fake_quantize_per_tensor_affine_cachemask_tensor_qparams: ... +def _fft_c2c(input: Tensor, dim: Sequence[Union[_int, SymInt]], normalization: _int, forward: _bool, *, out: Optional[Tensor] = None) -> Tensor: ... +def _fft_c2r(input: Tensor, dim: _size, normalization: _int, last_dim_size: Union[_int, SymInt], *, out: Optional[Tensor] = None) -> Tensor: ... +def _fft_r2c(input: Tensor, dim: _size, normalization: _int, onesided: _bool, *, out: Optional[Tensor] = None) -> Tensor: ... +def _fill_mem_eff_dropout_mask_(input: Tensor, dropout_p: _float, seed: _int, offset: _int) -> Tensor: ... +def _foobar(input: Tensor, arg1: _bool = True, arg2: _bool = True, *, arg3: _bool = True) -> Tensor: ... 
+def _foreach_abs(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_abs(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.abs` to each Tensor of the input list. + """ + ... +def _foreach_abs_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_abs_(self: List[Tensor]) -> None + + Apply :func:`torch.abs` to each Tensor of the input list. + """ + ... +def _foreach_acos(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_acos(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.acos` to each Tensor of the input list. + """ + ... +def _foreach_acos_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_acos_(self: List[Tensor]) -> None + + Apply :func:`torch.acos` to each Tensor of the input list. + """ + ... +@overload +def _foreach_add(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_add(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]], *, alpha: Union[Number, _complex] = 1) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_add(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Tensor, *, alpha: Union[Number, _complex] = 1) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_add(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_add_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_add_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]], *, alpha: Union[Number, _complex] = 1) -> None: ... +@overload +def _foreach_add_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Tensor, *, alpha: Union[Number, _complex] = 1) -> None: ... +@overload +def _foreach_add_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> None: ... +@overload +def _foreach_addcdiv(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_addcdiv(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Tensor) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_addcdiv(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], value: Union[Number, _complex] = 1) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_addcdiv_(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_addcdiv_(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Tensor) -> None: ... +@overload +def _foreach_addcdiv_(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], value: Union[Number, _complex] = 1) -> None: ... 
+@overload +def _foreach_addcmul(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_addcmul(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Tensor) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_addcmul(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], value: Union[Number, _complex] = 1) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_addcmul_(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_addcmul_(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Tensor) -> None: ... +@overload +def _foreach_addcmul_(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], value: Union[Number, _complex] = 1) -> None: ... +def _foreach_asin(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_asin(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.asin` to each Tensor of the input list. + """ + ... +def _foreach_asin_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_asin_(self: List[Tensor]) -> None + + Apply :func:`torch.asin` to each Tensor of the input list. + """ + ... +def _foreach_atan(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_atan(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.atan` to each Tensor of the input list. + """ + ... +def _foreach_atan_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_atan_(self: List[Tensor]) -> None + + Apply :func:`torch.atan` to each Tensor of the input list. + """ + ... +def _foreach_ceil(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_ceil(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.ceil` to each Tensor of the input list. + """ + ... +def _foreach_ceil_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_ceil_(self: List[Tensor]) -> None + + Apply :func:`torch.ceil` to each Tensor of the input list. + """ + ... +@overload +def _foreach_clamp_max(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_clamp_max(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_clamp_max(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_clamp_max_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_clamp_max_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> None: ... +@overload +def _foreach_clamp_max_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ... 
+@overload +def _foreach_clamp_min(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_clamp_min(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_clamp_min(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_clamp_min_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_clamp_min_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> None: ... +@overload +def _foreach_clamp_min_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ... +def _foreach_copy_(self: Union[Tuple[Tensor, ...], List[Tensor]], src: Union[Tuple[Tensor, ...], List[Tensor]], non_blocking: _bool = False) -> None: ... +def _foreach_cos(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_cos(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.cos` to each Tensor of the input list. + """ + ... +def _foreach_cos_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_cos_(self: List[Tensor]) -> None + + Apply :func:`torch.cos` to each Tensor of the input list. + """ + ... +def _foreach_cosh(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_cosh(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.cosh` to each Tensor of the input list. + """ + ... +def _foreach_cosh_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_cosh_(self: List[Tensor]) -> None + + Apply :func:`torch.cosh` to each Tensor of the input list. + """ + ... +@overload +def _foreach_div(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_div(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Tensor) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_div(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_div(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_div_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_div_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Tensor) -> None: ... +@overload +def _foreach_div_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> None: ... +@overload +def _foreach_div_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ... +def _foreach_erf(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_erf(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.erf` to each Tensor of the input list. + """ + ... +def _foreach_erf_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_erf_(self: List[Tensor]) -> None + + Apply :func:`torch.erf` to each Tensor of the input list. + """ + ... +def _foreach_erfc(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_erfc(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.erfc` to each Tensor of the input list. 
+ """ + ... +def _foreach_erfc_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_erfc_(self: List[Tensor]) -> None + + Apply :func:`torch.erfc` to each Tensor of the input list. + """ + ... +def _foreach_exp(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_exp(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.exp` to each Tensor of the input list. + """ + ... +def _foreach_exp_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_exp_(self: List[Tensor]) -> None + + Apply :func:`torch.exp` to each Tensor of the input list. + """ + ... +def _foreach_expm1(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_expm1(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.expm1` to each Tensor of the input list. + """ + ... +def _foreach_expm1_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_expm1_(self: List[Tensor]) -> None + + Apply :func:`torch.expm1` to each Tensor of the input list. + """ + ... +def _foreach_floor(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_floor(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.floor` to each Tensor of the input list. + """ + ... +def _foreach_floor_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_floor_(self: List[Tensor]) -> None + + Apply :func:`torch.floor` to each Tensor of the input list. + """ + ... +def _foreach_frac(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_frac(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.frac` to each Tensor of the input list. + """ + ... +def _foreach_frac_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_frac_(self: List[Tensor]) -> None + + Apply :func:`torch.frac` to each Tensor of the input list. + """ + ... +@overload +def _foreach_lerp(self: Union[Tuple[Tensor, ...], List[Tensor]], tensors1: Union[Tuple[Tensor, ...], List[Tensor]], weight: Union[Number, _complex]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_lerp(self: Union[Tuple[Tensor, ...], List[Tensor]], tensors1: Union[Tuple[Tensor, ...], List[Tensor]], weights: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_lerp_(self: Union[Tuple[Tensor, ...], List[Tensor]], tensors1: Union[Tuple[Tensor, ...], List[Tensor]], weight: Union[Number, _complex]) -> None: ... +@overload +def _foreach_lerp_(self: Union[Tuple[Tensor, ...], List[Tensor]], tensors1: Union[Tuple[Tensor, ...], List[Tensor]], weights: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ... +def _foreach_lgamma(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_lgamma(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.lgamma` to each Tensor of the input list. + """ + ... +def _foreach_lgamma_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_lgamma_(self: List[Tensor]) -> None + + Apply :func:`torch.lgamma` to each Tensor of the input list. + """ + ... +def _foreach_log(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_log(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.log` to each Tensor of the input list. + """ + ... 
+def _foreach_log10(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_log10(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.log10` to each Tensor of the input list. + """ + ... +def _foreach_log10_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_log10_(self: List[Tensor]) -> None + + Apply :func:`torch.log10` to each Tensor of the input list. + """ + ... +def _foreach_log1p(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_log1p(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.log1p` to each Tensor of the input list. + """ + ... +def _foreach_log1p_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_log1p_(self: List[Tensor]) -> None + + Apply :func:`torch.log1p` to each Tensor of the input list. + """ + ... +def _foreach_log2(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_log2(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.log2` to each Tensor of the input list. + """ + ... +def _foreach_log2_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_log2_(self: List[Tensor]) -> None + + Apply :func:`torch.log2` to each Tensor of the input list. + """ + ... +def _foreach_log_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_log_(self: List[Tensor]) -> None + + Apply :func:`torch.log` to each Tensor of the input list. + """ + ... +@overload +def _foreach_maximum(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_maximum(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_maximum(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_maximum_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_maximum_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> None: ... +@overload +def _foreach_maximum_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ... +@overload +def _foreach_minimum(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_minimum(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_minimum(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_minimum_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_minimum_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> None: ... +@overload +def _foreach_minimum_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ... +@overload +def _foreach_mul(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_mul(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Tensor) -> Tuple[Tensor, ...]: ... 
+@overload +def _foreach_mul(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_mul(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_mul_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_mul_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Tensor) -> None: ... +@overload +def _foreach_mul_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> None: ... +@overload +def _foreach_mul_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ... +def _foreach_neg(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_neg(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.neg` to each Tensor of the input list. + """ + ... +def _foreach_neg_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_neg_(self: List[Tensor]) -> None + + Apply :func:`torch.neg` to each Tensor of the input list. + """ + ... +def _foreach_norm(self: Union[Tuple[Tensor, ...], List[Tensor]], ord: Union[Number, _complex] = 2) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_pow(self: Union[Tuple[Tensor, ...], List[Tensor]], exponent: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_pow(self: Union[Tuple[Tensor, ...], List[Tensor]], exponent: Union[Number, _complex]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_pow(self: Union[Tuple[Tensor, ...], List[Tensor]], exponent: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_pow(self: Union[Number, _complex], exponent: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_pow_(self: Union[Tuple[Tensor, ...], List[Tensor]], exponent: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_pow_(self: Union[Tuple[Tensor, ...], List[Tensor]], exponent: Union[Number, _complex]) -> None: ... +@overload +def _foreach_pow_(self: Union[Tuple[Tensor, ...], List[Tensor]], exponent: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ... +def _foreach_reciprocal(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_reciprocal(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.reciprocal` to each Tensor of the input list. + """ + ... +def _foreach_reciprocal_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_reciprocal_(self: List[Tensor]) -> None + + Apply :func:`torch.reciprocal` to each Tensor of the input list. + """ + ... +def _foreach_round(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_round(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.round` to each Tensor of the input list. + """ + ... +def _foreach_round_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_round_(self: List[Tensor]) -> None + + Apply :func:`torch.round` to each Tensor of the input list. + """ + ... +def _foreach_sigmoid(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_sigmoid(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.sigmoid` to each Tensor of the input list. + """ + ... 
+def _foreach_sigmoid_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_sigmoid_(self: List[Tensor]) -> None + + Apply :func:`torch.sigmoid` to each Tensor of the input list. + """ + ... +def _foreach_sign(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +def _foreach_sign_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ... +def _foreach_sin(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_sin(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.sin` to each Tensor of the input list. + """ + ... +def _foreach_sin_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_sin_(self: List[Tensor]) -> None + + Apply :func:`torch.sin` to each Tensor of the input list. + """ + ... +def _foreach_sinh(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_sinh(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.sinh` to each Tensor of the input list. + """ + ... +def _foreach_sinh_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_sinh_(self: List[Tensor]) -> None + + Apply :func:`torch.sinh` to each Tensor of the input list. + """ + ... +def _foreach_sqrt(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_sqrt(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.sqrt` to each Tensor of the input list. + """ + ... +def _foreach_sqrt_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_sqrt_(self: List[Tensor]) -> None + + Apply :func:`torch.sqrt` to each Tensor of the input list. + """ + ... +@overload +def _foreach_sub(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_sub(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]], *, alpha: Union[Number, _complex] = 1) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_sub(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_sub_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_sub_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]], *, alpha: Union[Number, _complex] = 1) -> None: ... +@overload +def _foreach_sub_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> None: ... +def _foreach_tan(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_tan(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.tan` to each Tensor of the input list. + """ + ... +def _foreach_tan_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_tan_(self: List[Tensor]) -> None + + Apply :func:`torch.tan` to each Tensor of the input list. + """ + ... +def _foreach_tanh(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_tanh(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.tanh` to each Tensor of the input list. + """ + ... +def _foreach_tanh_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_tanh_(self: List[Tensor]) -> None + + Apply :func:`torch.tanh` to each Tensor of the input list. + """ + ... 
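# A rough sketch of the `_foreach_*` stubs above: each call applies the corresponding
# elementwise op across a whole list of tensors at once. The tensors and the SGD-style
# in-place update below are illustrative assumptions, not part of the generated stub.
import torch

params = [torch.ones(3), torch.full((2, 2), 4.0)]
grads  = [torch.ones(3), torch.ones(2, 2)]

roots = torch._foreach_sqrt(params)              # out-of-place: returns a tuple of results
torch._foreach_add_(params, grads, alpha=-0.1)   # in-place: one fused step over the list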
+def _foreach_trunc(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_trunc(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.trunc` to each Tensor of the input list. + """ + ... +def _foreach_trunc_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_trunc_(self: List[Tensor]) -> None + + Apply :func:`torch.trunc` to each Tensor of the input list. + """ + ... +def _foreach_zero_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_zero_(self: List[Tensor]) -> None + + Apply :func:`torch.zero` to each Tensor of the input list. + """ + ... +def _from_functional_tensor(t: Tensor) -> Tensor: ... +def _functional_assert_async(input: Tensor, assert_msg: str, dep_token: Tensor) -> Tensor: ... +def _functional_assert_scalar(self: Union[Number, _complex], assert_msg: str, dep_token: Tensor) -> Tensor: ... +def _functional_sym_constrain_range(size: Union[Number, _complex], min: Optional[_int], max: Optional[_int], dep_token: Tensor) -> Tensor: ... +def _functional_sym_constrain_range_for_size(size: Union[Number, _complex], min: Optional[_int], max: Optional[_int], dep_token: Tensor) -> Tensor: ... +def _functionalize_are_all_mutations_hidden_from_autograd(t: Tensor) -> _bool: ... +def _functionalize_are_all_mutations_under_no_grad_or_inference_mode(t: Tensor) -> _bool: ... +def _functionalize_commit_update(t: Tensor) -> None: ... +def _functionalize_mark_mutation_hidden_from_autograd(t: Tensor) -> None: ... +def _functionalize_replace(self_: Tensor, other: Tensor) -> None: ... +def _functionalize_sync(t: Tensor) -> None: ... +@overload +def _fused_adam_(self: Union[Tuple[Tensor, ...], List[Tensor]], grads: Union[Tuple[Tensor, ...], List[Tensor]], exp_avgs: Union[Tuple[Tensor, ...], List[Tensor]], exp_avg_sqs: Union[Tuple[Tensor, ...], List[Tensor]], max_exp_avg_sqs: Union[Tuple[Tensor, ...], List[Tensor]], state_steps: Union[Tuple[Tensor, ...], List[Tensor]], *, lr: Tensor, beta1: _float, beta2: _float, weight_decay: _float, eps: _float, amsgrad: _bool, maximize: _bool, grad_scale: Optional[Tensor] = None, found_inf: Optional[Tensor] = None) -> None: ... +@overload +def _fused_adam_(self: Union[Tuple[Tensor, ...], List[Tensor]], grads: Union[Tuple[Tensor, ...], List[Tensor]], exp_avgs: Union[Tuple[Tensor, ...], List[Tensor]], exp_avg_sqs: Union[Tuple[Tensor, ...], List[Tensor]], max_exp_avg_sqs: Union[Tuple[Tensor, ...], List[Tensor]], state_steps: Union[Tuple[Tensor, ...], List[Tensor]], *, lr: _float, beta1: _float, beta2: _float, weight_decay: _float, eps: _float, amsgrad: _bool, maximize: _bool, grad_scale: Optional[Tensor] = None, found_inf: Optional[Tensor] = None) -> None: ... +@overload +def _fused_adamw_(self: Union[Tuple[Tensor, ...], List[Tensor]], grads: Union[Tuple[Tensor, ...], List[Tensor]], exp_avgs: Union[Tuple[Tensor, ...], List[Tensor]], exp_avg_sqs: Union[Tuple[Tensor, ...], List[Tensor]], max_exp_avg_sqs: Union[Tuple[Tensor, ...], List[Tensor]], state_steps: Union[Tuple[Tensor, ...], List[Tensor]], *, lr: Tensor, beta1: _float, beta2: _float, weight_decay: _float, eps: _float, amsgrad: _bool, maximize: _bool, grad_scale: Optional[Tensor] = None, found_inf: Optional[Tensor] = None) -> None: ... 
+@overload +def _fused_adamw_(self: Union[Tuple[Tensor, ...], List[Tensor]], grads: Union[Tuple[Tensor, ...], List[Tensor]], exp_avgs: Union[Tuple[Tensor, ...], List[Tensor]], exp_avg_sqs: Union[Tuple[Tensor, ...], List[Tensor]], max_exp_avg_sqs: Union[Tuple[Tensor, ...], List[Tensor]], state_steps: Union[Tuple[Tensor, ...], List[Tensor]], *, lr: _float, beta1: _float, beta2: _float, weight_decay: _float, eps: _float, amsgrad: _bool, maximize: _bool, grad_scale: Optional[Tensor] = None, found_inf: Optional[Tensor] = None) -> None: ... +def _fused_dropout(input: Tensor, p: _float, generator: Optional[Generator] = None) -> Tuple[Tensor, Tensor]: ... +def _fused_moving_avg_obs_fq_helper(input: Tensor, observer_on: Tensor, fake_quant_on: Tensor, running_min: Tensor, running_max: Tensor, scale: Tensor, zero_point: Tensor, averaging_const: _float, quant_min: _int, quant_max: _int, ch_axis: _int, per_row_fake_quant: _bool = False, symmetric_quant: _bool = False) -> torch.return_types._fused_moving_avg_obs_fq_helper: ... +def _fused_sdp_choice(query: Tensor, key: Tensor, value: Tensor, attn_mask: Optional[Tensor] = None, dropout_p: _float = 0.0, is_causal: _bool = False, *, scale: Optional[_float] = None) -> _int: ... +@overload +def _fused_sgd_(self: Union[Tuple[Tensor, ...], List[Tensor]], grads: Union[Tuple[Tensor, ...], List[Tensor]], momentum_buffer_list: Union[Tuple[Tensor, ...], List[Tensor]], *, weight_decay: _float, momentum: _float, lr: Tensor, dampening: _float, nesterov: _bool, maximize: _bool, is_first_step: _bool, grad_scale: Optional[Tensor] = None, found_inf: Optional[Tensor] = None) -> None: ... +@overload +def _fused_sgd_(self: Union[Tuple[Tensor, ...], List[Tensor]], grads: Union[Tuple[Tensor, ...], List[Tensor]], momentum_buffer_list: Union[Tuple[Tensor, ...], List[Tensor]], *, weight_decay: _float, momentum: _float, lr: _float, dampening: _float, nesterov: _bool, maximize: _bool, is_first_step: _bool, grad_scale: Optional[Tensor] = None, found_inf: Optional[Tensor] = None) -> None: ... +def _fw_primal_copy(input: Tensor, level: _int, *, out: Optional[Tensor] = None) -> Tensor: ... +def _grid_sampler_2d_cpu_fallback(input: Tensor, grid: Tensor, interpolation_mode: _int, padding_mode: _int, align_corners: _bool) -> Tensor: ... +def _has_compatible_shallow_copy_type(input: Tensor, from_: Tensor) -> _bool: ... +def _histogramdd_bin_edges(input: Tensor, bins: _size, *, range: Optional[Sequence[_float]] = None, weight: Optional[Tensor] = None, density: _bool = False) -> Tuple[Tensor, ...]: ... +def _histogramdd_from_bin_cts(input: Tensor, bins: _size, *, range: Optional[Sequence[_float]] = None, weight: Optional[Tensor] = None, density: _bool = False) -> Tensor: ... +def _histogramdd_from_bin_tensors(input: Tensor, bins: Union[Tuple[Tensor, ...], List[Tensor]], *, weight: Optional[Tensor] = None, density: _bool = False) -> Tensor: ... +def _index_put_impl_(input: Tensor, indices: Optional[Union[Tuple[Tensor, ...], List[Tensor]]], values: Tensor, accumulate: _bool = False, unsafe: _bool = False) -> Tensor: ... +def _indices_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +def _int_mm(input: Tensor, mat2: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +def _is_all_true(input: Tensor) -> Tensor: ... +def _is_any_true(input: Tensor) -> Tensor: ... +def _is_functional_tensor(t: Tensor) -> _bool: ... +def _is_zerotensor(input: Tensor) -> _bool: ... +def _lazy_clone(input: Tensor) -> Tensor: ... 
+def _linalg_check_errors(info: Tensor, api_name: str, *, is_matrix: _bool) -> None: ... +def _linalg_det(A: Tensor, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types._linalg_det: ... +def _linalg_eigh(A: Tensor, UPLO: str = "L", compute_v: _bool = True, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types._linalg_eigh: ... +def _linalg_slogdet(A: Tensor, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types._linalg_slogdet: ... +def _linalg_solve_ex(A: Tensor, B: Tensor, *, left: _bool = True, check_errors: _bool = False, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types._linalg_solve_ex: ... +def _linalg_svd(A: Tensor, full_matrices: _bool = False, compute_uv: _bool = True, *, driver: Optional[str] = None, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types._linalg_svd: ... +def _log_softmax(input: Tensor, dim: _int, half_to_float: _bool, *, out: Optional[Tensor] = None) -> Tensor: ... +def _log_softmax_backward_data(grad_output: Tensor, output: Tensor, dim: _int, input_dtype: _dtype, *, out: Optional[Tensor] = None) -> Tensor: ... +def _logcumsumexp(input: Tensor, dim: _int, *, out: Optional[Tensor] = None) -> Tensor: ... +def _lstm_mps(input: Tensor, hx: Union[Tuple[Tensor, ...], List[Tensor]], params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool, batch_first: _bool) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]: ... +def _lu_with_info(input: Tensor, pivot: _bool = True, check_errors: _bool = True) -> torch.return_types._lu_with_info: ... +def _make_dep_token(*, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ... +def _make_dual(primal: Tensor, tangent: Tensor, level: _int) -> Tensor: ... +def _make_dual_copy(primal: Tensor, tangent: Tensor, level: _int, *, out: Optional[Tensor] = None) -> Tensor: ... +def _make_per_channel_quantized_tensor(input: Tensor, scale: Tensor, zero_point: Tensor, axis: _int) -> Tensor: ... +def _make_per_tensor_quantized_tensor(input: Tensor, scale: _float, zero_point: _int) -> Tensor: ... +def _masked_scale(input: Tensor, mask: Tensor, scale: _float) -> Tensor: ... +def _masked_softmax(input: Tensor, mask: Tensor, dim: Optional[_int] = None, mask_type: Optional[_int] = None) -> Tensor: ... +def _mixed_dtypes_linear(input: Tensor, weight: Tensor, scale: Tensor, *, bias: Optional[Tensor] = None, activation: Optional[str] = None) -> Tensor: ... +def _mkldnn_reshape(input: Tensor, shape: _size) -> Tensor: ... +def _mkldnn_transpose(input: Tensor, dim0: _int, dim1: _int) -> Tensor: ... +def _mkldnn_transpose_(input: Tensor, dim0: _int, dim1: _int) -> Tensor: ... +def _mps_convolution(input: Tensor, weight: Tensor, bias: Optional[Tensor], padding: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ... +def _mps_convolution_transpose(input: Tensor, weight: Tensor, padding: Sequence[Union[_int, SymInt]], output_padding: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ... 
+@overload +def _native_batch_norm_legit(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], running_mean: Tensor, running_var: Tensor, training: _bool, momentum: _float, eps: _float, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> Tuple[Tensor, Tensor, Tensor]: ... +@overload +def _native_batch_norm_legit(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], training: _bool, momentum: _float, eps: _float, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> Tuple[Tensor, Tensor, Tensor]: ... +def _native_batch_norm_legit_no_training(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], running_mean: Tensor, running_var: Tensor, momentum: _float, eps: _float) -> Tuple[Tensor, Tensor, Tensor]: ... +def _native_multi_head_attention(query: Tensor, key: Tensor, value: Tensor, embed_dim: _int, num_head: _int, qkv_weight: Tensor, qkv_bias: Tensor, proj_weight: Tensor, proj_bias: Tensor, mask: Optional[Tensor] = None, need_weights: _bool = True, average_attn_weights: _bool = True, mask_type: Optional[_int] = None) -> Tuple[Tensor, Tensor]: ... +def _neg_view(input: Tensor) -> Tensor: ... +def _neg_view_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +def _nested_from_padded(padded: Tensor, cpu_nested_shape_example: Tensor, fuse_transform_0213: _bool = False) -> Tensor: ... +def _nested_from_padded_and_nested_example(padded: Tensor, nt_example: Tensor) -> Tensor: ... +def _nested_get_jagged_dummy(any: Tensor) -> Tensor: ... +def _nested_get_lengths(input: Tensor) -> Tensor: ... +def _nested_get_offsets(input: Tensor) -> Tensor: ... +def _nested_get_ragged_idx(input: Tensor) -> _int: ... +def _nested_get_values(input: Tensor) -> Tensor: ... +def _nested_get_values_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +def _nested_tensor_from_mask(t: Tensor, mask: Tensor, mask_check: _bool = True) -> Tensor: ... +def _nested_tensor_from_mask_left_aligned(t: Tensor, mask: Tensor) -> _bool: ... +def _nested_tensor_from_tensor_list(list: Union[Tuple[Tensor, ...], List[Tensor]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = None) -> Tensor: ... +def _nested_tensor_softmax_with_shape(input: Tensor, query: Tensor) -> Tensor: ... +def _nested_view_from_buffer(input: Tensor, nested_size: Tensor, nested_strides: Tensor, offsets: Tensor) -> Tensor: ... +def _nested_view_from_buffer_copy(input: Tensor, nested_size: Tensor, nested_strides: Tensor, offsets: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +def _nested_view_from_jagged(input: Tensor, offsets: Tensor, dummy: Tensor, lengths: Optional[Tensor] = None, ragged_idx: _int = 1) -> Tensor: ... +def _nested_view_from_jagged_copy(input: Tensor, offsets: Tensor, dummy: Tensor, lengths: Optional[Tensor] = None, ragged_idx: _int = 1, *, out: Optional[Tensor] = None) -> Tensor: ... +def _nnpack_available() -> _bool: ... +def _nnpack_spatial_convolution(input: Tensor, weight: Tensor, bias: Optional[Tensor], padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]], stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1) -> Tensor: ... +def _pack_padded_sequence(input: Tensor, lengths: Tensor, batch_first: _bool) -> Tuple[Tensor, Tensor]: ... +def _pad_packed_sequence(data: Tensor, batch_sizes: Tensor, batch_first: _bool, padding_value: Union[Number, _complex], total_length: _int) -> Tuple[Tensor, Tensor]: ... 
+def _pin_memory(input: Tensor, device: Optional[Optional[DeviceLikeType]] = None) -> Tensor: ... +def _prelu_kernel(input: Tensor, weight: Tensor) -> Tensor: ... +def _print(s: str) -> None: ... +def _propagate_xla_data(input: Tensor, output: Tensor) -> None: ... +def _remove_batch_dim(input: Tensor, level: _int, batch_size: _int, out_dim: _int) -> Tensor: ... +def _reshape_alias_copy(input: Tensor, size: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], *, out: Optional[Tensor] = None) -> Tensor: ... +def _reshape_from_tensor(input: Tensor, shape: Tensor) -> Tensor: ... +def _resize_output_(input: Tensor, size: Sequence[Union[_int, SymInt]], device: Optional[DeviceLikeType]) -> Tensor: ... +def _rowwise_prune(weight: Tensor, mask: Tensor, compressed_indices_dtype: _dtype) -> Tuple[Tensor, Tensor]: ... +def _sample_dirichlet(input: Tensor, generator: Optional[Generator] = None) -> Tensor: ... +def _saturate_weight_to_fp16(weight: Tensor) -> Tensor: ... +def _scaled_dot_product_attention_math(query: Tensor, key: Tensor, value: Tensor, attn_mask: Optional[Tensor] = None, dropout_p: _float = 0.0, is_causal: _bool = False, dropout_mask: Optional[Tensor] = None, *, scale: Optional[_float] = None) -> Tuple[Tensor, Tensor]: ... +def _scaled_dot_product_cudnn_attention(query: Tensor, key: Tensor, value: Tensor, dropout_p: _float = 0.0, is_causal: _bool = False, return_debug_mask: _bool = False, *, scale: Optional[_float] = None) -> torch.return_types._scaled_dot_product_cudnn_attention: ... +def _scaled_dot_product_efficient_attention(query: Tensor, key: Tensor, value: Tensor, attn_bias: Optional[Tensor], compute_log_sumexp: _bool, dropout_p: _float = 0.0, is_causal: _bool = False, *, scale: Optional[_float] = None) -> torch.return_types._scaled_dot_product_efficient_attention: ... +def _scaled_dot_product_flash_attention(query: Tensor, key: Tensor, value: Tensor, dropout_p: _float = 0.0, is_causal: _bool = False, return_debug_mask: _bool = False, *, scale: Optional[_float] = None) -> torch.return_types._scaled_dot_product_flash_attention: ... +def _scaled_dot_product_flash_attention_for_cpu(query: Tensor, key: Tensor, value: Tensor, dropout_p: _float = 0.0, is_causal: _bool = False, *, attn_mask: Optional[Tensor] = None, scale: Optional[_float] = None) -> torch.return_types._scaled_dot_product_flash_attention_for_cpu: ... +def _scaled_mm(input: Tensor, mat2: Tensor, *, bias: Optional[Tensor] = None, out_dtype: Optional[_dtype] = None, scale_a: Optional[Tensor] = None, scale_b: Optional[Tensor] = None, scale_result: Optional[Tensor] = None, use_fast_accum: _bool = False, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> Tuple[Tensor, Tensor]: ... +def _shape_as_tensor(input: Tensor) -> Tensor: ... +def _sobol_engine_draw(quasi: Tensor, n: _int, sobolstate: Tensor, dimension: _int, num_generated: _int, dtype: Optional[_dtype]) -> Tuple[Tensor, Tensor]: ... +def _sobol_engine_ff_(input: Tensor, n: _int, sobolstate: Tensor, dimension: _int, num_generated: _int) -> Tensor: ... +def _sobol_engine_initialize_state_(input: Tensor, dimension: _int) -> Tensor: ... +def _sobol_engine_scramble_(input: Tensor, ltm: Tensor, dimension: _int) -> Tensor: ... +def _softmax(input: Tensor, dim: _int, half_to_float: _bool, *, out: Optional[Tensor] = None) -> Tensor: ... +def _softmax_backward_data(grad_output: Tensor, output: Tensor, dim: _int, input_dtype: _dtype, *, grad_input: Optional[Tensor] = None) -> Tensor: ... 
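# A tentative sketch of calling the reference "math" attention path declared above;
# the shapes are arbitrary illustrations, and this private entry point is normally
# reached indirectly through torch.nn.functional.scaled_dot_product_attention.
import torch

q = torch.randn(1, 2, 4, 8)   # (batch, heads, seq_len, head_dim)
k = torch.randn(1, 2, 4, 8)
v = torch.randn(1, 2, 4, 8)

out, weights = torch._scaled_dot_product_attention_math(q, k, v)
# out: (1, 2, 4, 8) attention output; weights: (1, 2, 4, 4) softmax attention matrix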
+def _sparse_broadcast_to(input: Tensor, size: _size) -> Tensor: ... +def _sparse_broadcast_to_copy(input: Tensor, size: _size, *, out: Optional[Tensor] = None) -> Tensor: ... +def _sparse_csr_prod(input: Tensor, dim: Union[_int, _size], keepdim: _bool = False, *, dtype: Optional[_dtype] = None) -> Tensor: ... +def _sparse_csr_sum(input: Tensor, dim: Union[_int, _size], keepdim: _bool = False, *, dtype: Optional[_dtype] = None) -> Tensor: ... +def _sparse_log_softmax_backward_data(grad_output: Tensor, output: Tensor, dim: _int, input: Tensor) -> Tensor: ... +def _sparse_semi_structured_linear(input: Tensor, weight: Tensor, meta: Tensor, *, bias: Optional[Tensor] = None, activation: Optional[str] = None, out_dtype: Optional[_dtype] = None) -> Tensor: ... +def _sparse_softmax_backward_data(grad_output: Tensor, output: Tensor, dim: _int, input: Tensor) -> Tensor: ... +def _sparse_sparse_matmul(input: Tensor, other: Tensor) -> Tensor: ... +@overload +def _sparse_sum(input: Tensor) -> Tensor: ... +@overload +def _sparse_sum(input: Tensor, *, dtype: _dtype) -> Tensor: ... +@overload +def _sparse_sum(input: Tensor, dim: Union[_int, _size]) -> Tensor: ... +@overload +def _sparse_sum(input: Tensor, dim: Union[_int, _size], *, dtype: _dtype) -> Tensor: ... +def _stack(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: _int = 0, *, out: Optional[Tensor] = None) -> Tensor: ... +def _standard_gamma(input: Tensor, generator: Optional[Generator] = None) -> Tensor: ... +def _standard_gamma_grad(input: Tensor, output: Tensor) -> Tensor: ... +def _sync(t: Tensor) -> None: ... +@overload +def _test_autograd_multiple_dispatch(input: Tensor) -> Tensor: ... +@overload +def _test_autograd_multiple_dispatch(input: Tensor, b: _bool) -> Tensor: ... +def _test_autograd_multiple_dispatch_view(input: Tensor) -> Tensor: ... +def _test_autograd_multiple_dispatch_view_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +def _test_check_tensor(input: Tensor) -> Tensor: ... +def _test_functorch_fallback(input: Tensor, other: Tensor) -> Tensor: ... +def _test_parallel_materialize(input: Tensor, num_parallel: _int, skip_first: _bool = False) -> Tensor: ... +def _test_serialization_subcmul(input: Tensor, other: Tensor, alpha: Union[Number, _complex] = 1) -> Tensor: ... +def _to_cpu(tensors: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +def _to_functional_tensor(t: Tensor) -> Tensor: ... +def _to_sparse_semi_structured(dense: Tensor) -> Tuple[Tensor, Tensor]: ... +def _transform_bias_rescale_qkv(qkv: Tensor, qkv_bias: Tensor, num_heads: _int) -> Tuple[Tensor, Tensor, Tensor]: ... +def _transformer_encoder_layer_fwd(src: Tensor, embed_dim: _int, num_heads: _int, qkv_weight: Tensor, qkv_bias: Tensor, proj_weight: Tensor, proj_bias: Tensor, use_gelu: _bool, norm_first: _bool, eps: _float, norm_weight_1: Tensor, norm_bias_1: Tensor, norm_weight_2: Tensor, norm_bias_2: Tensor, ffn_weight_1: Tensor, ffn_bias_1: Tensor, ffn_weight_2: Tensor, ffn_bias_2: Tensor, mask: Optional[Tensor] = None, mask_type: Optional[_int] = None) -> Tensor: ... +def _trilinear(i1: Tensor, i2: Tensor, i3: Tensor, expand1: _size, expand2: _size, expand3: _size, sumdim: _size, unroll_dim: _int = 1) -> Tensor: ... +def _triton_multi_head_attention(query: Tensor, key: Tensor, value: Tensor, embed_dim: _int, num_head: _int, qkv_weight: Tensor, qkv_bias: Tensor, proj_weight: Tensor, proj_bias: Tensor, mask: Optional[Tensor] = None) -> Tensor: ... 
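+# The _sparse_sum overloads above appear to back torch.sparse.sum. A minimal
+# sketch of the public call, assuming a small COO input (values for
+# illustration only):
+#
+#     import torch
+#     i = torch.tensor([[0, 1], [1, 0]])
+#     v = torch.tensor([2.0, 3.0])
+#     s = torch.sparse_coo_tensor(i, v, (2, 2))
+#     torch.sparse.sum(s)                           # dense scalar: tensor(5.)
+#     torch.sparse.sum(s, dim=[0])                  # sparse result reduced over dim 0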
+def _triton_scaled_dot_attention(q: Tensor, k: Tensor, v: Tensor, dropout_p: _float = 0.0) -> Tensor: ... +def _unique(input: Tensor, sorted: _bool = True, return_inverse: _bool = False) -> Tuple[Tensor, Tensor]: ... +def _unique2(input: Tensor, sorted: _bool = True, return_inverse: _bool = False, return_counts: _bool = False) -> Tuple[Tensor, Tensor, Tensor]: ... +def _unpack_dual(dual: Tensor, level: _int) -> torch.return_types._unpack_dual: ... +def _unsafe_index(input: Tensor, indices: Optional[Union[Tuple[Tensor, ...], List[Tensor]]]) -> Tensor: ... +def _unsafe_index_put(input: Tensor, indices: Optional[Union[Tuple[Tensor, ...], List[Tensor]]], values: Tensor, accumulate: _bool = False) -> Tensor: ... +@overload +def _use_cudnn_ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: Tensor, target_lengths: Tensor, blank: _int) -> _bool: ... +@overload +def _use_cudnn_ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: _size, target_lengths: _size, blank: _int) -> _bool: ... +def _use_cudnn_rnn_flatten_weight() -> _bool: ... +def _validate_compressed_sparse_indices(is_crow: _bool, compressed_idx: Tensor, plain_idx: Tensor, cdim: _int, dim: _int, nnz: _int) -> None: ... +def _validate_sparse_bsc_tensor_args(ccol_indices: Tensor, row_indices: Tensor, values: Tensor, size: _size) -> None: ... +def _validate_sparse_bsr_tensor_args(crow_indices: Tensor, col_indices: Tensor, values: Tensor, size: _size) -> None: ... +def _validate_sparse_compressed_tensor_args(compressed_indices: Tensor, plain_indices: Tensor, values: Tensor, size: _size, layout: _layout) -> None: ... +def _validate_sparse_coo_tensor_args(indices: Tensor, values: Tensor, size: _size, is_coalesced: Optional[_bool] = None) -> None: ... +def _validate_sparse_csc_tensor_args(ccol_indices: Tensor, row_indices: Tensor, values: Tensor, size: _size) -> None: ... +def _validate_sparse_csr_tensor_args(crow_indices: Tensor, col_indices: Tensor, values: Tensor, size: _size) -> None: ... +def _values_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +def _weight_int4pack_mm(input: Tensor, mat2: Tensor, qGroupSize: _int, qScaleAndZeros: Tensor) -> Tensor: ... +def _weight_int8pack_mm(input: Tensor, mat2: Tensor, scales: Tensor) -> Tensor: ... +def _weight_norm(v: Tensor, g: Tensor, dim: _int = 0) -> Tensor: ... +def _weight_norm_interface(v: Tensor, g: Tensor, dim: _int = 0) -> Tuple[Tensor, Tensor]: ... +def abs(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + abs(input, *, out=None) -> Tensor + + Computes the absolute value of each element in :attr:`input`. + + .. math:: + \text{out}_{i} = |\text{input}_{i}| + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.abs(torch.tensor([-1, -2, 3])) + tensor([ 1, 2, 3]) + """ + ... +def abs_(input: Tensor) -> Tensor: ... +def absolute(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + absolute(input, *, out=None) -> Tensor + + Alias for :func:`torch.abs` + """ + ... +def acos(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + acos(input, *, out=None) -> Tensor + + Computes the inverse cosine of each element in :attr:`input`. + + .. math:: + \text{out}_{i} = \cos^{-1}(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. 
+ + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.3348, -0.5889, 0.2005, -0.1584]) + >>> torch.acos(a) + tensor([ 1.2294, 2.2004, 1.3690, 1.7298]) + """ + ... +def acos_(input: Tensor) -> Tensor: ... +def acosh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + acosh(input, *, out=None) -> Tensor + + Returns a new tensor with the inverse hyperbolic cosine of the elements of :attr:`input`. + + .. math:: + \text{out}_{i} = \cosh^{-1}(\text{input}_{i}) + + Note: + The domain of the inverse hyperbolic cosine is `[1, inf)` and values outside this range + will be mapped to ``NaN``, except for `+ INF` for which the output is mapped to `+ INF`. + + Args: + input (Tensor): the input tensor. + + Keyword arguments: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4).uniform_(1, 2) + >>> a + tensor([ 1.3192, 1.9915, 1.9674, 1.7151 ]) + >>> torch.acosh(a) + tensor([ 0.7791, 1.3120, 1.2979, 1.1341 ]) + """ + ... +def acosh_(input: Tensor) -> Tensor: ... +def adaptive_avg_pool1d(input: Tensor, output_size: Union[_int, _size]) -> Tensor: ... +def adaptive_max_pool1d(input: Tensor, output_size: Union[_int, _size]) -> Tuple[Tensor, Tensor]: ... +@overload +def add(input: Union[Tensor, Number, _complex], other: Union[Tensor, Number, _complex], *, alpha: Optional[Union[Number, _complex]] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + add(input, other, *, alpha=1, out=None) -> Tensor + + Adds :attr:`other`, scaled by :attr:`alpha`, to :attr:`input`. + + .. math:: + \text{{out}}_i = \text{{input}}_i + \text{{alpha}} \times \text{{other}}_i + + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer, float, and complex inputs. + + Args: + input (Tensor): the input tensor. + other (Tensor or Number): the tensor or number to add to :attr:`input`. + + Keyword arguments: + alpha (Number): the multiplier for :attr:`other`. + out (Tensor, optional): the output tensor. + + Examples:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.0202, 1.0985, 1.3506, -0.6056]) + >>> torch.add(a, 20) + tensor([ 20.0202, 21.0985, 21.3506, 19.3944]) + + >>> b = torch.randn(4) + >>> b + tensor([-0.9732, -0.3497, 0.6245, 0.4022]) + >>> c = torch.randn(4, 1) + >>> c + tensor([[ 0.3743], + [-1.7724], + [-0.5811], + [-0.8017]]) + >>> torch.add(b, c, alpha=10) + tensor([[ 2.7695, 3.3930, 4.3672, 4.1450], + [-18.6971, -18.0736, -17.0994, -17.3216], + [ -6.7845, -6.1610, -5.1868, -5.4090], + [ -8.9902, -8.3667, -7.3925, -7.6147]]) + """ + ... +@overload +def add(self: Tensor, alpha: Union[Number, _complex], other: Tensor) -> Tensor: + r""" + add(input, other, *, alpha=1, out=None) -> Tensor + + Adds :attr:`other`, scaled by :attr:`alpha`, to :attr:`input`. + + .. math:: + \text{{out}}_i = \text{{input}}_i + \text{{alpha}} \times \text{{other}}_i + + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer, float, and complex inputs. + + Args: + input (Tensor): the input tensor. + other (Tensor or Number): the tensor or number to add to :attr:`input`. + + Keyword arguments: + alpha (Number): the multiplier for :attr:`other`. + out (Tensor, optional): the output tensor. 
+ + Examples:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.0202, 1.0985, 1.3506, -0.6056]) + >>> torch.add(a, 20) + tensor([ 20.0202, 21.0985, 21.3506, 19.3944]) + + >>> b = torch.randn(4) + >>> b + tensor([-0.9732, -0.3497, 0.6245, 0.4022]) + >>> c = torch.randn(4, 1) + >>> c + tensor([[ 0.3743], + [-1.7724], + [-0.5811], + [-0.8017]]) + >>> torch.add(b, c, alpha=10) + tensor([[ 2.7695, 3.3930, 4.3672, 4.1450], + [-18.6971, -18.0736, -17.0994, -17.3216], + [ -6.7845, -6.1610, -5.1868, -5.4090], + [ -8.9902, -8.3667, -7.3925, -7.6147]]) + """ + ... +@overload +def add(self: Tensor, alpha: Union[Number, _complex], other: Tensor, *, out: Tensor) -> Tensor: + r""" + add(input, other, *, alpha=1, out=None) -> Tensor + + Adds :attr:`other`, scaled by :attr:`alpha`, to :attr:`input`. + + .. math:: + \text{{out}}_i = \text{{input}}_i + \text{{alpha}} \times \text{{other}}_i + + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer, float, and complex inputs. + + Args: + input (Tensor): the input tensor. + other (Tensor or Number): the tensor or number to add to :attr:`input`. + + Keyword arguments: + alpha (Number): the multiplier for :attr:`other`. + out (Tensor, optional): the output tensor. + + Examples:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.0202, 1.0985, 1.3506, -0.6056]) + >>> torch.add(a, 20) + tensor([ 20.0202, 21.0985, 21.3506, 19.3944]) + + >>> b = torch.randn(4) + >>> b + tensor([-0.9732, -0.3497, 0.6245, 0.4022]) + >>> c = torch.randn(4, 1) + >>> c + tensor([[ 0.3743], + [-1.7724], + [-0.5811], + [-0.8017]]) + >>> torch.add(b, c, alpha=10) + tensor([[ 2.7695, 3.3930, 4.3672, 4.1450], + [-18.6971, -18.0736, -17.0994, -17.3216], + [ -6.7845, -6.1610, -5.1868, -5.4090], + [ -8.9902, -8.3667, -7.3925, -7.6147]]) + """ + ... +@overload +def addbmm(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], batch1: Tensor, batch2: Tensor) -> Tensor: + r""" + addbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices stored + in :attr:`batch1` and :attr:`batch2`, + with a reduced add step (all matrix multiplications get accumulated + along the first dimension). + :attr:`input` is added to the final result. + + :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the + same number of matrices. + + If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a + :math:`(b \times m \times p)` tensor, :attr:`input` must be + :ref:`broadcastable ` with a :math:`(n \times p)` tensor + and :attr:`out` will be a :math:`(n \times p)` tensor. + + .. math:: + out = \beta\ \text{input} + \alpha\ (\sum_{i=0}^{b-1} \text{batch1}_i \mathbin{@} \text{batch2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and :attr:`alpha` + must be real numbers, otherwise they should be integers. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. 
+ + Args: + batch1 (Tensor): the first batch of matrices to be multiplied + batch2 (Tensor): the second batch of matrices to be multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + input (Tensor): matrix to be added + alpha (Number, optional): multiplier for `batch1 @ batch2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(3, 5) + >>> batch1 = torch.randn(10, 3, 4) + >>> batch2 = torch.randn(10, 4, 5) + >>> torch.addbmm(M, batch1, batch2) + tensor([[ 6.6311, 0.0503, 6.9768, -12.0362, -2.1653], + [ -4.8185, -1.4255, -6.6760, 8.9453, 2.5743], + [ -3.8202, 4.3691, 1.0943, -1.1109, 5.4730]]) + """ + ... +@overload +def addbmm(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], batch1: Tensor, batch2: Tensor, *, out: Tensor) -> Tensor: + r""" + addbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices stored + in :attr:`batch1` and :attr:`batch2`, + with a reduced add step (all matrix multiplications get accumulated + along the first dimension). + :attr:`input` is added to the final result. + + :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the + same number of matrices. + + If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a + :math:`(b \times m \times p)` tensor, :attr:`input` must be + :ref:`broadcastable ` with a :math:`(n \times p)` tensor + and :attr:`out` will be a :math:`(n \times p)` tensor. + + .. math:: + out = \beta\ \text{input} + \alpha\ (\sum_{i=0}^{b-1} \text{batch1}_i \mathbin{@} \text{batch2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and :attr:`alpha` + must be real numbers, otherwise they should be integers. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + batch1 (Tensor): the first batch of matrices to be multiplied + batch2 (Tensor): the second batch of matrices to be multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + input (Tensor): matrix to be added + alpha (Number, optional): multiplier for `batch1 @ batch2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(3, 5) + >>> batch1 = torch.randn(10, 3, 4) + >>> batch2 = torch.randn(10, 4, 5) + >>> torch.addbmm(M, batch1, batch2) + tensor([[ 6.6311, 0.0503, 6.9768, -12.0362, -2.1653], + [ -4.8185, -1.4255, -6.6760, 8.9453, 2.5743], + [ -3.8202, 4.3691, 1.0943, -1.1109, 5.4730]]) + """ + ... +@overload +def addbmm(input: Tensor, batch1: Tensor, batch2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + addbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices stored + in :attr:`batch1` and :attr:`batch2`, + with a reduced add step (all matrix multiplications get accumulated + along the first dimension). + :attr:`input` is added to the final result. + + :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the + same number of matrices. 
+ + If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a + :math:`(b \times m \times p)` tensor, :attr:`input` must be + :ref:`broadcastable ` with a :math:`(n \times p)` tensor + and :attr:`out` will be a :math:`(n \times p)` tensor. + + .. math:: + out = \beta\ \text{input} + \alpha\ (\sum_{i=0}^{b-1} \text{batch1}_i \mathbin{@} \text{batch2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and :attr:`alpha` + must be real numbers, otherwise they should be integers. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + batch1 (Tensor): the first batch of matrices to be multiplied + batch2 (Tensor): the second batch of matrices to be multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + input (Tensor): matrix to be added + alpha (Number, optional): multiplier for `batch1 @ batch2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(3, 5) + >>> batch1 = torch.randn(10, 3, 4) + >>> batch2 = torch.randn(10, 4, 5) + >>> torch.addbmm(M, batch1, batch2) + tensor([[ 6.6311, 0.0503, 6.9768, -12.0362, -2.1653], + [ -4.8185, -1.4255, -6.6760, 8.9453, 2.5743], + [ -3.8202, 4.3691, 1.0943, -1.1109, 5.4730]]) + """ + ... +@overload +def addbmm(beta: Union[Number, _complex], self: Tensor, batch1: Tensor, batch2: Tensor) -> Tensor: + r""" + addbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices stored + in :attr:`batch1` and :attr:`batch2`, + with a reduced add step (all matrix multiplications get accumulated + along the first dimension). + :attr:`input` is added to the final result. + + :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the + same number of matrices. + + If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a + :math:`(b \times m \times p)` tensor, :attr:`input` must be + :ref:`broadcastable ` with a :math:`(n \times p)` tensor + and :attr:`out` will be a :math:`(n \times p)` tensor. + + .. math:: + out = \beta\ \text{input} + \alpha\ (\sum_{i=0}^{b-1} \text{batch1}_i \mathbin{@} \text{batch2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and :attr:`alpha` + must be real numbers, otherwise they should be integers. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + batch1 (Tensor): the first batch of matrices to be multiplied + batch2 (Tensor): the second batch of matrices to be multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + input (Tensor): matrix to be added + alpha (Number, optional): multiplier for `batch1 @ batch2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. 
+ + Example:: + + >>> M = torch.randn(3, 5) + >>> batch1 = torch.randn(10, 3, 4) + >>> batch2 = torch.randn(10, 4, 5) + >>> torch.addbmm(M, batch1, batch2) + tensor([[ 6.6311, 0.0503, 6.9768, -12.0362, -2.1653], + [ -4.8185, -1.4255, -6.6760, 8.9453, 2.5743], + [ -3.8202, 4.3691, 1.0943, -1.1109, 5.4730]]) + """ + ... +@overload +def addbmm(beta: Union[Number, _complex], self: Tensor, batch1: Tensor, batch2: Tensor, *, out: Tensor) -> Tensor: + r""" + addbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices stored + in :attr:`batch1` and :attr:`batch2`, + with a reduced add step (all matrix multiplications get accumulated + along the first dimension). + :attr:`input` is added to the final result. + + :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the + same number of matrices. + + If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a + :math:`(b \times m \times p)` tensor, :attr:`input` must be + :ref:`broadcastable ` with a :math:`(n \times p)` tensor + and :attr:`out` will be a :math:`(n \times p)` tensor. + + .. math:: + out = \beta\ \text{input} + \alpha\ (\sum_{i=0}^{b-1} \text{batch1}_i \mathbin{@} \text{batch2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and :attr:`alpha` + must be real numbers, otherwise they should be integers. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + batch1 (Tensor): the first batch of matrices to be multiplied + batch2 (Tensor): the second batch of matrices to be multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + input (Tensor): matrix to be added + alpha (Number, optional): multiplier for `batch1 @ batch2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(3, 5) + >>> batch1 = torch.randn(10, 3, 4) + >>> batch2 = torch.randn(10, 4, 5) + >>> torch.addbmm(M, batch1, batch2) + tensor([[ 6.6311, 0.0503, 6.9768, -12.0362, -2.1653], + [ -4.8185, -1.4255, -6.6760, 8.9453, 2.5743], + [ -3.8202, 4.3691, 1.0943, -1.1109, 5.4730]]) + """ + ... +@overload +def addcdiv(self: Tensor, value: Union[Number, _complex], tensor1: Tensor, tensor2: Tensor) -> Tensor: + r""" + addcdiv(input, tensor1, tensor2, *, value=1, out=None) -> Tensor + + Performs the element-wise division of :attr:`tensor1` by :attr:`tensor2`, + multiplies the result by the scalar :attr:`value` and adds it to :attr:`input`. + + .. warning:: + Integer division with addcdiv is no longer supported, and in a future + release addcdiv will perform a true division of tensor1 and tensor2. + The historic addcdiv behavior can be implemented as + (input + value * torch.trunc(tensor1 / tensor2)).to(input.dtype) + for integer inputs and as (input + value * tensor1 / tensor2) for float inputs. + The future addcdiv behavior is just the latter implementation: + (input + value * tensor1 / tensor2), for all dtypes. + + .. math:: + \text{out}_i = \text{input}_i + \text{value} \times \frac{\text{tensor1}_i}{\text{tensor2}_i} + + + The shapes of :attr:`input`, :attr:`tensor1`, and :attr:`tensor2` must be + :ref:`broadcastable `. 
+ + For inputs of type `FloatTensor` or `DoubleTensor`, :attr:`value` must be + a real number, otherwise an integer. + + Args: + input (Tensor): the tensor to be added + tensor1 (Tensor): the numerator tensor + tensor2 (Tensor): the denominator tensor + + Keyword args: + value (Number, optional): multiplier for :math:`\text{tensor1} / \text{tensor2}` + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.randn(1, 3) + >>> t1 = torch.randn(3, 1) + >>> t2 = torch.randn(1, 3) + >>> torch.addcdiv(t, t1, t2, value=0.1) + tensor([[-0.2312, -3.6496, 0.1312], + [-1.0428, 3.4292, -0.1030], + [-0.5369, -0.9829, 0.0430]]) + """ + ... +@overload +def addcdiv(self: Tensor, value: Union[Number, _complex], tensor1: Tensor, tensor2: Tensor, *, out: Tensor) -> Tensor: + r""" + addcdiv(input, tensor1, tensor2, *, value=1, out=None) -> Tensor + + Performs the element-wise division of :attr:`tensor1` by :attr:`tensor2`, + multiplies the result by the scalar :attr:`value` and adds it to :attr:`input`. + + .. warning:: + Integer division with addcdiv is no longer supported, and in a future + release addcdiv will perform a true division of tensor1 and tensor2. + The historic addcdiv behavior can be implemented as + (input + value * torch.trunc(tensor1 / tensor2)).to(input.dtype) + for integer inputs and as (input + value * tensor1 / tensor2) for float inputs. + The future addcdiv behavior is just the latter implementation: + (input + value * tensor1 / tensor2), for all dtypes. + + .. math:: + \text{out}_i = \text{input}_i + \text{value} \times \frac{\text{tensor1}_i}{\text{tensor2}_i} + + + The shapes of :attr:`input`, :attr:`tensor1`, and :attr:`tensor2` must be + :ref:`broadcastable `. + + For inputs of type `FloatTensor` or `DoubleTensor`, :attr:`value` must be + a real number, otherwise an integer. + + Args: + input (Tensor): the tensor to be added + tensor1 (Tensor): the numerator tensor + tensor2 (Tensor): the denominator tensor + + Keyword args: + value (Number, optional): multiplier for :math:`\text{tensor1} / \text{tensor2}` + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.randn(1, 3) + >>> t1 = torch.randn(3, 1) + >>> t2 = torch.randn(1, 3) + >>> torch.addcdiv(t, t1, t2, value=0.1) + tensor([[-0.2312, -3.6496, 0.1312], + [-1.0428, 3.4292, -0.1030], + [-0.5369, -0.9829, 0.0430]]) + """ + ... +@overload +def addcdiv(input: Tensor, tensor1: Tensor, tensor2: Tensor, *, value: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + addcdiv(input, tensor1, tensor2, *, value=1, out=None) -> Tensor + + Performs the element-wise division of :attr:`tensor1` by :attr:`tensor2`, + multiplies the result by the scalar :attr:`value` and adds it to :attr:`input`. + + .. warning:: + Integer division with addcdiv is no longer supported, and in a future + release addcdiv will perform a true division of tensor1 and tensor2. + The historic addcdiv behavior can be implemented as + (input + value * torch.trunc(tensor1 / tensor2)).to(input.dtype) + for integer inputs and as (input + value * tensor1 / tensor2) for float inputs. + The future addcdiv behavior is just the latter implementation: + (input + value * tensor1 / tensor2), for all dtypes. + + .. math:: + \text{out}_i = \text{input}_i + \text{value} \times \frac{\text{tensor1}_i}{\text{tensor2}_i} + + + The shapes of :attr:`input`, :attr:`tensor1`, and :attr:`tensor2` must be + :ref:`broadcastable `. 
+ + For inputs of type `FloatTensor` or `DoubleTensor`, :attr:`value` must be + a real number, otherwise an integer. + + Args: + input (Tensor): the tensor to be added + tensor1 (Tensor): the numerator tensor + tensor2 (Tensor): the denominator tensor + + Keyword args: + value (Number, optional): multiplier for :math:`\text{tensor1} / \text{tensor2}` + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.randn(1, 3) + >>> t1 = torch.randn(3, 1) + >>> t2 = torch.randn(1, 3) + >>> torch.addcdiv(t, t1, t2, value=0.1) + tensor([[-0.2312, -3.6496, 0.1312], + [-1.0428, 3.4292, -0.1030], + [-0.5369, -0.9829, 0.0430]]) + """ + ... +@overload +def addcmul(self: Tensor, value: Union[Number, _complex], tensor1: Tensor, tensor2: Tensor) -> Tensor: + r""" + addcmul(input, tensor1, tensor2, *, value=1, out=None) -> Tensor + + Performs the element-wise multiplication of :attr:`tensor1` + by :attr:`tensor2`, multiplies the result by the scalar :attr:`value` + and adds it to :attr:`input`. + + .. math:: + \text{out}_i = \text{input}_i + \text{value} \times \text{tensor1}_i \times \text{tensor2}_i + + The shapes of :attr:`tensor`, :attr:`tensor1`, and :attr:`tensor2` must be + :ref:`broadcastable `. + + For inputs of type `FloatTensor` or `DoubleTensor`, :attr:`value` must be + a real number, otherwise an integer. + + Args: + input (Tensor): the tensor to be added + tensor1 (Tensor): the tensor to be multiplied + tensor2 (Tensor): the tensor to be multiplied + + Keyword args: + value (Number, optional): multiplier for :math:`tensor1 .* tensor2` + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.randn(1, 3) + >>> t1 = torch.randn(3, 1) + >>> t2 = torch.randn(1, 3) + >>> torch.addcmul(t, t1, t2, value=0.1) + tensor([[-0.8635, -0.6391, 1.6174], + [-0.7617, -0.5879, 1.7388], + [-0.8353, -0.6249, 1.6511]]) + """ + ... +@overload +def addcmul(self: Tensor, value: Union[Number, _complex], tensor1: Tensor, tensor2: Tensor, *, out: Tensor) -> Tensor: + r""" + addcmul(input, tensor1, tensor2, *, value=1, out=None) -> Tensor + + Performs the element-wise multiplication of :attr:`tensor1` + by :attr:`tensor2`, multiplies the result by the scalar :attr:`value` + and adds it to :attr:`input`. + + .. math:: + \text{out}_i = \text{input}_i + \text{value} \times \text{tensor1}_i \times \text{tensor2}_i + + The shapes of :attr:`tensor`, :attr:`tensor1`, and :attr:`tensor2` must be + :ref:`broadcastable `. + + For inputs of type `FloatTensor` or `DoubleTensor`, :attr:`value` must be + a real number, otherwise an integer. + + Args: + input (Tensor): the tensor to be added + tensor1 (Tensor): the tensor to be multiplied + tensor2 (Tensor): the tensor to be multiplied + + Keyword args: + value (Number, optional): multiplier for :math:`tensor1 .* tensor2` + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.randn(1, 3) + >>> t1 = torch.randn(3, 1) + >>> t2 = torch.randn(1, 3) + >>> torch.addcmul(t, t1, t2, value=0.1) + tensor([[-0.8635, -0.6391, 1.6174], + [-0.7617, -0.5879, 1.7388], + [-0.8353, -0.6249, 1.6511]]) + """ + ... +@overload +def addcmul(input: Tensor, tensor1: Tensor, tensor2: Tensor, *, value: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + addcmul(input, tensor1, tensor2, *, value=1, out=None) -> Tensor + + Performs the element-wise multiplication of :attr:`tensor1` + by :attr:`tensor2`, multiplies the result by the scalar :attr:`value` + and adds it to :attr:`input`. + + .. 
math:: + \text{out}_i = \text{input}_i + \text{value} \times \text{tensor1}_i \times \text{tensor2}_i + + The shapes of :attr:`tensor`, :attr:`tensor1`, and :attr:`tensor2` must be + :ref:`broadcastable `. + + For inputs of type `FloatTensor` or `DoubleTensor`, :attr:`value` must be + a real number, otherwise an integer. + + Args: + input (Tensor): the tensor to be added + tensor1 (Tensor): the tensor to be multiplied + tensor2 (Tensor): the tensor to be multiplied + + Keyword args: + value (Number, optional): multiplier for :math:`tensor1 .* tensor2` + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.randn(1, 3) + >>> t1 = torch.randn(3, 1) + >>> t2 = torch.randn(1, 3) + >>> torch.addcmul(t, t1, t2, value=0.1) + tensor([[-0.8635, -0.6391, 1.6174], + [-0.7617, -0.5879, 1.7388], + [-0.8353, -0.6249, 1.6511]]) + """ + ... +@overload +def addmm(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], mat1: Tensor, mat2: Tensor) -> Tensor: + r""" + addmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a matrix multiplication of the matrices :attr:`mat1` and :attr:`mat2`. + The matrix :attr:`input` is added to the final result. + + If :attr:`mat1` is a :math:`(n \times m)` tensor, :attr:`mat2` is a + :math:`(m \times p)` tensor, then :attr:`input` must be + :ref:`broadcastable ` with a :math:`(n \times p)` tensor + and :attr:`out` will be a :math:`(n \times p)` tensor. + + :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between + :attr:`mat1` and :attr:`mat2` and the added matrix :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{mat1}_i \mathbin{@} \text{mat2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + This operation has support for arguments with :ref:`sparse layouts`. If + :attr:`input` is sparse the result will have the same layout and if :attr:`out` + is provided it must have the same layout as :attr:`input`. + + + .. warning:: + Sparse support is a beta feature and some layout(s)/dtype/device combinations may not be supported, + or may not have autograd support. If you notice missing functionality please + open a feature request. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + input (Tensor): matrix to be added + mat1 (Tensor): the first matrix to be matrix multiplied + mat2 (Tensor): the second matrix to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(2, 3) + >>> mat1 = torch.randn(2, 3) + >>> mat2 = torch.randn(3, 3) + >>> torch.addmm(M, mat1, mat2) + tensor([[-4.8716, 1.4671, -1.3746], + [ 0.7573, -3.9555, -2.8681]]) + """ + ... +@overload +def addmm(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], mat1: Tensor, mat2: Tensor, *, out: Tensor) -> Tensor: + r""" + addmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a matrix multiplication of the matrices :attr:`mat1` and :attr:`mat2`. 
+ The matrix :attr:`input` is added to the final result. + + If :attr:`mat1` is a :math:`(n \times m)` tensor, :attr:`mat2` is a + :math:`(m \times p)` tensor, then :attr:`input` must be + :ref:`broadcastable ` with a :math:`(n \times p)` tensor + and :attr:`out` will be a :math:`(n \times p)` tensor. + + :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between + :attr:`mat1` and :attr:`mat2` and the added matrix :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{mat1}_i \mathbin{@} \text{mat2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + This operation has support for arguments with :ref:`sparse layouts`. If + :attr:`input` is sparse the result will have the same layout and if :attr:`out` + is provided it must have the same layout as :attr:`input`. + + + .. warning:: + Sparse support is a beta feature and some layout(s)/dtype/device combinations may not be supported, + or may not have autograd support. If you notice missing functionality please + open a feature request. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + input (Tensor): matrix to be added + mat1 (Tensor): the first matrix to be matrix multiplied + mat2 (Tensor): the second matrix to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(2, 3) + >>> mat1 = torch.randn(2, 3) + >>> mat2 = torch.randn(3, 3) + >>> torch.addmm(M, mat1, mat2) + tensor([[-4.8716, 1.4671, -1.3746], + [ 0.7573, -3.9555, -2.8681]]) + """ + ... +@overload +def addmm(input: Tensor, mat1: Tensor, mat2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + addmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a matrix multiplication of the matrices :attr:`mat1` and :attr:`mat2`. + The matrix :attr:`input` is added to the final result. + + If :attr:`mat1` is a :math:`(n \times m)` tensor, :attr:`mat2` is a + :math:`(m \times p)` tensor, then :attr:`input` must be + :ref:`broadcastable ` with a :math:`(n \times p)` tensor + and :attr:`out` will be a :math:`(n \times p)` tensor. + + :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between + :attr:`mat1` and :attr:`mat2` and the added matrix :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{mat1}_i \mathbin{@} \text{mat2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + This operation has support for arguments with :ref:`sparse layouts`. If + :attr:`input` is sparse the result will have the same layout and if :attr:`out` + is provided it must have the same layout as :attr:`input`. + + + .. 
warning:: + Sparse support is a beta feature and some layout(s)/dtype/device combinations may not be supported, + or may not have autograd support. If you notice missing functionality please + open a feature request. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + input (Tensor): matrix to be added + mat1 (Tensor): the first matrix to be matrix multiplied + mat2 (Tensor): the second matrix to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(2, 3) + >>> mat1 = torch.randn(2, 3) + >>> mat2 = torch.randn(3, 3) + >>> torch.addmm(M, mat1, mat2) + tensor([[-4.8716, 1.4671, -1.3746], + [ 0.7573, -3.9555, -2.8681]]) + """ + ... +@overload +def addmm(beta: Union[Number, _complex], self: Tensor, mat1: Tensor, mat2: Tensor) -> Tensor: + r""" + addmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a matrix multiplication of the matrices :attr:`mat1` and :attr:`mat2`. + The matrix :attr:`input` is added to the final result. + + If :attr:`mat1` is a :math:`(n \times m)` tensor, :attr:`mat2` is a + :math:`(m \times p)` tensor, then :attr:`input` must be + :ref:`broadcastable ` with a :math:`(n \times p)` tensor + and :attr:`out` will be a :math:`(n \times p)` tensor. + + :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between + :attr:`mat1` and :attr:`mat2` and the added matrix :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{mat1}_i \mathbin{@} \text{mat2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + This operation has support for arguments with :ref:`sparse layouts`. If + :attr:`input` is sparse the result will have the same layout and if :attr:`out` + is provided it must have the same layout as :attr:`input`. + + + .. warning:: + Sparse support is a beta feature and some layout(s)/dtype/device combinations may not be supported, + or may not have autograd support. If you notice missing functionality please + open a feature request. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + input (Tensor): matrix to be added + mat1 (Tensor): the first matrix to be matrix multiplied + mat2 (Tensor): the second matrix to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(2, 3) + >>> mat1 = torch.randn(2, 3) + >>> mat2 = torch.randn(3, 3) + >>> torch.addmm(M, mat1, mat2) + tensor([[-4.8716, 1.4671, -1.3746], + [ 0.7573, -3.9555, -2.8681]]) + """ + ... 
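+# A minimal keyword-style torch.addmm call matching the documented signature
+# above (values chosen for illustration only; beta=0 drops the `input` term,
+# as the docstring notes):
+#
+#     import torch
+#     M = torch.zeros(2, 2)
+#     a = torch.eye(2)
+#     b = torch.ones(2, 2)
+#     torch.addmm(M, a, b, beta=0.0, alpha=2.0)     # == 2 * (a @ b)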
+@overload +def addmm(beta: Union[Number, _complex], self: Tensor, mat1: Tensor, mat2: Tensor, *, out: Tensor) -> Tensor: + r""" + addmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a matrix multiplication of the matrices :attr:`mat1` and :attr:`mat2`. + The matrix :attr:`input` is added to the final result. + + If :attr:`mat1` is a :math:`(n \times m)` tensor, :attr:`mat2` is a + :math:`(m \times p)` tensor, then :attr:`input` must be + :ref:`broadcastable ` with a :math:`(n \times p)` tensor + and :attr:`out` will be a :math:`(n \times p)` tensor. + + :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between + :attr:`mat1` and :attr:`mat2` and the added matrix :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{mat1}_i \mathbin{@} \text{mat2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + This operation has support for arguments with :ref:`sparse layouts`. If + :attr:`input` is sparse the result will have the same layout and if :attr:`out` + is provided it must have the same layout as :attr:`input`. + + + .. warning:: + Sparse support is a beta feature and some layout(s)/dtype/device combinations may not be supported, + or may not have autograd support. If you notice missing functionality please + open a feature request. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + input (Tensor): matrix to be added + mat1 (Tensor): the first matrix to be matrix multiplied + mat2 (Tensor): the second matrix to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(2, 3) + >>> mat1 = torch.randn(2, 3) + >>> mat2 = torch.randn(3, 3) + >>> torch.addmm(M, mat1, mat2) + tensor([[-4.8716, 1.4671, -1.3746], + [ 0.7573, -3.9555, -2.8681]]) + """ + ... +@overload +def addmv(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], mat: Tensor, vec: Tensor) -> Tensor: + r""" + addmv(input, mat, vec, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a matrix-vector product of the matrix :attr:`mat` and + the vector :attr:`vec`. + The vector :attr:`input` is added to the final result. + + If :attr:`mat` is a :math:`(n \times m)` tensor, :attr:`vec` is a 1-D tensor of + size `m`, then :attr:`input` must be + :ref:`broadcastable ` with a 1-D tensor of size `n` and + :attr:`out` will be 1-D tensor of size `n`. + + :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between + :attr:`mat` and :attr:`vec` and the added tensor :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{mat} \mathbin{@} \text{vec}) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. 
+ + Args: + input (Tensor): vector to be added + mat (Tensor): matrix to be matrix multiplied + vec (Tensor): vector to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat @ vec` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(2) + >>> mat = torch.randn(2, 3) + >>> vec = torch.randn(3) + >>> torch.addmv(M, mat, vec) + tensor([-0.3768, -5.5565]) + """ + ... +@overload +def addmv(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], mat: Tensor, vec: Tensor, *, out: Tensor) -> Tensor: + r""" + addmv(input, mat, vec, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a matrix-vector product of the matrix :attr:`mat` and + the vector :attr:`vec`. + The vector :attr:`input` is added to the final result. + + If :attr:`mat` is a :math:`(n \times m)` tensor, :attr:`vec` is a 1-D tensor of + size `m`, then :attr:`input` must be + :ref:`broadcastable ` with a 1-D tensor of size `n` and + :attr:`out` will be 1-D tensor of size `n`. + + :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between + :attr:`mat` and :attr:`vec` and the added tensor :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{mat} \mathbin{@} \text{vec}) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + Args: + input (Tensor): vector to be added + mat (Tensor): matrix to be matrix multiplied + vec (Tensor): vector to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat @ vec` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(2) + >>> mat = torch.randn(2, 3) + >>> vec = torch.randn(3) + >>> torch.addmv(M, mat, vec) + tensor([-0.3768, -5.5565]) + """ + ... +@overload +def addmv(input: Tensor, mat: Tensor, vec: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + addmv(input, mat, vec, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a matrix-vector product of the matrix :attr:`mat` and + the vector :attr:`vec`. + The vector :attr:`input` is added to the final result. + + If :attr:`mat` is a :math:`(n \times m)` tensor, :attr:`vec` is a 1-D tensor of + size `m`, then :attr:`input` must be + :ref:`broadcastable ` with a 1-D tensor of size `n` and + :attr:`out` will be 1-D tensor of size `n`. + + :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between + :attr:`mat` and :attr:`vec` and the added tensor :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{mat} \mathbin{@} \text{vec}) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. 
+ + Args: + input (Tensor): vector to be added + mat (Tensor): matrix to be matrix multiplied + vec (Tensor): vector to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat @ vec` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(2) + >>> mat = torch.randn(2, 3) + >>> vec = torch.randn(3) + >>> torch.addmv(M, mat, vec) + tensor([-0.3768, -5.5565]) + """ + ... +@overload +def addmv(beta: Union[Number, _complex], self: Tensor, mat: Tensor, vec: Tensor) -> Tensor: + r""" + addmv(input, mat, vec, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a matrix-vector product of the matrix :attr:`mat` and + the vector :attr:`vec`. + The vector :attr:`input` is added to the final result. + + If :attr:`mat` is a :math:`(n \times m)` tensor, :attr:`vec` is a 1-D tensor of + size `m`, then :attr:`input` must be + :ref:`broadcastable ` with a 1-D tensor of size `n` and + :attr:`out` will be 1-D tensor of size `n`. + + :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between + :attr:`mat` and :attr:`vec` and the added tensor :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{mat} \mathbin{@} \text{vec}) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + Args: + input (Tensor): vector to be added + mat (Tensor): matrix to be matrix multiplied + vec (Tensor): vector to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat @ vec` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(2) + >>> mat = torch.randn(2, 3) + >>> vec = torch.randn(3) + >>> torch.addmv(M, mat, vec) + tensor([-0.3768, -5.5565]) + """ + ... +@overload +def addmv(beta: Union[Number, _complex], self: Tensor, mat: Tensor, vec: Tensor, *, out: Tensor) -> Tensor: + r""" + addmv(input, mat, vec, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a matrix-vector product of the matrix :attr:`mat` and + the vector :attr:`vec`. + The vector :attr:`input` is added to the final result. + + If :attr:`mat` is a :math:`(n \times m)` tensor, :attr:`vec` is a 1-D tensor of + size `m`, then :attr:`input` must be + :ref:`broadcastable ` with a 1-D tensor of size `n` and + :attr:`out` will be 1-D tensor of size `n`. + + :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between + :attr:`mat` and :attr:`vec` and the added tensor :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{mat} \mathbin{@} \text{vec}) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. 
+ + Args: + input (Tensor): vector to be added + mat (Tensor): matrix to be matrix multiplied + vec (Tensor): vector to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat @ vec` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(2) + >>> mat = torch.randn(2, 3) + >>> vec = torch.randn(3) + >>> torch.addmv(M, mat, vec) + tensor([-0.3768, -5.5565]) + """ + ... +@overload +def addmv_(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], mat: Tensor, vec: Tensor) -> Tensor: ... +@overload +def addmv_(input: Tensor, mat: Tensor, vec: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1) -> Tensor: ... +@overload +def addmv_(beta: Union[Number, _complex], self: Tensor, mat: Tensor, vec: Tensor) -> Tensor: ... +@overload +def addr(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], vec1: Tensor, vec2: Tensor) -> Tensor: + r""" + addr(input, vec1, vec2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs the outer-product of vectors :attr:`vec1` and :attr:`vec2` + and adds it to the matrix :attr:`input`. + + Optional values :attr:`beta` and :attr:`alpha` are scaling factors on the + outer product between :attr:`vec1` and :attr:`vec2` and the added matrix + :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{vec1} \otimes \text{vec2}) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + If :attr:`vec1` is a vector of size `n` and :attr:`vec2` is a vector + of size `m`, then :attr:`input` must be + :ref:`broadcastable ` with a matrix of size + :math:`(n \times m)` and :attr:`out` will be a matrix of size + :math:`(n \times m)`. + + Args: + input (Tensor): matrix to be added + vec1 (Tensor): the first vector of the outer product + vec2 (Tensor): the second vector of the outer product + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`\text{vec1} \otimes \text{vec2}` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> vec1 = torch.arange(1., 4.) + >>> vec2 = torch.arange(1., 3.) + >>> M = torch.zeros(3, 2) + >>> torch.addr(M, vec1, vec2) + tensor([[ 1., 2.], + [ 2., 4.], + [ 3., 6.]]) + """ + ... +@overload +def addr(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], vec1: Tensor, vec2: Tensor, *, out: Tensor) -> Tensor: + r""" + addr(input, vec1, vec2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs the outer-product of vectors :attr:`vec1` and :attr:`vec2` + and adds it to the matrix :attr:`input`. + + Optional values :attr:`beta` and :attr:`alpha` are scaling factors on the + outer product between :attr:`vec1` and :attr:`vec2` and the added matrix + :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{vec1} \otimes \text{vec2}) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + If :attr:`vec1` is a vector of size `n` and :attr:`vec2` is a vector + of size `m`, then :attr:`input` must be + :ref:`broadcastable ` with a matrix of size + :math:`(n \times m)` and :attr:`out` will be a matrix of size + :math:`(n \times m)`. 
+ + Args: + input (Tensor): matrix to be added + vec1 (Tensor): the first vector of the outer product + vec2 (Tensor): the second vector of the outer product + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`\text{vec1} \otimes \text{vec2}` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> vec1 = torch.arange(1., 4.) + >>> vec2 = torch.arange(1., 3.) + >>> M = torch.zeros(3, 2) + >>> torch.addr(M, vec1, vec2) + tensor([[ 1., 2.], + [ 2., 4.], + [ 3., 6.]]) + """ + ... +@overload +def addr(input: Tensor, vec1: Tensor, vec2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + addr(input, vec1, vec2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs the outer-product of vectors :attr:`vec1` and :attr:`vec2` + and adds it to the matrix :attr:`input`. + + Optional values :attr:`beta` and :attr:`alpha` are scaling factors on the + outer product between :attr:`vec1` and :attr:`vec2` and the added matrix + :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{vec1} \otimes \text{vec2}) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + If :attr:`vec1` is a vector of size `n` and :attr:`vec2` is a vector + of size `m`, then :attr:`input` must be + :ref:`broadcastable ` with a matrix of size + :math:`(n \times m)` and :attr:`out` will be a matrix of size + :math:`(n \times m)`. + + Args: + input (Tensor): matrix to be added + vec1 (Tensor): the first vector of the outer product + vec2 (Tensor): the second vector of the outer product + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`\text{vec1} \otimes \text{vec2}` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> vec1 = torch.arange(1., 4.) + >>> vec2 = torch.arange(1., 3.) + >>> M = torch.zeros(3, 2) + >>> torch.addr(M, vec1, vec2) + tensor([[ 1., 2.], + [ 2., 4.], + [ 3., 6.]]) + """ + ... +@overload +def addr(beta: Union[Number, _complex], self: Tensor, vec1: Tensor, vec2: Tensor) -> Tensor: + r""" + addr(input, vec1, vec2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs the outer-product of vectors :attr:`vec1` and :attr:`vec2` + and adds it to the matrix :attr:`input`. + + Optional values :attr:`beta` and :attr:`alpha` are scaling factors on the + outer product between :attr:`vec1` and :attr:`vec2` and the added matrix + :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{vec1} \otimes \text{vec2}) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + If :attr:`vec1` is a vector of size `n` and :attr:`vec2` is a vector + of size `m`, then :attr:`input` must be + :ref:`broadcastable ` with a matrix of size + :math:`(n \times m)` and :attr:`out` will be a matrix of size + :math:`(n \times m)`. + + Args: + input (Tensor): matrix to be added + vec1 (Tensor): the first vector of the outer product + vec2 (Tensor): the second vector of the outer product + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`\text{vec1} \otimes \text{vec2}` (:math:`\alpha`) + out (Tensor, optional): the output tensor. 
+ + Example:: + + >>> vec1 = torch.arange(1., 4.) + >>> vec2 = torch.arange(1., 3.) + >>> M = torch.zeros(3, 2) + >>> torch.addr(M, vec1, vec2) + tensor([[ 1., 2.], + [ 2., 4.], + [ 3., 6.]]) + """ + ... +@overload +def addr(beta: Union[Number, _complex], self: Tensor, vec1: Tensor, vec2: Tensor, *, out: Tensor) -> Tensor: + r""" + addr(input, vec1, vec2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs the outer-product of vectors :attr:`vec1` and :attr:`vec2` + and adds it to the matrix :attr:`input`. + + Optional values :attr:`beta` and :attr:`alpha` are scaling factors on the + outer product between :attr:`vec1` and :attr:`vec2` and the added matrix + :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{vec1} \otimes \text{vec2}) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + If :attr:`vec1` is a vector of size `n` and :attr:`vec2` is a vector + of size `m`, then :attr:`input` must be + :ref:`broadcastable ` with a matrix of size + :math:`(n \times m)` and :attr:`out` will be a matrix of size + :math:`(n \times m)`. + + Args: + input (Tensor): matrix to be added + vec1 (Tensor): the first vector of the outer product + vec2 (Tensor): the second vector of the outer product + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`\text{vec1} \otimes \text{vec2}` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> vec1 = torch.arange(1., 4.) + >>> vec2 = torch.arange(1., 3.) + >>> M = torch.zeros(3, 2) + >>> torch.addr(M, vec1, vec2) + tensor([[ 1., 2.], + [ 2., 4.], + [ 3., 6.]]) + """ + ... +def adjoint(input: Tensor) -> Tensor: + r""" + adjoint(Tensor) -> Tensor + Returns a view of the tensor conjugated and with the last two dimensions transposed. + + ``x.adjoint()`` is equivalent to ``x.transpose(-2, -1).conj()`` for complex tensors and + to ``x.transpose(-2, -1)`` for real tensors. + + Example:: + >>> x = torch.arange(4, dtype=torch.float) + >>> A = torch.complex(x, x).reshape(2, 2) + >>> A + tensor([[0.+0.j, 1.+1.j], + [2.+2.j, 3.+3.j]]) + >>> A.adjoint() + tensor([[0.-0.j, 2.-2.j], + [1.-1.j, 3.-3.j]]) + >>> (A.adjoint() == A.mH).all() + tensor(True) + """ + ... +def affine_grid_generator(theta: Tensor, size: Sequence[Union[_int, SymInt]], align_corners: _bool) -> Tensor: ... +def alias_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.alias`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +@overload +def all(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + all(input) -> Tensor + + Tests if all elements in :attr:`input` evaluate to `True`. + + .. note:: This function matches the behaviour of NumPy in returning + output of dtype `bool` for all supported dtypes except `uint8`. + For `uint8` the dtype of output is `uint8` itself. + + Example:: + + >>> a = torch.rand(1, 2).bool() + >>> a + tensor([[False, True]], dtype=torch.bool) + >>> torch.all(a) + tensor(False, dtype=torch.bool) + >>> a = torch.arange(0, 3) + >>> a + tensor([0, 1, 2]) + >>> torch.all(a) + tensor(False) + + .. function:: all(input, dim, keepdim=False, *, out=None) -> Tensor + :noindex: + + For each row of :attr:`input` in the given dimension :attr:`dim`, + returns `True` if all elements in the row evaluate to `True` and `False` otherwise. 
+ + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.rand(4, 2).bool() + >>> a + tensor([[True, True], + [True, False], + [True, True], + [True, True]], dtype=torch.bool) + >>> torch.all(a, dim=1) + tensor([ True, False, True, True], dtype=torch.bool) + >>> torch.all(a, dim=0) + tensor([ True, False], dtype=torch.bool) + """ + ... +@overload +def all(input: Tensor, dim: Optional[_size] = None, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + all(input) -> Tensor + + Tests if all elements in :attr:`input` evaluate to `True`. + + .. note:: This function matches the behaviour of NumPy in returning + output of dtype `bool` for all supported dtypes except `uint8`. + For `uint8` the dtype of output is `uint8` itself. + + Example:: + + >>> a = torch.rand(1, 2).bool() + >>> a + tensor([[False, True]], dtype=torch.bool) + >>> torch.all(a) + tensor(False, dtype=torch.bool) + >>> a = torch.arange(0, 3) + >>> a + tensor([0, 1, 2]) + >>> torch.all(a) + tensor(False) + + .. function:: all(input, dim, keepdim=False, *, out=None) -> Tensor + :noindex: + + For each row of :attr:`input` in the given dimension :attr:`dim`, + returns `True` if all elements in the row evaluate to `True` and `False` otherwise. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.rand(4, 2).bool() + >>> a + tensor([[True, True], + [True, False], + [True, True], + [True, True]], dtype=torch.bool) + >>> torch.all(a, dim=1) + tensor([ True, False, True, True], dtype=torch.bool) + >>> torch.all(a, dim=0) + tensor([ True, False], dtype=torch.bool) + """ + ... +@overload +def all(input: Tensor, dim: _int, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + all(input) -> Tensor + + Tests if all elements in :attr:`input` evaluate to `True`. + + .. note:: This function matches the behaviour of NumPy in returning + output of dtype `bool` for all supported dtypes except `uint8`. + For `uint8` the dtype of output is `uint8` itself. + + Example:: + + >>> a = torch.rand(1, 2).bool() + >>> a + tensor([[False, True]], dtype=torch.bool) + >>> torch.all(a) + tensor(False, dtype=torch.bool) + >>> a = torch.arange(0, 3) + >>> a + tensor([0, 1, 2]) + >>> torch.all(a) + tensor(False) + + .. function:: all(input, dim, keepdim=False, *, out=None) -> Tensor + :noindex: + + For each row of :attr:`input` in the given dimension :attr:`dim`, + returns `True` if all elements in the row evaluate to `True` and `False` otherwise. 
+ + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.rand(4, 2).bool() + >>> a + tensor([[True, True], + [True, False], + [True, True], + [True, True]], dtype=torch.bool) + >>> torch.all(a, dim=1) + tensor([ True, False, True, True], dtype=torch.bool) + >>> torch.all(a, dim=0) + tensor([ True, False], dtype=torch.bool) + """ + ... +@overload +def all(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + all(input) -> Tensor + + Tests if all elements in :attr:`input` evaluate to `True`. + + .. note:: This function matches the behaviour of NumPy in returning + output of dtype `bool` for all supported dtypes except `uint8`. + For `uint8` the dtype of output is `uint8` itself. + + Example:: + + >>> a = torch.rand(1, 2).bool() + >>> a + tensor([[False, True]], dtype=torch.bool) + >>> torch.all(a) + tensor(False, dtype=torch.bool) + >>> a = torch.arange(0, 3) + >>> a + tensor([0, 1, 2]) + >>> torch.all(a) + tensor(False) + + .. function:: all(input, dim, keepdim=False, *, out=None) -> Tensor + :noindex: + + For each row of :attr:`input` in the given dimension :attr:`dim`, + returns `True` if all elements in the row evaluate to `True` and `False` otherwise. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.rand(4, 2).bool() + >>> a + tensor([[True, True], + [True, False], + [True, True], + [True, True]], dtype=torch.bool) + >>> torch.all(a, dim=1) + tensor([ True, False, True, True], dtype=torch.bool) + >>> torch.all(a, dim=0) + tensor([ True, False], dtype=torch.bool) + """ + ... +def allclose(input: Tensor, other: Tensor, rtol: _float = 1e-05, atol: _float = 1e-08, equal_nan: _bool = False) -> _bool: + r""" + allclose(input, other, rtol=1e-05, atol=1e-08, equal_nan=False) -> bool + + This function checks if :attr:`input` and :attr:`other` satisfy the condition: + + .. math:: + \lvert \text{input} - \text{other} \rvert \leq \texttt{atol} + \texttt{rtol} \times \lvert \text{other} \rvert + + elementwise, for all elements of :attr:`input` and :attr:`other`. The behaviour of this function is analogous to + `numpy.allclose `_ + + Args: + input (Tensor): first tensor to compare + other (Tensor): second tensor to compare + atol (float, optional): absolute tolerance. Default: 1e-08 + rtol (float, optional): relative tolerance. Default: 1e-05 + equal_nan (bool, optional): if ``True``, then two ``NaN`` s will be considered equal. 
Default: ``False`` + + Example:: + + >>> torch.allclose(torch.tensor([10000., 1e-07]), torch.tensor([10000.1, 1e-08])) + False + >>> torch.allclose(torch.tensor([10000., 1e-08]), torch.tensor([10000.1, 1e-09])) + True + >>> torch.allclose(torch.tensor([1.0, float('nan')]), torch.tensor([1.0, float('nan')])) + False + >>> torch.allclose(torch.tensor([1.0, float('nan')]), torch.tensor([1.0, float('nan')]), equal_nan=True) + True + """ + ... +def alpha_dropout(input: Tensor, p: _float, train: _bool) -> Tensor: ... +def alpha_dropout_(input: Tensor, p: _float, train: _bool) -> Tensor: ... +def amax(input: Tensor, dim: Union[_int, _size] = (), keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + amax(input, dim, keepdim=False, *, out=None) -> Tensor + + Returns the maximum value of each slice of the :attr:`input` tensor in the given + dimension(s) :attr:`dim`. + + .. note:: + The difference between ``max``/``min`` and ``amax``/``amin`` is: + - ``amax``/``amin`` supports reducing on multiple dimensions, + - ``amax``/``amin`` does not return indices, + - ``amax``/``amin`` evenly distributes gradient between equal values, + while ``max(dim)``/``min(dim)`` propagates gradient only to a single + index in the source tensor. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.8177, 1.4878, -0.2491, 0.9130], + [-0.7158, 1.1775, 2.0992, 0.4817], + [-0.0053, 0.0164, -1.3738, -0.0507], + [ 1.9700, 1.1106, -1.0318, -1.0816]]) + >>> torch.amax(a, 1) + tensor([1.4878, 2.0992, 0.0164, 1.9700]) + """ + ... +def amin(input: Tensor, dim: Union[_int, _size] = (), keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + amin(input, dim, keepdim=False, *, out=None) -> Tensor + + Returns the minimum value of each slice of the :attr:`input` tensor in the given + dimension(s) :attr:`dim`. + + .. note:: + The difference between ``max``/``min`` and ``amax``/``amin`` is: + - ``amax``/``amin`` supports reducing on multiple dimensions, + - ``amax``/``amin`` does not return indices, + - ``amax``/``amin`` evenly distributes gradient between equal values, + while ``max(dim)``/``min(dim)`` propagates gradient only to a single + index in the source tensor. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. 
+ + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.6451, -0.4866, 0.2987, -1.3312], + [-0.5744, 1.2980, 1.8397, -0.2713], + [ 0.9128, 0.9214, -1.7268, -0.2995], + [ 0.9023, 0.4853, 0.9075, -1.6165]]) + >>> torch.amin(a, 1) + tensor([-1.3312, -0.5744, -1.7268, -1.6165]) + """ + ... +def aminmax(input: Tensor, *, dim: Optional[_int] = None, keepdim: _bool = False, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.aminmax: + r""" + aminmax(input, *, dim=None, keepdim=False, out=None) -> (Tensor min, Tensor max) + + Computes the minimum and maximum values of the :attr:`input` tensor. + + Args: + input (Tensor): + The input tensor + + Keyword Args: + dim (Optional[int]): + The dimension along which to compute the values. If `None`, + computes the values over the entire :attr:`input` tensor. + Default is `None`. + keepdim (bool): + If `True`, the reduced dimensions will be kept in the output + tensor as dimensions with size 1 for broadcasting, otherwise + they will be removed, as if calling (:func:`torch.squeeze`). + Default is `False`. + out (Optional[Tuple[Tensor, Tensor]]): + Optional tensors on which to write the result. Must have the same + shape and dtype as the expected output. + Default is `None`. + + Returns: + A named tuple `(min, max)` containing the minimum and maximum values. + + Raises: + RuntimeError + If any of the dimensions to compute the values over has size 0. + + .. note:: + NaN values are propagated to the output if at least one value is NaN. + + .. seealso:: + :func:`torch.amin` computes just the minimum value + :func:`torch.amax` computes just the maximum value + + Example:: + + >>> torch.aminmax(torch.tensor([1, -3, 5])) + torch.return_types.aminmax( + min=tensor(-3), + max=tensor(5)) + + >>> # aminmax propagates NaNs + >>> torch.aminmax(torch.tensor([1, -3, 5, torch.nan])) + torch.return_types.aminmax( + min=tensor(nan), + max=tensor(nan)) + + >>> t = torch.arange(10).view(2, 5) + >>> t + tensor([[0, 1, 2, 3, 4], + [5, 6, 7, 8, 9]]) + >>> t.aminmax(dim=0, keepdim=True) + torch.return_types.aminmax( + min=tensor([[0, 1, 2, 3, 4]]), + max=tensor([[5, 6, 7, 8, 9]])) + """ + ... +def angle(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + angle(input, *, out=None) -> Tensor + + Computes the element-wise angle (in radians) of the given :attr:`input` tensor. + + .. math:: + \text{out}_{i} = angle(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + .. note:: Starting in PyTorch 1.8, angle returns pi for negative real numbers, + zero for non-negative real numbers, and propagates NaNs. Previously + the function would return zero for all real numbers and not propagate + floating-point NaNs. + + Example:: + + >>> torch.angle(torch.tensor([-1 + 1j, -2 + 2j, 3 - 3j]))*180/3.14159 + tensor([ 135., 135, -45]) + """ + ... +@overload +def any(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + any(input) -> Tensor + + Tests if any element in :attr:`input` evaluates to `True`. + + .. note:: This function matches the behaviour of NumPy in returning + output of dtype `bool` for all supported dtypes except `uint8`. + For `uint8` the dtype of output is `uint8` itself. + + Example:: + + >>> a = torch.rand(1, 2).bool() + >>> a + tensor([[False, True]], dtype=torch.bool) + >>> torch.any(a) + tensor(True, dtype=torch.bool) + >>> a = torch.arange(0, 3) + >>> a + tensor([0, 1, 2]) + >>> torch.any(a) + tensor(True) + + .. 
function:: any(input, dim, keepdim=False, *, out=None) -> Tensor + :noindex: + + For each row of :attr:`input` in the given dimension :attr:`dim`, + returns `True` if any element in the row evaluate to `True` and `False` otherwise. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4, 2) < 0 + >>> a + tensor([[ True, True], + [False, True], + [ True, True], + [False, False]]) + >>> torch.any(a, 1) + tensor([ True, True, True, False]) + >>> torch.any(a, 0) + tensor([True, True]) + """ + ... +@overload +def any(input: Tensor, dim: Optional[_size] = None, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + any(input) -> Tensor + + Tests if any element in :attr:`input` evaluates to `True`. + + .. note:: This function matches the behaviour of NumPy in returning + output of dtype `bool` for all supported dtypes except `uint8`. + For `uint8` the dtype of output is `uint8` itself. + + Example:: + + >>> a = torch.rand(1, 2).bool() + >>> a + tensor([[False, True]], dtype=torch.bool) + >>> torch.any(a) + tensor(True, dtype=torch.bool) + >>> a = torch.arange(0, 3) + >>> a + tensor([0, 1, 2]) + >>> torch.any(a) + tensor(True) + + .. function:: any(input, dim, keepdim=False, *, out=None) -> Tensor + :noindex: + + For each row of :attr:`input` in the given dimension :attr:`dim`, + returns `True` if any element in the row evaluate to `True` and `False` otherwise. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4, 2) < 0 + >>> a + tensor([[ True, True], + [False, True], + [ True, True], + [False, False]]) + >>> torch.any(a, 1) + tensor([ True, True, True, False]) + >>> torch.any(a, 0) + tensor([True, True]) + """ + ... +@overload +def any(input: Tensor, dim: _int, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + any(input) -> Tensor + + Tests if any element in :attr:`input` evaluates to `True`. + + .. note:: This function matches the behaviour of NumPy in returning + output of dtype `bool` for all supported dtypes except `uint8`. + For `uint8` the dtype of output is `uint8` itself. + + Example:: + + >>> a = torch.rand(1, 2).bool() + >>> a + tensor([[False, True]], dtype=torch.bool) + >>> torch.any(a) + tensor(True, dtype=torch.bool) + >>> a = torch.arange(0, 3) + >>> a + tensor([0, 1, 2]) + >>> torch.any(a) + tensor(True) + + .. 
function:: any(input, dim, keepdim=False, *, out=None) -> Tensor + :noindex: + + For each row of :attr:`input` in the given dimension :attr:`dim`, + returns `True` if any element in the row evaluate to `True` and `False` otherwise. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4, 2) < 0 + >>> a + tensor([[ True, True], + [False, True], + [ True, True], + [False, False]]) + >>> torch.any(a, 1) + tensor([ True, True, True, False]) + >>> torch.any(a, 0) + tensor([True, True]) + """ + ... +@overload +def any(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + any(input) -> Tensor + + Tests if any element in :attr:`input` evaluates to `True`. + + .. note:: This function matches the behaviour of NumPy in returning + output of dtype `bool` for all supported dtypes except `uint8`. + For `uint8` the dtype of output is `uint8` itself. + + Example:: + + >>> a = torch.rand(1, 2).bool() + >>> a + tensor([[False, True]], dtype=torch.bool) + >>> torch.any(a) + tensor(True, dtype=torch.bool) + >>> a = torch.arange(0, 3) + >>> a + tensor([0, 1, 2]) + >>> torch.any(a) + tensor(True) + + .. function:: any(input, dim, keepdim=False, *, out=None) -> Tensor + :noindex: + + For each row of :attr:`input` in the given dimension :attr:`dim`, + returns `True` if any element in the row evaluate to `True` and `False` otherwise. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4, 2) < 0 + >>> a + tensor([[ True, True], + [False, True], + [ True, True], + [False, False]]) + >>> torch.any(a, 1) + tensor([ True, True, True, False]) + >>> torch.any(a, 0) + tensor([True, True]) + """ + ... +@overload +def arange(start: Number, end: Number, step: Number, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + arange(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a 1-D tensor of size :math:`\left\lceil \frac{\text{end} - \text{start}}{\text{step}} \right\rceil` + with values from the interval ``[start, end)`` taken with common difference + :attr:`step` beginning from `start`. 
+ + Note that non-integer :attr:`step` is subject to floating point rounding errors when + comparing against :attr:`end`; to avoid inconsistency, we advise subtracting a small epsilon from :attr:`end` + in such cases. + + .. math:: + \text{out}_{{i+1}} = \text{out}_{i} + \text{step} + + Args: + start (Number): the starting value for the set of points. Default: ``0``. + end (Number): the ending value for the set of points + step (Number): the gap between each pair of adjacent points. Default: ``1``. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). If `dtype` is not given, infer the data type from the other input + arguments. If any of `start`, `end`, or `stop` are floating-point, the + `dtype` is inferred to be the default dtype, see + :meth:`~torch.get_default_dtype`. Otherwise, the `dtype` is inferred to + be `torch.int64`. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.arange(5) + tensor([ 0, 1, 2, 3, 4]) + >>> torch.arange(1, 4) + tensor([ 1, 2, 3]) + >>> torch.arange(1, 2.5, 0.5) + tensor([ 1.0000, 1.5000, 2.0000]) + """ + ... +@overload +def arange(start: Number, end: Number, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + arange(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a 1-D tensor of size :math:`\left\lceil \frac{\text{end} - \text{start}}{\text{step}} \right\rceil` + with values from the interval ``[start, end)`` taken with common difference + :attr:`step` beginning from `start`. + + Note that non-integer :attr:`step` is subject to floating point rounding errors when + comparing against :attr:`end`; to avoid inconsistency, we advise subtracting a small epsilon from :attr:`end` + in such cases. + + .. math:: + \text{out}_{{i+1}} = \text{out}_{i} + \text{step} + + Args: + start (Number): the starting value for the set of points. Default: ``0``. + end (Number): the ending value for the set of points + step (Number): the gap between each pair of adjacent points. Default: ``1``. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). If `dtype` is not given, infer the data type from the other input + arguments. If any of `start`, `end`, or `stop` are floating-point, the + `dtype` is inferred to be the default dtype, see + :meth:`~torch.get_default_dtype`. Otherwise, the `dtype` is inferred to + be `torch.int64`. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. 
+ Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.arange(5) + tensor([ 0, 1, 2, 3, 4]) + >>> torch.arange(1, 4) + tensor([ 1, 2, 3]) + >>> torch.arange(1, 2.5, 0.5) + tensor([ 1.0000, 1.5000, 2.0000]) + """ + ... +@overload +def arange(end: Number, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + arange(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a 1-D tensor of size :math:`\left\lceil \frac{\text{end} - \text{start}}{\text{step}} \right\rceil` + with values from the interval ``[start, end)`` taken with common difference + :attr:`step` beginning from `start`. + + Note that non-integer :attr:`step` is subject to floating point rounding errors when + comparing against :attr:`end`; to avoid inconsistency, we advise subtracting a small epsilon from :attr:`end` + in such cases. + + .. math:: + \text{out}_{{i+1}} = \text{out}_{i} + \text{step} + + Args: + start (Number): the starting value for the set of points. Default: ``0``. + end (Number): the ending value for the set of points + step (Number): the gap between each pair of adjacent points. Default: ``1``. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). If `dtype` is not given, infer the data type from the other input + arguments. If any of `start`, `end`, or `stop` are floating-point, the + `dtype` is inferred to be the default dtype, see + :meth:`~torch.get_default_dtype`. Otherwise, the `dtype` is inferred to + be `torch.int64`. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.arange(5) + tensor([ 0, 1, 2, 3, 4]) + >>> torch.arange(1, 4) + tensor([ 1, 2, 3]) + >>> torch.arange(1, 2.5, 0.5) + tensor([ 1.0000, 1.5000, 2.0000]) + """ + ... +@overload +def arange(end: Union[Number, _complex], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + arange(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a 1-D tensor of size :math:`\left\lceil \frac{\text{end} - \text{start}}{\text{step}} \right\rceil` + with values from the interval ``[start, end)`` taken with common difference + :attr:`step` beginning from `start`. 
+ + Note that non-integer :attr:`step` is subject to floating point rounding errors when + comparing against :attr:`end`; to avoid inconsistency, we advise subtracting a small epsilon from :attr:`end` + in such cases. + + .. math:: + \text{out}_{{i+1}} = \text{out}_{i} + \text{step} + + Args: + start (Number): the starting value for the set of points. Default: ``0``. + end (Number): the ending value for the set of points + step (Number): the gap between each pair of adjacent points. Default: ``1``. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). If `dtype` is not given, infer the data type from the other input + arguments. If any of `start`, `end`, or `stop` are floating-point, the + `dtype` is inferred to be the default dtype, see + :meth:`~torch.get_default_dtype`. Otherwise, the `dtype` is inferred to + be `torch.int64`. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.arange(5) + tensor([ 0, 1, 2, 3, 4]) + >>> torch.arange(1, 4) + tensor([ 1, 2, 3]) + >>> torch.arange(1, 2.5, 0.5) + tensor([ 1.0000, 1.5000, 2.0000]) + """ + ... +@overload +def arange(start: Union[Number, _complex], end: Union[Number, _complex], *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + arange(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a 1-D tensor of size :math:`\left\lceil \frac{\text{end} - \text{start}}{\text{step}} \right\rceil` + with values from the interval ``[start, end)`` taken with common difference + :attr:`step` beginning from `start`. + + Note that non-integer :attr:`step` is subject to floating point rounding errors when + comparing against :attr:`end`; to avoid inconsistency, we advise subtracting a small epsilon from :attr:`end` + in such cases. + + .. math:: + \text{out}_{{i+1}} = \text{out}_{i} + \text{step} + + Args: + start (Number): the starting value for the set of points. Default: ``0``. + end (Number): the ending value for the set of points + step (Number): the gap between each pair of adjacent points. Default: ``1``. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). If `dtype` is not given, infer the data type from the other input + arguments. If any of `start`, `end`, or `stop` are floating-point, the + `dtype` is inferred to be the default dtype, see + :meth:`~torch.get_default_dtype`. Otherwise, the `dtype` is inferred to + be `torch.int64`. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. 
+ device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.arange(5) + tensor([ 0, 1, 2, 3, 4]) + >>> torch.arange(1, 4) + tensor([ 1, 2, 3]) + >>> torch.arange(1, 2.5, 0.5) + tensor([ 1.0000, 1.5000, 2.0000]) + """ + ... +@overload +def arange(start: Union[Number, _complex], end: Union[Number, _complex], step: Union[Number, _complex] = 1, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + arange(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a 1-D tensor of size :math:`\left\lceil \frac{\text{end} - \text{start}}{\text{step}} \right\rceil` + with values from the interval ``[start, end)`` taken with common difference + :attr:`step` beginning from `start`. + + Note that non-integer :attr:`step` is subject to floating point rounding errors when + comparing against :attr:`end`; to avoid inconsistency, we advise subtracting a small epsilon from :attr:`end` + in such cases. + + .. math:: + \text{out}_{{i+1}} = \text{out}_{i} + \text{step} + + Args: + start (Number): the starting value for the set of points. Default: ``0``. + end (Number): the ending value for the set of points + step (Number): the gap between each pair of adjacent points. Default: ``1``. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). If `dtype` is not given, infer the data type from the other input + arguments. If any of `start`, `end`, or `stop` are floating-point, the + `dtype` is inferred to be the default dtype, see + :meth:`~torch.get_default_dtype`. Otherwise, the `dtype` is inferred to + be `torch.int64`. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.arange(5) + tensor([ 0, 1, 2, 3, 4]) + >>> torch.arange(1, 4) + tensor([ 1, 2, 3]) + >>> torch.arange(1, 2.5, 0.5) + tensor([ 1.0000, 1.5000, 2.0000]) + """ + ... +def arccos(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + arccos(input, *, out=None) -> Tensor + + Alias for :func:`torch.acos`. + """ + ... +def arccos_(input: Tensor) -> Tensor: ... +def arccosh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + arccosh(input, *, out=None) -> Tensor + + Alias for :func:`torch.acosh`. + """ + ... +def arccosh_(input: Tensor) -> Tensor: ... 
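+# A minimal sketch of the dtype-inference rule documented for the arange()
+# overloads above: integer-only arguments are inferred as torch.int64, while
+# any floating-point argument switches the result to the default dtype
+# (assumed here to be the stock torch.float32; see torch.get_default_dtype()).
+#
+#   >>> torch.arange(5).dtype             # all-integer arguments
+#   torch.int64
+#   >>> torch.arange(1, 2.5, 0.5).dtype   # a float argument is present
+#   torch.float32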
+def arcsin(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + arcsin(input, *, out=None) -> Tensor + + Alias for :func:`torch.asin`. + """ + ... +def arcsin_(input: Tensor) -> Tensor: ... +def arcsinh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + arcsinh(input, *, out=None) -> Tensor + + Alias for :func:`torch.asinh`. + """ + ... +def arcsinh_(input: Tensor) -> Tensor: ... +def arctan(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + arctan(input, *, out=None) -> Tensor + + Alias for :func:`torch.atan`. + """ + ... +def arctan2(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + arctan2(input, other, *, out=None) -> Tensor + Alias for :func:`torch.atan2`. + """ + ... +def arctan_(input: Tensor) -> Tensor: ... +def arctanh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + arctanh(input, *, out=None) -> Tensor + + Alias for :func:`torch.atanh`. + """ + ... +def arctanh_(input: Tensor) -> Tensor: ... +def argmax(input: Tensor, dim: Optional[_int] = None, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + argmax(input) -> LongTensor + + Returns the indices of the maximum value of all elements in the :attr:`input` tensor. + + This is the second value returned by :meth:`torch.max`. See its + documentation for the exact semantics of this method. + + .. note:: If there are multiple maximal values then the indices of the first maximal value are returned. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 1.3398, 0.2663, -0.2686, 0.2450], + [-0.7401, -0.8805, -0.3402, -1.1936], + [ 0.4907, -1.3948, -1.0691, -0.3132], + [-1.6092, 0.5419, -0.2993, 0.3195]]) + >>> torch.argmax(a) + tensor(0) + + .. function:: argmax(input, dim, keepdim=False) -> LongTensor + :noindex: + + Returns the indices of the maximum values of a tensor across a dimension. + + This is the second value returned by :meth:`torch.max`. See its + documentation for the exact semantics of this method. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. If ``None``, the argmax of the flattened input is returned. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 1.3398, 0.2663, -0.2686, 0.2450], + [-0.7401, -0.8805, -0.3402, -1.1936], + [ 0.4907, -1.3948, -1.0691, -0.3132], + [-1.6092, 0.5419, -0.2993, 0.3195]]) + >>> torch.argmax(a, dim=1) + tensor([ 0, 2, 0, 1]) + """ + ... +def argmin(input: Tensor, dim: Optional[_int] = None, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + argmin(input, dim=None, keepdim=False) -> LongTensor + + Returns the indices of the minimum value(s) of the flattened tensor or along a dimension + + This is the second value returned by :meth:`torch.min`. See its + documentation for the exact semantics of this method. + + .. note:: If there are multiple minimal values then the indices of the first minimal value are returned. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. If ``None``, the argmin of the flattened input is returned. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. 
+ + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.1139, 0.2254, -0.1381, 0.3687], + [ 1.0100, -1.1975, -0.0102, -0.4732], + [-0.9240, 0.1207, -0.7506, -1.0213], + [ 1.7809, -1.2960, 0.9384, 0.1438]]) + >>> torch.argmin(a) + tensor(13) + >>> torch.argmin(a, dim=1) + tensor([ 2, 1, 3, 1]) + >>> torch.argmin(a, dim=1, keepdim=True) + tensor([[2], + [1], + [3], + [1]]) + """ + ... +@overload +def argsort(input: Tensor, *, stable: _bool, dim: _int = -1, descending: _bool = False) -> Tensor: + r""" + argsort(input, dim=-1, descending=False, stable=False) -> Tensor + + Returns the indices that sort a tensor along a given dimension in ascending + order by value. + + This is the second value returned by :meth:`torch.sort`. See its documentation + for the exact semantics of this method. + + If :attr:`stable` is ``True`` then the sorting routine becomes stable, preserving + the order of equivalent elements. If ``False``, the relative order of values + which compare equal is not guaranteed. ``True`` is slower. + + Args: + input (Tensor): the input tensor. + dim (int, optional): the dimension to sort along + descending (bool, optional): controls the sorting order (ascending or descending) + stable (bool, optional): controls the relative order of equivalent elements + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.0785, 1.5267, -0.8521, 0.4065], + [ 0.1598, 0.0788, -0.0745, -1.2700], + [ 1.2208, 1.0722, -0.7064, 1.2564], + [ 0.0669, -0.2318, -0.8229, -0.9280]]) + + + >>> torch.argsort(a, dim=1) + tensor([[2, 0, 3, 1], + [3, 2, 1, 0], + [2, 1, 0, 3], + [3, 2, 1, 0]]) + """ + ... +@overload +def argsort(input: Tensor, dim: _int = -1, descending: _bool = False) -> Tensor: + r""" + argsort(input, dim=-1, descending=False, stable=False) -> Tensor + + Returns the indices that sort a tensor along a given dimension in ascending + order by value. + + This is the second value returned by :meth:`torch.sort`. See its documentation + for the exact semantics of this method. + + If :attr:`stable` is ``True`` then the sorting routine becomes stable, preserving + the order of equivalent elements. If ``False``, the relative order of values + which compare equal is not guaranteed. ``True`` is slower. + + Args: + input (Tensor): the input tensor. + dim (int, optional): the dimension to sort along + descending (bool, optional): controls the sorting order (ascending or descending) + stable (bool, optional): controls the relative order of equivalent elements + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.0785, 1.5267, -0.8521, 0.4065], + [ 0.1598, 0.0788, -0.0745, -1.2700], + [ 1.2208, 1.0722, -0.7064, 1.2564], + [ 0.0669, -0.2318, -0.8229, -0.9280]]) + + + >>> torch.argsort(a, dim=1) + tensor([[2, 0, 3, 1], + [3, 2, 1, 0], + [2, 1, 0, 3], + [3, 2, 1, 0]]) + """ + ... +@overload +def argsort(input: Tensor, dim: Union[str, ellipsis, None], descending: _bool = False) -> Tensor: + r""" + argsort(input, dim=-1, descending=False, stable=False) -> Tensor + + Returns the indices that sort a tensor along a given dimension in ascending + order by value. + + This is the second value returned by :meth:`torch.sort`. See its documentation + for the exact semantics of this method. + + If :attr:`stable` is ``True`` then the sorting routine becomes stable, preserving + the order of equivalent elements. If ``False``, the relative order of values + which compare equal is not guaranteed. ``True`` is slower. + + Args: + input (Tensor): the input tensor. 
+ dim (int, optional): the dimension to sort along + descending (bool, optional): controls the sorting order (ascending or descending) + stable (bool, optional): controls the relative order of equivalent elements + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.0785, 1.5267, -0.8521, 0.4065], + [ 0.1598, 0.0788, -0.0745, -1.2700], + [ 1.2208, 1.0722, -0.7064, 1.2564], + [ 0.0669, -0.2318, -0.8229, -0.9280]]) + + + >>> torch.argsort(a, dim=1) + tensor([[2, 0, 3, 1], + [3, 2, 1, 0], + [2, 1, 0, 3], + [3, 2, 1, 0]]) + """ + ... +def argwhere(input: Tensor) -> Tensor: + r""" + argwhere(input) -> Tensor + + Returns a tensor containing the indices of all non-zero elements of + :attr:`input`. Each row in the result contains the indices of a non-zero + element in :attr:`input`. The result is sorted lexicographically, with + the last index changing the fastest (C-style). + + If :attr:`input` has :math:`n` dimensions, then the resulting indices tensor + :attr:`out` is of size :math:`(z \times n)`, where :math:`z` is the total number of + non-zero elements in the :attr:`input` tensor. + + .. note:: + This function is similar to NumPy's `argwhere`. + + When :attr:`input` is on CUDA, this function causes host-device synchronization. + + Args: + {input} + + Example:: + + >>> t = torch.tensor([1, 0, 1]) + >>> torch.argwhere(t) + tensor([[0], + [2]]) + >>> t = torch.tensor([[1, 0, 1], [0, 1, 1]]) + >>> torch.argwhere(t) + tensor([[0, 0], + [0, 2], + [1, 1], + [1, 2]]) + """ + ... +def as_strided(input: Tensor, size: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], storage_offset: Optional[Union[_int, SymInt]] = None) -> Tensor: + r""" + as_strided(input, size, stride, storage_offset=None) -> Tensor + + Create a view of an existing `torch.Tensor` :attr:`input` with specified + :attr:`size`, :attr:`stride` and :attr:`storage_offset`. + + .. warning:: + Prefer using other view functions, like :meth:`torch.Tensor.expand`, + to setting a view's strides manually with `as_strided`, as this + function's behavior depends on the implementation of a tensor's storage. + The constructed view of the storage must only refer to elements within + the storage or a runtime error will be thrown, and if the view is + "overlapped" (with multiple indices referring to the same element in + memory) its behavior is undefined. + + Args: + input (Tensor): the input tensor. + size (tuple or ints): the shape of the output tensor + stride (tuple or ints): the stride of the output tensor + storage_offset (int, optional): the offset in the underlying storage of the output tensor. + If ``None``, the storage_offset of the output tensor will match the input tensor. + + Example:: + + >>> x = torch.randn(3, 3) + >>> x + tensor([[ 0.9039, 0.6291, 1.0795], + [ 0.1586, 2.1939, -0.4900], + [-0.1909, -0.7503, 1.9355]]) + >>> t = torch.as_strided(x, (2, 2), (1, 2)) + >>> t + tensor([[0.9039, 1.0795], + [0.6291, 0.1586]]) + >>> t = torch.as_strided(x, (2, 2), (1, 2), 1) + tensor([[0.6291, 0.1586], + [1.0795, 2.1939]]) + """ + ... +def as_strided_(input: Tensor, size: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], storage_offset: Optional[Union[_int, SymInt]] = None) -> Tensor: ... 
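+# A small sketch of how the (size, stride, storage_offset) arguments of
+# as_strided(), documented above, address the underlying storage: element
+# [i][j] of the returned view reads storage[storage_offset + i*stride[0] + j*stride[1]].
+#
+#   >>> x = torch.arange(9.)
+#   >>> torch.as_strided(x, (2, 2), (3, 1), 1)   # rows advance by 3 elements, columns by 1, starting at offset 1
+#   tensor([[1., 2.],
+#           [4., 5.]])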
+def as_strided_copy(input: Tensor, size: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], storage_offset: Optional[Union[_int, SymInt]] = None, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.as_strided`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def as_strided_scatter(input: Tensor, src: Tensor, size: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], storage_offset: Optional[Union[_int, SymInt]] = None) -> Tensor: + r""" + as_strided_scatter(input, src, size, stride, storage_offset=None) -> Tensor + + Embeds the values of the :attr:`src` tensor into :attr:`input` along + the elements corresponding to the result of calling + input.as_strided(size, stride, storage_offset). + + This function returns a tensor with fresh storage; it does not + return a view. + + Args: + input (Tensor): the input tensor. + size (tuple or ints): the shape of the output tensor + stride (tuple or ints): the stride of the output tensor + storage_offset (int, optional): the offset in the underlying storage of the output tensor + + .. note:: + + :attr:`src` must be of the proper size in order to be embedded + into :attr:`input`. Specifically, it should have the same shape as + `torch.as_strided(input, size, stride, storage_offset)` + + Example:: + + >>> a = torch.arange(4).reshape(2, 2) + 1 + >>> a + tensor([[1, 2], + [3, 4]]) + >>> b = torch.zeros(3, 3) + >>> b + tensor([[0., 0., 0.], + [0., 0., 0.], + [0., 0., 0.]]) + >>> torch.as_strided_scatter(b, a, (2, 2), (1, 2)) + tensor([[1., 3., 2.], + [4., 0., 0.], + [0., 0., 0.]]) + """ + ... +def as_tensor(data: Any, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None) -> Tensor: + r""" + as_tensor(data, dtype=None, device=None) -> Tensor + + Converts :attr:`data` into a tensor, sharing data and preserving autograd + history if possible. + + If :attr:`data` is already a tensor with the requested dtype and device + then :attr:`data` itself is returned, but if :attr:`data` is a + tensor with a different dtype or device then it's copied as if using + `data.to(dtype=dtype, device=device)`. + + If :attr:`data` is a NumPy array (an ndarray) with the same dtype and device then a + tensor is constructed using :func:`torch.from_numpy`. + + .. seealso:: + + :func:`torch.tensor` never shares its data and creates a new "leaf tensor" (see :doc:`/notes/autograd`). + + + Args: + data (array_like): Initial data for the tensor. Can be a list, tuple, + NumPy ``ndarray``, scalar, and other types. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, infers data type from :attr:`data`. + device (:class:`torch.device`, optional): the device of the constructed tensor. If None and data is a tensor + then the device of data is used. If None and data is not a tensor then + the result tensor is constructed on the current device. + + + Example:: + + >>> a = numpy.array([1, 2, 3]) + >>> t = torch.as_tensor(a) + >>> t + tensor([ 1, 2, 3]) + >>> t[0] = -1 + >>> a + array([-1, 2, 3]) + + >>> a = numpy.array([1, 2, 3]) + >>> t = torch.as_tensor(a, device=torch.device('cuda')) + >>> t + tensor([ 1, 2, 3]) + >>> t[0] = -1 + >>> a + array([1, 2, 3]) + """ + ... 
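+# A short sketch of why the as_strided_scatter() example above yields
+# tensor([[1., 3., 2.], [4., 0., 0.], [0., 0., 0.]]): with size (2, 2) and
+# stride (1, 2), element [i][j] of src is written to flat storage position
+# i*1 + j*2 of the output, so the src values 1, 2, 3, 4 land at row-major
+# positions 0, 2, 1, 3. Reading the result back through a matching as_strided
+# view recovers src exactly:
+#
+#   >>> src = torch.arange(4.).reshape(2, 2) + 1
+#   >>> out = torch.as_strided_scatter(torch.zeros(3, 3), src, (2, 2), (1, 2))
+#   >>> torch.as_strided(out, (2, 2), (1, 2))
+#   tensor([[1., 2.],
+#           [3., 4.]])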
+def asarray(obj: Any, *, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, copy: Optional[_bool] = None, requires_grad: _bool = False) -> Tensor: + r""" + asarray(obj, *, dtype=None, device=None, copy=None, requires_grad=False) -> Tensor + + Converts :attr:`obj` to a tensor. + + :attr:`obj` can be one of: + + 1. a tensor + 2. a NumPy array or a NumPy scalar + 3. a DLPack capsule + 4. an object that implements Python's buffer protocol + 5. a scalar + 6. a sequence of scalars + + When :attr:`obj` is a tensor, NumPy array, or DLPack capsule the returned tensor will, + by default, not require a gradient, have the same datatype as :attr:`obj`, be on the + same device, and share memory with it. These properties can be controlled with the + :attr:`dtype`, :attr:`device`, :attr:`copy`, and :attr:`requires_grad` keyword arguments. + If the returned tensor is of a different datatype, on a different device, or a copy is + requested then it will not share its memory with :attr:`obj`. If :attr:`requires_grad` + is ``True`` then the returned tensor will require a gradient, and if :attr:`obj` is + also a tensor with an autograd history then the returned tensor will have the same history. + + When :attr:`obj` is not a tensor, NumPy array, or DLPack capsule but implements Python's + buffer protocol then the buffer is interpreted as an array of bytes grouped according to + the size of the datatype passed to the :attr:`dtype` keyword argument. (If no datatype is + passed then the default floating point datatype is used, instead.) The returned tensor + will have the specified datatype (or default floating point datatype if none is specified) + and, by default, be on the CPU device and share memory with the buffer. + + When :attr:`obj` is a NumPy scalar, the returned tensor will be a 0-dimensional tensor on + the CPU and that doesn't share its memory (i.e. ``copy=True``). By default datatype will + be the PyTorch datatype corresponding to the NumPy's scalar's datatype. + + When :attr:`obj` is none of the above but a scalar, or a sequence of scalars then the + returned tensor will, by default, infer its datatype from the scalar values, be on the + current default device, and not share its memory. + + .. seealso:: + + :func:`torch.tensor` creates a tensor that always copies the data from the input object. + :func:`torch.from_numpy` creates a tensor that always shares memory from NumPy arrays. + :func:`torch.frombuffer` creates a tensor that always shares memory from objects that + implement the buffer protocol. + :func:`torch.from_dlpack` creates a tensor that always shares memory from + DLPack capsules. + + Args: + obj (object): a tensor, NumPy array, DLPack Capsule, object that implements Python's + buffer protocol, scalar, or sequence of scalars. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the datatype of the returned tensor. + Default: ``None``, which causes the datatype of the returned tensor to be + inferred from :attr:`obj`. + copy (bool, optional): controls whether the returned tensor shares memory with :attr:`obj`. + Default: ``None``, which causes the returned tensor to share memory with :attr:`obj` + whenever possible. If ``True`` then the returned tensor does not share its memory. + If ``False`` then the returned tensor shares its memory with :attr:`obj` and an + error is thrown if it cannot. + device (:class:`torch.device`, optional): the device of the returned tensor. + Default: ``None``, which causes the device of :attr:`obj` to be used. 
Or, if + :attr:`obj` is a Python sequence, the current default device will be used. + requires_grad (bool, optional): whether the returned tensor requires grad. + Default: ``False``, which causes the returned tensor not to require a gradient. + If ``True``, then the returned tensor will require a gradient, and if :attr:`obj` + is also a tensor with an autograd history then the returned tensor will have + the same history. + + Example:: + + >>> a = torch.tensor([1, 2, 3]) + >>> # Shares memory with tensor 'a' + >>> b = torch.asarray(a) + >>> a.data_ptr() == b.data_ptr() + True + >>> # Forces memory copy + >>> c = torch.asarray(a, copy=True) + >>> a.data_ptr() == c.data_ptr() + False + + >>> a = torch.tensor([1., 2., 3.], requires_grad=True) + >>> b = a + 2 + >>> b + tensor([3., 4., 5.], grad_fn=) + >>> # Shares memory with tensor 'b', with no grad + >>> c = torch.asarray(b) + >>> c + tensor([3., 4., 5.]) + >>> # Shares memory with tensor 'b', retaining autograd history + >>> d = torch.asarray(b, requires_grad=True) + >>> d + tensor([3., 4., 5.], grad_fn=) + + >>> array = numpy.array([1, 2, 3]) + >>> # Shares memory with array 'array' + >>> t1 = torch.asarray(array) + >>> array.__array_interface__['data'][0] == t1.data_ptr() + True + >>> # Copies memory due to dtype mismatch + >>> t2 = torch.asarray(array, dtype=torch.float32) + >>> array.__array_interface__['data'][0] == t2.data_ptr() + False + + >>> scalar = numpy.float64(0.5) + >>> torch.asarray(scalar) + tensor(0.5000, dtype=torch.float64) + """ + ... +def asin(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + asin(input, *, out=None) -> Tensor + + Returns a new tensor with the arcsine of the elements of :attr:`input`. + + .. math:: + \text{out}_{i} = \sin^{-1}(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-0.5962, 1.4985, -0.4396, 1.4525]) + >>> torch.asin(a) + tensor([-0.6387, nan, -0.4552, nan]) + """ + ... +def asin_(input: Tensor) -> Tensor: ... +def asinh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + asinh(input, *, out=None) -> Tensor + + Returns a new tensor with the inverse hyperbolic sine of the elements of :attr:`input`. + + .. math:: + \text{out}_{i} = \sinh^{-1}(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword arguments: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.1606, -1.4267, -1.0899, -1.0250 ]) + >>> torch.asinh(a) + tensor([ 0.1599, -1.1534, -0.9435, -0.8990 ]) + """ + ... +def asinh_(input: Tensor) -> Tensor: ... +def atan(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + atan(input, *, out=None) -> Tensor + + Returns a new tensor with the arctangent of the elements of :attr:`input`. + + .. math:: + \text{out}_{i} = \tan^{-1}(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.2341, 0.2539, -0.6256, -0.6448]) + >>> torch.atan(a) + tensor([ 0.2299, 0.2487, -0.5591, -0.5727]) + """ + ... +def atan2(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + atan2(input, other, *, out=None) -> Tensor + + Element-wise arctangent of :math:`\text{input}_{i} / \text{other}_{i}` + with consideration of the quadrant. 
Returns a new tensor with the signed angles + in radians between vector :math:`(\text{other}_{i}, \text{input}_{i})` + and vector :math:`(1, 0)`. (Note that :math:`\text{other}_{i}`, the second + parameter, is the x-coordinate, while :math:`\text{input}_{i}`, the first + parameter, is the y-coordinate.) + + The shapes of ``input`` and ``other`` must be + :ref:`broadcastable `. + + Args: + input (Tensor): the first input tensor + other (Tensor): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.9041, 0.0196, -0.3108, -2.4423]) + >>> torch.atan2(a, torch.randn(4)) + tensor([ 0.9833, 0.0811, -1.9743, -1.4151]) + """ + ... +def atan_(input: Tensor) -> Tensor: ... +def atanh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + atanh(input, *, out=None) -> Tensor + + Returns a new tensor with the inverse hyperbolic tangent of the elements of :attr:`input`. + + Note: + The domain of the inverse hyperbolic tangent is `(-1, 1)` and values outside this range + will be mapped to ``NaN``, except for the values `1` and `-1` for which the output is + mapped to `+/-INF` respectively. + + .. math:: + \text{out}_{i} = \tanh^{-1}(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword arguments: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4).uniform_(-1, 1) + >>> a + tensor([ -0.9385, 0.2968, -0.8591, -0.1871 ]) + >>> torch.atanh(a) + tensor([ -1.7253, 0.3060, -1.2899, -0.1893 ]) + """ + ... +def atanh_(input: Tensor) -> Tensor: ... +def avg_pool1d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, ceil_mode: _bool = False, count_include_pad: _bool = True) -> Tensor: ... +@overload +def baddbmm(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], batch1: Tensor, batch2: Tensor) -> Tensor: + r""" + baddbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices in :attr:`batch1` + and :attr:`batch2`. + :attr:`input` is added to the final result. + + :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the same + number of matrices. + + If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a + :math:`(b \times m \times p)` tensor, then :attr:`input` must be + :ref:`broadcastable ` with a + :math:`(b \times n \times p)` tensor and :attr:`out` will be a + :math:`(b \times n \times p)` tensor. Both :attr:`alpha` and :attr:`beta` mean the + same as the scaling factors used in :meth:`torch.addbmm`. + + .. math:: + \text{out}_i = \beta\ \text{input}_i + \alpha\ (\text{batch1}_i \mathbin{@} \text{batch2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. 
+ + Args: + input (Tensor): the tensor to be added + batch1 (Tensor): the first batch of matrices to be multiplied + batch2 (Tensor): the second batch of matrices to be multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`\text{batch1} \mathbin{@} \text{batch2}` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(10, 3, 5) + >>> batch1 = torch.randn(10, 3, 4) + >>> batch2 = torch.randn(10, 4, 5) + >>> torch.baddbmm(M, batch1, batch2).size() + torch.Size([10, 3, 5]) + """ + ... +@overload +def baddbmm(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], batch1: Tensor, batch2: Tensor, *, out: Tensor) -> Tensor: + r""" + baddbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices in :attr:`batch1` + and :attr:`batch2`. + :attr:`input` is added to the final result. + + :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the same + number of matrices. + + If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a + :math:`(b \times m \times p)` tensor, then :attr:`input` must be + :ref:`broadcastable ` with a + :math:`(b \times n \times p)` tensor and :attr:`out` will be a + :math:`(b \times n \times p)` tensor. Both :attr:`alpha` and :attr:`beta` mean the + same as the scaling factors used in :meth:`torch.addbmm`. + + .. math:: + \text{out}_i = \beta\ \text{input}_i + \alpha\ (\text{batch1}_i \mathbin{@} \text{batch2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + input (Tensor): the tensor to be added + batch1 (Tensor): the first batch of matrices to be multiplied + batch2 (Tensor): the second batch of matrices to be multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`\text{batch1} \mathbin{@} \text{batch2}` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(10, 3, 5) + >>> batch1 = torch.randn(10, 3, 4) + >>> batch2 = torch.randn(10, 4, 5) + >>> torch.baddbmm(M, batch1, batch2).size() + torch.Size([10, 3, 5]) + """ + ... +@overload +def baddbmm(input: Tensor, batch1: Tensor, batch2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + baddbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices in :attr:`batch1` + and :attr:`batch2`. + :attr:`input` is added to the final result. + + :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the same + number of matrices. + + If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a + :math:`(b \times m \times p)` tensor, then :attr:`input` must be + :ref:`broadcastable ` with a + :math:`(b \times n \times p)` tensor and :attr:`out` will be a + :math:`(b \times n \times p)` tensor. 
Both :attr:`alpha` and :attr:`beta` mean the + same as the scaling factors used in :meth:`torch.addbmm`. + + .. math:: + \text{out}_i = \beta\ \text{input}_i + \alpha\ (\text{batch1}_i \mathbin{@} \text{batch2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + input (Tensor): the tensor to be added + batch1 (Tensor): the first batch of matrices to be multiplied + batch2 (Tensor): the second batch of matrices to be multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`\text{batch1} \mathbin{@} \text{batch2}` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(10, 3, 5) + >>> batch1 = torch.randn(10, 3, 4) + >>> batch2 = torch.randn(10, 4, 5) + >>> torch.baddbmm(M, batch1, batch2).size() + torch.Size([10, 3, 5]) + """ + ... +@overload +def baddbmm(beta: Union[Number, _complex], self: Tensor, batch1: Tensor, batch2: Tensor) -> Tensor: + r""" + baddbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices in :attr:`batch1` + and :attr:`batch2`. + :attr:`input` is added to the final result. + + :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the same + number of matrices. + + If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a + :math:`(b \times m \times p)` tensor, then :attr:`input` must be + :ref:`broadcastable ` with a + :math:`(b \times n \times p)` tensor and :attr:`out` will be a + :math:`(b \times n \times p)` tensor. Both :attr:`alpha` and :attr:`beta` mean the + same as the scaling factors used in :meth:`torch.addbmm`. + + .. math:: + \text{out}_i = \beta\ \text{input}_i + \alpha\ (\text{batch1}_i \mathbin{@} \text{batch2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + input (Tensor): the tensor to be added + batch1 (Tensor): the first batch of matrices to be multiplied + batch2 (Tensor): the second batch of matrices to be multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`\text{batch1} \mathbin{@} \text{batch2}` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(10, 3, 5) + >>> batch1 = torch.randn(10, 3, 4) + >>> batch2 = torch.randn(10, 4, 5) + >>> torch.baddbmm(M, batch1, batch2).size() + torch.Size([10, 3, 5]) + """ + ... 
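The formula documented above can be checked directly. A minimal sketch (illustrative shapes and scale factors, float32 tolerances assumed) comparing torch.baddbmm against the expression beta * input + alpha * (batch1 @ batch2):

import torch

M = torch.randn(10, 3, 5)
batch1 = torch.randn(10, 3, 4)
batch2 = torch.randn(10, 4, 5)

# out_i = beta * input_i + alpha * (batch1_i @ batch2_i)
out = torch.baddbmm(M, batch1, batch2, beta=0.5, alpha=2.0)
reference = 0.5 * M + 2.0 * torch.bmm(batch1, batch2)
assert torch.allclose(out, reference, atol=1e-5)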
+@overload +def baddbmm(beta: Union[Number, _complex], self: Tensor, batch1: Tensor, batch2: Tensor, *, out: Tensor) -> Tensor: + r""" + baddbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices in :attr:`batch1` + and :attr:`batch2`. + :attr:`input` is added to the final result. + + :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the same + number of matrices. + + If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a + :math:`(b \times m \times p)` tensor, then :attr:`input` must be + :ref:`broadcastable ` with a + :math:`(b \times n \times p)` tensor and :attr:`out` will be a + :math:`(b \times n \times p)` tensor. Both :attr:`alpha` and :attr:`beta` mean the + same as the scaling factors used in :meth:`torch.addbmm`. + + .. math:: + \text{out}_i = \beta\ \text{input}_i + \alpha\ (\text{batch1}_i \mathbin{@} \text{batch2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + input (Tensor): the tensor to be added + batch1 (Tensor): the first batch of matrices to be multiplied + batch2 (Tensor): the second batch of matrices to be multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`\text{batch1} \mathbin{@} \text{batch2}` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(10, 3, 5) + >>> batch1 = torch.randn(10, 3, 4) + >>> batch2 = torch.randn(10, 4, 5) + >>> torch.baddbmm(M, batch1, batch2).size() + torch.Size([10, 3, 5]) + """ + ... +@overload +def bartlett_window(window_length: _int, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + bartlett_window(window_length, periodic=True, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Bartlett window function. + + .. math:: + w[n] = 1 - \left| \frac{2n}{N-1} - 1 \right| = \begin{cases} + \frac{2n}{N - 1} & \text{if } 0 \leq n \leq \frac{N - 1}{2} \\ + 2 - \frac{2n}{N - 1} & \text{if } \frac{N - 1}{2} < n < N \\ + \end{cases}, + + where :math:`N` is the full window size. + + The input :attr:`window_length` is a positive integer controlling the + returned window size. :attr:`periodic` flag determines whether the returned + window trims off the last duplicate value from the symmetric window and is + ready to be used as a periodic window with functions like + :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in + above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have + ``torch.bartlett_window(L, periodic=True)`` equal to + ``torch.bartlett_window(L + 1, periodic=False)[:-1])``. + + .. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. + + Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. 
If False, return a symmetric window. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window + """ + ... +@overload +def bartlett_window(window_length: _int, periodic: _bool, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + bartlett_window(window_length, periodic=True, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Bartlett window function. + + .. math:: + w[n] = 1 - \left| \frac{2n}{N-1} - 1 \right| = \begin{cases} + \frac{2n}{N - 1} & \text{if } 0 \leq n \leq \frac{N - 1}{2} \\ + 2 - \frac{2n}{N - 1} & \text{if } \frac{N - 1}{2} < n < N \\ + \end{cases}, + + where :math:`N` is the full window size. + + The input :attr:`window_length` is a positive integer controlling the + returned window size. :attr:`periodic` flag determines whether the returned + window trims off the last duplicate value from the symmetric window and is + ready to be used as a periodic window with functions like + :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in + above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have + ``torch.bartlett_window(L, periodic=True)`` equal to + ``torch.bartlett_window(L + 1, periodic=False)[:-1])``. + + .. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. + + Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. If False, return a symmetric window. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window + """ + ... 
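As a quick illustration of the periodic/symmetric relationship stated in the docstring above, the following sketch (default dtype assumed, window length chosen arbitrarily) verifies that the periodic window of length L matches the symmetric window of length L + 1 with its last sample dropped:

import torch

L = 8
periodic = torch.bartlett_window(L, periodic=True)
symmetric = torch.bartlett_window(L + 1, periodic=False)[:-1]
assert torch.allclose(periodic, symmetric)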
+def batch_norm(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], running_mean: Optional[Tensor], running_var: Optional[Tensor], training: _bool, momentum: _float, eps: _float, cudnn_enabled: _bool) -> Tensor: ... +def batch_norm_backward_elemt(grad_out: Tensor, input: Tensor, mean: Tensor, invstd: Tensor, weight: Optional[Tensor], sum_dy: Tensor, sum_dy_xmu: Tensor, count: Tensor) -> Tensor: ... +def batch_norm_backward_reduce(grad_out: Tensor, input: Tensor, mean: Tensor, invstd: Tensor, weight: Optional[Tensor], input_g: _bool, weight_g: _bool, bias_g: _bool) -> Tuple[Tensor, Tensor, Tensor, Tensor]: ... +def batch_norm_elemt(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], mean: Tensor, invstd: Tensor, eps: _float, *, out: Optional[Tensor] = None) -> Tensor: ... +def batch_norm_gather_stats(input: Tensor, mean: Tensor, invstd: Tensor, running_mean: Optional[Tensor], running_var: Optional[Tensor], momentum: _float, eps: _float, count: _int) -> Tuple[Tensor, Tensor]: ... +def batch_norm_gather_stats_with_counts(input: Tensor, mean: Tensor, invstd: Tensor, running_mean: Optional[Tensor], running_var: Optional[Tensor], momentum: _float, eps: _float, counts: Tensor) -> Tuple[Tensor, Tensor]: ... +def batch_norm_stats(input: Tensor, eps: _float) -> Tuple[Tensor, Tensor]: ... +def batch_norm_update_stats(input: Tensor, running_mean: Optional[Tensor], running_var: Optional[Tensor], momentum: _float) -> Tuple[Tensor, Tensor]: ... +@overload +def bernoulli(input: Tensor, *, generator: Optional[Generator] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + bernoulli(input, *, generator=None, out=None) -> Tensor + + Draws binary random numbers (0 or 1) from a Bernoulli distribution. + + The :attr:`input` tensor should be a tensor containing probabilities + to be used for drawing the binary random number. + Hence, all values in :attr:`input` have to be in the range: + :math:`0 \leq \text{input}_i \leq 1`. + + The :math:`\text{i}^{th}` element of the output tensor will draw a + value :math:`1` according to the :math:`\text{i}^{th}` probability value given + in :attr:`input`. + + .. math:: + \text{out}_{i} \sim \mathrm{Bernoulli}(p = \text{input}_{i}) + + The returned :attr:`out` tensor only has values 0 or 1 and is of the same + shape as :attr:`input`. + + :attr:`out` can have integral ``dtype``, but :attr:`input` must have floating + point ``dtype``. + + Args: + input (Tensor): the input tensor of probability values for the Bernoulli distribution + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.empty(3, 3).uniform_(0, 1) # generate a uniform random matrix with range [0, 1] + >>> a + tensor([[ 0.1737, 0.0950, 0.3609], + [ 0.7148, 0.0289, 0.2676], + [ 0.9456, 0.8937, 0.7202]]) + >>> torch.bernoulli(a) + tensor([[ 1., 0., 0.], + [ 0., 0., 0.], + [ 1., 1., 1.]]) + + >>> a = torch.ones(3, 3) # probability of drawing "1" is 1 + >>> torch.bernoulli(a) + tensor([[ 1., 1., 1.], + [ 1., 1., 1.], + [ 1., 1., 1.]]) + >>> a = torch.zeros(3, 3) # probability of drawing "1" is 0 + >>> torch.bernoulli(a) + tensor([[ 0., 0., 0.], + [ 0., 0., 0.], + [ 0., 0., 0.]]) + """ + ... +@overload +def bernoulli(input: Tensor, p: _float, *, generator: Optional[Generator] = None) -> Tensor: + r""" + bernoulli(input, *, generator=None, out=None) -> Tensor + + Draws binary random numbers (0 or 1) from a Bernoulli distribution. 
+ + The :attr:`input` tensor should be a tensor containing probabilities + to be used for drawing the binary random number. + Hence, all values in :attr:`input` have to be in the range: + :math:`0 \leq \text{input}_i \leq 1`. + + The :math:`\text{i}^{th}` element of the output tensor will draw a + value :math:`1` according to the :math:`\text{i}^{th}` probability value given + in :attr:`input`. + + .. math:: + \text{out}_{i} \sim \mathrm{Bernoulli}(p = \text{input}_{i}) + + The returned :attr:`out` tensor only has values 0 or 1 and is of the same + shape as :attr:`input`. + + :attr:`out` can have integral ``dtype``, but :attr:`input` must have floating + point ``dtype``. + + Args: + input (Tensor): the input tensor of probability values for the Bernoulli distribution + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.empty(3, 3).uniform_(0, 1) # generate a uniform random matrix with range [0, 1] + >>> a + tensor([[ 0.1737, 0.0950, 0.3609], + [ 0.7148, 0.0289, 0.2676], + [ 0.9456, 0.8937, 0.7202]]) + >>> torch.bernoulli(a) + tensor([[ 1., 0., 0.], + [ 0., 0., 0.], + [ 1., 1., 1.]]) + + >>> a = torch.ones(3, 3) # probability of drawing "1" is 1 + >>> torch.bernoulli(a) + tensor([[ 1., 1., 1.], + [ 1., 1., 1.], + [ 1., 1., 1.]]) + >>> a = torch.zeros(3, 3) # probability of drawing "1" is 0 + >>> torch.bernoulli(a) + tensor([[ 0., 0., 0.], + [ 0., 0., 0.], + [ 0., 0., 0.]]) + """ + ... +def bilinear(input1: Tensor, input2: Tensor, weight: Tensor, bias: Optional[Tensor] = None) -> Tensor: ... +def binary_cross_entropy_with_logits(input: Tensor, target: Tensor, weight: Optional[Tensor] = None, pos_weight: Optional[Tensor] = None, reduction: _int = 1) -> Tensor: ... +def bincount(input: Tensor, weights: Optional[Tensor] = None, minlength: _int = 0) -> Tensor: + r""" + bincount(input, weights=None, minlength=0) -> Tensor + + Count the frequency of each value in an array of non-negative ints. + + The number of bins (size 1) is one larger than the largest value in + :attr:`input` unless :attr:`input` is empty, in which case the result is a + tensor of size 0. If :attr:`minlength` is specified, the number of bins is at least + :attr:`minlength` and if :attr:`input` is empty, then the result is tensor of size + :attr:`minlength` filled with zeros. If ``n`` is the value at position ``i``, + ``out[n] += weights[i]`` if :attr:`weights` is specified else + ``out[n] += 1``. + + Note: + This operation may produce nondeterministic gradients when given tensors on a CUDA device. See :doc:`/notes/randomness` for more information. + + Arguments: + input (Tensor): 1-d int tensor + weights (Tensor): optional, weight for each value in the input tensor. + Should be of same size as input tensor. + minlength (int): optional, minimum number of bins. Should be non-negative. + + Returns: + output (Tensor): a tensor of shape ``Size([max(input) + 1])`` if + :attr:`input` is non-empty, else ``Size(0)`` + + Example:: + + >>> input = torch.randint(0, 8, (5,), dtype=torch.int64) + >>> weights = torch.linspace(0, 1, steps=5) + >>> input, weights + (tensor([4, 3, 6, 3, 4]), + tensor([ 0.0000, 0.2500, 0.5000, 0.7500, 1.0000]) + + >>> torch.bincount(input) + tensor([0, 0, 0, 2, 2, 0, 1]) + + >>> input.bincount(weights) + tensor([0.0000, 0.0000, 0.0000, 1.0000, 1.0000, 0.0000, 0.5000]) + """ + ... +def binomial(count: Tensor, prob: Tensor, generator: Optional[Generator] = None) -> Tensor: ... 
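The accumulation rule quoted in the bincount docstring (``out[n] += weights[i]`` for ``n = input[i]``) can be made concrete with the same values its example uses; a short sketch:

import torch

values = torch.tensor([4, 3, 6, 3, 4])
weights = torch.tensor([0.00, 0.25, 0.50, 0.75, 1.00])

counts = torch.bincount(values)             # plain frequencies per bin
weighted = torch.bincount(values, weights)  # weights accumulated per bin

assert counts.tolist() == [0, 0, 0, 2, 2, 0, 1]
assert torch.allclose(weighted, torch.tensor([0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.5]))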
+@overload +def bitwise_and(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_and(input, other, *, out=None) -> Tensor + + Computes the bitwise AND of :attr:`input` and :attr:`other`. The input tensor must be of + integral or Boolean types. For bool tensors, it computes the logical AND. + + Args: + input: the first input tensor + other: the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_and(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([1, 0, 3], dtype=torch.int8) + >>> torch.bitwise_and(torch.tensor([True, True, False]), torch.tensor([False, True, False])) + tensor([ False, True, False]) + """ + ... +@overload +def bitwise_and(self: Union[Number, _complex], other: Tensor) -> Tensor: + r""" + bitwise_and(input, other, *, out=None) -> Tensor + + Computes the bitwise AND of :attr:`input` and :attr:`other`. The input tensor must be of + integral or Boolean types. For bool tensors, it computes the logical AND. + + Args: + input: the first input tensor + other: the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_and(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([1, 0, 3], dtype=torch.int8) + >>> torch.bitwise_and(torch.tensor([True, True, False]), torch.tensor([False, True, False])) + tensor([ False, True, False]) + """ + ... +@overload +def bitwise_and(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_and(input, other, *, out=None) -> Tensor + + Computes the bitwise AND of :attr:`input` and :attr:`other`. The input tensor must be of + integral or Boolean types. For bool tensors, it computes the logical AND. + + Args: + input: the first input tensor + other: the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_and(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([1, 0, 3], dtype=torch.int8) + >>> torch.bitwise_and(torch.tensor([True, True, False]), torch.tensor([False, True, False])) + tensor([ False, True, False]) + """ + ... +@overload +def bitwise_left_shift(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_left_shift(input, other, *, out=None) -> Tensor + + Computes the left arithmetic shift of :attr:`input` by :attr:`other` bits. + The input tensor must be of integral type. This operator supports + :ref:`broadcasting to a common shape ` and + :ref:`type promotion `. + + The operation applied is: + + .. math:: + \text{out}_i = \text{input}_i << \text{other}_i + + Args: + input (Tensor or Scalar): the first input tensor + other (Tensor or Scalar): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_left_shift(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-2, -2, 24], dtype=torch.int8) + """ + ... +@overload +def bitwise_left_shift(self: Union[Number, _complex], other: Tensor) -> Tensor: + r""" + bitwise_left_shift(input, other, *, out=None) -> Tensor + + Computes the left arithmetic shift of :attr:`input` by :attr:`other` bits. + The input tensor must be of integral type. This operator supports + :ref:`broadcasting to a common shape ` and + :ref:`type promotion `. 
+ + The operation applied is: + + .. math:: + \text{out}_i = \text{input}_i << \text{other}_i + + Args: + input (Tensor or Scalar): the first input tensor + other (Tensor or Scalar): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_left_shift(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-2, -2, 24], dtype=torch.int8) + """ + ... +@overload +def bitwise_left_shift(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_left_shift(input, other, *, out=None) -> Tensor + + Computes the left arithmetic shift of :attr:`input` by :attr:`other` bits. + The input tensor must be of integral type. This operator supports + :ref:`broadcasting to a common shape ` and + :ref:`type promotion `. + + The operation applied is: + + .. math:: + \text{out}_i = \text{input}_i << \text{other}_i + + Args: + input (Tensor or Scalar): the first input tensor + other (Tensor or Scalar): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_left_shift(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-2, -2, 24], dtype=torch.int8) + """ + ... +def bitwise_not(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_not(input, *, out=None) -> Tensor + + Computes the bitwise NOT of the given input tensor. The input tensor must be of + integral or Boolean types. For bool tensors, it computes the logical NOT. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_not(torch.tensor([-1, -2, 3], dtype=torch.int8)) + tensor([ 0, 1, -4], dtype=torch.int8) + """ + ... +@overload +def bitwise_or(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_or(input, other, *, out=None) -> Tensor + + Computes the bitwise OR of :attr:`input` and :attr:`other`. The input tensor must be of + integral or Boolean types. For bool tensors, it computes the logical OR. + + Args: + input: the first input tensor + other: the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_or(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-1, -2, 3], dtype=torch.int8) + >>> torch.bitwise_or(torch.tensor([True, True, False]), torch.tensor([False, True, False])) + tensor([ True, True, False]) + """ + ... +@overload +def bitwise_or(self: Union[Number, _complex], other: Tensor) -> Tensor: + r""" + bitwise_or(input, other, *, out=None) -> Tensor + + Computes the bitwise OR of :attr:`input` and :attr:`other`. The input tensor must be of + integral or Boolean types. For bool tensors, it computes the logical OR. + + Args: + input: the first input tensor + other: the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_or(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-1, -2, 3], dtype=torch.int8) + >>> torch.bitwise_or(torch.tensor([True, True, False]), torch.tensor([False, True, False])) + tensor([ True, True, False]) + """ + ... 
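A small sketch, grounded in the int8 example above, confirming that torch.bitwise_or applies Python's ``|`` operator elementwise (the choice of values mirrors the docstring example):

import torch

a = torch.tensor([-1, -2, 3], dtype=torch.int8)
b = torch.tensor([1, 0, 3], dtype=torch.int8)

expected = torch.tensor([x | y for x, y in zip(a.tolist(), b.tolist())], dtype=torch.int8)
assert torch.equal(torch.bitwise_or(a, b), expected)  # tensor([-1, -2, 3], dtype=torch.int8)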
+@overload +def bitwise_or(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_or(input, other, *, out=None) -> Tensor + + Computes the bitwise OR of :attr:`input` and :attr:`other`. The input tensor must be of + integral or Boolean types. For bool tensors, it computes the logical OR. + + Args: + input: the first input tensor + other: the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_or(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-1, -2, 3], dtype=torch.int8) + >>> torch.bitwise_or(torch.tensor([True, True, False]), torch.tensor([False, True, False])) + tensor([ True, True, False]) + """ + ... +@overload +def bitwise_right_shift(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_right_shift(input, other, *, out=None) -> Tensor + + Computes the right arithmetic shift of :attr:`input` by :attr:`other` bits. + The input tensor must be of integral type. This operator supports + :ref:`broadcasting to a common shape ` and + :ref:`type promotion `. + In any case, if the value of the right operand is negative or is greater + or equal to the number of bits in the promoted left operand, the behavior is undefined. + + The operation applied is: + + .. math:: + \text{out}_i = \text{input}_i >> \text{other}_i + + Args: + input (Tensor or Scalar): the first input tensor + other (Tensor or Scalar): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_right_shift(torch.tensor([-2, -7, 31], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-1, -7, 3], dtype=torch.int8) + """ + ... +@overload +def bitwise_right_shift(self: Union[Number, _complex], other: Tensor) -> Tensor: + r""" + bitwise_right_shift(input, other, *, out=None) -> Tensor + + Computes the right arithmetic shift of :attr:`input` by :attr:`other` bits. + The input tensor must be of integral type. This operator supports + :ref:`broadcasting to a common shape ` and + :ref:`type promotion `. + In any case, if the value of the right operand is negative or is greater + or equal to the number of bits in the promoted left operand, the behavior is undefined. + + The operation applied is: + + .. math:: + \text{out}_i = \text{input}_i >> \text{other}_i + + Args: + input (Tensor or Scalar): the first input tensor + other (Tensor or Scalar): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_right_shift(torch.tensor([-2, -7, 31], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-1, -7, 3], dtype=torch.int8) + """ + ... +@overload +def bitwise_right_shift(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_right_shift(input, other, *, out=None) -> Tensor + + Computes the right arithmetic shift of :attr:`input` by :attr:`other` bits. + The input tensor must be of integral type. This operator supports + :ref:`broadcasting to a common shape ` and + :ref:`type promotion `. + In any case, if the value of the right operand is negative or is greater + or equal to the number of bits in the promoted left operand, the behavior is undefined. + + The operation applied is: + + .. 
math:: + \text{out}_i = \text{input}_i >> \text{other}_i + + Args: + input (Tensor or Scalar): the first input tensor + other (Tensor or Scalar): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_right_shift(torch.tensor([-2, -7, 31], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-1, -7, 3], dtype=torch.int8) + """ + ... +@overload +def bitwise_xor(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_xor(input, other, *, out=None) -> Tensor + + Computes the bitwise XOR of :attr:`input` and :attr:`other`. The input tensor must be of + integral or Boolean types. For bool tensors, it computes the logical XOR. + + Args: + input: the first input tensor + other: the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_xor(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-2, -2, 0], dtype=torch.int8) + >>> torch.bitwise_xor(torch.tensor([True, True, False]), torch.tensor([False, True, False])) + tensor([ True, False, False]) + """ + ... +@overload +def bitwise_xor(self: Union[Number, _complex], other: Tensor) -> Tensor: + r""" + bitwise_xor(input, other, *, out=None) -> Tensor + + Computes the bitwise XOR of :attr:`input` and :attr:`other`. The input tensor must be of + integral or Boolean types. For bool tensors, it computes the logical XOR. + + Args: + input: the first input tensor + other: the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_xor(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-2, -2, 0], dtype=torch.int8) + >>> torch.bitwise_xor(torch.tensor([True, True, False]), torch.tensor([False, True, False])) + tensor([ True, False, False]) + """ + ... +@overload +def bitwise_xor(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_xor(input, other, *, out=None) -> Tensor + + Computes the bitwise XOR of :attr:`input` and :attr:`other`. The input tensor must be of + integral or Boolean types. For bool tensors, it computes the logical XOR. + + Args: + input: the first input tensor + other: the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_xor(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-2, -2, 0], dtype=torch.int8) + >>> torch.bitwise_xor(torch.tensor([True, True, False]), torch.tensor([False, True, False])) + tensor([ True, False, False]) + """ + ... +@overload +def blackman_window(window_length: _int, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + blackman_window(window_length, periodic=True, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Blackman window function. + + .. math:: + w[n] = 0.42 - 0.5 \cos \left( \frac{2 \pi n}{N - 1} \right) + 0.08 \cos \left( \frac{4 \pi n}{N - 1} \right) + + where :math:`N` is the full window size. + + The input :attr:`window_length` is a positive integer controlling the + returned window size. 
:attr:`periodic` flag determines whether the returned + window trims off the last duplicate value from the symmetric window and is + ready to be used as a periodic window with functions like + :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in + above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have + ``torch.blackman_window(L, periodic=True)`` equal to + ``torch.blackman_window(L + 1, periodic=False)[:-1])``. + + .. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. + + Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. If False, return a symmetric window. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window + """ + ... +@overload +def blackman_window(window_length: _int, periodic: _bool, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + blackman_window(window_length, periodic=True, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Blackman window function. + + .. math:: + w[n] = 0.42 - 0.5 \cos \left( \frac{2 \pi n}{N - 1} \right) + 0.08 \cos \left( \frac{4 \pi n}{N - 1} \right) + + where :math:`N` is the full window size. + + The input :attr:`window_length` is a positive integer controlling the + returned window size. :attr:`periodic` flag determines whether the returned + window trims off the last duplicate value from the symmetric window and is + ready to be used as a periodic window with functions like + :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in + above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have + ``torch.blackman_window(L, periodic=True)`` equal to + ``torch.blackman_window(L + 1, periodic=False)[:-1])``. + + .. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. + + Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. If False, return a symmetric window. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. 
+ device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window + """ + ... +def bmm(input: Tensor, mat2: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + bmm(input, mat2, *, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices stored in :attr:`input` + and :attr:`mat2`. + + :attr:`input` and :attr:`mat2` must be 3-D tensors each containing + the same number of matrices. + + If :attr:`input` is a :math:`(b \times n \times m)` tensor, :attr:`mat2` is a + :math:`(b \times m \times p)` tensor, :attr:`out` will be a + :math:`(b \times n \times p)` tensor. + + .. math:: + \text{out}_i = \text{input}_i \mathbin{@} \text{mat2}_i + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + .. note:: This function does not :ref:`broadcast `. + For broadcasting matrix products, see :func:`torch.matmul`. + + Args: + input (Tensor): the first batch of matrices to be multiplied + mat2 (Tensor): the second batch of matrices to be multiplied + + Keyword Args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> input = torch.randn(10, 3, 4) + >>> mat2 = torch.randn(10, 4, 5) + >>> res = torch.bmm(input, mat2) + >>> res.size() + torch.Size([10, 3, 5]) + """ + ... +def broadcast_to(input: Tensor, size: Sequence[Union[_int, SymInt]]) -> Tensor: + r""" + broadcast_to(input, shape) -> Tensor + + Broadcasts :attr:`input` to the shape :attr:`\shape`. + Equivalent to calling ``input.expand(shape)``. See :meth:`~Tensor.expand` for details. + + Args: + input (Tensor): the input tensor. + shape (list, tuple, or :class:`torch.Size`): the new shape. + + Example:: + + >>> x = torch.tensor([1, 2, 3]) + >>> torch.broadcast_to(x, (3, 3)) + tensor([[1, 2, 3], + [1, 2, 3], + [1, 2, 3]]) + """ + ... +@overload +def bucketize(input: Tensor, boundaries: Tensor, *, out_int32: _bool = False, right: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + bucketize(input, boundaries, *, out_int32=False, right=False, out=None) -> Tensor + + Returns the indices of the buckets to which each value in the :attr:`input` belongs, where the + boundaries of the buckets are set by :attr:`boundaries`. Return a new tensor with the same size + as :attr:`input`. If :attr:`right` is False (default), then the left boundary is open. Note that + this behavior is opposite the behavior of + `numpy.digitize `_. + More formally, the returned index satisfies the following rules: + + .. list-table:: + :widths: 15 85 + :header-rows: 1 + + * - :attr:`right` + - *returned index satisfies* + * - False + - ``boundaries[i-1] < input[m][n]...[l][x] <= boundaries[i]`` + * - True + - ``boundaries[i-1] <= input[m][n]...[l][x] < boundaries[i]`` + + Args: + input (Tensor or Scalar): N-D tensor or a Scalar containing the search value(s). + boundaries (Tensor): 1-D tensor, must contain a strictly increasing sequence, or the return value is undefined. + + Keyword args: + out_int32 (bool, optional): indicate the output data type. 
torch.int32 if True, torch.int64 otherwise. + Default value is False, i.e. default output data type is torch.int64. + right (bool, optional): if False, return the first suitable location that is found. If True, return the + last such index. If no suitable index found, return 0 for non-numerical value + (eg. nan, inf) or the size of :attr:`boundaries` (one pass the last index). + In other words, if False, gets the lower bound index for each value in :attr:`input` + from :attr:`boundaries`. If True, gets the upper bound index instead. + Default value is False. + out (Tensor, optional): the output tensor, must be the same size as :attr:`input` if provided. + + + Example:: + + >>> boundaries = torch.tensor([1, 3, 5, 7, 9]) + >>> boundaries + tensor([1, 3, 5, 7, 9]) + >>> v = torch.tensor([[3, 6, 9], [3, 6, 9]]) + >>> v + tensor([[3, 6, 9], + [3, 6, 9]]) + >>> torch.bucketize(v, boundaries) + tensor([[1, 3, 4], + [1, 3, 4]]) + >>> torch.bucketize(v, boundaries, right=True) + tensor([[2, 3, 5], + [2, 3, 5]]) + """ + ... +@overload +def bucketize(self: Union[Number, _complex], boundaries: Tensor, *, out_int32: _bool = False, right: _bool = False) -> Tensor: + r""" + bucketize(input, boundaries, *, out_int32=False, right=False, out=None) -> Tensor + + Returns the indices of the buckets to which each value in the :attr:`input` belongs, where the + boundaries of the buckets are set by :attr:`boundaries`. Return a new tensor with the same size + as :attr:`input`. If :attr:`right` is False (default), then the left boundary is open. Note that + this behavior is opposite the behavior of + `numpy.digitize `_. + More formally, the returned index satisfies the following rules: + + .. list-table:: + :widths: 15 85 + :header-rows: 1 + + * - :attr:`right` + - *returned index satisfies* + * - False + - ``boundaries[i-1] < input[m][n]...[l][x] <= boundaries[i]`` + * - True + - ``boundaries[i-1] <= input[m][n]...[l][x] < boundaries[i]`` + + Args: + input (Tensor or Scalar): N-D tensor or a Scalar containing the search value(s). + boundaries (Tensor): 1-D tensor, must contain a strictly increasing sequence, or the return value is undefined. + + Keyword args: + out_int32 (bool, optional): indicate the output data type. torch.int32 if True, torch.int64 otherwise. + Default value is False, i.e. default output data type is torch.int64. + right (bool, optional): if False, return the first suitable location that is found. If True, return the + last such index. If no suitable index found, return 0 for non-numerical value + (eg. nan, inf) or the size of :attr:`boundaries` (one pass the last index). + In other words, if False, gets the lower bound index for each value in :attr:`input` + from :attr:`boundaries`. If True, gets the upper bound index instead. + Default value is False. + out (Tensor, optional): the output tensor, must be the same size as :attr:`input` if provided. + + + Example:: + + >>> boundaries = torch.tensor([1, 3, 5, 7, 9]) + >>> boundaries + tensor([1, 3, 5, 7, 9]) + >>> v = torch.tensor([[3, 6, 9], [3, 6, 9]]) + >>> v + tensor([[3, 6, 9], + [3, 6, 9]]) + >>> torch.bucketize(v, boundaries) + tensor([[1, 3, 4], + [1, 3, 4]]) + >>> torch.bucketize(v, boundaries, right=True) + tensor([[2, 3, 5], + [2, 3, 5]]) + """ + ... +def can_cast(from_: _dtype, to: _dtype) -> _bool: + r""" + can_cast(from, to) -> bool + + Determines if a type conversion is allowed under PyTorch casting rules + described in the type promotion :ref:`documentation `. + + Args: + from (dtype): The original :class:`torch.dtype`. 
+ to (dtype): The target :class:`torch.dtype`. + + Example:: + + >>> torch.can_cast(torch.double, torch.float) + True + >>> torch.can_cast(torch.float, torch.int) + False + """ + ... +@overload +def cat(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: _int = 0, *, out: Optional[Tensor] = None) -> Tensor: + r""" + cat(tensors, dim=0, *, out=None) -> Tensor + + Concatenates the given sequence of :attr:`seq` tensors in the given dimension. + All tensors must either have the same shape (except in the concatenating + dimension) or be a 1-D empty tensor with size ``(0,)``. + + :func:`torch.cat` can be seen as an inverse operation for :func:`torch.split` + and :func:`torch.chunk`. + + :func:`torch.cat` can be best understood via examples. + + .. seealso:: + + :func:`torch.stack` concatenates the given sequence along a new dimension. + + Args: + tensors (sequence of Tensors): any python sequence of tensors of the same type. + Non-empty tensors provided must have the same shape, except in the + cat dimension. + dim (int, optional): the dimension over which the tensors are concatenated + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> x = torch.randn(2, 3) + >>> x + tensor([[ 0.6580, -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497]]) + >>> torch.cat((x, x, x), 0) + tensor([[ 0.6580, -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497], + [ 0.6580, -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497], + [ 0.6580, -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497]]) + >>> torch.cat((x, x, x), 1) + tensor([[ 0.6580, -1.0969, -0.4614, 0.6580, -1.0969, -0.4614, 0.6580, + -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497, -0.1034, -0.5790, 0.1497, -0.1034, + -0.5790, 0.1497]]) + """ + ... +@overload +def cat(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: Union[str, ellipsis, None], *, out: Optional[Tensor] = None) -> Tensor: + r""" + cat(tensors, dim=0, *, out=None) -> Tensor + + Concatenates the given sequence of :attr:`seq` tensors in the given dimension. + All tensors must either have the same shape (except in the concatenating + dimension) or be a 1-D empty tensor with size ``(0,)``. + + :func:`torch.cat` can be seen as an inverse operation for :func:`torch.split` + and :func:`torch.chunk`. + + :func:`torch.cat` can be best understood via examples. + + .. seealso:: + + :func:`torch.stack` concatenates the given sequence along a new dimension. + + Args: + tensors (sequence of Tensors): any python sequence of tensors of the same type. + Non-empty tensors provided must have the same shape, except in the + cat dimension. + dim (int, optional): the dimension over which the tensors are concatenated + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> x = torch.randn(2, 3) + >>> x + tensor([[ 0.6580, -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497]]) + >>> torch.cat((x, x, x), 0) + tensor([[ 0.6580, -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497], + [ 0.6580, -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497], + [ 0.6580, -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497]]) + >>> torch.cat((x, x, x), 1) + tensor([[ 0.6580, -1.0969, -0.4614, 0.6580, -1.0969, -0.4614, 0.6580, + -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497, -0.1034, -0.5790, 0.1497, -0.1034, + -0.5790, 0.1497]]) + """ + ... +def ccol_indices_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... 
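To make the shape behaviour of torch.cat and the casting rules of torch.can_cast concrete, here is a brief sketch using only shapes and dtypes (sizes are illustrative):

import torch

x = torch.randn(2, 3)
assert torch.cat((x, x, x), dim=0).shape == torch.Size([6, 3])  # grows the cat dimension 0
assert torch.cat((x, x, x), dim=1).shape == torch.Size([2, 9])  # grows the cat dimension 1

assert torch.can_cast(torch.double, torch.float)   # narrowing float-to-float cast is allowed
assert not torch.can_cast(torch.float, torch.int)  # float -> int is not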
+def ceil(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + ceil(input, *, out=None) -> Tensor + + Returns a new tensor with the ceil of the elements of :attr:`input`, + the smallest integer greater than or equal to each element. + + For integer inputs, follows the array-api convention of returning a + copy of the input tensor. + + .. math:: + \text{out}_{i} = \left\lceil \text{input}_{i} \right\rceil + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-0.6341, -1.4208, -1.0900, 0.5826]) + >>> torch.ceil(a) + tensor([-0., -1., -1., 1.]) + """ + ... +def ceil_(input: Tensor) -> Tensor: ... +def celu(input: Tensor, alpha: Union[Number, _complex] = 1.0) -> Tensor: ... +def celu_(input: Tensor, alpha: Union[Number, _complex] = 1.0) -> Tensor: ... +def channel_shuffle(input: Tensor, groups: Union[_int, SymInt]) -> Tensor: ... +def cholesky(input: Tensor, upper: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + cholesky(input, upper=False, *, out=None) -> Tensor + + Computes the Cholesky decomposition of a symmetric positive-definite + matrix :math:`A` or for batches of symmetric positive-definite matrices. + + If :attr:`upper` is ``True``, the returned matrix ``U`` is upper-triangular, and + the decomposition has the form: + + .. math:: + + A = U^TU + + If :attr:`upper` is ``False``, the returned matrix ``L`` is lower-triangular, and + the decomposition has the form: + + .. math:: + + A = LL^T + + If :attr:`upper` is ``True``, and :math:`A` is a batch of symmetric positive-definite + matrices, then the returned tensor will be composed of upper-triangular Cholesky factors + of each of the individual matrices. Similarly, when :attr:`upper` is ``False``, the returned + tensor will be composed of lower-triangular Cholesky factors of each of the individual + matrices. + + .. warning:: + + :func:`torch.cholesky` is deprecated in favor of :func:`torch.linalg.cholesky` + and will be removed in a future PyTorch release. + + ``L = torch.cholesky(A)`` should be replaced with + + .. code:: python + + L = torch.linalg.cholesky(A) + + ``U = torch.cholesky(A, upper=True)`` should be replaced with + + .. code:: python + + U = torch.linalg.cholesky(A).mH + + This transform will produce equivalent results for all valid (symmetric positive definite) inputs. + + Args: + input (Tensor): the input tensor :math:`A` of size :math:`(*, n, n)` where `*` is zero or more + batch dimensions consisting of symmetric positive-definite matrices. + upper (bool, optional): flag that indicates whether to return a + upper or lower triangular matrix. Default: ``False`` + + Keyword args: + out (Tensor, optional): the output matrix + + Example:: + + >>> a = torch.randn(3, 3) + >>> a = a @ a.mT + 1e-3 # make symmetric positive-definite + >>> l = torch.cholesky(a) + >>> a + tensor([[ 2.4112, -0.7486, 1.4551], + [-0.7486, 1.3544, 0.1294], + [ 1.4551, 0.1294, 1.6724]]) + >>> l + tensor([[ 1.5528, 0.0000, 0.0000], + [-0.4821, 1.0592, 0.0000], + [ 0.9371, 0.5487, 0.7023]]) + >>> l @ l.mT + tensor([[ 2.4112, -0.7486, 1.4551], + [-0.7486, 1.3544, 0.1294], + [ 1.4551, 0.1294, 1.6724]]) + >>> a = torch.randn(3, 2, 2) # Example for batched input + >>> a = a @ a.mT + 1e-03 # make symmetric positive-definite + >>> l = torch.cholesky(a) + >>> z = l @ l.mT + >>> torch.dist(z, a) + tensor(2.3842e-07) + """ + ... 
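The deprecation note above recommends torch.linalg.cholesky as the replacement for torch.cholesky; a short sketch of the suggested migration, assuming a well-conditioned float32 input:

import torch

A = torch.randn(3, 3)
A = A @ A.mT + 1e-3 * torch.eye(3)   # make symmetric positive-definite

L = torch.linalg.cholesky(A)         # lower factor, replaces torch.cholesky(A)
U = L.mH                             # upper factor, replaces torch.cholesky(A, upper=True)

assert torch.allclose(L @ L.mT, A, atol=1e-5)
assert torch.allclose(U.mT @ U, A, atol=1e-5)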
+def cholesky_inverse(input: Tensor, upper: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + cholesky_inverse(L, upper=False, *, out=None) -> Tensor + + Computes the inverse of a complex Hermitian or real symmetric + positive-definite matrix given its Cholesky decomposition. + + Let :math:`A` be a complex Hermitian or real symmetric positive-definite matrix, + and :math:`L` its Cholesky decomposition such that: + + .. math:: + + A = LL^{\text{H}} + + where :math:`L^{\text{H}}` is the conjugate transpose when :math:`L` is complex, + and the transpose when :math:`L` is real-valued. + + Computes the inverse matrix :math:`A^{-1}`. + + Supports input of float, double, cfloat and cdouble dtypes. + Also supports batches of matrices, and if :math:`A` is a batch of matrices + then the output has the same batch dimensions. + + Args: + L (Tensor): tensor of shape `(*, n, n)` where `*` is zero or more batch dimensions + consisting of lower or upper triangular Cholesky decompositions of + symmetric or Hermitian positive-definite matrices. + upper (bool, optional): flag that indicates whether :math:`L` is lower triangular + or upper triangular. Default: ``False`` + + Keyword args: + out (Tensor, optional): output tensor. Ignored if `None`. Default: `None`. + + Example:: + + >>> A = torch.randn(3, 3) + >>> A = A @ A.T + torch.eye(3) * 1e-3 # Creates a symmetric positive-definite matrix + >>> L = torch.linalg.cholesky(A) # Extract Cholesky decomposition + >>> torch.cholesky_inverse(L) + tensor([[ 1.9314, 1.2251, -0.0889], + [ 1.2251, 2.4439, 0.2122], + [-0.0889, 0.2122, 0.1412]]) + >>> A.inverse() + tensor([[ 1.9314, 1.2251, -0.0889], + [ 1.2251, 2.4439, 0.2122], + [-0.0889, 0.2122, 0.1412]]) + + >>> A = torch.randn(3, 2, 2, dtype=torch.complex64) + >>> A = A @ A.mH + torch.eye(2) * 1e-3 # Batch of Hermitian positive-definite matrices + >>> L = torch.linalg.cholesky(A) + >>> torch.dist(torch.inverse(A), torch.cholesky_inverse(L)) + tensor(5.6358e-7) + """ + ... +def cholesky_solve(input: Tensor, input2: Tensor, upper: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + cholesky_solve(B, L, upper=False, *, out=None) -> Tensor + + Computes the solution of a system of linear equations with complex Hermitian + or real symmetric positive-definite lhs given its Cholesky decomposition. + + Let :math:`A` be a complex Hermitian or real symmetric positive-definite matrix, + and :math:`L` its Cholesky decomposition such that: + + .. math:: + + A = LL^{\text{H}} + + where :math:`L^{\text{H}}` is the conjugate transpose when :math:`L` is complex, + and the transpose when :math:`L` is real-valued. + + Returns the solution :math:`X` of the following linear system: + + .. math:: + + AX = B + + Supports inputs of float, double, cfloat and cdouble dtypes. + Also supports batches of matrices, and if :math:`A` or :math:`B` is a batch of matrices + then the output has the same batch dimensions. + + Args: + B (Tensor): right-hand side tensor of shape `(*, n, k)` + where :math:`*` is zero or more batch dimensions + L (Tensor): tensor of shape `(*, n, n)` where `*` is zero or more batch dimensions + consisting of lower or upper triangular Cholesky decompositions of + symmetric or Hermitian positive-definite matrices. + upper (bool, optional): flag that indicates whether :math:`L` is lower triangular + or upper triangular. Default: ``False``. + + Keyword args: + out (Tensor, optional): output tensor. Ignored if `None`. Default: `None`. 
+ + Example:: + + >>> A = torch.randn(3, 3) + >>> A = A @ A.T + torch.eye(3) * 1e-3 # Creates a symmetric positive-definite matrix + >>> L = torch.linalg.cholesky(A) # Extract Cholesky decomposition + >>> B = torch.randn(3, 2) + >>> torch.cholesky_solve(B, L) + tensor([[ -8.1625, 19.6097], + [ -5.8398, 14.2387], + [ -4.3771, 10.4173]]) + >>> A.inverse() @ B + tensor([[ -8.1626, 19.6097], + [ -5.8398, 14.2387], + [ -4.3771, 10.4173]]) + + >>> A = torch.randn(3, 2, 2, dtype=torch.complex64) + >>> A = A @ A.mH + torch.eye(2) * 1e-3 # Batch of Hermitian positive-definite matrices + >>> L = torch.linalg.cholesky(A) + >>> B = torch.randn(2, 1, dtype=torch.complex64) + >>> X = torch.cholesky_solve(B, L) + >>> torch.dist(X, A.inverse() @ B) + tensor(1.6881e-5) + """ + ... +def choose_qparams_optimized(input: Tensor, numel: _int, n_bins: _int, ratio: _float, bit_width: _int) -> Tuple[Tensor, Tensor]: ... +def chunk(input: Tensor, chunks: _int, dim: _int = 0) -> Tuple[Tensor, ...]: + r""" + chunk(input, chunks, dim=0) -> List of Tensors + + Attempts to split a tensor into the specified number of chunks. Each chunk is a view of + the input tensor. + + + .. note:: + + This function may return fewer than the specified number of chunks! + + .. seealso:: + + :func:`torch.tensor_split` a function that always returns exactly the specified number of chunks + + If the tensor size along the given dimension :attr:`dim` is divisible by :attr:`chunks`, + all returned chunks will be the same size. + If the tensor size along the given dimension :attr:`dim` is not divisible by :attr:`chunks`, + all returned chunks will be the same size, except the last one. + If such division is not possible, this function may return fewer + than the specified number of chunks. + + Arguments: + input (Tensor): the tensor to split + chunks (int): number of chunks to return + dim (int): dimension along which to split the tensor + + Example: + >>> torch.arange(11).chunk(6) + (tensor([0, 1]), + tensor([2, 3]), + tensor([4, 5]), + tensor([6, 7]), + tensor([8, 9]), + tensor([10])) + >>> torch.arange(12).chunk(6) + (tensor([0, 1]), + tensor([2, 3]), + tensor([4, 5]), + tensor([6, 7]), + tensor([8, 9]), + tensor([10, 11])) + >>> torch.arange(13).chunk(6) + (tensor([0, 1, 2]), + tensor([3, 4, 5]), + tensor([6, 7, 8]), + tensor([ 9, 10, 11]), + tensor([12])) + """ + ... +@overload +def clamp(input: Tensor, min: Optional[Tensor] = None, max: Optional[Tensor] = None, *, out: Optional[Tensor] = None) -> Tensor: + r""" + clamp(input, min=None, max=None, *, out=None) -> Tensor + + Clamps all elements in :attr:`input` into the range `[` :attr:`min`, :attr:`max` `]`. + Letting min_value and max_value be :attr:`min` and :attr:`max`, respectively, this returns: + + .. math:: + y_i = \min(\max(x_i, \text{min\_value}_i), \text{max\_value}_i) + + If :attr:`min` is ``None``, there is no lower bound. + Or, if :attr:`max` is ``None`` there is no upper bound. + + + .. note:: + If :attr:`min` is greater than :attr:`max` :func:`torch.clamp(..., min, max) ` + sets all elements in :attr:`input` to the value of :attr:`max`. + + Args: + input (Tensor): the input tensor. + min (Number or Tensor, optional): lower-bound of the range to be clamped to + max (Number or Tensor, optional): upper-bound of the range to be clamped to + + Keyword args: + out (Tensor, optional): the output tensor. 
+ + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-1.7120, 0.1734, -0.0478, -0.0922]) + >>> torch.clamp(a, min=-0.5, max=0.5) + tensor([-0.5000, 0.1734, -0.0478, -0.0922]) + + >>> min = torch.linspace(-1, 1, steps=4) + >>> torch.clamp(a, min=min) + tensor([-1.0000, 0.1734, 0.3333, 1.0000]) + """ + ... +@overload +def clamp(input: Tensor, min: Optional[Union[Number, _complex]] = None, max: Optional[Union[Number, _complex]] = None, *, out: Optional[Tensor] = None) -> Tensor: + r""" + clamp(input, min=None, max=None, *, out=None) -> Tensor + + Clamps all elements in :attr:`input` into the range `[` :attr:`min`, :attr:`max` `]`. + Letting min_value and max_value be :attr:`min` and :attr:`max`, respectively, this returns: + + .. math:: + y_i = \min(\max(x_i, \text{min\_value}_i), \text{max\_value}_i) + + If :attr:`min` is ``None``, there is no lower bound. + Or, if :attr:`max` is ``None`` there is no upper bound. + + + .. note:: + If :attr:`min` is greater than :attr:`max` :func:`torch.clamp(..., min, max) ` + sets all elements in :attr:`input` to the value of :attr:`max`. + + Args: + input (Tensor): the input tensor. + min (Number or Tensor, optional): lower-bound of the range to be clamped to + max (Number or Tensor, optional): upper-bound of the range to be clamped to + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-1.7120, 0.1734, -0.0478, -0.0922]) + >>> torch.clamp(a, min=-0.5, max=0.5) + tensor([-0.5000, 0.1734, -0.0478, -0.0922]) + + >>> min = torch.linspace(-1, 1, steps=4) + >>> torch.clamp(a, min=min) + tensor([-1.0000, 0.1734, 0.3333, 1.0000]) + """ + ... +@overload +def clamp_(input: Tensor, min: Optional[Tensor] = None, max: Optional[Tensor] = None) -> Tensor: ... +@overload +def clamp_(input: Tensor, min: Optional[Union[Number, _complex]] = None, max: Optional[Union[Number, _complex]] = None) -> Tensor: ... +@overload +def clamp_max(input: Tensor, max: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +@overload +def clamp_max(input: Tensor, max: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: ... +@overload +def clamp_max_(input: Tensor, max: Tensor) -> Tensor: ... +@overload +def clamp_max_(input: Tensor, max: Union[Number, _complex]) -> Tensor: ... +@overload +def clamp_min(input: Tensor, min: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +@overload +def clamp_min(input: Tensor, min: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: ... +@overload +def clamp_min_(input: Tensor, min: Tensor) -> Tensor: ... +@overload +def clamp_min_(input: Tensor, min: Union[Number, _complex]) -> Tensor: ... +@overload +def clip(input: Tensor, min: Optional[Tensor] = None, max: Optional[Tensor] = None, *, out: Optional[Tensor] = None) -> Tensor: + r""" + clip(input, min=None, max=None, *, out=None) -> Tensor + + Alias for :func:`torch.clamp`. + """ + ... +@overload +def clip(input: Tensor, min: Optional[Union[Number, _complex]] = None, max: Optional[Union[Number, _complex]] = None, *, out: Optional[Tensor] = None) -> Tensor: + r""" + clip(input, min=None, max=None, *, out=None) -> Tensor + + Alias for :func:`torch.clamp`. + """ + ... +@overload +def clip_(input: Tensor, min: Optional[Tensor] = None, max: Optional[Tensor] = None) -> Tensor: ... +@overload +def clip_(input: Tensor, min: Optional[Union[Number, _complex]] = None, max: Optional[Union[Number, _complex]] = None) -> Tensor: ... 
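+# Illustrative sketch (hypothetical `_demo_*` helper, not part of the generated stub):
+# checks the elementwise formula y_i = min(max(x_i, min_i), max_i) documented above when
+# tensor-valued bounds are broadcast against the input, and that torch.clip behaves as an
+# alias of torch.clamp.
+def _demo_clamp_tensor_bounds() -> None:
+    import torch  # repeated so the sketch is self-contained
+    a = torch.randn(4)
+    lo = torch.linspace(-1, 1, steps=4)       # per-element lower bounds
+    hi = torch.full((4,), 0.5)                # per-element upper bounds (lo > hi at the last slot)
+    out = torch.clamp(a, min=lo, max=hi)      # where lo > hi, the result is hi, as noted above
+    assert torch.allclose(out, torch.minimum(torch.maximum(a, lo), hi))
+    assert torch.equal(torch.clip(a, lo, hi), out)   # clip is an alias of clamp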
+def clone(input: Tensor, *, memory_format: Optional[memory_format] = None) -> Tensor: + r""" + clone(input, *, memory_format=torch.preserve_format) -> Tensor + + Returns a copy of :attr:`input`. + + .. note:: + + This function is differentiable, so gradients will flow back from the + result of this operation to :attr:`input`. To create a tensor without an + autograd relationship to :attr:`input` see :meth:`~Tensor.detach`. + + Args: + input (Tensor): the input tensor. + + Keyword args: + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned tensor. Default: ``torch.preserve_format``. + """ + ... +def col_indices_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.col_indices`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def column_stack(tensors: Union[Tuple[Tensor, ...], List[Tensor]], *, out: Optional[Tensor] = None) -> Tensor: + r""" + column_stack(tensors, *, out=None) -> Tensor + + Creates a new tensor by horizontally stacking the tensors in :attr:`tensors`. + + Equivalent to ``torch.hstack(tensors)``, except each zero or one dimensional tensor ``t`` + in :attr:`tensors` is first reshaped into a ``(t.numel(), 1)`` column before being stacked horizontally. + + Args: + tensors (sequence of Tensors): sequence of tensors to concatenate + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([1, 2, 3]) + >>> b = torch.tensor([4, 5, 6]) + >>> torch.column_stack((a, b)) + tensor([[1, 4], + [2, 5], + [3, 6]]) + >>> a = torch.arange(5) + >>> b = torch.arange(10).reshape(5, 2) + >>> torch.column_stack((a, b, b)) + tensor([[0, 0, 1, 0, 1], + [1, 2, 3, 2, 3], + [2, 4, 5, 4, 5], + [3, 6, 7, 6, 7], + [4, 8, 9, 8, 9]]) + """ + ... +def combinations(input: Tensor, r: _int = 2, with_replacement: _bool = False) -> Tensor: + r""" + combinations(input, r=2, with_replacement=False) -> seq + + Compute combinations of length :math:`r` of the given tensor. The behavior is similar to + python's `itertools.combinations` when `with_replacement` is set to `False`, and + `itertools.combinations_with_replacement` when `with_replacement` is set to `True`. + + Arguments: + input (Tensor): 1D vector. + r (int, optional): number of elements to combine + with_replacement (bool, optional): whether to allow duplication in combination + + Returns: + Tensor: A tensor equivalent to converting all the input tensors into lists, do + `itertools.combinations` or `itertools.combinations_with_replacement` on these + lists, and finally convert the resulting list into tensor. + + Example:: + + >>> a = [1, 2, 3] + >>> list(itertools.combinations(a, r=2)) + [(1, 2), (1, 3), (2, 3)] + >>> list(itertools.combinations(a, r=3)) + [(1, 2, 3)] + >>> list(itertools.combinations_with_replacement(a, r=2)) + [(1, 1), (1, 2), (1, 3), (2, 2), (2, 3), (3, 3)] + >>> tensor_a = torch.tensor(a) + >>> torch.combinations(tensor_a) + tensor([[1, 2], + [1, 3], + [2, 3]]) + >>> torch.combinations(tensor_a, r=3) + tensor([[1, 2, 3]]) + >>> torch.combinations(tensor_a, with_replacement=True) + tensor([[1, 1], + [1, 2], + [1, 3], + [2, 2], + [2, 3], + [3, 3]]) + """ + ... +def complex(real: Tensor, imag: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + complex(real, imag, *, out=None) -> Tensor + + Constructs a complex tensor with its real part equal to :attr:`real` and its + imaginary part equal to :attr:`imag`. 
+ + Args: + real (Tensor): The real part of the complex tensor. Must be half, float or double. + imag (Tensor): The imaginary part of the complex tensor. Must be same dtype + as :attr:`real`. + + Keyword args: + out (Tensor): If the inputs are ``torch.float32``, must be + ``torch.complex64``. If the inputs are ``torch.float64``, must be + ``torch.complex128``. + + Example:: + + >>> real = torch.tensor([1, 2], dtype=torch.float32) + >>> imag = torch.tensor([3, 4], dtype=torch.float32) + >>> z = torch.complex(real, imag) + >>> z + tensor([(1.+3.j), (2.+4.j)]) + >>> z.dtype + torch.complex64 + """ + ... +@overload +def concat(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: _int = 0, *, out: Optional[Tensor] = None) -> Tensor: + r""" + concat(tensors, dim=0, *, out=None) -> Tensor + + Alias of :func:`torch.cat`. + """ + ... +@overload +def concat(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: Union[str, ellipsis, None], *, out: Optional[Tensor] = None) -> Tensor: + r""" + concat(tensors, dim=0, *, out=None) -> Tensor + + Alias of :func:`torch.cat`. + """ + ... +@overload +def concatenate(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: _int = 0, *, out: Optional[Tensor] = None) -> Tensor: + r""" + concatenate(tensors, axis=0, out=None) -> Tensor + + Alias of :func:`torch.cat`. + """ + ... +@overload +def concatenate(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: Union[str, ellipsis, None], *, out: Optional[Tensor] = None) -> Tensor: + r""" + concatenate(tensors, axis=0, out=None) -> Tensor + + Alias of :func:`torch.cat`. + """ + ... +def conj(input: Tensor) -> Tensor: + r""" + conj(input) -> Tensor + + Returns a view of :attr:`input` with a flipped conjugate bit. If :attr:`input` has a non-complex dtype, + this function just returns :attr:`input`. + + .. note:: + :func:`torch.conj` performs a lazy conjugation, but the actual conjugated tensor can be materialized + at any time using :func:`torch.resolve_conj`. + + .. warning:: In the future, :func:`torch.conj` may return a non-writeable view for an :attr:`input` of + non-complex dtype. It's recommended that programs not modify the tensor returned by :func:`torch.conj_physical` + when :attr:`input` is of non-complex dtype to be compatible with this change. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> x = torch.tensor([-1 + 1j, -2 + 2j, 3 - 3j]) + >>> x.is_conj() + False + >>> y = torch.conj(x) + >>> y.is_conj() + True + """ + ... +def conj_physical(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + conj_physical(input, *, out=None) -> Tensor + + Computes the element-wise conjugate of the given :attr:`input` tensor. + If :attr:`input` has a non-complex dtype, this function just returns :attr:`input`. + + .. note:: + This performs the conjugate operation regardless of the fact conjugate bit is set or not. + + .. warning:: In the future, :func:`torch.conj_physical` may return a non-writeable view for an :attr:`input` of + non-complex dtype. It's recommended that programs not modify the tensor returned by :func:`torch.conj_physical` + when :attr:`input` is of non-complex dtype to be compatible with this change. + + .. math:: + \text{out}_{i} = conj(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.conj_physical(torch.tensor([-1 + 1j, -2 + 2j, 3 - 3j])) + tensor([-1 - 1j, -2 - 2j, 3 + 3j]) + """ + ... +def conj_physical_(input: Tensor) -> Tensor: ... 
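+# Illustrative sketch (hypothetical `_demo_*` helper, not part of the generated stub):
+# contrasts the lazy conjugation of torch.conj, which only flips the conjugate bit, with
+# torch.conj_physical, which materializes the conjugated values, as the docstrings above describe.
+def _demo_conj_vs_conj_physical() -> None:
+    import torch  # repeated so the sketch is self-contained
+    x = torch.tensor([-1 + 1j, -2 + 2j, 3 - 3j])
+    lazy = torch.conj(x)                      # view with the conjugate bit set
+    eager = torch.conj_physical(x)            # new tensor holding the conjugated values
+    assert lazy.is_conj() and not eager.is_conj()
+    assert torch.equal(torch.resolve_conj(lazy), eager)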
+def constant_pad_nd(input: Tensor, pad: Sequence[Union[_int, SymInt]], value: Union[Number, _complex] = 0) -> Tensor: ... +@overload +def conv1d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, groups: Union[_int, SymInt] = 1) -> Tensor: ... +@overload +def conv1d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: str = "valid", dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, groups: Union[_int, SymInt] = 1) -> Tensor: ... +@overload +def conv2d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, groups: Union[_int, SymInt] = 1) -> Tensor: ... +@overload +def conv2d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: str = "valid", dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, groups: Union[_int, SymInt] = 1) -> Tensor: ... +@overload +def conv3d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, groups: Union[_int, SymInt] = 1) -> Tensor: ... +@overload +def conv3d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: str = "valid", dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, groups: Union[_int, SymInt] = 1) -> Tensor: ... +def conv_tbc(input: Tensor, weight: Tensor, bias: Tensor, pad: _int = 0) -> Tensor: ... +def conv_transpose1d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, output_padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, groups: Union[_int, SymInt] = 1, dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1) -> Tensor: ... +def conv_transpose2d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, output_padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, groups: Union[_int, SymInt] = 1, dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1) -> Tensor: ... +def conv_transpose3d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, output_padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, groups: Union[_int, SymInt] = 1, dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1) -> Tensor: ... 
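+# Illustrative sketch (hypothetical `_demo_*` helper, not part of the generated stub):
+# exercises the two conv2d overloads declared above -- integer padding versus the string
+# form (padding="valid" / "same"; "same" assumes stride 1) -- and checks the resulting
+# spatial shapes.
+def _demo_conv2d_padding() -> None:
+    import torch  # repeated so the sketch is self-contained
+    x = torch.randn(1, 3, 8, 8)               # (N, C_in, H, W)
+    w = torch.randn(6, 3, 3, 3)               # (C_out, C_in, kH, kW)
+    b = torch.zeros(6)
+    y_valid = torch.conv2d(x, w, b, stride=1, padding="valid")   # no padding
+    y_same = torch.conv2d(x, w, b, stride=1, padding="same")     # output keeps H and W
+    y_int = torch.conv2d(x, w, b, stride=1, padding=1)           # explicit symmetric padding of 1
+    assert y_valid.shape == (1, 6, 6, 6)
+    assert y_same.shape == (1, 6, 8, 8)
+    assert torch.allclose(y_same, y_int)      # for a 3x3 kernel at stride 1, "same" equals padding=1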
+def convolution(input: Tensor, weight: Tensor, bias: Optional[Tensor], stride: Sequence[Union[_int, SymInt]], padding: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], transposed: _bool, output_padding: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ... +@overload +def copysign(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + copysign(input, other, *, out=None) -> Tensor + + Create a new floating-point tensor with the magnitude of :attr:`input` and the sign of :attr:`other`, elementwise. + + .. math:: + \text{out}_{i} = \begin{cases} + -|\text{input}_{i}| & \text{if } \text{other}_{i} \leq -0.0 \\ + |\text{input}_{i}| & \text{if } \text{other}_{i} \geq 0.0 \\ + \end{cases} + + + Supports :ref:`broadcasting to a common shape `, + and integer and float inputs. + + Args: + input (Tensor): magnitudes. + other (Tensor or Number): contains value(s) whose signbit(s) are + applied to the magnitudes in :attr:`input`. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(5) + >>> a + tensor([-1.2557, -0.0026, -0.5387, 0.4740, -0.9244]) + >>> torch.copysign(a, 1) + tensor([1.2557, 0.0026, 0.5387, 0.4740, 0.9244]) + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.7079, 0.2778, -1.0249, 0.5719], + [-0.0059, -0.2600, -0.4475, -1.3948], + [ 0.3667, -0.9567, -2.5757, -0.1751], + [ 0.2046, -0.0742, 0.2998, -0.1054]]) + >>> b = torch.randn(4) + tensor([ 0.2373, 0.3120, 0.3190, -1.1128]) + >>> torch.copysign(a, b) + tensor([[ 0.7079, 0.2778, 1.0249, -0.5719], + [ 0.0059, 0.2600, 0.4475, -1.3948], + [ 0.3667, 0.9567, 2.5757, -0.1751], + [ 0.2046, 0.0742, 0.2998, -0.1054]]) + >>> a = torch.tensor([1.]) + >>> b = torch.tensor([-0.]) + >>> torch.copysign(a, b) + tensor([-1.]) + + .. note:: + copysign handles signed zeros. If the other argument has a negative zero (-0), + the corresponding output value will be negative. + """ + ... +@overload +def copysign(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + copysign(input, other, *, out=None) -> Tensor + + Create a new floating-point tensor with the magnitude of :attr:`input` and the sign of :attr:`other`, elementwise. + + .. math:: + \text{out}_{i} = \begin{cases} + -|\text{input}_{i}| & \text{if } \text{other}_{i} \leq -0.0 \\ + |\text{input}_{i}| & \text{if } \text{other}_{i} \geq 0.0 \\ + \end{cases} + + + Supports :ref:`broadcasting to a common shape `, + and integer and float inputs. + + Args: + input (Tensor): magnitudes. + other (Tensor or Number): contains value(s) whose signbit(s) are + applied to the magnitudes in :attr:`input`. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(5) + >>> a + tensor([-1.2557, -0.0026, -0.5387, 0.4740, -0.9244]) + >>> torch.copysign(a, 1) + tensor([1.2557, 0.0026, 0.5387, 0.4740, 0.9244]) + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.7079, 0.2778, -1.0249, 0.5719], + [-0.0059, -0.2600, -0.4475, -1.3948], + [ 0.3667, -0.9567, -2.5757, -0.1751], + [ 0.2046, -0.0742, 0.2998, -0.1054]]) + >>> b = torch.randn(4) + tensor([ 0.2373, 0.3120, 0.3190, -1.1128]) + >>> torch.copysign(a, b) + tensor([[ 0.7079, 0.2778, 1.0249, -0.5719], + [ 0.0059, 0.2600, 0.4475, -1.3948], + [ 0.3667, 0.9567, 2.5757, -0.1751], + [ 0.2046, 0.0742, 0.2998, -0.1054]]) + >>> a = torch.tensor([1.]) + >>> b = torch.tensor([-0.]) + >>> torch.copysign(a, b) + tensor([-1.]) + + .. note:: + copysign handles signed zeros. 
If the other argument has a negative zero (-0), + the corresponding output value will be negative. + """ + ... +def corrcoef(input: Tensor) -> Tensor: + r""" + corrcoef(input) -> Tensor + + Estimates the Pearson product-moment correlation coefficient matrix of the variables given by the :attr:`input` matrix, + where rows are the variables and columns are the observations. + + .. note:: + + The correlation coefficient matrix R is computed using the covariance matrix C as given by + :math:`R_{ij} = \frac{ C_{ij} } { \sqrt{ C_{ii} * C_{jj} } }` + + .. note:: + + Due to floating point rounding, the resulting array may not be Hermitian and its diagonal elements may not be 1. + The real and imaginary values are clipped to the interval [-1, 1] in an attempt to improve this situation. + + Args: + input (Tensor): A 2D matrix containing multiple variables and observations, or a + Scalar or 1D vector representing a single variable. + + Returns: + (Tensor) The correlation coefficient matrix of the variables. + + .. seealso:: + + :func:`torch.cov` covariance matrix. + + Example:: + + >>> x = torch.tensor([[0, 1, 2], [2, 1, 0]]) + >>> torch.corrcoef(x) + tensor([[ 1., -1.], + [-1., 1.]]) + >>> x = torch.randn(2, 4) + >>> x + tensor([[-0.2678, -0.0908, -0.3766, 0.2780], + [-0.5812, 0.1535, 0.2387, 0.2350]]) + >>> torch.corrcoef(x) + tensor([[1.0000, 0.3582], + [0.3582, 1.0000]]) + >>> torch.corrcoef(x[0]) + tensor(1.) + """ + ... +def cos(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + cos(input, *, out=None) -> Tensor + + Returns a new tensor with the cosine of the elements of :attr:`input`. + + .. math:: + \text{out}_{i} = \cos(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 1.4309, 1.2706, -0.8562, 0.9796]) + >>> torch.cos(a) + tensor([ 0.1395, 0.2957, 0.6553, 0.5574]) + """ + ... +def cos_(input: Tensor) -> Tensor: ... +def cosh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + cosh(input, *, out=None) -> Tensor + + Returns a new tensor with the hyperbolic cosine of the elements of + :attr:`input`. + + .. math:: + \text{out}_{i} = \cosh(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.1632, 1.1835, -0.6979, -0.7325]) + >>> torch.cosh(a) + tensor([ 1.0133, 1.7860, 1.2536, 1.2805]) + + .. note:: + When :attr:`input` is on the CPU, the implementation of torch.cosh may use + the Sleef library, which rounds very large results to infinity or negative + infinity. See `here `_ for details. + """ + ... +def cosh_(input: Tensor) -> Tensor: ... +def cosine_embedding_loss(input1: Tensor, input2: Tensor, target: Tensor, margin: _float = 0.0, reduction: _int = 1) -> Tensor: ... +def cosine_similarity(x1: Tensor, x2: Tensor, dim: _int = 1, eps: _float = 1e-08) -> Tensor: ... +@overload +def count_nonzero(input: Tensor, dim: Optional[_int] = None) -> Tensor: + r""" + count_nonzero(input, dim=None) -> Tensor + + Counts the number of non-zero values in the tensor :attr:`input` along the given :attr:`dim`. + If no dim is specified then all non-zeros in the tensor are counted. + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints, optional): Dim or tuple of dims along which to count non-zeros. 
+ + Example:: + + >>> x = torch.zeros(3,3) + >>> x[torch.randn(3,3) > 0.5] = 1 + >>> x + tensor([[0., 1., 1.], + [0., 0., 0.], + [0., 0., 1.]]) + >>> torch.count_nonzero(x) + tensor(3) + >>> torch.count_nonzero(x, dim=0) + tensor([0, 1, 2]) + """ + ... +@overload +def count_nonzero(input: Tensor, dim: _size) -> Tensor: + r""" + count_nonzero(input, dim=None) -> Tensor + + Counts the number of non-zero values in the tensor :attr:`input` along the given :attr:`dim`. + If no dim is specified then all non-zeros in the tensor are counted. + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints, optional): Dim or tuple of dims along which to count non-zeros. + + Example:: + + >>> x = torch.zeros(3,3) + >>> x[torch.randn(3,3) > 0.5] = 1 + >>> x + tensor([[0., 1., 1.], + [0., 0., 0.], + [0., 0., 1.]]) + >>> torch.count_nonzero(x) + tensor(3) + >>> torch.count_nonzero(x, dim=0) + tensor([0, 1, 2]) + """ + ... +def cov(input: Tensor, *, correction: _int = 1, fweights: Optional[Tensor] = None, aweights: Optional[Tensor] = None) -> Tensor: + r""" + cov(input, *, correction=1, fweights=None, aweights=None) -> Tensor + + Estimates the covariance matrix of the variables given by the :attr:`input` matrix, where rows are + the variables and columns are the observations. + + A covariance matrix is a square matrix giving the covariance of each pair of variables. The diagonal contains + the variance of each variable (covariance of a variable with itself). By definition, if :attr:`input` represents + a single variable (Scalar or 1D) then its variance is returned. + + The sample covariance of the variables :math:`x` and :math:`y` is given by: + + .. math:: + \text{cov}(x,y) = \frac{\sum^{N}_{i = 1}(x_{i} - \bar{x})(y_{i} - \bar{y})}{\max(0,~N~-~\delta N)} + + where :math:`\bar{x}` and :math:`\bar{y}` are the simple means of the :math:`x` and :math:`y` respectively, and + :math:`\delta N` is the :attr:`correction`. + + If :attr:`fweights` and/or :attr:`aweights` are provided, the weighted covariance + is calculated, which is given by: + + .. math:: + \text{cov}_w(x,y) = \frac{\sum^{N}_{i = 1}w_i(x_{i} - \mu_x^*)(y_{i} - \mu_y^*)} + {\max(0,~\sum^{N}_{i = 1}w_i~-~\frac{\sum^{N}_{i = 1}w_ia_i}{\sum^{N}_{i = 1}w_i}~\delta N)} + + where :math:`w` denotes :attr:`fweights` or :attr:`aweights` (``f`` and ``a`` for brevity) based on whichever is + provided, or :math:`w = f \times a` if both are provided, and + :math:`\mu_x^* = \frac{\sum^{N}_{i = 1}w_ix_{i} }{\sum^{N}_{i = 1}w_i}` is the weighted mean of the variable. If not + provided, ``f`` and/or ``a`` can be seen as a :math:`\mathbb{1}` vector of appropriate size. + + Args: + input (Tensor): A 2D matrix containing multiple variables and observations, or a + Scalar or 1D vector representing a single variable. + + Keyword Args: + correction (int, optional): difference between the sample size and sample degrees of freedom. + Defaults to Bessel's correction, ``correction = 1`` which returns the unbiased estimate, + even if both :attr:`fweights` and :attr:`aweights` are specified. ``correction = 0`` + will return the simple average. Defaults to ``1``. + fweights (tensor, optional): A Scalar or 1D tensor of observation vector frequencies representing the number of + times each observation should be repeated. Its numel must equal the number of columns of :attr:`input`. + Must have integral dtype. Ignored if ``None``. Defaults to ``None``. + aweights (tensor, optional): A Scalar or 1D array of observation vector weights. 
+ These relative weights are typically large for observations considered "important" and smaller for + observations considered less "important". Its numel must equal the number of columns of :attr:`input`. + Must have floating point dtype. Ignored if ``None``. Defaults to ``None``. + + Returns: + (Tensor) The covariance matrix of the variables. + + .. seealso:: + + :func:`torch.corrcoef` normalized covariance matrix. + + Example:: + >>> x = torch.tensor([[0, 2], [1, 1], [2, 0]]).T + >>> x + tensor([[0, 1, 2], + [2, 1, 0]]) + >>> torch.cov(x) + tensor([[ 1., -1.], + [-1., 1.]]) + >>> torch.cov(x, correction=0) + tensor([[ 0.6667, -0.6667], + [-0.6667, 0.6667]]) + >>> fw = torch.randint(1, 10, (3,)) + >>> fw + tensor([1, 6, 9]) + >>> aw = torch.rand(3) + >>> aw + tensor([0.4282, 0.0255, 0.4144]) + >>> torch.cov(x, fweights=fw, aweights=aw) + tensor([[ 0.4169, -0.4169], + [-0.4169, 0.4169]]) + """ + ... +def cross(input: Tensor, other: Tensor, dim: Optional[_int] = None, *, out: Optional[Tensor] = None) -> Tensor: + r""" + cross(input, other, dim=None, *, out=None) -> Tensor + + + Returns the cross product of vectors in dimension :attr:`dim` of :attr:`input` + and :attr:`other`. + + Supports input of float, double, cfloat and cdouble dtypes. Also supports batches + of vectors, for which it computes the product along the dimension :attr:`dim`. + In this case, the output has the same batch dimensions as the inputs. + + .. warning:: + If :attr:`dim` is not given, it defaults to the first dimension found + with the size 3. Note that this might be unexpected. + + This behavior is deprecated and will be changed to match that of :func:`torch.linalg.cross` + in a future release. + + .. seealso:: + :func:`torch.linalg.cross` which has dim=-1 as default. + + + Args: + input (Tensor): the input tensor. + other (Tensor): the second input tensor + dim (int, optional): the dimension to take the cross-product in. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4, 3) + >>> a + tensor([[-0.3956, 1.1455, 1.6895], + [-0.5849, 1.3672, 0.3599], + [-1.1626, 0.7180, -0.0521], + [-0.1339, 0.9902, -2.0225]]) + >>> b = torch.randn(4, 3) + >>> b + tensor([[-0.0257, -1.4725, -1.2251], + [-1.1479, -0.7005, -1.9757], + [-1.3904, 0.3726, -1.1836], + [-0.9688, -0.7153, 0.2159]]) + >>> torch.cross(a, b, dim=1) + tensor([[ 1.0844, -0.5281, 0.6120], + [-2.4490, -1.5687, 1.9792], + [-0.8304, -1.3037, 0.5650], + [-1.2329, 1.9883, 1.0551]]) + >>> torch.cross(a, b) + tensor([[ 1.0844, -0.5281, 0.6120], + [-2.4490, -1.5687, 1.9792], + [-0.8304, -1.3037, 0.5650], + [-1.2329, 1.9883, 1.0551]]) + """ + ... +def crow_indices_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.crow_indices`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +@overload +def ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: _size, target_lengths: _size, blank: _int = 0, reduction: _int = 1, zero_infinity: _bool = False) -> Tensor: ... +@overload +def ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: Tensor, target_lengths: Tensor, blank: _int = 0, reduction: _int = 1, zero_infinity: _bool = False) -> Tensor: ... +def cudnn_affine_grid_generator(theta: Tensor, N: _int, C: _int, H: _int, W: _int) -> Tensor: ... 
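+# Illustrative sketch (hypothetical `_demo_*` helper, not part of the generated stub):
+# checks the relation stated in the corrcoef docstring above,
+# R_ij = C_ij / sqrt(C_ii * C_jj), by deriving the correlation matrix from torch.cov.
+def _demo_cov_corrcoef_relation() -> None:
+    import torch  # repeated so the sketch is self-contained
+    x = torch.randn(2, 6)                     # 2 variables, 6 observations each
+    C = torch.cov(x)                          # unbiased covariance (correction=1 by default)
+    d = torch.sqrt(torch.diag(C))             # per-variable standard deviations
+    R = C / torch.outer(d, d)
+    assert torch.allclose(R, torch.corrcoef(x), atol=1e-6)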
+def cudnn_batch_norm(input: Tensor, weight: Tensor, bias: Optional[Tensor], running_mean: Optional[Tensor], running_var: Optional[Tensor], training: _bool, exponential_average_factor: _float, epsilon: _float) -> Tuple[Tensor, Tensor, Tensor, Tensor]: ... +def cudnn_convolution(input: Tensor, weight: Tensor, padding: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt], benchmark: _bool, deterministic: _bool, allow_tf32: _bool, *, out: Optional[Tensor] = None) -> Tensor: ... +def cudnn_convolution_add_relu(input: Tensor, weight: Tensor, z: Tensor, alpha: Optional[Union[Number, _complex]], bias: Optional[Tensor], stride: Sequence[Union[_int, SymInt]], padding: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ... +def cudnn_convolution_relu(input: Tensor, weight: Tensor, bias: Optional[Tensor], stride: Sequence[Union[_int, SymInt]], padding: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ... +def cudnn_convolution_transpose(input: Tensor, weight: Tensor, padding: Sequence[Union[_int, SymInt]], output_padding: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt], benchmark: _bool, deterministic: _bool, allow_tf32: _bool) -> Tensor: ... +def cudnn_grid_sampler(input: Tensor, grid: Tensor) -> Tensor: ... +def cudnn_is_acceptable(input: Tensor) -> _bool: ... +@overload +def cummax(input: Tensor, dim: _int, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.cummax: + r""" + cummax(input, dim, *, out=None) -> (Tensor, LongTensor) + Returns a namedtuple ``(values, indices)`` where ``values`` is the cumulative maximum of + elements of :attr:`input` in the dimension :attr:`dim`. And ``indices`` is the index + location of each maximum value found in the dimension :attr:`dim`. + + .. math:: + y_i = max(x_1, x_2, x_3, \dots, x_i) + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to do the operation over + + Keyword args: + out (tuple, optional): the result tuple of two output tensors (values, indices) + + Example:: + + >>> a = torch.randn(10) + >>> a + tensor([-0.3449, -1.5447, 0.0685, -1.5104, -1.1706, 0.2259, 1.4696, -1.3284, + 1.9946, -0.8209]) + >>> torch.cummax(a, dim=0) + torch.return_types.cummax( + values=tensor([-0.3449, -0.3449, 0.0685, 0.0685, 0.0685, 0.2259, 1.4696, 1.4696, + 1.9946, 1.9946]), + indices=tensor([0, 0, 2, 2, 2, 5, 6, 6, 8, 8])) + """ + ... +@overload +def cummax(input: Tensor, dim: Union[str, ellipsis, None], *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.cummax: + r""" + cummax(input, dim, *, out=None) -> (Tensor, LongTensor) + Returns a namedtuple ``(values, indices)`` where ``values`` is the cumulative maximum of + elements of :attr:`input` in the dimension :attr:`dim`. And ``indices`` is the index + location of each maximum value found in the dimension :attr:`dim`. + + .. math:: + y_i = max(x_1, x_2, x_3, \dots, x_i) + + Args: + input (Tensor): the input tensor. 
+ dim (int): the dimension to do the operation over + + Keyword args: + out (tuple, optional): the result tuple of two output tensors (values, indices) + + Example:: + + >>> a = torch.randn(10) + >>> a + tensor([-0.3449, -1.5447, 0.0685, -1.5104, -1.1706, 0.2259, 1.4696, -1.3284, + 1.9946, -0.8209]) + >>> torch.cummax(a, dim=0) + torch.return_types.cummax( + values=tensor([-0.3449, -0.3449, 0.0685, 0.0685, 0.0685, 0.2259, 1.4696, 1.4696, + 1.9946, 1.9946]), + indices=tensor([0, 0, 2, 2, 2, 5, 6, 6, 8, 8])) + """ + ... +@overload +def cummin(input: Tensor, dim: _int, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.cummin: + r""" + cummin(input, dim, *, out=None) -> (Tensor, LongTensor) + Returns a namedtuple ``(values, indices)`` where ``values`` is the cumulative minimum of + elements of :attr:`input` in the dimension :attr:`dim`. And ``indices`` is the index + location of each maximum value found in the dimension :attr:`dim`. + + .. math:: + y_i = min(x_1, x_2, x_3, \dots, x_i) + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to do the operation over + + Keyword args: + out (tuple, optional): the result tuple of two output tensors (values, indices) + + Example:: + + >>> a = torch.randn(10) + >>> a + tensor([-0.2284, -0.6628, 0.0975, 0.2680, -1.3298, -0.4220, -0.3885, 1.1762, + 0.9165, 1.6684]) + >>> torch.cummin(a, dim=0) + torch.return_types.cummin( + values=tensor([-0.2284, -0.6628, -0.6628, -0.6628, -1.3298, -1.3298, -1.3298, -1.3298, + -1.3298, -1.3298]), + indices=tensor([0, 1, 1, 1, 4, 4, 4, 4, 4, 4])) + """ + ... +@overload +def cummin(input: Tensor, dim: Union[str, ellipsis, None], *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.cummin: + r""" + cummin(input, dim, *, out=None) -> (Tensor, LongTensor) + Returns a namedtuple ``(values, indices)`` where ``values`` is the cumulative minimum of + elements of :attr:`input` in the dimension :attr:`dim`. And ``indices`` is the index + location of each maximum value found in the dimension :attr:`dim`. + + .. math:: + y_i = min(x_1, x_2, x_3, \dots, x_i) + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to do the operation over + + Keyword args: + out (tuple, optional): the result tuple of two output tensors (values, indices) + + Example:: + + >>> a = torch.randn(10) + >>> a + tensor([-0.2284, -0.6628, 0.0975, 0.2680, -1.3298, -0.4220, -0.3885, 1.1762, + 0.9165, 1.6684]) + >>> torch.cummin(a, dim=0) + torch.return_types.cummin( + values=tensor([-0.2284, -0.6628, -0.6628, -0.6628, -1.3298, -1.3298, -1.3298, -1.3298, + -1.3298, -1.3298]), + indices=tensor([0, 1, 1, 1, 4, 4, 4, 4, 4, 4])) + """ + ... +@overload +def cumprod(input: Tensor, dim: _int, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + cumprod(input, dim, *, dtype=None, out=None) -> Tensor + + Returns the cumulative product of elements of :attr:`input` in the dimension + :attr:`dim`. + + For example, if :attr:`input` is a vector of size N, the result will also be + a vector of size N, with elements. + + .. math:: + y_i = x_1 \times x_2\times x_3\times \dots \times x_i + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to do the operation over + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. 
Default: None. + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(10) + >>> a + tensor([ 0.6001, 0.2069, -0.1919, 0.9792, 0.6727, 1.0062, 0.4126, + -0.2129, -0.4206, 0.1968]) + >>> torch.cumprod(a, dim=0) + tensor([ 0.6001, 0.1241, -0.0238, -0.0233, -0.0157, -0.0158, -0.0065, + 0.0014, -0.0006, -0.0001]) + + >>> a[5] = 0.0 + >>> torch.cumprod(a, dim=0) + tensor([ 0.6001, 0.1241, -0.0238, -0.0233, -0.0157, -0.0000, -0.0000, + 0.0000, -0.0000, -0.0000]) + """ + ... +@overload +def cumprod(input: Tensor, dim: Union[str, ellipsis, None], *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + cumprod(input, dim, *, dtype=None, out=None) -> Tensor + + Returns the cumulative product of elements of :attr:`input` in the dimension + :attr:`dim`. + + For example, if :attr:`input` is a vector of size N, the result will also be + a vector of size N, with elements. + + .. math:: + y_i = x_1 \times x_2\times x_3\times \dots \times x_i + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to do the operation over + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(10) + >>> a + tensor([ 0.6001, 0.2069, -0.1919, 0.9792, 0.6727, 1.0062, 0.4126, + -0.2129, -0.4206, 0.1968]) + >>> torch.cumprod(a, dim=0) + tensor([ 0.6001, 0.1241, -0.0238, -0.0233, -0.0157, -0.0158, -0.0065, + 0.0014, -0.0006, -0.0001]) + + >>> a[5] = 0.0 + >>> torch.cumprod(a, dim=0) + tensor([ 0.6001, 0.1241, -0.0238, -0.0233, -0.0157, -0.0000, -0.0000, + 0.0000, -0.0000, -0.0000]) + """ + ... +@overload +def cumsum(input: Tensor, dim: _int, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + cumsum(input, dim, *, dtype=None, out=None) -> Tensor + + Returns the cumulative sum of elements of :attr:`input` in the dimension + :attr:`dim`. + + For example, if :attr:`input` is a vector of size N, the result will also be + a vector of size N, with elements. + + .. math:: + y_i = x_1 + x_2 + x_3 + \dots + x_i + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to do the operation over + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randint(1, 20, (10,)) + >>> a + tensor([13, 7, 3, 10, 13, 3, 15, 10, 9, 10]) + >>> torch.cumsum(a, dim=0) + tensor([13, 20, 23, 33, 46, 49, 64, 74, 83, 93]) + """ + ... +@overload +def cumsum(input: Tensor, dim: Union[str, ellipsis, None], *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + cumsum(input, dim, *, dtype=None, out=None) -> Tensor + + Returns the cumulative sum of elements of :attr:`input` in the dimension + :attr:`dim`. + + For example, if :attr:`input` is a vector of size N, the result will also be + a vector of size N, with elements. + + .. math:: + y_i = x_1 + x_2 + x_3 + \dots + x_i + + Args: + input (Tensor): the input tensor. 
+ dim (int): the dimension to do the operation over + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randint(1, 20, (10,)) + >>> a + tensor([13, 7, 3, 10, 13, 3, 15, 10, 9, 10]) + >>> torch.cumsum(a, dim=0) + tensor([13, 20, 23, 33, 46, 49, 64, 74, 83, 93]) + """ + ... +@overload +def cumulative_trapezoid(y: Tensor, x: Tensor, *, dim: _int = -1) -> Tensor: + r""" + cumulative_trapezoid(y, x=None, *, dx=None, dim=-1) -> Tensor + + Cumulatively computes the `trapezoidal rule `_ + along :attr:`dim`. By default the spacing between elements is assumed to be 1, but + :attr:`dx` can be used to specify a different constant spacing, and :attr:`x` can be + used to specify arbitrary spacing along :attr:`dim`. + + For more details, please read :func:`torch.trapezoid`. The difference between :func:`torch.trapezoid` + and this function is that, :func:`torch.trapezoid` returns a value for each integration, + where as this function returns a cumulative value for every spacing within the integration. This + is analogous to how `.sum` returns a value and `.cumsum` returns a cumulative sum. + + Arguments: + y (Tensor): Values to use when computing the trapezoidal rule. + x (Tensor): If specified, defines spacing between values as specified above. + + Keyword arguments: + dx (float): constant spacing between values. If neither :attr:`x` or :attr:`dx` + are specified then this defaults to 1. Effectively multiplies the result by its value. + dim (int): The dimension along which to compute the trapezoidal rule. + The last (inner-most) dimension by default. + + Examples:: + + >>> # Cumulatively computes the trapezoidal rule in 1D, spacing is implicitly 1. 
+ >>> y = torch.tensor([1, 5, 10]) + >>> torch.cumulative_trapezoid(y) + tensor([3., 10.5]) + + >>> # Computes the same trapezoidal rule directly up to each element to verify + >>> (1 + 5) / 2 + 3.0 + >>> (1 + 10 + 10) / 2 + 10.5 + + >>> # Cumulatively computes the trapezoidal rule in 1D with constant spacing of 2 + >>> # NOTE: the result is the same as before, but multiplied by 2 + >>> torch.cumulative_trapezoid(y, dx=2) + tensor([6., 21.]) + + >>> # Cumulatively computes the trapezoidal rule in 1D with arbitrary spacing + >>> x = torch.tensor([1, 3, 6]) + >>> torch.cumulative_trapezoid(y, x) + tensor([6., 28.5]) + + >>> # Computes the same trapezoidal rule directly up to each element to verify + >>> ((3 - 1) * (1 + 5)) / 2 + 6.0 + >>> ((3 - 1) * (1 + 5) + (6 - 3) * (5 + 10)) / 2 + 28.5 + + >>> # Cumulatively computes the trapezoidal rule for each row of a 3x3 matrix + >>> y = torch.arange(9).reshape(3, 3) + tensor([[0, 1, 2], + [3, 4, 5], + [6, 7, 8]]) + >>> torch.cumulative_trapezoid(y) + tensor([[ 0.5, 2.], + [ 3.5, 8.], + [ 6.5, 14.]]) + + >>> # Cumulatively computes the trapezoidal rule for each column of the matrix + >>> torch.cumulative_trapezoid(y, dim=0) + tensor([[ 1.5, 2.5, 3.5], + [ 6.0, 8.0, 10.0]]) + + >>> # Cumulatively computes the trapezoidal rule for each row of a 3x3 ones matrix + >>> # with the same arbitrary spacing + >>> y = torch.ones(3, 3) + >>> x = torch.tensor([1, 3, 6]) + >>> torch.cumulative_trapezoid(y, x) + tensor([[2., 5.], + [2., 5.], + [2., 5.]]) + + >>> # Cumulatively computes the trapezoidal rule for each row of a 3x3 ones matrix + >>> # with different arbitrary spacing per row + >>> y = torch.ones(3, 3) + >>> x = torch.tensor([[1, 2, 3], [1, 3, 5], [1, 4, 7]]) + >>> torch.cumulative_trapezoid(y, x) + tensor([[1., 2.], + [2., 4.], + [3., 6.]]) + """ + ... +@overload +def cumulative_trapezoid(y: Tensor, *, dx: Union[Number, _complex] = 1, dim: _int = -1) -> Tensor: + r""" + cumulative_trapezoid(y, x=None, *, dx=None, dim=-1) -> Tensor + + Cumulatively computes the `trapezoidal rule `_ + along :attr:`dim`. By default the spacing between elements is assumed to be 1, but + :attr:`dx` can be used to specify a different constant spacing, and :attr:`x` can be + used to specify arbitrary spacing along :attr:`dim`. + + For more details, please read :func:`torch.trapezoid`. The difference between :func:`torch.trapezoid` + and this function is that, :func:`torch.trapezoid` returns a value for each integration, + where as this function returns a cumulative value for every spacing within the integration. This + is analogous to how `.sum` returns a value and `.cumsum` returns a cumulative sum. + + Arguments: + y (Tensor): Values to use when computing the trapezoidal rule. + x (Tensor): If specified, defines spacing between values as specified above. + + Keyword arguments: + dx (float): constant spacing between values. If neither :attr:`x` or :attr:`dx` + are specified then this defaults to 1. Effectively multiplies the result by its value. + dim (int): The dimension along which to compute the trapezoidal rule. + The last (inner-most) dimension by default. + + Examples:: + + >>> # Cumulatively computes the trapezoidal rule in 1D, spacing is implicitly 1. 
+ >>> y = torch.tensor([1, 5, 10]) + >>> torch.cumulative_trapezoid(y) + tensor([3., 10.5]) + + >>> # Computes the same trapezoidal rule directly up to each element to verify + >>> (1 + 5) / 2 + 3.0 + >>> (1 + 10 + 10) / 2 + 10.5 + + >>> # Cumulatively computes the trapezoidal rule in 1D with constant spacing of 2 + >>> # NOTE: the result is the same as before, but multiplied by 2 + >>> torch.cumulative_trapezoid(y, dx=2) + tensor([6., 21.]) + + >>> # Cumulatively computes the trapezoidal rule in 1D with arbitrary spacing + >>> x = torch.tensor([1, 3, 6]) + >>> torch.cumulative_trapezoid(y, x) + tensor([6., 28.5]) + + >>> # Computes the same trapezoidal rule directly up to each element to verify + >>> ((3 - 1) * (1 + 5)) / 2 + 6.0 + >>> ((3 - 1) * (1 + 5) + (6 - 3) * (5 + 10)) / 2 + 28.5 + + >>> # Cumulatively computes the trapezoidal rule for each row of a 3x3 matrix + >>> y = torch.arange(9).reshape(3, 3) + tensor([[0, 1, 2], + [3, 4, 5], + [6, 7, 8]]) + >>> torch.cumulative_trapezoid(y) + tensor([[ 0.5, 2.], + [ 3.5, 8.], + [ 6.5, 14.]]) + + >>> # Cumulatively computes the trapezoidal rule for each column of the matrix + >>> torch.cumulative_trapezoid(y, dim=0) + tensor([[ 1.5, 2.5, 3.5], + [ 6.0, 8.0, 10.0]]) + + >>> # Cumulatively computes the trapezoidal rule for each row of a 3x3 ones matrix + >>> # with the same arbitrary spacing + >>> y = torch.ones(3, 3) + >>> x = torch.tensor([1, 3, 6]) + >>> torch.cumulative_trapezoid(y, x) + tensor([[2., 5.], + [2., 5.], + [2., 5.]]) + + >>> # Cumulatively computes the trapezoidal rule for each row of a 3x3 ones matrix + >>> # with different arbitrary spacing per row + >>> y = torch.ones(3, 3) + >>> x = torch.tensor([[1, 2, 3], [1, 3, 5], [1, 4, 7]]) + >>> torch.cumulative_trapezoid(y, x) + tensor([[1., 2.], + [2., 4.], + [3., 6.]]) + """ + ... +def deg2rad(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + deg2rad(input, *, out=None) -> Tensor + + Returns a new tensor with each of the elements of :attr:`input` + converted from angles in degrees to radians. + + Args: + input (Tensor): the input tensor. + + Keyword arguments: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([[180.0, -180.0], [360.0, -360.0], [90.0, -90.0]]) + >>> torch.deg2rad(a) + tensor([[ 3.1416, -3.1416], + [ 6.2832, -6.2832], + [ 1.5708, -1.5708]]) + """ + ... +def deg2rad_(input: Tensor) -> Tensor: ... +@overload +def dequantize(input: Tensor) -> Tensor: + r""" + dequantize(tensor) -> Tensor + + Returns an fp32 Tensor by dequantizing a quantized Tensor + + Args: + tensor (Tensor): A quantized Tensor + + .. function:: dequantize(tensors) -> sequence of Tensors + :noindex: + + Given a list of quantized Tensors, dequantize them and return a list of fp32 Tensors + + Args: + tensors (sequence of Tensors): A list of quantized Tensors + """ + ... +@overload +def dequantize(tensors: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + dequantize(tensor) -> Tensor + + Returns an fp32 Tensor by dequantizing a quantized Tensor + + Args: + tensor (Tensor): A quantized Tensor + + .. function:: dequantize(tensors) -> sequence of Tensors + :noindex: + + Given a list of quantized Tensors, dequantize them and return a list of fp32 Tensors + + Args: + tensors (sequence of Tensors): A list of quantized Tensors + """ + ... +def det(input: Tensor) -> Tensor: + r""" + det(input) -> Tensor + + Alias for :func:`torch.linalg.det` + """ + ... +def detach(input: Tensor) -> Tensor: ... +def detach_(input: Tensor) -> Tensor: ... 
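+# Illustrative sketch (hypothetical `_demo_*` helper, not part of the generated stub):
+# contrasts torch.clone, which keeps an autograd connection to its input, with
+# torch.detach, which returns a storage-sharing view cut off from autograd, as the
+# clone docstring earlier in this file points out.
+def _demo_clone_vs_detach() -> None:
+    import torch  # repeated so the sketch is self-contained
+    x = torch.randn(3, requires_grad=True)
+    c = torch.clone(x)                        # differentiable copy
+    d = torch.detach(x)                       # non-differentiable view of the same storage
+    assert c.requires_grad and not d.requires_grad
+    assert d.data_ptr() == x.data_ptr()       # detach shares memory with its input
+    c.sum().backward()                        # gradients flow back through the clone
+    assert torch.equal(x.grad, torch.ones(3))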
+def detach_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.detach`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def diag(input: Tensor, diagonal: _int = 0, *, out: Optional[Tensor] = None) -> Tensor: + r""" + diag(input, diagonal=0, *, out=None) -> Tensor + + - If :attr:`input` is a vector (1-D tensor), then returns a 2-D square tensor + with the elements of :attr:`input` as the diagonal. + - If :attr:`input` is a matrix (2-D tensor), then returns a 1-D tensor with + the diagonal elements of :attr:`input`. + + The argument :attr:`diagonal` controls which diagonal to consider: + + - If :attr:`diagonal` = 0, it is the main diagonal. + - If :attr:`diagonal` > 0, it is above the main diagonal. + - If :attr:`diagonal` < 0, it is below the main diagonal. + + Args: + input (Tensor): the input tensor. + diagonal (int, optional): the diagonal to consider + + Keyword args: + out (Tensor, optional): the output tensor. + + .. seealso:: + + :func:`torch.diagonal` always returns the diagonal of its input. + + :func:`torch.diagflat` always constructs a tensor with diagonal elements + specified by the input. + + Examples: + + Get the square matrix where the input vector is the diagonal:: + + >>> a = torch.randn(3) + >>> a + tensor([ 0.5950,-0.0872, 2.3298]) + >>> torch.diag(a) + tensor([[ 0.5950, 0.0000, 0.0000], + [ 0.0000,-0.0872, 0.0000], + [ 0.0000, 0.0000, 2.3298]]) + >>> torch.diag(a, 1) + tensor([[ 0.0000, 0.5950, 0.0000, 0.0000], + [ 0.0000, 0.0000,-0.0872, 0.0000], + [ 0.0000, 0.0000, 0.0000, 2.3298], + [ 0.0000, 0.0000, 0.0000, 0.0000]]) + + Get the k-th diagonal of a given matrix:: + + >>> a = torch.randn(3, 3) + >>> a + tensor([[-0.4264, 0.0255,-0.1064], + [ 0.8795,-0.2429, 0.1374], + [ 0.1029,-0.6482,-1.6300]]) + >>> torch.diag(a, 0) + tensor([-0.4264,-0.2429,-1.6300]) + >>> torch.diag(a, 1) + tensor([ 0.0255, 0.1374]) + """ + ... +def diag_embed(input: Tensor, offset: _int = 0, dim1: _int = -2, dim2: _int = -1) -> Tensor: + r""" + diag_embed(input, offset=0, dim1=-2, dim2=-1) -> Tensor + + Creates a tensor whose diagonals of certain 2D planes (specified by + :attr:`dim1` and :attr:`dim2`) are filled by :attr:`input`. + To facilitate creating batched diagonal matrices, the 2D planes formed by + the last two dimensions of the returned tensor are chosen by default. + + The argument :attr:`offset` controls which diagonal to consider: + + - If :attr:`offset` = 0, it is the main diagonal. + - If :attr:`offset` > 0, it is above the main diagonal. + - If :attr:`offset` < 0, it is below the main diagonal. + + The size of the new matrix will be calculated to make the specified diagonal + of the size of the last input dimension. + Note that for :attr:`offset` other than :math:`0`, the order of :attr:`dim1` + and :attr:`dim2` matters. Exchanging them is equivalent to changing the + sign of :attr:`offset`. + + Applying :meth:`torch.diagonal` to the output of this function with + the same arguments yields a matrix identical to input. However, + :meth:`torch.diagonal` has different default dimensions, so those + need to be explicitly specified. + + Args: + input (Tensor): the input tensor. Must be at least 1-dimensional. + offset (int, optional): which diagonal to consider. Default: 0 + (main diagonal). + dim1 (int, optional): first dimension with respect to which to + take diagonal. Default: -2. + dim2 (int, optional): second dimension with respect to which to + take diagonal. Default: -1. 
+ + Example:: + + >>> a = torch.randn(2, 3) + >>> torch.diag_embed(a) + tensor([[[ 1.5410, 0.0000, 0.0000], + [ 0.0000, -0.2934, 0.0000], + [ 0.0000, 0.0000, -2.1788]], + + [[ 0.5684, 0.0000, 0.0000], + [ 0.0000, -1.0845, 0.0000], + [ 0.0000, 0.0000, -1.3986]]]) + + >>> torch.diag_embed(a, offset=1, dim1=0, dim2=2) + tensor([[[ 0.0000, 1.5410, 0.0000, 0.0000], + [ 0.0000, 0.5684, 0.0000, 0.0000]], + + [[ 0.0000, 0.0000, -0.2934, 0.0000], + [ 0.0000, 0.0000, -1.0845, 0.0000]], + + [[ 0.0000, 0.0000, 0.0000, -2.1788], + [ 0.0000, 0.0000, 0.0000, -1.3986]], + + [[ 0.0000, 0.0000, 0.0000, 0.0000], + [ 0.0000, 0.0000, 0.0000, 0.0000]]]) + """ + ... +def diagflat(input: Tensor, offset: _int = 0) -> Tensor: + r""" + diagflat(input, offset=0) -> Tensor + + - If :attr:`input` is a vector (1-D tensor), then returns a 2-D square tensor + with the elements of :attr:`input` as the diagonal. + - If :attr:`input` is a tensor with more than one dimension, then returns a + 2-D tensor with diagonal elements equal to a flattened :attr:`input`. + + The argument :attr:`offset` controls which diagonal to consider: + + - If :attr:`offset` = 0, it is the main diagonal. + - If :attr:`offset` > 0, it is above the main diagonal. + - If :attr:`offset` < 0, it is below the main diagonal. + + Args: + input (Tensor): the input tensor. + offset (int, optional): the diagonal to consider. Default: 0 (main + diagonal). + + Examples:: + + >>> a = torch.randn(3) + >>> a + tensor([-0.2956, -0.9068, 0.1695]) + >>> torch.diagflat(a) + tensor([[-0.2956, 0.0000, 0.0000], + [ 0.0000, -0.9068, 0.0000], + [ 0.0000, 0.0000, 0.1695]]) + >>> torch.diagflat(a, 1) + tensor([[ 0.0000, -0.2956, 0.0000, 0.0000], + [ 0.0000, 0.0000, -0.9068, 0.0000], + [ 0.0000, 0.0000, 0.0000, 0.1695], + [ 0.0000, 0.0000, 0.0000, 0.0000]]) + + >>> a = torch.randn(2, 2) + >>> a + tensor([[ 0.2094, -0.3018], + [-0.1516, 1.9342]]) + >>> torch.diagflat(a) + tensor([[ 0.2094, 0.0000, 0.0000, 0.0000], + [ 0.0000, -0.3018, 0.0000, 0.0000], + [ 0.0000, 0.0000, -0.1516, 0.0000], + [ 0.0000, 0.0000, 0.0000, 1.9342]]) + """ + ... +@overload +def diagonal(input: Tensor, offset: _int = 0, dim1: _int = 0, dim2: _int = 1) -> Tensor: + r""" + diagonal(input, offset=0, dim1=0, dim2=1) -> Tensor + + Returns a partial view of :attr:`input` with the its diagonal elements + with respect to :attr:`dim1` and :attr:`dim2` appended as a dimension + at the end of the shape. + + The argument :attr:`offset` controls which diagonal to consider: + + - If :attr:`offset` = 0, it is the main diagonal. + - If :attr:`offset` > 0, it is above the main diagonal. + - If :attr:`offset` < 0, it is below the main diagonal. + + Applying :meth:`torch.diag_embed` to the output of this function with + the same arguments yields a diagonal matrix with the diagonal entries + of the input. However, :meth:`torch.diag_embed` has different default + dimensions, so those need to be explicitly specified. + + Args: + input (Tensor): the input tensor. Must be at least 2-dimensional. + offset (int, optional): which diagonal to consider. Default: 0 + (main diagonal). + dim1 (int, optional): first dimension with respect to which to + take diagonal. Default: 0. + dim2 (int, optional): second dimension with respect to which to + take diagonal. Default: 1. + + .. note:: To take a batch diagonal, pass in dim1=-2, dim2=-1. 
+ + Examples:: + + >>> a = torch.randn(3, 3) + >>> a + tensor([[-1.0854, 1.1431, -0.1752], + [ 0.8536, -0.0905, 0.0360], + [ 0.6927, -0.3735, -0.4945]]) + + + >>> torch.diagonal(a, 0) + tensor([-1.0854, -0.0905, -0.4945]) + + + >>> torch.diagonal(a, 1) + tensor([ 1.1431, 0.0360]) + + + >>> x = torch.randn(2, 5, 4, 2) + >>> torch.diagonal(x, offset=-1, dim1=1, dim2=2) + tensor([[[-1.2631, 0.3755, -1.5977, -1.8172], + [-1.1065, 1.0401, -0.2235, -0.7938]], + + [[-1.7325, -0.3081, 0.6166, 0.2335], + [ 1.0500, 0.7336, -0.3836, -1.1015]]]) + """ + ... +@overload +def diagonal(input: Tensor, *, outdim: Union[str, ellipsis, None], dim1: Union[str, ellipsis, None], dim2: Union[str, ellipsis, None], offset: _int = 0) -> Tensor: + r""" + diagonal(input, offset=0, dim1=0, dim2=1) -> Tensor + + Returns a partial view of :attr:`input` with the its diagonal elements + with respect to :attr:`dim1` and :attr:`dim2` appended as a dimension + at the end of the shape. + + The argument :attr:`offset` controls which diagonal to consider: + + - If :attr:`offset` = 0, it is the main diagonal. + - If :attr:`offset` > 0, it is above the main diagonal. + - If :attr:`offset` < 0, it is below the main diagonal. + + Applying :meth:`torch.diag_embed` to the output of this function with + the same arguments yields a diagonal matrix with the diagonal entries + of the input. However, :meth:`torch.diag_embed` has different default + dimensions, so those need to be explicitly specified. + + Args: + input (Tensor): the input tensor. Must be at least 2-dimensional. + offset (int, optional): which diagonal to consider. Default: 0 + (main diagonal). + dim1 (int, optional): first dimension with respect to which to + take diagonal. Default: 0. + dim2 (int, optional): second dimension with respect to which to + take diagonal. Default: 1. + + .. note:: To take a batch diagonal, pass in dim1=-2, dim2=-1. + + Examples:: + + >>> a = torch.randn(3, 3) + >>> a + tensor([[-1.0854, 1.1431, -0.1752], + [ 0.8536, -0.0905, 0.0360], + [ 0.6927, -0.3735, -0.4945]]) + + + >>> torch.diagonal(a, 0) + tensor([-1.0854, -0.0905, -0.4945]) + + + >>> torch.diagonal(a, 1) + tensor([ 1.1431, 0.0360]) + + + >>> x = torch.randn(2, 5, 4, 2) + >>> torch.diagonal(x, offset=-1, dim1=1, dim2=2) + tensor([[[-1.2631, 0.3755, -1.5977, -1.8172], + [-1.1065, 1.0401, -0.2235, -0.7938]], + + [[-1.7325, -0.3081, 0.6166, 0.2335], + [ 1.0500, 0.7336, -0.3836, -1.1015]]]) + """ + ... +def diagonal_copy(input: Tensor, offset: _int = 0, dim1: _int = 0, dim2: _int = 1, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.diagonal`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def diagonal_scatter(input: Tensor, src: Tensor, offset: _int = 0, dim1: _int = 0, dim2: _int = 1) -> Tensor: + r""" + diagonal_scatter(input, src, offset=0, dim1=0, dim2=1) -> Tensor + + Embeds the values of the :attr:`src` tensor into :attr:`input` along + the diagonal elements of :attr:`input`, with respect to :attr:`dim1` + and :attr:`dim2`. + + This function returns a tensor with fresh storage; it does not + return a view. + + The argument :attr:`offset` controls which diagonal to consider: + + - If :attr:`offset` = 0, it is the main diagonal. + - If :attr:`offset` > 0, it is above the main diagonal. + - If :attr:`offset` < 0, it is below the main diagonal. + + Args: + input (Tensor): the input tensor. Must be at least 2-dimensional. + src (Tensor): the tensor to embed into :attr:`input`. 
+ offset (int, optional): which diagonal to consider. Default: 0 + (main diagonal). + dim1 (int, optional): first dimension with respect to which to + take diagonal. Default: 0. + dim2 (int, optional): second dimension with respect to which to + take diagonal. Default: 1. + + .. note:: + + :attr:`src` must be of the proper size in order to be embedded + into :attr:`input`. Specifically, it should have the same shape as + ``torch.diagonal(input, offset, dim1, dim2)`` + + Examples:: + + >>> a = torch.zeros(3, 3) + >>> a + tensor([[0., 0., 0.], + [0., 0., 0.], + [0., 0., 0.]]) + + >>> torch.diagonal_scatter(a, torch.ones(3), 0) + tensor([[1., 0., 0.], + [0., 1., 0.], + [0., 0., 1.]]) + + >>> torch.diagonal_scatter(a, torch.ones(2), 1) + tensor([[0., 1., 0.], + [0., 0., 1.], + [0., 0., 0.]]) + """ + ... +def diff(input: Tensor, n: _int = 1, dim: _int = -1, prepend: Optional[Tensor] = None, append: Optional[Tensor] = None, *, out: Optional[Tensor] = None) -> Tensor: + r""" + diff(input, n=1, dim=-1, prepend=None, append=None) -> Tensor + + Computes the n-th forward difference along the given dimension. + + The first-order differences are given by `out[i] = input[i + 1] - input[i]`. Higher-order + differences are calculated by using :func:`torch.diff` recursively. + + Args: + input (Tensor): the tensor to compute the differences on + n (int, optional): the number of times to recursively compute the difference + dim (int, optional): the dimension to compute the difference along. + Default is the last dimension. + prepend, append (Tensor, optional): values to prepend or append to + :attr:`input` along :attr:`dim` before computing the difference. + Their dimensions must be equivalent to that of input, and their shapes + must match input's shape except on :attr:`dim`. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([1, 3, 2]) + >>> torch.diff(a) + tensor([ 2, -1]) + >>> b = torch.tensor([4, 5]) + >>> torch.diff(a, append=b) + tensor([ 2, -1, 2, 1]) + >>> c = torch.tensor([[1, 2, 3], [3, 4, 5]]) + >>> torch.diff(c, dim=0) + tensor([[2, 2, 2]]) + >>> torch.diff(c, dim=1) + tensor([[1, 1], + [1, 1]]) + """ + ... +def digamma(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + digamma(input, *, out=None) -> Tensor + + Alias for :func:`torch.special.digamma`. + """ + ... +def dist(input: Tensor, other: Tensor, p: Union[Number, _complex] = 2) -> Tensor: + r""" + dist(input, other, p=2) -> Tensor + + Returns the p-norm of (:attr:`input` - :attr:`other`) + + The shapes of :attr:`input` and :attr:`other` must be + :ref:`broadcastable `. + + Args: + input (Tensor): the input tensor. + other (Tensor): the Right-hand-side input tensor + p (float, optional): the norm to be computed + + Example:: + + >>> x = torch.randn(4) + >>> x + tensor([-1.5393, -0.8675, 0.5916, 1.6321]) + >>> y = torch.randn(4) + >>> y + tensor([ 0.0967, -1.0511, 0.6295, 0.8360]) + >>> torch.dist(x, y, 3.5) + tensor(1.6727) + >>> torch.dist(x, y, 3) + tensor(1.6973) + >>> torch.dist(x, y, 0) + tensor(4.) + >>> torch.dist(x, y, 1) + tensor(2.6537) + """ + ... +def div(input: Union[Tensor, Number], other: Union[Tensor, Number], *, rounding_mode: Optional[str] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + div(input, other, *, rounding_mode=None, out=None) -> Tensor + + Divides each element of the input ``input`` by the corresponding element of + :attr:`other`. + + .. math:: + \text{out}_i = \frac{\text{input}_i}{\text{other}_i} + + .. 
note:: + By default, this performs a "true" division like Python 3. + See the :attr:`rounding_mode` argument for floor division. + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer, float, and complex inputs. + Always promotes integer types to the default scalar type. + + Args: + input (Tensor): the dividend + other (Tensor or Number): the divisor + + Keyword args: + rounding_mode (str, optional): Type of rounding applied to the result: + + * None - default behavior. Performs no rounding and, if both :attr:`input` and + :attr:`other` are integer types, promotes the inputs to the default scalar type. + Equivalent to true division in Python (the ``/`` operator) and NumPy's ``np.true_divide``. + * ``"trunc"`` - rounds the results of the division towards zero. + Equivalent to C-style integer division. + * ``"floor"`` - rounds the results of the division down. + Equivalent to floor division in Python (the ``//`` operator) and NumPy's ``np.floor_divide``. + + out (Tensor, optional): the output tensor. + + Examples:: + + >>> x = torch.tensor([ 0.3810, 1.2774, -0.2972, -0.3719, 0.4637]) + >>> torch.div(x, 0.5) + tensor([ 0.7620, 2.5548, -0.5944, -0.7438, 0.9274]) + + >>> a = torch.tensor([[-0.3711, -1.9353, -0.4605, -0.2917], + ... [ 0.1815, -1.0111, 0.9805, -1.5923], + ... [ 0.1062, 1.4581, 0.7759, -1.2344], + ... [-0.1830, -0.0313, 1.1908, -1.4757]]) + >>> b = torch.tensor([ 0.8032, 0.2930, -0.8113, -0.2308]) + >>> torch.div(a, b) + tensor([[-0.4620, -6.6051, 0.5676, 1.2639], + [ 0.2260, -3.4509, -1.2086, 6.8990], + [ 0.1322, 4.9764, -0.9564, 5.3484], + [-0.2278, -0.1068, -1.4678, 6.3938]]) + + >>> torch.div(a, b, rounding_mode='trunc') + tensor([[-0., -6., 0., 1.], + [ 0., -3., -1., 6.], + [ 0., 4., -0., 5.], + [-0., -0., -1., 6.]]) + + >>> torch.div(a, b, rounding_mode='floor') + tensor([[-1., -7., 0., 1.], + [ 0., -4., -2., 6.], + [ 0., 4., -1., 5.], + [-1., -1., -2., 6.]]) + """ + ... +@overload +def divide(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + divide(input, other, *, rounding_mode=None, out=None) -> Tensor + + Alias for :func:`torch.div`. + """ + ... +@overload +def divide(input: Tensor, other: Tensor, *, rounding_mode: Optional[str], out: Optional[Tensor] = None) -> Tensor: + r""" + divide(input, other, *, rounding_mode=None, out=None) -> Tensor + + Alias for :func:`torch.div`. + """ + ... +@overload +def divide(input: Tensor, other: Union[Number, _complex], *, rounding_mode: Optional[str]) -> Tensor: + r""" + divide(input, other, *, rounding_mode=None, out=None) -> Tensor + + Alias for :func:`torch.div`. + """ + ... +@overload +def divide(input: Tensor, other: Union[Number, _complex]) -> Tensor: + r""" + divide(input, other, *, rounding_mode=None, out=None) -> Tensor + + Alias for :func:`torch.div`. + """ + ... +def dot(input: Tensor, tensor: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + dot(input, other, *, out=None) -> Tensor + + Computes the dot product of two 1D tensors. + + .. note:: + + Unlike NumPy's dot, torch.dot intentionally only supports computing the dot product + of two 1D tensors with the same number of elements. + + Args: + input (Tensor): first tensor in the dot product, must be 1D. + other (Tensor): second tensor in the dot product, must be 1D. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.dot(torch.tensor([2, 3]), torch.tensor([2, 1])) + tensor(7) + """ + ... +def dropout(input: Tensor, p: _float, train: _bool) -> Tensor: ... 
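As a small, hedged illustration of the stubs above (assuming a standard torch install; the values are purely illustrative), the sketch below checks the diag/diagonal/diag_embed round-trip and the three rounding behaviours of torch.div described in the generated docstrings.

    import torch

    # diag: 1-D input -> diagonal matrix, 2-D input -> its diagonal
    v = torch.arange(1.0, 4.0)                    # tensor([1., 2., 3.])
    m = torch.diag(v)                             # 3x3 matrix with v on the main diagonal
    assert torch.equal(torch.diag(m), v)          # extracting the diagonal recovers v

    # diag_embed inverts diagonal when the same offset/dim1/dim2 are passed explicitly
    d = torch.diagonal(m, offset=0, dim1=0, dim2=1)
    assert torch.equal(torch.diag_embed(d, offset=0, dim1=0, dim2=1), m)

    # div: true division by default, "trunc" rounds toward zero, "floor" rounds down
    a, b = torch.tensor([7.0, -7.0]), torch.tensor([2.0, 2.0])
    print(torch.div(a, b))                           # tensor([ 3.5000, -3.5000])
    print(torch.div(a, b, rounding_mode="trunc"))    # tensor([ 3., -3.])
    print(torch.div(a, b, rounding_mode="floor"))    # tensor([ 3., -4.])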
+def dropout_(input: Tensor, p: _float, train: _bool) -> Tensor: ... +def dsmm(input: Tensor, mat2: Tensor) -> Tensor: ... +@overload +def dsplit(input: Tensor, sections: _int) -> Tuple[Tensor, ...]: + r""" + dsplit(input, indices_or_sections) -> List of Tensors + + Splits :attr:`input`, a tensor with three or more dimensions, into multiple tensors + depthwise according to :attr:`indices_or_sections`. Each split is a view of + :attr:`input`. + + This is equivalent to calling torch.tensor_split(input, indices_or_sections, dim=2) + (the split dimension is 2), except that if :attr:`indices_or_sections` is an integer + it must evenly divide the split dimension or a runtime error will be thrown. + + This function is based on NumPy's :func:`numpy.dsplit`. + + Args: + input (Tensor): tensor to split. + indices_or_sections (int or list or tuple of ints): See argument in :func:`torch.tensor_split`. + + Example:: + >>> t = torch.arange(16.0).reshape(2, 2, 4) + >>> t + tensor([[[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.]], + [[ 8., 9., 10., 11.], + [12., 13., 14., 15.]]]) + >>> torch.dsplit(t, 2) + (tensor([[[ 0., 1.], + [ 4., 5.]], + [[ 8., 9.], + [12., 13.]]]), + tensor([[[ 2., 3.], + [ 6., 7.]], + [[10., 11.], + [14., 15.]]])) + + >>> torch.dsplit(t, [3, 6]) + (tensor([[[ 0., 1., 2.], + [ 4., 5., 6.]], + [[ 8., 9., 10.], + [12., 13., 14.]]]), + tensor([[[ 3.], + [ 7.]], + [[11.], + [15.]]]), + tensor([], size=(2, 2, 0))) + """ + ... +@overload +def dsplit(input: Tensor, indices: _size) -> Tuple[Tensor, ...]: + r""" + dsplit(input, indices_or_sections) -> List of Tensors + + Splits :attr:`input`, a tensor with three or more dimensions, into multiple tensors + depthwise according to :attr:`indices_or_sections`. Each split is a view of + :attr:`input`. + + This is equivalent to calling torch.tensor_split(input, indices_or_sections, dim=2) + (the split dimension is 2), except that if :attr:`indices_or_sections` is an integer + it must evenly divide the split dimension or a runtime error will be thrown. + + This function is based on NumPy's :func:`numpy.dsplit`. + + Args: + input (Tensor): tensor to split. + indices_or_sections (int or list or tuple of ints): See argument in :func:`torch.tensor_split`. + + Example:: + >>> t = torch.arange(16.0).reshape(2, 2, 4) + >>> t + tensor([[[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.]], + [[ 8., 9., 10., 11.], + [12., 13., 14., 15.]]]) + >>> torch.dsplit(t, 2) + (tensor([[[ 0., 1.], + [ 4., 5.]], + [[ 8., 9.], + [12., 13.]]]), + tensor([[[ 2., 3.], + [ 6., 7.]], + [[10., 11.], + [14., 15.]]])) + + >>> torch.dsplit(t, [3, 6]) + (tensor([[[ 0., 1., 2.], + [ 4., 5., 6.]], + [[ 8., 9., 10.], + [12., 13., 14.]]]), + tensor([[[ 3.], + [ 7.]], + [[11.], + [15.]]]), + tensor([], size=(2, 2, 0))) + """ + ... +def dstack(tensors: Union[Tuple[Tensor, ...], List[Tensor]], *, out: Optional[Tensor] = None) -> Tensor: + r""" + dstack(tensors, *, out=None) -> Tensor + + Stack tensors in sequence depthwise (along third axis). + + This is equivalent to concatenation along the third axis after 1-D and 2-D tensors have been reshaped by :func:`torch.atleast_3d`. + + Args: + tensors (sequence of Tensors): sequence of tensors to concatenate + + Keyword args: + out (Tensor, optional): the output tensor. 
+ + Example:: + + >>> a = torch.tensor([1, 2, 3]) + >>> b = torch.tensor([4, 5, 6]) + >>> torch.dstack((a,b)) + tensor([[[1, 4], + [2, 5], + [3, 6]]]) + >>> a = torch.tensor([[1],[2],[3]]) + >>> b = torch.tensor([[4],[5],[6]]) + >>> torch.dstack((a,b)) + tensor([[[1, 4]], + [[2, 5]], + [[3, 6]]]) + """ + ... +def embedding(weight: Tensor, indices: Tensor, padding_idx: Union[_int, SymInt] = -1, scale_grad_by_freq: _bool = False, sparse: _bool = False) -> Tensor: ... +@overload +def embedding_bag(weight: Tensor, indices: Tensor, offsets: Tensor, scale_grad_by_freq: _bool, mode: _int, sparse: _bool, per_sample_weights: Optional[Tensor], include_last_offset: _bool, padding_idx: Optional[_int]) -> Tuple[Tensor, Tensor, Tensor, Tensor]: ... +@overload +def embedding_bag(weight: Tensor, indices: Tensor, offsets: Tensor, scale_grad_by_freq: _bool = False, mode: _int = 0, sparse: _bool = False, per_sample_weights: Optional[Tensor] = None, include_last_offset: _bool = False) -> Tuple[Tensor, Tensor, Tensor, Tensor]: ... +def embedding_renorm_(input: Tensor, indices: Tensor, max_norm: _float, norm_type: _float) -> Tensor: ... +@overload +def empty(size: Sequence[Union[_int, SymInt]], *, memory_format: Optional[memory_format] = None, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + empty(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False, memory_format=torch.contiguous_format) -> Tensor + + Returns a tensor filled with uninitialized data. The shape of the tensor is + defined by the variable argument :attr:`size`. + + .. note:: + If :func:`torch.use_deterministic_algorithms()` and + :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to + ``True``, the output tensor is initialized to prevent any possible + nondeterministic behavior from using the data as an input to an operation. + Floating point and complex tensors are filled with NaN, and integer tensors + are filled with the maximum value. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.contiguous_format``. 
+ + Example:: + + >>> torch.empty((2,3), dtype=torch.int64) + tensor([[ 9.4064e+13, 2.8000e+01, 9.3493e+13], + [ 7.5751e+18, 7.1428e+18, 7.5955e+18]]) + """ + ... +@overload +def empty(*size: _int, memory_format: Optional[memory_format] = None, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + empty(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False, memory_format=torch.contiguous_format) -> Tensor + + Returns a tensor filled with uninitialized data. The shape of the tensor is + defined by the variable argument :attr:`size`. + + .. note:: + If :func:`torch.use_deterministic_algorithms()` and + :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to + ``True``, the output tensor is initialized to prevent any possible + nondeterministic behavior from using the data as an input to an operation. + Floating point and complex tensors are filled with NaN, and integer tensors + are filled with the maximum value. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.contiguous_format``. + + Example:: + + >>> torch.empty((2,3), dtype=torch.int64) + tensor([[ 9.4064e+13, 2.8000e+01, 9.3493e+13], + [ 7.5751e+18, 7.1428e+18, 7.5955e+18]]) + """ + ... +@overload +def empty(size: _size, *, names: Optional[Sequence[Union[str, ellipsis, None]]], memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + empty(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False, memory_format=torch.contiguous_format) -> Tensor + + Returns a tensor filled with uninitialized data. The shape of the tensor is + defined by the variable argument :attr:`size`. + + .. note:: + If :func:`torch.use_deterministic_algorithms()` and + :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to + ``True``, the output tensor is initialized to prevent any possible + nondeterministic behavior from using the data as an input to an operation. 
+ Floating point and complex tensors are filled with NaN, and integer tensors + are filled with the maximum value. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.contiguous_format``. + + Example:: + + >>> torch.empty((2,3), dtype=torch.int64) + tensor([[ 9.4064e+13, 2.8000e+01, 9.3493e+13], + [ 7.5751e+18, 7.1428e+18, 7.5955e+18]]) + """ + ... +@overload +def empty(*size: _int, names: Optional[Sequence[Union[str, ellipsis, None]]], memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + empty(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False, memory_format=torch.contiguous_format) -> Tensor + + Returns a tensor filled with uninitialized data. The shape of the tensor is + defined by the variable argument :attr:`size`. + + .. note:: + If :func:`torch.use_deterministic_algorithms()` and + :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to + ``True``, the output tensor is initialized to prevent any possible + nondeterministic behavior from using the data as an input to an operation. + Floating point and complex tensors are filled with NaN, and integer tensors + are filled with the maximum value. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. 
+ pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.contiguous_format``. + + Example:: + + >>> torch.empty((2,3), dtype=torch.int64) + tensor([[ 9.4064e+13, 2.8000e+01, 9.3493e+13], + [ 7.5751e+18, 7.1428e+18, 7.5955e+18]]) + """ + ... +def empty_like(input: Tensor, *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + empty_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor + + Returns an uninitialized tensor with the same size as :attr:`input`. + ``torch.empty_like(input)`` is equivalent to + ``torch.empty(input.size(), dtype=input.dtype, layout=input.layout, device=input.device)``. + + .. note:: + If :func:`torch.use_deterministic_algorithms()` and + :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to + ``True``, the output tensor is initialized to prevent any possible + nondeterministic behavior from using the data as an input to an operation. + Floating point and complex tensors are filled with NaN, and integer tensors + are filled with the maximum value. + + Args: + input (Tensor): the size of :attr:`input` will determine size of the output tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor. + Default: if ``None``, defaults to the dtype of :attr:`input`. + layout (:class:`torch.layout`, optional): the desired layout of returned tensor. + Default: if ``None``, defaults to the layout of :attr:`input`. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, defaults to the device of :attr:`input`. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + + Example:: + + >>> a=torch.empty((2,3), dtype=torch.int32, device = 'cuda') + >>> torch.empty_like(a) + tensor([[0, 0, 0], + [0, 0, 0]], device='cuda:0', dtype=torch.int32) + """ + ... +def empty_permuted(size: Sequence[Union[_int, SymInt]], physical_layout: _size, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + empty_permuted(size, physical_layout, *, dtype=None, layout=None, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Creates an uninitialized, non-overlapping and dense tensor with the + specified :attr:`size`, with :attr:`physical_layout` specifying how the + dimensions are physically laid out in memory (each logical dimension is listed + from outermost to innermost). :attr:`physical_layout` is a generalization + of NCHW/NHWC notation: if each dimension is assigned a number according to + what order they occur in size (N=0, C=1, H=2, W=3), then NCHW is ``(0, 1, 2, 3)`` + while NHWC is ``(0, 2, 3, 1)``. 
Equivalently, the strides of the output + tensor ``t`` are such that ``t.stride(physical_layout[i]) == contiguous_strides[i]`` + (notably, this function is *not* equivalent to ``torch.empty(size).permute(physical_layout)``). + + Unlike :func:`torch.empty_strided`, this is guaranteed to produce a dense + tensor with no overlaps. If possible, prefer using this function over + :func:`torch.empty_strided` or manual use of :func:`torch.as_strided`. + + .. note:: + If :func:`torch.use_deterministic_algorithms()` and + :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to + ``True``, the output tensor is initialized to prevent any possible + nondeterministic behavior from using the data as an input to an operation. + Floating point and complex tensors are filled with NaN, and integer tensors + are filled with the maximum value. + + Args: + size (tuple of int): the shape of the output tensor + physical_layout (tuple of int): the ordering of dimensions physically in memory + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Examples: + + >>> torch.empty((2, 3, 5, 7)).stride() + (105, 35, 7, 1) + >>> torch.empty_permuted((2, 3, 5, 7), (0, 1, 2, 3)).stride() + (105, 35, 7, 1) + >>> torch.empty((2, 3, 5, 7), memory_format=torch.channels_last).stride() + (105, 1, 21, 3) + >>> torch.empty_permuted((2, 3, 5, 7), (0, 2, 3, 1)).stride() + (105, 1, 21, 3) + >>> torch.empty_permuted((2, 3, 5, 7), (0, 2, 3, 1)).dim_order() + (0, 2, 3, 1) + """ + ... +def empty_quantized(size: _size, qtensor: Tensor, *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ... +def empty_strided(size: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + empty_strided(size, stride, *, dtype=None, layout=None, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Creates a tensor with the specified :attr:`size` and :attr:`stride` and filled with undefined data. + + .. warning:: + If the constructed tensor is "overlapped" (with multiple indices referring to the same element + in memory) its behavior is undefined. + + .. 
note:: + If :func:`torch.use_deterministic_algorithms()` and + :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to + ``True``, the output tensor is initialized to prevent any possible + nondeterministic behavior from using the data as an input to an operation. + Floating point and complex tensors are filled with NaN, and integer tensors + are filled with the maximum value. + + Args: + size (tuple of int): the shape of the output tensor + stride (tuple of int): the strides of the output tensor + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> a = torch.empty_strided((2, 3), (1, 2)) + >>> a + tensor([[8.9683e-44, 4.4842e-44, 5.1239e+07], + [0.0000e+00, 0.0000e+00, 3.0705e-41]]) + >>> a.stride() + (1, 2) + >>> a.size() + torch.Size([2, 3]) + """ + ... +@overload +def eq(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + eq(input, other, *, out=None) -> Tensor + + Computes element-wise equality + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is equal to :attr:`other` and False elsewhere + + Example:: + + >>> torch.eq(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[ True, False], + [False, True]]) + """ + ... +@overload +def eq(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + eq(input, other, *, out=None) -> Tensor + + Computes element-wise equality + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is equal to :attr:`other` and False elsewhere + + Example:: + + >>> torch.eq(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[ True, False], + [False, True]]) + """ + ... +def equal(input: Tensor, other: Tensor) -> _bool: + r""" + equal(input, other) -> bool + + ``True`` if two tensors have the same size and elements, ``False`` otherwise. + + Example:: + + >>> torch.equal(torch.tensor([1, 2]), torch.tensor([1, 2])) + True + """ + ... +def erf(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + erf(input, *, out=None) -> Tensor + + Alias for :func:`torch.special.erf`. + """ + ... 
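A hedged sketch of the empty_permuted semantics and the eq/equal distinction documented above; the stride values simply restate the docstring's own NHWC example, everything else is illustrative.

    import torch

    # empty_permuted: physical_layout fixes the memory order, the logical shape stays (2, 3, 5, 7)
    t = torch.empty_permuted((2, 3, 5, 7), (0, 2, 3, 1))
    assert t.shape == (2, 3, 5, 7)
    assert t.stride() == (105, 1, 21, 3)          # same strides as memory_format=torch.channels_last

    # eq compares element-wise and returns a bool tensor; equal returns one bool for the whole tensor
    a = torch.tensor([1, 2, 3])
    b = torch.tensor([1, 0, 3])
    print(torch.eq(a, b))                         # tensor([ True, False,  True])
    print(torch.equal(a, b))                      # False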
+def erf_(input: Tensor) -> Tensor: ... +def erfc(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + erfc(input, *, out=None) -> Tensor + + Alias for :func:`torch.special.erfc`. + """ + ... +def erfc_(input: Tensor) -> Tensor: ... +def erfinv(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + erfinv(input, *, out=None) -> Tensor + + Alias for :func:`torch.special.erfinv`. + """ + ... +def exp(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + exp(input, *, out=None) -> Tensor + + Returns a new tensor with the exponential of the elements + of the input tensor :attr:`input`. + + .. math:: + y_{i} = e^{x_{i}} + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.exp(torch.tensor([0, math.log(2.)])) + tensor([ 1., 2.]) + """ + ... +def exp2(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + exp2(input, *, out=None) -> Tensor + + Alias for :func:`torch.special.exp2`. + """ + ... +def exp2_(input: Tensor) -> Tensor: ... +def exp_(input: Tensor) -> Tensor: ... +def expand_copy(input: Tensor, size: Sequence[Union[_int, SymInt]], *, implicit: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.expand`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def expm1(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + expm1(input, *, out=None) -> Tensor + + Alias for :func:`torch.special.expm1`. + """ + ... +def expm1_(input: Tensor) -> Tensor: ... +@overload +def eye(n: Union[_int, SymInt], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + eye(n, m=None, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a 2-D tensor with ones on the diagonal and zeros elsewhere. + + Args: + n (int): the number of rows + m (int, optional): the number of columns with default being :attr:`n` + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 2-D tensor with ones on the diagonal and zeros elsewhere + + Example:: + + >>> torch.eye(3) + tensor([[ 1., 0., 0.], + [ 0., 1., 0.], + [ 0., 0., 1.]]) + """ + ... 
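A brief, non-authoritative sketch tying together a few of the element-wise stubs above: expm1 agrees with exp(x) - 1 (it exists for better accuracy near zero), and eye relates to diag as expected. Values are illustrative.

    import math
    import torch

    x = torch.tensor([0.0, math.log(2.0)])
    assert torch.allclose(torch.exp(x), torch.tensor([1.0, 2.0]))
    # expm1(x) computes exp(x) - 1, but with better precision for small x
    assert torch.allclose(torch.expm1(x), torch.exp(x) - 1.0)

    # eye(n) is the n x n identity; eye(n, m) is rectangular
    assert torch.equal(torch.eye(3), torch.diag(torch.ones(3)))
    assert torch.eye(2, 3).shape == (2, 3)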
+@overload +def eye(n: Union[_int, SymInt], m: Union[_int, SymInt], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + eye(n, m=None, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a 2-D tensor with ones on the diagonal and zeros elsewhere. + + Args: + n (int): the number of rows + m (int, optional): the number of columns with default being :attr:`n` + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 2-D tensor with ones on the diagonal and zeros elsewhere + + Example:: + + >>> torch.eye(3) + tensor([[ 1., 0., 0.], + [ 0., 1., 0.], + [ 0., 0., 1.]]) + """ + ... +def fake_quantize_per_channel_affine(input: Tensor, scale: Tensor, zero_point: Tensor, axis: _int, quant_min: _int, quant_max: _int) -> Tensor: + r""" + fake_quantize_per_channel_affine(input, scale, zero_point, axis, quant_min, quant_max) -> Tensor + + Returns a new tensor with the data in :attr:`input` fake quantized per channel using :attr:`scale`, + :attr:`zero_point`, :attr:`quant_min` and :attr:`quant_max`, across the channel specified by :attr:`axis`. + + .. math:: + \text{output} = ( + min( + \text{quant\_max}, + max( + \text{quant\_min}, + \text{std::nearby\_int}(\text{input} / \text{scale}) + \text{zero\_point} + ) + ) - \text{zero\_point} + ) \times \text{scale} + + Args: + input (Tensor): the input value(s), in ``torch.float32`` + scale (Tensor): quantization scale, per channel in ``torch.float32`` + zero_point (Tensor): quantization zero_point, per channel in ``torch.int32`` or ``torch.half`` or ``torch.float32`` + axis (int32): channel axis + quant_min (int64): lower bound of the quantized domain + quant_max (int64): upper bound of the quantized domain + + Returns: + Tensor: A newly fake_quantized per channel ``torch.float32`` tensor + + Example:: + + >>> x = torch.randn(2, 2, 2) + >>> x + tensor([[[-0.2525, -0.0466], + [ 0.3491, -0.2168]], + + [[-0.5906, 1.6258], + [ 0.6444, -0.0542]]]) + >>> scales = (torch.randn(2) + 1) * 0.05 + >>> scales + tensor([0.0475, 0.0486]) + >>> zero_points = torch.zeros(2).to(torch.int32) + >>> zero_points + tensor([0, 0]) + >>> torch.fake_quantize_per_channel_affine(x, scales, zero_points, 1, 0, 255) + tensor([[[0.0000, 0.0000], + [0.3405, 0.0000]], + + [[0.0000, 1.6134], + [0.6323, 0.0000]]]) + """ + ... 
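To make the per-channel fake-quantization formula above concrete, here is a hedged sketch that recomputes it by hand with torch.round/torch.clamp and compares against the built-in. The values are arbitrary and chosen away from .5 rounding boundaries; exact agreement is assumed rather than guaranteed, hence allclose.

    import torch

    x = torch.tensor([[0.30, -0.42],
                      [1.27,  0.05]])
    scales = torch.tensor([0.1, 0.2])               # one scale per channel along axis=1
    zero_points = torch.zeros(2, dtype=torch.int32)

    fq = torch.fake_quantize_per_channel_affine(x, scales, zero_points, 1, 0, 255)

    # out = (clamp(round(x / scale) + zero_point, quant_min, quant_max) - zero_point) * scale
    manual = (torch.clamp(torch.round(x / scales) + zero_points, 0, 255) - zero_points) * scales
    assert torch.allclose(fq, manual)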
+@overload +def fake_quantize_per_tensor_affine(input: Tensor, scale: _float, zero_point: _int, quant_min: _int, quant_max: _int) -> Tensor: + r""" + fake_quantize_per_tensor_affine(input, scale, zero_point, quant_min, quant_max) -> Tensor + + Returns a new tensor with the data in :attr:`input` fake quantized using :attr:`scale`, + :attr:`zero_point`, :attr:`quant_min` and :attr:`quant_max`. + + .. math:: + \text{output} = ( + min( + \text{quant\_max}, + max( + \text{quant\_min}, + \text{std::nearby\_int}(\text{input} / \text{scale}) + \text{zero\_point} + ) + ) - \text{zero\_point} + ) \times \text{scale} + + Args: + input (Tensor): the input value(s), ``torch.float32`` tensor + scale (double scalar or ``float32`` Tensor): quantization scale + zero_point (int64 scalar or ``int32`` Tensor): quantization zero_point + quant_min (int64): lower bound of the quantized domain + quant_max (int64): upper bound of the quantized domain + + Returns: + Tensor: A newly fake_quantized ``torch.float32`` tensor + + Example:: + + >>> x = torch.randn(4) + >>> x + tensor([ 0.0552, 0.9730, 0.3973, -1.0780]) + >>> torch.fake_quantize_per_tensor_affine(x, 0.1, 0, 0, 255) + tensor([0.1000, 1.0000, 0.4000, 0.0000]) + >>> torch.fake_quantize_per_tensor_affine(x, torch.tensor(0.1), torch.tensor(0), 0, 255) + tensor([0.1000, 1.0000, 0.4000, 0.0000]) + """ + ... +@overload +def fake_quantize_per_tensor_affine(input: Tensor, scale: Tensor, zero_point: Tensor, quant_min: _int, quant_max: _int) -> Tensor: + r""" + fake_quantize_per_tensor_affine(input, scale, zero_point, quant_min, quant_max) -> Tensor + + Returns a new tensor with the data in :attr:`input` fake quantized using :attr:`scale`, + :attr:`zero_point`, :attr:`quant_min` and :attr:`quant_max`. + + .. math:: + \text{output} = ( + min( + \text{quant\_max}, + max( + \text{quant\_min}, + \text{std::nearby\_int}(\text{input} / \text{scale}) + \text{zero\_point} + ) + ) - \text{zero\_point} + ) \times \text{scale} + + Args: + input (Tensor): the input value(s), ``torch.float32`` tensor + scale (double scalar or ``float32`` Tensor): quantization scale + zero_point (int64 scalar or ``int32`` Tensor): quantization zero_point + quant_min (int64): lower bound of the quantized domain + quant_max (int64): upper bound of the quantized domain + + Returns: + Tensor: A newly fake_quantized ``torch.float32`` tensor + + Example:: + + >>> x = torch.randn(4) + >>> x + tensor([ 0.0552, 0.9730, 0.3973, -1.0780]) + >>> torch.fake_quantize_per_tensor_affine(x, 0.1, 0, 0, 255) + tensor([0.1000, 1.0000, 0.4000, 0.0000]) + >>> torch.fake_quantize_per_tensor_affine(x, torch.tensor(0.1), torch.tensor(0), 0, 255) + tensor([0.1000, 1.0000, 0.4000, 0.0000]) + """ + ... +def fbgemm_linear_fp16_weight(input: Tensor, packed_weight: Tensor, bias: Tensor) -> Tensor: ... +def fbgemm_linear_fp16_weight_fp32_activation(input: Tensor, packed_weight: Tensor, bias: Tensor) -> Tensor: ... +def fbgemm_linear_int8_weight(input: Tensor, weight: Tensor, packed: Tensor, col_offsets: Tensor, weight_scale: Union[Number, _complex], weight_zero_point: Union[Number, _complex], bias: Tensor) -> Tensor: ... +def fbgemm_linear_int8_weight_fp32_activation(input: Tensor, weight: Tensor, packed: Tensor, col_offsets: Tensor, weight_scale: Union[Number, _complex], weight_zero_point: Union[Number, _complex], bias: Tensor) -> Tensor: ... +def fbgemm_linear_quantize_weight(input: Tensor) -> Tuple[Tensor, Tensor, _float, _int]: ... +def fbgemm_pack_gemm_matrix_fp16(input: Tensor) -> Tensor: ... 
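The two fake_quantize_per_tensor_affine overloads above differ only in whether scale/zero_point are Python scalars or tensors; a small sketch with illustrative values (patterned on the docstring example) checks that both paths agree.

    import torch

    x = torch.tensor([0.06, 0.97, 0.40, -1.08])

    a = torch.fake_quantize_per_tensor_affine(x, 0.1, 0, 0, 255)                              # scalar scale / zero_point
    b = torch.fake_quantize_per_tensor_affine(x, torch.tensor(0.1), torch.tensor(0), 0, 255)  # tensor scale / zero_point

    assert torch.allclose(a, b)
    # -1.08 maps below quant_min=0, so it is clamped to zero_point, i.e. 0.0 after dequantization
    assert a[3].item() == 0.0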
+@overload +def fbgemm_pack_quantized_matrix(input: Tensor) -> Tensor: ... +@overload +def fbgemm_pack_quantized_matrix(input: Tensor, K: _int, N: _int) -> Tensor: ... +def feature_alpha_dropout(input: Tensor, p: _float, train: _bool) -> Tensor: ... +def feature_alpha_dropout_(input: Tensor, p: _float, train: _bool) -> Tensor: ... +def feature_dropout(input: Tensor, p: _float, train: _bool) -> Tensor: ... +def feature_dropout_(input: Tensor, p: _float, train: _bool) -> Tensor: ... +@overload +def fill(input: Tensor, value: Tensor) -> Tensor: ... +@overload +def fill(input: Tensor, value: Union[Number, _complex]) -> Tensor: ... +@overload +def fill_(input: Tensor, value: Tensor) -> Tensor: ... +@overload +def fill_(input: Tensor, value: Union[Number, _complex]) -> Tensor: ... +def fix(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + fix(input, *, out=None) -> Tensor + + Alias for :func:`torch.trunc` + """ + ... +def fix_(input: Tensor) -> Tensor: ... +@overload +def flatten(input: Tensor, start_dim: _int = 0, end_dim: _int = -1) -> Tensor: + r""" + flatten(input, start_dim=0, end_dim=-1) -> Tensor + + Flattens :attr:`input` by reshaping it into a one-dimensional tensor. If :attr:`start_dim` or :attr:`end_dim` + are passed, only dimensions starting with :attr:`start_dim` and ending with :attr:`end_dim` are flattened. + The order of elements in :attr:`input` is unchanged. + + Unlike NumPy's flatten, which always copies input's data, this function may return the original object, a view, + or copy. If no dimensions are flattened, then the original object :attr:`input` is returned. Otherwise, if input can + be viewed as the flattened shape, then that view is returned. Finally, only if the input cannot be viewed as the + flattened shape is input's data copied. See :meth:`torch.Tensor.view` for details on when a view will be returned. + + .. note:: + Flattening a zero-dimensional tensor will return a one-dimensional view. + + Args: + input (Tensor): the input tensor. + start_dim (int): the first dim to flatten + end_dim (int): the last dim to flatten + + Example:: + + >>> t = torch.tensor([[[1, 2], + ... [3, 4]], + ... [[5, 6], + ... [7, 8]]]) + >>> torch.flatten(t) + tensor([1, 2, 3, 4, 5, 6, 7, 8]) + >>> torch.flatten(t, start_dim=1) + tensor([[1, 2, 3, 4], + [5, 6, 7, 8]]) + """ + ... +@overload +def flatten(input: Tensor, start_dim: _int, end_dim: _int, out_dim: Union[str, ellipsis, None]) -> Tensor: + r""" + flatten(input, start_dim=0, end_dim=-1) -> Tensor + + Flattens :attr:`input` by reshaping it into a one-dimensional tensor. If :attr:`start_dim` or :attr:`end_dim` + are passed, only dimensions starting with :attr:`start_dim` and ending with :attr:`end_dim` are flattened. + The order of elements in :attr:`input` is unchanged. + + Unlike NumPy's flatten, which always copies input's data, this function may return the original object, a view, + or copy. If no dimensions are flattened, then the original object :attr:`input` is returned. Otherwise, if input can + be viewed as the flattened shape, then that view is returned. Finally, only if the input cannot be viewed as the + flattened shape is input's data copied. See :meth:`torch.Tensor.view` for details on when a view will be returned. + + .. note:: + Flattening a zero-dimensional tensor will return a one-dimensional view. + + Args: + input (Tensor): the input tensor. + start_dim (int): the first dim to flatten + end_dim (int): the last dim to flatten + + Example:: + + >>> t = torch.tensor([[[1, 2], + ... [3, 4]], + ... 
[[5, 6], + ... [7, 8]]]) + >>> torch.flatten(t) + tensor([1, 2, 3, 4, 5, 6, 7, 8]) + >>> torch.flatten(t, start_dim=1) + tensor([[1, 2, 3, 4], + [5, 6, 7, 8]]) + """ + ... +@overload +def flatten(input: Tensor, start_dim: Union[str, ellipsis, None], end_dim: Union[str, ellipsis, None], out_dim: Union[str, ellipsis, None]) -> Tensor: + r""" + flatten(input, start_dim=0, end_dim=-1) -> Tensor + + Flattens :attr:`input` by reshaping it into a one-dimensional tensor. If :attr:`start_dim` or :attr:`end_dim` + are passed, only dimensions starting with :attr:`start_dim` and ending with :attr:`end_dim` are flattened. + The order of elements in :attr:`input` is unchanged. + + Unlike NumPy's flatten, which always copies input's data, this function may return the original object, a view, + or copy. If no dimensions are flattened, then the original object :attr:`input` is returned. Otherwise, if input can + be viewed as the flattened shape, then that view is returned. Finally, only if the input cannot be viewed as the + flattened shape is input's data copied. See :meth:`torch.Tensor.view` for details on when a view will be returned. + + .. note:: + Flattening a zero-dimensional tensor will return a one-dimensional view. + + Args: + input (Tensor): the input tensor. + start_dim (int): the first dim to flatten + end_dim (int): the last dim to flatten + + Example:: + + >>> t = torch.tensor([[[1, 2], + ... [3, 4]], + ... [[5, 6], + ... [7, 8]]]) + >>> torch.flatten(t) + tensor([1, 2, 3, 4, 5, 6, 7, 8]) + >>> torch.flatten(t, start_dim=1) + tensor([[1, 2, 3, 4], + [5, 6, 7, 8]]) + """ + ... +@overload +def flatten(input: Tensor, dims: Sequence[Union[str, ellipsis, None]], out_dim: Union[str, ellipsis, None]) -> Tensor: + r""" + flatten(input, start_dim=0, end_dim=-1) -> Tensor + + Flattens :attr:`input` by reshaping it into a one-dimensional tensor. If :attr:`start_dim` or :attr:`end_dim` + are passed, only dimensions starting with :attr:`start_dim` and ending with :attr:`end_dim` are flattened. + The order of elements in :attr:`input` is unchanged. + + Unlike NumPy's flatten, which always copies input's data, this function may return the original object, a view, + or copy. If no dimensions are flattened, then the original object :attr:`input` is returned. Otherwise, if input can + be viewed as the flattened shape, then that view is returned. Finally, only if the input cannot be viewed as the + flattened shape is input's data copied. See :meth:`torch.Tensor.view` for details on when a view will be returned. + + .. note:: + Flattening a zero-dimensional tensor will return a one-dimensional view. + + Args: + input (Tensor): the input tensor. + start_dim (int): the first dim to flatten + end_dim (int): the last dim to flatten + + Example:: + + >>> t = torch.tensor([[[1, 2], + ... [3, 4]], + ... [[5, 6], + ... [7, 8]]]) + >>> torch.flatten(t) + tensor([1, 2, 3, 4, 5, 6, 7, 8]) + >>> torch.flatten(t, start_dim=1) + tensor([[1, 2, 3, 4], + [5, 6, 7, 8]]) + """ + ... +def flip(input: Tensor, dims: _size) -> Tensor: + r""" + flip(input, dims) -> Tensor + + Reverse the order of an n-D tensor along given axis in dims. + + .. note:: + `torch.flip` makes a copy of :attr:`input`'s data. This is different from NumPy's `np.flip`, + which returns a view in constant time. Since copying a tensor's data is more work than viewing that data, + `torch.flip` is expected to be slower than `np.flip`. + + Args: + input (Tensor): the input tensor. 
+ dims (a list or tuple): axis to flip on + + Example:: + + >>> x = torch.arange(8).view(2, 2, 2) + >>> x + tensor([[[ 0, 1], + [ 2, 3]], + + [[ 4, 5], + [ 6, 7]]]) + >>> torch.flip(x, [0, 1]) + tensor([[[ 6, 7], + [ 4, 5]], + + [[ 2, 3], + [ 0, 1]]]) + """ + ... +def fliplr(input: Tensor) -> Tensor: + r""" + fliplr(input) -> Tensor + + Flip tensor in the left/right direction, returning a new tensor. + + Flip the entries in each row in the left/right direction. + Columns are preserved, but appear in a different order than before. + + Note: + Requires the tensor to be at least 2-D. + + .. note:: + `torch.fliplr` makes a copy of :attr:`input`'s data. This is different from NumPy's `np.fliplr`, + which returns a view in constant time. Since copying a tensor's data is more work than viewing that data, + `torch.fliplr` is expected to be slower than `np.fliplr`. + + Args: + input (Tensor): Must be at least 2-dimensional. + + Example:: + + >>> x = torch.arange(4).view(2, 2) + >>> x + tensor([[0, 1], + [2, 3]]) + >>> torch.fliplr(x) + tensor([[1, 0], + [3, 2]]) + """ + ... +def flipud(input: Tensor) -> Tensor: + r""" + flipud(input) -> Tensor + + Flip tensor in the up/down direction, returning a new tensor. + + Flip the entries in each column in the up/down direction. + Rows are preserved, but appear in a different order than before. + + Note: + Requires the tensor to be at least 1-D. + + .. note:: + `torch.flipud` makes a copy of :attr:`input`'s data. This is different from NumPy's `np.flipud`, + which returns a view in constant time. Since copying a tensor's data is more work than viewing that data, + `torch.flipud` is expected to be slower than `np.flipud`. + + Args: + input (Tensor): Must be at least 1-dimensional. + + Example:: + + >>> x = torch.arange(4).view(2, 2) + >>> x + tensor([[0, 1], + [2, 3]]) + >>> torch.flipud(x) + tensor([[2, 3], + [0, 1]]) + """ + ... +@overload +def float_power(input: Tensor, exponent: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + float_power(input, exponent, *, out=None) -> Tensor + + Raises :attr:`input` to the power of :attr:`exponent`, elementwise, in double precision. + If neither input is complex returns a ``torch.float64`` tensor, + and if one or more inputs is complex returns a ``torch.complex128`` tensor. + + .. note:: + This function always computes in double precision, unlike :func:`torch.pow`, + which implements more typical :ref:`type promotion `. + This is useful when the computation needs to be performed in a wider or more precise dtype, + or the results of the computation may contain fractional values not representable in the input dtypes, + like when an integer base is raised to a negative integer exponent. + + Args: + input (Tensor or Number): the base value(s) + exponent (Tensor or Number): the exponent value(s) + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randint(10, (4,)) + >>> a + tensor([6, 4, 7, 1]) + >>> torch.float_power(a, 2) + tensor([36., 16., 49., 1.], dtype=torch.float64) + + >>> a = torch.arange(1, 5) + >>> a + tensor([ 1, 2, 3, 4]) + >>> exp = torch.tensor([2, -3, 4, -5]) + >>> exp + tensor([ 2, -3, 4, -5]) + >>> torch.float_power(a, exp) + tensor([1.0000e+00, 1.2500e-01, 8.1000e+01, 9.7656e-04], dtype=torch.float64) + """ + ... 
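A small sketch of the view/copy behaviour spelled out in the flatten and flip docstrings above; the data_ptr check assumes a contiguous input, which is the case where flatten can return a view.

    import torch

    t = torch.arange(8).reshape(2, 2, 2)

    # flatten of a contiguous tensor returns a view sharing the same storage ...
    flat = torch.flatten(t)
    assert flat.data_ptr() == t.data_ptr()

    # ... whereas flip always copies, so writing to the result does not touch the input
    f = torch.flip(t, [0])
    f[0, 0, 0] = 99
    assert t[1, 0, 0].item() == 4

    # fliplr / flipud on a 2-D tensor are just flips along dim 1 / dim 0
    x = torch.arange(4).reshape(2, 2)
    assert torch.equal(torch.fliplr(x), torch.flip(x, [1]))
    assert torch.equal(torch.flipud(x), torch.flip(x, [0]))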
+@overload +def float_power(self: Union[Number, _complex], exponent: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + float_power(input, exponent, *, out=None) -> Tensor + + Raises :attr:`input` to the power of :attr:`exponent`, elementwise, in double precision. + If neither input is complex returns a ``torch.float64`` tensor, + and if one or more inputs is complex returns a ``torch.complex128`` tensor. + + .. note:: + This function always computes in double precision, unlike :func:`torch.pow`, + which implements more typical :ref:`type promotion `. + This is useful when the computation needs to be performed in a wider or more precise dtype, + or the results of the computation may contain fractional values not representable in the input dtypes, + like when an integer base is raised to a negative integer exponent. + + Args: + input (Tensor or Number): the base value(s) + exponent (Tensor or Number): the exponent value(s) + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randint(10, (4,)) + >>> a + tensor([6, 4, 7, 1]) + >>> torch.float_power(a, 2) + tensor([36., 16., 49., 1.], dtype=torch.float64) + + >>> a = torch.arange(1, 5) + >>> a + tensor([ 1, 2, 3, 4]) + >>> exp = torch.tensor([2, -3, 4, -5]) + >>> exp + tensor([ 2, -3, 4, -5]) + >>> torch.float_power(a, exp) + tensor([1.0000e+00, 1.2500e-01, 8.1000e+01, 9.7656e-04], dtype=torch.float64) + """ + ... +@overload +def float_power(input: Tensor, exponent: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + float_power(input, exponent, *, out=None) -> Tensor + + Raises :attr:`input` to the power of :attr:`exponent`, elementwise, in double precision. + If neither input is complex returns a ``torch.float64`` tensor, + and if one or more inputs is complex returns a ``torch.complex128`` tensor. + + .. note:: + This function always computes in double precision, unlike :func:`torch.pow`, + which implements more typical :ref:`type promotion `. + This is useful when the computation needs to be performed in a wider or more precise dtype, + or the results of the computation may contain fractional values not representable in the input dtypes, + like when an integer base is raised to a negative integer exponent. + + Args: + input (Tensor or Number): the base value(s) + exponent (Tensor or Number): the exponent value(s) + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randint(10, (4,)) + >>> a + tensor([6, 4, 7, 1]) + >>> torch.float_power(a, 2) + tensor([36., 16., 49., 1.], dtype=torch.float64) + + >>> a = torch.arange(1, 5) + >>> a + tensor([ 1, 2, 3, 4]) + >>> exp = torch.tensor([2, -3, 4, -5]) + >>> exp + tensor([ 2, -3, 4, -5]) + >>> torch.float_power(a, exp) + tensor([1.0000e+00, 1.2500e-01, 8.1000e+01, 9.7656e-04], dtype=torch.float64) + """ + ... +def floor(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + floor(input, *, out=None) -> Tensor + + Returns a new tensor with the floor of the elements of :attr:`input`, + the largest integer less than or equal to each element. + + For integer inputs, follows the array-api convention of returning a + copy of the input tensor. + + .. math:: + \text{out}_{i} = \left\lfloor \text{input}_{i} \right\rfloor + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. 
+ + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-0.8166, 1.5308, -0.2530, -0.2091]) + >>> torch.floor(a) + tensor([-1., 1., -1., -1.]) + """ + ... +def floor_(input: Tensor) -> Tensor: ... +def floor_divide(input: Union[Tensor, Number], other: Union[Tensor, Number], *, out: Optional[Tensor] = None) -> Tensor: + r""" + floor_divide(input, other, *, out=None) -> Tensor + + .. note:: + + Before PyTorch 1.13 :func:`torch.floor_divide` incorrectly performed + truncation division. To restore the previous behavior use + :func:`torch.div` with ``rounding_mode='trunc'``. + + Computes :attr:`input` divided by :attr:`other`, elementwise, and floors + the result. + + .. math:: + \text{{out}}_i = \text{floor} \left( \frac{{\text{{input}}_i}}{{\text{{other}}_i}} \right) + + + + Supports broadcasting to a common shape, type promotion, and integer and float inputs. + + Args: + input (Tensor or Number): the dividend + other (Tensor or Number): the divisor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([4.0, 3.0]) + >>> b = torch.tensor([2.0, 2.0]) + >>> torch.floor_divide(a, b) + tensor([2.0, 1.0]) + >>> torch.floor_divide(a, 1.4) + tensor([2.0, 2.0]) + """ + ... +def fmax(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + fmax(input, other, *, out=None) -> Tensor + + Computes the element-wise maximum of :attr:`input` and :attr:`other`. + + This is like :func:`torch.maximum` except it handles NaNs differently: + if exactly one of the two elements being compared is a NaN then the non-NaN element is taken as the maximum. + Only if both elements are NaN is NaN propagated. + + This function is a wrapper around C++'s ``std::fmax`` and is similar to NumPy's ``fmax`` function. + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer and floating-point inputs. + + Args: + input (Tensor): the input tensor. + other (Tensor): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([9.7, float('nan'), 3.1, float('nan')]) + >>> b = torch.tensor([-2.2, 0.5, float('nan'), float('nan')]) + >>> torch.fmax(a, b) + tensor([9.7000, 0.5000, 3.1000, nan]) + """ + ... +def fmin(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + fmin(input, other, *, out=None) -> Tensor + + Computes the element-wise minimum of :attr:`input` and :attr:`other`. + + This is like :func:`torch.minimum` except it handles NaNs differently: + if exactly one of the two elements being compared is a NaN then the non-NaN element is taken as the minimum. + Only if both elements are NaN is NaN propagated. + + This function is a wrapper around C++'s ``std::fmin`` and is similar to NumPy's ``fmin`` function. + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer and floating-point inputs. + + Args: + input (Tensor): the input tensor. + other (Tensor): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([2.2, float('nan'), 2.1, float('nan')]) + >>> b = torch.tensor([-9.3, 0.1, float('nan'), float('nan')]) + >>> torch.fmin(a, b) + tensor([-9.3000, 0.1000, 2.1000, nan]) + """ + ... +@overload +def fmod(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + fmod(input, other, *, out=None) -> Tensor + + Applies C++'s `std::fmod `_ entrywise. 
+ The result has the same sign as the dividend :attr:`input` and its absolute value + is less than that of :attr:`other`. + + This function may be defined in terms of :func:`torch.div` as + + .. code:: python + + torch.fmod(a, b) == a - a.div(b, rounding_mode="trunc") * b + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer and float inputs. + + .. note:: + + When the divisor is zero, returns ``NaN`` for floating point dtypes + on both CPU and GPU; raises ``RuntimeError`` for integer division by + zero on CPU; Integer division by zero on GPU may return any value. + + .. note:: + + Complex inputs are not supported. In some cases, it is not mathematically + possible to satisfy the definition of a modulo operation with complex numbers. + + .. seealso:: + + :func:`torch.remainder` which implements Python's modulus operator. + This one is defined using division rounding down the result. + + Args: + input (Tensor): the dividend + other (Tensor or Scalar): the divisor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.fmod(torch.tensor([-3., -2, -1, 1, 2, 3]), 2) + tensor([-1., -0., -1., 1., 0., 1.]) + >>> torch.fmod(torch.tensor([1, 2, 3, 4, 5]), -1.5) + tensor([1.0000, 0.5000, 0.0000, 1.0000, 0.5000]) + """ + ... +@overload +def fmod(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + fmod(input, other, *, out=None) -> Tensor + + Applies C++'s `std::fmod `_ entrywise. + The result has the same sign as the dividend :attr:`input` and its absolute value + is less than that of :attr:`other`. + + This function may be defined in terms of :func:`torch.div` as + + .. code:: python + + torch.fmod(a, b) == a - a.div(b, rounding_mode="trunc") * b + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer and float inputs. + + .. note:: + + When the divisor is zero, returns ``NaN`` for floating point dtypes + on both CPU and GPU; raises ``RuntimeError`` for integer division by + zero on CPU; Integer division by zero on GPU may return any value. + + .. note:: + + Complex inputs are not supported. In some cases, it is not mathematically + possible to satisfy the definition of a modulo operation with complex numbers. + + .. seealso:: + + :func:`torch.remainder` which implements Python's modulus operator. + This one is defined using division rounding down the result. + + Args: + input (Tensor): the dividend + other (Tensor or Scalar): the divisor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.fmod(torch.tensor([-3., -2, -1, 1, 2, 3]), 2) + tensor([-1., -0., -1., 1., 0., 1.]) + >>> torch.fmod(torch.tensor([1, 2, 3, 4, 5]), -1.5) + tensor([1.0000, 0.5000, 0.0000, 1.0000, 0.5000]) + """ + ... +def frac(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + frac(input, *, out=None) -> Tensor + + Computes the fractional portion of each element in :attr:`input`. + + .. math:: + \text{out}_{i} = \text{input}_{i} - \left\lfloor |\text{input}_{i}| \right\rfloor * \operatorname{sgn}(\text{input}_{i}) + + Example:: + + >>> torch.frac(torch.tensor([1, 2.5, -3.2])) + tensor([ 0.0000, 0.5000, -0.2000]) + """ + ... +def frac_(input: Tensor) -> Tensor: ... 
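+# An illustrative contrast between torch.fmod and torch.remainder, assuming default
+# CPU float tensors: fmod truncates the quotient (the result's sign follows the
+# dividend), while remainder floors it (the sign follows the divisor), matching the
+# identity a - a.div(b, rounding_mode="trunc") * b quoted in the docstring above.
+# >>> a = torch.tensor([-3., -2., -1., 1., 2., 3.])
+# >>> torch.fmod(a, 2)
+# tensor([-1., -0., -1.,  1.,  0.,  1.])
+# >>> torch.remainder(a, 2)
+# tensor([1., 0., 1., 1., 0., 1.])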
+def frexp(input: Tensor, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.frexp: + r""" + frexp(input, *, out=None) -> (Tensor mantissa, Tensor exponent) + + Decomposes :attr:`input` into mantissa and exponent tensors + such that :math:`\text{input} = \text{mantissa} \times 2^{\text{exponent}}`. + + The range of mantissa is the open interval (-1, 1). + + Supports float inputs. + + Args: + input (Tensor): the input tensor + + + Keyword args: + out (tuple, optional): the output tensors + + Example:: + + >>> x = torch.arange(9.) + >>> mantissa, exponent = torch.frexp(x) + >>> mantissa + tensor([0.0000, 0.5000, 0.5000, 0.7500, 0.5000, 0.6250, 0.7500, 0.8750, 0.5000]) + >>> exponent + tensor([0, 1, 2, 2, 3, 3, 3, 3, 4], dtype=torch.int32) + >>> torch.ldexp(mantissa, exponent) + tensor([0., 1., 2., 3., 4., 5., 6., 7., 8.]) + """ + ... +def frobenius_norm(input: Tensor, dim: Union[_int, _size], keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: ... +def from_file(filename: str, shared: Optional[_bool] = None, size: Optional[_int] = 0, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + from_file(filename, shared=None, size=0, *, dtype=None, layout=None, device=None, pin_memory=False) + + Creates a CPU tensor with a storage backed by a memory-mapped file. + + If ``shared`` is True, then memory is shared between processes. All changes are written to the file. + If ``shared`` is False, then changes to the tensor do not affect the file. + + ``size`` is the number of elements in the Tensor. If ``shared`` is ``False``, then the file must contain + at least ``size * sizeof(dtype)`` bytes. If ``shared`` is ``True`` the file will be created if needed. + + .. note:: + Only CPU tensors can be mapped to files. + + .. note:: + For now, tensors with storages backed by a memory-mapped file cannot be created in pinned memory. + + + Args: + filename (str): file name to map + shared (bool): whether to share memory (whether ``MAP_SHARED`` or ``MAP_PRIVATE`` is passed to the + underlying `mmap(2) call `_) + size (int): number of elements in the tensor + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + >>> t = torch.randn(2, 5, dtype=torch.float64) + >>> t.numpy().tofile('storage.pt') + >>> t_mapped = torch.from_file('storage.pt', shared=False, size=10, dtype=torch.float64) + """ + ... +def from_numpy(ndarray) -> Tensor: + r""" + from_numpy(ndarray) -> Tensor + + Creates a :class:`Tensor` from a :class:`numpy.ndarray`. + + The returned tensor and :attr:`ndarray` share the same memory. Modifications to + the tensor will be reflected in the :attr:`ndarray` and vice versa. 
The returned + tensor is not resizable. + + It currently accepts :attr:`ndarray` with dtypes of ``numpy.float64``, + ``numpy.float32``, ``numpy.float16``, ``numpy.complex64``, ``numpy.complex128``, + ``numpy.int64``, ``numpy.int32``, ``numpy.int16``, ``numpy.int8``, ``numpy.uint8``, + and ``bool``. + + .. warning:: + Writing to a tensor created from a read-only NumPy array is not supported and will result in undefined behavior. + + Example:: + + >>> a = numpy.array([1, 2, 3]) + >>> t = torch.from_numpy(a) + >>> t + tensor([ 1, 2, 3]) + >>> t[0] = -1 + >>> a + array([-1, 2, 3]) + """ + ... +def frombuffer(buffer: Any, *, dtype: _dtype, count: int = -1, offset: int = 0, requires_grad: _bool = False) -> Tensor: + r""" + frombuffer(buffer, *, dtype, count=-1, offset=0, requires_grad=False) -> Tensor + + Creates a 1-dimensional :class:`Tensor` from an object that implements + the Python buffer protocol. + + Skips the first :attr:`offset` bytes in the buffer, and interprets the rest of + the raw bytes as a 1-dimensional tensor of type :attr:`dtype` with :attr:`count` + elements. + + Note that either of the following must be true: + + 1. :attr:`count` is a positive non-zero number, and the total number of bytes + in the buffer is more than :attr:`offset` plus :attr:`count` times the size + (in bytes) of :attr:`dtype`. + + 2. :attr:`count` is negative, and the length (number of bytes) of the buffer + subtracted by the :attr:`offset` is a multiple of the size (in bytes) of + :attr:`dtype`. + + The returned tensor and buffer share the same memory. Modifications to + the tensor will be reflected in the buffer and vice versa. The returned + tensor is not resizable. + + .. note:: + This function increments the reference count for the object that + owns the shared memory. Therefore, such memory will not be deallocated + before the returned tensor goes out of scope. + + .. warning:: + This function's behavior is undefined when passed an object implementing + the buffer protocol whose data is not on the CPU. Doing so is likely to + cause a segmentation fault. + + .. warning:: + This function does not try to infer the :attr:`dtype` (hence, it is not + optional). Passing a different :attr:`dtype` than its source may result + in unexpected behavior. + + Args: + buffer (object): a Python object that exposes the buffer interface. + + Keyword args: + dtype (:class:`torch.dtype`): the desired data type of returned tensor. + count (int, optional): the number of desired elements to be read. + If negative, all the elements (until the end of the buffer) will be + read. Default: -1. + offset (int, optional): the number of bytes to skip at the start of + the buffer. Default: 0. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> import array + >>> a = array.array('i', [1, 2, 3]) + >>> t = torch.frombuffer(a, dtype=torch.int32) + >>> t + tensor([ 1, 2, 3]) + >>> t[0] = -1 + >>> a + array([-1, 2, 3]) + + >>> # Interprets the signed char bytes as 32-bit integers. + >>> # Each 4 signed char elements will be interpreted as + >>> # 1 signed 32-bit integer. + >>> import array + >>> a = array.array('b', [-1, 0, 0, 0]) + >>> torch.frombuffer(a, dtype=torch.int32) + tensor([255], dtype=torch.int32) + """ + ... 
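+# An illustrative sketch of the count/offset arithmetic described above, assuming a
+# CPU buffer: offset skips whole bytes, count is measured in elements of dtype, and
+# the resulting tensor shares memory with the buffer.
+# >>> import array
+# >>> buf = array.array('h', [10, 20, 30, 40, 50])   # five int16 values, 10 bytes
+# >>> t = torch.frombuffer(buf, dtype=torch.int16, count=2, offset=4)
+# >>> t                                               # skips 10 and 20, reads two elements
+# tensor([30, 40], dtype=torch.int16)
+# >>> buf[2] = -1                                     # writes through the shared memory
+# >>> t
+# tensor([-1, 40], dtype=torch.int16)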
+@overload +def full(size: _size, fill_value: Union[Number, _complex], *, out: Optional[Tensor] = None, layout: _layout = strided, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + full(size, fill_value, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Creates a tensor of size :attr:`size` filled with :attr:`fill_value`. The + tensor's dtype is inferred from :attr:`fill_value`. + + Args: + size (int...): a list, tuple, or :class:`torch.Size` of integers defining the + shape of the output tensor. + fill_value (Scalar): the value to fill the output tensor with. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.full((2, 3), 3.141592) + tensor([[ 3.1416, 3.1416, 3.1416], + [ 3.1416, 3.1416, 3.1416]]) + """ + ... +@overload +def full(size: _size, fill_value: Union[Number, _complex], *, names: List[Union[str, None]], layout: _layout = strided, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + full(size, fill_value, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Creates a tensor of size :attr:`size` filled with :attr:`fill_value`. The + tensor's dtype is inferred from :attr:`fill_value`. + + Args: + size (int...): a list, tuple, or :class:`torch.Size` of integers defining the + shape of the output tensor. + fill_value (Scalar): the value to fill the output tensor with. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.full((2, 3), 3.141592) + tensor([[ 3.1416, 3.1416, 3.1416], + [ 3.1416, 3.1416, 3.1416]]) + """ + ... 
+@overload +def full(size: Sequence[Union[_int, SymInt]], fill_value: Union[Number, _complex], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + full(size, fill_value, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Creates a tensor of size :attr:`size` filled with :attr:`fill_value`. The + tensor's dtype is inferred from :attr:`fill_value`. + + Args: + size (int...): a list, tuple, or :class:`torch.Size` of integers defining the + shape of the output tensor. + fill_value (Scalar): the value to fill the output tensor with. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.full((2, 3), 3.141592) + tensor([[ 3.1416, 3.1416, 3.1416], + [ 3.1416, 3.1416, 3.1416]]) + """ + ... +@overload +def full(size: _size, fill_value: Union[Number, _complex], *, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + full(size, fill_value, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Creates a tensor of size :attr:`size` filled with :attr:`fill_value`. The + tensor's dtype is inferred from :attr:`fill_value`. + + Args: + size (int...): a list, tuple, or :class:`torch.Size` of integers defining the + shape of the output tensor. + fill_value (Scalar): the value to fill the output tensor with. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.full((2, 3), 3.141592) + tensor([[ 3.1416, 3.1416, 3.1416], + [ 3.1416, 3.1416, 3.1416]]) + """ + ... 
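+# A small illustrative sketch of the dtype inference mentioned above ("the tensor's
+# dtype is inferred from fill_value"), assuming default settings (float32 default dtype):
+# >>> torch.full((2, 3), 7)                      # integer fill value -> int64 tensor
+# tensor([[7, 7, 7],
+#         [7, 7, 7]])
+# >>> torch.full((2,), True)                     # bool fill value -> bool tensor
+# tensor([True, True])
+# >>> torch.full((2,), 1.5, dtype=torch.float64) # explicit dtype overrides inference
+# tensor([1.5000, 1.5000], dtype=torch.float64)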
+def full_like(input: Tensor, fill_value: Union[Number, _complex], *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + full_like(input, fill_value, \*, dtype=None, layout=torch.strided, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor + + Returns a tensor with the same size as :attr:`input` filled with :attr:`fill_value`. + ``torch.full_like(input, fill_value)`` is equivalent to + ``torch.full(input.size(), fill_value, dtype=input.dtype, layout=input.layout, device=input.device)``. + + Args: + input (Tensor): the size of :attr:`input` will determine size of the output tensor. + fill_value: the number to fill the output tensor with. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor. + Default: if ``None``, defaults to the dtype of :attr:`input`. + layout (:class:`torch.layout`, optional): the desired layout of returned tensor. + Default: if ``None``, defaults to the layout of :attr:`input`. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, defaults to the device of :attr:`input`. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + """ + ... +def fused_moving_avg_obs_fake_quant(input: Tensor, observer_on: Tensor, fake_quant_on: Tensor, running_min: Tensor, running_max: Tensor, scale: Tensor, zero_point: Tensor, averaging_const: _float, quant_min: _int, quant_max: _int, ch_axis: _int, per_row_fake_quant: _bool = False, symmetric_quant: _bool = False) -> Tensor: ... +@overload +def gather(input: Tensor, dim: _int, index: Tensor, *, sparse_grad: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + gather(input, dim, index, *, sparse_grad=False, out=None) -> Tensor + + Gathers values along an axis specified by `dim`. + + For a 3-D tensor the output is specified by:: + + out[i][j][k] = input[index[i][j][k]][j][k] # if dim == 0 + out[i][j][k] = input[i][index[i][j][k]][k] # if dim == 1 + out[i][j][k] = input[i][j][index[i][j][k]] # if dim == 2 + + :attr:`input` and :attr:`index` must have the same number of dimensions. + It is also required that ``index.size(d) <= input.size(d)`` for all + dimensions ``d != dim``. :attr:`out` will have the same shape as :attr:`index`. + Note that ``input`` and ``index`` do not broadcast against each other. + + Args: + input (Tensor): the source tensor + dim (int): the axis along which to index + index (LongTensor): the indices of elements to gather + + Keyword arguments: + sparse_grad (bool, optional): If ``True``, gradient w.r.t. :attr:`input` will be a sparse tensor. + out (Tensor, optional): the destination tensor + + Example:: + + >>> t = torch.tensor([[1, 2], [3, 4]]) + >>> torch.gather(t, 1, torch.tensor([[0, 0], [1, 0]])) + tensor([[ 1, 1], + [ 4, 3]]) + """ + ... +@overload +def gather(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, *, sparse_grad: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + gather(input, dim, index, *, sparse_grad=False, out=None) -> Tensor + + Gathers values along an axis specified by `dim`. 
+ + For a 3-D tensor the output is specified by:: + + out[i][j][k] = input[index[i][j][k]][j][k] # if dim == 0 + out[i][j][k] = input[i][index[i][j][k]][k] # if dim == 1 + out[i][j][k] = input[i][j][index[i][j][k]] # if dim == 2 + + :attr:`input` and :attr:`index` must have the same number of dimensions. + It is also required that ``index.size(d) <= input.size(d)`` for all + dimensions ``d != dim``. :attr:`out` will have the same shape as :attr:`index`. + Note that ``input`` and ``index`` do not broadcast against each other. + + Args: + input (Tensor): the source tensor + dim (int): the axis along which to index + index (LongTensor): the indices of elements to gather + + Keyword arguments: + sparse_grad (bool, optional): If ``True``, gradient w.r.t. :attr:`input` will be a sparse tensor. + out (Tensor, optional): the destination tensor + + Example:: + + >>> t = torch.tensor([[1, 2], [3, 4]]) + >>> torch.gather(t, 1, torch.tensor([[0, 0], [1, 0]])) + tensor([[ 1, 1], + [ 4, 3]]) + """ + ... +def gcd(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + gcd(input, other, *, out=None) -> Tensor + + Computes the element-wise greatest common divisor (GCD) of :attr:`input` and :attr:`other`. + + Both :attr:`input` and :attr:`other` must have integer types. + + .. note:: + This defines :math:`gcd(0, 0) = 0`. + + Args: + input (Tensor): the input tensor. + other (Tensor): the second input tensor + + Keyword arguments: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([5, 10, 15]) + >>> b = torch.tensor([3, 4, 5]) + >>> torch.gcd(a, b) + tensor([1, 2, 5]) + >>> c = torch.tensor([3]) + >>> torch.gcd(a, c) + tensor([1, 1, 3]) + """ + ... +def gcd_(input: Tensor, other: Tensor) -> Tensor: ... +@overload +def ge(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + ge(input, other, *, out=None) -> Tensor + + Computes :math:`\text{input} \geq \text{other}` element-wise. + + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is greater than or equal to :attr:`other` and False elsewhere + + Example:: + + >>> torch.ge(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[True, True], [False, True]]) + """ + ... +@overload +def ge(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + ge(input, other, *, out=None) -> Tensor + + Computes :math:`\text{input} \geq \text{other}` element-wise. + + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is greater than or equal to :attr:`other` and False elsewhere + + Example:: + + >>> torch.ge(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[True, True], [False, True]]) + """ + ... 
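+# An illustrative sketch of the gather indexing rule quoted above (for dim=1,
+# out[i][j] = input[i][index[i][j]]; for dim=0, out[i][j] = input[index[i][j]][j]),
+# assuming small CPU tensors:
+# >>> t = torch.tensor([[1, 2], [3, 4]])
+# >>> torch.gather(t, 1, torch.tensor([[0, 0], [1, 0]]))   # pick columns per row
+# tensor([[1, 1],
+#         [4, 3]])
+# >>> torch.gather(t, 0, torch.tensor([[0, 1], [1, 0]]))   # pick rows per column
+# tensor([[1, 4],
+#         [3, 2]])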
+def geqrf(input: Tensor, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.geqrf: + r""" + geqrf(input, *, out=None) -> (Tensor, Tensor) + + This is a low-level function for calling LAPACK's geqrf directly. This function + returns a namedtuple (a, tau) as defined in `LAPACK documentation for geqrf`_ . + + Computes a QR decomposition of :attr:`input`. + Both `Q` and `R` matrices are stored in the same output tensor `a`. + The elements of `R` are stored on and above the diagonal. + Elementary reflectors (or Householder vectors) implicitly defining matrix `Q` + are stored below the diagonal. + The results of this function can be used together with :func:`torch.linalg.householder_product` + to obtain the `Q` matrix or + with :func:`torch.ormqr`, which uses an implicit representation of the `Q` matrix, + for an efficient matrix-matrix multiplication. + + See `LAPACK documentation for geqrf`_ for further details. + + .. note:: + See also :func:`torch.linalg.qr`, which computes Q and R matrices, and :func:`torch.linalg.lstsq` + with the ``driver="gels"`` option for a function that can solve matrix equations using a QR decomposition. + + Args: + input (Tensor): the input matrix + + Keyword args: + out (tuple, optional): the output tuple of (Tensor, Tensor). Ignored if `None`. Default: `None`. + + .. _LAPACK documentation for geqrf: + http://www.netlib.org/lapack/explore-html/df/dc5/group__variants_g_ecomputational_ga3766ea903391b5cf9008132f7440ec7b.html + """ + ... +def ger(input: Tensor, vec2: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + ger(input, vec2, *, out=None) -> Tensor + + Alias of :func:`torch.outer`. + + .. warning:: + This function is deprecated and will be removed in a future PyTorch release. + Use :func:`torch.outer` instead. + """ + ... +def get_default_dtype() -> _dtype: + r""" + get_default_dtype() -> torch.dtype + + Get the current default floating point :class:`torch.dtype`. + + Example:: + + >>> torch.get_default_dtype() # initial default for floating point is torch.float32 + torch.float32 + >>> torch.set_default_dtype(torch.float64) + >>> torch.get_default_dtype() # default is now changed to torch.float64 + torch.float64 + """ + ... +def get_num_interop_threads() -> _int: + r""" + get_num_interop_threads() -> int + + Returns the number of threads used for inter-op parallelism on CPU + (e.g. in JIT interpreter) + """ + ... +def get_num_threads() -> _int: + r""" + get_num_threads() -> int + + Returns the number of threads used for parallelizing CPU operations + """ + ... +@overload +def gradient(input: Tensor, *, spacing: Optional[Union[Number, _complex]] = None, dim: Optional[_int] = None, edge_order: _int = 1) -> Tuple[Tensor, ...]: + r""" + gradient(input, *, spacing=1, dim=None, edge_order=1) -> List of Tensors + + Estimates the gradient of a function :math:`g : \mathbb{R}^n \rightarrow \mathbb{R}` in + one or more dimensions using the `second-order accurate central differences method + `_ and + either first or second order estimates at the boundaries. + + The gradient of :math:`g` is estimated using samples. By default, when :attr:`spacing` is not + specified, the samples are entirely described by :attr:`input`, and the mapping of input coordinates + to an output is the same as the tensor's mapping of indices to values. For example, for a three-dimensional + :attr:`input` the function described is :math:`g : \mathbb{R}^3 \rightarrow \mathbb{R}`, and + :math:`g(1, 2, 3)\ == input[1, 2, 3]`. 
+ + When :attr:`spacing` is specified, it modifies the relationship between :attr:`input` and input coordinates. + This is detailed in the "Keyword Arguments" section below. + + The gradient is estimated by estimating each partial derivative of :math:`g` independently. This estimation is + accurate if :math:`g` is in :math:`C^3` (it has at least 3 continuous derivatives), and the estimation can be + improved by providing closer samples. Mathematically, the value at each interior point of a partial derivative + is estimated using `Taylor's theorem with remainder `_. + Letting :math:`x` be an interior point with :math:`x-h_l` and :math:`x+h_r` be points neighboring + it to the left and right respectively, :math:`f(x+h_r)` and :math:`f(x-h_l)` can be estimated using: + + .. math:: + \begin{aligned} + f(x+h_r) = f(x) + h_r f'(x) + {h_r}^2 \frac{f''(x)}{2} + {h_r}^3 \frac{f'''(\xi_1)}{6}, \xi_1 \in (x, x+h_r) \\ + f(x-h_l) = f(x) - h_l f'(x) + {h_l}^2 \frac{f''(x)}{2} - {h_l}^3 \frac{f'''(\xi_2)}{6}, \xi_2 \in (x, x-h_l) \\ + \end{aligned} + + Using the fact that :math:`f \in C^3` and solving the linear system, we derive: + + .. math:: + f'(x) \approx \frac{ {h_l}^2 f(x+h_r) - {h_r}^2 f(x-h_l) + + ({h_r}^2-{h_l}^2 ) f(x) }{ {h_r} {h_l}^2 + {h_r}^2 {h_l} } + + .. note:: + We estimate the gradient of functions in complex domain + :math:`g : \mathbb{C}^n \rightarrow \mathbb{C}` in the same way. + + The value of each partial derivative at the boundary points is computed differently. See edge_order below. + + Args: + input (``Tensor``): the tensor that represents the values of the function + + Keyword args: + spacing (``scalar``, ``list of scalar``, ``list of Tensor``, optional): :attr:`spacing` can be used to modify + how the :attr:`input` tensor's indices relate to sample coordinates. If :attr:`spacing` is a scalar then + the indices are multiplied by the scalar to produce the coordinates. For example, if :attr:`spacing=2` the + indices (1, 2, 3) become coordinates (2, 4, 6). If :attr:`spacing` is a list of scalars then the corresponding + indices are multiplied. For example, if :attr:`spacing=(2, -1, 3)` the indices (1, 2, 3) become coordinates (2, -2, 9). + Finally, if :attr:`spacing` is a list of one-dimensional tensors then each tensor specifies the coordinates for + the corresponding dimension. For example, if the indices are (1, 2, 3) and the tensors are (t0, t1, t2), then + the coordinates are (t0[1], t1[2], t2[3]) + + dim (``int``, ``list of int``, optional): the dimension or dimensions to approximate the gradient over. By default + the partial gradient in every dimension is computed. Note that when :attr:`dim` is specified the elements of + the :attr:`spacing` argument must correspond with the specified dims." + + edge_order (``int``, optional): 1 or 2, for `first-order + `_ or + `second-order `_ + estimation of the boundary ("edge") values, respectively. + + Examples:: + + >>> # Estimates the gradient of f(x)=x^2 at points [-2, -1, 2, 4] + >>> coordinates = (torch.tensor([-2., -1., 1., 4.]),) + >>> values = torch.tensor([4., 1., 1., 16.], ) + >>> torch.gradient(values, spacing = coordinates) + (tensor([-3., -2., 2., 5.]),) + + >>> # Estimates the gradient of the R^2 -> R function whose samples are + >>> # described by the tensor t. Implicit coordinates are [0, 1] for the outermost + >>> # dimension and [0, 1, 2, 3] for the innermost dimension, and function estimates + >>> # partial derivative for both dimensions. 
+ >>> t = torch.tensor([[1, 2, 4, 8], [10, 20, 40, 80]]) + >>> torch.gradient(t) + (tensor([[ 9., 18., 36., 72.], + [ 9., 18., 36., 72.]]), + tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]])) + + >>> # A scalar value for spacing modifies the relationship between tensor indices + >>> # and input coordinates by multiplying the indices to find the + >>> # coordinates. For example, below the indices of the innermost + >>> # 0, 1, 2, 3 translate to coordinates of [0, 2, 4, 6], and the indices of + >>> # the outermost dimension 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = 2.0) # dim = None (implicitly [0, 1]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.5000, 0.7500, 1.5000, 2.0000], + [ 5.0000, 7.5000, 15.0000, 20.0000]])) + >>> # doubling the spacing between samples halves the estimated partial gradients. + + >>> + >>> # Estimates only the partial derivative for dimension 1 + >>> torch.gradient(t, dim = 1) # spacing = None (implicitly 1.) + (tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]]),) + + >>> # When spacing is a list of scalars, the relationship between the tensor + >>> # indices and input coordinates changes based on dimension. + >>> # For example, below, the indices of the innermost dimension 0, 1, 2, 3 translate + >>> # to coordinates of [0, 3, 6, 9], and the indices of the outermost dimension + >>> # 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = [3., 2.]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + + >>> # The following example is a replication of the previous one with explicit + >>> # coordinates. + >>> coords = (torch.tensor([0, 2]), torch.tensor([0, 3, 6, 9])) + >>> torch.gradient(t, spacing = coords) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + """ + ... +@overload +def gradient(input: Tensor, *, spacing: Sequence[Union[Number, _complex]], dim: Optional[_int] = None, edge_order: _int = 1) -> Tuple[Tensor, ...]: + r""" + gradient(input, *, spacing=1, dim=None, edge_order=1) -> List of Tensors + + Estimates the gradient of a function :math:`g : \mathbb{R}^n \rightarrow \mathbb{R}` in + one or more dimensions using the `second-order accurate central differences method + `_ and + either first or second order estimates at the boundaries. + + The gradient of :math:`g` is estimated using samples. By default, when :attr:`spacing` is not + specified, the samples are entirely described by :attr:`input`, and the mapping of input coordinates + to an output is the same as the tensor's mapping of indices to values. For example, for a three-dimensional + :attr:`input` the function described is :math:`g : \mathbb{R}^3 \rightarrow \mathbb{R}`, and + :math:`g(1, 2, 3)\ == input[1, 2, 3]`. + + When :attr:`spacing` is specified, it modifies the relationship between :attr:`input` and input coordinates. + This is detailed in the "Keyword Arguments" section below. + + The gradient is estimated by estimating each partial derivative of :math:`g` independently. This estimation is + accurate if :math:`g` is in :math:`C^3` (it has at least 3 continuous derivatives), and the estimation can be + improved by providing closer samples. 
Mathematically, the value at each interior point of a partial derivative + is estimated using `Taylor's theorem with remainder `_. + Letting :math:`x` be an interior point with :math:`x-h_l` and :math:`x+h_r` be points neighboring + it to the left and right respectively, :math:`f(x+h_r)` and :math:`f(x-h_l)` can be estimated using: + + .. math:: + \begin{aligned} + f(x+h_r) = f(x) + h_r f'(x) + {h_r}^2 \frac{f''(x)}{2} + {h_r}^3 \frac{f'''(\xi_1)}{6}, \xi_1 \in (x, x+h_r) \\ + f(x-h_l) = f(x) - h_l f'(x) + {h_l}^2 \frac{f''(x)}{2} - {h_l}^3 \frac{f'''(\xi_2)}{6}, \xi_2 \in (x, x-h_l) \\ + \end{aligned} + + Using the fact that :math:`f \in C^3` and solving the linear system, we derive: + + .. math:: + f'(x) \approx \frac{ {h_l}^2 f(x+h_r) - {h_r}^2 f(x-h_l) + + ({h_r}^2-{h_l}^2 ) f(x) }{ {h_r} {h_l}^2 + {h_r}^2 {h_l} } + + .. note:: + We estimate the gradient of functions in complex domain + :math:`g : \mathbb{C}^n \rightarrow \mathbb{C}` in the same way. + + The value of each partial derivative at the boundary points is computed differently. See edge_order below. + + Args: + input (``Tensor``): the tensor that represents the values of the function + + Keyword args: + spacing (``scalar``, ``list of scalar``, ``list of Tensor``, optional): :attr:`spacing` can be used to modify + how the :attr:`input` tensor's indices relate to sample coordinates. If :attr:`spacing` is a scalar then + the indices are multiplied by the scalar to produce the coordinates. For example, if :attr:`spacing=2` the + indices (1, 2, 3) become coordinates (2, 4, 6). If :attr:`spacing` is a list of scalars then the corresponding + indices are multiplied. For example, if :attr:`spacing=(2, -1, 3)` the indices (1, 2, 3) become coordinates (2, -2, 9). + Finally, if :attr:`spacing` is a list of one-dimensional tensors then each tensor specifies the coordinates for + the corresponding dimension. For example, if the indices are (1, 2, 3) and the tensors are (t0, t1, t2), then + the coordinates are (t0[1], t1[2], t2[3]) + + dim (``int``, ``list of int``, optional): the dimension or dimensions to approximate the gradient over. By default + the partial gradient in every dimension is computed. Note that when :attr:`dim` is specified the elements of + the :attr:`spacing` argument must correspond with the specified dims." + + edge_order (``int``, optional): 1 or 2, for `first-order + `_ or + `second-order `_ + estimation of the boundary ("edge") values, respectively. + + Examples:: + + >>> # Estimates the gradient of f(x)=x^2 at points [-2, -1, 2, 4] + >>> coordinates = (torch.tensor([-2., -1., 1., 4.]),) + >>> values = torch.tensor([4., 1., 1., 16.], ) + >>> torch.gradient(values, spacing = coordinates) + (tensor([-3., -2., 2., 5.]),) + + >>> # Estimates the gradient of the R^2 -> R function whose samples are + >>> # described by the tensor t. Implicit coordinates are [0, 1] for the outermost + >>> # dimension and [0, 1, 2, 3] for the innermost dimension, and function estimates + >>> # partial derivative for both dimensions. + >>> t = torch.tensor([[1, 2, 4, 8], [10, 20, 40, 80]]) + >>> torch.gradient(t) + (tensor([[ 9., 18., 36., 72.], + [ 9., 18., 36., 72.]]), + tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]])) + + >>> # A scalar value for spacing modifies the relationship between tensor indices + >>> # and input coordinates by multiplying the indices to find the + >>> # coordinates. 
For example, below the indices of the innermost + >>> # 0, 1, 2, 3 translate to coordinates of [0, 2, 4, 6], and the indices of + >>> # the outermost dimension 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = 2.0) # dim = None (implicitly [0, 1]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.5000, 0.7500, 1.5000, 2.0000], + [ 5.0000, 7.5000, 15.0000, 20.0000]])) + >>> # doubling the spacing between samples halves the estimated partial gradients. + + >>> + >>> # Estimates only the partial derivative for dimension 1 + >>> torch.gradient(t, dim = 1) # spacing = None (implicitly 1.) + (tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]]),) + + >>> # When spacing is a list of scalars, the relationship between the tensor + >>> # indices and input coordinates changes based on dimension. + >>> # For example, below, the indices of the innermost dimension 0, 1, 2, 3 translate + >>> # to coordinates of [0, 3, 6, 9], and the indices of the outermost dimension + >>> # 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = [3., 2.]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + + >>> # The following example is a replication of the previous one with explicit + >>> # coordinates. + >>> coords = (torch.tensor([0, 2]), torch.tensor([0, 3, 6, 9])) + >>> torch.gradient(t, spacing = coords) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + """ + ... +@overload +def gradient(input: Tensor, *, spacing: Sequence[Union[Number, _complex]], dim: _size, edge_order: _int = 1) -> Tuple[Tensor, ...]: + r""" + gradient(input, *, spacing=1, dim=None, edge_order=1) -> List of Tensors + + Estimates the gradient of a function :math:`g : \mathbb{R}^n \rightarrow \mathbb{R}` in + one or more dimensions using the `second-order accurate central differences method + `_ and + either first or second order estimates at the boundaries. + + The gradient of :math:`g` is estimated using samples. By default, when :attr:`spacing` is not + specified, the samples are entirely described by :attr:`input`, and the mapping of input coordinates + to an output is the same as the tensor's mapping of indices to values. For example, for a three-dimensional + :attr:`input` the function described is :math:`g : \mathbb{R}^3 \rightarrow \mathbb{R}`, and + :math:`g(1, 2, 3)\ == input[1, 2, 3]`. + + When :attr:`spacing` is specified, it modifies the relationship between :attr:`input` and input coordinates. + This is detailed in the "Keyword Arguments" section below. + + The gradient is estimated by estimating each partial derivative of :math:`g` independently. This estimation is + accurate if :math:`g` is in :math:`C^3` (it has at least 3 continuous derivatives), and the estimation can be + improved by providing closer samples. Mathematically, the value at each interior point of a partial derivative + is estimated using `Taylor's theorem with remainder `_. + Letting :math:`x` be an interior point with :math:`x-h_l` and :math:`x+h_r` be points neighboring + it to the left and right respectively, :math:`f(x+h_r)` and :math:`f(x-h_l)` can be estimated using: + + .. 
math:: + \begin{aligned} + f(x+h_r) = f(x) + h_r f'(x) + {h_r}^2 \frac{f''(x)}{2} + {h_r}^3 \frac{f'''(\xi_1)}{6}, \xi_1 \in (x, x+h_r) \\ + f(x-h_l) = f(x) - h_l f'(x) + {h_l}^2 \frac{f''(x)}{2} - {h_l}^3 \frac{f'''(\xi_2)}{6}, \xi_2 \in (x, x-h_l) \\ + \end{aligned} + + Using the fact that :math:`f \in C^3` and solving the linear system, we derive: + + .. math:: + f'(x) \approx \frac{ {h_l}^2 f(x+h_r) - {h_r}^2 f(x-h_l) + + ({h_r}^2-{h_l}^2 ) f(x) }{ {h_r} {h_l}^2 + {h_r}^2 {h_l} } + + .. note:: + We estimate the gradient of functions in complex domain + :math:`g : \mathbb{C}^n \rightarrow \mathbb{C}` in the same way. + + The value of each partial derivative at the boundary points is computed differently. See edge_order below. + + Args: + input (``Tensor``): the tensor that represents the values of the function + + Keyword args: + spacing (``scalar``, ``list of scalar``, ``list of Tensor``, optional): :attr:`spacing` can be used to modify + how the :attr:`input` tensor's indices relate to sample coordinates. If :attr:`spacing` is a scalar then + the indices are multiplied by the scalar to produce the coordinates. For example, if :attr:`spacing=2` the + indices (1, 2, 3) become coordinates (2, 4, 6). If :attr:`spacing` is a list of scalars then the corresponding + indices are multiplied. For example, if :attr:`spacing=(2, -1, 3)` the indices (1, 2, 3) become coordinates (2, -2, 9). + Finally, if :attr:`spacing` is a list of one-dimensional tensors then each tensor specifies the coordinates for + the corresponding dimension. For example, if the indices are (1, 2, 3) and the tensors are (t0, t1, t2), then + the coordinates are (t0[1], t1[2], t2[3]) + + dim (``int``, ``list of int``, optional): the dimension or dimensions to approximate the gradient over. By default + the partial gradient in every dimension is computed. Note that when :attr:`dim` is specified the elements of + the :attr:`spacing` argument must correspond with the specified dims." + + edge_order (``int``, optional): 1 or 2, for `first-order + `_ or + `second-order `_ + estimation of the boundary ("edge") values, respectively. + + Examples:: + + >>> # Estimates the gradient of f(x)=x^2 at points [-2, -1, 2, 4] + >>> coordinates = (torch.tensor([-2., -1., 1., 4.]),) + >>> values = torch.tensor([4., 1., 1., 16.], ) + >>> torch.gradient(values, spacing = coordinates) + (tensor([-3., -2., 2., 5.]),) + + >>> # Estimates the gradient of the R^2 -> R function whose samples are + >>> # described by the tensor t. Implicit coordinates are [0, 1] for the outermost + >>> # dimension and [0, 1, 2, 3] for the innermost dimension, and function estimates + >>> # partial derivative for both dimensions. + >>> t = torch.tensor([[1, 2, 4, 8], [10, 20, 40, 80]]) + >>> torch.gradient(t) + (tensor([[ 9., 18., 36., 72.], + [ 9., 18., 36., 72.]]), + tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]])) + + >>> # A scalar value for spacing modifies the relationship between tensor indices + >>> # and input coordinates by multiplying the indices to find the + >>> # coordinates. For example, below the indices of the innermost + >>> # 0, 1, 2, 3 translate to coordinates of [0, 2, 4, 6], and the indices of + >>> # the outermost dimension 0, 1 translate to coordinates of [0, 2]. 
+ >>> torch.gradient(t, spacing = 2.0) # dim = None (implicitly [0, 1]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.5000, 0.7500, 1.5000, 2.0000], + [ 5.0000, 7.5000, 15.0000, 20.0000]])) + >>> # doubling the spacing between samples halves the estimated partial gradients. + + >>> + >>> # Estimates only the partial derivative for dimension 1 + >>> torch.gradient(t, dim = 1) # spacing = None (implicitly 1.) + (tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]]),) + + >>> # When spacing is a list of scalars, the relationship between the tensor + >>> # indices and input coordinates changes based on dimension. + >>> # For example, below, the indices of the innermost dimension 0, 1, 2, 3 translate + >>> # to coordinates of [0, 3, 6, 9], and the indices of the outermost dimension + >>> # 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = [3., 2.]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + + >>> # The following example is a replication of the previous one with explicit + >>> # coordinates. + >>> coords = (torch.tensor([0, 2]), torch.tensor([0, 3, 6, 9])) + >>> torch.gradient(t, spacing = coords) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + """ + ... +@overload +def gradient(input: Tensor, *, spacing: Union[Tuple[Tensor, ...], List[Tensor]], dim: Optional[_int] = None, edge_order: _int = 1) -> Tuple[Tensor, ...]: + r""" + gradient(input, *, spacing=1, dim=None, edge_order=1) -> List of Tensors + + Estimates the gradient of a function :math:`g : \mathbb{R}^n \rightarrow \mathbb{R}` in + one or more dimensions using the `second-order accurate central differences method + `_ and + either first or second order estimates at the boundaries. + + The gradient of :math:`g` is estimated using samples. By default, when :attr:`spacing` is not + specified, the samples are entirely described by :attr:`input`, and the mapping of input coordinates + to an output is the same as the tensor's mapping of indices to values. For example, for a three-dimensional + :attr:`input` the function described is :math:`g : \mathbb{R}^3 \rightarrow \mathbb{R}`, and + :math:`g(1, 2, 3)\ == input[1, 2, 3]`. + + When :attr:`spacing` is specified, it modifies the relationship between :attr:`input` and input coordinates. + This is detailed in the "Keyword Arguments" section below. + + The gradient is estimated by estimating each partial derivative of :math:`g` independently. This estimation is + accurate if :math:`g` is in :math:`C^3` (it has at least 3 continuous derivatives), and the estimation can be + improved by providing closer samples. Mathematically, the value at each interior point of a partial derivative + is estimated using `Taylor's theorem with remainder `_. + Letting :math:`x` be an interior point with :math:`x-h_l` and :math:`x+h_r` be points neighboring + it to the left and right respectively, :math:`f(x+h_r)` and :math:`f(x-h_l)` can be estimated using: + + .. 
math:: + \begin{aligned} + f(x+h_r) = f(x) + h_r f'(x) + {h_r}^2 \frac{f''(x)}{2} + {h_r}^3 \frac{f'''(\xi_1)}{6}, \xi_1 \in (x, x+h_r) \\ + f(x-h_l) = f(x) - h_l f'(x) + {h_l}^2 \frac{f''(x)}{2} - {h_l}^3 \frac{f'''(\xi_2)}{6}, \xi_2 \in (x, x-h_l) \\ + \end{aligned} + + Using the fact that :math:`f \in C^3` and solving the linear system, we derive: + + .. math:: + f'(x) \approx \frac{ {h_l}^2 f(x+h_r) - {h_r}^2 f(x-h_l) + + ({h_r}^2-{h_l}^2 ) f(x) }{ {h_r} {h_l}^2 + {h_r}^2 {h_l} } + + .. note:: + We estimate the gradient of functions in complex domain + :math:`g : \mathbb{C}^n \rightarrow \mathbb{C}` in the same way. + + The value of each partial derivative at the boundary points is computed differently. See edge_order below. + + Args: + input (``Tensor``): the tensor that represents the values of the function + + Keyword args: + spacing (``scalar``, ``list of scalar``, ``list of Tensor``, optional): :attr:`spacing` can be used to modify + how the :attr:`input` tensor's indices relate to sample coordinates. If :attr:`spacing` is a scalar then + the indices are multiplied by the scalar to produce the coordinates. For example, if :attr:`spacing=2` the + indices (1, 2, 3) become coordinates (2, 4, 6). If :attr:`spacing` is a list of scalars then the corresponding + indices are multiplied. For example, if :attr:`spacing=(2, -1, 3)` the indices (1, 2, 3) become coordinates (2, -2, 9). + Finally, if :attr:`spacing` is a list of one-dimensional tensors then each tensor specifies the coordinates for + the corresponding dimension. For example, if the indices are (1, 2, 3) and the tensors are (t0, t1, t2), then + the coordinates are (t0[1], t1[2], t2[3]) + + dim (``int``, ``list of int``, optional): the dimension or dimensions to approximate the gradient over. By default + the partial gradient in every dimension is computed. Note that when :attr:`dim` is specified the elements of + the :attr:`spacing` argument must correspond with the specified dims." + + edge_order (``int``, optional): 1 or 2, for `first-order + `_ or + `second-order `_ + estimation of the boundary ("edge") values, respectively. + + Examples:: + + >>> # Estimates the gradient of f(x)=x^2 at points [-2, -1, 2, 4] + >>> coordinates = (torch.tensor([-2., -1., 1., 4.]),) + >>> values = torch.tensor([4., 1., 1., 16.], ) + >>> torch.gradient(values, spacing = coordinates) + (tensor([-3., -2., 2., 5.]),) + + >>> # Estimates the gradient of the R^2 -> R function whose samples are + >>> # described by the tensor t. Implicit coordinates are [0, 1] for the outermost + >>> # dimension and [0, 1, 2, 3] for the innermost dimension, and function estimates + >>> # partial derivative for both dimensions. + >>> t = torch.tensor([[1, 2, 4, 8], [10, 20, 40, 80]]) + >>> torch.gradient(t) + (tensor([[ 9., 18., 36., 72.], + [ 9., 18., 36., 72.]]), + tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]])) + + >>> # A scalar value for spacing modifies the relationship between tensor indices + >>> # and input coordinates by multiplying the indices to find the + >>> # coordinates. For example, below the indices of the innermost + >>> # 0, 1, 2, 3 translate to coordinates of [0, 2, 4, 6], and the indices of + >>> # the outermost dimension 0, 1 translate to coordinates of [0, 2]. 
+ >>> torch.gradient(t, spacing = 2.0) # dim = None (implicitly [0, 1]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.5000, 0.7500, 1.5000, 2.0000], + [ 5.0000, 7.5000, 15.0000, 20.0000]])) + >>> # doubling the spacing between samples halves the estimated partial gradients. + + >>> + >>> # Estimates only the partial derivative for dimension 1 + >>> torch.gradient(t, dim = 1) # spacing = None (implicitly 1.) + (tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]]),) + + >>> # When spacing is a list of scalars, the relationship between the tensor + >>> # indices and input coordinates changes based on dimension. + >>> # For example, below, the indices of the innermost dimension 0, 1, 2, 3 translate + >>> # to coordinates of [0, 3, 6, 9], and the indices of the outermost dimension + >>> # 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = [3., 2.]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + + >>> # The following example is a replication of the previous one with explicit + >>> # coordinates. + >>> coords = (torch.tensor([0, 2]), torch.tensor([0, 3, 6, 9])) + >>> torch.gradient(t, spacing = coords) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + """ + ... +@overload +def gradient(input: Tensor, *, spacing: Union[Number, _complex], dim: _size, edge_order: _int = 1) -> Tuple[Tensor, ...]: + r""" + gradient(input, *, spacing=1, dim=None, edge_order=1) -> List of Tensors + + Estimates the gradient of a function :math:`g : \mathbb{R}^n \rightarrow \mathbb{R}` in + one or more dimensions using the `second-order accurate central differences method + `_ and + either first or second order estimates at the boundaries. + + The gradient of :math:`g` is estimated using samples. By default, when :attr:`spacing` is not + specified, the samples are entirely described by :attr:`input`, and the mapping of input coordinates + to an output is the same as the tensor's mapping of indices to values. For example, for a three-dimensional + :attr:`input` the function described is :math:`g : \mathbb{R}^3 \rightarrow \mathbb{R}`, and + :math:`g(1, 2, 3)\ == input[1, 2, 3]`. + + When :attr:`spacing` is specified, it modifies the relationship between :attr:`input` and input coordinates. + This is detailed in the "Keyword Arguments" section below. + + The gradient is estimated by estimating each partial derivative of :math:`g` independently. This estimation is + accurate if :math:`g` is in :math:`C^3` (it has at least 3 continuous derivatives), and the estimation can be + improved by providing closer samples. Mathematically, the value at each interior point of a partial derivative + is estimated using `Taylor's theorem with remainder `_. + Letting :math:`x` be an interior point with :math:`x-h_l` and :math:`x+h_r` be points neighboring + it to the left and right respectively, :math:`f(x+h_r)` and :math:`f(x-h_l)` can be estimated using: + + .. 
math:: + \begin{aligned} + f(x+h_r) = f(x) + h_r f'(x) + {h_r}^2 \frac{f''(x)}{2} + {h_r}^3 \frac{f'''(\xi_1)}{6}, \xi_1 \in (x, x+h_r) \\ + f(x-h_l) = f(x) - h_l f'(x) + {h_l}^2 \frac{f''(x)}{2} - {h_l}^3 \frac{f'''(\xi_2)}{6}, \xi_2 \in (x, x-h_l) \\ + \end{aligned} + + Using the fact that :math:`f \in C^3` and solving the linear system, we derive: + + .. math:: + f'(x) \approx \frac{ {h_l}^2 f(x+h_r) - {h_r}^2 f(x-h_l) + + ({h_r}^2-{h_l}^2 ) f(x) }{ {h_r} {h_l}^2 + {h_r}^2 {h_l} } + + .. note:: + We estimate the gradient of functions in complex domain + :math:`g : \mathbb{C}^n \rightarrow \mathbb{C}` in the same way. + + The value of each partial derivative at the boundary points is computed differently. See edge_order below. + + Args: + input (``Tensor``): the tensor that represents the values of the function + + Keyword args: + spacing (``scalar``, ``list of scalar``, ``list of Tensor``, optional): :attr:`spacing` can be used to modify + how the :attr:`input` tensor's indices relate to sample coordinates. If :attr:`spacing` is a scalar then + the indices are multiplied by the scalar to produce the coordinates. For example, if :attr:`spacing=2` the + indices (1, 2, 3) become coordinates (2, 4, 6). If :attr:`spacing` is a list of scalars then the corresponding + indices are multiplied. For example, if :attr:`spacing=(2, -1, 3)` the indices (1, 2, 3) become coordinates (2, -2, 9). + Finally, if :attr:`spacing` is a list of one-dimensional tensors then each tensor specifies the coordinates for + the corresponding dimension. For example, if the indices are (1, 2, 3) and the tensors are (t0, t1, t2), then + the coordinates are (t0[1], t1[2], t2[3]) + + dim (``int``, ``list of int``, optional): the dimension or dimensions to approximate the gradient over. By default + the partial gradient in every dimension is computed. Note that when :attr:`dim` is specified the elements of + the :attr:`spacing` argument must correspond with the specified dims." + + edge_order (``int``, optional): 1 or 2, for `first-order + `_ or + `second-order `_ + estimation of the boundary ("edge") values, respectively. + + Examples:: + + >>> # Estimates the gradient of f(x)=x^2 at points [-2, -1, 2, 4] + >>> coordinates = (torch.tensor([-2., -1., 1., 4.]),) + >>> values = torch.tensor([4., 1., 1., 16.], ) + >>> torch.gradient(values, spacing = coordinates) + (tensor([-3., -2., 2., 5.]),) + + >>> # Estimates the gradient of the R^2 -> R function whose samples are + >>> # described by the tensor t. Implicit coordinates are [0, 1] for the outermost + >>> # dimension and [0, 1, 2, 3] for the innermost dimension, and function estimates + >>> # partial derivative for both dimensions. + >>> t = torch.tensor([[1, 2, 4, 8], [10, 20, 40, 80]]) + >>> torch.gradient(t) + (tensor([[ 9., 18., 36., 72.], + [ 9., 18., 36., 72.]]), + tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]])) + + >>> # A scalar value for spacing modifies the relationship between tensor indices + >>> # and input coordinates by multiplying the indices to find the + >>> # coordinates. For example, below the indices of the innermost + >>> # 0, 1, 2, 3 translate to coordinates of [0, 2, 4, 6], and the indices of + >>> # the outermost dimension 0, 1 translate to coordinates of [0, 2]. 
+ >>> torch.gradient(t, spacing = 2.0) # dim = None (implicitly [0, 1]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.5000, 0.7500, 1.5000, 2.0000], + [ 5.0000, 7.5000, 15.0000, 20.0000]])) + >>> # doubling the spacing between samples halves the estimated partial gradients. + + >>> + >>> # Estimates only the partial derivative for dimension 1 + >>> torch.gradient(t, dim = 1) # spacing = None (implicitly 1.) + (tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]]),) + + >>> # When spacing is a list of scalars, the relationship between the tensor + >>> # indices and input coordinates changes based on dimension. + >>> # For example, below, the indices of the innermost dimension 0, 1, 2, 3 translate + >>> # to coordinates of [0, 3, 6, 9], and the indices of the outermost dimension + >>> # 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = [3., 2.]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + + >>> # The following example is a replication of the previous one with explicit + >>> # coordinates. + >>> coords = (torch.tensor([0, 2]), torch.tensor([0, 3, 6, 9])) + >>> torch.gradient(t, spacing = coords) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + """ + ... +@overload +def gradient(input: Tensor, *, spacing: Union[Tuple[Tensor, ...], List[Tensor]], dim: _size, edge_order: _int = 1) -> Tuple[Tensor, ...]: + r""" + gradient(input, *, spacing=1, dim=None, edge_order=1) -> List of Tensors + + Estimates the gradient of a function :math:`g : \mathbb{R}^n \rightarrow \mathbb{R}` in + one or more dimensions using the `second-order accurate central differences method + `_ and + either first or second order estimates at the boundaries. + + The gradient of :math:`g` is estimated using samples. By default, when :attr:`spacing` is not + specified, the samples are entirely described by :attr:`input`, and the mapping of input coordinates + to an output is the same as the tensor's mapping of indices to values. For example, for a three-dimensional + :attr:`input` the function described is :math:`g : \mathbb{R}^3 \rightarrow \mathbb{R}`, and + :math:`g(1, 2, 3)\ == input[1, 2, 3]`. + + When :attr:`spacing` is specified, it modifies the relationship between :attr:`input` and input coordinates. + This is detailed in the "Keyword Arguments" section below. + + The gradient is estimated by estimating each partial derivative of :math:`g` independently. This estimation is + accurate if :math:`g` is in :math:`C^3` (it has at least 3 continuous derivatives), and the estimation can be + improved by providing closer samples. Mathematically, the value at each interior point of a partial derivative + is estimated using `Taylor's theorem with remainder `_. + Letting :math:`x` be an interior point with :math:`x-h_l` and :math:`x+h_r` be points neighboring + it to the left and right respectively, :math:`f(x+h_r)` and :math:`f(x-h_l)` can be estimated using: + + .. 
math:: + \begin{aligned} + f(x+h_r) = f(x) + h_r f'(x) + {h_r}^2 \frac{f''(x)}{2} + {h_r}^3 \frac{f'''(\xi_1)}{6}, \xi_1 \in (x, x+h_r) \\ + f(x-h_l) = f(x) - h_l f'(x) + {h_l}^2 \frac{f''(x)}{2} - {h_l}^3 \frac{f'''(\xi_2)}{6}, \xi_2 \in (x, x-h_l) \\ + \end{aligned} + + Using the fact that :math:`f \in C^3` and solving the linear system, we derive: + + .. math:: + f'(x) \approx \frac{ {h_l}^2 f(x+h_r) - {h_r}^2 f(x-h_l) + + ({h_r}^2-{h_l}^2 ) f(x) }{ {h_r} {h_l}^2 + {h_r}^2 {h_l} } + + .. note:: + We estimate the gradient of functions in complex domain + :math:`g : \mathbb{C}^n \rightarrow \mathbb{C}` in the same way. + + The value of each partial derivative at the boundary points is computed differently. See edge_order below. + + Args: + input (``Tensor``): the tensor that represents the values of the function + + Keyword args: + spacing (``scalar``, ``list of scalar``, ``list of Tensor``, optional): :attr:`spacing` can be used to modify + how the :attr:`input` tensor's indices relate to sample coordinates. If :attr:`spacing` is a scalar then + the indices are multiplied by the scalar to produce the coordinates. For example, if :attr:`spacing=2` the + indices (1, 2, 3) become coordinates (2, 4, 6). If :attr:`spacing` is a list of scalars then the corresponding + indices are multiplied. For example, if :attr:`spacing=(2, -1, 3)` the indices (1, 2, 3) become coordinates (2, -2, 9). + Finally, if :attr:`spacing` is a list of one-dimensional tensors then each tensor specifies the coordinates for + the corresponding dimension. For example, if the indices are (1, 2, 3) and the tensors are (t0, t1, t2), then + the coordinates are (t0[1], t1[2], t2[3]) + + dim (``int``, ``list of int``, optional): the dimension or dimensions to approximate the gradient over. By default + the partial gradient in every dimension is computed. Note that when :attr:`dim` is specified the elements of + the :attr:`spacing` argument must correspond with the specified dims." + + edge_order (``int``, optional): 1 or 2, for `first-order + `_ or + `second-order `_ + estimation of the boundary ("edge") values, respectively. + + Examples:: + + >>> # Estimates the gradient of f(x)=x^2 at points [-2, -1, 2, 4] + >>> coordinates = (torch.tensor([-2., -1., 1., 4.]),) + >>> values = torch.tensor([4., 1., 1., 16.], ) + >>> torch.gradient(values, spacing = coordinates) + (tensor([-3., -2., 2., 5.]),) + + >>> # Estimates the gradient of the R^2 -> R function whose samples are + >>> # described by the tensor t. Implicit coordinates are [0, 1] for the outermost + >>> # dimension and [0, 1, 2, 3] for the innermost dimension, and function estimates + >>> # partial derivative for both dimensions. + >>> t = torch.tensor([[1, 2, 4, 8], [10, 20, 40, 80]]) + >>> torch.gradient(t) + (tensor([[ 9., 18., 36., 72.], + [ 9., 18., 36., 72.]]), + tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]])) + + >>> # A scalar value for spacing modifies the relationship between tensor indices + >>> # and input coordinates by multiplying the indices to find the + >>> # coordinates. For example, below the indices of the innermost + >>> # 0, 1, 2, 3 translate to coordinates of [0, 2, 4, 6], and the indices of + >>> # the outermost dimension 0, 1 translate to coordinates of [0, 2]. 
+ >>> torch.gradient(t, spacing = 2.0) # dim = None (implicitly [0, 1]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.5000, 0.7500, 1.5000, 2.0000], + [ 5.0000, 7.5000, 15.0000, 20.0000]])) + >>> # doubling the spacing between samples halves the estimated partial gradients. + + >>> + >>> # Estimates only the partial derivative for dimension 1 + >>> torch.gradient(t, dim = 1) # spacing = None (implicitly 1.) + (tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]]),) + + >>> # When spacing is a list of scalars, the relationship between the tensor + >>> # indices and input coordinates changes based on dimension. + >>> # For example, below, the indices of the innermost dimension 0, 1, 2, 3 translate + >>> # to coordinates of [0, 3, 6, 9], and the indices of the outermost dimension + >>> # 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = [3., 2.]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + + >>> # The following example is a replication of the previous one with explicit + >>> # coordinates. + >>> coords = (torch.tensor([0, 2]), torch.tensor([0, 3, 6, 9])) + >>> torch.gradient(t, spacing = coords) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + """ + ... +@overload +def gradient(input: Tensor, *, dim: _size, edge_order: _int = 1) -> Tuple[Tensor, ...]: + r""" + gradient(input, *, spacing=1, dim=None, edge_order=1) -> List of Tensors + + Estimates the gradient of a function :math:`g : \mathbb{R}^n \rightarrow \mathbb{R}` in + one or more dimensions using the `second-order accurate central differences method + `_ and + either first or second order estimates at the boundaries. + + The gradient of :math:`g` is estimated using samples. By default, when :attr:`spacing` is not + specified, the samples are entirely described by :attr:`input`, and the mapping of input coordinates + to an output is the same as the tensor's mapping of indices to values. For example, for a three-dimensional + :attr:`input` the function described is :math:`g : \mathbb{R}^3 \rightarrow \mathbb{R}`, and + :math:`g(1, 2, 3)\ == input[1, 2, 3]`. + + When :attr:`spacing` is specified, it modifies the relationship between :attr:`input` and input coordinates. + This is detailed in the "Keyword Arguments" section below. + + The gradient is estimated by estimating each partial derivative of :math:`g` independently. This estimation is + accurate if :math:`g` is in :math:`C^3` (it has at least 3 continuous derivatives), and the estimation can be + improved by providing closer samples. Mathematically, the value at each interior point of a partial derivative + is estimated using `Taylor's theorem with remainder `_. + Letting :math:`x` be an interior point with :math:`x-h_l` and :math:`x+h_r` be points neighboring + it to the left and right respectively, :math:`f(x+h_r)` and :math:`f(x-h_l)` can be estimated using: + + .. 
math:: + \begin{aligned} + f(x+h_r) = f(x) + h_r f'(x) + {h_r}^2 \frac{f''(x)}{2} + {h_r}^3 \frac{f'''(\xi_1)}{6}, \xi_1 \in (x, x+h_r) \\ + f(x-h_l) = f(x) - h_l f'(x) + {h_l}^2 \frac{f''(x)}{2} - {h_l}^3 \frac{f'''(\xi_2)}{6}, \xi_2 \in (x, x-h_l) \\ + \end{aligned} + + Using the fact that :math:`f \in C^3` and solving the linear system, we derive: + + .. math:: + f'(x) \approx \frac{ {h_l}^2 f(x+h_r) - {h_r}^2 f(x-h_l) + + ({h_r}^2-{h_l}^2 ) f(x) }{ {h_r} {h_l}^2 + {h_r}^2 {h_l} } + + .. note:: + We estimate the gradient of functions in complex domain + :math:`g : \mathbb{C}^n \rightarrow \mathbb{C}` in the same way. + + The value of each partial derivative at the boundary points is computed differently. See edge_order below. + + Args: + input (``Tensor``): the tensor that represents the values of the function + + Keyword args: + spacing (``scalar``, ``list of scalar``, ``list of Tensor``, optional): :attr:`spacing` can be used to modify + how the :attr:`input` tensor's indices relate to sample coordinates. If :attr:`spacing` is a scalar then + the indices are multiplied by the scalar to produce the coordinates. For example, if :attr:`spacing=2` the + indices (1, 2, 3) become coordinates (2, 4, 6). If :attr:`spacing` is a list of scalars then the corresponding + indices are multiplied. For example, if :attr:`spacing=(2, -1, 3)` the indices (1, 2, 3) become coordinates (2, -2, 9). + Finally, if :attr:`spacing` is a list of one-dimensional tensors then each tensor specifies the coordinates for + the corresponding dimension. For example, if the indices are (1, 2, 3) and the tensors are (t0, t1, t2), then + the coordinates are (t0[1], t1[2], t2[3]) + + dim (``int``, ``list of int``, optional): the dimension or dimensions to approximate the gradient over. By default + the partial gradient in every dimension is computed. Note that when :attr:`dim` is specified the elements of + the :attr:`spacing` argument must correspond with the specified dims." + + edge_order (``int``, optional): 1 or 2, for `first-order + `_ or + `second-order `_ + estimation of the boundary ("edge") values, respectively. + + Examples:: + + >>> # Estimates the gradient of f(x)=x^2 at points [-2, -1, 2, 4] + >>> coordinates = (torch.tensor([-2., -1., 1., 4.]),) + >>> values = torch.tensor([4., 1., 1., 16.], ) + >>> torch.gradient(values, spacing = coordinates) + (tensor([-3., -2., 2., 5.]),) + + >>> # Estimates the gradient of the R^2 -> R function whose samples are + >>> # described by the tensor t. Implicit coordinates are [0, 1] for the outermost + >>> # dimension and [0, 1, 2, 3] for the innermost dimension, and function estimates + >>> # partial derivative for both dimensions. + >>> t = torch.tensor([[1, 2, 4, 8], [10, 20, 40, 80]]) + >>> torch.gradient(t) + (tensor([[ 9., 18., 36., 72.], + [ 9., 18., 36., 72.]]), + tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]])) + + >>> # A scalar value for spacing modifies the relationship between tensor indices + >>> # and input coordinates by multiplying the indices to find the + >>> # coordinates. For example, below the indices of the innermost + >>> # 0, 1, 2, 3 translate to coordinates of [0, 2, 4, 6], and the indices of + >>> # the outermost dimension 0, 1 translate to coordinates of [0, 2]. 
+ >>> torch.gradient(t, spacing = 2.0) # dim = None (implicitly [0, 1]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.5000, 0.7500, 1.5000, 2.0000], + [ 5.0000, 7.5000, 15.0000, 20.0000]])) + >>> # doubling the spacing between samples halves the estimated partial gradients. + + >>> + >>> # Estimates only the partial derivative for dimension 1 + >>> torch.gradient(t, dim = 1) # spacing = None (implicitly 1.) + (tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]]),) + + >>> # When spacing is a list of scalars, the relationship between the tensor + >>> # indices and input coordinates changes based on dimension. + >>> # For example, below, the indices of the innermost dimension 0, 1, 2, 3 translate + >>> # to coordinates of [0, 3, 6, 9], and the indices of the outermost dimension + >>> # 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = [3., 2.]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + + >>> # The following example is a replication of the previous one with explicit + >>> # coordinates. + >>> coords = (torch.tensor([0, 2]), torch.tensor([0, 3, 6, 9])) + >>> torch.gradient(t, spacing = coords) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + """ + ... +@overload +def greater(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + greater(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.gt`. + """ + ... +@overload +def greater(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + greater(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.gt`. + """ + ... +@overload +def greater_equal(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + greater_equal(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.ge`. + """ + ... +@overload +def greater_equal(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + greater_equal(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.ge`. + """ + ... +def grid_sampler(input: Tensor, grid: Tensor, interpolation_mode: _int, padding_mode: _int, align_corners: _bool) -> Tensor: ... +def grid_sampler_2d(input: Tensor, grid: Tensor, interpolation_mode: _int, padding_mode: _int, align_corners: _bool) -> Tensor: ... +def grid_sampler_3d(input: Tensor, grid: Tensor, interpolation_mode: _int, padding_mode: _int, align_corners: _bool) -> Tensor: ... +def group_norm(input: Tensor, num_groups: _int, weight: Optional[Tensor] = None, bias: Optional[Tensor] = None, eps: _float = 1e-05, cudnn_enabled: _bool = True) -> Tensor: ... +@overload +def gru(data: Tensor, batch_sizes: Tensor, hx: Tensor, params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool) -> Tuple[Tensor, Tensor]: ... +@overload +def gru(input: Tensor, hx: Tensor, params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool, batch_first: _bool) -> Tuple[Tensor, Tensor]: ... 
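+# --- Illustrative sketch (editorial note, not part of the generated stub) ---
+# The gradient() docstrings above quote the non-uniform central-difference
+# estimate
+#     f'(x) ~= (h_l^2 f(x+h_r) - h_r^2 f(x-h_l) + (h_r^2 - h_l^2) f(x))
+#              / (h_r h_l^2 + h_r^2 h_l)
+# The snippet below re-derives one interior value returned by torch.gradient
+# for the docstring's own f(x) = x^2 samples; the variable names are
+# illustrative choices, not part of the API.
+import torch
+
+_coords = torch.tensor([-2.0, -1.0, 1.0, 4.0])      # unevenly spaced x values
+_values = _coords ** 2                               # f(x) = x^2, so f'(x) = 2x
+(_grad,) = torch.gradient(_values, spacing=(_coords,))
+
+_h_l = _coords[1] - _coords[0]                       # spacing to the left of x = -1
+_h_r = _coords[2] - _coords[1]                       # spacing to the right of x = -1
+_manual = (_h_l**2 * _values[2] - _h_r**2 * _values[0]
+           + (_h_r**2 - _h_l**2) * _values[1]) / (_h_r * _h_l**2 + _h_r**2 * _h_l)
+
+# _grad is tensor([-3., -2., 2., 5.]) as in the docstring example; _manual
+# evaluates to -2.0, matching both _grad[1] and the exact derivative f'(-1).
+# -----------------------------------------------------------------------------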
+def gru_cell(input: Tensor, hx: Tensor, w_ih: Tensor, w_hh: Tensor, b_ih: Optional[Tensor] = None, b_hh: Optional[Tensor] = None) -> Tensor: ... +@overload +def gt(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + gt(input, other, *, out=None) -> Tensor + + Computes :math:`\text{input} > \text{other}` element-wise. + + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is greater than :attr:`other` and False elsewhere + + Example:: + + >>> torch.gt(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[False, True], [False, False]]) + """ + ... +@overload +def gt(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + gt(input, other, *, out=None) -> Tensor + + Computes :math:`\text{input} > \text{other}` element-wise. + + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is greater than :attr:`other` and False elsewhere + + Example:: + + >>> torch.gt(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[False, True], [False, False]]) + """ + ... +@overload +def hamming_window(window_length: _int, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + hamming_window(window_length, periodic=True, alpha=0.54, beta=0.46, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Hamming window function. + + .. math:: + w[n] = \alpha - \beta\ \cos \left( \frac{2 \pi n}{N - 1} \right), + + where :math:`N` is the full window size. + + The input :attr:`window_length` is a positive integer controlling the + returned window size. :attr:`periodic` flag determines whether the returned + window trims off the last duplicate value from the symmetric window and is + ready to be used as a periodic window with functions like + :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in + above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have + ``torch.hamming_window(L, periodic=True)`` equal to + ``torch.hamming_window(L + 1, periodic=False)[:-1])``. + + .. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. + + .. note:: + This is a generalized version of :meth:`torch.hann_window`. + + Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. If False, return a symmetric window. + alpha (float, optional): The coefficient :math:`\alpha` in the equation above + beta (float, optional): The coefficient :math:`\beta` in the equation above + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). 
Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window. + """ + ... +@overload +def hamming_window(window_length: _int, periodic: _bool, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + hamming_window(window_length, periodic=True, alpha=0.54, beta=0.46, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Hamming window function. + + .. math:: + w[n] = \alpha - \beta\ \cos \left( \frac{2 \pi n}{N - 1} \right), + + where :math:`N` is the full window size. + + The input :attr:`window_length` is a positive integer controlling the + returned window size. :attr:`periodic` flag determines whether the returned + window trims off the last duplicate value from the symmetric window and is + ready to be used as a periodic window with functions like + :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in + above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have + ``torch.hamming_window(L, periodic=True)`` equal to + ``torch.hamming_window(L + 1, periodic=False)[:-1])``. + + .. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. + + .. note:: + This is a generalized version of :meth:`torch.hann_window`. + + Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. If False, return a symmetric window. + alpha (float, optional): The coefficient :math:`\alpha` in the equation above + beta (float, optional): The coefficient :math:`\beta` in the equation above + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window. + """ + ... 
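+# --- Illustrative sketch (editorial note, not part of the generated stub) ---
+# Quick numerical check of the identity quoted in the hamming_window
+# docstrings above:
+#     torch.hamming_window(L, periodic=True)
+#         == torch.hamming_window(L + 1, periodic=False)[:-1]
+# The window length below is an arbitrary illustrative choice.
+import torch
+
+_L = 8
+_periodic = torch.hamming_window(_L, periodic=True)
+_symmetric = torch.hamming_window(_L + 1, periodic=False)
+
+# The periodic window is the symmetric window with its duplicate last sample
+# dropped, so the two agree elementwise.
+assert torch.allclose(_periodic, _symmetric[:-1])
+# -----------------------------------------------------------------------------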
+@overload +def hamming_window(window_length: _int, periodic: _bool, alpha: _float, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + hamming_window(window_length, periodic=True, alpha=0.54, beta=0.46, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Hamming window function. + + .. math:: + w[n] = \alpha - \beta\ \cos \left( \frac{2 \pi n}{N - 1} \right), + + where :math:`N` is the full window size. + + The input :attr:`window_length` is a positive integer controlling the + returned window size. :attr:`periodic` flag determines whether the returned + window trims off the last duplicate value from the symmetric window and is + ready to be used as a periodic window with functions like + :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in + above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have + ``torch.hamming_window(L, periodic=True)`` equal to + ``torch.hamming_window(L + 1, periodic=False)[:-1])``. + + .. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. + + .. note:: + This is a generalized version of :meth:`torch.hann_window`. + + Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. If False, return a symmetric window. + alpha (float, optional): The coefficient :math:`\alpha` in the equation above + beta (float, optional): The coefficient :math:`\beta` in the equation above + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window. + """ + ... +@overload +def hamming_window(window_length: _int, periodic: _bool, alpha: _float, beta: _float, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + hamming_window(window_length, periodic=True, alpha=0.54, beta=0.46, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Hamming window function. + + .. math:: + w[n] = \alpha - \beta\ \cos \left( \frac{2 \pi n}{N - 1} \right), + + where :math:`N` is the full window size. + + The input :attr:`window_length` is a positive integer controlling the + returned window size. :attr:`periodic` flag determines whether the returned + window trims off the last duplicate value from the symmetric window and is + ready to be used as a periodic window with functions like + :meth:`torch.stft`. 
Therefore, if :attr:`periodic` is true, the :math:`N` in + above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have + ``torch.hamming_window(L, periodic=True)`` equal to + ``torch.hamming_window(L + 1, periodic=False)[:-1])``. + + .. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. + + .. note:: + This is a generalized version of :meth:`torch.hann_window`. + + Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. If False, return a symmetric window. + alpha (float, optional): The coefficient :math:`\alpha` in the equation above + beta (float, optional): The coefficient :math:`\beta` in the equation above + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window. + """ + ... +@overload +def hann_window(window_length: _int, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + hann_window(window_length, periodic=True, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Hann window function. + + .. math:: + w[n] = \frac{1}{2}\ \left[1 - \cos \left( \frac{2 \pi n}{N - 1} \right)\right] = + \sin^2 \left( \frac{\pi n}{N - 1} \right), + + where :math:`N` is the full window size. + + The input :attr:`window_length` is a positive integer controlling the + returned window size. :attr:`periodic` flag determines whether the returned + window trims off the last duplicate value from the symmetric window and is + ready to be used as a periodic window with functions like + :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in + above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have + ``torch.hann_window(L, periodic=True)`` equal to + ``torch.hann_window(L + 1, periodic=False)[:-1])``. + + .. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. + + Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. If False, return a symmetric window. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. 
+ device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window + """ + ... +@overload +def hann_window(window_length: _int, periodic: _bool, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + hann_window(window_length, periodic=True, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Hann window function. + + .. math:: + w[n] = \frac{1}{2}\ \left[1 - \cos \left( \frac{2 \pi n}{N - 1} \right)\right] = + \sin^2 \left( \frac{\pi n}{N - 1} \right), + + where :math:`N` is the full window size. + + The input :attr:`window_length` is a positive integer controlling the + returned window size. :attr:`periodic` flag determines whether the returned + window trims off the last duplicate value from the symmetric window and is + ready to be used as a periodic window with functions like + :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in + above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have + ``torch.hann_window(L, periodic=True)`` equal to + ``torch.hann_window(L + 1, periodic=False)[:-1])``. + + .. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. + + Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. If False, return a symmetric window. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window + """ + ... +def hardshrink(input: Tensor, lambd: Union[Number, _complex] = 0.5, *, out: Optional[Tensor] = None) -> Tensor: ... +def heaviside(input: Tensor, values: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + heaviside(input, values, *, out=None) -> Tensor + + Computes the Heaviside step function for each element in :attr:`input`. + The Heaviside step function is defined as: + + .. math:: + \text{{heaviside}}(input, values) = \begin{cases} + 0, & \text{if input < 0}\\ + values, & \text{if input == 0}\\ + 1, & \text{if input > 0} + \end{cases} + + + Args: + input (Tensor): the input tensor. 
+ values (Tensor): The values to use where :attr:`input` is zero. + + Keyword arguments: + out (Tensor, optional): the output tensor. + + Example:: + + >>> input = torch.tensor([-1.5, 0, 2.0]) + >>> values = torch.tensor([0.5]) + >>> torch.heaviside(input, values) + tensor([0.0000, 0.5000, 1.0000]) + >>> values = torch.tensor([1.2, -2.0, 3.5]) + >>> torch.heaviside(input, values) + tensor([0., -2., 1.]) + """ + ... +def hinge_embedding_loss(input: Tensor, target: Tensor, margin: _float = 1.0, reduction: _int = 1) -> Tensor: ... +def histc(input: Tensor, bins: _int = 100, min: Union[Number, _complex] = 0, max: Union[Number, _complex] = 0, *, out: Optional[Tensor] = None) -> Tensor: + r""" + histc(input, bins=100, min=0, max=0, *, out=None) -> Tensor + + Computes the histogram of a tensor. + + The elements are sorted into equal width bins between :attr:`min` and + :attr:`max`. If :attr:`min` and :attr:`max` are both zero, the minimum and + maximum values of the data are used. + + Elements lower than min and higher than max and ``NaN`` elements are ignored. + + Args: + input (Tensor): the input tensor. + bins (int): number of histogram bins + min (Scalar): lower end of the range (inclusive) + max (Scalar): upper end of the range (inclusive) + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + Tensor: Histogram represented as a tensor + + Example:: + + >>> torch.histc(torch.tensor([1., 2, 1]), bins=4, min=0, max=3) + tensor([ 0., 2., 1., 0.]) + """ + ... +@overload +def histogram(input: Tensor, bins: Tensor, *, weight: Optional[Tensor] = None, density: _bool = False, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.histogram: + r""" + histogram(input, bins, *, range=None, weight=None, density=False, out=None) -> (Tensor, Tensor) + + Computes a histogram of the values in a tensor. + + :attr:`bins` can be an integer or a 1D tensor. + + If :attr:`bins` is an int, it specifies the number of equal-width bins. + By default, the lower and upper range of the bins is determined by the + minimum and maximum elements of the input tensor. The :attr:`range` + argument can be provided to specify a range for the bins. + + If :attr:`bins` is a 1D tensor, it specifies the sequence of bin edges + including the rightmost edge. It should contain at least 2 elements + and its elements should be increasing. + + Args: + input (Tensor): the input tensor. + bins: int or 1D Tensor. If int, defines the number of equal-width bins. If tensor, + defines the sequence of bin edges including the rightmost edge. + + Keyword args: + range (tuple of float): Defines the range of the bins. + weight (Tensor): If provided, weight should have the same shape as input. Each value in + input contributes its associated weight towards its bin's result. + density (bool): If False, the result will contain the count (or total weight) in each bin. + If True, the result is the value of the probability density function over the bins, + normalized such that the integral over the range of the bins is 1. + out (Tensor, optional): the output tensor. (tuple, optional): The result tuple of two output tensors (hist, bin_edges). + + Returns: + hist (Tensor): 1D Tensor containing the values of the histogram. + bin_edges(Tensor): 1D Tensor containing the edges of the histogram bins. 
+ + Example:: + + >>> torch.histogram(torch.tensor([1., 2, 1]), bins=4, range=(0., 3.), weight=torch.tensor([1., 2., 4.])) + (tensor([ 0., 5., 2., 0.]), tensor([0., 0.75, 1.5, 2.25, 3.])) + >>> torch.histogram(torch.tensor([1., 2, 1]), bins=4, range=(0., 3.), weight=torch.tensor([1., 2., 4.]), density=True) + (tensor([ 0., 0.9524, 0.3810, 0.]), tensor([0., 0.75, 1.5, 2.25, 3.])) + """ + ... +@overload +def histogram(input: Tensor, bins: _int = 100, *, range: Optional[Sequence[_float]] = None, weight: Optional[Tensor] = None, density: _bool = False, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.histogram: + r""" + histogram(input, bins, *, range=None, weight=None, density=False, out=None) -> (Tensor, Tensor) + + Computes a histogram of the values in a tensor. + + :attr:`bins` can be an integer or a 1D tensor. + + If :attr:`bins` is an int, it specifies the number of equal-width bins. + By default, the lower and upper range of the bins is determined by the + minimum and maximum elements of the input tensor. The :attr:`range` + argument can be provided to specify a range for the bins. + + If :attr:`bins` is a 1D tensor, it specifies the sequence of bin edges + including the rightmost edge. It should contain at least 2 elements + and its elements should be increasing. + + Args: + input (Tensor): the input tensor. + bins: int or 1D Tensor. If int, defines the number of equal-width bins. If tensor, + defines the sequence of bin edges including the rightmost edge. + + Keyword args: + range (tuple of float): Defines the range of the bins. + weight (Tensor): If provided, weight should have the same shape as input. Each value in + input contributes its associated weight towards its bin's result. + density (bool): If False, the result will contain the count (or total weight) in each bin. + If True, the result is the value of the probability density function over the bins, + normalized such that the integral over the range of the bins is 1. + out (Tensor, optional): the output tensor. (tuple, optional): The result tuple of two output tensors (hist, bin_edges). + + Returns: + hist (Tensor): 1D Tensor containing the values of the histogram. + bin_edges(Tensor): 1D Tensor containing the edges of the histogram bins. + + Example:: + + >>> torch.histogram(torch.tensor([1., 2, 1]), bins=4, range=(0., 3.), weight=torch.tensor([1., 2., 4.])) + (tensor([ 0., 5., 2., 0.]), tensor([0., 0.75, 1.5, 2.25, 3.])) + >>> torch.histogram(torch.tensor([1., 2, 1]), bins=4, range=(0., 3.), weight=torch.tensor([1., 2., 4.]), density=True) + (tensor([ 0., 0.9524, 0.3810, 0.]), tensor([0., 0.75, 1.5, 2.25, 3.])) + """ + ... +@overload +def histogramdd(input: Tensor, bins: _int, range: Optional[Sequence[_float]] = None, weight: Optional[Tensor] = None, density: _bool = False) -> torch.return_types.histogramdd: + r""" + histogramdd(input, bins, *, range=None, weight=None, density=False, out=None) -> (Tensor, Tensor[]) + + Computes a multi-dimensional histogram of the values in a tensor. + + Interprets the elements of an input tensor whose innermost dimension has size N + as a collection of N-dimensional points. Maps each of the points into a set of + N-dimensional bins and returns the number of points (or total weight) in each bin. + + :attr:`input` must be a tensor with at least 2 dimensions. + If input has shape (M, N), each of its M rows defines a point in N-dimensional space. + If input has three or more dimensions, all but the last dimension are flattened. 
+ + Each dimension is independently associated with its own strictly increasing sequence + of bin edges. Bin edges may be specified explicitly by passing a sequence of 1D + tensors. Alternatively, bin edges may be constructed automatically by passing a + sequence of integers specifying the number of equal-width bins in each dimension. + + For each N-dimensional point in input: + - Each of its coordinates is binned independently among the bin edges + corresponding to its dimension + - Binning results are combined to identify the N-dimensional bin (if any) + into which the point falls + - If the point falls into a bin, the bin's count (or total weight) is incremented + - Points which do not fall into any bin do not contribute to the output + + :attr:`bins` can be a sequence of N 1D tensors, a sequence of N ints, or a single int. + + If :attr:`bins` is a sequence of N 1D tensors, it explicitly specifies the N sequences + of bin edges. Each 1D tensor should contain a strictly increasing sequence with at + least one element. A sequence of K bin edges defines K-1 bins, explicitly specifying + the left and right edges of all bins. Every bin is exclusive of its left edge. Only + the rightmost bin is inclusive of its right edge. + + If :attr:`bins` is a sequence of N ints, it specifies the number of equal-width bins + in each dimension. By default, the leftmost and rightmost bin edges in each dimension + are determined by the minimum and maximum elements of the input tensor in the + corresponding dimension. The :attr:`range` argument can be provided to manually + specify the leftmost and rightmost bin edges in each dimension. + + If :attr:`bins` is an int, it specifies the number of equal-width bins for all dimensions. + + .. note:: + See also :func:`torch.histogram`, which specifically computes 1D histograms. + While :func:`torch.histogramdd` infers the dimensionality of its bins and + binned values from the shape of :attr:`input`, :func:`torch.histogram` + accepts and flattens :attr:`input` of any shape. + + Args: + input (Tensor): the input tensor. + bins: Tensor[], int[], or int. + If Tensor[], defines the sequences of bin edges. + If int[], defines the number of equal-width bins in each dimension. + If int, defines the number of equal-width bins for all dimensions. + Keyword args: + range (sequence of float): Defines the leftmost and rightmost bin edges + in each dimension. + weight (Tensor): By default, each value in the input has weight 1. If a weight + tensor is passed, each N-dimensional coordinate in input + contributes its associated weight towards its bin's result. + The weight tensor should have the same shape as the :attr:`input` + tensor excluding its innermost dimension N. + density (bool): If False (default), the result will contain the count (or total weight) + in each bin. If True, each count (weight) is divided by the total count + (total weight), then divided by the volume of its associated bin. + Returns: + hist (Tensor): N-dimensional Tensor containing the values of the histogram. + bin_edges(Tensor[]): sequence of N 1D Tensors containing the bin edges. + + Example:: + >>> torch.histogramdd(torch.tensor([[0., 1.], [1., 0.], [2., 0.], [2., 2.]]), bins=[3, 3], + ... 
weight=torch.tensor([1., 2., 4., 8.])) + torch.return_types.histogramdd( + hist=tensor([[0., 1., 0.], + [2., 0., 0.], + [4., 0., 8.]]), + bin_edges=(tensor([0.0000, 0.6667, 1.3333, 2.0000]), + tensor([0.0000, 0.6667, 1.3333, 2.0000]))) + + >>> torch.histogramdd(torch.tensor([[0., 0.], [1., 1.], [2., 2.]]), bins=[2, 2], + ... range=[0., 1., 0., 1.], density=True) + torch.return_types.histogramdd( + hist=tensor([[2., 0.], + [0., 2.]]), + bin_edges=(tensor([0.0000, 0.5000, 1.0000]), + tensor([0.0000, 0.5000, 1.0000]))) + """ + ... +@overload +def histogramdd(input: Tensor, bins: _size, range: Optional[Sequence[_float]] = None, weight: Optional[Tensor] = None, density: _bool = False) -> torch.return_types.histogramdd: + r""" + histogramdd(input, bins, *, range=None, weight=None, density=False, out=None) -> (Tensor, Tensor[]) + + Computes a multi-dimensional histogram of the values in a tensor. + + Interprets the elements of an input tensor whose innermost dimension has size N + as a collection of N-dimensional points. Maps each of the points into a set of + N-dimensional bins and returns the number of points (or total weight) in each bin. + + :attr:`input` must be a tensor with at least 2 dimensions. + If input has shape (M, N), each of its M rows defines a point in N-dimensional space. + If input has three or more dimensions, all but the last dimension are flattened. + + Each dimension is independently associated with its own strictly increasing sequence + of bin edges. Bin edges may be specified explicitly by passing a sequence of 1D + tensors. Alternatively, bin edges may be constructed automatically by passing a + sequence of integers specifying the number of equal-width bins in each dimension. + + For each N-dimensional point in input: + - Each of its coordinates is binned independently among the bin edges + corresponding to its dimension + - Binning results are combined to identify the N-dimensional bin (if any) + into which the point falls + - If the point falls into a bin, the bin's count (or total weight) is incremented + - Points which do not fall into any bin do not contribute to the output + + :attr:`bins` can be a sequence of N 1D tensors, a sequence of N ints, or a single int. + + If :attr:`bins` is a sequence of N 1D tensors, it explicitly specifies the N sequences + of bin edges. Each 1D tensor should contain a strictly increasing sequence with at + least one element. A sequence of K bin edges defines K-1 bins, explicitly specifying + the left and right edges of all bins. Every bin is exclusive of its left edge. Only + the rightmost bin is inclusive of its right edge. + + If :attr:`bins` is a sequence of N ints, it specifies the number of equal-width bins + in each dimension. By default, the leftmost and rightmost bin edges in each dimension + are determined by the minimum and maximum elements of the input tensor in the + corresponding dimension. The :attr:`range` argument can be provided to manually + specify the leftmost and rightmost bin edges in each dimension. + + If :attr:`bins` is an int, it specifies the number of equal-width bins for all dimensions. + + .. note:: + See also :func:`torch.histogram`, which specifically computes 1D histograms. + While :func:`torch.histogramdd` infers the dimensionality of its bins and + binned values from the shape of :attr:`input`, :func:`torch.histogram` + accepts and flattens :attr:`input` of any shape. + + Args: + input (Tensor): the input tensor. + bins: Tensor[], int[], or int. + If Tensor[], defines the sequences of bin edges. 
+ If int[], defines the number of equal-width bins in each dimension. + If int, defines the number of equal-width bins for all dimensions. + Keyword args: + range (sequence of float): Defines the leftmost and rightmost bin edges + in each dimension. + weight (Tensor): By default, each value in the input has weight 1. If a weight + tensor is passed, each N-dimensional coordinate in input + contributes its associated weight towards its bin's result. + The weight tensor should have the same shape as the :attr:`input` + tensor excluding its innermost dimension N. + density (bool): If False (default), the result will contain the count (or total weight) + in each bin. If True, each count (weight) is divided by the total count + (total weight), then divided by the volume of its associated bin. + Returns: + hist (Tensor): N-dimensional Tensor containing the values of the histogram. + bin_edges(Tensor[]): sequence of N 1D Tensors containing the bin edges. + + Example:: + >>> torch.histogramdd(torch.tensor([[0., 1.], [1., 0.], [2., 0.], [2., 2.]]), bins=[3, 3], + ... weight=torch.tensor([1., 2., 4., 8.])) + torch.return_types.histogramdd( + hist=tensor([[0., 1., 0.], + [2., 0., 0.], + [4., 0., 8.]]), + bin_edges=(tensor([0.0000, 0.6667, 1.3333, 2.0000]), + tensor([0.0000, 0.6667, 1.3333, 2.0000]))) + + >>> torch.histogramdd(torch.tensor([[0., 0.], [1., 1.], [2., 2.]]), bins=[2, 2], + ... range=[0., 1., 0., 1.], density=True) + torch.return_types.histogramdd( + hist=tensor([[2., 0.], + [0., 2.]]), + bin_edges=(tensor([0.0000, 0.5000, 1.0000]), + tensor([0.0000, 0.5000, 1.0000]))) + """ + ... +@overload +def histogramdd(input: Tensor, bins: Union[Tuple[Tensor, ...], List[Tensor]], range: Optional[Sequence[_float]] = None, weight: Optional[Tensor] = None, density: _bool = False) -> torch.return_types.histogramdd: + r""" + histogramdd(input, bins, *, range=None, weight=None, density=False, out=None) -> (Tensor, Tensor[]) + + Computes a multi-dimensional histogram of the values in a tensor. + + Interprets the elements of an input tensor whose innermost dimension has size N + as a collection of N-dimensional points. Maps each of the points into a set of + N-dimensional bins and returns the number of points (or total weight) in each bin. + + :attr:`input` must be a tensor with at least 2 dimensions. + If input has shape (M, N), each of its M rows defines a point in N-dimensional space. + If input has three or more dimensions, all but the last dimension are flattened. + + Each dimension is independently associated with its own strictly increasing sequence + of bin edges. Bin edges may be specified explicitly by passing a sequence of 1D + tensors. Alternatively, bin edges may be constructed automatically by passing a + sequence of integers specifying the number of equal-width bins in each dimension. + + For each N-dimensional point in input: + - Each of its coordinates is binned independently among the bin edges + corresponding to its dimension + - Binning results are combined to identify the N-dimensional bin (if any) + into which the point falls + - If the point falls into a bin, the bin's count (or total weight) is incremented + - Points which do not fall into any bin do not contribute to the output + + :attr:`bins` can be a sequence of N 1D tensors, a sequence of N ints, or a single int. + + If :attr:`bins` is a sequence of N 1D tensors, it explicitly specifies the N sequences + of bin edges. Each 1D tensor should contain a strictly increasing sequence with at + least one element. 
A sequence of K bin edges defines K-1 bins, explicitly specifying + the left and right edges of all bins. Every bin is exclusive of its left edge. Only + the rightmost bin is inclusive of its right edge. + + If :attr:`bins` is a sequence of N ints, it specifies the number of equal-width bins + in each dimension. By default, the leftmost and rightmost bin edges in each dimension + are determined by the minimum and maximum elements of the input tensor in the + corresponding dimension. The :attr:`range` argument can be provided to manually + specify the leftmost and rightmost bin edges in each dimension. + + If :attr:`bins` is an int, it specifies the number of equal-width bins for all dimensions. + + .. note:: + See also :func:`torch.histogram`, which specifically computes 1D histograms. + While :func:`torch.histogramdd` infers the dimensionality of its bins and + binned values from the shape of :attr:`input`, :func:`torch.histogram` + accepts and flattens :attr:`input` of any shape. + + Args: + input (Tensor): the input tensor. + bins: Tensor[], int[], or int. + If Tensor[], defines the sequences of bin edges. + If int[], defines the number of equal-width bins in each dimension. + If int, defines the number of equal-width bins for all dimensions. + Keyword args: + range (sequence of float): Defines the leftmost and rightmost bin edges + in each dimension. + weight (Tensor): By default, each value in the input has weight 1. If a weight + tensor is passed, each N-dimensional coordinate in input + contributes its associated weight towards its bin's result. + The weight tensor should have the same shape as the :attr:`input` + tensor excluding its innermost dimension N. + density (bool): If False (default), the result will contain the count (or total weight) + in each bin. If True, each count (weight) is divided by the total count + (total weight), then divided by the volume of its associated bin. + Returns: + hist (Tensor): N-dimensional Tensor containing the values of the histogram. + bin_edges(Tensor[]): sequence of N 1D Tensors containing the bin edges. + + Example:: + >>> torch.histogramdd(torch.tensor([[0., 1.], [1., 0.], [2., 0.], [2., 2.]]), bins=[3, 3], + ... weight=torch.tensor([1., 2., 4., 8.])) + torch.return_types.histogramdd( + hist=tensor([[0., 1., 0.], + [2., 0., 0.], + [4., 0., 8.]]), + bin_edges=(tensor([0.0000, 0.6667, 1.3333, 2.0000]), + tensor([0.0000, 0.6667, 1.3333, 2.0000]))) + + >>> torch.histogramdd(torch.tensor([[0., 0.], [1., 1.], [2., 2.]]), bins=[2, 2], + ... range=[0., 1., 0., 1.], density=True) + torch.return_types.histogramdd( + hist=tensor([[2., 0.], + [0., 2.]]), + bin_edges=(tensor([0.0000, 0.5000, 1.0000]), + tensor([0.0000, 0.5000, 1.0000]))) + """ + ... +def hsmm(input: Tensor, mat2: Tensor) -> Tensor: ... +@overload +def hsplit(input: Tensor, sections: _int) -> Tuple[Tensor, ...]: + r""" + hsplit(input, indices_or_sections) -> List of Tensors + + Splits :attr:`input`, a tensor with one or more dimensions, into multiple tensors + horizontally according to :attr:`indices_or_sections`. Each split is a view of + :attr:`input`. 
+ + If :attr:`input` is one dimensional this is equivalent to calling + torch.tensor_split(input, indices_or_sections, dim=0) (the split dimension is + zero), and if :attr:`input` has two or more dimensions it's equivalent to calling + torch.tensor_split(input, indices_or_sections, dim=1) (the split dimension is 1), + except that if :attr:`indices_or_sections` is an integer it must evenly divide + the split dimension or a runtime error will be thrown. + + This function is based on NumPy's :func:`numpy.hsplit`. + + Args: + input (Tensor): tensor to split. + indices_or_sections (int or list or tuple of ints): See argument in :func:`torch.tensor_split`. + + Example:: + >>> t = torch.arange(16.0).reshape(4,4) + >>> t + tensor([[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.], + [ 8., 9., 10., 11.], + [12., 13., 14., 15.]]) + >>> torch.hsplit(t, 2) + (tensor([[ 0., 1.], + [ 4., 5.], + [ 8., 9.], + [12., 13.]]), + tensor([[ 2., 3.], + [ 6., 7.], + [10., 11.], + [14., 15.]])) + >>> torch.hsplit(t, [3, 6]) + (tensor([[ 0., 1., 2.], + [ 4., 5., 6.], + [ 8., 9., 10.], + [12., 13., 14.]]), + tensor([[ 3.], + [ 7.], + [11.], + [15.]]), + tensor([], size=(4, 0))) + """ + ... +@overload +def hsplit(input: Tensor, indices: _size) -> Tuple[Tensor, ...]: + r""" + hsplit(input, indices_or_sections) -> List of Tensors + + Splits :attr:`input`, a tensor with one or more dimensions, into multiple tensors + horizontally according to :attr:`indices_or_sections`. Each split is a view of + :attr:`input`. + + If :attr:`input` is one dimensional this is equivalent to calling + torch.tensor_split(input, indices_or_sections, dim=0) (the split dimension is + zero), and if :attr:`input` has two or more dimensions it's equivalent to calling + torch.tensor_split(input, indices_or_sections, dim=1) (the split dimension is 1), + except that if :attr:`indices_or_sections` is an integer it must evenly divide + the split dimension or a runtime error will be thrown. + + This function is based on NumPy's :func:`numpy.hsplit`. + + Args: + input (Tensor): tensor to split. + indices_or_sections (int or list or tuple of ints): See argument in :func:`torch.tensor_split`. + + Example:: + >>> t = torch.arange(16.0).reshape(4,4) + >>> t + tensor([[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.], + [ 8., 9., 10., 11.], + [12., 13., 14., 15.]]) + >>> torch.hsplit(t, 2) + (tensor([[ 0., 1.], + [ 4., 5.], + [ 8., 9.], + [12., 13.]]), + tensor([[ 2., 3.], + [ 6., 7.], + [10., 11.], + [14., 15.]])) + >>> torch.hsplit(t, [3, 6]) + (tensor([[ 0., 1., 2.], + [ 4., 5., 6.], + [ 8., 9., 10.], + [12., 13., 14.]]), + tensor([[ 3.], + [ 7.], + [11.], + [15.]]), + tensor([], size=(4, 0))) + """ + ... +def hspmm(mat1: Tensor, mat2: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + hspmm(mat1, mat2, *, out=None) -> Tensor + + Performs a matrix multiplication of a :ref:`sparse COO matrix + ` :attr:`mat1` and a strided matrix :attr:`mat2`. The + result is a (1 + 1)-dimensional :ref:`hybrid COO matrix + `. + + Args: + mat1 (Tensor): the first sparse matrix to be matrix multiplied + mat2 (Tensor): the second strided matrix to be matrix multiplied + + Keyword args: + out (Tensor, optional): the output tensor. + """ + ... +def hstack(tensors: Union[Tuple[Tensor, ...], List[Tensor]], *, out: Optional[Tensor] = None) -> Tensor: + r""" + hstack(tensors, *, out=None) -> Tensor + + Stack tensors in sequence horizontally (column wise). + + This is equivalent to concatenation along the first axis for 1-D tensors, and along the second axis for all other tensors. 
+ + Args: + tensors (sequence of Tensors): sequence of tensors to concatenate + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([1, 2, 3]) + >>> b = torch.tensor([4, 5, 6]) + >>> torch.hstack((a,b)) + tensor([1, 2, 3, 4, 5, 6]) + >>> a = torch.tensor([[1],[2],[3]]) + >>> b = torch.tensor([[4],[5],[6]]) + >>> torch.hstack((a,b)) + tensor([[1, 4], + [2, 5], + [3, 6]]) + """ + ... +def hypot(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + hypot(input, other, *, out=None) -> Tensor + + Given the legs of a right triangle, return its hypotenuse. + + .. math:: + \text{out}_{i} = \sqrt{\text{input}_{i}^{2} + \text{other}_{i}^{2}} + + The shapes of ``input`` and ``other`` must be + :ref:`broadcastable `. + + Args: + input (Tensor): the first input tensor + other (Tensor): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.hypot(torch.tensor([4.0]), torch.tensor([3.0, 4.0, 5.0])) + tensor([5.0000, 5.6569, 6.4031]) + """ + ... +def i0(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + i0(input, *, out=None) -> Tensor + + Alias for :func:`torch.special.i0`. + """ + ... +def i0_(input: Tensor) -> Tensor: ... +def igamma(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + igamma(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.special.gammainc`. + """ + ... +def igammac(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + igammac(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.special.gammaincc`. + """ + ... +def imag(input: Tensor) -> Tensor: + r""" + imag(input) -> Tensor + + Returns a new tensor containing imaginary values of the :attr:`self` tensor. + The returned tensor and :attr:`self` share the same underlying storage. + + .. warning:: + :func:`imag` is only supported for tensors with complex dtypes. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> x=torch.randn(4, dtype=torch.cfloat) + >>> x + tensor([(0.3100+0.3553j), (-0.5445-0.7896j), (-1.6492-0.0633j), (-0.0638-0.8119j)]) + >>> x.imag + tensor([ 0.3553, -0.7896, -0.0633, -0.8119]) + """ + ... +@overload +def index_add(input: Tensor, dim: _int, index: Tensor, source: Tensor, *, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + index_add(input, dim, index, source, *, alpha=1, out=None) -> Tensor + + See :meth:`~Tensor.index_add_` for function description. + """ + ... +@overload +def index_add(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, source: Tensor, *, alpha: Union[Number, _complex] = 1) -> Tensor: + r""" + index_add(input, dim, index, source, *, alpha=1, out=None) -> Tensor + + See :meth:`~Tensor.index_add_` for function description. + """ + ... +@overload +def index_copy(input: Tensor, dim: _int, index: Tensor, source: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + index_copy(input, dim, index, source, *, out=None) -> Tensor + + See :meth:`~Tensor.index_copy_` for function description. + """ + ... +@overload +def index_copy(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, source: Tensor) -> Tensor: + r""" + index_copy(input, dim, index, source, *, out=None) -> Tensor + + See :meth:`~Tensor.index_copy_` for function description. + """ + ... +@overload +def index_fill(input: Tensor, dim: _int, index: Tensor, value: Tensor) -> Tensor: ...
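+# Illustrative usage sketch for the index_add/index_copy overloads above; the
+# tensors and values below are assumed for the example and are not part of the
+# generated stub:
+#   >>> x = torch.zeros(3, 3)
+#   >>> idx = torch.tensor([0, 2])
+#   >>> src = torch.ones(2, 3)
+#   >>> torch.index_add(x, 0, idx, src, alpha=2.0)  # rows 0 and 2 become 2.0
+#   >>> torch.index_copy(x, 0, idx, src)            # rows 0 and 2 become 1.0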
+@overload +def index_fill(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, value: Tensor) -> Tensor: ... +@overload +def index_fill(input: Tensor, dim: _int, index: Tensor, value: Union[Number, _complex]) -> Tensor: ... +@overload +def index_fill(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, value: Union[Number, _complex]) -> Tensor: ... +def index_put(input: Tensor, indices: Optional[Union[Tuple[Tensor, ...], List[Tensor]]], values: Tensor, accumulate: _bool = False) -> Tensor: ... +def index_put_(input: Tensor, indices: Optional[Union[Tuple[Tensor, ...], List[Tensor]]], values: Tensor, accumulate: _bool = False) -> Tensor: ... +def index_reduce(input: Tensor, dim: _int, index: Tensor, source: Tensor, reduce: str, *, include_self: _bool = True, out: Optional[Tensor] = None) -> Tensor: + r""" + index_reduce(input, dim, index, source, reduce, *, include_self=True, out=None) -> Tensor + + See :meth:`~Tensor.index_reduce_` for function description. + """ + ... +@overload +def index_select(input: Tensor, dim: _int, index: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + index_select(input, dim, index, *, out=None) -> Tensor + + Returns a new tensor which indexes the :attr:`input` tensor along dimension + :attr:`dim` using the entries in :attr:`index` which is a `LongTensor`. + + The returned tensor has the same number of dimensions as the original tensor + (:attr:`input`). The :attr:`dim`\ th dimension has the same size as the length + of :attr:`index`; other dimensions have the same size as in the original tensor. + + .. note:: The returned tensor does **not** use the same storage as the original + tensor. If :attr:`out` has a different shape than expected, we + silently change it to the correct shape, reallocating the underlying + storage if necessary. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension in which we index + index (IntTensor or LongTensor): the 1-D tensor containing the indices to index + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> x = torch.randn(3, 4) + >>> x + tensor([[ 0.1427, 0.0231, -0.5414, -1.0009], + [-0.4664, 0.2647, -0.1228, -1.1068], + [-1.1734, -0.6571, 0.7230, -0.6004]]) + >>> indices = torch.tensor([0, 2]) + >>> torch.index_select(x, 0, indices) + tensor([[ 0.1427, 0.0231, -0.5414, -1.0009], + [-1.1734, -0.6571, 0.7230, -0.6004]]) + >>> torch.index_select(x, 1, indices) + tensor([[ 0.1427, -0.5414], + [-0.4664, -0.1228], + [-1.1734, 0.7230]]) + """ + ... +@overload +def index_select(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + index_select(input, dim, index, *, out=None) -> Tensor + + Returns a new tensor which indexes the :attr:`input` tensor along dimension + :attr:`dim` using the entries in :attr:`index` which is a `LongTensor`. + + The returned tensor has the same number of dimensions as the original tensor + (:attr:`input`). The :attr:`dim`\ th dimension has the same size as the length + of :attr:`index`; other dimensions have the same size as in the original tensor. + + .. note:: The returned tensor does **not** use the same storage as the original + tensor. If :attr:`out` has a different shape than expected, we + silently change it to the correct shape, reallocating the underlying + storage if necessary. + + Args: + input (Tensor): the input tensor. 
+ dim (int): the dimension in which we index + index (IntTensor or LongTensor): the 1-D tensor containing the indices to index + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> x = torch.randn(3, 4) + >>> x + tensor([[ 0.1427, 0.0231, -0.5414, -1.0009], + [-0.4664, 0.2647, -0.1228, -1.1068], + [-1.1734, -0.6571, 0.7230, -0.6004]]) + >>> indices = torch.tensor([0, 2]) + >>> torch.index_select(x, 0, indices) + tensor([[ 0.1427, 0.0231, -0.5414, -1.0009], + [-1.1734, -0.6571, 0.7230, -0.6004]]) + >>> torch.index_select(x, 1, indices) + tensor([[ 0.1427, -0.5414], + [-0.4664, -0.1228], + [-1.1734, 0.7230]]) + """ + ... +def indices_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.indices`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def init_num_threads() -> None: ... +def inner(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + inner(input, other, *, out=None) -> Tensor + + Computes the dot product for 1D tensors. For higher dimensions, sums the product + of elements from :attr:`input` and :attr:`other` along their last dimension. + + .. note:: + + If either :attr:`input` or :attr:`other` is a scalar, the result is equivalent + to `torch.mul(input, other)`. + + If both :attr:`input` and :attr:`other` are non-scalars, the size of their last + dimension must match and the result is equivalent to `torch.tensordot(input, + other, dims=([-1], [-1]))` + + Args: + input (Tensor): First input tensor + other (Tensor): Second input tensor + + Keyword args: + out (Tensor, optional): Optional output tensor to write result into. The output + shape is `input.shape[:-1] + other.shape[:-1]`. + + Example:: + + # Dot product + >>> torch.inner(torch.tensor([1, 2, 3]), torch.tensor([0, 2, 1])) + tensor(7) + + # Multidimensional input tensors + >>> a = torch.randn(2, 3) + >>> a + tensor([[0.8173, 1.0874, 1.1784], + [0.3279, 0.1234, 2.7894]]) + >>> b = torch.randn(2, 4, 3) + >>> b + tensor([[[-0.4682, -0.7159, 0.1506], + [ 0.4034, -0.3657, 1.0387], + [ 0.9892, -0.6684, 0.1774], + [ 0.9482, 1.3261, 0.3917]], + + [[ 0.4537, 0.7493, 1.1724], + [ 0.2291, 0.5749, -0.2267], + [-0.7920, 0.3607, -0.3701], + [ 1.3666, -0.5850, -1.7242]]]) + >>> torch.inner(a, b) + tensor([[[-0.9837, 1.1560, 0.2907, 2.6785], + [ 2.5671, 0.5452, -0.6912, -1.5509]], + + [[ 0.1782, 2.9843, 0.7366, 1.5672], + [ 3.5115, -0.4864, -1.2476, -4.4337]]]) + + # Scalar input + >>> torch.inner(a, torch.tensor(2)) + tensor([[1.6347, 2.1748, 2.3567], + [0.6558, 0.2469, 5.5787]]) + """ + ... +def instance_norm(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], running_mean: Optional[Tensor], running_var: Optional[Tensor], use_input_stats: _bool, momentum: _float, eps: _float, cudnn_enabled: _bool) -> Tensor: ... +def int_repr(input: Tensor) -> Tensor: ... +def inverse(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + inverse(input, *, out=None) -> Tensor + + Alias for :func:`torch.linalg.inv` + """ + ... +def is_complex(input: Tensor) -> _bool: + r""" + is_complex(input) -> (bool) + + Returns True if the data type of :attr:`input` is a complex data type i.e., + one of ``torch.complex64``, and ``torch.complex128``. + + Args: + input (Tensor): the input tensor. + """ + ... +def is_conj(input: Tensor) -> _bool: + r""" + is_conj(input) -> (bool) + + Returns True if the :attr:`input` is a conjugated tensor, i.e. its conjugate bit is set to `True`. 
+ + Args: + input (Tensor): the input tensor. + """ + ... +def is_distributed(input: Tensor) -> _bool: ... +def is_floating_point(input: Tensor) -> _bool: + r""" + is_floating_point(input) -> (bool) + + Returns True if the data type of :attr:`input` is a floating point data type i.e., + one of ``torch.float64``, ``torch.float32``, ``torch.float16``, and ``torch.bfloat16``. + + Args: + input (Tensor): the input tensor. + """ + ... +def is_grad_enabled() -> _bool: + r""" + is_grad_enabled() -> (bool) + + Returns True if grad mode is currently enabled. + """ + ... +def is_inference(input: Tensor) -> _bool: + r""" + is_inference(input) -> (bool) + + Returns True if :attr:`input` is an inference tensor. + + A non-view tensor is an inference tensor if and only if it was + allocated during inference mode. A view tensor is an inference + tensor if and only if the tensor it is a view of is an inference tensor. + + For details on inference mode please see + `Inference Mode `_. + + Args: + input (Tensor): the input tensor. + """ + ... +def is_inference_mode_enabled() -> _bool: + r""" + is_inference_mode_enabled() -> (bool) + + Returns True if inference mode is currently enabled. + """ + ... +def is_neg(input: Tensor) -> _bool: ... +def is_nonzero(input: Tensor) -> _bool: + r""" + is_nonzero(input) -> (bool) + + Returns True if the :attr:`input` is a single element tensor which is not equal to zero + after type conversions. + i.e. not equal to ``torch.tensor([0.])`` or ``torch.tensor([0])`` or + ``torch.tensor([False])``. + Throws a ``RuntimeError`` if ``torch.numel() != 1`` (even in case + of sparse tensors). + + Args: + input (Tensor): the input tensor. + + Examples:: + + >>> torch.is_nonzero(torch.tensor([0.])) + False + >>> torch.is_nonzero(torch.tensor([1.5])) + True + >>> torch.is_nonzero(torch.tensor([False])) + False + >>> torch.is_nonzero(torch.tensor([3])) + True + >>> torch.is_nonzero(torch.tensor([1, 3, 5])) + Traceback (most recent call last): + ... + RuntimeError: bool value of Tensor with more than one value is ambiguous + >>> torch.is_nonzero(torch.tensor([])) + Traceback (most recent call last): + ... + RuntimeError: bool value of Tensor with no values is ambiguous + """ + ... +def is_same_size(input: Tensor, other: Tensor) -> _bool: ... +def is_signed(input: Tensor) -> _bool: ... +def is_vulkan_available() -> _bool: ... +def isclose(input: Tensor, other: Tensor, rtol: _float = 1e-05, atol: _float = 1e-08, equal_nan: _bool = False) -> Tensor: + r""" + isclose(input, other, rtol=1e-05, atol=1e-08, equal_nan=False) -> Tensor + + Returns a new tensor with boolean elements representing if each element of + :attr:`input` is "close" to the corresponding element of :attr:`other`. + Closeness is defined as: + + .. math:: + \lvert \text{input} - \text{other} \rvert \leq \texttt{atol} + \texttt{rtol} \times \lvert \text{other} \rvert + + + where :attr:`input` and :attr:`other` are finite. Where :attr:`input` + and/or :attr:`other` are nonfinite they are close if and only if + they are equal, with NaNs being considered equal to each other when + :attr:`equal_nan` is True. + + Args: + input (Tensor): first tensor to compare + other (Tensor): second tensor to compare + atol (float, optional): absolute tolerance. Default: 1e-08 + rtol (float, optional): relative tolerance. Default: 1e-05 + equal_nan (bool, optional): if ``True``, then two ``NaN`` s will be considered equal. 
Default: ``False`` + + Examples:: + + >>> torch.isclose(torch.tensor((1., 2, 3)), torch.tensor((1 + 1e-10, 3, 4))) + tensor([ True, False, False]) + >>> torch.isclose(torch.tensor((float('inf'), 4)), torch.tensor((float('inf'), 6)), rtol=.5) + tensor([True, True]) + """ + ... +def isfinite(input: Tensor) -> Tensor: + r""" + isfinite(input) -> Tensor + + Returns a new tensor with boolean elements representing if each element is `finite` or not. + + Real values are finite when they are not NaN, negative infinity, or infinity. + Complex values are finite when both their real and imaginary parts are finite. + + Args: + input (Tensor): the input tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is finite and False elsewhere + + Example:: + + >>> torch.isfinite(torch.tensor([1, float('inf'), 2, float('-inf'), float('nan')])) + tensor([True, False, True, False, False]) + """ + ... +@overload +def isin(elements: Tensor, test_elements: Tensor, *, assume_unique: _bool = False, invert: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + isin(elements, test_elements, *, assume_unique=False, invert=False) -> Tensor + + Tests if each element of :attr:`elements` is in :attr:`test_elements`. Returns + a boolean tensor of the same shape as :attr:`elements` that is True for elements + in :attr:`test_elements` and False otherwise. + + .. note:: + One of :attr:`elements` or :attr:`test_elements` can be a scalar, but not both. + + Args: + elements (Tensor or Scalar): Input elements + test_elements (Tensor or Scalar): Values against which to test for each input element + assume_unique (bool, optional): If True, assumes both :attr:`elements` and + :attr:`test_elements` contain unique elements, which can speed up the + calculation. Default: False + invert (bool, optional): If True, inverts the boolean return tensor, resulting in True + values for elements *not* in :attr:`test_elements`. Default: False + + Returns: + A boolean tensor of the same shape as :attr:`elements` that is True for elements in + :attr:`test_elements` and False otherwise + + Example: + >>> torch.isin(torch.tensor([[1, 2], [3, 4]]), torch.tensor([2, 3])) + tensor([[False, True], + [ True, False]]) + """ + ... +@overload +def isin(element: Union[Number, _complex], test_elements: Tensor, *, assume_unique: _bool = False, invert: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + isin(elements, test_elements, *, assume_unique=False, invert=False) -> Tensor + + Tests if each element of :attr:`elements` is in :attr:`test_elements`. Returns + a boolean tensor of the same shape as :attr:`elements` that is True for elements + in :attr:`test_elements` and False otherwise. + + .. note:: + One of :attr:`elements` or :attr:`test_elements` can be a scalar, but not both. + + Args: + elements (Tensor or Scalar): Input elements + test_elements (Tensor or Scalar): Values against which to test for each input element + assume_unique (bool, optional): If True, assumes both :attr:`elements` and + :attr:`test_elements` contain unique elements, which can speed up the + calculation. Default: False + invert (bool, optional): If True, inverts the boolean return tensor, resulting in True + values for elements *not* in :attr:`test_elements`. 
Default: False + + Returns: + A boolean tensor of the same shape as :attr:`elements` that is True for elements in + :attr:`test_elements` and False otherwise + + Example: + >>> torch.isin(torch.tensor([[1, 2], [3, 4]]), torch.tensor([2, 3])) + tensor([[False, True], + [ True, False]]) + """ + ... +@overload +def isin(elements: Tensor, test_element: Union[Number, _complex], *, assume_unique: _bool = False, invert: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + isin(elements, test_elements, *, assume_unique=False, invert=False) -> Tensor + + Tests if each element of :attr:`elements` is in :attr:`test_elements`. Returns + a boolean tensor of the same shape as :attr:`elements` that is True for elements + in :attr:`test_elements` and False otherwise. + + .. note:: + One of :attr:`elements` or :attr:`test_elements` can be a scalar, but not both. + + Args: + elements (Tensor or Scalar): Input elements + test_elements (Tensor or Scalar): Values against which to test for each input element + assume_unique (bool, optional): If True, assumes both :attr:`elements` and + :attr:`test_elements` contain unique elements, which can speed up the + calculation. Default: False + invert (bool, optional): If True, inverts the boolean return tensor, resulting in True + values for elements *not* in :attr:`test_elements`. Default: False + + Returns: + A boolean tensor of the same shape as :attr:`elements` that is True for elements in + :attr:`test_elements` and False otherwise + + Example: + >>> torch.isin(torch.tensor([[1, 2], [3, 4]]), torch.tensor([2, 3])) + tensor([[False, True], + [ True, False]]) + """ + ... +def isinf(input: Tensor) -> Tensor: + r""" + isinf(input) -> Tensor + + Tests if each element of :attr:`input` is infinite + (positive or negative infinity) or not. + + .. note:: + Complex values are infinite when their real or imaginary part is + infinite. + + Args: + input (Tensor): the input tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is infinite and False elsewhere + + Example:: + + >>> torch.isinf(torch.tensor([1, float('inf'), 2, float('-inf'), float('nan')])) + tensor([False, True, False, True, False]) + """ + ... +def isnan(input: Tensor) -> Tensor: + r""" + isnan(input) -> Tensor + + Returns a new tensor with boolean elements representing if each element of :attr:`input` + is NaN or not. Complex values are considered NaN when either their real + and/or imaginary part is NaN. + + Arguments: + input (Tensor): the input tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is NaN and False elsewhere + + Example:: + + >>> torch.isnan(torch.tensor([1, float('nan'), 2])) + tensor([False, True, False]) + """ + ... +def isneginf(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + isneginf(input, *, out=None) -> Tensor + Tests if each element of :attr:`input` is negative infinity or not. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([-float('inf'), float('inf'), 1.2]) + >>> torch.isneginf(a) + tensor([ True, False, False]) + """ + ... +def isposinf(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + isposinf(input, *, out=None) -> Tensor + Tests if each element of :attr:`input` is positive infinity or not. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. 
+ + Example:: + + >>> a = torch.tensor([-float('inf'), float('inf'), 1.2]) + >>> torch.isposinf(a) + tensor([False, True, False]) + """ + ... +def isreal(input: Tensor) -> Tensor: + r""" + isreal(input) -> Tensor + + Returns a new tensor with boolean elements representing if each element of :attr:`input` is real-valued or not. + All real-valued types are considered real. Complex values are considered real when their imaginary part is 0. + + Arguments: + input (Tensor): the input tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is real and False elsewhere + + Example:: + + >>> torch.isreal(torch.tensor([1, 1+1j, 2+0j])) + tensor([True, False, True]) + """ + ... +def istft(input: Tensor, n_fft: _int, hop_length: Optional[_int] = None, win_length: Optional[_int] = None, window: Optional[Tensor] = None, center: _bool = True, normalized: _bool = False, onesided: Optional[_bool] = None, length: Optional[_int] = None, return_complex: _bool = False) -> Tensor: ... +@overload +def kaiser_window(window_length: _int, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + kaiser_window(window_length, periodic=True, beta=12.0, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Computes the Kaiser window with window length :attr:`window_length` and shape parameter :attr:`beta`. + + Let I_0 be the zeroth order modified Bessel function of the first kind (see :func:`torch.i0`) and + ``N = L - 1`` if :attr:`periodic` is False and ``L`` if :attr:`periodic` is True, + where ``L`` is the :attr:`window_length`. This function computes: + + .. math:: + out_i = I_0 \left( \beta \sqrt{1 - \left( {\frac{i - N/2}{N/2}} \right) ^2 } \right) / I_0( \beta ) + + Calling ``torch.kaiser_window(L, B, periodic=True)`` is equivalent to calling + ``torch.kaiser_window(L + 1, B, periodic=False)[:-1])``. + The :attr:`periodic` argument is intended as a helpful shorthand + to produce a periodic window as input to functions like :func:`torch.stft`. + + .. note:: + If :attr:`window_length` is one, then the returned window is a single element tensor containing a one. + + + Args: + window_length (int): length of the window. + periodic (bool, optional): If True, returns a periodic window suitable for use in spectral analysis. + If False, returns a symmetric window suitable for use in filter design. + beta (float, optional): shape parameter for the window. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + """ + ... 
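+# Illustrative usage sketch for the kaiser_window overloads; the signal length,
+# n_fft and beta values are assumed for the example, not part of the generated
+# stub:
+#   >>> w = torch.kaiser_window(400, periodic=True, beta=12.0)
+#   >>> spec = torch.stft(torch.randn(16000), n_fft=400, window=w, return_complex=True)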
+@overload +def kaiser_window(window_length: _int, periodic: _bool, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + kaiser_window(window_length, periodic=True, beta=12.0, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Computes the Kaiser window with window length :attr:`window_length` and shape parameter :attr:`beta`. + + Let I_0 be the zeroth order modified Bessel function of the first kind (see :func:`torch.i0`) and + ``N = L - 1`` if :attr:`periodic` is False and ``L`` if :attr:`periodic` is True, + where ``L`` is the :attr:`window_length`. This function computes: + + .. math:: + out_i = I_0 \left( \beta \sqrt{1 - \left( {\frac{i - N/2}{N/2}} \right) ^2 } \right) / I_0( \beta ) + + Calling ``torch.kaiser_window(L, B, periodic=True)`` is equivalent to calling + ``torch.kaiser_window(L + 1, B, periodic=False)[:-1])``. + The :attr:`periodic` argument is intended as a helpful shorthand + to produce a periodic window as input to functions like :func:`torch.stft`. + + .. note:: + If :attr:`window_length` is one, then the returned window is a single element tensor containing a one. + + + Args: + window_length (int): length of the window. + periodic (bool, optional): If True, returns a periodic window suitable for use in spectral analysis. + If False, returns a symmetric window suitable for use in filter design. + beta (float, optional): shape parameter for the window. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + """ + ... +@overload +def kaiser_window(window_length: _int, periodic: _bool, beta: _float, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + kaiser_window(window_length, periodic=True, beta=12.0, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Computes the Kaiser window with window length :attr:`window_length` and shape parameter :attr:`beta`. + + Let I_0 be the zeroth order modified Bessel function of the first kind (see :func:`torch.i0`) and + ``N = L - 1`` if :attr:`periodic` is False and ``L`` if :attr:`periodic` is True, + where ``L`` is the :attr:`window_length`. This function computes: + + .. math:: + out_i = I_0 \left( \beta \sqrt{1 - \left( {\frac{i - N/2}{N/2}} \right) ^2 } \right) / I_0( \beta ) + + Calling ``torch.kaiser_window(L, B, periodic=True)`` is equivalent to calling + ``torch.kaiser_window(L + 1, B, periodic=False)[:-1])``. 
+ The :attr:`periodic` argument is intended as a helpful shorthand + to produce a periodic window as input to functions like :func:`torch.stft`. + + .. note:: + If :attr:`window_length` is one, then the returned window is a single element tensor containing a one. + + + Args: + window_length (int): length of the window. + periodic (bool, optional): If True, returns a periodic window suitable for use in spectral analysis. + If False, returns a symmetric window suitable for use in filter design. + beta (float, optional): shape parameter for the window. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + """ + ... +def kl_div(input: Tensor, target: Tensor, reduction: _int = 1, *, log_target: _bool = False) -> Tensor: ... +def kron(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + kron(input, other, *, out=None) -> Tensor + + Computes the Kronecker product, denoted by :math:`\otimes`, of :attr:`input` and :attr:`other`. + + If :attr:`input` is a :math:`(a_0 \times a_1 \times \dots \times a_n)` tensor and :attr:`other` is a + :math:`(b_0 \times b_1 \times \dots \times b_n)` tensor, the result will be a + :math:`(a_0*b_0 \times a_1*b_1 \times \dots \times a_n*b_n)` tensor with the following entries: + + .. math:: + (\text{input} \otimes \text{other})_{k_0, k_1, \dots, k_n} = + \text{input}_{i_0, i_1, \dots, i_n} * \text{other}_{j_0, j_1, \dots, j_n}, + + where :math:`k_t = i_t * b_t + j_t` for :math:`0 \leq t \leq n`. + If one tensor has fewer dimensions than the other it is unsqueezed until it has the same number of dimensions. + + Supports real-valued and complex-valued inputs. + + .. note:: + This function generalizes the typical definition of the Kronecker product for two matrices to two tensors, + as described above. When :attr:`input` is a :math:`(m \times n)` matrix and :attr:`other` is a + :math:`(p \times q)` matrix, the result will be a :math:`(p*m \times q*n)` block matrix: + + .. math:: + \mathbf{A} \otimes \mathbf{B}=\begin{bmatrix} + a_{11} \mathbf{B} & \cdots & a_{1 n} \mathbf{B} \\ + \vdots & \ddots & \vdots \\ + a_{m 1} \mathbf{B} & \cdots & a_{m n} \mathbf{B} \end{bmatrix} + + where :attr:`input` is :math:`\mathbf{A}` and :attr:`other` is :math:`\mathbf{B}`. + + Arguments: + input (Tensor) + other (Tensor) + + Keyword args: + out (Tensor, optional): The output tensor. Ignored if ``None``. Default: ``None`` + + Examples:: + + >>> mat1 = torch.eye(2) + >>> mat2 = torch.ones(2, 2) + >>> torch.kron(mat1, mat2) + tensor([[1., 1., 0., 0.], + [1., 1., 0., 0.], + [0., 0., 1., 1.], + [0., 0., 1., 1.]]) + + >>> mat1 = torch.eye(2) + >>> mat2 = torch.arange(1, 5).reshape(2, 2) + >>> torch.kron(mat1, mat2) + tensor([[1., 2., 0., 0.], + [3., 4., 0., 0.], + [0., 0., 1., 2.], + [0., 0., 3., 4.]]) + """ + ... 
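+# Quick illustrative check of the block structure described in the kron
+# docstring above; the shapes and values are assumed for the example:
+#   >>> B = torch.arange(1., 5.).reshape(2, 2)
+#   >>> K = torch.kron(torch.eye(2), B)  # 4x4 block-diagonal result
+#   >>> torch.equal(K[:2, :2], B) and torch.equal(K[2:, 2:], B)
+#   True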
+@overload +def kthvalue(input: Tensor, k: _int, dim: _int = -1, keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.kthvalue: + r""" + kthvalue(input, k, dim=None, keepdim=False, *, out=None) -> (Tensor, LongTensor) + + Returns a namedtuple ``(values, indices)`` where ``values`` is the :attr:`k` th + smallest element of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`. And ``indices`` is the index location of each element found. + + If :attr:`dim` is not given, the last dimension of the `input` is chosen. + + If :attr:`keepdim` is ``True``, both the :attr:`values` and :attr:`indices` tensors + are the same size as :attr:`input`, except in the dimension :attr:`dim` where + they are of size 1. Otherwise, :attr:`dim` is squeezed + (see :func:`torch.squeeze`), resulting in both the :attr:`values` and + :attr:`indices` tensors having 1 fewer dimension than the :attr:`input` tensor. + + .. note:: + When :attr:`input` is a CUDA tensor and there are multiple valid + :attr:`k` th values, this function may nondeterministically return + :attr:`indices` for any of them. + + Args: + input (Tensor): the input tensor. + k (int): k for the k-th smallest element + dim (int, optional): the dimension to find the kth value along + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (tuple, optional): the output tuple of (Tensor, LongTensor) + can be optionally given to be used as output buffers + + Example:: + + >>> x = torch.arange(1., 6.) + >>> x + tensor([ 1., 2., 3., 4., 5.]) + >>> torch.kthvalue(x, 4) + torch.return_types.kthvalue(values=tensor(4.), indices=tensor(3)) + + >>> x=torch.arange(1.,7.).resize_(2,3) + >>> x + tensor([[ 1., 2., 3.], + [ 4., 5., 6.]]) + >>> torch.kthvalue(x, 2, 0, True) + torch.return_types.kthvalue(values=tensor([[4., 5., 6.]]), indices=tensor([[1, 1, 1]])) + """ + ... +@overload +def kthvalue(input: Tensor, k: _int, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.kthvalue: + r""" + kthvalue(input, k, dim=None, keepdim=False, *, out=None) -> (Tensor, LongTensor) + + Returns a namedtuple ``(values, indices)`` where ``values`` is the :attr:`k` th + smallest element of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`. And ``indices`` is the index location of each element found. + + If :attr:`dim` is not given, the last dimension of the `input` is chosen. + + If :attr:`keepdim` is ``True``, both the :attr:`values` and :attr:`indices` tensors + are the same size as :attr:`input`, except in the dimension :attr:`dim` where + they are of size 1. Otherwise, :attr:`dim` is squeezed + (see :func:`torch.squeeze`), resulting in both the :attr:`values` and + :attr:`indices` tensors having 1 fewer dimension than the :attr:`input` tensor. + + .. note:: + When :attr:`input` is a CUDA tensor and there are multiple valid + :attr:`k` th values, this function may nondeterministically return + :attr:`indices` for any of them. + + Args: + input (Tensor): the input tensor. + k (int): k for the k-th smallest element + dim (int, optional): the dimension to find the kth value along + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (tuple, optional): the output tuple of (Tensor, LongTensor) + can be optionally given to be used as output buffers + + Example:: + + >>> x = torch.arange(1., 6.) 
+ >>> x + tensor([ 1., 2., 3., 4., 5.]) + >>> torch.kthvalue(x, 4) + torch.return_types.kthvalue(values=tensor(4.), indices=tensor(3)) + + >>> x=torch.arange(1.,7.).resize_(2,3) + >>> x + tensor([[ 1., 2., 3.], + [ 4., 5., 6.]]) + >>> torch.kthvalue(x, 2, 0, True) + torch.return_types.kthvalue(values=tensor([[4., 5., 6.]]), indices=tensor([[1, 1, 1]])) + """ + ... +def layer_norm(input: Tensor, normalized_shape: Sequence[Union[_int, SymInt]], weight: Optional[Tensor] = None, bias: Optional[Tensor] = None, eps: _float = 1e-05, cudnn_enable: _bool = True) -> Tensor: ... +def lcm(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + lcm(input, other, *, out=None) -> Tensor + + Computes the element-wise least common multiple (LCM) of :attr:`input` and :attr:`other`. + + Both :attr:`input` and :attr:`other` must have integer types. + + .. note:: + This defines :math:`lcm(0, 0) = 0` and :math:`lcm(0, a) = 0`. + + Args: + input (Tensor): the input tensor. + other (Tensor): the second input tensor + + Keyword arguments: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([5, 10, 15]) + >>> b = torch.tensor([3, 4, 5]) + >>> torch.lcm(a, b) + tensor([15, 20, 15]) + >>> c = torch.tensor([3]) + >>> torch.lcm(a, c) + tensor([15, 30, 15]) + """ + ... +def lcm_(input: Tensor, other: Tensor) -> Tensor: ... +def ldexp(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + ldexp(input, other, *, out=None) -> Tensor + + Multiplies :attr:`input` by 2 ** :attr:`other`. + + .. math:: + \text{{out}}_i = \text{{input}}_i * 2^\text{{other}}_i + + + Typically this function is used to construct floating point numbers by multiplying + mantissas in :attr:`input` with integral powers of two created from the exponents + in :attr:`other`. + + Args: + input (Tensor): the input tensor. + other (Tensor): a tensor of exponents, typically integers. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.ldexp(torch.tensor([1.]), torch.tensor([1])) + tensor([2.]) + >>> torch.ldexp(torch.tensor([1.0]), torch.tensor([1, 2, 3, 4])) + tensor([ 2., 4., 8., 16.]) + """ + ... +def ldexp_(input: Tensor, other: Tensor) -> Tensor: ... +@overload +def le(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + le(input, other, *, out=None) -> Tensor + + Computes :math:`\text{input} \leq \text{other}` element-wise. + + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or Scalar): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is less than or equal to + :attr:`other` and False elsewhere + + Example:: + + >>> torch.le(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[True, False], [True, True]]) + """ + ... +@overload +def le(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + le(input, other, *, out=None) -> Tensor + + Computes :math:`\text{input} \leq \text{other}` element-wise. + + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or Scalar): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. 
+ + Returns: + A boolean tensor that is True where :attr:`input` is less than or equal to + :attr:`other` and False elsewhere + + Example:: + + >>> torch.le(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[True, False], [True, True]]) + """ + ... +@overload +def lerp(input: Tensor, end: Tensor, weight: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + lerp(input, end, weight, *, out=None) + + Does a linear interpolation of two tensors :attr:`start` (given by :attr:`input`) and :attr:`end` based + on a scalar or tensor :attr:`weight` and returns the resulting :attr:`out` tensor. + + .. math:: + \text{out}_i = \text{start}_i + \text{weight}_i \times (\text{end}_i - \text{start}_i) + + The shapes of :attr:`start` and :attr:`end` must be + :ref:`broadcastable `. If :attr:`weight` is a tensor, then + the shapes of :attr:`weight`, :attr:`start`, and :attr:`end` must be :ref:`broadcastable `. + + Args: + input (Tensor): the tensor with the starting points + end (Tensor): the tensor with the ending points + weight (float or tensor): the weight for the interpolation formula + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> start = torch.arange(1., 5.) + >>> end = torch.empty(4).fill_(10) + >>> start + tensor([ 1., 2., 3., 4.]) + >>> end + tensor([ 10., 10., 10., 10.]) + >>> torch.lerp(start, end, 0.5) + tensor([ 5.5000, 6.0000, 6.5000, 7.0000]) + >>> torch.lerp(start, end, torch.full_like(start, 0.5)) + tensor([ 5.5000, 6.0000, 6.5000, 7.0000]) + """ + ... +@overload +def lerp(input: Tensor, end: Tensor, weight: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + lerp(input, end, weight, *, out=None) + + Does a linear interpolation of two tensors :attr:`start` (given by :attr:`input`) and :attr:`end` based + on a scalar or tensor :attr:`weight` and returns the resulting :attr:`out` tensor. + + .. math:: + \text{out}_i = \text{start}_i + \text{weight}_i \times (\text{end}_i - \text{start}_i) + + The shapes of :attr:`start` and :attr:`end` must be + :ref:`broadcastable `. If :attr:`weight` is a tensor, then + the shapes of :attr:`weight`, :attr:`start`, and :attr:`end` must be :ref:`broadcastable `. + + Args: + input (Tensor): the tensor with the starting points + end (Tensor): the tensor with the ending points + weight (float or tensor): the weight for the interpolation formula + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> start = torch.arange(1., 5.) + >>> end = torch.empty(4).fill_(10) + >>> start + tensor([ 1., 2., 3., 4.]) + >>> end + tensor([ 10., 10., 10., 10.]) + >>> torch.lerp(start, end, 0.5) + tensor([ 5.5000, 6.0000, 6.5000, 7.0000]) + >>> torch.lerp(start, end, torch.full_like(start, 0.5)) + tensor([ 5.5000, 6.0000, 6.5000, 7.0000]) + """ + ... +@overload +def less(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + less(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.lt`. + """ + ... +@overload +def less(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + less(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.lt`. + """ + ... +@overload +def less_equal(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + less_equal(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.le`. + """ + ... 
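+# Illustrative sketch for the lerp overloads documented above, contrasting a
+# scalar weight with a per-element tensor weight (values assumed for the example):
+#   >>> lo, hi = torch.zeros(4), torch.arange(4.)
+#   >>> torch.lerp(lo, hi, 0.25)
+#   tensor([0.0000, 0.2500, 0.5000, 0.7500])
+#   >>> torch.lerp(lo, hi, torch.tensor([0., 0.5, 0.5, 1.]))
+#   tensor([0.0000, 0.5000, 1.0000, 3.0000])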
+@overload +def less_equal(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + less_equal(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.le`. + """ + ... +def lgamma(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + lgamma(input, *, out=None) -> Tensor + + Computes the natural logarithm of the absolute value of the gamma function on :attr:`input`. + + .. math:: + \text{out}_{i} = \ln |\Gamma(\text{input}_{i})| + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.arange(0.5, 2, 0.5) + >>> torch.lgamma(a) + tensor([ 0.5724, 0.0000, -0.1208]) + """ + ... +@overload +def linspace(start: Number, end: Number, steps: Optional[_int] = None, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + linspace(start, end, steps, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly + spaced from :attr:`start` to :attr:`end`, inclusive. That is, the value are: + + .. math:: + (\text{start}, + \text{start} + \frac{\text{end} - \text{start}}{\text{steps} - 1}, + \ldots, + \text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{\text{steps} - 1}, + \text{end}) + + + From PyTorch 1.11 linspace requires the steps argument. Use steps=100 to restore the previous behavior. + + Args: + start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional + end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional + steps (int): size of the constructed tensor + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (torch.dtype, optional): the data type to perform the computation in. + Default: if None, uses the global default dtype (see torch.get_default_dtype()) + when both :attr:`start` and :attr:`end` are real, + and corresponding complex dtype when either is complex. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + + Example:: + + >>> torch.linspace(3, 10, steps=5) + tensor([ 3.0000, 4.7500, 6.5000, 8.2500, 10.0000]) + >>> torch.linspace(-10, 10, steps=5) + tensor([-10., -5., 0., 5., 10.]) + >>> torch.linspace(start=-10, end=10, steps=5) + tensor([-10., -5., 0., 5., 10.]) + >>> torch.linspace(start=-10, end=10, steps=1) + tensor([-10.]) + """ + ... 
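+# Illustrative check of the spacing formula in the linspace docstring above,
+# step = (end - start) / (steps - 1); the endpoints are assumed for the example:
+#   >>> t = torch.linspace(0., 1., steps=5)
+#   >>> t
+#   tensor([0.0000, 0.2500, 0.5000, 0.7500, 1.0000])
+#   >>> torch.allclose(t[1:] - t[:-1], torch.full((4,), 0.25))
+#   True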
+@overload +def linspace(start: Tensor, end: Tensor, steps: _int, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + linspace(start, end, steps, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly + spaced from :attr:`start` to :attr:`end`, inclusive. That is, the value are: + + .. math:: + (\text{start}, + \text{start} + \frac{\text{end} - \text{start}}{\text{steps} - 1}, + \ldots, + \text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{\text{steps} - 1}, + \text{end}) + + + From PyTorch 1.11 linspace requires the steps argument. Use steps=100 to restore the previous behavior. + + Args: + start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional + end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional + steps (int): size of the constructed tensor + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (torch.dtype, optional): the data type to perform the computation in. + Default: if None, uses the global default dtype (see torch.get_default_dtype()) + when both :attr:`start` and :attr:`end` are real, + and corresponding complex dtype when either is complex. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + + Example:: + + >>> torch.linspace(3, 10, steps=5) + tensor([ 3.0000, 4.7500, 6.5000, 8.2500, 10.0000]) + >>> torch.linspace(-10, 10, steps=5) + tensor([-10., -5., 0., 5., 10.]) + >>> torch.linspace(start=-10, end=10, steps=5) + tensor([-10., -5., 0., 5., 10.]) + >>> torch.linspace(start=-10, end=10, steps=1) + tensor([-10.]) + """ + ... +@overload +def linspace(start: Union[Number, _complex], end: Tensor, steps: _int, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + linspace(start, end, steps, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly + spaced from :attr:`start` to :attr:`end`, inclusive. That is, the value are: + + .. math:: + (\text{start}, + \text{start} + \frac{\text{end} - \text{start}}{\text{steps} - 1}, + \ldots, + \text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{\text{steps} - 1}, + \text{end}) + + + From PyTorch 1.11 linspace requires the steps argument. Use steps=100 to restore the previous behavior. + + Args: + start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional + end (float or Tensor): the ending value for the set of points. 
If `Tensor`, it must be 0-dimensional + steps (int): size of the constructed tensor + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (torch.dtype, optional): the data type to perform the computation in. + Default: if None, uses the global default dtype (see torch.get_default_dtype()) + when both :attr:`start` and :attr:`end` are real, + and corresponding complex dtype when either is complex. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + + Example:: + + >>> torch.linspace(3, 10, steps=5) + tensor([ 3.0000, 4.7500, 6.5000, 8.2500, 10.0000]) + >>> torch.linspace(-10, 10, steps=5) + tensor([-10., -5., 0., 5., 10.]) + >>> torch.linspace(start=-10, end=10, steps=5) + tensor([-10., -5., 0., 5., 10.]) + >>> torch.linspace(start=-10, end=10, steps=1) + tensor([-10.]) + """ + ... +@overload +def linspace(start: Tensor, end: Union[Number, _complex], steps: _int, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + linspace(start, end, steps, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly + spaced from :attr:`start` to :attr:`end`, inclusive. That is, the value are: + + .. math:: + (\text{start}, + \text{start} + \frac{\text{end} - \text{start}}{\text{steps} - 1}, + \ldots, + \text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{\text{steps} - 1}, + \text{end}) + + + From PyTorch 1.11 linspace requires the steps argument. Use steps=100 to restore the previous behavior. + + Args: + start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional + end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional + steps (int): size of the constructed tensor + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (torch.dtype, optional): the data type to perform the computation in. + Default: if None, uses the global default dtype (see torch.get_default_dtype()) + when both :attr:`start` and :attr:`end` are real, + and corresponding complex dtype when either is complex. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. 
+ + + Example:: + + >>> torch.linspace(3, 10, steps=5) + tensor([ 3.0000, 4.7500, 6.5000, 8.2500, 10.0000]) + >>> torch.linspace(-10, 10, steps=5) + tensor([-10., -5., 0., 5., 10.]) + >>> torch.linspace(start=-10, end=10, steps=5) + tensor([-10., -5., 0., 5., 10.]) + >>> torch.linspace(start=-10, end=10, steps=1) + tensor([-10.]) + """ + ... +@overload +def linspace(start: Union[Number, _complex], end: Union[Number, _complex], steps: _int, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + linspace(start, end, steps, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly + spaced from :attr:`start` to :attr:`end`, inclusive. That is, the value are: + + .. math:: + (\text{start}, + \text{start} + \frac{\text{end} - \text{start}}{\text{steps} - 1}, + \ldots, + \text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{\text{steps} - 1}, + \text{end}) + + + From PyTorch 1.11 linspace requires the steps argument. Use steps=100 to restore the previous behavior. + + Args: + start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional + end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional + steps (int): size of the constructed tensor + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (torch.dtype, optional): the data type to perform the computation in. + Default: if None, uses the global default dtype (see torch.get_default_dtype()) + when both :attr:`start` and :attr:`end` are real, + and corresponding complex dtype when either is complex. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + + Example:: + + >>> torch.linspace(3, 10, steps=5) + tensor([ 3.0000, 4.7500, 6.5000, 8.2500, 10.0000]) + >>> torch.linspace(-10, 10, steps=5) + tensor([-10., -5., 0., 5., 10.]) + >>> torch.linspace(start=-10, end=10, steps=5) + tensor([-10., -5., 0., 5., 10.]) + >>> torch.linspace(start=-10, end=10, steps=1) + tensor([-10.]) + """ + ... +def log(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + log(input, *, out=None) -> Tensor + + Returns a new tensor with the natural logarithm of the elements + of :attr:`input`. + + .. math:: + y_{i} = \log_{e} (x_{i}) + + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.rand(5) * 5 + >>> a + tensor([4.7767, 4.3234, 1.2156, 0.2411, 4.5739]) + >>> torch.log(a) + tensor([ 1.5637, 1.4640, 0.1952, -1.4226, 1.5204]) + """ + ... +def log10(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + log10(input, *, out=None) -> Tensor + + Returns a new tensor with the logarithm to the base 10 of the elements + of :attr:`input`. + + .. 
math:: + y_{i} = \log_{10} (x_{i}) + + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.rand(5) + >>> a + tensor([ 0.5224, 0.9354, 0.7257, 0.1301, 0.2251]) + + + >>> torch.log10(a) + tensor([-0.2820, -0.0290, -0.1392, -0.8857, -0.6476]) + """ + ... +def log10_(input: Tensor) -> Tensor: ... +def log1p(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + log1p(input, *, out=None) -> Tensor + + Returns a new tensor with the natural logarithm of (1 + :attr:`input`). + + .. math:: + y_i = \log_{e} (x_i + 1) + + .. note:: This function is more accurate than :func:`torch.log` for small + values of :attr:`input` + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(5) + >>> a + tensor([-1.0090, -0.9923, 1.0249, -0.5372, 0.2492]) + >>> torch.log1p(a) + tensor([ nan, -4.8653, 0.7055, -0.7705, 0.2225]) + """ + ... +def log1p_(input: Tensor) -> Tensor: ... +def log2(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + log2(input, *, out=None) -> Tensor + + Returns a new tensor with the logarithm to the base 2 of the elements + of :attr:`input`. + + .. math:: + y_{i} = \log_{2} (x_{i}) + + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.rand(5) + >>> a + tensor([ 0.8419, 0.8003, 0.9971, 0.5287, 0.0490]) + + + >>> torch.log2(a) + tensor([-0.2483, -0.3213, -0.0042, -0.9196, -4.3504]) + """ + ... +def log2_(input: Tensor) -> Tensor: ... +def log_(input: Tensor) -> Tensor: ... +@overload +def log_softmax(input: Tensor, dim: _int, dtype: Optional[_dtype] = None, *, out: Optional[Tensor] = None) -> Tensor: ... +@overload +def log_softmax(input: Tensor, dim: Union[str, ellipsis, None], *, dtype: Optional[_dtype] = None) -> Tensor: ... +def logaddexp(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + logaddexp(input, other, *, out=None) -> Tensor + + Logarithm of the sum of exponentiations of the inputs. + + Calculates pointwise :math:`\log\left(e^x + e^y\right)`. This function is useful + in statistics where the calculated probabilities of events may be so small as to + exceed the range of normal floating point numbers. In such cases the logarithm + of the calculated probability is stored. This function allows adding + probabilities stored in such a fashion. + + This op should be disambiguated with :func:`torch.logsumexp` which performs a + reduction on a single tensor. + + Args: + input (Tensor): the input tensor. + other (Tensor): the second input tensor + + Keyword arguments: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.logaddexp(torch.tensor([-1.0]), torch.tensor([-1.0, -2, -3])) + tensor([-0.3069, -0.6867, -0.8731]) + >>> torch.logaddexp(torch.tensor([-100.0, -200, -300]), torch.tensor([-1.0, -2, -3])) + tensor([-1., -2., -3.]) + >>> torch.logaddexp(torch.tensor([1.0, 2000, 30000]), torch.tensor([-1.0, -2, -3])) + tensor([1.1269e+00, 2.0000e+03, 3.0000e+04]) + """ + ... +def logaddexp2(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + logaddexp2(input, other, *, out=None) -> Tensor + + Logarithm of the sum of exponentiations of the inputs in base-2. + + Calculates pointwise :math:`\log_2\left(2^x + 2^y\right)`. See + :func:`torch.logaddexp` for more details. + + Args: + input (Tensor): the input tensor. 
+ other (Tensor): the second input tensor + + Keyword arguments: + out (Tensor, optional): the output tensor. + """ + ... +@overload +def logcumsumexp(input: Tensor, dim: _int, *, out: Optional[Tensor] = None) -> Tensor: + r""" + logcumsumexp(input, dim, *, out=None) -> Tensor + Returns the logarithm of the cumulative summation of the exponentiation of + elements of :attr:`input` in the dimension :attr:`dim`. + + For summation index :math:`j` given by `dim` and other indices :math:`i`, the result is + + .. math:: + \text{logcumsumexp}(x)_{ij} = \log \sum\limits_{j=0}^{i} \exp(x_{ij}) + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to do the operation over + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(10) + >>> torch.logcumsumexp(a, dim=0) + tensor([-0.42296738, -0.04462666, 0.86278635, 0.94622083, 1.05277811, + 1.39202815, 1.83525007, 1.84492621, 2.06084887, 2.06844475])) + """ + ... +@overload +def logcumsumexp(input: Tensor, dim: Union[str, ellipsis, None], *, out: Optional[Tensor] = None) -> Tensor: + r""" + logcumsumexp(input, dim, *, out=None) -> Tensor + Returns the logarithm of the cumulative summation of the exponentiation of + elements of :attr:`input` in the dimension :attr:`dim`. + + For summation index :math:`j` given by `dim` and other indices :math:`i`, the result is + + .. math:: + \text{logcumsumexp}(x)_{ij} = \log \sum\limits_{j=0}^{i} \exp(x_{ij}) + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to do the operation over + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(10) + >>> torch.logcumsumexp(a, dim=0) + tensor([-0.42296738, -0.04462666, 0.86278635, 0.94622083, 1.05277811, + 1.39202815, 1.83525007, 1.84492621, 2.06084887, 2.06844475])) + """ + ... +def logdet(input: Tensor) -> Tensor: + r""" + logdet(input) -> Tensor + + Calculates log determinant of a square matrix or batches of square matrices. + + It returns ``-inf`` if the input has a determinant of zero, and ``NaN`` if it has + a negative determinant. + + .. note:: + Backward through :meth:`logdet` internally uses SVD results when :attr:`input` + is not invertible. In this case, double backward through :meth:`logdet` will + be unstable in when :attr:`input` doesn't have distinct singular values. See + :func:`torch.linalg.svd` for details. + + .. seealso:: + + :func:`torch.linalg.slogdet` computes the sign (resp. angle) and natural logarithm of the + absolute value of the determinant of real-valued (resp. complex) square matrices. + + Arguments: + input (Tensor): the input tensor of size ``(*, n, n)`` where ``*`` is zero or more + batch dimensions. + + Example:: + + >>> A = torch.randn(3, 3) + >>> torch.det(A) + tensor(0.2611) + >>> torch.logdet(A) + tensor(-1.3430) + >>> A + tensor([[[ 0.9254, -0.6213], + [-0.5787, 1.6843]], + + [[ 0.3242, -0.9665], + [ 0.4539, -0.0887]], + + [[ 1.1336, -0.4025], + [-0.7089, 0.9032]]]) + >>> A.det() + tensor([1.1990, 0.4099, 0.7386]) + >>> A.det().log() + tensor([ 0.1815, -0.8917, -0.3031]) + """ + ... +def logical_and(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + logical_and(input, other, *, out=None) -> Tensor + + Computes the element-wise logical AND of the given input tensors. Zeros are treated as ``False`` and nonzeros are + treated as ``True``. + + Args: + input (Tensor): the input tensor. 
+ other (Tensor): the tensor to compute AND with + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.logical_and(torch.tensor([True, False, True]), torch.tensor([True, False, False])) + tensor([ True, False, False]) + >>> a = torch.tensor([0, 1, 10, 0], dtype=torch.int8) + >>> b = torch.tensor([4, 0, 1, 0], dtype=torch.int8) + >>> torch.logical_and(a, b) + tensor([False, False, True, False]) + >>> torch.logical_and(a.double(), b.double()) + tensor([False, False, True, False]) + >>> torch.logical_and(a.double(), b) + tensor([False, False, True, False]) + >>> torch.logical_and(a, b, out=torch.empty(4, dtype=torch.bool)) + tensor([False, False, True, False]) + """ + ... +def logical_not(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + logical_not(input, *, out=None) -> Tensor + + Computes the element-wise logical NOT of the given input tensor. If not specified, the output tensor will have the bool + dtype. If the input tensor is not a bool tensor, zeros are treated as ``False`` and non-zeros are treated as ``True``. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.logical_not(torch.tensor([True, False])) + tensor([False, True]) + >>> torch.logical_not(torch.tensor([0, 1, -10], dtype=torch.int8)) + tensor([ True, False, False]) + >>> torch.logical_not(torch.tensor([0., 1.5, -10.], dtype=torch.double)) + tensor([ True, False, False]) + >>> torch.logical_not(torch.tensor([0., 1., -10.], dtype=torch.double), out=torch.empty(3, dtype=torch.int16)) + tensor([1, 0, 0], dtype=torch.int16) + """ + ... +def logical_or(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + logical_or(input, other, *, out=None) -> Tensor + + Computes the element-wise logical OR of the given input tensors. Zeros are treated as ``False`` and nonzeros are + treated as ``True``. + + Args: + input (Tensor): the input tensor. + other (Tensor): the tensor to compute OR with + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.logical_or(torch.tensor([True, False, True]), torch.tensor([True, False, False])) + tensor([ True, False, True]) + >>> a = torch.tensor([0, 1, 10, 0], dtype=torch.int8) + >>> b = torch.tensor([4, 0, 1, 0], dtype=torch.int8) + >>> torch.logical_or(a, b) + tensor([ True, True, True, False]) + >>> torch.logical_or(a.double(), b.double()) + tensor([ True, True, True, False]) + >>> torch.logical_or(a.double(), b) + tensor([ True, True, True, False]) + >>> torch.logical_or(a, b, out=torch.empty(4, dtype=torch.bool)) + tensor([ True, True, True, False]) + """ + ... +def logical_xor(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + logical_xor(input, other, *, out=None) -> Tensor + + Computes the element-wise logical XOR of the given input tensors. Zeros are treated as ``False`` and nonzeros are + treated as ``True``. + + Args: + input (Tensor): the input tensor. + other (Tensor): the tensor to compute XOR with + + Keyword args: + out (Tensor, optional): the output tensor. 
+ + Example:: + + >>> torch.logical_xor(torch.tensor([True, False, True]), torch.tensor([True, False, False])) + tensor([False, False, True]) + >>> a = torch.tensor([0, 1, 10, 0], dtype=torch.int8) + >>> b = torch.tensor([4, 0, 1, 0], dtype=torch.int8) + >>> torch.logical_xor(a, b) + tensor([ True, True, False, False]) + >>> torch.logical_xor(a.double(), b.double()) + tensor([ True, True, False, False]) + >>> torch.logical_xor(a.double(), b) + tensor([ True, True, False, False]) + >>> torch.logical_xor(a, b, out=torch.empty(4, dtype=torch.bool)) + tensor([ True, True, False, False]) + """ + ... +def logit(input: Tensor, eps: Optional[_float] = None, *, out: Optional[Tensor] = None) -> Tensor: + r""" + logit(input, eps=None, *, out=None) -> Tensor + + Alias for :func:`torch.special.logit`. + """ + ... +def logit_(input: Tensor, eps: Optional[_float] = None) -> Tensor: ... +@overload +def logspace(start: Number, end: Number, steps: Optional[_int] = None, base: _float = 10.0, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + logspace(start, end, steps, base=10.0, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + + Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly + spaced from :math:`{{\text{{base}}}}^{{\text{{start}}}}` to + :math:`{{\text{{base}}}}^{{\text{{end}}}}`, inclusive, on a logarithmic scale + with base :attr:`base`. That is, the values are: + + .. math:: + (\text{base}^{\text{start}}, + \text{base}^{(\text{start} + \frac{\text{end} - \text{start}}{ \text{steps} - 1})}, + \ldots, + \text{base}^{(\text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{ \text{steps} - 1})}, + \text{base}^{\text{end}}) + + + + From PyTorch 1.11 logspace requires the steps argument. Use steps=100 to restore the previous behavior. + + Args: + start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional + end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional + steps (int): size of the constructed tensor + base (float, optional): base of the logarithm function. Default: ``10.0``. + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (torch.dtype, optional): the data type to perform the computation in. + Default: if None, uses the global default dtype (see torch.get_default_dtype()) + when both :attr:`start` and :attr:`end` are real, + and corresponding complex dtype when either is complex. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. 
+ + Example:: + + >>> torch.logspace(start=-10, end=10, steps=5) + tensor([ 1.0000e-10, 1.0000e-05, 1.0000e+00, 1.0000e+05, 1.0000e+10]) + >>> torch.logspace(start=0.1, end=1.0, steps=5) + tensor([ 1.2589, 2.1135, 3.5481, 5.9566, 10.0000]) + >>> torch.logspace(start=0.1, end=1.0, steps=1) + tensor([1.2589]) + >>> torch.logspace(start=2, end=2, steps=1, base=2) + tensor([4.0]) + """ + ... +@overload +def logspace(start: Tensor, end: Tensor, steps: _int, base: _float = 10.0, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + logspace(start, end, steps, base=10.0, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + + Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly + spaced from :math:`{{\text{{base}}}}^{{\text{{start}}}}` to + :math:`{{\text{{base}}}}^{{\text{{end}}}}`, inclusive, on a logarithmic scale + with base :attr:`base`. That is, the values are: + + .. math:: + (\text{base}^{\text{start}}, + \text{base}^{(\text{start} + \frac{\text{end} - \text{start}}{ \text{steps} - 1})}, + \ldots, + \text{base}^{(\text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{ \text{steps} - 1})}, + \text{base}^{\text{end}}) + + + + From PyTorch 1.11 logspace requires the steps argument. Use steps=100 to restore the previous behavior. + + Args: + start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional + end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional + steps (int): size of the constructed tensor + base (float, optional): base of the logarithm function. Default: ``10.0``. + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (torch.dtype, optional): the data type to perform the computation in. + Default: if None, uses the global default dtype (see torch.get_default_dtype()) + when both :attr:`start` and :attr:`end` are real, + and corresponding complex dtype when either is complex. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.logspace(start=-10, end=10, steps=5) + tensor([ 1.0000e-10, 1.0000e-05, 1.0000e+00, 1.0000e+05, 1.0000e+10]) + >>> torch.logspace(start=0.1, end=1.0, steps=5) + tensor([ 1.2589, 2.1135, 3.5481, 5.9566, 10.0000]) + >>> torch.logspace(start=0.1, end=1.0, steps=1) + tensor([1.2589]) + >>> torch.logspace(start=2, end=2, steps=1, base=2) + tensor([4.0]) + """ + ... 
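+# A minimal usage sketch for the linspace/logspace overloads above (illustrative only,
+# not emitted by the stub generator; printed values are approximate). Per the formula in
+# the docstrings, logspace(a, b, n, base) equals base ** linspace(a, b, n):
+#
+#     >>> import torch
+#     >>> torch.logspace(0.0, 3.0, steps=4)            # 10 ** linspace(0, 3, 4)
+#     tensor([   1.,   10.,  100., 1000.])
+#     >>> torch.logspace(0.0, 3.0, steps=4, base=2.0)  # 2 ** linspace(0, 3, 4)
+#     tensor([1., 2., 4., 8.])
+#     >>> torch.allclose(torch.logspace(0.0, 3.0, 4), 10 ** torch.linspace(0.0, 3.0, 4))
+#     True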
+@overload +def logspace(start: Union[Number, _complex], end: Tensor, steps: _int, base: _float = 10.0, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + logspace(start, end, steps, base=10.0, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + + Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly + spaced from :math:`{{\text{{base}}}}^{{\text{{start}}}}` to + :math:`{{\text{{base}}}}^{{\text{{end}}}}`, inclusive, on a logarithmic scale + with base :attr:`base`. That is, the values are: + + .. math:: + (\text{base}^{\text{start}}, + \text{base}^{(\text{start} + \frac{\text{end} - \text{start}}{ \text{steps} - 1})}, + \ldots, + \text{base}^{(\text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{ \text{steps} - 1})}, + \text{base}^{\text{end}}) + + + + From PyTorch 1.11 logspace requires the steps argument. Use steps=100 to restore the previous behavior. + + Args: + start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional + end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional + steps (int): size of the constructed tensor + base (float, optional): base of the logarithm function. Default: ``10.0``. + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (torch.dtype, optional): the data type to perform the computation in. + Default: if None, uses the global default dtype (see torch.get_default_dtype()) + when both :attr:`start` and :attr:`end` are real, + and corresponding complex dtype when either is complex. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.logspace(start=-10, end=10, steps=5) + tensor([ 1.0000e-10, 1.0000e-05, 1.0000e+00, 1.0000e+05, 1.0000e+10]) + >>> torch.logspace(start=0.1, end=1.0, steps=5) + tensor([ 1.2589, 2.1135, 3.5481, 5.9566, 10.0000]) + >>> torch.logspace(start=0.1, end=1.0, steps=1) + tensor([1.2589]) + >>> torch.logspace(start=2, end=2, steps=1, base=2) + tensor([4.0]) + """ + ... +@overload +def logspace(start: Tensor, end: Union[Number, _complex], steps: _int, base: _float = 10.0, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + logspace(start, end, steps, base=10.0, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + + Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly + spaced from :math:`{{\text{{base}}}}^{{\text{{start}}}}` to + :math:`{{\text{{base}}}}^{{\text{{end}}}}`, inclusive, on a logarithmic scale + with base :attr:`base`. That is, the values are: + + .. 
math:: + (\text{base}^{\text{start}}, + \text{base}^{(\text{start} + \frac{\text{end} - \text{start}}{ \text{steps} - 1})}, + \ldots, + \text{base}^{(\text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{ \text{steps} - 1})}, + \text{base}^{\text{end}}) + + + + From PyTorch 1.11 logspace requires the steps argument. Use steps=100 to restore the previous behavior. + + Args: + start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional + end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional + steps (int): size of the constructed tensor + base (float, optional): base of the logarithm function. Default: ``10.0``. + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (torch.dtype, optional): the data type to perform the computation in. + Default: if None, uses the global default dtype (see torch.get_default_dtype()) + when both :attr:`start` and :attr:`end` are real, + and corresponding complex dtype when either is complex. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.logspace(start=-10, end=10, steps=5) + tensor([ 1.0000e-10, 1.0000e-05, 1.0000e+00, 1.0000e+05, 1.0000e+10]) + >>> torch.logspace(start=0.1, end=1.0, steps=5) + tensor([ 1.2589, 2.1135, 3.5481, 5.9566, 10.0000]) + >>> torch.logspace(start=0.1, end=1.0, steps=1) + tensor([1.2589]) + >>> torch.logspace(start=2, end=2, steps=1, base=2) + tensor([4.0]) + """ + ... +@overload +def logspace(start: Union[Number, _complex], end: Union[Number, _complex], steps: _int, base: _float = 10.0, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + logspace(start, end, steps, base=10.0, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + + Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly + spaced from :math:`{{\text{{base}}}}^{{\text{{start}}}}` to + :math:`{{\text{{base}}}}^{{\text{{end}}}}`, inclusive, on a logarithmic scale + with base :attr:`base`. That is, the values are: + + .. math:: + (\text{base}^{\text{start}}, + \text{base}^{(\text{start} + \frac{\text{end} - \text{start}}{ \text{steps} - 1})}, + \ldots, + \text{base}^{(\text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{ \text{steps} - 1})}, + \text{base}^{\text{end}}) + + + + From PyTorch 1.11 logspace requires the steps argument. Use steps=100 to restore the previous behavior. + + Args: + start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional + end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional + steps (int): size of the constructed tensor + base (float, optional): base of the logarithm function. Default: ``10.0``. 
+ + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (torch.dtype, optional): the data type to perform the computation in. + Default: if None, uses the global default dtype (see torch.get_default_dtype()) + when both :attr:`start` and :attr:`end` are real, + and corresponding complex dtype when either is complex. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.logspace(start=-10, end=10, steps=5) + tensor([ 1.0000e-10, 1.0000e-05, 1.0000e+00, 1.0000e+05, 1.0000e+10]) + >>> torch.logspace(start=0.1, end=1.0, steps=5) + tensor([ 1.2589, 2.1135, 3.5481, 5.9566, 10.0000]) + >>> torch.logspace(start=0.1, end=1.0, steps=1) + tensor([1.2589]) + >>> torch.logspace(start=2, end=2, steps=1, base=2) + tensor([4.0]) + """ + ... +@overload +def logsumexp(input: Tensor, dim: Union[_int, _size], keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + logsumexp(input, dim, keepdim=False, *, out=None) + + Returns the log of summed exponentials of each row of the :attr:`input` + tensor in the given dimension :attr:`dim`. The computation is numerically + stabilized. + + For summation index :math:`j` given by `dim` and other indices :math:`i`, the result is + + .. math:: + \text{logsumexp}(x)_{i} = \log \sum_j \exp(x_{ij}) + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(3, 3) + >>> torch.logsumexp(a, 1) + tensor([1.4907, 1.0593, 1.5696]) + >>> torch.dist(torch.logsumexp(a, 1), torch.log(torch.sum(torch.exp(a), 1))) + tensor(1.6859e-07) + """ + ... +@overload +def logsumexp(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + logsumexp(input, dim, keepdim=False, *, out=None) + + Returns the log of summed exponentials of each row of the :attr:`input` + tensor in the given dimension :attr:`dim`. The computation is numerically + stabilized. + + For summation index :math:`j` given by `dim` and other indices :math:`i`, the result is + + .. math:: + \text{logsumexp}(x)_{i} = \log \sum_j \exp(x_{ij}) + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. 
+ + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(3, 3) + >>> torch.logsumexp(a, 1) + tensor([1.4907, 1.0593, 1.5696]) + >>> torch.dist(torch.logsumexp(a, 1), torch.log(torch.sum(torch.exp(a), 1))) + tensor(1.6859e-07) + """ + ... +@overload +def lstm(data: Tensor, batch_sizes: Tensor, hx: Union[Tuple[Tensor, ...], List[Tensor]], params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool) -> Tuple[Tensor, Tensor, Tensor]: ... +@overload +def lstm(input: Tensor, hx: Union[Tuple[Tensor, ...], List[Tensor]], params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool, batch_first: _bool) -> Tuple[Tensor, Tensor, Tensor]: ... +def lstm_cell(input: Tensor, hx: Union[Tuple[Tensor, ...], List[Tensor]], w_ih: Tensor, w_hh: Tensor, b_ih: Optional[Tensor] = None, b_hh: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]: ... +@overload +def lt(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + lt(input, other, *, out=None) -> Tensor + + Computes :math:`\text{input} < \text{other}` element-wise. + + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is less than :attr:`other` and False elsewhere + + Example:: + + >>> torch.lt(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[False, False], [True, False]]) + """ + ... +@overload +def lt(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + lt(input, other, *, out=None) -> Tensor + + Computes :math:`\text{input} < \text{other}` element-wise. + + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is less than :attr:`other` and False elsewhere + + Example:: + + >>> torch.lt(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[False, False], [True, False]]) + """ + ... +def lu_solve(input: Tensor, LU_data: Tensor, LU_pivots: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + lu_solve(b, LU_data, LU_pivots, *, out=None) -> Tensor + + Returns the LU solve of the linear system :math:`Ax = b` using the partially pivoted + LU factorization of A from :func:`~linalg.lu_factor`. + + This function supports ``float``, ``double``, ``cfloat`` and ``cdouble`` dtypes for :attr:`input`. + + .. warning:: + + :func:`torch.lu_solve` is deprecated in favor of :func:`torch.linalg.lu_solve`. + :func:`torch.lu_solve` will be removed in a future PyTorch release. + ``X = torch.lu_solve(B, LU, pivots)`` should be replaced with + + .. 
code:: python + + X = linalg.lu_solve(LU, pivots, B) + + Arguments: + b (Tensor): the RHS tensor of size :math:`(*, m, k)`, where :math:`*` + is zero or more batch dimensions. + LU_data (Tensor): the pivoted LU factorization of A from :meth:`~linalg.lu_factor` of size :math:`(*, m, m)`, + where :math:`*` is zero or more batch dimensions. + LU_pivots (IntTensor): the pivots of the LU factorization from :meth:`~linalg.lu_factor` of size :math:`(*, m)`, + where :math:`*` is zero or more batch dimensions. + The batch dimensions of :attr:`LU_pivots` must be equal to the batch dimensions of + :attr:`LU_data`. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> A = torch.randn(2, 3, 3) + >>> b = torch.randn(2, 3, 1) + >>> LU, pivots = torch.linalg.lu_factor(A) + >>> x = torch.lu_solve(b, LU, pivots) + >>> torch.dist(A @ x, b) + tensor(1.00000e-07 * + 2.8312) + """ + ... +def lu_unpack(LU_data: Tensor, LU_pivots: Tensor, unpack_data: _bool = True, unpack_pivots: _bool = True, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.lu_unpack: + r""" + lu_unpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True, *, out=None) -> (Tensor, Tensor, Tensor) + + Unpacks the LU decomposition returned by :func:`~linalg.lu_factor` into the `P, L, U` matrices. + + .. seealso:: + + :func:`~linalg.lu` returns the matrices from the LU decomposition. Its gradient formula is more efficient + than that of doing :func:`~linalg.lu_factor` followed by :func:`~linalg.lu_unpack`. + + Args: + LU_data (Tensor): the packed LU factorization data + LU_pivots (Tensor): the packed LU factorization pivots + unpack_data (bool): flag indicating if the data should be unpacked. + If ``False``, then the returned ``L`` and ``U`` are empty tensors. + Default: ``True`` + unpack_pivots (bool): flag indicating if the pivots should be unpacked into a permutation matrix ``P``. + If ``False``, then the returned ``P`` is an empty tensor. + Default: ``True`` + + Keyword args: + out (tuple, optional): output tuple of three tensors. Ignored if `None`. + + Returns: + A namedtuple ``(P, L, U)`` + + Examples:: + + >>> A = torch.randn(2, 3, 3) + >>> LU, pivots = torch.linalg.lu_factor(A) + >>> P, L, U = torch.lu_unpack(LU, pivots) + >>> # We can recover A from the factorization + >>> A_ = P @ L @ U + >>> torch.allclose(A, A_) + True + + >>> # LU factorization of a rectangular matrix: + >>> A = torch.randn(2, 3, 2) + >>> LU, pivots = torch.linalg.lu_factor(A) + >>> P, L, U = torch.lu_unpack(LU, pivots) + >>> # P, L, U are the same as returned by linalg.lu + >>> P_, L_, U_ = torch.linalg.lu(A) + >>> torch.allclose(P, P_) and torch.allclose(L, L_) and torch.allclose(U, U_) + True + """ + ... +def margin_ranking_loss(input1: Tensor, input2: Tensor, target: Tensor, margin: _float = 0.0, reduction: _int = 1) -> Tensor: ... +@overload +def masked_fill(input: Tensor, mask: Tensor, value: Tensor) -> Tensor: ... +@overload +def masked_fill(input: Tensor, mask: Tensor, value: Union[Number, _complex]) -> Tensor: ... +def masked_scatter(input: Tensor, mask: Tensor, source: Tensor) -> Tensor: ... +def masked_select(input: Tensor, mask: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + masked_select(input, mask, *, out=None) -> Tensor + + Returns a new 1-D tensor which indexes the :attr:`input` tensor according to + the boolean mask :attr:`mask` which is a `BoolTensor`. 
+ + The shapes of the :attr:`mask` tensor and the :attr:`input` tensor don't need + to match, but they must be :ref:`broadcastable `. + + .. note:: The returned tensor does **not** use the same storage + as the original tensor + + Args: + input (Tensor): the input tensor. + mask (BoolTensor): the tensor containing the binary mask to index with + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> x = torch.randn(3, 4) + >>> x + tensor([[ 0.3552, -2.3825, -0.8297, 0.3477], + [-1.2035, 1.2252, 0.5002, 0.6248], + [ 0.1307, -2.0608, 0.1244, 2.0139]]) + >>> mask = x.ge(0.5) + >>> mask + tensor([[False, False, False, False], + [False, True, True, True], + [False, False, False, True]]) + >>> torch.masked_select(x, mask) + tensor([ 1.2252, 0.5002, 0.6248, 2.0139]) + """ + ... +def matmul(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + matmul(input, other, *, out=None) -> Tensor + + Matrix product of two tensors. + + The behavior depends on the dimensionality of the tensors as follows: + + - If both tensors are 1-dimensional, the dot product (scalar) is returned. + - If both arguments are 2-dimensional, the matrix-matrix product is returned. + - If the first argument is 1-dimensional and the second argument is 2-dimensional, + a 1 is prepended to its dimension for the purpose of the matrix multiply. + After the matrix multiply, the prepended dimension is removed. + - If the first argument is 2-dimensional and the second argument is 1-dimensional, + the matrix-vector product is returned. + - If both arguments are at least 1-dimensional and at least one argument is + N-dimensional (where N > 2), then a batched matrix multiply is returned. If the first + argument is 1-dimensional, a 1 is prepended to its dimension for the purpose of the + batched matrix multiply and removed after. If the second argument is 1-dimensional, a + 1 is appended to its dimension for the purpose of the batched matrix multiple and removed after. + The non-matrix (i.e. batch) dimensions are :ref:`broadcasted ` (and thus + must be broadcastable). For example, if :attr:`input` is a + :math:`(j \times 1 \times n \times n)` tensor and :attr:`other` is a :math:`(k \times n \times n)` + tensor, :attr:`out` will be a :math:`(j \times k \times n \times n)` tensor. + + Note that the broadcasting logic only looks at the batch dimensions when determining if the inputs + are broadcastable, and not the matrix dimensions. For example, if :attr:`input` is a + :math:`(j \times 1 \times n \times m)` tensor and :attr:`other` is a :math:`(k \times m \times p)` + tensor, these inputs are valid for broadcasting even though the final two dimensions (i.e. the + matrix dimensions) are different. :attr:`out` will be a :math:`(j \times k \times n \times p)` tensor. + + This operation has support for arguments with :ref:`sparse layouts`. In particular the + matrix-matrix (both arguments 2-dimensional) supports sparse arguments with the same restrictions + as :func:`torch.mm` + + + .. warning:: + Sparse support is a beta feature and some layout(s)/dtype/device combinations may not be supported, + or may not have autograd support. If you notice missing functionality please + open a feature request. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + .. note:: + + The 1-dimensional dot product version of this function does not support an :attr:`out` parameter. 
+ + Arguments: + input (Tensor): the first tensor to be multiplied + other (Tensor): the second tensor to be multiplied + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> # vector x vector + >>> tensor1 = torch.randn(3) + >>> tensor2 = torch.randn(3) + >>> torch.matmul(tensor1, tensor2).size() + torch.Size([]) + >>> # matrix x vector + >>> tensor1 = torch.randn(3, 4) + >>> tensor2 = torch.randn(4) + >>> torch.matmul(tensor1, tensor2).size() + torch.Size([3]) + >>> # batched matrix x broadcasted vector + >>> tensor1 = torch.randn(10, 3, 4) + >>> tensor2 = torch.randn(4) + >>> torch.matmul(tensor1, tensor2).size() + torch.Size([10, 3]) + >>> # batched matrix x batched matrix + >>> tensor1 = torch.randn(10, 3, 4) + >>> tensor2 = torch.randn(10, 4, 5) + >>> torch.matmul(tensor1, tensor2).size() + torch.Size([10, 3, 5]) + >>> # batched matrix x broadcasted matrix + >>> tensor1 = torch.randn(10, 3, 4) + >>> tensor2 = torch.randn(4, 5) + >>> torch.matmul(tensor1, tensor2).size() + torch.Size([10, 3, 5]) + """ + ... +def matrix_exp(input: Tensor) -> Tensor: + r""" + matrix_exp(A) -> Tensor + + Alias for :func:`torch.linalg.matrix_exp`. + """ + ... +def matrix_power(input: Tensor, n: _int, *, out: Optional[Tensor] = None) -> Tensor: + r""" + matrix_power(input, n, *, out=None) -> Tensor + + Alias for :func:`torch.linalg.matrix_power` + """ + ... +@overload +def max(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + max(input) -> Tensor + + Returns the maximum value of all elements in the ``input`` tensor. + + .. warning:: + This function produces deterministic (sub)gradients unlike ``max(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.6763, 0.7445, -2.2369]]) + >>> torch.max(a) + tensor(0.7445) + + .. function:: max(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` is the maximum + value of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`. And ``indices`` is the index location of each maximum value found + (argmax). + + If ``keepdim`` is ``True``, the output tensors are of the same size + as ``input`` except in the dimension ``dim`` where they are of size 1. + Otherwise, ``dim`` is squeezed (see :func:`torch.squeeze`), resulting + in the output tensors having 1 fewer dimension than ``input``. + + .. note:: If there are multiple maximal values in a reduced row then + the indices of the first maximal value are returned. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. Default: ``False``. + + Keyword args: + out (tuple, optional): the result tuple of two output tensors (max, max_indices) + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-1.2360, -0.2942, -0.1222, 0.8475], + [ 1.1949, -1.1127, -2.2379, -0.6702], + [ 1.5717, -0.9207, 0.1297, -1.8768], + [-0.6172, 1.0036, -0.6060, -0.2432]]) + >>> torch.max(a, 1) + torch.return_types.max(values=tensor([0.8475, 1.1949, 1.5717, 1.0036]), indices=tensor([3, 0, 0, 1])) + + .. function:: max(input, other, *, out=None) -> Tensor + :noindex: + + See :func:`torch.maximum`. + """ + ... +@overload +def max(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + max(input) -> Tensor + + Returns the maximum value of all elements in the ``input`` tensor. + + .. 
warning:: + This function produces deterministic (sub)gradients unlike ``max(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.6763, 0.7445, -2.2369]]) + >>> torch.max(a) + tensor(0.7445) + + .. function:: max(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` is the maximum + value of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`. And ``indices`` is the index location of each maximum value found + (argmax). + + If ``keepdim`` is ``True``, the output tensors are of the same size + as ``input`` except in the dimension ``dim`` where they are of size 1. + Otherwise, ``dim`` is squeezed (see :func:`torch.squeeze`), resulting + in the output tensors having 1 fewer dimension than ``input``. + + .. note:: If there are multiple maximal values in a reduced row then + the indices of the first maximal value are returned. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. Default: ``False``. + + Keyword args: + out (tuple, optional): the result tuple of two output tensors (max, max_indices) + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-1.2360, -0.2942, -0.1222, 0.8475], + [ 1.1949, -1.1127, -2.2379, -0.6702], + [ 1.5717, -0.9207, 0.1297, -1.8768], + [-0.6172, 1.0036, -0.6060, -0.2432]]) + >>> torch.max(a, 1) + torch.return_types.max(values=tensor([0.8475, 1.1949, 1.5717, 1.0036]), indices=tensor([3, 0, 0, 1])) + + .. function:: max(input, other, *, out=None) -> Tensor + :noindex: + + See :func:`torch.maximum`. + """ + ... +@overload +def max(input: Tensor, dim: _int, keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.max: + r""" + max(input) -> Tensor + + Returns the maximum value of all elements in the ``input`` tensor. + + .. warning:: + This function produces deterministic (sub)gradients unlike ``max(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.6763, 0.7445, -2.2369]]) + >>> torch.max(a) + tensor(0.7445) + + .. function:: max(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` is the maximum + value of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`. And ``indices`` is the index location of each maximum value found + (argmax). + + If ``keepdim`` is ``True``, the output tensors are of the same size + as ``input`` except in the dimension ``dim`` where they are of size 1. + Otherwise, ``dim`` is squeezed (see :func:`torch.squeeze`), resulting + in the output tensors having 1 fewer dimension than ``input``. + + .. note:: If there are multiple maximal values in a reduced row then + the indices of the first maximal value are returned. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. Default: ``False``. 
+ + Keyword args: + out (tuple, optional): the result tuple of two output tensors (max, max_indices) + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-1.2360, -0.2942, -0.1222, 0.8475], + [ 1.1949, -1.1127, -2.2379, -0.6702], + [ 1.5717, -0.9207, 0.1297, -1.8768], + [-0.6172, 1.0036, -0.6060, -0.2432]]) + >>> torch.max(a, 1) + torch.return_types.max(values=tensor([0.8475, 1.1949, 1.5717, 1.0036]), indices=tensor([3, 0, 0, 1])) + + .. function:: max(input, other, *, out=None) -> Tensor + :noindex: + + See :func:`torch.maximum`. + """ + ... +@overload +def max(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.max: + r""" + max(input) -> Tensor + + Returns the maximum value of all elements in the ``input`` tensor. + + .. warning:: + This function produces deterministic (sub)gradients unlike ``max(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.6763, 0.7445, -2.2369]]) + >>> torch.max(a) + tensor(0.7445) + + .. function:: max(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` is the maximum + value of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`. And ``indices`` is the index location of each maximum value found + (argmax). + + If ``keepdim`` is ``True``, the output tensors are of the same size + as ``input`` except in the dimension ``dim`` where they are of size 1. + Otherwise, ``dim`` is squeezed (see :func:`torch.squeeze`), resulting + in the output tensors having 1 fewer dimension than ``input``. + + .. note:: If there are multiple maximal values in a reduced row then + the indices of the first maximal value are returned. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. Default: ``False``. + + Keyword args: + out (tuple, optional): the result tuple of two output tensors (max, max_indices) + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-1.2360, -0.2942, -0.1222, 0.8475], + [ 1.1949, -1.1127, -2.2379, -0.6702], + [ 1.5717, -0.9207, 0.1297, -1.8768], + [-0.6172, 1.0036, -0.6060, -0.2432]]) + >>> torch.max(a, 1) + torch.return_types.max(values=tensor([0.8475, 1.1949, 1.5717, 1.0036]), indices=tensor([3, 0, 0, 1])) + + .. function:: max(input, other, *, out=None) -> Tensor + :noindex: + + See :func:`torch.maximum`. + """ + ... +def max_pool1d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tensor: ... +def max_pool1d_with_indices(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tuple[Tensor, Tensor]: ... +def max_pool2d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tensor: ... +def max_pool3d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tensor: ... 
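+# A minimal sketch of how the max/max_pool stubs above are typically called
+# (illustrative only, not emitted by the stub generator; results depend on the
+# random input and are shown schematically):
+#
+#     >>> import torch
+#     >>> x = torch.randn(4, 4)
+#     >>> torch.max(x)                      # first overload: reduce all elements to a 0-d tensor
+#     >>> torch.max(x, dim=1)               # returns torch.return_types.max(values, indices)
+#     >>> torch.max(x, torch.zeros(4, 4))   # element-wise form, equivalent to torch.maximum
+#     >>> torch.max_pool1d(torch.randn(1, 1, 16), kernel_size=2, stride=2).shape
+#     torch.Size([1, 1, 8])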
+def maximum(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + maximum(input, other, *, out=None) -> Tensor + + Computes the element-wise maximum of :attr:`input` and :attr:`other`. + + .. note:: + If one of the elements being compared is a NaN, then that element is returned. + :func:`maximum` is not supported for tensors with complex dtypes. + + Args: + input (Tensor): the input tensor. + other (Tensor): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor((1, 2, -1)) + >>> b = torch.tensor((3, 0, 4)) + >>> torch.maximum(a, b) + tensor([3, 2, 4]) + """ + ... +@overload +def mean(input: Tensor, *, dtype: Optional[_dtype] = None) -> Tensor: + r""" + mean(input, *, dtype=None) -> Tensor + + Returns the mean value of all elements in the :attr:`input` tensor. Input must be floating point or complex. + + Args: + input (Tensor): + the input tensor, either of floating point or complex dtype + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.2294, -0.5481, 1.3288]]) + >>> torch.mean(a) + tensor(0.3367) + + .. function:: mean(input, dim, keepdim=False, *, dtype=None, out=None) -> Tensor + :noindex: + + Returns the mean value of each row of the :attr:`input` tensor in the given + dimension :attr:`dim`. If :attr:`dim` is a list of dimensions, + reduce over all of them. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + out (Tensor, optional): the output tensor. + + .. seealso:: + + :func:`torch.nanmean` computes the mean value of `non-NaN` elements. + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-0.3841, 0.6320, 0.4254, -0.7384], + [-0.9644, 1.0131, -0.6549, -1.4279], + [-0.2951, -1.3350, -0.7694, 0.5600], + [ 1.0842, -0.9580, 0.3623, 0.2343]]) + >>> torch.mean(a, 1) + tensor([-0.0163, -0.5085, -0.4599, 0.1807]) + >>> torch.mean(a, 1, True) + tensor([[-0.0163], + [-0.5085], + [-0.4599], + [ 0.1807]]) + """ + ... +@overload +def mean(input: Tensor, dim: Optional[Union[_int, _size]], keepdim: _bool = False, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + mean(input, *, dtype=None) -> Tensor + + Returns the mean value of all elements in the :attr:`input` tensor. Input must be floating point or complex. + + Args: + input (Tensor): + the input tensor, either of floating point or complex dtype + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. 
+ If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.2294, -0.5481, 1.3288]]) + >>> torch.mean(a) + tensor(0.3367) + + .. function:: mean(input, dim, keepdim=False, *, dtype=None, out=None) -> Tensor + :noindex: + + Returns the mean value of each row of the :attr:`input` tensor in the given + dimension :attr:`dim`. If :attr:`dim` is a list of dimensions, + reduce over all of them. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + out (Tensor, optional): the output tensor. + + .. seealso:: + + :func:`torch.nanmean` computes the mean value of `non-NaN` elements. + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-0.3841, 0.6320, 0.4254, -0.7384], + [-0.9644, 1.0131, -0.6549, -1.4279], + [-0.2951, -1.3350, -0.7694, 0.5600], + [ 1.0842, -0.9580, 0.3623, 0.2343]]) + >>> torch.mean(a, 1) + tensor([-0.0163, -0.5085, -0.4599, 0.1807]) + >>> torch.mean(a, 1, True) + tensor([[-0.0163], + [-0.5085], + [-0.4599], + [ 0.1807]]) + """ + ... +@overload +def mean(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], keepdim: _bool = False, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + mean(input, *, dtype=None) -> Tensor + + Returns the mean value of all elements in the :attr:`input` tensor. Input must be floating point or complex. + + Args: + input (Tensor): + the input tensor, either of floating point or complex dtype + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.2294, -0.5481, 1.3288]]) + >>> torch.mean(a) + tensor(0.3367) + + .. function:: mean(input, dim, keepdim=False, *, dtype=None, out=None) -> Tensor + :noindex: + + Returns the mean value of each row of the :attr:`input` tensor in the given + dimension :attr:`dim`. If :attr:`dim` is a list of dimensions, + reduce over all of them. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. 
+ If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + out (Tensor, optional): the output tensor. + + .. seealso:: + + :func:`torch.nanmean` computes the mean value of `non-NaN` elements. + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-0.3841, 0.6320, 0.4254, -0.7384], + [-0.9644, 1.0131, -0.6549, -1.4279], + [-0.2951, -1.3350, -0.7694, 0.5600], + [ 1.0842, -0.9580, 0.3623, 0.2343]]) + >>> torch.mean(a, 1) + tensor([-0.0163, -0.5085, -0.4599, 0.1807]) + >>> torch.mean(a, 1, True) + tensor([[-0.0163], + [-0.5085], + [-0.4599], + [ 0.1807]]) + """ + ... +@overload +def median(input: Tensor) -> Tensor: + r""" + median(input) -> Tensor + + Returns the median of the values in :attr:`input`. + + .. note:: + The median is not unique for :attr:`input` tensors with an even number + of elements. In this case the lower of the two medians is returned. To + compute the mean of both medians, use :func:`torch.quantile` with ``q=0.5`` instead. + + .. warning:: + This function produces deterministic (sub)gradients unlike ``median(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 1.5219, -1.5212, 0.2202]]) + >>> torch.median(a) + tensor(0.2202) + + .. function:: median(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` contains the median of each row of :attr:`input` + in the dimension :attr:`dim`, and ``indices`` contains the index of the median values found in the dimension :attr:`dim`. + + By default, :attr:`dim` is the last dimension of the :attr:`input` tensor. + + If :attr:`keepdim` is ``True``, the output tensors are of the same size + as :attr:`input` except in the dimension :attr:`dim` where they are of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in + the outputs tensor having 1 fewer dimension than :attr:`input`. + + .. note:: + The median is not unique for :attr:`input` tensors with an even number + of elements in the dimension :attr:`dim`. In this case the lower of the + two medians is returned. To compute the mean of both medians in + :attr:`input`, use :func:`torch.quantile` with ``q=0.5`` instead. + + .. warning:: + ``indices`` does not necessarily contain the first occurrence of each + median value found, unless it is unique. + The exact implementation details are device-specific. + Do not expect the same result when run on CPU and GPU in general. + For the same reason do not expect the gradients to be deterministic. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out ((Tensor, Tensor), optional): The first tensor will be populated with the median values and the second + tensor, which must have dtype long, with their indices in the dimension + :attr:`dim` of :attr:`input`. + + Example:: + + >>> a = torch.randn(4, 5) + >>> a + tensor([[ 0.2505, -0.3982, -0.9948, 0.3518, -1.3131], + [ 0.3180, -0.6993, 1.0436, 0.0438, 0.2270], + [-0.2751, 0.7303, 0.2192, 0.3321, 0.2488], + [ 1.0778, -1.9510, 0.7048, 0.4742, -0.7125]]) + >>> torch.median(a, 1) + torch.return_types.median(values=tensor([-0.3982, 0.2270, 0.2488, 0.4742]), indices=tensor([1, 4, 4, 3])) + """ + ... 
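+# A minimal sketch of the "lower median" behaviour described in the docstring above
+# for inputs with an even number of elements (illustrative only, not emitted by the
+# stub generator):
+#
+#     >>> import torch
+#     >>> t = torch.tensor([1., 2., 3., 4.])
+#     >>> torch.median(t)          # lower of the two middle values
+#     tensor(2.)
+#     >>> torch.quantile(t, 0.5)   # mean of the two middle values, as the note suggests
+#     tensor(2.5000)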
+@overload +def median(input: Tensor, dim: _int, keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.median: + r""" + median(input) -> Tensor + + Returns the median of the values in :attr:`input`. + + .. note:: + The median is not unique for :attr:`input` tensors with an even number + of elements. In this case the lower of the two medians is returned. To + compute the mean of both medians, use :func:`torch.quantile` with ``q=0.5`` instead. + + .. warning:: + This function produces deterministic (sub)gradients unlike ``median(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 1.5219, -1.5212, 0.2202]]) + >>> torch.median(a) + tensor(0.2202) + + .. function:: median(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` contains the median of each row of :attr:`input` + in the dimension :attr:`dim`, and ``indices`` contains the index of the median values found in the dimension :attr:`dim`. + + By default, :attr:`dim` is the last dimension of the :attr:`input` tensor. + + If :attr:`keepdim` is ``True``, the output tensors are of the same size + as :attr:`input` except in the dimension :attr:`dim` where they are of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in + the outputs tensor having 1 fewer dimension than :attr:`input`. + + .. note:: + The median is not unique for :attr:`input` tensors with an even number + of elements in the dimension :attr:`dim`. In this case the lower of the + two medians is returned. To compute the mean of both medians in + :attr:`input`, use :func:`torch.quantile` with ``q=0.5`` instead. + + .. warning:: + ``indices`` does not necessarily contain the first occurrence of each + median value found, unless it is unique. + The exact implementation details are device-specific. + Do not expect the same result when run on CPU and GPU in general. + For the same reason do not expect the gradients to be deterministic. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out ((Tensor, Tensor), optional): The first tensor will be populated with the median values and the second + tensor, which must have dtype long, with their indices in the dimension + :attr:`dim` of :attr:`input`. + + Example:: + + >>> a = torch.randn(4, 5) + >>> a + tensor([[ 0.2505, -0.3982, -0.9948, 0.3518, -1.3131], + [ 0.3180, -0.6993, 1.0436, 0.0438, 0.2270], + [-0.2751, 0.7303, 0.2192, 0.3321, 0.2488], + [ 1.0778, -1.9510, 0.7048, 0.4742, -0.7125]]) + >>> torch.median(a, 1) + torch.return_types.median(values=tensor([-0.3982, 0.2270, 0.2488, 0.4742]), indices=tensor([1, 4, 4, 3])) + """ + ... +@overload +def median(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.median: + r""" + median(input) -> Tensor + + Returns the median of the values in :attr:`input`. + + .. note:: + The median is not unique for :attr:`input` tensors with an even number + of elements. In this case the lower of the two medians is returned. To + compute the mean of both medians, use :func:`torch.quantile` with ``q=0.5`` instead. + + .. 
warning:: + This function produces deterministic (sub)gradients unlike ``median(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 1.5219, -1.5212, 0.2202]]) + >>> torch.median(a) + tensor(0.2202) + + .. function:: median(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` contains the median of each row of :attr:`input` + in the dimension :attr:`dim`, and ``indices`` contains the index of the median values found in the dimension :attr:`dim`. + + By default, :attr:`dim` is the last dimension of the :attr:`input` tensor. + + If :attr:`keepdim` is ``True``, the output tensors are of the same size + as :attr:`input` except in the dimension :attr:`dim` where they are of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in + the outputs tensor having 1 fewer dimension than :attr:`input`. + + .. note:: + The median is not unique for :attr:`input` tensors with an even number + of elements in the dimension :attr:`dim`. In this case the lower of the + two medians is returned. To compute the mean of both medians in + :attr:`input`, use :func:`torch.quantile` with ``q=0.5`` instead. + + .. warning:: + ``indices`` does not necessarily contain the first occurrence of each + median value found, unless it is unique. + The exact implementation details are device-specific. + Do not expect the same result when run on CPU and GPU in general. + For the same reason do not expect the gradients to be deterministic. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out ((Tensor, Tensor), optional): The first tensor will be populated with the median values and the second + tensor, which must have dtype long, with their indices in the dimension + :attr:`dim` of :attr:`input`. + + Example:: + + >>> a = torch.randn(4, 5) + >>> a + tensor([[ 0.2505, -0.3982, -0.9948, 0.3518, -1.3131], + [ 0.3180, -0.6993, 1.0436, 0.0438, 0.2270], + [-0.2751, 0.7303, 0.2192, 0.3321, 0.2488], + [ 1.0778, -1.9510, 0.7048, 0.4742, -0.7125]]) + >>> torch.median(a, 1) + torch.return_types.median(values=tensor([-0.3982, 0.2270, 0.2488, 0.4742]), indices=tensor([1, 4, 4, 3])) + """ + ... +@overload +def min(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + min(input) -> Tensor + + Returns the minimum value of all elements in the :attr:`input` tensor. + + .. warning:: + This function produces deterministic (sub)gradients unlike ``min(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.6750, 1.0857, 1.7197]]) + >>> torch.min(a) + tensor(0.6750) + + .. function:: min(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` is the minimum + value of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`. And ``indices`` is the index location of each minimum value found + (argmin). + + If :attr:`keepdim` is ``True``, the output tensors are of the same size as + :attr:`input` except in the dimension :attr:`dim` where they are of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in + the output tensors having 1 fewer dimension than :attr:`input`. + + .. 
note:: If there are multiple minimal values in a reduced row then + the indices of the first minimal value are returned. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (tuple, optional): the tuple of two output tensors (min, min_indices) + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-0.6248, 1.1334, -1.1899, -0.2803], + [-1.4644, -0.2635, -0.3651, 0.6134], + [ 0.2457, 0.0384, 1.0128, 0.7015], + [-0.1153, 2.9849, 2.1458, 0.5788]]) + >>> torch.min(a, 1) + torch.return_types.min(values=tensor([-1.1899, -1.4644, 0.0384, -0.1153]), indices=tensor([2, 0, 1, 0])) + + .. function:: min(input, other, *, out=None) -> Tensor + :noindex: + + See :func:`torch.minimum`. + """ + ... +@overload +def min(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + min(input) -> Tensor + + Returns the minimum value of all elements in the :attr:`input` tensor. + + .. warning:: + This function produces deterministic (sub)gradients unlike ``min(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.6750, 1.0857, 1.7197]]) + >>> torch.min(a) + tensor(0.6750) + + .. function:: min(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` is the minimum + value of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`. And ``indices`` is the index location of each minimum value found + (argmin). + + If :attr:`keepdim` is ``True``, the output tensors are of the same size as + :attr:`input` except in the dimension :attr:`dim` where they are of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in + the output tensors having 1 fewer dimension than :attr:`input`. + + .. note:: If there are multiple minimal values in a reduced row then + the indices of the first minimal value are returned. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (tuple, optional): the tuple of two output tensors (min, min_indices) + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-0.6248, 1.1334, -1.1899, -0.2803], + [-1.4644, -0.2635, -0.3651, 0.6134], + [ 0.2457, 0.0384, 1.0128, 0.7015], + [-0.1153, 2.9849, 2.1458, 0.5788]]) + >>> torch.min(a, 1) + torch.return_types.min(values=tensor([-1.1899, -1.4644, 0.0384, -0.1153]), indices=tensor([2, 0, 1, 0])) + + .. function:: min(input, other, *, out=None) -> Tensor + :noindex: + + See :func:`torch.minimum`. + """ + ... +@overload +def min(input: Tensor, dim: _int, keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.min: + r""" + min(input) -> Tensor + + Returns the minimum value of all elements in the :attr:`input` tensor. + + .. warning:: + This function produces deterministic (sub)gradients unlike ``min(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.6750, 1.0857, 1.7197]]) + >>> torch.min(a) + tensor(0.6750) + + .. 
function:: min(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` is the minimum + value of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`. And ``indices`` is the index location of each minimum value found + (argmin). + + If :attr:`keepdim` is ``True``, the output tensors are of the same size as + :attr:`input` except in the dimension :attr:`dim` where they are of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in + the output tensors having 1 fewer dimension than :attr:`input`. + + .. note:: If there are multiple minimal values in a reduced row then + the indices of the first minimal value are returned. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (tuple, optional): the tuple of two output tensors (min, min_indices) + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-0.6248, 1.1334, -1.1899, -0.2803], + [-1.4644, -0.2635, -0.3651, 0.6134], + [ 0.2457, 0.0384, 1.0128, 0.7015], + [-0.1153, 2.9849, 2.1458, 0.5788]]) + >>> torch.min(a, 1) + torch.return_types.min(values=tensor([-1.1899, -1.4644, 0.0384, -0.1153]), indices=tensor([2, 0, 1, 0])) + + .. function:: min(input, other, *, out=None) -> Tensor + :noindex: + + See :func:`torch.minimum`. + """ + ... +@overload +def min(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.min: + r""" + min(input) -> Tensor + + Returns the minimum value of all elements in the :attr:`input` tensor. + + .. warning:: + This function produces deterministic (sub)gradients unlike ``min(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.6750, 1.0857, 1.7197]]) + >>> torch.min(a) + tensor(0.6750) + + .. function:: min(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` is the minimum + value of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`. And ``indices`` is the index location of each minimum value found + (argmin). + + If :attr:`keepdim` is ``True``, the output tensors are of the same size as + :attr:`input` except in the dimension :attr:`dim` where they are of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in + the output tensors having 1 fewer dimension than :attr:`input`. + + .. note:: If there are multiple minimal values in a reduced row then + the indices of the first minimal value are returned. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (tuple, optional): the tuple of two output tensors (min, min_indices) + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-0.6248, 1.1334, -1.1899, -0.2803], + [-1.4644, -0.2635, -0.3651, 0.6134], + [ 0.2457, 0.0384, 1.0128, 0.7015], + [-0.1153, 2.9849, 2.1458, 0.5788]]) + >>> torch.min(a, 1) + torch.return_types.min(values=tensor([-1.1899, -1.4644, 0.0384, -0.1153]), indices=tensor([2, 0, 1, 0])) + + .. function:: min(input, other, *, out=None) -> Tensor + :noindex: + + See :func:`torch.minimum`. + """ + ... 
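+# A short sketch of the ``min`` overloads above, using values without ties so
+# the returned indices are unambiguous (illustrative comments only):
+#
+#   import torch
+#   x = torch.tensor([[1., 5.], [4., 2.]])
+#   torch.min(x)                       # tensor(1.)
+#   result = torch.min(x, dim=1)
+#   result.values                      # tensor([1., 2.])
+#   result.indices                     # tensor([0, 1])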
+def minimum(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + minimum(input, other, *, out=None) -> Tensor + + Computes the element-wise minimum of :attr:`input` and :attr:`other`. + + .. note:: + If one of the elements being compared is a NaN, then that element is returned. + :func:`minimum` is not supported for tensors with complex dtypes. + + Args: + input (Tensor): the input tensor. + other (Tensor): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor((1, 2, -1)) + >>> b = torch.tensor((3, 0, 4)) + >>> torch.minimum(a, b) + tensor([1, 0, -1]) + """ + ... +def miopen_batch_norm(input: Tensor, weight: Tensor, bias: Optional[Tensor], running_mean: Optional[Tensor], running_var: Optional[Tensor], training: _bool, exponential_average_factor: _float, epsilon: _float) -> Tuple[Tensor, Tensor, Tensor]: ... +def miopen_convolution(input: Tensor, weight: Tensor, bias: Optional[Tensor], padding: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt], benchmark: _bool, deterministic: _bool) -> Tensor: ... +def miopen_convolution_add_relu(input: Tensor, weight: Tensor, z: Tensor, alpha: Optional[Union[Number, _complex]], bias: Optional[Tensor], stride: Sequence[Union[_int, SymInt]], padding: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ... +def miopen_convolution_relu(input: Tensor, weight: Tensor, bias: Optional[Tensor], stride: Sequence[Union[_int, SymInt]], padding: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ... +def miopen_convolution_transpose(input: Tensor, weight: Tensor, bias: Optional[Tensor], padding: Sequence[Union[_int, SymInt]], output_padding: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt], benchmark: _bool, deterministic: _bool) -> Tensor: ... +def miopen_depthwise_convolution(input: Tensor, weight: Tensor, bias: Optional[Tensor], padding: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt], benchmark: _bool, deterministic: _bool) -> Tensor: ... +def miopen_rnn(input: Tensor, weight: Union[Tuple[Tensor, ...], List[Tensor]], weight_stride0: _int, hx: Tensor, cx: Optional[Tensor], mode: _int, hidden_size: _int, num_layers: _int, batch_first: _bool, dropout: _float, train: _bool, bidirectional: _bool, batch_sizes: _size, dropout_state: Optional[Tensor]) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]: ... +def mkldnn_adaptive_avg_pool2d(input: Tensor, output_size: Union[_int, _size], *, out: Optional[Tensor] = None) -> Tensor: ... +def mkldnn_convolution(input: Tensor, weight: Tensor, bias: Optional[Tensor], padding: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ... +def mkldnn_linear_backward_weights(grad_output: Tensor, input: Tensor, weight: Tensor, bias_defined: _bool) -> Tuple[Tensor, Tensor]: ... +def mkldnn_max_pool2d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tensor: ... 
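+# ``minimum`` (declared above) is element-wise and, as its docstring notes,
+# returns NaN whenever either compared element is NaN. A deterministic sketch:
+#
+#   import torch
+#   a = torch.tensor([1., float('nan'), 3.])
+#   b = torch.tensor([2., 0., float('nan')])
+#   torch.minimum(a, b)                # tensor([1., nan, nan])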
+def mkldnn_max_pool3d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tensor: ... +def mkldnn_rnn_layer(input: Tensor, weight0: Tensor, weight1: Tensor, weight2: Tensor, weight3: Tensor, hx_: Tensor, cx_: Tensor, reverse: _bool, batch_sizes: _size, mode: _int, hidden_size: _int, num_layers: _int, has_biases: _bool, bidirectional: _bool, batch_first: _bool, train: _bool) -> Tuple[Tensor, Tensor, Tensor, Tensor]: ... +def mm(input: Tensor, mat2: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + mm(input, mat2, *, out=None) -> Tensor + + Performs a matrix multiplication of the matrices :attr:`input` and :attr:`mat2`. + + If :attr:`input` is a :math:`(n \times m)` tensor, :attr:`mat2` is a + :math:`(m \times p)` tensor, :attr:`out` will be a :math:`(n \times p)` tensor. + + .. note:: This function does not :ref:`broadcast `. + For broadcasting matrix products, see :func:`torch.matmul`. + + Supports strided and sparse 2-D tensors as inputs, autograd with + respect to strided inputs. + + This operation has support for arguments with :ref:`sparse layouts`. + If :attr:`out` is provided it's layout will be used. Otherwise, the result + layout will be deduced from that of :attr:`input`. + + + .. warning:: + Sparse support is a beta feature and some layout(s)/dtype/device combinations may not be supported, + or may not have autograd support. If you notice missing functionality please + open a feature request. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + input (Tensor): the first matrix to be matrix multiplied + mat2 (Tensor): the second matrix to be matrix multiplied + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> mat1 = torch.randn(2, 3) + >>> mat2 = torch.randn(3, 3) + >>> torch.mm(mat1, mat2) + tensor([[ 0.4851, 0.5037, -0.3633], + [-0.0760, -3.6705, 2.4784]]) + """ + ... +@overload +def mode(input: Tensor, dim: _int = -1, keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.mode: + r""" + mode(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor) + + Returns a namedtuple ``(values, indices)`` where ``values`` is the mode + value of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`, i.e. a value which appears most often + in that row, and ``indices`` is the index location of each mode value found. + + By default, :attr:`dim` is the last dimension of the :attr:`input` tensor. + + If :attr:`keepdim` is ``True``, the output tensors are of the same size as + :attr:`input` except in the dimension :attr:`dim` where they are of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting + in the output tensors having 1 fewer dimension than :attr:`input`. + + .. note:: This function is not defined for ``torch.cuda.Tensor`` yet. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. 
+ + Keyword args: + out (tuple, optional): the result tuple of two output tensors (values, indices) + + Example:: + + >>> b = torch.tensor( + [[0, 0, 0, 2, 0, 0, 2], + [0, 3, 0, 0, 2, 0, 1], + [2, 2, 2, 0, 0, 0, 3], + [2, 2, 3, 0, 1, 1, 0], + [1, 1, 0, 0, 2, 0, 2]]) + >>> torch.mode(b, 0) + torch.return_types.mode( + values=tensor([0, 2, 0, 0, 0, 0, 2]), + indices=tensor([1, 3, 4, 4, 2, 4, 4])) + """ + ... +@overload +def mode(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.mode: + r""" + mode(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor) + + Returns a namedtuple ``(values, indices)`` where ``values`` is the mode + value of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`, i.e. a value which appears most often + in that row, and ``indices`` is the index location of each mode value found. + + By default, :attr:`dim` is the last dimension of the :attr:`input` tensor. + + If :attr:`keepdim` is ``True``, the output tensors are of the same size as + :attr:`input` except in the dimension :attr:`dim` where they are of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting + in the output tensors having 1 fewer dimension than :attr:`input`. + + .. note:: This function is not defined for ``torch.cuda.Tensor`` yet. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (tuple, optional): the result tuple of two output tensors (values, indices) + + Example:: + + >>> b = torch.tensor( + [[0, 0, 0, 2, 0, 0, 2], + [0, 3, 0, 0, 2, 0, 1], + [2, 2, 2, 0, 0, 0, 3], + [2, 2, 3, 0, 1, 1, 0], + [1, 1, 0, 0, 2, 0, 2]]) + >>> torch.mode(b, 0) + torch.return_types.mode( + values=tensor([0, 2, 0, 0, 0, 0, 2]), + indices=tensor([1, 3, 4, 4, 2, 4, 4])) + """ + ... +@overload +def moveaxis(input: Tensor, source: _int, destination: _int) -> Tensor: + r""" + moveaxis(input, source, destination) -> Tensor + + Alias for :func:`torch.movedim`. + + This function is equivalent to NumPy's moveaxis function. + + Examples:: + + >>> t = torch.randn(3,2,1) + >>> t + tensor([[[-0.3362], + [-0.8437]], + + [[-0.9627], + [ 0.1727]], + + [[ 0.5173], + [-0.1398]]]) + >>> torch.moveaxis(t, 1, 0).shape + torch.Size([2, 3, 1]) + >>> torch.moveaxis(t, 1, 0) + tensor([[[-0.3362], + [-0.9627], + [ 0.5173]], + + [[-0.8437], + [ 0.1727], + [-0.1398]]]) + >>> torch.moveaxis(t, (1, 2), (0, 1)).shape + torch.Size([2, 1, 3]) + >>> torch.moveaxis(t, (1, 2), (0, 1)) + tensor([[[-0.3362, -0.9627, 0.5173]], + + [[-0.8437, 0.1727, -0.1398]]]) + """ + ... +@overload +def moveaxis(input: Tensor, source: _size, destination: _size) -> Tensor: + r""" + moveaxis(input, source, destination) -> Tensor + + Alias for :func:`torch.movedim`. + + This function is equivalent to NumPy's moveaxis function. + + Examples:: + + >>> t = torch.randn(3,2,1) + >>> t + tensor([[[-0.3362], + [-0.8437]], + + [[-0.9627], + [ 0.1727]], + + [[ 0.5173], + [-0.1398]]]) + >>> torch.moveaxis(t, 1, 0).shape + torch.Size([2, 3, 1]) + >>> torch.moveaxis(t, 1, 0) + tensor([[[-0.3362], + [-0.9627], + [ 0.5173]], + + [[-0.8437], + [ 0.1727], + [-0.1398]]]) + >>> torch.moveaxis(t, (1, 2), (0, 1)).shape + torch.Size([2, 1, 3]) + >>> torch.moveaxis(t, (1, 2), (0, 1)) + tensor([[[-0.3362, -0.9627, 0.5173]], + + [[-0.8437, 0.1727, -0.1398]]]) + """ + ... 
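+# ``moveaxis`` is an alias of ``movedim`` (declared next); both only permute
+# dimensions, so only the shape changes. A quick shape-level sketch:
+#
+#   import torch
+#   t = torch.zeros(3, 2, 1)
+#   torch.moveaxis(t, 1, 0).shape             # torch.Size([2, 3, 1])
+#   torch.movedim(t, (1, 2), (0, 1)).shape    # torch.Size([2, 1, 3])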
+@overload +def movedim(input: Tensor, source: _int, destination: _int) -> Tensor: + r""" + movedim(input, source, destination) -> Tensor + + Moves the dimension(s) of :attr:`input` at the position(s) in :attr:`source` + to the position(s) in :attr:`destination`. + + Other dimensions of :attr:`input` that are not explicitly moved remain in + their original order and appear at the positions not specified in :attr:`destination`. + + Args: + input (Tensor): the input tensor. + source (int or tuple of ints): Original positions of the dims to move. These must be unique. + destination (int or tuple of ints): Destination positions for each of the original dims. These must also be unique. + + Examples:: + + >>> t = torch.randn(3,2,1) + >>> t + tensor([[[-0.3362], + [-0.8437]], + + [[-0.9627], + [ 0.1727]], + + [[ 0.5173], + [-0.1398]]]) + >>> torch.movedim(t, 1, 0).shape + torch.Size([2, 3, 1]) + >>> torch.movedim(t, 1, 0) + tensor([[[-0.3362], + [-0.9627], + [ 0.5173]], + + [[-0.8437], + [ 0.1727], + [-0.1398]]]) + >>> torch.movedim(t, (1, 2), (0, 1)).shape + torch.Size([2, 1, 3]) + >>> torch.movedim(t, (1, 2), (0, 1)) + tensor([[[-0.3362, -0.9627, 0.5173]], + + [[-0.8437, 0.1727, -0.1398]]]) + """ + ... +@overload +def movedim(input: Tensor, source: _size, destination: _size) -> Tensor: + r""" + movedim(input, source, destination) -> Tensor + + Moves the dimension(s) of :attr:`input` at the position(s) in :attr:`source` + to the position(s) in :attr:`destination`. + + Other dimensions of :attr:`input` that are not explicitly moved remain in + their original order and appear at the positions not specified in :attr:`destination`. + + Args: + input (Tensor): the input tensor. + source (int or tuple of ints): Original positions of the dims to move. These must be unique. + destination (int or tuple of ints): Destination positions for each of the original dims. These must also be unique. + + Examples:: + + >>> t = torch.randn(3,2,1) + >>> t + tensor([[[-0.3362], + [-0.8437]], + + [[-0.9627], + [ 0.1727]], + + [[ 0.5173], + [-0.1398]]]) + >>> torch.movedim(t, 1, 0).shape + torch.Size([2, 3, 1]) + >>> torch.movedim(t, 1, 0) + tensor([[[-0.3362], + [-0.9627], + [ 0.5173]], + + [[-0.8437], + [ 0.1727], + [-0.1398]]]) + >>> torch.movedim(t, (1, 2), (0, 1)).shape + torch.Size([2, 1, 3]) + >>> torch.movedim(t, (1, 2), (0, 1)) + tensor([[[-0.3362, -0.9627, 0.5173]], + + [[-0.8437, 0.1727, -0.1398]]]) + """ + ... +def msort(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + msort(input, *, out=None) -> Tensor + + Sorts the elements of the :attr:`input` tensor along its first dimension + in ascending order by value. + + .. note:: `torch.msort(t)` is equivalent to `torch.sort(t, dim=0)[0]`. + See also :func:`torch.sort`. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.randn(3, 4) + >>> t + tensor([[-0.1321, 0.4370, -1.2631, -1.1289], + [-2.0527, -1.1250, 0.2275, 0.3077], + [-0.0881, -0.1259, -0.5495, 1.0284]]) + >>> torch.msort(t) + tensor([[-2.0527, -1.1250, -1.2631, -1.1289], + [-0.1321, -0.1259, -0.5495, 0.3077], + [-0.0881, 0.4370, 0.2275, 1.0284]]) + """ + ... +def mul(input: Union[Tensor, Number, _complex], other: Union[Tensor, Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + mul(input, other, *, out=None) -> Tensor + + Multiplies :attr:`input` by :attr:`other`. + + + .. 
math:: + \text{out}_i = \text{input}_i \times \text{other}_i + + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer, float, and complex inputs. + + Args: + input (Tensor): the input tensor. + other (Tensor or Number) - the tensor or number to multiply input by. + + Keyword args: + out (Tensor, optional): the output tensor. + + Examples:: + + >>> a = torch.randn(3) + >>> a + tensor([ 0.2015, -0.4255, 2.6087]) + >>> torch.mul(a, 100) + tensor([ 20.1494, -42.5491, 260.8663]) + + >>> b = torch.randn(4, 1) + >>> b + tensor([[ 1.1207], + [-0.3137], + [ 0.0700], + [ 0.8378]]) + >>> c = torch.randn(1, 4) + >>> c + tensor([[ 0.5146, 0.1216, -0.5244, 2.2382]]) + >>> torch.mul(b, c) + tensor([[ 0.5767, 0.1363, -0.5877, 2.5083], + [-0.1614, -0.0382, 0.1645, -0.7021], + [ 0.0360, 0.0085, -0.0367, 0.1567], + [ 0.4312, 0.1019, -0.4394, 1.8753]]) + """ + ... +def multinomial(input: Tensor, num_samples: _int, replacement: _bool = False, *, generator: Optional[Generator] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + multinomial(input, num_samples, replacement=False, *, generator=None, out=None) -> LongTensor + + Returns a tensor where each row contains :attr:`num_samples` indices sampled + from the multinomial (a stricter definition would be multivariate, + refer to torch.distributions.multinomial.Multinomial for more details) + probability distribution located in the corresponding row + of tensor :attr:`input`. + + .. note:: + The rows of :attr:`input` do not need to sum to one (in which case we use + the values as weights), but must be non-negative, finite and have + a non-zero sum. + + Indices are ordered from left to right according to when each was sampled + (first samples are placed in first column). + + If :attr:`input` is a vector, :attr:`out` is a vector of size :attr:`num_samples`. + + If :attr:`input` is a matrix with `m` rows, :attr:`out` is an matrix of shape + :math:`(m \times \text{num\_samples})`. + + If replacement is ``True``, samples are drawn with replacement. + + If not, they are drawn without replacement, which means that when a + sample index is drawn for a row, it cannot be drawn again for that row. + + .. note:: + When drawn without replacement, :attr:`num_samples` must be lower than + number of non-zero elements in :attr:`input` (or the min number of non-zero + elements in each row of :attr:`input` if it is a matrix). + + Args: + input (Tensor): the input tensor containing probabilities + num_samples (int): number of samples to draw + replacement (bool, optional): whether to draw with replacement or not + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + + Example:: + + >>> weights = torch.tensor([0, 10, 3, 0], dtype=torch.float) # create a tensor of weights + >>> torch.multinomial(weights, 2) + tensor([1, 2]) + >>> torch.multinomial(weights, 4) # ERROR! + RuntimeError: invalid argument 2: invalid multinomial distribution (with replacement=False, + not enough non-negative category to sample) at ../aten/src/TH/generic/THTensorRandom.cpp:320 + >>> torch.multinomial(weights, 4, replacement=True) + tensor([ 2, 1, 1, 1]) + """ + ... +@overload +def multiply(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + multiply(input, other, *, out=None) + + Alias for :func:`torch.mul`. + """ + ... 
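+# ``multiply`` is an alias for ``mul``, which broadcasts its arguments to a
+# common shape. A deterministic sketch of the broadcasting behaviour:
+#
+#   import torch
+#   col = torch.tensor([[1.], [2.]])          # shape (2, 1)
+#   row = torch.tensor([10., 20., 30.])       # shape (3,)
+#   torch.mul(col, row)                       # shape (2, 3):
+#   # tensor([[10., 20., 30.],
+#   #         [20., 40., 60.]])
+#   torch.multiply(col, 2)                    # tensor([[2.], [4.]])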
+@overload +def multiply(input: Tensor, other: Union[Number, _complex]) -> Tensor: + r""" + multiply(input, other, *, out=None) + + Alias for :func:`torch.mul`. + """ + ... +def mv(input: Tensor, vec: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + mv(input, vec, *, out=None) -> Tensor + + Performs a matrix-vector product of the matrix :attr:`input` and the vector + :attr:`vec`. + + If :attr:`input` is a :math:`(n \times m)` tensor, :attr:`vec` is a 1-D tensor of + size :math:`m`, :attr:`out` will be 1-D of size :math:`n`. + + .. note:: This function does not :ref:`broadcast `. + + Args: + input (Tensor): matrix to be multiplied + vec (Tensor): vector to be multiplied + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> mat = torch.randn(2, 3) + >>> vec = torch.randn(3) + >>> torch.mv(mat, vec) + tensor([ 1.0404, -0.6361]) + """ + ... +def mvlgamma(input: Tensor, p: _int, *, out: Optional[Tensor] = None) -> Tensor: + r""" + mvlgamma(input, p, *, out=None) -> Tensor + + Alias for :func:`torch.special.multigammaln`. + """ + ... +def nan_to_num(input: Tensor, nan: Optional[_float] = None, posinf: Optional[_float] = None, neginf: Optional[_float] = None, *, out: Optional[Tensor] = None) -> Tensor: + r""" + nan_to_num(input, nan=0.0, posinf=None, neginf=None, *, out=None) -> Tensor + + Replaces :literal:`NaN`, positive infinity, and negative infinity values in :attr:`input` + with the values specified by :attr:`nan`, :attr:`posinf`, and :attr:`neginf`, respectively. + By default, :literal:`NaN`\ s are replaced with zero, positive infinity is replaced with the + greatest finite value representable by :attr:`input`'s dtype, and negative infinity + is replaced with the least finite value representable by :attr:`input`'s dtype. + + Args: + input (Tensor): the input tensor. + nan (Number, optional): the value to replace :literal:`NaN`\s with. Default is zero. + posinf (Number, optional): if a Number, the value to replace positive infinity values with. + If None, positive infinity values are replaced with the greatest finite value representable by :attr:`input`'s dtype. + Default is None. + neginf (Number, optional): if a Number, the value to replace negative infinity values with. + If None, negative infinity values are replaced with the lowest finite value representable by :attr:`input`'s dtype. + Default is None. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> x = torch.tensor([float('nan'), float('inf'), -float('inf'), 3.14]) + >>> torch.nan_to_num(x) + tensor([ 0.0000e+00, 3.4028e+38, -3.4028e+38, 3.1400e+00]) + >>> torch.nan_to_num(x, nan=2.0) + tensor([ 2.0000e+00, 3.4028e+38, -3.4028e+38, 3.1400e+00]) + >>> torch.nan_to_num(x, nan=2.0, posinf=1.0) + tensor([ 2.0000e+00, 1.0000e+00, -3.4028e+38, 3.1400e+00]) + """ + ... +def nan_to_num_(input: Tensor, nan: Optional[_float] = None, posinf: Optional[_float] = None, neginf: Optional[_float] = None) -> Tensor: ... +def nanmean(input: Tensor, dim: Optional[Union[_int, _size]] = None, keepdim: _bool = False, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + nanmean(input, dim=None, keepdim=False, *, dtype=None, out=None) -> Tensor + + Computes the mean of all `non-NaN` elements along the specified dimensions. + + This function is identical to :func:`torch.mean` when there are no `NaN` values + in the :attr:`input` tensor. 
In the presence of `NaN`, :func:`torch.mean` will + propagate the `NaN` to the output whereas :func:`torch.nanmean` will ignore the + `NaN` values (`torch.nanmean(a)` is equivalent to `torch.mean(a[~a.isnan()])`). + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + out (Tensor, optional): the output tensor. + + .. seealso:: + + :func:`torch.mean` computes the mean value, propagating `NaN`. + + Example:: + + >>> x = torch.tensor([[torch.nan, 1, 2], [1, 2, 3]]) + >>> x.mean() + tensor(nan) + >>> x.nanmean() + tensor(1.8000) + >>> x.mean(dim=0) + tensor([ nan, 1.5000, 2.5000]) + >>> x.nanmean(dim=0) + tensor([1.0000, 1.5000, 2.5000]) + + # If all elements in the reduced dimensions are NaN then the result is NaN + >>> torch.tensor([torch.nan]).nanmean() + tensor(nan) + """ + ... +@overload +def nanmedian(input: Tensor) -> Tensor: + r""" + nanmedian(input) -> Tensor + + Returns the median of the values in :attr:`input`, ignoring ``NaN`` values. + + This function is identical to :func:`torch.median` when there are no ``NaN`` values in :attr:`input`. + When :attr:`input` has one or more ``NaN`` values, :func:`torch.median` will always return ``NaN``, + while this function will return the median of the non-``NaN`` elements in :attr:`input`. + If all the elements in :attr:`input` are ``NaN`` it will also return ``NaN``. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.tensor([1, float('nan'), 3, 2]) + >>> a.median() + tensor(nan) + >>> a.nanmedian() + tensor(2.) + + .. function:: nanmedian(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` contains the median of each row of :attr:`input` + in the dimension :attr:`dim`, ignoring ``NaN`` values, and ``indices`` contains the index of the median values + found in the dimension :attr:`dim`. + + This function is identical to :func:`torch.median` when there are no ``NaN`` values in a reduced row. When a reduced row has + one or more ``NaN`` values, :func:`torch.median` will always reduce it to ``NaN``, while this function will reduce it to the + median of the non-``NaN`` elements. If all the elements in a reduced row are ``NaN`` then it will be reduced to ``NaN``, too. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out ((Tensor, Tensor), optional): The first tensor will be populated with the median values and the second + tensor, which must have dtype long, with their indices in the dimension + :attr:`dim` of :attr:`input`. 
+ + Example:: + + >>> a = torch.tensor([[2, 3, 1], [float('nan'), 1, float('nan')]]) + >>> a + tensor([[2., 3., 1.], + [nan, 1., nan]]) + >>> a.median(0) + torch.return_types.median(values=tensor([nan, 1., nan]), indices=tensor([1, 1, 1])) + >>> a.nanmedian(0) + torch.return_types.nanmedian(values=tensor([2., 1., 1.]), indices=tensor([0, 1, 0])) + """ + ... +@overload +def nanmedian(input: Tensor, dim: _int, keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.nanmedian: + r""" + nanmedian(input) -> Tensor + + Returns the median of the values in :attr:`input`, ignoring ``NaN`` values. + + This function is identical to :func:`torch.median` when there are no ``NaN`` values in :attr:`input`. + When :attr:`input` has one or more ``NaN`` values, :func:`torch.median` will always return ``NaN``, + while this function will return the median of the non-``NaN`` elements in :attr:`input`. + If all the elements in :attr:`input` are ``NaN`` it will also return ``NaN``. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.tensor([1, float('nan'), 3, 2]) + >>> a.median() + tensor(nan) + >>> a.nanmedian() + tensor(2.) + + .. function:: nanmedian(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` contains the median of each row of :attr:`input` + in the dimension :attr:`dim`, ignoring ``NaN`` values, and ``indices`` contains the index of the median values + found in the dimension :attr:`dim`. + + This function is identical to :func:`torch.median` when there are no ``NaN`` values in a reduced row. When a reduced row has + one or more ``NaN`` values, :func:`torch.median` will always reduce it to ``NaN``, while this function will reduce it to the + median of the non-``NaN`` elements. If all the elements in a reduced row are ``NaN`` then it will be reduced to ``NaN``, too. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out ((Tensor, Tensor), optional): The first tensor will be populated with the median values and the second + tensor, which must have dtype long, with their indices in the dimension + :attr:`dim` of :attr:`input`. + + Example:: + + >>> a = torch.tensor([[2, 3, 1], [float('nan'), 1, float('nan')]]) + >>> a + tensor([[2., 3., 1.], + [nan, 1., nan]]) + >>> a.median(0) + torch.return_types.median(values=tensor([nan, 1., nan]), indices=tensor([1, 1, 1])) + >>> a.nanmedian(0) + torch.return_types.nanmedian(values=tensor([2., 1., 1.]), indices=tensor([0, 1, 0])) + """ + ... +@overload +def nanmedian(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.nanmedian: + r""" + nanmedian(input) -> Tensor + + Returns the median of the values in :attr:`input`, ignoring ``NaN`` values. + + This function is identical to :func:`torch.median` when there are no ``NaN`` values in :attr:`input`. + When :attr:`input` has one or more ``NaN`` values, :func:`torch.median` will always return ``NaN``, + while this function will return the median of the non-``NaN`` elements in :attr:`input`. + If all the elements in :attr:`input` are ``NaN`` it will also return ``NaN``. + + Args: + input (Tensor): the input tensor. 
+ + Example:: + + >>> a = torch.tensor([1, float('nan'), 3, 2]) + >>> a.median() + tensor(nan) + >>> a.nanmedian() + tensor(2.) + + .. function:: nanmedian(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` contains the median of each row of :attr:`input` + in the dimension :attr:`dim`, ignoring ``NaN`` values, and ``indices`` contains the index of the median values + found in the dimension :attr:`dim`. + + This function is identical to :func:`torch.median` when there are no ``NaN`` values in a reduced row. When a reduced row has + one or more ``NaN`` values, :func:`torch.median` will always reduce it to ``NaN``, while this function will reduce it to the + median of the non-``NaN`` elements. If all the elements in a reduced row are ``NaN`` then it will be reduced to ``NaN``, too. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out ((Tensor, Tensor), optional): The first tensor will be populated with the median values and the second + tensor, which must have dtype long, with their indices in the dimension + :attr:`dim` of :attr:`input`. + + Example:: + + >>> a = torch.tensor([[2, 3, 1], [float('nan'), 1, float('nan')]]) + >>> a + tensor([[2., 3., 1.], + [nan, 1., nan]]) + >>> a.median(0) + torch.return_types.median(values=tensor([nan, 1., nan]), indices=tensor([1, 1, 1])) + >>> a.nanmedian(0) + torch.return_types.nanmedian(values=tensor([2., 1., 1.]), indices=tensor([0, 1, 0])) + """ + ... +@overload +def nanquantile(input: Tensor, q: Tensor, dim: Optional[_int] = None, keepdim: _bool = False, *, interpolation: str = "linear", out: Optional[Tensor] = None) -> Tensor: + r""" + nanquantile(input, q, dim=None, keepdim=False, *, interpolation='linear', out=None) -> Tensor + + This is a variant of :func:`torch.quantile` that "ignores" ``NaN`` values, + computing the quantiles :attr:`q` as if ``NaN`` values in :attr:`input` did + not exist. If all values in a reduced row are ``NaN`` then the quantiles for + that reduction will be ``NaN``. See the documentation for :func:`torch.quantile`. + + Args: + input (Tensor): the input tensor. + q (float or Tensor): a scalar or 1D tensor of quantile values in the range [0, 1] + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword arguments: + interpolation (str): interpolation method to use when the desired quantile lies between two data points. + Can be ``linear``, ``lower``, ``higher``, ``midpoint`` and ``nearest``. + Default is ``linear``. + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.tensor([float('nan'), 1, 2]) + >>> t.quantile(0.5) + tensor(nan) + >>> t.nanquantile(0.5) + tensor(1.5000) + >>> t = torch.tensor([[float('nan'), float('nan')], [1, 2]]) + >>> t + tensor([[nan, nan], + [1., 2.]]) + >>> t.nanquantile(0.5, dim=0) + tensor([1., 2.]) + >>> t.nanquantile(0.5, dim=1) + tensor([ nan, 1.5000]) + """ + ... 
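+# The NaN-ignoring reductions above differ from their plain counterparts only
+# in how NaN is treated. A deterministic sketch, assuming the default float32
+# dtype:
+#
+#   import torch
+#   t = torch.tensor([float('nan'), 1., 2.])
+#   torch.mean(t), torch.nanmean(t)            # (tensor(nan), tensor(1.5000))
+#   torch.median(t), torch.nanmedian(t)        # (tensor(nan), tensor(1.))
+#   torch.quantile(t, 0.5), torch.nanquantile(t, 0.5)
+#   # (tensor(nan), tensor(1.5000))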
+@overload +def nanquantile(input: Tensor, q: _float, dim: Optional[_int] = None, keepdim: _bool = False, *, interpolation: str = "linear", out: Optional[Tensor] = None) -> Tensor: + r""" + nanquantile(input, q, dim=None, keepdim=False, *, interpolation='linear', out=None) -> Tensor + + This is a variant of :func:`torch.quantile` that "ignores" ``NaN`` values, + computing the quantiles :attr:`q` as if ``NaN`` values in :attr:`input` did + not exist. If all values in a reduced row are ``NaN`` then the quantiles for + that reduction will be ``NaN``. See the documentation for :func:`torch.quantile`. + + Args: + input (Tensor): the input tensor. + q (float or Tensor): a scalar or 1D tensor of quantile values in the range [0, 1] + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword arguments: + interpolation (str): interpolation method to use when the desired quantile lies between two data points. + Can be ``linear``, ``lower``, ``higher``, ``midpoint`` and ``nearest``. + Default is ``linear``. + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.tensor([float('nan'), 1, 2]) + >>> t.quantile(0.5) + tensor(nan) + >>> t.nanquantile(0.5) + tensor(1.5000) + >>> t = torch.tensor([[float('nan'), float('nan')], [1, 2]]) + >>> t + tensor([[nan, nan], + [1., 2.]]) + >>> t.nanquantile(0.5, dim=0) + tensor([1., 2.]) + >>> t.nanquantile(0.5, dim=1) + tensor([ nan, 1.5000]) + """ + ... +def nansum(input: Tensor, dim: Optional[Union[_int, _size]] = None, keepdim: _bool = False, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + nansum(input, *, dtype=None) -> Tensor + + Returns the sum of all elements, treating Not a Numbers (NaNs) as zero. + + Args: + input (Tensor): the input tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.tensor([1., 2., float('nan'), 4.]) + >>> torch.nansum(a) + tensor(7.) + + .. function:: nansum(input, dim, keepdim=False, *, dtype=None) -> Tensor + :noindex: + + Returns the sum of each row of the :attr:`input` tensor in the given + dimension :attr:`dim`, treating Not a Numbers (NaNs) as zero. + If :attr:`dim` is a list of dimensions, reduce over all of them. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> torch.nansum(torch.tensor([1., float("nan")])) + 1.0 + >>> a = torch.tensor([[1, 2], [3., float("nan")]]) + >>> torch.nansum(a) + tensor(6.) 
+ >>> torch.nansum(a, dim=0) + tensor([4., 2.]) + >>> torch.nansum(a, dim=1) + tensor([3., 3.]) + """ + ... +@overload +def narrow(input: Tensor, dim: _int, start: Tensor, length: Union[_int, SymInt]) -> Tensor: + r""" + narrow(input, dim, start, length) -> Tensor + + Returns a new tensor that is a narrowed version of :attr:`input` tensor. The + dimension :attr:`dim` is input from :attr:`start` to ``start + length``. The + returned tensor and :attr:`input` tensor share the same underlying storage. + + Args: + input (Tensor): the tensor to narrow + dim (int): the dimension along which to narrow + start (int or Tensor): index of the element to start the narrowed dimension + from. Can be negative, which means indexing from the end of `dim`. If + `Tensor`, it must be an 0-dim integral `Tensor` (bools not allowed) + length (int): length of the narrowed dimension, must be weakly positive + + Example:: + + >>> x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + >>> torch.narrow(x, 0, 0, 2) + tensor([[ 1, 2, 3], + [ 4, 5, 6]]) + >>> torch.narrow(x, 1, 1, 2) + tensor([[ 2, 3], + [ 5, 6], + [ 8, 9]]) + >>> torch.narrow(x, -1, torch.tensor(-1), 1) + tensor([[3], + [6], + [9]]) + """ + ... +@overload +def narrow(input: Tensor, dim: _int, start: Union[_int, SymInt], length: Union[_int, SymInt]) -> Tensor: + r""" + narrow(input, dim, start, length) -> Tensor + + Returns a new tensor that is a narrowed version of :attr:`input` tensor. The + dimension :attr:`dim` is input from :attr:`start` to ``start + length``. The + returned tensor and :attr:`input` tensor share the same underlying storage. + + Args: + input (Tensor): the tensor to narrow + dim (int): the dimension along which to narrow + start (int or Tensor): index of the element to start the narrowed dimension + from. Can be negative, which means indexing from the end of `dim`. If + `Tensor`, it must be an 0-dim integral `Tensor` (bools not allowed) + length (int): length of the narrowed dimension, must be weakly positive + + Example:: + + >>> x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + >>> torch.narrow(x, 0, 0, 2) + tensor([[ 1, 2, 3], + [ 4, 5, 6]]) + >>> torch.narrow(x, 1, 1, 2) + tensor([[ 2, 3], + [ 5, 6], + [ 8, 9]]) + >>> torch.narrow(x, -1, torch.tensor(-1), 1) + tensor([[3], + [6], + [9]]) + """ + ... +def narrow_copy(input: Tensor, dim: _int, start: Union[_int, SymInt], length: Union[_int, SymInt], *, out: Optional[Tensor] = None) -> Tensor: + r""" + narrow_copy(input, dim, start, length, *, out=None) -> Tensor + + Same as :meth:`Tensor.narrow` except this returns a copy rather + than shared storage. This is primarily for sparse tensors, which + do not have a shared-storage narrow method. + + Args: + input (Tensor): the tensor to narrow + dim (int): the dimension along which to narrow + start (int): index of the element to start the narrowed dimension from. Can + be negative, which means indexing from the end of `dim` + length (int): length of the narrowed dimension, must be weakly positive + + Keyword args: + out (Tensor, optional): the output tensor. 
+ + Example:: + + >>> x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + >>> torch.narrow_copy(x, 0, 0, 2) + tensor([[ 1, 2, 3], + [ 4, 5, 6]]) + >>> torch.narrow_copy(x, 1, 1, 2) + tensor([[ 2, 3], + [ 5, 6], + [ 8, 9]]) + >>> s = torch.arange(16).reshape(2, 2, 2, 2).to_sparse(2) + >>> torch.narrow_copy(s, 0, 0, 1) + tensor(indices=tensor([[0, 0], + [0, 1]]), + values=tensor([[[0, 1], + [2, 3]], + + [[4, 5], + [6, 7]]]), + size=(1, 2, 2, 2), nnz=2, layout=torch.sparse_coo) + + .. seealso:: + + :func:`torch.narrow` for a non copy variant + """ + ... +def native_batch_norm(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], running_mean: Optional[Tensor], running_var: Optional[Tensor], training: _bool, momentum: _float, eps: _float, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> Tuple[Tensor, Tensor, Tensor]: ... +def native_channel_shuffle(input: Tensor, groups: Union[_int, SymInt]) -> Tensor: ... +def native_dropout(input: Tensor, p: _float, train: Optional[_bool]) -> Tuple[Tensor, Tensor]: ... +def native_group_norm(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], N: Union[_int, SymInt], C: Union[_int, SymInt], HxW: Union[_int, SymInt], group: _int, eps: _float) -> Tuple[Tensor, Tensor, Tensor]: ... +def native_layer_norm(input: Tensor, normalized_shape: Sequence[Union[_int, SymInt]], weight: Optional[Tensor], bias: Optional[Tensor], eps: _float) -> Tuple[Tensor, Tensor, Tensor]: ... +@overload +def native_norm(input: Tensor, p: Optional[Union[Number, _complex]], dim: Union[_int, _size], keepdim: _bool, dtype: Optional[_dtype]) -> Tensor: ... +@overload +def native_norm(input: Tensor, p: Union[Number, _complex] = 2) -> Tensor: ... +@overload +def ne(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + ne(input, other, *, out=None) -> Tensor + + Computes :math:`\text{input} \neq \text{other}` element-wise. + + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is not equal to :attr:`other` and False elsewhere + + Example:: + + >>> torch.ne(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[False, True], [True, False]]) + """ + ... +@overload +def ne(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + ne(input, other, *, out=None) -> Tensor + + Computes :math:`\text{input} \neq \text{other}` element-wise. + + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is not equal to :attr:`other` and False elsewhere + + Example:: + + >>> torch.ne(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[False, True], [True, False]]) + """ + ... +def neg(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + neg(input, *, out=None) -> Tensor + + Returns a new tensor with the negative of the elements of :attr:`input`. + + .. math:: + \text{out} = -1 \times \text{input} + + Args: + input (Tensor): the input tensor. 
+ + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(5) + >>> a + tensor([ 0.0090, -0.2262, -0.0682, -0.2866, 0.3940]) + >>> torch.neg(a) + tensor([-0.0090, 0.2262, 0.0682, 0.2866, -0.3940]) + """ + ... +def neg_(input: Tensor) -> Tensor: ... +def negative(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + negative(input, *, out=None) -> Tensor + + Alias for :func:`torch.neg` + """ + ... +def negative_(input: Tensor) -> Tensor: ... +def nextafter(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + nextafter(input, other, *, out=None) -> Tensor + + Return the next floating-point value after :attr:`input` towards :attr:`other`, elementwise. + + The shapes of ``input`` and ``other`` must be + :ref:`broadcastable `. + + Args: + input (Tensor): the first input tensor + other (Tensor): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> eps = torch.finfo(torch.float32).eps + >>> torch.nextafter(torch.tensor([1.0, 2.0]), torch.tensor([2.0, 1.0])) == torch.tensor([eps + 1, 2 - eps]) + tensor([True, True]) + """ + ... +@overload +def nonzero(input: Tensor, *, as_tuple: Literal[False] = False, out: Optional[Tensor] = None) -> Tensor: + r""" + nonzero(input, *, out=None, as_tuple=False) -> LongTensor or tuple of LongTensors + + .. note:: + :func:`torch.nonzero(..., as_tuple=False) ` (default) returns a + 2-D tensor where each row is the index for a nonzero value. + + :func:`torch.nonzero(..., as_tuple=True) ` returns a tuple of 1-D + index tensors, allowing for advanced indexing, so ``x[x.nonzero(as_tuple=True)]`` + gives all nonzero values of tensor ``x``. Of the returned tuple, each index tensor + contains nonzero indices for a certain dimension. + + See below for more details on the two behaviors. + + When :attr:`input` is on CUDA, :func:`torch.nonzero() ` causes + host-device synchronization. + + **When** :attr:`as_tuple` **is** ``False`` **(default)**: + + Returns a tensor containing the indices of all non-zero elements of + :attr:`input`. Each row in the result contains the indices of a non-zero + element in :attr:`input`. The result is sorted lexicographically, with + the last index changing the fastest (C-style). + + If :attr:`input` has :math:`n` dimensions, then the resulting indices tensor + :attr:`out` is of size :math:`(z \times n)`, where :math:`z` is the total number of + non-zero elements in the :attr:`input` tensor. + + **When** :attr:`as_tuple` **is** ``True``: + + Returns a tuple of 1-D tensors, one for each dimension in :attr:`input`, + each containing the indices (in that dimension) of all non-zero elements of + :attr:`input` . + + If :attr:`input` has :math:`n` dimensions, then the resulting tuple contains :math:`n` + tensors of size :math:`z`, where :math:`z` is the total number of + non-zero elements in the :attr:`input` tensor. + + As a special case, when :attr:`input` has zero dimensions and a nonzero scalar + value, it is treated as a one-dimensional tensor with one element. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (LongTensor, optional): the output tensor containing indices + + Returns: + LongTensor or tuple of LongTensor: If :attr:`as_tuple` is ``False``, the output + tensor containing indices. If :attr:`as_tuple` is ``True``, one 1-D tensor for + each dimension, containing the indices of each nonzero element along that + dimension. 
+ + Example:: + + >>> torch.nonzero(torch.tensor([1, 1, 1, 0, 1])) + tensor([[ 0], + [ 1], + [ 2], + [ 4]]) + >>> torch.nonzero(torch.tensor([[0.6, 0.0, 0.0, 0.0], + ... [0.0, 0.4, 0.0, 0.0], + ... [0.0, 0.0, 1.2, 0.0], + ... [0.0, 0.0, 0.0,-0.4]])) + tensor([[ 0, 0], + [ 1, 1], + [ 2, 2], + [ 3, 3]]) + >>> torch.nonzero(torch.tensor([1, 1, 1, 0, 1]), as_tuple=True) + (tensor([0, 1, 2, 4]),) + >>> torch.nonzero(torch.tensor([[0.6, 0.0, 0.0, 0.0], + ... [0.0, 0.4, 0.0, 0.0], + ... [0.0, 0.0, 1.2, 0.0], + ... [0.0, 0.0, 0.0,-0.4]]), as_tuple=True) + (tensor([0, 1, 2, 3]), tensor([0, 1, 2, 3])) + >>> torch.nonzero(torch.tensor(5), as_tuple=True) + (tensor([0]),) + """ + ... +@overload +def nonzero(input: Tensor, *, as_tuple: Literal[True]) -> Tuple[Tensor, ...]: + r""" + nonzero(input, *, out=None, as_tuple=False) -> LongTensor or tuple of LongTensors + + .. note:: + :func:`torch.nonzero(..., as_tuple=False) ` (default) returns a + 2-D tensor where each row is the index for a nonzero value. + + :func:`torch.nonzero(..., as_tuple=True) ` returns a tuple of 1-D + index tensors, allowing for advanced indexing, so ``x[x.nonzero(as_tuple=True)]`` + gives all nonzero values of tensor ``x``. Of the returned tuple, each index tensor + contains nonzero indices for a certain dimension. + + See below for more details on the two behaviors. + + When :attr:`input` is on CUDA, :func:`torch.nonzero() ` causes + host-device synchronization. + + **When** :attr:`as_tuple` **is** ``False`` **(default)**: + + Returns a tensor containing the indices of all non-zero elements of + :attr:`input`. Each row in the result contains the indices of a non-zero + element in :attr:`input`. The result is sorted lexicographically, with + the last index changing the fastest (C-style). + + If :attr:`input` has :math:`n` dimensions, then the resulting indices tensor + :attr:`out` is of size :math:`(z \times n)`, where :math:`z` is the total number of + non-zero elements in the :attr:`input` tensor. + + **When** :attr:`as_tuple` **is** ``True``: + + Returns a tuple of 1-D tensors, one for each dimension in :attr:`input`, + each containing the indices (in that dimension) of all non-zero elements of + :attr:`input` . + + If :attr:`input` has :math:`n` dimensions, then the resulting tuple contains :math:`n` + tensors of size :math:`z`, where :math:`z` is the total number of + non-zero elements in the :attr:`input` tensor. + + As a special case, when :attr:`input` has zero dimensions and a nonzero scalar + value, it is treated as a one-dimensional tensor with one element. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (LongTensor, optional): the output tensor containing indices + + Returns: + LongTensor or tuple of LongTensor: If :attr:`as_tuple` is ``False``, the output + tensor containing indices. If :attr:`as_tuple` is ``True``, one 1-D tensor for + each dimension, containing the indices of each nonzero element along that + dimension. + + Example:: + + >>> torch.nonzero(torch.tensor([1, 1, 1, 0, 1])) + tensor([[ 0], + [ 1], + [ 2], + [ 4]]) + >>> torch.nonzero(torch.tensor([[0.6, 0.0, 0.0, 0.0], + ... [0.0, 0.4, 0.0, 0.0], + ... [0.0, 0.0, 1.2, 0.0], + ... [0.0, 0.0, 0.0,-0.4]])) + tensor([[ 0, 0], + [ 1, 1], + [ 2, 2], + [ 3, 3]]) + >>> torch.nonzero(torch.tensor([1, 1, 1, 0, 1]), as_tuple=True) + (tensor([0, 1, 2, 4]),) + >>> torch.nonzero(torch.tensor([[0.6, 0.0, 0.0, 0.0], + ... [0.0, 0.4, 0.0, 0.0], + ... [0.0, 0.0, 1.2, 0.0], + ... 
[0.0, 0.0, 0.0,-0.4]]), as_tuple=True) + (tensor([0, 1, 2, 3]), tensor([0, 1, 2, 3])) + >>> torch.nonzero(torch.tensor(5), as_tuple=True) + (tensor([0]),) + """ + ... +def nonzero_static(input: Tensor, *, size: _int, fill_value: _int = -1, out: Optional[Tensor] = None) -> Tensor: ... +def norm_except_dim(v: Tensor, pow: _int = 2, dim: _int = 0) -> Tensor: ... +@overload +def normal(mean: Tensor, std: Tensor, *, generator: Optional[Generator] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + normal(mean, std, *, generator=None, out=None) -> Tensor + + Returns a tensor of random numbers drawn from separate normal distributions + whose mean and standard deviation are given. + + The :attr:`mean` is a tensor with the mean of + each output element's normal distribution + + The :attr:`std` is a tensor with the standard deviation of + each output element's normal distribution + + The shapes of :attr:`mean` and :attr:`std` don't need to match, but the + total number of elements in each tensor need to be the same. + + .. note:: When the shapes do not match, the shape of :attr:`mean` + is used as the shape for the returned output tensor + + .. note:: When :attr:`std` is a CUDA tensor, this function synchronizes + its device with the CPU. + + Args: + mean (Tensor): the tensor of per-element means + std (Tensor): the tensor of per-element standard deviations + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(mean=torch.arange(1., 11.), std=torch.arange(1, 0, -0.1)) + tensor([ 1.0425, 3.5672, 2.7969, 4.2925, 4.7229, 6.2134, + 8.0505, 8.1408, 9.0563, 10.0566]) + + .. function:: normal(mean=0.0, std, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the means are shared among all drawn + elements. + + Args: + mean (float, optional): the mean for all distributions + std (Tensor): the tensor of per-element standard deviations + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(mean=0.5, std=torch.arange(1., 6.)) + tensor([-1.2793, -1.0732, -2.0687, 5.1177, -1.2303]) + + .. function:: normal(mean, std=1.0, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the standard deviations are shared among + all drawn elements. + + Args: + mean (Tensor): the tensor of per-element means + std (float, optional): the standard deviation for all distributions + + Keyword args: + out (Tensor, optional): the output tensor + + Example:: + + >>> torch.normal(mean=torch.arange(1., 6.)) + tensor([ 1.1552, 2.6148, 2.6535, 5.8318, 4.2361]) + + .. function:: normal(mean, std, size, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the means and standard deviations are shared + among all drawn elements. The resulting tensor has size given by :attr:`size`. + + Args: + mean (float): the mean for all distributions + std (float): the standard deviation for all distributions + size (int...): a sequence of integers defining the shape of the output tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(2, 3, size=(1, 4)) + tensor([[-1.3987, -1.9544, 3.6048, 0.7909]]) + """ + ... 
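+# Editor's sketch (a comment added for illustration, not part of the generated stub):
+# the nonzero docstrings above note that ``x[x.nonzero(as_tuple=True)]`` gathers all
+# nonzero values via advanced indexing; a minimal example of that idiom:
+# >>> x = torch.tensor([[1, 0], [0, 2]])
+# >>> x.nonzero(as_tuple=True)
+# (tensor([0, 1]), tensor([0, 1]))
+# >>> x[x.nonzero(as_tuple=True)]
+# tensor([1, 2])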
+@overload +def normal(mean: Tensor, std: _float = 1, *, generator: Optional[Generator] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + normal(mean, std, *, generator=None, out=None) -> Tensor + + Returns a tensor of random numbers drawn from separate normal distributions + whose mean and standard deviation are given. + + The :attr:`mean` is a tensor with the mean of + each output element's normal distribution + + The :attr:`std` is a tensor with the standard deviation of + each output element's normal distribution + + The shapes of :attr:`mean` and :attr:`std` don't need to match, but the + total number of elements in each tensor need to be the same. + + .. note:: When the shapes do not match, the shape of :attr:`mean` + is used as the shape for the returned output tensor + + .. note:: When :attr:`std` is a CUDA tensor, this function synchronizes + its device with the CPU. + + Args: + mean (Tensor): the tensor of per-element means + std (Tensor): the tensor of per-element standard deviations + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(mean=torch.arange(1., 11.), std=torch.arange(1, 0, -0.1)) + tensor([ 1.0425, 3.5672, 2.7969, 4.2925, 4.7229, 6.2134, + 8.0505, 8.1408, 9.0563, 10.0566]) + + .. function:: normal(mean=0.0, std, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the means are shared among all drawn + elements. + + Args: + mean (float, optional): the mean for all distributions + std (Tensor): the tensor of per-element standard deviations + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(mean=0.5, std=torch.arange(1., 6.)) + tensor([-1.2793, -1.0732, -2.0687, 5.1177, -1.2303]) + + .. function:: normal(mean, std=1.0, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the standard deviations are shared among + all drawn elements. + + Args: + mean (Tensor): the tensor of per-element means + std (float, optional): the standard deviation for all distributions + + Keyword args: + out (Tensor, optional): the output tensor + + Example:: + + >>> torch.normal(mean=torch.arange(1., 6.)) + tensor([ 1.1552, 2.6148, 2.6535, 5.8318, 4.2361]) + + .. function:: normal(mean, std, size, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the means and standard deviations are shared + among all drawn elements. The resulting tensor has size given by :attr:`size`. + + Args: + mean (float): the mean for all distributions + std (float): the standard deviation for all distributions + size (int...): a sequence of integers defining the shape of the output tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(2, 3, size=(1, 4)) + tensor([[-1.3987, -1.9544, 3.6048, 0.7909]]) + """ + ... +@overload +def normal(mean: _float, std: Tensor, *, generator: Optional[Generator] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + normal(mean, std, *, generator=None, out=None) -> Tensor + + Returns a tensor of random numbers drawn from separate normal distributions + whose mean and standard deviation are given. 
+ + The :attr:`mean` is a tensor with the mean of + each output element's normal distribution + + The :attr:`std` is a tensor with the standard deviation of + each output element's normal distribution + + The shapes of :attr:`mean` and :attr:`std` don't need to match, but the + total number of elements in each tensor need to be the same. + + .. note:: When the shapes do not match, the shape of :attr:`mean` + is used as the shape for the returned output tensor + + .. note:: When :attr:`std` is a CUDA tensor, this function synchronizes + its device with the CPU. + + Args: + mean (Tensor): the tensor of per-element means + std (Tensor): the tensor of per-element standard deviations + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(mean=torch.arange(1., 11.), std=torch.arange(1, 0, -0.1)) + tensor([ 1.0425, 3.5672, 2.7969, 4.2925, 4.7229, 6.2134, + 8.0505, 8.1408, 9.0563, 10.0566]) + + .. function:: normal(mean=0.0, std, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the means are shared among all drawn + elements. + + Args: + mean (float, optional): the mean for all distributions + std (Tensor): the tensor of per-element standard deviations + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(mean=0.5, std=torch.arange(1., 6.)) + tensor([-1.2793, -1.0732, -2.0687, 5.1177, -1.2303]) + + .. function:: normal(mean, std=1.0, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the standard deviations are shared among + all drawn elements. + + Args: + mean (Tensor): the tensor of per-element means + std (float, optional): the standard deviation for all distributions + + Keyword args: + out (Tensor, optional): the output tensor + + Example:: + + >>> torch.normal(mean=torch.arange(1., 6.)) + tensor([ 1.1552, 2.6148, 2.6535, 5.8318, 4.2361]) + + .. function:: normal(mean, std, size, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the means and standard deviations are shared + among all drawn elements. The resulting tensor has size given by :attr:`size`. + + Args: + mean (float): the mean for all distributions + std (float): the standard deviation for all distributions + size (int...): a sequence of integers defining the shape of the output tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(2, 3, size=(1, 4)) + tensor([[-1.3987, -1.9544, 3.6048, 0.7909]]) + """ + ... +@overload +def normal(mean: _float, std: _float, size: Sequence[Union[_int, SymInt]], *, generator: Optional[Generator] = None, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + normal(mean, std, *, generator=None, out=None) -> Tensor + + Returns a tensor of random numbers drawn from separate normal distributions + whose mean and standard deviation are given. + + The :attr:`mean` is a tensor with the mean of + each output element's normal distribution + + The :attr:`std` is a tensor with the standard deviation of + each output element's normal distribution + + The shapes of :attr:`mean` and :attr:`std` don't need to match, but the + total number of elements in each tensor need to be the same. + + .. 
note:: When the shapes do not match, the shape of :attr:`mean` + is used as the shape for the returned output tensor + + .. note:: When :attr:`std` is a CUDA tensor, this function synchronizes + its device with the CPU. + + Args: + mean (Tensor): the tensor of per-element means + std (Tensor): the tensor of per-element standard deviations + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(mean=torch.arange(1., 11.), std=torch.arange(1, 0, -0.1)) + tensor([ 1.0425, 3.5672, 2.7969, 4.2925, 4.7229, 6.2134, + 8.0505, 8.1408, 9.0563, 10.0566]) + + .. function:: normal(mean=0.0, std, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the means are shared among all drawn + elements. + + Args: + mean (float, optional): the mean for all distributions + std (Tensor): the tensor of per-element standard deviations + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(mean=0.5, std=torch.arange(1., 6.)) + tensor([-1.2793, -1.0732, -2.0687, 5.1177, -1.2303]) + + .. function:: normal(mean, std=1.0, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the standard deviations are shared among + all drawn elements. + + Args: + mean (Tensor): the tensor of per-element means + std (float, optional): the standard deviation for all distributions + + Keyword args: + out (Tensor, optional): the output tensor + + Example:: + + >>> torch.normal(mean=torch.arange(1., 6.)) + tensor([ 1.1552, 2.6148, 2.6535, 5.8318, 4.2361]) + + .. function:: normal(mean, std, size, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the means and standard deviations are shared + among all drawn elements. The resulting tensor has size given by :attr:`size`. + + Args: + mean (float): the mean for all distributions + std (float): the standard deviation for all distributions + size (int...): a sequence of integers defining the shape of the output tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(2, 3, size=(1, 4)) + tensor([[-1.3987, -1.9544, 3.6048, 0.7909]]) + """ + ... +@overload +def not_equal(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + not_equal(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.ne`. + """ + ... +@overload +def not_equal(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + not_equal(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.ne`. + """ + ... +@overload +def nuclear_norm(input: Tensor, dim: Union[_int, _size], keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: ... +@overload +def nuclear_norm(input: Tensor, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: ... +def numel(self: Tensor) -> _int: + r""" + numel(input) -> int + + Returns the total number of elements in the :attr:`input` tensor. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 2, 3, 4, 5) + >>> torch.numel(a) + 120 + >>> a = torch.zeros(4,4) + >>> torch.numel(a) + 16 + """ + ... 
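+# Editor's sketch (a comment added for illustration, not part of the generated stub):
+# per the note in the torch.normal docstrings above, when the shapes of ``mean`` and
+# ``std`` differ but their element counts match, the output takes the shape of ``mean``:
+# >>> torch.normal(mean=torch.zeros(2, 2), std=torch.ones(4)).shape
+# torch.Size([2, 2])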
+@overload +def ones(size: Sequence[Union[_int, SymInt]], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + ones(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with the scalar value `1`, with the shape defined + by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.ones(2, 3) + tensor([[ 1., 1., 1.], + [ 1., 1., 1.]]) + + >>> torch.ones(5) + tensor([ 1., 1., 1., 1., 1.]) + """ + ... +@overload +def ones(*size: _int, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + ones(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with the scalar value `1`, with the shape defined + by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.ones(2, 3) + tensor([[ 1., 1., 1.], + [ 1., 1., 1.]]) + + >>> torch.ones(5) + tensor([ 1., 1., 1., 1., 1.]) + """ + ... 
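+# Editor's sketch (a comment added for illustration, not part of the generated stub):
+# the two torch.ones overloads above accept the size either as one sequence or as
+# separate ints; ``dtype``, ``device`` and the other options are keyword-only.
+# >>> torch.ones((2, 3)).shape == torch.ones(2, 3).shape
+# True
+# >>> torch.ones(2, dtype=torch.int64)
+# tensor([1, 1])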
+@overload +def ones(size: _size, *, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + ones(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with the scalar value `1`, with the shape defined + by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.ones(2, 3) + tensor([[ 1., 1., 1.], + [ 1., 1., 1.]]) + + >>> torch.ones(5) + tensor([ 1., 1., 1., 1., 1.]) + """ + ... +@overload +def ones(*size: _int, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + ones(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with the scalar value `1`, with the shape defined + by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.ones(2, 3) + tensor([[ 1., 1., 1.], + [ 1., 1., 1.]]) + + >>> torch.ones(5) + tensor([ 1., 1., 1., 1., 1.]) + """ + ... 
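+# Editor's sketch (a comment added for illustration, not part of the generated stub):
+# the ``names`` overloads above build named tensors, a prototype feature that may emit
+# a warning on first use; a minimal example:
+# >>> torch.ones(2, 3, names=('N', 'C'))
+# tensor([[1., 1., 1.],
+#         [1., 1., 1.]], names=('N', 'C'))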
+def ones_like(input: Tensor, *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + ones_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor + + Returns a tensor filled with the scalar value `1`, with the same size as + :attr:`input`. ``torch.ones_like(input)`` is equivalent to + ``torch.ones(input.size(), dtype=input.dtype, layout=input.layout, device=input.device)``. + + .. warning:: + As of 0.4, this function does not support an :attr:`out` keyword. As an alternative, + the old ``torch.ones_like(input, out=output)`` is equivalent to + ``torch.ones(input.size(), out=output)``. + + Args: + input (Tensor): the size of :attr:`input` will determine size of the output tensor. + + Keyword arguments: + dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor. + Default: if ``None``, defaults to the dtype of :attr:`input`. + layout (:class:`torch.layout`, optional): the desired layout of returned tensor. + Default: if ``None``, defaults to the layout of :attr:`input`. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, defaults to the device of :attr:`input`. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + + Example:: + + >>> input = torch.empty(2, 3) + >>> torch.ones_like(input) + tensor([[ 1., 1., 1.], + [ 1., 1., 1.]]) + """ + ... +def orgqr(input: Tensor, input2: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + orgqr(input, tau) -> Tensor + + Alias for :func:`torch.linalg.householder_product`. + """ + ... +def ormqr(input: Tensor, input2: Tensor, input3: Tensor, left: _bool = True, transpose: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + ormqr(input, tau, other, left=True, transpose=False, *, out=None) -> Tensor + + Computes the matrix-matrix multiplication of a product of Householder matrices with a general matrix. + + Multiplies a :math:`m \times n` matrix `C` (given by :attr:`other`) with a matrix `Q`, + where `Q` is represented using Householder reflectors `(input, tau)`. + See `Representation of Orthogonal or Unitary Matrices`_ for further details. + + If :attr:`left` is `True` then `op(Q)` times `C` is computed, otherwise the result is `C` times `op(Q)`. + When :attr:`left` is `True`, the implicit matrix `Q` has size :math:`m \times m`. + It has size :math:`n \times n` otherwise. + If :attr:`transpose` is `True` then `op` is the conjugate transpose operation, otherwise it's a no-op. + + Supports inputs of float, double, cfloat and cdouble dtypes. + Also supports batched inputs, and, if the input is batched, the output is batched with the same dimensions. + + .. seealso:: + :func:`torch.geqrf` can be used to form the Householder representation `(input, tau)` of matrix `Q` + from the QR decomposition. + + .. note:: + This function supports backward but it is only fast when ``(input, tau)`` do not require gradients + and/or ``tau.size(-1)`` is very small. 
+ `` + + Args: + input (Tensor): tensor of shape `(*, mn, k)` where `*` is zero or more batch dimensions + and `mn` equals to `m` or `n` depending on the :attr:`left`. + tau (Tensor): tensor of shape `(*, min(mn, k))` where `*` is zero or more batch dimensions. + other (Tensor): tensor of shape `(*, m, n)` where `*` is zero or more batch dimensions. + left (bool): controls the order of multiplication. + transpose (bool): controls whether the matrix `Q` is conjugate transposed or not. + + Keyword args: + out (Tensor, optional): the output Tensor. Ignored if `None`. Default: `None`. + + .. _Representation of Orthogonal or Unitary Matrices: + https://www.netlib.org/lapack/lug/node128.html + """ + ... +def outer(input: Tensor, vec2: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + outer(input, vec2, *, out=None) -> Tensor + + Outer product of :attr:`input` and :attr:`vec2`. + If :attr:`input` is a vector of size :math:`n` and :attr:`vec2` is a vector of + size :math:`m`, then :attr:`out` must be a matrix of size :math:`(n \times m)`. + + .. note:: This function does not :ref:`broadcast `. + + Args: + input (Tensor): 1-D input vector + vec2 (Tensor): 1-D input vector + + Keyword args: + out (Tensor, optional): optional output matrix + + Example:: + + >>> v1 = torch.arange(1., 5.) + >>> v2 = torch.arange(1., 4.) + >>> torch.outer(v1, v2) + tensor([[ 1., 2., 3.], + [ 2., 4., 6.], + [ 3., 6., 9.], + [ 4., 8., 12.]]) + """ + ... +def pairwise_distance(x1: Tensor, x2: Tensor, p: _float = 2, eps: _float = 1e-06, keepdim: _bool = False) -> Tensor: ... +def pdist(input: Tensor, p: _float = 2) -> Tensor: ... +def permute(input: Tensor, dims: _size) -> Tensor: + r""" + permute(input, dims) -> Tensor + + Returns a view of the original tensor :attr:`input` with its dimensions permuted. + + Args: + input (Tensor): the input tensor. + dims (tuple of int): The desired ordering of dimensions + + Example: + >>> x = torch.randn(2, 3, 5) + >>> x.size() + torch.Size([2, 3, 5]) + >>> torch.permute(x, (2, 0, 1)).size() + torch.Size([5, 2, 3]) + """ + ... +def permute_copy(input: Tensor, dims: _size, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.permute`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def pinverse(input: Tensor, rcond: _float = 1e-15) -> Tensor: + r""" + pinverse(input, rcond=1e-15) -> Tensor + + Alias for :func:`torch.linalg.pinv` + """ + ... +def pixel_shuffle(input: Tensor, upscale_factor: _int) -> Tensor: ... +def pixel_unshuffle(input: Tensor, downscale_factor: _int) -> Tensor: ... +def poisson(input: Tensor, generator: Optional[Generator] = None) -> Tensor: + r""" + poisson(input, generator=None) -> Tensor + + Returns a tensor of the same size as :attr:`input` with each element + sampled from a Poisson distribution with rate parameter given by the corresponding + element in :attr:`input` i.e., + + .. math:: + \text{out}_i \sim \text{Poisson}(\text{input}_i) + + :attr:`input` must be non-negative. + + Args: + input (Tensor): the input tensor containing the rates of the Poisson distribution + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + + Example:: + + >>> rates = torch.rand(4, 4) * 5 # rate parameter between 0 and 5 + >>> torch.poisson(rates) + tensor([[9., 1., 3., 5.], + [8., 6., 6., 0.], + [0., 4., 5., 3.], + [2., 1., 4., 2.]]) + """ + ... 
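+# Editor's sketch (a comment added for illustration, not part of the generated stub):
+# pixel_shuffle / pixel_unshuffle above carry no docstring here; they rearrange a
+# (N, C*r**2, H, W) tensor to (N, C, H*r, W*r) and back, e.g.:
+# >>> x = torch.randn(1, 9, 4, 4)
+# >>> torch.pixel_shuffle(x, 3).shape
+# torch.Size([1, 1, 12, 12])
+# >>> torch.pixel_unshuffle(torch.pixel_shuffle(x, 3), 3).shape
+# torch.Size([1, 9, 4, 4])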
+def poisson_nll_loss(input: Tensor, target: Tensor, log_input: _bool, full: _bool, eps: _float, reduction: _int) -> Tensor: ... +def polar(abs: Tensor, angle: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + polar(abs, angle, *, out=None) -> Tensor + + Constructs a complex tensor whose elements are Cartesian coordinates + corresponding to the polar coordinates with absolute value :attr:`abs` and angle + :attr:`angle`. + + .. math:: + \text{out} = \text{abs} \cdot \cos(\text{angle}) + \text{abs} \cdot \sin(\text{angle}) \cdot j + + .. note:: + `torch.polar` is similar to + `std::polar `_ + and does not compute the polar decomposition + of a complex tensor like Python's `cmath.polar` and SciPy's `linalg.polar` do. + The behavior of this function is undefined if `abs` is negative or NaN, or if `angle` is + infinite. + + + Args: + abs (Tensor): The absolute value the complex tensor. Must be float or double. + angle (Tensor): The angle of the complex tensor. Must be same dtype as + :attr:`abs`. + + Keyword args: + out (Tensor): If the inputs are ``torch.float32``, must be + ``torch.complex64``. If the inputs are ``torch.float64``, must be + ``torch.complex128``. + + Example:: + + >>> import numpy as np + >>> abs = torch.tensor([1, 2], dtype=torch.float64) + >>> angle = torch.tensor([np.pi / 2, 5 * np.pi / 4], dtype=torch.float64) + >>> z = torch.polar(abs, angle) + >>> z + tensor([(0.0000+1.0000j), (-1.4142-1.4142j)], dtype=torch.complex128) + """ + ... +def polygamma(n: _int, input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + polygamma(n, input, *, out=None) -> Tensor + + Alias for :func:`torch.special.polygamma`. + """ + ... +def positive(input: Tensor) -> Tensor: + r""" + positive(input) -> Tensor + + Returns :attr:`input`. + Throws a runtime error if :attr:`input` is a bool tensor. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> t = torch.randn(5) + >>> t + tensor([ 0.0090, -0.2262, -0.0682, -0.2866, 0.3940]) + >>> torch.positive(t) + tensor([ 0.0090, -0.2262, -0.0682, -0.2866, 0.3940]) + """ + ... +@overload +def pow(input: Tensor, exponent: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + pow(input, exponent, *, out=None) -> Tensor + + Takes the power of each element in :attr:`input` with :attr:`exponent` and + returns a tensor with the result. + + :attr:`exponent` can be either a single ``float`` number or a `Tensor` + with the same number of elements as :attr:`input`. + + When :attr:`exponent` is a scalar value, the operation applied is: + + .. math:: + \text{out}_i = x_i ^ \text{exponent} + + When :attr:`exponent` is a tensor, the operation applied is: + + .. math:: + \text{out}_i = x_i ^ {\text{exponent}_i} + + When :attr:`exponent` is a tensor, the shapes of :attr:`input` + and :attr:`exponent` must be :ref:`broadcastable `. + + Args: + input (Tensor): the input tensor. + exponent (float or tensor): the exponent value + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.4331, 1.2475, 0.6834, -0.2791]) + >>> torch.pow(a, 2) + tensor([ 0.1875, 1.5561, 0.4670, 0.0779]) + >>> exp = torch.arange(1., 5.) + + >>> a = torch.arange(1., 5.) + >>> a + tensor([ 1., 2., 3., 4.]) + >>> exp + tensor([ 1., 2., 3., 4.]) + >>> torch.pow(a, exp) + tensor([ 1., 4., 27., 256.]) + + .. function:: pow(self, exponent, *, out=None) -> Tensor + :noindex: + + :attr:`self` is a scalar ``float`` value, and :attr:`exponent` is a tensor. 
+ The returned tensor :attr:`out` is of the same shape as :attr:`exponent` + + The operation applied is: + + .. math:: + \text{out}_i = \text{self} ^ {\text{exponent}_i} + + Args: + self (float): the scalar base value for the power operation + exponent (Tensor): the exponent tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> exp = torch.arange(1., 5.) + >>> base = 2 + >>> torch.pow(base, exp) + tensor([ 2., 4., 8., 16.]) + """ + ... +@overload +def pow(self: Union[Number, _complex], exponent: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + pow(input, exponent, *, out=None) -> Tensor + + Takes the power of each element in :attr:`input` with :attr:`exponent` and + returns a tensor with the result. + + :attr:`exponent` can be either a single ``float`` number or a `Tensor` + with the same number of elements as :attr:`input`. + + When :attr:`exponent` is a scalar value, the operation applied is: + + .. math:: + \text{out}_i = x_i ^ \text{exponent} + + When :attr:`exponent` is a tensor, the operation applied is: + + .. math:: + \text{out}_i = x_i ^ {\text{exponent}_i} + + When :attr:`exponent` is a tensor, the shapes of :attr:`input` + and :attr:`exponent` must be :ref:`broadcastable `. + + Args: + input (Tensor): the input tensor. + exponent (float or tensor): the exponent value + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.4331, 1.2475, 0.6834, -0.2791]) + >>> torch.pow(a, 2) + tensor([ 0.1875, 1.5561, 0.4670, 0.0779]) + >>> exp = torch.arange(1., 5.) + + >>> a = torch.arange(1., 5.) + >>> a + tensor([ 1., 2., 3., 4.]) + >>> exp + tensor([ 1., 2., 3., 4.]) + >>> torch.pow(a, exp) + tensor([ 1., 4., 27., 256.]) + + .. function:: pow(self, exponent, *, out=None) -> Tensor + :noindex: + + :attr:`self` is a scalar ``float`` value, and :attr:`exponent` is a tensor. + The returned tensor :attr:`out` is of the same shape as :attr:`exponent` + + The operation applied is: + + .. math:: + \text{out}_i = \text{self} ^ {\text{exponent}_i} + + Args: + self (float): the scalar base value for the power operation + exponent (Tensor): the exponent tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> exp = torch.arange(1., 5.) + >>> base = 2 + >>> torch.pow(base, exp) + tensor([ 2., 4., 8., 16.]) + """ + ... +@overload +def pow(input: Tensor, exponent: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + pow(input, exponent, *, out=None) -> Tensor + + Takes the power of each element in :attr:`input` with :attr:`exponent` and + returns a tensor with the result. + + :attr:`exponent` can be either a single ``float`` number or a `Tensor` + with the same number of elements as :attr:`input`. + + When :attr:`exponent` is a scalar value, the operation applied is: + + .. math:: + \text{out}_i = x_i ^ \text{exponent} + + When :attr:`exponent` is a tensor, the operation applied is: + + .. math:: + \text{out}_i = x_i ^ {\text{exponent}_i} + + When :attr:`exponent` is a tensor, the shapes of :attr:`input` + and :attr:`exponent` must be :ref:`broadcastable `. + + Args: + input (Tensor): the input tensor. + exponent (float or tensor): the exponent value + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.4331, 1.2475, 0.6834, -0.2791]) + >>> torch.pow(a, 2) + tensor([ 0.1875, 1.5561, 0.4670, 0.0779]) + >>> exp = torch.arange(1., 5.) 
+ + >>> a = torch.arange(1., 5.) + >>> a + tensor([ 1., 2., 3., 4.]) + >>> exp + tensor([ 1., 2., 3., 4.]) + >>> torch.pow(a, exp) + tensor([ 1., 4., 27., 256.]) + + .. function:: pow(self, exponent, *, out=None) -> Tensor + :noindex: + + :attr:`self` is a scalar ``float`` value, and :attr:`exponent` is a tensor. + The returned tensor :attr:`out` is of the same shape as :attr:`exponent` + + The operation applied is: + + .. math:: + \text{out}_i = \text{self} ^ {\text{exponent}_i} + + Args: + self (float): the scalar base value for the power operation + exponent (Tensor): the exponent tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> exp = torch.arange(1., 5.) + >>> base = 2 + >>> torch.pow(base, exp) + tensor([ 2., 4., 8., 16.]) + """ + ... +def prelu(input: Tensor, weight: Tensor) -> Tensor: ... +@overload +def prod(input: Tensor, *, dtype: Optional[_dtype] = None) -> Tensor: + r""" + prod(input, *, dtype=None) -> Tensor + + Returns the product of all elements in the :attr:`input` tensor. + + Args: + input (Tensor): the input tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[-0.8020, 0.5428, -1.5854]]) + >>> torch.prod(a) + tensor(0.6902) + + .. function:: prod(input, dim, keepdim=False, *, dtype=None) -> Tensor + :noindex: + + Returns the product of each row of the :attr:`input` tensor in the given + dimension :attr:`dim`. + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in + the output tensor having 1 fewer dimension than :attr:`input`. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(4, 2) + >>> a + tensor([[ 0.5261, -0.3837], + [ 1.1857, -0.2498], + [-1.1646, 0.0705], + [ 1.1131, -1.0629]]) + >>> torch.prod(a, 1) + tensor([-0.2018, -0.2962, -0.0821, -1.1831]) + """ + ... +@overload +def prod(input: Tensor, dim: _int, keepdim: _bool = False, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + prod(input, *, dtype=None) -> Tensor + + Returns the product of all elements in the :attr:`input` tensor. + + Args: + input (Tensor): the input tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[-0.8020, 0.5428, -1.5854]]) + >>> torch.prod(a) + tensor(0.6902) + + .. function:: prod(input, dim, keepdim=False, *, dtype=None) -> Tensor + :noindex: + + Returns the product of each row of the :attr:`input` tensor in the given + dimension :attr:`dim`. 
+ + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in + the output tensor having 1 fewer dimension than :attr:`input`. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(4, 2) + >>> a + tensor([[ 0.5261, -0.3837], + [ 1.1857, -0.2498], + [-1.1646, 0.0705], + [ 1.1131, -1.0629]]) + >>> torch.prod(a, 1) + tensor([-0.2018, -0.2962, -0.0821, -1.1831]) + """ + ... +@overload +def prod(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + prod(input, *, dtype=None) -> Tensor + + Returns the product of all elements in the :attr:`input` tensor. + + Args: + input (Tensor): the input tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[-0.8020, 0.5428, -1.5854]]) + >>> torch.prod(a) + tensor(0.6902) + + .. function:: prod(input, dim, keepdim=False, *, dtype=None) -> Tensor + :noindex: + + Returns the product of each row of the :attr:`input` tensor in the given + dimension :attr:`dim`. + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in + the output tensor having 1 fewer dimension than :attr:`input`. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(4, 2) + >>> a + tensor([[ 0.5261, -0.3837], + [ 1.1857, -0.2498], + [-1.1646, 0.0705], + [ 1.1131, -1.0629]]) + >>> torch.prod(a, 1) + tensor([-0.2018, -0.2962, -0.0821, -1.1831]) + """ + ... +def promote_types(type1: _dtype, type2: _dtype) -> _dtype: + r""" + promote_types(type1, type2) -> dtype + + Returns the :class:`torch.dtype` with the smallest size and scalar kind that is + not smaller nor of lower kind than either `type1` or `type2`. See type promotion + :ref:`documentation ` for more information on the type + promotion logic. + + Args: + type1 (:class:`torch.dtype`) + type2 (:class:`torch.dtype`) + + Example:: + + >>> torch.promote_types(torch.int32, torch.float32) + torch.float32 + >>> torch.promote_types(torch.uint8, torch.long) + torch.long + """ + ... +def put(input: Tensor, index: Tensor, source: Tensor, accumulate: _bool = False) -> Tensor: ... +def q_per_channel_axis(input: Tensor) -> _int: ... 
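+# Editor's sketch (a comment added for illustration, not part of the generated stub):
+# the ``dtype`` keyword documented for torch.prod above lets the reduction accumulate
+# in a wider type, avoiding overflow of the input dtype:
+# >>> x = torch.full((40,), 100.0)          # float32
+# >>> torch.prod(x)                         # 100**40 overflows float32
+# tensor(inf)
+# >>> torch.prod(x, dtype=torch.float64)
+# tensor(1.0000e+80, dtype=torch.float64)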
+def q_per_channel_scales(input: Tensor) -> Tensor: ... +def q_per_channel_zero_points(input: Tensor) -> Tensor: ... +def q_scale(input: Tensor) -> _float: ... +def q_zero_point(input: Tensor) -> _int: ... +def qr(input: Tensor, some: _bool = True, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.qr: + r""" + qr(input, some=True, *, out=None) -> (Tensor, Tensor) + + Computes the QR decomposition of a matrix or a batch of matrices :attr:`input`, + and returns a namedtuple (Q, R) of tensors such that :math:`\text{input} = Q R` + with :math:`Q` being an orthogonal matrix or batch of orthogonal matrices and + :math:`R` being an upper triangular matrix or batch of upper triangular matrices. + + If :attr:`some` is ``True``, then this function returns the thin (reduced) QR factorization. + Otherwise, if :attr:`some` is ``False``, this function returns the complete QR factorization. + + .. warning:: + + :func:`torch.qr` is deprecated in favor of :func:`torch.linalg.qr` + and will be removed in a future PyTorch release. The boolean parameter :attr:`some` has been + replaced with a string parameter :attr:`mode`. + + ``Q, R = torch.qr(A)`` should be replaced with + + .. code:: python + + Q, R = torch.linalg.qr(A) + + ``Q, R = torch.qr(A, some=False)`` should be replaced with + + .. code:: python + + Q, R = torch.linalg.qr(A, mode="complete") + + .. warning:: + If you plan to backpropagate through QR, note that the current backward implementation + is only well-defined when the first :math:`\min(input.size(-1), input.size(-2))` + columns of :attr:`input` are linearly independent. + This behavior will probably change once QR supports pivoting. + + .. note:: This function uses LAPACK for CPU inputs and MAGMA for CUDA inputs, + and may produce different (valid) decompositions on different device types + or different platforms. + + Args: + input (Tensor): the input tensor of size :math:`(*, m, n)` where `*` is zero or more + batch dimensions consisting of matrices of dimension :math:`m \times n`. + some (bool, optional): Set to ``True`` for reduced QR decomposition and ``False`` for + complete QR decomposition. If `k = min(m, n)` then: + + * ``some=True`` : returns `(Q, R)` with dimensions (m, k), (k, n) (default) + + * ``'some=False'``: returns `(Q, R)` with dimensions (m, m), (m, n) + + Keyword args: + out (tuple, optional): tuple of `Q` and `R` tensors. + The dimensions of `Q` and `R` are detailed in the description of :attr:`some` above. + + Example:: + + >>> a = torch.tensor([[12., -51, 4], [6, 167, -68], [-4, 24, -41]]) + >>> q, r = torch.qr(a) + >>> q + tensor([[-0.8571, 0.3943, 0.3314], + [-0.4286, -0.9029, -0.0343], + [ 0.2857, -0.1714, 0.9429]]) + >>> r + tensor([[ -14.0000, -21.0000, 14.0000], + [ 0.0000, -175.0000, 70.0000], + [ 0.0000, 0.0000, -35.0000]]) + >>> torch.mm(q, r).round() + tensor([[ 12., -51., 4.], + [ 6., 167., -68.], + [ -4., 24., -41.]]) + >>> torch.mm(q.t(), q).round() + tensor([[ 1., 0., 0.], + [ 0., 1., -0.], + [ 0., -0., 1.]]) + >>> a = torch.randn(3, 4, 5) + >>> q, r = torch.qr(a, some=False) + >>> torch.allclose(torch.matmul(q, r), a) + True + >>> torch.allclose(torch.matmul(q.mT, q), torch.eye(5)) + True + """ + ... 
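+# Editor's sketch (a comment added for illustration, not part of the generated stub):
+# q_scale / q_zero_point above read back the quantization parameters of a per-tensor
+# quantized tensor, e.g.:
+# >>> qx = torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0]), 0.1, 10, torch.quint8)
+# >>> torch.q_scale(qx), torch.q_zero_point(qx)
+# (0.1, 10)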
+@overload +def quantile(input: Tensor, q: Tensor, dim: Optional[_int] = None, keepdim: _bool = False, *, interpolation: str = "linear", out: Optional[Tensor] = None) -> Tensor: + r""" + quantile(input, q, dim=None, keepdim=False, *, interpolation='linear', out=None) -> Tensor + + Computes the q-th quantiles of each row of the :attr:`input` tensor along the dimension :attr:`dim`. + + To compute the quantile, we map q in [0, 1] to the range of indices [0, n] to find the location + of the quantile in the sorted input. If the quantile lies between two data points ``a < b`` with + indices ``i`` and ``j`` in the sorted order, result is computed according to the given + :attr:`interpolation` method as follows: + + - ``linear``: ``a + (b - a) * fraction``, where ``fraction`` is the fractional part of the computed quantile index. + - ``lower``: ``a``. + - ``higher``: ``b``. + - ``nearest``: ``a`` or ``b``, whichever's index is closer to the computed quantile index (rounding down for .5 fractions). + - ``midpoint``: ``(a + b) / 2``. + + If :attr:`q` is a 1D tensor, the first dimension of the output represents the quantiles and has size + equal to the size of :attr:`q`, the remaining dimensions are what remains from the reduction. + + .. note:: + By default :attr:`dim` is ``None`` resulting in the :attr:`input` tensor being flattened before computation. + + Args: + input (Tensor): the input tensor. + q (float or Tensor): a scalar or 1D tensor of values in the range [0, 1]. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword arguments: + interpolation (str): interpolation method to use when the desired quantile lies between two data points. + Can be ``linear``, ``lower``, ``higher``, ``midpoint`` and ``nearest``. + Default is ``linear``. + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(2, 3) + >>> a + tensor([[ 0.0795, -1.2117, 0.9765], + [ 1.1707, 0.6706, 0.4884]]) + >>> q = torch.tensor([0.25, 0.5, 0.75]) + >>> torch.quantile(a, q, dim=1, keepdim=True) + tensor([[[-0.5661], + [ 0.5795]], + + [[ 0.0795], + [ 0.6706]], + + [[ 0.5280], + [ 0.9206]]]) + >>> torch.quantile(a, q, dim=1, keepdim=True).shape + torch.Size([3, 2, 1]) + >>> a = torch.arange(4.) + >>> a + tensor([0., 1., 2., 3.]) + >>> torch.quantile(a, 0.6, interpolation='linear') + tensor(1.8000) + >>> torch.quantile(a, 0.6, interpolation='lower') + tensor(1.) + >>> torch.quantile(a, 0.6, interpolation='higher') + tensor(2.) + >>> torch.quantile(a, 0.6, interpolation='midpoint') + tensor(1.5000) + >>> torch.quantile(a, 0.6, interpolation='nearest') + tensor(2.) + >>> torch.quantile(a, 0.4, interpolation='nearest') + tensor(1.) + """ + ... +@overload +def quantile(input: Tensor, q: _float, dim: Optional[_int] = None, keepdim: _bool = False, *, interpolation: str = "linear", out: Optional[Tensor] = None) -> Tensor: + r""" + quantile(input, q, dim=None, keepdim=False, *, interpolation='linear', out=None) -> Tensor + + Computes the q-th quantiles of each row of the :attr:`input` tensor along the dimension :attr:`dim`. + + To compute the quantile, we map q in [0, 1] to the range of indices [0, n] to find the location + of the quantile in the sorted input. 
If the quantile lies between two data points ``a < b`` with + indices ``i`` and ``j`` in the sorted order, result is computed according to the given + :attr:`interpolation` method as follows: + + - ``linear``: ``a + (b - a) * fraction``, where ``fraction`` is the fractional part of the computed quantile index. + - ``lower``: ``a``. + - ``higher``: ``b``. + - ``nearest``: ``a`` or ``b``, whichever's index is closer to the computed quantile index (rounding down for .5 fractions). + - ``midpoint``: ``(a + b) / 2``. + + If :attr:`q` is a 1D tensor, the first dimension of the output represents the quantiles and has size + equal to the size of :attr:`q`, the remaining dimensions are what remains from the reduction. + + .. note:: + By default :attr:`dim` is ``None`` resulting in the :attr:`input` tensor being flattened before computation. + + Args: + input (Tensor): the input tensor. + q (float or Tensor): a scalar or 1D tensor of values in the range [0, 1]. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword arguments: + interpolation (str): interpolation method to use when the desired quantile lies between two data points. + Can be ``linear``, ``lower``, ``higher``, ``midpoint`` and ``nearest``. + Default is ``linear``. + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(2, 3) + >>> a + tensor([[ 0.0795, -1.2117, 0.9765], + [ 1.1707, 0.6706, 0.4884]]) + >>> q = torch.tensor([0.25, 0.5, 0.75]) + >>> torch.quantile(a, q, dim=1, keepdim=True) + tensor([[[-0.5661], + [ 0.5795]], + + [[ 0.0795], + [ 0.6706]], + + [[ 0.5280], + [ 0.9206]]]) + >>> torch.quantile(a, q, dim=1, keepdim=True).shape + torch.Size([3, 2, 1]) + >>> a = torch.arange(4.) + >>> a + tensor([0., 1., 2., 3.]) + >>> torch.quantile(a, 0.6, interpolation='linear') + tensor(1.8000) + >>> torch.quantile(a, 0.6, interpolation='lower') + tensor(1.) + >>> torch.quantile(a, 0.6, interpolation='higher') + tensor(2.) + >>> torch.quantile(a, 0.6, interpolation='midpoint') + tensor(1.5000) + >>> torch.quantile(a, 0.6, interpolation='nearest') + tensor(2.) + >>> torch.quantile(a, 0.4, interpolation='nearest') + tensor(1.) + """ + ... +def quantize_per_channel(input: Tensor, scales: Tensor, zero_points: Tensor, axis: _int, dtype: _dtype) -> Tensor: + r""" + quantize_per_channel(input, scales, zero_points, axis, dtype) -> Tensor + + Converts a float tensor to a per-channel quantized tensor with given scales and zero points. + + Arguments: + input (Tensor): float tensor to quantize + scales (Tensor): float 1D tensor of scales to use, size should match ``input.size(axis)`` + zero_points (int): integer 1D tensor of offset to use, size should match ``input.size(axis)`` + axis (int): dimension on which apply per-channel quantization + dtype (:class:`torch.dtype`): the desired data type of returned tensor. 
+ Has to be one of the quantized dtypes: ``torch.quint8``, ``torch.qint8``, ``torch.qint32`` + + Returns: + Tensor: A newly quantized tensor + + Example:: + + >>> x = torch.tensor([[-1.0, 0.0], [1.0, 2.0]]) + >>> torch.quantize_per_channel(x, torch.tensor([0.1, 0.01]), torch.tensor([10, 0]), 0, torch.quint8) + tensor([[-1., 0.], + [ 1., 2.]], size=(2, 2), dtype=torch.quint8, + quantization_scheme=torch.per_channel_affine, + scale=tensor([0.1000, 0.0100], dtype=torch.float64), + zero_point=tensor([10, 0]), axis=0) + >>> torch.quantize_per_channel(x, torch.tensor([0.1, 0.01]), torch.tensor([10, 0]), 0, torch.quint8).int_repr() + tensor([[ 0, 10], + [100, 200]], dtype=torch.uint8) + """ + ... +@overload +def quantize_per_tensor(input: Tensor, scale: Tensor, zero_point: Tensor, dtype: _dtype) -> Tensor: + r""" + quantize_per_tensor(input, scale, zero_point, dtype) -> Tensor + + Converts a float tensor to a quantized tensor with given scale and zero point. + + Arguments: + input (Tensor): float tensor or list of tensors to quantize + scale (float or Tensor): scale to apply in quantization formula + zero_point (int or Tensor): offset in integer value that maps to float zero + dtype (:class:`torch.dtype`): the desired data type of returned tensor. + Has to be one of the quantized dtypes: ``torch.quint8``, ``torch.qint8``, ``torch.qint32`` + + Returns: + Tensor: A newly quantized tensor or list of quantized tensors. + + Example:: + + >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10, torch.quint8) + tensor([-1., 0., 1., 2.], size=(4,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.1, zero_point=10) + >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10, torch.quint8).int_repr() + tensor([ 0, 10, 20, 30], dtype=torch.uint8) + >>> torch.quantize_per_tensor([torch.tensor([-1.0, 0.0]), torch.tensor([-2.0, 2.0])], + >>> torch.tensor([0.1, 0.2]), torch.tensor([10, 20]), torch.quint8) + (tensor([-1., 0.], size=(2,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.1, zero_point=10), + tensor([-2., 2.], size=(2,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.2, zero_point=20)) + >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), torch.tensor(0.1), torch.tensor(10), torch.quint8) + tensor([-1., 0., 1., 2.], size=(4,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.10, zero_point=10) + """ + ... +@overload +def quantize_per_tensor(input: Tensor, scale: _float, zero_point: _int, dtype: _dtype) -> Tensor: + r""" + quantize_per_tensor(input, scale, zero_point, dtype) -> Tensor + + Converts a float tensor to a quantized tensor with given scale and zero point. + + Arguments: + input (Tensor): float tensor or list of tensors to quantize + scale (float or Tensor): scale to apply in quantization formula + zero_point (int or Tensor): offset in integer value that maps to float zero + dtype (:class:`torch.dtype`): the desired data type of returned tensor. + Has to be one of the quantized dtypes: ``torch.quint8``, ``torch.qint8``, ``torch.qint32`` + + Returns: + Tensor: A newly quantized tensor or list of quantized tensors. 
+ + Example:: + + >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10, torch.quint8) + tensor([-1., 0., 1., 2.], size=(4,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.1, zero_point=10) + >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10, torch.quint8).int_repr() + tensor([ 0, 10, 20, 30], dtype=torch.uint8) + >>> torch.quantize_per_tensor([torch.tensor([-1.0, 0.0]), torch.tensor([-2.0, 2.0])], + >>> torch.tensor([0.1, 0.2]), torch.tensor([10, 20]), torch.quint8) + (tensor([-1., 0.], size=(2,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.1, zero_point=10), + tensor([-2., 2.], size=(2,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.2, zero_point=20)) + >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), torch.tensor(0.1), torch.tensor(10), torch.quint8) + tensor([-1., 0., 1., 2.], size=(4,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.10, zero_point=10) + """ + ... +@overload +def quantize_per_tensor(tensors: Union[Tuple[Tensor, ...], List[Tensor]], scales: Tensor, zero_points: Tensor, dtype: _dtype) -> Tuple[Tensor, ...]: + r""" + quantize_per_tensor(input, scale, zero_point, dtype) -> Tensor + + Converts a float tensor to a quantized tensor with given scale and zero point. + + Arguments: + input (Tensor): float tensor or list of tensors to quantize + scale (float or Tensor): scale to apply in quantization formula + zero_point (int or Tensor): offset in integer value that maps to float zero + dtype (:class:`torch.dtype`): the desired data type of returned tensor. + Has to be one of the quantized dtypes: ``torch.quint8``, ``torch.qint8``, ``torch.qint32`` + + Returns: + Tensor: A newly quantized tensor or list of quantized tensors. + + Example:: + + >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10, torch.quint8) + tensor([-1., 0., 1., 2.], size=(4,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.1, zero_point=10) + >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10, torch.quint8).int_repr() + tensor([ 0, 10, 20, 30], dtype=torch.uint8) + >>> torch.quantize_per_tensor([torch.tensor([-1.0, 0.0]), torch.tensor([-2.0, 2.0])], + >>> torch.tensor([0.1, 0.2]), torch.tensor([10, 20]), torch.quint8) + (tensor([-1., 0.], size=(2,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.1, zero_point=10), + tensor([-2., 2.], size=(2,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.2, zero_point=20)) + >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), torch.tensor(0.1), torch.tensor(10), torch.quint8) + tensor([-1., 0., 1., 2.], size=(4,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.10, zero_point=10) + """ + ... +def quantize_per_tensor_dynamic(input: Tensor, dtype: _dtype, reduce_range: _bool) -> Tensor: + r""" + quantize_per_tensor_dynamic(input, dtype, reduce_range) -> Tensor + + Converts a float tensor to a quantized tensor with scale and zero_point calculated + dynamically based on the input. + + Arguments: + input (Tensor): float tensor or list of tensors to quantize + dtype (:class:`torch.dtype`): the desired data type of returned tensor. 
+        Has to be one of the quantized dtypes: ``torch.quint8``, ``torch.qint8``
+        reduce_range (bool): a flag indicating whether to reduce the range of quantized
+                             data by 1 bit; this is required to avoid instruction overflow on some hardware
+
+    Returns:
+        Tensor: A newly (dynamically) quantized tensor
+
+    Example::
+
+        >>> t = torch.quantize_per_tensor_dynamic(torch.tensor([-1.0, 0.0, 1.0, 2.0]), torch.quint8, False)
+        >>> print(t)
+        tensor([-1., 0., 1., 2.], size=(4,), dtype=torch.quint8,
+               quantization_scheme=torch.per_tensor_affine, scale=0.011764705882352941,
+               zero_point=85)
+        >>> t.int_repr()
+        tensor([ 0, 85, 170, 255], dtype=torch.uint8)
+    """
+    ...
+def quantized_batch_norm(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], mean: Tensor, var: Tensor, eps: _float, output_scale: _float, output_zero_point: _int) -> Tensor:
+    r"""
+    quantized_batch_norm(input, weight=None, bias=None, mean, var, eps, output_scale, output_zero_point) -> Tensor
+
+    Applies batch normalization on a 4D (NCHW) quantized tensor.
+
+    .. math::
+
+        y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta
+
+    Arguments:
+        input (Tensor): quantized tensor
+        weight (Tensor): float tensor that corresponds to the gamma, size C
+        bias (Tensor): float tensor that corresponds to the beta, size C
+        mean (Tensor): float mean value in batch normalization, size C
+        var (Tensor): float tensor for variance, size C
+        eps (float): a value added to the denominator for numerical stability.
+        output_scale (float): output quantized tensor scale
+        output_zero_point (int): output quantized tensor zero_point
+
+    Returns:
+        Tensor: A quantized tensor with batch normalization applied.
+
+    Example::
+
+        >>> qx = torch.quantize_per_tensor(torch.rand(2, 2, 2, 2), 1.5, 3, torch.quint8)
+        >>> torch.quantized_batch_norm(qx, torch.ones(2), torch.zeros(2), torch.rand(2), torch.rand(2), 0.00001, 0.2, 2)
+        tensor([[[[-0.2000, -0.2000],
+                  [ 1.6000, -0.2000]],
+
+                 [[-0.4000, -0.4000],
+                  [-0.4000,  0.6000]]],
+
+
+                [[[-0.2000, -0.2000],
+                  [-0.2000, -0.2000]],
+
+                 [[ 0.6000, -0.4000],
+                  [ 0.6000, -0.4000]]]], size=(2, 2, 2, 2), dtype=torch.quint8,
+               quantization_scheme=torch.per_tensor_affine, scale=0.2, zero_point=2)
+    """
+    ...
+def quantized_gru_cell(input: Tensor, hx: Tensor, w_ih: Tensor, w_hh: Tensor, b_ih: Tensor, b_hh: Tensor, packed_ih: Tensor, packed_hh: Tensor, col_offsets_ih: Tensor, col_offsets_hh: Tensor, scale_ih: Union[Number, _complex], scale_hh: Union[Number, _complex], zero_point_ih: Union[Number, _complex], zero_point_hh: Union[Number, _complex]) -> Tensor: ...
+def quantized_lstm_cell(input: Tensor, hx: Union[Tuple[Tensor, ...], List[Tensor]], w_ih: Tensor, w_hh: Tensor, b_ih: Tensor, b_hh: Tensor, packed_ih: Tensor, packed_hh: Tensor, col_offsets_ih: Tensor, col_offsets_hh: Tensor, scale_ih: Union[Number, _complex], scale_hh: Union[Number, _complex], zero_point_ih: Union[Number, _complex], zero_point_hh: Union[Number, _complex]) -> Tuple[Tensor, Tensor]: ...
+def quantized_max_pool1d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tensor:
+    r"""
+    quantized_max_pool1d(input, kernel_size, stride=[], padding=0, dilation=1, ceil_mode=False) -> Tensor
+
+    Applies a 1D max pooling over an input quantized tensor composed of several input planes.
+ + Arguments: + input (Tensor): quantized tensor + kernel_size (list of int): the size of the sliding window + stride (``list of int``, optional): the stride of the sliding window + padding (``list of int``, optional): padding to be added on both sides, must be >= 0 and <= kernel_size / 2 + dilation (``list of int``, optional): The stride between elements within a sliding window, must be > 0. Default 1 + ceil_mode (bool, optional): If True, will use ceil instead of floor to compute the output shape. + Defaults to False. + + + Returns: + Tensor: A quantized tensor with max_pool1d applied. + + Example:: + + >>> qx = torch.quantize_per_tensor(torch.rand(2, 2), 1.5, 3, torch.quint8) + >>> torch.quantized_max_pool1d(qx, [2]) + tensor([[0.0000], + [1.5000]], size=(2, 1), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=1.5, zero_point=3) + """ + ... +def quantized_max_pool2d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tensor: + r""" + quantized_max_pool2d(input, kernel_size, stride=[], padding=0, dilation=1, ceil_mode=False) -> Tensor + + Applies a 2D max pooling over an input quantized tensor composed of several input planes. + + Arguments: + input (Tensor): quantized tensor + kernel_size (``list of int``): the size of the sliding window + stride (``list of int``, optional): the stride of the sliding window + padding (``list of int``, optional): padding to be added on both sides, must be >= 0 and <= kernel_size / 2 + dilation (``list of int``, optional): The stride between elements within a sliding window, must be > 0. Default 1 + ceil_mode (bool, optional): If True, will use ceil instead of floor to compute the output shape. + Defaults to False. + + + Returns: + Tensor: A quantized tensor with max_pool2d applied. + + Example:: + + >>> qx = torch.quantize_per_tensor(torch.rand(2, 2, 2, 2), 1.5, 3, torch.quint8) + >>> torch.quantized_max_pool2d(qx, [2,2]) + tensor([[[[1.5000]], + + [[1.5000]]], + + + [[[0.0000]], + + [[0.0000]]]], size=(2, 2, 1, 1), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=1.5, zero_point=3) + """ + ... +def quantized_max_pool3d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tensor: ... +def quantized_rnn_relu_cell(input: Tensor, hx: Tensor, w_ih: Tensor, w_hh: Tensor, b_ih: Tensor, b_hh: Tensor, packed_ih: Tensor, packed_hh: Tensor, col_offsets_ih: Tensor, col_offsets_hh: Tensor, scale_ih: Union[Number, _complex], scale_hh: Union[Number, _complex], zero_point_ih: Union[Number, _complex], zero_point_hh: Union[Number, _complex]) -> Tensor: ... +def quantized_rnn_tanh_cell(input: Tensor, hx: Tensor, w_ih: Tensor, w_hh: Tensor, b_ih: Tensor, b_hh: Tensor, packed_ih: Tensor, packed_hh: Tensor, col_offsets_ih: Tensor, col_offsets_hh: Tensor, scale_ih: Union[Number, _complex], scale_hh: Union[Number, _complex], zero_point_ih: Union[Number, _complex], zero_point_hh: Union[Number, _complex]) -> Tensor: ... +def rad2deg(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + rad2deg(input, *, out=None) -> Tensor + + Returns a new tensor with each of the elements of :attr:`input` + converted from angles in radians to degrees. + + Args: + input (Tensor): the input tensor. + + Keyword arguments: + out (Tensor, optional): the output tensor. 
+ + Example:: + + >>> a = torch.tensor([[3.142, -3.142], [6.283, -6.283], [1.570, -1.570]]) + >>> torch.rad2deg(a) + tensor([[ 180.0233, -180.0233], + [ 359.9894, -359.9894], + [ 89.9544, -89.9544]]) + """ + ... +def rad2deg_(input: Tensor) -> Tensor: ... +@overload +def rand(size: Sequence[Union[_int, SymInt]], *, generator: Optional[Generator], names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Returns a tensor filled with random numbers from a uniform distribution + on the interval :math:`[0, 1)` + + The shape of the tensor is defined by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.rand(4) + tensor([ 0.5204, 0.2503, 0.3525, 0.5673]) + >>> torch.rand(2, 3) + tensor([[ 0.8237, 0.5781, 0.6879], + [ 0.3816, 0.7249, 0.0998]]) + """ + ... +@overload +def rand(*size: _int, generator: Optional[Generator], names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Returns a tensor filled with random numbers from a uniform distribution + on the interval :math:`[0, 1)` + + The shape of the tensor is defined by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. 
+ Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.rand(4) + tensor([ 0.5204, 0.2503, 0.3525, 0.5673]) + >>> torch.rand(2, 3) + tensor([[ 0.8237, 0.5781, 0.6879], + [ 0.3816, 0.7249, 0.0998]]) + """ + ... +@overload +def rand(size: Sequence[Union[_int, SymInt]], *, generator: Optional[Generator], out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Returns a tensor filled with random numbers from a uniform distribution + on the interval :math:`[0, 1)` + + The shape of the tensor is defined by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.rand(4) + tensor([ 0.5204, 0.2503, 0.3525, 0.5673]) + >>> torch.rand(2, 3) + tensor([[ 0.8237, 0.5781, 0.6879], + [ 0.3816, 0.7249, 0.0998]]) + """ + ... +@overload +def rand(*size: _int, generator: Optional[Generator], out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Returns a tensor filled with random numbers from a uniform distribution + on the interval :math:`[0, 1)` + + The shape of the tensor is defined by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. 
+ Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.rand(4) + tensor([ 0.5204, 0.2503, 0.3525, 0.5673]) + >>> torch.rand(2, 3) + tensor([[ 0.8237, 0.5781, 0.6879], + [ 0.3816, 0.7249, 0.0998]]) + """ + ... +@overload +def rand(size: Sequence[Union[_int, SymInt]], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Returns a tensor filled with random numbers from a uniform distribution + on the interval :math:`[0, 1)` + + The shape of the tensor is defined by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.rand(4) + tensor([ 0.5204, 0.2503, 0.3525, 0.5673]) + >>> torch.rand(2, 3) + tensor([[ 0.8237, 0.5781, 0.6879], + [ 0.3816, 0.7249, 0.0998]]) + """ + ... 
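+# A minimal usage sketch of the `generator` keyword documented above (illustrative only,
+# not part of the generated stub; assumes a default torch build). Rewinding a seeded
+# torch.Generator replays the same uniform draw:
+#
+#     g = torch.Generator().manual_seed(0)
+#     x = torch.rand(2, 3, generator=g)          # uniform samples in [0, 1)
+#     g.manual_seed(0)                           # rewind the generator state
+#     assert torch.equal(x, torch.rand(2, 3, generator=g))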
+@overload +def rand(*size: _int, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Returns a tensor filled with random numbers from a uniform distribution + on the interval :math:`[0, 1)` + + The shape of the tensor is defined by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.rand(4) + tensor([ 0.5204, 0.2503, 0.3525, 0.5673]) + >>> torch.rand(2, 3) + tensor([[ 0.8237, 0.5781, 0.6879], + [ 0.3816, 0.7249, 0.0998]]) + """ + ... +@overload +def rand(size: Sequence[Union[_int, SymInt]], *, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Returns a tensor filled with random numbers from a uniform distribution + on the interval :math:`[0, 1)` + + The shape of the tensor is defined by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. 
+ requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.rand(4) + tensor([ 0.5204, 0.2503, 0.3525, 0.5673]) + >>> torch.rand(2, 3) + tensor([[ 0.8237, 0.5781, 0.6879], + [ 0.3816, 0.7249, 0.0998]]) + """ + ... +@overload +def rand(*size: _int, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Returns a tensor filled with random numbers from a uniform distribution + on the interval :math:`[0, 1)` + + The shape of the tensor is defined by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.rand(4) + tensor([ 0.5204, 0.2503, 0.3525, 0.5673]) + >>> torch.rand(2, 3) + tensor([[ 0.8237, 0.5781, 0.6879], + [ 0.3816, 0.7249, 0.0998]]) + """ + ... +def rand_like(input: Tensor, *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + rand_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor + + Returns a tensor with the same size as :attr:`input` that is filled with + random numbers from a uniform distribution on the interval :math:`[0, 1)`. + ``torch.rand_like(input)`` is equivalent to + ``torch.rand(input.size(), dtype=input.dtype, layout=input.layout, device=input.device)``. + + Args: + input (Tensor): the size of :attr:`input` will determine size of the output tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor. + Default: if ``None``, defaults to the dtype of :attr:`input`. + layout (:class:`torch.layout`, optional): the desired layout of returned tensor. + Default: if ``None``, defaults to the layout of :attr:`input`. 
+ device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, defaults to the device of :attr:`input`. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + """ + ... +@overload +def randint(low: _int, high: _int, size: _size, *, generator: Optional[Generator] = None, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + randint(low=0, high, size, \*, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with random integers generated uniformly + between :attr:`low` (inclusive) and :attr:`high` (exclusive). + + The shape of the tensor is defined by the variable argument :attr:`size`. + + .. note:: + With the global dtype default (``torch.float32``), this function returns + a tensor with dtype ``torch.int64``. + + Args: + low (int, optional): Lowest integer to be drawn from the distribution. Default: 0. + high (int): One above the highest integer to be drawn from the distribution. + size (tuple): a tuple defining the shape of the output tensor. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (`torch.dtype`, optional) - the desired data type of returned tensor. Default: if ``None``, + this function returns a tensor with dtype ``torch.int64``. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.randint(3, 5, (3,)) + tensor([4, 3, 4]) + + + >>> torch.randint(10, (2, 2)) + tensor([[0, 2], + [5, 5]]) + + + >>> torch.randint(3, 10, (2, 2)) + tensor([[4, 5], + [6, 7]]) + """ + ... +@overload +def randint(high: _int, size: _size, *, generator: Optional[Generator] = None, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + randint(low=0, high, size, \*, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with random integers generated uniformly + between :attr:`low` (inclusive) and :attr:`high` (exclusive). + + The shape of the tensor is defined by the variable argument :attr:`size`. + + .. note:: + With the global dtype default (``torch.float32``), this function returns + a tensor with dtype ``torch.int64``. + + Args: + low (int, optional): Lowest integer to be drawn from the distribution. Default: 0. + high (int): One above the highest integer to be drawn from the distribution. + size (tuple): a tuple defining the shape of the output tensor. 
+ + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (`torch.dtype`, optional) - the desired data type of returned tensor. Default: if ``None``, + this function returns a tensor with dtype ``torch.int64``. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.randint(3, 5, (3,)) + tensor([4, 3, 4]) + + + >>> torch.randint(10, (2, 2)) + tensor([[0, 2], + [5, 5]]) + + + >>> torch.randint(3, 10, (2, 2)) + tensor([[4, 5], + [6, 7]]) + """ + ... +@overload +def randint(high: Union[_int, SymInt], size: Sequence[Union[_int, SymInt]], *, generator: Optional[Generator], out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randint(low=0, high, size, \*, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with random integers generated uniformly + between :attr:`low` (inclusive) and :attr:`high` (exclusive). + + The shape of the tensor is defined by the variable argument :attr:`size`. + + .. note:: + With the global dtype default (``torch.float32``), this function returns + a tensor with dtype ``torch.int64``. + + Args: + low (int, optional): Lowest integer to be drawn from the distribution. Default: 0. + high (int): One above the highest integer to be drawn from the distribution. + size (tuple): a tuple defining the shape of the output tensor. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (`torch.dtype`, optional) - the desired data type of returned tensor. Default: if ``None``, + this function returns a tensor with dtype ``torch.int64``. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.randint(3, 5, (3,)) + tensor([4, 3, 4]) + + + >>> torch.randint(10, (2, 2)) + tensor([[0, 2], + [5, 5]]) + + + >>> torch.randint(3, 10, (2, 2)) + tensor([[4, 5], + [6, 7]]) + """ + ... 
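+# A small sketch of the randint semantics documented above (illustrative only, not part of
+# the generated stub): values are drawn from the half-open interval [low, high), the default
+# dtype is torch.int64, and another integer dtype can be requested explicitly:
+#
+#     x = torch.randint(3, 10, (2, 2))                     # values in [3, 10)
+#     y = torch.randint(0, 256, (4,), dtype=torch.uint8)   # explicit dtype
+#     assert x.dtype == torch.int64 and y.dtype == torch.uint8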
+@overload +def randint(high: Union[_int, SymInt], size: Sequence[Union[_int, SymInt]], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randint(low=0, high, size, \*, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with random integers generated uniformly + between :attr:`low` (inclusive) and :attr:`high` (exclusive). + + The shape of the tensor is defined by the variable argument :attr:`size`. + + .. note:: + With the global dtype default (``torch.float32``), this function returns + a tensor with dtype ``torch.int64``. + + Args: + low (int, optional): Lowest integer to be drawn from the distribution. Default: 0. + high (int): One above the highest integer to be drawn from the distribution. + size (tuple): a tuple defining the shape of the output tensor. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (`torch.dtype`, optional) - the desired data type of returned tensor. Default: if ``None``, + this function returns a tensor with dtype ``torch.int64``. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.randint(3, 5, (3,)) + tensor([4, 3, 4]) + + + >>> torch.randint(10, (2, 2)) + tensor([[0, 2], + [5, 5]]) + + + >>> torch.randint(3, 10, (2, 2)) + tensor([[4, 5], + [6, 7]]) + """ + ... +@overload +def randint(low: Union[_int, SymInt], high: Union[_int, SymInt], size: Sequence[Union[_int, SymInt]], *, generator: Optional[Generator], out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randint(low=0, high, size, \*, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with random integers generated uniformly + between :attr:`low` (inclusive) and :attr:`high` (exclusive). + + The shape of the tensor is defined by the variable argument :attr:`size`. + + .. note:: + With the global dtype default (``torch.float32``), this function returns + a tensor with dtype ``torch.int64``. + + Args: + low (int, optional): Lowest integer to be drawn from the distribution. Default: 0. + high (int): One above the highest integer to be drawn from the distribution. + size (tuple): a tuple defining the shape of the output tensor. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (`torch.dtype`, optional) - the desired data type of returned tensor. Default: if ``None``, + this function returns a tensor with dtype ``torch.int64``. 
+ layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.randint(3, 5, (3,)) + tensor([4, 3, 4]) + + + >>> torch.randint(10, (2, 2)) + tensor([[0, 2], + [5, 5]]) + + + >>> torch.randint(3, 10, (2, 2)) + tensor([[4, 5], + [6, 7]]) + """ + ... +@overload +def randint(low: Union[_int, SymInt], high: Union[_int, SymInt], size: Sequence[Union[_int, SymInt]], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randint(low=0, high, size, \*, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with random integers generated uniformly + between :attr:`low` (inclusive) and :attr:`high` (exclusive). + + The shape of the tensor is defined by the variable argument :attr:`size`. + + .. note:: + With the global dtype default (``torch.float32``), this function returns + a tensor with dtype ``torch.int64``. + + Args: + low (int, optional): Lowest integer to be drawn from the distribution. Default: 0. + high (int): One above the highest integer to be drawn from the distribution. + size (tuple): a tuple defining the shape of the output tensor. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (`torch.dtype`, optional) - the desired data type of returned tensor. Default: if ``None``, + this function returns a tensor with dtype ``torch.int64``. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.randint(3, 5, (3,)) + tensor([4, 3, 4]) + + + >>> torch.randint(10, (2, 2)) + tensor([[0, 2], + [5, 5]]) + + + >>> torch.randint(3, 10, (2, 2)) + tensor([[4, 5], + [6, 7]]) + """ + ... 
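+# A sketch of the `out=` keyword documented above (illustrative only, not part of the
+# generated stub), using a preallocated torch.int64 buffer to match the default result dtype:
+#
+#     buf = torch.empty(2, 2, dtype=torch.int64)
+#     torch.randint(0, 5, (2, 2), out=buf)   # fills buf in place with values in [0, 5)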
+@overload +def randint_like(input: Tensor, high: Union[_int, SymInt], *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randint_like(input, low=0, high, \*, dtype=None, layout=torch.strided, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor + + Returns a tensor with the same shape as Tensor :attr:`input` filled with + random integers generated uniformly between :attr:`low` (inclusive) and + :attr:`high` (exclusive). + + .. note: + With the global dtype default (``torch.float32``), this function returns + a tensor with dtype ``torch.int64``. + + Args: + input (Tensor): the size of :attr:`input` will determine size of the output tensor. + low (int, optional): Lowest integer to be drawn from the distribution. Default: 0. + high (int): One above the highest integer to be drawn from the distribution. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor. + Default: if ``None``, defaults to the dtype of :attr:`input`. + layout (:class:`torch.layout`, optional): the desired layout of returned tensor. + Default: if ``None``, defaults to the layout of :attr:`input`. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, defaults to the device of :attr:`input`. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + """ + ... +@overload +def randint_like(input: Tensor, low: Union[_int, SymInt], high: Union[_int, SymInt], *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randint_like(input, low=0, high, \*, dtype=None, layout=torch.strided, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor + + Returns a tensor with the same shape as Tensor :attr:`input` filled with + random integers generated uniformly between :attr:`low` (inclusive) and + :attr:`high` (exclusive). + + .. note: + With the global dtype default (``torch.float32``), this function returns + a tensor with dtype ``torch.int64``. + + Args: + input (Tensor): the size of :attr:`input` will determine size of the output tensor. + low (int, optional): Lowest integer to be drawn from the distribution. Default: 0. + high (int): One above the highest integer to be drawn from the distribution. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor. + Default: if ``None``, defaults to the dtype of :attr:`input`. + layout (:class:`torch.layout`, optional): the desired layout of returned tensor. + Default: if ``None``, defaults to the layout of :attr:`input`. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, defaults to the device of :attr:`input`. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. 
Default: ``torch.preserve_format``. + """ + ... +@overload +def randn(size: Sequence[Union[_int, SymInt]], *, generator: Optional[Generator], names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + + Returns a tensor filled with random numbers from a normal distribution + with mean `0` and variance `1` (also called the standard normal + distribution). + + .. math:: + \text{out}_{i} \sim \mathcal{N}(0, 1) + + For complex dtypes, the tensor is i.i.d. sampled from a `complex normal distribution`_ with zero mean and + unit variance as + + .. math:: + \text{out}_{i} \sim \mathcal{CN}(0, 1) + + This is equivalent to separately sampling the real :math:`(\operatorname{Re})` and imaginary + :math:`(\operatorname{Im})` part of :math:`\text{out}_i` as + + .. math:: + \operatorname{Re}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}),\quad + \operatorname{Im}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}) + + The shape of the tensor is defined by the variable argument :attr:`size`. + + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.randn(4) + tensor([-2.1436, 0.9966, 2.3426, -0.6366]) + >>> torch.randn(2, 3) + tensor([[ 1.5954, 2.8929, -1.0923], + [ 1.1719, -0.4709, -0.1996]]) + + .. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution + """ + ... +@overload +def randn(*size: _int, generator: Optional[Generator], names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + + Returns a tensor filled with random numbers from a normal distribution + with mean `0` and variance `1` (also called the standard normal + distribution). + + .. math:: + \text{out}_{i} \sim \mathcal{N}(0, 1) + + For complex dtypes, the tensor is i.i.d. 
sampled from a `complex normal distribution`_ with zero mean and + unit variance as + + .. math:: + \text{out}_{i} \sim \mathcal{CN}(0, 1) + + This is equivalent to separately sampling the real :math:`(\operatorname{Re})` and imaginary + :math:`(\operatorname{Im})` part of :math:`\text{out}_i` as + + .. math:: + \operatorname{Re}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}),\quad + \operatorname{Im}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}) + + The shape of the tensor is defined by the variable argument :attr:`size`. + + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.randn(4) + tensor([-2.1436, 0.9966, 2.3426, -0.6366]) + >>> torch.randn(2, 3) + tensor([[ 1.5954, 2.8929, -1.0923], + [ 1.1719, -0.4709, -0.1996]]) + + .. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution + """ + ... +@overload +def randn(size: Sequence[Union[_int, SymInt]], *, generator: Optional[Generator], out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + + Returns a tensor filled with random numbers from a normal distribution + with mean `0` and variance `1` (also called the standard normal + distribution). + + .. math:: + \text{out}_{i} \sim \mathcal{N}(0, 1) + + For complex dtypes, the tensor is i.i.d. sampled from a `complex normal distribution`_ with zero mean and + unit variance as + + .. math:: + \text{out}_{i} \sim \mathcal{CN}(0, 1) + + This is equivalent to separately sampling the real :math:`(\operatorname{Re})` and imaginary + :math:`(\operatorname{Im})` part of :math:`\text{out}_i` as + + .. math:: + \operatorname{Re}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}),\quad + \operatorname{Im}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}) + + The shape of the tensor is defined by the variable argument :attr:`size`. + + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. 
+ + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.randn(4) + tensor([-2.1436, 0.9966, 2.3426, -0.6366]) + >>> torch.randn(2, 3) + tensor([[ 1.5954, 2.8929, -1.0923], + [ 1.1719, -0.4709, -0.1996]]) + + .. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution + """ + ... +@overload +def randn(*size: _int, generator: Optional[Generator], out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + + Returns a tensor filled with random numbers from a normal distribution + with mean `0` and variance `1` (also called the standard normal + distribution). + + .. math:: + \text{out}_{i} \sim \mathcal{N}(0, 1) + + For complex dtypes, the tensor is i.i.d. sampled from a `complex normal distribution`_ with zero mean and + unit variance as + + .. math:: + \text{out}_{i} \sim \mathcal{CN}(0, 1) + + This is equivalent to separately sampling the real :math:`(\operatorname{Re})` and imaginary + :math:`(\operatorname{Im})` part of :math:`\text{out}_i` as + + .. math:: + \operatorname{Re}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}),\quad + \operatorname{Im}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}) + + The shape of the tensor is defined by the variable argument :attr:`size`. + + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. 
Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.randn(4) + tensor([-2.1436, 0.9966, 2.3426, -0.6366]) + >>> torch.randn(2, 3) + tensor([[ 1.5954, 2.8929, -1.0923], + [ 1.1719, -0.4709, -0.1996]]) + + .. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution + """ + ... +@overload +def randn(size: Sequence[Union[_int, SymInt]], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + + Returns a tensor filled with random numbers from a normal distribution + with mean `0` and variance `1` (also called the standard normal + distribution). + + .. math:: + \text{out}_{i} \sim \mathcal{N}(0, 1) + + For complex dtypes, the tensor is i.i.d. sampled from a `complex normal distribution`_ with zero mean and + unit variance as + + .. math:: + \text{out}_{i} \sim \mathcal{CN}(0, 1) + + This is equivalent to separately sampling the real :math:`(\operatorname{Re})` and imaginary + :math:`(\operatorname{Im})` part of :math:`\text{out}_i` as + + .. math:: + \operatorname{Re}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}),\quad + \operatorname{Im}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}) + + The shape of the tensor is defined by the variable argument :attr:`size`. + + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.randn(4) + tensor([-2.1436, 0.9966, 2.3426, -0.6366]) + >>> torch.randn(2, 3) + tensor([[ 1.5954, 2.8929, -1.0923], + [ 1.1719, -0.4709, -0.1996]]) + + .. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution + """ + ... 
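+# A quick numeric check of the complex case described above (illustrative only, not part of
+# the generated stub): for a complex dtype the real and imaginary parts each have variance
+# 1/2, so for a large sample their standard deviations land near sqrt(0.5) ~= 0.7071:
+#
+#     z = torch.randn(100_000, dtype=torch.complex64)
+#     z.real.std(), z.imag.std()     # both approximately 0.7071
+#     (z.abs() ** 2).mean()          # total variance, approximately 1.0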
+@overload +def randn(*size: _int, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + + Returns a tensor filled with random numbers from a normal distribution + with mean `0` and variance `1` (also called the standard normal + distribution). + + .. math:: + \text{out}_{i} \sim \mathcal{N}(0, 1) + + For complex dtypes, the tensor is i.i.d. sampled from a `complex normal distribution`_ with zero mean and + unit variance as + + .. math:: + \text{out}_{i} \sim \mathcal{CN}(0, 1) + + This is equivalent to separately sampling the real :math:`(\operatorname{Re})` and imaginary + :math:`(\operatorname{Im})` part of :math:`\text{out}_i` as + + .. math:: + \operatorname{Re}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}),\quad + \operatorname{Im}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}) + + The shape of the tensor is defined by the variable argument :attr:`size`. + + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.randn(4) + tensor([-2.1436, 0.9966, 2.3426, -0.6366]) + >>> torch.randn(2, 3) + tensor([[ 1.5954, 2.8929, -1.0923], + [ 1.1719, -0.4709, -0.1996]]) + + .. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution + """ + ... +@overload +def randn(size: Sequence[Union[_int, SymInt]], *, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + + Returns a tensor filled with random numbers from a normal distribution + with mean `0` and variance `1` (also called the standard normal + distribution). + + .. math:: + \text{out}_{i} \sim \mathcal{N}(0, 1) + + For complex dtypes, the tensor is i.i.d. sampled from a `complex normal distribution`_ with zero mean and + unit variance as + + .. 
math:: + \text{out}_{i} \sim \mathcal{CN}(0, 1) + + This is equivalent to separately sampling the real :math:`(\operatorname{Re})` and imaginary + :math:`(\operatorname{Im})` part of :math:`\text{out}_i` as + + .. math:: + \operatorname{Re}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}),\quad + \operatorname{Im}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}) + + The shape of the tensor is defined by the variable argument :attr:`size`. + + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.randn(4) + tensor([-2.1436, 0.9966, 2.3426, -0.6366]) + >>> torch.randn(2, 3) + tensor([[ 1.5954, 2.8929, -1.0923], + [ 1.1719, -0.4709, -0.1996]]) + + .. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution + """ + ... +@overload +def randn(*size: _int, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + + Returns a tensor filled with random numbers from a normal distribution + with mean `0` and variance `1` (also called the standard normal + distribution). + + .. math:: + \text{out}_{i} \sim \mathcal{N}(0, 1) + + For complex dtypes, the tensor is i.i.d. sampled from a `complex normal distribution`_ with zero mean and + unit variance as + + .. math:: + \text{out}_{i} \sim \mathcal{CN}(0, 1) + + This is equivalent to separately sampling the real :math:`(\operatorname{Re})` and imaginary + :math:`(\operatorname{Im})` part of :math:`\text{out}_i` as + + .. math:: + \operatorname{Re}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}),\quad + \operatorname{Im}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}) + + The shape of the tensor is defined by the variable argument :attr:`size`. + + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. 
+ Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.randn(4) + tensor([-2.1436, 0.9966, 2.3426, -0.6366]) + >>> torch.randn(2, 3) + tensor([[ 1.5954, 2.8929, -1.0923], + [ 1.1719, -0.4709, -0.1996]]) + + .. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution + """ + ... +def randn_like(input: Tensor, *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randn_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor + + Returns a tensor with the same size as :attr:`input` that is filled with + random numbers from a normal distribution with mean 0 and variance 1. Please refer to :func:`torch.randn` for the + sampling process of complex dtypes. ``torch.randn_like(input)`` is equivalent to + ``torch.randn(input.size(), dtype=input.dtype, layout=input.layout, device=input.device)``. + + Args: + input (Tensor): the size of :attr:`input` will determine size of the output tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor. + Default: if ``None``, defaults to the dtype of :attr:`input`. + layout (:class:`torch.layout`, optional): the desired layout of returned tensor. + Default: if ``None``, defaults to the layout of :attr:`input`. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, defaults to the device of :attr:`input`. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + """ + ... +@overload +def randperm(n: Union[_int, SymInt], *, generator: Optional[Generator], out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randperm(n, *, generator=None, out=None, dtype=torch.int64,layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Returns a random permutation of integers from ``0`` to ``n - 1``. + + Args: + n (int): the upper bound (exclusive) + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: ``torch.int64``. 
+ layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.randperm(4) + tensor([2, 1, 0, 3]) + """ + ... +@overload +def randperm(n: Union[_int, SymInt], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randperm(n, *, generator=None, out=None, dtype=torch.int64,layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Returns a random permutation of integers from ``0`` to ``n - 1``. + + Args: + n (int): the upper bound (exclusive) + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: ``torch.int64``. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.randperm(4) + tensor([2, 1, 0, 3]) + """ + ... +def range(start: Number, end: Number, step: Number = 1, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + range(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a 1-D tensor of size :math:`\left\lfloor \frac{\text{end} - \text{start}}{\text{step}} \right\rfloor + 1` + with values from :attr:`start` to :attr:`end` with step :attr:`step`. Step is + the gap between two values in the tensor. + + .. math:: + \text{out}_{i+1} = \text{out}_i + \text{step}. + + .. warning:: + This function is deprecated and will be removed in a future release because its behavior is inconsistent with + Python's range builtin. Instead, use :func:`torch.arange`, which produces values in [start, end). + + Args: + start (float): the starting value for the set of points. Default: ``0``. + end (float): the ending value for the set of points + step (float): the gap between each pair of adjacent points. Default: ``1``. + + Keyword args: + out (Tensor, optional): the output tensor. 
+ dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). If `dtype` is not given, infer the data type from the other input + arguments. If any of `start`, `end`, or `stop` are floating-point, the + `dtype` is inferred to be the default dtype, see + :meth:`~torch.get_default_dtype`. Otherwise, the `dtype` is inferred to + be `torch.int64`. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.range(1, 4) + tensor([ 1., 2., 3., 4.]) + >>> torch.range(1, 4, 0.5) + tensor([ 1.0000, 1.5000, 2.0000, 2.5000, 3.0000, 3.5000, 4.0000]) + """ + ... +def ravel(input: Tensor) -> Tensor: + r""" + ravel(input) -> Tensor + + Return a contiguous flattened tensor. A copy is made only if needed. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> t = torch.tensor([[[1, 2], + ... [3, 4]], + ... [[5, 6], + ... [7, 8]]]) + >>> torch.ravel(t) + tensor([1, 2, 3, 4, 5, 6, 7, 8]) + """ + ... +def real(input: Tensor) -> Tensor: + r""" + real(input) -> Tensor + + Returns a new tensor containing real values of the :attr:`self` tensor. + The returned tensor and :attr:`self` share the same underlying storage. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> x=torch.randn(4, dtype=torch.cfloat) + >>> x + tensor([(0.3100+0.3553j), (-0.5445-0.7896j), (-1.6492-0.0633j), (-0.0638-0.8119j)]) + >>> x.real + tensor([ 0.3100, -0.5445, -1.6492, -0.0638]) + """ + ... +def reciprocal(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + reciprocal(input, *, out=None) -> Tensor + + Returns a new tensor with the reciprocal of the elements of :attr:`input` + + .. math:: + \text{out}_{i} = \frac{1}{\text{input}_{i}} + + .. note:: + Unlike NumPy's reciprocal, torch.reciprocal supports integral inputs. Integral + inputs to reciprocal are automatically :ref:`promoted ` to + the default scalar type. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-0.4595, -2.1219, -1.4314, 0.7298]) + >>> torch.reciprocal(a) + tensor([-2.1763, -0.4713, -0.6986, 1.3702]) + """ + ... +def reciprocal_(input: Tensor) -> Tensor: ... +def relu(input: Tensor) -> Tensor: ... +def relu_(input: Tensor) -> Tensor: ... +@overload +def remainder(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + remainder(input, other, *, out=None) -> Tensor + + Computes + `Python's modulus operation `_ + entrywise. The result has the same sign as the divisor :attr:`other` and its absolute value + is less than that of :attr:`other`. + + It may also be defined in terms of :func:`torch.div` as + + .. code:: python + + torch.remainder(a, b) == a - a.div(b, rounding_mode="floor") * b + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer and float inputs. + + .. note:: + Complex inputs are not supported. 
In some cases, it is not mathematically + possible to satisfy the definition of a modulo operation with complex numbers. + See :func:`torch.fmod` for how division by zero is handled. + + .. seealso:: + + :func:`torch.fmod` which implements C++'s `std::fmod `_. + This one is defined in terms of division rounding towards zero. + + Args: + input (Tensor or Scalar): the dividend + other (Tensor or Scalar): the divisor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.remainder(torch.tensor([-3., -2, -1, 1, 2, 3]), 2) + tensor([ 1., 0., 1., 1., 0., 1.]) + >>> torch.remainder(torch.tensor([1, 2, 3, 4, 5]), -1.5) + tensor([ -0.5000, -1.0000, 0.0000, -0.5000, -1.0000 ]) + """ + ... +@overload +def remainder(self: Union[Number, _complex], other: Tensor) -> Tensor: + r""" + remainder(input, other, *, out=None) -> Tensor + + Computes + `Python's modulus operation `_ + entrywise. The result has the same sign as the divisor :attr:`other` and its absolute value + is less than that of :attr:`other`. + + It may also be defined in terms of :func:`torch.div` as + + .. code:: python + + torch.remainder(a, b) == a - a.div(b, rounding_mode="floor") * b + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer and float inputs. + + .. note:: + Complex inputs are not supported. In some cases, it is not mathematically + possible to satisfy the definition of a modulo operation with complex numbers. + See :func:`torch.fmod` for how division by zero is handled. + + .. seealso:: + + :func:`torch.fmod` which implements C++'s `std::fmod `_. + This one is defined in terms of division rounding towards zero. + + Args: + input (Tensor or Scalar): the dividend + other (Tensor or Scalar): the divisor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.remainder(torch.tensor([-3., -2, -1, 1, 2, 3]), 2) + tensor([ 1., 0., 1., 1., 0., 1.]) + >>> torch.remainder(torch.tensor([1, 2, 3, 4, 5]), -1.5) + tensor([ -0.5000, -1.0000, 0.0000, -0.5000, -1.0000 ]) + """ + ... +@overload +def remainder(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + remainder(input, other, *, out=None) -> Tensor + + Computes + `Python's modulus operation `_ + entrywise. The result has the same sign as the divisor :attr:`other` and its absolute value + is less than that of :attr:`other`. + + It may also be defined in terms of :func:`torch.div` as + + .. code:: python + + torch.remainder(a, b) == a - a.div(b, rounding_mode="floor") * b + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer and float inputs. + + .. note:: + Complex inputs are not supported. In some cases, it is not mathematically + possible to satisfy the definition of a modulo operation with complex numbers. + See :func:`torch.fmod` for how division by zero is handled. + + .. seealso:: + + :func:`torch.fmod` which implements C++'s `std::fmod `_. + This one is defined in terms of division rounding towards zero. + + Args: + input (Tensor or Scalar): the dividend + other (Tensor or Scalar): the divisor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.remainder(torch.tensor([-3., -2, -1, 1, 2, 3]), 2) + tensor([ 1., 0., 1., 1., 0., 1.]) + >>> torch.remainder(torch.tensor([1, 2, 3, 4, 5]), -1.5) + tensor([ -0.5000, -1.0000, 0.0000, -0.5000, -1.0000 ]) + """ + ... 
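+# Illustrative sketch: torch.remainder follows the sign of the divisor (like
+# Python's ``%``), whereas torch.fmod follows the sign of the dividend (like
+# C++'s ``std::fmod``), as the note above explains.
+#
+#     >>> torch.remainder(torch.tensor([-3.0]), 2)
+#     tensor([1.])
+#     >>> torch.fmod(torch.tensor([-3.0]), 2)
+#     tensor([-1.])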
+def renorm(input: Tensor, p: Union[Number, _complex], dim: _int, maxnorm: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + renorm(input, p, dim, maxnorm, *, out=None) -> Tensor + + Returns a tensor where each sub-tensor of :attr:`input` along dimension + :attr:`dim` is normalized such that the `p`-norm of the sub-tensor is lower + than the value :attr:`maxnorm` + + .. note:: If the norm of a row is lower than `maxnorm`, the row is unchanged + + Args: + input (Tensor): the input tensor. + p (float): the power for the norm computation + dim (int): the dimension to slice over to get the sub-tensors + maxnorm (float): the maximum norm to keep each sub-tensor under + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> x = torch.ones(3, 3) + >>> x[1].fill_(2) + tensor([ 2., 2., 2.]) + >>> x[2].fill_(3) + tensor([ 3., 3., 3.]) + >>> x + tensor([[ 1., 1., 1.], + [ 2., 2., 2.], + [ 3., 3., 3.]]) + >>> torch.renorm(x, 1, 0, 5) + tensor([[ 1.0000, 1.0000, 1.0000], + [ 1.6667, 1.6667, 1.6667], + [ 1.6667, 1.6667, 1.6667]]) + """ + ... +@overload +def repeat_interleave(input: Tensor, repeats: Tensor, dim: Optional[_int] = None, *, output_size: Optional[Union[_int, SymInt]] = None) -> Tensor: + r""" + repeat_interleave(input, repeats, dim=None, *, output_size=None) -> Tensor + + Repeat elements of a tensor. + + .. warning:: + + This is different from :meth:`torch.Tensor.repeat` but similar to ``numpy.repeat``. + + Args: + input (Tensor): the input tensor. + repeats (Tensor or int): The number of repetitions for each element. + repeats is broadcasted to fit the shape of the given axis. + dim (int, optional): The dimension along which to repeat values. + By default, use the flattened input array, and return a flat output + array. + + Keyword args: + output_size (int, optional): Total output size for the given axis + ( e.g. sum of repeats). If given, it will avoid stream synchronization + needed to calculate output shape of the tensor. + + Returns: + Tensor: Repeated tensor which has the same shape as input, except along the given axis. + + Example:: + + >>> x = torch.tensor([1, 2, 3]) + >>> x.repeat_interleave(2) + tensor([1, 1, 2, 2, 3, 3]) + >>> y = torch.tensor([[1, 2], [3, 4]]) + >>> torch.repeat_interleave(y, 2) + tensor([1, 1, 2, 2, 3, 3, 4, 4]) + >>> torch.repeat_interleave(y, 3, dim=1) + tensor([[1, 1, 1, 2, 2, 2], + [3, 3, 3, 4, 4, 4]]) + >>> torch.repeat_interleave(y, torch.tensor([1, 2]), dim=0) + tensor([[1, 2], + [3, 4], + [3, 4]]) + >>> torch.repeat_interleave(y, torch.tensor([1, 2]), dim=0, output_size=3) + tensor([[1, 2], + [3, 4], + [3, 4]]) + + If the `repeats` is `tensor([n1, n2, n3, ...])`, then the output will be + `tensor([0, 0, ..., 1, 1, ..., 2, 2, ..., ...])` where `0` appears `n1` times, + `1` appears `n2` times, `2` appears `n3` times, etc. + + .. function:: repeat_interleave(repeats, *) -> Tensor + :noindex: + + Repeats 0 repeats[0] times, 1 repeats[1] times, 2 repeats[2] times, etc. + + Args: + repeats (Tensor): The number of repetitions for each element. + + Returns: + Tensor: Repeated tensor of size `sum(repeats)`. + + Example:: + + >>> torch.repeat_interleave(torch.tensor([1, 2, 3])) + tensor([0, 1, 1, 2, 2, 2]) + """ + ... +@overload +def repeat_interleave(repeats: Tensor, *, output_size: Optional[Union[_int, SymInt]] = None) -> Tensor: + r""" + repeat_interleave(input, repeats, dim=None, *, output_size=None) -> Tensor + + Repeat elements of a tensor. + + .. 
warning:: + + This is different from :meth:`torch.Tensor.repeat` but similar to ``numpy.repeat``. + + Args: + input (Tensor): the input tensor. + repeats (Tensor or int): The number of repetitions for each element. + repeats is broadcasted to fit the shape of the given axis. + dim (int, optional): The dimension along which to repeat values. + By default, use the flattened input array, and return a flat output + array. + + Keyword args: + output_size (int, optional): Total output size for the given axis + ( e.g. sum of repeats). If given, it will avoid stream synchronization + needed to calculate output shape of the tensor. + + Returns: + Tensor: Repeated tensor which has the same shape as input, except along the given axis. + + Example:: + + >>> x = torch.tensor([1, 2, 3]) + >>> x.repeat_interleave(2) + tensor([1, 1, 2, 2, 3, 3]) + >>> y = torch.tensor([[1, 2], [3, 4]]) + >>> torch.repeat_interleave(y, 2) + tensor([1, 1, 2, 2, 3, 3, 4, 4]) + >>> torch.repeat_interleave(y, 3, dim=1) + tensor([[1, 1, 1, 2, 2, 2], + [3, 3, 3, 4, 4, 4]]) + >>> torch.repeat_interleave(y, torch.tensor([1, 2]), dim=0) + tensor([[1, 2], + [3, 4], + [3, 4]]) + >>> torch.repeat_interleave(y, torch.tensor([1, 2]), dim=0, output_size=3) + tensor([[1, 2], + [3, 4], + [3, 4]]) + + If the `repeats` is `tensor([n1, n2, n3, ...])`, then the output will be + `tensor([0, 0, ..., 1, 1, ..., 2, 2, ..., ...])` where `0` appears `n1` times, + `1` appears `n2` times, `2` appears `n3` times, etc. + + .. function:: repeat_interleave(repeats, *) -> Tensor + :noindex: + + Repeats 0 repeats[0] times, 1 repeats[1] times, 2 repeats[2] times, etc. + + Args: + repeats (Tensor): The number of repetitions for each element. + + Returns: + Tensor: Repeated tensor of size `sum(repeats)`. + + Example:: + + >>> torch.repeat_interleave(torch.tensor([1, 2, 3])) + tensor([0, 1, 1, 2, 2, 2]) + """ + ... +@overload +def repeat_interleave(input: Tensor, repeats: Union[_int, SymInt], dim: Optional[_int] = None, *, output_size: Optional[Union[_int, SymInt]] = None) -> Tensor: + r""" + repeat_interleave(input, repeats, dim=None, *, output_size=None) -> Tensor + + Repeat elements of a tensor. + + .. warning:: + + This is different from :meth:`torch.Tensor.repeat` but similar to ``numpy.repeat``. + + Args: + input (Tensor): the input tensor. + repeats (Tensor or int): The number of repetitions for each element. + repeats is broadcasted to fit the shape of the given axis. + dim (int, optional): The dimension along which to repeat values. + By default, use the flattened input array, and return a flat output + array. + + Keyword args: + output_size (int, optional): Total output size for the given axis + ( e.g. sum of repeats). If given, it will avoid stream synchronization + needed to calculate output shape of the tensor. + + Returns: + Tensor: Repeated tensor which has the same shape as input, except along the given axis. 
+ + Example:: + + >>> x = torch.tensor([1, 2, 3]) + >>> x.repeat_interleave(2) + tensor([1, 1, 2, 2, 3, 3]) + >>> y = torch.tensor([[1, 2], [3, 4]]) + >>> torch.repeat_interleave(y, 2) + tensor([1, 1, 2, 2, 3, 3, 4, 4]) + >>> torch.repeat_interleave(y, 3, dim=1) + tensor([[1, 1, 1, 2, 2, 2], + [3, 3, 3, 4, 4, 4]]) + >>> torch.repeat_interleave(y, torch.tensor([1, 2]), dim=0) + tensor([[1, 2], + [3, 4], + [3, 4]]) + >>> torch.repeat_interleave(y, torch.tensor([1, 2]), dim=0, output_size=3) + tensor([[1, 2], + [3, 4], + [3, 4]]) + + If the `repeats` is `tensor([n1, n2, n3, ...])`, then the output will be + `tensor([0, 0, ..., 1, 1, ..., 2, 2, ..., ...])` where `0` appears `n1` times, + `1` appears `n2` times, `2` appears `n3` times, etc. + + .. function:: repeat_interleave(repeats, *) -> Tensor + :noindex: + + Repeats 0 repeats[0] times, 1 repeats[1] times, 2 repeats[2] times, etc. + + Args: + repeats (Tensor): The number of repetitions for each element. + + Returns: + Tensor: Repeated tensor of size `sum(repeats)`. + + Example:: + + >>> torch.repeat_interleave(torch.tensor([1, 2, 3])) + tensor([0, 1, 1, 2, 2, 2]) + """ + ... +def reshape(input: Tensor, shape: Sequence[Union[_int, SymInt]]) -> Tensor: + r""" + reshape(input, shape) -> Tensor + + Returns a tensor with the same data and number of elements as :attr:`input`, + but with the specified shape. When possible, the returned tensor will be a view + of :attr:`input`. Otherwise, it will be a copy. Contiguous inputs and inputs + with compatible strides can be reshaped without copying, but you should not + depend on the copying vs. viewing behavior. + + See :meth:`torch.Tensor.view` on when it is possible to return a view. + + A single dimension may be -1, in which case it's inferred from the remaining + dimensions and the number of elements in :attr:`input`. + + Args: + input (Tensor): the tensor to be reshaped + shape (tuple of int): the new shape + + Example:: + + >>> a = torch.arange(4.) + >>> torch.reshape(a, (2, 2)) + tensor([[ 0., 1.], + [ 2., 3.]]) + >>> b = torch.tensor([[0, 1], [2, 3]]) + >>> torch.reshape(b, (-1,)) + tensor([ 0, 1, 2, 3]) + """ + ... +def resize_as_(input: Tensor, the_template: Tensor, *, memory_format: Optional[memory_format] = None) -> Tensor: ... +def resize_as_sparse_(input: Tensor, the_template: Tensor) -> Tensor: ... +def resolve_conj(input: Tensor) -> Tensor: + r""" + resolve_conj(input) -> Tensor + + Returns a new tensor with materialized conjugation if :attr:`input`'s conjugate bit is set to `True`, + else returns :attr:`input`. The output tensor will always have its conjugate bit set to `False`. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> x = torch.tensor([-1 + 1j, -2 + 2j, 3 - 3j]) + >>> y = x.conj() + >>> y.is_conj() + True + >>> z = y.resolve_conj() + >>> z + tensor([-1 - 1j, -2 - 2j, 3 + 3j]) + >>> z.is_conj() + False + """ + ... +def resolve_neg(input: Tensor) -> Tensor: + r""" + resolve_neg(input) -> Tensor + + Returns a new tensor with materialized negation if :attr:`input`'s negative bit is set to `True`, + else returns :attr:`input`. The output tensor will always have its negative bit set to `False`. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> x = torch.tensor([-1 + 1j, -2 + 2j, 3 - 3j]) + >>> y = x.conj() + >>> z = y.imag + >>> z.is_neg() + True + >>> out = z.resolve_neg() + >>> out + tensor([-1., -2., 3.]) + >>> out.is_neg() + False + """ + ... 
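+# Illustrative sketch: torch.reshape may return either a view or a copy, and
+# the docstring above advises not to rely on which. For a contiguous input a
+# view is expected, which can be observed by comparing storage pointers.
+#
+#     >>> a = torch.arange(6)
+#     >>> b = torch.reshape(a, (2, -1))        # -1 is inferred as 3
+#     >>> b.shape
+#     torch.Size([2, 3])
+#     >>> b.data_ptr() == a.data_ptr()         # shares storage here
+#     True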
+@overload +def result_type(tensor: Tensor, other: Tensor) -> _dtype: + r""" + result_type(tensor1, tensor2) -> dtype + + Returns the :class:`torch.dtype` that would result from performing an arithmetic + operation on the provided input tensors. See type promotion :ref:`documentation ` + for more information on the type promotion logic. + + Args: + tensor1 (Tensor or Number): an input tensor or number + tensor2 (Tensor or Number): an input tensor or number + + Example:: + + >>> torch.result_type(torch.tensor([1, 2], dtype=torch.int), 1.0) + torch.float32 + >>> torch.result_type(torch.tensor([1, 2], dtype=torch.uint8), torch.tensor(1)) + torch.uint8 + """ + ... +@overload +def result_type(scalar: Union[Number, _complex], tensor: Tensor) -> _dtype: + r""" + result_type(tensor1, tensor2) -> dtype + + Returns the :class:`torch.dtype` that would result from performing an arithmetic + operation on the provided input tensors. See type promotion :ref:`documentation ` + for more information on the type promotion logic. + + Args: + tensor1 (Tensor or Number): an input tensor or number + tensor2 (Tensor or Number): an input tensor or number + + Example:: + + >>> torch.result_type(torch.tensor([1, 2], dtype=torch.int), 1.0) + torch.float32 + >>> torch.result_type(torch.tensor([1, 2], dtype=torch.uint8), torch.tensor(1)) + torch.uint8 + """ + ... +@overload +def result_type(tensor: Tensor, other: Union[Number, _complex]) -> _dtype: + r""" + result_type(tensor1, tensor2) -> dtype + + Returns the :class:`torch.dtype` that would result from performing an arithmetic + operation on the provided input tensors. See type promotion :ref:`documentation ` + for more information on the type promotion logic. + + Args: + tensor1 (Tensor or Number): an input tensor or number + tensor2 (Tensor or Number): an input tensor or number + + Example:: + + >>> torch.result_type(torch.tensor([1, 2], dtype=torch.int), 1.0) + torch.float32 + >>> torch.result_type(torch.tensor([1, 2], dtype=torch.uint8), torch.tensor(1)) + torch.uint8 + """ + ... +@overload +def result_type(scalar1: Union[Number, _complex], scalar2: Union[Number, _complex]) -> _dtype: + r""" + result_type(tensor1, tensor2) -> dtype + + Returns the :class:`torch.dtype` that would result from performing an arithmetic + operation on the provided input tensors. See type promotion :ref:`documentation ` + for more information on the type promotion logic. + + Args: + tensor1 (Tensor or Number): an input tensor or number + tensor2 (Tensor or Number): an input tensor or number + + Example:: + + >>> torch.result_type(torch.tensor([1, 2], dtype=torch.int), 1.0) + torch.float32 + >>> torch.result_type(torch.tensor([1, 2], dtype=torch.uint8), torch.tensor(1)) + torch.uint8 + """ + ... +@overload +def rnn_relu(data: Tensor, batch_sizes: Tensor, hx: Tensor, params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool) -> Tuple[Tensor, Tensor]: ... +@overload +def rnn_relu(input: Tensor, hx: Tensor, params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool, batch_first: _bool) -> Tuple[Tensor, Tensor]: ... +def rnn_relu_cell(input: Tensor, hx: Tensor, w_ih: Tensor, w_hh: Tensor, b_ih: Optional[Tensor] = None, b_hh: Optional[Tensor] = None) -> Tensor: ... 
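+# Illustrative sketch for torch.result_type (declared above): the promoted
+# dtype can be computed up front, for example to pre-allocate an ``out``
+# tensor with the dtype that type promotion would produce.
+#
+#     >>> a = torch.tensor([1, 2], dtype=torch.int32)
+#     >>> b = torch.tensor([1.0, 2.0])
+#     >>> dt = torch.result_type(a, b)
+#     >>> dt
+#     torch.float32
+#     >>> torch.add(a, b, out=torch.empty(2, dtype=dt)).dtype
+#     torch.float32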
+@overload +def rnn_tanh(data: Tensor, batch_sizes: Tensor, hx: Tensor, params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool) -> Tuple[Tensor, Tensor]: ... +@overload +def rnn_tanh(input: Tensor, hx: Tensor, params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool, batch_first: _bool) -> Tuple[Tensor, Tensor]: ... +def rnn_tanh_cell(input: Tensor, hx: Tensor, w_ih: Tensor, w_hh: Tensor, b_ih: Optional[Tensor] = None, b_hh: Optional[Tensor] = None) -> Tensor: ... +def roll(input: Tensor, shifts: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]], dims: Union[_int, _size] = ()) -> Tensor: + r""" + roll(input, shifts, dims=None) -> Tensor + + Roll the tensor :attr:`input` along the given dimension(s). Elements that are + shifted beyond the last position are re-introduced at the first position. If + :attr:`dims` is `None`, the tensor will be flattened before rolling and then + restored to the original shape. + + Args: + input (Tensor): the input tensor. + shifts (int or tuple of ints): The number of places by which the elements + of the tensor are shifted. If shifts is a tuple, dims must be a tuple of + the same size, and each dimension will be rolled by the corresponding + value + dims (int or tuple of ints): Axis along which to roll + + Example:: + + >>> x = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8]).view(4, 2) + >>> x + tensor([[1, 2], + [3, 4], + [5, 6], + [7, 8]]) + >>> torch.roll(x, 1) + tensor([[8, 1], + [2, 3], + [4, 5], + [6, 7]]) + >>> torch.roll(x, 1, 0) + tensor([[7, 8], + [1, 2], + [3, 4], + [5, 6]]) + >>> torch.roll(x, -1, 0) + tensor([[3, 4], + [5, 6], + [7, 8], + [1, 2]]) + >>> torch.roll(x, shifts=(2, 1), dims=(0, 1)) + tensor([[6, 5], + [8, 7], + [2, 1], + [4, 3]]) + """ + ... +def rot90(input: Tensor, k: _int = 1, dims: _size = (0,1)) -> Tensor: + r""" + rot90(input, k=1, dims=[0,1]) -> Tensor + + Rotate an n-D tensor by 90 degrees in the plane specified by dims axis. + Rotation direction is from the first towards the second axis if k > 0, and from the second towards the first for k < 0. + + Args: + input (Tensor): the input tensor. + k (int): number of times to rotate. Default value is 1 + dims (a list or tuple): axis to rotate. Default value is [0, 1] + + Example:: + + >>> x = torch.arange(4).view(2, 2) + >>> x + tensor([[0, 1], + [2, 3]]) + >>> torch.rot90(x, 1, [0, 1]) + tensor([[1, 3], + [0, 2]]) + + >>> x = torch.arange(8).view(2, 2, 2) + >>> x + tensor([[[0, 1], + [2, 3]], + + [[4, 5], + [6, 7]]]) + >>> torch.rot90(x, 1, [1, 2]) + tensor([[[1, 3], + [0, 2]], + + [[5, 7], + [4, 6]]]) + """ + ... +@overload +def round(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + round(input, *, decimals=0, out=None) -> Tensor + + Rounds elements of :attr:`input` to the nearest integer. + + For integer inputs, follows the array-api convention of returning a + copy of the input tensor. + The return type of output is same as that of input's dtype. + + .. note:: + This function implements the "round half to even" to + break ties when a number is equidistant from two + integers (e.g. `round(2.5)` is 2). + + When the :attr:\`decimals\` argument is specified the + algorithm used is similar to NumPy's `around`. This + algorithm is fast but inexact and it can easily + overflow for low precision dtypes. + Eg. `round(tensor([10000], dtype=torch.float16), decimals=3)` is `inf`. + + .. 
seealso:: + :func:`torch.ceil`, which rounds up. + :func:`torch.floor`, which rounds down. + :func:`torch.trunc`, which rounds towards zero. + + Args: + input (Tensor): the input tensor. + decimals (int): Number of decimal places to round to (default: 0). + If decimals is negative, it specifies the number of positions + to the left of the decimal point. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.round(torch.tensor((4.7, -2.3, 9.1, -7.7))) + tensor([ 5., -2., 9., -8.]) + + >>> # Values equidistant from two integers are rounded towards the + >>> # the nearest even value (zero is treated as even) + >>> torch.round(torch.tensor([-0.5, 0.5, 1.5, 2.5])) + tensor([-0., 0., 2., 2.]) + + >>> # A positive decimals argument rounds to the to that decimal place + >>> torch.round(torch.tensor([0.1234567]), decimals=3) + tensor([0.1230]) + + >>> # A negative decimals argument rounds to the left of the decimal + >>> torch.round(torch.tensor([1200.1234567]), decimals=-3) + tensor([1000.]) + """ + ... +@overload +def round(input: Tensor, *, decimals: _int, out: Optional[Tensor] = None) -> Tensor: + r""" + round(input, *, decimals=0, out=None) -> Tensor + + Rounds elements of :attr:`input` to the nearest integer. + + For integer inputs, follows the array-api convention of returning a + copy of the input tensor. + The return type of output is same as that of input's dtype. + + .. note:: + This function implements the "round half to even" to + break ties when a number is equidistant from two + integers (e.g. `round(2.5)` is 2). + + When the :attr:\`decimals\` argument is specified the + algorithm used is similar to NumPy's `around`. This + algorithm is fast but inexact and it can easily + overflow for low precision dtypes. + Eg. `round(tensor([10000], dtype=torch.float16), decimals=3)` is `inf`. + + .. seealso:: + :func:`torch.ceil`, which rounds up. + :func:`torch.floor`, which rounds down. + :func:`torch.trunc`, which rounds towards zero. + + Args: + input (Tensor): the input tensor. + decimals (int): Number of decimal places to round to (default: 0). + If decimals is negative, it specifies the number of positions + to the left of the decimal point. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.round(torch.tensor((4.7, -2.3, 9.1, -7.7))) + tensor([ 5., -2., 9., -8.]) + + >>> # Values equidistant from two integers are rounded towards the + >>> # the nearest even value (zero is treated as even) + >>> torch.round(torch.tensor([-0.5, 0.5, 1.5, 2.5])) + tensor([-0., 0., 2., 2.]) + + >>> # A positive decimals argument rounds to the to that decimal place + >>> torch.round(torch.tensor([0.1234567]), decimals=3) + tensor([0.1230]) + + >>> # A negative decimals argument rounds to the left of the decimal + >>> torch.round(torch.tensor([1200.1234567]), decimals=-3) + tensor([1000.]) + """ + ... +@overload +def round_(input: Tensor) -> Tensor: ... +@overload +def round_(input: Tensor, *, decimals: _int) -> Tensor: ... +def row_indices_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +def row_stack(tensors: Union[Tuple[Tensor, ...], List[Tensor]], *, out: Optional[Tensor] = None) -> Tensor: + r""" + row_stack(tensors, *, out=None) -> Tensor + + Alias of :func:`torch.vstack`. + """ + ... +def rrelu(input: Tensor, lower: Union[Number, _complex] = 0.125, upper: Union[Number, _complex] = 0.3333333333333333, training: _bool = False, generator: Optional[Generator] = None) -> Tensor: ... 
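+# Illustrative sketch of the two torch.round behaviours documented above:
+# ties round to the nearest even value, and a negative ``decimals`` rounds to
+# the left of the decimal point.
+#
+#     >>> torch.round(torch.tensor([0.5, 1.5, 2.5]))
+#     tensor([0., 2., 2.])
+#     >>> torch.round(torch.tensor([1234.5678]), decimals=-2)
+#     tensor([1200.])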
+def rrelu_(input: Tensor, lower: Union[Number, _complex] = 0.125, upper: Union[Number, _complex] = 0.3333333333333333, training: _bool = False, generator: Optional[Generator] = None) -> Tensor: ... +def rsqrt(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + rsqrt(input, *, out=None) -> Tensor + + Returns a new tensor with the reciprocal of the square-root of each of + the elements of :attr:`input`. + + .. math:: + \text{out}_{i} = \frac{1}{\sqrt{\text{input}_{i}}} + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-0.0370, 0.2970, 1.5420, -0.9105]) + >>> torch.rsqrt(a) + tensor([ nan, 1.8351, 0.8053, nan]) + """ + ... +def rsqrt_(input: Tensor) -> Tensor: ... +@overload +def rsub(input: Tensor, other: Tensor, *, alpha: Union[Number, _complex] = 1) -> Tensor: ... +@overload +def rsub(input: Tensor, other: Union[Number, _complex], alpha: Union[Number, _complex] = 1) -> Tensor: ... +def saddmm(input: Tensor, mat1: Tensor, mat2: Tensor, *, beta: Number = 1, alpha: Number = 1, out: Optional[Tensor] = None) -> Tensor: ... +def scalar_tensor(s: Union[Number, _complex], *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ... +@overload +def scatter(input: Tensor, dim: _int, index: Tensor, src: Tensor, *, reduce: str, out: Optional[Tensor] = None) -> Tensor: + r""" + scatter(input, dim, index, src) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_` + """ + ... +@overload +def scatter(input: Tensor, dim: _int, index: Tensor, src: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + scatter(input, dim, index, src) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_` + """ + ... +@overload +def scatter(input: Tensor, dim: _int, index: Tensor, value: Union[Number, _complex], *, reduce: str, out: Optional[Tensor] = None) -> Tensor: + r""" + scatter(input, dim, index, src) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_` + """ + ... +@overload +def scatter(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, src: Tensor) -> Tensor: + r""" + scatter(input, dim, index, src) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_` + """ + ... +@overload +def scatter(input: Tensor, dim: _int, index: Tensor, value: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + scatter(input, dim, index, src) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_` + """ + ... +@overload +def scatter(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, value: Union[Number, _complex]) -> Tensor: + r""" + scatter(input, dim, index, src) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_` + """ + ... +@overload +def scatter_add(input: Tensor, dim: _int, index: Tensor, src: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + scatter_add(input, dim, index, src) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_add_` + """ + ... +@overload +def scatter_add(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, src: Tensor) -> Tensor: + r""" + scatter_add(input, dim, index, src) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_add_` + """ + ... 
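+# Illustrative sketch: the out-of-place scalar-value overload of torch.scatter
+# declared above can build a one-hot encoding without in-place mutation.
+#
+#     >>> idx = torch.tensor([[1], [3], [0]])
+#     >>> torch.scatter(torch.zeros(3, 5), 1, idx, 1.0)
+#     tensor([[0., 1., 0., 0., 0.],
+#             [0., 0., 0., 1., 0.],
+#             [1., 0., 0., 0., 0.]])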
+def scatter_reduce(input: Tensor, dim: _int, index: Tensor, src: Tensor, reduce: str, *, include_self: _bool = True, out: Optional[Tensor] = None) -> Tensor: + r""" + scatter_reduce(input, dim, index, src, reduce, *, include_self=True) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_reduce_` + """ + ... +@overload +def searchsorted(sorted_sequence: Tensor, input: Tensor, *, out_int32: _bool = False, right: _bool = False, side: Optional[str] = None, sorter: Optional[Tensor] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + searchsorted(sorted_sequence, values, *, out_int32=False, right=False, side=None, out=None, sorter=None) -> Tensor + + Find the indices from the *innermost* dimension of :attr:`sorted_sequence` such that, if the + corresponding values in :attr:`values` were inserted before the indices, when sorted, the order + of the corresponding *innermost* dimension within :attr:`sorted_sequence` would be preserved. + Return a new tensor with the same size as :attr:`values`. More formally, + the returned index satisfies the following rules: + + .. list-table:: + :widths: 12 10 78 + :header-rows: 1 + + * - :attr:`sorted_sequence` + - :attr:`right` + - *returned index satisfies* + * - 1-D + - False + - ``sorted_sequence[i-1] < values[m][n]...[l][x] <= sorted_sequence[i]`` + * - 1-D + - True + - ``sorted_sequence[i-1] <= values[m][n]...[l][x] < sorted_sequence[i]`` + * - N-D + - False + - ``sorted_sequence[m][n]...[l][i-1] < values[m][n]...[l][x] <= sorted_sequence[m][n]...[l][i]`` + * - N-D + - True + - ``sorted_sequence[m][n]...[l][i-1] <= values[m][n]...[l][x] < sorted_sequence[m][n]...[l][i]`` + + Args: + sorted_sequence (Tensor): N-D or 1-D tensor, containing monotonically increasing sequence on the *innermost* + dimension unless :attr:`sorter` is provided, in which case the sequence does not + need to be sorted + values (Tensor or Scalar): N-D tensor or a Scalar containing the search value(s). + + Keyword args: + out_int32 (bool, optional): indicate the output data type. torch.int32 if True, torch.int64 otherwise. + Default value is False, i.e. default output data type is torch.int64. + right (bool, optional): if False, return the first suitable location that is found. If True, return the + last such index. If no suitable index found, return 0 for non-numerical value + (eg. nan, inf) or the size of *innermost* dimension within :attr:`sorted_sequence` + (one pass the last index of the *innermost* dimension). In other words, if False, + gets the lower bound index for each value in :attr:`values` on the corresponding + *innermost* dimension of the :attr:`sorted_sequence`. If True, gets the upper + bound index instead. Default value is False. :attr:`side` does the same and is + preferred. It will error if :attr:`side` is set to "left" while this is True. + side (str, optional): the same as :attr:`right` but preferred. "left" corresponds to False for :attr:`right` + and "right" corresponds to True for :attr:`right`. It will error if this is set to + "left" while :attr:`right` is True. Default value is None. + out (Tensor, optional): the output tensor, must be the same size as :attr:`values` if provided. 
+ sorter (LongTensor, optional): if provided, a tensor matching the shape of the unsorted + :attr:`sorted_sequence` containing a sequence of indices that sort it in the + ascending order on the innermost dimension + + + Example:: + + >>> sorted_sequence = torch.tensor([[1, 3, 5, 7, 9], [2, 4, 6, 8, 10]]) + >>> sorted_sequence + tensor([[ 1, 3, 5, 7, 9], + [ 2, 4, 6, 8, 10]]) + >>> values = torch.tensor([[3, 6, 9], [3, 6, 9]]) + >>> values + tensor([[3, 6, 9], + [3, 6, 9]]) + >>> torch.searchsorted(sorted_sequence, values) + tensor([[1, 3, 4], + [1, 2, 4]]) + >>> torch.searchsorted(sorted_sequence, values, side='right') + tensor([[2, 3, 5], + [1, 3, 4]]) + + >>> sorted_sequence_1d = torch.tensor([1, 3, 5, 7, 9]) + >>> sorted_sequence_1d + tensor([1, 3, 5, 7, 9]) + >>> torch.searchsorted(sorted_sequence_1d, values) + tensor([[1, 3, 4], + [1, 3, 4]]) + """ + ... +@overload +def searchsorted(sorted_sequence: Tensor, self: Union[Number, _complex], *, out_int32: _bool = False, right: _bool = False, side: Optional[str] = None, sorter: Optional[Tensor] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + searchsorted(sorted_sequence, values, *, out_int32=False, right=False, side=None, out=None, sorter=None) -> Tensor + + Find the indices from the *innermost* dimension of :attr:`sorted_sequence` such that, if the + corresponding values in :attr:`values` were inserted before the indices, when sorted, the order + of the corresponding *innermost* dimension within :attr:`sorted_sequence` would be preserved. + Return a new tensor with the same size as :attr:`values`. More formally, + the returned index satisfies the following rules: + + .. list-table:: + :widths: 12 10 78 + :header-rows: 1 + + * - :attr:`sorted_sequence` + - :attr:`right` + - *returned index satisfies* + * - 1-D + - False + - ``sorted_sequence[i-1] < values[m][n]...[l][x] <= sorted_sequence[i]`` + * - 1-D + - True + - ``sorted_sequence[i-1] <= values[m][n]...[l][x] < sorted_sequence[i]`` + * - N-D + - False + - ``sorted_sequence[m][n]...[l][i-1] < values[m][n]...[l][x] <= sorted_sequence[m][n]...[l][i]`` + * - N-D + - True + - ``sorted_sequence[m][n]...[l][i-1] <= values[m][n]...[l][x] < sorted_sequence[m][n]...[l][i]`` + + Args: + sorted_sequence (Tensor): N-D or 1-D tensor, containing monotonically increasing sequence on the *innermost* + dimension unless :attr:`sorter` is provided, in which case the sequence does not + need to be sorted + values (Tensor or Scalar): N-D tensor or a Scalar containing the search value(s). + + Keyword args: + out_int32 (bool, optional): indicate the output data type. torch.int32 if True, torch.int64 otherwise. + Default value is False, i.e. default output data type is torch.int64. + right (bool, optional): if False, return the first suitable location that is found. If True, return the + last such index. If no suitable index found, return 0 for non-numerical value + (eg. nan, inf) or the size of *innermost* dimension within :attr:`sorted_sequence` + (one pass the last index of the *innermost* dimension). In other words, if False, + gets the lower bound index for each value in :attr:`values` on the corresponding + *innermost* dimension of the :attr:`sorted_sequence`. If True, gets the upper + bound index instead. Default value is False. :attr:`side` does the same and is + preferred. It will error if :attr:`side` is set to "left" while this is True. + side (str, optional): the same as :attr:`right` but preferred. 
"left" corresponds to False for :attr:`right` + and "right" corresponds to True for :attr:`right`. It will error if this is set to + "left" while :attr:`right` is True. Default value is None. + out (Tensor, optional): the output tensor, must be the same size as :attr:`values` if provided. + sorter (LongTensor, optional): if provided, a tensor matching the shape of the unsorted + :attr:`sorted_sequence` containing a sequence of indices that sort it in the + ascending order on the innermost dimension + + + Example:: + + >>> sorted_sequence = torch.tensor([[1, 3, 5, 7, 9], [2, 4, 6, 8, 10]]) + >>> sorted_sequence + tensor([[ 1, 3, 5, 7, 9], + [ 2, 4, 6, 8, 10]]) + >>> values = torch.tensor([[3, 6, 9], [3, 6, 9]]) + >>> values + tensor([[3, 6, 9], + [3, 6, 9]]) + >>> torch.searchsorted(sorted_sequence, values) + tensor([[1, 3, 4], + [1, 2, 4]]) + >>> torch.searchsorted(sorted_sequence, values, side='right') + tensor([[2, 3, 5], + [1, 3, 4]]) + + >>> sorted_sequence_1d = torch.tensor([1, 3, 5, 7, 9]) + >>> sorted_sequence_1d + tensor([1, 3, 5, 7, 9]) + >>> torch.searchsorted(sorted_sequence_1d, values) + tensor([[1, 3, 4], + [1, 3, 4]]) + """ + ... +def segment_reduce(data: Tensor, reduce: str, *, lengths: Optional[Tensor] = None, indices: Optional[Tensor] = None, offsets: Optional[Tensor] = None, axis: _int = 0, unsafe: _bool = False, initial: Optional[Union[Number, _complex]] = None) -> Tensor: ... +@overload +def select(input: Tensor, dim: _int, index: Union[_int, SymInt]) -> Tensor: + r""" + select(input, dim, index) -> Tensor + + Slices the :attr:`input` tensor along the selected dimension at the given index. + This function returns a view of the original tensor with the given dimension removed. + + .. note:: If :attr:`input` is a sparse tensor and returning a view of + the tensor is not possible, a RuntimeError exception is + raised. In this is the case, consider using + :func:`torch.select_copy` function. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to slice + index (int): the index to select with + + .. note:: + + :meth:`select` is equivalent to slicing. For example, + ``tensor.select(0, index)`` is equivalent to ``tensor[index]`` and + ``tensor.select(2, index)`` is equivalent to ``tensor[:,:,index]``. + """ + ... +@overload +def select(input: Tensor, dim: Union[str, ellipsis, None], index: _int) -> Tensor: + r""" + select(input, dim, index) -> Tensor + + Slices the :attr:`input` tensor along the selected dimension at the given index. + This function returns a view of the original tensor with the given dimension removed. + + .. note:: If :attr:`input` is a sparse tensor and returning a view of + the tensor is not possible, a RuntimeError exception is + raised. In this is the case, consider using + :func:`torch.select_copy` function. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to slice + index (int): the index to select with + + .. note:: + + :meth:`select` is equivalent to slicing. For example, + ``tensor.select(0, index)`` is equivalent to ``tensor[index]`` and + ``tensor.select(2, index)`` is equivalent to ``tensor[:,:,index]``. + """ + ... +def select_copy(input: Tensor, dim: _int, index: Union[_int, SymInt], *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.select`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... 
+def select_scatter(input: Tensor, src: Tensor, dim: _int, index: Union[_int, SymInt]) -> Tensor: + r""" + select_scatter(input, src, dim, index) -> Tensor + + Embeds the values of the :attr:`src` tensor into :attr:`input` at the given index. + This function returns a tensor with fresh storage; it does not create a view. + + + Args: + input (Tensor): the input tensor. + src (Tensor): The tensor to embed into :attr:`input` + dim (int): the dimension to insert the slice into. + index (int): the index to select with + + .. note:: + + :attr:`src` must be of the proper size in order to be embedded + into :attr:`input`. Specifically, it should have the same shape as + ``torch.select(input, dim, index)`` + + Example:: + + >>> a = torch.zeros(2, 2) + >>> b = torch.ones(2) + >>> a.select_scatter(b, 0, 0) + tensor([[1., 1.], + [0., 0.]]) + """ + ... +def selu(input: Tensor) -> Tensor: ... +def selu_(input: Tensor) -> Tensor: ... +def set_flush_denormal(mode: _bool) -> _bool: + r""" + set_flush_denormal(mode) -> bool + + Disables denormal floating numbers on CPU. + + Returns ``True`` if your system supports flushing denormal numbers and it + successfully configures flush denormal mode. :meth:`~torch.set_flush_denormal` + is supported on x86 architectures supporting SSE3 and AArch64 architecture. + + Args: + mode (bool): Controls whether to enable flush denormal mode or not + + Example:: + + >>> torch.set_flush_denormal(True) + True + >>> torch.tensor([1e-323], dtype=torch.float64) + tensor([ 0.], dtype=torch.float64) + >>> torch.set_flush_denormal(False) + True + >>> torch.tensor([1e-323], dtype=torch.float64) + tensor(9.88131e-324 * + [ 1.0000], dtype=torch.float64) + """ + ... +def set_num_interop_threads(num: _int) -> None: + r""" + set_num_interop_threads(int) + + Sets the number of threads used for interop parallelism + (e.g. in JIT interpreter) on CPU. + + .. warning:: + Can only be called once and before any inter-op parallel work + is started (e.g. JIT execution). + """ + ... +def set_num_threads(num: _int) -> None: + r""" + set_num_threads(int) + + Sets the number of threads used for intraop parallelism on CPU. + + .. warning:: + To ensure that the correct number of threads is used, set_num_threads + must be called before running eager, JIT or autograd code. + """ + ... +def sgn(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + sgn(input, *, out=None) -> Tensor + + This function is an extension of torch.sign() to complex tensors. + It computes a new tensor whose elements have + the same angles as the corresponding elements of :attr:`input` and + absolute values (i.e. magnitudes) of one for complex tensors and + is equivalent to torch.sign() for non-complex tensors. + + .. math:: + \text{out}_{i} = \begin{cases} + 0 & |\text{{input}}_i| == 0 \\ + \frac{{\text{{input}}_i}}{|{\text{{input}}_i}|} & \text{otherwise} + \end{cases} + + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.tensor([3+4j, 7-24j, 0, 1+2j]) + >>> t.sgn() + tensor([0.6000+0.8000j, 0.2800-0.9600j, 0.0000+0.0000j, 0.4472+0.8944j]) + """ + ... +def sigmoid(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + sigmoid(input, *, out=None) -> Tensor + + Alias for :func:`torch.special.expit`. + """ + ... +def sigmoid_(input: Tensor) -> Tensor: ... 
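+# Illustrative sketch for the thread-count setters above; the values 4 and 2
+# are arbitrary, and both calls must happen before any parallel work starts
+# (set_num_interop_threads may only be called once per process).
+#
+#     >>> torch.set_num_threads(4)
+#     >>> torch.set_num_interop_threads(2)
+#     >>> torch.get_num_threads()
+#     4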
+def sign(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + sign(input, *, out=None) -> Tensor + + Returns a new tensor with the signs of the elements of :attr:`input`. + + .. math:: + \text{out}_{i} = \operatorname{sgn}(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([0.7, -1.2, 0., 2.3]) + >>> a + tensor([ 0.7000, -1.2000, 0.0000, 2.3000]) + >>> torch.sign(a) + tensor([ 1., -1., 0., 1.]) + """ + ... +def signbit(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + signbit(input, *, out=None) -> Tensor + + Tests if each element of :attr:`input` has its sign bit set or not. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([0.7, -1.2, 0., 2.3]) + >>> torch.signbit(a) + tensor([ False, True, False, False]) + >>> a = torch.tensor([-0.0, 0.0]) + >>> torch.signbit(a) + tensor([ True, False]) + + .. note:: + signbit handles signed zeros, so negative zero (-0) returns True. + """ + ... +def sin(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + sin(input, *, out=None) -> Tensor + + Returns a new tensor with the sine of the elements of :attr:`input`. + + .. math:: + \text{out}_{i} = \sin(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-0.5461, 0.1347, -2.7266, -0.2746]) + >>> torch.sin(a) + tensor([-0.5194, 0.1343, -0.4032, -0.2711]) + """ + ... +def sin_(input: Tensor) -> Tensor: ... +def sinc(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + sinc(input, *, out=None) -> Tensor + + Alias for :func:`torch.special.sinc`. + """ + ... +def sinc_(input: Tensor) -> Tensor: ... +def sinh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + sinh(input, *, out=None) -> Tensor + + Returns a new tensor with the hyperbolic sine of the elements of + :attr:`input`. + + .. math:: + \text{out}_{i} = \sinh(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.5380, -0.8632, -0.1265, 0.9399]) + >>> torch.sinh(a) + tensor([ 0.5644, -0.9744, -0.1268, 1.0845]) + + .. note:: + When :attr:`input` is on the CPU, the implementation of torch.sinh may use + the Sleef library, which rounds very large results to infinity or negative + infinity. See `here `_ for details. + """ + ... +def sinh_(input: Tensor) -> Tensor: ... +def slice_copy(input: Tensor, dim: _int = 0, start: Optional[Union[_int, SymInt]] = None, end: Optional[Union[_int, SymInt]] = None, step: Union[_int, SymInt] = 1, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.slice`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def slice_inverse(input: Tensor, src: Tensor, dim: _int = 0, start: Optional[Union[_int, SymInt]] = None, end: Optional[Union[_int, SymInt]] = None, step: Union[_int, SymInt] = 1) -> Tensor: ... 
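+# Illustrative sketch relating ``slice_copy`` to ordinary Python slicing; ``x``
+# is a hypothetical example tensor, and the result has fresh storage rather
+# than aliasing the input.
+#
+#   >>> x = torch.arange(10)
+#   >>> torch.equal(torch.slice_copy(x, 0, 2, 8, 2), x[2:8:2])
+#   True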
+def slice_scatter(input: Tensor, src: Tensor, dim: _int = 0, start: Optional[Union[_int, SymInt]] = None, end: Optional[Union[_int, SymInt]] = None, step: Union[_int, SymInt] = 1, *, out: Optional[Tensor] = None) -> Tensor: + r""" + slice_scatter(input, src, dim=0, start=None, end=None, step=1) -> Tensor + + Embeds the values of the :attr:`src` tensor into :attr:`input` at the given + dimension. + This function returns a tensor with fresh storage; it does not create a view. + + + Args: + input (Tensor): the input tensor. + src (Tensor): The tensor to embed into :attr:`input` + dim (int): the dimension to insert the slice into + start (Optional[int]): the start index of where to insert the slice + end (Optional[int]): the end index of where to insert the slice + step (int): the how many elements to skip in + + Example:: + + >>> a = torch.zeros(8, 8) + >>> b = torch.ones(2, 8) + >>> a.slice_scatter(b, start=6) + tensor([[0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0.], + [1., 1., 1., 1., 1., 1., 1., 1.], + [1., 1., 1., 1., 1., 1., 1., 1.]]) + + >>> b = torch.ones(8, 2) + >>> a.slice_scatter(b, dim=1, start=2, end=6, step=2) + tensor([[0., 0., 1., 0., 1., 0., 0., 0.], + [0., 0., 1., 0., 1., 0., 0., 0.], + [0., 0., 1., 0., 1., 0., 0., 0.], + [0., 0., 1., 0., 1., 0., 0., 0.], + [0., 0., 1., 0., 1., 0., 0., 0.], + [0., 0., 1., 0., 1., 0., 0., 0.], + [0., 0., 1., 0., 1., 0., 0., 0.], + [0., 0., 1., 0., 1., 0., 0., 0.]]) + """ + ... +def slogdet(input: Tensor, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.slogdet: + r""" + slogdet(input) -> (Tensor, Tensor) + + Alias for :func:`torch.linalg.slogdet` + """ + ... +def smm(input: Tensor, mat2: Tensor) -> Tensor: + r""" + smm(input, mat) -> Tensor + + Performs a matrix multiplication of the sparse matrix :attr:`input` + with the dense matrix :attr:`mat`. + + Args: + input (Tensor): a sparse matrix to be matrix multiplied + mat (Tensor): a dense matrix to be matrix multiplied + """ + ... +@overload +def softmax(input: Tensor, dim: _int, dtype: Optional[_dtype] = None, *, out: Optional[Tensor] = None) -> Tensor: + r""" + softmax(input, dim, *, dtype=None) -> Tensor + + Alias for :func:`torch.nn.functional.softmax`. + """ + ... +@overload +def softmax(input: Tensor, dim: Union[str, ellipsis, None], *, dtype: Optional[_dtype] = None) -> Tensor: + r""" + softmax(input, dim, *, dtype=None) -> Tensor + + Alias for :func:`torch.nn.functional.softmax`. + """ + ... +@overload +def sort(input: Tensor, *, stable: Optional[_bool], dim: _int = -1, descending: _bool = False, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.sort: + r""" + sort(input, dim=-1, descending=False, stable=False, *, out=None) -> (Tensor, LongTensor) + + Sorts the elements of the :attr:`input` tensor along a given dimension + in ascending order by value. + + If :attr:`dim` is not given, the last dimension of the `input` is chosen. + + If :attr:`descending` is ``True`` then the elements are sorted in descending + order by value. + + If :attr:`stable` is ``True`` then the sorting routine becomes stable, preserving + the order of equivalent elements. + + A namedtuple of (values, indices) is returned, where the `values` are the + sorted values and `indices` are the indices of the elements in the original + `input` tensor. 
+ + Args: + input (Tensor): the input tensor. + dim (int, optional): the dimension to sort along + descending (bool, optional): controls the sorting order (ascending or descending) + stable (bool, optional): makes the sorting routine stable, which guarantees that the order + of equivalent elements is preserved. + + Keyword args: + out (tuple, optional): the output tuple of (`Tensor`, `LongTensor`) that can + be optionally given to be used as output buffers + + Example:: + + >>> x = torch.randn(3, 4) + >>> sorted, indices = torch.sort(x) + >>> sorted + tensor([[-0.2162, 0.0608, 0.6719, 2.3332], + [-0.5793, 0.0061, 0.6058, 0.9497], + [-0.5071, 0.3343, 0.9553, 1.0960]]) + >>> indices + tensor([[ 1, 0, 2, 3], + [ 3, 1, 0, 2], + [ 0, 3, 1, 2]]) + + >>> sorted, indices = torch.sort(x, 0) + >>> sorted + tensor([[-0.5071, -0.2162, 0.6719, -0.5793], + [ 0.0608, 0.0061, 0.9497, 0.3343], + [ 0.6058, 0.9553, 1.0960, 2.3332]]) + >>> indices + tensor([[ 2, 0, 0, 1], + [ 0, 1, 1, 2], + [ 1, 2, 2, 0]]) + >>> x = torch.tensor([0, 1] * 9) + >>> x.sort() + torch.return_types.sort( + values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), + indices=tensor([ 2, 16, 4, 6, 14, 8, 0, 10, 12, 9, 17, 15, 13, 11, 7, 5, 3, 1])) + >>> x.sort(stable=True) + torch.return_types.sort( + values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), + indices=tensor([ 0, 2, 4, 6, 8, 10, 12, 14, 16, 1, 3, 5, 7, 9, 11, 13, 15, 17])) + """ + ... +@overload +def sort(input: Tensor, dim: _int = -1, descending: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.sort: + r""" + sort(input, dim=-1, descending=False, stable=False, *, out=None) -> (Tensor, LongTensor) + + Sorts the elements of the :attr:`input` tensor along a given dimension + in ascending order by value. + + If :attr:`dim` is not given, the last dimension of the `input` is chosen. + + If :attr:`descending` is ``True`` then the elements are sorted in descending + order by value. + + If :attr:`stable` is ``True`` then the sorting routine becomes stable, preserving + the order of equivalent elements. + + A namedtuple of (values, indices) is returned, where the `values` are the + sorted values and `indices` are the indices of the elements in the original + `input` tensor. + + Args: + input (Tensor): the input tensor. + dim (int, optional): the dimension to sort along + descending (bool, optional): controls the sorting order (ascending or descending) + stable (bool, optional): makes the sorting routine stable, which guarantees that the order + of equivalent elements is preserved. 
+ + Keyword args: + out (tuple, optional): the output tuple of (`Tensor`, `LongTensor`) that can + be optionally given to be used as output buffers + + Example:: + + >>> x = torch.randn(3, 4) + >>> sorted, indices = torch.sort(x) + >>> sorted + tensor([[-0.2162, 0.0608, 0.6719, 2.3332], + [-0.5793, 0.0061, 0.6058, 0.9497], + [-0.5071, 0.3343, 0.9553, 1.0960]]) + >>> indices + tensor([[ 1, 0, 2, 3], + [ 3, 1, 0, 2], + [ 0, 3, 1, 2]]) + + >>> sorted, indices = torch.sort(x, 0) + >>> sorted + tensor([[-0.5071, -0.2162, 0.6719, -0.5793], + [ 0.0608, 0.0061, 0.9497, 0.3343], + [ 0.6058, 0.9553, 1.0960, 2.3332]]) + >>> indices + tensor([[ 2, 0, 0, 1], + [ 0, 1, 1, 2], + [ 1, 2, 2, 0]]) + >>> x = torch.tensor([0, 1] * 9) + >>> x.sort() + torch.return_types.sort( + values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), + indices=tensor([ 2, 16, 4, 6, 14, 8, 0, 10, 12, 9, 17, 15, 13, 11, 7, 5, 3, 1])) + >>> x.sort(stable=True) + torch.return_types.sort( + values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), + indices=tensor([ 0, 2, 4, 6, 8, 10, 12, 14, 16, 1, 3, 5, 7, 9, 11, 13, 15, 17])) + """ + ... +@overload +def sort(input: Tensor, *, stable: Optional[_bool], dim: Union[str, ellipsis, None], descending: _bool = False, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.sort: + r""" + sort(input, dim=-1, descending=False, stable=False, *, out=None) -> (Tensor, LongTensor) + + Sorts the elements of the :attr:`input` tensor along a given dimension + in ascending order by value. + + If :attr:`dim` is not given, the last dimension of the `input` is chosen. + + If :attr:`descending` is ``True`` then the elements are sorted in descending + order by value. + + If :attr:`stable` is ``True`` then the sorting routine becomes stable, preserving + the order of equivalent elements. + + A namedtuple of (values, indices) is returned, where the `values` are the + sorted values and `indices` are the indices of the elements in the original + `input` tensor. + + Args: + input (Tensor): the input tensor. + dim (int, optional): the dimension to sort along + descending (bool, optional): controls the sorting order (ascending or descending) + stable (bool, optional): makes the sorting routine stable, which guarantees that the order + of equivalent elements is preserved. + + Keyword args: + out (tuple, optional): the output tuple of (`Tensor`, `LongTensor`) that can + be optionally given to be used as output buffers + + Example:: + + >>> x = torch.randn(3, 4) + >>> sorted, indices = torch.sort(x) + >>> sorted + tensor([[-0.2162, 0.0608, 0.6719, 2.3332], + [-0.5793, 0.0061, 0.6058, 0.9497], + [-0.5071, 0.3343, 0.9553, 1.0960]]) + >>> indices + tensor([[ 1, 0, 2, 3], + [ 3, 1, 0, 2], + [ 0, 3, 1, 2]]) + + >>> sorted, indices = torch.sort(x, 0) + >>> sorted + tensor([[-0.5071, -0.2162, 0.6719, -0.5793], + [ 0.0608, 0.0061, 0.9497, 0.3343], + [ 0.6058, 0.9553, 1.0960, 2.3332]]) + >>> indices + tensor([[ 2, 0, 0, 1], + [ 0, 1, 1, 2], + [ 1, 2, 2, 0]]) + >>> x = torch.tensor([0, 1] * 9) + >>> x.sort() + torch.return_types.sort( + values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), + indices=tensor([ 2, 16, 4, 6, 14, 8, 0, 10, 12, 9, 17, 15, 13, 11, 7, 5, 3, 1])) + >>> x.sort(stable=True) + torch.return_types.sort( + values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), + indices=tensor([ 0, 2, 4, 6, 8, 10, 12, 14, 16, 1, 3, 5, 7, 9, 11, 13, 15, 17])) + """ + ... 
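+# Illustrative sketch of the ``out=`` tuple form of :func:`torch.sort` described
+# above; ``vals`` and ``idx`` are hypothetical pre-allocated output buffers.
+#
+#   >>> x = torch.tensor([3., 1., 2.])
+#   >>> vals = torch.empty(3)
+#   >>> idx = torch.empty(3, dtype=torch.long)
+#   >>> _ = torch.sort(x, out=(vals, idx))
+#   >>> vals
+#   tensor([1., 2., 3.])
+#   >>> idx
+#   tensor([1, 2, 0])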
+@overload +def sort(input: Tensor, dim: Union[str, ellipsis, None], descending: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.sort: + r""" + sort(input, dim=-1, descending=False, stable=False, *, out=None) -> (Tensor, LongTensor) + + Sorts the elements of the :attr:`input` tensor along a given dimension + in ascending order by value. + + If :attr:`dim` is not given, the last dimension of the `input` is chosen. + + If :attr:`descending` is ``True`` then the elements are sorted in descending + order by value. + + If :attr:`stable` is ``True`` then the sorting routine becomes stable, preserving + the order of equivalent elements. + + A namedtuple of (values, indices) is returned, where the `values` are the + sorted values and `indices` are the indices of the elements in the original + `input` tensor. + + Args: + input (Tensor): the input tensor. + dim (int, optional): the dimension to sort along + descending (bool, optional): controls the sorting order (ascending or descending) + stable (bool, optional): makes the sorting routine stable, which guarantees that the order + of equivalent elements is preserved. + + Keyword args: + out (tuple, optional): the output tuple of (`Tensor`, `LongTensor`) that can + be optionally given to be used as output buffers + + Example:: + + >>> x = torch.randn(3, 4) + >>> sorted, indices = torch.sort(x) + >>> sorted + tensor([[-0.2162, 0.0608, 0.6719, 2.3332], + [-0.5793, 0.0061, 0.6058, 0.9497], + [-0.5071, 0.3343, 0.9553, 1.0960]]) + >>> indices + tensor([[ 1, 0, 2, 3], + [ 3, 1, 0, 2], + [ 0, 3, 1, 2]]) + + >>> sorted, indices = torch.sort(x, 0) + >>> sorted + tensor([[-0.5071, -0.2162, 0.6719, -0.5793], + [ 0.0608, 0.0061, 0.9497, 0.3343], + [ 0.6058, 0.9553, 1.0960, 2.3332]]) + >>> indices + tensor([[ 2, 0, 0, 1], + [ 0, 1, 1, 2], + [ 1, 2, 2, 0]]) + >>> x = torch.tensor([0, 1] * 9) + >>> x.sort() + torch.return_types.sort( + values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), + indices=tensor([ 2, 16, 4, 6, 14, 8, 0, 10, 12, 9, 17, 15, 13, 11, 7, 5, 3, 1])) + >>> x.sort(stable=True) + torch.return_types.sort( + values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), + indices=tensor([ 0, 2, 4, 6, 8, 10, 12, 14, 16, 1, 3, 5, 7, 9, 11, 13, 15, 17])) + """ + ... +def sparse_bsc_tensor(ccol_indices: Union[Tensor, List], row_indices: Union[Tensor, List], values: Union[Tensor, List], size: Optional[_size] = None, *, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, check_invariants: Optional[_bool] = None) -> Tensor: + r""" + sparse_bsc_tensor(ccol_indices, row_indices, values, size=None, *, dtype=None, device=None, requires_grad=False, check_invariants=None) -> Tensor + + Constructs a :ref:`sparse tensor in BSC (Block Compressed Sparse + Column)) ` with specified 2-dimensional blocks at the + given :attr:`ccol_indices` and :attr:`row_indices`. Sparse matrix + multiplication operations in BSC format are typically faster than that + for sparse tensors in COO format. Make you have a look at :ref:`the + note on the data type of the indices `. + + .. note:: + + If the ``device`` argument is not specified the device of the given + :attr:`values` and indices tensor(s) must match. If, however, the + argument is specified the input Tensors will be converted to the + given device and in turn determine the device of the constructed + sparse tensor. 
+ + Args: + ccol_indices (array_like): (B+1)-dimensional array of size + ``(*batchsize, ncolblocks + 1)``. The last element of each + batch is the number of non-zeros. This tensor encodes the + index in values and row_indices depending on where the given + column starts. Each successive number in the tensor subtracted + by the number before it denotes the number of elements in a + given column. + row_indices (array_like): Row block co-ordinates of each block in + values. (B+1)-dimensional tensor with the same length + as values. + values (array_list): Initial blocks for the tensor. Can be a list, + tuple, NumPy ``ndarray``, and other types that + represents a (1 + 2 + K)-dimensional tensor where ``K`` is the + number of dense dimensions. + size (list, tuple, :class:`torch.Size`, optional): Size of the + sparse tensor: ``(*batchsize, nrows * blocksize[0], ncols * + blocksize[1], *densesize)`` If not provided, the size will be + inferred as the minimum size big enough to hold all non-zero + blocks. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of + returned tensor. Default: if None, infers data type from + :attr:`values`. + device (:class:`torch.device`, optional): the desired device of + returned tensor. Default: if None, uses the current device + for the default tensor type (see + :func:`torch.set_default_device`). :attr:`device` will be + the CPU for CPU tensor types and the current CUDA device for + CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + check_invariants (bool, optional): If sparse tensor invariants are checked. + Default: as returned by :func:`torch.sparse.check_sparse_tensor_invariants.is_enabled`, + initially False. + + Example:: + >>> ccol_indices = [0, 1, 2] + >>> row_indices = [0, 1] + >>> values = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]] + >>> torch.sparse_bsc_tensor(torch.tensor(ccol_indices, dtype=torch.int64), + ... torch.tensor(row_indices, dtype=torch.int64), + ... torch.tensor(values), dtype=torch.double) + tensor(ccol_indices=tensor([0, 1, 2]), + row_indices=tensor([0, 1]), + values=tensor([[[1., 2.], + [3., 4.]], + [[5., 6.], + [7., 8.]]]), size=(2, 2), nnz=2, dtype=torch.float64, + layout=torch.sparse_bsc) + """ + ... +def sparse_bsr_tensor(crow_indices: Union[Tensor, List], col_indices: Union[Tensor, List], values: Union[Tensor, List], size: Optional[_size] = None, *, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, check_invariants: Optional[_bool] = None) -> Tensor: + r""" + sparse_bsr_tensor(crow_indices, col_indices, values, size=None, *, dtype=None, device=None, requires_grad=False, check_invariants=None) -> Tensor + + Constructs a :ref:`sparse tensor in BSR (Block Compressed Sparse Row)) + ` with specified 2-dimensional blocks at the given + :attr:`crow_indices` and :attr:`col_indices`. Sparse matrix + multiplication operations in BSR format are typically faster than that + for sparse tensors in COO format. Make you have a look at :ref:`the + note on the data type of the indices `. + + .. note:: + + If the ``device`` argument is not specified the device of the given + :attr:`values` and indices tensor(s) must match. If, however, the + argument is specified the input Tensors will be converted to the + given device and in turn determine the device of the constructed + sparse tensor. 
+ + Args: + crow_indices (array_like): (B+1)-dimensional array of size + ``(*batchsize, nrowblocks + 1)``. The last element of each + batch is the number of non-zeros. This tensor encodes the + block index in values and col_indices depending on where the + given row block starts. Each successive number in the tensor + subtracted by the number before it denotes the number of + blocks in a given row. + col_indices (array_like): Column block co-ordinates of each block + in values. (B+1)-dimensional tensor with the same length as + values. + values (array_list): Initial values for the tensor. Can be a list, + tuple, NumPy ``ndarray``, scalar, and other types that + represents a (1 + 2 + K)-dimensional tensor where ``K`` is the + number of dense dimensions. + size (list, tuple, :class:`torch.Size`, optional): Size of the + sparse tensor: ``(*batchsize, nrows * blocksize[0], ncols * + blocksize[1], *densesize)`` where ``blocksize == + values.shape[1:3]``. If not provided, the size will be + inferred as the minimum size big enough to hold all non-zero + blocks. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of + returned tensor. Default: if None, infers data type from + :attr:`values`. + device (:class:`torch.device`, optional): the desired device of + returned tensor. Default: if None, uses the current device + for the default tensor type (see + :func:`torch.set_default_device`). :attr:`device` will be + the CPU for CPU tensor types and the current CUDA device for + CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + check_invariants (bool, optional): If sparse tensor invariants are checked. + Default: as returned by :func:`torch.sparse.check_sparse_tensor_invariants.is_enabled`, + initially False. + + Example:: + >>> crow_indices = [0, 1, 2] + >>> col_indices = [0, 1] + >>> values = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]] + >>> torch.sparse_bsr_tensor(torch.tensor(crow_indices, dtype=torch.int64), + ... torch.tensor(col_indices, dtype=torch.int64), + ... torch.tensor(values), dtype=torch.double) + tensor(crow_indices=tensor([0, 1, 2]), + col_indices=tensor([0, 1]), + values=tensor([[[1., 2.], + [3., 4.]], + [[5., 6.], + [7., 8.]]]), size=(2, 2), nnz=2, dtype=torch.float64, + layout=torch.sparse_bsr) + """ + ... +def sparse_compressed_tensor(compressed_indices: Union[Tensor, List], plain_indices: Union[Tensor, List], values: Union[Tensor, List], size: Optional[_size] = None, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, check_invariants: Optional[_bool] = None) -> Tensor: + r""" + sparse_compressed_tensor(compressed_indices, plain_indices, values, size=None, *, dtype=None, layout=None, device=None, requires_grad=False, check_invariants=None) -> Tensor + + Constructs a :ref:`sparse tensor in Compressed Sparse format - CSR, + CSC, BSR, or BSC - ` with specified values at + the given :attr:`compressed_indices` and :attr:`plain_indices`. Sparse + matrix multiplication operations in Compressed Sparse format are + typically faster than that for sparse tensors in COO format. Make you + have a look at :ref:`the note on the data type of the indices + `. + + .. note:: + + If the ``device`` argument is not specified the device of the given + :attr:`values` and indices tensor(s) must match. 
If, however, the + argument is specified the input Tensors will be converted to the + given device and in turn determine the device of the constructed + sparse tensor. + + Args: + compressed_indices (array_like): (B+1)-dimensional array of size + ``(*batchsize, compressed_dim_size + 1)``. The last element of + each batch is the number of non-zero elements or blocks. This + tensor encodes the index in ``values`` and ``plain_indices`` + depending on where the given compressed dimension (row or + column) starts. Each successive number in the tensor + subtracted by the number before it denotes the number of + elements or blocks in a given compressed dimension. + plain_indices (array_like): Plain dimension (column or row) + co-ordinates of each element or block in values. (B+1)-dimensional + tensor with the same length as values. + + values (array_list): Initial values for the tensor. Can be a list, + tuple, NumPy ``ndarray``, scalar, and other types. that + represents a (1+K)-dimensional (for CSR and CSC layouts) or + (1+2+K)-dimensional tensor (for BSR and BSC layouts) where + ``K`` is the number of dense dimensions. + size (list, tuple, :class:`torch.Size`, optional): Size of the + sparse tensor: ``(*batchsize, nrows * blocksize[0], ncols * + blocksize[1], *densesize)`` where ``blocksize[0] == + blocksize[1] == 1`` for CSR and CSC formats. If not provided, + the size will be inferred as the minimum size big enough to + hold all non-zero elements or blocks. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of + returned tensor. Default: if None, infers data type from + :attr:`values`. + layout (:class:`torch.layout`, required): the desired layout of + returned tensor: :attr:`torch.sparse_csr`, + :attr:`torch.sparse_csc`, :attr:`torch.sparse_bsr`, or + :attr:`torch.sparse_bsc`. + device (:class:`torch.device`, optional): the desired device of + returned tensor. Default: if None, uses the current device + for the default tensor type (see + :func:`torch.set_default_device`). :attr:`device` will be + the CPU for CPU tensor types and the current CUDA device for + CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + check_invariants (bool, optional): If sparse tensor invariants are checked. + Default: as returned by :func:`torch.sparse.check_sparse_tensor_invariants.is_enabled`, + initially False. + + Example:: + >>> compressed_indices = [0, 2, 4] + >>> plain_indices = [0, 1, 0, 1] + >>> values = [1, 2, 3, 4] + >>> torch.sparse_compressed_tensor(torch.tensor(compressed_indices, dtype=torch.int64), + ... torch.tensor(plain_indices, dtype=torch.int64), + ... torch.tensor(values), dtype=torch.double, layout=torch.sparse_csr) + tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csr) + """ + ... +def sparse_coo_tensor(indices: Tensor, values: Union[Tensor, List], size: Optional[_size] = None, *, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, check_invariants: Optional[_bool] = None, is_coalesced: Optional[_bool] = None) -> Tensor: + r""" + sparse_coo_tensor(indices, values, size=None, *, dtype=None, device=None, requires_grad=False, check_invariants=None, is_coalesced=None) -> Tensor + + Constructs a :ref:`sparse tensor in COO(rdinate) format + ` with specified values at the given + :attr:`indices`. + + .. 
note:: + + This function returns an :ref:`uncoalesced tensor + ` when :attr:`is_coalesced` is + unspecified or ``None``. + + .. note:: + + If the ``device`` argument is not specified the device of the given + :attr:`values` and indices tensor(s) must match. If, however, the + argument is specified the input Tensors will be converted to the + given device and in turn determine the device of the constructed + sparse tensor. + + Args: + indices (array_like): Initial data for the tensor. Can be a list, tuple, + NumPy ``ndarray``, scalar, and other types. Will be cast to a :class:`torch.LongTensor` + internally. The indices are the coordinates of the non-zero values in the matrix, and thus + should be two-dimensional where the first dimension is the number of tensor dimensions and + the second dimension is the number of non-zero values. + values (array_like): Initial values for the tensor. Can be a list, tuple, + NumPy ``ndarray``, scalar, and other types. + size (list, tuple, or :class:`torch.Size`, optional): Size of the sparse tensor. If not + provided the size will be inferred as the minimum size big enough to hold all non-zero + elements. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if None, infers data type from :attr:`values`. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if None, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + check_invariants (bool, optional): If sparse tensor invariants are checked. + Default: as returned by :func:`torch.sparse.check_sparse_tensor_invariants.is_enabled`, + initially False. + is_coalesced (bool, optional): When``True``, the caller is + responsible for providing tensor indices that correspond to a + coalesced tensor. If the :attr:`check_invariants` flag is + False, no error will be raised if the prerequisites are not + met and this will lead to silently incorrect results. To force + coalescion please use :meth:`coalesce` on the resulting + Tensor. + Default: None: except for trivial cases (e.g. nnz < 2) the + resulting Tensor has is_coalesced set to ``False```. + + Example:: + + >>> i = torch.tensor([[0, 1, 1], + ... [2, 0, 2]]) + >>> v = torch.tensor([3, 4, 5], dtype=torch.float32) + >>> torch.sparse_coo_tensor(i, v, [2, 4]) + tensor(indices=tensor([[0, 1, 1], + [2, 0, 2]]), + values=tensor([3., 4., 5.]), + size=(2, 4), nnz=3, layout=torch.sparse_coo) + + >>> torch.sparse_coo_tensor(i, v) # Shape inference + tensor(indices=tensor([[0, 1, 1], + [2, 0, 2]]), + values=tensor([3., 4., 5.]), + size=(2, 3), nnz=3, layout=torch.sparse_coo) + + >>> torch.sparse_coo_tensor(i, v, [2, 4], + ... dtype=torch.float64, + ... device=torch.device('cuda:0')) + tensor(indices=tensor([[0, 1, 1], + [2, 0, 2]]), + values=tensor([3., 4., 5.]), + device='cuda:0', size=(2, 4), nnz=3, dtype=torch.float64, + layout=torch.sparse_coo) + + # Create an empty sparse tensor with the following invariants: + # 1. sparse_dim + dense_dim = len(SparseTensor.shape) + # 2. SparseTensor._indices().shape = (sparse_dim, nnz) + # 3. 
SparseTensor._values().shape = (nnz, SparseTensor.shape[sparse_dim:]) + # + # For instance, to create an empty sparse tensor with nnz = 0, dense_dim = 0 and + # sparse_dim = 1 (hence indices is a 2D tensor of shape = (1, 0)) + >>> S = torch.sparse_coo_tensor(torch.empty([1, 0]), [], [1]) + tensor(indices=tensor([], size=(1, 0)), + values=tensor([], size=(0,)), + size=(1,), nnz=0, layout=torch.sparse_coo) + + # and to create an empty sparse tensor with nnz = 0, dense_dim = 1 and + # sparse_dim = 1 + >>> S = torch.sparse_coo_tensor(torch.empty([1, 0]), torch.empty([0, 2]), [1, 2]) + tensor(indices=tensor([], size=(1, 0)), + values=tensor([], size=(0, 2)), + size=(1, 2), nnz=0, layout=torch.sparse_coo) + + .. _torch.sparse: https://pytorch.org/docs/stable/sparse.html + """ + ... +def sparse_csc_tensor(ccol_indices: Union[Tensor, List], row_indices: Union[Tensor, List], values: Union[Tensor, List], size: Optional[_size] = None, *, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, check_invariants: Optional[_bool] = None) -> Tensor: + r""" + sparse_csc_tensor(ccol_indices, row_indices, values, size=None, *, dtype=None, device=None, requires_grad=False, check_invariants=None) -> Tensor + + Constructs a :ref:`sparse tensor in CSC (Compressed Sparse Column) + ` with specified values at the given + :attr:`ccol_indices` and :attr:`row_indices`. Sparse matrix + multiplication operations in CSC format are typically faster than that + for sparse tensors in COO format. Make you have a look at :ref:`the + note on the data type of the indices `. + + .. note:: + + If the ``device`` argument is not specified the device of the given + :attr:`values` and indices tensor(s) must match. If, however, the + argument is specified the input Tensors will be converted to the + given device and in turn determine the device of the constructed + sparse tensor. + + Args: + ccol_indices (array_like): (B+1)-dimensional array of size + ``(*batchsize, ncols + 1)``. The last element of each batch + is the number of non-zeros. This tensor encodes the index in + values and row_indices depending on where the given column + starts. Each successive number in the tensor subtracted by the + number before it denotes the number of elements in a given + column. + row_indices (array_like): Row co-ordinates of each element in + values. (B+1)-dimensional tensor with the same length as + values. + values (array_list): Initial values for the tensor. Can be a list, + tuple, NumPy ``ndarray``, scalar, and other types that + represents a (1+K)-dimensional tensor where ``K`` is the number + of dense dimensions. + size (list, tuple, :class:`torch.Size`, optional): Size of the + sparse tensor: ``(*batchsize, nrows, ncols, *densesize)``. If + not provided, the size will be inferred as the minimum size + big enough to hold all non-zero elements. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of + returned tensor. Default: if None, infers data type from + :attr:`values`. + device (:class:`torch.device`, optional): the desired device of + returned tensor. Default: if None, uses the current device + for the default tensor type (see + :func:`torch.set_default_device`). :attr:`device` will be + the CPU for CPU tensor types and the current CUDA device for + CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + check_invariants (bool, optional): If sparse tensor invariants are checked. 
+ Default: as returned by :func:`torch.sparse.check_sparse_tensor_invariants.is_enabled`, + initially False. + + Example:: + >>> ccol_indices = [0, 2, 4] + >>> row_indices = [0, 1, 0, 1] + >>> values = [1, 2, 3, 4] + >>> torch.sparse_csc_tensor(torch.tensor(ccol_indices, dtype=torch.int64), + ... torch.tensor(row_indices, dtype=torch.int64), + ... torch.tensor(values), dtype=torch.double) + tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csc) + """ + ... +def sparse_csr_tensor(crow_indices: Union[Tensor, List], col_indices: Union[Tensor, List], values: Union[Tensor, List], size: Optional[_size] = None, *, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, check_invariants: Optional[_bool] = None) -> Tensor: + r""" + sparse_csr_tensor(crow_indices, col_indices, values, size=None, *, dtype=None, device=None, requires_grad=False, check_invariants=None) -> Tensor + + Constructs a :ref:`sparse tensor in CSR (Compressed Sparse Row) ` with specified + values at the given :attr:`crow_indices` and :attr:`col_indices`. Sparse matrix multiplication operations + in CSR format are typically faster than that for sparse tensors in COO format. Make you have a look + at :ref:`the note on the data type of the indices `. + + .. note:: + + If the ``device`` argument is not specified the device of the given + :attr:`values` and indices tensor(s) must match. If, however, the + argument is specified the input Tensors will be converted to the + given device and in turn determine the device of the constructed + sparse tensor. + + Args: + crow_indices (array_like): (B+1)-dimensional array of size + ``(*batchsize, nrows + 1)``. The last element of each batch + is the number of non-zeros. This tensor encodes the index in + values and col_indices depending on where the given row + starts. Each successive number in the tensor subtracted by the + number before it denotes the number of elements in a given + row. + col_indices (array_like): Column co-ordinates of each element in + values. (B+1)-dimensional tensor with the same length + as values. + values (array_list): Initial values for the tensor. Can be a list, + tuple, NumPy ``ndarray``, scalar, and other types that + represents a (1+K)-dimensional tensor where ``K`` is the number + of dense dimensions. + size (list, tuple, :class:`torch.Size`, optional): Size of the + sparse tensor: ``(*batchsize, nrows, ncols, *densesize)``. If + not provided, the size will be inferred as the minimum size + big enough to hold all non-zero elements. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of + returned tensor. Default: if None, infers data type from + :attr:`values`. + device (:class:`torch.device`, optional): the desired device of + returned tensor. Default: if None, uses the current device + for the default tensor type (see + :func:`torch.set_default_device`). :attr:`device` will be + the CPU for CPU tensor types and the current CUDA device for + CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + check_invariants (bool, optional): If sparse tensor invariants are checked. + Default: as returned by :func:`torch.sparse.check_sparse_tensor_invariants.is_enabled`, + initially False. 
+ + Example:: + >>> crow_indices = [0, 2, 4] + >>> col_indices = [0, 1, 0, 1] + >>> values = [1, 2, 3, 4] + >>> torch.sparse_csr_tensor(torch.tensor(crow_indices, dtype=torch.int64), + ... torch.tensor(col_indices, dtype=torch.int64), + ... torch.tensor(values), dtype=torch.double) + tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csr) + """ + ... +def split_copy(input: Tensor, split_size: Union[_int, SymInt], dim: _int = 0, *, out: Union[Tuple[Tensor, ...], List[Tensor], None] = None) -> None: + r""" + Performs the same operation as :func:`torch.split`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def split_with_sizes(input: Tensor, split_sizes: Sequence[Union[_int, SymInt]], dim: _int = 0) -> Tuple[Tensor, ...]: ... +def split_with_sizes_copy(input: Tensor, split_sizes: Sequence[Union[_int, SymInt]], dim: _int = 0, *, out: Union[Tuple[Tensor, ...], List[Tensor], None] = None) -> None: + r""" + Performs the same operation as :func:`torch.split_with_sizes`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def spmm(input: Tensor, mat2: Tensor) -> Tensor: ... +def sqrt(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + sqrt(input, *, out=None) -> Tensor + + Returns a new tensor with the square-root of the elements of :attr:`input`. + + .. math:: + \text{out}_{i} = \sqrt{\text{input}_{i}} + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-2.0755, 1.0226, 0.0831, 0.4806]) + >>> torch.sqrt(a) + tensor([ nan, 1.0112, 0.2883, 0.6933]) + """ + ... +def sqrt_(input: Tensor) -> Tensor: ... +def square(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + square(input, *, out=None) -> Tensor + + Returns a new tensor with the square of the elements of :attr:`input`. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-2.0755, 1.0226, 0.0831, 0.4806]) + >>> torch.square(a) + tensor([ 4.3077, 1.0457, 0.0069, 0.2310]) + """ + ... +def square_(input: Tensor) -> Tensor: ... +@overload +def squeeze(input: Tensor) -> Tensor: + r""" + squeeze(input, dim=None) -> Tensor + + Returns a tensor with all specified dimensions of :attr:`input` of size `1` removed. + + For example, if `input` is of shape: + :math:`(A \times 1 \times B \times C \times 1 \times D)` then the `input.squeeze()` + will be of shape: :math:`(A \times B \times C \times D)`. + + When :attr:`dim` is given, a squeeze operation is done only in the given + dimension(s). If `input` is of shape: :math:`(A \times 1 \times B)`, + ``squeeze(input, 0)`` leaves the tensor unchanged, but ``squeeze(input, 1)`` + will squeeze the tensor to the shape :math:`(A \times B)`. + + .. note:: The returned tensor shares the storage with the input tensor, + so changing the contents of one will change the contents of the other. + + .. warning:: If the tensor has a batch dimension of size 1, then `squeeze(input)` + will also remove the batch dimension, which can lead to unexpected + errors. Consider specifying only the dims you wish to be squeezed. + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints, optional): if given, the input will be squeezed + only in the specified dimensions. 
+ + .. versionchanged:: 2.0 + :attr:`dim` now accepts tuples of dimensions. + + Example:: + + >>> x = torch.zeros(2, 1, 2, 1, 2) + >>> x.size() + torch.Size([2, 1, 2, 1, 2]) + >>> y = torch.squeeze(x) + >>> y.size() + torch.Size([2, 2, 2]) + >>> y = torch.squeeze(x, 0) + >>> y.size() + torch.Size([2, 1, 2, 1, 2]) + >>> y = torch.squeeze(x, 1) + >>> y.size() + torch.Size([2, 2, 1, 2]) + >>> y = torch.squeeze(x, (1, 2, 3)) + torch.Size([2, 2, 2]) + """ + ... +@overload +def squeeze(input: Tensor, dim: _int) -> Tensor: + r""" + squeeze(input, dim=None) -> Tensor + + Returns a tensor with all specified dimensions of :attr:`input` of size `1` removed. + + For example, if `input` is of shape: + :math:`(A \times 1 \times B \times C \times 1 \times D)` then the `input.squeeze()` + will be of shape: :math:`(A \times B \times C \times D)`. + + When :attr:`dim` is given, a squeeze operation is done only in the given + dimension(s). If `input` is of shape: :math:`(A \times 1 \times B)`, + ``squeeze(input, 0)`` leaves the tensor unchanged, but ``squeeze(input, 1)`` + will squeeze the tensor to the shape :math:`(A \times B)`. + + .. note:: The returned tensor shares the storage with the input tensor, + so changing the contents of one will change the contents of the other. + + .. warning:: If the tensor has a batch dimension of size 1, then `squeeze(input)` + will also remove the batch dimension, which can lead to unexpected + errors. Consider specifying only the dims you wish to be squeezed. + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints, optional): if given, the input will be squeezed + only in the specified dimensions. + + .. versionchanged:: 2.0 + :attr:`dim` now accepts tuples of dimensions. + + Example:: + + >>> x = torch.zeros(2, 1, 2, 1, 2) + >>> x.size() + torch.Size([2, 1, 2, 1, 2]) + >>> y = torch.squeeze(x) + >>> y.size() + torch.Size([2, 2, 2]) + >>> y = torch.squeeze(x, 0) + >>> y.size() + torch.Size([2, 1, 2, 1, 2]) + >>> y = torch.squeeze(x, 1) + >>> y.size() + torch.Size([2, 2, 1, 2]) + >>> y = torch.squeeze(x, (1, 2, 3)) + torch.Size([2, 2, 2]) + """ + ... +@overload +def squeeze(input: Tensor, dim: _size) -> Tensor: + r""" + squeeze(input, dim=None) -> Tensor + + Returns a tensor with all specified dimensions of :attr:`input` of size `1` removed. + + For example, if `input` is of shape: + :math:`(A \times 1 \times B \times C \times 1 \times D)` then the `input.squeeze()` + will be of shape: :math:`(A \times B \times C \times D)`. + + When :attr:`dim` is given, a squeeze operation is done only in the given + dimension(s). If `input` is of shape: :math:`(A \times 1 \times B)`, + ``squeeze(input, 0)`` leaves the tensor unchanged, but ``squeeze(input, 1)`` + will squeeze the tensor to the shape :math:`(A \times B)`. + + .. note:: The returned tensor shares the storage with the input tensor, + so changing the contents of one will change the contents of the other. + + .. warning:: If the tensor has a batch dimension of size 1, then `squeeze(input)` + will also remove the batch dimension, which can lead to unexpected + errors. Consider specifying only the dims you wish to be squeezed. + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints, optional): if given, the input will be squeezed + only in the specified dimensions. + + .. versionchanged:: 2.0 + :attr:`dim` now accepts tuples of dimensions. 
+ + Example:: + + >>> x = torch.zeros(2, 1, 2, 1, 2) + >>> x.size() + torch.Size([2, 1, 2, 1, 2]) + >>> y = torch.squeeze(x) + >>> y.size() + torch.Size([2, 2, 2]) + >>> y = torch.squeeze(x, 0) + >>> y.size() + torch.Size([2, 1, 2, 1, 2]) + >>> y = torch.squeeze(x, 1) + >>> y.size() + torch.Size([2, 2, 1, 2]) + >>> y = torch.squeeze(x, (1, 2, 3)) + torch.Size([2, 2, 2]) + """ + ... +@overload +def squeeze(input: Tensor, dim: Union[str, ellipsis, None]) -> Tensor: + r""" + squeeze(input, dim=None) -> Tensor + + Returns a tensor with all specified dimensions of :attr:`input` of size `1` removed. + + For example, if `input` is of shape: + :math:`(A \times 1 \times B \times C \times 1 \times D)` then the `input.squeeze()` + will be of shape: :math:`(A \times B \times C \times D)`. + + When :attr:`dim` is given, a squeeze operation is done only in the given + dimension(s). If `input` is of shape: :math:`(A \times 1 \times B)`, + ``squeeze(input, 0)`` leaves the tensor unchanged, but ``squeeze(input, 1)`` + will squeeze the tensor to the shape :math:`(A \times B)`. + + .. note:: The returned tensor shares the storage with the input tensor, + so changing the contents of one will change the contents of the other. + + .. warning:: If the tensor has a batch dimension of size 1, then `squeeze(input)` + will also remove the batch dimension, which can lead to unexpected + errors. Consider specifying only the dims you wish to be squeezed. + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints, optional): if given, the input will be squeezed + only in the specified dimensions. + + .. versionchanged:: 2.0 + :attr:`dim` now accepts tuples of dimensions. + + Example:: + + >>> x = torch.zeros(2, 1, 2, 1, 2) + >>> x.size() + torch.Size([2, 1, 2, 1, 2]) + >>> y = torch.squeeze(x) + >>> y.size() + torch.Size([2, 2, 2]) + >>> y = torch.squeeze(x, 0) + >>> y.size() + torch.Size([2, 1, 2, 1, 2]) + >>> y = torch.squeeze(x, 1) + >>> y.size() + torch.Size([2, 2, 1, 2]) + >>> y = torch.squeeze(x, (1, 2, 3)) + torch.Size([2, 2, 2]) + """ + ... +@overload +def squeeze_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.squeeze`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +@overload +def squeeze_copy(input: Tensor, dim: _int, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.squeeze`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +@overload +def squeeze_copy(input: Tensor, dim: _size, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.squeeze`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +@overload +def sspaddmm(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], mat1: Tensor, mat2: Tensor) -> Tensor: + r""" + sspaddmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor + + Matrix multiplies a sparse tensor :attr:`mat1` with a dense tensor + :attr:`mat2`, then adds the sparse tensor :attr:`input` to the result. + + Note: This function is equivalent to :func:`torch.addmm`, except + :attr:`input` and :attr:`mat1` are sparse. 
+ + Args: + input (Tensor): a sparse matrix to be added + mat1 (Tensor): a sparse matrix to be matrix multiplied + mat2 (Tensor): a dense matrix to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`mat` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + """ + ... +@overload +def sspaddmm(input: Tensor, mat1: Tensor, mat2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + sspaddmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor + + Matrix multiplies a sparse tensor :attr:`mat1` with a dense tensor + :attr:`mat2`, then adds the sparse tensor :attr:`input` to the result. + + Note: This function is equivalent to :func:`torch.addmm`, except + :attr:`input` and :attr:`mat1` are sparse. + + Args: + input (Tensor): a sparse matrix to be added + mat1 (Tensor): a sparse matrix to be matrix multiplied + mat2 (Tensor): a dense matrix to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`mat` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + """ + ... +@overload +def sspaddmm(beta: Union[Number, _complex], self: Tensor, mat1: Tensor, mat2: Tensor) -> Tensor: + r""" + sspaddmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor + + Matrix multiplies a sparse tensor :attr:`mat1` with a dense tensor + :attr:`mat2`, then adds the sparse tensor :attr:`input` to the result. + + Note: This function is equivalent to :func:`torch.addmm`, except + :attr:`input` and :attr:`mat1` are sparse. + + Args: + input (Tensor): a sparse matrix to be added + mat1 (Tensor): a sparse matrix to be matrix multiplied + mat2 (Tensor): a dense matrix to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`mat` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + """ + ... +def stack(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: _int = 0, *, out: Optional[Tensor] = None) -> Tensor: + r""" + stack(tensors, dim=0, *, out=None) -> Tensor + + Concatenates a sequence of tensors along a new dimension. + + All tensors need to be of the same size. + + .. seealso:: + + :func:`torch.cat` concatenates the given sequence along an existing dimension. + + Arguments: + tensors (sequence of Tensors): sequence of tensors to concatenate + dim (int, optional): dimension to insert. Has to be between 0 and the number + of dimensions of concatenated tensors (inclusive). Default: 0 + + Keyword args: + out (Tensor, optional): the output tensor. 
+ + Example:: + + >>> x = torch.randn(2, 3) + >>> x + tensor([[ 0.3367, 0.1288, 0.2345], + [ 0.2303, -1.1229, -0.1863]]) + >>> x = torch.stack((x, x)) # same as torch.stack((x, x), dim=0) + >>> x + tensor([[[ 0.3367, 0.1288, 0.2345], + [ 0.2303, -1.1229, -0.1863]], + + [[ 0.3367, 0.1288, 0.2345], + [ 0.2303, -1.1229, -0.1863]]]) + >>> x.size() + torch.Size([2, 2, 3]) + >>> x = torch.stack((x, x), dim=1) + tensor([[[ 0.3367, 0.1288, 0.2345], + [ 0.3367, 0.1288, 0.2345]], + + [[ 0.2303, -1.1229, -0.1863], + [ 0.2303, -1.1229, -0.1863]]]) + >>> x = torch.stack((x, x), dim=2) + tensor([[[ 0.3367, 0.3367], + [ 0.1288, 0.1288], + [ 0.2345, 0.2345]], + + [[ 0.2303, 0.2303], + [-1.1229, -1.1229], + [-0.1863, -0.1863]]]) + >>> x = torch.stack((x, x), dim=-1) + tensor([[[ 0.3367, 0.3367], + [ 0.1288, 0.1288], + [ 0.2345, 0.2345]], + + [[ 0.2303, 0.2303], + [-1.1229, -1.1229], + [-0.1863, -0.1863]]]) + """ + ... +@overload +def std(input: Tensor, dim: Optional[Union[_int, _size]], unbiased: _bool = True, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + std(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor + + Calculates the standard deviation over the dimensions specified by :attr:`dim`. + :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to + reduce over all dimensions. + + The standard deviation (:math:`\sigma`) is calculated as + + .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2} + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.std(a, dim=1, keepdim=True) + tensor([[1.0311], + [0.7477], + [1.2204], + [0.9087]]) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def std(input: Tensor, dim: Optional[Union[_int, _size]] = None, *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + std(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor + + Calculates the standard deviation over the dimensions specified by :attr:`dim`. + :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to + reduce over all dimensions. + + The standard deviation (:math:`\sigma`) is calculated as + + .. 
math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2} + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.std(a, dim=1, keepdim=True) + tensor([[1.0311], + [0.7477], + [1.2204], + [0.9087]]) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def std(input: Tensor, unbiased: _bool = True) -> Tensor: + r""" + std(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor + + Calculates the standard deviation over the dimensions specified by :attr:`dim`. + :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to + reduce over all dimensions. + + The standard deviation (:math:`\sigma`) is calculated as + + .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2} + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.std(a, dim=1, keepdim=True) + tensor([[1.0311], + [0.7477], + [1.2204], + [0.9087]]) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... 
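+# Illustrative sketch of the ``correction`` keyword described in the
+# :func:`torch.std` docstrings above: ``correction=0`` gives the population
+# (biased) standard deviation, while the default ``correction=1`` applies
+# Bessel's correction. The input values are arbitrary.
+#
+#   >>> a = torch.tensor([1.0, 2.0, 3.0, 4.0])
+#   >>> torch.std(a, correction=1)
+#   tensor(1.2910)
+#   >>> torch.std(a, correction=0)
+#   tensor(1.1180)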
+@overload +def std(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + std(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor + + Calculates the standard deviation over the dimensions specified by :attr:`dim`. + :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to + reduce over all dimensions. + + The standard deviation (:math:`\sigma`) is calculated as + + .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2} + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.std(a, dim=1, keepdim=True) + tensor([[1.0311], + [0.7477], + [1.2204], + [0.9087]]) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def std(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], unbiased: _bool = True, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + std(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor + + Calculates the standard deviation over the dimensions specified by :attr:`dim`. + :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to + reduce over all dimensions. + + The standard deviation (:math:`\sigma`) is calculated as + + .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2} + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. 
versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.std(a, dim=1, keepdim=True) + tensor([[1.0311], + [0.7477], + [1.2204], + [0.9087]]) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def std_mean(input: Tensor, dim: Optional[Union[_int, _size]], unbiased: _bool = True, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: + r""" + std_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor) + + Calculates the standard deviation and mean over the dimensions specified by + :attr:`dim`. :attr:`dim` can be a single dimension, list of dimensions, or + ``None`` to reduce over all dimensions. + + The standard deviation (:math:`\sigma`) is calculated as + + .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2} + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Returns: + A tuple (std, mean) containing the standard deviation and mean. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.std_mean(a, dim=0, keepdim=True) + (tensor([[1.2620, 1.0028, 1.0957, 0.6038]]), + tensor([[ 0.0645, 0.4485, 0.8707, -0.0665]])) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def std_mean(input: Tensor, dim: Optional[Union[_int, _size]] = None, *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: + r""" + std_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor) + + Calculates the standard deviation and mean over the dimensions specified by + :attr:`dim`. :attr:`dim` can be a single dimension, list of dimensions, or + ``None`` to reduce over all dimensions. + + The standard deviation (:math:`\sigma`) is calculated as + + .. 
math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2} + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Returns: + A tuple (std, mean) containing the standard deviation and mean. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.std_mean(a, dim=0, keepdim=True) + (tensor([[1.2620, 1.0028, 1.0957, 0.6038]]), + tensor([[ 0.0645, 0.4485, 0.8707, -0.0665]])) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def std_mean(input: Tensor, unbiased: _bool = True) -> Tuple[Tensor, Tensor]: + r""" + std_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor) + + Calculates the standard deviation and mean over the dimensions specified by + :attr:`dim`. :attr:`dim` can be a single dimension, list of dimensions, or + ``None`` to reduce over all dimensions. + + The standard deviation (:math:`\sigma`) is calculated as + + .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2} + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Returns: + A tuple (std, mean) containing the standard deviation and mean. + + Example: + + >>> a = torch.tensor( + ... 
[[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.std_mean(a, dim=0, keepdim=True) + (tensor([[1.2620, 1.0028, 1.0957, 0.6038]]), + tensor([[ 0.0645, 0.4485, 0.8707, -0.0665]])) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def std_mean(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: + r""" + std_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor) + + Calculates the standard deviation and mean over the dimensions specified by + :attr:`dim`. :attr:`dim` can be a single dimension, list of dimensions, or + ``None`` to reduce over all dimensions. + + The standard deviation (:math:`\sigma`) is calculated as + + .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2} + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Returns: + A tuple (std, mean) containing the standard deviation and mean. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.std_mean(a, dim=0, keepdim=True) + (tensor([[1.2620, 1.0028, 1.0957, 0.6038]]), + tensor([[ 0.0645, 0.4485, 0.8707, -0.0665]])) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def std_mean(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], unbiased: _bool = True, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: + r""" + std_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor) + + Calculates the standard deviation and mean over the dimensions specified by + :attr:`dim`. :attr:`dim` can be a single dimension, list of dimensions, or + ``None`` to reduce over all dimensions. + + The standard deviation (:math:`\sigma`) is calculated as + + .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2} + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. 
+ + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Returns: + A tuple (std, mean) containing the standard deviation and mean. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.std_mean(a, dim=0, keepdim=True) + (tensor([[1.2620, 1.0028, 1.0957, 0.6038]]), + tensor([[ 0.0645, 0.4485, 0.8707, -0.0665]])) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def sub(input: Union[Tensor, Number, _complex], other: Union[Tensor, Number, _complex], *, alpha: Optional[Union[Number, _complex]] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + sub(input, other, *, alpha=1, out=None) -> Tensor + + Subtracts :attr:`other`, scaled by :attr:`alpha`, from :attr:`input`. + + .. math:: + \text{{out}}_i = \text{{input}}_i - \text{{alpha}} \times \text{{other}}_i + + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer, float, and complex inputs. + + Args: + input (Tensor): the input tensor. + other (Tensor or Number): the tensor or number to subtract from :attr:`input`. + + Keyword args: + alpha (Number): the multiplier for :attr:`other`. + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor((1, 2)) + >>> b = torch.tensor((0, 1)) + >>> torch.sub(a, b, alpha=2) + tensor([1, 0]) + """ + ... +@overload +def sub(self: Tensor, alpha: Union[Number, _complex], other: Tensor) -> Tensor: + r""" + sub(input, other, *, alpha=1, out=None) -> Tensor + + Subtracts :attr:`other`, scaled by :attr:`alpha`, from :attr:`input`. + + .. math:: + \text{{out}}_i = \text{{input}}_i - \text{{alpha}} \times \text{{other}}_i + + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer, float, and complex inputs. + + Args: + input (Tensor): the input tensor. + other (Tensor or Number): the tensor or number to subtract from :attr:`input`. + + Keyword args: + alpha (Number): the multiplier for :attr:`other`. + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor((1, 2)) + >>> b = torch.tensor((0, 1)) + >>> torch.sub(a, b, alpha=2) + tensor([1, 0]) + """ + ... +@overload +def sub(self: Tensor, alpha: Union[Number, _complex], other: Tensor, *, out: Tensor) -> Tensor: + r""" + sub(input, other, *, alpha=1, out=None) -> Tensor + + Subtracts :attr:`other`, scaled by :attr:`alpha`, from :attr:`input`. + + .. 
math:: + \text{{out}}_i = \text{{input}}_i - \text{{alpha}} \times \text{{other}}_i + + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer, float, and complex inputs. + + Args: + input (Tensor): the input tensor. + other (Tensor or Number): the tensor or number to subtract from :attr:`input`. + + Keyword args: + alpha (Number): the multiplier for :attr:`other`. + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor((1, 2)) + >>> b = torch.tensor((0, 1)) + >>> torch.sub(a, b, alpha=2) + tensor([1, 0]) + """ + ... +@overload +def subtract(input: Tensor, other: Tensor, *, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + subtract(input, other, *, alpha=1, out=None) -> Tensor + + Alias for :func:`torch.sub`. + """ + ... +@overload +def subtract(input: Tensor, other: Union[Number, _complex], alpha: Union[Number, _complex] = 1) -> Tensor: + r""" + subtract(input, other, *, alpha=1, out=None) -> Tensor + + Alias for :func:`torch.sub`. + """ + ... +@overload +def sum(input: Tensor, *, dtype: Optional[_dtype] = None) -> Tensor: + r""" + sum(input, *, dtype=None) -> Tensor + + Returns the sum of all elements in the :attr:`input` tensor. + + Args: + input (Tensor): the input tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.1133, -0.9567, 0.2958]]) + >>> torch.sum(a) + tensor(-0.5475) + + .. function:: sum(input, dim, keepdim=False, *, dtype=None) -> Tensor + :noindex: + + Returns the sum of each row of the :attr:`input` tensor in the given + dimension :attr:`dim`. If :attr:`dim` is a list of dimensions, + reduce over all of them. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.0569, -0.2475, 0.0737, -0.3429], + [-0.2993, 0.9138, 0.9337, -1.6864], + [ 0.1132, 0.7892, -0.1003, 0.5688], + [ 0.3637, -0.9906, -0.4752, -1.5197]]) + >>> torch.sum(a, 1) + tensor([-0.4598, -0.1381, 1.3708, -2.6217]) + >>> b = torch.arange(4 * 5 * 6).view(4, 5, 6) + >>> torch.sum(b, (2, 1)) + tensor([ 435., 1335., 2235., 3135.]) + """ + ... +@overload +def sum(input: Tensor, dim: Optional[Union[_int, _size]], keepdim: _bool = False, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + sum(input, *, dtype=None) -> Tensor + + Returns the sum of all elements in the :attr:`input` tensor. + + Args: + input (Tensor): the input tensor. 
+ + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.1133, -0.9567, 0.2958]]) + >>> torch.sum(a) + tensor(-0.5475) + + .. function:: sum(input, dim, keepdim=False, *, dtype=None) -> Tensor + :noindex: + + Returns the sum of each row of the :attr:`input` tensor in the given + dimension :attr:`dim`. If :attr:`dim` is a list of dimensions, + reduce over all of them. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.0569, -0.2475, 0.0737, -0.3429], + [-0.2993, 0.9138, 0.9337, -1.6864], + [ 0.1132, 0.7892, -0.1003, 0.5688], + [ 0.3637, -0.9906, -0.4752, -1.5197]]) + >>> torch.sum(a, 1) + tensor([-0.4598, -0.1381, 1.3708, -2.6217]) + >>> b = torch.arange(4 * 5 * 6).view(4, 5, 6) + >>> torch.sum(b, (2, 1)) + tensor([ 435., 1335., 2235., 3135.]) + """ + ... +@overload +def sum(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], keepdim: _bool = False, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + sum(input, *, dtype=None) -> Tensor + + Returns the sum of all elements in the :attr:`input` tensor. + + Args: + input (Tensor): the input tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.1133, -0.9567, 0.2958]]) + >>> torch.sum(a) + tensor(-0.5475) + + .. function:: sum(input, dim, keepdim=False, *, dtype=None) -> Tensor + :noindex: + + Returns the sum of each row of the :attr:`input` tensor in the given + dimension :attr:`dim`. If :attr:`dim` is a list of dimensions, + reduce over all of them. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. 
+ If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.0569, -0.2475, 0.0737, -0.3429], + [-0.2993, 0.9138, 0.9337, -1.6864], + [ 0.1132, 0.7892, -0.1003, 0.5688], + [ 0.3637, -0.9906, -0.4752, -1.5197]]) + >>> torch.sum(a, 1) + tensor([-0.4598, -0.1381, 1.3708, -2.6217]) + >>> b = torch.arange(4 * 5 * 6).view(4, 5, 6) + >>> torch.sum(b, (2, 1)) + tensor([ 435., 1335., 2235., 3135.]) + """ + ... +def svd(input: Tensor, some: _bool = True, compute_uv: _bool = True, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.svd: + r""" + svd(input, some=True, compute_uv=True, *, out=None) -> (Tensor, Tensor, Tensor) + + Computes the singular value decomposition of either a matrix or batch of + matrices :attr:`input`. The singular value decomposition is represented as a + namedtuple `(U, S, V)`, such that :attr:`input` :math:`= U \text{diag}(S) V^{\text{H}}`. + where :math:`V^{\text{H}}` is the transpose of `V` for real inputs, + and the conjugate transpose of `V` for complex inputs. + If :attr:`input` is a batch of matrices, then `U`, `S`, and `V` are also + batched with the same batch dimensions as :attr:`input`. + + If :attr:`some` is `True` (default), the method returns the reduced singular + value decomposition. In this case, if the last two dimensions of :attr:`input` are + `m` and `n`, then the returned `U` and `V` matrices will contain only + `min(n, m)` orthonormal columns. + + If :attr:`compute_uv` is `False`, the returned `U` and `V` will be + zero-filled matrices of shape `(m, m)` and `(n, n)` + respectively, and the same device as :attr:`input`. The argument :attr:`some` + has no effect when :attr:`compute_uv` is `False`. + + Supports :attr:`input` of float, double, cfloat and cdouble data types. + The dtypes of `U` and `V` are the same as :attr:`input`'s. `S` will + always be real-valued, even if :attr:`input` is complex. + + .. warning:: + + :func:`torch.svd` is deprecated in favor of :func:`torch.linalg.svd` + and will be removed in a future PyTorch release. + + ``U, S, V = torch.svd(A, some=some, compute_uv=True)`` (default) should be replaced with + + .. code:: python + + U, S, Vh = torch.linalg.svd(A, full_matrices=not some) + V = Vh.mH + + ``_, S, _ = torch.svd(A, some=some, compute_uv=False)`` should be replaced with + + .. code:: python + + S = torch.linalg.svdvals(A) + + .. note:: Differences with :func:`torch.linalg.svd`: + + * :attr:`some` is the opposite of + :func:`torch.linalg.svd`'s :attr:`full_matrices`. Note that + default value for both is `True`, so the default behavior is + effectively the opposite. + * :func:`torch.svd` returns `V`, whereas :func:`torch.linalg.svd` returns + `Vh`, that is, :math:`V^{\text{H}}`. + * If :attr:`compute_uv` is `False`, :func:`torch.svd` returns zero-filled + tensors for `U` and `Vh`, whereas :func:`torch.linalg.svd` returns + empty tensors. + + .. note:: The singular values are returned in descending order. If :attr:`input` is a batch of matrices, + then the singular values of each matrix in the batch are returned in descending order. + + .. note:: The `S` tensor can only be used to compute gradients if :attr:`compute_uv` is `True`. + + .. 
note:: When :attr:`some` is `False`, the gradients on `U[..., :, min(m, n):]` + and `V[..., :, min(m, n):]` will be ignored in the backward pass, as those vectors + can be arbitrary bases of the corresponding subspaces. + + .. note:: The implementation of :func:`torch.linalg.svd` on CPU uses LAPACK's routine `?gesdd` + (a divide-and-conquer algorithm) instead of `?gesvd` for speed. Analogously, + on GPU, it uses cuSOLVER's routines `gesvdj` and `gesvdjBatched` on CUDA 10.1.243 + and later, and MAGMA's routine `gesdd` on earlier versions of CUDA. + + .. note:: The returned `U` will not be contiguous. The matrix (or batch of matrices) will + be represented as a column-major matrix (i.e. Fortran-contiguous). + + .. warning:: The gradients with respect to `U` and `V` will only be finite when the input does not + have zero nor repeated singular values. + + .. warning:: If the distance between any two singular values is close to zero, the gradients with respect to + `U` and `V` will be numerically unstable, as they depends on + :math:`\frac{1}{\min_{i \neq j} \sigma_i^2 - \sigma_j^2}`. The same happens when the matrix + has small singular values, as these gradients also depend on `S^{-1}`. + + .. warning:: For complex-valued :attr:`input` the singular value decomposition is not unique, + as `U` and `V` may be multiplied by an arbitrary phase factor :math:`e^{i \phi}` on every column. + The same happens when :attr:`input` has repeated singular values, where one may multiply + the columns of the spanning subspace in `U` and `V` by a rotation matrix + and `the resulting vectors will span the same subspace`_. + Different platforms, like NumPy, or inputs on different device types, + may produce different `U` and `V` tensors. + + Args: + input (Tensor): the input tensor of size `(*, m, n)` where `*` is zero or more + batch dimensions consisting of `(m, n)` matrices. + some (bool, optional): controls whether to compute the reduced or full decomposition, and + consequently, the shape of returned `U` and `V`. Default: `True`. + compute_uv (bool, optional): controls whether to compute `U` and `V`. Default: `True`. + + Keyword args: + out (tuple, optional): the output tuple of tensors + + Example:: + + >>> a = torch.randn(5, 3) + >>> a + tensor([[ 0.2364, -0.7752, 0.6372], + [ 1.7201, 0.7394, -0.0504], + [-0.3371, -1.0584, 0.5296], + [ 0.3550, -0.4022, 1.5569], + [ 0.2445, -0.0158, 1.1414]]) + >>> u, s, v = torch.svd(a) + >>> u + tensor([[ 0.4027, 0.0287, 0.5434], + [-0.1946, 0.8833, 0.3679], + [ 0.4296, -0.2890, 0.5261], + [ 0.6604, 0.2717, -0.2618], + [ 0.4234, 0.2481, -0.4733]]) + >>> s + tensor([2.3289, 2.0315, 0.7806]) + >>> v + tensor([[-0.0199, 0.8766, 0.4809], + [-0.5080, 0.4054, -0.7600], + [ 0.8611, 0.2594, -0.4373]]) + >>> torch.dist(a, torch.mm(torch.mm(u, torch.diag(s)), v.t())) + tensor(8.6531e-07) + >>> a_big = torch.randn(7, 5, 3) + >>> u, s, v = torch.svd(a_big) + >>> torch.dist(a_big, torch.matmul(torch.matmul(u, torch.diag_embed(s)), v.mT)) + tensor(2.6503e-06) + + .. _the resulting vectors will span the same subspace: + (https://en.wikipedia.org/wiki/Singular_value_decomposition#Singular_values,_singular_vectors,_and_their_relation_to_the_SVD) + """ + ... +def swapaxes(input: Tensor, axis0: _int, axis1: _int) -> Tensor: + r""" + swapaxes(input, axis0, axis1) -> Tensor + + Alias for :func:`torch.transpose`. + + This function is equivalent to NumPy's swapaxes function. 
+ + Examples:: + + >>> x = torch.tensor([[[0,1],[2,3]],[[4,5],[6,7]]]) + >>> x + tensor([[[0, 1], + [2, 3]], + + [[4, 5], + [6, 7]]]) + >>> torch.swapaxes(x, 0, 1) + tensor([[[0, 1], + [4, 5]], + + [[2, 3], + [6, 7]]]) + >>> torch.swapaxes(x, 0, 2) + tensor([[[0, 4], + [2, 6]], + + [[1, 5], + [3, 7]]]) + """ + ... +def swapdims(input: Tensor, dim0: _int, dim1: _int) -> Tensor: + r""" + swapdims(input, dim0, dim1) -> Tensor + + Alias for :func:`torch.transpose`. + + This function is equivalent to NumPy's swapaxes function. + + Examples:: + + >>> x = torch.tensor([[[0,1],[2,3]],[[4,5],[6,7]]]) + >>> x + tensor([[[0, 1], + [2, 3]], + + [[4, 5], + [6, 7]]]) + >>> torch.swapdims(x, 0, 1) + tensor([[[0, 1], + [4, 5]], + + [[2, 3], + [6, 7]]]) + >>> torch.swapdims(x, 0, 2) + tensor([[[0, 4], + [2, 6]], + + [[1, 5], + [3, 7]]]) + """ + ... +def sym_constrain_range(size: Union[Number, _complex], *, min: Optional[_int] = None, max: Optional[_int] = None) -> None: ... +def sym_constrain_range_for_size(size: Union[Number, _complex], *, min: Optional[_int] = None, max: Optional[_int] = None) -> None: ... +def t(input: Tensor) -> Tensor: + r""" + t(input) -> Tensor + + Expects :attr:`input` to be <= 2-D tensor and transposes dimensions 0 + and 1. + + 0-D and 1-D tensors are returned as is. When input is a 2-D tensor this + is equivalent to ``transpose(input, 0, 1)``. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> x = torch.randn(()) + >>> x + tensor(0.1995) + >>> torch.t(x) + tensor(0.1995) + >>> x = torch.randn(3) + >>> x + tensor([ 2.4320, -0.4608, 0.7702]) + >>> torch.t(x) + tensor([ 2.4320, -0.4608, 0.7702]) + >>> x = torch.randn(2, 3) + >>> x + tensor([[ 0.4875, 0.9158, -0.5872], + [ 0.3938, -0.6929, 0.6932]]) + >>> torch.t(x) + tensor([[ 0.4875, 0.3938], + [ 0.9158, -0.6929], + [-0.5872, 0.6932]]) + + See also :func:`torch.transpose`. + """ + ... +def t_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.t`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def take(input: Tensor, index: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + take(input, index) -> Tensor + + Returns a new tensor with the elements of :attr:`input` at the given indices. + The input tensor is treated as if it were viewed as a 1-D tensor. The result + takes the same shape as the indices. + + Args: + input (Tensor): the input tensor. + index (LongTensor): the indices into tensor + + Example:: + + >>> src = torch.tensor([[4, 3, 5], + ... [6, 7, 8]]) + >>> torch.take(src, torch.tensor([0, 2, 5])) + tensor([ 4, 5, 8]) + """ + ... +def take_along_dim(input: Tensor, indices: Tensor, dim: Optional[_int] = None, *, out: Optional[Tensor] = None) -> Tensor: + r""" + take_along_dim(input, indices, dim=None, *, out=None) -> Tensor + + Selects values from :attr:`input` at the 1-dimensional indices from :attr:`indices` along the given :attr:`dim`. + + If :attr:`dim` is None, the input array is treated as if it has been flattened to 1d. + + Functions that return indices along a dimension, like :func:`torch.argmax` and :func:`torch.argsort`, + are designed to work with this function. See the examples below. + + .. note:: + This function is similar to NumPy's `take_along_axis`. + See also :func:`torch.gather`. + + Args: + input (Tensor): the input tensor. + indices (tensor): the indices into :attr:`input`. Must have long dtype. + dim (int, optional): dimension to select along. 
+ + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.tensor([[10, 30, 20], [60, 40, 50]]) + >>> max_idx = torch.argmax(t) + >>> torch.take_along_dim(t, max_idx) + tensor([60]) + >>> sorted_idx = torch.argsort(t, dim=1) + >>> torch.take_along_dim(t, sorted_idx, dim=1) + tensor([[10, 20, 30], + [40, 50, 60]]) + """ + ... +def tan(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + tan(input, *, out=None) -> Tensor + + Returns a new tensor with the tangent of the elements of :attr:`input`. + + .. math:: + \text{out}_{i} = \tan(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-1.2027, -1.7687, 0.4412, -1.3856]) + >>> torch.tan(a) + tensor([-2.5930, 4.9859, 0.4722, -5.3366]) + """ + ... +def tan_(input: Tensor) -> Tensor: ... +def tanh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + tanh(input, *, out=None) -> Tensor + + Returns a new tensor with the hyperbolic tangent of the elements + of :attr:`input`. + + .. math:: + \text{out}_{i} = \tanh(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.8986, -0.7279, 1.1745, 0.2611]) + >>> torch.tanh(a) + tensor([ 0.7156, -0.6218, 0.8257, 0.2553]) + """ + ... +def tanh_(input: Tensor) -> Tensor: ... +def tensor(data: Any, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + tensor(data, *, dtype=None, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Constructs a tensor with no autograd history (also known as a "leaf tensor", see :doc:`/notes/autograd`) by copying :attr:`data`. + + .. warning:: + + When working with tensors prefer using :func:`torch.Tensor.clone`, + :func:`torch.Tensor.detach`, and :func:`torch.Tensor.requires_grad_` for + readability. Letting `t` be a tensor, ``torch.tensor(t)`` is equivalent to + ``t.clone().detach()``, and ``torch.tensor(t, requires_grad=True)`` + is equivalent to ``t.clone().detach().requires_grad_(True)``. + + .. seealso:: + + :func:`torch.as_tensor` preserves autograd history and avoids copies where possible. + :func:`torch.from_numpy` creates a tensor that shares storage with a NumPy array. + + Args: + data (array_like): Initial data for the tensor. Can be a list, tuple, + NumPy ``ndarray``, scalar, and other types. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, infers data type from :attr:`data`. + device (:class:`torch.device`, optional): the device of the constructed tensor. If None and data is a tensor + then the device of data is used. If None and data is not a tensor then + the result tensor is constructed on the current device. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + + Example:: + + >>> torch.tensor([[0.1, 1.2], [2.2, 3.1], [4.9, 5.2]]) + tensor([[ 0.1000, 1.2000], + [ 2.2000, 3.1000], + [ 4.9000, 5.2000]]) + + >>> torch.tensor([0, 1]) # Type inference on data + tensor([ 0, 1]) + + >>> torch.tensor([[0.11111, 0.222222, 0.3333333]], + ... 
dtype=torch.float64, + ... device=torch.device('cuda:0')) # creates a double tensor on a CUDA device + tensor([[ 0.1111, 0.2222, 0.3333]], dtype=torch.float64, device='cuda:0') + + >>> torch.tensor(3.14159) # Create a zero-dimensional (scalar) tensor + tensor(3.1416) + + >>> torch.tensor([]) # Create an empty tensor (of size (0,)) + tensor([]) + """ + ... +@overload +def tensor_split(input: Tensor, tensor_indices_or_sections: Tensor, dim: _int = 0) -> Tuple[Tensor, ...]: + r""" + tensor_split(input, indices_or_sections, dim=0) -> List of Tensors + + Splits a tensor into multiple sub-tensors, all of which are views of :attr:`input`, + along dimension :attr:`dim` according to the indices or number of sections specified + by :attr:`indices_or_sections`. This function is based on NumPy's + :func:`numpy.array_split`. + + Args: + input (Tensor): the tensor to split + indices_or_sections (Tensor, int or list or tuple of ints): + If :attr:`indices_or_sections` is an integer ``n`` or a zero dimensional long tensor + with value ``n``, :attr:`input` is split into ``n`` sections along dimension :attr:`dim`. + If :attr:`input` is divisible by ``n`` along dimension :attr:`dim`, each + section will be of equal size, :code:`input.size(dim) / n`. If :attr:`input` + is not divisible by ``n``, the sizes of the first :code:`int(input.size(dim) % n)` + sections will have size :code:`int(input.size(dim) / n) + 1`, and the rest will + have size :code:`int(input.size(dim) / n)`. + + If :attr:`indices_or_sections` is a list or tuple of ints, or a one-dimensional long + tensor, then :attr:`input` is split along dimension :attr:`dim` at each of the indices + in the list, tuple or tensor. For instance, :code:`indices_or_sections=[2, 3]` and :code:`dim=0` + would result in the tensors :code:`input[:2]`, :code:`input[2:3]`, and :code:`input[3:]`. + + If :attr:`indices_or_sections` is a tensor, it must be a zero-dimensional or one-dimensional + long tensor on the CPU. + + dim (int, optional): dimension along which to split the tensor. Default: ``0`` + + Example:: + + >>> x = torch.arange(8) + >>> torch.tensor_split(x, 3) + (tensor([0, 1, 2]), tensor([3, 4, 5]), tensor([6, 7])) + + >>> x = torch.arange(7) + >>> torch.tensor_split(x, 3) + (tensor([0, 1, 2]), tensor([3, 4]), tensor([5, 6])) + >>> torch.tensor_split(x, (1, 6)) + (tensor([0]), tensor([1, 2, 3, 4, 5]), tensor([6])) + + >>> x = torch.arange(14).reshape(2, 7) + >>> x + tensor([[ 0, 1, 2, 3, 4, 5, 6], + [ 7, 8, 9, 10, 11, 12, 13]]) + >>> torch.tensor_split(x, 3, dim=1) + (tensor([[0, 1, 2], + [7, 8, 9]]), + tensor([[ 3, 4], + [10, 11]]), + tensor([[ 5, 6], + [12, 13]])) + >>> torch.tensor_split(x, (1, 6), dim=1) + (tensor([[0], + [7]]), + tensor([[ 1, 2, 3, 4, 5], + [ 8, 9, 10, 11, 12]]), + tensor([[ 6], + [13]])) + """ + ... +@overload +def tensor_split(input: Tensor, sections: Union[_int, SymInt], dim: _int = 0) -> Tuple[Tensor, ...]: + r""" + tensor_split(input, indices_or_sections, dim=0) -> List of Tensors + + Splits a tensor into multiple sub-tensors, all of which are views of :attr:`input`, + along dimension :attr:`dim` according to the indices or number of sections specified + by :attr:`indices_or_sections`. This function is based on NumPy's + :func:`numpy.array_split`. + + Args: + input (Tensor): the tensor to split + indices_or_sections (Tensor, int or list or tuple of ints): + If :attr:`indices_or_sections` is an integer ``n`` or a zero dimensional long tensor + with value ``n``, :attr:`input` is split into ``n`` sections along dimension :attr:`dim`. 
+ If :attr:`input` is divisible by ``n`` along dimension :attr:`dim`, each + section will be of equal size, :code:`input.size(dim) / n`. If :attr:`input` + is not divisible by ``n``, the sizes of the first :code:`int(input.size(dim) % n)` + sections will have size :code:`int(input.size(dim) / n) + 1`, and the rest will + have size :code:`int(input.size(dim) / n)`. + + If :attr:`indices_or_sections` is a list or tuple of ints, or a one-dimensional long + tensor, then :attr:`input` is split along dimension :attr:`dim` at each of the indices + in the list, tuple or tensor. For instance, :code:`indices_or_sections=[2, 3]` and :code:`dim=0` + would result in the tensors :code:`input[:2]`, :code:`input[2:3]`, and :code:`input[3:]`. + + If :attr:`indices_or_sections` is a tensor, it must be a zero-dimensional or one-dimensional + long tensor on the CPU. + + dim (int, optional): dimension along which to split the tensor. Default: ``0`` + + Example:: + + >>> x = torch.arange(8) + >>> torch.tensor_split(x, 3) + (tensor([0, 1, 2]), tensor([3, 4, 5]), tensor([6, 7])) + + >>> x = torch.arange(7) + >>> torch.tensor_split(x, 3) + (tensor([0, 1, 2]), tensor([3, 4]), tensor([5, 6])) + >>> torch.tensor_split(x, (1, 6)) + (tensor([0]), tensor([1, 2, 3, 4, 5]), tensor([6])) + + >>> x = torch.arange(14).reshape(2, 7) + >>> x + tensor([[ 0, 1, 2, 3, 4, 5, 6], + [ 7, 8, 9, 10, 11, 12, 13]]) + >>> torch.tensor_split(x, 3, dim=1) + (tensor([[0, 1, 2], + [7, 8, 9]]), + tensor([[ 3, 4], + [10, 11]]), + tensor([[ 5, 6], + [12, 13]])) + >>> torch.tensor_split(x, (1, 6), dim=1) + (tensor([[0], + [7]]), + tensor([[ 1, 2, 3, 4, 5], + [ 8, 9, 10, 11, 12]]), + tensor([[ 6], + [13]])) + """ + ... +@overload +def tensor_split(input: Tensor, indices: Sequence[Union[_int, SymInt]], dim: _int = 0) -> Tuple[Tensor, ...]: + r""" + tensor_split(input, indices_or_sections, dim=0) -> List of Tensors + + Splits a tensor into multiple sub-tensors, all of which are views of :attr:`input`, + along dimension :attr:`dim` according to the indices or number of sections specified + by :attr:`indices_or_sections`. This function is based on NumPy's + :func:`numpy.array_split`. + + Args: + input (Tensor): the tensor to split + indices_or_sections (Tensor, int or list or tuple of ints): + If :attr:`indices_or_sections` is an integer ``n`` or a zero dimensional long tensor + with value ``n``, :attr:`input` is split into ``n`` sections along dimension :attr:`dim`. + If :attr:`input` is divisible by ``n`` along dimension :attr:`dim`, each + section will be of equal size, :code:`input.size(dim) / n`. If :attr:`input` + is not divisible by ``n``, the sizes of the first :code:`int(input.size(dim) % n)` + sections will have size :code:`int(input.size(dim) / n) + 1`, and the rest will + have size :code:`int(input.size(dim) / n)`. + + If :attr:`indices_or_sections` is a list or tuple of ints, or a one-dimensional long + tensor, then :attr:`input` is split along dimension :attr:`dim` at each of the indices + in the list, tuple or tensor. For instance, :code:`indices_or_sections=[2, 3]` and :code:`dim=0` + would result in the tensors :code:`input[:2]`, :code:`input[2:3]`, and :code:`input[3:]`. + + If :attr:`indices_or_sections` is a tensor, it must be a zero-dimensional or one-dimensional + long tensor on the CPU. + + dim (int, optional): dimension along which to split the tensor. 
Default: ``0`` + + Example:: + + >>> x = torch.arange(8) + >>> torch.tensor_split(x, 3) + (tensor([0, 1, 2]), tensor([3, 4, 5]), tensor([6, 7])) + + >>> x = torch.arange(7) + >>> torch.tensor_split(x, 3) + (tensor([0, 1, 2]), tensor([3, 4]), tensor([5, 6])) + >>> torch.tensor_split(x, (1, 6)) + (tensor([0]), tensor([1, 2, 3, 4, 5]), tensor([6])) + + >>> x = torch.arange(14).reshape(2, 7) + >>> x + tensor([[ 0, 1, 2, 3, 4, 5, 6], + [ 7, 8, 9, 10, 11, 12, 13]]) + >>> torch.tensor_split(x, 3, dim=1) + (tensor([[0, 1, 2], + [7, 8, 9]]), + tensor([[ 3, 4], + [10, 11]]), + tensor([[ 5, 6], + [12, 13]])) + >>> torch.tensor_split(x, (1, 6), dim=1) + (tensor([[0], + [7]]), + tensor([[ 1, 2, 3, 4, 5], + [ 8, 9, 10, 11, 12]]), + tensor([[ 6], + [13]])) + """ + ... +def threshold(input: Tensor, threshold: Union[Number, _complex], value: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: ... +def threshold_(input: Tensor, threshold: Union[Number, _complex], value: Union[Number, _complex]) -> Tensor: ... +def tile(input: Tensor, dims: Sequence[Union[_int, SymInt]]) -> Tensor: + r""" + tile(input, dims) -> Tensor + + Constructs a tensor by repeating the elements of :attr:`input`. + The :attr:`dims` argument specifies the number of repetitions + in each dimension. + + If :attr:`dims` specifies fewer dimensions than :attr:`input` has, then + ones are prepended to :attr:`dims` until all dimensions are specified. + For example, if :attr:`input` has shape (8, 6, 4, 2) and :attr:`dims` + is (2, 2), then :attr:`dims` is treated as (1, 1, 2, 2). + + Analogously, if :attr:`input` has fewer dimensions than :attr:`dims` + specifies, then :attr:`input` is treated as if it were unsqueezed at + dimension zero until it has as many dimensions as :attr:`dims` specifies. + For example, if :attr:`input` has shape (4, 2) and :attr:`dims` + is (3, 3, 2, 2), then :attr:`input` is treated as if it had the + shape (1, 1, 4, 2). + + .. note:: + + This function is similar to NumPy's tile function. + + Args: + input (Tensor): the tensor whose elements to repeat. + dims (tuple): the number of repetitions per dimension. + + Example:: + + >>> x = torch.tensor([1, 2, 3]) + >>> x.tile((2,)) + tensor([1, 2, 3, 1, 2, 3]) + >>> y = torch.tensor([[1, 2], [3, 4]]) + >>> torch.tile(y, (2, 2)) + tensor([[1, 2, 1, 2], + [3, 4, 3, 4], + [1, 2, 1, 2], + [3, 4, 3, 4]]) + """ + ... +def topk(input: Tensor, k: Union[_int, SymInt], dim: _int = -1, largest: _bool = True, sorted: _bool = True, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.topk: + r""" + topk(input, k, dim=None, largest=True, sorted=True, *, out=None) -> (Tensor, LongTensor) + + Returns the :attr:`k` largest elements of the given :attr:`input` tensor along + a given dimension. + + If :attr:`dim` is not given, the last dimension of the `input` is chosen. + + If :attr:`largest` is ``False`` then the `k` smallest elements are returned. + + A namedtuple of `(values, indices)` is returned with the `values` and + `indices` of the largest `k` elements of each row of the `input` tensor in the + given dimension `dim`. + + The boolean option :attr:`sorted` if ``True``, will make sure that the returned + `k` elements are themselves sorted + + Args: + input (Tensor): the input tensor. 
+ k (int): the k in "top-k" + dim (int, optional): the dimension to sort along + largest (bool, optional): controls whether to return largest or + smallest elements + sorted (bool, optional): controls whether to return the elements + in sorted order + + Keyword args: + out (tuple, optional): the output tuple of (Tensor, LongTensor) that can be + optionally given to be used as output buffers + + Example:: + + >>> x = torch.arange(1., 6.) + >>> x + tensor([ 1., 2., 3., 4., 5.]) + >>> torch.topk(x, 3) + torch.return_types.topk(values=tensor([5., 4., 3.]), indices=tensor([4, 3, 2])) + """ + ... +def trace(input: Tensor) -> Tensor: + r""" + trace(input) -> Tensor + + Returns the sum of the elements of the diagonal of the input 2-D matrix. + + Example:: + + >>> x = torch.arange(1., 10.).view(3, 3) + >>> x + tensor([[ 1., 2., 3.], + [ 4., 5., 6.], + [ 7., 8., 9.]]) + >>> torch.trace(x) + tensor(15.) + """ + ... +@overload +def transpose(input: Tensor, dim0: _int, dim1: _int) -> Tensor: + r""" + transpose(input, dim0, dim1) -> Tensor + + Returns a tensor that is a transposed version of :attr:`input`. + The given dimensions :attr:`dim0` and :attr:`dim1` are swapped. + + If :attr:`input` is a strided tensor then the resulting :attr:`out` + tensor shares its underlying storage with the :attr:`input` tensor, so + changing the content of one would change the content of the other. + + If :attr:`input` is a :ref:`sparse tensor ` then the + resulting :attr:`out` tensor *does not* share the underlying storage + with the :attr:`input` tensor. + + If :attr:`input` is a :ref:`sparse tensor ` with compressed + layout (SparseCSR, SparseBSR, SparseCSC or SparseBSC) the arguments + :attr:`dim0` and :attr:`dim1` must be both batch dimensions, or must + both be sparse dimensions. The batch dimensions of a sparse tensor are the + dimensions preceding the sparse dimensions. + + .. note:: + Transpositions which interchange the sparse dimensions of a `SparseCSR` + or `SparseCSC` layout tensor will result in the layout changing between + the two options. Transposition of the sparse dimensions of a ` SparseBSR` + or `SparseBSC` layout tensor will likewise generate a result with the + opposite layout. + + + Args: + input (Tensor): the input tensor. + dim0 (int): the first dimension to be transposed + dim1 (int): the second dimension to be transposed + + Example:: + + >>> x = torch.randn(2, 3) + >>> x + tensor([[ 1.0028, -0.9893, 0.5809], + [-0.1669, 0.7299, 0.4942]]) + >>> torch.transpose(x, 0, 1) + tensor([[ 1.0028, -0.1669], + [-0.9893, 0.7299], + [ 0.5809, 0.4942]]) + + See also :func:`torch.t`. + """ + ... +@overload +def transpose(input: Tensor, dim0: Union[str, ellipsis, None], dim1: Union[str, ellipsis, None]) -> Tensor: + r""" + transpose(input, dim0, dim1) -> Tensor + + Returns a tensor that is a transposed version of :attr:`input`. + The given dimensions :attr:`dim0` and :attr:`dim1` are swapped. + + If :attr:`input` is a strided tensor then the resulting :attr:`out` + tensor shares its underlying storage with the :attr:`input` tensor, so + changing the content of one would change the content of the other. + + If :attr:`input` is a :ref:`sparse tensor ` then the + resulting :attr:`out` tensor *does not* share the underlying storage + with the :attr:`input` tensor. + + If :attr:`input` is a :ref:`sparse tensor ` with compressed + layout (SparseCSR, SparseBSR, SparseCSC or SparseBSC) the arguments + :attr:`dim0` and :attr:`dim1` must be both batch dimensions, or must + both be sparse dimensions. 
The batch dimensions of a sparse tensor are the + dimensions preceding the sparse dimensions. + + .. note:: + Transpositions which interchange the sparse dimensions of a `SparseCSR` + or `SparseCSC` layout tensor will result in the layout changing between + the two options. Transposition of the sparse dimensions of a ` SparseBSR` + or `SparseBSC` layout tensor will likewise generate a result with the + opposite layout. + + + Args: + input (Tensor): the input tensor. + dim0 (int): the first dimension to be transposed + dim1 (int): the second dimension to be transposed + + Example:: + + >>> x = torch.randn(2, 3) + >>> x + tensor([[ 1.0028, -0.9893, 0.5809], + [-0.1669, 0.7299, 0.4942]]) + >>> torch.transpose(x, 0, 1) + tensor([[ 1.0028, -0.1669], + [-0.9893, 0.7299], + [ 0.5809, 0.4942]]) + + See also :func:`torch.t`. + """ + ... +def transpose_copy(input: Tensor, dim0: _int, dim1: _int, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.transpose`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +@overload +def trapezoid(y: Tensor, x: Tensor, *, dim: _int = -1) -> Tensor: + r""" + trapezoid(y, x=None, *, dx=None, dim=-1) -> Tensor + + Computes the `trapezoidal rule `_ along + :attr:`dim`. By default the spacing between elements is assumed to be 1, but + :attr:`dx` can be used to specify a different constant spacing, and :attr:`x` can be + used to specify arbitrary spacing along :attr:`dim`. + + + Assuming :attr:`y` is a one-dimensional tensor with elements :math:`{y_0, y_1, ..., y_n}`, + the default computation is + + .. math:: + \begin{aligned} + \sum_{i = 1}^{n-1} \frac{1}{2} (y_i + y_{i-1}) + \end{aligned} + + When :attr:`dx` is specified the computation becomes + + .. math:: + \begin{aligned} + \sum_{i = 1}^{n-1} \frac{\Delta x}{2} (y_i + y_{i-1}) + \end{aligned} + + effectively multiplying the result by :attr:`dx`. When :attr:`x` is specified, + assuming :attr:`x` is also a one-dimensional tensor with + elements :math:`{x_0, x_1, ..., x_n}`, the computation becomes + + .. math:: + \begin{aligned} + \sum_{i = 1}^{n-1} \frac{(x_i - x_{i-1})}{2} (y_i + y_{i-1}) + \end{aligned} + + When :attr:`x` and :attr:`y` have the same size, the computation is as described above and no broadcasting is needed. + The broadcasting behavior of this function is as follows when their sizes are different. For both :attr:`x` + and :attr:`y`, the function computes the difference between consecutive elements along + dimension :attr:`dim`. This effectively creates two tensors, `x_diff` and `y_diff`, that have + the same shape as the original tensors except their lengths along the dimension :attr:`dim` is reduced by 1. + After that, those two tensors are broadcast together to compute final output as part of the trapezoidal rule. + See the examples below for details. + + .. note:: + The trapezoidal rule is a technique for approximating the definite integral of a function + by averaging its left and right Riemann sums. The approximation becomes more accurate as + the resolution of the partition increases. + + Arguments: + y (Tensor): Values to use when computing the trapezoidal rule. + x (Tensor): If specified, defines spacing between values as specified above. + + Keyword arguments: + dx (float): constant spacing between values. If neither :attr:`x` or :attr:`dx` + are specified then this defaults to 1. Effectively multiplies the result by its value. + dim (int): The dimension along which to compute the trapezoidal rule. 
+ The last (inner-most) dimension by default. + + Examples:: + + >>> # Computes the trapezoidal rule in 1D, spacing is implicitly 1 + >>> y = torch.tensor([1, 5, 10]) + >>> torch.trapezoid(y) + tensor(10.5) + + >>> # Computes the same trapezoidal rule directly to verify + >>> (1 + 10 + 10) / 2 + 10.5 + + >>> # Computes the trapezoidal rule in 1D with constant spacing of 2 + >>> # NOTE: the result is the same as before, but multiplied by 2 + >>> torch.trapezoid(y, dx=2) + 21.0 + + >>> # Computes the trapezoidal rule in 1D with arbitrary spacing + >>> x = torch.tensor([1, 3, 6]) + >>> torch.trapezoid(y, x) + 28.5 + + >>> # Computes the same trapezoidal rule directly to verify + >>> ((3 - 1) * (1 + 5) + (6 - 3) * (5 + 10)) / 2 + 28.5 + + >>> # Computes the trapezoidal rule for each row of a 3x3 matrix + >>> y = torch.arange(9).reshape(3, 3) + tensor([[0, 1, 2], + [3, 4, 5], + [6, 7, 8]]) + >>> torch.trapezoid(y) + tensor([ 2., 8., 14.]) + + >>> # Computes the trapezoidal rule for each column of the matrix + >>> torch.trapezoid(y, dim=0) + tensor([ 6., 8., 10.]) + + >>> # Computes the trapezoidal rule for each row of a 3x3 ones matrix + >>> # with the same arbitrary spacing + >>> y = torch.ones(3, 3) + >>> x = torch.tensor([1, 3, 6]) + >>> torch.trapezoid(y, x) + array([5., 5., 5.]) + + >>> # Computes the trapezoidal rule for each row of a 3x3 ones matrix + >>> # with different arbitrary spacing per row + >>> y = torch.ones(3, 3) + >>> x = torch.tensor([[1, 2, 3], [1, 3, 5], [1, 4, 7]]) + >>> torch.trapezoid(y, x) + array([2., 4., 6.]) + """ + ... +@overload +def trapezoid(y: Tensor, *, dx: Union[Number, _complex] = 1, dim: _int = -1) -> Tensor: + r""" + trapezoid(y, x=None, *, dx=None, dim=-1) -> Tensor + + Computes the `trapezoidal rule `_ along + :attr:`dim`. By default the spacing between elements is assumed to be 1, but + :attr:`dx` can be used to specify a different constant spacing, and :attr:`x` can be + used to specify arbitrary spacing along :attr:`dim`. + + + Assuming :attr:`y` is a one-dimensional tensor with elements :math:`{y_0, y_1, ..., y_n}`, + the default computation is + + .. math:: + \begin{aligned} + \sum_{i = 1}^{n-1} \frac{1}{2} (y_i + y_{i-1}) + \end{aligned} + + When :attr:`dx` is specified the computation becomes + + .. math:: + \begin{aligned} + \sum_{i = 1}^{n-1} \frac{\Delta x}{2} (y_i + y_{i-1}) + \end{aligned} + + effectively multiplying the result by :attr:`dx`. When :attr:`x` is specified, + assuming :attr:`x` is also a one-dimensional tensor with + elements :math:`{x_0, x_1, ..., x_n}`, the computation becomes + + .. math:: + \begin{aligned} + \sum_{i = 1}^{n-1} \frac{(x_i - x_{i-1})}{2} (y_i + y_{i-1}) + \end{aligned} + + When :attr:`x` and :attr:`y` have the same size, the computation is as described above and no broadcasting is needed. + The broadcasting behavior of this function is as follows when their sizes are different. For both :attr:`x` + and :attr:`y`, the function computes the difference between consecutive elements along + dimension :attr:`dim`. This effectively creates two tensors, `x_diff` and `y_diff`, that have + the same shape as the original tensors except their lengths along the dimension :attr:`dim` is reduced by 1. + After that, those two tensors are broadcast together to compute final output as part of the trapezoidal rule. + See the examples below for details. + + .. note:: + The trapezoidal rule is a technique for approximating the definite integral of a function + by averaging its left and right Riemann sums. 
The approximation becomes more accurate as + the resolution of the partition increases. + + Arguments: + y (Tensor): Values to use when computing the trapezoidal rule. + x (Tensor): If specified, defines spacing between values as specified above. + + Keyword arguments: + dx (float): constant spacing between values. If neither :attr:`x` or :attr:`dx` + are specified then this defaults to 1. Effectively multiplies the result by its value. + dim (int): The dimension along which to compute the trapezoidal rule. + The last (inner-most) dimension by default. + + Examples:: + + >>> # Computes the trapezoidal rule in 1D, spacing is implicitly 1 + >>> y = torch.tensor([1, 5, 10]) + >>> torch.trapezoid(y) + tensor(10.5) + + >>> # Computes the same trapezoidal rule directly to verify + >>> (1 + 10 + 10) / 2 + 10.5 + + >>> # Computes the trapezoidal rule in 1D with constant spacing of 2 + >>> # NOTE: the result is the same as before, but multiplied by 2 + >>> torch.trapezoid(y, dx=2) + 21.0 + + >>> # Computes the trapezoidal rule in 1D with arbitrary spacing + >>> x = torch.tensor([1, 3, 6]) + >>> torch.trapezoid(y, x) + 28.5 + + >>> # Computes the same trapezoidal rule directly to verify + >>> ((3 - 1) * (1 + 5) + (6 - 3) * (5 + 10)) / 2 + 28.5 + + >>> # Computes the trapezoidal rule for each row of a 3x3 matrix + >>> y = torch.arange(9).reshape(3, 3) + tensor([[0, 1, 2], + [3, 4, 5], + [6, 7, 8]]) + >>> torch.trapezoid(y) + tensor([ 2., 8., 14.]) + + >>> # Computes the trapezoidal rule for each column of the matrix + >>> torch.trapezoid(y, dim=0) + tensor([ 6., 8., 10.]) + + >>> # Computes the trapezoidal rule for each row of a 3x3 ones matrix + >>> # with the same arbitrary spacing + >>> y = torch.ones(3, 3) + >>> x = torch.tensor([1, 3, 6]) + >>> torch.trapezoid(y, x) + array([5., 5., 5.]) + + >>> # Computes the trapezoidal rule for each row of a 3x3 ones matrix + >>> # with different arbitrary spacing per row + >>> y = torch.ones(3, 3) + >>> x = torch.tensor([[1, 2, 3], [1, 3, 5], [1, 4, 7]]) + >>> torch.trapezoid(y, x) + array([2., 4., 6.]) + """ + ... +@overload +def trapz(y: Tensor, *, dx: _float = 1, dim: _int = -1) -> Tensor: + r""" + trapz(y, x, *, dim=-1) -> Tensor + + Alias for :func:`torch.trapezoid`. + """ + ... +@overload +def trapz(y: Tensor, x: Tensor, *, dim: _int = -1) -> Tensor: + r""" + trapz(y, x, *, dim=-1) -> Tensor + + Alias for :func:`torch.trapezoid`. + """ + ... +def triangular_solve(input: Tensor, A: Tensor, upper: _bool = True, transpose: _bool = False, unitriangular: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.triangular_solve: + r""" + triangular_solve(b, A, upper=True, transpose=False, unitriangular=False, *, out=None) -> (Tensor, Tensor) + + Solves a system of equations with a square upper or lower triangular invertible matrix :math:`A` + and multiple right-hand sides :math:`b`. + + In symbols, it solves :math:`AX = b` and assumes :math:`A` is square upper-triangular + (or lower-triangular if :attr:`upper`\ `= False`) and does not have zeros on the diagonal. + + `torch.triangular_solve(b, A)` can take in 2D inputs `b, A` or inputs that are + batches of 2D matrices. If the inputs are batches, then returns + batched outputs `X` + + If the diagonal of :attr:`A` contains zeros or elements that are very close to zero and + :attr:`unitriangular`\ `= False` (default) or if the input matrix is badly conditioned, + the result may contain `NaN` s. + + Supports input of float, double, cfloat and cdouble data types. 
+
+    .. warning::
+
+        :func:`torch.triangular_solve` is deprecated in favor of :func:`torch.linalg.solve_triangular`
+        and will be removed in a future PyTorch release.
+        :func:`torch.linalg.solve_triangular` has its arguments reversed and does not return a
+        copy of one of the inputs.
+
+        ``X = torch.triangular_solve(B, A).solution`` should be replaced with
+
+        .. code:: python
+
+            X = torch.linalg.solve_triangular(A, B)
+
+    Args:
+        b (Tensor): multiple right-hand sides of size :math:`(*, m, k)` where
+                :math:`*` is zero or more batch dimensions
+        A (Tensor): the input triangular coefficient matrix of size :math:`(*, m, m)`
+                where :math:`*` is zero or more batch dimensions
+        upper (bool, optional): whether :math:`A` is upper or lower triangular. Default: ``True``.
+        transpose (bool, optional): solves `op(A)X = b` where `op(A) = A^T` if this flag is ``True``,
+                and `op(A) = A` if it is ``False``. Default: ``False``.
+        unitriangular (bool, optional): whether :math:`A` is unit triangular.
+            If True, the diagonal elements of :math:`A` are assumed to be
+            1 and not referenced from :math:`A`. Default: ``False``.
+
+    Keyword args:
+        out ((Tensor, Tensor), optional): tuple of two tensors to write
+            the output to. Ignored if `None`. Default: `None`.
+
+    Returns:
+        A namedtuple `(solution, cloned_coefficient)` where `cloned_coefficient`
+        is a clone of :math:`A` and `solution` is the solution :math:`X` to :math:`AX = b`
+        (or whatever variant of the system of equations, depending on the keyword arguments.)
+
+    Examples::
+
+        >>> A = torch.randn(2, 2).triu()
+        >>> A
+        tensor([[ 1.1527, -1.0753],
+                [ 0.0000,  0.7986]])
+        >>> b = torch.randn(2, 3)
+        >>> b
+        tensor([[-0.0210,  2.3513, -1.5492],
+                [ 1.5429,  0.7403, -1.0243]])
+        >>> torch.triangular_solve(b, A)
+        torch.return_types.triangular_solve(
+        solution=tensor([[ 1.7841,  2.9046, -2.5405],
+                [ 1.9320,  0.9270, -1.2826]]),
+        cloned_coefficient=tensor([[ 1.1527, -1.0753],
+                [ 0.0000,  0.7986]]))
+    """
+    ...
+def tril(input: Tensor, diagonal: _int = 0, *, out: Optional[Tensor] = None) -> Tensor:
+    r"""
+    tril(input, diagonal=0, *, out=None) -> Tensor
+
+    Returns the lower triangular part of the matrix (2-D tensor) or batch of matrices
+    :attr:`input`, the other elements of the result tensor :attr:`out` are set to 0.
+
+    The lower triangular part of the matrix is defined as the elements on and
+    below the diagonal.
+
+    The argument :attr:`diagonal` controls which diagonal to consider. If
+    :attr:`diagonal` = 0, all elements on and below the main diagonal are
+    retained. A positive value includes just as many diagonals above the main
+    diagonal, and similarly a negative value excludes just as many diagonals below
+    the main diagonal. The main diagonal is the set of indices
+    :math:`\lbrace (i, i) \rbrace` for :math:`i \in [0, \min\{d_{1}, d_{2}\} - 1]` where
+    :math:`d_{1}, d_{2}` are the dimensions of the matrix.
+
+    Args:
+        input (Tensor): the input tensor.
+        diagonal (int, optional): the diagonal to consider
+
+    Keyword args:
+        out (Tensor, optional): the output tensor.
+ + Example:: + + >>> a = torch.randn(3, 3) + >>> a + tensor([[-1.0813, -0.8619, 0.7105], + [ 0.0935, 0.1380, 2.2112], + [-0.3409, -0.9828, 0.0289]]) + >>> torch.tril(a) + tensor([[-1.0813, 0.0000, 0.0000], + [ 0.0935, 0.1380, 0.0000], + [-0.3409, -0.9828, 0.0289]]) + + >>> b = torch.randn(4, 6) + >>> b + tensor([[ 1.2219, 0.5653, -0.2521, -0.2345, 1.2544, 0.3461], + [ 0.4785, -0.4477, 0.6049, 0.6368, 0.8775, 0.7145], + [ 1.1502, 3.2716, -1.1243, -0.5413, 0.3615, 0.6864], + [-0.0614, -0.7344, -1.3164, -0.7648, -1.4024, 0.0978]]) + >>> torch.tril(b, diagonal=1) + tensor([[ 1.2219, 0.5653, 0.0000, 0.0000, 0.0000, 0.0000], + [ 0.4785, -0.4477, 0.6049, 0.0000, 0.0000, 0.0000], + [ 1.1502, 3.2716, -1.1243, -0.5413, 0.0000, 0.0000], + [-0.0614, -0.7344, -1.3164, -0.7648, -1.4024, 0.0000]]) + >>> torch.tril(b, diagonal=-1) + tensor([[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [ 0.4785, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [ 1.1502, 3.2716, 0.0000, 0.0000, 0.0000, 0.0000], + [-0.0614, -0.7344, -1.3164, 0.0000, 0.0000, 0.0000]]) + """ + ... +def tril_indices(row: _int, col: _int, offset: _int = 0, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + tril_indices(row, col, offset=0, *, dtype=torch.long, device='cpu', layout=torch.strided) -> Tensor + + Returns the indices of the lower triangular part of a :attr:`row`-by- + :attr:`col` matrix in a 2-by-N Tensor, where the first row contains row + coordinates of all indices and the second row contains column coordinates. + Indices are ordered based on rows and then columns. + + The lower triangular part of the matrix is defined as the elements on and + below the diagonal. + + The argument :attr:`offset` controls which diagonal to consider. If + :attr:`offset` = 0, all elements on and below the main diagonal are + retained. A positive value includes just as many diagonals above the main + diagonal, and similarly a negative value excludes just as many diagonals below + the main diagonal. The main diagonal are the set of indices + :math:`\lbrace (i, i) \rbrace` for :math:`i \in [0, \min\{d_{1}, d_{2}\} - 1]` + where :math:`d_{1}, d_{2}` are the dimensions of the matrix. + + .. note:: + When running on CUDA, ``row * col`` must be less than :math:`2^{59}` to + prevent overflow during calculation. + + Args: + row (``int``): number of rows in the 2-D matrix. + col (``int``): number of columns in the 2-D matrix. + offset (``int``): diagonal offset from the main diagonal. + Default: if not provided, 0. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, ``torch.long``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + layout (:class:`torch.layout`, optional): currently only support ``torch.strided``. + + Example:: + + >>> a = torch.tril_indices(3, 3) + >>> a + tensor([[0, 1, 1, 2, 2, 2], + [0, 0, 1, 0, 1, 2]]) + + >>> a = torch.tril_indices(4, 3, -1) + >>> a + tensor([[1, 2, 2, 3, 3, 3], + [0, 0, 1, 0, 1, 2]]) + + >>> a = torch.tril_indices(4, 3, 1) + >>> a + tensor([[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3], + [0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2]]) + """ + ... 
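+# A minimal illustrative sketch of how the two functions above relate:
+# torch.tril_indices enumerates exactly the positions that torch.tril retains
+# for the same diagonal offset, so either can be used to build a lower-triangular mask.
+#
+#     >>> mask = torch.zeros(4, 3, dtype=torch.bool)
+#     >>> idx = torch.tril_indices(4, 3, offset=-1)
+#     >>> mask[idx[0], idx[1]] = True
+#     >>> torch.equal(mask, torch.tril(torch.ones(4, 3, dtype=torch.bool), diagonal=-1))
+#     True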
+def triplet_margin_loss(anchor: Tensor, positive: Tensor, negative: Tensor, margin: _float = 1.0, p: _float = 2, eps: _float = 1e-06, swap: _bool = False, reduction: _int = 1) -> Tensor: ... +def triu(input: Tensor, diagonal: _int = 0, *, out: Optional[Tensor] = None) -> Tensor: + r""" + triu(input, diagonal=0, *, out=None) -> Tensor + + Returns the upper triangular part of a matrix (2-D tensor) or batch of matrices + :attr:`input`, the other elements of the result tensor :attr:`out` are set to 0. + + The upper triangular part of the matrix is defined as the elements on and + above the diagonal. + + The argument :attr:`diagonal` controls which diagonal to consider. If + :attr:`diagonal` = 0, all elements on and above the main diagonal are + retained. A positive value excludes just as many diagonals above the main + diagonal, and similarly a negative value includes just as many diagonals below + the main diagonal. The main diagonal are the set of indices + :math:`\lbrace (i, i) \rbrace` for :math:`i \in [0, \min\{d_{1}, d_{2}\} - 1]` where + :math:`d_{1}, d_{2}` are the dimensions of the matrix. + + Args: + input (Tensor): the input tensor. + diagonal (int, optional): the diagonal to consider + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(3, 3) + >>> a + tensor([[ 0.2309, 0.5207, 2.0049], + [ 0.2072, -1.0680, 0.6602], + [ 0.3480, -0.5211, -0.4573]]) + >>> torch.triu(a) + tensor([[ 0.2309, 0.5207, 2.0049], + [ 0.0000, -1.0680, 0.6602], + [ 0.0000, 0.0000, -0.4573]]) + >>> torch.triu(a, diagonal=1) + tensor([[ 0.0000, 0.5207, 2.0049], + [ 0.0000, 0.0000, 0.6602], + [ 0.0000, 0.0000, 0.0000]]) + >>> torch.triu(a, diagonal=-1) + tensor([[ 0.2309, 0.5207, 2.0049], + [ 0.2072, -1.0680, 0.6602], + [ 0.0000, -0.5211, -0.4573]]) + + >>> b = torch.randn(4, 6) + >>> b + tensor([[ 0.5876, -0.0794, -1.8373, 0.6654, 0.2604, 1.5235], + [-0.2447, 0.9556, -1.2919, 1.3378, -0.1768, -1.0857], + [ 0.4333, 0.3146, 0.6576, -1.0432, 0.9348, -0.4410], + [-0.9888, 1.0679, -1.3337, -1.6556, 0.4798, 0.2830]]) + >>> torch.triu(b, diagonal=1) + tensor([[ 0.0000, -0.0794, -1.8373, 0.6654, 0.2604, 1.5235], + [ 0.0000, 0.0000, -1.2919, 1.3378, -0.1768, -1.0857], + [ 0.0000, 0.0000, 0.0000, -1.0432, 0.9348, -0.4410], + [ 0.0000, 0.0000, 0.0000, 0.0000, 0.4798, 0.2830]]) + >>> torch.triu(b, diagonal=-1) + tensor([[ 0.5876, -0.0794, -1.8373, 0.6654, 0.2604, 1.5235], + [-0.2447, 0.9556, -1.2919, 1.3378, -0.1768, -1.0857], + [ 0.0000, 0.3146, 0.6576, -1.0432, 0.9348, -0.4410], + [ 0.0000, 0.0000, -1.3337, -1.6556, 0.4798, 0.2830]]) + """ + ... +def triu_indices(row: _int, col: _int, offset: _int = 0, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + triu_indices(row, col, offset=0, *, dtype=torch.long, device='cpu', layout=torch.strided) -> Tensor + + Returns the indices of the upper triangular part of a :attr:`row` by + :attr:`col` matrix in a 2-by-N Tensor, where the first row contains row + coordinates of all indices and the second row contains column coordinates. + Indices are ordered based on rows and then columns. + + The upper triangular part of the matrix is defined as the elements on and + above the diagonal. + + The argument :attr:`offset` controls which diagonal to consider. If + :attr:`offset` = 0, all elements on and above the main diagonal are + retained. 
A positive value excludes just as many diagonals above the main + diagonal, and similarly a negative value includes just as many diagonals below + the main diagonal. The main diagonal are the set of indices + :math:`\lbrace (i, i) \rbrace` for :math:`i \in [0, \min\{d_{1}, d_{2}\} - 1]` + where :math:`d_{1}, d_{2}` are the dimensions of the matrix. + + .. note:: + When running on CUDA, ``row * col`` must be less than :math:`2^{59}` to + prevent overflow during calculation. + + Args: + row (``int``): number of rows in the 2-D matrix. + col (``int``): number of columns in the 2-D matrix. + offset (``int``): diagonal offset from the main diagonal. + Default: if not provided, 0. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, ``torch.long``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + layout (:class:`torch.layout`, optional): currently only support ``torch.strided``. + + Example:: + + >>> a = torch.triu_indices(3, 3) + >>> a + tensor([[0, 0, 0, 1, 1, 2], + [0, 1, 2, 1, 2, 2]]) + + >>> a = torch.triu_indices(4, 3, -1) + >>> a + tensor([[0, 0, 0, 1, 1, 1, 2, 2, 3], + [0, 1, 2, 0, 1, 2, 1, 2, 2]]) + + >>> a = torch.triu_indices(4, 3, 1) + >>> a + tensor([[0, 0, 1], + [1, 2, 2]]) + """ + ... +def true_divide(input: Union[Tensor, Number], other: Union[Tensor, Number], *, out: Optional[Tensor] = None) -> Tensor: + r""" + true_divide(dividend, divisor, *, out) -> Tensor + + Alias for :func:`torch.div` with ``rounding_mode=None``. + """ + ... +def trunc(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + trunc(input, *, out=None) -> Tensor + + Returns a new tensor with the truncated integer values of + the elements of :attr:`input`. + + For integer inputs, follows the array-api convention of returning a + copy of the input tensor. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 3.4742, 0.5466, -0.8008, -0.9079]) + >>> torch.trunc(a) + tensor([ 3., 0., -0., -0.]) + """ + ... +def trunc_(input: Tensor) -> Tensor: ... +@overload +def unbind(input: Tensor, dim: _int = 0) -> Tuple[Tensor, ...]: + r""" + unbind(input, dim=0) -> seq + + Removes a tensor dimension. + + Returns a tuple of all slices along a given dimension, already without it. + + Arguments: + input (Tensor): the tensor to unbind + dim (int): dimension to remove + + Example:: + + >>> torch.unbind(torch.tensor([[1, 2, 3], + >>> [4, 5, 6], + >>> [7, 8, 9]])) + (tensor([1, 2, 3]), tensor([4, 5, 6]), tensor([7, 8, 9])) + """ + ... +@overload +def unbind(input: Tensor, dim: Union[str, ellipsis, None]) -> Tuple[Tensor, ...]: + r""" + unbind(input, dim=0) -> seq + + Removes a tensor dimension. + + Returns a tuple of all slices along a given dimension, already without it. + + Arguments: + input (Tensor): the tensor to unbind + dim (int): dimension to remove + + Example:: + + >>> torch.unbind(torch.tensor([[1, 2, 3], + >>> [4, 5, 6], + >>> [7, 8, 9]])) + (tensor([1, 2, 3]), tensor([4, 5, 6]), tensor([7, 8, 9])) + """ + ... 
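+# A minimal illustrative sketch of torch.unbind documented above: the returned
+# slices, restacked along the same dimension with torch.stack, reproduce the input.
+#
+#     >>> t = torch.tensor([[1, 2, 3], [4, 5, 6]])
+#     >>> rows = torch.unbind(t, dim=0)
+#     >>> rows
+#     (tensor([1, 2, 3]), tensor([4, 5, 6]))
+#     >>> torch.equal(torch.stack(rows, dim=0), t)
+#     True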
+def unbind_copy(input: Tensor, dim: _int = 0, *, out: Union[Tuple[Tensor, ...], List[Tensor], None] = None) -> None:
+    r"""
+    Performs the same operation as :func:`torch.unbind`, but all output tensors
+    are freshly created instead of aliasing the input.
+    """
+    ...
+@overload
+def unflatten(input: Tensor, dim: Union[str, ellipsis, None], sizes: Sequence[Union[_int, SymInt]], names: Sequence[Union[str, ellipsis, None]]) -> Tensor:
+    r"""
+    unflatten(input, dim, sizes) -> Tensor
+
+    Expands a dimension of the input tensor over multiple dimensions.
+
+    .. seealso::
+
+        :func:`torch.flatten` is the inverse of this function. It coalesces several dimensions into one.
+
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): Dimension to be unflattened, specified as an index into
+            ``input.shape``.
+        sizes (Tuple[int]): New shape of the unflattened dimension.
+            One of its elements can be `-1` in which case the corresponding output
+            dimension is inferred. Otherwise, the product of ``sizes`` *must*
+            equal ``input.shape[dim]``.
+
+    Returns:
+        A View of input with the specified dimension unflattened.
+
+    Examples::
+        >>> torch.unflatten(torch.randn(3, 4, 1), 1, (2, 2)).shape
+        torch.Size([3, 2, 2, 1])
+        >>> torch.unflatten(torch.randn(3, 4, 1), 1, (-1, 2)).shape
+        torch.Size([3, 2, 2, 1])
+        >>> torch.unflatten(torch.randn(5, 12, 3), -2, (2, 2, 3, 1, 1)).shape
+        torch.Size([5, 2, 2, 3, 1, 1, 3])
+    """
+    ...
+@overload
+def unflatten(input: Tensor, dim: _int, sizes: Sequence[Union[_int, SymInt]]) -> Tensor:
+    r"""
+    unflatten(input, dim, sizes) -> Tensor
+
+    Expands a dimension of the input tensor over multiple dimensions.
+
+    .. seealso::
+
+        :func:`torch.flatten` is the inverse of this function. It coalesces several dimensions into one.
+
+    Args:
+        input (Tensor): the input tensor.
+        dim (int): Dimension to be unflattened, specified as an index into
+            ``input.shape``.
+        sizes (Tuple[int]): New shape of the unflattened dimension.
+            One of its elements can be `-1` in which case the corresponding output
+            dimension is inferred. Otherwise, the product of ``sizes`` *must*
+            equal ``input.shape[dim]``.
+
+    Returns:
+        A View of input with the specified dimension unflattened.
+
+    Examples::
+        >>> torch.unflatten(torch.randn(3, 4, 1), 1, (2, 2)).shape
+        torch.Size([3, 2, 2, 1])
+        >>> torch.unflatten(torch.randn(3, 4, 1), 1, (-1, 2)).shape
+        torch.Size([3, 2, 2, 1])
+        >>> torch.unflatten(torch.randn(5, 12, 3), -2, (2, 2, 3, 1, 1)).shape
+        torch.Size([5, 2, 2, 3, 1, 1, 3])
+    """
+    ...
+def unfold_copy(input: Tensor, dimension: _int, size: _int, step: _int, *, out: Optional[Tensor] = None) -> Tensor:
+    r"""
+    Performs the same operation as :func:`torch.unfold`, but all output tensors
+    are freshly created instead of aliasing the input.
+    """
+    ...
+def unique_dim(input: Tensor, dim: _int, sorted: _bool = True, return_inverse: _bool = False, return_counts: _bool = False) -> Tuple[Tensor, Tensor, Tensor]: ...
+def unsafe_chunk(input: Tensor, chunks: _int, dim: _int = 0) -> Tuple[Tensor, ...]:
+    r"""
+    unsafe_chunk(input, chunks, dim=0) -> List of Tensors
+
+    Works like :func:`torch.chunk` but without enforcing the autograd restrictions
+    on inplace modification of the outputs.
+
+    .. warning::
+        This function is safe to use as long as only the input, or only the outputs
+        are modified inplace after calling this function. It is the user's
+        responsibility to ensure that is the case. If both the input and one or more
+        of the outputs are modified inplace, gradients computed by autograd will be
+        silently incorrect.
+ """ + ... +def unsafe_split(input: Tensor, split_size: Union[_int, SymInt], dim: _int = 0) -> Tuple[Tensor, ...]: + r""" + unsafe_split(tensor, split_size_or_sections, dim=0) -> List of Tensors + + Works like :func:`torch.split` but without enforcing the autograd restrictions + on inplace modification of the outputs. + + .. warning:: + This function is safe to use as long as only the input, or only the outputs + are modified inplace after calling this function. It is user's + responsibility to ensure that is the case. If both the input and one or more + of the outputs are modified inplace, gradients computed by autograd will be + silently incorrect. + """ + ... +def unsafe_split_with_sizes(input: Tensor, split_sizes: Sequence[Union[_int, SymInt]], dim: _int = 0) -> Tuple[Tensor, ...]: ... +def unsqueeze(input: Tensor, dim: _int) -> Tensor: + r""" + unsqueeze(input, dim) -> Tensor + + Returns a new tensor with a dimension of size one inserted at the + specified position. + + The returned tensor shares the same underlying data with this tensor. + + A :attr:`dim` value within the range ``[-input.dim() - 1, input.dim() + 1)`` + can be used. Negative :attr:`dim` will correspond to :meth:`unsqueeze` + applied at :attr:`dim` = ``dim + input.dim() + 1``. + + Args: + input (Tensor): the input tensor. + dim (int): the index at which to insert the singleton dimension + + Example:: + + >>> x = torch.tensor([1, 2, 3, 4]) + >>> torch.unsqueeze(x, 0) + tensor([[ 1, 2, 3, 4]]) + >>> torch.unsqueeze(x, 1) + tensor([[ 1], + [ 2], + [ 3], + [ 4]]) + """ + ... +def unsqueeze_copy(input: Tensor, dim: _int, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.unsqueeze`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def values_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.values`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def vander(x: Tensor, N: Optional[_int] = None, increasing: _bool = False) -> Tensor: + r""" + vander(x, N=None, increasing=False) -> Tensor + + Generates a Vandermonde matrix. + + The columns of the output matrix are elementwise powers of the input vector :math:`x^{(N-1)}, x^{(N-2)}, ..., x^0`. + If increasing is True, the order of the columns is reversed :math:`x^0, x^1, ..., x^{(N-1)}`. Such a + matrix with a geometric progression in each row is named for Alexandre-Theophile Vandermonde. + + Arguments: + x (Tensor): 1-D input tensor. + N (int, optional): Number of columns in the output. If N is not specified, + a square array is returned :math:`(N = len(x))`. + increasing (bool, optional): Order of the powers of the columns. If True, + the powers increase from left to right, if False (the default) they are reversed. + + Returns: + Tensor: Vandermonde matrix. If increasing is False, the first column is :math:`x^{(N-1)}`, + the second :math:`x^{(N-2)}` and so forth. If increasing is True, the columns + are :math:`x^0, x^1, ..., x^{(N-1)}`. + + Example:: + + >>> x = torch.tensor([1, 2, 3, 5]) + >>> torch.vander(x) + tensor([[ 1, 1, 1, 1], + [ 8, 4, 2, 1], + [ 27, 9, 3, 1], + [125, 25, 5, 1]]) + >>> torch.vander(x, N=3) + tensor([[ 1, 1, 1], + [ 4, 2, 1], + [ 9, 3, 1], + [25, 5, 1]]) + >>> torch.vander(x, N=3, increasing=True) + tensor([[ 1, 1, 1], + [ 1, 2, 4], + [ 1, 3, 9], + [ 1, 5, 25]]) + """ + ... 
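+# A minimal illustrative sketch of torch.vander documented above: with
+# increasing=True, column j of the output is the elementwise power x**j, so the
+# matrix can be rebuilt by stacking those powers as columns.
+#
+#     >>> x = torch.tensor([1, 2, 3])
+#     >>> manual = torch.stack([x**0, x**1, x**2], dim=1)
+#     >>> torch.equal(manual, torch.vander(x, N=3, increasing=True))
+#     True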
+@overload +def var(input: Tensor, dim: Optional[Union[_int, _size]], unbiased: _bool = True, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + var(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor + + Calculates the variance over the dimensions specified by :attr:`dim`. :attr:`dim` + can be a single dimension, list of dimensions, or ``None`` to reduce over all + dimensions. + + The variance (:math:`\sigma^2`) is calculated as + + .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2 + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.var(a, dim=1, keepdim=True) + tensor([[1.0631], + [0.5590], + [1.4893], + [0.8258]]) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def var(input: Tensor, dim: Optional[Union[_int, _size]] = None, *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + var(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor + + Calculates the variance over the dimensions specified by :attr:`dim`. :attr:`dim` + can be a single dimension, list of dimensions, or ``None`` to reduce over all + dimensions. + + The variance (:math:`\sigma^2`) is calculated as + + .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2 + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. 
versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.var(a, dim=1, keepdim=True) + tensor([[1.0631], + [0.5590], + [1.4893], + [0.8258]]) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def var(input: Tensor, unbiased: _bool = True) -> Tensor: + r""" + var(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor + + Calculates the variance over the dimensions specified by :attr:`dim`. :attr:`dim` + can be a single dimension, list of dimensions, or ``None`` to reduce over all + dimensions. + + The variance (:math:`\sigma^2`) is calculated as + + .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2 + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.var(a, dim=1, keepdim=True) + tensor([[1.0631], + [0.5590], + [1.4893], + [0.8258]]) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def var(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + var(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor + + Calculates the variance over the dimensions specified by :attr:`dim`. :attr:`dim` + can be a single dimension, list of dimensions, or ``None`` to reduce over all + dimensions. + + The variance (:math:`\sigma^2`) is calculated as + + .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2 + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. 
+ + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.var(a, dim=1, keepdim=True) + tensor([[1.0631], + [0.5590], + [1.4893], + [0.8258]]) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def var(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], unbiased: _bool = True, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + var(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor + + Calculates the variance over the dimensions specified by :attr:`dim`. :attr:`dim` + can be a single dimension, list of dimensions, or ``None`` to reduce over all + dimensions. + + The variance (:math:`\sigma^2`) is calculated as + + .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2 + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.var(a, dim=1, keepdim=True) + tensor([[1.0631], + [0.5590], + [1.4893], + [0.8258]]) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... 
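+# A minimal illustrative sketch of the variance formula documented above: the
+# divisor is N - correction, so correction=1 (the default, Bessel's correction)
+# divides by N - 1 and correction=0 divides by N.
+#
+#     >>> x = torch.tensor([1.0, 2.0, 4.0])
+#     >>> torch.var(x, correction=1)
+#     tensor(2.3333)
+#     >>> ((x - x.mean()) ** 2).sum() / (x.numel() - 1)
+#     tensor(2.3333)
+#     >>> torch.var(x, correction=0)
+#     tensor(1.5556)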
+@overload +def var_mean(input: Tensor, dim: Optional[Union[_int, _size]], unbiased: _bool = True, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: + r""" + var_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor) + + Calculates the variance and mean over the dimensions specified by :attr:`dim`. + :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to + reduce over all dimensions. + + The variance (:math:`\sigma^2`) is calculated as + + .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2 + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Returns: + A tuple (var, mean) containing the variance and mean. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.var_mean(a, dim=0, keepdim=True) + (tensor([[1.5926, 1.0056, 1.2005, 0.3646]]), + tensor([[ 0.0645, 0.4485, 0.8707, -0.0665]])) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def var_mean(input: Tensor, dim: Optional[Union[_int, _size]] = None, *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: + r""" + var_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor) + + Calculates the variance and mean over the dimensions specified by :attr:`dim`. + :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to + reduce over all dimensions. + + The variance (:math:`\sigma^2`) is calculated as + + .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2 + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. 
+ + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Returns: + A tuple (var, mean) containing the variance and mean. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.var_mean(a, dim=0, keepdim=True) + (tensor([[1.5926, 1.0056, 1.2005, 0.3646]]), + tensor([[ 0.0645, 0.4485, 0.8707, -0.0665]])) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def var_mean(input: Tensor, unbiased: _bool = True) -> Tuple[Tensor, Tensor]: + r""" + var_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor) + + Calculates the variance and mean over the dimensions specified by :attr:`dim`. + :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to + reduce over all dimensions. + + The variance (:math:`\sigma^2`) is calculated as + + .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2 + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Returns: + A tuple (var, mean) containing the variance and mean. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.var_mean(a, dim=0, keepdim=True) + (tensor([[1.5926, 1.0056, 1.2005, 0.3646]]), + tensor([[ 0.0645, 0.4485, 0.8707, -0.0665]])) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def var_mean(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: + r""" + var_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor) + + Calculates the variance and mean over the dimensions specified by :attr:`dim`. 
+ :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to + reduce over all dimensions. + + The variance (:math:`\sigma^2`) is calculated as + + .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2 + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Returns: + A tuple (var, mean) containing the variance and mean. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.var_mean(a, dim=0, keepdim=True) + (tensor([[1.5926, 1.0056, 1.2005, 0.3646]]), + tensor([[ 0.0645, 0.4485, 0.8707, -0.0665]])) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def var_mean(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], unbiased: _bool = True, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: + r""" + var_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor) + + Calculates the variance and mean over the dimensions specified by :attr:`dim`. + :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to + reduce over all dimensions. + + The variance (:math:`\sigma^2`) is calculated as + + .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2 + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. 
+ out (Tensor, optional): the output tensor. + + Returns: + A tuple (var, mean) containing the variance and mean. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.var_mean(a, dim=0, keepdim=True) + (tensor([[1.5926, 1.0056, 1.2005, 0.3646]]), + tensor([[ 0.0645, 0.4485, 0.8707, -0.0665]])) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +def vdot(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + vdot(input, other, *, out=None) -> Tensor + + Computes the dot product of two 1D vectors along a dimension. + + In symbols, this function computes + + .. math:: + + \sum_{i=1}^n \overline{x_i}y_i. + + where :math:`\overline{x_i}` denotes the conjugate for complex + vectors, and it is the identity for real vectors. + + .. note:: + + Unlike NumPy's vdot, torch.vdot intentionally only supports computing the dot product + of two 1D tensors with the same number of elements. + + .. seealso:: + + :func:`torch.linalg.vecdot` computes the dot product of two batches of vectors along a dimension. + + Args: + input (Tensor): first tensor in the dot product, must be 1D. Its conjugate is used if it's complex. + other (Tensor): second tensor in the dot product, must be 1D. + + Keyword args: + + .. note:: out (Tensor, optional): the output tensor. + + + Example:: + + >>> torch.vdot(torch.tensor([2, 3]), torch.tensor([2, 1])) + tensor(7) + >>> a = torch.tensor((1 +2j, 3 - 1j)) + >>> b = torch.tensor((2 +1j, 4 - 0j)) + >>> torch.vdot(a, b) + tensor([16.+1.j]) + >>> torch.vdot(b, a) + tensor([16.-1.j]) + """ + ... +def view_as_complex(input: Tensor) -> Tensor: + r""" + view_as_complex(input) -> Tensor + + Returns a view of :attr:`input` as a complex tensor. For an input complex + tensor of :attr:`size` :math:`m1, m2, \dots, mi, 2`, this function returns a + new complex tensor of :attr:`size` :math:`m1, m2, \dots, mi` where the last + dimension of the input tensor is expected to represent the real and imaginary + components of complex numbers. + + .. warning:: + :func:`view_as_complex` is only supported for tensors with + :class:`torch.dtype` ``torch.float64`` and ``torch.float32``. The input is + expected to have the last dimension of :attr:`size` 2. In addition, the + tensor must have a `stride` of 1 for its last dimension. The strides of all + other dimensions must be even numbers. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> x=torch.randn(4, 2) + >>> x + tensor([[ 1.6116, -0.5772], + [-1.4606, -0.9120], + [ 0.0786, -1.7497], + [-0.6561, -1.6623]]) + >>> torch.view_as_complex(x) + tensor([(1.6116-0.5772j), (-1.4606-0.9120j), (0.0786-1.7497j), (-0.6561-1.6623j)]) + """ + ... +def view_as_complex_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.view_as_complex`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def view_as_real(input: Tensor) -> Tensor: + r""" + view_as_real(input) -> Tensor + + Returns a view of :attr:`input` as a real tensor. For an input complex tensor of + :attr:`size` :math:`m1, m2, \dots, mi`, this function returns a new + real tensor of size :math:`m1, m2, \dots, mi, 2`, where the last dimension of size 2 + represents the real and imaginary components of complex numbers. + + .. 
warning:: + :func:`view_as_real` is only supported for tensors with ``complex dtypes``. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> x=torch.randn(4, dtype=torch.cfloat) + >>> x + tensor([(0.4737-0.3839j), (-0.2098-0.6699j), (0.3470-0.9451j), (-0.5174-1.3136j)]) + >>> torch.view_as_real(x) + tensor([[ 0.4737, -0.3839], + [-0.2098, -0.6699], + [ 0.3470, -0.9451], + [-0.5174, -1.3136]]) + """ + ... +def view_as_real_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.view_as_real`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +@overload +def view_copy(input: Tensor, dtype: _dtype, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.view`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +@overload +def view_copy(input: Tensor, size: Sequence[Union[_int, SymInt]], *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.view`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +@overload +def vsplit(input: Tensor, sections: _int) -> Tuple[Tensor, ...]: + r""" + vsplit(input, indices_or_sections) -> List of Tensors + + Splits :attr:`input`, a tensor with two or more dimensions, into multiple tensors + vertically according to :attr:`indices_or_sections`. Each split is a view of + :attr:`input`. + + This is equivalent to calling torch.tensor_split(input, indices_or_sections, dim=0) + (the split dimension is 0), except that if :attr:`indices_or_sections` is an integer + it must evenly divide the split dimension or a runtime error will be thrown. + + This function is based on NumPy's :func:`numpy.vsplit`. + + Args: + input (Tensor): tensor to split. + indices_or_sections (int or list or tuple of ints): See argument in :func:`torch.tensor_split`. + + Example:: + >>> t = torch.arange(16.0).reshape(4,4) + >>> t + tensor([[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.], + [ 8., 9., 10., 11.], + [12., 13., 14., 15.]]) + >>> torch.vsplit(t, 2) + (tensor([[0., 1., 2., 3.], + [4., 5., 6., 7.]]), + tensor([[ 8., 9., 10., 11.], + [12., 13., 14., 15.]])) + >>> torch.vsplit(t, [3, 6]) + (tensor([[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.], + [ 8., 9., 10., 11.]]), + tensor([[12., 13., 14., 15.]]), + tensor([], size=(0, 4))) + """ + ... +@overload +def vsplit(input: Tensor, indices: _size) -> Tuple[Tensor, ...]: + r""" + vsplit(input, indices_or_sections) -> List of Tensors + + Splits :attr:`input`, a tensor with two or more dimensions, into multiple tensors + vertically according to :attr:`indices_or_sections`. Each split is a view of + :attr:`input`. + + This is equivalent to calling torch.tensor_split(input, indices_or_sections, dim=0) + (the split dimension is 0), except that if :attr:`indices_or_sections` is an integer + it must evenly divide the split dimension or a runtime error will be thrown. + + This function is based on NumPy's :func:`numpy.vsplit`. + + Args: + input (Tensor): tensor to split. + indices_or_sections (int or list or tuple of ints): See argument in :func:`torch.tensor_split`. 
+ + Example:: + >>> t = torch.arange(16.0).reshape(4,4) + >>> t + tensor([[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.], + [ 8., 9., 10., 11.], + [12., 13., 14., 15.]]) + >>> torch.vsplit(t, 2) + (tensor([[0., 1., 2., 3.], + [4., 5., 6., 7.]]), + tensor([[ 8., 9., 10., 11.], + [12., 13., 14., 15.]])) + >>> torch.vsplit(t, [3, 6]) + (tensor([[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.], + [ 8., 9., 10., 11.]]), + tensor([[12., 13., 14., 15.]]), + tensor([], size=(0, 4))) + """ + ... +def vstack(tensors: Union[Tuple[Tensor, ...], List[Tensor]], *, out: Optional[Tensor] = None) -> Tensor: + r""" + vstack(tensors, *, out=None) -> Tensor + + Stack tensors in sequence vertically (row wise). + + This is equivalent to concatenation along the first axis after all 1-D tensors have been reshaped by :func:`torch.atleast_2d`. + + Args: + tensors (sequence of Tensors): sequence of tensors to concatenate + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([1, 2, 3]) + >>> b = torch.tensor([4, 5, 6]) + >>> torch.vstack((a,b)) + tensor([[1, 2, 3], + [4, 5, 6]]) + >>> a = torch.tensor([[1],[2],[3]]) + >>> b = torch.tensor([[4],[5],[6]]) + >>> torch.vstack((a,b)) + tensor([[1], + [2], + [3], + [4], + [5], + [6]]) + """ + ... +@overload +def where(condition: Tensor) -> Tuple[Tensor, ...]: + r""" + where(condition, input, other, *, out=None) -> Tensor + + Return a tensor of elements selected from either :attr:`input` or :attr:`other`, depending on :attr:`condition`. + + The operation is defined as: + + .. math:: + \text{out}_i = \begin{cases} + \text{input}_i & \text{if } \text{condition}_i \\ + \text{other}_i & \text{otherwise} \\ + \end{cases} + + .. note:: + The tensors :attr:`condition`, :attr:`input`, :attr:`other` must be :ref:`broadcastable `. + + Arguments: + condition (BoolTensor): When True (nonzero), yield input, otherwise yield other + input (Tensor or Scalar): value (if :attr:`input` is a scalar) or values selected at indices + where :attr:`condition` is ``True`` + other (Tensor or Scalar): value (if :attr:`other` is a scalar) or values selected at indices + where :attr:`condition` is ``False`` + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + Tensor: A tensor of shape equal to the broadcasted shape of :attr:`condition`, :attr:`input`, :attr:`other` + + Example:: + + >>> x = torch.randn(3, 2) + >>> y = torch.ones(3, 2) + >>> x + tensor([[-0.4620, 0.3139], + [ 0.3898, -0.7197], + [ 0.0478, -0.1657]]) + >>> torch.where(x > 0, 1.0, 0.0) + tensor([[0., 1.], + [1., 0.], + [1., 0.]]) + >>> torch.where(x > 0, x, y) + tensor([[ 1.0000, 0.3139], + [ 0.3898, 1.0000], + [ 0.0478, 1.0000]]) + >>> x = torch.randn(2, 2, dtype=torch.double) + >>> x + tensor([[ 1.0779, 0.0383], + [-0.8785, -1.1089]], dtype=torch.float64) + >>> torch.where(x > 0, x, 0.) + tensor([[1.0779, 0.0383], + [0.0000, 0.0000]], dtype=torch.float64) + + .. function:: where(condition) -> tuple of LongTensor + :noindex: + + ``torch.where(condition)`` is identical to + ``torch.nonzero(condition, as_tuple=True)``. + + .. note:: + See also :func:`torch.nonzero`. + """ + ... +@overload +def where(condition: Tensor, input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + where(condition, input, other, *, out=None) -> Tensor + + Return a tensor of elements selected from either :attr:`input` or :attr:`other`, depending on :attr:`condition`. + + The operation is defined as: + + .. 
math:: + \text{out}_i = \begin{cases} + \text{input}_i & \text{if } \text{condition}_i \\ + \text{other}_i & \text{otherwise} \\ + \end{cases} + + .. note:: + The tensors :attr:`condition`, :attr:`input`, :attr:`other` must be :ref:`broadcastable `. + + Arguments: + condition (BoolTensor): When True (nonzero), yield input, otherwise yield other + input (Tensor or Scalar): value (if :attr:`input` is a scalar) or values selected at indices + where :attr:`condition` is ``True`` + other (Tensor or Scalar): value (if :attr:`other` is a scalar) or values selected at indices + where :attr:`condition` is ``False`` + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + Tensor: A tensor of shape equal to the broadcasted shape of :attr:`condition`, :attr:`input`, :attr:`other` + + Example:: + + >>> x = torch.randn(3, 2) + >>> y = torch.ones(3, 2) + >>> x + tensor([[-0.4620, 0.3139], + [ 0.3898, -0.7197], + [ 0.0478, -0.1657]]) + >>> torch.where(x > 0, 1.0, 0.0) + tensor([[0., 1.], + [1., 0.], + [1., 0.]]) + >>> torch.where(x > 0, x, y) + tensor([[ 1.0000, 0.3139], + [ 0.3898, 1.0000], + [ 0.0478, 1.0000]]) + >>> x = torch.randn(2, 2, dtype=torch.double) + >>> x + tensor([[ 1.0779, 0.0383], + [-0.8785, -1.1089]], dtype=torch.float64) + >>> torch.where(x > 0, x, 0.) + tensor([[1.0779, 0.0383], + [0.0000, 0.0000]], dtype=torch.float64) + + .. function:: where(condition) -> tuple of LongTensor + :noindex: + + ``torch.where(condition)`` is identical to + ``torch.nonzero(condition, as_tuple=True)``. + + .. note:: + See also :func:`torch.nonzero`. + """ + ... +@overload +def where(condition: Tensor, self: Union[Number, _complex], other: Tensor) -> Tensor: + r""" + where(condition, input, other, *, out=None) -> Tensor + + Return a tensor of elements selected from either :attr:`input` or :attr:`other`, depending on :attr:`condition`. + + The operation is defined as: + + .. math:: + \text{out}_i = \begin{cases} + \text{input}_i & \text{if } \text{condition}_i \\ + \text{other}_i & \text{otherwise} \\ + \end{cases} + + .. note:: + The tensors :attr:`condition`, :attr:`input`, :attr:`other` must be :ref:`broadcastable `. + + Arguments: + condition (BoolTensor): When True (nonzero), yield input, otherwise yield other + input (Tensor or Scalar): value (if :attr:`input` is a scalar) or values selected at indices + where :attr:`condition` is ``True`` + other (Tensor or Scalar): value (if :attr:`other` is a scalar) or values selected at indices + where :attr:`condition` is ``False`` + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + Tensor: A tensor of shape equal to the broadcasted shape of :attr:`condition`, :attr:`input`, :attr:`other` + + Example:: + + >>> x = torch.randn(3, 2) + >>> y = torch.ones(3, 2) + >>> x + tensor([[-0.4620, 0.3139], + [ 0.3898, -0.7197], + [ 0.0478, -0.1657]]) + >>> torch.where(x > 0, 1.0, 0.0) + tensor([[0., 1.], + [1., 0.], + [1., 0.]]) + >>> torch.where(x > 0, x, y) + tensor([[ 1.0000, 0.3139], + [ 0.3898, 1.0000], + [ 0.0478, 1.0000]]) + >>> x = torch.randn(2, 2, dtype=torch.double) + >>> x + tensor([[ 1.0779, 0.0383], + [-0.8785, -1.1089]], dtype=torch.float64) + >>> torch.where(x > 0, x, 0.) + tensor([[1.0779, 0.0383], + [0.0000, 0.0000]], dtype=torch.float64) + + .. function:: where(condition) -> tuple of LongTensor + :noindex: + + ``torch.where(condition)`` is identical to + ``torch.nonzero(condition, as_tuple=True)``. + + .. note:: + See also :func:`torch.nonzero`. + """ + ... 
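+# A minimal illustrative sketch of the two torch.where forms documented above:
+# the three-argument form selects elementwise between two values, while the
+# single-argument form is equivalent to torch.nonzero(condition, as_tuple=True).
+#
+#     >>> cond = torch.tensor([True, False, True])
+#     >>> out = torch.where(cond, torch.tensor([1, 2, 3]), torch.tensor([10, 20, 30]))
+#     >>> torch.equal(out, torch.tensor([1, 20, 3]))
+#     True
+#     >>> torch.where(cond)
+#     (tensor([0, 2]),)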
+@overload +def where(condition: Tensor, input: Tensor, other: Union[Number, _complex]) -> Tensor: + r""" + where(condition, input, other, *, out=None) -> Tensor + + Return a tensor of elements selected from either :attr:`input` or :attr:`other`, depending on :attr:`condition`. + + The operation is defined as: + + .. math:: + \text{out}_i = \begin{cases} + \text{input}_i & \text{if } \text{condition}_i \\ + \text{other}_i & \text{otherwise} \\ + \end{cases} + + .. note:: + The tensors :attr:`condition`, :attr:`input`, :attr:`other` must be :ref:`broadcastable `. + + Arguments: + condition (BoolTensor): When True (nonzero), yield input, otherwise yield other + input (Tensor or Scalar): value (if :attr:`input` is a scalar) or values selected at indices + where :attr:`condition` is ``True`` + other (Tensor or Scalar): value (if :attr:`other` is a scalar) or values selected at indices + where :attr:`condition` is ``False`` + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + Tensor: A tensor of shape equal to the broadcasted shape of :attr:`condition`, :attr:`input`, :attr:`other` + + Example:: + + >>> x = torch.randn(3, 2) + >>> y = torch.ones(3, 2) + >>> x + tensor([[-0.4620, 0.3139], + [ 0.3898, -0.7197], + [ 0.0478, -0.1657]]) + >>> torch.where(x > 0, 1.0, 0.0) + tensor([[0., 1.], + [1., 0.], + [1., 0.]]) + >>> torch.where(x > 0, x, y) + tensor([[ 1.0000, 0.3139], + [ 0.3898, 1.0000], + [ 0.0478, 1.0000]]) + >>> x = torch.randn(2, 2, dtype=torch.double) + >>> x + tensor([[ 1.0779, 0.0383], + [-0.8785, -1.1089]], dtype=torch.float64) + >>> torch.where(x > 0, x, 0.) + tensor([[1.0779, 0.0383], + [0.0000, 0.0000]], dtype=torch.float64) + + .. function:: where(condition) -> tuple of LongTensor + :noindex: + + ``torch.where(condition)`` is identical to + ``torch.nonzero(condition, as_tuple=True)``. + + .. note:: + See also :func:`torch.nonzero`. + """ + ... +@overload +def where(condition: Tensor, self: Union[Number, _complex], other: Union[Number, _complex]) -> Tensor: + r""" + where(condition, input, other, *, out=None) -> Tensor + + Return a tensor of elements selected from either :attr:`input` or :attr:`other`, depending on :attr:`condition`. + + The operation is defined as: + + .. math:: + \text{out}_i = \begin{cases} + \text{input}_i & \text{if } \text{condition}_i \\ + \text{other}_i & \text{otherwise} \\ + \end{cases} + + .. note:: + The tensors :attr:`condition`, :attr:`input`, :attr:`other` must be :ref:`broadcastable `. + + Arguments: + condition (BoolTensor): When True (nonzero), yield input, otherwise yield other + input (Tensor or Scalar): value (if :attr:`input` is a scalar) or values selected at indices + where :attr:`condition` is ``True`` + other (Tensor or Scalar): value (if :attr:`other` is a scalar) or values selected at indices + where :attr:`condition` is ``False`` + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + Tensor: A tensor of shape equal to the broadcasted shape of :attr:`condition`, :attr:`input`, :attr:`other` + + Example:: + + >>> x = torch.randn(3, 2) + >>> y = torch.ones(3, 2) + >>> x + tensor([[-0.4620, 0.3139], + [ 0.3898, -0.7197], + [ 0.0478, -0.1657]]) + >>> torch.where(x > 0, 1.0, 0.0) + tensor([[0., 1.], + [1., 0.], + [1., 0.]]) + >>> torch.where(x > 0, x, y) + tensor([[ 1.0000, 0.3139], + [ 0.3898, 1.0000], + [ 0.0478, 1.0000]]) + >>> x = torch.randn(2, 2, dtype=torch.double) + >>> x + tensor([[ 1.0779, 0.0383], + [-0.8785, -1.1089]], dtype=torch.float64) + >>> torch.where(x > 0, x, 0.) 
+ tensor([[1.0779, 0.0383], + [0.0000, 0.0000]], dtype=torch.float64) + + .. function:: where(condition) -> tuple of LongTensor + :noindex: + + ``torch.where(condition)`` is identical to + ``torch.nonzero(condition, as_tuple=True)``. + + .. note:: + See also :func:`torch.nonzero`. + """ + ... +@overload +def xlogy(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + xlogy(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.special.xlogy`. + """ + ... +@overload +def xlogy(self: Union[Number, _complex], other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + xlogy(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.special.xlogy`. + """ + ... +@overload +def xlogy(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + xlogy(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.special.xlogy`. + """ + ... +@overload +def xlogy_(input: Tensor, other: Tensor) -> Tensor: ... +@overload +def xlogy_(input: Tensor, other: Union[Number, _complex]) -> Tensor: ... +def zero_(input: Tensor) -> Tensor: ... +@overload +def zeros(size: Sequence[Union[_int, SymInt]], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + zeros(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with the scalar value `0`, with the shape defined + by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.zeros(2, 3) + tensor([[ 0., 0., 0.], + [ 0., 0., 0.]]) + + >>> torch.zeros(5) + tensor([ 0., 0., 0., 0., 0.]) + """ + ... +@overload +def zeros(*size: _int, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + zeros(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with the scalar value `0`, with the shape defined + by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + out (Tensor, optional): the output tensor. 
+ dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.zeros(2, 3) + tensor([[ 0., 0., 0.], + [ 0., 0., 0.]]) + + >>> torch.zeros(5) + tensor([ 0., 0., 0., 0., 0.]) + """ + ... +@overload +def zeros(size: _size, *, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + zeros(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with the scalar value `0`, with the shape defined + by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.zeros(2, 3) + tensor([[ 0., 0., 0.], + [ 0., 0., 0.]]) + + >>> torch.zeros(5) + tensor([ 0., 0., 0., 0., 0.]) + """ + ... +@overload +def zeros(*size: _int, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + zeros(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with the scalar value `0`, with the shape defined + by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. 
+ device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.zeros(2, 3) + tensor([[ 0., 0., 0.], + [ 0., 0., 0.]]) + + >>> torch.zeros(5) + tensor([ 0., 0., 0., 0., 0.]) + """ + ... +def zeros_like(input: Tensor, *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + zeros_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor + + Returns a tensor filled with the scalar value `0`, with the same size as + :attr:`input`. ``torch.zeros_like(input)`` is equivalent to + ``torch.zeros(input.size(), dtype=input.dtype, layout=input.layout, device=input.device)``. + + .. warning:: + As of 0.4, this function does not support an :attr:`out` keyword. As an alternative, + the old ``torch.zeros_like(input, out=output)`` is equivalent to + ``torch.zeros(input.size(), out=output)``. + + Args: + input (Tensor): the size of :attr:`input` will determine size of the output tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor. + Default: if ``None``, defaults to the dtype of :attr:`input`. + layout (:class:`torch.layout`, optional): the desired layout of returned tensor. + Default: if ``None``, defaults to the layout of :attr:`input`. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, defaults to the device of :attr:`input`. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + + Example:: + + >>> input = torch.empty(2, 3) + >>> torch.zeros_like(input) + tensor([[ 0., 0., 0.], + [ 0., 0., 0.]]) + """ + ... 
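
A small doctest-style sketch tying the ``zeros`` overloads above together with ``zeros_like``: the variadic and sequence forms of ``zeros`` are interchangeable, and ``zeros_like`` takes its dtype and device from the input unless they are overridden. Nothing here depends on random state:

    >>> import torch
    >>> torch.equal(torch.zeros(2, 3), torch.zeros((2, 3)))    # variadic ints vs. a single size sequence
    True
    >>> x = torch.empty(2, 3, dtype=torch.float64)
    >>> torch.zeros_like(x).dtype                              # inherited from the input tensor
    torch.float64
    >>> torch.zeros_like(x, dtype=torch.int32).dtype           # explicitly overridden
    torch.int32
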
+ +__all__ = ['__and__', '__lshift__', '__or__', '__rshift__', '__xor__', '_adaptive_avg_pool2d', + '_adaptive_avg_pool3d', '_add_batch_dim', '_add_relu', '_add_relu_', '_addmm_activation', + '_aminmax', '_amp_foreach_non_finite_check_and_unscale_', '_amp_update_scale_', '_assert_async', + '_assert_scalar', '_assert_tensor_metadata', '_batch_norm_impl_index', '_cast_Byte', '_cast_Char', + '_cast_Double', '_cast_Float', '_cast_Half', '_cast_Int', '_cast_Long', '_cast_Short', + '_choose_qparams_per_tensor', '_chunk_cat', '_coalesce', '_compute_linear_combination', '_conj', + '_conj_copy', '_conj_physical', '_convert_indices_from_coo_to_csr', + '_convert_indices_from_csr_to_coo', '_convert_weight_to_int4pack', '_convolution', + '_convolution_mode', '_copy_from', '_copy_from_and_resize', '_cslt_compress', '_cslt_sparse_mm', + '_cslt_sparse_mm_search', '_ctc_loss', '_cudnn_ctc_loss', '_cudnn_init_dropout_state', + '_cudnn_rnn', '_cudnn_rnn_flatten_weight', '_cufft_clear_plan_cache', + '_cufft_get_plan_cache_max_size', '_cufft_get_plan_cache_size', '_cufft_set_plan_cache_max_size', + '_cummax_helper', '_cummin_helper', '_debug_has_internal_overlap', '_dim_arange', + '_dirichlet_grad', '_disable_functionalization', '_efficientzerotensor', '_embedding_bag', + '_embedding_bag_forward_only', '_empty_affine_quantized', '_empty_per_channel_affine_quantized', + '_enable_functionalization', '_euclidean_dist', '_fake_quantize_learnable_per_channel_affine', + '_fake_quantize_learnable_per_tensor_affine', + '_fake_quantize_per_tensor_affine_cachemask_tensor_qparams', + '_fake_quantize_per_tensor_affine_cachemask_tensor_qparams', '_fft_c2c', '_fft_c2r', '_fft_r2c', + '_fill_mem_eff_dropout_mask_', '_foobar', '_foreach_abs', '_foreach_abs_', '_foreach_acos', + '_foreach_acos_', '_foreach_add', '_foreach_add_', '_foreach_addcdiv', '_foreach_addcdiv_', + '_foreach_addcmul', '_foreach_addcmul_', '_foreach_asin', '_foreach_asin_', '_foreach_atan', + '_foreach_atan_', '_foreach_ceil', '_foreach_ceil_', '_foreach_clamp_max', '_foreach_clamp_max_', + '_foreach_clamp_min', '_foreach_clamp_min_', '_foreach_copy_', '_foreach_cos', '_foreach_cos_', + '_foreach_cosh', '_foreach_cosh_', '_foreach_div', '_foreach_div_', '_foreach_erf', + '_foreach_erf_', '_foreach_erfc', '_foreach_erfc_', '_foreach_exp', '_foreach_exp_', + '_foreach_expm1', '_foreach_expm1_', '_foreach_floor', '_foreach_floor_', '_foreach_frac', + '_foreach_frac_', '_foreach_lerp', '_foreach_lerp_', '_foreach_lgamma', '_foreach_lgamma_', + '_foreach_log', '_foreach_log10', '_foreach_log10_', '_foreach_log1p', '_foreach_log1p_', + '_foreach_log2', '_foreach_log2_', '_foreach_log_', '_foreach_maximum', '_foreach_maximum_', + '_foreach_minimum', '_foreach_minimum_', '_foreach_mul', '_foreach_mul_', '_foreach_neg', + '_foreach_neg_', '_foreach_norm', '_foreach_pow', '_foreach_pow_', '_foreach_reciprocal', + '_foreach_reciprocal_', '_foreach_round', '_foreach_round_', '_foreach_sigmoid', + '_foreach_sigmoid_', '_foreach_sign', '_foreach_sign_', '_foreach_sin', '_foreach_sin_', + '_foreach_sinh', '_foreach_sinh_', '_foreach_sqrt', '_foreach_sqrt_', '_foreach_sub', + '_foreach_sub_', '_foreach_tan', '_foreach_tan_', '_foreach_tanh', '_foreach_tanh_', + '_foreach_trunc', '_foreach_trunc_', '_foreach_zero_', '_from_functional_tensor', + '_functional_assert_async', '_functional_assert_scalar', '_functional_sym_constrain_range', + '_functional_sym_constrain_range_for_size', + '_functionalize_are_all_mutations_hidden_from_autograd', + 
'_functionalize_are_all_mutations_under_no_grad_or_inference_mode', '_functionalize_commit_update', + '_functionalize_mark_mutation_hidden_from_autograd', '_functionalize_replace', + '_functionalize_sync', '_fused_adam_', '_fused_adamw_', '_fused_dropout', + '_fused_moving_avg_obs_fq_helper', '_fused_moving_avg_obs_fq_helper', '_fused_sdp_choice', + '_fused_sgd_', '_fw_primal_copy', '_grid_sampler_2d_cpu_fallback', + '_has_compatible_shallow_copy_type', '_histogramdd_bin_edges', '_histogramdd_from_bin_cts', + '_histogramdd_from_bin_tensors', '_index_put_impl_', '_indices_copy', '_int_mm', '_is_all_true', + '_is_any_true', '_is_functional_tensor', '_is_zerotensor', '_lazy_clone', '_linalg_check_errors', + '_linalg_det', '_linalg_det', '_linalg_eigh', '_linalg_eigh', '_linalg_slogdet', '_linalg_slogdet', + '_linalg_solve_ex', '_linalg_solve_ex', '_linalg_svd', '_linalg_svd', '_log_softmax', + '_log_softmax_backward_data', '_logcumsumexp', '_lstm_mps', '_lu_with_info', '_lu_with_info', + '_make_dep_token', '_make_dual', '_make_dual_copy', '_make_per_channel_quantized_tensor', + '_make_per_tensor_quantized_tensor', '_masked_scale', '_masked_softmax', '_mixed_dtypes_linear', + '_mkldnn_reshape', '_mkldnn_transpose', '_mkldnn_transpose_', '_mps_convolution', + '_mps_convolution_transpose', '_native_batch_norm_legit', '_native_batch_norm_legit_no_training', + '_native_multi_head_attention', '_neg_view', '_neg_view_copy', '_nested_from_padded', + '_nested_from_padded_and_nested_example', '_nested_get_jagged_dummy', '_nested_get_lengths', + '_nested_get_offsets', '_nested_get_ragged_idx', '_nested_get_values', '_nested_get_values_copy', + '_nested_tensor_from_mask', '_nested_tensor_from_mask_left_aligned', + '_nested_tensor_from_tensor_list', '_nested_tensor_softmax_with_shape', '_nested_view_from_buffer', + '_nested_view_from_buffer_copy', '_nested_view_from_jagged', '_nested_view_from_jagged_copy', + '_nnpack_available', '_nnpack_spatial_convolution', '_pack_padded_sequence', + '_pad_packed_sequence', '_pin_memory', '_prelu_kernel', '_print', '_propagate_xla_data', + '_remove_batch_dim', '_reshape_alias_copy', '_reshape_from_tensor', '_resize_output_', + '_rowwise_prune', '_sample_dirichlet', '_saturate_weight_to_fp16', + '_scaled_dot_product_attention_math', '_scaled_dot_product_cudnn_attention', + '_scaled_dot_product_cudnn_attention', '_scaled_dot_product_efficient_attention', + '_scaled_dot_product_efficient_attention', '_scaled_dot_product_flash_attention', + '_scaled_dot_product_flash_attention', '_scaled_dot_product_flash_attention_for_cpu', + '_scaled_dot_product_flash_attention_for_cpu', '_scaled_mm', '_shape_as_tensor', + '_sobol_engine_draw', '_sobol_engine_ff_', '_sobol_engine_initialize_state_', + '_sobol_engine_scramble_', '_softmax', '_softmax_backward_data', '_sparse_broadcast_to', + '_sparse_broadcast_to_copy', '_sparse_csr_prod', '_sparse_csr_sum', + '_sparse_log_softmax_backward_data', '_sparse_semi_structured_linear', + '_sparse_softmax_backward_data', '_sparse_sparse_matmul', '_sparse_sum', '_stack', + '_standard_gamma', '_standard_gamma_grad', '_sync', '_test_autograd_multiple_dispatch', + '_test_autograd_multiple_dispatch_view', '_test_autograd_multiple_dispatch_view_copy', + '_test_check_tensor', '_test_functorch_fallback', '_test_parallel_materialize', + '_test_serialization_subcmul', '_to_cpu', '_to_functional_tensor', '_to_sparse_semi_structured', + '_transform_bias_rescale_qkv', '_transformer_encoder_layer_fwd', '_trilinear', + '_triton_multi_head_attention', 
'_triton_scaled_dot_attention', '_unique', '_unique2', + '_unpack_dual', '_unpack_dual', '_unsafe_index', '_unsafe_index_put', '_use_cudnn_ctc_loss', + '_use_cudnn_rnn_flatten_weight', '_validate_compressed_sparse_indices', + '_validate_sparse_bsc_tensor_args', '_validate_sparse_bsr_tensor_args', + '_validate_sparse_compressed_tensor_args', '_validate_sparse_coo_tensor_args', + '_validate_sparse_csc_tensor_args', '_validate_sparse_csr_tensor_args', '_values_copy', + '_weight_int4pack_mm', '_weight_int8pack_mm', '_weight_norm', '_weight_norm_interface', 'abs', + 'abs_', 'absolute', 'acos', 'acos_', 'acosh', 'acosh_', 'adaptive_avg_pool1d', + 'adaptive_max_pool1d', 'add', 'addbmm', 'addcdiv', 'addcmul', 'addmm', 'addmv', 'addmv_', 'addr', + 'adjoint', 'affine_grid_generator', 'alias_copy', 'all', 'allclose', 'alpha_dropout', + 'alpha_dropout_', 'amax', 'amin', 'aminmax', 'aminmax', 'angle', 'any', 'arange', 'arccos', + 'arccos_', 'arccosh', 'arccosh_', 'arcsin', 'arcsin_', 'arcsinh', 'arcsinh_', 'arctan', 'arctan2', + 'arctan_', 'arctanh', 'arctanh_', 'argmax', 'argmin', 'argsort', 'argwhere', 'as_strided', + 'as_strided_', 'as_strided_copy', 'as_strided_scatter', 'as_tensor', 'asarray', 'asin', 'asin_', + 'asinh', 'asinh_', 'atan', 'atan2', 'atan_', 'atanh', 'atanh_', 'avg_pool1d', 'baddbmm', + 'bartlett_window', 'batch_norm', 'batch_norm_backward_elemt', 'batch_norm_backward_reduce', + 'batch_norm_elemt', 'batch_norm_gather_stats', 'batch_norm_gather_stats_with_counts', + 'batch_norm_stats', 'batch_norm_update_stats', 'bernoulli', 'bilinear', + 'binary_cross_entropy_with_logits', 'bincount', 'binomial', 'bitwise_and', 'bitwise_left_shift', + 'bitwise_not', 'bitwise_or', 'bitwise_right_shift', 'bitwise_xor', 'blackman_window', 'bmm', + 'broadcast_to', 'bucketize', 'can_cast', 'cat', 'ccol_indices_copy', 'ceil', 'ceil_', 'celu', + 'celu_', 'channel_shuffle', 'cholesky', 'cholesky_inverse', 'cholesky_solve', + 'choose_qparams_optimized', 'chunk', 'clamp', 'clamp_', 'clamp_max', 'clamp_max_', 'clamp_min', + 'clamp_min_', 'clip', 'clip_', 'clone', 'col_indices_copy', 'column_stack', 'combinations', + 'complex', 'concat', 'concatenate', 'conj', 'conj_physical', 'conj_physical_', 'constant_pad_nd', + 'conv1d', 'conv2d', 'conv3d', 'conv_tbc', 'conv_transpose1d', 'conv_transpose2d', + 'conv_transpose3d', 'convolution', 'copysign', 'corrcoef', 'cos', 'cos_', 'cosh', 'cosh_', + 'cosine_embedding_loss', 'cosine_similarity', 'count_nonzero', 'cov', 'cross', 'crow_indices_copy', + 'ctc_loss', 'cudnn_affine_grid_generator', 'cudnn_batch_norm', 'cudnn_convolution', + 'cudnn_convolution_add_relu', 'cudnn_convolution_relu', 'cudnn_convolution_transpose', + 'cudnn_grid_sampler', 'cudnn_is_acceptable', 'cummax', 'cummax', 'cummin', 'cummin', 'cumprod', + 'cumsum', 'cumulative_trapezoid', 'deg2rad', 'deg2rad_', 'dequantize', 'det', 'detach', 'detach_', + 'detach_copy', 'diag', 'diag_embed', 'diagflat', 'diagonal', 'diagonal_copy', 'diagonal_scatter', + 'diff', 'digamma', 'dist', 'div', 'divide', 'dot', 'dropout', 'dropout_', 'dsmm', 'dsplit', + 'dstack', 'embedding', 'embedding_bag', 'embedding_renorm_', 'empty', 'empty_like', + 'empty_permuted', 'empty_quantized', 'empty_strided', 'eq', 'equal', 'erf', 'erf_', 'erfc', + 'erfc_', 'erfinv', 'exp', 'exp2', 'exp2_', 'exp_', 'expand_copy', 'expm1', 'expm1_', 'eye', + 'fake_quantize_per_channel_affine', 'fake_quantize_per_tensor_affine', 'fbgemm_linear_fp16_weight', + 'fbgemm_linear_fp16_weight_fp32_activation', 'fbgemm_linear_int8_weight', + 
'fbgemm_linear_int8_weight_fp32_activation', 'fbgemm_linear_quantize_weight', + 'fbgemm_pack_gemm_matrix_fp16', 'fbgemm_pack_quantized_matrix', 'feature_alpha_dropout', + 'feature_alpha_dropout_', 'feature_dropout', 'feature_dropout_', 'fill', 'fill_', 'fix', 'fix_', + 'flatten', 'flip', 'fliplr', 'flipud', 'float_power', 'floor', 'floor_', 'floor_divide', 'fmax', + 'fmin', 'fmod', 'frac', 'frac_', 'frexp', 'frexp', 'frobenius_norm', 'from_file', 'from_numpy', + 'frombuffer', 'full', 'full_like', 'fused_moving_avg_obs_fake_quant', 'gather', 'gcd', 'gcd_', + 'ge', 'geqrf', 'geqrf', 'ger', 'get_default_dtype', 'get_num_interop_threads', 'get_num_threads', + 'gradient', 'greater', 'greater_equal', 'grid_sampler', 'grid_sampler_2d', 'grid_sampler_3d', + 'group_norm', 'gru', 'gru_cell', 'gt', 'hamming_window', 'hann_window', 'hardshrink', 'heaviside', + 'hinge_embedding_loss', 'histc', 'histogram', 'histogram', 'histogramdd', 'histogramdd', 'hsmm', + 'hsplit', 'hspmm', 'hstack', 'hypot', 'i0', 'i0_', 'igamma', 'igammac', 'imag', 'index_add', + 'index_copy', 'index_fill', 'index_put', 'index_put_', 'index_reduce', 'index_select', + 'indices_copy', 'init_num_threads', 'inner', 'instance_norm', 'int_repr', 'inverse', 'is_complex', + 'is_conj', 'is_distributed', 'is_floating_point', 'is_grad_enabled', 'is_inference', + 'is_inference_mode_enabled', 'is_neg', 'is_nonzero', 'is_same_size', 'is_signed', + 'is_vulkan_available', 'isclose', 'isfinite', 'isin', 'isinf', 'isnan', 'isneginf', 'isposinf', + 'isreal', 'istft', 'kaiser_window', 'kl_div', 'kron', 'kthvalue', 'kthvalue', 'layer_norm', 'lcm', + 'lcm_', 'ldexp', 'ldexp_', 'le', 'lerp', 'less', 'less_equal', 'lgamma', 'linspace', 'log', + 'log10', 'log10_', 'log1p', 'log1p_', 'log2', 'log2_', 'log_', 'log_softmax', 'logaddexp', + 'logaddexp2', 'logcumsumexp', 'logdet', 'logical_and', 'logical_not', 'logical_or', 'logical_xor', + 'logit', 'logit_', 'logspace', 'logsumexp', 'lstm', 'lstm_cell', 'lt', 'lu_solve', 'lu_unpack', + 'lu_unpack', 'margin_ranking_loss', 'masked_fill', 'masked_scatter', 'masked_select', 'matmul', + 'matrix_exp', 'matrix_power', 'max', 'max', 'max_pool1d', 'max_pool1d_with_indices', 'max_pool2d', + 'max_pool3d', 'maximum', 'mean', 'median', 'median', 'min', 'min', 'minimum', 'miopen_batch_norm', + 'miopen_convolution', 'miopen_convolution_add_relu', 'miopen_convolution_relu', + 'miopen_convolution_transpose', 'miopen_depthwise_convolution', 'miopen_rnn', + 'mkldnn_adaptive_avg_pool2d', 'mkldnn_convolution', 'mkldnn_linear_backward_weights', + 'mkldnn_max_pool2d', 'mkldnn_max_pool3d', 'mkldnn_rnn_layer', 'mm', 'mode', 'mode', 'moveaxis', + 'movedim', 'msort', 'mul', 'multinomial', 'multiply', 'mv', 'mvlgamma', 'nan_to_num', + 'nan_to_num_', 'nanmean', 'nanmedian', 'nanmedian', 'nanquantile', 'nansum', 'narrow', + 'narrow_copy', 'native_batch_norm', 'native_channel_shuffle', 'native_dropout', + 'native_group_norm', 'native_layer_norm', 'native_norm', 'ne', 'neg', 'neg_', 'negative', + 'negative_', 'nextafter', 'nonzero', 'nonzero_static', 'norm_except_dim', 'normal', 'not_equal', + 'nuclear_norm', 'numel', 'ones', 'ones_like', 'orgqr', 'ormqr', 'outer', 'pairwise_distance', + 'pdist', 'permute', 'permute_copy', 'pinverse', 'pixel_shuffle', 'pixel_unshuffle', 'poisson', + 'poisson_nll_loss', 'polar', 'polygamma', 'positive', 'pow', 'prelu', 'prod', 'promote_types', + 'put', 'q_per_channel_axis', 'q_per_channel_scales', 'q_per_channel_zero_points', 'q_scale', + 'q_zero_point', 'qr', 'qr', 'quantile', 'quantize_per_channel', 
'quantize_per_tensor', + 'quantize_per_tensor_dynamic', 'quantized_batch_norm', 'quantized_gru_cell', 'quantized_lstm_cell', + 'quantized_max_pool1d', 'quantized_max_pool2d', 'quantized_max_pool3d', 'quantized_rnn_relu_cell', + 'quantized_rnn_tanh_cell', 'rad2deg', 'rad2deg_', 'rand', 'rand_like', 'randint', 'randint_like', + 'randn', 'randn_like', 'randperm', 'range', 'ravel', 'real', 'reciprocal', 'reciprocal_', 'relu', + 'relu_', 'remainder', 'renorm', 'repeat_interleave', 'reshape', 'resize_as_', 'resize_as_sparse_', + 'resolve_conj', 'resolve_neg', 'result_type', 'rnn_relu', 'rnn_relu_cell', 'rnn_tanh', + 'rnn_tanh_cell', 'roll', 'rot90', 'round', 'round_', 'row_indices_copy', 'row_stack', 'rrelu', + 'rrelu_', 'rsqrt', 'rsqrt_', 'rsub', 'saddmm', 'scalar_tensor', 'scatter', 'scatter_add', + 'scatter_reduce', 'searchsorted', 'segment_reduce', 'select', 'select_copy', 'select_scatter', + 'selu', 'selu_', 'set_flush_denormal', 'set_num_interop_threads', 'set_num_threads', 'sgn', + 'sigmoid', 'sigmoid_', 'sign', 'signbit', 'sin', 'sin_', 'sinc', 'sinc_', 'sinh', 'sinh_', + 'slice_copy', 'slice_inverse', 'slice_scatter', 'slogdet', 'slogdet', 'smm', 'softmax', 'sort', + 'sort', 'sparse_bsc_tensor', 'sparse_bsr_tensor', 'sparse_compressed_tensor', 'sparse_coo_tensor', + 'sparse_csc_tensor', 'sparse_csr_tensor', 'split_copy', 'split_with_sizes', + 'split_with_sizes_copy', 'spmm', 'sqrt', 'sqrt_', 'square', 'square_', 'squeeze', 'squeeze_copy', + 'sspaddmm', 'stack', 'std', 'std_mean', 'sub', 'subtract', 'sum', 'svd', 'svd', 'swapaxes', + 'swapdims', 'sym_constrain_range', 'sym_constrain_range_for_size', 't', 't_copy', 'take', + 'take_along_dim', 'tan', 'tan_', 'tanh', 'tanh_', 'tensor', 'tensor_split', 'threshold', + 'threshold_', 'tile', 'topk', 'topk', 'trace', 'transpose', 'transpose_copy', 'trapezoid', 'trapz', + 'triangular_solve', 'triangular_solve', 'tril', 'tril_indices', 'triplet_margin_loss', 'triu', + 'triu_indices', 'true_divide', 'trunc', 'trunc_', 'unbind', 'unbind_copy', 'unflatten', + 'unfold_copy', 'unique_dim', 'unsafe_chunk', 'unsafe_split', 'unsafe_split_with_sizes', + 'unsqueeze', 'unsqueeze_copy', 'values_copy', 'vander', 'var', 'var_mean', 'vdot', + 'view_as_complex', 'view_as_complex_copy', 'view_as_real', 'view_as_real_copy', 'view_copy', + 'vsplit', 'vstack', 'where', 'xlogy', 'xlogy_', 'zero_', 'zeros', 'zeros_like'] diff --git a/MLPY/Lib/site-packages/torch/_C/__init__.pyi b/MLPY/Lib/site-packages/torch/_C/__init__.pyi new file mode 100644 index 0000000000000000000000000000000000000000..ae27eb407aa24f500f6c21b09b51d5418bc2a12c --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_C/__init__.pyi @@ -0,0 +1,10976 @@ +# @generated from torch/_C/__init__.pyi.in +# mypy: disable-error-code="type-arg" + +import builtins +from enum import Enum, IntEnum +from pathlib import Path +from typing import ( + Any, + AnyStr, + BinaryIO, + Callable, + ContextManager, + Dict, + Generic, + Iterable, + Iterator, + List, + Literal, + NamedTuple, + Optional, + Protocol, + Sequence, + Set, + SupportsIndex, + Tuple, + Type, + TypeVar, + Union, + overload, + runtime_checkable, +) +from typing_extensions import ParamSpec + +import torch +from torch import inf, SymInt, Tensor +from torch.autograd.graph import Node as _Node +from torch.package import PackageExporter +from torch.storage import UntypedStorage, TypedStorage +from torch.types import ( + _bool, + _complex, + _device, + _dispatchkey, + _dtype, + _float, + _int, + _layout, + _qscheme, + _size, + Device, + Number, + Storage, +) + +from 
torch._prims_common import DeviceLikeType + +# This module is defined in torch/csrc/Module.cpp + +from . import _functorch, _lazy, _lazy_ts_backend, _nn, _onnx, _VariableFunctions, _cpu, _aoti, _verbose + +K = TypeVar("K") +T = TypeVar("T") +S = TypeVar("S", bound="torch.Tensor") +P = ParamSpec("P") +ReturnVal = TypeVar("ReturnVal", covariant=True) # return value (always covariant) +_T_co = TypeVar("_T_co", covariant=True) + + +@runtime_checkable +class _NestedSequence(Protocol[_T_co]): + """A protocol for representing nested sequences. + + References:: + `numpy._typing._NestedSequence` + + """ + + def __len__(self, /) -> builtins.int: ... + def __getitem__(self, index: builtins.int, /) -> _T_co | _NestedSequence[_T_co]: ... + def __contains__(self, x: builtins.object, /) -> builtins.bool: ... + def __iter__(self, /) -> Iterator[_T_co | _NestedSequence[_T_co]]: ... + def __reversed__(self, /) -> Iterator[_T_co | _NestedSequence[_T_co]]: ... + def count(self, value: Any, /) -> builtins.int: ... + def index(self, value: Any, /) -> builtins.int: ... + + +# Defined in torch/csrc/Device.cpp +class device: + type: str # THPDevice_type + index: _int # THPDevice_index + + def __get__(self, instance, owner=None) -> device: ... + + # THPDevice_pynew + @overload + def __init__(self, device: DeviceLikeType) -> None: ... + @overload + def __init__(self, type: str, index: _int) -> None: ... + + # Uncomment if we ever make torch.device a decorator + # def __call__(self, func: T) -> T: ... + + def __enter__(self) -> device: ... + def __exit__(self, exc_type, exc_val, exc_tb) -> None: ... + def __reduce__(self) -> Tuple[Any, ...]: ... # THPDevice_reduce + +# Defined in torch/csrc/Stream.cpp +class Stream: + stream_id: _int # Stream id + device_index: _int + device_type: _int + + device: device # The device of the stream + +# Defined in torch/csrc/Size.cpp +class Size(Tuple[_int, ...]): + # TODO: __reduce__ + + @overload # type: ignore[override] + def __getitem__(self: Size, key: _int) -> _int: ... + @overload + def __getitem__(self: Size, key: slice) -> Size: ... + def numel(self: Size) -> _int: ... + +# Defined in torch/csrc/Dtype.cpp +class dtype: + # TODO: __reduce__ + is_floating_point: _bool + is_complex: _bool + is_signed: _bool + itemsize: _int + def to_real(self) -> dtype: ... + def to_complex(self) -> dtype: ... + +# Defined in torch/csrc/TypeInfo.cpp +class iinfo: + bits: _int + min: _int + max: _int + dtype: str + + def __init__(self, dtype: _dtype) -> None: ... + +class finfo: + bits: _int + min: _float + max: _float + eps: _float + tiny: _float + smallest_normal: _float + resolution: _float + dtype: str + + @overload + def __init__(self, dtype: _dtype) -> None: ... + @overload + def __init__(self) -> None: ... + +float32: dtype = ... +float: dtype = ... +float64: dtype = ... +double: dtype = ... +float16: dtype = ... +bfloat16: dtype = ... +float8_e4m3fn: dtype = ... +float8_e4m3fnuz: dtype = ... +float8_e5m2: dtype = ... +float8_e5m2fnuz: dtype = ... +half: dtype = ... +uint8: dtype = ... +uint16: dtype = ... +uint32: dtype = ... +uint64: dtype = ... +int8: dtype = ... +int16: dtype = ... +short: dtype = ... +int32: dtype = ... +int: dtype = ... +int64: dtype = ... +long: dtype = ... +complex32: dtype = ... +complex64: dtype = ... +chalf: dtype = ... +cfloat: dtype = ... +complex128: dtype = ... +cdouble: dtype = ... +quint8: dtype = ... +qint8: dtype = ... +qint32: dtype = ... +bool: dtype = ... +quint4x2: dtype = ... +quint2x4: dtype = ... +bits1x8: dtype = ... +bits2x4: dtype = ... 
+bits4x2: dtype = ... +bits8: dtype = ... +bits16: dtype = ... + +# Defined in torch/csrc/Layout.cpp +class layout: ... + +# Defined in torch/csrc/utils/disable_torch_function.cpp +def DisableTorchFunction(): ... +def DisableTorchFunctionSubclass(): ... + +# Defined in torch/csrc/utils/tensor_layouts.cpp +strided: layout = ... +sparse_coo: layout = ... +sparse_csr: layout = ... +sparse_csc: layout = ... +sparse_bsr: layout = ... +sparse_bsc: layout = ... +_mkldnn: layout = ... +jagged: layout = ... + +# Defined in torch/csrc/MemoryFormat.cpp +class memory_format: ... + +# Defined in torch/csrc/utils/tensor_memoryformats.cpp +contiguous_format: memory_format = ... +channels_last: memory_format = ... +channels_last_3d: memory_format = ... +preserve_format: memory_format = ... + +# Defined in torch/csrc/QScheme.cpp +class qscheme: ... + +# Defined in torch/csrc/utils/tensor_qschemes.h +per_tensor_affine: qscheme = ... +per_channel_affine: qscheme = ... +per_tensor_symmetric: qscheme = ... +per_channel_symmetric: qscheme = ... +per_channel_affine_float_qparams: qscheme = ... + +# Defined in torch/csrc/autograd/python_function.cpp +class _FunctionBase: + saved_tensors: Tuple[Tensor] + _raw_saved_tensors: Tuple[Any] + next_functions: Tuple[Tuple[Any, _int], ...] + needs_input_grad: Tuple[_bool] + metadata: dict + _materialize_non_diff_grads: _bool + # skip adding type hints for the fields that have wrappers defined + # in torch/autograd/function.py + +# Defined in torch/csrc/autograd/python_legacy_variable.cpp +class _LegacyVariableBase(Tensor): # inherits from Tensor to appease mypy + def __init__( + self, + data: Optional[Tensor] = ..., + requires_grad: Optional[_bool] = ..., + volatile: Optional[_bool] = ..., + _grad_fn: Optional[_FunctionBase] = ..., + ) -> None: ... + +# Defined in torch/csrc/jit/python/init.cpp +class IODescriptor: ... +class JITException: ... + +class Future(Generic[T]): + def __init__(self, devices: List[device]) -> None: ... + def done(self) -> _bool: ... + def value(self) -> T: ... + def wait(self) -> T: ... + def add_done_callback(self, callback: Callable) -> None: ... + def then(self, callback: Callable) -> Future[T]: ... + def set_result(self, result: T) -> None: ... + def _set_unwrap_func(self, callback: Callable) -> None: ... + +class _Await: + def __init__(self) -> None: ... + def fn(self) -> Callable: ... + def args(self) -> Tuple[Any, ...]: ... + def is_nowait(self) -> _bool: ... + +def _jit_set_num_profiled_runs(num: _size) -> _size: ... + +# Defined in torch/csrc/jit/passes/mobile_optimizer_type.h +class _MobileOptimizerType: ... + +CONV_BN_FUSION: _MobileOptimizerType +INSERT_FOLD_PREPACK_OPS: _MobileOptimizerType +REMOVE_DROPOUT: _MobileOptimizerType +FUSE_ADD_RELU: _MobileOptimizerType +HOIST_CONV_PACKED_PARAMS: _MobileOptimizerType +VULKAN_AUTOMATIC_GPU_TRANSFER: _MobileOptimizerType + +def fork(*args: Any, **kwargs: Any) -> Future: ... +def wait(fut: Future) -> Any: ... +def _awaitable(*args: Any, **kwargs: Any) -> _Await: ... +def _awaitable_wait(aw: _Await) -> Any: ... +def _awaitable_nowait(x: Any) -> _Await: ... +def _collect_all(futures: List[Future]) -> Future: ... +def _set_print_stack_traces_on_fatal_signal(print: _bool) -> None: ... +def unify_type_list(types: List[JitType]) -> JitType: ... +def _freeze_module( + module: ScriptModule, + preserved_attrs: List[str] = [], + freeze_interfaces: _bool = True, + preserveParameters: _bool = True, +) -> ScriptModule: ... 
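
The dtype constants and the ``iinfo``/``finfo`` classes declared earlier in this stub are commonly combined for numeric-range introspection; a brief doctest-style sketch (the printed values are the standard limits PyTorch reports for these types):

    >>> import torch
    >>> torch.finfo(torch.float32).eps       # machine epsilon of float32
    1.1920928955078125e-07
    >>> torch.iinfo(torch.int16).max         # largest representable int16
    32767
    >>> torch.float32.is_floating_point, torch.int64.is_floating_point
    (True, False)
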
+def _jit_pass_optimize_frozen_graph(Graph, optimize_numerics: _bool = True) -> None: ... +def _jit_pass_optimize_for_inference( + module: torch.jit.ScriptModule, + other_methods: List[str] = [], +) -> None: ... +def _jit_pass_fold_frozen_conv_bn(graph: Graph): ... +def _jit_pass_fold_frozen_conv_add_or_sub(graph: Graph): ... +def _jit_pass_fold_frozen_conv_mul_or_div(graph: Graph): ... +def _jit_pass_fuse_frozen_conv_add_relu(graph: Graph): ... +def _jit_pass_concat_frozen_linear(graph: Graph): ... +def _jit_pass_convert_frozen_ops_to_mkldnn(graph: Graph): ... +def _jit_pass_transpose_frozen_linear(graph: Graph): ... +def _jit_pass_remove_dropout(module: torch.jit.ScriptModule): ... +def _is_tracing() -> _bool: ... +def _jit_init() -> _bool: ... +def _jit_flatten(arg: Any) -> Tuple[List[Tensor], IODescriptor]: ... +def _jit_unflatten(vars: List[Tensor], desc: IODescriptor) -> Any: ... +def _jit_get_operation(op_name: str) -> Tuple[Callable, List[str]]: ... +def _get_operation_overload( + op_name: str, + op_overload_name: str, +) -> Tuple[Callable, Callable, List[Any]]: ... +def _get_schema(op_name: str, overload_name: str) -> FunctionSchema: ... +def _jit_pass_optimize_for_mobile( + module: torch.jit.ScriptModule, + optimization_blocklist: Set[_MobileOptimizerType], + preserved_methods: List[AnyStr], +) -> torch.jit.ScriptModule: ... +def _clone_module_with_class( + module: torch.jit.ScriptModule, + ignored_methods: List[AnyStr], + ignored_attributes: List[AnyStr], +) -> torch.jit.ScriptModule: ... +def _jit_pass_vulkan_optimize_for_mobile( + module: torch.jit.ScriptModule, + optimization_blocklist: Set[_MobileOptimizerType], + preserved_methods: List[AnyStr], +) -> torch.jit.ScriptModule: ... +def _jit_pass_metal_optimize_for_mobile( + module: torch.jit.ScriptModule, + preserved_methods: List[AnyStr], +) -> torch.jit.ScriptModule: ... +def _jit_pass_inline(Graph) -> None: ... +def _jit_pass_constant_propagation(Graph) -> None: ... +def _jit_pass_propagate_shapes_on_graph(Graph) -> None: ... +def _jit_register_decomposition_for_schema(schema: FunctionSchema, Graph) -> None: ... +def _jit_erase_non_input_shape_information(Graph) -> None: ... +def _jit_get_schemas_for_operator(name: str) -> List[FunctionSchema]: ... +def _jit_get_all_schemas() -> List[FunctionSchema]: ... +def _jit_check_alias_annotation( + g: Graph, + args: Tuple[Any, ...], + unqualified_op_name: str, +): ... +def _jit_can_fuse_on_cpu() -> _bool: ... +def _jit_can_fuse_on_gpu() -> _bool: ... +def _jit_can_fuse_on_cpu_legacy() -> _bool: ... +def _debug_get_fusion_group_inlining() -> _bool: ... +def _debug_set_fusion_group_inlining(enable: _bool): ... +def _jit_texpr_fuser_enabled() -> _bool: ... +def _jit_nvfuser_enabled() -> _bool: ... +def _jit_llga_enabled() -> _bool: ... +def _jit_set_llga_enabled(enable: _bool): ... +def _llvm_enabled() -> _bool: ... +def _jit_override_can_fuse_on_cpu(override: _bool): ... +def _jit_override_can_fuse_on_gpu(override: _bool): ... +def _jit_override_can_fuse_on_cpu_legacy(override: _bool): ... +def _jit_set_symbolic_shapes_test_mode(override: _bool): ... +def _jit_symbolic_shapes_test_mode_enabled() -> _bool: ... +def _jit_set_texpr_fuser_enabled(enable: _bool): ... +def _jit_set_te_must_use_llvm_cpu(use_llvm: _bool): ... +def _jit_set_nvfuser_enabled(enable: _bool) -> _bool: ... +def _jit_cat_wo_conditionals(optimize_cat: _bool): ... +def _jit_opt_conditionals(opt_conds: _bool): ... +def _jit_pass_canonicalize(graph: Graph, keep_unique_names: _bool = True): ... 
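
``_freeze_module`` and the ``_jit_pass_optimize_*`` passes above are internal bindings; they are normally reached through the public ``torch.jit.freeze`` and ``torch.jit.optimize_for_inference`` wrappers. A minimal sketch of that public path (``Net`` is a placeholder module; the module must be in eval mode before freezing):

    >>> import torch
    >>> class Net(torch.nn.Module):
    ...     def forward(self, x):
    ...         return torch.relu(x) + 1
    ...
    >>> scripted = torch.jit.script(Net().eval())
    >>> frozen = torch.jit.freeze(scripted)                  # folds parameters/attributes into the graph
    >>> optimized = torch.jit.optimize_for_inference(frozen)
    >>> optimized(torch.randn(4)).shape
    torch.Size([4])
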
+def _jit_pass_erase_shape_information(graph: Graph): ... +def _jit_pass_fold_convbn(module: torch.jit.ScriptModule): ... +def _jit_pass_insert_observers( + module: torch.jit.ScriptModule, + method_name: str, + qconfig_dict: Dict[str, Any], + inplace: _bool, + quant_type: _int, +): ... +def _jit_pass_insert_quant_dequant( + module: torch.jit.ScriptModule, + method_name: str, + inplace: _bool, + debug: _bool, + quant_type: _int, +): ... +def _jit_pass_insert_quant_dequant_for_ondevice_ptq( + module: torch.jit.ScriptModule, + method_name: str, + inplace: _bool, + debug: _bool, + quant_type: _int, +): ... +def _jit_pass_quant_finalize( + module: torch.jit.ScriptModule, + quant_type: _int, + preserved_attrs: Sequence[str], +): ... +def _jit_pass_quant_finalize_for_ondevice_ptq( + module: torch.jit.ScriptModule, + quant_type: _int, + method_name: str, +): ... +def _jit_pass_insert_observer_method_for_ondevice_ptq( + module: torch.jit.ScriptModule, + method_name: str, + qconfig_dict: Dict[str, Any], + inplace: _bool, + quant_type: _int, +): ... +def _jit_set_profiling_executor(profiling_flag: _bool) -> _bool: ... +def _jit_set_profiling_mode(profiling_flag: _bool) -> _bool: ... +def _jit_set_fusion_strategy( + strategy: List[Tuple[str, _int]], +) -> List[Tuple[str, _int]]: ... +def _jit_try_infer_type(obj: Any) -> InferredType: ... +def _jit_get_trigger_value(trigger_name: str) -> _int: ... + +# Defined in torch/csrc/jit/python/script_init.cpp +ResolutionCallback = Callable[[str], Callable[..., Any]] + +# Defined in torch/csrc/jit/python/script_init.cpp +# and torch/csrc/jit/python/init.cpp +def _create_function_from_graph(qualname: str, graph: Graph) -> ScriptFunction: ... +def _debug_set_autodiff_subgraph_inlining(disabled: _bool) -> None: ... +def _ivalue_tags_match(lhs: ScriptModule, rhs: ScriptModule) -> _bool: ... +def _jit_assert_is_instance(obj: Any, type: JitType): ... +def _jit_clear_class_registry() -> None: ... +def _jit_set_emit_hooks( + ModuleHook: Optional[Callable], + FunctionHook: Optional[Callable], +) -> None: ... +def _jit_get_emit_hooks() -> Tuple[Callable, Callable]: ... +def _load_for_lite_interpreter( + filename: Union[str, Path], + map_location: Optional[DeviceLikeType], +): ... +def _load_for_lite_interpreter_from_buffer( + buffer: BinaryIO, + map_location: Optional[DeviceLikeType], +): ... +def _export_operator_list(module: LiteScriptModule): ... +def _quantize_ondevice_ptq_dynamic(module: LiteScriptModule, method_name: str): ... +def _get_model_bytecode_version(filename: Union[str, Path]) -> _int: ... +def _get_model_bytecode_version_from_buffer(buffer: BinaryIO) -> _int: ... +def _backport_for_mobile( + filename_input: Union[str, Path], + filename_output: Union[str, Path], + to_version: _int, +) -> None: ... +def _backport_for_mobile_from_buffer( + buffer: BinaryIO, + filename_output: Union[str, Path], + to_version: _int, +) -> None: ... +def _backport_for_mobile_to_buffer( + filename_input: Union[str, Path], + to_version: _int, +) -> bytes: ... +def _backport_for_mobile_from_buffer_to_buffer( + buffer: BinaryIO, + to_version: _int, +) -> bytes: ... +def _get_model_ops_and_info(filename: Union[str, Path]): ... +def _get_model_ops_and_info_from_buffer(buffer: BinaryIO): ... +def _get_mobile_model_contained_types(filename: Union[str, Path]): ... +def _get_mobile_model_contained_types_from_buffer(buffer: BinaryIO): ... +def _logging_set_logger(logger: LoggerBase) -> LoggerBase: ... +def _get_graph_executor_optimize(optimize: Optional[_bool] = None) -> _bool: ... 
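
The profiling-executor and profiling-mode toggles declared above are internal knobs, mostly exercised by PyTorch's own tests; if they are used at all, the prior settings they return should be restored afterwards. An illustrative sketch built only on the signatures in this stub (a sketch of internal, unstable API, not a recommendation):

    >>> import torch
    >>> prev_exec = torch._C._jit_set_profiling_executor(True)   # returns the previous flag
    >>> prev_mode = torch._C._jit_set_profiling_mode(True)
    >>> # ... run or benchmark some scripted code here ...
    >>> _ = torch._C._jit_set_profiling_executor(prev_exec)      # restore the old settings
    >>> _ = torch._C._jit_set_profiling_mode(prev_mode)
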
+def _set_graph_executor_optimize(optimize: _bool): ... +def _export_opnames(module: ScriptModule) -> List[str]: ... +def _create_function_from_trace( + qualname: str, + func: Callable[..., Any], + input_tuple: Tuple[Any, ...], + var_lookup_fn: Callable[[Tensor], str], + strict: _bool, + force_outplace: _bool, + argument_names: List[str], +) -> Tuple[Graph, Stack]: ... +def _create_function_from_trace_with_dict( + qualname: str, + func: Callable[..., Any], + input_dict: Dict[str, Any], + var_lookup_fn: Callable[[Tensor], str], + strict: _bool, + force_outplace: _bool, + argument_names: List[str], +) -> Tuple[Graph, Stack]: ... +def _jit_is_script_object(obj: Any) -> _bool: ... +def _last_executed_optimized_graph() -> Graph: ... +def parse_type_comment(comment: str) -> Decl: ... +def _get_upgraders_map_size() -> _int: ... +def _get_upgraders_entry_map() -> Dict[str, str]: ... +def _dump_upgraders_map() -> Dict[str, str]: ... +def _test_only_populate_upgraders(content: Dict[str, str]) -> None: ... +def _test_only_remove_upgraders(content: Dict[str, str]) -> None: ... +def merge_type_from_type_comment( + decl: Decl, + type_annotation_decl: Decl, + is_method: _bool, +) -> Decl: ... +def parse_ir(input: str, parse_tensor_constants: _bool = False) -> Graph: ... +def parse_schema(schema: str) -> FunctionSchema: ... +def get_device(input: Tensor) -> _int: ... +def _resolve_type_from_object( + obj: Any, + range: SourceRange, + rcb: ResolutionCallback, +) -> JitType: ... +def _create_module_with_type(ty: JitType) -> ScriptModule: ... +def _create_object_with_type(ty: ClassType) -> ScriptObject: ... +def _run_emit_module_hook(m: ScriptModule): ... +def _replace_overloaded_method_decl( + overload_decl: Decl, + implementation_def: Def, + new_name: str, +) -> Def: ... +def _jit_pass_lower_all_tuples(graph: Graph) -> None: ... +def _jit_pass_onnx_set_dynamic_input_shape( + graph: Graph, + dynamic_axes: Dict[str, Dict[_int, str]], + input_names: List[str], +) -> None: ... +def _jit_pass_onnx_graph_shape_type_inference( + graph: Graph, + params_dict: Dict[str, IValue], + opset_version: _int, +) -> None: ... +def _jit_pass_onnx_assign_output_shape( + graph: Graph, + tensors: List[Tensor], + desc: IODescriptor, + onnx_shape_inference: _bool, + is_script: _bool, + opset_version: _int, +) -> None: ... +def _jit_pass_onnx_remove_inplace_ops_for_onnx( + graph: Graph, + module: Optional[ScriptModule] = None, +) -> None: ... +def _jit_pass_remove_inplace_ops(graph: Graph) -> None: ... +def _jit_pass_canonicalize_graph_fuser_ops(graph: Graph) -> None: ... +def _jit_pass_peephole( + graph: Graph, + disable_shape_peepholes: _bool = False, +) -> None: ... +def _jit_pass_onnx_autograd_function_process(graph: Graph) -> None: ... +def _jit_pass_fuse_addmm(graph: Graph) -> None: ... +def _jit_pass_onnx_preprocess(graph: Graph) -> None: ... +def _jit_pass_prepare_division_for_onnx(graph: Graph) -> None: ... +def _jit_pass_onnx_remove_print(graph: Graph) -> None: ... +def _jit_pass_onnx_preprocess_caffe2(graph: Graph) -> None: ... +def _jit_pass_onnx_unpack_quantized_weights( + graph: Graph, + paramsDict: Dict[str, IValue], + caffe2: _bool, +) -> Dict[str, IValue]: ... +def _jit_pass_onnx_quantization_insert_permutes( + graph: Graph, + paramsDict: Dict[str, IValue], +) -> Dict[str, IValue]: ... +def _jit_pass_custom_pattern_based_rewrite_graph( + pattern: str, + fused_node_name: str, + graph: Graph, +) -> None: ... +def _jit_onnx_list_model_parameters( + module: ScriptModule, +) -> Tuple[ScriptModule, List[IValue]]: ... 
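
The ``_create_function_from_trace*`` bindings above sit behind the public ``torch.jit.trace`` entry point; a minimal sketch of that public path (``f`` is a placeholder function and the shapes are arbitrary):

    >>> import torch
    >>> def f(x, y):
    ...     return x @ y + y
    ...
    >>> traced = torch.jit.trace(f, (torch.randn(3, 3), torch.randn(3, 3)))
    >>> isinstance(traced.graph, torch._C.Graph)     # the recorded IR; see the Graph class further below
    True
    >>> traced(torch.ones(3, 3), torch.ones(3, 3)).shape
    torch.Size([3, 3])
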
+def _jit_pass_erase_number_types(graph: Graph) -> None: ... +def _jit_pass_onnx_lint(graph: Graph) -> None: ... +def _jit_pass_onnx( + graph: Graph, + _jit_pass_onnx: _onnx.OperatorExportTypes, +) -> Graph: ... +def _jit_pass_onnx_scalar_type_analysis( + graph: Graph, + lowprecision_cast: _bool, + opset_version: _int, +) -> None: ... +def _jit_pass_onnx_peephole( + graph: Graph, + opset_version: _int, + fixed_batch_size: _bool, +) -> None: ... +def _jit_pass_dce_allow_deleting_nodes_with_side_effects(graph: Graph) -> None: ... +def _jit_pass_onnx_function_substitution(graph: Graph) -> None: ... +def _jit_pass_onnx_function_extraction( + graph: Graph, + module_names: Set[str], + param_names: List[str], +) -> Dict[Node, Dict[str, str]]: ... +def _jit_pass_onnx_clear_scope_records() -> None: ... +def _jit_pass_onnx_track_scope_attributes( + graph: Graph, + onnx_attrs: Dict[str, Any], +) -> None: ... +def _jit_is_onnx_log_enabled() -> _bool: ... +def _jit_set_onnx_log_enabled(enabled: _bool) -> None: ... +def _jit_set_onnx_log_output_stream(stream_name: str) -> None: ... +def _jit_onnx_log(*args: Any) -> None: ... +def _jit_pass_lower_graph(graph: Graph, m: Module) -> Tuple[Graph, List[IValue]]: ... +def _jit_pass_inline_fork_wait(graph: Graph) -> None: ... +def _jit_pass_onnx_deduplicate_initializers( + graph: Graph, + params_dict: Dict[str, IValue], + is_train: _bool, +) -> Dict[str, IValue]: ... +def _jit_pass_onnx_eval_peephole( + graph: Graph, + paramsDict: Dict[str, IValue], +) -> Dict[str, IValue]: ... +def _jit_pass_onnx_constant_fold( + graph: Graph, + paramsDict: Dict[str, IValue], + opset_version: _int, +) -> Dict[str, IValue]: ... +def _jit_pass_onnx_eliminate_unused_items( + graph: Graph, + paramsDict: Dict[str, IValue], +) -> Dict[str, IValue]: ... +def _jit_pass_onnx_cast_all_constant_to_floating(graph: Graph) -> None: ... +def _jit_pass_filter_non_tensor_arguments( + params: Dict[str, IValue], +) -> Dict[str, Tensor]: ... +def _jit_decay_packed_param_input_types(graph: Graph) -> None: ... +def _jit_pass_onnx_node_shape_type_inference( + n: Node, + paramsDict: Dict[str, IValue], + opset_version: _int, +) -> None: ... +def _jit_onnx_convert_pattern_from_subblock( + block: Block, + n: Node, + env: Dict[Value, Value], +) -> List[Value]: ... +def _jit_pass_onnx_block( + old_block: Block, + new_block: Block, + operator_export_type: _onnx.OperatorExportTypes, + env: Dict[Value, Value], + is_sub_block: _bool, +) -> Dict[Value, Value]: ... +def _jit_pass_onnx_assign_scoped_names_for_node_and_value(graph: Graph) -> None: ... +def _jit_pass_fixup_onnx_controlflow_node( + n: Node, + opset_version: _int, +) -> List[Value]: ... +def _jit_onnx_create_full_scope_name(class_name: str, variable_name: str) -> str: ... +def _compile_graph_to_code_table(name: str, graph: Graph) -> IValue: ... +def _generate_upgraders_graph() -> Dict[str, Graph]: ... +def _calculate_package_version_based_on_upgraders(val: _bool): ... +def _get_version_calculator_flag() -> _bool: ... +def _jit_script_interface_compile( + name: str, + class_def: ClassDef, + rcb: ResolutionCallback, + is_module: _bool, +): ... +def _jit_script_compile_overload( + qualname: str, + overload_decl: Decl, + implementation_def: Def, + rcb: ResolutionCallback, + implementation_defaults: Dict[str, Any], + signature: Any, +): ... +def _jit_script_compile( + qual_name: str, + definition: Def, + rcb: ResolutionCallback, + defaults: Dict[str, Any], +): ... 
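
The ``_jit_pass_onnx_*`` passes above are driven by the public exporter rather than called directly; a small hedged sketch of the usual entry point, writing to an in-memory buffer (``model`` and the tensor shapes are placeholders, and this assumes the TorchScript-based ONNX exporter and its dependencies are available):

    >>> import io, torch
    >>> model = torch.nn.Linear(4, 2).eval()
    >>> buf = io.BytesIO()
    >>> torch.onnx.export(model, (torch.randn(1, 4),), buf, input_names=["x"], output_names=["y"])
    >>> len(buf.getvalue()) > 0       # a serialized ONNX protobuf was written to the buffer
    True
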
+def _jit_script_class_compile( + qual_name: str, + definition: ClassDef, + defaults: Dict[str, Dict[str, Any]], + rcb: ResolutionCallback, +): ... +def _parse_source_def(src: str) -> Def: ... +def import_ir_module( + cu: CompilationUnit, + filename: Union[str, Path], + map_location: Optional[DeviceLikeType], + extra_files: Dict[str, Any], +) -> ScriptModule: ... +def import_ir_module_from_buffer( + cu: CompilationUnit, + buffer: BinaryIO, + map_location: Optional[DeviceLikeType], + extra_files: Dict[str, Any], +) -> ScriptModule: ... +def _import_ir_module_from_package( + cu: CompilationUnit, + reader: PyTorchFileReader, + storage_context: DeserializationStorageContext, + map_location: Optional[DeviceLikeType], + ts_id: str, +) -> ScriptModule: ... +def _assign_output_shapes(graph: Graph, inputs: List[Tensor]) -> Graph: ... +def _check_onnx_proto(proto: str) -> None: ... +def _propagate_and_assign_input_shapes( + graph: Graph, + inputs: Tuple[Tensor, ...], + param_count_list: List[_int], + with_grad: _bool, + propagate: _bool, +) -> Graph: ... + +# Defined in torch/csrc/jit/runtime/graph_executor.h +class GraphExecutorState: ... + +# Defined in torch/torch/csrc/jit/ir/alias_analysis.h +class AliasDb: + def __str__(self) -> str: ... + +class _InsertPoint: + def __enter__(self) -> None: ... + def __exit__(self, *args) -> None: ... + +# Defined in torch/csrc/jit/ir/ir.h +class Use: + @property + def user(self) -> Node: ... + @property + def offset(self) -> _int: ... + def isAfter(self, other: Use) -> _bool: ... + +# Defined in torch/csrc/jit/ir/ir.h +class Value: + def type(self) -> JitType: ... + def setType(self, t: JitType) -> Value: ... + def setTypeAs(self, other: Value) -> Value: ... + def inferTypeFrom(self, t: Tensor) -> None: ... + def debugName(self) -> str: ... + def setDebugName(self, name: str) -> None: ... + def unique(self) -> _int: ... + def offset(self) -> _int: ... + def node(self) -> Node: ... + def uses(self) -> List[Use]: ... + def replaceAllUsesWith(self, val: Value) -> None: ... + def replaceAllUsesAfterNodeWith(self, node: Node, val: Value) -> None: ... + def requires_grad(self) -> _bool: ... + def requiresGrad(self) -> _bool: ... + def copyMetadata(self, other: Value) -> Value: ... + def isCompleteTensor(self) -> _bool: ... + def toIValue(self) -> IValue: ... + +# Defined in torch/csrc/jit/ir/ir.h +class Block: + def inputs(self) -> Iterator[Value]: ... + def outputs(self) -> Iterator[Value]: ... + def nodes(self) -> Iterator[Node]: ... + def paramNode(self) -> Node: ... + def returnNode(self) -> Node: ... + def owningNode(self) -> Node: ... + def registerOutput(self, n: Value) -> _int: ... + def addNode(self, name: str, inputs: Sequence[Value]) -> Node: ... + +# Defined in torch/csrc/jit/ir/ir.h +class Node: + def __getitem__(self, key: str) -> Any: ... + def schema(self) -> str: ... + def input(self) -> Value: ... + def inputs(self) -> Iterator[Value]: ... + def inputsAt(self, idx: _int) -> Value: ... + def inputsSize(self) -> _int: ... + def output(self) -> Value: ... + def outputs(self) -> Iterator[Value]: ... + def outputsAt(self, idx: _int) -> Value: ... + def outputsSize(self) -> _int: ... + def hasMultipleOutputs(self) -> _bool: ... + def blocks(self) -> List[Block]: ... + def addBlock(self) -> Block: ... + def mustBeNone(self) -> _bool: ... + def matches(self, pattern: str) -> _bool: ... + def kind(self) -> str: ... + def kindOf(self, name: str) -> str: ... + def addInput(self, name: str) -> Value: ... 
+ def replaceInput(self, i: _int, newValue: Value) -> Value: ... + def replaceInputWith(self, from_: Value, to: Value) -> None: ... + def replaceAllUsesWith(self, n: Node) -> None: ... + def insertBefore(self, n: Node) -> Node: ... + def insertAfter(self, n: Node) -> Node: ... + def isBefore(self, n: Node) -> _bool: ... + def isAfter(self, n: Node) -> _bool: ... + def moveBefore(self, n: Node) -> None: ... + def moveAfter(self, n: Node) -> None: ... + def removeInput(self, i: _int) -> None: ... + def removeAllInputs(self, i: _int) -> None: ... + def hasUses(self) -> _bool: ... + def eraseOutput(self, i: _int) -> None: ... + def addOutput(self) -> Value: ... + def scopeName(self) -> str: ... + def isNondeterministic(self) -> _bool: ... + def copyAttributes(self, rhs: Node) -> Node: ... + def copyMetadata(self, rhs: Node) -> Node: ... + def hasAttributes(self) -> _bool: ... + def hasAttribute(self, name: str) -> _bool: ... + def removeAttribute(self, attr: str) -> Node: ... + def namedInput(self, name: str) -> Value: ... + def sourceRange(self) -> SourceRange: ... + def owningBlock(self) -> Block: ... + def findNode(self, kind: str, recurse: _bool = True) -> Node: ... + def findAllNodes(self, kind: str, recurse: _bool = True) -> List[Node]: ... + def getModuleHierarchy(self) -> str: ... + def prev(self) -> Node: ... + def destroy(self) -> None: ... + def attributeNames(self) -> List[str]: ... + + # Accessors for attributes as types. + def f(self, name: str) -> _float: ... + def f_(self, name: str, val: _float) -> Node: ... + def fs(self, name: str) -> List[_float]: ... + def fs_(self, name: str, val: List[_float]) -> Node: ... + def c(self, name: str) -> complex: ... + def c_(self, name: str, val: complex) -> Node: ... + def s(self, name: str) -> str: ... + def s_(self, name: str, val: str) -> Node: ... + def ss(self, name: str) -> List[str]: ... + def ss_(self, name: str, val: List[str]) -> Node: ... + def i(self, name: str) -> _int: ... + def i_(self, name: str, val: _int) -> Node: ... + # Cannot define "is" like this because it's a reserved keyword in python. + # def is(self, name: str) -> List[_int]: ... + # def is_(self, name: str, val: List[_int]) -> Node: ... + def g(self, name: str) -> Graph: ... + def g_(self, name: str, val: Graph) -> Node: ... + def gs(self, name: str) -> List[Graph]: ... + def gs_(self, name: str, val: List[Graph]) -> Node: ... + def ival(self, name: str) -> IValue: ... + def ival_(self, name: str, val: IValue) -> Node: ... + def t(self, name: str) -> Tensor: ... + def t_(self, name: str, val: Tensor) -> Node: ... + def ts(self, name: str) -> List[Tensor]: ... + def ts_(self, name: str, val: List[Tensor]) -> Node: ... + def ty(self, name: str) -> JitType: ... + def ty_(self, name: str, val: JitType) -> Node: ... + def tys(self, name: str) -> List[JitType]: ... + def tys_(self, name: str, val: List[JitType]) -> Node: ... + +# Defined in torch/torch/csrc/jit/ir/ir.h +class Graph: + def inputs(self) -> Iterator[Value]: ... + def outputs(self) -> Iterator[Value]: ... + def nodes(self) -> Iterator[Node]: ... + def param_node(self) -> Node: ... + def return_node(self) -> Node: ... + def addInput(self, name: str = "") -> Value: ... + def eraseInput(self, i: _int) -> None: ... + def registerOutput(self, n: Value) -> _int: ... + def eraseOutput(self, i: _int) -> None: ... + def create(self, name: str, args, num_outputs: _int) -> Node: ... + def appendNode(self, n: Node) -> Node: ... + def prependNode(self, n: Node) -> Node: ... + def insertNode(self, n: Node) -> Node: ... 
+ def block(self) -> Block: ... + def lint(self) -> None: ... + def alias_db(self) -> AliasDb: ... + def setInsertPoint(self, n: Union[Block, Node]) -> None: ... + def insert_point_guard(self, n: Union[Block, Node]) -> _InsertPoint: ... + def insertPoint(self) -> Node: ... + def insertGraph(self, callee: Graph, inputs: List[Value]) -> List[Value]: ... + def makeMultiOutputIntoTuple(self) -> None: ... + def copy(self) -> Graph: ... + +# Defined in torch/aten/src/ATen/core/alias_info.h +class AliasInfo: + is_write: _bool + before_set: Set[str] + after_set: Set[str] + +# Defined in torch/aten/src/ATen/core/function_schema.h +class Argument: + name: str + type: JitType + default_value: Optional[Any] + def has_default_value(self) -> _bool: ... + kwarg_only: _bool + is_out: _bool + alias_info: Optional[AliasInfo] + +class FunctionSchema: + arguments: List[Argument] + returns: List[Argument] + name: str + overload_name: str + +class _UpgraderEntry: + bumped_at_version: _int + upgrader_name: str + old_schema: str + def __init__( + self, + bumped_at_version: _int, + upgrader_name: str, + old_schema: str, + ) -> None: ... + +class _UpgraderRange: + min_version: _int + max_version: _int + +def _get_max_operator_version() -> _int: ... +def _get_operator_version_map() -> Dict[str, List[_UpgraderEntry]]: ... +def _get_upgrader_ranges(name: str) -> List[_UpgraderRange]: ... +def _test_only_add_entry_to_op_version(op_name: str, entry: _UpgraderEntry) -> None: ... +def _test_only_remove_entry_to_op_version(op_name: str) -> None: ... + +# Defined in torch/csrc/jit/python/script_init.cpp +class ScriptModuleSerializer: + def __init__(self, export_writer: PyTorchFileWriter) -> None: ... + def serialize(self, model: ScriptModule, script_module_id: _int) -> None: ... + def write_files(self) -> None: ... + def storage_context(self) -> SerializationStorageContext: ... + +# Defined in torch/csrc/jit/python/script_init.cpp +class SerializationStorageContext: + def __init__(self) -> None: ... + def has_storage(self, storage: Storage) -> _bool: ... + def get_or_add_storage(self, storage: Storage) -> _int: ... + +# Defined in torch/csrc/jit/python/script_init.cpp +class DeserializationStorageContext: + def __init__(self) -> None: ... + def get_storage(self, name: str, dtype: _dtype) -> Tensor: ... + def has_storage(self, name: str) -> _bool: ... + def add_storage(self, name: str, tensor: Tensor) -> _int: ... + +# Defined in torch/csrc/jit/python/script_init.cpp +class ConcreteModuleTypeBuilder: + def __init__(self, obj: Any) -> None: ... + def set_module_dict(self): ... + def set_module_list(self): ... + def set_parameter_list(self): ... + def set_parameter_dict(self): ... + def add_attribute( + self, + name: str, + ty: JitType, + is_param: _bool, + is_buffer: _bool, + ): ... + def add_module(self, name: str, meta: ConcreteModuleType): ... + def add_constant(self, name: str, value: Any): ... + def add_overload(self, method_name: str, overloaded_method_names: List[str]): ... + def add_builtin_function(self, name: str, symbol_name: str): ... + def add_failed_attribute(self, name: str, failure_reason: str): ... + def add_function_attribute( + self, + name: str, + ty: JitType, + func: Callable[..., Any], + ): ... + def add_ignored_attribute(self, name: str): ... + def add_ignored_attributes(self, names: List[str]): ... + def add_forward_hook(self, hook: Callable[..., Any]): ... + def add_forward_pre_hook(self, pre_hook: Callable[..., Any]): ... + +class ConcreteModuleType: + def get_constants(self) -> Dict[str, Any]: ... 
+ def equals(self, other: ConcreteModuleType) -> _bool: ... + @staticmethod + def from_jit_type(ty: JitType) -> ConcreteModuleType: ... + +class CallStack: + def __init__(self, name: str, range: SourceRange): ... + +class ErrorReport: + def __init__(self, range: SourceRange) -> None: ... + def what(self) -> str: ... + @staticmethod + def call_stack() -> str: ... + +class CompilationUnit: + def __init__(self, lang: str = ..., _frames_up: _int = ...) -> None: ... + def find_function(self, name: str) -> ScriptFunction: ... + def __getattr__(self, name: str) -> ScriptFunction: ... + def define( + self, + script: str, + rcb: ResolutionCallback = ..., + _frames_up: _int = ..., + ): ... + def get_interface(self, name: str) -> InterfaceType: ... + def get_functions(self) -> List[ScriptFunction]: ... + def create_function( + self, + name: str, + graph: Graph, + shouldMangle: _bool = ..., + ) -> ScriptFunction: ... + def get_class(self, name: str) -> ClassType: ... + +class ScriptObject: + def setattr(self, name: str, value: Any): ... + +class ScriptModule(ScriptObject): + def _method_names(self) -> List[str]: ... + def _get_method(self, name: str) -> ScriptMethod: ... + +class LiteScriptModule: + def __call__(self, *input): ... + def find_method(self, method_name: str): ... + def forward(self, *input) -> List[str]: ... + def run_method(self, method_name: str, *input): ... + +# NOTE: switch to collections.abc.Callable in python 3.9 +class ScriptFunction(Generic[P, ReturnVal]): + def __call__(self, *args: P.args, **kwargs: P.kwargs) -> ReturnVal: ... + def save(self, filename: str, _extra_files: Dict[str, bytes]) -> None: ... + def save_to_buffer(self, _extra_files: Dict[str, bytes]) -> bytes: ... + @property + def graph(self) -> Graph: ... + def inlined_graph(self) -> Graph: ... + def schema(self) -> FunctionSchema: ... + def code(self) -> str: ... + def name(self) -> str: ... + @property + def qualified_name(self) -> str: ... + +# NOTE: switch to collections.abc.Callable in python 3.9 +class ScriptMethod(Generic[P, ReturnVal]): + graph: Graph + def __call__(self, *args: P.args, **kwargs: P.kwargs) -> ReturnVal: ... + @property + def owner(self) -> ScriptModule: ... + @property + def name(self) -> str: ... + +class ScriptDict(Generic[K, T]): + def __init__(self, dict: Dict[K, T]) -> None: ... + def __len__(self) -> _int: ... + def __contains__(self, key: K) -> _bool: ... + def __getitem__(self, key: K) -> T: ... + def __setitem__(self, key: K, value: T) -> None: ... + def __delitem__(self, key: K) -> None: ... + def __iter__(self) -> Iterator[K]: ... + def items(self) -> Iterator[tuple[K, T]]: ... + def keys(self) -> Iterator[K]: ... + +class ScriptList(Generic[T]): + def __init__(self, list: List[T]) -> None: ... + def __len__(self) -> _int: ... + def __contains__(self, item: T) -> _bool: ... + @overload + def __getitem__(self, idx: _int) -> T: ... + @overload + def __getitem__(self, idx: slice) -> ScriptList[T]: ... + @overload + def __setitem__(self, idx: _int, value: T) -> None: ... + @overload + def __setitem__(self, idx: slice, value: List[T]) -> None: ... + def __delitem__(self, idx: _int) -> None: ... + def __iter__(self) -> Iterator[T]: ... + def count(self, value: T) -> _int: ... + def remove(self, value: T) -> None: ... + def append(self, value: T) -> None: ... + def clear(self) -> None: ... + @overload + def extend(self, values: List[T]) -> None: ... + @overload + def extend(self, values: Iterable[T]) -> None: ... + @overload + def pop(self) -> T: ... 
+ @overload + def pop(self, idx: _int) -> T: ... + +class ModuleDict: + def __init__(self, mod: ScriptModule) -> None: ... + def items(self) -> List[Tuple[str, Any]]: ... + +class ParameterDict: + def __init__(self, mod: ScriptModule) -> None: ... + +class BufferDict: + def __init__(self, mod: ScriptModule) -> None: ... + +# Defined in torch/csrc/jit/api/module.h +class Module: ... + +# Defined in torch/csrc/Module.cpp +def _initExtension(shm_manager_path: str) -> None: ... # THPModule_initExtension +def _autograd_init() -> _bool: ... # THPAutograd_initExtension +def _add_docstr(obj: T, doc_obj: str) -> T: ... # THPModule_addDocStr +def _init_names(arg: Sequence[Type]) -> None: ... # THPModule_initNames +def _has_distributed() -> _bool: ... # THPModule_hasDistributed +def _set_default_tensor_type(type) -> None: ... # THPModule_setDefaultTensorType +def _set_default_dtype(d: _dtype) -> None: ... # THPModule_setDefaultDtype +def _infer_size(arg1: Size, arg2: Size) -> Size: ... # THPModule_inferSize +def _crash_if_csrc_asan() -> _int: ... # THPModule_crashIfCsrcASAN +def _crash_if_csrc_ubsan() -> _int: ... # THPModule_crashIfCsrcUBSAN +def _crash_if_aten_asan() -> _int: ... # THPModule_crashIfATenASAN +def _show_config() -> str: ... # THPModule_showConfig +def _cxx_flags() -> str: ... # THPModule_cxxFlags +def _parallel_info() -> str: ... # THPModule_parallelInfo +def _get_cpu_capability() -> str: ... # THPModule_getCpuCapability +def _set_backcompat_broadcast_warn( + arg: _bool, +) -> None: ... # THPModule_setBackcompatBroadcastWarn +def _get_backcompat_broadcast_warn() -> _bool: ... # THPModule_getBackcompatBroadcastWarn +def _set_backcompat_keepdim_warn( + arg: _bool, +) -> None: ... # THPModule_setBackcompatKeepdimWarn +def _get_backcompat_keepdim_warn() -> _bool: ... # THPModule_getBackcompatKeepdimWarn +def get_num_thread() -> _int: ... # THPModule_getNumThreads +def set_num_threads(nthreads: _int) -> None: ... # THPModule_setNumThreads +def get_num_interop_threads() -> _int: ... # THPModule_getNumInteropThreads +def set_num_interop_threads( + nthreads: _int, +) -> None: ... # THPModule_setNumInteropThreads +def _get_cudnn_enabled() -> _bool: ... # THPModule_userEnabledCuDNN +def _set_cudnn_enabled(arg: _bool) -> None: ... # THPModule_setUserEnabledCuDNN +def _get_flash_sdp_enabled() -> _bool: ... # THPModule_userEnabledFusedSDP +def _set_sdp_use_flash(arg: _bool) -> None: ... # THPModule_setSDPUseFlash +def _get_mem_efficient_sdp_enabled() -> _bool: ... # THPModule_userEnabledMathSDP +def _set_sdp_use_mem_efficient( + arg: _bool, +) -> None: ... # THPModule_setSDPUseMemEfficient +def _get_math_sdp_enabled() -> _bool: ... # THPModule_userEnabledMathSDP +def _set_sdp_use_math(arg: _bool) -> None: ... # THPModule_setSDPUseMath +def _get_cudnn_sdp_enabled() -> _bool: ... # THPModule_userEnabledMathSDP +def _set_sdp_use_cudnn(arg: _bool) -> None: ... # THPModule_setSDPUseMath +def _get_mkldnn_enabled() -> _bool: ... # THPModule_userEnabledMkldnn +def _set_mkldnn_enabled(arg: _bool) -> None: ... # THPModule_setUserEnabledMkldnn +def _get_cudnn_benchmark() -> _bool: ... # THPModule_benchmarkCuDNN +def _set_cudnn_benchmark(arg: _bool) -> None: ... # THPModule_setBenchmarkCuDNN +def _get_cudnn_deterministic() -> _bool: ... # THPModule_deterministicCuDNN +def _set_cudnn_deterministic(arg: _bool) -> None: ... # THPModule_setDeterministicCuDNN +def _get_deterministic_algorithms() -> _bool: ... # THPModule_deterministicAlgorithms +def _get_deterministic_algorithms_warn_only() -> _bool: ... 
# THPModule_deterministicAlgorithmsWarnOnly +def _set_deterministic_algorithms( + mode: _bool, + *, + warn_only: _bool = ..., +) -> None: ... # THPModule_setDeterministicAlgorithms +def _get_deterministic_fill_uninitialized_memory() -> _bool: ... # THPModule_deterministicFillUninitializedMemory +def _set_deterministic_fill_uninitialized_memory(arg: _bool) -> None: ... # THPModule_setDeterministicFillUninitializedMemory +def _get_nnpack_enabled() -> _bool: ... # THPModule_userEnabledNNPACK +def _set_nnpack_enabled(arg: _bool) -> None: ... # THPModule_setUserEnabledNNPACK +def _get_warnAlways() -> _bool: ... # THPModule_warnAlways +def _set_warnAlways(arg: _bool) -> None: ... # THPModule_setWarnAlways +def _get_cudnn_allow_tf32() -> _bool: ... # THPModule_allowTF32CuDNN +def _set_cudnn_allow_tf32(arg: _bool) -> None: ... # THPModule_setAllowTF32CuDNN +def _get_cublas_allow_tf32() -> _bool: ... # THPModule_allowTF32CuBLAS +def _set_cublas_allow_tf32(arg: _bool) -> None: ... # THPModule_setAllowTF32CuBLAS +def _get_float32_matmul_precision() -> str: ... # THPModule_float32MatmulPrecision +def _set_float32_matmul_precision( + arg: str, +) -> None: ... # THPModule_setFloat32MatmulPrecision +def _get_cublas_allow_fp16_reduced_precision_reduction() -> _bool: ... # THPModule_allowFP16ReductionCuBLAS +def _set_cublas_allow_fp16_reduced_precision_reduction( + arg: _bool, +) -> None: ... # THPModule_setAllowFP16ReductionCuBLAS +def _get_cublas_allow_bf16_reduced_precision_reduction() -> _bool: ... # THPModule_allowBF16ReductionCuBLAS +def _set_cublas_allow_bf16_reduced_precision_reduction( + arg: _bool, +) -> None: ... # THPModule_setAllowBF16ReductionCuBLAS +def _set_conj(x: Tensor, conj: _bool) -> None: ... +def _set_neg(x: Tensor, neg: _bool) -> None: ... +def _set_meta_in_tls_dispatch_include(meta_in_tls: _bool) -> None: ... +def _meta_in_tls_dispatch_include() -> _bool: ... +def _stash_obj_in_tls(key: str, arg: Any) -> None: ... +def _get_obj_in_tls(key: str) -> Any: ... +def _is_key_in_tls(key: str) -> _bool: ... +def _select_conv_backend(*args, **kwargs) -> ConvBackend: ... +def _conv_determine_backend_memory_format( + input: Tensor, + weight: Tensor, + backend: ConvBackend, +) -> memory_format: ... +def _has_storage(x: Tensor) -> _bool: ... +def _construct_storage_from_data_pointer(data_ptr: _int, device: torch.device, size: _int) -> Storage: ... +def _should_allow_numbers_as_tensors(func_name: str) -> _bool: ... +def _group_tensors_by_device_and_dtype(nested_tensorlists: List[List[Optional[Tensor]]], with_indices: _bool = False) -> Dict[Tuple[torch.device, str], Tuple[List[List[Optional[Tensor]]], List[_int]]]: ... + +# NB: There is no Capsule type in typing, see +# https://code.activestate.com/lists/python-dev/139675/ +def _to_dlpack(data: Tensor) -> Any: ... # THPModule_toDLPack +def _from_dlpack(data: Any) -> Tensor: ... # THPModule_fromDLPack +def _get_cpp_backtrace( + frames_to_skip: _int, + maximum_number_of_frames: _int, +) -> str: ... # THPModule_getCppBacktrace +def set_flush_denormal(arg: _bool) -> _bool: ... # THPModule_setFlushDenormal +def get_default_dtype() -> _dtype: ... # THPModule_getDefaultDtype +def _get_default_device() -> str: ... # THPModule_getDefaultDevice +def _get_qengine() -> _int: ... # THPModule_qEngine +def _set_qengine(qengine: _int) -> None: ... # THPModule_setQEngine +def _supported_qengines() -> List[_int]: ... # THPModule_supportedQEngines +def _is_xnnpack_enabled() -> _bool: ... 
# THPModule_isEnabledXNNPACK +def _check_sparse_tensor_invariants() -> _bool: ... # THPModule_checkSparseTensorInvariants +def _set_check_sparse_tensor_invariants( + arg: _bool, +) -> None: ... # THPModule_setCheckSparseTensorInvariants +def _set_default_mobile_cpu_allocator() -> None: ... # THPModule_setDefaultMobileCPUAllocator +def _unset_default_mobile_cpu_allocator() -> None: ... # THPModule_unsetDefaultMobileCPUAllocator +def _is_torch_function_enabled() -> _bool: ... # THPModule_isEnabledTorchFunction +def _has_torch_function( + args: Iterable[Any], +) -> _bool: ... # THPModule_has_torch_function +def _has_torch_function_unary(Any) -> _bool: ... # THPModule_has_torch_function_unary +def _has_torch_function_variadic( + *args: Any, +) -> _bool: ... # THPModule_has_torch_function_variadic +def _vmapmode_increment_nesting() -> _int: ... # THPModule_vmapmode_increment_nesting +def _vmapmode_decrement_nesting() -> _int: ... # THPModule_vmapmode_decrement_nesting +def _log_api_usage_once(str) -> None: ... # LogAPIUsageOnceFromPython +def _log_api_usage_metadata(event: str, metadata_map: Dict[str, str]) -> None: ... # LogAPIUsageMetadataFromPython +def _demangle(str) -> str: ... # c10::demangle +def _disabled_torch_function_impl( + func: Callable, + types: Iterable[Type], + args: Tuple, + kwargs: Dict, +) -> Any: ... # THPModule_disable_torch_function +def _disabled_torch_dispatch_impl( + func: Callable, + types: Iterable[Type], + args: Tuple, + kwargs: Dict, +) -> Any: ... # THPModule_disable_dispatch_function +def _get_linalg_preferred_backend() -> torch._C._LinalgBackend: ... +def _set_linalg_preferred_backend(arg: torch._C._LinalgBackend): ... + +class _LinalgBackend: + Default: _LinalgBackend + Cusolver: _LinalgBackend + Magma: _LinalgBackend + +class ConvBackend(Enum): ... + +class Tag(Enum): + core: _int = 0 + data_dependent_output: _int = 1 + dynamic_output_shape: _int = 2 + generated: _int = 3 + inplace_view: _int = 4 + needs_fixed_stride_order: _int = 5 + nondeterministic_bitwise: _int = 6 + nondeterministic_seeded: _int = 7 + pointwise: _int = 8 + pt2_compliant_tag: _int = 9 + view_copy: _int = 10 + +# Defined in `valgrind.h` and `callgrind.h` respectively. +def _valgrind_supported_platform() -> _bool: ... # NVALGRIND +def _valgrind_toggle() -> None: ... # CALLGRIND_TOGGLE_COLLECT +def _valgrind_toggle_and_dump_stats() -> None: ... # CALLGRIND_TOGGLE_COLLECT and CALLGRIND_DUMP_STATS + +has_openmp: _bool +has_mkl: _bool +_has_mps: _bool +has_lapack: _bool +_has_cuda: _bool +_has_magma: _bool +_has_xpu: _bool +_has_mkldnn: _bool +_has_cudnn: _bool +has_spectral: _bool +_GLIBCXX_USE_CXX11_ABI: _bool +default_generator: Generator + +# Defined in torch/csrc/autograd/init.cpp +def _set_grad_enabled(enabled: _bool) -> None: ... +def is_grad_enabled() -> _bool: ... +def _set_fwd_grad_enabled(enabled: _bool) -> None: ... +def _is_fwd_grad_enabled() -> _bool: ... +def is_inference_mode_enabled() -> _bool: ... +def set_autocast_enabled(enabled: _bool) -> None: ... +def is_autocast_enabled() -> _bool: ... +def clear_autocast_cache() -> None: ... +def set_autocast_cpu_enabled(enabled: _bool) -> None: ... +def is_autocast_cpu_enabled() -> _bool: ... +def _is_any_autocast_enabled() -> _bool: ... +def set_autocast_cpu_dtype(dtype: _dtype) -> None: ... +def set_autocast_gpu_dtype(dtype: _dtype) -> None: ... +def get_autocast_cpu_dtype() -> _dtype: ... +def get_autocast_gpu_dtype() -> _dtype: ... +def autocast_increment_nesting() -> _int: ... +def autocast_decrement_nesting() -> _int: ... 
+def is_autocast_cache_enabled() -> _bool: ... +def set_autocast_cache_enabled(enabled: _bool) -> None: ... +def _increment_version(tensor: Tensor) -> None: ... +def set_anomaly_enabled(enabled: _bool, check_nan: _bool = True) -> None: ... +def is_anomaly_enabled() -> _bool: ... +def is_anomaly_check_nan_enabled() -> _bool: ... +def _is_multithreading_enabled() -> _bool: ... +def _set_multithreading_enabled(enabled: _bool) -> None: ... +def _set_view_replay_enabled(enabled: _bool) -> None: ... +def _is_view_replay_enabled() -> _bool: ... +def _enter_dual_level() -> _int: ... +def _exit_dual_level(level: _int) -> None: ... +def _make_dual(tensor: Tensor, tangent: Tensor, level: _int) -> Tensor: ... +def _unpack_dual(tensor: Tensor, level: _int) -> Tensor: ... +def __set_forward_AD_enabled(enabled: _bool) -> None: ... +def __is_forward_AD_enabled() -> _bool: ... +def _register_default_hooks(pack_hook: Callable, unpack_hook: Callable) -> None: ... +def _reset_default_hooks() -> None: ... +def _is_torch_function_mode_enabled() -> _bool: ... +def _set_torch_function_mode(cls: Any) -> None: ... +def _push_on_torch_function_stack(cls: Any) -> None: ... +def _pop_torch_function_stack() -> Any: ... +def _get_function_stack_at(idx: _int) -> Any: ... +def _len_torch_function_stack() -> _int: ... +def _set_torch_dispatch_mode(cls: Any) -> None: ... +def _push_on_torch_dispatch_stack(cls: Any) -> None: ... +def _pop_torch_dispatch_stack(mode_key: Optional[torch._C._TorchDispatchModeKey] = None) -> Any: ... +def _get_dispatch_mode(mode_key: Optional[torch._C._TorchDispatchModeKey]) -> Any: ... +def _unset_dispatch_mode(mode: torch._C._TorchDispatchModeKey) -> Any: ... +def _set_dispatch_mode(mode: Any) -> None: ... +def _get_dispatch_stack_at(idx: _int) -> Any: ... +def _len_torch_dispatch_stack() -> _int: ... + +class _DisableTorchDispatch: + def __init__(self): ... + def __enter__(self): ... + def __exit__(self, exc_type, exc_value, traceback): ... + +class _EnableTorchFunction: + def __init__(self): ... + def __enter__(self): ... + def __exit__(self, exc_type, exc_value, traceback): ... + +class _EnablePythonDispatcher: + def __init__(self): ... + def __enter__(self): ... + def __exit__(self, exc_type, exc_value, traceback): ... + +class _DisablePythonDispatcher: + def __init__(self): ... + def __enter__(self): ... + def __exit__(self, exc_type, exc_value, traceback): ... + +class _EnablePreDispatch: + def __init__(self): ... + def __enter__(self): ... + def __exit__(self, exc_type, exc_value, traceback): ... + +class _DisableFuncTorch: + def __init__(self): ... + def __enter__(self): ... + def __exit__(self, exc_type, exc_value, traceback): ... + +class _DisableAutocast: + def __init__(self): ... + def __enter__(self): ... + def __exit__(self, exc_type, exc_value, traceback): ... + +class _InferenceMode: + def __init__(self, enabled: _bool): ... + def __enter__(self): ... + def __exit__(self, exc_type, exc_value, traceback): ... + +def _set_autograd_fallback_mode(mode: str) -> None: ... +def _get_autograd_fallback_mode() -> str: ... + +# Defined in torch/csrc/jit/python/script_init.cpp +class LoggerBase: ... +class NoopLogger(LoggerBase): ... +class LockingLogger(LoggerBase): ... + +class AggregationType(Enum): + SUM = 0 + AVG = 1 + +class FileCheck: + def run(self, test_string: str) -> None: ... + def check(self, test_string: str) -> FileCheck: ... + def check_not(self, test_string: str) -> FileCheck: ... + def check_same(self, test_string: str) -> FileCheck: ... 
+ def check_next(self, test_string: str) -> FileCheck: ... + def check_count( + self, + test_string: str, + count: _int, + exactly: _bool = False, + ) -> FileCheck: ... + def check_dag(self, test_string: str) -> FileCheck: ... + def check_source_highlighted(self, test_string: str) -> FileCheck: ... + def check_regex(self, test_string: str) -> FileCheck: ... + +# Defined in torch/csrc/jit/python/init.cpp +class PyTorchFileReader: + @overload + def __init__(self, name: str) -> None: ... + @overload + def __init__(self, buffer: BinaryIO) -> None: ... + def get_record(self, name: str) -> bytes: ... + def serialization_id(self) -> str: ... + +class PyTorchFileWriter: + @overload + def __init__(self, name: str) -> None: ... + @overload + def __init__(self, buffer: BinaryIO) -> None: ... + def write_record(self, name: str, data: Union[Storage, bytes, _int], size: _int) -> None: ... + def write_end_of_file(self) -> None: ... + def set_min_version(self, version: _int) -> None: ... + def get_all_written_records(self) -> List[str]: ... + def archive_name(self) -> str: ... + def serialization_id(self) -> str: ... + +def _jit_get_inline_everything_mode() -> _bool: ... +def _jit_set_inline_everything_mode(enabled: _bool) -> None: ... +def _jit_get_logging_option() -> str: ... +def _jit_set_logging_option(option: str) -> None: ... +def _jit_set_logging_stream(stream_name: str) -> None: ... +def _jit_pass_cse(Graph) -> _bool: ... +def _jit_pass_dce(Graph) -> None: ... +def _jit_pass_lint(Graph) -> None: ... + +# Defined in torch/csrc/jit/python/python_custom_class.cpp +def _get_custom_class_python_wrapper(name: str, attr: str) -> Any: ... + +# Defined in torch/csrc/Module.cpp +def _rename_privateuse1_backend(backend: str) -> None: ... +def _get_privateuse1_backend_name() -> str: ... + +# Defined in torch/csrc/Generator.cpp +class Generator: + device: _device + def __init__(self, device: Optional[DeviceLikeType] = None) -> None: ... + def get_state(self) -> Tensor: ... + def set_state(self, _new_state: Tensor) -> Generator: ... + def set_offset(self, offset: _int) -> Generator: ... + def get_offset(self) -> _int: ... + def manual_seed(self, seed: _int) -> Generator: ... + def seed(self) -> _int: ... + def initial_seed(self) -> _int: ... + +# Defined in torch/csrc/utils/python_dispatch.cpp + +class _DispatchOperatorHandle: + def schema(self) -> FunctionSchema: ... + def debug(self) -> str: ... + +class _DispatchModule: + def def_(self, schema: str, alias: str = "") -> _DispatchModule: ... + def def_legacy(self, schema: str) -> _DispatchModule: ... + def def_name_t_t( + self, + name: str, + dispatch: str, + debug: str = "default_def_name_t_t", + ) -> _DispatchModule: ... + def def_schema_t_t( + self, + schema: str, + dispatch: str, + alias: str, + debug: str = "default_def_schema_t_t", + ) -> _DispatchModule: ... + def impl_t_t( + self, + name: str, + dispatch: str, + debug: str = "impl_t_t", + ) -> _DispatchModule: ... + def impl(self, name: str, dispatch: str, func: Callable) -> _DispatchModule: ... + def define(self, schema: str, alias: str = "") -> _DispatchModule: ... + def fallback_fallthrough(self, dispatch: str = "") -> _DispatchModule: ... + +def _dispatch_library( + kind: str, + name: str, + dispatch: str, + file: str = "", + linenum: Any = 0, +) -> _DispatchModule: ... +def _dispatch_dump(name: str) -> str: ... +def _dispatch_dump_table(name: str) -> str: ... +def _dispatch_check_invariants(name: str) -> None: ... +def _dispatch_check_all_invariants() -> None: ... 
+def _dispatch_call_boxed(handle: _DispatchOperatorHandle, *args, **kwargs) -> Any: ... +def _dispatch_find_schema_or_throw(name: str, overload_name: str) -> _DispatchOperatorHandle: ... +def _dispatch_set_report_error_callback(handle: _DispatchOperatorHandle, callback: Callable) -> None: ... +def _dispatch_has_kernel(name: str) -> _bool: ... +def _dispatch_has_kernel_for_dispatch_key( + name: str, + dispatch: _dispatchkey, +) -> _bool: ... +def _dispatch_has_kernel_for_any_dispatch_key( + name: str, + dispatch_key_set: DispatchKeySet, +) -> _bool: ... +def _dispatch_has_computed_kernel_for_dispatch_key( + name: str, + dispatch: _dispatchkey, +) -> _bool: ... +def _dispatch_find_dangling_impls() -> List[str]: ... +def _dispatch_get_all_op_names() -> List[str]: ... +def _dispatch_tls_set_dispatch_key_excluded( + dispatch: _dispatchkey, + val: _bool, +) -> None: ... +def _dispatch_tls_is_dispatch_key_excluded(dispatch: _dispatchkey) -> _bool: ... +def _dispatch_tls_set_dispatch_key_included( + dispatch: _dispatchkey, + val: _bool, +) -> None: ... +def _dispatch_tls_is_dispatch_key_included(dispatch: _dispatchkey) -> _bool: ... +def _dispatch_isTensorSubclassLike(tensor: Tensor) -> _bool: ... +def _dispatch_key_name(dispatch: _dispatchkey) -> str: ... +def _dispatch_key_for_device(device_type: str) -> str: ... +def _parse_dispatch_key(key: str) -> Optional[DispatchKey]: ... +def _dispatch_key_parse(dispatch: _dispatchkey) -> DispatchKey: ... +def _dispatch_num_backends() -> _int: ... +def _dispatch_pystub(name: str, overload: str) -> Optional[Tuple[str, str]]: ... +def _dispatch_is_alias_key(dispatch: _dispatchkey) -> _bool: ... +def _functionality_to_backend_keys(dispatch: _dispatchkey) -> List[DispatchKey]: ... +def _functionalization_reapply_views_tls() -> _bool: ... + +class DispatchKey(Enum): + Undefined: DispatchKey = ... + FPGA: DispatchKey = ... + ORT: DispatchKey = ... + Vulkan: DispatchKey = ... + Metal: DispatchKey = ... + MKLDNN: DispatchKey = ... + OpenGL: DispatchKey = ... + OpenCL: DispatchKey = ... + IDEEP: DispatchKey = ... + CustomRNGKeyId: DispatchKey = ... + MkldnnCPU: DispatchKey = ... + Sparse: DispatchKey = ... + SparseCsr: DispatchKey = ... + NestedTensor: DispatchKey = ... + Dense: DispatchKey = ... + PreDispatch: DispatchKey = ... + Python: DispatchKey = ... + FuncTorchDynamicLayerBackMode: DispatchKey = ... + ZeroTensor: DispatchKey = ... + Conjugate: DispatchKey = ... + Negative: DispatchKey = ... + BackendSelect: DispatchKey = ... + Named: DispatchKey = ... + AutogradOther: DispatchKey = ... + AutogradFunctionality: DispatchKey = ... + AutogradNestedTensor: DispatchKey = ... + Tracer: DispatchKey = ... + Autocast: DispatchKey = ... + Batched: DispatchKey = ... + VmapMode: DispatchKey = ... + FuncTorchGradWrapper: DispatchKey = ... + FuncTorchBatched: DispatchKey = ... + BatchedNestedTensor: DispatchKey = ... + FuncTorchVmapMode: DispatchKey = ... + FuncTorchDynamicLayerFrontMode: DispatchKey = ... + Functionalize: DispatchKey = ... + TESTING_ONLY_GenericWrapper: DispatchKey = ... + TESTING_ONLY_GenericMode: DispatchKey = ... + ADInplaceOrView: DispatchKey = ... + Autograd: DispatchKey = ... + CompositeImplicitAutograd: DispatchKey = ... + CompositeImplicitAutogradNestedTensor: DispatchKey = ... + CompositeExplicitAutograd: DispatchKey = ... + CompositeExplicitAutogradNonFunctional: DispatchKey = ... + FuncTorchBatchedDecomposition: DispatchKey = ... + CPU: DispatchKey = ... + CUDA: DispatchKey = ... + HIP: DispatchKey = ... + XLA: DispatchKey = ... 
+ MTIA: DispatchKey = ... + MPS: DispatchKey = ... + IPU: DispatchKey = ... + XPU: DispatchKey = ... + HPU: DispatchKey = ... + VE: DispatchKey = ... + Lazy: DispatchKey = ... + Meta: DispatchKey = ... + PrivateUse1: DispatchKey = ... + PrivateUse2: DispatchKey = ... + PrivateUse3: DispatchKey = ... + QuantizedCPU: DispatchKey = ... + QuantizedCUDA: DispatchKey = ... + QuantizedHIP: DispatchKey = ... + QuantizedXLA: DispatchKey = ... + QuantizedMTIA: DispatchKey = ... + QuantizedMPS: DispatchKey = ... + QuantizedIPU: DispatchKey = ... + QuantizedXPU: DispatchKey = ... + QuantizedHPU: DispatchKey = ... + QuantizedVE: DispatchKey = ... + QuantizedLazy: DispatchKey = ... + QuantizedMeta: DispatchKey = ... + QuantizedPrivateUse1: DispatchKey = ... + QuantizedPrivateUse2: DispatchKey = ... + QuantizedPrivateUse3: DispatchKey = ... + SparseCPU: DispatchKey = ... + SparseCUDA: DispatchKey = ... + SparseHIP: DispatchKey = ... + SparseXLA: DispatchKey = ... + SparseMTIA: DispatchKey = ... + SparseMPS: DispatchKey = ... + SparseIPU: DispatchKey = ... + SparseXPU: DispatchKey = ... + SparseHPU: DispatchKey = ... + SparseVE: DispatchKey = ... + SparseLazy: DispatchKey = ... + SparseMeta: DispatchKey = ... + SparsePrivateUse1: DispatchKey = ... + SparsePrivateUse2: DispatchKey = ... + SparsePrivateUse3: DispatchKey = ... + SparseCsrCPU: DispatchKey = ... + SparseCsrCUDA: DispatchKey = ... + SparseCsrHIP: DispatchKey = ... + SparseCsrXLA: DispatchKey = ... + SparseCsrMTIA: DispatchKey = ... + SparseCsrMPS: DispatchKey = ... + SparseCsrIPU: DispatchKey = ... + SparseCsrXPU: DispatchKey = ... + SparseCsrHPU: DispatchKey = ... + SparseCsrVE: DispatchKey = ... + SparseCsrLazy: DispatchKey = ... + SparseCsrMeta: DispatchKey = ... + SparseCsrPrivateUse1: DispatchKey = ... + SparseCsrPrivateUse2: DispatchKey = ... + SparseCsrPrivateUse3: DispatchKey = ... + NestedTensorCPU: DispatchKey = ... + NestedTensorCUDA: DispatchKey = ... + NestedTensorHIP: DispatchKey = ... + NestedTensorXLA: DispatchKey = ... + NestedTensorMTIA: DispatchKey = ... + NestedTensorMPS: DispatchKey = ... + NestedTensorIPU: DispatchKey = ... + NestedTensorXPU: DispatchKey = ... + NestedTensorHPU: DispatchKey = ... + NestedTensorVE: DispatchKey = ... + NestedTensorLazy: DispatchKey = ... + NestedTensorMeta: DispatchKey = ... + NestedTensorPrivateUse1: DispatchKey = ... + NestedTensorPrivateUse2: DispatchKey = ... + NestedTensorPrivateUse3: DispatchKey = ... + AutogradCPU: DispatchKey = ... + AutogradCUDA: DispatchKey = ... + AutogradHIP: DispatchKey = ... + AutogradXLA: DispatchKey = ... + AutogradMTIA: DispatchKey = ... + AutogradMPS: DispatchKey = ... + AutogradIPU: DispatchKey = ... + AutogradXPU: DispatchKey = ... + AutogradHPU: DispatchKey = ... + AutogradVE: DispatchKey = ... + AutogradLazy: DispatchKey = ... + AutogradMeta: DispatchKey = ... + AutogradPrivateUse1: DispatchKey = ... + AutogradPrivateUse2: DispatchKey = ... + AutogradPrivateUse3: DispatchKey = ... + +class DispatchKeySet: + def __init__(self, key: DispatchKey) -> None: ... + def __or__(self, other: DispatchKeySet) -> DispatchKeySet: ... + def __sub__(self, other: DispatchKeySet) -> DispatchKeySet: ... + def __and__(self, other: DispatchKeySet) -> DispatchKeySet: ... + def highestPriorityTypeId(self) -> DispatchKey: ... + def has(self, k: _dispatchkey) -> _bool: ... + def add(self, k: _dispatchkey) -> DispatchKeySet: ... + def remove(self, k: _dispatchkey) -> DispatchKeySet: ... + def __repr__(self) -> str: ... 
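+
+# A rough usage sketch for the DispatchKeySet bindings declared above. This is
+# illustrative only: it assumes the private torch._C helpers keep the signatures
+# given in this stub (_dispatch_keys and _dispatch_keyset_to_string are declared
+# just below) and that a dense CPU tensor's key set includes DispatchKey.CPU.
+#
+#   import torch
+#   ks = torch._C._dispatch_keys(torch.ones(2))      # DispatchKeySet for a CPU tensor
+#   ks.has(torch._C.DispatchKey.CPU)                 # expected True on a CPU build
+#   ks2 = ks.add(torch._C.DispatchKey.Lazy)          # add() returns a new DispatchKeySet
+#   print(torch._C._dispatch_keyset_to_string(ks2))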
+ +_dispatch_autogradother_backends: DispatchKeySet +_additional_keys_to_prop_for_wrapper_tensors: DispatchKeySet + +def _dispatch_has_backend_fallback(dispatch: _dispatchkey) -> _bool: ... +def _dispatch_keyset_full_after(t: _dispatchkey) -> DispatchKeySet: ... +def _dispatch_keyset_full() -> DispatchKeySet: ... +def _dispatch_keyset_to_string(keyset: DispatchKeySet) -> str: ... +def _dispatch_get_backend_keyset_from_autograd( + dispatch: _dispatchkey, +) -> DispatchKeySet: ... +def _dispatch_keys(tensor: Tensor) -> DispatchKeySet: ... +def _dispatch_tls_local_exclude_set() -> DispatchKeySet: ... +def _dispatch_tls_local_include_set() -> DispatchKeySet: ... +def _dispatch_is_included_in_alias( + dispatch_a: _dispatchkey, + dispatch_b: _dispatchkey, +) -> _bool: ... +def _propagate_xla_data(a: Tensor, b: Tensor) -> None: ... +def _replace_(a: Tensor, b: Tensor) -> None: ... +def _commit_update(a: Tensor) -> None: ... + +class _ExcludeDispatchKeyGuard: + def __init__(self, keyset: DispatchKeySet): ... + def __enter__(self): ... + def __exit__(self, exc_type, exc_value, traceback): ... + +class _IncludeDispatchKeyGuard: + def __init__(self, k: DispatchKey): ... + def __enter__(self): ... + def __exit__(self, exc_type, exc_value, traceback): ... + +class _ForceDispatchKeyGuard: + def __init__(self, include: DispatchKeySet, exclude: DispatchKeySet): ... + def __enter__(self): ... + def __exit__(self, exc_type, exc_value, traceback): ... + +class _AutoDispatchBelowAutograd: + def __init__(self): ... + def __enter__(self): ... + def __exit__(self, exc_type, exc_value, traceback): ... + +def _dispatch_print_registrations_for_dispatch_key(dispatch_key: str = "") -> None: ... +def _dispatch_get_registrations_for_dispatch_key( + dispatch_key: str = "", +) -> List[str]: ... +def _are_functorch_transforms_active() -> _bool: ... + +# Define in torch/csrc/autograd/init.cpp +def _set_python_dispatcher(dispatcher: object) -> None: ... + +def _get_nested_int(id: _int, coeff: _int) -> SymInt: ... + +def _get_constant_bool_symnode(val: _bool) -> Any: ... + +class _TorchDispatchModeKey(Enum): + FAKE: _TorchDispatchModeKey = ... + PROXY: _TorchDispatchModeKey = ... + FUNCTIONAL: _TorchDispatchModeKey = ... + +class _SetExcludeDispatchKeyGuard: + def __init__(self, k: DispatchKey, enabled: _bool): ... + def __enter__(self): ... + def __exit__(self, exc_type, exc_value, traceback): ... + +# Defined in torch/csrc/utils/init.cpp +class BenchmarkConfig: + num_calling_threads: _int + num_worker_threads: _int + num_warmup_iters: _int + num_iters: _int + profiler_output_path: str + +class BenchmarkExecutionStats: + latency_avg_ms: _float + num_iters: _int + +class ThroughputBenchmark: + def __init__(self, module: Any) -> None: ... + def add_input(self, *args: Any, **kwargs: Any) -> None: ... + def run_once(self, *args: Any, **kwargs: Any) -> Any: ... + def benchmark(self, config: BenchmarkConfig) -> BenchmarkExecutionStats: ... + +# Defined in torch/csrc/Storage.cpp +class StorageBase(object): ... + +# TODO: where +class DoubleTensor(Tensor): ... +class FloatTensor(Tensor): ... +class BFloat16Tensor(Tensor): ... +class LongTensor(Tensor): ... +class IntTensor(Tensor): ... +class ShortTensor(Tensor): ... +class HalfTensor(Tensor): ... +class CharTensor(Tensor): ... +class ByteTensor(Tensor): ... +class BoolTensor(Tensor): ... + +# Defined in torch/csrc/autograd/python_engine.cpp +class _ImperativeEngine: + def queue_callback(self, callback: Callable[[], None]) -> None: ... 
+ def run_backward(self, *args: Any, **kwargs: Any) -> Tuple[Tensor, ...]: ... + def is_checkpoint_valid(self) -> _bool: ... + +# Defined in torch/csrc/autograd/python_variable.cpp +class _TensorMeta(type): ... + +# Defined in torch/csrc/autograd/python_variable.cpp +class TensorBase(metaclass=_TensorMeta): + requires_grad: _bool + retains_grad: _bool + shape: Size + data: Tensor + names: List[str] + device: _device + dtype: _dtype + layout: _layout + real: Tensor + imag: Tensor + T: Tensor + H: Tensor + mT: Tensor + mH: Tensor + ndim: _int + output_nr: _int + _version: _int + _base: Optional[Tensor] + _cdata: _int + grad_fn: Optional[_Node] + _grad_fn: Any + _grad: Optional[Tensor] + grad: Optional[Tensor] + _backward_hooks: Optional[Dict[_int, Callable[[Tensor], Optional[Tensor]]]] + nbytes: _int + itemsize: _int + _has_symbolic_sizes_strides: _bool + def __abs__(self) -> Tensor: ... + def __add__(self, other: Any) -> Tensor: ... + @overload + def __and__(self, other: Tensor) -> Tensor: ... + @overload + def __and__(self, other: Union[Number, _complex]) -> Tensor: ... + @overload + def __and__(self, other: Any) -> Tensor: ... + def __bool__(self) -> builtins.bool: ... + def __complex__(self) -> builtins.complex: ... + def __div__(self, other: Any) -> Tensor: ... + def __eq__(self, other: Any) -> Tensor: ... # type: ignore[override] + def __float__(self) -> builtins.float: ... + def __floordiv__(self, other: Any) -> Tensor: ... + def __ge__(self, other: Any) -> Tensor: ... + def __getitem__(self, indices: Union[Union[SupportsIndex, Union[None, _bool, _int, slice, ellipsis, Tensor], _NestedSequence[Union[None, _bool, _int, slice, ellipsis, Tensor]]], tuple[Union[SupportsIndex, Union[None, _bool, _int, slice, ellipsis, Tensor], _NestedSequence[Union[None, _bool, _int, slice, ellipsis, Tensor]]], ...]]) -> Tensor: ... + def __gt__(self, other: Any) -> Tensor: ... + def __iadd__(self, other: Any) -> Tensor: ... + @overload + def __iand__(self, other: Tensor) -> Tensor: ... + @overload + def __iand__(self, other: Union[Number, _complex]) -> Tensor: ... + @overload + def __iand__(self, other: Any) -> Tensor: ... + def __idiv__(self, other: Any) -> Tensor: ... + def __ifloordiv__(self, other: Any) -> Tensor: ... + @overload + def __ilshift__(self, other: Tensor) -> Tensor: ... + @overload + def __ilshift__(self, other: Union[Number, _complex]) -> Tensor: ... + @overload + def __ilshift__(self, other: Any) -> Tensor: ... + def __imod__(self, other: Any) -> Tensor: ... + def __imul__(self, other: Any) -> Tensor: ... + def __index__(self) -> builtins.int: ... + @overload + def __init__(self, *args: Any, device: Optional[DeviceLikeType] = None) -> None: ... + @overload + def __init__(self, storage: Storage) -> None: ... + @overload + def __init__(self, other: Tensor) -> None: ... + @overload + def __init__(self, size: _size, *, device: Optional[DeviceLikeType] = None) -> None: ... + def __int__(self) -> builtins.int: ... + def __invert__(self) -> Tensor: ... + @overload + def __ior__(self, other: Tensor) -> Tensor: ... + @overload + def __ior__(self, other: Union[Number, _complex]) -> Tensor: ... + @overload + def __ior__(self, other: Any) -> Tensor: ... + @overload + def __irshift__(self, other: Tensor) -> Tensor: ... + @overload + def __irshift__(self, other: Union[Number, _complex]) -> Tensor: ... + @overload + def __irshift__(self, other: Any) -> Tensor: ... + def __isub__(self, other: Any) -> Tensor: ... + @overload + def __ixor__(self, other: Tensor) -> Tensor: ... 
+ @overload + def __ixor__(self, other: Union[Number, _complex]) -> Tensor: ... + @overload + def __ixor__(self, other: Any) -> Tensor: ... + def __le__(self, other: Any) -> Tensor: ... + def __long__(self) -> builtins.int: ... + @overload + def __lshift__(self, other: Tensor) -> Tensor: ... + @overload + def __lshift__(self, other: Union[Number, _complex]) -> Tensor: ... + @overload + def __lshift__(self, other: Any) -> Tensor: ... + def __lt__(self, other: Any) -> Tensor: ... + def __matmul__(self, other: Any) -> Tensor: ... + def __mod__(self, other: Any) -> Tensor: ... + def __mul__(self, other: Any) -> Tensor: ... + def __ne__(self, other: Any) -> Tensor: ... # type: ignore[override] + def __neg__(self) -> Tensor: ... + def __new__(self, *args, **kwargs) -> Tensor: ... + def __nonzero__(self) -> builtins.bool: ... + @overload + def __or__(self, other: Tensor) -> Tensor: ... + @overload + def __or__(self, other: Union[Number, _complex]) -> Tensor: ... + @overload + def __or__(self, other: Any) -> Tensor: ... + def __pow__(self, other: Any) -> Tensor: ... + def __radd__(self, other: Any) -> Tensor: ... + def __rand__(self, other: Any) -> Tensor: ... + def __rfloordiv__(self, other: Any) -> Tensor: ... + def __rmul__(self, other: Any) -> Tensor: ... + def __ror__(self, other: Any) -> Tensor: ... + def __rpow__(self, other: Any) -> Tensor: ... + @overload + def __rshift__(self, other: Tensor) -> Tensor: ... + @overload + def __rshift__(self, other: Union[Number, _complex]) -> Tensor: ... + @overload + def __rshift__(self, other: Any) -> Tensor: ... + def __rsub__(self, other: Any) -> Tensor: ... + def __rtruediv__(self, other: Any) -> Tensor: ... + def __rxor__(self, other: Any) -> Tensor: ... + def __setitem__(self, indices: Union[Union[SupportsIndex, Union[None, _bool, _int, slice, ellipsis, Tensor], _NestedSequence[Union[None, _bool, _int, slice, ellipsis, Tensor]]], tuple[Union[SupportsIndex, Union[None, _bool, _int, slice, ellipsis, Tensor], _NestedSequence[Union[None, _bool, _int, slice, ellipsis, Tensor]]], ...]], val: Union[Tensor, Number]) -> None: ... + def __sub__(self, other: Any) -> Tensor: ... + def __truediv__(self, other: Any) -> Tensor: ... + @overload + def __xor__(self, other: Tensor) -> Tensor: ... + @overload + def __xor__(self, other: Union[Number, _complex]) -> Tensor: ... + @overload + def __xor__(self, other: Any) -> Tensor: ... + def _addmm_activation(self, mat1: Tensor, mat2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1, use_gelu: _bool = False) -> Tensor: ... + def _autocast_to_full_precision(self, cuda_enabled: _bool, cpu_enabled: _bool) -> Tensor: ... + def _autocast_to_reduced_precision(self, cuda_enabled: _bool, cpu_enabled: _bool, cuda_dtype: _dtype, cpu_dtype: _dtype) -> Tensor: ... + def _coalesced_(self, coalesced: _bool) -> Tensor: ... + def _conj(self) -> Tensor: ... + def _conj_physical(self) -> Tensor: ... + def _dimI(self) -> _int: ... + def _dimV(self) -> _int: ... + def _indices(self) -> Tensor: ... + def _is_all_true(self) -> Tensor: ... + def _is_any_true(self) -> Tensor: ... + def _is_view(self) -> _bool: ... + def _is_zerotensor(self) -> _bool: ... + def _lazy_clone(self) -> Tensor: ... + @staticmethod + def _make_subclass(cls: Type[S], data: Tensor, require_grad: _bool = False, dispatch_strides: _bool = False, dispatch_device: _bool = False, device_for_backend_keys: Optional[_device] = None) -> S: ... + def _neg_view(self) -> Tensor: ... + def _nested_tensor_size(self) -> Tensor: ... 
+ def _nested_tensor_storage_offsets(self) -> Tensor: ... + def _nested_tensor_strides(self) -> Tensor: ... + def _nnz(self) -> _int: ... + def _sparse_mask_projection(self, mask: Tensor, accumulate_matches: _bool = False) -> Tensor: ... + def _to_dense(self, dtype: Optional[_dtype] = None, masked_grad: Optional[_bool] = None) -> Tensor: ... + @overload + def _to_sparse(self, *, layout: Optional[_layout] = None, blocksize: Optional[Union[_int, _size]] = None, dense_dim: Optional[_int] = None) -> Tensor: ... + @overload + def _to_sparse(self, sparse_dim: _int) -> Tensor: ... + def _to_sparse_bsc(self, blocksize: Union[_int, _size], dense_dim: Optional[_int] = None) -> Tensor: ... + def _to_sparse_bsr(self, blocksize: Union[_int, _size], dense_dim: Optional[_int] = None) -> Tensor: ... + def _to_sparse_csc(self, dense_dim: Optional[_int] = None) -> Tensor: ... + def _to_sparse_csr(self, dense_dim: Optional[_int] = None) -> Tensor: ... + def _values(self) -> Tensor: ... + def abs(self) -> Tensor: + r""" + abs() -> Tensor + + See :func:`torch.abs` + """ + ... + def abs_(self) -> Tensor: + r""" + abs_() -> Tensor + + In-place version of :meth:`~Tensor.abs` + """ + ... + def absolute(self) -> Tensor: + r""" + absolute() -> Tensor + + Alias for :func:`abs` + """ + ... + def absolute_(self) -> Tensor: + r""" + absolute_() -> Tensor + + In-place version of :meth:`~Tensor.absolute` + Alias for :func:`abs_` + """ + ... + def acos(self) -> Tensor: + r""" + acos() -> Tensor + + See :func:`torch.acos` + """ + ... + def acos_(self) -> Tensor: + r""" + acos_() -> Tensor + + In-place version of :meth:`~Tensor.acos` + """ + ... + def acosh(self) -> Tensor: + r""" + acosh() -> Tensor + + See :func:`torch.acosh` + """ + ... + def acosh_(self) -> Tensor: + r""" + acosh_() -> Tensor + + In-place version of :meth:`~Tensor.acosh` + """ + ... + def add(self, other: Union[Tensor, Number, _complex, torch.SymInt, torch.SymFloat], *, alpha: Optional[Union[Number, _complex]] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + add(other, *, alpha=1) -> Tensor + + Add a scalar or tensor to :attr:`self` tensor. If both :attr:`alpha` + and :attr:`other` are specified, each element of :attr:`other` is scaled by + :attr:`alpha` before being used. + + When :attr:`other` is a tensor, the shape of :attr:`other` must be + :ref:`broadcastable ` with the shape of the underlying + tensor + + See :func:`torch.add` + """ + ... + def add_(self, other: Union[Tensor, Number, _complex, torch.SymInt, torch.SymFloat], *, alpha: Optional[Union[Number, _complex]] = 1) -> Tensor: + r""" + add_(other, *, alpha=1) -> Tensor + + In-place version of :meth:`~Tensor.add` + """ + ... + def addbmm(self, batch1: Tensor, batch2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1) -> Tensor: + r""" + addbmm(batch1, batch2, *, beta=1, alpha=1) -> Tensor + + See :func:`torch.addbmm` + """ + ... + def addbmm_(self, batch1: Tensor, batch2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1) -> Tensor: + r""" + addbmm_(batch1, batch2, *, beta=1, alpha=1) -> Tensor + + In-place version of :meth:`~Tensor.addbmm` + """ + ... + def addcdiv(self, tensor1: Tensor, tensor2: Tensor, *, value: Union[Number, _complex] = 1) -> Tensor: + r""" + addcdiv(tensor1, tensor2, *, value=1) -> Tensor + + See :func:`torch.addcdiv` + """ + ... 
+ def addcdiv_(self, tensor1: Tensor, tensor2: Tensor, *, value: Union[Number, _complex] = 1) -> Tensor: + r""" + addcdiv_(tensor1, tensor2, *, value=1) -> Tensor + + In-place version of :meth:`~Tensor.addcdiv` + """ + ... + def addcmul(self, tensor1: Tensor, tensor2: Tensor, *, value: Union[Number, _complex] = 1) -> Tensor: + r""" + addcmul(tensor1, tensor2, *, value=1) -> Tensor + + See :func:`torch.addcmul` + """ + ... + def addcmul_(self, tensor1: Tensor, tensor2: Tensor, *, value: Union[Number, _complex] = 1) -> Tensor: + r""" + addcmul_(tensor1, tensor2, *, value=1) -> Tensor + + In-place version of :meth:`~Tensor.addcmul` + """ + ... + def addmm(self, mat1: Tensor, mat2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1) -> Tensor: + r""" + addmm(mat1, mat2, *, beta=1, alpha=1) -> Tensor + + See :func:`torch.addmm` + """ + ... + def addmm_(self, mat1: Tensor, mat2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1) -> Tensor: + r""" + addmm_(mat1, mat2, *, beta=1, alpha=1) -> Tensor + + In-place version of :meth:`~Tensor.addmm` + """ + ... + def addmv(self, mat: Tensor, vec: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1) -> Tensor: + r""" + addmv(mat, vec, *, beta=1, alpha=1) -> Tensor + + See :func:`torch.addmv` + """ + ... + def addmv_(self, mat: Tensor, vec: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1) -> Tensor: + r""" + addmv_(mat, vec, *, beta=1, alpha=1) -> Tensor + + In-place version of :meth:`~Tensor.addmv` + """ + ... + def addr(self, vec1: Tensor, vec2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1) -> Tensor: + r""" + addr(vec1, vec2, *, beta=1, alpha=1) -> Tensor + + See :func:`torch.addr` + """ + ... + def addr_(self, vec1: Tensor, vec2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1) -> Tensor: + r""" + addr_(vec1, vec2, *, beta=1, alpha=1) -> Tensor + + In-place version of :meth:`~Tensor.addr` + """ + ... + def adjoint(self) -> Tensor: + r""" + adjoint() -> Tensor + + Alias for :func:`adjoint` + """ + ... + def align_as(self, other: Tensor) -> Tensor: + r""" + align_as(other) -> Tensor + + Permutes the dimensions of the :attr:`self` tensor to match the dimension order + in the :attr:`other` tensor, adding size-one dims for any new names. + + This operation is useful for explicit broadcasting by names (see examples). + + All of the dims of :attr:`self` must be named in order to use this method. + The resulting tensor is a view on the original tensor. + + All dimension names of :attr:`self` must be present in ``other.names``. + :attr:`other` may contain named dimensions that are not in ``self.names``; + the output tensor has a size-one dimension for each of those new names. + + To align a tensor to a specific order, use :meth:`~Tensor.align_to`. 
+ + Examples:: + + # Example 1: Applying a mask + >>> mask = torch.randint(2, [127, 128], dtype=torch.bool).refine_names('W', 'H') + >>> imgs = torch.randn(32, 128, 127, 3, names=('N', 'H', 'W', 'C')) + >>> imgs.masked_fill_(mask.align_as(imgs), 0) + + + # Example 2: Applying a per-channel-scale + >>> def scale_channels(input, scale): + >>> scale = scale.refine_names('C') + >>> return input * scale.align_as(input) + + >>> num_channels = 3 + >>> scale = torch.randn(num_channels, names=('C',)) + >>> imgs = torch.rand(32, 128, 128, num_channels, names=('N', 'H', 'W', 'C')) + >>> more_imgs = torch.rand(32, num_channels, 128, 128, names=('N', 'C', 'H', 'W')) + >>> videos = torch.randn(3, num_channels, 128, 128, 128, names=('N', 'C', 'H', 'W', 'D')) + + # scale_channels is agnostic to the dimension order of the input + >>> scale_channels(imgs, scale) + >>> scale_channels(more_imgs, scale) + >>> scale_channels(videos, scale) + + .. warning:: + The named tensor API is experimental and subject to change. + """ + ... + @overload + def align_to(self, order: Sequence[Union[str, ellipsis, None]], ellipsis_idx: _int) -> Tensor: ... + @overload + def align_to(self, names: Sequence[Union[str, ellipsis, None]]) -> Tensor: ... + @overload + def all(self) -> Tensor: + r""" + all(dim=None, keepdim=False) -> Tensor + + See :func:`torch.all` + """ + ... + @overload + def all(self, dim: Optional[_size] = None, keepdim: _bool = False) -> Tensor: + r""" + all(dim=None, keepdim=False) -> Tensor + + See :func:`torch.all` + """ + ... + @overload + def all(self, dim: _int, keepdim: _bool = False) -> Tensor: + r""" + all(dim=None, keepdim=False) -> Tensor + + See :func:`torch.all` + """ + ... + @overload + def all(self, dim: Union[str, ellipsis, None], keepdim: _bool = False) -> Tensor: + r""" + all(dim=None, keepdim=False) -> Tensor + + See :func:`torch.all` + """ + ... + def allclose(self, other: Tensor, rtol: _float = 1e-05, atol: _float = 1e-08, equal_nan: _bool = False) -> _bool: + r""" + allclose(other, rtol=1e-05, atol=1e-08, equal_nan=False) -> Tensor + + See :func:`torch.allclose` + """ + ... + def amax(self, dim: Union[_int, _size] = (), keepdim: _bool = False) -> Tensor: + r""" + amax(dim=None, keepdim=False) -> Tensor + + See :func:`torch.amax` + """ + ... + def amin(self, dim: Union[_int, _size] = (), keepdim: _bool = False) -> Tensor: + r""" + amin(dim=None, keepdim=False) -> Tensor + + See :func:`torch.amin` + """ + ... + def aminmax(self, *, dim: Optional[_int] = None, keepdim: _bool = False) -> torch.return_types.aminmax: + r""" + aminmax(*, dim=None, keepdim=False) -> (Tensor min, Tensor max) + + See :func:`torch.aminmax` + """ + ... + def angle(self) -> Tensor: + r""" + angle() -> Tensor + + See :func:`torch.angle` + """ + ... + @overload + def any(self) -> Tensor: + r""" + any(dim=None, keepdim=False) -> Tensor + + See :func:`torch.any` + """ + ... + @overload + def any(self, dim: Optional[_size] = None, keepdim: _bool = False) -> Tensor: + r""" + any(dim=None, keepdim=False) -> Tensor + + See :func:`torch.any` + """ + ... + @overload + def any(self, dim: _int, keepdim: _bool = False) -> Tensor: + r""" + any(dim=None, keepdim=False) -> Tensor + + See :func:`torch.any` + """ + ... + @overload + def any(self, dim: Union[str, ellipsis, None], keepdim: _bool = False) -> Tensor: + r""" + any(dim=None, keepdim=False) -> Tensor + + See :func:`torch.any` + """ + ... 
+ def apply_(self, callable: Callable) -> Tensor: + r""" + apply_(callable) -> Tensor + + Applies the function :attr:`callable` to each element in the tensor, replacing + each element with the value returned by :attr:`callable`. + + .. note:: + + This function only works with CPU tensors and should not be used in code + sections that require high performance. + """ + ... + def arccos(self) -> Tensor: + r""" + arccos() -> Tensor + + See :func:`torch.arccos` + """ + ... + def arccos_(self) -> Tensor: + r""" + arccos_() -> Tensor + + In-place version of :meth:`~Tensor.arccos` + """ + ... + def arccosh(self) -> Tensor: + r""" + acosh() -> Tensor + + See :func:`torch.arccosh` + """ + ... + def arccosh_(self) -> Tensor: + r""" + acosh_() -> Tensor + + In-place version of :meth:`~Tensor.arccosh` + """ + ... + def arcsin(self) -> Tensor: + r""" + arcsin() -> Tensor + + See :func:`torch.arcsin` + """ + ... + def arcsin_(self) -> Tensor: + r""" + arcsin_() -> Tensor + + In-place version of :meth:`~Tensor.arcsin` + """ + ... + def arcsinh(self) -> Tensor: + r""" + arcsinh() -> Tensor + + See :func:`torch.arcsinh` + """ + ... + def arcsinh_(self) -> Tensor: + r""" + arcsinh_() -> Tensor + + In-place version of :meth:`~Tensor.arcsinh` + """ + ... + def arctan(self) -> Tensor: + r""" + arctan() -> Tensor + + See :func:`torch.arctan` + """ + ... + def arctan2(self, other: Tensor) -> Tensor: + r""" + arctan2(other) -> Tensor + + See :func:`torch.arctan2` + """ + ... + def arctan2_(self, other: Tensor) -> Tensor: + r""" + atan2_(other) -> Tensor + + In-place version of :meth:`~Tensor.arctan2` + """ + ... + def arctan_(self) -> Tensor: + r""" + arctan_() -> Tensor + + In-place version of :meth:`~Tensor.arctan` + """ + ... + def arctanh(self) -> Tensor: + r""" + arctanh() -> Tensor + + See :func:`torch.arctanh` + """ + ... + def arctanh_(self) -> Tensor: + r""" + arctanh_(other) -> Tensor + + In-place version of :meth:`~Tensor.arctanh` + """ + ... + def argmax(self, dim: Optional[_int] = None, keepdim: _bool = False) -> Tensor: + r""" + argmax(dim=None, keepdim=False) -> LongTensor + + See :func:`torch.argmax` + """ + ... + def argmin(self, dim: Optional[_int] = None, keepdim: _bool = False) -> Tensor: + r""" + argmin(dim=None, keepdim=False) -> LongTensor + + See :func:`torch.argmin` + """ + ... + @overload + def argsort(self, *, stable: _bool, dim: _int = -1, descending: _bool = False) -> Tensor: + r""" + argsort(dim=-1, descending=False) -> LongTensor + + See :func:`torch.argsort` + """ + ... + @overload + def argsort(self, dim: _int = -1, descending: _bool = False) -> Tensor: + r""" + argsort(dim=-1, descending=False) -> LongTensor + + See :func:`torch.argsort` + """ + ... + @overload + def argsort(self, dim: Union[str, ellipsis, None], descending: _bool = False) -> Tensor: + r""" + argsort(dim=-1, descending=False) -> LongTensor + + See :func:`torch.argsort` + """ + ... + def argwhere(self) -> Tensor: + r""" + argwhere() -> Tensor + + See :func:`torch.argwhere` + """ + ... + def as_strided(self, size: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], storage_offset: Optional[Union[_int, SymInt]] = None) -> Tensor: + r""" + as_strided(size, stride, storage_offset=None) -> Tensor + + See :func:`torch.as_strided` + """ + ... 
+ def as_strided_(self, size: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], storage_offset: Optional[Union[_int, SymInt]] = None) -> Tensor: + r""" + as_strided_(size, stride, storage_offset=None) -> Tensor + + In-place version of :meth:`~Tensor.as_strided` + """ + ... + def as_strided_scatter(self, src: Tensor, size: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], storage_offset: Optional[Union[_int, SymInt]] = None) -> Tensor: + r""" + as_strided_scatter(src, size, stride, storage_offset=None) -> Tensor + + See :func:`torch.as_strided_scatter` + """ + ... + def as_subclass(self, cls: Type[S]) -> S: + r""" + as_subclass(cls) -> Tensor + + Makes a ``cls`` instance with the same data pointer as ``self``. Changes + in the output mirror changes in ``self``, and the output stays attached + to the autograd graph. ``cls`` must be a subclass of ``Tensor``. + """ + ... + def asin(self) -> Tensor: + r""" + asin() -> Tensor + + See :func:`torch.asin` + """ + ... + def asin_(self) -> Tensor: + r""" + asin_() -> Tensor + + In-place version of :meth:`~Tensor.asin` + """ + ... + def asinh(self) -> Tensor: + r""" + asinh() -> Tensor + + See :func:`torch.asinh` + """ + ... + def asinh_(self) -> Tensor: + r""" + asinh_() -> Tensor + + In-place version of :meth:`~Tensor.asinh` + """ + ... + def atan(self) -> Tensor: + r""" + atan() -> Tensor + + See :func:`torch.atan` + """ + ... + def atan2(self, other: Tensor) -> Tensor: + r""" + atan2(other) -> Tensor + + See :func:`torch.atan2` + """ + ... + def atan2_(self, other: Tensor) -> Tensor: + r""" + atan2_(other) -> Tensor + + In-place version of :meth:`~Tensor.atan2` + """ + ... + def atan_(self) -> Tensor: + r""" + atan_() -> Tensor + + In-place version of :meth:`~Tensor.atan` + """ + ... + def atanh(self) -> Tensor: + r""" + atanh() -> Tensor + + See :func:`torch.atanh` + """ + ... + def atanh_(self) -> Tensor: + r""" + atanh_(other) -> Tensor + + In-place version of :meth:`~Tensor.atanh` + """ + ... + def baddbmm(self, batch1: Tensor, batch2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1) -> Tensor: + r""" + baddbmm(batch1, batch2, *, beta=1, alpha=1) -> Tensor + + See :func:`torch.baddbmm` + """ + ... + def baddbmm_(self, batch1: Tensor, batch2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1) -> Tensor: + r""" + baddbmm_(batch1, batch2, *, beta=1, alpha=1) -> Tensor + + In-place version of :meth:`~Tensor.baddbmm` + """ + ... + @overload + def bernoulli(self, *, generator: Optional[Generator] = None) -> Tensor: + r""" + bernoulli(*, generator=None) -> Tensor + + Returns a result tensor where each :math:`\texttt{result[i]}` is independently + sampled from :math:`\text{Bernoulli}(\texttt{self[i]})`. :attr:`self` must have + floating point ``dtype``, and the result will have the same ``dtype``. + + See :func:`torch.bernoulli` + """ + ... + @overload + def bernoulli(self, p: _float, *, generator: Optional[Generator] = None) -> Tensor: + r""" + bernoulli(*, generator=None) -> Tensor + + Returns a result tensor where each :math:`\texttt{result[i]}` is independently + sampled from :math:`\text{Bernoulli}(\texttt{self[i]})`. :attr:`self` must have + floating point ``dtype``, and the result will have the same ``dtype``. + + See :func:`torch.bernoulli` + """ + ... 
+ @overload + def bernoulli_(self, p: Tensor, *, generator: Optional[Generator] = None) -> Tensor: + r""" + bernoulli_(p=0.5, *, generator=None) -> Tensor + + Fills each location of :attr:`self` with an independent sample from + :math:`\text{Bernoulli}(\texttt{p})`. :attr:`self` can have integral + ``dtype``. + + :attr:`p` should either be a scalar or tensor containing probabilities to be + used for drawing the binary random number. + + If it is a tensor, the :math:`\text{i}^{th}` element of :attr:`self` tensor + will be set to a value sampled from + :math:`\text{Bernoulli}(\texttt{p\_tensor[i]})`. In this case `p` must have + floating point ``dtype``. + + See also :meth:`~Tensor.bernoulli` and :func:`torch.bernoulli` + """ + ... + @overload + def bernoulli_(self, p: _float = 0.5, *, generator: Optional[Generator] = None) -> Tensor: + r""" + bernoulli_(p=0.5, *, generator=None) -> Tensor + + Fills each location of :attr:`self` with an independent sample from + :math:`\text{Bernoulli}(\texttt{p})`. :attr:`self` can have integral + ``dtype``. + + :attr:`p` should either be a scalar or tensor containing probabilities to be + used for drawing the binary random number. + + If it is a tensor, the :math:`\text{i}^{th}` element of :attr:`self` tensor + will be set to a value sampled from + :math:`\text{Bernoulli}(\texttt{p\_tensor[i]})`. In this case `p` must have + floating point ``dtype``. + + See also :meth:`~Tensor.bernoulli` and :func:`torch.bernoulli` + """ + ... + def bfloat16(self) -> Tensor: + r""" + bfloat16(memory_format=torch.preserve_format) -> Tensor + ``self.bfloat16()`` is equivalent to ``self.to(torch.bfloat16)``. See :func:`to`. + + Args: + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + """ + ... + def bincount(self, weights: Optional[Tensor] = None, minlength: _int = 0) -> Tensor: + r""" + bincount(weights=None, minlength=0) -> Tensor + + See :func:`torch.bincount` + """ + ... + @overload + def bitwise_and(self, other: Tensor) -> Tensor: + r""" + bitwise_and() -> Tensor + + See :func:`torch.bitwise_and` + """ + ... + @overload + def bitwise_and(self, other: Union[Number, _complex]) -> Tensor: + r""" + bitwise_and() -> Tensor + + See :func:`torch.bitwise_and` + """ + ... + @overload + def bitwise_and_(self, other: Tensor) -> Tensor: + r""" + bitwise_and_() -> Tensor + + In-place version of :meth:`~Tensor.bitwise_and` + """ + ... + @overload + def bitwise_and_(self, other: Union[Number, _complex]) -> Tensor: + r""" + bitwise_and_() -> Tensor + + In-place version of :meth:`~Tensor.bitwise_and` + """ + ... + @overload + def bitwise_left_shift(self, other: Tensor) -> Tensor: + r""" + bitwise_left_shift(other) -> Tensor + + See :func:`torch.bitwise_left_shift` + """ + ... + @overload + def bitwise_left_shift(self, other: Union[Number, _complex]) -> Tensor: + r""" + bitwise_left_shift(other) -> Tensor + + See :func:`torch.bitwise_left_shift` + """ + ... + @overload + def bitwise_left_shift_(self, other: Tensor) -> Tensor: + r""" + bitwise_left_shift_(other) -> Tensor + + In-place version of :meth:`~Tensor.bitwise_left_shift` + """ + ... + @overload + def bitwise_left_shift_(self, other: Union[Number, _complex]) -> Tensor: + r""" + bitwise_left_shift_(other) -> Tensor + + In-place version of :meth:`~Tensor.bitwise_left_shift` + """ + ... + def bitwise_not(self) -> Tensor: + r""" + bitwise_not() -> Tensor + + See :func:`torch.bitwise_not` + """ + ... 
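+    # Illustrative sketch for the elementwise bitwise methods above, using small
+    # integer literals so the bit patterns are easy to verify by hand.
+    #   >>> a = torch.tensor([0b1100])              # 12
+    #   >>> a.bitwise_and(torch.tensor([0b1010]))   # 12 & 10
+    #   tensor([8])
+    #   >>> a.bitwise_left_shift(torch.tensor([1])) # 12 << 1
+    #   tensor([24])
+    #   >>> a.bitwise_not()                         # ~12
+    #   tensor([-13])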
+ def bitwise_not_(self) -> Tensor: + r""" + bitwise_not_() -> Tensor + + In-place version of :meth:`~Tensor.bitwise_not` + """ + ... + @overload + def bitwise_or(self, other: Tensor) -> Tensor: + r""" + bitwise_or() -> Tensor + + See :func:`torch.bitwise_or` + """ + ... + @overload + def bitwise_or(self, other: Union[Number, _complex]) -> Tensor: + r""" + bitwise_or() -> Tensor + + See :func:`torch.bitwise_or` + """ + ... + @overload + def bitwise_or_(self, other: Tensor) -> Tensor: + r""" + bitwise_or_() -> Tensor + + In-place version of :meth:`~Tensor.bitwise_or` + """ + ... + @overload + def bitwise_or_(self, other: Union[Number, _complex]) -> Tensor: + r""" + bitwise_or_() -> Tensor + + In-place version of :meth:`~Tensor.bitwise_or` + """ + ... + @overload + def bitwise_right_shift(self, other: Tensor) -> Tensor: + r""" + bitwise_right_shift(other) -> Tensor + + See :func:`torch.bitwise_right_shift` + """ + ... + @overload + def bitwise_right_shift(self, other: Union[Number, _complex]) -> Tensor: + r""" + bitwise_right_shift(other) -> Tensor + + See :func:`torch.bitwise_right_shift` + """ + ... + @overload + def bitwise_right_shift_(self, other: Tensor) -> Tensor: + r""" + bitwise_right_shift_(other) -> Tensor + + In-place version of :meth:`~Tensor.bitwise_right_shift` + """ + ... + @overload + def bitwise_right_shift_(self, other: Union[Number, _complex]) -> Tensor: + r""" + bitwise_right_shift_(other) -> Tensor + + In-place version of :meth:`~Tensor.bitwise_right_shift` + """ + ... + @overload + def bitwise_xor(self, other: Tensor) -> Tensor: + r""" + bitwise_xor() -> Tensor + + See :func:`torch.bitwise_xor` + """ + ... + @overload + def bitwise_xor(self, other: Union[Number, _complex]) -> Tensor: + r""" + bitwise_xor() -> Tensor + + See :func:`torch.bitwise_xor` + """ + ... + @overload + def bitwise_xor_(self, other: Tensor) -> Tensor: + r""" + bitwise_xor_() -> Tensor + + In-place version of :meth:`~Tensor.bitwise_xor` + """ + ... + @overload + def bitwise_xor_(self, other: Union[Number, _complex]) -> Tensor: + r""" + bitwise_xor_() -> Tensor + + In-place version of :meth:`~Tensor.bitwise_xor` + """ + ... + def bmm(self, mat2: Tensor) -> Tensor: + r""" + bmm(batch2) -> Tensor + + See :func:`torch.bmm` + """ + ... + def bool(self) -> Tensor: + r""" + bool(memory_format=torch.preserve_format) -> Tensor + + ``self.bool()`` is equivalent to ``self.to(torch.bool)``. See :func:`to`. + + Args: + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + """ + ... + @overload + def broadcast_to(self, size: Sequence[Union[_int, SymInt]]) -> Tensor: + r""" + broadcast_to(shape) -> Tensor + + See :func:`torch.broadcast_to`. + """ + ... + @overload + def broadcast_to(self, *size: _int) -> Tensor: + r""" + broadcast_to(shape) -> Tensor + + See :func:`torch.broadcast_to`. + """ + ... + def byte(self) -> Tensor: + r""" + byte(memory_format=torch.preserve_format) -> Tensor + + ``self.byte()`` is equivalent to ``self.to(torch.uint8)``. See :func:`to`. + + Args: + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + """ + ... + def cauchy_(self, median: _float = 0, sigma: _float = 1, *, generator: Optional[Generator] = None) -> Tensor: + r""" + cauchy_(median=0, sigma=1, *, generator=None) -> Tensor + + Fills the tensor with numbers drawn from the Cauchy distribution: + + .. 
math:: + + f(x) = \dfrac{1}{\pi} \dfrac{\sigma}{(x - \text{median})^2 + \sigma^2} + + .. note:: + Sigma (:math:`\sigma`) is used to denote the scale parameter in Cauchy distribution. + """ + ... + def ccol_indices(self) -> Tensor: ... + def ceil(self) -> Tensor: + r""" + ceil() -> Tensor + + See :func:`torch.ceil` + """ + ... + def ceil_(self) -> Tensor: + r""" + ceil_() -> Tensor + + In-place version of :meth:`~Tensor.ceil` + """ + ... + def chalf(self, *, memory_format: Optional[memory_format] = None) -> Tensor: + r""" + chalf(memory_format=torch.preserve_format) -> Tensor + + ``self.chalf()`` is equivalent to ``self.to(torch.complex32)``. See :func:`to`. + + Args: + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + """ + ... + def char(self) -> Tensor: + r""" + char(memory_format=torch.preserve_format) -> Tensor + + ``self.char()`` is equivalent to ``self.to(torch.int8)``. See :func:`to`. + + Args: + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + """ + ... + def cholesky(self, upper: _bool = False) -> Tensor: + r""" + cholesky(upper=False) -> Tensor + + See :func:`torch.cholesky` + """ + ... + def cholesky_inverse(self, upper: _bool = False) -> Tensor: + r""" + cholesky_inverse(upper=False) -> Tensor + + See :func:`torch.cholesky_inverse` + """ + ... + def cholesky_solve(self, input2: Tensor, upper: _bool = False) -> Tensor: + r""" + cholesky_solve(input2, upper=False) -> Tensor + + See :func:`torch.cholesky_solve` + """ + ... + def chunk(self, chunks: _int, dim: _int = 0) -> Tuple[Tensor, ...]: + r""" + chunk(chunks, dim=0) -> List of Tensors + + See :func:`torch.chunk` + """ + ... + @overload + def clamp(self, min: Optional[Tensor] = None, max: Optional[Tensor] = None) -> Tensor: + r""" + clamp(min=None, max=None) -> Tensor + + See :func:`torch.clamp` + """ + ... + @overload + def clamp(self, min: Optional[Union[Number, _complex]] = None, max: Optional[Union[Number, _complex]] = None) -> Tensor: + r""" + clamp(min=None, max=None) -> Tensor + + See :func:`torch.clamp` + """ + ... + @overload + def clamp_(self, min: Optional[Tensor] = None, max: Optional[Tensor] = None) -> Tensor: + r""" + clamp_(min=None, max=None) -> Tensor + + In-place version of :meth:`~Tensor.clamp` + """ + ... + @overload + def clamp_(self, min: Optional[Union[Number, _complex]] = None, max: Optional[Union[Number, _complex]] = None) -> Tensor: + r""" + clamp_(min=None, max=None) -> Tensor + + In-place version of :meth:`~Tensor.clamp` + """ + ... + @overload + def clamp_max(self, max: Tensor) -> Tensor: ... + @overload + def clamp_max(self, max: Union[Number, _complex]) -> Tensor: ... + @overload + def clamp_max_(self, max: Tensor) -> Tensor: ... + @overload + def clamp_max_(self, max: Union[Number, _complex]) -> Tensor: ... + @overload + def clamp_min(self, min: Tensor) -> Tensor: ... + @overload + def clamp_min(self, min: Union[Number, _complex]) -> Tensor: ... + @overload + def clamp_min_(self, min: Tensor) -> Tensor: ... + @overload + def clamp_min_(self, min: Union[Number, _complex]) -> Tensor: ... + @overload + def clip(self, min: Optional[Tensor] = None, max: Optional[Tensor] = None) -> Tensor: + r""" + clip(min=None, max=None) -> Tensor + + Alias for :meth:`~Tensor.clamp`. + """ + ... 
+ @overload + def clip(self, min: Optional[Union[Number, _complex]] = None, max: Optional[Union[Number, _complex]] = None) -> Tensor: + r""" + clip(min=None, max=None) -> Tensor + + Alias for :meth:`~Tensor.clamp`. + """ + ... + @overload + def clip_(self, min: Optional[Tensor] = None, max: Optional[Tensor] = None) -> Tensor: + r""" + clip_(min=None, max=None) -> Tensor + + Alias for :meth:`~Tensor.clamp_`. + """ + ... + @overload + def clip_(self, min: Optional[Union[Number, _complex]] = None, max: Optional[Union[Number, _complex]] = None) -> Tensor: + r""" + clip_(min=None, max=None) -> Tensor + + Alias for :meth:`~Tensor.clamp_`. + """ + ... + def clone(self, *, memory_format: Optional[memory_format] = None) -> Tensor: + r""" + clone(*, memory_format=torch.preserve_format) -> Tensor + + See :func:`torch.clone` + """ + ... + def coalesce(self) -> Tensor: + r""" + coalesce() -> Tensor + + Returns a coalesced copy of :attr:`self` if :attr:`self` is an + :ref:`uncoalesced tensor `. + + Returns :attr:`self` if :attr:`self` is a coalesced tensor. + + .. warning:: + Throws an error if :attr:`self` is not a sparse COO tensor. + """ + ... + def col_indices(self) -> Tensor: + r""" + col_indices() -> IntTensor + + Returns the tensor containing the column indices of the :attr:`self` + tensor when :attr:`self` is a sparse CSR tensor of layout ``sparse_csr``. + The ``col_indices`` tensor is strictly of shape (:attr:`self`.nnz()) + and of type ``int32`` or ``int64``. When using MKL routines such as sparse + matrix multiplication, it is necessary to use ``int32`` indexing in order + to avoid downcasting and potentially losing information. + + Example:: + >>> csr = torch.eye(5,5).to_sparse_csr() + >>> csr.col_indices() + tensor([0, 1, 2, 3, 4], dtype=torch.int32) + """ + ... + def conj(self) -> Tensor: + r""" + conj() -> Tensor + + See :func:`torch.conj` + """ + ... + def conj_physical(self) -> Tensor: + r""" + conj_physical() -> Tensor + + See :func:`torch.conj_physical` + """ + ... + def conj_physical_(self) -> Tensor: + r""" + conj_physical_() -> Tensor + + In-place version of :meth:`~Tensor.conj_physical` + """ + ... + def contiguous(self, memory_format=torch.contiguous_format) -> Tensor: + r""" + contiguous(memory_format=torch.contiguous_format) -> Tensor + + Returns a contiguous in memory tensor containing the same data as :attr:`self` tensor. If + :attr:`self` tensor is already in the specified memory format, this function returns the + :attr:`self` tensor. + + Args: + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.contiguous_format``. + """ + ... + def copy_(self, src: Tensor, non_blocking: _bool = False) -> Tensor: + r""" + copy_(src, non_blocking=False) -> Tensor + + Copies the elements from :attr:`src` into :attr:`self` tensor and returns + :attr:`self`. + + The :attr:`src` tensor must be :ref:`broadcastable ` + with the :attr:`self` tensor. It may be of a different data type or reside on a + different device. + + Args: + src (Tensor): the source tensor to copy from + non_blocking (bool): if ``True`` and this copy is between CPU and GPU, + the copy may occur asynchronously with respect to the host. For other + cases, this argument has no effect. + """ + ... + @overload + def copysign(self, other: Tensor) -> Tensor: + r""" + copysign(other) -> Tensor + + See :func:`torch.copysign` + """ + ... 
+ @overload + def copysign(self, other: Union[Number, _complex]) -> Tensor: + r""" + copysign(other) -> Tensor + + See :func:`torch.copysign` + """ + ... + @overload + def copysign_(self, other: Tensor) -> Tensor: + r""" + copysign_(other) -> Tensor + + In-place version of :meth:`~Tensor.copysign` + """ + ... + @overload + def copysign_(self, other: Union[Number, _complex]) -> Tensor: + r""" + copysign_(other) -> Tensor + + In-place version of :meth:`~Tensor.copysign` + """ + ... + def corrcoef(self) -> Tensor: + r""" + corrcoef() -> Tensor + + See :func:`torch.corrcoef` + """ + ... + def cos(self) -> Tensor: + r""" + cos() -> Tensor + + See :func:`torch.cos` + """ + ... + def cos_(self) -> Tensor: + r""" + cos_() -> Tensor + + In-place version of :meth:`~Tensor.cos` + """ + ... + def cosh(self) -> Tensor: + r""" + cosh() -> Tensor + + See :func:`torch.cosh` + """ + ... + def cosh_(self) -> Tensor: + r""" + cosh_() -> Tensor + + In-place version of :meth:`~Tensor.cosh` + """ + ... + @overload + def count_nonzero(self, dim: Optional[_int] = None) -> Tensor: + r""" + count_nonzero(dim=None) -> Tensor + + See :func:`torch.count_nonzero` + """ + ... + @overload + def count_nonzero(self, dim: _size) -> Tensor: + r""" + count_nonzero(dim=None) -> Tensor + + See :func:`torch.count_nonzero` + """ + ... + @overload + def count_nonzero(self, *dim: _int) -> Tensor: + r""" + count_nonzero(dim=None) -> Tensor + + See :func:`torch.count_nonzero` + """ + ... + def cov(self, *, correction: _int = 1, fweights: Optional[Tensor] = None, aweights: Optional[Tensor] = None) -> Tensor: + r""" + cov(*, correction=1, fweights=None, aweights=None) -> Tensor + + See :func:`torch.cov` + """ + ... + def cpu(self, memory_format: torch.memory_format = torch.preserve_format) -> Tensor: + r""" + cpu(memory_format=torch.preserve_format) -> Tensor + + Returns a copy of this object in CPU memory. + + If this object is already in CPU memory and on the correct device, + then no copy is performed and the original object is returned. + + Args: + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + """ + ... + def cross(self, other: Tensor, dim: Optional[_int] = None) -> Tensor: + r""" + cross(other, dim=None) -> Tensor + + See :func:`torch.cross` + """ + ... + def crow_indices(self) -> Tensor: + r""" + crow_indices() -> IntTensor + + Returns the tensor containing the compressed row indices of the :attr:`self` + tensor when :attr:`self` is a sparse CSR tensor of layout ``sparse_csr``. + The ``crow_indices`` tensor is strictly of shape (:attr:`self`.size(0) + 1) + and of type ``int32`` or ``int64``. When using MKL routines such as sparse + matrix multiplication, it is necessary to use ``int32`` indexing in order + to avoid downcasting and potentially losing information. + + Example:: + >>> csr = torch.eye(5,5).to_sparse_csr() + >>> csr.crow_indices() + tensor([0, 1, 2, 3, 4, 5], dtype=torch.int32) + """ + ... + def cuda(self, device: Optional[Union[_device, _int, str]] = None, non_blocking: _bool = False, memory_format: torch.memory_format = torch.preserve_format) -> Tensor: + r""" + cuda(device=None, non_blocking=False, memory_format=torch.preserve_format) -> Tensor + + Returns a copy of this object in CUDA memory. + + If this object is already in CUDA memory and on the correct device, + then no copy is performed and the original object is returned. + + Args: + device (:class:`torch.device`): The destination GPU device. 
+ Defaults to the current CUDA device. + non_blocking (bool): If ``True`` and the source is in pinned memory, + the copy will be asynchronous with respect to the host. + Otherwise, the argument has no effect. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + """ + ... + @overload + def cummax(self, dim: _int) -> torch.return_types.cummax: + r""" + cummax(dim) -> (Tensor, Tensor) + + See :func:`torch.cummax` + """ + ... + @overload + def cummax(self, dim: Union[str, ellipsis, None]) -> torch.return_types.cummax: + r""" + cummax(dim) -> (Tensor, Tensor) + + See :func:`torch.cummax` + """ + ... + @overload + def cummin(self, dim: _int) -> torch.return_types.cummin: + r""" + cummin(dim) -> (Tensor, Tensor) + + See :func:`torch.cummin` + """ + ... + @overload + def cummin(self, dim: Union[str, ellipsis, None]) -> torch.return_types.cummin: + r""" + cummin(dim) -> (Tensor, Tensor) + + See :func:`torch.cummin` + """ + ... + @overload + def cumprod(self, dim: _int, *, dtype: Optional[_dtype] = None) -> Tensor: + r""" + cumprod(dim, dtype=None) -> Tensor + + See :func:`torch.cumprod` + """ + ... + @overload + def cumprod(self, dim: Union[str, ellipsis, None], *, dtype: Optional[_dtype] = None) -> Tensor: + r""" + cumprod(dim, dtype=None) -> Tensor + + See :func:`torch.cumprod` + """ + ... + @overload + def cumprod_(self, dim: _int, *, dtype: Optional[_dtype] = None) -> Tensor: + r""" + cumprod_(dim, dtype=None) -> Tensor + + In-place version of :meth:`~Tensor.cumprod` + """ + ... + @overload + def cumprod_(self, dim: Union[str, ellipsis, None], *, dtype: Optional[_dtype] = None) -> Tensor: + r""" + cumprod_(dim, dtype=None) -> Tensor + + In-place version of :meth:`~Tensor.cumprod` + """ + ... + @overload + def cumsum(self, dim: _int, *, dtype: Optional[_dtype] = None) -> Tensor: + r""" + cumsum(dim, dtype=None) -> Tensor + + See :func:`torch.cumsum` + """ + ... + @overload + def cumsum(self, dim: Union[str, ellipsis, None], *, dtype: Optional[_dtype] = None) -> Tensor: + r""" + cumsum(dim, dtype=None) -> Tensor + + See :func:`torch.cumsum` + """ + ... + @overload + def cumsum_(self, dim: _int, *, dtype: Optional[_dtype] = None) -> Tensor: + r""" + cumsum_(dim, dtype=None) -> Tensor + + In-place version of :meth:`~Tensor.cumsum` + """ + ... + @overload + def cumsum_(self, dim: Union[str, ellipsis, None], *, dtype: Optional[_dtype] = None) -> Tensor: + r""" + cumsum_(dim, dtype=None) -> Tensor + + In-place version of :meth:`~Tensor.cumsum` + """ + ... + def data_ptr(self) -> _int: + r""" + data_ptr() -> int + + Returns the address of the first element of :attr:`self` tensor. + """ + ... + def deg2rad(self) -> Tensor: + r""" + deg2rad() -> Tensor + + See :func:`torch.deg2rad` + """ + ... + def deg2rad_(self) -> Tensor: + r""" + deg2rad_() -> Tensor + + In-place version of :meth:`~Tensor.deg2rad` + """ + ... + def dense_dim(self) -> _int: + r""" + dense_dim() -> int + + Return the number of dense dimensions in a :ref:`sparse tensor ` :attr:`self`. + + .. note:: + Returns ``len(self.shape)`` if :attr:`self` is not a sparse tensor. + + See also :meth:`Tensor.sparse_dim` and :ref:`hybrid tensors `. + """ + ... + def dequantize(self) -> Tensor: + r""" + dequantize() -> Tensor + + Given a quantized Tensor, dequantize it and return the dequantized float Tensor. + """ + ... + def det(self) -> Tensor: + r""" + det() -> Tensor + + See :func:`torch.det` + """ + ... + def detach(self) -> Tensor: ... 
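+    # Illustrative sketch for clamp() and the cumulative reductions documented
+    # above (outputs assume default CPU float tensors).
+    #   >>> torch.tensor([-2.0, 0.5, 3.0]).clamp(min=0.0, max=1.0)
+    #   tensor([0.0000, 0.5000, 1.0000])
+    #   >>> torch.tensor([1., 2., 3.]).cumsum(dim=0)
+    #   tensor([1., 3., 6.])
+    #   >>> torch.tensor([1., 2., 3.]).cumprod(dim=0)
+    #   tensor([1., 2., 6.])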
+ def detach_(self) -> Tensor: ... + def diag(self, diagonal: _int = 0) -> Tensor: + r""" + diag(diagonal=0) -> Tensor + + See :func:`torch.diag` + """ + ... + def diag_embed(self, offset: _int = 0, dim1: _int = -2, dim2: _int = -1) -> Tensor: + r""" + diag_embed(offset=0, dim1=-2, dim2=-1) -> Tensor + + See :func:`torch.diag_embed` + """ + ... + def diagflat(self, offset: _int = 0) -> Tensor: + r""" + diagflat(offset=0) -> Tensor + + See :func:`torch.diagflat` + """ + ... + @overload + def diagonal(self, *, outdim: Union[str, ellipsis, None], dim1: Union[str, ellipsis, None], dim2: Union[str, ellipsis, None], offset: _int = 0) -> Tensor: + r""" + diagonal(offset=0, dim1=0, dim2=1) -> Tensor + + See :func:`torch.diagonal` + """ + ... + @overload + def diagonal(self, offset: _int = 0, dim1: _int = 0, dim2: _int = 1) -> Tensor: + r""" + diagonal(offset=0, dim1=0, dim2=1) -> Tensor + + See :func:`torch.diagonal` + """ + ... + def diagonal_scatter(self, src: Tensor, offset: _int = 0, dim1: _int = 0, dim2: _int = 1) -> Tensor: + r""" + diagonal_scatter(src, offset=0, dim1=0, dim2=1) -> Tensor + + See :func:`torch.diagonal_scatter` + """ + ... + def diff(self, n: _int = 1, dim: _int = -1, prepend: Optional[Tensor] = None, append: Optional[Tensor] = None) -> Tensor: + r""" + diff(n=1, dim=-1, prepend=None, append=None) -> Tensor + + See :func:`torch.diff` + """ + ... + def digamma(self) -> Tensor: + r""" + digamma() -> Tensor + + See :func:`torch.digamma` + """ + ... + def digamma_(self) -> Tensor: + r""" + digamma_() -> Tensor + + In-place version of :meth:`~Tensor.digamma` + """ + ... + def dim(self) -> _int: + r""" + dim() -> int + + Returns the number of dimensions of :attr:`self` tensor. + """ + ... + def dist(self, other: Tensor, p: Union[Number, _complex] = 2) -> Tensor: + r""" + dist(other, p=2) -> Tensor + + See :func:`torch.dist` + """ + ... + def div(self, other: Union[Tensor, Number], *, rounding_mode: Optional[str] = None) -> Tensor: + r""" + div(value, *, rounding_mode=None) -> Tensor + + See :func:`torch.div` + """ + ... + def div_(self, other: Union[Tensor, Number], *, rounding_mode: Optional[str] = None) -> Tensor: + r""" + div_(value, *, rounding_mode=None) -> Tensor + + In-place version of :meth:`~Tensor.div` + """ + ... + @overload + def divide(self, other: Tensor) -> Tensor: + r""" + divide(value, *, rounding_mode=None) -> Tensor + + See :func:`torch.divide` + """ + ... + @overload + def divide(self, other: Tensor, *, rounding_mode: Optional[str]) -> Tensor: + r""" + divide(value, *, rounding_mode=None) -> Tensor + + See :func:`torch.divide` + """ + ... + @overload + def divide(self, other: Union[Number, _complex], *, rounding_mode: Optional[str]) -> Tensor: + r""" + divide(value, *, rounding_mode=None) -> Tensor + + See :func:`torch.divide` + """ + ... + @overload + def divide(self, other: Union[Number, _complex]) -> Tensor: + r""" + divide(value, *, rounding_mode=None) -> Tensor + + See :func:`torch.divide` + """ + ... + @overload + def divide_(self, other: Tensor) -> Tensor: + r""" + divide_(value, *, rounding_mode=None) -> Tensor + + In-place version of :meth:`~Tensor.divide` + """ + ... + @overload + def divide_(self, other: Tensor, *, rounding_mode: Optional[str]) -> Tensor: + r""" + divide_(value, *, rounding_mode=None) -> Tensor + + In-place version of :meth:`~Tensor.divide` + """ + ... 
+ @overload + def divide_(self, other: Union[Number, _complex], *, rounding_mode: Optional[str]) -> Tensor: + r""" + divide_(value, *, rounding_mode=None) -> Tensor + + In-place version of :meth:`~Tensor.divide` + """ + ... + @overload + def divide_(self, other: Union[Number, _complex]) -> Tensor: + r""" + divide_(value, *, rounding_mode=None) -> Tensor + + In-place version of :meth:`~Tensor.divide` + """ + ... + def dot(self, tensor: Tensor) -> Tensor: + r""" + dot(other) -> Tensor + + See :func:`torch.dot` + """ + ... + def double(self) -> Tensor: + r""" + double(memory_format=torch.preserve_format) -> Tensor + + ``self.double()`` is equivalent to ``self.to(torch.float64)``. See :func:`to`. + + Args: + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + """ + ... + @overload + def dsplit(self, sections: _int) -> Tuple[Tensor, ...]: + r""" + dsplit(split_size_or_sections) -> List of Tensors + + See :func:`torch.dsplit` + """ + ... + @overload + def dsplit(self, indices: _size) -> Tuple[Tensor, ...]: + r""" + dsplit(split_size_or_sections) -> List of Tensors + + See :func:`torch.dsplit` + """ + ... + @overload + def dsplit(self, *indices: _int) -> Tuple[Tensor, ...]: + r""" + dsplit(split_size_or_sections) -> List of Tensors + + See :func:`torch.dsplit` + """ + ... + def element_size(self) -> _int: + r""" + element_size() -> int + + Returns the size in bytes of an individual element. + + Example:: + + >>> torch.tensor([]).element_size() + 4 + >>> torch.tensor([], dtype=torch.uint8).element_size() + 1 + """ + ... + @overload + def eq(self, other: Tensor) -> Tensor: + r""" + eq(other) -> Tensor + + See :func:`torch.eq` + """ + ... + @overload + def eq(self, other: Union[Number, _complex]) -> Tensor: + r""" + eq(other) -> Tensor + + See :func:`torch.eq` + """ + ... + @overload + def eq_(self, other: Tensor) -> Tensor: + r""" + eq_(other) -> Tensor + + In-place version of :meth:`~Tensor.eq` + """ + ... + @overload + def eq_(self, other: Union[Number, _complex]) -> Tensor: + r""" + eq_(other) -> Tensor + + In-place version of :meth:`~Tensor.eq` + """ + ... + def equal(self, other: Tensor) -> _bool: + r""" + equal(other) -> bool + + See :func:`torch.equal` + """ + ... + def erf(self) -> Tensor: + r""" + erf() -> Tensor + + See :func:`torch.erf` + """ + ... + def erf_(self) -> Tensor: + r""" + erf_() -> Tensor + + In-place version of :meth:`~Tensor.erf` + """ + ... + def erfc(self) -> Tensor: + r""" + erfc() -> Tensor + + See :func:`torch.erfc` + """ + ... + def erfc_(self) -> Tensor: + r""" + erfc_() -> Tensor + + In-place version of :meth:`~Tensor.erfc` + """ + ... + def erfinv(self) -> Tensor: + r""" + erfinv() -> Tensor + + See :func:`torch.erfinv` + """ + ... + def erfinv_(self) -> Tensor: + r""" + erfinv_() -> Tensor + + In-place version of :meth:`~Tensor.erfinv` + """ + ... + def exp(self) -> Tensor: + r""" + exp() -> Tensor + + See :func:`torch.exp` + """ + ... + def exp2(self) -> Tensor: + r""" + exp2() -> Tensor + + See :func:`torch.exp2` + """ + ... + def exp2_(self) -> Tensor: + r""" + exp2_() -> Tensor + + In-place version of :meth:`~Tensor.exp2` + """ + ... + def exp_(self) -> Tensor: + r""" + exp_() -> Tensor + + In-place version of :meth:`~Tensor.exp` + """ + ... 
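+    # Illustrative sketch for the rounding_mode keyword of div()/divide() above:
+    # None keeps true division, 'trunc' rounds toward zero, 'floor' toward -inf.
+    #   >>> t = torch.tensor([-7.0, 7.0])
+    #   >>> t.div(2)
+    #   tensor([-3.5000,  3.5000])
+    #   >>> t.div(2, rounding_mode='trunc')
+    #   tensor([-3.,  3.])
+    #   >>> t.div(2, rounding_mode='floor')
+    #   tensor([-4.,  3.])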
+ @overload + def expand(self, size: Sequence[Union[_int, SymInt]], *, implicit: _bool = False) -> Tensor: + r""" + expand(*sizes) -> Tensor + + Returns a new view of the :attr:`self` tensor with singleton dimensions expanded + to a larger size. + + Passing -1 as the size for a dimension means not changing the size of + that dimension. + + Tensor can be also expanded to a larger number of dimensions, and the + new ones will be appended at the front. For the new dimensions, the + size cannot be set to -1. + + Expanding a tensor does not allocate new memory, but only creates a + new view on the existing tensor where a dimension of size one is + expanded to a larger size by setting the ``stride`` to 0. Any dimension + of size 1 can be expanded to an arbitrary value without allocating new + memory. + + Args: + *sizes (torch.Size or int...): the desired expanded size + + .. warning:: + + More than one element of an expanded tensor may refer to a single + memory location. As a result, in-place operations (especially ones that + are vectorized) may result in incorrect behavior. If you need to write + to the tensors, please clone them first. + + Example:: + + >>> x = torch.tensor([[1], [2], [3]]) + >>> x.size() + torch.Size([3, 1]) + >>> x.expand(3, 4) + tensor([[ 1, 1, 1, 1], + [ 2, 2, 2, 2], + [ 3, 3, 3, 3]]) + >>> x.expand(-1, 4) # -1 means not changing the size of that dimension + tensor([[ 1, 1, 1, 1], + [ 2, 2, 2, 2], + [ 3, 3, 3, 3]]) + """ + ... + @overload + def expand(self, *size: _int, implicit: _bool = False) -> Tensor: + r""" + expand(*sizes) -> Tensor + + Returns a new view of the :attr:`self` tensor with singleton dimensions expanded + to a larger size. + + Passing -1 as the size for a dimension means not changing the size of + that dimension. + + Tensor can be also expanded to a larger number of dimensions, and the + new ones will be appended at the front. For the new dimensions, the + size cannot be set to -1. + + Expanding a tensor does not allocate new memory, but only creates a + new view on the existing tensor where a dimension of size one is + expanded to a larger size by setting the ``stride`` to 0. Any dimension + of size 1 can be expanded to an arbitrary value without allocating new + memory. + + Args: + *sizes (torch.Size or int...): the desired expanded size + + .. warning:: + + More than one element of an expanded tensor may refer to a single + memory location. As a result, in-place operations (especially ones that + are vectorized) may result in incorrect behavior. If you need to write + to the tensors, please clone them first. + + Example:: + + >>> x = torch.tensor([[1], [2], [3]]) + >>> x.size() + torch.Size([3, 1]) + >>> x.expand(3, 4) + tensor([[ 1, 1, 1, 1], + [ 2, 2, 2, 2], + [ 3, 3, 3, 3]]) + >>> x.expand(-1, 4) # -1 means not changing the size of that dimension + tensor([[ 1, 1, 1, 1], + [ 2, 2, 2, 2], + [ 3, 3, 3, 3]]) + """ + ... + def expand_as(self, other: Tensor) -> Tensor: + r""" + expand_as(other) -> Tensor + + Expand this tensor to the same size as :attr:`other`. + ``self.expand_as(other)`` is equivalent to ``self.expand(other.size())``. + + Please see :meth:`~Tensor.expand` for more information about ``expand``. + + Args: + other (:class:`torch.Tensor`): The result tensor has the same size + as :attr:`other`. + """ + ... + def expm1(self) -> Tensor: + r""" + expm1() -> Tensor + + See :func:`torch.expm1` + """ + ... + def expm1_(self) -> Tensor: + r""" + expm1_() -> Tensor + + In-place version of :meth:`~Tensor.expm1` + """ + ... 
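+    # Small illustrative addition to the expand() docstring above: the expanded
+    # dimension gets stride 0, so no new memory is allocated for the view.
+    #   >>> x = torch.tensor([[1], [2], [3]])
+    #   >>> x.expand(3, 4).stride()
+    #   (1, 0)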
+ def exponential_(self, lambd: _float = 1, *, generator: Optional[Generator] = None) -> Tensor: + r""" + exponential_(lambd=1, *, generator=None) -> Tensor + + Fills :attr:`self` tensor with elements drawn from the PDF (probability density function): + + .. math:: + + f(x) = \lambda e^{-\lambda x}, x > 0 + + .. note:: + In probability theory, exponential distribution is supported on interval [0, :math:`\inf`) (i.e., :math:`x >= 0`) + implying that zero can be sampled from the exponential distribution. + However, :func:`torch.Tensor.exponential_` does not sample zero, + which means that its actual support is the interval (0, :math:`\inf`). + + Note that :func:`torch.distributions.exponential.Exponential` is supported on the interval [0, :math:`\inf`) and can sample zero. + """ + ... + @overload + def fill_(self, value: Tensor) -> Tensor: + r""" + fill_(value) -> Tensor + + Fills :attr:`self` tensor with the specified value. + """ + ... + @overload + def fill_(self, value: Union[Number, _complex]) -> Tensor: + r""" + fill_(value) -> Tensor + + Fills :attr:`self` tensor with the specified value. + """ + ... + def fill_diagonal_(self, fill_value: Union[Number, _complex], wrap: _bool = False) -> Tensor: + r""" + fill_diagonal_(fill_value, wrap=False) -> Tensor + + Fill the main diagonal of a tensor that has at least 2-dimensions. + When dims>2, all dimensions of input must be of equal length. + This function modifies the input tensor in-place, and returns the input tensor. + + Arguments: + fill_value (Scalar): the fill value + wrap (bool): the diagonal 'wrapped' after N columns for tall matrices. + + Example:: + + >>> a = torch.zeros(3, 3) + >>> a.fill_diagonal_(5) + tensor([[5., 0., 0.], + [0., 5., 0.], + [0., 0., 5.]]) + >>> b = torch.zeros(7, 3) + >>> b.fill_diagonal_(5) + tensor([[5., 0., 0.], + [0., 5., 0.], + [0., 0., 5.], + [0., 0., 0.], + [0., 0., 0.], + [0., 0., 0.], + [0., 0., 0.]]) + >>> c = torch.zeros(7, 3) + >>> c.fill_diagonal_(5, wrap=True) + tensor([[5., 0., 0.], + [0., 5., 0.], + [0., 0., 5.], + [0., 0., 0.], + [5., 0., 0.], + [0., 5., 0.], + [0., 0., 5.]]) + """ + ... + def fix(self) -> Tensor: + r""" + fix() -> Tensor + + See :func:`torch.fix`. + """ + ... + def fix_(self) -> Tensor: + r""" + fix_() -> Tensor + + In-place version of :meth:`~Tensor.fix` + """ + ... + @overload + def flatten(self, start_dim: _int = 0, end_dim: _int = -1) -> Tensor: + r""" + flatten(start_dim=0, end_dim=-1) -> Tensor + + See :func:`torch.flatten` + """ + ... + @overload + def flatten(self, start_dim: _int, end_dim: _int, out_dim: Union[str, ellipsis, None]) -> Tensor: + r""" + flatten(start_dim=0, end_dim=-1) -> Tensor + + See :func:`torch.flatten` + """ + ... + @overload + def flatten(self, start_dim: Union[str, ellipsis, None], end_dim: Union[str, ellipsis, None], out_dim: Union[str, ellipsis, None]) -> Tensor: + r""" + flatten(start_dim=0, end_dim=-1) -> Tensor + + See :func:`torch.flatten` + """ + ... + @overload + def flatten(self, dims: Sequence[Union[str, ellipsis, None]], out_dim: Union[str, ellipsis, None]) -> Tensor: + r""" + flatten(start_dim=0, end_dim=-1) -> Tensor + + See :func:`torch.flatten` + """ + ... + @overload + def flip(self, dims: _size) -> Tensor: + r""" + flip(dims) -> Tensor + + See :func:`torch.flip` + """ + ... + @overload + def flip(self, *dims: _int) -> Tensor: + r""" + flip(dims) -> Tensor + + See :func:`torch.flip` + """ + ... + def fliplr(self) -> Tensor: + r""" + fliplr() -> Tensor + + See :func:`torch.fliplr` + """ + ... 
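+    # Illustrative sketch for flatten() above; start_dim limits the collapse to
+    # the trailing dimensions.
+    #   >>> t = torch.arange(8).reshape(2, 2, 2)
+    #   >>> t.flatten().shape
+    #   torch.Size([8])
+    #   >>> t.flatten(start_dim=1).shape
+    #   torch.Size([2, 4])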
+ def flipud(self) -> Tensor: + r""" + flipud() -> Tensor + + See :func:`torch.flipud` + """ + ... + def float(self) -> Tensor: + r""" + float(memory_format=torch.preserve_format) -> Tensor + + ``self.float()`` is equivalent to ``self.to(torch.float32)``. See :func:`to`. + + Args: + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + """ + ... + @overload + def float_power(self, exponent: Tensor) -> Tensor: + r""" + float_power(exponent) -> Tensor + + See :func:`torch.float_power` + """ + ... + @overload + def float_power(self, exponent: Union[Number, _complex]) -> Tensor: + r""" + float_power(exponent) -> Tensor + + See :func:`torch.float_power` + """ + ... + @overload + def float_power_(self, exponent: Tensor) -> Tensor: + r""" + float_power_(exponent) -> Tensor + + In-place version of :meth:`~Tensor.float_power` + """ + ... + @overload + def float_power_(self, exponent: Union[Number, _complex]) -> Tensor: + r""" + float_power_(exponent) -> Tensor + + In-place version of :meth:`~Tensor.float_power` + """ + ... + def floor(self) -> Tensor: + r""" + floor() -> Tensor + + See :func:`torch.floor` + """ + ... + def floor_(self) -> Tensor: + r""" + floor_() -> Tensor + + In-place version of :meth:`~Tensor.floor` + """ + ... + def floor_divide(self, other: Union[Tensor, Number, torch.SymInt, torch.SymFloat], *, out: Optional[Tensor] = None) -> Tensor: + r""" + floor_divide(value) -> Tensor + + See :func:`torch.floor_divide` + """ + ... + def floor_divide_(self, other: Union[Tensor, Number, torch.SymInt, torch.SymFloat]) -> Tensor: + r""" + floor_divide_(value) -> Tensor + + In-place version of :meth:`~Tensor.floor_divide` + """ + ... + def fmax(self, other: Tensor) -> Tensor: + r""" + fmax(other) -> Tensor + + See :func:`torch.fmax` + """ + ... + def fmin(self, other: Tensor) -> Tensor: + r""" + fmin(other) -> Tensor + + See :func:`torch.fmin` + """ + ... + @overload + def fmod(self, other: Tensor) -> Tensor: + r""" + fmod(divisor) -> Tensor + + See :func:`torch.fmod` + """ + ... + @overload + def fmod(self, other: Union[Number, _complex]) -> Tensor: + r""" + fmod(divisor) -> Tensor + + See :func:`torch.fmod` + """ + ... + @overload + def fmod_(self, other: Tensor) -> Tensor: + r""" + fmod_(divisor) -> Tensor + + In-place version of :meth:`~Tensor.fmod` + """ + ... + @overload + def fmod_(self, other: Union[Number, _complex]) -> Tensor: + r""" + fmod_(divisor) -> Tensor + + In-place version of :meth:`~Tensor.fmod` + """ + ... + def frac(self) -> Tensor: + r""" + frac() -> Tensor + + See :func:`torch.frac` + """ + ... + def frac_(self) -> Tensor: + r""" + frac_() -> Tensor + + In-place version of :meth:`~Tensor.frac` + """ + ... + def frexp(self) -> torch.return_types.frexp: + r""" + frexp(input) -> (Tensor mantissa, Tensor exponent) + + See :func:`torch.frexp` + """ + ... + @overload + def gather(self, dim: _int, index: Tensor, *, sparse_grad: _bool = False) -> Tensor: + r""" + gather(dim, index) -> Tensor + + See :func:`torch.gather` + """ + ... + @overload + def gather(self, dim: Union[str, ellipsis, None], index: Tensor, *, sparse_grad: _bool = False) -> Tensor: + r""" + gather(dim, index) -> Tensor + + See :func:`torch.gather` + """ + ... + def gcd(self, other: Tensor) -> Tensor: + r""" + gcd(other) -> Tensor + + See :func:`torch.gcd` + """ + ... + def gcd_(self, other: Tensor) -> Tensor: + r""" + gcd_(other) -> Tensor + + In-place version of :meth:`~Tensor.gcd` + """ + ... 
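+    # Illustrative sketch for gather() above: along dim=1,
+    # out[i][j] = self[i][index[i][j]], and index must be an integer tensor.
+    #   >>> src = torch.tensor([[1, 2], [3, 4]])
+    #   >>> src.gather(1, torch.tensor([[0, 0], [1, 0]]))
+    #   tensor([[1, 1],
+    #           [4, 3]])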
+ @overload + def ge(self, other: Tensor) -> Tensor: + r""" + ge(other) -> Tensor + + See :func:`torch.ge`. + """ + ... + @overload + def ge(self, other: Union[Number, _complex]) -> Tensor: + r""" + ge(other) -> Tensor + + See :func:`torch.ge`. + """ + ... + @overload + def ge_(self, other: Tensor) -> Tensor: + r""" + ge_(other) -> Tensor + + In-place version of :meth:`~Tensor.ge`. + """ + ... + @overload + def ge_(self, other: Union[Number, _complex]) -> Tensor: + r""" + ge_(other) -> Tensor + + In-place version of :meth:`~Tensor.ge`. + """ + ... + def geometric_(self, p: _float, *, generator: Optional[Generator] = None) -> Tensor: + r""" + geometric_(p, *, generator=None) -> Tensor + + Fills :attr:`self` tensor with elements drawn from the geometric distribution: + + .. math:: + + P(X=k) = (1 - p)^{k - 1} p, k = 1, 2, ... + + .. note:: + :func:`torch.Tensor.geometric_` `k`-th trial is the first success hence draws samples in :math:`\{1, 2, \ldots\}`, whereas + :func:`torch.distributions.geometric.Geometric` :math:`(k+1)`-th trial is the first success + hence draws samples in :math:`\{0, 1, \ldots\}`. + """ + ... + def geqrf(self) -> torch.return_types.geqrf: + r""" + geqrf() -> (Tensor, Tensor) + + See :func:`torch.geqrf` + """ + ... + def ger(self, vec2: Tensor) -> Tensor: + r""" + ger(vec2) -> Tensor + + See :func:`torch.ger` + """ + ... + def get_device(self) -> _int: + r""" + get_device() -> Device ordinal (Integer) + + For CUDA tensors, this function returns the device ordinal of the GPU on which the tensor resides. + For CPU tensors, this function returns `-1`. + + Example:: + + >>> x = torch.randn(3, 4, 5, device='cuda:0') + >>> x.get_device() + 0 + >>> x.cpu().get_device() + -1 + """ + ... + @overload + def greater(self, other: Tensor) -> Tensor: + r""" + greater(other) -> Tensor + + See :func:`torch.greater`. + """ + ... + @overload + def greater(self, other: Union[Number, _complex]) -> Tensor: + r""" + greater(other) -> Tensor + + See :func:`torch.greater`. + """ + ... + @overload + def greater_(self, other: Tensor) -> Tensor: + r""" + greater_(other) -> Tensor + + In-place version of :meth:`~Tensor.greater`. + """ + ... + @overload + def greater_(self, other: Union[Number, _complex]) -> Tensor: + r""" + greater_(other) -> Tensor + + In-place version of :meth:`~Tensor.greater`. + """ + ... + @overload + def greater_equal(self, other: Tensor) -> Tensor: + r""" + greater_equal(other) -> Tensor + + See :func:`torch.greater_equal`. + """ + ... + @overload + def greater_equal(self, other: Union[Number, _complex]) -> Tensor: + r""" + greater_equal(other) -> Tensor + + See :func:`torch.greater_equal`. + """ + ... + @overload + def greater_equal_(self, other: Tensor) -> Tensor: + r""" + greater_equal_(other) -> Tensor + + In-place version of :meth:`~Tensor.greater_equal`. + """ + ... + @overload + def greater_equal_(self, other: Union[Number, _complex]) -> Tensor: + r""" + greater_equal_(other) -> Tensor + + In-place version of :meth:`~Tensor.greater_equal`. + """ + ... + @overload + def gt(self, other: Tensor) -> Tensor: + r""" + gt(other) -> Tensor + + See :func:`torch.gt`. + """ + ... + @overload + def gt(self, other: Union[Number, _complex]) -> Tensor: + r""" + gt(other) -> Tensor + + See :func:`torch.gt`. + """ + ... + @overload + def gt_(self, other: Tensor) -> Tensor: + r""" + gt_(other) -> Tensor + + In-place version of :meth:`~Tensor.gt`. + """ + ... 
+ @overload + def gt_(self, other: Union[Number, _complex]) -> Tensor: + r""" + gt_(other) -> Tensor + + In-place version of :meth:`~Tensor.gt`. + """ + ... + def half(self) -> Tensor: + r""" + half(memory_format=torch.preserve_format) -> Tensor + + ``self.half()`` is equivalent to ``self.to(torch.float16)``. See :func:`to`. + + Args: + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + """ + ... + def hardshrink(self, lambd: Union[Number, _complex] = 0.5) -> Tensor: + r""" + hardshrink(lambd=0.5) -> Tensor + + See :func:`torch.nn.functional.hardshrink` + """ + ... + def has_names(self) -> _bool: + r""" + Is ``True`` if any of this tensor's dimensions are named. Otherwise, is ``False``. + """ + ... + def heaviside(self, values: Tensor) -> Tensor: + r""" + heaviside(values) -> Tensor + + See :func:`torch.heaviside` + """ + ... + def heaviside_(self, values: Tensor) -> Tensor: + r""" + heaviside_(values) -> Tensor + + In-place version of :meth:`~Tensor.heaviside` + """ + ... + def histc(self, bins: _int = 100, min: Union[Number, _complex] = 0, max: Union[Number, _complex] = 0) -> Tensor: + r""" + histc(bins=100, min=0, max=0) -> Tensor + + See :func:`torch.histc` + """ + ... + @overload + def histogram(self, bins: Tensor, *, weight: Optional[Tensor] = None, density: _bool = False) -> torch.return_types.histogram: + r""" + histogram(input, bins, *, range=None, weight=None, density=False) -> (Tensor, Tensor) + + See :func:`torch.histogram` + """ + ... + @overload + def histogram(self, bins: _int = 100, *, range: Optional[Sequence[_float]] = None, weight: Optional[Tensor] = None, density: _bool = False) -> torch.return_types.histogram: + r""" + histogram(input, bins, *, range=None, weight=None, density=False) -> (Tensor, Tensor) + + See :func:`torch.histogram` + """ + ... + @overload + def hsplit(self, sections: _int) -> Tuple[Tensor, ...]: + r""" + hsplit(split_size_or_sections) -> List of Tensors + + See :func:`torch.hsplit` + """ + ... + @overload + def hsplit(self, indices: _size) -> Tuple[Tensor, ...]: + r""" + hsplit(split_size_or_sections) -> List of Tensors + + See :func:`torch.hsplit` + """ + ... + @overload + def hsplit(self, *indices: _int) -> Tuple[Tensor, ...]: + r""" + hsplit(split_size_or_sections) -> List of Tensors + + See :func:`torch.hsplit` + """ + ... + def hypot(self, other: Tensor) -> Tensor: + r""" + hypot(other) -> Tensor + + See :func:`torch.hypot` + """ + ... + def hypot_(self, other: Tensor) -> Tensor: + r""" + hypot_(other) -> Tensor + + In-place version of :meth:`~Tensor.hypot` + """ + ... + def i0(self) -> Tensor: + r""" + i0() -> Tensor + + See :func:`torch.i0` + """ + ... + def i0_(self) -> Tensor: + r""" + i0_() -> Tensor + + In-place version of :meth:`~Tensor.i0` + """ + ... + def igamma(self, other: Tensor) -> Tensor: + r""" + igamma(other) -> Tensor + + See :func:`torch.igamma` + """ + ... + def igamma_(self, other: Tensor) -> Tensor: + r""" + igamma_(other) -> Tensor + + In-place version of :meth:`~Tensor.igamma` + """ + ... + def igammac(self, other: Tensor) -> Tensor: + r""" + igammac(other) -> Tensor + See :func:`torch.igammac` + """ + ... + def igammac_(self, other: Tensor) -> Tensor: + r""" + igammac_(other) -> Tensor + In-place version of :meth:`~Tensor.igammac` + """ + ... 
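+    # Illustrative sketch for histc() above: with min=0, max=3 and bins=4 the bin
+    # width is 0.75, so both 1.0 values fall into the second bin.
+    #   >>> torch.tensor([1., 2., 1.]).histc(bins=4, min=0, max=3)
+    #   tensor([0., 2., 1., 0.])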
+ @overload + def index_add(self, dim: _int, index: Tensor, source: Tensor, *, alpha: Union[Number, _complex] = 1) -> Tensor: + r""" + index_add(dim, index, source, *, alpha=1) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.index_add_`. + """ + ... + @overload + def index_add(self, dim: Union[str, ellipsis, None], index: Tensor, source: Tensor, *, alpha: Union[Number, _complex] = 1) -> Tensor: + r""" + index_add(dim, index, source, *, alpha=1) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.index_add_`. + """ + ... + def index_add_(self, dim: _int, index: Tensor, source: Tensor, *, alpha: Union[Number, _complex] = 1) -> Tensor: + r""" + index_add_(dim, index, source, *, alpha=1) -> Tensor + + Accumulate the elements of :attr:`alpha` times ``source`` into the :attr:`self` + tensor by adding to the indices in the order given in :attr:`index`. For example, + if ``dim == 0``, ``index[i] == j``, and ``alpha=-1``, then the ``i``\ th row of + ``source`` is subtracted from the ``j``\ th row of :attr:`self`. + + The :attr:`dim`\ th dimension of ``source`` must have the same size as the + length of :attr:`index` (which must be a vector), and all other dimensions must + match :attr:`self`, or an error will be raised. + + For a 3-D tensor the output is given as:: + + self[index[i], :, :] += alpha * src[i, :, :] # if dim == 0 + self[:, index[i], :] += alpha * src[:, i, :] # if dim == 1 + self[:, :, index[i]] += alpha * src[:, :, i] # if dim == 2 + + Note: + This operation may behave nondeterministically when given tensors on a CUDA device. See :doc:`/notes/randomness` for more information. + + Args: + dim (int): dimension along which to index + index (Tensor): indices of ``source`` to select from, + should have dtype either `torch.int64` or `torch.int32` + source (Tensor): the tensor containing values to add + + Keyword args: + alpha (Number): the scalar multiplier for ``source`` + + Example:: + + >>> x = torch.ones(5, 3) + >>> t = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float) + >>> index = torch.tensor([0, 4, 2]) + >>> x.index_add_(0, index, t) + tensor([[ 2., 3., 4.], + [ 1., 1., 1.], + [ 8., 9., 10.], + [ 1., 1., 1.], + [ 5., 6., 7.]]) + >>> x.index_add_(0, index, t, alpha=-1) + tensor([[ 1., 1., 1.], + [ 1., 1., 1.], + [ 1., 1., 1.], + [ 1., 1., 1.], + [ 1., 1., 1.]]) + """ + ... + @overload + def index_copy(self, dim: _int, index: Tensor, source: Tensor) -> Tensor: + r""" + index_copy(dim, index, tensor2) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.index_copy_`. + """ + ... + @overload + def index_copy(self, dim: Union[str, ellipsis, None], index: Tensor, source: Tensor) -> Tensor: + r""" + index_copy(dim, index, tensor2) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.index_copy_`. + """ + ... + @overload + def index_copy_(self, dim: _int, index: Tensor, source: Tensor) -> Tensor: + r""" + index_copy_(dim, index, tensor) -> Tensor + + Copies the elements of :attr:`tensor` into the :attr:`self` tensor by selecting + the indices in the order given in :attr:`index`. For example, if ``dim == 0`` + and ``index[i] == j``, then the ``i``\ th row of :attr:`tensor` is copied to the + ``j``\ th row of :attr:`self`. + + The :attr:`dim`\ th dimension of :attr:`tensor` must have the same size as the + length of :attr:`index` (which must be a vector), and all other dimensions must + match :attr:`self`, or an error will be raised. + + .. 
note:: + If :attr:`index` contains duplicate entries, multiple elements from + :attr:`tensor` will be copied to the same index of :attr:`self`. The result + is nondeterministic since it depends on which copy occurs last. + + Args: + dim (int): dimension along which to index + index (LongTensor): indices of :attr:`tensor` to select from + tensor (Tensor): the tensor containing values to copy + + Example:: + + >>> x = torch.zeros(5, 3) + >>> t = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float) + >>> index = torch.tensor([0, 4, 2]) + >>> x.index_copy_(0, index, t) + tensor([[ 1., 2., 3.], + [ 0., 0., 0.], + [ 7., 8., 9.], + [ 0., 0., 0.], + [ 4., 5., 6.]]) + """ + ... + @overload + def index_copy_(self, dim: Union[str, ellipsis, None], index: Tensor, source: Tensor) -> Tensor: + r""" + index_copy_(dim, index, tensor) -> Tensor + + Copies the elements of :attr:`tensor` into the :attr:`self` tensor by selecting + the indices in the order given in :attr:`index`. For example, if ``dim == 0`` + and ``index[i] == j``, then the ``i``\ th row of :attr:`tensor` is copied to the + ``j``\ th row of :attr:`self`. + + The :attr:`dim`\ th dimension of :attr:`tensor` must have the same size as the + length of :attr:`index` (which must be a vector), and all other dimensions must + match :attr:`self`, or an error will be raised. + + .. note:: + If :attr:`index` contains duplicate entries, multiple elements from + :attr:`tensor` will be copied to the same index of :attr:`self`. The result + is nondeterministic since it depends on which copy occurs last. + + Args: + dim (int): dimension along which to index + index (LongTensor): indices of :attr:`tensor` to select from + tensor (Tensor): the tensor containing values to copy + + Example:: + + >>> x = torch.zeros(5, 3) + >>> t = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float) + >>> index = torch.tensor([0, 4, 2]) + >>> x.index_copy_(0, index, t) + tensor([[ 1., 2., 3.], + [ 0., 0., 0.], + [ 7., 8., 9.], + [ 0., 0., 0.], + [ 4., 5., 6.]]) + """ + ... + @overload + def index_fill(self, dim: _int, index: Tensor, value: Tensor) -> Tensor: + r""" + index_fill(dim, index, value) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.index_fill_`. + """ + ... + @overload + def index_fill(self, dim: Union[str, ellipsis, None], index: Tensor, value: Tensor) -> Tensor: + r""" + index_fill(dim, index, value) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.index_fill_`. + """ + ... + @overload + def index_fill(self, dim: _int, index: Tensor, value: Union[Number, _complex]) -> Tensor: + r""" + index_fill(dim, index, value) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.index_fill_`. + """ + ... + @overload + def index_fill(self, dim: Union[str, ellipsis, None], index: Tensor, value: Union[Number, _complex]) -> Tensor: + r""" + index_fill(dim, index, value) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.index_fill_`. + """ + ... + @overload + def index_fill_(self, dim: _int, index: Tensor, value: Tensor) -> Tensor: + r""" + index_fill_(dim, index, value) -> Tensor + + Fills the elements of the :attr:`self` tensor with value :attr:`value` by + selecting the indices in the order given in :attr:`index`. 
+ + Args: + dim (int): dimension along which to index + index (LongTensor): indices of :attr:`self` tensor to fill in + value (float): the value to fill with + + Example:: + >>> x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float) + >>> index = torch.tensor([0, 2]) + >>> x.index_fill_(1, index, -1) + tensor([[-1., 2., -1.], + [-1., 5., -1.], + [-1., 8., -1.]]) + """ + ... + @overload + def index_fill_(self, dim: Union[str, ellipsis, None], index: Tensor, value: Tensor) -> Tensor: + r""" + index_fill_(dim, index, value) -> Tensor + + Fills the elements of the :attr:`self` tensor with value :attr:`value` by + selecting the indices in the order given in :attr:`index`. + + Args: + dim (int): dimension along which to index + index (LongTensor): indices of :attr:`self` tensor to fill in + value (float): the value to fill with + + Example:: + >>> x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float) + >>> index = torch.tensor([0, 2]) + >>> x.index_fill_(1, index, -1) + tensor([[-1., 2., -1.], + [-1., 5., -1.], + [-1., 8., -1.]]) + """ + ... + @overload + def index_fill_(self, dim: _int, index: Tensor, value: Union[Number, _complex]) -> Tensor: + r""" + index_fill_(dim, index, value) -> Tensor + + Fills the elements of the :attr:`self` tensor with value :attr:`value` by + selecting the indices in the order given in :attr:`index`. + + Args: + dim (int): dimension along which to index + index (LongTensor): indices of :attr:`self` tensor to fill in + value (float): the value to fill with + + Example:: + >>> x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float) + >>> index = torch.tensor([0, 2]) + >>> x.index_fill_(1, index, -1) + tensor([[-1., 2., -1.], + [-1., 5., -1.], + [-1., 8., -1.]]) + """ + ... + @overload + def index_fill_(self, dim: Union[str, ellipsis, None], index: Tensor, value: Union[Number, _complex]) -> Tensor: + r""" + index_fill_(dim, index, value) -> Tensor + + Fills the elements of the :attr:`self` tensor with value :attr:`value` by + selecting the indices in the order given in :attr:`index`. + + Args: + dim (int): dimension along which to index + index (LongTensor): indices of :attr:`self` tensor to fill in + value (float): the value to fill with + + Example:: + >>> x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float) + >>> index = torch.tensor([0, 2]) + >>> x.index_fill_(1, index, -1) + tensor([[-1., 2., -1.], + [-1., 5., -1.], + [-1., 8., -1.]]) + """ + ... + def index_put(self, indices: Optional[Union[Tuple[Tensor, ...], List[Tensor]]], values: Tensor, accumulate: _bool = False) -> Tensor: + r""" + index_put(indices, values, accumulate=False) -> Tensor + + Out-place version of :meth:`~Tensor.index_put_`. + """ + ... + def index_put_(self, indices: Optional[Union[Tuple[Tensor, ...], List[Tensor]]], values: Tensor, accumulate: _bool = False) -> Tensor: + r""" + index_put_(indices, values, accumulate=False) -> Tensor + + Puts values from the tensor :attr:`values` into the tensor :attr:`self` using + the indices specified in :attr:`indices` (which is a tuple of Tensors). The + expression ``tensor.index_put_(indices, values)`` is equivalent to + ``tensor[indices] = values``. Returns :attr:`self`. + + If :attr:`accumulate` is ``True``, the elements in :attr:`values` are added to + :attr:`self`. If accumulate is ``False``, the behavior is undefined if indices + contain duplicate elements. + + Args: + indices (tuple of LongTensor): tensors used to index into `self`. + values (Tensor): tensor of same dtype as `self`. 
+ accumulate (bool): whether to accumulate into self + """ + ... + def index_reduce(self, dim: _int, index: Tensor, source: Tensor, reduce: str, *, include_self: _bool = True) -> Tensor: ... + def index_reduce_(self, dim: _int, index: Tensor, source: Tensor, reduce: str, *, include_self: _bool = True) -> Tensor: + r""" + index_reduce_(dim, index, source, reduce, *, include_self=True) -> Tensor + + Accumulate the elements of ``source`` into the :attr:`self` + tensor by accumulating to the indices in the order given in :attr:`index` + using the reduction given by the ``reduce`` argument. For example, if ``dim == 0``, + ``index[i] == j``, ``reduce == prod`` and ``include_self == True`` then the ``i``\ th + row of ``source`` is multiplied by the ``j``\ th row of :attr:`self`. If + :obj:`include_self="True"`, the values in the :attr:`self` tensor are included + in the reduction, otherwise, rows in the :attr:`self` tensor that are accumulated + to are treated as if they were filled with the reduction identites. + + The :attr:`dim`\ th dimension of ``source`` must have the same size as the + length of :attr:`index` (which must be a vector), and all other dimensions must + match :attr:`self`, or an error will be raised. + + For a 3-D tensor with :obj:`reduce="prod"` and :obj:`include_self=True` the + output is given as:: + + self[index[i], :, :] *= src[i, :, :] # if dim == 0 + self[:, index[i], :] *= src[:, i, :] # if dim == 1 + self[:, :, index[i]] *= src[:, :, i] # if dim == 2 + + Note: + This operation may behave nondeterministically when given tensors on a CUDA device. See :doc:`/notes/randomness` for more information. + + .. note:: + + This function only supports floating point tensors. + + .. warning:: + + This function is in beta and may change in the near future. + + Args: + dim (int): dimension along which to index + index (Tensor): indices of ``source`` to select from, + should have dtype either `torch.int64` or `torch.int32` + source (FloatTensor): the tensor containing values to accumulate + reduce (str): the reduction operation to apply + (:obj:`"prod"`, :obj:`"mean"`, :obj:`"amax"`, :obj:`"amin"`) + + Keyword args: + include_self (bool): whether the elements from the ``self`` tensor are + included in the reduction + + Example:: + + >>> x = torch.empty(5, 3).fill_(2) + >>> t = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]], dtype=torch.float) + >>> index = torch.tensor([0, 4, 2, 0]) + >>> x.index_reduce_(0, index, t, 'prod') + tensor([[20., 44., 72.], + [ 2., 2., 2.], + [14., 16., 18.], + [ 2., 2., 2.], + [ 8., 10., 12.]]) + >>> x = torch.empty(5, 3).fill_(2) + >>> x.index_reduce_(0, index, t, 'prod', include_self=False) + tensor([[10., 22., 36.], + [ 2., 2., 2.], + [ 7., 8., 9.], + [ 2., 2., 2.], + [ 4., 5., 6.]]) + """ + ... + @overload + def index_select(self, dim: _int, index: Tensor) -> Tensor: + r""" + index_select(dim, index) -> Tensor + + See :func:`torch.index_select` + """ + ... + @overload + def index_select(self, dim: Union[str, ellipsis, None], index: Tensor) -> Tensor: + r""" + index_select(dim, index) -> Tensor + + See :func:`torch.index_select` + """ + ... + def indices(self) -> Tensor: + r""" + indices() -> Tensor + + Return the indices tensor of a :ref:`sparse COO tensor `. + + .. warning:: + Throws an error if :attr:`self` is not a sparse COO tensor. + + See also :meth:`Tensor.values`. + + .. note:: + This method can only be called on a coalesced sparse tensor. See + :meth:`Tensor.coalesce` for details. + """ + ... 
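+    # Illustrative sketch for index_select() above: whole slices along dim
+    # (rows here) are picked in the order given by index.
+    #   >>> x = torch.arange(12).reshape(3, 4)
+    #   >>> x.index_select(0, torch.tensor([0, 2]))
+    #   tensor([[ 0,  1,  2,  3],
+    #           [ 8,  9, 10, 11]])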
+ def inner(self, other: Tensor) -> Tensor: + r""" + inner(other) -> Tensor + + See :func:`torch.inner`. + """ + ... + def int(self) -> Tensor: + r""" + int(memory_format=torch.preserve_format) -> Tensor + + ``self.int()`` is equivalent to ``self.to(torch.int32)``. See :func:`to`. + + Args: + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + """ + ... + def int_repr(self) -> Tensor: + r""" + int_repr() -> Tensor + + Given a quantized Tensor, + ``self.int_repr()`` returns a CPU Tensor with uint8_t as data type that stores the + underlying uint8_t values of the given Tensor. + """ + ... + def inverse(self) -> Tensor: + r""" + inverse() -> Tensor + + See :func:`torch.inverse` + """ + ... + def is_coalesced(self) -> _bool: + r""" + is_coalesced() -> bool + + Returns ``True`` if :attr:`self` is a :ref:`sparse COO tensor + ` that is coalesced, ``False`` otherwise. + + .. warning:: + Throws an error if :attr:`self` is not a sparse COO tensor. + + See :meth:`coalesce` and :ref:`uncoalesced tensors `. + """ + ... + def is_complex(self) -> _bool: + r""" + is_complex() -> bool + + Returns True if the data type of :attr:`self` is a complex data type. + """ + ... + def is_conj(self) -> _bool: + r""" + is_conj() -> bool + + Returns True if the conjugate bit of :attr:`self` is set to true. + """ + ... + def is_contiguous(self, memory_format=torch.contiguous_format) -> _bool: + r""" + is_contiguous(memory_format=torch.contiguous_format) -> bool + + Returns True if :attr:`self` tensor is contiguous in memory in the order specified + by memory format. + + Args: + memory_format (:class:`torch.memory_format`, optional): Specifies memory allocation + order. Default: ``torch.contiguous_format``. + """ + ... + is_cpu: _bool + r"""Is ``True`` if the Tensor is stored on the CPU, ``False`` otherwise.""" + is_cuda: _bool + r"""Is ``True`` if the Tensor is stored on the GPU, ``False`` otherwise.""" + def is_distributed(self) -> _bool: ... + def is_floating_point(self) -> _bool: + r""" + is_floating_point() -> bool + + Returns True if the data type of :attr:`self` is a floating point data type. + """ + ... + def is_inference(self) -> _bool: + r""" + is_inference() -> bool + + See :func:`torch.is_inference` + """ + ... + is_ipu: _bool + r"""Is ``True`` if the Tensor is stored on the IPU, ``False`` otherwise.""" + is_leaf: _bool + r"""All Tensors that have :attr:`requires_grad` which is ``False`` will be leaf Tensors by convention. + + For Tensors that have :attr:`requires_grad` which is ``True``, they will be leaf Tensors if they were + created by the user. This means that they are not the result of an operation and so + :attr:`grad_fn` is None. + + Only leaf Tensors will have their :attr:`grad` populated during a call to :func:`backward`. + To get :attr:`grad` populated for non-leaf Tensors, you can use :func:`retain_grad`. 
+ + Example:: + + >>> a = torch.rand(10, requires_grad=True) + >>> a.is_leaf + True + >>> b = torch.rand(10, requires_grad=True).cuda() + >>> b.is_leaf + False + # b was created by the operation that cast a cpu Tensor into a cuda Tensor + >>> c = torch.rand(10, requires_grad=True) + 2 + >>> c.is_leaf + False + # c was created by the addition operation + >>> d = torch.rand(10).cuda() + >>> d.is_leaf + True + # d does not require gradients and so has no operation creating it (that is tracked by the autograd engine) + >>> e = torch.rand(10).cuda().requires_grad_() + >>> e.is_leaf + True + # e requires gradients and has no operations creating it + >>> f = torch.rand(10, requires_grad=True, device="cuda") + >>> f.is_leaf + True + # f requires grad, has no operation creating it""" + is_meta: _bool + r"""Is ``True`` if the Tensor is a meta tensor, ``False`` otherwise. Meta tensors + are like normal tensors, but they carry no data.""" + is_mkldnn: _bool + is_mps: _bool + r"""Is ``True`` if the Tensor is stored on the MPS device, ``False`` otherwise.""" + is_mtia: _bool + def is_neg(self) -> _bool: + r""" + is_neg() -> bool + + Returns True if the negative bit of :attr:`self` is set to true. + """ + ... + is_nested: _bool + def is_nonzero(self) -> _bool: ... + is_ort: _bool + def is_pinned(self, device: Optional[Optional[DeviceLikeType]] = None) -> _bool: + r""" + Returns true if this tensor resides in pinned memory. + """ + ... + is_quantized: _bool + r"""Is ``True`` if the Tensor is quantized, ``False`` otherwise.""" + def is_same_size(self, other: Tensor) -> _bool: ... + def is_set_to(self, tensor: Tensor) -> _bool: + r""" + is_set_to(tensor) -> bool + + Returns True if both tensors are pointing to the exact same memory (same + storage, offset, size and stride). + """ + ... + def is_signed(self) -> _bool: + r""" + is_signed() -> bool + + Returns True if the data type of :attr:`self` is a signed data type. + """ + ... + is_sparse: _bool + r"""Is ``True`` if the Tensor uses sparse COO storage layout, ``False`` otherwise.""" + is_sparse_csr: _bool + r"""Is ``True`` if the Tensor uses sparse CSR storage layout, ``False`` otherwise.""" + is_vulkan: _bool + def isclose(self, other: Tensor, rtol: _float = 1e-05, atol: _float = 1e-08, equal_nan: _bool = False) -> Tensor: + r""" + isclose(other, rtol=1e-05, atol=1e-08, equal_nan=False) -> Tensor + + See :func:`torch.isclose` + """ + ... + def isfinite(self) -> Tensor: + r""" + isfinite() -> Tensor + + See :func:`torch.isfinite` + """ + ... + def isinf(self) -> Tensor: + r""" + isinf() -> Tensor + + See :func:`torch.isinf` + """ + ... + def isnan(self) -> Tensor: + r""" + isnan() -> Tensor + + See :func:`torch.isnan` + """ + ... + def isneginf(self) -> Tensor: + r""" + isneginf() -> Tensor + + See :func:`torch.isneginf` + """ + ... + def isposinf(self) -> Tensor: + r""" + isposinf() -> Tensor + + See :func:`torch.isposinf` + """ + ... + def isreal(self) -> Tensor: + r""" + isreal() -> Tensor + + See :func:`torch.isreal` + """ + ... + def istft(self, n_fft: _int, hop_length: Optional[_int] = None, win_length: Optional[_int] = None, window: Optional[Tensor] = None, center: _bool = True, normalized: _bool = False, onesided: Optional[_bool] = None, length: Optional[_int] = None, return_complex: _bool = False) -> Tensor: + r""" + istft(n_fft, hop_length=None, win_length=None, window=None, + center=True, normalized=False, onesided=True, length=None) -> Tensor + + See :func:`torch.istft` + """ + ... 
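+    # A minimal sketch of memory-format-aware contiguity checks (illustrative shapes only);
+    # a channels_last tensor is not contiguous in the default format but is contiguous
+    # when queried with memory_format=torch.channels_last:
+    #
+    #     >>> x = torch.randn(2, 3, 4, 4)
+    #     >>> x.is_contiguous()
+    #     True
+    #     >>> y = x.contiguous(memory_format=torch.channels_last)
+    #     >>> y.is_contiguous()
+    #     False
+    #     >>> y.is_contiguous(memory_format=torch.channels_last)
+    #     True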
+ def item(self) -> Number: + r""" + item() -> number + + Returns the value of this tensor as a standard Python number. This only works + for tensors with one element. For other cases, see :meth:`~Tensor.tolist`. + + This operation is not differentiable. + + Example:: + + >>> x = torch.tensor([1.0]) + >>> x.item() + 1.0 + """ + ... + def kron(self, other: Tensor) -> Tensor: + r""" + kron(other) -> Tensor + + See :func:`torch.kron` + """ + ... + @overload + def kthvalue(self, k: _int, dim: _int = -1, keepdim: _bool = False) -> torch.return_types.kthvalue: + r""" + kthvalue(k, dim=None, keepdim=False) -> (Tensor, LongTensor) + + See :func:`torch.kthvalue` + """ + ... + @overload + def kthvalue(self, k: _int, dim: Union[str, ellipsis, None], keepdim: _bool = False) -> torch.return_types.kthvalue: + r""" + kthvalue(k, dim=None, keepdim=False) -> (Tensor, LongTensor) + + See :func:`torch.kthvalue` + """ + ... + def lcm(self, other: Tensor) -> Tensor: + r""" + lcm(other) -> Tensor + + See :func:`torch.lcm` + """ + ... + def lcm_(self, other: Tensor) -> Tensor: + r""" + lcm_(other) -> Tensor + + In-place version of :meth:`~Tensor.lcm` + """ + ... + def ldexp(self, other: Tensor) -> Tensor: + r""" + ldexp(other) -> Tensor + + See :func:`torch.ldexp` + """ + ... + def ldexp_(self, other: Tensor) -> Tensor: + r""" + ldexp_(other) -> Tensor + + In-place version of :meth:`~Tensor.ldexp` + """ + ... + @overload + def le(self, other: Tensor) -> Tensor: + r""" + le(other) -> Tensor + + See :func:`torch.le`. + """ + ... + @overload + def le(self, other: Union[Number, _complex]) -> Tensor: + r""" + le(other) -> Tensor + + See :func:`torch.le`. + """ + ... + @overload + def le_(self, other: Tensor) -> Tensor: + r""" + le_(other) -> Tensor + + In-place version of :meth:`~Tensor.le`. + """ + ... + @overload + def le_(self, other: Union[Number, _complex]) -> Tensor: + r""" + le_(other) -> Tensor + + In-place version of :meth:`~Tensor.le`. + """ + ... + @overload + def lerp(self, end: Tensor, weight: Tensor) -> Tensor: + r""" + lerp(end, weight) -> Tensor + + See :func:`torch.lerp` + """ + ... + @overload + def lerp(self, end: Tensor, weight: Union[Number, _complex]) -> Tensor: + r""" + lerp(end, weight) -> Tensor + + See :func:`torch.lerp` + """ + ... + @overload + def lerp_(self, end: Tensor, weight: Tensor) -> Tensor: + r""" + lerp_(end, weight) -> Tensor + + In-place version of :meth:`~Tensor.lerp` + """ + ... + @overload + def lerp_(self, end: Tensor, weight: Union[Number, _complex]) -> Tensor: + r""" + lerp_(end, weight) -> Tensor + + In-place version of :meth:`~Tensor.lerp` + """ + ... + @overload + def less(self, other: Tensor) -> Tensor: + r""" + lt(other) -> Tensor + + See :func:`torch.less`. + """ + ... + @overload + def less(self, other: Union[Number, _complex]) -> Tensor: + r""" + lt(other) -> Tensor + + See :func:`torch.less`. + """ + ... + @overload + def less_(self, other: Tensor) -> Tensor: + r""" + less_(other) -> Tensor + + In-place version of :meth:`~Tensor.less`. + """ + ... + @overload + def less_(self, other: Union[Number, _complex]) -> Tensor: + r""" + less_(other) -> Tensor + + In-place version of :meth:`~Tensor.less`. + """ + ... + @overload + def less_equal(self, other: Tensor) -> Tensor: + r""" + less_equal(other) -> Tensor + + See :func:`torch.less_equal`. + """ + ... + @overload + def less_equal(self, other: Union[Number, _complex]) -> Tensor: + r""" + less_equal(other) -> Tensor + + See :func:`torch.less_equal`. + """ + ... 
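+    # A minimal sketch of lerp (illustrative values): the result is
+    # start + weight * (end - start), evaluated elementwise:
+    #
+    #     >>> start = torch.zeros(3)
+    #     >>> end = torch.full((3,), 10.)
+    #     >>> start.lerp(end, 0.5)
+    #     tensor([5., 5., 5.])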
+ @overload + def less_equal_(self, other: Tensor) -> Tensor: + r""" + less_equal_(other) -> Tensor + + In-place version of :meth:`~Tensor.less_equal`. + """ + ... + @overload + def less_equal_(self, other: Union[Number, _complex]) -> Tensor: + r""" + less_equal_(other) -> Tensor + + In-place version of :meth:`~Tensor.less_equal`. + """ + ... + def lgamma(self) -> Tensor: + r""" + lgamma() -> Tensor + + See :func:`torch.lgamma` + """ + ... + def lgamma_(self) -> Tensor: + r""" + lgamma_() -> Tensor + + In-place version of :meth:`~Tensor.lgamma` + """ + ... + def log(self) -> Tensor: + r""" + log() -> Tensor + + See :func:`torch.log` + """ + ... + def log10(self) -> Tensor: + r""" + log10() -> Tensor + + See :func:`torch.log10` + """ + ... + def log10_(self) -> Tensor: + r""" + log10_() -> Tensor + + In-place version of :meth:`~Tensor.log10` + """ + ... + def log1p(self) -> Tensor: + r""" + log1p() -> Tensor + + See :func:`torch.log1p` + """ + ... + def log1p_(self) -> Tensor: + r""" + log1p_() -> Tensor + + In-place version of :meth:`~Tensor.log1p` + """ + ... + def log2(self) -> Tensor: + r""" + log2() -> Tensor + + See :func:`torch.log2` + """ + ... + def log2_(self) -> Tensor: + r""" + log2_() -> Tensor + + In-place version of :meth:`~Tensor.log2` + """ + ... + def log_(self) -> Tensor: + r""" + log_() -> Tensor + + In-place version of :meth:`~Tensor.log` + """ + ... + def log_normal_(self, mean: _float = 1, std: _float = 2, *, generator: Optional[Generator] = None) -> Tensor: + r""" + log_normal_(mean=1, std=2, *, generator=None) + + Fills :attr:`self` tensor with numbers samples from the log-normal distribution + parameterized by the given mean :math:`\mu` and standard deviation + :math:`\sigma`. Note that :attr:`mean` and :attr:`std` are the mean and + standard deviation of the underlying normal distribution, and not of the + returned distribution: + + .. math:: + + f(x) = \dfrac{1}{x \sigma \sqrt{2\pi}}\ e^{-\frac{(\ln x - \mu)^2}{2\sigma^2}} + """ + ... + @overload + def log_softmax(self, dim: _int, dtype: Optional[_dtype] = None) -> Tensor: ... + @overload + def log_softmax(self, dim: Union[str, ellipsis, None], *, dtype: Optional[_dtype] = None) -> Tensor: ... + def logaddexp(self, other: Tensor) -> Tensor: + r""" + logaddexp(other) -> Tensor + + See :func:`torch.logaddexp` + """ + ... + def logaddexp2(self, other: Tensor) -> Tensor: + r""" + logaddexp2(other) -> Tensor + + See :func:`torch.logaddexp2` + """ + ... + @overload + def logcumsumexp(self, dim: _int) -> Tensor: + r""" + logcumsumexp(dim) -> Tensor + + See :func:`torch.logcumsumexp` + """ + ... + @overload + def logcumsumexp(self, dim: Union[str, ellipsis, None]) -> Tensor: + r""" + logcumsumexp(dim) -> Tensor + + See :func:`torch.logcumsumexp` + """ + ... + def logdet(self) -> Tensor: + r""" + logdet() -> Tensor + + See :func:`torch.logdet` + """ + ... + def logical_and(self, other: Tensor) -> Tensor: + r""" + logical_and() -> Tensor + + See :func:`torch.logical_and` + """ + ... + def logical_and_(self, other: Tensor) -> Tensor: + r""" + logical_and_() -> Tensor + + In-place version of :meth:`~Tensor.logical_and` + """ + ... + def logical_not(self) -> Tensor: + r""" + logical_not() -> Tensor + + See :func:`torch.logical_not` + """ + ... + def logical_not_(self) -> Tensor: + r""" + logical_not_() -> Tensor + + In-place version of :meth:`~Tensor.logical_not` + """ + ... + def logical_or(self, other: Tensor) -> Tensor: + r""" + logical_or() -> Tensor + + See :func:`torch.logical_or` + """ + ... 
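+    # A minimal sketch of why logaddexp is preferred over exp/sum/log for very negative
+    # inputs (illustrative values): the naive form underflows to -inf, logaddexp does not:
+    #
+    #     >>> a = torch.tensor([-1000.0])
+    #     >>> b = torch.tensor([-1000.0])
+    #     >>> (a.exp() + b.exp()).log()
+    #     tensor([-inf])
+    #     >>> a.logaddexp(b)
+    #     tensor([-999.3069])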
+ def logical_or_(self, other: Tensor) -> Tensor: + r""" + logical_or_() -> Tensor + + In-place version of :meth:`~Tensor.logical_or` + """ + ... + def logical_xor(self, other: Tensor) -> Tensor: + r""" + logical_xor() -> Tensor + + See :func:`torch.logical_xor` + """ + ... + def logical_xor_(self, other: Tensor) -> Tensor: + r""" + logical_xor_() -> Tensor + + In-place version of :meth:`~Tensor.logical_xor` + """ + ... + def logit(self, eps: Optional[_float] = None) -> Tensor: + r""" + logit() -> Tensor + + See :func:`torch.logit` + """ + ... + def logit_(self, eps: Optional[_float] = None) -> Tensor: + r""" + logit_() -> Tensor + + In-place version of :meth:`~Tensor.logit` + """ + ... + @overload + def logsumexp(self, dim: Union[_int, _size], keepdim: _bool = False) -> Tensor: + r""" + logsumexp(dim, keepdim=False) -> Tensor + + See :func:`torch.logsumexp` + """ + ... + @overload + def logsumexp(self, dim: Sequence[Union[str, ellipsis, None]], keepdim: _bool = False) -> Tensor: + r""" + logsumexp(dim, keepdim=False) -> Tensor + + See :func:`torch.logsumexp` + """ + ... + def long(self) -> Tensor: + r""" + long(memory_format=torch.preserve_format) -> Tensor + + ``self.long()`` is equivalent to ``self.to(torch.int64)``. See :func:`to`. + + Args: + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + """ + ... + @overload + def lt(self, other: Tensor) -> Tensor: + r""" + lt(other) -> Tensor + + See :func:`torch.lt`. + """ + ... + @overload + def lt(self, other: Union[Number, _complex]) -> Tensor: + r""" + lt(other) -> Tensor + + See :func:`torch.lt`. + """ + ... + @overload + def lt_(self, other: Tensor) -> Tensor: + r""" + lt_(other) -> Tensor + + In-place version of :meth:`~Tensor.lt`. + """ + ... + @overload + def lt_(self, other: Union[Number, _complex]) -> Tensor: + r""" + lt_(other) -> Tensor + + In-place version of :meth:`~Tensor.lt`. + """ + ... + def lu_solve(self, LU_data: Tensor, LU_pivots: Tensor) -> Tensor: + r""" + lu_solve(LU_data, LU_pivots) -> Tensor + + See :func:`torch.lu_solve` + """ + ... + def map2_(self, x: Tensor, y: Tensor, callable: Callable) -> Tensor: ... + def map_(self, tensor: Tensor, callable: Callable) -> Tensor: + r""" + map_(tensor, callable) + + Applies :attr:`callable` for each element in :attr:`self` tensor and the given + :attr:`tensor` and stores the results in :attr:`self` tensor. :attr:`self` tensor and + the given :attr:`tensor` must be :ref:`broadcastable `. + + The :attr:`callable` should have the signature:: + + def callable(a, b) -> number + """ + ... + @overload + def masked_fill(self, mask: Tensor, value: Tensor) -> Tensor: + r""" + masked_fill(mask, value) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.masked_fill_` + """ + ... + @overload + def masked_fill(self, mask: Tensor, value: Union[Number, _complex]) -> Tensor: + r""" + masked_fill(mask, value) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.masked_fill_` + """ + ... + @overload + def masked_fill_(self, mask: Tensor, value: Tensor) -> Tensor: + r""" + masked_fill_(mask, value) + + Fills elements of :attr:`self` tensor with :attr:`value` where :attr:`mask` is + True. The shape of :attr:`mask` must be + :ref:`broadcastable ` with the shape of the underlying + tensor. + + Args: + mask (BoolTensor): the boolean mask + value (float): the value to fill in with + """ + ... 
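+    # A minimal sketch of logsumexp as the numerically stable form of
+    # x.exp().sum(dim).log() (illustrative values):
+    #
+    #     >>> x = torch.tensor([[0., 0., 0.]])
+    #     >>> x.logsumexp(dim=1)
+    #     tensor([1.0986])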
+ @overload + def masked_fill_(self, mask: Tensor, value: Union[Number, _complex]) -> Tensor: + r""" + masked_fill_(mask, value) + + Fills elements of :attr:`self` tensor with :attr:`value` where :attr:`mask` is + True. The shape of :attr:`mask` must be + :ref:`broadcastable ` with the shape of the underlying + tensor. + + Args: + mask (BoolTensor): the boolean mask + value (float): the value to fill in with + """ + ... + def masked_scatter(self, mask: Tensor, source: Tensor) -> Tensor: + r""" + masked_scatter(mask, tensor) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.masked_scatter_` + + .. note:: + + The inputs :attr:`self` and :attr:`mask` + :ref:`broadcast `. + + Example: + + >>> self = torch.tensor([0, 0, 0, 0, 0]) + >>> mask = torch.tensor([[0, 0, 0, 1, 1], [1, 1, 0, 1, 1]]) + >>> source = torch.tensor([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]) + >>> self.masked_scatter(mask, source) + tensor([[0, 0, 0, 0, 1], + [2, 3, 0, 4, 5]]) + """ + ... + def masked_scatter_(self, mask: Tensor, source: Tensor) -> Tensor: + r""" + masked_scatter_(mask, source) + + Copies elements from :attr:`source` into :attr:`self` tensor at positions where + the :attr:`mask` is True. Elements from :attr:`source` are copied into :attr:`self` + starting at position 0 of :attr:`source` and continuing in order one-by-one for each + occurrence of :attr:`mask` being True. + The shape of :attr:`mask` must be :ref:`broadcastable ` + with the shape of the underlying tensor. The :attr:`source` should have at least + as many elements as the number of ones in :attr:`mask`. + + Args: + mask (BoolTensor): the boolean mask + source (Tensor): the tensor to copy from + + .. note:: + + The :attr:`mask` operates on the :attr:`self` tensor, not on the given + :attr:`source` tensor. + + Example: + + >>> self = torch.tensor([[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]) + >>> mask = torch.tensor([[0, 0, 0, 1, 1], [1, 1, 0, 1, 1]]) + >>> source = torch.tensor([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]) + >>> self.masked_scatter_(mask, source) + tensor([[0, 0, 0, 0, 1], + [2, 3, 0, 4, 5]]) + """ + ... + def masked_select(self, mask: Tensor) -> Tensor: + r""" + masked_select(mask) -> Tensor + + See :func:`torch.masked_select` + """ + ... + def matmul(self, other: Tensor) -> Tensor: + r""" + matmul(tensor2) -> Tensor + + See :func:`torch.matmul` + """ + ... + def matrix_exp(self) -> Tensor: + r""" + matrix_exp() -> Tensor + + See :func:`torch.matrix_exp` + """ + ... + def matrix_power(self, n: _int) -> Tensor: + r""" + matrix_power(n) -> Tensor + + .. note:: :meth:`~Tensor.matrix_power` is deprecated, use :func:`torch.linalg.matrix_power` instead. + + Alias for :func:`torch.linalg.matrix_power` + """ + ... + @overload + def max(self) -> Tensor: + r""" + max(dim=None, keepdim=False) -> Tensor or (Tensor, Tensor) + + See :func:`torch.max` + """ + ... + @overload + def max(self, other: Tensor) -> Tensor: + r""" + max(dim=None, keepdim=False) -> Tensor or (Tensor, Tensor) + + See :func:`torch.max` + """ + ... + @overload + def max(self, dim: _int, keepdim: _bool = False) -> torch.return_types.max: + r""" + max(dim=None, keepdim=False) -> Tensor or (Tensor, Tensor) + + See :func:`torch.max` + """ + ... + @overload + def max(self, dim: Union[str, ellipsis, None], keepdim: _bool = False) -> torch.return_types.max: + r""" + max(dim=None, keepdim=False) -> Tensor or (Tensor, Tensor) + + See :func:`torch.max` + """ + ... + def maximum(self, other: Tensor) -> Tensor: + r""" + maximum(other) -> Tensor + + See :func:`torch.maximum` + """ + ... 
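+    # A minimal sketch combining a boolean mask with masked_fill and masked_select
+    # (illustrative values); the mask must be broadcastable to the tensor's shape:
+    #
+    #     >>> x = torch.tensor([1., 2., 3., 4.])
+    #     >>> mask = x > 2
+    #     >>> x.masked_fill(mask, 0.)
+    #     tensor([1., 2., 0., 0.])
+    #     >>> x.masked_select(mask)
+    #     tensor([3., 4.])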
+ @overload + def mean(self, *, dtype: Optional[_dtype] = None) -> Tensor: + r""" + mean(dim=None, keepdim=False, *, dtype=None) -> Tensor + + See :func:`torch.mean` + """ + ... + @overload + def mean(self, dim: Optional[Union[_int, _size]], keepdim: _bool = False, *, dtype: Optional[_dtype] = None) -> Tensor: + r""" + mean(dim=None, keepdim=False, *, dtype=None) -> Tensor + + See :func:`torch.mean` + """ + ... + @overload + def mean(self, dim: Sequence[Union[str, ellipsis, None]], keepdim: _bool = False, *, dtype: Optional[_dtype] = None) -> Tensor: + r""" + mean(dim=None, keepdim=False, *, dtype=None) -> Tensor + + See :func:`torch.mean` + """ + ... + @overload + def median(self) -> Tensor: + r""" + median(dim=None, keepdim=False) -> (Tensor, LongTensor) + + See :func:`torch.median` + """ + ... + @overload + def median(self, dim: _int, keepdim: _bool = False) -> torch.return_types.median: + r""" + median(dim=None, keepdim=False) -> (Tensor, LongTensor) + + See :func:`torch.median` + """ + ... + @overload + def median(self, dim: Union[str, ellipsis, None], keepdim: _bool = False) -> torch.return_types.median: + r""" + median(dim=None, keepdim=False) -> (Tensor, LongTensor) + + See :func:`torch.median` + """ + ... + @overload + def min(self) -> Tensor: + r""" + min(dim=None, keepdim=False) -> Tensor or (Tensor, Tensor) + + See :func:`torch.min` + """ + ... + @overload + def min(self, other: Tensor) -> Tensor: + r""" + min(dim=None, keepdim=False) -> Tensor or (Tensor, Tensor) + + See :func:`torch.min` + """ + ... + @overload + def min(self, dim: _int, keepdim: _bool = False) -> torch.return_types.min: + r""" + min(dim=None, keepdim=False) -> Tensor or (Tensor, Tensor) + + See :func:`torch.min` + """ + ... + @overload + def min(self, dim: Union[str, ellipsis, None], keepdim: _bool = False) -> torch.return_types.min: + r""" + min(dim=None, keepdim=False) -> Tensor or (Tensor, Tensor) + + See :func:`torch.min` + """ + ... + def minimum(self, other: Tensor) -> Tensor: + r""" + minimum(other) -> Tensor + + See :func:`torch.minimum` + """ + ... + def mm(self, mat2: Tensor) -> Tensor: + r""" + mm(mat2) -> Tensor + + See :func:`torch.mm` + """ + ... + @overload + def mode(self, dim: _int = -1, keepdim: _bool = False) -> torch.return_types.mode: + r""" + mode(dim=None, keepdim=False) -> (Tensor, LongTensor) + + See :func:`torch.mode` + """ + ... + @overload + def mode(self, dim: Union[str, ellipsis, None], keepdim: _bool = False) -> torch.return_types.mode: + r""" + mode(dim=None, keepdim=False) -> (Tensor, LongTensor) + + See :func:`torch.mode` + """ + ... + @overload + def moveaxis(self, source: _int, destination: _int) -> Tensor: + r""" + moveaxis(source, destination) -> Tensor + + See :func:`torch.moveaxis` + """ + ... + @overload + def moveaxis(self, source: _size, destination: _size) -> Tensor: + r""" + moveaxis(source, destination) -> Tensor + + See :func:`torch.moveaxis` + """ + ... + @overload + def movedim(self, source: _int, destination: _int) -> Tensor: + r""" + movedim(source, destination) -> Tensor + + See :func:`torch.movedim` + """ + ... + @overload + def movedim(self, source: _size, destination: _size) -> Tensor: + r""" + movedim(source, destination) -> Tensor + + See :func:`torch.movedim` + """ + ... + def msort(self) -> Tensor: + r""" + msort() -> Tensor + + See :func:`torch.msort` + """ + ... + def mul(self, other: Union[Tensor, Number, _complex, torch.SymInt, torch.SymFloat], *, out: Optional[Tensor] = None) -> Tensor: + r""" + mul(value) -> Tensor + + See :func:`torch.mul`. 
+ """ + ... + def mul_(self, other: Union[Tensor, Number, _complex, torch.SymInt, torch.SymFloat]) -> Tensor: + r""" + mul_(value) -> Tensor + + In-place version of :meth:`~Tensor.mul`. + """ + ... + def multinomial(self, num_samples: _int, replacement: _bool = False, *, generator: Optional[Generator] = None) -> Tensor: + r""" + multinomial(num_samples, replacement=False, *, generator=None) -> Tensor + + See :func:`torch.multinomial` + """ + ... + @overload + def multiply(self, other: Tensor) -> Tensor: + r""" + multiply(value) -> Tensor + + See :func:`torch.multiply`. + """ + ... + @overload + def multiply(self, other: Union[Number, _complex]) -> Tensor: + r""" + multiply(value) -> Tensor + + See :func:`torch.multiply`. + """ + ... + @overload + def multiply_(self, other: Tensor) -> Tensor: + r""" + multiply_(value) -> Tensor + + In-place version of :meth:`~Tensor.multiply`. + """ + ... + @overload + def multiply_(self, other: Union[Number, _complex]) -> Tensor: + r""" + multiply_(value) -> Tensor + + In-place version of :meth:`~Tensor.multiply`. + """ + ... + def mv(self, vec: Tensor) -> Tensor: + r""" + mv(vec) -> Tensor + + See :func:`torch.mv` + """ + ... + def mvlgamma(self, p: _int) -> Tensor: + r""" + mvlgamma(p) -> Tensor + + See :func:`torch.mvlgamma` + """ + ... + def mvlgamma_(self, p: _int) -> Tensor: + r""" + mvlgamma_(p) -> Tensor + + In-place version of :meth:`~Tensor.mvlgamma` + """ + ... + def nan_to_num(self, nan: Optional[_float] = None, posinf: Optional[_float] = None, neginf: Optional[_float] = None) -> Tensor: + r""" + nan_to_num(nan=0.0, posinf=None, neginf=None) -> Tensor + + See :func:`torch.nan_to_num`. + """ + ... + def nan_to_num_(self, nan: Optional[_float] = None, posinf: Optional[_float] = None, neginf: Optional[_float] = None) -> Tensor: + r""" + nan_to_num_(nan=0.0, posinf=None, neginf=None) -> Tensor + + In-place version of :meth:`~Tensor.nan_to_num`. + """ + ... + def nanmean(self, dim: Optional[Union[_int, _size]] = None, keepdim: _bool = False, *, dtype: Optional[_dtype] = None) -> Tensor: + r""" + nanmean(dim=None, keepdim=False, *, dtype=None) -> Tensor + + See :func:`torch.nanmean` + """ + ... + @overload + def nanmedian(self) -> Tensor: + r""" + nanmedian(dim=None, keepdim=False) -> (Tensor, LongTensor) + + See :func:`torch.nanmedian` + """ + ... + @overload + def nanmedian(self, dim: _int, keepdim: _bool = False) -> torch.return_types.nanmedian: + r""" + nanmedian(dim=None, keepdim=False) -> (Tensor, LongTensor) + + See :func:`torch.nanmedian` + """ + ... + @overload + def nanmedian(self, dim: Union[str, ellipsis, None], keepdim: _bool = False) -> torch.return_types.nanmedian: + r""" + nanmedian(dim=None, keepdim=False) -> (Tensor, LongTensor) + + See :func:`torch.nanmedian` + """ + ... + @overload + def nanquantile(self, q: Tensor, dim: Optional[_int] = None, keepdim: _bool = False, *, interpolation: str = "linear") -> Tensor: + r""" + nanquantile(q, dim=None, keepdim=False, *, interpolation='linear') -> Tensor + + See :func:`torch.nanquantile` + """ + ... + @overload + def nanquantile(self, q: _float, dim: Optional[_int] = None, keepdim: _bool = False, *, interpolation: str = "linear") -> Tensor: + r""" + nanquantile(q, dim=None, keepdim=False, *, interpolation='linear') -> Tensor + + See :func:`torch.nanquantile` + """ + ... 
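+    # A minimal sketch of NaN-aware reduction and cleanup (illustrative values):
+    # nanmean ignores NaNs, nan_to_num replaces them:
+    #
+    #     >>> x = torch.tensor([1., float('nan'), 3.])
+    #     >>> x.nanmean()
+    #     tensor(2.)
+    #     >>> x.nan_to_num(nan=0.0)
+    #     tensor([1., 0., 3.])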
+ def nansum(self, dim: Optional[Union[_int, _size]] = None, keepdim: _bool = False, *, dtype: Optional[_dtype] = None) -> Tensor: + r""" + nansum(dim=None, keepdim=False, dtype=None) -> Tensor + + See :func:`torch.nansum` + """ + ... + @overload + def narrow(self, dim: _int, start: Tensor, length: Union[_int, SymInt]) -> Tensor: + r""" + narrow(dimension, start, length) -> Tensor + + See :func:`torch.narrow`. + """ + ... + @overload + def narrow(self, dim: _int, start: Union[_int, SymInt], length: Union[_int, SymInt]) -> Tensor: + r""" + narrow(dimension, start, length) -> Tensor + + See :func:`torch.narrow`. + """ + ... + def narrow_copy(self, dim: _int, start: Union[_int, SymInt], length: Union[_int, SymInt]) -> Tensor: + r""" + narrow_copy(dimension, start, length) -> Tensor + + See :func:`torch.narrow_copy`. + """ + ... + def ndimension(self) -> _int: + r""" + ndimension() -> int + + Alias for :meth:`~Tensor.dim()` + """ + ... + @overload + def ne(self, other: Tensor) -> Tensor: + r""" + ne(other) -> Tensor + + See :func:`torch.ne`. + """ + ... + @overload + def ne(self, other: Union[Number, _complex]) -> Tensor: + r""" + ne(other) -> Tensor + + See :func:`torch.ne`. + """ + ... + @overload + def ne_(self, other: Tensor) -> Tensor: + r""" + ne_(other) -> Tensor + + In-place version of :meth:`~Tensor.ne`. + """ + ... + @overload + def ne_(self, other: Union[Number, _complex]) -> Tensor: + r""" + ne_(other) -> Tensor + + In-place version of :meth:`~Tensor.ne`. + """ + ... + def neg(self) -> Tensor: + r""" + neg() -> Tensor + + See :func:`torch.neg` + """ + ... + def neg_(self) -> Tensor: + r""" + neg_() -> Tensor + + In-place version of :meth:`~Tensor.neg` + """ + ... + def negative(self) -> Tensor: + r""" + negative() -> Tensor + + See :func:`torch.negative` + """ + ... + def negative_(self) -> Tensor: + r""" + negative_() -> Tensor + + In-place version of :meth:`~Tensor.negative` + """ + ... + def nelement(self) -> _int: + r""" + nelement() -> int + + Alias for :meth:`~Tensor.numel` + """ + ... + @overload + def new(self, *args: Any, device: Optional[DeviceLikeType] = None) -> Tensor: ... + @overload + def new(self, storage: Storage) -> Tensor: ... + @overload + def new(self, other: Tensor) -> Tensor: ... + @overload + def new(self, size: _size, *, device: Optional[DeviceLikeType] = None) -> Tensor: ... + @overload + def new_empty(self, size: Sequence[Union[_int, SymInt]], *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + new_empty(size, *, dtype=None, device=None, requires_grad=False, layout=torch.strided, pin_memory=False) -> Tensor + + + Returns a Tensor of size :attr:`size` filled with uninitialized data. + By default, the returned Tensor has the same :class:`torch.dtype` and + :class:`torch.device` as this tensor. + + Args: + size (int...): a list, tuple, or :class:`torch.Size` of integers defining the + shape of the output tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired type of returned tensor. + Default: if None, same :class:`torch.dtype` as this tensor. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if None, same :class:`torch.device` as this tensor. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. 
+ layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> tensor = torch.ones(()) + >>> tensor.new_empty((2, 3)) + tensor([[ 5.8182e-18, 4.5765e-41, -1.0545e+30], + [ 3.0949e-41, 4.4842e-44, 0.0000e+00]]) + """ + ... + @overload + def new_empty(self, *size: _int, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + new_empty(size, *, dtype=None, device=None, requires_grad=False, layout=torch.strided, pin_memory=False) -> Tensor + + + Returns a Tensor of size :attr:`size` filled with uninitialized data. + By default, the returned Tensor has the same :class:`torch.dtype` and + :class:`torch.device` as this tensor. + + Args: + size (int...): a list, tuple, or :class:`torch.Size` of integers defining the + shape of the output tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired type of returned tensor. + Default: if None, same :class:`torch.dtype` as this tensor. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if None, same :class:`torch.device` as this tensor. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> tensor = torch.ones(()) + >>> tensor.new_empty((2, 3)) + tensor([[ 5.8182e-18, 4.5765e-41, -1.0545e+30], + [ 3.0949e-41, 4.4842e-44, 0.0000e+00]]) + """ + ... + def new_empty_strided(self, size: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + new_empty_strided(size, stride, dtype=None, device=None, requires_grad=False, layout=torch.strided, pin_memory=False) -> Tensor + + + Returns a Tensor of size :attr:`size` and strides :attr:`stride` filled with + uninitialized data. By default, the returned Tensor has the same + :class:`torch.dtype` and :class:`torch.device` as this tensor. + + Args: + size (int...): a list, tuple, or :class:`torch.Size` of integers defining the + shape of the output tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired type of returned tensor. + Default: if None, same :class:`torch.dtype` as this tensor. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if None, same :class:`torch.device` as this tensor. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. 
+ + Example:: + + >>> tensor = torch.ones(()) + >>> tensor.new_empty_strided((2, 3), (3, 1)) + tensor([[ 5.8182e-18, 4.5765e-41, -1.0545e+30], + [ 3.0949e-41, 4.4842e-44, 0.0000e+00]]) + """ + ... + def new_full(self, size: Sequence[Union[_int, SymInt]], fill_value: Union[Number, _complex], *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + new_full(size, fill_value, *, dtype=None, device=None, requires_grad=False, layout=torch.strided, pin_memory=False) -> Tensor + + + Returns a Tensor of size :attr:`size` filled with :attr:`fill_value`. + By default, the returned Tensor has the same :class:`torch.dtype` and + :class:`torch.device` as this tensor. + + Args: + fill_value (scalar): the number to fill the output tensor with. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired type of returned tensor. + Default: if None, same :class:`torch.dtype` as this tensor. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if None, same :class:`torch.device` as this tensor. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> tensor = torch.ones((2,), dtype=torch.float64) + >>> tensor.new_full((3, 4), 3.141592) + tensor([[ 3.1416, 3.1416, 3.1416, 3.1416], + [ 3.1416, 3.1416, 3.1416, 3.1416], + [ 3.1416, 3.1416, 3.1416, 3.1416]], dtype=torch.float64) + """ + ... + @overload + def new_ones(self, size: _size, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + new_ones(size, *, dtype=None, device=None, requires_grad=False, layout=torch.strided, pin_memory=False) -> Tensor + + + Returns a Tensor of size :attr:`size` filled with ``1``. + By default, the returned Tensor has the same :class:`torch.dtype` and + :class:`torch.device` as this tensor. + + Args: + size (int...): a list, tuple, or :class:`torch.Size` of integers defining the + shape of the output tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired type of returned tensor. + Default: if None, same :class:`torch.dtype` as this tensor. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if None, same :class:`torch.device` as this tensor. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> tensor = torch.tensor((), dtype=torch.int32) + >>> tensor.new_ones((2, 3)) + tensor([[ 1, 1, 1], + [ 1, 1, 1]], dtype=torch.int32) + """ + ... 
+ @overload + def new_ones(self, size: Sequence[Union[_int, SymInt]], *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + new_ones(size, *, dtype=None, device=None, requires_grad=False, layout=torch.strided, pin_memory=False) -> Tensor + + + Returns a Tensor of size :attr:`size` filled with ``1``. + By default, the returned Tensor has the same :class:`torch.dtype` and + :class:`torch.device` as this tensor. + + Args: + size (int...): a list, tuple, or :class:`torch.Size` of integers defining the + shape of the output tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired type of returned tensor. + Default: if None, same :class:`torch.dtype` as this tensor. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if None, same :class:`torch.device` as this tensor. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> tensor = torch.tensor((), dtype=torch.int32) + >>> tensor.new_ones((2, 3)) + tensor([[ 1, 1, 1], + [ 1, 1, 1]], dtype=torch.int32) + """ + ... + @overload + def new_ones(self, *size: _int, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + new_ones(size, *, dtype=None, device=None, requires_grad=False, layout=torch.strided, pin_memory=False) -> Tensor + + + Returns a Tensor of size :attr:`size` filled with ``1``. + By default, the returned Tensor has the same :class:`torch.dtype` and + :class:`torch.device` as this tensor. + + Args: + size (int...): a list, tuple, or :class:`torch.Size` of integers defining the + shape of the output tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired type of returned tensor. + Default: if None, same :class:`torch.dtype` as this tensor. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if None, same :class:`torch.device` as this tensor. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> tensor = torch.tensor((), dtype=torch.int32) + >>> tensor.new_ones((2, 3)) + tensor([[ 1, 1, 1], + [ 1, 1, 1]], dtype=torch.int32) + """ + ... + def new_tensor(self, data: Any, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + new_tensor(data, *, dtype=None, device=None, requires_grad=False, layout=torch.strided, pin_memory=False) -> Tensor + + + Returns a new Tensor with :attr:`data` as the tensor data. + By default, the returned Tensor has the same :class:`torch.dtype` and + :class:`torch.device` as this tensor. + + .. 
warning:: + + :func:`new_tensor` always copies :attr:`data`. If you have a Tensor + ``data`` and want to avoid a copy, use :func:`torch.Tensor.requires_grad_` + or :func:`torch.Tensor.detach`. + If you have a numpy array and want to avoid a copy, use + :func:`torch.from_numpy`. + + .. warning:: + + When data is a tensor `x`, :func:`new_tensor()` reads out 'the data' from whatever it is passed, + and constructs a leaf variable. Therefore ``tensor.new_tensor(x)`` is equivalent to ``x.clone().detach()`` + and ``tensor.new_tensor(x, requires_grad=True)`` is equivalent to ``x.clone().detach().requires_grad_(True)``. + The equivalents using ``clone()`` and ``detach()`` are recommended. + + Args: + data (array_like): The returned Tensor copies :attr:`data`. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired type of returned tensor. + Default: if None, same :class:`torch.dtype` as this tensor. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if None, same :class:`torch.device` as this tensor. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> tensor = torch.ones((2,), dtype=torch.int8) + >>> data = [[0, 1], [2, 3]] + >>> tensor.new_tensor(data) + tensor([[ 0, 1], + [ 2, 3]], dtype=torch.int8) + """ + ... + @overload + def new_zeros(self, size: Sequence[Union[_int, SymInt]], *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + new_zeros(size, *, dtype=None, device=None, requires_grad=False, layout=torch.strided, pin_memory=False) -> Tensor + + + Returns a Tensor of size :attr:`size` filled with ``0``. + By default, the returned Tensor has the same :class:`torch.dtype` and + :class:`torch.device` as this tensor. + + Args: + size (int...): a list, tuple, or :class:`torch.Size` of integers defining the + shape of the output tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired type of returned tensor. + Default: if None, same :class:`torch.dtype` as this tensor. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if None, same :class:`torch.device` as this tensor. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> tensor = torch.tensor((), dtype=torch.float64) + >>> tensor.new_zeros((2, 3)) + tensor([[ 0., 0., 0.], + [ 0., 0., 0.]], dtype=torch.float64) + """ + ... 
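+    # A minimal sketch of the new_* constructors (illustrative values): by default they
+    # inherit this tensor's dtype and device rather than the global defaults:
+    #
+    #     >>> base = torch.ones(2, dtype=torch.float64)
+    #     >>> base.new_zeros(2, 3).dtype
+    #     torch.float64
+    #     >>> base.new_full((2, 2), 7.0).dtype
+    #     torch.float64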
+ @overload + def new_zeros(self, *size: _int, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + new_zeros(size, *, dtype=None, device=None, requires_grad=False, layout=torch.strided, pin_memory=False) -> Tensor + + + Returns a Tensor of size :attr:`size` filled with ``0``. + By default, the returned Tensor has the same :class:`torch.dtype` and + :class:`torch.device` as this tensor. + + Args: + size (int...): a list, tuple, or :class:`torch.Size` of integers defining the + shape of the output tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired type of returned tensor. + Default: if None, same :class:`torch.dtype` as this tensor. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if None, same :class:`torch.device` as this tensor. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> tensor = torch.tensor((), dtype=torch.float64) + >>> tensor.new_zeros((2, 3)) + tensor([[ 0., 0., 0.], + [ 0., 0., 0.]], dtype=torch.float64) + """ + ... + def nextafter(self, other: Tensor) -> Tensor: + r""" + nextafter(other) -> Tensor + See :func:`torch.nextafter` + """ + ... + def nextafter_(self, other: Tensor) -> Tensor: + r""" + nextafter_(other) -> Tensor + In-place version of :meth:`~Tensor.nextafter` + """ + ... + @overload + def nonzero(self, *, as_tuple: Literal[False] = False) -> Tensor: + r""" + nonzero() -> LongTensor + + See :func:`torch.nonzero` + """ + ... + @overload + def nonzero(self, *, as_tuple: Literal[True]) -> Tuple[Tensor, ...]: + r""" + nonzero() -> LongTensor + + See :func:`torch.nonzero` + """ + ... + def nonzero_static(self, *, size: _int, fill_value: _int = -1) -> Tensor: + r""" + nonzero_static(input, *, size, fill_value=-1) -> Tensor + + Returns a 2-D tensor where each row is the index for a non-zero value. + The returned Tensor has the same `torch.dtype` as `torch.nonzero()`. + + Args: + input (Tensor): the input tensor to count non-zero elements. + + Keyword args: + size (int): the size of non-zero elements expected to be included in the out + tensor. Pad the out tensor with `fill_value` if the `size` is larger + than total number of non-zero elements, truncate out tensor if `size` + is smaller. The size must be a non-negative integer. + fill_value (int): the value to fill the output tensor with when `size` is larger + than the total number of non-zero elements. Default is `-1` to represent + invalid index. 
+ + Example: + + # Example 1: Padding + >>> input_tensor = torch.tensor([[1, 0], [3, 2]]) + >>> static_size = 4 + >>> t = torch.nonzero_static(input_tensor, size = static_size) + tensor([[ 0, 0], + [ 1, 0], + [ 1, 1], + [ -1, -1]], dtype=torch.int64) + + # Example 2: Truncating + >>> input_tensor = torch.tensor([[1, 0], [3, 2]]) + >>> static_size = 2 + >>> t = torch.nonzero_static(input_tensor, size = static_size) + tensor([[ 0, 0], + [ 1, 0]], dtype=torch.int64) + + # Example 3: 0 size + >>> input_tensor = torch.tensor([10]) + >>> static_size = 0 + >>> t = torch.nonzero_static(input_tensor, size = static_size) + tensor([], size=(0, 1), dtype=torch.int64) + + # Example 4: 0 rank input + >>> input_tensor = torch.tensor(10) + >>> static_size = 2 + >>> t = torch.nonzero_static(input_tensor, size = static_size) + tensor([], size=(2, 0), dtype=torch.int64) + """ + ... + def normal_(self, mean: _float = 0, std: _float = 1, *, generator: Optional[Generator] = None) -> Tensor: + r""" + normal_(mean=0, std=1, *, generator=None) -> Tensor + + Fills :attr:`self` tensor with elements samples from the normal distribution + parameterized by :attr:`mean` and :attr:`std`. + """ + ... + @overload + def not_equal(self, other: Tensor) -> Tensor: + r""" + not_equal(other) -> Tensor + + See :func:`torch.not_equal`. + """ + ... + @overload + def not_equal(self, other: Union[Number, _complex]) -> Tensor: + r""" + not_equal(other) -> Tensor + + See :func:`torch.not_equal`. + """ + ... + @overload + def not_equal_(self, other: Tensor) -> Tensor: + r""" + not_equal_(other) -> Tensor + + In-place version of :meth:`~Tensor.not_equal`. + """ + ... + @overload + def not_equal_(self, other: Union[Number, _complex]) -> Tensor: + r""" + not_equal_(other) -> Tensor + + In-place version of :meth:`~Tensor.not_equal`. + """ + ... + def numel(self) -> _int: + r""" + numel() -> int + + See :func:`torch.numel` + """ + ... + def numpy(self, *, force: _bool = False) -> Any: + r""" + numpy(*, force=False) -> numpy.ndarray + + Returns the tensor as a NumPy :class:`ndarray`. + + If :attr:`force` is ``False`` (the default), the conversion + is performed only if the tensor is on the CPU, does not require grad, + does not have its conjugate bit set, and is a dtype and layout that + NumPy supports. The returned ndarray and the tensor will share their + storage, so changes to the tensor will be reflected in the ndarray + and vice versa. + + If :attr:`force` is ``True`` this is equivalent to + calling ``t.detach().cpu().resolve_conj().resolve_neg().numpy()``. + If the tensor isn't on the CPU or the conjugate or negative bit is set, + the tensor won't share its storage with the returned ndarray. + Setting :attr:`force` to ``True`` can be a useful shorthand. + + Args: + force (bool): if ``True``, the ndarray may be a copy of the tensor + instead of always sharing memory, defaults to ``False``. + """ + ... + def orgqr(self, input2: Tensor) -> Tensor: + r""" + orgqr(input2) -> Tensor + + See :func:`torch.orgqr` + """ + ... + def ormqr(self, input2: Tensor, input3: Tensor, left: _bool = True, transpose: _bool = False) -> Tensor: + r""" + ormqr(input2, input3, left=True, transpose=False) -> Tensor + + See :func:`torch.ormqr` + """ + ... + def outer(self, vec2: Tensor) -> Tensor: + r""" + outer(vec2) -> Tensor + + See :func:`torch.outer`. + """ + ... + @overload + def permute(self, dims: _size) -> Tensor: + r""" + permute(*dims) -> Tensor + + See :func:`torch.permute` + """ + ... 
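+    # A minimal sketch of numpy() semantics (illustrative values): the default conversion
+    # shares storage with the CPU tensor, while force=True may detach and copy:
+    #
+    #     >>> t = torch.arange(3)
+    #     >>> a = t.numpy()
+    #     >>> a[0] = 10
+    #     >>> t
+    #     tensor([10,  1,  2])
+    #     >>> torch.ones(2, requires_grad=True).numpy(force=True)
+    #     array([1., 1.], dtype=float32)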
+ @overload + def permute(self, *dims: _int) -> Tensor: + r""" + permute(*dims) -> Tensor + + See :func:`torch.permute` + """ + ... + def pin_memory(self, device: Optional[Optional[DeviceLikeType]] = None) -> Tensor: + r""" + pin_memory() -> Tensor + + Copies the tensor to pinned memory, if it's not already pinned. + """ + ... + def pinverse(self, rcond: _float = 1e-15) -> Tensor: + r""" + pinverse() -> Tensor + + See :func:`torch.pinverse` + """ + ... + def polygamma(self, n: _int) -> Tensor: + r""" + polygamma(n) -> Tensor + + See :func:`torch.polygamma` + """ + ... + def polygamma_(self, n: _int) -> Tensor: + r""" + polygamma_(n) -> Tensor + + In-place version of :meth:`~Tensor.polygamma` + """ + ... + def positive(self) -> Tensor: + r""" + positive() -> Tensor + + See :func:`torch.positive` + """ + ... + @overload + def pow(self, exponent: Tensor) -> Tensor: + r""" + pow(exponent) -> Tensor + + See :func:`torch.pow` + """ + ... + @overload + def pow(self, exponent: Union[Number, _complex]) -> Tensor: + r""" + pow(exponent) -> Tensor + + See :func:`torch.pow` + """ + ... + @overload + def pow_(self, exponent: Tensor) -> Tensor: + r""" + pow_(exponent) -> Tensor + + In-place version of :meth:`~Tensor.pow` + """ + ... + @overload + def pow_(self, exponent: Union[Number, _complex]) -> Tensor: + r""" + pow_(exponent) -> Tensor + + In-place version of :meth:`~Tensor.pow` + """ + ... + def prelu(self, weight: Tensor) -> Tensor: ... + @overload + def prod(self, *, dtype: Optional[_dtype] = None) -> Tensor: + r""" + prod(dim=None, keepdim=False, dtype=None) -> Tensor + + See :func:`torch.prod` + """ + ... + @overload + def prod(self, dim: _int, keepdim: _bool = False, *, dtype: Optional[_dtype] = None) -> Tensor: + r""" + prod(dim=None, keepdim=False, dtype=None) -> Tensor + + See :func:`torch.prod` + """ + ... + @overload + def prod(self, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, dtype: Optional[_dtype] = None) -> Tensor: + r""" + prod(dim=None, keepdim=False, dtype=None) -> Tensor + + See :func:`torch.prod` + """ + ... + def put(self, index: Tensor, source: Tensor, accumulate: _bool = False) -> Tensor: + r""" + put(input, index, source, accumulate=False) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.put_`. + `input` corresponds to `self` in :meth:`torch.Tensor.put_`. + """ + ... + def put_(self, index: Tensor, source: Tensor, accumulate: _bool = False) -> Tensor: + r""" + put_(index, source, accumulate=False) -> Tensor + + Copies the elements from :attr:`source` into the positions specified by + :attr:`index`. For the purpose of indexing, the :attr:`self` tensor is treated as if + it were a 1-D tensor. + + :attr:`index` and :attr:`source` need to have the same number of elements, but not necessarily + the same shape. + + If :attr:`accumulate` is ``True``, the elements in :attr:`source` are added to + :attr:`self`. If accumulate is ``False``, the behavior is undefined if :attr:`index` + contain duplicate elements. + + Args: + index (LongTensor): the indices into self + source (Tensor): the tensor containing values to copy from + accumulate (bool): whether to accumulate into self + + Example:: + + >>> src = torch.tensor([[4, 3, 5], + ... [6, 7, 8]]) + >>> src.put_(torch.tensor([1, 3]), torch.tensor([9, 10])) + tensor([[ 4, 9, 5], + [ 10, 7, 8]]) + """ + ... 
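+    # A minimal sketch of permute (illustrative shape): dims gives the new ordering of
+    # the existing dimensions, returning a view with rearranged strides:
+    #
+    #     >>> x = torch.randn(2, 3, 4)
+    #     >>> x.permute(2, 0, 1).shape
+    #     torch.Size([4, 2, 3])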
+ def q_per_channel_axis(self) -> _int: + r""" + q_per_channel_axis() -> int + + Given a Tensor quantized by linear (affine) per-channel quantization, + returns the index of dimension on which per-channel quantization is applied. + """ + ... + def q_per_channel_scales(self) -> Tensor: + r""" + q_per_channel_scales() -> Tensor + + Given a Tensor quantized by linear (affine) per-channel quantization, + returns a Tensor of scales of the underlying quantizer. It has the number of + elements that matches the corresponding dimensions (from q_per_channel_axis) of + the tensor. + """ + ... + def q_per_channel_zero_points(self) -> Tensor: + r""" + q_per_channel_zero_points() -> Tensor + + Given a Tensor quantized by linear (affine) per-channel quantization, + returns a tensor of zero_points of the underlying quantizer. It has the number of + elements that matches the corresponding dimensions (from q_per_channel_axis) of + the tensor. + """ + ... + def q_scale(self) -> _float: + r""" + q_scale() -> float + + Given a Tensor quantized by linear(affine) quantization, + returns the scale of the underlying quantizer(). + """ + ... + def q_zero_point(self) -> _int: + r""" + q_zero_point() -> int + + Given a Tensor quantized by linear(affine) quantization, + returns the zero_point of the underlying quantizer(). + """ + ... + def qr(self, some: _bool = True) -> torch.return_types.qr: + r""" + qr(some=True) -> (Tensor, Tensor) + + See :func:`torch.qr` + """ + ... + def qscheme(self) -> _qscheme: + r""" + qscheme() -> torch.qscheme + + Returns the quantization scheme of a given QTensor. + """ + ... + @overload + def quantile(self, q: Tensor, dim: Optional[_int] = None, keepdim: _bool = False, *, interpolation: str = "linear") -> Tensor: + r""" + quantile(q, dim=None, keepdim=False, *, interpolation='linear') -> Tensor + + See :func:`torch.quantile` + """ + ... + @overload + def quantile(self, q: _float, dim: Optional[_int] = None, keepdim: _bool = False, *, interpolation: str = "linear") -> Tensor: + r""" + quantile(q, dim=None, keepdim=False, *, interpolation='linear') -> Tensor + + See :func:`torch.quantile` + """ + ... + def rad2deg(self) -> Tensor: + r""" + rad2deg() -> Tensor + + See :func:`torch.rad2deg` + """ + ... + def rad2deg_(self) -> Tensor: + r""" + rad2deg_() -> Tensor + + In-place version of :meth:`~Tensor.rad2deg` + """ + ... + @overload + def random_(self, *, generator: Optional[Generator] = None) -> Tensor: + r""" + random_(from=0, to=None, *, generator=None) -> Tensor + + Fills :attr:`self` tensor with numbers sampled from the discrete uniform + distribution over ``[from, to - 1]``. If not specified, the values are usually + only bounded by :attr:`self` tensor's data type. However, for floating point + types, if unspecified, range will be ``[0, 2^mantissa]`` to ensure that every + value is representable. For example, `torch.tensor(1, dtype=torch.double).random_()` + will be uniform in ``[0, 2^53]``. + """ + ... + @overload + def random_(self, from_: _int, to: Optional[_int], *, generator: Optional[Generator] = None) -> Tensor: + r""" + random_(from=0, to=None, *, generator=None) -> Tensor + + Fills :attr:`self` tensor with numbers sampled from the discrete uniform + distribution over ``[from, to - 1]``. If not specified, the values are usually + only bounded by :attr:`self` tensor's data type. However, for floating point + types, if unspecified, range will be ``[0, 2^mantissa]`` to ensure that every + value is representable. 
For example, `torch.tensor(1, dtype=torch.double).random_()` + will be uniform in ``[0, 2^53]``. + """ + ... + @overload + def random_(self, to: _int, *, generator: Optional[Generator] = None) -> Tensor: + r""" + random_(from=0, to=None, *, generator=None) -> Tensor + + Fills :attr:`self` tensor with numbers sampled from the discrete uniform + distribution over ``[from, to - 1]``. If not specified, the values are usually + only bounded by :attr:`self` tensor's data type. However, for floating point + types, if unspecified, range will be ``[0, 2^mantissa]`` to ensure that every + value is representable. For example, `torch.tensor(1, dtype=torch.double).random_()` + will be uniform in ``[0, 2^53]``. + """ + ... + def ravel(self) -> Tensor: + r""" + ravel() -> Tensor + + see :func:`torch.ravel` + """ + ... + def reciprocal(self) -> Tensor: + r""" + reciprocal() -> Tensor + + See :func:`torch.reciprocal` + """ + ... + def reciprocal_(self) -> Tensor: + r""" + reciprocal_() -> Tensor + + In-place version of :meth:`~Tensor.reciprocal` + """ + ... + def record_stream(self, s: Stream) -> None: + r""" + record_stream(stream) + + Marks the tensor as having been used by this stream. When the tensor + is deallocated, ensure the tensor memory is not reused for another tensor + until all work queued on :attr:`stream` at the time of deallocation is + complete. + + .. note:: + + The caching allocator is aware of only the stream where a tensor was + allocated. Due to the awareness, it already correctly manages the life + cycle of tensors on only one stream. But if a tensor is used on a stream + different from the stream of origin, the allocator might reuse the memory + unexpectedly. Calling this method lets the allocator know which streams + have used the tensor. + + .. warning:: + + This method is most suitable for use cases where you are providing a + function that created a tensor on a side stream, and want users to be able + to make use of the tensor without having to think carefully about stream + safety when making use of them. These safety guarantees come at some + performance and predictability cost (analogous to the tradeoff between GC + and manual memory management), so if you are in a situation where + you manage the full lifetime of your tensors, you may consider instead + manually managing CUDA events so that calling this method is not necessary. + In particular, when you call this method, on later allocations the + allocator will poll the recorded stream to see if all operations have + completed yet; you can potentially race with side stream computation and + non-deterministically reuse or fail to reuse memory for an allocation. + + You can safely use tensors allocated on side streams without + :meth:`~Tensor.record_stream`; you must manually ensure that + any non-creation stream uses of a tensor are synced back to the creation + stream before you deallocate the tensor. As the CUDA caching allocator + guarantees that the memory will only be reused with the same creation stream, + this is sufficient to ensure that writes to future reallocations of the + memory will be delayed until non-creation stream uses are done. + (Counterintuitively, you may observe that on the CPU side we have already + reallocated the tensor, even though CUDA kernels on the old tensor are + still in progress. This is fine, because CUDA operations on the new + tensor will appropriately wait for the old operations to complete, as they + are all on the same stream.) 
+ + Concretely, this looks like this:: + + with torch.cuda.stream(s0): + x = torch.zeros(N) + + s1.wait_stream(s0) + with torch.cuda.stream(s1): + y = some_comm_op(x) + + ... some compute on s0 ... + + # synchronize creation stream s0 to side stream s1 + # before deallocating x + s0.wait_stream(s1) + del x + + Note that some discretion is required when deciding when to perform + ``s0.wait_stream(s1)``. In particular, if we were to wait immediately + after ``some_comm_op``, there wouldn't be any point in having the side + stream; it would be equivalent to have run ``some_comm_op`` on ``s0``. + Instead, the synchronization must be placed at some appropriate, later + point in time where you expect the side stream ``s1`` to have finished + work. This location is typically identified via profiling, e.g., using + Chrome traces produced + :meth:`torch.autograd.profiler.profile.export_chrome_trace`. If you + place the wait too early, work on s0 will block until ``s1`` has finished, + preventing further overlapping of communication and computation. If you + place the wait too late, you will use more memory than is strictly + necessary (as you are keeping ``x`` live for longer.) For a concrete + example of how this guidance can be applied in practice, see this post: + `FSDP and CUDACachingAllocator + `_. + """ + ... + def refine_names(self, names: Sequence[Union[str, ellipsis, None]]) -> Tensor: ... + def relu(self) -> Tensor: ... + def relu_(self) -> Tensor: ... + @overload + def remainder(self, other: Tensor) -> Tensor: + r""" + remainder(divisor) -> Tensor + + See :func:`torch.remainder` + """ + ... + @overload + def remainder(self, other: Union[Number, _complex]) -> Tensor: + r""" + remainder(divisor) -> Tensor + + See :func:`torch.remainder` + """ + ... + @overload + def remainder_(self, other: Tensor) -> Tensor: + r""" + remainder_(divisor) -> Tensor + + In-place version of :meth:`~Tensor.remainder` + """ + ... + @overload + def remainder_(self, other: Union[Number, _complex]) -> Tensor: + r""" + remainder_(divisor) -> Tensor + + In-place version of :meth:`~Tensor.remainder` + """ + ... + def rename(self, names: Optional[Sequence[Union[str, ellipsis, None]]]) -> Tensor: ... + def rename_(self, names: Optional[Sequence[Union[str, ellipsis, None]]]) -> Tensor: ... + def renorm(self, p: Union[Number, _complex], dim: _int, maxnorm: Union[Number, _complex]) -> Tensor: + r""" + renorm(p, dim, maxnorm) -> Tensor + + See :func:`torch.renorm` + """ + ... + def renorm_(self, p: Union[Number, _complex], dim: _int, maxnorm: Union[Number, _complex]) -> Tensor: + r""" + renorm_(p, dim, maxnorm) -> Tensor + + In-place version of :meth:`~Tensor.renorm` + """ + ... + @overload + def repeat(self, repeats: Sequence[Union[_int, SymInt]]) -> Tensor: + r""" + repeat(*sizes) -> Tensor + + Repeats this tensor along the specified dimensions. + + Unlike :meth:`~Tensor.expand`, this function copies the tensor's data. + + .. warning:: + + :meth:`~Tensor.repeat` behaves differently from + `numpy.repeat `_, + but is more similar to + `numpy.tile `_. + For the operator similar to `numpy.repeat`, see :func:`torch.repeat_interleave`. + + Args: + sizes (torch.Size or int...): The number of times to repeat this tensor along each + dimension + + Example:: + + >>> x = torch.tensor([1, 2, 3]) + >>> x.repeat(4, 2) + tensor([[ 1, 2, 3, 1, 2, 3], + [ 1, 2, 3, 1, 2, 3], + [ 1, 2, 3, 1, 2, 3], + [ 1, 2, 3, 1, 2, 3]]) + >>> x.repeat(4, 2, 1).size() + torch.Size([4, 2, 3]) + """ + ... 
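A minimal sketch of the distinction drawn in the warning above (an illustrative aside, assuming a standard ``torch`` install): ``repeat`` tiles the whole tensor in the manner of ``numpy.tile``, whereas ``repeat_interleave`` repeats individual elements in the manner of ``numpy.repeat``::

    >>> import torch
    >>> x = torch.tensor([1, 2, 3])
    >>> x.repeat(2)              # tiles the whole tensor
    tensor([1, 2, 3, 1, 2, 3])
    >>> x.repeat_interleave(2)   # repeats each element
    tensor([1, 1, 2, 2, 3, 3])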
+ @overload + def repeat(self, *repeats: _int) -> Tensor: + r""" + repeat(*sizes) -> Tensor + + Repeats this tensor along the specified dimensions. + + Unlike :meth:`~Tensor.expand`, this function copies the tensor's data. + + .. warning:: + + :meth:`~Tensor.repeat` behaves differently from + `numpy.repeat `_, + but is more similar to + `numpy.tile `_. + For the operator similar to `numpy.repeat`, see :func:`torch.repeat_interleave`. + + Args: + sizes (torch.Size or int...): The number of times to repeat this tensor along each + dimension + + Example:: + + >>> x = torch.tensor([1, 2, 3]) + >>> x.repeat(4, 2) + tensor([[ 1, 2, 3, 1, 2, 3], + [ 1, 2, 3, 1, 2, 3], + [ 1, 2, 3, 1, 2, 3], + [ 1, 2, 3, 1, 2, 3]]) + >>> x.repeat(4, 2, 1).size() + torch.Size([4, 2, 3]) + """ + ... + @overload + def repeat_interleave(self, repeats: Tensor, dim: Optional[_int] = None, *, output_size: Optional[Union[_int, SymInt]] = None) -> Tensor: + r""" + repeat_interleave(repeats, dim=None, *, output_size=None) -> Tensor + + See :func:`torch.repeat_interleave`. + """ + ... + @overload + def repeat_interleave(self, repeats: Union[_int, SymInt], dim: Optional[_int] = None, *, output_size: Optional[Union[_int, SymInt]] = None) -> Tensor: + r""" + repeat_interleave(repeats, dim=None, *, output_size=None) -> Tensor + + See :func:`torch.repeat_interleave`. + """ + ... + def requires_grad_(self, mode: _bool = True) -> Tensor: + r""" + requires_grad_(requires_grad=True) -> Tensor + + Change if autograd should record operations on this tensor: sets this tensor's + :attr:`requires_grad` attribute in-place. Returns this tensor. + + :func:`requires_grad_`'s main use case is to tell autograd to begin recording + operations on a Tensor ``tensor``. If ``tensor`` has ``requires_grad=False`` + (because it was obtained through a DataLoader, or required preprocessing or + initialization), ``tensor.requires_grad_()`` makes it so that autograd will + begin to record operations on ``tensor``. + + Args: + requires_grad (bool): If autograd should record operations on this tensor. + Default: ``True``. + + Example:: + + >>> # Let's say we want to preprocess some saved weights and use + >>> # the result as new weights. + >>> saved_weights = [0.1, 0.2, 0.3, 0.25] + >>> loaded_weights = torch.tensor(saved_weights) + >>> weights = preprocess(loaded_weights) # some function + >>> weights + tensor([-0.5503, 0.4926, -2.1158, -0.8303]) + + >>> # Now, start to record operations done to weights + >>> weights.requires_grad_() + >>> out = weights.pow(2).sum() + >>> out.backward() + >>> weights.grad + tensor([-1.1007, 0.9853, -4.2316, -1.6606]) + """ + ... + @overload + def reshape(self, shape: Sequence[Union[_int, SymInt]]) -> Tensor: + r""" + reshape(*shape) -> Tensor + + Returns a tensor with the same data and number of elements as :attr:`self` + but with the specified shape. This method returns a view if :attr:`shape` is + compatible with the current shape. See :meth:`torch.Tensor.view` on when it is + possible to return a view. + + See :func:`torch.reshape` + + Args: + shape (tuple of ints or int...): the desired shape + """ + ... + @overload + def reshape(self, *shape: _int) -> Tensor: + r""" + reshape(*shape) -> Tensor + + Returns a tensor with the same data and number of elements as :attr:`self` + but with the specified shape. This method returns a view if :attr:`shape` is + compatible with the current shape. See :meth:`torch.Tensor.view` on when it is + possible to return a view. 
+ + See :func:`torch.reshape` + + Args: + shape (tuple of ints or int...): the desired shape + """ + ... + def reshape_as(self, other: Tensor) -> Tensor: + r""" + reshape_as(other) -> Tensor + + Returns this tensor as the same shape as :attr:`other`. + ``self.reshape_as(other)`` is equivalent to ``self.reshape(other.sizes())``. + This method returns a view if ``other.sizes()`` is compatible with the current + shape. See :meth:`torch.Tensor.view` on when it is possible to return a view. + + Please see :meth:`reshape` for more information about ``reshape``. + + Args: + other (:class:`torch.Tensor`): The result tensor has the same shape + as :attr:`other`. + """ + ... + @overload + def resize_(self, size: Sequence[Union[_int, SymInt]], *, memory_format: Optional[memory_format] = None) -> Tensor: + r""" + resize_(*sizes, memory_format=torch.contiguous_format) -> Tensor + + Resizes :attr:`self` tensor to the specified size. If the number of elements is + larger than the current storage size, then the underlying storage is resized + to fit the new number of elements. If the number of elements is smaller, the + underlying storage is not changed. Existing elements are preserved but any new + memory is uninitialized. + + .. warning:: + + This is a low-level method. The storage is reinterpreted as C-contiguous, + ignoring the current strides (unless the target size equals the current + size, in which case the tensor is left unchanged). For most purposes, you + will instead want to use :meth:`~Tensor.view()`, which checks for + contiguity, or :meth:`~Tensor.reshape()`, which copies data if needed. To + change the size in-place with custom strides, see :meth:`~Tensor.set_()`. + + .. note:: + + If :func:`torch.use_deterministic_algorithms()` and + :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to + ``True``, new elements are initialized to prevent nondeterministic behavior + from using the result as an input to an operation. Floating point and + complex values are set to NaN, and integer values are set to the maximum + value. + + Args: + sizes (torch.Size or int...): the desired size + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + Tensor. Default: ``torch.contiguous_format``. Note that memory format of + :attr:`self` is going to be unaffected if ``self.size()`` matches ``sizes``. + + Example:: + + >>> x = torch.tensor([[1, 2], [3, 4], [5, 6]]) + >>> x.resize_(2, 2) + tensor([[ 1, 2], + [ 3, 4]]) + """ + ... + @overload + def resize_(self, *size: _int, memory_format: Optional[memory_format] = None) -> Tensor: + r""" + resize_(*sizes, memory_format=torch.contiguous_format) -> Tensor + + Resizes :attr:`self` tensor to the specified size. If the number of elements is + larger than the current storage size, then the underlying storage is resized + to fit the new number of elements. If the number of elements is smaller, the + underlying storage is not changed. Existing elements are preserved but any new + memory is uninitialized. + + .. warning:: + + This is a low-level method. The storage is reinterpreted as C-contiguous, + ignoring the current strides (unless the target size equals the current + size, in which case the tensor is left unchanged). For most purposes, you + will instead want to use :meth:`~Tensor.view()`, which checks for + contiguity, or :meth:`~Tensor.reshape()`, which copies data if needed. To + change the size in-place with custom strides, see :meth:`~Tensor.set_()`. + + .. 
note:: + + If :func:`torch.use_deterministic_algorithms()` and + :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to + ``True``, new elements are initialized to prevent nondeterministic behavior + from using the result as an input to an operation. Floating point and + complex values are set to NaN, and integer values are set to the maximum + value. + + Args: + sizes (torch.Size or int...): the desired size + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + Tensor. Default: ``torch.contiguous_format``. Note that memory format of + :attr:`self` is going to be unaffected if ``self.size()`` matches ``sizes``. + + Example:: + + >>> x = torch.tensor([[1, 2], [3, 4], [5, 6]]) + >>> x.resize_(2, 2) + tensor([[ 1, 2], + [ 3, 4]]) + """ + ... + def resize_as_(self, the_template: Tensor, *, memory_format: Optional[memory_format] = None) -> Tensor: + r""" + resize_as_(tensor, memory_format=torch.contiguous_format) -> Tensor + + Resizes the :attr:`self` tensor to be the same size as the specified + :attr:`tensor`. This is equivalent to ``self.resize_(tensor.size())``. + + Args: + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + Tensor. Default: ``torch.contiguous_format``. Note that memory format of + :attr:`self` is going to be unaffected if ``self.size()`` matches ``tensor.size()``. + """ + ... + def resize_as_sparse_(self, the_template: Tensor) -> Tensor: ... + def resolve_conj(self) -> Tensor: + r""" + resolve_conj() -> Tensor + + See :func:`torch.resolve_conj` + """ + ... + def resolve_neg(self) -> Tensor: + r""" + resolve_neg() -> Tensor + + See :func:`torch.resolve_neg` + """ + ... + def retain_grad(self) -> None: + r""" + retain_grad() -> None + + Enables this Tensor to have their :attr:`grad` populated during + :func:`backward`. This is a no-op for leaf tensors. + """ + ... + def roll(self, shifts: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]], dims: Union[_int, _size] = ()) -> Tensor: + r""" + roll(shifts, dims) -> Tensor + + See :func:`torch.roll` + """ + ... + def rot90(self, k: _int = 1, dims: _size = (0,1)) -> Tensor: + r""" + rot90(k, dims) -> Tensor + + See :func:`torch.rot90` + """ + ... + @overload + def round(self) -> Tensor: + r""" + round(decimals=0) -> Tensor + + See :func:`torch.round` + """ + ... + @overload + def round(self, *, decimals: _int) -> Tensor: + r""" + round(decimals=0) -> Tensor + + See :func:`torch.round` + """ + ... + @overload + def round_(self) -> Tensor: + r""" + round_(decimals=0) -> Tensor + + In-place version of :meth:`~Tensor.round` + """ + ... + @overload + def round_(self, *, decimals: _int) -> Tensor: + r""" + round_(decimals=0) -> Tensor + + In-place version of :meth:`~Tensor.round` + """ + ... + def row_indices(self) -> Tensor: ... + def rsqrt(self) -> Tensor: + r""" + rsqrt() -> Tensor + + See :func:`torch.rsqrt` + """ + ... + def rsqrt_(self) -> Tensor: + r""" + rsqrt_() -> Tensor + + In-place version of :meth:`~Tensor.rsqrt` + """ + ... + @overload + def scatter(self, dim: _int, index: Tensor, src: Tensor) -> Tensor: + r""" + scatter(dim, index, src) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_` + """ + ... + @overload + def scatter(self, dim: _int, index: Tensor, src: Tensor, *, reduce: str) -> Tensor: + r""" + scatter(dim, index, src) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_` + """ + ... 
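A brief illustrative aside on the out-of-place/in-place relationship noted above (assuming a standard ``torch`` install, not part of the generated stub): ``scatter`` returns a new tensor and leaves the base tensor unchanged, whereas ``scatter_`` would modify it in place::

    >>> import torch
    >>> base = torch.zeros(3, 5, dtype=torch.int64)
    >>> index = torch.tensor([[0, 1, 2, 0, 1]])
    >>> src = torch.arange(1, 6).unsqueeze(0)
    >>> base.scatter(0, index, src)   # new tensor
    tensor([[1, 0, 0, 4, 0],
            [0, 2, 0, 0, 5],
            [0, 0, 3, 0, 0]])
    >>> int(base.sum())               # base itself is still all zeros
    0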
+ @overload + def scatter(self, dim: _int, index: Tensor, value: Union[Number, _complex], *, reduce: str) -> Tensor: + r""" + scatter(dim, index, src) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_` + """ + ... + @overload + def scatter(self, dim: Union[str, ellipsis, None], index: Tensor, src: Tensor) -> Tensor: + r""" + scatter(dim, index, src) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_` + """ + ... + @overload + def scatter(self, dim: _int, index: Tensor, value: Union[Number, _complex]) -> Tensor: + r""" + scatter(dim, index, src) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_` + """ + ... + @overload + def scatter(self, dim: Union[str, ellipsis, None], index: Tensor, value: Union[Number, _complex]) -> Tensor: + r""" + scatter(dim, index, src) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_` + """ + ... + @overload + def scatter_(self, dim: _int, index: Tensor, src: Tensor) -> Tensor: + r""" + scatter_(dim, index, src, *, reduce=None) -> Tensor + + Writes all values from the tensor :attr:`src` into :attr:`self` at the indices + specified in the :attr:`index` tensor. For each value in :attr:`src`, its output + index is specified by its index in :attr:`src` for ``dimension != dim`` and by + the corresponding value in :attr:`index` for ``dimension = dim``. + + For a 3-D tensor, :attr:`self` is updated as:: + + self[index[i][j][k]][j][k] = src[i][j][k] # if dim == 0 + self[i][index[i][j][k]][k] = src[i][j][k] # if dim == 1 + self[i][j][index[i][j][k]] = src[i][j][k] # if dim == 2 + + This is the reverse operation of the manner described in :meth:`~Tensor.gather`. + + :attr:`self`, :attr:`index` and :attr:`src` (if it is a Tensor) should all have + the same number of dimensions. It is also required that + ``index.size(d) <= src.size(d)`` for all dimensions ``d``, and that + ``index.size(d) <= self.size(d)`` for all dimensions ``d != dim``. + Note that ``index`` and ``src`` do not broadcast. + + Moreover, as for :meth:`~Tensor.gather`, the values of :attr:`index` must be + between ``0`` and ``self.size(dim) - 1`` inclusive. + + .. warning:: + + When indices are not unique, the behavior is non-deterministic (one of the + values from ``src`` will be picked arbitrarily) and the gradient will be + incorrect (it will be propagated to all locations in the source that + correspond to the same index)! + + .. note:: + + The backward pass is implemented only for ``src.shape == index.shape``. + + Additionally accepts an optional :attr:`reduce` argument that allows + specification of an optional reduction operation, which is applied to all + values in the tensor :attr:`src` into :attr:`self` at the indices + specified in the :attr:`index`. For each value in :attr:`src`, the reduction + operation is applied to an index in :attr:`self` which is specified by + its index in :attr:`src` for ``dimension != dim`` and by the corresponding + value in :attr:`index` for ``dimension = dim``. + + Given a 3-D tensor and reduction using the multiplication operation, :attr:`self` + is updated as:: + + self[index[i][j][k]][j][k] *= src[i][j][k] # if dim == 0 + self[i][index[i][j][k]][k] *= src[i][j][k] # if dim == 1 + self[i][j][index[i][j][k]] *= src[i][j][k] # if dim == 2 + + Reducing with the addition operation is the same as using + :meth:`~torch.Tensor.scatter_add_`. + + .. warning:: + The reduce argument with Tensor ``src`` is deprecated and will be removed in + a future PyTorch release. 
Please use :meth:`~torch.Tensor.scatter_reduce_` + instead for more reduction options. + + Args: + dim (int): the axis along which to index + index (LongTensor): the indices of elements to scatter, can be either empty + or of the same dimensionality as ``src``. When empty, the operation + returns ``self`` unchanged. + src (Tensor): the source element(s) to scatter. + + Keyword args: + reduce (str, optional): reduction operation to apply, can be either + ``'add'`` or ``'multiply'``. + + Example:: + + >>> src = torch.arange(1, 11).reshape((2, 5)) + >>> src + tensor([[ 1, 2, 3, 4, 5], + [ 6, 7, 8, 9, 10]]) + >>> index = torch.tensor([[0, 1, 2, 0]]) + >>> torch.zeros(3, 5, dtype=src.dtype).scatter_(0, index, src) + tensor([[1, 0, 0, 4, 0], + [0, 2, 0, 0, 0], + [0, 0, 3, 0, 0]]) + >>> index = torch.tensor([[0, 1, 2], [0, 1, 4]]) + >>> torch.zeros(3, 5, dtype=src.dtype).scatter_(1, index, src) + tensor([[1, 2, 3, 0, 0], + [6, 7, 0, 0, 8], + [0, 0, 0, 0, 0]]) + + >>> torch.full((2, 4), 2.).scatter_(1, torch.tensor([[2], [3]]), + ... 1.23, reduce='multiply') + tensor([[2.0000, 2.0000, 2.4600, 2.0000], + [2.0000, 2.0000, 2.0000, 2.4600]]) + >>> torch.full((2, 4), 2.).scatter_(1, torch.tensor([[2], [3]]), + ... 1.23, reduce='add') + tensor([[2.0000, 2.0000, 3.2300, 2.0000], + [2.0000, 2.0000, 2.0000, 3.2300]]) + + .. function:: scatter_(dim, index, value, *, reduce=None) -> Tensor: + :noindex: + + Writes the value from :attr:`value` into :attr:`self` at the indices + specified in the :attr:`index` tensor. This operation is equivalent to the previous version, + with the :attr:`src` tensor filled entirely with :attr:`value`. + + Args: + dim (int): the axis along which to index + index (LongTensor): the indices of elements to scatter, can be either empty + or of the same dimensionality as ``src``. When empty, the operation + returns ``self`` unchanged. + value (Scalar): the value to scatter. + + Keyword args: + reduce (str, optional): reduction operation to apply, can be either + ``'add'`` or ``'multiply'``. + + Example:: + + >>> index = torch.tensor([[0, 1]]) + >>> value = 2 + >>> torch.zeros(3, 5).scatter_(0, index, value) + tensor([[2., 0., 0., 0., 0.], + [0., 2., 0., 0., 0.], + [0., 0., 0., 0., 0.]]) + """ + ... + @overload + def scatter_(self, dim: _int, index: Tensor, src: Tensor, *, reduce: str) -> Tensor: + r""" + scatter_(dim, index, src, *, reduce=None) -> Tensor + + Writes all values from the tensor :attr:`src` into :attr:`self` at the indices + specified in the :attr:`index` tensor. For each value in :attr:`src`, its output + index is specified by its index in :attr:`src` for ``dimension != dim`` and by + the corresponding value in :attr:`index` for ``dimension = dim``. + + For a 3-D tensor, :attr:`self` is updated as:: + + self[index[i][j][k]][j][k] = src[i][j][k] # if dim == 0 + self[i][index[i][j][k]][k] = src[i][j][k] # if dim == 1 + self[i][j][index[i][j][k]] = src[i][j][k] # if dim == 2 + + This is the reverse operation of the manner described in :meth:`~Tensor.gather`. + + :attr:`self`, :attr:`index` and :attr:`src` (if it is a Tensor) should all have + the same number of dimensions. It is also required that + ``index.size(d) <= src.size(d)`` for all dimensions ``d``, and that + ``index.size(d) <= self.size(d)`` for all dimensions ``d != dim``. + Note that ``index`` and ``src`` do not broadcast. + + Moreover, as for :meth:`~Tensor.gather`, the values of :attr:`index` must be + between ``0`` and ``self.size(dim) - 1`` inclusive. + + .. 
warning:: + + When indices are not unique, the behavior is non-deterministic (one of the + values from ``src`` will be picked arbitrarily) and the gradient will be + incorrect (it will be propagated to all locations in the source that + correspond to the same index)! + + .. note:: + + The backward pass is implemented only for ``src.shape == index.shape``. + + Additionally accepts an optional :attr:`reduce` argument that allows + specification of an optional reduction operation, which is applied to all + values in the tensor :attr:`src` into :attr:`self` at the indices + specified in the :attr:`index`. For each value in :attr:`src`, the reduction + operation is applied to an index in :attr:`self` which is specified by + its index in :attr:`src` for ``dimension != dim`` and by the corresponding + value in :attr:`index` for ``dimension = dim``. + + Given a 3-D tensor and reduction using the multiplication operation, :attr:`self` + is updated as:: + + self[index[i][j][k]][j][k] *= src[i][j][k] # if dim == 0 + self[i][index[i][j][k]][k] *= src[i][j][k] # if dim == 1 + self[i][j][index[i][j][k]] *= src[i][j][k] # if dim == 2 + + Reducing with the addition operation is the same as using + :meth:`~torch.Tensor.scatter_add_`. + + .. warning:: + The reduce argument with Tensor ``src`` is deprecated and will be removed in + a future PyTorch release. Please use :meth:`~torch.Tensor.scatter_reduce_` + instead for more reduction options. + + Args: + dim (int): the axis along which to index + index (LongTensor): the indices of elements to scatter, can be either empty + or of the same dimensionality as ``src``. When empty, the operation + returns ``self`` unchanged. + src (Tensor): the source element(s) to scatter. + + Keyword args: + reduce (str, optional): reduction operation to apply, can be either + ``'add'`` or ``'multiply'``. + + Example:: + + >>> src = torch.arange(1, 11).reshape((2, 5)) + >>> src + tensor([[ 1, 2, 3, 4, 5], + [ 6, 7, 8, 9, 10]]) + >>> index = torch.tensor([[0, 1, 2, 0]]) + >>> torch.zeros(3, 5, dtype=src.dtype).scatter_(0, index, src) + tensor([[1, 0, 0, 4, 0], + [0, 2, 0, 0, 0], + [0, 0, 3, 0, 0]]) + >>> index = torch.tensor([[0, 1, 2], [0, 1, 4]]) + >>> torch.zeros(3, 5, dtype=src.dtype).scatter_(1, index, src) + tensor([[1, 2, 3, 0, 0], + [6, 7, 0, 0, 8], + [0, 0, 0, 0, 0]]) + + >>> torch.full((2, 4), 2.).scatter_(1, torch.tensor([[2], [3]]), + ... 1.23, reduce='multiply') + tensor([[2.0000, 2.0000, 2.4600, 2.0000], + [2.0000, 2.0000, 2.0000, 2.4600]]) + >>> torch.full((2, 4), 2.).scatter_(1, torch.tensor([[2], [3]]), + ... 1.23, reduce='add') + tensor([[2.0000, 2.0000, 3.2300, 2.0000], + [2.0000, 2.0000, 2.0000, 3.2300]]) + + .. function:: scatter_(dim, index, value, *, reduce=None) -> Tensor: + :noindex: + + Writes the value from :attr:`value` into :attr:`self` at the indices + specified in the :attr:`index` tensor. This operation is equivalent to the previous version, + with the :attr:`src` tensor filled entirely with :attr:`value`. + + Args: + dim (int): the axis along which to index + index (LongTensor): the indices of elements to scatter, can be either empty + or of the same dimensionality as ``src``. When empty, the operation + returns ``self`` unchanged. + value (Scalar): the value to scatter. + + Keyword args: + reduce (str, optional): reduction operation to apply, can be either + ``'add'`` or ``'multiply'``. 
+ + Example:: + + >>> index = torch.tensor([[0, 1]]) + >>> value = 2 + >>> torch.zeros(3, 5).scatter_(0, index, value) + tensor([[2., 0., 0., 0., 0.], + [0., 2., 0., 0., 0.], + [0., 0., 0., 0., 0.]]) + """ + ... + @overload + def scatter_(self, dim: _int, index: Tensor, value: Union[Number, _complex], *, reduce: str) -> Tensor: + r""" + scatter_(dim, index, src, *, reduce=None) -> Tensor + + Writes all values from the tensor :attr:`src` into :attr:`self` at the indices + specified in the :attr:`index` tensor. For each value in :attr:`src`, its output + index is specified by its index in :attr:`src` for ``dimension != dim`` and by + the corresponding value in :attr:`index` for ``dimension = dim``. + + For a 3-D tensor, :attr:`self` is updated as:: + + self[index[i][j][k]][j][k] = src[i][j][k] # if dim == 0 + self[i][index[i][j][k]][k] = src[i][j][k] # if dim == 1 + self[i][j][index[i][j][k]] = src[i][j][k] # if dim == 2 + + This is the reverse operation of the manner described in :meth:`~Tensor.gather`. + + :attr:`self`, :attr:`index` and :attr:`src` (if it is a Tensor) should all have + the same number of dimensions. It is also required that + ``index.size(d) <= src.size(d)`` for all dimensions ``d``, and that + ``index.size(d) <= self.size(d)`` for all dimensions ``d != dim``. + Note that ``index`` and ``src`` do not broadcast. + + Moreover, as for :meth:`~Tensor.gather`, the values of :attr:`index` must be + between ``0`` and ``self.size(dim) - 1`` inclusive. + + .. warning:: + + When indices are not unique, the behavior is non-deterministic (one of the + values from ``src`` will be picked arbitrarily) and the gradient will be + incorrect (it will be propagated to all locations in the source that + correspond to the same index)! + + .. note:: + + The backward pass is implemented only for ``src.shape == index.shape``. + + Additionally accepts an optional :attr:`reduce` argument that allows + specification of an optional reduction operation, which is applied to all + values in the tensor :attr:`src` into :attr:`self` at the indices + specified in the :attr:`index`. For each value in :attr:`src`, the reduction + operation is applied to an index in :attr:`self` which is specified by + its index in :attr:`src` for ``dimension != dim`` and by the corresponding + value in :attr:`index` for ``dimension = dim``. + + Given a 3-D tensor and reduction using the multiplication operation, :attr:`self` + is updated as:: + + self[index[i][j][k]][j][k] *= src[i][j][k] # if dim == 0 + self[i][index[i][j][k]][k] *= src[i][j][k] # if dim == 1 + self[i][j][index[i][j][k]] *= src[i][j][k] # if dim == 2 + + Reducing with the addition operation is the same as using + :meth:`~torch.Tensor.scatter_add_`. + + .. warning:: + The reduce argument with Tensor ``src`` is deprecated and will be removed in + a future PyTorch release. Please use :meth:`~torch.Tensor.scatter_reduce_` + instead for more reduction options. + + Args: + dim (int): the axis along which to index + index (LongTensor): the indices of elements to scatter, can be either empty + or of the same dimensionality as ``src``. When empty, the operation + returns ``self`` unchanged. + src (Tensor): the source element(s) to scatter. + + Keyword args: + reduce (str, optional): reduction operation to apply, can be either + ``'add'`` or ``'multiply'``. 
+ + Example:: + + >>> src = torch.arange(1, 11).reshape((2, 5)) + >>> src + tensor([[ 1, 2, 3, 4, 5], + [ 6, 7, 8, 9, 10]]) + >>> index = torch.tensor([[0, 1, 2, 0]]) + >>> torch.zeros(3, 5, dtype=src.dtype).scatter_(0, index, src) + tensor([[1, 0, 0, 4, 0], + [0, 2, 0, 0, 0], + [0, 0, 3, 0, 0]]) + >>> index = torch.tensor([[0, 1, 2], [0, 1, 4]]) + >>> torch.zeros(3, 5, dtype=src.dtype).scatter_(1, index, src) + tensor([[1, 2, 3, 0, 0], + [6, 7, 0, 0, 8], + [0, 0, 0, 0, 0]]) + + >>> torch.full((2, 4), 2.).scatter_(1, torch.tensor([[2], [3]]), + ... 1.23, reduce='multiply') + tensor([[2.0000, 2.0000, 2.4600, 2.0000], + [2.0000, 2.0000, 2.0000, 2.4600]]) + >>> torch.full((2, 4), 2.).scatter_(1, torch.tensor([[2], [3]]), + ... 1.23, reduce='add') + tensor([[2.0000, 2.0000, 3.2300, 2.0000], + [2.0000, 2.0000, 2.0000, 3.2300]]) + + .. function:: scatter_(dim, index, value, *, reduce=None) -> Tensor: + :noindex: + + Writes the value from :attr:`value` into :attr:`self` at the indices + specified in the :attr:`index` tensor. This operation is equivalent to the previous version, + with the :attr:`src` tensor filled entirely with :attr:`value`. + + Args: + dim (int): the axis along which to index + index (LongTensor): the indices of elements to scatter, can be either empty + or of the same dimensionality as ``src``. When empty, the operation + returns ``self`` unchanged. + value (Scalar): the value to scatter. + + Keyword args: + reduce (str, optional): reduction operation to apply, can be either + ``'add'`` or ``'multiply'``. + + Example:: + + >>> index = torch.tensor([[0, 1]]) + >>> value = 2 + >>> torch.zeros(3, 5).scatter_(0, index, value) + tensor([[2., 0., 0., 0., 0.], + [0., 2., 0., 0., 0.], + [0., 0., 0., 0., 0.]]) + """ + ... + @overload + def scatter_(self, dim: _int, index: Tensor, value: Union[Number, _complex]) -> Tensor: + r""" + scatter_(dim, index, src, *, reduce=None) -> Tensor + + Writes all values from the tensor :attr:`src` into :attr:`self` at the indices + specified in the :attr:`index` tensor. For each value in :attr:`src`, its output + index is specified by its index in :attr:`src` for ``dimension != dim`` and by + the corresponding value in :attr:`index` for ``dimension = dim``. + + For a 3-D tensor, :attr:`self` is updated as:: + + self[index[i][j][k]][j][k] = src[i][j][k] # if dim == 0 + self[i][index[i][j][k]][k] = src[i][j][k] # if dim == 1 + self[i][j][index[i][j][k]] = src[i][j][k] # if dim == 2 + + This is the reverse operation of the manner described in :meth:`~Tensor.gather`. + + :attr:`self`, :attr:`index` and :attr:`src` (if it is a Tensor) should all have + the same number of dimensions. It is also required that + ``index.size(d) <= src.size(d)`` for all dimensions ``d``, and that + ``index.size(d) <= self.size(d)`` for all dimensions ``d != dim``. + Note that ``index`` and ``src`` do not broadcast. + + Moreover, as for :meth:`~Tensor.gather`, the values of :attr:`index` must be + between ``0`` and ``self.size(dim) - 1`` inclusive. + + .. warning:: + + When indices are not unique, the behavior is non-deterministic (one of the + values from ``src`` will be picked arbitrarily) and the gradient will be + incorrect (it will be propagated to all locations in the source that + correspond to the same index)! + + .. note:: + + The backward pass is implemented only for ``src.shape == index.shape``. 
+ + Additionally accepts an optional :attr:`reduce` argument that allows + specification of an optional reduction operation, which is applied to all + values in the tensor :attr:`src` into :attr:`self` at the indices + specified in the :attr:`index`. For each value in :attr:`src`, the reduction + operation is applied to an index in :attr:`self` which is specified by + its index in :attr:`src` for ``dimension != dim`` and by the corresponding + value in :attr:`index` for ``dimension = dim``. + + Given a 3-D tensor and reduction using the multiplication operation, :attr:`self` + is updated as:: + + self[index[i][j][k]][j][k] *= src[i][j][k] # if dim == 0 + self[i][index[i][j][k]][k] *= src[i][j][k] # if dim == 1 + self[i][j][index[i][j][k]] *= src[i][j][k] # if dim == 2 + + Reducing with the addition operation is the same as using + :meth:`~torch.Tensor.scatter_add_`. + + .. warning:: + The reduce argument with Tensor ``src`` is deprecated and will be removed in + a future PyTorch release. Please use :meth:`~torch.Tensor.scatter_reduce_` + instead for more reduction options. + + Args: + dim (int): the axis along which to index + index (LongTensor): the indices of elements to scatter, can be either empty + or of the same dimensionality as ``src``. When empty, the operation + returns ``self`` unchanged. + src (Tensor): the source element(s) to scatter. + + Keyword args: + reduce (str, optional): reduction operation to apply, can be either + ``'add'`` or ``'multiply'``. + + Example:: + + >>> src = torch.arange(1, 11).reshape((2, 5)) + >>> src + tensor([[ 1, 2, 3, 4, 5], + [ 6, 7, 8, 9, 10]]) + >>> index = torch.tensor([[0, 1, 2, 0]]) + >>> torch.zeros(3, 5, dtype=src.dtype).scatter_(0, index, src) + tensor([[1, 0, 0, 4, 0], + [0, 2, 0, 0, 0], + [0, 0, 3, 0, 0]]) + >>> index = torch.tensor([[0, 1, 2], [0, 1, 4]]) + >>> torch.zeros(3, 5, dtype=src.dtype).scatter_(1, index, src) + tensor([[1, 2, 3, 0, 0], + [6, 7, 0, 0, 8], + [0, 0, 0, 0, 0]]) + + >>> torch.full((2, 4), 2.).scatter_(1, torch.tensor([[2], [3]]), + ... 1.23, reduce='multiply') + tensor([[2.0000, 2.0000, 2.4600, 2.0000], + [2.0000, 2.0000, 2.0000, 2.4600]]) + >>> torch.full((2, 4), 2.).scatter_(1, torch.tensor([[2], [3]]), + ... 1.23, reduce='add') + tensor([[2.0000, 2.0000, 3.2300, 2.0000], + [2.0000, 2.0000, 2.0000, 3.2300]]) + + .. function:: scatter_(dim, index, value, *, reduce=None) -> Tensor: + :noindex: + + Writes the value from :attr:`value` into :attr:`self` at the indices + specified in the :attr:`index` tensor. This operation is equivalent to the previous version, + with the :attr:`src` tensor filled entirely with :attr:`value`. + + Args: + dim (int): the axis along which to index + index (LongTensor): the indices of elements to scatter, can be either empty + or of the same dimensionality as ``src``. When empty, the operation + returns ``self`` unchanged. + value (Scalar): the value to scatter. + + Keyword args: + reduce (str, optional): reduction operation to apply, can be either + ``'add'`` or ``'multiply'``. + + Example:: + + >>> index = torch.tensor([[0, 1]]) + >>> value = 2 + >>> torch.zeros(3, 5).scatter_(0, index, value) + tensor([[2., 0., 0., 0., 0.], + [0., 2., 0., 0., 0.], + [0., 0., 0., 0., 0.]]) + """ + ... + @overload + def scatter_add(self, dim: _int, index: Tensor, src: Tensor) -> Tensor: + r""" + scatter_add(dim, index, src) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_add_` + """ + ... 
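As a short illustrative aside (assuming a standard ``torch`` install): unlike plain ``scatter``, duplicate indices in ``scatter_add`` accumulate rather than overwrite, and the out-of-place form shown here leaves the base tensor unchanged::

    >>> import torch
    >>> index = torch.tensor([0, 1, 0, 1, 2])
    >>> src = torch.ones(5)
    >>> torch.zeros(3).scatter_add(0, index, src)
    tensor([2., 2., 1.])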
+ @overload + def scatter_add(self, dim: Union[str, ellipsis, None], index: Tensor, src: Tensor) -> Tensor: + r""" + scatter_add(dim, index, src) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_add_` + """ + ... + def scatter_add_(self, dim: _int, index: Tensor, src: Tensor) -> Tensor: + r""" + scatter_add_(dim, index, src) -> Tensor + + Adds all values from the tensor :attr:`src` into :attr:`self` at the indices + specified in the :attr:`index` tensor in a similar fashion as + :meth:`~torch.Tensor.scatter_`. For each value in :attr:`src`, it is added to + an index in :attr:`self` which is specified by its index in :attr:`src` + for ``dimension != dim`` and by the corresponding value in :attr:`index` for + ``dimension = dim``. + + For a 3-D tensor, :attr:`self` is updated as:: + + self[index[i][j][k]][j][k] += src[i][j][k] # if dim == 0 + self[i][index[i][j][k]][k] += src[i][j][k] # if dim == 1 + self[i][j][index[i][j][k]] += src[i][j][k] # if dim == 2 + + :attr:`self`, :attr:`index` and :attr:`src` should have same number of + dimensions. It is also required that ``index.size(d) <= src.size(d)`` for all + dimensions ``d``, and that ``index.size(d) <= self.size(d)`` for all dimensions + ``d != dim``. Note that ``index`` and ``src`` do not broadcast. + + Note: + This operation may behave nondeterministically when given tensors on a CUDA device. See :doc:`/notes/randomness` for more information. + + .. note:: + + The backward pass is implemented only for ``src.shape == index.shape``. + + Args: + dim (int): the axis along which to index + index (LongTensor): the indices of elements to scatter and add, can be + either empty or of the same dimensionality as ``src``. When empty, the + operation returns ``self`` unchanged. + src (Tensor): the source elements to scatter and add + + Example:: + + >>> src = torch.ones((2, 5)) + >>> index = torch.tensor([[0, 1, 2, 0, 0]]) + >>> torch.zeros(3, 5, dtype=src.dtype).scatter_add_(0, index, src) + tensor([[1., 0., 0., 1., 1.], + [0., 1., 0., 0., 0.], + [0., 0., 1., 0., 0.]]) + >>> index = torch.tensor([[0, 1, 2, 0, 0], [0, 1, 2, 2, 2]]) + >>> torch.zeros(3, 5, dtype=src.dtype).scatter_add_(0, index, src) + tensor([[2., 0., 0., 1., 1.], + [0., 2., 0., 0., 0.], + [0., 0., 2., 1., 1.]]) + """ + ... + def scatter_reduce(self, dim: _int, index: Tensor, src: Tensor, reduce: str, *, include_self: _bool = True) -> Tensor: + r""" + scatter_reduce(dim, index, src, reduce, *, include_self=True) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_reduce_` + """ + ... + def scatter_reduce_(self, dim: _int, index: Tensor, src: Tensor, reduce: str, *, include_self: _bool = True) -> Tensor: + r""" + scatter_reduce_(dim, index, src, reduce, *, include_self=True) -> Tensor + + Reduces all values from the :attr:`src` tensor to the indices specified in + the :attr:`index` tensor in the :attr:`self` tensor using the applied reduction + defined via the :attr:`reduce` argument (:obj:`"sum"`, :obj:`"prod"`, :obj:`"mean"`, + :obj:`"amax"`, :obj:`"amin"`). For each value in :attr:`src`, it is reduced to an + index in :attr:`self` which is specified by its index in :attr:`src` for + ``dimension != dim`` and by the corresponding value in :attr:`index` for + ``dimension = dim``. If :obj:`include_self="True"`, the values in the :attr:`self` + tensor are included in the reduction. + + :attr:`self`, :attr:`index` and :attr:`src` should all have + the same number of dimensions. 
It is also required that + ``index.size(d) <= src.size(d)`` for all dimensions ``d``, and that + ``index.size(d) <= self.size(d)`` for all dimensions ``d != dim``. + Note that ``index`` and ``src`` do not broadcast. + + For a 3-D tensor with :obj:`reduce="sum"` and :obj:`include_self=True` the + output is given as:: + + self[index[i][j][k]][j][k] += src[i][j][k] # if dim == 0 + self[i][index[i][j][k]][k] += src[i][j][k] # if dim == 1 + self[i][j][index[i][j][k]] += src[i][j][k] # if dim == 2 + + Note: + This operation may behave nondeterministically when given tensors on a CUDA device. See :doc:`/notes/randomness` for more information. + + .. note:: + + The backward pass is implemented only for ``src.shape == index.shape``. + + .. warning:: + + This function is in beta and may change in the near future. + + Args: + dim (int): the axis along which to index + index (LongTensor): the indices of elements to scatter and reduce. + src (Tensor): the source elements to scatter and reduce + reduce (str): the reduction operation to apply for non-unique indices + (:obj:`"sum"`, :obj:`"prod"`, :obj:`"mean"`, :obj:`"amax"`, :obj:`"amin"`) + include_self (bool): whether elements from the :attr:`self` tensor are + included in the reduction + + Example:: + + >>> src = torch.tensor([1., 2., 3., 4., 5., 6.]) + >>> index = torch.tensor([0, 1, 0, 1, 2, 1]) + >>> input = torch.tensor([1., 2., 3., 4.]) + >>> input.scatter_reduce(0, index, src, reduce="sum") + tensor([5., 14., 8., 4.]) + >>> input.scatter_reduce(0, index, src, reduce="sum", include_self=False) + tensor([4., 12., 5., 4.]) + >>> input2 = torch.tensor([5., 4., 3., 2.]) + >>> input2.scatter_reduce(0, index, src, reduce="amax") + tensor([5., 6., 5., 2.]) + >>> input2.scatter_reduce(0, index, src, reduce="amax", include_self=False) + tensor([3., 6., 5., 2.]) + """ + ... + @overload + def select(self, dim: _int, index: Union[_int, SymInt]) -> Tensor: + r""" + select(dim, index) -> Tensor + + See :func:`torch.select` + """ + ... + @overload + def select(self, dim: Union[str, ellipsis, None], index: _int) -> Tensor: + r""" + select(dim, index) -> Tensor + + See :func:`torch.select` + """ + ... + def select_scatter(self, src: Tensor, dim: _int, index: Union[_int, SymInt]) -> Tensor: + r""" + select_scatter(src, dim, index) -> Tensor + + See :func:`torch.select_scatter` + """ + ... + @overload + def set_(self, storage: Union[Storage, TypedStorage, UntypedStorage], offset: _int, size: _size, stride: _size) -> Tensor: + r""" + set_(source=None, storage_offset=0, size=None, stride=None) -> Tensor + + Sets the underlying storage, size, and strides. If :attr:`source` is a tensor, + :attr:`self` tensor will share the same storage and have the same size and + strides as :attr:`source`. Changes to elements in one tensor will be reflected + in the other. + + If :attr:`source` is a :class:`~torch.Storage`, the method sets the underlying + storage, offset, size, and stride. + + Args: + source (Tensor or Storage): the tensor or storage to use + storage_offset (int, optional): the offset in the storage + size (torch.Size, optional): the desired size. Defaults to the size of the source. + stride (tuple, optional): the desired stride. Defaults to C-contiguous strides. + """ + ... + @overload + def set_(self, storage: Union[Storage, TypedStorage, UntypedStorage]) -> Tensor: + r""" + set_(source=None, storage_offset=0, size=None, stride=None) -> Tensor + + Sets the underlying storage, size, and strides. 
If :attr:`source` is a tensor, + :attr:`self` tensor will share the same storage and have the same size and + strides as :attr:`source`. Changes to elements in one tensor will be reflected + in the other. + + If :attr:`source` is a :class:`~torch.Storage`, the method sets the underlying + storage, offset, size, and stride. + + Args: + source (Tensor or Storage): the tensor or storage to use + storage_offset (int, optional): the offset in the storage + size (torch.Size, optional): the desired size. Defaults to the size of the source. + stride (tuple, optional): the desired stride. Defaults to C-contiguous strides. + """ + ... + def sgn(self) -> Tensor: + r""" + sgn() -> Tensor + + See :func:`torch.sgn` + """ + ... + def sgn_(self) -> Tensor: + r""" + sgn_() -> Tensor + + In-place version of :meth:`~Tensor.sgn` + """ + ... + def short(self) -> Tensor: + r""" + short(memory_format=torch.preserve_format) -> Tensor + + ``self.short()`` is equivalent to ``self.to(torch.int16)``. See :func:`to`. + + Args: + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + """ + ... + def sigmoid(self) -> Tensor: + r""" + sigmoid() -> Tensor + + See :func:`torch.sigmoid` + """ + ... + def sigmoid_(self) -> Tensor: + r""" + sigmoid_() -> Tensor + + In-place version of :meth:`~Tensor.sigmoid` + """ + ... + def sign(self) -> Tensor: + r""" + sign() -> Tensor + + See :func:`torch.sign` + """ + ... + def sign_(self) -> Tensor: + r""" + sign_() -> Tensor + + In-place version of :meth:`~Tensor.sign` + """ + ... + def signbit(self) -> Tensor: + r""" + signbit() -> Tensor + + See :func:`torch.signbit` + """ + ... + def sin(self) -> Tensor: + r""" + sin() -> Tensor + + See :func:`torch.sin` + """ + ... + def sin_(self) -> Tensor: + r""" + sin_() -> Tensor + + In-place version of :meth:`~Tensor.sin` + """ + ... + def sinc(self) -> Tensor: + r""" + sinc() -> Tensor + + See :func:`torch.sinc` + """ + ... + def sinc_(self) -> Tensor: + r""" + sinc_() -> Tensor + + In-place version of :meth:`~Tensor.sinc` + """ + ... + def sinh(self) -> Tensor: + r""" + sinh() -> Tensor + + See :func:`torch.sinh` + """ + ... + def sinh_(self) -> Tensor: + r""" + sinh_() -> Tensor + + In-place version of :meth:`~Tensor.sinh` + """ + ... + @overload + def size(self, dim: None = None) -> Size: + r""" + size(dim=None) -> torch.Size or int + + Returns the size of the :attr:`self` tensor. If ``dim`` is not specified, + the returned value is a :class:`torch.Size`, a subclass of :class:`tuple`. + If ``dim`` is specified, returns an int holding the size of that dimension. + + Args: + dim (int, optional): The dimension for which to retrieve the size. + + Example:: + + >>> t = torch.empty(3, 4, 5) + >>> t.size() + torch.Size([3, 4, 5]) + >>> t.size(dim=1) + 4 + """ + ... + @overload + def size(self, dim: _int) -> _int: + r""" + size(dim=None) -> torch.Size or int + + Returns the size of the :attr:`self` tensor. If ``dim`` is not specified, + the returned value is a :class:`torch.Size`, a subclass of :class:`tuple`. + If ``dim`` is specified, returns an int holding the size of that dimension. + + Args: + dim (int, optional): The dimension for which to retrieve the size. + + Example:: + + >>> t = torch.empty(3, 4, 5) + >>> t.size() + torch.Size([3, 4, 5]) + >>> t.size(dim=1) + 4 + """ + ... 
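Referring back to the ``set_`` entry above, a minimal doctest-style sketch of the storage-sharing behaviour it describes (an illustrative aside, assuming a standard ``torch`` install)::

    >>> import torch
    >>> a = torch.zeros(3)
    >>> b = torch.tensor([])   # same dtype as ``a``
    >>> b.set_(a)              # ``b`` now shares ``a``'s storage, size and strides
    tensor([0., 0., 0.])
    >>> b.fill_(7.)
    tensor([7., 7., 7.])
    >>> a                      # the write through ``b`` is visible in ``a``
    tensor([7., 7., 7.])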
+ def slice_inverse(self, src: Tensor, dim: _int = 0, start: Optional[Union[_int, SymInt]] = None, end: Optional[Union[_int, SymInt]] = None, step: Union[_int, SymInt] = 1) -> Tensor: ... + def slice_scatter(self, src: Tensor, dim: _int = 0, start: Optional[Union[_int, SymInt]] = None, end: Optional[Union[_int, SymInt]] = None, step: Union[_int, SymInt] = 1) -> Tensor: + r""" + slice_scatter(src, dim=0, start=None, end=None, step=1) -> Tensor + + See :func:`torch.slice_scatter` + """ + ... + def slogdet(self) -> torch.return_types.slogdet: + r""" + slogdet() -> (Tensor, Tensor) + + See :func:`torch.slogdet` + """ + ... + def smm(self, mat2: Tensor) -> Tensor: + r""" + smm(mat) -> Tensor + + See :func:`torch.smm` + """ + ... + @overload + def softmax(self, dim: _int, dtype: Optional[_dtype] = None) -> Tensor: + r""" + softmax(dim) -> Tensor + + Alias for :func:`torch.nn.functional.softmax`. + """ + ... + @overload + def softmax(self, dim: Union[str, ellipsis, None], *, dtype: Optional[_dtype] = None) -> Tensor: + r""" + softmax(dim) -> Tensor + + Alias for :func:`torch.nn.functional.softmax`. + """ + ... + @overload + def sort(self, *, stable: Optional[_bool], dim: _int = -1, descending: _bool = False) -> torch.return_types.sort: + r""" + sort(dim=-1, descending=False) -> (Tensor, LongTensor) + + See :func:`torch.sort` + """ + ... + @overload + def sort(self, dim: _int = -1, descending: _bool = False) -> torch.return_types.sort: + r""" + sort(dim=-1, descending=False) -> (Tensor, LongTensor) + + See :func:`torch.sort` + """ + ... + @overload + def sort(self, *, stable: Optional[_bool], dim: Union[str, ellipsis, None], descending: _bool = False) -> torch.return_types.sort: + r""" + sort(dim=-1, descending=False) -> (Tensor, LongTensor) + + See :func:`torch.sort` + """ + ... + @overload + def sort(self, dim: Union[str, ellipsis, None], descending: _bool = False) -> torch.return_types.sort: + r""" + sort(dim=-1, descending=False) -> (Tensor, LongTensor) + + See :func:`torch.sort` + """ + ... + def sparse_dim(self) -> _int: + r""" + sparse_dim() -> int + + Return the number of sparse dimensions in a :ref:`sparse tensor ` :attr:`self`. + + .. note:: + Returns ``0`` if :attr:`self` is not a sparse tensor. + + See also :meth:`Tensor.dense_dim` and :ref:`hybrid tensors `. + """ + ... + def sparse_mask(self, mask: Tensor) -> Tensor: + r""" + sparse_mask(mask) -> Tensor + + Returns a new :ref:`sparse tensor ` with values from a + strided tensor :attr:`self` filtered by the indices of the sparse + tensor :attr:`mask`. The values of :attr:`mask` sparse tensor are + ignored. :attr:`self` and :attr:`mask` tensors must have the same + shape. + + .. note:: + + The returned sparse tensor might contain duplicate values if :attr:`mask` + is not coalesced. It is therefore advisable to pass ``mask.coalesce()`` + if such behavior is not desired. + + .. note:: + + The returned sparse tensor has the same indices as the sparse tensor + :attr:`mask`, even when the corresponding values in :attr:`self` are + zeros. + + Args: + mask (Tensor): a sparse tensor whose indices are used as a filter + + Example:: + + >>> nse = 5 + >>> dims = (5, 5, 2, 2) + >>> I = torch.cat([torch.randint(0, dims[0], size=(nse,)), + ... 
torch.randint(0, dims[1], size=(nse,))], 0).reshape(2, nse) + >>> V = torch.randn(nse, dims[2], dims[3]) + >>> S = torch.sparse_coo_tensor(I, V, dims).coalesce() + >>> D = torch.randn(dims) + >>> D.sparse_mask(S) + tensor(indices=tensor([[0, 0, 0, 2], + [0, 1, 4, 3]]), + values=tensor([[[ 1.6550, 0.2397], + [-0.1611, -0.0779]], + + [[ 0.2326, -1.0558], + [ 1.4711, 1.9678]], + + [[-0.5138, -0.0411], + [ 1.9417, 0.5158]], + + [[ 0.0793, 0.0036], + [-0.2569, -0.1055]]]), + size=(5, 5, 2, 2), nnz=4, layout=torch.sparse_coo) + """ + ... + def sparse_resize_(self, size: _size, sparse_dim: _int, dense_dim: _int) -> Tensor: + r""" + sparse_resize_(size, sparse_dim, dense_dim) -> Tensor + + Resizes :attr:`self` :ref:`sparse tensor ` to the desired + size and the number of sparse and dense dimensions. + + .. note:: + If the number of specified elements in :attr:`self` is zero, then + :attr:`size`, :attr:`sparse_dim`, and :attr:`dense_dim` can be any + size and positive integers such that ``len(size) == sparse_dim + + dense_dim``. + + If :attr:`self` specifies one or more elements, however, then each + dimension in :attr:`size` must not be smaller than the corresponding + dimension of :attr:`self`, :attr:`sparse_dim` must equal the number + of sparse dimensions in :attr:`self`, and :attr:`dense_dim` must + equal the number of dense dimensions in :attr:`self`. + + .. warning:: + Throws an error if :attr:`self` is not a sparse tensor. + + Args: + size (torch.Size): the desired size. If :attr:`self` is non-empty + sparse tensor, the desired size cannot be smaller than the + original size. + sparse_dim (int): the number of sparse dimensions + dense_dim (int): the number of dense dimensions + """ + ... + def sparse_resize_and_clear_(self, size: _size, sparse_dim: _int, dense_dim: _int) -> Tensor: + r""" + sparse_resize_and_clear_(size, sparse_dim, dense_dim) -> Tensor + + Removes all specified elements from a :ref:`sparse tensor + ` :attr:`self` and resizes :attr:`self` to the desired + size and the number of sparse and dense dimensions. + + .. warning: + Throws an error if :attr:`self` is not a sparse tensor. + + Args: + size (torch.Size): the desired size. + sparse_dim (int): the number of sparse dimensions + dense_dim (int): the number of dense dimensions + """ + ... + @overload + def split(self, split_size: _int, dim: _int = 0) -> Sequence[Tensor]: ... + @overload + def split(self, split_size: Tuple[_int, ...], dim: _int = 0) -> Sequence[Tensor]: ... + def split_with_sizes(self, split_sizes: Sequence[Union[_int, SymInt]], dim: _int = 0) -> Tuple[Tensor, ...]: ... + def sqrt(self) -> Tensor: + r""" + sqrt() -> Tensor + + See :func:`torch.sqrt` + """ + ... + def sqrt_(self) -> Tensor: + r""" + sqrt_() -> Tensor + + In-place version of :meth:`~Tensor.sqrt` + """ + ... + def square(self) -> Tensor: + r""" + square() -> Tensor + + See :func:`torch.square` + """ + ... + def square_(self) -> Tensor: + r""" + square_() -> Tensor + + In-place version of :meth:`~Tensor.square` + """ + ... + @overload + def squeeze(self) -> Tensor: + r""" + squeeze(dim=None) -> Tensor + + See :func:`torch.squeeze` + """ + ... + @overload + def squeeze(self, dim: _int) -> Tensor: + r""" + squeeze(dim=None) -> Tensor + + See :func:`torch.squeeze` + """ + ... + @overload + def squeeze(self, dim: _size) -> Tensor: + r""" + squeeze(dim=None) -> Tensor + + See :func:`torch.squeeze` + """ + ... + @overload + def squeeze(self, *dim: _int) -> Tensor: + r""" + squeeze(dim=None) -> Tensor + + See :func:`torch.squeeze` + """ + ... 
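A short illustrative aside on the ``squeeze`` overloads above and below (assuming a standard ``torch`` install): with no argument every size-1 dimension is removed, with a ``dim`` argument only that dimension is considered, and dimensions whose size is not 1 are left alone::

    >>> import torch
    >>> x = torch.zeros(2, 1, 3, 1)
    >>> x.squeeze().shape      # all size-1 dims removed
    torch.Size([2, 3])
    >>> x.squeeze(1).shape     # only dim 1 is considered
    torch.Size([2, 3, 1])
    >>> x.squeeze(0).shape     # dim 0 has size 2, so nothing changes
    torch.Size([2, 1, 3, 1])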
+ @overload + def squeeze(self, dim: Union[str, ellipsis, None]) -> Tensor: + r""" + squeeze(dim=None) -> Tensor + + See :func:`torch.squeeze` + """ + ... + @overload + def squeeze_(self) -> Tensor: + r""" + squeeze_(dim=None) -> Tensor + + In-place version of :meth:`~Tensor.squeeze` + """ + ... + @overload + def squeeze_(self, dim: _int) -> Tensor: + r""" + squeeze_(dim=None) -> Tensor + + In-place version of :meth:`~Tensor.squeeze` + """ + ... + @overload + def squeeze_(self, dim: _size) -> Tensor: + r""" + squeeze_(dim=None) -> Tensor + + In-place version of :meth:`~Tensor.squeeze` + """ + ... + @overload + def squeeze_(self, *dim: _int) -> Tensor: + r""" + squeeze_(dim=None) -> Tensor + + In-place version of :meth:`~Tensor.squeeze` + """ + ... + @overload + def squeeze_(self, dim: Union[str, ellipsis, None]) -> Tensor: + r""" + squeeze_(dim=None) -> Tensor + + In-place version of :meth:`~Tensor.squeeze` + """ + ... + def sspaddmm(self, mat1: Tensor, mat2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1) -> Tensor: + r""" + sspaddmm(mat1, mat2, *, beta=1, alpha=1) -> Tensor + + See :func:`torch.sspaddmm` + """ + ... + @overload + def std(self, dim: Optional[Union[_int, _size]], unbiased: _bool = True, keepdim: _bool = False) -> Tensor: + r""" + std(dim=None, *, correction=1, keepdim=False) -> Tensor + + See :func:`torch.std` + """ + ... + @overload + def std(self, dim: Optional[Union[_int, _size]] = None, *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False) -> Tensor: + r""" + std(dim=None, *, correction=1, keepdim=False) -> Tensor + + See :func:`torch.std` + """ + ... + @overload + def std(self, unbiased: _bool = True) -> Tensor: + r""" + std(dim=None, *, correction=1, keepdim=False) -> Tensor + + See :func:`torch.std` + """ + ... + @overload + def std(self, dim: Sequence[Union[str, ellipsis, None]], unbiased: _bool = True, keepdim: _bool = False) -> Tensor: + r""" + std(dim=None, *, correction=1, keepdim=False) -> Tensor + + See :func:`torch.std` + """ + ... + @overload + def std(self, dim: Sequence[Union[str, ellipsis, None]], *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False) -> Tensor: + r""" + std(dim=None, *, correction=1, keepdim=False) -> Tensor + + See :func:`torch.std` + """ + ... + def untyped_storage(self) -> UntypedStorage: ... + def storage_offset(self) -> _int: + r""" + storage_offset() -> int + + Returns :attr:`self` tensor's offset in the underlying storage in terms of + number of storage elements (not bytes). + + Example:: + + >>> x = torch.tensor([1, 2, 3, 4, 5]) + >>> x.storage_offset() + 0 + >>> x[3:].storage_offset() + 3 + """ + ... + def storage_type(self) -> Storage: ... + @overload + def stride(self, dim: None = None) -> Tuple[_int, ...]: + r""" + stride(dim) -> tuple or int + + Returns the stride of :attr:`self` tensor. + + Stride is the jump necessary to go from one element to the next one in the + specified dimension :attr:`dim`. A tuple of all strides is returned when no + argument is passed in. Otherwise, an integer value is returned as the stride in + the particular dimension :attr:`dim`. + + Args: + dim (int, optional): the desired dimension in which stride is required + + Example:: + + >>> x = torch.tensor([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]) + >>> x.stride() + (5, 1) + >>> x.stride(0) + 5 + >>> x.stride(-1) + 1 + """ + ... + @overload + def stride(self, dim: _int) -> _int: + r""" + stride(dim) -> tuple or int + + Returns the stride of :attr:`self` tensor. 
+ + Stride is the jump necessary to go from one element to the next one in the + specified dimension :attr:`dim`. A tuple of all strides is returned when no + argument is passed in. Otherwise, an integer value is returned as the stride in + the particular dimension :attr:`dim`. + + Args: + dim (int, optional): the desired dimension in which stride is required + + Example:: + + >>> x = torch.tensor([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]) + >>> x.stride() + (5, 1) + >>> x.stride(0) + 5 + >>> x.stride(-1) + 1 + """ + ... + def sub(self, other: Union[Tensor, Number, _complex, torch.SymInt, torch.SymFloat], *, alpha: Optional[Union[Number, _complex]] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + sub(other, *, alpha=1) -> Tensor + + See :func:`torch.sub`. + """ + ... + def sub_(self, other: Union[Tensor, Number, _complex, torch.SymInt, torch.SymFloat], *, alpha: Optional[Union[Number, _complex]] = 1) -> Tensor: + r""" + sub_(other, *, alpha=1) -> Tensor + + In-place version of :meth:`~Tensor.sub` + """ + ... + @overload + def subtract(self, other: Tensor, *, alpha: Union[Number, _complex] = 1) -> Tensor: + r""" + subtract(other, *, alpha=1) -> Tensor + + See :func:`torch.subtract`. + """ + ... + @overload + def subtract(self, other: Union[Number, _complex], alpha: Union[Number, _complex] = 1) -> Tensor: + r""" + subtract(other, *, alpha=1) -> Tensor + + See :func:`torch.subtract`. + """ + ... + @overload + def subtract_(self, other: Tensor, *, alpha: Union[Number, _complex] = 1) -> Tensor: + r""" + subtract_(other, *, alpha=1) -> Tensor + + In-place version of :meth:`~Tensor.subtract`. + """ + ... + @overload + def subtract_(self, other: Union[Number, _complex], alpha: Union[Number, _complex] = 1) -> Tensor: + r""" + subtract_(other, *, alpha=1) -> Tensor + + In-place version of :meth:`~Tensor.subtract`. + """ + ... + @overload + def sum(self, *, dtype: Optional[_dtype] = None) -> Tensor: + r""" + sum(dim=None, keepdim=False, dtype=None) -> Tensor + + See :func:`torch.sum` + """ + ... + @overload + def sum(self, dim: Optional[Union[_int, _size]], keepdim: _bool = False, *, dtype: Optional[_dtype] = None) -> Tensor: + r""" + sum(dim=None, keepdim=False, dtype=None) -> Tensor + + See :func:`torch.sum` + """ + ... + @overload + def sum(self, dim: Sequence[Union[str, ellipsis, None]], keepdim: _bool = False, *, dtype: Optional[_dtype] = None) -> Tensor: + r""" + sum(dim=None, keepdim=False, dtype=None) -> Tensor + + See :func:`torch.sum` + """ + ... + @overload + def sum_to_size(self, size: Sequence[Union[_int, SymInt]]) -> Tensor: + r""" + sum_to_size(*size) -> Tensor + + Sum ``this`` tensor to :attr:`size`. + :attr:`size` must be broadcastable to ``this`` tensor size. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + """ + ... + @overload + def sum_to_size(self, *size: _int) -> Tensor: + r""" + sum_to_size(*size) -> Tensor + + Sum ``this`` tensor to :attr:`size`. + :attr:`size` must be broadcastable to ``this`` tensor size. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + """ + ... + def svd(self, some: _bool = True, compute_uv: _bool = True) -> torch.return_types.svd: + r""" + svd(some=True, compute_uv=True) -> (Tensor, Tensor, Tensor) + + See :func:`torch.svd` + """ + ... + def swapaxes(self, axis0: _int, axis1: _int) -> Tensor: + r""" + swapaxes(axis0, axis1) -> Tensor + + See :func:`torch.swapaxes` + """ + ... 
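+    # Minimal usage sketch for sum_to_size above, which has no inline example in
+    # this stub; the target size must be broadcastable to the tensor's size:
+    #
+    #   >>> t = torch.ones(2, 3)
+    #   >>> t.sum_to_size(1, 3)
+    #   tensor([[2., 2., 2.]])
+    #   >>> t.sum_to_size(2, 1)
+    #   tensor([[3.],
+    #           [3.]])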
+ def swapaxes_(self, axis0: _int, axis1: _int) -> Tensor: + r""" + swapaxes_(axis0, axis1) -> Tensor + + In-place version of :meth:`~Tensor.swapaxes` + """ + ... + def swapdims(self, dim0: _int, dim1: _int) -> Tensor: + r""" + swapdims(dim0, dim1) -> Tensor + + See :func:`torch.swapdims` + """ + ... + def swapdims_(self, dim0: _int, dim1: _int) -> Tensor: + r""" + swapdims_(dim0, dim1) -> Tensor + + In-place version of :meth:`~Tensor.swapdims` + """ + ... + def t(self) -> Tensor: + r""" + t() -> Tensor + + See :func:`torch.t` + """ + ... + def t_(self) -> Tensor: + r""" + t_() -> Tensor + + In-place version of :meth:`~Tensor.t` + """ + ... + def take(self, index: Tensor) -> Tensor: + r""" + take(indices) -> Tensor + + See :func:`torch.take` + """ + ... + def take_along_dim(self, indices: Tensor, dim: Optional[_int] = None) -> Tensor: + r""" + take_along_dim(indices, dim) -> Tensor + + See :func:`torch.take_along_dim` + """ + ... + def tan(self) -> Tensor: + r""" + tan() -> Tensor + + See :func:`torch.tan` + """ + ... + def tan_(self) -> Tensor: + r""" + tan_() -> Tensor + + In-place version of :meth:`~Tensor.tan` + """ + ... + def tanh(self) -> Tensor: + r""" + tanh() -> Tensor + + See :func:`torch.tanh` + """ + ... + def tanh_(self) -> Tensor: + r""" + tanh_() -> Tensor + + In-place version of :meth:`~Tensor.tanh` + """ + ... + @overload + def tensor_split(self, indices: Sequence[Union[_int, SymInt]], dim: _int = 0) -> Tuple[Tensor, ...]: + r""" + tensor_split(indices_or_sections, dim=0) -> List of Tensors + + See :func:`torch.tensor_split` + """ + ... + @overload + def tensor_split(self, tensor_indices_or_sections: Tensor, dim: _int = 0) -> Tuple[Tensor, ...]: + r""" + tensor_split(indices_or_sections, dim=0) -> List of Tensors + + See :func:`torch.tensor_split` + """ + ... + @overload + def tensor_split(self, sections: Union[_int, SymInt], dim: _int = 0) -> Tuple[Tensor, ...]: + r""" + tensor_split(indices_or_sections, dim=0) -> List of Tensors + + See :func:`torch.tensor_split` + """ + ... + @overload + def tile(self, dims: Sequence[Union[_int, SymInt]]) -> Tensor: + r""" + tile(dims) -> Tensor + + See :func:`torch.tile` + """ + ... + @overload + def tile(self, *dims: _int) -> Tensor: + r""" + tile(dims) -> Tensor + + See :func:`torch.tile` + """ + ... + @overload + def to(self, dtype: _dtype, non_blocking: _bool = False, copy: _bool = False, *, memory_format: Optional[torch.memory_format] = None) -> Tensor: + r""" + to(*args, **kwargs) -> Tensor + + Performs Tensor dtype and/or device conversion. A :class:`torch.dtype` and :class:`torch.device` are + inferred from the arguments of ``self.to(*args, **kwargs)``. + + .. note:: + + If the ``self`` Tensor already + has the correct :class:`torch.dtype` and :class:`torch.device`, then ``self`` is returned. + Otherwise, the returned tensor is a copy of ``self`` with the desired + :class:`torch.dtype` and :class:`torch.device`. + + Here are the ways to call ``to``: + + .. method:: to(dtype, non_blocking=False, copy=False, memory_format=torch.preserve_format) -> Tensor + :noindex: + + Returns a Tensor with the specified :attr:`dtype` + + Args: + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + + .. method:: to(device=None, dtype=None, non_blocking=False, copy=False, memory_format=torch.preserve_format) -> Tensor + :noindex: + + Returns a Tensor with the specified :attr:`device` and (optional) + :attr:`dtype`. 
If :attr:`dtype` is ``None`` it is inferred to be ``self.dtype``. + When :attr:`non_blocking`, tries to convert asynchronously with respect to + the host if possible, e.g., converting a CPU Tensor with pinned memory to a + CUDA Tensor. + When :attr:`copy` is set, a new Tensor is created even when the Tensor + already matches the desired conversion. + + Args: + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + + .. method:: to(other, non_blocking=False, copy=False) -> Tensor + :noindex: + + Returns a Tensor with same :class:`torch.dtype` and :class:`torch.device` as + the Tensor :attr:`other`. When :attr:`non_blocking`, tries to convert + asynchronously with respect to the host if possible, e.g., converting a CPU + Tensor with pinned memory to a CUDA Tensor. + When :attr:`copy` is set, a new Tensor is created even when the Tensor + already matches the desired conversion. + + Example:: + + >>> tensor = torch.randn(2, 2) # Initially dtype=float32, device=cpu + >>> tensor.to(torch.float64) + tensor([[-0.5044, 0.0005], + [ 0.3310, -0.0584]], dtype=torch.float64) + + >>> cuda0 = torch.device('cuda:0') + >>> tensor.to(cuda0) + tensor([[-0.5044, 0.0005], + [ 0.3310, -0.0584]], device='cuda:0') + + >>> tensor.to(cuda0, dtype=torch.float64) + tensor([[-0.5044, 0.0005], + [ 0.3310, -0.0584]], dtype=torch.float64, device='cuda:0') + + >>> other = torch.randn((), dtype=torch.float64, device=cuda0) + >>> tensor.to(other, non_blocking=True) + tensor([[-0.5044, 0.0005], + [ 0.3310, -0.0584]], dtype=torch.float64, device='cuda:0') + """ + ... + @overload + def to(self, device: Optional[DeviceLikeType] = None, dtype: Optional[_dtype] = None, non_blocking: _bool = False, copy: _bool = False, *, memory_format: Optional[torch.memory_format] = None) -> Tensor: + r""" + to(*args, **kwargs) -> Tensor + + Performs Tensor dtype and/or device conversion. A :class:`torch.dtype` and :class:`torch.device` are + inferred from the arguments of ``self.to(*args, **kwargs)``. + + .. note:: + + If the ``self`` Tensor already + has the correct :class:`torch.dtype` and :class:`torch.device`, then ``self`` is returned. + Otherwise, the returned tensor is a copy of ``self`` with the desired + :class:`torch.dtype` and :class:`torch.device`. + + Here are the ways to call ``to``: + + .. method:: to(dtype, non_blocking=False, copy=False, memory_format=torch.preserve_format) -> Tensor + :noindex: + + Returns a Tensor with the specified :attr:`dtype` + + Args: + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + + .. method:: to(device=None, dtype=None, non_blocking=False, copy=False, memory_format=torch.preserve_format) -> Tensor + :noindex: + + Returns a Tensor with the specified :attr:`device` and (optional) + :attr:`dtype`. If :attr:`dtype` is ``None`` it is inferred to be ``self.dtype``. + When :attr:`non_blocking`, tries to convert asynchronously with respect to + the host if possible, e.g., converting a CPU Tensor with pinned memory to a + CUDA Tensor. + When :attr:`copy` is set, a new Tensor is created even when the Tensor + already matches the desired conversion. + + Args: + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + + .. 
method:: to(other, non_blocking=False, copy=False) -> Tensor + :noindex: + + Returns a Tensor with same :class:`torch.dtype` and :class:`torch.device` as + the Tensor :attr:`other`. When :attr:`non_blocking`, tries to convert + asynchronously with respect to the host if possible, e.g., converting a CPU + Tensor with pinned memory to a CUDA Tensor. + When :attr:`copy` is set, a new Tensor is created even when the Tensor + already matches the desired conversion. + + Example:: + + >>> tensor = torch.randn(2, 2) # Initially dtype=float32, device=cpu + >>> tensor.to(torch.float64) + tensor([[-0.5044, 0.0005], + [ 0.3310, -0.0584]], dtype=torch.float64) + + >>> cuda0 = torch.device('cuda:0') + >>> tensor.to(cuda0) + tensor([[-0.5044, 0.0005], + [ 0.3310, -0.0584]], device='cuda:0') + + >>> tensor.to(cuda0, dtype=torch.float64) + tensor([[-0.5044, 0.0005], + [ 0.3310, -0.0584]], dtype=torch.float64, device='cuda:0') + + >>> other = torch.randn((), dtype=torch.float64, device=cuda0) + >>> tensor.to(other, non_blocking=True) + tensor([[-0.5044, 0.0005], + [ 0.3310, -0.0584]], dtype=torch.float64, device='cuda:0') + """ + ... + @overload + def to(self, other: Tensor, non_blocking: _bool = False, copy: _bool = False, *, memory_format: Optional[torch.memory_format] = None) -> Tensor: + r""" + to(*args, **kwargs) -> Tensor + + Performs Tensor dtype and/or device conversion. A :class:`torch.dtype` and :class:`torch.device` are + inferred from the arguments of ``self.to(*args, **kwargs)``. + + .. note:: + + If the ``self`` Tensor already + has the correct :class:`torch.dtype` and :class:`torch.device`, then ``self`` is returned. + Otherwise, the returned tensor is a copy of ``self`` with the desired + :class:`torch.dtype` and :class:`torch.device`. + + Here are the ways to call ``to``: + + .. method:: to(dtype, non_blocking=False, copy=False, memory_format=torch.preserve_format) -> Tensor + :noindex: + + Returns a Tensor with the specified :attr:`dtype` + + Args: + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + + .. method:: to(device=None, dtype=None, non_blocking=False, copy=False, memory_format=torch.preserve_format) -> Tensor + :noindex: + + Returns a Tensor with the specified :attr:`device` and (optional) + :attr:`dtype`. If :attr:`dtype` is ``None`` it is inferred to be ``self.dtype``. + When :attr:`non_blocking`, tries to convert asynchronously with respect to + the host if possible, e.g., converting a CPU Tensor with pinned memory to a + CUDA Tensor. + When :attr:`copy` is set, a new Tensor is created even when the Tensor + already matches the desired conversion. + + Args: + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + + .. method:: to(other, non_blocking=False, copy=False) -> Tensor + :noindex: + + Returns a Tensor with same :class:`torch.dtype` and :class:`torch.device` as + the Tensor :attr:`other`. When :attr:`non_blocking`, tries to convert + asynchronously with respect to the host if possible, e.g., converting a CPU + Tensor with pinned memory to a CUDA Tensor. + When :attr:`copy` is set, a new Tensor is created even when the Tensor + already matches the desired conversion. 
+ + Example:: + + >>> tensor = torch.randn(2, 2) # Initially dtype=float32, device=cpu + >>> tensor.to(torch.float64) + tensor([[-0.5044, 0.0005], + [ 0.3310, -0.0584]], dtype=torch.float64) + + >>> cuda0 = torch.device('cuda:0') + >>> tensor.to(cuda0) + tensor([[-0.5044, 0.0005], + [ 0.3310, -0.0584]], device='cuda:0') + + >>> tensor.to(cuda0, dtype=torch.float64) + tensor([[-0.5044, 0.0005], + [ 0.3310, -0.0584]], dtype=torch.float64, device='cuda:0') + + >>> other = torch.randn((), dtype=torch.float64, device=cuda0) + >>> tensor.to(other, non_blocking=True) + tensor([[-0.5044, 0.0005], + [ 0.3310, -0.0584]], dtype=torch.float64, device='cuda:0') + """ + ... + def to_dense(self, dtype: Optional[_dtype] = None, *, masked_grad: Optional[_bool] = None) -> Tensor: + r""" + to_dense(dtype=None, *, masked_grad=True) -> Tensor + + Creates a strided copy of :attr:`self` if :attr:`self` is not a strided tensor, otherwise returns :attr:`self`. + + Keyword args: + {dtype} + masked_grad (bool, optional): If set to ``True`` (default) and + :attr:`self` has a sparse layout then the backward of + :meth:`to_dense` returns ``grad.sparse_mask(self)``. + + Example:: + + >>> s = torch.sparse_coo_tensor( + ... torch.tensor([[1, 1], + ... [0, 2]]), + ... torch.tensor([9, 10]), + ... size=(3, 3)) + >>> s.to_dense() + tensor([[ 0, 0, 0], + [ 9, 0, 10], + [ 0, 0, 0]]) + """ + ... + def to_mkldnn(self, dtype: Optional[_dtype] = None) -> Tensor: + r""" + to_mkldnn() -> Tensor + Returns a copy of the tensor in ``torch.mkldnn`` layout. + """ + ... + def to_padded_tensor(self, padding: _float, output_size: Optional[Sequence[Union[_int, SymInt]]] = None) -> Tensor: + r""" + to_padded_tensor(padding, output_size=None) -> Tensor + See :func:`to_padded_tensor` + """ + ... + @overload + def to_sparse(self, *, layout: Optional[_layout] = None, blocksize: Optional[Union[_int, _size]] = None, dense_dim: Optional[_int] = None) -> Tensor: + r""" + to_sparse(sparseDims) -> Tensor + + Returns a sparse copy of the tensor. PyTorch supports sparse tensors in + :ref:`coordinate format `. + + Args: + sparseDims (int, optional): the number of sparse dimensions to include in the new sparse tensor + + Example:: + + >>> d = torch.tensor([[0, 0, 0], [9, 0, 10], [0, 0, 0]]) + >>> d + tensor([[ 0, 0, 0], + [ 9, 0, 10], + [ 0, 0, 0]]) + >>> d.to_sparse() + tensor(indices=tensor([[1, 1], + [0, 2]]), + values=tensor([ 9, 10]), + size=(3, 3), nnz=2, layout=torch.sparse_coo) + >>> d.to_sparse(1) + tensor(indices=tensor([[1]]), + values=tensor([[ 9, 0, 10]]), + size=(3, 3), nnz=1, layout=torch.sparse_coo) + + .. method:: to_sparse(*, layout=None, blocksize=None, dense_dim=None) -> Tensor + :noindex: + + Returns a sparse tensor with the specified layout and blocksize. If + the :attr:`self` is strided, the number of dense dimensions could be + specified, and a hybrid sparse tensor will be created, with + `dense_dim` dense dimensions and `self.dim() - 2 - dense_dim` batch + dimension. + + .. note:: If the :attr:`self` layout and blocksize parameters match + with the specified layout and blocksize, return + :attr:`self`. Otherwise, return a sparse tensor copy of + :attr:`self`. + + Args: + + layout (:class:`torch.layout`, optional): The desired sparse + layout. One of ``torch.sparse_coo``, ``torch.sparse_csr``, + ``torch.sparse_csc``, ``torch.sparse_bsr``, or + ``torch.sparse_bsc``. Default: if ``None``, + ``torch.sparse_coo``. + + blocksize (list, tuple, :class:`torch.Size`, optional): Block size + of the resulting BSR or BSC tensor. 
For other layouts, + specifying the block size that is not ``None`` will result in a + RuntimeError exception. A block size must be a tuple of length + two such that its items evenly divide the two sparse dimensions. + + dense_dim (int, optional): Number of dense dimensions of the + resulting CSR, CSC, BSR or BSC tensor. This argument should be + used only if :attr:`self` is a strided tensor, and must be a + value between 0 and dimension of :attr:`self` tensor minus two. + + Example:: + + >>> x = torch.tensor([[1, 0], [0, 0], [2, 3]]) + >>> x.to_sparse(layout=torch.sparse_coo) + tensor(indices=tensor([[0, 2, 2], + [0, 0, 1]]), + values=tensor([1, 2, 3]), + size=(3, 2), nnz=3, layout=torch.sparse_coo) + >>> x.to_sparse(layout=torch.sparse_bsr, blocksize=(1, 2)) + tensor(crow_indices=tensor([0, 1, 1, 2]), + col_indices=tensor([0, 0]), + values=tensor([[[1, 0]], + [[2, 3]]]), size=(3, 2), nnz=2, layout=torch.sparse_bsr) + >>> x.to_sparse(layout=torch.sparse_bsr, blocksize=(2, 1)) + RuntimeError: Tensor size(-2) 3 needs to be divisible by blocksize[0] 2 + >>> x.to_sparse(layout=torch.sparse_csr, blocksize=(3, 1)) + RuntimeError: to_sparse for Strided to SparseCsr conversion does not use specified blocksize + + >>> x = torch.tensor([[[1], [0]], [[0], [0]], [[2], [3]]]) + >>> x.to_sparse(layout=torch.sparse_csr, dense_dim=1) + tensor(crow_indices=tensor([0, 1, 1, 3]), + col_indices=tensor([0, 0, 1]), + values=tensor([[1], + [2], + [3]]), size=(3, 2, 1), nnz=3, layout=torch.sparse_csr) + """ + ... + @overload + def to_sparse(self, sparse_dim: _int) -> Tensor: + r""" + to_sparse(sparseDims) -> Tensor + + Returns a sparse copy of the tensor. PyTorch supports sparse tensors in + :ref:`coordinate format `. + + Args: + sparseDims (int, optional): the number of sparse dimensions to include in the new sparse tensor + + Example:: + + >>> d = torch.tensor([[0, 0, 0], [9, 0, 10], [0, 0, 0]]) + >>> d + tensor([[ 0, 0, 0], + [ 9, 0, 10], + [ 0, 0, 0]]) + >>> d.to_sparse() + tensor(indices=tensor([[1, 1], + [0, 2]]), + values=tensor([ 9, 10]), + size=(3, 3), nnz=2, layout=torch.sparse_coo) + >>> d.to_sparse(1) + tensor(indices=tensor([[1]]), + values=tensor([[ 9, 0, 10]]), + size=(3, 3), nnz=1, layout=torch.sparse_coo) + + .. method:: to_sparse(*, layout=None, blocksize=None, dense_dim=None) -> Tensor + :noindex: + + Returns a sparse tensor with the specified layout and blocksize. If + the :attr:`self` is strided, the number of dense dimensions could be + specified, and a hybrid sparse tensor will be created, with + `dense_dim` dense dimensions and `self.dim() - 2 - dense_dim` batch + dimension. + + .. note:: If the :attr:`self` layout and blocksize parameters match + with the specified layout and blocksize, return + :attr:`self`. Otherwise, return a sparse tensor copy of + :attr:`self`. + + Args: + + layout (:class:`torch.layout`, optional): The desired sparse + layout. One of ``torch.sparse_coo``, ``torch.sparse_csr``, + ``torch.sparse_csc``, ``torch.sparse_bsr``, or + ``torch.sparse_bsc``. Default: if ``None``, + ``torch.sparse_coo``. + + blocksize (list, tuple, :class:`torch.Size`, optional): Block size + of the resulting BSR or BSC tensor. For other layouts, + specifying the block size that is not ``None`` will result in a + RuntimeError exception. A block size must be a tuple of length + two such that its items evenly divide the two sparse dimensions. + + dense_dim (int, optional): Number of dense dimensions of the + resulting CSR, CSC, BSR or BSC tensor. 
This argument should be + used only if :attr:`self` is a strided tensor, and must be a + value between 0 and dimension of :attr:`self` tensor minus two. + + Example:: + + >>> x = torch.tensor([[1, 0], [0, 0], [2, 3]]) + >>> x.to_sparse(layout=torch.sparse_coo) + tensor(indices=tensor([[0, 2, 2], + [0, 0, 1]]), + values=tensor([1, 2, 3]), + size=(3, 2), nnz=3, layout=torch.sparse_coo) + >>> x.to_sparse(layout=torch.sparse_bsr, blocksize=(1, 2)) + tensor(crow_indices=tensor([0, 1, 1, 2]), + col_indices=tensor([0, 0]), + values=tensor([[[1, 0]], + [[2, 3]]]), size=(3, 2), nnz=2, layout=torch.sparse_bsr) + >>> x.to_sparse(layout=torch.sparse_bsr, blocksize=(2, 1)) + RuntimeError: Tensor size(-2) 3 needs to be divisible by blocksize[0] 2 + >>> x.to_sparse(layout=torch.sparse_csr, blocksize=(3, 1)) + RuntimeError: to_sparse for Strided to SparseCsr conversion does not use specified blocksize + + >>> x = torch.tensor([[[1], [0]], [[0], [0]], [[2], [3]]]) + >>> x.to_sparse(layout=torch.sparse_csr, dense_dim=1) + tensor(crow_indices=tensor([0, 1, 1, 3]), + col_indices=tensor([0, 0, 1]), + values=tensor([[1], + [2], + [3]]), size=(3, 2, 1), nnz=3, layout=torch.sparse_csr) + """ + ... + def to_sparse_bsc(self, blocksize: Union[_int, _size], dense_dim: Optional[_int] = None) -> Tensor: + r""" + to_sparse_bsc(blocksize, dense_dim) -> Tensor + + Convert a tensor to a block sparse column (BSC) storage format of + given blocksize. If the :attr:`self` is strided, then the number of + dense dimensions could be specified, and a hybrid BSC tensor will be + created, with `dense_dim` dense dimensions and `self.dim() - 2 - + dense_dim` batch dimension. + + Args: + + blocksize (list, tuple, :class:`torch.Size`, optional): Block size + of the resulting BSC tensor. A block size must be a tuple of + length two such that its items evenly divide the two sparse + dimensions. + + dense_dim (int, optional): Number of dense dimensions of the + resulting BSC tensor. This argument should be used only if + :attr:`self` is a strided tensor, and must be a value between 0 + and dimension of :attr:`self` tensor minus two. + + Example:: + + >>> dense = torch.randn(10, 10) + >>> sparse = dense.to_sparse_csr() + >>> sparse_bsc = sparse.to_sparse_bsc((5, 5)) + >>> sparse_bsc.row_indices() + tensor([0, 1, 0, 1]) + + >>> dense = torch.zeros(4, 3, 1) + >>> dense[0:2, 0] = dense[0:2, 2] = dense[2:4, 1] = 1 + >>> dense.to_sparse_bsc((2, 1), 1) + tensor(ccol_indices=tensor([0, 1, 2, 3]), + row_indices=tensor([0, 1, 0]), + values=tensor([[[[1.]], + + [[1.]]], + + + [[[1.]], + + [[1.]]], + + + [[[1.]], + + [[1.]]]]), size=(4, 3, 1), nnz=3, + layout=torch.sparse_bsc) + """ + ... + def to_sparse_bsr(self, blocksize: Union[_int, _size], dense_dim: Optional[_int] = None) -> Tensor: + r""" + to_sparse_bsr(blocksize, dense_dim) -> Tensor + + Convert a tensor to a block sparse row (BSR) storage format of given + blocksize. If the :attr:`self` is strided, then the number of dense + dimensions could be specified, and a hybrid BSR tensor will be + created, with `dense_dim` dense dimensions and `self.dim() - 2 - + dense_dim` batch dimension. + + Args: + + blocksize (list, tuple, :class:`torch.Size`, optional): Block size + of the resulting BSR tensor. A block size must be a tuple of + length two such that its items evenly divide the two sparse + dimensions. + + dense_dim (int, optional): Number of dense dimensions of the + resulting BSR tensor. 
This argument should be used only if + :attr:`self` is a strided tensor, and must be a value between 0 + and dimension of :attr:`self` tensor minus two. + + Example:: + + >>> dense = torch.randn(10, 10) + >>> sparse = dense.to_sparse_csr() + >>> sparse_bsr = sparse.to_sparse_bsr((5, 5)) + >>> sparse_bsr.col_indices() + tensor([0, 1, 0, 1]) + + >>> dense = torch.zeros(4, 3, 1) + >>> dense[0:2, 0] = dense[0:2, 2] = dense[2:4, 1] = 1 + >>> dense.to_sparse_bsr((2, 1), 1) + tensor(crow_indices=tensor([0, 2, 3]), + col_indices=tensor([0, 2, 1]), + values=tensor([[[[1.]], + + [[1.]]], + + + [[[1.]], + + [[1.]]], + + + [[[1.]], + + [[1.]]]]), size=(4, 3, 1), nnz=3, + layout=torch.sparse_bsr) + """ + ... + def to_sparse_csc(self, dense_dim: Optional[_int] = None) -> Tensor: + r""" + to_sparse_csc() -> Tensor + + Convert a tensor to compressed column storage (CSC) format. Except + for strided tensors, only works with 2D tensors. If the :attr:`self` + is strided, then the number of dense dimensions could be specified, + and a hybrid CSC tensor will be created, with `dense_dim` dense + dimensions and `self.dim() - 2 - dense_dim` batch dimension. + + Args: + + dense_dim (int, optional): Number of dense dimensions of the + resulting CSC tensor. This argument should be used only if + :attr:`self` is a strided tensor, and must be a value between 0 + and dimension of :attr:`self` tensor minus two. + + Example:: + + >>> dense = torch.randn(5, 5) + >>> sparse = dense.to_sparse_csc() + >>> sparse._nnz() + 25 + + >>> dense = torch.zeros(3, 3, 1, 1) + >>> dense[0, 0] = dense[1, 2] = dense[2, 1] = 1 + >>> dense.to_sparse_csc(dense_dim=2) + tensor(ccol_indices=tensor([0, 1, 2, 3]), + row_indices=tensor([0, 2, 1]), + values=tensor([[[1.]], + + [[1.]], + + [[1.]]]), size=(3, 3, 1, 1), nnz=3, + layout=torch.sparse_csc) + """ + ... + def to_sparse_csr(self, dense_dim: Optional[_int] = None) -> Tensor: + r""" + to_sparse_csr(dense_dim=None) -> Tensor + + Convert a tensor to compressed row storage format (CSR). Except for + strided tensors, only works with 2D tensors. If the :attr:`self` is + strided, then the number of dense dimensions could be specified, and a + hybrid CSR tensor will be created, with `dense_dim` dense dimensions + and `self.dim() - 2 - dense_dim` batch dimension. + + Args: + + dense_dim (int, optional): Number of dense dimensions of the + resulting CSR tensor. This argument should be used only if + :attr:`self` is a strided tensor, and must be a value between 0 + and dimension of :attr:`self` tensor minus two. + + Example:: + + >>> dense = torch.randn(5, 5) + >>> sparse = dense.to_sparse_csr() + >>> sparse._nnz() + 25 + + >>> dense = torch.zeros(3, 3, 1, 1) + >>> dense[0, 0] = dense[1, 2] = dense[2, 1] = 1 + >>> dense.to_sparse_csr(dense_dim=2) + tensor(crow_indices=tensor([0, 1, 2, 3]), + col_indices=tensor([0, 2, 1]), + values=tensor([[[1.]], + + [[1.]], + + [[1.]]]), size=(3, 3, 1, 1), nnz=3, + layout=torch.sparse_csr) + """ + ... + def tolist(self) -> List: + r""" + tolist() -> list or number + + Returns the tensor as a (nested) list. For scalars, a standard + Python number is returned, just like with :meth:`~Tensor.item`. + Tensors are automatically moved to the CPU first if necessary. + + This operation is not differentiable. + + Examples:: + + >>> a = torch.randn(2, 2) + >>> a.tolist() + [[0.012766935862600803, 0.5415473580360413], + [-0.08909505605697632, 0.7729271650314331]] + >>> a[0,0].tolist() + 0.012766935862600803 + """ + ... 
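+    # Round-trip sketch tying together to_sparse_csr and to_dense documented
+    # above; the input is deterministic, so the check holds on any build:
+    #
+    #   >>> d = torch.eye(3)
+    #   >>> torch.equal(d.to_sparse_csr().to_dense(), d)
+    #   True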
+ def topk(self, k: Union[_int, SymInt], dim: _int = -1, largest: _bool = True, sorted: _bool = True) -> torch.return_types.topk: + r""" + topk(k, dim=None, largest=True, sorted=True) -> (Tensor, LongTensor) + + See :func:`torch.topk` + """ + ... + def trace(self) -> Tensor: + r""" + trace() -> Tensor + + See :func:`torch.trace` + """ + ... + @overload + def transpose(self, dim0: _int, dim1: _int) -> Tensor: + r""" + transpose(dim0, dim1) -> Tensor + + See :func:`torch.transpose` + """ + ... + @overload + def transpose(self, dim0: Union[str, ellipsis, None], dim1: Union[str, ellipsis, None]) -> Tensor: + r""" + transpose(dim0, dim1) -> Tensor + + See :func:`torch.transpose` + """ + ... + def transpose_(self, dim0: _int, dim1: _int) -> Tensor: + r""" + transpose_(dim0, dim1) -> Tensor + + In-place version of :meth:`~Tensor.transpose` + """ + ... + def triangular_solve(self, A: Tensor, upper: _bool = True, transpose: _bool = False, unitriangular: _bool = False) -> torch.return_types.triangular_solve: + r""" + triangular_solve(A, upper=True, transpose=False, unitriangular=False) -> (Tensor, Tensor) + + See :func:`torch.triangular_solve` + """ + ... + def tril(self, diagonal: _int = 0) -> Tensor: + r""" + tril(diagonal=0) -> Tensor + + See :func:`torch.tril` + """ + ... + def tril_(self, diagonal: _int = 0) -> Tensor: + r""" + tril_(diagonal=0) -> Tensor + + In-place version of :meth:`~Tensor.tril` + """ + ... + def triu(self, diagonal: _int = 0) -> Tensor: + r""" + triu(diagonal=0) -> Tensor + + See :func:`torch.triu` + """ + ... + def triu_(self, diagonal: _int = 0) -> Tensor: + r""" + triu_(diagonal=0) -> Tensor + + In-place version of :meth:`~Tensor.triu` + """ + ... + def true_divide(self, other: Union[Tensor, Number, torch.SymInt, torch.SymFloat], *, out: Optional[Tensor] = None) -> Tensor: + r""" + true_divide(value) -> Tensor + + See :func:`torch.true_divide` + """ + ... + def true_divide_(self, other: Union[Tensor, Number, torch.SymInt, torch.SymFloat]) -> Tensor: + r""" + true_divide_(value) -> Tensor + + In-place version of :meth:`~Tensor.true_divide_` + """ + ... + def trunc(self) -> Tensor: + r""" + trunc() -> Tensor + + See :func:`torch.trunc` + """ + ... + def trunc_(self) -> Tensor: + r""" + trunc_() -> Tensor + + In-place version of :meth:`~Tensor.trunc` + """ + ... + @overload + def type(self, dtype: None = None, non_blocking: _bool = False) -> str: + r""" + type(dtype=None, non_blocking=False, **kwargs) -> str or Tensor + Returns the type if `dtype` is not provided, else casts this object to + the specified type. + + If this is already of the correct type, no copy is performed and the + original object is returned. + + Args: + dtype (dtype or string): The desired type + non_blocking (bool): If ``True``, and the source is in pinned memory + and destination is on the GPU or vice versa, the copy is performed + asynchronously with respect to the host. Otherwise, the argument + has no effect. + **kwargs: For compatibility, may contain the key ``async`` in place of + the ``non_blocking`` argument. The ``async`` arg is deprecated. + """ + ... + @overload + def type(self, dtype: Union[str, _dtype], non_blocking: _bool = False) -> Tensor: + r""" + type(dtype=None, non_blocking=False, **kwargs) -> str or Tensor + Returns the type if `dtype` is not provided, else casts this object to + the specified type. + + If this is already of the correct type, no copy is performed and the + original object is returned. 
+ + Args: + dtype (dtype or string): The desired type + non_blocking (bool): If ``True``, and the source is in pinned memory + and destination is on the GPU or vice versa, the copy is performed + asynchronously with respect to the host. Otherwise, the argument + has no effect. + **kwargs: For compatibility, may contain the key ``async`` in place of + the ``non_blocking`` argument. The ``async`` arg is deprecated. + """ + ... + def type_as(self, other: Tensor) -> Tensor: + r""" + type_as(tensor) -> Tensor + + Returns this tensor cast to the type of the given tensor. + + This is a no-op if the tensor is already of the correct type. This is + equivalent to ``self.type(tensor.type())`` + + Args: + tensor (Tensor): the tensor which has the desired type + """ + ... + @overload + def unbind(self, dim: _int = 0) -> Tuple[Tensor, ...]: + r""" + unbind(dim=0) -> seq + + See :func:`torch.unbind` + """ + ... + @overload + def unbind(self, dim: Union[str, ellipsis, None]) -> Tuple[Tensor, ...]: + r""" + unbind(dim=0) -> seq + + See :func:`torch.unbind` + """ + ... + @overload + def unflatten(self, dim: Union[str, ellipsis, None], sizes: Sequence[Union[_int, SymInt]], names: Sequence[Union[str, ellipsis, None]]) -> Tensor: ... + @overload + def unflatten(self, dim: _int, sizes: Sequence[Union[_int, SymInt]]) -> Tensor: ... + def unfold(self, dimension: _int, size: _int, step: _int) -> Tensor: + r""" + unfold(dimension, size, step) -> Tensor + + Returns a view of the original tensor which contains all slices of size :attr:`size` from + :attr:`self` tensor in the dimension :attr:`dimension`. + + Step between two slices is given by :attr:`step`. + + If `sizedim` is the size of dimension :attr:`dimension` for :attr:`self`, the size of + dimension :attr:`dimension` in the returned tensor will be + `(sizedim - size) / step + 1`. + + An additional dimension of size :attr:`size` is appended in the returned tensor. + + Args: + dimension (int): dimension in which unfolding happens + size (int): the size of each slice that is unfolded + step (int): the step between each slice + + Example:: + + >>> x = torch.arange(1., 8) + >>> x + tensor([ 1., 2., 3., 4., 5., 6., 7.]) + >>> x.unfold(0, 2, 1) + tensor([[ 1., 2.], + [ 2., 3.], + [ 3., 4.], + [ 4., 5.], + [ 5., 6.], + [ 6., 7.]]) + >>> x.unfold(0, 2, 2) + tensor([[ 1., 2.], + [ 3., 4.], + [ 5., 6.]]) + """ + ... + def uniform_(self, from_: _float = 0, to: _float = 1, *, generator: Optional[Generator] = None) -> Tensor: + r""" + uniform_(from=0, to=1, *, generator=None) -> Tensor + + Fills :attr:`self` tensor with numbers sampled from the continuous uniform + distribution: + + .. math:: + f(x) = \dfrac{1}{\text{to} - \text{from}} + """ + ... + def unsafe_chunk(self, chunks: _int, dim: _int = 0) -> Tuple[Tensor, ...]: + r""" + unsafe_chunk(chunks, dim=0) -> List of Tensors + + See :func:`torch.unsafe_chunk` + """ + ... + def unsafe_split(self, split_size: Union[_int, SymInt], dim: _int = 0) -> Tuple[Tensor, ...]: + r""" + unsafe_split(split_size, dim=0) -> List of Tensors + + See :func:`torch.unsafe_split` + """ + ... + def unsafe_split_with_sizes(self, split_sizes: Sequence[Union[_int, SymInt]], dim: _int = 0) -> Tuple[Tensor, ...]: ... + def unsqueeze(self, dim: _int) -> Tensor: + r""" + unsqueeze(dim) -> Tensor + + See :func:`torch.unsqueeze` + """ + ... + def unsqueeze_(self, dim: _int) -> Tensor: + r""" + unsqueeze_(dim) -> Tensor + + In-place version of :meth:`~Tensor.unsqueeze` + """ + ... 
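+    # Quick shape sketch for unsqueeze above (its docstring only points at
+    # torch.unsqueeze):
+    #
+    #   >>> x = torch.tensor([1, 2, 3])
+    #   >>> x.unsqueeze(0).shape
+    #   torch.Size([1, 3])
+    #   >>> x.unsqueeze(-1).shape
+    #   torch.Size([3, 1])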
+ def values(self) -> Tensor: + r""" + values() -> Tensor + + Return the values tensor of a :ref:`sparse COO tensor `. + + .. warning:: + Throws an error if :attr:`self` is not a sparse COO tensor. + + See also :meth:`Tensor.indices`. + + .. note:: + This method can only be called on a coalesced sparse tensor. See + :meth:`Tensor.coalesce` for details. + """ + ... + @overload + def var(self, dim: Optional[Union[_int, _size]], unbiased: _bool = True, keepdim: _bool = False) -> Tensor: + r""" + var(dim=None, *, correction=1, keepdim=False) -> Tensor + + See :func:`torch.var` + """ + ... + @overload + def var(self, dim: Optional[Union[_int, _size]] = None, *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False) -> Tensor: + r""" + var(dim=None, *, correction=1, keepdim=False) -> Tensor + + See :func:`torch.var` + """ + ... + @overload + def var(self, unbiased: _bool = True) -> Tensor: + r""" + var(dim=None, *, correction=1, keepdim=False) -> Tensor + + See :func:`torch.var` + """ + ... + @overload + def var(self, dim: Sequence[Union[str, ellipsis, None]], unbiased: _bool = True, keepdim: _bool = False) -> Tensor: + r""" + var(dim=None, *, correction=1, keepdim=False) -> Tensor + + See :func:`torch.var` + """ + ... + @overload + def var(self, dim: Sequence[Union[str, ellipsis, None]], *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False) -> Tensor: + r""" + var(dim=None, *, correction=1, keepdim=False) -> Tensor + + See :func:`torch.var` + """ + ... + def vdot(self, other: Tensor) -> Tensor: + r""" + vdot(other) -> Tensor + + See :func:`torch.vdot` + """ + ... + @overload + def view(self, dtype: _dtype) -> Tensor: + r""" + view(*shape) -> Tensor + + Returns a new tensor with the same data as the :attr:`self` tensor but of a + different :attr:`shape`. + + The returned tensor shares the same data and must have the same number + of elements, but may have a different size. For a tensor to be viewed, the new + view size must be compatible with its original size and stride, i.e., each new + view dimension must either be a subspace of an original dimension, or only span + across original dimensions :math:`d, d+1, \dots, d+k` that satisfy the following + contiguity-like condition that :math:`\forall i = d, \dots, d+k-1`, + + .. math:: + + \text{stride}[i] = \text{stride}[i+1] \times \text{size}[i+1] + + Otherwise, it will not be possible to view :attr:`self` tensor as :attr:`shape` + without copying it (e.g., via :meth:`contiguous`). When it is unclear whether a + :meth:`view` can be performed, it is advisable to use :meth:`reshape`, which + returns a view if the shapes are compatible, and copies (equivalent to calling + :meth:`contiguous`) otherwise. + + Args: + shape (torch.Size or int...): the desired size + + Example:: + + >>> x = torch.randn(4, 4) + >>> x.size() + torch.Size([4, 4]) + >>> y = x.view(16) + >>> y.size() + torch.Size([16]) + >>> z = x.view(-1, 8) # the size -1 is inferred from other dimensions + >>> z.size() + torch.Size([2, 8]) + + >>> a = torch.randn(1, 2, 3, 4) + >>> a.size() + torch.Size([1, 2, 3, 4]) + >>> b = a.transpose(1, 2) # Swaps 2nd and 3rd dimension + >>> b.size() + torch.Size([1, 3, 2, 4]) + >>> c = a.view(1, 3, 2, 4) # Does not change tensor layout in memory + >>> c.size() + torch.Size([1, 3, 2, 4]) + >>> torch.equal(b, c) + False + + + .. method:: view(dtype) -> Tensor + :noindex: + + Returns a new tensor with the same data as the :attr:`self` tensor but of a + different :attr:`dtype`. 
+ + If the element size of :attr:`dtype` is different than that of ``self.dtype``, + then the size of the last dimension of the output will be scaled + proportionally. For instance, if :attr:`dtype` element size is twice that of + ``self.dtype``, then each pair of elements in the last dimension of + :attr:`self` will be combined, and the size of the last dimension of the output + will be half that of :attr:`self`. If :attr:`dtype` element size is half that + of ``self.dtype``, then each element in the last dimension of :attr:`self` will + be split in two, and the size of the last dimension of the output will be + double that of :attr:`self`. For this to be possible, the following conditions + must be true: + + * ``self.dim()`` must be greater than 0. + * ``self.stride(-1)`` must be 1. + + Additionally, if the element size of :attr:`dtype` is greater than that of + ``self.dtype``, the following conditions must be true as well: + + * ``self.size(-1)`` must be divisible by the ratio between the element + sizes of the dtypes. + * ``self.storage_offset()`` must be divisible by the ratio between the + element sizes of the dtypes. + * The strides of all dimensions, except the last dimension, must be + divisible by the ratio between the element sizes of the dtypes. + + If any of the above conditions are not met, an error is thrown. + + .. warning:: + + This overload is not supported by TorchScript, and using it in a Torchscript + program will cause undefined behavior. + + + Args: + dtype (:class:`torch.dtype`): the desired dtype + + Example:: + + >>> x = torch.randn(4, 4) + >>> x + tensor([[ 0.9482, -0.0310, 1.4999, -0.5316], + [-0.1520, 0.7472, 0.5617, -0.8649], + [-2.4724, -0.0334, -0.2976, -0.8499], + [-0.2109, 1.9913, -0.9607, -0.6123]]) + >>> x.dtype + torch.float32 + + >>> y = x.view(torch.int32) + >>> y + tensor([[ 1064483442, -1124191867, 1069546515, -1089989247], + [-1105482831, 1061112040, 1057999968, -1084397505], + [-1071760287, -1123489973, -1097310419, -1084649136], + [-1101533110, 1073668768, -1082790149, -1088634448]], + dtype=torch.int32) + >>> y[0, 0] = 1000000000 + >>> x + tensor([[ 0.0047, -0.0310, 1.4999, -0.5316], + [-0.1520, 0.7472, 0.5617, -0.8649], + [-2.4724, -0.0334, -0.2976, -0.8499], + [-0.2109, 1.9913, -0.9607, -0.6123]]) + + >>> x.view(torch.cfloat) + tensor([[ 0.0047-0.0310j, 1.4999-0.5316j], + [-0.1520+0.7472j, 0.5617-0.8649j], + [-2.4724-0.0334j, -0.2976-0.8499j], + [-0.2109+1.9913j, -0.9607-0.6123j]]) + >>> x.view(torch.cfloat).size() + torch.Size([4, 2]) + + >>> x.view(torch.uint8) + tensor([[ 0, 202, 154, 59, 182, 243, 253, 188, 185, 252, 191, 63, 240, 22, + 8, 191], + [227, 165, 27, 190, 128, 72, 63, 63, 146, 203, 15, 63, 22, 106, + 93, 191], + [205, 59, 30, 192, 112, 206, 8, 189, 7, 95, 152, 190, 12, 147, + 89, 191], + [ 43, 246, 87, 190, 235, 226, 254, 63, 111, 240, 117, 191, 177, 191, + 28, 191]], dtype=torch.uint8) + >>> x.view(torch.uint8).size() + torch.Size([4, 16]) + """ + ... + @overload + def view(self, size: Sequence[Union[_int, SymInt]]) -> Tensor: + r""" + view(*shape) -> Tensor + + Returns a new tensor with the same data as the :attr:`self` tensor but of a + different :attr:`shape`. + + The returned tensor shares the same data and must have the same number + of elements, but may have a different size. 
For a tensor to be viewed, the new + view size must be compatible with its original size and stride, i.e., each new + view dimension must either be a subspace of an original dimension, or only span + across original dimensions :math:`d, d+1, \dots, d+k` that satisfy the following + contiguity-like condition that :math:`\forall i = d, \dots, d+k-1`, + + .. math:: + + \text{stride}[i] = \text{stride}[i+1] \times \text{size}[i+1] + + Otherwise, it will not be possible to view :attr:`self` tensor as :attr:`shape` + without copying it (e.g., via :meth:`contiguous`). When it is unclear whether a + :meth:`view` can be performed, it is advisable to use :meth:`reshape`, which + returns a view if the shapes are compatible, and copies (equivalent to calling + :meth:`contiguous`) otherwise. + + Args: + shape (torch.Size or int...): the desired size + + Example:: + + >>> x = torch.randn(4, 4) + >>> x.size() + torch.Size([4, 4]) + >>> y = x.view(16) + >>> y.size() + torch.Size([16]) + >>> z = x.view(-1, 8) # the size -1 is inferred from other dimensions + >>> z.size() + torch.Size([2, 8]) + + >>> a = torch.randn(1, 2, 3, 4) + >>> a.size() + torch.Size([1, 2, 3, 4]) + >>> b = a.transpose(1, 2) # Swaps 2nd and 3rd dimension + >>> b.size() + torch.Size([1, 3, 2, 4]) + >>> c = a.view(1, 3, 2, 4) # Does not change tensor layout in memory + >>> c.size() + torch.Size([1, 3, 2, 4]) + >>> torch.equal(b, c) + False + + + .. method:: view(dtype) -> Tensor + :noindex: + + Returns a new tensor with the same data as the :attr:`self` tensor but of a + different :attr:`dtype`. + + If the element size of :attr:`dtype` is different than that of ``self.dtype``, + then the size of the last dimension of the output will be scaled + proportionally. For instance, if :attr:`dtype` element size is twice that of + ``self.dtype``, then each pair of elements in the last dimension of + :attr:`self` will be combined, and the size of the last dimension of the output + will be half that of :attr:`self`. If :attr:`dtype` element size is half that + of ``self.dtype``, then each element in the last dimension of :attr:`self` will + be split in two, and the size of the last dimension of the output will be + double that of :attr:`self`. For this to be possible, the following conditions + must be true: + + * ``self.dim()`` must be greater than 0. + * ``self.stride(-1)`` must be 1. + + Additionally, if the element size of :attr:`dtype` is greater than that of + ``self.dtype``, the following conditions must be true as well: + + * ``self.size(-1)`` must be divisible by the ratio between the element + sizes of the dtypes. + * ``self.storage_offset()`` must be divisible by the ratio between the + element sizes of the dtypes. + * The strides of all dimensions, except the last dimension, must be + divisible by the ratio between the element sizes of the dtypes. + + If any of the above conditions are not met, an error is thrown. + + .. warning:: + + This overload is not supported by TorchScript, and using it in a Torchscript + program will cause undefined behavior. 
+ + + Args: + dtype (:class:`torch.dtype`): the desired dtype + + Example:: + + >>> x = torch.randn(4, 4) + >>> x + tensor([[ 0.9482, -0.0310, 1.4999, -0.5316], + [-0.1520, 0.7472, 0.5617, -0.8649], + [-2.4724, -0.0334, -0.2976, -0.8499], + [-0.2109, 1.9913, -0.9607, -0.6123]]) + >>> x.dtype + torch.float32 + + >>> y = x.view(torch.int32) + >>> y + tensor([[ 1064483442, -1124191867, 1069546515, -1089989247], + [-1105482831, 1061112040, 1057999968, -1084397505], + [-1071760287, -1123489973, -1097310419, -1084649136], + [-1101533110, 1073668768, -1082790149, -1088634448]], + dtype=torch.int32) + >>> y[0, 0] = 1000000000 + >>> x + tensor([[ 0.0047, -0.0310, 1.4999, -0.5316], + [-0.1520, 0.7472, 0.5617, -0.8649], + [-2.4724, -0.0334, -0.2976, -0.8499], + [-0.2109, 1.9913, -0.9607, -0.6123]]) + + >>> x.view(torch.cfloat) + tensor([[ 0.0047-0.0310j, 1.4999-0.5316j], + [-0.1520+0.7472j, 0.5617-0.8649j], + [-2.4724-0.0334j, -0.2976-0.8499j], + [-0.2109+1.9913j, -0.9607-0.6123j]]) + >>> x.view(torch.cfloat).size() + torch.Size([4, 2]) + + >>> x.view(torch.uint8) + tensor([[ 0, 202, 154, 59, 182, 243, 253, 188, 185, 252, 191, 63, 240, 22, + 8, 191], + [227, 165, 27, 190, 128, 72, 63, 63, 146, 203, 15, 63, 22, 106, + 93, 191], + [205, 59, 30, 192, 112, 206, 8, 189, 7, 95, 152, 190, 12, 147, + 89, 191], + [ 43, 246, 87, 190, 235, 226, 254, 63, 111, 240, 117, 191, 177, 191, + 28, 191]], dtype=torch.uint8) + >>> x.view(torch.uint8).size() + torch.Size([4, 16]) + """ + ... + @overload + def view(self, *size: _int) -> Tensor: + r""" + view(*shape) -> Tensor + + Returns a new tensor with the same data as the :attr:`self` tensor but of a + different :attr:`shape`. + + The returned tensor shares the same data and must have the same number + of elements, but may have a different size. For a tensor to be viewed, the new + view size must be compatible with its original size and stride, i.e., each new + view dimension must either be a subspace of an original dimension, or only span + across original dimensions :math:`d, d+1, \dots, d+k` that satisfy the following + contiguity-like condition that :math:`\forall i = d, \dots, d+k-1`, + + .. math:: + + \text{stride}[i] = \text{stride}[i+1] \times \text{size}[i+1] + + Otherwise, it will not be possible to view :attr:`self` tensor as :attr:`shape` + without copying it (e.g., via :meth:`contiguous`). When it is unclear whether a + :meth:`view` can be performed, it is advisable to use :meth:`reshape`, which + returns a view if the shapes are compatible, and copies (equivalent to calling + :meth:`contiguous`) otherwise. + + Args: + shape (torch.Size or int...): the desired size + + Example:: + + >>> x = torch.randn(4, 4) + >>> x.size() + torch.Size([4, 4]) + >>> y = x.view(16) + >>> y.size() + torch.Size([16]) + >>> z = x.view(-1, 8) # the size -1 is inferred from other dimensions + >>> z.size() + torch.Size([2, 8]) + + >>> a = torch.randn(1, 2, 3, 4) + >>> a.size() + torch.Size([1, 2, 3, 4]) + >>> b = a.transpose(1, 2) # Swaps 2nd and 3rd dimension + >>> b.size() + torch.Size([1, 3, 2, 4]) + >>> c = a.view(1, 3, 2, 4) # Does not change tensor layout in memory + >>> c.size() + torch.Size([1, 3, 2, 4]) + >>> torch.equal(b, c) + False + + + .. method:: view(dtype) -> Tensor + :noindex: + + Returns a new tensor with the same data as the :attr:`self` tensor but of a + different :attr:`dtype`. + + If the element size of :attr:`dtype` is different than that of ``self.dtype``, + then the size of the last dimension of the output will be scaled + proportionally. 
For instance, if :attr:`dtype` element size is twice that of + ``self.dtype``, then each pair of elements in the last dimension of + :attr:`self` will be combined, and the size of the last dimension of the output + will be half that of :attr:`self`. If :attr:`dtype` element size is half that + of ``self.dtype``, then each element in the last dimension of :attr:`self` will + be split in two, and the size of the last dimension of the output will be + double that of :attr:`self`. For this to be possible, the following conditions + must be true: + + * ``self.dim()`` must be greater than 0. + * ``self.stride(-1)`` must be 1. + + Additionally, if the element size of :attr:`dtype` is greater than that of + ``self.dtype``, the following conditions must be true as well: + + * ``self.size(-1)`` must be divisible by the ratio between the element + sizes of the dtypes. + * ``self.storage_offset()`` must be divisible by the ratio between the + element sizes of the dtypes. + * The strides of all dimensions, except the last dimension, must be + divisible by the ratio between the element sizes of the dtypes. + + If any of the above conditions are not met, an error is thrown. + + .. warning:: + + This overload is not supported by TorchScript, and using it in a Torchscript + program will cause undefined behavior. + + + Args: + dtype (:class:`torch.dtype`): the desired dtype + + Example:: + + >>> x = torch.randn(4, 4) + >>> x + tensor([[ 0.9482, -0.0310, 1.4999, -0.5316], + [-0.1520, 0.7472, 0.5617, -0.8649], + [-2.4724, -0.0334, -0.2976, -0.8499], + [-0.2109, 1.9913, -0.9607, -0.6123]]) + >>> x.dtype + torch.float32 + + >>> y = x.view(torch.int32) + >>> y + tensor([[ 1064483442, -1124191867, 1069546515, -1089989247], + [-1105482831, 1061112040, 1057999968, -1084397505], + [-1071760287, -1123489973, -1097310419, -1084649136], + [-1101533110, 1073668768, -1082790149, -1088634448]], + dtype=torch.int32) + >>> y[0, 0] = 1000000000 + >>> x + tensor([[ 0.0047, -0.0310, 1.4999, -0.5316], + [-0.1520, 0.7472, 0.5617, -0.8649], + [-2.4724, -0.0334, -0.2976, -0.8499], + [-0.2109, 1.9913, -0.9607, -0.6123]]) + + >>> x.view(torch.cfloat) + tensor([[ 0.0047-0.0310j, 1.4999-0.5316j], + [-0.1520+0.7472j, 0.5617-0.8649j], + [-2.4724-0.0334j, -0.2976-0.8499j], + [-0.2109+1.9913j, -0.9607-0.6123j]]) + >>> x.view(torch.cfloat).size() + torch.Size([4, 2]) + + >>> x.view(torch.uint8) + tensor([[ 0, 202, 154, 59, 182, 243, 253, 188, 185, 252, 191, 63, 240, 22, + 8, 191], + [227, 165, 27, 190, 128, 72, 63, 63, 146, 203, 15, 63, 22, 106, + 93, 191], + [205, 59, 30, 192, 112, 206, 8, 189, 7, 95, 152, 190, 12, 147, + 89, 191], + [ 43, 246, 87, 190, 235, 226, 254, 63, 111, 240, 117, 191, 177, 191, + 28, 191]], dtype=torch.uint8) + >>> x.view(torch.uint8).size() + torch.Size([4, 16]) + """ + ... + def view_as(self, other: Tensor) -> Tensor: + r""" + view_as(other) -> Tensor + + View this tensor as the same size as :attr:`other`. + ``self.view_as(other)`` is equivalent to ``self.view(other.size())``. + + Please see :meth:`~Tensor.view` for more information about ``view``. + + Args: + other (:class:`torch.Tensor`): The result tensor has the same size + as :attr:`other`. + """ + ... + @overload + def vsplit(self, sections: _int) -> Tuple[Tensor, ...]: + r""" + vsplit(split_size_or_sections) -> List of Tensors + + See :func:`torch.vsplit` + """ + ... + @overload + def vsplit(self, indices: _size) -> Tuple[Tensor, ...]: + r""" + vsplit(split_size_or_sections) -> List of Tensors + + See :func:`torch.vsplit` + """ + ... 
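+    # Shape-only sketch for view_as and the vsplit overloads in this block; both
+    # results are deterministic for the inputs shown:
+    #
+    #   >>> a = torch.arange(6)
+    #   >>> a.view_as(torch.empty(2, 3)).shape
+    #   torch.Size([2, 3])
+    #   >>> [t.shape for t in torch.arange(16).reshape(4, 4).vsplit(2)]
+    #   [torch.Size([2, 4]), torch.Size([2, 4])]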
+ @overload + def vsplit(self, *indices: _int) -> Tuple[Tensor, ...]: + r""" + vsplit(split_size_or_sections) -> List of Tensors + + See :func:`torch.vsplit` + """ + ... + @overload + def where(self, condition: Tensor, other: Tensor) -> Tensor: + r""" + where(condition, y) -> Tensor + + ``self.where(condition, y)`` is equivalent to ``torch.where(condition, self, y)``. + See :func:`torch.where` + """ + ... + @overload + def where(self, condition: Tensor, other: Union[Number, _complex]) -> Tensor: + r""" + where(condition, y) -> Tensor + + ``self.where(condition, y)`` is equivalent to ``torch.where(condition, self, y)``. + See :func:`torch.where` + """ + ... + @overload + def xlogy(self, other: Tensor) -> Tensor: + r""" + xlogy(other) -> Tensor + + See :func:`torch.xlogy` + """ + ... + @overload + def xlogy(self, other: Union[Number, _complex]) -> Tensor: + r""" + xlogy(other) -> Tensor + + See :func:`torch.xlogy` + """ + ... + @overload + def xlogy_(self, other: Tensor) -> Tensor: + r""" + xlogy_(other) -> Tensor + + In-place version of :meth:`~Tensor.xlogy` + """ + ... + @overload + def xlogy_(self, other: Union[Number, _complex]) -> Tensor: + r""" + xlogy_(other) -> Tensor + + In-place version of :meth:`~Tensor.xlogy` + """ + ... + def zero_(self) -> Tensor: + r""" + zero_() -> Tensor + + Fills :attr:`self` tensor with zeros. + """ + ... + +_TensorBase = TensorBase + +# Defined in torch/csrc/multiprocessing/init.cpp +def _multiprocessing_init() -> None: ... + +# Defined in torch/csrc/mps/Module.cpp +def _mps_deviceSynchronize() -> None: ... +def _mps_get_default_generator() -> Generator: ... +def _mps_emptyCache() -> None: ... +def _mps_setMemoryFraction(fraction: _float) -> None: ... +def _mps_currentAllocatedMemory() -> _int: ... +def _mps_driverAllocatedMemory() -> _int: ... +def _mps_is_available() -> _bool: ... +def _mps_is_on_macos_or_newer(major: _int, minor: _int) -> _bool: ... +def _mps_profilerStartTrace(mode: str, wait_until_completed: _bool) -> None: ... +def _mps_profilerStopTrace() -> None: ... +def _mps_acquireEvent(enable_timing: _bool) -> _int: ... +def _mps_releaseEvent(event_id: _int) -> None: ... +def _mps_recordEvent(event_id: _int) -> None: ... +def _mps_waitForEvent(event_id: _int) -> None: ... +def _mps_synchronizeEvent(event_id: _int) -> None: ... +def _mps_queryEvent(event_id: _int) -> _bool: ... +def _mps_elapsedTimeOfEvents(start_event_id: _int, end_event_id: _int) -> _float: ... + + +# Defined in torch/csrc/cuda/Module.cpp +def _cuda_getCurrentStream(device: _int) -> Tuple: ... +def _cuda_getCurrentRawStream(device: _int) -> _int: ... +def _cuda_getDefaultStream(device: _int) -> Tuple: ... +def _cuda_getCurrentBlasHandle() -> _int: ... +def _cuda_clearCublasWorkspaces() -> None: ... +def _cuda_setDevice(device: _int) -> None: ... +def _cuda_exchangeDevice(device: _int) -> _int: ... +def _cuda_maybeExchangeDevice(device: _int) -> _int: ... +def _cuda_getDevice() -> _int: ... +def _cuda_getDeviceCount() -> _int: ... +def _cuda_set_sync_debug_mode(warn_level: Union[_int, str]) -> None: ... +def _cuda_get_sync_debug_mode() -> _int: ... +def _cuda_sleep(cycles: _int) -> None: ... +def _cuda_synchronize() -> None: ... +def _cuda_ipc_collect() -> None: ... +def _cuda_getArchFlags() -> Optional[str]: ... +def _cuda_init() -> None: ... +def _cuda_setStream(stream_id: _int, device_index: _int, device_type: _int) -> None: ... +def _cuda_getCompiledVersion() -> _int: ... +def _cuda_cudaHostAllocator() -> _int: ... 
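+# The _mps_*/_cuda_* names in this block are private C-extension bindings; user
+# code goes through the public wrappers in torch.backends.mps and torch.cuda.
+# For example (assuming a CUDA-enabled build), torch.cuda.synchronize() and
+# torch.cuda.empty_cache() ultimately call _cuda_synchronize() and
+# _cuda_emptyCache():
+#
+#   >>> if torch.cuda.is_available():
+#   ...     torch.cuda.synchronize()
+#   ...     torch.cuda.empty_cache()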
+def _cuda_cudaCachingAllocator_raw_alloc(size: _int, cuda_stream: _int) -> _int: ... +def _cuda_cudaCachingAllocator_raw_delete(ptr: _int) -> None: ... +def _cuda_cudaCachingAllocator_set_allocator_settings(env: str) -> None: ... +def _cuda_beginAllocateCurrentStreamToPool(device: _int, mempool_id: Tuple[_int, _int]) -> None: ... +def _cuda_endAllocateCurrentStreamToPool(device: _int, mempool_id: Tuple[_int, _int]) -> None: ... +def _cuda_releasePool(device: _int, mempool_id: Tuple[_int, _int]) -> None: ... +def _cuda_checkPoolLiveAllocations(device: _int, mempool_id: Tuple[_int, _int], expected_live_allocations: Set) -> _bool: ... +def _cuda_setCheckpointPoolState(device: _int, state: _cuda_CUDAAllocator_AllocatorState, stale_storages: List[_int], storages_to_add_deleters_to: List[_int]) -> None: ... +def _cuda_setMemoryFraction(fraction: _float, device: _int) -> None: ... +def _cuda_emptyCache() -> None: ... +def _cuda_memoryStats(device: _int) -> Dict[str, Any]: ... +def _cuda_resetAccumulatedMemoryStats(device: _int) -> None: ... +def _cuda_resetPeakMemoryStats(device: _int) -> None: ... +def _cuda_memorySnapshot() -> Dict[str, Any]: ... +def _cuda_record_memory_history_legacy( + enabled: _bool, + record_context: _bool, + record_context_cpp: _bool, + alloc_trace_max_entries: _int, + alloc_trace_record_context: _bool, +) -> None: ... +def _cuda_record_memory_history( + enabled: Optional[str], + context: Optional[str], + stacks: str, + max_entries +) -> None: ... +def _cuda_isHistoryEnabled() -> _bool: ... + +def _cuda_getAllocatorBackend() -> str: ... +class _cuda_CUDAAllocator_AllocatorState: + pass +def _cuda_getCheckpointState(device: _int, mempool: Tuple[_int, _int]) -> _cuda_CUDAAllocator_AllocatorState: ... +def _set_cached_tensors_enabled(enabled: _bool) -> None: ... +def _add_cached_tensor(t: Tensor) -> None: ... +def _remove_cached_tensor(t: Tensor) -> None: ... +def _construct_CUDA_Tensor_From_Storage_And_Metadata(metadata: dict, storage: Storage) -> Tensor: ... +def _storage_Use_Count(storage_ptr: _int) -> _int: ... +def _set_storage_access_error_msg(t: Tensor, s: str) -> None: ... +def _free_And_Remove_DeleterFn(storage_ptr: _int) -> None: ... +def _has_Standard_Deleter(storage_ptr: _int) -> _bool: ... + +class _cuda_CUDAAllocator: ... + +def _cuda_customAllocator(alloc_fn: _int, free_fn: _int) -> _cuda_CUDAAllocator: ... +def _cuda_changeCurrentAllocator(allocator: _cuda_CUDAAllocator) -> None: ... +def _cuda_getAllocator() -> _cuda_CUDAAllocator: ... +def _cuda_lock_mutex() -> None: ... +def _cuda_unlock_mutex() -> None: ... +def _cuda_canDeviceAccessPeer(device: _int, peer_device: _int) -> _bool: ... +def _cuda_jiterator_compile_and_launch_kernel( + code_string: str, + kernel_name: str, + return_by_ref: _bool, + num_outputs: _int, + tensors: Tuple, + kwargs: Dict[str, Union[_int, _float, _bool]], +) -> Tensor: ... +def _cuda_get_cudnn_benchmark_limit() -> _int: ... +def _cuda_set_cudnn_benchmark_limit(arg: _int) -> None: ... +def _cuda_get_conv_benchmark_empty_cache() -> _bool: ... +def _cudnn_set_conv_benchmark_empty_cache(enable: _bool) -> None: ... +def _nccl_version() -> _int: ... +def _nccl_version_suffix() -> bytes : ... +def _nccl_unique_id() -> bytes: ... +def _nccl_init_rank(nranks: _int, comm_id: bytes, rank: _int) -> object: ... +def _nccl_reduce( + input: Sequence[Tensor], + output: Tensor, + root: _int, + op: _int, + streams: Optional[Sequence[_CudaStreamBase]], + comms: Optional[Sequence[object]], +) -> None: ... 
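+# The _nccl_* bindings here back the collective helpers in torch.cuda.nccl
+# (all_reduce, broadcast, all_gather, ...); they are only usable on builds
+# compiled with NCCL support, e.g.:
+#
+#   >>> torch.cuda.nccl.version()   # assumes an NCCL-enabled build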
+def _nccl_all_reduce( + input: Sequence[Tensor], + output: Sequence[Tensor], + op: _int, + streams: Optional[Sequence[_CudaStreamBase]], + comms: Optional[Sequence[object]], +) -> None: ... +def _nccl_broadcast( + input: Sequence[Tensor], + root: _int, + streams: Optional[Sequence[_CudaStreamBase]], + comms: Optional[Sequence[object]], +) -> None: ... +def _nccl_all_gather( + input: Sequence[Tensor], + output: Sequence[Tensor], + streams: Optional[Sequence[_CudaStreamBase]], + comms: Optional[Sequence[object]], +) -> None: ... +def _nccl_reduce_scatter( + input: Sequence[Tensor], + output: Sequence[Tensor], + op: _int, + streams: Optional[Sequence[_CudaStreamBase]], + comms: Optional[Sequence[object]], +) -> None: ... +def _rocm_is_backward_pass() -> _bool: ... + +class _CudaDeviceProperties: + name: str + major: _int + minor: _int + multi_processor_count: _int + total_memory: _int + is_integrated: _int + is_multi_gpu_board: _int + max_threads_per_multi_processor: _int + gcnArchName: str + +# Functions related to SDPA +class _SDPAParams: + query: Tensor + key: Tensor + value: Tensor + attn_mask: Optional[Tensor] + dropout: _float + is_causal: _bool + def __init__( + self, + query: Tensor, + key: Tensor, + value: Tensor, + attn_mask: Optional[Tensor], + dropout: _float, + is_causal: _bool) -> None: ... + +class _SDPBackend(Enum): + ERROR = -1 + MATH = 0 + FLASH_ATTENTION = 1 + EFFICIENT_ATTENTION = 2 + CUDNN_ATTENTION = 3 + +def _can_use_flash_attention(params: _SDPAParams, debug: _bool) -> _bool: ... +def _can_use_mem_efficient_attention(params: _SDPAParams, debug: _bool) -> _bool: ... + +# Defined in torch/csrc/cuda/python_comm.cpp +def _broadcast(tensor: Tensor, devices: List[_int]) -> List[Tensor]: ... +def _broadcast_out(tensor: Tensor, out_tensors: List[Tensor]) -> List[Tensor]: ... +def _broadcast_coalesced( + tensors: List[Tensor], + devices: List[_int], + buffer_size: _int, +) -> List[List[Tensor]]: ... +def _scatter( + tensor: Tensor, + devices: List[_int], + chunk_sizes: Optional[List[_int]], + dim: _int, + streams: Optional[List[Stream]], +) -> List[Tensor]: ... +def _scatter_out( + tensor: Tensor, + out_tensors: List[Tensor], + dim: _int, + streams: Optional[List[Stream]], +) -> List[Tensor]: ... +def _gather( + tensors: List[Tensor], + dim: _int, + destination_index: Optional[_int], +) -> Tensor: ... +def _gather_out(tensors: List[Tensor], out_tensor: Tensor, dim: _int) -> Tensor: ... + +# Defined in torch/csrc/cuda/Stream.cpp +class _CudaStreamBase(Stream): + stream_id: _int + device_index: _int + device_type: _int + + device: _device + cuda_stream: _int + priority: _int + + def __new__( + self, + priority: _int = 0, + stream_id: _int = 0, + device_index: _int = 0, + stream_ptr: _int = 0, + ) -> _CudaStreamBase: ... + def query(self) -> _bool: ... + def synchronize(self) -> None: ... + def priority_range(self) -> Tuple[_int, _int]: ... + +# Defined in torch/csrc/cuda/Event.cpp +class _CudaEventBase: + device: _device + cuda_event: _int + + def __new__( + cls, + enable_timing: _bool = False, + blocking: _bool = False, + interprocess: _bool = False, + ) -> _CudaEventBase: ... + @classmethod + def from_ipc_handle(cls, device: _device, ipc_handle: bytes) -> _CudaEventBase: ... + def record(self, stream: _CudaStreamBase) -> None: ... + def wait(self, stream: _CudaStreamBase) -> None: ... + def query(self) -> _bool: ... + def elapsed_time(self, other: _CudaEventBase) -> _float: ... + def synchronize(self) -> None: ... + def ipc_handle(self) -> bytes: ... 
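For orientation, _CudaStreamBase and _CudaEventBase above are the C++ bindings behind the public torch.cuda.Stream and torch.cuda.Event classes. A hedged sketch of that public surface (assumes a CUDA-enabled build with at least one visible device; not part of the vendored stubs):

import torch

# Illustrative only: time a matmul on a side stream via the public wrappers
# over _CudaStreamBase/_CudaEventBase.
if torch.cuda.is_available():
    side = torch.cuda.Stream()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    a = torch.randn(1024, 1024, device="cuda")
    b = torch.randn(1024, 1024, device="cuda")
    side.wait_stream(torch.cuda.current_stream())  # order after the allocations above
    with torch.cuda.stream(side):
        start.record()      # records on the current (side) stream
        c = a @ b
        end.record()
    end.synchronize()       # block until the second event has completed
    print(start.elapsed_time(end), "ms")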
+ +# Defined in torch/csrc/cuda/Graph.cpp +class _CUDAGraph: + def capture_begin(self, pool: Optional[Tuple[_int, _int]] = ..., capture_error_mode: str = "global") -> None: ... + def capture_end(self) -> None: ... + def replay(self) -> None: ... + def reset(self) -> None: ... + def pool(self) -> Tuple[_int, _int]: ... + def enable_debug_mode(self) -> None: ... + def debug_dump(self, debug_path: str) -> None: ... + +def _cuda_isCurrentStreamCapturing() -> _bool: ... +def _graph_pool_handle() -> Tuple[_int, _int]: ... + +# Defined in torch/csrc/xpu/Module.cpp +def _xpu_setDevice(device: _int) -> None: ... +def _xpu_exchangeDevice(device: _int) -> _int: ... +def _xpu_maybeExchangeDevice(device: _int) -> _int: ... +def _xpu_getDevice() -> _int: ... +def _xpu_getDeviceCount() -> _int: ... +def _xpu_init() -> None: ... +def _xpu_setStream(stream_id: _int, device_index: _int, device_type: _int) -> None: ... +def _xpu_getCurrentStream(device: _int) -> Tuple: ... +def _xpu_getCurrentRawStream(device: _int) -> _int: ... +def _xpu_synchronize(device: _int) -> None: ... +def _xpu_emptyCache() -> None: ... + +class _XpuDeviceProperties: + name: str + platform_name: str + total_memory: _int + max_compute_units: _int + gpu_eu_count: _int + gpu_subslice_count: _int + max_work_group_size: _int + max_num_sub_groups: _int + sub_group_sizes: List[_int] + type: str + +# Defined in torch/csrc/xpu/Stream.cpp +class _XpuStreamBase(Stream): + stream_id: _int + device_index: _int + device_type: _int + + device: _device + sycl_queue: _int + priority: _int + + def __new__( + cls, + priority: _int = 0, + stream_id: _int = 0, + device_index: _int = 0, + device_type: _int = 0, + ) -> _XpuStreamBase: ... + def query(self) -> _bool: ... + def synchronize(self) -> None: ... + @staticmethod + def priority_range() -> Tuple: ... + +# Defined in torch/csrc/xpu/Event.cpp +class _XpuEventBase: + device: _device + sycl_event: _int + + def __new__(cls, enable_timing: _bool = False) -> _XpuEventBase: ... + def record(self, stream: _XpuEventBase) -> None: ... + def wait(self, stream: _XpuStreamBase) -> None: ... + def query(self) -> _bool: ... + def elapsed_time(self, other: _XpuEventBase) -> _float: ... + def synchronize(self) -> None: ... + +# Defined in torch/csrc/DataLoader.cpp +def _set_worker_signal_handlers( + *arg: Any, +) -> None: ... # THPModule_setWorkerSignalHandlers +def _set_worker_pids( + key: _int, + child_pids: Tuple[_int, ...], +) -> None: ... # THPModule_setWorkerPIDs +def _remove_worker_pids(loader_id: _int) -> None: ... # THPModule_removeWorkerPIDs +def _error_if_any_worker_fails() -> None: ... # THPModule_errorIfAnyWorkerFails + +# Defined in torch/csrc/jit/python/python_tracer.cpp +class TracingState: + def push_scope(self, scope_name: str) -> None: ... + def pop_scope(self) -> None: ... + def current_scope(self) -> str: ... + def set_graph(self, graph: Graph) -> None: ... + def graph(self) -> Graph: ... + +def _create_graph_by_tracing( + func: Callable[..., Any], + inputs: Any, + var_name_lookup_fn: Callable[[Tensor], str], + strict: Any, + force_outplace: Any, + self: Any = None, + argument_names: List[str] = [], +) -> Tuple[Graph, Stack]: ... +def _tracer_warn_use_python(): ... +def _get_tracing_state() -> TracingState: ... + +# Defined in torch/csrc/jit/python/python_ir.cpp +# Not actually defined in python_ir.cpp, not sure where they are. +class IValue: ... + +Stack = List[IValue] + +class JitType: + annotation_str: str + def isSubtypeOf(self, other: JitType) -> _bool: ... 
+ def with_dtype(self, dtype: _dtype) -> JitType: ... + def with_sizes(self, sizes: List[Optional[_int]]) -> JitType: ... + def kind(self) -> str: ... + def scalarType(self) -> Optional[str]: ... + def getElementType(self) -> JitType: ... + def dtype(self) -> Optional[_dtype]: ... + +class InferredType: + def __init__(self, arg: Union[JitType, str]): ... + def type(self) -> JitType: ... + def success(self) -> _bool: ... + def reason(self) -> str: ... + +R = TypeVar("R", bound=JitType) + +class AnyType(JitType): + @staticmethod + def get() -> AnyType: ... + +class NoneType(JitType): + @staticmethod + def get() -> NoneType: ... + +class BoolType(JitType): + @staticmethod + def get() -> BoolType: ... + +class FloatType(JitType): + @staticmethod + def get() -> FloatType: ... + +class ComplexType(JitType): + @staticmethod + def get() -> ComplexType: ... + +class IntType(JitType): + @staticmethod + def get() -> IntType: ... + +class SymIntType(JitType): + @staticmethod + def get() -> SymIntType: ... + +class SymBoolType(JitType): + @staticmethod + def get() -> SymBoolType: ... + +class NumberType(JitType): + @staticmethod + def get() -> NumberType: ... + +class StringType(JitType): + @staticmethod + def get() -> StringType: ... + +class DeviceObjType(JitType): + @staticmethod + def get() -> DeviceObjType: ... + +class _GeneratorType(JitType): + @staticmethod + def get() -> _GeneratorType: ... + +class StreamObjType(JitType): + @staticmethod + def get() -> StreamObjType: ... + +class ListType(JitType): + def __init__(self, a: JitType) -> None: ... + def getElementType(self) -> JitType: ... + @staticmethod + def ofInts() -> ListType: ... + @staticmethod + def ofTensors() -> ListType: ... + @staticmethod + def ofFloats() -> ListType: ... + @staticmethod + def ofComplexDoubles() -> ListType: ... + @staticmethod + def ofBools() -> ListType: ... + @staticmethod + def ofStrings() -> ListType: ... + +class DictType(JitType): + def __init__(self, key: JitType, value: JitType) -> None: ... + def getKeyType(self) -> JitType: ... + def getValueType(self) -> JitType: ... + +class TupleType(JitType): + def __init__(self, a: List[Optional[JitType]]) -> None: ... + def elements(self) -> List[JitType]: ... + +class UnionType(JitType): + def __init__(self, a: List[JitType]) -> None: ... + +class ClassType(JitType): + def __init__(self, qualified_name: str) -> None: ... + +class InterfaceType(JitType): + def __init__(self, qualified_name: str) -> None: ... + def getMethod(self, name: str) -> Optional[FunctionSchema]: ... + def getMethodNames(self) -> List[str]: ... + +class OptionalType(JitType, Generic[R]): + def __init__(self, a: JitType) -> None: ... + def getElementType(self) -> JitType: ... + @staticmethod + def ofTensor() -> OptionalType: ... + +class FutureType(JitType): + def __init__(self, a: JitType) -> None: ... + def getElementType(self) -> JitType: ... + +class AwaitType(JitType): + def __init__(self, a: JitType) -> None: ... + def getElementType(self) -> JitType: ... + +class RRefType(JitType): + def __init__(self, a: JitType) -> None: ... + +class EnumType(JitType): + def __init__( + self, + qualified_name: str, + value_type: JitType, + enum_names_values: List[Any], + ) -> None: ... + +class TensorType(JitType): + @classmethod + def get(cls) -> TensorType: ... + @classmethod + def getInferred(cls) -> TensorType: ... + def with_sizes(self, other: Optional[List[Optional[_int]]]) -> TensorType: ... + def sizes(self) -> Optional[List[_int]]: ... 
+ def varyingSizes(self) -> Optional[List[Optional[_int]]]: ... + def strides(self) -> Optional[List[_int]]: ... + def device(self) -> Optional[_device]: ... + def dim(self) -> _int: ... + def dtype(self) -> Optional[_dtype]: ... + @staticmethod + def create_from_tensor(t: Tensor) -> TensorType: ... + +# Defined in torch/csrc/jit/python/python_tree_views.cpp +class SourceRange: ... +class TreeView: ... + +class Ident(TreeView): + @property + def name(self) -> str: ... + +class ClassDef(TreeView): ... + +class Def(TreeView): + def name(self) -> Ident: ... + +class Decl(TreeView): ... + +# Defined in torch/csrc/distributed/rpc/init.cpp +def _rpc_init() -> _bool: ... + +# Defined in torch/csrc/distributed/autograd/init.cpp +def _dist_autograd_init() -> _bool: ... + +# Defined in torch/csrc/distributed/c10d/init.cpp +def _c10d_init() -> _bool: ... + +# Defined in torch/csrc/distributed/rpc/testing/init.cpp +def _faulty_agent_init() -> _bool: ... +def _register_py_class_for_device(device: str, cls: Any) -> None: ... +def _activate_cuda_trace() -> None: ... + +# Defined in torch/csrc/Module.cpp +def _current_graph_task_id() -> _int: ... +def _current_autograd_node() -> _Node: ... + +# Defined in torch/csrc/Exceptions.cpp +class _OutOfMemoryError(RuntimeError): ... +class _DistError(RuntimeError): ... +class _DistBackendError(RuntimeError): ... +class _DistStoreError(RuntimeError): ... +class _DistNetworkError(RuntimeError): ... + +# Defined in torch/csrc/profiler/init.cpp +class CapturedTraceback: + pass +def gather_traceback(python: _bool, script: _bool, cpp: _bool) -> CapturedTraceback: ... +def symbolize_tracebacks(tracebacks: List[CapturedTraceback]) -> List[Dict[str, Any]]: ... + +def _load_mobile_module_from_file(filename: str): ... +def _load_mobile_module_from_bytes(bytes_: bytes): ... +def _load_jit_module_from_file(filename: str): ... +def _load_jit_module_from_bytes(bytes_: bytes): ... +def _save_mobile_module(m: LiteScriptModule, filename: str): ... +def _save_jit_module(m: ScriptModule, filename: str, extra_files: Dict[str, Any]): ... +def _save_mobile_module_to_bytes(m: LiteScriptModule) -> bytes: ... +def _save_jit_module_to_bytes(m: ScriptModule, extra_files: Dict[str, Any]) -> bytes: ... +def _get_module_info_from_flatbuffer(data: bytes): ... +def _jit_resolve_packet(op_name: str, *args, **kwargs) -> str: ... +def _swap_tensor_impl(t1: Tensor, t2: Tensor): ... +def _save_pickle(obj: Any) -> bytes: ... + +# Defined in torch/csrc/jit/runtime/static/init.cpp +def _jit_to_static_module(graph_or_module: Union[Graph,ScriptModule]) -> Any: ... +def _fuse_to_static_module(graph_or_module: Union[Graph,ScriptModule], min_size: _int) -> Any: ... diff --git a/MLPY/Lib/site-packages/torch/_C/_aoti.pyi b/MLPY/Lib/site-packages/torch/_C/_aoti.pyi new file mode 100644 index 0000000000000000000000000000000000000000..6a567acae8b4d281361fb2db9855e10184b03cdd --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_C/_aoti.pyi @@ -0,0 +1,3 @@ +# Defined in torch/csrc/inductor/aoti_runner/pybind.cpp +class AOTIModelContainerRunnerCpu: ... +class AOTIModelContainerRunnerCuda: ... 
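The JitType hierarchy stubbed above (TensorType, IntType, ListType, ...) is what TorchScript reports when a scripted graph is inspected from Python. A small illustrative sketch using only the public torch.jit surface; the function f is invented for the example:

import torch

# The types printed here are instances of the JitType subclasses stubbed above.
@torch.jit.script
def f(x: torch.Tensor, n: int) -> torch.Tensor:
    return x * n

for value in f.graph.inputs():
    t = value.type()
    print(t.kind(), t.annotation_str)   # e.g. "TensorType Tensor", "IntType int"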
diff --git a/MLPY/Lib/site-packages/torch/_C/_autograd.pyi b/MLPY/Lib/site-packages/torch/_C/_autograd.pyi new file mode 100644 index 0000000000000000000000000000000000000000..027c69854ec12fcda0397d9f00729f18d41695a4 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_C/_autograd.pyi @@ -0,0 +1,123 @@ +from enum import Enum +from typing import Any, Callable, List, Optional, Set + +import torch + +from ._profiler import ( + _ProfilerEvent, + ActiveProfilerType, + ProfilerActivity, + ProfilerConfig, +) + +# Defined in tools/autograd/init.cpp + +class DeviceType(Enum): + CPU = ... + CUDA = ... + MKLDNN = ... + OPENGL = ... + OPENCL = ... + IDEEP = ... + HIP = ... + FPGA = ... + ORT = ... + XLA = ... + MPS = ... + HPU = ... + Meta = ... + Vulkan = ... + Metal = ... + PrivateUse1 = ... + +class ProfilerEvent: + def cpu_elapsed_us(self, other: ProfilerEvent) -> float: ... + def cpu_memory_usage(self) -> int: ... + def cuda_elapsed_us(self, other: ProfilerEvent) -> float: ... + def privateuse1_elapsed_us(self, other: ProfilerEvent) -> float: ... + def cuda_memory_usage(self) -> int: ... + def device(self) -> int: ... + def handle(self) -> int: ... + def has_cuda(self) -> bool: ... + def is_remote(self) -> bool: ... + def kind(self) -> int: ... + def name(self) -> str: ... + def node_id(self) -> int: ... + def sequence_nr(self) -> int: ... + def shapes(self) -> List[List[int]]: ... + def thread_id(self) -> int: ... + def flops(self) -> float: ... + def is_async(self) -> bool: ... + +class _KinetoEvent: + def name(self) -> str: ... + def device_index(self) -> int: ... + def start_us(self) -> int: ... + def duration_us(self) -> int: ... + def is_async(self) -> bool: ... + def linked_correlation_id(self) -> int: ... + def shapes(self) -> List[List[int]]: ... + def dtypes(self) -> List[str]: ... + def concrete_inputs(self) -> List[Any]: ... + def device_type(self) -> DeviceType: ... + def start_thread_id(self) -> int: ... + def end_thread_id(self) -> int: ... + def correlation_id(self) -> int: ... + def fwd_thread_id(self) -> int: ... + def stack(self) -> List[str]: ... + def scope(self) -> int: ... + def sequence_nr(self) -> int: ... + def flops(self) -> int: ... + def cuda_elapsed_us(self) -> int: ... + def privateuse1_elapsed_us(self) -> int: ... + +class _ProfilerResult: + def events(self) -> List[_KinetoEvent]: ... + def legacy_events(self) -> List[List[ProfilerEvent]]: ... + def save(self, path: str) -> None: ... + def experimental_event_tree(self) -> List[_ProfilerEvent]: ... + def trace_start_us(self) -> int: ... + +class SavedTensor: ... + +def _enable_profiler( + config: ProfilerConfig, + activities: Set[ProfilerActivity], +) -> None: ... +def _prepare_profiler( + config: ProfilerConfig, + activities: Set[ProfilerActivity], +) -> None: ... +def _disable_profiler() -> _ProfilerResult: ... +def _profiler_enabled() -> bool: ... +def _add_metadata_json(key: str, value: str) -> None: ... +def _kineto_step() -> None: ... +def _get_sequence_nr() -> int: ... +def kineto_available() -> bool: ... +def _record_function_with_args_enter(name: str, *args) -> torch.Tensor: ... +def _record_function_with_args_exit(handle: torch.Tensor) -> None: ... +def _supported_activities() -> Set[ProfilerActivity]: ... +def _enable_record_function(enable: bool) -> None: ... +def _set_empty_test_observer(is_global: bool, sampling_prob: float) -> None: ... +def _push_saved_tensors_default_hooks( + pack_hook: Callable[[torch.Tensor], Any], + unpack_hook: Callable[[Any], torch.Tensor], +) -> None: ... 
+def _pop_saved_tensors_default_hooks() -> None: ... +def _unsafe_set_version_counter(t: torch.Tensor, prev_version: int) -> None: ... +def _enable_profiler_legacy(config: ProfilerConfig) -> None: ... +def _disable_profiler_legacy() -> List[List[ProfilerEvent]]: ... +def _profiler_type() -> ActiveProfilerType: ... +def _saved_tensors_hooks_enable() -> None: ... +def _saved_tensors_hooks_disable(message: str) -> None: ... +def _saved_tensors_hooks_get_disabled_error_message() -> Optional[str]: ... + +class CreationMeta(Enum): + DEFAULT = ... + IN_CUSTOM_FUNCTION = ... + MULTI_OUTPUT_NODE = ... + NO_GRAD_MODE = ... + INFERENCE_MODE = ... + +def _set_creation_meta(t: torch.Tensor, creation_meta: CreationMeta) -> None: ... +def _get_creation_meta(t: torch.Tensor) -> CreationMeta: ... diff --git a/MLPY/Lib/site-packages/torch/_C/_cpu.pyi b/MLPY/Lib/site-packages/torch/_C/_cpu.pyi new file mode 100644 index 0000000000000000000000000000000000000000..9dfd41a9f6dee4cceb52e89cb6c20c5c06c941b1 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_C/_cpu.pyi @@ -0,0 +1,5 @@ +from torch.types import _bool + +# Defined in torch/csrc/cpu/Module.cpp + +def _is_cpu_support_vnni() -> _bool: ... diff --git a/MLPY/Lib/site-packages/torch/_C/_cudnn.pyi b/MLPY/Lib/site-packages/torch/_C/_cudnn.pyi new file mode 100644 index 0000000000000000000000000000000000000000..15d6289a9180e36bf02fb7e726675302826b3daf --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_C/_cudnn.pyi @@ -0,0 +1,17 @@ +from enum import Enum + +from torch.types import _bool, Tuple + +# Defined in torch/csrc/cuda/shared/cudnn.cpp +is_cuda: _bool + +def getRuntimeVersion() -> Tuple[int, int, int]: ... +def getCompileVersion() -> Tuple[int, int, int]: ... +def getVersionInt() -> int: ... + +class RNNMode(int, Enum): + value: int + rnn_relu = ... + rnn_tanh = ... + lstm = ... + gru = ... diff --git a/MLPY/Lib/site-packages/torch/_C/_distributed_autograd.pyi b/MLPY/Lib/site-packages/torch/_C/_distributed_autograd.pyi new file mode 100644 index 0000000000000000000000000000000000000000..b1a4062d58a119a4a352a2e565a94100b466d420 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_C/_distributed_autograd.pyi @@ -0,0 +1,26 @@ +from typing import Any, Dict, List, Set + +import torch + +# This module is defined in torch/csrc/distributed/autograd/init.cpp + +class DistAutogradContext: + def _context_id(self) -> int: ... + def _recv_functions(self) -> Dict[int, Any]: ... + def _send_functions(self) -> Dict[int, Any]: ... + def _known_worker_ids(self) -> Set[int]: ... + +def _new_context() -> DistAutogradContext: ... +def _release_context(context_id: int) -> None: ... +def _get_max_id() -> int: ... +def _is_valid_context(worker_id: int) -> bool: ... +def _retrieve_context(context_id: int) -> DistAutogradContext: ... +def _current_context() -> DistAutogradContext: ... +def _init(worker_id: int) -> None: ... +def _get_debug_info() -> Dict[str, str]: ... +def backward( + context_id: int, + roots: List[torch.Tensor], + retain_graph=False, +) -> None: ... +def get_gradients(context_id: int) -> Dict[torch.Tensor, torch.Tensor]: ... 
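backward and get_gradients in _distributed_autograd.pyi back the public torch.distributed.autograd module. A hedged single-worker sketch (a real job would span several RPC workers; the address, port and model are placeholders):

import os
import torch
import torch.distributed.autograd as dist_autograd
import torch.distributed.rpc as rpc

# Placeholder rendezvous settings for a single local worker.
os.environ.setdefault("MASTER_ADDR", "localhost")
os.environ.setdefault("MASTER_PORT", "29500")
rpc.init_rpc("worker0", rank=0, world_size=1)   # also initializes dist autograd

model = torch.nn.Linear(4, 1)
with dist_autograd.context() as context_id:
    loss = model(torch.randn(2, 4)).sum()
    dist_autograd.backward(context_id, [loss])
    grads = dist_autograd.get_gradients(context_id)  # Dict[Tensor, Tensor]

rpc.shutdown()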
diff --git a/MLPY/Lib/site-packages/torch/_C/_distributed_c10d.pyi b/MLPY/Lib/site-packages/torch/_C/_distributed_c10d.pyi new file mode 100644 index 0000000000000000000000000000000000000000..e0d0cdef4a575a09335b52f85d0f976169103b38 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_C/_distributed_c10d.pyi @@ -0,0 +1,590 @@ +# mypy: disable-error-code="type-arg" +from datetime import timedelta +from enum import Enum +from typing import Any, Dict, List, Optional, overload, Tuple, Union + +import torch +from torch import Tensor +from torch._C import ScriptObject +from torch.futures import Future + +# This module is defined in torch/csrc/distributed/c10d/init.cpp + +_DEFAULT_FIRST_BUCKET_BYTES: int +_DEFAULT_NO_TIMEOUT: timedelta +_DEFAULT_PG_TIMEOUT: timedelta +_DEFAULT_PG_NCCL_TIMEOUT: timedelta + +class BuiltinCommHookType(Enum): + ALLREDUCE = ... + FP16_COMPRESS = ... + +def _register_comm_hook(reducer: Reducer, state: Any, comm_hook: Any): ... +def _register_builtin_comm_hook( + reducer: Reducer, + comm_hook_type: BuiltinCommHookType, +): ... +def _set_global_rank(rank: int) -> None: ... +def _hash_tensors(tensors: List[Tensor]) -> int: ... + +class GradBucket: + def index(self) -> int: ... + def buffer(self) -> Tensor: ... + def gradients(self) -> List[Tensor]: ... + def is_last(self) -> bool: ... + def set_buffer(self, tensor: Tensor) -> None: ... + def parameters(self) -> List[Tensor]: ... + +class Reducer: + def __init__( + self, + params: List[Tensor], + bucket_indices: List[List[int]], + per_bucket_size_limits: List[int], + process_group: ProcessGroup, + expect_sparse_gradients: List[bool] = ..., + bucket_bytes_cap: int = ..., # kDefaultBucketBytesCap in reducer.hpp + find_unused_parameters: bool = ..., + gradient_as_bucket_view: bool = ..., + param_to_name_mapping: Dict[int, str] = ..., + first_bucket_types_cap: int = ..., # kDefaultFirstBucketBytes in reducer.hpp + ): ... + def prepare_for_forward(self) -> None: ... + def prepare_for_backward(self, output: List[Tensor]) -> None: ... + def get_backward_stats(self) -> List[int]: ... + def _install_post_backward_futures(self, futures: List[Future]) -> None: ... + def _rebuild_buckets(self) -> bool: ... + def _get_zeros_like_grad_buckets(self) -> List[GradBucket]: ... + def _push_all_rebuilt_params(self) -> None: ... + def _set_forward_pass_work_handle( + self, + work: Work, + use_static_world_size: bool, + ): ... + def _get_local_used_map(self) -> Tensor: ... + def _set_ddp_runtime_logging_sample_rate(self, sample_rate: int) -> None: ... + def _set_static_graph(self) -> None: ... + def _run_comm_hook(self, bucket: GradBucket) -> Future: ... + def set_logger(self, logger: Logger) -> None: ... + def _remove_autograd_hooks(self) -> None: ... + def _check_reducer_finalized(self) -> None: ... + def _set_sparse_metadata(self, global_unique_ids: Dict[str, Tensor]) -> None: ... + def _reset_state(self) -> None: ... + def _update_process_group(self, new_process_group: ProcessGroup) -> None: ... + +class DDPLoggingData: + strs_map: Dict[str, str] + ints_map: Dict[str, int] + +class Logger: + def __init__(self, reducer: Reducer): ... + def set_construction_data_and_log( + self, + module_name: str, + device_ids: List[int], + output_device: int, + broadcast_buffers: bool, + has_sync_bn: bool, + static_graph: bool, + ): ... + def set_runtime_stats_and_log(self) -> None: ... + def set_error_and_log(self, error: str) -> None: ... + def _get_ddp_logging_data(self) -> DDPLoggingData: ... + def _set_comm_hook_name(self, comm_hook: str) -> None: ... 
+ def _set_uneven_input_join(self) -> None: ... + def _set_static_graph(self) -> None: ... + +def get_debug_level(): ... +def set_debug_level(): ... +def set_debug_level_from_env(): ... + +class DebugLevel(Enum): + OFF = ... + INFO = ... + DETAIL = ... + +class ReduceOp: + def __init__(self, op: RedOpType): ... + + SUM: RedOpType = ... + AVG: RedOpType = ... + PRODUCT: RedOpType = ... + MIN: RedOpType = ... + MAX: RedOpType = ... + BAND: RedOpType = ... + BOR: RedOpType = ... + BXOR: RedOpType = ... + PREMUL_SUM: RedOpType = ... + UNUSED: RedOpType = ... + + class RedOpType(Enum): ... + +class BroadcastOptions: + rootRank: int + rootTensor: int + timeout: timedelta + asyncOp: bool + +class AllreduceOptions: + reduceOp: ReduceOp + timeout: timedelta + +class AllreduceCoalescedOptions(AllreduceOptions): ... + +class ReduceOptions: + reduceOp: ReduceOp + rootRank: int + rootTensor: int + timeout: timedelta + +class AllgatherOptions: + timeout: timedelta + asyncOp: bool + +class GatherOptions: + rootRank: int + timeout: timedelta + +class ScatterOptions: + rootRank: int + timeout: timedelta + asyncOp: bool + +class ReduceScatterOptions: + reduceOp: ReduceOp + timeout: timedelta + asyncOp: bool + +class BarrierOptions: + device_ids: List[int] + device: torch.device + timeout: timedelta + +class AllToAllOptions: + timeout: timedelta + +class Store: + def set(self, key: str, value: str): ... + def get(self, key: str) -> bytes: ... + def add(self, key: str, value: int) -> int: ... + def compare_set( + self, + key: str, + expected_value: str, + desired_value: str, + ) -> bytes: ... + def delete_key(self, key: str) -> bool: ... + def num_keys(self) -> int: ... + def set_timeout(self, timeout: timedelta): ... + @overload + def wait(self, keys: List[str]): ... + @overload + def wait(self, keys: List[str], timeout: timedelta): ... + +class FileStore(Store): + def __init__(self, path: str, numWorkers: int = ...): ... + +class HashStore(Store): + def __init__(self): ... + +class TCPStore(Store): + def __init__( + self, + host_name: str, + port: int, + world_size: Optional[int] = ..., + is_master: bool = ..., + timeout: timedelta = ..., + wait_for_workers: bool = ..., + multi_tenant: bool = ..., + master_listen_fd: Optional[int] = ..., + use_libuv: Optional[bool] = ..., + ): ... + @property + def host(self) -> str: ... + @property + def port(self) -> int: ... + +class PrefixStore(Store): + def __init__(self, prefix: str, store: Store): ... + @property + def underlying_store(self) -> Store: ... + +class _DistributedBackendOptions: + def __init__(self): ... + @property + def store(self) -> Store: ... + @store.setter + def store(self, store: Store) -> None: ... + @property + def group_rank(self) -> int: ... + @group_rank.setter + def group_rank(self, rank: int) -> None: ... + @property + def group_size(self) -> int: ... + @group_size.setter + def group_size(self, size: int) -> None: ... + @property + def timeout(self) -> timedelta: ... + @timeout.setter + def timeout(self, timeout: timedelta) -> None: ... + @property + def group_id(self) -> str: ... + @group_id.setter + def group_id(self, group_id: str) -> None: ... + @property + def global_ranks_in_group(self) -> List[int]: ... + @global_ranks_in_group.setter + def global_ranks_in_group(self, ranks: List[int]) -> None: ... + +class Work: + def is_completed(self) -> bool: ... + def is_success(self) -> bool: ... + def exception(self) -> Any: ... + def wait(self, timeout: timedelta = ...) -> bool: ... + def get_future(self) -> Future: ... 
+ def source_rank(self) -> int: ... + def _source_rank(self) -> int: ... + def result(self) -> List[Tensor]: ... + def synchronize(self): ... + def boxed(self) -> ScriptObject: ... + @staticmethod + def unbox(obj: ScriptObject) -> Work: ... + +class Backend: + def __init__( + self, + rank: int, + size: int, + ): ... + @property + def supports_splitting(self) -> bool: ... + def rank(self) -> int: ... + def size(self) -> int: ... + def eager_connect_single_device(self, device: Optional[torch.device]) -> None: ... + def _set_sequence_number_for_group(self) -> None: ... + +class ProcessGroup: + class Options: + def __init__(self, backend: str, timeout: timedelta = ...): ... + @property + def backend(self) -> str: ... + @property + def _timeout(self) -> timedelta: ... + @_timeout.setter + def _timeout(self, val: timedelta) -> None: ... + + class BackendType(Enum): + UNDEFINED = ... + GLOO = ... + NCCL = ... + UCC = ... + MPI = ... + CUSTOM = ... + def __init__(self, store: Store, rank: int, size: int, options: Options): ... + def rank(self) -> int: ... + def size(self) -> int: ... + @overload + def broadcast( + self, + tensors: List[Tensor], + opts=..., + ) -> Work: ... + @overload + def broadcast( + self, + tensor: Tensor, + root: int, + ) -> Work: ... + @overload + def allreduce( + self, + tensors: List[Tensor], + opts: AllreduceOptions = ..., + ) -> Work: ... + @overload + def allreduce( + self, + tensors: List[Tensor], + op=..., + ) -> Work: ... + @overload + def allreduce( + self, + tensor: Tensor, + op=..., + ) -> Work: ... + def allreduce_coalesced( + self, + tensors: List[Tensor], + opts=..., + ) -> Work: ... + def reduce_scatter_tensor_coalesced( + self, + outputTensors: List[Tensor], + inputTensors: List[Tensor], + opts: Optional[ReduceScatterOptions] = None, + ) -> Work: ... + @overload + def reduce( + self, + tensors: List[Tensor], + opts=..., + ) -> Work: ... + @overload + def reduce( + self, + tensor: Tensor, + root: int, + op=..., + ) -> Work: ... + @overload + def allgather( + self, + output_tensors: List[List[Tensor]], + input_tensors: List[Tensor], + opts=..., + ) -> Work: ... + @overload + def allgather( + self, + output_tensors: List[Tensor], + input_tensor: Tensor, + ) -> Work: ... + def _allgather_base( + self, + output: Tensor, + input: Tensor, + opts=..., + ) -> Work: ... + def allgather_coalesced( + self, + output_lists: List[List[Tensor]], + input_list: List[Tensor], + opts=..., + ) -> Work: ... + def allgather_into_tensor_coalesced( + self, + output_lists: List[Tensor], + input_list: List[Tensor], + opts=..., + ) -> Work: ... + @overload + def gather( + self, + output_tensors: List[List[Tensor]], + input_tensors: List[Tensor], + opts=..., + ) -> Work: ... + @overload + def gather( + self, + output_tensors: List[Tensor], + input_tensor: Tensor, + root: int, + ) -> Work: ... + @overload + def scatter( + self, + output_tensors: List[Tensor], + input_tensors: List[List[Tensor]], + opts=..., + ) -> Work: ... + @overload + def scatter( + self, + output_tensor: Tensor, + input_tensors: List[Tensor], + root: int, + ) -> Work: ... + @overload + def reduce_scatter( + self, + output_tensors: List[Tensor], + input_tensors: List[List[Tensor]], + opts=..., + ) -> Work: ... + @overload + def reduce_scatter( + self, + output_tensors: Tensor, + input_tensor: List[Tensor], + ) -> Work: ... + def _reduce_scatter_base( + self, + outputTensor: Tensor, + inputTensor: Tensor, + opts: Optional[ReduceScatterOptions], + ) -> Work: ... 
+ @overload + def alltoall_base( + self, + output_tensor: Tensor, + input_tensor: Tensor, + output_split_sizes: List[int], + input_split_sizes: List[int], + opts=..., + ) -> Work: ... + @overload + def alltoall_base( + self, + output: Tensor, + input: Tensor, + output_split_sizes: List[int], + input_split_sizes: List[int], + ) -> Work: ... + @overload + def alltoall( + self, + output_tensor: List[Tensor], + input_tensor: List[Tensor], + opts=..., + ) -> Work: ... + @overload + def alltoall( + self, + output: List[Tensor], + input: List[Tensor], + ) -> Work: ... + def send( + self, + tensors: List[Tensor], + dstRank: int, + tag: int, + ) -> Work: ... + def recv( + self, + tensors: List[Tensor], + srcRank: int, + tag: int, + ) -> Work: ... + def recv_anysource(self, tensors: List[Tensor], tag: int) -> Work: ... + def barrier(self, opts=...) -> Work: ... + def boxed(self) -> ScriptObject: ... + @staticmethod + def unbox(obj: ScriptObject) -> ProcessGroup: ... + def _start_coalescing(self, device: torch.device) -> None: ... + def _end_coalescing(self, device: torch.device) -> Work: ... + def _get_backend_name(self) -> str: ... + def _backend_id(self, backend_type: BackendType) -> int: ... + @property + def _device_types(self) -> List[torch.device]: ... + def _get_backend(self, device: torch.device) -> Backend: ... + def _register_backend( + self, + device: torch.device, + backend_type: BackendType, + backend: Optional[Backend], + ) -> None: ... + def _set_group_name(self, name: str) -> None: ... + def name(self) -> str: ... + def _has_hooks(self) -> bool: ... + def _wait_for_pending_works(self) -> None: ... + def _set_sequence_number_for_group(self) -> None: ... + @property + def bound_device_id(self) -> Optional[torch.device]: ... + @bound_device_id.setter + def bound_device_id(self, device: Optional[torch.device]) -> None: ... + @property + def group_name(self) -> str: ... + +class ProcessGroupRoundRobin(ProcessGroup): ... + +def _round_robin_process_groups( + process_groups: List[ProcessGroup], +) -> ProcessGroupRoundRobin: ... + +class ProcessGroupGloo(Backend): + class Device: ... + class Options: ... + + def __init__( + self, + store: Store, + rank: int, + size: int, + timeout: timedelta, + ): ... + @staticmethod + def create_device(hostname="", interface="") -> Device: ... + @staticmethod + def create_default_device() -> Device: ... + def _set_default_timeout(self, timeout) -> None: ... + +class _ProcessGroupWrapper(Backend): + def __init__(self, pg: Backend, gloo_pg: ProcessGroupGloo): ... + wrapped_pg: Backend + +class ProcessGroupNCCL(Backend): + class Options: + def __init__(self, timeout: Optional[timedelta] = None): ... + @property + def backend(self) -> str: ... + @property + def _timeout(self) -> timedelta: ... + @_timeout.setter + def _timeout(self, val: timedelta) -> None: ... + @property + def _is_high_priority_stream(self) -> bool: ... + @_is_high_priority_stream.setter + def _is_high_priority_stream(self, val: bool) -> None: ... + + def __init__( + self, + store: Store, + rank: int, + size: int, + timeout: timedelta, + ): ... + def _group_start(self) -> None: ... + def _group_end(self) -> None: ... + def _set_default_timeout(self, timeout) -> None: ... + def _shutdown(self) -> None: ... + @property + def uid(self) -> int: ... + +class ProcessGroupUCC(Backend): + def __init__( + self, + store: Store, + rank: int, + size: int, + timeout: timedelta, + ): ... + +class ProcessGroupMPI(Backend): + def __init__( + self, + rank: int, + size: int, + pgComm: int, + ): ... 
+ @staticmethod + def create(ranks: List[int]) -> ProcessGroupMPI: ... + +def _compute_bucket_assignment_by_size( + tensors: List[Tensor], + bucket_size_limits: List[int], + expect_sparse_gradient: List[bool] = ..., + tensor_indices: List[int] = ..., +) -> Tuple[List[List[int]], List[int]]: ... +def _broadcast_coalesced( + process_group: ProcessGroup, + tensors: List[Tensor], + buffer_size: int, + src: int, +): ... +def _test_python_store(store: Store): ... +def _verify_params_across_processes( + process_group: ProcessGroup, + params: List[Tensor], + logger: Optional[Logger], +): ... +def _make_nccl_premul_sum(factor: Union[float, List[Tensor]]) -> ReduceOp: ... +def _register_process_group( + group_name: str, + process_group: ProcessGroup, +) -> None: ... +def _resolve_process_group(group_name: str) -> ProcessGroup: ... +def _unregister_all_process_groups() -> None: ... +def _unregister_process_group(group_name: str) -> None: ... diff --git a/MLPY/Lib/site-packages/torch/_C/_distributed_rpc.pyi b/MLPY/Lib/site-packages/torch/_C/_distributed_rpc.pyi new file mode 100644 index 0000000000000000000000000000000000000000..8ecf79635a4131324743a256f07af50f196c70df --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_C/_distributed_rpc.pyi @@ -0,0 +1,188 @@ +# mypy: disable-error-code="type-arg" +from datetime import timedelta +from typing import Any, Dict, Generic, List, Optional, overload, Tuple, Type, TypeVar + +import torch + +from . import Future +from ._autograd import ProfilerEvent +from ._distributed_c10d import Store +from ._profiler import ProfilerConfig + +# This module is defined in torch/csrc/distributed/rpc/init.cpp + +_DEFAULT_INIT_METHOD: str +_DEFAULT_NUM_WORKER_THREADS: int +_UNSET_RPC_TIMEOUT: float +_DEFAULT_RPC_TIMEOUT_SEC: float + +_T = TypeVar("_T") + +class RpcBackendOptions: + rpc_timeout: float + init_method: str + def __init__( + self, + rpc_timeout: float = ..., + init_method: str = ..., + ): ... + +class WorkerInfo: + def __init__(self, name: str, worker_id: int): ... + @property + def name(self) -> str: ... + @property + def id(self) -> int: ... + def __eq__(self, other: object) -> bool: ... + +class RpcAgent: + def join(self, shutdown: bool = False, timeout: float = 0): ... + def sync(self): ... + def shutdown(self): ... + @overload + def get_worker_info(self) -> WorkerInfo: ... + @overload + def get_worker_info(self, workerName: str) -> WorkerInfo: ... + def get_worker_infos(self) -> List[WorkerInfo]: ... + def _get_device_map(self, dst: WorkerInfo) -> Dict[torch.device, torch.device]: ... + def get_debug_info(self) -> Dict[str, str]: ... + def get_metrics(self) -> Dict[str, str]: ... + +class PyRRef(Generic[_T]): + def __init__(self, value: _T, type_hint: Any = None) -> None: ... + def is_owner(self) -> bool: ... + def confirmed_by_owner(self) -> bool: ... + def owner(self) -> WorkerInfo: ... + def owner_name(self) -> str: ... + def to_here(self, timeout: float = ...) -> _T: ... + def local_value(self) -> Any: ... + def rpc_sync(self, timeout: float = ...) -> Any: ... + def rpc_async(self, timeout: float = ...) -> Any: ... + def remote(self, timeout: float = ...) -> Any: ... + def _serialize(self) -> Tuple: ... + @staticmethod + def _deserialize(tp: Tuple) -> PyRRef: ... + def _get_type(self) -> Type[_T]: ... + def _get_future(self) -> Future[_T]: ... + def _get_profiling_future(self) -> Future[_T]: ... + def _set_profiling_future(self, profilingFuture: Future[_T]): ... 
+ +class _TensorPipeRpcBackendOptionsBase(RpcBackendOptions): + num_worker_threads: int + device_maps: Dict[str, Dict[torch.device, torch.device]] + devices: List[torch.device] + def __init__( + self, + num_worker_threads: int, + _transports: Optional[List], + _channels: Optional[List], + rpc_timeout: float = ..., + init_method: str = ..., + device_maps: Dict[str, Dict[torch.device, torch.device]] = {}, # noqa: B006 + devices: List[torch.device] = [], # noqa: B006 + ): ... + def _set_device_map( + self, + to: str, + device_map: Dict[torch.device, torch.device], + ): ... + +class TensorPipeAgent(RpcAgent): + def __init__( + self, + store: Store, + name: str, + worker_id: int, + world_size: Optional[int], + opts: _TensorPipeRpcBackendOptionsBase, + reverse_device_maps: Dict[str, Dict[torch.device, torch.device]], + devices: List[torch.device], + ): ... + def join(self, shutdown: bool = False, timeout: float = 0): ... + def shutdown(self): ... + @overload + def get_worker_info(self) -> WorkerInfo: ... + @overload + def get_worker_info(self, workerName: str) -> WorkerInfo: ... + @overload + def get_worker_info(self, id: int) -> WorkerInfo: ... + def get_worker_infos(self) -> List[WorkerInfo]: ... + def _get_device_map(self, dst: WorkerInfo) -> Dict[torch.device, torch.device]: ... + def _update_group_membership( + self, + worker_info: WorkerInfo, + my_devices: List[torch.device], + reverse_device_map: Dict[str, Dict[torch.device, torch.device]], + is_join: bool, + ): ... + def _get_backend_options(self) -> _TensorPipeRpcBackendOptionsBase: ... + @property + def is_static_group(self) -> bool: ... + @property + def store(self) -> Store: ... + +def _is_current_rpc_agent_set() -> bool: ... +def _get_current_rpc_agent() -> RpcAgent: ... +def _set_and_start_rpc_agent(agent: RpcAgent): ... +def _reset_current_rpc_agent(): ... +def _delete_all_user_and_unforked_owner_rrefs(timeout: timedelta = ...): ... +def _destroy_rref_context(ignoreRRefLeak: bool): ... +def _rref_context_get_debug_info() -> Dict[str, str]: ... +def _cleanup_python_rpc_handler(): ... +def _invoke_rpc_builtin( + dst: WorkerInfo, + opName: str, + rpcTimeoutSeconds: float, + *args: Any, + **kwargs: Any, +): ... +def _invoke_rpc_python_udf( + dst: WorkerInfo, + pickledPythonUDF: str, + tensors: List[torch.Tensor], + rpcTimeoutSeconds: float, + isAsyncExecution: bool, +): ... +def _invoke_rpc_torchscript( + dstWorkerName: str, + qualifiedNameStr: str, + argsTuple: Tuple, + kwargsDict: Dict, + rpcTimeoutSeconds: float, + isAsyncExecution: bool, +): ... +def _invoke_remote_builtin( + dst: WorkerInfo, + opName: str, + rpcTimeoutSeconds: float, + *args: Any, + **kwargs: Any, +): ... +def _invoke_remote_python_udf( + dst: WorkerInfo, + pickledPythonUDF: str, + tensors: List[torch.Tensor], + rpcTimeoutSeconds: float, + isAsyncExecution: bool, +): ... +def _invoke_remote_torchscript( + dstWorkerName: WorkerInfo, + qualifiedNameStr: str, + rpcTimeoutSeconds: float, + isAsyncExecution: bool, + *args: Any, + **kwargs: Any, +): ... +def get_rpc_timeout() -> float: ... +def enable_gil_profiling(flag: bool): ... +def _set_rpc_timeout(rpcTimeoutSeconds: float): ... + +class RemoteProfilerManager: + @staticmethod + def set_current_profiling_key(key: str): ... + +def _enable_server_process_global_profiler(new_config: ProfilerConfig): ... +def _disable_server_process_global_profiler() -> List[List[List[ProfilerEvent]]]: ... +def _set_profiler_node_id(default_node_id: int): ... +def _enable_jit_rref_pickle(): ... +def _disable_jit_rref_pickle(): ... 
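These RPC bindings surface publicly as torch.distributed.rpc. A hedged, single-process sketch of rpc_async, remote and the PyRRef methods stubbed above (worker name, address and port are illustrative):

import os
import torch
import torch.distributed.rpc as rpc

os.environ.setdefault("MASTER_ADDR", "localhost")
os.environ.setdefault("MASTER_PORT", "29501")
rpc.init_rpc("worker0", rank=0, world_size=1)

# Asynchronous call returning a Future.
fut = rpc.rpc_async("worker0", torch.add, args=(torch.ones(2), 3))
print(fut.wait())            # tensor([4., 4.])

# remote() returns an RRef (PyRRef above); to_here() fetches the value.
rref = rpc.remote("worker0", torch.add, args=(torch.ones(2), 3))
print(rref.to_here())

rpc.shutdown()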
diff --git a/MLPY/Lib/site-packages/torch/_C/_distributed_rpc_testing.pyi b/MLPY/Lib/site-packages/torch/_C/_distributed_rpc_testing.pyi new file mode 100644 index 0000000000000000000000000000000000000000..bf66235d1eaea85bafde64e316a5b3168ecb61db --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_C/_distributed_rpc_testing.pyi @@ -0,0 +1,35 @@ +from typing import Dict, List + +import torch + +from ._distributed_c10d import Store +from ._distributed_rpc import _TensorPipeRpcBackendOptionsBase, TensorPipeAgent + +# This module is defined in torch/csrc/distributed/rpc/testing/init.cpp + +class FaultyTensorPipeRpcBackendOptions(_TensorPipeRpcBackendOptionsBase): + def __init__( + self, + num_worker_threads: int, + rpc_timeout: float, + init_method: str, + messages_to_fail: List[str], + messages_to_delay: Dict[str, float], + num_fail_sends: int, + ): ... + num_send_recv_threads: int + messages_to_fail: List[str] + messages_to_delay: Dict[str, float] + num_fail_sends: int + +class FaultyTensorPipeAgent(TensorPipeAgent): + def __init__( + self, + store: Store, + name: str, + rank: int, + world_size: int, + options: FaultyTensorPipeRpcBackendOptions, + reverse_device_maps: Dict[str, Dict[torch.device, torch.device]], + devices: List[torch.device], + ): ... diff --git a/MLPY/Lib/site-packages/torch/_C/_functions.pyi b/MLPY/Lib/site-packages/torch/_C/_functions.pyi new file mode 100644 index 0000000000000000000000000000000000000000..c50c31039b91219308ff1ce00c90ac7d77870f19 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_C/_functions.pyi @@ -0,0 +1,11 @@ +from typing import AnyStr, List + +from torch import Tensor + +class UndefinedGrad: + def __init__(self) -> None: ... + def __call__(self, *inputs: Tensor) -> List[Tensor]: ... + +class DelayedError: + def __init__(self, msg: AnyStr, num_inputs: int) -> None: ... + def __call__(self, inputs: List[Tensor]) -> List[Tensor]: ... diff --git a/MLPY/Lib/site-packages/torch/_C/_functorch.pyi b/MLPY/Lib/site-packages/torch/_C/_functorch.pyi new file mode 100644 index 0000000000000000000000000000000000000000..8dce498aa642ad78f96fe1ed5895b0da41f72b4c --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_C/_functorch.pyi @@ -0,0 +1,77 @@ +from enum import Enum +from typing import Optional, Tuple + +from torch import Tensor + +# Defined in torch/csrc/functorch/init.cpp + +def _set_dynamic_layer_keys_included(included: bool) -> None: ... +def get_unwrapped(tensor: Tensor) -> Tensor: ... +def is_batchedtensor(tensor: Tensor) -> bool: ... +def is_functionaltensor(tensor: Tensor) -> bool: ... +def is_functorch_wrapped_tensor(tensor: Tensor) -> bool: ... +def is_gradtrackingtensor(tensor: Tensor) -> bool: ... +def maybe_get_bdim(tensor: Tensor) -> int: ... +def maybe_get_level(tensor: Tensor) -> int: ... +def maybe_current_level() -> Optional[int]: ... +def unwrap_if_dead(tensor: Tensor) -> Tensor: ... +def _unwrap_for_grad(tensor: Tensor, level: int) -> Tensor: ... +def _wrap_for_grad(tensor: Tensor, level: int) -> Tensor: ... +def _unwrap_batched(tensor: Tensor, level: int) -> Tuple[Tensor, Optional[int]]: ... +def current_level() -> int: ... +def _add_batch_dim(tensor: Tensor, bdim: int, level: int) -> Tensor: ... +def set_single_level_autograd_function_allowed(allowed: bool) -> None: ... +def get_single_level_autograd_function_allowed() -> bool: ... +def _unwrap_functional_tensor(tensor: Tensor, reapply_views: bool) -> Tensor: ... +def _wrap_functional_tensor(tensor: Tensor, level: int) -> Tensor: ... 
+def _vmap_increment_nesting(batch_size: int, randomness: str) -> int: ... +def _vmap_decrement_nesting() -> int: ... +def _grad_increment_nesting() -> int: ... +def _grad_decrement_nesting() -> int: ... + +# Defined in aten/src/ATen/functorch/Interpreter.h +class TransformType(Enum): + Torch: TransformType = ... + Vmap: TransformType = ... + Grad: TransformType = ... + Jvp: TransformType = ... + Functionalize: TransformType = ... + +class RandomnessType(Enum): + Error: TransformType = ... + Same: TransformType = ... + Different: TransformType = ... + +class CInterpreter: + def key(self) -> TransformType: ... + def level(self) -> int: ... + +class CGradInterpreterPtr: + def __init__(self, interpreter: CInterpreter): ... + def lift(self, Tensor) -> Tensor: ... + def prevGradMode(self) -> bool: ... + +class CJvpInterpreterPtr: + def __init__(self, interpreter: CInterpreter): ... + def lift(self, Tensor) -> Tensor: ... + def prevFwdGradMode(self) -> bool: ... + +class CFunctionalizeInterpreterPtr: + def __init__(self, interpreter: CInterpreter): ... + def key(self) -> TransformType: ... + def level(self) -> int: ... + def functionalizeAddBackViews(self) -> bool: ... + +class CVmapInterpreterPtr: + def __init__(self, interpreter: CInterpreter): ... + def key(self) -> TransformType: ... + def level(self) -> int: ... + def batchSize(self) -> int: ... + def randomness(self) -> RandomnessType: ... + +class DynamicLayer: ... + +def get_interpreter_stack() -> list[CInterpreter]: ... +def peek_interpreter_stack() -> CInterpreter: ... +def pop_dynamic_layer_stack() -> DynamicLayer: ... +def push_dynamic_layer_stack(dl: DynamicLayer) -> int: ... diff --git a/MLPY/Lib/site-packages/torch/_C/_itt.pyi b/MLPY/Lib/site-packages/torch/_C/_itt.pyi new file mode 100644 index 0000000000000000000000000000000000000000..a6f2559396fde84b318a768d3e6563ba6be93873 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_C/_itt.pyi @@ -0,0 +1,5 @@ +# Defined in torch/csrc/itt.cpp +def is_available() -> None: ... +def rangePush(message: str) -> None: ... +def rangePop() -> None: ... +def mark(message: str) -> None: ... diff --git a/MLPY/Lib/site-packages/torch/_C/_lazy.pyi b/MLPY/Lib/site-packages/torch/_C/_lazy.pyi new file mode 100644 index 0000000000000000000000000000000000000000..7d7889a1981afa90a28eab6ef08ade70280b1e18 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_C/_lazy.pyi @@ -0,0 +1,28 @@ +from typing import List + +from torch import Tensor + +# defined in torch/csrc/lazy/python/init.cpp +def _mark_step(device: str, devices: List[str], wait: bool): ... +def _wait_device_ops(devices: List[str]): ... +def _reset_metrics(): ... +def _counter_names() -> List[str]: ... +def _counter_value(name: str) -> int: ... +def _metrics_report() -> str: ... +def _get_graph_hash(tensors: List[Tensor]) -> str: ... +def _sync_multi( + tensors: List[Tensor], + devices: List[str], + wait: bool = True, + sync_ltc_data: bool = True, +): ... +def _get_tensor_id(tensor: Tensor) -> int: ... +def _get_tensors_text(tensors: List[Tensor]) -> str: ... +def _get_tensors_dot(tensors: List[Tensor]) -> str: ... +def _get_tensors_backend(tensors: List[Tensor]) -> str: ... +def _get_force_fallback() -> str: ... +def _set_force_fallback(newval: str): ... +def _clear_ir_cache(): ... +def _dump_ir_cache(filename: str): ... +def _set_reuse_ir(val: bool): ... +def _get_default_device_type(): ... 
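The _functorch.pyi stubs above (_vmap_increment_nesting, _add_batch_dim, ...) are the plumbing beneath the public torch.func transforms. A minimal sketch of that public API:

import torch

def dot(a, b):
    return (a * b).sum()

# vmap maps dot over the leading dimension of both inputs.
batched = torch.func.vmap(dot)
x, y = torch.randn(8, 3), torch.randn(8, 3)
print(batched(x, y).shape)   # torch.Size([8])

# grad differentiates with respect to the first argument.
grad_dot = torch.func.grad(dot)
print(grad_dot(x[0], y[0]))  # equals y[0]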
diff --git a/MLPY/Lib/site-packages/torch/_C/_lazy_ts_backend.pyi b/MLPY/Lib/site-packages/torch/_C/_lazy_ts_backend.pyi new file mode 100644 index 0000000000000000000000000000000000000000..535af4c4851101fb690292378ffe6da55eb80e32 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_C/_lazy_ts_backend.pyi @@ -0,0 +1,11 @@ +# defined in torch/csrc/lazy/python/init.cpp + +from typing import Any, List, Tuple + +from torch import Tensor + +def _init(): ... +def _get_tensors_ts_device_data_node( + tensors: List[Tensor], +) -> Tuple[List[int], List[Any]]: ... +def _run_cached_graph(hash_str: str, graph_inputs: List[Any]) -> List[Tensor]: ... diff --git a/MLPY/Lib/site-packages/torch/_C/_monitor.pyi b/MLPY/Lib/site-packages/torch/_C/_monitor.pyi new file mode 100644 index 0000000000000000000000000000000000000000..6d33ebde320174a6bd7d4eb9505e4d2245c852ea --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_C/_monitor.pyi @@ -0,0 +1,44 @@ +# Defined in torch/csrc/monitor/python_init.cpp + +import datetime +from enum import Enum +from typing import Callable, Dict, List, Union + +class Aggregation(Enum): + VALUE = ... + MEAN = ... + COUNT = ... + SUM = ... + MAX = ... + MIN = ... + +class Stat: + name: str + count: int + def __init__( + self, + name: str, + aggregations: List[Aggregation], + window_size: int, + max_samples: int = -1, + ) -> None: ... + def add(self, v: float) -> None: ... + def get(self) -> Dict[Aggregation, float]: ... + +class Event: + name: str + timestamp: datetime.datetime + data: Dict[str, Union[int, float, bool, str]] + def __init__( + self, + name: str, + timestamp: datetime.datetime, + data: Dict[str, Union[int, float, bool, str]], + ) -> None: ... + +def log_event(e: Event) -> None: ... + +class EventHandlerHandle: ... + +def register_event_handler(handler: Callable[[Event], None]) -> EventHandlerHandle: ... +def unregister_event_handler(handle: EventHandlerHandle) -> None: ... diff --git a/MLPY/Lib/site-packages/torch/_C/_nn.pyi b/MLPY/Lib/site-packages/torch/_C/_nn.pyi new file mode 100644 index 0000000000000000000000000000000000000000..9beb7e61a5753a6206a426599ae99fe19611d331 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_C/_nn.pyi @@ -0,0 +1,86 @@ +# mypy: disable-error-code="type-arg" +from typing import List, Optional, overload, Sequence, Tuple, Union + +from torch import memory_format, Tensor +from torch.types import _bool, _device, _dtype, _int, _size + +# Defined in tools/autograd/templates/python_nn_functions.cpp + +def adaptive_max_pool2d(input: Tensor, output_size: Union[_int, _size]) -> Tuple[Tensor, Tensor]: ... +def adaptive_max_pool3d(input: Tensor, output_size: Union[_int, _size]) -> Tuple[Tensor, Tensor]: ... +def avg_pool2d(input: Tensor, kernel_size: Union[_int, _size], stride: Optional[Union[_int, _size]] = None, padding: Union[_int, _size] = 0, ceil_mode: bool = False, count_include_pad: bool = True, divisor_override: Optional[int] = None) -> Tensor: ... +def avg_pool3d(input: Tensor, kernel_size: Union[_int, _size], stride: Optional[Union[_int, _size]] = None, padding: Union[_int, _size] = 0, ceil_mode: bool = False, count_include_pad: bool = True, divisor_override: Optional[int] = None) -> Tensor: ... +def elu_(input: Tensor, alpha: float = ...) -> Tensor: ... +def fractional_max_pool2d(input: Tensor, kernel_size: Union[_int, _size], output_size: Union[_int, _size], _random_samples: Tensor) -> Tuple[Tensor, Tensor]: ... 
+def fractional_max_pool3d(input: Tensor, kernel_size: Union[_int, _size], output_size: Union[_int, _size], _random_samples: Tensor) -> Tuple[Tensor, Tensor]: ... +def gelu(input: Tensor, approximate: str = ...) -> Tensor: ... +def hardsigmoid(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +def hardtanh(input: Tensor, min_val: float = ..., max_val: float = ..., *, out: Optional[Tensor] = None) -> Tensor: ... +def hardtanh_(input: Tensor, min_val: float = ..., max_val: float = ...) -> Tensor: ... +def leaky_relu(input: Tensor, negative_slope: float = ..., *, out: Optional[Tensor] = None) -> Tensor: ... +def leaky_relu_(input: Tensor, negative_slope: float = ...) -> Tensor: ... +def linear(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None) -> Tensor: ... +def log_sigmoid(input: Tensor) -> Tensor: ... +def one_hot(tensor: Tensor, num_classes: int = ...) -> Tensor: ... +def pad(input: Tensor, pad: Sequence[int], mode: str = ..., value: Optional[float] = None) -> Tensor: ... +def scaled_dot_product_attention(query: Tensor, key: Tensor, value: Tensor, attn_mask: Optional[Tensor] = None, dropout_p: float = 0.0, is_causal: bool = False, scale: Optional[float] = None) -> Tensor: ... +def softplus(input: Tensor, beta: float = ..., threshold: float = ...) -> Tensor: ... +def softshrink(input: Tensor, lambd: float = ...) -> Tensor: ... + +# Defined in aten/src/ATen/native/mkldnn/Linear.cpp +def mkldnn_linear(input: Tensor, weight: Tensor, bias: Optional[Tensor]) -> Tensor: ... + +# Defined at aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp +def mkldnn_reorder_conv2d_weight( + self: Tensor, + padding: List, + stride: List, + dilatation: List, + groups: int, +) -> Tensor: ... +def mkldnn_reorder_conv3d_weight( + self: Tensor, + padding: List, + stride: List, + dilatation: List, + groups: int, +) -> Tensor: ... + +# Defined in aten/src/ATen/native/mkldnn/Prelu.cpp +def mkldnn_prelu(input: Tensor, weight: Tensor) -> Tensor: ... + +# Defined at tools/autograd/templates/python_nn_functions.cpp +@overload +def _parse_to( + device: _device, + dtype: _dtype, + non_blocking: _bool, + copy: _bool, + *, + memory_format: memory_format, +) -> Tuple[_device, _dtype, _bool, memory_format]: ... +@overload +def _parse_to( + dtype: _dtype, + non_blocking: _bool, + copy: _bool, + *, + memory_format: memory_format, +) -> Tuple[_device, _dtype, _bool, memory_format]: ... +@overload +def _parse_to( + tensor: Tensor, + non_blocking: _bool, + copy: _bool, + *, + memory_format: memory_format, +) -> Tuple[_device, _dtype, _bool, memory_format]: ... + +# Defined in aten/src/ATen/native/PadSequence.cpp +def pad_sequence( + sequences: List[Tensor], + batch_first: bool = False, + padding_value: float = ..., +) -> Tensor: ... +def flatten_dense_tensors(tensors: List[Tensor]) -> Tensor: ... +def unflatten_dense_tensors(flat: Tensor, tensors: List[Tensor]) -> List[Tensor]: ... diff --git a/MLPY/Lib/site-packages/torch/_C/_nvtx.pyi b/MLPY/Lib/site-packages/torch/_C/_nvtx.pyi new file mode 100644 index 0000000000000000000000000000000000000000..ff1b574b947940e31ff403cca714c8d1bec0c50d --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_C/_nvtx.pyi @@ -0,0 +1,6 @@ +# Defined in torch/csrc/cuda/shared/nvtx.cpp +def rangePushA(message: str) -> int: ... +def rangePop() -> int: ... +def rangeStartA(message: str) -> int: ... +def rangeEnd(int) -> None: ... +def markA(message: str) -> None: ... 
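pad_sequence and scaled_dot_product_attention from the _nn stubs above are exposed as torch.nn.utils.rnn.pad_sequence and torch.nn.functional.scaled_dot_product_attention. A brief sketch with arbitrary shapes:

import torch
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence

# Pad three variable-length sequences into one (3, 7, 4) batch.
seqs = [torch.randn(5, 4), torch.randn(3, 4), torch.randn(7, 4)]
padded = pad_sequence(seqs, batch_first=True)
print(padded.shape)          # torch.Size([3, 7, 4])

# Fused attention over (batch, heads, seq, head_dim) inputs.
q = k = v = torch.randn(2, 8, 16, 64)
out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
print(out.shape)             # torch.Size([2, 8, 16, 64])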
diff --git a/MLPY/Lib/site-packages/torch/_C/_onnx.pyi b/MLPY/Lib/site-packages/torch/_C/_onnx.pyi new file mode 100644 index 0000000000000000000000000000000000000000..ac1d0f6d51934957c01ea23e7b0059d4ea42316f --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_C/_onnx.pyi @@ -0,0 +1,40 @@ +# Defined in torch/csrc/onnx/init.cpp + +from enum import Enum + +_CAFFE2_ATEN_FALLBACK: bool +PRODUCER_VERSION: str + +class TensorProtoDataType(Enum): + UNDEFINED = ... + FLOAT = ... + UINT8 = ... + INT8 = ... + UINT16 = ... + INT16 = ... + INT32 = ... + INT64 = ... + STRING = ... + BOOL = ... + FLOAT16 = ... + DOUBLE = ... + UINT32 = ... + UINT64 = ... + COMPLEX64 = ... + COMPLEX128 = ... + BFLOAT16 = ... + FLOAT8E5M2 = ... + FLOAT8E4M3FN = ... + FLOAT8E5M2FNUZ = ... + FLOAT8E4M3FNUZ = ... + +class OperatorExportTypes(Enum): + ONNX = ... + ONNX_ATEN = ... + ONNX_ATEN_FALLBACK = ... + ONNX_FALLTHROUGH = ... + +class TrainingMode(Enum): + EVAL = ... + PRESERVE = ... + TRAINING = ... diff --git a/MLPY/Lib/site-packages/torch/_C/_profiler.pyi b/MLPY/Lib/site-packages/torch/_C/_profiler.pyi new file mode 100644 index 0000000000000000000000000000000000000000..7cc8dc08e673c4906ac78d506b3c0012a5c9b0bd --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_C/_profiler.pyi @@ -0,0 +1,238 @@ +from enum import Enum +from typing import Any, Dict, List, Literal, Optional, Tuple, Union + +from torch._C import device, dtype, layout +from typing_extensions import TypeAlias + +# defined in torch/csrc/profiler/python/init.cpp + +class RecordScope(Enum): + FUNCTION = ... + BACKWARD_FUNCTION = ... + TORCHSCRIPT_FUNCTION = ... + KERNEL_FUNCTION_DTYPE = ... + CUSTOM_CLASS = ... + BUILD_FEATURE = ... + LITE_INTERPRETER = ... + USER_SCOPE = ... + STATIC_RUNTIME_OP = ... + STATIC_RUNTIME_MODEL = ... + +class ProfilerState(Enum): + Disable = ... + CPU = ... + CUDA = ... + NVTX = ... + ITT = ... + KINETO = ... + KINETO_GPU_FALLBACK = ... + KINETO_PRIVATEUSE1_FALLBACK = ... + KINETO_PRIVATEUSE1 = ... + +class ActiveProfilerType(Enum): + NONE = ... + LEGACY = ... + KINETO = ... + NVTX = ... + ITT = ... + +class ProfilerActivity(Enum): + CPU = ... + CUDA = ... + MTIA = ... + PrivateUse1 = ... + +class _EventType(Enum): + TorchOp = ... + Backend = ... + Allocation = ... + OutOfMemory = ... + PyCall = ... + PyCCall = ... + Kineto = ... + +class _ExperimentalConfig: + def __init__( + self, + profiler_metrics: List[str] = ..., + profiler_measure_per_kernel: bool = ..., + verbose: bool = ..., + performance_events: List[str] = ..., + enable_cuda_sync_events: bool = ..., + ) -> None: ... + +class ProfilerConfig: + def __init__( + self, + state: ProfilerState, + report_input_shapes: bool, + profile_memory: bool, + with_stack: bool, + with_flops: bool, + with_modules: bool, + experimental_config: _ExperimentalConfig, + ) -> None: ... 
+ +class _ProfilerEvent: + start_tid: int + start_time_ns: int + children: List[_ProfilerEvent] + + # TODO(robieta): remove in favor of `self.typed` + extra_fields: Union[ + _ExtraFields_TorchOp, + _ExtraFields_Backend, + _ExtraFields_Allocation, + _ExtraFields_OutOfMemory, + _ExtraFields_PyCall, + _ExtraFields_PyCCall, + _ExtraFields_Kineto, + ] + + @property + def typed( + self, + ) -> Union[ + Tuple[Literal[_EventType.TorchOp], _ExtraFields_TorchOp], + Tuple[Literal[_EventType.Backend], _ExtraFields_Backend], + Tuple[Literal[_EventType.Allocation], _ExtraFields_Allocation], + Tuple[Literal[_EventType.OutOfMemory], _ExtraFields_OutOfMemory], + Tuple[Literal[_EventType.PyCall], _ExtraFields_PyCall], + Tuple[Literal[_EventType.PyCCall], _ExtraFields_PyCCall], + Tuple[Literal[_EventType.Kineto], _ExtraFields_Kineto], + ]: ... + @property + def name(self) -> str: ... + @property + def tag(self) -> _EventType: ... + @property + def id(self) -> int: ... + @property + def parent(self) -> Optional[_ProfilerEvent]: ... + @property + def correlation_id(self) -> int: ... + @property + def end_time_ns(self) -> int: ... + @property + def duration_time_ns(self) -> int: ... + +class _TensorMetadata: + impl_ptr: Optional[int] + storage_data_ptr: Optional[int] + id: Optional[int] + + @property + def allocation_id(self) -> Optional[int]: ... + @property + def layout(self) -> layout: ... + @property + def device(self) -> device: ... + @property + def dtype(self) -> dtype: ... + @property + def sizes(self) -> List[int]: ... + @property + def strides(self) -> List[int]: ... + +Scalar: TypeAlias = Union[int, float, bool, complex] +Input: TypeAlias = Optional[Union[_TensorMetadata, List[_TensorMetadata], Scalar]] + +class _ExtraFields_TorchOp: + name: str + sequence_number: int + allow_tf32_cublas: bool + + @property + def inputs(self) -> List[Input]: ... + @property + def scope(self) -> RecordScope: ... + +class _ExtraFields_Backend: ... + +class _ExtraFields_Allocation: + ptr: int + id: Optional[int] + alloc_size: int + total_allocated: int + total_reserved: int + + @property + def allocation_id(self) -> Optional[int]: ... + @property + def device(self) -> device: ... + +class _ExtraFields_OutOfMemory: ... + +class _PyFrameState: + line_number: int + function_name: str + + @property + def file_name(self) -> str: ... + +class _NNModuleInfo: + @property + def self_ptr(self) -> int: ... + @property + def cls_ptr(self) -> int: ... + @property + def cls_name(self) -> str: ... + @property + def parameters( + self, + ) -> List[Tuple[str, _TensorMetadata, Optional[_TensorMetadata]]]: ... + +class _OptimizerInfo: + @property + def parameters( + self, + ) -> List[ + Tuple[ + # Parameter + _TensorMetadata, + # + # Gradient (if present during optimizer.step()) + Optional[_TensorMetadata], + # + # Optimizer state for Parameter as (name, tensor) pairs + List[Tuple[str, _TensorMetadata]], + ] + ]: ... + +class _ExtraFields_PyCCall: + @property + def caller(self) -> _PyFrameState: ... + +class _ExtraFields_PyCall: + @property + def callsite(self) -> _PyFrameState: ... + @property + def caller(self) -> _PyFrameState: ... + @property + def module(self) -> Optional[_NNModuleInfo]: ... + @property + def optimizer(self) -> Optional[_OptimizerInfo]: ... + +class _ExtraFields_Kineto: ... + +def _add_execution_trace_observer(output_file_path: str) -> bool: ... +def _remove_execution_trace_observer() -> None: ... +def _enable_execution_trace_observer() -> None: ... +def _disable_execution_trace_observer() -> None: ... 
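# --- Annotation (illustrative sketch, not part of the vendored files) -------
# The execution-trace observer functions stubbed just above register a callback
# that records the executed ops into a JSON execution trace. A hedged sketch of
# driving them directly; the add -> enable -> disable -> remove order and the
# output path "et.json" are assumptions here, and the higher-level
# torch.profiler wrapper is normally preferred.
import torch
from torch._C._profiler import (
    _add_execution_trace_observer,
    _disable_execution_trace_observer,
    _enable_execution_trace_observer,
    _remove_execution_trace_observer,
)

_add_execution_trace_observer("et.json")    # register observer + output file
_enable_execution_trace_observer()          # start recording
y = torch.randn(8, 8) @ torch.randn(8, 8)   # ops captured while enabled
_disable_execution_trace_observer()         # stop recording
_remove_execution_trace_observer()          # unregister and finalize the file
# ---------------------------------------------------------------------------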
+def _set_record_concrete_inputs_enabled_val(val: bool) -> None: ... +def _set_fwd_bwd_enabled_val(val: bool) -> None: ... +def _set_cuda_sync_enabled_val(val: bool) -> None: ... + +class CapturedTraceback: ... + +def gather_traceback(python: bool, script: bool, cpp: bool) -> CapturedTraceback: ... + +# The Dict has name, filename, line +def symbolize_tracebacks( + to_symbolize: List[CapturedTraceback], +) -> List[List[Dict[str, str]]]: ... + +class _RecordFunctionFast: + def __init__(self, name: str) -> None: ... + def __enter__(self) -> None: ... + def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None: ... diff --git a/MLPY/Lib/site-packages/torch/_C/_verbose.pyi b/MLPY/Lib/site-packages/torch/_C/_verbose.pyi new file mode 100644 index 0000000000000000000000000000000000000000..6d1dbfda288978aa1680412ad24bf488160ba854 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_C/_verbose.pyi @@ -0,0 +1,3 @@ +# Defined in torch/csrc/utils/verbose.cpp +def mkl_set_verbose(enable: int) -> int: ... +def mkldnn_set_verbose(level: int) -> int: ... diff --git a/MLPY/Lib/site-packages/torch/_VF.py b/MLPY/Lib/site-packages/torch/_VF.py new file mode 100644 index 0000000000000000000000000000000000000000..53724c3246e81163c95826a3c69f5912e0dc3304 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_VF.py @@ -0,0 +1,30 @@ +""" +This makes the functions in torch._C._VariableFunctions available as + torch._VF. +without mypy being able to find them. + +A subset of those functions are mapped to ATen functions in +torch/jit/_builtins.py + +See https://github.com/pytorch/pytorch/issues/21478 for the reason for +introducing torch._VF + +""" +import sys +import types + +import torch + + +class VFModule(types.ModuleType): + vf: types.ModuleType + + def __init__(self, name): + super().__init__(name) + self.vf = torch._C._VariableFunctions + + def __getattr__(self, attr): + return getattr(self.vf, attr) + + +sys.modules[__name__] = VFModule(__name__) diff --git a/MLPY/Lib/site-packages/torch/_VF.pyi b/MLPY/Lib/site-packages/torch/_VF.pyi new file mode 100644 index 0000000000000000000000000000000000000000..e7bc45da38b2228706f8e353adb5af335d22eae3 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_VF.pyi @@ -0,0 +1,25648 @@ +# @generated from torch/_C/_VariableFunctions.pyi.in +# mypy: disable-error-code="type-arg" + +import builtins +from typing import ( + Any, + Callable, + ContextManager, + Iterator, + List, + Literal, + NamedTuple, + Optional, + overload, + Sequence, + Tuple, + TypeVar, + Union, +) + +import torch +from torch import contiguous_format, Generator, inf, memory_format, strided, SymInt, Tensor +from torch.types import ( + _bool, + _complex, + _device, + _dtype, + _float, + _int, + _layout, + _qscheme, + _size, + Device, + Number, +) + +from torch._prims_common import DeviceLikeType + +@overload +def __and__(input: Tensor, other: Tensor) -> Tensor: ... +@overload +def __and__(input: Tensor, other: Union[Number, _complex]) -> Tensor: ... +@overload +def __lshift__(input: Tensor, other: Tensor) -> Tensor: ... +@overload +def __lshift__(input: Tensor, other: Union[Number, _complex]) -> Tensor: ... +@overload +def __or__(input: Tensor, other: Tensor) -> Tensor: ... +@overload +def __or__(input: Tensor, other: Union[Number, _complex]) -> Tensor: ... +@overload +def __rshift__(input: Tensor, other: Tensor) -> Tensor: ... +@overload +def __rshift__(input: Tensor, other: Union[Number, _complex]) -> Tensor: ... +@overload +def __xor__(input: Tensor, other: Tensor) -> Tensor: ... 
+@overload +def __xor__(input: Tensor, other: Union[Number, _complex]) -> Tensor: ... +def _adaptive_avg_pool2d(input: Tensor, output_size: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]]) -> Tensor: ... +def _adaptive_avg_pool3d(input: Tensor, output_size: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]]) -> Tensor: ... +def _add_batch_dim(input: Tensor, batch_dim: _int, level: _int) -> Tensor: ... +@overload +def _add_relu(input: Tensor, other: Tensor, *, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: ... +@overload +def _add_relu(input: Tensor, other: Union[Number, _complex], alpha: Union[Number, _complex] = 1) -> Tensor: ... +@overload +def _add_relu_(input: Tensor, other: Tensor, *, alpha: Union[Number, _complex] = 1) -> Tensor: ... +@overload +def _add_relu_(input: Tensor, other: Union[Number, _complex], alpha: Union[Number, _complex] = 1) -> Tensor: ... +def _addmm_activation(input: Tensor, mat1: Tensor, mat2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1, use_gelu: _bool = False, out: Optional[Tensor] = None) -> Tensor: ... +@overload +def _aminmax(input: Tensor) -> Tuple[Tensor, Tensor]: ... +@overload +def _aminmax(input: Tensor, dim: _int, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: ... +def _amp_foreach_non_finite_check_and_unscale_(self: Union[Tuple[Tensor, ...], List[Tensor]], found_inf: Tensor, inv_scale: Tensor) -> None: ... +def _amp_update_scale_(input: Tensor, growth_tracker: Tensor, found_inf: Tensor, scale_growth_factor: _float, scale_backoff_factor: _float, growth_interval: _int) -> Tensor: ... +@overload +def _assert_async(input: Tensor) -> None: + r""" + _assert_async(tensor) -> void + + Asynchronously assert that the contents of tensor are nonzero. For CPU tensors, + this is equivalent to ``assert tensor`` or ``assert tensor.is_nonzero()``; for + CUDA tensors, we DO NOT synchronize and you may only find out the assertion + failed at a later CUDA kernel launch. Asynchronous assertion can be helpful for + testing invariants in CUDA tensors without giving up performance. This function + is NOT intended to be used for regular error checking, as it will trash your CUDA + context if the assert fails (forcing you to restart your PyTorch process.) + + Args: + tensor (Tensor): a one element tensor to test to see if it is nonzero. Zero + elements (including False for boolean tensors) cause an assertion failure + to be raised. + """ + ... +@overload +def _assert_async(input: Tensor, assert_msg: str) -> None: + r""" + _assert_async(tensor) -> void + + Asynchronously assert that the contents of tensor are nonzero. For CPU tensors, + this is equivalent to ``assert tensor`` or ``assert tensor.is_nonzero()``; for + CUDA tensors, we DO NOT synchronize and you may only find out the assertion + failed at a later CUDA kernel launch. Asynchronous assertion can be helpful for + testing invariants in CUDA tensors without giving up performance. This function + is NOT intended to be used for regular error checking, as it will trash your CUDA + context if the assert fails (forcing you to restart your PyTorch process.) + + Args: + tensor (Tensor): a one element tensor to test to see if it is nonzero. Zero + elements (including False for boolean tensors) cause an assertion failure + to be raised. + """ + ... +def _assert_scalar(self: Union[Number, _complex], assert_msg: str) -> None: ... 
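# --- Annotation (illustrative sketch, not part of the vendored files) -------
# Per the _assert_async docstring above, the check takes a one-element tensor
# and, on CUDA, does not synchronize, so a failure may only surface at a later
# kernel launch. A tiny hedged example of guarding an invariant without forcing
# a device sync:
import torch

x = torch.randn(1024)
torch._assert_async((x == x).all())   # one-element bool tensor; trips only if x contains NaNs
# ---------------------------------------------------------------------------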
+def _assert_tensor_metadata(a: Tensor, size: Optional[Sequence[Union[_int, SymInt]]] = None, stride: Optional[Sequence[Union[_int, SymInt]]] = None, dtype: Optional[_dtype] = None) -> None: ... +def _batch_norm_impl_index(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], running_mean: Optional[Tensor], running_var: Optional[Tensor], training: _bool, momentum: _float, eps: _float, cudnn_enabled: _bool) -> Tuple[Tensor, Tensor, Tensor, Tensor, _int]: ... +def _cast_Byte(input: Tensor, non_blocking: _bool = False) -> Tensor: ... +def _cast_Char(input: Tensor, non_blocking: _bool = False) -> Tensor: ... +def _cast_Double(input: Tensor, non_blocking: _bool = False) -> Tensor: ... +def _cast_Float(input: Tensor, non_blocking: _bool = False) -> Tensor: ... +def _cast_Half(input: Tensor, non_blocking: _bool = False) -> Tensor: ... +def _cast_Int(input: Tensor, non_blocking: _bool = False) -> Tensor: ... +def _cast_Long(input: Tensor, non_blocking: _bool = False) -> Tensor: ... +def _cast_Short(input: Tensor, non_blocking: _bool = False) -> Tensor: ... +def _choose_qparams_per_tensor(input: Tensor, reduce_range: _bool = False) -> Tuple[_float, _int]: ... +def _chunk_cat(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: _int, num_chunks: _int, *, out: Optional[Tensor] = None) -> Tensor: ... +def _coalesce(input: Tensor) -> Tensor: ... +def _compute_linear_combination(input: Tensor, coefficients: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +def _conj(input: Tensor) -> Tensor: ... +def _conj_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +def _conj_physical(input: Tensor) -> Tensor: ... +def _convert_indices_from_coo_to_csr(input: Tensor, size: _int, *, out_int32: _bool = False, out: Optional[Tensor] = None) -> Tensor: ... +def _convert_indices_from_csr_to_coo(crow_indices: Tensor, col_indices: Tensor, *, out_int32: _bool = False, transpose: _bool = False, out: Optional[Tensor] = None) -> Tensor: ... +def _convert_weight_to_int4pack(input: Tensor, innerKTiles: _int) -> Tensor: ... +@overload +def _convolution(input: Tensor, weight: Tensor, bias: Optional[Tensor], stride: Sequence[Union[_int, SymInt]], padding: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], transposed: _bool, output_padding: _size, groups: Union[_int, SymInt], benchmark: _bool, deterministic: _bool, cudnn_enabled: _bool) -> Tensor: ... +@overload +def _convolution(input: Tensor, weight: Tensor, bias: Optional[Tensor], stride: Sequence[Union[_int, SymInt]], padding: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], transposed: _bool, output_padding: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt], benchmark: _bool, deterministic: _bool, cudnn_enabled: _bool, allow_tf32: _bool) -> Tensor: ... +def _convolution_mode(input: Tensor, weight: Tensor, bias: Optional[Tensor], stride: Sequence[Union[_int, SymInt]], padding: str, dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ... +def _copy_from(input: Tensor, dst: Tensor, non_blocking: _bool = False) -> Tensor: ... +def _copy_from_and_resize(input: Tensor, dst: Tensor) -> Tensor: ... +def _cslt_compress(input: Tensor) -> Tensor: ... +def _cslt_sparse_mm(compressed_A: Tensor, dense_B: Tensor, bias: Optional[Tensor] = None, alpha: Optional[Tensor] = None, out_dtype: Optional[_dtype] = None, transpose_result: _bool = False, alg_id: _int = 0) -> Tensor: ... 
+def _cslt_sparse_mm_search(compressed_A: Tensor, dense_B: Tensor, bias: Optional[Tensor] = None, alpha: Optional[Tensor] = None, out_dtype: Optional[_dtype] = None, transpose_result: _bool = False) -> _int: ... +@overload +def _ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: _size, target_lengths: _size, blank: _int = 0, zero_infinity: _bool = False) -> Tuple[Tensor, Tensor]: ... +@overload +def _ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: Tensor, target_lengths: Tensor, blank: _int = 0, zero_infinity: _bool = False) -> Tuple[Tensor, Tensor]: ... +@overload +def _cudnn_ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: _size, target_lengths: _size, blank: _int, deterministic: _bool, zero_infinity: _bool) -> Tuple[Tensor, Tensor]: ... +@overload +def _cudnn_ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: Tensor, target_lengths: Tensor, blank: _int, deterministic: _bool, zero_infinity: _bool) -> Tuple[Tensor, Tensor]: ... +def _cudnn_init_dropout_state(dropout: _float, train: _bool, dropout_seed: _int, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ... +def _cudnn_rnn(input: Tensor, weight: Union[Tuple[Tensor, ...], List[Tensor]], weight_stride0: _int, weight_buf: Optional[Tensor], hx: Tensor, cx: Optional[Tensor], mode: _int, hidden_size: Union[_int, SymInt], proj_size: Union[_int, SymInt], num_layers: _int, batch_first: _bool, dropout: _float, train: _bool, bidirectional: _bool, batch_sizes: Sequence[Union[_int, SymInt]], dropout_state: Optional[Tensor]) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]: ... +def _cudnn_rnn_flatten_weight(weight_arr: Union[Tuple[Tensor, ...], List[Tensor]], weight_stride0: _int, input_size: Union[_int, SymInt], mode: _int, hidden_size: Union[_int, SymInt], proj_size: Union[_int, SymInt], num_layers: _int, batch_first: _bool, bidirectional: _bool) -> Tensor: ... +def _cufft_clear_plan_cache(device_index: _int) -> None: ... +def _cufft_get_plan_cache_max_size(device_index: _int) -> _int: ... +def _cufft_get_plan_cache_size(device_index: _int) -> _int: ... +def _cufft_set_plan_cache_max_size(device_index: _int, max_size: _int) -> None: ... +def _cummax_helper(input: Tensor, values: Tensor, indices: Tensor, dim: _int) -> None: ... +def _cummin_helper(input: Tensor, values: Tensor, indices: Tensor, dim: _int) -> None: ... +def _debug_has_internal_overlap(input: Tensor) -> _int: ... +def _dim_arange(like: Tensor, dim: _int) -> Tensor: ... +def _dirichlet_grad(x: Tensor, alpha: Tensor, total: Tensor) -> Tensor: ... +def _disable_functionalization(): ... +@overload +def _efficientzerotensor(size: Sequence[Union[_int, SymInt]], *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ... +@overload +def _efficientzerotensor(*size: _int, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ... 
+def _embedding_bag(weight: Tensor, indices: Tensor, offsets: Tensor, scale_grad_by_freq: _bool = False, mode: _int = 0, sparse: _bool = False, per_sample_weights: Optional[Tensor] = None, include_last_offset: _bool = False, padding_idx: _int = -1) -> Tuple[Tensor, Tensor, Tensor, Tensor]: ... +def _embedding_bag_forward_only(weight: Tensor, indices: Tensor, offsets: Tensor, scale_grad_by_freq: _bool = False, mode: _int = 0, sparse: _bool = False, per_sample_weights: Optional[Tensor] = None, include_last_offset: _bool = False, padding_idx: _int = -1) -> Tuple[Tensor, Tensor, Tensor, Tensor]: ... +@overload +def _empty_affine_quantized(size: Sequence[Union[_int, SymInt]], *, scale: _float = 1, zero_point: _int = 0, memory_format: Optional[memory_format] = contiguous_format, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ... +@overload +def _empty_affine_quantized(*size: _int, scale: _float = 1, zero_point: _int = 0, memory_format: Optional[memory_format] = contiguous_format, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ... +@overload +def _empty_per_channel_affine_quantized(size: Sequence[Union[_int, SymInt]], *, scales: Tensor, zero_points: Tensor, axis: _int, memory_format: Optional[memory_format] = contiguous_format, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ... +@overload +def _empty_per_channel_affine_quantized(*size: _int, scales: Tensor, zero_points: Tensor, axis: _int, memory_format: Optional[memory_format] = contiguous_format, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ... +def _enable_functionalization(*, reapply_views: _bool = False): ... +def _euclidean_dist(x1: Tensor, x2: Tensor) -> Tensor: ... +def _fake_quantize_learnable_per_channel_affine(input: Tensor, scale: Tensor, zero_point: Tensor, axis: _int, quant_min: _int, quant_max: _int, grad_factor: _float = 1.0) -> Tensor: ... +def _fake_quantize_learnable_per_tensor_affine(input: Tensor, scale: Tensor, zero_point: Tensor, quant_min: _int, quant_max: _int, grad_factor: _float = 1.0) -> Tensor: ... +def _fake_quantize_per_tensor_affine_cachemask_tensor_qparams(input: Tensor, scale: Tensor, zero_point: Tensor, fake_quant_enabled: Tensor, quant_min: _int, quant_max: _int) -> torch.return_types._fake_quantize_per_tensor_affine_cachemask_tensor_qparams: ... +def _fft_c2c(input: Tensor, dim: Sequence[Union[_int, SymInt]], normalization: _int, forward: _bool, *, out: Optional[Tensor] = None) -> Tensor: ... +def _fft_c2r(input: Tensor, dim: _size, normalization: _int, last_dim_size: Union[_int, SymInt], *, out: Optional[Tensor] = None) -> Tensor: ... +def _fft_r2c(input: Tensor, dim: _size, normalization: _int, onesided: _bool, *, out: Optional[Tensor] = None) -> Tensor: ... +def _fill_mem_eff_dropout_mask_(input: Tensor, dropout_p: _float, seed: _int, offset: _int) -> Tensor: ... +def _foobar(input: Tensor, arg1: _bool = True, arg2: _bool = True, *, arg3: _bool = True) -> Tensor: ... 
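# --- Annotation (illustrative sketch, not part of the vendored files) -------
# _fft_c2c/_fft_c2r/_fft_r2c above are the internal FFT kernels behind the
# public torch.fft API. A hedged round-trip example (real signal -> one-sided
# spectrum -> real signal), which dispatches to _fft_r2c and _fft_c2r:
import torch

x = torch.randn(16)
spectrum = torch.fft.rfft(x)              # real-to-complex, one-sided (_fft_r2c)
x_back = torch.fft.irfft(spectrum, n=16)  # complex-to-real inverse (_fft_c2r)
torch.testing.assert_close(x, x_back)     # round trip matches within float tolerance
# ---------------------------------------------------------------------------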
+def _foreach_abs(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_abs(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.abs` to each Tensor of the input list. + """ + ... +def _foreach_abs_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_abs_(self: List[Tensor]) -> None + + Apply :func:`torch.abs` to each Tensor of the input list. + """ + ... +def _foreach_acos(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_acos(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.acos` to each Tensor of the input list. + """ + ... +def _foreach_acos_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_acos_(self: List[Tensor]) -> None + + Apply :func:`torch.acos` to each Tensor of the input list. + """ + ... +@overload +def _foreach_add(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_add(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]], *, alpha: Union[Number, _complex] = 1) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_add(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Tensor, *, alpha: Union[Number, _complex] = 1) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_add(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_add_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_add_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]], *, alpha: Union[Number, _complex] = 1) -> None: ... +@overload +def _foreach_add_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Tensor, *, alpha: Union[Number, _complex] = 1) -> None: ... +@overload +def _foreach_add_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> None: ... +@overload +def _foreach_addcdiv(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_addcdiv(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Tensor) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_addcdiv(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], value: Union[Number, _complex] = 1) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_addcdiv_(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_addcdiv_(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Tensor) -> None: ... +@overload +def _foreach_addcdiv_(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], value: Union[Number, _complex] = 1) -> None: ... 
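# --- Annotation (illustrative sketch, not part of the vendored files) -------
# The _foreach_* stubs above operate on whole lists of tensors in one call,
# which is how the multi-tensor optimizer paths avoid a Python loop per
# parameter. A small hedged example of an SGD-style update via _foreach_add_
# (in-place, list `other`, keyword-only `alpha`, as stubbed above):
import torch

params = [torch.randn(3, 3) for _ in range(4)]
grads = [torch.randn(3, 3) for _ in range(4)]
lr = 0.1

torch._foreach_add_(params, grads, alpha=-lr)   # params[i] += -lr * grads[i] for every i
# ---------------------------------------------------------------------------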
+@overload +def _foreach_addcmul(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_addcmul(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Tensor) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_addcmul(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], value: Union[Number, _complex] = 1) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_addcmul_(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_addcmul_(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Tensor) -> None: ... +@overload +def _foreach_addcmul_(self: Union[Tuple[Tensor, ...], List[Tensor]], tensor1: Union[Tuple[Tensor, ...], List[Tensor]], tensor2: Union[Tuple[Tensor, ...], List[Tensor]], value: Union[Number, _complex] = 1) -> None: ... +def _foreach_asin(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_asin(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.asin` to each Tensor of the input list. + """ + ... +def _foreach_asin_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_asin_(self: List[Tensor]) -> None + + Apply :func:`torch.asin` to each Tensor of the input list. + """ + ... +def _foreach_atan(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_atan(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.atan` to each Tensor of the input list. + """ + ... +def _foreach_atan_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_atan_(self: List[Tensor]) -> None + + Apply :func:`torch.atan` to each Tensor of the input list. + """ + ... +def _foreach_ceil(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_ceil(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.ceil` to each Tensor of the input list. + """ + ... +def _foreach_ceil_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_ceil_(self: List[Tensor]) -> None + + Apply :func:`torch.ceil` to each Tensor of the input list. + """ + ... +@overload +def _foreach_clamp_max(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_clamp_max(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_clamp_max(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_clamp_max_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_clamp_max_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> None: ... +@overload +def _foreach_clamp_max_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ... 
+@overload +def _foreach_clamp_min(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_clamp_min(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_clamp_min(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_clamp_min_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_clamp_min_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> None: ... +@overload +def _foreach_clamp_min_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ... +def _foreach_copy_(self: Union[Tuple[Tensor, ...], List[Tensor]], src: Union[Tuple[Tensor, ...], List[Tensor]], non_blocking: _bool = False) -> None: ... +def _foreach_cos(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_cos(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.cos` to each Tensor of the input list. + """ + ... +def _foreach_cos_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_cos_(self: List[Tensor]) -> None + + Apply :func:`torch.cos` to each Tensor of the input list. + """ + ... +def _foreach_cosh(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_cosh(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.cosh` to each Tensor of the input list. + """ + ... +def _foreach_cosh_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_cosh_(self: List[Tensor]) -> None + + Apply :func:`torch.cosh` to each Tensor of the input list. + """ + ... +@overload +def _foreach_div(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_div(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Tensor) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_div(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_div(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_div_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_div_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Tensor) -> None: ... +@overload +def _foreach_div_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> None: ... +@overload +def _foreach_div_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ... +def _foreach_erf(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_erf(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.erf` to each Tensor of the input list. + """ + ... +def _foreach_erf_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_erf_(self: List[Tensor]) -> None + + Apply :func:`torch.erf` to each Tensor of the input list. + """ + ... +def _foreach_erfc(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_erfc(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.erfc` to each Tensor of the input list. 
+ """ + ... +def _foreach_erfc_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_erfc_(self: List[Tensor]) -> None + + Apply :func:`torch.erfc` to each Tensor of the input list. + """ + ... +def _foreach_exp(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_exp(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.exp` to each Tensor of the input list. + """ + ... +def _foreach_exp_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_exp_(self: List[Tensor]) -> None + + Apply :func:`torch.exp` to each Tensor of the input list. + """ + ... +def _foreach_expm1(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_expm1(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.expm1` to each Tensor of the input list. + """ + ... +def _foreach_expm1_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_expm1_(self: List[Tensor]) -> None + + Apply :func:`torch.expm1` to each Tensor of the input list. + """ + ... +def _foreach_floor(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_floor(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.floor` to each Tensor of the input list. + """ + ... +def _foreach_floor_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_floor_(self: List[Tensor]) -> None + + Apply :func:`torch.floor` to each Tensor of the input list. + """ + ... +def _foreach_frac(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_frac(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.frac` to each Tensor of the input list. + """ + ... +def _foreach_frac_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_frac_(self: List[Tensor]) -> None + + Apply :func:`torch.frac` to each Tensor of the input list. + """ + ... +@overload +def _foreach_lerp(self: Union[Tuple[Tensor, ...], List[Tensor]], tensors1: Union[Tuple[Tensor, ...], List[Tensor]], weight: Union[Number, _complex]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_lerp(self: Union[Tuple[Tensor, ...], List[Tensor]], tensors1: Union[Tuple[Tensor, ...], List[Tensor]], weights: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_lerp_(self: Union[Tuple[Tensor, ...], List[Tensor]], tensors1: Union[Tuple[Tensor, ...], List[Tensor]], weight: Union[Number, _complex]) -> None: ... +@overload +def _foreach_lerp_(self: Union[Tuple[Tensor, ...], List[Tensor]], tensors1: Union[Tuple[Tensor, ...], List[Tensor]], weights: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ... +def _foreach_lgamma(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_lgamma(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.lgamma` to each Tensor of the input list. + """ + ... +def _foreach_lgamma_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_lgamma_(self: List[Tensor]) -> None + + Apply :func:`torch.lgamma` to each Tensor of the input list. + """ + ... +def _foreach_log(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_log(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.log` to each Tensor of the input list. + """ + ... 
+def _foreach_log10(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_log10(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.log10` to each Tensor of the input list. + """ + ... +def _foreach_log10_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_log10_(self: List[Tensor]) -> None + + Apply :func:`torch.log10` to each Tensor of the input list. + """ + ... +def _foreach_log1p(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_log1p(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.log1p` to each Tensor of the input list. + """ + ... +def _foreach_log1p_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_log1p_(self: List[Tensor]) -> None + + Apply :func:`torch.log1p` to each Tensor of the input list. + """ + ... +def _foreach_log2(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_log2(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.log2` to each Tensor of the input list. + """ + ... +def _foreach_log2_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_log2_(self: List[Tensor]) -> None + + Apply :func:`torch.log2` to each Tensor of the input list. + """ + ... +def _foreach_log_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_log_(self: List[Tensor]) -> None + + Apply :func:`torch.log` to each Tensor of the input list. + """ + ... +@overload +def _foreach_maximum(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_maximum(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_maximum(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_maximum_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_maximum_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> None: ... +@overload +def _foreach_maximum_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ... +@overload +def _foreach_minimum(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_minimum(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_minimum(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_minimum_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_minimum_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> None: ... +@overload +def _foreach_minimum_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ... +@overload +def _foreach_mul(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_mul(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Tensor) -> Tuple[Tensor, ...]: ... 
+@overload +def _foreach_mul(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_mul(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_mul_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_mul_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Tensor) -> None: ... +@overload +def _foreach_mul_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> None: ... +@overload +def _foreach_mul_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ... +def _foreach_neg(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_neg(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.neg` to each Tensor of the input list. + """ + ... +def _foreach_neg_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_neg_(self: List[Tensor]) -> None + + Apply :func:`torch.neg` to each Tensor of the input list. + """ + ... +def _foreach_norm(self: Union[Tuple[Tensor, ...], List[Tensor]], ord: Union[Number, _complex] = 2) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_pow(self: Union[Tuple[Tensor, ...], List[Tensor]], exponent: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_pow(self: Union[Tuple[Tensor, ...], List[Tensor]], exponent: Union[Number, _complex]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_pow(self: Union[Tuple[Tensor, ...], List[Tensor]], exponent: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_pow(self: Union[Number, _complex], exponent: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_pow_(self: Union[Tuple[Tensor, ...], List[Tensor]], exponent: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_pow_(self: Union[Tuple[Tensor, ...], List[Tensor]], exponent: Union[Number, _complex]) -> None: ... +@overload +def _foreach_pow_(self: Union[Tuple[Tensor, ...], List[Tensor]], exponent: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ... +def _foreach_reciprocal(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_reciprocal(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.reciprocal` to each Tensor of the input list. + """ + ... +def _foreach_reciprocal_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_reciprocal_(self: List[Tensor]) -> None + + Apply :func:`torch.reciprocal` to each Tensor of the input list. + """ + ... +def _foreach_round(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_round(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.round` to each Tensor of the input list. + """ + ... +def _foreach_round_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_round_(self: List[Tensor]) -> None + + Apply :func:`torch.round` to each Tensor of the input list. + """ + ... +def _foreach_sigmoid(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_sigmoid(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.sigmoid` to each Tensor of the input list. + """ + ... 
+def _foreach_sigmoid_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_sigmoid_(self: List[Tensor]) -> None + + Apply :func:`torch.sigmoid` to each Tensor of the input list. + """ + ... +def _foreach_sign(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +def _foreach_sign_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: ... +def _foreach_sin(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_sin(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.sin` to each Tensor of the input list. + """ + ... +def _foreach_sin_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_sin_(self: List[Tensor]) -> None + + Apply :func:`torch.sin` to each Tensor of the input list. + """ + ... +def _foreach_sinh(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_sinh(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.sinh` to each Tensor of the input list. + """ + ... +def _foreach_sinh_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_sinh_(self: List[Tensor]) -> None + + Apply :func:`torch.sinh` to each Tensor of the input list. + """ + ... +def _foreach_sqrt(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_sqrt(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.sqrt` to each Tensor of the input list. + """ + ... +def _foreach_sqrt_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_sqrt_(self: List[Tensor]) -> None + + Apply :func:`torch.sqrt` to each Tensor of the input list. + """ + ... +@overload +def _foreach_sub(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_sub(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]], *, alpha: Union[Number, _complex] = 1) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_sub(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> Tuple[Tensor, ...]: ... +@overload +def _foreach_sub_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalars: Sequence[Union[Number, _complex]]) -> None: ... +@overload +def _foreach_sub_(self: Union[Tuple[Tensor, ...], List[Tensor]], other: Union[Tuple[Tensor, ...], List[Tensor]], *, alpha: Union[Number, _complex] = 1) -> None: ... +@overload +def _foreach_sub_(self: Union[Tuple[Tensor, ...], List[Tensor]], scalar: Union[Number, _complex]) -> None: ... +def _foreach_tan(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_tan(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.tan` to each Tensor of the input list. + """ + ... +def _foreach_tan_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_tan_(self: List[Tensor]) -> None + + Apply :func:`torch.tan` to each Tensor of the input list. + """ + ... +def _foreach_tanh(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_tanh(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.tanh` to each Tensor of the input list. + """ + ... +def _foreach_tanh_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_tanh_(self: List[Tensor]) -> None + + Apply :func:`torch.tanh` to each Tensor of the input list. + """ + ... 
+def _foreach_trunc(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + _foreach_trunc(self: List[Tensor]) -> List[Tensor] + + Apply :func:`torch.trunc` to each Tensor of the input list. + """ + ... +def _foreach_trunc_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_trunc_(self: List[Tensor]) -> None + + Apply :func:`torch.trunc` to each Tensor of the input list. + """ + ... +def _foreach_zero_(self: Union[Tuple[Tensor, ...], List[Tensor]]) -> None: + r""" + _foreach_zero_(self: List[Tensor]) -> None + + Apply :func:`torch.zero` to each Tensor of the input list. + """ + ... +def _from_functional_tensor(t: Tensor) -> Tensor: ... +def _functional_assert_async(input: Tensor, assert_msg: str, dep_token: Tensor) -> Tensor: ... +def _functional_assert_scalar(self: Union[Number, _complex], assert_msg: str, dep_token: Tensor) -> Tensor: ... +def _functional_sym_constrain_range(size: Union[Number, _complex], min: Optional[_int], max: Optional[_int], dep_token: Tensor) -> Tensor: ... +def _functional_sym_constrain_range_for_size(size: Union[Number, _complex], min: Optional[_int], max: Optional[_int], dep_token: Tensor) -> Tensor: ... +def _functionalize_are_all_mutations_hidden_from_autograd(t: Tensor) -> _bool: ... +def _functionalize_are_all_mutations_under_no_grad_or_inference_mode(t: Tensor) -> _bool: ... +def _functionalize_commit_update(t: Tensor) -> None: ... +def _functionalize_mark_mutation_hidden_from_autograd(t: Tensor) -> None: ... +def _functionalize_replace(self_: Tensor, other: Tensor) -> None: ... +def _functionalize_sync(t: Tensor) -> None: ... +@overload +def _fused_adam_(self: Union[Tuple[Tensor, ...], List[Tensor]], grads: Union[Tuple[Tensor, ...], List[Tensor]], exp_avgs: Union[Tuple[Tensor, ...], List[Tensor]], exp_avg_sqs: Union[Tuple[Tensor, ...], List[Tensor]], max_exp_avg_sqs: Union[Tuple[Tensor, ...], List[Tensor]], state_steps: Union[Tuple[Tensor, ...], List[Tensor]], *, lr: Tensor, beta1: _float, beta2: _float, weight_decay: _float, eps: _float, amsgrad: _bool, maximize: _bool, grad_scale: Optional[Tensor] = None, found_inf: Optional[Tensor] = None) -> None: ... +@overload +def _fused_adam_(self: Union[Tuple[Tensor, ...], List[Tensor]], grads: Union[Tuple[Tensor, ...], List[Tensor]], exp_avgs: Union[Tuple[Tensor, ...], List[Tensor]], exp_avg_sqs: Union[Tuple[Tensor, ...], List[Tensor]], max_exp_avg_sqs: Union[Tuple[Tensor, ...], List[Tensor]], state_steps: Union[Tuple[Tensor, ...], List[Tensor]], *, lr: _float, beta1: _float, beta2: _float, weight_decay: _float, eps: _float, amsgrad: _bool, maximize: _bool, grad_scale: Optional[Tensor] = None, found_inf: Optional[Tensor] = None) -> None: ... +@overload +def _fused_adamw_(self: Union[Tuple[Tensor, ...], List[Tensor]], grads: Union[Tuple[Tensor, ...], List[Tensor]], exp_avgs: Union[Tuple[Tensor, ...], List[Tensor]], exp_avg_sqs: Union[Tuple[Tensor, ...], List[Tensor]], max_exp_avg_sqs: Union[Tuple[Tensor, ...], List[Tensor]], state_steps: Union[Tuple[Tensor, ...], List[Tensor]], *, lr: Tensor, beta1: _float, beta2: _float, weight_decay: _float, eps: _float, amsgrad: _bool, maximize: _bool, grad_scale: Optional[Tensor] = None, found_inf: Optional[Tensor] = None) -> None: ... 
+@overload +def _fused_adamw_(self: Union[Tuple[Tensor, ...], List[Tensor]], grads: Union[Tuple[Tensor, ...], List[Tensor]], exp_avgs: Union[Tuple[Tensor, ...], List[Tensor]], exp_avg_sqs: Union[Tuple[Tensor, ...], List[Tensor]], max_exp_avg_sqs: Union[Tuple[Tensor, ...], List[Tensor]], state_steps: Union[Tuple[Tensor, ...], List[Tensor]], *, lr: _float, beta1: _float, beta2: _float, weight_decay: _float, eps: _float, amsgrad: _bool, maximize: _bool, grad_scale: Optional[Tensor] = None, found_inf: Optional[Tensor] = None) -> None: ... +def _fused_dropout(input: Tensor, p: _float, generator: Optional[Generator] = None) -> Tuple[Tensor, Tensor]: ... +def _fused_moving_avg_obs_fq_helper(input: Tensor, observer_on: Tensor, fake_quant_on: Tensor, running_min: Tensor, running_max: Tensor, scale: Tensor, zero_point: Tensor, averaging_const: _float, quant_min: _int, quant_max: _int, ch_axis: _int, per_row_fake_quant: _bool = False, symmetric_quant: _bool = False) -> torch.return_types._fused_moving_avg_obs_fq_helper: ... +def _fused_sdp_choice(query: Tensor, key: Tensor, value: Tensor, attn_mask: Optional[Tensor] = None, dropout_p: _float = 0.0, is_causal: _bool = False, *, scale: Optional[_float] = None) -> _int: ... +@overload +def _fused_sgd_(self: Union[Tuple[Tensor, ...], List[Tensor]], grads: Union[Tuple[Tensor, ...], List[Tensor]], momentum_buffer_list: Union[Tuple[Tensor, ...], List[Tensor]], *, weight_decay: _float, momentum: _float, lr: Tensor, dampening: _float, nesterov: _bool, maximize: _bool, is_first_step: _bool, grad_scale: Optional[Tensor] = None, found_inf: Optional[Tensor] = None) -> None: ... +@overload +def _fused_sgd_(self: Union[Tuple[Tensor, ...], List[Tensor]], grads: Union[Tuple[Tensor, ...], List[Tensor]], momentum_buffer_list: Union[Tuple[Tensor, ...], List[Tensor]], *, weight_decay: _float, momentum: _float, lr: _float, dampening: _float, nesterov: _bool, maximize: _bool, is_first_step: _bool, grad_scale: Optional[Tensor] = None, found_inf: Optional[Tensor] = None) -> None: ... +def _fw_primal_copy(input: Tensor, level: _int, *, out: Optional[Tensor] = None) -> Tensor: ... +def _grid_sampler_2d_cpu_fallback(input: Tensor, grid: Tensor, interpolation_mode: _int, padding_mode: _int, align_corners: _bool) -> Tensor: ... +def _has_compatible_shallow_copy_type(input: Tensor, from_: Tensor) -> _bool: ... +def _histogramdd_bin_edges(input: Tensor, bins: _size, *, range: Optional[Sequence[_float]] = None, weight: Optional[Tensor] = None, density: _bool = False) -> Tuple[Tensor, ...]: ... +def _histogramdd_from_bin_cts(input: Tensor, bins: _size, *, range: Optional[Sequence[_float]] = None, weight: Optional[Tensor] = None, density: _bool = False) -> Tensor: ... +def _histogramdd_from_bin_tensors(input: Tensor, bins: Union[Tuple[Tensor, ...], List[Tensor]], *, weight: Optional[Tensor] = None, density: _bool = False) -> Tensor: ... +def _index_put_impl_(input: Tensor, indices: Optional[Union[Tuple[Tensor, ...], List[Tensor]]], values: Tensor, accumulate: _bool = False, unsafe: _bool = False) -> Tensor: ... +def _indices_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +def _int_mm(input: Tensor, mat2: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +def _is_all_true(input: Tensor) -> Tensor: ... +def _is_any_true(input: Tensor) -> Tensor: ... +def _is_functional_tensor(t: Tensor) -> _bool: ... +def _is_zerotensor(input: Tensor) -> _bool: ... +def _lazy_clone(input: Tensor) -> Tensor: ... 
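# --- Annotation (illustrative sketch, not part of the vendored files) -------
# _fused_adam_/_fused_adamw_/_fused_sgd_ above are the single-kernel update
# steps behind torch.optim's fused=True path. A hedged example of the closely
# related foreach=True path, which instead builds on the _foreach_* ops and
# runs on CPU:
import torch

model = torch.nn.Linear(4, 4)
opt = torch.optim.Adam(model.parameters(), lr=1e-3, foreach=True)
loss = model(torch.randn(2, 4)).sum()
loss.backward()
opt.step()   # parameter updates issued as multi-tensor _foreach_* calls
# ---------------------------------------------------------------------------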
+def _linalg_check_errors(info: Tensor, api_name: str, *, is_matrix: _bool) -> None: ... +def _linalg_det(A: Tensor, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types._linalg_det: ... +def _linalg_eigh(A: Tensor, UPLO: str = "L", compute_v: _bool = True, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types._linalg_eigh: ... +def _linalg_slogdet(A: Tensor, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types._linalg_slogdet: ... +def _linalg_solve_ex(A: Tensor, B: Tensor, *, left: _bool = True, check_errors: _bool = False, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types._linalg_solve_ex: ... +def _linalg_svd(A: Tensor, full_matrices: _bool = False, compute_uv: _bool = True, *, driver: Optional[str] = None, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types._linalg_svd: ... +def _log_softmax(input: Tensor, dim: _int, half_to_float: _bool, *, out: Optional[Tensor] = None) -> Tensor: ... +def _log_softmax_backward_data(grad_output: Tensor, output: Tensor, dim: _int, input_dtype: _dtype, *, out: Optional[Tensor] = None) -> Tensor: ... +def _logcumsumexp(input: Tensor, dim: _int, *, out: Optional[Tensor] = None) -> Tensor: ... +def _lstm_mps(input: Tensor, hx: Union[Tuple[Tensor, ...], List[Tensor]], params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool, batch_first: _bool) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]: ... +def _lu_with_info(input: Tensor, pivot: _bool = True, check_errors: _bool = True) -> torch.return_types._lu_with_info: ... +def _make_dep_token(*, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ... +def _make_dual(primal: Tensor, tangent: Tensor, level: _int) -> Tensor: ... +def _make_dual_copy(primal: Tensor, tangent: Tensor, level: _int, *, out: Optional[Tensor] = None) -> Tensor: ... +def _make_per_channel_quantized_tensor(input: Tensor, scale: Tensor, zero_point: Tensor, axis: _int) -> Tensor: ... +def _make_per_tensor_quantized_tensor(input: Tensor, scale: _float, zero_point: _int) -> Tensor: ... +def _masked_scale(input: Tensor, mask: Tensor, scale: _float) -> Tensor: ... +def _masked_softmax(input: Tensor, mask: Tensor, dim: Optional[_int] = None, mask_type: Optional[_int] = None) -> Tensor: ... +def _mixed_dtypes_linear(input: Tensor, weight: Tensor, scale: Tensor, *, bias: Optional[Tensor] = None, activation: Optional[str] = None) -> Tensor: ... +def _mkldnn_reshape(input: Tensor, shape: _size) -> Tensor: ... +def _mkldnn_transpose(input: Tensor, dim0: _int, dim1: _int) -> Tensor: ... +def _mkldnn_transpose_(input: Tensor, dim0: _int, dim1: _int) -> Tensor: ... +def _mps_convolution(input: Tensor, weight: Tensor, bias: Optional[Tensor], padding: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ... +def _mps_convolution_transpose(input: Tensor, weight: Tensor, padding: Sequence[Union[_int, SymInt]], output_padding: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ... 
+@overload +def _native_batch_norm_legit(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], running_mean: Tensor, running_var: Tensor, training: _bool, momentum: _float, eps: _float, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> Tuple[Tensor, Tensor, Tensor]: ... +@overload +def _native_batch_norm_legit(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], training: _bool, momentum: _float, eps: _float, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> Tuple[Tensor, Tensor, Tensor]: ... +def _native_batch_norm_legit_no_training(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], running_mean: Tensor, running_var: Tensor, momentum: _float, eps: _float) -> Tuple[Tensor, Tensor, Tensor]: ... +def _native_multi_head_attention(query: Tensor, key: Tensor, value: Tensor, embed_dim: _int, num_head: _int, qkv_weight: Tensor, qkv_bias: Tensor, proj_weight: Tensor, proj_bias: Tensor, mask: Optional[Tensor] = None, need_weights: _bool = True, average_attn_weights: _bool = True, mask_type: Optional[_int] = None) -> Tuple[Tensor, Tensor]: ... +def _neg_view(input: Tensor) -> Tensor: ... +def _neg_view_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +def _nested_from_padded(padded: Tensor, cpu_nested_shape_example: Tensor, fuse_transform_0213: _bool = False) -> Tensor: ... +def _nested_from_padded_and_nested_example(padded: Tensor, nt_example: Tensor) -> Tensor: ... +def _nested_get_jagged_dummy(any: Tensor) -> Tensor: ... +def _nested_get_lengths(input: Tensor) -> Tensor: ... +def _nested_get_offsets(input: Tensor) -> Tensor: ... +def _nested_get_ragged_idx(input: Tensor) -> _int: ... +def _nested_get_values(input: Tensor) -> Tensor: ... +def _nested_get_values_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +def _nested_tensor_from_mask(t: Tensor, mask: Tensor, mask_check: _bool = True) -> Tensor: ... +def _nested_tensor_from_mask_left_aligned(t: Tensor, mask: Tensor) -> _bool: ... +def _nested_tensor_from_tensor_list(list: Union[Tuple[Tensor, ...], List[Tensor]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = None) -> Tensor: ... +def _nested_tensor_softmax_with_shape(input: Tensor, query: Tensor) -> Tensor: ... +def _nested_view_from_buffer(input: Tensor, nested_size: Tensor, nested_strides: Tensor, offsets: Tensor) -> Tensor: ... +def _nested_view_from_buffer_copy(input: Tensor, nested_size: Tensor, nested_strides: Tensor, offsets: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +def _nested_view_from_jagged(input: Tensor, offsets: Tensor, dummy: Tensor, lengths: Optional[Tensor] = None, ragged_idx: _int = 1) -> Tensor: ... +def _nested_view_from_jagged_copy(input: Tensor, offsets: Tensor, dummy: Tensor, lengths: Optional[Tensor] = None, ragged_idx: _int = 1, *, out: Optional[Tensor] = None) -> Tensor: ... +def _nnpack_available() -> _bool: ... +def _nnpack_spatial_convolution(input: Tensor, weight: Tensor, bias: Optional[Tensor], padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]], stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1) -> Tensor: ... +def _pack_padded_sequence(input: Tensor, lengths: Tensor, batch_first: _bool) -> Tuple[Tensor, Tensor]: ... +def _pad_packed_sequence(data: Tensor, batch_sizes: Tensor, batch_first: _bool, padding_value: Union[Number, _complex], total_length: _int) -> Tuple[Tensor, Tensor]: ... 
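# --- Annotation (illustrative sketch, not part of the vendored files) -------
# _pack_padded_sequence/_pad_packed_sequence above back the public RNN packing
# utilities. A hedged round-trip example with batch-first padded input:
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

x = torch.randn(3, 5, 7)            # (batch, max_seq_len, features)
lengths = torch.tensor([5, 3, 2])   # true lengths, sorted descending
packed = pack_padded_sequence(x, lengths, batch_first=True)
unpacked, out_lengths = pad_packed_sequence(packed, batch_first=True)
# ---------------------------------------------------------------------------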
+def _pin_memory(input: Tensor, device: Optional[Optional[DeviceLikeType]] = None) -> Tensor: ... +def _prelu_kernel(input: Tensor, weight: Tensor) -> Tensor: ... +def _print(s: str) -> None: ... +def _propagate_xla_data(input: Tensor, output: Tensor) -> None: ... +def _remove_batch_dim(input: Tensor, level: _int, batch_size: _int, out_dim: _int) -> Tensor: ... +def _reshape_alias_copy(input: Tensor, size: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], *, out: Optional[Tensor] = None) -> Tensor: ... +def _reshape_from_tensor(input: Tensor, shape: Tensor) -> Tensor: ... +def _resize_output_(input: Tensor, size: Sequence[Union[_int, SymInt]], device: Optional[DeviceLikeType]) -> Tensor: ... +def _rowwise_prune(weight: Tensor, mask: Tensor, compressed_indices_dtype: _dtype) -> Tuple[Tensor, Tensor]: ... +def _sample_dirichlet(input: Tensor, generator: Optional[Generator] = None) -> Tensor: ... +def _saturate_weight_to_fp16(weight: Tensor) -> Tensor: ... +def _scaled_dot_product_attention_math(query: Tensor, key: Tensor, value: Tensor, attn_mask: Optional[Tensor] = None, dropout_p: _float = 0.0, is_causal: _bool = False, dropout_mask: Optional[Tensor] = None, *, scale: Optional[_float] = None) -> Tuple[Tensor, Tensor]: ... +def _scaled_dot_product_cudnn_attention(query: Tensor, key: Tensor, value: Tensor, dropout_p: _float = 0.0, is_causal: _bool = False, return_debug_mask: _bool = False, *, scale: Optional[_float] = None) -> torch.return_types._scaled_dot_product_cudnn_attention: ... +def _scaled_dot_product_efficient_attention(query: Tensor, key: Tensor, value: Tensor, attn_bias: Optional[Tensor], compute_log_sumexp: _bool, dropout_p: _float = 0.0, is_causal: _bool = False, *, scale: Optional[_float] = None) -> torch.return_types._scaled_dot_product_efficient_attention: ... +def _scaled_dot_product_flash_attention(query: Tensor, key: Tensor, value: Tensor, dropout_p: _float = 0.0, is_causal: _bool = False, return_debug_mask: _bool = False, *, scale: Optional[_float] = None) -> torch.return_types._scaled_dot_product_flash_attention: ... +def _scaled_dot_product_flash_attention_for_cpu(query: Tensor, key: Tensor, value: Tensor, dropout_p: _float = 0.0, is_causal: _bool = False, *, attn_mask: Optional[Tensor] = None, scale: Optional[_float] = None) -> torch.return_types._scaled_dot_product_flash_attention_for_cpu: ... +def _scaled_mm(input: Tensor, mat2: Tensor, *, bias: Optional[Tensor] = None, out_dtype: Optional[_dtype] = None, scale_a: Optional[Tensor] = None, scale_b: Optional[Tensor] = None, scale_result: Optional[Tensor] = None, use_fast_accum: _bool = False, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> Tuple[Tensor, Tensor]: ... +def _shape_as_tensor(input: Tensor) -> Tensor: ... +def _sobol_engine_draw(quasi: Tensor, n: _int, sobolstate: Tensor, dimension: _int, num_generated: _int, dtype: Optional[_dtype]) -> Tuple[Tensor, Tensor]: ... +def _sobol_engine_ff_(input: Tensor, n: _int, sobolstate: Tensor, dimension: _int, num_generated: _int) -> Tensor: ... +def _sobol_engine_initialize_state_(input: Tensor, dimension: _int) -> Tensor: ... +def _sobol_engine_scramble_(input: Tensor, ltm: Tensor, dimension: _int) -> Tensor: ... +def _softmax(input: Tensor, dim: _int, half_to_float: _bool, *, out: Optional[Tensor] = None) -> Tensor: ... +def _softmax_backward_data(grad_output: Tensor, output: Tensor, dim: _int, input_dtype: _dtype, *, grad_input: Optional[Tensor] = None) -> Tensor: ... 
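+# The `_scaled_dot_product_*` kernels declared above sit behind the public
+# `torch.nn.functional.scaled_dot_product_attention`; on a CPU-only build such
+# as this one, the math / flash-for-cpu variants are the ones typically selected.
+# A minimal sketch (shapes are illustrative, not prescriptive):
+#
+#   >>> import torch
+#   >>> import torch.nn.functional as F
+#   >>> q = k = v = torch.randn(2, 4, 8, 16)       # (batch, heads, seq, head_dim)
+#   >>> out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
+#   >>> out.shape
+#   torch.Size([2, 4, 8, 16])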
+def _sparse_broadcast_to(input: Tensor, size: _size) -> Tensor: ... +def _sparse_broadcast_to_copy(input: Tensor, size: _size, *, out: Optional[Tensor] = None) -> Tensor: ... +def _sparse_csr_prod(input: Tensor, dim: Union[_int, _size], keepdim: _bool = False, *, dtype: Optional[_dtype] = None) -> Tensor: ... +def _sparse_csr_sum(input: Tensor, dim: Union[_int, _size], keepdim: _bool = False, *, dtype: Optional[_dtype] = None) -> Tensor: ... +def _sparse_log_softmax_backward_data(grad_output: Tensor, output: Tensor, dim: _int, input: Tensor) -> Tensor: ... +def _sparse_semi_structured_linear(input: Tensor, weight: Tensor, meta: Tensor, *, bias: Optional[Tensor] = None, activation: Optional[str] = None, out_dtype: Optional[_dtype] = None) -> Tensor: ... +def _sparse_softmax_backward_data(grad_output: Tensor, output: Tensor, dim: _int, input: Tensor) -> Tensor: ... +def _sparse_sparse_matmul(input: Tensor, other: Tensor) -> Tensor: ... +@overload +def _sparse_sum(input: Tensor) -> Tensor: ... +@overload +def _sparse_sum(input: Tensor, *, dtype: _dtype) -> Tensor: ... +@overload +def _sparse_sum(input: Tensor, dim: Union[_int, _size]) -> Tensor: ... +@overload +def _sparse_sum(input: Tensor, dim: Union[_int, _size], *, dtype: _dtype) -> Tensor: ... +def _stack(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: _int = 0, *, out: Optional[Tensor] = None) -> Tensor: ... +def _standard_gamma(input: Tensor, generator: Optional[Generator] = None) -> Tensor: ... +def _standard_gamma_grad(input: Tensor, output: Tensor) -> Tensor: ... +def _sync(t: Tensor) -> None: ... +@overload +def _test_autograd_multiple_dispatch(input: Tensor) -> Tensor: ... +@overload +def _test_autograd_multiple_dispatch(input: Tensor, b: _bool) -> Tensor: ... +def _test_autograd_multiple_dispatch_view(input: Tensor) -> Tensor: ... +def _test_autograd_multiple_dispatch_view_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +def _test_check_tensor(input: Tensor) -> Tensor: ... +def _test_functorch_fallback(input: Tensor, other: Tensor) -> Tensor: ... +def _test_parallel_materialize(input: Tensor, num_parallel: _int, skip_first: _bool = False) -> Tensor: ... +def _test_serialization_subcmul(input: Tensor, other: Tensor, alpha: Union[Number, _complex] = 1) -> Tensor: ... +def _to_cpu(tensors: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: ... +def _to_functional_tensor(t: Tensor) -> Tensor: ... +def _to_sparse_semi_structured(dense: Tensor) -> Tuple[Tensor, Tensor]: ... +def _transform_bias_rescale_qkv(qkv: Tensor, qkv_bias: Tensor, num_heads: _int) -> Tuple[Tensor, Tensor, Tensor]: ... +def _transformer_encoder_layer_fwd(src: Tensor, embed_dim: _int, num_heads: _int, qkv_weight: Tensor, qkv_bias: Tensor, proj_weight: Tensor, proj_bias: Tensor, use_gelu: _bool, norm_first: _bool, eps: _float, norm_weight_1: Tensor, norm_bias_1: Tensor, norm_weight_2: Tensor, norm_bias_2: Tensor, ffn_weight_1: Tensor, ffn_bias_1: Tensor, ffn_weight_2: Tensor, ffn_bias_2: Tensor, mask: Optional[Tensor] = None, mask_type: Optional[_int] = None) -> Tensor: ... +def _trilinear(i1: Tensor, i2: Tensor, i3: Tensor, expand1: _size, expand2: _size, expand3: _size, sumdim: _size, unroll_dim: _int = 1) -> Tensor: ... +def _triton_multi_head_attention(query: Tensor, key: Tensor, value: Tensor, embed_dim: _int, num_head: _int, qkv_weight: Tensor, qkv_bias: Tensor, proj_weight: Tensor, proj_bias: Tensor, mask: Optional[Tensor] = None) -> Tensor: ... 
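+# The `_sparse_sum` overloads declared above are reached through the public
+# `torch.sparse.sum` wrapper. A small sketch on a 2x3 COO tensor: reducing over
+# all sparse dimensions yields a dense scalar, while a partial reduction stays
+# sparse (hence the `.to_dense()` for display):
+#
+#   >>> import torch
+#   >>> i = torch.tensor([[0, 1, 1], [2, 0, 2]])
+#   >>> v = torch.tensor([3., 4., 5.])
+#   >>> s = torch.sparse_coo_tensor(i, v, (2, 3))
+#   >>> torch.sparse.sum(s)
+#   tensor(12.)
+#   >>> torch.sparse.sum(s, dim=1).to_dense()
+#   tensor([3., 9.])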
+def _triton_scaled_dot_attention(q: Tensor, k: Tensor, v: Tensor, dropout_p: _float = 0.0) -> Tensor: ... +def _unique(input: Tensor, sorted: _bool = True, return_inverse: _bool = False) -> Tuple[Tensor, Tensor]: ... +def _unique2(input: Tensor, sorted: _bool = True, return_inverse: _bool = False, return_counts: _bool = False) -> Tuple[Tensor, Tensor, Tensor]: ... +def _unpack_dual(dual: Tensor, level: _int) -> torch.return_types._unpack_dual: ... +def _unsafe_index(input: Tensor, indices: Optional[Union[Tuple[Tensor, ...], List[Tensor]]]) -> Tensor: ... +def _unsafe_index_put(input: Tensor, indices: Optional[Union[Tuple[Tensor, ...], List[Tensor]]], values: Tensor, accumulate: _bool = False) -> Tensor: ... +@overload +def _use_cudnn_ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: Tensor, target_lengths: Tensor, blank: _int) -> _bool: ... +@overload +def _use_cudnn_ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: _size, target_lengths: _size, blank: _int) -> _bool: ... +def _use_cudnn_rnn_flatten_weight() -> _bool: ... +def _validate_compressed_sparse_indices(is_crow: _bool, compressed_idx: Tensor, plain_idx: Tensor, cdim: _int, dim: _int, nnz: _int) -> None: ... +def _validate_sparse_bsc_tensor_args(ccol_indices: Tensor, row_indices: Tensor, values: Tensor, size: _size) -> None: ... +def _validate_sparse_bsr_tensor_args(crow_indices: Tensor, col_indices: Tensor, values: Tensor, size: _size) -> None: ... +def _validate_sparse_compressed_tensor_args(compressed_indices: Tensor, plain_indices: Tensor, values: Tensor, size: _size, layout: _layout) -> None: ... +def _validate_sparse_coo_tensor_args(indices: Tensor, values: Tensor, size: _size, is_coalesced: Optional[_bool] = None) -> None: ... +def _validate_sparse_csc_tensor_args(ccol_indices: Tensor, row_indices: Tensor, values: Tensor, size: _size) -> None: ... +def _validate_sparse_csr_tensor_args(crow_indices: Tensor, col_indices: Tensor, values: Tensor, size: _size) -> None: ... +def _values_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +def _weight_int4pack_mm(input: Tensor, mat2: Tensor, qGroupSize: _int, qScaleAndZeros: Tensor) -> Tensor: ... +def _weight_int8pack_mm(input: Tensor, mat2: Tensor, scales: Tensor) -> Tensor: ... +def _weight_norm(v: Tensor, g: Tensor, dim: _int = 0) -> Tensor: ... +def _weight_norm_interface(v: Tensor, g: Tensor, dim: _int = 0) -> Tuple[Tensor, Tensor]: ... +def abs(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + abs(input, *, out=None) -> Tensor + + Computes the absolute value of each element in :attr:`input`. + + .. math:: + \text{out}_{i} = |\text{input}_{i}| + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.abs(torch.tensor([-1, -2, 3])) + tensor([ 1, 2, 3]) + """ + ... +def abs_(input: Tensor) -> Tensor: ... +def absolute(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + absolute(input, *, out=None) -> Tensor + + Alias for :func:`torch.abs` + """ + ... +def acos(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + acos(input, *, out=None) -> Tensor + + Computes the inverse cosine of each element in :attr:`input`. + + .. math:: + \text{out}_{i} = \cos^{-1}(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. 
+ + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.3348, -0.5889, 0.2005, -0.1584]) + >>> torch.acos(a) + tensor([ 1.2294, 2.2004, 1.3690, 1.7298]) + """ + ... +def acos_(input: Tensor) -> Tensor: ... +def acosh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + acosh(input, *, out=None) -> Tensor + + Returns a new tensor with the inverse hyperbolic cosine of the elements of :attr:`input`. + + .. math:: + \text{out}_{i} = \cosh^{-1}(\text{input}_{i}) + + Note: + The domain of the inverse hyperbolic cosine is `[1, inf)` and values outside this range + will be mapped to ``NaN``, except for `+ INF` for which the output is mapped to `+ INF`. + + Args: + input (Tensor): the input tensor. + + Keyword arguments: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4).uniform_(1, 2) + >>> a + tensor([ 1.3192, 1.9915, 1.9674, 1.7151 ]) + >>> torch.acosh(a) + tensor([ 0.7791, 1.3120, 1.2979, 1.1341 ]) + """ + ... +def acosh_(input: Tensor) -> Tensor: ... +def adaptive_avg_pool1d(input: Tensor, output_size: Union[_int, _size]) -> Tensor: ... +def adaptive_max_pool1d(input: Tensor, output_size: Union[_int, _size]) -> Tuple[Tensor, Tensor]: ... +@overload +def add(input: Union[Tensor, Number, _complex], other: Union[Tensor, Number, _complex], *, alpha: Optional[Union[Number, _complex]] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + add(input, other, *, alpha=1, out=None) -> Tensor + + Adds :attr:`other`, scaled by :attr:`alpha`, to :attr:`input`. + + .. math:: + \text{{out}}_i = \text{{input}}_i + \text{{alpha}} \times \text{{other}}_i + + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer, float, and complex inputs. + + Args: + input (Tensor): the input tensor. + other (Tensor or Number): the tensor or number to add to :attr:`input`. + + Keyword arguments: + alpha (Number): the multiplier for :attr:`other`. + out (Tensor, optional): the output tensor. + + Examples:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.0202, 1.0985, 1.3506, -0.6056]) + >>> torch.add(a, 20) + tensor([ 20.0202, 21.0985, 21.3506, 19.3944]) + + >>> b = torch.randn(4) + >>> b + tensor([-0.9732, -0.3497, 0.6245, 0.4022]) + >>> c = torch.randn(4, 1) + >>> c + tensor([[ 0.3743], + [-1.7724], + [-0.5811], + [-0.8017]]) + >>> torch.add(b, c, alpha=10) + tensor([[ 2.7695, 3.3930, 4.3672, 4.1450], + [-18.6971, -18.0736, -17.0994, -17.3216], + [ -6.7845, -6.1610, -5.1868, -5.4090], + [ -8.9902, -8.3667, -7.3925, -7.6147]]) + """ + ... +@overload +def add(self: Tensor, alpha: Union[Number, _complex], other: Tensor) -> Tensor: + r""" + add(input, other, *, alpha=1, out=None) -> Tensor + + Adds :attr:`other`, scaled by :attr:`alpha`, to :attr:`input`. + + .. math:: + \text{{out}}_i = \text{{input}}_i + \text{{alpha}} \times \text{{other}}_i + + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer, float, and complex inputs. + + Args: + input (Tensor): the input tensor. + other (Tensor or Number): the tensor or number to add to :attr:`input`. + + Keyword arguments: + alpha (Number): the multiplier for :attr:`other`. + out (Tensor, optional): the output tensor. 
+ + Examples:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.0202, 1.0985, 1.3506, -0.6056]) + >>> torch.add(a, 20) + tensor([ 20.0202, 21.0985, 21.3506, 19.3944]) + + >>> b = torch.randn(4) + >>> b + tensor([-0.9732, -0.3497, 0.6245, 0.4022]) + >>> c = torch.randn(4, 1) + >>> c + tensor([[ 0.3743], + [-1.7724], + [-0.5811], + [-0.8017]]) + >>> torch.add(b, c, alpha=10) + tensor([[ 2.7695, 3.3930, 4.3672, 4.1450], + [-18.6971, -18.0736, -17.0994, -17.3216], + [ -6.7845, -6.1610, -5.1868, -5.4090], + [ -8.9902, -8.3667, -7.3925, -7.6147]]) + """ + ... +@overload +def add(self: Tensor, alpha: Union[Number, _complex], other: Tensor, *, out: Tensor) -> Tensor: + r""" + add(input, other, *, alpha=1, out=None) -> Tensor + + Adds :attr:`other`, scaled by :attr:`alpha`, to :attr:`input`. + + .. math:: + \text{{out}}_i = \text{{input}}_i + \text{{alpha}} \times \text{{other}}_i + + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer, float, and complex inputs. + + Args: + input (Tensor): the input tensor. + other (Tensor or Number): the tensor or number to add to :attr:`input`. + + Keyword arguments: + alpha (Number): the multiplier for :attr:`other`. + out (Tensor, optional): the output tensor. + + Examples:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.0202, 1.0985, 1.3506, -0.6056]) + >>> torch.add(a, 20) + tensor([ 20.0202, 21.0985, 21.3506, 19.3944]) + + >>> b = torch.randn(4) + >>> b + tensor([-0.9732, -0.3497, 0.6245, 0.4022]) + >>> c = torch.randn(4, 1) + >>> c + tensor([[ 0.3743], + [-1.7724], + [-0.5811], + [-0.8017]]) + >>> torch.add(b, c, alpha=10) + tensor([[ 2.7695, 3.3930, 4.3672, 4.1450], + [-18.6971, -18.0736, -17.0994, -17.3216], + [ -6.7845, -6.1610, -5.1868, -5.4090], + [ -8.9902, -8.3667, -7.3925, -7.6147]]) + """ + ... +@overload +def addbmm(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], batch1: Tensor, batch2: Tensor) -> Tensor: + r""" + addbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices stored + in :attr:`batch1` and :attr:`batch2`, + with a reduced add step (all matrix multiplications get accumulated + along the first dimension). + :attr:`input` is added to the final result. + + :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the + same number of matrices. + + If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a + :math:`(b \times m \times p)` tensor, :attr:`input` must be + :ref:`broadcastable ` with a :math:`(n \times p)` tensor + and :attr:`out` will be a :math:`(n \times p)` tensor. + + .. math:: + out = \beta\ \text{input} + \alpha\ (\sum_{i=0}^{b-1} \text{batch1}_i \mathbin{@} \text{batch2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and :attr:`alpha` + must be real numbers, otherwise they should be integers. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. 
+ + Args: + batch1 (Tensor): the first batch of matrices to be multiplied + batch2 (Tensor): the second batch of matrices to be multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + input (Tensor): matrix to be added + alpha (Number, optional): multiplier for `batch1 @ batch2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(3, 5) + >>> batch1 = torch.randn(10, 3, 4) + >>> batch2 = torch.randn(10, 4, 5) + >>> torch.addbmm(M, batch1, batch2) + tensor([[ 6.6311, 0.0503, 6.9768, -12.0362, -2.1653], + [ -4.8185, -1.4255, -6.6760, 8.9453, 2.5743], + [ -3.8202, 4.3691, 1.0943, -1.1109, 5.4730]]) + """ + ... +@overload +def addbmm(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], batch1: Tensor, batch2: Tensor, *, out: Tensor) -> Tensor: + r""" + addbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices stored + in :attr:`batch1` and :attr:`batch2`, + with a reduced add step (all matrix multiplications get accumulated + along the first dimension). + :attr:`input` is added to the final result. + + :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the + same number of matrices. + + If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a + :math:`(b \times m \times p)` tensor, :attr:`input` must be + :ref:`broadcastable ` with a :math:`(n \times p)` tensor + and :attr:`out` will be a :math:`(n \times p)` tensor. + + .. math:: + out = \beta\ \text{input} + \alpha\ (\sum_{i=0}^{b-1} \text{batch1}_i \mathbin{@} \text{batch2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and :attr:`alpha` + must be real numbers, otherwise they should be integers. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + batch1 (Tensor): the first batch of matrices to be multiplied + batch2 (Tensor): the second batch of matrices to be multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + input (Tensor): matrix to be added + alpha (Number, optional): multiplier for `batch1 @ batch2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(3, 5) + >>> batch1 = torch.randn(10, 3, 4) + >>> batch2 = torch.randn(10, 4, 5) + >>> torch.addbmm(M, batch1, batch2) + tensor([[ 6.6311, 0.0503, 6.9768, -12.0362, -2.1653], + [ -4.8185, -1.4255, -6.6760, 8.9453, 2.5743], + [ -3.8202, 4.3691, 1.0943, -1.1109, 5.4730]]) + """ + ... +@overload +def addbmm(input: Tensor, batch1: Tensor, batch2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + addbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices stored + in :attr:`batch1` and :attr:`batch2`, + with a reduced add step (all matrix multiplications get accumulated + along the first dimension). + :attr:`input` is added to the final result. + + :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the + same number of matrices. 
+ + If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a + :math:`(b \times m \times p)` tensor, :attr:`input` must be + :ref:`broadcastable ` with a :math:`(n \times p)` tensor + and :attr:`out` will be a :math:`(n \times p)` tensor. + + .. math:: + out = \beta\ \text{input} + \alpha\ (\sum_{i=0}^{b-1} \text{batch1}_i \mathbin{@} \text{batch2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and :attr:`alpha` + must be real numbers, otherwise they should be integers. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + batch1 (Tensor): the first batch of matrices to be multiplied + batch2 (Tensor): the second batch of matrices to be multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + input (Tensor): matrix to be added + alpha (Number, optional): multiplier for `batch1 @ batch2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(3, 5) + >>> batch1 = torch.randn(10, 3, 4) + >>> batch2 = torch.randn(10, 4, 5) + >>> torch.addbmm(M, batch1, batch2) + tensor([[ 6.6311, 0.0503, 6.9768, -12.0362, -2.1653], + [ -4.8185, -1.4255, -6.6760, 8.9453, 2.5743], + [ -3.8202, 4.3691, 1.0943, -1.1109, 5.4730]]) + """ + ... +@overload +def addbmm(beta: Union[Number, _complex], self: Tensor, batch1: Tensor, batch2: Tensor) -> Tensor: + r""" + addbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices stored + in :attr:`batch1` and :attr:`batch2`, + with a reduced add step (all matrix multiplications get accumulated + along the first dimension). + :attr:`input` is added to the final result. + + :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the + same number of matrices. + + If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a + :math:`(b \times m \times p)` tensor, :attr:`input` must be + :ref:`broadcastable ` with a :math:`(n \times p)` tensor + and :attr:`out` will be a :math:`(n \times p)` tensor. + + .. math:: + out = \beta\ \text{input} + \alpha\ (\sum_{i=0}^{b-1} \text{batch1}_i \mathbin{@} \text{batch2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and :attr:`alpha` + must be real numbers, otherwise they should be integers. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + batch1 (Tensor): the first batch of matrices to be multiplied + batch2 (Tensor): the second batch of matrices to be multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + input (Tensor): matrix to be added + alpha (Number, optional): multiplier for `batch1 @ batch2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. 
+ + Example:: + + >>> M = torch.randn(3, 5) + >>> batch1 = torch.randn(10, 3, 4) + >>> batch2 = torch.randn(10, 4, 5) + >>> torch.addbmm(M, batch1, batch2) + tensor([[ 6.6311, 0.0503, 6.9768, -12.0362, -2.1653], + [ -4.8185, -1.4255, -6.6760, 8.9453, 2.5743], + [ -3.8202, 4.3691, 1.0943, -1.1109, 5.4730]]) + """ + ... +@overload +def addbmm(beta: Union[Number, _complex], self: Tensor, batch1: Tensor, batch2: Tensor, *, out: Tensor) -> Tensor: + r""" + addbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices stored + in :attr:`batch1` and :attr:`batch2`, + with a reduced add step (all matrix multiplications get accumulated + along the first dimension). + :attr:`input` is added to the final result. + + :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the + same number of matrices. + + If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a + :math:`(b \times m \times p)` tensor, :attr:`input` must be + :ref:`broadcastable ` with a :math:`(n \times p)` tensor + and :attr:`out` will be a :math:`(n \times p)` tensor. + + .. math:: + out = \beta\ \text{input} + \alpha\ (\sum_{i=0}^{b-1} \text{batch1}_i \mathbin{@} \text{batch2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and :attr:`alpha` + must be real numbers, otherwise they should be integers. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + batch1 (Tensor): the first batch of matrices to be multiplied + batch2 (Tensor): the second batch of matrices to be multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + input (Tensor): matrix to be added + alpha (Number, optional): multiplier for `batch1 @ batch2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(3, 5) + >>> batch1 = torch.randn(10, 3, 4) + >>> batch2 = torch.randn(10, 4, 5) + >>> torch.addbmm(M, batch1, batch2) + tensor([[ 6.6311, 0.0503, 6.9768, -12.0362, -2.1653], + [ -4.8185, -1.4255, -6.6760, 8.9453, 2.5743], + [ -3.8202, 4.3691, 1.0943, -1.1109, 5.4730]]) + """ + ... +@overload +def addcdiv(self: Tensor, value: Union[Number, _complex], tensor1: Tensor, tensor2: Tensor) -> Tensor: + r""" + addcdiv(input, tensor1, tensor2, *, value=1, out=None) -> Tensor + + Performs the element-wise division of :attr:`tensor1` by :attr:`tensor2`, + multiplies the result by the scalar :attr:`value` and adds it to :attr:`input`. + + .. warning:: + Integer division with addcdiv is no longer supported, and in a future + release addcdiv will perform a true division of tensor1 and tensor2. + The historic addcdiv behavior can be implemented as + (input + value * torch.trunc(tensor1 / tensor2)).to(input.dtype) + for integer inputs and as (input + value * tensor1 / tensor2) for float inputs. + The future addcdiv behavior is just the latter implementation: + (input + value * tensor1 / tensor2), for all dtypes. + + .. math:: + \text{out}_i = \text{input}_i + \text{value} \times \frac{\text{tensor1}_i}{\text{tensor2}_i} + + + The shapes of :attr:`input`, :attr:`tensor1`, and :attr:`tensor2` must be + :ref:`broadcastable `. 
+ + For inputs of type `FloatTensor` or `DoubleTensor`, :attr:`value` must be + a real number, otherwise an integer. + + Args: + input (Tensor): the tensor to be added + tensor1 (Tensor): the numerator tensor + tensor2 (Tensor): the denominator tensor + + Keyword args: + value (Number, optional): multiplier for :math:`\text{tensor1} / \text{tensor2}` + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.randn(1, 3) + >>> t1 = torch.randn(3, 1) + >>> t2 = torch.randn(1, 3) + >>> torch.addcdiv(t, t1, t2, value=0.1) + tensor([[-0.2312, -3.6496, 0.1312], + [-1.0428, 3.4292, -0.1030], + [-0.5369, -0.9829, 0.0430]]) + """ + ... +@overload +def addcdiv(self: Tensor, value: Union[Number, _complex], tensor1: Tensor, tensor2: Tensor, *, out: Tensor) -> Tensor: + r""" + addcdiv(input, tensor1, tensor2, *, value=1, out=None) -> Tensor + + Performs the element-wise division of :attr:`tensor1` by :attr:`tensor2`, + multiplies the result by the scalar :attr:`value` and adds it to :attr:`input`. + + .. warning:: + Integer division with addcdiv is no longer supported, and in a future + release addcdiv will perform a true division of tensor1 and tensor2. + The historic addcdiv behavior can be implemented as + (input + value * torch.trunc(tensor1 / tensor2)).to(input.dtype) + for integer inputs and as (input + value * tensor1 / tensor2) for float inputs. + The future addcdiv behavior is just the latter implementation: + (input + value * tensor1 / tensor2), for all dtypes. + + .. math:: + \text{out}_i = \text{input}_i + \text{value} \times \frac{\text{tensor1}_i}{\text{tensor2}_i} + + + The shapes of :attr:`input`, :attr:`tensor1`, and :attr:`tensor2` must be + :ref:`broadcastable `. + + For inputs of type `FloatTensor` or `DoubleTensor`, :attr:`value` must be + a real number, otherwise an integer. + + Args: + input (Tensor): the tensor to be added + tensor1 (Tensor): the numerator tensor + tensor2 (Tensor): the denominator tensor + + Keyword args: + value (Number, optional): multiplier for :math:`\text{tensor1} / \text{tensor2}` + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.randn(1, 3) + >>> t1 = torch.randn(3, 1) + >>> t2 = torch.randn(1, 3) + >>> torch.addcdiv(t, t1, t2, value=0.1) + tensor([[-0.2312, -3.6496, 0.1312], + [-1.0428, 3.4292, -0.1030], + [-0.5369, -0.9829, 0.0430]]) + """ + ... +@overload +def addcdiv(input: Tensor, tensor1: Tensor, tensor2: Tensor, *, value: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + addcdiv(input, tensor1, tensor2, *, value=1, out=None) -> Tensor + + Performs the element-wise division of :attr:`tensor1` by :attr:`tensor2`, + multiplies the result by the scalar :attr:`value` and adds it to :attr:`input`. + + .. warning:: + Integer division with addcdiv is no longer supported, and in a future + release addcdiv will perform a true division of tensor1 and tensor2. + The historic addcdiv behavior can be implemented as + (input + value * torch.trunc(tensor1 / tensor2)).to(input.dtype) + for integer inputs and as (input + value * tensor1 / tensor2) for float inputs. + The future addcdiv behavior is just the latter implementation: + (input + value * tensor1 / tensor2), for all dtypes. + + .. math:: + \text{out}_i = \text{input}_i + \text{value} \times \frac{\text{tensor1}_i}{\text{tensor2}_i} + + + The shapes of :attr:`input`, :attr:`tensor1`, and :attr:`tensor2` must be + :ref:`broadcastable `. 
+ + For inputs of type `FloatTensor` or `DoubleTensor`, :attr:`value` must be + a real number, otherwise an integer. + + Args: + input (Tensor): the tensor to be added + tensor1 (Tensor): the numerator tensor + tensor2 (Tensor): the denominator tensor + + Keyword args: + value (Number, optional): multiplier for :math:`\text{tensor1} / \text{tensor2}` + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.randn(1, 3) + >>> t1 = torch.randn(3, 1) + >>> t2 = torch.randn(1, 3) + >>> torch.addcdiv(t, t1, t2, value=0.1) + tensor([[-0.2312, -3.6496, 0.1312], + [-1.0428, 3.4292, -0.1030], + [-0.5369, -0.9829, 0.0430]]) + """ + ... +@overload +def addcmul(self: Tensor, value: Union[Number, _complex], tensor1: Tensor, tensor2: Tensor) -> Tensor: + r""" + addcmul(input, tensor1, tensor2, *, value=1, out=None) -> Tensor + + Performs the element-wise multiplication of :attr:`tensor1` + by :attr:`tensor2`, multiplies the result by the scalar :attr:`value` + and adds it to :attr:`input`. + + .. math:: + \text{out}_i = \text{input}_i + \text{value} \times \text{tensor1}_i \times \text{tensor2}_i + + The shapes of :attr:`tensor`, :attr:`tensor1`, and :attr:`tensor2` must be + :ref:`broadcastable `. + + For inputs of type `FloatTensor` or `DoubleTensor`, :attr:`value` must be + a real number, otherwise an integer. + + Args: + input (Tensor): the tensor to be added + tensor1 (Tensor): the tensor to be multiplied + tensor2 (Tensor): the tensor to be multiplied + + Keyword args: + value (Number, optional): multiplier for :math:`tensor1 .* tensor2` + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.randn(1, 3) + >>> t1 = torch.randn(3, 1) + >>> t2 = torch.randn(1, 3) + >>> torch.addcmul(t, t1, t2, value=0.1) + tensor([[-0.8635, -0.6391, 1.6174], + [-0.7617, -0.5879, 1.7388], + [-0.8353, -0.6249, 1.6511]]) + """ + ... +@overload +def addcmul(self: Tensor, value: Union[Number, _complex], tensor1: Tensor, tensor2: Tensor, *, out: Tensor) -> Tensor: + r""" + addcmul(input, tensor1, tensor2, *, value=1, out=None) -> Tensor + + Performs the element-wise multiplication of :attr:`tensor1` + by :attr:`tensor2`, multiplies the result by the scalar :attr:`value` + and adds it to :attr:`input`. + + .. math:: + \text{out}_i = \text{input}_i + \text{value} \times \text{tensor1}_i \times \text{tensor2}_i + + The shapes of :attr:`tensor`, :attr:`tensor1`, and :attr:`tensor2` must be + :ref:`broadcastable `. + + For inputs of type `FloatTensor` or `DoubleTensor`, :attr:`value` must be + a real number, otherwise an integer. + + Args: + input (Tensor): the tensor to be added + tensor1 (Tensor): the tensor to be multiplied + tensor2 (Tensor): the tensor to be multiplied + + Keyword args: + value (Number, optional): multiplier for :math:`tensor1 .* tensor2` + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.randn(1, 3) + >>> t1 = torch.randn(3, 1) + >>> t2 = torch.randn(1, 3) + >>> torch.addcmul(t, t1, t2, value=0.1) + tensor([[-0.8635, -0.6391, 1.6174], + [-0.7617, -0.5879, 1.7388], + [-0.8353, -0.6249, 1.6511]]) + """ + ... +@overload +def addcmul(input: Tensor, tensor1: Tensor, tensor2: Tensor, *, value: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + addcmul(input, tensor1, tensor2, *, value=1, out=None) -> Tensor + + Performs the element-wise multiplication of :attr:`tensor1` + by :attr:`tensor2`, multiplies the result by the scalar :attr:`value` + and adds it to :attr:`input`. + + .. 
math:: + \text{out}_i = \text{input}_i + \text{value} \times \text{tensor1}_i \times \text{tensor2}_i + + The shapes of :attr:`tensor`, :attr:`tensor1`, and :attr:`tensor2` must be + :ref:`broadcastable `. + + For inputs of type `FloatTensor` or `DoubleTensor`, :attr:`value` must be + a real number, otherwise an integer. + + Args: + input (Tensor): the tensor to be added + tensor1 (Tensor): the tensor to be multiplied + tensor2 (Tensor): the tensor to be multiplied + + Keyword args: + value (Number, optional): multiplier for :math:`tensor1 .* tensor2` + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.randn(1, 3) + >>> t1 = torch.randn(3, 1) + >>> t2 = torch.randn(1, 3) + >>> torch.addcmul(t, t1, t2, value=0.1) + tensor([[-0.8635, -0.6391, 1.6174], + [-0.7617, -0.5879, 1.7388], + [-0.8353, -0.6249, 1.6511]]) + """ + ... +@overload +def addmm(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], mat1: Tensor, mat2: Tensor) -> Tensor: + r""" + addmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a matrix multiplication of the matrices :attr:`mat1` and :attr:`mat2`. + The matrix :attr:`input` is added to the final result. + + If :attr:`mat1` is a :math:`(n \times m)` tensor, :attr:`mat2` is a + :math:`(m \times p)` tensor, then :attr:`input` must be + :ref:`broadcastable ` with a :math:`(n \times p)` tensor + and :attr:`out` will be a :math:`(n \times p)` tensor. + + :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between + :attr:`mat1` and :attr:`mat2` and the added matrix :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{mat1}_i \mathbin{@} \text{mat2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + This operation has support for arguments with :ref:`sparse layouts`. If + :attr:`input` is sparse the result will have the same layout and if :attr:`out` + is provided it must have the same layout as :attr:`input`. + + + .. warning:: + Sparse support is a beta feature and some layout(s)/dtype/device combinations may not be supported, + or may not have autograd support. If you notice missing functionality please + open a feature request. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + input (Tensor): matrix to be added + mat1 (Tensor): the first matrix to be matrix multiplied + mat2 (Tensor): the second matrix to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(2, 3) + >>> mat1 = torch.randn(2, 3) + >>> mat2 = torch.randn(3, 3) + >>> torch.addmm(M, mat1, mat2) + tensor([[-4.8716, 1.4671, -1.3746], + [ 0.7573, -3.9555, -2.8681]]) + """ + ... +@overload +def addmm(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], mat1: Tensor, mat2: Tensor, *, out: Tensor) -> Tensor: + r""" + addmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a matrix multiplication of the matrices :attr:`mat1` and :attr:`mat2`. 
+ The matrix :attr:`input` is added to the final result. + + If :attr:`mat1` is a :math:`(n \times m)` tensor, :attr:`mat2` is a + :math:`(m \times p)` tensor, then :attr:`input` must be + :ref:`broadcastable ` with a :math:`(n \times p)` tensor + and :attr:`out` will be a :math:`(n \times p)` tensor. + + :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between + :attr:`mat1` and :attr:`mat2` and the added matrix :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{mat1}_i \mathbin{@} \text{mat2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + This operation has support for arguments with :ref:`sparse layouts`. If + :attr:`input` is sparse the result will have the same layout and if :attr:`out` + is provided it must have the same layout as :attr:`input`. + + + .. warning:: + Sparse support is a beta feature and some layout(s)/dtype/device combinations may not be supported, + or may not have autograd support. If you notice missing functionality please + open a feature request. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + input (Tensor): matrix to be added + mat1 (Tensor): the first matrix to be matrix multiplied + mat2 (Tensor): the second matrix to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(2, 3) + >>> mat1 = torch.randn(2, 3) + >>> mat2 = torch.randn(3, 3) + >>> torch.addmm(M, mat1, mat2) + tensor([[-4.8716, 1.4671, -1.3746], + [ 0.7573, -3.9555, -2.8681]]) + """ + ... +@overload +def addmm(input: Tensor, mat1: Tensor, mat2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + addmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a matrix multiplication of the matrices :attr:`mat1` and :attr:`mat2`. + The matrix :attr:`input` is added to the final result. + + If :attr:`mat1` is a :math:`(n \times m)` tensor, :attr:`mat2` is a + :math:`(m \times p)` tensor, then :attr:`input` must be + :ref:`broadcastable ` with a :math:`(n \times p)` tensor + and :attr:`out` will be a :math:`(n \times p)` tensor. + + :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between + :attr:`mat1` and :attr:`mat2` and the added matrix :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{mat1}_i \mathbin{@} \text{mat2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + This operation has support for arguments with :ref:`sparse layouts`. If + :attr:`input` is sparse the result will have the same layout and if :attr:`out` + is provided it must have the same layout as :attr:`input`. + + + .. 
warning:: + Sparse support is a beta feature and some layout(s)/dtype/device combinations may not be supported, + or may not have autograd support. If you notice missing functionality please + open a feature request. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + input (Tensor): matrix to be added + mat1 (Tensor): the first matrix to be matrix multiplied + mat2 (Tensor): the second matrix to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(2, 3) + >>> mat1 = torch.randn(2, 3) + >>> mat2 = torch.randn(3, 3) + >>> torch.addmm(M, mat1, mat2) + tensor([[-4.8716, 1.4671, -1.3746], + [ 0.7573, -3.9555, -2.8681]]) + """ + ... +@overload +def addmm(beta: Union[Number, _complex], self: Tensor, mat1: Tensor, mat2: Tensor) -> Tensor: + r""" + addmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a matrix multiplication of the matrices :attr:`mat1` and :attr:`mat2`. + The matrix :attr:`input` is added to the final result. + + If :attr:`mat1` is a :math:`(n \times m)` tensor, :attr:`mat2` is a + :math:`(m \times p)` tensor, then :attr:`input` must be + :ref:`broadcastable ` with a :math:`(n \times p)` tensor + and :attr:`out` will be a :math:`(n \times p)` tensor. + + :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between + :attr:`mat1` and :attr:`mat2` and the added matrix :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{mat1}_i \mathbin{@} \text{mat2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + This operation has support for arguments with :ref:`sparse layouts`. If + :attr:`input` is sparse the result will have the same layout and if :attr:`out` + is provided it must have the same layout as :attr:`input`. + + + .. warning:: + Sparse support is a beta feature and some layout(s)/dtype/device combinations may not be supported, + or may not have autograd support. If you notice missing functionality please + open a feature request. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + input (Tensor): matrix to be added + mat1 (Tensor): the first matrix to be matrix multiplied + mat2 (Tensor): the second matrix to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(2, 3) + >>> mat1 = torch.randn(2, 3) + >>> mat2 = torch.randn(3, 3) + >>> torch.addmm(M, mat1, mat2) + tensor([[-4.8716, 1.4671, -1.3746], + [ 0.7573, -3.9555, -2.8681]]) + """ + ... 
+@overload +def addmm(beta: Union[Number, _complex], self: Tensor, mat1: Tensor, mat2: Tensor, *, out: Tensor) -> Tensor: + r""" + addmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a matrix multiplication of the matrices :attr:`mat1` and :attr:`mat2`. + The matrix :attr:`input` is added to the final result. + + If :attr:`mat1` is a :math:`(n \times m)` tensor, :attr:`mat2` is a + :math:`(m \times p)` tensor, then :attr:`input` must be + :ref:`broadcastable ` with a :math:`(n \times p)` tensor + and :attr:`out` will be a :math:`(n \times p)` tensor. + + :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between + :attr:`mat1` and :attr:`mat2` and the added matrix :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{mat1}_i \mathbin{@} \text{mat2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + This operation has support for arguments with :ref:`sparse layouts`. If + :attr:`input` is sparse the result will have the same layout and if :attr:`out` + is provided it must have the same layout as :attr:`input`. + + + .. warning:: + Sparse support is a beta feature and some layout(s)/dtype/device combinations may not be supported, + or may not have autograd support. If you notice missing functionality please + open a feature request. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + input (Tensor): matrix to be added + mat1 (Tensor): the first matrix to be matrix multiplied + mat2 (Tensor): the second matrix to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(2, 3) + >>> mat1 = torch.randn(2, 3) + >>> mat2 = torch.randn(3, 3) + >>> torch.addmm(M, mat1, mat2) + tensor([[-4.8716, 1.4671, -1.3746], + [ 0.7573, -3.9555, -2.8681]]) + """ + ... +@overload +def addmv(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], mat: Tensor, vec: Tensor) -> Tensor: + r""" + addmv(input, mat, vec, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a matrix-vector product of the matrix :attr:`mat` and + the vector :attr:`vec`. + The vector :attr:`input` is added to the final result. + + If :attr:`mat` is a :math:`(n \times m)` tensor, :attr:`vec` is a 1-D tensor of + size `m`, then :attr:`input` must be + :ref:`broadcastable ` with a 1-D tensor of size `n` and + :attr:`out` will be 1-D tensor of size `n`. + + :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between + :attr:`mat` and :attr:`vec` and the added tensor :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{mat} \mathbin{@} \text{vec}) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. 
+ + Args: + input (Tensor): vector to be added + mat (Tensor): matrix to be matrix multiplied + vec (Tensor): vector to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat @ vec` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(2) + >>> mat = torch.randn(2, 3) + >>> vec = torch.randn(3) + >>> torch.addmv(M, mat, vec) + tensor([-0.3768, -5.5565]) + """ + ... +@overload +def addmv(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], mat: Tensor, vec: Tensor, *, out: Tensor) -> Tensor: + r""" + addmv(input, mat, vec, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a matrix-vector product of the matrix :attr:`mat` and + the vector :attr:`vec`. + The vector :attr:`input` is added to the final result. + + If :attr:`mat` is a :math:`(n \times m)` tensor, :attr:`vec` is a 1-D tensor of + size `m`, then :attr:`input` must be + :ref:`broadcastable ` with a 1-D tensor of size `n` and + :attr:`out` will be 1-D tensor of size `n`. + + :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between + :attr:`mat` and :attr:`vec` and the added tensor :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{mat} \mathbin{@} \text{vec}) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + Args: + input (Tensor): vector to be added + mat (Tensor): matrix to be matrix multiplied + vec (Tensor): vector to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat @ vec` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(2) + >>> mat = torch.randn(2, 3) + >>> vec = torch.randn(3) + >>> torch.addmv(M, mat, vec) + tensor([-0.3768, -5.5565]) + """ + ... +@overload +def addmv(input: Tensor, mat: Tensor, vec: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + addmv(input, mat, vec, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a matrix-vector product of the matrix :attr:`mat` and + the vector :attr:`vec`. + The vector :attr:`input` is added to the final result. + + If :attr:`mat` is a :math:`(n \times m)` tensor, :attr:`vec` is a 1-D tensor of + size `m`, then :attr:`input` must be + :ref:`broadcastable ` with a 1-D tensor of size `n` and + :attr:`out` will be 1-D tensor of size `n`. + + :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between + :attr:`mat` and :attr:`vec` and the added tensor :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{mat} \mathbin{@} \text{vec}) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. 
+ + Args: + input (Tensor): vector to be added + mat (Tensor): matrix to be matrix multiplied + vec (Tensor): vector to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat @ vec` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(2) + >>> mat = torch.randn(2, 3) + >>> vec = torch.randn(3) + >>> torch.addmv(M, mat, vec) + tensor([-0.3768, -5.5565]) + """ + ... +@overload +def addmv(beta: Union[Number, _complex], self: Tensor, mat: Tensor, vec: Tensor) -> Tensor: + r""" + addmv(input, mat, vec, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a matrix-vector product of the matrix :attr:`mat` and + the vector :attr:`vec`. + The vector :attr:`input` is added to the final result. + + If :attr:`mat` is a :math:`(n \times m)` tensor, :attr:`vec` is a 1-D tensor of + size `m`, then :attr:`input` must be + :ref:`broadcastable ` with a 1-D tensor of size `n` and + :attr:`out` will be 1-D tensor of size `n`. + + :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between + :attr:`mat` and :attr:`vec` and the added tensor :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{mat} \mathbin{@} \text{vec}) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + Args: + input (Tensor): vector to be added + mat (Tensor): matrix to be matrix multiplied + vec (Tensor): vector to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat @ vec` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(2) + >>> mat = torch.randn(2, 3) + >>> vec = torch.randn(3) + >>> torch.addmv(M, mat, vec) + tensor([-0.3768, -5.5565]) + """ + ... +@overload +def addmv(beta: Union[Number, _complex], self: Tensor, mat: Tensor, vec: Tensor, *, out: Tensor) -> Tensor: + r""" + addmv(input, mat, vec, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a matrix-vector product of the matrix :attr:`mat` and + the vector :attr:`vec`. + The vector :attr:`input` is added to the final result. + + If :attr:`mat` is a :math:`(n \times m)` tensor, :attr:`vec` is a 1-D tensor of + size `m`, then :attr:`input` must be + :ref:`broadcastable ` with a 1-D tensor of size `n` and + :attr:`out` will be 1-D tensor of size `n`. + + :attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between + :attr:`mat` and :attr:`vec` and the added tensor :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{mat} \mathbin{@} \text{vec}) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. 
+ + Args: + input (Tensor): vector to be added + mat (Tensor): matrix to be matrix multiplied + vec (Tensor): vector to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat @ vec` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(2) + >>> mat = torch.randn(2, 3) + >>> vec = torch.randn(3) + >>> torch.addmv(M, mat, vec) + tensor([-0.3768, -5.5565]) + """ + ... +@overload +def addmv_(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], mat: Tensor, vec: Tensor) -> Tensor: ... +@overload +def addmv_(input: Tensor, mat: Tensor, vec: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1) -> Tensor: ... +@overload +def addmv_(beta: Union[Number, _complex], self: Tensor, mat: Tensor, vec: Tensor) -> Tensor: ... +@overload +def addr(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], vec1: Tensor, vec2: Tensor) -> Tensor: + r""" + addr(input, vec1, vec2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs the outer-product of vectors :attr:`vec1` and :attr:`vec2` + and adds it to the matrix :attr:`input`. + + Optional values :attr:`beta` and :attr:`alpha` are scaling factors on the + outer product between :attr:`vec1` and :attr:`vec2` and the added matrix + :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{vec1} \otimes \text{vec2}) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + If :attr:`vec1` is a vector of size `n` and :attr:`vec2` is a vector + of size `m`, then :attr:`input` must be + :ref:`broadcastable ` with a matrix of size + :math:`(n \times m)` and :attr:`out` will be a matrix of size + :math:`(n \times m)`. + + Args: + input (Tensor): matrix to be added + vec1 (Tensor): the first vector of the outer product + vec2 (Tensor): the second vector of the outer product + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`\text{vec1} \otimes \text{vec2}` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> vec1 = torch.arange(1., 4.) + >>> vec2 = torch.arange(1., 3.) + >>> M = torch.zeros(3, 2) + >>> torch.addr(M, vec1, vec2) + tensor([[ 1., 2.], + [ 2., 4.], + [ 3., 6.]]) + """ + ... +@overload +def addr(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], vec1: Tensor, vec2: Tensor, *, out: Tensor) -> Tensor: + r""" + addr(input, vec1, vec2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs the outer-product of vectors :attr:`vec1` and :attr:`vec2` + and adds it to the matrix :attr:`input`. + + Optional values :attr:`beta` and :attr:`alpha` are scaling factors on the + outer product between :attr:`vec1` and :attr:`vec2` and the added matrix + :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{vec1} \otimes \text{vec2}) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + If :attr:`vec1` is a vector of size `n` and :attr:`vec2` is a vector + of size `m`, then :attr:`input` must be + :ref:`broadcastable ` with a matrix of size + :math:`(n \times m)` and :attr:`out` will be a matrix of size + :math:`(n \times m)`. 
+ + Args: + input (Tensor): matrix to be added + vec1 (Tensor): the first vector of the outer product + vec2 (Tensor): the second vector of the outer product + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`\text{vec1} \otimes \text{vec2}` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> vec1 = torch.arange(1., 4.) + >>> vec2 = torch.arange(1., 3.) + >>> M = torch.zeros(3, 2) + >>> torch.addr(M, vec1, vec2) + tensor([[ 1., 2.], + [ 2., 4.], + [ 3., 6.]]) + """ + ... +@overload +def addr(input: Tensor, vec1: Tensor, vec2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + addr(input, vec1, vec2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs the outer-product of vectors :attr:`vec1` and :attr:`vec2` + and adds it to the matrix :attr:`input`. + + Optional values :attr:`beta` and :attr:`alpha` are scaling factors on the + outer product between :attr:`vec1` and :attr:`vec2` and the added matrix + :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{vec1} \otimes \text{vec2}) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + If :attr:`vec1` is a vector of size `n` and :attr:`vec2` is a vector + of size `m`, then :attr:`input` must be + :ref:`broadcastable ` with a matrix of size + :math:`(n \times m)` and :attr:`out` will be a matrix of size + :math:`(n \times m)`. + + Args: + input (Tensor): matrix to be added + vec1 (Tensor): the first vector of the outer product + vec2 (Tensor): the second vector of the outer product + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`\text{vec1} \otimes \text{vec2}` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> vec1 = torch.arange(1., 4.) + >>> vec2 = torch.arange(1., 3.) + >>> M = torch.zeros(3, 2) + >>> torch.addr(M, vec1, vec2) + tensor([[ 1., 2.], + [ 2., 4.], + [ 3., 6.]]) + """ + ... +@overload +def addr(beta: Union[Number, _complex], self: Tensor, vec1: Tensor, vec2: Tensor) -> Tensor: + r""" + addr(input, vec1, vec2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs the outer-product of vectors :attr:`vec1` and :attr:`vec2` + and adds it to the matrix :attr:`input`. + + Optional values :attr:`beta` and :attr:`alpha` are scaling factors on the + outer product between :attr:`vec1` and :attr:`vec2` and the added matrix + :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{vec1} \otimes \text{vec2}) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + If :attr:`vec1` is a vector of size `n` and :attr:`vec2` is a vector + of size `m`, then :attr:`input` must be + :ref:`broadcastable ` with a matrix of size + :math:`(n \times m)` and :attr:`out` will be a matrix of size + :math:`(n \times m)`. + + Args: + input (Tensor): matrix to be added + vec1 (Tensor): the first vector of the outer product + vec2 (Tensor): the second vector of the outer product + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`\text{vec1} \otimes \text{vec2}` (:math:`\alpha`) + out (Tensor, optional): the output tensor. 
+ + Example:: + + >>> vec1 = torch.arange(1., 4.) + >>> vec2 = torch.arange(1., 3.) + >>> M = torch.zeros(3, 2) + >>> torch.addr(M, vec1, vec2) + tensor([[ 1., 2.], + [ 2., 4.], + [ 3., 6.]]) + """ + ... +@overload +def addr(beta: Union[Number, _complex], self: Tensor, vec1: Tensor, vec2: Tensor, *, out: Tensor) -> Tensor: + r""" + addr(input, vec1, vec2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs the outer-product of vectors :attr:`vec1` and :attr:`vec2` + and adds it to the matrix :attr:`input`. + + Optional values :attr:`beta` and :attr:`alpha` are scaling factors on the + outer product between :attr:`vec1` and :attr:`vec2` and the added matrix + :attr:`input` respectively. + + .. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{vec1} \otimes \text{vec2}) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + If :attr:`vec1` is a vector of size `n` and :attr:`vec2` is a vector + of size `m`, then :attr:`input` must be + :ref:`broadcastable ` with a matrix of size + :math:`(n \times m)` and :attr:`out` will be a matrix of size + :math:`(n \times m)`. + + Args: + input (Tensor): matrix to be added + vec1 (Tensor): the first vector of the outer product + vec2 (Tensor): the second vector of the outer product + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`\text{vec1} \otimes \text{vec2}` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> vec1 = torch.arange(1., 4.) + >>> vec2 = torch.arange(1., 3.) + >>> M = torch.zeros(3, 2) + >>> torch.addr(M, vec1, vec2) + tensor([[ 1., 2.], + [ 2., 4.], + [ 3., 6.]]) + """ + ... +def adjoint(input: Tensor) -> Tensor: + r""" + adjoint(Tensor) -> Tensor + Returns a view of the tensor conjugated and with the last two dimensions transposed. + + ``x.adjoint()`` is equivalent to ``x.transpose(-2, -1).conj()`` for complex tensors and + to ``x.transpose(-2, -1)`` for real tensors. + + Example:: + >>> x = torch.arange(4, dtype=torch.float) + >>> A = torch.complex(x, x).reshape(2, 2) + >>> A + tensor([[0.+0.j, 1.+1.j], + [2.+2.j, 3.+3.j]]) + >>> A.adjoint() + tensor([[0.-0.j, 2.-2.j], + [1.-1.j, 3.-3.j]]) + >>> (A.adjoint() == A.mH).all() + tensor(True) + """ + ... +def affine_grid_generator(theta: Tensor, size: Sequence[Union[_int, SymInt]], align_corners: _bool) -> Tensor: ... +def alias_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.alias`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +@overload +def all(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + all(input) -> Tensor + + Tests if all elements in :attr:`input` evaluate to `True`. + + .. note:: This function matches the behaviour of NumPy in returning + output of dtype `bool` for all supported dtypes except `uint8`. + For `uint8` the dtype of output is `uint8` itself. + + Example:: + + >>> a = torch.rand(1, 2).bool() + >>> a + tensor([[False, True]], dtype=torch.bool) + >>> torch.all(a) + tensor(False, dtype=torch.bool) + >>> a = torch.arange(0, 3) + >>> a + tensor([0, 1, 2]) + >>> torch.all(a) + tensor(False) + + .. function:: all(input, dim, keepdim=False, *, out=None) -> Tensor + :noindex: + + For each row of :attr:`input` in the given dimension :attr:`dim`, + returns `True` if all elements in the row evaluate to `True` and `False` otherwise. 
+ + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.rand(4, 2).bool() + >>> a + tensor([[True, True], + [True, False], + [True, True], + [True, True]], dtype=torch.bool) + >>> torch.all(a, dim=1) + tensor([ True, False, True, True], dtype=torch.bool) + >>> torch.all(a, dim=0) + tensor([ True, False], dtype=torch.bool) + """ + ... +@overload +def all(input: Tensor, dim: Optional[_size] = None, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + all(input) -> Tensor + + Tests if all elements in :attr:`input` evaluate to `True`. + + .. note:: This function matches the behaviour of NumPy in returning + output of dtype `bool` for all supported dtypes except `uint8`. + For `uint8` the dtype of output is `uint8` itself. + + Example:: + + >>> a = torch.rand(1, 2).bool() + >>> a + tensor([[False, True]], dtype=torch.bool) + >>> torch.all(a) + tensor(False, dtype=torch.bool) + >>> a = torch.arange(0, 3) + >>> a + tensor([0, 1, 2]) + >>> torch.all(a) + tensor(False) + + .. function:: all(input, dim, keepdim=False, *, out=None) -> Tensor + :noindex: + + For each row of :attr:`input` in the given dimension :attr:`dim`, + returns `True` if all elements in the row evaluate to `True` and `False` otherwise. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.rand(4, 2).bool() + >>> a + tensor([[True, True], + [True, False], + [True, True], + [True, True]], dtype=torch.bool) + >>> torch.all(a, dim=1) + tensor([ True, False, True, True], dtype=torch.bool) + >>> torch.all(a, dim=0) + tensor([ True, False], dtype=torch.bool) + """ + ... +@overload +def all(input: Tensor, dim: _int, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + all(input) -> Tensor + + Tests if all elements in :attr:`input` evaluate to `True`. + + .. note:: This function matches the behaviour of NumPy in returning + output of dtype `bool` for all supported dtypes except `uint8`. + For `uint8` the dtype of output is `uint8` itself. + + Example:: + + >>> a = torch.rand(1, 2).bool() + >>> a + tensor([[False, True]], dtype=torch.bool) + >>> torch.all(a) + tensor(False, dtype=torch.bool) + >>> a = torch.arange(0, 3) + >>> a + tensor([0, 1, 2]) + >>> torch.all(a) + tensor(False) + + .. function:: all(input, dim, keepdim=False, *, out=None) -> Tensor + :noindex: + + For each row of :attr:`input` in the given dimension :attr:`dim`, + returns `True` if all elements in the row evaluate to `True` and `False` otherwise. 
+ + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.rand(4, 2).bool() + >>> a + tensor([[True, True], + [True, False], + [True, True], + [True, True]], dtype=torch.bool) + >>> torch.all(a, dim=1) + tensor([ True, False, True, True], dtype=torch.bool) + >>> torch.all(a, dim=0) + tensor([ True, False], dtype=torch.bool) + """ + ... +@overload +def all(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + all(input) -> Tensor + + Tests if all elements in :attr:`input` evaluate to `True`. + + .. note:: This function matches the behaviour of NumPy in returning + output of dtype `bool` for all supported dtypes except `uint8`. + For `uint8` the dtype of output is `uint8` itself. + + Example:: + + >>> a = torch.rand(1, 2).bool() + >>> a + tensor([[False, True]], dtype=torch.bool) + >>> torch.all(a) + tensor(False, dtype=torch.bool) + >>> a = torch.arange(0, 3) + >>> a + tensor([0, 1, 2]) + >>> torch.all(a) + tensor(False) + + .. function:: all(input, dim, keepdim=False, *, out=None) -> Tensor + :noindex: + + For each row of :attr:`input` in the given dimension :attr:`dim`, + returns `True` if all elements in the row evaluate to `True` and `False` otherwise. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.rand(4, 2).bool() + >>> a + tensor([[True, True], + [True, False], + [True, True], + [True, True]], dtype=torch.bool) + >>> torch.all(a, dim=1) + tensor([ True, False, True, True], dtype=torch.bool) + >>> torch.all(a, dim=0) + tensor([ True, False], dtype=torch.bool) + """ + ... +def allclose(input: Tensor, other: Tensor, rtol: _float = 1e-05, atol: _float = 1e-08, equal_nan: _bool = False) -> _bool: + r""" + allclose(input, other, rtol=1e-05, atol=1e-08, equal_nan=False) -> bool + + This function checks if :attr:`input` and :attr:`other` satisfy the condition: + + .. math:: + \lvert \text{input} - \text{other} \rvert \leq \texttt{atol} + \texttt{rtol} \times \lvert \text{other} \rvert + + elementwise, for all elements of :attr:`input` and :attr:`other`. The behaviour of this function is analogous to + `numpy.allclose `_ + + Args: + input (Tensor): first tensor to compare + other (Tensor): second tensor to compare + atol (float, optional): absolute tolerance. Default: 1e-08 + rtol (float, optional): relative tolerance. Default: 1e-05 + equal_nan (bool, optional): if ``True``, then two ``NaN`` s will be considered equal. 
Default: ``False`` + + Example:: + + >>> torch.allclose(torch.tensor([10000., 1e-07]), torch.tensor([10000.1, 1e-08])) + False + >>> torch.allclose(torch.tensor([10000., 1e-08]), torch.tensor([10000.1, 1e-09])) + True + >>> torch.allclose(torch.tensor([1.0, float('nan')]), torch.tensor([1.0, float('nan')])) + False + >>> torch.allclose(torch.tensor([1.0, float('nan')]), torch.tensor([1.0, float('nan')]), equal_nan=True) + True + """ + ... +def alpha_dropout(input: Tensor, p: _float, train: _bool) -> Tensor: ... +def alpha_dropout_(input: Tensor, p: _float, train: _bool) -> Tensor: ... +def amax(input: Tensor, dim: Union[_int, _size] = (), keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + amax(input, dim, keepdim=False, *, out=None) -> Tensor + + Returns the maximum value of each slice of the :attr:`input` tensor in the given + dimension(s) :attr:`dim`. + + .. note:: + The difference between ``max``/``min`` and ``amax``/``amin`` is: + - ``amax``/``amin`` supports reducing on multiple dimensions, + - ``amax``/``amin`` does not return indices, + - ``amax``/``amin`` evenly distributes gradient between equal values, + while ``max(dim)``/``min(dim)`` propagates gradient only to a single + index in the source tensor. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.8177, 1.4878, -0.2491, 0.9130], + [-0.7158, 1.1775, 2.0992, 0.4817], + [-0.0053, 0.0164, -1.3738, -0.0507], + [ 1.9700, 1.1106, -1.0318, -1.0816]]) + >>> torch.amax(a, 1) + tensor([1.4878, 2.0992, 0.0164, 1.9700]) + """ + ... +def amin(input: Tensor, dim: Union[_int, _size] = (), keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + amin(input, dim, keepdim=False, *, out=None) -> Tensor + + Returns the minimum value of each slice of the :attr:`input` tensor in the given + dimension(s) :attr:`dim`. + + .. note:: + The difference between ``max``/``min`` and ``amax``/``amin`` is: + - ``amax``/``amin`` supports reducing on multiple dimensions, + - ``amax``/``amin`` does not return indices, + - ``amax``/``amin`` evenly distributes gradient between equal values, + while ``max(dim)``/``min(dim)`` propagates gradient only to a single + index in the source tensor. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. 
+ + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.6451, -0.4866, 0.2987, -1.3312], + [-0.5744, 1.2980, 1.8397, -0.2713], + [ 0.9128, 0.9214, -1.7268, -0.2995], + [ 0.9023, 0.4853, 0.9075, -1.6165]]) + >>> torch.amin(a, 1) + tensor([-1.3312, -0.5744, -1.7268, -1.6165]) + """ + ... +def aminmax(input: Tensor, *, dim: Optional[_int] = None, keepdim: _bool = False, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.aminmax: + r""" + aminmax(input, *, dim=None, keepdim=False, out=None) -> (Tensor min, Tensor max) + + Computes the minimum and maximum values of the :attr:`input` tensor. + + Args: + input (Tensor): + The input tensor + + Keyword Args: + dim (Optional[int]): + The dimension along which to compute the values. If `None`, + computes the values over the entire :attr:`input` tensor. + Default is `None`. + keepdim (bool): + If `True`, the reduced dimensions will be kept in the output + tensor as dimensions with size 1 for broadcasting, otherwise + they will be removed, as if calling (:func:`torch.squeeze`). + Default is `False`. + out (Optional[Tuple[Tensor, Tensor]]): + Optional tensors on which to write the result. Must have the same + shape and dtype as the expected output. + Default is `None`. + + Returns: + A named tuple `(min, max)` containing the minimum and maximum values. + + Raises: + RuntimeError + If any of the dimensions to compute the values over has size 0. + + .. note:: + NaN values are propagated to the output if at least one value is NaN. + + .. seealso:: + :func:`torch.amin` computes just the minimum value + :func:`torch.amax` computes just the maximum value + + Example:: + + >>> torch.aminmax(torch.tensor([1, -3, 5])) + torch.return_types.aminmax( + min=tensor(-3), + max=tensor(5)) + + >>> # aminmax propagates NaNs + >>> torch.aminmax(torch.tensor([1, -3, 5, torch.nan])) + torch.return_types.aminmax( + min=tensor(nan), + max=tensor(nan)) + + >>> t = torch.arange(10).view(2, 5) + >>> t + tensor([[0, 1, 2, 3, 4], + [5, 6, 7, 8, 9]]) + >>> t.aminmax(dim=0, keepdim=True) + torch.return_types.aminmax( + min=tensor([[0, 1, 2, 3, 4]]), + max=tensor([[5, 6, 7, 8, 9]])) + """ + ... +def angle(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + angle(input, *, out=None) -> Tensor + + Computes the element-wise angle (in radians) of the given :attr:`input` tensor. + + .. math:: + \text{out}_{i} = angle(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + .. note:: Starting in PyTorch 1.8, angle returns pi for negative real numbers, + zero for non-negative real numbers, and propagates NaNs. Previously + the function would return zero for all real numbers and not propagate + floating-point NaNs. + + Example:: + + >>> torch.angle(torch.tensor([-1 + 1j, -2 + 2j, 3 - 3j]))*180/3.14159 + tensor([ 135., 135, -45]) + """ + ... +@overload +def any(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + any(input) -> Tensor + + Tests if any element in :attr:`input` evaluates to `True`. + + .. note:: This function matches the behaviour of NumPy in returning + output of dtype `bool` for all supported dtypes except `uint8`. + For `uint8` the dtype of output is `uint8` itself. + + Example:: + + >>> a = torch.rand(1, 2).bool() + >>> a + tensor([[False, True]], dtype=torch.bool) + >>> torch.any(a) + tensor(True, dtype=torch.bool) + >>> a = torch.arange(0, 3) + >>> a + tensor([0, 1, 2]) + >>> torch.any(a) + tensor(True) + + .. 
function:: any(input, dim, keepdim=False, *, out=None) -> Tensor + :noindex: + + For each row of :attr:`input` in the given dimension :attr:`dim`, + returns `True` if any element in the row evaluate to `True` and `False` otherwise. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4, 2) < 0 + >>> a + tensor([[ True, True], + [False, True], + [ True, True], + [False, False]]) + >>> torch.any(a, 1) + tensor([ True, True, True, False]) + >>> torch.any(a, 0) + tensor([True, True]) + """ + ... +@overload +def any(input: Tensor, dim: Optional[_size] = None, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + any(input) -> Tensor + + Tests if any element in :attr:`input` evaluates to `True`. + + .. note:: This function matches the behaviour of NumPy in returning + output of dtype `bool` for all supported dtypes except `uint8`. + For `uint8` the dtype of output is `uint8` itself. + + Example:: + + >>> a = torch.rand(1, 2).bool() + >>> a + tensor([[False, True]], dtype=torch.bool) + >>> torch.any(a) + tensor(True, dtype=torch.bool) + >>> a = torch.arange(0, 3) + >>> a + tensor([0, 1, 2]) + >>> torch.any(a) + tensor(True) + + .. function:: any(input, dim, keepdim=False, *, out=None) -> Tensor + :noindex: + + For each row of :attr:`input` in the given dimension :attr:`dim`, + returns `True` if any element in the row evaluate to `True` and `False` otherwise. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4, 2) < 0 + >>> a + tensor([[ True, True], + [False, True], + [ True, True], + [False, False]]) + >>> torch.any(a, 1) + tensor([ True, True, True, False]) + >>> torch.any(a, 0) + tensor([True, True]) + """ + ... +@overload +def any(input: Tensor, dim: _int, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + any(input) -> Tensor + + Tests if any element in :attr:`input` evaluates to `True`. + + .. note:: This function matches the behaviour of NumPy in returning + output of dtype `bool` for all supported dtypes except `uint8`. + For `uint8` the dtype of output is `uint8` itself. + + Example:: + + >>> a = torch.rand(1, 2).bool() + >>> a + tensor([[False, True]], dtype=torch.bool) + >>> torch.any(a) + tensor(True, dtype=torch.bool) + >>> a = torch.arange(0, 3) + >>> a + tensor([0, 1, 2]) + >>> torch.any(a) + tensor(True) + + .. 
function:: any(input, dim, keepdim=False, *, out=None) -> Tensor + :noindex: + + For each row of :attr:`input` in the given dimension :attr:`dim`, + returns `True` if any element in the row evaluate to `True` and `False` otherwise. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4, 2) < 0 + >>> a + tensor([[ True, True], + [False, True], + [ True, True], + [False, False]]) + >>> torch.any(a, 1) + tensor([ True, True, True, False]) + >>> torch.any(a, 0) + tensor([True, True]) + """ + ... +@overload +def any(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + any(input) -> Tensor + + Tests if any element in :attr:`input` evaluates to `True`. + + .. note:: This function matches the behaviour of NumPy in returning + output of dtype `bool` for all supported dtypes except `uint8`. + For `uint8` the dtype of output is `uint8` itself. + + Example:: + + >>> a = torch.rand(1, 2).bool() + >>> a + tensor([[False, True]], dtype=torch.bool) + >>> torch.any(a) + tensor(True, dtype=torch.bool) + >>> a = torch.arange(0, 3) + >>> a + tensor([0, 1, 2]) + >>> torch.any(a) + tensor(True) + + .. function:: any(input, dim, keepdim=False, *, out=None) -> Tensor + :noindex: + + For each row of :attr:`input` in the given dimension :attr:`dim`, + returns `True` if any element in the row evaluate to `True` and `False` otherwise. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4, 2) < 0 + >>> a + tensor([[ True, True], + [False, True], + [ True, True], + [False, False]]) + >>> torch.any(a, 1) + tensor([ True, True, True, False]) + >>> torch.any(a, 0) + tensor([True, True]) + """ + ... +@overload +def arange(start: Number, end: Number, step: Number, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + arange(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a 1-D tensor of size :math:`\left\lceil \frac{\text{end} - \text{start}}{\text{step}} \right\rceil` + with values from the interval ``[start, end)`` taken with common difference + :attr:`step` beginning from `start`. 
+ + Note that non-integer :attr:`step` is subject to floating point rounding errors when + comparing against :attr:`end`; to avoid inconsistency, we advise subtracting a small epsilon from :attr:`end` + in such cases. + + .. math:: + \text{out}_{{i+1}} = \text{out}_{i} + \text{step} + + Args: + start (Number): the starting value for the set of points. Default: ``0``. + end (Number): the ending value for the set of points + step (Number): the gap between each pair of adjacent points. Default: ``1``. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). If `dtype` is not given, infer the data type from the other input + arguments. If any of `start`, `end`, or `stop` are floating-point, the + `dtype` is inferred to be the default dtype, see + :meth:`~torch.get_default_dtype`. Otherwise, the `dtype` is inferred to + be `torch.int64`. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.arange(5) + tensor([ 0, 1, 2, 3, 4]) + >>> torch.arange(1, 4) + tensor([ 1, 2, 3]) + >>> torch.arange(1, 2.5, 0.5) + tensor([ 1.0000, 1.5000, 2.0000]) + """ + ... +@overload +def arange(start: Number, end: Number, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + arange(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a 1-D tensor of size :math:`\left\lceil \frac{\text{end} - \text{start}}{\text{step}} \right\rceil` + with values from the interval ``[start, end)`` taken with common difference + :attr:`step` beginning from `start`. + + Note that non-integer :attr:`step` is subject to floating point rounding errors when + comparing against :attr:`end`; to avoid inconsistency, we advise subtracting a small epsilon from :attr:`end` + in such cases. + + .. math:: + \text{out}_{{i+1}} = \text{out}_{i} + \text{step} + + Args: + start (Number): the starting value for the set of points. Default: ``0``. + end (Number): the ending value for the set of points + step (Number): the gap between each pair of adjacent points. Default: ``1``. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). If `dtype` is not given, infer the data type from the other input + arguments. If any of `start`, `end`, or `stop` are floating-point, the + `dtype` is inferred to be the default dtype, see + :meth:`~torch.get_default_dtype`. Otherwise, the `dtype` is inferred to + be `torch.int64`. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. 
+ Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.arange(5) + tensor([ 0, 1, 2, 3, 4]) + >>> torch.arange(1, 4) + tensor([ 1, 2, 3]) + >>> torch.arange(1, 2.5, 0.5) + tensor([ 1.0000, 1.5000, 2.0000]) + """ + ... +@overload +def arange(end: Number, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + arange(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a 1-D tensor of size :math:`\left\lceil \frac{\text{end} - \text{start}}{\text{step}} \right\rceil` + with values from the interval ``[start, end)`` taken with common difference + :attr:`step` beginning from `start`. + + Note that non-integer :attr:`step` is subject to floating point rounding errors when + comparing against :attr:`end`; to avoid inconsistency, we advise subtracting a small epsilon from :attr:`end` + in such cases. + + .. math:: + \text{out}_{{i+1}} = \text{out}_{i} + \text{step} + + Args: + start (Number): the starting value for the set of points. Default: ``0``. + end (Number): the ending value for the set of points + step (Number): the gap between each pair of adjacent points. Default: ``1``. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). If `dtype` is not given, infer the data type from the other input + arguments. If any of `start`, `end`, or `stop` are floating-point, the + `dtype` is inferred to be the default dtype, see + :meth:`~torch.get_default_dtype`. Otherwise, the `dtype` is inferred to + be `torch.int64`. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.arange(5) + tensor([ 0, 1, 2, 3, 4]) + >>> torch.arange(1, 4) + tensor([ 1, 2, 3]) + >>> torch.arange(1, 2.5, 0.5) + tensor([ 1.0000, 1.5000, 2.0000]) + """ + ... +@overload +def arange(end: Union[Number, _complex], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + arange(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a 1-D tensor of size :math:`\left\lceil \frac{\text{end} - \text{start}}{\text{step}} \right\rceil` + with values from the interval ``[start, end)`` taken with common difference + :attr:`step` beginning from `start`. 
+ + Note that non-integer :attr:`step` is subject to floating point rounding errors when + comparing against :attr:`end`; to avoid inconsistency, we advise subtracting a small epsilon from :attr:`end` + in such cases. + + .. math:: + \text{out}_{{i+1}} = \text{out}_{i} + \text{step} + + Args: + start (Number): the starting value for the set of points. Default: ``0``. + end (Number): the ending value for the set of points + step (Number): the gap between each pair of adjacent points. Default: ``1``. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). If `dtype` is not given, infer the data type from the other input + arguments. If any of `start`, `end`, or `stop` are floating-point, the + `dtype` is inferred to be the default dtype, see + :meth:`~torch.get_default_dtype`. Otherwise, the `dtype` is inferred to + be `torch.int64`. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.arange(5) + tensor([ 0, 1, 2, 3, 4]) + >>> torch.arange(1, 4) + tensor([ 1, 2, 3]) + >>> torch.arange(1, 2.5, 0.5) + tensor([ 1.0000, 1.5000, 2.0000]) + """ + ... +@overload +def arange(start: Union[Number, _complex], end: Union[Number, _complex], *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + arange(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a 1-D tensor of size :math:`\left\lceil \frac{\text{end} - \text{start}}{\text{step}} \right\rceil` + with values from the interval ``[start, end)`` taken with common difference + :attr:`step` beginning from `start`. + + Note that non-integer :attr:`step` is subject to floating point rounding errors when + comparing against :attr:`end`; to avoid inconsistency, we advise subtracting a small epsilon from :attr:`end` + in such cases. + + .. math:: + \text{out}_{{i+1}} = \text{out}_{i} + \text{step} + + Args: + start (Number): the starting value for the set of points. Default: ``0``. + end (Number): the ending value for the set of points + step (Number): the gap between each pair of adjacent points. Default: ``1``. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). If `dtype` is not given, infer the data type from the other input + arguments. If any of `start`, `end`, or `stop` are floating-point, the + `dtype` is inferred to be the default dtype, see + :meth:`~torch.get_default_dtype`. Otherwise, the `dtype` is inferred to + be `torch.int64`. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. 
+ device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.arange(5) + tensor([ 0, 1, 2, 3, 4]) + >>> torch.arange(1, 4) + tensor([ 1, 2, 3]) + >>> torch.arange(1, 2.5, 0.5) + tensor([ 1.0000, 1.5000, 2.0000]) + """ + ... +@overload +def arange(start: Union[Number, _complex], end: Union[Number, _complex], step: Union[Number, _complex] = 1, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + arange(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a 1-D tensor of size :math:`\left\lceil \frac{\text{end} - \text{start}}{\text{step}} \right\rceil` + with values from the interval ``[start, end)`` taken with common difference + :attr:`step` beginning from `start`. + + Note that non-integer :attr:`step` is subject to floating point rounding errors when + comparing against :attr:`end`; to avoid inconsistency, we advise subtracting a small epsilon from :attr:`end` + in such cases. + + .. math:: + \text{out}_{{i+1}} = \text{out}_{i} + \text{step} + + Args: + start (Number): the starting value for the set of points. Default: ``0``. + end (Number): the ending value for the set of points + step (Number): the gap between each pair of adjacent points. Default: ``1``. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). If `dtype` is not given, infer the data type from the other input + arguments. If any of `start`, `end`, or `stop` are floating-point, the + `dtype` is inferred to be the default dtype, see + :meth:`~torch.get_default_dtype`. Otherwise, the `dtype` is inferred to + be `torch.int64`. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.arange(5) + tensor([ 0, 1, 2, 3, 4]) + >>> torch.arange(1, 4) + tensor([ 1, 2, 3]) + >>> torch.arange(1, 2.5, 0.5) + tensor([ 1.0000, 1.5000, 2.0000]) + """ + ... +def arccos(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + arccos(input, *, out=None) -> Tensor + + Alias for :func:`torch.acos`. + """ + ... +def arccos_(input: Tensor) -> Tensor: ... +def arccosh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + arccosh(input, *, out=None) -> Tensor + + Alias for :func:`torch.acosh`. + """ + ... +def arccosh_(input: Tensor) -> Tensor: ... 
+def arcsin(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + arcsin(input, *, out=None) -> Tensor + + Alias for :func:`torch.asin`. + """ + ... +def arcsin_(input: Tensor) -> Tensor: ... +def arcsinh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + arcsinh(input, *, out=None) -> Tensor + + Alias for :func:`torch.asinh`. + """ + ... +def arcsinh_(input: Tensor) -> Tensor: ... +def arctan(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + arctan(input, *, out=None) -> Tensor + + Alias for :func:`torch.atan`. + """ + ... +def arctan2(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + arctan2(input, other, *, out=None) -> Tensor + Alias for :func:`torch.atan2`. + """ + ... +def arctan_(input: Tensor) -> Tensor: ... +def arctanh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + arctanh(input, *, out=None) -> Tensor + + Alias for :func:`torch.atanh`. + """ + ... +def arctanh_(input: Tensor) -> Tensor: ... +def argmax(input: Tensor, dim: Optional[_int] = None, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + argmax(input) -> LongTensor + + Returns the indices of the maximum value of all elements in the :attr:`input` tensor. + + This is the second value returned by :meth:`torch.max`. See its + documentation for the exact semantics of this method. + + .. note:: If there are multiple maximal values then the indices of the first maximal value are returned. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 1.3398, 0.2663, -0.2686, 0.2450], + [-0.7401, -0.8805, -0.3402, -1.1936], + [ 0.4907, -1.3948, -1.0691, -0.3132], + [-1.6092, 0.5419, -0.2993, 0.3195]]) + >>> torch.argmax(a) + tensor(0) + + .. function:: argmax(input, dim, keepdim=False) -> LongTensor + :noindex: + + Returns the indices of the maximum values of a tensor across a dimension. + + This is the second value returned by :meth:`torch.max`. See its + documentation for the exact semantics of this method. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. If ``None``, the argmax of the flattened input is returned. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 1.3398, 0.2663, -0.2686, 0.2450], + [-0.7401, -0.8805, -0.3402, -1.1936], + [ 0.4907, -1.3948, -1.0691, -0.3132], + [-1.6092, 0.5419, -0.2993, 0.3195]]) + >>> torch.argmax(a, dim=1) + tensor([ 0, 2, 0, 1]) + """ + ... +def argmin(input: Tensor, dim: Optional[_int] = None, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + argmin(input, dim=None, keepdim=False) -> LongTensor + + Returns the indices of the minimum value(s) of the flattened tensor or along a dimension + + This is the second value returned by :meth:`torch.min`. See its + documentation for the exact semantics of this method. + + .. note:: If there are multiple minimal values then the indices of the first minimal value are returned. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. If ``None``, the argmin of the flattened input is returned. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. 
+ + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.1139, 0.2254, -0.1381, 0.3687], + [ 1.0100, -1.1975, -0.0102, -0.4732], + [-0.9240, 0.1207, -0.7506, -1.0213], + [ 1.7809, -1.2960, 0.9384, 0.1438]]) + >>> torch.argmin(a) + tensor(13) + >>> torch.argmin(a, dim=1) + tensor([ 2, 1, 3, 1]) + >>> torch.argmin(a, dim=1, keepdim=True) + tensor([[2], + [1], + [3], + [1]]) + """ + ... +@overload +def argsort(input: Tensor, *, stable: _bool, dim: _int = -1, descending: _bool = False) -> Tensor: + r""" + argsort(input, dim=-1, descending=False, stable=False) -> Tensor + + Returns the indices that sort a tensor along a given dimension in ascending + order by value. + + This is the second value returned by :meth:`torch.sort`. See its documentation + for the exact semantics of this method. + + If :attr:`stable` is ``True`` then the sorting routine becomes stable, preserving + the order of equivalent elements. If ``False``, the relative order of values + which compare equal is not guaranteed. ``True`` is slower. + + Args: + input (Tensor): the input tensor. + dim (int, optional): the dimension to sort along + descending (bool, optional): controls the sorting order (ascending or descending) + stable (bool, optional): controls the relative order of equivalent elements + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.0785, 1.5267, -0.8521, 0.4065], + [ 0.1598, 0.0788, -0.0745, -1.2700], + [ 1.2208, 1.0722, -0.7064, 1.2564], + [ 0.0669, -0.2318, -0.8229, -0.9280]]) + + + >>> torch.argsort(a, dim=1) + tensor([[2, 0, 3, 1], + [3, 2, 1, 0], + [2, 1, 0, 3], + [3, 2, 1, 0]]) + """ + ... +@overload +def argsort(input: Tensor, dim: _int = -1, descending: _bool = False) -> Tensor: + r""" + argsort(input, dim=-1, descending=False, stable=False) -> Tensor + + Returns the indices that sort a tensor along a given dimension in ascending + order by value. + + This is the second value returned by :meth:`torch.sort`. See its documentation + for the exact semantics of this method. + + If :attr:`stable` is ``True`` then the sorting routine becomes stable, preserving + the order of equivalent elements. If ``False``, the relative order of values + which compare equal is not guaranteed. ``True`` is slower. + + Args: + input (Tensor): the input tensor. + dim (int, optional): the dimension to sort along + descending (bool, optional): controls the sorting order (ascending or descending) + stable (bool, optional): controls the relative order of equivalent elements + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.0785, 1.5267, -0.8521, 0.4065], + [ 0.1598, 0.0788, -0.0745, -1.2700], + [ 1.2208, 1.0722, -0.7064, 1.2564], + [ 0.0669, -0.2318, -0.8229, -0.9280]]) + + + >>> torch.argsort(a, dim=1) + tensor([[2, 0, 3, 1], + [3, 2, 1, 0], + [2, 1, 0, 3], + [3, 2, 1, 0]]) + """ + ... +@overload +def argsort(input: Tensor, dim: Union[str, ellipsis, None], descending: _bool = False) -> Tensor: + r""" + argsort(input, dim=-1, descending=False, stable=False) -> Tensor + + Returns the indices that sort a tensor along a given dimension in ascending + order by value. + + This is the second value returned by :meth:`torch.sort`. See its documentation + for the exact semantics of this method. + + If :attr:`stable` is ``True`` then the sorting routine becomes stable, preserving + the order of equivalent elements. If ``False``, the relative order of values + which compare equal is not guaranteed. ``True`` is slower. + + Args: + input (Tensor): the input tensor. 
+ dim (int, optional): the dimension to sort along + descending (bool, optional): controls the sorting order (ascending or descending) + stable (bool, optional): controls the relative order of equivalent elements + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.0785, 1.5267, -0.8521, 0.4065], + [ 0.1598, 0.0788, -0.0745, -1.2700], + [ 1.2208, 1.0722, -0.7064, 1.2564], + [ 0.0669, -0.2318, -0.8229, -0.9280]]) + + + >>> torch.argsort(a, dim=1) + tensor([[2, 0, 3, 1], + [3, 2, 1, 0], + [2, 1, 0, 3], + [3, 2, 1, 0]]) + """ + ... +def argwhere(input: Tensor) -> Tensor: + r""" + argwhere(input) -> Tensor + + Returns a tensor containing the indices of all non-zero elements of + :attr:`input`. Each row in the result contains the indices of a non-zero + element in :attr:`input`. The result is sorted lexicographically, with + the last index changing the fastest (C-style). + + If :attr:`input` has :math:`n` dimensions, then the resulting indices tensor + :attr:`out` is of size :math:`(z \times n)`, where :math:`z` is the total number of + non-zero elements in the :attr:`input` tensor. + + .. note:: + This function is similar to NumPy's `argwhere`. + + When :attr:`input` is on CUDA, this function causes host-device synchronization. + + Args: + {input} + + Example:: + + >>> t = torch.tensor([1, 0, 1]) + >>> torch.argwhere(t) + tensor([[0], + [2]]) + >>> t = torch.tensor([[1, 0, 1], [0, 1, 1]]) + >>> torch.argwhere(t) + tensor([[0, 0], + [0, 2], + [1, 1], + [1, 2]]) + """ + ... +def as_strided(input: Tensor, size: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], storage_offset: Optional[Union[_int, SymInt]] = None) -> Tensor: + r""" + as_strided(input, size, stride, storage_offset=None) -> Tensor + + Create a view of an existing `torch.Tensor` :attr:`input` with specified + :attr:`size`, :attr:`stride` and :attr:`storage_offset`. + + .. warning:: + Prefer using other view functions, like :meth:`torch.Tensor.expand`, + to setting a view's strides manually with `as_strided`, as this + function's behavior depends on the implementation of a tensor's storage. + The constructed view of the storage must only refer to elements within + the storage or a runtime error will be thrown, and if the view is + "overlapped" (with multiple indices referring to the same element in + memory) its behavior is undefined. + + Args: + input (Tensor): the input tensor. + size (tuple or ints): the shape of the output tensor + stride (tuple or ints): the stride of the output tensor + storage_offset (int, optional): the offset in the underlying storage of the output tensor. + If ``None``, the storage_offset of the output tensor will match the input tensor. + + Example:: + + >>> x = torch.randn(3, 3) + >>> x + tensor([[ 0.9039, 0.6291, 1.0795], + [ 0.1586, 2.1939, -0.4900], + [-0.1909, -0.7503, 1.9355]]) + >>> t = torch.as_strided(x, (2, 2), (1, 2)) + >>> t + tensor([[0.9039, 1.0795], + [0.6291, 0.1586]]) + >>> t = torch.as_strided(x, (2, 2), (1, 2), 1) + tensor([[0.6291, 0.1586], + [1.0795, 2.1939]]) + """ + ... +def as_strided_(input: Tensor, size: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], storage_offset: Optional[Union[_int, SymInt]] = None) -> Tensor: ... 
+def as_strided_copy(input: Tensor, size: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], storage_offset: Optional[Union[_int, SymInt]] = None, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.as_strided`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def as_strided_scatter(input: Tensor, src: Tensor, size: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], storage_offset: Optional[Union[_int, SymInt]] = None) -> Tensor: + r""" + as_strided_scatter(input, src, size, stride, storage_offset=None) -> Tensor + + Embeds the values of the :attr:`src` tensor into :attr:`input` along + the elements corresponding to the result of calling + input.as_strided(size, stride, storage_offset). + + This function returns a tensor with fresh storage; it does not + return a view. + + Args: + input (Tensor): the input tensor. + size (tuple or ints): the shape of the output tensor + stride (tuple or ints): the stride of the output tensor + storage_offset (int, optional): the offset in the underlying storage of the output tensor + + .. note:: + + :attr:`src` must be of the proper size in order to be embedded + into :attr:`input`. Specifically, it should have the same shape as + `torch.as_strided(input, size, stride, storage_offset)` + + Example:: + + >>> a = torch.arange(4).reshape(2, 2) + 1 + >>> a + tensor([[1, 2], + [3, 4]]) + >>> b = torch.zeros(3, 3) + >>> b + tensor([[0., 0., 0.], + [0., 0., 0.], + [0., 0., 0.]]) + >>> torch.as_strided_scatter(b, a, (2, 2), (1, 2)) + tensor([[1., 3., 2.], + [4., 0., 0.], + [0., 0., 0.]]) + """ + ... +def as_tensor(data: Any, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None) -> Tensor: + r""" + as_tensor(data, dtype=None, device=None) -> Tensor + + Converts :attr:`data` into a tensor, sharing data and preserving autograd + history if possible. + + If :attr:`data` is already a tensor with the requested dtype and device + then :attr:`data` itself is returned, but if :attr:`data` is a + tensor with a different dtype or device then it's copied as if using + `data.to(dtype=dtype, device=device)`. + + If :attr:`data` is a NumPy array (an ndarray) with the same dtype and device then a + tensor is constructed using :func:`torch.from_numpy`. + + .. seealso:: + + :func:`torch.tensor` never shares its data and creates a new "leaf tensor" (see :doc:`/notes/autograd`). + + + Args: + data (array_like): Initial data for the tensor. Can be a list, tuple, + NumPy ``ndarray``, scalar, and other types. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, infers data type from :attr:`data`. + device (:class:`torch.device`, optional): the device of the constructed tensor. If None and data is a tensor + then the device of data is used. If None and data is not a tensor then + the result tensor is constructed on the current device. + + + Example:: + + >>> a = numpy.array([1, 2, 3]) + >>> t = torch.as_tensor(a) + >>> t + tensor([ 1, 2, 3]) + >>> t[0] = -1 + >>> a + array([-1, 2, 3]) + + >>> a = numpy.array([1, 2, 3]) + >>> t = torch.as_tensor(a, device=torch.device('cuda')) + >>> t + tensor([ 1, 2, 3]) + >>> t[0] = -1 + >>> a + array([1, 2, 3]) + """ + ... 
+def asarray(obj: Any, *, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, copy: Optional[_bool] = None, requires_grad: _bool = False) -> Tensor: + r""" + asarray(obj, *, dtype=None, device=None, copy=None, requires_grad=False) -> Tensor + + Converts :attr:`obj` to a tensor. + + :attr:`obj` can be one of: + + 1. a tensor + 2. a NumPy array or a NumPy scalar + 3. a DLPack capsule + 4. an object that implements Python's buffer protocol + 5. a scalar + 6. a sequence of scalars + + When :attr:`obj` is a tensor, NumPy array, or DLPack capsule the returned tensor will, + by default, not require a gradient, have the same datatype as :attr:`obj`, be on the + same device, and share memory with it. These properties can be controlled with the + :attr:`dtype`, :attr:`device`, :attr:`copy`, and :attr:`requires_grad` keyword arguments. + If the returned tensor is of a different datatype, on a different device, or a copy is + requested then it will not share its memory with :attr:`obj`. If :attr:`requires_grad` + is ``True`` then the returned tensor will require a gradient, and if :attr:`obj` is + also a tensor with an autograd history then the returned tensor will have the same history. + + When :attr:`obj` is not a tensor, NumPy array, or DLPack capsule but implements Python's + buffer protocol then the buffer is interpreted as an array of bytes grouped according to + the size of the datatype passed to the :attr:`dtype` keyword argument. (If no datatype is + passed then the default floating point datatype is used, instead.) The returned tensor + will have the specified datatype (or default floating point datatype if none is specified) + and, by default, be on the CPU device and share memory with the buffer. + + When :attr:`obj` is a NumPy scalar, the returned tensor will be a 0-dimensional tensor on + the CPU and that doesn't share its memory (i.e. ``copy=True``). By default datatype will + be the PyTorch datatype corresponding to the NumPy's scalar's datatype. + + When :attr:`obj` is none of the above but a scalar, or a sequence of scalars then the + returned tensor will, by default, infer its datatype from the scalar values, be on the + current default device, and not share its memory. + + .. seealso:: + + :func:`torch.tensor` creates a tensor that always copies the data from the input object. + :func:`torch.from_numpy` creates a tensor that always shares memory from NumPy arrays. + :func:`torch.frombuffer` creates a tensor that always shares memory from objects that + implement the buffer protocol. + :func:`torch.from_dlpack` creates a tensor that always shares memory from + DLPack capsules. + + Args: + obj (object): a tensor, NumPy array, DLPack Capsule, object that implements Python's + buffer protocol, scalar, or sequence of scalars. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the datatype of the returned tensor. + Default: ``None``, which causes the datatype of the returned tensor to be + inferred from :attr:`obj`. + copy (bool, optional): controls whether the returned tensor shares memory with :attr:`obj`. + Default: ``None``, which causes the returned tensor to share memory with :attr:`obj` + whenever possible. If ``True`` then the returned tensor does not share its memory. + If ``False`` then the returned tensor shares its memory with :attr:`obj` and an + error is thrown if it cannot. + device (:class:`torch.device`, optional): the device of the returned tensor. + Default: ``None``, which causes the device of :attr:`obj` to be used. 
Or, if + :attr:`obj` is a Python sequence, the current default device will be used. + requires_grad (bool, optional): whether the returned tensor requires grad. + Default: ``False``, which causes the returned tensor not to require a gradient. + If ``True``, then the returned tensor will require a gradient, and if :attr:`obj` + is also a tensor with an autograd history then the returned tensor will have + the same history. + + Example:: + + >>> a = torch.tensor([1, 2, 3]) + >>> # Shares memory with tensor 'a' + >>> b = torch.asarray(a) + >>> a.data_ptr() == b.data_ptr() + True + >>> # Forces memory copy + >>> c = torch.asarray(a, copy=True) + >>> a.data_ptr() == c.data_ptr() + False + + >>> a = torch.tensor([1., 2., 3.], requires_grad=True) + >>> b = a + 2 + >>> b + tensor([3., 4., 5.], grad_fn=) + >>> # Shares memory with tensor 'b', with no grad + >>> c = torch.asarray(b) + >>> c + tensor([3., 4., 5.]) + >>> # Shares memory with tensor 'b', retaining autograd history + >>> d = torch.asarray(b, requires_grad=True) + >>> d + tensor([3., 4., 5.], grad_fn=) + + >>> array = numpy.array([1, 2, 3]) + >>> # Shares memory with array 'array' + >>> t1 = torch.asarray(array) + >>> array.__array_interface__['data'][0] == t1.data_ptr() + True + >>> # Copies memory due to dtype mismatch + >>> t2 = torch.asarray(array, dtype=torch.float32) + >>> array.__array_interface__['data'][0] == t2.data_ptr() + False + + >>> scalar = numpy.float64(0.5) + >>> torch.asarray(scalar) + tensor(0.5000, dtype=torch.float64) + """ + ... +def asin(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + asin(input, *, out=None) -> Tensor + + Returns a new tensor with the arcsine of the elements of :attr:`input`. + + .. math:: + \text{out}_{i} = \sin^{-1}(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-0.5962, 1.4985, -0.4396, 1.4525]) + >>> torch.asin(a) + tensor([-0.6387, nan, -0.4552, nan]) + """ + ... +def asin_(input: Tensor) -> Tensor: ... +def asinh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + asinh(input, *, out=None) -> Tensor + + Returns a new tensor with the inverse hyperbolic sine of the elements of :attr:`input`. + + .. math:: + \text{out}_{i} = \sinh^{-1}(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword arguments: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.1606, -1.4267, -1.0899, -1.0250 ]) + >>> torch.asinh(a) + tensor([ 0.1599, -1.1534, -0.9435, -0.8990 ]) + """ + ... +def asinh_(input: Tensor) -> Tensor: ... +def atan(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + atan(input, *, out=None) -> Tensor + + Returns a new tensor with the arctangent of the elements of :attr:`input`. + + .. math:: + \text{out}_{i} = \tan^{-1}(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.2341, 0.2539, -0.6256, -0.6448]) + >>> torch.atan(a) + tensor([ 0.2299, 0.2487, -0.5591, -0.5727]) + """ + ... +def atan2(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + atan2(input, other, *, out=None) -> Tensor + + Element-wise arctangent of :math:`\text{input}_{i} / \text{other}_{i}` + with consideration of the quadrant. 
Returns a new tensor with the signed angles + in radians between vector :math:`(\text{other}_{i}, \text{input}_{i})` + and vector :math:`(1, 0)`. (Note that :math:`\text{other}_{i}`, the second + parameter, is the x-coordinate, while :math:`\text{input}_{i}`, the first + parameter, is the y-coordinate.) + + The shapes of ``input`` and ``other`` must be + :ref:`broadcastable `. + + Args: + input (Tensor): the first input tensor + other (Tensor): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.9041, 0.0196, -0.3108, -2.4423]) + >>> torch.atan2(a, torch.randn(4)) + tensor([ 0.9833, 0.0811, -1.9743, -1.4151]) + """ + ... +def atan_(input: Tensor) -> Tensor: ... +def atanh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + atanh(input, *, out=None) -> Tensor + + Returns a new tensor with the inverse hyperbolic tangent of the elements of :attr:`input`. + + Note: + The domain of the inverse hyperbolic tangent is `(-1, 1)` and values outside this range + will be mapped to ``NaN``, except for the values `1` and `-1` for which the output is + mapped to `+/-INF` respectively. + + .. math:: + \text{out}_{i} = \tanh^{-1}(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword arguments: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4).uniform_(-1, 1) + >>> a + tensor([ -0.9385, 0.2968, -0.8591, -0.1871 ]) + >>> torch.atanh(a) + tensor([ -1.7253, 0.3060, -1.2899, -0.1893 ]) + """ + ... +def atanh_(input: Tensor) -> Tensor: ... +def avg_pool1d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, ceil_mode: _bool = False, count_include_pad: _bool = True) -> Tensor: ... +@overload +def baddbmm(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], batch1: Tensor, batch2: Tensor) -> Tensor: + r""" + baddbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices in :attr:`batch1` + and :attr:`batch2`. + :attr:`input` is added to the final result. + + :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the same + number of matrices. + + If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a + :math:`(b \times m \times p)` tensor, then :attr:`input` must be + :ref:`broadcastable ` with a + :math:`(b \times n \times p)` tensor and :attr:`out` will be a + :math:`(b \times n \times p)` tensor. Both :attr:`alpha` and :attr:`beta` mean the + same as the scaling factors used in :meth:`torch.addbmm`. + + .. math:: + \text{out}_i = \beta\ \text{input}_i + \alpha\ (\text{batch1}_i \mathbin{@} \text{batch2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. 
+ + Args: + input (Tensor): the tensor to be added + batch1 (Tensor): the first batch of matrices to be multiplied + batch2 (Tensor): the second batch of matrices to be multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`\text{batch1} \mathbin{@} \text{batch2}` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(10, 3, 5) + >>> batch1 = torch.randn(10, 3, 4) + >>> batch2 = torch.randn(10, 4, 5) + >>> torch.baddbmm(M, batch1, batch2).size() + torch.Size([10, 3, 5]) + """ + ... +@overload +def baddbmm(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], batch1: Tensor, batch2: Tensor, *, out: Tensor) -> Tensor: + r""" + baddbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices in :attr:`batch1` + and :attr:`batch2`. + :attr:`input` is added to the final result. + + :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the same + number of matrices. + + If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a + :math:`(b \times m \times p)` tensor, then :attr:`input` must be + :ref:`broadcastable ` with a + :math:`(b \times n \times p)` tensor and :attr:`out` will be a + :math:`(b \times n \times p)` tensor. Both :attr:`alpha` and :attr:`beta` mean the + same as the scaling factors used in :meth:`torch.addbmm`. + + .. math:: + \text{out}_i = \beta\ \text{input}_i + \alpha\ (\text{batch1}_i \mathbin{@} \text{batch2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + input (Tensor): the tensor to be added + batch1 (Tensor): the first batch of matrices to be multiplied + batch2 (Tensor): the second batch of matrices to be multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`\text{batch1} \mathbin{@} \text{batch2}` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(10, 3, 5) + >>> batch1 = torch.randn(10, 3, 4) + >>> batch2 = torch.randn(10, 4, 5) + >>> torch.baddbmm(M, batch1, batch2).size() + torch.Size([10, 3, 5]) + """ + ... +@overload +def baddbmm(input: Tensor, batch1: Tensor, batch2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + baddbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices in :attr:`batch1` + and :attr:`batch2`. + :attr:`input` is added to the final result. + + :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the same + number of matrices. + + If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a + :math:`(b \times m \times p)` tensor, then :attr:`input` must be + :ref:`broadcastable ` with a + :math:`(b \times n \times p)` tensor and :attr:`out` will be a + :math:`(b \times n \times p)` tensor. 
Both :attr:`alpha` and :attr:`beta` mean the + same as the scaling factors used in :meth:`torch.addbmm`. + + .. math:: + \text{out}_i = \beta\ \text{input}_i + \alpha\ (\text{batch1}_i \mathbin{@} \text{batch2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + input (Tensor): the tensor to be added + batch1 (Tensor): the first batch of matrices to be multiplied + batch2 (Tensor): the second batch of matrices to be multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`\text{batch1} \mathbin{@} \text{batch2}` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(10, 3, 5) + >>> batch1 = torch.randn(10, 3, 4) + >>> batch2 = torch.randn(10, 4, 5) + >>> torch.baddbmm(M, batch1, batch2).size() + torch.Size([10, 3, 5]) + """ + ... +@overload +def baddbmm(beta: Union[Number, _complex], self: Tensor, batch1: Tensor, batch2: Tensor) -> Tensor: + r""" + baddbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices in :attr:`batch1` + and :attr:`batch2`. + :attr:`input` is added to the final result. + + :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the same + number of matrices. + + If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a + :math:`(b \times m \times p)` tensor, then :attr:`input` must be + :ref:`broadcastable ` with a + :math:`(b \times n \times p)` tensor and :attr:`out` will be a + :math:`(b \times n \times p)` tensor. Both :attr:`alpha` and :attr:`beta` mean the + same as the scaling factors used in :meth:`torch.addbmm`. + + .. math:: + \text{out}_i = \beta\ \text{input}_i + \alpha\ (\text{batch1}_i \mathbin{@} \text{batch2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + input (Tensor): the tensor to be added + batch1 (Tensor): the first batch of matrices to be multiplied + batch2 (Tensor): the second batch of matrices to be multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`\text{batch1} \mathbin{@} \text{batch2}` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(10, 3, 5) + >>> batch1 = torch.randn(10, 3, 4) + >>> batch2 = torch.randn(10, 4, 5) + >>> torch.baddbmm(M, batch1, batch2).size() + torch.Size([10, 3, 5]) + """ + ... 
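+# Minimal sketch of the baddbmm formula documented above,
+# out = beta * input + alpha * (batch1 @ batch2); the shapes and the
+# beta/alpha values here are arbitrary examples.
+#
+#     >>> M = torch.randn(10, 3, 5)
+#     >>> b1, b2 = torch.randn(10, 3, 4), torch.randn(10, 4, 5)
+#     >>> out = torch.baddbmm(M, b1, b2, beta=0.5, alpha=2.0)
+#     >>> torch.allclose(out, 0.5 * M + 2.0 * torch.bmm(b1, b2))
+#     True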
+@overload +def baddbmm(beta: Union[Number, _complex], self: Tensor, batch1: Tensor, batch2: Tensor, *, out: Tensor) -> Tensor: + r""" + baddbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices in :attr:`batch1` + and :attr:`batch2`. + :attr:`input` is added to the final result. + + :attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the same + number of matrices. + + If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a + :math:`(b \times m \times p)` tensor, then :attr:`input` must be + :ref:`broadcastable ` with a + :math:`(b \times n \times p)` tensor and :attr:`out` will be a + :math:`(b \times n \times p)` tensor. Both :attr:`alpha` and :attr:`beta` mean the + same as the scaling factors used in :meth:`torch.addbmm`. + + .. math:: + \text{out}_i = \beta\ \text{input}_i + \alpha\ (\text{batch1}_i \mathbin{@} \text{batch2}_i) + + If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in + it will not be propagated. + + For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and + :attr:`alpha` must be real numbers, otherwise they should be integers. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + input (Tensor): the tensor to be added + batch1 (Tensor): the first batch of matrices to be multiplied + batch2 (Tensor): the second batch of matrices to be multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`\text{batch1} \mathbin{@} \text{batch2}` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + + Example:: + + >>> M = torch.randn(10, 3, 5) + >>> batch1 = torch.randn(10, 3, 4) + >>> batch2 = torch.randn(10, 4, 5) + >>> torch.baddbmm(M, batch1, batch2).size() + torch.Size([10, 3, 5]) + """ + ... +@overload +def bartlett_window(window_length: _int, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + bartlett_window(window_length, periodic=True, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Bartlett window function. + + .. math:: + w[n] = 1 - \left| \frac{2n}{N-1} - 1 \right| = \begin{cases} + \frac{2n}{N - 1} & \text{if } 0 \leq n \leq \frac{N - 1}{2} \\ + 2 - \frac{2n}{N - 1} & \text{if } \frac{N - 1}{2} < n < N \\ + \end{cases}, + + where :math:`N` is the full window size. + + The input :attr:`window_length` is a positive integer controlling the + returned window size. :attr:`periodic` flag determines whether the returned + window trims off the last duplicate value from the symmetric window and is + ready to be used as a periodic window with functions like + :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in + above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have + ``torch.bartlett_window(L, periodic=True)`` equal to + ``torch.bartlett_window(L + 1, periodic=False)[:-1])``. + + .. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. + + Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. 
If False, return a symmetric window. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window + """ + ... +@overload +def bartlett_window(window_length: _int, periodic: _bool, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + bartlett_window(window_length, periodic=True, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Bartlett window function. + + .. math:: + w[n] = 1 - \left| \frac{2n}{N-1} - 1 \right| = \begin{cases} + \frac{2n}{N - 1} & \text{if } 0 \leq n \leq \frac{N - 1}{2} \\ + 2 - \frac{2n}{N - 1} & \text{if } \frac{N - 1}{2} < n < N \\ + \end{cases}, + + where :math:`N` is the full window size. + + The input :attr:`window_length` is a positive integer controlling the + returned window size. :attr:`periodic` flag determines whether the returned + window trims off the last duplicate value from the symmetric window and is + ready to be used as a periodic window with functions like + :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in + above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have + ``torch.bartlett_window(L, periodic=True)`` equal to + ``torch.bartlett_window(L + 1, periodic=False)[:-1])``. + + .. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. + + Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. If False, return a symmetric window. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window + """ + ... 
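+# Sketch of the periodic/symmetric relationship stated in the bartlett_window
+# docstring above; the window length L is an arbitrary example.
+#
+#     >>> L = 8
+#     >>> torch.allclose(torch.bartlett_window(L, periodic=True),
+#     ...                torch.bartlett_window(L + 1, periodic=False)[:-1])
+#     True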
+def batch_norm(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], running_mean: Optional[Tensor], running_var: Optional[Tensor], training: _bool, momentum: _float, eps: _float, cudnn_enabled: _bool) -> Tensor: ... +def batch_norm_backward_elemt(grad_out: Tensor, input: Tensor, mean: Tensor, invstd: Tensor, weight: Optional[Tensor], sum_dy: Tensor, sum_dy_xmu: Tensor, count: Tensor) -> Tensor: ... +def batch_norm_backward_reduce(grad_out: Tensor, input: Tensor, mean: Tensor, invstd: Tensor, weight: Optional[Tensor], input_g: _bool, weight_g: _bool, bias_g: _bool) -> Tuple[Tensor, Tensor, Tensor, Tensor]: ... +def batch_norm_elemt(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], mean: Tensor, invstd: Tensor, eps: _float, *, out: Optional[Tensor] = None) -> Tensor: ... +def batch_norm_gather_stats(input: Tensor, mean: Tensor, invstd: Tensor, running_mean: Optional[Tensor], running_var: Optional[Tensor], momentum: _float, eps: _float, count: _int) -> Tuple[Tensor, Tensor]: ... +def batch_norm_gather_stats_with_counts(input: Tensor, mean: Tensor, invstd: Tensor, running_mean: Optional[Tensor], running_var: Optional[Tensor], momentum: _float, eps: _float, counts: Tensor) -> Tuple[Tensor, Tensor]: ... +def batch_norm_stats(input: Tensor, eps: _float) -> Tuple[Tensor, Tensor]: ... +def batch_norm_update_stats(input: Tensor, running_mean: Optional[Tensor], running_var: Optional[Tensor], momentum: _float) -> Tuple[Tensor, Tensor]: ... +@overload +def bernoulli(input: Tensor, *, generator: Optional[Generator] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + bernoulli(input, *, generator=None, out=None) -> Tensor + + Draws binary random numbers (0 or 1) from a Bernoulli distribution. + + The :attr:`input` tensor should be a tensor containing probabilities + to be used for drawing the binary random number. + Hence, all values in :attr:`input` have to be in the range: + :math:`0 \leq \text{input}_i \leq 1`. + + The :math:`\text{i}^{th}` element of the output tensor will draw a + value :math:`1` according to the :math:`\text{i}^{th}` probability value given + in :attr:`input`. + + .. math:: + \text{out}_{i} \sim \mathrm{Bernoulli}(p = \text{input}_{i}) + + The returned :attr:`out` tensor only has values 0 or 1 and is of the same + shape as :attr:`input`. + + :attr:`out` can have integral ``dtype``, but :attr:`input` must have floating + point ``dtype``. + + Args: + input (Tensor): the input tensor of probability values for the Bernoulli distribution + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.empty(3, 3).uniform_(0, 1) # generate a uniform random matrix with range [0, 1] + >>> a + tensor([[ 0.1737, 0.0950, 0.3609], + [ 0.7148, 0.0289, 0.2676], + [ 0.9456, 0.8937, 0.7202]]) + >>> torch.bernoulli(a) + tensor([[ 1., 0., 0.], + [ 0., 0., 0.], + [ 1., 1., 1.]]) + + >>> a = torch.ones(3, 3) # probability of drawing "1" is 1 + >>> torch.bernoulli(a) + tensor([[ 1., 1., 1.], + [ 1., 1., 1.], + [ 1., 1., 1.]]) + >>> a = torch.zeros(3, 3) # probability of drawing "1" is 0 + >>> torch.bernoulli(a) + tensor([[ 0., 0., 0.], + [ 0., 0., 0.], + [ 0., 0., 0.]]) + """ + ... +@overload +def bernoulli(input: Tensor, p: _float, *, generator: Optional[Generator] = None) -> Tensor: + r""" + bernoulli(input, *, generator=None, out=None) -> Tensor + + Draws binary random numbers (0 or 1) from a Bernoulli distribution. 
+ + The :attr:`input` tensor should be a tensor containing probabilities + to be used for drawing the binary random number. + Hence, all values in :attr:`input` have to be in the range: + :math:`0 \leq \text{input}_i \leq 1`. + + The :math:`\text{i}^{th}` element of the output tensor will draw a + value :math:`1` according to the :math:`\text{i}^{th}` probability value given + in :attr:`input`. + + .. math:: + \text{out}_{i} \sim \mathrm{Bernoulli}(p = \text{input}_{i}) + + The returned :attr:`out` tensor only has values 0 or 1 and is of the same + shape as :attr:`input`. + + :attr:`out` can have integral ``dtype``, but :attr:`input` must have floating + point ``dtype``. + + Args: + input (Tensor): the input tensor of probability values for the Bernoulli distribution + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.empty(3, 3).uniform_(0, 1) # generate a uniform random matrix with range [0, 1] + >>> a + tensor([[ 0.1737, 0.0950, 0.3609], + [ 0.7148, 0.0289, 0.2676], + [ 0.9456, 0.8937, 0.7202]]) + >>> torch.bernoulli(a) + tensor([[ 1., 0., 0.], + [ 0., 0., 0.], + [ 1., 1., 1.]]) + + >>> a = torch.ones(3, 3) # probability of drawing "1" is 1 + >>> torch.bernoulli(a) + tensor([[ 1., 1., 1.], + [ 1., 1., 1.], + [ 1., 1., 1.]]) + >>> a = torch.zeros(3, 3) # probability of drawing "1" is 0 + >>> torch.bernoulli(a) + tensor([[ 0., 0., 0.], + [ 0., 0., 0.], + [ 0., 0., 0.]]) + """ + ... +def bilinear(input1: Tensor, input2: Tensor, weight: Tensor, bias: Optional[Tensor] = None) -> Tensor: ... +def binary_cross_entropy_with_logits(input: Tensor, target: Tensor, weight: Optional[Tensor] = None, pos_weight: Optional[Tensor] = None, reduction: _int = 1) -> Tensor: ... +def bincount(input: Tensor, weights: Optional[Tensor] = None, minlength: _int = 0) -> Tensor: + r""" + bincount(input, weights=None, minlength=0) -> Tensor + + Count the frequency of each value in an array of non-negative ints. + + The number of bins (size 1) is one larger than the largest value in + :attr:`input` unless :attr:`input` is empty, in which case the result is a + tensor of size 0. If :attr:`minlength` is specified, the number of bins is at least + :attr:`minlength` and if :attr:`input` is empty, then the result is tensor of size + :attr:`minlength` filled with zeros. If ``n`` is the value at position ``i``, + ``out[n] += weights[i]`` if :attr:`weights` is specified else + ``out[n] += 1``. + + Note: + This operation may produce nondeterministic gradients when given tensors on a CUDA device. See :doc:`/notes/randomness` for more information. + + Arguments: + input (Tensor): 1-d int tensor + weights (Tensor): optional, weight for each value in the input tensor. + Should be of same size as input tensor. + minlength (int): optional, minimum number of bins. Should be non-negative. + + Returns: + output (Tensor): a tensor of shape ``Size([max(input) + 1])`` if + :attr:`input` is non-empty, else ``Size(0)`` + + Example:: + + >>> input = torch.randint(0, 8, (5,), dtype=torch.int64) + >>> weights = torch.linspace(0, 1, steps=5) + >>> input, weights + (tensor([4, 3, 6, 3, 4]), + tensor([ 0.0000, 0.2500, 0.5000, 0.7500, 1.0000]) + + >>> torch.bincount(input) + tensor([0, 0, 0, 2, 2, 0, 1]) + + >>> input.bincount(weights) + tensor([0.0000, 0.0000, 0.0000, 1.0000, 1.0000, 0.0000, 0.5000]) + """ + ... +def binomial(count: Tensor, prob: Tensor, generator: Optional[Generator] = None) -> Tensor: ... 
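+# Sketch of the minlength argument described in the bincount docstring above
+# (the input values and minlength are arbitrary examples):
+#
+#     >>> x = torch.tensor([1, 1, 3])
+#     >>> torch.bincount(x, minlength=6)
+#     tensor([0, 2, 0, 1, 0, 0])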
+@overload +def bitwise_and(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_and(input, other, *, out=None) -> Tensor + + Computes the bitwise AND of :attr:`input` and :attr:`other`. The input tensor must be of + integral or Boolean types. For bool tensors, it computes the logical AND. + + Args: + input: the first input tensor + other: the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_and(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([1, 0, 3], dtype=torch.int8) + >>> torch.bitwise_and(torch.tensor([True, True, False]), torch.tensor([False, True, False])) + tensor([ False, True, False]) + """ + ... +@overload +def bitwise_and(self: Union[Number, _complex], other: Tensor) -> Tensor: + r""" + bitwise_and(input, other, *, out=None) -> Tensor + + Computes the bitwise AND of :attr:`input` and :attr:`other`. The input tensor must be of + integral or Boolean types. For bool tensors, it computes the logical AND. + + Args: + input: the first input tensor + other: the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_and(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([1, 0, 3], dtype=torch.int8) + >>> torch.bitwise_and(torch.tensor([True, True, False]), torch.tensor([False, True, False])) + tensor([ False, True, False]) + """ + ... +@overload +def bitwise_and(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_and(input, other, *, out=None) -> Tensor + + Computes the bitwise AND of :attr:`input` and :attr:`other`. The input tensor must be of + integral or Boolean types. For bool tensors, it computes the logical AND. + + Args: + input: the first input tensor + other: the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_and(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([1, 0, 3], dtype=torch.int8) + >>> torch.bitwise_and(torch.tensor([True, True, False]), torch.tensor([False, True, False])) + tensor([ False, True, False]) + """ + ... +@overload +def bitwise_left_shift(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_left_shift(input, other, *, out=None) -> Tensor + + Computes the left arithmetic shift of :attr:`input` by :attr:`other` bits. + The input tensor must be of integral type. This operator supports + :ref:`broadcasting to a common shape ` and + :ref:`type promotion `. + + The operation applied is: + + .. math:: + \text{out}_i = \text{input}_i << \text{other}_i + + Args: + input (Tensor or Scalar): the first input tensor + other (Tensor or Scalar): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_left_shift(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-2, -2, 24], dtype=torch.int8) + """ + ... +@overload +def bitwise_left_shift(self: Union[Number, _complex], other: Tensor) -> Tensor: + r""" + bitwise_left_shift(input, other, *, out=None) -> Tensor + + Computes the left arithmetic shift of :attr:`input` by :attr:`other` bits. + The input tensor must be of integral type. This operator supports + :ref:`broadcasting to a common shape ` and + :ref:`type promotion `. 
+ + The operation applied is: + + .. math:: + \text{out}_i = \text{input}_i << \text{other}_i + + Args: + input (Tensor or Scalar): the first input tensor + other (Tensor or Scalar): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_left_shift(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-2, -2, 24], dtype=torch.int8) + """ + ... +@overload +def bitwise_left_shift(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_left_shift(input, other, *, out=None) -> Tensor + + Computes the left arithmetic shift of :attr:`input` by :attr:`other` bits. + The input tensor must be of integral type. This operator supports + :ref:`broadcasting to a common shape ` and + :ref:`type promotion `. + + The operation applied is: + + .. math:: + \text{out}_i = \text{input}_i << \text{other}_i + + Args: + input (Tensor or Scalar): the first input tensor + other (Tensor or Scalar): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_left_shift(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-2, -2, 24], dtype=torch.int8) + """ + ... +def bitwise_not(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_not(input, *, out=None) -> Tensor + + Computes the bitwise NOT of the given input tensor. The input tensor must be of + integral or Boolean types. For bool tensors, it computes the logical NOT. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_not(torch.tensor([-1, -2, 3], dtype=torch.int8)) + tensor([ 0, 1, -4], dtype=torch.int8) + """ + ... +@overload +def bitwise_or(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_or(input, other, *, out=None) -> Tensor + + Computes the bitwise OR of :attr:`input` and :attr:`other`. The input tensor must be of + integral or Boolean types. For bool tensors, it computes the logical OR. + + Args: + input: the first input tensor + other: the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_or(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-1, -2, 3], dtype=torch.int8) + >>> torch.bitwise_or(torch.tensor([True, True, False]), torch.tensor([False, True, False])) + tensor([ True, True, False]) + """ + ... +@overload +def bitwise_or(self: Union[Number, _complex], other: Tensor) -> Tensor: + r""" + bitwise_or(input, other, *, out=None) -> Tensor + + Computes the bitwise OR of :attr:`input` and :attr:`other`. The input tensor must be of + integral or Boolean types. For bool tensors, it computes the logical OR. + + Args: + input: the first input tensor + other: the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_or(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-1, -2, 3], dtype=torch.int8) + >>> torch.bitwise_or(torch.tensor([True, True, False]), torch.tensor([False, True, False])) + tensor([ True, True, False]) + """ + ... 
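+# Sketch of the scalar overloads above: the bitwise ops also accept a plain
+# Python int as the second operand (the operands here are arbitrary examples).
+#
+#     >>> torch.bitwise_and(torch.tensor([0, 1, 2, 3]), 1)
+#     tensor([0, 1, 0, 1])
+#     >>> torch.bitwise_or(torch.tensor([0, 1, 4]), 2)
+#     tensor([2, 3, 6])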
+@overload +def bitwise_or(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_or(input, other, *, out=None) -> Tensor + + Computes the bitwise OR of :attr:`input` and :attr:`other`. The input tensor must be of + integral or Boolean types. For bool tensors, it computes the logical OR. + + Args: + input: the first input tensor + other: the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_or(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-1, -2, 3], dtype=torch.int8) + >>> torch.bitwise_or(torch.tensor([True, True, False]), torch.tensor([False, True, False])) + tensor([ True, True, False]) + """ + ... +@overload +def bitwise_right_shift(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_right_shift(input, other, *, out=None) -> Tensor + + Computes the right arithmetic shift of :attr:`input` by :attr:`other` bits. + The input tensor must be of integral type. This operator supports + :ref:`broadcasting to a common shape ` and + :ref:`type promotion `. + In any case, if the value of the right operand is negative or is greater + or equal to the number of bits in the promoted left operand, the behavior is undefined. + + The operation applied is: + + .. math:: + \text{out}_i = \text{input}_i >> \text{other}_i + + Args: + input (Tensor or Scalar): the first input tensor + other (Tensor or Scalar): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_right_shift(torch.tensor([-2, -7, 31], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-1, -7, 3], dtype=torch.int8) + """ + ... +@overload +def bitwise_right_shift(self: Union[Number, _complex], other: Tensor) -> Tensor: + r""" + bitwise_right_shift(input, other, *, out=None) -> Tensor + + Computes the right arithmetic shift of :attr:`input` by :attr:`other` bits. + The input tensor must be of integral type. This operator supports + :ref:`broadcasting to a common shape ` and + :ref:`type promotion `. + In any case, if the value of the right operand is negative or is greater + or equal to the number of bits in the promoted left operand, the behavior is undefined. + + The operation applied is: + + .. math:: + \text{out}_i = \text{input}_i >> \text{other}_i + + Args: + input (Tensor or Scalar): the first input tensor + other (Tensor or Scalar): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_right_shift(torch.tensor([-2, -7, 31], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-1, -7, 3], dtype=torch.int8) + """ + ... +@overload +def bitwise_right_shift(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_right_shift(input, other, *, out=None) -> Tensor + + Computes the right arithmetic shift of :attr:`input` by :attr:`other` bits. + The input tensor must be of integral type. This operator supports + :ref:`broadcasting to a common shape ` and + :ref:`type promotion `. + In any case, if the value of the right operand is negative or is greater + or equal to the number of bits in the promoted left operand, the behavior is undefined. + + The operation applied is: + + .. 
math:: + \text{out}_i = \text{input}_i >> \text{other}_i + + Args: + input (Tensor or Scalar): the first input tensor + other (Tensor or Scalar): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_right_shift(torch.tensor([-2, -7, 31], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-1, -7, 3], dtype=torch.int8) + """ + ... +@overload +def bitwise_xor(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_xor(input, other, *, out=None) -> Tensor + + Computes the bitwise XOR of :attr:`input` and :attr:`other`. The input tensor must be of + integral or Boolean types. For bool tensors, it computes the logical XOR. + + Args: + input: the first input tensor + other: the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_xor(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-2, -2, 0], dtype=torch.int8) + >>> torch.bitwise_xor(torch.tensor([True, True, False]), torch.tensor([False, True, False])) + tensor([ True, False, False]) + """ + ... +@overload +def bitwise_xor(self: Union[Number, _complex], other: Tensor) -> Tensor: + r""" + bitwise_xor(input, other, *, out=None) -> Tensor + + Computes the bitwise XOR of :attr:`input` and :attr:`other`. The input tensor must be of + integral or Boolean types. For bool tensors, it computes the logical XOR. + + Args: + input: the first input tensor + other: the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_xor(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-2, -2, 0], dtype=torch.int8) + >>> torch.bitwise_xor(torch.tensor([True, True, False]), torch.tensor([False, True, False])) + tensor([ True, False, False]) + """ + ... +@overload +def bitwise_xor(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + bitwise_xor(input, other, *, out=None) -> Tensor + + Computes the bitwise XOR of :attr:`input` and :attr:`other`. The input tensor must be of + integral or Boolean types. For bool tensors, it computes the logical XOR. + + Args: + input: the first input tensor + other: the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.bitwise_xor(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-2, -2, 0], dtype=torch.int8) + >>> torch.bitwise_xor(torch.tensor([True, True, False]), torch.tensor([False, True, False])) + tensor([ True, False, False]) + """ + ... +@overload +def blackman_window(window_length: _int, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + blackman_window(window_length, periodic=True, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Blackman window function. + + .. math:: + w[n] = 0.42 - 0.5 \cos \left( \frac{2 \pi n}{N - 1} \right) + 0.08 \cos \left( \frac{4 \pi n}{N - 1} \right) + + where :math:`N` is the full window size. + + The input :attr:`window_length` is a positive integer controlling the + returned window size. 
:attr:`periodic` flag determines whether the returned + window trims off the last duplicate value from the symmetric window and is + ready to be used as a periodic window with functions like + :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in + above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have + ``torch.blackman_window(L, periodic=True)`` equal to + ``torch.blackman_window(L + 1, periodic=False)[:-1])``. + + .. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. + + Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. If False, return a symmetric window. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window + """ + ... +@overload +def blackman_window(window_length: _int, periodic: _bool, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + blackman_window(window_length, periodic=True, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Blackman window function. + + .. math:: + w[n] = 0.42 - 0.5 \cos \left( \frac{2 \pi n}{N - 1} \right) + 0.08 \cos \left( \frac{4 \pi n}{N - 1} \right) + + where :math:`N` is the full window size. + + The input :attr:`window_length` is a positive integer controlling the + returned window size. :attr:`periodic` flag determines whether the returned + window trims off the last duplicate value from the symmetric window and is + ready to be used as a periodic window with functions like + :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in + above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have + ``torch.blackman_window(L, periodic=True)`` equal to + ``torch.blackman_window(L + 1, periodic=False)[:-1])``. + + .. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. + + Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. If False, return a symmetric window. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. 
+ device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window + """ + ... +def bmm(input: Tensor, mat2: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + bmm(input, mat2, *, out=None) -> Tensor + + Performs a batch matrix-matrix product of matrices stored in :attr:`input` + and :attr:`mat2`. + + :attr:`input` and :attr:`mat2` must be 3-D tensors each containing + the same number of matrices. + + If :attr:`input` is a :math:`(b \times n \times m)` tensor, :attr:`mat2` is a + :math:`(b \times m \times p)` tensor, :attr:`out` will be a + :math:`(b \times n \times p)` tensor. + + .. math:: + \text{out}_i = \text{input}_i \mathbin{@} \text{mat2}_i + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + .. note:: This function does not :ref:`broadcast `. + For broadcasting matrix products, see :func:`torch.matmul`. + + Args: + input (Tensor): the first batch of matrices to be multiplied + mat2 (Tensor): the second batch of matrices to be multiplied + + Keyword Args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> input = torch.randn(10, 3, 4) + >>> mat2 = torch.randn(10, 4, 5) + >>> res = torch.bmm(input, mat2) + >>> res.size() + torch.Size([10, 3, 5]) + """ + ... +def broadcast_to(input: Tensor, size: Sequence[Union[_int, SymInt]]) -> Tensor: + r""" + broadcast_to(input, shape) -> Tensor + + Broadcasts :attr:`input` to the shape :attr:`\shape`. + Equivalent to calling ``input.expand(shape)``. See :meth:`~Tensor.expand` for details. + + Args: + input (Tensor): the input tensor. + shape (list, tuple, or :class:`torch.Size`): the new shape. + + Example:: + + >>> x = torch.tensor([1, 2, 3]) + >>> torch.broadcast_to(x, (3, 3)) + tensor([[1, 2, 3], + [1, 2, 3], + [1, 2, 3]]) + """ + ... +@overload +def bucketize(input: Tensor, boundaries: Tensor, *, out_int32: _bool = False, right: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + bucketize(input, boundaries, *, out_int32=False, right=False, out=None) -> Tensor + + Returns the indices of the buckets to which each value in the :attr:`input` belongs, where the + boundaries of the buckets are set by :attr:`boundaries`. Return a new tensor with the same size + as :attr:`input`. If :attr:`right` is False (default), then the left boundary is open. Note that + this behavior is opposite the behavior of + `numpy.digitize `_. + More formally, the returned index satisfies the following rules: + + .. list-table:: + :widths: 15 85 + :header-rows: 1 + + * - :attr:`right` + - *returned index satisfies* + * - False + - ``boundaries[i-1] < input[m][n]...[l][x] <= boundaries[i]`` + * - True + - ``boundaries[i-1] <= input[m][n]...[l][x] < boundaries[i]`` + + Args: + input (Tensor or Scalar): N-D tensor or a Scalar containing the search value(s). + boundaries (Tensor): 1-D tensor, must contain a strictly increasing sequence, or the return value is undefined. + + Keyword args: + out_int32 (bool, optional): indicate the output data type. 
torch.int32 if True, torch.int64 otherwise. + Default value is False, i.e. default output data type is torch.int64. + right (bool, optional): if False, return the first suitable location that is found. If True, return the + last such index. If no suitable index found, return 0 for non-numerical value + (eg. nan, inf) or the size of :attr:`boundaries` (one pass the last index). + In other words, if False, gets the lower bound index for each value in :attr:`input` + from :attr:`boundaries`. If True, gets the upper bound index instead. + Default value is False. + out (Tensor, optional): the output tensor, must be the same size as :attr:`input` if provided. + + + Example:: + + >>> boundaries = torch.tensor([1, 3, 5, 7, 9]) + >>> boundaries + tensor([1, 3, 5, 7, 9]) + >>> v = torch.tensor([[3, 6, 9], [3, 6, 9]]) + >>> v + tensor([[3, 6, 9], + [3, 6, 9]]) + >>> torch.bucketize(v, boundaries) + tensor([[1, 3, 4], + [1, 3, 4]]) + >>> torch.bucketize(v, boundaries, right=True) + tensor([[2, 3, 5], + [2, 3, 5]]) + """ + ... +@overload +def bucketize(self: Union[Number, _complex], boundaries: Tensor, *, out_int32: _bool = False, right: _bool = False) -> Tensor: + r""" + bucketize(input, boundaries, *, out_int32=False, right=False, out=None) -> Tensor + + Returns the indices of the buckets to which each value in the :attr:`input` belongs, where the + boundaries of the buckets are set by :attr:`boundaries`. Return a new tensor with the same size + as :attr:`input`. If :attr:`right` is False (default), then the left boundary is open. Note that + this behavior is opposite the behavior of + `numpy.digitize `_. + More formally, the returned index satisfies the following rules: + + .. list-table:: + :widths: 15 85 + :header-rows: 1 + + * - :attr:`right` + - *returned index satisfies* + * - False + - ``boundaries[i-1] < input[m][n]...[l][x] <= boundaries[i]`` + * - True + - ``boundaries[i-1] <= input[m][n]...[l][x] < boundaries[i]`` + + Args: + input (Tensor or Scalar): N-D tensor or a Scalar containing the search value(s). + boundaries (Tensor): 1-D tensor, must contain a strictly increasing sequence, or the return value is undefined. + + Keyword args: + out_int32 (bool, optional): indicate the output data type. torch.int32 if True, torch.int64 otherwise. + Default value is False, i.e. default output data type is torch.int64. + right (bool, optional): if False, return the first suitable location that is found. If True, return the + last such index. If no suitable index found, return 0 for non-numerical value + (eg. nan, inf) or the size of :attr:`boundaries` (one pass the last index). + In other words, if False, gets the lower bound index for each value in :attr:`input` + from :attr:`boundaries`. If True, gets the upper bound index instead. + Default value is False. + out (Tensor, optional): the output tensor, must be the same size as :attr:`input` if provided. + + + Example:: + + >>> boundaries = torch.tensor([1, 3, 5, 7, 9]) + >>> boundaries + tensor([1, 3, 5, 7, 9]) + >>> v = torch.tensor([[3, 6, 9], [3, 6, 9]]) + >>> v + tensor([[3, 6, 9], + [3, 6, 9]]) + >>> torch.bucketize(v, boundaries) + tensor([[1, 3, 4], + [1, 3, 4]]) + >>> torch.bucketize(v, boundaries, right=True) + tensor([[2, 3, 5], + [2, 3, 5]]) + """ + ... +def can_cast(from_: _dtype, to: _dtype) -> _bool: + r""" + can_cast(from, to) -> bool + + Determines if a type conversion is allowed under PyTorch casting rules + described in the type promotion :ref:`documentation `. + + Args: + from (dtype): The original :class:`torch.dtype`. 
+ to (dtype): The target :class:`torch.dtype`. + + Example:: + + >>> torch.can_cast(torch.double, torch.float) + True + >>> torch.can_cast(torch.float, torch.int) + False + """ + ... +@overload +def cat(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: _int = 0, *, out: Optional[Tensor] = None) -> Tensor: + r""" + cat(tensors, dim=0, *, out=None) -> Tensor + + Concatenates the given sequence of :attr:`seq` tensors in the given dimension. + All tensors must either have the same shape (except in the concatenating + dimension) or be a 1-D empty tensor with size ``(0,)``. + + :func:`torch.cat` can be seen as an inverse operation for :func:`torch.split` + and :func:`torch.chunk`. + + :func:`torch.cat` can be best understood via examples. + + .. seealso:: + + :func:`torch.stack` concatenates the given sequence along a new dimension. + + Args: + tensors (sequence of Tensors): any python sequence of tensors of the same type. + Non-empty tensors provided must have the same shape, except in the + cat dimension. + dim (int, optional): the dimension over which the tensors are concatenated + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> x = torch.randn(2, 3) + >>> x + tensor([[ 0.6580, -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497]]) + >>> torch.cat((x, x, x), 0) + tensor([[ 0.6580, -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497], + [ 0.6580, -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497], + [ 0.6580, -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497]]) + >>> torch.cat((x, x, x), 1) + tensor([[ 0.6580, -1.0969, -0.4614, 0.6580, -1.0969, -0.4614, 0.6580, + -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497, -0.1034, -0.5790, 0.1497, -0.1034, + -0.5790, 0.1497]]) + """ + ... +@overload +def cat(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: Union[str, ellipsis, None], *, out: Optional[Tensor] = None) -> Tensor: + r""" + cat(tensors, dim=0, *, out=None) -> Tensor + + Concatenates the given sequence of :attr:`seq` tensors in the given dimension. + All tensors must either have the same shape (except in the concatenating + dimension) or be a 1-D empty tensor with size ``(0,)``. + + :func:`torch.cat` can be seen as an inverse operation for :func:`torch.split` + and :func:`torch.chunk`. + + :func:`torch.cat` can be best understood via examples. + + .. seealso:: + + :func:`torch.stack` concatenates the given sequence along a new dimension. + + Args: + tensors (sequence of Tensors): any python sequence of tensors of the same type. + Non-empty tensors provided must have the same shape, except in the + cat dimension. + dim (int, optional): the dimension over which the tensors are concatenated + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> x = torch.randn(2, 3) + >>> x + tensor([[ 0.6580, -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497]]) + >>> torch.cat((x, x, x), 0) + tensor([[ 0.6580, -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497], + [ 0.6580, -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497], + [ 0.6580, -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497]]) + >>> torch.cat((x, x, x), 1) + tensor([[ 0.6580, -1.0969, -0.4614, 0.6580, -1.0969, -0.4614, 0.6580, + -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497, -0.1034, -0.5790, 0.1497, -0.1034, + -0.5790, 0.1497]]) + """ + ... +def ccol_indices_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... 
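+# Sketch of the torch.cat / torch.chunk relationship noted in the cat docstring
+# above (the tensor shape and chunk count are arbitrary examples):
+#
+#     >>> x = torch.arange(6).reshape(2, 3)
+#     >>> chunks = torch.chunk(x, 3, dim=1)
+#     >>> torch.equal(torch.cat(chunks, dim=1), x)
+#     True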
+def ceil(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + ceil(input, *, out=None) -> Tensor + + Returns a new tensor with the ceil of the elements of :attr:`input`, + the smallest integer greater than or equal to each element. + + For integer inputs, follows the array-api convention of returning a + copy of the input tensor. + + .. math:: + \text{out}_{i} = \left\lceil \text{input}_{i} \right\rceil + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-0.6341, -1.4208, -1.0900, 0.5826]) + >>> torch.ceil(a) + tensor([-0., -1., -1., 1.]) + """ + ... +def ceil_(input: Tensor) -> Tensor: ... +def celu(input: Tensor, alpha: Union[Number, _complex] = 1.0) -> Tensor: ... +def celu_(input: Tensor, alpha: Union[Number, _complex] = 1.0) -> Tensor: ... +def channel_shuffle(input: Tensor, groups: Union[_int, SymInt]) -> Tensor: ... +def cholesky(input: Tensor, upper: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + cholesky(input, upper=False, *, out=None) -> Tensor + + Computes the Cholesky decomposition of a symmetric positive-definite + matrix :math:`A` or for batches of symmetric positive-definite matrices. + + If :attr:`upper` is ``True``, the returned matrix ``U`` is upper-triangular, and + the decomposition has the form: + + .. math:: + + A = U^TU + + If :attr:`upper` is ``False``, the returned matrix ``L`` is lower-triangular, and + the decomposition has the form: + + .. math:: + + A = LL^T + + If :attr:`upper` is ``True``, and :math:`A` is a batch of symmetric positive-definite + matrices, then the returned tensor will be composed of upper-triangular Cholesky factors + of each of the individual matrices. Similarly, when :attr:`upper` is ``False``, the returned + tensor will be composed of lower-triangular Cholesky factors of each of the individual + matrices. + + .. warning:: + + :func:`torch.cholesky` is deprecated in favor of :func:`torch.linalg.cholesky` + and will be removed in a future PyTorch release. + + ``L = torch.cholesky(A)`` should be replaced with + + .. code:: python + + L = torch.linalg.cholesky(A) + + ``U = torch.cholesky(A, upper=True)`` should be replaced with + + .. code:: python + + U = torch.linalg.cholesky(A).mH + + This transform will produce equivalent results for all valid (symmetric positive definite) inputs. + + Args: + input (Tensor): the input tensor :math:`A` of size :math:`(*, n, n)` where `*` is zero or more + batch dimensions consisting of symmetric positive-definite matrices. + upper (bool, optional): flag that indicates whether to return a + upper or lower triangular matrix. Default: ``False`` + + Keyword args: + out (Tensor, optional): the output matrix + + Example:: + + >>> a = torch.randn(3, 3) + >>> a = a @ a.mT + 1e-3 # make symmetric positive-definite + >>> l = torch.cholesky(a) + >>> a + tensor([[ 2.4112, -0.7486, 1.4551], + [-0.7486, 1.3544, 0.1294], + [ 1.4551, 0.1294, 1.6724]]) + >>> l + tensor([[ 1.5528, 0.0000, 0.0000], + [-0.4821, 1.0592, 0.0000], + [ 0.9371, 0.5487, 0.7023]]) + >>> l @ l.mT + tensor([[ 2.4112, -0.7486, 1.4551], + [-0.7486, 1.3544, 0.1294], + [ 1.4551, 0.1294, 1.6724]]) + >>> a = torch.randn(3, 2, 2) # Example for batched input + >>> a = a @ a.mT + 1e-03 # make symmetric positive-definite + >>> l = torch.cholesky(a) + >>> z = l @ l.mT + >>> torch.dist(z, a) + tensor(2.3842e-07) + """ + ... 
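+# Sketch of the migration suggested in the torch.cholesky warning above: for a
+# symmetric positive-definite A (constructed here as an arbitrary example), the
+# deprecated call and torch.linalg.cholesky agree on the lower-triangular factor.
+#
+#     >>> A = torch.randn(3, 3)
+#     >>> A = A @ A.mT + 1e-3 * torch.eye(3)
+#     >>> torch.allclose(torch.cholesky(A), torch.linalg.cholesky(A))
+#     True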
+def cholesky_inverse(input: Tensor, upper: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + cholesky_inverse(L, upper=False, *, out=None) -> Tensor + + Computes the inverse of a complex Hermitian or real symmetric + positive-definite matrix given its Cholesky decomposition. + + Let :math:`A` be a complex Hermitian or real symmetric positive-definite matrix, + and :math:`L` its Cholesky decomposition such that: + + .. math:: + + A = LL^{\text{H}} + + where :math:`L^{\text{H}}` is the conjugate transpose when :math:`L` is complex, + and the transpose when :math:`L` is real-valued. + + Computes the inverse matrix :math:`A^{-1}`. + + Supports input of float, double, cfloat and cdouble dtypes. + Also supports batches of matrices, and if :math:`A` is a batch of matrices + then the output has the same batch dimensions. + + Args: + L (Tensor): tensor of shape `(*, n, n)` where `*` is zero or more batch dimensions + consisting of lower or upper triangular Cholesky decompositions of + symmetric or Hermitian positive-definite matrices. + upper (bool, optional): flag that indicates whether :math:`L` is lower triangular + or upper triangular. Default: ``False`` + + Keyword args: + out (Tensor, optional): output tensor. Ignored if `None`. Default: `None`. + + Example:: + + >>> A = torch.randn(3, 3) + >>> A = A @ A.T + torch.eye(3) * 1e-3 # Creates a symmetric positive-definite matrix + >>> L = torch.linalg.cholesky(A) # Extract Cholesky decomposition + >>> torch.cholesky_inverse(L) + tensor([[ 1.9314, 1.2251, -0.0889], + [ 1.2251, 2.4439, 0.2122], + [-0.0889, 0.2122, 0.1412]]) + >>> A.inverse() + tensor([[ 1.9314, 1.2251, -0.0889], + [ 1.2251, 2.4439, 0.2122], + [-0.0889, 0.2122, 0.1412]]) + + >>> A = torch.randn(3, 2, 2, dtype=torch.complex64) + >>> A = A @ A.mH + torch.eye(2) * 1e-3 # Batch of Hermitian positive-definite matrices + >>> L = torch.linalg.cholesky(A) + >>> torch.dist(torch.inverse(A), torch.cholesky_inverse(L)) + tensor(5.6358e-7) + """ + ... +def cholesky_solve(input: Tensor, input2: Tensor, upper: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + cholesky_solve(B, L, upper=False, *, out=None) -> Tensor + + Computes the solution of a system of linear equations with complex Hermitian + or real symmetric positive-definite lhs given its Cholesky decomposition. + + Let :math:`A` be a complex Hermitian or real symmetric positive-definite matrix, + and :math:`L` its Cholesky decomposition such that: + + .. math:: + + A = LL^{\text{H}} + + where :math:`L^{\text{H}}` is the conjugate transpose when :math:`L` is complex, + and the transpose when :math:`L` is real-valued. + + Returns the solution :math:`X` of the following linear system: + + .. math:: + + AX = B + + Supports inputs of float, double, cfloat and cdouble dtypes. + Also supports batches of matrices, and if :math:`A` or :math:`B` is a batch of matrices + then the output has the same batch dimensions. + + Args: + B (Tensor): right-hand side tensor of shape `(*, n, k)` + where :math:`*` is zero or more batch dimensions + L (Tensor): tensor of shape `(*, n, n)` where `*` is zero or more batch dimensions + consisting of lower or upper triangular Cholesky decompositions of + symmetric or Hermitian positive-definite matrices. + upper (bool, optional): flag that indicates whether :math:`L` is lower triangular + or upper triangular. Default: ``False``. + + Keyword args: + out (Tensor, optional): output tensor. Ignored if `None`. Default: `None`. 
+ + Example:: + + >>> A = torch.randn(3, 3) + >>> A = A @ A.T + torch.eye(3) * 1e-3 # Creates a symmetric positive-definite matrix + >>> L = torch.linalg.cholesky(A) # Extract Cholesky decomposition + >>> B = torch.randn(3, 2) + >>> torch.cholesky_solve(B, L) + tensor([[ -8.1625, 19.6097], + [ -5.8398, 14.2387], + [ -4.3771, 10.4173]]) + >>> A.inverse() @ B + tensor([[ -8.1626, 19.6097], + [ -5.8398, 14.2387], + [ -4.3771, 10.4173]]) + + >>> A = torch.randn(3, 2, 2, dtype=torch.complex64) + >>> A = A @ A.mH + torch.eye(2) * 1e-3 # Batch of Hermitian positive-definite matrices + >>> L = torch.linalg.cholesky(A) + >>> B = torch.randn(2, 1, dtype=torch.complex64) + >>> X = torch.cholesky_solve(B, L) + >>> torch.dist(X, A.inverse() @ B) + tensor(1.6881e-5) + """ + ... +def choose_qparams_optimized(input: Tensor, numel: _int, n_bins: _int, ratio: _float, bit_width: _int) -> Tuple[Tensor, Tensor]: ... +def chunk(input: Tensor, chunks: _int, dim: _int = 0) -> Tuple[Tensor, ...]: + r""" + chunk(input, chunks, dim=0) -> List of Tensors + + Attempts to split a tensor into the specified number of chunks. Each chunk is a view of + the input tensor. + + + .. note:: + + This function may return fewer than the specified number of chunks! + + .. seealso:: + + :func:`torch.tensor_split` a function that always returns exactly the specified number of chunks + + If the tensor size along the given dimension :attr:`dim` is divisible by :attr:`chunks`, + all returned chunks will be the same size. + If the tensor size along the given dimension :attr:`dim` is not divisible by :attr:`chunks`, + all returned chunks will be the same size, except the last one. + If such division is not possible, this function may return fewer + than the specified number of chunks. + + Arguments: + input (Tensor): the tensor to split + chunks (int): number of chunks to return + dim (int): dimension along which to split the tensor + + Example: + >>> torch.arange(11).chunk(6) + (tensor([0, 1]), + tensor([2, 3]), + tensor([4, 5]), + tensor([6, 7]), + tensor([8, 9]), + tensor([10])) + >>> torch.arange(12).chunk(6) + (tensor([0, 1]), + tensor([2, 3]), + tensor([4, 5]), + tensor([6, 7]), + tensor([8, 9]), + tensor([10, 11])) + >>> torch.arange(13).chunk(6) + (tensor([0, 1, 2]), + tensor([3, 4, 5]), + tensor([6, 7, 8]), + tensor([ 9, 10, 11]), + tensor([12])) + """ + ... +@overload +def clamp(input: Tensor, min: Optional[Tensor] = None, max: Optional[Tensor] = None, *, out: Optional[Tensor] = None) -> Tensor: + r""" + clamp(input, min=None, max=None, *, out=None) -> Tensor + + Clamps all elements in :attr:`input` into the range `[` :attr:`min`, :attr:`max` `]`. + Letting min_value and max_value be :attr:`min` and :attr:`max`, respectively, this returns: + + .. math:: + y_i = \min(\max(x_i, \text{min\_value}_i), \text{max\_value}_i) + + If :attr:`min` is ``None``, there is no lower bound. + Or, if :attr:`max` is ``None`` there is no upper bound. + + + .. note:: + If :attr:`min` is greater than :attr:`max` :func:`torch.clamp(..., min, max) ` + sets all elements in :attr:`input` to the value of :attr:`max`. + + Args: + input (Tensor): the input tensor. + min (Number or Tensor, optional): lower-bound of the range to be clamped to + max (Number or Tensor, optional): upper-bound of the range to be clamped to + + Keyword args: + out (Tensor, optional): the output tensor. 
+ + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-1.7120, 0.1734, -0.0478, -0.0922]) + >>> torch.clamp(a, min=-0.5, max=0.5) + tensor([-0.5000, 0.1734, -0.0478, -0.0922]) + + >>> min = torch.linspace(-1, 1, steps=4) + >>> torch.clamp(a, min=min) + tensor([-1.0000, 0.1734, 0.3333, 1.0000]) + """ + ... +@overload +def clamp(input: Tensor, min: Optional[Union[Number, _complex]] = None, max: Optional[Union[Number, _complex]] = None, *, out: Optional[Tensor] = None) -> Tensor: + r""" + clamp(input, min=None, max=None, *, out=None) -> Tensor + + Clamps all elements in :attr:`input` into the range `[` :attr:`min`, :attr:`max` `]`. + Letting min_value and max_value be :attr:`min` and :attr:`max`, respectively, this returns: + + .. math:: + y_i = \min(\max(x_i, \text{min\_value}_i), \text{max\_value}_i) + + If :attr:`min` is ``None``, there is no lower bound. + Or, if :attr:`max` is ``None`` there is no upper bound. + + + .. note:: + If :attr:`min` is greater than :attr:`max` :func:`torch.clamp(..., min, max) ` + sets all elements in :attr:`input` to the value of :attr:`max`. + + Args: + input (Tensor): the input tensor. + min (Number or Tensor, optional): lower-bound of the range to be clamped to + max (Number or Tensor, optional): upper-bound of the range to be clamped to + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-1.7120, 0.1734, -0.0478, -0.0922]) + >>> torch.clamp(a, min=-0.5, max=0.5) + tensor([-0.5000, 0.1734, -0.0478, -0.0922]) + + >>> min = torch.linspace(-1, 1, steps=4) + >>> torch.clamp(a, min=min) + tensor([-1.0000, 0.1734, 0.3333, 1.0000]) + """ + ... +@overload +def clamp_(input: Tensor, min: Optional[Tensor] = None, max: Optional[Tensor] = None) -> Tensor: ... +@overload +def clamp_(input: Tensor, min: Optional[Union[Number, _complex]] = None, max: Optional[Union[Number, _complex]] = None) -> Tensor: ... +@overload +def clamp_max(input: Tensor, max: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +@overload +def clamp_max(input: Tensor, max: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: ... +@overload +def clamp_max_(input: Tensor, max: Tensor) -> Tensor: ... +@overload +def clamp_max_(input: Tensor, max: Union[Number, _complex]) -> Tensor: ... +@overload +def clamp_min(input: Tensor, min: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +@overload +def clamp_min(input: Tensor, min: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: ... +@overload +def clamp_min_(input: Tensor, min: Tensor) -> Tensor: ... +@overload +def clamp_min_(input: Tensor, min: Union[Number, _complex]) -> Tensor: ... +@overload +def clip(input: Tensor, min: Optional[Tensor] = None, max: Optional[Tensor] = None, *, out: Optional[Tensor] = None) -> Tensor: + r""" + clip(input, min=None, max=None, *, out=None) -> Tensor + + Alias for :func:`torch.clamp`. + """ + ... +@overload +def clip(input: Tensor, min: Optional[Union[Number, _complex]] = None, max: Optional[Union[Number, _complex]] = None, *, out: Optional[Tensor] = None) -> Tensor: + r""" + clip(input, min=None, max=None, *, out=None) -> Tensor + + Alias for :func:`torch.clamp`. + """ + ... +@overload +def clip_(input: Tensor, min: Optional[Tensor] = None, max: Optional[Tensor] = None) -> Tensor: ... +@overload +def clip_(input: Tensor, min: Optional[Union[Number, _complex]] = None, max: Optional[Union[Number, _complex]] = None) -> Tensor: ... 
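+# clamp_max / clamp_min above are the undocumented one-sided counterparts of clamp;
+# a small deterministic illustration (each behaves like clamp with only max= or min= set):
+#
+#     >>> t = torch.tensor([-2.0, 0.5, 3.0])
+#     >>> torch.clamp_max(t, 1.0)             # like torch.clamp(t, max=1.0)
+#     tensor([-2.0000,  0.5000,  1.0000])
+#     >>> torch.clamp_min(t, 0.0)             # like torch.clamp(t, min=0.0)
+#     tensor([0.0000, 0.5000, 3.0000])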
+def clone(input: Tensor, *, memory_format: Optional[memory_format] = None) -> Tensor: + r""" + clone(input, *, memory_format=torch.preserve_format) -> Tensor + + Returns a copy of :attr:`input`. + + .. note:: + + This function is differentiable, so gradients will flow back from the + result of this operation to :attr:`input`. To create a tensor without an + autograd relationship to :attr:`input` see :meth:`~Tensor.detach`. + + Args: + input (Tensor): the input tensor. + + Keyword args: + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned tensor. Default: ``torch.preserve_format``. + """ + ... +def col_indices_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.col_indices`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def column_stack(tensors: Union[Tuple[Tensor, ...], List[Tensor]], *, out: Optional[Tensor] = None) -> Tensor: + r""" + column_stack(tensors, *, out=None) -> Tensor + + Creates a new tensor by horizontally stacking the tensors in :attr:`tensors`. + + Equivalent to ``torch.hstack(tensors)``, except each zero or one dimensional tensor ``t`` + in :attr:`tensors` is first reshaped into a ``(t.numel(), 1)`` column before being stacked horizontally. + + Args: + tensors (sequence of Tensors): sequence of tensors to concatenate + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([1, 2, 3]) + >>> b = torch.tensor([4, 5, 6]) + >>> torch.column_stack((a, b)) + tensor([[1, 4], + [2, 5], + [3, 6]]) + >>> a = torch.arange(5) + >>> b = torch.arange(10).reshape(5, 2) + >>> torch.column_stack((a, b, b)) + tensor([[0, 0, 1, 0, 1], + [1, 2, 3, 2, 3], + [2, 4, 5, 4, 5], + [3, 6, 7, 6, 7], + [4, 8, 9, 8, 9]]) + """ + ... +def combinations(input: Tensor, r: _int = 2, with_replacement: _bool = False) -> Tensor: + r""" + combinations(input, r=2, with_replacement=False) -> seq + + Compute combinations of length :math:`r` of the given tensor. The behavior is similar to + python's `itertools.combinations` when `with_replacement` is set to `False`, and + `itertools.combinations_with_replacement` when `with_replacement` is set to `True`. + + Arguments: + input (Tensor): 1D vector. + r (int, optional): number of elements to combine + with_replacement (bool, optional): whether to allow duplication in combination + + Returns: + Tensor: A tensor equivalent to converting all the input tensors into lists, do + `itertools.combinations` or `itertools.combinations_with_replacement` on these + lists, and finally convert the resulting list into tensor. + + Example:: + + >>> a = [1, 2, 3] + >>> list(itertools.combinations(a, r=2)) + [(1, 2), (1, 3), (2, 3)] + >>> list(itertools.combinations(a, r=3)) + [(1, 2, 3)] + >>> list(itertools.combinations_with_replacement(a, r=2)) + [(1, 1), (1, 2), (1, 3), (2, 2), (2, 3), (3, 3)] + >>> tensor_a = torch.tensor(a) + >>> torch.combinations(tensor_a) + tensor([[1, 2], + [1, 3], + [2, 3]]) + >>> torch.combinations(tensor_a, r=3) + tensor([[1, 2, 3]]) + >>> torch.combinations(tensor_a, with_replacement=True) + tensor([[1, 1], + [1, 2], + [1, 3], + [2, 2], + [2, 3], + [3, 3]]) + """ + ... +def complex(real: Tensor, imag: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + complex(real, imag, *, out=None) -> Tensor + + Constructs a complex tensor with its real part equal to :attr:`real` and its + imaginary part equal to :attr:`imag`. 
+ + Args: + real (Tensor): The real part of the complex tensor. Must be half, float or double. + imag (Tensor): The imaginary part of the complex tensor. Must be same dtype + as :attr:`real`. + + Keyword args: + out (Tensor): If the inputs are ``torch.float32``, must be + ``torch.complex64``. If the inputs are ``torch.float64``, must be + ``torch.complex128``. + + Example:: + + >>> real = torch.tensor([1, 2], dtype=torch.float32) + >>> imag = torch.tensor([3, 4], dtype=torch.float32) + >>> z = torch.complex(real, imag) + >>> z + tensor([(1.+3.j), (2.+4.j)]) + >>> z.dtype + torch.complex64 + """ + ... +@overload +def concat(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: _int = 0, *, out: Optional[Tensor] = None) -> Tensor: + r""" + concat(tensors, dim=0, *, out=None) -> Tensor + + Alias of :func:`torch.cat`. + """ + ... +@overload +def concat(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: Union[str, ellipsis, None], *, out: Optional[Tensor] = None) -> Tensor: + r""" + concat(tensors, dim=0, *, out=None) -> Tensor + + Alias of :func:`torch.cat`. + """ + ... +@overload +def concatenate(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: _int = 0, *, out: Optional[Tensor] = None) -> Tensor: + r""" + concatenate(tensors, axis=0, out=None) -> Tensor + + Alias of :func:`torch.cat`. + """ + ... +@overload +def concatenate(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: Union[str, ellipsis, None], *, out: Optional[Tensor] = None) -> Tensor: + r""" + concatenate(tensors, axis=0, out=None) -> Tensor + + Alias of :func:`torch.cat`. + """ + ... +def conj(input: Tensor) -> Tensor: + r""" + conj(input) -> Tensor + + Returns a view of :attr:`input` with a flipped conjugate bit. If :attr:`input` has a non-complex dtype, + this function just returns :attr:`input`. + + .. note:: + :func:`torch.conj` performs a lazy conjugation, but the actual conjugated tensor can be materialized + at any time using :func:`torch.resolve_conj`. + + .. warning:: In the future, :func:`torch.conj` may return a non-writeable view for an :attr:`input` of + non-complex dtype. It's recommended that programs not modify the tensor returned by :func:`torch.conj_physical` + when :attr:`input` is of non-complex dtype to be compatible with this change. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> x = torch.tensor([-1 + 1j, -2 + 2j, 3 - 3j]) + >>> x.is_conj() + False + >>> y = torch.conj(x) + >>> y.is_conj() + True + """ + ... +def conj_physical(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + conj_physical(input, *, out=None) -> Tensor + + Computes the element-wise conjugate of the given :attr:`input` tensor. + If :attr:`input` has a non-complex dtype, this function just returns :attr:`input`. + + .. note:: + This performs the conjugate operation regardless of the fact conjugate bit is set or not. + + .. warning:: In the future, :func:`torch.conj_physical` may return a non-writeable view for an :attr:`input` of + non-complex dtype. It's recommended that programs not modify the tensor returned by :func:`torch.conj_physical` + when :attr:`input` is of non-complex dtype to be compatible with this change. + + .. math:: + \text{out}_{i} = conj(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.conj_physical(torch.tensor([-1 + 1j, -2 + 2j, 3 - 3j])) + tensor([-1 - 1j, -2 - 2j, 3 + 3j]) + """ + ... +def conj_physical_(input: Tensor) -> Tensor: ... 
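+# The note under conj above mentions lazy conjugation; a short sketch of how the
+# conjugate bit relates to resolve_conj and conj_physical (illustrative only):
+#
+#     >>> z = torch.tensor([1 + 2j])
+#     >>> v = torch.conj(z)                   # view with the conjugate bit set
+#     >>> v.is_conj()
+#     True
+#     >>> torch.resolve_conj(v).is_conj()     # materialized copy, bit cleared
+#     False
+#     >>> torch.conj_physical(z)              # eager element-wise conjugate
+#     tensor([1.-2.j])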
+def constant_pad_nd(input: Tensor, pad: Sequence[Union[_int, SymInt]], value: Union[Number, _complex] = 0) -> Tensor: ... +@overload +def conv1d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, groups: Union[_int, SymInt] = 1) -> Tensor: ... +@overload +def conv1d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: str = "valid", dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, groups: Union[_int, SymInt] = 1) -> Tensor: ... +@overload +def conv2d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, groups: Union[_int, SymInt] = 1) -> Tensor: ... +@overload +def conv2d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: str = "valid", dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, groups: Union[_int, SymInt] = 1) -> Tensor: ... +@overload +def conv3d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, groups: Union[_int, SymInt] = 1) -> Tensor: ... +@overload +def conv3d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: str = "valid", dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, groups: Union[_int, SymInt] = 1) -> Tensor: ... +def conv_tbc(input: Tensor, weight: Tensor, bias: Tensor, pad: _int = 0) -> Tensor: ... +def conv_transpose1d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, output_padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, groups: Union[_int, SymInt] = 1, dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1) -> Tensor: ... +def conv_transpose2d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, output_padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, groups: Union[_int, SymInt] = 1, dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1) -> Tensor: ... +def conv_transpose3d(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None, stride: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1, padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, output_padding: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 0, groups: Union[_int, SymInt] = 1, dilation: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]] = 1) -> Tensor: ... 
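+# The functional conv stubs above carry no docstrings here; they follow the usual
+# torch.nn.functional shape conventions.  A minimal sketch, with sizes picked purely
+# for illustration:
+#
+#     >>> x = torch.randn(1, 3, 8, 8)                 # (N, C_in, H, W)
+#     >>> w = torch.randn(6, 3, 3, 3)                 # (C_out, C_in / groups, kH, kW)
+#     >>> torch.conv2d(x, w, padding=1).shape         # "same" spatial size for a 3x3 kernel
+#     torch.Size([1, 6, 8, 8])
+#     >>> wt = torch.randn(6, 3, 2, 2)                # transposed conv weight: (C_in, C_out / groups, kH, kW)
+#     >>> torch.conv_transpose2d(torch.randn(1, 6, 8, 8), wt, stride=2).shape
+#     torch.Size([1, 3, 16, 16])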
+def convolution(input: Tensor, weight: Tensor, bias: Optional[Tensor], stride: Sequence[Union[_int, SymInt]], padding: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], transposed: _bool, output_padding: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ... +@overload +def copysign(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + copysign(input, other, *, out=None) -> Tensor + + Create a new floating-point tensor with the magnitude of :attr:`input` and the sign of :attr:`other`, elementwise. + + .. math:: + \text{out}_{i} = \begin{cases} + -|\text{input}_{i}| & \text{if } \text{other}_{i} \leq -0.0 \\ + |\text{input}_{i}| & \text{if } \text{other}_{i} \geq 0.0 \\ + \end{cases} + + + Supports :ref:`broadcasting to a common shape `, + and integer and float inputs. + + Args: + input (Tensor): magnitudes. + other (Tensor or Number): contains value(s) whose signbit(s) are + applied to the magnitudes in :attr:`input`. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(5) + >>> a + tensor([-1.2557, -0.0026, -0.5387, 0.4740, -0.9244]) + >>> torch.copysign(a, 1) + tensor([1.2557, 0.0026, 0.5387, 0.4740, 0.9244]) + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.7079, 0.2778, -1.0249, 0.5719], + [-0.0059, -0.2600, -0.4475, -1.3948], + [ 0.3667, -0.9567, -2.5757, -0.1751], + [ 0.2046, -0.0742, 0.2998, -0.1054]]) + >>> b = torch.randn(4) + tensor([ 0.2373, 0.3120, 0.3190, -1.1128]) + >>> torch.copysign(a, b) + tensor([[ 0.7079, 0.2778, 1.0249, -0.5719], + [ 0.0059, 0.2600, 0.4475, -1.3948], + [ 0.3667, 0.9567, 2.5757, -0.1751], + [ 0.2046, 0.0742, 0.2998, -0.1054]]) + >>> a = torch.tensor([1.]) + >>> b = torch.tensor([-0.]) + >>> torch.copysign(a, b) + tensor([-1.]) + + .. note:: + copysign handles signed zeros. If the other argument has a negative zero (-0), + the corresponding output value will be negative. + """ + ... +@overload +def copysign(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + copysign(input, other, *, out=None) -> Tensor + + Create a new floating-point tensor with the magnitude of :attr:`input` and the sign of :attr:`other`, elementwise. + + .. math:: + \text{out}_{i} = \begin{cases} + -|\text{input}_{i}| & \text{if } \text{other}_{i} \leq -0.0 \\ + |\text{input}_{i}| & \text{if } \text{other}_{i} \geq 0.0 \\ + \end{cases} + + + Supports :ref:`broadcasting to a common shape `, + and integer and float inputs. + + Args: + input (Tensor): magnitudes. + other (Tensor or Number): contains value(s) whose signbit(s) are + applied to the magnitudes in :attr:`input`. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(5) + >>> a + tensor([-1.2557, -0.0026, -0.5387, 0.4740, -0.9244]) + >>> torch.copysign(a, 1) + tensor([1.2557, 0.0026, 0.5387, 0.4740, 0.9244]) + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.7079, 0.2778, -1.0249, 0.5719], + [-0.0059, -0.2600, -0.4475, -1.3948], + [ 0.3667, -0.9567, -2.5757, -0.1751], + [ 0.2046, -0.0742, 0.2998, -0.1054]]) + >>> b = torch.randn(4) + tensor([ 0.2373, 0.3120, 0.3190, -1.1128]) + >>> torch.copysign(a, b) + tensor([[ 0.7079, 0.2778, 1.0249, -0.5719], + [ 0.0059, 0.2600, 0.4475, -1.3948], + [ 0.3667, 0.9567, 2.5757, -0.1751], + [ 0.2046, 0.0742, 0.2998, -0.1054]]) + >>> a = torch.tensor([1.]) + >>> b = torch.tensor([-0.]) + >>> torch.copysign(a, b) + tensor([-1.]) + + .. note:: + copysign handles signed zeros. 
If the other argument has a negative zero (-0), + the corresponding output value will be negative. + """ + ... +def corrcoef(input: Tensor) -> Tensor: + r""" + corrcoef(input) -> Tensor + + Estimates the Pearson product-moment correlation coefficient matrix of the variables given by the :attr:`input` matrix, + where rows are the variables and columns are the observations. + + .. note:: + + The correlation coefficient matrix R is computed using the covariance matrix C as given by + :math:`R_{ij} = \frac{ C_{ij} } { \sqrt{ C_{ii} * C_{jj} } }` + + .. note:: + + Due to floating point rounding, the resulting array may not be Hermitian and its diagonal elements may not be 1. + The real and imaginary values are clipped to the interval [-1, 1] in an attempt to improve this situation. + + Args: + input (Tensor): A 2D matrix containing multiple variables and observations, or a + Scalar or 1D vector representing a single variable. + + Returns: + (Tensor) The correlation coefficient matrix of the variables. + + .. seealso:: + + :func:`torch.cov` covariance matrix. + + Example:: + + >>> x = torch.tensor([[0, 1, 2], [2, 1, 0]]) + >>> torch.corrcoef(x) + tensor([[ 1., -1.], + [-1., 1.]]) + >>> x = torch.randn(2, 4) + >>> x + tensor([[-0.2678, -0.0908, -0.3766, 0.2780], + [-0.5812, 0.1535, 0.2387, 0.2350]]) + >>> torch.corrcoef(x) + tensor([[1.0000, 0.3582], + [0.3582, 1.0000]]) + >>> torch.corrcoef(x[0]) + tensor(1.) + """ + ... +def cos(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + cos(input, *, out=None) -> Tensor + + Returns a new tensor with the cosine of the elements of :attr:`input`. + + .. math:: + \text{out}_{i} = \cos(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 1.4309, 1.2706, -0.8562, 0.9796]) + >>> torch.cos(a) + tensor([ 0.1395, 0.2957, 0.6553, 0.5574]) + """ + ... +def cos_(input: Tensor) -> Tensor: ... +def cosh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + cosh(input, *, out=None) -> Tensor + + Returns a new tensor with the hyperbolic cosine of the elements of + :attr:`input`. + + .. math:: + \text{out}_{i} = \cosh(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.1632, 1.1835, -0.6979, -0.7325]) + >>> torch.cosh(a) + tensor([ 1.0133, 1.7860, 1.2536, 1.2805]) + + .. note:: + When :attr:`input` is on the CPU, the implementation of torch.cosh may use + the Sleef library, which rounds very large results to infinity or negative + infinity. See `here `_ for details. + """ + ... +def cosh_(input: Tensor) -> Tensor: ... +def cosine_embedding_loss(input1: Tensor, input2: Tensor, target: Tensor, margin: _float = 0.0, reduction: _int = 1) -> Tensor: ... +def cosine_similarity(x1: Tensor, x2: Tensor, dim: _int = 1, eps: _float = 1e-08) -> Tensor: ... +@overload +def count_nonzero(input: Tensor, dim: Optional[_int] = None) -> Tensor: + r""" + count_nonzero(input, dim=None) -> Tensor + + Counts the number of non-zero values in the tensor :attr:`input` along the given :attr:`dim`. + If no dim is specified then all non-zeros in the tensor are counted. + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints, optional): Dim or tuple of dims along which to count non-zeros. 
+ + Example:: + + >>> x = torch.zeros(3,3) + >>> x[torch.randn(3,3) > 0.5] = 1 + >>> x + tensor([[0., 1., 1.], + [0., 0., 0.], + [0., 0., 1.]]) + >>> torch.count_nonzero(x) + tensor(3) + >>> torch.count_nonzero(x, dim=0) + tensor([0, 1, 2]) + """ + ... +@overload +def count_nonzero(input: Tensor, dim: _size) -> Tensor: + r""" + count_nonzero(input, dim=None) -> Tensor + + Counts the number of non-zero values in the tensor :attr:`input` along the given :attr:`dim`. + If no dim is specified then all non-zeros in the tensor are counted. + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints, optional): Dim or tuple of dims along which to count non-zeros. + + Example:: + + >>> x = torch.zeros(3,3) + >>> x[torch.randn(3,3) > 0.5] = 1 + >>> x + tensor([[0., 1., 1.], + [0., 0., 0.], + [0., 0., 1.]]) + >>> torch.count_nonzero(x) + tensor(3) + >>> torch.count_nonzero(x, dim=0) + tensor([0, 1, 2]) + """ + ... +def cov(input: Tensor, *, correction: _int = 1, fweights: Optional[Tensor] = None, aweights: Optional[Tensor] = None) -> Tensor: + r""" + cov(input, *, correction=1, fweights=None, aweights=None) -> Tensor + + Estimates the covariance matrix of the variables given by the :attr:`input` matrix, where rows are + the variables and columns are the observations. + + A covariance matrix is a square matrix giving the covariance of each pair of variables. The diagonal contains + the variance of each variable (covariance of a variable with itself). By definition, if :attr:`input` represents + a single variable (Scalar or 1D) then its variance is returned. + + The sample covariance of the variables :math:`x` and :math:`y` is given by: + + .. math:: + \text{cov}(x,y) = \frac{\sum^{N}_{i = 1}(x_{i} - \bar{x})(y_{i} - \bar{y})}{\max(0,~N~-~\delta N)} + + where :math:`\bar{x}` and :math:`\bar{y}` are the simple means of the :math:`x` and :math:`y` respectively, and + :math:`\delta N` is the :attr:`correction`. + + If :attr:`fweights` and/or :attr:`aweights` are provided, the weighted covariance + is calculated, which is given by: + + .. math:: + \text{cov}_w(x,y) = \frac{\sum^{N}_{i = 1}w_i(x_{i} - \mu_x^*)(y_{i} - \mu_y^*)} + {\max(0,~\sum^{N}_{i = 1}w_i~-~\frac{\sum^{N}_{i = 1}w_ia_i}{\sum^{N}_{i = 1}w_i}~\delta N)} + + where :math:`w` denotes :attr:`fweights` or :attr:`aweights` (``f`` and ``a`` for brevity) based on whichever is + provided, or :math:`w = f \times a` if both are provided, and + :math:`\mu_x^* = \frac{\sum^{N}_{i = 1}w_ix_{i} }{\sum^{N}_{i = 1}w_i}` is the weighted mean of the variable. If not + provided, ``f`` and/or ``a`` can be seen as a :math:`\mathbb{1}` vector of appropriate size. + + Args: + input (Tensor): A 2D matrix containing multiple variables and observations, or a + Scalar or 1D vector representing a single variable. + + Keyword Args: + correction (int, optional): difference between the sample size and sample degrees of freedom. + Defaults to Bessel's correction, ``correction = 1`` which returns the unbiased estimate, + even if both :attr:`fweights` and :attr:`aweights` are specified. ``correction = 0`` + will return the simple average. Defaults to ``1``. + fweights (tensor, optional): A Scalar or 1D tensor of observation vector frequencies representing the number of + times each observation should be repeated. Its numel must equal the number of columns of :attr:`input`. + Must have integral dtype. Ignored if ``None``. Defaults to ``None``. + aweights (tensor, optional): A Scalar or 1D array of observation vector weights. 
+ These relative weights are typically large for observations considered "important" and smaller for + observations considered less "important". Its numel must equal the number of columns of :attr:`input`. + Must have floating point dtype. Ignored if ``None``. Defaults to ``None``. + + Returns: + (Tensor) The covariance matrix of the variables. + + .. seealso:: + + :func:`torch.corrcoef` normalized covariance matrix. + + Example:: + >>> x = torch.tensor([[0, 2], [1, 1], [2, 0]]).T + >>> x + tensor([[0, 1, 2], + [2, 1, 0]]) + >>> torch.cov(x) + tensor([[ 1., -1.], + [-1., 1.]]) + >>> torch.cov(x, correction=0) + tensor([[ 0.6667, -0.6667], + [-0.6667, 0.6667]]) + >>> fw = torch.randint(1, 10, (3,)) + >>> fw + tensor([1, 6, 9]) + >>> aw = torch.rand(3) + >>> aw + tensor([0.4282, 0.0255, 0.4144]) + >>> torch.cov(x, fweights=fw, aweights=aw) + tensor([[ 0.4169, -0.4169], + [-0.4169, 0.4169]]) + """ + ... +def cross(input: Tensor, other: Tensor, dim: Optional[_int] = None, *, out: Optional[Tensor] = None) -> Tensor: + r""" + cross(input, other, dim=None, *, out=None) -> Tensor + + + Returns the cross product of vectors in dimension :attr:`dim` of :attr:`input` + and :attr:`other`. + + Supports input of float, double, cfloat and cdouble dtypes. Also supports batches + of vectors, for which it computes the product along the dimension :attr:`dim`. + In this case, the output has the same batch dimensions as the inputs. + + .. warning:: + If :attr:`dim` is not given, it defaults to the first dimension found + with the size 3. Note that this might be unexpected. + + This behavior is deprecated and will be changed to match that of :func:`torch.linalg.cross` + in a future release. + + .. seealso:: + :func:`torch.linalg.cross` which has dim=-1 as default. + + + Args: + input (Tensor): the input tensor. + other (Tensor): the second input tensor + dim (int, optional): the dimension to take the cross-product in. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4, 3) + >>> a + tensor([[-0.3956, 1.1455, 1.6895], + [-0.5849, 1.3672, 0.3599], + [-1.1626, 0.7180, -0.0521], + [-0.1339, 0.9902, -2.0225]]) + >>> b = torch.randn(4, 3) + >>> b + tensor([[-0.0257, -1.4725, -1.2251], + [-1.1479, -0.7005, -1.9757], + [-1.3904, 0.3726, -1.1836], + [-0.9688, -0.7153, 0.2159]]) + >>> torch.cross(a, b, dim=1) + tensor([[ 1.0844, -0.5281, 0.6120], + [-2.4490, -1.5687, 1.9792], + [-0.8304, -1.3037, 0.5650], + [-1.2329, 1.9883, 1.0551]]) + >>> torch.cross(a, b) + tensor([[ 1.0844, -0.5281, 0.6120], + [-2.4490, -1.5687, 1.9792], + [-0.8304, -1.3037, 0.5650], + [-1.2329, 1.9883, 1.0551]]) + """ + ... +def crow_indices_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.crow_indices`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +@overload +def ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: _size, target_lengths: _size, blank: _int = 0, reduction: _int = 1, zero_infinity: _bool = False) -> Tensor: ... +@overload +def ctc_loss(log_probs: Tensor, targets: Tensor, input_lengths: Tensor, target_lengths: Tensor, blank: _int = 0, reduction: _int = 1, zero_infinity: _bool = False) -> Tensor: ... +def cudnn_affine_grid_generator(theta: Tensor, N: _int, C: _int, H: _int, W: _int) -> Tensor: ... 
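+# ctc_loss above is the raw overload that takes integer reduction codes; everyday code
+# normally goes through torch.nn.functional.ctc_loss instead.  A minimal hedged sketch
+# (sizes are arbitrary; class 0 is assumed to be the blank label):
+#
+#     >>> import torch.nn.functional as F
+#     >>> T, N, C, S = 30, 2, 10, 5                   # time steps, batch, classes, target length
+#     >>> log_probs = torch.randn(T, N, C).log_softmax(2)
+#     >>> targets = torch.randint(1, C, (N, S))
+#     >>> input_lengths = torch.full((N,), T, dtype=torch.long)
+#     >>> target_lengths = torch.full((N,), S, dtype=torch.long)
+#     >>> F.ctc_loss(log_probs, targets, input_lengths, target_lengths).shape   # scalar (mean reduction)
+#     torch.Size([])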
+def cudnn_batch_norm(input: Tensor, weight: Tensor, bias: Optional[Tensor], running_mean: Optional[Tensor], running_var: Optional[Tensor], training: _bool, exponential_average_factor: _float, epsilon: _float) -> Tuple[Tensor, Tensor, Tensor, Tensor]: ... +def cudnn_convolution(input: Tensor, weight: Tensor, padding: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt], benchmark: _bool, deterministic: _bool, allow_tf32: _bool, *, out: Optional[Tensor] = None) -> Tensor: ... +def cudnn_convolution_add_relu(input: Tensor, weight: Tensor, z: Tensor, alpha: Optional[Union[Number, _complex]], bias: Optional[Tensor], stride: Sequence[Union[_int, SymInt]], padding: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ... +def cudnn_convolution_relu(input: Tensor, weight: Tensor, bias: Optional[Tensor], stride: Sequence[Union[_int, SymInt]], padding: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ... +def cudnn_convolution_transpose(input: Tensor, weight: Tensor, padding: Sequence[Union[_int, SymInt]], output_padding: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt], benchmark: _bool, deterministic: _bool, allow_tf32: _bool) -> Tensor: ... +def cudnn_grid_sampler(input: Tensor, grid: Tensor) -> Tensor: ... +def cudnn_is_acceptable(input: Tensor) -> _bool: ... +@overload +def cummax(input: Tensor, dim: _int, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.cummax: + r""" + cummax(input, dim, *, out=None) -> (Tensor, LongTensor) + Returns a namedtuple ``(values, indices)`` where ``values`` is the cumulative maximum of + elements of :attr:`input` in the dimension :attr:`dim`. And ``indices`` is the index + location of each maximum value found in the dimension :attr:`dim`. + + .. math:: + y_i = max(x_1, x_2, x_3, \dots, x_i) + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to do the operation over + + Keyword args: + out (tuple, optional): the result tuple of two output tensors (values, indices) + + Example:: + + >>> a = torch.randn(10) + >>> a + tensor([-0.3449, -1.5447, 0.0685, -1.5104, -1.1706, 0.2259, 1.4696, -1.3284, + 1.9946, -0.8209]) + >>> torch.cummax(a, dim=0) + torch.return_types.cummax( + values=tensor([-0.3449, -0.3449, 0.0685, 0.0685, 0.0685, 0.2259, 1.4696, 1.4696, + 1.9946, 1.9946]), + indices=tensor([0, 0, 2, 2, 2, 5, 6, 6, 8, 8])) + """ + ... +@overload +def cummax(input: Tensor, dim: Union[str, ellipsis, None], *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.cummax: + r""" + cummax(input, dim, *, out=None) -> (Tensor, LongTensor) + Returns a namedtuple ``(values, indices)`` where ``values`` is the cumulative maximum of + elements of :attr:`input` in the dimension :attr:`dim`. And ``indices`` is the index + location of each maximum value found in the dimension :attr:`dim`. + + .. math:: + y_i = max(x_1, x_2, x_3, \dots, x_i) + + Args: + input (Tensor): the input tensor. 
+ dim (int): the dimension to do the operation over + + Keyword args: + out (tuple, optional): the result tuple of two output tensors (values, indices) + + Example:: + + >>> a = torch.randn(10) + >>> a + tensor([-0.3449, -1.5447, 0.0685, -1.5104, -1.1706, 0.2259, 1.4696, -1.3284, + 1.9946, -0.8209]) + >>> torch.cummax(a, dim=0) + torch.return_types.cummax( + values=tensor([-0.3449, -0.3449, 0.0685, 0.0685, 0.0685, 0.2259, 1.4696, 1.4696, + 1.9946, 1.9946]), + indices=tensor([0, 0, 2, 2, 2, 5, 6, 6, 8, 8])) + """ + ... +@overload +def cummin(input: Tensor, dim: _int, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.cummin: + r""" + cummin(input, dim, *, out=None) -> (Tensor, LongTensor) + Returns a namedtuple ``(values, indices)`` where ``values`` is the cumulative minimum of + elements of :attr:`input` in the dimension :attr:`dim`. And ``indices`` is the index + location of each maximum value found in the dimension :attr:`dim`. + + .. math:: + y_i = min(x_1, x_2, x_3, \dots, x_i) + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to do the operation over + + Keyword args: + out (tuple, optional): the result tuple of two output tensors (values, indices) + + Example:: + + >>> a = torch.randn(10) + >>> a + tensor([-0.2284, -0.6628, 0.0975, 0.2680, -1.3298, -0.4220, -0.3885, 1.1762, + 0.9165, 1.6684]) + >>> torch.cummin(a, dim=0) + torch.return_types.cummin( + values=tensor([-0.2284, -0.6628, -0.6628, -0.6628, -1.3298, -1.3298, -1.3298, -1.3298, + -1.3298, -1.3298]), + indices=tensor([0, 1, 1, 1, 4, 4, 4, 4, 4, 4])) + """ + ... +@overload +def cummin(input: Tensor, dim: Union[str, ellipsis, None], *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.cummin: + r""" + cummin(input, dim, *, out=None) -> (Tensor, LongTensor) + Returns a namedtuple ``(values, indices)`` where ``values`` is the cumulative minimum of + elements of :attr:`input` in the dimension :attr:`dim`. And ``indices`` is the index + location of each maximum value found in the dimension :attr:`dim`. + + .. math:: + y_i = min(x_1, x_2, x_3, \dots, x_i) + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to do the operation over + + Keyword args: + out (tuple, optional): the result tuple of two output tensors (values, indices) + + Example:: + + >>> a = torch.randn(10) + >>> a + tensor([-0.2284, -0.6628, 0.0975, 0.2680, -1.3298, -0.4220, -0.3885, 1.1762, + 0.9165, 1.6684]) + >>> torch.cummin(a, dim=0) + torch.return_types.cummin( + values=tensor([-0.2284, -0.6628, -0.6628, -0.6628, -1.3298, -1.3298, -1.3298, -1.3298, + -1.3298, -1.3298]), + indices=tensor([0, 1, 1, 1, 4, 4, 4, 4, 4, 4])) + """ + ... +@overload +def cumprod(input: Tensor, dim: _int, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + cumprod(input, dim, *, dtype=None, out=None) -> Tensor + + Returns the cumulative product of elements of :attr:`input` in the dimension + :attr:`dim`. + + For example, if :attr:`input` is a vector of size N, the result will also be + a vector of size N, with elements. + + .. math:: + y_i = x_1 \times x_2\times x_3\times \dots \times x_i + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to do the operation over + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. 
Default: None. + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(10) + >>> a + tensor([ 0.6001, 0.2069, -0.1919, 0.9792, 0.6727, 1.0062, 0.4126, + -0.2129, -0.4206, 0.1968]) + >>> torch.cumprod(a, dim=0) + tensor([ 0.6001, 0.1241, -0.0238, -0.0233, -0.0157, -0.0158, -0.0065, + 0.0014, -0.0006, -0.0001]) + + >>> a[5] = 0.0 + >>> torch.cumprod(a, dim=0) + tensor([ 0.6001, 0.1241, -0.0238, -0.0233, -0.0157, -0.0000, -0.0000, + 0.0000, -0.0000, -0.0000]) + """ + ... +@overload +def cumprod(input: Tensor, dim: Union[str, ellipsis, None], *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + cumprod(input, dim, *, dtype=None, out=None) -> Tensor + + Returns the cumulative product of elements of :attr:`input` in the dimension + :attr:`dim`. + + For example, if :attr:`input` is a vector of size N, the result will also be + a vector of size N, with elements. + + .. math:: + y_i = x_1 \times x_2\times x_3\times \dots \times x_i + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to do the operation over + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(10) + >>> a + tensor([ 0.6001, 0.2069, -0.1919, 0.9792, 0.6727, 1.0062, 0.4126, + -0.2129, -0.4206, 0.1968]) + >>> torch.cumprod(a, dim=0) + tensor([ 0.6001, 0.1241, -0.0238, -0.0233, -0.0157, -0.0158, -0.0065, + 0.0014, -0.0006, -0.0001]) + + >>> a[5] = 0.0 + >>> torch.cumprod(a, dim=0) + tensor([ 0.6001, 0.1241, -0.0238, -0.0233, -0.0157, -0.0000, -0.0000, + 0.0000, -0.0000, -0.0000]) + """ + ... +@overload +def cumsum(input: Tensor, dim: _int, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + cumsum(input, dim, *, dtype=None, out=None) -> Tensor + + Returns the cumulative sum of elements of :attr:`input` in the dimension + :attr:`dim`. + + For example, if :attr:`input` is a vector of size N, the result will also be + a vector of size N, with elements. + + .. math:: + y_i = x_1 + x_2 + x_3 + \dots + x_i + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to do the operation over + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randint(1, 20, (10,)) + >>> a + tensor([13, 7, 3, 10, 13, 3, 15, 10, 9, 10]) + >>> torch.cumsum(a, dim=0) + tensor([13, 20, 23, 33, 46, 49, 64, 74, 83, 93]) + """ + ... +@overload +def cumsum(input: Tensor, dim: Union[str, ellipsis, None], *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + cumsum(input, dim, *, dtype=None, out=None) -> Tensor + + Returns the cumulative sum of elements of :attr:`input` in the dimension + :attr:`dim`. + + For example, if :attr:`input` is a vector of size N, the result will also be + a vector of size N, with elements. + + .. math:: + y_i = x_1 + x_2 + x_3 + \dots + x_i + + Args: + input (Tensor): the input tensor. 
+ dim (int): the dimension to do the operation over + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randint(1, 20, (10,)) + >>> a + tensor([13, 7, 3, 10, 13, 3, 15, 10, 9, 10]) + >>> torch.cumsum(a, dim=0) + tensor([13, 20, 23, 33, 46, 49, 64, 74, 83, 93]) + """ + ... +@overload +def cumulative_trapezoid(y: Tensor, x: Tensor, *, dim: _int = -1) -> Tensor: + r""" + cumulative_trapezoid(y, x=None, *, dx=None, dim=-1) -> Tensor + + Cumulatively computes the `trapezoidal rule `_ + along :attr:`dim`. By default the spacing between elements is assumed to be 1, but + :attr:`dx` can be used to specify a different constant spacing, and :attr:`x` can be + used to specify arbitrary spacing along :attr:`dim`. + + For more details, please read :func:`torch.trapezoid`. The difference between :func:`torch.trapezoid` + and this function is that, :func:`torch.trapezoid` returns a value for each integration, + where as this function returns a cumulative value for every spacing within the integration. This + is analogous to how `.sum` returns a value and `.cumsum` returns a cumulative sum. + + Arguments: + y (Tensor): Values to use when computing the trapezoidal rule. + x (Tensor): If specified, defines spacing between values as specified above. + + Keyword arguments: + dx (float): constant spacing between values. If neither :attr:`x` or :attr:`dx` + are specified then this defaults to 1. Effectively multiplies the result by its value. + dim (int): The dimension along which to compute the trapezoidal rule. + The last (inner-most) dimension by default. + + Examples:: + + >>> # Cumulatively computes the trapezoidal rule in 1D, spacing is implicitly 1. 
+ >>> y = torch.tensor([1, 5, 10]) + >>> torch.cumulative_trapezoid(y) + tensor([3., 10.5]) + + >>> # Computes the same trapezoidal rule directly up to each element to verify + >>> (1 + 5) / 2 + 3.0 + >>> (1 + 10 + 10) / 2 + 10.5 + + >>> # Cumulatively computes the trapezoidal rule in 1D with constant spacing of 2 + >>> # NOTE: the result is the same as before, but multiplied by 2 + >>> torch.cumulative_trapezoid(y, dx=2) + tensor([6., 21.]) + + >>> # Cumulatively computes the trapezoidal rule in 1D with arbitrary spacing + >>> x = torch.tensor([1, 3, 6]) + >>> torch.cumulative_trapezoid(y, x) + tensor([6., 28.5]) + + >>> # Computes the same trapezoidal rule directly up to each element to verify + >>> ((3 - 1) * (1 + 5)) / 2 + 6.0 + >>> ((3 - 1) * (1 + 5) + (6 - 3) * (5 + 10)) / 2 + 28.5 + + >>> # Cumulatively computes the trapezoidal rule for each row of a 3x3 matrix + >>> y = torch.arange(9).reshape(3, 3) + tensor([[0, 1, 2], + [3, 4, 5], + [6, 7, 8]]) + >>> torch.cumulative_trapezoid(y) + tensor([[ 0.5, 2.], + [ 3.5, 8.], + [ 6.5, 14.]]) + + >>> # Cumulatively computes the trapezoidal rule for each column of the matrix + >>> torch.cumulative_trapezoid(y, dim=0) + tensor([[ 1.5, 2.5, 3.5], + [ 6.0, 8.0, 10.0]]) + + >>> # Cumulatively computes the trapezoidal rule for each row of a 3x3 ones matrix + >>> # with the same arbitrary spacing + >>> y = torch.ones(3, 3) + >>> x = torch.tensor([1, 3, 6]) + >>> torch.cumulative_trapezoid(y, x) + tensor([[2., 5.], + [2., 5.], + [2., 5.]]) + + >>> # Cumulatively computes the trapezoidal rule for each row of a 3x3 ones matrix + >>> # with different arbitrary spacing per row + >>> y = torch.ones(3, 3) + >>> x = torch.tensor([[1, 2, 3], [1, 3, 5], [1, 4, 7]]) + >>> torch.cumulative_trapezoid(y, x) + tensor([[1., 2.], + [2., 4.], + [3., 6.]]) + """ + ... +@overload +def cumulative_trapezoid(y: Tensor, *, dx: Union[Number, _complex] = 1, dim: _int = -1) -> Tensor: + r""" + cumulative_trapezoid(y, x=None, *, dx=None, dim=-1) -> Tensor + + Cumulatively computes the `trapezoidal rule `_ + along :attr:`dim`. By default the spacing between elements is assumed to be 1, but + :attr:`dx` can be used to specify a different constant spacing, and :attr:`x` can be + used to specify arbitrary spacing along :attr:`dim`. + + For more details, please read :func:`torch.trapezoid`. The difference between :func:`torch.trapezoid` + and this function is that, :func:`torch.trapezoid` returns a value for each integration, + where as this function returns a cumulative value for every spacing within the integration. This + is analogous to how `.sum` returns a value and `.cumsum` returns a cumulative sum. + + Arguments: + y (Tensor): Values to use when computing the trapezoidal rule. + x (Tensor): If specified, defines spacing between values as specified above. + + Keyword arguments: + dx (float): constant spacing between values. If neither :attr:`x` or :attr:`dx` + are specified then this defaults to 1. Effectively multiplies the result by its value. + dim (int): The dimension along which to compute the trapezoidal rule. + The last (inner-most) dimension by default. + + Examples:: + + >>> # Cumulatively computes the trapezoidal rule in 1D, spacing is implicitly 1. 
+ >>> y = torch.tensor([1, 5, 10]) + >>> torch.cumulative_trapezoid(y) + tensor([3., 10.5]) + + >>> # Computes the same trapezoidal rule directly up to each element to verify + >>> (1 + 5) / 2 + 3.0 + >>> (1 + 10 + 10) / 2 + 10.5 + + >>> # Cumulatively computes the trapezoidal rule in 1D with constant spacing of 2 + >>> # NOTE: the result is the same as before, but multiplied by 2 + >>> torch.cumulative_trapezoid(y, dx=2) + tensor([6., 21.]) + + >>> # Cumulatively computes the trapezoidal rule in 1D with arbitrary spacing + >>> x = torch.tensor([1, 3, 6]) + >>> torch.cumulative_trapezoid(y, x) + tensor([6., 28.5]) + + >>> # Computes the same trapezoidal rule directly up to each element to verify + >>> ((3 - 1) * (1 + 5)) / 2 + 6.0 + >>> ((3 - 1) * (1 + 5) + (6 - 3) * (5 + 10)) / 2 + 28.5 + + >>> # Cumulatively computes the trapezoidal rule for each row of a 3x3 matrix + >>> y = torch.arange(9).reshape(3, 3) + tensor([[0, 1, 2], + [3, 4, 5], + [6, 7, 8]]) + >>> torch.cumulative_trapezoid(y) + tensor([[ 0.5, 2.], + [ 3.5, 8.], + [ 6.5, 14.]]) + + >>> # Cumulatively computes the trapezoidal rule for each column of the matrix + >>> torch.cumulative_trapezoid(y, dim=0) + tensor([[ 1.5, 2.5, 3.5], + [ 6.0, 8.0, 10.0]]) + + >>> # Cumulatively computes the trapezoidal rule for each row of a 3x3 ones matrix + >>> # with the same arbitrary spacing + >>> y = torch.ones(3, 3) + >>> x = torch.tensor([1, 3, 6]) + >>> torch.cumulative_trapezoid(y, x) + tensor([[2., 5.], + [2., 5.], + [2., 5.]]) + + >>> # Cumulatively computes the trapezoidal rule for each row of a 3x3 ones matrix + >>> # with different arbitrary spacing per row + >>> y = torch.ones(3, 3) + >>> x = torch.tensor([[1, 2, 3], [1, 3, 5], [1, 4, 7]]) + >>> torch.cumulative_trapezoid(y, x) + tensor([[1., 2.], + [2., 4.], + [3., 6.]]) + """ + ... +def deg2rad(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + deg2rad(input, *, out=None) -> Tensor + + Returns a new tensor with each of the elements of :attr:`input` + converted from angles in degrees to radians. + + Args: + input (Tensor): the input tensor. + + Keyword arguments: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([[180.0, -180.0], [360.0, -360.0], [90.0, -90.0]]) + >>> torch.deg2rad(a) + tensor([[ 3.1416, -3.1416], + [ 6.2832, -6.2832], + [ 1.5708, -1.5708]]) + """ + ... +def deg2rad_(input: Tensor) -> Tensor: ... +@overload +def dequantize(input: Tensor) -> Tensor: + r""" + dequantize(tensor) -> Tensor + + Returns an fp32 Tensor by dequantizing a quantized Tensor + + Args: + tensor (Tensor): A quantized Tensor + + .. function:: dequantize(tensors) -> sequence of Tensors + :noindex: + + Given a list of quantized Tensors, dequantize them and return a list of fp32 Tensors + + Args: + tensors (sequence of Tensors): A list of quantized Tensors + """ + ... +@overload +def dequantize(tensors: Union[Tuple[Tensor, ...], List[Tensor]]) -> Tuple[Tensor, ...]: + r""" + dequantize(tensor) -> Tensor + + Returns an fp32 Tensor by dequantizing a quantized Tensor + + Args: + tensor (Tensor): A quantized Tensor + + .. function:: dequantize(tensors) -> sequence of Tensors + :noindex: + + Given a list of quantized Tensors, dequantize them and return a list of fp32 Tensors + + Args: + tensors (sequence of Tensors): A list of quantized Tensors + """ + ... +def det(input: Tensor) -> Tensor: + r""" + det(input) -> Tensor + + Alias for :func:`torch.linalg.det` + """ + ... +def detach(input: Tensor) -> Tensor: ... +def detach_(input: Tensor) -> Tensor: ... 
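+# detach above is the functional spelling of Tensor.detach; a tiny sketch of the
+# autograd behaviour it implies (illustrative only):
+#
+#     >>> a = torch.ones(3, requires_grad=True)
+#     >>> b = torch.detach(a * 2)             # same values, cut out of the autograd graph
+#     >>> b.requires_grad, b.grad_fn
+#     (False, None)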
+def detach_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.detach`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def diag(input: Tensor, diagonal: _int = 0, *, out: Optional[Tensor] = None) -> Tensor: + r""" + diag(input, diagonal=0, *, out=None) -> Tensor + + - If :attr:`input` is a vector (1-D tensor), then returns a 2-D square tensor + with the elements of :attr:`input` as the diagonal. + - If :attr:`input` is a matrix (2-D tensor), then returns a 1-D tensor with + the diagonal elements of :attr:`input`. + + The argument :attr:`diagonal` controls which diagonal to consider: + + - If :attr:`diagonal` = 0, it is the main diagonal. + - If :attr:`diagonal` > 0, it is above the main diagonal. + - If :attr:`diagonal` < 0, it is below the main diagonal. + + Args: + input (Tensor): the input tensor. + diagonal (int, optional): the diagonal to consider + + Keyword args: + out (Tensor, optional): the output tensor. + + .. seealso:: + + :func:`torch.diagonal` always returns the diagonal of its input. + + :func:`torch.diagflat` always constructs a tensor with diagonal elements + specified by the input. + + Examples: + + Get the square matrix where the input vector is the diagonal:: + + >>> a = torch.randn(3) + >>> a + tensor([ 0.5950,-0.0872, 2.3298]) + >>> torch.diag(a) + tensor([[ 0.5950, 0.0000, 0.0000], + [ 0.0000,-0.0872, 0.0000], + [ 0.0000, 0.0000, 2.3298]]) + >>> torch.diag(a, 1) + tensor([[ 0.0000, 0.5950, 0.0000, 0.0000], + [ 0.0000, 0.0000,-0.0872, 0.0000], + [ 0.0000, 0.0000, 0.0000, 2.3298], + [ 0.0000, 0.0000, 0.0000, 0.0000]]) + + Get the k-th diagonal of a given matrix:: + + >>> a = torch.randn(3, 3) + >>> a + tensor([[-0.4264, 0.0255,-0.1064], + [ 0.8795,-0.2429, 0.1374], + [ 0.1029,-0.6482,-1.6300]]) + >>> torch.diag(a, 0) + tensor([-0.4264,-0.2429,-1.6300]) + >>> torch.diag(a, 1) + tensor([ 0.0255, 0.1374]) + """ + ... +def diag_embed(input: Tensor, offset: _int = 0, dim1: _int = -2, dim2: _int = -1) -> Tensor: + r""" + diag_embed(input, offset=0, dim1=-2, dim2=-1) -> Tensor + + Creates a tensor whose diagonals of certain 2D planes (specified by + :attr:`dim1` and :attr:`dim2`) are filled by :attr:`input`. + To facilitate creating batched diagonal matrices, the 2D planes formed by + the last two dimensions of the returned tensor are chosen by default. + + The argument :attr:`offset` controls which diagonal to consider: + + - If :attr:`offset` = 0, it is the main diagonal. + - If :attr:`offset` > 0, it is above the main diagonal. + - If :attr:`offset` < 0, it is below the main diagonal. + + The size of the new matrix will be calculated to make the specified diagonal + of the size of the last input dimension. + Note that for :attr:`offset` other than :math:`0`, the order of :attr:`dim1` + and :attr:`dim2` matters. Exchanging them is equivalent to changing the + sign of :attr:`offset`. + + Applying :meth:`torch.diagonal` to the output of this function with + the same arguments yields a matrix identical to input. However, + :meth:`torch.diagonal` has different default dimensions, so those + need to be explicitly specified. + + Args: + input (Tensor): the input tensor. Must be at least 1-dimensional. + offset (int, optional): which diagonal to consider. Default: 0 + (main diagonal). + dim1 (int, optional): first dimension with respect to which to + take diagonal. Default: -2. + dim2 (int, optional): second dimension with respect to which to + take diagonal. Default: -1. 
+ + Example:: + + >>> a = torch.randn(2, 3) + >>> torch.diag_embed(a) + tensor([[[ 1.5410, 0.0000, 0.0000], + [ 0.0000, -0.2934, 0.0000], + [ 0.0000, 0.0000, -2.1788]], + + [[ 0.5684, 0.0000, 0.0000], + [ 0.0000, -1.0845, 0.0000], + [ 0.0000, 0.0000, -1.3986]]]) + + >>> torch.diag_embed(a, offset=1, dim1=0, dim2=2) + tensor([[[ 0.0000, 1.5410, 0.0000, 0.0000], + [ 0.0000, 0.5684, 0.0000, 0.0000]], + + [[ 0.0000, 0.0000, -0.2934, 0.0000], + [ 0.0000, 0.0000, -1.0845, 0.0000]], + + [[ 0.0000, 0.0000, 0.0000, -2.1788], + [ 0.0000, 0.0000, 0.0000, -1.3986]], + + [[ 0.0000, 0.0000, 0.0000, 0.0000], + [ 0.0000, 0.0000, 0.0000, 0.0000]]]) + """ + ... +def diagflat(input: Tensor, offset: _int = 0) -> Tensor: + r""" + diagflat(input, offset=0) -> Tensor + + - If :attr:`input` is a vector (1-D tensor), then returns a 2-D square tensor + with the elements of :attr:`input` as the diagonal. + - If :attr:`input` is a tensor with more than one dimension, then returns a + 2-D tensor with diagonal elements equal to a flattened :attr:`input`. + + The argument :attr:`offset` controls which diagonal to consider: + + - If :attr:`offset` = 0, it is the main diagonal. + - If :attr:`offset` > 0, it is above the main diagonal. + - If :attr:`offset` < 0, it is below the main diagonal. + + Args: + input (Tensor): the input tensor. + offset (int, optional): the diagonal to consider. Default: 0 (main + diagonal). + + Examples:: + + >>> a = torch.randn(3) + >>> a + tensor([-0.2956, -0.9068, 0.1695]) + >>> torch.diagflat(a) + tensor([[-0.2956, 0.0000, 0.0000], + [ 0.0000, -0.9068, 0.0000], + [ 0.0000, 0.0000, 0.1695]]) + >>> torch.diagflat(a, 1) + tensor([[ 0.0000, -0.2956, 0.0000, 0.0000], + [ 0.0000, 0.0000, -0.9068, 0.0000], + [ 0.0000, 0.0000, 0.0000, 0.1695], + [ 0.0000, 0.0000, 0.0000, 0.0000]]) + + >>> a = torch.randn(2, 2) + >>> a + tensor([[ 0.2094, -0.3018], + [-0.1516, 1.9342]]) + >>> torch.diagflat(a) + tensor([[ 0.2094, 0.0000, 0.0000, 0.0000], + [ 0.0000, -0.3018, 0.0000, 0.0000], + [ 0.0000, 0.0000, -0.1516, 0.0000], + [ 0.0000, 0.0000, 0.0000, 1.9342]]) + """ + ... +@overload +def diagonal(input: Tensor, offset: _int = 0, dim1: _int = 0, dim2: _int = 1) -> Tensor: + r""" + diagonal(input, offset=0, dim1=0, dim2=1) -> Tensor + + Returns a partial view of :attr:`input` with the its diagonal elements + with respect to :attr:`dim1` and :attr:`dim2` appended as a dimension + at the end of the shape. + + The argument :attr:`offset` controls which diagonal to consider: + + - If :attr:`offset` = 0, it is the main diagonal. + - If :attr:`offset` > 0, it is above the main diagonal. + - If :attr:`offset` < 0, it is below the main diagonal. + + Applying :meth:`torch.diag_embed` to the output of this function with + the same arguments yields a diagonal matrix with the diagonal entries + of the input. However, :meth:`torch.diag_embed` has different default + dimensions, so those need to be explicitly specified. + + Args: + input (Tensor): the input tensor. Must be at least 2-dimensional. + offset (int, optional): which diagonal to consider. Default: 0 + (main diagonal). + dim1 (int, optional): first dimension with respect to which to + take diagonal. Default: 0. + dim2 (int, optional): second dimension with respect to which to + take diagonal. Default: 1. + + .. note:: To take a batch diagonal, pass in dim1=-2, dim2=-1. 
+ + Examples:: + + >>> a = torch.randn(3, 3) + >>> a + tensor([[-1.0854, 1.1431, -0.1752], + [ 0.8536, -0.0905, 0.0360], + [ 0.6927, -0.3735, -0.4945]]) + + + >>> torch.diagonal(a, 0) + tensor([-1.0854, -0.0905, -0.4945]) + + + >>> torch.diagonal(a, 1) + tensor([ 1.1431, 0.0360]) + + + >>> x = torch.randn(2, 5, 4, 2) + >>> torch.diagonal(x, offset=-1, dim1=1, dim2=2) + tensor([[[-1.2631, 0.3755, -1.5977, -1.8172], + [-1.1065, 1.0401, -0.2235, -0.7938]], + + [[-1.7325, -0.3081, 0.6166, 0.2335], + [ 1.0500, 0.7336, -0.3836, -1.1015]]]) + """ + ... +@overload +def diagonal(input: Tensor, *, outdim: Union[str, ellipsis, None], dim1: Union[str, ellipsis, None], dim2: Union[str, ellipsis, None], offset: _int = 0) -> Tensor: + r""" + diagonal(input, offset=0, dim1=0, dim2=1) -> Tensor + + Returns a partial view of :attr:`input` with the its diagonal elements + with respect to :attr:`dim1` and :attr:`dim2` appended as a dimension + at the end of the shape. + + The argument :attr:`offset` controls which diagonal to consider: + + - If :attr:`offset` = 0, it is the main diagonal. + - If :attr:`offset` > 0, it is above the main diagonal. + - If :attr:`offset` < 0, it is below the main diagonal. + + Applying :meth:`torch.diag_embed` to the output of this function with + the same arguments yields a diagonal matrix with the diagonal entries + of the input. However, :meth:`torch.diag_embed` has different default + dimensions, so those need to be explicitly specified. + + Args: + input (Tensor): the input tensor. Must be at least 2-dimensional. + offset (int, optional): which diagonal to consider. Default: 0 + (main diagonal). + dim1 (int, optional): first dimension with respect to which to + take diagonal. Default: 0. + dim2 (int, optional): second dimension with respect to which to + take diagonal. Default: 1. + + .. note:: To take a batch diagonal, pass in dim1=-2, dim2=-1. + + Examples:: + + >>> a = torch.randn(3, 3) + >>> a + tensor([[-1.0854, 1.1431, -0.1752], + [ 0.8536, -0.0905, 0.0360], + [ 0.6927, -0.3735, -0.4945]]) + + + >>> torch.diagonal(a, 0) + tensor([-1.0854, -0.0905, -0.4945]) + + + >>> torch.diagonal(a, 1) + tensor([ 1.1431, 0.0360]) + + + >>> x = torch.randn(2, 5, 4, 2) + >>> torch.diagonal(x, offset=-1, dim1=1, dim2=2) + tensor([[[-1.2631, 0.3755, -1.5977, -1.8172], + [-1.1065, 1.0401, -0.2235, -0.7938]], + + [[-1.7325, -0.3081, 0.6166, 0.2335], + [ 1.0500, 0.7336, -0.3836, -1.1015]]]) + """ + ... +def diagonal_copy(input: Tensor, offset: _int = 0, dim1: _int = 0, dim2: _int = 1, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.diagonal`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def diagonal_scatter(input: Tensor, src: Tensor, offset: _int = 0, dim1: _int = 0, dim2: _int = 1) -> Tensor: + r""" + diagonal_scatter(input, src, offset=0, dim1=0, dim2=1) -> Tensor + + Embeds the values of the :attr:`src` tensor into :attr:`input` along + the diagonal elements of :attr:`input`, with respect to :attr:`dim1` + and :attr:`dim2`. + + This function returns a tensor with fresh storage; it does not + return a view. + + The argument :attr:`offset` controls which diagonal to consider: + + - If :attr:`offset` = 0, it is the main diagonal. + - If :attr:`offset` > 0, it is above the main diagonal. + - If :attr:`offset` < 0, it is below the main diagonal. + + Args: + input (Tensor): the input tensor. Must be at least 2-dimensional. + src (Tensor): the tensor to embed into :attr:`input`. 
+ offset (int, optional): which diagonal to consider. Default: 0 + (main diagonal). + dim1 (int, optional): first dimension with respect to which to + take diagonal. Default: 0. + dim2 (int, optional): second dimension with respect to which to + take diagonal. Default: 1. + + .. note:: + + :attr:`src` must be of the proper size in order to be embedded + into :attr:`input`. Specifically, it should have the same shape as + ``torch.diagonal(input, offset, dim1, dim2)`` + + Examples:: + + >>> a = torch.zeros(3, 3) + >>> a + tensor([[0., 0., 0.], + [0., 0., 0.], + [0., 0., 0.]]) + + >>> torch.diagonal_scatter(a, torch.ones(3), 0) + tensor([[1., 0., 0.], + [0., 1., 0.], + [0., 0., 1.]]) + + >>> torch.diagonal_scatter(a, torch.ones(2), 1) + tensor([[0., 1., 0.], + [0., 0., 1.], + [0., 0., 0.]]) + """ + ... +def diff(input: Tensor, n: _int = 1, dim: _int = -1, prepend: Optional[Tensor] = None, append: Optional[Tensor] = None, *, out: Optional[Tensor] = None) -> Tensor: + r""" + diff(input, n=1, dim=-1, prepend=None, append=None) -> Tensor + + Computes the n-th forward difference along the given dimension. + + The first-order differences are given by `out[i] = input[i + 1] - input[i]`. Higher-order + differences are calculated by using :func:`torch.diff` recursively. + + Args: + input (Tensor): the tensor to compute the differences on + n (int, optional): the number of times to recursively compute the difference + dim (int, optional): the dimension to compute the difference along. + Default is the last dimension. + prepend, append (Tensor, optional): values to prepend or append to + :attr:`input` along :attr:`dim` before computing the difference. + Their dimensions must be equivalent to that of input, and their shapes + must match input's shape except on :attr:`dim`. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([1, 3, 2]) + >>> torch.diff(a) + tensor([ 2, -1]) + >>> b = torch.tensor([4, 5]) + >>> torch.diff(a, append=b) + tensor([ 2, -1, 2, 1]) + >>> c = torch.tensor([[1, 2, 3], [3, 4, 5]]) + >>> torch.diff(c, dim=0) + tensor([[2, 2, 2]]) + >>> torch.diff(c, dim=1) + tensor([[1, 1], + [1, 1]]) + """ + ... +def digamma(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + digamma(input, *, out=None) -> Tensor + + Alias for :func:`torch.special.digamma`. + """ + ... +def dist(input: Tensor, other: Tensor, p: Union[Number, _complex] = 2) -> Tensor: + r""" + dist(input, other, p=2) -> Tensor + + Returns the p-norm of (:attr:`input` - :attr:`other`) + + The shapes of :attr:`input` and :attr:`other` must be + :ref:`broadcastable `. + + Args: + input (Tensor): the input tensor. + other (Tensor): the Right-hand-side input tensor + p (float, optional): the norm to be computed + + Example:: + + >>> x = torch.randn(4) + >>> x + tensor([-1.5393, -0.8675, 0.5916, 1.6321]) + >>> y = torch.randn(4) + >>> y + tensor([ 0.0967, -1.0511, 0.6295, 0.8360]) + >>> torch.dist(x, y, 3.5) + tensor(1.6727) + >>> torch.dist(x, y, 3) + tensor(1.6973) + >>> torch.dist(x, y, 0) + tensor(4.) + >>> torch.dist(x, y, 1) + tensor(2.6537) + """ + ... +def div(input: Union[Tensor, Number], other: Union[Tensor, Number], *, rounding_mode: Optional[str] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + div(input, other, *, rounding_mode=None, out=None) -> Tensor + + Divides each element of the input ``input`` by the corresponding element of + :attr:`other`. + + .. math:: + \text{out}_i = \frac{\text{input}_i}{\text{other}_i} + + .. 
note:: + By default, this performs a "true" division like Python 3. + See the :attr:`rounding_mode` argument for floor division. + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer, float, and complex inputs. + Always promotes integer types to the default scalar type. + + Args: + input (Tensor): the dividend + other (Tensor or Number): the divisor + + Keyword args: + rounding_mode (str, optional): Type of rounding applied to the result: + + * None - default behavior. Performs no rounding and, if both :attr:`input` and + :attr:`other` are integer types, promotes the inputs to the default scalar type. + Equivalent to true division in Python (the ``/`` operator) and NumPy's ``np.true_divide``. + * ``"trunc"`` - rounds the results of the division towards zero. + Equivalent to C-style integer division. + * ``"floor"`` - rounds the results of the division down. + Equivalent to floor division in Python (the ``//`` operator) and NumPy's ``np.floor_divide``. + + out (Tensor, optional): the output tensor. + + Examples:: + + >>> x = torch.tensor([ 0.3810, 1.2774, -0.2972, -0.3719, 0.4637]) + >>> torch.div(x, 0.5) + tensor([ 0.7620, 2.5548, -0.5944, -0.7438, 0.9274]) + + >>> a = torch.tensor([[-0.3711, -1.9353, -0.4605, -0.2917], + ... [ 0.1815, -1.0111, 0.9805, -1.5923], + ... [ 0.1062, 1.4581, 0.7759, -1.2344], + ... [-0.1830, -0.0313, 1.1908, -1.4757]]) + >>> b = torch.tensor([ 0.8032, 0.2930, -0.8113, -0.2308]) + >>> torch.div(a, b) + tensor([[-0.4620, -6.6051, 0.5676, 1.2639], + [ 0.2260, -3.4509, -1.2086, 6.8990], + [ 0.1322, 4.9764, -0.9564, 5.3484], + [-0.2278, -0.1068, -1.4678, 6.3938]]) + + >>> torch.div(a, b, rounding_mode='trunc') + tensor([[-0., -6., 0., 1.], + [ 0., -3., -1., 6.], + [ 0., 4., -0., 5.], + [-0., -0., -1., 6.]]) + + >>> torch.div(a, b, rounding_mode='floor') + tensor([[-1., -7., 0., 1.], + [ 0., -4., -2., 6.], + [ 0., 4., -1., 5.], + [-1., -1., -2., 6.]]) + """ + ... +@overload +def divide(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + divide(input, other, *, rounding_mode=None, out=None) -> Tensor + + Alias for :func:`torch.div`. + """ + ... +@overload +def divide(input: Tensor, other: Tensor, *, rounding_mode: Optional[str], out: Optional[Tensor] = None) -> Tensor: + r""" + divide(input, other, *, rounding_mode=None, out=None) -> Tensor + + Alias for :func:`torch.div`. + """ + ... +@overload +def divide(input: Tensor, other: Union[Number, _complex], *, rounding_mode: Optional[str]) -> Tensor: + r""" + divide(input, other, *, rounding_mode=None, out=None) -> Tensor + + Alias for :func:`torch.div`. + """ + ... +@overload +def divide(input: Tensor, other: Union[Number, _complex]) -> Tensor: + r""" + divide(input, other, *, rounding_mode=None, out=None) -> Tensor + + Alias for :func:`torch.div`. + """ + ... +def dot(input: Tensor, tensor: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + dot(input, other, *, out=None) -> Tensor + + Computes the dot product of two 1D tensors. + + .. note:: + + Unlike NumPy's dot, torch.dot intentionally only supports computing the dot product + of two 1D tensors with the same number of elements. + + Args: + input (Tensor): first tensor in the dot product, must be 1D. + other (Tensor): second tensor in the dot product, must be 1D. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.dot(torch.tensor([2, 3]), torch.tensor([2, 1])) + tensor(7) + """ + ... +def dropout(input: Tensor, p: _float, train: _bool) -> Tensor: ... 
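+# Hedged sketch of the rounding_mode keyword documented for torch.div above: on a
+# negative quotient, "trunc" rounds toward zero while "floor" rounds toward
+# negative infinity, so the two modes can differ by one.
+#
+#   >>> torch.div(torch.tensor([-7.0]), torch.tensor([2.0]), rounding_mode='trunc')
+#   tensor([-3.])
+#   >>> torch.div(torch.tensor([-7.0]), torch.tensor([2.0]), rounding_mode='floor')
+#   tensor([-4.])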
+def dropout_(input: Tensor, p: _float, train: _bool) -> Tensor: ... +def dsmm(input: Tensor, mat2: Tensor) -> Tensor: ... +@overload +def dsplit(input: Tensor, sections: _int) -> Tuple[Tensor, ...]: + r""" + dsplit(input, indices_or_sections) -> List of Tensors + + Splits :attr:`input`, a tensor with three or more dimensions, into multiple tensors + depthwise according to :attr:`indices_or_sections`. Each split is a view of + :attr:`input`. + + This is equivalent to calling torch.tensor_split(input, indices_or_sections, dim=2) + (the split dimension is 2), except that if :attr:`indices_or_sections` is an integer + it must evenly divide the split dimension or a runtime error will be thrown. + + This function is based on NumPy's :func:`numpy.dsplit`. + + Args: + input (Tensor): tensor to split. + indices_or_sections (int or list or tuple of ints): See argument in :func:`torch.tensor_split`. + + Example:: + >>> t = torch.arange(16.0).reshape(2, 2, 4) + >>> t + tensor([[[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.]], + [[ 8., 9., 10., 11.], + [12., 13., 14., 15.]]]) + >>> torch.dsplit(t, 2) + (tensor([[[ 0., 1.], + [ 4., 5.]], + [[ 8., 9.], + [12., 13.]]]), + tensor([[[ 2., 3.], + [ 6., 7.]], + [[10., 11.], + [14., 15.]]])) + + >>> torch.dsplit(t, [3, 6]) + (tensor([[[ 0., 1., 2.], + [ 4., 5., 6.]], + [[ 8., 9., 10.], + [12., 13., 14.]]]), + tensor([[[ 3.], + [ 7.]], + [[11.], + [15.]]]), + tensor([], size=(2, 2, 0))) + """ + ... +@overload +def dsplit(input: Tensor, indices: _size) -> Tuple[Tensor, ...]: + r""" + dsplit(input, indices_or_sections) -> List of Tensors + + Splits :attr:`input`, a tensor with three or more dimensions, into multiple tensors + depthwise according to :attr:`indices_or_sections`. Each split is a view of + :attr:`input`. + + This is equivalent to calling torch.tensor_split(input, indices_or_sections, dim=2) + (the split dimension is 2), except that if :attr:`indices_or_sections` is an integer + it must evenly divide the split dimension or a runtime error will be thrown. + + This function is based on NumPy's :func:`numpy.dsplit`. + + Args: + input (Tensor): tensor to split. + indices_or_sections (int or list or tuple of ints): See argument in :func:`torch.tensor_split`. + + Example:: + >>> t = torch.arange(16.0).reshape(2, 2, 4) + >>> t + tensor([[[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.]], + [[ 8., 9., 10., 11.], + [12., 13., 14., 15.]]]) + >>> torch.dsplit(t, 2) + (tensor([[[ 0., 1.], + [ 4., 5.]], + [[ 8., 9.], + [12., 13.]]]), + tensor([[[ 2., 3.], + [ 6., 7.]], + [[10., 11.], + [14., 15.]]])) + + >>> torch.dsplit(t, [3, 6]) + (tensor([[[ 0., 1., 2.], + [ 4., 5., 6.]], + [[ 8., 9., 10.], + [12., 13., 14.]]]), + tensor([[[ 3.], + [ 7.]], + [[11.], + [15.]]]), + tensor([], size=(2, 2, 0))) + """ + ... +def dstack(tensors: Union[Tuple[Tensor, ...], List[Tensor]], *, out: Optional[Tensor] = None) -> Tensor: + r""" + dstack(tensors, *, out=None) -> Tensor + + Stack tensors in sequence depthwise (along third axis). + + This is equivalent to concatenation along the third axis after 1-D and 2-D tensors have been reshaped by :func:`torch.atleast_3d`. + + Args: + tensors (sequence of Tensors): sequence of tensors to concatenate + + Keyword args: + out (Tensor, optional): the output tensor. 
+ + Example:: + + >>> a = torch.tensor([1, 2, 3]) + >>> b = torch.tensor([4, 5, 6]) + >>> torch.dstack((a,b)) + tensor([[[1, 4], + [2, 5], + [3, 6]]]) + >>> a = torch.tensor([[1],[2],[3]]) + >>> b = torch.tensor([[4],[5],[6]]) + >>> torch.dstack((a,b)) + tensor([[[1, 4]], + [[2, 5]], + [[3, 6]]]) + """ + ... +def embedding(weight: Tensor, indices: Tensor, padding_idx: Union[_int, SymInt] = -1, scale_grad_by_freq: _bool = False, sparse: _bool = False) -> Tensor: ... +@overload +def embedding_bag(weight: Tensor, indices: Tensor, offsets: Tensor, scale_grad_by_freq: _bool, mode: _int, sparse: _bool, per_sample_weights: Optional[Tensor], include_last_offset: _bool, padding_idx: Optional[_int]) -> Tuple[Tensor, Tensor, Tensor, Tensor]: ... +@overload +def embedding_bag(weight: Tensor, indices: Tensor, offsets: Tensor, scale_grad_by_freq: _bool = False, mode: _int = 0, sparse: _bool = False, per_sample_weights: Optional[Tensor] = None, include_last_offset: _bool = False) -> Tuple[Tensor, Tensor, Tensor, Tensor]: ... +def embedding_renorm_(input: Tensor, indices: Tensor, max_norm: _float, norm_type: _float) -> Tensor: ... +@overload +def empty(size: Sequence[Union[_int, SymInt]], *, memory_format: Optional[memory_format] = None, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + empty(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False, memory_format=torch.contiguous_format) -> Tensor + + Returns a tensor filled with uninitialized data. The shape of the tensor is + defined by the variable argument :attr:`size`. + + .. note:: + If :func:`torch.use_deterministic_algorithms()` and + :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to + ``True``, the output tensor is initialized to prevent any possible + nondeterministic behavior from using the data as an input to an operation. + Floating point and complex tensors are filled with NaN, and integer tensors + are filled with the maximum value. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.contiguous_format``. 
+ + Example:: + + >>> torch.empty((2,3), dtype=torch.int64) + tensor([[ 9.4064e+13, 2.8000e+01, 9.3493e+13], + [ 7.5751e+18, 7.1428e+18, 7.5955e+18]]) + """ + ... +@overload +def empty(*size: _int, memory_format: Optional[memory_format] = None, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + empty(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False, memory_format=torch.contiguous_format) -> Tensor + + Returns a tensor filled with uninitialized data. The shape of the tensor is + defined by the variable argument :attr:`size`. + + .. note:: + If :func:`torch.use_deterministic_algorithms()` and + :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to + ``True``, the output tensor is initialized to prevent any possible + nondeterministic behavior from using the data as an input to an operation. + Floating point and complex tensors are filled with NaN, and integer tensors + are filled with the maximum value. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.contiguous_format``. + + Example:: + + >>> torch.empty((2,3), dtype=torch.int64) + tensor([[ 9.4064e+13, 2.8000e+01, 9.3493e+13], + [ 7.5751e+18, 7.1428e+18, 7.5955e+18]]) + """ + ... +@overload +def empty(size: _size, *, names: Optional[Sequence[Union[str, ellipsis, None]]], memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + empty(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False, memory_format=torch.contiguous_format) -> Tensor + + Returns a tensor filled with uninitialized data. The shape of the tensor is + defined by the variable argument :attr:`size`. + + .. note:: + If :func:`torch.use_deterministic_algorithms()` and + :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to + ``True``, the output tensor is initialized to prevent any possible + nondeterministic behavior from using the data as an input to an operation. 
+ Floating point and complex tensors are filled with NaN, and integer tensors + are filled with the maximum value. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.contiguous_format``. + + Example:: + + >>> torch.empty((2,3), dtype=torch.int64) + tensor([[ 9.4064e+13, 2.8000e+01, 9.3493e+13], + [ 7.5751e+18, 7.1428e+18, 7.5955e+18]]) + """ + ... +@overload +def empty(*size: _int, names: Optional[Sequence[Union[str, ellipsis, None]]], memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + empty(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False, memory_format=torch.contiguous_format) -> Tensor + + Returns a tensor filled with uninitialized data. The shape of the tensor is + defined by the variable argument :attr:`size`. + + .. note:: + If :func:`torch.use_deterministic_algorithms()` and + :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to + ``True``, the output tensor is initialized to prevent any possible + nondeterministic behavior from using the data as an input to an operation. + Floating point and complex tensors are filled with NaN, and integer tensors + are filled with the maximum value. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. 
+ pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.contiguous_format``. + + Example:: + + >>> torch.empty((2,3), dtype=torch.int64) + tensor([[ 9.4064e+13, 2.8000e+01, 9.3493e+13], + [ 7.5751e+18, 7.1428e+18, 7.5955e+18]]) + """ + ... +def empty_like(input: Tensor, *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + empty_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor + + Returns an uninitialized tensor with the same size as :attr:`input`. + ``torch.empty_like(input)`` is equivalent to + ``torch.empty(input.size(), dtype=input.dtype, layout=input.layout, device=input.device)``. + + .. note:: + If :func:`torch.use_deterministic_algorithms()` and + :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to + ``True``, the output tensor is initialized to prevent any possible + nondeterministic behavior from using the data as an input to an operation. + Floating point and complex tensors are filled with NaN, and integer tensors + are filled with the maximum value. + + Args: + input (Tensor): the size of :attr:`input` will determine size of the output tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor. + Default: if ``None``, defaults to the dtype of :attr:`input`. + layout (:class:`torch.layout`, optional): the desired layout of returned tensor. + Default: if ``None``, defaults to the layout of :attr:`input`. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, defaults to the device of :attr:`input`. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + + Example:: + + >>> a=torch.empty((2,3), dtype=torch.int32, device = 'cuda') + >>> torch.empty_like(a) + tensor([[0, 0, 0], + [0, 0, 0]], device='cuda:0', dtype=torch.int32) + """ + ... +def empty_permuted(size: Sequence[Union[_int, SymInt]], physical_layout: _size, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + empty_permuted(size, physical_layout, *, dtype=None, layout=None, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Creates an uninitialized, non-overlapping and dense tensor with the + specified :attr:`size`, with :attr:`physical_layout` specifying how the + dimensions are physically laid out in memory (each logical dimension is listed + from outermost to innermost). :attr:`physical_layout` is a generalization + of NCHW/NHWC notation: if each dimension is assigned a number according to + what order they occur in size (N=0, C=1, H=2, W=3), then NCHW is ``(0, 1, 2, 3)`` + while NHWC is ``(0, 2, 3, 1)``. 
Equivalently, the strides of the output + tensor ``t`` are such that ``t.stride(physical_layout[i]) == contiguous_strides[i]`` + (notably, this function is *not* equivalent to ``torch.empty(size).permute(physical_layout)``). + + Unlike :func:`torch.empty_strided`, this is guaranteed to produce a dense + tensor with no overlaps. If possible, prefer using this function over + :func:`torch.empty_strided` or manual use of :func:`torch.as_strided`. + + .. note:: + If :func:`torch.use_deterministic_algorithms()` and + :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to + ``True``, the output tensor is initialized to prevent any possible + nondeterministic behavior from using the data as an input to an operation. + Floating point and complex tensors are filled with NaN, and integer tensors + are filled with the maximum value. + + Args: + size (tuple of int): the shape of the output tensor + physical_layout (tuple of int): the ordering of dimensions physically in memory + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Examples: + + >>> torch.empty((2, 3, 5, 7)).stride() + (105, 35, 7, 1) + >>> torch.empty_permuted((2, 3, 5, 7), (0, 1, 2, 3)).stride() + (105, 35, 7, 1) + >>> torch.empty((2, 3, 5, 7), memory_format=torch.channels_last).stride() + (105, 1, 21, 3) + >>> torch.empty_permuted((2, 3, 5, 7), (0, 2, 3, 1)).stride() + (105, 1, 21, 3) + >>> torch.empty_permuted((2, 3, 5, 7), (0, 2, 3, 1)).dim_order() + (0, 2, 3, 1) + """ + ... +def empty_quantized(size: _size, qtensor: Tensor, *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ... +def empty_strided(size: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + empty_strided(size, stride, *, dtype=None, layout=None, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Creates a tensor with the specified :attr:`size` and :attr:`stride` and filled with undefined data. + + .. warning:: + If the constructed tensor is "overlapped" (with multiple indices referring to the same element + in memory) its behavior is undefined. + + .. 
note:: + If :func:`torch.use_deterministic_algorithms()` and + :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to + ``True``, the output tensor is initialized to prevent any possible + nondeterministic behavior from using the data as an input to an operation. + Floating point and complex tensors are filled with NaN, and integer tensors + are filled with the maximum value. + + Args: + size (tuple of int): the shape of the output tensor + stride (tuple of int): the strides of the output tensor + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> a = torch.empty_strided((2, 3), (1, 2)) + >>> a + tensor([[8.9683e-44, 4.4842e-44, 5.1239e+07], + [0.0000e+00, 0.0000e+00, 3.0705e-41]]) + >>> a.stride() + (1, 2) + >>> a.size() + torch.Size([2, 3]) + """ + ... +@overload +def eq(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + eq(input, other, *, out=None) -> Tensor + + Computes element-wise equality + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is equal to :attr:`other` and False elsewhere + + Example:: + + >>> torch.eq(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[ True, False], + [False, True]]) + """ + ... +@overload +def eq(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + eq(input, other, *, out=None) -> Tensor + + Computes element-wise equality + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is equal to :attr:`other` and False elsewhere + + Example:: + + >>> torch.eq(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[ True, False], + [False, True]]) + """ + ... +def equal(input: Tensor, other: Tensor) -> _bool: + r""" + equal(input, other) -> bool + + ``True`` if two tensors have the same size and elements, ``False`` otherwise. + + Example:: + + >>> torch.equal(torch.tensor([1, 2]), torch.tensor([1, 2])) + True + """ + ... +def erf(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + erf(input, *, out=None) -> Tensor + + Alias for :func:`torch.special.erf`. + """ + ... 
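+# Hedged sketch contrasting torch.eq and torch.equal as documented above: eq
+# compares elementwise and returns a boolean tensor, while equal returns a single
+# Python bool that is True only when the shapes and all elements match.
+#
+#   >>> a = torch.tensor([1, 2, 3])
+#   >>> b = torch.tensor([1, 0, 3])
+#   >>> torch.eq(a, b)
+#   tensor([ True, False,  True])
+#   >>> torch.equal(a, b)
+#   False
+#   >>> torch.equal(a, a.clone())
+#   True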
+def erf_(input: Tensor) -> Tensor: ... +def erfc(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + erfc(input, *, out=None) -> Tensor + + Alias for :func:`torch.special.erfc`. + """ + ... +def erfc_(input: Tensor) -> Tensor: ... +def erfinv(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + erfinv(input, *, out=None) -> Tensor + + Alias for :func:`torch.special.erfinv`. + """ + ... +def exp(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + exp(input, *, out=None) -> Tensor + + Returns a new tensor with the exponential of the elements + of the input tensor :attr:`input`. + + .. math:: + y_{i} = e^{x_{i}} + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.exp(torch.tensor([0, math.log(2.)])) + tensor([ 1., 2.]) + """ + ... +def exp2(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + exp2(input, *, out=None) -> Tensor + + Alias for :func:`torch.special.exp2`. + """ + ... +def exp2_(input: Tensor) -> Tensor: ... +def exp_(input: Tensor) -> Tensor: ... +def expand_copy(input: Tensor, size: Sequence[Union[_int, SymInt]], *, implicit: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.expand`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def expm1(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + expm1(input, *, out=None) -> Tensor + + Alias for :func:`torch.special.expm1`. + """ + ... +def expm1_(input: Tensor) -> Tensor: ... +@overload +def eye(n: Union[_int, SymInt], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + eye(n, m=None, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a 2-D tensor with ones on the diagonal and zeros elsewhere. + + Args: + n (int): the number of rows + m (int, optional): the number of columns with default being :attr:`n` + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 2-D tensor with ones on the diagonal and zeros elsewhere + + Example:: + + >>> torch.eye(3) + tensor([[ 1., 0., 0.], + [ 0., 1., 0.], + [ 0., 0., 1.]]) + """ + ... 
+@overload +def eye(n: Union[_int, SymInt], m: Union[_int, SymInt], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + eye(n, m=None, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a 2-D tensor with ones on the diagonal and zeros elsewhere. + + Args: + n (int): the number of rows + m (int, optional): the number of columns with default being :attr:`n` + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 2-D tensor with ones on the diagonal and zeros elsewhere + + Example:: + + >>> torch.eye(3) + tensor([[ 1., 0., 0.], + [ 0., 1., 0.], + [ 0., 0., 1.]]) + """ + ... +def fake_quantize_per_channel_affine(input: Tensor, scale: Tensor, zero_point: Tensor, axis: _int, quant_min: _int, quant_max: _int) -> Tensor: + r""" + fake_quantize_per_channel_affine(input, scale, zero_point, axis, quant_min, quant_max) -> Tensor + + Returns a new tensor with the data in :attr:`input` fake quantized per channel using :attr:`scale`, + :attr:`zero_point`, :attr:`quant_min` and :attr:`quant_max`, across the channel specified by :attr:`axis`. + + .. math:: + \text{output} = ( + min( + \text{quant\_max}, + max( + \text{quant\_min}, + \text{std::nearby\_int}(\text{input} / \text{scale}) + \text{zero\_point} + ) + ) - \text{zero\_point} + ) \times \text{scale} + + Args: + input (Tensor): the input value(s), in ``torch.float32`` + scale (Tensor): quantization scale, per channel in ``torch.float32`` + zero_point (Tensor): quantization zero_point, per channel in ``torch.int32`` or ``torch.half`` or ``torch.float32`` + axis (int32): channel axis + quant_min (int64): lower bound of the quantized domain + quant_max (int64): upper bound of the quantized domain + + Returns: + Tensor: A newly fake_quantized per channel ``torch.float32`` tensor + + Example:: + + >>> x = torch.randn(2, 2, 2) + >>> x + tensor([[[-0.2525, -0.0466], + [ 0.3491, -0.2168]], + + [[-0.5906, 1.6258], + [ 0.6444, -0.0542]]]) + >>> scales = (torch.randn(2) + 1) * 0.05 + >>> scales + tensor([0.0475, 0.0486]) + >>> zero_points = torch.zeros(2).to(torch.int32) + >>> zero_points + tensor([0, 0]) + >>> torch.fake_quantize_per_channel_affine(x, scales, zero_points, 1, 0, 255) + tensor([[[0.0000, 0.0000], + [0.3405, 0.0000]], + + [[0.0000, 1.6134], + [0.6323, 0.0000]]]) + """ + ... 
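+# Hedged worked example of the fake-quantization formula documented for
+# torch.fake_quantize_per_channel_affine above, using a single channel with the
+# (display-rounded) scale 0.0475 from the docstring example: -0.2525 / 0.0475 is
+# about -5.3, which rounds to -5, is clamped into [0, 255] as 0, and dequantizes
+# to (0 - 0) * 0.0475 = 0.0. Negative inputs therefore saturate to zero when
+# zero_point is 0 on an unsigned quantization range.
+#
+#   >>> x = torch.tensor([[-0.2525]])
+#   >>> scale = torch.tensor([0.0475])
+#   >>> zero_point = torch.tensor([0], dtype=torch.int32)
+#   >>> torch.fake_quantize_per_channel_affine(x, scale, zero_point, 0, 0, 255)
+#   tensor([[0.]])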
+@overload +def fake_quantize_per_tensor_affine(input: Tensor, scale: _float, zero_point: _int, quant_min: _int, quant_max: _int) -> Tensor: + r""" + fake_quantize_per_tensor_affine(input, scale, zero_point, quant_min, quant_max) -> Tensor + + Returns a new tensor with the data in :attr:`input` fake quantized using :attr:`scale`, + :attr:`zero_point`, :attr:`quant_min` and :attr:`quant_max`. + + .. math:: + \text{output} = ( + min( + \text{quant\_max}, + max( + \text{quant\_min}, + \text{std::nearby\_int}(\text{input} / \text{scale}) + \text{zero\_point} + ) + ) - \text{zero\_point} + ) \times \text{scale} + + Args: + input (Tensor): the input value(s), ``torch.float32`` tensor + scale (double scalar or ``float32`` Tensor): quantization scale + zero_point (int64 scalar or ``int32`` Tensor): quantization zero_point + quant_min (int64): lower bound of the quantized domain + quant_max (int64): upper bound of the quantized domain + + Returns: + Tensor: A newly fake_quantized ``torch.float32`` tensor + + Example:: + + >>> x = torch.randn(4) + >>> x + tensor([ 0.0552, 0.9730, 0.3973, -1.0780]) + >>> torch.fake_quantize_per_tensor_affine(x, 0.1, 0, 0, 255) + tensor([0.1000, 1.0000, 0.4000, 0.0000]) + >>> torch.fake_quantize_per_tensor_affine(x, torch.tensor(0.1), torch.tensor(0), 0, 255) + tensor([0.1000, 1.0000, 0.4000, 0.0000]) + """ + ... +@overload +def fake_quantize_per_tensor_affine(input: Tensor, scale: Tensor, zero_point: Tensor, quant_min: _int, quant_max: _int) -> Tensor: + r""" + fake_quantize_per_tensor_affine(input, scale, zero_point, quant_min, quant_max) -> Tensor + + Returns a new tensor with the data in :attr:`input` fake quantized using :attr:`scale`, + :attr:`zero_point`, :attr:`quant_min` and :attr:`quant_max`. + + .. math:: + \text{output} = ( + min( + \text{quant\_max}, + max( + \text{quant\_min}, + \text{std::nearby\_int}(\text{input} / \text{scale}) + \text{zero\_point} + ) + ) - \text{zero\_point} + ) \times \text{scale} + + Args: + input (Tensor): the input value(s), ``torch.float32`` tensor + scale (double scalar or ``float32`` Tensor): quantization scale + zero_point (int64 scalar or ``int32`` Tensor): quantization zero_point + quant_min (int64): lower bound of the quantized domain + quant_max (int64): upper bound of the quantized domain + + Returns: + Tensor: A newly fake_quantized ``torch.float32`` tensor + + Example:: + + >>> x = torch.randn(4) + >>> x + tensor([ 0.0552, 0.9730, 0.3973, -1.0780]) + >>> torch.fake_quantize_per_tensor_affine(x, 0.1, 0, 0, 255) + tensor([0.1000, 1.0000, 0.4000, 0.0000]) + >>> torch.fake_quantize_per_tensor_affine(x, torch.tensor(0.1), torch.tensor(0), 0, 255) + tensor([0.1000, 1.0000, 0.4000, 0.0000]) + """ + ... +def fbgemm_linear_fp16_weight(input: Tensor, packed_weight: Tensor, bias: Tensor) -> Tensor: ... +def fbgemm_linear_fp16_weight_fp32_activation(input: Tensor, packed_weight: Tensor, bias: Tensor) -> Tensor: ... +def fbgemm_linear_int8_weight(input: Tensor, weight: Tensor, packed: Tensor, col_offsets: Tensor, weight_scale: Union[Number, _complex], weight_zero_point: Union[Number, _complex], bias: Tensor) -> Tensor: ... +def fbgemm_linear_int8_weight_fp32_activation(input: Tensor, weight: Tensor, packed: Tensor, col_offsets: Tensor, weight_scale: Union[Number, _complex], weight_zero_point: Union[Number, _complex], bias: Tensor) -> Tensor: ... +def fbgemm_linear_quantize_weight(input: Tensor) -> Tuple[Tensor, Tensor, _float, _int]: ... +def fbgemm_pack_gemm_matrix_fp16(input: Tensor) -> Tensor: ... 
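+# Hedged worked check of the per-tensor example above (scale = 0.1,
+# zero_point = 0, quant range [0, 255]): 0.9730 / 0.1 = 9.73 rounds to 10, stays
+# in range, and dequantizes to 10 * 0.1 = 1.0; -1.0780 / 0.1 = -10.78 rounds to
+# -11, clamps to 0, and dequantizes to 0.0. Both agree with the
+# tensor([0.1000, 1.0000, 0.4000, 0.0000]) output shown in the docstring.
+#
+#   >>> torch.fake_quantize_per_tensor_affine(torch.tensor([0.9730]), 0.1, 0, 0, 255)
+#   tensor([1.])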
+@overload +def fbgemm_pack_quantized_matrix(input: Tensor) -> Tensor: ... +@overload +def fbgemm_pack_quantized_matrix(input: Tensor, K: _int, N: _int) -> Tensor: ... +def feature_alpha_dropout(input: Tensor, p: _float, train: _bool) -> Tensor: ... +def feature_alpha_dropout_(input: Tensor, p: _float, train: _bool) -> Tensor: ... +def feature_dropout(input: Tensor, p: _float, train: _bool) -> Tensor: ... +def feature_dropout_(input: Tensor, p: _float, train: _bool) -> Tensor: ... +@overload +def fill(input: Tensor, value: Tensor) -> Tensor: ... +@overload +def fill(input: Tensor, value: Union[Number, _complex]) -> Tensor: ... +@overload +def fill_(input: Tensor, value: Tensor) -> Tensor: ... +@overload +def fill_(input: Tensor, value: Union[Number, _complex]) -> Tensor: ... +def fix(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + fix(input, *, out=None) -> Tensor + + Alias for :func:`torch.trunc` + """ + ... +def fix_(input: Tensor) -> Tensor: ... +@overload +def flatten(input: Tensor, start_dim: _int = 0, end_dim: _int = -1) -> Tensor: + r""" + flatten(input, start_dim=0, end_dim=-1) -> Tensor + + Flattens :attr:`input` by reshaping it into a one-dimensional tensor. If :attr:`start_dim` or :attr:`end_dim` + are passed, only dimensions starting with :attr:`start_dim` and ending with :attr:`end_dim` are flattened. + The order of elements in :attr:`input` is unchanged. + + Unlike NumPy's flatten, which always copies input's data, this function may return the original object, a view, + or copy. If no dimensions are flattened, then the original object :attr:`input` is returned. Otherwise, if input can + be viewed as the flattened shape, then that view is returned. Finally, only if the input cannot be viewed as the + flattened shape is input's data copied. See :meth:`torch.Tensor.view` for details on when a view will be returned. + + .. note:: + Flattening a zero-dimensional tensor will return a one-dimensional view. + + Args: + input (Tensor): the input tensor. + start_dim (int): the first dim to flatten + end_dim (int): the last dim to flatten + + Example:: + + >>> t = torch.tensor([[[1, 2], + ... [3, 4]], + ... [[5, 6], + ... [7, 8]]]) + >>> torch.flatten(t) + tensor([1, 2, 3, 4, 5, 6, 7, 8]) + >>> torch.flatten(t, start_dim=1) + tensor([[1, 2, 3, 4], + [5, 6, 7, 8]]) + """ + ... +@overload +def flatten(input: Tensor, start_dim: _int, end_dim: _int, out_dim: Union[str, ellipsis, None]) -> Tensor: + r""" + flatten(input, start_dim=0, end_dim=-1) -> Tensor + + Flattens :attr:`input` by reshaping it into a one-dimensional tensor. If :attr:`start_dim` or :attr:`end_dim` + are passed, only dimensions starting with :attr:`start_dim` and ending with :attr:`end_dim` are flattened. + The order of elements in :attr:`input` is unchanged. + + Unlike NumPy's flatten, which always copies input's data, this function may return the original object, a view, + or copy. If no dimensions are flattened, then the original object :attr:`input` is returned. Otherwise, if input can + be viewed as the flattened shape, then that view is returned. Finally, only if the input cannot be viewed as the + flattened shape is input's data copied. See :meth:`torch.Tensor.view` for details on when a view will be returned. + + .. note:: + Flattening a zero-dimensional tensor will return a one-dimensional view. + + Args: + input (Tensor): the input tensor. + start_dim (int): the first dim to flatten + end_dim (int): the last dim to flatten + + Example:: + + >>> t = torch.tensor([[[1, 2], + ... [3, 4]], + ... 
[[5, 6], + ... [7, 8]]]) + >>> torch.flatten(t) + tensor([1, 2, 3, 4, 5, 6, 7, 8]) + >>> torch.flatten(t, start_dim=1) + tensor([[1, 2, 3, 4], + [5, 6, 7, 8]]) + """ + ... +@overload +def flatten(input: Tensor, start_dim: Union[str, ellipsis, None], end_dim: Union[str, ellipsis, None], out_dim: Union[str, ellipsis, None]) -> Tensor: + r""" + flatten(input, start_dim=0, end_dim=-1) -> Tensor + + Flattens :attr:`input` by reshaping it into a one-dimensional tensor. If :attr:`start_dim` or :attr:`end_dim` + are passed, only dimensions starting with :attr:`start_dim` and ending with :attr:`end_dim` are flattened. + The order of elements in :attr:`input` is unchanged. + + Unlike NumPy's flatten, which always copies input's data, this function may return the original object, a view, + or copy. If no dimensions are flattened, then the original object :attr:`input` is returned. Otherwise, if input can + be viewed as the flattened shape, then that view is returned. Finally, only if the input cannot be viewed as the + flattened shape is input's data copied. See :meth:`torch.Tensor.view` for details on when a view will be returned. + + .. note:: + Flattening a zero-dimensional tensor will return a one-dimensional view. + + Args: + input (Tensor): the input tensor. + start_dim (int): the first dim to flatten + end_dim (int): the last dim to flatten + + Example:: + + >>> t = torch.tensor([[[1, 2], + ... [3, 4]], + ... [[5, 6], + ... [7, 8]]]) + >>> torch.flatten(t) + tensor([1, 2, 3, 4, 5, 6, 7, 8]) + >>> torch.flatten(t, start_dim=1) + tensor([[1, 2, 3, 4], + [5, 6, 7, 8]]) + """ + ... +@overload +def flatten(input: Tensor, dims: Sequence[Union[str, ellipsis, None]], out_dim: Union[str, ellipsis, None]) -> Tensor: + r""" + flatten(input, start_dim=0, end_dim=-1) -> Tensor + + Flattens :attr:`input` by reshaping it into a one-dimensional tensor. If :attr:`start_dim` or :attr:`end_dim` + are passed, only dimensions starting with :attr:`start_dim` and ending with :attr:`end_dim` are flattened. + The order of elements in :attr:`input` is unchanged. + + Unlike NumPy's flatten, which always copies input's data, this function may return the original object, a view, + or copy. If no dimensions are flattened, then the original object :attr:`input` is returned. Otherwise, if input can + be viewed as the flattened shape, then that view is returned. Finally, only if the input cannot be viewed as the + flattened shape is input's data copied. See :meth:`torch.Tensor.view` for details on when a view will be returned. + + .. note:: + Flattening a zero-dimensional tensor will return a one-dimensional view. + + Args: + input (Tensor): the input tensor. + start_dim (int): the first dim to flatten + end_dim (int): the last dim to flatten + + Example:: + + >>> t = torch.tensor([[[1, 2], + ... [3, 4]], + ... [[5, 6], + ... [7, 8]]]) + >>> torch.flatten(t) + tensor([1, 2, 3, 4, 5, 6, 7, 8]) + >>> torch.flatten(t, start_dim=1) + tensor([[1, 2, 3, 4], + [5, 6, 7, 8]]) + """ + ... +def flip(input: Tensor, dims: _size) -> Tensor: + r""" + flip(input, dims) -> Tensor + + Reverse the order of an n-D tensor along given axis in dims. + + .. note:: + `torch.flip` makes a copy of :attr:`input`'s data. This is different from NumPy's `np.flip`, + which returns a view in constant time. Since copying a tensor's data is more work than viewing that data, + `torch.flip` is expected to be slower than `np.flip`. + + Args: + input (Tensor): the input tensor. 
+ dims (a list or tuple): axis to flip on + + Example:: + + >>> x = torch.arange(8).view(2, 2, 2) + >>> x + tensor([[[ 0, 1], + [ 2, 3]], + + [[ 4, 5], + [ 6, 7]]]) + >>> torch.flip(x, [0, 1]) + tensor([[[ 6, 7], + [ 4, 5]], + + [[ 2, 3], + [ 0, 1]]]) + """ + ... +def fliplr(input: Tensor) -> Tensor: + r""" + fliplr(input) -> Tensor + + Flip tensor in the left/right direction, returning a new tensor. + + Flip the entries in each row in the left/right direction. + Columns are preserved, but appear in a different order than before. + + Note: + Requires the tensor to be at least 2-D. + + .. note:: + `torch.fliplr` makes a copy of :attr:`input`'s data. This is different from NumPy's `np.fliplr`, + which returns a view in constant time. Since copying a tensor's data is more work than viewing that data, + `torch.fliplr` is expected to be slower than `np.fliplr`. + + Args: + input (Tensor): Must be at least 2-dimensional. + + Example:: + + >>> x = torch.arange(4).view(2, 2) + >>> x + tensor([[0, 1], + [2, 3]]) + >>> torch.fliplr(x) + tensor([[1, 0], + [3, 2]]) + """ + ... +def flipud(input: Tensor) -> Tensor: + r""" + flipud(input) -> Tensor + + Flip tensor in the up/down direction, returning a new tensor. + + Flip the entries in each column in the up/down direction. + Rows are preserved, but appear in a different order than before. + + Note: + Requires the tensor to be at least 1-D. + + .. note:: + `torch.flipud` makes a copy of :attr:`input`'s data. This is different from NumPy's `np.flipud`, + which returns a view in constant time. Since copying a tensor's data is more work than viewing that data, + `torch.flipud` is expected to be slower than `np.flipud`. + + Args: + input (Tensor): Must be at least 1-dimensional. + + Example:: + + >>> x = torch.arange(4).view(2, 2) + >>> x + tensor([[0, 1], + [2, 3]]) + >>> torch.flipud(x) + tensor([[2, 3], + [0, 1]]) + """ + ... +@overload +def float_power(input: Tensor, exponent: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + float_power(input, exponent, *, out=None) -> Tensor + + Raises :attr:`input` to the power of :attr:`exponent`, elementwise, in double precision. + If neither input is complex returns a ``torch.float64`` tensor, + and if one or more inputs is complex returns a ``torch.complex128`` tensor. + + .. note:: + This function always computes in double precision, unlike :func:`torch.pow`, + which implements more typical :ref:`type promotion `. + This is useful when the computation needs to be performed in a wider or more precise dtype, + or the results of the computation may contain fractional values not representable in the input dtypes, + like when an integer base is raised to a negative integer exponent. + + Args: + input (Tensor or Number): the base value(s) + exponent (Tensor or Number): the exponent value(s) + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randint(10, (4,)) + >>> a + tensor([6, 4, 7, 1]) + >>> torch.float_power(a, 2) + tensor([36., 16., 49., 1.], dtype=torch.float64) + + >>> a = torch.arange(1, 5) + >>> a + tensor([ 1, 2, 3, 4]) + >>> exp = torch.tensor([2, -3, 4, -5]) + >>> exp + tensor([ 2, -3, 4, -5]) + >>> torch.float_power(a, exp) + tensor([1.0000e+00, 1.2500e-01, 8.1000e+01, 9.7656e-04], dtype=torch.float64) + """ + ... 
+@overload +def float_power(self: Union[Number, _complex], exponent: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + float_power(input, exponent, *, out=None) -> Tensor + + Raises :attr:`input` to the power of :attr:`exponent`, elementwise, in double precision. + If neither input is complex returns a ``torch.float64`` tensor, + and if one or more inputs is complex returns a ``torch.complex128`` tensor. + + .. note:: + This function always computes in double precision, unlike :func:`torch.pow`, + which implements more typical :ref:`type promotion `. + This is useful when the computation needs to be performed in a wider or more precise dtype, + or the results of the computation may contain fractional values not representable in the input dtypes, + like when an integer base is raised to a negative integer exponent. + + Args: + input (Tensor or Number): the base value(s) + exponent (Tensor or Number): the exponent value(s) + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randint(10, (4,)) + >>> a + tensor([6, 4, 7, 1]) + >>> torch.float_power(a, 2) + tensor([36., 16., 49., 1.], dtype=torch.float64) + + >>> a = torch.arange(1, 5) + >>> a + tensor([ 1, 2, 3, 4]) + >>> exp = torch.tensor([2, -3, 4, -5]) + >>> exp + tensor([ 2, -3, 4, -5]) + >>> torch.float_power(a, exp) + tensor([1.0000e+00, 1.2500e-01, 8.1000e+01, 9.7656e-04], dtype=torch.float64) + """ + ... +@overload +def float_power(input: Tensor, exponent: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + float_power(input, exponent, *, out=None) -> Tensor + + Raises :attr:`input` to the power of :attr:`exponent`, elementwise, in double precision. + If neither input is complex returns a ``torch.float64`` tensor, + and if one or more inputs is complex returns a ``torch.complex128`` tensor. + + .. note:: + This function always computes in double precision, unlike :func:`torch.pow`, + which implements more typical :ref:`type promotion `. + This is useful when the computation needs to be performed in a wider or more precise dtype, + or the results of the computation may contain fractional values not representable in the input dtypes, + like when an integer base is raised to a negative integer exponent. + + Args: + input (Tensor or Number): the base value(s) + exponent (Tensor or Number): the exponent value(s) + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randint(10, (4,)) + >>> a + tensor([6, 4, 7, 1]) + >>> torch.float_power(a, 2) + tensor([36., 16., 49., 1.], dtype=torch.float64) + + >>> a = torch.arange(1, 5) + >>> a + tensor([ 1, 2, 3, 4]) + >>> exp = torch.tensor([2, -3, 4, -5]) + >>> exp + tensor([ 2, -3, 4, -5]) + >>> torch.float_power(a, exp) + tensor([1.0000e+00, 1.2500e-01, 8.1000e+01, 9.7656e-04], dtype=torch.float64) + """ + ... +def floor(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + floor(input, *, out=None) -> Tensor + + Returns a new tensor with the floor of the elements of :attr:`input`, + the largest integer less than or equal to each element. + + For integer inputs, follows the array-api convention of returning a + copy of the input tensor. + + .. math:: + \text{out}_{i} = \left\lfloor \text{input}_{i} \right\rfloor + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. 
+ + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-0.8166, 1.5308, -0.2530, -0.2091]) + >>> torch.floor(a) + tensor([-1., 1., -1., -1.]) + """ + ... +def floor_(input: Tensor) -> Tensor: ... +def floor_divide(input: Union[Tensor, Number], other: Union[Tensor, Number], *, out: Optional[Tensor] = None) -> Tensor: + r""" + floor_divide(input, other, *, out=None) -> Tensor + + .. note:: + + Before PyTorch 1.13 :func:`torch.floor_divide` incorrectly performed + truncation division. To restore the previous behavior use + :func:`torch.div` with ``rounding_mode='trunc'``. + + Computes :attr:`input` divided by :attr:`other`, elementwise, and floors + the result. + + .. math:: + \text{{out}}_i = \text{floor} \left( \frac{{\text{{input}}_i}}{{\text{{other}}_i}} \right) + + + + Supports broadcasting to a common shape, type promotion, and integer and float inputs. + + Args: + input (Tensor or Number): the dividend + other (Tensor or Number): the divisor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([4.0, 3.0]) + >>> b = torch.tensor([2.0, 2.0]) + >>> torch.floor_divide(a, b) + tensor([2.0, 1.0]) + >>> torch.floor_divide(a, 1.4) + tensor([2.0, 2.0]) + """ + ... +def fmax(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + fmax(input, other, *, out=None) -> Tensor + + Computes the element-wise maximum of :attr:`input` and :attr:`other`. + + This is like :func:`torch.maximum` except it handles NaNs differently: + if exactly one of the two elements being compared is a NaN then the non-NaN element is taken as the maximum. + Only if both elements are NaN is NaN propagated. + + This function is a wrapper around C++'s ``std::fmax`` and is similar to NumPy's ``fmax`` function. + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer and floating-point inputs. + + Args: + input (Tensor): the input tensor. + other (Tensor): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([9.7, float('nan'), 3.1, float('nan')]) + >>> b = torch.tensor([-2.2, 0.5, float('nan'), float('nan')]) + >>> torch.fmax(a, b) + tensor([9.7000, 0.5000, 3.1000, nan]) + """ + ... +def fmin(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + fmin(input, other, *, out=None) -> Tensor + + Computes the element-wise minimum of :attr:`input` and :attr:`other`. + + This is like :func:`torch.minimum` except it handles NaNs differently: + if exactly one of the two elements being compared is a NaN then the non-NaN element is taken as the minimum. + Only if both elements are NaN is NaN propagated. + + This function is a wrapper around C++'s ``std::fmin`` and is similar to NumPy's ``fmin`` function. + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer and floating-point inputs. + + Args: + input (Tensor): the input tensor. + other (Tensor): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([2.2, float('nan'), 2.1, float('nan')]) + >>> b = torch.tensor([-9.3, 0.1, float('nan'), float('nan')]) + >>> torch.fmin(a, b) + tensor([-9.3000, 0.1000, 2.1000, nan]) + """ + ... +@overload +def fmod(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + fmod(input, other, *, out=None) -> Tensor + + Applies C++'s `std::fmod `_ entrywise. 
+ The result has the same sign as the dividend :attr:`input` and its absolute value + is less than that of :attr:`other`. + + This function may be defined in terms of :func:`torch.div` as + + .. code:: python + + torch.fmod(a, b) == a - a.div(b, rounding_mode="trunc") * b + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer and float inputs. + + .. note:: + + When the divisor is zero, returns ``NaN`` for floating point dtypes + on both CPU and GPU; raises ``RuntimeError`` for integer division by + zero on CPU; Integer division by zero on GPU may return any value. + + .. note:: + + Complex inputs are not supported. In some cases, it is not mathematically + possible to satisfy the definition of a modulo operation with complex numbers. + + .. seealso:: + + :func:`torch.remainder` which implements Python's modulus operator. + This one is defined using division rounding down the result. + + Args: + input (Tensor): the dividend + other (Tensor or Scalar): the divisor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.fmod(torch.tensor([-3., -2, -1, 1, 2, 3]), 2) + tensor([-1., -0., -1., 1., 0., 1.]) + >>> torch.fmod(torch.tensor([1, 2, 3, 4, 5]), -1.5) + tensor([1.0000, 0.5000, 0.0000, 1.0000, 0.5000]) + """ + ... +@overload +def fmod(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + fmod(input, other, *, out=None) -> Tensor + + Applies C++'s `std::fmod `_ entrywise. + The result has the same sign as the dividend :attr:`input` and its absolute value + is less than that of :attr:`other`. + + This function may be defined in terms of :func:`torch.div` as + + .. code:: python + + torch.fmod(a, b) == a - a.div(b, rounding_mode="trunc") * b + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer and float inputs. + + .. note:: + + When the divisor is zero, returns ``NaN`` for floating point dtypes + on both CPU and GPU; raises ``RuntimeError`` for integer division by + zero on CPU; Integer division by zero on GPU may return any value. + + .. note:: + + Complex inputs are not supported. In some cases, it is not mathematically + possible to satisfy the definition of a modulo operation with complex numbers. + + .. seealso:: + + :func:`torch.remainder` which implements Python's modulus operator. + This one is defined using division rounding down the result. + + Args: + input (Tensor): the dividend + other (Tensor or Scalar): the divisor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.fmod(torch.tensor([-3., -2, -1, 1, 2, 3]), 2) + tensor([-1., -0., -1., 1., 0., 1.]) + >>> torch.fmod(torch.tensor([1, 2, 3, 4, 5]), -1.5) + tensor([1.0000, 0.5000, 0.0000, 1.0000, 0.5000]) + """ + ... +def frac(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + frac(input, *, out=None) -> Tensor + + Computes the fractional portion of each element in :attr:`input`. + + .. math:: + \text{out}_{i} = \text{input}_{i} - \left\lfloor |\text{input}_{i}| \right\rfloor * \operatorname{sgn}(\text{input}_{i}) + + Example:: + + >>> torch.frac(torch.tensor([1, 2.5, -3.2])) + tensor([ 0.0000, 0.5000, -0.2000]) + """ + ... +def frac_(input: Tensor) -> Tensor: ... 
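Editorial note, not part of the generated stub: the sign convention of `fmod` (result follows the dividend) is easiest to see next to `torch.remainder` (result follows the divisor); the values below are taken from the docstrings above.

import torch

a = torch.tensor([-3., -2., 2., 3.])
torch.fmod(a, 2)       # tensor([-1., -0., 0., 1.])  -> sign follows the dividend
torch.remainder(a, 2)  # tensor([ 1.,  0., 0., 1.])  -> sign follows the divisor
torch.frac(torch.tensor([1.0, 2.5, -3.2]))  # tensor([ 0.0000, 0.5000, -0.2000])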
+def frexp(input: Tensor, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.frexp: + r""" + frexp(input, *, out=None) -> (Tensor mantissa, Tensor exponent) + + Decomposes :attr:`input` into mantissa and exponent tensors + such that :math:`\text{input} = \text{mantissa} \times 2^{\text{exponent}}`. + + The range of mantissa is the open interval (-1, 1). + + Supports float inputs. + + Args: + input (Tensor): the input tensor + + + Keyword args: + out (tuple, optional): the output tensors + + Example:: + + >>> x = torch.arange(9.) + >>> mantissa, exponent = torch.frexp(x) + >>> mantissa + tensor([0.0000, 0.5000, 0.5000, 0.7500, 0.5000, 0.6250, 0.7500, 0.8750, 0.5000]) + >>> exponent + tensor([0, 1, 2, 2, 3, 3, 3, 3, 4], dtype=torch.int32) + >>> torch.ldexp(mantissa, exponent) + tensor([0., 1., 2., 3., 4., 5., 6., 7., 8.]) + """ + ... +def frobenius_norm(input: Tensor, dim: Union[_int, _size], keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: ... +def from_file(filename: str, shared: Optional[_bool] = None, size: Optional[_int] = 0, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + from_file(filename, shared=None, size=0, *, dtype=None, layout=None, device=None, pin_memory=False) + + Creates a CPU tensor with a storage backed by a memory-mapped file. + + If ``shared`` is True, then memory is shared between processes. All changes are written to the file. + If ``shared`` is False, then changes to the tensor do not affect the file. + + ``size`` is the number of elements in the Tensor. If ``shared`` is ``False``, then the file must contain + at least ``size * sizeof(dtype)`` bytes. If ``shared`` is ``True`` the file will be created if needed. + + .. note:: + Only CPU tensors can be mapped to files. + + .. note:: + For now, tensors with storages backed by a memory-mapped file cannot be created in pinned memory. + + + Args: + filename (str): file name to map + shared (bool): whether to share memory (whether ``MAP_SHARED`` or ``MAP_PRIVATE`` is passed to the + underlying `mmap(2) call `_) + size (int): number of elements in the tensor + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + >>> t = torch.randn(2, 5, dtype=torch.float64) + >>> t.numpy().tofile('storage.pt') + >>> t_mapped = torch.from_file('storage.pt', shared=False, size=10, dtype=torch.float64) + """ + ... +def from_numpy(ndarray) -> Tensor: + r""" + from_numpy(ndarray) -> Tensor + + Creates a :class:`Tensor` from a :class:`numpy.ndarray`. + + The returned tensor and :attr:`ndarray` share the same memory. Modifications to + the tensor will be reflected in the :attr:`ndarray` and vice versa. 
The returned + tensor is not resizable. + + It currently accepts :attr:`ndarray` with dtypes of ``numpy.float64``, + ``numpy.float32``, ``numpy.float16``, ``numpy.complex64``, ``numpy.complex128``, + ``numpy.int64``, ``numpy.int32``, ``numpy.int16``, ``numpy.int8``, ``numpy.uint8``, + and ``bool``. + + .. warning:: + Writing to a tensor created from a read-only NumPy array is not supported and will result in undefined behavior. + + Example:: + + >>> a = numpy.array([1, 2, 3]) + >>> t = torch.from_numpy(a) + >>> t + tensor([ 1, 2, 3]) + >>> t[0] = -1 + >>> a + array([-1, 2, 3]) + """ + ... +def frombuffer(buffer: Any, *, dtype: _dtype, count: int = -1, offset: int = 0, requires_grad: _bool = False) -> Tensor: + r""" + frombuffer(buffer, *, dtype, count=-1, offset=0, requires_grad=False) -> Tensor + + Creates a 1-dimensional :class:`Tensor` from an object that implements + the Python buffer protocol. + + Skips the first :attr:`offset` bytes in the buffer, and interprets the rest of + the raw bytes as a 1-dimensional tensor of type :attr:`dtype` with :attr:`count` + elements. + + Note that either of the following must be true: + + 1. :attr:`count` is a positive non-zero number, and the total number of bytes + in the buffer is more than :attr:`offset` plus :attr:`count` times the size + (in bytes) of :attr:`dtype`. + + 2. :attr:`count` is negative, and the length (number of bytes) of the buffer + subtracted by the :attr:`offset` is a multiple of the size (in bytes) of + :attr:`dtype`. + + The returned tensor and buffer share the same memory. Modifications to + the tensor will be reflected in the buffer and vice versa. The returned + tensor is not resizable. + + .. note:: + This function increments the reference count for the object that + owns the shared memory. Therefore, such memory will not be deallocated + before the returned tensor goes out of scope. + + .. warning:: + This function's behavior is undefined when passed an object implementing + the buffer protocol whose data is not on the CPU. Doing so is likely to + cause a segmentation fault. + + .. warning:: + This function does not try to infer the :attr:`dtype` (hence, it is not + optional). Passing a different :attr:`dtype` than its source may result + in unexpected behavior. + + Args: + buffer (object): a Python object that exposes the buffer interface. + + Keyword args: + dtype (:class:`torch.dtype`): the desired data type of returned tensor. + count (int, optional): the number of desired elements to be read. + If negative, all the elements (until the end of the buffer) will be + read. Default: -1. + offset (int, optional): the number of bytes to skip at the start of + the buffer. Default: 0. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> import array + >>> a = array.array('i', [1, 2, 3]) + >>> t = torch.frombuffer(a, dtype=torch.int32) + >>> t + tensor([ 1, 2, 3]) + >>> t[0] = -1 + >>> a + array([-1, 2, 3]) + + >>> # Interprets the signed char bytes as 32-bit integers. + >>> # Each 4 signed char elements will be interpreted as + >>> # 1 signed 32-bit integer. + >>> import array + >>> a = array.array('b', [-1, 0, 0, 0]) + >>> torch.frombuffer(a, dtype=torch.int32) + tensor([255], dtype=torch.int32) + """ + ... 
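Editorial note, not part of the generated stub: both `from_numpy` and `frombuffer` are zero-copy, so writes through the tensor are visible in the source object. A small sketch mirroring the docstring examples above:

import array

import numpy as np
import torch

a = np.array([1, 2, 3])
t = torch.from_numpy(a)        # shares memory with the ndarray
t[0] = -1
assert a[0] == -1              # the write is visible on the NumPy side

buf = array.array('i', [1, 2, 3])             # 'i' is 32-bit on common platforms
tb = torch.frombuffer(buf, dtype=torch.int32)  # also zero-copy, via the buffer protocol
tb[1] = 20
assert buf[1] == 20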
+@overload +def full(size: _size, fill_value: Union[Number, _complex], *, out: Optional[Tensor] = None, layout: _layout = strided, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + full(size, fill_value, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Creates a tensor of size :attr:`size` filled with :attr:`fill_value`. The + tensor's dtype is inferred from :attr:`fill_value`. + + Args: + size (int...): a list, tuple, or :class:`torch.Size` of integers defining the + shape of the output tensor. + fill_value (Scalar): the value to fill the output tensor with. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.full((2, 3), 3.141592) + tensor([[ 3.1416, 3.1416, 3.1416], + [ 3.1416, 3.1416, 3.1416]]) + """ + ... +@overload +def full(size: _size, fill_value: Union[Number, _complex], *, names: List[Union[str, None]], layout: _layout = strided, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + full(size, fill_value, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Creates a tensor of size :attr:`size` filled with :attr:`fill_value`. The + tensor's dtype is inferred from :attr:`fill_value`. + + Args: + size (int...): a list, tuple, or :class:`torch.Size` of integers defining the + shape of the output tensor. + fill_value (Scalar): the value to fill the output tensor with. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.full((2, 3), 3.141592) + tensor([[ 3.1416, 3.1416, 3.1416], + [ 3.1416, 3.1416, 3.1416]]) + """ + ... 
+@overload +def full(size: Sequence[Union[_int, SymInt]], fill_value: Union[Number, _complex], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + full(size, fill_value, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Creates a tensor of size :attr:`size` filled with :attr:`fill_value`. The + tensor's dtype is inferred from :attr:`fill_value`. + + Args: + size (int...): a list, tuple, or :class:`torch.Size` of integers defining the + shape of the output tensor. + fill_value (Scalar): the value to fill the output tensor with. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.full((2, 3), 3.141592) + tensor([[ 3.1416, 3.1416, 3.1416], + [ 3.1416, 3.1416, 3.1416]]) + """ + ... +@overload +def full(size: _size, fill_value: Union[Number, _complex], *, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + full(size, fill_value, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Creates a tensor of size :attr:`size` filled with :attr:`fill_value`. The + tensor's dtype is inferred from :attr:`fill_value`. + + Args: + size (int...): a list, tuple, or :class:`torch.Size` of integers defining the + shape of the output tensor. + fill_value (Scalar): the value to fill the output tensor with. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.full((2, 3), 3.141592) + tensor([[ 3.1416, 3.1416, 3.1416], + [ 3.1416, 3.1416, 3.1416]]) + """ + ... 
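Editorial note, not part of the generated stub: since `full` infers the output dtype from `fill_value`, the following sketch (assuming a recent PyTorch, where integer and boolean fills are supported without an explicit dtype) shows the inference in action:

import torch

torch.full((2, 3), 7).dtype     # torch.int64   (inferred from the Python int)
torch.full((2, 3), 7.0).dtype   # torch.float32 (the default floating-point dtype)
torch.full((2, 3), True).dtype  # torch.bool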
+def full_like(input: Tensor, fill_value: Union[Number, _complex], *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + full_like(input, fill_value, \*, dtype=None, layout=torch.strided, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor + + Returns a tensor with the same size as :attr:`input` filled with :attr:`fill_value`. + ``torch.full_like(input, fill_value)`` is equivalent to + ``torch.full(input.size(), fill_value, dtype=input.dtype, layout=input.layout, device=input.device)``. + + Args: + input (Tensor): the size of :attr:`input` will determine size of the output tensor. + fill_value: the number to fill the output tensor with. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor. + Default: if ``None``, defaults to the dtype of :attr:`input`. + layout (:class:`torch.layout`, optional): the desired layout of returned tensor. + Default: if ``None``, defaults to the layout of :attr:`input`. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, defaults to the device of :attr:`input`. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + """ + ... +def fused_moving_avg_obs_fake_quant(input: Tensor, observer_on: Tensor, fake_quant_on: Tensor, running_min: Tensor, running_max: Tensor, scale: Tensor, zero_point: Tensor, averaging_const: _float, quant_min: _int, quant_max: _int, ch_axis: _int, per_row_fake_quant: _bool = False, symmetric_quant: _bool = False) -> Tensor: ... +@overload +def gather(input: Tensor, dim: _int, index: Tensor, *, sparse_grad: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + gather(input, dim, index, *, sparse_grad=False, out=None) -> Tensor + + Gathers values along an axis specified by `dim`. + + For a 3-D tensor the output is specified by:: + + out[i][j][k] = input[index[i][j][k]][j][k] # if dim == 0 + out[i][j][k] = input[i][index[i][j][k]][k] # if dim == 1 + out[i][j][k] = input[i][j][index[i][j][k]] # if dim == 2 + + :attr:`input` and :attr:`index` must have the same number of dimensions. + It is also required that ``index.size(d) <= input.size(d)`` for all + dimensions ``d != dim``. :attr:`out` will have the same shape as :attr:`index`. + Note that ``input`` and ``index`` do not broadcast against each other. + + Args: + input (Tensor): the source tensor + dim (int): the axis along which to index + index (LongTensor): the indices of elements to gather + + Keyword arguments: + sparse_grad (bool, optional): If ``True``, gradient w.r.t. :attr:`input` will be a sparse tensor. + out (Tensor, optional): the destination tensor + + Example:: + + >>> t = torch.tensor([[1, 2], [3, 4]]) + >>> torch.gather(t, 1, torch.tensor([[0, 0], [1, 0]])) + tensor([[ 1, 1], + [ 4, 3]]) + """ + ... +@overload +def gather(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, *, sparse_grad: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + gather(input, dim, index, *, sparse_grad=False, out=None) -> Tensor + + Gathers values along an axis specified by `dim`. 
+ + For a 3-D tensor the output is specified by:: + + out[i][j][k] = input[index[i][j][k]][j][k] # if dim == 0 + out[i][j][k] = input[i][index[i][j][k]][k] # if dim == 1 + out[i][j][k] = input[i][j][index[i][j][k]] # if dim == 2 + + :attr:`input` and :attr:`index` must have the same number of dimensions. + It is also required that ``index.size(d) <= input.size(d)`` for all + dimensions ``d != dim``. :attr:`out` will have the same shape as :attr:`index`. + Note that ``input`` and ``index`` do not broadcast against each other. + + Args: + input (Tensor): the source tensor + dim (int): the axis along which to index + index (LongTensor): the indices of elements to gather + + Keyword arguments: + sparse_grad (bool, optional): If ``True``, gradient w.r.t. :attr:`input` will be a sparse tensor. + out (Tensor, optional): the destination tensor + + Example:: + + >>> t = torch.tensor([[1, 2], [3, 4]]) + >>> torch.gather(t, 1, torch.tensor([[0, 0], [1, 0]])) + tensor([[ 1, 1], + [ 4, 3]]) + """ + ... +def gcd(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + gcd(input, other, *, out=None) -> Tensor + + Computes the element-wise greatest common divisor (GCD) of :attr:`input` and :attr:`other`. + + Both :attr:`input` and :attr:`other` must have integer types. + + .. note:: + This defines :math:`gcd(0, 0) = 0`. + + Args: + input (Tensor): the input tensor. + other (Tensor): the second input tensor + + Keyword arguments: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([5, 10, 15]) + >>> b = torch.tensor([3, 4, 5]) + >>> torch.gcd(a, b) + tensor([1, 2, 5]) + >>> c = torch.tensor([3]) + >>> torch.gcd(a, c) + tensor([1, 1, 3]) + """ + ... +def gcd_(input: Tensor, other: Tensor) -> Tensor: ... +@overload +def ge(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + ge(input, other, *, out=None) -> Tensor + + Computes :math:`\text{input} \geq \text{other}` element-wise. + + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is greater than or equal to :attr:`other` and False elsewhere + + Example:: + + >>> torch.ge(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[True, True], [False, True]]) + """ + ... +@overload +def ge(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + ge(input, other, *, out=None) -> Tensor + + Computes :math:`\text{input} \geq \text{other}` element-wise. + + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is greater than or equal to :attr:`other` and False elsewhere + + Example:: + + >>> torch.ge(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[True, True], [False, True]]) + """ + ... 
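Editorial note, not part of the generated stub: the indexing rule `out[i][j] == input[i][index[i][j]]` for `gather` along dim 1 can be checked directly against the docstring example above.

import torch

t = torch.tensor([[1, 2], [3, 4]])
idx = torch.tensor([[0, 0], [1, 0]])
out = torch.gather(t, 1, idx)
# along dim 1: out[i][j] == t[i][ idx[i][j] ]
assert out.tolist() == [[1, 1], [4, 3]]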
+def geqrf(input: Tensor, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.geqrf: + r""" + geqrf(input, *, out=None) -> (Tensor, Tensor) + + This is a low-level function for calling LAPACK's geqrf directly. This function + returns a namedtuple (a, tau) as defined in `LAPACK documentation for geqrf`_ . + + Computes a QR decomposition of :attr:`input`. + Both `Q` and `R` matrices are stored in the same output tensor `a`. + The elements of `R` are stored on and above the diagonal. + Elementary reflectors (or Householder vectors) implicitly defining matrix `Q` + are stored below the diagonal. + The results of this function can be used together with :func:`torch.linalg.householder_product` + to obtain the `Q` matrix or + with :func:`torch.ormqr`, which uses an implicit representation of the `Q` matrix, + for an efficient matrix-matrix multiplication. + + See `LAPACK documentation for geqrf`_ for further details. + + .. note:: + See also :func:`torch.linalg.qr`, which computes Q and R matrices, and :func:`torch.linalg.lstsq` + with the ``driver="gels"`` option for a function that can solve matrix equations using a QR decomposition. + + Args: + input (Tensor): the input matrix + + Keyword args: + out (tuple, optional): the output tuple of (Tensor, Tensor). Ignored if `None`. Default: `None`. + + .. _LAPACK documentation for geqrf: + http://www.netlib.org/lapack/explore-html/df/dc5/group__variants_g_ecomputational_ga3766ea903391b5cf9008132f7440ec7b.html + """ + ... +def ger(input: Tensor, vec2: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + ger(input, vec2, *, out=None) -> Tensor + + Alias of :func:`torch.outer`. + + .. warning:: + This function is deprecated and will be removed in a future PyTorch release. + Use :func:`torch.outer` instead. + """ + ... +def get_default_dtype() -> _dtype: + r""" + get_default_dtype() -> torch.dtype + + Get the current default floating point :class:`torch.dtype`. + + Example:: + + >>> torch.get_default_dtype() # initial default for floating point is torch.float32 + torch.float32 + >>> torch.set_default_dtype(torch.float64) + >>> torch.get_default_dtype() # default is now changed to torch.float64 + torch.float64 + """ + ... +def get_num_interop_threads() -> _int: + r""" + get_num_interop_threads() -> int + + Returns the number of threads used for inter-op parallelism on CPU + (e.g. in JIT interpreter) + """ + ... +def get_num_threads() -> _int: + r""" + get_num_threads() -> int + + Returns the number of threads used for parallelizing CPU operations + """ + ... +@overload +def gradient(input: Tensor, *, spacing: Optional[Union[Number, _complex]] = None, dim: Optional[_int] = None, edge_order: _int = 1) -> Tuple[Tensor, ...]: + r""" + gradient(input, *, spacing=1, dim=None, edge_order=1) -> List of Tensors + + Estimates the gradient of a function :math:`g : \mathbb{R}^n \rightarrow \mathbb{R}` in + one or more dimensions using the `second-order accurate central differences method + `_ and + either first or second order estimates at the boundaries. + + The gradient of :math:`g` is estimated using samples. By default, when :attr:`spacing` is not + specified, the samples are entirely described by :attr:`input`, and the mapping of input coordinates + to an output is the same as the tensor's mapping of indices to values. For example, for a three-dimensional + :attr:`input` the function described is :math:`g : \mathbb{R}^3 \rightarrow \mathbb{R}`, and + :math:`g(1, 2, 3)\ == input[1, 2, 3]`. 
+ + When :attr:`spacing` is specified, it modifies the relationship between :attr:`input` and input coordinates. + This is detailed in the "Keyword Arguments" section below. + + The gradient is estimated by estimating each partial derivative of :math:`g` independently. This estimation is + accurate if :math:`g` is in :math:`C^3` (it has at least 3 continuous derivatives), and the estimation can be + improved by providing closer samples. Mathematically, the value at each interior point of a partial derivative + is estimated using `Taylor's theorem with remainder `_. + Letting :math:`x` be an interior point with :math:`x-h_l` and :math:`x+h_r` be points neighboring + it to the left and right respectively, :math:`f(x+h_r)` and :math:`f(x-h_l)` can be estimated using: + + .. math:: + \begin{aligned} + f(x+h_r) = f(x) + h_r f'(x) + {h_r}^2 \frac{f''(x)}{2} + {h_r}^3 \frac{f'''(\xi_1)}{6}, \xi_1 \in (x, x+h_r) \\ + f(x-h_l) = f(x) - h_l f'(x) + {h_l}^2 \frac{f''(x)}{2} - {h_l}^3 \frac{f'''(\xi_2)}{6}, \xi_2 \in (x, x-h_l) \\ + \end{aligned} + + Using the fact that :math:`f \in C^3` and solving the linear system, we derive: + + .. math:: + f'(x) \approx \frac{ {h_l}^2 f(x+h_r) - {h_r}^2 f(x-h_l) + + ({h_r}^2-{h_l}^2 ) f(x) }{ {h_r} {h_l}^2 + {h_r}^2 {h_l} } + + .. note:: + We estimate the gradient of functions in complex domain + :math:`g : \mathbb{C}^n \rightarrow \mathbb{C}` in the same way. + + The value of each partial derivative at the boundary points is computed differently. See edge_order below. + + Args: + input (``Tensor``): the tensor that represents the values of the function + + Keyword args: + spacing (``scalar``, ``list of scalar``, ``list of Tensor``, optional): :attr:`spacing` can be used to modify + how the :attr:`input` tensor's indices relate to sample coordinates. If :attr:`spacing` is a scalar then + the indices are multiplied by the scalar to produce the coordinates. For example, if :attr:`spacing=2` the + indices (1, 2, 3) become coordinates (2, 4, 6). If :attr:`spacing` is a list of scalars then the corresponding + indices are multiplied. For example, if :attr:`spacing=(2, -1, 3)` the indices (1, 2, 3) become coordinates (2, -2, 9). + Finally, if :attr:`spacing` is a list of one-dimensional tensors then each tensor specifies the coordinates for + the corresponding dimension. For example, if the indices are (1, 2, 3) and the tensors are (t0, t1, t2), then + the coordinates are (t0[1], t1[2], t2[3]) + + dim (``int``, ``list of int``, optional): the dimension or dimensions to approximate the gradient over. By default + the partial gradient in every dimension is computed. Note that when :attr:`dim` is specified the elements of + the :attr:`spacing` argument must correspond with the specified dims." + + edge_order (``int``, optional): 1 or 2, for `first-order + `_ or + `second-order `_ + estimation of the boundary ("edge") values, respectively. + + Examples:: + + >>> # Estimates the gradient of f(x)=x^2 at points [-2, -1, 2, 4] + >>> coordinates = (torch.tensor([-2., -1., 1., 4.]),) + >>> values = torch.tensor([4., 1., 1., 16.], ) + >>> torch.gradient(values, spacing = coordinates) + (tensor([-3., -2., 2., 5.]),) + + >>> # Estimates the gradient of the R^2 -> R function whose samples are + >>> # described by the tensor t. Implicit coordinates are [0, 1] for the outermost + >>> # dimension and [0, 1, 2, 3] for the innermost dimension, and function estimates + >>> # partial derivative for both dimensions. 
+ >>> t = torch.tensor([[1, 2, 4, 8], [10, 20, 40, 80]]) + >>> torch.gradient(t) + (tensor([[ 9., 18., 36., 72.], + [ 9., 18., 36., 72.]]), + tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]])) + + >>> # A scalar value for spacing modifies the relationship between tensor indices + >>> # and input coordinates by multiplying the indices to find the + >>> # coordinates. For example, below the indices of the innermost + >>> # 0, 1, 2, 3 translate to coordinates of [0, 2, 4, 6], and the indices of + >>> # the outermost dimension 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = 2.0) # dim = None (implicitly [0, 1]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.5000, 0.7500, 1.5000, 2.0000], + [ 5.0000, 7.5000, 15.0000, 20.0000]])) + >>> # doubling the spacing between samples halves the estimated partial gradients. + + >>> + >>> # Estimates only the partial derivative for dimension 1 + >>> torch.gradient(t, dim = 1) # spacing = None (implicitly 1.) + (tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]]),) + + >>> # When spacing is a list of scalars, the relationship between the tensor + >>> # indices and input coordinates changes based on dimension. + >>> # For example, below, the indices of the innermost dimension 0, 1, 2, 3 translate + >>> # to coordinates of [0, 3, 6, 9], and the indices of the outermost dimension + >>> # 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = [3., 2.]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + + >>> # The following example is a replication of the previous one with explicit + >>> # coordinates. + >>> coords = (torch.tensor([0, 2]), torch.tensor([0, 3, 6, 9])) + >>> torch.gradient(t, spacing = coords) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + """ + ... +@overload +def gradient(input: Tensor, *, spacing: Sequence[Union[Number, _complex]], dim: Optional[_int] = None, edge_order: _int = 1) -> Tuple[Tensor, ...]: + r""" + gradient(input, *, spacing=1, dim=None, edge_order=1) -> List of Tensors + + Estimates the gradient of a function :math:`g : \mathbb{R}^n \rightarrow \mathbb{R}` in + one or more dimensions using the `second-order accurate central differences method + `_ and + either first or second order estimates at the boundaries. + + The gradient of :math:`g` is estimated using samples. By default, when :attr:`spacing` is not + specified, the samples are entirely described by :attr:`input`, and the mapping of input coordinates + to an output is the same as the tensor's mapping of indices to values. For example, for a three-dimensional + :attr:`input` the function described is :math:`g : \mathbb{R}^3 \rightarrow \mathbb{R}`, and + :math:`g(1, 2, 3)\ == input[1, 2, 3]`. + + When :attr:`spacing` is specified, it modifies the relationship between :attr:`input` and input coordinates. + This is detailed in the "Keyword Arguments" section below. + + The gradient is estimated by estimating each partial derivative of :math:`g` independently. This estimation is + accurate if :math:`g` is in :math:`C^3` (it has at least 3 continuous derivatives), and the estimation can be + improved by providing closer samples. 
Mathematically, the value at each interior point of a partial derivative + is estimated using `Taylor's theorem with remainder `_. + Letting :math:`x` be an interior point with :math:`x-h_l` and :math:`x+h_r` be points neighboring + it to the left and right respectively, :math:`f(x+h_r)` and :math:`f(x-h_l)` can be estimated using: + + .. math:: + \begin{aligned} + f(x+h_r) = f(x) + h_r f'(x) + {h_r}^2 \frac{f''(x)}{2} + {h_r}^3 \frac{f'''(\xi_1)}{6}, \xi_1 \in (x, x+h_r) \\ + f(x-h_l) = f(x) - h_l f'(x) + {h_l}^2 \frac{f''(x)}{2} - {h_l}^3 \frac{f'''(\xi_2)}{6}, \xi_2 \in (x, x-h_l) \\ + \end{aligned} + + Using the fact that :math:`f \in C^3` and solving the linear system, we derive: + + .. math:: + f'(x) \approx \frac{ {h_l}^2 f(x+h_r) - {h_r}^2 f(x-h_l) + + ({h_r}^2-{h_l}^2 ) f(x) }{ {h_r} {h_l}^2 + {h_r}^2 {h_l} } + + .. note:: + We estimate the gradient of functions in complex domain + :math:`g : \mathbb{C}^n \rightarrow \mathbb{C}` in the same way. + + The value of each partial derivative at the boundary points is computed differently. See edge_order below. + + Args: + input (``Tensor``): the tensor that represents the values of the function + + Keyword args: + spacing (``scalar``, ``list of scalar``, ``list of Tensor``, optional): :attr:`spacing` can be used to modify + how the :attr:`input` tensor's indices relate to sample coordinates. If :attr:`spacing` is a scalar then + the indices are multiplied by the scalar to produce the coordinates. For example, if :attr:`spacing=2` the + indices (1, 2, 3) become coordinates (2, 4, 6). If :attr:`spacing` is a list of scalars then the corresponding + indices are multiplied. For example, if :attr:`spacing=(2, -1, 3)` the indices (1, 2, 3) become coordinates (2, -2, 9). + Finally, if :attr:`spacing` is a list of one-dimensional tensors then each tensor specifies the coordinates for + the corresponding dimension. For example, if the indices are (1, 2, 3) and the tensors are (t0, t1, t2), then + the coordinates are (t0[1], t1[2], t2[3]) + + dim (``int``, ``list of int``, optional): the dimension or dimensions to approximate the gradient over. By default + the partial gradient in every dimension is computed. Note that when :attr:`dim` is specified the elements of + the :attr:`spacing` argument must correspond with the specified dims." + + edge_order (``int``, optional): 1 or 2, for `first-order + `_ or + `second-order `_ + estimation of the boundary ("edge") values, respectively. + + Examples:: + + >>> # Estimates the gradient of f(x)=x^2 at points [-2, -1, 2, 4] + >>> coordinates = (torch.tensor([-2., -1., 1., 4.]),) + >>> values = torch.tensor([4., 1., 1., 16.], ) + >>> torch.gradient(values, spacing = coordinates) + (tensor([-3., -2., 2., 5.]),) + + >>> # Estimates the gradient of the R^2 -> R function whose samples are + >>> # described by the tensor t. Implicit coordinates are [0, 1] for the outermost + >>> # dimension and [0, 1, 2, 3] for the innermost dimension, and function estimates + >>> # partial derivative for both dimensions. + >>> t = torch.tensor([[1, 2, 4, 8], [10, 20, 40, 80]]) + >>> torch.gradient(t) + (tensor([[ 9., 18., 36., 72.], + [ 9., 18., 36., 72.]]), + tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]])) + + >>> # A scalar value for spacing modifies the relationship between tensor indices + >>> # and input coordinates by multiplying the indices to find the + >>> # coordinates. 
For example, below the indices of the innermost + >>> # 0, 1, 2, 3 translate to coordinates of [0, 2, 4, 6], and the indices of + >>> # the outermost dimension 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = 2.0) # dim = None (implicitly [0, 1]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.5000, 0.7500, 1.5000, 2.0000], + [ 5.0000, 7.5000, 15.0000, 20.0000]])) + >>> # doubling the spacing between samples halves the estimated partial gradients. + + >>> + >>> # Estimates only the partial derivative for dimension 1 + >>> torch.gradient(t, dim = 1) # spacing = None (implicitly 1.) + (tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]]),) + + >>> # When spacing is a list of scalars, the relationship between the tensor + >>> # indices and input coordinates changes based on dimension. + >>> # For example, below, the indices of the innermost dimension 0, 1, 2, 3 translate + >>> # to coordinates of [0, 3, 6, 9], and the indices of the outermost dimension + >>> # 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = [3., 2.]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + + >>> # The following example is a replication of the previous one with explicit + >>> # coordinates. + >>> coords = (torch.tensor([0, 2]), torch.tensor([0, 3, 6, 9])) + >>> torch.gradient(t, spacing = coords) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + """ + ... +@overload +def gradient(input: Tensor, *, spacing: Sequence[Union[Number, _complex]], dim: _size, edge_order: _int = 1) -> Tuple[Tensor, ...]: + r""" + gradient(input, *, spacing=1, dim=None, edge_order=1) -> List of Tensors + + Estimates the gradient of a function :math:`g : \mathbb{R}^n \rightarrow \mathbb{R}` in + one or more dimensions using the `second-order accurate central differences method + `_ and + either first or second order estimates at the boundaries. + + The gradient of :math:`g` is estimated using samples. By default, when :attr:`spacing` is not + specified, the samples are entirely described by :attr:`input`, and the mapping of input coordinates + to an output is the same as the tensor's mapping of indices to values. For example, for a three-dimensional + :attr:`input` the function described is :math:`g : \mathbb{R}^3 \rightarrow \mathbb{R}`, and + :math:`g(1, 2, 3)\ == input[1, 2, 3]`. + + When :attr:`spacing` is specified, it modifies the relationship between :attr:`input` and input coordinates. + This is detailed in the "Keyword Arguments" section below. + + The gradient is estimated by estimating each partial derivative of :math:`g` independently. This estimation is + accurate if :math:`g` is in :math:`C^3` (it has at least 3 continuous derivatives), and the estimation can be + improved by providing closer samples. Mathematically, the value at each interior point of a partial derivative + is estimated using `Taylor's theorem with remainder `_. + Letting :math:`x` be an interior point with :math:`x-h_l` and :math:`x+h_r` be points neighboring + it to the left and right respectively, :math:`f(x+h_r)` and :math:`f(x-h_l)` can be estimated using: + + .. 
math:: + \begin{aligned} + f(x+h_r) = f(x) + h_r f'(x) + {h_r}^2 \frac{f''(x)}{2} + {h_r}^3 \frac{f'''(\xi_1)}{6}, \xi_1 \in (x, x+h_r) \\ + f(x-h_l) = f(x) - h_l f'(x) + {h_l}^2 \frac{f''(x)}{2} - {h_l}^3 \frac{f'''(\xi_2)}{6}, \xi_2 \in (x, x-h_l) \\ + \end{aligned} + + Using the fact that :math:`f \in C^3` and solving the linear system, we derive: + + .. math:: + f'(x) \approx \frac{ {h_l}^2 f(x+h_r) - {h_r}^2 f(x-h_l) + + ({h_r}^2-{h_l}^2 ) f(x) }{ {h_r} {h_l}^2 + {h_r}^2 {h_l} } + + .. note:: + We estimate the gradient of functions in complex domain + :math:`g : \mathbb{C}^n \rightarrow \mathbb{C}` in the same way. + + The value of each partial derivative at the boundary points is computed differently. See edge_order below. + + Args: + input (``Tensor``): the tensor that represents the values of the function + + Keyword args: + spacing (``scalar``, ``list of scalar``, ``list of Tensor``, optional): :attr:`spacing` can be used to modify + how the :attr:`input` tensor's indices relate to sample coordinates. If :attr:`spacing` is a scalar then + the indices are multiplied by the scalar to produce the coordinates. For example, if :attr:`spacing=2` the + indices (1, 2, 3) become coordinates (2, 4, 6). If :attr:`spacing` is a list of scalars then the corresponding + indices are multiplied. For example, if :attr:`spacing=(2, -1, 3)` the indices (1, 2, 3) become coordinates (2, -2, 9). + Finally, if :attr:`spacing` is a list of one-dimensional tensors then each tensor specifies the coordinates for + the corresponding dimension. For example, if the indices are (1, 2, 3) and the tensors are (t0, t1, t2), then + the coordinates are (t0[1], t1[2], t2[3]) + + dim (``int``, ``list of int``, optional): the dimension or dimensions to approximate the gradient over. By default + the partial gradient in every dimension is computed. Note that when :attr:`dim` is specified the elements of + the :attr:`spacing` argument must correspond with the specified dims." + + edge_order (``int``, optional): 1 or 2, for `first-order + `_ or + `second-order `_ + estimation of the boundary ("edge") values, respectively. + + Examples:: + + >>> # Estimates the gradient of f(x)=x^2 at points [-2, -1, 2, 4] + >>> coordinates = (torch.tensor([-2., -1., 1., 4.]),) + >>> values = torch.tensor([4., 1., 1., 16.], ) + >>> torch.gradient(values, spacing = coordinates) + (tensor([-3., -2., 2., 5.]),) + + >>> # Estimates the gradient of the R^2 -> R function whose samples are + >>> # described by the tensor t. Implicit coordinates are [0, 1] for the outermost + >>> # dimension and [0, 1, 2, 3] for the innermost dimension, and function estimates + >>> # partial derivative for both dimensions. + >>> t = torch.tensor([[1, 2, 4, 8], [10, 20, 40, 80]]) + >>> torch.gradient(t) + (tensor([[ 9., 18., 36., 72.], + [ 9., 18., 36., 72.]]), + tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]])) + + >>> # A scalar value for spacing modifies the relationship between tensor indices + >>> # and input coordinates by multiplying the indices to find the + >>> # coordinates. For example, below the indices of the innermost + >>> # 0, 1, 2, 3 translate to coordinates of [0, 2, 4, 6], and the indices of + >>> # the outermost dimension 0, 1 translate to coordinates of [0, 2]. 
+ >>> torch.gradient(t, spacing = 2.0) # dim = None (implicitly [0, 1]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.5000, 0.7500, 1.5000, 2.0000], + [ 5.0000, 7.5000, 15.0000, 20.0000]])) + >>> # doubling the spacing between samples halves the estimated partial gradients. + + >>> + >>> # Estimates only the partial derivative for dimension 1 + >>> torch.gradient(t, dim = 1) # spacing = None (implicitly 1.) + (tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]]),) + + >>> # When spacing is a list of scalars, the relationship between the tensor + >>> # indices and input coordinates changes based on dimension. + >>> # For example, below, the indices of the innermost dimension 0, 1, 2, 3 translate + >>> # to coordinates of [0, 3, 6, 9], and the indices of the outermost dimension + >>> # 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = [3., 2.]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + + >>> # The following example is a replication of the previous one with explicit + >>> # coordinates. + >>> coords = (torch.tensor([0, 2]), torch.tensor([0, 3, 6, 9])) + >>> torch.gradient(t, spacing = coords) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + """ + ... +@overload +def gradient(input: Tensor, *, spacing: Union[Tuple[Tensor, ...], List[Tensor]], dim: Optional[_int] = None, edge_order: _int = 1) -> Tuple[Tensor, ...]: + r""" + gradient(input, *, spacing=1, dim=None, edge_order=1) -> List of Tensors + + Estimates the gradient of a function :math:`g : \mathbb{R}^n \rightarrow \mathbb{R}` in + one or more dimensions using the `second-order accurate central differences method + `_ and + either first or second order estimates at the boundaries. + + The gradient of :math:`g` is estimated using samples. By default, when :attr:`spacing` is not + specified, the samples are entirely described by :attr:`input`, and the mapping of input coordinates + to an output is the same as the tensor's mapping of indices to values. For example, for a three-dimensional + :attr:`input` the function described is :math:`g : \mathbb{R}^3 \rightarrow \mathbb{R}`, and + :math:`g(1, 2, 3)\ == input[1, 2, 3]`. + + When :attr:`spacing` is specified, it modifies the relationship between :attr:`input` and input coordinates. + This is detailed in the "Keyword Arguments" section below. + + The gradient is estimated by estimating each partial derivative of :math:`g` independently. This estimation is + accurate if :math:`g` is in :math:`C^3` (it has at least 3 continuous derivatives), and the estimation can be + improved by providing closer samples. Mathematically, the value at each interior point of a partial derivative + is estimated using `Taylor's theorem with remainder `_. + Letting :math:`x` be an interior point with :math:`x-h_l` and :math:`x+h_r` be points neighboring + it to the left and right respectively, :math:`f(x+h_r)` and :math:`f(x-h_l)` can be estimated using: + + .. 
math:: + \begin{aligned} + f(x+h_r) = f(x) + h_r f'(x) + {h_r}^2 \frac{f''(x)}{2} + {h_r}^3 \frac{f'''(\xi_1)}{6}, \xi_1 \in (x, x+h_r) \\ + f(x-h_l) = f(x) - h_l f'(x) + {h_l}^2 \frac{f''(x)}{2} - {h_l}^3 \frac{f'''(\xi_2)}{6}, \xi_2 \in (x, x-h_l) \\ + \end{aligned} + + Using the fact that :math:`f \in C^3` and solving the linear system, we derive: + + .. math:: + f'(x) \approx \frac{ {h_l}^2 f(x+h_r) - {h_r}^2 f(x-h_l) + + ({h_r}^2-{h_l}^2 ) f(x) }{ {h_r} {h_l}^2 + {h_r}^2 {h_l} } + + .. note:: + We estimate the gradient of functions in complex domain + :math:`g : \mathbb{C}^n \rightarrow \mathbb{C}` in the same way. + + The value of each partial derivative at the boundary points is computed differently. See edge_order below. + + Args: + input (``Tensor``): the tensor that represents the values of the function + + Keyword args: + spacing (``scalar``, ``list of scalar``, ``list of Tensor``, optional): :attr:`spacing` can be used to modify + how the :attr:`input` tensor's indices relate to sample coordinates. If :attr:`spacing` is a scalar then + the indices are multiplied by the scalar to produce the coordinates. For example, if :attr:`spacing=2` the + indices (1, 2, 3) become coordinates (2, 4, 6). If :attr:`spacing` is a list of scalars then the corresponding + indices are multiplied. For example, if :attr:`spacing=(2, -1, 3)` the indices (1, 2, 3) become coordinates (2, -2, 9). + Finally, if :attr:`spacing` is a list of one-dimensional tensors then each tensor specifies the coordinates for + the corresponding dimension. For example, if the indices are (1, 2, 3) and the tensors are (t0, t1, t2), then + the coordinates are (t0[1], t1[2], t2[3]) + + dim (``int``, ``list of int``, optional): the dimension or dimensions to approximate the gradient over. By default + the partial gradient in every dimension is computed. Note that when :attr:`dim` is specified the elements of + the :attr:`spacing` argument must correspond with the specified dims." + + edge_order (``int``, optional): 1 or 2, for `first-order + `_ or + `second-order `_ + estimation of the boundary ("edge") values, respectively. + + Examples:: + + >>> # Estimates the gradient of f(x)=x^2 at points [-2, -1, 2, 4] + >>> coordinates = (torch.tensor([-2., -1., 1., 4.]),) + >>> values = torch.tensor([4., 1., 1., 16.], ) + >>> torch.gradient(values, spacing = coordinates) + (tensor([-3., -2., 2., 5.]),) + + >>> # Estimates the gradient of the R^2 -> R function whose samples are + >>> # described by the tensor t. Implicit coordinates are [0, 1] for the outermost + >>> # dimension and [0, 1, 2, 3] for the innermost dimension, and function estimates + >>> # partial derivative for both dimensions. + >>> t = torch.tensor([[1, 2, 4, 8], [10, 20, 40, 80]]) + >>> torch.gradient(t) + (tensor([[ 9., 18., 36., 72.], + [ 9., 18., 36., 72.]]), + tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]])) + + >>> # A scalar value for spacing modifies the relationship between tensor indices + >>> # and input coordinates by multiplying the indices to find the + >>> # coordinates. For example, below the indices of the innermost + >>> # 0, 1, 2, 3 translate to coordinates of [0, 2, 4, 6], and the indices of + >>> # the outermost dimension 0, 1 translate to coordinates of [0, 2]. 
+ >>> torch.gradient(t, spacing = 2.0) # dim = None (implicitly [0, 1]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.5000, 0.7500, 1.5000, 2.0000], + [ 5.0000, 7.5000, 15.0000, 20.0000]])) + >>> # doubling the spacing between samples halves the estimated partial gradients. + + >>> + >>> # Estimates only the partial derivative for dimension 1 + >>> torch.gradient(t, dim = 1) # spacing = None (implicitly 1.) + (tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]]),) + + >>> # When spacing is a list of scalars, the relationship between the tensor + >>> # indices and input coordinates changes based on dimension. + >>> # For example, below, the indices of the innermost dimension 0, 1, 2, 3 translate + >>> # to coordinates of [0, 3, 6, 9], and the indices of the outermost dimension + >>> # 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = [3., 2.]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + + >>> # The following example is a replication of the previous one with explicit + >>> # coordinates. + >>> coords = (torch.tensor([0, 2]), torch.tensor([0, 3, 6, 9])) + >>> torch.gradient(t, spacing = coords) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + """ + ... +@overload +def gradient(input: Tensor, *, spacing: Union[Number, _complex], dim: _size, edge_order: _int = 1) -> Tuple[Tensor, ...]: + r""" + gradient(input, *, spacing=1, dim=None, edge_order=1) -> List of Tensors + + Estimates the gradient of a function :math:`g : \mathbb{R}^n \rightarrow \mathbb{R}` in + one or more dimensions using the `second-order accurate central differences method + `_ and + either first or second order estimates at the boundaries. + + The gradient of :math:`g` is estimated using samples. By default, when :attr:`spacing` is not + specified, the samples are entirely described by :attr:`input`, and the mapping of input coordinates + to an output is the same as the tensor's mapping of indices to values. For example, for a three-dimensional + :attr:`input` the function described is :math:`g : \mathbb{R}^3 \rightarrow \mathbb{R}`, and + :math:`g(1, 2, 3)\ == input[1, 2, 3]`. + + When :attr:`spacing` is specified, it modifies the relationship between :attr:`input` and input coordinates. + This is detailed in the "Keyword Arguments" section below. + + The gradient is estimated by estimating each partial derivative of :math:`g` independently. This estimation is + accurate if :math:`g` is in :math:`C^3` (it has at least 3 continuous derivatives), and the estimation can be + improved by providing closer samples. Mathematically, the value at each interior point of a partial derivative + is estimated using `Taylor's theorem with remainder `_. + Letting :math:`x` be an interior point with :math:`x-h_l` and :math:`x+h_r` be points neighboring + it to the left and right respectively, :math:`f(x+h_r)` and :math:`f(x-h_l)` can be estimated using: + + .. 
math:: + \begin{aligned} + f(x+h_r) = f(x) + h_r f'(x) + {h_r}^2 \frac{f''(x)}{2} + {h_r}^3 \frac{f'''(\xi_1)}{6}, \xi_1 \in (x, x+h_r) \\ + f(x-h_l) = f(x) - h_l f'(x) + {h_l}^2 \frac{f''(x)}{2} - {h_l}^3 \frac{f'''(\xi_2)}{6}, \xi_2 \in (x, x-h_l) \\ + \end{aligned} + + Using the fact that :math:`f \in C^3` and solving the linear system, we derive: + + .. math:: + f'(x) \approx \frac{ {h_l}^2 f(x+h_r) - {h_r}^2 f(x-h_l) + + ({h_r}^2-{h_l}^2 ) f(x) }{ {h_r} {h_l}^2 + {h_r}^2 {h_l} } + + .. note:: + We estimate the gradient of functions in complex domain + :math:`g : \mathbb{C}^n \rightarrow \mathbb{C}` in the same way. + + The value of each partial derivative at the boundary points is computed differently. See edge_order below. + + Args: + input (``Tensor``): the tensor that represents the values of the function + + Keyword args: + spacing (``scalar``, ``list of scalar``, ``list of Tensor``, optional): :attr:`spacing` can be used to modify + how the :attr:`input` tensor's indices relate to sample coordinates. If :attr:`spacing` is a scalar then + the indices are multiplied by the scalar to produce the coordinates. For example, if :attr:`spacing=2` the + indices (1, 2, 3) become coordinates (2, 4, 6). If :attr:`spacing` is a list of scalars then the corresponding + indices are multiplied. For example, if :attr:`spacing=(2, -1, 3)` the indices (1, 2, 3) become coordinates (2, -2, 9). + Finally, if :attr:`spacing` is a list of one-dimensional tensors then each tensor specifies the coordinates for + the corresponding dimension. For example, if the indices are (1, 2, 3) and the tensors are (t0, t1, t2), then + the coordinates are (t0[1], t1[2], t2[3]) + + dim (``int``, ``list of int``, optional): the dimension or dimensions to approximate the gradient over. By default + the partial gradient in every dimension is computed. Note that when :attr:`dim` is specified the elements of + the :attr:`spacing` argument must correspond with the specified dims." + + edge_order (``int``, optional): 1 or 2, for `first-order + `_ or + `second-order `_ + estimation of the boundary ("edge") values, respectively. + + Examples:: + + >>> # Estimates the gradient of f(x)=x^2 at points [-2, -1, 2, 4] + >>> coordinates = (torch.tensor([-2., -1., 1., 4.]),) + >>> values = torch.tensor([4., 1., 1., 16.], ) + >>> torch.gradient(values, spacing = coordinates) + (tensor([-3., -2., 2., 5.]),) + + >>> # Estimates the gradient of the R^2 -> R function whose samples are + >>> # described by the tensor t. Implicit coordinates are [0, 1] for the outermost + >>> # dimension and [0, 1, 2, 3] for the innermost dimension, and function estimates + >>> # partial derivative for both dimensions. + >>> t = torch.tensor([[1, 2, 4, 8], [10, 20, 40, 80]]) + >>> torch.gradient(t) + (tensor([[ 9., 18., 36., 72.], + [ 9., 18., 36., 72.]]), + tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]])) + + >>> # A scalar value for spacing modifies the relationship between tensor indices + >>> # and input coordinates by multiplying the indices to find the + >>> # coordinates. For example, below the indices of the innermost + >>> # 0, 1, 2, 3 translate to coordinates of [0, 2, 4, 6], and the indices of + >>> # the outermost dimension 0, 1 translate to coordinates of [0, 2]. 
+ >>> torch.gradient(t, spacing = 2.0) # dim = None (implicitly [0, 1]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.5000, 0.7500, 1.5000, 2.0000], + [ 5.0000, 7.5000, 15.0000, 20.0000]])) + >>> # doubling the spacing between samples halves the estimated partial gradients. + + >>> + >>> # Estimates only the partial derivative for dimension 1 + >>> torch.gradient(t, dim = 1) # spacing = None (implicitly 1.) + (tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]]),) + + >>> # When spacing is a list of scalars, the relationship between the tensor + >>> # indices and input coordinates changes based on dimension. + >>> # For example, below, the indices of the innermost dimension 0, 1, 2, 3 translate + >>> # to coordinates of [0, 3, 6, 9], and the indices of the outermost dimension + >>> # 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = [3., 2.]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + + >>> # The following example is a replication of the previous one with explicit + >>> # coordinates. + >>> coords = (torch.tensor([0, 2]), torch.tensor([0, 3, 6, 9])) + >>> torch.gradient(t, spacing = coords) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + """ + ... +@overload +def gradient(input: Tensor, *, spacing: Union[Tuple[Tensor, ...], List[Tensor]], dim: _size, edge_order: _int = 1) -> Tuple[Tensor, ...]: + r""" + gradient(input, *, spacing=1, dim=None, edge_order=1) -> List of Tensors + + Estimates the gradient of a function :math:`g : \mathbb{R}^n \rightarrow \mathbb{R}` in + one or more dimensions using the `second-order accurate central differences method + `_ and + either first or second order estimates at the boundaries. + + The gradient of :math:`g` is estimated using samples. By default, when :attr:`spacing` is not + specified, the samples are entirely described by :attr:`input`, and the mapping of input coordinates + to an output is the same as the tensor's mapping of indices to values. For example, for a three-dimensional + :attr:`input` the function described is :math:`g : \mathbb{R}^3 \rightarrow \mathbb{R}`, and + :math:`g(1, 2, 3)\ == input[1, 2, 3]`. + + When :attr:`spacing` is specified, it modifies the relationship between :attr:`input` and input coordinates. + This is detailed in the "Keyword Arguments" section below. + + The gradient is estimated by estimating each partial derivative of :math:`g` independently. This estimation is + accurate if :math:`g` is in :math:`C^3` (it has at least 3 continuous derivatives), and the estimation can be + improved by providing closer samples. Mathematically, the value at each interior point of a partial derivative + is estimated using `Taylor's theorem with remainder `_. + Letting :math:`x` be an interior point with :math:`x-h_l` and :math:`x+h_r` be points neighboring + it to the left and right respectively, :math:`f(x+h_r)` and :math:`f(x-h_l)` can be estimated using: + + .. 
math:: + \begin{aligned} + f(x+h_r) = f(x) + h_r f'(x) + {h_r}^2 \frac{f''(x)}{2} + {h_r}^3 \frac{f'''(\xi_1)}{6}, \xi_1 \in (x, x+h_r) \\ + f(x-h_l) = f(x) - h_l f'(x) + {h_l}^2 \frac{f''(x)}{2} - {h_l}^3 \frac{f'''(\xi_2)}{6}, \xi_2 \in (x, x-h_l) \\ + \end{aligned} + + Using the fact that :math:`f \in C^3` and solving the linear system, we derive: + + .. math:: + f'(x) \approx \frac{ {h_l}^2 f(x+h_r) - {h_r}^2 f(x-h_l) + + ({h_r}^2-{h_l}^2 ) f(x) }{ {h_r} {h_l}^2 + {h_r}^2 {h_l} } + + .. note:: + We estimate the gradient of functions in complex domain + :math:`g : \mathbb{C}^n \rightarrow \mathbb{C}` in the same way. + + The value of each partial derivative at the boundary points is computed differently. See edge_order below. + + Args: + input (``Tensor``): the tensor that represents the values of the function + + Keyword args: + spacing (``scalar``, ``list of scalar``, ``list of Tensor``, optional): :attr:`spacing` can be used to modify + how the :attr:`input` tensor's indices relate to sample coordinates. If :attr:`spacing` is a scalar then + the indices are multiplied by the scalar to produce the coordinates. For example, if :attr:`spacing=2` the + indices (1, 2, 3) become coordinates (2, 4, 6). If :attr:`spacing` is a list of scalars then the corresponding + indices are multiplied. For example, if :attr:`spacing=(2, -1, 3)` the indices (1, 2, 3) become coordinates (2, -2, 9). + Finally, if :attr:`spacing` is a list of one-dimensional tensors then each tensor specifies the coordinates for + the corresponding dimension. For example, if the indices are (1, 2, 3) and the tensors are (t0, t1, t2), then + the coordinates are (t0[1], t1[2], t2[3]) + + dim (``int``, ``list of int``, optional): the dimension or dimensions to approximate the gradient over. By default + the partial gradient in every dimension is computed. Note that when :attr:`dim` is specified the elements of + the :attr:`spacing` argument must correspond with the specified dims." + + edge_order (``int``, optional): 1 or 2, for `first-order + `_ or + `second-order `_ + estimation of the boundary ("edge") values, respectively. + + Examples:: + + >>> # Estimates the gradient of f(x)=x^2 at points [-2, -1, 2, 4] + >>> coordinates = (torch.tensor([-2., -1., 1., 4.]),) + >>> values = torch.tensor([4., 1., 1., 16.], ) + >>> torch.gradient(values, spacing = coordinates) + (tensor([-3., -2., 2., 5.]),) + + >>> # Estimates the gradient of the R^2 -> R function whose samples are + >>> # described by the tensor t. Implicit coordinates are [0, 1] for the outermost + >>> # dimension and [0, 1, 2, 3] for the innermost dimension, and function estimates + >>> # partial derivative for both dimensions. + >>> t = torch.tensor([[1, 2, 4, 8], [10, 20, 40, 80]]) + >>> torch.gradient(t) + (tensor([[ 9., 18., 36., 72.], + [ 9., 18., 36., 72.]]), + tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]])) + + >>> # A scalar value for spacing modifies the relationship between tensor indices + >>> # and input coordinates by multiplying the indices to find the + >>> # coordinates. For example, below the indices of the innermost + >>> # 0, 1, 2, 3 translate to coordinates of [0, 2, 4, 6], and the indices of + >>> # the outermost dimension 0, 1 translate to coordinates of [0, 2]. 
+ >>> torch.gradient(t, spacing = 2.0) # dim = None (implicitly [0, 1]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.5000, 0.7500, 1.5000, 2.0000], + [ 5.0000, 7.5000, 15.0000, 20.0000]])) + >>> # doubling the spacing between samples halves the estimated partial gradients. + + >>> + >>> # Estimates only the partial derivative for dimension 1 + >>> torch.gradient(t, dim = 1) # spacing = None (implicitly 1.) + (tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]]),) + + >>> # When spacing is a list of scalars, the relationship between the tensor + >>> # indices and input coordinates changes based on dimension. + >>> # For example, below, the indices of the innermost dimension 0, 1, 2, 3 translate + >>> # to coordinates of [0, 3, 6, 9], and the indices of the outermost dimension + >>> # 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = [3., 2.]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + + >>> # The following example is a replication of the previous one with explicit + >>> # coordinates. + >>> coords = (torch.tensor([0, 2]), torch.tensor([0, 3, 6, 9])) + >>> torch.gradient(t, spacing = coords) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + """ + ... +@overload +def gradient(input: Tensor, *, dim: _size, edge_order: _int = 1) -> Tuple[Tensor, ...]: + r""" + gradient(input, *, spacing=1, dim=None, edge_order=1) -> List of Tensors + + Estimates the gradient of a function :math:`g : \mathbb{R}^n \rightarrow \mathbb{R}` in + one or more dimensions using the `second-order accurate central differences method + `_ and + either first or second order estimates at the boundaries. + + The gradient of :math:`g` is estimated using samples. By default, when :attr:`spacing` is not + specified, the samples are entirely described by :attr:`input`, and the mapping of input coordinates + to an output is the same as the tensor's mapping of indices to values. For example, for a three-dimensional + :attr:`input` the function described is :math:`g : \mathbb{R}^3 \rightarrow \mathbb{R}`, and + :math:`g(1, 2, 3)\ == input[1, 2, 3]`. + + When :attr:`spacing` is specified, it modifies the relationship between :attr:`input` and input coordinates. + This is detailed in the "Keyword Arguments" section below. + + The gradient is estimated by estimating each partial derivative of :math:`g` independently. This estimation is + accurate if :math:`g` is in :math:`C^3` (it has at least 3 continuous derivatives), and the estimation can be + improved by providing closer samples. Mathematically, the value at each interior point of a partial derivative + is estimated using `Taylor's theorem with remainder `_. + Letting :math:`x` be an interior point with :math:`x-h_l` and :math:`x+h_r` be points neighboring + it to the left and right respectively, :math:`f(x+h_r)` and :math:`f(x-h_l)` can be estimated using: + + .. 
math:: + \begin{aligned} + f(x+h_r) = f(x) + h_r f'(x) + {h_r}^2 \frac{f''(x)}{2} + {h_r}^3 \frac{f'''(\xi_1)}{6}, \xi_1 \in (x, x+h_r) \\ + f(x-h_l) = f(x) - h_l f'(x) + {h_l}^2 \frac{f''(x)}{2} - {h_l}^3 \frac{f'''(\xi_2)}{6}, \xi_2 \in (x, x-h_l) \\ + \end{aligned} + + Using the fact that :math:`f \in C^3` and solving the linear system, we derive: + + .. math:: + f'(x) \approx \frac{ {h_l}^2 f(x+h_r) - {h_r}^2 f(x-h_l) + + ({h_r}^2-{h_l}^2 ) f(x) }{ {h_r} {h_l}^2 + {h_r}^2 {h_l} } + + .. note:: + We estimate the gradient of functions in complex domain + :math:`g : \mathbb{C}^n \rightarrow \mathbb{C}` in the same way. + + The value of each partial derivative at the boundary points is computed differently. See edge_order below. + + Args: + input (``Tensor``): the tensor that represents the values of the function + + Keyword args: + spacing (``scalar``, ``list of scalar``, ``list of Tensor``, optional): :attr:`spacing` can be used to modify + how the :attr:`input` tensor's indices relate to sample coordinates. If :attr:`spacing` is a scalar then + the indices are multiplied by the scalar to produce the coordinates. For example, if :attr:`spacing=2` the + indices (1, 2, 3) become coordinates (2, 4, 6). If :attr:`spacing` is a list of scalars then the corresponding + indices are multiplied. For example, if :attr:`spacing=(2, -1, 3)` the indices (1, 2, 3) become coordinates (2, -2, 9). + Finally, if :attr:`spacing` is a list of one-dimensional tensors then each tensor specifies the coordinates for + the corresponding dimension. For example, if the indices are (1, 2, 3) and the tensors are (t0, t1, t2), then + the coordinates are (t0[1], t1[2], t2[3]) + + dim (``int``, ``list of int``, optional): the dimension or dimensions to approximate the gradient over. By default + the partial gradient in every dimension is computed. Note that when :attr:`dim` is specified the elements of + the :attr:`spacing` argument must correspond with the specified dims." + + edge_order (``int``, optional): 1 or 2, for `first-order + `_ or + `second-order `_ + estimation of the boundary ("edge") values, respectively. + + Examples:: + + >>> # Estimates the gradient of f(x)=x^2 at points [-2, -1, 2, 4] + >>> coordinates = (torch.tensor([-2., -1., 1., 4.]),) + >>> values = torch.tensor([4., 1., 1., 16.], ) + >>> torch.gradient(values, spacing = coordinates) + (tensor([-3., -2., 2., 5.]),) + + >>> # Estimates the gradient of the R^2 -> R function whose samples are + >>> # described by the tensor t. Implicit coordinates are [0, 1] for the outermost + >>> # dimension and [0, 1, 2, 3] for the innermost dimension, and function estimates + >>> # partial derivative for both dimensions. + >>> t = torch.tensor([[1, 2, 4, 8], [10, 20, 40, 80]]) + >>> torch.gradient(t) + (tensor([[ 9., 18., 36., 72.], + [ 9., 18., 36., 72.]]), + tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]])) + + >>> # A scalar value for spacing modifies the relationship between tensor indices + >>> # and input coordinates by multiplying the indices to find the + >>> # coordinates. For example, below the indices of the innermost + >>> # 0, 1, 2, 3 translate to coordinates of [0, 2, 4, 6], and the indices of + >>> # the outermost dimension 0, 1 translate to coordinates of [0, 2]. 
+ >>> torch.gradient(t, spacing = 2.0) # dim = None (implicitly [0, 1]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.5000, 0.7500, 1.5000, 2.0000], + [ 5.0000, 7.5000, 15.0000, 20.0000]])) + >>> # doubling the spacing between samples halves the estimated partial gradients. + + >>> + >>> # Estimates only the partial derivative for dimension 1 + >>> torch.gradient(t, dim = 1) # spacing = None (implicitly 1.) + (tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]]),) + + >>> # When spacing is a list of scalars, the relationship between the tensor + >>> # indices and input coordinates changes based on dimension. + >>> # For example, below, the indices of the innermost dimension 0, 1, 2, 3 translate + >>> # to coordinates of [0, 3, 6, 9], and the indices of the outermost dimension + >>> # 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = [3., 2.]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + + >>> # The following example is a replication of the previous one with explicit + >>> # coordinates. + >>> coords = (torch.tensor([0, 2]), torch.tensor([0, 3, 6, 9])) + >>> torch.gradient(t, spacing = coords) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + """ + ... +@overload +def greater(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + greater(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.gt`. + """ + ... +@overload +def greater(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + greater(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.gt`. + """ + ... +@overload +def greater_equal(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + greater_equal(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.ge`. + """ + ... +@overload +def greater_equal(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + greater_equal(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.ge`. + """ + ... +def grid_sampler(input: Tensor, grid: Tensor, interpolation_mode: _int, padding_mode: _int, align_corners: _bool) -> Tensor: ... +def grid_sampler_2d(input: Tensor, grid: Tensor, interpolation_mode: _int, padding_mode: _int, align_corners: _bool) -> Tensor: ... +def grid_sampler_3d(input: Tensor, grid: Tensor, interpolation_mode: _int, padding_mode: _int, align_corners: _bool) -> Tensor: ... +def group_norm(input: Tensor, num_groups: _int, weight: Optional[Tensor] = None, bias: Optional[Tensor] = None, eps: _float = 1e-05, cudnn_enabled: _bool = True) -> Tensor: ... +@overload +def gru(data: Tensor, batch_sizes: Tensor, hx: Tensor, params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool) -> Tuple[Tensor, Tensor]: ... +@overload +def gru(input: Tensor, hx: Tensor, params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool, batch_first: _bool) -> Tuple[Tensor, Tensor]: ... 
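The gradient docstrings above derive the interior-point estimate from two Taylor expansions and quote the resulting non-uniform central-difference formula. The short sketch below is an illustrative check, not part of the generated stub: it assumes only that torch.gradient is available at runtime, and the helper f plus the sample coordinates are made up for the example. It compares the interior estimate returned by torch.gradient against the quoted formula for a cubic sampled on non-uniform coordinates (h_l = 1, h_r = 2 around x = 1).

    import torch

    def f(x):
        # Illustrative test function; any smooth function works here.
        return x ** 3

    # Non-uniform sample coordinates; around x = 1 we have h_l = 1 and h_r = 2.
    coords = torch.tensor([0.0, 1.0, 3.0, 4.0])
    values = f(coords)

    # torch.gradient returns a tuple with one tensor per differentiated dimension.
    (estimated,) = torch.gradient(values, spacing=(coords,))

    # The interior-point formula from the docstring, evaluated by hand at x = 1.
    x, h_l, h_r = 1.0, 1.0, 2.0
    manual = (h_l**2 * f(x + h_r) - h_r**2 * f(x - h_l)
              + (h_r**2 - h_l**2) * f(x)) / (h_r * h_l**2 + h_r**2 * h_l)

    print(estimated[1].item(), manual)  # both should print 5.0 for this sample

Both numbers agree because the interior points follow exactly the second-order scheme quoted above. The exact derivative at x = 1 is 3; the difference of 2 matches the h_l * h_r * f'''(x) / 6 remainder implied by the Taylor expansions for a cubic.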
+def gru_cell(input: Tensor, hx: Tensor, w_ih: Tensor, w_hh: Tensor, b_ih: Optional[Tensor] = None, b_hh: Optional[Tensor] = None) -> Tensor: ... +@overload +def gt(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + gt(input, other, *, out=None) -> Tensor + + Computes :math:`\text{input} > \text{other}` element-wise. + + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is greater than :attr:`other` and False elsewhere + + Example:: + + >>> torch.gt(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[False, True], [False, False]]) + """ + ... +@overload +def gt(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + gt(input, other, *, out=None) -> Tensor + + Computes :math:`\text{input} > \text{other}` element-wise. + + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is greater than :attr:`other` and False elsewhere + + Example:: + + >>> torch.gt(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[False, True], [False, False]]) + """ + ... +@overload +def hamming_window(window_length: _int, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + hamming_window(window_length, periodic=True, alpha=0.54, beta=0.46, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Hamming window function. + + .. math:: + w[n] = \alpha - \beta\ \cos \left( \frac{2 \pi n}{N - 1} \right), + + where :math:`N` is the full window size. + + The input :attr:`window_length` is a positive integer controlling the + returned window size. :attr:`periodic` flag determines whether the returned + window trims off the last duplicate value from the symmetric window and is + ready to be used as a periodic window with functions like + :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in + above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have + ``torch.hamming_window(L, periodic=True)`` equal to + ``torch.hamming_window(L + 1, periodic=False)[:-1])``. + + .. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. + + .. note:: + This is a generalized version of :meth:`torch.hann_window`. + + Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. If False, return a symmetric window. + alpha (float, optional): The coefficient :math:`\alpha` in the equation above + beta (float, optional): The coefficient :math:`\beta` in the equation above + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). 
Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window. + """ + ... +@overload +def hamming_window(window_length: _int, periodic: _bool, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + hamming_window(window_length, periodic=True, alpha=0.54, beta=0.46, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Hamming window function. + + .. math:: + w[n] = \alpha - \beta\ \cos \left( \frac{2 \pi n}{N - 1} \right), + + where :math:`N` is the full window size. + + The input :attr:`window_length` is a positive integer controlling the + returned window size. :attr:`periodic` flag determines whether the returned + window trims off the last duplicate value from the symmetric window and is + ready to be used as a periodic window with functions like + :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in + above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have + ``torch.hamming_window(L, periodic=True)`` equal to + ``torch.hamming_window(L + 1, periodic=False)[:-1])``. + + .. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. + + .. note:: + This is a generalized version of :meth:`torch.hann_window`. + + Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. If False, return a symmetric window. + alpha (float, optional): The coefficient :math:`\alpha` in the equation above + beta (float, optional): The coefficient :math:`\beta` in the equation above + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window. + """ + ... 
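The hamming_window docstring above makes two concrete claims: the closed form w[n] = alpha - beta * cos(2 * pi * n / (N - 1)), and the identity torch.hamming_window(L, periodic=True) == torch.hamming_window(L + 1, periodic=False)[:-1]. The sketch below is an illustrative check, not part of the generated stub; the window length L = 8 and the reliance on the default coefficients alpha = 0.54, beta = 0.46 are arbitrary choices for the example.

    import math
    import torch

    L = 8
    periodic = torch.hamming_window(L, periodic=True)
    symmetric = torch.hamming_window(L + 1, periodic=False)

    # Periodic window == symmetric window of length L + 1 with its last sample dropped.
    print(torch.allclose(periodic, symmetric[:-1]))  # expected: True

    # Closed form for the symmetric window: N = L + 1, so N - 1 = L.
    n = torch.arange(L + 1, dtype=symmetric.dtype)
    manual = 0.54 - 0.46 * torch.cos(2 * math.pi * n / L)
    print(torch.allclose(symmetric, manual))  # expected: True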
+@overload +def hamming_window(window_length: _int, periodic: _bool, alpha: _float, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + hamming_window(window_length, periodic=True, alpha=0.54, beta=0.46, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Hamming window function. + + .. math:: + w[n] = \alpha - \beta\ \cos \left( \frac{2 \pi n}{N - 1} \right), + + where :math:`N` is the full window size. + + The input :attr:`window_length` is a positive integer controlling the + returned window size. :attr:`periodic` flag determines whether the returned + window trims off the last duplicate value from the symmetric window and is + ready to be used as a periodic window with functions like + :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in + above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have + ``torch.hamming_window(L, periodic=True)`` equal to + ``torch.hamming_window(L + 1, periodic=False)[:-1])``. + + .. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. + + .. note:: + This is a generalized version of :meth:`torch.hann_window`. + + Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. If False, return a symmetric window. + alpha (float, optional): The coefficient :math:`\alpha` in the equation above + beta (float, optional): The coefficient :math:`\beta` in the equation above + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window. + """ + ... +@overload +def hamming_window(window_length: _int, periodic: _bool, alpha: _float, beta: _float, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + hamming_window(window_length, periodic=True, alpha=0.54, beta=0.46, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Hamming window function. + + .. math:: + w[n] = \alpha - \beta\ \cos \left( \frac{2 \pi n}{N - 1} \right), + + where :math:`N` is the full window size. + + The input :attr:`window_length` is a positive integer controlling the + returned window size. :attr:`periodic` flag determines whether the returned + window trims off the last duplicate value from the symmetric window and is + ready to be used as a periodic window with functions like + :meth:`torch.stft`. 
Therefore, if :attr:`periodic` is true, the :math:`N` in + above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have + ``torch.hamming_window(L, periodic=True)`` equal to + ``torch.hamming_window(L + 1, periodic=False)[:-1])``. + + .. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. + + .. note:: + This is a generalized version of :meth:`torch.hann_window`. + + Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. If False, return a symmetric window. + alpha (float, optional): The coefficient :math:`\alpha` in the equation above + beta (float, optional): The coefficient :math:`\beta` in the equation above + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window. + """ + ... +@overload +def hann_window(window_length: _int, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + hann_window(window_length, periodic=True, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Hann window function. + + .. math:: + w[n] = \frac{1}{2}\ \left[1 - \cos \left( \frac{2 \pi n}{N - 1} \right)\right] = + \sin^2 \left( \frac{\pi n}{N - 1} \right), + + where :math:`N` is the full window size. + + The input :attr:`window_length` is a positive integer controlling the + returned window size. :attr:`periodic` flag determines whether the returned + window trims off the last duplicate value from the symmetric window and is + ready to be used as a periodic window with functions like + :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in + above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have + ``torch.hann_window(L, periodic=True)`` equal to + ``torch.hann_window(L + 1, periodic=False)[:-1])``. + + .. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. + + Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. If False, return a symmetric window. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. 
+ device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window + """ + ... +@overload +def hann_window(window_length: _int, periodic: _bool, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + hann_window(window_length, periodic=True, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Hann window function. + + .. math:: + w[n] = \frac{1}{2}\ \left[1 - \cos \left( \frac{2 \pi n}{N - 1} \right)\right] = + \sin^2 \left( \frac{\pi n}{N - 1} \right), + + where :math:`N` is the full window size. + + The input :attr:`window_length` is a positive integer controlling the + returned window size. :attr:`periodic` flag determines whether the returned + window trims off the last duplicate value from the symmetric window and is + ready to be used as a periodic window with functions like + :meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in + above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have + ``torch.hann_window(L, periodic=True)`` equal to + ``torch.hann_window(L + 1, periodic=False)[:-1])``. + + .. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. + + Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. If False, return a symmetric window. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Returns: + Tensor: A 1-D tensor of size :math:`(\text{window\_length},)` containing the window + """ + ... +def hardshrink(input: Tensor, lambd: Union[Number, _complex] = 0.5, *, out: Optional[Tensor] = None) -> Tensor: ... +def heaviside(input: Tensor, values: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + heaviside(input, values, *, out=None) -> Tensor + + Computes the Heaviside step function for each element in :attr:`input`. + The Heaviside step function is defined as: + + .. math:: + \text{{heaviside}}(input, values) = \begin{cases} + 0, & \text{if input < 0}\\ + values, & \text{if input == 0}\\ + 1, & \text{if input > 0} + \end{cases} + + + Args: + input (Tensor): the input tensor. 
+ values (Tensor): The values to use where :attr:`input` is zero. + + Keyword arguments: + out (Tensor, optional): the output tensor. + + Example:: + + >>> input = torch.tensor([-1.5, 0, 2.0]) + >>> values = torch.tensor([0.5]) + >>> torch.heaviside(input, values) + tensor([0.0000, 0.5000, 1.0000]) + >>> values = torch.tensor([1.2, -2.0, 3.5]) + >>> torch.heaviside(input, values) + tensor([0., -2., 1.]) + """ + ... +def hinge_embedding_loss(input: Tensor, target: Tensor, margin: _float = 1.0, reduction: _int = 1) -> Tensor: ... +def histc(input: Tensor, bins: _int = 100, min: Union[Number, _complex] = 0, max: Union[Number, _complex] = 0, *, out: Optional[Tensor] = None) -> Tensor: + r""" + histc(input, bins=100, min=0, max=0, *, out=None) -> Tensor + + Computes the histogram of a tensor. + + The elements are sorted into equal width bins between :attr:`min` and + :attr:`max`. If :attr:`min` and :attr:`max` are both zero, the minimum and + maximum values of the data are used. + + Elements lower than min and higher than max and ``NaN`` elements are ignored. + + Args: + input (Tensor): the input tensor. + bins (int): number of histogram bins + min (Scalar): lower end of the range (inclusive) + max (Scalar): upper end of the range (inclusive) + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + Tensor: Histogram represented as a tensor + + Example:: + + >>> torch.histc(torch.tensor([1., 2, 1]), bins=4, min=0, max=3) + tensor([ 0., 2., 1., 0.]) + """ + ... +@overload +def histogram(input: Tensor, bins: Tensor, *, weight: Optional[Tensor] = None, density: _bool = False, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.histogram: + r""" + histogram(input, bins, *, range=None, weight=None, density=False, out=None) -> (Tensor, Tensor) + + Computes a histogram of the values in a tensor. + + :attr:`bins` can be an integer or a 1D tensor. + + If :attr:`bins` is an int, it specifies the number of equal-width bins. + By default, the lower and upper range of the bins is determined by the + minimum and maximum elements of the input tensor. The :attr:`range` + argument can be provided to specify a range for the bins. + + If :attr:`bins` is a 1D tensor, it specifies the sequence of bin edges + including the rightmost edge. It should contain at least 2 elements + and its elements should be increasing. + + Args: + input (Tensor): the input tensor. + bins: int or 1D Tensor. If int, defines the number of equal-width bins. If tensor, + defines the sequence of bin edges including the rightmost edge. + + Keyword args: + range (tuple of float): Defines the range of the bins. + weight (Tensor): If provided, weight should have the same shape as input. Each value in + input contributes its associated weight towards its bin's result. + density (bool): If False, the result will contain the count (or total weight) in each bin. + If True, the result is the value of the probability density function over the bins, + normalized such that the integral over the range of the bins is 1. + out (Tensor, optional): the output tensor. (tuple, optional): The result tuple of two output tensors (hist, bin_edges). + + Returns: + hist (Tensor): 1D Tensor containing the values of the histogram. + bin_edges(Tensor): 1D Tensor containing the edges of the histogram bins. 
+ + Example:: + + >>> torch.histogram(torch.tensor([1., 2, 1]), bins=4, range=(0., 3.), weight=torch.tensor([1., 2., 4.])) + (tensor([ 0., 5., 2., 0.]), tensor([0., 0.75, 1.5, 2.25, 3.])) + >>> torch.histogram(torch.tensor([1., 2, 1]), bins=4, range=(0., 3.), weight=torch.tensor([1., 2., 4.]), density=True) + (tensor([ 0., 0.9524, 0.3810, 0.]), tensor([0., 0.75, 1.5, 2.25, 3.])) + """ + ... +@overload +def histogram(input: Tensor, bins: _int = 100, *, range: Optional[Sequence[_float]] = None, weight: Optional[Tensor] = None, density: _bool = False, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.histogram: + r""" + histogram(input, bins, *, range=None, weight=None, density=False, out=None) -> (Tensor, Tensor) + + Computes a histogram of the values in a tensor. + + :attr:`bins` can be an integer or a 1D tensor. + + If :attr:`bins` is an int, it specifies the number of equal-width bins. + By default, the lower and upper range of the bins is determined by the + minimum and maximum elements of the input tensor. The :attr:`range` + argument can be provided to specify a range for the bins. + + If :attr:`bins` is a 1D tensor, it specifies the sequence of bin edges + including the rightmost edge. It should contain at least 2 elements + and its elements should be increasing. + + Args: + input (Tensor): the input tensor. + bins: int or 1D Tensor. If int, defines the number of equal-width bins. If tensor, + defines the sequence of bin edges including the rightmost edge. + + Keyword args: + range (tuple of float): Defines the range of the bins. + weight (Tensor): If provided, weight should have the same shape as input. Each value in + input contributes its associated weight towards its bin's result. + density (bool): If False, the result will contain the count (or total weight) in each bin. + If True, the result is the value of the probability density function over the bins, + normalized such that the integral over the range of the bins is 1. + out (Tensor, optional): the output tensor. (tuple, optional): The result tuple of two output tensors (hist, bin_edges). + + Returns: + hist (Tensor): 1D Tensor containing the values of the histogram. + bin_edges(Tensor): 1D Tensor containing the edges of the histogram bins. + + Example:: + + >>> torch.histogram(torch.tensor([1., 2, 1]), bins=4, range=(0., 3.), weight=torch.tensor([1., 2., 4.])) + (tensor([ 0., 5., 2., 0.]), tensor([0., 0.75, 1.5, 2.25, 3.])) + >>> torch.histogram(torch.tensor([1., 2, 1]), bins=4, range=(0., 3.), weight=torch.tensor([1., 2., 4.]), density=True) + (tensor([ 0., 0.9524, 0.3810, 0.]), tensor([0., 0.75, 1.5, 2.25, 3.])) + """ + ... +@overload +def histogramdd(input: Tensor, bins: _int, range: Optional[Sequence[_float]] = None, weight: Optional[Tensor] = None, density: _bool = False) -> torch.return_types.histogramdd: + r""" + histogramdd(input, bins, *, range=None, weight=None, density=False, out=None) -> (Tensor, Tensor[]) + + Computes a multi-dimensional histogram of the values in a tensor. + + Interprets the elements of an input tensor whose innermost dimension has size N + as a collection of N-dimensional points. Maps each of the points into a set of + N-dimensional bins and returns the number of points (or total weight) in each bin. + + :attr:`input` must be a tensor with at least 2 dimensions. + If input has shape (M, N), each of its M rows defines a point in N-dimensional space. + If input has three or more dimensions, all but the last dimension are flattened. 
+ + Each dimension is independently associated with its own strictly increasing sequence + of bin edges. Bin edges may be specified explicitly by passing a sequence of 1D + tensors. Alternatively, bin edges may be constructed automatically by passing a + sequence of integers specifying the number of equal-width bins in each dimension. + + For each N-dimensional point in input: + - Each of its coordinates is binned independently among the bin edges + corresponding to its dimension + - Binning results are combined to identify the N-dimensional bin (if any) + into which the point falls + - If the point falls into a bin, the bin's count (or total weight) is incremented + - Points which do not fall into any bin do not contribute to the output + + :attr:`bins` can be a sequence of N 1D tensors, a sequence of N ints, or a single int. + + If :attr:`bins` is a sequence of N 1D tensors, it explicitly specifies the N sequences + of bin edges. Each 1D tensor should contain a strictly increasing sequence with at + least one element. A sequence of K bin edges defines K-1 bins, explicitly specifying + the left and right edges of all bins. Every bin is exclusive of its left edge. Only + the rightmost bin is inclusive of its right edge. + + If :attr:`bins` is a sequence of N ints, it specifies the number of equal-width bins + in each dimension. By default, the leftmost and rightmost bin edges in each dimension + are determined by the minimum and maximum elements of the input tensor in the + corresponding dimension. The :attr:`range` argument can be provided to manually + specify the leftmost and rightmost bin edges in each dimension. + + If :attr:`bins` is an int, it specifies the number of equal-width bins for all dimensions. + + .. note:: + See also :func:`torch.histogram`, which specifically computes 1D histograms. + While :func:`torch.histogramdd` infers the dimensionality of its bins and + binned values from the shape of :attr:`input`, :func:`torch.histogram` + accepts and flattens :attr:`input` of any shape. + + Args: + input (Tensor): the input tensor. + bins: Tensor[], int[], or int. + If Tensor[], defines the sequences of bin edges. + If int[], defines the number of equal-width bins in each dimension. + If int, defines the number of equal-width bins for all dimensions. + Keyword args: + range (sequence of float): Defines the leftmost and rightmost bin edges + in each dimension. + weight (Tensor): By default, each value in the input has weight 1. If a weight + tensor is passed, each N-dimensional coordinate in input + contributes its associated weight towards its bin's result. + The weight tensor should have the same shape as the :attr:`input` + tensor excluding its innermost dimension N. + density (bool): If False (default), the result will contain the count (or total weight) + in each bin. If True, each count (weight) is divided by the total count + (total weight), then divided by the volume of its associated bin. + Returns: + hist (Tensor): N-dimensional Tensor containing the values of the histogram. + bin_edges(Tensor[]): sequence of N 1D Tensors containing the bin edges. + + Example:: + >>> torch.histogramdd(torch.tensor([[0., 1.], [1., 0.], [2., 0.], [2., 2.]]), bins=[3, 3], + ... 
weight=torch.tensor([1., 2., 4., 8.])) + torch.return_types.histogramdd( + hist=tensor([[0., 1., 0.], + [2., 0., 0.], + [4., 0., 8.]]), + bin_edges=(tensor([0.0000, 0.6667, 1.3333, 2.0000]), + tensor([0.0000, 0.6667, 1.3333, 2.0000]))) + + >>> torch.histogramdd(torch.tensor([[0., 0.], [1., 1.], [2., 2.]]), bins=[2, 2], + ... range=[0., 1., 0., 1.], density=True) + torch.return_types.histogramdd( + hist=tensor([[2., 0.], + [0., 2.]]), + bin_edges=(tensor([0.0000, 0.5000, 1.0000]), + tensor([0.0000, 0.5000, 1.0000]))) + """ + ... +@overload +def histogramdd(input: Tensor, bins: _size, range: Optional[Sequence[_float]] = None, weight: Optional[Tensor] = None, density: _bool = False) -> torch.return_types.histogramdd: + r""" + histogramdd(input, bins, *, range=None, weight=None, density=False, out=None) -> (Tensor, Tensor[]) + + Computes a multi-dimensional histogram of the values in a tensor. + + Interprets the elements of an input tensor whose innermost dimension has size N + as a collection of N-dimensional points. Maps each of the points into a set of + N-dimensional bins and returns the number of points (or total weight) in each bin. + + :attr:`input` must be a tensor with at least 2 dimensions. + If input has shape (M, N), each of its M rows defines a point in N-dimensional space. + If input has three or more dimensions, all but the last dimension are flattened. + + Each dimension is independently associated with its own strictly increasing sequence + of bin edges. Bin edges may be specified explicitly by passing a sequence of 1D + tensors. Alternatively, bin edges may be constructed automatically by passing a + sequence of integers specifying the number of equal-width bins in each dimension. + + For each N-dimensional point in input: + - Each of its coordinates is binned independently among the bin edges + corresponding to its dimension + - Binning results are combined to identify the N-dimensional bin (if any) + into which the point falls + - If the point falls into a bin, the bin's count (or total weight) is incremented + - Points which do not fall into any bin do not contribute to the output + + :attr:`bins` can be a sequence of N 1D tensors, a sequence of N ints, or a single int. + + If :attr:`bins` is a sequence of N 1D tensors, it explicitly specifies the N sequences + of bin edges. Each 1D tensor should contain a strictly increasing sequence with at + least one element. A sequence of K bin edges defines K-1 bins, explicitly specifying + the left and right edges of all bins. Every bin is exclusive of its left edge. Only + the rightmost bin is inclusive of its right edge. + + If :attr:`bins` is a sequence of N ints, it specifies the number of equal-width bins + in each dimension. By default, the leftmost and rightmost bin edges in each dimension + are determined by the minimum and maximum elements of the input tensor in the + corresponding dimension. The :attr:`range` argument can be provided to manually + specify the leftmost and rightmost bin edges in each dimension. + + If :attr:`bins` is an int, it specifies the number of equal-width bins for all dimensions. + + .. note:: + See also :func:`torch.histogram`, which specifically computes 1D histograms. + While :func:`torch.histogramdd` infers the dimensionality of its bins and + binned values from the shape of :attr:`input`, :func:`torch.histogram` + accepts and flattens :attr:`input` of any shape. + + Args: + input (Tensor): the input tensor. + bins: Tensor[], int[], or int. + If Tensor[], defines the sequences of bin edges. 
+ If int[], defines the number of equal-width bins in each dimension. + If int, defines the number of equal-width bins for all dimensions. + Keyword args: + range (sequence of float): Defines the leftmost and rightmost bin edges + in each dimension. + weight (Tensor): By default, each value in the input has weight 1. If a weight + tensor is passed, each N-dimensional coordinate in input + contributes its associated weight towards its bin's result. + The weight tensor should have the same shape as the :attr:`input` + tensor excluding its innermost dimension N. + density (bool): If False (default), the result will contain the count (or total weight) + in each bin. If True, each count (weight) is divided by the total count + (total weight), then divided by the volume of its associated bin. + Returns: + hist (Tensor): N-dimensional Tensor containing the values of the histogram. + bin_edges(Tensor[]): sequence of N 1D Tensors containing the bin edges. + + Example:: + >>> torch.histogramdd(torch.tensor([[0., 1.], [1., 0.], [2., 0.], [2., 2.]]), bins=[3, 3], + ... weight=torch.tensor([1., 2., 4., 8.])) + torch.return_types.histogramdd( + hist=tensor([[0., 1., 0.], + [2., 0., 0.], + [4., 0., 8.]]), + bin_edges=(tensor([0.0000, 0.6667, 1.3333, 2.0000]), + tensor([0.0000, 0.6667, 1.3333, 2.0000]))) + + >>> torch.histogramdd(torch.tensor([[0., 0.], [1., 1.], [2., 2.]]), bins=[2, 2], + ... range=[0., 1., 0., 1.], density=True) + torch.return_types.histogramdd( + hist=tensor([[2., 0.], + [0., 2.]]), + bin_edges=(tensor([0.0000, 0.5000, 1.0000]), + tensor([0.0000, 0.5000, 1.0000]))) + """ + ... +@overload +def histogramdd(input: Tensor, bins: Union[Tuple[Tensor, ...], List[Tensor]], range: Optional[Sequence[_float]] = None, weight: Optional[Tensor] = None, density: _bool = False) -> torch.return_types.histogramdd: + r""" + histogramdd(input, bins, *, range=None, weight=None, density=False, out=None) -> (Tensor, Tensor[]) + + Computes a multi-dimensional histogram of the values in a tensor. + + Interprets the elements of an input tensor whose innermost dimension has size N + as a collection of N-dimensional points. Maps each of the points into a set of + N-dimensional bins and returns the number of points (or total weight) in each bin. + + :attr:`input` must be a tensor with at least 2 dimensions. + If input has shape (M, N), each of its M rows defines a point in N-dimensional space. + If input has three or more dimensions, all but the last dimension are flattened. + + Each dimension is independently associated with its own strictly increasing sequence + of bin edges. Bin edges may be specified explicitly by passing a sequence of 1D + tensors. Alternatively, bin edges may be constructed automatically by passing a + sequence of integers specifying the number of equal-width bins in each dimension. + + For each N-dimensional point in input: + - Each of its coordinates is binned independently among the bin edges + corresponding to its dimension + - Binning results are combined to identify the N-dimensional bin (if any) + into which the point falls + - If the point falls into a bin, the bin's count (or total weight) is incremented + - Points which do not fall into any bin do not contribute to the output + + :attr:`bins` can be a sequence of N 1D tensors, a sequence of N ints, or a single int. + + If :attr:`bins` is a sequence of N 1D tensors, it explicitly specifies the N sequences + of bin edges. Each 1D tensor should contain a strictly increasing sequence with at + least one element. 
A sequence of K bin edges defines K-1 bins, explicitly specifying + the left and right edges of all bins. Every bin is exclusive of its left edge. Only + the rightmost bin is inclusive of its right edge. + + If :attr:`bins` is a sequence of N ints, it specifies the number of equal-width bins + in each dimension. By default, the leftmost and rightmost bin edges in each dimension + are determined by the minimum and maximum elements of the input tensor in the + corresponding dimension. The :attr:`range` argument can be provided to manually + specify the leftmost and rightmost bin edges in each dimension. + + If :attr:`bins` is an int, it specifies the number of equal-width bins for all dimensions. + + .. note:: + See also :func:`torch.histogram`, which specifically computes 1D histograms. + While :func:`torch.histogramdd` infers the dimensionality of its bins and + binned values from the shape of :attr:`input`, :func:`torch.histogram` + accepts and flattens :attr:`input` of any shape. + + Args: + input (Tensor): the input tensor. + bins: Tensor[], int[], or int. + If Tensor[], defines the sequences of bin edges. + If int[], defines the number of equal-width bins in each dimension. + If int, defines the number of equal-width bins for all dimensions. + Keyword args: + range (sequence of float): Defines the leftmost and rightmost bin edges + in each dimension. + weight (Tensor): By default, each value in the input has weight 1. If a weight + tensor is passed, each N-dimensional coordinate in input + contributes its associated weight towards its bin's result. + The weight tensor should have the same shape as the :attr:`input` + tensor excluding its innermost dimension N. + density (bool): If False (default), the result will contain the count (or total weight) + in each bin. If True, each count (weight) is divided by the total count + (total weight), then divided by the volume of its associated bin. + Returns: + hist (Tensor): N-dimensional Tensor containing the values of the histogram. + bin_edges(Tensor[]): sequence of N 1D Tensors containing the bin edges. + + Example:: + >>> torch.histogramdd(torch.tensor([[0., 1.], [1., 0.], [2., 0.], [2., 2.]]), bins=[3, 3], + ... weight=torch.tensor([1., 2., 4., 8.])) + torch.return_types.histogramdd( + hist=tensor([[0., 1., 0.], + [2., 0., 0.], + [4., 0., 8.]]), + bin_edges=(tensor([0.0000, 0.6667, 1.3333, 2.0000]), + tensor([0.0000, 0.6667, 1.3333, 2.0000]))) + + >>> torch.histogramdd(torch.tensor([[0., 0.], [1., 1.], [2., 2.]]), bins=[2, 2], + ... range=[0., 1., 0., 1.], density=True) + torch.return_types.histogramdd( + hist=tensor([[2., 0.], + [0., 2.]]), + bin_edges=(tensor([0.0000, 0.5000, 1.0000]), + tensor([0.0000, 0.5000, 1.0000]))) + """ + ... +def hsmm(input: Tensor, mat2: Tensor) -> Tensor: ... +@overload +def hsplit(input: Tensor, sections: _int) -> Tuple[Tensor, ...]: + r""" + hsplit(input, indices_or_sections) -> List of Tensors + + Splits :attr:`input`, a tensor with one or more dimensions, into multiple tensors + horizontally according to :attr:`indices_or_sections`. Each split is a view of + :attr:`input`. 
+ + If :attr:`input` is one dimensional this is equivalent to calling + torch.tensor_split(input, indices_or_sections, dim=0) (the split dimension is + zero), and if :attr:`input` has two or more dimensions it's equivalent to calling + torch.tensor_split(input, indices_or_sections, dim=1) (the split dimension is 1), + except that if :attr:`indices_or_sections` is an integer it must evenly divide + the split dimension or a runtime error will be thrown. + + This function is based on NumPy's :func:`numpy.hsplit`. + + Args: + input (Tensor): tensor to split. + indices_or_sections (int or list or tuple of ints): See argument in :func:`torch.tensor_split`. + + Example:: + >>> t = torch.arange(16.0).reshape(4,4) + >>> t + tensor([[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.], + [ 8., 9., 10., 11.], + [12., 13., 14., 15.]]) + >>> torch.hsplit(t, 2) + (tensor([[ 0., 1.], + [ 4., 5.], + [ 8., 9.], + [12., 13.]]), + tensor([[ 2., 3.], + [ 6., 7.], + [10., 11.], + [14., 15.]])) + >>> torch.hsplit(t, [3, 6]) + (tensor([[ 0., 1., 2.], + [ 4., 5., 6.], + [ 8., 9., 10.], + [12., 13., 14.]]), + tensor([[ 3.], + [ 7.], + [11.], + [15.]]), + tensor([], size=(4, 0))) + """ + ... +@overload +def hsplit(input: Tensor, indices: _size) -> Tuple[Tensor, ...]: + r""" + hsplit(input, indices_or_sections) -> List of Tensors + + Splits :attr:`input`, a tensor with one or more dimensions, into multiple tensors + horizontally according to :attr:`indices_or_sections`. Each split is a view of + :attr:`input`. + + If :attr:`input` is one dimensional this is equivalent to calling + torch.tensor_split(input, indices_or_sections, dim=0) (the split dimension is + zero), and if :attr:`input` has two or more dimensions it's equivalent to calling + torch.tensor_split(input, indices_or_sections, dim=1) (the split dimension is 1), + except that if :attr:`indices_or_sections` is an integer it must evenly divide + the split dimension or a runtime error will be thrown. + + This function is based on NumPy's :func:`numpy.hsplit`. + + Args: + input (Tensor): tensor to split. + indices_or_sections (int or list or tuple of ints): See argument in :func:`torch.tensor_split`. + + Example:: + >>> t = torch.arange(16.0).reshape(4,4) + >>> t + tensor([[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.], + [ 8., 9., 10., 11.], + [12., 13., 14., 15.]]) + >>> torch.hsplit(t, 2) + (tensor([[ 0., 1.], + [ 4., 5.], + [ 8., 9.], + [12., 13.]]), + tensor([[ 2., 3.], + [ 6., 7.], + [10., 11.], + [14., 15.]])) + >>> torch.hsplit(t, [3, 6]) + (tensor([[ 0., 1., 2.], + [ 4., 5., 6.], + [ 8., 9., 10.], + [12., 13., 14.]]), + tensor([[ 3.], + [ 7.], + [11.], + [15.]]), + tensor([], size=(4, 0))) + """ + ... +def hspmm(mat1: Tensor, mat2: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + hspmm(mat1, mat2, *, out=None) -> Tensor + + Performs a matrix multiplication of a :ref:`sparse COO matrix + ` :attr:`mat1` and a strided matrix :attr:`mat2`. The + result is a (1 + 1)-dimensional :ref:`hybrid COO matrix + `. + + Args: + mat1 (Tensor): the first sparse matrix to be matrix multiplied + mat2 (Tensor): the second strided matrix to be matrix multiplied + + Keyword args: + out (Tensor, optional): the output tensor. + """ + ... +def hstack(tensors: Union[Tuple[Tensor, ...], List[Tensor]], *, out: Optional[Tensor] = None) -> Tensor: + r""" + hstack(tensors, *, out=None) -> Tensor + + Stack tensors in sequence horizontally (column wise). + + This is equivalent to concatenation along the first axis for 1-D tensors, and along the second axis for all other tensors. 
+ + Args: + tensors (sequence of Tensors): sequence of tensors to concatenate + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([1, 2, 3]) + >>> b = torch.tensor([4, 5, 6]) + >>> torch.hstack((a,b)) + tensor([1, 2, 3, 4, 5, 6]) + >>> a = torch.tensor([[1],[2],[3]]) + >>> b = torch.tensor([[4],[5],[6]]) + >>> torch.hstack((a,b)) + tensor([[1, 4], + [2, 5], + [3, 6]]) + """ + ... +def hypot(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + hypot(input, other, *, out=None) -> Tensor + + Given the legs of a right triangle, return its hypotenuse. + + .. math:: + \text{out}_{i} = \sqrt{\text{input}_{i}^{2} + \text{other}_{i}^{2}} + + The shapes of ``input`` and ``other`` must be + :ref:`broadcastable `. + + Args: + input (Tensor): the first input tensor + other (Tensor): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.hypot(torch.tensor([4.0]), torch.tensor([3.0, 4.0, 5.0])) + tensor([5.0000, 5.6569, 6.4031]) + """ + ... +def i0(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + i0(input, *, out=None) -> Tensor + + Alias for :func:`torch.special.i0`. + """ + ... +def i0_(input: Tensor) -> Tensor: ... +def igamma(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + igamma(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.special.gammainc`. + """ + ... +def igammac(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + igammac(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.special.gammaincc`. + """ + ... +def imag(input: Tensor) -> Tensor: + r""" + imag(input) -> Tensor + + Returns a new tensor containing imaginary values of the :attr:`self` tensor. + The returned tensor and :attr:`self` share the same underlying storage. + + .. warning:: + :func:`imag` is only supported for tensors with complex dtypes. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> x=torch.randn(4, dtype=torch.cfloat) + >>> x + tensor([(0.3100+0.3553j), (-0.5445-0.7896j), (-1.6492-0.0633j), (-0.0638-0.8119j)]) + >>> x.imag + tensor([ 0.3553, -0.7896, -0.0633, -0.8119]) + """ + ... +@overload +def index_add(input: Tensor, dim: _int, index: Tensor, source: Tensor, *, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + index_add(input, dim, index, source, *, alpha=1, out=None) -> Tensor + + See :meth:`~Tensor.index_add_` for function description. + """ + ... +@overload +def index_add(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, source: Tensor, *, alpha: Union[Number, _complex] = 1) -> Tensor: + r""" + index_add(input, dim, index, source, *, alpha=1, out=None) -> Tensor + + See :meth:`~Tensor.index_add_` for function description. + """ + ... +@overload +def index_copy(input: Tensor, dim: _int, index: Tensor, source: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + index_copy(input, dim, index, source, *, out=None) -> Tensor + + See :meth:`~Tensor.index_add_` for function description. + """ + ... +@overload +def index_copy(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, source: Tensor) -> Tensor: + r""" + index_copy(input, dim, index, source, *, out=None) -> Tensor + + See :meth:`~Tensor.index_add_` for function description. + """ + ... +@overload +def index_fill(input: Tensor, dim: _int, index: Tensor, value: Tensor) -> Tensor: ... 
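+# A minimal worked sketch of torch.index_add, the out-of-place counterpart of
+# Tensor.index_add_ referenced by the stubs above; it follows the doctest style
+# of the surrounding docstrings, and the tensors below are illustrative values
+# assuming the default CPU float dtype.
+#
+#     >>> x = torch.ones(5, 3)
+#     >>> t = torch.tensor([[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]])
+#     >>> index = torch.tensor([0, 4, 2])
+#     >>> torch.index_add(x, 0, index, t)  # row i of t is added to row index[i] of x
+#     tensor([[ 2.,  3.,  4.],
+#             [ 1.,  1.,  1.],
+#             [ 8.,  9., 10.],
+#             [ 1.,  1.,  1.],
+#             [ 5.,  6.,  7.]])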
+@overload +def index_fill(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, value: Tensor) -> Tensor: ... +@overload +def index_fill(input: Tensor, dim: _int, index: Tensor, value: Union[Number, _complex]) -> Tensor: ... +@overload +def index_fill(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, value: Union[Number, _complex]) -> Tensor: ... +def index_put(input: Tensor, indices: Optional[Union[Tuple[Tensor, ...], List[Tensor]]], values: Tensor, accumulate: _bool = False) -> Tensor: ... +def index_put_(input: Tensor, indices: Optional[Union[Tuple[Tensor, ...], List[Tensor]]], values: Tensor, accumulate: _bool = False) -> Tensor: ... +def index_reduce(input: Tensor, dim: _int, index: Tensor, source: Tensor, reduce: str, *, include_self: _bool = True, out: Optional[Tensor] = None) -> Tensor: + r""" + index_reduce(input, dim, index, source, reduce, *, include_self=True, out=None) -> Tensor + + See :meth:`~Tensor.index_reduce_` for function description. + """ + ... +@overload +def index_select(input: Tensor, dim: _int, index: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + index_select(input, dim, index, *, out=None) -> Tensor + + Returns a new tensor which indexes the :attr:`input` tensor along dimension + :attr:`dim` using the entries in :attr:`index` which is a `LongTensor`. + + The returned tensor has the same number of dimensions as the original tensor + (:attr:`input`). The :attr:`dim`\ th dimension has the same size as the length + of :attr:`index`; other dimensions have the same size as in the original tensor. + + .. note:: The returned tensor does **not** use the same storage as the original + tensor. If :attr:`out` has a different shape than expected, we + silently change it to the correct shape, reallocating the underlying + storage if necessary. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension in which we index + index (IntTensor or LongTensor): the 1-D tensor containing the indices to index + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> x = torch.randn(3, 4) + >>> x + tensor([[ 0.1427, 0.0231, -0.5414, -1.0009], + [-0.4664, 0.2647, -0.1228, -1.1068], + [-1.1734, -0.6571, 0.7230, -0.6004]]) + >>> indices = torch.tensor([0, 2]) + >>> torch.index_select(x, 0, indices) + tensor([[ 0.1427, 0.0231, -0.5414, -1.0009], + [-1.1734, -0.6571, 0.7230, -0.6004]]) + >>> torch.index_select(x, 1, indices) + tensor([[ 0.1427, -0.5414], + [-0.4664, -0.1228], + [-1.1734, 0.7230]]) + """ + ... +@overload +def index_select(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + index_select(input, dim, index, *, out=None) -> Tensor + + Returns a new tensor which indexes the :attr:`input` tensor along dimension + :attr:`dim` using the entries in :attr:`index` which is a `LongTensor`. + + The returned tensor has the same number of dimensions as the original tensor + (:attr:`input`). The :attr:`dim`\ th dimension has the same size as the length + of :attr:`index`; other dimensions have the same size as in the original tensor. + + .. note:: The returned tensor does **not** use the same storage as the original + tensor. If :attr:`out` has a different shape than expected, we + silently change it to the correct shape, reallocating the underlying + storage if necessary. + + Args: + input (Tensor): the input tensor. 
+ dim (int): the dimension in which we index + index (IntTensor or LongTensor): the 1-D tensor containing the indices to index + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> x = torch.randn(3, 4) + >>> x + tensor([[ 0.1427, 0.0231, -0.5414, -1.0009], + [-0.4664, 0.2647, -0.1228, -1.1068], + [-1.1734, -0.6571, 0.7230, -0.6004]]) + >>> indices = torch.tensor([0, 2]) + >>> torch.index_select(x, 0, indices) + tensor([[ 0.1427, 0.0231, -0.5414, -1.0009], + [-1.1734, -0.6571, 0.7230, -0.6004]]) + >>> torch.index_select(x, 1, indices) + tensor([[ 0.1427, -0.5414], + [-0.4664, -0.1228], + [-1.1734, 0.7230]]) + """ + ... +def indices_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.indices`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def init_num_threads() -> None: ... +def inner(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + inner(input, other, *, out=None) -> Tensor + + Computes the dot product for 1D tensors. For higher dimensions, sums the product + of elements from :attr:`input` and :attr:`other` along their last dimension. + + .. note:: + + If either :attr:`input` or :attr:`other` is a scalar, the result is equivalent + to `torch.mul(input, other)`. + + If both :attr:`input` and :attr:`other` are non-scalars, the size of their last + dimension must match and the result is equivalent to `torch.tensordot(input, + other, dims=([-1], [-1]))` + + Args: + input (Tensor): First input tensor + other (Tensor): Second input tensor + + Keyword args: + out (Tensor, optional): Optional output tensor to write result into. The output + shape is `input.shape[:-1] + other.shape[:-1]`. + + Example:: + + # Dot product + >>> torch.inner(torch.tensor([1, 2, 3]), torch.tensor([0, 2, 1])) + tensor(7) + + # Multidimensional input tensors + >>> a = torch.randn(2, 3) + >>> a + tensor([[0.8173, 1.0874, 1.1784], + [0.3279, 0.1234, 2.7894]]) + >>> b = torch.randn(2, 4, 3) + >>> b + tensor([[[-0.4682, -0.7159, 0.1506], + [ 0.4034, -0.3657, 1.0387], + [ 0.9892, -0.6684, 0.1774], + [ 0.9482, 1.3261, 0.3917]], + + [[ 0.4537, 0.7493, 1.1724], + [ 0.2291, 0.5749, -0.2267], + [-0.7920, 0.3607, -0.3701], + [ 1.3666, -0.5850, -1.7242]]]) + >>> torch.inner(a, b) + tensor([[[-0.9837, 1.1560, 0.2907, 2.6785], + [ 2.5671, 0.5452, -0.6912, -1.5509]], + + [[ 0.1782, 2.9843, 0.7366, 1.5672], + [ 3.5115, -0.4864, -1.2476, -4.4337]]]) + + # Scalar input + >>> torch.inner(a, torch.tensor(2)) + tensor([[1.6347, 2.1748, 2.3567], + [0.6558, 0.2469, 5.5787]]) + """ + ... +def instance_norm(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], running_mean: Optional[Tensor], running_var: Optional[Tensor], use_input_stats: _bool, momentum: _float, eps: _float, cudnn_enabled: _bool) -> Tensor: ... +def int_repr(input: Tensor) -> Tensor: ... +def inverse(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + inverse(input, *, out=None) -> Tensor + + Alias for :func:`torch.linalg.inv` + """ + ... +def is_complex(input: Tensor) -> _bool: + r""" + is_complex(input) -> (bool) + + Returns True if the data type of :attr:`input` is a complex data type i.e., + one of ``torch.complex64``, and ``torch.complex128``. + + Args: + input (Tensor): the input tensor. + """ + ... +def is_conj(input: Tensor) -> _bool: + r""" + is_conj(input) -> (bool) + + Returns True if the :attr:`input` is a conjugated tensor, i.e. its conjugate bit is set to `True`. 
+ + Args: + input (Tensor): the input tensor. + """ + ... +def is_distributed(input: Tensor) -> _bool: ... +def is_floating_point(input: Tensor) -> _bool: + r""" + is_floating_point(input) -> (bool) + + Returns True if the data type of :attr:`input` is a floating point data type i.e., + one of ``torch.float64``, ``torch.float32``, ``torch.float16``, and ``torch.bfloat16``. + + Args: + input (Tensor): the input tensor. + """ + ... +def is_grad_enabled() -> _bool: + r""" + is_grad_enabled() -> (bool) + + Returns True if grad mode is currently enabled. + """ + ... +def is_inference(input: Tensor) -> _bool: + r""" + is_inference(input) -> (bool) + + Returns True if :attr:`input` is an inference tensor. + + A non-view tensor is an inference tensor if and only if it was + allocated during inference mode. A view tensor is an inference + tensor if and only if the tensor it is a view of is an inference tensor. + + For details on inference mode please see + `Inference Mode `_. + + Args: + input (Tensor): the input tensor. + """ + ... +def is_inference_mode_enabled() -> _bool: + r""" + is_inference_mode_enabled() -> (bool) + + Returns True if inference mode is currently enabled. + """ + ... +def is_neg(input: Tensor) -> _bool: ... +def is_nonzero(input: Tensor) -> _bool: + r""" + is_nonzero(input) -> (bool) + + Returns True if the :attr:`input` is a single element tensor which is not equal to zero + after type conversions. + i.e. not equal to ``torch.tensor([0.])`` or ``torch.tensor([0])`` or + ``torch.tensor([False])``. + Throws a ``RuntimeError`` if ``torch.numel() != 1`` (even in case + of sparse tensors). + + Args: + input (Tensor): the input tensor. + + Examples:: + + >>> torch.is_nonzero(torch.tensor([0.])) + False + >>> torch.is_nonzero(torch.tensor([1.5])) + True + >>> torch.is_nonzero(torch.tensor([False])) + False + >>> torch.is_nonzero(torch.tensor([3])) + True + >>> torch.is_nonzero(torch.tensor([1, 3, 5])) + Traceback (most recent call last): + ... + RuntimeError: bool value of Tensor with more than one value is ambiguous + >>> torch.is_nonzero(torch.tensor([])) + Traceback (most recent call last): + ... + RuntimeError: bool value of Tensor with no values is ambiguous + """ + ... +def is_same_size(input: Tensor, other: Tensor) -> _bool: ... +def is_signed(input: Tensor) -> _bool: ... +def is_vulkan_available() -> _bool: ... +def isclose(input: Tensor, other: Tensor, rtol: _float = 1e-05, atol: _float = 1e-08, equal_nan: _bool = False) -> Tensor: + r""" + isclose(input, other, rtol=1e-05, atol=1e-08, equal_nan=False) -> Tensor + + Returns a new tensor with boolean elements representing if each element of + :attr:`input` is "close" to the corresponding element of :attr:`other`. + Closeness is defined as: + + .. math:: + \lvert \text{input} - \text{other} \rvert \leq \texttt{atol} + \texttt{rtol} \times \lvert \text{other} \rvert + + + where :attr:`input` and :attr:`other` are finite. Where :attr:`input` + and/or :attr:`other` are nonfinite they are close if and only if + they are equal, with NaNs being considered equal to each other when + :attr:`equal_nan` is True. + + Args: + input (Tensor): first tensor to compare + other (Tensor): second tensor to compare + atol (float, optional): absolute tolerance. Default: 1e-08 + rtol (float, optional): relative tolerance. Default: 1e-05 + equal_nan (bool, optional): if ``True``, then two ``NaN`` s will be considered equal. 
Default: ``False`` + + Examples:: + + >>> torch.isclose(torch.tensor((1., 2, 3)), torch.tensor((1 + 1e-10, 3, 4))) + tensor([ True, False, False]) + >>> torch.isclose(torch.tensor((float('inf'), 4)), torch.tensor((float('inf'), 6)), rtol=.5) + tensor([True, True]) + """ + ... +def isfinite(input: Tensor) -> Tensor: + r""" + isfinite(input) -> Tensor + + Returns a new tensor with boolean elements representing if each element is `finite` or not. + + Real values are finite when they are not NaN, negative infinity, or infinity. + Complex values are finite when both their real and imaginary parts are finite. + + Args: + input (Tensor): the input tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is finite and False elsewhere + + Example:: + + >>> torch.isfinite(torch.tensor([1, float('inf'), 2, float('-inf'), float('nan')])) + tensor([True, False, True, False, False]) + """ + ... +@overload +def isin(elements: Tensor, test_elements: Tensor, *, assume_unique: _bool = False, invert: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + isin(elements, test_elements, *, assume_unique=False, invert=False) -> Tensor + + Tests if each element of :attr:`elements` is in :attr:`test_elements`. Returns + a boolean tensor of the same shape as :attr:`elements` that is True for elements + in :attr:`test_elements` and False otherwise. + + .. note:: + One of :attr:`elements` or :attr:`test_elements` can be a scalar, but not both. + + Args: + elements (Tensor or Scalar): Input elements + test_elements (Tensor or Scalar): Values against which to test for each input element + assume_unique (bool, optional): If True, assumes both :attr:`elements` and + :attr:`test_elements` contain unique elements, which can speed up the + calculation. Default: False + invert (bool, optional): If True, inverts the boolean return tensor, resulting in True + values for elements *not* in :attr:`test_elements`. Default: False + + Returns: + A boolean tensor of the same shape as :attr:`elements` that is True for elements in + :attr:`test_elements` and False otherwise + + Example: + >>> torch.isin(torch.tensor([[1, 2], [3, 4]]), torch.tensor([2, 3])) + tensor([[False, True], + [ True, False]]) + """ + ... +@overload +def isin(element: Union[Number, _complex], test_elements: Tensor, *, assume_unique: _bool = False, invert: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + isin(elements, test_elements, *, assume_unique=False, invert=False) -> Tensor + + Tests if each element of :attr:`elements` is in :attr:`test_elements`. Returns + a boolean tensor of the same shape as :attr:`elements` that is True for elements + in :attr:`test_elements` and False otherwise. + + .. note:: + One of :attr:`elements` or :attr:`test_elements` can be a scalar, but not both. + + Args: + elements (Tensor or Scalar): Input elements + test_elements (Tensor or Scalar): Values against which to test for each input element + assume_unique (bool, optional): If True, assumes both :attr:`elements` and + :attr:`test_elements` contain unique elements, which can speed up the + calculation. Default: False + invert (bool, optional): If True, inverts the boolean return tensor, resulting in True + values for elements *not* in :attr:`test_elements`. 
Default: False + + Returns: + A boolean tensor of the same shape as :attr:`elements` that is True for elements in + :attr:`test_elements` and False otherwise + + Example: + >>> torch.isin(torch.tensor([[1, 2], [3, 4]]), torch.tensor([2, 3])) + tensor([[False, True], + [ True, False]]) + """ + ... +@overload +def isin(elements: Tensor, test_element: Union[Number, _complex], *, assume_unique: _bool = False, invert: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + isin(elements, test_elements, *, assume_unique=False, invert=False) -> Tensor + + Tests if each element of :attr:`elements` is in :attr:`test_elements`. Returns + a boolean tensor of the same shape as :attr:`elements` that is True for elements + in :attr:`test_elements` and False otherwise. + + .. note:: + One of :attr:`elements` or :attr:`test_elements` can be a scalar, but not both. + + Args: + elements (Tensor or Scalar): Input elements + test_elements (Tensor or Scalar): Values against which to test for each input element + assume_unique (bool, optional): If True, assumes both :attr:`elements` and + :attr:`test_elements` contain unique elements, which can speed up the + calculation. Default: False + invert (bool, optional): If True, inverts the boolean return tensor, resulting in True + values for elements *not* in :attr:`test_elements`. Default: False + + Returns: + A boolean tensor of the same shape as :attr:`elements` that is True for elements in + :attr:`test_elements` and False otherwise + + Example: + >>> torch.isin(torch.tensor([[1, 2], [3, 4]]), torch.tensor([2, 3])) + tensor([[False, True], + [ True, False]]) + """ + ... +def isinf(input: Tensor) -> Tensor: + r""" + isinf(input) -> Tensor + + Tests if each element of :attr:`input` is infinite + (positive or negative infinity) or not. + + .. note:: + Complex values are infinite when their real or imaginary part is + infinite. + + Args: + input (Tensor): the input tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is infinite and False elsewhere + + Example:: + + >>> torch.isinf(torch.tensor([1, float('inf'), 2, float('-inf'), float('nan')])) + tensor([False, True, False, True, False]) + """ + ... +def isnan(input: Tensor) -> Tensor: + r""" + isnan(input) -> Tensor + + Returns a new tensor with boolean elements representing if each element of :attr:`input` + is NaN or not. Complex values are considered NaN when either their real + and/or imaginary part is NaN. + + Arguments: + input (Tensor): the input tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is NaN and False elsewhere + + Example:: + + >>> torch.isnan(torch.tensor([1, float('nan'), 2])) + tensor([False, True, False]) + """ + ... +def isneginf(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + isneginf(input, *, out=None) -> Tensor + Tests if each element of :attr:`input` is negative infinity or not. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([-float('inf'), float('inf'), 1.2]) + >>> torch.isneginf(a) + tensor([ True, False, False]) + """ + ... +def isposinf(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + isposinf(input, *, out=None) -> Tensor + Tests if each element of :attr:`input` is positive infinity or not. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. 
+ + Example:: + + >>> a = torch.tensor([-float('inf'), float('inf'), 1.2]) + >>> torch.isposinf(a) + tensor([False, True, False]) + """ + ... +def isreal(input: Tensor) -> Tensor: + r""" + isreal(input) -> Tensor + + Returns a new tensor with boolean elements representing if each element of :attr:`input` is real-valued or not. + All real-valued types are considered real. Complex values are considered real when their imaginary part is 0. + + Arguments: + input (Tensor): the input tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is real and False elsewhere + + Example:: + + >>> torch.isreal(torch.tensor([1, 1+1j, 2+0j])) + tensor([True, False, True]) + """ + ... +def istft(input: Tensor, n_fft: _int, hop_length: Optional[_int] = None, win_length: Optional[_int] = None, window: Optional[Tensor] = None, center: _bool = True, normalized: _bool = False, onesided: Optional[_bool] = None, length: Optional[_int] = None, return_complex: _bool = False) -> Tensor: ... +@overload +def kaiser_window(window_length: _int, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + kaiser_window(window_length, periodic=True, beta=12.0, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Computes the Kaiser window with window length :attr:`window_length` and shape parameter :attr:`beta`. + + Let I_0 be the zeroth order modified Bessel function of the first kind (see :func:`torch.i0`) and + ``N = L - 1`` if :attr:`periodic` is False and ``L`` if :attr:`periodic` is True, + where ``L`` is the :attr:`window_length`. This function computes: + + .. math:: + out_i = I_0 \left( \beta \sqrt{1 - \left( {\frac{i - N/2}{N/2}} \right) ^2 } \right) / I_0( \beta ) + + Calling ``torch.kaiser_window(L, B, periodic=True)`` is equivalent to calling + ``torch.kaiser_window(L + 1, B, periodic=False)[:-1])``. + The :attr:`periodic` argument is intended as a helpful shorthand + to produce a periodic window as input to functions like :func:`torch.stft`. + + .. note:: + If :attr:`window_length` is one, then the returned window is a single element tensor containing a one. + + + Args: + window_length (int): length of the window. + periodic (bool, optional): If True, returns a periodic window suitable for use in spectral analysis. + If False, returns a symmetric window suitable for use in filter design. + beta (float, optional): shape parameter for the window. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + """ + ... 
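+# A small sketch of the periodic/symmetric relationship stated in the
+# kaiser_window docstring above (a periodic window of length L equals the
+# symmetric window of length L + 1 with its last sample dropped); L and beta
+# below are illustrative choices using the default global dtype.
+#
+#     >>> L, beta = 8, 12.0
+#     >>> periodic = torch.kaiser_window(L, True, beta)
+#     >>> symmetric = torch.kaiser_window(L + 1, False, beta)
+#     >>> torch.allclose(periodic, symmetric[:-1])
+#     True
+#     >>> torch.kaiser_window(1)  # a length-one window is defined as a single one
+#     tensor([1.])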
+@overload +def kaiser_window(window_length: _int, periodic: _bool, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + kaiser_window(window_length, periodic=True, beta=12.0, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Computes the Kaiser window with window length :attr:`window_length` and shape parameter :attr:`beta`. + + Let I_0 be the zeroth order modified Bessel function of the first kind (see :func:`torch.i0`) and + ``N = L - 1`` if :attr:`periodic` is False and ``L`` if :attr:`periodic` is True, + where ``L`` is the :attr:`window_length`. This function computes: + + .. math:: + out_i = I_0 \left( \beta \sqrt{1 - \left( {\frac{i - N/2}{N/2}} \right) ^2 } \right) / I_0( \beta ) + + Calling ``torch.kaiser_window(L, B, periodic=True)`` is equivalent to calling + ``torch.kaiser_window(L + 1, B, periodic=False)[:-1])``. + The :attr:`periodic` argument is intended as a helpful shorthand + to produce a periodic window as input to functions like :func:`torch.stft`. + + .. note:: + If :attr:`window_length` is one, then the returned window is a single element tensor containing a one. + + + Args: + window_length (int): length of the window. + periodic (bool, optional): If True, returns a periodic window suitable for use in spectral analysis. + If False, returns a symmetric window suitable for use in filter design. + beta (float, optional): shape parameter for the window. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + """ + ... +@overload +def kaiser_window(window_length: _int, periodic: _bool, beta: _float, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + kaiser_window(window_length, periodic=True, beta=12.0, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Computes the Kaiser window with window length :attr:`window_length` and shape parameter :attr:`beta`. + + Let I_0 be the zeroth order modified Bessel function of the first kind (see :func:`torch.i0`) and + ``N = L - 1`` if :attr:`periodic` is False and ``L`` if :attr:`periodic` is True, + where ``L`` is the :attr:`window_length`. This function computes: + + .. math:: + out_i = I_0 \left( \beta \sqrt{1 - \left( {\frac{i - N/2}{N/2}} \right) ^2 } \right) / I_0( \beta ) + + Calling ``torch.kaiser_window(L, B, periodic=True)`` is equivalent to calling + ``torch.kaiser_window(L + 1, B, periodic=False)[:-1])``. 
+ The :attr:`periodic` argument is intended as a helpful shorthand + to produce a periodic window as input to functions like :func:`torch.stft`. + + .. note:: + If :attr:`window_length` is one, then the returned window is a single element tensor containing a one. + + + Args: + window_length (int): length of the window. + periodic (bool, optional): If True, returns a periodic window suitable for use in spectral analysis. + If False, returns a symmetric window suitable for use in filter design. + beta (float, optional): shape parameter for the window. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + """ + ... +def kl_div(input: Tensor, target: Tensor, reduction: _int = 1, *, log_target: _bool = False) -> Tensor: ... +def kron(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + kron(input, other, *, out=None) -> Tensor + + Computes the Kronecker product, denoted by :math:`\otimes`, of :attr:`input` and :attr:`other`. + + If :attr:`input` is a :math:`(a_0 \times a_1 \times \dots \times a_n)` tensor and :attr:`other` is a + :math:`(b_0 \times b_1 \times \dots \times b_n)` tensor, the result will be a + :math:`(a_0*b_0 \times a_1*b_1 \times \dots \times a_n*b_n)` tensor with the following entries: + + .. math:: + (\text{input} \otimes \text{other})_{k_0, k_1, \dots, k_n} = + \text{input}_{i_0, i_1, \dots, i_n} * \text{other}_{j_0, j_1, \dots, j_n}, + + where :math:`k_t = i_t * b_t + j_t` for :math:`0 \leq t \leq n`. + If one tensor has fewer dimensions than the other it is unsqueezed until it has the same number of dimensions. + + Supports real-valued and complex-valued inputs. + + .. note:: + This function generalizes the typical definition of the Kronecker product for two matrices to two tensors, + as described above. When :attr:`input` is a :math:`(m \times n)` matrix and :attr:`other` is a + :math:`(p \times q)` matrix, the result will be a :math:`(p*m \times q*n)` block matrix: + + .. math:: + \mathbf{A} \otimes \mathbf{B}=\begin{bmatrix} + a_{11} \mathbf{B} & \cdots & a_{1 n} \mathbf{B} \\ + \vdots & \ddots & \vdots \\ + a_{m 1} \mathbf{B} & \cdots & a_{m n} \mathbf{B} \end{bmatrix} + + where :attr:`input` is :math:`\mathbf{A}` and :attr:`other` is :math:`\mathbf{B}`. + + Arguments: + input (Tensor) + other (Tensor) + + Keyword args: + out (Tensor, optional): The output tensor. Ignored if ``None``. Default: ``None`` + + Examples:: + + >>> mat1 = torch.eye(2) + >>> mat2 = torch.ones(2, 2) + >>> torch.kron(mat1, mat2) + tensor([[1., 1., 0., 0.], + [1., 1., 0., 0.], + [0., 0., 1., 1.], + [0., 0., 1., 1.]]) + + >>> mat1 = torch.eye(2) + >>> mat2 = torch.arange(1, 5).reshape(2, 2) + >>> torch.kron(mat1, mat2) + tensor([[1., 2., 0., 0.], + [3., 4., 0., 0.], + [0., 0., 1., 2.], + [0., 0., 3., 4.]]) + """ + ... 
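+# A quick shape and block-structure check for torch.kron, following the
+# Kronecker-product formula in its docstring above; A and B are illustrative
+# random matrices (any shapes would do).
+#
+#     >>> A = torch.randn(2, 3)
+#     >>> B = torch.randn(4, 5)
+#     >>> torch.kron(A, B).shape  # (2*4, 3*5)
+#     torch.Size([8, 15])
+#     >>> torch.allclose(torch.kron(A, B)[:4, :5], A[0, 0] * B)  # top-left block is A[0, 0] * B
+#     True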
+@overload +def kthvalue(input: Tensor, k: _int, dim: _int = -1, keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.kthvalue: + r""" + kthvalue(input, k, dim=None, keepdim=False, *, out=None) -> (Tensor, LongTensor) + + Returns a namedtuple ``(values, indices)`` where ``values`` is the :attr:`k` th + smallest element of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`. And ``indices`` is the index location of each element found. + + If :attr:`dim` is not given, the last dimension of the `input` is chosen. + + If :attr:`keepdim` is ``True``, both the :attr:`values` and :attr:`indices` tensors + are the same size as :attr:`input`, except in the dimension :attr:`dim` where + they are of size 1. Otherwise, :attr:`dim` is squeezed + (see :func:`torch.squeeze`), resulting in both the :attr:`values` and + :attr:`indices` tensors having 1 fewer dimension than the :attr:`input` tensor. + + .. note:: + When :attr:`input` is a CUDA tensor and there are multiple valid + :attr:`k` th values, this function may nondeterministically return + :attr:`indices` for any of them. + + Args: + input (Tensor): the input tensor. + k (int): k for the k-th smallest element + dim (int, optional): the dimension to find the kth value along + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (tuple, optional): the output tuple of (Tensor, LongTensor) + can be optionally given to be used as output buffers + + Example:: + + >>> x = torch.arange(1., 6.) + >>> x + tensor([ 1., 2., 3., 4., 5.]) + >>> torch.kthvalue(x, 4) + torch.return_types.kthvalue(values=tensor(4.), indices=tensor(3)) + + >>> x=torch.arange(1.,7.).resize_(2,3) + >>> x + tensor([[ 1., 2., 3.], + [ 4., 5., 6.]]) + >>> torch.kthvalue(x, 2, 0, True) + torch.return_types.kthvalue(values=tensor([[4., 5., 6.]]), indices=tensor([[1, 1, 1]])) + """ + ... +@overload +def kthvalue(input: Tensor, k: _int, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.kthvalue: + r""" + kthvalue(input, k, dim=None, keepdim=False, *, out=None) -> (Tensor, LongTensor) + + Returns a namedtuple ``(values, indices)`` where ``values`` is the :attr:`k` th + smallest element of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`. And ``indices`` is the index location of each element found. + + If :attr:`dim` is not given, the last dimension of the `input` is chosen. + + If :attr:`keepdim` is ``True``, both the :attr:`values` and :attr:`indices` tensors + are the same size as :attr:`input`, except in the dimension :attr:`dim` where + they are of size 1. Otherwise, :attr:`dim` is squeezed + (see :func:`torch.squeeze`), resulting in both the :attr:`values` and + :attr:`indices` tensors having 1 fewer dimension than the :attr:`input` tensor. + + .. note:: + When :attr:`input` is a CUDA tensor and there are multiple valid + :attr:`k` th values, this function may nondeterministically return + :attr:`indices` for any of them. + + Args: + input (Tensor): the input tensor. + k (int): k for the k-th smallest element + dim (int, optional): the dimension to find the kth value along + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (tuple, optional): the output tuple of (Tensor, LongTensor) + can be optionally given to be used as output buffers + + Example:: + + >>> x = torch.arange(1., 6.) 
+ >>> x + tensor([ 1., 2., 3., 4., 5.]) + >>> torch.kthvalue(x, 4) + torch.return_types.kthvalue(values=tensor(4.), indices=tensor(3)) + + >>> x=torch.arange(1.,7.).resize_(2,3) + >>> x + tensor([[ 1., 2., 3.], + [ 4., 5., 6.]]) + >>> torch.kthvalue(x, 2, 0, True) + torch.return_types.kthvalue(values=tensor([[4., 5., 6.]]), indices=tensor([[1, 1, 1]])) + """ + ... +def layer_norm(input: Tensor, normalized_shape: Sequence[Union[_int, SymInt]], weight: Optional[Tensor] = None, bias: Optional[Tensor] = None, eps: _float = 1e-05, cudnn_enable: _bool = True) -> Tensor: ... +def lcm(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + lcm(input, other, *, out=None) -> Tensor + + Computes the element-wise least common multiple (LCM) of :attr:`input` and :attr:`other`. + + Both :attr:`input` and :attr:`other` must have integer types. + + .. note:: + This defines :math:`lcm(0, 0) = 0` and :math:`lcm(0, a) = 0`. + + Args: + input (Tensor): the input tensor. + other (Tensor): the second input tensor + + Keyword arguments: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([5, 10, 15]) + >>> b = torch.tensor([3, 4, 5]) + >>> torch.lcm(a, b) + tensor([15, 20, 15]) + >>> c = torch.tensor([3]) + >>> torch.lcm(a, c) + tensor([15, 30, 15]) + """ + ... +def lcm_(input: Tensor, other: Tensor) -> Tensor: ... +def ldexp(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + ldexp(input, other, *, out=None) -> Tensor + + Multiplies :attr:`input` by 2 ** :attr:`other`. + + .. math:: + \text{{out}}_i = \text{{input}}_i * 2^\text{{other}}_i + + + Typically this function is used to construct floating point numbers by multiplying + mantissas in :attr:`input` with integral powers of two created from the exponents + in :attr:`other`. + + Args: + input (Tensor): the input tensor. + other (Tensor): a tensor of exponents, typically integers. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.ldexp(torch.tensor([1.]), torch.tensor([1])) + tensor([2.]) + >>> torch.ldexp(torch.tensor([1.0]), torch.tensor([1, 2, 3, 4])) + tensor([ 2., 4., 8., 16.]) + """ + ... +def ldexp_(input: Tensor, other: Tensor) -> Tensor: ... +@overload +def le(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + le(input, other, *, out=None) -> Tensor + + Computes :math:`\text{input} \leq \text{other}` element-wise. + + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or Scalar): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is less than or equal to + :attr:`other` and False elsewhere + + Example:: + + >>> torch.le(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[True, False], [True, True]]) + """ + ... +@overload +def le(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + le(input, other, *, out=None) -> Tensor + + Computes :math:`\text{input} \leq \text{other}` element-wise. + + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or Scalar): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. 
+ + Returns: + A boolean tensor that is True where :attr:`input` is less than or equal to + :attr:`other` and False elsewhere + + Example:: + + >>> torch.le(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[True, False], [True, True]]) + """ + ... +@overload +def lerp(input: Tensor, end: Tensor, weight: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + lerp(input, end, weight, *, out=None) + + Does a linear interpolation of two tensors :attr:`start` (given by :attr:`input`) and :attr:`end` based + on a scalar or tensor :attr:`weight` and returns the resulting :attr:`out` tensor. + + .. math:: + \text{out}_i = \text{start}_i + \text{weight}_i \times (\text{end}_i - \text{start}_i) + + The shapes of :attr:`start` and :attr:`end` must be + :ref:`broadcastable `. If :attr:`weight` is a tensor, then + the shapes of :attr:`weight`, :attr:`start`, and :attr:`end` must be :ref:`broadcastable `. + + Args: + input (Tensor): the tensor with the starting points + end (Tensor): the tensor with the ending points + weight (float or tensor): the weight for the interpolation formula + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> start = torch.arange(1., 5.) + >>> end = torch.empty(4).fill_(10) + >>> start + tensor([ 1., 2., 3., 4.]) + >>> end + tensor([ 10., 10., 10., 10.]) + >>> torch.lerp(start, end, 0.5) + tensor([ 5.5000, 6.0000, 6.5000, 7.0000]) + >>> torch.lerp(start, end, torch.full_like(start, 0.5)) + tensor([ 5.5000, 6.0000, 6.5000, 7.0000]) + """ + ... +@overload +def lerp(input: Tensor, end: Tensor, weight: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + lerp(input, end, weight, *, out=None) + + Does a linear interpolation of two tensors :attr:`start` (given by :attr:`input`) and :attr:`end` based + on a scalar or tensor :attr:`weight` and returns the resulting :attr:`out` tensor. + + .. math:: + \text{out}_i = \text{start}_i + \text{weight}_i \times (\text{end}_i - \text{start}_i) + + The shapes of :attr:`start` and :attr:`end` must be + :ref:`broadcastable `. If :attr:`weight` is a tensor, then + the shapes of :attr:`weight`, :attr:`start`, and :attr:`end` must be :ref:`broadcastable `. + + Args: + input (Tensor): the tensor with the starting points + end (Tensor): the tensor with the ending points + weight (float or tensor): the weight for the interpolation formula + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> start = torch.arange(1., 5.) + >>> end = torch.empty(4).fill_(10) + >>> start + tensor([ 1., 2., 3., 4.]) + >>> end + tensor([ 10., 10., 10., 10.]) + >>> torch.lerp(start, end, 0.5) + tensor([ 5.5000, 6.0000, 6.5000, 7.0000]) + >>> torch.lerp(start, end, torch.full_like(start, 0.5)) + tensor([ 5.5000, 6.0000, 6.5000, 7.0000]) + """ + ... +@overload +def less(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + less(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.lt`. + """ + ... +@overload +def less(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + less(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.lt`. + """ + ... +@overload +def less_equal(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + less_equal(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.le`. + """ + ... 
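+# A short numeric check of the lerp formula documented above,
+# out_i = start_i + weight * (end_i - start_i), using illustrative values and a
+# scalar weight.
+#
+#     >>> start = torch.arange(1., 5.)
+#     >>> end = torch.full((4,), 10.)
+#     >>> w = 0.25
+#     >>> torch.allclose(torch.lerp(start, end, w), start + w * (end - start))
+#     True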
+@overload +def less_equal(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + less_equal(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.le`. + """ + ... +def lgamma(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + lgamma(input, *, out=None) -> Tensor + + Computes the natural logarithm of the absolute value of the gamma function on :attr:`input`. + + .. math:: + \text{out}_{i} = \ln |\Gamma(\text{input}_{i})| + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.arange(0.5, 2, 0.5) + >>> torch.lgamma(a) + tensor([ 0.5724, 0.0000, -0.1208]) + """ + ... +@overload +def linspace(start: Number, end: Number, steps: Optional[_int] = None, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + linspace(start, end, steps, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly + spaced from :attr:`start` to :attr:`end`, inclusive. That is, the value are: + + .. math:: + (\text{start}, + \text{start} + \frac{\text{end} - \text{start}}{\text{steps} - 1}, + \ldots, + \text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{\text{steps} - 1}, + \text{end}) + + + From PyTorch 1.11 linspace requires the steps argument. Use steps=100 to restore the previous behavior. + + Args: + start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional + end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional + steps (int): size of the constructed tensor + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (torch.dtype, optional): the data type to perform the computation in. + Default: if None, uses the global default dtype (see torch.get_default_dtype()) + when both :attr:`start` and :attr:`end` are real, + and corresponding complex dtype when either is complex. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + + Example:: + + >>> torch.linspace(3, 10, steps=5) + tensor([ 3.0000, 4.7500, 6.5000, 8.2500, 10.0000]) + >>> torch.linspace(-10, 10, steps=5) + tensor([-10., -5., 0., 5., 10.]) + >>> torch.linspace(start=-10, end=10, steps=5) + tensor([-10., -5., 0., 5., 10.]) + >>> torch.linspace(start=-10, end=10, steps=1) + tensor([-10.]) + """ + ... 
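+# A sketch relating torch.linspace to the closed-form spacing given in its
+# docstring above; the start/end/steps values are illustrative.
+#
+#     >>> steps = 5
+#     >>> x = torch.linspace(3, 10, steps)
+#     >>> step = (10 - 3) / (steps - 1)
+#     >>> torch.allclose(x, 3 + step * torch.arange(steps, dtype=x.dtype))
+#     True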
+@overload +def linspace(start: Tensor, end: Tensor, steps: _int, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + linspace(start, end, steps, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly + spaced from :attr:`start` to :attr:`end`, inclusive. That is, the value are: + + .. math:: + (\text{start}, + \text{start} + \frac{\text{end} - \text{start}}{\text{steps} - 1}, + \ldots, + \text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{\text{steps} - 1}, + \text{end}) + + + From PyTorch 1.11 linspace requires the steps argument. Use steps=100 to restore the previous behavior. + + Args: + start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional + end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional + steps (int): size of the constructed tensor + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (torch.dtype, optional): the data type to perform the computation in. + Default: if None, uses the global default dtype (see torch.get_default_dtype()) + when both :attr:`start` and :attr:`end` are real, + and corresponding complex dtype when either is complex. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + + Example:: + + >>> torch.linspace(3, 10, steps=5) + tensor([ 3.0000, 4.7500, 6.5000, 8.2500, 10.0000]) + >>> torch.linspace(-10, 10, steps=5) + tensor([-10., -5., 0., 5., 10.]) + >>> torch.linspace(start=-10, end=10, steps=5) + tensor([-10., -5., 0., 5., 10.]) + >>> torch.linspace(start=-10, end=10, steps=1) + tensor([-10.]) + """ + ... +@overload +def linspace(start: Union[Number, _complex], end: Tensor, steps: _int, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + linspace(start, end, steps, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly + spaced from :attr:`start` to :attr:`end`, inclusive. That is, the value are: + + .. math:: + (\text{start}, + \text{start} + \frac{\text{end} - \text{start}}{\text{steps} - 1}, + \ldots, + \text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{\text{steps} - 1}, + \text{end}) + + + From PyTorch 1.11 linspace requires the steps argument. Use steps=100 to restore the previous behavior. + + Args: + start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional + end (float or Tensor): the ending value for the set of points. 
If `Tensor`, it must be 0-dimensional + steps (int): size of the constructed tensor + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (torch.dtype, optional): the data type to perform the computation in. + Default: if None, uses the global default dtype (see torch.get_default_dtype()) + when both :attr:`start` and :attr:`end` are real, + and corresponding complex dtype when either is complex. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + + Example:: + + >>> torch.linspace(3, 10, steps=5) + tensor([ 3.0000, 4.7500, 6.5000, 8.2500, 10.0000]) + >>> torch.linspace(-10, 10, steps=5) + tensor([-10., -5., 0., 5., 10.]) + >>> torch.linspace(start=-10, end=10, steps=5) + tensor([-10., -5., 0., 5., 10.]) + >>> torch.linspace(start=-10, end=10, steps=1) + tensor([-10.]) + """ + ... +@overload +def linspace(start: Tensor, end: Union[Number, _complex], steps: _int, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + linspace(start, end, steps, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly + spaced from :attr:`start` to :attr:`end`, inclusive. That is, the value are: + + .. math:: + (\text{start}, + \text{start} + \frac{\text{end} - \text{start}}{\text{steps} - 1}, + \ldots, + \text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{\text{steps} - 1}, + \text{end}) + + + From PyTorch 1.11 linspace requires the steps argument. Use steps=100 to restore the previous behavior. + + Args: + start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional + end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional + steps (int): size of the constructed tensor + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (torch.dtype, optional): the data type to perform the computation in. + Default: if None, uses the global default dtype (see torch.get_default_dtype()) + when both :attr:`start` and :attr:`end` are real, + and corresponding complex dtype when either is complex. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. 
+ + + Example:: + + >>> torch.linspace(3, 10, steps=5) + tensor([ 3.0000, 4.7500, 6.5000, 8.2500, 10.0000]) + >>> torch.linspace(-10, 10, steps=5) + tensor([-10., -5., 0., 5., 10.]) + >>> torch.linspace(start=-10, end=10, steps=5) + tensor([-10., -5., 0., 5., 10.]) + >>> torch.linspace(start=-10, end=10, steps=1) + tensor([-10.]) + """ + ... +@overload +def linspace(start: Union[Number, _complex], end: Union[Number, _complex], steps: _int, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + linspace(start, end, steps, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly + spaced from :attr:`start` to :attr:`end`, inclusive. That is, the value are: + + .. math:: + (\text{start}, + \text{start} + \frac{\text{end} - \text{start}}{\text{steps} - 1}, + \ldots, + \text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{\text{steps} - 1}, + \text{end}) + + + From PyTorch 1.11 linspace requires the steps argument. Use steps=100 to restore the previous behavior. + + Args: + start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional + end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional + steps (int): size of the constructed tensor + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (torch.dtype, optional): the data type to perform the computation in. + Default: if None, uses the global default dtype (see torch.get_default_dtype()) + when both :attr:`start` and :attr:`end` are real, + and corresponding complex dtype when either is complex. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + + Example:: + + >>> torch.linspace(3, 10, steps=5) + tensor([ 3.0000, 4.7500, 6.5000, 8.2500, 10.0000]) + >>> torch.linspace(-10, 10, steps=5) + tensor([-10., -5., 0., 5., 10.]) + >>> torch.linspace(start=-10, end=10, steps=5) + tensor([-10., -5., 0., 5., 10.]) + >>> torch.linspace(start=-10, end=10, steps=1) + tensor([-10.]) + """ + ... +def log(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + log(input, *, out=None) -> Tensor + + Returns a new tensor with the natural logarithm of the elements + of :attr:`input`. + + .. math:: + y_{i} = \log_{e} (x_{i}) + + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.rand(5) * 5 + >>> a + tensor([4.7767, 4.3234, 1.2156, 0.2411, 4.5739]) + >>> torch.log(a) + tensor([ 1.5637, 1.4640, 0.1952, -1.4226, 1.5204]) + """ + ... +def log10(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + log10(input, *, out=None) -> Tensor + + Returns a new tensor with the logarithm to the base 10 of the elements + of :attr:`input`. + + .. 
math:: + y_{i} = \log_{10} (x_{i}) + + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.rand(5) + >>> a + tensor([ 0.5224, 0.9354, 0.7257, 0.1301, 0.2251]) + + + >>> torch.log10(a) + tensor([-0.2820, -0.0290, -0.1392, -0.8857, -0.6476]) + """ + ... +def log10_(input: Tensor) -> Tensor: ... +def log1p(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + log1p(input, *, out=None) -> Tensor + + Returns a new tensor with the natural logarithm of (1 + :attr:`input`). + + .. math:: + y_i = \log_{e} (x_i + 1) + + .. note:: This function is more accurate than :func:`torch.log` for small + values of :attr:`input` + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(5) + >>> a + tensor([-1.0090, -0.9923, 1.0249, -0.5372, 0.2492]) + >>> torch.log1p(a) + tensor([ nan, -4.8653, 0.7055, -0.7705, 0.2225]) + """ + ... +def log1p_(input: Tensor) -> Tensor: ... +def log2(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + log2(input, *, out=None) -> Tensor + + Returns a new tensor with the logarithm to the base 2 of the elements + of :attr:`input`. + + .. math:: + y_{i} = \log_{2} (x_{i}) + + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.rand(5) + >>> a + tensor([ 0.8419, 0.8003, 0.9971, 0.5287, 0.0490]) + + + >>> torch.log2(a) + tensor([-0.2483, -0.3213, -0.0042, -0.9196, -4.3504]) + """ + ... +def log2_(input: Tensor) -> Tensor: ... +def log_(input: Tensor) -> Tensor: ... +@overload +def log_softmax(input: Tensor, dim: _int, dtype: Optional[_dtype] = None, *, out: Optional[Tensor] = None) -> Tensor: ... +@overload +def log_softmax(input: Tensor, dim: Union[str, ellipsis, None], *, dtype: Optional[_dtype] = None) -> Tensor: ... +def logaddexp(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + logaddexp(input, other, *, out=None) -> Tensor + + Logarithm of the sum of exponentiations of the inputs. + + Calculates pointwise :math:`\log\left(e^x + e^y\right)`. This function is useful + in statistics where the calculated probabilities of events may be so small as to + exceed the range of normal floating point numbers. In such cases the logarithm + of the calculated probability is stored. This function allows adding + probabilities stored in such a fashion. + + This op should be disambiguated with :func:`torch.logsumexp` which performs a + reduction on a single tensor. + + Args: + input (Tensor): the input tensor. + other (Tensor): the second input tensor + + Keyword arguments: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.logaddexp(torch.tensor([-1.0]), torch.tensor([-1.0, -2, -3])) + tensor([-0.3069, -0.6867, -0.8731]) + >>> torch.logaddexp(torch.tensor([-100.0, -200, -300]), torch.tensor([-1.0, -2, -3])) + tensor([-1., -2., -3.]) + >>> torch.logaddexp(torch.tensor([1.0, 2000, 30000]), torch.tensor([-1.0, -2, -3])) + tensor([1.1269e+00, 2.0000e+03, 3.0000e+04]) + """ + ... +def logaddexp2(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + logaddexp2(input, other, *, out=None) -> Tensor + + Logarithm of the sum of exponentiations of the inputs in base-2. + + Calculates pointwise :math:`\log_2\left(2^x + 2^y\right)`. See + :func:`torch.logaddexp` for more details. + + Args: + input (Tensor): the input tensor. 
+ other (Tensor): the second input tensor + + Keyword arguments: + out (Tensor, optional): the output tensor. + """ + ... +@overload +def logcumsumexp(input: Tensor, dim: _int, *, out: Optional[Tensor] = None) -> Tensor: + r""" + logcumsumexp(input, dim, *, out=None) -> Tensor + Returns the logarithm of the cumulative summation of the exponentiation of + elements of :attr:`input` in the dimension :attr:`dim`. + + For summation index :math:`j` given by `dim` and other indices :math:`i`, the result is + + .. math:: + \text{logcumsumexp}(x)_{ij} = \log \sum\limits_{j=0}^{i} \exp(x_{ij}) + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to do the operation over + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(10) + >>> torch.logcumsumexp(a, dim=0) + tensor([-0.42296738, -0.04462666, 0.86278635, 0.94622083, 1.05277811, + 1.39202815, 1.83525007, 1.84492621, 2.06084887, 2.06844475])) + """ + ... +@overload +def logcumsumexp(input: Tensor, dim: Union[str, ellipsis, None], *, out: Optional[Tensor] = None) -> Tensor: + r""" + logcumsumexp(input, dim, *, out=None) -> Tensor + Returns the logarithm of the cumulative summation of the exponentiation of + elements of :attr:`input` in the dimension :attr:`dim`. + + For summation index :math:`j` given by `dim` and other indices :math:`i`, the result is + + .. math:: + \text{logcumsumexp}(x)_{ij} = \log \sum\limits_{j=0}^{i} \exp(x_{ij}) + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to do the operation over + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(10) + >>> torch.logcumsumexp(a, dim=0) + tensor([-0.42296738, -0.04462666, 0.86278635, 0.94622083, 1.05277811, + 1.39202815, 1.83525007, 1.84492621, 2.06084887, 2.06844475])) + """ + ... +def logdet(input: Tensor) -> Tensor: + r""" + logdet(input) -> Tensor + + Calculates log determinant of a square matrix or batches of square matrices. + + It returns ``-inf`` if the input has a determinant of zero, and ``NaN`` if it has + a negative determinant. + + .. note:: + Backward through :meth:`logdet` internally uses SVD results when :attr:`input` + is not invertible. In this case, double backward through :meth:`logdet` will + be unstable in when :attr:`input` doesn't have distinct singular values. See + :func:`torch.linalg.svd` for details. + + .. seealso:: + + :func:`torch.linalg.slogdet` computes the sign (resp. angle) and natural logarithm of the + absolute value of the determinant of real-valued (resp. complex) square matrices. + + Arguments: + input (Tensor): the input tensor of size ``(*, n, n)`` where ``*`` is zero or more + batch dimensions. + + Example:: + + >>> A = torch.randn(3, 3) + >>> torch.det(A) + tensor(0.2611) + >>> torch.logdet(A) + tensor(-1.3430) + >>> A + tensor([[[ 0.9254, -0.6213], + [-0.5787, 1.6843]], + + [[ 0.3242, -0.9665], + [ 0.4539, -0.0887]], + + [[ 1.1336, -0.4025], + [-0.7089, 0.9032]]]) + >>> A.det() + tensor([1.1990, 0.4099, 0.7386]) + >>> A.det().log() + tensor([ 0.1815, -0.8917, -0.3031]) + """ + ... +def logical_and(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + logical_and(input, other, *, out=None) -> Tensor + + Computes the element-wise logical AND of the given input tensors. Zeros are treated as ``False`` and nonzeros are + treated as ``True``. + + Args: + input (Tensor): the input tensor. 
+ other (Tensor): the tensor to compute AND with + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.logical_and(torch.tensor([True, False, True]), torch.tensor([True, False, False])) + tensor([ True, False, False]) + >>> a = torch.tensor([0, 1, 10, 0], dtype=torch.int8) + >>> b = torch.tensor([4, 0, 1, 0], dtype=torch.int8) + >>> torch.logical_and(a, b) + tensor([False, False, True, False]) + >>> torch.logical_and(a.double(), b.double()) + tensor([False, False, True, False]) + >>> torch.logical_and(a.double(), b) + tensor([False, False, True, False]) + >>> torch.logical_and(a, b, out=torch.empty(4, dtype=torch.bool)) + tensor([False, False, True, False]) + """ + ... +def logical_not(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + logical_not(input, *, out=None) -> Tensor + + Computes the element-wise logical NOT of the given input tensor. If not specified, the output tensor will have the bool + dtype. If the input tensor is not a bool tensor, zeros are treated as ``False`` and non-zeros are treated as ``True``. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.logical_not(torch.tensor([True, False])) + tensor([False, True]) + >>> torch.logical_not(torch.tensor([0, 1, -10], dtype=torch.int8)) + tensor([ True, False, False]) + >>> torch.logical_not(torch.tensor([0., 1.5, -10.], dtype=torch.double)) + tensor([ True, False, False]) + >>> torch.logical_not(torch.tensor([0., 1., -10.], dtype=torch.double), out=torch.empty(3, dtype=torch.int16)) + tensor([1, 0, 0], dtype=torch.int16) + """ + ... +def logical_or(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + logical_or(input, other, *, out=None) -> Tensor + + Computes the element-wise logical OR of the given input tensors. Zeros are treated as ``False`` and nonzeros are + treated as ``True``. + + Args: + input (Tensor): the input tensor. + other (Tensor): the tensor to compute OR with + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.logical_or(torch.tensor([True, False, True]), torch.tensor([True, False, False])) + tensor([ True, False, True]) + >>> a = torch.tensor([0, 1, 10, 0], dtype=torch.int8) + >>> b = torch.tensor([4, 0, 1, 0], dtype=torch.int8) + >>> torch.logical_or(a, b) + tensor([ True, True, True, False]) + >>> torch.logical_or(a.double(), b.double()) + tensor([ True, True, True, False]) + >>> torch.logical_or(a.double(), b) + tensor([ True, True, True, False]) + >>> torch.logical_or(a, b, out=torch.empty(4, dtype=torch.bool)) + tensor([ True, True, True, False]) + """ + ... +def logical_xor(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + logical_xor(input, other, *, out=None) -> Tensor + + Computes the element-wise logical XOR of the given input tensors. Zeros are treated as ``False`` and nonzeros are + treated as ``True``. + + Args: + input (Tensor): the input tensor. + other (Tensor): the tensor to compute XOR with + + Keyword args: + out (Tensor, optional): the output tensor. 
+ + Example:: + + >>> torch.logical_xor(torch.tensor([True, False, True]), torch.tensor([True, False, False])) + tensor([False, False, True]) + >>> a = torch.tensor([0, 1, 10, 0], dtype=torch.int8) + >>> b = torch.tensor([4, 0, 1, 0], dtype=torch.int8) + >>> torch.logical_xor(a, b) + tensor([ True, True, False, False]) + >>> torch.logical_xor(a.double(), b.double()) + tensor([ True, True, False, False]) + >>> torch.logical_xor(a.double(), b) + tensor([ True, True, False, False]) + >>> torch.logical_xor(a, b, out=torch.empty(4, dtype=torch.bool)) + tensor([ True, True, False, False]) + """ + ... +def logit(input: Tensor, eps: Optional[_float] = None, *, out: Optional[Tensor] = None) -> Tensor: + r""" + logit(input, eps=None, *, out=None) -> Tensor + + Alias for :func:`torch.special.logit`. + """ + ... +def logit_(input: Tensor, eps: Optional[_float] = None) -> Tensor: ... +@overload +def logspace(start: Number, end: Number, steps: Optional[_int] = None, base: _float = 10.0, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + logspace(start, end, steps, base=10.0, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + + Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly + spaced from :math:`{{\text{{base}}}}^{{\text{{start}}}}` to + :math:`{{\text{{base}}}}^{{\text{{end}}}}`, inclusive, on a logarithmic scale + with base :attr:`base`. That is, the values are: + + .. math:: + (\text{base}^{\text{start}}, + \text{base}^{(\text{start} + \frac{\text{end} - \text{start}}{ \text{steps} - 1})}, + \ldots, + \text{base}^{(\text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{ \text{steps} - 1})}, + \text{base}^{\text{end}}) + + + + From PyTorch 1.11 logspace requires the steps argument. Use steps=100 to restore the previous behavior. + + Args: + start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional + end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional + steps (int): size of the constructed tensor + base (float, optional): base of the logarithm function. Default: ``10.0``. + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (torch.dtype, optional): the data type to perform the computation in. + Default: if None, uses the global default dtype (see torch.get_default_dtype()) + when both :attr:`start` and :attr:`end` are real, + and corresponding complex dtype when either is complex. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. 
+ + Example:: + + >>> torch.logspace(start=-10, end=10, steps=5) + tensor([ 1.0000e-10, 1.0000e-05, 1.0000e+00, 1.0000e+05, 1.0000e+10]) + >>> torch.logspace(start=0.1, end=1.0, steps=5) + tensor([ 1.2589, 2.1135, 3.5481, 5.9566, 10.0000]) + >>> torch.logspace(start=0.1, end=1.0, steps=1) + tensor([1.2589]) + >>> torch.logspace(start=2, end=2, steps=1, base=2) + tensor([4.0]) + """ + ... +@overload +def logspace(start: Tensor, end: Tensor, steps: _int, base: _float = 10.0, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + logspace(start, end, steps, base=10.0, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + + Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly + spaced from :math:`{{\text{{base}}}}^{{\text{{start}}}}` to + :math:`{{\text{{base}}}}^{{\text{{end}}}}`, inclusive, on a logarithmic scale + with base :attr:`base`. That is, the values are: + + .. math:: + (\text{base}^{\text{start}}, + \text{base}^{(\text{start} + \frac{\text{end} - \text{start}}{ \text{steps} - 1})}, + \ldots, + \text{base}^{(\text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{ \text{steps} - 1})}, + \text{base}^{\text{end}}) + + + + From PyTorch 1.11 logspace requires the steps argument. Use steps=100 to restore the previous behavior. + + Args: + start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional + end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional + steps (int): size of the constructed tensor + base (float, optional): base of the logarithm function. Default: ``10.0``. + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (torch.dtype, optional): the data type to perform the computation in. + Default: if None, uses the global default dtype (see torch.get_default_dtype()) + when both :attr:`start` and :attr:`end` are real, + and corresponding complex dtype when either is complex. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.logspace(start=-10, end=10, steps=5) + tensor([ 1.0000e-10, 1.0000e-05, 1.0000e+00, 1.0000e+05, 1.0000e+10]) + >>> torch.logspace(start=0.1, end=1.0, steps=5) + tensor([ 1.2589, 2.1135, 3.5481, 5.9566, 10.0000]) + >>> torch.logspace(start=0.1, end=1.0, steps=1) + tensor([1.2589]) + >>> torch.logspace(start=2, end=2, steps=1, base=2) + tensor([4.0]) + """ + ... 
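+# Editor's note: the doctest-style sketch below is an illustrative addition and is
+# not part of the generated stub. It shows the relationship described by the
+# logspace overloads above, namely that logspace(start, end, steps, base) yields the
+# same points as base ** linspace(start, end, steps); printed values are rounded.
+#
+#   >>> import torch
+#   >>> a = torch.logspace(0, 3, steps=4)        # tensor([1., 10., 100., 1000.])
+#   >>> b = 10 ** torch.linspace(0, 3, steps=4)  # same points via linspace
+#   >>> torch.allclose(a, b)
+#   True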
+@overload +def logspace(start: Union[Number, _complex], end: Tensor, steps: _int, base: _float = 10.0, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + logspace(start, end, steps, base=10.0, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + + Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly + spaced from :math:`{{\text{{base}}}}^{{\text{{start}}}}` to + :math:`{{\text{{base}}}}^{{\text{{end}}}}`, inclusive, on a logarithmic scale + with base :attr:`base`. That is, the values are: + + .. math:: + (\text{base}^{\text{start}}, + \text{base}^{(\text{start} + \frac{\text{end} - \text{start}}{ \text{steps} - 1})}, + \ldots, + \text{base}^{(\text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{ \text{steps} - 1})}, + \text{base}^{\text{end}}) + + + + From PyTorch 1.11 logspace requires the steps argument. Use steps=100 to restore the previous behavior. + + Args: + start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional + end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional + steps (int): size of the constructed tensor + base (float, optional): base of the logarithm function. Default: ``10.0``. + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (torch.dtype, optional): the data type to perform the computation in. + Default: if None, uses the global default dtype (see torch.get_default_dtype()) + when both :attr:`start` and :attr:`end` are real, + and corresponding complex dtype when either is complex. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.logspace(start=-10, end=10, steps=5) + tensor([ 1.0000e-10, 1.0000e-05, 1.0000e+00, 1.0000e+05, 1.0000e+10]) + >>> torch.logspace(start=0.1, end=1.0, steps=5) + tensor([ 1.2589, 2.1135, 3.5481, 5.9566, 10.0000]) + >>> torch.logspace(start=0.1, end=1.0, steps=1) + tensor([1.2589]) + >>> torch.logspace(start=2, end=2, steps=1, base=2) + tensor([4.0]) + """ + ... +@overload +def logspace(start: Tensor, end: Union[Number, _complex], steps: _int, base: _float = 10.0, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + logspace(start, end, steps, base=10.0, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + + Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly + spaced from :math:`{{\text{{base}}}}^{{\text{{start}}}}` to + :math:`{{\text{{base}}}}^{{\text{{end}}}}`, inclusive, on a logarithmic scale + with base :attr:`base`. That is, the values are: + + .. 
math:: + (\text{base}^{\text{start}}, + \text{base}^{(\text{start} + \frac{\text{end} - \text{start}}{ \text{steps} - 1})}, + \ldots, + \text{base}^{(\text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{ \text{steps} - 1})}, + \text{base}^{\text{end}}) + + + + From PyTorch 1.11 logspace requires the steps argument. Use steps=100 to restore the previous behavior. + + Args: + start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional + end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional + steps (int): size of the constructed tensor + base (float, optional): base of the logarithm function. Default: ``10.0``. + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (torch.dtype, optional): the data type to perform the computation in. + Default: if None, uses the global default dtype (see torch.get_default_dtype()) + when both :attr:`start` and :attr:`end` are real, + and corresponding complex dtype when either is complex. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.logspace(start=-10, end=10, steps=5) + tensor([ 1.0000e-10, 1.0000e-05, 1.0000e+00, 1.0000e+05, 1.0000e+10]) + >>> torch.logspace(start=0.1, end=1.0, steps=5) + tensor([ 1.2589, 2.1135, 3.5481, 5.9566, 10.0000]) + >>> torch.logspace(start=0.1, end=1.0, steps=1) + tensor([1.2589]) + >>> torch.logspace(start=2, end=2, steps=1, base=2) + tensor([4.0]) + """ + ... +@overload +def logspace(start: Union[Number, _complex], end: Union[Number, _complex], steps: _int, base: _float = 10.0, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + logspace(start, end, steps, base=10.0, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + + Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly + spaced from :math:`{{\text{{base}}}}^{{\text{{start}}}}` to + :math:`{{\text{{base}}}}^{{\text{{end}}}}`, inclusive, on a logarithmic scale + with base :attr:`base`. That is, the values are: + + .. math:: + (\text{base}^{\text{start}}, + \text{base}^{(\text{start} + \frac{\text{end} - \text{start}}{ \text{steps} - 1})}, + \ldots, + \text{base}^{(\text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{ \text{steps} - 1})}, + \text{base}^{\text{end}}) + + + + From PyTorch 1.11 logspace requires the steps argument. Use steps=100 to restore the previous behavior. + + Args: + start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional + end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional + steps (int): size of the constructed tensor + base (float, optional): base of the logarithm function. Default: ``10.0``. 
+ + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (torch.dtype, optional): the data type to perform the computation in. + Default: if None, uses the global default dtype (see torch.get_default_dtype()) + when both :attr:`start` and :attr:`end` are real, + and corresponding complex dtype when either is complex. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.logspace(start=-10, end=10, steps=5) + tensor([ 1.0000e-10, 1.0000e-05, 1.0000e+00, 1.0000e+05, 1.0000e+10]) + >>> torch.logspace(start=0.1, end=1.0, steps=5) + tensor([ 1.2589, 2.1135, 3.5481, 5.9566, 10.0000]) + >>> torch.logspace(start=0.1, end=1.0, steps=1) + tensor([1.2589]) + >>> torch.logspace(start=2, end=2, steps=1, base=2) + tensor([4.0]) + """ + ... +@overload +def logsumexp(input: Tensor, dim: Union[_int, _size], keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + logsumexp(input, dim, keepdim=False, *, out=None) + + Returns the log of summed exponentials of each row of the :attr:`input` + tensor in the given dimension :attr:`dim`. The computation is numerically + stabilized. + + For summation index :math:`j` given by `dim` and other indices :math:`i`, the result is + + .. math:: + \text{logsumexp}(x)_{i} = \log \sum_j \exp(x_{ij}) + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(3, 3) + >>> torch.logsumexp(a, 1) + tensor([1.4907, 1.0593, 1.5696]) + >>> torch.dist(torch.logsumexp(a, 1), torch.log(torch.sum(torch.exp(a), 1))) + tensor(1.6859e-07) + """ + ... +@overload +def logsumexp(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + logsumexp(input, dim, keepdim=False, *, out=None) + + Returns the log of summed exponentials of each row of the :attr:`input` + tensor in the given dimension :attr:`dim`. The computation is numerically + stabilized. + + For summation index :math:`j` given by `dim` and other indices :math:`i`, the result is + + .. math:: + \text{logsumexp}(x)_{i} = \log \sum_j \exp(x_{ij}) + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. 
+ + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(3, 3) + >>> torch.logsumexp(a, 1) + tensor([1.4907, 1.0593, 1.5696]) + >>> torch.dist(torch.logsumexp(a, 1), torch.log(torch.sum(torch.exp(a), 1))) + tensor(1.6859e-07) + """ + ... +@overload +def lstm(data: Tensor, batch_sizes: Tensor, hx: Union[Tuple[Tensor, ...], List[Tensor]], params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool) -> Tuple[Tensor, Tensor, Tensor]: ... +@overload +def lstm(input: Tensor, hx: Union[Tuple[Tensor, ...], List[Tensor]], params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool, batch_first: _bool) -> Tuple[Tensor, Tensor, Tensor]: ... +def lstm_cell(input: Tensor, hx: Union[Tuple[Tensor, ...], List[Tensor]], w_ih: Tensor, w_hh: Tensor, b_ih: Optional[Tensor] = None, b_hh: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]: ... +@overload +def lt(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + lt(input, other, *, out=None) -> Tensor + + Computes :math:`\text{input} < \text{other}` element-wise. + + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is less than :attr:`other` and False elsewhere + + Example:: + + >>> torch.lt(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[False, False], [True, False]]) + """ + ... +@overload +def lt(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + lt(input, other, *, out=None) -> Tensor + + Computes :math:`\text{input} < \text{other}` element-wise. + + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is less than :attr:`other` and False elsewhere + + Example:: + + >>> torch.lt(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[False, False], [True, False]]) + """ + ... +def lu_solve(input: Tensor, LU_data: Tensor, LU_pivots: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + lu_solve(b, LU_data, LU_pivots, *, out=None) -> Tensor + + Returns the LU solve of the linear system :math:`Ax = b` using the partially pivoted + LU factorization of A from :func:`~linalg.lu_factor`. + + This function supports ``float``, ``double``, ``cfloat`` and ``cdouble`` dtypes for :attr:`input`. + + .. warning:: + + :func:`torch.lu_solve` is deprecated in favor of :func:`torch.linalg.lu_solve`. + :func:`torch.lu_solve` will be removed in a future PyTorch release. + ``X = torch.lu_solve(B, LU, pivots)`` should be replaced with + + .. 
code:: python + + X = linalg.lu_solve(LU, pivots, B) + + Arguments: + b (Tensor): the RHS tensor of size :math:`(*, m, k)`, where :math:`*` + is zero or more batch dimensions. + LU_data (Tensor): the pivoted LU factorization of A from :meth:`~linalg.lu_factor` of size :math:`(*, m, m)`, + where :math:`*` is zero or more batch dimensions. + LU_pivots (IntTensor): the pivots of the LU factorization from :meth:`~linalg.lu_factor` of size :math:`(*, m)`, + where :math:`*` is zero or more batch dimensions. + The batch dimensions of :attr:`LU_pivots` must be equal to the batch dimensions of + :attr:`LU_data`. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> A = torch.randn(2, 3, 3) + >>> b = torch.randn(2, 3, 1) + >>> LU, pivots = torch.linalg.lu_factor(A) + >>> x = torch.lu_solve(b, LU, pivots) + >>> torch.dist(A @ x, b) + tensor(1.00000e-07 * + 2.8312) + """ + ... +def lu_unpack(LU_data: Tensor, LU_pivots: Tensor, unpack_data: _bool = True, unpack_pivots: _bool = True, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.lu_unpack: + r""" + lu_unpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True, *, out=None) -> (Tensor, Tensor, Tensor) + + Unpacks the LU decomposition returned by :func:`~linalg.lu_factor` into the `P, L, U` matrices. + + .. seealso:: + + :func:`~linalg.lu` returns the matrices from the LU decomposition. Its gradient formula is more efficient + than that of doing :func:`~linalg.lu_factor` followed by :func:`~linalg.lu_unpack`. + + Args: + LU_data (Tensor): the packed LU factorization data + LU_pivots (Tensor): the packed LU factorization pivots + unpack_data (bool): flag indicating if the data should be unpacked. + If ``False``, then the returned ``L`` and ``U`` are empty tensors. + Default: ``True`` + unpack_pivots (bool): flag indicating if the pivots should be unpacked into a permutation matrix ``P``. + If ``False``, then the returned ``P`` is an empty tensor. + Default: ``True`` + + Keyword args: + out (tuple, optional): output tuple of three tensors. Ignored if `None`. + + Returns: + A namedtuple ``(P, L, U)`` + + Examples:: + + >>> A = torch.randn(2, 3, 3) + >>> LU, pivots = torch.linalg.lu_factor(A) + >>> P, L, U = torch.lu_unpack(LU, pivots) + >>> # We can recover A from the factorization + >>> A_ = P @ L @ U + >>> torch.allclose(A, A_) + True + + >>> # LU factorization of a rectangular matrix: + >>> A = torch.randn(2, 3, 2) + >>> LU, pivots = torch.linalg.lu_factor(A) + >>> P, L, U = torch.lu_unpack(LU, pivots) + >>> # P, L, U are the same as returned by linalg.lu + >>> P_, L_, U_ = torch.linalg.lu(A) + >>> torch.allclose(P, P_) and torch.allclose(L, L_) and torch.allclose(U, U_) + True + """ + ... +def margin_ranking_loss(input1: Tensor, input2: Tensor, target: Tensor, margin: _float = 0.0, reduction: _int = 1) -> Tensor: ... +@overload +def masked_fill(input: Tensor, mask: Tensor, value: Tensor) -> Tensor: ... +@overload +def masked_fill(input: Tensor, mask: Tensor, value: Union[Number, _complex]) -> Tensor: ... +def masked_scatter(input: Tensor, mask: Tensor, source: Tensor) -> Tensor: ... +def masked_select(input: Tensor, mask: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + masked_select(input, mask, *, out=None) -> Tensor + + Returns a new 1-D tensor which indexes the :attr:`input` tensor according to + the boolean mask :attr:`mask` which is a `BoolTensor`. 
+ + The shapes of the :attr:`mask` tensor and the :attr:`input` tensor don't need + to match, but they must be :ref:`broadcastable `. + + .. note:: The returned tensor does **not** use the same storage + as the original tensor + + Args: + input (Tensor): the input tensor. + mask (BoolTensor): the tensor containing the binary mask to index with + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> x = torch.randn(3, 4) + >>> x + tensor([[ 0.3552, -2.3825, -0.8297, 0.3477], + [-1.2035, 1.2252, 0.5002, 0.6248], + [ 0.1307, -2.0608, 0.1244, 2.0139]]) + >>> mask = x.ge(0.5) + >>> mask + tensor([[False, False, False, False], + [False, True, True, True], + [False, False, False, True]]) + >>> torch.masked_select(x, mask) + tensor([ 1.2252, 0.5002, 0.6248, 2.0139]) + """ + ... +def matmul(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + matmul(input, other, *, out=None) -> Tensor + + Matrix product of two tensors. + + The behavior depends on the dimensionality of the tensors as follows: + + - If both tensors are 1-dimensional, the dot product (scalar) is returned. + - If both arguments are 2-dimensional, the matrix-matrix product is returned. + - If the first argument is 1-dimensional and the second argument is 2-dimensional, + a 1 is prepended to its dimension for the purpose of the matrix multiply. + After the matrix multiply, the prepended dimension is removed. + - If the first argument is 2-dimensional and the second argument is 1-dimensional, + the matrix-vector product is returned. + - If both arguments are at least 1-dimensional and at least one argument is + N-dimensional (where N > 2), then a batched matrix multiply is returned. If the first + argument is 1-dimensional, a 1 is prepended to its dimension for the purpose of the + batched matrix multiply and removed after. If the second argument is 1-dimensional, a + 1 is appended to its dimension for the purpose of the batched matrix multiple and removed after. + The non-matrix (i.e. batch) dimensions are :ref:`broadcasted ` (and thus + must be broadcastable). For example, if :attr:`input` is a + :math:`(j \times 1 \times n \times n)` tensor and :attr:`other` is a :math:`(k \times n \times n)` + tensor, :attr:`out` will be a :math:`(j \times k \times n \times n)` tensor. + + Note that the broadcasting logic only looks at the batch dimensions when determining if the inputs + are broadcastable, and not the matrix dimensions. For example, if :attr:`input` is a + :math:`(j \times 1 \times n \times m)` tensor and :attr:`other` is a :math:`(k \times m \times p)` + tensor, these inputs are valid for broadcasting even though the final two dimensions (i.e. the + matrix dimensions) are different. :attr:`out` will be a :math:`(j \times k \times n \times p)` tensor. + + This operation has support for arguments with :ref:`sparse layouts`. In particular the + matrix-matrix (both arguments 2-dimensional) supports sparse arguments with the same restrictions + as :func:`torch.mm` + + + .. warning:: + Sparse support is a beta feature and some layout(s)/dtype/device combinations may not be supported, + or may not have autograd support. If you notice missing functionality please + open a feature request. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + .. note:: + + The 1-dimensional dot product version of this function does not support an :attr:`out` parameter. 
+ + Arguments: + input (Tensor): the first tensor to be multiplied + other (Tensor): the second tensor to be multiplied + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> # vector x vector + >>> tensor1 = torch.randn(3) + >>> tensor2 = torch.randn(3) + >>> torch.matmul(tensor1, tensor2).size() + torch.Size([]) + >>> # matrix x vector + >>> tensor1 = torch.randn(3, 4) + >>> tensor2 = torch.randn(4) + >>> torch.matmul(tensor1, tensor2).size() + torch.Size([3]) + >>> # batched matrix x broadcasted vector + >>> tensor1 = torch.randn(10, 3, 4) + >>> tensor2 = torch.randn(4) + >>> torch.matmul(tensor1, tensor2).size() + torch.Size([10, 3]) + >>> # batched matrix x batched matrix + >>> tensor1 = torch.randn(10, 3, 4) + >>> tensor2 = torch.randn(10, 4, 5) + >>> torch.matmul(tensor1, tensor2).size() + torch.Size([10, 3, 5]) + >>> # batched matrix x broadcasted matrix + >>> tensor1 = torch.randn(10, 3, 4) + >>> tensor2 = torch.randn(4, 5) + >>> torch.matmul(tensor1, tensor2).size() + torch.Size([10, 3, 5]) + """ + ... +def matrix_exp(input: Tensor) -> Tensor: + r""" + matrix_exp(A) -> Tensor + + Alias for :func:`torch.linalg.matrix_exp`. + """ + ... +def matrix_power(input: Tensor, n: _int, *, out: Optional[Tensor] = None) -> Tensor: + r""" + matrix_power(input, n, *, out=None) -> Tensor + + Alias for :func:`torch.linalg.matrix_power` + """ + ... +@overload +def max(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + max(input) -> Tensor + + Returns the maximum value of all elements in the ``input`` tensor. + + .. warning:: + This function produces deterministic (sub)gradients unlike ``max(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.6763, 0.7445, -2.2369]]) + >>> torch.max(a) + tensor(0.7445) + + .. function:: max(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` is the maximum + value of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`. And ``indices`` is the index location of each maximum value found + (argmax). + + If ``keepdim`` is ``True``, the output tensors are of the same size + as ``input`` except in the dimension ``dim`` where they are of size 1. + Otherwise, ``dim`` is squeezed (see :func:`torch.squeeze`), resulting + in the output tensors having 1 fewer dimension than ``input``. + + .. note:: If there are multiple maximal values in a reduced row then + the indices of the first maximal value are returned. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. Default: ``False``. + + Keyword args: + out (tuple, optional): the result tuple of two output tensors (max, max_indices) + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-1.2360, -0.2942, -0.1222, 0.8475], + [ 1.1949, -1.1127, -2.2379, -0.6702], + [ 1.5717, -0.9207, 0.1297, -1.8768], + [-0.6172, 1.0036, -0.6060, -0.2432]]) + >>> torch.max(a, 1) + torch.return_types.max(values=tensor([0.8475, 1.1949, 1.5717, 1.0036]), indices=tensor([3, 0, 0, 1])) + + .. function:: max(input, other, *, out=None) -> Tensor + :noindex: + + See :func:`torch.maximum`. + """ + ... +@overload +def max(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + max(input) -> Tensor + + Returns the maximum value of all elements in the ``input`` tensor. + + .. 
warning:: + This function produces deterministic (sub)gradients unlike ``max(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.6763, 0.7445, -2.2369]]) + >>> torch.max(a) + tensor(0.7445) + + .. function:: max(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` is the maximum + value of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`. And ``indices`` is the index location of each maximum value found + (argmax). + + If ``keepdim`` is ``True``, the output tensors are of the same size + as ``input`` except in the dimension ``dim`` where they are of size 1. + Otherwise, ``dim`` is squeezed (see :func:`torch.squeeze`), resulting + in the output tensors having 1 fewer dimension than ``input``. + + .. note:: If there are multiple maximal values in a reduced row then + the indices of the first maximal value are returned. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. Default: ``False``. + + Keyword args: + out (tuple, optional): the result tuple of two output tensors (max, max_indices) + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-1.2360, -0.2942, -0.1222, 0.8475], + [ 1.1949, -1.1127, -2.2379, -0.6702], + [ 1.5717, -0.9207, 0.1297, -1.8768], + [-0.6172, 1.0036, -0.6060, -0.2432]]) + >>> torch.max(a, 1) + torch.return_types.max(values=tensor([0.8475, 1.1949, 1.5717, 1.0036]), indices=tensor([3, 0, 0, 1])) + + .. function:: max(input, other, *, out=None) -> Tensor + :noindex: + + See :func:`torch.maximum`. + """ + ... +@overload +def max(input: Tensor, dim: _int, keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.max: + r""" + max(input) -> Tensor + + Returns the maximum value of all elements in the ``input`` tensor. + + .. warning:: + This function produces deterministic (sub)gradients unlike ``max(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.6763, 0.7445, -2.2369]]) + >>> torch.max(a) + tensor(0.7445) + + .. function:: max(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` is the maximum + value of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`. And ``indices`` is the index location of each maximum value found + (argmax). + + If ``keepdim`` is ``True``, the output tensors are of the same size + as ``input`` except in the dimension ``dim`` where they are of size 1. + Otherwise, ``dim`` is squeezed (see :func:`torch.squeeze`), resulting + in the output tensors having 1 fewer dimension than ``input``. + + .. note:: If there are multiple maximal values in a reduced row then + the indices of the first maximal value are returned. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. Default: ``False``. 
+ + Keyword args: + out (tuple, optional): the result tuple of two output tensors (max, max_indices) + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-1.2360, -0.2942, -0.1222, 0.8475], + [ 1.1949, -1.1127, -2.2379, -0.6702], + [ 1.5717, -0.9207, 0.1297, -1.8768], + [-0.6172, 1.0036, -0.6060, -0.2432]]) + >>> torch.max(a, 1) + torch.return_types.max(values=tensor([0.8475, 1.1949, 1.5717, 1.0036]), indices=tensor([3, 0, 0, 1])) + + .. function:: max(input, other, *, out=None) -> Tensor + :noindex: + + See :func:`torch.maximum`. + """ + ... +@overload +def max(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.max: + r""" + max(input) -> Tensor + + Returns the maximum value of all elements in the ``input`` tensor. + + .. warning:: + This function produces deterministic (sub)gradients unlike ``max(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.6763, 0.7445, -2.2369]]) + >>> torch.max(a) + tensor(0.7445) + + .. function:: max(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` is the maximum + value of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`. And ``indices`` is the index location of each maximum value found + (argmax). + + If ``keepdim`` is ``True``, the output tensors are of the same size + as ``input`` except in the dimension ``dim`` where they are of size 1. + Otherwise, ``dim`` is squeezed (see :func:`torch.squeeze`), resulting + in the output tensors having 1 fewer dimension than ``input``. + + .. note:: If there are multiple maximal values in a reduced row then + the indices of the first maximal value are returned. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. Default: ``False``. + + Keyword args: + out (tuple, optional): the result tuple of two output tensors (max, max_indices) + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-1.2360, -0.2942, -0.1222, 0.8475], + [ 1.1949, -1.1127, -2.2379, -0.6702], + [ 1.5717, -0.9207, 0.1297, -1.8768], + [-0.6172, 1.0036, -0.6060, -0.2432]]) + >>> torch.max(a, 1) + torch.return_types.max(values=tensor([0.8475, 1.1949, 1.5717, 1.0036]), indices=tensor([3, 0, 0, 1])) + + .. function:: max(input, other, *, out=None) -> Tensor + :noindex: + + See :func:`torch.maximum`. + """ + ... +def max_pool1d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tensor: ... +def max_pool1d_with_indices(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tuple[Tensor, Tensor]: ... +def max_pool2d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tensor: ... +def max_pool3d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tensor: ... 
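+# Editor's note: an illustrative doctest-style sketch, not part of the generated
+# stub, for the dim-reducing torch.max overload above. A deterministic input is
+# used so the printed values hold; ``indices`` is the per-row argmax.
+#
+#   >>> import torch
+#   >>> x = torch.arange(6.).reshape(2, 3)       # [[0., 1., 2.], [3., 4., 5.]]
+#   >>> values, indices = torch.max(x, dim=1)    # returns a (values, indices) namedtuple
+#   >>> values, indices
+#   (tensor([2., 5.]), tensor([2, 2]))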
+def maximum(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + maximum(input, other, *, out=None) -> Tensor + + Computes the element-wise maximum of :attr:`input` and :attr:`other`. + + .. note:: + If one of the elements being compared is a NaN, then that element is returned. + :func:`maximum` is not supported for tensors with complex dtypes. + + Args: + input (Tensor): the input tensor. + other (Tensor): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor((1, 2, -1)) + >>> b = torch.tensor((3, 0, 4)) + >>> torch.maximum(a, b) + tensor([3, 2, 4]) + """ + ... +@overload +def mean(input: Tensor, *, dtype: Optional[_dtype] = None) -> Tensor: + r""" + mean(input, *, dtype=None) -> Tensor + + Returns the mean value of all elements in the :attr:`input` tensor. Input must be floating point or complex. + + Args: + input (Tensor): + the input tensor, either of floating point or complex dtype + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.2294, -0.5481, 1.3288]]) + >>> torch.mean(a) + tensor(0.3367) + + .. function:: mean(input, dim, keepdim=False, *, dtype=None, out=None) -> Tensor + :noindex: + + Returns the mean value of each row of the :attr:`input` tensor in the given + dimension :attr:`dim`. If :attr:`dim` is a list of dimensions, + reduce over all of them. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + out (Tensor, optional): the output tensor. + + .. seealso:: + + :func:`torch.nanmean` computes the mean value of `non-NaN` elements. + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-0.3841, 0.6320, 0.4254, -0.7384], + [-0.9644, 1.0131, -0.6549, -1.4279], + [-0.2951, -1.3350, -0.7694, 0.5600], + [ 1.0842, -0.9580, 0.3623, 0.2343]]) + >>> torch.mean(a, 1) + tensor([-0.0163, -0.5085, -0.4599, 0.1807]) + >>> torch.mean(a, 1, True) + tensor([[-0.0163], + [-0.5085], + [-0.4599], + [ 0.1807]]) + """ + ... +@overload +def mean(input: Tensor, dim: Optional[Union[_int, _size]], keepdim: _bool = False, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + mean(input, *, dtype=None) -> Tensor + + Returns the mean value of all elements in the :attr:`input` tensor. Input must be floating point or complex. + + Args: + input (Tensor): + the input tensor, either of floating point or complex dtype + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. 
+ If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.2294, -0.5481, 1.3288]]) + >>> torch.mean(a) + tensor(0.3367) + + .. function:: mean(input, dim, keepdim=False, *, dtype=None, out=None) -> Tensor + :noindex: + + Returns the mean value of each row of the :attr:`input` tensor in the given + dimension :attr:`dim`. If :attr:`dim` is a list of dimensions, + reduce over all of them. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + out (Tensor, optional): the output tensor. + + .. seealso:: + + :func:`torch.nanmean` computes the mean value of `non-NaN` elements. + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-0.3841, 0.6320, 0.4254, -0.7384], + [-0.9644, 1.0131, -0.6549, -1.4279], + [-0.2951, -1.3350, -0.7694, 0.5600], + [ 1.0842, -0.9580, 0.3623, 0.2343]]) + >>> torch.mean(a, 1) + tensor([-0.0163, -0.5085, -0.4599, 0.1807]) + >>> torch.mean(a, 1, True) + tensor([[-0.0163], + [-0.5085], + [-0.4599], + [ 0.1807]]) + """ + ... +@overload +def mean(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], keepdim: _bool = False, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + mean(input, *, dtype=None) -> Tensor + + Returns the mean value of all elements in the :attr:`input` tensor. Input must be floating point or complex. + + Args: + input (Tensor): + the input tensor, either of floating point or complex dtype + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.2294, -0.5481, 1.3288]]) + >>> torch.mean(a) + tensor(0.3367) + + .. function:: mean(input, dim, keepdim=False, *, dtype=None, out=None) -> Tensor + :noindex: + + Returns the mean value of each row of the :attr:`input` tensor in the given + dimension :attr:`dim`. If :attr:`dim` is a list of dimensions, + reduce over all of them. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. 
+ If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + out (Tensor, optional): the output tensor. + + .. seealso:: + + :func:`torch.nanmean` computes the mean value of `non-NaN` elements. + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-0.3841, 0.6320, 0.4254, -0.7384], + [-0.9644, 1.0131, -0.6549, -1.4279], + [-0.2951, -1.3350, -0.7694, 0.5600], + [ 1.0842, -0.9580, 0.3623, 0.2343]]) + >>> torch.mean(a, 1) + tensor([-0.0163, -0.5085, -0.4599, 0.1807]) + >>> torch.mean(a, 1, True) + tensor([[-0.0163], + [-0.5085], + [-0.4599], + [ 0.1807]]) + """ + ... +@overload +def median(input: Tensor) -> Tensor: + r""" + median(input) -> Tensor + + Returns the median of the values in :attr:`input`. + + .. note:: + The median is not unique for :attr:`input` tensors with an even number + of elements. In this case the lower of the two medians is returned. To + compute the mean of both medians, use :func:`torch.quantile` with ``q=0.5`` instead. + + .. warning:: + This function produces deterministic (sub)gradients unlike ``median(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 1.5219, -1.5212, 0.2202]]) + >>> torch.median(a) + tensor(0.2202) + + .. function:: median(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` contains the median of each row of :attr:`input` + in the dimension :attr:`dim`, and ``indices`` contains the index of the median values found in the dimension :attr:`dim`. + + By default, :attr:`dim` is the last dimension of the :attr:`input` tensor. + + If :attr:`keepdim` is ``True``, the output tensors are of the same size + as :attr:`input` except in the dimension :attr:`dim` where they are of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in + the outputs tensor having 1 fewer dimension than :attr:`input`. + + .. note:: + The median is not unique for :attr:`input` tensors with an even number + of elements in the dimension :attr:`dim`. In this case the lower of the + two medians is returned. To compute the mean of both medians in + :attr:`input`, use :func:`torch.quantile` with ``q=0.5`` instead. + + .. warning:: + ``indices`` does not necessarily contain the first occurrence of each + median value found, unless it is unique. + The exact implementation details are device-specific. + Do not expect the same result when run on CPU and GPU in general. + For the same reason do not expect the gradients to be deterministic. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out ((Tensor, Tensor), optional): The first tensor will be populated with the median values and the second + tensor, which must have dtype long, with their indices in the dimension + :attr:`dim` of :attr:`input`. + + Example:: + + >>> a = torch.randn(4, 5) + >>> a + tensor([[ 0.2505, -0.3982, -0.9948, 0.3518, -1.3131], + [ 0.3180, -0.6993, 1.0436, 0.0438, 0.2270], + [-0.2751, 0.7303, 0.2192, 0.3321, 0.2488], + [ 1.0778, -1.9510, 0.7048, 0.4742, -0.7125]]) + >>> torch.median(a, 1) + torch.return_types.median(values=tensor([-0.3982, 0.2270, 0.2488, 0.4742]), indices=tensor([1, 4, 4, 3])) + """ + ... 
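+# Illustrative sketch (not part of the generated stub): how `dim`/`keepdim`
+# interact for torch.mean, and the "lower median" behaviour noted for
+# torch.median on an even number of elements. The tensor values are
+# assumptions chosen only for this example.
+#
+#     >>> import torch
+#     >>> x = torch.arange(6.).reshape(2, 3)          # tensor([[0., 1., 2.], [3., 4., 5.]])
+#     >>> torch.mean(x, dim=1)                        # one value per row
+#     tensor([1., 4.])
+#     >>> torch.mean(x, dim=1, keepdim=True).shape    # reduced dim kept with size 1
+#     torch.Size([2, 1])
+#     >>> torch.median(torch.tensor([1., 2., 3., 4.]))          # lower of the two middle values
+#     tensor(2.)
+#     >>> torch.quantile(torch.tensor([1., 2., 3., 4.]), 0.5)   # interpolated middle instead
+#     tensor(2.5000)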
+@overload +def median(input: Tensor, dim: _int, keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.median: + r""" + median(input) -> Tensor + + Returns the median of the values in :attr:`input`. + + .. note:: + The median is not unique for :attr:`input` tensors with an even number + of elements. In this case the lower of the two medians is returned. To + compute the mean of both medians, use :func:`torch.quantile` with ``q=0.5`` instead. + + .. warning:: + This function produces deterministic (sub)gradients unlike ``median(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 1.5219, -1.5212, 0.2202]]) + >>> torch.median(a) + tensor(0.2202) + + .. function:: median(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` contains the median of each row of :attr:`input` + in the dimension :attr:`dim`, and ``indices`` contains the index of the median values found in the dimension :attr:`dim`. + + By default, :attr:`dim` is the last dimension of the :attr:`input` tensor. + + If :attr:`keepdim` is ``True``, the output tensors are of the same size + as :attr:`input` except in the dimension :attr:`dim` where they are of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in + the outputs tensor having 1 fewer dimension than :attr:`input`. + + .. note:: + The median is not unique for :attr:`input` tensors with an even number + of elements in the dimension :attr:`dim`. In this case the lower of the + two medians is returned. To compute the mean of both medians in + :attr:`input`, use :func:`torch.quantile` with ``q=0.5`` instead. + + .. warning:: + ``indices`` does not necessarily contain the first occurrence of each + median value found, unless it is unique. + The exact implementation details are device-specific. + Do not expect the same result when run on CPU and GPU in general. + For the same reason do not expect the gradients to be deterministic. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out ((Tensor, Tensor), optional): The first tensor will be populated with the median values and the second + tensor, which must have dtype long, with their indices in the dimension + :attr:`dim` of :attr:`input`. + + Example:: + + >>> a = torch.randn(4, 5) + >>> a + tensor([[ 0.2505, -0.3982, -0.9948, 0.3518, -1.3131], + [ 0.3180, -0.6993, 1.0436, 0.0438, 0.2270], + [-0.2751, 0.7303, 0.2192, 0.3321, 0.2488], + [ 1.0778, -1.9510, 0.7048, 0.4742, -0.7125]]) + >>> torch.median(a, 1) + torch.return_types.median(values=tensor([-0.3982, 0.2270, 0.2488, 0.4742]), indices=tensor([1, 4, 4, 3])) + """ + ... +@overload +def median(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.median: + r""" + median(input) -> Tensor + + Returns the median of the values in :attr:`input`. + + .. note:: + The median is not unique for :attr:`input` tensors with an even number + of elements. In this case the lower of the two medians is returned. To + compute the mean of both medians, use :func:`torch.quantile` with ``q=0.5`` instead. + + .. 
warning:: + This function produces deterministic (sub)gradients unlike ``median(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 1.5219, -1.5212, 0.2202]]) + >>> torch.median(a) + tensor(0.2202) + + .. function:: median(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` contains the median of each row of :attr:`input` + in the dimension :attr:`dim`, and ``indices`` contains the index of the median values found in the dimension :attr:`dim`. + + By default, :attr:`dim` is the last dimension of the :attr:`input` tensor. + + If :attr:`keepdim` is ``True``, the output tensors are of the same size + as :attr:`input` except in the dimension :attr:`dim` where they are of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in + the outputs tensor having 1 fewer dimension than :attr:`input`. + + .. note:: + The median is not unique for :attr:`input` tensors with an even number + of elements in the dimension :attr:`dim`. In this case the lower of the + two medians is returned. To compute the mean of both medians in + :attr:`input`, use :func:`torch.quantile` with ``q=0.5`` instead. + + .. warning:: + ``indices`` does not necessarily contain the first occurrence of each + median value found, unless it is unique. + The exact implementation details are device-specific. + Do not expect the same result when run on CPU and GPU in general. + For the same reason do not expect the gradients to be deterministic. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out ((Tensor, Tensor), optional): The first tensor will be populated with the median values and the second + tensor, which must have dtype long, with their indices in the dimension + :attr:`dim` of :attr:`input`. + + Example:: + + >>> a = torch.randn(4, 5) + >>> a + tensor([[ 0.2505, -0.3982, -0.9948, 0.3518, -1.3131], + [ 0.3180, -0.6993, 1.0436, 0.0438, 0.2270], + [-0.2751, 0.7303, 0.2192, 0.3321, 0.2488], + [ 1.0778, -1.9510, 0.7048, 0.4742, -0.7125]]) + >>> torch.median(a, 1) + torch.return_types.median(values=tensor([-0.3982, 0.2270, 0.2488, 0.4742]), indices=tensor([1, 4, 4, 3])) + """ + ... +@overload +def min(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + min(input) -> Tensor + + Returns the minimum value of all elements in the :attr:`input` tensor. + + .. warning:: + This function produces deterministic (sub)gradients unlike ``min(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.6750, 1.0857, 1.7197]]) + >>> torch.min(a) + tensor(0.6750) + + .. function:: min(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` is the minimum + value of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`. And ``indices`` is the index location of each minimum value found + (argmin). + + If :attr:`keepdim` is ``True``, the output tensors are of the same size as + :attr:`input` except in the dimension :attr:`dim` where they are of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in + the output tensors having 1 fewer dimension than :attr:`input`. + + .. 
note:: If there are multiple minimal values in a reduced row then + the indices of the first minimal value are returned. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (tuple, optional): the tuple of two output tensors (min, min_indices) + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-0.6248, 1.1334, -1.1899, -0.2803], + [-1.4644, -0.2635, -0.3651, 0.6134], + [ 0.2457, 0.0384, 1.0128, 0.7015], + [-0.1153, 2.9849, 2.1458, 0.5788]]) + >>> torch.min(a, 1) + torch.return_types.min(values=tensor([-1.1899, -1.4644, 0.0384, -0.1153]), indices=tensor([2, 0, 1, 0])) + + .. function:: min(input, other, *, out=None) -> Tensor + :noindex: + + See :func:`torch.minimum`. + """ + ... +@overload +def min(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + min(input) -> Tensor + + Returns the minimum value of all elements in the :attr:`input` tensor. + + .. warning:: + This function produces deterministic (sub)gradients unlike ``min(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.6750, 1.0857, 1.7197]]) + >>> torch.min(a) + tensor(0.6750) + + .. function:: min(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` is the minimum + value of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`. And ``indices`` is the index location of each minimum value found + (argmin). + + If :attr:`keepdim` is ``True``, the output tensors are of the same size as + :attr:`input` except in the dimension :attr:`dim` where they are of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in + the output tensors having 1 fewer dimension than :attr:`input`. + + .. note:: If there are multiple minimal values in a reduced row then + the indices of the first minimal value are returned. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (tuple, optional): the tuple of two output tensors (min, min_indices) + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-0.6248, 1.1334, -1.1899, -0.2803], + [-1.4644, -0.2635, -0.3651, 0.6134], + [ 0.2457, 0.0384, 1.0128, 0.7015], + [-0.1153, 2.9849, 2.1458, 0.5788]]) + >>> torch.min(a, 1) + torch.return_types.min(values=tensor([-1.1899, -1.4644, 0.0384, -0.1153]), indices=tensor([2, 0, 1, 0])) + + .. function:: min(input, other, *, out=None) -> Tensor + :noindex: + + See :func:`torch.minimum`. + """ + ... +@overload +def min(input: Tensor, dim: _int, keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.min: + r""" + min(input) -> Tensor + + Returns the minimum value of all elements in the :attr:`input` tensor. + + .. warning:: + This function produces deterministic (sub)gradients unlike ``min(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.6750, 1.0857, 1.7197]]) + >>> torch.min(a) + tensor(0.6750) + + .. 
function:: min(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` is the minimum + value of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`. And ``indices`` is the index location of each minimum value found + (argmin). + + If :attr:`keepdim` is ``True``, the output tensors are of the same size as + :attr:`input` except in the dimension :attr:`dim` where they are of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in + the output tensors having 1 fewer dimension than :attr:`input`. + + .. note:: If there are multiple minimal values in a reduced row then + the indices of the first minimal value are returned. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (tuple, optional): the tuple of two output tensors (min, min_indices) + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-0.6248, 1.1334, -1.1899, -0.2803], + [-1.4644, -0.2635, -0.3651, 0.6134], + [ 0.2457, 0.0384, 1.0128, 0.7015], + [-0.1153, 2.9849, 2.1458, 0.5788]]) + >>> torch.min(a, 1) + torch.return_types.min(values=tensor([-1.1899, -1.4644, 0.0384, -0.1153]), indices=tensor([2, 0, 1, 0])) + + .. function:: min(input, other, *, out=None) -> Tensor + :noindex: + + See :func:`torch.minimum`. + """ + ... +@overload +def min(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.min: + r""" + min(input) -> Tensor + + Returns the minimum value of all elements in the :attr:`input` tensor. + + .. warning:: + This function produces deterministic (sub)gradients unlike ``min(dim=0)`` + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.6750, 1.0857, 1.7197]]) + >>> torch.min(a) + tensor(0.6750) + + .. function:: min(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` is the minimum + value of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`. And ``indices`` is the index location of each minimum value found + (argmin). + + If :attr:`keepdim` is ``True``, the output tensors are of the same size as + :attr:`input` except in the dimension :attr:`dim` where they are of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in + the output tensors having 1 fewer dimension than :attr:`input`. + + .. note:: If there are multiple minimal values in a reduced row then + the indices of the first minimal value are returned. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (tuple, optional): the tuple of two output tensors (min, min_indices) + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-0.6248, 1.1334, -1.1899, -0.2803], + [-1.4644, -0.2635, -0.3651, 0.6134], + [ 0.2457, 0.0384, 1.0128, 0.7015], + [-0.1153, 2.9849, 2.1458, 0.5788]]) + >>> torch.min(a, 1) + torch.return_types.min(values=tensor([-1.1899, -1.4644, 0.0384, -0.1153]), indices=tensor([2, 0, 1, 0])) + + .. function:: min(input, other, *, out=None) -> Tensor + :noindex: + + See :func:`torch.minimum`. + """ + ... 
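+# Illustrative sketch (not part of the generated stub): the dim-reduction form
+# of torch.min returns a (values, indices) namedtuple that can be unpacked or
+# accessed by field, while the two-tensor form behaves like torch.minimum
+# documented next. Tensor values are assumptions chosen only for this example.
+#
+#     >>> import torch
+#     >>> x = torch.tensor([[1., 5.], [4., 2.]])
+#     >>> values, indices = torch.min(x, dim=1)       # namedtuple unpacking
+#     >>> values
+#     tensor([1., 2.])
+#     >>> indices
+#     tensor([0, 1])
+#     >>> torch.min(x, dim=1).values                  # or access the fields directly
+#     tensor([1., 2.])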
+def minimum(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + minimum(input, other, *, out=None) -> Tensor + + Computes the element-wise minimum of :attr:`input` and :attr:`other`. + + .. note:: + If one of the elements being compared is a NaN, then that element is returned. + :func:`minimum` is not supported for tensors with complex dtypes. + + Args: + input (Tensor): the input tensor. + other (Tensor): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor((1, 2, -1)) + >>> b = torch.tensor((3, 0, 4)) + >>> torch.minimum(a, b) + tensor([1, 0, -1]) + """ + ... +def miopen_batch_norm(input: Tensor, weight: Tensor, bias: Optional[Tensor], running_mean: Optional[Tensor], running_var: Optional[Tensor], training: _bool, exponential_average_factor: _float, epsilon: _float) -> Tuple[Tensor, Tensor, Tensor]: ... +def miopen_convolution(input: Tensor, weight: Tensor, bias: Optional[Tensor], padding: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt], benchmark: _bool, deterministic: _bool) -> Tensor: ... +def miopen_convolution_add_relu(input: Tensor, weight: Tensor, z: Tensor, alpha: Optional[Union[Number, _complex]], bias: Optional[Tensor], stride: Sequence[Union[_int, SymInt]], padding: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ... +def miopen_convolution_relu(input: Tensor, weight: Tensor, bias: Optional[Tensor], stride: Sequence[Union[_int, SymInt]], padding: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ... +def miopen_convolution_transpose(input: Tensor, weight: Tensor, bias: Optional[Tensor], padding: Sequence[Union[_int, SymInt]], output_padding: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt], benchmark: _bool, deterministic: _bool) -> Tensor: ... +def miopen_depthwise_convolution(input: Tensor, weight: Tensor, bias: Optional[Tensor], padding: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt], benchmark: _bool, deterministic: _bool) -> Tensor: ... +def miopen_rnn(input: Tensor, weight: Union[Tuple[Tensor, ...], List[Tensor]], weight_stride0: _int, hx: Tensor, cx: Optional[Tensor], mode: _int, hidden_size: _int, num_layers: _int, batch_first: _bool, dropout: _float, train: _bool, bidirectional: _bool, batch_sizes: _size, dropout_state: Optional[Tensor]) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]: ... +def mkldnn_adaptive_avg_pool2d(input: Tensor, output_size: Union[_int, _size], *, out: Optional[Tensor] = None) -> Tensor: ... +def mkldnn_convolution(input: Tensor, weight: Tensor, bias: Optional[Tensor], padding: Sequence[Union[_int, SymInt]], stride: Sequence[Union[_int, SymInt]], dilation: Sequence[Union[_int, SymInt]], groups: Union[_int, SymInt]) -> Tensor: ... +def mkldnn_linear_backward_weights(grad_output: Tensor, input: Tensor, weight: Tensor, bias_defined: _bool) -> Tuple[Tensor, Tensor]: ... +def mkldnn_max_pool2d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tensor: ... 
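+# Illustrative sketch (not part of the generated stub): the NaN-propagation
+# note for torch.minimum / torch.maximum above. torch.fmin / torch.fmax are
+# mentioned as the NaN-ignoring counterparts; they are not documented in this
+# excerpt, so treat that comparison as an assumption to verify.
+#
+#     >>> import torch
+#     >>> a = torch.tensor([1., float('nan')])
+#     >>> b = torch.tensor([2., 0.])
+#     >>> torch.minimum(a, b)      # NaN wins the comparison
+#     tensor([1., nan])
+#     >>> torch.fmin(a, b)         # NaN-ignoring variant
+#     tensor([1., 0.])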
+def mkldnn_max_pool3d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tensor: ... +def mkldnn_rnn_layer(input: Tensor, weight0: Tensor, weight1: Tensor, weight2: Tensor, weight3: Tensor, hx_: Tensor, cx_: Tensor, reverse: _bool, batch_sizes: _size, mode: _int, hidden_size: _int, num_layers: _int, has_biases: _bool, bidirectional: _bool, batch_first: _bool, train: _bool) -> Tuple[Tensor, Tensor, Tensor, Tensor]: ... +def mm(input: Tensor, mat2: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + mm(input, mat2, *, out=None) -> Tensor + + Performs a matrix multiplication of the matrices :attr:`input` and :attr:`mat2`. + + If :attr:`input` is a :math:`(n \times m)` tensor, :attr:`mat2` is a + :math:`(m \times p)` tensor, :attr:`out` will be a :math:`(n \times p)` tensor. + + .. note:: This function does not :ref:`broadcast `. + For broadcasting matrix products, see :func:`torch.matmul`. + + Supports strided and sparse 2-D tensors as inputs, autograd with + respect to strided inputs. + + This operation has support for arguments with :ref:`sparse layouts`. + If :attr:`out` is provided it's layout will be used. Otherwise, the result + layout will be deduced from that of :attr:`input`. + + + .. warning:: + Sparse support is a beta feature and some layout(s)/dtype/device combinations may not be supported, + or may not have autograd support. If you notice missing functionality please + open a feature request. + + This operator supports :ref:`TensorFloat32`. + + On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision` for backward. + + Args: + input (Tensor): the first matrix to be matrix multiplied + mat2 (Tensor): the second matrix to be matrix multiplied + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> mat1 = torch.randn(2, 3) + >>> mat2 = torch.randn(3, 3) + >>> torch.mm(mat1, mat2) + tensor([[ 0.4851, 0.5037, -0.3633], + [-0.0760, -3.6705, 2.4784]]) + """ + ... +@overload +def mode(input: Tensor, dim: _int = -1, keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.mode: + r""" + mode(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor) + + Returns a namedtuple ``(values, indices)`` where ``values`` is the mode + value of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`, i.e. a value which appears most often + in that row, and ``indices`` is the index location of each mode value found. + + By default, :attr:`dim` is the last dimension of the :attr:`input` tensor. + + If :attr:`keepdim` is ``True``, the output tensors are of the same size as + :attr:`input` except in the dimension :attr:`dim` where they are of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting + in the output tensors having 1 fewer dimension than :attr:`input`. + + .. note:: This function is not defined for ``torch.cuda.Tensor`` yet. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. 
+ + Keyword args: + out (tuple, optional): the result tuple of two output tensors (values, indices) + + Example:: + + >>> b = torch.tensor( + [[0, 0, 0, 2, 0, 0, 2], + [0, 3, 0, 0, 2, 0, 1], + [2, 2, 2, 0, 0, 0, 3], + [2, 2, 3, 0, 1, 1, 0], + [1, 1, 0, 0, 2, 0, 2]]) + >>> torch.mode(b, 0) + torch.return_types.mode( + values=tensor([0, 2, 0, 0, 0, 0, 2]), + indices=tensor([1, 3, 4, 4, 2, 4, 4])) + """ + ... +@overload +def mode(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.mode: + r""" + mode(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor) + + Returns a namedtuple ``(values, indices)`` where ``values`` is the mode + value of each row of the :attr:`input` tensor in the given dimension + :attr:`dim`, i.e. a value which appears most often + in that row, and ``indices`` is the index location of each mode value found. + + By default, :attr:`dim` is the last dimension of the :attr:`input` tensor. + + If :attr:`keepdim` is ``True``, the output tensors are of the same size as + :attr:`input` except in the dimension :attr:`dim` where they are of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting + in the output tensors having 1 fewer dimension than :attr:`input`. + + .. note:: This function is not defined for ``torch.cuda.Tensor`` yet. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out (tuple, optional): the result tuple of two output tensors (values, indices) + + Example:: + + >>> b = torch.tensor( + [[0, 0, 0, 2, 0, 0, 2], + [0, 3, 0, 0, 2, 0, 1], + [2, 2, 2, 0, 0, 0, 3], + [2, 2, 3, 0, 1, 1, 0], + [1, 1, 0, 0, 2, 0, 2]]) + >>> torch.mode(b, 0) + torch.return_types.mode( + values=tensor([0, 2, 0, 0, 0, 0, 2]), + indices=tensor([1, 3, 4, 4, 2, 4, 4])) + """ + ... +@overload +def moveaxis(input: Tensor, source: _int, destination: _int) -> Tensor: + r""" + moveaxis(input, source, destination) -> Tensor + + Alias for :func:`torch.movedim`. + + This function is equivalent to NumPy's moveaxis function. + + Examples:: + + >>> t = torch.randn(3,2,1) + >>> t + tensor([[[-0.3362], + [-0.8437]], + + [[-0.9627], + [ 0.1727]], + + [[ 0.5173], + [-0.1398]]]) + >>> torch.moveaxis(t, 1, 0).shape + torch.Size([2, 3, 1]) + >>> torch.moveaxis(t, 1, 0) + tensor([[[-0.3362], + [-0.9627], + [ 0.5173]], + + [[-0.8437], + [ 0.1727], + [-0.1398]]]) + >>> torch.moveaxis(t, (1, 2), (0, 1)).shape + torch.Size([2, 1, 3]) + >>> torch.moveaxis(t, (1, 2), (0, 1)) + tensor([[[-0.3362, -0.9627, 0.5173]], + + [[-0.8437, 0.1727, -0.1398]]]) + """ + ... +@overload +def moveaxis(input: Tensor, source: _size, destination: _size) -> Tensor: + r""" + moveaxis(input, source, destination) -> Tensor + + Alias for :func:`torch.movedim`. + + This function is equivalent to NumPy's moveaxis function. + + Examples:: + + >>> t = torch.randn(3,2,1) + >>> t + tensor([[[-0.3362], + [-0.8437]], + + [[-0.9627], + [ 0.1727]], + + [[ 0.5173], + [-0.1398]]]) + >>> torch.moveaxis(t, 1, 0).shape + torch.Size([2, 3, 1]) + >>> torch.moveaxis(t, 1, 0) + tensor([[[-0.3362], + [-0.9627], + [ 0.5173]], + + [[-0.8437], + [ 0.1727], + [-0.1398]]]) + >>> torch.moveaxis(t, (1, 2), (0, 1)).shape + torch.Size([2, 1, 3]) + >>> torch.moveaxis(t, (1, 2), (0, 1)) + tensor([[[-0.3362, -0.9627, 0.5173]], + + [[-0.8437, 0.1727, -0.1398]]]) + """ + ... 
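+# Illustrative sketch (not part of the generated stub): torch.moveaxis is an
+# alias for torch.movedim (documented next); both only reorder dimensions, so
+# checking shapes is usually enough. The shapes here are assumptions chosen
+# only for this example.
+#
+#     >>> import torch
+#     >>> t = torch.zeros(2, 3, 4)
+#     >>> torch.moveaxis(t, 0, -1).shape              # move dim 0 to the end
+#     torch.Size([3, 4, 2])
+#     >>> torch.movedim(t, (0, 1), (1, 2)).shape      # tuple form moves several dims at once
+#     torch.Size([4, 2, 3])
+#     >>> t.permute(1, 2, 0).shape                    # explicit permutation equivalent to the first call
+#     torch.Size([3, 4, 2])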
+@overload +def movedim(input: Tensor, source: _int, destination: _int) -> Tensor: + r""" + movedim(input, source, destination) -> Tensor + + Moves the dimension(s) of :attr:`input` at the position(s) in :attr:`source` + to the position(s) in :attr:`destination`. + + Other dimensions of :attr:`input` that are not explicitly moved remain in + their original order and appear at the positions not specified in :attr:`destination`. + + Args: + input (Tensor): the input tensor. + source (int or tuple of ints): Original positions of the dims to move. These must be unique. + destination (int or tuple of ints): Destination positions for each of the original dims. These must also be unique. + + Examples:: + + >>> t = torch.randn(3,2,1) + >>> t + tensor([[[-0.3362], + [-0.8437]], + + [[-0.9627], + [ 0.1727]], + + [[ 0.5173], + [-0.1398]]]) + >>> torch.movedim(t, 1, 0).shape + torch.Size([2, 3, 1]) + >>> torch.movedim(t, 1, 0) + tensor([[[-0.3362], + [-0.9627], + [ 0.5173]], + + [[-0.8437], + [ 0.1727], + [-0.1398]]]) + >>> torch.movedim(t, (1, 2), (0, 1)).shape + torch.Size([2, 1, 3]) + >>> torch.movedim(t, (1, 2), (0, 1)) + tensor([[[-0.3362, -0.9627, 0.5173]], + + [[-0.8437, 0.1727, -0.1398]]]) + """ + ... +@overload +def movedim(input: Tensor, source: _size, destination: _size) -> Tensor: + r""" + movedim(input, source, destination) -> Tensor + + Moves the dimension(s) of :attr:`input` at the position(s) in :attr:`source` + to the position(s) in :attr:`destination`. + + Other dimensions of :attr:`input` that are not explicitly moved remain in + their original order and appear at the positions not specified in :attr:`destination`. + + Args: + input (Tensor): the input tensor. + source (int or tuple of ints): Original positions of the dims to move. These must be unique. + destination (int or tuple of ints): Destination positions for each of the original dims. These must also be unique. + + Examples:: + + >>> t = torch.randn(3,2,1) + >>> t + tensor([[[-0.3362], + [-0.8437]], + + [[-0.9627], + [ 0.1727]], + + [[ 0.5173], + [-0.1398]]]) + >>> torch.movedim(t, 1, 0).shape + torch.Size([2, 3, 1]) + >>> torch.movedim(t, 1, 0) + tensor([[[-0.3362], + [-0.9627], + [ 0.5173]], + + [[-0.8437], + [ 0.1727], + [-0.1398]]]) + >>> torch.movedim(t, (1, 2), (0, 1)).shape + torch.Size([2, 1, 3]) + >>> torch.movedim(t, (1, 2), (0, 1)) + tensor([[[-0.3362, -0.9627, 0.5173]], + + [[-0.8437, 0.1727, -0.1398]]]) + """ + ... +def msort(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + msort(input, *, out=None) -> Tensor + + Sorts the elements of the :attr:`input` tensor along its first dimension + in ascending order by value. + + .. note:: `torch.msort(t)` is equivalent to `torch.sort(t, dim=0)[0]`. + See also :func:`torch.sort`. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.randn(3, 4) + >>> t + tensor([[-0.1321, 0.4370, -1.2631, -1.1289], + [-2.0527, -1.1250, 0.2275, 0.3077], + [-0.0881, -0.1259, -0.5495, 1.0284]]) + >>> torch.msort(t) + tensor([[-2.0527, -1.1250, -1.2631, -1.1289], + [-0.1321, -0.1259, -0.5495, 0.3077], + [-0.0881, 0.4370, 0.2275, 1.0284]]) + """ + ... +def mul(input: Union[Tensor, Number, _complex], other: Union[Tensor, Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + mul(input, other, *, out=None) -> Tensor + + Multiplies :attr:`input` by :attr:`other`. + + + .. 
math:: + \text{out}_i = \text{input}_i \times \text{other}_i + + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer, float, and complex inputs. + + Args: + input (Tensor): the input tensor. + other (Tensor or Number) - the tensor or number to multiply input by. + + Keyword args: + out (Tensor, optional): the output tensor. + + Examples:: + + >>> a = torch.randn(3) + >>> a + tensor([ 0.2015, -0.4255, 2.6087]) + >>> torch.mul(a, 100) + tensor([ 20.1494, -42.5491, 260.8663]) + + >>> b = torch.randn(4, 1) + >>> b + tensor([[ 1.1207], + [-0.3137], + [ 0.0700], + [ 0.8378]]) + >>> c = torch.randn(1, 4) + >>> c + tensor([[ 0.5146, 0.1216, -0.5244, 2.2382]]) + >>> torch.mul(b, c) + tensor([[ 0.5767, 0.1363, -0.5877, 2.5083], + [-0.1614, -0.0382, 0.1645, -0.7021], + [ 0.0360, 0.0085, -0.0367, 0.1567], + [ 0.4312, 0.1019, -0.4394, 1.8753]]) + """ + ... +def multinomial(input: Tensor, num_samples: _int, replacement: _bool = False, *, generator: Optional[Generator] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + multinomial(input, num_samples, replacement=False, *, generator=None, out=None) -> LongTensor + + Returns a tensor where each row contains :attr:`num_samples` indices sampled + from the multinomial (a stricter definition would be multivariate, + refer to torch.distributions.multinomial.Multinomial for more details) + probability distribution located in the corresponding row + of tensor :attr:`input`. + + .. note:: + The rows of :attr:`input` do not need to sum to one (in which case we use + the values as weights), but must be non-negative, finite and have + a non-zero sum. + + Indices are ordered from left to right according to when each was sampled + (first samples are placed in first column). + + If :attr:`input` is a vector, :attr:`out` is a vector of size :attr:`num_samples`. + + If :attr:`input` is a matrix with `m` rows, :attr:`out` is an matrix of shape + :math:`(m \times \text{num\_samples})`. + + If replacement is ``True``, samples are drawn with replacement. + + If not, they are drawn without replacement, which means that when a + sample index is drawn for a row, it cannot be drawn again for that row. + + .. note:: + When drawn without replacement, :attr:`num_samples` must be lower than + number of non-zero elements in :attr:`input` (or the min number of non-zero + elements in each row of :attr:`input` if it is a matrix). + + Args: + input (Tensor): the input tensor containing probabilities + num_samples (int): number of samples to draw + replacement (bool, optional): whether to draw with replacement or not + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + + Example:: + + >>> weights = torch.tensor([0, 10, 3, 0], dtype=torch.float) # create a tensor of weights + >>> torch.multinomial(weights, 2) + tensor([1, 2]) + >>> torch.multinomial(weights, 4) # ERROR! + RuntimeError: invalid argument 2: invalid multinomial distribution (with replacement=False, + not enough non-negative category to sample) at ../aten/src/TH/generic/THTensorRandom.cpp:320 + >>> torch.multinomial(weights, 4, replacement=True) + tensor([ 2, 1, 1, 1]) + """ + ... +@overload +def multiply(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + multiply(input, other, *, out=None) + + Alias for :func:`torch.mul`. + """ + ... 
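+# Illustrative sketch (not part of the generated stub): seeding
+# torch.multinomial with an explicit torch.Generator for reproducible draws.
+# The weights and sample count are assumptions chosen only for this example,
+# and the sampled indices depend on the generator state, so no concrete
+# output is shown.
+#
+#     >>> import torch
+#     >>> weights = torch.tensor([1., 3.])            # need not sum to 1
+#     >>> g = torch.Generator().manual_seed(0)
+#     >>> torch.multinomial(weights, 5, replacement=True, generator=g)
+#     # -> LongTensor of 5 indices in {0, 1}, drawn from category 1 roughly
+#     #    three times as often as category 0; rerunning with the same seed
+#     #    reproduces the same draw.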
+@overload +def multiply(input: Tensor, other: Union[Number, _complex]) -> Tensor: + r""" + multiply(input, other, *, out=None) + + Alias for :func:`torch.mul`. + """ + ... +def mv(input: Tensor, vec: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + mv(input, vec, *, out=None) -> Tensor + + Performs a matrix-vector product of the matrix :attr:`input` and the vector + :attr:`vec`. + + If :attr:`input` is a :math:`(n \times m)` tensor, :attr:`vec` is a 1-D tensor of + size :math:`m`, :attr:`out` will be 1-D of size :math:`n`. + + .. note:: This function does not :ref:`broadcast `. + + Args: + input (Tensor): matrix to be multiplied + vec (Tensor): vector to be multiplied + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> mat = torch.randn(2, 3) + >>> vec = torch.randn(3) + >>> torch.mv(mat, vec) + tensor([ 1.0404, -0.6361]) + """ + ... +def mvlgamma(input: Tensor, p: _int, *, out: Optional[Tensor] = None) -> Tensor: + r""" + mvlgamma(input, p, *, out=None) -> Tensor + + Alias for :func:`torch.special.multigammaln`. + """ + ... +def nan_to_num(input: Tensor, nan: Optional[_float] = None, posinf: Optional[_float] = None, neginf: Optional[_float] = None, *, out: Optional[Tensor] = None) -> Tensor: + r""" + nan_to_num(input, nan=0.0, posinf=None, neginf=None, *, out=None) -> Tensor + + Replaces :literal:`NaN`, positive infinity, and negative infinity values in :attr:`input` + with the values specified by :attr:`nan`, :attr:`posinf`, and :attr:`neginf`, respectively. + By default, :literal:`NaN`\ s are replaced with zero, positive infinity is replaced with the + greatest finite value representable by :attr:`input`'s dtype, and negative infinity + is replaced with the least finite value representable by :attr:`input`'s dtype. + + Args: + input (Tensor): the input tensor. + nan (Number, optional): the value to replace :literal:`NaN`\s with. Default is zero. + posinf (Number, optional): if a Number, the value to replace positive infinity values with. + If None, positive infinity values are replaced with the greatest finite value representable by :attr:`input`'s dtype. + Default is None. + neginf (Number, optional): if a Number, the value to replace negative infinity values with. + If None, negative infinity values are replaced with the lowest finite value representable by :attr:`input`'s dtype. + Default is None. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> x = torch.tensor([float('nan'), float('inf'), -float('inf'), 3.14]) + >>> torch.nan_to_num(x) + tensor([ 0.0000e+00, 3.4028e+38, -3.4028e+38, 3.1400e+00]) + >>> torch.nan_to_num(x, nan=2.0) + tensor([ 2.0000e+00, 3.4028e+38, -3.4028e+38, 3.1400e+00]) + >>> torch.nan_to_num(x, nan=2.0, posinf=1.0) + tensor([ 2.0000e+00, 1.0000e+00, -3.4028e+38, 3.1400e+00]) + """ + ... +def nan_to_num_(input: Tensor, nan: Optional[_float] = None, posinf: Optional[_float] = None, neginf: Optional[_float] = None) -> Tensor: ... +def nanmean(input: Tensor, dim: Optional[Union[_int, _size]] = None, keepdim: _bool = False, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + nanmean(input, dim=None, keepdim=False, *, dtype=None, out=None) -> Tensor + + Computes the mean of all `non-NaN` elements along the specified dimensions. + + This function is identical to :func:`torch.mean` when there are no `NaN` values + in the :attr:`input` tensor. 
In the presence of `NaN`, :func:`torch.mean` will + propagate the `NaN` to the output whereas :func:`torch.nanmean` will ignore the + `NaN` values (`torch.nanmean(a)` is equivalent to `torch.mean(a[~a.isnan()])`). + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + out (Tensor, optional): the output tensor. + + .. seealso:: + + :func:`torch.mean` computes the mean value, propagating `NaN`. + + Example:: + + >>> x = torch.tensor([[torch.nan, 1, 2], [1, 2, 3]]) + >>> x.mean() + tensor(nan) + >>> x.nanmean() + tensor(1.8000) + >>> x.mean(dim=0) + tensor([ nan, 1.5000, 2.5000]) + >>> x.nanmean(dim=0) + tensor([1.0000, 1.5000, 2.5000]) + + # If all elements in the reduced dimensions are NaN then the result is NaN + >>> torch.tensor([torch.nan]).nanmean() + tensor(nan) + """ + ... +@overload +def nanmedian(input: Tensor) -> Tensor: + r""" + nanmedian(input) -> Tensor + + Returns the median of the values in :attr:`input`, ignoring ``NaN`` values. + + This function is identical to :func:`torch.median` when there are no ``NaN`` values in :attr:`input`. + When :attr:`input` has one or more ``NaN`` values, :func:`torch.median` will always return ``NaN``, + while this function will return the median of the non-``NaN`` elements in :attr:`input`. + If all the elements in :attr:`input` are ``NaN`` it will also return ``NaN``. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.tensor([1, float('nan'), 3, 2]) + >>> a.median() + tensor(nan) + >>> a.nanmedian() + tensor(2.) + + .. function:: nanmedian(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` contains the median of each row of :attr:`input` + in the dimension :attr:`dim`, ignoring ``NaN`` values, and ``indices`` contains the index of the median values + found in the dimension :attr:`dim`. + + This function is identical to :func:`torch.median` when there are no ``NaN`` values in a reduced row. When a reduced row has + one or more ``NaN`` values, :func:`torch.median` will always reduce it to ``NaN``, while this function will reduce it to the + median of the non-``NaN`` elements. If all the elements in a reduced row are ``NaN`` then it will be reduced to ``NaN``, too. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out ((Tensor, Tensor), optional): The first tensor will be populated with the median values and the second + tensor, which must have dtype long, with their indices in the dimension + :attr:`dim` of :attr:`input`. 
+ + Example:: + + >>> a = torch.tensor([[2, 3, 1], [float('nan'), 1, float('nan')]]) + >>> a + tensor([[2., 3., 1.], + [nan, 1., nan]]) + >>> a.median(0) + torch.return_types.median(values=tensor([nan, 1., nan]), indices=tensor([1, 1, 1])) + >>> a.nanmedian(0) + torch.return_types.nanmedian(values=tensor([2., 1., 1.]), indices=tensor([0, 1, 0])) + """ + ... +@overload +def nanmedian(input: Tensor, dim: _int, keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.nanmedian: + r""" + nanmedian(input) -> Tensor + + Returns the median of the values in :attr:`input`, ignoring ``NaN`` values. + + This function is identical to :func:`torch.median` when there are no ``NaN`` values in :attr:`input`. + When :attr:`input` has one or more ``NaN`` values, :func:`torch.median` will always return ``NaN``, + while this function will return the median of the non-``NaN`` elements in :attr:`input`. + If all the elements in :attr:`input` are ``NaN`` it will also return ``NaN``. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.tensor([1, float('nan'), 3, 2]) + >>> a.median() + tensor(nan) + >>> a.nanmedian() + tensor(2.) + + .. function:: nanmedian(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` contains the median of each row of :attr:`input` + in the dimension :attr:`dim`, ignoring ``NaN`` values, and ``indices`` contains the index of the median values + found in the dimension :attr:`dim`. + + This function is identical to :func:`torch.median` when there are no ``NaN`` values in a reduced row. When a reduced row has + one or more ``NaN`` values, :func:`torch.median` will always reduce it to ``NaN``, while this function will reduce it to the + median of the non-``NaN`` elements. If all the elements in a reduced row are ``NaN`` then it will be reduced to ``NaN``, too. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out ((Tensor, Tensor), optional): The first tensor will be populated with the median values and the second + tensor, which must have dtype long, with their indices in the dimension + :attr:`dim` of :attr:`input`. + + Example:: + + >>> a = torch.tensor([[2, 3, 1], [float('nan'), 1, float('nan')]]) + >>> a + tensor([[2., 3., 1.], + [nan, 1., nan]]) + >>> a.median(0) + torch.return_types.median(values=tensor([nan, 1., nan]), indices=tensor([1, 1, 1])) + >>> a.nanmedian(0) + torch.return_types.nanmedian(values=tensor([2., 1., 1.]), indices=tensor([0, 1, 0])) + """ + ... +@overload +def nanmedian(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.nanmedian: + r""" + nanmedian(input) -> Tensor + + Returns the median of the values in :attr:`input`, ignoring ``NaN`` values. + + This function is identical to :func:`torch.median` when there are no ``NaN`` values in :attr:`input`. + When :attr:`input` has one or more ``NaN`` values, :func:`torch.median` will always return ``NaN``, + while this function will return the median of the non-``NaN`` elements in :attr:`input`. + If all the elements in :attr:`input` are ``NaN`` it will also return ``NaN``. + + Args: + input (Tensor): the input tensor. 
+ + Example:: + + >>> a = torch.tensor([1, float('nan'), 3, 2]) + >>> a.median() + tensor(nan) + >>> a.nanmedian() + tensor(2.) + + .. function:: nanmedian(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + + Returns a namedtuple ``(values, indices)`` where ``values`` contains the median of each row of :attr:`input` + in the dimension :attr:`dim`, ignoring ``NaN`` values, and ``indices`` contains the index of the median values + found in the dimension :attr:`dim`. + + This function is identical to :func:`torch.median` when there are no ``NaN`` values in a reduced row. When a reduced row has + one or more ``NaN`` values, :func:`torch.median` will always reduce it to ``NaN``, while this function will reduce it to the + median of the non-``NaN`` elements. If all the elements in a reduced row are ``NaN`` then it will be reduced to ``NaN``, too. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + out ((Tensor, Tensor), optional): The first tensor will be populated with the median values and the second + tensor, which must have dtype long, with their indices in the dimension + :attr:`dim` of :attr:`input`. + + Example:: + + >>> a = torch.tensor([[2, 3, 1], [float('nan'), 1, float('nan')]]) + >>> a + tensor([[2., 3., 1.], + [nan, 1., nan]]) + >>> a.median(0) + torch.return_types.median(values=tensor([nan, 1., nan]), indices=tensor([1, 1, 1])) + >>> a.nanmedian(0) + torch.return_types.nanmedian(values=tensor([2., 1., 1.]), indices=tensor([0, 1, 0])) + """ + ... +@overload +def nanquantile(input: Tensor, q: Tensor, dim: Optional[_int] = None, keepdim: _bool = False, *, interpolation: str = "linear", out: Optional[Tensor] = None) -> Tensor: + r""" + nanquantile(input, q, dim=None, keepdim=False, *, interpolation='linear', out=None) -> Tensor + + This is a variant of :func:`torch.quantile` that "ignores" ``NaN`` values, + computing the quantiles :attr:`q` as if ``NaN`` values in :attr:`input` did + not exist. If all values in a reduced row are ``NaN`` then the quantiles for + that reduction will be ``NaN``. See the documentation for :func:`torch.quantile`. + + Args: + input (Tensor): the input tensor. + q (float or Tensor): a scalar or 1D tensor of quantile values in the range [0, 1] + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword arguments: + interpolation (str): interpolation method to use when the desired quantile lies between two data points. + Can be ``linear``, ``lower``, ``higher``, ``midpoint`` and ``nearest``. + Default is ``linear``. + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.tensor([float('nan'), 1, 2]) + >>> t.quantile(0.5) + tensor(nan) + >>> t.nanquantile(0.5) + tensor(1.5000) + >>> t = torch.tensor([[float('nan'), float('nan')], [1, 2]]) + >>> t + tensor([[nan, nan], + [1., 2.]]) + >>> t.nanquantile(0.5, dim=0) + tensor([1., 2.]) + >>> t.nanquantile(0.5, dim=1) + tensor([ nan, 1.5000]) + """ + ... 
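+# Illustrative sketch (not part of the generated stub): how the NaN-aware
+# reductions in this group (torch.nanmean, torch.nanquantile, and the
+# torch.nansum documented just below) relate to their plain counterparts on a
+# tensor containing a NaN. The values are assumptions chosen for this example.
+#
+#     >>> import torch
+#     >>> x = torch.tensor([1., float('nan'), 3.])
+#     >>> torch.sum(x), torch.nansum(x)               # NaN propagates vs. treated as zero
+#     (tensor(nan), tensor(4.))
+#     >>> torch.mean(x), torch.nanmean(x)             # NaN propagates vs. ignored
+#     (tensor(nan), tensor(2.))
+#     >>> torch.quantile(x, 0.5), torch.nanquantile(x, 0.5)
+#     (tensor(nan), tensor(2.))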
+@overload +def nanquantile(input: Tensor, q: _float, dim: Optional[_int] = None, keepdim: _bool = False, *, interpolation: str = "linear", out: Optional[Tensor] = None) -> Tensor: + r""" + nanquantile(input, q, dim=None, keepdim=False, *, interpolation='linear', out=None) -> Tensor + + This is a variant of :func:`torch.quantile` that "ignores" ``NaN`` values, + computing the quantiles :attr:`q` as if ``NaN`` values in :attr:`input` did + not exist. If all values in a reduced row are ``NaN`` then the quantiles for + that reduction will be ``NaN``. See the documentation for :func:`torch.quantile`. + + Args: + input (Tensor): the input tensor. + q (float or Tensor): a scalar or 1D tensor of quantile values in the range [0, 1] + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword arguments: + interpolation (str): interpolation method to use when the desired quantile lies between two data points. + Can be ``linear``, ``lower``, ``higher``, ``midpoint`` and ``nearest``. + Default is ``linear``. + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.tensor([float('nan'), 1, 2]) + >>> t.quantile(0.5) + tensor(nan) + >>> t.nanquantile(0.5) + tensor(1.5000) + >>> t = torch.tensor([[float('nan'), float('nan')], [1, 2]]) + >>> t + tensor([[nan, nan], + [1., 2.]]) + >>> t.nanquantile(0.5, dim=0) + tensor([1., 2.]) + >>> t.nanquantile(0.5, dim=1) + tensor([ nan, 1.5000]) + """ + ... +def nansum(input: Tensor, dim: Optional[Union[_int, _size]] = None, keepdim: _bool = False, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + nansum(input, *, dtype=None) -> Tensor + + Returns the sum of all elements, treating Not a Numbers (NaNs) as zero. + + Args: + input (Tensor): the input tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.tensor([1., 2., float('nan'), 4.]) + >>> torch.nansum(a) + tensor(7.) + + .. function:: nansum(input, dim, keepdim=False, *, dtype=None) -> Tensor + :noindex: + + Returns the sum of each row of the :attr:`input` tensor in the given + dimension :attr:`dim`, treating Not a Numbers (NaNs) as zero. + If :attr:`dim` is a list of dimensions, reduce over all of them. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> torch.nansum(torch.tensor([1., float("nan")])) + 1.0 + >>> a = torch.tensor([[1, 2], [3., float("nan")]]) + >>> torch.nansum(a) + tensor(6.) 
+ >>> torch.nansum(a, dim=0) + tensor([4., 2.]) + >>> torch.nansum(a, dim=1) + tensor([3., 3.]) + """ + ... +@overload +def narrow(input: Tensor, dim: _int, start: Tensor, length: Union[_int, SymInt]) -> Tensor: + r""" + narrow(input, dim, start, length) -> Tensor + + Returns a new tensor that is a narrowed version of :attr:`input` tensor. The + dimension :attr:`dim` is input from :attr:`start` to ``start + length``. The + returned tensor and :attr:`input` tensor share the same underlying storage. + + Args: + input (Tensor): the tensor to narrow + dim (int): the dimension along which to narrow + start (int or Tensor): index of the element to start the narrowed dimension + from. Can be negative, which means indexing from the end of `dim`. If + `Tensor`, it must be an 0-dim integral `Tensor` (bools not allowed) + length (int): length of the narrowed dimension, must be weakly positive + + Example:: + + >>> x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + >>> torch.narrow(x, 0, 0, 2) + tensor([[ 1, 2, 3], + [ 4, 5, 6]]) + >>> torch.narrow(x, 1, 1, 2) + tensor([[ 2, 3], + [ 5, 6], + [ 8, 9]]) + >>> torch.narrow(x, -1, torch.tensor(-1), 1) + tensor([[3], + [6], + [9]]) + """ + ... +@overload +def narrow(input: Tensor, dim: _int, start: Union[_int, SymInt], length: Union[_int, SymInt]) -> Tensor: + r""" + narrow(input, dim, start, length) -> Tensor + + Returns a new tensor that is a narrowed version of :attr:`input` tensor. The + dimension :attr:`dim` is input from :attr:`start` to ``start + length``. The + returned tensor and :attr:`input` tensor share the same underlying storage. + + Args: + input (Tensor): the tensor to narrow + dim (int): the dimension along which to narrow + start (int or Tensor): index of the element to start the narrowed dimension + from. Can be negative, which means indexing from the end of `dim`. If + `Tensor`, it must be an 0-dim integral `Tensor` (bools not allowed) + length (int): length of the narrowed dimension, must be weakly positive + + Example:: + + >>> x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + >>> torch.narrow(x, 0, 0, 2) + tensor([[ 1, 2, 3], + [ 4, 5, 6]]) + >>> torch.narrow(x, 1, 1, 2) + tensor([[ 2, 3], + [ 5, 6], + [ 8, 9]]) + >>> torch.narrow(x, -1, torch.tensor(-1), 1) + tensor([[3], + [6], + [9]]) + """ + ... +def narrow_copy(input: Tensor, dim: _int, start: Union[_int, SymInt], length: Union[_int, SymInt], *, out: Optional[Tensor] = None) -> Tensor: + r""" + narrow_copy(input, dim, start, length, *, out=None) -> Tensor + + Same as :meth:`Tensor.narrow` except this returns a copy rather + than shared storage. This is primarily for sparse tensors, which + do not have a shared-storage narrow method. + + Args: + input (Tensor): the tensor to narrow + dim (int): the dimension along which to narrow + start (int): index of the element to start the narrowed dimension from. Can + be negative, which means indexing from the end of `dim` + length (int): length of the narrowed dimension, must be weakly positive + + Keyword args: + out (Tensor, optional): the output tensor. 
+ + Example:: + + >>> x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + >>> torch.narrow_copy(x, 0, 0, 2) + tensor([[ 1, 2, 3], + [ 4, 5, 6]]) + >>> torch.narrow_copy(x, 1, 1, 2) + tensor([[ 2, 3], + [ 5, 6], + [ 8, 9]]) + >>> s = torch.arange(16).reshape(2, 2, 2, 2).to_sparse(2) + >>> torch.narrow_copy(s, 0, 0, 1) + tensor(indices=tensor([[0, 0], + [0, 1]]), + values=tensor([[[0, 1], + [2, 3]], + + [[4, 5], + [6, 7]]]), + size=(1, 2, 2, 2), nnz=2, layout=torch.sparse_coo) + + .. seealso:: + + :func:`torch.narrow` for a non copy variant + """ + ... +def native_batch_norm(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], running_mean: Optional[Tensor], running_var: Optional[Tensor], training: _bool, momentum: _float, eps: _float, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> Tuple[Tensor, Tensor, Tensor]: ... +def native_channel_shuffle(input: Tensor, groups: Union[_int, SymInt]) -> Tensor: ... +def native_dropout(input: Tensor, p: _float, train: Optional[_bool]) -> Tuple[Tensor, Tensor]: ... +def native_group_norm(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], N: Union[_int, SymInt], C: Union[_int, SymInt], HxW: Union[_int, SymInt], group: _int, eps: _float) -> Tuple[Tensor, Tensor, Tensor]: ... +def native_layer_norm(input: Tensor, normalized_shape: Sequence[Union[_int, SymInt]], weight: Optional[Tensor], bias: Optional[Tensor], eps: _float) -> Tuple[Tensor, Tensor, Tensor]: ... +@overload +def native_norm(input: Tensor, p: Optional[Union[Number, _complex]], dim: Union[_int, _size], keepdim: _bool, dtype: Optional[_dtype]) -> Tensor: ... +@overload +def native_norm(input: Tensor, p: Union[Number, _complex] = 2) -> Tensor: ... +@overload +def ne(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + ne(input, other, *, out=None) -> Tensor + + Computes :math:`\text{input} \neq \text{other}` element-wise. + + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is not equal to :attr:`other` and False elsewhere + + Example:: + + >>> torch.ne(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[False, True], [True, False]]) + """ + ... +@overload +def ne(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + ne(input, other, *, out=None) -> Tensor + + Computes :math:`\text{input} \neq \text{other}` element-wise. + + + The second argument can be a number or a tensor whose shape is + :ref:`broadcastable ` with the first argument. + + Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + A boolean tensor that is True where :attr:`input` is not equal to :attr:`other` and False elsewhere + + Example:: + + >>> torch.ne(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[False, True], [True, False]]) + """ + ... +def neg(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + neg(input, *, out=None) -> Tensor + + Returns a new tensor with the negative of the elements of :attr:`input`. + + .. math:: + \text{out} = -1 \times \text{input} + + Args: + input (Tensor): the input tensor. 
+ + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(5) + >>> a + tensor([ 0.0090, -0.2262, -0.0682, -0.2866, 0.3940]) + >>> torch.neg(a) + tensor([-0.0090, 0.2262, 0.0682, 0.2866, -0.3940]) + """ + ... +def neg_(input: Tensor) -> Tensor: ... +def negative(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + negative(input, *, out=None) -> Tensor + + Alias for :func:`torch.neg` + """ + ... +def negative_(input: Tensor) -> Tensor: ... +def nextafter(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + nextafter(input, other, *, out=None) -> Tensor + + Return the next floating-point value after :attr:`input` towards :attr:`other`, elementwise. + + The shapes of ``input`` and ``other`` must be + :ref:`broadcastable `. + + Args: + input (Tensor): the first input tensor + other (Tensor): the second input tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> eps = torch.finfo(torch.float32).eps + >>> torch.nextafter(torch.tensor([1.0, 2.0]), torch.tensor([2.0, 1.0])) == torch.tensor([eps + 1, 2 - eps]) + tensor([True, True]) + """ + ... +@overload +def nonzero(input: Tensor, *, as_tuple: Literal[False] = False, out: Optional[Tensor] = None) -> Tensor: + r""" + nonzero(input, *, out=None, as_tuple=False) -> LongTensor or tuple of LongTensors + + .. note:: + :func:`torch.nonzero(..., as_tuple=False) ` (default) returns a + 2-D tensor where each row is the index for a nonzero value. + + :func:`torch.nonzero(..., as_tuple=True) ` returns a tuple of 1-D + index tensors, allowing for advanced indexing, so ``x[x.nonzero(as_tuple=True)]`` + gives all nonzero values of tensor ``x``. Of the returned tuple, each index tensor + contains nonzero indices for a certain dimension. + + See below for more details on the two behaviors. + + When :attr:`input` is on CUDA, :func:`torch.nonzero() ` causes + host-device synchronization. + + **When** :attr:`as_tuple` **is** ``False`` **(default)**: + + Returns a tensor containing the indices of all non-zero elements of + :attr:`input`. Each row in the result contains the indices of a non-zero + element in :attr:`input`. The result is sorted lexicographically, with + the last index changing the fastest (C-style). + + If :attr:`input` has :math:`n` dimensions, then the resulting indices tensor + :attr:`out` is of size :math:`(z \times n)`, where :math:`z` is the total number of + non-zero elements in the :attr:`input` tensor. + + **When** :attr:`as_tuple` **is** ``True``: + + Returns a tuple of 1-D tensors, one for each dimension in :attr:`input`, + each containing the indices (in that dimension) of all non-zero elements of + :attr:`input` . + + If :attr:`input` has :math:`n` dimensions, then the resulting tuple contains :math:`n` + tensors of size :math:`z`, where :math:`z` is the total number of + non-zero elements in the :attr:`input` tensor. + + As a special case, when :attr:`input` has zero dimensions and a nonzero scalar + value, it is treated as a one-dimensional tensor with one element. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (LongTensor, optional): the output tensor containing indices + + Returns: + LongTensor or tuple of LongTensor: If :attr:`as_tuple` is ``False``, the output + tensor containing indices. If :attr:`as_tuple` is ``True``, one 1-D tensor for + each dimension, containing the indices of each nonzero element along that + dimension. 
+ + Example:: + + >>> torch.nonzero(torch.tensor([1, 1, 1, 0, 1])) + tensor([[ 0], + [ 1], + [ 2], + [ 4]]) + >>> torch.nonzero(torch.tensor([[0.6, 0.0, 0.0, 0.0], + ... [0.0, 0.4, 0.0, 0.0], + ... [0.0, 0.0, 1.2, 0.0], + ... [0.0, 0.0, 0.0,-0.4]])) + tensor([[ 0, 0], + [ 1, 1], + [ 2, 2], + [ 3, 3]]) + >>> torch.nonzero(torch.tensor([1, 1, 1, 0, 1]), as_tuple=True) + (tensor([0, 1, 2, 4]),) + >>> torch.nonzero(torch.tensor([[0.6, 0.0, 0.0, 0.0], + ... [0.0, 0.4, 0.0, 0.0], + ... [0.0, 0.0, 1.2, 0.0], + ... [0.0, 0.0, 0.0,-0.4]]), as_tuple=True) + (tensor([0, 1, 2, 3]), tensor([0, 1, 2, 3])) + >>> torch.nonzero(torch.tensor(5), as_tuple=True) + (tensor([0]),) + """ + ... +@overload +def nonzero(input: Tensor, *, as_tuple: Literal[True]) -> Tuple[Tensor, ...]: + r""" + nonzero(input, *, out=None, as_tuple=False) -> LongTensor or tuple of LongTensors + + .. note:: + :func:`torch.nonzero(..., as_tuple=False) ` (default) returns a + 2-D tensor where each row is the index for a nonzero value. + + :func:`torch.nonzero(..., as_tuple=True) ` returns a tuple of 1-D + index tensors, allowing for advanced indexing, so ``x[x.nonzero(as_tuple=True)]`` + gives all nonzero values of tensor ``x``. Of the returned tuple, each index tensor + contains nonzero indices for a certain dimension. + + See below for more details on the two behaviors. + + When :attr:`input` is on CUDA, :func:`torch.nonzero() ` causes + host-device synchronization. + + **When** :attr:`as_tuple` **is** ``False`` **(default)**: + + Returns a tensor containing the indices of all non-zero elements of + :attr:`input`. Each row in the result contains the indices of a non-zero + element in :attr:`input`. The result is sorted lexicographically, with + the last index changing the fastest (C-style). + + If :attr:`input` has :math:`n` dimensions, then the resulting indices tensor + :attr:`out` is of size :math:`(z \times n)`, where :math:`z` is the total number of + non-zero elements in the :attr:`input` tensor. + + **When** :attr:`as_tuple` **is** ``True``: + + Returns a tuple of 1-D tensors, one for each dimension in :attr:`input`, + each containing the indices (in that dimension) of all non-zero elements of + :attr:`input` . + + If :attr:`input` has :math:`n` dimensions, then the resulting tuple contains :math:`n` + tensors of size :math:`z`, where :math:`z` is the total number of + non-zero elements in the :attr:`input` tensor. + + As a special case, when :attr:`input` has zero dimensions and a nonzero scalar + value, it is treated as a one-dimensional tensor with one element. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (LongTensor, optional): the output tensor containing indices + + Returns: + LongTensor or tuple of LongTensor: If :attr:`as_tuple` is ``False``, the output + tensor containing indices. If :attr:`as_tuple` is ``True``, one 1-D tensor for + each dimension, containing the indices of each nonzero element along that + dimension. + + Example:: + + >>> torch.nonzero(torch.tensor([1, 1, 1, 0, 1])) + tensor([[ 0], + [ 1], + [ 2], + [ 4]]) + >>> torch.nonzero(torch.tensor([[0.6, 0.0, 0.0, 0.0], + ... [0.0, 0.4, 0.0, 0.0], + ... [0.0, 0.0, 1.2, 0.0], + ... [0.0, 0.0, 0.0,-0.4]])) + tensor([[ 0, 0], + [ 1, 1], + [ 2, 2], + [ 3, 3]]) + >>> torch.nonzero(torch.tensor([1, 1, 1, 0, 1]), as_tuple=True) + (tensor([0, 1, 2, 4]),) + >>> torch.nonzero(torch.tensor([[0.6, 0.0, 0.0, 0.0], + ... [0.0, 0.4, 0.0, 0.0], + ... [0.0, 0.0, 1.2, 0.0], + ... 
[0.0, 0.0, 0.0,-0.4]]), as_tuple=True) + (tensor([0, 1, 2, 3]), tensor([0, 1, 2, 3])) + >>> torch.nonzero(torch.tensor(5), as_tuple=True) + (tensor([0]),) + """ + ... +def nonzero_static(input: Tensor, *, size: _int, fill_value: _int = -1, out: Optional[Tensor] = None) -> Tensor: ... +def norm_except_dim(v: Tensor, pow: _int = 2, dim: _int = 0) -> Tensor: ... +@overload +def normal(mean: Tensor, std: Tensor, *, generator: Optional[Generator] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + normal(mean, std, *, generator=None, out=None) -> Tensor + + Returns a tensor of random numbers drawn from separate normal distributions + whose mean and standard deviation are given. + + The :attr:`mean` is a tensor with the mean of + each output element's normal distribution + + The :attr:`std` is a tensor with the standard deviation of + each output element's normal distribution + + The shapes of :attr:`mean` and :attr:`std` don't need to match, but the + total number of elements in each tensor need to be the same. + + .. note:: When the shapes do not match, the shape of :attr:`mean` + is used as the shape for the returned output tensor + + .. note:: When :attr:`std` is a CUDA tensor, this function synchronizes + its device with the CPU. + + Args: + mean (Tensor): the tensor of per-element means + std (Tensor): the tensor of per-element standard deviations + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(mean=torch.arange(1., 11.), std=torch.arange(1, 0, -0.1)) + tensor([ 1.0425, 3.5672, 2.7969, 4.2925, 4.7229, 6.2134, + 8.0505, 8.1408, 9.0563, 10.0566]) + + .. function:: normal(mean=0.0, std, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the means are shared among all drawn + elements. + + Args: + mean (float, optional): the mean for all distributions + std (Tensor): the tensor of per-element standard deviations + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(mean=0.5, std=torch.arange(1., 6.)) + tensor([-1.2793, -1.0732, -2.0687, 5.1177, -1.2303]) + + .. function:: normal(mean, std=1.0, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the standard deviations are shared among + all drawn elements. + + Args: + mean (Tensor): the tensor of per-element means + std (float, optional): the standard deviation for all distributions + + Keyword args: + out (Tensor, optional): the output tensor + + Example:: + + >>> torch.normal(mean=torch.arange(1., 6.)) + tensor([ 1.1552, 2.6148, 2.6535, 5.8318, 4.2361]) + + .. function:: normal(mean, std, size, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the means and standard deviations are shared + among all drawn elements. The resulting tensor has size given by :attr:`size`. + + Args: + mean (float): the mean for all distributions + std (float): the standard deviation for all distributions + size (int...): a sequence of integers defining the shape of the output tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(2, 3, size=(1, 4)) + tensor([[-1.3987, -1.9544, 3.6048, 0.7909]]) + """ + ... 
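+# A minimal usage sketch for the tensor/tensor overload above: per-element
+# sampling where the output takes the shape of ``mean``, seeded through a
+# ``torch.Generator`` for reproducibility.
+# >>> mean = torch.zeros(2, 3)
+# >>> std = torch.full((2, 3), 0.1)
+# >>> g = torch.Generator().manual_seed(0)
+# >>> torch.normal(mean, std, generator=g).shape
+# torch.Size([2, 3])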
+@overload +def normal(mean: Tensor, std: _float = 1, *, generator: Optional[Generator] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + normal(mean, std, *, generator=None, out=None) -> Tensor + + Returns a tensor of random numbers drawn from separate normal distributions + whose mean and standard deviation are given. + + The :attr:`mean` is a tensor with the mean of + each output element's normal distribution + + The :attr:`std` is a tensor with the standard deviation of + each output element's normal distribution + + The shapes of :attr:`mean` and :attr:`std` don't need to match, but the + total number of elements in each tensor need to be the same. + + .. note:: When the shapes do not match, the shape of :attr:`mean` + is used as the shape for the returned output tensor + + .. note:: When :attr:`std` is a CUDA tensor, this function synchronizes + its device with the CPU. + + Args: + mean (Tensor): the tensor of per-element means + std (Tensor): the tensor of per-element standard deviations + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(mean=torch.arange(1., 11.), std=torch.arange(1, 0, -0.1)) + tensor([ 1.0425, 3.5672, 2.7969, 4.2925, 4.7229, 6.2134, + 8.0505, 8.1408, 9.0563, 10.0566]) + + .. function:: normal(mean=0.0, std, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the means are shared among all drawn + elements. + + Args: + mean (float, optional): the mean for all distributions + std (Tensor): the tensor of per-element standard deviations + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(mean=0.5, std=torch.arange(1., 6.)) + tensor([-1.2793, -1.0732, -2.0687, 5.1177, -1.2303]) + + .. function:: normal(mean, std=1.0, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the standard deviations are shared among + all drawn elements. + + Args: + mean (Tensor): the tensor of per-element means + std (float, optional): the standard deviation for all distributions + + Keyword args: + out (Tensor, optional): the output tensor + + Example:: + + >>> torch.normal(mean=torch.arange(1., 6.)) + tensor([ 1.1552, 2.6148, 2.6535, 5.8318, 4.2361]) + + .. function:: normal(mean, std, size, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the means and standard deviations are shared + among all drawn elements. The resulting tensor has size given by :attr:`size`. + + Args: + mean (float): the mean for all distributions + std (float): the standard deviation for all distributions + size (int...): a sequence of integers defining the shape of the output tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(2, 3, size=(1, 4)) + tensor([[-1.3987, -1.9544, 3.6048, 0.7909]]) + """ + ... +@overload +def normal(mean: _float, std: Tensor, *, generator: Optional[Generator] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + normal(mean, std, *, generator=None, out=None) -> Tensor + + Returns a tensor of random numbers drawn from separate normal distributions + whose mean and standard deviation are given. 
+ + The :attr:`mean` is a tensor with the mean of + each output element's normal distribution + + The :attr:`std` is a tensor with the standard deviation of + each output element's normal distribution + + The shapes of :attr:`mean` and :attr:`std` don't need to match, but the + total number of elements in each tensor need to be the same. + + .. note:: When the shapes do not match, the shape of :attr:`mean` + is used as the shape for the returned output tensor + + .. note:: When :attr:`std` is a CUDA tensor, this function synchronizes + its device with the CPU. + + Args: + mean (Tensor): the tensor of per-element means + std (Tensor): the tensor of per-element standard deviations + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(mean=torch.arange(1., 11.), std=torch.arange(1, 0, -0.1)) + tensor([ 1.0425, 3.5672, 2.7969, 4.2925, 4.7229, 6.2134, + 8.0505, 8.1408, 9.0563, 10.0566]) + + .. function:: normal(mean=0.0, std, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the means are shared among all drawn + elements. + + Args: + mean (float, optional): the mean for all distributions + std (Tensor): the tensor of per-element standard deviations + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(mean=0.5, std=torch.arange(1., 6.)) + tensor([-1.2793, -1.0732, -2.0687, 5.1177, -1.2303]) + + .. function:: normal(mean, std=1.0, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the standard deviations are shared among + all drawn elements. + + Args: + mean (Tensor): the tensor of per-element means + std (float, optional): the standard deviation for all distributions + + Keyword args: + out (Tensor, optional): the output tensor + + Example:: + + >>> torch.normal(mean=torch.arange(1., 6.)) + tensor([ 1.1552, 2.6148, 2.6535, 5.8318, 4.2361]) + + .. function:: normal(mean, std, size, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the means and standard deviations are shared + among all drawn elements. The resulting tensor has size given by :attr:`size`. + + Args: + mean (float): the mean for all distributions + std (float): the standard deviation for all distributions + size (int...): a sequence of integers defining the shape of the output tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(2, 3, size=(1, 4)) + tensor([[-1.3987, -1.9544, 3.6048, 0.7909]]) + """ + ... +@overload +def normal(mean: _float, std: _float, size: Sequence[Union[_int, SymInt]], *, generator: Optional[Generator] = None, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + normal(mean, std, *, generator=None, out=None) -> Tensor + + Returns a tensor of random numbers drawn from separate normal distributions + whose mean and standard deviation are given. + + The :attr:`mean` is a tensor with the mean of + each output element's normal distribution + + The :attr:`std` is a tensor with the standard deviation of + each output element's normal distribution + + The shapes of :attr:`mean` and :attr:`std` don't need to match, but the + total number of elements in each tensor need to be the same. + + .. 
note:: When the shapes do not match, the shape of :attr:`mean` + is used as the shape for the returned output tensor + + .. note:: When :attr:`std` is a CUDA tensor, this function synchronizes + its device with the CPU. + + Args: + mean (Tensor): the tensor of per-element means + std (Tensor): the tensor of per-element standard deviations + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(mean=torch.arange(1., 11.), std=torch.arange(1, 0, -0.1)) + tensor([ 1.0425, 3.5672, 2.7969, 4.2925, 4.7229, 6.2134, + 8.0505, 8.1408, 9.0563, 10.0566]) + + .. function:: normal(mean=0.0, std, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the means are shared among all drawn + elements. + + Args: + mean (float, optional): the mean for all distributions + std (Tensor): the tensor of per-element standard deviations + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(mean=0.5, std=torch.arange(1., 6.)) + tensor([-1.2793, -1.0732, -2.0687, 5.1177, -1.2303]) + + .. function:: normal(mean, std=1.0, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the standard deviations are shared among + all drawn elements. + + Args: + mean (Tensor): the tensor of per-element means + std (float, optional): the standard deviation for all distributions + + Keyword args: + out (Tensor, optional): the output tensor + + Example:: + + >>> torch.normal(mean=torch.arange(1., 6.)) + tensor([ 1.1552, 2.6148, 2.6535, 5.8318, 4.2361]) + + .. function:: normal(mean, std, size, *, out=None) -> Tensor + :noindex: + + Similar to the function above, but the means and standard deviations are shared + among all drawn elements. The resulting tensor has size given by :attr:`size`. + + Args: + mean (float): the mean for all distributions + std (float): the standard deviation for all distributions + size (int...): a sequence of integers defining the shape of the output tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.normal(2, 3, size=(1, 4)) + tensor([[-1.3987, -1.9544, 3.6048, 0.7909]]) + """ + ... +@overload +def not_equal(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + not_equal(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.ne`. + """ + ... +@overload +def not_equal(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + not_equal(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.ne`. + """ + ... +@overload +def nuclear_norm(input: Tensor, dim: Union[_int, _size], keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: ... +@overload +def nuclear_norm(input: Tensor, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: ... +def numel(self: Tensor) -> _int: + r""" + numel(input) -> int + + Returns the total number of elements in the :attr:`input` tensor. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> a = torch.randn(1, 2, 3, 4, 5) + >>> torch.numel(a) + 120 + >>> a = torch.zeros(4,4) + >>> torch.numel(a) + 16 + """ + ... 
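+# A minimal sketch of :func:`torch.numel`: the count equals the product of the
+# sizes, and a 0-dim tensor still holds exactly one element.
+# >>> t = torch.zeros(3, 4, 5)
+# >>> torch.numel(t) == 3 * 4 * 5
+# True
+# >>> torch.numel(torch.tensor(7.))
+# 1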
+@overload +def ones(size: Sequence[Union[_int, SymInt]], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + ones(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with the scalar value `1`, with the shape defined + by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.ones(2, 3) + tensor([[ 1., 1., 1.], + [ 1., 1., 1.]]) + + >>> torch.ones(5) + tensor([ 1., 1., 1., 1., 1.]) + """ + ... +@overload +def ones(*size: _int, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + ones(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with the scalar value `1`, with the shape defined + by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.ones(2, 3) + tensor([[ 1., 1., 1.], + [ 1., 1., 1.]]) + + >>> torch.ones(5) + tensor([ 1., 1., 1., 1., 1.]) + """ + ... 
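+# A minimal sketch of the two overloads above: ``size`` may be given either as
+# separate integer arguments or as a single sequence, with identical results.
+# >>> torch.ones(2, 3).shape == torch.ones((2, 3)).shape == torch.ones([2, 3]).shape
+# True
+# >>> torch.ones(4, dtype=torch.int64)
+# tensor([1, 1, 1, 1])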
+@overload +def ones(size: _size, *, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + ones(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with the scalar value `1`, with the shape defined + by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.ones(2, 3) + tensor([[ 1., 1., 1.], + [ 1., 1., 1.]]) + + >>> torch.ones(5) + tensor([ 1., 1., 1., 1., 1.]) + """ + ... +@overload +def ones(*size: _int, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + ones(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with the scalar value `1`, with the shape defined + by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword arguments: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.ones(2, 3) + tensor([[ 1., 1., 1.], + [ 1., 1., 1.]]) + + >>> torch.ones(5) + tensor([ 1., 1., 1., 1., 1.]) + """ + ... 
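+# A minimal sketch of the ``names`` overloads above (named tensors are a
+# prototype feature, so exact behaviour may change between releases).
+# >>> torch.ones(2, 3, names=('N', 'C')).names
+# ('N', 'C')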
+def ones_like(input: Tensor, *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + ones_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor + + Returns a tensor filled with the scalar value `1`, with the same size as + :attr:`input`. ``torch.ones_like(input)`` is equivalent to + ``torch.ones(input.size(), dtype=input.dtype, layout=input.layout, device=input.device)``. + + .. warning:: + As of 0.4, this function does not support an :attr:`out` keyword. As an alternative, + the old ``torch.ones_like(input, out=output)`` is equivalent to + ``torch.ones(input.size(), out=output)``. + + Args: + input (Tensor): the size of :attr:`input` will determine size of the output tensor. + + Keyword arguments: + dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor. + Default: if ``None``, defaults to the dtype of :attr:`input`. + layout (:class:`torch.layout`, optional): the desired layout of returned tensor. + Default: if ``None``, defaults to the layout of :attr:`input`. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, defaults to the device of :attr:`input`. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + + Example:: + + >>> input = torch.empty(2, 3) + >>> torch.ones_like(input) + tensor([[ 1., 1., 1.], + [ 1., 1., 1.]]) + """ + ... +def orgqr(input: Tensor, input2: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + orgqr(input, tau) -> Tensor + + Alias for :func:`torch.linalg.householder_product`. + """ + ... +def ormqr(input: Tensor, input2: Tensor, input3: Tensor, left: _bool = True, transpose: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + ormqr(input, tau, other, left=True, transpose=False, *, out=None) -> Tensor + + Computes the matrix-matrix multiplication of a product of Householder matrices with a general matrix. + + Multiplies a :math:`m \times n` matrix `C` (given by :attr:`other`) with a matrix `Q`, + where `Q` is represented using Householder reflectors `(input, tau)`. + See `Representation of Orthogonal or Unitary Matrices`_ for further details. + + If :attr:`left` is `True` then `op(Q)` times `C` is computed, otherwise the result is `C` times `op(Q)`. + When :attr:`left` is `True`, the implicit matrix `Q` has size :math:`m \times m`. + It has size :math:`n \times n` otherwise. + If :attr:`transpose` is `True` then `op` is the conjugate transpose operation, otherwise it's a no-op. + + Supports inputs of float, double, cfloat and cdouble dtypes. + Also supports batched inputs, and, if the input is batched, the output is batched with the same dimensions. + + .. seealso:: + :func:`torch.geqrf` can be used to form the Householder representation `(input, tau)` of matrix `Q` + from the QR decomposition. + + .. note:: + This function supports backward but it is only fast when ``(input, tau)`` do not require gradients + and/or ``tau.size(-1)`` is very small. 
+ `` + + Args: + input (Tensor): tensor of shape `(*, mn, k)` where `*` is zero or more batch dimensions + and `mn` equals to `m` or `n` depending on the :attr:`left`. + tau (Tensor): tensor of shape `(*, min(mn, k))` where `*` is zero or more batch dimensions. + other (Tensor): tensor of shape `(*, m, n)` where `*` is zero or more batch dimensions. + left (bool): controls the order of multiplication. + transpose (bool): controls whether the matrix `Q` is conjugate transposed or not. + + Keyword args: + out (Tensor, optional): the output Tensor. Ignored if `None`. Default: `None`. + + .. _Representation of Orthogonal or Unitary Matrices: + https://www.netlib.org/lapack/lug/node128.html + """ + ... +def outer(input: Tensor, vec2: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + outer(input, vec2, *, out=None) -> Tensor + + Outer product of :attr:`input` and :attr:`vec2`. + If :attr:`input` is a vector of size :math:`n` and :attr:`vec2` is a vector of + size :math:`m`, then :attr:`out` must be a matrix of size :math:`(n \times m)`. + + .. note:: This function does not :ref:`broadcast `. + + Args: + input (Tensor): 1-D input vector + vec2 (Tensor): 1-D input vector + + Keyword args: + out (Tensor, optional): optional output matrix + + Example:: + + >>> v1 = torch.arange(1., 5.) + >>> v2 = torch.arange(1., 4.) + >>> torch.outer(v1, v2) + tensor([[ 1., 2., 3.], + [ 2., 4., 6.], + [ 3., 6., 9.], + [ 4., 8., 12.]]) + """ + ... +def pairwise_distance(x1: Tensor, x2: Tensor, p: _float = 2, eps: _float = 1e-06, keepdim: _bool = False) -> Tensor: ... +def pdist(input: Tensor, p: _float = 2) -> Tensor: ... +def permute(input: Tensor, dims: _size) -> Tensor: + r""" + permute(input, dims) -> Tensor + + Returns a view of the original tensor :attr:`input` with its dimensions permuted. + + Args: + input (Tensor): the input tensor. + dims (tuple of int): The desired ordering of dimensions + + Example: + >>> x = torch.randn(2, 3, 5) + >>> x.size() + torch.Size([2, 3, 5]) + >>> torch.permute(x, (2, 0, 1)).size() + torch.Size([5, 2, 3]) + """ + ... +def permute_copy(input: Tensor, dims: _size, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.permute`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def pinverse(input: Tensor, rcond: _float = 1e-15) -> Tensor: + r""" + pinverse(input, rcond=1e-15) -> Tensor + + Alias for :func:`torch.linalg.pinv` + """ + ... +def pixel_shuffle(input: Tensor, upscale_factor: _int) -> Tensor: ... +def pixel_unshuffle(input: Tensor, downscale_factor: _int) -> Tensor: ... +def poisson(input: Tensor, generator: Optional[Generator] = None) -> Tensor: + r""" + poisson(input, generator=None) -> Tensor + + Returns a tensor of the same size as :attr:`input` with each element + sampled from a Poisson distribution with rate parameter given by the corresponding + element in :attr:`input` i.e., + + .. math:: + \text{out}_i \sim \text{Poisson}(\text{input}_i) + + :attr:`input` must be non-negative. + + Args: + input (Tensor): the input tensor containing the rates of the Poisson distribution + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + + Example:: + + >>> rates = torch.rand(4, 4) * 5 # rate parameter between 0 and 5 + >>> torch.poisson(rates) + tensor([[9., 1., 3., 5.], + [8., 6., 6., 0.], + [0., 4., 5., 3.], + [2., 1., 4., 2.]]) + """ + ... 
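+# A minimal sketch of :func:`torch.poisson`: counts are sampled per element and
+# returned in the floating dtype of the rate tensor; a seeded generator makes
+# the draw reproducible.
+# >>> rates = torch.full((2, 2), 4.0)
+# >>> torch.poisson(rates, generator=torch.Generator().manual_seed(0)).dtype
+# torch.float32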
+def poisson_nll_loss(input: Tensor, target: Tensor, log_input: _bool, full: _bool, eps: _float, reduction: _int) -> Tensor: ... +def polar(abs: Tensor, angle: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + polar(abs, angle, *, out=None) -> Tensor + + Constructs a complex tensor whose elements are Cartesian coordinates + corresponding to the polar coordinates with absolute value :attr:`abs` and angle + :attr:`angle`. + + .. math:: + \text{out} = \text{abs} \cdot \cos(\text{angle}) + \text{abs} \cdot \sin(\text{angle}) \cdot j + + .. note:: + `torch.polar` is similar to + `std::polar `_ + and does not compute the polar decomposition + of a complex tensor like Python's `cmath.polar` and SciPy's `linalg.polar` do. + The behavior of this function is undefined if `abs` is negative or NaN, or if `angle` is + infinite. + + + Args: + abs (Tensor): The absolute value the complex tensor. Must be float or double. + angle (Tensor): The angle of the complex tensor. Must be same dtype as + :attr:`abs`. + + Keyword args: + out (Tensor): If the inputs are ``torch.float32``, must be + ``torch.complex64``. If the inputs are ``torch.float64``, must be + ``torch.complex128``. + + Example:: + + >>> import numpy as np + >>> abs = torch.tensor([1, 2], dtype=torch.float64) + >>> angle = torch.tensor([np.pi / 2, 5 * np.pi / 4], dtype=torch.float64) + >>> z = torch.polar(abs, angle) + >>> z + tensor([(0.0000+1.0000j), (-1.4142-1.4142j)], dtype=torch.complex128) + """ + ... +def polygamma(n: _int, input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + polygamma(n, input, *, out=None) -> Tensor + + Alias for :func:`torch.special.polygamma`. + """ + ... +def positive(input: Tensor) -> Tensor: + r""" + positive(input) -> Tensor + + Returns :attr:`input`. + Throws a runtime error if :attr:`input` is a bool tensor. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> t = torch.randn(5) + >>> t + tensor([ 0.0090, -0.2262, -0.0682, -0.2866, 0.3940]) + >>> torch.positive(t) + tensor([ 0.0090, -0.2262, -0.0682, -0.2866, 0.3940]) + """ + ... +@overload +def pow(input: Tensor, exponent: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + pow(input, exponent, *, out=None) -> Tensor + + Takes the power of each element in :attr:`input` with :attr:`exponent` and + returns a tensor with the result. + + :attr:`exponent` can be either a single ``float`` number or a `Tensor` + with the same number of elements as :attr:`input`. + + When :attr:`exponent` is a scalar value, the operation applied is: + + .. math:: + \text{out}_i = x_i ^ \text{exponent} + + When :attr:`exponent` is a tensor, the operation applied is: + + .. math:: + \text{out}_i = x_i ^ {\text{exponent}_i} + + When :attr:`exponent` is a tensor, the shapes of :attr:`input` + and :attr:`exponent` must be :ref:`broadcastable `. + + Args: + input (Tensor): the input tensor. + exponent (float or tensor): the exponent value + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.4331, 1.2475, 0.6834, -0.2791]) + >>> torch.pow(a, 2) + tensor([ 0.1875, 1.5561, 0.4670, 0.0779]) + >>> exp = torch.arange(1., 5.) + + >>> a = torch.arange(1., 5.) + >>> a + tensor([ 1., 2., 3., 4.]) + >>> exp + tensor([ 1., 2., 3., 4.]) + >>> torch.pow(a, exp) + tensor([ 1., 4., 27., 256.]) + + .. function:: pow(self, exponent, *, out=None) -> Tensor + :noindex: + + :attr:`self` is a scalar ``float`` value, and :attr:`exponent` is a tensor. 
+ The returned tensor :attr:`out` is of the same shape as :attr:`exponent` + + The operation applied is: + + .. math:: + \text{out}_i = \text{self} ^ {\text{exponent}_i} + + Args: + self (float): the scalar base value for the power operation + exponent (Tensor): the exponent tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> exp = torch.arange(1., 5.) + >>> base = 2 + >>> torch.pow(base, exp) + tensor([ 2., 4., 8., 16.]) + """ + ... +@overload +def pow(self: Union[Number, _complex], exponent: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + pow(input, exponent, *, out=None) -> Tensor + + Takes the power of each element in :attr:`input` with :attr:`exponent` and + returns a tensor with the result. + + :attr:`exponent` can be either a single ``float`` number or a `Tensor` + with the same number of elements as :attr:`input`. + + When :attr:`exponent` is a scalar value, the operation applied is: + + .. math:: + \text{out}_i = x_i ^ \text{exponent} + + When :attr:`exponent` is a tensor, the operation applied is: + + .. math:: + \text{out}_i = x_i ^ {\text{exponent}_i} + + When :attr:`exponent` is a tensor, the shapes of :attr:`input` + and :attr:`exponent` must be :ref:`broadcastable `. + + Args: + input (Tensor): the input tensor. + exponent (float or tensor): the exponent value + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.4331, 1.2475, 0.6834, -0.2791]) + >>> torch.pow(a, 2) + tensor([ 0.1875, 1.5561, 0.4670, 0.0779]) + >>> exp = torch.arange(1., 5.) + + >>> a = torch.arange(1., 5.) + >>> a + tensor([ 1., 2., 3., 4.]) + >>> exp + tensor([ 1., 2., 3., 4.]) + >>> torch.pow(a, exp) + tensor([ 1., 4., 27., 256.]) + + .. function:: pow(self, exponent, *, out=None) -> Tensor + :noindex: + + :attr:`self` is a scalar ``float`` value, and :attr:`exponent` is a tensor. + The returned tensor :attr:`out` is of the same shape as :attr:`exponent` + + The operation applied is: + + .. math:: + \text{out}_i = \text{self} ^ {\text{exponent}_i} + + Args: + self (float): the scalar base value for the power operation + exponent (Tensor): the exponent tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> exp = torch.arange(1., 5.) + >>> base = 2 + >>> torch.pow(base, exp) + tensor([ 2., 4., 8., 16.]) + """ + ... +@overload +def pow(input: Tensor, exponent: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + pow(input, exponent, *, out=None) -> Tensor + + Takes the power of each element in :attr:`input` with :attr:`exponent` and + returns a tensor with the result. + + :attr:`exponent` can be either a single ``float`` number or a `Tensor` + with the same number of elements as :attr:`input`. + + When :attr:`exponent` is a scalar value, the operation applied is: + + .. math:: + \text{out}_i = x_i ^ \text{exponent} + + When :attr:`exponent` is a tensor, the operation applied is: + + .. math:: + \text{out}_i = x_i ^ {\text{exponent}_i} + + When :attr:`exponent` is a tensor, the shapes of :attr:`input` + and :attr:`exponent` must be :ref:`broadcastable `. + + Args: + input (Tensor): the input tensor. + exponent (float or tensor): the exponent value + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.4331, 1.2475, 0.6834, -0.2791]) + >>> torch.pow(a, 2) + tensor([ 0.1875, 1.5561, 0.4670, 0.0779]) + >>> exp = torch.arange(1., 5.) 
+ + >>> a = torch.arange(1., 5.) + >>> a + tensor([ 1., 2., 3., 4.]) + >>> exp + tensor([ 1., 2., 3., 4.]) + >>> torch.pow(a, exp) + tensor([ 1., 4., 27., 256.]) + + .. function:: pow(self, exponent, *, out=None) -> Tensor + :noindex: + + :attr:`self` is a scalar ``float`` value, and :attr:`exponent` is a tensor. + The returned tensor :attr:`out` is of the same shape as :attr:`exponent` + + The operation applied is: + + .. math:: + \text{out}_i = \text{self} ^ {\text{exponent}_i} + + Args: + self (float): the scalar base value for the power operation + exponent (Tensor): the exponent tensor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> exp = torch.arange(1., 5.) + >>> base = 2 + >>> torch.pow(base, exp) + tensor([ 2., 4., 8., 16.]) + """ + ... +def prelu(input: Tensor, weight: Tensor) -> Tensor: ... +@overload +def prod(input: Tensor, *, dtype: Optional[_dtype] = None) -> Tensor: + r""" + prod(input, *, dtype=None) -> Tensor + + Returns the product of all elements in the :attr:`input` tensor. + + Args: + input (Tensor): the input tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[-0.8020, 0.5428, -1.5854]]) + >>> torch.prod(a) + tensor(0.6902) + + .. function:: prod(input, dim, keepdim=False, *, dtype=None) -> Tensor + :noindex: + + Returns the product of each row of the :attr:`input` tensor in the given + dimension :attr:`dim`. + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in + the output tensor having 1 fewer dimension than :attr:`input`. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(4, 2) + >>> a + tensor([[ 0.5261, -0.3837], + [ 1.1857, -0.2498], + [-1.1646, 0.0705], + [ 1.1131, -1.0629]]) + >>> torch.prod(a, 1) + tensor([-0.2018, -0.2962, -0.0821, -1.1831]) + """ + ... +@overload +def prod(input: Tensor, dim: _int, keepdim: _bool = False, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + prod(input, *, dtype=None) -> Tensor + + Returns the product of all elements in the :attr:`input` tensor. + + Args: + input (Tensor): the input tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[-0.8020, 0.5428, -1.5854]]) + >>> torch.prod(a) + tensor(0.6902) + + .. function:: prod(input, dim, keepdim=False, *, dtype=None) -> Tensor + :noindex: + + Returns the product of each row of the :attr:`input` tensor in the given + dimension :attr:`dim`. 
+ + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in + the output tensor having 1 fewer dimension than :attr:`input`. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(4, 2) + >>> a + tensor([[ 0.5261, -0.3837], + [ 1.1857, -0.2498], + [-1.1646, 0.0705], + [ 1.1131, -1.0629]]) + >>> torch.prod(a, 1) + tensor([-0.2018, -0.2962, -0.0821, -1.1831]) + """ + ... +@overload +def prod(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool = False, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + prod(input, *, dtype=None) -> Tensor + + Returns the product of all elements in the :attr:`input` tensor. + + Args: + input (Tensor): the input tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[-0.8020, 0.5428, -1.5854]]) + >>> torch.prod(a) + tensor(0.6902) + + .. function:: prod(input, dim, keepdim=False, *, dtype=None) -> Tensor + :noindex: + + Returns the product of each row of the :attr:`input` tensor in the given + dimension :attr:`dim`. + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in + the output tensor having 1 fewer dimension than :attr:`input`. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(4, 2) + >>> a + tensor([[ 0.5261, -0.3837], + [ 1.1857, -0.2498], + [-1.1646, 0.0705], + [ 1.1131, -1.0629]]) + >>> torch.prod(a, 1) + tensor([-0.2018, -0.2962, -0.0821, -1.1831]) + """ + ... +def promote_types(type1: _dtype, type2: _dtype) -> _dtype: + r""" + promote_types(type1, type2) -> dtype + + Returns the :class:`torch.dtype` with the smallest size and scalar kind that is + not smaller nor of lower kind than either `type1` or `type2`. See type promotion + :ref:`documentation ` for more information on the type + promotion logic. + + Args: + type1 (:class:`torch.dtype`) + type2 (:class:`torch.dtype`) + + Example:: + + >>> torch.promote_types(torch.int32, torch.float32) + torch.float32 + >>> torch.promote_types(torch.uint8, torch.long) + torch.long + """ + ... +def put(input: Tensor, index: Tensor, source: Tensor, accumulate: _bool = False) -> Tensor: ... +def q_per_channel_axis(input: Tensor) -> _int: ... 
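+# A minimal sketch of :func:`torch.q_per_channel_axis`, assuming the function
+# variant declared above mirrors the ``Tensor.q_per_channel_axis`` method: it
+# reports which dimension was quantized per channel.
+# >>> w = torch.randn(2, 3)
+# >>> wq = torch.quantize_per_channel(w, torch.tensor([0.1, 0.05]), torch.tensor([0, 0]), 0, torch.qint8)
+# >>> torch.q_per_channel_axis(wq)
+# 0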
+def q_per_channel_scales(input: Tensor) -> Tensor: ... +def q_per_channel_zero_points(input: Tensor) -> Tensor: ... +def q_scale(input: Tensor) -> _float: ... +def q_zero_point(input: Tensor) -> _int: ... +def qr(input: Tensor, some: _bool = True, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.qr: + r""" + qr(input, some=True, *, out=None) -> (Tensor, Tensor) + + Computes the QR decomposition of a matrix or a batch of matrices :attr:`input`, + and returns a namedtuple (Q, R) of tensors such that :math:`\text{input} = Q R` + with :math:`Q` being an orthogonal matrix or batch of orthogonal matrices and + :math:`R` being an upper triangular matrix or batch of upper triangular matrices. + + If :attr:`some` is ``True``, then this function returns the thin (reduced) QR factorization. + Otherwise, if :attr:`some` is ``False``, this function returns the complete QR factorization. + + .. warning:: + + :func:`torch.qr` is deprecated in favor of :func:`torch.linalg.qr` + and will be removed in a future PyTorch release. The boolean parameter :attr:`some` has been + replaced with a string parameter :attr:`mode`. + + ``Q, R = torch.qr(A)`` should be replaced with + + .. code:: python + + Q, R = torch.linalg.qr(A) + + ``Q, R = torch.qr(A, some=False)`` should be replaced with + + .. code:: python + + Q, R = torch.linalg.qr(A, mode="complete") + + .. warning:: + If you plan to backpropagate through QR, note that the current backward implementation + is only well-defined when the first :math:`\min(input.size(-1), input.size(-2))` + columns of :attr:`input` are linearly independent. + This behavior will probably change once QR supports pivoting. + + .. note:: This function uses LAPACK for CPU inputs and MAGMA for CUDA inputs, + and may produce different (valid) decompositions on different device types + or different platforms. + + Args: + input (Tensor): the input tensor of size :math:`(*, m, n)` where `*` is zero or more + batch dimensions consisting of matrices of dimension :math:`m \times n`. + some (bool, optional): Set to ``True`` for reduced QR decomposition and ``False`` for + complete QR decomposition. If `k = min(m, n)` then: + + * ``some=True`` : returns `(Q, R)` with dimensions (m, k), (k, n) (default) + + * ``'some=False'``: returns `(Q, R)` with dimensions (m, m), (m, n) + + Keyword args: + out (tuple, optional): tuple of `Q` and `R` tensors. + The dimensions of `Q` and `R` are detailed in the description of :attr:`some` above. + + Example:: + + >>> a = torch.tensor([[12., -51, 4], [6, 167, -68], [-4, 24, -41]]) + >>> q, r = torch.qr(a) + >>> q + tensor([[-0.8571, 0.3943, 0.3314], + [-0.4286, -0.9029, -0.0343], + [ 0.2857, -0.1714, 0.9429]]) + >>> r + tensor([[ -14.0000, -21.0000, 14.0000], + [ 0.0000, -175.0000, 70.0000], + [ 0.0000, 0.0000, -35.0000]]) + >>> torch.mm(q, r).round() + tensor([[ 12., -51., 4.], + [ 6., 167., -68.], + [ -4., 24., -41.]]) + >>> torch.mm(q.t(), q).round() + tensor([[ 1., 0., 0.], + [ 0., 1., -0.], + [ 0., -0., 1.]]) + >>> a = torch.randn(3, 4, 5) + >>> q, r = torch.qr(a, some=False) + >>> torch.allclose(torch.matmul(q, r), a) + True + >>> torch.allclose(torch.matmul(q.mT, q), torch.eye(5)) + True + """ + ... 
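+# A minimal migration sketch for the deprecation note above: the reduced
+# factorization returned by ``torch.qr`` corresponds to the default
+# ``mode='reduced'`` of :func:`torch.linalg.qr`.
+# >>> A = torch.tensor([[12., -51., 4.], [6., 167., -68.], [-4., 24., -41.]])
+# >>> Q, R = torch.linalg.qr(A)
+# >>> torch.allclose(Q @ R, A)
+# True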
+@overload +def quantile(input: Tensor, q: Tensor, dim: Optional[_int] = None, keepdim: _bool = False, *, interpolation: str = "linear", out: Optional[Tensor] = None) -> Tensor: + r""" + quantile(input, q, dim=None, keepdim=False, *, interpolation='linear', out=None) -> Tensor + + Computes the q-th quantiles of each row of the :attr:`input` tensor along the dimension :attr:`dim`. + + To compute the quantile, we map q in [0, 1] to the range of indices [0, n] to find the location + of the quantile in the sorted input. If the quantile lies between two data points ``a < b`` with + indices ``i`` and ``j`` in the sorted order, result is computed according to the given + :attr:`interpolation` method as follows: + + - ``linear``: ``a + (b - a) * fraction``, where ``fraction`` is the fractional part of the computed quantile index. + - ``lower``: ``a``. + - ``higher``: ``b``. + - ``nearest``: ``a`` or ``b``, whichever's index is closer to the computed quantile index (rounding down for .5 fractions). + - ``midpoint``: ``(a + b) / 2``. + + If :attr:`q` is a 1D tensor, the first dimension of the output represents the quantiles and has size + equal to the size of :attr:`q`, the remaining dimensions are what remains from the reduction. + + .. note:: + By default :attr:`dim` is ``None`` resulting in the :attr:`input` tensor being flattened before computation. + + Args: + input (Tensor): the input tensor. + q (float or Tensor): a scalar or 1D tensor of values in the range [0, 1]. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword arguments: + interpolation (str): interpolation method to use when the desired quantile lies between two data points. + Can be ``linear``, ``lower``, ``higher``, ``midpoint`` and ``nearest``. + Default is ``linear``. + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(2, 3) + >>> a + tensor([[ 0.0795, -1.2117, 0.9765], + [ 1.1707, 0.6706, 0.4884]]) + >>> q = torch.tensor([0.25, 0.5, 0.75]) + >>> torch.quantile(a, q, dim=1, keepdim=True) + tensor([[[-0.5661], + [ 0.5795]], + + [[ 0.0795], + [ 0.6706]], + + [[ 0.5280], + [ 0.9206]]]) + >>> torch.quantile(a, q, dim=1, keepdim=True).shape + torch.Size([3, 2, 1]) + >>> a = torch.arange(4.) + >>> a + tensor([0., 1., 2., 3.]) + >>> torch.quantile(a, 0.6, interpolation='linear') + tensor(1.8000) + >>> torch.quantile(a, 0.6, interpolation='lower') + tensor(1.) + >>> torch.quantile(a, 0.6, interpolation='higher') + tensor(2.) + >>> torch.quantile(a, 0.6, interpolation='midpoint') + tensor(1.5000) + >>> torch.quantile(a, 0.6, interpolation='nearest') + tensor(2.) + >>> torch.quantile(a, 0.4, interpolation='nearest') + tensor(1.) + """ + ... +@overload +def quantile(input: Tensor, q: _float, dim: Optional[_int] = None, keepdim: _bool = False, *, interpolation: str = "linear", out: Optional[Tensor] = None) -> Tensor: + r""" + quantile(input, q, dim=None, keepdim=False, *, interpolation='linear', out=None) -> Tensor + + Computes the q-th quantiles of each row of the :attr:`input` tensor along the dimension :attr:`dim`. + + To compute the quantile, we map q in [0, 1] to the range of indices [0, n] to find the location + of the quantile in the sorted input. 
If the quantile lies between two data points ``a < b`` with + indices ``i`` and ``j`` in the sorted order, result is computed according to the given + :attr:`interpolation` method as follows: + + - ``linear``: ``a + (b - a) * fraction``, where ``fraction`` is the fractional part of the computed quantile index. + - ``lower``: ``a``. + - ``higher``: ``b``. + - ``nearest``: ``a`` or ``b``, whichever's index is closer to the computed quantile index (rounding down for .5 fractions). + - ``midpoint``: ``(a + b) / 2``. + + If :attr:`q` is a 1D tensor, the first dimension of the output represents the quantiles and has size + equal to the size of :attr:`q`, the remaining dimensions are what remains from the reduction. + + .. note:: + By default :attr:`dim` is ``None`` resulting in the :attr:`input` tensor being flattened before computation. + + Args: + input (Tensor): the input tensor. + q (float or Tensor): a scalar or 1D tensor of values in the range [0, 1]. + dim (int): the dimension to reduce. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword arguments: + interpolation (str): interpolation method to use when the desired quantile lies between two data points. + Can be ``linear``, ``lower``, ``higher``, ``midpoint`` and ``nearest``. + Default is ``linear``. + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(2, 3) + >>> a + tensor([[ 0.0795, -1.2117, 0.9765], + [ 1.1707, 0.6706, 0.4884]]) + >>> q = torch.tensor([0.25, 0.5, 0.75]) + >>> torch.quantile(a, q, dim=1, keepdim=True) + tensor([[[-0.5661], + [ 0.5795]], + + [[ 0.0795], + [ 0.6706]], + + [[ 0.5280], + [ 0.9206]]]) + >>> torch.quantile(a, q, dim=1, keepdim=True).shape + torch.Size([3, 2, 1]) + >>> a = torch.arange(4.) + >>> a + tensor([0., 1., 2., 3.]) + >>> torch.quantile(a, 0.6, interpolation='linear') + tensor(1.8000) + >>> torch.quantile(a, 0.6, interpolation='lower') + tensor(1.) + >>> torch.quantile(a, 0.6, interpolation='higher') + tensor(2.) + >>> torch.quantile(a, 0.6, interpolation='midpoint') + tensor(1.5000) + >>> torch.quantile(a, 0.6, interpolation='nearest') + tensor(2.) + >>> torch.quantile(a, 0.4, interpolation='nearest') + tensor(1.) + """ + ... +def quantize_per_channel(input: Tensor, scales: Tensor, zero_points: Tensor, axis: _int, dtype: _dtype) -> Tensor: + r""" + quantize_per_channel(input, scales, zero_points, axis, dtype) -> Tensor + + Converts a float tensor to a per-channel quantized tensor with given scales and zero points. + + Arguments: + input (Tensor): float tensor to quantize + scales (Tensor): float 1D tensor of scales to use, size should match ``input.size(axis)`` + zero_points (int): integer 1D tensor of offset to use, size should match ``input.size(axis)`` + axis (int): dimension on which apply per-channel quantization + dtype (:class:`torch.dtype`): the desired data type of returned tensor. 
+ Has to be one of the quantized dtypes: ``torch.quint8``, ``torch.qint8``, ``torch.qint32`` + + Returns: + Tensor: A newly quantized tensor + + Example:: + + >>> x = torch.tensor([[-1.0, 0.0], [1.0, 2.0]]) + >>> torch.quantize_per_channel(x, torch.tensor([0.1, 0.01]), torch.tensor([10, 0]), 0, torch.quint8) + tensor([[-1., 0.], + [ 1., 2.]], size=(2, 2), dtype=torch.quint8, + quantization_scheme=torch.per_channel_affine, + scale=tensor([0.1000, 0.0100], dtype=torch.float64), + zero_point=tensor([10, 0]), axis=0) + >>> torch.quantize_per_channel(x, torch.tensor([0.1, 0.01]), torch.tensor([10, 0]), 0, torch.quint8).int_repr() + tensor([[ 0, 10], + [100, 200]], dtype=torch.uint8) + """ + ... +@overload +def quantize_per_tensor(input: Tensor, scale: Tensor, zero_point: Tensor, dtype: _dtype) -> Tensor: + r""" + quantize_per_tensor(input, scale, zero_point, dtype) -> Tensor + + Converts a float tensor to a quantized tensor with given scale and zero point. + + Arguments: + input (Tensor): float tensor or list of tensors to quantize + scale (float or Tensor): scale to apply in quantization formula + zero_point (int or Tensor): offset in integer value that maps to float zero + dtype (:class:`torch.dtype`): the desired data type of returned tensor. + Has to be one of the quantized dtypes: ``torch.quint8``, ``torch.qint8``, ``torch.qint32`` + + Returns: + Tensor: A newly quantized tensor or list of quantized tensors. + + Example:: + + >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10, torch.quint8) + tensor([-1., 0., 1., 2.], size=(4,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.1, zero_point=10) + >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10, torch.quint8).int_repr() + tensor([ 0, 10, 20, 30], dtype=torch.uint8) + >>> torch.quantize_per_tensor([torch.tensor([-1.0, 0.0]), torch.tensor([-2.0, 2.0])], + >>> torch.tensor([0.1, 0.2]), torch.tensor([10, 20]), torch.quint8) + (tensor([-1., 0.], size=(2,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.1, zero_point=10), + tensor([-2., 2.], size=(2,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.2, zero_point=20)) + >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), torch.tensor(0.1), torch.tensor(10), torch.quint8) + tensor([-1., 0., 1., 2.], size=(4,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.10, zero_point=10) + """ + ... +@overload +def quantize_per_tensor(input: Tensor, scale: _float, zero_point: _int, dtype: _dtype) -> Tensor: + r""" + quantize_per_tensor(input, scale, zero_point, dtype) -> Tensor + + Converts a float tensor to a quantized tensor with given scale and zero point. + + Arguments: + input (Tensor): float tensor or list of tensors to quantize + scale (float or Tensor): scale to apply in quantization formula + zero_point (int or Tensor): offset in integer value that maps to float zero + dtype (:class:`torch.dtype`): the desired data type of returned tensor. + Has to be one of the quantized dtypes: ``torch.quint8``, ``torch.qint8``, ``torch.qint32`` + + Returns: + Tensor: A newly quantized tensor or list of quantized tensors. 
+ + Example:: + + >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10, torch.quint8) + tensor([-1., 0., 1., 2.], size=(4,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.1, zero_point=10) + >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10, torch.quint8).int_repr() + tensor([ 0, 10, 20, 30], dtype=torch.uint8) + >>> torch.quantize_per_tensor([torch.tensor([-1.0, 0.0]), torch.tensor([-2.0, 2.0])], + >>> torch.tensor([0.1, 0.2]), torch.tensor([10, 20]), torch.quint8) + (tensor([-1., 0.], size=(2,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.1, zero_point=10), + tensor([-2., 2.], size=(2,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.2, zero_point=20)) + >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), torch.tensor(0.1), torch.tensor(10), torch.quint8) + tensor([-1., 0., 1., 2.], size=(4,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.10, zero_point=10) + """ + ... +@overload +def quantize_per_tensor(tensors: Union[Tuple[Tensor, ...], List[Tensor]], scales: Tensor, zero_points: Tensor, dtype: _dtype) -> Tuple[Tensor, ...]: + r""" + quantize_per_tensor(input, scale, zero_point, dtype) -> Tensor + + Converts a float tensor to a quantized tensor with given scale and zero point. + + Arguments: + input (Tensor): float tensor or list of tensors to quantize + scale (float or Tensor): scale to apply in quantization formula + zero_point (int or Tensor): offset in integer value that maps to float zero + dtype (:class:`torch.dtype`): the desired data type of returned tensor. + Has to be one of the quantized dtypes: ``torch.quint8``, ``torch.qint8``, ``torch.qint32`` + + Returns: + Tensor: A newly quantized tensor or list of quantized tensors. + + Example:: + + >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10, torch.quint8) + tensor([-1., 0., 1., 2.], size=(4,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.1, zero_point=10) + >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10, torch.quint8).int_repr() + tensor([ 0, 10, 20, 30], dtype=torch.uint8) + >>> torch.quantize_per_tensor([torch.tensor([-1.0, 0.0]), torch.tensor([-2.0, 2.0])], + >>> torch.tensor([0.1, 0.2]), torch.tensor([10, 20]), torch.quint8) + (tensor([-1., 0.], size=(2,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.1, zero_point=10), + tensor([-2., 2.], size=(2,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.2, zero_point=20)) + >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), torch.tensor(0.1), torch.tensor(10), torch.quint8) + tensor([-1., 0., 1., 2.], size=(4,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.10, zero_point=10) + """ + ... +def quantize_per_tensor_dynamic(input: Tensor, dtype: _dtype, reduce_range: _bool) -> Tensor: + r""" + quantize_per_tensor_dynamic(input, dtype, reduce_range) -> Tensor + + Converts a float tensor to a quantized tensor with scale and zero_point calculated + dynamically based on the input. + + Arguments: + input (Tensor): float tensor or list of tensors to quantize + dtype (:class:`torch.dtype`): the desired data type of returned tensor. 
+ Has to be one of the quantized dtypes: ``torch.quint8``, ``torch.qint8``
+ reduce_range (bool): a flag to indicate whether to reduce the range of quantized
+ data by 1 bit; this is required to avoid instruction overflow on some hardware
+
+ Returns:
+ Tensor: A newly (dynamically) quantized tensor
+
+ Example::
+
+ >>> t = torch.quantize_per_tensor_dynamic(torch.tensor([-1.0, 0.0, 1.0, 2.0]), torch.quint8, False)
+ >>> print(t)
+ tensor([-1., 0., 1., 2.], size=(4,), dtype=torch.quint8,
+ quantization_scheme=torch.per_tensor_affine, scale=0.011764705882352941,
+ zero_point=85)
+ >>> t.int_repr()
+ tensor([ 0, 85, 170, 255], dtype=torch.uint8)
+ """
+ ...
+def quantized_batch_norm(input: Tensor, weight: Optional[Tensor], bias: Optional[Tensor], mean: Tensor, var: Tensor, eps: _float, output_scale: _float, output_zero_point: _int) -> Tensor:
+ r"""
+ quantized_batch_norm(input, weight=None, bias=None, mean, var, eps, output_scale, output_zero_point) -> Tensor
+
+ Applies batch normalization on a 4D (NCHW) quantized tensor.
+
+ .. math::
+
+ y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta
+
+ Arguments:
+ input (Tensor): quantized tensor
+ weight (Tensor): float tensor that corresponds to the gamma, size C
+ bias (Tensor): float tensor that corresponds to the beta, size C
+ mean (Tensor): float mean value in batch normalization, size C
+ var (Tensor): float tensor for variance, size C
+ eps (float): a value added to the denominator for numerical stability.
+ output_scale (float): output quantized tensor scale
+ output_zero_point (int): output quantized tensor zero_point
+
+ Returns:
+ Tensor: A quantized tensor with batch normalization applied.
+
+ Example::
+
+ >>> qx = torch.quantize_per_tensor(torch.rand(2, 2, 2, 2), 1.5, 3, torch.quint8)
+ >>> torch.quantized_batch_norm(qx, torch.ones(2), torch.zeros(2), torch.rand(2), torch.rand(2), 0.00001, 0.2, 2)
+ tensor([[[[-0.2000, -0.2000],
+ [ 1.6000, -0.2000]],
+
+ [[-0.4000, -0.4000],
+ [-0.4000, 0.6000]]],
+
+
+ [[[-0.2000, -0.2000],
+ [-0.2000, -0.2000]],
+
+ [[ 0.6000, -0.4000],
+ [ 0.6000, -0.4000]]]], size=(2, 2, 2, 2), dtype=torch.quint8,
+ quantization_scheme=torch.per_tensor_affine, scale=0.2, zero_point=2)
+ """
+ ...
+def quantized_gru_cell(input: Tensor, hx: Tensor, w_ih: Tensor, w_hh: Tensor, b_ih: Tensor, b_hh: Tensor, packed_ih: Tensor, packed_hh: Tensor, col_offsets_ih: Tensor, col_offsets_hh: Tensor, scale_ih: Union[Number, _complex], scale_hh: Union[Number, _complex], zero_point_ih: Union[Number, _complex], zero_point_hh: Union[Number, _complex]) -> Tensor: ...
+def quantized_lstm_cell(input: Tensor, hx: Union[Tuple[Tensor, ...], List[Tensor]], w_ih: Tensor, w_hh: Tensor, b_ih: Tensor, b_hh: Tensor, packed_ih: Tensor, packed_hh: Tensor, col_offsets_ih: Tensor, col_offsets_hh: Tensor, scale_ih: Union[Number, _complex], scale_hh: Union[Number, _complex], zero_point_ih: Union[Number, _complex], zero_point_hh: Union[Number, _complex]) -> Tuple[Tensor, Tensor]: ...
+def quantized_max_pool1d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tensor:
+ r"""
+ quantized_max_pool1d(input, kernel_size, stride=[], padding=0, dilation=1, ceil_mode=False) -> Tensor
+
+ Applies a 1D max pooling over an input quantized tensor composed of several input planes.
+ + Arguments: + input (Tensor): quantized tensor + kernel_size (list of int): the size of the sliding window + stride (``list of int``, optional): the stride of the sliding window + padding (``list of int``, optional): padding to be added on both sides, must be >= 0 and <= kernel_size / 2 + dilation (``list of int``, optional): The stride between elements within a sliding window, must be > 0. Default 1 + ceil_mode (bool, optional): If True, will use ceil instead of floor to compute the output shape. + Defaults to False. + + + Returns: + Tensor: A quantized tensor with max_pool1d applied. + + Example:: + + >>> qx = torch.quantize_per_tensor(torch.rand(2, 2), 1.5, 3, torch.quint8) + >>> torch.quantized_max_pool1d(qx, [2]) + tensor([[0.0000], + [1.5000]], size=(2, 1), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=1.5, zero_point=3) + """ + ... +def quantized_max_pool2d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tensor: + r""" + quantized_max_pool2d(input, kernel_size, stride=[], padding=0, dilation=1, ceil_mode=False) -> Tensor + + Applies a 2D max pooling over an input quantized tensor composed of several input planes. + + Arguments: + input (Tensor): quantized tensor + kernel_size (``list of int``): the size of the sliding window + stride (``list of int``, optional): the stride of the sliding window + padding (``list of int``, optional): padding to be added on both sides, must be >= 0 and <= kernel_size / 2 + dilation (``list of int``, optional): The stride between elements within a sliding window, must be > 0. Default 1 + ceil_mode (bool, optional): If True, will use ceil instead of floor to compute the output shape. + Defaults to False. + + + Returns: + Tensor: A quantized tensor with max_pool2d applied. + + Example:: + + >>> qx = torch.quantize_per_tensor(torch.rand(2, 2, 2, 2), 1.5, 3, torch.quint8) + >>> torch.quantized_max_pool2d(qx, [2,2]) + tensor([[[[1.5000]], + + [[1.5000]]], + + + [[[0.0000]], + + [[0.0000]]]], size=(2, 2, 1, 1), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=1.5, zero_point=3) + """ + ... +def quantized_max_pool3d(input: Tensor, kernel_size: Union[_int, _size], stride: Union[_int, _size] = (), padding: Union[_int, _size] = 0, dilation: Union[_int, _size] = 1, ceil_mode: _bool = False) -> Tensor: ... +def quantized_rnn_relu_cell(input: Tensor, hx: Tensor, w_ih: Tensor, w_hh: Tensor, b_ih: Tensor, b_hh: Tensor, packed_ih: Tensor, packed_hh: Tensor, col_offsets_ih: Tensor, col_offsets_hh: Tensor, scale_ih: Union[Number, _complex], scale_hh: Union[Number, _complex], zero_point_ih: Union[Number, _complex], zero_point_hh: Union[Number, _complex]) -> Tensor: ... +def quantized_rnn_tanh_cell(input: Tensor, hx: Tensor, w_ih: Tensor, w_hh: Tensor, b_ih: Tensor, b_hh: Tensor, packed_ih: Tensor, packed_hh: Tensor, col_offsets_ih: Tensor, col_offsets_hh: Tensor, scale_ih: Union[Number, _complex], scale_hh: Union[Number, _complex], zero_point_ih: Union[Number, _complex], zero_point_hh: Union[Number, _complex]) -> Tensor: ... +def rad2deg(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + rad2deg(input, *, out=None) -> Tensor + + Returns a new tensor with each of the elements of :attr:`input` + converted from angles in radians to degrees. + + Args: + input (Tensor): the input tensor. + + Keyword arguments: + out (Tensor, optional): the output tensor. 
+ + Example:: + + >>> a = torch.tensor([[3.142, -3.142], [6.283, -6.283], [1.570, -1.570]]) + >>> torch.rad2deg(a) + tensor([[ 180.0233, -180.0233], + [ 359.9894, -359.9894], + [ 89.9544, -89.9544]]) + """ + ... +def rad2deg_(input: Tensor) -> Tensor: ... +@overload +def rand(size: Sequence[Union[_int, SymInt]], *, generator: Optional[Generator], names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Returns a tensor filled with random numbers from a uniform distribution + on the interval :math:`[0, 1)` + + The shape of the tensor is defined by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.rand(4) + tensor([ 0.5204, 0.2503, 0.3525, 0.5673]) + >>> torch.rand(2, 3) + tensor([[ 0.8237, 0.5781, 0.6879], + [ 0.3816, 0.7249, 0.0998]]) + """ + ... +@overload +def rand(*size: _int, generator: Optional[Generator], names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Returns a tensor filled with random numbers from a uniform distribution + on the interval :math:`[0, 1)` + + The shape of the tensor is defined by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. 
+ Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.rand(4) + tensor([ 0.5204, 0.2503, 0.3525, 0.5673]) + >>> torch.rand(2, 3) + tensor([[ 0.8237, 0.5781, 0.6879], + [ 0.3816, 0.7249, 0.0998]]) + """ + ... +@overload +def rand(size: Sequence[Union[_int, SymInt]], *, generator: Optional[Generator], out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Returns a tensor filled with random numbers from a uniform distribution + on the interval :math:`[0, 1)` + + The shape of the tensor is defined by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.rand(4) + tensor([ 0.5204, 0.2503, 0.3525, 0.5673]) + >>> torch.rand(2, 3) + tensor([[ 0.8237, 0.5781, 0.6879], + [ 0.3816, 0.7249, 0.0998]]) + """ + ... +@overload +def rand(*size: _int, generator: Optional[Generator], out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Returns a tensor filled with random numbers from a uniform distribution + on the interval :math:`[0, 1)` + + The shape of the tensor is defined by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. 
+ Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.rand(4) + tensor([ 0.5204, 0.2503, 0.3525, 0.5673]) + >>> torch.rand(2, 3) + tensor([[ 0.8237, 0.5781, 0.6879], + [ 0.3816, 0.7249, 0.0998]]) + """ + ... +@overload +def rand(size: Sequence[Union[_int, SymInt]], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Returns a tensor filled with random numbers from a uniform distribution + on the interval :math:`[0, 1)` + + The shape of the tensor is defined by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.rand(4) + tensor([ 0.5204, 0.2503, 0.3525, 0.5673]) + >>> torch.rand(2, 3) + tensor([[ 0.8237, 0.5781, 0.6879], + [ 0.3816, 0.7249, 0.0998]]) + """ + ... 
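+# Usage sketch for the ``torch.rand`` overloads above (illustrative only; variable names are
+# placeholders, and the calls assume the documented keyword arguments on a CPU tensor):
+#
+#   g = torch.Generator().manual_seed(0)  # dedicated PRNG state for reproducible sampling
+#   x = torch.rand(2, 3, generator=g)     # uniform samples drawn from [0, 1)
+#   buf = torch.empty(2, 3)
+#   torch.rand(2, 3, out=buf)             # writes the samples into a preallocated tensor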
+@overload +def rand(*size: _int, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Returns a tensor filled with random numbers from a uniform distribution + on the interval :math:`[0, 1)` + + The shape of the tensor is defined by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.rand(4) + tensor([ 0.5204, 0.2503, 0.3525, 0.5673]) + >>> torch.rand(2, 3) + tensor([[ 0.8237, 0.5781, 0.6879], + [ 0.3816, 0.7249, 0.0998]]) + """ + ... +@overload +def rand(size: Sequence[Union[_int, SymInt]], *, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Returns a tensor filled with random numbers from a uniform distribution + on the interval :math:`[0, 1)` + + The shape of the tensor is defined by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. 
+ requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.rand(4) + tensor([ 0.5204, 0.2503, 0.3525, 0.5673]) + >>> torch.rand(2, 3) + tensor([[ 0.8237, 0.5781, 0.6879], + [ 0.3816, 0.7249, 0.0998]]) + """ + ... +@overload +def rand(*size: _int, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Returns a tensor filled with random numbers from a uniform distribution + on the interval :math:`[0, 1)` + + The shape of the tensor is defined by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.rand(4) + tensor([ 0.5204, 0.2503, 0.3525, 0.5673]) + >>> torch.rand(2, 3) + tensor([[ 0.8237, 0.5781, 0.6879], + [ 0.3816, 0.7249, 0.0998]]) + """ + ... +def rand_like(input: Tensor, *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + rand_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor + + Returns a tensor with the same size as :attr:`input` that is filled with + random numbers from a uniform distribution on the interval :math:`[0, 1)`. + ``torch.rand_like(input)`` is equivalent to + ``torch.rand(input.size(), dtype=input.dtype, layout=input.layout, device=input.device)``. + + Args: + input (Tensor): the size of :attr:`input` will determine size of the output tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor. + Default: if ``None``, defaults to the dtype of :attr:`input`. + layout (:class:`torch.layout`, optional): the desired layout of returned tensor. + Default: if ``None``, defaults to the layout of :attr:`input`. 
+ device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, defaults to the device of :attr:`input`. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + """ + ... +@overload +def randint(low: _int, high: _int, size: _size, *, generator: Optional[Generator] = None, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + randint(low=0, high, size, \*, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with random integers generated uniformly + between :attr:`low` (inclusive) and :attr:`high` (exclusive). + + The shape of the tensor is defined by the variable argument :attr:`size`. + + .. note:: + With the global dtype default (``torch.float32``), this function returns + a tensor with dtype ``torch.int64``. + + Args: + low (int, optional): Lowest integer to be drawn from the distribution. Default: 0. + high (int): One above the highest integer to be drawn from the distribution. + size (tuple): a tuple defining the shape of the output tensor. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (`torch.dtype`, optional) - the desired data type of returned tensor. Default: if ``None``, + this function returns a tensor with dtype ``torch.int64``. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.randint(3, 5, (3,)) + tensor([4, 3, 4]) + + + >>> torch.randint(10, (2, 2)) + tensor([[0, 2], + [5, 5]]) + + + >>> torch.randint(3, 10, (2, 2)) + tensor([[4, 5], + [6, 7]]) + """ + ... +@overload +def randint(high: _int, size: _size, *, generator: Optional[Generator] = None, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + randint(low=0, high, size, \*, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with random integers generated uniformly + between :attr:`low` (inclusive) and :attr:`high` (exclusive). + + The shape of the tensor is defined by the variable argument :attr:`size`. + + .. note:: + With the global dtype default (``torch.float32``), this function returns + a tensor with dtype ``torch.int64``. + + Args: + low (int, optional): Lowest integer to be drawn from the distribution. Default: 0. + high (int): One above the highest integer to be drawn from the distribution. + size (tuple): a tuple defining the shape of the output tensor. 
+ + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (`torch.dtype`, optional) - the desired data type of returned tensor. Default: if ``None``, + this function returns a tensor with dtype ``torch.int64``. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.randint(3, 5, (3,)) + tensor([4, 3, 4]) + + + >>> torch.randint(10, (2, 2)) + tensor([[0, 2], + [5, 5]]) + + + >>> torch.randint(3, 10, (2, 2)) + tensor([[4, 5], + [6, 7]]) + """ + ... +@overload +def randint(high: Union[_int, SymInt], size: Sequence[Union[_int, SymInt]], *, generator: Optional[Generator], out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randint(low=0, high, size, \*, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with random integers generated uniformly + between :attr:`low` (inclusive) and :attr:`high` (exclusive). + + The shape of the tensor is defined by the variable argument :attr:`size`. + + .. note:: + With the global dtype default (``torch.float32``), this function returns + a tensor with dtype ``torch.int64``. + + Args: + low (int, optional): Lowest integer to be drawn from the distribution. Default: 0. + high (int): One above the highest integer to be drawn from the distribution. + size (tuple): a tuple defining the shape of the output tensor. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (`torch.dtype`, optional) - the desired data type of returned tensor. Default: if ``None``, + this function returns a tensor with dtype ``torch.int64``. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.randint(3, 5, (3,)) + tensor([4, 3, 4]) + + + >>> torch.randint(10, (2, 2)) + tensor([[0, 2], + [5, 5]]) + + + >>> torch.randint(3, 10, (2, 2)) + tensor([[4, 5], + [6, 7]]) + """ + ... 
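+# Usage sketch for the ``torch.randint`` overloads above (illustrative only; variable names
+# are placeholders, and values are drawn uniformly from the half-open range [low, high)):
+#
+#   g = torch.Generator().manual_seed(0)
+#   idx = torch.randint(0, 10, (4,), generator=g)           # dtype defaults to torch.int64
+#   small = torch.randint(0, 10, (4,), dtype=torch.int32)   # request a narrower integer dtype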
+@overload +def randint(high: Union[_int, SymInt], size: Sequence[Union[_int, SymInt]], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randint(low=0, high, size, \*, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with random integers generated uniformly + between :attr:`low` (inclusive) and :attr:`high` (exclusive). + + The shape of the tensor is defined by the variable argument :attr:`size`. + + .. note:: + With the global dtype default (``torch.float32``), this function returns + a tensor with dtype ``torch.int64``. + + Args: + low (int, optional): Lowest integer to be drawn from the distribution. Default: 0. + high (int): One above the highest integer to be drawn from the distribution. + size (tuple): a tuple defining the shape of the output tensor. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (`torch.dtype`, optional) - the desired data type of returned tensor. Default: if ``None``, + this function returns a tensor with dtype ``torch.int64``. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.randint(3, 5, (3,)) + tensor([4, 3, 4]) + + + >>> torch.randint(10, (2, 2)) + tensor([[0, 2], + [5, 5]]) + + + >>> torch.randint(3, 10, (2, 2)) + tensor([[4, 5], + [6, 7]]) + """ + ... +@overload +def randint(low: Union[_int, SymInt], high: Union[_int, SymInt], size: Sequence[Union[_int, SymInt]], *, generator: Optional[Generator], out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randint(low=0, high, size, \*, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with random integers generated uniformly + between :attr:`low` (inclusive) and :attr:`high` (exclusive). + + The shape of the tensor is defined by the variable argument :attr:`size`. + + .. note:: + With the global dtype default (``torch.float32``), this function returns + a tensor with dtype ``torch.int64``. + + Args: + low (int, optional): Lowest integer to be drawn from the distribution. Default: 0. + high (int): One above the highest integer to be drawn from the distribution. + size (tuple): a tuple defining the shape of the output tensor. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (`torch.dtype`, optional) - the desired data type of returned tensor. Default: if ``None``, + this function returns a tensor with dtype ``torch.int64``. 
+ layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.randint(3, 5, (3,)) + tensor([4, 3, 4]) + + + >>> torch.randint(10, (2, 2)) + tensor([[0, 2], + [5, 5]]) + + + >>> torch.randint(3, 10, (2, 2)) + tensor([[4, 5], + [6, 7]]) + """ + ... +@overload +def randint(low: Union[_int, SymInt], high: Union[_int, SymInt], size: Sequence[Union[_int, SymInt]], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randint(low=0, high, size, \*, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with random integers generated uniformly + between :attr:`low` (inclusive) and :attr:`high` (exclusive). + + The shape of the tensor is defined by the variable argument :attr:`size`. + + .. note:: + With the global dtype default (``torch.float32``), this function returns + a tensor with dtype ``torch.int64``. + + Args: + low (int, optional): Lowest integer to be drawn from the distribution. Default: 0. + high (int): One above the highest integer to be drawn from the distribution. + size (tuple): a tuple defining the shape of the output tensor. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (`torch.dtype`, optional) - the desired data type of returned tensor. Default: if ``None``, + this function returns a tensor with dtype ``torch.int64``. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.randint(3, 5, (3,)) + tensor([4, 3, 4]) + + + >>> torch.randint(10, (2, 2)) + tensor([[0, 2], + [5, 5]]) + + + >>> torch.randint(3, 10, (2, 2)) + tensor([[4, 5], + [6, 7]]) + """ + ... 
+@overload
+def randint_like(input: Tensor, high: Union[_int, SymInt], *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor:
+ r"""
+ randint_like(input, low=0, high, \*, dtype=None, layout=torch.strided, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor
+
+ Returns a tensor with the same shape as Tensor :attr:`input` filled with
+ random integers generated uniformly between :attr:`low` (inclusive) and
+ :attr:`high` (exclusive).
+
+ .. note::
+ With the global dtype default (``torch.float32``), this function returns
+ a tensor with dtype ``torch.int64``.
+
+ Args:
+ input (Tensor): the size of :attr:`input` will determine size of the output tensor.
+ low (int, optional): Lowest integer to be drawn from the distribution. Default: 0.
+ high (int): One above the highest integer to be drawn from the distribution.
+
+ Keyword args:
+ dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor.
+ Default: if ``None``, defaults to the dtype of :attr:`input`.
+ layout (:class:`torch.layout`, optional): the desired layout of returned tensor.
+ Default: if ``None``, defaults to the layout of :attr:`input`.
+ device (:class:`torch.device`, optional): the desired device of returned tensor.
+ Default: if ``None``, defaults to the device of :attr:`input`.
+ requires_grad (bool, optional): If autograd should record operations on the
+ returned tensor. Default: ``False``.
+ memory_format (:class:`torch.memory_format`, optional): the desired memory format of
+ returned Tensor. Default: ``torch.preserve_format``.
+ """
+ ...
+@overload
+def randint_like(input: Tensor, low: Union[_int, SymInt], high: Union[_int, SymInt], *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor:
+ r"""
+ randint_like(input, low=0, high, \*, dtype=None, layout=torch.strided, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor
+
+ Returns a tensor with the same shape as Tensor :attr:`input` filled with
+ random integers generated uniformly between :attr:`low` (inclusive) and
+ :attr:`high` (exclusive).
+
+ .. note::
+ With the global dtype default (``torch.float32``), this function returns
+ a tensor with dtype ``torch.int64``.
+
+ Args:
+ input (Tensor): the size of :attr:`input` will determine size of the output tensor.
+ low (int, optional): Lowest integer to be drawn from the distribution. Default: 0.
+ high (int): One above the highest integer to be drawn from the distribution.
+
+ Keyword args:
+ dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor.
+ Default: if ``None``, defaults to the dtype of :attr:`input`.
+ layout (:class:`torch.layout`, optional): the desired layout of returned tensor.
+ Default: if ``None``, defaults to the layout of :attr:`input`.
+ device (:class:`torch.device`, optional): the desired device of returned tensor.
+ Default: if ``None``, defaults to the device of :attr:`input`.
+ requires_grad (bool, optional): If autograd should record operations on the
+ returned tensor. Default: ``False``.
+ memory_format (:class:`torch.memory_format`, optional): the desired memory format of
+ returned Tensor.
Default: ``torch.preserve_format``. + """ + ... +@overload +def randn(size: Sequence[Union[_int, SymInt]], *, generator: Optional[Generator], names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + + Returns a tensor filled with random numbers from a normal distribution + with mean `0` and variance `1` (also called the standard normal + distribution). + + .. math:: + \text{out}_{i} \sim \mathcal{N}(0, 1) + + For complex dtypes, the tensor is i.i.d. sampled from a `complex normal distribution`_ with zero mean and + unit variance as + + .. math:: + \text{out}_{i} \sim \mathcal{CN}(0, 1) + + This is equivalent to separately sampling the real :math:`(\operatorname{Re})` and imaginary + :math:`(\operatorname{Im})` part of :math:`\text{out}_i` as + + .. math:: + \operatorname{Re}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}),\quad + \operatorname{Im}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}) + + The shape of the tensor is defined by the variable argument :attr:`size`. + + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.randn(4) + tensor([-2.1436, 0.9966, 2.3426, -0.6366]) + >>> torch.randn(2, 3) + tensor([[ 1.5954, 2.8929, -1.0923], + [ 1.1719, -0.4709, -0.1996]]) + + .. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution + """ + ... +@overload +def randn(*size: _int, generator: Optional[Generator], names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + + Returns a tensor filled with random numbers from a normal distribution + with mean `0` and variance `1` (also called the standard normal + distribution). + + .. math:: + \text{out}_{i} \sim \mathcal{N}(0, 1) + + For complex dtypes, the tensor is i.i.d. 
sampled from a `complex normal distribution`_ with zero mean and + unit variance as + + .. math:: + \text{out}_{i} \sim \mathcal{CN}(0, 1) + + This is equivalent to separately sampling the real :math:`(\operatorname{Re})` and imaginary + :math:`(\operatorname{Im})` part of :math:`\text{out}_i` as + + .. math:: + \operatorname{Re}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}),\quad + \operatorname{Im}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}) + + The shape of the tensor is defined by the variable argument :attr:`size`. + + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.randn(4) + tensor([-2.1436, 0.9966, 2.3426, -0.6366]) + >>> torch.randn(2, 3) + tensor([[ 1.5954, 2.8929, -1.0923], + [ 1.1719, -0.4709, -0.1996]]) + + .. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution + """ + ... +@overload +def randn(size: Sequence[Union[_int, SymInt]], *, generator: Optional[Generator], out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + + Returns a tensor filled with random numbers from a normal distribution + with mean `0` and variance `1` (also called the standard normal + distribution). + + .. math:: + \text{out}_{i} \sim \mathcal{N}(0, 1) + + For complex dtypes, the tensor is i.i.d. sampled from a `complex normal distribution`_ with zero mean and + unit variance as + + .. math:: + \text{out}_{i} \sim \mathcal{CN}(0, 1) + + This is equivalent to separately sampling the real :math:`(\operatorname{Re})` and imaginary + :math:`(\operatorname{Im})` part of :math:`\text{out}_i` as + + .. math:: + \operatorname{Re}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}),\quad + \operatorname{Im}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}) + + The shape of the tensor is defined by the variable argument :attr:`size`. + + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. 
+ + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.randn(4) + tensor([-2.1436, 0.9966, 2.3426, -0.6366]) + >>> torch.randn(2, 3) + tensor([[ 1.5954, 2.8929, -1.0923], + [ 1.1719, -0.4709, -0.1996]]) + + .. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution + """ + ... +@overload +def randn(*size: _int, generator: Optional[Generator], out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + + Returns a tensor filled with random numbers from a normal distribution + with mean `0` and variance `1` (also called the standard normal + distribution). + + .. math:: + \text{out}_{i} \sim \mathcal{N}(0, 1) + + For complex dtypes, the tensor is i.i.d. sampled from a `complex normal distribution`_ with zero mean and + unit variance as + + .. math:: + \text{out}_{i} \sim \mathcal{CN}(0, 1) + + This is equivalent to separately sampling the real :math:`(\operatorname{Re})` and imaginary + :math:`(\operatorname{Im})` part of :math:`\text{out}_i` as + + .. math:: + \operatorname{Re}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}),\quad + \operatorname{Im}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}) + + The shape of the tensor is defined by the variable argument :attr:`size`. + + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. 
Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.randn(4) + tensor([-2.1436, 0.9966, 2.3426, -0.6366]) + >>> torch.randn(2, 3) + tensor([[ 1.5954, 2.8929, -1.0923], + [ 1.1719, -0.4709, -0.1996]]) + + .. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution + """ + ... +@overload +def randn(size: Sequence[Union[_int, SymInt]], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + + Returns a tensor filled with random numbers from a normal distribution + with mean `0` and variance `1` (also called the standard normal + distribution). + + .. math:: + \text{out}_{i} \sim \mathcal{N}(0, 1) + + For complex dtypes, the tensor is i.i.d. sampled from a `complex normal distribution`_ with zero mean and + unit variance as + + .. math:: + \text{out}_{i} \sim \mathcal{CN}(0, 1) + + This is equivalent to separately sampling the real :math:`(\operatorname{Re})` and imaginary + :math:`(\operatorname{Im})` part of :math:`\text{out}_i` as + + .. math:: + \operatorname{Re}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}),\quad + \operatorname{Im}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}) + + The shape of the tensor is defined by the variable argument :attr:`size`. + + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.randn(4) + tensor([-2.1436, 0.9966, 2.3426, -0.6366]) + >>> torch.randn(2, 3) + tensor([[ 1.5954, 2.8929, -1.0923], + [ 1.1719, -0.4709, -0.1996]]) + + .. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution + """ + ... 
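+# Minimal usage sketch for the randn overloads above: passing an explicit
+# torch.Generator makes the draws reproducible, and a complex dtype selects the
+# complex-normal sampling described in the docstring. Seed and shapes here are
+# illustrative only.
+g = torch.Generator().manual_seed(0)
+a = torch.randn(2, 3, generator=g)
+b = torch.randn(2, 3, generator=torch.Generator().manual_seed(0))
+assert torch.equal(a, b)                              # same seed -> identical samples
+assert torch.randn(4, dtype=torch.cfloat).dtype == torch.complex64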
+@overload +def randn(*size: _int, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + + Returns a tensor filled with random numbers from a normal distribution + with mean `0` and variance `1` (also called the standard normal + distribution). + + .. math:: + \text{out}_{i} \sim \mathcal{N}(0, 1) + + For complex dtypes, the tensor is i.i.d. sampled from a `complex normal distribution`_ with zero mean and + unit variance as + + .. math:: + \text{out}_{i} \sim \mathcal{CN}(0, 1) + + This is equivalent to separately sampling the real :math:`(\operatorname{Re})` and imaginary + :math:`(\operatorname{Im})` part of :math:`\text{out}_i` as + + .. math:: + \operatorname{Re}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}),\quad + \operatorname{Im}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}) + + The shape of the tensor is defined by the variable argument :attr:`size`. + + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.randn(4) + tensor([-2.1436, 0.9966, 2.3426, -0.6366]) + >>> torch.randn(2, 3) + tensor([[ 1.5954, 2.8929, -1.0923], + [ 1.1719, -0.4709, -0.1996]]) + + .. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution + """ + ... +@overload +def randn(size: Sequence[Union[_int, SymInt]], *, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + + Returns a tensor filled with random numbers from a normal distribution + with mean `0` and variance `1` (also called the standard normal + distribution). + + .. math:: + \text{out}_{i} \sim \mathcal{N}(0, 1) + + For complex dtypes, the tensor is i.i.d. sampled from a `complex normal distribution`_ with zero mean and + unit variance as + + .. 
math:: + \text{out}_{i} \sim \mathcal{CN}(0, 1) + + This is equivalent to separately sampling the real :math:`(\operatorname{Re})` and imaginary + :math:`(\operatorname{Im})` part of :math:`\text{out}_i` as + + .. math:: + \operatorname{Re}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}),\quad + \operatorname{Im}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}) + + The shape of the tensor is defined by the variable argument :attr:`size`. + + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.randn(4) + tensor([-2.1436, 0.9966, 2.3426, -0.6366]) + >>> torch.randn(2, 3) + tensor([[ 1.5954, 2.8929, -1.0923], + [ 1.1719, -0.4709, -0.1996]]) + + .. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution + """ + ... +@overload +def randn(*size: _int, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + + Returns a tensor filled with random numbers from a normal distribution + with mean `0` and variance `1` (also called the standard normal + distribution). + + .. math:: + \text{out}_{i} \sim \mathcal{N}(0, 1) + + For complex dtypes, the tensor is i.i.d. sampled from a `complex normal distribution`_ with zero mean and + unit variance as + + .. math:: + \text{out}_{i} \sim \mathcal{CN}(0, 1) + + This is equivalent to separately sampling the real :math:`(\operatorname{Re})` and imaginary + :math:`(\operatorname{Im})` part of :math:`\text{out}_i` as + + .. math:: + \operatorname{Re}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}),\quad + \operatorname{Im}(\text{out}_{i}) \sim \mathcal{N}(0, \frac{1}{2}) + + The shape of the tensor is defined by the variable argument :attr:`size`. + + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. 
+ Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.randn(4) + tensor([-2.1436, 0.9966, 2.3426, -0.6366]) + >>> torch.randn(2, 3) + tensor([[ 1.5954, 2.8929, -1.0923], + [ 1.1719, -0.4709, -0.1996]]) + + .. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution + """ + ... +def randn_like(input: Tensor, *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randn_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor + + Returns a tensor with the same size as :attr:`input` that is filled with + random numbers from a normal distribution with mean 0 and variance 1. Please refer to :func:`torch.randn` for the + sampling process of complex dtypes. ``torch.randn_like(input)`` is equivalent to + ``torch.randn(input.size(), dtype=input.dtype, layout=input.layout, device=input.device)``. + + Args: + input (Tensor): the size of :attr:`input` will determine size of the output tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor. + Default: if ``None``, defaults to the dtype of :attr:`input`. + layout (:class:`torch.layout`, optional): the desired layout of returned tensor. + Default: if ``None``, defaults to the layout of :attr:`input`. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, defaults to the device of :attr:`input`. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + """ + ... +@overload +def randperm(n: Union[_int, SymInt], *, generator: Optional[Generator], out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randperm(n, *, generator=None, out=None, dtype=torch.int64,layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Returns a random permutation of integers from ``0`` to ``n - 1``. + + Args: + n (int): the upper bound (exclusive) + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: ``torch.int64``. 
+ layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.randperm(4) + tensor([2, 1, 0, 3]) + """ + ... +@overload +def randperm(n: Union[_int, SymInt], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + randperm(n, *, generator=None, out=None, dtype=torch.int64,layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Returns a random permutation of integers from ``0`` to ``n - 1``. + + Args: + n (int): the upper bound (exclusive) + + Keyword args: + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: ``torch.int64``. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + Example:: + + >>> torch.randperm(4) + tensor([2, 1, 0, 3]) + """ + ... +def range(start: Number, end: Number, step: Number = 1, *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + range(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a 1-D tensor of size :math:`\left\lfloor \frac{\text{end} - \text{start}}{\text{step}} \right\rfloor + 1` + with values from :attr:`start` to :attr:`end` with step :attr:`step`. Step is + the gap between two values in the tensor. + + .. math:: + \text{out}_{i+1} = \text{out}_i + \text{step}. + + .. warning:: + This function is deprecated and will be removed in a future release because its behavior is inconsistent with + Python's range builtin. Instead, use :func:`torch.arange`, which produces values in [start, end). + + Args: + start (float): the starting value for the set of points. Default: ``0``. + end (float): the ending value for the set of points + step (float): the gap between each pair of adjacent points. Default: ``1``. + + Keyword args: + out (Tensor, optional): the output tensor. 
+ dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). If `dtype` is not given, infer the data type from the other input + arguments. If any of `start`, `end`, or `stop` are floating-point, the + `dtype` is inferred to be the default dtype, see + :meth:`~torch.get_default_dtype`. Otherwise, the `dtype` is inferred to + be `torch.int64`. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.range(1, 4) + tensor([ 1., 2., 3., 4.]) + >>> torch.range(1, 4, 0.5) + tensor([ 1.0000, 1.5000, 2.0000, 2.5000, 3.0000, 3.5000, 4.0000]) + """ + ... +def ravel(input: Tensor) -> Tensor: + r""" + ravel(input) -> Tensor + + Return a contiguous flattened tensor. A copy is made only if needed. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> t = torch.tensor([[[1, 2], + ... [3, 4]], + ... [[5, 6], + ... [7, 8]]]) + >>> torch.ravel(t) + tensor([1, 2, 3, 4, 5, 6, 7, 8]) + """ + ... +def real(input: Tensor) -> Tensor: + r""" + real(input) -> Tensor + + Returns a new tensor containing real values of the :attr:`self` tensor. + The returned tensor and :attr:`self` share the same underlying storage. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> x=torch.randn(4, dtype=torch.cfloat) + >>> x + tensor([(0.3100+0.3553j), (-0.5445-0.7896j), (-1.6492-0.0633j), (-0.0638-0.8119j)]) + >>> x.real + tensor([ 0.3100, -0.5445, -1.6492, -0.0638]) + """ + ... +def reciprocal(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + reciprocal(input, *, out=None) -> Tensor + + Returns a new tensor with the reciprocal of the elements of :attr:`input` + + .. math:: + \text{out}_{i} = \frac{1}{\text{input}_{i}} + + .. note:: + Unlike NumPy's reciprocal, torch.reciprocal supports integral inputs. Integral + inputs to reciprocal are automatically :ref:`promoted ` to + the default scalar type. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-0.4595, -2.1219, -1.4314, 0.7298]) + >>> torch.reciprocal(a) + tensor([-2.1763, -0.4713, -0.6986, 1.3702]) + """ + ... +def reciprocal_(input: Tensor) -> Tensor: ... +def relu(input: Tensor) -> Tensor: ... +def relu_(input: Tensor) -> Tensor: ... +@overload +def remainder(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + remainder(input, other, *, out=None) -> Tensor + + Computes + `Python's modulus operation `_ + entrywise. The result has the same sign as the divisor :attr:`other` and its absolute value + is less than that of :attr:`other`. + + It may also be defined in terms of :func:`torch.div` as + + .. code:: python + + torch.remainder(a, b) == a - a.div(b, rounding_mode="floor") * b + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer and float inputs. + + .. note:: + Complex inputs are not supported. 
In some cases, it is not mathematically + possible to satisfy the definition of a modulo operation with complex numbers. + See :func:`torch.fmod` for how division by zero is handled. + + .. seealso:: + + :func:`torch.fmod` which implements C++'s `std::fmod `_. + This one is defined in terms of division rounding towards zero. + + Args: + input (Tensor or Scalar): the dividend + other (Tensor or Scalar): the divisor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.remainder(torch.tensor([-3., -2, -1, 1, 2, 3]), 2) + tensor([ 1., 0., 1., 1., 0., 1.]) + >>> torch.remainder(torch.tensor([1, 2, 3, 4, 5]), -1.5) + tensor([ -0.5000, -1.0000, 0.0000, -0.5000, -1.0000 ]) + """ + ... +@overload +def remainder(self: Union[Number, _complex], other: Tensor) -> Tensor: + r""" + remainder(input, other, *, out=None) -> Tensor + + Computes + `Python's modulus operation `_ + entrywise. The result has the same sign as the divisor :attr:`other` and its absolute value + is less than that of :attr:`other`. + + It may also be defined in terms of :func:`torch.div` as + + .. code:: python + + torch.remainder(a, b) == a - a.div(b, rounding_mode="floor") * b + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer and float inputs. + + .. note:: + Complex inputs are not supported. In some cases, it is not mathematically + possible to satisfy the definition of a modulo operation with complex numbers. + See :func:`torch.fmod` for how division by zero is handled. + + .. seealso:: + + :func:`torch.fmod` which implements C++'s `std::fmod `_. + This one is defined in terms of division rounding towards zero. + + Args: + input (Tensor or Scalar): the dividend + other (Tensor or Scalar): the divisor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.remainder(torch.tensor([-3., -2, -1, 1, 2, 3]), 2) + tensor([ 1., 0., 1., 1., 0., 1.]) + >>> torch.remainder(torch.tensor([1, 2, 3, 4, 5]), -1.5) + tensor([ -0.5000, -1.0000, 0.0000, -0.5000, -1.0000 ]) + """ + ... +@overload +def remainder(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + remainder(input, other, *, out=None) -> Tensor + + Computes + `Python's modulus operation `_ + entrywise. The result has the same sign as the divisor :attr:`other` and its absolute value + is less than that of :attr:`other`. + + It may also be defined in terms of :func:`torch.div` as + + .. code:: python + + torch.remainder(a, b) == a - a.div(b, rounding_mode="floor") * b + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer and float inputs. + + .. note:: + Complex inputs are not supported. In some cases, it is not mathematically + possible to satisfy the definition of a modulo operation with complex numbers. + See :func:`torch.fmod` for how division by zero is handled. + + .. seealso:: + + :func:`torch.fmod` which implements C++'s `std::fmod `_. + This one is defined in terms of division rounding towards zero. + + Args: + input (Tensor or Scalar): the dividend + other (Tensor or Scalar): the divisor + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.remainder(torch.tensor([-3., -2, -1, 1, 2, 3]), 2) + tensor([ 1., 0., 1., 1., 0., 1.]) + >>> torch.remainder(torch.tensor([1, 2, 3, 4, 5]), -1.5) + tensor([ -0.5000, -1.0000, 0.0000, -0.5000, -1.0000 ]) + """ + ... 
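+# Sketch contrasting the remainder overloads above with torch.fmod: remainder
+# follows the divisor's sign (floor division), while fmod follows the dividend's
+# sign (truncation toward zero). The last line checks the identity quoted in the
+# docstring.
+x = torch.tensor([-3.0, -1.0, 3.0])
+assert torch.equal(torch.remainder(x, 2), torch.tensor([1.0, 1.0, 1.0]))
+assert torch.equal(torch.fmod(x, 2), torch.tensor([-1.0, -1.0, 1.0]))
+assert torch.equal(torch.remainder(x, 2), x - x.div(2, rounding_mode="floor") * 2)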
+def renorm(input: Tensor, p: Union[Number, _complex], dim: _int, maxnorm: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + renorm(input, p, dim, maxnorm, *, out=None) -> Tensor + + Returns a tensor where each sub-tensor of :attr:`input` along dimension + :attr:`dim` is normalized such that the `p`-norm of the sub-tensor is lower + than the value :attr:`maxnorm` + + .. note:: If the norm of a row is lower than `maxnorm`, the row is unchanged + + Args: + input (Tensor): the input tensor. + p (float): the power for the norm computation + dim (int): the dimension to slice over to get the sub-tensors + maxnorm (float): the maximum norm to keep each sub-tensor under + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> x = torch.ones(3, 3) + >>> x[1].fill_(2) + tensor([ 2., 2., 2.]) + >>> x[2].fill_(3) + tensor([ 3., 3., 3.]) + >>> x + tensor([[ 1., 1., 1.], + [ 2., 2., 2.], + [ 3., 3., 3.]]) + >>> torch.renorm(x, 1, 0, 5) + tensor([[ 1.0000, 1.0000, 1.0000], + [ 1.6667, 1.6667, 1.6667], + [ 1.6667, 1.6667, 1.6667]]) + """ + ... +@overload +def repeat_interleave(input: Tensor, repeats: Tensor, dim: Optional[_int] = None, *, output_size: Optional[Union[_int, SymInt]] = None) -> Tensor: + r""" + repeat_interleave(input, repeats, dim=None, *, output_size=None) -> Tensor + + Repeat elements of a tensor. + + .. warning:: + + This is different from :meth:`torch.Tensor.repeat` but similar to ``numpy.repeat``. + + Args: + input (Tensor): the input tensor. + repeats (Tensor or int): The number of repetitions for each element. + repeats is broadcasted to fit the shape of the given axis. + dim (int, optional): The dimension along which to repeat values. + By default, use the flattened input array, and return a flat output + array. + + Keyword args: + output_size (int, optional): Total output size for the given axis + ( e.g. sum of repeats). If given, it will avoid stream synchronization + needed to calculate output shape of the tensor. + + Returns: + Tensor: Repeated tensor which has the same shape as input, except along the given axis. + + Example:: + + >>> x = torch.tensor([1, 2, 3]) + >>> x.repeat_interleave(2) + tensor([1, 1, 2, 2, 3, 3]) + >>> y = torch.tensor([[1, 2], [3, 4]]) + >>> torch.repeat_interleave(y, 2) + tensor([1, 1, 2, 2, 3, 3, 4, 4]) + >>> torch.repeat_interleave(y, 3, dim=1) + tensor([[1, 1, 1, 2, 2, 2], + [3, 3, 3, 4, 4, 4]]) + >>> torch.repeat_interleave(y, torch.tensor([1, 2]), dim=0) + tensor([[1, 2], + [3, 4], + [3, 4]]) + >>> torch.repeat_interleave(y, torch.tensor([1, 2]), dim=0, output_size=3) + tensor([[1, 2], + [3, 4], + [3, 4]]) + + If the `repeats` is `tensor([n1, n2, n3, ...])`, then the output will be + `tensor([0, 0, ..., 1, 1, ..., 2, 2, ..., ...])` where `0` appears `n1` times, + `1` appears `n2` times, `2` appears `n3` times, etc. + + .. function:: repeat_interleave(repeats, *) -> Tensor + :noindex: + + Repeats 0 repeats[0] times, 1 repeats[1] times, 2 repeats[2] times, etc. + + Args: + repeats (Tensor): The number of repetitions for each element. + + Returns: + Tensor: Repeated tensor of size `sum(repeats)`. + + Example:: + + >>> torch.repeat_interleave(torch.tensor([1, 2, 3])) + tensor([0, 1, 1, 2, 2, 2]) + """ + ... +@overload +def repeat_interleave(repeats: Tensor, *, output_size: Optional[Union[_int, SymInt]] = None) -> Tensor: + r""" + repeat_interleave(input, repeats, dim=None, *, output_size=None) -> Tensor + + Repeat elements of a tensor. + + .. 
warning:: + + This is different from :meth:`torch.Tensor.repeat` but similar to ``numpy.repeat``. + + Args: + input (Tensor): the input tensor. + repeats (Tensor or int): The number of repetitions for each element. + repeats is broadcasted to fit the shape of the given axis. + dim (int, optional): The dimension along which to repeat values. + By default, use the flattened input array, and return a flat output + array. + + Keyword args: + output_size (int, optional): Total output size for the given axis + ( e.g. sum of repeats). If given, it will avoid stream synchronization + needed to calculate output shape of the tensor. + + Returns: + Tensor: Repeated tensor which has the same shape as input, except along the given axis. + + Example:: + + >>> x = torch.tensor([1, 2, 3]) + >>> x.repeat_interleave(2) + tensor([1, 1, 2, 2, 3, 3]) + >>> y = torch.tensor([[1, 2], [3, 4]]) + >>> torch.repeat_interleave(y, 2) + tensor([1, 1, 2, 2, 3, 3, 4, 4]) + >>> torch.repeat_interleave(y, 3, dim=1) + tensor([[1, 1, 1, 2, 2, 2], + [3, 3, 3, 4, 4, 4]]) + >>> torch.repeat_interleave(y, torch.tensor([1, 2]), dim=0) + tensor([[1, 2], + [3, 4], + [3, 4]]) + >>> torch.repeat_interleave(y, torch.tensor([1, 2]), dim=0, output_size=3) + tensor([[1, 2], + [3, 4], + [3, 4]]) + + If the `repeats` is `tensor([n1, n2, n3, ...])`, then the output will be + `tensor([0, 0, ..., 1, 1, ..., 2, 2, ..., ...])` where `0` appears `n1` times, + `1` appears `n2` times, `2` appears `n3` times, etc. + + .. function:: repeat_interleave(repeats, *) -> Tensor + :noindex: + + Repeats 0 repeats[0] times, 1 repeats[1] times, 2 repeats[2] times, etc. + + Args: + repeats (Tensor): The number of repetitions for each element. + + Returns: + Tensor: Repeated tensor of size `sum(repeats)`. + + Example:: + + >>> torch.repeat_interleave(torch.tensor([1, 2, 3])) + tensor([0, 1, 1, 2, 2, 2]) + """ + ... +@overload +def repeat_interleave(input: Tensor, repeats: Union[_int, SymInt], dim: Optional[_int] = None, *, output_size: Optional[Union[_int, SymInt]] = None) -> Tensor: + r""" + repeat_interleave(input, repeats, dim=None, *, output_size=None) -> Tensor + + Repeat elements of a tensor. + + .. warning:: + + This is different from :meth:`torch.Tensor.repeat` but similar to ``numpy.repeat``. + + Args: + input (Tensor): the input tensor. + repeats (Tensor or int): The number of repetitions for each element. + repeats is broadcasted to fit the shape of the given axis. + dim (int, optional): The dimension along which to repeat values. + By default, use the flattened input array, and return a flat output + array. + + Keyword args: + output_size (int, optional): Total output size for the given axis + ( e.g. sum of repeats). If given, it will avoid stream synchronization + needed to calculate output shape of the tensor. + + Returns: + Tensor: Repeated tensor which has the same shape as input, except along the given axis. 
+ + Example:: + + >>> x = torch.tensor([1, 2, 3]) + >>> x.repeat_interleave(2) + tensor([1, 1, 2, 2, 3, 3]) + >>> y = torch.tensor([[1, 2], [3, 4]]) + >>> torch.repeat_interleave(y, 2) + tensor([1, 1, 2, 2, 3, 3, 4, 4]) + >>> torch.repeat_interleave(y, 3, dim=1) + tensor([[1, 1, 1, 2, 2, 2], + [3, 3, 3, 4, 4, 4]]) + >>> torch.repeat_interleave(y, torch.tensor([1, 2]), dim=0) + tensor([[1, 2], + [3, 4], + [3, 4]]) + >>> torch.repeat_interleave(y, torch.tensor([1, 2]), dim=0, output_size=3) + tensor([[1, 2], + [3, 4], + [3, 4]]) + + If the `repeats` is `tensor([n1, n2, n3, ...])`, then the output will be + `tensor([0, 0, ..., 1, 1, ..., 2, 2, ..., ...])` where `0` appears `n1` times, + `1` appears `n2` times, `2` appears `n3` times, etc. + + .. function:: repeat_interleave(repeats, *) -> Tensor + :noindex: + + Repeats 0 repeats[0] times, 1 repeats[1] times, 2 repeats[2] times, etc. + + Args: + repeats (Tensor): The number of repetitions for each element. + + Returns: + Tensor: Repeated tensor of size `sum(repeats)`. + + Example:: + + >>> torch.repeat_interleave(torch.tensor([1, 2, 3])) + tensor([0, 1, 1, 2, 2, 2]) + """ + ... +def reshape(input: Tensor, shape: Sequence[Union[_int, SymInt]]) -> Tensor: + r""" + reshape(input, shape) -> Tensor + + Returns a tensor with the same data and number of elements as :attr:`input`, + but with the specified shape. When possible, the returned tensor will be a view + of :attr:`input`. Otherwise, it will be a copy. Contiguous inputs and inputs + with compatible strides can be reshaped without copying, but you should not + depend on the copying vs. viewing behavior. + + See :meth:`torch.Tensor.view` on when it is possible to return a view. + + A single dimension may be -1, in which case it's inferred from the remaining + dimensions and the number of elements in :attr:`input`. + + Args: + input (Tensor): the tensor to be reshaped + shape (tuple of int): the new shape + + Example:: + + >>> a = torch.arange(4.) + >>> torch.reshape(a, (2, 2)) + tensor([[ 0., 1.], + [ 2., 3.]]) + >>> b = torch.tensor([[0, 1], [2, 3]]) + >>> torch.reshape(b, (-1,)) + tensor([ 0, 1, 2, 3]) + """ + ... +def resize_as_(input: Tensor, the_template: Tensor, *, memory_format: Optional[memory_format] = None) -> Tensor: ... +def resize_as_sparse_(input: Tensor, the_template: Tensor) -> Tensor: ... +def resolve_conj(input: Tensor) -> Tensor: + r""" + resolve_conj(input) -> Tensor + + Returns a new tensor with materialized conjugation if :attr:`input`'s conjugate bit is set to `True`, + else returns :attr:`input`. The output tensor will always have its conjugate bit set to `False`. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> x = torch.tensor([-1 + 1j, -2 + 2j, 3 - 3j]) + >>> y = x.conj() + >>> y.is_conj() + True + >>> z = y.resolve_conj() + >>> z + tensor([-1 - 1j, -2 - 2j, 3 + 3j]) + >>> z.is_conj() + False + """ + ... +def resolve_neg(input: Tensor) -> Tensor: + r""" + resolve_neg(input) -> Tensor + + Returns a new tensor with materialized negation if :attr:`input`'s negative bit is set to `True`, + else returns :attr:`input`. The output tensor will always have its negative bit set to `False`. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> x = torch.tensor([-1 + 1j, -2 + 2j, 3 - 3j]) + >>> y = x.conj() + >>> z = y.imag + >>> z.is_neg() + True + >>> out = z.resolve_neg() + >>> out + tensor([-1., -2., 3.]) + >>> out.is_neg() + False + """ + ... 
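+# Sketch of the view-vs-copy behaviour described for torch.reshape above: a
+# contiguous input is typically reshaped as a view, while a transposed input
+# forces a copy. The docstring warns not to depend on which case occurs, so the
+# data_ptr comparisons below are illustrative rather than guaranteed.
+base = torch.arange(6)
+view = torch.reshape(base, (2, 3))
+view.data_ptr() == base.data_ptr()                    # True here: shares storage
+nc = torch.arange(6).reshape(2, 3).t()                # non-contiguous input
+flat = torch.reshape(nc, (6,))
+flat.data_ptr() != nc.data_ptr()                      # True here: reshape had to copy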
+@overload +def result_type(tensor: Tensor, other: Tensor) -> _dtype: + r""" + result_type(tensor1, tensor2) -> dtype + + Returns the :class:`torch.dtype` that would result from performing an arithmetic + operation on the provided input tensors. See type promotion :ref:`documentation ` + for more information on the type promotion logic. + + Args: + tensor1 (Tensor or Number): an input tensor or number + tensor2 (Tensor or Number): an input tensor or number + + Example:: + + >>> torch.result_type(torch.tensor([1, 2], dtype=torch.int), 1.0) + torch.float32 + >>> torch.result_type(torch.tensor([1, 2], dtype=torch.uint8), torch.tensor(1)) + torch.uint8 + """ + ... +@overload +def result_type(scalar: Union[Number, _complex], tensor: Tensor) -> _dtype: + r""" + result_type(tensor1, tensor2) -> dtype + + Returns the :class:`torch.dtype` that would result from performing an arithmetic + operation on the provided input tensors. See type promotion :ref:`documentation ` + for more information on the type promotion logic. + + Args: + tensor1 (Tensor or Number): an input tensor or number + tensor2 (Tensor or Number): an input tensor or number + + Example:: + + >>> torch.result_type(torch.tensor([1, 2], dtype=torch.int), 1.0) + torch.float32 + >>> torch.result_type(torch.tensor([1, 2], dtype=torch.uint8), torch.tensor(1)) + torch.uint8 + """ + ... +@overload +def result_type(tensor: Tensor, other: Union[Number, _complex]) -> _dtype: + r""" + result_type(tensor1, tensor2) -> dtype + + Returns the :class:`torch.dtype` that would result from performing an arithmetic + operation on the provided input tensors. See type promotion :ref:`documentation ` + for more information on the type promotion logic. + + Args: + tensor1 (Tensor or Number): an input tensor or number + tensor2 (Tensor or Number): an input tensor or number + + Example:: + + >>> torch.result_type(torch.tensor([1, 2], dtype=torch.int), 1.0) + torch.float32 + >>> torch.result_type(torch.tensor([1, 2], dtype=torch.uint8), torch.tensor(1)) + torch.uint8 + """ + ... +@overload +def result_type(scalar1: Union[Number, _complex], scalar2: Union[Number, _complex]) -> _dtype: + r""" + result_type(tensor1, tensor2) -> dtype + + Returns the :class:`torch.dtype` that would result from performing an arithmetic + operation on the provided input tensors. See type promotion :ref:`documentation ` + for more information on the type promotion logic. + + Args: + tensor1 (Tensor or Number): an input tensor or number + tensor2 (Tensor or Number): an input tensor or number + + Example:: + + >>> torch.result_type(torch.tensor([1, 2], dtype=torch.int), 1.0) + torch.float32 + >>> torch.result_type(torch.tensor([1, 2], dtype=torch.uint8), torch.tensor(1)) + torch.uint8 + """ + ... +@overload +def rnn_relu(data: Tensor, batch_sizes: Tensor, hx: Tensor, params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool) -> Tuple[Tensor, Tensor]: ... +@overload +def rnn_relu(input: Tensor, hx: Tensor, params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool, batch_first: _bool) -> Tuple[Tensor, Tensor]: ... +def rnn_relu_cell(input: Tensor, hx: Tensor, w_ih: Tensor, w_hh: Tensor, b_ih: Optional[Tensor] = None, b_hh: Optional[Tensor] = None) -> Tensor: ... 
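+# Sketch of what rnn_relu_cell is assumed to compute: the standard Elman RNN
+# cell update with a ReLU nonlinearity, using the usual (hidden_size, input_size)
+# and (hidden_size, hidden_size) weight layouts. Treat the formula as an
+# assumption; it is not spelled out in this stub.
+x, h = torch.randn(3, 4), torch.randn(3, 5)           # (batch, input), (batch, hidden)
+w_ih, w_hh = torch.randn(5, 4), torch.randn(5, 5)
+b_ih, b_hh = torch.randn(5), torch.randn(5)
+out = torch.rnn_relu_cell(x, h, w_ih, w_hh, b_ih, b_hh)
+ref = torch.relu(x @ w_ih.T + b_ih + h @ w_hh.T + b_hh)
+assert torch.allclose(out, ref)                       # expected under this assumption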
+@overload +def rnn_tanh(data: Tensor, batch_sizes: Tensor, hx: Tensor, params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool) -> Tuple[Tensor, Tensor]: ... +@overload +def rnn_tanh(input: Tensor, hx: Tensor, params: Union[Tuple[Tensor, ...], List[Tensor]], has_biases: _bool, num_layers: _int, dropout: _float, train: _bool, bidirectional: _bool, batch_first: _bool) -> Tuple[Tensor, Tensor]: ... +def rnn_tanh_cell(input: Tensor, hx: Tensor, w_ih: Tensor, w_hh: Tensor, b_ih: Optional[Tensor] = None, b_hh: Optional[Tensor] = None) -> Tensor: ... +def roll(input: Tensor, shifts: Union[Union[_int, SymInt], Sequence[Union[_int, SymInt]]], dims: Union[_int, _size] = ()) -> Tensor: + r""" + roll(input, shifts, dims=None) -> Tensor + + Roll the tensor :attr:`input` along the given dimension(s). Elements that are + shifted beyond the last position are re-introduced at the first position. If + :attr:`dims` is `None`, the tensor will be flattened before rolling and then + restored to the original shape. + + Args: + input (Tensor): the input tensor. + shifts (int or tuple of ints): The number of places by which the elements + of the tensor are shifted. If shifts is a tuple, dims must be a tuple of + the same size, and each dimension will be rolled by the corresponding + value + dims (int or tuple of ints): Axis along which to roll + + Example:: + + >>> x = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8]).view(4, 2) + >>> x + tensor([[1, 2], + [3, 4], + [5, 6], + [7, 8]]) + >>> torch.roll(x, 1) + tensor([[8, 1], + [2, 3], + [4, 5], + [6, 7]]) + >>> torch.roll(x, 1, 0) + tensor([[7, 8], + [1, 2], + [3, 4], + [5, 6]]) + >>> torch.roll(x, -1, 0) + tensor([[3, 4], + [5, 6], + [7, 8], + [1, 2]]) + >>> torch.roll(x, shifts=(2, 1), dims=(0, 1)) + tensor([[6, 5], + [8, 7], + [2, 1], + [4, 3]]) + """ + ... +def rot90(input: Tensor, k: _int = 1, dims: _size = (0,1)) -> Tensor: + r""" + rot90(input, k=1, dims=[0,1]) -> Tensor + + Rotate an n-D tensor by 90 degrees in the plane specified by dims axis. + Rotation direction is from the first towards the second axis if k > 0, and from the second towards the first for k < 0. + + Args: + input (Tensor): the input tensor. + k (int): number of times to rotate. Default value is 1 + dims (a list or tuple): axis to rotate. Default value is [0, 1] + + Example:: + + >>> x = torch.arange(4).view(2, 2) + >>> x + tensor([[0, 1], + [2, 3]]) + >>> torch.rot90(x, 1, [0, 1]) + tensor([[1, 3], + [0, 2]]) + + >>> x = torch.arange(8).view(2, 2, 2) + >>> x + tensor([[[0, 1], + [2, 3]], + + [[4, 5], + [6, 7]]]) + >>> torch.rot90(x, 1, [1, 2]) + tensor([[[1, 3], + [0, 2]], + + [[5, 7], + [4, 6]]]) + """ + ... +@overload +def round(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + round(input, *, decimals=0, out=None) -> Tensor + + Rounds elements of :attr:`input` to the nearest integer. + + For integer inputs, follows the array-api convention of returning a + copy of the input tensor. + The return type of output is same as that of input's dtype. + + .. note:: + This function implements the "round half to even" to + break ties when a number is equidistant from two + integers (e.g. `round(2.5)` is 2). + + When the :attr:\`decimals\` argument is specified the + algorithm used is similar to NumPy's `around`. This + algorithm is fast but inexact and it can easily + overflow for low precision dtypes. + Eg. `round(tensor([10000], dtype=torch.float16), decimals=3)` is `inf`. + + .. 
seealso:: + :func:`torch.ceil`, which rounds up. + :func:`torch.floor`, which rounds down. + :func:`torch.trunc`, which rounds towards zero. + + Args: + input (Tensor): the input tensor. + decimals (int): Number of decimal places to round to (default: 0). + If decimals is negative, it specifies the number of positions + to the left of the decimal point. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.round(torch.tensor((4.7, -2.3, 9.1, -7.7))) + tensor([ 5., -2., 9., -8.]) + + >>> # Values equidistant from two integers are rounded towards the + >>> # the nearest even value (zero is treated as even) + >>> torch.round(torch.tensor([-0.5, 0.5, 1.5, 2.5])) + tensor([-0., 0., 2., 2.]) + + >>> # A positive decimals argument rounds to the to that decimal place + >>> torch.round(torch.tensor([0.1234567]), decimals=3) + tensor([0.1230]) + + >>> # A negative decimals argument rounds to the left of the decimal + >>> torch.round(torch.tensor([1200.1234567]), decimals=-3) + tensor([1000.]) + """ + ... +@overload +def round(input: Tensor, *, decimals: _int, out: Optional[Tensor] = None) -> Tensor: + r""" + round(input, *, decimals=0, out=None) -> Tensor + + Rounds elements of :attr:`input` to the nearest integer. + + For integer inputs, follows the array-api convention of returning a + copy of the input tensor. + The return type of output is same as that of input's dtype. + + .. note:: + This function implements the "round half to even" to + break ties when a number is equidistant from two + integers (e.g. `round(2.5)` is 2). + + When the :attr:\`decimals\` argument is specified the + algorithm used is similar to NumPy's `around`. This + algorithm is fast but inexact and it can easily + overflow for low precision dtypes. + Eg. `round(tensor([10000], dtype=torch.float16), decimals=3)` is `inf`. + + .. seealso:: + :func:`torch.ceil`, which rounds up. + :func:`torch.floor`, which rounds down. + :func:`torch.trunc`, which rounds towards zero. + + Args: + input (Tensor): the input tensor. + decimals (int): Number of decimal places to round to (default: 0). + If decimals is negative, it specifies the number of positions + to the left of the decimal point. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> torch.round(torch.tensor((4.7, -2.3, 9.1, -7.7))) + tensor([ 5., -2., 9., -8.]) + + >>> # Values equidistant from two integers are rounded towards the + >>> # the nearest even value (zero is treated as even) + >>> torch.round(torch.tensor([-0.5, 0.5, 1.5, 2.5])) + tensor([-0., 0., 2., 2.]) + + >>> # A positive decimals argument rounds to the to that decimal place + >>> torch.round(torch.tensor([0.1234567]), decimals=3) + tensor([0.1230]) + + >>> # A negative decimals argument rounds to the left of the decimal + >>> torch.round(torch.tensor([1200.1234567]), decimals=-3) + tensor([1000.]) + """ + ... +@overload +def round_(input: Tensor) -> Tensor: ... +@overload +def round_(input: Tensor, *, decimals: _int) -> Tensor: ... +def row_indices_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ... +def row_stack(tensors: Union[Tuple[Tensor, ...], List[Tensor]], *, out: Optional[Tensor] = None) -> Tensor: + r""" + row_stack(tensors, *, out=None) -> Tensor + + Alias of :func:`torch.vstack`. + """ + ... +def rrelu(input: Tensor, lower: Union[Number, _complex] = 0.125, upper: Union[Number, _complex] = 0.3333333333333333, training: _bool = False, generator: Optional[Generator] = None) -> Tensor: ... 
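+# Sketch of the assumed rrelu behaviour: with training=False (the default above)
+# the randomized slope is not sampled and negative inputs are scaled by the
+# midpoint of [lower, upper]; during training the slope would instead be drawn
+# uniformly from that interval.
+x = torch.tensor([-2.0, -0.5, 0.0, 1.5])
+mid = (0.125 + 1.0 / 3.0) / 2                         # midpoint of the default bounds
+ref = torch.where(x >= 0, x, x * mid)
+assert torch.allclose(torch.rrelu(x), ref)            # expected under this assumption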
+def rrelu_(input: Tensor, lower: Union[Number, _complex] = 0.125, upper: Union[Number, _complex] = 0.3333333333333333, training: _bool = False, generator: Optional[Generator] = None) -> Tensor: ... +def rsqrt(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + rsqrt(input, *, out=None) -> Tensor + + Returns a new tensor with the reciprocal of the square-root of each of + the elements of :attr:`input`. + + .. math:: + \text{out}_{i} = \frac{1}{\sqrt{\text{input}_{i}}} + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-0.0370, 0.2970, 1.5420, -0.9105]) + >>> torch.rsqrt(a) + tensor([ nan, 1.8351, 0.8053, nan]) + """ + ... +def rsqrt_(input: Tensor) -> Tensor: ... +@overload +def rsub(input: Tensor, other: Tensor, *, alpha: Union[Number, _complex] = 1) -> Tensor: ... +@overload +def rsub(input: Tensor, other: Union[Number, _complex], alpha: Union[Number, _complex] = 1) -> Tensor: ... +def saddmm(input: Tensor, mat1: Tensor, mat2: Tensor, *, beta: Number = 1, alpha: Number = 1, out: Optional[Tensor] = None) -> Tensor: ... +def scalar_tensor(s: Union[Number, _complex], *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: ... +@overload +def scatter(input: Tensor, dim: _int, index: Tensor, src: Tensor, *, reduce: str, out: Optional[Tensor] = None) -> Tensor: + r""" + scatter(input, dim, index, src) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_` + """ + ... +@overload +def scatter(input: Tensor, dim: _int, index: Tensor, src: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + scatter(input, dim, index, src) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_` + """ + ... +@overload +def scatter(input: Tensor, dim: _int, index: Tensor, value: Union[Number, _complex], *, reduce: str, out: Optional[Tensor] = None) -> Tensor: + r""" + scatter(input, dim, index, src) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_` + """ + ... +@overload +def scatter(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, src: Tensor) -> Tensor: + r""" + scatter(input, dim, index, src) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_` + """ + ... +@overload +def scatter(input: Tensor, dim: _int, index: Tensor, value: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + scatter(input, dim, index, src) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_` + """ + ... +@overload +def scatter(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, value: Union[Number, _complex]) -> Tensor: + r""" + scatter(input, dim, index, src) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_` + """ + ... +@overload +def scatter_add(input: Tensor, dim: _int, index: Tensor, src: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + scatter_add(input, dim, index, src) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_add_` + """ + ... +@overload +def scatter_add(input: Tensor, dim: Union[str, ellipsis, None], index: Tensor, src: Tensor) -> Tensor: + r""" + scatter_add(input, dim, index, src) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_add_` + """ + ... 
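+# Small sketch for the out-of-place scatter/scatter_add overloads above, whose
+# docstrings only point at the in-place counterparts; the expected values follow
+# from the in-place definitions (scatter overwrites, scatter_add accumulates).
+idx = torch.tensor([0, 1, 0, 2])
+src = torch.ones(4)
+assert torch.equal(torch.scatter_add(torch.zeros(3), 0, idx, src),
+                   torch.tensor([2.0, 1.0, 1.0]))     # index 0 receives two additions
+assert torch.equal(torch.scatter(torch.zeros(3), 0, idx, src),
+                   torch.tensor([1.0, 1.0, 1.0]))     # later writes simply overwrite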
+def scatter_reduce(input: Tensor, dim: _int, index: Tensor, src: Tensor, reduce: str, *, include_self: _bool = True, out: Optional[Tensor] = None) -> Tensor: + r""" + scatter_reduce(input, dim, index, src, reduce, *, include_self=True) -> Tensor + + Out-of-place version of :meth:`torch.Tensor.scatter_reduce_` + """ + ... +@overload +def searchsorted(sorted_sequence: Tensor, input: Tensor, *, out_int32: _bool = False, right: _bool = False, side: Optional[str] = None, sorter: Optional[Tensor] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + searchsorted(sorted_sequence, values, *, out_int32=False, right=False, side=None, out=None, sorter=None) -> Tensor + + Find the indices from the *innermost* dimension of :attr:`sorted_sequence` such that, if the + corresponding values in :attr:`values` were inserted before the indices, when sorted, the order + of the corresponding *innermost* dimension within :attr:`sorted_sequence` would be preserved. + Return a new tensor with the same size as :attr:`values`. More formally, + the returned index satisfies the following rules: + + .. list-table:: + :widths: 12 10 78 + :header-rows: 1 + + * - :attr:`sorted_sequence` + - :attr:`right` + - *returned index satisfies* + * - 1-D + - False + - ``sorted_sequence[i-1] < values[m][n]...[l][x] <= sorted_sequence[i]`` + * - 1-D + - True + - ``sorted_sequence[i-1] <= values[m][n]...[l][x] < sorted_sequence[i]`` + * - N-D + - False + - ``sorted_sequence[m][n]...[l][i-1] < values[m][n]...[l][x] <= sorted_sequence[m][n]...[l][i]`` + * - N-D + - True + - ``sorted_sequence[m][n]...[l][i-1] <= values[m][n]...[l][x] < sorted_sequence[m][n]...[l][i]`` + + Args: + sorted_sequence (Tensor): N-D or 1-D tensor, containing monotonically increasing sequence on the *innermost* + dimension unless :attr:`sorter` is provided, in which case the sequence does not + need to be sorted + values (Tensor or Scalar): N-D tensor or a Scalar containing the search value(s). + + Keyword args: + out_int32 (bool, optional): indicate the output data type. torch.int32 if True, torch.int64 otherwise. + Default value is False, i.e. default output data type is torch.int64. + right (bool, optional): if False, return the first suitable location that is found. If True, return the + last such index. If no suitable index found, return 0 for non-numerical value + (eg. nan, inf) or the size of *innermost* dimension within :attr:`sorted_sequence` + (one pass the last index of the *innermost* dimension). In other words, if False, + gets the lower bound index for each value in :attr:`values` on the corresponding + *innermost* dimension of the :attr:`sorted_sequence`. If True, gets the upper + bound index instead. Default value is False. :attr:`side` does the same and is + preferred. It will error if :attr:`side` is set to "left" while this is True. + side (str, optional): the same as :attr:`right` but preferred. "left" corresponds to False for :attr:`right` + and "right" corresponds to True for :attr:`right`. It will error if this is set to + "left" while :attr:`right` is True. Default value is None. + out (Tensor, optional): the output tensor, must be the same size as :attr:`values` if provided. 
+ sorter (LongTensor, optional): if provided, a tensor matching the shape of the unsorted + :attr:`sorted_sequence` containing a sequence of indices that sort it in the + ascending order on the innermost dimension + + + Example:: + + >>> sorted_sequence = torch.tensor([[1, 3, 5, 7, 9], [2, 4, 6, 8, 10]]) + >>> sorted_sequence + tensor([[ 1, 3, 5, 7, 9], + [ 2, 4, 6, 8, 10]]) + >>> values = torch.tensor([[3, 6, 9], [3, 6, 9]]) + >>> values + tensor([[3, 6, 9], + [3, 6, 9]]) + >>> torch.searchsorted(sorted_sequence, values) + tensor([[1, 3, 4], + [1, 2, 4]]) + >>> torch.searchsorted(sorted_sequence, values, side='right') + tensor([[2, 3, 5], + [1, 3, 4]]) + + >>> sorted_sequence_1d = torch.tensor([1, 3, 5, 7, 9]) + >>> sorted_sequence_1d + tensor([1, 3, 5, 7, 9]) + >>> torch.searchsorted(sorted_sequence_1d, values) + tensor([[1, 3, 4], + [1, 3, 4]]) + """ + ... +@overload +def searchsorted(sorted_sequence: Tensor, self: Union[Number, _complex], *, out_int32: _bool = False, right: _bool = False, side: Optional[str] = None, sorter: Optional[Tensor] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + searchsorted(sorted_sequence, values, *, out_int32=False, right=False, side=None, out=None, sorter=None) -> Tensor + + Find the indices from the *innermost* dimension of :attr:`sorted_sequence` such that, if the + corresponding values in :attr:`values` were inserted before the indices, when sorted, the order + of the corresponding *innermost* dimension within :attr:`sorted_sequence` would be preserved. + Return a new tensor with the same size as :attr:`values`. More formally, + the returned index satisfies the following rules: + + .. list-table:: + :widths: 12 10 78 + :header-rows: 1 + + * - :attr:`sorted_sequence` + - :attr:`right` + - *returned index satisfies* + * - 1-D + - False + - ``sorted_sequence[i-1] < values[m][n]...[l][x] <= sorted_sequence[i]`` + * - 1-D + - True + - ``sorted_sequence[i-1] <= values[m][n]...[l][x] < sorted_sequence[i]`` + * - N-D + - False + - ``sorted_sequence[m][n]...[l][i-1] < values[m][n]...[l][x] <= sorted_sequence[m][n]...[l][i]`` + * - N-D + - True + - ``sorted_sequence[m][n]...[l][i-1] <= values[m][n]...[l][x] < sorted_sequence[m][n]...[l][i]`` + + Args: + sorted_sequence (Tensor): N-D or 1-D tensor, containing monotonically increasing sequence on the *innermost* + dimension unless :attr:`sorter` is provided, in which case the sequence does not + need to be sorted + values (Tensor or Scalar): N-D tensor or a Scalar containing the search value(s). + + Keyword args: + out_int32 (bool, optional): indicate the output data type. torch.int32 if True, torch.int64 otherwise. + Default value is False, i.e. default output data type is torch.int64. + right (bool, optional): if False, return the first suitable location that is found. If True, return the + last such index. If no suitable index found, return 0 for non-numerical value + (eg. nan, inf) or the size of *innermost* dimension within :attr:`sorted_sequence` + (one pass the last index of the *innermost* dimension). In other words, if False, + gets the lower bound index for each value in :attr:`values` on the corresponding + *innermost* dimension of the :attr:`sorted_sequence`. If True, gets the upper + bound index instead. Default value is False. :attr:`side` does the same and is + preferred. It will error if :attr:`side` is set to "left" while this is True. + side (str, optional): the same as :attr:`right` but preferred. 
"left" corresponds to False for :attr:`right` + and "right" corresponds to True for :attr:`right`. It will error if this is set to + "left" while :attr:`right` is True. Default value is None. + out (Tensor, optional): the output tensor, must be the same size as :attr:`values` if provided. + sorter (LongTensor, optional): if provided, a tensor matching the shape of the unsorted + :attr:`sorted_sequence` containing a sequence of indices that sort it in the + ascending order on the innermost dimension + + + Example:: + + >>> sorted_sequence = torch.tensor([[1, 3, 5, 7, 9], [2, 4, 6, 8, 10]]) + >>> sorted_sequence + tensor([[ 1, 3, 5, 7, 9], + [ 2, 4, 6, 8, 10]]) + >>> values = torch.tensor([[3, 6, 9], [3, 6, 9]]) + >>> values + tensor([[3, 6, 9], + [3, 6, 9]]) + >>> torch.searchsorted(sorted_sequence, values) + tensor([[1, 3, 4], + [1, 2, 4]]) + >>> torch.searchsorted(sorted_sequence, values, side='right') + tensor([[2, 3, 5], + [1, 3, 4]]) + + >>> sorted_sequence_1d = torch.tensor([1, 3, 5, 7, 9]) + >>> sorted_sequence_1d + tensor([1, 3, 5, 7, 9]) + >>> torch.searchsorted(sorted_sequence_1d, values) + tensor([[1, 3, 4], + [1, 3, 4]]) + """ + ... +def segment_reduce(data: Tensor, reduce: str, *, lengths: Optional[Tensor] = None, indices: Optional[Tensor] = None, offsets: Optional[Tensor] = None, axis: _int = 0, unsafe: _bool = False, initial: Optional[Union[Number, _complex]] = None) -> Tensor: ... +@overload +def select(input: Tensor, dim: _int, index: Union[_int, SymInt]) -> Tensor: + r""" + select(input, dim, index) -> Tensor + + Slices the :attr:`input` tensor along the selected dimension at the given index. + This function returns a view of the original tensor with the given dimension removed. + + .. note:: If :attr:`input` is a sparse tensor and returning a view of + the tensor is not possible, a RuntimeError exception is + raised. In this is the case, consider using + :func:`torch.select_copy` function. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to slice + index (int): the index to select with + + .. note:: + + :meth:`select` is equivalent to slicing. For example, + ``tensor.select(0, index)`` is equivalent to ``tensor[index]`` and + ``tensor.select(2, index)`` is equivalent to ``tensor[:,:,index]``. + """ + ... +@overload +def select(input: Tensor, dim: Union[str, ellipsis, None], index: _int) -> Tensor: + r""" + select(input, dim, index) -> Tensor + + Slices the :attr:`input` tensor along the selected dimension at the given index. + This function returns a view of the original tensor with the given dimension removed. + + .. note:: If :attr:`input` is a sparse tensor and returning a view of + the tensor is not possible, a RuntimeError exception is + raised. In this is the case, consider using + :func:`torch.select_copy` function. + + Args: + input (Tensor): the input tensor. + dim (int): the dimension to slice + index (int): the index to select with + + .. note:: + + :meth:`select` is equivalent to slicing. For example, + ``tensor.select(0, index)`` is equivalent to ``tensor[index]`` and + ``tensor.select(2, index)`` is equivalent to ``tensor[:,:,index]``. + """ + ... +def select_copy(input: Tensor, dim: _int, index: Union[_int, SymInt], *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.select`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... 
+def select_scatter(input: Tensor, src: Tensor, dim: _int, index: Union[_int, SymInt]) -> Tensor: + r""" + select_scatter(input, src, dim, index) -> Tensor + + Embeds the values of the :attr:`src` tensor into :attr:`input` at the given index. + This function returns a tensor with fresh storage; it does not create a view. + + + Args: + input (Tensor): the input tensor. + src (Tensor): The tensor to embed into :attr:`input` + dim (int): the dimension to insert the slice into. + index (int): the index to select with + + .. note:: + + :attr:`src` must be of the proper size in order to be embedded + into :attr:`input`. Specifically, it should have the same shape as + ``torch.select(input, dim, index)`` + + Example:: + + >>> a = torch.zeros(2, 2) + >>> b = torch.ones(2) + >>> a.select_scatter(b, 0, 0) + tensor([[1., 1.], + [0., 0.]]) + """ + ... +def selu(input: Tensor) -> Tensor: ... +def selu_(input: Tensor) -> Tensor: ... +def set_flush_denormal(mode: _bool) -> _bool: + r""" + set_flush_denormal(mode) -> bool + + Disables denormal floating numbers on CPU. + + Returns ``True`` if your system supports flushing denormal numbers and it + successfully configures flush denormal mode. :meth:`~torch.set_flush_denormal` + is supported on x86 architectures supporting SSE3 and AArch64 architecture. + + Args: + mode (bool): Controls whether to enable flush denormal mode or not + + Example:: + + >>> torch.set_flush_denormal(True) + True + >>> torch.tensor([1e-323], dtype=torch.float64) + tensor([ 0.], dtype=torch.float64) + >>> torch.set_flush_denormal(False) + True + >>> torch.tensor([1e-323], dtype=torch.float64) + tensor(9.88131e-324 * + [ 1.0000], dtype=torch.float64) + """ + ... +def set_num_interop_threads(num: _int) -> None: + r""" + set_num_interop_threads(int) + + Sets the number of threads used for interop parallelism + (e.g. in JIT interpreter) on CPU. + + .. warning:: + Can only be called once and before any inter-op parallel work + is started (e.g. JIT execution). + """ + ... +def set_num_threads(num: _int) -> None: + r""" + set_num_threads(int) + + Sets the number of threads used for intraop parallelism on CPU. + + .. warning:: + To ensure that the correct number of threads is used, set_num_threads + must be called before running eager, JIT or autograd code. + """ + ... +def sgn(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + sgn(input, *, out=None) -> Tensor + + This function is an extension of torch.sign() to complex tensors. + It computes a new tensor whose elements have + the same angles as the corresponding elements of :attr:`input` and + absolute values (i.e. magnitudes) of one for complex tensors and + is equivalent to torch.sign() for non-complex tensors. + + .. math:: + \text{out}_{i} = \begin{cases} + 0 & |\text{{input}}_i| == 0 \\ + \frac{{\text{{input}}_i}}{|{\text{{input}}_i}|} & \text{otherwise} + \end{cases} + + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.tensor([3+4j, 7-24j, 0, 1+2j]) + >>> t.sgn() + tensor([0.6000+0.8000j, 0.2800-0.9600j, 0.0000+0.0000j, 0.4472+0.8944j]) + """ + ... +def sigmoid(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + sigmoid(input, *, out=None) -> Tensor + + Alias for :func:`torch.special.expit`. + """ + ... +def sigmoid_(input: Tensor) -> Tensor: ... 
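+# Usage sketch (illustrative, assumed example values): for nonzero complex entries the
+# torch.sgn() formula documented above reduces to z / |z|, which can be checked directly.
+#
+#     z = torch.tensor([3 + 4j, -2j])
+#     assert torch.allclose(torch.sgn(z), z / z.abs())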
+def sign(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + sign(input, *, out=None) -> Tensor + + Returns a new tensor with the signs of the elements of :attr:`input`. + + .. math:: + \text{out}_{i} = \operatorname{sgn}(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([0.7, -1.2, 0., 2.3]) + >>> a + tensor([ 0.7000, -1.2000, 0.0000, 2.3000]) + >>> torch.sign(a) + tensor([ 1., -1., 0., 1.]) + """ + ... +def signbit(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + signbit(input, *, out=None) -> Tensor + + Tests if each element of :attr:`input` has its sign bit set or not. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([0.7, -1.2, 0., 2.3]) + >>> torch.signbit(a) + tensor([ False, True, False, False]) + >>> a = torch.tensor([-0.0, 0.0]) + >>> torch.signbit(a) + tensor([ True, False]) + + .. note:: + signbit handles signed zeros, so negative zero (-0) returns True. + """ + ... +def sin(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + sin(input, *, out=None) -> Tensor + + Returns a new tensor with the sine of the elements of :attr:`input`. + + .. math:: + \text{out}_{i} = \sin(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-0.5461, 0.1347, -2.7266, -0.2746]) + >>> torch.sin(a) + tensor([-0.5194, 0.1343, -0.4032, -0.2711]) + """ + ... +def sin_(input: Tensor) -> Tensor: ... +def sinc(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + sinc(input, *, out=None) -> Tensor + + Alias for :func:`torch.special.sinc`. + """ + ... +def sinc_(input: Tensor) -> Tensor: ... +def sinh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + sinh(input, *, out=None) -> Tensor + + Returns a new tensor with the hyperbolic sine of the elements of + :attr:`input`. + + .. math:: + \text{out}_{i} = \sinh(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.5380, -0.8632, -0.1265, 0.9399]) + >>> torch.sinh(a) + tensor([ 0.5644, -0.9744, -0.1268, 1.0845]) + + .. note:: + When :attr:`input` is on the CPU, the implementation of torch.sinh may use + the Sleef library, which rounds very large results to infinity or negative + infinity. See `here `_ for details. + """ + ... +def sinh_(input: Tensor) -> Tensor: ... +def slice_copy(input: Tensor, dim: _int = 0, start: Optional[Union[_int, SymInt]] = None, end: Optional[Union[_int, SymInt]] = None, step: Union[_int, SymInt] = 1, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.slice`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def slice_inverse(input: Tensor, src: Tensor, dim: _int = 0, start: Optional[Union[_int, SymInt]] = None, end: Optional[Union[_int, SymInt]] = None, step: Union[_int, SymInt] = 1) -> Tensor: ... 
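+# Usage sketch (illustrative, assumed example values): torch.sinh() documented above
+# satisfies the elementwise identity sinh(x) = (exp(x) - exp(-x)) / 2.
+#
+#     x = torch.linspace(-2.0, 2.0, 5)
+#     assert torch.allclose(torch.sinh(x), (torch.exp(x) - torch.exp(-x)) / 2)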
+def slice_scatter(input: Tensor, src: Tensor, dim: _int = 0, start: Optional[Union[_int, SymInt]] = None, end: Optional[Union[_int, SymInt]] = None, step: Union[_int, SymInt] = 1, *, out: Optional[Tensor] = None) -> Tensor:
+    r"""
+    slice_scatter(input, src, dim=0, start=None, end=None, step=1) -> Tensor
+
+    Embeds the values of the :attr:`src` tensor into :attr:`input` at the given
+    dimension.
+    This function returns a tensor with fresh storage; it does not create a view.
+
+
+    Args:
+        input (Tensor): the input tensor.
+        src (Tensor): The tensor to embed into :attr:`input`
+        dim (int): the dimension to insert the slice into
+        start (Optional[int]): the start index of where to insert the slice
+        end (Optional[int]): the end index of where to insert the slice
+        step (int): the step (stride) between positions at which the slice is inserted
+
+    Example::
+
+        >>> a = torch.zeros(8, 8)
+        >>> b = torch.ones(2, 8)
+        >>> a.slice_scatter(b, start=6)
+        tensor([[0., 0., 0., 0., 0., 0., 0., 0.],
+                [0., 0., 0., 0., 0., 0., 0., 0.],
+                [0., 0., 0., 0., 0., 0., 0., 0.],
+                [0., 0., 0., 0., 0., 0., 0., 0.],
+                [0., 0., 0., 0., 0., 0., 0., 0.],
+                [0., 0., 0., 0., 0., 0., 0., 0.],
+                [1., 1., 1., 1., 1., 1., 1., 1.],
+                [1., 1., 1., 1., 1., 1., 1., 1.]])
+
+        >>> b = torch.ones(8, 2)
+        >>> a.slice_scatter(b, dim=1, start=2, end=6, step=2)
+        tensor([[0., 0., 1., 0., 1., 0., 0., 0.],
+                [0., 0., 1., 0., 1., 0., 0., 0.],
+                [0., 0., 1., 0., 1., 0., 0., 0.],
+                [0., 0., 1., 0., 1., 0., 0., 0.],
+                [0., 0., 1., 0., 1., 0., 0., 0.],
+                [0., 0., 1., 0., 1., 0., 0., 0.],
+                [0., 0., 1., 0., 1., 0., 0., 0.],
+                [0., 0., 1., 0., 1., 0., 0., 0.]])
+    """
+    ...
+def slogdet(input: Tensor, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.slogdet:
+    r"""
+    slogdet(input) -> (Tensor, Tensor)
+
+    Alias for :func:`torch.linalg.slogdet`
+    """
+    ...
+def smm(input: Tensor, mat2: Tensor) -> Tensor:
+    r"""
+    smm(input, mat) -> Tensor
+
+    Performs a matrix multiplication of the sparse matrix :attr:`input`
+    with the dense matrix :attr:`mat`.
+
+    Args:
+        input (Tensor): a sparse matrix to be matrix multiplied
+        mat (Tensor): a dense matrix to be matrix multiplied
+    """
+    ...
+@overload
+def softmax(input: Tensor, dim: _int, dtype: Optional[_dtype] = None, *, out: Optional[Tensor] = None) -> Tensor:
+    r"""
+    softmax(input, dim, *, dtype=None) -> Tensor
+
+    Alias for :func:`torch.nn.functional.softmax`.
+    """
+    ...
+@overload
+def softmax(input: Tensor, dim: Union[str, ellipsis, None], *, dtype: Optional[_dtype] = None) -> Tensor:
+    r"""
+    softmax(input, dim, *, dtype=None) -> Tensor
+
+    Alias for :func:`torch.nn.functional.softmax`.
+    """
+    ...
+@overload
+def sort(input: Tensor, *, stable: Optional[_bool], dim: _int = -1, descending: _bool = False, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.sort:
+    r"""
+    sort(input, dim=-1, descending=False, stable=False, *, out=None) -> (Tensor, LongTensor)
+
+    Sorts the elements of the :attr:`input` tensor along a given dimension
+    in ascending order by value.
+
+    If :attr:`dim` is not given, the last dimension of the `input` is chosen.
+
+    If :attr:`descending` is ``True`` then the elements are sorted in descending
+    order by value.
+
+    If :attr:`stable` is ``True`` then the sorting routine becomes stable, preserving
+    the order of equivalent elements.
+
+    A namedtuple of (values, indices) is returned, where the `values` are the
+    sorted values and `indices` are the indices of the elements in the original
+    `input` tensor.
+ + Args: + input (Tensor): the input tensor. + dim (int, optional): the dimension to sort along + descending (bool, optional): controls the sorting order (ascending or descending) + stable (bool, optional): makes the sorting routine stable, which guarantees that the order + of equivalent elements is preserved. + + Keyword args: + out (tuple, optional): the output tuple of (`Tensor`, `LongTensor`) that can + be optionally given to be used as output buffers + + Example:: + + >>> x = torch.randn(3, 4) + >>> sorted, indices = torch.sort(x) + >>> sorted + tensor([[-0.2162, 0.0608, 0.6719, 2.3332], + [-0.5793, 0.0061, 0.6058, 0.9497], + [-0.5071, 0.3343, 0.9553, 1.0960]]) + >>> indices + tensor([[ 1, 0, 2, 3], + [ 3, 1, 0, 2], + [ 0, 3, 1, 2]]) + + >>> sorted, indices = torch.sort(x, 0) + >>> sorted + tensor([[-0.5071, -0.2162, 0.6719, -0.5793], + [ 0.0608, 0.0061, 0.9497, 0.3343], + [ 0.6058, 0.9553, 1.0960, 2.3332]]) + >>> indices + tensor([[ 2, 0, 0, 1], + [ 0, 1, 1, 2], + [ 1, 2, 2, 0]]) + >>> x = torch.tensor([0, 1] * 9) + >>> x.sort() + torch.return_types.sort( + values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), + indices=tensor([ 2, 16, 4, 6, 14, 8, 0, 10, 12, 9, 17, 15, 13, 11, 7, 5, 3, 1])) + >>> x.sort(stable=True) + torch.return_types.sort( + values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), + indices=tensor([ 0, 2, 4, 6, 8, 10, 12, 14, 16, 1, 3, 5, 7, 9, 11, 13, 15, 17])) + """ + ... +@overload +def sort(input: Tensor, dim: _int = -1, descending: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.sort: + r""" + sort(input, dim=-1, descending=False, stable=False, *, out=None) -> (Tensor, LongTensor) + + Sorts the elements of the :attr:`input` tensor along a given dimension + in ascending order by value. + + If :attr:`dim` is not given, the last dimension of the `input` is chosen. + + If :attr:`descending` is ``True`` then the elements are sorted in descending + order by value. + + If :attr:`stable` is ``True`` then the sorting routine becomes stable, preserving + the order of equivalent elements. + + A namedtuple of (values, indices) is returned, where the `values` are the + sorted values and `indices` are the indices of the elements in the original + `input` tensor. + + Args: + input (Tensor): the input tensor. + dim (int, optional): the dimension to sort along + descending (bool, optional): controls the sorting order (ascending or descending) + stable (bool, optional): makes the sorting routine stable, which guarantees that the order + of equivalent elements is preserved. 
+ + Keyword args: + out (tuple, optional): the output tuple of (`Tensor`, `LongTensor`) that can + be optionally given to be used as output buffers + + Example:: + + >>> x = torch.randn(3, 4) + >>> sorted, indices = torch.sort(x) + >>> sorted + tensor([[-0.2162, 0.0608, 0.6719, 2.3332], + [-0.5793, 0.0061, 0.6058, 0.9497], + [-0.5071, 0.3343, 0.9553, 1.0960]]) + >>> indices + tensor([[ 1, 0, 2, 3], + [ 3, 1, 0, 2], + [ 0, 3, 1, 2]]) + + >>> sorted, indices = torch.sort(x, 0) + >>> sorted + tensor([[-0.5071, -0.2162, 0.6719, -0.5793], + [ 0.0608, 0.0061, 0.9497, 0.3343], + [ 0.6058, 0.9553, 1.0960, 2.3332]]) + >>> indices + tensor([[ 2, 0, 0, 1], + [ 0, 1, 1, 2], + [ 1, 2, 2, 0]]) + >>> x = torch.tensor([0, 1] * 9) + >>> x.sort() + torch.return_types.sort( + values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), + indices=tensor([ 2, 16, 4, 6, 14, 8, 0, 10, 12, 9, 17, 15, 13, 11, 7, 5, 3, 1])) + >>> x.sort(stable=True) + torch.return_types.sort( + values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), + indices=tensor([ 0, 2, 4, 6, 8, 10, 12, 14, 16, 1, 3, 5, 7, 9, 11, 13, 15, 17])) + """ + ... +@overload +def sort(input: Tensor, *, stable: Optional[_bool], dim: Union[str, ellipsis, None], descending: _bool = False, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.sort: + r""" + sort(input, dim=-1, descending=False, stable=False, *, out=None) -> (Tensor, LongTensor) + + Sorts the elements of the :attr:`input` tensor along a given dimension + in ascending order by value. + + If :attr:`dim` is not given, the last dimension of the `input` is chosen. + + If :attr:`descending` is ``True`` then the elements are sorted in descending + order by value. + + If :attr:`stable` is ``True`` then the sorting routine becomes stable, preserving + the order of equivalent elements. + + A namedtuple of (values, indices) is returned, where the `values` are the + sorted values and `indices` are the indices of the elements in the original + `input` tensor. + + Args: + input (Tensor): the input tensor. + dim (int, optional): the dimension to sort along + descending (bool, optional): controls the sorting order (ascending or descending) + stable (bool, optional): makes the sorting routine stable, which guarantees that the order + of equivalent elements is preserved. + + Keyword args: + out (tuple, optional): the output tuple of (`Tensor`, `LongTensor`) that can + be optionally given to be used as output buffers + + Example:: + + >>> x = torch.randn(3, 4) + >>> sorted, indices = torch.sort(x) + >>> sorted + tensor([[-0.2162, 0.0608, 0.6719, 2.3332], + [-0.5793, 0.0061, 0.6058, 0.9497], + [-0.5071, 0.3343, 0.9553, 1.0960]]) + >>> indices + tensor([[ 1, 0, 2, 3], + [ 3, 1, 0, 2], + [ 0, 3, 1, 2]]) + + >>> sorted, indices = torch.sort(x, 0) + >>> sorted + tensor([[-0.5071, -0.2162, 0.6719, -0.5793], + [ 0.0608, 0.0061, 0.9497, 0.3343], + [ 0.6058, 0.9553, 1.0960, 2.3332]]) + >>> indices + tensor([[ 2, 0, 0, 1], + [ 0, 1, 1, 2], + [ 1, 2, 2, 0]]) + >>> x = torch.tensor([0, 1] * 9) + >>> x.sort() + torch.return_types.sort( + values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), + indices=tensor([ 2, 16, 4, 6, 14, 8, 0, 10, 12, 9, 17, 15, 13, 11, 7, 5, 3, 1])) + >>> x.sort(stable=True) + torch.return_types.sort( + values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), + indices=tensor([ 0, 2, 4, 6, 8, 10, 12, 14, 16, 1, 3, 5, 7, 9, 11, 13, 15, 17])) + """ + ... 
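+# Usage sketch (illustrative, assumed shapes): the two outputs of torch.sort() documented
+# above are related by gather(): indexing `input` with `indices` along the sorted
+# dimension reproduces `values`.
+#
+#     x = torch.randn(3, 4)
+#     values, indices = torch.sort(x, dim=-1)
+#     assert torch.equal(values, x.gather(-1, indices))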
+@overload +def sort(input: Tensor, dim: Union[str, ellipsis, None], descending: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.sort: + r""" + sort(input, dim=-1, descending=False, stable=False, *, out=None) -> (Tensor, LongTensor) + + Sorts the elements of the :attr:`input` tensor along a given dimension + in ascending order by value. + + If :attr:`dim` is not given, the last dimension of the `input` is chosen. + + If :attr:`descending` is ``True`` then the elements are sorted in descending + order by value. + + If :attr:`stable` is ``True`` then the sorting routine becomes stable, preserving + the order of equivalent elements. + + A namedtuple of (values, indices) is returned, where the `values` are the + sorted values and `indices` are the indices of the elements in the original + `input` tensor. + + Args: + input (Tensor): the input tensor. + dim (int, optional): the dimension to sort along + descending (bool, optional): controls the sorting order (ascending or descending) + stable (bool, optional): makes the sorting routine stable, which guarantees that the order + of equivalent elements is preserved. + + Keyword args: + out (tuple, optional): the output tuple of (`Tensor`, `LongTensor`) that can + be optionally given to be used as output buffers + + Example:: + + >>> x = torch.randn(3, 4) + >>> sorted, indices = torch.sort(x) + >>> sorted + tensor([[-0.2162, 0.0608, 0.6719, 2.3332], + [-0.5793, 0.0061, 0.6058, 0.9497], + [-0.5071, 0.3343, 0.9553, 1.0960]]) + >>> indices + tensor([[ 1, 0, 2, 3], + [ 3, 1, 0, 2], + [ 0, 3, 1, 2]]) + + >>> sorted, indices = torch.sort(x, 0) + >>> sorted + tensor([[-0.5071, -0.2162, 0.6719, -0.5793], + [ 0.0608, 0.0061, 0.9497, 0.3343], + [ 0.6058, 0.9553, 1.0960, 2.3332]]) + >>> indices + tensor([[ 2, 0, 0, 1], + [ 0, 1, 1, 2], + [ 1, 2, 2, 0]]) + >>> x = torch.tensor([0, 1] * 9) + >>> x.sort() + torch.return_types.sort( + values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), + indices=tensor([ 2, 16, 4, 6, 14, 8, 0, 10, 12, 9, 17, 15, 13, 11, 7, 5, 3, 1])) + >>> x.sort(stable=True) + torch.return_types.sort( + values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), + indices=tensor([ 0, 2, 4, 6, 8, 10, 12, 14, 16, 1, 3, 5, 7, 9, 11, 13, 15, 17])) + """ + ... +def sparse_bsc_tensor(ccol_indices: Union[Tensor, List], row_indices: Union[Tensor, List], values: Union[Tensor, List], size: Optional[_size] = None, *, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, check_invariants: Optional[_bool] = None) -> Tensor: + r""" + sparse_bsc_tensor(ccol_indices, row_indices, values, size=None, *, dtype=None, device=None, requires_grad=False, check_invariants=None) -> Tensor + + Constructs a :ref:`sparse tensor in BSC (Block Compressed Sparse + Column)) ` with specified 2-dimensional blocks at the + given :attr:`ccol_indices` and :attr:`row_indices`. Sparse matrix + multiplication operations in BSC format are typically faster than that + for sparse tensors in COO format. Make you have a look at :ref:`the + note on the data type of the indices `. + + .. note:: + + If the ``device`` argument is not specified the device of the given + :attr:`values` and indices tensor(s) must match. If, however, the + argument is specified the input Tensors will be converted to the + given device and in turn determine the device of the constructed + sparse tensor. 
+ + Args: + ccol_indices (array_like): (B+1)-dimensional array of size + ``(*batchsize, ncolblocks + 1)``. The last element of each + batch is the number of non-zeros. This tensor encodes the + index in values and row_indices depending on where the given + column starts. Each successive number in the tensor subtracted + by the number before it denotes the number of elements in a + given column. + row_indices (array_like): Row block co-ordinates of each block in + values. (B+1)-dimensional tensor with the same length + as values. + values (array_list): Initial blocks for the tensor. Can be a list, + tuple, NumPy ``ndarray``, and other types that + represents a (1 + 2 + K)-dimensional tensor where ``K`` is the + number of dense dimensions. + size (list, tuple, :class:`torch.Size`, optional): Size of the + sparse tensor: ``(*batchsize, nrows * blocksize[0], ncols * + blocksize[1], *densesize)`` If not provided, the size will be + inferred as the minimum size big enough to hold all non-zero + blocks. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of + returned tensor. Default: if None, infers data type from + :attr:`values`. + device (:class:`torch.device`, optional): the desired device of + returned tensor. Default: if None, uses the current device + for the default tensor type (see + :func:`torch.set_default_device`). :attr:`device` will be + the CPU for CPU tensor types and the current CUDA device for + CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + check_invariants (bool, optional): If sparse tensor invariants are checked. + Default: as returned by :func:`torch.sparse.check_sparse_tensor_invariants.is_enabled`, + initially False. + + Example:: + >>> ccol_indices = [0, 1, 2] + >>> row_indices = [0, 1] + >>> values = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]] + >>> torch.sparse_bsc_tensor(torch.tensor(ccol_indices, dtype=torch.int64), + ... torch.tensor(row_indices, dtype=torch.int64), + ... torch.tensor(values), dtype=torch.double) + tensor(ccol_indices=tensor([0, 1, 2]), + row_indices=tensor([0, 1]), + values=tensor([[[1., 2.], + [3., 4.]], + [[5., 6.], + [7., 8.]]]), size=(2, 2), nnz=2, dtype=torch.float64, + layout=torch.sparse_bsc) + """ + ... +def sparse_bsr_tensor(crow_indices: Union[Tensor, List], col_indices: Union[Tensor, List], values: Union[Tensor, List], size: Optional[_size] = None, *, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, check_invariants: Optional[_bool] = None) -> Tensor: + r""" + sparse_bsr_tensor(crow_indices, col_indices, values, size=None, *, dtype=None, device=None, requires_grad=False, check_invariants=None) -> Tensor + + Constructs a :ref:`sparse tensor in BSR (Block Compressed Sparse Row)) + ` with specified 2-dimensional blocks at the given + :attr:`crow_indices` and :attr:`col_indices`. Sparse matrix + multiplication operations in BSR format are typically faster than that + for sparse tensors in COO format. Make you have a look at :ref:`the + note on the data type of the indices `. + + .. note:: + + If the ``device`` argument is not specified the device of the given + :attr:`values` and indices tensor(s) must match. If, however, the + argument is specified the input Tensors will be converted to the + given device and in turn determine the device of the constructed + sparse tensor. 
+ + Args: + crow_indices (array_like): (B+1)-dimensional array of size + ``(*batchsize, nrowblocks + 1)``. The last element of each + batch is the number of non-zeros. This tensor encodes the + block index in values and col_indices depending on where the + given row block starts. Each successive number in the tensor + subtracted by the number before it denotes the number of + blocks in a given row. + col_indices (array_like): Column block co-ordinates of each block + in values. (B+1)-dimensional tensor with the same length as + values. + values (array_list): Initial values for the tensor. Can be a list, + tuple, NumPy ``ndarray``, scalar, and other types that + represents a (1 + 2 + K)-dimensional tensor where ``K`` is the + number of dense dimensions. + size (list, tuple, :class:`torch.Size`, optional): Size of the + sparse tensor: ``(*batchsize, nrows * blocksize[0], ncols * + blocksize[1], *densesize)`` where ``blocksize == + values.shape[1:3]``. If not provided, the size will be + inferred as the minimum size big enough to hold all non-zero + blocks. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of + returned tensor. Default: if None, infers data type from + :attr:`values`. + device (:class:`torch.device`, optional): the desired device of + returned tensor. Default: if None, uses the current device + for the default tensor type (see + :func:`torch.set_default_device`). :attr:`device` will be + the CPU for CPU tensor types and the current CUDA device for + CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + check_invariants (bool, optional): If sparse tensor invariants are checked. + Default: as returned by :func:`torch.sparse.check_sparse_tensor_invariants.is_enabled`, + initially False. + + Example:: + >>> crow_indices = [0, 1, 2] + >>> col_indices = [0, 1] + >>> values = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]] + >>> torch.sparse_bsr_tensor(torch.tensor(crow_indices, dtype=torch.int64), + ... torch.tensor(col_indices, dtype=torch.int64), + ... torch.tensor(values), dtype=torch.double) + tensor(crow_indices=tensor([0, 1, 2]), + col_indices=tensor([0, 1]), + values=tensor([[[1., 2.], + [3., 4.]], + [[5., 6.], + [7., 8.]]]), size=(2, 2), nnz=2, dtype=torch.float64, + layout=torch.sparse_bsr) + """ + ... +def sparse_compressed_tensor(compressed_indices: Union[Tensor, List], plain_indices: Union[Tensor, List], values: Union[Tensor, List], size: Optional[_size] = None, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, check_invariants: Optional[_bool] = None) -> Tensor: + r""" + sparse_compressed_tensor(compressed_indices, plain_indices, values, size=None, *, dtype=None, layout=None, device=None, requires_grad=False, check_invariants=None) -> Tensor + + Constructs a :ref:`sparse tensor in Compressed Sparse format - CSR, + CSC, BSR, or BSC - ` with specified values at + the given :attr:`compressed_indices` and :attr:`plain_indices`. Sparse + matrix multiplication operations in Compressed Sparse format are + typically faster than that for sparse tensors in COO format. Make you + have a look at :ref:`the note on the data type of the indices + `. + + .. note:: + + If the ``device`` argument is not specified the device of the given + :attr:`values` and indices tensor(s) must match. 
If, however, the + argument is specified the input Tensors will be converted to the + given device and in turn determine the device of the constructed + sparse tensor. + + Args: + compressed_indices (array_like): (B+1)-dimensional array of size + ``(*batchsize, compressed_dim_size + 1)``. The last element of + each batch is the number of non-zero elements or blocks. This + tensor encodes the index in ``values`` and ``plain_indices`` + depending on where the given compressed dimension (row or + column) starts. Each successive number in the tensor + subtracted by the number before it denotes the number of + elements or blocks in a given compressed dimension. + plain_indices (array_like): Plain dimension (column or row) + co-ordinates of each element or block in values. (B+1)-dimensional + tensor with the same length as values. + + values (array_list): Initial values for the tensor. Can be a list, + tuple, NumPy ``ndarray``, scalar, and other types. that + represents a (1+K)-dimensional (for CSR and CSC layouts) or + (1+2+K)-dimensional tensor (for BSR and BSC layouts) where + ``K`` is the number of dense dimensions. + size (list, tuple, :class:`torch.Size`, optional): Size of the + sparse tensor: ``(*batchsize, nrows * blocksize[0], ncols * + blocksize[1], *densesize)`` where ``blocksize[0] == + blocksize[1] == 1`` for CSR and CSC formats. If not provided, + the size will be inferred as the minimum size big enough to + hold all non-zero elements or blocks. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of + returned tensor. Default: if None, infers data type from + :attr:`values`. + layout (:class:`torch.layout`, required): the desired layout of + returned tensor: :attr:`torch.sparse_csr`, + :attr:`torch.sparse_csc`, :attr:`torch.sparse_bsr`, or + :attr:`torch.sparse_bsc`. + device (:class:`torch.device`, optional): the desired device of + returned tensor. Default: if None, uses the current device + for the default tensor type (see + :func:`torch.set_default_device`). :attr:`device` will be + the CPU for CPU tensor types and the current CUDA device for + CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + check_invariants (bool, optional): If sparse tensor invariants are checked. + Default: as returned by :func:`torch.sparse.check_sparse_tensor_invariants.is_enabled`, + initially False. + + Example:: + >>> compressed_indices = [0, 2, 4] + >>> plain_indices = [0, 1, 0, 1] + >>> values = [1, 2, 3, 4] + >>> torch.sparse_compressed_tensor(torch.tensor(compressed_indices, dtype=torch.int64), + ... torch.tensor(plain_indices, dtype=torch.int64), + ... torch.tensor(values), dtype=torch.double, layout=torch.sparse_csr) + tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csr) + """ + ... +def sparse_coo_tensor(indices: Tensor, values: Union[Tensor, List], size: Optional[_size] = None, *, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, check_invariants: Optional[_bool] = None, is_coalesced: Optional[_bool] = None) -> Tensor: + r""" + sparse_coo_tensor(indices, values, size=None, *, dtype=None, device=None, requires_grad=False, check_invariants=None, is_coalesced=None) -> Tensor + + Constructs a :ref:`sparse tensor in COO(rdinate) format + ` with specified values at the given + :attr:`indices`. + + .. 
note::
+
+        This function returns an :ref:`uncoalesced tensor
+        ` when :attr:`is_coalesced` is
+        unspecified or ``None``.
+
+    .. note::
+
+        If the ``device`` argument is not specified the device of the given
+        :attr:`values` and indices tensor(s) must match. If, however, the
+        argument is specified the input Tensors will be converted to the
+        given device and in turn determine the device of the constructed
+        sparse tensor.
+
+    Args:
+        indices (array_like): Initial data for the tensor. Can be a list, tuple,
+            NumPy ``ndarray``, scalar, and other types. Will be cast to a :class:`torch.LongTensor`
+            internally. The indices are the coordinates of the non-zero values in the matrix, and thus
+            should be two-dimensional where the first dimension is the number of tensor dimensions and
+            the second dimension is the number of non-zero values.
+        values (array_like): Initial values for the tensor. Can be a list, tuple,
+            NumPy ``ndarray``, scalar, and other types.
+        size (list, tuple, or :class:`torch.Size`, optional): Size of the sparse tensor. If not
+            provided the size will be inferred as the minimum size big enough to hold all non-zero
+            elements.
+
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if None, infers data type from :attr:`values`.
+        device (:class:`torch.device`, optional): the desired device of returned tensor.
+            Default: if None, uses the current device for the default tensor type
+            (see :func:`torch.set_default_device`). :attr:`device` will be the CPU
+            for CPU tensor types and the current CUDA device for CUDA tensor types.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        check_invariants (bool, optional): If sparse tensor invariants are checked.
+            Default: as returned by :func:`torch.sparse.check_sparse_tensor_invariants.is_enabled`,
+            initially False.
+        is_coalesced (bool, optional): When ``True``, the caller is
+            responsible for providing tensor indices that correspond to a
+            coalesced tensor. If the :attr:`check_invariants` flag is
+            False, no error will be raised if the prerequisites are not
+            met and this will lead to silently incorrect results. To force
+            coalescence, please use :meth:`coalesce` on the resulting
+            Tensor.
+            Default: ``None``; except for trivial cases (e.g. nnz < 2) the
+            resulting Tensor has ``is_coalesced`` set to ``False``.
+
+    Example::
+
+        >>> i = torch.tensor([[0, 1, 1],
+        ...                   [2, 0, 2]])
+        >>> v = torch.tensor([3, 4, 5], dtype=torch.float32)
+        >>> torch.sparse_coo_tensor(i, v, [2, 4])
+        tensor(indices=tensor([[0, 1, 1],
+                               [2, 0, 2]]),
+               values=tensor([3., 4., 5.]),
+               size=(2, 4), nnz=3, layout=torch.sparse_coo)
+
+        >>> torch.sparse_coo_tensor(i, v)  # Shape inference
+        tensor(indices=tensor([[0, 1, 1],
+                               [2, 0, 2]]),
+               values=tensor([3., 4., 5.]),
+               size=(2, 3), nnz=3, layout=torch.sparse_coo)
+
+        >>> torch.sparse_coo_tensor(i, v, [2, 4],
+        ...                         dtype=torch.float64,
+        ...                         device=torch.device('cuda:0'))
+        tensor(indices=tensor([[0, 1, 1],
+                               [2, 0, 2]]),
+               values=tensor([3., 4., 5.]),
+               device='cuda:0', size=(2, 4), nnz=3, dtype=torch.float64,
+               layout=torch.sparse_coo)
+
+        # Create an empty sparse tensor with the following invariants:
+        # 1. sparse_dim + dense_dim = len(SparseTensor.shape)
+        # 2. SparseTensor._indices().shape = (sparse_dim, nnz)
+        # 3. 
SparseTensor._values().shape = (nnz, SparseTensor.shape[sparse_dim:]) + # + # For instance, to create an empty sparse tensor with nnz = 0, dense_dim = 0 and + # sparse_dim = 1 (hence indices is a 2D tensor of shape = (1, 0)) + >>> S = torch.sparse_coo_tensor(torch.empty([1, 0]), [], [1]) + tensor(indices=tensor([], size=(1, 0)), + values=tensor([], size=(0,)), + size=(1,), nnz=0, layout=torch.sparse_coo) + + # and to create an empty sparse tensor with nnz = 0, dense_dim = 1 and + # sparse_dim = 1 + >>> S = torch.sparse_coo_tensor(torch.empty([1, 0]), torch.empty([0, 2]), [1, 2]) + tensor(indices=tensor([], size=(1, 0)), + values=tensor([], size=(0, 2)), + size=(1, 2), nnz=0, layout=torch.sparse_coo) + + .. _torch.sparse: https://pytorch.org/docs/stable/sparse.html + """ + ... +def sparse_csc_tensor(ccol_indices: Union[Tensor, List], row_indices: Union[Tensor, List], values: Union[Tensor, List], size: Optional[_size] = None, *, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, check_invariants: Optional[_bool] = None) -> Tensor: + r""" + sparse_csc_tensor(ccol_indices, row_indices, values, size=None, *, dtype=None, device=None, requires_grad=False, check_invariants=None) -> Tensor + + Constructs a :ref:`sparse tensor in CSC (Compressed Sparse Column) + ` with specified values at the given + :attr:`ccol_indices` and :attr:`row_indices`. Sparse matrix + multiplication operations in CSC format are typically faster than that + for sparse tensors in COO format. Make you have a look at :ref:`the + note on the data type of the indices `. + + .. note:: + + If the ``device`` argument is not specified the device of the given + :attr:`values` and indices tensor(s) must match. If, however, the + argument is specified the input Tensors will be converted to the + given device and in turn determine the device of the constructed + sparse tensor. + + Args: + ccol_indices (array_like): (B+1)-dimensional array of size + ``(*batchsize, ncols + 1)``. The last element of each batch + is the number of non-zeros. This tensor encodes the index in + values and row_indices depending on where the given column + starts. Each successive number in the tensor subtracted by the + number before it denotes the number of elements in a given + column. + row_indices (array_like): Row co-ordinates of each element in + values. (B+1)-dimensional tensor with the same length as + values. + values (array_list): Initial values for the tensor. Can be a list, + tuple, NumPy ``ndarray``, scalar, and other types that + represents a (1+K)-dimensional tensor where ``K`` is the number + of dense dimensions. + size (list, tuple, :class:`torch.Size`, optional): Size of the + sparse tensor: ``(*batchsize, nrows, ncols, *densesize)``. If + not provided, the size will be inferred as the minimum size + big enough to hold all non-zero elements. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of + returned tensor. Default: if None, infers data type from + :attr:`values`. + device (:class:`torch.device`, optional): the desired device of + returned tensor. Default: if None, uses the current device + for the default tensor type (see + :func:`torch.set_default_device`). :attr:`device` will be + the CPU for CPU tensor types and the current CUDA device for + CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + check_invariants (bool, optional): If sparse tensor invariants are checked. 
+ Default: as returned by :func:`torch.sparse.check_sparse_tensor_invariants.is_enabled`, + initially False. + + Example:: + >>> ccol_indices = [0, 2, 4] + >>> row_indices = [0, 1, 0, 1] + >>> values = [1, 2, 3, 4] + >>> torch.sparse_csc_tensor(torch.tensor(ccol_indices, dtype=torch.int64), + ... torch.tensor(row_indices, dtype=torch.int64), + ... torch.tensor(values), dtype=torch.double) + tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csc) + """ + ... +def sparse_csr_tensor(crow_indices: Union[Tensor, List], col_indices: Union[Tensor, List], values: Union[Tensor, List], size: Optional[_size] = None, *, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, check_invariants: Optional[_bool] = None) -> Tensor: + r""" + sparse_csr_tensor(crow_indices, col_indices, values, size=None, *, dtype=None, device=None, requires_grad=False, check_invariants=None) -> Tensor + + Constructs a :ref:`sparse tensor in CSR (Compressed Sparse Row) ` with specified + values at the given :attr:`crow_indices` and :attr:`col_indices`. Sparse matrix multiplication operations + in CSR format are typically faster than that for sparse tensors in COO format. Make you have a look + at :ref:`the note on the data type of the indices `. + + .. note:: + + If the ``device`` argument is not specified the device of the given + :attr:`values` and indices tensor(s) must match. If, however, the + argument is specified the input Tensors will be converted to the + given device and in turn determine the device of the constructed + sparse tensor. + + Args: + crow_indices (array_like): (B+1)-dimensional array of size + ``(*batchsize, nrows + 1)``. The last element of each batch + is the number of non-zeros. This tensor encodes the index in + values and col_indices depending on where the given row + starts. Each successive number in the tensor subtracted by the + number before it denotes the number of elements in a given + row. + col_indices (array_like): Column co-ordinates of each element in + values. (B+1)-dimensional tensor with the same length + as values. + values (array_list): Initial values for the tensor. Can be a list, + tuple, NumPy ``ndarray``, scalar, and other types that + represents a (1+K)-dimensional tensor where ``K`` is the number + of dense dimensions. + size (list, tuple, :class:`torch.Size`, optional): Size of the + sparse tensor: ``(*batchsize, nrows, ncols, *densesize)``. If + not provided, the size will be inferred as the minimum size + big enough to hold all non-zero elements. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of + returned tensor. Default: if None, infers data type from + :attr:`values`. + device (:class:`torch.device`, optional): the desired device of + returned tensor. Default: if None, uses the current device + for the default tensor type (see + :func:`torch.set_default_device`). :attr:`device` will be + the CPU for CPU tensor types and the current CUDA device for + CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + check_invariants (bool, optional): If sparse tensor invariants are checked. + Default: as returned by :func:`torch.sparse.check_sparse_tensor_invariants.is_enabled`, + initially False. 
+ + Example:: + >>> crow_indices = [0, 2, 4] + >>> col_indices = [0, 1, 0, 1] + >>> values = [1, 2, 3, 4] + >>> torch.sparse_csr_tensor(torch.tensor(crow_indices, dtype=torch.int64), + ... torch.tensor(col_indices, dtype=torch.int64), + ... torch.tensor(values), dtype=torch.double) + tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csr) + """ + ... +def split_copy(input: Tensor, split_size: Union[_int, SymInt], dim: _int = 0, *, out: Union[Tuple[Tensor, ...], List[Tensor], None] = None) -> None: + r""" + Performs the same operation as :func:`torch.split`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def split_with_sizes(input: Tensor, split_sizes: Sequence[Union[_int, SymInt]], dim: _int = 0) -> Tuple[Tensor, ...]: ... +def split_with_sizes_copy(input: Tensor, split_sizes: Sequence[Union[_int, SymInt]], dim: _int = 0, *, out: Union[Tuple[Tensor, ...], List[Tensor], None] = None) -> None: + r""" + Performs the same operation as :func:`torch.split_with_sizes`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def spmm(input: Tensor, mat2: Tensor) -> Tensor: ... +def sqrt(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + sqrt(input, *, out=None) -> Tensor + + Returns a new tensor with the square-root of the elements of :attr:`input`. + + .. math:: + \text{out}_{i} = \sqrt{\text{input}_{i}} + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-2.0755, 1.0226, 0.0831, 0.4806]) + >>> torch.sqrt(a) + tensor([ nan, 1.0112, 0.2883, 0.6933]) + """ + ... +def sqrt_(input: Tensor) -> Tensor: ... +def square(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + square(input, *, out=None) -> Tensor + + Returns a new tensor with the square of the elements of :attr:`input`. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-2.0755, 1.0226, 0.0831, 0.4806]) + >>> torch.square(a) + tensor([ 4.3077, 1.0457, 0.0069, 0.2310]) + """ + ... +def square_(input: Tensor) -> Tensor: ... +@overload +def squeeze(input: Tensor) -> Tensor: + r""" + squeeze(input, dim=None) -> Tensor + + Returns a tensor with all specified dimensions of :attr:`input` of size `1` removed. + + For example, if `input` is of shape: + :math:`(A \times 1 \times B \times C \times 1 \times D)` then the `input.squeeze()` + will be of shape: :math:`(A \times B \times C \times D)`. + + When :attr:`dim` is given, a squeeze operation is done only in the given + dimension(s). If `input` is of shape: :math:`(A \times 1 \times B)`, + ``squeeze(input, 0)`` leaves the tensor unchanged, but ``squeeze(input, 1)`` + will squeeze the tensor to the shape :math:`(A \times B)`. + + .. note:: The returned tensor shares the storage with the input tensor, + so changing the contents of one will change the contents of the other. + + .. warning:: If the tensor has a batch dimension of size 1, then `squeeze(input)` + will also remove the batch dimension, which can lead to unexpected + errors. Consider specifying only the dims you wish to be squeezed. + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints, optional): if given, the input will be squeezed + only in the specified dimensions. 
+ + .. versionchanged:: 2.0 + :attr:`dim` now accepts tuples of dimensions. + + Example:: + + >>> x = torch.zeros(2, 1, 2, 1, 2) + >>> x.size() + torch.Size([2, 1, 2, 1, 2]) + >>> y = torch.squeeze(x) + >>> y.size() + torch.Size([2, 2, 2]) + >>> y = torch.squeeze(x, 0) + >>> y.size() + torch.Size([2, 1, 2, 1, 2]) + >>> y = torch.squeeze(x, 1) + >>> y.size() + torch.Size([2, 2, 1, 2]) + >>> y = torch.squeeze(x, (1, 2, 3)) + torch.Size([2, 2, 2]) + """ + ... +@overload +def squeeze(input: Tensor, dim: _int) -> Tensor: + r""" + squeeze(input, dim=None) -> Tensor + + Returns a tensor with all specified dimensions of :attr:`input` of size `1` removed. + + For example, if `input` is of shape: + :math:`(A \times 1 \times B \times C \times 1 \times D)` then the `input.squeeze()` + will be of shape: :math:`(A \times B \times C \times D)`. + + When :attr:`dim` is given, a squeeze operation is done only in the given + dimension(s). If `input` is of shape: :math:`(A \times 1 \times B)`, + ``squeeze(input, 0)`` leaves the tensor unchanged, but ``squeeze(input, 1)`` + will squeeze the tensor to the shape :math:`(A \times B)`. + + .. note:: The returned tensor shares the storage with the input tensor, + so changing the contents of one will change the contents of the other. + + .. warning:: If the tensor has a batch dimension of size 1, then `squeeze(input)` + will also remove the batch dimension, which can lead to unexpected + errors. Consider specifying only the dims you wish to be squeezed. + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints, optional): if given, the input will be squeezed + only in the specified dimensions. + + .. versionchanged:: 2.0 + :attr:`dim` now accepts tuples of dimensions. + + Example:: + + >>> x = torch.zeros(2, 1, 2, 1, 2) + >>> x.size() + torch.Size([2, 1, 2, 1, 2]) + >>> y = torch.squeeze(x) + >>> y.size() + torch.Size([2, 2, 2]) + >>> y = torch.squeeze(x, 0) + >>> y.size() + torch.Size([2, 1, 2, 1, 2]) + >>> y = torch.squeeze(x, 1) + >>> y.size() + torch.Size([2, 2, 1, 2]) + >>> y = torch.squeeze(x, (1, 2, 3)) + torch.Size([2, 2, 2]) + """ + ... +@overload +def squeeze(input: Tensor, dim: _size) -> Tensor: + r""" + squeeze(input, dim=None) -> Tensor + + Returns a tensor with all specified dimensions of :attr:`input` of size `1` removed. + + For example, if `input` is of shape: + :math:`(A \times 1 \times B \times C \times 1 \times D)` then the `input.squeeze()` + will be of shape: :math:`(A \times B \times C \times D)`. + + When :attr:`dim` is given, a squeeze operation is done only in the given + dimension(s). If `input` is of shape: :math:`(A \times 1 \times B)`, + ``squeeze(input, 0)`` leaves the tensor unchanged, but ``squeeze(input, 1)`` + will squeeze the tensor to the shape :math:`(A \times B)`. + + .. note:: The returned tensor shares the storage with the input tensor, + so changing the contents of one will change the contents of the other. + + .. warning:: If the tensor has a batch dimension of size 1, then `squeeze(input)` + will also remove the batch dimension, which can lead to unexpected + errors. Consider specifying only the dims you wish to be squeezed. + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints, optional): if given, the input will be squeezed + only in the specified dimensions. + + .. versionchanged:: 2.0 + :attr:`dim` now accepts tuples of dimensions. 
+ + Example:: + + >>> x = torch.zeros(2, 1, 2, 1, 2) + >>> x.size() + torch.Size([2, 1, 2, 1, 2]) + >>> y = torch.squeeze(x) + >>> y.size() + torch.Size([2, 2, 2]) + >>> y = torch.squeeze(x, 0) + >>> y.size() + torch.Size([2, 1, 2, 1, 2]) + >>> y = torch.squeeze(x, 1) + >>> y.size() + torch.Size([2, 2, 1, 2]) + >>> y = torch.squeeze(x, (1, 2, 3)) + torch.Size([2, 2, 2]) + """ + ... +@overload +def squeeze(input: Tensor, dim: Union[str, ellipsis, None]) -> Tensor: + r""" + squeeze(input, dim=None) -> Tensor + + Returns a tensor with all specified dimensions of :attr:`input` of size `1` removed. + + For example, if `input` is of shape: + :math:`(A \times 1 \times B \times C \times 1 \times D)` then the `input.squeeze()` + will be of shape: :math:`(A \times B \times C \times D)`. + + When :attr:`dim` is given, a squeeze operation is done only in the given + dimension(s). If `input` is of shape: :math:`(A \times 1 \times B)`, + ``squeeze(input, 0)`` leaves the tensor unchanged, but ``squeeze(input, 1)`` + will squeeze the tensor to the shape :math:`(A \times B)`. + + .. note:: The returned tensor shares the storage with the input tensor, + so changing the contents of one will change the contents of the other. + + .. warning:: If the tensor has a batch dimension of size 1, then `squeeze(input)` + will also remove the batch dimension, which can lead to unexpected + errors. Consider specifying only the dims you wish to be squeezed. + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints, optional): if given, the input will be squeezed + only in the specified dimensions. + + .. versionchanged:: 2.0 + :attr:`dim` now accepts tuples of dimensions. + + Example:: + + >>> x = torch.zeros(2, 1, 2, 1, 2) + >>> x.size() + torch.Size([2, 1, 2, 1, 2]) + >>> y = torch.squeeze(x) + >>> y.size() + torch.Size([2, 2, 2]) + >>> y = torch.squeeze(x, 0) + >>> y.size() + torch.Size([2, 1, 2, 1, 2]) + >>> y = torch.squeeze(x, 1) + >>> y.size() + torch.Size([2, 2, 1, 2]) + >>> y = torch.squeeze(x, (1, 2, 3)) + torch.Size([2, 2, 2]) + """ + ... +@overload +def squeeze_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.squeeze`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +@overload +def squeeze_copy(input: Tensor, dim: _int, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.squeeze`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +@overload +def squeeze_copy(input: Tensor, dim: _size, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.squeeze`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +@overload +def sspaddmm(beta: Union[Number, _complex], self: Tensor, alpha: Union[Number, _complex], mat1: Tensor, mat2: Tensor) -> Tensor: + r""" + sspaddmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor + + Matrix multiplies a sparse tensor :attr:`mat1` with a dense tensor + :attr:`mat2`, then adds the sparse tensor :attr:`input` to the result. + + Note: This function is equivalent to :func:`torch.addmm`, except + :attr:`input` and :attr:`mat1` are sparse. 
+ + Args: + input (Tensor): a sparse matrix to be added + mat1 (Tensor): a sparse matrix to be matrix multiplied + mat2 (Tensor): a dense matrix to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`mat` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + """ + ... +@overload +def sspaddmm(input: Tensor, mat1: Tensor, mat2: Tensor, *, beta: Union[Number, _complex] = 1, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + sspaddmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor + + Matrix multiplies a sparse tensor :attr:`mat1` with a dense tensor + :attr:`mat2`, then adds the sparse tensor :attr:`input` to the result. + + Note: This function is equivalent to :func:`torch.addmm`, except + :attr:`input` and :attr:`mat1` are sparse. + + Args: + input (Tensor): a sparse matrix to be added + mat1 (Tensor): a sparse matrix to be matrix multiplied + mat2 (Tensor): a dense matrix to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`mat` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + """ + ... +@overload +def sspaddmm(beta: Union[Number, _complex], self: Tensor, mat1: Tensor, mat2: Tensor) -> Tensor: + r""" + sspaddmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor + + Matrix multiplies a sparse tensor :attr:`mat1` with a dense tensor + :attr:`mat2`, then adds the sparse tensor :attr:`input` to the result. + + Note: This function is equivalent to :func:`torch.addmm`, except + :attr:`input` and :attr:`mat1` are sparse. + + Args: + input (Tensor): a sparse matrix to be added + mat1 (Tensor): a sparse matrix to be matrix multiplied + mat2 (Tensor): a dense matrix to be matrix multiplied + + Keyword args: + beta (Number, optional): multiplier for :attr:`mat` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`) + out (Tensor, optional): the output tensor. + """ + ... +def stack(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: _int = 0, *, out: Optional[Tensor] = None) -> Tensor: + r""" + stack(tensors, dim=0, *, out=None) -> Tensor + + Concatenates a sequence of tensors along a new dimension. + + All tensors need to be of the same size. + + .. seealso:: + + :func:`torch.cat` concatenates the given sequence along an existing dimension. + + Arguments: + tensors (sequence of Tensors): sequence of tensors to concatenate + dim (int, optional): dimension to insert. Has to be between 0 and the number + of dimensions of concatenated tensors (inclusive). Default: 0 + + Keyword args: + out (Tensor, optional): the output tensor. 
+ + Example:: + + >>> x = torch.randn(2, 3) + >>> x + tensor([[ 0.3367, 0.1288, 0.2345], + [ 0.2303, -1.1229, -0.1863]]) + >>> x = torch.stack((x, x)) # same as torch.stack((x, x), dim=0) + >>> x + tensor([[[ 0.3367, 0.1288, 0.2345], + [ 0.2303, -1.1229, -0.1863]], + + [[ 0.3367, 0.1288, 0.2345], + [ 0.2303, -1.1229, -0.1863]]]) + >>> x.size() + torch.Size([2, 2, 3]) + >>> x = torch.stack((x, x), dim=1) + tensor([[[ 0.3367, 0.1288, 0.2345], + [ 0.3367, 0.1288, 0.2345]], + + [[ 0.2303, -1.1229, -0.1863], + [ 0.2303, -1.1229, -0.1863]]]) + >>> x = torch.stack((x, x), dim=2) + tensor([[[ 0.3367, 0.3367], + [ 0.1288, 0.1288], + [ 0.2345, 0.2345]], + + [[ 0.2303, 0.2303], + [-1.1229, -1.1229], + [-0.1863, -0.1863]]]) + >>> x = torch.stack((x, x), dim=-1) + tensor([[[ 0.3367, 0.3367], + [ 0.1288, 0.1288], + [ 0.2345, 0.2345]], + + [[ 0.2303, 0.2303], + [-1.1229, -1.1229], + [-0.1863, -0.1863]]]) + """ + ... +@overload +def std(input: Tensor, dim: Optional[Union[_int, _size]], unbiased: _bool = True, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + std(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor + + Calculates the standard deviation over the dimensions specified by :attr:`dim`. + :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to + reduce over all dimensions. + + The standard deviation (:math:`\sigma`) is calculated as + + .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2} + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.std(a, dim=1, keepdim=True) + tensor([[1.0311], + [0.7477], + [1.2204], + [0.9087]]) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def std(input: Tensor, dim: Optional[Union[_int, _size]] = None, *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + std(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor + + Calculates the standard deviation over the dimensions specified by :attr:`dim`. + :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to + reduce over all dimensions. + + The standard deviation (:math:`\sigma`) is calculated as + + .. 
math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2} + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.std(a, dim=1, keepdim=True) + tensor([[1.0311], + [0.7477], + [1.2204], + [0.9087]]) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def std(input: Tensor, unbiased: _bool = True) -> Tensor: + r""" + std(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor + + Calculates the standard deviation over the dimensions specified by :attr:`dim`. + :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to + reduce over all dimensions. + + The standard deviation (:math:`\sigma`) is calculated as + + .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2} + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.std(a, dim=1, keepdim=True) + tensor([[1.0311], + [0.7477], + [1.2204], + [0.9087]]) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... 
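+# A minimal doctest-style sketch of the keyword-only ``correction`` argument described in the
+# ``torch.std`` docstrings above (the input tensor and the rounded outputs below are illustrative
+# assumptions, not captured from a real session):
+#
+#     >>> import torch
+#     >>> a = torch.tensor([[1., 2.], [3., 4.]])
+#     >>> torch.std(a, dim=1, correction=1)   # Bessel-corrected (default): divide by N - 1
+#     tensor([0.7071, 0.7071])
+#     >>> torch.std(a, dim=1, correction=0)   # population formula: divide by N
+#     tensor([0.5000, 0.5000])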
+@overload +def std(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + std(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor + + Calculates the standard deviation over the dimensions specified by :attr:`dim`. + :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to + reduce over all dimensions. + + The standard deviation (:math:`\sigma`) is calculated as + + .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2} + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.std(a, dim=1, keepdim=True) + tensor([[1.0311], + [0.7477], + [1.2204], + [0.9087]]) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def std(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], unbiased: _bool = True, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + std(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor + + Calculates the standard deviation over the dimensions specified by :attr:`dim`. + :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to + reduce over all dimensions. + + The standard deviation (:math:`\sigma`) is calculated as + + .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2} + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + dim (int or tuple of ints): the dimension or dimensions to reduce. + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. 
versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.std(a, dim=1, keepdim=True) + tensor([[1.0311], + [0.7477], + [1.2204], + [0.9087]]) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def std_mean(input: Tensor, dim: Optional[Union[_int, _size]], unbiased: _bool = True, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: + r""" + std_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor) + + Calculates the standard deviation and mean over the dimensions specified by + :attr:`dim`. :attr:`dim` can be a single dimension, list of dimensions, or + ``None`` to reduce over all dimensions. + + The standard deviation (:math:`\sigma`) is calculated as + + .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2} + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Returns: + A tuple (std, mean) containing the standard deviation and mean. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.std_mean(a, dim=0, keepdim=True) + (tensor([[1.2620, 1.0028, 1.0957, 0.6038]]), + tensor([[ 0.0645, 0.4485, 0.8707, -0.0665]])) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def std_mean(input: Tensor, dim: Optional[Union[_int, _size]] = None, *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: + r""" + std_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor) + + Calculates the standard deviation and mean over the dimensions specified by + :attr:`dim`. :attr:`dim` can be a single dimension, list of dimensions, or + ``None`` to reduce over all dimensions. + + The standard deviation (:math:`\sigma`) is calculated as + + .. 
math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2} + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Returns: + A tuple (std, mean) containing the standard deviation and mean. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.std_mean(a, dim=0, keepdim=True) + (tensor([[1.2620, 1.0028, 1.0957, 0.6038]]), + tensor([[ 0.0645, 0.4485, 0.8707, -0.0665]])) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def std_mean(input: Tensor, unbiased: _bool = True) -> Tuple[Tensor, Tensor]: + r""" + std_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor) + + Calculates the standard deviation and mean over the dimensions specified by + :attr:`dim`. :attr:`dim` can be a single dimension, list of dimensions, or + ``None`` to reduce over all dimensions. + + The standard deviation (:math:`\sigma`) is calculated as + + .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2} + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Returns: + A tuple (std, mean) containing the standard deviation and mean. + + Example: + + >>> a = torch.tensor( + ... 
[[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.std_mean(a, dim=0, keepdim=True) + (tensor([[1.2620, 1.0028, 1.0957, 0.6038]]), + tensor([[ 0.0645, 0.4485, 0.8707, -0.0665]])) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def std_mean(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: + r""" + std_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor) + + Calculates the standard deviation and mean over the dimensions specified by + :attr:`dim`. :attr:`dim` can be a single dimension, list of dimensions, or + ``None`` to reduce over all dimensions. + + The standard deviation (:math:`\sigma`) is calculated as + + .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2} + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Returns: + A tuple (std, mean) containing the standard deviation and mean. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.std_mean(a, dim=0, keepdim=True) + (tensor([[1.2620, 1.0028, 1.0957, 0.6038]]), + tensor([[ 0.0645, 0.4485, 0.8707, -0.0665]])) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def std_mean(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], unbiased: _bool = True, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: + r""" + std_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor) + + Calculates the standard deviation and mean over the dimensions specified by + :attr:`dim`. :attr:`dim` can be a single dimension, list of dimensions, or + ``None`` to reduce over all dimensions. + + The standard deviation (:math:`\sigma`) is calculated as + + .. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2} + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. 
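+
+    As a worked illustration with an assumed input (values rounded): for ``x = [1., 2., 3., 4.]``
+    the squared deviations from the mean ``2.5`` sum to ``5``, so ``correction=1`` gives
+    :math:`\sqrt{5/3} \approx 1.2910` while ``correction=0`` gives :math:`\sqrt{5/4} \approx 1.1180`::
+
+        >>> x = torch.tensor([1., 2., 3., 4.])
+        >>> torch.std_mean(x, correction=1)
+        (tensor(1.2910), tensor(2.5000))
+        >>> torch.std_mean(x, correction=0)
+        (tensor(1.1180), tensor(2.5000))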
+ + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Returns: + A tuple (std, mean) containing the standard deviation and mean. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.std_mean(a, dim=0, keepdim=True) + (tensor([[1.2620, 1.0028, 1.0957, 0.6038]]), + tensor([[ 0.0645, 0.4485, 0.8707, -0.0665]])) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def sub(input: Union[Tensor, Number, _complex], other: Union[Tensor, Number, _complex], *, alpha: Optional[Union[Number, _complex]] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + sub(input, other, *, alpha=1, out=None) -> Tensor + + Subtracts :attr:`other`, scaled by :attr:`alpha`, from :attr:`input`. + + .. math:: + \text{{out}}_i = \text{{input}}_i - \text{{alpha}} \times \text{{other}}_i + + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer, float, and complex inputs. + + Args: + input (Tensor): the input tensor. + other (Tensor or Number): the tensor or number to subtract from :attr:`input`. + + Keyword args: + alpha (Number): the multiplier for :attr:`other`. + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor((1, 2)) + >>> b = torch.tensor((0, 1)) + >>> torch.sub(a, b, alpha=2) + tensor([1, 0]) + """ + ... +@overload +def sub(self: Tensor, alpha: Union[Number, _complex], other: Tensor) -> Tensor: + r""" + sub(input, other, *, alpha=1, out=None) -> Tensor + + Subtracts :attr:`other`, scaled by :attr:`alpha`, from :attr:`input`. + + .. math:: + \text{{out}}_i = \text{{input}}_i - \text{{alpha}} \times \text{{other}}_i + + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer, float, and complex inputs. + + Args: + input (Tensor): the input tensor. + other (Tensor or Number): the tensor or number to subtract from :attr:`input`. + + Keyword args: + alpha (Number): the multiplier for :attr:`other`. + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor((1, 2)) + >>> b = torch.tensor((0, 1)) + >>> torch.sub(a, b, alpha=2) + tensor([1, 0]) + """ + ... +@overload +def sub(self: Tensor, alpha: Union[Number, _complex], other: Tensor, *, out: Tensor) -> Tensor: + r""" + sub(input, other, *, alpha=1, out=None) -> Tensor + + Subtracts :attr:`other`, scaled by :attr:`alpha`, from :attr:`input`. + + .. 
math:: + \text{{out}}_i = \text{{input}}_i - \text{{alpha}} \times \text{{other}}_i + + + Supports :ref:`broadcasting to a common shape `, + :ref:`type promotion `, and integer, float, and complex inputs. + + Args: + input (Tensor): the input tensor. + other (Tensor or Number): the tensor or number to subtract from :attr:`input`. + + Keyword args: + alpha (Number): the multiplier for :attr:`other`. + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor((1, 2)) + >>> b = torch.tensor((0, 1)) + >>> torch.sub(a, b, alpha=2) + tensor([1, 0]) + """ + ... +@overload +def subtract(input: Tensor, other: Tensor, *, alpha: Union[Number, _complex] = 1, out: Optional[Tensor] = None) -> Tensor: + r""" + subtract(input, other, *, alpha=1, out=None) -> Tensor + + Alias for :func:`torch.sub`. + """ + ... +@overload +def subtract(input: Tensor, other: Union[Number, _complex], alpha: Union[Number, _complex] = 1) -> Tensor: + r""" + subtract(input, other, *, alpha=1, out=None) -> Tensor + + Alias for :func:`torch.sub`. + """ + ... +@overload +def sum(input: Tensor, *, dtype: Optional[_dtype] = None) -> Tensor: + r""" + sum(input, *, dtype=None) -> Tensor + + Returns the sum of all elements in the :attr:`input` tensor. + + Args: + input (Tensor): the input tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.1133, -0.9567, 0.2958]]) + >>> torch.sum(a) + tensor(-0.5475) + + .. function:: sum(input, dim, keepdim=False, *, dtype=None) -> Tensor + :noindex: + + Returns the sum of each row of the :attr:`input` tensor in the given + dimension :attr:`dim`. If :attr:`dim` is a list of dimensions, + reduce over all of them. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.0569, -0.2475, 0.0737, -0.3429], + [-0.2993, 0.9138, 0.9337, -1.6864], + [ 0.1132, 0.7892, -0.1003, 0.5688], + [ 0.3637, -0.9906, -0.4752, -1.5197]]) + >>> torch.sum(a, 1) + tensor([-0.4598, -0.1381, 1.3708, -2.6217]) + >>> b = torch.arange(4 * 5 * 6).view(4, 5, 6) + >>> torch.sum(b, (2, 1)) + tensor([ 435., 1335., 2235., 3135.]) + """ + ... +@overload +def sum(input: Tensor, dim: Optional[Union[_int, _size]], keepdim: _bool = False, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + sum(input, *, dtype=None) -> Tensor + + Returns the sum of all elements in the :attr:`input` tensor. + + Args: + input (Tensor): the input tensor. 
+ + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.1133, -0.9567, 0.2958]]) + >>> torch.sum(a) + tensor(-0.5475) + + .. function:: sum(input, dim, keepdim=False, *, dtype=None) -> Tensor + :noindex: + + Returns the sum of each row of the :attr:`input` tensor in the given + dimension :attr:`dim`. If :attr:`dim` is a list of dimensions, + reduce over all of them. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.0569, -0.2475, 0.0737, -0.3429], + [-0.2993, 0.9138, 0.9337, -1.6864], + [ 0.1132, 0.7892, -0.1003, 0.5688], + [ 0.3637, -0.9906, -0.4752, -1.5197]]) + >>> torch.sum(a, 1) + tensor([-0.4598, -0.1381, 1.3708, -2.6217]) + >>> b = torch.arange(4 * 5 * 6).view(4, 5, 6) + >>> torch.sum(b, (2, 1)) + tensor([ 435., 1335., 2235., 3135.]) + """ + ... +@overload +def sum(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], keepdim: _bool = False, *, dtype: Optional[_dtype] = None, out: Optional[Tensor] = None) -> Tensor: + r""" + sum(input, *, dtype=None) -> Tensor + + Returns the sum of all elements in the :attr:`input` tensor. + + Args: + input (Tensor): the input tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.1133, -0.9567, 0.2958]]) + >>> torch.sum(a) + tensor(-0.5475) + + .. function:: sum(input, dim, keepdim=False, *, dtype=None) -> Tensor + :noindex: + + Returns the sum of each row of the :attr:`input` tensor in the given + dimension :attr:`dim`. If :attr:`dim` is a list of dimensions, + reduce over all of them. + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. 
+ If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + + Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.0569, -0.2475, 0.0737, -0.3429], + [-0.2993, 0.9138, 0.9337, -1.6864], + [ 0.1132, 0.7892, -0.1003, 0.5688], + [ 0.3637, -0.9906, -0.4752, -1.5197]]) + >>> torch.sum(a, 1) + tensor([-0.4598, -0.1381, 1.3708, -2.6217]) + >>> b = torch.arange(4 * 5 * 6).view(4, 5, 6) + >>> torch.sum(b, (2, 1)) + tensor([ 435., 1335., 2235., 3135.]) + """ + ... +def svd(input: Tensor, some: _bool = True, compute_uv: _bool = True, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.svd: + r""" + svd(input, some=True, compute_uv=True, *, out=None) -> (Tensor, Tensor, Tensor) + + Computes the singular value decomposition of either a matrix or batch of + matrices :attr:`input`. The singular value decomposition is represented as a + namedtuple `(U, S, V)`, such that :attr:`input` :math:`= U \text{diag}(S) V^{\text{H}}`. + where :math:`V^{\text{H}}` is the transpose of `V` for real inputs, + and the conjugate transpose of `V` for complex inputs. + If :attr:`input` is a batch of matrices, then `U`, `S`, and `V` are also + batched with the same batch dimensions as :attr:`input`. + + If :attr:`some` is `True` (default), the method returns the reduced singular + value decomposition. In this case, if the last two dimensions of :attr:`input` are + `m` and `n`, then the returned `U` and `V` matrices will contain only + `min(n, m)` orthonormal columns. + + If :attr:`compute_uv` is `False`, the returned `U` and `V` will be + zero-filled matrices of shape `(m, m)` and `(n, n)` + respectively, and the same device as :attr:`input`. The argument :attr:`some` + has no effect when :attr:`compute_uv` is `False`. + + Supports :attr:`input` of float, double, cfloat and cdouble data types. + The dtypes of `U` and `V` are the same as :attr:`input`'s. `S` will + always be real-valued, even if :attr:`input` is complex. + + .. warning:: + + :func:`torch.svd` is deprecated in favor of :func:`torch.linalg.svd` + and will be removed in a future PyTorch release. + + ``U, S, V = torch.svd(A, some=some, compute_uv=True)`` (default) should be replaced with + + .. code:: python + + U, S, Vh = torch.linalg.svd(A, full_matrices=not some) + V = Vh.mH + + ``_, S, _ = torch.svd(A, some=some, compute_uv=False)`` should be replaced with + + .. code:: python + + S = torch.linalg.svdvals(A) + + .. note:: Differences with :func:`torch.linalg.svd`: + + * :attr:`some` is the opposite of + :func:`torch.linalg.svd`'s :attr:`full_matrices`. Note that + default value for both is `True`, so the default behavior is + effectively the opposite. + * :func:`torch.svd` returns `V`, whereas :func:`torch.linalg.svd` returns + `Vh`, that is, :math:`V^{\text{H}}`. + * If :attr:`compute_uv` is `False`, :func:`torch.svd` returns zero-filled + tensors for `U` and `Vh`, whereas :func:`torch.linalg.svd` returns + empty tensors. + + .. note:: The singular values are returned in descending order. If :attr:`input` is a batch of matrices, + then the singular values of each matrix in the batch are returned in descending order. + + .. note:: The `S` tensor can only be used to compute gradients if :attr:`compute_uv` is `True`. + + .. 
note:: When :attr:`some` is `False`, the gradients on `U[..., :, min(m, n):]` + and `V[..., :, min(m, n):]` will be ignored in the backward pass, as those vectors + can be arbitrary bases of the corresponding subspaces. + + .. note:: The implementation of :func:`torch.linalg.svd` on CPU uses LAPACK's routine `?gesdd` + (a divide-and-conquer algorithm) instead of `?gesvd` for speed. Analogously, + on GPU, it uses cuSOLVER's routines `gesvdj` and `gesvdjBatched` on CUDA 10.1.243 + and later, and MAGMA's routine `gesdd` on earlier versions of CUDA. + + .. note:: The returned `U` will not be contiguous. The matrix (or batch of matrices) will + be represented as a column-major matrix (i.e. Fortran-contiguous). + + .. warning:: The gradients with respect to `U` and `V` will only be finite when the input does not + have zero nor repeated singular values. + + .. warning:: If the distance between any two singular values is close to zero, the gradients with respect to + `U` and `V` will be numerically unstable, as they depends on + :math:`\frac{1}{\min_{i \neq j} \sigma_i^2 - \sigma_j^2}`. The same happens when the matrix + has small singular values, as these gradients also depend on `S^{-1}`. + + .. warning:: For complex-valued :attr:`input` the singular value decomposition is not unique, + as `U` and `V` may be multiplied by an arbitrary phase factor :math:`e^{i \phi}` on every column. + The same happens when :attr:`input` has repeated singular values, where one may multiply + the columns of the spanning subspace in `U` and `V` by a rotation matrix + and `the resulting vectors will span the same subspace`_. + Different platforms, like NumPy, or inputs on different device types, + may produce different `U` and `V` tensors. + + Args: + input (Tensor): the input tensor of size `(*, m, n)` where `*` is zero or more + batch dimensions consisting of `(m, n)` matrices. + some (bool, optional): controls whether to compute the reduced or full decomposition, and + consequently, the shape of returned `U` and `V`. Default: `True`. + compute_uv (bool, optional): controls whether to compute `U` and `V`. Default: `True`. + + Keyword args: + out (tuple, optional): the output tuple of tensors + + Example:: + + >>> a = torch.randn(5, 3) + >>> a + tensor([[ 0.2364, -0.7752, 0.6372], + [ 1.7201, 0.7394, -0.0504], + [-0.3371, -1.0584, 0.5296], + [ 0.3550, -0.4022, 1.5569], + [ 0.2445, -0.0158, 1.1414]]) + >>> u, s, v = torch.svd(a) + >>> u + tensor([[ 0.4027, 0.0287, 0.5434], + [-0.1946, 0.8833, 0.3679], + [ 0.4296, -0.2890, 0.5261], + [ 0.6604, 0.2717, -0.2618], + [ 0.4234, 0.2481, -0.4733]]) + >>> s + tensor([2.3289, 2.0315, 0.7806]) + >>> v + tensor([[-0.0199, 0.8766, 0.4809], + [-0.5080, 0.4054, -0.7600], + [ 0.8611, 0.2594, -0.4373]]) + >>> torch.dist(a, torch.mm(torch.mm(u, torch.diag(s)), v.t())) + tensor(8.6531e-07) + >>> a_big = torch.randn(7, 5, 3) + >>> u, s, v = torch.svd(a_big) + >>> torch.dist(a_big, torch.matmul(torch.matmul(u, torch.diag_embed(s)), v.mT)) + tensor(2.6503e-06) + + .. _the resulting vectors will span the same subspace: + (https://en.wikipedia.org/wiki/Singular_value_decomposition#Singular_values,_singular_vectors,_and_their_relation_to_the_SVD) + """ + ... +def swapaxes(input: Tensor, axis0: _int, axis1: _int) -> Tensor: + r""" + swapaxes(input, axis0, axis1) -> Tensor + + Alias for :func:`torch.transpose`. + + This function is equivalent to NumPy's swapaxes function. 
+ + Examples:: + + >>> x = torch.tensor([[[0,1],[2,3]],[[4,5],[6,7]]]) + >>> x + tensor([[[0, 1], + [2, 3]], + + [[4, 5], + [6, 7]]]) + >>> torch.swapaxes(x, 0, 1) + tensor([[[0, 1], + [4, 5]], + + [[2, 3], + [6, 7]]]) + >>> torch.swapaxes(x, 0, 2) + tensor([[[0, 4], + [2, 6]], + + [[1, 5], + [3, 7]]]) + """ + ... +def swapdims(input: Tensor, dim0: _int, dim1: _int) -> Tensor: + r""" + swapdims(input, dim0, dim1) -> Tensor + + Alias for :func:`torch.transpose`. + + This function is equivalent to NumPy's swapaxes function. + + Examples:: + + >>> x = torch.tensor([[[0,1],[2,3]],[[4,5],[6,7]]]) + >>> x + tensor([[[0, 1], + [2, 3]], + + [[4, 5], + [6, 7]]]) + >>> torch.swapdims(x, 0, 1) + tensor([[[0, 1], + [4, 5]], + + [[2, 3], + [6, 7]]]) + >>> torch.swapdims(x, 0, 2) + tensor([[[0, 4], + [2, 6]], + + [[1, 5], + [3, 7]]]) + """ + ... +def sym_constrain_range(size: Union[Number, _complex], *, min: Optional[_int] = None, max: Optional[_int] = None) -> None: ... +def sym_constrain_range_for_size(size: Union[Number, _complex], *, min: Optional[_int] = None, max: Optional[_int] = None) -> None: ... +def t(input: Tensor) -> Tensor: + r""" + t(input) -> Tensor + + Expects :attr:`input` to be <= 2-D tensor and transposes dimensions 0 + and 1. + + 0-D and 1-D tensors are returned as is. When input is a 2-D tensor this + is equivalent to ``transpose(input, 0, 1)``. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> x = torch.randn(()) + >>> x + tensor(0.1995) + >>> torch.t(x) + tensor(0.1995) + >>> x = torch.randn(3) + >>> x + tensor([ 2.4320, -0.4608, 0.7702]) + >>> torch.t(x) + tensor([ 2.4320, -0.4608, 0.7702]) + >>> x = torch.randn(2, 3) + >>> x + tensor([[ 0.4875, 0.9158, -0.5872], + [ 0.3938, -0.6929, 0.6932]]) + >>> torch.t(x) + tensor([[ 0.4875, 0.3938], + [ 0.9158, -0.6929], + [-0.5872, 0.6932]]) + + See also :func:`torch.transpose`. + """ + ... +def t_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.t`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def take(input: Tensor, index: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + take(input, index) -> Tensor + + Returns a new tensor with the elements of :attr:`input` at the given indices. + The input tensor is treated as if it were viewed as a 1-D tensor. The result + takes the same shape as the indices. + + Args: + input (Tensor): the input tensor. + index (LongTensor): the indices into tensor + + Example:: + + >>> src = torch.tensor([[4, 3, 5], + ... [6, 7, 8]]) + >>> torch.take(src, torch.tensor([0, 2, 5])) + tensor([ 4, 5, 8]) + """ + ... +def take_along_dim(input: Tensor, indices: Tensor, dim: Optional[_int] = None, *, out: Optional[Tensor] = None) -> Tensor: + r""" + take_along_dim(input, indices, dim=None, *, out=None) -> Tensor + + Selects values from :attr:`input` at the 1-dimensional indices from :attr:`indices` along the given :attr:`dim`. + + If :attr:`dim` is None, the input array is treated as if it has been flattened to 1d. + + Functions that return indices along a dimension, like :func:`torch.argmax` and :func:`torch.argsort`, + are designed to work with this function. See the examples below. + + .. note:: + This function is similar to NumPy's `take_along_axis`. + See also :func:`torch.gather`. + + Args: + input (Tensor): the input tensor. + indices (tensor): the indices into :attr:`input`. Must have long dtype. + dim (int, optional): dimension to select along. 
+ + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> t = torch.tensor([[10, 30, 20], [60, 40, 50]]) + >>> max_idx = torch.argmax(t) + >>> torch.take_along_dim(t, max_idx) + tensor([60]) + >>> sorted_idx = torch.argsort(t, dim=1) + >>> torch.take_along_dim(t, sorted_idx, dim=1) + tensor([[10, 20, 30], + [40, 50, 60]]) + """ + ... +def tan(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + tan(input, *, out=None) -> Tensor + + Returns a new tensor with the tangent of the elements of :attr:`input`. + + .. math:: + \text{out}_{i} = \tan(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-1.2027, -1.7687, 0.4412, -1.3856]) + >>> torch.tan(a) + tensor([-2.5930, 4.9859, 0.4722, -5.3366]) + """ + ... +def tan_(input: Tensor) -> Tensor: ... +def tanh(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + tanh(input, *, out=None) -> Tensor + + Returns a new tensor with the hyperbolic tangent of the elements + of :attr:`input`. + + .. math:: + \text{out}_{i} = \tanh(\text{input}_{i}) + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.8986, -0.7279, 1.1745, 0.2611]) + >>> torch.tanh(a) + tensor([ 0.7156, -0.6218, 0.8257, 0.2553]) + """ + ... +def tanh_(input: Tensor) -> Tensor: ... +def tensor(data: Any, dtype: Optional[_dtype] = None, device: Optional[DeviceLikeType] = None, requires_grad: _bool = False, pin_memory: _bool = False) -> Tensor: + r""" + tensor(data, *, dtype=None, device=None, requires_grad=False, pin_memory=False) -> Tensor + + Constructs a tensor with no autograd history (also known as a "leaf tensor", see :doc:`/notes/autograd`) by copying :attr:`data`. + + .. warning:: + + When working with tensors prefer using :func:`torch.Tensor.clone`, + :func:`torch.Tensor.detach`, and :func:`torch.Tensor.requires_grad_` for + readability. Letting `t` be a tensor, ``torch.tensor(t)`` is equivalent to + ``t.clone().detach()``, and ``torch.tensor(t, requires_grad=True)`` + is equivalent to ``t.clone().detach().requires_grad_(True)``. + + .. seealso:: + + :func:`torch.as_tensor` preserves autograd history and avoids copies where possible. + :func:`torch.from_numpy` creates a tensor that shares storage with a NumPy array. + + Args: + data (array_like): Initial data for the tensor. Can be a list, tuple, + NumPy ``ndarray``, scalar, and other types. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, infers data type from :attr:`data`. + device (:class:`torch.device`, optional): the device of the constructed tensor. If None and data is a tensor + then the device of data is used. If None and data is not a tensor then + the result tensor is constructed on the current device. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + + + Example:: + + >>> torch.tensor([[0.1, 1.2], [2.2, 3.1], [4.9, 5.2]]) + tensor([[ 0.1000, 1.2000], + [ 2.2000, 3.1000], + [ 4.9000, 5.2000]]) + + >>> torch.tensor([0, 1]) # Type inference on data + tensor([ 0, 1]) + + >>> torch.tensor([[0.11111, 0.222222, 0.3333333]], + ... 
dtype=torch.float64, + ... device=torch.device('cuda:0')) # creates a double tensor on a CUDA device + tensor([[ 0.1111, 0.2222, 0.3333]], dtype=torch.float64, device='cuda:0') + + >>> torch.tensor(3.14159) # Create a zero-dimensional (scalar) tensor + tensor(3.1416) + + >>> torch.tensor([]) # Create an empty tensor (of size (0,)) + tensor([]) + """ + ... +@overload +def tensor_split(input: Tensor, tensor_indices_or_sections: Tensor, dim: _int = 0) -> Tuple[Tensor, ...]: + r""" + tensor_split(input, indices_or_sections, dim=0) -> List of Tensors + + Splits a tensor into multiple sub-tensors, all of which are views of :attr:`input`, + along dimension :attr:`dim` according to the indices or number of sections specified + by :attr:`indices_or_sections`. This function is based on NumPy's + :func:`numpy.array_split`. + + Args: + input (Tensor): the tensor to split + indices_or_sections (Tensor, int or list or tuple of ints): + If :attr:`indices_or_sections` is an integer ``n`` or a zero dimensional long tensor + with value ``n``, :attr:`input` is split into ``n`` sections along dimension :attr:`dim`. + If :attr:`input` is divisible by ``n`` along dimension :attr:`dim`, each + section will be of equal size, :code:`input.size(dim) / n`. If :attr:`input` + is not divisible by ``n``, the sizes of the first :code:`int(input.size(dim) % n)` + sections will have size :code:`int(input.size(dim) / n) + 1`, and the rest will + have size :code:`int(input.size(dim) / n)`. + + If :attr:`indices_or_sections` is a list or tuple of ints, or a one-dimensional long + tensor, then :attr:`input` is split along dimension :attr:`dim` at each of the indices + in the list, tuple or tensor. For instance, :code:`indices_or_sections=[2, 3]` and :code:`dim=0` + would result in the tensors :code:`input[:2]`, :code:`input[2:3]`, and :code:`input[3:]`. + + If :attr:`indices_or_sections` is a tensor, it must be a zero-dimensional or one-dimensional + long tensor on the CPU. + + dim (int, optional): dimension along which to split the tensor. Default: ``0`` + + Example:: + + >>> x = torch.arange(8) + >>> torch.tensor_split(x, 3) + (tensor([0, 1, 2]), tensor([3, 4, 5]), tensor([6, 7])) + + >>> x = torch.arange(7) + >>> torch.tensor_split(x, 3) + (tensor([0, 1, 2]), tensor([3, 4]), tensor([5, 6])) + >>> torch.tensor_split(x, (1, 6)) + (tensor([0]), tensor([1, 2, 3, 4, 5]), tensor([6])) + + >>> x = torch.arange(14).reshape(2, 7) + >>> x + tensor([[ 0, 1, 2, 3, 4, 5, 6], + [ 7, 8, 9, 10, 11, 12, 13]]) + >>> torch.tensor_split(x, 3, dim=1) + (tensor([[0, 1, 2], + [7, 8, 9]]), + tensor([[ 3, 4], + [10, 11]]), + tensor([[ 5, 6], + [12, 13]])) + >>> torch.tensor_split(x, (1, 6), dim=1) + (tensor([[0], + [7]]), + tensor([[ 1, 2, 3, 4, 5], + [ 8, 9, 10, 11, 12]]), + tensor([[ 6], + [13]])) + """ + ... +@overload +def tensor_split(input: Tensor, sections: Union[_int, SymInt], dim: _int = 0) -> Tuple[Tensor, ...]: + r""" + tensor_split(input, indices_or_sections, dim=0) -> List of Tensors + + Splits a tensor into multiple sub-tensors, all of which are views of :attr:`input`, + along dimension :attr:`dim` according to the indices or number of sections specified + by :attr:`indices_or_sections`. This function is based on NumPy's + :func:`numpy.array_split`. + + Args: + input (Tensor): the tensor to split + indices_or_sections (Tensor, int or list or tuple of ints): + If :attr:`indices_or_sections` is an integer ``n`` or a zero dimensional long tensor + with value ``n``, :attr:`input` is split into ``n`` sections along dimension :attr:`dim`. 
+ If :attr:`input` is divisible by ``n`` along dimension :attr:`dim`, each + section will be of equal size, :code:`input.size(dim) / n`. If :attr:`input` + is not divisible by ``n``, the sizes of the first :code:`int(input.size(dim) % n)` + sections will have size :code:`int(input.size(dim) / n) + 1`, and the rest will + have size :code:`int(input.size(dim) / n)`. + + If :attr:`indices_or_sections` is a list or tuple of ints, or a one-dimensional long + tensor, then :attr:`input` is split along dimension :attr:`dim` at each of the indices + in the list, tuple or tensor. For instance, :code:`indices_or_sections=[2, 3]` and :code:`dim=0` + would result in the tensors :code:`input[:2]`, :code:`input[2:3]`, and :code:`input[3:]`. + + If :attr:`indices_or_sections` is a tensor, it must be a zero-dimensional or one-dimensional + long tensor on the CPU. + + dim (int, optional): dimension along which to split the tensor. Default: ``0`` + + Example:: + + >>> x = torch.arange(8) + >>> torch.tensor_split(x, 3) + (tensor([0, 1, 2]), tensor([3, 4, 5]), tensor([6, 7])) + + >>> x = torch.arange(7) + >>> torch.tensor_split(x, 3) + (tensor([0, 1, 2]), tensor([3, 4]), tensor([5, 6])) + >>> torch.tensor_split(x, (1, 6)) + (tensor([0]), tensor([1, 2, 3, 4, 5]), tensor([6])) + + >>> x = torch.arange(14).reshape(2, 7) + >>> x + tensor([[ 0, 1, 2, 3, 4, 5, 6], + [ 7, 8, 9, 10, 11, 12, 13]]) + >>> torch.tensor_split(x, 3, dim=1) + (tensor([[0, 1, 2], + [7, 8, 9]]), + tensor([[ 3, 4], + [10, 11]]), + tensor([[ 5, 6], + [12, 13]])) + >>> torch.tensor_split(x, (1, 6), dim=1) + (tensor([[0], + [7]]), + tensor([[ 1, 2, 3, 4, 5], + [ 8, 9, 10, 11, 12]]), + tensor([[ 6], + [13]])) + """ + ... +@overload +def tensor_split(input: Tensor, indices: Sequence[Union[_int, SymInt]], dim: _int = 0) -> Tuple[Tensor, ...]: + r""" + tensor_split(input, indices_or_sections, dim=0) -> List of Tensors + + Splits a tensor into multiple sub-tensors, all of which are views of :attr:`input`, + along dimension :attr:`dim` according to the indices or number of sections specified + by :attr:`indices_or_sections`. This function is based on NumPy's + :func:`numpy.array_split`. + + Args: + input (Tensor): the tensor to split + indices_or_sections (Tensor, int or list or tuple of ints): + If :attr:`indices_or_sections` is an integer ``n`` or a zero dimensional long tensor + with value ``n``, :attr:`input` is split into ``n`` sections along dimension :attr:`dim`. + If :attr:`input` is divisible by ``n`` along dimension :attr:`dim`, each + section will be of equal size, :code:`input.size(dim) / n`. If :attr:`input` + is not divisible by ``n``, the sizes of the first :code:`int(input.size(dim) % n)` + sections will have size :code:`int(input.size(dim) / n) + 1`, and the rest will + have size :code:`int(input.size(dim) / n)`. + + If :attr:`indices_or_sections` is a list or tuple of ints, or a one-dimensional long + tensor, then :attr:`input` is split along dimension :attr:`dim` at each of the indices + in the list, tuple or tensor. For instance, :code:`indices_or_sections=[2, 3]` and :code:`dim=0` + would result in the tensors :code:`input[:2]`, :code:`input[2:3]`, and :code:`input[3:]`. + + If :attr:`indices_or_sections` is a tensor, it must be a zero-dimensional or one-dimensional + long tensor on the CPU. + + dim (int, optional): dimension along which to split the tensor. 
Default: ``0`` + + Example:: + + >>> x = torch.arange(8) + >>> torch.tensor_split(x, 3) + (tensor([0, 1, 2]), tensor([3, 4, 5]), tensor([6, 7])) + + >>> x = torch.arange(7) + >>> torch.tensor_split(x, 3) + (tensor([0, 1, 2]), tensor([3, 4]), tensor([5, 6])) + >>> torch.tensor_split(x, (1, 6)) + (tensor([0]), tensor([1, 2, 3, 4, 5]), tensor([6])) + + >>> x = torch.arange(14).reshape(2, 7) + >>> x + tensor([[ 0, 1, 2, 3, 4, 5, 6], + [ 7, 8, 9, 10, 11, 12, 13]]) + >>> torch.tensor_split(x, 3, dim=1) + (tensor([[0, 1, 2], + [7, 8, 9]]), + tensor([[ 3, 4], + [10, 11]]), + tensor([[ 5, 6], + [12, 13]])) + >>> torch.tensor_split(x, (1, 6), dim=1) + (tensor([[0], + [7]]), + tensor([[ 1, 2, 3, 4, 5], + [ 8, 9, 10, 11, 12]]), + tensor([[ 6], + [13]])) + """ + ... +def threshold(input: Tensor, threshold: Union[Number, _complex], value: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: ... +def threshold_(input: Tensor, threshold: Union[Number, _complex], value: Union[Number, _complex]) -> Tensor: ... +def tile(input: Tensor, dims: Sequence[Union[_int, SymInt]]) -> Tensor: + r""" + tile(input, dims) -> Tensor + + Constructs a tensor by repeating the elements of :attr:`input`. + The :attr:`dims` argument specifies the number of repetitions + in each dimension. + + If :attr:`dims` specifies fewer dimensions than :attr:`input` has, then + ones are prepended to :attr:`dims` until all dimensions are specified. + For example, if :attr:`input` has shape (8, 6, 4, 2) and :attr:`dims` + is (2, 2), then :attr:`dims` is treated as (1, 1, 2, 2). + + Analogously, if :attr:`input` has fewer dimensions than :attr:`dims` + specifies, then :attr:`input` is treated as if it were unsqueezed at + dimension zero until it has as many dimensions as :attr:`dims` specifies. + For example, if :attr:`input` has shape (4, 2) and :attr:`dims` + is (3, 3, 2, 2), then :attr:`input` is treated as if it had the + shape (1, 1, 4, 2). + + .. note:: + + This function is similar to NumPy's tile function. + + Args: + input (Tensor): the tensor whose elements to repeat. + dims (tuple): the number of repetitions per dimension. + + Example:: + + >>> x = torch.tensor([1, 2, 3]) + >>> x.tile((2,)) + tensor([1, 2, 3, 1, 2, 3]) + >>> y = torch.tensor([[1, 2], [3, 4]]) + >>> torch.tile(y, (2, 2)) + tensor([[1, 2, 1, 2], + [3, 4, 3, 4], + [1, 2, 1, 2], + [3, 4, 3, 4]]) + """ + ... +def topk(input: Tensor, k: Union[_int, SymInt], dim: _int = -1, largest: _bool = True, sorted: _bool = True, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.topk: + r""" + topk(input, k, dim=None, largest=True, sorted=True, *, out=None) -> (Tensor, LongTensor) + + Returns the :attr:`k` largest elements of the given :attr:`input` tensor along + a given dimension. + + If :attr:`dim` is not given, the last dimension of the `input` is chosen. + + If :attr:`largest` is ``False`` then the `k` smallest elements are returned. + + A namedtuple of `(values, indices)` is returned with the `values` and + `indices` of the largest `k` elements of each row of the `input` tensor in the + given dimension `dim`. + + The boolean option :attr:`sorted` if ``True``, will make sure that the returned + `k` elements are themselves sorted + + Args: + input (Tensor): the input tensor. 
+ k (int): the k in "top-k" + dim (int, optional): the dimension to sort along + largest (bool, optional): controls whether to return largest or + smallest elements + sorted (bool, optional): controls whether to return the elements + in sorted order + + Keyword args: + out (tuple, optional): the output tuple of (Tensor, LongTensor) that can be + optionally given to be used as output buffers + + Example:: + + >>> x = torch.arange(1., 6.) + >>> x + tensor([ 1., 2., 3., 4., 5.]) + >>> torch.topk(x, 3) + torch.return_types.topk(values=tensor([5., 4., 3.]), indices=tensor([4, 3, 2])) + """ + ... +def trace(input: Tensor) -> Tensor: + r""" + trace(input) -> Tensor + + Returns the sum of the elements of the diagonal of the input 2-D matrix. + + Example:: + + >>> x = torch.arange(1., 10.).view(3, 3) + >>> x + tensor([[ 1., 2., 3.], + [ 4., 5., 6.], + [ 7., 8., 9.]]) + >>> torch.trace(x) + tensor(15.) + """ + ... +@overload +def transpose(input: Tensor, dim0: _int, dim1: _int) -> Tensor: + r""" + transpose(input, dim0, dim1) -> Tensor + + Returns a tensor that is a transposed version of :attr:`input`. + The given dimensions :attr:`dim0` and :attr:`dim1` are swapped. + + If :attr:`input` is a strided tensor then the resulting :attr:`out` + tensor shares its underlying storage with the :attr:`input` tensor, so + changing the content of one would change the content of the other. + + If :attr:`input` is a :ref:`sparse tensor ` then the + resulting :attr:`out` tensor *does not* share the underlying storage + with the :attr:`input` tensor. + + If :attr:`input` is a :ref:`sparse tensor ` with compressed + layout (SparseCSR, SparseBSR, SparseCSC or SparseBSC) the arguments + :attr:`dim0` and :attr:`dim1` must be both batch dimensions, or must + both be sparse dimensions. The batch dimensions of a sparse tensor are the + dimensions preceding the sparse dimensions. + + .. note:: + Transpositions which interchange the sparse dimensions of a `SparseCSR` + or `SparseCSC` layout tensor will result in the layout changing between + the two options. Transposition of the sparse dimensions of a ` SparseBSR` + or `SparseBSC` layout tensor will likewise generate a result with the + opposite layout. + + + Args: + input (Tensor): the input tensor. + dim0 (int): the first dimension to be transposed + dim1 (int): the second dimension to be transposed + + Example:: + + >>> x = torch.randn(2, 3) + >>> x + tensor([[ 1.0028, -0.9893, 0.5809], + [-0.1669, 0.7299, 0.4942]]) + >>> torch.transpose(x, 0, 1) + tensor([[ 1.0028, -0.1669], + [-0.9893, 0.7299], + [ 0.5809, 0.4942]]) + + See also :func:`torch.t`. + """ + ... +@overload +def transpose(input: Tensor, dim0: Union[str, ellipsis, None], dim1: Union[str, ellipsis, None]) -> Tensor: + r""" + transpose(input, dim0, dim1) -> Tensor + + Returns a tensor that is a transposed version of :attr:`input`. + The given dimensions :attr:`dim0` and :attr:`dim1` are swapped. + + If :attr:`input` is a strided tensor then the resulting :attr:`out` + tensor shares its underlying storage with the :attr:`input` tensor, so + changing the content of one would change the content of the other. + + If :attr:`input` is a :ref:`sparse tensor ` then the + resulting :attr:`out` tensor *does not* share the underlying storage + with the :attr:`input` tensor. + + If :attr:`input` is a :ref:`sparse tensor ` with compressed + layout (SparseCSR, SparseBSR, SparseCSC or SparseBSC) the arguments + :attr:`dim0` and :attr:`dim1` must be both batch dimensions, or must + both be sparse dimensions. 
The batch dimensions of a sparse tensor are the + dimensions preceding the sparse dimensions. + + .. note:: + Transpositions which interchange the sparse dimensions of a `SparseCSR` + or `SparseCSC` layout tensor will result in the layout changing between + the two options. Transposition of the sparse dimensions of a ` SparseBSR` + or `SparseBSC` layout tensor will likewise generate a result with the + opposite layout. + + + Args: + input (Tensor): the input tensor. + dim0 (int): the first dimension to be transposed + dim1 (int): the second dimension to be transposed + + Example:: + + >>> x = torch.randn(2, 3) + >>> x + tensor([[ 1.0028, -0.9893, 0.5809], + [-0.1669, 0.7299, 0.4942]]) + >>> torch.transpose(x, 0, 1) + tensor([[ 1.0028, -0.1669], + [-0.9893, 0.7299], + [ 0.5809, 0.4942]]) + + See also :func:`torch.t`. + """ + ... +def transpose_copy(input: Tensor, dim0: _int, dim1: _int, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.transpose`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +@overload +def trapezoid(y: Tensor, x: Tensor, *, dim: _int = -1) -> Tensor: + r""" + trapezoid(y, x=None, *, dx=None, dim=-1) -> Tensor + + Computes the `trapezoidal rule `_ along + :attr:`dim`. By default the spacing between elements is assumed to be 1, but + :attr:`dx` can be used to specify a different constant spacing, and :attr:`x` can be + used to specify arbitrary spacing along :attr:`dim`. + + + Assuming :attr:`y` is a one-dimensional tensor with elements :math:`{y_0, y_1, ..., y_n}`, + the default computation is + + .. math:: + \begin{aligned} + \sum_{i = 1}^{n-1} \frac{1}{2} (y_i + y_{i-1}) + \end{aligned} + + When :attr:`dx` is specified the computation becomes + + .. math:: + \begin{aligned} + \sum_{i = 1}^{n-1} \frac{\Delta x}{2} (y_i + y_{i-1}) + \end{aligned} + + effectively multiplying the result by :attr:`dx`. When :attr:`x` is specified, + assuming :attr:`x` is also a one-dimensional tensor with + elements :math:`{x_0, x_1, ..., x_n}`, the computation becomes + + .. math:: + \begin{aligned} + \sum_{i = 1}^{n-1} \frac{(x_i - x_{i-1})}{2} (y_i + y_{i-1}) + \end{aligned} + + When :attr:`x` and :attr:`y` have the same size, the computation is as described above and no broadcasting is needed. + The broadcasting behavior of this function is as follows when their sizes are different. For both :attr:`x` + and :attr:`y`, the function computes the difference between consecutive elements along + dimension :attr:`dim`. This effectively creates two tensors, `x_diff` and `y_diff`, that have + the same shape as the original tensors except their lengths along the dimension :attr:`dim` is reduced by 1. + After that, those two tensors are broadcast together to compute final output as part of the trapezoidal rule. + See the examples below for details. + + .. note:: + The trapezoidal rule is a technique for approximating the definite integral of a function + by averaging its left and right Riemann sums. The approximation becomes more accurate as + the resolution of the partition increases. + + Arguments: + y (Tensor): Values to use when computing the trapezoidal rule. + x (Tensor): If specified, defines spacing between values as specified above. + + Keyword arguments: + dx (float): constant spacing between values. If neither :attr:`x` or :attr:`dx` + are specified then this defaults to 1. Effectively multiplies the result by its value. + dim (int): The dimension along which to compute the trapezoidal rule. 
+ The last (inner-most) dimension by default. + + Examples:: + + >>> # Computes the trapezoidal rule in 1D, spacing is implicitly 1 + >>> y = torch.tensor([1, 5, 10]) + >>> torch.trapezoid(y) + tensor(10.5) + + >>> # Computes the same trapezoidal rule directly to verify + >>> (1 + 10 + 10) / 2 + 10.5 + + >>> # Computes the trapezoidal rule in 1D with constant spacing of 2 + >>> # NOTE: the result is the same as before, but multiplied by 2 + >>> torch.trapezoid(y, dx=2) + 21.0 + + >>> # Computes the trapezoidal rule in 1D with arbitrary spacing + >>> x = torch.tensor([1, 3, 6]) + >>> torch.trapezoid(y, x) + 28.5 + + >>> # Computes the same trapezoidal rule directly to verify + >>> ((3 - 1) * (1 + 5) + (6 - 3) * (5 + 10)) / 2 + 28.5 + + >>> # Computes the trapezoidal rule for each row of a 3x3 matrix + >>> y = torch.arange(9).reshape(3, 3) + tensor([[0, 1, 2], + [3, 4, 5], + [6, 7, 8]]) + >>> torch.trapezoid(y) + tensor([ 2., 8., 14.]) + + >>> # Computes the trapezoidal rule for each column of the matrix + >>> torch.trapezoid(y, dim=0) + tensor([ 6., 8., 10.]) + + >>> # Computes the trapezoidal rule for each row of a 3x3 ones matrix + >>> # with the same arbitrary spacing + >>> y = torch.ones(3, 3) + >>> x = torch.tensor([1, 3, 6]) + >>> torch.trapezoid(y, x) + array([5., 5., 5.]) + + >>> # Computes the trapezoidal rule for each row of a 3x3 ones matrix + >>> # with different arbitrary spacing per row + >>> y = torch.ones(3, 3) + >>> x = torch.tensor([[1, 2, 3], [1, 3, 5], [1, 4, 7]]) + >>> torch.trapezoid(y, x) + array([2., 4., 6.]) + """ + ... +@overload +def trapezoid(y: Tensor, *, dx: Union[Number, _complex] = 1, dim: _int = -1) -> Tensor: + r""" + trapezoid(y, x=None, *, dx=None, dim=-1) -> Tensor + + Computes the `trapezoidal rule `_ along + :attr:`dim`. By default the spacing between elements is assumed to be 1, but + :attr:`dx` can be used to specify a different constant spacing, and :attr:`x` can be + used to specify arbitrary spacing along :attr:`dim`. + + + Assuming :attr:`y` is a one-dimensional tensor with elements :math:`{y_0, y_1, ..., y_n}`, + the default computation is + + .. math:: + \begin{aligned} + \sum_{i = 1}^{n-1} \frac{1}{2} (y_i + y_{i-1}) + \end{aligned} + + When :attr:`dx` is specified the computation becomes + + .. math:: + \begin{aligned} + \sum_{i = 1}^{n-1} \frac{\Delta x}{2} (y_i + y_{i-1}) + \end{aligned} + + effectively multiplying the result by :attr:`dx`. When :attr:`x` is specified, + assuming :attr:`x` is also a one-dimensional tensor with + elements :math:`{x_0, x_1, ..., x_n}`, the computation becomes + + .. math:: + \begin{aligned} + \sum_{i = 1}^{n-1} \frac{(x_i - x_{i-1})}{2} (y_i + y_{i-1}) + \end{aligned} + + When :attr:`x` and :attr:`y` have the same size, the computation is as described above and no broadcasting is needed. + The broadcasting behavior of this function is as follows when their sizes are different. For both :attr:`x` + and :attr:`y`, the function computes the difference between consecutive elements along + dimension :attr:`dim`. This effectively creates two tensors, `x_diff` and `y_diff`, that have + the same shape as the original tensors except their lengths along the dimension :attr:`dim` is reduced by 1. + After that, those two tensors are broadcast together to compute final output as part of the trapezoidal rule. + See the examples below for details. + + .. note:: + The trapezoidal rule is a technique for approximating the definite integral of a function + by averaging its left and right Riemann sums. 
The approximation becomes more accurate as + the resolution of the partition increases. + + Arguments: + y (Tensor): Values to use when computing the trapezoidal rule. + x (Tensor): If specified, defines spacing between values as specified above. + + Keyword arguments: + dx (float): constant spacing between values. If neither :attr:`x` or :attr:`dx` + are specified then this defaults to 1. Effectively multiplies the result by its value. + dim (int): The dimension along which to compute the trapezoidal rule. + The last (inner-most) dimension by default. + + Examples:: + + >>> # Computes the trapezoidal rule in 1D, spacing is implicitly 1 + >>> y = torch.tensor([1, 5, 10]) + >>> torch.trapezoid(y) + tensor(10.5) + + >>> # Computes the same trapezoidal rule directly to verify + >>> (1 + 10 + 10) / 2 + 10.5 + + >>> # Computes the trapezoidal rule in 1D with constant spacing of 2 + >>> # NOTE: the result is the same as before, but multiplied by 2 + >>> torch.trapezoid(y, dx=2) + 21.0 + + >>> # Computes the trapezoidal rule in 1D with arbitrary spacing + >>> x = torch.tensor([1, 3, 6]) + >>> torch.trapezoid(y, x) + 28.5 + + >>> # Computes the same trapezoidal rule directly to verify + >>> ((3 - 1) * (1 + 5) + (6 - 3) * (5 + 10)) / 2 + 28.5 + + >>> # Computes the trapezoidal rule for each row of a 3x3 matrix + >>> y = torch.arange(9).reshape(3, 3) + tensor([[0, 1, 2], + [3, 4, 5], + [6, 7, 8]]) + >>> torch.trapezoid(y) + tensor([ 2., 8., 14.]) + + >>> # Computes the trapezoidal rule for each column of the matrix + >>> torch.trapezoid(y, dim=0) + tensor([ 6., 8., 10.]) + + >>> # Computes the trapezoidal rule for each row of a 3x3 ones matrix + >>> # with the same arbitrary spacing + >>> y = torch.ones(3, 3) + >>> x = torch.tensor([1, 3, 6]) + >>> torch.trapezoid(y, x) + array([5., 5., 5.]) + + >>> # Computes the trapezoidal rule for each row of a 3x3 ones matrix + >>> # with different arbitrary spacing per row + >>> y = torch.ones(3, 3) + >>> x = torch.tensor([[1, 2, 3], [1, 3, 5], [1, 4, 7]]) + >>> torch.trapezoid(y, x) + array([2., 4., 6.]) + """ + ... +@overload +def trapz(y: Tensor, *, dx: _float = 1, dim: _int = -1) -> Tensor: + r""" + trapz(y, x, *, dim=-1) -> Tensor + + Alias for :func:`torch.trapezoid`. + """ + ... +@overload +def trapz(y: Tensor, x: Tensor, *, dim: _int = -1) -> Tensor: + r""" + trapz(y, x, *, dim=-1) -> Tensor + + Alias for :func:`torch.trapezoid`. + """ + ... +def triangular_solve(input: Tensor, A: Tensor, upper: _bool = True, transpose: _bool = False, unitriangular: _bool = False, *, out: Union[Tensor, Tuple[Tensor, ...], List[Tensor], None] = None) -> torch.return_types.triangular_solve: + r""" + triangular_solve(b, A, upper=True, transpose=False, unitriangular=False, *, out=None) -> (Tensor, Tensor) + + Solves a system of equations with a square upper or lower triangular invertible matrix :math:`A` + and multiple right-hand sides :math:`b`. + + In symbols, it solves :math:`AX = b` and assumes :math:`A` is square upper-triangular + (or lower-triangular if :attr:`upper`\ `= False`) and does not have zeros on the diagonal. + + `torch.triangular_solve(b, A)` can take in 2D inputs `b, A` or inputs that are + batches of 2D matrices. If the inputs are batches, then returns + batched outputs `X` + + If the diagonal of :attr:`A` contains zeros or elements that are very close to zero and + :attr:`unitriangular`\ `= False` (default) or if the input matrix is badly conditioned, + the result may contain `NaN` s. + + Supports input of float, double, cfloat and cdouble data types. 
+ + .. warning:: + + :func:`torch.triangular_solve` is deprecated in favor of :func:`torch.linalg.solve_triangular` + and will be removed in a future PyTorch release. + :func:`torch.linalg.solve_triangular` has its arguments reversed and does not return a + copy of one of the inputs. + + ``X = torch.triangular_solve(B, A).solution`` should be replaced with + + .. code:: python + + X = torch.linalg.solve_triangular(A, B) + + Args: + b (Tensor): multiple right-hand sides of size :math:`(*, m, k)` where + :math:`*` is zero of more batch dimensions + A (Tensor): the input triangular coefficient matrix of size :math:`(*, m, m)` + where :math:`*` is zero or more batch dimensions + upper (bool, optional): whether :math:`A` is upper or lower triangular. Default: ``True``. + transpose (bool, optional): solves `op(A)X = b` where `op(A) = A^T` if this flag is ``True``, + and `op(A) = A` if it is ``False``. Default: ``False``. + unitriangular (bool, optional): whether :math:`A` is unit triangular. + If True, the diagonal elements of :math:`A` are assumed to be + 1 and not referenced from :math:`A`. Default: ``False``. + + Keyword args: + out ((Tensor, Tensor), optional): tuple of two tensors to write + the output to. Ignored if `None`. Default: `None`. + + Returns: + A namedtuple `(solution, cloned_coefficient)` where `cloned_coefficient` + is a clone of :math:`A` and `solution` is the solution :math:`X` to :math:`AX = b` + (or whatever variant of the system of equations, depending on the keyword arguments.) + + Examples:: + + >>> A = torch.randn(2, 2).triu() + >>> A + tensor([[ 1.1527, -1.0753], + [ 0.0000, 0.7986]]) + >>> b = torch.randn(2, 3) + >>> b + tensor([[-0.0210, 2.3513, -1.5492], + [ 1.5429, 0.7403, -1.0243]]) + >>> torch.triangular_solve(b, A) + torch.return_types.triangular_solve( + solution=tensor([[ 1.7841, 2.9046, -2.5405], + [ 1.9320, 0.9270, -1.2826]]), + cloned_coefficient=tensor([[ 1.1527, -1.0753], + [ 0.0000, 0.7986]])) + """ + ... +def tril(input: Tensor, diagonal: _int = 0, *, out: Optional[Tensor] = None) -> Tensor: + r""" + tril(input, diagonal=0, *, out=None) -> Tensor + + Returns the lower triangular part of the matrix (2-D tensor) or batch of matrices + :attr:`input`, the other elements of the result tensor :attr:`out` are set to 0. + + The lower triangular part of the matrix is defined as the elements on and + below the diagonal. + + The argument :attr:`diagonal` controls which diagonal to consider. If + :attr:`diagonal` = 0, all elements on and below the main diagonal are + retained. A positive value includes just as many diagonals above the main + diagonal, and similarly a negative value excludes just as many diagonals below + the main diagonal. The main diagonal are the set of indices + :math:`\lbrace (i, i) \rbrace` for :math:`i \in [0, \min\{d_{1}, d_{2}\} - 1]` where + :math:`d_{1}, d_{2}` are the dimensions of the matrix. + + Args: + input (Tensor): the input tensor. + diagonal (int, optional): the diagonal to consider + + Keyword args: + out (Tensor, optional): the output tensor. 
+ + Example:: + + >>> a = torch.randn(3, 3) + >>> a + tensor([[-1.0813, -0.8619, 0.7105], + [ 0.0935, 0.1380, 2.2112], + [-0.3409, -0.9828, 0.0289]]) + >>> torch.tril(a) + tensor([[-1.0813, 0.0000, 0.0000], + [ 0.0935, 0.1380, 0.0000], + [-0.3409, -0.9828, 0.0289]]) + + >>> b = torch.randn(4, 6) + >>> b + tensor([[ 1.2219, 0.5653, -0.2521, -0.2345, 1.2544, 0.3461], + [ 0.4785, -0.4477, 0.6049, 0.6368, 0.8775, 0.7145], + [ 1.1502, 3.2716, -1.1243, -0.5413, 0.3615, 0.6864], + [-0.0614, -0.7344, -1.3164, -0.7648, -1.4024, 0.0978]]) + >>> torch.tril(b, diagonal=1) + tensor([[ 1.2219, 0.5653, 0.0000, 0.0000, 0.0000, 0.0000], + [ 0.4785, -0.4477, 0.6049, 0.0000, 0.0000, 0.0000], + [ 1.1502, 3.2716, -1.1243, -0.5413, 0.0000, 0.0000], + [-0.0614, -0.7344, -1.3164, -0.7648, -1.4024, 0.0000]]) + >>> torch.tril(b, diagonal=-1) + tensor([[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [ 0.4785, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [ 1.1502, 3.2716, 0.0000, 0.0000, 0.0000, 0.0000], + [-0.0614, -0.7344, -1.3164, 0.0000, 0.0000, 0.0000]]) + """ + ... +def tril_indices(row: _int, col: _int, offset: _int = 0, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + tril_indices(row, col, offset=0, *, dtype=torch.long, device='cpu', layout=torch.strided) -> Tensor + + Returns the indices of the lower triangular part of a :attr:`row`-by- + :attr:`col` matrix in a 2-by-N Tensor, where the first row contains row + coordinates of all indices and the second row contains column coordinates. + Indices are ordered based on rows and then columns. + + The lower triangular part of the matrix is defined as the elements on and + below the diagonal. + + The argument :attr:`offset` controls which diagonal to consider. If + :attr:`offset` = 0, all elements on and below the main diagonal are + retained. A positive value includes just as many diagonals above the main + diagonal, and similarly a negative value excludes just as many diagonals below + the main diagonal. The main diagonal are the set of indices + :math:`\lbrace (i, i) \rbrace` for :math:`i \in [0, \min\{d_{1}, d_{2}\} - 1]` + where :math:`d_{1}, d_{2}` are the dimensions of the matrix. + + .. note:: + When running on CUDA, ``row * col`` must be less than :math:`2^{59}` to + prevent overflow during calculation. + + Args: + row (``int``): number of rows in the 2-D matrix. + col (``int``): number of columns in the 2-D matrix. + offset (``int``): diagonal offset from the main diagonal. + Default: if not provided, 0. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, ``torch.long``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + layout (:class:`torch.layout`, optional): currently only support ``torch.strided``. + + Example:: + + >>> a = torch.tril_indices(3, 3) + >>> a + tensor([[0, 1, 1, 2, 2, 2], + [0, 0, 1, 0, 1, 2]]) + + >>> a = torch.tril_indices(4, 3, -1) + >>> a + tensor([[1, 2, 2, 3, 3, 3], + [0, 0, 1, 0, 1, 2]]) + + >>> a = torch.tril_indices(4, 3, 1) + >>> a + tensor([[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3], + [0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2]]) + """ + ... 
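+# Editor's note: a minimal illustrative sketch (not part of the generated
+# stub) showing how the coordinates returned by torch.tril_indices relate to
+# torch.tril above; the matrix and its size are arbitrary assumptions.
+#
+#   import torch
+#   a = torch.randn(3, 3)
+#   idx = torch.tril_indices(3, 3)               # 2-by-N row/column coordinates
+#   lower = torch.zeros_like(a)
+#   lower[idx[0], idx[1]] = a[idx[0], idx[1]]    # copy only the on/below-diagonal entries
+#   torch.testing.assert_close(lower, torch.tril(a))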
+def triplet_margin_loss(anchor: Tensor, positive: Tensor, negative: Tensor, margin: _float = 1.0, p: _float = 2, eps: _float = 1e-06, swap: _bool = False, reduction: _int = 1) -> Tensor: ... +def triu(input: Tensor, diagonal: _int = 0, *, out: Optional[Tensor] = None) -> Tensor: + r""" + triu(input, diagonal=0, *, out=None) -> Tensor + + Returns the upper triangular part of a matrix (2-D tensor) or batch of matrices + :attr:`input`, the other elements of the result tensor :attr:`out` are set to 0. + + The upper triangular part of the matrix is defined as the elements on and + above the diagonal. + + The argument :attr:`diagonal` controls which diagonal to consider. If + :attr:`diagonal` = 0, all elements on and above the main diagonal are + retained. A positive value excludes just as many diagonals above the main + diagonal, and similarly a negative value includes just as many diagonals below + the main diagonal. The main diagonal are the set of indices + :math:`\lbrace (i, i) \rbrace` for :math:`i \in [0, \min\{d_{1}, d_{2}\} - 1]` where + :math:`d_{1}, d_{2}` are the dimensions of the matrix. + + Args: + input (Tensor): the input tensor. + diagonal (int, optional): the diagonal to consider + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(3, 3) + >>> a + tensor([[ 0.2309, 0.5207, 2.0049], + [ 0.2072, -1.0680, 0.6602], + [ 0.3480, -0.5211, -0.4573]]) + >>> torch.triu(a) + tensor([[ 0.2309, 0.5207, 2.0049], + [ 0.0000, -1.0680, 0.6602], + [ 0.0000, 0.0000, -0.4573]]) + >>> torch.triu(a, diagonal=1) + tensor([[ 0.0000, 0.5207, 2.0049], + [ 0.0000, 0.0000, 0.6602], + [ 0.0000, 0.0000, 0.0000]]) + >>> torch.triu(a, diagonal=-1) + tensor([[ 0.2309, 0.5207, 2.0049], + [ 0.2072, -1.0680, 0.6602], + [ 0.0000, -0.5211, -0.4573]]) + + >>> b = torch.randn(4, 6) + >>> b + tensor([[ 0.5876, -0.0794, -1.8373, 0.6654, 0.2604, 1.5235], + [-0.2447, 0.9556, -1.2919, 1.3378, -0.1768, -1.0857], + [ 0.4333, 0.3146, 0.6576, -1.0432, 0.9348, -0.4410], + [-0.9888, 1.0679, -1.3337, -1.6556, 0.4798, 0.2830]]) + >>> torch.triu(b, diagonal=1) + tensor([[ 0.0000, -0.0794, -1.8373, 0.6654, 0.2604, 1.5235], + [ 0.0000, 0.0000, -1.2919, 1.3378, -0.1768, -1.0857], + [ 0.0000, 0.0000, 0.0000, -1.0432, 0.9348, -0.4410], + [ 0.0000, 0.0000, 0.0000, 0.0000, 0.4798, 0.2830]]) + >>> torch.triu(b, diagonal=-1) + tensor([[ 0.5876, -0.0794, -1.8373, 0.6654, 0.2604, 1.5235], + [-0.2447, 0.9556, -1.2919, 1.3378, -0.1768, -1.0857], + [ 0.0000, 0.3146, 0.6576, -1.0432, 0.9348, -0.4410], + [ 0.0000, 0.0000, -1.3337, -1.6556, 0.4798, 0.2830]]) + """ + ... +def triu_indices(row: _int, col: _int, offset: _int = 0, *, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + triu_indices(row, col, offset=0, *, dtype=torch.long, device='cpu', layout=torch.strided) -> Tensor + + Returns the indices of the upper triangular part of a :attr:`row` by + :attr:`col` matrix in a 2-by-N Tensor, where the first row contains row + coordinates of all indices and the second row contains column coordinates. + Indices are ordered based on rows and then columns. + + The upper triangular part of the matrix is defined as the elements on and + above the diagonal. + + The argument :attr:`offset` controls which diagonal to consider. If + :attr:`offset` = 0, all elements on and above the main diagonal are + retained. 
A positive value excludes just as many diagonals above the main + diagonal, and similarly a negative value includes just as many diagonals below + the main diagonal. The main diagonal are the set of indices + :math:`\lbrace (i, i) \rbrace` for :math:`i \in [0, \min\{d_{1}, d_{2}\} - 1]` + where :math:`d_{1}, d_{2}` are the dimensions of the matrix. + + .. note:: + When running on CUDA, ``row * col`` must be less than :math:`2^{59}` to + prevent overflow during calculation. + + Args: + row (``int``): number of rows in the 2-D matrix. + col (``int``): number of columns in the 2-D matrix. + offset (``int``): diagonal offset from the main diagonal. + Default: if not provided, 0. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, ``torch.long``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + layout (:class:`torch.layout`, optional): currently only support ``torch.strided``. + + Example:: + + >>> a = torch.triu_indices(3, 3) + >>> a + tensor([[0, 0, 0, 1, 1, 2], + [0, 1, 2, 1, 2, 2]]) + + >>> a = torch.triu_indices(4, 3, -1) + >>> a + tensor([[0, 0, 0, 1, 1, 1, 2, 2, 3], + [0, 1, 2, 0, 1, 2, 1, 2, 2]]) + + >>> a = torch.triu_indices(4, 3, 1) + >>> a + tensor([[0, 0, 1], + [1, 2, 2]]) + """ + ... +def true_divide(input: Union[Tensor, Number], other: Union[Tensor, Number], *, out: Optional[Tensor] = None) -> Tensor: + r""" + true_divide(dividend, divisor, *, out) -> Tensor + + Alias for :func:`torch.div` with ``rounding_mode=None``. + """ + ... +def trunc(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + trunc(input, *, out=None) -> Tensor + + Returns a new tensor with the truncated integer values of + the elements of :attr:`input`. + + For integer inputs, follows the array-api convention of returning a + copy of the input tensor. + + Args: + input (Tensor): the input tensor. + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 3.4742, 0.5466, -0.8008, -0.9079]) + >>> torch.trunc(a) + tensor([ 3., 0., -0., -0.]) + """ + ... +def trunc_(input: Tensor) -> Tensor: ... +@overload +def unbind(input: Tensor, dim: _int = 0) -> Tuple[Tensor, ...]: + r""" + unbind(input, dim=0) -> seq + + Removes a tensor dimension. + + Returns a tuple of all slices along a given dimension, already without it. + + Arguments: + input (Tensor): the tensor to unbind + dim (int): dimension to remove + + Example:: + + >>> torch.unbind(torch.tensor([[1, 2, 3], + >>> [4, 5, 6], + >>> [7, 8, 9]])) + (tensor([1, 2, 3]), tensor([4, 5, 6]), tensor([7, 8, 9])) + """ + ... +@overload +def unbind(input: Tensor, dim: Union[str, ellipsis, None]) -> Tuple[Tensor, ...]: + r""" + unbind(input, dim=0) -> seq + + Removes a tensor dimension. + + Returns a tuple of all slices along a given dimension, already without it. + + Arguments: + input (Tensor): the tensor to unbind + dim (int): dimension to remove + + Example:: + + >>> torch.unbind(torch.tensor([[1, 2, 3], + >>> [4, 5, 6], + >>> [7, 8, 9]])) + (tensor([1, 2, 3]), tensor([4, 5, 6]), tensor([7, 8, 9])) + """ + ... 
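+# Editor's note: a minimal illustrative sketch (not part of the generated
+# stub) of the complementary diagonal conventions documented for torch.triu
+# and torch.tril; the random input is an arbitrary assumption.
+#
+#   import torch
+#   a = torch.randn(4, 4)
+#   strict_upper = torch.triu(a, diagonal=1)     # strictly above the main diagonal
+#   inclusive_lower = torch.tril(a)              # on and below the main diagonal
+#   torch.testing.assert_close(strict_upper + inclusive_lower, a)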
+def unbind_copy(input: Tensor, dim: _int = 0, *, out: Union[Tuple[Tensor, ...], List[Tensor], None] = None) -> None: + r""" + Performs the same operation as :func:`torch.unbind`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +@overload +def unflatten(input: Tensor, dim: Union[str, ellipsis, None], sizes: Sequence[Union[_int, SymInt]], names: Sequence[Union[str, ellipsis, None]]) -> Tensor: + r""" + unflatten(input, dim, sizes) -> Tensor + + Expands a dimension of the input tensor over multiple dimensions. + + .. seealso:: + + :func:`torch.flatten` the inverse of this function. It coalesces several dimensions into one. + + Args: + input (Tensor): the input tensor. + dim (int): Dimension to be unflattened, specified as an index into + ``input.shape``. + sizes (Tuple[int]): New shape of the unflattened dimension. + One of its elements can be `-1` in which case the corresponding output + dimension is inferred. Otherwise, the product of ``sizes`` *must* + equal ``input.shape[dim]``. + + Returns: + A View of input with the specified dimension unflattened. + + Examples:: + >>> torch.unflatten(torch.randn(3, 4, 1), 1, (2, 2)).shape + torch.Size([3, 2, 2, 1]) + >>> torch.unflatten(torch.randn(3, 4, 1), 1, (-1, 2)).shape + torch.Size([3, 2, 2, 1]) + >>> torch.unflatten(torch.randn(5, 12, 3), -2, (2, 2, 3, 1, 1)).shape + torch.Size([5, 2, 2, 3, 1, 1, 3]) + """ + ... +@overload +def unflatten(input: Tensor, dim: _int, sizes: Sequence[Union[_int, SymInt]]) -> Tensor: + r""" + unflatten(input, dim, sizes) -> Tensor + + Expands a dimension of the input tensor over multiple dimensions. + + .. seealso:: + + :func:`torch.flatten` the inverse of this function. It coalesces several dimensions into one. + + Args: + input (Tensor): the input tensor. + dim (int): Dimension to be unflattened, specified as an index into + ``input.shape``. + sizes (Tuple[int]): New shape of the unflattened dimension. + One of its elements can be `-1` in which case the corresponding output + dimension is inferred. Otherwise, the product of ``sizes`` *must* + equal ``input.shape[dim]``. + + Returns: + A View of input with the specified dimension unflattened. + + Examples:: + >>> torch.unflatten(torch.randn(3, 4, 1), 1, (2, 2)).shape + torch.Size([3, 2, 2, 1]) + >>> torch.unflatten(torch.randn(3, 4, 1), 1, (-1, 2)).shape + torch.Size([3, 2, 2, 1]) + >>> torch.unflatten(torch.randn(5, 12, 3), -2, (2, 2, 3, 1, 1)).shape + torch.Size([5, 2, 2, 3, 1, 1, 3]) + """ + ... +def unfold_copy(input: Tensor, dimension: _int, size: _int, step: _int, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.unfold`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def unique_dim(input: Tensor, dim: _int, sorted: _bool = True, return_inverse: _bool = False, return_counts: _bool = False) -> Tuple[Tensor, Tensor, Tensor]: ... +def unsafe_chunk(input: Tensor, chunks: _int, dim: _int = 0) -> Tuple[Tensor, ...]: + r""" + unsafe_chunk(input, chunks, dim=0) -> List of Tensors + + Works like :func:`torch.chunk` but without enforcing the autograd restrictions + on inplace modification of the outputs. + + .. warning:: + This function is safe to use as long as only the input, or only the outputs + are modified inplace after calling this function. It is user's + responsibility to ensure that is the case. If both the input and one or more + of the outputs are modified inplace, gradients computed by autograd will be + silently incorrect. 
+ """ + ... +def unsafe_split(input: Tensor, split_size: Union[_int, SymInt], dim: _int = 0) -> Tuple[Tensor, ...]: + r""" + unsafe_split(tensor, split_size_or_sections, dim=0) -> List of Tensors + + Works like :func:`torch.split` but without enforcing the autograd restrictions + on inplace modification of the outputs. + + .. warning:: + This function is safe to use as long as only the input, or only the outputs + are modified inplace after calling this function. It is user's + responsibility to ensure that is the case. If both the input and one or more + of the outputs are modified inplace, gradients computed by autograd will be + silently incorrect. + """ + ... +def unsafe_split_with_sizes(input: Tensor, split_sizes: Sequence[Union[_int, SymInt]], dim: _int = 0) -> Tuple[Tensor, ...]: ... +def unsqueeze(input: Tensor, dim: _int) -> Tensor: + r""" + unsqueeze(input, dim) -> Tensor + + Returns a new tensor with a dimension of size one inserted at the + specified position. + + The returned tensor shares the same underlying data with this tensor. + + A :attr:`dim` value within the range ``[-input.dim() - 1, input.dim() + 1)`` + can be used. Negative :attr:`dim` will correspond to :meth:`unsqueeze` + applied at :attr:`dim` = ``dim + input.dim() + 1``. + + Args: + input (Tensor): the input tensor. + dim (int): the index at which to insert the singleton dimension + + Example:: + + >>> x = torch.tensor([1, 2, 3, 4]) + >>> torch.unsqueeze(x, 0) + tensor([[ 1, 2, 3, 4]]) + >>> torch.unsqueeze(x, 1) + tensor([[ 1], + [ 2], + [ 3], + [ 4]]) + """ + ... +def unsqueeze_copy(input: Tensor, dim: _int, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.unsqueeze`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def values_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.values`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def vander(x: Tensor, N: Optional[_int] = None, increasing: _bool = False) -> Tensor: + r""" + vander(x, N=None, increasing=False) -> Tensor + + Generates a Vandermonde matrix. + + The columns of the output matrix are elementwise powers of the input vector :math:`x^{(N-1)}, x^{(N-2)}, ..., x^0`. + If increasing is True, the order of the columns is reversed :math:`x^0, x^1, ..., x^{(N-1)}`. Such a + matrix with a geometric progression in each row is named for Alexandre-Theophile Vandermonde. + + Arguments: + x (Tensor): 1-D input tensor. + N (int, optional): Number of columns in the output. If N is not specified, + a square array is returned :math:`(N = len(x))`. + increasing (bool, optional): Order of the powers of the columns. If True, + the powers increase from left to right, if False (the default) they are reversed. + + Returns: + Tensor: Vandermonde matrix. If increasing is False, the first column is :math:`x^{(N-1)}`, + the second :math:`x^{(N-2)}` and so forth. If increasing is True, the columns + are :math:`x^0, x^1, ..., x^{(N-1)}`. + + Example:: + + >>> x = torch.tensor([1, 2, 3, 5]) + >>> torch.vander(x) + tensor([[ 1, 1, 1, 1], + [ 8, 4, 2, 1], + [ 27, 9, 3, 1], + [125, 25, 5, 1]]) + >>> torch.vander(x, N=3) + tensor([[ 1, 1, 1], + [ 4, 2, 1], + [ 9, 3, 1], + [25, 5, 1]]) + >>> torch.vander(x, N=3, increasing=True) + tensor([[ 1, 1, 1], + [ 1, 2, 4], + [ 1, 3, 9], + [ 1, 5, 25]]) + """ + ... 
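+# Editor's note: a minimal illustrative sketch (not part of the generated
+# stub) of the column convention described for torch.vander above; the input
+# vector is an arbitrary assumption.
+#
+#   import torch
+#   x = torch.tensor([1., 2., 3.])
+#   V = torch.vander(x, N=3, increasing=True)    # columns are x**0, x**1, x**2
+#   torch.testing.assert_close(V, torch.stack([x**0, x**1, x**2], dim=1))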
+@overload +def var(input: Tensor, dim: Optional[Union[_int, _size]], unbiased: _bool = True, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + var(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor + + Calculates the variance over the dimensions specified by :attr:`dim`. :attr:`dim` + can be a single dimension, list of dimensions, or ``None`` to reduce over all + dimensions. + + The variance (:math:`\sigma^2`) is calculated as + + .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2 + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.var(a, dim=1, keepdim=True) + tensor([[1.0631], + [0.5590], + [1.4893], + [0.8258]]) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def var(input: Tensor, dim: Optional[Union[_int, _size]] = None, *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + var(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor + + Calculates the variance over the dimensions specified by :attr:`dim`. :attr:`dim` + can be a single dimension, list of dimensions, or ``None`` to reduce over all + dimensions. + + The variance (:math:`\sigma^2`) is calculated as + + .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2 + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. 
versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.var(a, dim=1, keepdim=True) + tensor([[1.0631], + [0.5590], + [1.4893], + [0.8258]]) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def var(input: Tensor, unbiased: _bool = True) -> Tensor: + r""" + var(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor + + Calculates the variance over the dimensions specified by :attr:`dim`. :attr:`dim` + can be a single dimension, list of dimensions, or ``None`` to reduce over all + dimensions. + + The variance (:math:`\sigma^2`) is calculated as + + .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2 + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.var(a, dim=1, keepdim=True) + tensor([[1.0631], + [0.5590], + [1.4893], + [0.8258]]) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def var(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False, out: Optional[Tensor] = None) -> Tensor: + r""" + var(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor + + Calculates the variance over the dimensions specified by :attr:`dim`. :attr:`dim` + can be a single dimension, list of dimensions, or ``None`` to reduce over all + dimensions. + + The variance (:math:`\sigma^2`) is calculated as + + .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2 + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. 
+ + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.var(a, dim=1, keepdim=True) + tensor([[1.0631], + [0.5590], + [1.4893], + [0.8258]]) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def var(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], unbiased: _bool = True, keepdim: _bool = False, *, out: Optional[Tensor] = None) -> Tensor: + r""" + var(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor + + Calculates the variance over the dimensions specified by :attr:`dim`. :attr:`dim` + can be a single dimension, list of dimensions, or ``None`` to reduce over all + dimensions. + + The variance (:math:`\sigma^2`) is calculated as + + .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2 + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.var(a, dim=1, keepdim=True) + tensor([[1.0631], + [0.5590], + [1.4893], + [0.8258]]) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... 
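+# Editor's note: a minimal illustrative sketch (not part of the generated
+# stub) of the `correction` keyword documented for torch.var above:
+# correction=1 (Bessel's correction) divides by N - 1, correction=0 divides
+# by N. The sample size is an arbitrary assumption.
+#
+#   import torch
+#   a = torch.randn(100)
+#   n = a.numel()
+#   sq_dev = (a - a.mean()) ** 2
+#   torch.testing.assert_close(torch.var(a, correction=0), sq_dev.sum() / n)
+#   torch.testing.assert_close(torch.var(a, correction=1), sq_dev.sum() / (n - 1))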
+@overload +def var_mean(input: Tensor, dim: Optional[Union[_int, _size]], unbiased: _bool = True, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: + r""" + var_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor) + + Calculates the variance and mean over the dimensions specified by :attr:`dim`. + :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to + reduce over all dimensions. + + The variance (:math:`\sigma^2`) is calculated as + + .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2 + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Returns: + A tuple (var, mean) containing the variance and mean. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.var_mean(a, dim=0, keepdim=True) + (tensor([[1.5926, 1.0056, 1.2005, 0.3646]]), + tensor([[ 0.0645, 0.4485, 0.8707, -0.0665]])) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def var_mean(input: Tensor, dim: Optional[Union[_int, _size]] = None, *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: + r""" + var_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor) + + Calculates the variance and mean over the dimensions specified by :attr:`dim`. + :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to + reduce over all dimensions. + + The variance (:math:`\sigma^2`) is calculated as + + .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2 + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. 
+ + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Returns: + A tuple (var, mean) containing the variance and mean. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.var_mean(a, dim=0, keepdim=True) + (tensor([[1.5926, 1.0056, 1.2005, 0.3646]]), + tensor([[ 0.0645, 0.4485, 0.8707, -0.0665]])) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def var_mean(input: Tensor, unbiased: _bool = True) -> Tuple[Tensor, Tensor]: + r""" + var_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor) + + Calculates the variance and mean over the dimensions specified by :attr:`dim`. + :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to + reduce over all dimensions. + + The variance (:math:`\sigma^2`) is calculated as + + .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2 + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Returns: + A tuple (var, mean) containing the variance and mean. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.var_mean(a, dim=0, keepdim=True) + (tensor([[1.5926, 1.0056, 1.2005, 0.3646]]), + tensor([[ 0.0645, 0.4485, 0.8707, -0.0665]])) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def var_mean(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], *, correction: Optional[Union[Number, _complex]] = None, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: + r""" + var_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor) + + Calculates the variance and mean over the dimensions specified by :attr:`dim`. 
+ :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to + reduce over all dimensions. + + The variance (:math:`\sigma^2`) is calculated as + + .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2 + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. + out (Tensor, optional): the output tensor. + + Returns: + A tuple (var, mean) containing the variance and mean. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.var_mean(a, dim=0, keepdim=True) + (tensor([[1.5926, 1.0056, 1.2005, 0.3646]]), + tensor([[ 0.0645, 0.4485, 0.8707, -0.0665]])) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +@overload +def var_mean(input: Tensor, dim: Sequence[Union[str, ellipsis, None]], unbiased: _bool = True, keepdim: _bool = False) -> Tuple[Tensor, Tensor]: + r""" + var_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor) + + Calculates the variance and mean over the dimensions specified by :attr:`dim`. + :attr:`dim` can be a single dimension, list of dimensions, or ``None`` to + reduce over all dimensions. + + The variance (:math:`\sigma^2`) is calculated as + + .. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2 + + where :math:`x` is the sample set of elements, :math:`\bar{x}` is the + sample mean, :math:`N` is the number of samples and :math:`\delta N` is + the :attr:`correction`. + + + + If :attr:`keepdim` is ``True``, the output tensor is of the same size + as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. + Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the + output tensor having 1 (or ``len(dim)``) fewer dimension(s). + + + Args: + input (Tensor): the input tensor. + + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. + + + Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. 
+ out (Tensor, optional): the output tensor. + + Returns: + A tuple (var, mean) containing the variance and mean. + + Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.var_mean(a, dim=0, keepdim=True) + (tensor([[1.5926, 1.0056, 1.2005, 0.3646]]), + tensor([[ 0.0645, 0.4485, 0.8707, -0.0665]])) + + .. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + """ + ... +def vdot(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + vdot(input, other, *, out=None) -> Tensor + + Computes the dot product of two 1D vectors along a dimension. + + In symbols, this function computes + + .. math:: + + \sum_{i=1}^n \overline{x_i}y_i. + + where :math:`\overline{x_i}` denotes the conjugate for complex + vectors, and it is the identity for real vectors. + + .. note:: + + Unlike NumPy's vdot, torch.vdot intentionally only supports computing the dot product + of two 1D tensors with the same number of elements. + + .. seealso:: + + :func:`torch.linalg.vecdot` computes the dot product of two batches of vectors along a dimension. + + Args: + input (Tensor): first tensor in the dot product, must be 1D. Its conjugate is used if it's complex. + other (Tensor): second tensor in the dot product, must be 1D. + + Keyword args: + + .. note:: out (Tensor, optional): the output tensor. + + + Example:: + + >>> torch.vdot(torch.tensor([2, 3]), torch.tensor([2, 1])) + tensor(7) + >>> a = torch.tensor((1 +2j, 3 - 1j)) + >>> b = torch.tensor((2 +1j, 4 - 0j)) + >>> torch.vdot(a, b) + tensor([16.+1.j]) + >>> torch.vdot(b, a) + tensor([16.-1.j]) + """ + ... +def view_as_complex(input: Tensor) -> Tensor: + r""" + view_as_complex(input) -> Tensor + + Returns a view of :attr:`input` as a complex tensor. For an input complex + tensor of :attr:`size` :math:`m1, m2, \dots, mi, 2`, this function returns a + new complex tensor of :attr:`size` :math:`m1, m2, \dots, mi` where the last + dimension of the input tensor is expected to represent the real and imaginary + components of complex numbers. + + .. warning:: + :func:`view_as_complex` is only supported for tensors with + :class:`torch.dtype` ``torch.float64`` and ``torch.float32``. The input is + expected to have the last dimension of :attr:`size` 2. In addition, the + tensor must have a `stride` of 1 for its last dimension. The strides of all + other dimensions must be even numbers. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> x=torch.randn(4, 2) + >>> x + tensor([[ 1.6116, -0.5772], + [-1.4606, -0.9120], + [ 0.0786, -1.7497], + [-0.6561, -1.6623]]) + >>> torch.view_as_complex(x) + tensor([(1.6116-0.5772j), (-1.4606-0.9120j), (0.0786-1.7497j), (-0.6561-1.6623j)]) + """ + ... +def view_as_complex_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.view_as_complex`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +def view_as_real(input: Tensor) -> Tensor: + r""" + view_as_real(input) -> Tensor + + Returns a view of :attr:`input` as a real tensor. For an input complex tensor of + :attr:`size` :math:`m1, m2, \dots, mi`, this function returns a new + real tensor of size :math:`m1, m2, \dots, mi, 2`, where the last dimension of size 2 + represents the real and imaginary components of complex numbers. + + .. 
warning:: + :func:`view_as_real` is only supported for tensors with ``complex dtypes``. + + Args: + input (Tensor): the input tensor. + + Example:: + + >>> x=torch.randn(4, dtype=torch.cfloat) + >>> x + tensor([(0.4737-0.3839j), (-0.2098-0.6699j), (0.3470-0.9451j), (-0.5174-1.3136j)]) + >>> torch.view_as_real(x) + tensor([[ 0.4737, -0.3839], + [-0.2098, -0.6699], + [ 0.3470, -0.9451], + [-0.5174, -1.3136]]) + """ + ... +def view_as_real_copy(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.view_as_real`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +@overload +def view_copy(input: Tensor, dtype: _dtype, *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.view`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +@overload +def view_copy(input: Tensor, size: Sequence[Union[_int, SymInt]], *, out: Optional[Tensor] = None) -> Tensor: + r""" + Performs the same operation as :func:`torch.view`, but all output tensors + are freshly created instead of aliasing the input. + """ + ... +@overload +def vsplit(input: Tensor, sections: _int) -> Tuple[Tensor, ...]: + r""" + vsplit(input, indices_or_sections) -> List of Tensors + + Splits :attr:`input`, a tensor with two or more dimensions, into multiple tensors + vertically according to :attr:`indices_or_sections`. Each split is a view of + :attr:`input`. + + This is equivalent to calling torch.tensor_split(input, indices_or_sections, dim=0) + (the split dimension is 0), except that if :attr:`indices_or_sections` is an integer + it must evenly divide the split dimension or a runtime error will be thrown. + + This function is based on NumPy's :func:`numpy.vsplit`. + + Args: + input (Tensor): tensor to split. + indices_or_sections (int or list or tuple of ints): See argument in :func:`torch.tensor_split`. + + Example:: + >>> t = torch.arange(16.0).reshape(4,4) + >>> t + tensor([[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.], + [ 8., 9., 10., 11.], + [12., 13., 14., 15.]]) + >>> torch.vsplit(t, 2) + (tensor([[0., 1., 2., 3.], + [4., 5., 6., 7.]]), + tensor([[ 8., 9., 10., 11.], + [12., 13., 14., 15.]])) + >>> torch.vsplit(t, [3, 6]) + (tensor([[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.], + [ 8., 9., 10., 11.]]), + tensor([[12., 13., 14., 15.]]), + tensor([], size=(0, 4))) + """ + ... +@overload +def vsplit(input: Tensor, indices: _size) -> Tuple[Tensor, ...]: + r""" + vsplit(input, indices_or_sections) -> List of Tensors + + Splits :attr:`input`, a tensor with two or more dimensions, into multiple tensors + vertically according to :attr:`indices_or_sections`. Each split is a view of + :attr:`input`. + + This is equivalent to calling torch.tensor_split(input, indices_or_sections, dim=0) + (the split dimension is 0), except that if :attr:`indices_or_sections` is an integer + it must evenly divide the split dimension or a runtime error will be thrown. + + This function is based on NumPy's :func:`numpy.vsplit`. + + Args: + input (Tensor): tensor to split. + indices_or_sections (int or list or tuple of ints): See argument in :func:`torch.tensor_split`. 
+ + Example:: + >>> t = torch.arange(16.0).reshape(4,4) + >>> t + tensor([[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.], + [ 8., 9., 10., 11.], + [12., 13., 14., 15.]]) + >>> torch.vsplit(t, 2) + (tensor([[0., 1., 2., 3.], + [4., 5., 6., 7.]]), + tensor([[ 8., 9., 10., 11.], + [12., 13., 14., 15.]])) + >>> torch.vsplit(t, [3, 6]) + (tensor([[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.], + [ 8., 9., 10., 11.]]), + tensor([[12., 13., 14., 15.]]), + tensor([], size=(0, 4))) + """ + ... +def vstack(tensors: Union[Tuple[Tensor, ...], List[Tensor]], *, out: Optional[Tensor] = None) -> Tensor: + r""" + vstack(tensors, *, out=None) -> Tensor + + Stack tensors in sequence vertically (row wise). + + This is equivalent to concatenation along the first axis after all 1-D tensors have been reshaped by :func:`torch.atleast_2d`. + + Args: + tensors (sequence of Tensors): sequence of tensors to concatenate + + Keyword args: + out (Tensor, optional): the output tensor. + + Example:: + + >>> a = torch.tensor([1, 2, 3]) + >>> b = torch.tensor([4, 5, 6]) + >>> torch.vstack((a,b)) + tensor([[1, 2, 3], + [4, 5, 6]]) + >>> a = torch.tensor([[1],[2],[3]]) + >>> b = torch.tensor([[4],[5],[6]]) + >>> torch.vstack((a,b)) + tensor([[1], + [2], + [3], + [4], + [5], + [6]]) + """ + ... +@overload +def where(condition: Tensor) -> Tuple[Tensor, ...]: + r""" + where(condition, input, other, *, out=None) -> Tensor + + Return a tensor of elements selected from either :attr:`input` or :attr:`other`, depending on :attr:`condition`. + + The operation is defined as: + + .. math:: + \text{out}_i = \begin{cases} + \text{input}_i & \text{if } \text{condition}_i \\ + \text{other}_i & \text{otherwise} \\ + \end{cases} + + .. note:: + The tensors :attr:`condition`, :attr:`input`, :attr:`other` must be :ref:`broadcastable `. + + Arguments: + condition (BoolTensor): When True (nonzero), yield input, otherwise yield other + input (Tensor or Scalar): value (if :attr:`input` is a scalar) or values selected at indices + where :attr:`condition` is ``True`` + other (Tensor or Scalar): value (if :attr:`other` is a scalar) or values selected at indices + where :attr:`condition` is ``False`` + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + Tensor: A tensor of shape equal to the broadcasted shape of :attr:`condition`, :attr:`input`, :attr:`other` + + Example:: + + >>> x = torch.randn(3, 2) + >>> y = torch.ones(3, 2) + >>> x + tensor([[-0.4620, 0.3139], + [ 0.3898, -0.7197], + [ 0.0478, -0.1657]]) + >>> torch.where(x > 0, 1.0, 0.0) + tensor([[0., 1.], + [1., 0.], + [1., 0.]]) + >>> torch.where(x > 0, x, y) + tensor([[ 1.0000, 0.3139], + [ 0.3898, 1.0000], + [ 0.0478, 1.0000]]) + >>> x = torch.randn(2, 2, dtype=torch.double) + >>> x + tensor([[ 1.0779, 0.0383], + [-0.8785, -1.1089]], dtype=torch.float64) + >>> torch.where(x > 0, x, 0.) + tensor([[1.0779, 0.0383], + [0.0000, 0.0000]], dtype=torch.float64) + + .. function:: where(condition) -> tuple of LongTensor + :noindex: + + ``torch.where(condition)`` is identical to + ``torch.nonzero(condition, as_tuple=True)``. + + .. note:: + See also :func:`torch.nonzero`. + """ + ... +@overload +def where(condition: Tensor, input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + where(condition, input, other, *, out=None) -> Tensor + + Return a tensor of elements selected from either :attr:`input` or :attr:`other`, depending on :attr:`condition`. + + The operation is defined as: + + .. 
math:: + \text{out}_i = \begin{cases} + \text{input}_i & \text{if } \text{condition}_i \\ + \text{other}_i & \text{otherwise} \\ + \end{cases} + + .. note:: + The tensors :attr:`condition`, :attr:`input`, :attr:`other` must be :ref:`broadcastable `. + + Arguments: + condition (BoolTensor): When True (nonzero), yield input, otherwise yield other + input (Tensor or Scalar): value (if :attr:`input` is a scalar) or values selected at indices + where :attr:`condition` is ``True`` + other (Tensor or Scalar): value (if :attr:`other` is a scalar) or values selected at indices + where :attr:`condition` is ``False`` + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + Tensor: A tensor of shape equal to the broadcasted shape of :attr:`condition`, :attr:`input`, :attr:`other` + + Example:: + + >>> x = torch.randn(3, 2) + >>> y = torch.ones(3, 2) + >>> x + tensor([[-0.4620, 0.3139], + [ 0.3898, -0.7197], + [ 0.0478, -0.1657]]) + >>> torch.where(x > 0, 1.0, 0.0) + tensor([[0., 1.], + [1., 0.], + [1., 0.]]) + >>> torch.where(x > 0, x, y) + tensor([[ 1.0000, 0.3139], + [ 0.3898, 1.0000], + [ 0.0478, 1.0000]]) + >>> x = torch.randn(2, 2, dtype=torch.double) + >>> x + tensor([[ 1.0779, 0.0383], + [-0.8785, -1.1089]], dtype=torch.float64) + >>> torch.where(x > 0, x, 0.) + tensor([[1.0779, 0.0383], + [0.0000, 0.0000]], dtype=torch.float64) + + .. function:: where(condition) -> tuple of LongTensor + :noindex: + + ``torch.where(condition)`` is identical to + ``torch.nonzero(condition, as_tuple=True)``. + + .. note:: + See also :func:`torch.nonzero`. + """ + ... +@overload +def where(condition: Tensor, self: Union[Number, _complex], other: Tensor) -> Tensor: + r""" + where(condition, input, other, *, out=None) -> Tensor + + Return a tensor of elements selected from either :attr:`input` or :attr:`other`, depending on :attr:`condition`. + + The operation is defined as: + + .. math:: + \text{out}_i = \begin{cases} + \text{input}_i & \text{if } \text{condition}_i \\ + \text{other}_i & \text{otherwise} \\ + \end{cases} + + .. note:: + The tensors :attr:`condition`, :attr:`input`, :attr:`other` must be :ref:`broadcastable `. + + Arguments: + condition (BoolTensor): When True (nonzero), yield input, otherwise yield other + input (Tensor or Scalar): value (if :attr:`input` is a scalar) or values selected at indices + where :attr:`condition` is ``True`` + other (Tensor or Scalar): value (if :attr:`other` is a scalar) or values selected at indices + where :attr:`condition` is ``False`` + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + Tensor: A tensor of shape equal to the broadcasted shape of :attr:`condition`, :attr:`input`, :attr:`other` + + Example:: + + >>> x = torch.randn(3, 2) + >>> y = torch.ones(3, 2) + >>> x + tensor([[-0.4620, 0.3139], + [ 0.3898, -0.7197], + [ 0.0478, -0.1657]]) + >>> torch.where(x > 0, 1.0, 0.0) + tensor([[0., 1.], + [1., 0.], + [1., 0.]]) + >>> torch.where(x > 0, x, y) + tensor([[ 1.0000, 0.3139], + [ 0.3898, 1.0000], + [ 0.0478, 1.0000]]) + >>> x = torch.randn(2, 2, dtype=torch.double) + >>> x + tensor([[ 1.0779, 0.0383], + [-0.8785, -1.1089]], dtype=torch.float64) + >>> torch.where(x > 0, x, 0.) + tensor([[1.0779, 0.0383], + [0.0000, 0.0000]], dtype=torch.float64) + + .. function:: where(condition) -> tuple of LongTensor + :noindex: + + ``torch.where(condition)`` is identical to + ``torch.nonzero(condition, as_tuple=True)``. + + .. note:: + See also :func:`torch.nonzero`. + """ + ... 
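+# A hedged, minimal illustration of the single-argument form described in the docstring
+# above (``torch.where(condition)`` behaves like ``torch.nonzero(condition, as_tuple=True)``);
+# the tensor values below are made up purely for illustration:
+#
+#     >>> x = torch.tensor([[1, 0], [0, 2]])
+#     >>> torch.where(x != 0)
+#     (tensor([0, 1]), tensor([0, 1]))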
+@overload +def where(condition: Tensor, input: Tensor, other: Union[Number, _complex]) -> Tensor: + r""" + where(condition, input, other, *, out=None) -> Tensor + + Return a tensor of elements selected from either :attr:`input` or :attr:`other`, depending on :attr:`condition`. + + The operation is defined as: + + .. math:: + \text{out}_i = \begin{cases} + \text{input}_i & \text{if } \text{condition}_i \\ + \text{other}_i & \text{otherwise} \\ + \end{cases} + + .. note:: + The tensors :attr:`condition`, :attr:`input`, :attr:`other` must be :ref:`broadcastable `. + + Arguments: + condition (BoolTensor): When True (nonzero), yield input, otherwise yield other + input (Tensor or Scalar): value (if :attr:`input` is a scalar) or values selected at indices + where :attr:`condition` is ``True`` + other (Tensor or Scalar): value (if :attr:`other` is a scalar) or values selected at indices + where :attr:`condition` is ``False`` + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + Tensor: A tensor of shape equal to the broadcasted shape of :attr:`condition`, :attr:`input`, :attr:`other` + + Example:: + + >>> x = torch.randn(3, 2) + >>> y = torch.ones(3, 2) + >>> x + tensor([[-0.4620, 0.3139], + [ 0.3898, -0.7197], + [ 0.0478, -0.1657]]) + >>> torch.where(x > 0, 1.0, 0.0) + tensor([[0., 1.], + [1., 0.], + [1., 0.]]) + >>> torch.where(x > 0, x, y) + tensor([[ 1.0000, 0.3139], + [ 0.3898, 1.0000], + [ 0.0478, 1.0000]]) + >>> x = torch.randn(2, 2, dtype=torch.double) + >>> x + tensor([[ 1.0779, 0.0383], + [-0.8785, -1.1089]], dtype=torch.float64) + >>> torch.where(x > 0, x, 0.) + tensor([[1.0779, 0.0383], + [0.0000, 0.0000]], dtype=torch.float64) + + .. function:: where(condition) -> tuple of LongTensor + :noindex: + + ``torch.where(condition)`` is identical to + ``torch.nonzero(condition, as_tuple=True)``. + + .. note:: + See also :func:`torch.nonzero`. + """ + ... +@overload +def where(condition: Tensor, self: Union[Number, _complex], other: Union[Number, _complex]) -> Tensor: + r""" + where(condition, input, other, *, out=None) -> Tensor + + Return a tensor of elements selected from either :attr:`input` or :attr:`other`, depending on :attr:`condition`. + + The operation is defined as: + + .. math:: + \text{out}_i = \begin{cases} + \text{input}_i & \text{if } \text{condition}_i \\ + \text{other}_i & \text{otherwise} \\ + \end{cases} + + .. note:: + The tensors :attr:`condition`, :attr:`input`, :attr:`other` must be :ref:`broadcastable `. + + Arguments: + condition (BoolTensor): When True (nonzero), yield input, otherwise yield other + input (Tensor or Scalar): value (if :attr:`input` is a scalar) or values selected at indices + where :attr:`condition` is ``True`` + other (Tensor or Scalar): value (if :attr:`other` is a scalar) or values selected at indices + where :attr:`condition` is ``False`` + + Keyword args: + out (Tensor, optional): the output tensor. + + Returns: + Tensor: A tensor of shape equal to the broadcasted shape of :attr:`condition`, :attr:`input`, :attr:`other` + + Example:: + + >>> x = torch.randn(3, 2) + >>> y = torch.ones(3, 2) + >>> x + tensor([[-0.4620, 0.3139], + [ 0.3898, -0.7197], + [ 0.0478, -0.1657]]) + >>> torch.where(x > 0, 1.0, 0.0) + tensor([[0., 1.], + [1., 0.], + [1., 0.]]) + >>> torch.where(x > 0, x, y) + tensor([[ 1.0000, 0.3139], + [ 0.3898, 1.0000], + [ 0.0478, 1.0000]]) + >>> x = torch.randn(2, 2, dtype=torch.double) + >>> x + tensor([[ 1.0779, 0.0383], + [-0.8785, -1.1089]], dtype=torch.float64) + >>> torch.where(x > 0, x, 0.) 
+ tensor([[1.0779, 0.0383], + [0.0000, 0.0000]], dtype=torch.float64) + + .. function:: where(condition) -> tuple of LongTensor + :noindex: + + ``torch.where(condition)`` is identical to + ``torch.nonzero(condition, as_tuple=True)``. + + .. note:: + See also :func:`torch.nonzero`. + """ + ... +@overload +def xlogy(input: Tensor, other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + xlogy(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.special.xlogy`. + """ + ... +@overload +def xlogy(self: Union[Number, _complex], other: Tensor, *, out: Optional[Tensor] = None) -> Tensor: + r""" + xlogy(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.special.xlogy`. + """ + ... +@overload +def xlogy(input: Tensor, other: Union[Number, _complex], *, out: Optional[Tensor] = None) -> Tensor: + r""" + xlogy(input, other, *, out=None) -> Tensor + + Alias for :func:`torch.special.xlogy`. + """ + ... +@overload +def xlogy_(input: Tensor, other: Tensor) -> Tensor: ... +@overload +def xlogy_(input: Tensor, other: Union[Number, _complex]) -> Tensor: ... +def zero_(input: Tensor) -> Tensor: ... +@overload +def zeros(size: Sequence[Union[_int, SymInt]], *, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + zeros(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with the scalar value `0`, with the shape defined + by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.zeros(2, 3) + tensor([[ 0., 0., 0.], + [ 0., 0., 0.]]) + + >>> torch.zeros(5) + tensor([ 0., 0., 0., 0., 0.]) + """ + ... +@overload +def zeros(*size: _int, out: Optional[Tensor] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + zeros(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with the scalar value `0`, with the shape defined + by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + out (Tensor, optional): the output tensor. 
+ dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.zeros(2, 3) + tensor([[ 0., 0., 0.], + [ 0., 0., 0.]]) + + >>> torch.zeros(5) + tensor([ 0., 0., 0., 0., 0.]) + """ + ... +@overload +def zeros(size: _size, *, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + zeros(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with the scalar value `0`, with the shape defined + by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.zeros(2, 3) + tensor([[ 0., 0., 0.], + [ 0., 0., 0.]]) + + >>> torch.zeros(5) + tensor([ 0., 0., 0., 0., 0.]) + """ + ... +@overload +def zeros(*size: _int, names: Optional[Sequence[Union[str, ellipsis, None]]], dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + zeros(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + + Returns a tensor filled with the scalar value `0`, with the shape defined + by the variable argument :attr:`size`. + + Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + + Keyword args: + out (Tensor, optional): the output tensor. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. 
+ device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + + Example:: + + >>> torch.zeros(2, 3) + tensor([[ 0., 0., 0.], + [ 0., 0., 0.]]) + + >>> torch.zeros(5) + tensor([ 0., 0., 0., 0., 0.]) + """ + ... +def zeros_like(input: Tensor, *, memory_format: Optional[memory_format] = None, dtype: Optional[_dtype] = None, layout: Optional[_layout] = None, device: Optional[Optional[DeviceLikeType]] = None, pin_memory: Optional[_bool] = False, requires_grad: Optional[_bool] = False) -> Tensor: + r""" + zeros_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor + + Returns a tensor filled with the scalar value `0`, with the same size as + :attr:`input`. ``torch.zeros_like(input)`` is equivalent to + ``torch.zeros(input.size(), dtype=input.dtype, layout=input.layout, device=input.device)``. + + .. warning:: + As of 0.4, this function does not support an :attr:`out` keyword. As an alternative, + the old ``torch.zeros_like(input, out=output)`` is equivalent to + ``torch.zeros(input.size(), out=output)``. + + Args: + input (Tensor): the size of :attr:`input` will determine size of the output tensor. + + Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor. + Default: if ``None``, defaults to the dtype of :attr:`input`. + layout (:class:`torch.layout`, optional): the desired layout of returned tensor. + Default: if ``None``, defaults to the layout of :attr:`input`. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, defaults to the device of :attr:`input`. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. + + Example:: + + >>> input = torch.empty(2, 3) + >>> torch.zeros_like(input) + tensor([[ 0., 0., 0.], + [ 0., 0., 0.]]) + """ + ... 
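+# A hedged, minimal sketch contrasting ``torch.zeros`` and ``torch.zeros_like`` as documented
+# above; the dtype keyword is the documented one and the shapes/values are illustrative only:
+#
+#     >>> base = torch.zeros(2, 2, dtype=torch.int64)
+#     >>> torch.zeros_like(base).dtype   # inherits dtype (and device, layout) from ``base``
+#     torch.int64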
+ +__all__ = ['__and__', '__lshift__', '__or__', '__rshift__', '__xor__', '_adaptive_avg_pool2d', + '_adaptive_avg_pool3d', '_add_batch_dim', '_add_relu', '_add_relu_', '_addmm_activation', + '_aminmax', '_amp_foreach_non_finite_check_and_unscale_', '_amp_update_scale_', '_assert_async', + '_assert_scalar', '_assert_tensor_metadata', '_batch_norm_impl_index', '_cast_Byte', '_cast_Char', + '_cast_Double', '_cast_Float', '_cast_Half', '_cast_Int', '_cast_Long', '_cast_Short', + '_choose_qparams_per_tensor', '_chunk_cat', '_coalesce', '_compute_linear_combination', '_conj', + '_conj_copy', '_conj_physical', '_convert_indices_from_coo_to_csr', + '_convert_indices_from_csr_to_coo', '_convert_weight_to_int4pack', '_convolution', + '_convolution_mode', '_copy_from', '_copy_from_and_resize', '_cslt_compress', '_cslt_sparse_mm', + '_cslt_sparse_mm_search', '_ctc_loss', '_cudnn_ctc_loss', '_cudnn_init_dropout_state', + '_cudnn_rnn', '_cudnn_rnn_flatten_weight', '_cufft_clear_plan_cache', + '_cufft_get_plan_cache_max_size', '_cufft_get_plan_cache_size', '_cufft_set_plan_cache_max_size', + '_cummax_helper', '_cummin_helper', '_debug_has_internal_overlap', '_dim_arange', + '_dirichlet_grad', '_disable_functionalization', '_efficientzerotensor', '_embedding_bag', + '_embedding_bag_forward_only', '_empty_affine_quantized', '_empty_per_channel_affine_quantized', + '_enable_functionalization', '_euclidean_dist', '_fake_quantize_learnable_per_channel_affine', + '_fake_quantize_learnable_per_tensor_affine', + '_fake_quantize_per_tensor_affine_cachemask_tensor_qparams', + '_fake_quantize_per_tensor_affine_cachemask_tensor_qparams', '_fft_c2c', '_fft_c2r', '_fft_r2c', + '_fill_mem_eff_dropout_mask_', '_foobar', '_foreach_abs', '_foreach_abs_', '_foreach_acos', + '_foreach_acos_', '_foreach_add', '_foreach_add_', '_foreach_addcdiv', '_foreach_addcdiv_', + '_foreach_addcmul', '_foreach_addcmul_', '_foreach_asin', '_foreach_asin_', '_foreach_atan', + '_foreach_atan_', '_foreach_ceil', '_foreach_ceil_', '_foreach_clamp_max', '_foreach_clamp_max_', + '_foreach_clamp_min', '_foreach_clamp_min_', '_foreach_copy_', '_foreach_cos', '_foreach_cos_', + '_foreach_cosh', '_foreach_cosh_', '_foreach_div', '_foreach_div_', '_foreach_erf', + '_foreach_erf_', '_foreach_erfc', '_foreach_erfc_', '_foreach_exp', '_foreach_exp_', + '_foreach_expm1', '_foreach_expm1_', '_foreach_floor', '_foreach_floor_', '_foreach_frac', + '_foreach_frac_', '_foreach_lerp', '_foreach_lerp_', '_foreach_lgamma', '_foreach_lgamma_', + '_foreach_log', '_foreach_log10', '_foreach_log10_', '_foreach_log1p', '_foreach_log1p_', + '_foreach_log2', '_foreach_log2_', '_foreach_log_', '_foreach_maximum', '_foreach_maximum_', + '_foreach_minimum', '_foreach_minimum_', '_foreach_mul', '_foreach_mul_', '_foreach_neg', + '_foreach_neg_', '_foreach_norm', '_foreach_pow', '_foreach_pow_', '_foreach_reciprocal', + '_foreach_reciprocal_', '_foreach_round', '_foreach_round_', '_foreach_sigmoid', + '_foreach_sigmoid_', '_foreach_sign', '_foreach_sign_', '_foreach_sin', '_foreach_sin_', + '_foreach_sinh', '_foreach_sinh_', '_foreach_sqrt', '_foreach_sqrt_', '_foreach_sub', + '_foreach_sub_', '_foreach_tan', '_foreach_tan_', '_foreach_tanh', '_foreach_tanh_', + '_foreach_trunc', '_foreach_trunc_', '_foreach_zero_', '_from_functional_tensor', + '_functional_assert_async', '_functional_assert_scalar', '_functional_sym_constrain_range', + '_functional_sym_constrain_range_for_size', + '_functionalize_are_all_mutations_hidden_from_autograd', + 
'_functionalize_are_all_mutations_under_no_grad_or_inference_mode', '_functionalize_commit_update', + '_functionalize_mark_mutation_hidden_from_autograd', '_functionalize_replace', + '_functionalize_sync', '_fused_adam_', '_fused_adamw_', '_fused_dropout', + '_fused_moving_avg_obs_fq_helper', '_fused_moving_avg_obs_fq_helper', '_fused_sdp_choice', + '_fused_sgd_', '_fw_primal_copy', '_grid_sampler_2d_cpu_fallback', + '_has_compatible_shallow_copy_type', '_histogramdd_bin_edges', '_histogramdd_from_bin_cts', + '_histogramdd_from_bin_tensors', '_index_put_impl_', '_indices_copy', '_int_mm', '_is_all_true', + '_is_any_true', '_is_functional_tensor', '_is_zerotensor', '_lazy_clone', '_linalg_check_errors', + '_linalg_det', '_linalg_det', '_linalg_eigh', '_linalg_eigh', '_linalg_slogdet', '_linalg_slogdet', + '_linalg_solve_ex', '_linalg_solve_ex', '_linalg_svd', '_linalg_svd', '_log_softmax', + '_log_softmax_backward_data', '_logcumsumexp', '_lstm_mps', '_lu_with_info', '_lu_with_info', + '_make_dep_token', '_make_dual', '_make_dual_copy', '_make_per_channel_quantized_tensor', + '_make_per_tensor_quantized_tensor', '_masked_scale', '_masked_softmax', '_mixed_dtypes_linear', + '_mkldnn_reshape', '_mkldnn_transpose', '_mkldnn_transpose_', '_mps_convolution', + '_mps_convolution_transpose', '_native_batch_norm_legit', '_native_batch_norm_legit_no_training', + '_native_multi_head_attention', '_neg_view', '_neg_view_copy', '_nested_from_padded', + '_nested_from_padded_and_nested_example', '_nested_get_jagged_dummy', '_nested_get_lengths', + '_nested_get_offsets', '_nested_get_ragged_idx', '_nested_get_values', '_nested_get_values_copy', + '_nested_tensor_from_mask', '_nested_tensor_from_mask_left_aligned', + '_nested_tensor_from_tensor_list', '_nested_tensor_softmax_with_shape', '_nested_view_from_buffer', + '_nested_view_from_buffer_copy', '_nested_view_from_jagged', '_nested_view_from_jagged_copy', + '_nnpack_available', '_nnpack_spatial_convolution', '_pack_padded_sequence', + '_pad_packed_sequence', '_pin_memory', '_prelu_kernel', '_print', '_propagate_xla_data', + '_remove_batch_dim', '_reshape_alias_copy', '_reshape_from_tensor', '_resize_output_', + '_rowwise_prune', '_sample_dirichlet', '_saturate_weight_to_fp16', + '_scaled_dot_product_attention_math', '_scaled_dot_product_cudnn_attention', + '_scaled_dot_product_cudnn_attention', '_scaled_dot_product_efficient_attention', + '_scaled_dot_product_efficient_attention', '_scaled_dot_product_flash_attention', + '_scaled_dot_product_flash_attention', '_scaled_dot_product_flash_attention_for_cpu', + '_scaled_dot_product_flash_attention_for_cpu', '_scaled_mm', '_shape_as_tensor', + '_sobol_engine_draw', '_sobol_engine_ff_', '_sobol_engine_initialize_state_', + '_sobol_engine_scramble_', '_softmax', '_softmax_backward_data', '_sparse_broadcast_to', + '_sparse_broadcast_to_copy', '_sparse_csr_prod', '_sparse_csr_sum', + '_sparse_log_softmax_backward_data', '_sparse_semi_structured_linear', + '_sparse_softmax_backward_data', '_sparse_sparse_matmul', '_sparse_sum', '_stack', + '_standard_gamma', '_standard_gamma_grad', '_sync', '_test_autograd_multiple_dispatch', + '_test_autograd_multiple_dispatch_view', '_test_autograd_multiple_dispatch_view_copy', + '_test_check_tensor', '_test_functorch_fallback', '_test_parallel_materialize', + '_test_serialization_subcmul', '_to_cpu', '_to_functional_tensor', '_to_sparse_semi_structured', + '_transform_bias_rescale_qkv', '_transformer_encoder_layer_fwd', '_trilinear', + '_triton_multi_head_attention', 
'_triton_scaled_dot_attention', '_unique', '_unique2', + '_unpack_dual', '_unpack_dual', '_unsafe_index', '_unsafe_index_put', '_use_cudnn_ctc_loss', + '_use_cudnn_rnn_flatten_weight', '_validate_compressed_sparse_indices', + '_validate_sparse_bsc_tensor_args', '_validate_sparse_bsr_tensor_args', + '_validate_sparse_compressed_tensor_args', '_validate_sparse_coo_tensor_args', + '_validate_sparse_csc_tensor_args', '_validate_sparse_csr_tensor_args', '_values_copy', + '_weight_int4pack_mm', '_weight_int8pack_mm', '_weight_norm', '_weight_norm_interface', 'abs', + 'abs_', 'absolute', 'acos', 'acos_', 'acosh', 'acosh_', 'adaptive_avg_pool1d', + 'adaptive_max_pool1d', 'add', 'addbmm', 'addcdiv', 'addcmul', 'addmm', 'addmv', 'addmv_', 'addr', + 'adjoint', 'affine_grid_generator', 'alias_copy', 'all', 'allclose', 'alpha_dropout', + 'alpha_dropout_', 'amax', 'amin', 'aminmax', 'aminmax', 'angle', 'any', 'arange', 'arccos', + 'arccos_', 'arccosh', 'arccosh_', 'arcsin', 'arcsin_', 'arcsinh', 'arcsinh_', 'arctan', 'arctan2', + 'arctan_', 'arctanh', 'arctanh_', 'argmax', 'argmin', 'argsort', 'argwhere', 'as_strided', + 'as_strided_', 'as_strided_copy', 'as_strided_scatter', 'as_tensor', 'asarray', 'asin', 'asin_', + 'asinh', 'asinh_', 'atan', 'atan2', 'atan_', 'atanh', 'atanh_', 'avg_pool1d', 'baddbmm', + 'bartlett_window', 'batch_norm', 'batch_norm_backward_elemt', 'batch_norm_backward_reduce', + 'batch_norm_elemt', 'batch_norm_gather_stats', 'batch_norm_gather_stats_with_counts', + 'batch_norm_stats', 'batch_norm_update_stats', 'bernoulli', 'bilinear', + 'binary_cross_entropy_with_logits', 'bincount', 'binomial', 'bitwise_and', 'bitwise_left_shift', + 'bitwise_not', 'bitwise_or', 'bitwise_right_shift', 'bitwise_xor', 'blackman_window', 'bmm', + 'broadcast_to', 'bucketize', 'can_cast', 'cat', 'ccol_indices_copy', 'ceil', 'ceil_', 'celu', + 'celu_', 'channel_shuffle', 'cholesky', 'cholesky_inverse', 'cholesky_solve', + 'choose_qparams_optimized', 'chunk', 'clamp', 'clamp_', 'clamp_max', 'clamp_max_', 'clamp_min', + 'clamp_min_', 'clip', 'clip_', 'clone', 'col_indices_copy', 'column_stack', 'combinations', + 'complex', 'concat', 'concatenate', 'conj', 'conj_physical', 'conj_physical_', 'constant_pad_nd', + 'conv1d', 'conv2d', 'conv3d', 'conv_tbc', 'conv_transpose1d', 'conv_transpose2d', + 'conv_transpose3d', 'convolution', 'copysign', 'corrcoef', 'cos', 'cos_', 'cosh', 'cosh_', + 'cosine_embedding_loss', 'cosine_similarity', 'count_nonzero', 'cov', 'cross', 'crow_indices_copy', + 'ctc_loss', 'cudnn_affine_grid_generator', 'cudnn_batch_norm', 'cudnn_convolution', + 'cudnn_convolution_add_relu', 'cudnn_convolution_relu', 'cudnn_convolution_transpose', + 'cudnn_grid_sampler', 'cudnn_is_acceptable', 'cummax', 'cummax', 'cummin', 'cummin', 'cumprod', + 'cumsum', 'cumulative_trapezoid', 'deg2rad', 'deg2rad_', 'dequantize', 'det', 'detach', 'detach_', + 'detach_copy', 'diag', 'diag_embed', 'diagflat', 'diagonal', 'diagonal_copy', 'diagonal_scatter', + 'diff', 'digamma', 'dist', 'div', 'divide', 'dot', 'dropout', 'dropout_', 'dsmm', 'dsplit', + 'dstack', 'embedding', 'embedding_bag', 'embedding_renorm_', 'empty', 'empty_like', + 'empty_permuted', 'empty_quantized', 'empty_strided', 'eq', 'equal', 'erf', 'erf_', 'erfc', + 'erfc_', 'erfinv', 'exp', 'exp2', 'exp2_', 'exp_', 'expand_copy', 'expm1', 'expm1_', 'eye', + 'fake_quantize_per_channel_affine', 'fake_quantize_per_tensor_affine', 'fbgemm_linear_fp16_weight', + 'fbgemm_linear_fp16_weight_fp32_activation', 'fbgemm_linear_int8_weight', + 
'fbgemm_linear_int8_weight_fp32_activation', 'fbgemm_linear_quantize_weight', + 'fbgemm_pack_gemm_matrix_fp16', 'fbgemm_pack_quantized_matrix', 'feature_alpha_dropout', + 'feature_alpha_dropout_', 'feature_dropout', 'feature_dropout_', 'fill', 'fill_', 'fix', 'fix_', + 'flatten', 'flip', 'fliplr', 'flipud', 'float_power', 'floor', 'floor_', 'floor_divide', 'fmax', + 'fmin', 'fmod', 'frac', 'frac_', 'frexp', 'frexp', 'frobenius_norm', 'from_file', 'from_numpy', + 'frombuffer', 'full', 'full_like', 'fused_moving_avg_obs_fake_quant', 'gather', 'gcd', 'gcd_', + 'ge', 'geqrf', 'geqrf', 'ger', 'get_default_dtype', 'get_num_interop_threads', 'get_num_threads', + 'gradient', 'greater', 'greater_equal', 'grid_sampler', 'grid_sampler_2d', 'grid_sampler_3d', + 'group_norm', 'gru', 'gru_cell', 'gt', 'hamming_window', 'hann_window', 'hardshrink', 'heaviside', + 'hinge_embedding_loss', 'histc', 'histogram', 'histogram', 'histogramdd', 'histogramdd', 'hsmm', + 'hsplit', 'hspmm', 'hstack', 'hypot', 'i0', 'i0_', 'igamma', 'igammac', 'imag', 'index_add', + 'index_copy', 'index_fill', 'index_put', 'index_put_', 'index_reduce', 'index_select', + 'indices_copy', 'init_num_threads', 'inner', 'instance_norm', 'int_repr', 'inverse', 'is_complex', + 'is_conj', 'is_distributed', 'is_floating_point', 'is_grad_enabled', 'is_inference', + 'is_inference_mode_enabled', 'is_neg', 'is_nonzero', 'is_same_size', 'is_signed', + 'is_vulkan_available', 'isclose', 'isfinite', 'isin', 'isinf', 'isnan', 'isneginf', 'isposinf', + 'isreal', 'istft', 'kaiser_window', 'kl_div', 'kron', 'kthvalue', 'kthvalue', 'layer_norm', 'lcm', + 'lcm_', 'ldexp', 'ldexp_', 'le', 'lerp', 'less', 'less_equal', 'lgamma', 'linspace', 'log', + 'log10', 'log10_', 'log1p', 'log1p_', 'log2', 'log2_', 'log_', 'log_softmax', 'logaddexp', + 'logaddexp2', 'logcumsumexp', 'logdet', 'logical_and', 'logical_not', 'logical_or', 'logical_xor', + 'logit', 'logit_', 'logspace', 'logsumexp', 'lstm', 'lstm_cell', 'lt', 'lu_solve', 'lu_unpack', + 'lu_unpack', 'margin_ranking_loss', 'masked_fill', 'masked_scatter', 'masked_select', 'matmul', + 'matrix_exp', 'matrix_power', 'max', 'max', 'max_pool1d', 'max_pool1d_with_indices', 'max_pool2d', + 'max_pool3d', 'maximum', 'mean', 'median', 'median', 'min', 'min', 'minimum', 'miopen_batch_norm', + 'miopen_convolution', 'miopen_convolution_add_relu', 'miopen_convolution_relu', + 'miopen_convolution_transpose', 'miopen_depthwise_convolution', 'miopen_rnn', + 'mkldnn_adaptive_avg_pool2d', 'mkldnn_convolution', 'mkldnn_linear_backward_weights', + 'mkldnn_max_pool2d', 'mkldnn_max_pool3d', 'mkldnn_rnn_layer', 'mm', 'mode', 'mode', 'moveaxis', + 'movedim', 'msort', 'mul', 'multinomial', 'multiply', 'mv', 'mvlgamma', 'nan_to_num', + 'nan_to_num_', 'nanmean', 'nanmedian', 'nanmedian', 'nanquantile', 'nansum', 'narrow', + 'narrow_copy', 'native_batch_norm', 'native_channel_shuffle', 'native_dropout', + 'native_group_norm', 'native_layer_norm', 'native_norm', 'ne', 'neg', 'neg_', 'negative', + 'negative_', 'nextafter', 'nonzero', 'nonzero_static', 'norm_except_dim', 'normal', 'not_equal', + 'nuclear_norm', 'numel', 'ones', 'ones_like', 'orgqr', 'ormqr', 'outer', 'pairwise_distance', + 'pdist', 'permute', 'permute_copy', 'pinverse', 'pixel_shuffle', 'pixel_unshuffle', 'poisson', + 'poisson_nll_loss', 'polar', 'polygamma', 'positive', 'pow', 'prelu', 'prod', 'promote_types', + 'put', 'q_per_channel_axis', 'q_per_channel_scales', 'q_per_channel_zero_points', 'q_scale', + 'q_zero_point', 'qr', 'qr', 'quantile', 'quantize_per_channel', 
'quantize_per_tensor', + 'quantize_per_tensor_dynamic', 'quantized_batch_norm', 'quantized_gru_cell', 'quantized_lstm_cell', + 'quantized_max_pool1d', 'quantized_max_pool2d', 'quantized_max_pool3d', 'quantized_rnn_relu_cell', + 'quantized_rnn_tanh_cell', 'rad2deg', 'rad2deg_', 'rand', 'rand_like', 'randint', 'randint_like', + 'randn', 'randn_like', 'randperm', 'range', 'ravel', 'real', 'reciprocal', 'reciprocal_', 'relu', + 'relu_', 'remainder', 'renorm', 'repeat_interleave', 'reshape', 'resize_as_', 'resize_as_sparse_', + 'resolve_conj', 'resolve_neg', 'result_type', 'rnn_relu', 'rnn_relu_cell', 'rnn_tanh', + 'rnn_tanh_cell', 'roll', 'rot90', 'round', 'round_', 'row_indices_copy', 'row_stack', 'rrelu', + 'rrelu_', 'rsqrt', 'rsqrt_', 'rsub', 'saddmm', 'scalar_tensor', 'scatter', 'scatter_add', + 'scatter_reduce', 'searchsorted', 'segment_reduce', 'select', 'select_copy', 'select_scatter', + 'selu', 'selu_', 'set_flush_denormal', 'set_num_interop_threads', 'set_num_threads', 'sgn', + 'sigmoid', 'sigmoid_', 'sign', 'signbit', 'sin', 'sin_', 'sinc', 'sinc_', 'sinh', 'sinh_', + 'slice_copy', 'slice_inverse', 'slice_scatter', 'slogdet', 'slogdet', 'smm', 'softmax', 'sort', + 'sort', 'sparse_bsc_tensor', 'sparse_bsr_tensor', 'sparse_compressed_tensor', 'sparse_coo_tensor', + 'sparse_csc_tensor', 'sparse_csr_tensor', 'split_copy', 'split_with_sizes', + 'split_with_sizes_copy', 'spmm', 'sqrt', 'sqrt_', 'square', 'square_', 'squeeze', 'squeeze_copy', + 'sspaddmm', 'stack', 'std', 'std_mean', 'sub', 'subtract', 'sum', 'svd', 'svd', 'swapaxes', + 'swapdims', 'sym_constrain_range', 'sym_constrain_range_for_size', 't', 't_copy', 'take', + 'take_along_dim', 'tan', 'tan_', 'tanh', 'tanh_', 'tensor', 'tensor_split', 'threshold', + 'threshold_', 'tile', 'topk', 'topk', 'trace', 'transpose', 'transpose_copy', 'trapezoid', 'trapz', + 'triangular_solve', 'triangular_solve', 'tril', 'tril_indices', 'triplet_margin_loss', 'triu', + 'triu_indices', 'true_divide', 'trunc', 'trunc_', 'unbind', 'unbind_copy', 'unflatten', + 'unfold_copy', 'unique_dim', 'unsafe_chunk', 'unsafe_split', 'unsafe_split_with_sizes', + 'unsqueeze', 'unsqueeze_copy', 'values_copy', 'vander', 'var', 'var_mean', 'vdot', + 'view_as_complex', 'view_as_complex_copy', 'view_as_real', 'view_as_real_copy', 'view_copy', + 'vsplit', 'vstack', 'where', 'xlogy', 'xlogy_', 'zero_', 'zeros', 'zeros_like'] diff --git a/MLPY/Lib/site-packages/torch/__config__.py b/MLPY/Lib/site-packages/torch/__config__.py new file mode 100644 index 0000000000000000000000000000000000000000..8f8cf5710d77a2c2a6e871006b7803f68c85aa7d --- /dev/null +++ b/MLPY/Lib/site-packages/torch/__config__.py @@ -0,0 +1,22 @@ +import torch + + +def show(): + """ + Return a human-readable string with descriptions of the + configuration of PyTorch. + """ + return torch._C._show_config() + + +# TODO: In principle, we could provide more structured version/config +# information here. For now only CXX_FLAGS is exposed, as Timer +# uses them. 
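+# A hedged usage sketch for the helpers in this module (``show`` above, ``parallel_info`` below);
+# the printed output is build-dependent, so no exact text is assumed:
+#
+#     >>> import torch
+#     >>> print(torch.__config__.show())           # build flags, BLAS/LAPACK backends, etc.
+#     >>> print(torch.__config__.parallel_info())  # OpenMP / thread-pool settings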
+def _cxx_flags(): + """Returns the CXX_FLAGS used when building PyTorch.""" + return torch._C._cxx_flags() + + +def parallel_info(): + r"""Returns detailed string with parallelization settings""" + return torch._C._parallel_info() diff --git a/MLPY/Lib/site-packages/torch/_appdirs.py b/MLPY/Lib/site-packages/torch/_appdirs.py new file mode 100644 index 0000000000000000000000000000000000000000..13db32eea62e50c360c651fbbfd9dfff0124cdc8 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_appdirs.py @@ -0,0 +1,666 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright (c) 2005-2010 ActiveState Software Inc. +# Copyright (c) 2013 Eddy Petrișor + +# flake8: noqa + +""" +This file is directly from +https://github.com/ActiveState/appdirs/blob/3fe6a83776843a46f20c2e5587afcffe05e03b39/appdirs.py + +The license of https://github.com/ActiveState/appdirs copied below: + + +# This is the MIT license + +Copyright (c) 2010 ActiveState Software Inc. + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +""" + +"""Utilities for determining application-specific dirs. + +See for details and usage. +""" +# Dev Notes: +# - MSDN on where to store app data files: +# http://support.microsoft.com/default.aspx?scid=kb;en-us;310294#XSLTH3194121123120121120120 +# - Mac OS X: http://developer.apple.com/documentation/MacOSX/Conceptual/BPFileSystem/index.html +# - XDG spec for Un*x: https://standards.freedesktop.org/basedir-spec/basedir-spec-latest.html + +__version__ = "1.4.4" +__version_info__ = tuple(int(segment) for segment in __version__.split(".")) + + +import os +import sys + +unicode = str + +if sys.platform.startswith("java"): + import platform + + os_name = platform.java_ver()[3][0] + if os_name.startswith("Windows"): # "Windows XP", "Windows 7", etc. + system = "win32" + elif os_name.startswith("Mac"): # "Mac OS X", etc. + system = "darwin" + else: # "Linux", "SunOS", "FreeBSD", etc. + # Setting this to "linux2" is not ideal, but only Windows or Mac + # are actually checked for and the rest of the module expects + # *sys.platform* style strings. + system = "linux2" +else: + system = sys.platform + + +def user_data_dir(appname=None, appauthor=None, version=None, roaming=False): + r"""Return full path to the user-specific data dir for this application. + + "appname" is the name of application. + If None, just the system directory is returned. + "appauthor" (only used on Windows) is the name of the + appauthor or distributing body for this application. Typically + it is the owning company name. This falls back to appname. 
You may + pass False to disable it. + "version" is an optional version path element to append to the + path. You might want to use this if you want multiple versions + of your app to be able to run independently. If used, this + would typically be ".". + Only applied when appname is present. + "roaming" (boolean, default False) can be set True to use the Windows + roaming appdata directory. That means that for users on a Windows + network setup for roaming profiles, this user data will be + sync'd on login. See + + for a discussion of issues. + + Typical user data directories are: + Mac OS X: ~/Library/Application Support/ + Unix: ~/.local/share/ # or in $XDG_DATA_HOME, if defined + Win XP (not roaming): C:\Documents and Settings\\Application Data\\ + Win XP (roaming): C:\Documents and Settings\\Local Settings\Application Data\\ + Win 7 (not roaming): C:\Users\\AppData\Local\\ + Win 7 (roaming): C:\Users\\AppData\Roaming\\ + + For Unix, we follow the XDG spec and support $XDG_DATA_HOME. + That means, by default "~/.local/share/". + """ + if system == "win32": + if appauthor is None: + appauthor = appname + const = roaming and "CSIDL_APPDATA" or "CSIDL_LOCAL_APPDATA" + path = os.path.normpath(_get_win_folder(const)) + if appname: + if appauthor is not False: + path = os.path.join(path, appauthor, appname) + else: + path = os.path.join(path, appname) + elif system == "darwin": + path = os.path.expanduser("~/Library/Application Support/") + if appname: + path = os.path.join(path, appname) + else: + path = os.getenv("XDG_DATA_HOME", os.path.expanduser("~/.local/share")) + if appname: + path = os.path.join(path, appname) + if appname and version: + path = os.path.join(path, version) + return path + + +def site_data_dir(appname=None, appauthor=None, version=None, multipath=False): + r"""Return full path to the user-shared data dir for this application. + + "appname" is the name of application. + If None, just the system directory is returned. + "appauthor" (only used on Windows) is the name of the + appauthor or distributing body for this application. Typically + it is the owning company name. This falls back to appname. You may + pass False to disable it. + "version" is an optional version path element to append to the + path. You might want to use this if you want multiple versions + of your app to be able to run independently. If used, this + would typically be ".". + Only applied when appname is present. + "multipath" is an optional parameter only applicable to *nix + which indicates that the entire list of data dirs should be + returned. By default, the first item from XDG_DATA_DIRS is + returned, or '/usr/local/share/', + if XDG_DATA_DIRS is not set + + Typical site data directories are: + Mac OS X: /Library/Application Support/ + Unix: /usr/local/share/ or /usr/share/ + Win XP: C:\Documents and Settings\All Users\Application Data\\ + Vista: (Fail! "C:\ProgramData" is a hidden *system* directory on Vista.) + Win 7: C:\ProgramData\\ # Hidden, but writeable on Win 7. + + For Unix, this is using the $XDG_DATA_DIRS[0] default. + + WARNING: Do not use this on Windows. See the Vista-Fail note above for why. 
+ """ + if system == "win32": + if appauthor is None: + appauthor = appname + path = os.path.normpath(_get_win_folder("CSIDL_COMMON_APPDATA")) + if appname: + if appauthor is not False: + path = os.path.join(path, appauthor, appname) + else: + path = os.path.join(path, appname) + elif system == "darwin": + path = os.path.expanduser("/Library/Application Support") + if appname: + path = os.path.join(path, appname) + else: + # XDG default for $XDG_DATA_DIRS + # only first, if multipath is False + path = os.getenv( + "XDG_DATA_DIRS", os.pathsep.join(["/usr/local/share", "/usr/share"]) + ) + pathlist = [ + os.path.expanduser(x.rstrip(os.sep)) for x in path.split(os.pathsep) + ] + if appname: + if version: + appname = os.path.join(appname, version) + pathlist = [os.sep.join([x, appname]) for x in pathlist] + + if multipath: + path = os.pathsep.join(pathlist) + else: + path = pathlist[0] + return path + + if appname and version: + path = os.path.join(path, version) + return path + + +def user_config_dir(appname=None, appauthor=None, version=None, roaming=False): + r"""Return full path to the user-specific config dir for this application. + + "appname" is the name of application. + If None, just the system directory is returned. + "appauthor" (only used on Windows) is the name of the + appauthor or distributing body for this application. Typically + it is the owning company name. This falls back to appname. You may + pass False to disable it. + "version" is an optional version path element to append to the + path. You might want to use this if you want multiple versions + of your app to be able to run independently. If used, this + would typically be ".". + Only applied when appname is present. + "roaming" (boolean, default False) can be set True to use the Windows + roaming appdata directory. That means that for users on a Windows + network setup for roaming profiles, this user data will be + sync'd on login. See + + for a discussion of issues. + + Typical user config directories are: + Mac OS X: ~/Library/Preferences/ + Unix: ~/.config/ # or in $XDG_CONFIG_HOME, if defined + Win *: same as user_data_dir + + For Unix, we follow the XDG spec and support $XDG_CONFIG_HOME. + That means, by default "~/.config/". + """ + if system == "win32": + path = user_data_dir(appname, appauthor, None, roaming) + elif system == "darwin": + path = os.path.expanduser("~/Library/Preferences/") + if appname: + path = os.path.join(path, appname) + else: + path = os.getenv("XDG_CONFIG_HOME", os.path.expanduser("~/.config")) + if appname: + path = os.path.join(path, appname) + if appname and version: + path = os.path.join(path, version) + return path + + +def site_config_dir(appname=None, appauthor=None, version=None, multipath=False): + r"""Return full path to the user-shared data dir for this application. + + "appname" is the name of application. + If None, just the system directory is returned. + "appauthor" (only used on Windows) is the name of the + appauthor or distributing body for this application. Typically + it is the owning company name. This falls back to appname. You may + pass False to disable it. + "version" is an optional version path element to append to the + path. You might want to use this if you want multiple versions + of your app to be able to run independently. If used, this + would typically be ".". + Only applied when appname is present. + "multipath" is an optional parameter only applicable to *nix + which indicates that the entire list of config dirs should be + returned. 
By default, the first item from XDG_CONFIG_DIRS is + returned, or '/etc/xdg/', if XDG_CONFIG_DIRS is not set + + Typical site config directories are: + Mac OS X: same as site_data_dir + Unix: /etc/xdg/ or $XDG_CONFIG_DIRS[i]/ for each value in + $XDG_CONFIG_DIRS + Win *: same as site_data_dir + Vista: (Fail! "C:\ProgramData" is a hidden *system* directory on Vista.) + + For Unix, this is using the $XDG_CONFIG_DIRS[0] default, if multipath=False + + WARNING: Do not use this on Windows. See the Vista-Fail note above for why. + """ + if system == "win32": + path = site_data_dir(appname, appauthor) + if appname and version: + path = os.path.join(path, version) + elif system == "darwin": + path = os.path.expanduser("/Library/Preferences") + if appname: + path = os.path.join(path, appname) + else: + # XDG default for $XDG_CONFIG_DIRS + # only first, if multipath is False + path = os.getenv("XDG_CONFIG_DIRS", "/etc/xdg") + pathlist = [ + os.path.expanduser(x.rstrip(os.sep)) for x in path.split(os.pathsep) + ] + if appname: + if version: + appname = os.path.join(appname, version) + pathlist = [os.sep.join([x, appname]) for x in pathlist] + + if multipath: + path = os.pathsep.join(pathlist) + else: + path = pathlist[0] + return path + + +def user_cache_dir(appname=None, appauthor=None, version=None, opinion=True): + r"""Return full path to the user-specific cache dir for this application. + + "appname" is the name of application. + If None, just the system directory is returned. + "appauthor" (only used on Windows) is the name of the + appauthor or distributing body for this application. Typically + it is the owning company name. This falls back to appname. You may + pass False to disable it. + "version" is an optional version path element to append to the + path. You might want to use this if you want multiple versions + of your app to be able to run independently. If used, this + would typically be ".". + Only applied when appname is present. + "opinion" (boolean) can be False to disable the appending of + "Cache" to the base app data dir for Windows. See + discussion below. + + Typical user cache directories are: + Mac OS X: ~/Library/Caches/ + Unix: ~/.cache/ (XDG default) + Win XP: C:\Documents and Settings\\Local Settings\Application Data\\\Cache + Vista: C:\Users\\AppData\Local\\\Cache + + On Windows the only suggestion in the MSDN docs is that local settings go in + the `CSIDL_LOCAL_APPDATA` directory. This is identical to the non-roaming + app data dir (the default returned by `user_data_dir` above). Apps typically + put cache data somewhere *under* the given dir here. Some examples: + ...\Mozilla\Firefox\Profiles\\Cache + ...\Acme\SuperApp\Cache\1.0 + OPINION: This function appends "Cache" to the `CSIDL_LOCAL_APPDATA` value. + This can be disabled with the `opinion=False` option. 
+ """ + if system == "win32": + if appauthor is None: + appauthor = appname + path = os.path.normpath(_get_win_folder("CSIDL_LOCAL_APPDATA")) + if appname: + if appauthor is not False: + path = os.path.join(path, appauthor, appname) + else: + path = os.path.join(path, appname) + if opinion: + path = os.path.join(path, "Cache") + elif system == "darwin": + path = os.path.expanduser("~/Library/Caches") + if appname: + path = os.path.join(path, appname) + else: + path = os.getenv("XDG_CACHE_HOME", os.path.expanduser("~/.cache")) + if appname: + path = os.path.join(path, appname) + if appname and version: + path = os.path.join(path, version) + return path + + +def user_state_dir(appname=None, appauthor=None, version=None, roaming=False): + r"""Return full path to the user-specific state dir for this application. + + "appname" is the name of application. + If None, just the system directory is returned. + "appauthor" (only used on Windows) is the name of the + appauthor or distributing body for this application. Typically + it is the owning company name. This falls back to appname. You may + pass False to disable it. + "version" is an optional version path element to append to the + path. You might want to use this if you want multiple versions + of your app to be able to run independently. If used, this + would typically be ".". + Only applied when appname is present. + "roaming" (boolean, default False) can be set True to use the Windows + roaming appdata directory. That means that for users on a Windows + network setup for roaming profiles, this user data will be + sync'd on login. See + + for a discussion of issues. + + Typical user state directories are: + Mac OS X: same as user_data_dir + Unix: ~/.local/state/ # or in $XDG_STATE_HOME, if defined + Win *: same as user_data_dir + + For Unix, we follow this Debian proposal + to extend the XDG spec and support $XDG_STATE_HOME. + + That means, by default "~/.local/state/". + """ + if system in ["win32", "darwin"]: + path = user_data_dir(appname, appauthor, None, roaming) + else: + path = os.getenv("XDG_STATE_HOME", os.path.expanduser("~/.local/state")) + if appname: + path = os.path.join(path, appname) + if appname and version: + path = os.path.join(path, version) + return path + + +def user_log_dir(appname=None, appauthor=None, version=None, opinion=True): + r"""Return full path to the user-specific log dir for this application. + + "appname" is the name of application. + If None, just the system directory is returned. + "appauthor" (only used on Windows) is the name of the + appauthor or distributing body for this application. Typically + it is the owning company name. This falls back to appname. You may + pass False to disable it. + "version" is an optional version path element to append to the + path. You might want to use this if you want multiple versions + of your app to be able to run independently. If used, this + would typically be ".". + Only applied when appname is present. + "opinion" (boolean) can be False to disable the appending of + "Logs" to the base app data dir for Windows, and "log" to the + base cache dir for Unix. See discussion below. + + Typical user log directories are: + Mac OS X: ~/Library/Logs/ + Unix: ~/.cache//log # or under $XDG_CACHE_HOME if defined + Win XP: C:\Documents and Settings\\Local Settings\Application Data\\\Logs + Vista: C:\Users\\AppData\Local\\\Logs + + On Windows the only suggestion in the MSDN docs is that local settings + go in the `CSIDL_LOCAL_APPDATA` directory. 
(Note: I'm interested in + examples of what some windows apps use for a logs dir.) + + OPINION: This function appends "Logs" to the `CSIDL_LOCAL_APPDATA` + value for Windows and appends "log" to the user cache dir for Unix. + This can be disabled with the `opinion=False` option. + """ + if system == "darwin": + path = os.path.join(os.path.expanduser("~/Library/Logs"), appname) + elif system == "win32": + path = user_data_dir(appname, appauthor, version) + version = False + if opinion: + path = os.path.join(path, "Logs") + else: + path = user_cache_dir(appname, appauthor, version) + version = False + if opinion: + path = os.path.join(path, "log") + if appname and version: + path = os.path.join(path, version) + return path + + +class AppDirs(object): + """Convenience wrapper for getting application dirs.""" + + def __init__( + self, appname=None, appauthor=None, version=None, roaming=False, multipath=False + ): + self.appname = appname + self.appauthor = appauthor + self.version = version + self.roaming = roaming + self.multipath = multipath + + @property + def user_data_dir(self): + return user_data_dir( + self.appname, self.appauthor, version=self.version, roaming=self.roaming + ) + + @property + def site_data_dir(self): + return site_data_dir( + self.appname, self.appauthor, version=self.version, multipath=self.multipath + ) + + @property + def user_config_dir(self): + return user_config_dir( + self.appname, self.appauthor, version=self.version, roaming=self.roaming + ) + + @property + def site_config_dir(self): + return site_config_dir( + self.appname, self.appauthor, version=self.version, multipath=self.multipath + ) + + @property + def user_cache_dir(self): + return user_cache_dir(self.appname, self.appauthor, version=self.version) + + @property + def user_state_dir(self): + return user_state_dir(self.appname, self.appauthor, version=self.version) + + @property + def user_log_dir(self): + return user_log_dir(self.appname, self.appauthor, version=self.version) + + +# ---- internal support stuff + + +def _get_win_folder_from_registry(csidl_name): + """This is a fallback technique at best. I'm not sure if using the + registry for this guarantees us the correct answer for all CSIDL_* + names. + """ + import winreg as _winreg + + shell_folder_name = { + "CSIDL_APPDATA": "AppData", + "CSIDL_COMMON_APPDATA": "Common AppData", + "CSIDL_LOCAL_APPDATA": "Local AppData", + }[csidl_name] + + key = _winreg.OpenKey( + _winreg.HKEY_CURRENT_USER, + r"Software\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders", + ) + dir, type = _winreg.QueryValueEx(key, shell_folder_name) + return dir + + +def _get_win_folder_with_pywin32(csidl_name): + from win32com.shell import shell, shellcon + + dir = shell.SHGetFolderPath(0, getattr(shellcon, csidl_name), 0, 0) + # Try to make this a unicode path because SHGetFolderPath does + # not return unicode strings when there is unicode data in the + # path. + try: + dir = unicode(dir) + + # Downgrade to short path name if have highbit chars. See + # . 
+ has_high_char = False + for c in dir: + if ord(c) > 255: + has_high_char = True + break + if has_high_char: + try: + import win32api + + dir = win32api.GetShortPathName(dir) + except ImportError: + pass + except UnicodeError: + pass + return dir + + +def _get_win_folder_with_ctypes(csidl_name): + import ctypes + + csidl_const = { + "CSIDL_APPDATA": 26, + "CSIDL_COMMON_APPDATA": 35, + "CSIDL_LOCAL_APPDATA": 28, + }[csidl_name] + + buf = ctypes.create_unicode_buffer(1024) + ctypes.windll.shell32.SHGetFolderPathW(None, csidl_const, None, 0, buf) + + # Downgrade to short path name if have highbit chars. See + # . + has_high_char = False + for c in buf: + if ord(c) > 255: + has_high_char = True + break + if has_high_char: + buf2 = ctypes.create_unicode_buffer(1024) + if ctypes.windll.kernel32.GetShortPathNameW(buf.value, buf2, 1024): + buf = buf2 + + return buf.value + + +def _get_win_folder_with_jna(csidl_name): + import array + + from com.sun import jna + from com.sun.jna.platform import win32 + + buf_size = win32.WinDef.MAX_PATH * 2 + buf = array.zeros("c", buf_size) + shell = win32.Shell32.INSTANCE + shell.SHGetFolderPath( + None, + getattr(win32.ShlObj, csidl_name), + None, + win32.ShlObj.SHGFP_TYPE_CURRENT, + buf, + ) + dir = jna.Native.toString(buf.tostring()).rstrip("\0") + + # Downgrade to short path name if have highbit chars. See + # . + has_high_char = False + for c in dir: + if ord(c) > 255: + has_high_char = True + break + if has_high_char: + buf = array.zeros("c", buf_size) + kernel = win32.Kernel32.INSTANCE + if kernel.GetShortPathName(dir, buf, buf_size): + dir = jna.Native.toString(buf.tostring()).rstrip("\0") + + return dir + + +if system == "win32": + try: + import win32com.shell + + _get_win_folder = _get_win_folder_with_pywin32 + except ImportError: + try: + from ctypes import windll + + _get_win_folder = _get_win_folder_with_ctypes + except ImportError: + try: + import com.sun.jna + + _get_win_folder = _get_win_folder_with_jna + except ImportError: + _get_win_folder = _get_win_folder_from_registry + + +# ---- self test code + +if __name__ == "__main__": + appname = "MyApp" + appauthor = "MyCompany" + + props = ( + "user_data_dir", + "user_config_dir", + "user_cache_dir", + "user_state_dir", + "user_log_dir", + "site_data_dir", + "site_config_dir", + ) + + print(f"-- app dirs {__version__} --") + + print("-- app dirs (with optional 'version')") + dirs = AppDirs(appname, appauthor, version="1.0") + for prop in props: + print(f"{prop}: {getattr(dirs, prop)}") + + print("\n-- app dirs (without optional 'version')") + dirs = AppDirs(appname, appauthor) + for prop in props: + print(f"{prop}: {getattr(dirs, prop)}") + + print("\n-- app dirs (without optional 'appauthor')") + dirs = AppDirs(appname) + for prop in props: + print(f"{prop}: {getattr(dirs, prop)}") + + print("\n-- app dirs (with disabled 'appauthor')") + dirs = AppDirs(appname, appauthor=False) + for prop in props: + print(f"{prop}: {getattr(dirs, prop)}") diff --git a/MLPY/Lib/site-packages/torch/_awaits/__init__.py b/MLPY/Lib/site-packages/torch/_awaits/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4c9b1fef2960fcc66be6b43ba0e0d92856a799f5 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_awaits/__init__.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +from typing import cast, Callable, Generic, Type, TypeVar + +import torch + +__all__ = ['Await'] + +W = TypeVar("W") + +class _PyAwaitMeta(type(torch._C._Await), type(Generic)): # type: ignore[misc, no-redef] + pass + 
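+# A hedged sketch of the eager-mode usage documented by the ``_Await`` class below; it relies
+# only on the ``torch.jit._awaitable`` / ``torch.jit._awaitable_wait`` helpers named in that
+# docstring, and the concrete function ``fn`` is a made-up example:
+#
+#     >>> def fn(x: int) -> int:
+#     ...     return x + 1
+#     >>> aw = torch.jit._awaitable(fn, 2)   # Await[int]; execution is delayed
+#     >>> torch.jit._awaitable_wait(aw)      # runs fn(2) and caches the result
+#     3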
+class _Await(torch._C._Await, Generic[W], metaclass=_PyAwaitMeta): + r""" + Wrapper around a ``torch._C.Await`` which encapsulates delayed execution + of a callable. All manipulations happen with functions ``torch.jit._awaitable``, + ``torch.jit._awaitable_wait``, ``torch.jit._awaitable_nowait``. + + Torch scriptable manipulations: + ``torch.jit._awaitable(func, *args)`` + Creates ``Await[W]`` object, where W is return type of func. + + Returns: + ``torch.jit._awaitable_wait(Await[W])`` + Returns the result of the function, specified at ``_awaitable``, with specified arguments. + + Returns: + The result of type ``W`` of the function call. The result is owned by ``Await[W]`` + and returned on all following ``_awaitable_wait`` calls. + + + ``torch.jit._awaitable_nowait(W)`` + Returns: + Trivial ``Await[W]`` with specified result. + + + Only in eager mode: + ``fn() -> Callable[Tuple[Any], W]`` + Returns: + Specified at ``_awaitable`` python function ``func``. + + ``args() -> Tuple[Any]`` + Returns: + Specified at ``_awaitable`` python args. + + ``is_nowait() -> _bool`` + Returns: + ``True`` if this object was created via ``_awaitable_nowait`` call (trivial `Await[W]`). + + In eager mode ``Await[W]`` can be used as ``W`` i.e. attributes of W can be called on ``Await[W]``, + ``_awaitable_wait()`` call will be transparently added. + """ + pass diff --git a/MLPY/Lib/site-packages/torch/_awaits/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_awaits/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7c4372f600412b883e4b9823c9f9f435e9c687a3 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_awaits/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_classes.py b/MLPY/Lib/site-packages/torch/_classes.py new file mode 100644 index 0000000000000000000000000000000000000000..1d160312d883b2081317c3bf013ea5c4604614f9 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_classes.py @@ -0,0 +1,55 @@ +import types + +import torch._C + + +class _ClassNamespace(types.ModuleType): + def __init__(self, name): + super().__init__("torch.classes" + name) + self.name = name + + def __getattr__(self, attr): + proxy = torch._C._get_custom_class_python_wrapper(self.name, attr) + if proxy is None: + raise RuntimeError(f"Class {self.name}.{attr} not registered!") + return proxy + + +class _Classes(types.ModuleType): + __file__ = "_classes.py" + + def __init__(self): + super().__init__("torch.classes") + + def __getattr__(self, name): + namespace = _ClassNamespace(name) + setattr(self, name, namespace) + return namespace + + @property + def loaded_libraries(self): + return torch.ops.loaded_libraries + + def load_library(self, path): + """ + Loads a shared library from the given path into the current process. + + The library being loaded may run global initialization code to register + custom classes with the PyTorch JIT runtime. This allows dynamically + loading custom classes. For this, you should compile your class + and the static registration code into a shared library object, and then + call ``torch.classes.load_library('path/to/libcustom.so')`` to load the + shared object. + + After the library is loaded, it is added to the + ``torch.classes.loaded_libraries`` attribute, a set that may be inspected + for the paths of all libraries loaded using this function. + + Args: + path (str): A path to a shared library to load. 
+ """ + torch.ops.load_library(path) + + +# The classes "namespace" +classes = _Classes() diff --git a/MLPY/Lib/site-packages/torch/_compile.py b/MLPY/Lib/site-packages/torch/_compile.py new file mode 100644 index 0000000000000000000000000000000000000000..576d4218c4c49cf55a34efd68198dd021f4ba7dc --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_compile.py @@ -0,0 +1,30 @@ +""" +APIs related to torch.compile which lazily import torch._dynamo to avoid +circular dependencies. +""" +import functools + + +def _disable_dynamo(fn=None, recursive=True): + """ + This API should be only used inside torch, external users should still use + torch._dynamo.disable. The main goal of this API is to avoid circular + imports issues that is common while using _dynamo.disable inside torch + itself. + + This API avoids it by lazily importing torch._dynamo from the import time to + the invocation of the decorated function. + """ + if fn is not None: + + @functools.wraps(fn) + def inner(*args, **kwargs): + import torch._dynamo + + return torch._dynamo.disable(fn, recursive)(*args, **kwargs) + + return inner + else: + # decorator usage like @_disable_dynamo(recursive=False). The resulting + # object expects the original decorated function as the arg. + return functools.partial(_disable_dynamo, recursive=recursive) diff --git a/MLPY/Lib/site-packages/torch/_custom_op/__init__.py b/MLPY/Lib/site-packages/torch/_custom_op/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MLPY/Lib/site-packages/torch/_custom_op/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_custom_op/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..20c603890cf093be83d8ecc39bd24bcb647bdd4d Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_custom_op/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_custom_op/__pycache__/autograd.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_custom_op/__pycache__/autograd.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ac24eb64d0272731f0947172c19cd171def9f1d Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_custom_op/__pycache__/autograd.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_custom_op/__pycache__/functional.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_custom_op/__pycache__/functional.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..628deaa6fee76d50c279406f89358802fe352b2c Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_custom_op/__pycache__/functional.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_custom_op/__pycache__/impl.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_custom_op/__pycache__/impl.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6c8d06dfef35ab53fad39c9d20c230f6f443bc6 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_custom_op/__pycache__/impl.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_custom_op/autograd.py b/MLPY/Lib/site-packages/torch/_custom_op/autograd.py new file mode 100644 index 0000000000000000000000000000000000000000..6be5ab372b96a203e485a4465ab984712f1b6380 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_custom_op/autograd.py @@ -0,0 +1,274 @@ +import torch +import torch.utils._pytree as pytree +from collections import namedtuple +import functools + + +# NOTE [CustomOp autograd kernel 
indirection] +# We register `inner` as the autograd kernel for this custom_op. +# `inner` either calls the autograd formula registered by the user, +# or goes into an `autograd_not_implemented` kernel. +# +# The reason why this indirection exists is +# so that we can swap out the autograd kernel (the PyTorch dispatcher +# doesn't actually allow us to do this). By default, we want +# the `autograd_not_implemented` behavior, but then the user may come +# and register something that is actually a backward formula +def autograd_kernel_indirection(custom_op): + autograd_fallback = autograd_not_implemented(custom_op) + + def inner(*args, **kwargs): + if custom_op._has_impl('autograd'): + kernel = custom_op._get_impl('autograd').func + return kernel(*args, **kwargs) + # As explained in NOTE ["backward", "save_for_backward", and "autograd"], + # after the user gives us "backward" and "save_for_backward", we generate + # the "autograd" impl. If the user only provided one, then we tell + # the user they've done something wrong. + if custom_op._has_impl('save_for_backward') or custom_op._has_impl('backward'): + missing = ( + 'save_for_backward' if custom_op._has_impl('backward') + else 'backward' + ) + found = 'save_for_backward' if missing == 'backward' else 'backward' + loc = custom_op._get_impl(found).location + raise RuntimeError( + f"We found a '{found}' registration for {custom_op} at " + f"{loc} but were unable to find a '{missing}' registration. " + f"To use the CustomOp API to register a backward formula, " + f"please provide us both a backward function and a " + f"'save for backward' function via `impl_backward` and " + f"`impl_save_for_backward` respectively.") + return autograd_fallback(*args, **kwargs) + return inner + + +# TODO(#101191): Use the actual C++ autograd not implemented fallback, +# or change the default autograd fallback to the autograd not implemented fallback. +def autograd_not_implemented(custom_op): + def kernel(*args, **kwargs): + if torch.is_grad_enabled() and pytree.tree_any( + lambda x: isinstance(x, torch.Tensor) and x.requires_grad, (args, kwargs) + ): + raise RuntimeError("Autograd has not been implemented for operator") + with torch._C._AutoDispatchBelowAutograd(): + return custom_op(*args, **kwargs) + return kernel + + +def mark_non_differentiable(ctx, output, output_differentiability): + # Output types are restricted to be: + # - Tensor + # - Tensor[] + # - int, bool, Scalar, float + # See _check_can_register_backward + if output_differentiability is not None: + if not isinstance(output, tuple): + tuple_output = (output,) + else: + tuple_output = output # type: ignore[assignment] + assert len(output_differentiability) == len(tuple_output) + non_differentiable_tensors = [] + for idx, (differentiable, out) in enumerate(zip(output_differentiability, tuple_output)): + if isinstance(out, torch.Tensor): + if not differentiable: + non_differentiable_tensors.append(out) + continue + if isinstance(out, list): + if not differentiable: + non_differentiable_tensors.extend(out) + continue + if differentiable: + raise RuntimeError( + f"With output_differentiability={output_differentiability}. 
" + f"At idx {idx}, we received an object of type {type(out)} that " + f"is not a Tensor, so it cannot have be marked as differentiable in " + f"output_differentiability.") + if non_differentiable_tensors: + ctx.mark_non_differentiable(*non_differentiable_tensors) + + +def construct_autograd_kernel( + schema, + output_differentiability, + custom_op, + op_overload, + save_for_backward_fn, + backward_fn): + + def apply(*args): + flat_args, spec = pytree.tree_flatten(args) + out_spec = None + + def forward(ctx, *flat_args): + ctx.set_materialize_grads(True) + args = pytree.tree_unflatten(list(flat_args), spec) + with torch._C._AutoDispatchBelowAutograd(): + output = op_overload(*args) + + # We use the info about args to give better error messages in backward + args_info = namedtuple_args( + schema, pytree.tree_map(type, args)) + + save_for_backward_fn_inputs = namedtuple_args(schema, args) + to_save = save_for_backward_fn(save_for_backward_fn_inputs, output) + + save_pytree_for_backward(ctx, (to_save, args_info)) + mark_non_differentiable(ctx, output, output_differentiability) + + nonlocal out_spec + flat_output, out_spec = pytree.tree_flatten(output) + return tuple(flat_output) + + def backward(ctx, *flat_grad_output): + assert out_spec is not None + grads = pytree.tree_unflatten(list(flat_grad_output), out_spec) + saved, args_info = unpack_saved(ctx) + # There is nothing on the ctx object for now, it is just there so + # that we can add additional things in the future. + inner_ctx = object() + if not isinstance(grads, tuple): + grads = (grads,) + grad_inputs_dict = backward_fn(inner_ctx, saved, *grads) + + # Massage the grad_inputs_dict to a form acceptable by + # autograd.Function. + validate_grad_inputs_dict(grad_inputs_dict, custom_op, args_info) + return grad_inputs_dict_to_flat_tuple(grad_inputs_dict, args_info) + + generated_cls = gen_autograd_function( + custom_op._opname + '_customop', forward, backward) + + flat_output = generated_cls.apply(*flat_args) + assert out_spec is not None + return pytree.tree_unflatten(list(flat_output), out_spec) + return apply + + +def gen_autograd_function(name, forward, backward): + generated_cls = type( + name, + (torch.autograd.Function,), + { + 'forward': staticmethod(forward), + 'backward': staticmethod(backward), + } + ) + return generated_cls + + +@functools.lru_cache +def namedtuple_args_cls(schema): + attribs = [arg.name for arg in schema.arguments.flat_all] + name = str(schema.name) + "_args" + # mypy doesn't support dynamic namedtuple name + tuple_cls = namedtuple(name, attribs) # type: ignore[misc] + return tuple_cls + + +def namedtuple_args(schema, args): + assert isinstance(args, tuple) + tuple_cls = namedtuple_args_cls(schema) + return tuple_cls(*args) + + +def validate_grad_inputs_dict(grad_inputs_dict, forward_op, args_info): + def error(what): + backward = forward_op._get_impl('backward') + raise RuntimeError( + f"In the backward function defined for {forward_op} at " + f"{backward.location} using the CustomOp API, {what}") + + if not isinstance(grad_inputs_dict, dict): + error(f"expected the output of the backward function to be a dict but " + f"got {type(grad_inputs_dict)}") + + expected_keys = {arg.name for arg in forward_op._schema.arguments.flat_all + if arg.type.is_tensor_like()} + actual_keys = grad_inputs_dict.keys() + if expected_keys != actual_keys: + error(f"expected the returned grad_input dict to have keys " + f"{expected_keys} but got {actual_keys}. 
The backward " + f"function must return a gradient (can be None) for each arg " + f"to the CustomOp that may be a Tensor or Sequence[Tensor]. " + f"Args declared to be non-Tensor-like types should not appear " + f"in the grad_input dict") + + for name, grad in grad_inputs_dict.items(): + arg_info = getattr(args_info, name) + + if isinstance(arg_info, list): + if not isinstance(grad, (tuple, list)): + error(f"for input '{name}' expected the grad_input dict to " + f"hold a list of gradients but got object of type " + f"{type(grad)}.") + if not len(grad) == len(arg_info): + error(f"for input '{name}' expected the grad_input dict to " + f"hold a list of {len(arg_info)} gradients but got " + f"{len(grad)}") + for idx, (g, info) in enumerate(zip(grad, arg_info)): + if g is None: + continue + if not isinstance(g, torch.Tensor): + error(f"for input '{name}' expected the grad_input dict to " + f"hold a list of None or Tensor gradients but got " + f"object of {type(g)} at index {idx}") + if not issubclass(info, torch.Tensor): + error(f"for input '{name}', got a Tensor as the gradient " + f"for the {idx}-th value but expected None because " + f"the {idx}-th value was not a Tensor (it was " + f"type {arg_info}") + continue + + if grad is None: + continue + if not isinstance(grad, torch.Tensor): + error(f"got object of type {type(grad)} as the gradient for input " + f"'{name}', " + f"but expected the gradient to be either None or a Tensor") + if not issubclass(arg_info, torch.Tensor): + error(f"got a Tensor as the gradient for input '{name}' but " + f"expected None as the gradient because input '{name}' " + f"was not a Tensor (it was type {arg_info}).") + + +def grad_inputs_dict_to_flat_tuple(grad_inputs_dict, args_info): + result = [] + for name, arg_info in args_info._asdict().items(): + if name not in grad_inputs_dict: + result.append(pytree.tree_map(lambda x: None, arg_info)) + continue + result.append(grad_inputs_dict[name]) + return tuple(pytree.tree_leaves(result)) + +# Saves "stuff" (a pytree) onto the ctx object. Use unpack_saved to unpack it. +# autograd.Function prefers that users use ctx.save_for_backward to +# save Tensors (to avoid reference cycles) and for non-Tensors to go onto the +# ctx object. 
+def save_pytree_for_backward(ctx, stuff): + flat_stuff, spec = pytree.tree_flatten(stuff) + num_elts = len(flat_stuff) + tensor_idxs = [idx for idx, thing in enumerate(flat_stuff) + if isinstance(thing, torch.Tensor)] + non_tensor_idxs = [idx for idx, thing in enumerate(flat_stuff) + if not isinstance(thing, torch.Tensor)] + tensors = [thing for thing in flat_stuff if isinstance(thing, torch.Tensor)] + non_tensors = [thing for thing in flat_stuff if not isinstance(thing, torch.Tensor)] + + ctx.spec = spec + ctx.num_elts = num_elts + ctx.save_for_backward(*tensors) + ctx.tensor_idxs = tensor_idxs + ctx.saved_non_tensors = non_tensors + ctx.non_tensor_idxs = non_tensor_idxs + + +# Inverse operation to save_pytree_for_backward +def unpack_saved(ctx): + flat_stuff = [None] * ctx.num_elts + for tensor, idx in zip(ctx.saved_tensors, ctx.tensor_idxs): + flat_stuff[idx] = tensor + for non_tensor, idx in zip(ctx.saved_non_tensors, ctx.non_tensor_idxs): + flat_stuff[idx] = non_tensor + stuff = pytree.tree_unflatten(flat_stuff, ctx.spec) + return stuff diff --git a/MLPY/Lib/site-packages/torch/_custom_op/functional.py b/MLPY/Lib/site-packages/torch/_custom_op/functional.py new file mode 100644 index 0000000000000000000000000000000000000000..a15e920c3c018e7156da4db86195f3e3a02fd0ef --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_custom_op/functional.py @@ -0,0 +1,187 @@ +import weakref + +import torch +import torch.utils._pytree as pytree +from torch._C import _ExcludeDispatchKeyGuard, DispatchKey, DispatchKeySet +from torch._ops import OpOverload +from torch.library import Library +from torchgen.model import ( + BaseTy, + BaseType, + FunctionSchema, + OperatorName, + OptionalType, + SchemaKind, +) + +from .autograd import autograd_not_implemented + + +def register_functional_op( + lib: Library, + new_op_name: str, + mutable_op: OpOverload, +) -> None: + """Given a mutable operator, registers the functional variant. + + This API also correctly links the functional variant with the mutable + operator for the purposes of functionalization. + + All of the new registrations are performed on the ``lib`` passed in. + + Arguments: + lib (Library): Should be a torch.library.Library object that has + the same namespace as ``mutable_op``'s namespace. + lib will be used to register the new functional op as well + as a functionalization kernel for the ``mutable_op`` + If you don't have a library handy, use + ``torch.library.Library(ns, 'FRAGMENT')`` to construct one. + new_op_name (str): The name of the functional operator (without the + namespace). If no namespace, the new functional variant will be + accessible under ``torch.ops.{lib.ns}.new_op_name``. + mutable_op (OpOverload): The mutable custom operator. Note + that you may need to add a `.default` to it, like + `torch.ops.aten.abs_.default`. + + """ + validate(mutable_op) + schema = functional_schema(new_op_name, mutable_op) + lib.define(schema) + + functional_impl = construct_functional_impl(mutable_op) + lib.impl(new_op_name, functional_impl, 'CompositeExplicitAutograd') + + functional_op = getattr(getattr(torch.ops, lib.ns), new_op_name).default + + # There's no easy way for us to generate the autograd kernel, so we + # use autograd_not_implemented. Also, this makes it so that the user + # is unable to register an autograd formula themselves. This shouldn't + # be a problem if the user doesn't use the functional op direclty + # in their program, but we may need to revist this in the future. 
+ lib.impl(new_op_name, autograd_not_implemented(functional_op), 'Autograd') + + f_kernel = construct_functionalization_kernel(weakref.proxy(mutable_op), functional_op) + + lib.impl(mutable_op, f_kernel, 'Functionalize') + + +def construct_functional_impl(mutable_op): + def functional_impl(*args): + # Strategy: + # - clone args that would have been mutated + # - run mutable_op + # - return the cloned args as additional outputs + new_args = [] + extra_rets = [] + for is_write, arg in zip(mutable_args(mutable_op), args): + if is_write: + cloned = arg.clone() if arg is not None else None + new_args.append(cloned) + extra_rets.append(cloned) + else: + new_args.append(arg) + result = mutable_op(*new_args) + if result is None: + return tuple(extra_rets) + if isinstance(result, tuple): + return (*result, *extra_rets) + return (result, *extra_rets) + return functional_impl + + +def construct_functionalization_kernel(mutable_op, functional_op): + def kernel(*args): + # There's nothing to be functionalized! + # We can still end up here because DispatchKey::Functionalize is a mode key + if pytree.tree_all_only(torch.Tensor, lambda x: not torch._is_functional_tensor(x), args): + with _ExcludeDispatchKeyGuard(DispatchKeySet(DispatchKey.Functionalize)): + return mutable_op(*args) + + # NB: This differs from the codegen -- codegen handles cases where there + # are mixed FunctionalTensorWrapper and non-FunctionalTensorWrapper. + # This only really matters for XLA (mixed CPU-XLA tensors) and + # running functionalization without the PT2 stack (which guarantees to us that + # all tensors are FunctionalTensorWrapper). + if not pytree.tree_all_only(torch.Tensor, torch._is_functional_tensor, args): + raise RuntimeError("{mutable_op}: expected all args to be FunctionalTensorWrapper") + + unwrapped_args = [] + for arg in args: + if isinstance(arg, torch.Tensor) and torch._is_functional_tensor(arg): + torch._sync(arg) + unwrapped = torch._from_functional_tensor(arg) + unwrapped_args.append(unwrapped) + else: + unwrapped_args.append(arg) + + with _ExcludeDispatchKeyGuard(DispatchKeySet(DispatchKey.Functionalize)): + output = functional_op(*unwrapped_args) + + num_actual_output = len(mutable_op._schema.returns) + actual_output = pytree.tree_map( + torch._to_functional_tensor, output[:num_actual_output]) + + new_values_to_propagate = output[num_actual_output:] + inputs_to_replace = [arg for is_write, arg in zip(mutable_args(mutable_op), args) + if is_write] + assert len(new_values_to_propagate) == len(inputs_to_replace) + for new_value, arg in zip(new_values_to_propagate, inputs_to_replace): + if (arg is None and new_value is None) or (arg is not None and new_value is not None): + continue + torch._C._propagate_xla_data(arg, new_value) + torch._C._replace_(arg, new_value) + torch._C._commit_update(arg) + torch._sync(arg) + + if len(actual_output) == 1: + return actual_output[0] + elif len(actual_output) == 0: + return None + return actual_output + + return kernel + + +def validate(mutable_op: OpOverload): + if not isinstance(mutable_op, OpOverload): + raise TypeError( + f"register_functional_op(mutable_op): expected mutable_op to be instance of " + f"OpOverload but got {type(mutable_op)}") + + # There are generally three types of "in-place" or "mutable" ops. 
+ # Each of them have their own conventions: + # - inplace (first input modified in-place and returned as only output) + # - out= (some args modified in-place and returned as outputs) + # - mutable (some args modified in-place but none of those returned as outputs) + # In theory we can support all three, but we'll just support the last + # option right now for simplicity. + schema = FunctionSchema.parse(str(mutable_op._schema)) + if not schema.kind() == SchemaKind.mutable: + raise RuntimeError("Expected op to be mutable (as opposed to functional, inplace or out)") + for ret in schema.returns: + # construct_functionalization_kernel assumes this for simplicity + if ret.annotation is not None: + raise NotImplementedError( + "NYI: register_functional_op(op) where op returns a mutated or aliased value. " + "Please file an issue (and as a workaround, modify your operator to " + "not return the mutated value or aliases)") + for arg in schema.arguments.flat_all: + # construct_functionalization_kernel assumes this for simplicity + if arg.type.is_tensor_like() and ( + arg.type != BaseType(BaseTy.Tensor) + and arg.type != OptionalType(BaseType(BaseTy.Tensor)) + ): + raise NotImplementedError( + "NYI: register_functional_op(op) where op has a List[Tensor] input." + "Please file an issue.") + + +def functional_schema(new_op_name, op: OpOverload): + schema = FunctionSchema.parse(str(op._schema)) + schema = schema.signature().with_name(OperatorName.parse(new_op_name)) + return str(schema) + + +def mutable_args(op: OpOverload): + return tuple(False if arg.alias_info is None else arg.alias_info.is_write + for arg in op._schema.arguments) diff --git a/MLPY/Lib/site-packages/torch/_custom_op/impl.py b/MLPY/Lib/site-packages/torch/_custom_op/impl.py new file mode 100644 index 0000000000000000000000000000000000000000..e5afbbce849f18574f8d89cd6bed9c55fda753ed --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_custom_op/impl.py @@ -0,0 +1,976 @@ +import dataclasses +import functools +import inspect +import sys +import typing +import weakref + +from torchgen.model import FunctionSchema, OperatorName, SchemaKind, BaseType, ListType, BaseTy + +import torch +import torch._C as _C +import torch.library as library +from torch._library.abstract_impl import AbstractImplCtx +from torch.library import get_ctx + +from .autograd import autograd_kernel_indirection, construct_autograd_kernel + +""" +For a detailed guide on custom ops, please see +https://docs.google.com/document/d/1aGWtgxV3HppuxQAdddyPrs74_aEntpkYt9MalnCKnhk + +This file includes pieces of the implementation of our custom operator API. +""" + +__all__ = ["custom_op", "CustomOp", "get_ctx", "AbstractImplCtx"] + + +SUPPORTED_DEVICE_TYPE_TO_KEY = { + "cpu": "CPU", + "cuda": "CUDA", +} + +# We will not let users register CustomOps with anything that could look like +# PyTorch internals to avoid confusion. +RESERVED_NS = { + "prim", + "prims", + "aten", + "at", + "torch", + "pytorch", +} + + +def custom_op( + qualname: str, manual_schema: typing.Optional[str] = None +) -> typing.Callable: + r"""Creates a new CustomOp object. + + WARNING: if you're a user, please do not use this directly + (instead use the torch._custom_ops APIs). + Also please see the following for a detailed guide on custom ops. 
+ https://docs.google.com/document/d/1aGWtgxV3HppuxQAdddyPrs74_aEntpkYt9MalnCKnhk + + In PyTorch, defining an op (short for "operator") is a two step-process: + - we need to define (create) the op + - we need to implement behavior for how the operator interacts with + various PyTorch subsystems, like CPU/CUDA Tensors, Autograd, etc. + + This entrypoint defines the CustomOp object (the first step); + you must then perform the second step by calling various methods on + the CustomOp object. + + This API is used as a decorator (see examples). + + Arguments: + qualname (str): Should be a string that looks like + "namespace::operator_name". Operators in PyTorch need a namespace to + avoid name collisions; a given operator may only be created once. + If you are writing a Python library, we recommend the namespace to + be the name of your top-level module. The operator_name must be + the same as the name of the function you pass to custom_op + (see examples). + manual_schema (Optional[str]): Each PyTorch operator needs a schema that + tells PyTorch the types of the inputs/outputs. If None (default), + we will infer the schema from the type annotations on the function + (see examples). Otherwise, if you don't want to use type annotations, + you may provide us the schema string. + + Example:: + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA) + >>> import numpy as np + >>> from torch import Tensor + >>> + >>> # Step 1: define the CustomOp. + >>> # We need to provide the decorator a "prototype function" + >>> # (a function with Python ellipses as the body). + >>> @custom_op("my_library::numpy_sin") + >>> def numpy_sin(x: Tensor) -> Tensor: + >>> ... + >>> + >>> # numpy_sin is now an instance of class CustomOp + >>> print(type(numpy_sin)) + >>> + >>> # Step 2: Register an implementation for various PyTorch subsystems + >>> + >>> # Register an implementation for CPU tensors + >>> @numpy_sin.impl('cpu') + >>> def numpy_sin_impl_cpu(x): + >>> return torch.from_numpy(np.sin(x.numpy())) + >>> + >>> # Register an implementation for CUDA tensors + >>> @numpy_sin.impl('cuda') + >>> def numpy_sin_impl_cuda(x): + >>> return torch.from_numpy(np.sin(x.cpu().numpy())).to(x.device) + >>> + >>> x = torch.randn(3) + >>> numpy_sin(x) # calls numpy_sin_impl_cpu + >>> + >>> x_cuda = x.cuda() + >>> numpy_sin(x) # calls numpy_sin_impl_cuda + + """ + + def inner(func): + if not inspect.isfunction(func): + raise ValueError( + f"custom_op(...)(func): Expected `func` to be a Python " + f"function, got: {type(func)}" + ) + + ns, name = parse_qualname(qualname) + validate_namespace(ns) + if func.__name__ != name: + raise ValueError( + f"custom_op(qualname='{qualname}', ...)(func): expected `func` " + f"to have name '{name}' but got '{func.__name__}'. 
" + f"Please either change the name of `func` or the qualname that " + f"is passed to `custom_op`" + ) + + schema = infer_schema(func) if manual_schema is None else manual_schema + schema_str = f"{name}{schema}" + function_schema = FunctionSchema.parse(schema_str) + validate_schema(function_schema) + if manual_schema is not None: + validate_function_matches_schema(function_schema, func) + + lib = library.Library(ns, "FRAGMENT") + lib.define(schema_str) + ophandle = find_ophandle_or_throw(ns, function_schema.name) + result = CustomOp(lib, ns, function_schema, name, ophandle, _private_access=True) + + result.__name__ = func.__name__ + result.__module__ = func.__module__ + result.__doc__ = func.__doc__ + + library.impl(lib, result._opname, "Autograd")( + autograd_kernel_indirection(weakref.proxy(result)) + ) + + torch._C._dispatch_set_report_error_callback( + ophandle, functools.partial(report_error_callback, weakref.proxy(result)) + ) + + return result + + return inner + + +# Global dictionary holding references to all CustomOp objects +# Yes, it keeps all CustomOps alive (see NOTE [CustomOp lifetime]) +# Used to query the CustomOp associated with a specific C++ dispatcher operator. +# An example usage is FakeTensor: FakeTensor checks if a specific operator +# has an implementation registered via the CustomOp API. +# Indexed by qualname (e.g. aten::foo) +global_registry: typing.Dict[str, "CustomOp"] = {} + + +class CustomOp: + r"""Class for custom operators in PyTorch. + + Use the CustomOp API to create user-defined custom operators that behave + just like regular PyTorch operators (e.g. torch.sin, torch.mm) when it + comes to various PyTorch subsystems (like torch.compile). + + To construct a `CustomOp`, use `custom_op`. + """ + + def __init__(self, lib, cpp_ns, schema, operator_name, ophandle, *, _private_access=False): + super().__init__() + if not _private_access: + raise RuntimeError( + "The CustomOp constructor is private and we do not guarantee " + "BC for it. Please use custom_op(...) to create a CustomOp object" + ) + name = f"{cpp_ns}::{operator_name}" + self._schema = schema + self._cpp_ns = cpp_ns + self._lib: library.Library = lib + self._ophandle: _C._DispatchOperatorHandle = ophandle + # Has the name of the op, e.g. "foo". We cache here for convenience. + self._opname: str = operator_name + # this is _opname but with namespace. e.g. "custom::foo" + self._qualname: str = name + self.__name__ = None # mypy requires this + # NB: Some of these impls are registered as kernels to DispatchKeys. + # Modifying the _impls dict directly won't do anything in that case. + self._impls: typing.Dict[str, typing.Optional[FuncAndLocation]] = {} + # See NOTE [CustomOp autograd kernel indirection] + self._registered_autograd_kernel_indirection = False + + global_registry[self._qualname] = self + + def _register_autograd_kernel_indirection(self): + assert not self._registered_autograd_kernel_indirection + self._lib.impl(self._opname, autograd_kernel_indirection(weakref.proxy(self)), "Autograd") + self._registered_autograd_kernel_indirection = True + + # Records the impl and the source location in self._impls + # Note that this doesn't cause torch.library to use the impl, that + # needs to be done in a separate self._lib.impl call. 
+ def _register_impl(self, kind, func, stacklevel=2): + if self._has_impl(kind): + func_and_location = self._impls[kind] + assert func_and_location is not None # Pacify mypy + location = func_and_location.location + raise RuntimeError( + f"Attempting to register a {kind} impl for operator {self._qualname} " + f"that already has a {kind} impl registered from Python at " + f"{location}. This is not supported." + ) + frame = inspect.getframeinfo(sys._getframe(stacklevel)) + location = f"{frame.filename}:{frame.lineno}" + self._impls[kind] = FuncAndLocation(func, location) + + def _get_impl(self, kind): + return self._impls[kind] + + def _has_impl(self, kind): + return kind in self._impls + + def _destroy(self): + # NOTE: [CustomOp lifetime] + # A CustomOp, once created, lives forever. The mechanism is that the + # global registry holds a reference to it. However, to make testing + # easier, we want to be able to destroy CustomOp objects. + # CustomOp._destroy does the job, though it leaves the CustomOp + # in a garbage state. + del self._lib + + opnamespace = getattr(torch.ops, self._cpp_ns) + if hasattr(opnamespace, self._opname): + delattr(opnamespace, self._opname) + + del global_registry[self._qualname] + + def __repr__(self): + return f'' + + def __call__(self, *args, **kwargs): + # Bypass torch.ops.* and directly do OperatorHandle::callBoxed. + # Using torch.ops.* is a bit of a pain (it can be slow and it has lifetime + # issues from caching operators that make testing CustomOp difficult). + result = _C._dispatch_call_boxed(self._ophandle, *args, **kwargs) + return result + + def impl( + self, device_types: typing.Union[str, typing.Iterable[str]], _stacklevel=2, + ) -> typing.Callable: + r"""Register an implementation for a device type for this CustomOp object. + + WARNING: if you're a user, please do not use this directly + (instead use the torch._custom_ops APIs). + Also please see the following for a detailed guide on custom ops. + https://docs.google.com/document/d/1aGWtgxV3HppuxQAdddyPrs74_aEntpkYt9MalnCKnhk + + If the CustomOp is passed multiple Tensor inputs with different device + types, it will dispatch to the registered implementation for the highest + priority device type among those present. + The supported device types, in order of priority, are {'cuda', 'cpu'}. + + This API is used as a decorator (see examples). + + Arguments: + device_types (str or Iterable[str]): the device type(s) to register the function for. + + Examples:: + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA) + >>> import numpy as np + >>> from torch import Tensor + >>> + >>> @custom_op("my_library::numpy_cos") + >>> def numpy_cos(x: Tensor) -> Tensor: + >>> ... 
+ >>> + >>> # Register an implementation for CPU Tensors + >>> @numpy_cos.impl('cpu') + >>> def numpy_cos_impl_cpu(x): + >>> return torch.from_numpy(np.cos(x.numpy())) + >>> + >>> # Register an implementation for CUDA Tensors + >>> @numpy_cos.impl('cuda') + >>> def numpy_cos_impl_cuda(x): + >>> return torch.from_numpy(np.cos(x.cpu().numpy())).to(x.device) + >>> + >>> x = torch.randn(3) + >>> numpy_cos(x) # calls numpy_cos_impl_cpu + >>> + >>> x_cuda = x.cuda() + >>> numpy_cos(x) # calls numpy_cos_impl_cuda + + """ + if isinstance(device_types, str): + device_types = [device_types] + for device_type in device_types: + validate_device_type(device_type) + + def inner(f): + for device_type in set(device_types): + self._check_doesnt_have_library_impl(device_type) + self._register_impl(device_type, f, stacklevel=_stacklevel) + dispatch_key = SUPPORTED_DEVICE_TYPE_TO_KEY[device_type] + library.impl(self._lib, self._opname, dispatch_key)(f) + return f + + return inner + + def _check_doesnt_have_library_impl(self, device_type): + if self._has_impl(device_type): + return + key = SUPPORTED_DEVICE_TYPE_TO_KEY[device_type] + if _C._dispatch_has_computed_kernel_for_dispatch_key(self._qualname, key): + raise RuntimeError( + f"impl(..., device_types={device_type}): the operator {self._qualname} " + f"already has an implementation for this device type via a " + f"pre-existing torch.library or TORCH_LIBRARY registration.") + + def impl_factory(self) -> typing.Callable: + r"""Register an implementation for a factory function.""" + + def inner(f): + self._register_impl("factory", f) + library.impl(self._lib, self._opname, "BackendSelect")(f) + return f + + return inner + + def impl_abstract(self, _stacklevel=2) -> typing.Callable: + r"""Register an abstract implementation for this operator. + + WARNING: please do not use this directly (and instead use the torch._custom_ops + APIs). Also please see the following for a detailed guide on custom ops. + https://docs.google.com/document/d/1aGWtgxV3HppuxQAdddyPrs74_aEntpkYt9MalnCKnhk + + An "abstract implementation" specifies the behavior of this operator on + Tensors that carry no data. Given some input Tensors with certain properties + (sizes/strides/storage_offset/device), it specifies what the properties of + the output Tensors are. + + The abstract implementation has the same signature as the operator. + It is run for both FakeTensors and meta tensors. To write an abstract + implementation, assume that all Tensor inputs to the operator are + regular CPU/CUDA/Meta tensors, but they do not have storage, and + you are trying to return regular CPU/CUDA/Meta tensor(s) as output. + The abstract implementation must consist of only PyTorch operations + (and may not directly access the storage or data of any input or + intermediate Tensors). + + This API is used as a decorator (see examples). + + Examples:: + >>> import numpy as np + >>> from torch import Tensor + >>> + >>> # Example 1: an operator without data-dependent output shape + >>> @custom_op('my_library::custom_linear') + >>> def custom_linear(x: Tensor, weight: Tensor, bias: Tensor) -> Tensor: + >>> ... 
+ >>> + >>> @custom_linear.impl_abstract() + >>> def custom_linear_abstract(x, weight): + >>> assert x.dim() == 2 + >>> assert weight.dim() == 2 + >>> assert bias.dim() == 1 + >>> assert x.shape[1] == weight.shape[1] + >>> assert weight.shape[0] == bias.shape[0] + >>> assert x.device == weight.device + >>> + >>> return (x @ weight.t()) + bias + >>> + >>> # Example 2: an operator with data-dependent output shape + >>> @custom_op('my_library::custom_nonzero') + >>> def custom_nonzero(x: Tensor) -> Tensor: + >>> ... + >>> + >>> @custom_nonzero.impl_abstract() + >>> def custom_nonzero_abstract(x): + >>> # Number of nonzero-elements is data-dependent. + >>> # Since we cannot peek at the data in an abstract impl, + >>> # we use the ctx object to construct a new symint that + >>> # represents the data-dependent size. + >>> ctx = torch._custom_op.get_ctx() + >>> nnz = ctx.create_unbacked_symint() + >>> shape = [x.dim(), nnz] + >>> result = x.new_empty(shape, dtype=torch.long) + >>> return result + >>> + >>> @custom_nonzero.impl(['cpu', 'cuda']) + >>> def custom_nonzero_impl(x): + >>> x_np = to_numpy(x) + >>> res = np.stack(np.nonzero(x_np), axis=1) + >>> # unbacked symbolic ints in PyTorch must be >= 2, so we + >>> # constrain the range to at least 2 + >>> if res.shape[0] <= 1: + >>> raise RuntimeError("not supported") + >>> return torch.tensor(res, device=x.device) + + """ + + def inner(f): + self._check_doesnt_have_library_meta_impl() + self._register_impl("abstract", f, stacklevel=_stacklevel) + location = self._get_impl("abstract").location + + qualname = self._qualname + + # Handle DispatchKey.Meta registration + @functools.wraps(f) + def f_with_ctx(*args, **kwargs): + def error_on_ctx(): + raise RuntimeError( + f"Attempted to call get_ctx() for the meta implementation " + f"for {qualname}." + f"You have presumably called get_ctx() because the operator " + f"has a data-dependent output shape; if so, there is no " + f"such meta implementation and this error is the correct " + f"behavior. Otherwise, please remove the call to get_ctx() " + f"in the implementation registered with impl_abstract " + f"at {location}" + ) + + with torch._library.abstract_impl.set_ctx_getter(error_on_ctx): + return f(*args, **kwargs) + + self._lib.impl(self._opname, f_with_ctx, "Meta") + return f + + return inner + + def _check_can_register_backward(self): + def error(detail): + raise RuntimeError( + f"Cannot use torch._custom_ops APIs to register backward " + f"formula for {detail}. Got operator " + f"{self._qualname} with schema: {schema}" + ) + + schema = self._schema + if schema.kind() != SchemaKind.functional: + error("non-functional operator") + + rets = schema.returns + if not schema.returns: + error("operator with no returns") + + assert len(rets) > 0 + is_non_mutating_view = any( + r.annotation is not None and not r.annotation.is_write for r in rets + ) + if is_non_mutating_view: + error("operator that returns views") + + # We make assumptions about the schema's return types. 
+ allowed_return_types = { + BaseType(BaseTy.int): "int", + BaseType(BaseTy.SymInt): "SymInt", + BaseType(BaseTy.bool): "bool", + BaseType(BaseTy.float): "float", + BaseType(BaseTy.Tensor): "Tensor", + ListType(BaseType(BaseTy.Tensor), None): "List[Tensor]", + } + for ret in schema.returns: + if ret.type in allowed_return_types: + continue + error(f"operator with return not in {list(allowed_return_types.values())} (got {ret.type})") + + def _check_doesnt_have_library_autograd_impl(self): + if self._registered_autograd_kernel_indirection: + return + + if _C._dispatch_has_kernel_for_dispatch_key(self._qualname, "CompositeImplicitAutograd"): + raise RuntimeError( + f"impl_backward/impl_save_for_backward: the operator {self._qualname} " + f"already has an implementation for this device type via a " + f"pre-existing registration to DispatchKey::CompositeImplicitAutograd." + f"CompositeImplicitAutograd operators do not need an autograd formula; " + f"instead, the operator will decompose into its constituents and those " + f"can have autograd formulas defined on them.") + + # We can improve this by adding "all Autograd keys", but + # realistically people will just be using this API for CPU/CUDA for now. + for key in ["Autograd", "AutogradCPU", "AutogradCUDA"]: + if _C._dispatch_has_kernel_for_dispatch_key(self._qualname, key): + raise RuntimeError( + f"impl_backward/impl_save_for_backward: " + f"the operator {self._qualname} already has an Autograd kernel " + f"registered to DispatchKey::{key} vi a pre-existing " + f"torch.library or TORCH_LIBRARY registration. Please either " + f"remove those registrations or don't use the torch._custom_ops APIs") + + def _check_doesnt_have_library_meta_impl(self): + if self._has_impl("abstract"): + return + + # If the user's operator is CompositeExplicitAutograd, + # allow them to impl_abstract. This is being pragmatic + # (existing custom ops may have CompositeExplicitAutograd + # registration that don't work with Meta kernels, so this + # gives them an escape hatch). + if ( + _C._dispatch_has_kernel_for_dispatch_key(self._qualname, "CompositeExplicitAutograd") + and not _C._dispatch_has_kernel_for_dispatch_key(self._qualname, "Meta") + ): + return + + # Otherwise, if the user's already has a Meta kernel or their + # op is CompositeImplicitAutograd or some other alias dispatch key, + # raise. + + # Special case for CompositeImplicitAutograd + if _C._dispatch_has_kernel_for_dispatch_key(self._qualname, "CompositeImplicitAutograd"): + raise RuntimeError( + f"impl_abstract(...): the operator {self._qualname} " + f"already has an implementation for this device type via a " + f"pre-existing registration to DispatchKey::CompositeImplicitAutograd." + f"CompositeImplicitAutograd operators do not need an abstract impl; " + f"instead, the operator will decompose into its constituents and those " + f"can have abstract impls defined on them.") + + if _C._dispatch_has_kernel_for_dispatch_key(self._qualname, "Meta"): + raise RuntimeError( + f"impl_abstract(...): the operator {self._qualname} " + f"already has an DispatchKey::Meta implementation via a " + f"pre-existing torch.library or TORCH_LIBRARY registration. " + f"Please either remove that registration or don't call impl_abstract.") + + # NOTE ["backward", "save_for_backward", and "autograd"] + # As a part of the explicit autograd API, a user must provide us + # a "save_for_backward" function and a "backward" function. + # When both of these have been provided, then we automatically + # construct the "autograd" kernel. 
+ def _register_autograd_kernel(self): + assert self._has_impl("backward") + assert self._has_impl("save_for_backward") + kernel = construct_autograd_kernel( + self._schema, + self._output_differentiability, + self, + get_op(self._qualname), + self._get_impl("save_for_backward").func, + self._get_impl("backward").func) + self._register_impl("autograd", kernel) + + def impl_save_for_backward(self, _stacklevel=2): + r"""Register a function that tells us what to save for backward. + + Please see impl_backward for more details. + """ + def inner(f): + self._check_can_register_backward() + self._check_doesnt_have_library_autograd_impl() + if not self._registered_autograd_kernel_indirection: + self._register_autograd_kernel_indirection() + self._register_impl("save_for_backward", f, stacklevel=_stacklevel) + if self._has_impl("backward"): + self._register_autograd_kernel() + return inner + + def impl_backward(self, output_differentiability=None, _stacklevel=2): + r"""Registers a backward formula. + + WARNING: if you're a user, please do not use this directly + (instead use the torch._custom_ops APIs). + Also please see the following for a detailed guide on custom ops. + https://docs.google.com/document/d/1aGWtgxV3HppuxQAdddyPrs74_aEntpkYt9MalnCKnhk + + In order for the CustomOp to work with autograd, you need to register + a backward formula. There are two pieces to this: + 1. You must give us a function to specify what to save for backward. + Call this the "save for backward" function. + 2. You must give us a function that computes gradients. Call this the + "backward" function. + + Use `impl_save_for_backward` to define a "save for backward" function + that specifies what gets saved for backward. The function should accept + two arguments ``(inputs, output)`` and return the quantities to be saved + for backward. + + During runtime, when you call the CustomOp, PyTorch will invoke the + "save for backward" function with the inputs and output of the CustomOp. + + Use `impl_backward` to define the "backward" function. The backward + function must accept ``(ctx, saved, *grads)``: + - ``ctx`` is a context object where we may provide information + - ``saved`` is exactly what gets returned from the "save for backward" + function + - ``grads`` is one or more gradients. The number of gradients matches + the number of outputs of the CustomOp. + + The backward function must return a dict that maps the name of + an input to the CustomOp to its corresponding gradient. All inputs that + were declared to be Tensors in the CustomOp definition must be accounted + for in the dict. The gradient may be a Tensor or None. 
+ + """ + if output_differentiability is not None: + def yell(): + raise RuntimeError( + f"impl_backward(output_differentiability): expected " + f"output_differentiability to be a list of bools with " + f"length equal to the number of outputs of this CustomOp " + f"got: {output_differentiability}") + + if not isinstance(output_differentiability, list): + yell() + for diff in output_differentiability: + if not isinstance(diff, bool): + yell() + if len(self._schema.returns) != len(output_differentiability): + yell() + + def inner(f): + self._check_can_register_backward() + self._check_doesnt_have_library_autograd_impl() + if not self._registered_autograd_kernel_indirection: + self._register_autograd_kernel_indirection() + self._register_impl("backward", f, stacklevel=_stacklevel) + self._output_differentiability = output_differentiability + if self._has_impl("save_for_backward"): + self._register_autograd_kernel() + return inner + + +@dataclasses.dataclass +class FuncAndLocation: + func: typing.Callable + location: str + + +def find_ophandle_or_throw(cpp_ns: str, operator_name: OperatorName): + overload_name = ( + "" if operator_name.overload_name is None else operator_name.overload_name + ) + return _C._dispatch_find_schema_or_throw( + f"{cpp_ns}::{str(operator_name.name)}", overload_name + ) + + +def validate_namespace(ns: str) -> None: + if "." in ns: + raise ValueError( + f'custom_op(..., ns="{ns}"): expected ns to not contain any . (and be a ' + f"valid variable name)" + ) + if ns in RESERVED_NS: + raise ValueError( + f"custom_op(..., ns='{ns}'): '{ns}' is a reserved namespace, " + f"please choose something else. " + ) + +def validate_schema(schema: FunctionSchema) -> None: + if not torch._library.utils.is_functional_schema(schema): + raise ValueError( + f"custom_op only supports functional operators " + f"(ops that do not mutate any inputs, do not return " + f"views of the inputs, and has at least one return). " + f"Got the following non-functional schema: {schema}" + ) + + # For simplicity: don't allow self arguments + if schema.arguments.self_arg is not None: + raise ValueError( + f"custom_op does not support arguments named 'self'. Please " + f"rename your argument. Got: {schema}" + ) + + +def parse_qualname(qualname: str) -> typing.Tuple[str, str]: + names = qualname.split("::", 1) + if len(names) != 2: + raise ValueError(f"Expected there to be a namespace in {qualname}, i.e. The " + f"operator name should look something like ns::foo") + if '.' in names[1]: + raise ValueError(f"The torch.custom_ops APIs do not handle overloads, " + f"i.e. operator names with '.' in them. " + f"Please name your operator something like ns::foo. " + f"Got: {qualname}") + return names[0], names[1] + + +def validate_device_type(device_type: str) -> None: + if device_type not in SUPPORTED_DEVICE_TYPE_TO_KEY: + raise ValueError( + f"CustomOp.impl(device_types=[{device_type}, ...]): we only support device_type " + f"in {SUPPORTED_DEVICE_TYPE_TO_KEY.keys()}." + ) + + +def supported_param(param: inspect.Parameter) -> bool: + return param.kind in ( + inspect.Parameter.POSITIONAL_OR_KEYWORD, + inspect.Parameter.KEYWORD_ONLY, + ) + + +def validate_function_matches_schema( + schema: FunctionSchema, func: typing.Callable +) -> None: + sig = inspect.signature(func) + + if not all(supported_param(p) for _, p in sig.parameters.items()): + raise ValueError( + f"custom_op(..., manual_schema)(func): positional-only args, " + f"varargs, and kwargs are not supported. Please rewrite `func` " + f"to not have them. 
Got `func` with signature: {sig}" + ) + + if ( + any( + p.annotation is not inspect.Parameter.empty + for _, p in sig.parameters.items() + ) + or sig.return_annotation is not inspect.Signature.empty + ): + raise ValueError( + f"custom_op(..., manual_schema)(func): When passing in a manual " + f"schema, we expect `func` to have no type annotations to avoid " + f"ambiguity. Got `func` with signature: {sig}" + ) + + positional = [ + (name, param) + for name, param in sig.parameters.items() + if param.kind == inspect.Parameter.POSITIONAL_OR_KEYWORD + ] + kwargonly = [ + (name, param) + for name, param in sig.parameters.items() + if param.kind == inspect.Parameter.KEYWORD_ONLY + ] + + def error(): + raise ValueError( + f"custom_op(..., manual_schema)(func): When passing in a manual " + f"schema, we expect `func`'s signature to match `manual_schema` " + f"(aside from type annotations). " + f"func's signature: {sig}, manual_schema: {schema}" + ) + + def error_default_args(): + raise ValueError( + f"custom_op(..., manual_schema)(func): " + f"neither func nor manual_schema should have default " + f"arguments. Got " + f"func's signature: {sig}, manual_schema: {schema}" + ) + + def compare(sig_args, schema_args): + if len(sig_args) != len(schema_args): + error() + for (name, param), arg in zip(sig_args, schema_args): + if name != arg.name: + error() + if param.default is not inspect.Parameter.empty or arg.default is not None: + error_default_args() + + compare(positional, schema.arguments.flat_positional) + compare(kwargonly, schema.arguments.flat_kwarg_only) + + +def infer_schema(prototype_function: typing.Callable) -> str: + sig = inspect.signature(prototype_function) + + def error_fn(what): + raise ValueError( + f"custom_op(...)(func): {what} " f"Got func with signature {sig})" + ) + + params = [ + parse_param(name, param, error_fn) for name, param in sig.parameters.items() + ] + ret = parse_return(sig.return_annotation, error_fn) + return f"({', '.join(params)}) -> {ret}" + + +def parse_param(name, param, error_fn): + if not supported_param(param): + error_fn("We do not support positional-only args, varargs, or varkwargs.") + + if param.annotation is inspect.Parameter.empty: + error_fn(f"Parameter {name} must have a type annotation.") + + if param.annotation not in SUPPORTED_PARAM_TYPES.keys(): + error_fn( + f"Parameter {name} has unsupported type {param.annotation}. " + f"The valid types are: {SUPPORTED_PARAM_TYPES.keys()}." + ) + + if param.default is not inspect.Parameter.empty: + error_fn( + f"Parameter {name} has a default value; this is not supported. " + f"If you want to use default values then create a function with " + f"default values that calls the CustomOp" + ) + + return f"{SUPPORTED_PARAM_TYPES[param.annotation]} {name}" + + +def derived_types( + base_type, cpp_type, list_base, optional_base_list, optional_list_base +): + result = [ + (base_type, cpp_type), + (typing.Optional[base_type], f"{cpp_type}?"), + ] + if list_base: + result.append((typing.Sequence[base_type], f"{cpp_type}[]")) # type: ignore[valid-type] + if optional_base_list: + result.append((typing.Sequence[typing.Optional[base_type]], f"{cpp_type}?[]")) # type: ignore[valid-type] + if optional_list_base: + result.append((typing.Optional[typing.Sequence[base_type]], f"{cpp_type}[]?")) # type: ignore[valid-type] + return result + + +def get_supported_param_types(): + data = [ + # (python type, schema type, type[] variant, type?[] variant, type[]? 
variant + (torch.Tensor, "Tensor", True, True, False), + (int, "SymInt", True, False, True), + (float, "float", True, False, True), + (bool, "bool", True, False, True), + (str, "str", False, False, False), + (torch.types.Number, "Scalar", True, False, False), + (torch.dtype, "ScalarType", False, False, False), + (torch.device, "Device", False, False, False), + ] + result = [] + for line in data: + result.extend(derived_types(*line)) + return dict(result) + + +SUPPORTED_RETURN_TYPES = { + torch.Tensor: "Tensor", + typing.List[torch.Tensor]: "Tensor[]", + int: "SymInt", + float: "float", + bool: "bool", + torch.types.Number: "Scalar", +} + + +def parse_return(annotation, error_fn): + origin = typing.get_origin(annotation) + if origin is not tuple: + if annotation not in SUPPORTED_RETURN_TYPES.keys(): + error_fn( + f"Return has unsupported type {annotation}. " + f"The valid types are: {SUPPORTED_RETURN_TYPES}." + ) + return SUPPORTED_RETURN_TYPES[annotation] + + args = typing.get_args(annotation) + for arg in args: + if arg not in SUPPORTED_RETURN_TYPES: + error_fn( + f"Return has unsupported type {annotation}. " + f"The valid types are: {SUPPORTED_RETURN_TYPES}." + ) + + return "(" + ", ".join([SUPPORTED_RETURN_TYPES[arg] for arg in args]) + ")" + + +SUPPORTED_PARAM_TYPES = get_supported_param_types() + + +def report_error_callback(custom_op: typing.Any, key: str) -> None: + if key == "Undefined": + raise NotImplementedError( + f"{custom_op}: There were no Tensor inputs to this operator " + f"(e.g. you passed an empty list of Tensors). If your operator is a " + f"factory function (that is, it takes no Tensors and constructs " + f"a new one), then please use CustomOp.impl_factory to register " + f"an implementation for it" + ) + if key == "Meta": + raise NotImplementedError( + f"{custom_op}: when running with device='Meta' tensors: there is no " + f"abstract impl registered for this CustomOp. Please register one via " + f"CustomOp.impl_abstract to get this CustomOp to work with Meta tensors" + ) + if key in ("CPU", "CUDA"): + device = key.lower() + raise NotImplementedError( + f"{custom_op}: when running with device='{device}' tensors: there is no " + f"{device} impl registered for this CustomOp. Please register one via " + f"CustomOp.impl(device_type='{device}')" + ) + raise NotImplementedError( + f"{custom_op}: No implementation for dispatch key {key}. It is likely " + f"that we have not added this functionality yet, please either open an " + f"issue or if you're feeling adventurous, use the low-level " + f"torch.library API" + ) + + +def custom_op_from_existing(op): + ns = op.namespace + lib = torch.library.Library(ns, "FRAGMENT") + name = op.name().split("::")[-1] + schema_str = str(op._schema) + # CustomOp expects the schema string without the namespace + schema_str = schema_str.split("::")[-1] + schema = FunctionSchema.parse(schema_str) + return CustomOp(lib, ns, schema, name, op, _private_access=True) + + +def get_op(qualname): + def error_not_found(): + raise ValueError( + f"Could not find the operator {qualname}. 
Please make sure you have " + f"already registered the operator and (if registered from C++) " + f"loaded it via torch.ops.load_library.") + + ns, name = parse_qualname(qualname) + if not hasattr(torch.ops, ns): + error_not_found() + opnamespace = getattr(torch.ops, ns) + if not hasattr(opnamespace, name): + error_not_found() + packet = getattr(opnamespace, name) + if not hasattr(packet, 'default'): + error_not_found() + return packet.default + + +def _find_custom_op(qualname, also_check_torch_library=False): + if qualname in global_registry: + return global_registry[qualname] + if not also_check_torch_library: + raise RuntimeError( + f"Could not find custom op \"{qualname}\". Did you register it via " + f"the torch._custom_ops API?") + overload = get_op(qualname) + result = custom_op_from_existing(overload) + return result + + +def get_abstract_impl(qualname): + if qualname not in torch._custom_op.impl.global_registry: + return None + custom_op = torch._custom_op.impl.global_registry[qualname] + if custom_op is None: + return None + if not custom_op._has_impl("abstract"): + return None + return custom_op._get_impl("abstract").func + + +def _custom_op_with_schema(qualname, schema, needs_fixed_stride_order=True): + ns, name = qualname.split("::") + schema_str = f"{name}{schema}" + function_schema = FunctionSchema.parse(schema_str) + validate_schema(function_schema) + tags = [torch._C.Tag.needs_fixed_stride_order] if needs_fixed_stride_order else [] + lib = library.Library(ns, "FRAGMENT") + lib.define(schema_str, tags=tags) + ophandle = find_ophandle_or_throw(ns, function_schema.name) + result = CustomOp(lib, ns, function_schema, name, ophandle, _private_access=True) + result._register_autograd_kernel_indirection() + + torch._C._dispatch_set_report_error_callback( + ophandle, functools.partial(report_error_callback, weakref.proxy(result)) + ) + return get_op(qualname) diff --git a/MLPY/Lib/site-packages/torch/_custom_ops.py b/MLPY/Lib/site-packages/torch/_custom_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..77d01d0d23f87632a6e2499764140b5815193cae --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_custom_ops.py @@ -0,0 +1,322 @@ +import inspect + +from torch._custom_op.impl import ( + _custom_op_with_schema, + _find_custom_op, + infer_schema, + parse_qualname, + validate_namespace, +) +from torch.library import get_ctx + +__all__ = [ + "custom_op", + "impl", + "impl_abstract", + "get_ctx", + "impl_save_for_backward", + "impl_backward", +] + + +def custom_op(qualname, func_or_schema=None): + r"""Register a new custom operator + + In PyTorch, defining an op (short for "operator") is a two step-process: + - we need to define the op (by providing an operator name and schema) + - we need to implement behavior for how the operator interacts with + various PyTorch subsystems, like CPU/CUDA Tensors, Autograd, etc. + + This entrypoint defines the custom operator (the first step) + you must then perform the second step by calling various + ``impl_*`` APIs. + + This API may be used as a decorator (see examples). + + For a detailed guide on custom ops, please see + https://docs.google.com/document/d/1aGWtgxV3HppuxQAdddyPrs74_aEntpkYt9MalnCKnhk + + Arguments: + qualname (str): Should be a string that looks like + "namespace::operator_name". Operators in PyTorch need a namespace to + avoid name collisions; a given operator may only be created once. + If you are writing a Python library, we recommend the namespace to + be the name of your top-level module. 
+ func_or_schema (Union[Callable, str]): Each PyTorch operator needs a + schema that tells PyTorch the types of the inputs/outputs. + If this is a Callable, we will automatically infer the schema from + the type annotations on the function (see examples). Otherwise, + if you don't want to use type annotations, you may provide us the + schema string. + + Example:: + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA) + >>> import torch + >>> import numpy as np + >>> from torch import Tensor + >>> + >>> # Step 1: define the custom op. + >>> # We need to provide the API a "prototype function" + >>> # (a function that returns NotImplementedError), from which + >>> # we will infer the types of the inputs and outputs. + >>> @torch._custom_ops.custom_op("mylibrary::numpy_sin") + >>> def numpy_sin(x: Tensor) -> Tensor: + >>> raise NotImplementedError() + >>> + >>> # The custom op is now accessible via the torch.ops module: + >>> torch.ops.mylibrary.numpy_sin + >>> + >>> # Step 2: Register an implementation for various PyTorch subsystems + >>> + >>> # Register an implementation for CPU tensors + >>> @torch._custom_ops.impl("mylibrary::numpy_sin", device_types="cpu") + >>> def numpy_sin_impl_cpu(x): + >>> return torch.from_numpy(np.sin(x.numpy())) + >>> + >>> # Register an implementation for CUDA tensors + >>> @torch._custom_ops.impl("mylibrary::numpy_sin", device_types="cuda") + >>> def numpy_sin_impl_cuda(x): + >>> return torch.from_numpy(np.sin(x.cpu().numpy())).to(x.device) + >>> + >>> x = torch.randn(3) + >>> torch.ops.mylibrary.numpy_sin(x) # calls numpy_sin_impl_cpu + >>> + >>> x_cuda = x.cuda() + >>> torch.ops.mylibrary.numpy_sin(x) # calls numpy_sin_impl_cuda + + """ + ns, name = parse_qualname(qualname) + validate_namespace(ns) + + def inner(func): + if not inspect.isfunction(func): + raise ValueError( + f"custom_op(...)(func): Expected `func` to be a Python " + f"function, got: {type(func)}" + ) + + if func.__name__ != name: + raise ValueError( + f"custom_op(qualname='{qualname}', ...)(func): expected `func` " + f"to have name '{name}' but got '{func.__name__}'. " + f"Please either change the name of `func` or the qualname that " + f"is passed to `custom_op`" + ) + + schema = infer_schema(func) + _custom_op_with_schema(qualname, schema) + return func + + if func_or_schema is None: + return inner + if isinstance(func_or_schema, str): + _custom_op_with_schema(qualname, func_or_schema) + else: + return inner(func_or_schema) + + +def impl(qualname, *, device_types=("cpu", "cuda"), func=None): + r"""Register an implementation for a device type for this custom op. + + If the op is passed multiple Tensor inputs with different device + types, it will dispatch to the registered implementation for the highest + priority device type among those present. + The supported device types, in order of priority, are {'cuda', 'cpu'}. + + This API may be used as a decorator (see examples). + + For a detailed guide on custom ops, please see + https://docs.google.com/document/d/1aGWtgxV3HppuxQAdddyPrs74_aEntpkYt9MalnCKnhk + + Arguments: + device_types (str or Iterable[str]): the device type(s) to register the function for. + + Example:: + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA) + >>> import torch + >>> import numpy as np + >>> from torch import Tensor + >>> + >>> # Step 1: define the custom op. + >>> # We need to provide the API a "prototype function" + >>> # (a function that returns NotImplementedError), from which + >>> # we will infer the types of the inputs and outputs. 
+ >>> @torch._custom_ops.custom_op("mylibrary::numpy_cos") + >>> def numpy_cos(x: Tensor) -> Tensor: + >>> raise NotImplementedError() + >>> + >>> # The custom op is now accessible via the torch.ops module: + >>> torch.ops.mylibrary.numpy_cos + >>> + >>> # Step 2: Register an implementation for various PyTorch subsystems + >>> + >>> # Register an implementation for CPU tensors + >>> @torch._custom_ops.impl("mylibrary::numpy_cos", device_types="cpu") + >>> def numpy_cos_impl_cpu(x): + >>> return torch.from_numpy(np.cos(x.numpy())) + >>> + >>> # Register an implementation for CUDA tensors + >>> @torch._custom_ops.impl("mylibrary::numpy_cos", device_types="cuda") + >>> def numpy_cos_impl_cuda(x): + >>> return torch.from_numpy(np.cos(x.cpu().numpy())).to(x.device) + >>> + >>> x = torch.randn(3) + >>> torch.ops.mylibrary.numpy_cos(x) # calls numpy_cos_impl_cpu + >>> + >>> x_cuda = x.cuda() + >>> torch.ops.mylibrary.numpy_cos(x) # calls numpy_cos_impl_cuda + + """ + + def inner(func): + custom_op = _find_custom_op(qualname, also_check_torch_library=True) + custom_op.impl(device_types, _stacklevel=3)(func) + return func + + if func is None: + return inner + return inner(func) + + +def impl_abstract(qualname, *, func=None): + r"""Register an abstract implementation for this operator. + + An "abstract implementation" specifies the behavior of this operator on + Tensors that carry no data. Given some input Tensors with certain properties + (sizes/strides/storage_offset/device), it specifies what the properties of + the output Tensors are. + + The abstract implementation has the same signature as the operator. + It is run for both FakeTensors and meta tensors. To write an abstract + implementation, assume that all Tensor inputs to the operator are + regular CPU/CUDA/Meta tensors, but they do not have storage, and + you are trying to return regular CPU/CUDA/Meta tensor(s) as output. + The abstract implementation must consist of only PyTorch operations + (and may not directly access the storage or data of any input or + intermediate Tensors). + + This API may be used as a decorator (see examples). + + For a detailed guide on custom ops, please see + https://docs.google.com/document/d/1aGWtgxV3HppuxQAdddyPrs74_aEntpkYt9MalnCKnhk + + Examples:: + >>> import numpy as np + >>> from torch import Tensor + >>> + >>> # Example 1: an operator without data-dependent output shape + >>> @torch._custom_ops.custom_op("mylibrary::custom_linear") + >>> def custom_linear(x: Tensor, weight: Tensor, bias: Tensor) -> Tensor: + >>> raise NotImplementedError() + >>> + >>> @torch._custom_ops.impl_abstract("mylibrary::custom_linear") + >>> def custom_linear_abstract(x, weight): + >>> assert x.dim() == 2 + >>> assert weight.dim() == 2 + >>> assert bias.dim() == 1 + >>> assert x.shape[1] == weight.shape[1] + >>> assert weight.shape[0] == bias.shape[0] + >>> assert x.device == weight.device + >>> + >>> return (x @ weight.t()) + bias + >>> + >>> # Example 2: an operator with data-dependent output shape + >>> @torch._custom_ops.custom_op('mylibrary::custom_nonzero') + >>> def custom_nonzero(x: Tensor) -> Tensor: + >>> ... + >>> + >>> @torch._custom_ops.impl_abstract("mylibrary::custom_nonzero") + >>> def custom_nonzero_abstract(x): + >>> # Number of nonzero-elements is data-dependent. + >>> # Since we cannot peek at the data in an abstract impl, + >>> # we use the ctx object to construct a new symint that + >>> # represents the data-dependent size. 
+ >>> ctx = torch._custom_ops.get_ctx() + >>> nnz = ctx.create_unbacked_symint() + >>> shape = [x.dim(), nnz] + >>> result = x.new_empty(shape, dtype=torch.long) + >>> return result + >>> + >>> @torch._custom_ops.impl("mylibrary::custom_nonzero") + >>> def custom_nonzero_impl(x): + >>> x_np = to_numpy(x) + >>> res = np.stack(np.nonzero(x_np), axis=1) + >>> # unbacked symbolic ints in PyTorch must be >= 2, so we + >>> # constrain the range to at least 2 + >>> if res.shape[0] <= 1: + >>> raise RuntimeError("not supported") + >>> return torch.tensor(res, device=x.device) + + """ + import torch.library + + return torch.library.impl_abstract(qualname, func, _stacklevel=2) + + +def impl_save_for_backward(qualname, *, func=None): + r"""Register a function that tells us what to save for backward. + + Please see :func:`impl_backward` for more details. + """ + + def inner(func): + custom_op = _find_custom_op(qualname, also_check_torch_library=True) + custom_op.impl_save_for_backward(_stacklevel=3)(func) + return func + + if func is None: + return inner + return inner(func) + + +def impl_backward(qualname, output_differentiability=None, *, func=None): + r"""Registers a backward formula for an operator. + + In order for an operator to work with autograd, you need to register + a backward formula. There are two pieces to this: + 1. You must give us a function to specify what to save for backward. + Call this the "save for backward" function. + 2. You must give us a function that computes gradients. Call this the + "backward" function. + + Use `impl_save_for_backward` to define a "save for backward" function + that specifies what gets saved for backward. The function should accept + two arguments ``(inputs, output)`` and return the quantities to be saved + for backward. + + During runtime, when you call the operator in a forwards pass, PyTorch + will invoke the "save for backward" function with the inputs and output + of the operator. + + Use `impl_backward` to define the "backward" function. The backward + function must accept ``(ctx, saved, *grads)``: + - ``ctx`` is a context object where we may provide information + - ``saved`` is exactly what gets returned from the "save for backward" + function + - ``grads`` is one or more gradients. The number of gradients matches + the number of outputs of the operator. + + The backward function must return a dict that maps the name of + an input to the operator to its corresponding gradient. All inputs that + were declared to be Tensors in the operator definition must be accounted + for in the dict. The gradient may be a Tensor or None. + + For a detailed guide on custom ops, please see + https://docs.google.com/document/d/1aGWtgxV3HppuxQAdddyPrs74_aEntpkYt9MalnCKnhk + + """ + + def inner(func): + custom_op = _find_custom_op(qualname, also_check_torch_library=True) + custom_op.impl_backward(output_differentiability, _stacklevel=3)(func) + return func + + if func is None: + return inner + return inner(func) + + +def _destroy(qualname): + """De-registers a custom op. 
For testing purposes only""" + custom_op = _find_custom_op(qualname) + custom_op._destroy() diff --git a/MLPY/Lib/site-packages/torch/_decomp/__init__.py b/MLPY/Lib/site-packages/torch/_decomp/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..361ad0bc40e1c1fa6f5a8cb4959ed6083a5bd639 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_decomp/__init__.py @@ -0,0 +1,463 @@ +import inspect +from collections import defaultdict +from functools import wraps +from itertools import chain +from typing import Callable, Dict, List, Sequence, Union + +import torch +import torch.library +from torch._ops import HigherOrderOperator, OpOverload, OpOverloadPacket +from torch._prims_common import CustomOutParamAnnotation +from torch.utils import _pytree as pytree + +__all__ = [ + "decomposition_table", + "pre_autograd_decomposition_table", + "meta_table", + "register_decomposition", + "get_decompositions", + "core_aten_decompositions", +] + + +# TODO: relax key type here; torch registrations should be possible to; but +# right now this type is accurate +global_decomposition_table: Dict[ + str, Dict[torch._ops.OperatorBase, Callable] +] = defaultdict(dict) + +decomposition_table = global_decomposition_table["post_autograd"] +pre_autograd_decomposition_table = global_decomposition_table["pre_autograd"] +meta_table = global_decomposition_table["meta"] + + +def _add_op_to_registry(registry, op, fn): + """ + This is an internal API for adding an op to the decomposition table. + + If op is OpOverload, it will be added to the registry directly. + If op is OpOverloadPacket, all the valid op_overloads in the packet will be added to the registry. + """ + overloads: List[Union[torch._ops.OperatorBase]] = [] + if isinstance(op, HigherOrderOperator): + # There's no concept of overloads for HigherOrderOperator + registry[op] = fn + return + elif isinstance(op, OpOverload): + overloads.append(op) + else: + assert isinstance(op, OpOverloadPacket) + for ol in op.overloads(): + overloads.append(getattr(op, ol)) + + for op_overload in overloads: + if op_overload in registry: + raise RuntimeError(f"duplicate registrations for {op_overload}") + # TorchScript dumps a bunch of extra nonsense overloads + # which don't have corresponding dispatcher entries, we need + # to filter those out, e.g aten.add.float_int + if torch._C._dispatch_has_kernel(op_overload.name()): + registry[op_overload] = fn + + +def _convert_out_params(f): + out_annotation = f.__annotations__.get("out") + + # If there are no out params, do not wrap the function. + if not out_annotation: + return f + + # Hack to detect when out is a Tuple. 
There seems to be no pretty way of doing this + if getattr(out_annotation, "__origin__", None) is tuple: + sig = inspect.signature(f) + out_names = sig.return_annotation._fields + # If out is a tuple, we need to register a function that unpacks all the out + # elements as this is what native_functions.yaml expects + + @wraps(f) + def _fn(*args, **kwargs): + out_kwargs = tuple(kwargs.pop(o, None) for o in out_names) + # Either all of the out kwargs are set or none of them + is_none = out_kwargs[0] is None + assert all((o is None) == is_none for o in out_kwargs) + return f(*args, **kwargs, out=None if is_none else out_kwargs) + + out_params = [ + inspect.Parameter( + o, + kind=inspect.Parameter.KEYWORD_ONLY, + default=None, + annotation=t, + ) + for o, t in zip(out_names, out_annotation.__args__) + ] + # Drop the out parameter and concatenate the new kwargs in the signature + params = chain((v for k, v in sig.parameters.items() if k != "out"), out_params) + _fn.__signature__ = inspect.Signature( # type: ignore[attr-defined] + parameters=params, return_annotation=sig.return_annotation # type: ignore[arg-type] + ) + # Drop the out parameter and concatenate the new kwargs in the annotations + _fn.__annotations__ = {k: v for k, v in f.__annotations__.items() if k != "out"} + for o in out_params: + _fn.__annotations__[o.name] = o.annotation + + # Propagate that this function is wrapped by `out_wrapper` + _fn._torch_decompositions_out_wrapper = f._torch_decompositions_out_wrapper # type: ignore[attr-defined] + + return _fn + + # Alternatively, there may be a single tensor out parameter with a name + # other than "out". This will need special treatment and is indicated by an + # annotation, which we will remove here so it is not exposed after wrapping. + custom_out_param_name = f.__annotations__.pop(CustomOutParamAnnotation, None) + if custom_out_param_name: + + @wraps(f) + def _fn(*args, **kwargs): + out_kwarg = kwargs.pop(custom_out_param_name, None) + return f(*args, **kwargs, out=out_kwarg) + + out_param = inspect.Parameter( + custom_out_param_name, + kind=inspect.Parameter.KEYWORD_ONLY, + default=None, + annotation=out_annotation, + ) + + # Drop the out parameter and concatenate the new kwarg in the signature + sig = inspect.signature(f) + params = chain( + (v for k, v in sig.parameters.items() if k != "out"), (out_param,) + ) + _fn.__signature__ = inspect.Signature( # type: ignore[attr-defined] + parameters=params, return_annotation=sig.return_annotation # type: ignore[arg-type] + ) + + # Drop the out parameter and concatenate the new kwargs in the annotations + _fn.__annotations__ = {k: v for k, v in f.__annotations__.items() if k != "out"} + _fn.__annotations__[out_param.name] = out_param.annotation + + return _fn + + return f + + +def register_decomposition( + aten_op, registry=None, *, type="post_autograd", unsafe=False +): + """ + A decorator to register a function as a decomposition to the Python + decomposition table. Use it like this:: + + @register_decomposition(torch.ops.aten.clamp_min) + def clamp_min(x): + return torch.clamp(self, min=min) + + If you are writing a new decomposition, consider contributing it + directly to PyTorch in torch._decomp.decompositions. + + This API is experimental; we are almost certainly going to extend + the API when we make decompositions eligible for use in transforms (e.g., + autograd) and not just backend tracing, where we then need to know if a + decomposition can be used to simulate a transform. 
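
A minimal runnable sketch of the decorator described above, registering into a private table via the `registry` argument shown in the signature so that nothing is added to the global decomposition table; the choice of `aten.clamp_min` and the sample values are illustrative only:

import torch
from torch._decomp import register_decomposition

my_table = {}  # private registry, keyed by OpOverload

@register_decomposition(torch.ops.aten.clamp_min, registry=my_table)
def clamp_min(x, min):
    return torch.clamp(x, min=min)

# The OpOverloadPacket is expanded to every clamp_min overload that has a
# dispatcher kernel, so the table now holds e.g. aten.clamp_min.default.
x = torch.tensor([-1.0, 0.5, 2.0])
print(my_table[torch.ops.aten.clamp_min.default](x, 0.0))  # negatives clamped to 0.0
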
+ + By default, we also will register it to the Meta key of dispatcher, + and replace the c++ Meta implementation if there is already one. + + unsafe kwarg is for reuse of this function for registering non-function + things + """ + + assert type in {"post_autograd", "pre_autograd", "meta"} + + def decomposition_decorator(fn: Callable) -> Callable: + orig_fn = fn + if not unsafe: + fn = _convert_out_params(fn) + + nonlocal registry + if registry is None: + registry = global_decomposition_table[type] + + def register(op): + _add_op_to_registry(registry, op, fn) + + # To handle allowing multiple aten_ops at once + pytree.tree_map_(register, aten_op) + return orig_fn + + return decomposition_decorator + + +def get_decompositions( + aten_ops: Sequence[Union[torch._ops.OperatorBase, OpOverloadPacket]], + type: str = "post_autograd", +) -> Dict[torch._ops.OperatorBase, Callable]: + """ + Retrieve a dictionary of decompositions corresponding to the list of + operator overloads and overload packets passed as input. Overload + packets will include all decomposed overloads in the packet. If there is + no decomposition for a requested operator, it is silently ignored. + + This API is experimental; we are almost certainly going to give an alternate, + more recommended formulation, where a user provides the set of operators + they know how to implement, and we provide decompositions for everything + not in this set. + """ + assert type in {"post_autograd", "pre_autograd", "meta"} + + registry = global_decomposition_table[type] + packets_to_overloads = defaultdict(list) + for opo in registry: + if isinstance(opo, (OpOverload, OpOverloadPacket)): + packets_to_overloads[opo.overloadpacket].append(opo) + decompositions: Dict[torch._ops.OperatorBase, Callable] = {} + for op in aten_ops: + if isinstance(op, OpOverloadPacket) and op in packets_to_overloads: + for op_overload in packets_to_overloads[op]: + decompositions[op_overload] = registry[op_overload] + elif isinstance(op, (torch._ops.OperatorBase)) and op in registry: + decompositions[op] = registry[op] + return decompositions + + +def remove_decompositions( + decompositions: Dict[torch._ops.OperatorBase, Callable], + aten_ops: Sequence[Union[OpOverload, OpOverloadPacket]], +) -> None: + """ + Given a dictionary of decompositions obtained from get_decompositions(), removes + operators associated with a list of operator overloads and overload packets passed + as input. If the decomposition dictionary does not contain a decomposition that is + specified to be removed, it is silently ignored. 
+ """ + for op in aten_ops: + if isinstance(op, OpOverloadPacket): + for overload_name in op.overloads(): + opo = getattr(op, overload_name) + decompositions.pop(opo, None) + elif isinstance(op, OpOverload): + decompositions.pop(op, None) + + +# populate the table +import torch._decomp.decompositions +import torch._refs + + +# See NOTE [Core ATen Ops] +# +# list was copied from torch/_inductor/decomposition.py +# excluding decompositions that results in prim ops +# Resulting opset of decomposition is core aten ops +def core_aten_decompositions() -> Dict[torch._ops.OperatorBase, Callable]: + aten = torch.ops.aten + return get_decompositions( + [ + aten.addcdiv, + aten.addcdiv_, + aten.addcmul, + aten.addcmul_, + aten.addr, + aten.affine_grid_generator, + aten.all, + aten.aminmax, + aten.arange.default, + aten.arange.start, + aten.avg_pool2d_backward, + aten.baddbmm, + aten.binary_cross_entropy, + aten.binary_cross_entropy_backward, + aten.binary_cross_entropy_with_logits, + aten.block_diag, + aten.celu, + aten.celu_, + aten.clamp_max, + aten.clamp_min, + aten.col2im, + aten.count_nonzero, + aten.linalg_cross, + aten.cudnn_batch_norm, + aten.cudnn_batch_norm_backward, + aten.deg2rad, + aten.deg2rad_, + aten.detach, + aten.diag_embed, + aten.diagonal_backward, + aten.dot, + aten.vdot, + aten.elu, + aten.elu_, + aten.elu_backward, + aten._embedding_bag, + aten.embedding_dense_backward, + aten.empty_like, + aten._euclidean_dist.default, + aten.expand_as, + aten.eye, + aten.fill, + aten.fill_, + aten.floor_divide, + aten.frac, + aten.frac_, + aten._fused_moving_avg_obs_fq_helper, + aten.gelu_, + aten.gelu_backward, + aten.glu, + aten.glu_backward, + aten.hardshrink, + aten.hardsigmoid, + aten.hardsigmoid_, + aten.hardsigmoid_backward, + aten.hardswish, + aten.hardswish_, + aten.hardswish_backward, + aten.hardtanh_, + aten.hardtanh_backward, + aten.heaviside, + aten.heaviside_, + aten.huber_loss, + aten.huber_loss_backward, + aten.im2col, + aten.index_add, + aten.index_add_, + aten.index_copy, + aten.index_copy_, + aten.index_fill, + aten.index_fill_, + aten.isin, + aten.isneginf, + aten.isposinf, + aten.l1_loss, + aten._lazy_clone, + aten._test_parallel_materialize, + aten.leaky_relu_, + aten.leaky_relu_backward, + aten.lerp, + aten.lerp_, + aten.linspace, + aten.logaddexp, + aten.logaddexp2, + aten.logit, + aten.logit_, + aten.logit_backward, + aten.log_sigmoid_backward, + aten.log_sigmoid_forward, + aten._log_softmax_backward_data, + aten.logspace, + aten.logsumexp.default, + aten.masked_fill, + aten.masked_fill_, + aten.mish, + aten.mish_, + aten.mse_loss, + aten.mse_loss_backward, + aten.multi_margin_loss, + aten.multilabel_margin_loss_forward, + aten.mv, + aten.mvlgamma, + aten.mvlgamma_, + aten.nansum, + aten.nan_to_num, + aten.nan_to_num_, + aten.narrow, + aten.native_batch_norm_backward, + aten.native_dropout_backward, + aten.native_group_norm_backward, + aten.native_layer_norm_backward, + aten.new_empty, + aten.new_full, + aten.new_ones, + aten.new_zeros, + aten.nll_loss_backward, + aten.nll_loss_forward, + aten.norm, + aten.ones, + aten.ones_like, + aten.pixel_shuffle, + aten.pixel_unshuffle, + aten._prelu_kernel, + aten._prelu_kernel_backward, + aten._reshape_alias, + aten.rad2deg, + aten.rad2deg_, + aten.reflection_pad1d, + aten.reflection_pad2d, + aten.reflection_pad3d, + aten.replication_pad1d, + aten.replication_pad2d, + aten.replication_pad3d, + aten.renorm, + aten.renorm_, + aten.replication_pad2d, + aten.roll, + aten.rot90, + aten.rrelu_with_noise, + aten.rrelu_with_noise_, + 
aten.rsub, + aten._scaled_dot_product_flash_attention_for_cpu.default, + aten.select_backward, + aten.select_scatter, + aten.sgn, + aten.sgn_, + aten.sigmoid_backward, + aten.silu, + aten.silu_, + aten.silu_backward, + aten.sinc, + aten.sinc_, + aten.slice_backward, + aten.smooth_l1_loss, + aten.smooth_l1_loss_backward, + aten.soft_margin_loss, + aten.soft_margin_loss_backward, + aten._softmax_backward_data, + aten.softplus, + aten.softplus_backward, + aten.softshrink, + aten.special_entr, + aten.special_log_ndtr, + aten.special_xlog1py, + aten.split.Tensor, + aten.split_with_sizes_copy, + aten.squeeze.default, + aten.squeeze.dim, + aten.std, + aten.std_mean, + aten.stack, + aten.sum.default, + aten.sum.out, + aten.t, + aten.take, + aten.tanh_backward, + aten.threshold, + aten.threshold_, + aten.threshold_backward, + aten.trace, + aten.transpose.int, + aten.tril, + aten.tril_, + aten.triu, + aten.triu_, + aten.unbind, + aten.unfold_backward, + aten.unfold_copy, + aten._unsafe_index, + aten.unsafe_split.Tensor, + aten.unsafe_split_with_sizes, + aten._unsafe_view, + aten.upsample_linear1d, + aten.upsample_bilinear2d, + aten.upsample_trilinear3d, + aten.upsample_nearest2d_backward, + aten.view_as_complex, + aten.xlogy, + aten.xlogy_, + aten.zero, + aten.zero_, + aten.zeros, + aten.zeros_like, + aten._chunk_cat, + aten._weight_norm_interface, + ] + ) diff --git a/MLPY/Lib/site-packages/torch/_decomp/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_decomp/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1d5083628a1709afeefd70ef32d4fe39dc50de62 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_decomp/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_decomp/__pycache__/decompositions.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_decomp/__pycache__/decompositions.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..96a892bcb3d2194c802567f1b3d99037cadfd481 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_decomp/__pycache__/decompositions.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_decomp/__pycache__/decompositions_for_jvp.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_decomp/__pycache__/decompositions_for_jvp.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1c625285fe7a4dbde286712829cb3f89138f950b Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_decomp/__pycache__/decompositions_for_jvp.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_decomp/__pycache__/decompositions_for_rng.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_decomp/__pycache__/decompositions_for_rng.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d509e239e7441a2032b870b82e1f206c7d3fb16b Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_decomp/__pycache__/decompositions_for_rng.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_decomp/decompositions.py b/MLPY/Lib/site-packages/torch/_decomp/decompositions.py new file mode 100644 index 0000000000000000000000000000000000000000..c6b39a7b82004cd795ede252139422488d0d019a --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_decomp/decompositions.py @@ -0,0 +1,4659 @@ +import functools +import numbers +import operator +import sys +from enum import Enum +from functools import partial, reduce +from itertools import chain, product +from typing import Any, Callable, cast, Iterable, List, Optional, 
Tuple, Union + +import torch +import torch._prims as prims +import torch._prims_common as utils +import torch.nn.functional as F +from torch import sym_float, sym_int, Tensor +from torch._decomp import register_decomposition +from torch._higher_order_ops.out_dtype import out_dtype +from torch._prims_common import IntLike, NumberType, TensorLike, TensorSequenceType +from torch._prims_common.wrappers import ( + _maybe_convert_to_dtype, + _maybe_resize_out, + _safe_copy_out, + out_wrapper, +) +from torch.utils import _pytree as pytree +from torch.utils._pytree import tree_map + +DispatchKey = torch._C.DispatchKey # type: ignore[attr-defined] + +# None of these functions are publicly accessible; get at them +# from torch._decomps +__all__: List[str] = [] + +aten = torch._ops.ops.aten + + +class Reduction(Enum): + NONE = 0 + MEAN = 1 + SUM = 2 + + +# This wraps a decomposition and performs various type promotion logic within it, depending on the strategy provided +# We're currently re-using ELEMENTWISE_TYPE_PROMOTION_KIND, although some of the usages are on non-elementwise ops +# Will need to validate the non-elementwise uses +def type_casts( + f: Callable, + type_promotion: utils.ELEMENTWISE_TYPE_PROMOTION_KIND, + compute_dtype_only: bool = False, +): + @functools.wraps(f) + def inner(*args, **kwargs): + flat_args = [ + x for x in pytree.arg_tree_leaves(*args, **kwargs) if isinstance(x, Tensor) + ] + computation_dtype, result_dtype = utils.elementwise_dtypes( + *flat_args, type_promotion_kind=type_promotion + ) + + # TODO: pretty sure this is not quite right + def increase_prec(x): + if isinstance(x, Tensor): + return x.to(computation_dtype) + else: + return x + + def decrease_prec(x): + if isinstance(x, Tensor): + return x.to(result_dtype) + else: + return x + + r = f(*tree_map(increase_prec, args), **tree_map(increase_prec, kwargs)) + if compute_dtype_only: + return r + else: + return tree_map(decrease_prec, r) + + return inner + + +compute_only_pw_cast_for_opmath = partial( + type_casts, + type_promotion=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, + compute_dtype_only=True, +) +pw_cast_for_opmath = partial( + type_casts, type_promotion=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT +) +pw_cast_for_int_to_real = partial( + type_casts, type_promotion=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT +) + + +# This expands x until x.dim() == dim. 
Might be useful as an operator +def _unsqueeze_to_dim(x: Tensor, dim: int) -> Tensor: + for _ in range(dim - x.dim()): + x = x.unsqueeze(-1) + return x + + +@register_decomposition(aten.tanh_backward) +@out_wrapper("grad_input") +@pw_cast_for_opmath +def tanh_backward(out_grad: Tensor, y: Tensor): + return out_grad * (1 - y * y).conj_physical() + + +@register_decomposition(aten.sigmoid_backward) +@out_wrapper("grad_input") +@pw_cast_for_opmath +def sigmoid_backward(out_grad: Tensor, y: Tensor): + return out_grad * (y * (1 - y)).conj_physical() + + +@register_decomposition(aten.softplus_backward) +@out_wrapper("grad_input") +@pw_cast_for_opmath +def softplus_backward(out_grad: Tensor, x: Tensor, beta: float, threshold: float): + z = (x * beta).exp() + return torch.where((x * beta) > threshold, out_grad, out_grad * z / (z + 1.0)) + + +@register_decomposition(aten.elu_backward) +@out_wrapper("grad_input") +@pw_cast_for_opmath +def elu_backward( + grad_output: Tensor, + alpha: float, + scale: float, + input_scale: float, + is_result: bool, + self_or_result: Tensor, +): + negcoef = alpha * scale + poscoef = scale + negiptcoef = input_scale + if is_result: + return torch.where( + self_or_result <= 0, + grad_output * negiptcoef * (self_or_result + negcoef), + grad_output * poscoef, + ) + else: + return torch.where( + self_or_result <= 0, + grad_output * negiptcoef * negcoef * torch.exp(self_or_result * negiptcoef), + grad_output * poscoef, + ) + + +@register_decomposition([aten.fill.Scalar]) +def fill_scalar(self, value): + return torch.full_like(self, value) + + +@register_decomposition([aten.fill.Tensor]) +def fill_tensor(self, value: Tensor): + torch._check( + value.dim() == 0, + lambda: f"fill only supports 0-dimension value tensor but got tensor with {value.dim()} dimensions", + ) + return aten.copy(self, value) + + +@register_decomposition(aten.hardsigmoid) +@out_wrapper() +@pw_cast_for_opmath +def hardsigmoid(self: Tensor) -> Tensor: + return torch.clamp(torch.clamp(self + 3, min=0), max=6) / 6 + + +@register_decomposition(aten.hardsigmoid_backward) +@out_wrapper("grad_input") +@pw_cast_for_opmath +def hardsigmoid_backward(grad_output: Tensor, self: Tensor): + return torch.where( + (self > -3.0) & (self < 3.0), + grad_output * (1.0 / 6.0), + 0.0, + ) + + +@register_decomposition(aten.hardtanh_backward) +@out_wrapper("grad_input") +def hardtanh_backward( + grad_output: Tensor, self: Tensor, min_val: float, max_val: float +): + return torch.where((self <= min_val) | (self >= max_val), 0.0, grad_output) + + +@register_decomposition(aten.hardswish) +@out_wrapper() +@pw_cast_for_opmath +def hardswish(self: Tensor) -> Tensor: + return self * torch.clamp(torch.clamp(self + 3, min=0), max=6) / 6 + + +@register_decomposition(aten.hardswish_backward) +@out_wrapper() +@pw_cast_for_opmath +def hardswish_backward(grad_output: Tensor, self: Tensor) -> Tensor: + return torch.where( + self < -3, + 0.0, + torch.where(self <= 3, grad_output * ((self / 3) + 0.5), grad_output), + ) + + +@register_decomposition(aten.threshold_backward) +@out_wrapper("grad_input") +def threshold_backward(grad_output: Tensor, self: Tensor, threshold: float): + return torch.where(self <= threshold, 0, grad_output) + + +@register_decomposition(aten.leaky_relu_backward) +@out_wrapper("grad_input") +@pw_cast_for_opmath +def leaky_relu_backward( + grad_output: Tensor, self: Tensor, negative_slope: float, self_is_result: bool +): + return torch.where(self > 0, grad_output, grad_output * negative_slope) + + 
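
The pointwise decompositions above are intended to match the eager ATen kernels, so they can be spot-checked directly. A minimal sketch, pulling the entries registered above out through get_decompositions; the input values, the choice of hardsigmoid, and the default tolerances of assert_close are illustrative assumptions:

import torch
import torch.nn.functional as F
from torch._decomp import get_decompositions

aten = torch.ops.aten
decomps = get_decompositions([aten.hardsigmoid, aten.hardsigmoid_backward])

x = torch.randn(16)
# Forward: entries are keyed by OpOverload and share the operator's signature.
torch.testing.assert_close(decomps[aten.hardsigmoid.default](x), F.hardsigmoid(x))

# Backward: compare the decomposition against autograd through the eager op.
xr = x.clone().requires_grad_(True)
g = torch.randn_like(x)
(ref_grad,) = torch.autograd.grad(F.hardsigmoid(xr), xr, g)
dec_grad = decomps[aten.hardsigmoid_backward.default](g, x)
torch.testing.assert_close(dec_grad, ref_grad)
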
+@register_decomposition(aten.gelu_backward) +@out_wrapper("grad_input") +@pw_cast_for_opmath +def gelu_backward(grad: Tensor, self: Tensor, approximate: str = "none"): + M_SQRT2 = 1.41421356237309504880 + M_SQRT1_2 = 0.70710678118654752440 + M_2_SQRTPI = 1.12837916709551257390 + if approximate == "tanh": + kBeta = M_SQRT2 * M_2_SQRTPI * 0.5 + kKappa = 0.044715 + x_sq = self * self + x_cube = x_sq * self + inner = kBeta * (self + kKappa * x_cube) + tanh_inner = torch.tanh(inner) + + left = 0.5 * self + right = 1 + tanh_inner + + left_derivative = 0.5 * right + + tanh_derivative = 1 - tanh_inner * tanh_inner + inner_derivative = kBeta * (1 + 3 * kKappa * x_sq) + right_derivative = left * tanh_derivative * inner_derivative + + return grad * (left_derivative + right_derivative) + else: + kAlpha = M_SQRT1_2 + kBeta = M_2_SQRTPI * M_SQRT1_2 * 0.5 + cdf = 0.5 * (1 + torch.erf(self * kAlpha)) + pdf = kBeta * torch.exp(self * self * -0.5) + return grad * (cdf + self * pdf) + + +@register_decomposition(aten.mish_backward) +@pw_cast_for_opmath +def mish_backward(grad_output: Tensor, input: Tensor): + input_tanh_softplus = torch.tanh(F.softplus(input)) + input_sigmoid = torch.sigmoid(input) + out = input * input_sigmoid * (1 - input_tanh_softplus * input_tanh_softplus) + return grad_output * (input_tanh_softplus + out) + + +@register_decomposition(aten.silu) +@out_wrapper() +@pw_cast_for_opmath +def silu(self: Tensor) -> Tensor: + return self * torch.sigmoid(self) + + +@register_decomposition(aten.silu_backward) +@out_wrapper("grad_input") +@pw_cast_for_opmath +def silu_backward(grad_output: Tensor, self: Tensor) -> Tensor: + sigmoid = 1 / (1 + torch.exp(-self)) + return grad_output * sigmoid * (1 + self * (1 - sigmoid)) + + +@register_decomposition(aten._prelu_kernel) +def _prelu_kernel(self: Tensor, weight: Tensor) -> Tensor: + return torch.where(self > 0, self, weight * self) + + +@register_decomposition(aten._prelu_kernel_backward) +def _prelu_kernel_backward( + grad_output: Tensor, + self: Tensor, + weight: Tensor, +) -> Tuple[Tensor, Tensor]: + input_grad = torch.where(self > 0, grad_output, weight * grad_output) + weight_grad = torch.where(self > 0, 0.0, self * grad_output) + return (input_grad, weight_grad) + + +@register_decomposition(aten.rrelu_with_noise) +@aten.rrelu_with_noise.default.py_impl(DispatchKey.AutogradCUDA) +@out_wrapper() +@pw_cast_for_opmath +def rrelu_with_noise( + self: Tensor, + noise: Tensor, + lower: float = 0.125, + upper: float = 0.3333333333333333, + training: bool = False, + generator: Optional[torch.Generator] = None, +) -> Tensor: + assert generator is None + if training: + not_positive = self <= 0 + r = aten.uniform(self, lower, upper) + output = torch.where(not_positive, self * r, self) + noise.copy_(torch.where(not_positive, r, 1)) + return output + else: + negative_slope = (lower + upper) / 2 + return aten.leaky_relu(self, negative_slope) + + +@register_decomposition(aten.rrelu_with_noise_) +@aten.rrelu_with_noise_.default.py_impl(DispatchKey.AutogradCUDA) +@pw_cast_for_opmath +def rrelu_with_noise_( + self: Tensor, + noise: Tensor, + lower: float, + upper: float, + training: bool = False, + generator: Optional[torch.Generator] = None, +) -> Tensor: + return self.copy_(rrelu_with_noise(self, noise, lower, upper, training, generator)) + + +@register_decomposition(aten.rrelu_with_noise_backward) +@out_wrapper() +@pw_cast_for_opmath +def rrelu_with_noise_backward( + grad_output: Tensor, + self: Tensor, + noise: Tensor, + lower: float, + upper: float, + training: 
bool, + self_is_result: bool, +) -> Tensor: + if training and upper - lower > 1e-6: + return grad_output.mul(noise) + else: + negative_slope = (lower + upper) / 2 + return aten.leaky_relu_backward( + grad_output, self, negative_slope, self_is_result + ) + + +@register_decomposition(aten.log_sigmoid_backward) +@out_wrapper("grad_input") +@pw_cast_for_opmath +def log_sigmoid_backward(grad_output: Tensor, self: Tensor, buffer: Tensor) -> Tensor: + in_negative = self < 0 + max_deriv = torch.where(in_negative, 1, 0) + sign = torch.where(in_negative, 1, -1) + z = torch.exp(-torch.abs(self)) + return grad_output * (max_deriv - sign * (z / (1 + z))) + # CPU has a special formula that uses buffer, but disabled for convenience sake + # return (max_deriv - sign * (buffer / (1 + buffer))) * grad_output + + +def apply_loss_reduction(loss: Tensor, reduction: int): + if reduction == Reduction.MEAN.value: + return torch.mean(loss) + elif reduction == Reduction.SUM.value: + return torch.sum(loss) + else: + return loss + + +def to_real_dtype(dtype: torch.dtype): + if dtype == torch.complex32: + return torch.float16 + elif dtype == torch.complex64: + return torch.float32 + elif dtype == torch.complex128: + return torch.float64 + + +# TODO: None of these loss castings are quite correct, see +# https://github.com/pytorch/pytorch/issues/76870. Also, the ATen kernels +# perform the pointwise portion in opmath, but don't maintain it between the +# pointwise portion and the reduction + + +@register_decomposition(aten.mse_loss) +@out_wrapper() +@pw_cast_for_opmath +def mse_loss( + self: Tensor, target: Tensor, reduction: int = Reduction.MEAN.value +) -> Tensor: + loss = (self - target) ** 2 + return apply_loss_reduction(loss, reduction) + + +@register_decomposition(aten.mse_loss_backward) +@out_wrapper("grad_input") +@pw_cast_for_opmath +def mse_loss_backward( + grad_output: Tensor, input: Tensor, target: Tensor, reduction: int +): + norm = 2.0 / input.numel() if reduction == Reduction.MEAN.value else 2.0 + return norm * (input - target) * grad_output + + +@register_decomposition(aten.smooth_l1_loss) +@out_wrapper() +@pw_cast_for_opmath +def smooth_l1_loss( + self: Tensor, + target: Tensor, + reduction: int = Reduction.MEAN.value, + beta: float = 1.0, +): + loss = (self - target).abs() + loss = torch.where(loss < beta, 0.5 * loss**2 / beta, loss - 0.5 * beta) + return apply_loss_reduction(loss, reduction) + + +@register_decomposition(aten.smooth_l1_loss_backward.default) +@pw_cast_for_opmath +def smooth_l1_loss_backward( + grad_output: Tensor, self: Tensor, target: Tensor, reduction: int, beta: float +): + norm = 1.0 / self.numel() if reduction == Reduction.MEAN.value else 1.0 + x = self - target + abs_x = torch.abs(x) + norm_grad = norm * grad_output + return torch.where( + abs_x < beta, + norm_grad * x / beta, + norm_grad * torch.sign(x), + ) + + +@register_decomposition(aten.smooth_l1_loss_backward.grad_input) +@pw_cast_for_opmath +def smooth_l1_loss_backward_out( + grad_output: Tensor, + self: Tensor, + target: Tensor, + reduction: int, + beta: float, + grad_input: Tensor, +): + result = smooth_l1_loss_backward(grad_output, self, target, reduction, beta) + _maybe_resize_out(grad_input, result.shape) + return _safe_copy_out(copy_from=result, copy_to=grad_input, exact_dtype=True) + + +@register_decomposition(aten.huber_loss_backward.default) +@pw_cast_for_opmath +def huber_loss_backward( + grad_output: Tensor, self: Tensor, target: Tensor, reduction: int, delta: float +): + norm = 1.0 / self.numel() if reduction 
== Reduction.MEAN.value else 1.0 + x = self - target + return torch.where( + x < -delta, + -norm * grad_output * delta, + torch.where(x > delta, norm * grad_output * delta, norm * x * grad_output), + ) + + +# We cannot use @out_wrapper() here, because the output tensor is not named 'out', it's 'grad_input' +@register_decomposition(aten.huber_loss_backward.out) +@pw_cast_for_opmath +def huber_loss_backward_out( + grad_output: Tensor, + self: Tensor, + target: Tensor, + reduction: int, + delta: float, + grad_input: Tensor, +): + result = huber_loss_backward(grad_output, self, target, reduction, delta) + _maybe_resize_out(grad_input, result.shape) + return _safe_copy_out(copy_from=result, copy_to=grad_input, exact_dtype=True) + + +def _nll_loss_backward( + grad_output: Tensor, + self: Tensor, + target: Tensor, + weight: Optional[Tensor], + reduction: int, + ignore_index: int, + total_weight: Tensor, +) -> Tensor: + channel_dim = 0 if self.dim() < 2 else 1 + if reduction == Reduction.MEAN.value: + grad_output = grad_output / total_weight + + target = target.unsqueeze(channel_dim) + safe_target = torch.where(target != ignore_index, target, 0) + grad_input = torch.zeros_like(self) + grad_input = torch.scatter(grad_input, channel_dim, safe_target, -1.0) + + if grad_input.dim() > grad_output.dim() > 0: + grad_output = grad_output.unsqueeze(channel_dim) + + if weight is not None: + new_shape = [1 for _ in range(self.dim())] + new_shape[channel_dim] = weight.shape[0] + weight = weight.reshape(new_shape) + grad_output = grad_output * weight + + grad_output = torch.where(target != ignore_index, grad_output, 0) + + return grad_input * grad_output + + +@register_decomposition(aten.glu_backward) +@out_wrapper("grad_input") +@pw_cast_for_opmath +def glu_backward(grad_output: Tensor, self: Tensor, dim: int) -> Tensor: + assert self.dim() > 0, "glu does not support 0-dimensional tensors" + wrap_dim = utils.canonicalize_dim(self.dim(), dim) + nIn = self.size(wrap_dim) + assert ( + nIn % 2 == 0 + ), f"Halving dimension must be even, but dimension {wrap_dim} is size {nIn}" + inputSize = nIn // 2 + firstHalf = self.narrow(wrap_dim, 0, inputSize) + secondHalf = self.narrow(wrap_dim, inputSize, inputSize) + gradInputFirstHalf = torch.sigmoid(secondHalf) + gradInputSecondHalf = ( + (1.0 - gradInputFirstHalf) * gradInputFirstHalf * firstHalf * grad_output + ) + gradInputFirstHalf = gradInputFirstHalf * grad_output + return torch.cat([gradInputFirstHalf, gradInputSecondHalf], dim=wrap_dim) + + +@register_decomposition(aten.nll_loss_backward) +@out_wrapper("grad_input") +def nll_loss_backward( + grad_output: Tensor, + self: Tensor, + target: Tensor, + weight: Optional[Tensor], + reduction: int, + ignore_index: int, + total_weight: Tensor, +) -> Tensor: + assert 0 <= self.dim() <= 2, "input tensor should be 1D or 2D" + assert ( + target.dim() <= 1 + ), "0D or 1D target tensor expected, multi-target not supported" + + no_batch_dim = self.dim() == 1 and target.dim() == 0 + assert no_batch_dim or ( + self.shape[0] == target.shape[0] + ), f"size mismatch (got input: {self.shape}, target: {target.shape})" + assert total_weight.numel() == 1, ( + "expected total_weight to be a single element tensor, got: ", + f"{total_weight.shape} ({total_weight.numel()} elements)", + ) + + assert ( + weight is None or weight.numel() == self.shape[-1] + ), "weight tensor should be defined either for all or no classes" + + if reduction == Reduction.NONE.value and self.dim() == 2: + assert grad_output.dim() == 1 and grad_output.shape[0] == 
self.shape[0], ( + f"Expected a tensor of dimension 1 and tensor.size[0] == {self.shape[0]} but " + f"got: dimension {grad_output.dim()} and tensor.size[0] == {grad_output.shape[0]}" + ) + else: + assert ( + grad_output.dim() <= 1 and grad_output.numel() == 1 + ), f"Expected a single element grad_output tensor, but got: {grad_output.shape}" + + return _nll_loss_backward( + grad_output, self, target, weight, reduction, ignore_index, total_weight + ) + + +@register_decomposition(aten.nll_loss2d_backward) +@out_wrapper("grad_input") +def nll_loss2d_backward( + grad_output: Tensor, + self: Tensor, + target: Tensor, + weight: Optional[Tensor], + reduction: int, + ignore_index: int, + total_weight: Tensor, +) -> Tensor: + assert ( + self.dim() == 4 + ), f"only batches of spatial inputs supported (4D tensors), but got input of dimension: {self.dim()}" + + assert ( + target.dim() == 3 + ), f"only batches of spatial targets supported (3D tensors) but got targets of dimension: {target.dim()}" + + assert ( + self.shape[0] == target.shape[0] + and self.shape[2] == target.shape[1] + and self.shape[3] == target.shape[2] + ), f"size mismatch (got input: {self.shape}, target: {target.shape}" + + assert total_weight.numel() == 1, ( + "expected total_weight to be a single element tensor, " + f"got: {total_weight.shape} ( {total_weight.numel()}, elements)" + ) + + return _nll_loss_backward( + grad_output, self, target, weight, reduction, ignore_index, total_weight + ) + + +@register_decomposition(aten.binary_cross_entropy) +@out_wrapper() +@pw_cast_for_opmath +def binary_cross_entropy( + self: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + reduction: int = Reduction.MEAN.value, +) -> Tensor: + # We cannot currently model this without introducing data-dependent control flow + # TORCH_CHECK( + # (input_val >= 0) && (input_val <= 1), + # "all elements of input should be between 0 and 1" + # ) + loss = (target - 1) * torch.maximum( + torch.log1p(-self), self.new_full((), -100) + ) - target * torch.maximum(torch.log(self), self.new_full((), -100)) + if weight is not None: + loss = loss * weight + return apply_loss_reduction(loss, reduction) + + +@register_decomposition(aten.binary_cross_entropy_backward) +@out_wrapper("grad_input") +@pw_cast_for_opmath +def binary_cross_entropy_backward( + grad_output: Tensor, + self: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + reduction: int = Reduction.MEAN.value, +) -> Tensor: + EPSILON = 1e-12 + result = grad_output * (self - target) / torch.clamp(self * (1 - self), min=EPSILON) + if weight is not None: + result = result * weight + if reduction == Reduction.MEAN.value: + result = result / self.numel() + return result + + +@register_decomposition(aten.soft_margin_loss) +@out_wrapper() +@pw_cast_for_opmath +def soft_margin_loss( + input: Tensor, + target: Tensor, + reduction: int = Reduction.MEAN.value, +) -> Tensor: + loss = torch.log1p(torch.exp(-input * target)) + return apply_loss_reduction(loss, reduction) + + +@register_decomposition(aten.soft_margin_loss_backward) +@out_wrapper("grad_input") +@pw_cast_for_opmath +def soft_margin_loss_backward( + grad_output: Tensor, + self: Tensor, + target: Tensor, + reduction: int = Reduction.MEAN.value, +) -> Tensor: + grad_input = target * grad_output * (torch.sigmoid(target * self) - 1) + if reduction == Reduction.MEAN.value: + grad_input = grad_input / self.numel() + return grad_input + + +@register_decomposition(aten.dist) +@out_wrapper() +def dist(input: Tensor, other: Tensor, p: float = 2): + 
return aten.norm(input - other, p=p) + + +@register_decomposition(aten._euclidean_dist) +@out_wrapper() +def _euclidean_dist(x1: Tensor, x2: Tensor) -> Tensor: + x1_norm = x1.pow(2).sum(-1, True) + x1_pad = torch.ones_like(x1_norm, memory_format=torch.contiguous_format) + x2_norm = x2.pow(2).sum(-1, True) + x2_pad = torch.ones_like(x2_norm, memory_format=torch.contiguous_format) + x1_ = torch.cat([x1.mul(-2), x1_norm, x1_pad], -1) + x2_ = torch.cat([x2, x2_pad, x2_norm], -1) + result = x1_.matmul(x2_.mT) + return result.clamp_min(0).sqrt() + + +@register_decomposition(aten.slice_backward) +@out_wrapper() +def slice_backward( + grad_output: Tensor, + input_sizes: List[int], + dim: int, + start: int, + end: int, + step: int, +): + grad_input = grad_output.new_zeros(input_sizes) + return torch.slice_scatter(grad_input, grad_output, dim, start, end, step) + + +@register_decomposition(aten.slice.Tensor) +def slice_forward( + # Tensor(a) self, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1 + self: Tensor, + dim: int = 0, + start: Optional[int] = None, + end: Optional[int] = None, + step: int = 1, +): + ndim = self.dim() + if ndim == 0: + raise RuntimeError("slice() cannot be applied to a 0-dim tensor.") + dim = utils.canonicalize_dim(self.dim(), dim) + sizes = list(self.size()) + strides = list(self.stride()) + + if step <= 0: + raise RuntimeError("slice step must be positive") + + start_val = start if start is not None else 0 + end_val = end if end is not None else sys.maxsize # 2^63 – 1 + + if start_val < 0: + start_val += sizes[dim] + + if end_val < 0: + end_val += sizes[dim] + + if start_val < 0: + start_val = 0 + elif start_val > sizes[dim]: + start_val = sizes[dim] + + if end_val < start_val: + end_val = start_val + elif end_val > sizes[dim]: + end_val = sizes[dim] + + storage_offset = self.storage_offset() + start_val * strides[dim] + len = end_val - start_val + sizes[dim] = (len + step - 1) // step + strides[dim] *= step + + if self.is_quantized: + raise NotImplementedError( + "Slice decomposition for quantized tensors aren't implemented" + ) + else: + return self.as_strided(sizes, strides, storage_offset) + + +@register_decomposition(aten.select_backward) +@out_wrapper() +def select_backward(grad_output: Tensor, input_sizes: List[int], dim: int, index: int): + grad_input = grad_output.new_zeros(input_sizes) + return torch.select_scatter(grad_input, grad_output, dim, index) + + +@register_decomposition(aten.diagonal_backward) +@out_wrapper() +def diagonal_backward( + grad_output: Tensor, input_sizes: List[int], offset: int, dim1: int, dim2: int +): + grad_input = grad_output.new_zeros(input_sizes) + return torch.diagonal_scatter(grad_input, grad_output, offset, dim1, dim2) + + +def _cast_grad_to_input_dtype( + grad_output: Tensor, grad_input: Tensor, input_dtype: torch.dtype +): + if grad_output.dtype != input_dtype: + grad_input = grad_input.to(input_dtype) + return grad_input + + +@register_decomposition(aten._softmax_backward_data) +@out_wrapper("grad_input") +@compute_only_pw_cast_for_opmath +def _softmax_backward_data( + grad_output: Tensor, output: Tensor, dim: int, input_dtype: torch.dtype +): + new_grad_output = grad_output * output + grad_input = new_grad_output - output * torch.sum( + new_grad_output, dim=dim, keepdim=True + ) + + # CPU kernel doesn't respect input_dtype, but following check doesn't work for meta tensor + # if grad_output.device == torch.device("cpu"): + # return grad_input.contiguous() + + return _cast_grad_to_input_dtype(grad_output, 
grad_input, input_dtype).contiguous() + + +@register_decomposition(aten._log_softmax_backward_data) +@out_wrapper() +@compute_only_pw_cast_for_opmath +def _log_softmax_backward_data( + grad_output: Tensor, output: Tensor, dim: int, input_dtype: torch.dtype +): + grad_input = grad_output - torch.exp(output) * torch.sum( + grad_output, dim=dim, keepdim=True + ) + return _cast_grad_to_input_dtype(grad_output, grad_input, input_dtype) + + +def _im2col_col2im_indices_along_dim( + input_d, kernel_d, dilation_d, padding_d, stride_d, device +): + """Utility function to implement im2col and col2im""" + blocks_d = input_d + padding_d * 2 - dilation_d * (kernel_d - 1) + + arange_kw = partial(torch.arange, dtype=torch.int64, device=device) + + # Stride kernel over input and find starting indices along dim d + blocks_d_indices = arange_kw(0, blocks_d, stride_d).unsqueeze(0) + + # Apply dilation on kernel and find its indices along dim d + kernel_grid = arange_kw(0, kernel_d * dilation_d, dilation_d).unsqueeze(-1) + + # Broadcast and add kernel starting positions (indices) with + # kernel_grid along dim d, to get block indices along dim d + return blocks_d_indices + kernel_grid + + +@register_decomposition(aten.im2col) +@out_wrapper() +def im2col( + input: Tensor, + kernel_size: List[int], + dilation: List[int], + padding: List[int], + stride: List[int], +) -> Tensor: + torch._check(len(kernel_size) == 2, lambda: "im2col(): only 2D kernel supported") + torch._check(len(dilation) == 2, lambda: "im2col(): only 2D dilation supported") + torch._check(len(padding) == 2, lambda: "im2col(): only 2D padding supported") + torch._check(len(stride) == 2, lambda: "im2col(): only 2D stride supported") + + def check_positive(param, param_name, strict=True): + cond = all(p > 0 for p in param) if strict else all(p >= 0 for p in param) + torch._check( + cond, lambda: "{param_name} should be greater {'than' zero, but got {param}" + ) + + check_positive(kernel_size, "kernel_size") + check_positive(dilation, "dilation") + check_positive(dilation, "padding", strict=False) + check_positive(stride, "stride") + + shape = input.shape + ndim = len(shape) + torch._check( + ndim in (3, 4) and all(d != 0 for d in shape[-3:]), + lambda: "Expected 3D or 4D (batch mode) tensor for input with possible 0 batch size " + f"and non-zero dimensions, but got: {tuple(shape)}", + ) + output_size = tuple( + 1 + (out + 2 * pad - dil * (ker - 1) - 1) // st + for out, pad, dil, ker, st in zip( + shape[-2:], padding, dilation, kernel_size, stride + ) + ) + torch._check( + all(c > 0 for c in output_size), + lambda: f"Given an input with spacial size {tuple(shape[-2:])}, " + f"kernel_size={kernel_size}, dilation={dilation}, " + f"padding={padding}, stride={stride}, " + "the calculated shape of the array of sliding blocks " + f"is {output_size}, but its components must be at least one.", + ) + batched_input = ndim == 4 + if not batched_input: + input = input.unsqueeze(0) + + batch_dim, channel_dim, input_h, input_w = input.shape + + stride_h, stride_w = stride + padding_h, padding_w = padding + dilation_h, dilation_w = dilation + kernel_h, kernel_w = kernel_size + + blocks_row_indices = _im2col_col2im_indices_along_dim( + input_h, kernel_h, dilation_h, padding_h, stride_h, input.device + ) + blocks_col_indices = _im2col_col2im_indices_along_dim( + input_w, kernel_w, dilation_w, padding_w, stride_w, input.device + ) + + # Note that F.pad takes (padding_left, padding_right, padding_top, padding_bottom) + # ugh + padded_input = F.pad(input, (padding_w, 
padding_w, padding_h, padding_h)) + + blocks_row_indices = blocks_row_indices.unsqueeze(-1).unsqueeze(-1) + output = padded_input[:, :, blocks_row_indices, blocks_col_indices] + output = output.permute(0, 1, 2, 4, 3, 5) + num_blocks_row = blocks_row_indices.size(1) + num_blocks_col = blocks_col_indices.size(1) + output = output.reshape( + batch_dim, channel_dim * kernel_h * kernel_w, num_blocks_row * num_blocks_col + ) + + if not batched_input: + output = output.squeeze(0) + return output + + +@register_decomposition(aten.col2im) +@out_wrapper() +@pw_cast_for_opmath +def col2im( + input: Tensor, + output_size: List[int], + kernel_size: List[int], + dilation: List[int], + padding: List[int], + stride: List[int], +) -> Tensor: + torch._check(len(output_size) == 2, lambda: "only 2D output_size supported") + torch._check(len(kernel_size) == 2, lambda: "only 2D kernel supported") + torch._check(len(dilation) == 2, lambda: "only 2D dilation supported") + torch._check(len(padding) == 2, lambda: "only 2D padding supported") + torch._check(len(stride) == 2, lambda: "only 2D stride supported") + + def check_positive(param, param_name, strict=True): + cond = all(p > 0 for p in param) if strict else all(p >= 0 for p in param) + torch._check( + cond, lambda: "{param_name} should be greater than zero, but got {param}" + ) + + check_positive(kernel_size, "kernel_size") + check_positive(dilation, "dilation") + check_positive(padding, "padding", strict=False) + check_positive(stride, "stride") + check_positive(output_size, "output_size") + + shape = input.shape + ndim = len(shape) + torch._check( + ndim in (2, 3) and all(d != 0 for d in shape[-2:]), + lambda: "Expected 2D or 3D (batch mode) tensor for input with possible 0 batch size " + f"and non-zero dimensions, but got: {tuple(shape)}", + ) + prod_kernel_size = kernel_size[0] * kernel_size[1] + torch._check( + shape[-2] % prod_kernel_size == 0, + lambda: "Expected size of input's first non-batch dimension to be divisible by the " + f"product of kernel_size, but got input.shape[-2] = {shape[-2]} and " + f"kernel_size={kernel_size}", + ) + col = [ + 1 + (out + 2 * pad - dil * (ker - 1) - 1) // st + for out, pad, dil, ker, st in zip( + output_size, padding, dilation, kernel_size, stride + ) + ] + L = col[0] * col[1] + torch._check( + shape[-1] == L, + lambda: f"Given output_size={output_size}, kernel_size={kernel_size}, " + f"dilation={dilation}, padding={padding}, stride={stride}, " + f"expected input.size(-1) to be {L} but got {shape[-1]}.", + ) + torch._check( + L > 0, + lambda: f"Given output_size={output_size}, kernel_size={kernel_size}, " + f"dilation={dilation}, padding={padding}, stride={stride}, " + f"expected input.size(-1) to be {L} but got {shape[-1]}.", + ) + batched_input = ndim == 3 + if not batched_input: + input = input.unsqueeze(0) + + shape = input.shape + + out_h, out_w = output_size + stride_h, stride_w = stride + padding_h, padding_w = padding + dilation_h, dilation_w = dilation + kernel_h, kernel_w = kernel_size + + # col2im is defined as the backwards of im2col, so we differentiate its decomposition by hand + input = input.reshape([shape[0], shape[1] // prod_kernel_size] + kernel_size + col) + input = input.permute(0, 1, 2, 4, 3, 5) + + indices_row = _im2col_col2im_indices_along_dim( + out_h, kernel_h, dilation_h, padding_h, stride_h, input.device + ) + indices_row = _unsqueeze_to_dim(indices_row, 4) + indices_col = _im2col_col2im_indices_along_dim( + out_w, kernel_w, dilation_w, padding_w, stride_w, input.device + ) + + 
output_padded_size = [o + 2 * p for o, p in zip(output_size, padding)] + output = input.new_zeros( + [shape[0], shape[1] // prod(kernel_size)] + output_padded_size + ) + idx = (None, None, indices_row, indices_col) + output = aten._unsafe_index_put(output, idx, input, accumulate=True) + output = F.pad(output, (-padding_w, -padding_w, -padding_h, -padding_h)) + + if not batched_input: + output = output.squeeze(0) + return output + + +@register_decomposition(aten.native_dropout_backward) +@out_wrapper() +def native_dropout_backward(grad_output: Tensor, mask: Tensor, scale: float): + # According to the CUDA kernel implementation we should have this test; + # but it seems to fail tests! + # torch._check(mask.dtype == torch.bool, lambda: f"Mask should be Bool Scalar Type {mask.dtype}") + + # Mimicking CUDA kernel's behavior for output stride: output follow input's memory format + # This different from TensorIterator's behavior + r = (grad_output * (mask.type_as(grad_output) * scale)).clone( + memory_format=utils.suggest_memory_format(grad_output) + ) + return r + + +@register_decomposition(aten.unfold_backward) +@out_wrapper() +def unfold_backward( + grad: Tensor, input_size: List[int], dimension: int, size: int, step: int +) -> Tensor: + if len(input_size) == 0: + return torch.squeeze_copy(grad, 0) + dim = utils.canonicalize_dim(len(input_size), dimension) + idx = torch.arange(input_size[dim], device=grad.device, dtype=torch.int32) + idx = idx.unfold(0, size, step).flatten() + grad = grad.movedim(-1, dim + 1).flatten(dim, dim + 1) + # nb. At the moment this generates two kernels in triton + # It could potentially be fused into one call to scatter_reduce, + # in the case step <= size provided scatter_reduce generates 1 kernel + grad_input = grad.new_zeros(input_size) + index = (None,) * dim + (idx,) + return aten._unsafe_index_put(grad_input, index, grad, accumulate=True).contiguous() + + +@register_decomposition(aten.logit_backward.default) +@pw_cast_for_opmath +def logit_backward( + grad_output: Tensor, self: Tensor, eps: Optional[float] = None +) -> Tensor: + if eps is not None: + lo = eps + hi = 1.0 - lo + return torch.where( + torch.logical_and(self >= lo, self <= hi), + grad_output / (self * (1.0 - self)), + 0.0, + ) + else: + return torch.where( + torch.logical_and(self >= 0.0, self <= 1.0), + grad_output / (self * (1.0 - self)), + self.new_full((), float("nan")), + ) + + +@register_decomposition(aten.dropout) +@aten.dropout.default.py_impl(DispatchKey.CompositeImplicitAutograd) +@aten.dropout.default.py_impl(DispatchKey.Autograd) +def dropout(input: Tensor, p: float, train: Optional[bool]): + if train and p != 0: + return aten.native_dropout(input, p, train)[0] + else: + return input.clone() + + +@register_decomposition(aten.native_dropout) +@out_wrapper("out0", "out1") +def native_dropout(input: Tensor, p: float, train: Optional[bool]): + if train and p != 0: + if p == 1: + return (torch.zeros_like(input), torch.zeros_like(input, dtype=torch.bool)) + if not input.dtype.is_floating_point: + raise RuntimeError( + "result type Float can't be cast to the desired output type Long" + ) + bool_mask = torch.rand_like(input) > p + res = bool_mask * input * float(1.0 / (1.0 - p)) + return (res, bool_mask) + else: + return (input, torch.ones_like(input, dtype=torch.bool)) + + +@register_decomposition(aten._softmax) +@out_wrapper() +def _softmax(x: Tensor, dim: int, half_to_float: bool): + # eager softmax returns a contiguous tensor. Ensure that decomp also returns + # a contiguous tensor. 
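+ # The computation below subtracts amax(x, dim) before exponentiating so that
+ # exp() cannot overflow for large inputs; softmax is unchanged by this shift.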
+ x = x.contiguous() + if half_to_float: + assert x.dtype == torch.half + computation_dtype, result_dtype = utils.elementwise_dtypes( + x, type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT + ) + x = x.to(computation_dtype) + if x.numel() == 0: + unnormalized = torch.exp(x) + else: + x_max = torch.amax(x, dim, keepdim=True) + unnormalized = torch.exp(x - x_max) + result = unnormalized / torch.sum(unnormalized, dim, keepdim=True) + if not half_to_float: + result = result.to(result_dtype) + return result + + +@register_decomposition(aten._log_softmax) +@out_wrapper() +def _log_softmax(x: Tensor, dim: int, half_to_float: bool): + # eager log_softmax returns a contiguous tensor. Ensure that decomp also + # returns a contiguous tensor. + x = x.contiguous() + if half_to_float: + assert x.dtype == torch.half + computation_dtype, result_dtype = utils.elementwise_dtypes( + x, type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT + ) + x = x.to(computation_dtype) + if x.numel() == 0: + shifted = x + else: + x_max = torch.amax(x, dim, keepdim=True) + shifted = x - x_max + shifted_logsumexp = torch.log(torch.sum(torch.exp(shifted), dim, keepdim=True)) + result = shifted - shifted_logsumexp + if not half_to_float: + result = result.to(result_dtype) + return result + + +@register_decomposition(aten.embedding) +@out_wrapper() +def embedding( + weight: Tensor, + indices: Tensor, + padding_idx: int = -1, + scale_grad_by_freq: bool = False, + sparse: bool = False, +) -> Tensor: + assert weight.dim() == 2, "'weight' must be 2-D" + # Nb. scale_grad_by_freq is not used in the forward + if indices.ndim <= 1: + # We need this one as weight[indices] calls item() in these cases + out = weight.index_select(0, indices) + if indices.ndim == 0: + out = out.squeeze(0) + return out + else: + return weight[indices] + + +@register_decomposition(aten.embedding_dense_backward) +@out_wrapper() +def embedding_dense_backward( + grad_output: Tensor, + indices: Tensor, + num_weights: int, + padding_idx: int, + scale_grad_by_freq: bool, +): + computation_dtype, result_dtype = utils.elementwise_dtypes( + grad_output, type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT + ) + grad_output = grad_output.to(computation_dtype) + indices = _maybe_convert_to_dtype(indices, torch.long) # type: ignore[assignment] + if scale_grad_by_freq: + counts = indices.new_zeros((num_weights,)) + ones = torch.ones_like(indices) + counts = aten._unsafe_index_put(counts, [indices], ones, accumulate=True) + grad_weights_scale = counts[indices] + grad_output = grad_output / grad_weights_scale.unsqueeze(-1) + + mask = _unsqueeze_to_dim(indices == padding_idx, grad_output.ndim) + grad = grad_output.masked_fill(mask, 0) + grad_weight = grad_output.new_zeros( + (num_weights,) + grad_output.shape[indices.ndim :] + ) + return aten._unsafe_index_put(grad_weight, [indices], grad, accumulate=True).to( + result_dtype + ) + + +def prod(x: List[int]): + r = 1 + for i in x: + r *= i + return r + + +def _pad_chunk( + tensors: List[Tensor], + dim: int, + num_chunks: int, +) -> List[Tensor]: + padded_tensors = [] + for tensor in tensors: + tensor_size = tensor.size() + pad_along_dim = (tensor_size[dim] + num_chunks - 1) // num_chunks * num_chunks + if pad_along_dim != tensor_size[dim]: + # Use aten.constant_pad_nd instead of copy_ for functionalization + pad = [0] * 2 * (tensor.ndim - dim - 1) + [ + 0, + pad_along_dim - tensor_size[dim], + ] + tensor = aten.constant_pad_nd(tensor, pad, 0) + view_size = tensor_size[:dim] + 
torch.Size([num_chunks, -1]) + padded_tensors.append(tensor.view(view_size)) + return padded_tensors + + +def have_same_ndims(tensors: List[Tensor]): + ndim = tensors[0].ndim + for tensor in tensors: + if tensor.ndim != ndim: + return False + return True + + +def leading_dimension_matches(tensors: List[Tensor], dim: int): + leading_dim_sizes = tensors[0].size()[:dim] + for tensor in tensors: + torch._check( + tensor.size()[:dim] == leading_dim_sizes, + lambda: "_chunk_cat expects same sizes of 0,...,dim-1 dimensions for all tensors", + ) + + +def _preprocess_chunk_cat_inputs( + tensors: List[Tensor], + dim: int, + num_chunks: int, +): + torch._check(num_chunks >= 1, lambda: "_chunk_cat expects positive num_chunks") + torch._check( + len(tensors) > 0, lambda: "_chunk_cat expects a non-empty input tensor list" + ) + expected_dtype = tensors[0].dtype + expected_device = tensors[0].device + for tensor in tensors: + torch._check(tensor.numel() > 0, lambda: "_chunk_cat expects non-empty tensor") + torch._check( + tensor.dtype == expected_dtype, + lambda: "_chunk_cat expects all input tensors with the same dtype", + ) + torch._check( + tensor.device == expected_device, + lambda: "_chunk_cat expects all inputs tensors on the same device", + ) + if have_same_ndims(tensors): + dim = utils.canonicalize_dim(tensors[0].dim(), dim) + else: + torch._check( + dim >= 0, + lambda: "_chunk_cat expects non-negative dim when input tensors have different ndims", + ) + for tensor in tensors: + torch._check( + dim < tensor.ndim, + lambda: "_chunk_cat expects dim < ndim for all input tensors", + ) + leading_dimension_matches(tensors, dim) + return dim + + +@register_decomposition([aten._chunk_cat.default, aten._chunk_cat.out]) +def _chunk_cat( + tensors: List[Tensor], + dim: int, + num_chunks: int, + out: Optional[Tensor] = None, +) -> Tensor: + dim = _preprocess_chunk_cat_inputs(tensors, dim, num_chunks) + padded_tensors = _pad_chunk(tensors, dim, num_chunks) + if out is None: + return torch.cat(padded_tensors, dim + 1) + else: + torch.cat(padded_tensors, dim + 1, out=out) + return out + + +@register_decomposition(aten.split_with_sizes) +def split_with_sizes( + self: Tensor, split_sizes: List[int], dim: int = 0 +) -> List[Tensor]: + # NB: Perform the check_is_size tests first so that the + # sum test does not try to do a replacement + for i in range(len(split_sizes)): + torch._check_is_size( + split_sizes[i], + lambda: "split_with_sizes expects split_sizes have only non-negative entries", + ) + torch._check_with( + ValueError, + sum(split_sizes) == self.shape[dim], + lambda: f"Split sizes add up to {sum(split_sizes)} but got the tensor's size of {self.shape[dim]}", + ) + num_splits = len(split_sizes) + splits = [] + start_idx = 0 + + # Avoid importing sympy at a module level + from torch.fx.experimental.symbolic_shapes import expect_true + + for i in range(num_splits): + length = split_sizes[i] + # We know this is true thanks to the sum, but this assertion helps + # out our internal reasoning + expect_true(start_idx + length <= self.shape[dim]) + splits.append(self.narrow(dim, start_idx, length)) + start_idx += length + return splits + + +# out_wrapper currently does not allow optional outputs +@register_decomposition( + [aten.split_with_sizes_copy.default, aten.split_with_sizes_copy.out] +) +def split_with_sizes_copy( + self: Tensor, + split_sizes: List[int], + dim: int = 0, + out: Optional[List[Tensor]] = None, +) -> Optional[List[Tensor]]: + splits = split_with_sizes(self, split_sizes, dim=dim) + if out is 
None: + return [s.clone(memory_format=torch.contiguous_format) for s in splits] + else: + for output, split in zip(out, splits): + _maybe_resize_out(output, split.shape) + _safe_copy_out(copy_from=split, copy_to=output, exact_dtype=True) + return None + + +@register_decomposition(aten.unsafe_split.Tensor) +def unsafe_split(input: Tensor, split_size: int, dim: int = 0) -> Tuple[Tensor, ...]: + return aten.split.Tensor(input, split_size, dim) + + +@register_decomposition(aten.unsafe_split_with_sizes.default) +def unsafe_split_with_sizes( + input: Tensor, split_sizes: List[int], dim: int = 0 +) -> Tuple[Tensor, ...]: + return aten.split_with_sizes.default(input, split_sizes, dim) + + +@register_decomposition(aten.split.Tensor) +def split(self: Tensor, split_size: int, dim: int = 0) -> Tuple[Tensor, ...]: + input_sizes = self.shape + dim_size = input_sizes[dim] + if split_size == 0: + assert dim_size == 0 + return (self,) + chunks = (dim_size + split_size - 1) // split_size + + # Avoid importing sympy at a module level + from torch.fx.experimental.symbolic_shapes import guard_int + + chunks = guard_int(chunks) + split_sizes = [split_size for i in range(chunks)] + split_sizes[-1] = split_size - (split_size * chunks - dim_size) + return torch.split(self, split_sizes, dim) + + +@aten.tensor_split.tensor_indices_or_sections.py_impl( + DispatchKey.CompositeImplicitAutograd +) +def tensor_split_tensor_indices_or_sections_py_impl( + self: Tensor, + tensor_indices_or_sections: Tensor, + dim: int = 0, +) -> Tuple[Tensor, ...]: + assert tensor_indices_or_sections.device.type == "cpu" + assert tensor_indices_or_sections.dtype == torch.int64 + split_dim = tensor_indices_or_sections.dim() + torch._check( + split_dim == 1 or split_dim == 0, + lambda: "tensor_split expected tensor_indices_or_sections to be a zero-dimensional " + f"or one-dimensional tensor, but got a tensor with {split_dim} dims", + ) + if split_dim == 0: + sections = tensor_indices_or_sections.item() + assert isinstance(sections, IntLike) + return self.tensor_split(sections, dim) + else: + indices = [i.item() for i in tensor_indices_or_sections] + return self.tensor_split(indices, dim) + + +# TODO: this doesn't appear to have enough precision in bfloat16 +@register_decomposition(aten.addmm) +@out_wrapper() +@pw_cast_for_opmath +def addmm(self: Tensor, mat1: Tensor, mat2: Tensor, beta: int = 1, alpha: int = 1): + if not self.is_floating_point() and not self.is_complex(): + beta = int(beta) + alpha = int(alpha) + out = alpha * torch.mm(mat1, mat2) + if beta == 0: + return out + + # The output of aten.addmm is contiguous, we need to match this behavior in the decomposition. + # The original implementation 'beta * self + out' would return a strided tensor if `self` is strided. + # We thus use `out`, the output of torch.mm, which is always contiguous, as the first argument for addition. + # This is relying on TensorIterator's behavior that it takes higher precedence on the stride of first input. + # Alternative, we can write `(beta * self + out).contiguous()`, but it introduces another copy in some cases. + # This implementation is not ideal, and we should revisit this when we have a better solution. 
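+    # Illustrative sketch of the note above (values assumed for illustration
+    # only): if `self` is non-contiguous, e.g. self = torch.randn(3, 2).t(),
+    # then `beta * self + out` would tend to inherit self's transposed strides,
+    # while `out + beta * self` follows the contiguous strides of the mm result.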
+ return out + beta * self + + +@register_decomposition(aten._addmm_activation) +@out_wrapper() +@pw_cast_for_opmath +def _addmm_activation( + self: Tensor, + mat1: Tensor, + mat2: Tensor, + beta: int = 1, + alpha: int = 1, + use_gelu: bool = False, +): + out = addmm(self, mat1, mat2, beta, alpha) + if use_gelu: + if self.is_cuda: + return aten.gelu(out, approximate="tanh") + else: + return aten.gelu(out) + return aten.relu(out) + + +@register_decomposition(aten.addmv) +@out_wrapper() +@pw_cast_for_opmath +def addmv(self: Tensor, mat1: Tensor, vec: Tensor, beta: int = 1, alpha: int = 1): + if not self.is_floating_point() and not self.is_complex(): + beta = int(beta) + alpha = int(alpha) + out = alpha * torch.mv(mat1, vec) + if beta == 0: + return out + return out + beta * self + + +@register_decomposition(aten.native_group_norm_backward.default) +@pw_cast_for_opmath +def native_group_norm_backward( + grad_output: Tensor, + input: Tensor, + mean: Tensor, + rstd: Tensor, + gamma: Optional[Tensor], + N: int, + C: int, + HxW: int, + group: int, + output_mask: List[bool], +) -> Tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor]]: + utils.check_same_device( + grad_output, input, mean, rstd, allow_cpu_scalar_tensors=False + ) + utils.check_same_shape(input, grad_output, allow_cpu_scalar_tensors=False) + utils.check_same_shape(mean, rstd, allow_cpu_scalar_tensors=False) + torch._check( + input.numel() == N * C * HxW, + lambda: f"Expect input to have { N * C * HxW} elements", + ) + torch._check( + mean.shape == (N, group), + lambda: f"Expect mean to have shape ({N}, {group}, but got {mean.shape}", + ) + torch._check( + gamma is None or gamma.numel() == C, + lambda: f"Expect gamma to have {C} elements but got {gamma.numel() if gamma is not None else -1}", + ) + + cpg, _rem = divmod(C, group) + torch._check( + _rem == 0, + lambda: f"Expect number of channels {C} to be evenly-divisible by number of groups {group}", + ) + + # Compute Internal gradients + ds = torch.mul(grad_output, input).view(N, C, HxW).sum(dim=[2]) + db = grad_output.view(N, C, HxW).sum(dim=[2]) + + d_input: Optional[Tensor] = None + d_gamma: Optional[Tensor] = None + d_bias: Optional[Tensor] = None + if output_mask[0]: + s = 1.0 / (HxW * cpg) + if gamma is not None: + ds_val = torch.mul(ds, gamma.unsqueeze(0)).reshape(N, group, cpg).sum(2) + db_val = torch.mul(db, gamma.unsqueeze(0)).reshape(N, group, cpg).sum(2) + c1 = torch.mul( + rstd.unsqueeze(-1), + gamma.reshape(1, group, cpg), + ) + else: + ds_val = ds.reshape(N, group, cpg).sum(2) + db_val = db.reshape(N, group, cpg).sum(2) + c1 = torch.mul( + rstd.unsqueeze(-1), + torch.ones((1, group, cpg), device=rstd.device), + ) + c2 = (db_val * mean - ds_val) * rstd * rstd * rstd * s + c3 = -c2 * mean - db_val * rstd * s + + c1 = c1.unsqueeze(-1) + c2 = _unsqueeze_to_dim(c2, 4) + c3 = _unsqueeze_to_dim(c3, 4) + d_input = ( + torch.mul(grad_output.reshape(N, group, cpg, HxW), c1) + + torch.mul(input.reshape(N, group, cpg, HxW), c2) + + c3 + ) + d_input = d_input.reshape(input.shape).to(input.dtype) + if output_mask[1]: + d_gamma = ( + ( + (ds.view(N, group, cpg) - db.view(N, group, cpg) * mean.unsqueeze(-1)) + * rstd.unsqueeze(-1) + ) + .sum(dim=[0]) + .reshape(C) + ) + if output_mask[2]: + d_bias = db.sum(dim=[0]) + + return (d_input, d_gamma, d_bias) + + +# out_wrapper currently does not allow optional outputs +@register_decomposition(aten.native_group_norm_backward.out) +def native_group_norm_backward_out( + grad_output: Tensor, + input: Tensor, + mean: Tensor, + rstd: 
Tensor, + gamma: Optional[Tensor], + N: int, + C: int, + HxW: int, + group: int, + output_mask: List[bool], + *, + out0: torch.Tensor, + out1: torch.Tensor, + out2: torch.Tensor, +) -> Tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor]]: + result = native_group_norm_backward( + grad_output, input, mean, rstd, gamma, N, C, HxW, group, output_mask + ) + grad_input = (out0, out1, out2) + for i, r in enumerate(result): + if r is not None: + _maybe_resize_out(grad_input[i], r.shape) + _safe_copy_out(copy_from=r, copy_to=grad_input[i], exact_dtype=True) + + return grad_input + + +def _maybe_cast(x: Optional[Tensor], dtype) -> Optional[Tensor]: + if x is not None: + return x.to(dtype) + return x + + +# TODO: Take a closer look at the type promotion semantics +@register_decomposition(aten.native_layer_norm_backward.default) +def native_layer_norm_backward( + grad_out: Tensor, + input: Tensor, + normalized_shape: List[int], + mean: Tensor, + rstd: Tensor, + weight: Optional[Tensor], + bias: Optional[Tensor], + output_mask: List[bool], +) -> Tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor]]: + input_shape = input.shape + input_ndim = input.dim() + computation_dtype = utils.get_computation_dtype(input.dtype) + grad_out_cast, input_cast, weight_cast, bias_cast = ( + x.to(computation_dtype).contiguous() if x is not None else x + for x in (grad_out, input, weight, bias) + ) + assert grad_out_cast is not None + + axis = input_ndim - len(normalized_shape) + inner_dims = input_shape[axis:] + outer_dims = input_shape[:axis] + inner_dim_indices: List[int] = [] + outer_dim_indices: List[int] = [] + for i in range(input_ndim): + if i >= axis: + inner_dim_indices.append(i) + else: + outer_dim_indices.append(i) + + N = prod(inner_dims) # type: ignore[arg-type] + M = prod(outer_dims) # type: ignore[arg-type] + if M <= 0 or N <= 0: + return ( + input.new_zeros(input_shape) if output_mask[0] else None, + input.new_zeros(input_shape[axis:]) if output_mask[1] else None, + input.new_zeros(input_shape[axis:]) if output_mask[2] else None, + ) + mean = _unsqueeze_to_dim(mean, input_cast.dim()) # type: ignore[union-attr] + rstd = _unsqueeze_to_dim(rstd, input_cast.dim()) # type: ignore[union-attr] + x_hat = (input_cast - mean) * rstd + if weight_cast is not None: + grad_x_hat = grad_out_cast * weight_cast + else: + grad_x_hat = grad_out_cast + a = grad_x_hat * N + b = torch.sum(grad_x_hat, inner_dim_indices, True) + c1 = torch.mul(grad_x_hat, x_hat) + c2 = torch.sum(c1, inner_dim_indices, True) + c3 = torch.mul(x_hat, c2) + + inner = a - b - c3 + d_input: Optional[Tensor] = None + d_weight: Optional[Tensor] = None + d_bias: Optional[Tensor] = None + if output_mask[0]: + d_input = (rstd / N) * inner + + if output_mask[1] and weight_cast is not None: + if len(outer_dim_indices) > 0: + d_weight = torch.sum(grad_out_cast * x_hat, outer_dim_indices, False) + else: + d_weight = grad_out_cast * x_hat + + if output_mask[2] and bias_cast is not None: + if len(outer_dim_indices) > 0: + d_bias = torch.sum(grad_out_cast, outer_dim_indices, False) + else: + d_bias = grad_out_cast.clone() + + return ( + _maybe_cast(d_input, input.dtype), + _maybe_cast(d_weight, input.dtype), + _maybe_cast(d_bias, input.dtype), + ) + + +# out_wrapper currently does not allow optional outputs +@register_decomposition(aten.native_layer_norm_backward.out) +def native_layer_norm_backward_out( + grad_out: Tensor, + input: Tensor, + normalized_shape: List[int], + mean: Tensor, + rstd: Tensor, + weight: Optional[Tensor], + bias: 
Optional[Tensor], + output_mask: List[bool], + *, + out0: torch.Tensor, + out1: torch.Tensor, + out2: torch.Tensor, +) -> Tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor]]: + result = native_layer_norm_backward( + grad_out, input, normalized_shape, mean, rstd, weight, bias, output_mask + ) + grad_input = (out0, out1, out2) + for i, r in enumerate(result): + if r is not None: + _maybe_resize_out(grad_input[i], r.shape) + _safe_copy_out(copy_from=r, copy_to=grad_input[i], exact_dtype=True) + + return grad_input + + +def native_batch_norm_helper( + input: Tensor, + weight: Optional[Tensor], + bias: Optional[Tensor], + running_mean: Optional[Tensor], + running_var: Optional[Tensor], + training: bool, + momentum: float, + eps: float, + functional: bool, +) -> Tuple[Tensor, Tensor, Tensor, Optional[Tensor], Optional[Tensor]]: + reduction_dims = [0] + list(range(2, input.dim())) + computation_dtype = utils.get_computation_dtype(input.dtype) + new_running_mean = running_mean + new_running_var = running_var + if training: + computation_dtype = utils.get_computation_dtype(input.dtype) + input_acc = input.to(dtype=computation_dtype) + biased_var, mean = torch.var_mean( + input_acc, dim=reduction_dims, correction=0, keepdim=True + ) + rstd = torch.rsqrt(biased_var + eps) + + output = (input - mean) * rstd + + save_mean = torch.squeeze(mean, reduction_dims) + save_rstd = torch.squeeze(rstd, reduction_dims) + if running_mean is not None: + new_running_mean = momentum * save_mean + (1 - momentum) * running_mean + if not functional: + running_mean.copy_(new_running_mean) + if running_var is not None: + n = input.numel() / input.shape[1] + # This doesn't strictly match eager's numerics, which accumulates var sum and then directly applies the correction + # But... that would require re-implementing var here, for negligible numerics gain on a tensor whose + # numerics probably don't matter. 
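+    # The batch statistics above use correction=0 (divide by n), while the
+    # running_var buffer stores the unbiased estimate (divide by n - 1), hence
+    # the n / (n - 1) rescale below; e.g. with n = 4 samples per channel the
+    # biased variance is scaled by 4/3 before entering the running average.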
+ squeezed_var = torch.squeeze(biased_var, reduction_dims) + unbiased_var = squeezed_var * (n / (n - 1)) + new_running_var = momentum * unbiased_var + (1 - momentum) * running_var + if not functional: + running_var.copy_(new_running_var) + else: + assert running_mean is not None and running_var is not None + running_mean = running_mean.to(dtype=computation_dtype, copy=True) + new_running_mean = running_mean + running_var = running_var.to(dtype=computation_dtype, copy=True) + new_running_var = running_var + mean = running_mean + invstd = 1 / (torch.sqrt(running_var + eps)) + # Very annoying inconsistency where CPU and CUDA give different shapes + if input.device.type != "cpu": + save_mean = running_mean + save_rstd = invstd + else: + save_mean = input.new_zeros((0,)) + save_rstd = input.new_zeros((0,)) + mean = _unsqueeze_to_dim(mean, input.dim() - 1) + invstd = _unsqueeze_to_dim(invstd, input.dim() - 1) + output = (input - mean) * invstd + + if weight is not None: + weight = weight.flatten() + weight = _unsqueeze_to_dim(weight, input.dim() - 1) + output = output * weight + + if bias is not None: + bias = bias.flatten() + bias = _unsqueeze_to_dim(bias, input.dim() - 1) + output = output + bias + + if input.device.type == "cpu": + save_mean = save_mean.to(dtype=input.dtype) + save_rstd = save_rstd.to(dtype=input.dtype) + return ( + output.to(dtype=input.dtype), + save_mean, + save_rstd, + new_running_mean, + new_running_var, + ) + + +@register_decomposition(aten.native_batch_norm) +@out_wrapper("out", "save_mean", "save_invstd") +def native_batch_norm( + input: Tensor, + weight: Optional[Tensor], + bias: Optional[Tensor], + running_mean: Optional[Tensor], + running_var: Optional[Tensor], + training: bool, + momentum: float, + eps: float, +) -> Tuple[Tensor, Tensor, Tensor]: + output, save_mean, save_rstd, _, _ = native_batch_norm_helper( + input, weight, bias, running_mean, running_var, training, momentum, eps, False + ) + return output, save_mean, save_rstd + + +# TODO: this decomposition is NOT here to stay. We would much prefer replacing native_batch_norm +# with our new correctly schema'd _native_batch_norm_legit and its variants, but +# we cannot do that immediately in the C++ because it would be forwards incompatible +# with some mobile use cases. +# +# Since this change is most impactful for aot autograd/functionalization, we simply +# register this decomposition on the Autograd key for the python dispatcher (which is +# currently only used by aot autograd/functionalization and no one else, really). +# In two weeks or so, we should remove this decomposition and phase out the current native_batch_norm +# to be _native_batch_norm_legit and have the right schema (stating that there are input mutations). +@aten.native_batch_norm.default.py_impl(DispatchKey.Autograd) +@aten.native_batch_norm.default.py_impl(DispatchKey.CompositeImplicitAutograd) +def native_batch_norm_decomposition( + input: Tensor, + weight: Optional[Tensor], + bias: Optional[Tensor], + running_mean: Optional[Tensor], + running_var: Optional[Tensor], + training: bool, + momentum: float, + eps: float, +) -> Tuple[Tensor, Tensor, Tensor]: + if running_mean is None and running_var is None: + return aten._native_batch_norm_legit( + input, weight, bias, training, momentum, eps + ) + if running_mean is None: + raise RuntimeError( + "running_mean is None, but running_var is provided. " + "They should both be None or both be provided." 
+ ) + if running_var is None: + raise RuntimeError( + "running_var is None, but running_mean is provided. " + "They should both be None or both be provided." + ) + if training: + # HACK: batch norm consolidation should clean this up so this op doesn't take in a training arg. + return aten._native_batch_norm_legit( + input, weight, bias, running_mean, running_var, training, momentum, eps + ) + else: + return aten._native_batch_norm_legit_no_training( + input, weight, bias, running_mean, running_var, momentum, eps + ) + + +@aten.unsafe_chunk.default.py_impl(DispatchKey.CompositeImplicitAutograd) +def unsafe_chunk_py_impl(tensor, chunks, dim=0) -> List[Tensor]: + dim_size = tensor.size(dim) + split_size = (dim_size + chunks - 1) // chunks + + if split_size == 0 and dim_size == 0: + split_sizes = [split_size for _ in chunks] + split_sizes[chunks - 1] = split_size - (split_size * chunks - dim_size) + return torch.ops.aten.unsafe_split_with_sizes.default(tensor, split_sizes, dim) + return torch.ops.aten.unsafe_split.Tensor(tensor, split_size, dim) + + +@register_decomposition(aten._native_batch_norm_legit_no_training.default) +def _native_batch_norm_legit_no_training( + input: Tensor, + weight: Optional[Tensor], + bias: Optional[Tensor], + running_mean: Tensor, + running_var: Tensor, + momentum: float, + eps: float, +) -> Tuple[Tensor, Tensor, Tensor]: + return aten._native_batch_norm_legit.default( + input, + weight, + bias, + running_mean, + running_var, + False, # training + momentum, + eps, + ) + + +@register_decomposition(aten._native_batch_norm_legit.default) +def _native_batch_norm_legit( + input: Tensor, + weight: Optional[Tensor], + bias: Optional[Tensor], + running_mean: Tensor, + running_var: Tensor, + training: bool, + momentum: float, + eps: float, +) -> Tuple[Tensor, Tensor, Tensor]: + output, save_mean, save_rstd, _, _ = native_batch_norm_helper( + input, weight, bias, running_mean, running_var, training, momentum, eps, False + ) + return output, save_mean, save_rstd + + +@register_decomposition(aten._native_batch_norm_legit.no_stats) +def _native_batch_norm_legit_no_stats( + input: Tensor, + weight: Optional[Tensor], + bias: Optional[Tensor], + training: bool, + momentum: float, + eps: float, +) -> Tuple[Tensor, Tensor, Tensor]: + output, save_mean, save_rstd, _, _ = native_batch_norm_helper( + input, weight, bias, None, None, training, momentum, eps, False + ) + return output, save_mean, save_rstd + + +@register_decomposition(aten._native_batch_norm_legit_functional.default) +def _native_batch_norm_legit_functional( + input: Tensor, + weight: Optional[Tensor], + bias: Optional[Tensor], + running_mean: Tensor, + running_var: Tensor, + training: bool, + momentum: float, + eps: float, +) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]: + ( + output, + save_mean, + save_rstd, + new_running_mean, + new_running_var, + ) = native_batch_norm_helper( + input, weight, bias, running_mean, running_var, training, momentum, eps, True + ) + assert new_running_mean is not None, "new_running_mean should not be None" + assert new_running_var is not None, "new_running_var should not be None" + return output, save_mean, save_rstd, new_running_mean, new_running_var + + +@register_decomposition(aten._fused_dropout) +@out_wrapper("out0", "out1") +@pw_cast_for_opmath +def _fused_dropout_decomposition(input, p, generator=None): + assert generator is None + mask = (torch.rand_like(input) < p).to(dtype=torch.uint8) + res = mask.type_as(input) * input * (1.0 / p) + return (res, mask) + + +def 
device_hint(tensor): + if isinstance(tensor, torch._subclasses.FakeTensor): + return tensor.fake_device + else: + return None + + +@register_decomposition(aten._to_copy) +@out_wrapper() +def _to_copy( + x: Tensor, + *, + dtype: Optional[torch.dtype] = None, + layout=None, + device: Optional[torch.device] = None, + pin_memory: bool = False, + non_blocking: bool = False, + memory_format: Optional[torch.memory_format] = None, +): + assert not layout or layout == torch.strided, "TODO" + assert not pin_memory, "TODO" + if device is None and dtype is None and memory_format is None: + return x.clone() + dtype_converted = False + common_device = device_hint(x) + + if device is not None and device != x.device: + # avoid conversions on cpu + if dtype is not None and device.type == "cpu": + x = torch._prims.convert_element_type(x, dtype) + dtype_converted = True + x = torch._prims.device_put(x, device) + + if dtype is not None and not dtype_converted: + x = torch._prims.convert_element_type(x, dtype) + dtype_converted = True + + if memory_format is not None: # no ref/prim for memory format + return torch.clone(x, memory_format=memory_format) + return x + + +# Questionable decompositions +# This is only valid if we're running the graph without autograd, such as if the backward pass has been traced. +# Note that this decomposition causes issues with in-place ops +@register_decomposition([aten.detach, aten.lift, aten.lift_fresh]) +@out_wrapper() +def nop_decomposition(x): + return aten.alias(x) + + +# Also register to the Autograd dispatch key, so this decomp can run above autograd. +# native_batch_norm needs to decompose into other ops before autograd. +@aten.cudnn_batch_norm.default.py_impl(DispatchKey.Autograd) +@register_decomposition(aten.cudnn_batch_norm) +@out_wrapper("out0", "out1", "out2", "out3") +def cudnn_batch_norm( + input: Tensor, + weight: Tensor, + bias: Optional[Tensor], + running_mean: Optional[Tensor], + running_var: Optional[Tensor], + training: bool, + exponential_average_factor: float, + epsilon: float, +): + a, b, c = aten.native_batch_norm( + input, + weight, + bias, + running_mean, + running_var, + training, + exponential_average_factor, + epsilon, + ) + # Cudnn return running mean and variance when training is True + if training: + return (a, b, c, input.new_zeros((0,), dtype=torch.uint8)) + return ( + a, + weight.new_zeros((0,)), + weight.new_zeros((0,)), + input.new_zeros((0,), dtype=torch.uint8), + ) + + +def _broadcast_batch_norm_backward(x, broadcast_mask): + for axis, mask in enumerate(broadcast_mask): + if mask == 1 and not (axis < x.ndim and x.shape[axis] == broadcast_mask[axis]): + x = x.unsqueeze(axis) + return x + + +@register_decomposition(aten.native_batch_norm_backward.default) +def native_batch_norm_backward( + grad_out: Tensor, + input: Tensor, + weight: Optional[Tensor], + running_mean: Optional[Tensor], + running_var: Optional[Tensor], + save_mean: Optional[Tensor], + save_invstd: Optional[Tensor], + train: bool, + eps: float, + output_mask: List[bool], +) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]: + input_dtype = input.dtype + if weight is not None: + weight_dtype = weight.dtype + else: + weight_dtype = input_dtype + computation_dtype = utils.get_computation_dtype(input.dtype) + ( + grad_out_cast, + input_cast, + weight_cast, + running_mean_cast, + running_var_cast, + save_mean_cast, + save_invstd_cast, + ) = ( + x.to(computation_dtype) if x is not None else x + for x in ( + grad_out, + input, + weight, + running_mean, + running_var, + save_mean, 
+ save_invstd, + ) + ) + input_shape = input.shape + input_rank = input.dim() + assert input_rank >= 2, "rank of the input must be at least 2" + + axis = 1 + num_features = prod(list(input_shape)) / input_shape[axis] + mean = save_mean_cast + invstd = save_invstd_cast + if train: + assert save_mean_cast is not None and save_invstd_cast is not None + else: + assert running_mean_cast is not None and running_var_cast is not None + mean = running_mean_cast + invstd = torch.rsqrt(running_var_cast + eps) + + broadcast_mask: List[int] = [1] * input_rank + broadcast_mask[axis] = input_shape[axis] + + reduction_axes: List[int] = [] + for i in range(input_rank): + if i != axis: + reduction_axes.append(i) + + mean = _broadcast_batch_norm_backward(mean, broadcast_mask) # type: ignore[arg-type] + norm = 1.0 / num_features + grad_output_sum = torch.sum(grad_out_cast, reduction_axes) # type: ignore[arg-type] + dot_p = torch.sum(grad_out_cast * (input_cast - mean), reduction_axes) # type: ignore[operator] + + grad_mean = _broadcast_batch_norm_backward(grad_output_sum * norm, broadcast_mask) + proj_scale = _broadcast_batch_norm_backward(torch.mul(dot_p * norm, invstd * invstd), broadcast_mask) # type: ignore[operator] + + if weight_cast is None: + grad_scale = _broadcast_batch_norm_backward(invstd, broadcast_mask) * 1.0 # type: ignore[arg-type] + else: + grad_scale = _broadcast_batch_norm_backward( + invstd * weight_cast, broadcast_mask + ) + + if train: + proj = (input_cast - mean) * proj_scale # type: ignore[operator] + grad_input = ((grad_out_cast - proj) - grad_mean) * grad_scale + else: + grad_input = grad_out_cast * grad_scale + + if output_mask[1]: + grad_weight = dot_p * invstd + else: + grad_weight = None # "None" doesn't work with vjp, should use zeros for vjp + + if output_mask[2]: + grad_bias = grad_output_sum + else: + grad_bias = None # "None" doesn't work with vjp, should use zeros for vjp + + return ( + grad_input.to(input_dtype), + _maybe_cast(grad_weight, weight_dtype), + _maybe_cast(grad_bias, weight_dtype), + ) + + +# out_wrapper currently does not allow optional outputs +@register_decomposition(aten.native_batch_norm_backward.out) +def native_batch_norm_backward_out( + grad_out: Tensor, + input: Tensor, + weight: Optional[Tensor], + running_mean: Optional[Tensor], + running_var: Optional[Tensor], + save_mean: Optional[Tensor], + save_invstd: Optional[Tensor], + train: bool, + eps: float, + output_mask: List[bool], + *, + out0: torch.Tensor, + out1: torch.Tensor, + out2: torch.Tensor, +) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]: + result = native_batch_norm_backward( + grad_out, + input, + weight, + running_mean, + running_var, + save_mean, + save_invstd, + train, + eps, + output_mask, + ) + grad_input = (out0, out1, out2) + for i, r in enumerate(result): + if r is not None: + _maybe_resize_out(grad_input[i], r.shape) + _safe_copy_out(copy_from=r, copy_to=grad_input[i], exact_dtype=True) + + return grad_input + + +@register_decomposition(aten.cudnn_batch_norm_backward) +@out_wrapper("out0", "out1", "out2") +def cudnn_batch_norm_backward( + input: Tensor, + grad_output: Tensor, + weight: Tensor, + running_mean: Optional[Tensor], + running_var: Optional[Tensor], + save_mean: Optional[Tensor], + save_var: Optional[Tensor], + epsilon: float, + reserveSpace: Tensor, +): + return aten.native_batch_norm_backward( + grad_output, + input, + weight, + running_mean, + running_var, + save_mean, + save_var, + True, + epsilon, + [True, True, True], + ) + + 
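+# Note on the decomposition below: when the input spatial sizes are divisible
+# by the requested output sizes, every pooling window has the same shape, so
+# adaptive average pooling reduces to a plain avg_pool2d (the fast path at the
+# top of the function).  A hedged sketch of that equivalence, with values
+# chosen purely for illustration:
+#
+#   >>> x = torch.arange(16.).reshape(1, 1, 4, 4)
+#   >>> torch.allclose(
+#   ...     torch.nn.functional.adaptive_avg_pool2d(x, (2, 2)),
+#   ...     torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2),
+#   ... )
+#   True
+#
+# The general (non-divisible) case instead gathers per-window indices and
+# masks out positions that fall beyond the shorter windows.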
+@register_decomposition(aten._adaptive_avg_pool2d) +@out_wrapper() +@pw_cast_for_opmath +def adaptive_avg_pool2d(input: Tensor, output_size: Tuple[int, int]): + # Preconditions + device = input.device + shape = input.shape + ndim = len(shape) + torch._check( + ndim in (3, 4), + lambda: f"adaptive_avg_pool2d(): Expected 3D or 4D tensor, but got {ndim}", + ) + for d in input.shape[-2:]: + torch._check( + d != 0, + lambda: "adaptive_avg_pool2d(): Expected input to have non-zero size for " + f"non-batch dimensions, but input has shape {tuple(shape)}.", + ) + + # Optimisation (we should also do this in the kernel implementation) + if shape[-2] % output_size[-2] == 0 and shape[-1] % output_size[-1] == 0: + stride = tuple(i // o for i, o in zip(shape[-2:], output_size)) + kernel = tuple( + i - (o - 1) * s for i, o, s in zip(shape[-2:], output_size, stride) + ) + return torch.nn.functional.avg_pool2d(input, kernel, stride) + + def start_index(a, b, c): + return torch.div(a * c, b, rounding_mode="trunc") + + def end_index(a, b, c): + return torch.div((a + 1) * c + b - 1, b, rounding_mode="trunc") + + def compute_idx(in_size, out_size): + orange = torch.arange(out_size, device=device, dtype=torch.int64) + i0 = start_index(orange, out_size, in_size) + # Let length = end_index - start_index, i.e. the length of the pooling kernels + # length.max() can be computed analytically as follows: + maxlength = in_size // out_size + 1 + in_size_mod = in_size % out_size + # adaptive = True iff there are kernels with different lengths + adaptive = not (in_size_mod == 0 or out_size % in_size_mod == 0) + if adaptive: + maxlength += 1 + elif in_size_mod == 0: + maxlength -= 1 + + range_max = torch.arange(maxlength, device=device, dtype=torch.int64) + idx = i0.unsqueeze(-1) + range_max + if adaptive: + # Need to clamp to avoid accessing out-of-bounds memory + # TODO make minimum accept scalars + maxval = torch.scalar_tensor( + in_size - 1, dtype=idx.dtype, device=idx.device + ) + idx = torch.minimum(idx, maxval) + + # Compute the length + i1 = end_index(orange, out_size, in_size) + length = i1 - i0 + else: + length = maxlength + return idx, length, range_max, adaptive + + # length is not None if it's constant, otherwise we'll need to compute it + idxh, length_h, range_max_h, adaptive_h = compute_idx(shape[-2], output_size[-2]) + idxw, length_w, range_max_w, adaptive_w = compute_idx(shape[-1], output_size[-1]) + + vals = input[..., _unsqueeze_to_dim(idxh, 4), idxw] + # Shortcut for the simpler case + if not adaptive_h and not adaptive_w: + return torch.mean(vals, dim=(-3, -1)) + + def maybe_mask(vals, length, range_max, adaptive, dim): + if isinstance(length, IntLike): + return vals, length + else: + # zero-out the things we didn't really want to select + assert dim < 0 + # hack + mask = range_max >= length.unsqueeze(-1) + if dim == -2: + mask = _unsqueeze_to_dim(mask, 4) + vals = torch.masked_fill(vals, mask, 0.0) + # Compute the length of each window + length = _unsqueeze_to_dim(length, -dim) + return vals, length + + vals, length_h = maybe_mask( + vals, length_h, range_max_h, adaptive=adaptive_h, dim=-2 + ) + vals, length_w = maybe_mask( + vals, length_w, range_max_w, adaptive=adaptive_w, dim=-1 + ) + + # We unroll the sum as we assume that the kernels are going to be small + ret = None + for i, j in product(range(vals.shape[-3]), range(vals.shape[-1])): + if ret is None: + ret = vals[..., i, :, j] + else: + ret = ret + vals[..., i, :, j] + return ret / (length_h * length_w) + + 
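+# Note on the index_add decompositions below: they lower to index_put with
+# accumulate=True, so repeated indices sum their contributions, matching the
+# eager op.  An informal example of the semantics being reproduced (values
+# chosen purely for illustration):
+#
+#   >>> x = torch.zeros(3, 2)
+#   >>> x.index_add(0, torch.tensor([0, 0, 2]), torch.ones(3, 2))
+#   tensor([[2., 2.],
+#           [0., 0.],
+#           [1., 1.]])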
+@register_decomposition(aten.index_add_) +def index_add_( + x: TensorLike, + dim: int, + index: TensorLike, + tensor: TensorLike, + *, + alpha: NumberType = 1, +): + return _index_add(x, dim, index, tensor, inplace=True, alpha=alpha) + + +@register_decomposition(aten.index_add) +@out_wrapper() +def index_add( + x: TensorLike, + dim: int, + index: TensorLike, + tensor: TensorLike, + *, + alpha: NumberType = 1, +): + return _index_add(x, dim, index, tensor, inplace=False, alpha=alpha) + + +def _index_add( + x: TensorLike, + dim: int, + index: TensorLike, + tensor: TensorLike, + *, + inplace: bool, + alpha: NumberType = 1, +): + dim = utils.canonicalize_dims(x.ndim, dim) + torch._check( + index.ndim <= 1, + lambda: f"Index should have dimension 1 or 0 (got {index.ndim})", + ) + index_size = index.size(0) if index.ndim == 1 else 1 + tensor_size = tensor.size(dim) if tensor.ndim > 0 else 1 + torch._check( + tensor_size == index_size, + lambda: f"Number of indices ({index_size}) should be equal to tensor.size(dim) ({tensor_size}), for {dim=}", + ) + if alpha != 1: + python_type = utils.dtype_to_type(x.dtype) + torch._check( + python_type == bool + or utils.is_weakly_lesser_type(type(alpha), python_type), + lambda: f"alpha argument of type {type(alpha)} cannot be safely cast to type {python_type}!", + ) + tensor = tensor * alpha + # Treat scalars as elements of \R^1 + zero_dim = x.ndim == 0 + x1 = x.unsqueeze(0) if zero_dim else x + idx = (None,) * dim + (index,) + index_put = aten.index_put_ if inplace else aten.index_put + out = index_put(x1, idx, tensor, accumulate=True) + if inplace: + return x + else: + return out.squeeze(0) if zero_dim else out.contiguous() + + +@register_decomposition(aten.pad_sequence.default) +@aten.pad_sequence.default.py_impl(DispatchKey.CompositeImplicitAutograd) +def pad_sequence(sequences, batch_first=False, padding_value=0.0): + torch._check(len(sequences) > 0, lambda: "received an empty list of sequences") + sequences_size = len(sequences) + max_size = sequences[0].size() + trailing_dims = max_size[1:] + max_len = max(x.size(0) for x in sequences) + if batch_first: + out_dims = (sequences_size, max_len) + else: + out_dims = (max_len, sequences_size) + out_dims = out_dims + trailing_dims + out = sequences[0].new_full(out_dims, padding_value) + dim_paddings = (0, 0) * len(trailing_dims) + for i in range(sequences_size): + currseq = sequences[i] + row = aten.constant_pad_nd( + currseq, dim_paddings + (0, max_len - currseq.size(0)), padding_value + ) + if batch_first: + out = aten.select_scatter(out, row, dim=0, index=i) + else: + out = aten.select_scatter(out, row, dim=1, index=i) + return out + + +@register_decomposition(aten.index_copy_) +def index_copy_(x: TensorLike, dim: int, index: TensorLike, tensor: TensorLike): + return _index_copy(x, dim, index, tensor, inplace=True) + + +@register_decomposition(aten.index_copy) +@out_wrapper() +def index_copy(x: TensorLike, dim: int, index: TensorLike, tensor: TensorLike): + return _index_copy(x, dim, index, tensor, inplace=False) + + +def _index_copy( + x: TensorLike, dim: int, index: TensorLike, tensor: TensorLike, *, inplace: bool +): + dim = utils.canonicalize_dims(x.ndim, dim) + torch._check( + index.ndim <= 1, + lambda: f"Index should have dimension 1 or 0 (got {index.ndim})", + ) + # Treat scalars as elements of \R^1 + zero_dim = x.ndim == 0 + x1 = x.unsqueeze(0) if zero_dim else x + index = index.unsqueeze(0) if index.ndim == 0 else index + idx = (None,) * dim + (index,) + index_put = aten.index_put_ if inplace 
else aten.index_put + out = index_put(x1, idx, tensor) + if inplace: + return x + else: + return out.squeeze(0) if zero_dim else out.contiguous() + + +# nb: Should use acc_t, not op_math +@register_decomposition(aten.log_sigmoid_forward) +@out_wrapper("output", "buffer") +@pw_cast_for_opmath +def log_sigmoid_forward(self: Tensor) -> Tuple[Tensor, Tensor]: + min = torch.minimum(self.new_zeros(()), self) + z = torch.exp(-torch.abs(self)) + if self.is_cuda: + buffer = self.new_zeros((0,)) + else: + buffer = z + return min - torch.log1p(z), buffer + + +@register_decomposition(aten.uniform) +@out_wrapper() +def uniform( + x: Tensor, + low: Union[bool, int, float] = 0.0, + high: Union[bool, int, float] = 1.0, + generator: Optional[torch.Generator] = None, +): + return prims._uniform_helper( + x.shape, + low=sym_float(low), + high=sym_float(high), + dtype=x.dtype, + device=x.device, + generator=generator, + ) + + +@register_decomposition(aten.uniform_) +def uniform_(self, low=0, high=1, generator=None): + return self.copy_(uniform(self, low, high, generator)) + + +# aten/src/ATen/native/UpSample.cpp compute_output_size +def upsample_compute_output_size(input_size, output_size, scale_factors): + spatial_dimensions = len(input_size) - 2 + if output_size is not None: + torch._check( + scale_factors is None, + lambda: "Must specify exactly one of output_size and scale_factors", + ) + torch._check(len(output_size) == spatial_dimensions, lambda: "") + return output_size + if scale_factors is not None: + # NB: this isn't necessary lol + torch._check( + output_size is None, + lambda: "Must specify exactly one of output_size and scale_factors", + ) + torch._check(len(scale_factors) == spatial_dimensions, lambda: "") + output_size = [] + for i, s in enumerate(scale_factors): + if int(s) == s: + output_size.append(input_size[i + 2] * int(s)) + else: + output_size.append(sym_int(input_size[i + 2] * s)) + return output_size + torch._check( + False, lambda: "Must specify exactly one of output_size and scale_factors" + ) + + +def get_scale_value(scales, idx): + if scales is None: + return None + return scales[idx] + + +@register_decomposition(aten.upsample_nearest1d.vec) +@aten.upsample_nearest1d.vec.py_impl(DispatchKey.CompositeImplicitAutograd) +@aten.upsample_nearest1d.vec.py_impl(DispatchKey.Autograd) +def upsample_nearest1d_vec(input, output_size, scale_factors): + osize = upsample_compute_output_size(input.size(), output_size, scale_factors) + scale = get_scale_value(scale_factors, 0) + + return aten.upsample_nearest1d.default(input, osize, scale) + + +@register_decomposition(aten._upsample_nearest_exact1d.vec) +@aten._upsample_nearest_exact1d.vec.py_impl(DispatchKey.CompositeImplicitAutograd) +@aten._upsample_nearest_exact1d.vec.py_impl(DispatchKey.Autograd) +def _upsample_nearest_exact1d_vec(input, output_size, scale_factors): + osize = upsample_compute_output_size(input.size(), output_size, scale_factors) + scale = get_scale_value(scale_factors, 0) + + return aten._upsample_nearest_exact1d.default(input, osize, scale) + + +@register_decomposition(aten.upsample_nearest2d.vec) +@aten.upsample_nearest2d.vec.py_impl(DispatchKey.CompositeImplicitAutograd) +@aten.upsample_nearest2d.vec.py_impl(DispatchKey.Autograd) +def upsample_nearest2d_vec(input, output_size, scale_factors): + osize = upsample_compute_output_size(input.size(), output_size, scale_factors) + scale_h = get_scale_value(scale_factors, 0) + scale_w = get_scale_value(scale_factors, 1) + + return aten.upsample_nearest2d.default(input, osize, 
scale_h, scale_w) + + +@register_decomposition(aten._upsample_nearest_exact2d.vec) +@aten._upsample_nearest_exact2d.vec.py_impl(DispatchKey.CompositeImplicitAutograd) +@aten._upsample_nearest_exact2d.vec.py_impl(DispatchKey.Autograd) +def _upsample_nearest_exact2d_vec(input, output_size, scale_factors): + osize = upsample_compute_output_size(input.size(), output_size, scale_factors) + scale_h = get_scale_value(scale_factors, 0) + scale_w = get_scale_value(scale_factors, 1) + + return aten._upsample_nearest_exact2d.default(input, osize, scale_h, scale_w) + + +@register_decomposition(aten.upsample_nearest3d.vec) +@aten.upsample_nearest3d.vec.py_impl(DispatchKey.CompositeImplicitAutograd) +@aten.upsample_nearest3d.vec.py_impl(DispatchKey.Autograd) +def upsample_nearest3d_vec(input, output_size, scale_factors): + osize = upsample_compute_output_size(input.size(), output_size, scale_factors) + scale_d = get_scale_value(scale_factors, 0) + scale_h = get_scale_value(scale_factors, 1) + scale_w = get_scale_value(scale_factors, 2) + + return aten.upsample_nearest3d.default(input, osize, scale_d, scale_h, scale_w) + + +@register_decomposition(aten._upsample_nearest_exact3d.vec) +@aten._upsample_nearest_exact3d.vec.py_impl(DispatchKey.CompositeImplicitAutograd) +@aten._upsample_nearest_exact3d.vec.py_impl(DispatchKey.Autograd) +def _upsample_nearest_exact3d_vec(input, output_size, scale_factors): + osize = upsample_compute_output_size(input.size(), output_size, scale_factors) + scale_d = get_scale_value(scale_factors, 0) + scale_h = get_scale_value(scale_factors, 1) + scale_w = get_scale_value(scale_factors, 2) + + return aten._upsample_nearest_exact3d.default( + input, osize, scale_d, scale_h, scale_w + ) + + +def _compute_upsample_nearest_indices(input, output_size, scales, exact=False): + # For each dim in output_size, compute the set of input indices used + # to produce the upsampled output. 
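+    # Hedged example (sizes chosen purely for illustration): upsampling a
+    # dimension of size 2 to size 4 gives scale = 2 / 4 = 0.5, so the gathered
+    # input indices are floor(([0, 1, 2, 3] + offset) * 0.5) = [0, 0, 1, 1],
+    # i.e. each source element is repeated twice.  For integer upsampling
+    # factors exact=False and exact=True agree; they differ for fractional
+    # scales (and when downsampling), where the half-pixel offset changes
+    # which source element an output pixel snaps to.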
+ indices = [] + num_spatial_dims = len(output_size) + offset = 0.5 if exact else 0.0 + + for d in range(num_spatial_dims): + # Math matches aten/src/ATen/native/cpu/UpSampleKernel.cpp + # + # Indices are computed as following: + # scale = isize / osize + # Case: exact=False + # input_index = floor(output_index * scale) + # Same as OpenCV INTER_NEAREST + # + # Case: exact=False + # index_f32 = (output_index + 0.5) * scale - 0.5 + # input_index = round(index_f32) + # Same as Pillow and Scikit-Image/Scipy ndi.zoom + osize = output_size[d] + isize = input.shape[-num_spatial_dims + d] + scale = isize / (isize * scales[d]) if scales[d] is not None else isize / osize + + output_indices = torch.arange(osize, dtype=torch.float32, device=input.device) + input_indices = ((output_indices + offset) * scale).to(torch.int64) + for _ in range(num_spatial_dims - 1 - d): + input_indices = input_indices.unsqueeze(-1) + indices.append(input_indices) + return tuple(indices) + + +@register_decomposition(aten.upsample_nearest1d.default) +@aten.upsample_nearest1d.default.py_impl(DispatchKey.Autograd) +@pw_cast_for_opmath +def upsample_nearest1d( + input: Tensor, + output_size: List[int], + scales: Optional[float] = None, +) -> Tensor: + (l_indices,) = _compute_upsample_nearest_indices(input, output_size, (scales,)) + return aten._unsafe_index(input, (None, None, l_indices)) + + +@register_decomposition(aten._upsample_nearest_exact1d.default) +@aten._upsample_nearest_exact1d.default.py_impl(DispatchKey.Autograd) +@pw_cast_for_opmath +def _upsample_nearest_exact1d( + input: Tensor, + output_size: List[int], + scales: Optional[float] = None, +) -> Tensor: + (l_indices,) = _compute_upsample_nearest_indices( + input, output_size, (scales,), exact=True + ) + return aten._unsafe_index(input, (None, None, l_indices)) + + +def _upsample_nearest2d_common(input, h_indices, w_indices): + result = aten._unsafe_index(input, (None, None, h_indices, w_indices)) + + # convert output to correct memory format, if necessary + memory_format = utils.suggest_memory_format(input) + + # following "heuristic: only use channels_last path when it's faster than the contiguous path" + _, n_channels, _, _ = input.shape + if input.device.type == "cuda" and n_channels < 4: + memory_format = torch.contiguous_format + + result = result.contiguous(memory_format=memory_format) + return result + + +@register_decomposition(aten.upsample_nearest2d.default) +@aten.upsample_nearest2d.default.py_impl(DispatchKey.Autograd) +@pw_cast_for_opmath +def upsample_nearest2d( + input: Tensor, + output_size: List[int], + scales_h: Optional[float] = None, + scales_w: Optional[float] = None, +) -> Tensor: + h_indices, w_indices = _compute_upsample_nearest_indices( + input, output_size, (scales_h, scales_w) + ) + return _upsample_nearest2d_common(input, h_indices, w_indices) + + +@register_decomposition(aten._upsample_nearest_exact2d.default) +@aten._upsample_nearest_exact2d.default.py_impl(DispatchKey.Autograd) +@pw_cast_for_opmath +def _upsample_nearest_exact2d( + input: Tensor, + output_size: List[int], + scales_h: Optional[float] = None, + scales_w: Optional[float] = None, +) -> Tensor: + h_indices, w_indices = _compute_upsample_nearest_indices( + input, output_size, (scales_h, scales_w), exact=True + ) + return _upsample_nearest2d_common(input, h_indices, w_indices) + + +@register_decomposition(aten.upsample_nearest3d.default) +@aten.upsample_nearest3d.default.py_impl(DispatchKey.Autograd) +@pw_cast_for_opmath +def upsample_nearest3d( + input: Tensor, + 
output_size: List[int], + scales_d: Optional[float] = None, + scales_h: Optional[float] = None, + scales_w: Optional[float] = None, +) -> Tensor: + d_indices, h_indices, w_indices = _compute_upsample_nearest_indices( + input, output_size, (scales_d, scales_h, scales_w) + ) + result = aten._unsafe_index(input, (None, None, d_indices, h_indices, w_indices)) + + return result + + +@register_decomposition(aten._upsample_nearest_exact3d.default) +@aten._upsample_nearest_exact3d.default.py_impl(DispatchKey.Autograd) +@pw_cast_for_opmath +def _upsample_nearest_exact3d( + input: Tensor, + output_size: List[int], + scales_d: Optional[float] = None, + scales_h: Optional[float] = None, + scales_w: Optional[float] = None, +) -> Tensor: + d_indices, h_indices, w_indices = _compute_upsample_nearest_indices( + input, output_size, (scales_d, scales_h, scales_w), exact=True + ) + result = aten._unsafe_index(input, (None, None, d_indices, h_indices, w_indices)) + + return result + + +def gather_params(params, has_biases, has_projections): + if has_biases and has_projections: + group_size = 5 + elif has_biases: + group_size = 4 + elif has_projections: + group_size = 3 + else: + group_size = 2 + + assert len(params) % group_size == 0, len(params) + return [ + tuple(params[i : i + group_size]) for i in range(0, len(params), group_size) + ] + + +def params_hiddens(params, hiddens, i, bidirectional): + if bidirectional: + cur_params, cur_hidden = params[2 * i], hiddens[2 * i] + bidir_params, bidir_hidden = params[2 * i + 1], hiddens[2 * i + 1] + else: + cur_params, cur_hidden = params[i], hiddens[i] + bidir_params, bidir_hidden = None, None + + return cur_params, cur_hidden, bidir_params, bidir_hidden + + +def update_hidden_for_packed(cur_hidden, last_batch_size, batch_size, hiddens): + assert last_batch_size > batch_size + hiddens.append(cur_hidden.narrow(0, batch_size, last_batch_size - batch_size)) + return cur_hidden.narrow(0, 0, batch_size) + + +def update_hidden_for_packed_reverse( + cur_hidden, last_batch_size, batch_size, inp_hidden +): + if last_batch_size == batch_size: + return cur_hidden + assert last_batch_size < batch_size + return torch.concat( + ( + cur_hidden, + inp_hidden.narrow(0, last_batch_size, batch_size - last_batch_size), + ) + ) + + +def one_layer_rnn_data( + inp, hidden, params, has_biases, hidden_fn, batch_sizes, reverse=False +): + ih_weight = params[0] + hh_weight = params[1] + ih_bias = params[2] if has_biases else None + hh_bias = params[3] if has_biases else None + + step_output = [] + hiddens: List[torch.Tensor] = [] + + last_batch_size = batch_sizes[-1] if reverse else batch_sizes[0] + cur_hidden = hidden.narrow(0, 0, last_batch_size) + split_inp = torch.split(inp, list(batch_sizes)) + if reverse: + split_inp = split_inp[::-1] + for inp in split_inp: + i = inp.shape[0] + + if last_batch_size == i: + pass # don't update cur_hidden + # this will only happen when reverse=False, since batch sizes are sorted largest -> smallest + elif reverse: + cur_hidden = update_hidden_for_packed_reverse( + cur_hidden, last_batch_size, i, hidden + ) + else: + cur_hidden = update_hidden_for_packed( + cur_hidden, last_batch_size, i, hiddens + ) + + cur_hidden = hidden_fn(inp, cur_hidden, ih_weight, ih_bias, hh_weight, hh_bias) + last_batch_size = i + step_output.append(cur_hidden) + + if reverse: + step_output.reverse() + else: + hiddens.append(cur_hidden) + hiddens.reverse() + + out = torch.cat(step_output, 0) + hidden_out = torch.cat(hiddens, 0) if not reverse else cur_hidden + return out, 
hidden_out + + +def rnn_cell(nonlinearity): + def inner(i, cur_hidden, ih_weight, ih_bias, hh_weight, hh_bias): + return nonlinearity(F.linear(cur_hidden, hh_weight, hh_bias) + i) + + return inner + + +def rnn_cell_data(nonlinearity): + def inner(i, cur_hidden, ih_weight, ih_bias, hh_weight, hh_bias): + i = F.linear(i, ih_weight, ih_bias) + return nonlinearity(F.linear(cur_hidden, hh_weight, hh_bias) + i) + + return inner + + +def one_layer_rnn(inp, hidden, params, has_biases, hidden_fn, reverse=False): + ih_weight = params[0] + hh_weight = params[1] + ih_bias = params[2] if has_biases else None + hh_bias = params[3] if has_biases else None + + precomputed_input = F.linear(inp, ih_weight, ih_bias) + precomputed_input = precomputed_input.flip(0) if reverse else precomputed_input + cur_hidden = hidden.unsqueeze(0) + step_output = [] + for i in precomputed_input: + cur_hidden = hidden_fn(i, cur_hidden, ih_weight, ih_bias, hh_weight, hh_bias) + step_output.append(cur_hidden) + + if reverse: + step_output.reverse() + + out = torch.cat(step_output, 0) + + return out, cur_hidden.squeeze(0) + + +def mkldnn_one_layer_lstm(inp, hidden, params, has_biases, reverse=False): + w0 = params[0] + w1 = params[1] + if has_biases: + w2 = params[2] + w3 = params[3] + else: + w2 = torch.zeros(w0.size()) + w3 = torch.zeros(w1.size()) + + hx = hidden[0].unsqueeze(0) + cx = hidden[1].unsqueeze(0) + + batch_sizes: List[int] = [] + mode = 2 # third_party/ideep/include/ideep/abstract_types.hpp: ideep::rnn_kind::LSTM = 2 + hidden_size = hx.size(2) + num_layers = 1 + + # _rnn_helper already handles bidirectional and batch_first so we hard-code them to False here + bidirectional = False + batch_first = False + + train = False + # If batch_first, inp has been permuted in _rnn_helper. Convert to contiguous here. 
+ # Same as aten/src/ATen/native/mkldnn/RNN.cpp: mkldnn_rnn: input = input.contiguous(); + inp = inp.contiguous() + hx = hx.contiguous() + cx = cx.contiguous() + outputs = torch.ops.aten.mkldnn_rnn_layer.default( + inp, + w0, + w1, + w2, + w3, + hx, + cx, + reverse, + batch_sizes, + mode, + hidden_size, + num_layers, + has_biases, + bidirectional, + batch_first, + train, + ) + y, hy, cy = outputs[0], outputs[1], outputs[2] + return y, (hy.squeeze(0), cy.squeeze(0)) + + +def _rnn_helper( + input, + hidden, + params, + has_biases, + num_layers, + dropout, + train, + bidirectional, + batch_first, + layer_fn, +): + input = input.transpose(0, 1) if batch_first else input + final_hiddens = [] + + for i in range(num_layers): + cur_params, cur_hidden, bidir_params, bidir_hidden = params_hiddens( + params, hidden, i, bidirectional + ) + dropout = dropout if (train and num_layers < i - 1) else 0.0 + fwd_inp, fwd_hidden = layer_fn(input, cur_hidden, cur_params, has_biases) + final_hiddens.append(fwd_hidden) + + if bidirectional: + bwd_inp, bwd_hidden = layer_fn( + input, bidir_hidden, bidir_params, has_biases, reverse=True + ) + final_hiddens.append(bwd_hidden) + + if bidirectional: + input = torch.cat([fwd_inp, bwd_inp], fwd_inp.dim() - 1) # type: ignore[possibly-undefined] + else: + input = fwd_inp + + if dropout != 0 and train and i < num_layers - 1: + input = torch.dropout(input, dropout, train=True) + + input = input.transpose(0, 1) if batch_first else input + return input, final_hiddens + + +@register_decomposition(aten.rnn_tanh.input) +@aten.rnn_tanh.input.py_impl(DispatchKey.CompositeImplicitAutograd) +@aten.rnn_tanh.input.py_impl(DispatchKey.Autograd) +def rnn_tanh_input( + input, + hx, + params, + has_biases, + num_layers, + dropout, + train, + bidirectional, + batch_first, +): + hidden = hx.unbind(0) + params = gather_params(params, has_biases, False) + out, final_hiddens = _rnn_helper( + input, + hidden, + params, + has_biases, + num_layers, + dropout, + train, + bidirectional, + batch_first, + partial(one_layer_rnn, hidden_fn=rnn_cell(torch.tanh)), + ) + return out, torch.stack(final_hiddens, 0) + + +@register_decomposition(aten.rnn_relu.input) +@aten.rnn_relu.input.py_impl(DispatchKey.CompositeImplicitAutograd) +@aten.rnn_relu.input.py_impl(DispatchKey.Autograd) +def rnn_relu_input( + input, + hx, + params, + has_biases, + num_layers, + dropout, + train, + bidirectional, + batch_first, +): + hidden = hx.unbind(0) + params = gather_params(params, has_biases, False) + out, final_hiddens = _rnn_helper( + input, + hidden, + params, + has_biases, + num_layers, + dropout, + train, + bidirectional, + batch_first, + partial(one_layer_rnn, hidden_fn=rnn_cell(torch.relu)), + ) + return out, torch.stack(final_hiddens, 0) + + +@register_decomposition(aten.rnn_relu.data) +@aten.rnn_relu.data.py_impl(DispatchKey.CompositeImplicitAutograd) +@aten.rnn_relu.data.py_impl(DispatchKey.Autograd) +def rnn_relu_data( + data, + batch_sizes, + hx, + params, + has_biases, + num_layers, + dropout, + train, + bidirectional, +): + hidden = hx.unbind(0) + params = gather_params(params, has_biases, False) + out, final_hiddens = _rnn_helper( + data, + hidden, + params, + has_biases, + num_layers, + dropout, + train, + bidirectional, + False, + partial( + one_layer_rnn_data, + batch_sizes=batch_sizes, + hidden_fn=rnn_cell_data(torch.relu), + ), + ) + return out, torch.stack(final_hiddens, 0) + + +@register_decomposition(aten.rnn_tanh.data) +@aten.rnn_tanh.data.py_impl(DispatchKey.CompositeImplicitAutograd) 
+@aten.rnn_tanh.data.py_impl(DispatchKey.Autograd) +def rnn_tanh_data( + data, + batch_sizes, + hx, + params, + has_biases, + num_layers, + dropout, + train, + bidirectional, +): + hidden = hx.unbind(0) + params = gather_params(params, has_biases, False) + out, final_hiddens = _rnn_helper( + data, + hidden, + params, + has_biases, + num_layers, + dropout, + train, + bidirectional, + False, + partial( + one_layer_rnn_data, + batch_sizes=batch_sizes, + hidden_fn=rnn_cell_data(torch.tanh), + ), + ) + return out, torch.stack(final_hiddens, 0) + + +def lstm_cell(inp, hx, cx, hh_weight, hh_bias, hr_weight, chunk_dim): + gates = F.linear(hx, hh_weight, hh_bias) + inp + chunked_gates = gates.chunk(4, chunk_dim) + in_gate = chunked_gates[0].sigmoid() + forget_gate = chunked_gates[1].sigmoid() + cell_gate = chunked_gates[2].tanh() + out_gate = chunked_gates[3].sigmoid() + cy = forget_gate * cx + (in_gate * cell_gate) + hy = out_gate * cy.tanh() + hy = hy if hr_weight is None else F.linear(hy, hr_weight, None) + + return hy, cy + + +def one_layer_lstm(inp, hidden, params, has_biases, reverse=False): + ih_weight = params[0] + hh_weight = params[1] + ih_bias = params[2] if has_biases else None + hh_bias = params[3] if has_biases else None + hr_weight = ( + params[4] if len(params) == 5 else params[2] if len(params) == 3 else None + ) + + hx = hidden[0].unsqueeze(0) + cx = hidden[1].unsqueeze(0) + + precomputed_input = F.linear(inp, ih_weight, ih_bias) + precomputed_input = precomputed_input.flip(0) if reverse else precomputed_input + step_output = [] + for inp in precomputed_input: + hx, cx = lstm_cell(inp, hx, cx, hh_weight, hh_bias, hr_weight, chunk_dim=2) + step_output.append(hx) + + if reverse: + step_output.reverse() + + out = torch.cat(step_output, 0) + + return out, (hx.squeeze(1), cx.squeeze(1)) + + +def one_layer_lstm_data(inp, hidden, params, has_biases, batch_sizes, reverse=False): + ih_weight = params[0] + hh_weight = params[1] + ih_bias = params[2] if has_biases else None + hh_bias = params[3] if has_biases else None + hr_weight = ( + params[4] if len(params) == 5 else params[2] if len(params) == 3 else None + ) + + step_output = [] + hiddens = [] + + last_batch_size = batch_sizes[-1] if reverse else batch_sizes[0] + split_inp = torch.split(inp, list(batch_sizes)) + if reverse: + split_inp = split_inp[::-1] + + orig_hx = hidden[0] + orig_cx = hidden[1] + hx, cx = orig_hx.narrow(0, 0, last_batch_size), orig_cx.narrow( + 0, 0, last_batch_size + ) + + for inp in split_inp: + i = inp.shape[0] + inp = F.linear(inp, ih_weight, ih_bias) + + # this will only happen when reverse=False, since batch sizes are sorted largest -> smallest + if i < last_batch_size: + hiddens.append( + ( + hx.narrow(0, i, last_batch_size - i), + cx.narrow(0, i, last_batch_size - i), + ) + ) + hx, cx = hx.narrow(0, 0, i), cx.narrow(0, 0, i) + + # this will only happen when reverse=True + if i > last_batch_size: + hx = torch.concat( + (hx, orig_hx.narrow(0, last_batch_size, i - last_batch_size)), 0 + ) + cx = torch.concat( + (cx, orig_cx.narrow(0, last_batch_size, i - last_batch_size)), 0 + ) + + hx, cx = lstm_cell(inp, hx, cx, hh_weight, hh_bias, hr_weight, chunk_dim=1) + last_batch_size = i + step_output.append(hx) + + if reverse: + step_output.reverse() + hidden_out = (hx, cx) + else: + hiddens.append((hx, cx)) + hiddens.reverse() + hidden0, hidden1 = zip(*hiddens) + hidden_out = torch.cat(hidden0, 0), torch.cat(hidden1, 0) + + out = torch.cat(step_output, 0) + return out, hidden_out + + +def 
select_one_layer_lstm_function(input, hx, params): + r"""Check whether we could use decompose lstm with mkldnn_rnn_layer. + All the below conditions need to be met: + * ``torch._C._get_mkldnn_enabled()`` returns ``True``. + * All the input args are on CPU. + * The dtypes of args are either torch.float or torch.bfloat16. + * Inference. + * ``has_projections`` returns ``False``. + + Args: + * input: the input sequence to LSTM + * hx: a tuple of the input hidden state and cell state ``(h_0, c_0)`` to LSTM + * params: the weight and bias tensors of LSTM + """ + + def use_mkldnn(input, hx, params): + if not torch._C._get_mkldnn_enabled(): + return False + + tensors = [input] + list(hx) + list(chain.from_iterable(params)) + devices = {t.device for t in tensors} + if len(devices) != 1: + return False + + device = devices.pop() + if device != torch.device("cpu"): + return False + # With autocast, possible to have mixed dtype here + dtypes = {t.dtype for t in tensors} + for dtype in dtypes: + if dtype not in [torch.float, torch.bfloat16]: + return False + + if input.requires_grad: + return False + + has_projections = hx[0].size(2) != hx[1].size(2) + if has_projections: + return False + + return True + + # mkldnn_one_layer_lstm does not depend on seq_len while one_layer_lstm + # will expand over the seq_len dim + if use_mkldnn(input, hx, params): + return mkldnn_one_layer_lstm + else: + return one_layer_lstm + + +@register_decomposition(aten.lstm.input) +@aten.lstm.input.py_impl(DispatchKey.CompositeImplicitAutograd) +@aten.lstm.input.py_impl(DispatchKey.Autograd) +def lstm_impl( + input, + hx, + params, + has_biases, + num_layers, + dropout, + train, + bidirectional, + batch_first, +): + assert len(hx) == 2, "lstm expects two hidden states" + params = gather_params(params, has_biases, hx[0].size(2) != hx[1].size(2)) + hidden = list(zip(hx[0], hx[1])) + layer_fn = select_one_layer_lstm_function(input, hx, params) + out, final_hiddens = _rnn_helper( + input, + hidden, + params, + has_biases, + num_layers, + dropout, + train, + bidirectional, + batch_first, + layer_fn, + ) + final_hiddens = list(zip(*final_hiddens)) + return out, torch.stack(final_hiddens[0], 0), torch.stack(final_hiddens[1], 0) + + +@register_decomposition(aten.lstm.data) +@aten.lstm.data.py_impl(DispatchKey.CompositeImplicitAutograd) +@aten.lstm.data.py_impl(DispatchKey.Autograd) +def lstm_data_impl( + data, + batch_sizes, + hx, + params, + has_biases, + num_layers, + dropout, + train, + bidirectional, +): + assert len(hx) == 2, "lstm expects two hidden states" + params = gather_params(params, has_biases, hx[0].size(2) != hx[1].size(2)) + hidden = list(zip(hx[0], hx[1])) + out, final_hiddens = _rnn_helper( + data, + hidden, + params, + has_biases, + num_layers, + dropout, + train, + bidirectional, + False, + partial(one_layer_lstm_data, batch_sizes=batch_sizes), + ) + final_hiddens = list(zip(*final_hiddens)) + return out, torch.stack(final_hiddens[0], 0), torch.stack(final_hiddens[1], 0) + + +def gru_cell(inp, cur_hidden, ih_weight, ih_bias, hh_weight, hh_bias): + chunked_igates = inp.chunk(3, 1) + chunked_hgates = F.linear(cur_hidden, hh_weight, hh_bias).chunk(3, 2) + reset_gate = (chunked_hgates[0] + chunked_igates[0]).sigmoid() + input_gate = (chunked_hgates[1] + chunked_igates[1]).sigmoid() + new_gate = (chunked_igates[2] + (chunked_hgates[2] * reset_gate)).tanh() + return (cur_hidden - new_gate) * input_gate + new_gate + + +def gru_cell_data(inp, cur_hidden, ih_weight, ih_bias, hh_weight, hh_bias): + chunked_igates = 
F.linear(inp, ih_weight, ih_bias).chunk(3, 1) + chunked_hgates = F.linear(cur_hidden, hh_weight, hh_bias).chunk(3, 1) + reset_gate = (chunked_hgates[0] + chunked_igates[0]).sigmoid() + input_gate = (chunked_hgates[1] + chunked_igates[1]).sigmoid() + new_gate = (chunked_igates[2] + (chunked_hgates[2] * reset_gate)).tanh() + return (cur_hidden - new_gate) * input_gate + new_gate + + +@register_decomposition(aten.gru.data) +@aten.gru.data.py_impl(DispatchKey.CompositeImplicitAutograd) +@aten.gru.data.py_impl(DispatchKey.Autograd) +def gru_impl_data( + data, + batch_sizes, + hx, + params, + has_biases, + num_layers, + dropout, + train, + bidirectional, +): + params = gather_params(params, has_biases, False) + out, final_hiddens = _rnn_helper( + data, + hx.unbind(0), + params, + has_biases, + num_layers, + dropout, + train, + bidirectional, + False, + partial(one_layer_rnn_data, batch_sizes=batch_sizes, hidden_fn=gru_cell_data), + ) + return out, torch.stack(final_hiddens, 0) + + +@register_decomposition(aten.gru.input) +@aten.gru.input.py_impl(DispatchKey.CompositeImplicitAutograd) +@aten.gru.input.py_impl(DispatchKey.Autograd) +def gru_impl( + input, + hx, + params, + has_biases, + num_layers, + dropout, + train, + bidirectional, + batch_first, +): + params = gather_params(params, has_biases, False) + out, final_hiddens = _rnn_helper( + input, + hx.unbind(0), + params, + has_biases, + num_layers, + dropout, + train, + bidirectional, + batch_first, + partial(one_layer_rnn, hidden_fn=gru_cell), + ) + return out, torch.stack(final_hiddens, 0) + + +@register_decomposition(aten._upsample_bilinear2d_aa.vec) +@aten._upsample_bilinear2d_aa.vec.py_impl(DispatchKey.CompositeImplicitAutograd) +@aten._upsample_bilinear2d_aa.vec.py_impl(DispatchKey.Autograd) +def upsample_bilinear2d_aa_vec(input, output_size, align_corners, scale_factors): + osize = upsample_compute_output_size(input.size(), output_size, scale_factors) + scale_h = get_scale_value(scale_factors, 0) + scale_w = get_scale_value(scale_factors, 1) + return torch.ops.aten._upsample_bilinear2d_aa( + input, osize, align_corners, scale_h, scale_w + ) + + +@register_decomposition(aten._upsample_bicubic2d_aa.vec) +@aten._upsample_bicubic2d_aa.vec.py_impl(DispatchKey.CompositeImplicitAutograd) +@aten._upsample_bicubic2d_aa.vec.py_impl(DispatchKey.Autograd) +def upsample_bicubic2d_aa_vec(input, output_size, align_corners, scale_factors): + osize = upsample_compute_output_size(input.size(), output_size, scale_factors) + scale_h = get_scale_value(scale_factors, 0) + scale_w = get_scale_value(scale_factors, 1) + return torch.ops.aten._upsample_bicubic2d_aa( + input, osize, align_corners, scale_h, scale_w + ) + + +@register_decomposition(aten.upsample_bilinear2d.vec) +@register_decomposition(aten.upsample_trilinear3d.vec) +@aten.upsample_linear1d.vec.py_impl(DispatchKey.CompositeImplicitAutograd) +@aten.upsample_linear1d.vec.py_impl(DispatchKey.Autograd) +@aten.upsample_bilinear2d.vec.py_impl(DispatchKey.CompositeImplicitAutograd) +@aten.upsample_bilinear2d.vec.py_impl(DispatchKey.Autograd) +@aten.upsample_trilinear3d.vec.py_impl(DispatchKey.CompositeImplicitAutograd) +@aten.upsample_trilinear3d.vec.py_impl(DispatchKey.Autograd) +def _upsample_linear_vec(input, output_size, align_corners, scale_factors): + osize = upsample_compute_output_size(input.size(), output_size, scale_factors) + scales = scale_factors if scale_factors else [None] * len(osize) + return _upsample_linear(input, osize, align_corners, scales) + + 
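+# --- Editor's note (illustrative sketch, not part of the upstream file) ---
+# The decompositions above are normally consumed through a decomposition
+# table during tracing rather than called directly. The commented sketch
+# below shows one way to exercise the bilinear-upsample path; it assumes
+# only the public helpers torch._decomp.get_decompositions and
+# torch.fx.experimental.proxy_tensor.make_fx, and the shapes/scale factor
+# are arbitrary example values.
+#
+#     import torch
+#     import torch.nn.functional as F
+#     from torch._decomp import get_decompositions
+#     from torch.fx.experimental.proxy_tensor import make_fx
+#
+#     aten = torch.ops.aten
+#     table = get_decompositions(
+#         [aten.upsample_bilinear2d.vec, aten.upsample_bilinear2d.default]
+#     )
+#
+#     def f(x):
+#         return F.interpolate(x, scale_factor=2.0, mode="bilinear")
+#
+#     # Tracing with the table should replace the upsample op with the
+#     # arange/_unsafe_index arithmetic emitted by the _upsample_linear_vec
+#     # decomposition above.
+#     gm = make_fx(f, decomposition_table=table)(torch.randn(1, 3, 8, 8))
+#     print(gm.graph)
+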
+@register_decomposition([aten.upsample_linear1d.default, aten.upsample_linear1d.out]) +@out_wrapper() +def upsample_linear1d( + input: Tensor, + output_size: List[int], + align_corners: bool, + scales_w: Optional[float] = None, +) -> Tensor: + return _upsample_linear(input, output_size, align_corners, [scales_w]) + + +@register_decomposition( + [aten.upsample_bilinear2d.default, aten.upsample_bilinear2d.out] +) +@aten.upsample_bilinear2d.default.py_impl(DispatchKey.Autograd) +@out_wrapper() +def upsample_bilinear2d( + input: Tensor, + output_size: List[int], + align_corners: bool, + scales_h: Optional[float] = None, + scales_w: Optional[float] = None, +) -> Tensor: + return _upsample_linear(input, output_size, align_corners, [scales_h, scales_w]) + + +@register_decomposition( + [aten.upsample_trilinear3d.default, aten.upsample_trilinear3d.out] +) +@out_wrapper() +def upsample_trilinear3d( + input: Tensor, + output_size: List[int], + align_corners: bool, + scales_d: Optional[float] = None, + scales_h: Optional[float] = None, + scales_w: Optional[float] = None, +) -> Tensor: + return _upsample_linear( + input, output_size, align_corners, [scales_d, scales_h, scales_w] + ) + + +def _compute_scale(in_size, out_size, align_corners, scale=None): + if align_corners: + return (in_size - 1.0) / (out_size - 1.0) if out_size > 1 else 0 + else: + return 1.0 / scale if scale is not None and scale > 0 else in_size / out_size + + +def _compute_source_index(scale, dst_index, align_corners): + if align_corners: + return scale * dst_index + else: + return scale * (dst_index + 0.5) - 0.5 + + +@pw_cast_for_opmath +def _upsample_linear( + input: Tensor, + output_size: List[int], + align_corners: bool, + scales: List[Optional[float]], +) -> Tensor: + # get dimensions of original image + n_batch, n_channels = input.shape[:2] + inp_sizes = input.shape[2:] + n_dims = len(inp_sizes) + + _, dtype = utils.elementwise_dtypes( + input, + type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + ) + + def get_values(inp_size, out_size, scales, nsqueeze): + # First Calculate scaling factor + scale_factor = _compute_scale(inp_size, out_size, align_corners, scales) + # We have to create arange with int64 dtype and use .to in order to avoid + # additional kernels creation in inductor and get a perf slowdown + i = torch.arange(out_size, device=input.device).to(dtype=dtype) + + x_f32 = _compute_source_index(scale_factor, i, align_corners).clamp(min=0.0) + x_f32 = x_f32.reshape(x_f32.shape[0], *[1] * (nsqueeze)) + x = x_f32.to(torch.int64) + xp1 = (x + 1).clamp(max=inp_size - 1) + return x_f32, x, xp1 + + values = [ + get_values(inp_size, out_size, scales, n_dims - 1 - i) + for i, (inp_size, out_size, scales) in enumerate( + zip(inp_sizes, output_size, scales) + ) + ] + xs_f32, xs, xp1s = list(zip(*values)) + + vs = [] + for a in product(*[[0, 1]] * n_dims): + idx = [None, None] + [xs[k] if a[k] == 0 else xp1s[k] for k in range(n_dims)] + v = aten._unsafe_index(input, idx) + v = _maybe_convert_to_dtype(v, dtype) + vs.append(v) + + for i in reversed(range(n_dims)): + xscale = (xs_f32[i] - xs[i]).clamp(0.0, 1.0).to(dtype) + vs = [ + # x1 * (1 - alpha) + x2 * alpha == x1 + (x2 - x1) * alpha + v1 + torch.mul(v2 - v1, xscale) + for v1, v2 in zip(vs[::2], vs[1::2]) + ] + + assert len(vs) == 1 + result = vs[0] + + # convert output to correct memory format, if necessary + memory_format = utils.suggest_memory_format(input) + + # following "heuristic: only use channels_last path when it's faster than the contiguous 
path" + if input.device.type == "cuda" and n_channels < 16: + memory_format = torch.contiguous_format + + assert isinstance(result, torch.Tensor) + + result = result.contiguous(memory_format=memory_format) + + if not input.is_floating_point(): + result = result.round() + + return result + + +# We should be applying decompositions after all transformations +@register_decomposition(aten.is_same_size.default) +def is_same_size(a: Tensor, b: Tensor) -> bool: + return a.shape == b.shape + + +@register_decomposition([aten._reshape_alias, aten._unsafe_view]) +@out_wrapper() +def _reshape_alias(x, shape, *args): + return aten.view(x, shape) + + +@register_decomposition([aten._unsafe_index]) +def _index(x, indices): + return aten.index(x, indices) + + +def _nll_loss_forward( + self: Tensor, + target: Tensor, + weight: Optional[Tensor], + reduction: int, + ignore_index: int, +) -> Tuple[Tensor, Tensor]: + # self can be [N, C] or [C] + # target can be [N] or [] + + n_dims = self.dim() + channel_dim = 1 + if n_dims < 2: + channel_dim = 0 + + if weight is not None: + if n_dims > 1: + shape = [ + 1, + ] * n_dims + shape[channel_dim] = weight.shape[0] + w = weight.view(shape) + else: + w = weight + self = self * w + safe_target = torch.where(target != ignore_index, target, 0) + safe_target_ = safe_target.unsqueeze(channel_dim) + # target can be [N, 1] or [1] + + result = -torch.gather(self, channel_dim, safe_target_).squeeze(channel_dim) + + result = torch.where(target != ignore_index, result, 0) + + if reduction == Reduction.NONE.value and n_dims > 1: + total_weight = self.new_full((), 0.0) + return result, total_weight + + if weight is not None: + w = w.expand(self.shape) + wsum = torch.gather(w, channel_dim, safe_target_).squeeze(channel_dim) + wsum = torch.where(target != ignore_index, wsum, 0) + total_weight = wsum.sum() + else: + total_weight = (target != ignore_index).sum().to(self) + + if reduction == Reduction.SUM.value: + result = result.sum() + elif reduction == Reduction.MEAN.value: + result = result.sum() / total_weight + + return result, total_weight + + +@register_decomposition(aten.nll_loss_forward) +@out_wrapper("output", "total_weight") +def nll_loss_forward( + self: Tensor, + target: Tensor, + weight: Optional[Tensor], + reduction: int, + ignore_index: int, +) -> Tuple[Tensor, Tensor]: + assert self.dim() > 0 and self.dim() <= 2, "input tensor should be 1D or 2D" + assert ( + target.dim() <= 1 + ), "0D or 1D target tensor expected, multi-target not supported" + + no_batch_dim = self.dim() == 1 and target.dim() == 0 + assert no_batch_dim or ( + self.shape[0] == target.shape[0] + ), f"size mismatch (got input: {self.shape}, target: {target.shape})" + + n_classes = self.shape[-1] + + assert weight is None or ( + weight.dim() == 1 and weight.numel() == n_classes + ), f"weight tensor should be defined either for all {n_classes} classes or no classes but got weight tensor of shape: {weight.shape}" # noqa: B950 + + return _nll_loss_forward(self, target, weight, reduction, ignore_index) + + +@register_decomposition(aten.nll_loss2d_forward) +@out_wrapper("output", "total_weight") +def nll_loss2d_forward( + self: Tensor, + target: Tensor, + weight: Optional[Tensor], + reduction: int, + ignore_index: int, +) -> Tuple[Tensor, Tensor]: + return _nll_loss_forward(self, target, weight, reduction, ignore_index) + + +# These are adapted from aten/src/ATen/native/UpSample.h, wich is based on +# https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm +def 
_upsample_cubic_convolution1(x: Tensor, A: float) -> Tensor: + return ((A + 2) * x - (A + 3)) * x * x + 1 + + +def _upsample_cubic_convolution2(x: Tensor, A: float) -> Tensor: + return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A + + +def _upsample_get_cubic_coefficients(t: Tensor) -> TensorSequenceType: + A = -0.75 + return ( + _upsample_cubic_convolution2(t + 1.0, A), + _upsample_cubic_convolution1(t, A), + _upsample_cubic_convolution1(1.0 - t, A), + _upsample_cubic_convolution2(2.0 - t, A), + ) + + +def _upsample_cubic_interp1d(coeffs: TensorSequenceType, ts: Tensor) -> Tensor: + coeffs2 = _upsample_get_cubic_coefficients(ts) + return _sum_tensors(c1 * c2 for (c1, c2) in zip(coeffs, coeffs2)) + + +# Need this instead of just sum() to keep mypy happy +def _sum_tensors(ts: Iterable[Tensor]) -> Tensor: + return reduce(torch.add, ts) + + +def _linspace_from_neg_one( + num_steps: int, align_corners: bool, dtype: torch.dtype, device: torch.device +): + if num_steps <= 1: + return torch.tensor(0, device=device, dtype=dtype) + + a = ((num_steps - 1) / num_steps) if not align_corners else 1 + return torch.linspace(-a, a, steps=num_steps, device=device, dtype=dtype) + + +def _make_base_grid_4d(theta: Tensor, h: int, w: int, align_corners: bool): + dtype = theta.dtype + device = theta.device + + # Using padding and summation generates a single kernel vs using torch.stack where 3 kernels generated + # corresponding to each individual tensor: grid_x, grid_y, grid_one + grid_x = _linspace_from_neg_one(w, align_corners, dtype, device).view(1, w, 1) + grid_y = _linspace_from_neg_one(h, align_corners, dtype, device).view(h, 1, 1) + grid_one = torch.ones((1, 1, 1), dtype=dtype, device=device) + + # this is just a temporary hack and we should use torch.stack here once #104480 is merged + grid_x = torch.nn.functional.pad(grid_x, pad=(0, 2), mode="constant", value=0) + grid_y = torch.nn.functional.pad(grid_y, pad=(1, 1), mode="constant", value=0) + grid_one = torch.nn.functional.pad(grid_one, pad=(2, 0), mode="constant", value=0) + return grid_x + grid_y + grid_one + + +def _make_base_grid_5d(theta: Tensor, d: int, h: int, w: int, align_corners: bool): + dtype = theta.dtype + device = theta.device + + grid_x = _linspace_from_neg_one(w, align_corners, dtype, device).view(1, 1, w, 1) + grid_y = _linspace_from_neg_one(h, align_corners, dtype, device).view(1, h, 1, 1) + grid_z = _linspace_from_neg_one(d, align_corners, dtype, device).view(d, 1, 1, 1) + grid_one = torch.ones((1, 1, 1, 1), dtype=dtype, device=device) + + # this is just a temporary hack and we should use torch.stack here once #104480 is merged + grid_x = torch.nn.functional.pad(grid_x, pad=(0, 3), mode="constant", value=0) + grid_y = torch.nn.functional.pad(grid_y, pad=(1, 2), mode="constant", value=0) + grid_z = torch.nn.functional.pad(grid_z, pad=(2, 1), mode="constant", value=0) + grid_one = torch.nn.functional.pad(grid_one, pad=(3, 0), mode="constant", value=0) + return grid_x + grid_y + grid_z + grid_one + + +def _affine_grid_generator_4d(theta: Tensor, size: List[int], align_corners: bool): + n, _, h, w = size + base_grid = _make_base_grid_4d(theta, h, w, align_corners=align_corners) + # base_grid shape is (h, w, 3) and theta shape is (n, 2, 3) + # We do manually a matrix multiplication which is faster than mm() + # (h * w, 3, 1) * (n, 1, 3, 2) -> (n, h * w, 2) + grid = (base_grid.view(-1, 3, 1) * theta.mT.unsqueeze(1)).sum(-2) + return grid.view(n, h, w, 2) + + +def _affine_grid_generator_5d(theta: Tensor, size: List[int], align_corners: 
bool): + n, _, d, h, w = size + base_grid = _make_base_grid_5d(theta, d, h, w, align_corners=align_corners) + # base_grid shape is (d, h, w, 4) and theta shape is (n, 3, 4) + # We do manually a matrix multiplication which is faster than mm() + # (d * h * w, 4, 1) * (n, 1, 4, 3) -> (n, h * w, 3) + grid = (base_grid.view(-1, 4, 1) * theta.mT.unsqueeze(1)).sum(-2) + return grid.view(n, d, h, w, 3) + + +@register_decomposition(aten.affine_grid_generator) +@out_wrapper() +@pw_cast_for_opmath +def affine_grid_generator(theta: Tensor, size: List[int], align_corners: bool): + torch._check( + len(size) in (4, 5), + lambda: "affine_grid_generator needs 4d (spatial) or 5d (volumetric) inputs.", + ) + if len(size) == 4: + return _affine_grid_generator_4d(theta, size, align_corners=align_corners) + else: + return _affine_grid_generator_5d(theta, size, align_corners=align_corners) + + +def _grid_sampler_2d( + a: Tensor, + grid: Tensor, + interpolation_mode: int = 0, + padding_mode: int = 0, + align_corners: bool = False, + _expand_grid: bool = True, +) -> Tensor: + # This method is a copy of grid_sampler_2d implementation and introduced with additional arg _expand_grid to + # optionally expand the input grid for performance reasons. + # Experimenting locally it was found that compiled CUDA code is accelerated by ~5x + # and CPU code by ~2x on bicubic mode, if we expand the grid from (N, H, W, 2) into (N, C, H, W, 2) + # However, this leads to a slowdown around ~0.8x on CPU bilinear mode, channels first. + # Thus we apply this hack to not expand the grid for this case. + + torch._check( + interpolation_mode in (0, 1, 2), + lambda: f"Invalid interpolation mode {interpolation_mode}", + ) + torch._check( + padding_mode in (0, 1, 2), lambda: f"Invalid padding mode {padding_mode}" + ) + + def unnormalize(coords: Tensor, size: int) -> Tensor: + # Rescale coordinates from [-1, 1] to: + # [0, size - 1] if align_corners is True + # [-.5, size -.5] if align_corners is False + mul = (size * 0.5 - 0.5) if align_corners else (size * 0.5) + ofs = size * 0.5 - 0.5 + return coords * mul + ofs + + # Reflects coordinates until they fall between low and high (inclusive). + # The bounds are passed as twice their value so that half-integer values + # can be represented as ints. 
+ def reflect_coordinates(coords: Tensor, twice_low: int, twice_high: int) -> Tensor: + if twice_low == twice_high: + return torch.zeros_like(coords) + coords_min = twice_low / 2 + coords_span = (twice_high - twice_low) / 2 + coords2 = (coords - coords_min).abs() + extra = torch.fmod(coords2, coords_span) + flips = (coords2 / coords_span).floor().to(dtype=torch.int8) + return torch.where( + flips & 1 == 0, extra + coords_min, coords_span + coords_min - extra + ) + + def compute_coordinates(coords: Tensor, size: int) -> Tensor: + if padding_mode == 0: # Zero + return coords + elif padding_mode == 1: # Borders + return torch.clamp(coords, 0, size - 1) + else: # padding_mode == 2, Reflection + if align_corners: + coords_reflected = reflect_coordinates(coords, 0, 2 * (size - 1)) + else: + coords_reflected = reflect_coordinates(coords, -1, 2 * size - 1) + return torch.clamp(coords_reflected, 0, size - 1) + + def compute_source_index(coords: Tensor, size: int) -> Tensor: + coords_un = unnormalize(coords, size) + return compute_coordinates(coords_un, size) + + N, C, iH, iW = a.shape + _, oH, oW, two = grid.shape + assert two == 2 + + if _expand_grid: + # Let's expand grid to [N, C, oH, oW, 2] + # This allows to generate a single triton cuda kernel instead of two kernels. + # Two kernels are due source indices, weights have shape (N, 1, oH, oW), xnumel=N*oH*oW + # and output has shape (N, C, oH, oW), xnumel=N*C*oH*oW + # Expanding grid to (N, C, oH, oW, two) unifies xnumel to N*C*oH*oW + grid = grid.view(N, 1, oH, oW, two).expand(N, C, oH, oW, 2) + + def in_bounds_cond(xs: Tensor, ys: Tensor) -> Tensor: + return torch.logical_and( + 0 <= xs, torch.logical_and(xs < iW, torch.logical_and(0 <= ys, ys < iH)) + ) + + N_idx = torch.arange(N, device=a.device).view(N, 1, 1, 1) + C_idx = torch.arange(C, device=a.device).view(1, C, 1, 1) + + def clip(xs: Tensor, ys: Tensor, ws: Tensor) -> TensorSequenceType: + cond = in_bounds_cond(xs, ys) + # To clip to inside valid coordinates, we map the coordinates + # to (x, y) = (0, 0) and also set the weight to 0 + # We also change the shape of the tensor to the appropriate one for + # broadcasting with N_idx, C_idx for the purposes of advanced indexing + c = C if _expand_grid else 1 + return tuple( + torch.where(cond, t, 0).view(N, c, oH, oW) + for t in (xs.to(dtype=torch.int64), ys.to(dtype=torch.int64), ws) + ) + + def get_summand(ix: Tensor, iy: Tensor, w) -> Tensor: + # Perform clipping, index into input tensor and multiply by weight + idx_x, idx_y, w_ = clip(ix, iy, w) + return a[N_idx, C_idx, idx_y, idx_x] * w_ + + x = grid[..., 0] + y = grid[..., 1] + + if interpolation_mode == 0: # Bilinear + ix = compute_source_index(x, iW) + iy = compute_source_index(y, iH) + + ix_nw, iy_nw = ix.floor(), iy.floor() + ix_ne, iy_ne = ix_nw + 1, iy_nw + ix_sw, iy_sw = ix_nw, iy_nw + 1 + ix_se, iy_se = ix_ne, iy_sw + + w_nw = (ix_se - ix) * (iy_se - iy) + w_ne = (ix - ix_sw) * (iy_sw - iy) + w_sw = (ix_ne - ix) * (iy - iy_ne) + w_se = (ix - ix_nw) * (iy - iy_nw) + + return _sum_tensors( + get_summand(ix, iy, w) + for (ix, iy, w) in ( + (ix_nw, iy_nw, w_nw), + (ix_ne, iy_ne, w_ne), + (ix_sw, iy_sw, w_sw), + (ix_se, iy_se, w_se), + ) + ) + elif interpolation_mode == 1: # Nearest + ix = compute_source_index(x, iW) + iy = compute_source_index(y, iH) + + ix_nearest = ix.round() + iy_nearest = iy.round() + + return get_summand(ix_nearest, iy_nearest, 1) + else: # interpolation_mode == 2, Bicubic + ix = unnormalize(x, iW) + iy = unnormalize(y, iH) + + ix_nw = ix.floor() + iy_nw = 
iy.floor() + + tx = ix - ix_nw + ty = iy - iy_nw + + if not _expand_grid: + tx = tx.unsqueeze(1) + ty = ty.unsqueeze(1) + + def get_value_bounded(ix: Tensor, iy: Tensor) -> Tensor: + x = compute_coordinates(ix, iW) + y = compute_coordinates(iy, iH) + return get_summand(x, y, 1) + + def get_coeff(ofs: int) -> Tensor: + iy_ofs = iy_nw + (ofs - 1) + cs = ( + get_value_bounded(ix_nw - 1, iy_ofs), + get_value_bounded(ix_nw, iy_ofs), + get_value_bounded(ix_nw + 1, iy_ofs), + get_value_bounded(ix_nw + 2, iy_ofs), + ) + return _upsample_cubic_interp1d(cs, tx) + + coeffs = tuple(get_coeff(ofs) for ofs in range(4)) + return _upsample_cubic_interp1d(coeffs, ty) + + +@register_decomposition(aten.grid_sampler_2d) +@out_wrapper() +@pw_cast_for_opmath +def grid_sampler_2d( + a: Tensor, + grid: Tensor, + interpolation_mode: int = 0, + padding_mode: int = 0, + align_corners: bool = False, +) -> Tensor: + return _grid_sampler_2d( + a, + grid=grid, + interpolation_mode=interpolation_mode, + padding_mode=padding_mode, + align_corners=align_corners, + ) + + +@register_decomposition(aten.mv) +@out_wrapper() +@pw_cast_for_opmath +def mv(self, vec): + torch._check( + self.dim() == 2 and vec.dim() == 1, + lambda: f"matrix @ vector expected, got {self.dim()}, {vec.dim()}", + ) + torch._check( + self.size(1) == vec.size(0), + lambda: f"size mismatch, got input ({self.size(0)}x{self.size(1)}), vec ({vec.size(0)})", + ) + return (self * vec).sum(dim=1) + + +@register_decomposition(aten.binary_cross_entropy_with_logits) +@out_wrapper() +def binary_cross_entropy_with_logits( + self, target, weight=None, pos_weight=None, reduction=Reduction.MEAN.value +): + if pos_weight is not None: + log_weight = (pos_weight - 1) * target + 1 + loss = (1 - target) * self - (log_weight * F.logsigmoid(self)) + else: + loss = (1 - target) * self - F.logsigmoid(self) + + if weight is not None: + loss = loss * weight + + return apply_loss_reduction(loss, reduction) + + +def should_fold(tensor1: torch.Tensor, tensor2: torch.Tensor, is_out: bool) -> bool: + # For comments of the logic of this function see eager in /native/LinearAlgebra.cpp + + t1, t2 = (tensor1, tensor2) if tensor1.ndim >= tensor2.ndim else (tensor2, tensor1) + + from torch.fx.experimental.symbolic_shapes import guard_size_oblivious + + if not (t1.ndim >= 3 and t2.ndim <= 2): + return False + if t2.requires_grad and not is_out: + return True + if tensor1.ndim == 2: + return False + if guard_size_oblivious(t1.numel() == 0): + return True + + t1_shape = t1.shape + t1_stride = t1.stride() + return all( + st1 == st2 * s2 + for (st1, st2, s2) in zip(t1_stride[:-2], t1_stride[1:-1], t1_shape[1:-1]) + ) + + +@aten.matmul.default.py_impl(DispatchKey.CompositeImplicitAutograd) +@out_wrapper(pass_is_out=True) +def matmul(tensor1, tensor2, *, is_out=False): + dim_tensor1 = tensor1.dim() + dim_tensor2 = tensor2.dim() + assert dim_tensor1 != 0 and dim_tensor2 != 0 + if dim_tensor1 == 1 and dim_tensor2 == 1: + return torch.dot(tensor1, tensor2) + elif dim_tensor1 == 2 and dim_tensor2 == 1: + return torch.mv(tensor1, tensor2) + elif dim_tensor1 == 1 and dim_tensor2 == 2: + return torch.squeeze(torch.mm(torch.unsqueeze(tensor1, 0), tensor2), 0) + elif dim_tensor1 == 2 and dim_tensor2 == 2: + return torch.mm(tensor1, tensor2) + elif should_fold(tensor1, tensor2, is_out): + # dim_tensor1 >=3 && (dim_tensor2 == 1 || dim_tensor2 == 2) || + # dim_tensor2 >=3 && (dim_tensor1 == 1 || dim_tensor1 == 2) + # and some condition on the strides is fulfilled + + # optimization: use mm instead of bmm by 
folding the batch of the larger tensor + # into its leading matrix dimension + transpose = dim_tensor2 > dim_tensor1 + t1 = tensor2.mT if transpose else tensor1 + t2 = ( + tensor2 if not transpose else (tensor1.t() if dim_tensor1 == 2 else tensor1) + ) + # Invariant: t1.dim() >= 3 && (t2.dim() == 1 || t2.dim() == 2) + # and t1 and t2 are matmul-compatible + + # Why not t1.view(-1, sizes_1[-1])? + # If the last dim is 0, then view(-1, 0) won't work because the -1 becomes ambiguous. + # This can happen in e.g. [3, 5, 0] @ [0, 0]. + sizes_1 = t1.shape + output_shape = list(sizes_1[:-1]) + folded_dim1 = reduce(operator.mul, output_shape) + + # Readjust output_shape if we are multiplying by a matrix + t2_is_matrix = t2.dim() == 2 + if t2_is_matrix: + output_shape.append(t2.shape[1]) + + # This will almost always be a view. + # It may not be a view if t2->requires_grad(). See should_fold in aten/ for an explanation + t1_folded = t1.reshape(folded_dim1, sizes_1[-1]) + if t2_is_matrix: + # This copies if we perform a 2D @ 3D and the first tensor requires_grad + # See should_fold native/LinearAlgebra.cpp for why. + output = t1_folded.mm(t2).view(output_shape) + return output.mT.contiguous() if transpose else output + else: + return t1_folded.mv(t2).view(output_shape) + + elif dim_tensor1 >= 1 and dim_tensor2 >= 1: + # We are multiplying b1 x n x m1 by x2 x m2 x p (where b1 can be a list); + # we track m1 vs m2 separately even though they must match for nicer error messages + n = tensor1.size(-2) if dim_tensor1 > 1 else 1 + m1 = tensor1.size(-1) + batch_tensor1 = tensor1.shape[:-2] + m2 = tensor2.size(-2) if dim_tensor2 > 1 else tensor2.size(-1) + p = tensor2.size(-1) if dim_tensor2 > 1 else 1 + + batch_tensor2: List[int] = [] + # TODO: handling of slice + for i in range(dim_tensor2 - 2): + batch_tensor2.append(tensor2.size(i)) + + # Same optimization for the gradients as that in should_fold + # If we're going to broadcast, we force it to go through the should_fold branch + if ( + dim_tensor1 == 3 + and dim_tensor2 == 3 + and batch_tensor1[0] != batch_tensor2[0] + ): + if batch_tensor1[0] == 1 and tensor1.requires_grad: + return matmul(tensor1.squeeze(0), tensor2) + if batch_tensor2[0] == 1 and tensor2.requires_grad: + return matmul(tensor1, tensor2.squeeze(0)) + + # expand the batch portion (i.e. 
cut off matrix dimensions and expand rest) + expand_batch_portion = list( + torch.broadcast_shapes(batch_tensor1, batch_tensor2) + ) + + tensor1_expand_size = expand_batch_portion + [n, m1] + + expand_batch_product = prod(expand_batch_portion) + + # HACK: We need reshape with symint support + tensor1_expanded = tensor1.expand(tensor1_expand_size).reshape( + expand_batch_product, n, m1 + ) + + vector_rhs = dim_tensor2 == 1 + if vector_rhs: + tensor2_expand_size = expand_batch_portion + [m2] + tensor2_expanded = ( + tensor2.expand(tensor2_expand_size) + .reshape(expand_batch_product, m2) + .unsqueeze(2) + ) + else: + tensor2_expand_size = expand_batch_portion + [m2, p] + tensor2_expanded = tensor2.expand(tensor2_expand_size).reshape( + expand_batch_product, m2, p + ) + + output_shape = expand_batch_portion + if dim_tensor1 > 1: + output_shape.append(n) + + if dim_tensor2 > 1: + output_shape.append(p) + + if vector_rhs: + return tensor1_expanded.bmm(tensor2_expanded).squeeze(-1).view(output_shape) + else: + return tensor1_expanded.bmm(tensor2_expanded).view(output_shape) + else: + torch._check(False, lambda: "both arguments to matmul need to be at least 1D") + + +@register_decomposition(aten.upsample_bicubic2d.default) +@pw_cast_for_opmath +def upsample_bicubic2d_default( + a: Tensor, + output_size: Tuple[int, int], + align_corners: bool, + scale_h: Optional[float] = None, + scale_w: Optional[float] = None, +) -> Tensor: + N, C, iH, iW = a.shape + oH, oW = output_size + + def compute_scale(in_size, out_size, align_corners, scale=None): + if align_corners: + return (in_size - 1) / (out_size - 1) if out_size > 1 else 0 + else: + return 1 / scale if scale is not None and scale > 0 else in_size / out_size + + def compute_source_index(scale, dst_index, align_corners): + if align_corners: + return scale * dst_index + else: + return scale * (dst_index + 0.5) - 0.5 + + height_scale = compute_scale(iH, oH, align_corners, scale_h) + width_scale = compute_scale(iW, oW, align_corners, scale_w) + + N_idx = torch.arange(N, device=a.device).view(N, 1, 1, 1) + C_idx = torch.arange(C, device=a.device).view(1, C, 1, 1) + out_y = torch.arange(oH, device=a.device).view((1, 1, oH, 1)) + out_x = torch.arange(oW, device=a.device).view((1, 1, 1, oW)) + + real_x = compute_source_index(width_scale, out_x, align_corners) + in_x = real_x.floor() + t_x = real_x - in_x + ix = in_x.to(dtype=torch.int64) + + real_y = compute_source_index(height_scale, out_y, align_corners) + in_y = real_y.floor() + t_y = real_y - in_y + iy = in_y.to(dtype=torch.int64) + + iys_ofs = (iy - 1, iy, iy + 1, iy + 2) + ixs_ofs = (ix - 1, ix, ix + 1, ix + 2) + + def load_bounded(ys, xs): + y_idx = torch.clamp(ys, 0, iH - 1) + x_idx = torch.clamp(xs, 0, iW - 1) + return aten._unsafe_index(a, [N_idx, C_idx, y_idx, x_idx]) + + def get_x_interp(y): + coeffs_x = tuple(load_bounded(y, x_ofs) for x_ofs in ixs_ofs) + return _upsample_cubic_interp1d(coeffs_x, t_x) + + coeffs_y = tuple(get_x_interp(y_ofs) for y_ofs in iys_ofs) + result = _upsample_cubic_interp1d(coeffs_y, t_y) + + # convert output to correct memory format, if necessary + memory_format = utils.suggest_memory_format(a) + result = result.contiguous(memory_format=memory_format) + return result + + +@register_decomposition(aten.upsample_bicubic2d.vec) +@aten.upsample_bicubic2d.vec.py_impl(DispatchKey.CompositeImplicitAutograd) +@aten.upsample_bicubic2d.vec.py_impl(DispatchKey.Autograd) +@out_wrapper() +@pw_cast_for_opmath +def upsample_bicubic2d_vec( + a: Tensor, + output_size: 
Optional[Tuple[int, int]], + align_corners: bool, + scale_factors: Optional[Tuple[float, float]] = None, +) -> Tensor: + torch._check( + bool(output_size) + bool(scale_factors) == 1, + lambda: "Must specify exactly one of output_size and scale_factors.", + ) + if output_size is None: + assert scale_factors is not None + output_size = cast( + Tuple[int, int], + tuple( + sym_int(sym_float(w) * scale) + for w, scale in zip(a.shape[2:], scale_factors) + ), + ) + scale_h, scale_w = scale_factors if scale_factors else (None, None) + return upsample_bicubic2d_default(a, output_size, align_corners, scale_h, scale_w) + + +@register_decomposition(aten.reflection_pad1d) +@register_decomposition(aten.reflection_pad2d) +@register_decomposition(aten.reflection_pad3d) +@pw_cast_for_opmath +@out_wrapper() +def _reflection_pad(a: Tensor, padding: Tuple[int, ...]) -> Tensor: + def idx(left, middle, right): + dim_idx = torch.arange(-left, middle + right, device=a.device) + return middle - 1 - (middle - 1 - dim_idx.abs()).abs() + + return _reflection_or_replication_pad( + a, + padding, + idx, + ) + + +@register_decomposition(aten.replication_pad1d) +@register_decomposition(aten.replication_pad2d) +@register_decomposition(aten.replication_pad3d) +@pw_cast_for_opmath +@out_wrapper() +def _replication_pad(a: Tensor, padding: Tuple[int, ...]) -> Tensor: + def idx(left, middle, right): + dim_idx = torch.arange(-left, middle + right, device=a.device) + return torch.clamp(dim_idx, 0, middle - 1) + + return _reflection_or_replication_pad( + a, + padding, + idx, + ) + + +def _reflection_or_replication_pad( + a: Tensor, + padding: Tuple[int, ...], + idx_fn: Callable[[int, int, int], Tensor], +) -> Tensor: + dim = len(padding) // 2 + torch._check( + a.dim() in (dim + 1, dim + 2), + lambda: f"reflection_pad{dim}d requires {dim + 1}D or {dim + 2}D input", + ) + inp_shape = a.shape[-dim:] + nc_dim = a.dim() - dim + + padding_left = [padding[2 * (dim - 1 - i)] for i in range(dim)] + padding_right = [padding[2 * (dim - 1 - i) + 1] for i in range(dim)] + + result = a + for i in range(dim): + idx: List[Any] = [None] * result.dim() + idx[i + nc_dim] = idx_fn(padding_left[i], inp_shape[i], padding_right[i]) + result = aten._unsafe_index(result, idx) + + # convert output to correct memory format, if necessary + memory_format = utils.suggest_memory_format(result) + result = result.contiguous(memory_format=memory_format) + return result + + +@register_decomposition(aten.aminmax) +@out_wrapper("min", "max") +def aminmax(self, *, dim=None, keepdim=False): + amin = torch.amin(self, dim=dim, keepdim=keepdim) + amax = torch.amax(self, dim=dim, keepdim=keepdim) + return amin, amax + + +@register_decomposition(aten.nansum) +@out_wrapper() +def nansum(self, dim=None, keepdim=False, *, dtype=None): + return aten.sum(torch.where(torch.isnan(self), 0, self), dim, keepdim, dtype=dtype) + + +@register_decomposition([aten.arange.default, aten.arange.out]) +@out_wrapper() +def arange_default( + end: NumberType, + *, + dtype: Optional[torch.dtype] = None, + layout: torch.layout = torch.strided, + device: Optional[torch.device] = None, + pin_memory: bool = False, +): + return aten.arange.start_step( + 0, end, 1, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory + ) + + +@register_decomposition([aten.arange.start]) +def arange_start( + start: NumberType, + end: NumberType, + *, + dtype: Optional[torch.dtype] = None, + layout: torch.layout = torch.strided, + device: Optional[torch.device] = None, + pin_memory: bool = False, +): + 
return aten.arange.start_step( + start, end, 1, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory + ) + + +@register_decomposition(out_dtype) +def out_dtype_decomp(*args, **kwargs): + from torch._higher_order_ops.out_dtype import out_dtype_dense + + return out_dtype_dense(*args, **kwargs) + + +@register_decomposition(aten.multi_margin_loss) +@aten.multi_margin_loss.default.py_impl(DispatchKey.Autograd) +@out_wrapper() +def multi_margin_loss( + input: Tensor, + target: Tensor, + p: NumberType = 1, + margin: NumberType = 1, + weight: Optional[Tensor] = None, + reduction: int = Reduction.MEAN.value, +) -> Tensor: + input = torch.atleast_2d(input) + target = torch.atleast_1d(target) + nframe = input.shape[0] + dim = input.shape[1] + torch._check(p == 1 or p == 2, lambda: "only p == 1 and p == 2 supported") + torch._check( + input.ndim == 2 and dim != 0, + lambda: f"Expected non-empty vector or matrix with optional 0-dim batch size, but got: {input.shape}", + ) + torch._check( + target.ndim == 1 and target.numel() == nframe, + lambda: f"inconsistent target size, expected {nframe} but got {target.shape}", + ) + if weight is not None: + weight = torch.atleast_1d(weight) + torch._check( + weight.ndim == 1 and weight.numel() == dim, # type: ignore[union-attr] + lambda: f"inconsistent weight size, expected {dim} but got {weight.shape}", # type: ignore[union-attr] + ) + target = target.unsqueeze(1) + u = torch.gather(input, dim=1, index=target) + z = margin - u + input + z = z.clamp_min(0) + z = z if p == 1 else z * z + if weight is not None: + z = z * weight[target] + idx = torch.arange(dim, device=input.device) + z = torch.where(idx != target, z, 0) + if reduction == Reduction.MEAN.value: + return z.mean() + elif reduction == Reduction.SUM.value: + return z.sum() / z.shape[1] + else: + return z.mean(dim=1) + + +@register_decomposition(aten.multilabel_margin_loss_forward) +@aten.multilabel_margin_loss_forward.default.py_impl(DispatchKey.Autograd) +@out_wrapper("output", "is_target") +def multilabel_margin_loss_forward( + input: Tensor, + target: Tensor, + reduction: int, +) -> Tuple[Tensor, Tensor]: + orig_input_shape = input.shape + orig_target_shape = target.shape + input = torch.atleast_2d(input) + target = torch.atleast_2d(target) + dim = input.shape[1] + torch._check( + len(orig_input_shape) <= 2 and dim != 0, + lambda: f"Expected non-empty vector or matrix with optional 0-dim batch size, but got: {orig_input_shape}", + ) + torch._check( + len(orig_target_shape) <= 2 and orig_target_shape == orig_input_shape, + lambda: f"inconsistent target size: {orig_target_shape} for input of size: {orig_input_shape}", + ) + # ignores labels after the first -1, detects when -1 is not present + idx = torch.arange(dim, device=target.device) + is_end = target == -1 + end_idx = torch.amin(torch.where(is_end, idx, dim), dim=-1, keepdim=True) + # target indices + target_mask = idx < end_idx + # masks target to be able to use gather, which doesn't allow -1 + tidx0 = torch.where(target_mask, target, 0) + u = torch.gather(input, dim=-1, index=tidx0) + # is_target + tidx1 = torch.where(target_mask, target, -1) + is_target = torch.any(idx == tidx1.unsqueeze(dim=-1), dim=1) + # loss + z = 1.0 - u.T.unsqueeze(dim=-1) + input + z = z.clamp_min(0) + z = z / dim + # masks loss + z = torch.where(is_target, 0, z) + # reduction + if reduction == Reduction.MEAN.value: + z = z.sum(dim=(0, -1)).mean() + elif reduction == Reduction.SUM.value: + z = z.sum() + else: + z = z.sum(dim=(0, -1)) + # result + is_target = 
is_target.to(input.dtype).reshape(orig_target_shape) + return z, is_target + + +# scaled_dot_product_attention used to be decomposed in pre-autograd, given that +# it calls _scaled_dot_product_attention_math and +# _scaled_dot_product_attention_math only has a CompositeImplicitAutograd +# kernel. As a result it's decomposed into ops with finer granularity. +# However recent PRs (#103826 #105131 #115913) added new logic in +# scaled_dot_product_attention and now it calls +# _scaled_dot_product_flash_attention_for_cpu in export path. This results +# in _scaled_dot_product_flash_attention_for_cpu showing up in export result. +# This decomposition ensures scaled_dot_product_attention is still decomposed +# the same way as before, i.e., going through +# _scaled_dot_product_attention_math. Notice that this decomp rule should be +# excluded by inductor. +@register_decomposition(aten._scaled_dot_product_flash_attention_for_cpu.default) +def scaled_dot_product_flash_attention_for_cpu( + query: Tensor, + key: Tensor, + value: Tensor, + dropout_p: float = 0.0, + is_causal: bool = False, + *, + attn_mask: Optional[Tensor] = None, + scale: Optional[float] = None, +) -> Tuple[Tensor, Tensor]: + dtype = query.dtype + torch._check( + torch.is_floating_point(query), + lambda: f"query must be FP32, FP64, BF16, FP16 but got {query.dtype}", + ) + torch._check( + query.dim() == 4 and key.dim() == 4 and value.dim() == 4, + lambda: f"q, k, v must be a 4 dimensional tensor, got {query.dim()}, {key.dim()}, {value.dim()}", + ) + torch._check( + dropout_p == 0.0, lambda: f"dropout probability must be zero, got {dropout_p}" + ) + torch._check( + query.shape[3] == value.shape[3] and key.shape[3] == value.shape[3], + lambda: "q, k, v should have the same head size", + ) + + output, attn = aten._scaled_dot_product_attention_math.default( + query, + key, + value, + attn_mask=attn_mask, + dropout_p=dropout_p, + is_causal=is_causal, + dropout_mask=None, + scale=scale, + ) + # Why this change? + # In pre-dispatch export scaled_dot_product_attention is executed via + # * flash_attention. + # flash_attention allocates output tensor as (N, L, H, E) + # it then transposes that to get (N, H, L, E) which is supposed to be the return + # tensor dim for scaled_dot_product_attention + # assume x: [N, H, L, E] is the output sdpa + # In MHA code, this output is then permuted via (2, 0, 1, 3) to get + # (L, N, H, E) dim tensor + # x = x.permute(2, 0, 1, 3).contiguous() and the viewed via + # x = x.view(L * N, H * E) + # During pre autograd dispatch call to contiguous is not traced because + # flash_attention output after the x.permute is already contiguous + # on which the view is valid + # However, during 2nd stage export, post-dispatch, we run _match variant + # instead of flash* to get the decomposition. _match variant returns + # x: [N, H, L, E] applying x.permute(2, 0, 1, 3) returns + # x: [L, N, H, E] and without converting this to contiguous tensor + # subsequent view is not valid and the export fails + # solution is to maintain the return tensor view from the decomp to be + # exactly same as *flash* variant. + # flash variants output is contiguous as [N, L, H, E] + # _match variant out is contiguous as [N, H, L, E] + # out = out.transpose(1, 2).contiguous gets output as contiguous + # in [N, L, H, E]. 
+ # Subsrequent transpose(1, 2) then returns a view on which + # aforementioned code snippet, as showm below, is valid + # x = x.permute(2, 0, 1, 3).contiguous() and the viewed via + # x = x.view(L * N, H * E) + + # Really the invariant you want to maintain is: + # pre-dispatch op-output and its decomposed representation must + # return tensor with same view and dims + output = output.transpose(1, 2).contiguous(memory_format=torch.contiguous_format) + return (output.transpose(1, 2), attn) + + +def register_inplace(aten_op, outplace_op): + @register_decomposition(aten_op) + def inplace_op(*args, **kwargs): + out = outplace_op(*args, **kwargs) + return args[0].copy_(out) + + return inplace_op + + +@register_decomposition([aten.baddbmm]) +@out_wrapper() +@pw_cast_for_opmath +def baddbmm(self, batch1, batch2, beta=1, alpha=1): + if not self.is_floating_point() and not self.is_complex(): + beta = int(beta) + alpha = int(alpha) + result = torch.bmm(batch1, batch2) + if not isinstance(alpha, numbers.Number) or alpha != 1: + result = result * alpha + if beta == 0: + return result + if not isinstance(beta, numbers.Number) or beta != 1: + self = self * beta + return self + result + + +@register_decomposition(aten.floor_divide) +@out_wrapper() +def floor_divide(self, other): + return torch.div(self, other, rounding_mode="floor") + + +@register_decomposition(aten.sym_numel) +def sym_numel(t): + return functools.reduce(operator.mul, t.shape, 1) + + +@register_decomposition([aten.sum.default, aten.sum.out]) +def sum_default( + self: Tensor, + *, + dtype: Optional[torch.dtype] = None, + out: Optional[Tensor] = None, +) -> Tensor: + if out is None: + return aten.sum.dim_IntList(self, [], dtype=dtype) + else: + return aten.sum.IntList_out(self, [], dtype=dtype, out=out) + + +@register_decomposition([aten.squeeze.default, aten.squeeze.dim]) +def squeeze_default(self: Tensor, dim: Optional[int] = None): + if dim is None: + return aten.squeeze.dims(self, list(range(self.dim()))) + else: + return aten.squeeze.dims(self, [dim]) + + +@register_decomposition(torch.ops.aten._weight_norm_interface) +def _weight_norm_interface(x, y, dim=0): + # https://github.com/pytorch/pytorch/blob/852f8526c52190125446adc9a6ecbcc28fb66182/aten/src/ATen/native/WeightNorm.cpp#L58 + keep_dim = tuple(i for i in range(len(x.shape)) if i != dim) + norm = x.norm(2, keep_dim, keepdim=True) + return x * (y / norm), norm + + +@register_decomposition(aten.isin) +@out_wrapper() +def isin(elements, test_elements, *, assume_unique=False, invert=False): + # handle when either elements or test_elements are Scalars (they can't both be) + if not isinstance(elements, torch.Tensor): + elements = torch.tensor(elements, device=test_elements.device) + if not isinstance(test_elements, torch.Tensor): + test_elements = torch.tensor(test_elements, device=elements.device) + + if test_elements.numel() < 10.0 * pow(elements.numel(), 0.145): + return isin_default(elements, test_elements, invert=invert) + else: + return isin_sorting( + elements, test_elements, assume_unique=assume_unique, invert=invert + ) + + +def isin_default(elements, test_elements, *, invert=False): + if elements.numel() == 0: + return torch.empty_like(elements, dtype=torch.bool) + + x = elements.view(*elements.shape, *((1,) * test_elements.ndim)) + if not invert: + cmp = x == test_elements + else: + cmp = x != test_elements + dim = tuple(range(-1, -test_elements.ndim - 1, -1)) + return cmp.any(dim=dim) + + +def isin_sorting(elements, test_elements, *, assume_unique=False, invert=False): + 
elements_flat = elements.flatten() + test_elements_flat = test_elements.flatten() + if assume_unique: + # This is the same as the aten implementation. For + # assume_unique=False, we cannot use unique() here, so we use a + # version with searchsorted instead. + all_elements = torch.cat([elements_flat, test_elements_flat]) + sorted_elements, sorted_order = torch.sort(all_elements, stable=True) + + duplicate_mask = sorted_elements[1:] == sorted_elements[:-1] + duplicate_mask = torch.constant_pad_nd(duplicate_mask, [0, 1], False) + + if invert: + duplicate_mask = duplicate_mask.logical_not() + + mask = torch.empty_like(duplicate_mask) + mask = mask.index_copy(0, sorted_order, duplicate_mask) + + return mask[0 : elements.numel()] + else: + sorted_test_elements, _ = torch.sort(test_elements_flat) + idx = torch.searchsorted(sorted_test_elements, elements_flat) + test_idx = torch.where(idx < sorted_test_elements.numel(), idx, 0) + cmp = sorted_test_elements[test_idx] == elements_flat + cmp = cmp.logical_not() if invert else cmp + return cmp.reshape(elements.shape) + + +@register_decomposition(aten.take) +@out_wrapper() +def take(self, index): + flattened = self.reshape(-1) + return flattened[index] + + +register_inplace(aten.addbmm_, aten.addbmm) +register_inplace(aten.addmm_, aten.addmm) +register_inplace(aten.addmv_, aten.addmv) +register_inplace(aten.baddbmm_, aten.baddbmm) +register_inplace(aten.fill_, aten.fill) +register_inplace(aten.gelu_, aten.gelu) +register_inplace(aten.hardswish_, aten.hardswish) +register_inplace(aten.hardtanh_, aten.hardtanh) +register_inplace(aten.hardsigmoid_, aten.hardsigmoid) +register_inplace(aten.__iand__, aten.__and__) +register_inplace(aten.__ilshift__, aten.__lshift__) +register_inplace(aten.index_put_, aten.index_put) +register_inplace(aten.index_reduce_, aten.index_reduce) +register_inplace(aten.__ior__, aten.__or__) +register_inplace(aten.__irshift__, aten.__rshift__) +register_inplace(aten.__ixor__, aten.__xor__) +register_inplace(aten.leaky_relu_, aten.leaky_relu) +register_inplace(aten.logit_, aten.logit) +register_inplace(aten.relu_, aten.relu) +register_inplace(aten.renorm_, aten.renorm) +register_inplace(aten.round_, aten.round) +register_inplace(aten.scatter_, aten.scatter) +register_inplace(aten.scatter_add_, aten.scatter_add) +register_inplace(aten.scatter_reduce_, aten.scatter_reduce) +register_inplace(aten.silu_, aten.silu) diff --git a/MLPY/Lib/site-packages/torch/_decomp/decompositions_for_jvp.py b/MLPY/Lib/site-packages/torch/_decomp/decompositions_for_jvp.py new file mode 100644 index 0000000000000000000000000000000000000000..bf91d9fb83427d13cc61133852ee8bb3fbba6e67 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_decomp/decompositions_for_jvp.py @@ -0,0 +1,302 @@ +import inspect +from typing import Callable, Dict, List, Optional, Tuple + +import torch +import torch._decomp +from torch import Tensor +from torch._prims_common.wrappers import _maybe_remove_out_wrapper + +decomposition_table = torch._decomp.decomposition_table +decomposition_table_for_jvp: Dict[torch._ops.OperatorBase, Callable] = {} +register_decomposition = torch._decomp.register_decomposition +aten = torch.ops.aten + +# NOTE: [forward-mode AD decompositions mechanism] +# +# The mechanism is in VariableType, +# IF any inputs have forward grad +# AND there is no forward AD formula implemented +# AND the functions is actually differentiable +# run the decomposition +# See run_jit_decomposition_with_args_for_jvp +# We currently use python decompositions that we torchscript. 
+# +# Note that we would be building the backward graph at the decomposed level +# too, but that is OK, because we would've errored out otherwise anyway. +# +# TODO: The mechanism we are using to register decompositions doesn't +# seem to be exclusively used for jvp. So open question here is whether +# torch/csrc/jit/runtime/decomposition_registry.cpp is being used for other things. +# If that is the case, we may go down the decomposition path unexpectedly +# (and possibly produce an unintelligible error) vs erroring out earlier and +# printing that the forward AD formula is not implemented. +# +# The solution to this may be to have a explicitly white list control when +# to enable the decomposition. + + +def maybe_register_decomposition(op): + def decorator(f): + try: + return register_decomposition(op)(f) + except Exception: + return f + + return decorator + + +# Functions where we need a special decomposition for jvp but there's another version that +# should be used more generally (ex. for jvp we need to recompute the mean and variance for +# the backwards of a normalization function. Without jvp, it should use the saved value) +decomposition_table_for_jvp = {} + + +def register_decomposition_for_jvp(fn): + return register_decomposition(fn, registry=decomposition_table_for_jvp) + + +def _register_jit_decomposition_for_jvp(decomp, use_python=False): + if decomp in decomposition_table_for_jvp: + decomposition_table_used = decomposition_table_for_jvp + elif decomp in decomposition_table: + decomposition_table_used = decomposition_table + else: + raise RuntimeError(f"could not find decomposition for {decomp}") + decomp_fn = decomposition_table_used[decomp] + + # `out_wrapper` extends a decompositions signature with + # an `out` parameter. However jit will use the unwrapped function's + # signature instead so we need to unwrap here to prevent an error + decomp_fn = _maybe_remove_out_wrapper(decomp_fn) + + if use_python: + decomp_fn = torch.jit.ignore(decomp_fn) + sig = inspect.signature(decomp_fn) + + # Create a string wrapping the function from the signature + # example output: + # def wrapped_decomp(x: torch.Tensor, y: int, z: int): + # return decomp_fn(x, y, z) + # Thanks copilot! + def get_function_def(sig): + param_def = [f"{param_str}" for param_str in sig.parameters.values()] + param_use = [f"{param_str}" for param_str in sig.parameters.keys()] + + return f"def wrapped_decomp({', '.join(param_def)}):\n return decomp_fn({', '.join(param_use)})\n" + + f_str = get_function_def(sig) + graph = torch.jit.CompilationUnit(f_str).wrapped_decomp.graph + else: + graph = torch.jit.script(decomp_fn).graph + torch.jit._register_decomposition(decomp, graph) + + +# The only decompositions here are temporary or hacks for the purposes of jvp + + +# TODO: do these also belong here? +@maybe_register_decomposition(aten.trace.default) +def trace(self: Tensor) -> Tensor: + return torch.sum(torch.diag(self)) + + +@maybe_register_decomposition(aten.log_sigmoid_forward.default) +def log_sigmoid_forward(self: Tensor) -> Tuple[Tensor, Tensor]: + min = torch.minimum(self.new_zeros(()), self) + z = torch.exp(-torch.abs(self)) + if self.is_cuda: + buffer = self.new_zeros((0,)) + else: + buffer = z + return min - torch.log1p(z), buffer + + +def recompute_mean_var( + input: Tensor, rstd: Tensor, inner_dim_indices: List[int], keepdim: bool +): + # for most norm decompositions, it will be the same as the core version except for here. 
+ # We recompute the mean and variance so that they track gradients through input + + mean = torch.mean(input, dim=inner_dim_indices, keepdim=keepdim) + var = torch.var(input, dim=inner_dim_indices, unbiased=False, keepdim=keepdim) + eps = torch.pow(1 / rstd, 2) - var # this makes me so sad inside + eps = eps.detach() + rstd = 1 / torch.sqrt(var + eps) + return mean, rstd + + +@register_decomposition_for_jvp(aten.native_layer_norm_backward) +def native_layer_norm_backward( + grad_out: Tensor, + input: Tensor, + normalized_shape: List[int], + mean: Tensor, + rstd: Tensor, + weight: Optional[Tensor], + bias: Optional[Tensor], + output_mask: List[bool], +) -> Tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor]]: + input_shape = input.shape + input_ndim = input.dim() + + axis = input_ndim - len(normalized_shape) + inner_dims = input_shape[axis:] + outer_dims = input_shape[:axis] + inner_dim_indices = list(range(axis, input_ndim)) + outer_dim_indices = list(range(0, axis)) + + N = 1 + for i in inner_dims: + N *= i + M = 1 + for i in outer_dims: + M *= i + if M <= 0 or N <= 0: + return ( + input.new_zeros(input_shape), + input.new_zeros(input_shape[axis:]), + input.new_zeros(input_shape[axis:]), + ) + + mean_, rstd_ = recompute_mean_var(input, rstd, inner_dim_indices, keepdim=True) + + x_hat = (input - mean_) * rstd_ + if weight is not None: + grad_x_hat = grad_out * weight + else: + grad_x_hat = grad_out + a = grad_x_hat * N + b = torch.sum(grad_x_hat, inner_dim_indices, True) + c1 = torch.mul(grad_x_hat, x_hat) + c2 = torch.sum(c1, inner_dim_indices, True) + c3 = torch.mul(x_hat, c2) + inner = a - b - c3 + + if output_mask[0]: + d_input: Optional[Tensor] = (rstd_ / N) * inner + else: + d_input = torch.zeros_like(input) # should be None but doesn't work with vjp + + if output_mask[1] and weight is not None: + if len(outer_dim_indices) > 0: + d_weight: Optional[Tensor] = torch.sum( + grad_out * x_hat, outer_dim_indices, False + ) + else: + d_weight = grad_out * x_hat + elif weight is not None: + d_weight = torch.zeros_like(weight) # should be None but doesn't work with vjp + else: + d_weight = torch.zeros(()) # should be None but doesn't work with vjp + + if output_mask[2] and bias is not None: + if len(outer_dim_indices) > 0: + d_bias: Optional[Tensor] = torch.sum(grad_out, outer_dim_indices, False) + else: + d_bias = grad_out.clone() + elif bias is not None: + d_bias = torch.zeros_like(bias) # should be None but doesn't work with vjp + else: + d_bias = torch.zeros(()) # should be None but doesn't work with vjp + + return (d_input, d_weight, d_bias) + + +def prod(x: List[int]): + r = 1 + for i in x: + r *= i + return r + + +@register_decomposition_for_jvp(aten.native_batch_norm_backward) +def native_batch_norm_backward( + grad_out: Tensor, + input: Tensor, + weight: Optional[Tensor], + running_mean: Optional[Tensor], + running_var: Optional[Tensor], + save_mean: Optional[Tensor], + save_invstd: Optional[Tensor], + train: bool, + eps: float, + output_mask: List[bool], +) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]: + input_shape = input.shape + input_rank = input.dim() + assert input_rank >= 2, "rank of the input must be at least 2" + + axis = 1 + num_features = prod(input_shape) / input_shape[axis] # type: ignore[arg-type] + mean = save_mean + invstd = save_invstd + if train: + assert ( + save_mean is not None and save_invstd is not None + ), "when train=True, save_mean and save_invstd are required" + + reduciton_dims = [0] + list(range(2, input.dim())) + assert invstd is not 
None # for typing + mean, invstd = recompute_mean_var(input, invstd, reduciton_dims, keepdim=False) + else: + assert running_mean is not None and running_var is not None + mean = running_mean + invstd = torch.rsqrt(running_var + eps) + + assert invstd is not None and mean is not None + + broadcast_mask = [1] * input_rank + broadcast_mask[axis] = input_shape[axis] + + reduction_axes: List[int] = [] + for i in range(input_rank): + if i != axis: + reduction_axes.append(i) + + mean = torch.reshape(mean, broadcast_mask) + norm = 1.0 / num_features + grad_output_sum = torch.sum(grad_out, reduction_axes) + dot_p = torch.sum(grad_out * (input - mean), reduction_axes) + + grad_mean = torch.reshape(grad_output_sum * norm, broadcast_mask) + proj_scale = torch.reshape(torch.mul(dot_p * norm, invstd * invstd), broadcast_mask) + + if weight is None: + grad_scale = torch.reshape(invstd, broadcast_mask) * 1.0 + else: + grad_scale = torch.reshape(invstd * weight, broadcast_mask) + + if train: + proj = (input - mean) * proj_scale + grad_input = ((grad_out - proj) - grad_mean) * grad_scale + else: + grad_input = grad_out * grad_scale + + if output_mask[1]: + grad_weight = dot_p * invstd + elif weight is not None: + grad_weight = torch.zeros_like( + weight + ) # should be None but doesn't work with vjp + else: + grad_weight = torch.zeros(()) # should be None but doesn't work with vjp + + if output_mask[2]: + grad_bias = grad_output_sum + else: + grad_bias = torch.zeros_like( + grad_output_sum + ) # should be None but doesn't work with vjp + + return (grad_input, grad_weight, grad_bias) + + +_register_jit_decomposition_for_jvp(torch.ops.aten.trace.default, use_python=True) +_register_jit_decomposition_for_jvp(torch.ops.aten.nll_loss_backward.default) +_register_jit_decomposition_for_jvp(torch.ops.aten.nll_loss2d_backward.default) +_register_jit_decomposition_for_jvp(torch.ops.aten._log_softmax_backward_data.default) +_register_jit_decomposition_for_jvp(torch.ops.aten._softmax_backward_data.default) +_register_jit_decomposition_for_jvp(torch.ops.aten.log_sigmoid_forward.default) +_register_jit_decomposition_for_jvp(torch.ops.aten.native_layer_norm_backward.default) +_register_jit_decomposition_for_jvp(torch.ops.aten.native_batch_norm_backward.default) +_register_jit_decomposition_for_jvp(torch.ops.aten.cudnn_batch_norm_backward.default) diff --git a/MLPY/Lib/site-packages/torch/_decomp/decompositions_for_rng.py b/MLPY/Lib/site-packages/torch/_decomp/decompositions_for_rng.py new file mode 100644 index 0000000000000000000000000000000000000000..3f9d21831d3430c6bc3a0d2c7712a55d0ff32c42 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_decomp/decompositions_for_rng.py @@ -0,0 +1,263 @@ +import functools +from collections import defaultdict +from typing import Callable, Dict + +import torch +import torch._decomp as decomp +from torch._decomp import get_decompositions +from torch._ops import OpOverload + +aten = torch.ops.aten + +rng_decompositions: Dict[str, Dict[OpOverload, Callable]] = defaultdict(dict) + + +def register_rng_decomposition(aten_op): + return decomp.register_decomposition(aten_op, rng_decompositions) + + +def throw_on_non_cuda(device): + raise RuntimeError( + f"You are trying to functionalize a {device.type} RNG operator but {device.type} does not " + f"use Philox/counter-based RNG. Therefore, functionalizing a {device.type} RNG operator is " + "not supported. We are discussing the possibility of a Philox-based RNG implementation for CPU." 
+ ) + + +# TODO - We have to register many more distributions here, and also higher level +# ops like dropout which have fused implementation and can hide the rand inside. +@register_rng_decomposition(aten.rand) +def rand(shape, dtype=None, layout=torch.strided, device=None, pin_memory=False): + if device and device.type != "cuda": + throw_on_non_cuda(device) + seed, offset = PhiloxStateTracker.get_state_as_tuple() + dtype = dtype or torch.float32 + out, offset_jump = torch.ops.rngprims.philox_rand( + shape, seed, offset, None, device, dtype + ) + PhiloxStateTracker.advance_offset(offset_jump) + return out + + +@register_rng_decomposition(aten.rand_like) +def rand_like( + x: torch.Tensor, + dtype=None, + layout=None, + device=None, + pin_memory=False, + memory_format=torch.preserve_format, +): + device = device or x.device + if device.type != "cuda": + throw_on_non_cuda(device) + dtype = dtype or x.dtype + seed, offset = PhiloxStateTracker.get_state_as_tuple() + out, offset_jump = torch.ops.rngprims.philox_rand( + x.shape, seed, offset, None, device, dtype + ) + PhiloxStateTracker.advance_offset(offset_jump) + return out + + +class PhiloxState: + """ + Represents a PhiloxRngState - (seed, offset) where offset = base_offset + + relative_offset. seed and base_offset basically point to the rng state just + before tracing starts. relative offset tracks the totally consumed offset at + trace time. + """ + + def __init__(self): + self.reset() + + def reset(self): + self.seed = torch.tensor(()) + self.base_offset = torch.tensor(()) + self.relative_offset = 0 + self.offset_advanced_alteast_once = False + + def validate_state(self): + assert self.seed.numel() != 0 and self.base_offset.numel() != 0 + + def advance_offset(self, consumed_offset): + self.offset_advanced_alteast_once = True + self.relative_offset = self.relative_offset + consumed_offset + + def set_state(self, seed, base_offset, relative_offset=0): + self.seed = seed + self.base_offset = base_offset + self.relative_offset = relative_offset + + def get_state_as_tuple(self): + self.validate_state() + return (self.seed, self.base_offset + self.relative_offset) + + def get_state_as_tensor(self): + # Only needed because we override get_rng_state. + self.validate_state() + return torch.stack([self.seed, self.base_offset + self.relative_offset]) + + def set_state_from_tensor(self, state): + # Only needed because we override set_rng_state. + self.seed, self.base_offset = torch.unbind(state) + self.relative_offset = 0 + + +class PhiloxStateTracker: + """ + Singleton class to track the philox rng state during AOT Autograd tracing. + For each aot tracing instance, AOT Autograd resets this tracker and keeps + track of both forward and backward offsets. At runtime, we only care about + the total consumed forward and backward offsets. For dynamic shapes, these + offsets are a function of input shapes. Therefore, the AOT generated graphs + have additional outputs that compute total consumed forward and backward + offsets. 
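    A rough usage sketch (`seed` and `offset` stand in for the RNG state
    tensors that AOT Autograd captures; illustrative, not the exact call
    sequence):

        with PhiloxStateTracker():
            PhiloxStateTracker.record_state(seed, offset, "forward")
            # ... tracing the forward advances the offset via the rand decomps ...
            new_fwd_offset = PhiloxStateTracker.get_updated_fwd_offset()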
+ """ + + running_state: PhiloxState + fwd_state: PhiloxState + bwd_state: PhiloxState + + def __enter__(self): + PhiloxStateTracker.reset() + return self + + def __exit__(self, exc_type, exc_cal, exc_tb): + PhiloxStateTracker.reset() + + @classmethod + def reset(cls): + cls.running_state = PhiloxState() + cls.fwd_state = PhiloxState() + cls.bwd_state = PhiloxState() + + @classmethod + def mark_beginning_of_forward(cls): + # Tells the tracker to use fwd_state as the running state + cls.running_state = cls.fwd_state + + @classmethod + def mark_beginning_of_backward(cls): + # Tells the tracker to use bwd_state as the running state + cls.running_state = cls.bwd_state + + @classmethod + def record_state(cls, seed, offset, mode): + # Records the seed and offset tensors. These tensors are used to invoke + # the philox_rand functional primitives. + if mode == "forward": + cls.fwd_state.set_state(seed, offset) + cls.mark_beginning_of_forward() + else: + assert mode == "backward" + cls.bwd_state.set_state(seed, offset) + + @classmethod + def get_state_as_tensor(cls): + # The only reason this exists is because we override get_rng_state and + # set_rng_state during tracing. get_rng_state expects a tensor output, + # so return (seed, offset) tuple upset other parts of the program like + # ctx.saved_tensors. + + # A bad consequence is that if user saves and restores rng state, we + # have little bit of ugliness in the generated code, where we first + # concat the (seed, offset) to create a tensor for get_rng_state, and + # then split it back to get (seed, offset) tuple in set_rng_state. + + # TODO: Investigate if there is be a better way to wrap the tuple in a + # false Tensor object, and then desugar it later on. + return cls.running_state.get_state_as_tensor() + + @classmethod + def get_state_as_tuple(cls): + return cls.running_state.get_state_as_tuple() + + @classmethod + def set_state_from_tensor(cls, x): + # This is only needed because we override set_rng_state. Look at the + # comment in get_state_from_tensor method. + cls.running_state.set_state_from_tensor(x) + + @classmethod + def advance_offset(cls, consumed_offset): + cls.running_state.advance_offset(consumed_offset) + + @classmethod + def get_current_relative_offset(cls): + return cls.running_state.relative_offset + + @staticmethod + def multiple_of_4(offset): + # torch cuda rng state offset must be a multiple of 4. For inductor, as + # we sum up all the numel, the result might not be a multiple of 4. This + # method achieves that. + return (offset + 3) // 4 * 4 + + @classmethod + def get_updated_fwd_offset(cls): + # Short circuit if no rand ops were observed + if not cls.fwd_state.offset_advanced_alteast_once: + return cls.fwd_state.base_offset + return cls.multiple_of_4( + cls.fwd_state.base_offset + cls.fwd_state.relative_offset + ) + + @classmethod + def get_updated_bwd_offset(cls): + # Short circuit if no rand ops were observed + if not cls.bwd_state.offset_advanced_alteast_once: + return cls.bwd_state.base_offset + return cls.multiple_of_4( + cls.bwd_state.base_offset + cls.bwd_state.relative_offset + ) + + +# Adding more decompositions which eventually use rand_like inside decomps. +# Adding these in rng_decompositions ensures the functionalization of rand_like +# ops used in these decomps. The list is copied from inductor codebase, which +# uses it for similar purpose. +# +# Caution - These decomps do not have same accuracy as that of eager. 
However, +# we can't just disable them with a config flag like fallback_random, because +# for functionalization of rng ops, we have to decompose these ops. +extra_random_decomps = get_decompositions( + [ + aten.cauchy, + aten.cauchy_, + aten.exponential, + aten.exponential_, + aten.geometric, + aten.geometric_, + aten.native_dropout, + aten.normal, + aten.normal_, + aten.normal_functional, + aten.log_normal, + aten.log_normal_, + aten.rrelu_with_noise, + aten.rrelu_with_noise_, + aten.uniform_, + ] +) +register_extra_random_decomp = functools.partial( + decomp.register_decomposition, registry=extra_random_decomps +) + + +@register_extra_random_decomp([aten.bernoulli_]) +def bernoulli_(self, p=0.5): + if self.device == torch.device("cpu"): + return NotImplemented + return self.copy_(torch.rand_like(self, dtype=torch.float32) < p) + + +@register_extra_random_decomp([aten.bernoulli.p]) +def bernoulli_p(self, p=0.5, *, generator=None): + if self.device == torch.device("cpu"): + return NotImplemented + assert generator is None + return torch.rand_like(self, dtype=torch.float32) < p + + +rng_decompositions.update(extra_random_decomps) # type: ignore[arg-type] diff --git a/MLPY/Lib/site-packages/torch/_deploy.py b/MLPY/Lib/site-packages/torch/_deploy.py new file mode 100644 index 0000000000000000000000000000000000000000..6ee4b4d3b33430f15fd875103ddc1e291d353c70 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_deploy.py @@ -0,0 +1,105 @@ +import io + +import torch +from torch.package import Importer, OrderedImporter, PackageImporter, sys_importer +from torch.package._package_pickler import create_pickler +from torch.package._package_unpickler import PackageUnpickler +from torch.serialization import _maybe_decode_ascii + + +def _save_storages(importer, obj): + serialized_storages = [] + serialized_dtypes = [] + + importer = importer if isinstance(importer, torch.package.PackageImporter) else None + importers: Importer + if importer is not None: + importers = OrderedImporter(importer, sys_importer) + else: + importers = sys_importer + + def persistent_id(obj): + if torch.is_storage(obj) or isinstance(obj, torch.storage.TypedStorage): + if isinstance(obj, torch.storage.TypedStorage): + # TODO: Once we decide to break serialization FC, we can + # remove this case + storage = obj._untyped_storage + dtype = obj.dtype + else: + storage = obj + dtype = torch.uint8 + + serialized_storages.append(obj) + serialized_dtypes.append(dtype) + return ("storage", len(serialized_storages) - 1) + + if hasattr(obj, "__reduce_deploy__"): + if _serialized_reduces.get(id(obj)) is None: + _serialized_reduces[id(obj)] = ( + "reduce_deploy", + id(obj), + *obj.__reduce_deploy__(importers), + ) + return _serialized_reduces[id(obj)] + + return None + + # Write the pickle data for `obj` + data_buf = io.BytesIO() + pickler = create_pickler(data_buf, importers) + pickler.persistent_id = persistent_id + pickler.dump(obj) + data_value = data_buf.getvalue() + return ( + data_value, + serialized_storages, + serialized_dtypes, + importer.zip_reader if importer else None, + ) + + +def _load_storages(id, zip_reader, obj_bytes, serialized_storages, serialized_dtypes): + def persistent_load(saved_id): + assert isinstance(saved_id, tuple) + typename = _maybe_decode_ascii(saved_id[0]) + data = saved_id[1:] + + if typename == "storage": + # TODO: Once we decide to break serialization FC, we can + # stop wrapping with TypedStorage + storage = serialized_storages[data[0]] + dtype = serialized_dtypes[data[0]] + return 
torch.storage.TypedStorage( + wrap_storage=storage.untyped(), dtype=dtype + ) + + if typename == "reduce_deploy": + reduce_id, func, args = data + if reduce_id not in _loaded_reduces: + _loaded_reduces[reduce_id] = func(_raw_packages[zip_reader], *args) + return _loaded_reduces[reduce_id] + + return None + + importer: Importer + if zip_reader is not None: + importer = OrderedImporter(_get_package(zip_reader), sys_importer) + else: + importer = sys_importer + + unpickler = PackageUnpickler(importer, io.BytesIO(obj_bytes)) + unpickler.persistent_load = persistent_load # type: ignore[method-assign] + result = _deploy_objects[id] = unpickler.load() + return result + + +def _get_package(zip_reader): + if zip_reader not in _raw_packages: + _raw_packages[zip_reader] = PackageImporter(zip_reader) + return _raw_packages[zip_reader] + + +_raw_packages: dict = {} +_deploy_objects: dict = {} +_serialized_reduces: dict = {} +_loaded_reduces: dict = {} diff --git a/MLPY/Lib/site-packages/torch/_dispatch/__init__.py b/MLPY/Lib/site-packages/torch/_dispatch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MLPY/Lib/site-packages/torch/_dispatch/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dispatch/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb7e7109147349cfef3bf5101730226d77db3b8b Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dispatch/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dispatch/__pycache__/python.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dispatch/__pycache__/python.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f2b6b06bf734f77ac55e166c18ee2087518378bb Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dispatch/__pycache__/python.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dispatch/python.py b/MLPY/Lib/site-packages/torch/_dispatch/python.py new file mode 100644 index 0000000000000000000000000000000000000000..fe1d23da4c5abf6fcfb0ad74a70befdd78b4342f --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dispatch/python.py @@ -0,0 +1,178 @@ +import itertools +import unittest.mock +from contextlib import contextmanager +from typing import Iterator + +import torch +import torch._C +import torch._ops +import torch.utils._python_dispatch +import torch.utils._pytree as pytree + +__all__ = ["enable_python_dispatcher", "no_python_dispatcher", "enable_pre_dispatch"] + +no_python_dispatcher = torch._C._DisablePythonDispatcher +enable_python_dispatcher = torch._C._EnablePythonDispatcher +enable_pre_dispatch = torch._C._EnablePreDispatch + +CROSSREF_FUNCTIONALIZE = False + + +def all_py_loaded_overloads() -> Iterator[torch._ops.OpOverload]: + """ + Warning: the set of overloads this will report is very subtle. It is precisely + the set of torch.ops functions that have actually been accessed from Python + (e.g., we actually called torch.ops.aten.blah at some point. This is DIFFERENT + from the set of registered operators, which will in general be a larger set, + as this would include all operators which we ran C++ static initializers or + Python operator registration on. This does not eagerly populate the list on + torch.ops.aten; this list is lazy! + + In other words, this is good for traversing over everything that has an + OpOverload object allocated in Python. 
We use it for cache invalidation, but + don't rely on this list being complete. + + Note that even if we did report all C++ registered overloads, this isn't guaranteed + to be complete either, as a subsequent lazy load of a library which triggers more + registrations could add more things to the set. + """ + for ns in torch.ops: + packets = getattr(torch.ops, ns) + for op_name in packets: + packet = getattr(packets, op_name) + for overload in packet: + yield getattr(packet, overload) + + +@contextmanager +def suspend_functionalization(): + f_tls = torch._C._dispatch_tls_is_dispatch_key_included( + torch._C.DispatchKey.Functionalize + ) + f_rv = torch._C._functionalization_reapply_views_tls() + if f_tls: + torch._disable_functionalization() + try: + yield + finally: + if f_tls: + torch._enable_functionalization(reapply_views=f_rv) + + +def check_tensor_metadata_matches(nv, rv, desc): + assert callable(desc) + assert nv.size() == rv.size(), f"{desc()}: sizes {nv.size()} != {rv.size()}" + assert nv.dtype == rv.dtype, f"{desc()}: dtype {nv.dtype} != {rv.dtype}" + same_strides, idx = torch._prims_common.check_significant_strides( + nv, rv, only_cuda=False + ) + assert ( + same_strides + ), f"{desc()}: strides {nv.stride()} != {rv.stride()} (mismatch at index {idx})" + + +def check_metadata_matches(n, r, desc): + assert callable(desc) + n_vals, n_spec = pytree.tree_flatten(n) + r_vals, r_spec = pytree.tree_flatten(r) + # TODO: test the specs match; empirically sometimes we have a tuple + # on one side and a list on the other + assert len(n_vals) == len(r_vals), f"{len(n_vals)} != {len(r_vals)}" + for i, nv, rv in zip(range(len(n_vals)), n_vals, r_vals): + if not isinstance(rv, torch.Tensor): + continue + check_tensor_metadata_matches(nv, rv, lambda: f"{desc()} output {i}") + + +class Lit: + def __init__(self, s): + self.s = s + + def __repr__(self): + return self.s + + +def _fmt(a: object) -> object: + if isinstance(a, torch.Tensor): + return Lit( + f"torch.empty_strided({tuple(a.size())}, {a.stride()}, dtype={a.dtype})" + ) + else: + return a + + +def make_crossref_functionalize(op, final_key): + from torch._subclasses.fake_tensor import FakeTensorMode + + # This case is pretty weird, suppress it for now + if op == torch.ops.aten.lift_fresh.default: + return final_key + + def handler(*args, **kwargs): + fake_mode = FakeTensorMode() + + def fakeify_defun(t): + if isinstance(t, torch.Tensor): + if torch._is_functional_tensor(t): + r = torch._from_functional_tensor(t) + # NB: This assumes that the inner tensor sizes/strides match + # the outer tensor sizes/strides. 
This doesn't necessarily have to + # be the case, see discussion at + # https://github.com/pytorch/pytorch/pull/87610/files/401ddeda1d769bedc88a12de332c7357b60e51a4#r1007264456 + assert t.size() == r.size() + assert t.stride() == r.stride() + else: + r = t + # TODO: suppress guards + return fake_mode.from_tensor(r) + return t + + def maybe_detach(t): + if isinstance(t, torch.Tensor): + return t.detach() + else: + return t + + # TODO: This probably does the wrong thing if you're running other + # substantive modes with the normal op outside here + with torch.utils._python_dispatch._disable_current_modes(), suspend_functionalization(): + f_args, f_kwargs = pytree.tree_map(fakeify_defun, (args, kwargs)) + orig_f_args, orig_f_kwargs = pytree.tree_map( + maybe_detach, (f_args, f_kwargs) + ) + with fake_mode: + f_r = op(*f_args, **f_kwargs) + r = op._op_dk(final_key, *args, **kwargs) + + def desc(): + fmt_args = ", ".join( + itertools.chain( + (repr(pytree.tree_map(_fmt, a)) for a in orig_f_args), + ( + f"{k}={pytree.tree_map(_fmt, v)}" + for k, v in orig_f_kwargs.items() + ), + ) + ) + return f"{op}({fmt_args})" + + check_metadata_matches(f_r, r, desc) + return r + + return handler + + +# NB: enabling this is slow, don't do it in a hot loop. This is purely +# for debugging purposes. +@contextmanager +def enable_crossref_functionalize(): + for op in all_py_loaded_overloads(): + op._uncache_dispatch(torch._C.DispatchKey.Functionalize) + try: + with enable_python_dispatcher(), unittest.mock.patch( + "torch._dispatch.python.CROSSREF_FUNCTIONALIZE", True + ): + yield + finally: + for op in all_py_loaded_overloads(): + op._uncache_dispatch(torch._C.DispatchKey.Functionalize) diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__init__.py b/MLPY/Lib/site-packages/torch/_dynamo/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ec689d5485e3429d92a7b2786991028f613f7e5f --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/__init__.py @@ -0,0 +1,96 @@ +import torch +from . import convert_frame, eval_frame, resume_execution +from .backends.registry import list_backends, lookup_backend, register_backend +from .callback import callback_handler, on_compile_end, on_compile_start +from .code_context import code_context +from .convert_frame import replay +from .decorators import ( + allow_in_graph, + assume_constant_result, + disable, + disallow_in_graph, + forbid_in_graph, + graph_break, + mark_dynamic, + mark_static, + mark_static_address, + maybe_mark_dynamic, + run, +) +from .eval_frame import ( + _reset_guarded_backend_cache, + explain, + export, + is_dynamo_supported, + is_inductor_supported, + optimize, + optimize_assert, + OptimizedModule, + reset_code, +) +from .external_utils import is_compiling +from .utils import graph_break_reasons, guard_failures, orig_code_map, reset_frame_count + +__all__ = [ + "allow_in_graph", + "assume_constant_result", + "disallow_in_graph", + "forbid_in_graph", + "graph_break", + "mark_dynamic", + "maybe_mark_dynamic", + "mark_static", + "mark_static_address", + "optimize", + "optimize_assert", + "export", + "explain", + "run", + "replay", + "disable", + "reset", + "OptimizedModule", + "is_compiling", + "register_backend", + "list_backends", + "lookup_backend", +] + +if torch.manual_seed is torch.random.manual_seed: + import torch.jit._builtins + + # Wrap manual_seed with the disable decorator. + # Can't do it at its implementation due to dependency issues. 
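    # The effect is that calls to torch.manual_seed are not traced into a
    # compiled graph; they run eagerly instead.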
+ torch.manual_seed = disable(torch.manual_seed) + # Add the new manual_seed to the builtin registry. + torch.jit._builtins._register_builtin(torch.manual_seed, "aten::manual_seed") + + +def reset() -> None: + """Clear all compile caches and restore initial state""" + with convert_frame.compile_lock: + reset_code_caches() + convert_frame.input_codes.clear() + convert_frame.output_codes.clear() + orig_code_map.clear() + guard_failures.clear() + graph_break_reasons.clear() + resume_execution.ContinueExecutionCache.cache.clear() + _reset_guarded_backend_cache() + reset_frame_count() + torch._C._dynamo.compiled_autograd.clear_cache() + convert_frame.FRAME_COUNTER = 0 + convert_frame.FRAME_COMPILE_COUNTER.clear() + callback_handler.clear() + + +def reset_code_caches() -> None: + """Clear compile caches that are keyed by code objects""" + with convert_frame.compile_lock: + for weak_code in ( + convert_frame.input_codes.seen + convert_frame.output_codes.seen + ): + code = weak_code() + if code: + reset_code(code) + code_context.clear() diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e595649e8f4b26768d9002b430cd67f000b2ba52 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/_trace_wrapped_higher_order_op.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/_trace_wrapped_higher_order_op.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a418daeab7cba1196d6ca2951763b3d4fb682b71 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/_trace_wrapped_higher_order_op.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/bytecode_analysis.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/bytecode_analysis.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b784e1e9cab12ea0606a10aff0e3f884f3b4868d Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/bytecode_analysis.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/bytecode_transformation.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/bytecode_transformation.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..436a535ffe0a4608e5171ad0107f0ca31c32abe0 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/bytecode_transformation.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/cache_size.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/cache_size.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..086497812b8329dc31fa270d086c31adb9b76b4b Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/cache_size.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/callback.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/callback.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f39f945c4cfe3c935e912195e91fbbfa93a9f0bf Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/callback.cpython-39.pyc differ diff --git 
a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/code_context.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/code_context.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8ec1ca30017eb99fc1c3cbc0be8985df66a2cca9 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/code_context.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/codegen.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/codegen.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bf78f1c9e841324d3d61945a5b1fb7be4d4417da Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/codegen.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/compiled_autograd.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/compiled_autograd.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8786952d037eb96360f186a4b3ab6a9cd2c0c1ae Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/compiled_autograd.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/comptime.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/comptime.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..785e6ce847c872c2493af27ccb443fcff53a43dc Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/comptime.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/config.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/config.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..17977d16796b54377d433fd1ad03ad27e5d8e0ab Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/config.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/convert_frame.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/convert_frame.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..995dfb6aae2d842ac18dcd25c5d391ca205034f6 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/convert_frame.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/current_scope_id.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/current_scope_id.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b8705a1d37912144b5447d3081883d2c4116a34 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/current_scope_id.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/debug_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/debug_utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..64ee3eabbc3a5ad774dce911c1c74f503338176f Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/debug_utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/decorators.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/decorators.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..203dc81fdf05ab7cba2b282bc9b16c86a5dfd4f3 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/decorators.cpython-39.pyc differ diff --git 
a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/device_interface.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/device_interface.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..faf6361d1622a5ce826ff33338e5390c42db58b2 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/device_interface.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/eval_frame.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/eval_frame.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..091afa63485547d83e344f844d3eced092d7becb Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/eval_frame.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/exc.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/exc.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a6750e39905a221f576b7bf229c20b6c45ea4a89 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/exc.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/external_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/external_utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..adb113430796c14cb5d62a90c9fc959152525426 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/external_utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/funcname_cache.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/funcname_cache.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e7ea2c7d53c5c2de4fa13b4c0bea8201d3ac473e Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/funcname_cache.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/guards.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/guards.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..90c809468d95116d60637130e32c3f4c522712cf Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/guards.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/hooks.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/hooks.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0a5819875c0851a45db6db33229c576f8915124e Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/hooks.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/logging.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/logging.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f15c48bd996de0295bf3194883114576dd39a183 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/logging.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/mutation_guard.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/mutation_guard.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..41af299a2ee895157ea2b2d0e501ad4f1df5deba Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/mutation_guard.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/output_graph.cpython-39.pyc 
b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/output_graph.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..12e2db23f2eb3896a6978b2c51b53d574fd8fa38 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/output_graph.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/polyfill.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/polyfill.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a30fcf32cddfd1a82d18ae35c210401bc4977021 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/polyfill.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/profiler.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/profiler.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a25c112870a15184e52b8ae5e54e13332312368c Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/profiler.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/replay_record.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/replay_record.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3c13dbe92d536948ae163e36ab52e3db666bc47c Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/replay_record.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/resume_execution.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/resume_execution.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f4e78726dc788e3eef0c96ff94e700666b3df4ce Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/resume_execution.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/side_effects.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/side_effects.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ec218254c0a181ddaa3423a665b3de12a3ce642b Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/side_effects.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/source.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/source.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c01c78c246f42ae1436763f4226cd8df1f1477ad Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/source.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/symbolic_convert.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/symbolic_convert.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a9b1cbf2d7076f565c775c4a1d8ab4453da13666 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/symbolic_convert.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/tensor_version_op.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/tensor_version_op.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3c785bd495a995d217bf36ab4c47da575efc6d39 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/tensor_version_op.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/test_case.cpython-39.pyc 
b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/test_case.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df472ea4e106afee7c28a4c3b39411c55187abec Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/test_case.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/test_minifier_common.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/test_minifier_common.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ff27a050d405988417308aa4346f3de302acb332 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/test_minifier_common.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/testing.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/testing.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dc4011169374b780ec1fc233ba4431dcc8704ce0 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/testing.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/trace_rules.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/trace_rules.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ad0951d1de826c850c95faa750b88ca60530c7a Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/trace_rules.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/types.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/types.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d95c8a5c1086c1a7398d8d4a60439ac9b28f8e37 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/types.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a63ed4a6a4b72c98c18af0fdb3a53d4db416b4ba Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/__pycache__/utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/_trace_wrapped_higher_order_op.py b/MLPY/Lib/site-packages/torch/_dynamo/_trace_wrapped_higher_order_op.py new file mode 100644 index 0000000000000000000000000000000000000000..8c6061934ad2badeaada3b9aa3501aa16044c3af --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/_trace_wrapped_higher_order_op.py @@ -0,0 +1,120 @@ +import torch +from torch._C import DispatchKey +from torch._higher_order_ops.utils import autograd_not_implemented + +from torch._ops import HigherOrderOperator +from torch._subclasses import FakeTensorMode +from torch.fx.experimental._backward_state import BackwardState + +from torch.fx.experimental.proxy_tensor import ProxyTorchDispatchMode, track_tensor_tree +from torch.utils._python_dispatch import _get_current_dispatch_mode +from torch.utils._pytree import tree_map_only + + +__all__ = ["trace_wrapped"] + + +# trace_wrapped(*args, fn) is equivalent to fn(*args), but with a twist: +# if you make_fx trace through this call, we will not actually trace into fn; instead, +# we will directly insert it as a call_function to fn in the graph. +# (Unlike make_fx, Dynamo WILL inline into fn.) +# You can think of this as a one off allow_in_graph equivalent for proxy tensor tracing. 
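# A minimal sketch of the intended usage (the hook `fn` below is hypothetical):
#
#   def fn(grad):
#       # must return a tensor with the same metadata as its input
#       return torch.zeros_like(grad)
#
#   out = trace_wrapped(grad, fn=fn)  # make_fx records a call_function to fn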
+# +# Because proxy tensor tracing does not actually run the function, there are +# requirements on the behavior of fn. We are still figuring it out, but here is the current state: +# +# 1) fn SHOULD only take a single argument, which must be a tensor +# 2) fn MUST return a new tensor with the same metadata as the original tensor +# (e.g., zeros_like(input) is a permissible implementation of fn). +# This is verified via an extra assert that is inserted into the traced graph. +# 3) fn MAY have side effects, but it MAY NOT perform metadata mutation on other tensors +# participating in proxy tensor tracing (it MAY mutate other tensors, it MAY mutate Python state) +# These requirements stem from the requirement that we need to continue performing proxy tensor tracing, +# which assumes accurate fake tensor metadata, without actually running fn. +# In the future, we may allow for a "meta" function associated with fn to allow for more interesting input-output patterns. +# +# Note that tensors / Python state are allowed to be mutated. +# This is relaxed constraint is not always sound, but it is sound for backward tracing with fake +# tensors as it takes place in AOTAutograd, as the backward pass is guaranteed not to depend on concrete +# tensor values (via fake tensor) or Python state (because the autograd engine doesn't depend on Python). +# +# The intended use case for this function is to allow AOTAutograd to defer complex +# backward hooks to compiled autograd. AOTAutograd performs a make_fx trace which preserves +# the function call as is in the graph, and only when we Dynamo through the backward graph in +# compiled autograd do we inline into the function. + + +def trace_wrapped(*args, **kwargs): + with torch.no_grad(): + return _trace_wrapped_op(*args, **kwargs) + + +# TODO(jansel): need to ensure this does not get DCEed +_trace_wrapped_op = HigherOrderOperator("trace_wrapped") + + +def _assert_meta(grad, size, stride, dtype): + assert grad.size() == size, "size mismatch" + assert grad.stride() == stride, "stride mismatch" + assert grad.dtype == dtype, "dtype mismatch" + return grad + + +@_trace_wrapped_op.py_impl(ProxyTorchDispatchMode) +def inner_trace(mode, *args, bw_state=None, **kwargs): + def self_invoke(*args, **dyn_kwargs): + with torch.no_grad(): + return _trace_wrapped_op(*args, **dyn_kwargs, **kwargs) + + def unwrap_proxies(x): + if isinstance(x, torch.Tensor): + return mode.tracer.unwrap_proxy(x) + if isinstance(x, (list, tuple)): + return type(x)(map(unwrap_proxies, x)) + if x is None: + return None + raise AssertionError(f"unhandled type: {type(x)}") + + proxy_kwargs = {} + if bw_state is not None: + assert isinstance(bw_state, BackwardState) and bw_state.proxy is not None + proxy_kwargs["bw_state"] = bw_state.proxy + out_proxy = mode.tracer.create_proxy( + "call_function", + self_invoke, + unwrap_proxies(args), + proxy_kwargs, + name="trace_wrapped", + ) + + if args[0] is None: + grad = args[1] # module backward hooks + else: + grad = args[0] # other backward hooks + grad = tree_map_only(torch.Tensor, torch.empty_like, grad) + track_tensor_tree(grad, out_proxy, constant=None, tracer=mode.tracer) + return grad + + +@_trace_wrapped_op.py_impl(FakeTensorMode) +def inner_fake(*args, **kwargs): + raise RuntimeError("This op should never be invoked here") + + +@_trace_wrapped_op.py_impl(DispatchKey.CompositeExplicitAutograd) +def _trace_wrapped_op_dense(*args, fn, **kwargs): + mode = _get_current_dispatch_mode() + assert mode is None, "Mode should never be enabled for CPU/CUDA key" 
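    # No dispatch mode is active at this key, so simply run the wrapped fn eagerly.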
+ return fn(*args, **kwargs) + + +_trace_wrapped_op.py_impl(DispatchKey.Autograd)( + autograd_not_implemented(_trace_wrapped_op, deferred_error=True) +) + + +@_trace_wrapped_op.py_functionalize_impl +def _trace_wrapped_functionalized(ctx, *args, **kwargs): + unwrapped_args = ctx.unwrap_tensors(args) + with ctx.redispatch_to_next(): + return ctx.wrap_tensors(_trace_wrapped_op(*unwrapped_args, **kwargs)) diff --git a/MLPY/Lib/site-packages/torch/_dynamo/backends/__init__.py b/MLPY/Lib/site-packages/torch/_dynamo/backends/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a33edf30961fc2e309e25d3d67ec87e0928e7e08 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/common.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/common.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d2180a1b0618630d1116d16db89890750edbb81f Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/common.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/cudagraphs.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/cudagraphs.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..af61ca3f1f8bb79ca1eec2d0b479a90bcfbbddc5 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/cudagraphs.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/debugging.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/debugging.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..456c39b3c2aa53c54b32c6555acc2cbb3cb073ce Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/debugging.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/distributed.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/distributed.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a810e5114c8016f57ed69e5bb84e4580c8e3933c Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/distributed.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/inductor.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/inductor.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..699c7a32dbc138946bcf4f676b3463861b8fd1bf Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/inductor.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/onnxrt.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/onnxrt.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eecd342aa1b5de107dc908f20fcfbc3a555b8975 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/onnxrt.cpython-39.pyc differ diff --git 
a/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/registry.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/registry.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aea8ad1d121538fc9f782103852b21ae270a31bf Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/registry.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/tensorrt.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/tensorrt.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..88c170b964cc7e9ff8a2ecfa2ddcb376a4160af2 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/tensorrt.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/torchxla.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/torchxla.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f0341cfc8ce6e91efc47228b398a0620d9493281 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/torchxla.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/tvm.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/tvm.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..067b49623cde62f99536ffb1c80e99146c7fb4dd Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/backends/__pycache__/tvm.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/backends/common.py b/MLPY/Lib/site-packages/torch/_dynamo/backends/common.py new file mode 100644 index 0000000000000000000000000000000000000000..2b0e945f9920280c6e20906be541b102d2595e3f --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/backends/common.py @@ -0,0 +1,112 @@ +# mypy: ignore-errors + +import contextlib +import functools +import logging +from unittest.mock import patch + +import torch +from torch._dynamo import disable +from torch._dynamo.utils import counters, defake +from torch._functorch.aot_autograd import aot_module_simplified +from torch.utils._python_dispatch import _disable_current_modes + +log = logging.getLogger(__name__) + + +def aot_autograd(**kwargs): + def compiler_fn(gm: torch.fx.GraphModule, example_inputs): + # Hack to get around circular import problems with aot_eager_decomp_partition + if callable(kwargs.get("decompositions")): + kwargs["decompositions"] = kwargs["decompositions"]() + + # NB: dont delete counter increment + counters["aot_autograd"]["total"] += 1 + use_fallback = False + + if use_fallback: + log.debug("Unable to use AOT Autograd because graph has mutation") + counters["aot_autograd"]["not_ok"] += 1 + return gm + + # OK attempt to compile + + def _wrapped_bw_compiler(*args, **kwargs): + # stop TorchDynamo from trying to compile our generated backwards pass + return disable(disable(bw_compiler)(*args, **kwargs)) + + bw_compiler = kwargs.get("bw_compiler") or kwargs["fw_compiler"] + kwargs["bw_compiler"] = _wrapped_bw_compiler + kwargs["inference_compiler"] = ( + kwargs.get("inference_compiler") or kwargs["fw_compiler"] + ) + + from functorch.compile import nop + + from torch._inductor.debug import enable_aot_logging + + # debug asserts slow down compile time noticeably, + # So only default them on when the aot_eager backend is used. 
+ if kwargs.get("fw_compiler", None) == nop: + patch_config = patch("functorch.compile.config.debug_assert", True) + else: + patch_config = contextlib.nullcontext() + + try: + # NB: NOT cloned! + with enable_aot_logging(), patch_config: + cg = aot_module_simplified(gm, example_inputs, **kwargs) + counters["aot_autograd"]["ok"] += 1 + return disable(cg) + except Exception: + counters["aot_autograd"]["not_ok"] += 1 + raise + + return compiler_fn + + +def mem_efficient_fusion_kwargs(use_decomps): + from functorch.compile import ( + default_decompositions, + min_cut_rematerialization_partition, + ts_compile, + ) + + kwargs = { + # these are taken from memory_efficient_fusion() + "fw_compiler": ts_compile, + "bw_compiler": ts_compile, + "partition_fn": min_cut_rematerialization_partition, + } + + if use_decomps: + kwargs["decompositions"] = default_decompositions + + return kwargs + + +def fake_tensor_unsupported(fn): + """ + Decorator for backends that need real inputs. We swap out fake + tensors for zero tensors. + """ + + @functools.wraps(fn) + def wrapper(model, inputs, **kwargs): + with _disable_current_modes(): + inputs = list(map(defake, inputs)) + return fn(model, inputs, **kwargs) + + return wrapper + + +def device_from_inputs(example_inputs) -> torch.device: + for x in example_inputs: + if hasattr(x, "device"): + return x.device + + +def dtype_from_inputs(example_inputs) -> torch.dtype: + for x in example_inputs: + if hasattr(x, "dtype"): + return x.dtype diff --git a/MLPY/Lib/site-packages/torch/_dynamo/backends/cudagraphs.py b/MLPY/Lib/site-packages/torch/_dynamo/backends/cudagraphs.py new file mode 100644 index 0000000000000000000000000000000000000000..76c57c505905ce797365088111ab28a77ab2b722 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/backends/cudagraphs.py @@ -0,0 +1,239 @@ +# mypy: ignore-errors + +import functools +import operator +from collections import defaultdict +from typing import Dict, List, Optional + +import torch +from torch._dynamo.backends.debugging import boxed_nop +from torch._inductor.cudagraph_trees import cudagraphify_impl +from torch._inductor.cudagraph_utils import ( + BoxedDeviceIndex, + check_multiple_devices_or_any_cpu_nodes, + get_mutation_stack_trace, +) +from torch._inductor.utils import ( + BoxedBool, + count_tangents, + has_incompatible_cudagraph_ops, + num_fw_fixed_arguments, + output_node, +) +from torch.multiprocessing.reductions import StorageWeakRef +from .common import aot_autograd +from .registry import register_backend + +perf_log = torch._logging.getArtifactLogger(__name__, "perf_hints") + + +def find_input_mutations(g): + def meta_fk(meta): + return meta["val"] if "val" in meta else meta["fake_result"] + + inputs = defaultdict(set) + input_idx = 0 + mutated_inputs = set() + for n in g.nodes: + if n.op == "placeholder": + if isinstance(meta_fk(n.meta), torch.Tensor): + inputs[StorageWeakRef(meta_fk(n.meta)._typed_storage())].add(input_idx) + input_idx += 1 + elif n.op == "call_function": + if n.target is operator.getitem: + continue + schema = n.target._schema + for i, arg in enumerate(schema.arguments): + if i < len(n.args): + argument = n.args[i] + else: + if arg.name not in n.kwargs: + continue + argument = n.kwargs[arg.name] + mut_arg = False + if arg.alias_info: + if arg.alias_info.is_write: + mut_arg = True + if mut_arg: + # TODO: not correct for args that contain tensors in a struct + # like list + mutated_inputs |= inputs[ + StorageWeakRef(meta_fk(argument.meta)._typed_storage()) + ] + + # TODO: error on unrecognized 
nodes + return mutated_inputs + + +def get_device_node_mapping(gm: torch.fx.GraphModule): + device_node_mapping: Dict[torch.device, torch.fx.Node] = {} + for n in gm.graph.nodes: + t = n.meta.get("val", None) + if isinstance(t, torch.Tensor) and t.device not in device_node_mapping: + device_node_mapping[t.device] = n + return device_node_mapping + + +def check_for_mutation(aot_model: torch.fx.GraphModule, num_fixed) -> Optional[str]: + mutation_indices = find_input_mutations(aot_model.graph) - set(range(num_fixed)) + if not mutation_indices: + return None + + return get_mutation_stack_trace(aot_model, mutation_indices) + + +def check_for_skip(aot_model: torch.fx.GraphModule, num_fixed) -> Optional[str]: + if mut_skip := check_for_mutation(aot_model, num_fixed): + return mut_skip + + if skip := check_multiple_devices_or_any_cpu_nodes( + get_device_node_mapping(aot_model) + ): + return skip + + if has_incompatible_cudagraph_ops(aot_model): + return "skipping cudagraphs due to incompatible op" + + return None + + +def get_device_index(gm) -> int: + device = next(iter(get_device_node_mapping(gm))) + assert device.type == "cuda" + return device.index + + +def get_stack_traces(gm) -> List[Optional[str]]: + output = output_node(gm) + assert len(output.args) == 1 + return [ + (arg.stack_trace if isinstance(arg, torch.fx.node.Node) else None) + for arg in output.args[0] + ] + + +def cudagraphs(dynamo_model, dynamo_inputs): + do_cudagraphs = BoxedBool(True) + boxed_device_index = BoxedDeviceIndex(None) + + def forward_cudagraphs(aot_model, aot_inputs, is_inference=False): + interp = boxed_nop(aot_model, aot_inputs) + fixed = num_fw_fixed_arguments(len(dynamo_inputs), len(aot_inputs)) + if skip_msg := check_for_skip(aot_model, fixed): + BoxedBool.disable(do_cudagraphs) + perf_log.warning("skipping cudagraphs due to %s", skip_msg) + return interp + + boxed_device_index.set(get_device_index(aot_model)) + + out = cudagraphify_impl( + interp, + aot_inputs, + range(fixed), + device_index=boxed_device_index.value, + is_backward=False, + is_inference=False, + stack_traces=get_stack_traces(aot_model), + ) + out._boxed_call = True + return out + + def backward_cudagraphs(aot_model, aot_inputs): + interp = boxed_nop(aot_model, aot_inputs) + if not do_cudagraphs: + return aot_model + + fixed = count_tangents(aot_model) + if skip_msg := check_for_skip(aot_model, fixed): + perf_log.warning("skipping cudagraphs due to %s", skip_msg) + + # See [Backward Generation Handling] + manager = torch._inductor.cudagraph_trees.get_manager( + boxed_device_index.value, create_if_none_exists=False + ) + assert manager is not None + + def fn(inputs): + manager.set_to_running_backward() + return aot_model(inputs) + + fn._boxed_call = True + return fn + + out = cudagraphify_impl( + interp, + aot_inputs, + range(fixed), + device_index=get_device_index(aot_model), + is_backward=True, + is_inference=False, + stack_traces=get_stack_traces(aot_model), + ) + out._boxed_call = True + return out + + aot_cudagraphs = aot_autograd( + fw_compiler=forward_cudagraphs, + bw_compiler=backward_cudagraphs, + inference_compiler=functools.partial(forward_cudagraphs, is_inference=True), + keep_inference_input_mutations=torch._dynamo.config.cudagraph_backend_keep_input_mutation, + ) + return aot_cudagraphs(dynamo_model, dynamo_inputs) + + +class CudagraphsBackend: + compiler_name = "cudagraphs" + + @staticmethod + def reset(): + from torch._inductor.cudagraph_trees import reset_cudagraph_trees + + reset_cudagraph_trees() + + @staticmethod + def 
__call__(model, inputs): + return cudagraphs(model, inputs) + + +# aot_cudagraphs only applies CUDA graphs to the graph. It is also helpful +# for debugging and can serve as a perf baseline. +register_backend(name="cudagraphs", compiler_fn=CudagraphsBackend()) + + +def cudagraphs_inner(model, inputs, copy_outputs=True, copy_inputs=True): + """This isn't registered as a backend, but is used in some benchmarks""" + assert isinstance(inputs, (list, tuple)) + if copy_inputs: + static_inputs = [torch.zeros_like(x) for x in inputs] + else: + static_inputs = list(inputs) + + # warmup + torch.cuda.synchronize() + stream = torch.cuda.Stream() + stream.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(stream): + model(*inputs) + stream.synchronize() + torch.cuda.current_stream().wait_stream(stream) + torch.cuda.synchronize() + + # record + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph, stream=stream): + static_outputs = model(*static_inputs) + if not isinstance(static_outputs, (list, tuple)): + static_outputs = (static_outputs,) + + def run(*new_inputs): + assert len(static_inputs) == len(new_inputs) + if copy_inputs: + for dst, src in zip(static_inputs, new_inputs): + dst.copy_(src) + graph.replay() + if copy_outputs: + return [x.clone() for x in static_outputs] + else: + return static_outputs + + return run diff --git a/MLPY/Lib/site-packages/torch/_dynamo/backends/debugging.py b/MLPY/Lib/site-packages/torch/_dynamo/backends/debugging.py new file mode 100644 index 0000000000000000000000000000000000000000..0bdc89fb699e2a07099f7d09e9fc4d3b1d8f3a43 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/backends/debugging.py @@ -0,0 +1,289 @@ +# mypy: ignore-errors + +import dataclasses +import functools +from importlib import import_module +from typing import Any, List, Optional + +from functorch.compile import min_cut_rematerialization_partition + +import torch +from torch import _guards +from torch._functorch.compilers import ts_compile +from .common import aot_autograd +from .registry import register_debug_backend as register_backend + +""" +This file contains TorchDynamo backends intended for debugging uses. +""" + + +@register_backend +def eager(gm, fake_tensor_inputs): + return gm + + +@register_backend +def pre_dispatch_eager(gm, fake_tensor_inputs): + from torch.fx.experimental.proxy_tensor import make_fx + + def runnable_gm(*args): + return torch.fx.Interpreter(gm).run(*args) + + pre_dispatch_gm = make_fx(runnable_gm, pre_dispatch=True)(*fake_tensor_inputs) + pre_dispatch_gm.print_readable() + + return pre_dispatch_gm + + +@register_backend +def eager_debug(gm, fake_tensor_inputs): + from torch._subclasses.schema_check_mode import SchemaCheckMode + + # We could add more debugging bits here. + # Right now, this backend can be used to check for and error on + # custom dispatcher ops that have incorrect schemas. + def inner(*args): + with SchemaCheckMode(): + return torch.fx.Interpreter(gm).run(*args) + + return inner + + +@register_backend(name="ts") +def torchscript(gm, fake_tensor_inputs): + return torch.jit.script(gm) + + +# used boxed call to discard inputs when they are no longer needed +def boxed_nop(fx_g, example_inputs): + def run(args): + return torch.fx.Interpreter(fx_g).boxed_run(args) + + run._boxed_call = True + return run + + +# Useful for debugging purpose +# aot_eager uses AOT Autograd backend with nop compiler. It is helpful in debugging. 
+aot_eager = aot_autograd( + fw_compiler=boxed_nop, partition_fn=min_cut_rematerialization_partition +) +register_backend(name="aot_eager", compiler_fn=aot_eager) + +aot_eager_default_partitioner = aot_autograd(fw_compiler=boxed_nop) +register_backend( + name="aot_eager_default_partitioner", compiler_fn=aot_eager_default_partitioner +) + +# Uses TorchInductor AOT Autograd decomps and partitioner to isolate aot vs +# inductor problems. +# aot_eager_decomp_partition just replaces the inductor compiler with nop to help +# isolate inductor vs aot_eager errors +aot_eager_decomp_partition = aot_autograd( + # these are taken from memory_efficient_fusion() + fw_compiler=boxed_nop, + bw_compiler=boxed_nop, + # NB: lambda here is to delay import of inductor + decompositions=lambda: import_module( + "torch._inductor.compile_fx" + ).select_decomp_table(), + partition_fn=functools.partial( + min_cut_rematerialization_partition, compiler="inductor" + ), +) +register_backend( + name="aot_eager_decomp_partition", compiler_fn=aot_eager_decomp_partition +) + +# AOT Autograd with torchscript backend. Default partitioner. +# aot_ts uses torchscript backend. We can use this with both nnc and nvfuser +# by using the relevant fuser with torch.jit.fuser(...) +aot_ts = aot_autograd(fw_compiler=ts_compile) +register_backend(name="aot_ts", compiler_fn=aot_ts) + +# These buggy backends are used for inducing bugs so that we can test +# our repro extraction / minifier scripts + + +class ReluCompileError(Exception): + pass + + +class TestingOnlyCompileError(Exception): + pass + + +@register_backend +def relu_compile_error_TESTING_ONLY(gm: torch.fx.GraphModule, example_inputs): + for node in gm.graph.nodes: + if node.target == torch.relu: + raise ReluCompileError() + return gm + + +@register_backend +def relu_runtime_error_TESTING_ONLY(gm: torch.fx.GraphModule, example_inputs): + for node in gm.graph.nodes: + if node.target == torch.relu: + node.target = torch._assert + node.args = (False, "ReluRuntimeError") + gm.recompile() + return gm + + +@register_backend +def relu_accuracy_error_TESTING_ONLY(gm: torch.fx.GraphModule, example_inputs): + for node in gm.graph.nodes: + if node.target == torch.relu: + node.target = torch.add + node.args = (node.args[0], 1) + gm.recompile() + + return gm + + +@register_backend +def non_leaf_compile_error_TESTING_ONLY(gm: torch.fx.GraphModule, example_inputs): + # Require at least one non-trivial thing in the graph, + # see https://github.com/pytorch/pytorch/issues/102898 + for node in gm.graph.nodes: + if node.op == "call_function": + break + else: + return gm + for t in example_inputs: + if not t.is_leaf: + raise TestingOnlyCompileError() + return gm + + +@dataclasses.dataclass +class ExplainOutput: + """ + This is the output of :func:`torch._dynamo.explain()` + There is no reason to create this class directly. 
+ """ + + graphs: List[torch.fx.GraphModule] + graph_count: int + graph_break_count: int + break_reasons: List[ + Any + ] # Type is GraphCompileReason but doesn't matter for this purpose + op_count: int + ops_per_graph: Optional[List[torch.fx.Node]] = None + out_guards: Optional[List[_guards.Guard]] = None + compile_times: Optional[str] = None + + def __str__(self): + output = f"Graph Count: {self.graph_count}\n" + output += f"Graph Break Count: {self.graph_break_count}\n" + output += f"Op Count: {self.op_count}\n" + + output += "Break Reasons:\n" + for idx, break_reason in enumerate(self.break_reasons): + output += f" Break Reason {idx+1}:\n" + output += f" Reason: {break_reason.reason}\n" + output += " User Stack:\n" + for frame_summary in break_reason.user_stack: + output += f" {frame_summary}\n" + + if self.ops_per_graph is not None: + output += "Ops per Graph:\n" + for idx, ops in enumerate(self.ops_per_graph): + output += f" Ops {idx+1}:\n" + for op in ops: + output += f" {op}\n" + + if self.out_guards is not None: + output += "Out Guards:\n" + for i, guard in enumerate(self.out_guards): + output += f" Guard {i+1}:\n" + output += f" {str(guard)}" + + if self.compile_times is not None: + output += f"Compile Times: {self.compile_times}\n" + return output + + +def _explain_graph_detail( + gm: torch.fx.GraphModule, graphs, op_count, ops_per_graph, break_reasons +): + """ + This function is a utility which processes a torch.fx.GraphModule and + accumulates information about its ops, graph breaks, and other details. It + is intended to be used by the ExplainWithBackend class and + `torch._dynamo.explain()` to provide details from Dynamo's graph capture. + + Parameters: + gm (torch.fx.GraphModule): The GraphModule to be processed. + graphs (list): A list that accumulates all the GraphModules processed. + op_count (int): The total count of operations in all GraphModules processed so far. + ops_per_graph (list): A list that accumulates the operations of each GraphModule. + break_reasons (list): A list that accumulates the reasons for breaks in each GraphModule. + + Returns: + tuple: A tuple containing the processed GraphModule, the updated lists of graphs, + operations per graph, and break reasons, and the updated operation count. + """ + graphs.append(gm) + ops = [node.target for node in gm.graph.nodes if node.op == "call_function"] + op_count += len(ops) + ops_per_graph.append(ops) + if gm.compile_subgraph_reason.graph_break: + break_reasons.append(gm.compile_subgraph_reason) + + return gm, graphs, op_count, ops_per_graph, break_reasons + + +class ExplainWithBackend: + """ + This class is intended to be used as a backend for `torch.compile`. It is + composable with other backends. When used in this way, it accumulates + information about graph breaks, ops, and other info and provides a string + representation summarizing this information. + + Attributes: + backend (str): The name of the backend to use for optimization. + graphs (list): A list of the graphs captured by TorchDynamo. + op_count (int): The total number of operations in all optimized graphs. + break_reasons (list): A list of graph break reasons with stack traces. 
+ + Example Usage: + def fn(x): + x = torch.sigmoid(x) + return x + + torch._dynamo.reset() + eb = ExplainWithBackend("inductor") + optimized_fn = torch.compile(fn, backend=eb) + result = optimized_fn(torch.randn(5)) + print(eb.output()) + """ + + def __init__(self, backend): + from .registry import lookup_backend + + self.backend = lookup_backend(backend) + self.graphs = [] + self.op_count = 0 + self.break_reasons = [] + + def __call__(self, gm: torch.fx.GraphModule, example_inputs): + gm, self.graphs, self.op_count, _, self.break_reasons = _explain_graph_detail( + gm, self.graphs, self.op_count, [], self.break_reasons + ) + return self.backend(gm, example_inputs) + + def output(self) -> ExplainOutput: + graph_count = len(self.graphs) + output = ExplainOutput( + self.graphs, + graph_count, + graph_count - 1, + self.break_reasons, + self.op_count, + ) + + return output diff --git a/MLPY/Lib/site-packages/torch/_dynamo/backends/distributed.py b/MLPY/Lib/site-packages/torch/_dynamo/backends/distributed.py new file mode 100644 index 0000000000000000000000000000000000000000..2d6812035e21b663a1e75df98ceb5ed9fbfbe1cd --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/backends/distributed.py @@ -0,0 +1,612 @@ +# mypy: ignore-errors + +import logging +import traceback +from dataclasses import dataclass, field +from typing import Any, List, Optional +from unittest import mock + +import torch +from torch import fx +from torch._dynamo.output_graph import GraphCompileReason +from torch._dynamo.utils import deepcopy_to_fake_tensor, detect_fake_mode +from torch._logging import trace_structured +from torch.fx.node import Node + +# Regular log messages should go through 'log'. +# ddp_graph_log is a separate artifact logger reserved for dumping graphs. +# See docs/source/logging.rst for more info. 
+log = logging.getLogger(__name__) +ddp_graph_log = torch._logging.getArtifactLogger(__name__, "ddp_graphs") + + +def args_str(args): + # a debug helper + if torch.is_tensor(args): + return f"T[{args.shape}]" + elif isinstance(args, tuple): + return f"tuple({', '.join([args_str(x) for x in args])})" + elif isinstance(args, list): + return f"list({', '.join([args_str(x) for x in args])})" + else: + return str(args) + + +@dataclass +class Bucket: + size: int = 0 + params: List[str] = field(default_factory=list) + nodes: List[fx.Node] = field(default_factory=list) + + # param_ids is just used for unit testing + param_ids: List = field(default_factory=list) + + # keep track of any buckets that were extended for logging purposes + opcount_increased_to_capture_external_output: int = 0 + paramsize_before_opcount_increase: int = 0 + + +def bucket_has_external_output(bucket: Bucket) -> bool: + nodes_in_bucket = set() + # we want to iterate in reverse order, but clumsi-luckily the bucket.nodes list was already created backwards + # so we don't reverse it here + for node in bucket.nodes: + # assume node.op != output, since those are filtered in the original iteration + nodes_in_bucket.add(node) + for user in node.users: + if user not in nodes_in_bucket: + return True + return False + + +def pretty_print_buckets(buckets: List[Bucket], bucket_bytes_cap: int): + headers = ("Index", "Size (b)", "Param Names") + rows = [] + extended_buckets = [] + for idx, bucket in enumerate(reversed(buckets)): + if len(bucket.params) > 0: + rows.append((idx, bucket.size, bucket.params[0])) + for param in bucket.params[1:]: + rows.append((None, None, param)) + if bucket.opcount_increased_to_capture_external_output > 0: + extended_buckets.append( + ( + idx, + bucket.opcount_increased_to_capture_external_output, + bucket.size - bucket.paramsize_before_opcount_increase, + ) + ) + + if len(rows): + log.info( + "\nDDPOptimizer used bucket cap %s and created %d buckets. Enable debug logs for detailed bucket info.", + bucket_bytes_cap, + len(buckets), + ) + + if len(extended_buckets): + log.warning( + "Some buckets were extended beyond their requested parameter capacities" + " in order to ensure each subgraph has an output node, required for fx graph partitioning." + " This can be the case when a subgraph would have only contained nodes performing inplace mutation," + " and returning no logical outputs. This should not be a problem, unless it results in too few graph" + " partitions for optimal DDP performance." + ) + + try: + from tabulate import tabulate + + log.debug( + "\nDDPOptimizer produced the following bucket assignments:\n%s", + tabulate(rows, headers=headers, tablefmt="simple_grid"), + ) + + if len(extended_buckets): + log.warning( + "DDPOptimizer extended these buckets to ensure per-subgraph output nodes:\n%s", + tabulate( + extended_buckets, + headers=("Index", "Extra Ops", "Extra Param Size (b)"), + tablefmt="simple_grid", + ), + ) + except ImportError: + log.debug( + "Please `pip install tabulate` in order to display ddp bucket sizes and diagnostic information." 
+ ) + else: + log.debug("DDPOptimizer captured no parameters and did not split this graph.") + + +def has_higher_order_op(gm): + # Check if there is a higher order op in the graph + for node in gm.graph.nodes: + if node.op == "get_attr": + maybe_param = getattr(gm, node.target) + if isinstance(maybe_param, torch.fx.GraphModule): + return True + return False + + +# 3 (lazy compile): Replace submodules with lazily compiling submodule +class SubmoduleReplacer(torch.fx.interpreter.Interpreter): + def __init__(self, module, compiler): + super().__init__(module) + self.compiler = compiler + + def lazily_compiled_submod(self, input_mod): + """ + Create a wrapper around submodules which: + - lazily compiles each of the partitioned submodules using the user-provided compiler + - unpacks singleton tuples/lists into flat arg + """ + + class LazilyCompiledModule(torch.nn.Module): + def __init__(self, submod, compiler, unwrap_singleton_tuple): + super().__init__() + self.submod = submod + self.compiler = compiler + self.compiled = False + self.unwrap_singleton_tuple = unwrap_singleton_tuple + + def forward(self, *args): + if not self.compiled: + # First compile with args as example_inputs + # These args will be fakeified if using Inductor/AOTAutograd + new_submod = self.compiler(self.submod, args) + del self.submod + self.submod = new_submod + self.compiled = True + self.compiler = None + + x = self.submod(*args) + # we must let 'input_mod' return a tuple, to make AOT happy. + # (aot_autograd compile_fn literally requires that the output of a graph it compiles is a tuple). + # however, we don't acutally want this tuple to be returned, since the fx logic that calls the submod + # will again wrap outputs from the submod in a tuple. So we unwrap it, and count on it being re-wrapped + if self.unwrap_singleton_tuple and isinstance(x, (tuple, list)): + return x[0] + return x + + unwrap_singleton_tuple = False + for sn in input_mod.graph.nodes: + if sn.op == "output": + if not isinstance(sn.args[0], tuple): + unwrap_singleton_tuple = True + sn.args = (sn.args,) + + input_mod.recompile() + input_mod.compile_subgraph_reason = GraphCompileReason( + "DDPOptimizer intentional graph-break (See Note [DDPOptimizer])." + " Set `torch._dynamo.config.optimize_ddp = False` to disable.", + [ + # it's close to useless to get a real stacktrace here, and quite verbose. + traceback.FrameSummary(__file__, 0, DDPOptimizer), + ], + ) + wrapper = LazilyCompiledModule( + input_mod, + self.compiler, + unwrap_singleton_tuple, + ) + return wrapper + + # We replace the submodules with lazy submodules which compile + # the corresponding submodules when they are run with real values + # Always returns `None` - we do not need to propagate values in order + # to replace submodules. + def run_node(self, n: Node) -> Any: + if n.op == "call_module": + real_mod = self.fetch_attr(n.target) + + ddp_graph_log.debug("\n---%s graph---\n%s", n.target, real_mod.graph) + + assert len(n.kwargs) == 0, "We assume only args for these modules" + lazily_compiled_submod = self.lazily_compiled_submod(real_mod) + + # We update the original (outer) graph with a call into the compiled module + # instead of the uncompiled one. 
+ self.module.delete_submodule(n.target) + n.target = "compiled_" + n.target + self.module.add_submodule(n.target, lazily_compiled_submod) + + +# 3 (no lazy compile): compile each of the partitioned submodules using the user-provided compiler +class SubmodCompiler(torch.fx.interpreter.Interpreter): + def __init__(self, module, compiler, fake_mode): + super().__init__(module) + self.compiler = compiler + self.fake_mode = fake_mode + + def compile_submod(self, input_mod, args, kwargs): + """ + Compile the submodule, + using a wrapper to make sure its output is always a tuple, + which is required by AotAutograd based compilers + """ + assert len(kwargs) == 0, "We assume only args for these modules" + + class WrapperModule(torch.nn.Module): + def __init__(self, submod, unwrap_singleton_tuple): + super().__init__() + self.submod = submod + self.unwrap_singleton_tuple = unwrap_singleton_tuple + + def forward(self, *args): + x = self.submod(*args) + # TODO(whc) + # for some reason the isinstance check is necessary if I split one node per submod + # - even though I supposedly wrapped the output in a tuple in those cases, the real + # compiled module was still returning a tensor + if self.unwrap_singleton_tuple and isinstance(x, (tuple, list)): + return x[0] + return x + + unwrap_singleton_tuple = False + for sn in input_mod.graph.nodes: + if sn.op == "output": + if not isinstance(sn.args[0], tuple): + unwrap_singleton_tuple = True + sn.args = (sn.args,) + + input_mod.recompile() + input_mod.compile_subgraph_reason = GraphCompileReason( + "DDPOptimizer intentional graph-break (See Note [DDPOptimizer])." + " Set `torch._dynamo.config.optimize_ddp = False` to disable.", + [ + # it's close to useless to get a real stacktrace here, and quite verbose. + traceback.FrameSummary(__file__, 0, DDPOptimizer), + ], + ) + + wrapper = WrapperModule( + self.compiler(input_mod, args), + unwrap_singleton_tuple, + ) + return wrapper + + # Note: + # + # The way distributed works today around fake tensors can be somewhat confusing. + # Some of these codepaths are shared in both runtime, and compile time. The presence + # of a fake_mode, read off of fake tensor inputs, dictates how we will operate. + # + # A few things to keep in mind: + # + # 1) We invoke `compile_submod` with a real module. The output of that gets stored + # on the graph via `self.module.add_submodule(n.target, compiled_submod_real)`. + # + # 2) When running a call_module targeted node, if we have a fake_mode, we fakify the + # module we got from self.fetch_attr(n.target). Regardless of fake_mode, we then execute it. + # + # 3) Fake tensors should always be around during compile time. + # + # 4) Fake tensors should never be around at runtime. + # + # 5) We end up with a compilation mode that takes a real submodule and fake tensors, + # to match what aot_autograd expects. 
See Note: [Fake Modules and AOTAutograd] + def run_node(self, n: Node) -> Any: + args, kwargs = self.fetch_args_kwargs_from_env(n) + new_args = [] + assert self.fake_mode + for arg in args: + if isinstance(arg, torch.Tensor) and not isinstance( + arg, torch._subclasses.FakeTensor + ): + new_args.append(torch._dynamo.utils.to_fake_tensor(arg, self.fake_mode)) + else: + new_args.append(arg) + + log.debug("run_node %s, %s got args %s", n.op, n.target, args_str(args)) + assert isinstance(args, tuple) + assert isinstance(kwargs, dict) + + if n.op == "call_module": + real_mod = self.fetch_attr(n.target) + if self.fake_mode: + curr_submod = deepcopy_to_fake_tensor(real_mod, self.fake_mode) + else: + curr_submod = real_mod + + ddp_graph_log.debug("\n---%s graph---\n%s", n.target, curr_submod.graph) + + # When calling the compiler on the submod, inputs (new_args) are expected to + # be FakeTensors already since Dynamo would have made them FakeTensors in the + # non-DDP flow. However, the parameters are _not_ expected to be FakeTensors, + # since this wrapping happens during compilation + + # Note: Returning Fake Tensors on First AOT Autograd Call + # + # Inductor will optimize strides of outputs when it deems it profitable. + # For instance, converting to channels last. When we split the graph here + # into multiple inductor compilations, we need to make sure that the + # output strides of one compilation is appropriately passed to the subsequent + # compilations. However, the mapping from inductor output to dynamo output + # is non-trivial due to aot_autograd's deduping, de-aliasing, mutation, re-writing, + # subclass handling, etc. In order to replay all this logic we set a flag such that + # the first invocation of inductor in aot_autograd will return Fake Tensors with + # appropriate strides. Then, all of aot autograd's runtime logic is replayed. + # This gives us the appropriately strided outputs here which will reflect runtime strides. + + class FakeifyFirstAOTInvocationGuard: + def __init__(self): + self.tc = torch._guards.TracingContext.try_get() + assert self.tc + torch._guards.TracingContext.try_get().fakify_first_call = True + + def __del__(self): + self.tc.fakify_first_call = False + + # For aot_eager and other backends, tracing context is not set + has_tracing_context = torch._guards.TracingContext.try_get() is not None + if has_tracing_context: + g = FakeifyFirstAOTInvocationGuard() + + from torch._dynamo.utils import counters + + init = counters["aot_autograd"]["total"] + compiled_submod_real = self.compile_submod(real_mod, new_args, kwargs) + + # TODO - better way of doing this? + # Only aot autograd handles fakifying first call + invoked_aot_autograd = init != counters["aot_autograd"]["total"] + + # We update the original (outer) graph with a call into the compiled module + # instead of the uncompiled one. 
+ self.module.delete_submodule(n.target) + n.target = "compiled_" + n.target + self.module.add_submodule(n.target, compiled_submod_real) + + # Finally, we have to produce inputs for use compiling the next submodule, + # and these need to be FakeTensors, so we execute the module under fake_mode + # Because parameters are not fake we patch fake tensor mode to allow non fake inputs + with self.fake_mode, mock.patch.object( + self.fake_mode, "allow_non_fake_inputs", True + ): + if has_tracing_context and invoked_aot_autograd: + out = compiled_submod_real(*new_args, **kwargs) + # output should be fake or subclass + assert all( + (not isinstance(t, torch.Tensor) or type(t) is not torch.Tensor) + for t in (out if isinstance(out, (list, tuple)) else [out]) + ) + return out + else: + return curr_submod(*new_args, **kwargs) + else: + # placeholder or output nodes don't need to get compiled, just executed + return getattr(self, n.op)(n.target, new_args, kwargs) + + +class DDPOptimizer: + + """Note [DDPOptimizer] + DDPOptimizer applies when dynamo compiles models wrapped in DistributedDataParallel (DDP), + breaking the dynamo graph into chunks to compile separately, with the breaks aligning to + the boundaries of gradient-allreduce buckets chosen by DDP. + + Background/Motivation + - DDP uses allreduce collectives to synchronize partial gradients computed on different workers + - DDP groups gradient allreduces into 'buckets' to optimize communication efficiency of all-reduce + - Parameters grouped into buckets are assumed to be adjacent in time, so they become ready + at around the same time during backward and thus can share the same allreduce efficiently + - Allreduces must overlap with backward compute for optimal training performance + - DDP schedules allreduces using 'hooks' fired from the c++ autograd engine in pytorch, which + operates when individual grads become 'ready' + - Dynamo+AOTAutograd produces a single fused graph that runs 'atomically' from the perspective of the + autograd engine, such that all gradients become 'ready' at the same time. Hooks fire after the whole + fused backward function executes, preventing any overlap of compute and communication + + Algorithm + - DDPOptimizer starts off with an FX graph traced by dynamo which represents forward. It can traverse + this graph in reverse order to determine the true order that gradients will become ready during backward. + - Parameter sizes are counted in reverse order, up to a bucket size limit, at which point a new bucket is started + and a graph break introduced + - Each of the subgraphs is compiled by the compiler provided to dynamo by the user, and then fused back together + into an outer module that is returned to the user + + Notes + - It would be better to enforce (by adding an API to DDP) that the bucket splits chosen here are used by DDP, + and that DDP does not need to detect or optimize bucket order by observing execution at runtime, as it does + in eager. + - If Dynamo can't capture a whole graph for the portion of the model wrapped by DDP, this algorithm will currently + produce splits that do not necessarily align with the buckets used by DDP. This should result in performance + degradation approaching the baseline case where graph-splits are not used, but not worse. 
+ - If the backend compiler fails to compile a single subgraph, it will execute eagerly despite the rest of the + subgraphs being compiled + - DDP has a 'parameters_and_buffers_to_ignore' field, which DDPOptimizer attempts to honor by reading markers + left by DDP on individual parameters. In cases where other transformations, such as reparameterization, are + also used, the ignore markers could be lost. If DDPOptimizer fails to ignore a parameter ignored by DDP, + it is not catastrophic but could impact performance by choosing sub-optimal bucket splits. + - DDPOptimizer always ignores all buffers, regardless of their ignore flag, since buffers do not require gradients, + and therefore aren't allreduced by DDP. (They are broadcast during forward, but this is not covered by + DDPOptimizer) + + Debugging + - Generally, it is easiest to debug DDPOptimizer in a single process program, using pdb. + - In many cases, the log messages are helpful (they show bucket size assignments)- + just set TORCH_LOGS env to include any of 'dynamo', 'distributed', or 'dist_ddp'. + - See `benchmarks/dynamo/distributed.py` for a simple harness that will run a toy model or a torchbench model + in a single process (or with torchrun, in multiple processes) + + Args: + bucket_bytes_cap (int): Controls the size of buckets, in bytes, used to determine graphbreaks. Should be + set to match the equivalent parameter on the original DDP module. + + backend_compile_fn (callable): A dynamo compiler function, to be invoked to compile each subgraph. + + first_bucket_cap (int): Controls the size of the first bucket. Should match DDP's first bucket cap. DDP + special-cases the first bucket size since it is sometimes optimal to start a small allreduce early. + + """ + + def __init__( + self, + bucket_bytes_cap: int, + backend_compile_fn, + first_bucket_cap: Optional[int] = None, + ): + if first_bucket_cap is not None: + self.first_bucket_cap = first_bucket_cap + elif torch.distributed.is_available(): + # this constant comes from C10D lib which is not always built + self.first_bucket_cap = torch.distributed._DEFAULT_FIRST_BUCKET_BYTES + else: + self.first_bucket_cap = bucket_bytes_cap + + self.bucket_bytes_cap = bucket_bytes_cap + assert ( + self.first_bucket_cap <= self.bucket_bytes_cap + ), "First bucket should be smaller/equal to other buckets to get comms warmed up ASAP" + + self.backend_compile_fn = backend_compile_fn + + def _ignore_parameter(self, parameter): + return hasattr(parameter, "_ddp_ignored") and parameter._ddp_ignored + + def compile_fn(self, gm: fx.GraphModule, example_inputs: List[torch.Tensor]): + """ + Implements graph splitting, first determining a set of of buckets by counting + parameter sizes in reverse graph order, then invoking the user/backend compiler + to compile each subgraph. Finally, stiches compiled graphs into one graphmodule + and returns its callable. + """ + if has_higher_order_op(gm): + # This indicates presence of a higher order op. For now, we + # have no way to break the higher order op into two buckets. + # Allowing higher order ops in the graph also requires + # changes in the split_module, becuase graph splitter + # currently assumes that all the args of all ops are + # tensors, but in the case of higher order ops, it could be + # a graph module. As a workaround, we are shortcircuiting + raise NotImplementedError( + "DDPOptimizer backend: Found a higher order op in the graph. " + "This is not supported. Please turn off DDP optimizer using " + "torch._dynamo.config.optimize_ddp=False. 
Note that this can " + "cause performance degradation because there will be one bucket " + "for the entire Dynamo graph. Please refer to this issue - " + "https://github.com/pytorch/pytorch/issues/104674." + ) + + # 1: compute the partition map according to DDP bucket logic + buckets = [Bucket()] # (size, param_names) + for node in reversed(gm.graph.nodes): + if node.op in ("output", "placeholder"): + continue + + if ( + buckets[0].size >= self.bucket_bytes_cap + or len(buckets) == 1 + and buckets[0].size >= self.first_bucket_cap + ): + if bucket_has_external_output(buckets[0]): + buckets.insert(0, Bucket()) + else: + # continue building this bucket past the point of filling its parameter capacity, + # to increase chances it contains at least one node that is either a global output or + # passed as input to a subsequent graph + + if buckets[0].opcount_increased_to_capture_external_output == 0: + buckets[0].paramsize_before_opcount_increase = buckets[0].size + buckets[0].opcount_increased_to_capture_external_output += 1 + + if node.op == "call_module": + target = gm.get_submodule(node.target) + for name, param in target.named_parameters(): + if param.requires_grad and not self._ignore_parameter(param): + buckets[0].size += param.untyped_storage().nbytes() + buckets[0].params.append(f"{node.target}_{name}") + buckets[0].param_ids.append(id(param)) + elif node.op == "get_attr": + maybe_param = getattr(gm, node.target) + if maybe_param.requires_grad and not self._ignore_parameter( + maybe_param + ): + buckets[0].size += maybe_param.untyped_storage().nbytes() + buckets[0].params.append(node.target) + buckets[0].param_ids.append(id(maybe_param)) + + # All nodes have to be mapped to a bucket, even if they don't have their own params + # Ignored params still end up in buckets, we just don't count them towards the capacity + buckets[0].nodes.append(node) + + if len(buckets) > 1 and buckets[0].size == 0: + # we collected a small preamble graph with ops that don't include parameters, fuse it back + buckets[1].nodes.extend(buckets[0].nodes) + assert len(buckets[0].params) == 0, "Params should be empty if size is 0" + del buckets[0] + + # stash buckets for testing/debugging purposes + self.buckets = buckets + pretty_print_buckets(buckets, self.bucket_bytes_cap) + + if len(buckets) == 1: + # bypass split/fuse logic if there is only one bucket + return self.backend_compile_fn(gm, example_inputs) + + # 2: partition the graphmodule according to bucket capacity + partition_map = {} + for idx, b in enumerate(buckets): + for node in b.nodes: + partition_map[node] = idx + + split_gm = fx.passes.split_module.split_module( + gm, None, lambda node: partition_map[node] + ) + + debug_str = ( + f"\n---orig graph---\n{gm.graph}\n" + + f"\n---split graph---\n{split_gm.graph}\n" + ) + for name, module in split_gm.named_modules(): + if "." not in name and len(name): + # only print the submod graphs, not their children + debug_str += f"\n---{name} graph---\n{module.graph}\n" + debug_str += "\n---------------\n" + ddp_graph_log.debug(debug_str) + + trace_structured( + "optimize_ddp_split_graph", + payload_fn=lambda: split_gm.print_readable(print_output=False), + ) + for name, module in split_gm.named_modules(): + if "." 
not in name and len(name): + trace_structured( + "optimize_ddp_split_child", + lambda: {"name": name}, + payload_fn=lambda: module.print_readable(print_output=False), + ) + + # NOTE, we want to enable `optimize_ddp_lazy_compile` by default as soon as possible, + # becuase it will fix stride mismatch errors (see motivation: https://github.com/pytorch/pytorch/pull/114154). + # However, lazy compile currently causes shape mismatch in other cases (`test_graph_split_inductor_transpose`) + # and we need to fix them before we can enable it by default. + if not torch._dynamo.config.optimize_ddp_lazy_compile: + # Today, optimize_ddp=True and keep_output_stride=False can lead to silent + # correctness issues. The problem is that ddp_optimizer works by partitioning + # the dynamo graph, sending each subgraph through aot autograd to inductor, + # and creates example inputs by eagerly interpreting each subgraph to get + # an output that with the same metadata that we'd get from eager mode. + # This is a problem though, for torch._inductor.config.keep_output_stride. + # The above config can cause the outputs of the first graph to have + # **different** strides from eager, causing the inputs that we pass + # to the second graph to be wrong. + # To really fix this, we would need to faithfully ask inductor + # what the outputs to each graph it expects are. + fake_mode = detect_fake_mode(example_inputs) + if fake_mode is None: + fake_mode = torch._subclasses.fake_tensor.FakeTensorMode() + + if torch._dynamo.config.optimize_ddp_lazy_compile: + submod_compiler = SubmoduleReplacer(split_gm, self.backend_compile_fn) + else: + submod_compiler = SubmodCompiler( + split_gm, self.backend_compile_fn, fake_mode + ) + submod_compiler.run(*example_inputs) + split_gm.recompile() + + ddp_graph_log.debug( + "\n---final graph---\n%s\n---------------\n", split_gm.graph + ) + return split_gm diff --git a/MLPY/Lib/site-packages/torch/_dynamo/backends/inductor.py b/MLPY/Lib/site-packages/torch/_dynamo/backends/inductor.py new file mode 100644 index 0000000000000000000000000000000000000000..b1a661378b616ce3e7975f0fb330b029ce9c7142 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/backends/inductor.py @@ -0,0 +1,16 @@ +# mypy: ignore-errors + +import sys + +from torch._dynamo import register_backend + + +@register_backend +def inductor(*args, **kwargs): + if sys.platform == "win32": + raise RuntimeError("Windows not yet supported for inductor") + + # do import here to avoid loading inductor into memory when it is not used + from torch._inductor.compile_fx import compile_fx + + return compile_fx(*args, **kwargs) diff --git a/MLPY/Lib/site-packages/torch/_dynamo/backends/onnxrt.py b/MLPY/Lib/site-packages/torch/_dynamo/backends/onnxrt.py new file mode 100644 index 0000000000000000000000000000000000000000..54e7a3c95f2f42f709bf38d78e088962f7b00df6 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/backends/onnxrt.py @@ -0,0 +1,37 @@ +# mypy: ignore-errors + +# This backend is maintained by ONNX team. To direct issues +# to the right people, please tag related GitHub issues with `module: onnx`. 
+# +# Maintainers' Github IDs: wschin, thiagocrepaldi, BowenBao, abock +from torch.onnx._internal.onnxruntime import ( + is_onnxrt_backend_supported, + torch_compile_backend, +) +from .registry import register_backend + + +def has_onnxruntime(): + # FIXME(abock): update test/dynamo/test_backends.py to call is_onnxrt_backend_supported() + return is_onnxrt_backend_supported() + + +if is_onnxrt_backend_supported(): + register_backend(name="onnxrt", compiler_fn=torch_compile_backend) +else: + + def information_displaying_backend(*args, **kwargs): + raise ImportError( + "onnxrt is not registered as a backend. " + "Please make sure all dependencies such as " + "numpy, onnx, onnxscript, and onnxruntime-training are installed. " + "Suggested procedure to fix dependency problem:\n" + " (1) pip or conda install numpy onnx onnxscript onnxruntime-training.\n" + " (2) Open a new python terminal.\n" + " (3) Call the API `torch.onnx.is_onnxrt_backend_supported()`:\n" + " (4) If it returns `True`, then you can use `onnxrt` backend.\n" + " (5) If it returns `False`, please execute the package importing section in " + "torch/onnx/_internal/onnxruntime.py under pdb line-by-line to see which import fails." + ) + + register_backend(name="onnxrt", compiler_fn=information_displaying_backend) diff --git a/MLPY/Lib/site-packages/torch/_dynamo/backends/registry.py b/MLPY/Lib/site-packages/torch/_dynamo/backends/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..038d9156838b876a4d44f80f07f6e5adccd89d57 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/backends/registry.py @@ -0,0 +1,115 @@ +# mypy: ignore-errors + +import functools +import sys +from typing import Callable, Dict, List, Optional, Protocol, Sequence, Tuple + +import torch +from torch import fx + + +class CompiledFn(Protocol): + def __call__(self, *args: torch.Tensor) -> Tuple[torch.Tensor, ...]: + ... + + +CompilerFn = Callable[[fx.GraphModule, List[torch.Tensor]], CompiledFn] + +_BACKENDS: Dict[str, CompilerFn] = dict() + + +def register_backend( + compiler_fn: Optional[CompilerFn] = None, + name: Optional[str] = None, + tags: Sequence[str] = (), +): + """ + Decorator to add a given compiler to the registry to allow calling + `torch.compile` with string shorthand. Note: for projects not + imported by default, it might be easier to pass a function directly + as a backend and not use a string. 
+ + Args: + compiler_fn: Callable taking a FX graph and fake tensor inputs + name: Optional name, defaults to `compiler_fn.__name__` + tags: Optional set of string tags to categorize backend with + """ + if compiler_fn is None: + # @register_backend(name="") syntax + return functools.partial(register_backend, name=name, tags=tags) + assert callable(compiler_fn) + name = name or compiler_fn.__name__ + assert name not in _BACKENDS, f"duplicate name: {name}" + _BACKENDS[name] = compiler_fn + compiler_fn._tags = tuple(tags) + return compiler_fn + + +register_debug_backend = functools.partial(register_backend, tags=("debug",)) +register_experimental_backend = functools.partial( + register_backend, tags=("experimental",) +) + + +def lookup_backend(compiler_fn): + """Expand backend strings to functions""" + if isinstance(compiler_fn, str): + if compiler_fn not in _BACKENDS: + _lazy_import() + if compiler_fn not in _BACKENDS: + _lazy_import_entry_point(compiler_fn) + if compiler_fn not in _BACKENDS: + from ..exc import InvalidBackend + + raise InvalidBackend(name=compiler_fn) + compiler_fn = _BACKENDS[compiler_fn] + return compiler_fn + + +def list_backends(exclude_tags=("debug", "experimental")) -> List[str]: + """ + Return valid strings that can be passed to: + + torch.compile(..., backend="name") + """ + _lazy_import() + exclude_tags = set(exclude_tags or ()) + return sorted( + [ + name + for name, backend in _BACKENDS.items() + if not exclude_tags.intersection(backend._tags) + ] + ) + + +@functools.lru_cache(None) +def _lazy_import(): + from .. import backends + from ..utils import import_submodule + + import_submodule(backends) + + from ..repro.after_dynamo import dynamo_minifier_backend + + assert dynamo_minifier_backend is not None + + +@functools.lru_cache(None) +def _lazy_import_entry_point(backend_name: str): + from importlib.metadata import entry_points + + compiler_fn = None + group_name = "torch_dynamo_backends" + if sys.version_info < (3, 10): + backend_eps = entry_points() + eps = [ep for ep in backend_eps.get(group_name, ()) if ep.name == backend_name] + if len(eps) > 0: + compiler_fn = eps[0].load() + else: + backend_eps = entry_points(group=group_name) + if backend_name in backend_eps.names: + compiler_fn = backend_eps[backend_name].load() + + if compiler_fn is not None and backend_name not in list_backends(tuple()): + register_backend(compiler_fn=compiler_fn, name=backend_name) diff --git a/MLPY/Lib/site-packages/torch/_dynamo/backends/tensorrt.py b/MLPY/Lib/site-packages/torch/_dynamo/backends/tensorrt.py new file mode 100644 index 0000000000000000000000000000000000000000..9a2ba60cdeb0f6581e049f670088269919fa0fa5 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/backends/tensorrt.py @@ -0,0 +1,14 @@ +# mypy: ignore-errors + +# import torch # type: ignore[import] +# from .common import device_from_inputs, fake_tensor_unsupported # type: ignore[import] +# from .registry import register_backend # type: ignore[import] + +""" +Placeholder for TensorRT backend for dynamo via torch-tensorrt +""" + +# @register_backend +# def tensorrt(gm, example_inputs): +# import torch_tensorrt # type: ignore[import] +# pass diff --git a/MLPY/Lib/site-packages/torch/_dynamo/backends/torchxla.py b/MLPY/Lib/site-packages/torch/_dynamo/backends/torchxla.py new file mode 100644 index 0000000000000000000000000000000000000000..b2c50e11e8341311a240497ad4c95adbaad7de36 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/backends/torchxla.py @@ -0,0 +1,75 @@ +# mypy: ignore-errors + +import 
logging +import warnings + +from functorch.compile import make_boxed_func + +from ..backends.common import aot_autograd +from .registry import register_backend, register_experimental_backend + +log = logging.getLogger(__name__) + + +@register_experimental_backend +def torchxla_trivial(gm, fake_tensor_inputs): + return gm + + +@register_experimental_backend +def torchxla_trace_once(model, fake_tensor_inputs): + warnings.warn( + "This backend will be deprecated in 2.2, please use `openxla` backend instead" + ) + + return xla_backend_helper(model, fake_tensor_inputs) + + +@register_backend +def openxla_eval(model, fake_tensor_inputs): + return xla_backend_helper(model, fake_tensor_inputs, boxed=False) + + +def openxla_eval_boxed(model, fake_tensor_inputs): + return xla_backend_helper(model, fake_tensor_inputs, boxed=True) + + +def xla_backend_helper(model, fake_tensor_inputs, boxed=False): + try: + import torch_xla.core.dynamo_bridge as bridge + except ImportError as e: + raise ImportError( + "Please follow the instruction in https://github.com/pytorch/xla#pytorchxla to install torch_xla" + ) from e + + compiled_graph = None + + def fwd(*args): + nonlocal model + nonlocal compiled_graph + if compiled_graph is None: + compiled_graph = bridge.extract_compiled_graph(model, args) + del model + return compiled_graph(*args) + + return make_boxed_func(fwd) if boxed else fwd + + +aot_torchxla_trivial = aot_autograd( + fw_compiler=torchxla_trivial, +) +register_experimental_backend( + name="aot_torchxla_trivial", compiler_fn=aot_torchxla_trivial +) + +aot_torchxla_trace_once = aot_autograd( + fw_compiler=torchxla_trace_once, +) +register_experimental_backend( + name="aot_torchxla_trace_once", compiler_fn=aot_torchxla_trace_once +) + +openxla = aot_autograd( + fw_compiler=openxla_eval_boxed, +) +register_backend(name="openxla", compiler_fn=openxla) diff --git a/MLPY/Lib/site-packages/torch/_dynamo/backends/tvm.py b/MLPY/Lib/site-packages/torch/_dynamo/backends/tvm.py new file mode 100644 index 0000000000000000000000000000000000000000..d0f1bd3c7cac8349d0e5a681fe1eaf4ed90386a7 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/backends/tvm.py @@ -0,0 +1,172 @@ +# mypy: ignore-errors + +import functools +import importlib +import logging +import os +import tempfile + +import torch +from .common import device_from_inputs, fake_tensor_unsupported + +from .registry import register_backend + +log = logging.getLogger(__name__) + + +@register_backend +@fake_tensor_unsupported +def tvm(gm, example_inputs, *, scheduler=None, trials=20000): + import tvm # type: ignore[import] + from tvm import relay # type: ignore[import] + from tvm.contrib import graph_executor # type: ignore[import] + + jit_mod = torch.jit.trace(gm, example_inputs) + device = device_from_inputs(example_inputs) + shape_list = [(f"inp_{idx}", i.shape) for idx, i in enumerate(example_inputs)] + example_outputs = gm(*example_inputs) + if len(example_outputs) == 0: + log.warning("Explicitly fall back to eager due to zero output") + return gm.forward + mod, params = relay.frontend.from_pytorch(jit_mod, shape_list) + if device.type == "cuda": + dev = tvm.cuda(device.index) + target = tvm.target.cuda() + else: + dev = tvm.cpu(0) + target = tvm.target.Target(llvm_target()) + + if scheduler is None: + scheduler = os.environ.get("TVM_SCHEDULER", None) + + if scheduler == "auto_scheduler": + from tvm import auto_scheduler + + log_file = tempfile.NamedTemporaryFile() + + if not os.path.exists(log_file): + tasks, task_weights = 
auto_scheduler.extract_tasks( + mod["main"], params, target + ) + for task in tasks: + print(task.compute_dag) + else: + print("No tasks") + if len(tasks) != 0: + tuner = auto_scheduler.TaskScheduler(tasks, task_weights) + if not os.path.exists(log_file): + assert trials > 0 + tune_option = auto_scheduler.TuningOptions( + num_measure_trials=trials, + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], + early_stopping=2000, + ) + try: + tuner.tune(tune_option) + except Exception: + if os.path.exists(log_file): + os.unlink(log_file) + raise + + with auto_scheduler.ApplyHistoryBest(log_file): + with tvm.transform.PassContext( + opt_level=3, config={"relay.backend.use_auto_scheduler": True} + ): + lib = relay.build(mod, target=target, params=params) + elif scheduler == "meta_schedule": + from tvm import meta_schedule as ms + + with tempfile.TemporaryDirectory() as work_dir: + if device.type != "cuda": + # meta_schedule needs num-cores to be specified + # here we use the maximum core count + target = tvm.target.Target( + f"{llvm_target()} --num-cores {ms.utils.cpu_count(logical=False)}" + ) + # TODO(shingjan): This could be replaced by tvm.contrib.torch.optimize_torch + # once USE_PT_TVMDSOOP is updated and turned on by default in TVM. + database = ms.relay_integration.tune_relay( + mod=mod, + target=target, + work_dir=work_dir, + max_trials_global=20000, + num_trials_per_iter=64, + params=params, + strategy="evolutionary", + ) + lib = ms.relay_integration.compile_relay( + database=database, + mod=mod, + target=target, + params=params, + ) + elif scheduler == "default" or not scheduler: + # no autotuning + with tvm.transform.PassContext(opt_level=10): + lib = relay.build(mod, target=target, params=params) + else: + raise NotImplementedError( + "This tuning option is invalid/not implemented for torchdynamo's TVM-related backend. " + "There are three available options: default, auto_scheduler and meta_schedule." + ) + m = graph_executor.GraphModule(lib["default"](dev)) + + def to_torch_tensor(nd_tensor): + """A helper function to transfer a NDArray to torch.tensor.""" + if nd_tensor.dtype == "bool": + # DLPack does not support boolean so it can't be handled by + # torch.utils.dlpack.from_pack. Workaround by going through + # numpy, although this brings additional data copy overhead. 
+ return torch.from_numpy(nd_tensor.numpy()) + return torch.utils.dlpack.from_dlpack(nd_tensor.to_dlpack()) + + def to_tvm_tensor(torch_tensor): + """A helper function to transfer a torch.tensor to NDArray.""" + if torch_tensor.dtype == torch.bool: + # same reason as above, fallback to numpy conversion which + # could introduce data copy overhead + return tvm.nd.array(torch_tensor.cpu().numpy()) + return tvm.nd.from_dlpack(torch_tensor) + + def exec_tvm(*i_args): + args = [a.contiguous() for a in i_args] + shape_info, _ = m.get_input_info() + active_inputs = {name for name, _ in shape_info.items()} + for idx, arg in enumerate(args, 0): + if arg.dim() != 0: + if arg.requires_grad: + arg = arg.detach() + inp_name = f"inp_{idx}" + if inp_name not in active_inputs: + log.warning( + "input %s skipped as not found in tvm's runtime library", + inp_name, + ) + continue + m.set_input( + inp_name, + to_tvm_tensor(arg), + ) + m.run() + return [to_torch_tensor(m.get_output(i)) for i in range(m.get_num_outputs())] + + return exec_tvm + + +tvm_meta_schedule = functools.partial(tvm, scheduler="meta_schedule") +tvm_auto_scheduler = functools.partial(tvm, scheduler="auto_scheduler") + + +def has_tvm(): + try: + importlib.import_module("tvm") + return True + except ImportError: + return False + + +@functools.lru_cache(None) +def llvm_target(): + if "avx512" in open("/proc/cpuinfo").read(): + return "llvm -mcpu=skylake-avx512" + return "llvm -mcpu=core-avx2" diff --git a/MLPY/Lib/site-packages/torch/_dynamo/bytecode_analysis.py b/MLPY/Lib/site-packages/torch/_dynamo/bytecode_analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..5332ed5b7ec8e77cf449652c4c319ac644454572 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/bytecode_analysis.py @@ -0,0 +1,250 @@ +import bisect +import dataclasses +import dis +import sys +from typing import Any, Set, Union + +TERMINAL_OPCODES = { + dis.opmap["RETURN_VALUE"], + dis.opmap["JUMP_FORWARD"], + dis.opmap["RAISE_VARARGS"], + # TODO(jansel): double check exception handling +} +if sys.version_info >= (3, 9): + TERMINAL_OPCODES.add(dis.opmap["RERAISE"]) +if sys.version_info >= (3, 11): + TERMINAL_OPCODES.add(dis.opmap["JUMP_BACKWARD"]) + TERMINAL_OPCODES.add(dis.opmap["JUMP_FORWARD"]) +else: + TERMINAL_OPCODES.add(dis.opmap["JUMP_ABSOLUTE"]) +JUMP_OPCODES = set(dis.hasjrel + dis.hasjabs) +JUMP_OPNAMES = {dis.opname[opcode] for opcode in JUMP_OPCODES} +HASLOCAL = set(dis.haslocal) +HASFREE = set(dis.hasfree) + +stack_effect = dis.stack_effect + + +def get_indexof(insts): + """ + Get a mapping from instruction memory address to index in instruction list. + Additionally checks that each instruction only appears once in the list. + """ + indexof = {} + for i, inst in enumerate(insts): + assert inst not in indexof + indexof[inst] = i + return indexof + + +def remove_dead_code(instructions): + """Dead code elimination""" + indexof = get_indexof(instructions) + live_code = set() + + def find_live_code(start): + for i in range(start, len(instructions)): + if i in live_code: + return + live_code.add(i) + inst = instructions[i] + if inst.exn_tab_entry: + find_live_code(indexof[inst.exn_tab_entry.target]) + if inst.opcode in JUMP_OPCODES: + find_live_code(indexof[inst.target]) + if inst.opcode in TERMINAL_OPCODES: + return + + find_live_code(0) + + # change exception table entries if start/end instructions are dead + # assumes that exception table entries have been propagated, + # e.g. 
with bytecode_transformation.propagate_inst_exn_table_entries, + # and that instructions with an exn_tab_entry lies within its start/end. + if sys.version_info >= (3, 11): + live_idx = sorted(live_code) + for i, inst in enumerate(instructions): + if i in live_code and inst.exn_tab_entry: + # find leftmost live instruction >= start + start_idx = bisect.bisect_left( + live_idx, indexof[inst.exn_tab_entry.start] + ) + assert start_idx < len(live_idx) + # find rightmost live instruction <= end + end_idx = ( + bisect.bisect_right(live_idx, indexof[inst.exn_tab_entry.end]) - 1 + ) + assert end_idx >= 0 + assert live_idx[start_idx] <= i <= live_idx[end_idx] + inst.exn_tab_entry.start = instructions[live_idx[start_idx]] + inst.exn_tab_entry.end = instructions[live_idx[end_idx]] + + return [inst for i, inst in enumerate(instructions) if i in live_code] + + +def remove_pointless_jumps(instructions): + """Eliminate jumps to the next instruction""" + pointless_jumps = { + id(a) + for a, b in zip(instructions, instructions[1:]) + if a.opname == "JUMP_ABSOLUTE" and a.target is b + } + return [inst for inst in instructions if id(inst) not in pointless_jumps] + + +def propagate_line_nums(instructions): + """Ensure every instruction has line number set in case some are removed""" + cur_line_no = None + + def populate_line_num(inst): + nonlocal cur_line_no + if inst.starts_line: + cur_line_no = inst.starts_line + + inst.starts_line = cur_line_no + + for inst in instructions: + populate_line_num(inst) + + +def remove_extra_line_nums(instructions): + """Remove extra starts line properties before packing bytecode""" + + cur_line_no = None + + def remove_line_num(inst): + nonlocal cur_line_no + if inst.starts_line is None: + return + elif inst.starts_line == cur_line_no: + inst.starts_line = None + else: + cur_line_no = inst.starts_line + + for inst in instructions: + remove_line_num(inst) + + +@dataclasses.dataclass +class ReadsWrites: + reads: Set[Any] + writes: Set[Any] + visited: Set[Any] + + +def livevars_analysis(instructions, instruction): + indexof = get_indexof(instructions) + must = ReadsWrites(set(), set(), set()) + may = ReadsWrites(set(), set(), set()) + + def walk(state, start): + if start in state.visited: + return + state.visited.add(start) + + for i in range(start, len(instructions)): + inst = instructions[i] + if inst.opcode in HASLOCAL or inst.opcode in HASFREE: + if "LOAD" in inst.opname or "DELETE" in inst.opname: + if inst.argval not in must.writes: + state.reads.add(inst.argval) + elif "STORE" in inst.opname: + state.writes.add(inst.argval) + elif inst.opname == "MAKE_CELL": + pass + else: + raise NotImplementedError(f"unhandled {inst.opname}") + if inst.exn_tab_entry: + walk(may, indexof[inst.exn_tab_entry.target]) + if inst.opcode in JUMP_OPCODES: + walk(may, indexof[inst.target]) + state = may + if inst.opcode in TERMINAL_OPCODES: + return + + walk(must, indexof[instruction]) + return must.reads | may.reads + + +@dataclasses.dataclass +class FixedPointBox: + value: bool = True + + +@dataclasses.dataclass +class StackSize: + low: Union[int, float] + high: Union[int, float] + fixed_point: FixedPointBox + + def zero(self): + self.low = 0 + self.high = 0 + self.fixed_point.value = False + + def offset_of(self, other, n): + prior = (self.low, self.high) + self.low = min(self.low, other.low + n) + self.high = max(self.high, other.high + n) + if (self.low, self.high) != prior: + self.fixed_point.value = False + + def exn_tab_jump(self, depth): + prior = (self.low, self.high) + self.low = 
min(self.low, depth) + self.high = max(self.high, depth) + if (self.low, self.high) != prior: + self.fixed_point.value = False + + +def stacksize_analysis(instructions) -> Union[int, float]: + assert instructions + fixed_point = FixedPointBox() + stack_sizes = { + inst: StackSize(float("inf"), float("-inf"), fixed_point) + for inst in instructions + } + stack_sizes[instructions[0]].zero() + + for _ in range(100): + if fixed_point.value: + break + fixed_point.value = True + + for inst, next_inst in zip(instructions, instructions[1:] + [None]): + stack_size = stack_sizes[inst] + # CALL_FINALLY in Python 3.8 is handled differently when determining stack depth. + # See https://github.com/python/cpython/blob/3.8/Python/compile.c#L5450. + # Essentially, the stack effect of CALL_FINALLY is computed with jump=True, + # but the resulting stack depth is propagated to the next instruction, not the + # jump target. + is_call_finally = ( + sys.version_info < (3, 9) and inst.opcode == dis.opmap["CALL_FINALLY"] + ) + if inst.opcode not in TERMINAL_OPCODES: + assert next_inst is not None, f"missing next inst: {inst}" + stack_sizes[next_inst].offset_of( + stack_size, + stack_effect(inst.opcode, inst.arg, jump=is_call_finally), + ) + if inst.opcode in JUMP_OPCODES and not is_call_finally: + stack_sizes[inst.target].offset_of( + stack_size, stack_effect(inst.opcode, inst.arg, jump=True) + ) + if inst.exn_tab_entry: + # see https://github.com/python/cpython/blob/3.11/Objects/exception_handling_notes.txt + # on why depth is computed this way. + depth = inst.exn_tab_entry.depth + int(inst.exn_tab_entry.lasti) + 1 + stack_sizes[inst.exn_tab_entry.target].exn_tab_jump(depth) + + if False: + for inst in instructions: + stack_size = stack_sizes[inst] + print(stack_size.low, stack_size.high, inst) + + low = min([x.low for x in stack_sizes.values()]) + high = max([x.high for x in stack_sizes.values()]) + + assert fixed_point.value, "failed to reach fixed point" + assert low >= 0 + return high diff --git a/MLPY/Lib/site-packages/torch/_dynamo/bytecode_transformation.py b/MLPY/Lib/site-packages/torch/_dynamo/bytecode_transformation.py new file mode 100644 index 0000000000000000000000000000000000000000..767c11e2ab632c8885b3bcec5b8a84fdcbf749db --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/bytecode_transformation.py @@ -0,0 +1,1114 @@ +import copy +import dataclasses +import dis +import itertools +import sys +import types +from typing import Any, Callable, cast, Dict, Iterator, List, Optional, Tuple + +from .bytecode_analysis import ( + get_indexof, + propagate_line_nums, + remove_extra_line_nums, + stacksize_analysis, +) + + +@dataclasses.dataclass +class InstructionExnTabEntry: + start: "Instruction" + end: "Instruction" + target: "Instruction" + depth: int + lasti: bool + + def __repr__(self) -> str: + return ( + f"InstructionExnTabEntry(start={self.start.short_inst_repr()}, " + f"end={self.end.short_inst_repr()}, " + f"target={self.target.short_inst_repr()}, " + f"depth={self.depth}, lasti={self.lasti})" + ) + + def __eq__(self, o) -> bool: + return ( + self.start is o.start + and self.end is o.end + and self.target is o.target + and self.depth == o.depth + and self.lasti == o.lasti + ) + + +@dataclasses.dataclass +class Instruction: + """A mutable version of dis.Instruction""" + + opcode: int + opname: str + arg: Optional[int] + argval: Any + offset: Optional[int] = None + starts_line: Optional[int] = None + is_jump_target: bool = False + positions: Optional["dis.Positions"] = None + # extra fields to 
make modification easier: + target: Optional["Instruction"] = None + exn_tab_entry: Optional[InstructionExnTabEntry] = None + + def __hash__(self) -> int: + return id(self) + + def __eq__(self, other) -> bool: + return id(self) == id(other) + + def short_inst_repr(self) -> str: + return f"Instruction(opname={self.opname}, offset={self.offset})" + + +def convert_instruction(i: dis.Instruction) -> Instruction: + return Instruction( + i.opcode, + i.opname, + i.arg, + i.argval, + i.offset, + i.starts_line, + i.is_jump_target, + getattr(i, "positions", None), + ) + + +class _NotProvided: + def __repr__(self) -> str: + return "_NotProvided" + + +def create_instruction( + name, *, arg=None, argval=_NotProvided, target=None +) -> Instruction: + """ + At most one of `arg`, `argval`, and `target` can be not None/_NotProvided. + This is to prevent ambiguity, e.g. does + create_instruction("LOAD_CONST", 5) + mean load the constant at co_consts[5], or load the constant 5? + + If `arg` is not provided, it will be computed during assembly from + `argval` or `target`. + + Do not use for LOAD_GLOBAL - use create_load_global instead. + """ + assert name != "LOAD_GLOBAL" + cnt = (arg is not None) + (argval is not _NotProvided) + (target is not None) + if cnt > 1: + raise RuntimeError( + "only one of arg, argval, and target can be not None/_NotProvided" + ) + if arg is not None and not isinstance(arg, int): + raise RuntimeError("instruction arg must be int or None") + return Instruction( + opcode=dis.opmap[name], opname=name, arg=arg, argval=argval, target=target + ) + + +# Python 3.11 remaps +def create_jump_absolute(target) -> Instruction: + inst = "JUMP_FORWARD" if sys.version_info >= (3, 11) else "JUMP_ABSOLUTE" + return create_instruction(inst, target=target) + + +def create_load_global(name, push_null) -> Instruction: + """ + `name` is the name of the global to be loaded. + `push_null` specifies whether or not a NULL should be pushed to the stack + before the global (Python 3.11+ only). + + Python 3.11 changed the LOAD_GLOBAL instruction in that the first bit of + the instruction arg specifies whether a NULL should be pushed to the stack + before the global. The remaining bits of the instruction arg contain the + name index. See `create_call_function` for why this NULL is needed. + + The instruction's `arg` is actually computed when assembling the bytecode. + For Python 3.11, push_null information is propagated through the arg. + + NOTE: we don't use create_instruction since LOAD_GLOBAL is the only instruction + where both arg and argval need to be specified. + """ + return Instruction( + opcode=dis.opmap["LOAD_GLOBAL"], + opname="LOAD_GLOBAL", + arg=push_null, + argval=name, + ) + + +def create_dup_top() -> Instruction: + if sys.version_info >= (3, 11): + return create_instruction("COPY", arg=1) + return create_instruction("DUP_TOP") + + +def create_rot_n(n) -> List[Instruction]: + """ + Returns a "simple" sequence of instructions that rotates TOS to the n-th + position in the stack. For Python < 3.11, returns a single ROT_* + instruction. If no such instruction exists, an error is raised and the + caller is expected to generate an equivalent sequence of instructions. + For Python >= 3.11, any rotation can be expressed as a simple sequence of + swaps. + """ + if n <= 1: + # don't rotate + return [] + + if sys.version_info >= (3, 11): + # rotate can be expressed as a sequence of swap operations + # e.g. 
rotate 3 is equivalent to swap 3, swap 2 + return [create_instruction("SWAP", arg=i) for i in range(n, 1, -1)] + + # ensure desired rotate function exists + if sys.version_info < (3, 8) and n >= 4: + raise AttributeError(f"rotate {n} not supported for Python < 3.8") + if sys.version_info < (3, 10) and n >= 5: + raise AttributeError(f"rotate {n} not supported for Python < 3.10") + + if n <= 4: + return [create_instruction("ROT_" + ["TWO", "THREE", "FOUR"][n - 2])] + return [create_instruction("ROT_N", arg=n)] + + +def create_call_function(nargs, push_null) -> List[Instruction]: + """ + Creates a sequence of instructions that makes a function call. + + `push_null` is used in Python 3.11+ only. It is used in codegen when + a function call is intended to be made with the NULL + fn convention, + and we know that the NULL has not been pushed yet. We will push a + NULL and rotate it to the correct position immediately before making + the function call. + push_null should default to True unless you know you are calling a function + that you codegen'd with a null already pushed, for example + (assume `math` is available in the global scope), + + create_load_global("math", True) # pushes a null + create_instruction("LOAD_ATTR", argval="sqrt") + create_instruction("LOAD_CONST", argval=25) + create_call_function(1, False) + """ + if sys.version_info >= (3, 11): + output = [] + if push_null: + output.append(create_instruction("PUSH_NULL")) + output.extend(create_rot_n(nargs + 2)) + output.append(create_instruction("PRECALL", arg=nargs)) + output.append(create_instruction("CALL", arg=nargs)) + return output + return [create_instruction("CALL_FUNCTION", arg=nargs)] + + +def create_call_method(nargs) -> List[Instruction]: + if sys.version_info >= (3, 11): + return [ + create_instruction("PRECALL", arg=nargs), + create_instruction("CALL", arg=nargs), + ] + return [create_instruction("CALL_METHOD", arg=nargs)] + + +def lnotab_writer( + lineno: int, byteno: int = 0 +) -> Tuple[List[int], Callable[[int, int], None]]: + """ + Used to create typing.CodeType.co_lnotab + See https://github.com/python/cpython/blob/main/Objects/lnotab_notes.txt + This is the internal format of the line number table if Python < 3.10 + """ + assert sys.version_info < (3, 10) + lnotab: List[int] = [] + + def update(lineno_new, byteno_new): + nonlocal byteno, lineno + while byteno_new != byteno or lineno_new != lineno: + byte_offset = max(0, min(byteno_new - byteno, 255)) + line_offset = max(-128, min(lineno_new - lineno, 127)) + assert byte_offset != 0 or line_offset != 0 + byteno += byte_offset + lineno += line_offset + lnotab.extend((byte_offset, line_offset & 0xFF)) + + return lnotab, update + + +def linetable_310_writer(first_lineno): + """ + Used to create typing.CodeType.co_linetable + See https://github.com/python/cpython/blob/main/Objects/lnotab_notes.txt + This is the internal format of the line number table for Python 3.10 + """ + assert sys.version_info >= (3, 10) and sys.version_info < (3, 11) + linetable: List[int] = [] + lineno = first_lineno + lineno_delta = 0 + byteno = 0 + + def _update(byteno_delta, lineno_delta): + while byteno_delta != 0 or lineno_delta != 0: + byte_offset = max(0, min(byteno_delta, 254)) + line_offset = max(-127, min(lineno_delta, 127)) + assert byte_offset != 0 or line_offset != 0 + byteno_delta -= byte_offset + lineno_delta -= line_offset + linetable.extend((byte_offset, line_offset & 0xFF)) + + def update(lineno_new, byteno_new): + nonlocal lineno, lineno_delta, byteno + byteno_delta = 
byteno_new - byteno + byteno = byteno_new + _update(byteno_delta, lineno_delta) + lineno_delta = lineno_new - lineno + lineno = lineno_new + + def end(total_bytes): + _update(total_bytes - byteno, lineno_delta) + + return linetable, update, end + + +def encode_varint(n: int) -> List[int]: + """ + 6-bit chunk encoding of an unsigned integer + See https://github.com/python/cpython/blob/3.11/Objects/locations.md + """ + assert n >= 0 + b = [n & 63] + n >>= 6 + while n > 0: + b[-1] |= 64 + b.append(n & 63) + n >>= 6 + return b + + +def linetable_311_writer(first_lineno: int): + """ + Used to create typing.CodeType.co_linetable + See https://github.com/python/cpython/blob/3.11/Objects/locations.md + This is the internal format of the line number table for Python 3.11 + """ + assert sys.version_info >= (3, 11) + linetable = [] + lineno = first_lineno + + def update(positions: "dis.Positions", inst_size): + nonlocal lineno + lineno_new = positions.lineno if positions else None + + def _update(delta, size): + assert 0 < size <= 8 + # first byte - use 13 (no column info) is positions is + # malformed, otherwise use 14 (long form) + other_varints: Tuple[int, ...] = () + if ( + positions + and positions.lineno is not None + and positions.end_lineno is not None + and positions.col_offset is not None + and positions.end_col_offset is not None + ): + linetable.append(0b1_1110_000 + size - 1) + # for whatever reason, column offset needs `+ 1` + # https://github.com/python/cpython/blob/1931c2a438c50e6250725c84dff94fc760b9b951/Python/compile.c#L7603 + other_varints = ( + positions.end_lineno - positions.lineno, + positions.col_offset + 1, + positions.end_col_offset + 1, + ) + else: + linetable.append(0b1_1101_000 + size - 1) + # encode signed int + if delta < 0: + delta = ((-delta) << 1) | 1 + else: + delta <<= 1 + # encode unsigned int + linetable.extend(encode_varint(delta)) + for n in other_varints: + linetable.extend(encode_varint(n)) + + if lineno_new is None: + lineno_delta = 0 + else: + lineno_delta = lineno_new - lineno + lineno = lineno_new + while inst_size > 8: + _update(lineno_delta, 8) + inst_size -= 8 + _update(lineno_delta, inst_size) + + return linetable, update + + +@dataclasses.dataclass +class ExceptionTableEntry: + start: int + end: int + target: int + depth: int + lasti: bool + + +def encode_exception_table_varint(n: int) -> List[int]: + """ + Similar to `encode_varint`, but the 6-bit chunks are ordered in reverse. + """ + assert n >= 0 + b = [n & 63] + n >>= 6 + while n > 0: + b.append(n & 63) + n >>= 6 + b.reverse() + for i in range(len(b) - 1): + b[i] |= 64 + return b + + +def decode_exception_table_varint(bytes_iter: Iterator[int]) -> int: + """ + Inverse of `encode_exception_table_varint`. + """ + b = next(bytes_iter) + val = b & 63 + while b & 64: + val <<= 6 + b = next(bytes_iter) + val |= b & 63 + return val + + +def check_exception_table(tab: List[ExceptionTableEntry]) -> None: + """ + Verifies that a list of ExceptionTableEntries will make a well-formed + jump table: entries are non-empty, sorted, and do not overlap. 
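+
+    Illustrative example: entries covering byte ranges (start=0, end=10) and
+    (start=12, end=20) are accepted, while (0, 10) followed by (8, 20)
+    overlap and fail the assertion below.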
+ """ + for i in range(len(tab) - 1): + assert ( + tab[i].start <= tab[i].end + and tab[i].end < tab[i + 1].start + and tab[i + 1].start <= tab[i + 1].end + ) + + +def parse_exception_table(exntab: bytes) -> List[ExceptionTableEntry]: + """ + Parse the exception table according to + https://github.com/python/cpython/blob/3.11/Objects/exception_handling_notes.txt + """ + exntab_iter = iter(exntab) + tab = [] + try: + while True: + start = decode_exception_table_varint(exntab_iter) * 2 + length = decode_exception_table_varint(exntab_iter) * 2 + end = start + length - 2 + target = decode_exception_table_varint(exntab_iter) * 2 + dl = decode_exception_table_varint(exntab_iter) + depth = dl >> 1 + lasti = bool(dl & 1) + tab.append(ExceptionTableEntry(start, end, target, depth, lasti)) + except StopIteration: + check_exception_table(tab) + return tab + + +def assemble_exception_table(tab: List[ExceptionTableEntry]) -> bytes: + """ + Inverse of parse_exception_table - encodes list of exception + table entries into bytes. + """ + b = [] + for entry in tab: + first_entry = encode_exception_table_varint(entry.start // 2) + first_entry[0] |= 1 << 7 + b.extend(first_entry) + length = entry.end - entry.start + 2 + b.extend(encode_exception_table_varint(length // 2)) + b.extend(encode_exception_table_varint(entry.target // 2)) + dl = (entry.depth << 1) + entry.lasti + b.extend(encode_exception_table_varint(dl)) + return bytes(b) + + +def assemble(instructions: List[Instruction], firstlineno: int) -> Tuple[bytes, bytes]: + """Do the opposite of dis.get_instructions()""" + code: List[int] = [] + if sys.version_info >= (3, 11): + lnotab, update_lineno = linetable_311_writer(firstlineno) + num_ext = 0 + for i, inst in enumerate(instructions): + if inst.opname == "EXTENDED_ARG": + inst_size = 1 + num_ext += 1 + # copy positions from the actual instruction + for j in (1, 2, 3): + if instructions[i + j].opname != "EXTENDED_ARG": + inst.positions = instructions[i + j].positions + break + else: + inst_size = instruction_size(inst) // 2 + num_ext + num_ext = 0 + update_lineno(inst.positions, inst_size) + num_ext = 0 + arg = inst.arg or 0 + code.extend((inst.opcode, arg & 0xFF)) + for _ in range(instruction_size(inst) // 2 - 1): + code.extend((0, 0)) + else: + if sys.version_info < (3, 10): + lnotab, update_lineno = lnotab_writer(firstlineno) + else: + lnotab, update_lineno, end = linetable_310_writer(firstlineno) + + for inst in instructions: + if inst.starts_line is not None: + update_lineno(inst.starts_line, len(code)) + arg = inst.arg or 0 + code.extend((inst.opcode, arg & 0xFF)) + + if sys.version_info >= (3, 10): + end(len(code)) + + return bytes(code), bytes(lnotab) + + +def _get_instruction_by_offset(offset_to_inst: Dict[int, Instruction], offset: int): + """ + Get the instruction located at a given offset, accounting for EXTENDED_ARGs + """ + for n in (0, 2, 4, 6): + if offset_to_inst[offset + n].opcode != dis.EXTENDED_ARG: + return offset_to_inst[offset + n] + return None + + +def virtualize_jumps(instructions) -> None: + """Replace jump targets with pointers to make editing easier""" + jump_targets = {inst.offset: inst for inst in instructions} + + for inst in instructions: + if inst.opcode in dis.hasjabs or inst.opcode in dis.hasjrel: + inst.target = _get_instruction_by_offset(jump_targets, inst.argval) + + +_REL_JUMPS = set(dis.hasjrel) + + +def flip_jump_direction(instruction: Instruction) -> None: + if sys.version_info < (3, 11): + raise RuntimeError("Cannot flip jump direction in Python < 3.11") + 
if "FORWARD" in instruction.opname: + instruction.opname = instruction.opname.replace("FORWARD", "BACKWARD") + elif "BACKWARD" in instruction.opname: + instruction.opname = instruction.opname.replace("BACKWARD", "FORWARD") + else: + raise AttributeError("Instruction is not a forward or backward jump") + instruction.opcode = dis.opmap[instruction.opname] + assert instruction.opcode in _REL_JUMPS + + +def _get_instruction_front(instructions: List[Instruction], idx: int): + """ + i.e. get the first EXTENDED_ARG instruction (if any) when targeting + instructions[idx] with a jump. + """ + target = instructions[idx] + for offset in (1, 2, 3): + if idx >= offset and instructions[idx - offset].opcode == dis.EXTENDED_ARG: + target = instructions[idx - offset] + else: + break + return target + + +def devirtualize_jumps(instructions): + """Fill in args for virtualized jump target after instructions may have moved""" + indexof = get_indexof(instructions) + jumps = set(dis.hasjabs).union(set(dis.hasjrel)) + + for inst in instructions: + if inst.opcode in jumps: + target = _get_instruction_front(instructions, indexof[inst.target]) + if inst.opcode in dis.hasjabs: + if sys.version_info < (3, 10): + inst.arg = target.offset + elif sys.version_info < (3, 11): + # `arg` is expected to be bytecode offset, whereas `offset` is byte offset. + # Divide since bytecode is 2 bytes large. + inst.arg = int(target.offset / 2) + else: + raise RuntimeError("Python 3.11+ should not have absolute jumps") + else: # relative jump + # byte offset between target and next instruction + inst.arg = int(target.offset - inst.offset - instruction_size(inst)) + if inst.arg < 0: + if sys.version_info < (3, 11): + raise RuntimeError("Got negative jump offset for Python < 3.11") + inst.arg = -inst.arg + # forward jumps become backward + if "FORWARD" in inst.opname: + flip_jump_direction(inst) + elif inst.arg > 0: + # backward jumps become forward + if sys.version_info >= (3, 11) and "BACKWARD" in inst.opname: + flip_jump_direction(inst) + if sys.version_info >= (3, 10): + # see bytecode size comment in the absolute jump case above + inst.arg //= 2 + inst.argval = target.offset + inst.argrepr = f"to {target.offset}" + + +def virtualize_exception_table(exn_tab_bytes: bytes, instructions: List[Instruction]): + """Replace exception table entries with pointers to make editing easier""" + exn_tab = parse_exception_table(exn_tab_bytes) + offset_to_inst = {cast(int, inst.offset): inst for inst in instructions} + offsets = sorted(offset_to_inst.keys()) + end_offset_idx = 0 + exn_tab_iter = iter(exn_tab) + try: + + def step(): + nonlocal end_offset_idx + entry = next(exn_tab_iter) + # find rightmost offset <= entry.end, since entry.end may not be + # an actual instruction, e.g. if the end instruction is LOAD_GLOBAL, + # which takes more than 2 bytes, then entry.end points to the end + # of the LOAD_GLOBAL instruction, not the beginning. 
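+            # Illustrative example, assuming the 3.11 inline-cache sizes recorded
+            # in _PYOPCODE_CACHES below: a LOAD_GLOBAL starting at offset 10 spans
+            # 12 bytes, so entry.end would be 20, and the loop below maps 20 back
+            # to the recorded start offset 10.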
+ while ( + end_offset_idx < len(offsets) and offsets[end_offset_idx] <= entry.end + ): + end_offset_idx += 1 + assert end_offset_idx > 0 + end_offset = offsets[end_offset_idx - 1] + inst_entry = InstructionExnTabEntry( + _get_instruction_by_offset(offset_to_inst, entry.start), + _get_instruction_by_offset(offset_to_inst, end_offset), + _get_instruction_by_offset(offset_to_inst, entry.target), + entry.depth, + entry.lasti, + ) + return entry, inst_entry + + entry, inst_entry = step() + for inst in instructions: + while inst.offset > entry.end: + entry, inst_entry = step() + if inst.offset >= entry.start: + inst.exn_tab_entry = copy.copy(inst_entry) + except StopIteration: + pass + + +def compute_exception_table( + instructions: List[Instruction], +) -> List[ExceptionTableEntry]: + """Compute exception table in list format from instructions with exn_tab_entries""" + exn_dict: Dict[Tuple[int, int], Tuple[int, int, bool]] = {} + indexof = get_indexof(instructions) + + for inst in instructions: + if inst.exn_tab_entry: + # account for prefixed EXTENDED_ARGS + start = _get_instruction_front( + instructions, indexof[inst.exn_tab_entry.start] + ).offset + # point to the last 2 bytes of the end instruction + end = ( + cast(int, inst.exn_tab_entry.end.offset) + + instruction_size(inst.exn_tab_entry.end) + - 2 + ) + target = _get_instruction_front( + instructions, indexof[inst.exn_tab_entry.target] + ).offset + key = (start, end) + val = (target, inst.exn_tab_entry.depth, inst.exn_tab_entry.lasti) + if key in exn_dict: + assert exn_dict[key] == val + exn_dict[key] = val + + # Dynamo may construct nested exception table entries for convenience, + # but Python expects exception table entries to not overlap. + # NOTE: below, "keys" refer to old instruction entries' starts and ends, + # and "entries" refer to the generated exception table entries. + + # Sort keys by increasing start, then decreasing end + keys_sorted = sorted(exn_dict.keys(), key=lambda t: (t[0], -t[1])) + # smallest byte that the next exception table entry can start at + nexti = 0 + # stack of current nested keys + key_stack: List[Tuple[int, int]] = [] + exn_tab: List[ExceptionTableEntry] = [] + + def pop(): + """ + Pop the key_stack and append an exception table entry if possible. + """ + nonlocal nexti + if key_stack: + key = key_stack.pop() + if nexti <= key[1]: + exn_tab.append( + ExceptionTableEntry(max(key[0], nexti), key[1], *exn_dict[key]) + ) + nexti = key[1] + 2 + + for key in keys_sorted: + # pop keys that are no longer nested over the current key + while key_stack and key_stack[-1][1] < key[0]: + pop() + if key_stack: + # create an entry covering to the current key, if possible + assert key_stack[-1][0] <= key[0] <= key[1] <= key_stack[-1][1] + left = max(nexti, key_stack[-1][0]) + if left < key[0]: + exn_tab.append( + ExceptionTableEntry(left, key[0] - 2, *exn_dict[key_stack[-1]]) + ) + nexti = key[0] + key_stack.append(key) + while key_stack: + pop() + check_exception_table(exn_tab) + return exn_tab + + +def check_inst_exn_tab_entries_nested( + tab: List[InstructionExnTabEntry], indexof +) -> None: + """ + Checks `tab` is a properly sorted list of nested InstructionExnTabEntry's, + i.e. no entries partially overlap. + "Properly sorted" means entries are sorted by increasing starts, then + decreasing ends. 
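+
+    Illustrative example, using (start_index, end_index) pairs: the sequence
+    (0, 10), (0, 6), (2, 4) is properly nested, whereas (0, 6), (4, 10)
+    partially overlap and would trip the assertion.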
+ """ + entry_stack: List[Tuple[int, int]] = [] + for entry in tab: + key = (indexof[entry.start], indexof[entry.end]) + while entry_stack and entry_stack[-1][1] < key[0]: + entry_stack.pop() + if entry_stack: + assert entry_stack[-1][0] <= key[0] <= key[1] <= entry_stack[-1][1] + entry_stack.append(key) + + +def propagate_inst_exn_table_entries(instructions: List[Instruction]) -> None: + """ + Copies exception table entries to all instructions in an entry's range. + Supports nested exception table entries. + """ + indexof = get_indexof(instructions) + entries: Dict[Tuple[int, int], InstructionExnTabEntry] = {} + for inst in instructions: + if inst.exn_tab_entry: + key = ( + indexof[inst.exn_tab_entry.start], + indexof[inst.exn_tab_entry.end], + ) + if key in entries: + assert inst.exn_tab_entry == entries[key] + entries[key] = inst.exn_tab_entry + sorted_entries = [ + entries[key] for key in sorted(entries.keys(), key=lambda t: (t[0], -t[1])) + ] + check_inst_exn_tab_entries_nested(sorted_entries, indexof) + # Propagation of nested entries works since nested entries come later + # in sorted order. + for entry in sorted_entries: + for i in range(indexof[entry.start], indexof[entry.end] + 1): + instructions[i].exn_tab_entry = copy.copy(entry) + + +def check_inst_exn_tab_entries_valid(instructions: List[Instruction]): + """ + Checks that exn_tab_entries of instructions are valid. + An entry's start, end, and target must be in instructions. + Instructions with an exn_tab_entry are located within + the entry's start and end instructions. + Instructions do not share exn_tab_entries. + + Implicitly checks for no duplicate instructions. + """ + indexof = get_indexof(instructions) + exn_tab_entry_set = set() + for i, inst in enumerate(instructions): + if inst.exn_tab_entry: + assert sys.version_info >= (3, 11) + assert id(inst.exn_tab_entry) not in exn_tab_entry_set + exn_tab_entry_set.add(id(inst.exn_tab_entry)) + entry = inst.exn_tab_entry + assert entry.start in indexof + assert entry.end in indexof + assert entry.target in indexof + assert indexof[entry.start] <= i <= indexof[entry.end] + + +def strip_extended_args(instructions: List[Instruction]) -> None: + instructions[:] = [i for i in instructions if i.opcode != dis.EXTENDED_ARG] + + +def remove_load_call_method(instructions: List[Instruction]) -> List[Instruction]: + """LOAD_METHOD puts a NULL on the stack which causes issues, so remove it""" + rewrites = {"LOAD_METHOD": "LOAD_ATTR", "CALL_METHOD": "CALL_FUNCTION"} + for inst in instructions: + if inst.opname in rewrites: + inst.opname = rewrites[inst.opname] + inst.opcode = dis.opmap[inst.opname] + return instructions + + +def remove_jump_if_none(instructions: List[Instruction]) -> None: + new_insts = [] + for inst in instructions: + new_insts.append(inst) + if "_NONE" in inst.opname: + is_op = create_instruction("IS_OP", arg=int("NOT" in inst.opname)) + is_op.argval = is_op.arg + jump_op = create_instruction( + "POP_JUMP_FORWARD_IF_TRUE" + if "FORWARD" in inst.opname + else "POP_JUMP_BACKWARD_IF_TRUE", + target=inst.target, + ) + # modify inst in-place to preserve jump target + inst.opcode = dis.opmap["LOAD_CONST"] + inst.opname = "LOAD_CONST" + inst.arg = None + inst.argval = None + new_insts.extend([is_op, jump_op]) + instructions[:] = new_insts + + +def explicit_super(code: types.CodeType, instructions: List[Instruction]) -> None: + """convert super() with no args into explicit arg form""" + cell_and_free = (code.co_cellvars or tuple()) + (code.co_freevars or tuple()) + if not 
len(code.co_varnames): + # A function with no argument cannot contain a valid "super()" call + return + output = [] + for idx, inst in enumerate(instructions): + output.append(inst) + if inst.opname == "LOAD_GLOBAL" and inst.argval == "super": + nexti = instructions[idx + 1] + if nexti.opname in ("CALL_FUNCTION", "PRECALL") and nexti.arg == 0: + assert "__class__" in cell_and_free + output.append(create_instruction("LOAD_DEREF", argval="__class__")) + first_var = code.co_varnames[0] + if first_var in cell_and_free: + output.append(create_instruction("LOAD_DEREF", argval=first_var)) + else: + output.append(create_instruction("LOAD_FAST", argval=first_var)) + nexti.arg = 2 + nexti.argval = 2 + if nexti.opname == "PRECALL": + # also update the following CALL instruction + call_inst = instructions[idx + 2] + call_inst.arg = 2 + call_inst.argval = 2 + + instructions[:] = output + + +def fix_extended_args(instructions: List[Instruction]) -> int: + """Fill in correct argvals for EXTENDED_ARG ops""" + output: List[Instruction] = [] + + def maybe_pop_n(n): + for _ in range(n): + if output and output[-1].opcode == dis.EXTENDED_ARG: + output.pop() + + for inst in instructions: + if inst.opcode == dis.EXTENDED_ARG: + # Leave this instruction alone for now so we never shrink code + inst.arg = 0 + elif inst.arg and inst.arg > 0xFFFFFF: + maybe_pop_n(3) + output.append(create_instruction("EXTENDED_ARG", arg=inst.arg >> 24)) + output.append(create_instruction("EXTENDED_ARG", arg=inst.arg >> 16)) + output.append(create_instruction("EXTENDED_ARG", arg=inst.arg >> 8)) + elif inst.arg and inst.arg > 0xFFFF: + maybe_pop_n(2) + output.append(create_instruction("EXTENDED_ARG", arg=inst.arg >> 16)) + output.append(create_instruction("EXTENDED_ARG", arg=inst.arg >> 8)) + elif inst.arg and inst.arg > 0xFF: + maybe_pop_n(1) + output.append(create_instruction("EXTENDED_ARG", arg=inst.arg >> 8)) + output.append(inst) + + added = len(output) - len(instructions) + assert added >= 0 + instructions[:] = output + return added + + +# from https://github.com/python/cpython/blob/v3.11.1/Include/internal/pycore_opcode.h#L41 +# TODO use the actual object instead, can interface from eval_frame.c +_PYOPCODE_CACHES = { + "BINARY_SUBSCR": 4, + "STORE_SUBSCR": 1, + "UNPACK_SEQUENCE": 1, + "STORE_ATTR": 4, + "LOAD_ATTR": 4, + "COMPARE_OP": 2, + "LOAD_GLOBAL": 5, + "BINARY_OP": 1, + "LOAD_METHOD": 10, + "PRECALL": 1, + "CALL": 4, +} + + +def instruction_size(inst) -> int: + if sys.version_info >= (3, 11): + return 2 * (_PYOPCODE_CACHES.get(dis.opname[inst.opcode], 0) + 1) + return 2 + + +def check_offsets(instructions) -> None: + offset = 0 + for inst in instructions: + assert inst.offset == offset + offset += instruction_size(inst) + + +def update_offsets(instructions) -> None: + offset = 0 + for inst in instructions: + inst.offset = offset + offset += instruction_size(inst) + + +def debug_bytes(*args) -> str: + index = range(max(map(len, args))) + result = [] + for arg in ( + [index] + list(args) + [[int(a != b) for a, b in zip(args[-1], args[-2])]] + ): + result.append(" ".join(f"{x:03}" for x in arg)) + + return "bytes mismatch\n" + "\n".join(result) + + +def debug_checks(code): + """Make sure our assembler produces same bytes as we start with""" + dode = transform_code_object(code, lambda x, y: None, safe=True) + assert code.co_code == dode.co_code, debug_bytes(code.co_code, dode.co_code) + assert code.co_lnotab == dode.co_lnotab, debug_bytes(code.co_lnotab, dode.co_lnotab) + + +HAS_LOCAL = set(dis.haslocal) +HAS_NAME = 
set(dis.hasname) +HAS_FREE = set(dis.hasfree) +HAS_CONST = set(dis.hasconst) + + +def get_const_index(code_options, val) -> int: + for i, v in enumerate(code_options["co_consts"]): + # NOTE: stronger comparison is required, since we have + # examples where two values compare equal but have + # different semantic meaning in some cases, e.g. + # 0.0 == -0.0 but have different effects in torch.copysign. + if val is v: + return i + code_options["co_consts"] += (val,) + return len(code_options["co_consts"]) - 1 + + +def fix_vars(instructions: List[Instruction], code_options, varname_from_oparg=None): + # compute instruction arg from argval if arg is not provided + names = {name: idx for idx, name in enumerate(code_options["co_names"])} + if sys.version_info < (3, 11): + assert varname_from_oparg is None + varnames = {name: idx for idx, name in enumerate(code_options["co_varnames"])} + freenames = { + name: idx + for idx, name in enumerate( + code_options["co_cellvars"] + code_options["co_freevars"] + ) + } + else: + assert callable(varname_from_oparg) + allnames = {} + for idx in itertools.count(): + try: + name = varname_from_oparg(idx) + allnames[name] = idx + except IndexError: + break + varnames = {name: allnames[name] for name in code_options["co_varnames"]} + freenames = { + name: allnames[name] + for name in code_options["co_cellvars"] + code_options["co_freevars"] + } + for i in range(len(instructions)): + + def should_compute_arg(): + # argval is prioritized over arg + return instructions[i].argval is not _NotProvided + + if instructions[i].opname == "LOAD_GLOBAL": + # 3.11 LOAD_GLOBAL requires both arg and argval - see create_load_global + assert instructions[i].arg is not None + assert instructions[i].argval is not _NotProvided + if sys.version_info >= (3, 11): + instructions[i].arg = (names[instructions[i].argval] << 1) + ( + cast(int, instructions[i].arg) % 2 + ) + else: + instructions[i].arg = names[instructions[i].argval] + elif instructions[i].opcode in HAS_LOCAL: + if should_compute_arg(): + instructions[i].arg = varnames[instructions[i].argval] + elif instructions[i].opcode in HAS_NAME: + if should_compute_arg(): + instructions[i].arg = names[instructions[i].argval] + elif instructions[i].opcode in HAS_FREE: + if should_compute_arg(): + instructions[i].arg = freenames[instructions[i].argval] + elif instructions[i].opcode in HAS_CONST: + # NOTE: only update argval if arg is not provided. This assumes + # that any additions to co_consts are appended. + if instructions[i].arg is None: + # cannot use a dictionary since consts may not be hashable + idx = get_const_index(code_options, instructions[i].argval) + assert idx >= 0 + instructions[i].arg = idx + + +def get_code_keys() -> List[str]: + # Python 3.11 changes to code keys are not fully documented. + # See https://github.com/python/cpython/blob/3.11/Objects/clinic/codeobject.c.h#L24 + # for new format. 
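+    # NOTE: the order of these keys must match the positional-argument order of
+    # the types.CodeType constructor, since clean_and_assemble_instructions
+    # rebuilds the code object via types.CodeType(*[code_options[k] for k in keys]).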
+ keys = ["co_argcount"] + keys.append("co_posonlyargcount") + keys.extend( + [ + "co_kwonlyargcount", + "co_nlocals", + "co_stacksize", + "co_flags", + "co_code", + "co_consts", + "co_names", + "co_varnames", + "co_filename", + "co_name", + ] + ) + if sys.version_info >= (3, 11): + keys.append("co_qualname") + keys.append("co_firstlineno") + if sys.version_info >= (3, 10): + keys.append("co_linetable") + else: + keys.append("co_lnotab") + if sys.version_info >= (3, 11): + # not documented, but introduced in https://github.com/python/cpython/issues/84403 + keys.append("co_exceptiontable") + keys.extend( + [ + "co_freevars", + "co_cellvars", + ] + ) + return keys + + +def transform_code_object(code, transformations, safe=False) -> types.CodeType: + keys = get_code_keys() + code_options = {k: getattr(code, k) for k in keys} + assert len(code_options["co_varnames"]) == code_options["co_nlocals"] + + instructions = cleaned_instructions(code, safe) + propagate_line_nums(instructions) + + transformations(instructions, code_options) + return clean_and_assemble_instructions(instructions, keys, code_options)[1] + + +def clean_and_assemble_instructions( + instructions: List[Instruction], keys: List[str], code_options: Dict[str, Any] +) -> Tuple[List[Instruction], types.CodeType]: + # also implicitly checks for no duplicate instructions + check_inst_exn_tab_entries_valid(instructions) + + code_options["co_nlocals"] = len(code_options["co_varnames"]) + varname_from_oparg = None + if sys.version_info >= (3, 11): + # temporary code object with updated names + tmp_code = types.CodeType(*[code_options[k] for k in keys]) + varname_from_oparg = tmp_code._varname_from_oparg # type: ignore[attr-defined] + fix_vars(instructions, code_options, varname_from_oparg=varname_from_oparg) + + dirty = True + while dirty: + update_offsets(instructions) + devirtualize_jumps(instructions) + # this pass might change offsets, if so we need to try again + dirty = bool(fix_extended_args(instructions)) + + remove_extra_line_nums(instructions) + bytecode, lnotab = assemble(instructions, code_options["co_firstlineno"]) + if sys.version_info < (3, 10): + code_options["co_lnotab"] = lnotab + else: + code_options["co_linetable"] = lnotab + + code_options["co_code"] = bytecode + code_options["co_stacksize"] = stacksize_analysis(instructions) + assert set(keys) - {"co_posonlyargcount"} == set(code_options.keys()) - { + "co_posonlyargcount" + } + if sys.version_info >= (3, 11): + code_options["co_exceptiontable"] = assemble_exception_table( + compute_exception_table(instructions) + ) + return instructions, types.CodeType(*[code_options[k] for k in keys]) + + +def populate_kw_names_argval(instructions, consts): + for inst in instructions: + if inst.opname == "KW_NAMES": + inst.argval = consts[inst.arg] + + +def cleaned_instructions(code, safe=False) -> List[Instruction]: + instructions = list(map(convert_instruction, dis.get_instructions(code))) + check_offsets(instructions) + if sys.version_info >= (3, 11): + populate_kw_names_argval(instructions, code.co_consts) + virtualize_exception_table(code.co_exceptiontable, instructions) + virtualize_jumps(instructions) + strip_extended_args(instructions) + if not safe: + if sys.version_info < (3, 11): + remove_load_call_method(instructions) + else: + remove_jump_if_none(instructions) + update_offsets(instructions) + devirtualize_jumps(instructions) + explicit_super(code, instructions) + return instructions + + +_unique_id_counter = itertools.count() + + +def unique_id(name) -> str: + return 
f"{name}_{next(_unique_id_counter)}" + + +def is_generator(code: types.CodeType) -> bool: + co_generator = 0x20 + return (code.co_flags & co_generator) > 0 diff --git a/MLPY/Lib/site-packages/torch/_dynamo/cache_size.py b/MLPY/Lib/site-packages/torch/_dynamo/cache_size.py new file mode 100644 index 0000000000000000000000000000000000000000..cbbbd5e26541eae62db34a3e498f6191c00f4b99 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/cache_size.py @@ -0,0 +1,172 @@ +import logging +import types +import weakref +from dataclasses import dataclass +from typing import Tuple + +from . import config + +log = logging.getLogger(__name__) +""" +[Note on cache size limit] + +Background - TorchDynamo cache is a linked list. Each cache entry is a +(check_fn, out_code, next pointer). These are stored on the f_code's co_extra +scratch space. When a frame is invoked, we walk this linked list and run +check_fn in each cache_entry to decide if the frame needs recompilation. If none +of the check_fn's returns True, we recompile and add a new entry. To ensure we +don't end up recompiling infinitely, we put limits on the cache size. + +There are two limits +1) cache_size_limit +2) accumulated_cache_size_limit + + +Earlier we used to have only limit - maximum number of entries in 1 cache line +(which is now represented by (2) above). So, why do we need two limits? Lets try +to understand that. + +In general, we want our cache limit value to be a small number (e.g. 8 or even +lower). This ensures that for frames that cause too many recompilation fall to +eager quickly. However, there is another problem that prevents us from lowering +the value of cache_size_limit. This is due to ID_MATCH'd guards. Today, we put +ID_MATCH guards on nn module if there is a graph break. This means we will have +many recompilations for the same code object because the ID_MATCH guard fails +for different instances of the nn module. This is a common pattern in how models +are authored. Therefore, this requires us to keep the cache_size_limit high. + +We resolve this by introducing these two limits. The first limit (1) limits the +number of cache entries that have an ID_MATCH'd guard for an nn module instance. +And, (2)nd limit becomes a safeguard mechanism to have a maximum compilations +for a code object. One important question is - what is the limit for the code +object that does not have any ID_MATCH guard? For such code objects, we choose +(1) as the cache size limit. + +Lets take an example to understand how these limits help. Suppose, we have 16 +instances of a nn module and we ID_MATCH on the self object. Further, suppose +the inputs to these functions have varying batch size, leading to one +recompilation. In total, there will be 32 recompilations, and therefore 32 cache +entries on the forward code object. In the older case when we had only 1 limit, +our cache size limit must be >= 32 to capture all these recompilations. Now, +suppose there is a separate function in the same program which is very dynamic +and unsuitable for compilation. Such a function will need to undergo 32 +compilations to burst the cache and fallback to eager. These 32 recompilations +are too many and we want to fallback for these compilation-unfriendly functions +sooner. + +In the new scenario, we can have (1) cache_size_limit = 2, (2) +accumulated_cache_size_limit = 32. This means that each ID_MATCH'd object can +have maximum of two cache entries, and the maximum number of cache entries +(irrespective of ID_MATCH obj) is 32. 
This covers the case of forward code +object which has 32 recompilations. For the other function, the one unsuitable +for recompilation, our limit is 2. So, we will burst the cache in just 2 +recompilations. In this manner, these 2 limits help us resolve the tension +mentioned earlier. +""" + + +@dataclass +class CacheSizeRelevantForFrame: + """ + We track the number of cache entries that have same id_match objects as the + given frame. + + TODO(janimesh) - Consider adding a map from tuple_of_match_ids to count - + https://github.com/pytorch/pytorch/pull/107496#discussion_r1304564682 - this + could be useful for debugging as well. + """ + + # Total number of CacheEntry objects in the Dynamo linked list + num_cache_entries: int = 0 + + # Number of CacheEntry objects having same ID_MATCH'd objects as given frame. + num_cache_entries_with_same_id_matched_objs: int = 0 + + def will_compilation_exceed(self, limit: int) -> bool: + # Checks if a compilation will exceed the given limit (thats why >=). + return ( + self.will_compilation_exceed_accumulated_limit() + or self.will_compilation_exceed_specific_limit(limit) + ) + + def will_compilation_exceed_accumulated_limit(self) -> bool: + return self.num_cache_entries >= config.accumulated_cache_size_limit + + def will_compilation_exceed_specific_limit(self, limit: int) -> bool: + return self.num_cache_entries_with_same_id_matched_objs >= limit + + +def _get_weakref_from_f_locals(frame: types.FrameType, local_name: str): + obj = frame.f_locals.get(local_name, None) + weak_id = None + try: + weak_id = weakref.ref(obj) + except TypeError: + pass # cannot weakref bool object + return weak_id + + +def _has_same_id_matched_objs(frame: types.FrameType, cache_entry) -> bool: + """ + Checks if the ID_MATCH'd objects saved on cache_entry are same as the ones + in frame.f_locals. + """ + if not cache_entry: + return False + + for ( + local_name, + weakref_from_cache_entry, + ) in cache_entry.check_fn.id_matched_objs.items(): + if weakref_from_cache_entry() is not None: + weakref_from_frame = _get_weakref_from_f_locals(frame, local_name) + if weakref_from_frame != weakref_from_cache_entry: + return False + + # Also covers the case where no ID_MATCH objects are saved in frame.f_locals + return True + + +def compute_cache_size( + frame: types.FrameType, cache_entry +) -> CacheSizeRelevantForFrame: + # Walk the linked list to calculate the cache size + num_cache_entries = 0 + num_cache_entries_with_same_id_matched_objs = 0 + + while cache_entry: + num_cache_entries += 1 + # Track the number of cache entries having same ID_MATCH'd objects as + # that of frame.f_locals. This will be used later to compare against the + # cache_size_limit. + if _has_same_id_matched_objs(frame, cache_entry): + num_cache_entries_with_same_id_matched_objs += 1 + cache_entry = cache_entry.next + + return CacheSizeRelevantForFrame( + num_cache_entries, num_cache_entries_with_same_id_matched_objs + ) + + +def is_recompilation(cache_size: CacheSizeRelevantForFrame) -> bool: + """ + If the frame (earlier parsed by compute_cache_size) has more than 1 cache + entry with same ID_MATCH'd objects, then its a recompilation. + """ + # Note that you can have multiple entries in the cache but still not a + # recompile, e.g., you can have 64 nn module instances, each one having an + # ID_MATCH guard, and each one having just 1 cache entry in the cache. In + # this case, we can have 64 entries in the cache, but no recompilation + # because there is only one entry for each id_matched_obj. 
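+    # In other words: return True once at least one existing cache entry shares
+    # this frame's ID_MATCH'd objects (or the accumulated limit has already been
+    # hit), i.e. the upcoming compilation would be a repeat for the same objects.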
+ return cache_size.will_compilation_exceed(1) + + +def exceeds_cache_size_limit(cache_size: CacheSizeRelevantForFrame) -> Tuple[bool, str]: + """ + Checks if we are exceeding the cache size limit. + """ + if cache_size.will_compilation_exceed_accumulated_limit(): + return True, "accumulated_cache_size_limit" + if cache_size.will_compilation_exceed_specific_limit(config.cache_size_limit): + return True, "cache_size_limit" + return False, "" diff --git a/MLPY/Lib/site-packages/torch/_dynamo/callback.py b/MLPY/Lib/site-packages/torch/_dynamo/callback.py new file mode 100644 index 0000000000000000000000000000000000000000..3d6979645af78764a7359ca88bf7880fbefd5a54 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/callback.py @@ -0,0 +1,82 @@ +class CompilationCallbackHandler: + def __init__(self): + self.start_callbacks = [] + self.end_callbacks = [] + + def register_start_callback(self, callback): + """ + Register a callback function to be called when the compilation starts. + + Args: + - callback (callable): The callback function to register. + """ + self.start_callbacks.append(callback) + return callback + + def register_end_callback(self, callback): + """ + Register a callback function to be called when the compilation ends. + + Args: + - callback (callable): The callback function to register. + """ + self.end_callbacks.append(callback) + return callback + + def remove_start_callback(self, callback): + """ + Remove a registered start callback function. + + Args: + - callback (callable): The callback function to remove. + """ + self.start_callbacks.remove(callback) + + def remove_end_callback(self, callback): + """ + Remove a registered end callback function. + + Args: + - callback (callable): The callback function to remove. + """ + self.end_callbacks.remove(callback) + + def run_start_callbacks(self): + """ + Execute all registered start callbacks. + """ + for callback in self.start_callbacks: + callback() + + def run_end_callbacks(self): + """ + Execute all registered end callbacks. + """ + for callback in self.end_callbacks: + callback() + + def clear(self): + """ + Clear all registered callbacks. + """ + self.start_callbacks.clear() + self.end_callbacks.clear() + + +callback_handler = CompilationCallbackHandler() + + +def on_compile_start(callback): + """ + Decorator to register a callback function for the start of the compilation. + """ + callback_handler.register_start_callback(callback) + return callback + + +def on_compile_end(callback): + """ + Decorator to register a callback function for the end of the compilation. 
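+
+    Illustrative usage::
+
+        @on_compile_end
+        def my_end_callback():
+            print("Compilation ended")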
+ """ + callback_handler.register_end_callback(callback) + return callback diff --git a/MLPY/Lib/site-packages/torch/_dynamo/code_context.py b/MLPY/Lib/site-packages/torch/_dynamo/code_context.py new file mode 100644 index 0000000000000000000000000000000000000000..8f5804336fe270f0a29f7a2e17efc80e0bfe0f7d --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/code_context.py @@ -0,0 +1,29 @@ +import types + +from .utils import ExactWeakKeyDictionary + + +class CodeContextDict: + def __init__(self): + self.code_context = ExactWeakKeyDictionary() + + def has_context(self, code: types.CodeType): + return code in self.code_context + + def get_context(self, code: types.CodeType): + ctx = self.code_context.get(code) + if ctx is None: + ctx = {} + self.code_context[code] = ctx + return ctx + + def pop_context(self, code: types.CodeType): + ctx = self.get_context(code) + self.code_context._remove_id(id(code)) + return ctx + + def clear(self): + self.code_context.clear() + + +code_context = CodeContextDict() diff --git a/MLPY/Lib/site-packages/torch/_dynamo/codegen.py b/MLPY/Lib/site-packages/torch/_dynamo/codegen.py new file mode 100644 index 0000000000000000000000000000000000000000..767bb2c80a5e9b62bd4fb3ac26d87e4c995b958c --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/codegen.py @@ -0,0 +1,398 @@ +import collections +import dataclasses +import re +import sys +import types +from typing import Counter, Dict, List, Optional + +import torch.nn +from . import utils + +from .bytecode_transformation import ( + create_call_function, + create_dup_top, + create_instruction, + create_load_global, + create_rot_n, + Instruction, +) +from .exc import unimplemented +from .source import AttrSource, Source +from .utils import is_safe_constant, rot_n_helper +from .variables.base import VariableTracker +from .variables.nn_module import NNModuleVariable +from .variables.tensor import ( + NumpyNdarrayVariable, + SymNodeVariable, + TensorVariable, + UnspecializedPythonVariable, +) +from .variables.torch_function import TensorWithTFOverrideVariable + + +@dataclasses.dataclass +class GraphOutputEntry: + index: int + variable: VariableTracker + + +class PyCodegen: + """ + Helper class uses for constructing Python bytecode + """ + + def __init__( + self, + tx=None, + root: Optional[torch.nn.Module] = None, + graph_output_var: Optional[str] = None, + tempvars=None, + ): + self.root = root + self.top_of_stack: Optional[VariableTracker] = None + self.uses: Counter[VariableTracker] = collections.Counter() + self.graph_outputs: Dict[int, GraphOutputEntry] = {} + self._output: List[Instruction] = [] + self.tempvars = tempvars or {} + self.tx = tx + self.graph_output_var = graph_output_var + self.code_options = self.tx.output.code_options + self.cell_and_freevars = self.tx.cell_and_freevars + self.new_var = self.tx.output.new_var + self.mutable_side_effects_from_source = False + self.value_from_source: bool = True + + def restore_stack(self, stack_values, *, value_from_source=True): + prior = self.mutable_side_effects_from_source + self.mutable_side_effects_from_source = True + prev = self.value_from_source + self.value_from_source &= value_from_source + try: + self.foreach(stack_values) + finally: + self.mutable_side_effects_from_source = prior + self.value_from_source = prev + + def graph_output_vars(self): + return [x.variable for x in self.graph_outputs.values()] + + def call_reconstruct(self, value): + res = value.reconstruct(self) + assert res is None, f"reconstruct!=None {value}" + + def __call__(self, value, 
allow_cache=True): + """Generate code such that top-of-stack (TOS) is set to value""" + if isinstance(value, Source): + self.call_reconstruct(value) + self.clear_tos() + return + + assert isinstance(value, VariableTracker) + output = self._output + graph_outputs = self.graph_outputs + + if self.top_of_stack is value and allow_cache: + output.append(create_dup_top()) + return + + if self.mutable_side_effects_from_source: + # this is needed to get aliasing relationships right + # value.mutable_local.source will get mutated to hold `value` + # mutable_side_effects_from_source=False is used to codegen the mutation + # mutable_side_effects_from_source=True is used to codegen a reference + from .side_effects import MutableSideEffects + + if isinstance(value.mutable_local, MutableSideEffects): + self(value.mutable_local.source) + return + + if allow_cache: + if value.mutable_local and value.mutable_local in self.tempvars: + output.append(self.create_load(self.tempvars[value.mutable_local])) + self.top_of_stack = value + return + if self.tempvars.get(value) is not None: + output.append(self.create_load(self.tempvars[value])) + self.top_of_stack = value + return + + if value.source is not None and allow_cache and self.value_from_source: + self.call_reconstruct(value.source) + elif value.is_python_constant() and is_safe_constant( + value.as_python_constant() + ): + output.append(self.create_load_const(value.as_python_constant())) + elif isinstance(value, TensorWithTFOverrideVariable): + graph_outputs_key = self.add_graph_output(value) + + self.load_import_from(utils.__name__, "to_subclass") + self.load_graph_output(graph_outputs[graph_outputs_key].index) + output.append( + self.create_load_global( + value.global_mangled_class_name(self.tx), False, add=True + ) + ) + output.extend(create_call_function(2, True)) + elif isinstance( + value, + ( + TensorVariable, + SymNodeVariable, + UnspecializedPythonVariable, + NumpyNdarrayVariable, + ), + ): + graph_outputs_key = self.add_graph_output(value) + + if isinstance(value, NumpyNdarrayVariable): + self.load_import_from(utils.__name__, "to_numpy_helper") + + self.load_graph_output(graph_outputs[graph_outputs_key].index) + + if isinstance(value, NumpyNdarrayVariable): + output.extend(create_call_function(1, True)) + elif isinstance(value, UnspecializedPythonVariable) and value.need_unwrap: + output.extend( + [self.create_load_attr("item")] + create_call_function(0, True) + ) + elif isinstance(value, NNModuleVariable): + parts = value.module_key.split(".") + if parts[0] in self.code_options["co_varnames"]: + output.append(self.create_load(parts[0])) + parts = parts[1:] + else: + assert self.root is not None + output.append(self.create_load_output(self.root)) + for part in parts: + output.append(self.create_load_attr(part)) + else: + self.uses[value] += 1 + try: + self.call_reconstruct(value) + except NotImplementedError: + unimplemented(f"reconstruct: {value}") + if allow_cache and value in self.tempvars: + self._output.append(create_dup_top()) + self.add_cache(value) + + self.top_of_stack = value + + def add_graph_output(self, value): + graph_outputs_key = id(value.as_proxy()) + if graph_outputs_key not in self.graph_outputs: + self.graph_outputs[graph_outputs_key] = GraphOutputEntry( + len(self.graph_outputs), value + ) + return graph_outputs_key + + def load_graph_output(self, index): + output = self._output + output.append(self.create_load(self.graph_output_var)) + output.append(self._create_load_const(index)) + 
output.append(create_instruction("BINARY_SUBSCR")) + + def add_cache(self, value): + var = self.new_var() + self.tempvars[value] = var + if value.mutable_local: + self.tempvars[value.mutable_local] = var + self._output.append(self.create_store(var)) + + def foreach(self, items): + for i in items: + self(i) + + def setup_globally_cached(self, name, value, push_null): + """Store value in a new global""" + name = re.sub(r"[^a-zA-Z0-9_]+", "_", name) + f_globals = self.tx.f_globals + if name in f_globals: + assert id(f_globals[name]) == id(value) + else: + f_globals[name] = value + return [self.create_load_global(name, push_null, add=True)] + + def clear_tos(self): + self.top_of_stack = None + + def append_output(self, inst): + assert isinstance(inst, Instruction) + self._output.append(inst) + self.clear_tos() + + def extend_output(self, insts): + assert all(isinstance(x, Instruction) for x in insts) + self._output.extend(insts) + self.clear_tos() + + def get_instructions(self) -> List[Instruction]: + return self._output + + def create_load(self, name) -> Instruction: + if name in self.cell_and_freevars(): + return create_instruction("LOAD_DEREF", argval=name) + assert name in self.code_options["co_varnames"], f"{name} missing" + return create_instruction("LOAD_FAST", argval=name) + + def create_load_closure(self, name) -> Instruction: + assert name in self.cell_and_freevars() + return create_instruction("LOAD_CLOSURE", argval=name) + + def create_store(self, name) -> Instruction: + if name in self.cell_and_freevars(): + return create_instruction("STORE_DEREF", argval=name) + assert name in self.code_options["co_varnames"] + return create_instruction("STORE_FAST", argval=name) + + def create_load_global(self, name, push_null, add=False) -> Instruction: + if add: + self.tx.output.update_co_names(name) + assert name in self.code_options["co_names"], f"{name} not in co_names" + return create_load_global(name, push_null) + + def create_load_const(self, value) -> Instruction: + assert is_safe_constant(value), f"unsafe constant {value}" + return self._create_load_const(value) + + def _create_load_const(self, value) -> Instruction: + return create_instruction("LOAD_CONST", argval=value) + + create_load_output = _create_load_const + + def create_load_method(self, name): + self.tx.output.update_co_names(name) + return create_instruction("LOAD_METHOD", argval=name) + + def create_load_attr(self, name) -> Instruction: + if name not in self.code_options["co_names"]: + self.code_options["co_names"] += (name,) + return create_instruction("LOAD_ATTR", argval=name) + + def load_attr(self, name): + self.append_output(self.create_load_attr(name)) + + def create_load_attrs(self, names): + return [self.create_load_attr(name) for name in names.split(".")] + + def create_store_attr(self, name) -> Instruction: + if name not in self.code_options["co_names"]: + self.code_options["co_names"] += (name,) + return create_instruction("STORE_ATTR", argval=name) + + def store_attr(self, name): + self.append_output(self.create_store_attr(name)) + + def load_function_name(self, fn_name, push_null, num_on_stack=0): + """Load the global fn_name on the stack num_on_stack down""" + output = [] + if push_null and sys.version_info >= (3, 11): + output.extend( + [create_instruction("PUSH_NULL"), *self.rot_n(num_on_stack + 1)] + ) + output.extend( + [ + self.create_load_global(fn_name, False, add=True), + *self.rot_n(num_on_stack + 1), + ] + ) + return output + + def rot_n(self, n): + try: + return create_rot_n(n) + except 
AttributeError: + # desired rotate bytecode doesn't exist, generate equivalent bytecode + return [ + create_instruction("BUILD_TUPLE", arg=n), + self._create_load_const(rot_n_helper(n)), + *create_rot_n(2), + create_instruction("CALL_FUNCTION_EX", arg=0), + create_instruction("UNPACK_SEQUENCE", arg=n), + ] + + def pop_null(self): + # POP_TOP doesn't work for null, so we pop nulls by pushing in a + # nop function, calling it (which consumes the null), and popping the result. + assert sys.version_info >= (3, 11) + return [ + self._create_load_const(lambda: None), + *create_call_function(0, False), + create_instruction("POP_TOP"), + ] + + def call_function(self, nargs: int, push_null: bool): + self.extend_output(create_call_function(nargs, push_null=push_null)) + + def dup_top(self): + self.append_output(create_dup_top()) + + def store(self, varname): + self.append_output(self.create_store(varname)) + + def make_function_with_closure( + self, fn_name: str, code: types.CodeType, push_null: bool, num_on_stack=0 + ): + freevars = code.co_freevars + assert freevars + output = self._output + if sys.version_info >= (3, 11) and push_null: + output.append(create_instruction("PUSH_NULL")) + output.extend(self.rot_n(num_on_stack + 1)) + for var in freevars: + assert var in self.cell_and_freevars() + output.append(create_instruction("LOAD_CLOSURE", argval=var)) + output.append(create_instruction("BUILD_TUPLE", arg=len(freevars))) + output.append(self.create_load_const(code)) + if sys.version_info < (3, 11): + output.append(self.create_load_const(fn_name)) + output.append(create_instruction("MAKE_FUNCTION", arg=0x08)) + output.extend(self.rot_n(num_on_stack + 1)) + self.clear_tos() + + def create_load_python_module(self, mod, push_null) -> Instruction: + """ + Generate a LOAD_GLOBAL instruction to fetch a given python module. 
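# --- illustrative sketch (not part of the vendored file) ---------------------
# rot_n() above falls back to BUILD_TUPLE + a helper call + UNPACK_SEQUENCE when
# the desired ROT_N bytecode does not exist for the target Python version. The
# effect of ROT_N itself can be simulated on a plain list used as a value stack
# (top of stack is the last element): the top item sinks n-1 positions.

def simulate_rot_n(stack, n):
    assert n >= 2 and len(stack) >= n
    top = stack.pop()
    stack.insert(len(stack) - (n - 1), top)
    return stack

assert simulate_rot_n([1, 2, 3, 4], 3) == [1, 4, 2, 3]  # TOS (4) sinks two slots
assert simulate_rot_n([1, 2], 2) == [2, 1]              # ROT_TWO is a swap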
+ """ + output = self.tx.output + global_scope = output.global_scope + name = re.sub(r"^.*[.]", "", mod.__name__) + if global_scope.get(name, None) is mod: + return self.create_load_global(name, push_null, add=True) + prefix = f"___module_{name}" + global_name = self.tx.output.install_global_by_id(prefix, mod) + return self.create_load_global(global_name, push_null, add=True) + + def make_call_generated_code(self, fn_name: str) -> None: + """Call the generated code function stored in fn_name""" + self.extend_output(self.load_function_name(fn_name, True)) + + graphargs = self.tx.output.graphargs + for arg in graphargs: + if arg.is_unspecialized: + self.extend_output( + [ + self.create_load_python_module(torch, True), + self.create_load_attr("as_tensor"), + ] + ) + self.call_reconstruct(arg) + self.extend_output(create_call_function(1, False)) + else: + self.call_reconstruct(arg) + + self.extend_output(create_call_function(len(graphargs), False)) + + def load_import_from(self, module_name, object_name) -> None: + self(AttrSource(self.tx.import_source(module_name), object_name)) + + def create_call_function_kw(self, nargs, kw_names, push_null) -> List[Instruction]: + if sys.version_info >= (3, 11): + output = create_call_function(nargs, push_null) + assert output[-2].opname == "PRECALL" + kw_names_inst = create_instruction("KW_NAMES", argval=kw_names) + output.insert(-2, kw_names_inst) + return output + return [ + self.create_load_const(kw_names), + create_instruction("CALL_FUNCTION_KW", arg=nargs), + ] diff --git a/MLPY/Lib/site-packages/torch/_dynamo/compiled_autograd.py b/MLPY/Lib/site-packages/torch/_dynamo/compiled_autograd.py new file mode 100644 index 0000000000000000000000000000000000000000..b130f0c3f5ccd264a434736fe480cae8e96fe571 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/compiled_autograd.py @@ -0,0 +1,280 @@ +import contextlib +import functools +from typing import List, Optional + +import torch +from torch._dynamo.external_utils import call_backward, call_hook +from torch._dynamo.source import GetItemSource, LocalSource +from torch._dynamo.utils import counters, lazy_format_graph_code +from torch._logging import getArtifactLogger, trace_structured +from torch._prims_common import clone_preserve_strides +from torch._subclasses import FakeTensorMode +from torch.fx import GraphModule +from torch.fx.experimental._backward_state import BackwardState +from torch.fx.experimental.proxy_tensor import ( + decompose, + disable_autocast_cache, + disable_proxy_modes_tracing, + fetch_object_proxy, + ProxyTorchDispatchMode, + PythonKeyTracer, + track_tensor_tree, +) +from torch.fx.experimental.symbolic_shapes import DimDynamic, ShapeEnv +from torch.fx.proxy import Proxy + +compiled_autograd_log = getArtifactLogger(__name__, "compiled_autograd") + + +def maybe_clone(x): + if x is not None: + return clone_preserve_strides(x) + return x + + +class AutogradCompilerInstance: + def __init__(self, compiler_fn) -> None: + self.compiler_fn = compiler_fn + self.stack = contextlib.ExitStack() + self.close = self.stack.close + self.shape_env = ShapeEnv() + self.fake_tensor_mode = FakeTensorMode( + allow_fallback_kernels=True, + allow_non_fake_inputs=True, + shape_env=self.shape_env, + ) + self.fx_tracer = PythonKeyTracer() + self.proxy_mode = ProxyTorchDispatchMode(self.fx_tracer, "symbolic") + self.hooks_proxy: Optional[Proxy] = None + + def wrap_fake(self, x, source): + assert isinstance(x, torch.Tensor) + return self.fake_tensor_mode.from_tensor(x, source=source) + + @staticmethod + def 
source(name, idx) -> GetItemSource: + return GetItemSource(LocalSource(name), idx) + + def begin_capture(self, inputs: List[torch.Tensor], sizes: List[int]): + counters["compiled_autograd"]["captures"] += 1 + self.fx_tracer.root = torch.nn.Module() + self.fx_tracer.graph = torch.fx.Graph(tracer_cls=PythonKeyTracer) + self.fx_tracer.tensor_attrs = {} + args_proxy = self.fx_tracer.create_proxy("placeholder", "inputs", (), {}) + sizes_proxy = self.fx_tracer.create_proxy("placeholder", "sizes", (), {}) + self.hooks_proxy = self.fx_tracer.create_proxy("placeholder", "hooks", (), {}) + + # tensor inputs to fake tensors + inputs = [ + self.wrap_fake(x, self.source("inputs", idx)) + for idx, x in enumerate(inputs) + ] + proxies = [args_proxy[i] for i in range(len(inputs))] + self.bind_tensors_to_proxies(inputs, proxies) + + # size inputs to symints + sizes = [ + self.shape_env.create_unspecified_symint_and_symbol( + val, + self.source("sizes", idx), + DimDynamic.DYNAMIC, + ) + for idx, val in enumerate(sizes) + ] + self.bind_tensors_to_proxies(sizes, sizes_proxy) + + # TODO(jansel): are all these modes needed? + self.stack.enter_context(decompose({})) + self.stack.enter_context(self.fake_tensor_mode) + self.stack.enter_context(self.proxy_mode.sym_mode) + self.stack.enter_context(self.proxy_mode) + self.stack.enter_context(disable_autocast_cache()) + return inputs, sizes + + def proxy_call_backward( + self, + inputs, + output_metadatas, + saved_tensors, + backward_idx: int, + ): + assert self.hooks_proxy is not None + backward_fn = self.hooks_proxy[backward_idx] # type: ignore[index] + proxies = self.fx_tracer.create_proxy( + kind="call_function", + target=call_backward, + args=( + backward_fn, + self.to_proxy(saved_tensors), + *self.to_proxy(inputs), + ), + kwargs={}, + ) + + with disable_proxy_modes_tracing(): + # create fake Tensors + grad_ins: List[Optional[torch.Tensor]] = [] + for output_metadata in output_metadatas: + if output_metadata is None: + grad_ins.append(None) + continue + + layout, device, dtype, size = output_metadata + grad_ins.append( + torch.empty(size=size, dtype=dtype, layout=layout, device=device) + ) + self.bind_tensors_to_proxies(grad_ins, proxies) + return tuple(grad_ins) + + def proxy_call_hook(self, hook, *args): + return self.fx_tracer.create_proxy( + "call_function", + call_hook, + ( + hook, + *[self.to_proxy(x) for x in args], + ), + {}, + ) + + def tensor_pre_hook(self, inputs, hook_id, i: int): + assert self.hooks_proxy is not None + hook = self.hooks_proxy[hook_id] # type: ignore[index] + proxy = self.proxy_call_hook( + hook, + inputs[i], + ) + with disable_proxy_modes_tracing(): + inputs[i] = maybe_clone(inputs[i]) + self.bind_tensors_to_proxies([inputs[i]], [proxy]) + return inputs + + def pre_hook(self, inputs, hook_id): + assert self.hooks_proxy is not None + hook = self.hooks_proxy[hook_id] # type: ignore[index] + proxies = self.proxy_call_hook( + hook, + inputs, + ) + with disable_proxy_modes_tracing(): + inputs = [maybe_clone(x) for x in inputs] + self.bind_tensors_to_proxies(inputs, proxies) + return inputs + + def post_hook(self, outputs, inputs, hook_id): + assert self.hooks_proxy is not None + hook = self.hooks_proxy[hook_id] # type: ignore[index] + proxies = self.proxy_call_hook( + hook, + outputs, + inputs, + ) + with disable_proxy_modes_tracing(): + outputs = [maybe_clone(x) for x in outputs] + self.bind_tensors_to_proxies(outputs, proxies) + return outputs + + def post_acc_grad_hook(self, input, hook_id): + assert isinstance(input, torch.Tensor) + 
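# --- illustrative sketch (not part of the vendored file) ---------------------
# begin_capture() above lifts real tensor inputs into fake tensors so the
# autograd graph can be traced on metadata alone. A minimal demonstration of
# that idea using FakeTensorMode directly (details may differ across PyTorch
# versions):

import torch
from torch._subclasses import FakeTensorMode

fake_mode = FakeTensorMode(allow_non_fake_inputs=True)
real = torch.randn(4, 3)
fake = fake_mode.from_tensor(real)
print(fake.shape, fake.dtype, fake.device)  # metadata matches `real`
# `fake` carries no real data; ops on it under the mode only propagate
# shapes/dtypes, which is all the tracer needs.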
assert self.hooks_proxy is not None + hook = self.hooks_proxy[hook_id] # type: ignore[index] + proxies = self.proxy_call_hook( + hook, + input, + ) + with disable_proxy_modes_tracing(): + input = [maybe_clone(input)] + self.bind_tensors_to_proxies(input, proxies) + return input + + def end_capture(self, outputs): + self.stack.close() + self.fx_tracer.create_node( + "output", + "output", + (self.fx_tracer.create_arg(self.to_proxy(outputs)),), + {}, + ) + graph = GraphModule( + self.fx_tracer.root, self.fx_tracer.graph, "CompiledAutograd" + ) + compiled_autograd_log.info( + "%s", lazy_format_graph_code("Compiled autograd graph", graph) + ) + trace_structured( + "compiled_autograd_graph", + payload_fn=lambda: graph.print_readable(print_output=False), + ) + return self.compiler_fn(graph) + + def to_proxy(self, t): + if t is None: + return None + if isinstance(t, list): + return [self.to_proxy(x) for x in t] + if isinstance(t, tuple): + return tuple(self.to_proxy(x) for x in t) + assert isinstance(t, (torch.Tensor, torch.SymInt)) + return fetch_object_proxy(self.fx_tracer)(t).proxy + + def bind_tensors_to_proxies(self, tensors, proxies): + if isinstance(proxies, torch.fx.Proxy): + proxies = [proxies[i] for i in range(len(tensors))] + assert len(tensors) == len(proxies) + track_tensor_tree(tensors, proxies, constant=None, tracer=self.fx_tracer) + + def bind_backward_state(self, index: int): + assert self.hooks_proxy is not None + proxy = self.hooks_proxy[index] # type: ignore[index] + bw_state = BackwardState() + track_tensor_tree(bw_state, proxy, constant=None, tracer=self.fx_tracer) + return bw_state + + +compiled_autograd_enabled = False + +# We may have code like: +# with enable(compiler_fn): +# ... +# with disable(): +# ... +# ... +# The disable() call just want to disable compiled autograd temporarily. +# But overall the feature is enabled. +# +# The code covered by the disable context manager has no way to know if +# compiled autograd is overall eanbled. Use another variable +# compiled_autograd_enabled_count to indicate how many times compiled +# autograd has been enabled in the call stack for this purpose. +compiled_autograd_enabled_count = 0 + + +@contextlib.contextmanager +def enable(compiler_fn): + prior = torch._C._dynamo.compiled_autograd.set_autograd_compiler( + functools.partial(AutogradCompilerInstance, compiler_fn) + ) + global compiled_autograd_enabled, compiled_autograd_enabled_count + compiled_autograd_enabled = True + compiled_autograd_enabled_count += 1 + try: + with torch.autograd.set_multithreading_enabled(False): + yield + finally: + compiled_autograd_enabled_count -= 1 + if not prior: + compiled_autograd_enabled = False + torch._C._dynamo.compiled_autograd.set_autograd_compiler(prior) + + +@contextlib.contextmanager +def disable(): + prior = torch._C._dynamo.compiled_autograd.set_autograd_compiler(None) + global compiled_autograd_enabled + compiled_autograd_enabled = False + try: + yield + finally: + if prior: + compiled_autograd_enabled = True + torch._C._dynamo.compiled_autograd.set_autograd_compiler(prior) diff --git a/MLPY/Lib/site-packages/torch/_dynamo/comptime.py b/MLPY/Lib/site-packages/torch/_dynamo/comptime.py new file mode 100644 index 0000000000000000000000000000000000000000..649bcbac947d51710f430efcb22824a02b29b24f --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/comptime.py @@ -0,0 +1,373 @@ +# This file establishes the public comptime interface to Dynamo. 
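# --- illustrative usage sketch (not part of the vendored file) ---------------
# The enable()/disable() context managers in compiled_autograd.py above swap in
# AutogradCompilerInstance as the autograd compiler. A typical (hedged) usage
# pattern, assuming a build where compiled autograd is available: run
# .backward() inside enable() and let an ordinary Dynamo backend compile the
# captured backward graph.

import torch
from torch._dynamo import compiled_autograd

def compiler_fn(gm):
    # gm is the "CompiledAutograd" GraphModule produced by end_capture()
    return torch.compile(gm, backend="eager")

model = torch.nn.Linear(4, 4)
loss = model(torch.randn(2, 4)).sum()
with compiled_autograd.enable(compiler_fn):
    loss.backward()  # backward graph is captured and handed to compiler_fn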
+# This allows Dynamo users to execute arbitrary Python code while +# Dynamo is symbolically evaluating their original programs. +# +# The goal of the public API is to give users rope, without actually +# leaking private implementation details of Dynamo. + +import builtins +import dis +import traceback +from typing import Optional, Union + +import torch +from torch.fx.experimental.symbolic_shapes import free_symbols + +from .exc import unimplemented +from .variables.constant import ConstantVariable +from .variables.tensor import SymNodeVariable + + +class ComptimeVar: + """ + A ComptimeVar represents a Python value, at some particular point + in time, in the Python code we are symbolically evaluating with + torchdynamo. This must be distinguished from a runtime value, as + at compile-time there are some properties of the variable we + do not know (for example, if the ComptimeVar represents a Tensor, + we only know metadata about the tensor; we do NOT know what the + actual data in the Tensor is.) + """ + + def __init__(self, v): + self.__variable = v + + def as_proxy(self): + """ + Returns an fx.Proxy (or tuple/list of fx.Proxy) representing + this variable in the FX graph we are assembling to pass + to the user compiler. + + This method only works for variables we actually track in + the FX graph, aka Tensors (and ints, if you are compiling + with dynamic shapes). In particular, if you have a list + or tuple of tensors, you will get a list/tuple of proxies + (not a single proxy representing the entire list/tuple). + """ + return self.__variable.as_proxy() + + def is_proxy(self): + """ + Returns True if as_proxy() would succeed. + """ + return self.__variable.is_proxy() + + def as_fake(self): + """ + Returns a "fake" value (either a FakeTensor or a SymInt) + representing the variable in question. This only works + for variables that denote Tensor or int. You can use + this to query metadata; e.g., v.as_fake().size(0) will + tell you the compile-time known size of the tensor. + + WARNING: Do NOT mutate the returned tensor. + """ + return self.__variable.as_proxy().node.meta["example_value"] + + def size(self, dim: Optional[int] = None) -> Union[int, torch.SymInt]: + """ + Returns the size of the tensor (if dim is None) or the size + at the dimension dim. The returned size may be a SymInt. + """ + return self.as_fake().size(dim) + + def python_type(self): + """ + Returns what type(v) would have returned for the variable + at compile time. + """ + return self.__variable.python_type() + + def as_python_constant(self): + """ + Returns the Python value this variable would have, but only if it is + completely known at compile-time (e.g., it is constant). + + WARNING: Do NOT mutate the returned constant. The returned constant + may or may not correspond to the actual value this variable may take + on at runtime; for example, if the variable in question is a constant + list, we may return a copy of that list. + """ + return self.__variable.as_python_constant() + + def is_python_constant(self): + """ + Returns True if as_python_constant would succeed. 
+ """ + return self.__variable.is_python_constant() + + def is_dynamic(self): + if isinstance(self.__variable, SymNodeVariable): + fs = free_symbols(self.__variable.sym_num) + return bool(fs) + return False + + def force_static(self): + """ + Forces that a value is static, inducing a guard on its specific value + """ + if isinstance(self.__variable, SymNodeVariable): + self.__variable.evaluate_expr() + elif isinstance(self.__variable, ConstantVariable): + # TODO: Maybe complain if this isn't a int/bool/float variable + pass + else: + raise AssertionError( + f"cannot force {self.__variable} ({type(self.__variable)}) static" + ) + + def _i_will_not_complain_if_bc_breaks_VariableTracker(self): + """ + Returns the internal data structure VariableTracker that Dynamo uses + to represent variables at compile time. There are no BC guarantees on + this API and WE RESERVE THE RIGHT TO BREAK YOUR CODE if you rely on + it. + """ + return self.__variable + + def __repr__(self): + # TODO: The default repr is pretty bad, do better + return repr(self.__variable) + + # TODO: API for adding a custom guard + + +class ComptimeContext: + """ + This context class provides access to a public API for Dynamo's internals. + If there is something here you would find useful that is missing, please + file a feature request at https://github.com/pytorch/pytorch/ + """ + + def __init__(self, tx): + self.__tx = tx + + def get_local(self, name: str, *, stacklevel=0) -> ComptimeVar: + """ + Retrieve the compile-time known information about a local. + """ + tx = self.__get_tx(stacklevel) + return ComptimeVar(tx.symbolic_locals[name]) + + def graph_break(self, msg="ComptimeContext.graph_break"): + """ + Manually trigger a graph break + """ + unimplemented(msg) + + def graph(self): + """ + Retrieve the partially constructed FX graph that would be + passed to the user compiler after compilation. + """ + return self.__tx.output.graph + + def assert_static(self, val): + """ + Asserts that the int is static (and not dynamic, per dynamic shapes) + """ + assert ( + not val.is_dynamic() + ), "expected static but got dynamic (run with TORCH_LOGS=dynamic for more info)" + + def print_graph(self, *, verbose=True, file=None): + """ + Print the partially constructed FX graph that would be passed + to the user compiler after compilation. + """ + print( + self.__tx.output.graph.python_code("self", verbose=verbose).src, file=file + ) + + def parent(self): + return ComptimeContext(self.__tx.parent) + + def __get_tx(self, stacklevel): + tx = self.__tx + for _ in range(stacklevel): + tx = tx.parent + return tx + + def print_disas(self, *, file=None, stacklevel=0): + """ + Print the current series of opcodes being executed (not including + parent frames), including where you are in the particular opcode + stream. + """ + tx = self.__get_tx(stacklevel) + print( + dis.Bytecode( + tx.f_code, + current_offset=tx.instructions[tx.instruction_pointer].offset, + ).dis(), + file=file, + ) + + def print_value_stack(self, *, file=None, stacklevel=0): + """ + Print the current Python value stack. Note that this is NOT the same + as the traceback; use print_bt() to print that. Note that at + stacklevel=0, this will typically be empty, as comptime cannot + currently be used in an expression context where there would be + intermediates on the stack. 
If you would find this useful, please + file a bug at https://github.com/pytorch/pytorch/ + + NB: Stack grows downwards in our print + """ + # TODO: improve printing + tx = self.__get_tx(stacklevel) + for s in tx.stack: + print(f"- {s}", file=file) + + def print_locals(self, *, file=None, stacklevel=0): + """ + Print all of the locals available in the current context. + By default this view is very limited; you can get more information + about any individual local using get_local(). + """ + # TODO: improve by improving the VariableTracker printing + tx = self.__get_tx(stacklevel) + for k, v in tx.symbolic_locals.items(): + print(f"{k} = {v}", file=file) + + def print_bt(self, *, file=None, stacklevel=0): + """ + Print the user code backtrace, starting at the beginning of the + frame Dynamo started evaluating. Note that this MAY NOT go all + the way to the torch.compile invocation, as we may have done + a graph break and are compiling an intermediate frame as the + starting point. If you think the other behavior would be better, + file a bug at https://github.com/pytorch/pytorch/ + """ + stack = [] + tx = self.__get_tx(stacklevel) + while tx is not None: + stack.append(tx.frame_summary()) + tx = getattr(tx, "parent", None) + print( + "".join(traceback.StackSummary.from_list(reversed(stack)).format()), + file=file, + ) + + def print_guards(self, *, file=None): + """ + Print the currently installed guards for the Dynamo context. + This does NOT include guards associated with variables that + may or may not be installed in the future if those variables + are used. + """ + # TODO: improve print format, current guard format is extremely + # verbose + print( + "\n".join(f"{repr(guard)}" for guard in sorted(self.__tx.output.guards)), + file=file, + ) + + def _i_will_not_complain_if_bc_breaks_InstructionTranslator(self): + """ + Returns the internal data structure InstructionTranslator that Dynamo + uses to track state of symbolic evaluation. There are no BC + guarantees on this API and WE RESERVE THE RIGHT TO BREAK YOUR CODE if + you rely on it. 
+ """ + return self.__tx + + +class _Comptime: + @staticmethod + def __call__(fn): + """fn gets called at compile time in TorchDynamo, does nothing otherwise""" + return + + # Convenience wrappers that are more compact to use + + @staticmethod + def graph_break(): + comptime(lambda ctx: ctx.graph_break()) + + @staticmethod + def print_graph(): + comptime(lambda ctx: ctx.print_graph()) + + @staticmethod + def print_disas(*, stacklevel=0): + comptime( + lambda ctx: ctx.print_disas( + stacklevel=ctx.get_local("stacklevel").as_python_constant() + 1 + ) + ) + + @staticmethod + def print_value_stack(*, stacklevel=0): + comptime( + lambda ctx: ctx.print_value_stack( + stacklevel=ctx.get_local("stacklevel").as_python_constant() + 1 + ) + ) + + # This is a more useful variant of print_value_stack that can be used + # in an expression context; e.g., x + print_value_stack_and_return(y + z), + # you will see x on the stack prior to the addition operation + @staticmethod + def print_value_stack_and_return(e, *, stacklevel=0): + comptime( + lambda ctx: ctx.print_value_stack( + stacklevel=ctx.get_local("stacklevel").as_python_constant() + 1 + ) + ) + return e + + @staticmethod + def print_locals(*, stacklevel=0): + comptime( + lambda ctx: ctx.print_locals( + stacklevel=ctx.get_local("stacklevel").as_python_constant() + 1 + ) + ) + + @staticmethod + def print_bt(*, stacklevel=0): + comptime( + lambda ctx: ctx.print_bt( + stacklevel=ctx.get_local("stacklevel").as_python_constant() + 1 + ) + ) + + @staticmethod + def print_guards(): + comptime(lambda ctx: ctx.print_guards()) + + @staticmethod + def assert_static(val): + comptime(lambda ctx: ctx.assert_static(ctx.get_local("val"))) + + @staticmethod + def force_static(val): + comptime(lambda ctx: ctx.get_local("val").force_static()) + + @staticmethod + def breakpoint(): + """ + Like pdb breakpoint(), but drop into pdb whenever this line + of code is compiled by dynamo. 
Use it by putting + this in your model code:: + + from torch._dynamo.comptime import comptime + comptime.breakpoint() + + And then, inside pdb, you can access 'ctx' to query things + about the compilation context:: + + (Pdb) !ctx.print_bt() + (Pdb) !ctx.print_locals() + (Pdb) p ctx.get_local("attention").as_fake() + """ + + def inner(inner_ctx): + ctx = inner_ctx.parent() + builtins.breakpoint() + + comptime(inner) + + +comptime = _Comptime() diff --git a/MLPY/Lib/site-packages/torch/_dynamo/config.py b/MLPY/Lib/site-packages/torch/_dynamo/config.py new file mode 100644 index 0000000000000000000000000000000000000000..cc707e2cb92a7e0533bfb5db217c1c885ab1c012 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/config.py @@ -0,0 +1,423 @@ +import getpass +import inspect +import os +import re +import sys +import tempfile +from os.path import abspath, dirname +from typing import Any, Callable, Dict, Optional, Set, Type, TYPE_CHECKING, Union + +import torch + +# to configure logging for dynamo, aot, and inductor +# use the following API in the torch._logging module +# torch._logging.set_logs(dynamo=, aot=, inductor) +# or use the environment variable TORCH_LOGS="dynamo,aot,inductor" (use a prefix + to indicate higher verbosity) +# see this design doc for more detailed info +# Design doc: https://docs.google.com/document/d/1ZRfTWKa8eaPq1AxaiHrq4ASTPouzzlPiuquSBEJYwS8/edit# +# the name of a file to write the logs to +# [@compile_ignored: debug] +log_file_name: Optional[str] = None + +# [@compile_ignored: debug] Verbose will print full stack traces on warnings and errors +verbose = os.environ.get("TORCHDYNAMO_VERBOSE", "0") == "1" + +# [@compile_ignored: runtime_behaviour] verify the correctness of optimized backend +verify_correctness = False + +# need this many ops to create an FX graph +minimum_call_count = 1 + +# turn on/off DCE pass +dead_code_elimination = True + +# disable (for a function) when cache reaches this size + +# controls the maximum number of cache entries with a guard on same ID_MATCH'd +# object. It also controls the maximum size of cache entries if they don't have +# any ID_MATCH'd guards. +# [@compile_ignored: runtime_behaviour] +cache_size_limit = 8 + +# [@compile_ignored: runtime_behaviour] controls the maximum number of entries for a code object. +accumulated_cache_size_limit = 64 + +# whether or not to specialize on int inputs. This only has an effect with +# dynamic_shapes; when dynamic_shapes is False, we ALWAYS specialize on int +# inputs. Note that assume_static_by_default will also cause ints to get +# specialized, so this is mostly useful for export, where we want inputs +# to be dynamic, but accesses to ints should NOT get promoted into inputs. +specialize_int = False + +# legacy config, does nothing now! +dynamic_shapes = True + +use_lazy_graph_module = ( + os.environ.get("TORCH_COMPILE_USE_LAZY_GRAPH_MODULE", "1") == "1" +) + +# This is a temporarily flag, which changes the behavior of dynamic_shapes=True. +# When assume_static_by_default is True, we only allocate symbols for shapes marked dynamic via mark_dynamic. +# NOTE - this flag can be removed once we can run dynamic_shapes=False w/ the mark_dynamic API +# see [Note - on the state of mark_dynamic] +assume_static_by_default = True + +# This flag changes how dynamic_shapes=True works, and is meant to be used in conjunction +# with assume_static_by_default=True. 
+# With this flag enabled, we always compile a frame as fully static for the first time, and, if we fail +# any guards due to wobbles in shape, we recompile with *all* the wobbled shapes as being marked dynamic. +automatic_dynamic_shapes = True + +# This flag changes how the shapes of parameters are treated. +# If this flag is set to True, then the shapes of torch.nn.Parameter as well as of torch.Tensor are attempted to be dynamic +# If this flag is set to False, then the shapes of torch.nn.Parameter are assumed to be static, +# while the shapes of torch.Tensor are assumed to be dynamic. +force_parameter_static_shapes = True + +# This flag ensures that the shapes of a nn module are always assumed to be static +# If the flag is set to True, then the shapes of a nn.module are assumed to be static +# If the flag is set to False, then the shapes of a nn.module can be dynamic +force_nn_module_property_static_shapes = True + +# Typically, if you mark_dynamic a dimension, we will error if the dimension +# actually ended up getting specialized. This knob changes the behavior so +# that we don't error at all. This is helpful for our CI where I'm using a +# heuristic to mark batch dimensions as dynamic and the heuristic may get it +# wrong. +allow_ignore_mark_dynamic = False + +# Set this to False to assume nn.Modules() contents are immutable (similar assumption as freezing) +guard_nn_modules = False + +# Uses CPython internal dictionary tags to detect mutation. There is some +# overlap between guard_nn_modules_using_dict_tags and guard_nn_modules flag. +# guard_nn_modules unspecializes the nn module instance and adds guard for each +# relevant member of the nn modules. On the other hand, +# guard_nn_modules_using_dict_tags specializes on each nn module instance but +# uses low overhead dict version matching to detect mutations, obviating the +# need to guard on members of the nn modules. With +# guard_nn_modules_using_dict_tags, the guard_nn_modules is not really required +# but kept around for debugging and discussing unspecializing nn module +# variables. +# TODO(janimesh, voz): Remove both of these flags (or atleast guard_nn_modules) +# once we have reached stability for the guard_nn_modules_using_dict_tags. +guard_nn_modules_using_dict_tags = True + +# This feature doesn't really work. We offer this flag for experimental +# purposes / if you want to help us build out support. +# +# torchdynamo has very limited support for tensor subclasses that implement +# __torch_function__. Our current support is limited to tensor subclasses +# that DO NOT store metadata on the tensor (in general, dynamo does not +# support Python code that stores extra attributes on tensors at present). +# If your tensor subclass purely changes function call behavior via +# __torch_function__, you can allow torchdynamo to trace into it by +# adding it to traceable_tensor_subclasses. We don't do any safety checks, +# so it is up to you to ensure that your subclass is well behaved. See also +# https://github.com/pytorch/torchdynamo/issues/1948 +# +# We do NOT currently support __torch_dispatch__. The implementation is +# currently buggy, the main show stopper for nontrivial use is +# https://github.com/pytorch/torchdynamo/issues/1952 +traceable_tensor_subclasses: Set[Type[Any]] = set() + +# Suppress errors in torch._dynamo.optimize, instead forcing a fallback to eager. +# This is a good way to get your model to work one way or another, but you may +# lose optimization opportunities this way. 
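# --- illustrative sketch (not part of the vendored file) ---------------------
# The flags above interact with dynamic shapes roughly as follows: with
# assume_static_by_default=True, a dimension only becomes symbolic if it is
# marked dynamic (or, with automatic_dynamic_shapes=True, after a recompile
# observes it changing). A hedged usage example:

import torch
import torch._dynamo

@torch.compile(backend="eager")
def f(x):
    return x.sum(dim=0)

x = torch.randn(8, 4)
torch._dynamo.mark_dynamic(x, 0)  # request a symbolic size on dim 0 up front
f(x)
f(torch.randn(16, 4))             # the same compiled graph should be reusable here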
Devs, if your benchmark model is failing +# this way, you should figure out why instead of suppressing it. +suppress_errors = bool(os.environ.get("TORCHDYNAMO_SUPPRESS_ERRORS", False)) + +# Record and write an execution record of the current frame to a file +# if an exception is encountered +# @compile_ignored[debug] +replay_record_enabled = os.environ.get("TORCH_COMPILE_DEBUG", "0") == "1" + +# Rewrite assert statement in python with torch._assert +rewrite_assert_with_torch_assert = True + +# Disable dynamo +disable = os.environ.get("TORCH_COMPILE_DISABLE", False) + +# [@compile_ignored: runtime_behaviour] Get a cprofile trace of Dynamo +cprofile = os.environ.get("TORCH_COMPILE_CPROFILE", False) + +# legacy config, does nothing now! +skipfiles_inline_module_allowlist: Dict[Any, Any] = {} + +# If a string representing a PyTorch module is in this ignorelist, +# the `allowed_functions.is_allowed` function will not consider it +# when creating a list of PyTorch functions that will appear in +# FX IR. +allowed_functions_module_string_ignorelist = { + "torch.distributions", + "torch.testing", + "torch._refs", + "torch._prims", + "torch._decomp", +} + +# Debug Flag to try minifier at different stages. Possible values are {None, "aot", "dynamo"} +# None - Minifier is switched off +# dynamo - Runs minifier on the TorchDynamo produced graphs, if compilation fails +# aot - Runs minifier on the Aot Autograd produced graphs, if compilation fails +# [@compile_ignored: debug] +repro_after = os.environ.get("TORCHDYNAMO_REPRO_AFTER", None) + +# Compiler compilation debug info +# 1: Dumps the original graph out to repro.py if compilation fails +# 2: Dumps a minifier_launcher.py if compilation fails. +# 3: Always dumps a minifier_launcher.py. Good for segfaults. +# 4: Dumps a minifier_launcher.py if the accuracy fails. +# [@compile_ignored: debug] +repro_level = int(os.environ.get("TORCHDYNAMO_REPRO_LEVEL", 2)) + +# By default, we try to detect accuracy failure by running both forward +# and backward of a torchdynamo produced graph (if you are using repro_after +# 'dynamo'). This setting forces us to only test the forward graph and +# not the backward graph. This can be helpful if you're trying to debug +# an inference only problem, but the minifier seems to be choking on the +# backwards step +# TODO: Detect this situation automatically so the user doesn't need +# to manually configure this +# [@compile_ignored: debug] +repro_forward_only = os.environ.get("TORCHDYNAMO_REPRO_FORWARD_ONLY") == "1" + +# The tolerance we should use when testing if a compiled graph +# has diverged so that we should treat it as an accuracy failure +# [@compile_ignored: debug] +repro_tolerance = 1e-3 + +# If True, when testing if two models are the same, we will test them against +# a third fp64 reference and only report a problem if the RMSE relative to the +# fp64 is greater. However, this will use more memory; you may disable this +# if memory usage is too high. +# [@compile_ignored: runtime_behaviour] +same_two_models_use_fp64 = True + +# Not all backends support scalars. Some calls on torch.Tensor (like .item()) return a scalar type. +# When this flag is set to False, we introduce a graph break instead of capturing. +# This requires dynamic_shapes to be True. +capture_scalar_outputs = False + +# Not all backends support operators that have dynamic output shape (e.g., +# nonzero, unique). When this flag is set to False, we introduce a graph +# break instead of capturing. This requires dynamic_shapes to be True. 
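# --- illustrative sketch (not part of the vendored file) ---------------------
# The repro_* knobs above drive the minifier. They can be set through the
# environment (TORCHDYNAMO_REPRO_AFTER / TORCHDYNAMO_REPRO_LEVEL) or assigned
# directly on the config module; the values below simply mirror the comments
# above.

import torch._dynamo.config as dynamo_config

dynamo_config.repro_after = "dynamo"  # minify TorchDynamo-produced graphs on failure
dynamo_config.repro_level = 4         # also dump minifier_launcher.py on accuracy failures
dynamo_config.repro_tolerance = 1e-3  # divergence threshold for "accuracy failure"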
+# If you set this to True, you probably also want capture_scalar_outputs +# (these are separated for historical reasons). +capture_dynamic_output_shape_ops = False + +# By default, dynamo will treat all ints as backed SymInts, which means (1) it +# will wait to see the int change over multiple runs before generalizing and +# (2) it will still always 0/1 specialize an int. When true, this knob +# forces dynamo to treat _length_per_key and _offset_per_key on +# KeyedJaggedTensor from torchrec as size-like unbacked SymInts, so that +# they (1) generalize immediately and (2) unsoundly never compare equal to +# 0/1. This is not on by default as AOTAutograd/Inductor cannot currently +# compile this code; however, this can be useful for export. +force_unspec_int_unbacked_size_like_on_torchrec_kjt = False + +# Should almost always be true in prod. This relaxes the requirement that cond's true_fn and +# false_fn produces code with identical guards. +enforce_cond_guards_match = True + +# Specify how to optimize a compiiled DDP module. The flag accepts a bollean +# value or a string. There are 4 modes. +# 1. "ddp_optimizer" (or True): with "ddp_ptimizer", Dynamo will automatically +# split model graph into pieces to match DDP bucket sizes to allow DDP +# comm/compute overlap. +# 2. "python_reducer" (experimental): this optimization requires the usage +# of compiled_autograd. With "python_reducer", DDP will disable the C++ reducer +# and use the Python reducer to allow compiled_autograd to trace the +# communication and allow comm/compute overlap without graph-breaks. +# 3. "python_reducer_without_compiled_forward" (experimental): this mode is +# similar to "python_reducer". One should only use this optimization mode +# when compiled_autograd is used but the DDP module is not compiled. +# 4. "no_optimization" (or False): Dynamo won't split the model graph, nor +# will Python reducer be used. With this mode, there will be no graph-breaks +# and the original DDP C++ reducer will be used. There will no comm/compute +# overlap. This mode CANNOT be used with compiled_autograd. +# Note that to avoid breaking the existing usage, mode 1 and mode 4 can be +# specified with a boolean value. True is using ddp_optimizer and False is +# no optimization. +optimize_ddp: Union[bool, str] = True + +_ddp_optimization_mode = [ + "ddp_optimizer", + "python_reducer", # experimental mode + "python_reducer_without_compiled_forward", # experimental mode + "no_optimization", +] + + +def _get_optimize_ddp_mode(): + m = sys.modules[__name__] + if isinstance(m.optimize_ddp, bool): + if m.optimize_ddp: + mode = "ddp_optimizer" + else: + mode = "no_optimization" + elif isinstance(m.optimize_ddp, str): + mode = m.optimize_ddp + else: + raise ValueError(f"Invalid type, {type(optimize_ddp)=}") + + assert mode in m._ddp_optimization_mode, f"Invalid mode {mode=}" + return mode + + +# If True, delays DDPOptimizer submodule compilation to 1st run of the model, +# so that real tensor strides are used in all submodules +# (instead of using FakeTensor strides which can differ from real tensor strides and causes error in some cases). +# This feature is not hardened yet and it's known to cause issues to some models, so False by default. 
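# --- illustrative sketch (not part of the vendored file) ---------------------
# optimize_ddp accepts either the legacy booleans or one of the string modes in
# _ddp_optimization_mode; _get_optimize_ddp_mode() normalizes both spellings.
# Hedged examples of the accepted values:

import torch._dynamo.config as dynamo_config

dynamo_config.optimize_ddp = True              # same as "ddp_optimizer"
dynamo_config.optimize_ddp = False             # same as "no_optimization"
dynamo_config.optimize_ddp = "python_reducer"  # experimental; pair with compiled autograd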
+optimize_ddp_lazy_compile = False + +# Whether to skip guarding on FSDP-managed modules +skip_fsdp_guards = True + +# Make dynamo skip guarding on hooks on nn modules +# Note: unsafe: if your model actually has hooks and you remove them, or doesn't and you add them, +# dynamo will not notice and will execute whichever version you first compiled. +skip_nnmodule_hook_guards = True + +# If True, raises exception if TorchDynamo is called with a context manager +raise_on_ctx_manager_usage = True + +# If True, raise when aot autograd is unsafe to use +raise_on_unsafe_aot_autograd = False + +# If true, error if you torch.jit.trace over a dynamo-optimized function. +# If false, silently suppress dynamo +error_on_nested_jit_trace = True + +# If true, error with a better message if we symbolically trace over a +# dynamo-optimized function. If false, silently suppress dynamo. +error_on_nested_fx_trace = True + +# Disables graph breaking on rnn. YMMV with backends. +allow_rnn = False + +# If true, error if we try to compile a function that has +# been seen before. +# [@compile_ignored: runtime_behaviour] +error_on_recompile = False + +# [@compile_ignored: debug] Whether to report any guard failures (deprecated: does not do anything) +report_guard_failures = True + +# [@compile_ignored: debug] root folder of the project +base_dir = dirname(dirname(dirname(abspath(__file__)))) + +# Trace through NumPy or graphbreak +trace_numpy = True + +# Trace through torch.distributed code +trace_distributed = False + +# Default NumPy dtypes when tracing with torch.compile +# We default to 64bits. For efficiency, one may want to change these to float32 +numpy_default_float = "float64" +numpy_default_complex = "complex128" +numpy_default_int = "int64" + +# use numpy's PRNG if True, pytorch otherwise +use_numpy_random_stream = False + + +def is_fbcode(): + return not hasattr(torch.version, "git_version") + + +def default_debug_dir_root(): + # [@compile_ignored: debug] + DEBUG_DIR_VAR_NAME = "TORCH_COMPILE_DEBUG_DIR" + if DEBUG_DIR_VAR_NAME in os.environ: + return os.path.join(os.environ[DEBUG_DIR_VAR_NAME], "torch_compile_debug") + elif is_fbcode(): + return os.path.join( + tempfile.gettempdir(), getpass.getuser(), "torch_compile_debug" + ) + else: + return os.path.join(os.getcwd(), "torch_compile_debug") + + +# [@compile_ignored: debug] +debug_dir_root = default_debug_dir_root() + +# [@compile_ignored: debug] +_save_config_ignore = { + "repro_after", + "repro_level", + # workaround: "cannot pickle PyCapsule" + "constant_functions", + # workaround: "cannot pickle module" + "skipfiles_inline_module_allowlist", +} + +# for backend="cudagraphs", mutations on input be sent to the cudagraph backend +# or replayed in aot_autograd epilogue. default is False because mutation on inputs +# can prevent cudagraphing. +cudagraph_backend_keep_input_mutation = False + +# When True, only ops that have the torch.Tag.pt2_compliant tag +# will be allowed into the graph; all other ops will be disallowed +# and will fall back to eager-mode PyTorch. Useful to ensure +# correctness of custom ops. +only_allow_pt2_compliant_ops = False + +capture_autograd_function = True + +# enable/disable dynamo tracing for `torch.func` transforms +capture_func_transforms = False + +# enable/disable user-defined triton kernel optimizations +optimize_user_defined_triton_kernels = True + +# If to log Dynamo compilation metrics into log files (for OSS) and Scuba tables (for fbcode). 
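# --- illustrative sketch (not part of the vendored file) ---------------------
# debug_dir_root above decides where TORCH_COMPILE_DEBUG artifacts land. It can
# be steered through the TORCH_COMPILE_DEBUG_DIR environment variable (read at
# import time) or by assigning the config value afterwards; the path below is
# an example, not a default.

import os
import torch._dynamo.config as dynamo_config

os.environ["TORCH_COMPILE_DEBUG_DIR"] = "/tmp/my_debug_dir"
dynamo_config.debug_dir_root = "/tmp/my_debug_dir/torch_compile_debug"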
+log_compilation_metrics = True + +# A set of logging functions which will be reordered to the end of graph breaks, +# allowing dynamo to construct larget graph. Note that there are some +# limitations to this, such as how it does not correctly print objects that were +# mutated after the print statement. +reorderable_logging_functions: Set[Callable[[Any], None]] = set() + +# simulates what would happen if we didn't have support for BUILD_SET opcode, +# used for testing +inject_BUILD_SET_unimplemented_TESTING_ONLY = False + +_autograd_backward_strict_mode_banned_ops = [ + "stride", + "requires_grad", + "storage_offset", + "layout", + "data", +] + +_autograd_backward_strict_mode_banned_ops.extend( + [name for name, _ in inspect.getmembers(torch.Tensor) if re.match(r"^is_.*", name)] +) + +# Enables caching of dispatches to fake tensors. +fake_tensor_cache_enabled = ( + os.environ.get("TORCH_FAKE_TENSOR_DISPATCH_CACHE", "1") == "1" +) + +# Enables cross checking between the fake tensor cache and dispatch. +fake_tensor_cache_crosscheck_enabled = ( + os.environ.get("TORCH_FAKE_TENSOR_DISPATCH_CACHE_CROSSCHECK", "0") == "1" +) + +# support `context_fn` in torch.utils.checkpoint.checkpoint API under torch.compile(). +# WARNING: this is an experimental flag and is subject to change. +_experimental_support_context_fn_in_torch_utils_checkpoint = False + +if TYPE_CHECKING: + from torch.utils._config_typing import * # noqa: F401, F403 + + def _make_closure_patcher(**changes): + ... + + +from torch.utils._config_module import install_config_module + +install_config_module(sys.modules[__name__]) diff --git a/MLPY/Lib/site-packages/torch/_dynamo/convert_frame.py b/MLPY/Lib/site-packages/torch/_dynamo/convert_frame.py new file mode 100644 index 0000000000000000000000000000000000000000..0b325df31077d93e5ecddce7a73af613c3a64301 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/convert_frame.py @@ -0,0 +1,924 @@ +import collections +import dis +import functools +import itertools +import logging +import os +import random +import sys +import threading +import time +import traceback +import types +import typing +import weakref +from typing import Any, Callable, Dict, List, Optional, Set + +from torch.fx._lazy_graph_module import ( # type: ignore[attr-defined] + _use_lazy_graph_module, +) + +try: + import numpy as np +except ModuleNotFoundError: + np = None # type: ignore[assignment] + +import torch +import torch._logging +from torch._guards import compile_context, CompileContext, CompileId, tracing +from torch._logging import structured +from torch._utils_internal import signpost_event +from torch.fx.experimental.symbolic_shapes import ( + ConstraintViolationError, + GuardOnDataDependentSymNode, +) +from torch.fx.graph_module import _forward_from_src as original_forward_from_src +from torch.nn.parallel.distributed import DistributedDataParallel +from torch.utils._python_dispatch import _disable_current_modes +from torch.utils._traceback import format_traceback_short + +from . 
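# --- illustrative sketch (not part of the vendored file) ---------------------
# install_config_module() above turns this plain module into a managed config
# object, which is what makes helpers such as patch() available. Typical
# (hedged) usage is as a context manager or decorator that temporarily
# overrides flags:

import torch._dynamo

with torch._dynamo.config.patch(error_on_recompile=True, cache_size_limit=4):
    pass  # code compiled here sees the overridden values

@torch._dynamo.config.patch(suppress_errors=True)
def run_model():
    ...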
import config, exc, trace_rules +from .backends.registry import CompilerFn +from .bytecode_analysis import remove_dead_code, remove_pointless_jumps +from .bytecode_transformation import ( + check_inst_exn_tab_entries_valid, + Instruction, + is_generator, + propagate_inst_exn_table_entries, + transform_code_object, +) +from .cache_size import ( + CacheSizeRelevantForFrame, + compute_cache_size, + exceeds_cache_size_limit, + is_recompilation, +) +from .eval_frame import always_optimize_code_objects, skip_code, TorchPatcher +from .exc import ( + augment_exc_message, + BackendCompilerFailed, + format_error_msg, + InternalTorchDynamoError, + TorchRuntimeError, + UncapturedHigherOrderOpError, + unimplemented, + Unsupported, +) +from .guards import ( + CheckFunctionManager, + get_and_maybe_log_recompilation_reason, + GuardedCode, +) +from .hooks import Hooks +from .output_graph import OutputGraph +from .replay_record import ExecutionRecord +from .symbolic_convert import InstructionTranslator, SpeculationLog +from .trace_rules import is_numpy +from .types import BytecodeHook +from .utils import ( + CleanupManager, + CompilationMetrics, + counters, + dynamo_timed, + format_bytecode, + frame_phase_timing, + gen_record_file_name, + increment_frame, + is_namedtuple, + istype, + LazyString, + maybe_cprofile, + orig_code_map, + record_compilation_metrics, + reset_graph_break_dup_checker, + setup_compile_debug, + troubleshooting_url, + write_record_to_file, +) + +log = logging.getLogger(__name__) +bytecode_log = torch._logging.getArtifactLogger(__name__, "bytecode") +GlobalStateGuard = torch._C._dynamo.guards.GlobalStateGuard + +compile_lock = threading.RLock() + + +class Tracker: + def __init__(self): + self.seen = [] + self.seen_ids = set() + + def add(self, strong_obj): + idx = id(strong_obj) + if idx not in self.seen_ids: + obj = weakref.ref(strong_obj, lambda _: self.seen_ids.remove(idx)) + self.seen.append(obj) + self.seen_ids.add(idx) + + def __contains__(self, item): + return id(item) in self.seen_ids + + def clear(self): + self.seen.clear() + self.seen_ids.clear() + + +input_codes = Tracker() +output_codes = Tracker() + +initial_global_state: Optional[GlobalStateGuard] = None + + +@functools.wraps(original_forward_from_src) +def fx_forward_from_src_skip_result(*args, **kwargs): + # we monkey patch FX to prevent infinite loop of trying to convert + # our generated code + result: types.FunctionType = original_forward_from_src(*args, **kwargs) + skip_code(result.__code__) + return result + + +def preserve_global_state(fn): + """ + Context manager to: + 1) Save/restore torch.is_grad_enabled() state + 2) Save/restore python random state + 3) Save/restore torch random state + 4) Monkey patch torch.fx.graph_module._forward_from_src + """ + + @functools.wraps(fn) + def _fn(*args, **kwargs): + guards = GlobalStateGuard() + prior_grad_mode = torch.is_grad_enabled() + prior_inference_mode = torch.is_inference_mode_enabled() + prior_deterministic = torch.are_deterministic_algorithms_enabled() + prior_warn_only = torch.is_deterministic_algorithms_warn_only_enabled() + py_rng_state = random.getstate() + torch_rng_state = torch.random.get_rng_state() + if torch.cuda.is_available(): + cuda_rng_state = torch.cuda.get_rng_state() + prior_fwd_from_src = torch.fx.graph_module._forward_from_src + torch.fx.graph_module._forward_from_src = fx_forward_from_src_skip_result + cleanup = setup_compile_debug() + try: + return fn(*args, **kwargs) + finally: + cleanup.close() + torch._C._set_grad_enabled(prior_grad_mode) + 
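# --- illustrative sketch (not part of the vendored file) ---------------------
# preserve_global_state() above snapshots grad mode, RNG state, and more before
# tracing and restores everything afterwards so compilation is side-effect
# free. The core save/restore pattern, stripped down to just the RNG pieces:

import functools
import random

import torch

def preserve_rng_state(fn):
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        py_state = random.getstate()
        torch_state = torch.random.get_rng_state()
        try:
            return fn(*args, **kwargs)
        finally:
            random.setstate(py_state)             # undo any random.* calls in fn
            torch.random.set_rng_state(torch_state)
    return wrapper

@preserve_rng_state
def noisy():
    return random.random() + torch.rand(()).item()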
torch.torch.autograd.grad_mode._enter_inference_mode(prior_inference_mode) + torch.use_deterministic_algorithms( + prior_deterministic, warn_only=prior_warn_only + ) + random.setstate(py_rng_state) + torch.random.set_rng_state(torch_rng_state) + if torch.cuda.is_available(): + torch.cuda.set_rng_state(cuda_rng_state) # type: ignore[possibly-undefined] + torch.fx.graph_module._forward_from_src = prior_fwd_from_src + assert ( + guards.check() + ), "Global state changed while dynamo tracing, please report a bug" + + _fn._torchdynamo_orig_callable = fn # type: ignore[attr-defined] + return _fn + + +@TorchPatcher.suppress_torch_distributed_warnings +def has_tensor_in_frame(frame): + """Check if the frame has torch.* related bits""" + # Check if the function was decorated using torch._dynamo.optimize + if frame.f_code in always_optimize_code_objects: + return True + + # Check if there is global import of torch.* + for co_name in frame.f_code.co_names: + if co_name in frame.f_globals: + obj = frame.f_globals[co_name] + if isinstance(obj, types.ModuleType) and ( + obj.__name__.startswith("torch.") or obj is torch + ): + return True + # ... or a global import of numpy.* + if np and config.trace_numpy and (obj is np or is_numpy(obj)): + return True + + seen_ids: Dict[int, bool] = dict() + + def has_tensor(obj): + """Recursively check if the obj has a tensor""" + obj_id = id(obj) + if obj_id in seen_ids: + return seen_ids[obj_id] + seen_ids[obj_id] = False + + if isinstance(obj, (torch.Tensor, torch.nn.Module)) or ( + istype(obj, type) and issubclass(obj, torch.nn.Module) + ): + seen_ids[obj_id] = True + return seen_ids[obj_id] + elif ( + config.trace_numpy + and np + and (istype(obj, np.ndarray) or isinstance(obj, np.generic)) + ): + seen_ids[obj_id] = True + return seen_ids[obj_id] + elif istype(obj, (list, tuple)): + seen_ids[obj_id] = any(has_tensor(v) for v in obj) + return seen_ids[obj_id] + elif istype(obj, dict): + # Some packages like pytest can be updated during runtime. 
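# --- illustrative sketch (not part of the vendored file) ---------------------
# has_tensor() above memoizes by id() and records a provisional "no tensor"
# answer before recursing, which is what keeps reference cycles from looping
# forever. The same idea in a standalone helper:

import torch

def contains_tensor(obj, _seen=None):
    if _seen is None:
        _seen = {}
    oid = id(obj)
    if oid in _seen:
        return _seen[oid]   # already answered, or currently being visited
    _seen[oid] = False      # provisional answer breaks cycles
    if isinstance(obj, torch.Tensor):
        _seen[oid] = True
    elif isinstance(obj, (list, tuple, set)):
        _seen[oid] = any(contains_tensor(v, _seen) for v in obj)
    elif isinstance(obj, dict):
        _seen[oid] = any(contains_tensor(v, _seen) for v in list(obj.values()))
    return _seen[oid]

cyclic = []
cyclic.append(cyclic)  # self-referencing list terminates cleanly
print(contains_tensor(cyclic), contains_tensor([1, {"w": torch.ones(2)}]))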
So, make a + # copy of values to avoid issues like "RuntimeError: dictionary + # changed size during iteration" + values = list(obj.values()) + seen_ids[obj_id] = any(has_tensor(v) for v in values) + return seen_ids[obj_id] + elif istype(obj, (str, int, float, type(None), bool)): + seen_ids[obj_id] = False + return seen_ids[obj_id] + elif is_namedtuple(obj) and hasattr(obj, "_fields"): + seen_ids[obj_id] = any(has_tensor(getattr(obj, v)) for v in obj._fields) + return seen_ids[obj_id] + else: + # if config.debug: + # print( + # f"Assuming that object of type {type(obj)} does not have a tensor" + # ) + return False + + # Check if the passed arguments are of type Tensor + for value in frame.f_locals.values(): + if has_tensor(value): + return True + + log.debug( + "skipping because no torch.* %s \ + %s %s", + frame.f_code.co_name, + frame.f_code.co_filename, + frame.f_code.co_firstlineno, + ) + + return False + + +def exception_handler(e, code, frame=None, export=False): + record_filename = None + if hasattr(e, "exec_record"): + record_filename = gen_record_file_name(e, code) + write_record_to_file(record_filename, e.exec_record) + e.record_filename = record_filename + + augment_exc_message(e, export=export) + + +FRAME_COUNTER = 0 +FRAME_COMPILE_COUNTER: typing.Counter[int] = collections.Counter() + + +def convert_frame_assert( + compiler_fn: CompilerFn, + one_graph: bool = True, + export: bool = False, + export_constraints=None, +): + """Fully convert a frame into an FX graph""" + reset_graph_break_dup_checker() + + def _convert_frame_assert( + frame: types.FrameType, cache_entry, hooks: Hooks, frame_state, *, skip: int = 0 + ): + increment_frame() + + code = frame.f_code + + cache_size = compute_cache_size(frame, cache_entry) + recompile_reasons = None + if is_recompilation(cache_size): + recompile_reasons = get_and_maybe_log_recompilation_reason( + cache_entry, frame + ) + + input_codes.add(code) + if code in output_codes: + return None + if ( + os.environ.get("TORCHDYNAMO_DEBUG_FUNCTION") + and os.environ.get("TORCHDYNAMO_DEBUG_FUNCTION") != code.co_name + ): + return None + if code.co_name == "" and code.co_filename.endswith( + ( + "transformers/file_utils.py", + "transformers/utils/generic.py", + "diffusers/utils/outputs.py", + ) + ): + # not needed, but cleans up torchbench error stats + return None + if code.co_name == "__setattr__": + # setattr could be tricky to handle generally, + # but also not likely useful to compile- skip the whole frame + return None + if code.co_name == "__init__" and code.co_filename.startswith( + os.path.dirname(torch.optim.__file__) + ): + # optimizer support is still incomplete see + # test_state_dict in test/dynamo/test_optimizers.py + return None + + # Check if the frame is generated by an exec builtin call + # TODO - Running exec generated frame seems propagates f_globals to the + # next frames. + if code.co_name == "" and code.co_filename == "": + return None + + if ( + code.co_name == "" + and code.co_filename == "" + and not bool(frame.f_builtins) + ): + # namedtuple subclass constructor. Empty builtins cause issue with + # len keyword in LIST_LEN guard. + return None + + if is_generator(code): + unimplemented("generator") + exceeded, limit_type = exceeds_cache_size_limit(cache_size) + if exceeded: + + def format_func_info(code): + return f"'{code.co_name}' ({code.co_filename}:{code.co_firstlineno})" + + def format_guard_failures(): + assert recompile_reasons, "TODO(whc) any other recompile reasons?" 
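# --- illustrative sketch (not part of the vendored file) ---------------------
# When exceeds_cache_size_limit() trips in the code above, Dynamo gives up on
# the frame and it runs eagerly. Two hedged knobs that are handy while
# debugging such cache blowups:

import torch
import torch._dynamo

torch._dynamo.config.cache_size_limit = 16  # tolerate more guarded variants per code object
torch._dynamo.reset()                       # drop existing cache entries and recompile from scratch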
+ return recompile_reasons[-1] + + log.warning( + "torch._dynamo hit config.%s (%s)\n" + " function: %s\n" + " last reason: %s\n" + 'To log all recompilation reasons, use TORCH_LOGS="recompiles".\n' + "To diagnose recompilation issues, see %s.", + limit_type, + getattr(config, limit_type), + format_func_info(code), + format_guard_failures(), + troubleshooting_url, + ) + unimplemented(f"{limit_type} reached") + + if not has_tensor_in_frame(frame): + return None + + global initial_global_state + initial_global_state = GlobalStateGuard() + + global FRAME_COUNTER + if "_id" not in frame_state: + frame_state["_id"] = FRAME_COUNTER + FRAME_COUNTER += 1 + frame_id = frame_state["_id"] + + frame_compile_id = FRAME_COMPILE_COUNTER[frame_id] + FRAME_COMPILE_COUNTER[frame_id] += 1 + + compile_id = CompileId(frame_id, frame_compile_id) + + signpost_event( + "dynamo", + "_convert_frame_assert._compile", + { + "co_name": code.co_name, + "co_filename": code.co_filename, + "co_firstlineno": code.co_firstlineno, + "cache_size": cache_size.num_cache_entries_with_same_id_matched_objs, + "accumulated_cache_size": cache_size.num_cache_entries, + }, + ) + + return _compile( + frame.f_code, + frame.f_globals, + frame.f_locals, + frame.f_builtins, + compiler_fn, + one_graph, + export, + export_constraints, + hooks, + cache_size, + frame, + frame_state=frame_state, + compile_id=compile_id, + skip=skip + 1, + ) + + _convert_frame_assert._torchdynamo_orig_callable = compiler_fn # type: ignore[attr-defined] + + def _clone_with_backend(backend): + return convert_frame_assert(backend, one_graph, export, export_constraints) + + _convert_frame_assert._clone_with_backend = _clone_with_backend # type: ignore[attr-defined] + return _convert_frame_assert + + +from collections import OrderedDict + +from torch.utils.hooks import RemovableHandle + +# we have to use `OrderedDict` to make `RemovableHandle` work. +_bytecode_hooks: Dict[int, BytecodeHook] = OrderedDict() + + +def register_bytecode_hook(hook: BytecodeHook) -> RemovableHandle: + """Register hooks for bytecode generated by Dynamo. The hook can do some + logging, as well as return a new code object to be used. Please refer + to `BytecodeHook` for the hook signature. 
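# --- illustrative usage sketch (not part of the vendored file) ---------------
# register_bytecode_hook() above takes a callable receiving the original and
# the Dynamo-transformed code objects; returning None keeps Dynamo's output,
# while returning a code object substitutes it. A hedged, logging-only example:

import torch
from torch._dynamo.convert_frame import register_bytecode_hook

def log_bytecode_hook(code, out_code):
    print(f"dynamo rewrote {code.co_name} ({code.co_filename}:{code.co_firstlineno})")
    return None  # do not replace the generated code

handle = register_bytecode_hook(log_bytecode_hook)
torch.compile(lambda x: x + 1, backend="eager")(torch.randn(3))
handle.remove()  # RemovableHandle unregisters the hook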
+ """ + handle = RemovableHandle(_bytecode_hooks) + _bytecode_hooks[handle.id] = hook + return handle + + +@_use_lazy_graph_module(config.use_lazy_graph_module) +@maybe_cprofile +def _compile( + code: types.CodeType, + globals: Dict[str, object], + locals: Dict[str, object], + builtins: Dict[str, object], + compiler_fn: CompilerFn, + one_graph: bool, + export: bool, + export_constraints, + hooks: Hooks, + cache_size: CacheSizeRelevantForFrame, + frame: Optional[types.FrameType] = None, + frame_state=None, + compile_id=None, + *, + skip: int = 0, +) -> Optional[GuardedCode]: + from torch.fx.experimental.validator import ( + bisect, + BisectValidationException, + translation_validation_enabled, + ValidationException, + ) + + output: Optional[OutputGraph] = None + tracer: Optional[InstructionTranslator] = None + # This is shared across restarts + mutated_closure_cell_contents: Set[str] = set() + speculation_log = SpeculationLog() + torch._dynamo.callback_handler.run_start_callbacks() + + @preserve_global_state + def transform(instructions, code_options): + nonlocal output + nonlocal tracer + speculation_log.restart() + tracer = InstructionTranslator( + instructions, + code, + locals, + globals, + builtins, + code_options, + compiler_fn, + one_graph, + export, + export_constraints, + mutated_closure_cell_contents, + frame_state=frame_state, + speculation_log=speculation_log, + ) + + try: + with tracing(tracer.output.tracing_context), tracer.set_current_tx(): + tracer.run() + except exc.UnspecializeRestartAnalysis: + speculation_log.clear() + raise + except (exc.SpeculationRestartAnalysis, exc.SkipFrame): + raise + except Exception: + if translation_validation_enabled(): + bisect(tracer.output.shape_env) + raise + finally: + tracer.output.call_cleanup_hooks() + + output = tracer.output + assert output is not None + assert output.output_instructions + instructions[:] = output.output_instructions + code_options.update(output.code_options) + + if config.dead_code_elimination: + propagate_inst_exn_table_entries(instructions) + check_inst_exn_tab_entries_valid(instructions) + instructions[:] = remove_pointless_jumps(remove_dead_code(instructions)) + + @dynamo_timed(phase_name="entire_frame_compile") + def compile_inner( + code: types.CodeType, + one_graph: bool, + hooks: Hooks, + transform: Callable[[List[Instruction], Dict[str, Any]], Any], + ) -> Optional[GuardedCode]: + nonlocal output + for attempt in itertools.count(): + CompileContext.get().attempt = attempt + try: + out_code = transform_code_object(code, transform) + break + except exc.RestartAnalysis as e: + log.info( + "Restarting analysis due to %s", + LazyString(format_traceback_short, e.__traceback__), + ) + if attempt > 100: + unimplemented("100+ RestartAnalysis() calls") + except exc.SkipFrame as e: + log.debug( + "Skipping frame %s %s \ + %s %s", + e, + code.co_name, + code.co_filename, + code.co_firstlineno, + ) + if one_graph: + log.debug("No graph captured with one_graph=True") + return None + + def log_bytecode(prefix, name, filename, line_no, code): + if bytecode_log.isEnabledFor(logging.DEBUG): + bytecode_log.debug( + format_bytecode(prefix, name, filename, line_no, code) + ) + + log_bytecode( + "ORIGINAL BYTECODE", + code.co_name, + code.co_filename, + code.co_firstlineno, + code, + ) + log_bytecode( + "MODIFIED BYTECODE", + code.co_name, + code.co_filename, + code.co_firstlineno, + out_code, # type: ignore[possibly-undefined] + ) + + for hook in _bytecode_hooks.values(): + hook_output = hook(code, out_code) + if hook_output is 
not None: + out_code = hook_output + + orig_code_map[out_code] = code + output_codes.add(out_code) + + assert output is not None + + # Tests for new code objects. + # The rationale for these tests can be found in torch/csrc/dynamo/eval_frame.c + # Only test once the code object is created. + # They are not tested during runtime. + + def count_args(code): + import inspect + + return ( + code.co_argcount + + code.co_kwonlyargcount + + bool(code.co_flags & inspect.CO_VARARGS) + + bool(code.co_flags & inspect.CO_VARKEYWORDS) + ) + + total_argcount_old = count_args(code) + total_argcount_new = count_args(out_code) + msg = "arg mismatch: " + msg += f"old code object has args {code.co_varnames[:total_argcount_old]}, " + msg += f"new code object has args {out_code.co_varnames[:total_argcount_new]}" + assert ( + code.co_varnames[:total_argcount_old] + == out_code.co_varnames[:total_argcount_new] + ), msg + + msg = "free var mismatch: " + msg += f"old code object has free var {code.co_freevars}, " + msg += f"new code object has free var {out_code.co_freevars}" + assert code.co_freevars == out_code.co_freevars, msg + + msg = "cell var mismatch: " + msg += f"old code object has cell var {code.co_cellvars}, " + msg += f"new code object has cell var {out_code.co_cellvars}" + assert code.co_cellvars == out_code.co_cellvars, msg + + # Skipping Dynamo on a frame without any extracted graph. + # This does not affect eager functionality. But this is necessary + # for export for cases where Dynamo-reconstructed bytecode can create + # new function frames, confusing export in thinking that there + # are extra graphs now. + + if output.export and output.is_empty_graph(): + return None + + assert output.guards is not None + CleanupManager.instance[out_code] = output.cleanups + check_fn = CheckFunctionManager( + output, + hooks.guard_fail_fn if hooks else None, + ) + + guarded_code = GuardedCode(out_code, check_fn.check_fn) + + if not output.is_empty_graph() and hooks.guard_export_fn is not None: + # We should not run the guard_export_fn when Dynamo does not + # generate any graph. This can happen in export when TorchDynamo + # generated bytecode has some reconstruction logic for mutated + # variables which can trigger TorchDynamo on the children frames but + # they are benign and do not generate any new graphs. 
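+            # Hand the accumulated guards to the user-provided export hook.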
+ hooks.guard_export_fn(output.guards) + + return guarded_code + + with compile_context(CompileContext(compile_id)): + log.debug( + "torchdynamo start compiling %s %s:%s, stack (elided %s frames):\n%s", + code.co_name, + code.co_filename, + code.co_firstlineno, + skip + 2, + # -2: omit current frame, omit contextlib decorator + "".join(traceback.format_list(traceback.extract_stack()[: -2 - skip])), + ) + # -4: -2 as above, plus trace_structured frames + torch._logging.trace_structured( + "dynamo_start", + lambda: { + "stack": structured.from_traceback( + traceback.extract_stack()[: -4 - skip] + ) + }, + ) + start_time = time.time() + fail_type: Optional[str] = None + fail_reason: Optional[str] = None + fail_user_frame_filename: Optional[str] = None + fail_user_frame_lineno: Optional[int] = None + try: + guarded_code = compile_inner(code, one_graph, hooks, transform) + return guarded_code + except ( + Unsupported, + TorchRuntimeError, + BackendCompilerFailed, + AssertionError, + ConstraintViolationError, + GuardOnDataDependentSymNode, + ValidationException, + UncapturedHigherOrderOpError, + BisectValidationException, + ) as e: + fail_type = str(type(e)) + fail_reason = str(e) + exception_handler(e, code, frame, export=export) + if e.innermost_user_frame_summary is not None: # type: ignore[union-attr] + fail_user_frame_filename = e.innermost_user_frame_summary.filename # type: ignore[union-attr] + fail_user_frame_lineno = e.innermost_user_frame_summary.lineno # type: ignore[union-attr] + raise + except Exception as e: + fail_type = str(type(e)) + fail_reason = str(e) + exception_handler(e, code, frame, export=export) + if e.innermost_user_frame_summary is not None: # type: ignore[attr-defined] + fail_user_frame_filename = e.innermost_user_frame_summary.filename # type: ignore[attr-defined] + fail_user_frame_lineno = e.innermost_user_frame_summary.lineno # type: ignore[attr-defined] + raise InternalTorchDynamoError(str(e)).with_traceback( + e.__traceback__ + ) from None + finally: + if tracer: + tracer.output.local_scope = {} + + from .utils import curr_frame + + frame_key = str(curr_frame) + if ( + fail_reason is None + and output is not None + and frame_key in frame_phase_timing + ): + guard_count = len(output.guards) + shape_env_guard_count = len(output.shape_env.guards) + graph_op_count = output.count_calls() + graph_node_count = len(output.graph.nodes) + graph_input_count = len(output.placeholders) + entire_frame_compile_time = frame_phase_timing[frame_key].get( + "entire_frame_compile", None + ) + backend_compile_time = frame_phase_timing[frame_key].get( + "backend_compile", None + ) + inductor_compile_time = frame_phase_timing[frame_key].get( + "inductor_compile", None + ) + code_gen_time = frame_phase_timing[frame_key].get("code_gen", None) + non_compliant_ops = {op.__qualname__ for op in output.non_compliant_ops} + compliant_custom_ops = { + op.__qualname__ for op in output.compliant_custom_ops + } + else: + guard_count = None + shape_env_guard_count = None + graph_op_count = None + graph_node_count = None + graph_input_count = None + entire_frame_compile_time = None + backend_compile_time = None + inductor_compile_time = None + code_gen_time = None + non_compliant_ops = set({}) + compliant_custom_ops = set({}) + metrics = CompilationMetrics( + frame_key, + code.co_name, + code.co_filename, + code.co_firstlineno, + cache_size.num_cache_entries_with_same_id_matched_objs, + cache_size.num_cache_entries, + guard_count, + shape_env_guard_count, + graph_op_count, + graph_node_count, + 
graph_input_count, + start_time, + entire_frame_compile_time, + backend_compile_time, + inductor_compile_time, + code_gen_time, + fail_type, + fail_reason, + fail_user_frame_filename, + fail_user_frame_lineno, + non_compliant_ops, + compliant_custom_ops, + ) + record_compilation_metrics(metrics) + torch._dynamo.callback_handler.run_end_callbacks() + + +def convert_frame(compiler_fn: CompilerFn, hooks: Hooks): + """Try to convert a frame into an FX graph, if error leave frame unmodified""" + inner_convert = convert_frame_assert(compiler_fn, one_graph=False) + + def _convert_frame( + frame: types.FrameType, cache_entry, hooks: Hooks, frame_state, skip: int = 0 + ): + counters["frames"]["total"] += 1 + try: + result = inner_convert( + frame, cache_entry, hooks, frame_state, skip=skip + 1 + ) + counters["frames"]["ok"] += 1 + return result + except Exception as e: + # These two exception types are "soft" failure, in the sense that + # we know this is due to something we didn't implement all the + # way, scare the user less about it. That being said, if you + # are trying to understand why a graph break happened, it's still + # important to have this information, so offer it. + # + # NB: NotImplementedError used to be on this list, but actually + # it is impossible for it to reach here, as it is converted into + # InternalTorchDynamoError. This behavior seemed reasonable + # to me (ezyang, Aug 2023) so I kept it, but maybe at some point + # someone wanted these to also get suppressed. If so, you'll + # need to make these exceptions not get wrapped + + # We intentionally don't want to suppress error here. + if isinstance(e, UncapturedHigherOrderOpError): + raise + + soft_fail = isinstance(e, Unsupported) + if not config.suppress_errors and not soft_fail: + raise + + # Suppress the error. NB: It's very important to do the + # suppression logging HERE, where the actual suppression + # happens. Previously it was somewhere else and so it was + # possible to accidentally not log at all. 
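+            # Build the user-facing error message (including the replay-record path, if one was attached to the exception) before logging it below.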
+            record_filename = getattr(e, "record_filename", None)
+            code = frame.f_code
+            error_msg = format_error_msg(e, code, record_filename, frame)
+
+            if soft_fail:
+                log.info(error_msg, exc_info=True)
+            else:
+                log.warning(error_msg, exc_info=True)
+            return None
+
+    _convert_frame._torchdynamo_orig_callable = compiler_fn  # type: ignore[attr-defined]
+    _convert_frame._clone_with_backend = lambda backend: convert_frame(backend, hooks)  # type: ignore[attr-defined]
+    return _convert_frame
+
+
+# TODO mlazos: add support for same args, or record them
+def replay(filename):
+    from .backends.debugging import eager
+
+    original_replay_val = config.replay_record_enabled
+    config.replay_record_enabled = False
+    with open(filename, "rb") as in_file:
+        record = ExecutionRecord.load(in_file)
+    record.globals = dict(itertools.chain(record.globals.items(), globals().items()))
+
+    try:
+        _compile(
+            record.code,
+            record.globals,
+            record.locals,
+            record.builtins,
+            compiler_fn=eager,
+            one_graph=False,
+            export=False,
+            export_constraints=None,
+            hooks=Hooks(),
+            cache_size=CacheSizeRelevantForFrame(0, 0),
+            frame=None,
+            frame_state={},
+        )
+    finally:
+        config.replay_record_enabled = original_replay_val
+
+
+def first_real_inst_idx(code):
+    if sys.version_info < (3, 11):
+        return 0
+    for inst in dis.get_instructions(code):
+        if inst.opname == "RESUME":
+            return inst.offset // 2
+    raise RuntimeError("RESUME instruction not found in code")
+
+
+def catch_errors_wrapper(callback, hooks: Hooks):
+    @functools.wraps(callback)
+    def catch_errors(frame, cache_entry, frame_state):
+        assert frame_state is not None
+
+        is_skipfile = trace_rules.check(frame.f_code)
+        if (
+            # TODO: the first condition is not covered by any test
+            frame.f_lasti >= first_real_inst_idx(frame.f_code)
+            or is_skipfile
+            or config.disable
+        ):
+            if log.isEnabledFor(logging.DEBUG):
+                skip_reason = (
+                    "traced frame already"
+                    if frame.f_lasti >= first_real_inst_idx(frame.f_code)
+                    else "in skipfiles"
+                    if trace_rules.check(frame.f_code)
+                    else "dynamo tracing is disabled"
+                )
+                if not is_skipfile or config.verbose:
+                    log.debug(
+                        "skipping: %s (reason: %s, file: %s)",
+                        frame.f_code.co_name,
+                        skip_reason,
+                        frame.f_code.co_filename,
+                    )
+            return None
+        if frame.f_code.co_filename == "<string>" and frame.f_code.co_name == "__new__":
+            # namedtuple constructor
+            return None
+        if config._get_optimize_ddp_mode() == "ddp_optimizer":
+            ddp_module = DistributedDataParallel._get_active_ddp_module()
+            if ddp_module:
+                with compile_lock:
+                    from torch._dynamo.backends.distributed import DDPOptimizer
+
+                    ddp_optimizer = DDPOptimizer(
+                        bucket_bytes_cap=ddp_module.bucket_bytes_cap,
+                        backend_compile_fn=callback._torchdynamo_orig_callable,
+                    )
+                    assert hasattr(
+                        callback, "_clone_with_backend"
+                    ), "DDPOptimizer only supports callback fns that know how to clone themselves."
+ hijacked_callback = callback._clone_with_backend( + ddp_optimizer.compile_fn, + ) + return hijacked_callback(frame, cache_entry, hooks, frame_state) + + with compile_lock, _disable_current_modes(): + # skip=1: skip this frame + return callback(frame, cache_entry, hooks, frame_state, skip=1) + + catch_errors._torchdynamo_orig_callable = callback # type: ignore[attr-defined] + return catch_errors diff --git a/MLPY/Lib/site-packages/torch/_dynamo/current_scope_id.py b/MLPY/Lib/site-packages/torch/_dynamo/current_scope_id.py new file mode 100644 index 0000000000000000000000000000000000000000..7a619bcbc9214c5b60788c05ccd45a3d2c1443f7 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/current_scope_id.py @@ -0,0 +1,23 @@ +import contextlib +import threading + +# Global variable to identify which SubgraphTracer we are in. +# It is sometimes difficult to find an InstructionTranslator to use. +_current_scope_id = threading.local() + + +def current_scope_id(): + global _current_scope_id + if not hasattr(_current_scope_id, "value"): + _current_scope_id.value = 1 + return _current_scope_id.value + + +@contextlib.contextmanager +def enter_new_scope(): + global _current_scope_id + try: + _current_scope_id.value = current_scope_id() + 1 + yield + finally: + _current_scope_id.value = current_scope_id() - 1 diff --git a/MLPY/Lib/site-packages/torch/_dynamo/debug_utils.py b/MLPY/Lib/site-packages/torch/_dynamo/debug_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..23ef196751642b159cfe15b63a55a57ffcbaa7cd --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/debug_utils.py @@ -0,0 +1,802 @@ +# mypy: disable-error-code="method-assign" + +import copy +import functools +import getpass +import inspect +import itertools +import logging +import os +import re +import subprocess +import tempfile +import textwrap +from collections import Counter +from importlib import import_module +from typing import Any, Callable, Dict, List, Optional, TypeVar + +import torch +import torch._prims_common as utils +import torch._subclasses.meta_utils +from torch import Tensor + +from torch._dynamo.testing import rand_strided +from torch._prims_common import is_float_dtype +from torch.multiprocessing.reductions import StorageWeakRef +from torch.utils._content_store import ContentStoreReader, ContentStoreWriter + +from . 
import config +from .utils import clone_inputs, get_debug_dir + +log = logging.getLogger(__name__) + +T = TypeVar("T") + + +inductor_config = import_module("torch._inductor.config") +use_buck = inductor_config.is_fbcode() + +if use_buck: + import libfb.py.build_info + + +extra_deps = [] +extra_imports = "" +if use_buck: + extra_deps = [ + "//caffe2/torch/fb/sparsenn:sparsenn_operators_gpu", + "//caffe2/torch/fb/sparsenn:sparsenn_operators", + "//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu", + "//deeplearning/fbgemm/fbgemm_gpu:sparse_ops", + ] + cur_target = libfb.py.build_info.BuildInfo.get_build_rule().replace("fbcode:", "//") # type: ignore[possibly-undefined] + extra_imports = "\n".join([f'torch.ops.load_library("{x}")' for x in extra_deps]) + + +BUCK_CMD_PREFIX = ["buck2", "run", "@mode/dev-nosan"] + + +class BuckTargetWriter: + def __init__(self, filename): + self.subdir, self.py_file = os.path.split(os.path.abspath(filename)) + self.target = self.py_file.replace(".py", "") + + # Get main_module path from fbcode + self.path = f'{self.subdir.replace("/", ".")}.{self.target}' + self.path = self.path[self.path.find("fbcode.") :] + self.path = self.path[7:] + + # Get cmd line path + tmp = self.subdir + tmp = tmp[tmp.find("fbcode/") :][7:] + self.cmd_line_path = f"//{tmp}:{self.target}" + + def build(self): + extra_cpp_deps = "\n".join([f' "{x}",' for x in extra_deps]) + return textwrap.dedent( + f""" +load("@fbcode_macros//build_defs:python_binary.bzl", "python_binary") + +python_binary( + name="{self.target}", + srcs = ["{self.py_file}"], + compile = False, + deps = [ + "//caffe2:torch", + "//caffe2/functorch:functorch", + "//triton:triton", + "{cur_target}", + ], + cpp_deps = [ +{extra_cpp_deps} + ], + main_module = "{self.path}", + par_style = "xar", +) +""" + ) + + def write(self, print_msg=True): + target_file = os.path.join(self.subdir, "TARGETS") + with open(target_file, "w") as fd: + fd.write(self.build()) + # log.warning("Wrote isolation TARGETS file at %s", target_file) + cmd_split = BUCK_CMD_PREFIX + [self.cmd_line_path] + if print_msg: + log.warning( + "Found an example that reproduces the error. Run this cmd to repro - %s", + " ".join(cmd_split), + ) + return cmd_split + + +def minifier_dir(): + path = os.path.join(get_debug_dir(), "minifier") + if path is None: + path = f"{tempfile.gettempdir()}/minifier_{getpass.getuser()}" + if not os.path.exists(path): + os.makedirs(path, exist_ok=True) + return path + + +MAX_CONSTANT_NUMEL_INLINE = 4 + + +class NNModuleToString: + safe_reprs = [ + torch.nn.Linear, + torch.nn.Conv1d, + torch.nn.Conv2d, + torch.nn.Conv3d, + torch.nn.BatchNorm1d, + torch.nn.BatchNorm2d, + torch.nn.BatchNorm3d, + torch.nn.LayerNorm, + torch.nn.Dropout, + torch.nn.Softmax, + torch.nn.ReLU, + torch.nn.GELU, + torch.nn.Identity, + torch.nn.MaxPool2d, + torch.nn.Embedding, + torch.nn.Tanh, + torch.nn.ConvTranspose1d, + torch.nn.GLU, + torch.nn.LSTM, + torch.nn.Flatten, + torch.nn.AdaptiveAvgPool2d, + ] + + @staticmethod + def can_convert_to_string(gm): + cant_convert = set() + for _, module in gm.named_children(): + if type(module) not in NNModuleToString.safe_reprs: + cant_convert.add(module) + + if len(cant_convert) > 0: + log.warning("We have not tested reprs of some modules - %s", cant_convert) + # TODO - Assuming that all modules can be safely repr'd. Check if that assumption is correct. 
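+        # Report the module as convertible anyway; untested child-module reprs were already flagged by the warning above.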
+ return True + + @staticmethod + def convert(gm): + from torch.nn.modules.module import _addindent + + tab = " " * 4 + + model_str = textwrap.dedent( + """ + from torch.nn import * + class Repro(torch.nn.Module): + def __init__(self): + super().__init__() + """ + ) + + for module_name, module in gm.named_children(): + module_str = f"{module.__repr__()}" + # module should be a core torch.nn.Module, so all parameters + # should be on the same device. + example_param = next(module.parameters(), None) + if example_param is not None and example_param.is_cuda: + module_str = f"{module_str}.cuda()" + model_str += f"{tab*2}self.{module_name} = {module_str}\n" + + for buffer_name, buffer in gm._buffers.items(): + if buffer is None: + continue + # Serialize full data for small buffers + if buffer.numel() <= MAX_CONSTANT_NUMEL_INLINE: + from torch._tensor_str import PRINT_OPTS + + assert PRINT_OPTS.threshold >= MAX_CONSTANT_NUMEL_INLINE + tensor_str = repr(buffer) + elif torch.is_floating_point(buffer): + tensor_str = f"torch.randn({list(buffer.shape)}, dtype={buffer.dtype})" + else: + tensor_str = ( + f"torch.randint(1, size={list(buffer.shape)}, dtype={buffer.dtype})" + ) + if buffer.is_cuda: + tensor_str = f"{tensor_str}.cuda()" + model_str += f"{tab*2}self.register_buffer('{buffer_name}', {tensor_str})\n" + + for param_name, param in gm._parameters.items(): + if param is None: + continue + maybe_device = "" + if param.is_cuda: + maybe_device = ', device="cuda"' + tensor_str = f"torch.nn.Parameter(torch.randn({list(param.shape)}, dtype={param.dtype}{maybe_device}))" + model_str += f"{tab*2}self.{param_name} = {tensor_str}\n" + + # TODO - Keep this code for now. But, I don't think we will need this. + # attrs = dir(gm) + # for attr in attrs: + # if "_tensor_constant" in attr: + # val = getattr(gm, attr) + # model_str += f" {attr} = {val!r}\n" + + model_str += f"{_addindent(gm.code, 4)}\n" + return model_str + + +@functools.lru_cache(None) # subprocess is expensive +def _cuda_system_info_comment(): + if not torch.cuda.is_available(): + return "# torch.cuda.is_available()==False, no GPU info collected\n" + + model_str = "# CUDA Info: \n" + try: + cuda_version_out = subprocess.check_output(["nvcc", "--version"]) + cuda_version_lines = cuda_version_out.decode().split("\n") + comment = "".join([f"# {s} \n" for s in cuda_version_lines if s not in [""]]) + model_str += f"{comment}\n" + except (FileNotFoundError, subprocess.CalledProcessError): + model_str += "# nvcc not found\n" + + gpu_names = Counter( + torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count()) + ) + + model_str += "# GPU Hardware Info: \n" + for name, count in gpu_names.items(): + model_str += f"# {name} : {count} \n" + model_str += "\n" + return model_str + + +def generate_config_string(*, stable_output=False): + import torch._functorch.config + import torch._inductor.config + + if stable_output: + return "# config omitted due to stable_output=True" + + experimental_config = torch.fx.experimental._config.codegen_config() # type: ignore[attr-defined] + return f"""\ +import torch._dynamo.config +import torch._inductor.config +import torch._functorch.config +import torch.fx.experimental._config +{torch._dynamo.config.codegen_config()} +{torch._inductor.config.codegen_config()} +{torch._functorch.config.codegen_config()} +{experimental_config} +""" + + +def get_minifier_repro_path(): + return os.path.join(minifier_dir(), "minifier_launcher.py") + + +def helper_for_dump_minify(contents): + minified_repro_path = 
get_minifier_repro_path() + log.warning("Writing minified repro to:\n%s", minified_repro_path) + + if use_buck: + BuckTargetWriter(minified_repro_path).write() + try: + with open(minified_repro_path, "w") as fd: + fd.write(contents) + + except OSError as e: + log.exception(e) + raise NotImplementedError("Could not write to {minified_repro_path}") from e + + +class AccuracyError(Exception): + pass + + +def clone_inputs_retaining_gradness(example_inputs): + """ + This clone inputs is different from utils clone_input. In case of minifier, + all the tensors are leaf tensors while creating a new graph. So, we set the + requires_grad field w/o checking the leafness of the tensor. + """ + cloned_inputs = clone_inputs(example_inputs) + for idx in range(len(example_inputs)): + if isinstance(cloned_inputs[idx], torch.Tensor): + cloned_inputs[idx].requires_grad_(example_inputs[idx].requires_grad) + return cloned_inputs + + +def run_fwd_maybe_bwd(gm, args, only_fwd=False, disable_clone=False): + """ + Runs a forward and possibly backward iteration for a given mod and args. + + When disable_clone is True, we will use args as-is without cloning. + This is higher fidelity but we may destroy the args in the process. + """ + from torch._functorch.aot_autograd import make_boxed_func + + from .testing import collect_results, reduce_to_scalar_loss, requires_bwd_pass + + gm = copy.deepcopy(gm) + if not disable_clone: + args = clone_inputs_retaining_gradness(args) + + if hasattr(gm, "zero_grad"): + gm.zero_grad(True) + + # TorchInductor returned callable expects lists. So, boxing the call. + orig_named_parameters = getattr(gm, "named_parameters", None) + orig_named_buffers = getattr(gm, "named_buffers", None) + if not hasattr(gm, "_boxed_call") and ( + orig_named_parameters is not None or orig_named_buffers is not None + ): + gm = make_boxed_func(gm) + if orig_named_parameters is not None: + gm.named_parameters = orig_named_parameters + if orig_named_buffers is not None: + gm.named_buffers = orig_named_buffers + + out = gm(args) + if only_fwd: + return out + if requires_bwd_pass(out): + loss = reduce_to_scalar_loss(out) + loss.backward() + return collect_results(gm, out, None, args) + + +def same_two_models( + gm, + opt_gm, + example_inputs, + only_fwd=False, + *, + require_fp64=False, + ignore_non_fp=False, +): + """ + Check two models have same accuracy. + + require_fp64: if True, raise an error if we unable to calculate the fp64 reference + ignore_non_fp: if True, do not compare outputs which are not floating point. 
This + is mostly useful for the minifier (which wants to avoid quantizing floating point + error into integer/boolean error) + """ + from .eval_frame import OptimizedModule + from .testing import ( + named_buffers_for_optimized_module, + named_parameters_for_optimized_module, + ) + from .utils import same + + if isinstance(gm, OptimizedModule): + gm.named_parameters = named_parameters_for_optimized_module(gm) + gm.named_buffers = named_buffers_for_optimized_module(gm) + + if isinstance(opt_gm, OptimizedModule): + opt_gm.named_parameters = named_parameters_for_optimized_module(opt_gm) + opt_gm.named_buffers = named_buffers_for_optimized_module(opt_gm) + + ref = run_fwd_maybe_bwd(gm, example_inputs, only_fwd) + + fp64_ref = None + if config.same_two_models_use_fp64: + try: + fp64_model, fp64_examples = cast_to_fp64( + copy.deepcopy(gm), clone_inputs_retaining_gradness(example_inputs) + ) + fp64_ref = run_fwd_maybe_bwd(fp64_model, fp64_examples, only_fwd) + except Exception: + if require_fp64: + raise RuntimeError("Could not generate fp64 outputs") # noqa: TRY200 + log.warning("Could not generate fp64 outputs") + + try: + res = run_fwd_maybe_bwd(opt_gm, example_inputs, only_fwd) + except Exception as e: + # This means that the minified graph is bad/exposes a different problem. + # As we are checking accuracy here, lets log the exception and return True. + log.exception( + "While minifying the program in accuracy minification mode, " + "ran into a runtime exception which is likely an unrelated issue." + " Skipping this graph." + ) + return True + + passing = same( + ref, + res, + fp64_ref, + tol=config.repro_tolerance, + equal_nan=True, + ignore_non_fp=ignore_non_fp, + ) + return passing + + +def cast_dtype_args_to_fp64(model): + for node in model.graph.nodes: + if ( + node.op == "call_function" + and node.target == torch.ops.prims.convert_element_type.default + ): + assert len(node.args) == 2 + if is_float_dtype(node.args[1]) and node.args[1] != torch.float64: + node.args = (node.args[0], torch.float64) + if node.op == "call_function": + dtype = node.kwargs.get("dtype") + if dtype is not None and is_float_dtype(dtype): + new_kwargs = dict(node.kwargs) + new_kwargs["dtype"] = torch.float64 + node.kwargs = new_kwargs + + model.graph.lint() + model.recompile() + return model + + +def cast_to(dtype, model, inputs): + from torch.utils._pytree import tree_map + + model = model.to(dtype) + if dtype == torch.float64: + # If casting to fp64 for accuracy comparison, we need to + # replace dtype arguments embedded in the graph with fp64 + model = cast_dtype_args_to_fp64(model) + + inputs = tree_map( + lambda x: x.to(dtype) + if isinstance(x, torch.Tensor) and x.is_floating_point() + else x, + inputs, + ) + return model, inputs + + +def cast_to_fp64(model, inputs): + return cast_to(torch.float64, model, inputs) + + +def backend_accuracy_fails( + gm, + example_inputs, + compiler_fn, + only_fwd=False, + *, + require_fp64=False, + ignore_non_fp=False, +): + try: + compiled_gm = compiler_fn( + copy.deepcopy(gm), clone_inputs_retaining_gradness(example_inputs) + ) + return not same_two_models( + gm, + compiled_gm, + example_inputs, + only_fwd, + require_fp64=require_fp64, + ignore_non_fp=ignore_non_fp, + ) + except Exception as e: + # This means that the minified graph is bad/exposes a different problem. + # As we are checking accuracy here, lets log the exception and return False. 
+ log.exception( + "While minifying the program in accuracy minification mode, " + "ran into a runtime exception which is likely an unrelated issue." + " Skipping this graph" + ) + return False + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# REPRO SUPPORT CODE +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + + +# Helper functions for computing what the default values of tensor +# values should be. These all coincide with factory functions, e.g., torch.empty + + +def _stride_or_default( + stride: Optional["torch._prims_common.StrideType"], + *, + shape: "torch._prims_common.ShapeType", +) -> "torch._prims_common.StrideType": + return stride if stride is not None else utils.make_contiguous_strides_for(shape) + + +def _mk_defaulter(d: T) -> Callable[[Optional[T]], T]: + return lambda x: x if x is not None else d + + +_dtype_or_default = _mk_defaulter(torch.float32) +_device_or_default = _mk_defaulter(torch.device("cpu")) +_storage_offset_or_default = _mk_defaulter(0) +_requires_grad_or_default = _mk_defaulter(False) +_is_leaf_or_default = _mk_defaulter(False) + + +class NopInputReader: + def __init__(self): + self.total = 0 + + def storage(self, storage_hash, nbytes, *, device=None, dtype_hint=None): + self.total += 1 + + def tensor(self, *args, **kwargs): + pass + + def symint(self, *args, **kwargs): + pass + + +# TODO: Support bundling the entire repro into a zip file for ease of +# transferring around +class InputReader: + def __init__(self, save_dir=None, *, pbar=None): + # If None, we will generate random data instead. It's important + # to natively support this use case as it will allow people to + # share repros without including the real data, if the problem + # reproduces even on random data. + if save_dir is None: + log.warning("no save_dir specified, will generate random data") + self.store = ContentStoreReader(save_dir) if save_dir is not None else None + self.args = [] + self.pbar = pbar + + def storage(self, storage_hash, nbytes, *, device=None, dtype_hint=None): + if self.pbar is not None: + self.pbar.update(1) + device = _device_or_default(device) + dtype_hint = _dtype_or_default(dtype_hint) + if self.store is not None and storage_hash is not None: + try: + storage = self.store.read_storage(storage_hash) + except FileNotFoundError: + pass + else: + if device != storage.device: + log.warning("device mismatch: %s != %s", device, storage.device) + # TODO: transfer it to the right device? But failing this + # way would be very mysterious! Would have been better + # not to store device in the serialized format... 
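+                # Use the storage read from disk as-is, even when its device differs from the requested one.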
+ return storage + log.warning("could not load %s, generating random data instead", storage_hash) + shape = (nbytes // dtype_hint.itemsize,) + stride = _stride_or_default(None, shape=shape) + return rand_strided(shape, stride, dtype_hint, device).untyped_storage() + + def tensor( + self, + storage, + shape, + stride=None, + *, + storage_offset=None, + dtype=None, + requires_grad=None, + is_leaf=None, + **metadata, + ): + stride = _stride_or_default(stride, shape=shape) + storage_offset = _storage_offset_or_default(storage_offset) + dtype = _dtype_or_default(dtype) + is_leaf = _is_leaf_or_default(is_leaf) + requires_grad = _requires_grad_or_default(requires_grad) + t = torch.tensor( + [], dtype=dtype, device=storage.device, requires_grad=requires_grad + ) + with torch.no_grad(): + t.set_(storage, storage_offset, shape, stride) + if not is_leaf: + # Fake up some autograd history in a very naughty way + with torch.enable_grad(): + t = t.clone(memory_format=torch.preserve_format) + with torch.no_grad(): + t.set_(storage, storage_offset, shape, stride) + assert torch._subclasses.meta_utils.safe_is_leaf(t) == is_leaf + torch._utils.set_tensor_metadata(t, metadata) + self.args.append(t) + return t # for BC + + def symint(self, val): + self.args.append(val) + return val # for BC + + +# Here is our writer strategy: +# 1. We will stream all of the inputs to disk +# 2. You can now deterministically randomize the inputs, or reload +# the inputs from disk +# 3. You can YOLO run the script without the inputs, in which case +# we'll fill the inputs with random data and pray. This is the +# legacy behavior, but it's also useful if you want to find out +# if we're so broken even random inputs trigger it +# 4. We could offer an in process "check if the randomized thing +# works too" but this is delicate so we don't do it + + +class InputWriter: + def __init__(self, save_dir, *, stable_hash=False): + self._lines = [] + # TODO: consider ensuring tensor and storage counters line up? 
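+        # Monotonic counter used to name the emitted buffers (buf0, buf1, ...) in the generated load_args() lines.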
+ self.storage_counter = itertools.count() + self.save_dir = save_dir + self.store = ( + ContentStoreWriter(save_dir, stable_hash=stable_hash) + if save_dir is not None + else None + ) + self.seen_storages = {} + + def lines(self): + r = [ + "def load_args(reader):", + ] + r.extend(f" {l}" for l in self._lines) + # In case we need to change the internal format of load_args + # in an FC-breaking way + r.append("load_args._version = 0") + return r + + # Storages are untyped, but we need to initialize them with data if + # we don't have the real data, so we give a hint saying what kind + # of initialization may be appropriate + # + # If we had a FakeTensor, device_hint tells us what device should be + def storage(self, untyped_storage, *, dtype_hint=None, device_hint=None) -> str: + ws = StorageWeakRef(untyped_storage) + v = self.seen_storages.get(ws) + if v is not None: + return v + v = f"buf{next(self.storage_counter)}" + maybe_dtype_hint = "" + if _dtype_or_default(None) != _dtype_or_default(dtype_hint): + maybe_dtype_hint = f", dtype_hint={dtype_hint!r}" + # TODO: being optional on device is kind of pointless as the default + # is CPU but most repros we care about are CUDA + maybe_device = "" + device = untyped_storage.device + if device.type == "meta": + assert device_hint is not None + device = device_hint + if _device_or_default(None) != device: + maybe_device = f", device={device!r}" + nbytes = untyped_storage.nbytes() + storage_hash = None + if self.store is not None and untyped_storage.device.type != "meta": + storage_hash = self.store.write_storage(untyped_storage) + self._lines.append( + f"{v} = reader.storage({storage_hash!r}, {nbytes!r}{maybe_device}{maybe_dtype_hint})" + ) + self.seen_storages[ws] = v + return v + + def tensor(self, name, t) -> None: + storage = self.storage( + t.untyped_storage(), dtype_hint=t.dtype, device_hint=t.device + ) + args = [] + # NB: this is positional, must come first + if _stride_or_default(None, shape=t.shape) != t.stride(): + args.append(str(tuple(t.stride()))) + if _dtype_or_default(None) != t.dtype: + args.append(f"dtype={t.dtype!r}") + if _storage_offset_or_default(None) != t.storage_offset(): + args.append(f"storage_offset={t.storage_offset()!r}") + tensor_metadata = torch._utils.get_tensor_metadata(t) + if tensor_metadata: + args.extend(f"{k}={v!r}" for k, v in tensor_metadata.items()) + if _requires_grad_or_default(None) != t.requires_grad: + args.append(f"requires_grad={t.requires_grad!r}") + is_leaf = torch._subclasses.meta_utils.safe_is_leaf(t) + if _is_leaf_or_default(None) != is_leaf: + args.append(f"is_leaf={is_leaf!r}") + self._lines.append( + "reader.tensor(" + + ", ".join([storage, str(tuple(t.shape)), *args]) + + f") # {name}" + ) + + # TODO: this doesn't actually symint atm + def symint(self, name, val) -> None: + if isinstance(val, torch.SymInt): + val = val.node.hint + self._lines.append(f"reader.symint({val!r}) # {name}") + + +def aot_graph_input_parser( + func: Callable[[List[Tensor]], List[Tensor]], + device: str = "cuda", + sym_shapes: Optional[Dict[str, int]] = None, + default_sym_shape: Optional[int] = None, +) -> Dict[str, Any]: + """ + Takes in a function which has been printed with print_readable() and constructs kwargs to run it. + + Handles Tensor inputs, Symints, and a graph module which might have tensor constants. 
+ + Consider a function `forward` defined as follows: + + def forward(self, primals_1: "f32[1001, 6]", primals_2: "f32[s0]", primals_3: "Sym(s0)",): + _tensor_constant0: "i64[4190]" = self._tensor_constant0 + # Further implementation + + kwargs = aot_graph_input_parser(forward) + forward(**kwargs) + """ + + from torch.fx.graph import dtype_abbrs + + dtype_map = {value: key for key, value in dtype_abbrs.items()} + dtype_pattern = "|".join(dtype_abbrs.values()) + + # Extracting the source code from the function + source = inspect.getsource(func) + + # Regular expressions + tensor_assignment_regex = rf"(_tensor_constant\d+): \"({dtype_pattern})\[\s*(.*?)\s*\]\" = self\.(_tensor_constant\d+)" + tensor_regex = rf"({dtype_pattern})\[\s*(.*?)\s*\]" + sym_shape_regex = r"Sym\((s\d+)\)" + + class TensorContainer: + "Container for tensors as attributes" + pass + + # Dictionary for tensors from annotations + kwargs: Dict[str, Any] = {} + + sym_shapes = sym_shapes or {} + + def get_sym_int(symint): + torch._check( + symint in sym_shapes or default_sym_shape is not None, + lambda: f"{symint} not in symbolic_shapes and default sym shape not passed in", + ) + return sym_shapes.get(symint, default_sym_shape) + + def gen_tensor(shape, dtype) -> Tensor: + # Resolve symbolic shapes to concrete values + resolved_shape = [] + dynamic_dims = [] + for i, dim in enumerate(shape): + dim = dim.strip() + if "s" in dim: + s = get_sym_int(dim) + resolved_shape.append(s) + dynamic_dims.append(i) + else: + resolved_shape.append(int(dim)) + + constructor = torch.randn if dtype.is_floating_point else torch.zeros + out = constructor(resolved_shape, dtype=dtype, device=device) # type: ignore[call-arg] + for d in dynamic_dims: + torch._dynamo.mark_dynamic(out, d) + return out + + # Parse function annotations for tensor generation + annotations = func.__annotations__ + for param, annotation in annotations.items(): + # Skip 'return' annotation + if param == "return": + continue + + match = re.search(tensor_regex, annotation) + if match: + data_type, shape_str = match.groups() + shape = tuple(shape_str.split(",")) + dtype = dtype_map[data_type] + kwargs[param] = gen_tensor(shape, dtype) + + match = re.search(sym_shape_regex, annotation) + if match: + kwargs[param] = get_sym_int(match.group(1)) + + if "self" in inspect.signature(func).parameters: + container = TensorContainer() + kwargs["self"] = container + for match in re.finditer(tensor_assignment_regex, source): + attr_name, data_type, shape_str, _ = match.groups() + shape = tuple(shape_str.split(",")) + dtype = dtype_map[data_type] + setattr(container, attr_name, gen_tensor(shape, dtype)) + + return kwargs diff --git a/MLPY/Lib/site-packages/torch/_dynamo/decorators.py b/MLPY/Lib/site-packages/torch/_dynamo/decorators.py new file mode 100644 index 0000000000000000000000000000000000000000..43a51da6151c6f7fce9e4b979fbb098e35e35ad9 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/decorators.py @@ -0,0 +1,347 @@ +from dataclasses import dataclass +from typing import TYPE_CHECKING + +import torch +from torch.utils._python_dispatch import is_traceable_wrapper_subclass +from . 
import trace_rules, variables +from .comptime import comptime +from .eval_frame import DisableContext, innermost_fn, RunOnlyContext +from .exc import IncorrectUsage +from .external_utils import is_compiling + +if TYPE_CHECKING: + from torch._C._dynamo.eval_frame import ( # noqa: F401 + reset_code, + set_eval_frame, + set_guard_error_hook, + skip_code, + unsupported, + ) +else: + for name in dir(torch._C._dynamo.eval_frame): + if name.startswith("__"): + continue + globals()[name] = getattr(torch._C._dynamo.eval_frame, name) + + +def run(fn=None): + """Don't do any dynamic compiles, just use prior optimizations""" + if fn is not None: + fn = innermost_fn(fn) + assert callable(fn) + return RunOnlyContext()(fn) + return RunOnlyContext() + + +def disable(fn=None, recursive=True): + """ + Decorator and context manager to disable TorchDynamo + + If recursive=True, Dynamo is completely skipped on the decorated function + frame as well as the recursively invoked functions. + + If recursive=False, Dynamo skips frames associated with the function code, + but still process recursively invoked frames. + """ + if recursive: + if fn is not None: + fn = innermost_fn(fn) + assert callable(fn) + return DisableContext()(fn) + return DisableContext() + else: + return skip(fn) + + +def skip(fn=None): + """ + Skip frames associated with the function code, but still process recursively + invoked frames + """ + if fn is None: + return skip + fn = innermost_fn(fn) + assert callable(fn) + skip_code(fn.__code__) + fn._torchdynamo_disable = True + return fn + + +def assume_constant_result(fn): + fn._dynamo_marked_constant = True + return fn + + +def allow_in_graph(fn): + """ + Customize which functions TorchDynamo will include in the generated + graph. Similar to `torch.fx.wrap()`. + :: + + torch._dynamo.allow_in_graph(my_custom_function) + + @torch._dynamo.optimize(...) + def fn(a): + x = torch.add(x, 1) + x = my_custom_function(x) + x = torch.add(x, 1) + return x + + fn(...) + + Will capture a single graph containing `my_custom_function()`. + """ + if isinstance(fn, (list, tuple)): + return [allow_in_graph(x) for x in fn] + assert callable(fn), "allow_in_graph expects a callable" + if trace_rules.lookup_callable(fn) != variables.TorchInGraphFunctionVariable: + trace_rules._disallowed_callable_ids.remove(id(fn)) + trace_rules._allowed_callable_ids.add(id(fn)) + return fn + + +def _disallow_in_graph_helper(throw_if_not_allowed): + def inner(fn): + if isinstance(fn, (list, tuple)): + return [disallow_in_graph(x) for x in fn] + assert callable(fn), "disallow_in_graph expects a callable" + if ( + throw_if_not_allowed + and trace_rules.lookup_callable(fn) + != variables.TorchInGraphFunctionVariable + and trace_rules.lookup(fn) != variables.TorchInGraphFunctionVariable + ): + raise IncorrectUsage( + "disallow_in_graph is expected to be used on an already allowed callable (like torch.* ops). " + "Allowed callables means callables that TorchDynamo puts as-is in the extracted graph." + ) + trace_rules._allowed_callable_ids.remove(id(fn)) + trace_rules._disallowed_callable_ids.add(id(fn)) + return fn + + return inner + + +def disallow_in_graph(fn): + """ + Customize which functions TorchDynamo will exclude in the generated + graph and force a graph break on. + :: + + torch._dynamo.disallow_in_graph(torch.sub) + + @torch._dynamo.optimize(...) + def fn(a): + x = torch.add(x, 1) + x = torch.sub(x, 1) + x = torch.add(x, 1) + return x + + fn(...) 
+ + Will break the graph on `torch.sub`, and give two graphs each with a + single `torch.add()` op. + """ + return _disallow_in_graph_helper(throw_if_not_allowed=True)(fn) + + +@_disallow_in_graph_helper(throw_if_not_allowed=False) +def graph_break(): + """Force a graph break""" + pass + + +def forbid_in_graph(fn): + """ + Customize which functions TorchDynamo will assert are not present while tracing. + + If you want a graph break on this function instead, use disallow_in_graph. + TODO(voz): We now have allow_in_graph, disallow_in_graph, forbid_in_graph - some more robust + documentation would not be amiss. + """ + if isinstance(fn, (list, tuple)): + return [forbid_in_graph(x) for x in fn] + assert callable(fn), "forbid_in_graph applies only to callables" + fn._dynamo_forbidden = True + return fn + + +# Helper function to flatten a tensor subclass and apply a function to +# all inner tensors that match the outer dim. Used to reduce duplication +# across the various marking APIs. +def _apply_func_to_inner_tensors_of_same_dim(func, t, *args, **kwargs): + assert is_traceable_wrapper_subclass(t) + + attrs, ctx = t.__tensor_flatten__() + for attr in attrs: + inner = getattr(t, attr) + if inner.dim() == t.dim(): + func(inner, *args, **kwargs) + + +@dataclass(frozen=True) +class _DimRange: + """ + This represents an dimension of a tensor and the corresponding + min and max values it can take. Don't create this + class directly; instead, use :func:`mark_dynamic`. + """ + + dim: int + min: int + max: int + + +@forbid_in_graph +def mark_dynamic(t, index, *, min=None, max=None): + """ + Mark a tensor as having a dynamic dim and set corresponding min and max range for the dim. + + [Note - on the state of mark_dynamic] + + The behavior of having a dynamic dimension on a tensor is governed by a few factors: + + 1) torch._dynamo.config dynamic_shapes True or False. + a) dynamic_shapes=True - dynamic_shapes must be True for mark_dynamic to work. + a) dynamic_shapes=False - This config will raise an exception when used in conjunction with + mark_dynamic. We will eventually support this. + + 2) If the dimension is fully constrained - as in, it does not allow more than a single value + in both eager (torch.compile, torch._dynamo.optimize) mode and export mode (torch._dynamo.export), + we will raise an error + + 3) If the dimension is partially constrained - allowing at least 2 values but not the full unbounded + range of shapes, in eager we will pass it through, but export will raise an error. + + 4) Attempts to trace this function will explicitly raise. As such, all calls to mark_dynamic must be made + before torch.compile. + + """ + if is_traceable_wrapper_subclass(t): + # default behavior: mirror mark_dynamic() on all inner tensors with same dim as t + # TODO: Make this configurable via a supported public API + _apply_func_to_inner_tensors_of_same_dim( + mark_dynamic, t, index, min=min, max=max + ) + + if isinstance(index, int): + if not hasattr(t, "_dynamo_dynamic_indices"): + t._dynamo_dynamic_indices = set() + t._dynamo_dynamic_range = set() + # TODO(voz): Should we bounds check? + t._dynamo_dynamic_indices.add(index) + t._dynamo_dynamic_range.add(_DimRange(index, min, max)) + return + + assert isinstance(index, (list, tuple)) + for i in index: + mark_dynamic(t, i, min=min, max=max) + + +@forbid_in_graph +def maybe_mark_dynamic(t, index): + """ + Mark a tensor as having a dynamic dim, but don't enforce it (i.e., if this + dimension ends up getting specialized, don't error). 
+ """ + if is_traceable_wrapper_subclass(t): + # default behavior: mirror maybe_mark_dynamic() on all inner tensors with same dim as t + # TODO: Make this configurable via a supported public API + _apply_func_to_inner_tensors_of_same_dim(maybe_mark_dynamic, t, index) + + if isinstance(index, int): + if not hasattr(t, "_dynamo_weak_dynamic_indices"): + t._dynamo_weak_dynamic_indices = set() + # TODO(voz): Should we bounds check? + t._dynamo_weak_dynamic_indices.add(index) + return + + assert isinstance(index, (list, tuple)) + for i in index: + maybe_mark_dynamic(t, i) + + +def mark_static(t, index=None): + """ + Mark a tensor as having a static dim. + + This will prevent us from attempting to compile it dynamically + when dynamic=True; this can improve trace-time performance. + + This has lower precedence than mark_dynamic. + + Unlike mark_dynamic, this can be done inside a graph, in which case it + induces specialization on the tensor. + """ + if is_compiling(): + if index is None: + for s in t.size(): + comptime.force_static(s) + else: + comptime.force_static(t.size(index)) + return + + if is_traceable_wrapper_subclass(t): + # default behavior: mirror mark_static() on all inner tensors with same dim as t + # TODO: Make this configurable via a supported public API + _apply_func_to_inner_tensors_of_same_dim(mark_static, t, index) + + if isinstance(index, int): + if not hasattr(t, "_dynamo_static_indices"): + t._dynamo_static_indices = set() + # TODO(voz): Should we bounds check? + t._dynamo_static_indices.add(index) + elif index is None: + for i in range(t.dim()): + mark_static(t, i) + else: + assert isinstance(index, (list, tuple)) + for i in index: + mark_static(t, i) + + +@forbid_in_graph +def mark_static_address(t, guard=True): + """ + Marks an input tensor whose data_ptr will not change across multiple calls + to a dynamo-compiled function. This indicates to cudagraphs that an extra allocation + is not needed for this input. The data_ptr will be guarded if guard=True. Note: + Tensors marked in this way will be kept alive until `torch._dynamo.reset()` is called. + """ + if not isinstance(t, torch.Tensor): + raise TypeError(f"mark_static_address expects a tensor but recieved {type(t)}") + + if guard: + t._dynamo_static_input_type = "guarded" # type: ignore[attr-defined] + else: + t._dynamo_static_input_type = "unguarded" # type: ignore[attr-defined] + + +# Note: this carefully avoids eagerly import einops. +# TODO: we should delete this whole _allow_in_graph_einops logic by approximately 2024 Q2 +def _allow_in_graph_einops(): + import einops + + try: + # requires einops > 0.6.1, torch >= 2.0 + from einops._torch_specific import ( # type: ignore[attr-defined] # noqa: F401 + _ops_were_registered_in_torchdynamo, + ) + + # einops > 0.6.1 will call the op registration logic as it is imported. 
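+        # The import above already registered the einops ops with dynamo, so there is nothing left to do here.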
+ pass + except ImportError: + # einops <= 0.6.1 + allow_in_graph(einops.rearrange) + allow_in_graph(einops.reduce) + if hasattr(einops, "repeat"): + allow_in_graph(einops.repeat) # available since einops 0.2.0 + if hasattr(einops, "einsum"): + allow_in_graph(einops.einsum) # available since einops 0.5.0 + if hasattr(einops, "pack"): + allow_in_graph(einops.pack) # available since einops 0.6.0 + if hasattr(einops, "unpack"): + allow_in_graph(einops.unpack) # available since einops 0.6.0 + + +trace_rules.add_module_init_func("einops", _allow_in_graph_einops) diff --git a/MLPY/Lib/site-packages/torch/_dynamo/device_interface.py b/MLPY/Lib/site-packages/torch/_dynamo/device_interface.py new file mode 100644 index 0000000000000000000000000000000000000000..62241e15711f75309983cb5fb605db324b2fa8f7 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/device_interface.py @@ -0,0 +1,199 @@ +import inspect +from typing import Any, Callable, Dict, Iterable, Optional, Tuple, Type, Union + +import torch +from torch._streambase import _EventBase, _StreamBase + +get_cuda_stream: Optional[Callable[[int], int]] +if torch.cuda._is_compiled(): + from torch._C import _cuda_getCurrentRawStream as get_cuda_stream +else: + get_cuda_stream = None + +_device_t = Union[torch.device, str, int, None] + +# Recording the device properties in the main process but used in worker process. +caching_worker_device_properties: Dict[str, Any] = {} +caching_worker_current_devices: Dict[str, int] = {} + + +class DeviceInterfaceMeta(type): + def __new__(metacls, *args, **kwargs): + class_member = args[2] + if "Event" in class_member: + assert inspect.isclass(class_member["Event"]) and issubclass( + class_member["Event"], _EventBase + ), "DeviceInterface member Event should be inherit from _EventBase" + if "Stream" in class_member: + assert inspect.isclass(class_member["Stream"]) and issubclass( + class_member["Stream"], _StreamBase + ), "DeviceInterface member Stream should be inherit from _StreamBase" + return super().__new__(metacls, *args, **kwargs) + + +class DeviceInterface(metaclass=DeviceInterfaceMeta): + """ + This is a simple device runtime interface for Inductor. It enables custom + backends to be integrated with Inductor in a device-agnostic semantic. + """ + + class device: + def __new__(cls, device: _device_t): + raise NotImplementedError() + + class Worker: + """ + Worker API to query device properties that will work in multi processing + workers that cannot use the GPU APIs (due to processing fork() and + initialization time issues). Properties are recorded in the main process + before we fork the workers. 
+ """ + + @staticmethod + def set_device(device: int): + raise NotImplementedError() + + @staticmethod + def current_device() -> int: + raise NotImplementedError() + + @staticmethod + def get_device_properties(device: _device_t = None): + raise NotImplementedError() + + @staticmethod + def current_device(): + raise NotImplementedError() + + @staticmethod + def set_device(device: _device_t): + raise NotImplementedError() + + @staticmethod + def device_count(): + raise NotImplementedError() + + @staticmethod + def is_available() -> bool: + raise NotImplementedError() + + @staticmethod + def stream(stream: torch.Stream): + raise NotImplementedError() + + @staticmethod + def current_stream(): + raise NotImplementedError() + + @staticmethod + def set_stream(stream: torch.Stream): + raise NotImplementedError() + + @staticmethod + def _set_stream_by_id(stream_id: int, device_index: int, device_type: int): + raise NotImplementedError() + + @staticmethod + def get_raw_stream(): + raise NotImplementedError() + + @staticmethod + def synchronize(device: _device_t = None): + raise NotImplementedError() + + @staticmethod + def get_device_properties(device: _device_t = None): + raise NotImplementedError() + + @staticmethod + def get_compute_capability(device: _device_t = None): + raise NotImplementedError() + + +class CudaInterface(DeviceInterface): + device = torch.cuda.device + + # register Event and Stream class into the backend interface + # make sure Event and Stream are implemented and inherited from the _EventBase and _StreamBase + Event = torch.cuda.Event + Stream = torch.cuda.Stream + + class Worker: + @staticmethod + def set_device(device: int): + caching_worker_current_devices["cuda"] = device + + @staticmethod + def current_device() -> int: + if "cuda" in caching_worker_current_devices: + return caching_worker_current_devices["cuda"] + return torch.cuda.current_device() + + @staticmethod + def get_device_properties(device: _device_t = None): + if device is not None: + if isinstance(device, str): + device = torch.device(device) + assert device.type == "cuda" + if isinstance(device, torch.device): + device = device.index + if device is None: + device = CudaInterface.Worker.current_device() + + if "cuda" not in caching_worker_device_properties: + device_prop = [ + torch.cuda.get_device_properties(i) + for i in range(torch.cuda.device_count()) + ] + caching_worker_device_properties["cuda"] = device_prop + + return caching_worker_device_properties["cuda"][device] + + current_device = staticmethod(torch.cuda.current_device) + set_device = staticmethod(torch.cuda.set_device) + device_count = staticmethod(torch.cuda.device_count) + stream = staticmethod(torch.cuda.stream) # type: ignore[assignment] + current_stream = staticmethod(torch.cuda.current_stream) + set_stream = staticmethod(torch.cuda.set_stream) # type: ignore[assignment] + _set_stream_by_id = staticmethod(torch.cuda._set_stream_by_id) # type: ignore[assignment] + synchronize = staticmethod(torch.cuda.synchronize) + get_device_properties = staticmethod(torch.cuda.get_device_properties) # type: ignore[assignment] + get_raw_stream = staticmethod(get_cuda_stream) # type: ignore[arg-type] + + # Can be mock patched by @patch decorator. 
+ @staticmethod + def is_available() -> bool: + return torch.cuda.is_available() + + @staticmethod + def get_compute_capability(device: _device_t = None): + major, min = torch.cuda.get_device_capability(device) + return major * 10 + min + + +device_interfaces: Dict[str, Type[DeviceInterface]] = {} + + +def register_interface_for_device( + device: Union[str, torch.device], device_interface: Type[DeviceInterface] +): + if isinstance(device, torch.device): + device = str(device) + device_interfaces[device] = device_interface + + +def get_interface_for_device(device: Union[str, torch.device]) -> Type[DeviceInterface]: + if isinstance(device, torch.device): + device = str(device) + if device in device_interfaces: + return device_interfaces[device] + raise NotImplementedError(f"No interface for device {device}") + + +def get_registered_device_interfaces() -> Iterable[Tuple[str, Type[DeviceInterface]]]: + return device_interfaces.items() + + +register_interface_for_device("cuda", CudaInterface) +for i in range(torch.cuda.device_count()): + register_interface_for_device(f"cuda:{i}", CudaInterface) diff --git a/MLPY/Lib/site-packages/torch/_dynamo/eval_frame.py b/MLPY/Lib/site-packages/torch/_dynamo/eval_frame.py new file mode 100644 index 0000000000000000000000000000000000000000..54772e5547291d68d7c16719e5adfa50c2c3b9dc --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/eval_frame.py @@ -0,0 +1,1561 @@ +# mypy: disable-error-code="method-assign" + +""" +Functions in this file are responsible for modifying the eval frame +handler at RUNTIME. Therefore, all functions in this file are hot. +Functions that only execute at compile time should be placed +in torch._dynamo.convert_frame. +""" + +from __future__ import annotations + +import contextlib +import functools +import inspect +import logging +import os +import sys +import textwrap +import threading +import traceback +import types +import warnings +import weakref +from enum import Enum +from os.path import dirname, join +from typing import Any, Callable, Dict, List, NamedTuple, Optional, Set, Tuple, Union +from unittest.mock import patch + +import torch +import torch.fx +import torch.utils._pytree as pytree +import torch.utils.checkpoint +from torch import _guards +from torch._subclasses import fake_tensor +from torch._utils_internal import log_export_usage +from torch.export import Constraint +from torch.export.dynamic_shapes import _process_dynamic_shapes +from torch.fx.experimental.proxy_tensor import make_fx, maybe_disable_fake_tensor_mode +from torch.fx.experimental.symbolic_shapes import ( + ConstraintViolationError, + DimDynamic, + StatelessSymbolicContext, +) +from torch.fx.graph import _PyTreeCodeGen, _PyTreeInfo + +from ..fx import GraphModule +from .backends.registry import CompilerFn, lookup_backend + +from .hooks import Hooks + +# see discussion at https://github.com/pytorch/pytorch/issues/120699 +reset_code = torch._C._dynamo.eval_frame.reset_code # noqa: F401 +set_eval_frame = torch._C._dynamo.eval_frame.set_eval_frame # noqa: F401 +set_guard_error_hook = torch._C._dynamo.eval_frame.set_guard_error_hook # noqa: F401 +skip_code = torch._C._dynamo.eval_frame.skip_code # noqa: F401 +unsupported = torch._C._dynamo.eval_frame.unsupported # noqa: F401 + +from . 
import config, convert_frame, external_utils, trace_rules, utils +from .code_context import code_context +from .exc import CondOpArgsMismatchError, UserError, UserErrorType +from .mutation_guard import install_generation_tagging_init +from .types import CacheEntry, DynamoCallback +from .utils import common_constant_types, compile_times + +log = logging.getLogger(__name__) + +from torch._dispatch.python import enable_python_dispatcher + +always_optimize_code_objects = utils.ExactWeakKeyDictionary() +null_context = contextlib.nullcontext + + +import sympy + + +# See https://github.com/python/typing/pull/240 +class Unset(Enum): + token = 0 + + +unset = Unset.token + +guarded_backend_cache = threading.local() +cached_backends: Dict[int, CompilerFn] = {} + + +def check_current_backend(backend_obj_id: int): + """ + Called from guards to check if we need to recompile due to a backend change + """ + # TODO(jansel): we should move guarded_backend_cache to C++ + try: + if guarded_backend_cache.skip_backend_check_for_run_only_mode: + return True + except AttributeError: + # Go slightly faster next time + guarded_backend_cache.skip_backend_check_for_run_only_mode = False + try: + current_backend = guarded_backend_cache.current_backend + except AttributeError: + current_backend = None + return ( + # Avoid the dict lookup in case of exact same object + id(current_backend) == backend_obj_id + or current_backend == cached_backends.get(backend_obj_id, None) + ) + + +def _reset_guarded_backend_cache(): + global cached_backends + guarded_backend_cache.skip_backend_check_for_run_only_mode = False + guarded_backend_cache.current_backend = None + for backend in cached_backends.values(): + if hasattr(backend, "reset"): + backend.reset() + cached_backends.clear() + + +def backend_cache_manager(callback: DynamoCallback): + # callback is False for RunOnlyContext. RunOnlyContext is used + # as a way to re-use the previous compiled cache. + # We therefore skip the check and re-use whatever code that's already cached. + # Note: the cache that's actually used depends on the caching policy. + if callback is False: + + def change(): + try: + prev_skip = guarded_backend_cache.skip_backend_check_for_run_only_mode + except AttributeError: + prev_skip = False + guarded_backend_cache.skip_backend_check_for_run_only_mode = True + + def revert(): + guarded_backend_cache.skip_backend_check_for_run_only_mode = prev_skip + + return revert + + else: + backend = innermost_fn(callback) + + def change(): + cached_backends.setdefault(id(backend), backend) + try: + prev_backend = guarded_backend_cache.current_backend + except AttributeError: + prev_backend = None + guarded_backend_cache.current_backend = backend + + def revert(): + guarded_backend_cache.current_backend = prev_backend + + return revert + + return change + + +DONT_WRAP_FILES = { + # For tracing into fx modules + inspect.getsourcefile(GraphModule), + join(dirname(dirname(__file__)), "onnx/_internal/fx/dynamo_graph_extractor.py"), +} + + +def _debug_get_cache_entry_list( + code: Union[types.CodeType, Callable[..., Any]] +) -> List[CacheEntry]: + """ + Given a code object or a callable object, retrieve the cache entries + stored in this code. + """ + if callable(code): + code = code.__code__ + return torch._C._dynamo.eval_frame._debug_get_cache_entry_list(code) + + +class OptimizedModule(torch.nn.Module): + """ + Wraps the original nn.Module object and later patches its + forward method to optimized self.forward method. 
+ """ + + _torchdynamo_orig_callable: Callable[..., Any] + get_compiler_config: Callable[[], Any] + + def __init__(self, mod: torch.nn.Module, dynamo_ctx): + super().__init__() + # Installs the params/buffer + self._orig_mod = mod + self.dynamo_ctx = dynamo_ctx + self._initialize() + + def _initialize(self): + # Do this stuff in constructor to lower overhead slightly + if isinstance(self._orig_mod.forward, types.MethodType) and trace_rules.check( + self._orig_mod.forward + ): + # This may be a torch.nn.* instance in trace_rules.py which + # won't trigger a frame evaluation workaround to add an extra + # frame we can capture + self.forward = self.dynamo_ctx(external_utils.wrap_inline(self._orig_mod)) + else: + # Invoke hooks outside of dynamo then pickup the inner frame + self.forward = self.dynamo_ctx(self._orig_mod.__call__) + + if hasattr(self._orig_mod, "_initialize_hook"): + self._forward = self.forward + self.forward = self._call_lazy_check + + def __getstate__(self): + state = dict(self.__dict__) + state.pop("forward", None) + state.pop("__call__", None) + return state + + def __setstate__(self, state): + self.__dict__ = state + self._initialize() + + def __getattr__(self, name): + if name == "_orig_mod": + return self._modules["_orig_mod"] + return getattr(self._orig_mod, name) + + def _call_lazy_check(self, *args, **kwargs): + if hasattr(self._orig_mod, "_initialize_hook"): + # In the case of a lazy module, we want to run + # the pre-hooks which initialize it. + # Afterwards, lazy module deletes its pre-hooks + # to avoid treating it as lazy on subsequent recompile. + self._orig_mod._infer_parameters(self._orig_mod, args, kwargs) + return self._forward(*args, **kwargs) + + def __dir__(self): + orig_mod_attrs = self._orig_mod.__dir__() + return orig_mod_attrs + [ + attr for attr in super().__dir__() if attr not in orig_mod_attrs + ] + + +def remove_from_cache(f): + """ + Make sure f.__code__ is not cached to force a recompile + """ + if isinstance(f, types.CodeType): + reset_code(f) + elif hasattr(f, "__code__"): + reset_code(f.__code__) + elif hasattr(getattr(f, "forward", None), "__code__"): + reset_code(f.forward.__code__) + else: + from . import reset # type: ignore[attr-defined] + + reset() + log.warning("could not determine __code__ for %s", f) + + +def nothing(): + pass + + +def always_false(): + return False + + +def innermost_fn(fn): + """ + In case of nesting of _TorchDynamoContext calls, find the innermost + function. TorchDynamo caches on fn.__code__ object, so its necessary to find + the innermost function to pass on the optimize, run, disable etc. 
+ """ + unaltered_fn = fn + while hasattr(unaltered_fn, "_torchdynamo_orig_callable"): + unaltered_fn = unaltered_fn._torchdynamo_orig_callable + assert callable(unaltered_fn) + return unaltered_fn + + +def make_set_enable_dynamic(enable: bool): + assert isinstance(enable, bool) + if enable: + # Assume everything is dynamic by default + return config._make_closure_patcher(assume_static_by_default=False) + else: + return config._make_closure_patcher( + automatic_dynamic_shapes=False, assume_static_by_default=True + ) + + +class _TorchDynamoContext: + def __init__( + self, + callback: DynamoCallback, + on_enter=nothing, + backend_ctx_ctor=null_context, + patch_fn=nothing, + first_ctx=False, + *, + export=False, + dynamic=None, + compiler_config=None, + ): + super().__init__() + assert callable(callback) or callback is False or callback is None + self.callback: DynamoCallback = callback + self.prior: Union[Unset, DynamoCallback] = unset + self.first_ctx = first_ctx + self.export = export + self.compiler_config = compiler_config + self.cleanup_fns: List[Callable[[], Any]] = [] + self.enter_exit_hooks = [backend_cache_manager(self.callback)] + patch_fn() + + if dynamic is not None: + self.enter_exit_hooks.append(make_set_enable_dynamic(dynamic)) + + if on_enter is not nothing: + # this case is not common + def call_on_enter(): + on_enter() + return nothing + + self.enter_exit_hooks.append(call_on_enter) + + if backend_ctx_ctor is not contextlib.nullcontext: + # this case is not common + def call_backend_ctx(): + ctx = backend_ctx_ctor() + ctx.__enter__() + return functools.partial(ctx.__exit__, None, None, None) + + self.enter_exit_hooks.append(call_backend_ctx) + + def __enter__(self): + if config.raise_on_ctx_manager_usage: + raise RuntimeError( + "torch._dynamo.optimize(...) is used with a context manager. " + "Please refer to https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html " + "to use torch._dynamo.optimize(...) as an annotation/decorator. " + ) + self.cleanup_fns = [enter() for enter in self.enter_exit_hooks] + self.prior = set_eval_frame(self.callback) + + def __exit__(self, exc_type, exc_val, exc_tb): + assert self.prior is not unset + set_eval_frame(self.prior) + self.prior = unset + for cleanup in self.cleanup_fns: + cleanup() + self.cleanup_fns.clear() + + def __call__(self, fn): + # public api for compiler config/options + def get_compiler_config(): + return self.compiler_config + + fn = innermost_fn(fn) + + # add context containing GraphModule to any GraphModule forward functions + from torch.fx._lazy_graph_module import _LazyGraphModule + + if isinstance(fn, _LazyGraphModule) or ( + isinstance(getattr(fn, "__self__", None), _LazyGraphModule) + and fn.__name__ == "_lazy_forward" + ): + # Since dynamo will run the forward method for the GraphModule shortly + # anyways, it does not hurt to do the real recompilation here if + # this is a _LazyGraphModule. This makes it easier for dynamo to + # optimize a _LazyGraphModule. + + lazy_gm = fn if isinstance(fn, _LazyGraphModule) else fn.__self__ + + _LazyGraphModule.force_recompile(lazy_gm) + + # Assume that the underlying node metadata of `fn`, + # a GraphModule instance, accurately represents + # all instances of type(fn). 
+ code_context.get_context(lazy_gm.forward.__code__)[ + "orig_graphmodule" + ] = weakref.ref(lazy_gm) + + if not isinstance(fn, _LazyGraphModule): + # replace fn with the real forward method + fn = lazy_gm.forward + elif isinstance(fn, GraphModule): + code_context.get_context(fn.forward.__code__)[ + "orig_graphmodule" + ] = weakref.ref(fn) + + # Optimize the forward method of torch.nn.Module object + if isinstance(fn, torch.nn.Module): + mod = fn + new_mod = OptimizedModule(mod, self) + # Save the function pointer to find the original callable while nesting + # of decorators. + new_mod._torchdynamo_orig_callable = mod.forward + + # when compiling torch.nn.Module, + # provide public api OptimizedModule.get_compiler_config() + assert not hasattr(new_mod, "get_compiler_config") + new_mod.get_compiler_config = get_compiler_config + + return new_mod + assert callable(fn) + + try: + filename = inspect.getsourcefile(fn) + except TypeError: + filename = None + if ( + (filename is None or trace_rules.check(fn)) + and ( + getattr(fn, "__name__", "") not in ["_call_impl", "_wrapped_call_impl"] + ) + and filename not in DONT_WRAP_FILES + ): + # call to a builtin without a frame for us to capture + fn = external_utils.wrap_inline(fn) + + callback = self.callback + + if isinstance(self, DisableContext): + is_jit_tracing = always_false + is_fx_tracing = always_false + else: + is_jit_tracing = torch._C._is_tracing + is_fx_tracing = torch.fx._symbolic_trace.is_fx_tracing + + @functools.wraps(fn) + def _fn(*args, **kwargs): + if is_fx_tracing(): + if config.error_on_nested_fx_trace: + raise RuntimeError( + "Detected that you are using FX to symbolically trace " + "a dynamo-optimized function. This is not supported at the moment." + ) + else: + return fn(*args, **kwargs) + + if is_jit_tracing(): + if config.error_on_nested_jit_trace: + raise RuntimeError( + "Detected that you are using FX to torch.jit.trace " + "a dynamo-optimized function. This is not supported at the moment." + ) + else: + return fn(*args, **kwargs) + + cleanups = [enter() for enter in self.enter_exit_hooks] + prior = set_eval_frame(callback) + try: + return fn(*args, **kwargs) + finally: + set_eval_frame(prior) + for cleanup in cleanups: + cleanup() + + # hooks to properly handle inlining + if isinstance(self, DisableContext): + _fn._torchdynamo_disable = True # type: ignore[attr-defined] + else: + _fn._torchdynamo_inline = fn # type: ignore[attr-defined] + + # Save the function pointer to find the original callable while nesting + # of decorators. + _fn._torchdynamo_orig_callable = fn # type: ignore[attr-defined] + + # when compiling user function instead of nn.Module + # provide public api _fn.get_compiler_config() + assert not hasattr(_fn, "get_compiler_config") + _fn.get_compiler_config = get_compiler_config # type: ignore[attr-defined] + + # If the function is called using torch._dynamo.optimize decorator, we + # should prevent any type of skipping. + if callback not in (None, False): + if not hasattr(fn, "__code__"): + raise RuntimeError( + textwrap.dedent( + """ + + torch._dynamo.optimize is called on a non function object. + If this is a callable class, please wrap the relevant code into a function and optimize the + wrapper function. 
+ + >> class CallableClass: + >> def __init__(self): + >> super().__init__() + >> self.relu = torch.nn.ReLU() + >> + >> def __call__(self, x): + >> return self.relu(torch.sin(x)) + >> + >> def print_hello(self): + >> print("Hello world") + >> + >> mod = CallableClass() + + If you want to optimize the __call__ function and other code, wrap that up in a function + + >> def wrapper_fn(x): + >> y = mod(x) + >> return y.sum() + + and then optimize the wrapper_fn + + >> opt_wrapper_fn = torch._dynamo.optimize(wrapper_fn) + """ + ) + ) + always_optimize_code_objects[fn.__code__] = True + + return _fn + + +class OptimizeContext(_TorchDynamoContext): + def __init__( + self, + callback, + backend_ctx_ctor, + first_ctx=False, + *, + export=False, + dynamic=None, + compiler_config=None, + ): + def on_enter(): + install_generation_tagging_init() + + super().__init__( + callback=callback, + on_enter=on_enter, + backend_ctx_ctor=backend_ctx_ctor, + patch_fn=TorchPatcher.patch, + first_ctx=first_ctx, + export=export, + dynamic=dynamic, + compiler_config=compiler_config, + ) + + +class RunOnlyContext(_TorchDynamoContext): + def __init__(self): + # cudagraph trees relies on generation increment + def on_enter(): + torch._dynamo.mutation_guard.GenerationTracker.generation += 1 + + super().__init__(callback=False, on_enter=on_enter) + + +class DisableContext(_TorchDynamoContext): + def __init__(self): + super().__init__(callback=None) + + +def _optimize_catch_errors( + compile_fn, + hooks: Hooks, + backend_ctx_ctor=null_context, + export=False, + dynamic=None, + compiler_config=None, +): + return OptimizeContext( + convert_frame.catch_errors_wrapper(compile_fn, hooks), + backend_ctx_ctor=backend_ctx_ctor, + first_ctx=True, + export=export, + dynamic=dynamic, + compiler_config=compiler_config, + ) + + +def get_compiler_fn(compiler_fn): + from .repro.after_dynamo import wrap_backend_debug + + if hasattr(compiler_fn, "compiler_name"): + compiler_str = compiler_fn.compiler_name + elif isinstance(compiler_fn, str): + compiler_str = compiler_fn + else: + compiler_str = None + compiler_fn = lookup_backend(compiler_fn) + return wrap_backend_debug(compiler_fn, compiler_str) + + +class _NullDecorator(contextlib.nullcontext): # type: ignore[type-arg] + def __call__(self, fn): + assert callable(fn) + return fn + + +def check_if_dynamo_supported(): + if sys.version_info >= (3, 12): + raise RuntimeError("Python 3.12+ not yet supported for torch.compile") + + +def is_dynamo_supported(): + try: + check_if_dynamo_supported() + return True + except Exception: + return False + + +def check_if_inductor_supported(): + check_if_dynamo_supported() + + if sys.platform == "win32": + raise RuntimeError("Windows not yet supported for inductor") + + +def is_inductor_supported(): + try: + check_if_inductor_supported() + return True + except Exception: + return False + + +def optimize( + backend="inductor", + *, + nopython=False, + guard_export_fn=None, + guard_fail_fn=None, + disable=False, + dynamic=None, +): + """ + The main entrypoint of TorchDynamo. Do graph capture and call + backend() to optimize extracted graphs. + + Args: + backend: One of the two things: + - Either, a function/callable taking a torch.fx.GraphModule and + example_inputs and returning a python callable that runs the + graph faster. + One can also provide additional context for the backend, like + torch.jit.fuser("fuser2"), by setting the backend_ctx_ctor attribute. + See AOTAutogradMemoryEfficientFusionWithContext for the usage. 
+ - Or, a string backend name in `torch._dynamo.list_backends()` + nopython: If True, graph breaks will be errors and there will + be a single whole-program graph. + disable: If True, turn this decorator into a no-op + dynamic: If True, upfront compile as dynamic a kernel as possible. If False, + disable all dynamic shapes support (always specialize). If None, automatically + detect when sizes vary and generate dynamic kernels upon recompile. + + Example Usage:: + + @torch._dynamo.optimize() + def toy_example(a, b): + ... + """ + check_if_dynamo_supported() + # Note: The hooks object could be global instead of passed around, *however* that would make + # for a confusing API usage and plumbing story wherein we nest multiple .optimize calls. + # There is some prior art around this, w/r/t nesting backend calls are enforced to be the same + # compiler, however, this feels onerous for callback and hooks, and it feels better to give our users an + # easier to understand UX at the cost of a little more plumbing on our end. + hooks = Hooks(guard_export_fn=guard_export_fn, guard_fail_fn=guard_fail_fn) + torch._C._log_api_usage_once("torch._dynamo.optimize") + if disable or os.environ.get("TORCHDYNAMO_DISABLE", "") == "1": + return _NullDecorator() + + backend = get_compiler_fn(backend) + + # Find if backend has any extra context manager + backend_ctx_ctor = getattr(backend, "backend_ctx_ctor", null_context) + + if nopython: + return optimize_assert( + backend, + dynamic=dynamic, + hooks=hooks, + ) + return _optimize_catch_errors( + convert_frame.convert_frame(backend, hooks=hooks), + hooks, + backend_ctx_ctor, + dynamic=dynamic, + compiler_config=backend.get_compiler_config() + if hasattr(backend, "get_compiler_config") + else None, + ) + + +# TODO(voz): Consider making "explain" output alongside a run / part of a run +@patch("torch._dynamo.symbolic_convert.explain", True) +def explain(f, *extra_args, **extra_kwargs): + def inner(*args, **kwargs): + # TODO(voz): Do we want a decorator for this? + from . import reset # type: ignore[attr-defined] + + reset() + + graphs: List[torch.fx.GraphModule] = [] + break_reasons: List[Any] = [] + op_count: int = 0 + ops_per_graph: List[torch.fx.Node] = [] + out_guards: List[_guards.Guard] = [] + + def dynamo_graph_accumulating_compiler( + gm: torch.fx.GraphModule, example_inputs + ): + from .backends.debugging import _explain_graph_detail + + nonlocal graphs + nonlocal op_count + nonlocal ops_per_graph + nonlocal break_reasons + + gm, graphs, op_count, ops_per_graph, break_reasons = _explain_graph_detail( + gm, graphs, op_count, ops_per_graph, break_reasons + ) + + return gm.forward + + def guard_export_print(guards): + nonlocal out_guards + out_guards.extend(guards) + + opt_f = optimize( + dynamo_graph_accumulating_compiler, + nopython=False, + guard_export_fn=guard_export_print, + )(f) + # TODO(voz): We may have instances of `f` that mutate inputs, we should track sideeffects and reject. + opt_f(*args, **kwargs) + + graph_count = len(graphs) + + # For the explanation summary, dedupe reasons by the innermost stack frame and dedupe by it. + deduped_reasons = {} + for reason in break_reasons: + innermost_frame = reason.user_stack[-1] + # __repr__ uniquely identifies a FrameSummary so we can use it for deduping + deduped_reasons[repr(innermost_frame)] = reason + + formatted_list = "" + for idx, break_reason in enumerate(deduped_reasons.values()): + formatted_stack = "".join(traceback.format_list(break_reason.user_stack)) + msg = f"{idx + 1}. 
Reason: {break_reason.reason}\n User Stack: {formatted_stack}\n" + formatted_list += msg + + graph_break_count = graph_count - 1 + compile_time = compile_times(repr="str") + + # TODO(voz): Do we want a decorator for this? + reset() + from .backends.debugging import ExplainOutput + + return ExplainOutput( + graphs, + graph_count, + graph_break_count, + break_reasons, + op_count, + ops_per_graph, + out_guards, + compile_time, + ) + + if extra_args or extra_kwargs: + warnings.warn( + "explain(f, *args, **kwargs) is deprecated, use explain(f)(*args, **kwargs) instead. " + "If you don't migrate, we may break your explain call in the future if your user defined kwargs " + "conflict with future kwargs added to explain(f)." + ) + return inner(*extra_args, **extra_kwargs) + else: + return inner + + +class FlattenInputOutputSignature(torch.fx.interpreter.Transformer): + def __init__( + self, + m: torch.fx.GraphModule, + flat_args: Tuple[Any], + matched_input_elements_positions: List[int], + flat_results: List[Any], + matched_output_elements_positions: List[int], + example_fake_inputs: List[torch.Tensor], + flat_args_dynamic_dims: List[Set[int]], + fake_mode: Optional[fake_tensor.FakeTensorMode] = None, + ): + super().__init__(m) + + assert len(flat_args_dynamic_dims) == len(flat_args) + matched_input_elements_to_fake = { + val: example_fake_inputs[ix] + for ix, val in enumerate(matched_input_elements_positions) + } + + self.new_args = [] + for i in range(0, len(flat_args)): + arg = super().placeholder(f"arg{i}", (), {}) + if i in matched_input_elements_to_fake: + arg.node.meta["val"] = matched_input_elements_to_fake[i] + else: + # Fill node.mata["val"] with faketensor from the input, + # if it's not found in matched_input_elements_positions + if fake_mode is not None and isinstance(flat_args[i], torch.Tensor): + # TODO(zhxchen17) Also preserve all the user constraints here. 
+ arg.node.meta["val"] = fake_mode.from_tensor( + flat_args[i], + symbolic_context=StatelessSymbolicContext( + dynamic_sizes=[ + DimDynamic.DYNAMIC + if d in flat_args_dynamic_dims[i] + else DimDynamic.STATIC + for d in range(len(flat_args[i].shape)) + ], + constraint_sizes=[None] * len(flat_args[i].shape), + ), + ) + self.new_args.append(arg) + self.old_args_gen = (self.new_args[i] for i in matched_input_elements_positions) + self.matched_output_elements_positions = matched_output_elements_positions + self.flat_results = flat_results + + def placeholder(self, target, args, kwargs): + arg = next(self.old_args_gen) + if "val" in self.current_node.meta: + arg.node.meta["val"] = self.current_node.meta["val"] + if "tensor_dict" in self.current_node.meta: + arg.node.meta["tensor_dict"] = self.current_node.meta["tensor_dict"] + if "example_value" in self.current_node.meta: + arg.node.meta["example_value"] = self.current_node.meta["example_value"] + return arg + + def output(self, target, args, kwargs): + dynamo_result_flat = args[0] + lookup = [*dynamo_result_flat, *self.new_args] + new_results_flat = [] + for i in range(len(self.flat_results)): + if self.matched_output_elements_positions[i] is not None: + new_results_flat.append( + lookup[self.matched_output_elements_positions[i]] + ) + else: + const_val = self.flat_results[i] + assert isinstance(const_val, tuple(common_constant_types)) + new_results_flat.append(const_val) + return super().output(target, (new_results_flat,), {}) + + def run_node(self, n): + self.current_node = n + result_proxy = super().run_node(n) + if "val" in self.current_node.meta: + result_proxy.node.meta["val"] = self.current_node.meta["val"] + if "example_value" in self.current_node.meta: + result_proxy.node.meta["example_value"] = self.current_node.meta[ + "example_value" + ] + if self.current_node.op != "output": + result_proxy.node._rename( + getattr(self.current_node, "name", result_proxy.node.name) + ) + return result_proxy + + def transform(self): + result_gm = super().transform() + if "dynamo_flat_name_to_original_fqn" in self.module.meta: + result_gm.meta["dynamo_flat_name_to_original_fqn"] = self.module.meta[ + "dynamo_flat_name_to_original_fqn" + ] + return result_gm + + +class ExportResult(NamedTuple): + graph_module: torch.fx.GraphModule + guards: _guards.GuardsSet + # NB: Do not add new fields without overriding __iter__; people are + # destructuring so it is BC-breaking + + +def check_signature_rewritable(graph): + input_errors = [] + for node in graph.graph.nodes: + if node.op == "placeholder": + assert hasattr(node, "_dynamo_source") + source = node._dynamo_source + user_stacks = graph._source_to_user_stacks.get(source) + if user_stacks is None: + continue + assert len(user_stacks) > 0 + # In some cases we may not have a useful stack. Look for a + # useful stack + stack = None + for s in user_stacks: + if len(s) == 0: + continue + stack = s + break + if stack is None: + msg = f"{source.name()}, a closed over free variable" + else: + tb = "".join(traceback.format_list(stack)) + extra = "" + if len(user_stacks) > 1: + extra = f"(elided {len(user_stacks)-1} more accesses)" + msg = f"{source.name()}, accessed at:\n{tb}{extra}" + # TODO: option to print ALL of the stack traces at once + input_errors.append(msg) + + if input_errors: + raise UserError( + UserErrorType.INVALID_INPUT, + "Cannot export model which references tensors that are neither " + "buffers/parameters/constants nor are direct inputs. 
For each tensor, if you'd " + "like this tensor to be an explicit input, add it as a dummy argument " + "to the top-level model definition you are exporting; if you would " + "like its value to be embedded as an exported constant, wrap its access " + "in a function marked with @assume_constant_result.\n\n" + + "\n\n".join(input_errors), + ) + + +def rewrite_signature( + f_sig, + graph, + fake_mode, + flat_args, + in_spec, + example_fake_inputs, + graph_captured_input, + graph_captured_output, + dynamo_traced_result, + flat_args_dynamic_dims, +): + orig_args, orig_kwargs = pytree.tree_unflatten(flat_args, in_spec) + + def check_user_input_output(flat_values, error_type): + supported_types = [ + torch.Tensor, + torch.SymInt, + torch.SymFloat, + torch.SymBool, + torch._C.ScriptObject, + ] + list(common_constant_types) + + def is_supported_type(val): + return isinstance(val, tuple(supported_types)) + + value_type = "input" if error_type == UserErrorType.INVALID_INPUT else "output" + # We only check that the outputs are not None. Inputs can be None. + for v in flat_values: + if not is_supported_type(v): + if error_type == UserErrorType.INVALID_INPUT and v is None: + continue + + raise UserError( + error_type, + f"It looks like one of the {value_type}s with type `{type(v)}` " + "is not supported or pytree-flattenable. \n" + f"Exported graphs {value_type}s can only contain the " + f"following supported types: {supported_types}. \n" + "If you are using a custom class object, " + "please register a pytree_flatten/unflatten function " + "using `torch.utils._pytree.register_pytree_node` or " + "`torch.export.register_dataclass`.", + ) + + check_user_input_output(flat_args, UserErrorType.INVALID_INPUT) + flat_results_traced, out_spec_traced = pytree.tree_flatten(dynamo_traced_result) + check_user_input_output(flat_results_traced, UserErrorType.INVALID_OUTPUT) + + def produce_matching(debug_type, sources, candidates): + matched_elements_positions: List[Optional[int]] = [] + dict_of_source_vals = {} + for i, val in enumerate(sources): + dict_of_source_vals[id(val)] = i + + for i, val in enumerate(candidates): + if isinstance(val, tuple(common_constant_types)): + matched_elements_positions.append(None) + elif id(val) not in dict_of_source_vals: + raise AssertionError( + f"Unexpectedly found a {type(val)} in the {debug_type}.\n" + 'Please file an issue along with a paste of the logs from TORCH_LOGS="+export"' + ) + else: + matched_elements_positions.append(dict_of_source_vals[id(val)]) + + return matched_elements_positions + + matched_input_elements_positions = produce_matching( + "inputs", flat_args, graph_captured_input + ) + + assert graph_captured_output is not None + matched_output_elements_positions = produce_matching( + "outputs", list(graph_captured_output) + flat_args, flat_results_traced + ) + + new_graph = FlattenInputOutputSignature( + graph, + flat_args, + matched_input_elements_positions, + flat_results_traced, + matched_output_elements_positions, + example_fake_inputs, + flat_args_dynamic_dims, + fake_mode, + ).transform() + + # Make dynamo graph to have same input/output spec as user code + def argument_names(f_sig, args, kwargs) -> List[str]: + def signature_to_fullargspec(sig: inspect.Signature): + # Get a list of Parameter objects from the Signature object + params = list(sig.parameters.values()) + # Separate positional arguments, keyword-only arguments and varargs/varkw + args = [ + p.name + for p in params + if p.kind == inspect.Parameter.POSITIONAL_OR_KEYWORD + ] + kwonlyargs = [ + 
p.name for p in params if p.kind == inspect.Parameter.KEYWORD_ONLY + ] + varargs = next( + (p.name for p in params if p.kind == inspect.Parameter.VAR_POSITIONAL), + None, + ) + varkw = next( + (p.name for p in params if p.kind == inspect.Parameter.VAR_KEYWORD), + None, + ) + # Get default values for positional arguments and keyword-only arguments + defaults = tuple( + p.default + for p in params + if p.kind == inspect.Parameter.POSITIONAL_OR_KEYWORD + and p.default is not inspect.Parameter.empty + ) + kwonlydefaults = { + p.name: p.default + for p in params + if p.kind == inspect.Parameter.KEYWORD_ONLY + and p.default is not inspect.Parameter.empty + } + # Get annotations for parameters and return value + annotations = {} + if sig.return_annotation: + annotations = {"return": sig.return_annotation} + for parameter in params: + annotations[parameter.name] = parameter.annotation + # Return a FullArgSpec object with the extracted attributes + return inspect.FullArgSpec( + args, varargs, varkw, defaults, kwonlyargs, kwonlydefaults, annotations + ) + + fullargspec = signature_to_fullargspec(f_sig) + + # 1. Map `args` 1-to-1 to positional arguments in original signature. + input_strs = fullargspec.args[: len(args)] + + if len(args) > len(fullargspec.args): + # 2. If there are more arguments left in `args`, they map to varargs in original + # signature. Assign names as {varargs}_0, {varargs}_1, ... + assert fullargspec.varargs is not None, "More arguments than expected" + input_strs += [ + f"{fullargspec.varargs}_{i}" + for i in range(0, len(args) - len(input_strs)) + ] + elif len(args) < len(fullargspec.args): + # 3. If there are fewer arguments in `args` than `fullargspec.args`, + # it implies these are arguments either with default values, or provided in + # `kwargs`. The former can be safely ignored. Because Dynamo.export does not + # export them as part of the function signature. The latter will be handled + # in the next step. + for unprovided_arg in fullargspec.args[ + len(args) : -len(fullargspec.defaults or []) + ]: + assert unprovided_arg in kwargs, f"Missing argument {unprovided_arg}" + + # 4. Keyword arguments provided in `kwargs`. + input_strs += list(kwargs.keys()) + + # 5. Keyword-only arguments with default values if not provided are not exported + # as part of the function signature. + for kwonly_arg in fullargspec.kwonlyargs: + kwonlydefaults = fullargspec.kwonlydefaults or {} + assert ( + kwonly_arg in kwargs or kwonly_arg in kwonlydefaults + ), f"Missing keyword only argument {kwonly_arg}" + + return input_strs + + new_graph.graph._codegen = _PyTreeCodeGen( + _PyTreeInfo( + argument_names(f_sig, orig_args, orig_kwargs), + in_spec, + out_spec_traced, + ) + ) + new_graph.recompile() + return new_graph + + +def export( + f: Callable[..., Any], + *extra_args, + aten_graph: bool = False, + pre_dispatch: bool = False, + decomposition_table: Optional[ + Dict[torch._ops.OpOverload, Callable[..., Any]] + ] = None, + tracing_mode: str = "symbolic", + constraints: Optional[List[Constraint]] = None, + dynamic_shapes: Optional[Union[Dict[str, Any], Tuple[Any], List[Any]]] = None, + assume_static_by_default: bool = False, + same_signature: bool = True, + disable_constraint_solver: bool = False, + _log_export_usage: bool = True, + **extra_kwargs, +) -> Callable[..., ExportResult]: + """ + Export an input function f to a format that can be executed outside of PyTorch using the FX graph. + + Args: + f (callable): A PyTorch function to be exported. 
+ + aten_graph (bool): If True, exports a graph with ATen operators. + If False, exports a graph with Python operators. Default is False. + + pre_dispatch (bool): If True, exports a graph with ATen operators, + but before any logic in the PyTorch dispatcher has run. + This can be useful if you want to apply further transformations on a graph before running it + through autograd, autocast, or any other functionalities that are integrated into the dispatcher. + This flag is only valid if aten_graph=True is set. + Default is False. + + decomposition_table (dict): A dictionary that maps operators to their decomposition functions. + Required if aten_graph or tracing_mode is specified. Default is None. + + tracing_mode (str): If "symbolic", turn on dynamic shapes support. Default is "symbolic". + + constraints: [DEPRECATED: use ``dynamic_shapes`` instead, see below] + An optional list of constraints on the dynamic arguments + that specify their possible range of shapes. By default, shapes of + input torch.Tensors are assumed to be static. If an input torch.Tensor + is expected to have dynamic shapes, please use :func:`dynamic_dim` + to define :class:`Constraint` objects that specify the dynamics and the possible + range of shapes. See :func:`dynamic_dim` docstring for examples on + how to use it. + + dynamic_shapes: + An optional argument where the type should either be: + 1) a dict from argument names of ``f`` to their dynamic shape specifications, + 2) a tuple that specifies dynamic shape specifications for each input in original order. + If you are specifying dynamism on keyword args, you will need to pass them in the order that + is defined in the original function signature. + + The dynamic shape of a tensor argument can be specified as either + (1) a dict from dynamic dimension indices to :func:`Dim` types, where it is + not required to include static dimension indices in this dict, but when they are, + they should be mapped to None; or (2) a tuple / list of :func:`Dim` types or None, + where the :func:`Dim` types correspond to dynamic dimensions, and static dimensions + are denoted by None. Arguments that are dicts or tuples / lists of tensors are + recursively specified by using mappings or sequences of contained specifications. + + same_signature (bool): If True, rewrite the returned graph's signature to be the same as f. + + disable_constraint_solver (bool): Whether the dim constraint solver must be disabled. + + Returns: + A function that given args and kwargs, returns a tuple of (graph, guards) + Graph: An FX graph representing the execution of the input PyTorch function with the provided arguments and options. + Guards: The guards we accumulated during tracing f above + + Raises: + AssertionError: If decomposition_table is specified without setting aten_graph=True, + or if graph breaks during tracing in export. + + AssertionError: If Dynamo input and output is not consistent with traced input/output. + + Note - this headerdoc was authored by ChatGPT, with slight modifications by the author. + """ + if _log_export_usage: + log_export_usage(event="export.private_api", flags={"_dynamo"}) + + # Deal with "local variable referenced before assignment" + _f = f + _assume_static_by_default = assume_static_by_default + + def inner(*args, **kwargs): + nonlocal constraints + if constraints is not None: + if _log_export_usage: + warnings.warn( + "Using `constraints` to specify dynamic shapes for export is DEPRECATED " + "and will not be supported in the future. 
" + "Please use `dynamic_shapes` instead (see docs on `torch.export.export`).", + DeprecationWarning, + stacklevel=2, + ) + else: + constraints = _process_dynamic_shapes(_f, args, kwargs, dynamic_shapes) + f = _f + assume_static_by_default = _assume_static_by_default + check_if_dynamo_supported() + torch._C._log_api_usage_once("torch._dynamo.export") + if decomposition_table is not None: + assert ( + aten_graph + ), "Specifying a decomposition_table table or tracing mode is illegal without setting aten_graph=True" + if pre_dispatch: + assert aten_graph, "pre_dispatch=True can only be used when aten_graph=True" + f = innermost_fn(f) + call_to_inspect = f.forward if isinstance(f, torch.nn.Module) else f + original_signature = inspect.signature(call_to_inspect) + graph = None + out_guards = None + graph_captured_input = None + graph_captured_result: Optional[Tuple[torch.Tensor, ...]] = None + fake_mode = None + + def guard_export_print(guards: _guards.GuardsSet): + nonlocal out_guards + assert ( + out_guards is None + ), "whole graph export entails exactly one guard export" + out_guards = guards + + example_inputs = [] + + def dynamo_normalization_capturing_compiler( + gm: torch.fx.GraphModule, inner_example_inputs + ): + nonlocal graph + assert ( + graph is None + ), "Tried to emit a second graph during export. Tracing through 'f' must produce a single graph." + graph = gm + + nonlocal fake_mode, example_inputs + # NB: do NOT pass inner_example_inputs here, we are detecting the + # Dynamo allocated fake mode, which should be DISTINCT from a + # potential outer ambient fake mode which the user provided. + # example_inputs is always the user specified inputs, so they + # would have the wrong fake mode attached to them + fake_mode = _guards.detect_fake_mode() + example_inputs = inner_example_inputs + + def result_capturing_wrapper(*graph_inputs): + nonlocal graph_captured_result + nonlocal graph_captured_input + + graph_captured_input = graph_inputs + assert graph is not None + + named_parameters = dict(graph.named_parameters(remove_duplicate=False)) + named_buffers = dict(graph.named_buffers(remove_duplicate=False)) + + ambient_fake_mode = ( + _guards.detect_fake_mode(graph_inputs) + if _guards.detect_fake_mode(graph_inputs) is not None + else fake_mode + ) + + with ambient_fake_mode, enable_python_dispatcher(): + params_and_buffers = { + **named_parameters, + **named_buffers, + } + fake_params_buffers = dict() + + for name, value in params_and_buffers.items(): + fake_params_buffers[name] = ambient_fake_mode.from_tensor( + value, static_shapes=True + ) + + fake_graph_inputs = pytree.tree_map( + ambient_fake_mode.from_tensor, graph_inputs + ) + graph_captured_result = torch.func.functional_call( + graph, fake_params_buffers, fake_graph_inputs + ) + + return graph_captured_result + + return result_capturing_wrapper + + # Note: This is needed by rewrite_signature. We need to put it before + # optimize_assert since user program may mutate the inputs. 
+ flat_args, in_spec = pytree.tree_flatten((args, kwargs)) + + remove_from_cache(f) + constraint_violation_error = None + if tracing_mode != "symbolic": + assume_static_by_default = True + with config.patch( + specialize_int=True, + assume_static_by_default=assume_static_by_default, + automatic_dynamic_shapes=False, + capture_dynamic_output_shape_ops=True, + capture_scalar_outputs=True, + ): + opt_f = optimize_assert( + dynamo_normalization_capturing_compiler, + hooks=Hooks( + guard_export_fn=guard_export_print, + guard_fail_fn=None, + ), + export=True, + export_constraints=constraints, + )(f) + # TODO(voz): We may have instances of `f` that mutate inputs, we should track sideeffects and reject. + try: + result_traced = opt_f(*args, **kwargs) + except ConstraintViolationError as e: + constraint_violation_error = e + remove_from_cache(f) + + if ( + not disable_constraint_solver + and (shape_env := getattr(fake_mode, "shape_env", None)) is not None + and (dim_constraints := shape_env.dim_constraints) is not None + and not isinstance( + call_to_inspect, (torch._ops.OpOverloadPacket, torch._ops.OpOverload) + ) + and not trace_rules.check(call_to_inspect) + ): + dim_constraints.solve() + dim_constraints.remove_redundant_dynamic_results() + forced_specializations = dim_constraints.forced_specializations() + msg = dim_constraints.prettify_results( + original_signature, constraint_violation_error, forced_specializations + ) + if constraint_violation_error: + constraint_violation_error.args = ( + constraint_violation_error.args[0] + msg, + ) + else: + if forced_specializations: + constraint_violation_error = ConstraintViolationError(msg) + else: + log.info( + "Summary of dimension constraints:%s", + msg, + ) + + # Error if we have any constraints on static values + for k in shape_env.var_to_range.keys(): + if isinstance(k, sympy.Integer): + constraint_violation_error = ConstraintViolationError( + f"{''.join(traceback.format_list(shape_env.var_to_stack[k]))}\n" + "It appears that you're trying to set a constraint on a " + f"value which we evaluated to have a static value of {k}. " + 'Set TORCH_LOGS="+export" for more information.' + ) + if constraint_violation_error: + raise constraint_violation_error + + assert ( + graph is not None + ), "Failed to produce a graph during tracing as no tensor operations were found." 
+ assert hasattr(graph, "_source_to_user_stacks") + assert out_guards is not None, "Failed to produce guards during tracing" + assert fake_mode is not None + + log.info( + "Dynamo captured graph:\n\n%s", graph.print_readable(print_output=False) + ) + + # This check need to happened before aten_graph + # because placeholder's _source_node attribute is not preserved by make_fx + if same_signature: + check_signature_rewritable(graph) + + # NB: This is mostly hitting the cache; Dynamo already converted these + example_fake_inputs = [fake_mode.from_tensor(t) for t in example_inputs] + + if aten_graph: + # Running graph with interpreter is needed for propagating the stack_trace + def graph_with_interpreter(*args): + with torch.fx.traceback.preserve_node_meta(): + return torch.fx.Interpreter(graph).run(*args) + + with maybe_disable_fake_tensor_mode(), enable_python_dispatcher(), ( + fake_mode + ): + try: + graph = make_fx( + graph_with_interpreter, + decomposition_table=decomposition_table, + tracing_mode="real", + _allow_non_fake_inputs=True, + pre_dispatch=pre_dispatch, + _allow_fake_constant=False, + )(*example_fake_inputs) + except CondOpArgsMismatchError as e: + # Wrap the internal error to the user-facing error + raise UserError( # noqa: TRY200 + UserErrorType.DYNAMIC_CONTROL_FLOW, + str(e), + case_name="cond_operands", + ) + + assert graph is not None + for node in graph.graph.nodes: + if node.op == "get_attr" and isinstance( + getattr(graph, node.target), torch.Tensor + ): + node.meta["val"] = fake_mode.from_tensor( + getattr(graph, node.target), static_shapes=True + ) + + if same_signature: + flat_args_dynamic_dims = [ + {c.dim for c in (constraints or ()) if c.w_tensor() is x} + for x in flat_args + ] + graph = rewrite_signature( + original_signature, + graph, + fake_mode, + flat_args, + in_spec, + example_fake_inputs, + graph_captured_input, + graph_captured_result, + result_traced, # type: ignore[possibly-undefined] + flat_args_dynamic_dims, + ) + # Store constraints and inputs as metadata for user passes, e.g. turn constraints to runtime check + assert graph is not None + graph.meta["input_shape_constraints"] = ( + [constraint.serializable_spec for constraint in constraints] + if constraints + else [] + ) + + return ExportResult(graph, out_guards) + + if extra_args or extra_kwargs: + warnings.warn( + "export(f, *args, **kwargs) is deprecated, use export(f)(*args, **kwargs) instead. " + "If you don't migrate, we may break your export call in the future if your user defined kwargs " + "conflict with future kwargs added to export(f)." + ) + return inner(*extra_args, **extra_kwargs) + else: + return inner + + +def optimize_assert( + backend, + *, + hooks=Hooks(None, None), + export=False, + export_constraints=None, + dynamic=None, +): + """ + The same as `torch._dynamo.optimize(backend, nopython=True)` + """ + backend = get_compiler_fn(backend) + + # Find if backend has any extra context manager + backend_ctx_ctor = getattr(backend, "backend_ctx_ctor", null_context) + + return _optimize_catch_errors( + convert_frame.convert_frame_assert( + backend, export=export, export_constraints=export_constraints + ), + hooks, + backend_ctx_ctor, + export=export, + dynamic=dynamic, + ) + + +class TorchPatcher: + @staticmethod + @functools.lru_cache(None) + def patch(): + # A better way to disable the following would be decorate the source + # functions with @torch._disable_dynamo. However, this causes issues + # with torch.deploy internally. 
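+        # (Illustrative note) `disable(fn)` wraps `fn` so that Dynamo falls back
+        # to eager whenever it is encountered during tracing; the patches below
+        # apply it to tracing entry points and optimizer internals that should
+        # not be traced.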
+ from .decorators import disable + + torch.jit.trace = disable(torch.jit.trace) + torch.jit.trace_module = disable(torch.jit.trace_module) + torch.jit._get_trace_graph = disable(torch.jit._get_trace_graph) + torch.fx._symbolic_trace.Tracer.trace = disable( + torch.fx._symbolic_trace.Tracer.trace + ) + torch.distributions.Distribution.set_default_validate_args(False) + + from ..optim import ( + adadelta, + adagrad, + adam, + adamax, + adamw, + asgd, + lbfgs, + nadam, + radam, + rmsprop, + rprop, + sgd, + sparse_adam, + ) + + optimizer_modules = { + adadelta, + adagrad, + adam, + adamax, + adamw, + asgd, + lbfgs, + nadam, + radam, + rmsprop, + rprop, + sgd, + sparse_adam, + } + + for opt_mod in optimizer_modules: + opt_name = opt_mod.__name__.split(".")[-1] + fused_fn_name = f"_fused_{opt_name}" + single_tensor_fn_name = f"_single_tensor_{opt_name}" + + if hasattr(opt_mod, fused_fn_name): + setattr( + opt_mod, fused_fn_name, disable(getattr(opt_mod, fused_fn_name)) + ) + + optimizer_classes = [ + opt + for opt in torch.optim.__dict__.values() + if inspect.isclass(opt) and issubclass(opt, torch.optim.Optimizer) + ] + + # Note: we don't support sparsity or tracing through backwards + excluded_optimizer_classes = { + torch.optim.SparseAdam, + torch.optim.LBFGS, + } + + for opt in optimizer_classes: + if opt in excluded_optimizer_classes: + opt.step = disable(opt.step) + + if hasattr(opt, "_init_group"): + opt._init_group = disable(opt._init_group) + + @staticmethod + def suppress_torch_distributed_warnings(fn): + def inner_fn(*args, **kwargs): + warnings.filterwarnings( + "ignore", category=UserWarning, module="torch.distributed" + ) + return fn(*args, **kwargs) + + return inner_fn diff --git a/MLPY/Lib/site-packages/torch/_dynamo/exc.py b/MLPY/Lib/site-packages/torch/_dynamo/exc.py new file mode 100644 index 0000000000000000000000000000000000000000..b626595d3487782f27eaa962c4e13fdaea39df75 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/exc.py @@ -0,0 +1,335 @@ +import os +import textwrap +from enum import auto, Enum +from traceback import extract_stack, format_exc, format_list, StackSummary +from typing import cast, NoReturn, Optional + +import torch._guards + +from . import config + +from .utils import counters + + +def exportdb_error_message(case_name): + return ( + "For more information about this error, see: " + + "https://pytorch.org/docs/main/generated/exportdb/index.html#" + + case_name.replace("_", "-") + ) + + +import logging + +log = logging.getLogger(__name__) +graph_breaks_log = torch._logging.getArtifactLogger(__name__, "graph_breaks") + + +class TorchDynamoException(RuntimeError): + pass + + +class InternalTorchDynamoError(TorchDynamoException): + pass + + +class RestartAnalysis(TorchDynamoException): + pass + + +class SpeculationRestartAnalysis(RestartAnalysis): + pass + + +class UnspecializeRestartAnalysis(RestartAnalysis): + pass + + +class SkipFrame(TorchDynamoException): + pass + + +class TorchRuntimeError(TorchDynamoException): + pass + + +class InvalidBackend(TorchDynamoException): + def __init__(self, name): + super().__init__( + f"Invalid backend: {name!r}, see `torch._dynamo.list_backends()` for available backends." + ) + + +class ResetRequired(TorchDynamoException): + def __init__(self): + super().__init__( + textwrap.dedent( + """ + Must call `torch._dynamo.reset()` before changing backends. Detected two calls to + `torch.compile()` with a different backend compiler arguments. 
+ """ + ) + ) + + +class BackendCompilerFailed(TorchDynamoException): + def __init__(self, backend_fn, inner_exception): + self.backend_name = getattr(backend_fn, "__name__", "?") + self.inner_exception = inner_exception + msg = f"backend={self.backend_name!r} raised:\n{type(inner_exception).__name__}: {inner_exception}" + super().__init__(msg) + + +class Unsupported(TorchDynamoException): + def __init__(self, msg): + super().__init__(msg) + self.real_stack = torch._guards.TracingContext.extract_stack() + self.msg = msg + self.category: Optional[str] = None + self.add_to_stats() + + def remove_from_stats(self): + assert self.category is not None + counters[self.category][self.msg] -= 1 + if counters[self.category][self.msg] <= 0: + del counters[self.category][self.msg] + + def add_to_stats(self, category="unimplemented"): + self.category = category + counters[category][self.msg] += 1 + + +class RecompileError(TorchDynamoException): + pass + + +class ArgsMismatchError(Unsupported): + def __init__(self, msg): + super().__init__(msg) + + +class AttributeMutationError(Unsupported): + def __init__(self, msg): + super().__init__(msg) + + +class CondOpArgsMismatchError(ArgsMismatchError): + """ + Internal error from cond() due to arguments mismatch. + """ + + def __init__(self, msg): + super().__init__(msg) + + +class UserErrorType(Enum): + DYNAMIC_CONTROL_FLOW = auto() + ANTI_PATTERN = auto() + STANDARD_LIBRARY = auto() + CONSTRAINT_VIOLATION = auto() + DYNAMIC_DIM = auto() + INVALID_INPUT = auto() + INVALID_OUTPUT = auto() + + +class UserError(Unsupported): + def __init__(self, error_type: UserErrorType, msg, case_name=None): + """ + Type of errors that would be valid in Eager, but not supported in TorchDynamo. + The error message should tell user about next actions. + + error_type: Type of user error + msg: Actionable error message + case_name: (Optional) Unique name (snake case) for the usage example in exportdb. + """ + if case_name is not None: + assert isinstance(case_name, str) + if msg.endswith("."): + msg += " " + else: + msg += "\n" + msg += exportdb_error_message(case_name) + super().__init__(msg) + self.error_type = error_type + self.message = msg + + +class UncapturedHigherOrderOpError(TorchDynamoException): + pass + + +class IncorrectUsage(Exception): + pass + + +# These exceptions are ok to fallback to eager/graph_break. +exceptions_allowed_to_be_fallback = ( + torch._subclasses.fake_tensor.DataDependentOutputException, + torch._subclasses.fake_tensor.DynamicOutputShapeException, + torch._subclasses.fake_tensor.UnsupportedOperatorException, + torch._subclasses.fake_tensor.UnsupportedFakeTensorException, +) + + +def unimplemented_with_warning(e: Exception, code, msg: str) -> NoReturn: + # This function calls unimplemented internally and eventually graph breaks + # or falls to eager. unimplemented itself does not print any user warnings, + # i.e., its very silent. This helper function is intended when an error is + # encountered in the torch.compile stack which is worth showing as warning + # to the user. For example, if AOT Autograd backend fails with a fake tensor + # exception, its ok to fallback to eager but not silently. Here, we can use + # this function to log the message and the stack trace. 
+ graph_break_msg = format_error_msg_verbose(e, code) + graph_breaks_log.debug("%s", graph_break_msg) + log.warning(msg) + raise unimplemented(msg) from e + + +def unimplemented(msg: str) -> NoReturn: + assert msg != os.environ.get("BREAK", False) + raise Unsupported(msg) + + +def warning(msg: str) -> None: + counters["warnings"][msg] += 1 + assert msg != os.environ.get("BREAK", False) + + +# KeyError has special handling for its args +# see https://github.com/python/cpython/blob/3.11/Objects/exceptions.c#L2534 for details +class KeyErrorMsg: + def __init__(self, value): + self.value = value + + def __str__(self): + return str(self.value) + + def __repr__(self) -> str: + return self.__str__() + + +def augment_exc_message(exc: Exception, msg: str = "\n", export: bool = False) -> None: + import traceback + + exc.innermost_user_frame_summary = None # type: ignore[attr-defined] + + real_stack = get_real_stack(exc) + if real_stack is not None and len(real_stack) > 0: + exc.innermost_user_frame_summary = real_stack[-1] # type: ignore[attr-defined] + msg += f"\nfrom user code:\n {''.join(traceback.format_list(real_stack))}" + + if config.replay_record_enabled and hasattr(exc, "record_filename"): + msg += f"\nLast frame execution written to {exc.record_filename}. To run only this frame while debugging, run\ + torch._dynamo.replay('{exc.record_filename}').\n" + + if not config.verbose and hasattr(exc, "real_stack"): + msg += '\nSet TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information\n' + + if hasattr(exc, "inner_exception") and hasattr( + exc.inner_exception, "minifier_path" + ): + if hasattr(exc.inner_exception, "buck_command"): + msg += ( + f"\nMinifier script written to {exc.inner_exception.minifier_path}. Run " + f"this buck command to find the smallest traced graph " + f"which reproduces this error: {exc.inner_exception.buck_command}\n" + ) + else: + msg += ( + f"\nMinifier script written to {exc.inner_exception.minifier_path}. Run " + "this script to find the smallest traced graph which reproduces this error.\n" + ) + + if not config.suppress_errors and not export: + msg += ( + "\n\n" + "You can suppress this exception and fall back to eager by setting:\n" + " import torch._dynamo\n" + " torch._dynamo.config.suppress_errors = True\n" + ) + + old_msg = "" if len(exc.args) == 0 else str(exc.args[0]) + + if isinstance(exc, KeyError): + exc.args = (KeyErrorMsg(old_msg + msg),) + exc.args[1:] + else: + new_msg = old_msg + msg + exc.args = (new_msg,) + exc.args[1:] + + +def get_real_stack(exc: Exception, frame=None) -> Optional[StackSummary]: + real_stack = getattr(exc, "real_stack", None) + if real_stack is None: + return None + + # NB: it's possible for real_stack to be []; we still attempt to + # report a stack anyway because the stack_above_dynamo may still + # be useful for debugging + + stack_above_dynamo = [] + if frame is not None: + # NB: frame is PyInterpreterFrame on Python 3.11 and later, + # not a TRUE frame object. You can't actually feed it + # to traceback because it doesn't have enough information. + # To solve this problem, we technically should just materialize + # the frame, the same way _PyFrame_GetFrameObject would do + # (but we cannot actually do this, because this populates + # frame_obj field, which default eval frame doesn't like). + # + # Fortunately, in this case, we can hack it: there's no need + # to actually use the truly top frame, we can just extract + # from where we are right now and rely on filter_stack to + # get rid of all the dynamo frames. 
For ease of testing + # we apply this behavior to ALL Python versions + stack_above_dynamo = filter_stack(extract_stack()) + + return cast(StackSummary, stack_above_dynamo + real_stack) + + +# filter out all frames after entering dynamo +def filter_stack(stack): + user_stack = [] + for frame in stack: + if "convert_frame" in frame.filename: + break + if "eval_frame" in frame.filename or "torch._dynamo.optimize(" in frame.line: + continue + user_stack.append(frame) + + return user_stack + + +def format_error_msg_verbose( + exc: Exception, code, record_filename=None, frame=None +) -> str: + msg = ( + f"WON'T CONVERT {code.co_name} {code.co_filename} line {code.co_firstlineno}\n" + ) + msg += "=" * 10 + " TorchDynamo Stack Trace " + "=" * 10 + "\n" + msg += format_exc() + real_stack = get_real_stack(exc, frame) + if real_stack is not None: + msg += ( + "\n" + + "=" * 10 + + " The above exception occurred while processing the following code " + + "=" * 10 + + "\n\n" + ) + msg += "".join(format_list(real_stack)) + msg += "\n" + msg += "=" * 10 + + return msg + + +def format_error_msg(exc: Exception, code, record_filename=None, frame=None) -> str: + msg = os.linesep * 2 + + if config.verbose: + msg = format_error_msg_verbose(exc, code, record_filename, frame) + else: + msg = f"WON'T CONVERT {code.co_name} {code.co_filename}\ + line {code.co_firstlineno} \ndue to: \n{format_exc()}" + + return msg diff --git a/MLPY/Lib/site-packages/torch/_dynamo/external_utils.py b/MLPY/Lib/site-packages/torch/_dynamo/external_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0856ec12aa67bb0192259474afd7948f36078221 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/external_utils.py @@ -0,0 +1,103 @@ +# This module contains functions that *will be allowed* by dynamo + +import functools + +import torch +import torch.utils._pytree as pytree + +try: + import numpy as np +except ModuleNotFoundError: + np = None # type: ignore[assignment] + + +def is_compiling() -> bool: + """ + Indicates whether we are tracing/compiling with torch.compile() or torch.export(). + + If need to check specifically that TorchDynamo is used, then use + torch.compiler.is_dynamo_compiling(). + + TODO(khabinov): we should deprecate this function and use one of these two: + * torch.compiler.is_compiling(), + * torch.compiler.is_dynamo_compiling(). + It will depend on the context where to use what. + """ + return torch.compiler.is_compiling() + + +def wrap_inline(fn): + """ + Create an extra frame around fn that is not in skipfiles + """ + + @functools.wraps(fn) + def inner(*args, **kwargs): + return fn(*args, **kwargs) + + return inner + + +def call_hook(hook, *args): + """ + Used by compiled autograd to handle hook returning None + """ + result = hook(*args) + if result is None: + return args[0] + return result + + +def wrap_numpy(f): + r"""Decorator that turns a function from ``np.ndarray``s to ``np.ndarray``s into a function + from ``torch.Tensor``s to ``torch.Tensor``s. 
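+
+    A minimal sketch (illustrative; requires NumPy to be installed)::
+
+        @wrap_numpy
+        def np_add(a, b):
+            return a + b
+
+        np_add(torch.ones(2), torch.ones(2))  # tensor([2., 2.])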
+ """ + if not np: + return f + + @functools.wraps(f) + def wrap(*args, **kwargs): + args, kwargs = pytree.tree_map_only( + torch.Tensor, lambda x: x.numpy(), (args, kwargs) + ) + out = f(*args, **kwargs) + return pytree.tree_map_only(np.ndarray, lambda x: torch.as_tensor(x), out) + + return wrap + + +class FakeContext: + def __init__(self, saved_tensors): + # this will cache the results of saved_tensors + # and will no longer call into c++ binding + self.saved_tensors = saved_tensors + + +def call_backward(backward_fn, saved_tensors, *args): + grads = backward_fn(FakeContext(saved_tensors), *args) + + # in eager, we wrap in a tuple when there's only one grad output + if type(grads) is not tuple: + grads = (grads,) + + return grads + + +def untyped_storage_size(x: torch.Tensor): + return x.untyped_storage().size() + + +def call_hook_from_backward_state(*args, bw_state, hook_name: str, **kwargs): + return getattr(bw_state, hook_name)(*args, **kwargs) + + +def call_module_hooks_from_backward_state( + _, result, *args, bw_state, hooks_name: str, module_name: str +): + module = getattr(bw_state, module_name) + hooks = getattr(bw_state, hooks_name) + for hook in hooks: + new_result = hook(module, result, *args) + if new_result is not None: + result = new_result + return result diff --git a/MLPY/Lib/site-packages/torch/_dynamo/funcname_cache.py b/MLPY/Lib/site-packages/torch/_dynamo/funcname_cache.py new file mode 100644 index 0000000000000000000000000000000000000000..0b0dc1886f35fecd90de86254e22351d11fdf560 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/funcname_cache.py @@ -0,0 +1,57 @@ +import tokenize + +from typing import Dict, List, Optional + +cache: Dict[str, Dict[int, str]] = {} + + +def clearcache() -> None: + cache.clear() + + +def _add_file(filename: str) -> None: + try: + with open(filename) as f: + tokens = list(tokenize.generate_tokens(f.readline)) + except OSError: + cache[filename] = {} + return + + # NOTE: undefined behavior if file is not valid Python source, + # since tokenize will have undefined behavior. + result: Dict[int, str] = {} + # current full funcname, e.g. xxx.yyy.zzz + cur_name = "" + cur_indent = 0 + significant_indents: List[int] = [] + + for i, token in enumerate(tokens): + if token.type == tokenize.INDENT: + cur_indent += 1 + elif token.type == tokenize.DEDENT: + cur_indent -= 1 + # possible end of function or class + if significant_indents and cur_indent == significant_indents[-1]: + significant_indents.pop() + # pop the last name + cur_name = cur_name.rpartition(".")[0] + elif ( + token.type == tokenize.NAME + and i + 1 < len(tokens) + and tokens[i + 1].type == tokenize.NAME + and (token.string == "class" or token.string == "def") + ): + # name of class/function always follows class/def token + significant_indents.append(cur_indent) + if cur_name: + cur_name += "." 
+ cur_name += tokens[i + 1].string + result[token.start[0]] = cur_name + + cache[filename] = result + + +def get_funcname(filename: str, lineno: int) -> Optional[str]: + if filename not in cache: + _add_file(filename) + return cache[filename].get(lineno, None) diff --git a/MLPY/Lib/site-packages/torch/_dynamo/guards.py b/MLPY/Lib/site-packages/torch/_dynamo/guards.py new file mode 100644 index 0000000000000000000000000000000000000000..a6cd7ab94ea3c9fab444757e96eea58dfc561939 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/guards.py @@ -0,0 +1,1505 @@ +from __future__ import annotations + +import ast +import builtins +import collections +import dataclasses +import enum +import functools +import importlib +import inspect +import itertools +import logging +import math +import os +import re +import sys +import textwrap +import types +import weakref +from inspect import currentframe, getframeinfo +from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union +from weakref import ReferenceType + + +try: + import numpy as np +except ModuleNotFoundError: + np = None # type: ignore[assignment] + +import torch +import torch.utils._device +from torch._dynamo.source import ( + is_from_local_source, + TensorProperty, + TensorPropertySource, +) + +from torch._guards import ( + DuplicateInputs, + Guard, + GuardBuilderBase, + GuardEnvExpr, + GuardSource, + Source, +) + +from torch._logging import structured +from torch.fx.experimental.symbolic_shapes import ( + EqualityConstraint, + is_symbolic, + SYMPY_INTERP, +) +from torch.utils._traceback import format_frame, report_compile_source_on_error +from torch.utils.weak import TensorWeakRef + +from . import config, convert_frame, exc, mutation_guard +from .eval_frame import set_guard_error_hook +from .source import AttrSource, DefaultsSource, LocalSource, TypeSource +from .types import CacheEntry, ExtraState, GuardedCode, GuardFail, GuardFn # noqa: F401 +from .utils import ( + common_constant_types, + dict_keys_repr, + guard_failures, + istype, + key_is_id, + key_to_id, + orig_code_map, + tensor_always_has_static_shape, + tuple_iterator_getitem, + tuple_iterator_len, +) + +log = logging.getLogger(__name__) +guards_log = torch._logging.getArtifactLogger(__name__, "guards") +recompiles_log = torch._logging.getArtifactLogger(__name__, "recompiles") +recompiles_verbose_log = torch._logging.getArtifactLogger( + __name__, "recompiles_verbose" +) +verbose_guards_log = torch._logging.getArtifactLogger(__name__, "verbose_guards") + +TensorGuards = torch._C._dynamo.guards.TensorGuards +check_obj_id = torch._C._dynamo.guards.check_obj_id +check_type_id = torch._C._dynamo.guards.check_type_id +dict_version = torch._C._dynamo.guards.dict_version + + +# For user stack printing +@functools.lru_cache(None) +def uninteresting_files(): + import torch._dynamo.external_utils + + mods = [ + torch._dynamo.external_utils, + ] + return {inspect.getfile(m) for m in mods} + + +CLOSURE_VARS = { + "___check_type_id": check_type_id, + "___check_obj_id": check_obj_id, + "___odict_getitem": collections.OrderedDict.__getitem__, + "___key_to_id": key_to_id, + "___dict_version": dict_version, + "___dict_contains": lambda a, b: a in b, + "___tuple_iterator_len": tuple_iterator_len, + "___tuple_iterator_getitem": tuple_iterator_getitem, + "__math_isnan": math.isnan, + "__numpy_isnan": None if np is None else np.isnan, + "inf": float("inf"), + "__load_module": importlib.import_module, + "utils_device": torch.utils._device, + "device": torch.device, + "___from_numpy": + # If 
not numpy array, piggy back on e.g. tensor guards to check type + (lambda a: torch.as_tensor(a) if isinstance(a, (np.generic, np.ndarray)) else a), + "torch": torch, + "inspect": inspect, +} + +if sys.version_info[:2] <= (3, 8): + # [Note: Python Version <= 3.8] + # This branch should be dropped when we drop support for Python 3.8. + # Reason: 'ast.unparse' function was introduced in Python 3.9. + + try: + import astunparse # type: ignore[import] + + def _ast_unparse(node: ast.AST) -> str: + return astunparse.unparse(node).replace("\n", "") + + HAS_UNPARSE_FUNCTIONS = True + except ImportError: + HAS_UNPARSE_FUNCTIONS = False + pass +else: + HAS_UNPARSE_FUNCTIONS = True + + def _ast_unparse(node: ast.AST) -> str: + return ast.unparse(node).replace("\n", "") + + +def strip_function_call(name): + """ + "___odict_getitem(a, 1)" => "a" + "a.layers[slice(2)][0]._xyz" ==> "a" + "getattr(a.layers[slice(2)][0]._abc, '0')" ==> "a" + "getattr(getattr(a.x[3], '0'), '3')" ==> "a" + "a.layers[slice(None, -1, None)][0]._xyz" ==> "a" + """ + # recursively find valid object name in function + valid_name = re.compile("[A-Za-z_].*") + curr = "" + for char in name: + if char in " (": + curr = "" + elif char in "),[]": + if curr and curr != "None" and valid_name.match(curr): + return strip_function_call(curr) + else: + curr += char + + return strip_getattr_getitem(name) + + +def strip_getattr_getitem(name): + """ + "a[1]" => "a" + "a.foo" => "a" + """ + return re.split(r"[.\[]", name)[0] + + +def get_verbose_code_part(code_part, guard): + extra = "" + if guard.user_stack: + for fs in reversed(guard.user_stack): + if fs.filename not in uninteresting_files(): + extra = f" # {format_frame(fs, line=True)}" + break + elif guard.stack: + extra = f" # {format_frame(guard.stack.summary()[-1])}" + + return f"{code_part:<60}{extra}" + + +def convert_to_concrete_values(size_or_stride): + converted: List[Optional[int]] = [] + for dim in size_or_stride: + if not is_symbolic(dim): + converted.append(dim) + else: + assert isinstance(dim, torch.SymInt) + converted.append(dim.node.maybe_as_int()) + return converted + + +def get_tensor_guard_code_part(value, name, sizes, strides): + pytype = type(value) + dispatch_key = ( + torch._C._dispatch_keys(value) | torch._C._dispatch_tls_local_include_set() + ) - torch._C._dispatch_tls_local_exclude_set() + dtype = value.dtype + device_index = value.device.index + requires_grad = value.requires_grad + guard_str = ( + f"check_tensor({name}, {pytype.__qualname__}, {dispatch_key}, {dtype}, " + f"device={device_index}, requires_grad={requires_grad}, size={sizes}, stride={strides})" + ) + return guard_str + + +# The ready to eval generated code (possibly multiple parts) for a guard, plus +# the original guard object that created it for provenance +@dataclasses.dataclass +class GuardCodeList: + code_list: List[str] + guard: Guard + + +class GuardBuilder(GuardBuilderBase): + def __init__( + self, + id_ref: Callable[[Any], str], + source_ref: Callable[[Source], str], + lookup_weakrefs: Callable[[object], ReferenceType[object]], + local_scope: Dict[str, object], + global_scope: Dict[str, object], + check_fn_manager: CheckFunctionManager, + ): + self.id_ref = id_ref + self.source_ref = source_ref + self.lookup_weakrefs = lookup_weakrefs + self.scope: Dict[str, Dict[str, object]] = {"L": local_scope, "G": global_scope} + self.scope["__builtins__"] = builtins.__dict__.copy() + for ( + name, + package_module, + ) in torch.package.package_importer._package_imported_modules.items(): + name = 
name.replace(">", "_").replace("<", "_").replace(".", "_dot_") + # Write the package module into the scope so that we can import it + self.scope["__builtins__"][name] = package_module + # Write the demangled name to the scope so that we can use it + self.scope[name] = package_module + + self.argnames: List[str] = [] + # Code is python expression strings generated for each guard + self.code: List[GuardCodeList] = [] + # shape_env_code is only used by builder and is used for + # shape env code. This exists only because we need to make sure + # shape env guards get run after tensor match guards (since the + # tensor match guards make sure we actually have tensors) + self.shape_env_code: List[GuardCodeList] = [] + + # [Note - On Eager Tensor Guards] + # Most of the time, we generate Python code in a guard to directly + # check various properties. However, tensors are a bit special; + # it is too slow to check their properties one-by-one in Python. + # Instead, there is a C++ function TensorGuards.check which takes + # all of the tensor arguments and checks them all against compile-time + # examples entirely in C++. Thus, every time we process a + # TENSOR_MATCH guard, we just add another entry to + # tensor_check_names/tensor_check_examples, saying "for this local, + # check it against this example", and it all ends up getting + # swept up into a single call to ___check_tensors. Invariant: + # len(tensor_check_names) == len(tensor_check_examples). + # TODO: something here + self.tensor_check_names: List[str] = [] + self.tensor_check_examples: List[torch.Tensor] = [] + self.tensor_check_guards: List[Guard] = [] + + self.check_fn_manager: CheckFunctionManager = check_fn_manager + # Keep track of weak references of objects with ID_MATCH guard. This + # info is stored alongside optimized_code and check_fn and is used to + # limit the number of cache entries with same ID_MATCH'd object. + self.id_matched_objs: Dict[str, ReferenceType[object]] = {} + + # Warning: use this with care! This lets you access what the current + # value of the value you are guarding on is. You probably don't want + # to actually durably save this value though (because it's specific + # to this frame!) Instead, you should be reading out some property + # (like its type) which is what you permanently install into the + # guard code. + def get(self, name: str) -> Any: + return eval(name, self.scope, CLOSURE_VARS) + + # Registers the usage of the source name referenced by the + # string (or stored in the Guard) as being guarded upon. It's important + # to call this before generating some code that makes use of 'guard', + # because without this call, we won't actually bind the variable + # you reference in the actual guard closure (oops!) 
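+    # For example, for a guard named "a.layers[0]._xyz" the base name "a" is
+    # what gets recorded in self.argnames (via strip_function_call /
+    # strip_getattr_getitem), while the full name is returned for use in the
+    # generated guard expression.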
+ def arg_ref(self, guard: Union[str, Guard]) -> str: + name: str + if isinstance(guard, str): + name = guard + else: + name = guard.name + base = strip_getattr_getitem(strip_function_call(name)) + if base not in self.argnames: + if re.match(r"[a-zA-Z0-9_]+", base): + if re.match(r"^\d+$", base): + log.warning("invalid var name: %s", guard) + self.argnames.append(base) + + return name + + def _guard_on_attribute(self, guard: Guard, attr_name: str, guard_fn): + attr_source = AttrSource(guard.originating_source, attr_name) + # Copy the stack info + new_guard = Guard( + attr_source, guard_fn, stack=guard.stack, user_stack=guard.user_stack + ) + new_guard.create(self) + + def TYPE_MATCH(self, guard: Guard) -> None: + # ___check_type_id is same as `id(type(x)) == y` + t = type(self.get(guard.name)) + obj_id = self.id_ref(t) + code = f"___check_type_id({self.arg_ref(guard)}, {obj_id})" + self._produce_guard_code(guard, [code]) + + def DICT_VERSION(self, guard: Guard): + # ___check_dict_version is same as `dict_version(x) == y` + ref = self.arg_ref(guard) + version = dict_version(self.get(guard.name)) + code = f"___dict_version({ref}) == {version}" + self._produce_guard_code(guard, [code]) + + def DICT_CONTAINS(self, guard: Guard, key: str, invert: bool): + dict_ref = self.arg_ref(guard) + + maybe_not = "not " if invert else "" + code = f"{maybe_not}___dict_contains({key!r}, {dict_ref})" + return self._produce_guard_code(guard, [code]) + + def BOOL_FALSE(self, guard: Guard): + # Guard on the runtime value being 'False', + # can be faster than seemingly equivalent checks like DICT_KEYS for empty dict + # + # WARNING: this guard is not safe to use generally. It only works if the runtime + # value is of a type that supports bool(), and some types e.g. Tensor do not. + # Only use this guard in cases you can guarantee the runtime type will be friendly. + # (e.g. Specialized NNModule with mutation protection via setattr) + # + # Why not simply check the runtime type inside this guard? It's slow enough to defeat + # the purpose of using this guard, which itself is supposed to be a faster alternative + # to DICT_KEYS. + ref = self.arg_ref(guard) + code = f"not {ref}" + self._produce_guard_code(guard, [code]) + + def ID_MATCH(self, guard: Guard): + # ___check_obj_id is same as `id(x) == y` + if isinstance(guard.originating_source, TypeSource): + # optional optimization to produce cleaner/faster guard code + return self.TYPE_MATCH( + Guard(guard.originating_source.base, GuardBuilder.TYPE_MATCH) # type: ignore[arg-type] + ) + + ref = self.arg_ref(guard) + val = self.get(guard.name) + code = f"___check_obj_id({ref}, {self.id_ref(val)})" + self._produce_guard_code(guard, [code]) + + # Keep track of ID_MATCH'd objects. This will be used to modify the + # cache size logic + if isinstance(guard.originating_source, LocalSource): + # TODO(janimesh) - This is currently restricted to nn.Module objects + # because many other ID_MATCH'd objects fail - like DeviceMesh. + # Increase the scope of ID_MATCH'd objects. 
+ if isinstance(val, torch.nn.Module): + local_name = guard.originating_source.local_name + weak_id = self.lookup_weakrefs(val) + if weak_id is not None: + self.id_matched_objs[local_name] = weak_id + + def NAME_MATCH(self, guard: Guard): + obj = self.get(guard.name) + self._guard_on_attribute(guard, "__name__", GuardBuilder.EQUALS_MATCH) + + def DATA_PTR_MATCH(self, guard: Guard): + obj = self.get(guard.name) + code = f"{self.arg_ref(guard)}.data_ptr() == {obj.data_ptr()}" + self._produce_guard_code(guard, [code]) + + def HASATTR(self, guard: Guard): + assert isinstance( + guard.originating_source, AttrSource + ), f"invalid source {guard.name}" + base_source = guard.originating_source.base + base = base_source.name() + attr = guard.originating_source.member + + ref = self.arg_ref(base) + val = hasattr(self.get(base), attr) + code = None + if val: + code = f"hasattr({ref}, {attr!r})" + else: + code = f"not hasattr({ref}, {attr!r})" + + self._produce_guard_code(guard, [code], provided_guarded_object=self.get(base)) + + def FUNCTORCH_STACK_MATCH(self, guard: Guard): + # Invalidate functorch code if current level is different than + # the one when FX graph was generated + # if torch._C._functorch.peek_interpreter_stack() is not None: + cis = torch._functorch.pyfunctorch.retrieve_all_functorch_interpreters() + states = [ci.get_state() for ci in cis] + code = [f"torch._functorch.pyfunctorch.compare_functorch_state({states})"] + self._produce_guard_code(guard, code) + + def EQUALS_MATCH(self, guard: Guard): + ref = self.arg_ref(guard) + val = self.get(guard.name) + t = type(val) + if np: + np_types: Tuple[Type[Any], ...] = ( + np.int8, + np.int16, + np.int32, + np.int64, + np.uint8, + np.uint16, + np.uint32, + np.uint64, + np.float16, + np.float32, + np.float64, + ) + else: + np_types = () + ok_types = tuple( + common_constant_types + | { + type, + list, + tuple, + set, + frozenset, + slice, + range, + torch.Size, + *np_types, + } + ) + if istype(val, dict): + assert all( + istype(x, ok_types) for x in itertools.chain(val.keys(), val.values()) + ) + else: + assert istype( + val, + ok_types, + ), f"Unexpected type {type(val)}, not in {ok_types}" + + # Special case for nan because float("nan") == float("nan") evaluates to False + if istype(val, float) and math.isnan(val): + self.TYPE_MATCH(guard) + code = list() + code.append(f"__math_isnan({ref})") + self._produce_guard_code(guard, code) + return + # Python math library doesn't support complex nan, so we need to use numpy + elif istype(val, complex) and np.isnan(val): + self.TYPE_MATCH(guard) + code = list() + code.append(f"__numpy_isnan({ref})") + self._produce_guard_code(guard, code) + return + + code = list() + + # If matching equality against list/tuple, we must also check that + # the internal types match. (TODO: what about nested lists?) + if istype(val, (list, tuple)): + # NB: SEQUENCE_LENGTH takes care of the outer __check_type_id test + self.SEQUENCE_LENGTH(guard) + + for idx, elem in enumerate(val): + code.append( + f"___check_type_id({ref}[{idx}], {self.id_ref(type(elem))})" + ) + else: + # Add type check to prevent equality check between tensor and non-tensor. 
+            self.TYPE_MATCH(guard)
+
+        if istype(val, torch.Size):
+            val = tuple(val)
+
+        # Code objects cannot be compared against their string representation
+        # I.e `eval(f"{compile('2+2','','exec')!r}")` raises SyntaxError
+        assert not istype(val, types.CodeType)
+
+        # TODO: It feels like it would be better to just implement our own
+        # equality test in C that handles all of the necessary type checking
+        # and NaN tests
+        code.append(f"{ref} == {val!r}")
+        self._produce_guard_code(guard, code)
+
+    def CONSTANT_MATCH(self, guard: Guard):
+        val = self.get(guard.name)
+        if istype(val, (bool, type(None), types.CodeType)):
+            self.ID_MATCH(guard)
+        else:
+            self.EQUALS_MATCH(guard)
+
+    def NN_MODULE(self, guard: Guard):
+        self.ID_MATCH(guard)
+        ref = self.arg_ref(guard)
+        val = self.get(guard.name)
+
+        def setup_guard():
+            assert istype(val.training, bool)
+            self._guard_on_attribute(guard, "training", GuardBuilder.CONSTANT_MATCH)
+
+        if hasattr(val, "training"):
+            # There are cases where a monkeypatched object has a guard made between __new__ and __init__
+            setup_guard()
+        else:
+            exc.unimplemented(f"Guard setup for uninitialized class {type(val)}")
+
+    def FUNCTION_MATCH(self, guard: Guard):
+        """things like torch.add and user defined functions"""
+        if guard.is_local():
+            return self.ID_MATCH(guard)
+
+    def CLOSURE_MATCH(self, guard: Guard):
+        """matches a closure by __code__ id."""
+        if guard.is_local():
+            val = self.get(guard.name)
+            # Strictly only want user-defined functions
+            if type(val) == types.FunctionType and hasattr(val, "__code__"):
+                self._guard_on_attribute(guard, "__code__", GuardBuilder.HASATTR)
+                self._guard_on_attribute(guard, "__code__", GuardBuilder.FUNCTION_MATCH)
+            else:
+                self.FUNCTION_MATCH(guard)
+
+    def BUILTIN_MATCH(self, guard: Guard):
+        return self.FUNCTION_MATCH(guard)
+
+    def PYMODULE_MATCH(self, guard: Guard):
+        return self.FUNCTION_MATCH(guard)
+
+    def SEQUENCE_LENGTH(self, guard):
+        # This guard is used to check the length of PySequence objects like list,
+        # tuple, collections.deque etc
+        ref = self.arg_ref(guard)
+        value = self.get(guard.name)
+        t = type(value)
+
+        self.TYPE_MATCH(guard)
+        code = list()
+        if len(value) == 0:
+            code.append(f"not {ref}")
+        else:
+            code.append(f"len({ref}) == {len(value)}")
+
+        self._produce_guard_code(guard, code)
+
+    def DICT_LENGTH(self, guard):
+        self.SEQUENCE_LENGTH(guard)
+
+    def TUPLE_ITERATOR_LEN(self, guard):
+        ref = self.arg_ref(guard)
+        value = self.get(guard.name)
+        t = type(value)
+
+        self.TYPE_MATCH(guard)
+        code = list()
+        code.append(f"___tuple_iterator_len({ref}) == {tuple_iterator_len(value)}")
+
+        self._produce_guard_code(guard, code)
+
+    # TODO(voz): Deduplicate w/ AOTAutograd dupe input guards
+    def DUPLICATE_INPUT(self, guard, source_b):
+        ref_a = self.arg_ref(guard)
+        ref_b = self.arg_ref(source_b.name())
+
+        code = [f"{ref_b} is {ref_a}"]
+        self._produce_guard_code(guard, code)
+
+    def DICT_KEYS(self, guard):
+        # Guard on the keys and their order
+        ref = self.arg_ref(guard)
+        value = self.get(guard.name)
+        t = type(value)
+
+        self.TYPE_MATCH(guard)
+        code = list()
+        any_key_is_id = any(key_is_id(k) for k in value.keys())
+        const_keys_repr = dict_keys_repr(
+            key_to_id(value),
+            local=is_from_local_source(guard.originating_source),
+        )
+        if any_key_is_id:
+            code.append(f"___key_to_id({ref}) == {const_keys_repr}")
+        else:
+            code.append(f"list({ref}.keys()) == {const_keys_repr}")
+
+        self._produce_guard_code(guard, code)
+
+    def WEAKREF_ALIVE(self, guard):
+        self._produce_guard_code(guard, [f"{self.arg_ref(guard)} is
not None"]) + + def NN_MODULE_PARAM_NAMES(self, guard): + ref = self.arg_ref(guard) + value = self.get(guard.name) + t = type(value) + keys = {k for k, v in value.named_parameters()} + + self.TYPE_MATCH(guard) + code = list() + code.append(f"{{k for k, v in {ref}.named_parameters()}} == {keys!r}") + + self._produce_guard_code(guard, code) + + def DICT_CONST_KEYS(self, guard): + """Constant keys match""" + ref = self.arg_ref(guard) + value = self.get(guard.name) + t = type(value) + + self.TYPE_MATCH(guard) + code = list() + code.append(f"list({ref}.keys()) == {list(value.keys())!r}") + + self._produce_guard_code(guard, code) + + def OBJECT_MUTATION(self, guard: Guard): + mutation_guard.watch(self.get(guard.name), self.check_fn_manager) + + def GRAD_MODE(self, guard: Guard): + pass # we always guard on this via GlobalStateGuard() + + def DETERMINISTIC_ALGORITHMS(self, guard: Guard): + pass # we always guard on this via GlobalStateGuard() + + def TORCH_FUNCTION_STATE(self, guard: Guard): + pass # we always guard on this via GlobalStateGuard() + + def DEFAULT_DEVICE(self, guard: Guard): + """Guard on CURRENT_DEVICE per torch.utils._device""" + assert guard.source is GuardSource.GLOBAL + import torch.utils._device as m + + self._produce_guard_code( + guard, [f"utils_device.CURRENT_DEVICE == {m.CURRENT_DEVICE!r}"] + ) + + def BACKEND_MATCH(self, guard: Guard): + """Guard on backend matching based on id of current_backend""" + assert guard.source is GuardSource.GLOBAL + backend_id = ( + f"{id(torch._dynamo.eval_frame.guarded_backend_cache.current_backend)}" + ) + code = [f"___check_current_backend({backend_id})"] + self._produce_guard_code(guard, code) + + def SHAPE_ENV(self, guard: Guard): + # Let's handle ShapeEnv guards. To do this, we will resolve + # shape variables to sources from tracked_fakes. This must happen after + # tensor checks. + assert guard.name == "" + output_graph = self.check_fn_manager.output_graph + # NB: self.output_graph can be None in the debug_nops tests + fs = output_graph.tracked_fakes + input_contexts = [a.symbolic_context for a in fs] + + def get_sources(t_id, dim): + # Looks up base sources mapped to a tensor id and uses them to create + # sources for the corresponding tensor dimension. + return [ + TensorPropertySource(source, TensorProperty.SIZE, dim) + for source in output_graph.tracked_fakes_id_to_source[t_id] + ] + + if output_graph.export_constraints: + from sympy import Symbol + + source_pairs: List[Tuple[Source, Source]] = [] + derived_equalities: List[ # type: ignore[type-arg] + Tuple[Source, Union[Source, Symbol], Callable] + ] = [] + phantom_symbols: Dict[str, Symbol] = {} + for constraint in output_graph.export_constraints: + if constraint.t_id in output_graph.tracked_fakes_id_to_source: + torch.export.dynamic_shapes._process_equalities( + constraint, + get_sources, + output_graph.shape_env, + source_pairs, + derived_equalities, + phantom_symbols, + ) + else: + log.warning("Untracked tensor used in export constraints") + equalities_inputs = EqualityConstraint( + source_pairs=source_pairs, + derived_equalities=derived_equalities, + phantom_symbols=list(phantom_symbols.values()), + warn_only=False, + ) + else: + equalities_inputs = None + guards = output_graph.shape_env.produce_guards( + [a.fake for a in fs], + [a.source for a in fs], + input_contexts=input_contexts, + equalities_inputs=equalities_inputs, + source_ref=self.source_ref, + # Export keeps static. 
+ ignore_static=(not self.check_fn_manager.output_graph.export), + ) + # When exporting, we may work with the shape constraints some more in + # postprocessing, so don't freeze yet + if not self.check_fn_manager.output_graph.export: + output_graph.shape_env.freeze() + for shape_guard in guards: + self._produce_guard_code(guard, [shape_guard], shape_env=True) + + def TENSOR_MATCH(self, guard: Guard, value=None): + if guard.is_nn_module() or guard.originating_source.is_dict_key(): + self.ID_MATCH(guard) + else: + if isinstance(value, TensorWeakRef): + value = value() + + value = value if value is not None else self.get(guard.name) + assert isinstance(value, torch.Tensor) + + tensor_name = self.arg_ref(guard) + # [Note - On Export Tensor Guards] + # + # In eager mode, tensor guards are evaluated through C++, in guards.cpp + # see [Note - On Eager Tensor Guards] for more info. + # + # In export mode, we instead maintain parallel logic between C++ and python + # here, with an exception of checking the dispatch key - with the idea that a dispatch key + # is an entirely runtime notion that would make no sense to keep in an exported graph. + # + # Now, this idea is okay, but to paraphrase @ezyang, this mental model is sufficient for now, although + # not entirely true. + # For example, suppose one of the input tensors had the negative dispatch key. + # You should end up with a graph that is specialized for tensors that have a negative dispatch key. + # If you allow a Tensor that does NOT have this bit set, you will accidentally run it "as if" it were negated. + # Now, negative key only shows up for complex numbers, and most likely, the exported to target doesn't + # support this feature at all, but the point stands that :some: tensor state only shows up on dispatch key. + # TODO(voz): Either populate a dispatch_key check into the guards, or error on users passing in an unsupported + # subset of keys during export. + # + # The list of tensor fields and calls we care about can be found in `terms` below. + # TODO(voz): We are missing storage offset in all our tensor guards? + code: List[str] = list() + if self.check_fn_manager.output_graph.export: + self.TYPE_MATCH(guard) + terms = [ + "dtype", + "device", + "requires_grad", + "ndimension()", + ] + + for term in terms: + real_value = self.get(tensor_name + "." + term) + if istype(real_value, (torch.device, torch.dtype)): + # copy pasted from EQUALS_MATCH + code.append(f"str({tensor_name}.{term}) == {str(real_value)!r}") + else: + code.append(f"{tensor_name}.{term} == {real_value}") + else: + self.tensor_check_names.append(tensor_name) + self.tensor_check_examples.append(value) + self.tensor_check_guards.append(guard) + + # A frame is valid for reuse with dynamic dimensions if the new + # (user-requested) dynamic dimensions are a subset of the old + # (already compiled) dynamic dimensions. + # + # It's a little non-obvious why you'd want this: in particular, + # if an already compiled frame matches all of the guards, why + # not just use it, why force a recompile? + # + # We force it for two reasons: + # + # - The user *required* us to compile with a new dynamic dimension, + # we should not ignore that and serve up the old, specialized + # frame. Listen to the user! + # + # - In fact, we are obligated to *raise an error* if we fail to + # make the requested dimension dynamic. If we don't + # recompile, we can't tell if that dimension can actually be + # made dynamic. 
+ # + # If the new dynamic dims are a subset of the old, we already know + # we can make them dynamic (since we made them dynamic in old). + # This is slightly unsound, because maybe your input size is + # [s0, s0, s1] and so you can do it dynamic if you say dynamic + # dims {0, 1, 2} but you can't if you only do {0, 2} (because now + # the second s0 is specialized). But we're not entirely sure if + # this is a good idea anyway lol... (if you want to try removing + # this logic, be my guest! -- ezyang 2024) + # + assert guard.source is not None + static, reason = tensor_always_has_static_shape( + value, is_tensor=True, guard_source=guard.source + ) + if not static: + if hasattr(value, "_dynamo_dynamic_indices"): + code.append( + f"(({tensor_name}._dynamo_dynamic_indices.issubset({value._dynamo_dynamic_indices})) if hasattr({tensor_name}, '_dynamo_dynamic_indices') else True)" # noqa: B950 + ) + # In the case of us not having any dynamic dimension indices, we compiled the frame with no chance of + # raising for this specific tensor - and any inputs with more dynamic user directives specified must be recompiled. + else: + code.append( + f"hasattr({tensor_name}, '_dynamo_dynamic_indices') == False" + ) + if len(code) > 0: + self._produce_guard_code(guard, code) + + # A util that appends guarded code, or, in the case of export, adds data onto guards + def _produce_guard_code( + self, guard, code_list, provided_guarded_object=None, shape_env=False + ): + # WARNING: It is important that cur_frame/caller do NOT stay in + # the current frame, because they will keep things live longer + # than they should. See TestMisc.test_release_module_memory + cur_frame = currentframe() + assert cur_frame is not None + caller = cur_frame.f_back + del cur_frame + assert caller is not None + func_name = getframeinfo(caller)[2] + del caller + # We use func_name for export, so might as well get a nice defensive check out of it + assert func_name in dir( + self.__class__ + ), f"_produce_guard_code must be called from inside GuardedCode. Called from {func_name}" + + if shape_env: + self.shape_env_code.append(GuardCodeList(code_list, guard)) + else: + self.code.append(GuardCodeList(code_list, guard)) + + # Not all guards have names, some can be installed globally (see asserts on HAS_GRAD) + if provided_guarded_object is None: + name_valid = guard.name is not None and guard.name != "" + + guarded_object = self.get(guard.name) if name_valid else None + else: + guarded_object = provided_guarded_object + + guarded_object_type = ( + weakref.ref(type(guarded_object)) if guarded_object is not None else None + ) + obj_ref = None + # Not necessary to have weakref for Enum type, but there is a bug that + # makes hasattr(guarded_object.__class__, "__weakref__") return True. + if hasattr(guarded_object.__class__, "__weakref__") and not isinstance( + guarded_object, enum.Enum + ): + obj_ref = weakref.ref(guarded_object) + + guard.set_export_info( + func_name, + guarded_object_type, + code_list, + obj_ref, + ) + + +# Common Sub-Expression Elimination for Python expressions. +# +# There are 2 steps to this pass: +# 1. Count the frequency of each sub-expression (i.e. inner +# node in the AST tree) +# +# 2. Replace those that occur more than once by a fresh variable 'v'. +# 'v' will be defined in the 'preface' list (output argument to +# 'NodeTransformer') +# +# NB: the use of 'ast.unparse' while visiting the nodes makes this pass +# quadratic on the depth of the tree. 
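+#
+# For example, with USE_THRESHOLD = 1, the code parts "x.a.b == 1" and
+# "x.a.c == 2" share the repeated inner node "x.a"; it is hoisted into the
+# preface as "_var0 = x.a" and the expressions are rewritten to
+# "_var0.b == 1" and "_var0.c == 2".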
+# +# NB: this pass creates a new variable for each AST node that is repeated +# more than 'USE_THRESHOLD'. e.g. if 'a.b.c.d' is used 10 times, 'a.b.c' +# and 'a.b' are also used 10 times. So, there will be a new variable for +# each of them. +class PyExprCSEPass: + # Maximum number of times a given expression can be used without being + # replaced by a fresh variable. + USE_THRESHOLD = 1 + + # Ad-Hoc: AST nodes this pass focuses on. + ALLOWED_NODE_TYPES = (ast.Attribute, ast.Call, ast.Subscript) + + @dataclasses.dataclass + class Config: + expr_count: Dict[str, int] + expr_to_name: Dict[str, str] + + class ExprCounter(ast.NodeVisitor): + def __init__(self, config: PyExprCSEPass.Config) -> None: + self._config = config + + def visit(self, node: ast.AST) -> Any: + if isinstance(node, PyExprCSEPass.ALLOWED_NODE_TYPES): + self._config.expr_count[_ast_unparse(node)] += 1 + super().visit(node) + + class Replacer(ast.NodeTransformer): + def __init__( + self, + config: PyExprCSEPass.Config, + gen_name: Callable[[], str], + ) -> None: + super().__init__() + self._config = config + self._gen_name = gen_name + self.preface: List[str] = [] + + def visit(self, node: ast.AST) -> Any: + if isinstance(node, PyExprCSEPass.ALLOWED_NODE_TYPES): + expr = _ast_unparse(node) + + # Replacement only occurs if a given expression is used more + # than once. + if self._config.expr_count[expr] > PyExprCSEPass.USE_THRESHOLD: + if expr not in self._config.expr_to_name: + # Parent 'visit' is called so that we CSE the inner expressions first. + # + # The resulting expression is used as right-hand-side of the variable + # assignment. i.e. we are CSE-ing the children before the parents. + # + # Indexing still uses the old 'node', since that's what was counted + # by the 'NodeVisitor'. + node_ = super().visit(node) + expr_ = _ast_unparse(node_) + var_name = self._gen_name() + self.preface.append(f"{var_name} = {expr_}") + self._config.expr_to_name[expr] = var_name + else: + var_name = self._config.expr_to_name[expr] + return ast.Name(var_name, ast.Load()) + + return super().visit(node) + + def __init__(self) -> None: + self._counter = 0 + self._config = self.Config( + expr_count=collections.defaultdict(lambda: 0), expr_to_name={} + ) + + def _new_var(self, prefix: str = "_var") -> str: + name = f"{prefix}{self._counter}" + self._counter += 1 + return name + + def count(self, exprs: List[str]) -> None: + counter = self.ExprCounter(self._config) + for e in exprs: + try: + counter.visit(ast.parse(e)) + except SyntaxError as ex: + log.exception("Failed to visit expr at line %s.\n%s", ex.lineno, e) + raise + + def replace(self, expr: str) -> Tuple[List[str], str]: + replacer = self.Replacer(self._config, self._new_var) + new_node = replacer.visit(ast.parse(expr)) + return replacer.preface, _ast_unparse(new_node) + + +def must_add_nn_module_guards(guard): + # For config.guard_nn_modules=False, we can skip all the guards that + # originate from inside of nn module except for a few categories. + return ( + # Guard for defaults + isinstance(guard.originating_source, DefaultsSource) + # Guard using dict tags if the config flag is set + or ( + config.guard_nn_modules_using_dict_tags + and guard.create_fn is GuardBuilder.NN_MODULE + ) + ) + + +class DeletedGuardFn: + pass + + +# NB: Naively, you'd expect this to only be a function that produces +# the callable that constitutes the guard. 
However, there is some
+# delicate handling for invalidating this check function when the
+# locals/globals get invalidated, so there's some extra state
+# we have to hold in this manager class.
+class CheckFunctionManager:
+    def __init__(
+        self,
+        output_graph=None,
+        guard_fail_fn: Optional[Callable[[GuardFail], None]] = None,
+    ):
+        guards = output_graph.guards if output_graph else None
+        self._weakrefs: Dict[int, ReferenceType[object]] = {}
+        self.output_graph = output_graph
+        w_builder = None
+
+        def source_ref(source):
+            guard_source = source.guard_source()
+            if guard_source is GuardSource.CONSTANT:
+                # No need to track constants
+                return source.name()
+            assert w_builder
+            r_builder = w_builder()
+            assert r_builder is not None
+            return r_builder.arg_ref(source.name())
+
+        builder = GuardBuilder(
+            self.id_ref,
+            source_ref,
+            self.lookup_weakrefs,
+            output_graph.local_scope,
+            output_graph.global_scope,
+            self,
+        )
+
+        # Break retain cycle. See test_release_scope_memory
+        def cleanup_builder(weak_b):
+            b = weak_b()
+            if b:
+                b.scope = None
+
+        # Break retain cycle. See test_release_input_memory
+        w_builder = weakref.ref(builder, cleanup_builder)
+
+        for guard in sorted(guards or [], key=Guard.sort_key):
+            if (
+                not config.guard_nn_modules
+                and guard.is_nn_module()
+                # Default func args must be guarded on.
+                # TODO: we could make use of 'DefaultsSource' and offer a .guard.is_defaults() API
+                and "__defaults__" not in guard.name
+                and "__kwdefaults__" not in guard.name
+                and (config.skip_nnmodule_hook_guards or "hooks" not in guard.name)
+            ):
+                continue
+
+            guard.create(builder)
+        self.check_fn = self.compile_check_fn(builder, guards, guard_fail_fn)
+        # Keep track of weak references of objects with ID_MATCH guard. This
+        # info is stored alongside optimized_code and check_fn and is used to
+        # limit the number of cache entries with same ID_MATCH'd object.
+        # TODO(janimesh) - Currently this information is stored as an attr on
+        # the check_fn itself to avoid changing the CacheEntry data structure in
+        # eval_frame.c. In future, we should probably replace check_fn with a
+        # queryable data structure such that this information is already present
+        # in some form.
+        self.check_fn.id_matched_objs = builder.id_matched_objs
+
+        # NB - We have to be very careful when cleaning up here. Because of the
+        # invalidate function, we can create a weakref finalizer that keeps
+        # `self` alive for very long. Sometimes by mistake, we can run
+        # invalidate for a type/object (check id_ref method) that Python can
+        # leak by design, preventing us from calling the finalizer. In that
+        # case, the `self` will be alive even though the cache entry will be
+        # deleted (check invalidate method), which can cause a memory leak,
+        # e.g., not setting output_graph = None can keep hold of nn_modules.
+        self._weakrefs.clear()
+        self.output_graph = None
+
+    def compile_check_fn(self, builder, guards_out, guard_fail_fn):
+        # see parallel handling of ".0" / "___implicit0" in _eval_frame.c
+        largs = builder.argnames
+        largs += ["**___kwargs_ignored"]
+
+        guards_log.debug("GUARDS:")
+
+        # Don't report this guard, it's always the same, useless!
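+        # (It is appended to code_parts directly rather than via add_code_part,
+        # so it still runs at guard-check time but is not echoed to the guards log.)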
+ code_parts = ["___check_global_state()"] + verbose_code_parts = code_parts[:] + structured_guard_fns = [] + + def add_code_part(code_part, guard, log_only=False): + verbose_code_part = get_verbose_code_part(code_part, guard) + guards_log.debug("%s", verbose_code_part) + + structured_guard_fns.append( + lambda: { + "code": code_part, + "stack": structured.from_traceback(guard.stack.summary()) + if guard.stack + else None, + "user_stack": structured.from_traceback(guard.user_stack) + if guard.user_stack + else None, + } + ) + + if verbose_guards_log.isEnabledFor(logging.DEBUG): + maybe_stack = "" + maybe_user_stack = "" + if guard is not None: + if guard.stack: + maybe_stack = f"\nStack:\n{''.join(guard.stack.format())}" + if guard.user_stack: + maybe_user_stack = ( + f"\nUser stack:\n{''.join(guard.user_stack.format())}" + ) + verbose_guards_log.debug( + "Guard: %s%s%s", + code_part, + maybe_stack, + maybe_user_stack, + ) + + if not log_only: + code_parts.append(code_part) + verbose_code_parts.append(verbose_code_part) + + seen = set() + for gcl in builder.code: + for code in gcl.code_list: + if code not in seen: + add_code_part(code, gcl.guard) + seen.add(code) + + tensor_check_names = builder.tensor_check_names + check_tensors_fn = None + check_tensors_verbose_fn = None + if tensor_check_names: + assert ( + not self.output_graph.export + ), "Illegal to set tensor_check_names in export." + tensor_check_examples = builder.tensor_check_examples + + dynamic_dims_sizes = [ + convert_to_concrete_values( + self.output_graph.tensor_weakref_to_sizes_strides[t]["size"] + ) + for t in tensor_check_examples + ] + + dynamic_dims_strides = [ + convert_to_concrete_values( + self.output_graph.tensor_weakref_to_sizes_strides[t]["stride"] + ) + for t in tensor_check_examples + ] + + tensor_guards = TensorGuards( + *tensor_check_examples, + dynamic_dims_sizes=dynamic_dims_sizes, + dynamic_dims_strides=dynamic_dims_strides, + ) + check_tensors_fn = tensor_guards.check + check_tensors_verbose_fn = tensor_guards.check_verbose + tensor_check_args = ", ".join( + tensor_check_names + ["tensor_check_names=tensor_check_names"] + ) + # Do this manually, to un-stagger the guards in log message + code_parts.append(f"___check_tensors({tensor_check_args})") + verbose_code_parts.append(f"___check_tensors({tensor_check_args})") + tensor_check_guards = builder.tensor_check_guards + + for i, name in enumerate(tensor_check_names): + # This is a copy of what guards.cpp checks against + # Keep this in sync with TensorCheck constructor + t = tensor_check_examples[i] + sizes = dynamic_dims_sizes[i] + strides = dynamic_dims_strides[i] + code_part = get_tensor_guard_code_part(t, name, sizes, strides) + add_code_part(code_part, tensor_check_guards[i], log_only=True) + + aotautograd_guards: List[GuardEnvExpr] = ( + self.output_graph.tracing_context.guards_context.aotautograd_guards + if self.output_graph + else [] + ) + for guard in aotautograd_guards: + if isinstance(guard, DuplicateInputs): + source_a = guard.input_source_a + source_b = guard.input_source_b + add_code_part(f"{source_a.name()} is {source_b.name()}", None) + else: + raise RuntimeError(f"Unknown GuardEnvExpr: {guard}") + + # TODO: the "guard" here is actually just the top level SHAPE_ENV + # which is useless. Get ShapeEnv to pass in more provenance. 
+ for gcl in builder.shape_env_code: + for code in gcl.code_list: + add_code_part(code, gcl.guard) + + # OK, all done generating guards + torch._logging.trace_structured( + "dynamo_guards", payload_fn=lambda: [f() for f in structured_guard_fns] + ) + + global_state = convert_frame.initial_global_state + if global_state is None: + # we should only hit this case in NopTests() + global_state = convert_frame.GlobalStateGuard() + closure_vars = { + "___check_tensors": check_tensors_fn, + "___check_tensors_verbose": check_tensors_verbose_fn, + "___check_global_state": global_state.check, + "___check_current_backend": torch._dynamo.eval_frame.check_current_backend, + "tensor_check_names": tensor_check_names, + **SYMPY_INTERP, + **CLOSURE_VARS, + } + + unique_code_parts = list(unique(code_parts)) + make_guard_fn_args = ", ".join(closure_vars.keys()) + guard_body, pycode = build_guard_function(unique_code_parts, make_guard_fn_args) + + if os.environ.get("TORCHDYNAMO_PRINT_GUARDS", None) == "1": + print("GUARDS\n", guard_body) + + out: Dict[str, Any] = dict() + + # We don't put builder.scope as the globals in exec call because + # guard_fn.__globals__ becomes equal to builder.scope. This causes + # guard_fn to hold a referece to f_locals sitting in builder.scope["L"] + globals_for_guard_fn = {"G": builder.scope["G"]} + try: + exec(pycode, globals_for_guard_fn, out) + except SyntaxError as ex: + log.exception("Failed to exec guard at line %s.\n%s", ex.lineno, pycode) + raise + guard_fn = out["___make_guard_fn"](*closure_vars.values()) + guard_fn.closure_vars = closure_vars + # TODO(whc) maybe '.code_parts' was only kept around for the guard callback? so we don't need both + guard_fn.args = largs + guard_fn.code_parts = code_parts + guard_fn.verbose_code_parts = verbose_code_parts + # Grab only G, but preserve "G" because guards access it as "G" + guard_fn.global_scope = globals_for_guard_fn + guard_fn.guard_fail_fn = guard_fail_fn + # will be populated by a non-owning reference to CacheEntry/ExtraState + # when the CacheEntry is constructed + guard_fn.cache_entry = None + guard_fn.extra_state = None + return guard_fn + + def invalidate(self): + # Some tests reveal that CheckFunctionManager has no attribute + # check_fn, but this case should not be of any concern. + # This case doesn't seem easy to repro. + if ( + hasattr(self, "check_fn") + and self.check_fn is not DeletedGuardFn + and (cache_entry := self.check_fn.cache_entry) is not None + and (extra_state := self.check_fn.extra_state) is not None + ): + assert isinstance(cache_entry, CacheEntry) + assert isinstance(extra_state, ExtraState) + extra_state.invalidate(cache_entry) + self.check_fn.cache_entry = None + self.check_fn.extra_state = None + self.check_fn = DeletedGuardFn + + def id_ref(self, obj): + """add a weakref, return the id""" + try: + if id(obj) not in self._weakrefs: + # We will clear the _weakrefs dict at the end of __init__ + # function, which will delete the callbacks as well. Therefore, + # we are using a finalizer which is kept alive. 
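+                # (weakref.finalize registers self.invalidate to run when obj is
+                # garbage collected, which drops the stale cache entry.)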
+ self._weakrefs[id(obj)] = weakref.ref(obj) + weakref.finalize(obj, self.invalidate) + except TypeError: + pass # cannot weakref bool object + return id(obj) + + def lookup_weakrefs(self, obj): + """Lookup the _weakrefs created in id_ref function for ID_MATCH'd objects""" + if id(obj) in self._weakrefs: + return self._weakrefs[id(obj)] + return None + + +def build_guard_function(code_parts, closure_args) -> Tuple[str, str]: + from torch._inductor.utils import IndentedBuffer + + if HAS_UNPARSE_FUNCTIONS: + csepass = PyExprCSEPass() + csepass.count(code_parts) + + def replace(expr: str) -> Tuple[List[str], str]: + return csepass.replace(expr) + + else: + + def replace(expr: str) -> Tuple[List[str], str]: + return [], expr + + # Generate the inner body of the guard function. + # i.e. if-chain of the guard expressions. + guard_body = IndentedBuffer() + for expr in code_parts: + preface, expr = replace(expr) + guard_body.writelines(preface) + guard_body.writeline(f"if not ({expr}):") + with guard_body.indent(): + guard_body.writeline("return False") + + # Wrap the inner body into the actual guard function. + guard = IndentedBuffer() + guard.writeline("def guard(L):") + with guard.indent(): + guard.splice(guard_body) + guard.writeline("return True") + + # Wrap the whole guard function into another function + # with the closure variables. + make_guard_fn = IndentedBuffer() + make_guard_fn.writeline(f"def ___make_guard_fn({closure_args}):") + with make_guard_fn.indent(): + make_guard_fn.splice(guard) + make_guard_fn.writeline("return guard") + + return guard_body.getvalue(), make_guard_fn.getvalue() + + +def is_recompiles_enabled(): + return torch._logging._internal.log_state.is_artifact_enabled("recompiles") + + +def is_recompiles_verbose_enabled(): + return torch._logging._internal.log_state.is_artifact_enabled("recompiles_verbose") + + +def get_guard_fail_reason( + guard_fn: GuardFn, + code: types.CodeType, + f_locals: Dict[str, object], +) -> str: + """ + Return the reason why `guard_fn` failed. + Updates `guard_failures` with the generated reason. + Only the first failed check of guard_fn is reported. + """ + scope = {"L": f_locals, "G": guard_fn.global_scope["G"]} + scope.update(guard_fn.closure_vars) + scope["___check_tensors"] = scope["___check_tensors_verbose"] + reasons: List[str] = [] + for part in guard_fn.verbose_code_parts: + global_scope = dict(guard_fn.global_scope) + global_scope["__compile_source__"] = part + with report_compile_source_on_error(): + try: + fail_reason = eval(part, global_scope, scope) + except Exception as e: + if is_recompiles_verbose_enabled(): + continue + else: + raise + # Only ___check_tensors knows how to return a fancy fail reason; + # for everything else we just report the code that failed + + if isinstance(fail_reason, bool) and not fail_reason: + fail_reason = part + if isinstance(fail_reason, str): + reasons.append(fail_reason) + if not is_recompiles_verbose_enabled(): + break + + reason_str = "\n".join(reasons) + guard_failures[orig_code_map[code]].append(reason_str) + + try: + if guard_fn.guard_fail_fn is not None: + guard_fn.guard_fail_fn( + GuardFail(reason_str or "unknown reason", orig_code_map[code]) + ) + except Exception as e: + log.exception( + "Failure in guard_fail_fn callback - raising here will cause a NULL Error on guard eval", + ) + + return reason_str + + +def get_and_maybe_log_recompilation_reason( + cache_entry, frame: types.FrameType +) -> List[str]: + """ + Return the list of guard failure reasons using cache_entry. 
Logs the recompilation reason if `recompiles` logging is enabled.
+    Raises a RecompileError if `config.error_on_recompile` is enabled.
+    """
+    reasons = []
+    while cache_entry is not None:
+        reason = get_guard_fail_reason(
+            cache_entry.check_fn, cache_entry.code, frame.f_locals
+        )
+        if reason:
+            reasons.append(reason)
+        cache_entry = cache_entry.next
+
+    code = frame.f_code
+
+    # at least one of "recompiles" or "recompiles_verbose" is enabled
+    do_recompiles_log = is_recompiles_enabled() or is_recompiles_verbose_enabled()
+
+    if do_recompiles_log or config.error_on_recompile:
+        if is_recompiles_verbose_enabled():
+            failures = "\n\n".join(
+                f"guard {i} failures:\n" + textwrap.indent(reason, "- ")
+                for i, reason in enumerate(reasons)
+            )
+        else:
+            failures = textwrap.indent("\n".join(reasons), "- ")
+        guard_failure_details = (
+            f"triggered by the following guard failure(s):\n{failures}"
+        )
+        message = (
+            f"Recompiling function {code.co_name} in {code.co_filename}:{code.co_firstlineno}\n"
+            f"{textwrap.indent(guard_failure_details, ' ')}"
+        )
+        if do_recompiles_log:
+            if is_recompiles_verbose_enabled():
+                recompiles_verbose_log.debug(message)
+            else:
+                recompiles_log.debug(message)
+        if config.error_on_recompile:
+            raise exc.RecompileError(message)
+
+    return reasons
+
+
+def guard_error_hook(
+    guard_fn: GuardFn,
+    code: types.CodeType,
+    f_locals: Dict[str, object],
+    index: int,
+    last: bool,
+):
+    print(
+        f"ERROR RUNNING GUARDS {code.co_name} {code.co_filename}:{code.co_firstlineno}"
+    )
+    print("lambda " + ", ".join(guard_fn.args) + ":")
+    print(" ", " and\n ".join(guard_fn.code_parts))
+    local_scope = {"L": f_locals, **guard_fn.closure_vars}
+    for guard in guard_fn.code_parts:
+        try:
+            eval(guard, guard_fn.global_scope, local_scope)
+        except:  # noqa: B001,E722
+            print(f"Malformed guard:\n{guard}")
+
+
+set_guard_error_hook(guard_error_hook)
+
+
+def unique(seq):
+    seen = set()
+    for x in seq:
+        if x not in seen:
+            yield x
+            seen.add(x)
+
+
+def make_dupe_guard(obj_source, dupe_source):
+    # Note - we may end up in a situation where we invoke something like
+    # def fn(x, y)
+    # with fn(x, x)
+    # Prior to the addition of tracking to all relevant objects, we would handle this just fine by
+    # eagerly re-entering VB and rewrapping inputs, correctly creating graphargs and placeholders. However,
+    # with tracking on inputs, duplicate inputs or aliased relationships may end up getting erased here -
+    # in the fn(x, x) example above, the traced call would look like a graph with a single input.
+    # In order to ensure that we do not reuse fn(x, x) for fn(x, y), we create a duplicate input guard.
+
+    # Note - we may not have a source, that is fine, it just means we had an object that is safe to
+    # leave unsourced - like a local list created and discharged entirely within a local scope.
+    if dupe_source and dupe_source != obj_source:
+        ser_source_is_local = is_from_local_source(dupe_source)
+        source_is_local = is_from_local_source(obj_source)
+        # Note - both must be local, or global, or we will run afoul of a lack of merging in how we currently
+        # reconcile guards builder scopes in compile_check_fn. This technically means we miss a guard here,
+        # so maybe we should do this refactor before we land this...
+        # TODO(voz): Combine local and global guard builders.
+        if ser_source_is_local == source_is_local:
+            # Note - this is a little aggressive - these being duplicate input does not always matter.
+            # However, this should always be a sound guard to add here.
+ return functools.partial(GuardBuilder.DUPLICATE_INPUT, source_b=dupe_source) + return None + + +def install_guard(*guards, skip=0): + """ + Add dynamo guards to the current tracing context. + + Args: + guards: guard(s) to add + skip: number of stack frames to ignore for debug stack trace + """ + from torch._guards import TracingContext + + collect_debug_stack = guards_log.isEnabledFor( + logging.DEBUG + ) or verbose_guards_log.isEnabledFor(logging.DEBUG) + add = TracingContext.get().guards_context.dynamo_guards.add + for guard in guards: + assert isinstance(guard, Guard) + add(guard, collect_debug_stack=collect_debug_stack, skip=skip + 1) diff --git a/MLPY/Lib/site-packages/torch/_dynamo/hooks.py b/MLPY/Lib/site-packages/torch/_dynamo/hooks.py new file mode 100644 index 0000000000000000000000000000000000000000..edffccc7c73e96e6af4dfe079d9d8aa2b504ccce --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/hooks.py @@ -0,0 +1,12 @@ +import dataclasses + +from typing import Callable, Optional + +from torch._guards import GuardsSet +from .types import GuardFail + + +@dataclasses.dataclass +class Hooks: + guard_export_fn: Optional[Callable[[GuardsSet], None]] = None + guard_fail_fn: Optional[Callable[[GuardFail], None]] = None diff --git a/MLPY/Lib/site-packages/torch/_dynamo/logging.py b/MLPY/Lib/site-packages/torch/_dynamo/logging.py new file mode 100644 index 0000000000000000000000000000000000000000..5e0fb984b3307f5986bdddafbb56f3997a5d9e25 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/logging.py @@ -0,0 +1,57 @@ +import itertools +import logging + +from torch.hub import _Faketqdm, tqdm + +# Disable progress bar by default, not in dynamo config because otherwise get a circular import +disable_progress = True + + +# Return all loggers that torchdynamo/torchinductor is responsible for +def get_loggers(): + return [ + logging.getLogger("torch.fx.experimental.symbolic_shapes"), + logging.getLogger("torch._dynamo"), + logging.getLogger("torch._inductor"), + ] + + +# Creates a logging function that logs a message with a step # prepended. +# get_step_logger should be lazily called (i.e. at runtime, not at module-load time) +# so that step numbers are initialized properly. 
e.g.: + +# @functools.lru_cache(None) +# def _step_logger(): +# return get_step_logger(logging.getLogger(...)) + +# def fn(): +# _step_logger()(logging.INFO, "msg") + +_step_counter = itertools.count(1) + +# Update num_steps if more phases are added: Dynamo, AOT, Backend +# This is very inductor centric +# _inductor.utils.has_triton() gives a circular import error here + +if not disable_progress: + try: + import triton # noqa: F401 + + num_steps = 3 + except ImportError: + num_steps = 2 + pbar = tqdm(total=num_steps, desc="torch.compile()", delay=0) + + +def get_step_logger(logger): + if not disable_progress: + pbar.update(1) + if not isinstance(pbar, _Faketqdm): + pbar.set_postfix_str(f"{logger.name}") + + step = next(_step_counter) + + def log(level, msg, **kwargs): + logger.log(level, "Step %s: %s", step, msg, **kwargs) + + return log diff --git a/MLPY/Lib/site-packages/torch/_dynamo/mutation_guard.py b/MLPY/Lib/site-packages/torch/_dynamo/mutation_guard.py new file mode 100644 index 0000000000000000000000000000000000000000..abd48febe14843bfe8f1a57bac1f77ad55651afe --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/mutation_guard.py @@ -0,0 +1,126 @@ +# mypy: disable-error-code="method-assign" + +import functools +import weakref + +import torch.nn +from torch.nn import Module + +from .utils import ExactWeakKeyDictionary, is_lazy_module + + +class MutationTracker: + db = ExactWeakKeyDictionary() + + def __init__(self): + self.mutation_count = 0 + self.watchers = [] + + def on_mutation(self, name): + self.mutation_count += 1 + tmp = self.watchers + self.watchers = [] + for ref in tmp: + guarded = ref() + if guarded is not None: + guarded.invalidate(ref) + + def track(self, guarded_code): + self.watchers.append(weakref.ref(guarded_code)) + + +def watch(obj, guarded_code): + """invalidate guarded_code when obj is mutated""" + ensure_patched(type(obj)) + + if obj not in MutationTracker.db: + MutationTracker.db[obj] = MutationTracker() + tracker = MutationTracker.db[obj] + tracker.track(guarded_code) + + +def ensure_patched(cls): + if getattr(cls, "___needs_mutation_patch", True): + cls.___needs_mutation_patch = False + original_setattr = cls.__setattr__ + + @functools.wraps(original_setattr) + def custom_setattr(self, key, value): + try: + MutationTracker.db[self].on_mutation(key) + except KeyError: + pass + return original_setattr(self, key, value) + + cls.__setattr__ = custom_setattr + + +class GenerationTracker: + generation = 0 + dynamic_classes = ExactWeakKeyDictionary() + generation_values = ExactWeakKeyDictionary() + + @classmethod + def tag(cls, obj): + cls.generation_values[obj] = cls.generation + + @staticmethod + def mark_class_dynamic(cls): + assert issubclass(cls, torch.nn.Module) + GenerationTracker.dynamic_classes[cls] = True + + @classmethod + def get_generation_value(cls, obj): + if obj not in cls.generation_values: + return -1 + return cls.generation_values[obj] + + @classmethod + def check(cls, obj): + return ( + obj in cls.generation_values + and cls.generation_values[obj] == cls.generation + ) + + +def is_dynamic_nn_module(obj): + """Check for nn.Modules() created dynamically or mutated""" + if isinstance(obj, torch.nn.Module) and "forward" in obj.__dict__: + # A monkey patched `.forward` indicates something wacky is going on + return True + if hasattr(obj, "torchdynamo_force_dynamic"): + return obj.torchdynamo_force_dynamic + if is_lazy_module(obj): + return False + dyn = GenerationTracker.dynamic_classes.get(type(obj)) or GenerationTracker.check( + obj + ) + 
return dyn + + +def install_generation_tagging_init(): + """ + Monkey patch torch.nn.Module.__init__ and torch.nn.Module.__setstate__ + so we can detect nn.Module instances created dynamically inside forward methods. + """ + + if getattr(Module, "___needs_generation_tag_patch", True): + init = Module.__init__ + + def patched_init(self, *args, **kwargs): + init(self, *args, **kwargs) + GenerationTracker.tag(self) + + Module.__init__ = patched_init + + setstate = Module.__setstate__ + + def patched_setstate(self, state): + setstate(self, state) + GenerationTracker.tag(self) + + Module.__setstate__ = patched_setstate + + Module.___needs_generation_tag_patch = False # type: ignore[attr-defined] + + GenerationTracker.generation += 1 diff --git a/MLPY/Lib/site-packages/torch/_dynamo/output_graph.py b/MLPY/Lib/site-packages/torch/_dynamo/output_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..c8164aacad717dc4dd3c84feb2f36bbe6dbdcbbe --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/output_graph.py @@ -0,0 +1,2073 @@ +import collections +import contextlib +import copy +import functools +import itertools +import logging +import operator +import re +import sys +import traceback +import weakref +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, NamedTuple, Optional, Set, Tuple, Union + +import sympy + +import torch._guards + +import torch._logging + +import torch.nn +import torch.utils._pytree as pytree +from torch import fx +from torch._guards import ( + Checkpointable, + GlobalContextCheckpointState, + GuardsCheckpointState, + Source, + TracingContext, +) +from torch._utils_internal import signpost_event +from torch.fx._lazy_graph_module import _make_graph_module # type: ignore[attr-defined] +from torch.fx.experimental._backward_state import BackwardState +from torch.fx.experimental.sym_node import SymNode +from torch.fx.experimental.symbolic_shapes import free_symbols, is_symbolic, ShapeEnv +from torch.utils._python_dispatch import is_traceable_wrapper_subclass +from torch.utils._sympy.interp import sympy_interp +from torch.utils._sympy.reference import PythonReferenceAnalysis +from torch.utils.weak import WeakTensorKeyDictionary + +from . 
import config, logging as torchdynamo_logging, variables +from .backends.registry import CompiledFn, CompilerFn +from .bytecode_transformation import ( + create_call_function, + create_instruction, + Instruction, + unique_id, +) +from .code_context import code_context +from .codegen import PyCodegen +from .current_scope_id import enter_new_scope +from .exc import ( + BackendCompilerFailed, + exceptions_allowed_to_be_fallback, + SkipFrame, + unimplemented, + unimplemented_with_warning, +) +from .guards import GuardBuilder, install_guard +from .mutation_guard import is_dynamic_nn_module +from .side_effects import SideEffects +from .source import ( + AttrSource, + BackwardStateSource, + ConstantSource, + GlobalStateSource, + is_constant_source, + is_from_local_source, + LocalSource, + ParamBufferSource, + ShapeEnvSource, + TensorProperty, + TensorPropertySource, +) +from .utils import ( + checkpoint_params, + CleanupHook, + clone_inputs, + count_calls, + counters, + dynamo_timed, + get_instruction_source_311, + get_static_address_type, + graph_break_reasons, + increment_op_count, + lazy_format_graph_code, + lazy_format_graph_tabular, + LazyString, + nn_module_proxy, + same, +) +from .variables.base import VariableTracker +from .variables.builder import ( + BackwardStateGraphArg, + GraphArg, + TrackedFake, + VariableBuilder, + wrap_fx_proxy, +) +from .variables.nn_module import NNModuleVariable +from .variables.tensor import ( + NumpyNdarrayVariable, + SymNodeVariable, + TensorVariable, + UnspecializedPythonVariable, +) + +from .variables.torch_function import TensorWithTFOverrideVariable + +log = logging.getLogger(__name__) +graph_tabular_log = torch._logging.getArtifactLogger(__name__, "graph") +graph_code_log = torch._logging.getArtifactLogger(__name__, "graph_code") +graph_sizes_log = torch._logging.getArtifactLogger(__name__, "graph_sizes") +trace_call_log = torch._logging.getArtifactLogger(__name__, "trace_call") + + +class OutputGraphState(NamedTuple): + input_source_to_var: Dict[Source, VariableTracker] + tracked_fakes: List[TrackedFake] + guard_state: GuardsCheckpointState + nn_modules: Optional[Dict[str, torch.nn.Module]] + register_finalizer_fns: List[Callable[[fx.GraphModule], None]] + global_state: Optional[Dict[str, bool]] + param_name_to_source: Optional[Dict[str, Source]] + side_effects: SideEffects + timestamp: int + non_compliant_ops: Set[torch._ops.OpOverload] + compliant_custom_ops: Set[torch._ops.OpOverload] + + def diff(self, other: "OutputGraphState", *, prefix: str = "") -> Optional[str]: + for k in self._fields: + if k == "guard_state": + r = self.guard_state.diff(other.guard_state) + if r is not None: + return r + continue + elif k == "side_effects": + r = self.side_effects.diff(other.side_effects) + if r is not None: + return r + continue + + sv = getattr(self, k) + ov = getattr(other, k) + if sv != ov: + return f"{prefix}{k} mismatch: {sv} != {ov}" + return None + + # Back compat .guards api + @property + def guards(self): + return self.guard_state.dynamo_guards + + +@functools.lru_cache(None) +def _step_logger(): + return torchdynamo_logging.get_step_logger(log) + + +@dataclass +class GraphCompileReason: + """Stores why a given output graph was compiled; i.e. what caused the graph break.""" + + reason: str + user_stack: List[traceback.FrameSummary] + + # Indicates if this was a graph compile reason due to graph break. 
+ graph_break: bool = True + + def __post_init__(self): + if self.graph_break: + graph_break_reasons.append(self) + + +def _get_gen_rand_values_fn(random_calls): + def _gen_rand_values(): + return [fn(*args, **kwargs) for fn, args, kwargs in random_calls] + + return _gen_rand_values + + +class FakeRootModule(torch.nn.Module): + """Trick the constructor of fx.GraphModule""" + + def __init__(self, nn_modules: Dict[str, torch.nn.Module]): + super().__init__() + for k, v in nn_modules.items(): + setattr(self, k, v) + + def __repr__(self): + return "FakeRootModule(...)" + + +class WrapperBackend: + def __init__(self, backend: CompilerFn): + self.backend: CompilerFn = backend + + def __call__(self, gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]): + self.restore = checkpoint_params(gm) + self.gm = gm + copy_gm = copy.deepcopy(self.gm) + self.candidate = self.backend(copy_gm, example_inputs) + + if self.candidate is None or self.candidate is self.gm.forward: + return self.gm.forward + + if not config.verify_correctness: + return self.candidate + + # if verify_correctness=True + try: + correct = self.gm.forward(*clone_inputs(example_inputs)) + result = self.candidate(*clone_inputs(example_inputs)) + + # TODO: replace `same` function with the one in testing + if same(correct, result): + return self.candidate + + raise RuntimeError(f"incorrect results of backend {self}") + return self.gm.forward + + except Exception: + log.exception("error in verify_correctness") + raise + finally: + self.restore() + + +Scope = Dict[str, object] + + +class OutputGraph(Checkpointable[OutputGraphState]): + """ + Wrapper class to hold outputs of InstructionTranslator. Mainly the + generated fx.Graph. + + OutputGraph is 1:1 with a frame being processed. Each frame is associated + with some root InstructionTranslator. When user code calls a function, + we construct a InliningInstructionTranslator that continues to write into + the root InstructionTranslator's OutputGraph. + """ + + def __init__( + self, + code_options: Dict[str, Any], + compiler_fn: Optional[CompilerFn], + root_tx, + export: bool, + export_constraints, + frame_state, + local_scope: Scope, + global_scope: Scope, + f_code, + ): + super().__init__() + self.tracers = [SubgraphTracer(self, export_root=export)] + # Map from graph input's `Source` to its `VariableTracker` to + # de-duplicate graph inputs by source and reuse the tracker + self.input_source_to_var: Dict[Source, VariableTracker] = {} + self.export = export + self.export_constraints = export_constraints + self.frame_state = frame_state + self.tensor_weakref_to_sizes_strides = WeakTensorKeyDictionary() + self.cleanup_hooks: List[Callable[[], Any]] = [] + # compile_id is an id number for the current torch.compile + self.compile_id: int = next(_compile_id_counter) + # Set of globals installed via install_global* APIs + self.installed_globals: Set[str] = set() + + # TODO: maybe should just pass the entire f_code in here? Not + # sure... + self.co_fields = { + "co_name": f_code.co_name, + "co_filename": f_code.co_filename, + "co_firstlineno": f_code.co_firstlineno, + } + + # tracked_fakes says where any tensor that was wrapped to fake came + # from. It is similar to GraphArg, in that all GraphArgs will get + # will get added to TrackedFakes, but TrackedFakes also contains + # GraphArgs that got pruned, and things like Tensor attributes which + # aren't explicit graph inputs. 
Used by shape guard + self.tracked_fakes: List[TrackedFake] = [] + + # List of symbols for which we have exact bindings in the arguments + # already + self.bound_symbols: Set[sympy.Symbol] = set() + + shape_env = ShapeEnv( + # Reference Cycle! + # Share a reference to the list of TrackedFake. + # + # ShapeEnv needs this in order to be able to reproduce the call + # to produce_guards at an arbitrary time point. That is because + # TrackedFake instances may have its metadata changed throughout + # the program execution. + tracked_fakes=self.tracked_fakes, + allow_scalar_outputs=config.capture_scalar_outputs, + allow_dynamic_output_shape_ops=config.capture_dynamic_output_shape_ops, + co_fields=self.co_fields, + ) + + # In export mode, we force the shape_env to strictly disallow any constraining + # of the user marked dynamic dims + fake_mode = torch._subclasses.FakeTensorMode( + shape_env=shape_env, + # TODO (tmanlaibaatar) Remove this once we always lift params and buffers + allow_non_fake_inputs=True if self.export else False, + ) + self.tracing_context: TracingContext = TracingContext(fake_mode) + self.init_ambient_guards() + + # Map each tensor id to a list of sources. This is necessary because + # tensor ids cannot be recovered from tracked fakes (in general). + # We use this map to interpret (i.e., check for violations of) constraints, + # specifically equality constraints, which have shared tensor ids in them. + # This map should also be generally useful, e.g., for (de)serialization. + self.tracked_fakes_id_to_source: Dict[ + int, List[Source] + ] = collections.defaultdict(list) + # Stores the full fqn of a param or buffer to the relevant source. + self.param_name_to_source: Optional[Dict[str, Source]] = dict() + self.side_effects = SideEffects() + self.code_options = dict(code_options) + self.output_instructions: List[Instruction] = [] + # used to track nodes that are added between calls of copy_graphstate + # and restore_graphstate + self.timestamp = 0 + + # A list of register_finalizer_fns to apply to the output graph module + self.register_finalizer_fns: List[Callable[[fx.GraphModule], None]] = [] + + # Not checkpointed + self.compiler_fn: Optional[CompilerFn] = compiler_fn + self.global_scope = global_scope + self.local_scope = local_scope + self.root_tx = root_tx + from torch._dynamo.symbolic_convert import InstructionTranslatorBase + + # Given a source, what are the user stacks of all locations that + # accessed it? + # + # For efficiency, we only populate this: + # - During export, and + # - If the source could potentially lead to a spurious export input + # + # Feel free to populate this more frequently if other use-cases arise, + # but be aware that we have to generate full stacks for each + # recording! + self.source_to_user_stacks: Dict[Source, List[traceback.StackSummary]] = {} + + self._current_tx: List[InstructionTranslatorBase] = [] + self.cleanups: List[CleanupHook] = [] + self.should_exit = False + self.unspec_variable_map: Dict[str, UnspecializedPythonVariable] = {} + self.torch_function_enabled = torch._C._is_torch_function_enabled() + # Tracks if the output graph has a user defined allowed function in the + # graph. This is used later to determine if we should fallback to eager + # for certain exceptions. THe idea is that if the user has applied + # allow_in_graph, they would like to see the error instead of falling + # back for backend errors. 
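+        # Illustrative only (hypothetical user code, not part of this module):
+        #
+        #   torch._dynamo.allow_in_graph(my_custom_fn)
+        #
+        # Once a call to `my_custom_fn` is traced into this graph, this flag is
+        # expected to be flipped to True so that backend errors surface rather
+        # than silently falling back to eager.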
+ self.has_user_defined_allowed_in_graph = False + + # Tracks a list of called ops that were not tagged with "pt2_compliant_tag". + # This information is useful for logging. + self.non_compliant_ops: Set[torch._ops.OpOverload] = set({}) + + # Tracks a list of called custom ops that were tagged with "pt2_compliant_tag". + # This information is useful for logging. + self.compliant_custom_ops: Set[torch._ops.OpOverload] = set({}) + + # We save the global torch state here to be restored in case of graph + # breaks. The relevant issue is seen here + # https://github.com/pytorch/pytorch/pull/100570#issuecomment-1543427086 + # where inlining of a function changes the global state (because of the + # presence of torch.no_grad) and there is a graph break. + self.save_global_state() + + # Tracks the original FQNs of the constant tensors from the original graph, + # i.e. buffers and parameters. + self.dynamo_flat_name_to_original_fqn: Dict[str, str] = {} + + # All calls to random() are replaced with a single call to __gen_rand_values + # functions that returns a tuple of random values for each original call. + # random_calls tracks calls to random() and random_values_var stores the name of + # the variable that stores __gen_rand_values results. + self.random_calls: List[ + Tuple[Callable[..., object], Tuple[object, ...], Dict[str, object]] + ] = [] + self.random_values_var = None + + # Bytecode to insert right before we call the graph + self.pregraph_bytecode: List[Instruction] = [] + + # Use to pass values to backward hooks when using compiled autograd + self.backward_state: Dict[str, VariableTracker] = {} + self.backward_state_proxy: Optional[torch.fx.Proxy] = None + self.backward_state_var: Optional[str] = None + + def add_backward_state_hook(self, hook: VariableTracker): + name = f"hook{len(self.backward_state)}" + assert name not in self.backward_state + self.backward_state[name] = hook + return name, self.get_backward_state_proxy() + + def get_backward_state_proxy(self): + if self.backward_state_proxy is None: + if self.export: + unimplemented("backward_state does not support export") + self.backward_state_proxy = self.root_tracer.create_graph_input( + "dynamo_backward_state", BackwardState, source=BackwardStateSource() + ) + self.backward_state_proxy.node.meta["grapharg"] = BackwardStateGraphArg() + self.backward_state_proxy.node.meta["example_value"] = BackwardState() + self.backward_state_var = self.new_var() + return self.backward_state_proxy + + # This gets its own helper function so guards DEBUG logs are more informative + def init_ambient_guards(self): + # Register a SHAPE_ENV guard to make sure we setup shape guards + # that show up in ShapeEnv + self.guards.add(ShapeEnvSource().make_guard(GuardBuilder.SHAPE_ENV)) + + self.guards.add( + GlobalStateSource().make_guard(GuardBuilder.DETERMINISTIC_ALGORITHMS) + ) + + self.guards.add(GlobalStateSource().make_guard(GuardBuilder.GRAD_MODE)) + + self.guards.add(GlobalStateSource().make_guard(GuardBuilder.DEFAULT_DEVICE)) + + self.guards.add( + GlobalStateSource().make_guard(GuardBuilder.TORCH_FUNCTION_STATE) + ) + + self.guards.add(GlobalStateSource().make_guard(GuardBuilder.BACKEND_MATCH)) + + def add_cleanup_hook(self, fn: Callable[[], Any]): + self.cleanup_hooks.append(fn) + + def call_cleanup_hooks(self): + for hook in reversed(self.cleanup_hooks): + hook() + self.cleanup_hooks.clear() + + @property + def root_tracer(self): + return self.tracers[0] + + @property + def current_tracer(self): + return self.tracers[-1] + + def 
is_root_tracer(self): + # Helper to tell if we are inside the higher order operator tracing. + return len(self.tracers) == 1 + + @property + def graph(self): + return self.current_tracer.graph + + # TODO(rzou): can delete after we refactor speculate_subgraph to use nested GraphTracer. + @graph.setter + def graph(self, value): + self.current_tracer.graph = value + + @property + def input_name_to_proxy(self): + return self.current_tracer.input_name_to_proxy + + @property + def real_value_cache(self): + return self.current_tracer.real_value_cache + + # If you are here, and you're looking for create_graph_input, + # to avoid ambiguity, please call one of the following: + # - self.current_tracer.create_graph_input + # - self.root_tracer.create_graph_input + # See NOTE [HigherOrderOperator tracing design] for more context. + + def create_proxy(self, *args, **kwargs): + return self.current_tracer.create_proxy(*args, **kwargs) + + def create_node(self, *args, **kwargs): + return self.current_tracer.create_node(*args, **kwargs) + + def remove_node(self, *args, **kwargs): + return self.current_tracer.remove_node(*args, **kwargs) + + @contextlib.contextmanager + def subtracer(self, source_target, prior_tracer): + new_scope_ctx = enter_new_scope() + try: + if prior_tracer: + # Lineage MUST stay preserved + assert prior_tracer.parent is self.current_tracer + new_scope_ctx.__enter__() + tracer = ( + prior_tracer + if prior_tracer + else SubgraphTracer( + self, parent=self.current_tracer, source_target=source_target + ) + ) + self.tracers.append(tracer) + yield tracer + finally: + new_scope_ctx.__exit__(None, None, None) + self.tracers.pop() + + @property + def output(self): + return self + + @property + def fake_mode(self): + return self.tracing_context.fake_mode + + @property + def shape_env(self): + return self.tracing_context.fake_mode.shape_env + + @property + def guards(self) -> torch._guards.GuardsSet: + return self.tracing_context.guards_context.dynamo_guards + + @property + def nn_modules(self) -> Dict[str, Any]: + return self.tracing_context.module_context.nn_modules + + def save_global_state(self, out=None): + """ + Saves to out if it is provided. Else saves to the tracing context's global_state. + """ + global_state = ( + out if out is not None else self.tracing_context.global_context.global_state + ) + + # TODO - Consider having a torch level API for torch_function_state. As + # of now, we create a ref cycle by passing the + # output.set_torch_function_state to + # output.tracing_context.global_context.global_state. In the interim, + # the problem can be solved by manually set + # output.tracing_context.global_context.global_state to None at cleanup. 
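+        # Each entry below is a (setter_fn, current_value) pair, e.g.
+        #   global_state["grad_enabled"] = (torch.set_grad_enabled, torch.is_grad_enabled())
+        # so the saved state can later be reapplied by calling setter_fn(current_value).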
+ global_state["torch_function_enabled"] = ( + self.set_torch_function_state, + self.torch_function_enabled, + ) + global_state["grad_enabled"] = (torch.set_grad_enabled, torch.is_grad_enabled()) + global_state["autocast_enabled"] = ( + torch.set_autocast_enabled, + torch.is_autocast_enabled(), + ) + global_state["autocast_cpu_enabled"] = ( + torch.set_autocast_cpu_enabled, + torch.is_autocast_cpu_enabled(), + ) + global_state["autocast_gpu_dtype"] = ( + torch.set_autocast_gpu_dtype, + torch.get_autocast_gpu_dtype(), + ) + global_state["autocast_cpu_dtype"] = ( + torch.set_autocast_cpu_dtype, + torch.get_autocast_cpu_dtype(), + ) + global_state["autocast_cache_enabled"] = ( + torch.set_autocast_cache_enabled, + torch.is_autocast_cache_enabled(), + ) + + def push_tx(self, tx): + self._current_tx.append(tx) + + def pop_tx(self): + return self._current_tx.pop() + + @property + def current_tx(self): + return self.root_tx if not self._current_tx else self._current_tx[-1] + + def copy_graphstate(self) -> OutputGraphState: + """Create a checkpoint of the current state by copying everything""" + assert self.param_name_to_source is not None + guards_graph_state = self.tracing_context.guards_context.copy_graphstate() + module_state = self.tracing_context.module_context.copy_graphstate() + global_state = self.tracing_context.global_context.copy_graphstate() + state = OutputGraphState( + dict(self.input_source_to_var), + list(self.tracked_fakes), + guards_graph_state, + module_state, + list(self.register_finalizer_fns), + global_state, + dict(self.param_name_to_source), + self.side_effects.clone(), + self.timestamp, + set(self.non_compliant_ops), + set(self.compliant_custom_ops), + ) + self.timestamp += 1 + return state + + def restore_graphstate(self, state: OutputGraphState): + """Restore a checkpoint created by self.copy_graphstate()""" + ( + self.input_source_to_var, + self.tracked_fakes, + guards_state, + module_state, + self.register_finalizer_fns, + global_state, + self.param_name_to_source, + self.side_effects, + self.timestamp, + self.non_compliant_ops, + self.compliant_custom_ops, + ) = state + self.tracing_context.guards_context.restore_graphstate(guards_state) + self.tracing_context.module_context.restore_graphstate(module_state) + self.tracing_context.global_context.restore_graphstate(global_state) + + # FX deepcopy doesn't work for a partially created graph, so just remove new nodes + removed_nodes = 0 + for node in reversed(list(self.graph.nodes)): + if ( + node.meta["creation_timestamp"] > self.timestamp + # placeholders here may have been lazily added by existing objects + and node.op != "placeholder" + ): + # Erasing node alone does not remove the meta information + # So, remove the help tensor explicitly + if "example_value" in node.meta: + del node.meta["example_value"] + self.remove_node(node) + self.real_value_cache.pop(node, None) + removed_nodes += 1 + log.debug("restore_graphstate: removed %s nodes", removed_nodes) + + def add_symbol_bindings(self, arg: GraphArg): + # Insert implicit size vars as necessary. With dynamic shapes, we + # maintain the invariant that every sizevar gets a direct SymInt input + # into the graph. This means downstream graph transforms can assume + # every size variable is explicitly bound and accessible, instead of + # having to pull it out implicitly from tensors. 
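+        # Illustrative example (source names hypothetical): if a graph input was
+        # faked with size (s0, 3), bind_symint() below adds a SymInt placeholder
+        # for s0 sourced from TensorPropertySource(<input source>, TensorProperty.SIZE, 0),
+        # so later passes can use s0 directly instead of re-deriving it from .size(0).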
+ + if self.export: + return + + assert arg.fake_tensor is not None + + def bind_symint(s, prop): + if not (is_symbolic(s) and isinstance(s.node.expr, sympy.Symbol)): + return + s0 = s.node.expr + if s0 in self.bound_symbols: + return + self.bound_symbols.add(s0) + log.debug("bind_symint %s %s", s, prop.name()) + # TODO: don't readd symint if we already have it in graph + # (this is harmless because we do remove the unused ones later) + proxy = self.root_tracer.create_graph_input( + str(s0), + torch.SymInt, + before=True, + source=prop, + ) + proxy.node.meta["example_value"] = s + proxy.node.meta["grapharg"] = GraphArg( + prop, + s, + is_unspecialized=False, + fake_tensor=None, + is_tensor=False, + ) + + def handle_tensor(t, src): + for i, s in enumerate(t.size()): + bind_symint(s, TensorPropertySource(src, TensorProperty.SIZE, i)) + for i, s in enumerate(t.stride()): + bind_symint(s, TensorPropertySource(src, TensorProperty.STRIDE, i)) + bind_symint( + t.storage_offset(), + TensorPropertySource(src, TensorProperty.STORAGE_OFFSET), + ) + if is_traceable_wrapper_subclass(t): + attrs, ctx = t.__tensor_flatten__() + for attr in attrs: + inner_t = getattr(t, attr) + handle_tensor(inner_t, AttrSource(src, attr)) + + handle_tensor(arg.fake_tensor, arg.source) + + def count_calls(self): + return count_calls(self.graph) + + def is_empty_graph(self): + return len(list(self.graph.nodes)) == 0 + + def get_submodule(self, keys): + assert keys + obj: Union[torch.nn.Module, Dict[str, torch.nn.Module]] = self.nn_modules + for k in keys.split("."): + if isinstance(obj, dict): + obj = obj[k] + else: + obj = getattr(obj, k) + return obj + + def new_var(self, name="tmp"): + existing = set(self.code_options["co_varnames"]) + for i in itertools.count(): + var = f"{name}_{i}" + if var not in existing: + self.code_options["co_varnames"] += (var,) + return var + + def update_co_names(self, name): + """Ensure self.code_options.co_names contains name""" + if name not in self.code_options["co_names"]: + self.code_options["co_names"] += (name,) + + @staticmethod + def module_key_name(*names): + # create a new unique name + name = "_".join(map(str, names)) + # Strip the guard lookup L/G access + name = re.sub(r"^[GL]\['?(.*?)'?\]$", r"\1", name) + # e.g. replace abc.xyz[123].qkv with abc.xyz_123.qkv + name = re.sub(r"\[(\d+)\]", r"_\g<1>", name) + # e.g. replace abc.xyz_123.qkv with abc_xyz_123_qkv + name = re.sub(r"[^a-zA-Z0-9]", "_", name) + + if not name or not name[0].isalpha(): + name = "sub" + name + + return name + + def register_attr_or_module( + self, + target: Union[torch.nn.Module, torch.Tensor, Any], + *names, + **options, + ): + if is_dynamic_nn_module(target): + return variables.UnspecializedNNModuleVariable(target, **options) + + options = dict(options) + assert "source" in options + source = options["source"] + assert not isinstance(source, ParamBufferSource) + + if isinstance(target, torch.Tensor): + tracer = self.current_tracer + if not self.is_root_tracer(): + # For higher order ops, we don't want to insert the get_attr in + # innermost graph. Instead, we want to raise the params/buffers + # as inputs to the higher-order graph, and register them as + # get_attrs in the root tracer. + + # Note that Dynamo will still call lift_tracked_freevar_to_input + # when these inputs are encountered for the inner graph. The + # only difference is what happens at the root tracer for + # nn.Parameters vs free inputs. 
The free inputs are registered + # as placeholders in the root graph, whereas the nn.Parameters + # are registered as get_attr nodes in the root graph. + tracer = self.root_tracer + + if not is_constant_source(source): + install_guard(source.make_guard(GuardBuilder.TENSOR_MATCH)) + + if get_static_address_type(target) == "guarded": + install_guard(source.make_guard(GuardBuilder.DATA_PTR_MATCH)) + + def wrap_name(module_key): + assert self.param_name_to_source is not None + self.param_name_to_source[module_key] = source + + return wrap_fx_proxy( + self.root_tx, + tracer.create_proxy("get_attr", module_key, tuple(), {}), + example_value=target, + **options, + ) + + elif isinstance(target, torch.nn.Module): + assert isinstance(target, torch.nn.Module) + + install_guard(source.make_guard(GuardBuilder.NN_MODULE)) + + def wrap_name(module_key): + return NNModuleVariable(type(target), module_key, target, **options) + + elif isinstance(target, (torch.SymInt, torch.SymFloat)): + # HACKY CODE REGION BEGIN + # WE ARE PIGGYBACKING ON EXISTING INFRA TO REGISTER ATTRS + # This ultimately gets written to self.nn_modules, which is unfortunate + # Attrs that are tenors and symints and such need to be migrated to have their + # own storage + # alas, this is like this for now + + def wrap_name(module_key): + return SymNodeVariable.create( + self, + self.create_proxy("get_attr", module_key, tuple(), {}), + sym_num=target, + **options, + ) + + # HACKY CODE REGION END + else: + + def wrap_name(module_key): + self.output.update_co_names(module_key) + self.global_scope[module_key] = target + return VariableBuilder(self, ConstantSource(source_name=module_key))( + target + ) + + for k, v in self.nn_modules.items(): + if v is target: + # it already exists + return wrap_name(k) + + name = OutputGraph.module_key_name(*names) + + base = name + for i in itertools.count(): + if name not in self.nn_modules: + self.nn_modules[name] = target + if isinstance(target, torch.nn.Module): + + def register_leaf_name(leaf_name): + assert self.param_name_to_source is not None + new_source = ParamBufferSource(source, leaf_name) + new_name = f"{name}.{leaf_name}" + self.param_name_to_source[new_name] = new_source + if isinstance(source, LocalSource): + self.dynamo_flat_name_to_original_fqn[ + OutputGraph.module_key_name(new_source.name()) + ] = leaf_name + + # annoying, but there are cases when we do not have parameters + # see test_nn_moduledict_contains + if hasattr(target, "_parameters"): + for leaf_name, _ in target.named_parameters(): + register_leaf_name(leaf_name) + if hasattr(target, "_buffers"): + for leaf_name, _ in target.named_buffers(): + register_leaf_name(leaf_name) + + return wrap_name(name) + name = f"{base}_{i}" + + raise AssertionError("unreachable") + + def compile_subgraph( + self, tx, partial_convert=False, reason: Optional[GraphCompileReason] = None + ): + """ + Generate a subgraph to continue execution on user code. + Automatically restore live variables. 
+ """ + assert reason is not None + + from .decorators import disable + + self.partial_convert = partial_convert + self.compile_subgraph_reason = reason + self.should_exit = True + + log.debug("COMPILING GRAPH due to %s", reason) + + if not all(block.can_restore() for block in tx.block_stack): + unimplemented("compile_subgraph with block_depth != 0") + + prefix_insts: List[Instruction] = [] + if sys.version_info >= (3, 11): + # prefix instructions (Python 3.11+) + for inst in tx.prefix_insts: + if inst.opname == "MAKE_CELL": + prefix_insts.append( + create_instruction("MAKE_CELL", argval=inst.argval) + ) + elif inst.opname == "COPY_FREE_VARS": + prefix_insts.append( + create_instruction( + "COPY_FREE_VARS", arg=len(tx.code_options["co_freevars"]) + ) + ) + else: + prefix_insts.append(copy.copy(inst)) + assert not ( + self.pregraph_bytecode and self.export + ), "export does not support pregraph_bytecode" + prefix_insts.extend(self.pregraph_bytecode) + + def append_prefix_insts(): + self.add_output_instructions(prefix_insts) + prefix_insts.clear() + + for block in reversed(tx.block_stack): + block.exit(tx) + + self.cleanup_graph() + tx.prune_dead_locals() + stack_values = list(tx.stack) + # Use nn.Module "proxies" in the constructed GraphModule so that + # the resulting GM does not hold additional strong references to the original modules. + # This prevents a strong ref cycle where Dynamo created code holds on to references + # to modules that also have Dynamo code cache invalidation checks. + # When cache invalidation runs, the generated GM will be invalidated, which also deletes + # the proxies. + nn_modules_proxies = { + name: nn_module_proxy(mod) for name, mod in self.nn_modules.items() + } + root = FakeRootModule(nn_modules_proxies) + # Add all the local vars to the "stack" so restore at the end + restore_vars = [] + val_to_names: Dict[VariableTracker, List[str]] = {} + if stack_values: + val_to_names[stack_values[-1]] = list() + # NB: Typically (i.e., for graph compile from RETURN_VALUE), + # symbolic_locals will be empty at this point, as prune_dead_locals + # will clear out all of symbolic_locals because RETURN_VALUE is the + # last instruction and no more locals are used. The fanciness here + # is only needed for partial graphs. + for k, v in tx.symbolic_locals.items(): + # Note! this explicitly uses .local_name for matching + # Failure to do so will cause spurious registrations in val_to_names. + # This will in turn result in spurious variables showing up in the graph. + # This was very tricky to debug. 
For an example, dump the graph at call_user_compiler + # while running test_subgraphs.py + if isinstance(v.source, LocalSource) and v.source.local_name == k: + continue # no need to restore initial state + if v not in val_to_names: + val_to_names[v] = list() + val_to_names[v].append(k) + for v in val_to_names.keys(): + restore_vars.extend(val_to_names[v]) + stack_values.extend([v] * len(val_to_names[v])) + + # to handle random calls + if len(self.random_calls) > 0: + append_prefix_insts() + random_calls_instructions = [] + self.random_values_var = self.new_var("random_values") + rand_fn = disable(_get_gen_rand_values_fn(self.random_calls)) + rand_fn_name = self.install_global("__gen_rand_values", rand_fn) + codegen = PyCodegen(tx, root) + random_calls_instructions.extend( + codegen.load_function_name(rand_fn_name, True) + ) + random_calls_instructions.extend(create_call_function(0, False)) + random_calls_instructions.append( + codegen.create_store(tx.output.random_values_var), + ) + self.add_output_instructions(random_calls_instructions) + + if ( + stack_values + and all( + not isinstance( + v, + ( + UnspecializedPythonVariable, + NumpyNdarrayVariable, + TensorWithTFOverrideVariable, + ), + ) + for v in stack_values + ) + and all(isinstance(x, TensorVariable) for x in stack_values) + and len(set(stack_values)) == len(stack_values) + and self.side_effects.is_empty() + and not len(tx.debug_locals) != 0 + and not self.backward_state + ): + append_prefix_insts() + # optimization to generate better code in a common case + self.add_output_instructions( + self.compile_and_call_fx_graph(tx, list(reversed(stack_values)), root) + + [create_instruction("UNPACK_SEQUENCE", arg=len(stack_values))] + ) + else: + graph_output_var = self.new_var("graph_out") + pass1 = PyCodegen(tx, root, graph_output_var) + self.codegen_suffix(tx, stack_values, pass1) + + # one more time now that we have established tempvars + pass2 = PyCodegen( + tx, + root, + graph_output_var, + tempvars={val: None for val, count in pass1.uses.items() if count > 1}, + ) + self.codegen_suffix(tx, stack_values, pass2) + + output = [] + if count_calls(self.graph) != 0 or len(pass2.graph_outputs) != 0: + output.extend( + self.compile_and_call_fx_graph(tx, pass2.graph_output_vars(), root) + ) + + if len(pass2.graph_outputs) != 0: + output.append(pass2.create_store(graph_output_var)) + else: + output.append(create_instruction("POP_TOP")) + append_prefix_insts() + self.add_output_instructions(output + pass2.get_instructions()) + + # restore all the live local vars + self.add_output_instructions( + [PyCodegen(tx).create_store(var) for var in reversed(restore_vars)] + ) + + def codegen_suffix(self, tx, stack_values, cg): + if self.backward_state: + assert not self.export + for name, val in self.backward_state.items(): + cg(val) + cg.append_output(cg.create_load(self.backward_state_var)) + cg.store_attr(name) + self.side_effects.codegen_hooks(cg) + self.side_effects.codegen_save_tempvars(cg) + + # Return variables used for logging at the end + for debug_var, args in tx.debug_locals: + cg(debug_var) + for arg in args: + cg(arg) + cg.extend_output(create_call_function(len(args), True)) + + cg.restore_stack(stack_values, value_from_source=not tx.export) + self.side_effects.codegen_update_mutated(cg) + + def cleanup_graph(self): + """ + Remove "creation_timestamp" from node meta + + Remove this pattern from the graph: + torch._C._set_grad_enabled(False) + torch._C._set_grad_enabled(True) + """ + assert self.should_exit + nodes = list(self.graph.nodes) 
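+        # Two passes follow: (1) strip the bookkeeping "creation_timestamp"
+        # meta entry from every node, and (2) erase adjacent
+        # _set_grad_enabled(False) / _set_grad_enabled(True) pairs that toggle
+        # grad mode and immediately restore it without any node in between.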
+ for node in nodes: + node.meta.pop("creation_timestamp", None) + + grad_enabled = torch.is_grad_enabled() + for node1, node2 in zip(nodes, nodes[1:]): + if ( + node1.target is torch._C._set_grad_enabled + and tuple(node1.args) == (not grad_enabled,) + and not node1._erased + ): + grad_enabled = node1.args[0] + if ( + node2.target is torch._C._set_grad_enabled + and tuple(node2.args) == (not grad_enabled,) + and not node2._erased + ): + grad_enabled = node2.args[0] + self.graph.erase_node(node1) + self.graph.erase_node(node2) + + def get_graph_sizes_structured(self): + ret = {} + for node in self.graph.nodes: + example_value = node.meta.get("example_value", None) + if isinstance(example_value, torch._subclasses.FakeTensor): + size = example_value.size() + ret[node.name] = [s if isinstance(s, int) else repr(s) for s in size] + return ret + + def get_graph_sizes(self, name: str): + graph_sizes_str = "TRACED GRAPH TENSOR SIZES\n" + graph_sizes_str += f"===== {name} =====\n" + for node in self.graph.nodes: + example_value = node.meta.get("example_value", None) + if isinstance(example_value, torch._subclasses.FakeTensor): + size = example_value.size() + graph_sizes_str += f"{node.name}: {tuple(size)}\n" + concrete_size = [] + has_symint = False + for sz in size: + if isinstance(sz, int): + concrete_size.append(sz) + elif isinstance(sz, torch.SymInt): + has_symint = True + concrete_size.append(sz.node.hint) + else: + break + else: + if has_symint: + graph_sizes_str += ( + f"{node.name} (concrete): {tuple(concrete_size)}\n" + ) + return graph_sizes_str + + @contextlib.contextmanager + def restore_global_state(self): + """ + Momentarily restores the global state to what it was prior to tracing the current output + """ + prior_global_state = self.tracing_context.global_context.copy_graphstate() + current_global_state: Dict[str, Tuple[Any, bool]] = {} + self.save_global_state(out=current_global_state) + try: + # Set to state prior to tracing the graph + self.tracing_context.global_context.restore_graphstate(prior_global_state) + yield + finally: + # Reset to state at the current time (e.g. before calling the user compiler) + self.tracing_context.global_context.restore_graphstate( + GlobalContextCheckpointState(current_global_state) + ) + + @torch._guards.TracingContext.clear_frame() + def compile_and_call_fx_graph(self, tx, rv, root): + """ + Generate code from self.graph and return the Instruction()s to + call that generated code. 
+ """ + from .decorators import disable + + assert self.should_exit + + name = unique_id("__compiled_fn") + + assert isinstance(rv, list) + assert isinstance(root, FakeRootModule) + self.create_node( + "output", + "output", + (self.current_tracer.create_arg(tuple(x.as_proxy() for x in rv)),), + {}, + ) + self.insert_deferred_runtime_asserts(root, name) + # NB: deferred runtime asserts can keep graphargs live, so make sure + # those are inserted before pruning + self.remove_unused_graphargs() + ncalls = count_calls(self.graph) + counters["stats"]["calls_captured"] += ncalls + + # free a bit of memory + self.real_value_cache.clear() + + gm = _make_graph_module(root, self.graph) + for register_finalizer in self.register_finalizer_fns: + register_finalizer(gm) + + gm.compile_subgraph_reason = self.compile_subgraph_reason + gm.meta[ + "dynamo_flat_name_to_original_fqn" + ] = self.dynamo_flat_name_to_original_fqn.copy() + + graph_code_log.debug("%s", lazy_format_graph_code(name, gm)) + torch._logging.trace_structured( + "dynamo_output_graph", + lambda: {"sizes": self.get_graph_sizes_structured()}, + payload_fn=lambda: gm.print_readable(print_output=False), + ) + graph_tabular_log.debug("%s", lazy_format_graph_tabular(name, gm)) + graph_sizes_log.debug("%s", LazyString(lambda: self.get_graph_sizes(name))) + self.call_cleanup_hooks() + old_fake_mode = self.tracing_context.fake_mode + if not self.export: + # TODO(voz): The way export uses gm, and fake tensors, is not supported with us resetting + backend_fake_mode = torch._subclasses.FakeTensorMode( + shape_env=old_fake_mode.shape_env, + ) + # TODO(voz): Ostensibily, this should be scoped and + # restore back to old_fake_mode, but doing so currently violates + # a lot of fake_tensor ownership assumptions and runs afoul of detect_fake_mode + self.tracing_context.fake_mode = backend_fake_mode + + with self.restore_global_state(): + compiled_fn = self.call_user_compiler(gm) + compiled_fn = disable(compiled_fn) + + counters["stats"]["unique_graphs"] += 1 + # This is safe because we pre-process name to be unique + self.install_global_unsafe(name, compiled_fn) + + cg = PyCodegen(tx) + cg.make_call_generated_code(name) + return cg.get_instructions() + + @property + def placeholders(self) -> List[fx.Node]: + r = [] + for node in self.graph.nodes: + if node.op == "placeholder": + r.append(node) + continue + break + return r + + @property + def graphargs(self) -> List[GraphArg]: + return [node.meta["grapharg"] for node in self.placeholders] + + @dynamo_timed(phase_name="backend_compile") + def call_user_compiler(self, gm: fx.GraphModule) -> CompiledFn: + assert self.compiler_fn is not None + tot = 0 + placeholders = [] + for node in gm.graph.nodes: + if node.op in ("call_function", "call_method", "call_module"): + tot += 1 + if node.op == "placeholder": + placeholders.append(node) + increment_op_count(tot) + for pl in placeholders: + arg = pl.meta["grapharg"] + # TODO: Why isn't this stored in meta :think: + pl._dynamo_source = arg.source + + gm._param_name_to_source = self.param_name_to_source # type: ignore[assignment] + gm._source_to_user_stacks = self.source_to_user_stacks # type: ignore[assignment] + + try: + name = ( + self.compiler_fn.__name__ + if hasattr(self.compiler_fn, "__name__") + else "" + ) + _step_logger()(logging.INFO, f"calling compiler function {name}") + compiler_fn = self.compiler_fn + if config.verify_correctness: + compiler_fn = WrapperBackend(compiler_fn) + compiled_fn = compiler_fn(gm, self.example_inputs()) + 
_step_logger()(logging.INFO, f"done compiler function {name}") + assert callable(compiled_fn), "compiler_fn did not return callable" + except exceptions_allowed_to_be_fallback as e: + if self.has_user_defined_allowed_in_graph: + raise BackendCompilerFailed(self.compiler_fn, e).with_traceback( + e.__traceback__ + ) from None + msg = ( + "Backend compiler failed with a fake tensor exception at \n" + f"{self.root_tx.format_frame_summary()}" + "Adding a graph break." + ) + unimplemented_with_warning(e, self.root_tx.f_code, msg) + except SkipFrame as e: + # The backend compiler has requested that we skip the frame, instead of + # aborting execution. + raise e + except Exception as e: + raise BackendCompilerFailed(self.compiler_fn, e).with_traceback( + e.__traceback__ + ) from None + + signpost_event( + "dynamo", + "OutputGraph.call_user_compiler", + { + **self.co_fields, + "op_count": tot, + "node_count": len(gm.graph.nodes), + "input_count": len(placeholders), + }, + ) + + return compiled_fn + + def example_inputs(self) -> List[torch.Tensor]: + result = [] + for arg in self.graphargs: + result.append(arg.example) + return result + + def remove_unused_graphargs(self) -> None: + assert self.should_exit + # Miniature DCE pass, but only for obviously trivial operations + for node in reversed(list(self.graph.nodes)): + if len(list(node.users)) == 0: + if node.op == "get_attr": + self.remove_node(node) + elif node.op == "call_function" and node.target is operator.getitem: + self.remove_node(node) + + def placeholder_binds_symbol(node): + arg = node.meta["grapharg"] + example = arg.example + if isinstance(example, torch.SymInt) and isinstance( + example.node.expr, sympy.Symbol + ): + return example.node.expr + return None + + def remove_unused(node): + log.debug("REMOVE UNUSED GRAPHARG %s", node.meta["grapharg"].source.name()) + # I'm not really sure why you need to delete these from the + # node since the node is going to get removed + del node.meta["grapharg"] + self.remove_node(node) + self.real_value_cache.pop(node, None) + + used_symbols = set() + recheck_placeholders = [] + for node in self.placeholders: + binds_symbol = placeholder_binds_symbol(node) is not None + # Don't delete symbol bindings yet + if binds_symbol: + if not node.users: + recheck_placeholders.append(node) + else: + if not node.users and not isinstance( + node.meta["grapharg"], BackwardStateGraphArg + ): + remove_unused(node) + else: + # Register the free symbols as uses + arg = node.meta["grapharg"] + if isinstance(arg, BackwardStateGraphArg): + continue + fake = ( + arg.fake_tensor if arg.fake_tensor is not None else arg.example + ) + used_symbols |= free_symbols(fake) + + # After removing unused graphargs, prune unused binds_symbol + for node in recheck_placeholders: + symbol = placeholder_binds_symbol(node) + if symbol is not None: + if symbol not in used_symbols: + remove_unused(node) + else: + # Make sure we delete later occurrences of the same symbol + used_symbols.remove(symbol) + + # TODO: this is a generic pass that should live outside of Dynamo + def insert_deferred_runtime_asserts(self, root, name) -> None: + """ + During tracing, we may have discovered that some data-dependent values + had runtime assert on them; e.g., torch.empty(x.item()) induces a runtime + that x.item() >= 0. This asserts can happen unpredictably during fake + tensor propagation, so we cannot conveniently insert them into the FX graph + when they occur. 
Instead, we accumulate them in the ShapeEnv, and in this + pass insert them into the graph as proper tests. + """ + # TODO: Request simplification on runtime asserts before emitting them + ras_by_symbol = self.shape_env.deferred_runtime_asserts.copy() + + if not any(ras for ras in ras_by_symbol.values()): + return + + gm = fx.GraphModule(root, self.graph) + graph_code_log.debug( + "%s", + lazy_format_graph_code(f"pre insert_deferred_runtime_asserts {name}", gm), + ) + + # We are going to mutate the dict + symbol_to_proxy = {} + placeholders = set() + last_placeholder = None + for node in self.graph.nodes: + if node.op != "placeholder": + last_placeholder = node + break + placeholders.add(node) + assert last_placeholder is not None + + # Identify what symbols we need to reify. This isn't strictly needed + # but helps reduce churn on the graph + needed_symbols: Set[sympy.Symbol] = set() + for ras in ras_by_symbol.values(): + for ra in ras: + needed_symbols.update(free_symbols(ra.expr)) + + log.debug("needed_symbols = %s", needed_symbols) + + for node in self.graph.nodes: + # Placeholders can match symbols, but when we destructure them + # with size we have to make sure we insert the nodes after all + # the placeholders + with self.graph.inserting_before( + node.next if node not in placeholders else last_placeholder.next + ): + if "example_value" not in node.meta: + continue + + defs = [] + + # For every new unbacked symbol, we need an fx.Node representing + # precisely this value. There are a few places where the unbacked + # symbol could have come from, and we will check them to setup + # these nodes. + # + # For a case like item(), this is trivial (no new node is added.) + # + # For nonzero(), we need to add something like i0 = out.size(0) + # + # We could end up with duplicate nodes this way but it is not a + # big deal. + # + # We also do this to setup backed SymInts, but those are all going + # to be matched from placeholders + def match_symbol(symint, cb): + if ( + isinstance(symint, torch.SymInt) + and isinstance(symint.node, SymNode) + and isinstance(s := symint.node.expr, sympy.Symbol) + and s not in symbol_to_proxy + and s in needed_symbols + ): + symbol_to_proxy[s] = fx.Proxy(cb()) + log.debug("symbol_to_proxy[%s] = %s", s, symbol_to_proxy[s]) + defs.append(s) + + match_symbol(node.meta["example_value"], lambda: node) + if isinstance(t := node.meta["example_value"], torch.Tensor): + for i, s in enumerate(t.size()): + match_symbol( + s, lambda: self.graph.call_method("size", (node, i)) + ) + for i, s in enumerate(t.stride()): + match_symbol( + s, lambda: self.graph.call_method("stride", (node, i)) + ) + match_symbol( + t.storage_offset(), + lambda: self.graph.call_method("storage_offset", (node,)), + ) + + for i0 in defs: + ras = ras_by_symbol.pop(i0, []) + # Before we perform any asserts, first apply range + # refinement. This is important, because if we are going + # to retrace the graph (and we typically are if we send + # the graph to AOTAutograd), we need to make sure we apply + # range refinement (ala _check_is_size) first, BEFORE we + # run any of the asserts. Otherwise, we may decide to + # perform substitutions based on the asserts which we then + # can't back out, because value ranges can only be applied + # to asserts.) + # + # A perhaps better long term plan is to avoid this order + # dependence by making it possible to refine ranges on + # arbitrary expressions, not just symbols. 
But it is not + # so easy to make use of this information, see + # https://twitter.com/ezyang/status/1745801370299482492 + # We actually made an attempt at this in + # https://github.com/pytorch/pytorch/pull/119043 + # which didn't work. + # + # Another ideas for how to do this: + # - Have bound_sympy be the source of truth of the ranges of any expression + # - Cache intermediate results for every subexpression of bound_sympy + # - This cache should be possible to edit to refine ranges + # + # One issue with this proposal is that if + # we have a bound on 2x, we are not going to be able to + # apply it for 4x. Similarly, we may have bounds for an + # equivalent expression that we are not applying because + # it's not a perfect match (e.g. x < y vs y > x)". + # + # The first issue we already have it and it's impossible + # to solve in general, so any implementation on a best + # effort basis should do. + # + # The second issue is a preexisting one. It can be mitigated + # with a normalisation algorithm. In general, it may also + # be on a best effort basis, but since our grammar is not + # terribly difficult, chances are we could even fully + # normalise SymPy expressions... who knows. + + if i0 in self.shape_env.size_like: + self.graph.call_function( + torch._check_is_size, (symbol_to_proxy[i0].node,) + ) + + vr = self.shape_env.var_to_range[i0] + if not self.shape_env._default_unspecified_value_range().issubset( + vr + ): + # The runtime range is constrained, so add a runtime + # assert and also explicitly refine the range + # (refinement should not be necessary once runtime + # asserts cause refinement, but that's NYI) + def convert(s): + try: + return int(s) + except TypeError: + return None + + self.graph.call_function( + torch._constrain_as_value, + ( + symbol_to_proxy[i0].node, + convert(vr.lower), + convert(vr.upper), + ), + ) + + for ra in ras: + log.debug("inserting runtime assert %s", ra.expr) + # Need to process ALL free symbols, not just unbacked ones + fvs = free_symbols(ra.expr) + missing = fvs - symbol_to_proxy.keys() + if missing: + i1 = sorted(missing)[0] + # TODO: Remove relaxing assert on unbacked_symint https://github.com/pytorch/pytorch/issues/119689 + # assert self.shape_env.is_unbacked_symint(i1), i1 + ras_by_symbol.setdefault(i1, []).append(ra) + else: + # Convert the sympy expression into a sequence of FX + # nodes + res = sympy_interp( + PythonReferenceAnalysis, symbol_to_proxy, ra.expr + ).node + self.graph.call_function( + torch.ops.aten._assert_scalar.default, + # TODO: use ra.msg here, but it's pretty + # useless right now + ( + res, + f"Deferred runtime assertion failed {ra.expr}", + ), + ) + + def add_output_instructions(self, prefix: List[Instruction]) -> None: + """ + We call this on the creation of a new compiled subgraph that is inserted + before user code. + """ + self.output_instructions.extend(prefix) + self.should_exit = True + + def install_global_unsafe(self, name, value) -> None: + """ + WARNING: prefer the safer `install_global_by_id/install_global`. + torch.compile instances should be independent of each other; + one footgun is to have one instance depend on the existence of + a global installed by another instance. This can happen if we mangle + a global the same way across both instances. 
+ """ + assert name not in self.installed_globals + self.installed_globals.add(name) + self.cleanups.append(CleanupHook.create(self.global_scope, name, value)) + + def install_global_by_id(self, prefix, value) -> str: + """ + Installs a global if it hasn't been installed already. + This is determined by (prefix, id(value)) pair. + + Returns the name of the newly installed global. + """ + # NB: need self.compile_id to distinguish this global + # from another global created in a different torch.compile instance + name = f"{prefix}_{id(value)}_c{self.compile_id}" + if name in self.installed_globals: + return name + self.install_global_unsafe(name, value) + return name + + def install_global(self, prefix, value) -> str: + """ + Installs a global, generating a unique name for it. + + Returns the name of the newly installed global. + """ + # NB: unique_id is unique, even across torch.compile instances + name = unique_id(prefix) + self.install_global_unsafe(name, value) + return name + + def cleanup(self) -> None: + # There is a reference cycle between tracer and OutputGraph, causing + # some of the tensor objects to be held alive for longer than necessary. + self.root_tx = None + self.nn_modules.clear() + self.param_name_to_source = None + + for node in self.graph.nodes: + if "grapharg" in node.meta: + del node.meta["grapharg"] + self.real_value_cache.clear() + self.input_name_to_proxy.clear() + self.side_effects.clear() + self.register_finalizer_fns.clear() + self.dynamo_flat_name_to_original_fqn.clear() + self.tracing_context.clear() + + def set_torch_function_state(self, enabled: bool) -> None: + self.torch_function_enabled = enabled + + def add_graph_finalizer( + self, register_finalizer: Callable[[fx.GraphModule], None] + ) -> None: + self.register_finalizer_fns.append(register_finalizer) + + def example_value_from_input_node(self, node: torch.fx.Node): + """Extract the non-fake example tensor""" + if node.op == "placeholder": + return node.meta["grapharg"].example + assert node.op == "get_attr" + return self.nn_modules[node.target] # type: ignore[index] + + +err_epilogue = ( + "With the current config, we will graph break " + "(and fall back to eager-mode PyTorch) on all ops " + "that have do not have the 'pt2_compliant_tag'. " + "Please see the following doc for how to mark this op as PT2 compliant " + "https://docs.google.com/document/d/1W--T6wz8IY8fOI0Vm8BF44PdBgs283QvpelJZWieQWQ" +) + + +def check_pt2_compliant_op(output_graph, kind, target, args, kwargs): + if kind != "call_function": + return + + def encountered_compliant_op(target): + if target.namespace in {"prim", "prims", "aten"}: + return + output_graph.compliant_custom_ops.add(target) + + def encountered_non_compliant_op(target, msg): + output_graph.non_compliant_ops.add(target) + if config.only_allow_pt2_compliant_ops: + unimplemented(msg + " " + err_epilogue) + + if isinstance(target, torch._ops.OpOverload): + if torch.Tag.pt2_compliant_tag in target.tags: + encountered_compliant_op(target) + return + encountered_non_compliant_op( + target, + f"Encountered the torch.ops.OpOverload {target} " + f"that is not PT2 compliant.", + ) + return + + if isinstance(target, torch._ops.OpOverloadPacket): + overloads = tuple(target.overloads()) + # Optimization: Overload resolution is expensive. + # If there's only one overload, we know what it will resolve to. 
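+        # e.g. a packet with exactly one registered overload (say torch.ops.mylib.foo
+        # exposing only "default"; the name is hypothetical) resolves directly via
+        # getattr below, skipping the _jit_resolve_packet call further down.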
+ if len(overloads) == 1: + op = getattr(target, overloads[0]) + if torch.Tag.pt2_compliant_tag in op.tags: + encountered_compliant_op(op) + return + encountered_non_compliant_op( + op, + f"Encountered the non-overloaded " + f"torch.ops.OpOverloadPacket {target} " + f"that is not PT2 compliant. ", + ) + return + + args, kwargs = torch._dynamo.utils.get_fake_values_from_nodes( + output_graph.current_tx, (args, kwargs), False + ) + try: + overload = torch._C._jit_resolve_packet( + target._qualified_op_name, *args, **kwargs + ) + except RuntimeError as e: + unimplemented(str(e)) + + op = getattr(target, overload) + if torch.Tag.pt2_compliant_tag in op.tags: + encountered_compliant_op(op) + else: + encountered_non_compliant_op( + op, + f"Encountered the torch.ops.OpOverloadPacket {target} " + f"which resolves to the overload ({overload}) that is " + f"not PT2 compliant.", + ) + + +_compile_id_counter = itertools.count() + + +class SubgraphTracer(fx.Tracer): + """ + Holds an FX graph that is being traced. OutputGraph owns a SubgraphTracer + and the separation of responsibilities is that SubgraphTracer is + responsible for building the graph while OutputGraph is responsible for + compiling and executing the graph. + """ + + def __init__( + self, output_graph, parent=None, export_root=False, source_target=None + ): + super().__init__() + self.output_graph = weakref.proxy(output_graph) + self.graph = torch.fx.Graph() + + # The export is only ever set for the ROOT tracer. It controls + # whether or not certain inputs are allowed to be added or not. + # Look at call sites of create_graph_input to see how it is used. + if export_root: + assert parent is None + self.export_root = export_root + # Map from graph input name to its placeholder proxy object, where the + # map's keys give all current placeholder node names and can be used to + # create unique node names + self.input_name_to_proxy: Dict[str, fx.Proxy] = {} + # Node => computed real value (see utils.get_real_value) + self.real_value_cache: Dict[fx.Node, torch.Tensor] = {} + + # SubgraphTracers can be nested. See NOTE [HigherOrderOperator tracing design] + self.parent = parent + # A dict mapping previously free variables (Proxy objects) + # to new Proxy objects that wrap inputs to this subgraph. + # + # This dict serves two purposes: + # - Proxies are associated with VariableTrackers. If we see + # the same VariableTracker twice (and it is a free variable), + # then we want to use the same Proxy in the current subgraph to + # record the tracing. + # - If we are tracing a HigherOrderOperator's body_fn, then we + # need to keep track of what free variables were lifted so we can + # rewrite the HigherOrderOperator call using the traced body_fn. + # Dicts maintain the order of args for the HigherOrderOperator call. + self.lifted_freevars = {} + self.prev_inst = None + + self._cur_code = None + self._orig_gm_meta = None + self._orig_gm_lineno_map = None + self._orig_gm_firstlineno = None + # Each SubgraphTracer is associated with a source target, which indicates + # which operator this subgraph is attached to. We compute a source_fn_stack + # based on the source target. For the root tracer, it's set to []. + # This is useful for debugging and transforming the exported graph. 
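+        # Illustrative shape of the stack: the root tracer keeps [], while a
+        # tracer nested under a higher-order op accumulates entries of the form
+        #   parent.source_fn_stack + [("<stringified target>", source_target)]
+        # as set up just below.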
+ if self.parent is None: + self.source_fn_stack = [] + else: + self.source_fn_stack = self.parent.source_fn_stack + [ + (self.graph._target_to_str(source_target), source_target) + ] + + def create_proxy( + self, + kind, + target, + args, + kwargs, + name=None, + type_expr=None, + proxy_factory_fn=None, + ): + # NOTE: [Nested SubgraphTracer and free_variable handling] + # -------------------------------------------------------- + # Read NOTE [HigherOrderOperator tracing design] first. + # + # Let's say we're in the middle of introspecting the body of a possibly + # nested HigherOrderOperator, and we see a free variable. + # + # There are two cases: + # 1. We see a free variable that is already tracked by Dynamo. + # 2. We see a free variable that has not been tracked by Dynamo + # + # In case 1, we call `maybe_lift_tracked_freevar_to_input` (below) + # which will lift the freevar to be an input of this subgraph + # and also recursively lift it to be an input on the parent(s). + # + # In case 2, before the call to `create_proxy`, the InstructionTranslator + # will see the freevar when it gets loaded by Python bytecode. + # E.g. for Python 3.11 the bytecodes that may do this are LOAD_DEREF or + # LOAD_GLOBAL. + # There, the InstructionTranslator asks Dynamo to begin tracking the + # freevar by building a new Variable. + # Building a new Variable automatically lifts the freevar to be an + # input of the root SubgraphTracer. + # + # The implications for the code below are: + # - We will always be in Case 1 when we get to this code. + # - Any "free variable" we encounter here is guaranteed to already be + # bound, that is, it is either a graph input of the root graph, or + # some local variable of the root graph or a subgraph. + # - The additional work we need to do here is *only* that we need to + # lift this free variable into inputs (recursively) of each nested + # higher-order-op subgraph until we hit the subgraph where the free + # variable is bound + if self.parent is not None: + flat_args, tree_spec = pytree.tree_flatten((args, kwargs)) + new_flat_args = [] + for arg in flat_args: + maybe_new_arg = self.maybe_lift_tracked_freevar_to_input(arg) + new_flat_args.append(maybe_new_arg) + + args, kwargs = pytree.tree_unflatten(new_flat_args, tree_spec) + + rv = super().create_proxy( + kind, target, args, kwargs, name, type_expr, proxy_factory_fn + ) + + # append stack trace to fx node + tx = self.output_graph.current_tx + + # log detailed location of line of code in 3.11 + if sys.version_info >= (3, 11) and kind in ( + "call_function", + "call_method", + "call_module", + ): + cur_inst = tx.current_instruction + if ( + cur_inst is not self.prev_inst + and cur_inst.positions is not None + and cur_inst.positions.lineno is not None + ): + tx_code = tx.f_code + header = tx.get_line_of_code_header(lineno=cur_inst.positions.lineno) + + def get_trace_call_log_str(): + line = get_instruction_source_311(tx_code, cur_inst).rstrip() + return f"TRACE FX call {rv.node.name} from {header}\n{line}" + + trace_call_log.debug("%s", LazyString(get_trace_call_log_str)) + self.prev_inst = cur_inst + + # update reference to original meta if we're tracing a new code object + is_retracing = False + if tx.f_code is not self._cur_code: + orig_graphmodule_maybe = code_context.get_context(tx.f_code).get( + "orig_graphmodule", lambda: None + )() + if isinstance(orig_graphmodule_maybe, torch.fx.GraphModule): + is_retracing = True + self._orig_gm_meta = [ + nd.meta for nd in orig_graphmodule_maybe.graph.nodes + ] + 
self._orig_gm_lineno_map = orig_graphmodule_maybe._lineno_map + self._orig_gm_firstlineno = ( + orig_graphmodule_maybe.forward.__code__.co_firstlineno + ) + else: + self._orig_gm_meta = None + self._orig_gm_lineno_map = None + self._orig_gm_firstlineno = None + nn_module_stack = tx.nn_module_stack + if nn_module_stack: + rv.node.meta["nn_module_stack"] = nn_module_stack.copy() + + if kind in {"call_function", "call_method"}: + rv.node.meta["source_fn_stack"] = self.source_fn_stack + [ + (rv.node.name, target) + ] + elif kind == "call_module": + if self.parent is not None: + unimplemented("Invoking an nn.Module inside HigherOrderOperator") + # For modules we store the class + rv.node.meta["source_fn_stack"] = self.source_fn_stack + [ + ( + rv.node.name, + rv.node.meta["nn_module_stack"][target][1], + ) + ] + + # preserve original meta if it is available + if ( + self._orig_gm_meta + and self._orig_gm_lineno_map + and self._orig_gm_firstlineno + ): + lineno = tx.current_instruction.starts_line + node_idx = None + if lineno is not None: + node_idx = self._orig_gm_lineno_map.get( + lineno - self._orig_gm_firstlineno, None + ) + if node_idx is not None: + meta = self._orig_gm_meta[node_idx] + for field in fx.proxy._COPY_META_FIELDS: + if field in meta: + rv.node.meta[field] = meta[field] + if "stack_trace" in meta: + rv.node.meta["stack_trace"] = meta["stack_trace"] + + if not is_retracing: + if "nn_module_stack" not in rv.node.meta: + nn_module_stack = tx.nn_module_stack + if nn_module_stack: + rv.node.meta["nn_module_stack"] = nn_module_stack.copy() + + if "source_fn_stack" not in rv.node.meta: + if kind in {"call_function", "call_method"}: + rv.node.meta["source_fn_stack"] = self.source_fn_stack + [ + (rv.node.name, target) + ] + elif kind == "call_module": + if self.parent is not None: + unimplemented( + "Invoking an nn.Module inside HigherOrderOperator" + ) + # For modules we store the class + rv.node.meta["source_fn_stack"] = self.source_fn_stack + [ + ( + rv.node.name, + rv.node.meta["nn_module_stack"][target][1], + ) + ] + + if "stack_trace" not in rv.node.meta: + frame_summaries: List[traceback.FrameSummary] = [] + while tx: + frame_summaries.append(tx.frame_summary()) + tx = getattr(tx, "parent", None) + # Reverse the frame_summaries, such that the innermost frame is at the last + frame_summaries.reverse() + + # official from_list stub doesn't have new-style type + msgs = traceback.StackSummary.from_list(frame_summaries).format() + rv.node.stack_trace = "".join(msgs) + + return rv + + def create_node( + self, op, target, args=None, kwargs=None, name=None, type_expr=None + ): + check_pt2_compliant_op(self.output_graph, op, target, args, kwargs) + if self.parent is not None: + flat_args = pytree.arg_tree_leaves(*args, **kwargs) + for arg in flat_args: + if not isinstance(arg, torch.fx.Node): + continue + assert ( + arg.graph == self.graph + ), "create_node using arg not from this SubgraphTracer" + + node = super().create_node(op, target, args, kwargs, name, type_expr) + node.meta["creation_timestamp"] = self.output_graph.timestamp + return node + + # Note: we did not override erase_node since + # we call self.graph.erase_node elsewhere + def remove_node(self, node): + if len(node.users) > 0: + user_graph_nodes: List[torch.fx.Node] = [] + for user in node.users.keys(): + # For the case where user.graph == self.graph, that is a real bug and will raise + # properly. + if user.graph != self.graph: + # This is a nested graph, which needs to be deleted. 
+ # If we do not do this, we will raise on attempting to remove this. + # As we only get here during restoration cleanup, this is sound. + user_graph_nodes.extend(reversed(list(user.graph.nodes))) + for other_graph_node in user_graph_nodes: + other_graph_node.graph.erase_node(other_graph_node) + self.graph.erase_node(node) + self.input_name_to_proxy.pop(node.name, None) + + # when before=True, we will insert this input before the most recent + # inserted proxy. This is a hack to get around an ordering problem, + # where we first insert a tensor argument, and then insert bindings + # for SymInts that may occur in the tensor argument. + # Remove this if https://github.com/pytorch/pytorch/issues/99007 gets + # fixed. + def create_graph_input(self, name, type_expr=None, before=False, source=None): + log.debug( + "create_graph_input %s %s", + name, + source.name() if source is not None else "(none)", + ) + if source is None: + assert ( + self.parent is not None + ), "you are required to provide a source for inputs on the root tracer" + + # In eager, we are generally OK with adding graph inputs whenever we + # want, because we take care of writing the bytecode that knows how + # to source all the inputs. + # + # In export, this is bad, because you want a self-contained export + # object which only depends on the inputs you explicitly passed to it. + # So we are a bit more strict about what sources can become inputs + # in export + if self.export_root: + if not is_from_local_source(source, allow_cell_or_freevar=False): + self.output_graph.source_to_user_stacks.setdefault(source, []).append( + TracingContext.extract_stack() + ) + + # unique + if name in self.input_name_to_proxy: + for i in itertools.count(): + candidate_name = f"{name}_{i}" + if candidate_name not in self.input_name_to_proxy: + name = candidate_name + break + + if self.input_name_to_proxy: + prev_name = next(reversed(self.input_name_to_proxy)) + node = self.input_name_to_proxy[prev_name].node + if before: + ctx = self.graph.inserting_before(node) + else: + ctx = self.graph.inserting_after(node) + else: + ctx = self.graph.inserting_before(None) + with ctx: + proxy = self.create_proxy("placeholder", name, (), {}, type_expr=type_expr) + if self.input_name_to_proxy and before: + k, v = self.input_name_to_proxy.popitem() + self.input_name_to_proxy[name] = proxy + self.input_name_to_proxy[k] = v + else: + self.input_name_to_proxy[name] = proxy + return proxy + + # See NOTE: [Nested SubgraphTracer and free_variable handling] for more details + def lift_tracked_freevar_to_input(self, proxy): + # You're doing something wrong if we are the root SubgraphTracer because + # Dynamo adds tensors to graph inputs before creating a proxy for them. + assert ( + self.parent is not None + ), "lift_tracked_freevar_to_input should not be called on root SubgraphTracer" + # Proxys are associated with VariableTracker. + # It is possible that we've already lifted the Proxy to be an input. + # If that is the case, just return the already lifted Proxy. + if proxy in self.lifted_freevars: + return self.lifted_freevars[proxy] + new_proxy = self.create_graph_input(proxy.node.name) + new_proxy.node.meta["example_value"] = proxy.node.meta["example_value"] + self.lifted_freevars[proxy] = new_proxy + if self.parent is not None and proxy.tracer != self.parent: + self.parent.lift_tracked_freevar_to_input(proxy) + return new_proxy + + def maybe_lift_tracked_freevar_to_input(self, arg): + """ + If arg is a free variable, then lift it to be an input. 
+ Returns the new lifted arg (if arg was a freevar), else the + original arg. + """ + if not isinstance(arg, torch.fx.Proxy): + return arg + elif arg.tracer == self: + return arg + return self.lift_tracked_freevar_to_input(arg) + + +# NOTE: [HigherOrderOperator tracing design] +# Ignoring HigherOrderOperators for a moment, +# OutputGraph represents the graph being built by Dynamo that may be compiled +# and executed. It holds a root SubgraphTracer where the FX graph is built. +# +# HigherOrderOperators are operators that take functions as their arguments. +# When Dynamo encounters a HigherOrderOperator, then it attempts to introspect +# the function passed to it (call this the "body function"), capture it into a +# GraphModule, and rewrite the call to the HigherOrderOperator to use the +# GraphModule. +# +# The way we handle the capture of body functions is through having +# (possibly nested) SubgraphTracers, one per body function. +# +# Mechanically, we do the introspection by: +# - Creating a new SubgraphTracer via OutputGraph.subtracer +# - Executing the body function. +# This constructs the graph of the body function in the new SubgraphTracer +# while modifying the state of the OutputGraph. For example: +# - the OutputGraph can receive new GraphArgs (if we discover any new +# untracked Tensors) +# - side effects from the body function get accumulated into +# OutputGraph.side_effects +# - guards produced by the body function get accumulated into OutputGraph.guards +# +# The traced function has some special properties that make it easier for us +# to transform later down the line: +# - we lift all free variables to being inputs. +# +# If the introspection fails (due to the existence of graph breaks), then +# we roll back the current OutputGraph state and graph break on the +# HigherOrderOperator. diff --git a/MLPY/Lib/site-packages/torch/_dynamo/polyfill.py b/MLPY/Lib/site-packages/torch/_dynamo/polyfill.py new file mode 100644 index 0000000000000000000000000000000000000000..4f6add71c1b2f2787c98bfd3823dfd564f28d687 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/polyfill.py @@ -0,0 +1,47 @@ +# mypy: ignore-errors + +""" +Python polyfills for common builtins. 
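+
+These are plain-Python reimplementations of builtins that Dynamo can inline
+and trace, rather than graph-breaking on their C implementations.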
+""" +import math + +import torch + + +def all(iterator): + for elem in iterator: + if not elem: + return False + return True + + +def any(iterator): + for elem in iterator: + if elem: + return True + return False + + +def index(iterator, item, start=0, end=None): + for i, elem in enumerate(list(iterator))[start:end]: + if item == elem: + return i + # This will not run in dynamo + raise ValueError(f"{item} is not in {type(iterator)}") + + +def repeat(item, count): + for i in range(count): + yield item + + +def radians(x): + return math.pi / 180.0 * x + + +def accumulate_grad(x, new_grad): + new_grad = torch.clone(new_grad) + if x.grad is None: + x.grad = new_grad + else: + x.grad.add_(new_grad) diff --git a/MLPY/Lib/site-packages/torch/_dynamo/profiler.py b/MLPY/Lib/site-packages/torch/_dynamo/profiler.py new file mode 100644 index 0000000000000000000000000000000000000000..f26c3c7d010d2aa622e8061464c7be191fbd1297 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/profiler.py @@ -0,0 +1,155 @@ +import dataclasses +import os +from typing import Any, List + +import torch + +from .utils import print_once + + +@dataclasses.dataclass +class ProfileMetrics: + microseconds: float = 0.0 + operators: int = 0 + fusions: int = 0 + graphs: int = 0 + + def __iadd__(self, other: "ProfileMetrics"): + self.microseconds += other.microseconds + self.operators += other.operators + self.fusions += other.fusions + return self + + def __add__(self, other: "ProfileMetrics"): + assert isinstance(other, ProfileMetrics) + return ProfileMetrics( + self.microseconds + other.microseconds, + self.operators + other.operators, + self.fusions + other.fusions, + ) + + def __truediv__(self, other): + if isinstance(other, int): + other = ProfileMetrics(other, other, other) + return ProfileMetrics( + self.microseconds / max(1, other.microseconds), + self.operators / max(1, other.operators), + self.fusions / max(1, other.fusions), + ) + + def __str__(self): + return f"{self.operators:4.0%} ops {self.microseconds:4.0%} time" + + def tocsv(self): + return [self.operators, self.microseconds] + + +class ProfileResult: + def __init__(self, captured, total, unique_graphs): + self.captured: ProfileMetrics = captured or ProfileMetrics() + self.total: ProfileMetrics = total or ProfileMetrics() + self.unique_graphs: int = unique_graphs + + def __iadd__(self, other: "ProfileResult"): + self.captured += other.captured + self.total += other.total + self.unique_graphs += other.unique_graphs + return self + + def percent(self): + return self.captured / self.total + + def __str__(self): + return ( + f"{self.unique_graphs:2} graphs {self.captured.graphs:2} graph calls " + f"{self.captured.operators:4}/{self.total.operators:4} = " + + str(self.percent()) + ) + + def tocsv(self): + return [ + self.unique_graphs, + self.captured.graphs, + self.captured.operators, + self.total.operators, + ] + self.percent().tocsv() + + +def should_print_missing(): + return os.environ.get("TORCHDYNAMO_PRINT_MISSING") == "1" + + +def print_missing(stack): + if any("/torch/autograd/profiler.py" in x for x in stack): + return + stack = [ + x for x in stack if ("> ".join(stack[-3:])) + + +class Profiler: + unique_graphs = 0 + + def __init__(self): + self.prof = torch.profiler.profile( + activities=[torch.profiler.ProfilerActivity.CPU], + with_stack=should_print_missing(), + ) + + def results(self): + captured_regions = 0 + captured_ops = 0 + captured_microseconds = 0 + total_ops = 0 + total_microseconds = 0 + + last_op_end_time = -1 + captured_region_end_time = 
-1 + events = sorted(self.prof.events(), key=lambda x: x.time_range.start) + for e in events: + if e.name == "TORCHDYNAMO": + captured_region_end_time = e.time_range.end + captured_regions += 1 + # ignore `handle = torch.zeros(1)` in record_function.__init__() + total_ops -= 1 + elif e.time_range.start >= last_op_end_time: + last_op_end_time = e.time_range.end + if e.time_range.end <= captured_region_end_time: + captured_ops += 1 + captured_microseconds += e.time_range.elapsed_us() + elif should_print_missing(): + print_missing(e.stack) + total_ops += 1 + total_microseconds += e.time_range.elapsed_us() + else: + pass # ops recursively called from other ops (ignored) + + unique_graphs = Profiler.unique_graphs + Profiler.unique_graphs = 0 + # we counted one extra op that is part of the profiler setup code + total_ops -= 1 + + return ProfileResult( + captured=ProfileMetrics( + microseconds=captured_microseconds, + operators=captured_ops, + fusions=captured_ops - captured_regions, + graphs=captured_regions, + ), + total=ProfileMetrics( + microseconds=total_microseconds, + operators=total_ops, + fusions=total_ops - 1, + ), + unique_graphs=unique_graphs, + ) + + +def fx_insert_profiling(gm: torch.fx.GraphModule, example_inputs: List[Any]): + def _wrapped(*args): + with torch.profiler.record_function("TORCHDYNAMO"): + return gm.forward(*args) + + Profiler.unique_graphs += 1 + return _wrapped diff --git a/MLPY/Lib/site-packages/torch/_dynamo/replay_record.py b/MLPY/Lib/site-packages/torch/_dynamo/replay_record.py new file mode 100644 index 0000000000000000000000000000000000000000..6f4690ed78b14bc765edbfeafa62bf4e35907a16 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/replay_record.py @@ -0,0 +1,110 @@ +import dataclasses +from dataclasses import field +from types import CodeType, ModuleType +from typing import Any, Dict + +from torch.utils._import_utils import import_dill + +dill = import_dill() + + +@dataclasses.dataclass +class ModuleRecord: + module: ModuleType + accessed_attrs: Dict[str, Any] = field(default_factory=dict) + + +@dataclasses.dataclass +class DummyModule: + name: str + is_torch: bool = False + + @property + def __name__(self): + return self.name + + +@dataclasses.dataclass +class ExecutionRecord: + code: CodeType + globals: Dict[str, Any] = field(default_factory=dict) + locals: Dict[str, Any] = field(default_factory=dict) + builtins: Dict[str, Any] = field(default_factory=dict) + code_options: Dict[str, Any] = field(default_factory=dict) + + def dump(self, f): + assert dill is not None, "replay_record requires `pip install dill`" + dill.dump(self, f) + + @classmethod + def load(cls, f): + assert dill is not None, "replay_record requires `pip install dill`" + return dill.load(f) + + +@dataclasses.dataclass +class ExecutionRecorder: + LOCAL_MOD_PREFIX = "___local_mod_" + + code: CodeType + globals: Dict[str, Any] = field(default_factory=dict) + locals: Dict[str, Any] = field(default_factory=dict) + builtins: Dict[str, Any] = field(default_factory=dict) + code_options: Dict[str, Any] = field(default_factory=dict) + name_to_modrec: Dict[str, Any] = field(default_factory=dict) + + def add_local_var(self, name, var): + if isinstance(var, ModuleType): + self.locals[name] = self._add_mod(var) + else: + self.locals[name] = var + + def add_global_var(self, name, var): + if isinstance(var, ModuleType): + self.globals[name] = self._add_mod(var) + else: + self.globals[name] = var + + def add_local_mod(self, name, mod): + assert isinstance(mod, ModuleType) + + 
self.add_global_var(name, mod) + + def record_module_access(self, mod, name, val): + if isinstance(val, ModuleType): + self.name_to_modrec[mod.__name__].accessed_attrs[name] = self._add_mod(val) + return + + if mod.__name__ in self.name_to_modrec: + self.name_to_modrec[mod.__name__].accessed_attrs[name] = val + + def get_record(self): + return ExecutionRecord( + self.code, + ExecutionRecorder._resolve_modules(self.globals), + ExecutionRecorder._resolve_modules(self.locals), + self.builtins.copy(), + self.code_options.copy(), + ) + + def _add_mod(self, mod): + if mod.__name__ not in self.name_to_modrec: + self.name_to_modrec[mod.__name__] = ModuleRecord(mod) + + return self.name_to_modrec[mod.__name__] + + # Convert ModuleRecords -> DummyModule tree + @classmethod + def _resolve_modules(cls, vars): + def resolve_module(var): + if not isinstance(var, ModuleRecord): + return var + + dummy_mod = DummyModule(var.module.__name__) + for attr_name, attr_value in var.accessed_attrs.items(): + attr_value = resolve_module(attr_value) + dummy_mod.__setattr__(attr_name, attr_value) + + return dummy_mod + + return {k: resolve_module(v) for k, v in vars.items()} diff --git a/MLPY/Lib/site-packages/torch/_dynamo/repro/__init__.py b/MLPY/Lib/site-packages/torch/_dynamo/repro/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MLPY/Lib/site-packages/torch/_dynamo/repro/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/repro/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2f7c37cb0533e34b507fcb72806e318b6e569a41 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/repro/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/repro/__pycache__/after_aot.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/repro/__pycache__/after_aot.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..81e858dd1f7e713cb81ef3bab109137a40888af9 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/repro/__pycache__/after_aot.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/repro/__pycache__/after_dynamo.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/repro/__pycache__/after_dynamo.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6a8604c8d018209ecbb642bab8d04b238dc4fcea Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/repro/__pycache__/after_dynamo.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/repro/after_aot.py b/MLPY/Lib/site-packages/torch/_dynamo/repro/after_aot.py new file mode 100644 index 0000000000000000000000000000000000000000..3658d7430396b7bda82470dc7c5bde8025a18727 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/repro/after_aot.py @@ -0,0 +1,932 @@ +import argparse +import copy +import functools +import io +import logging +import os +import shutil +import subprocess +import sys +import textwrap +import uuid +from importlib import import_module +from tempfile import TemporaryFile +from typing import Any, Callable, Dict, Union + +import torch +import torch.fx as fx +import torch.nn as nn +from torch._dynamo.debug_utils import ( + _cuda_system_info_comment, + AccuracyError, + backend_accuracy_fails, + BuckTargetWriter, + cast_to_fp64, + extra_imports, + generate_config_string, + helper_for_dump_minify, + InputReader, + InputWriter, + 
MAX_CONSTANT_NUMEL_INLINE, + minifier_dir, + NNModuleToString, + NopInputReader, + same_two_models, +) +from torch._dynamo.utils import clone_inputs, counters, same +from torch.fx.experimental.proxy_tensor import make_fx +from torch.fx.experimental.symbolic_shapes import ( + fx_placeholder_targets, + has_free_symbols, +) +from torch.hub import tqdm + +from .. import config + +log = logging.getLogger(__name__) + + +inductor_config = import_module("torch._inductor.config") +use_buck = inductor_config.is_fbcode() + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# MAIN ENTRY POINT +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + + +def wrap_compiler_debug(unconfigured_compiler_fn, compiler_name: str): + """ + Minifier for Fx Graph modules after Aot Autograd has finished. We wrap both + forward and backward call separately with the backend compiler_fn - like + inductor or nvfuser. Intercepting after Aot Autograd presents neat + abstraction, where all the params are lifted as graph inputs, making it easy + to save the graph as a string. + """ + + @functools.wraps(unconfigured_compiler_fn) + def debug_wrapper(gm, example_inputs, **kwargs): + from torch._subclasses import FakeTensorMode + + compiler_fn = functools.partial(unconfigured_compiler_fn, **kwargs) + + from torch._functorch.aot_autograd import get_aot_graph_name + + graph_name = get_aot_graph_name() + + # TODO: why do we need to deepcopy the original graph? + orig_graph = copy.deepcopy(gm.graph) + assert config.repro_after in ("dynamo", "aot", None) + + try: + # Call the compiler_fn - which is either aot_autograd or inductor + # with fake inputs + inner_compiled_fn = compiler_fn(gm, example_inputs) + except Exception as e: + # TODO: Failures here are troublesome because no real inputs, + # need a different serialization strategy + if config.repro_after == "aot": + if config.repro_level == 1: + dump_compiler_graph_state( + fx.GraphModule(gm, orig_graph), + example_inputs, + compiler_name, + ) + elif config.repro_level == 2: + dump_to_minify( + fx.GraphModule(gm, orig_graph), + example_inputs, + compiler_name, + ) + log.error("CompilerError") + raise + + # We may run regular PyTorch compute that may trigger Dynamo, do NOT + # recursively attempt to accuracy minify in that case! + def deferred_for_real_inputs(real_inputs): + # This is a bit obscure: if we recursively try to accuracy minify + # the SAME function, this would trigger. But most of the time + # we should never hit this branch + if config.repro_after != "aot": + return inner_compiled_fn(real_inputs) + with config.patch(repro_after=None): + return inner_debug_fn(real_inputs) + + def inner_debug_fn(real_inputs): + """ + Aot Autograd fw_compiler and bw_compiler can have fake tensors. So, + example_inputs can be fake tensors. We can call compiler_fn (which is + inductor or nvfuser) with fake tensors but the actually compiled_fn + should be called with real tensors. Therefore, the actual invocation + is deferred. + """ + # Copy the tensor attrs like shape, stride etc by converting to Fake Tensor + # because inductor clears the tensor list in its codegen. And example_inputs + # are available only for the first invocation. 
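+            # (FakeTensorMode.from_tensor preserves metadata such as shape,
+            # stride, dtype and device without holding real storage, so these
+            # attributes stay available after the real inputs are consumed.)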
+ fake_mode = FakeTensorMode() + copy_tensor_attrs = [ + fake_mode.from_tensor(x) if isinstance(x, torch.Tensor) else x + for x in real_inputs + ] + if config.repro_level == 3: + # Always dump the original module in case we have segfaults + dump_to_minify( + fx.GraphModule(gm, orig_graph), real_inputs, compiler_name + ) + + if config.repro_level == 4: + if compiler_name != "inductor": + raise NotImplementedError( + "Accuracy minification is supported for inductor only" + ) + if backend_aot_accuracy_fails(gm, real_inputs, compiler_fn): + log.warning( + "Accuracy failed for the AOT Autograd graph %s", graph_name + ) + dump_compiler_graph_state( + fx.GraphModule(gm, orig_graph), + real_inputs, + f"{compiler_name}_accuracy", + ) + dump_to_minify( + fx.GraphModule(gm, orig_graph), + real_inputs, + f"{compiler_name}_accuracy", + ) + raise AccuracyError("Bad accuracy detected") + else: + # Call the compiled function with real inputs + return inner_compiled_fn(real_inputs) + else: + try: + # Call the compiled function with real inputs + out = inner_compiled_fn(real_inputs) + # sync cuda kernels to ensure IMA detection + for arg in example_inputs: + if isinstance(arg, torch.Tensor) and arg.is_cuda: + torch.cuda.synchronize() + break + return out + except Exception as e: + if config.repro_level == 1: + dump_compiler_graph_state( + fx.GraphModule(gm, orig_graph), + copy_tensor_attrs, + compiler_name, + ) + elif config.repro_level == 2: + dump_to_minify( + fx.GraphModule(gm, orig_graph), + copy_tensor_attrs, + compiler_name, + ) + raise + + if config.repro_after == "aot": + compiled_fn = deferred_for_real_inputs + compiled_fn._boxed_call = True # type: ignore[attr-defined] + return compiled_fn + else: + return inner_compiled_fn + + return debug_wrapper + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# DUMP REPROS +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + + +def generate_compiler_repro_string(gm, args, *, stable_output=False, save_dir=None): + model_str = textwrap.dedent( + f""" +import torch +from torch import tensor, device +import torch.fx as fx +from torch._dynamo.testing import rand_strided +from math import inf +import torch._inductor.inductor_prims + +{generate_config_string(stable_output=stable_output)} + +isolate_fails_code_str = None + +{extra_imports} + + """ + ) + if not stable_output: + model_str += f"# torch version: {torch.version.__version__}\n" + if hasattr(torch.version, "cuda"): + model_str += f"# torch cuda version: {torch.version.cuda}\n" + if hasattr(torch.version, "git_version"): + model_str += f"# torch git version: {torch.version.git_version}\n\n\n" + model_str += _cuda_system_info_comment() + + model_str += NNModuleToString.convert(gm) + + # get hint shape/stride when dynamic shape enabled + def hint_if_symint(x): + return tuple(i.node.hint if isinstance(i, torch.SymInt) else i for i in x) + + writer = InputWriter(save_dir) + for placeholder, arg in zip(fx_placeholder_targets(gm), args): + if isinstance(arg, (int, torch.SymInt)): + writer.symint(placeholder, arg) + elif isinstance(arg, torch.Tensor): + # TODO: improve these names with FQN + writer.tensor(placeholder, arg) + else: + raise TypeError(f"arg is neither SymInt/int nor torch.Tensor, {arg}") + + model_str += "\n".join(writer.lines()) + "\n" + + model_str += "mod = Repro()\n" + return model_str + + +def save_graph_repro( + fd, + gm, + args, + compiler_name, + *, + stable_output=False, + save_dir=None, + command="run", + accuracy=None, + tracing_mode=None, + 
check_str=None, +): + fd.write( + generate_compiler_repro_string( + gm, + args, + stable_output=stable_output, + save_dir=save_dir, + ) + ) + if accuracy is None: + accuracy = "_accuracy" in compiler_name + if tracing_mode is None: + tracing_mode = "real" + if any(has_free_symbols(a) for a in args): + tracing_mode = "symbolic" + fd.write("if __name__ == '__main__':\n") + fd.write(" from torch._dynamo.repro.after_aot import run_repro\n") + fd.write( + f" with torch.no_grad():\n" + f" run_repro(mod, load_args, accuracy={accuracy!r}, command={command!r}, " + f"save_dir={save_dir!r}, tracing_mode={tracing_mode!r}, check_str={check_str!r}" + ")\n" + ) + + +def dump_compiler_graph_state(gm, args, compiler_name, *, accuracy=None): + subdir = os.path.join(minifier_dir(), "checkpoints") + if not os.path.exists(subdir): + os.makedirs(subdir, exist_ok=True) + file_name = os.path.join(subdir, f"{len(gm.graph.nodes)}.py") + log.warning( + "Writing checkpoint with %s nodes to %s", len(gm.graph.nodes), file_name + ) + with open(file_name, "w") as fd: + save_graph_repro( + fd, gm, args, compiler_name, save_dir=subdir, accuracy=accuracy + ) + curdir = os.getcwd() + repro_path = os.path.join(curdir, "repro.py") + try: + shutil.copyfile(file_name, repro_path) + log.warning("Copying repro file for convenience to %s", repro_path) + if use_buck: + BuckTargetWriter(file_name).write() + except OSError: + log.warning("No write permissions for %s", repro_path) + pass + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# DUMP MINIFIER +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + + +def dump_to_minify(gm, args, compiler_name: str): + out = io.StringIO() + # TODO: factor this out + subdir = os.path.join(minifier_dir(), "checkpoints") + if not os.path.exists(subdir): + os.makedirs(subdir, exist_ok=True) + save_graph_repro(out, gm, args, compiler_name, save_dir=subdir, command="minify") + return helper_for_dump_minify(out.getvalue()) + + +def isolate_fails( + fx_g, + args, + compiler_name: str, + env=None, + save_dir=None, + accuracy=None, + tracing_mode=None, + check_str=None, +): + if env is None: + env = {} + subdir = os.path.join(os.getcwd(), "isolate") + if not os.path.exists(subdir): + os.makedirs(subdir, exist_ok=True) + file_name = os.path.join(subdir, f"{str(uuid.uuid4())[:5]}.py") + with open(file_name, "w") as fd: + save_graph_repro( + fd, + fx_g, + args, + compiler_name, + save_dir=save_dir, + command="minifier-query", + accuracy=accuracy, + tracing_mode=tracing_mode, + check_str=check_str, + ) + # with open(file_name, "r") as fd: + # print(fd.read()) + new_env = os.environ.copy() + new_env = {**new_env, **env} + stdout, stderr = TemporaryFile(), TemporaryFile() + + if use_buck: + cmd = BuckTargetWriter(file_name).write(print_msg=False) + else: + cmd = ["python", file_name] + + p = subprocess.Popen( + cmd, + cwd=subdir, + stdout=stdout, + stderr=stderr, + env=new_env, + ) + p.wait() + + stdout.seek(0) + stderr.seek(0) + print( + textwrap.indent(stdout.read().decode("utf-8"), prefix=">> "), file=sys.stdout + ) + print( + textwrap.indent(stderr.read().decode("utf-8"), prefix=">> "), file=sys.stderr + ) + # print(f"Isolated test failed - {file_name}") + return p.returncode != 0 + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# MINIFIER TOOLS +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + + +def inductor_fails(fx_g, args, check_str=None): + has_cuda = False + for arg in args: + if isinstance(arg, 
torch.Tensor) and arg.is_cuda: + has_cuda = True + break + + def sync(): + if has_cuda: + # Ensures that segfaults are surfaced + torch.cuda.synchronize() + + from torch._inductor.compile_fx import compile_fx_inner + + try: + result = fx_g(*args) + assert isinstance(result, (tuple, list)) + assert not any(isinstance(x, (tuple, list)) for x in result) + except Exception: + return False + + sync() + + try: + compile_mod = compile_fx_inner(fx_g, args) + compile_mod(args) + sync() + except Exception as e: + if check_str is not None and check_str not in repr(e): + return False + print(repr(e)) + return True + return False + + +def inductor_accuracy_fails( + fx_g, args, check_str=None, *, require_fp64=False, ignore_non_fp=False +): + from torch._inductor.compile_fx import compile_fx_inner + + return backend_aot_accuracy_fails( + fx_g, + args, + compile_fx_inner, + require_fp64=require_fp64, + ignore_non_fp=ignore_non_fp, + ) + + +backend_aot_accuracy_fails = functools.partial(backend_accuracy_fails, only_fwd=True) + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# REPRO MAIN +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + + +def repro_common(options, mod, load_args): + # Invariant for graphs we generate with the repro script + assert not any(mod.named_parameters()) + for n, b in mod.named_buffers(): + if b.numel() > MAX_CONSTANT_NUMEL_INLINE: + log.warning( + "Constant %s was not serialized, generated random data instead. " + "If you think this is affecting you, please comment on " + "https://github.com/pytorch/pytorch/issues/100468", + n, + ) + + if not hasattr(load_args, "_version"): + log.warning( + "load_args does not have a _version attribute, please file a bug to PyTorch " + "and describe how you generate this repro script" + ) + else: + if load_args._version > 0: + log.warning( + "load_args is version %s, but this version of PyTorch only supports " + "version 0. We will try to run it anyway but there may be an incompatibility; " + "if so, try upgrading your version of PyTorch.", + load_args._version, + ) + + nop_reader = NopInputReader() + load_args(nop_reader) + + with tqdm(desc="Loading inputs", total=nop_reader.total) as pbar: + input_reader = InputReader(save_dir=options.save_dir, pbar=pbar) + load_args(input_reader) + args = input_reader.args + + # Turn mod into a GraphModule the slow way + # TODO: speed this up + mod = make_fx(mod, tracing_mode=options.tracing_mode)(*args) + + torch._inductor.config.generate_intermediate_hooks = True + + return mod, args + + +ACCURACY_FAILS: Dict[str, Callable[[nn.Module, Any], bool]] = { + "": inductor_fails, + # This might look inverted but it's not. 
strict_accuracy means "we will + # minify any time we see anything that diverges", whereas accuracy is more + # conservative, and will only minify if there is a meaningful fp64 + # divergence + "accuracy": functools.partial( + inductor_accuracy_fails, require_fp64=True, ignore_non_fp=True + ), + "strict_accuracy": inductor_accuracy_fails, +} + + +def repro_minifier_query(options, mod, load_args): + mod, args = repro_common(options, mod, load_args) + fail_fn = functools.partial( + ACCURACY_FAILS[options.accuracy], check_str=options.check_str + ) + if fail_fn(mod, args): + sys.exit(1) + else: + sys.exit(0) + + +def repro_minify(options, mod, load_args): + from functorch.compile import minifier + + mod, args = repro_common(options, mod, load_args) + compiler_name = "inductor_accuracy" if options.accuracy != "" else "inductor" + + favored_device = 1 if torch.cuda.device_count() >= 2 else 0 + env_variables = {"CUDA_VISIBLE_DEVICES": str(favored_device)} + + module_fails: Any + if options.isolate: + module_fails = functools.partial( + isolate_fails, + env=env_variables, + compiler_name=compiler_name, + save_dir=options.save_dir, + accuracy=options.accuracy, + tracing_mode=options.tracing_mode, + ) + else: + module_fails = ACCURACY_FAILS[options.accuracy] + + minifier( + mod, + args, + module_fails=functools.partial(module_fails, check_str=options.check_str), + dump_state=functools.partial( + dump_compiler_graph_state, compiler_name=compiler_name + ), + save_dir=options.save_dir, + offload_to_disk=options.offload_to_disk, + skip_offload=options.skip_saving_eager_intermediates, + skip_sanity=options.skip_sanity, + max_granularity=options.max_granularity, + ) + + +def repro_analyze(options, mod, load_args): + from torch._inductor.compile_fx import compile_fx_inner + from torch._inductor.hooks import intermediate_hook + + mod, args = repro_common(options, mod, load_args) + + # TODO: The logic for cloning inputs/models here is intentionally + # modeled off of run_fwd_maybe_bwd, but arguably it is better not to + # clone inputs (as you are doubling your effective GPU memory usage). + # It is certainly faster though! It probably makes sense to let the + # user specify the offload strategy. 
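+    # The passes below therefore clone `args` where it must be reused, because
+    # the boxed calling convention consumes the input list in place (hence the
+    # `assert not new_args` checks after each run).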
+ + with tqdm(desc="Compiling"): + compiled = compile_fx_inner(mod, args) + total = counters["inductor"]["intermediate_hooks"] + + known_names = set() + + def save_hook(name, val): + known_names.add(name) + if not options.skip_saving_inductor_intermediates: + writer.write_tensor(os.path.join("inductor", name), val) + pbar.update(1) # type: ignore[has-type] + + writer = torch.utils._content_store.ContentStoreWriter( + options.save_dir, stable_hash=options.stable_hash + ) + reader = torch.utils._content_store.ContentStoreReader(options.save_dir) + + new_args = clone_inputs(args) + with intermediate_hook(save_hook), tqdm( + desc="Saving inductor intermediates", total=total + ) as pbar: + compiled(new_args) + assert not new_args + + def compare_tuples(tuple1, tuple2): + diff_indices = [i for i in range(len(tuple1)) if tuple1[i] != tuple2[i]] + diff_values = [(tuple1[i], tuple2[i]) for i in diff_indices] + + if not diff_values: + return None + else: + return " and ".join(f"{a} != {b}" for a, b in diff_values) + + def check_hook(name, val): + meta = writer.compute_tensor_metadata(val) + meta2 = reader.read_tensor_metadata(os.path.join("inductor", name)) + reason = compare_tuples(meta, meta2) + if reason is not None: + pbar.write(f"NONDETERMINISTIC INDUCTOR at {name} ({reason})") + pbar.update(1) + + if not options.skip_check_deterministic: + new_args = clone_inputs(args) + with intermediate_hook(check_hook), tqdm( + desc="Checking inductor determinism", total=total + ) as pbar: + compiled(new_args) + assert not new_args + + class WriterInterp(fx.Interpreter): + def __init__(self, mod, subdir): + super().__init__(mod) + self.subdir = subdir + + def run_node(self, n): + r = super().run_node(n) + name = n.name + if name in known_names: + pbar.update(1) + writer.write_tensor(os.path.join(self.subdir, name), r) + return r + + # NB: the module cast doesn't actually do anything, since there are no + # parameters/buffers on the module + if not options.skip_saving_float64_intermediates: + new_mod, new_args = cast_to_fp64(copy.deepcopy(mod), clone_inputs(args)) + with tqdm(desc="Saving float64 intermediates", total=total) as pbar: + WriterInterp(new_mod, "float64").boxed_run(new_args) + assert not new_args + + class ExactReaderInterp(fx.Interpreter): + def run_node(self, n): + r = super().run_node(n) + name = n.name + if name in known_names: + meta = writer.compute_tensor_metadata(r) + meta2 = reader.read_tensor_metadata(os.path.join("float64", name)) + reason = compare_tuples(meta, meta2) + if reason is not None: + pbar.write(f"NONDETERMINISTIC FLOAT64 at {name} ({reason})") + pbar.update(1) + return r + + # TODO: check eager determinism + + if not options.skip_check_deterministic: + new_mod, new_args = cast_to_fp64(copy.deepcopy(mod), clone_inputs(args)) + with tqdm(desc="Checking float64 determinism", total=total) as pbar: + ExactReaderInterp(new_mod).boxed_run(new_args) + assert not new_args + + # Now that we've saved everything, interp through the eager graph + # and do comparisons + class ReaderInterp(fx.Interpreter): + def run_node(self, n): + r = super().run_node(n) + name = n.name + if name in known_names: + inductor = reader.read_tensor(os.path.join("inductor", name)) + float64 = reader.read_tensor(os.path.join("float64", name)) + logged = False + + def log_error(msg, *args): + nonlocal logged + logged = True + pbar.write(f"DIVERGED at {name}: {msg % args}") + + if not same( + r, + inductor, + float64, + tol=torch._dynamo.config.repro_tolerance, + equal_nan=True, + log_error=log_error, + ): + 
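+                    # `same` is expected to have reported the mismatch through
+                    # log_error before returning False.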
assert logged + pbar.update(1) + return r + + with tqdm(desc="Checking divergence", total=total) as pbar: + ReaderInterp(mod).boxed_run(args) + assert not args + + +def repro_run(options, mod, load_args): + from torch._inductor.compile_fx import compile_fx_inner + + mod, args = repro_common(options, mod, load_args) + + from torch.cuda import synchronize + + compiled = compile_fx_inner(mod, args) + + if options.accuracy != "": + # We don't really respect --accuracy vs --strict-accuracy here, it + # seems counterintuitive + if not same_two_models(mod, compiled, args, only_fwd=True): + raise AccuracyError("Bad accuracy detected") + else: + need_sync = False + for arg in args: + if isinstance(arg, torch.Tensor) and arg.is_cuda: + need_sync = True + break + ref = compiled(list(args)) + if need_sync: + synchronize() # ensure segfaults are surfaced + return lambda: compiled(list(args)) + + +# TODO: lazily load the inputs or something, rather than cloning them +def run_repro( + mod, + load_args, + *, + command="run", + accuracy: Union[bool, str] = "", + save_dir=None, + tracing_mode=None, + patch_code=None, + check_str=None, + **kwargs, +): + for k in kwargs: + log.warning( + "Unrecognized kwarg %s; perhaps this repro was made on a newer version of PyTorch", + k, + ) + + if accuracy is True: + accuracy = "accuracy" + elif accuracy is False: + accuracy = "" + + if patch_code is not None: + log.warning( + "patch_code no longer works on this version of PyTorch, silently ignoring" + ) + + parser = argparse.ArgumentParser( + description=f"""\ +An after_aot repro script, typically triggering a bug in PyTorch Inductor. +When run with no arguments, this script defaults to running '{command}'. +Extra flags may be available; to find out more, try '{command} --help'. +There are also alternate subcommands available, see below. + +default settings on this script: + {accuracy=} + {tracing_mode=} + {save_dir=} + {check_str=} +""", + formatter_class=argparse.RawTextHelpFormatter, + ) + + def common_flags(parser): + accuracy_group = parser.add_mutually_exclusive_group() + accuracy_group.add_argument( + "--no-accuracy", + dest="accuracy", + action="store_const", + const="", + default=accuracy, + help="do not test accuracy, just run the module and see if it errors", + ) + accuracy_group.add_argument( + "--accuracy", + action="store_const", + const="accuracy", + default=accuracy, + help="""\ +test if the RMSE between the compiled module and the fp64 reference is greater +than eager and the fp64 reference. This is usually more reliable than the +standard allclose test, as we expect numeric differences from compiling, often +improving accuracy over eager. RMSE test allows for compiled module to +diverge greatly from eager, as long as this divergence moves it closer to the +'true' mathematical value of the network. Caveats: (1) double precision can +still suffer from rounding error, so it is not a perfect reference (see for +example 'Herbie: Automatically Improving Floating Point Accuracy') for +approaches that detect the necessary working precision and compute it in +arbitrary precision floating point; unfortunately, this is not practical for +tensor computation; (2) if there are not enough samples in the output being +compared, we may get unlucky and have an unlucky greater RMSE than eager; this +could be overcome by applying a more rigorous statistical test at some +p-value, which we leave for future work. 
+""", + ) + accuracy_group.add_argument( + "--strict-accuracy", + dest="accuracy", + action="store_const", + const="strict_accuracy", + default=accuracy, + help="""\ +by default, when doing accuracy minification we will reject reductions which +change the divergence from a floating point divergence to a integral/boolean +divergence. This is because some operations like ReLU involve temporarily +sharp boundaries that smooth out again afterwards; without requiring +divergence on floating point, the minifier will often fixate on divergent +boolean tensor even though this is not the true source of the divergence. +However, rejecting these reductions makes it more difficult for the minifier +to make process. Using this option will let the minifier progress for ALL +divergences--you just might not end up with a useful repro in the end.""", + ) + + parser.add_argument( + "--save-dir", + type=str, + default=save_dir, + metavar="DIR", + help="directory where saved inputs live", + ) + parser.add_argument( + "--no-save-dir", + dest="save_dir", + action="store_const", + const=None, + help="don't use any directory for saved inputs", + ) + parser.add_argument( + "--tracing-mode", + type=str, + metavar="{real,fake,symbolic}", + default=tracing_mode, + help="how to trace the repro module into a GraphModule with metadata", + ) + + subparsers = parser.add_subparsers( + dest="command", metavar="{run,minify,analyze}", required=True + ) + + parser_run = subparsers.add_parser( + "run", + help="just run the repro", + ) + common_flags(parser_run) + + parser_minify = subparsers.add_parser( + "minify", help="run the minifier on the repro" + ) + common_flags(parser_minify) + parser_minify_isolate = parser_minify.add_mutually_exclusive_group() + parser_minify_isolate.add_argument( + "--isolate", + action="store_true", + default=True, + help="run in separate processes to avoid interference (default)", + ) + parser_minify_isolate.add_argument( + "--no-isolate", + dest="isolate", + action="store_false", + help="speed up by running all compilation in same process", + ) + parser_minify.add_argument( + "--skip-saving-eager-intermediates", + action="store_true", + help="skip saving eager intermediates on --minify", + ) + # TODO: make this an option for --analyze too + parser_minify.add_argument( + "--offload-to-disk", + action="store_true", + help="during minification, offload delta debugging intermediates to disk. 
Use if you're OOMing", + ) + parser_minify.add_argument( + "--skip-sanity", + action="store_true", + help="skip sanity check at beginning of minification on original graph", + ) + parser_minify.add_argument( + "--max-granularity", + type=int, + default=None, + help="start at this granularity and work down; must be power of 2", + ) + parser_minify.add_argument( + "--check-str", + type=str, + default=check_str, + help="require minified program to fail with error containing this string", + ) + + parser_analyze = subparsers.add_parser( + "analyze", help="run the accuracy analyzer on the repro" + ) + common_flags(parser_analyze) + parser_analyze.add_argument( + "--skip-saving-inductor-intermediates", + action="store_true", + help="skip saving inductor intermediates on --analyze", + ) + parser_analyze.add_argument( + "--skip-saving-float64-intermediates", + action="store_true", + help="skip saving float64 intermediates", + ) + parser_analyze.add_argument( + "--skip-check-deterministic", + action="store_true", + help="skip checking that the network is deterministic", + ) + parser_analyze.add_argument( + "--stable-hash", + action="store_true", + help="use SHA-1 checksum instead of fast (but possibly unsound) hash", + ) + + # Run the repro in the context of minification, inverting exit code meaning + parser_minifier_query = subparsers.add_parser( + "minifier-query", + ) + common_flags(parser_minifier_query) + parser_minifier_query.add_argument( + "--check-str", + type=str, + default=check_str, + help="require minified program to fail with error containing this string", + ) + + args = None + if len(sys.argv) <= 1: + args = [command, *sys.argv[1:]] + + options = parser.parse_args(args) + COMMAND_FNS = { + "minify": repro_minify, + "analyze": repro_analyze, + "minifier-query": repro_minifier_query, + "run": repro_run, + } + return COMMAND_FNS[options.command](options, mod, load_args) diff --git a/MLPY/Lib/site-packages/torch/_dynamo/repro/after_dynamo.py b/MLPY/Lib/site-packages/torch/_dynamo/repro/after_dynamo.py new file mode 100644 index 0000000000000000000000000000000000000000..854807d5a0654e06d496fb37f7b5e76602e53e4e --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/repro/after_dynamo.py @@ -0,0 +1,566 @@ +import argparse +import copy +import functools +import logging +import os +import shutil +import sys +import textwrap +from importlib import import_module +from typing import Union + +import torch +import torch.fx as fx + +from torch._dynamo.debug_utils import ( + AccuracyError, + backend_accuracy_fails, + BUCK_CMD_PREFIX, + BuckTargetWriter, + extra_imports, + generate_config_string, + helper_for_dump_minify, + InputReader, + InputWriter, + minifier_dir, + NNModuleToString, + NopInputReader, + run_fwd_maybe_bwd, + same_two_models, +) +from torch.fx.experimental.symbolic_shapes import fx_placeholder_targets +from torch.hub import tqdm + +from .. import config +from ..backends.registry import lookup_backend, register_debug_backend +from ..debug_utils import clone_inputs_retaining_gradness + +log = logging.getLogger(__name__) + + +inductor_config = import_module("torch._inductor.config") +use_buck = inductor_config.is_fbcode() + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# MAIN ENTRY POINT +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + + +def wrap_backend_debug(unconfigured_compiler_fn, compiler_name: str): + """ + A minifier decorator that wraps the TorchDynamo produced Fx graph modules. 
+ As opposed to wrap_compiler_debug, this wrapper intercepts at the + TorchDynamo produced Fx Graph Module. This makes it backend-agnostic to some + level, e.g., it is useful for minifying issues related to Aot Autograd + tracing. If an error is found, we minify and save the minified repro in + repro.tar.gz. + """ + + @functools.wraps(unconfigured_compiler_fn) + def debug_wrapper(gm, example_inputs, **kwargs): + compiler_fn = functools.partial(unconfigured_compiler_fn, **kwargs) + assert config.repro_after in ("dynamo", "aot", None) + + if config.repro_after == "dynamo": + + def add_paths(exc): + exc.minifier_path = os.path.join(minifier_dir(), "minifier_launcher.py") + if use_buck: + exc.buck_command = " ".join( + BUCK_CMD_PREFIX + + [BuckTargetWriter(exc.minifier_path).cmd_line_path] + ) + + if config.repro_level == 3: + dump_to_minify_after_dynamo(gm, example_inputs, compiler_name) + + # Check for either accuracy (level 4) or other type of failures. + if config.repro_level == 4: + # Check Accuracy + compiled_gm = compiler_fn(copy.deepcopy(gm), example_inputs) + if backend_accuracy_fails(gm, example_inputs, compiler_fn): + log.warning( + "Accuracy failed for the TorchDynamo produced graph. Creating script to minify the error." + ) + dump_to_minify_after_dynamo( + fx.GraphModule(gm, copy.deepcopy(gm.graph)), + example_inputs, + compiler_name, + ) + exc = AccuracyError("Bad accuracy detected.") + add_paths(exc) + raise exc + else: + try: + compiled_gm = compiler_fn(copy.deepcopy(gm), example_inputs) + run_fwd_maybe_bwd(compiled_gm, example_inputs) + except Exception as exc: + log.warning( + "Compiled Fx GraphModule failed. Creating script to minify the error." + ) + if config.repro_level == 1: + dump_state_fn = functools.partial( + dump_backend_state, compiler_name=compiler_name + ) + dump_state_fn( + fx.GraphModule(gm, copy.deepcopy(gm.graph)), example_inputs + ) + elif config.repro_level == 2: + dump_to_minify_after_dynamo( + fx.GraphModule(gm, copy.deepcopy(gm.graph)), + example_inputs, + compiler_name, + ) + add_paths(exc) + raise + else: + compiled_gm = compiler_fn(gm, example_inputs) + + return compiled_gm + + debug_wrapper._torchdynamo_orig_callable = unconfigured_compiler_fn # type: ignore[attr-defined] + if hasattr(unconfigured_compiler_fn, "compiler_name"): + debug_wrapper.__name__ = unconfigured_compiler_fn.compiler_name + if hasattr(unconfigured_compiler_fn, "get_compiler_config"): + debug_wrapper.get_compiler_config = unconfigured_compiler_fn.get_compiler_config # type: ignore[attr-defined] + return debug_wrapper + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# REPRO DUMPERS +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + + +def generate_dynamo_fx_repro_string( + gm, + args, + compiler_name, + check_accuracy=False, + *, + stable_output=False, + save_dir=None, + command="run", +): + """ + Generate a repro string for backend-agnostic minified version. 
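+
+    The emitted script rebuilds the module from its string form, reloads the
+    saved inputs via load_args, and then calls run_repro so the failure can be
+    reproduced standalone.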
+ """ + + model_str = NNModuleToString.convert(gm) + + # TODO: Figure out why torch.compile'd hash isn't work on this codepath + writer = InputWriter(save_dir, stable_hash=True) + for placeholder, arg in zip(fx_placeholder_targets(gm), args): + if isinstance(arg, (int, torch.SymInt)): + writer.symint(placeholder, arg) + elif isinstance(arg, torch.Tensor): + # TODO: improve these names with FQN + writer.tensor(placeholder, arg) + else: + raise TypeError(f"arg is neither SymInt/int nor torch.Tensor, {arg}") + load_args = "\n".join(writer.lines()) + + return textwrap.dedent( + f""" +from math import inf +import torch +from torch import tensor, device +import torch.fx as fx +import torch._dynamo +from torch._dynamo.testing import rand_strided +from torch._dynamo.debug_utils import run_fwd_maybe_bwd + +{generate_config_string(stable_output=stable_output)} + +{extra_imports} + +{model_str} +mod = Repro() + +{load_args} + +if __name__ == '__main__': + from torch._dynamo.repro.after_dynamo import run_repro + run_repro(mod, load_args, accuracy={check_accuracy!r}, command={command!r}, + save_dir={save_dir!r}, autocast={torch.is_autocast_enabled()!r}, backend={compiler_name!r}) +""" + ) + + +def dump_backend_repro_as_file(gm, args, compiler_name, check_accuracy=False): + """ + Saves the repro to a repro.py file + """ + curdir = os.getcwd() + subdir = os.path.join(os.getcwd(), "checkpoints") + if not os.path.exists(subdir): + os.makedirs(subdir, exist_ok=True) + file_name = os.path.join(subdir, f"minified_{len(gm.graph.nodes)}_nodes.py") + log.warning( + "Writing checkpoint with %s nodes to %s", len(gm.graph.nodes), file_name + ) + + with open(file_name, "w") as fd: + fd.write( + generate_dynamo_fx_repro_string( + gm, args, compiler_name, check_accuracy, save_dir=subdir + ) + ) + latest_repro = os.path.join(curdir, "repro.py") + log.warning("Copying %s to %s for convenience", file_name, latest_repro) + + if use_buck: + BuckTargetWriter(latest_repro).write() + + shutil.copyfile(file_name, latest_repro) + + +def dump_backend_state(gm, args, compiler_name, check_accuracy=False): + """ + Dumps the dynamo graph to repro the issue. + 1) It tries to convert Fx GraphModule to a string. If we can, it writes to a + repro.py file. + 2) If we can't convert Fx GraphModule to a string, we use to_folder to save + the module and save a tar file. + """ + assert NNModuleToString.can_convert_to_string(gm) + return dump_backend_repro_as_file(gm, args, compiler_name, check_accuracy) + # return dump_backend_repro_as_tarfile(gm, args, compiler_name) + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# MINIFIER DUMPER +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + + +def dump_to_minify_after_dynamo(gm, args, compiler_name): + # TODO: factor this out + subdir = os.path.join(minifier_dir(), "checkpoints") + if not os.path.exists(subdir): + os.makedirs(subdir, exist_ok=True) + helper_for_dump_minify( + generate_dynamo_fx_repro_string( + gm, + args, + compiler_name, + check_accuracy=config.repro_level == 4, + save_dir=subdir, + command="minify", + ) + ) + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# MINIFIER BACKENDS +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + + +@register_debug_backend +def dynamo_minifier_backend(gm, example_inputs, compiler_name): + from functorch.compile import minifier + + compiler_fn = lookup_backend(compiler_name) + + # TODO: It's inconsistent to pass SymInt inputs but REAL tensors. 
+ # We should pass ints and look at the GraphModule placeholders + # to resolve them to SymInt (if necessary) + example_inputs = [ + i.node.hint if isinstance(i, torch.SymInt) else i for i in example_inputs + ] + + try: + compiled_gm = compiler_fn(gm, example_inputs) + run_fwd_maybe_bwd(compiled_gm, example_inputs) + raise ValueError("No issue was detected") + except Exception as exc: + orig_failure = str(exc) + log.warning( + "Compiled Fx GraphModule failed. Creating script to minify the error." + ) + dump_state_fn = functools.partial( + dump_backend_state, compiler_name=compiler_name + ) + dump_state_fn(fx.GraphModule(gm, copy.deepcopy(gm.graph)), example_inputs) + fails_fn = functools.partial( + backend_fails, + compiler_fn=compiler_fn, + orig_failure=orig_failure, + ) + minifier( + gm, + example_inputs, + module_fails=fails_fn, + dump_state=dump_state_fn, + ) + return gm + + +@register_debug_backend +def dynamo_accuracy_minifier_backend(gm, example_inputs, compiler_name): + from functorch.compile import minifier + + compiler_fn = lookup_backend(compiler_name) + + # Set the eval mode to remove randomness. + gm.eval() + + # Check Accuracy + if backend_accuracy_fails( + gm, example_inputs, compiler_fn, only_fwd=config.repro_forward_only + ): + log.warning("Accuracy failed for the TorchDynamo produced graph") + dump_state_fn = functools.partial( + dump_backend_state, compiler_name=compiler_name, check_accuracy=True + ) + fails_fn = functools.partial( + backend_accuracy_fails, + compiler_fn=compiler_fn, + only_fwd=config.repro_forward_only, + ) + dump_state_fn(fx.GraphModule(gm, copy.deepcopy(gm.graph)), example_inputs) + minifier( + gm, + example_inputs, + module_fails=fails_fn, + dump_state=dump_state_fn, + ) + else: + log.error("Input graph does not fail accuracy testing") + return gm + + +def backend_fails(gm, example_inputs, compiler_fn, orig_failure): + """ + Minifier uses this function to identify if the minified graph module fails + with the same error. + + One caveat is that minifier can potentially go into a wrong direction when + the resulting graph module fails for a different reason. To avoid this, we + save the string for the original exception and check similarity between new + and old exception. They can be somewhat different in some cases, when the + exception string depends on the failing node information. So, we have a + loose similarity metric to guide the minifier path. + """ + from difflib import SequenceMatcher + + try: + # Run the original gm to check eager validity + run_fwd_maybe_bwd(gm, clone_inputs_retaining_gradness(example_inputs)) + compiled_gm = compiler_fn(gm, example_inputs) + run_fwd_maybe_bwd(compiled_gm, clone_inputs_retaining_gradness(example_inputs)) + return False + except Exception as e: + new_failure = str(e) + if SequenceMatcher(None, orig_failure, new_failure).ratio() > 0.5: + return True + return False + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# REPRO MAIN +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + + +def run_load_args(options, mod, load_args): + if not hasattr(load_args, "_version"): + log.warning( + "load_args does not have a _version attribute, please file a bug to PyTorch " + "and describe how you generate this repro script" + ) + else: + if load_args._version > 0: + log.warning( + "load_args is version %s, but this version of PyTorch only supports " + "version 0. 
We will try to run it anyway but there may be an incompatibility; " + "if so, try upgrading your version of PyTorch.", + load_args._version, + ) + + nop_reader = NopInputReader() + load_args(nop_reader) + + with tqdm(desc="Loading inputs", total=nop_reader.total) as pbar: + input_reader = InputReader(save_dir=options.save_dir, pbar=pbar) + load_args(input_reader) + args = input_reader.args + + return args + + +def repro_minify(options, mod, load_args): + args = run_load_args(options, mod, load_args) + + # Setup debug minifier compiler + if not options.accuracy: + compiler_fn = lookup_backend("dynamo_minifier_backend") + else: + compiler_fn = lookup_backend("dynamo_accuracy_minifier_backend") + + if options.backend is None: + raise RuntimeError( + "Compiler name is None - this likely means that a custom compiler " + "was called by torchdynamo. Please remove this error, import your " + "custom compiler function, and replace the backend=None " + "line in run_repro to backend=" + ) + + dynamo_minifier_backend = functools.partial( + compiler_fn, + compiler_name=options.backend, + ) + opt_mod = torch._dynamo.optimize(dynamo_minifier_backend)(mod) + + with torch.cuda.amp.autocast(enabled=options.autocast): + opt_mod(*args) + + +def repro_run(options, mod, load_args): + opt_mod = torch._dynamo.optimize(options.backend)(mod) + + if options.accuracy != "": + mod.eval() + opt_mod.eval() + + with torch.cuda.amp.autocast(enabled=options.autocast): + # TODO: disable clone + args = run_load_args(options, mod, load_args) + assert same_two_models(mod, mod, args), "Eager itself failed" + if not same_two_models(mod, opt_mod, args): + raise AccuracyError("Dynamo failed") + else: + with torch.cuda.amp.autocast(enabled=options.autocast): + args = run_load_args(options, mod, load_args) + ref = run_fwd_maybe_bwd( + mod, args, only_fwd=options.only_fwd, disable_clone=True + ) + del args + + args = run_load_args(options, mod, load_args) + res = run_fwd_maybe_bwd( + opt_mod, args, only_fwd=options.only_fwd, disable_clone=True + ) + + +def run_repro( + mod, + load_args, + *, + command="run", + accuracy: Union[bool, str] = "", + save_dir=None, + autocast=False, + backend="inductor", + **kwargs, +): + for k in kwargs: + log.warning( + "Unrecognized kwarg %s; perhaps this repro was made on a newer version of PyTorch", + k, + ) + + if accuracy is True: + accuracy = "accuracy" + elif accuracy is False: + accuracy = "" + + parser = argparse.ArgumentParser( + description=f"""\ +An after_dynamo repro script, typically triggering a bug in Dynamo or +AOTAutograd. When run with no arguments, this script defaults to running +'{command}'. Extra flags may be available; to find out more, try '{command} +--help'. There are also alternate subcommands available, see below. 
+ +default settings on this script: + {accuracy=} + {save_dir=} +""", + formatter_class=argparse.RawTextHelpFormatter, + ) + + def common_flags(parser): + accuracy_group = parser.add_mutually_exclusive_group() + accuracy_group.add_argument( + "--no-accuracy", + dest="accuracy", + action="store_const", + const="", + default=accuracy, + help="do not test accuracy, just run the module and see if it errors", + ) + accuracy_group.add_argument( + "--accuracy", + action="store_const", + const="accuracy", + default=accuracy, + help="test accuracy", + ) + parser.add_argument( + "--save-dir", + type=str, + default=save_dir, + metavar="DIR", + help="directory where saved inputs live", + ) + parser.add_argument( + "--no-save-dir", + dest="save_dir", + action="store_const", + const=None, + help="don't use any directory for saved inputs", + ) + parser.add_argument( + "--no-isolate", + dest="isolate", + action="store_false", + default=False, + help="no isolate (doesn't do anything for after_dynamo)", + ) + parser.add_argument( + "--autocast", + default=autocast, + action="store_true", + help="use torch.cuda.amp.autocast", + ) + parser.add_argument( + "--no-autocast", + dest="autocast", + action="store_false", + help="don't use torch.cuda.amp.autocast", + ) + parser.add_argument( + "--backend", + type=str, + default=backend, + metavar="BACKEND", + help="torch.compile backend to use", + ) + + subparsers = parser.add_subparsers( + dest="command", metavar="{run,minify}", required=True + ) + + parser_run = subparsers.add_parser( + "run", + help="just run the repro", + ) + common_flags(parser_run) + parser_run.add_argument( + "--only-fwd", + action="store_true", + help="don't run backwards compilation for testing", + ) + + parser_minify = subparsers.add_parser( + "minify", help="run the minifier on the repro" + ) + common_flags(parser_minify) + + args = None + if len(sys.argv) <= 1: + args = [command, *sys.argv[1:]] + + options = parser.parse_args(args) + COMMAND_FNS = { + "minify": repro_minify, + "run": repro_run, + } + COMMAND_FNS[options.command](options, mod, load_args) diff --git a/MLPY/Lib/site-packages/torch/_dynamo/resume_execution.py b/MLPY/Lib/site-packages/torch/_dynamo/resume_execution.py new file mode 100644 index 0000000000000000000000000000000000000000..2df133548254a35808b5c54b4f58fc05050fdd8c --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/resume_execution.py @@ -0,0 +1,648 @@ +import copy +import dataclasses +import sys +import types +from typing import Any, cast, Dict, List, Optional, Tuple + +from .bytecode_transformation import ( + create_call_function, + create_call_method, + create_dup_top, + create_instruction, + create_jump_absolute, + Instruction, + InstructionExnTabEntry, + transform_code_object, + unique_id, +) +from .utils import ExactWeakKeyDictionary + +# taken from code.h in cpython +CO_OPTIMIZED = 0x0001 +CO_NEWLOCALS = 0x0002 +CO_VARARGS = 0x0004 +CO_VARKEYWORDS = 0x0008 +CO_NESTED = 0x0010 +CO_GENERATOR = 0x0020 +CO_NOFREE = 0x0040 +CO_COROUTINE = 0x0080 +CO_ITERABLE_COROUTINE = 0x0100 +CO_ASYNC_GENERATOR = 0x0200 + + +@dataclasses.dataclass(frozen=True) +class ReenterWith: + stack_index: int + target_values: Optional[Tuple[Any, ...]] = None + + # If we do not want to destroy the stack, we can do the same thing as a + # `SETUP_WITH` block, only that we store the context manager in a local_symbol + def try_except(self, code_options, cleanup: List[Instruction]): + """ + Codegen based off of: + load args + enter context + try: + (rest) + finally: + exit context + """ + 
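+        # Illustrative sketch only (placeholder names, not the exact generated
+        # locals): the prefix emitted here plus the cleanup appended below behave
+        # roughly like the following Python, where `ctx_factory` stands for the
+        # context-manager callable already on the stack:
+        #
+        #     ___context_manager_N = ctx_factory(*target_values)
+        #     ___context_manager_N.__enter__()
+        #     try:
+        #         ...  # rest of the resumed frame
+        #     finally:
+        #         ___context_manager_N.__exit__(None, None, None)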
load_args = [] + if self.target_values: + load_args = [ + create_instruction("LOAD_CONST", argval=val) + for val in self.target_values + ] + ctx_name = unique_id(f"___context_manager_{self.stack_index}") + if ctx_name not in code_options["co_varnames"]: + code_options["co_varnames"] += (ctx_name,) + for name in ["__enter__", "__exit__"]: + if name not in code_options["co_names"]: + code_options["co_names"] += (name,) + + except_jump_target = create_instruction( + "NOP" if sys.version_info < (3, 11) else "PUSH_EXC_INFO" + ) + cleanup_complete_jump_target = create_instruction("NOP") + + setup_finally = [ + *load_args, + *create_call_function(len(load_args), True), + create_instruction("STORE_FAST", argval=ctx_name), + create_instruction("LOAD_FAST", argval=ctx_name), + create_instruction("LOAD_METHOD", argval="__enter__"), + *create_call_method(0), + create_instruction("POP_TOP"), + ] + + if sys.version_info < (3, 11): + setup_finally.append( + create_instruction("SETUP_FINALLY", target=except_jump_target) + ) + else: + exn_tab_begin = create_instruction("NOP") + exn_tab_end = create_instruction("NOP") + exn_tab_begin.exn_tab_entry = InstructionExnTabEntry( + exn_tab_begin, + exn_tab_end, + except_jump_target, + self.stack_index + 1, + False, + ) + setup_finally.append(exn_tab_begin) + + def create_reset(): + return [ + create_instruction("LOAD_FAST", argval=ctx_name), + create_instruction("LOAD_METHOD", argval="__exit__"), + create_instruction("LOAD_CONST", argval=None), + create_dup_top(), + create_dup_top(), + *create_call_method(3), + create_instruction("POP_TOP"), + ] + + if sys.version_info < (3, 9): + epilogue = [ + create_instruction("POP_BLOCK"), + create_instruction("BEGIN_FINALLY"), + except_jump_target, + *create_reset(), + create_instruction("END_FINALLY"), + ] + elif sys.version_info < (3, 11): + epilogue = [ + create_instruction("POP_BLOCK"), + *create_reset(), + create_instruction("JUMP_FORWARD", target=cleanup_complete_jump_target), + except_jump_target, + *create_reset(), + create_instruction("RERAISE"), + cleanup_complete_jump_target, + ] + else: + finally_exn_tab_end = create_instruction("RERAISE", arg=0) + finally_exn_tab_target = create_instruction("COPY", arg=3) + except_jump_target.exn_tab_entry = InstructionExnTabEntry( + except_jump_target, + finally_exn_tab_end, + finally_exn_tab_target, + self.stack_index + 2, + True, + ) + epilogue = [ + exn_tab_end, + *create_reset(), + create_instruction("JUMP_FORWARD", target=cleanup_complete_jump_target), + except_jump_target, # PUSH_EXC_INFO + *create_reset(), + finally_exn_tab_end, # RERAISE 0 + finally_exn_tab_target, # COPY 3 + create_instruction("POP_EXCEPT"), + create_instruction("RERAISE", arg=1), + cleanup_complete_jump_target, + ] + + cleanup[:] = epilogue + cleanup + return setup_finally + + def __call__(self, code_options, cleanup): + """ + Codegen based off of: + with ctx(args): + (rest) + """ + load_args = [] + if self.target_values: + load_args = [ + create_instruction("LOAD_CONST", argval=val) + for val in self.target_values + ] + if sys.version_info < (3, 9): + with_cleanup_start = create_instruction("WITH_CLEANUP_START") + begin_finally = create_instruction("BEGIN_FINALLY") + cleanup[:] = [ + create_instruction("POP_BLOCK"), + begin_finally, + with_cleanup_start, + create_instruction("WITH_CLEANUP_FINISH"), + create_instruction("END_FINALLY"), + ] + cleanup + + return [ + *load_args, + create_instruction("CALL_FUNCTION", arg=len(load_args)), + create_instruction("SETUP_WITH", target=with_cleanup_start), + 
create_instruction("POP_TOP"), + ], None + elif sys.version_info < (3, 11): + with_except_start = create_instruction("WITH_EXCEPT_START") + pop_top_after_with_except_start = create_instruction("POP_TOP") + + cleanup_complete_jump_target = create_instruction("NOP") + + cleanup[:] = [ + create_instruction("POP_BLOCK"), + create_instruction("LOAD_CONST", argval=None), + create_instruction("DUP_TOP"), + create_instruction("DUP_TOP"), + create_instruction("CALL_FUNCTION", arg=3), + create_instruction("POP_TOP"), + create_instruction("JUMP_FORWARD", target=cleanup_complete_jump_target), + with_except_start, + create_instruction( + "POP_JUMP_IF_TRUE", target=pop_top_after_with_except_start + ), + create_instruction("RERAISE"), + pop_top_after_with_except_start, + create_instruction("POP_TOP"), + create_instruction("POP_TOP"), + create_instruction("POP_EXCEPT"), + create_instruction("POP_TOP"), + cleanup_complete_jump_target, + ] + cleanup + + return [ + *load_args, + create_instruction("CALL_FUNCTION", arg=len(load_args)), + create_instruction("SETUP_WITH", target=with_except_start), + create_instruction("POP_TOP"), + ], None + else: + pop_top_after_with_except_start = create_instruction("POP_TOP") + cleanup_complete_jump_target = create_instruction("NOP") + + def create_load_none(): + return create_instruction("LOAD_CONST", argval=None) + + exn_tab_1_begin = create_instruction("POP_TOP") + exn_tab_1_end = create_instruction("NOP") + exn_tab_1_target = create_instruction("PUSH_EXC_INFO") + exn_tab_2_end = create_instruction("RERAISE", arg=2) + exn_tab_2_target = create_instruction("COPY", arg=3) + + exn_tab_1_begin.exn_tab_entry = InstructionExnTabEntry( + exn_tab_1_begin, + exn_tab_1_end, + exn_tab_1_target, + self.stack_index + 1, + True, + ) + exn_tab_1_target.exn_tab_entry = InstructionExnTabEntry( + exn_tab_1_target, + exn_tab_2_end, + exn_tab_2_target, + self.stack_index + 3, + True, + ) + pop_top_after_with_except_start.exn_tab_entry = InstructionExnTabEntry( + pop_top_after_with_except_start, + pop_top_after_with_except_start, + exn_tab_2_target, + self.stack_index + 3, + True, + ) + + cleanup[:] = [ + exn_tab_1_end, + create_load_none(), + create_load_none(), + create_load_none(), + *create_call_function(2, False), + create_instruction("POP_TOP"), + create_instruction("JUMP_FORWARD", target=cleanup_complete_jump_target), + exn_tab_1_target, # PUSH_EXC_INFO + create_instruction("WITH_EXCEPT_START"), + create_instruction( + "POP_JUMP_FORWARD_IF_TRUE", + target=pop_top_after_with_except_start, + ), + exn_tab_2_end, # RERAISE 2 + exn_tab_2_target, # COPY 3 + create_instruction("POP_EXCEPT"), + create_instruction("RERAISE", arg=1), + pop_top_after_with_except_start, + create_instruction("POP_EXCEPT"), + create_instruction("POP_TOP"), + create_instruction("POP_TOP"), + cleanup_complete_jump_target, + ] + cleanup + + return [ + *load_args, + *create_call_function(len(load_args), True), + create_instruction("BEFORE_WITH"), + exn_tab_1_begin, # POP_TOP + ], exn_tab_1_target + + +@dataclasses.dataclass +class ResumeFunctionMetadata: + code: types.CodeType + instructions: List[Instruction] = dataclasses.field(default_factory=list) + # Python 3.11+ fields + # NOTE: Python 3.11 removed blocks, but for our purposes, a "block" consists + # of instructions of all exception table entries that have the same target. 
+ + # map from PUSH_EXC_INFO's in the prefix to original block target offset + prefix_block_target_offset_remap: List[int] = dataclasses.field( + default_factory=list + ) + # map from new block target offsets to original block target offsets + block_target_offset_remap: Optional[Dict[int, int]] = None + + +def _filter_iter(l1, l2, cond): + """ + Two-pointer conditional filter. + e.g. _filter_iter(insts, sorted_offsets, lambda i, o: i.offset == o) + returns the instructions with offsets in sorted_offsets + """ + it = iter(l2) + res = [] + try: + cur = next(it) + for val in l1: + if cond(val, cur): + res.append(val) + cur = next(it) + except StopIteration: + pass + return res + + +class ContinueExecutionCache: + cache = ExactWeakKeyDictionary() + generated_code_metadata = ExactWeakKeyDictionary() + + @classmethod + def lookup(cls, code, lineno, *key): + if code not in cls.cache: + cls.cache[code] = dict() + key = tuple(key) + if key not in cls.cache[code]: + cls.cache[code][key] = cls.generate(code, lineno, *key) + return cls.cache[code][key] + + @classmethod + def generate( + cls, + code, + lineno, + offset: int, + setup_fn_target_offsets: Tuple[int], # only used in Python 3.11+ + nstack: int, + argnames: Tuple[str], + setup_fns: Tuple[ReenterWith], + null_idxes: Tuple[int], + ) -> types.CodeType: + assert offset is not None + assert not ( + code.co_flags + & (CO_GENERATOR | CO_COROUTINE | CO_ITERABLE_COROUTINE | CO_ASYNC_GENERATOR) + ) + assert code.co_flags & CO_OPTIMIZED + if code in ContinueExecutionCache.generated_code_metadata: + return cls.generate_based_on_original_code_object( + code, + lineno, + offset, + setup_fn_target_offsets, + nstack, + argnames, + setup_fns, + null_idxes, + ) + + is_py311_plus = sys.version_info >= (3, 11) + meta = ResumeFunctionMetadata(code) + + def update(instructions: List[Instruction], code_options: Dict[str, Any]): + meta.instructions = copy.deepcopy(instructions) + + args = [f"___stack{i}" for i in range(nstack)] + args.extend(v for v in argnames if v not in args) + freevars = tuple(code_options["co_cellvars"] or []) + tuple( + code_options["co_freevars"] or [] + ) + code_options[ + "co_name" + ] = f"torch_dynamo_resume_in_{code_options['co_name']}_at_{lineno}" + if is_py311_plus: + qualified_path = code_options["co_qualname"].rsplit(".", maxsplit=1) + if len(qualified_path) == 1: + code_options["co_qualname"] = code_options["co_name"] + else: + assert len(qualified_path) == 2 + module_name, co_name = qualified_path + code_options[ + "co_qualname" + ] = f"{module_name}.torch_dynamo_resume_in_{co_name}_at_{lineno}" + code_options["co_firstlineno"] = lineno + code_options["co_cellvars"] = tuple() + code_options["co_freevars"] = freevars + code_options["co_argcount"] = len(args) + code_options["co_posonlyargcount"] = 0 + code_options["co_kwonlyargcount"] = 0 + code_options["co_varnames"] = tuple( + args + [v for v in code_options["co_varnames"] if v not in args] + ) + code_options["co_flags"] = code_options["co_flags"] & ~( + CO_VARARGS | CO_VARKEYWORDS + ) + target = next(i for i in instructions if i.offset == offset) + + prefix = [] + if is_py311_plus: + if freevars: + prefix.append( + create_instruction("COPY_FREE_VARS", arg=len(freevars)) + ) + prefix.append(create_instruction("RESUME", arg=0)) + + cleanup: List[Instruction] = [] + hooks = {fn.stack_index: fn for fn in setup_fns} + hook_target_offsets = { + fn.stack_index: setup_fn_target_offsets[i] + for i, fn in enumerate(setup_fns) + } + offset_to_inst = {inst.offset: inst for inst in 
instructions} + # map old hook targets to new targets generated by the hook + old_hook_target_remap = {} + null_idxes_i = 0 + for i in range(nstack): + while ( + null_idxes_i < len(null_idxes) + and null_idxes[null_idxes_i] == i + null_idxes_i + ): + prefix.append(create_instruction("PUSH_NULL")) + null_idxes_i += 1 + prefix.append(create_instruction("LOAD_FAST", argval=f"___stack{i}")) + if i in hooks: + hook = hooks.pop(i) + hook_insts, exn_target = hook(code_options, cleanup) + prefix.extend(hook_insts) + if is_py311_plus: + hook_target_offset = hook_target_offsets.pop(i) + old_hook_target = offset_to_inst[hook_target_offset] + meta.prefix_block_target_offset_remap.append(hook_target_offset) + old_hook_target_remap[old_hook_target] = exn_target + if is_py311_plus: + # reverse the mapping since targets of later/nested contexts are inserted + # into the mapping later, but show up earlier in the prefix. + meta.prefix_block_target_offset_remap = list( + reversed(meta.prefix_block_target_offset_remap) + ) + + assert not hooks + + prefix.append(create_jump_absolute(target)) + + # because the line number table monotonically increases from co_firstlineno + # remove starts_line for any instructions before the graph break instruction + # this will ensure the instructions after the break have the correct line numbers + for inst in instructions: + if inst.offset == target.offset: + break + inst.starts_line = None + if sys.version_info >= (3, 11): + inst.positions = None + + if cleanup: + prefix.extend(cleanup) + prefix.extend(cls.unreachable_codes(code_options)) + + # remap original instructions' exception table entries + if old_hook_target_remap: + assert is_py311_plus + for inst in instructions: + if ( + inst.exn_tab_entry + and inst.exn_tab_entry.target in old_hook_target_remap + ): + inst.exn_tab_entry.target = old_hook_target_remap[ + inst.exn_tab_entry.target + ] + + # TODO(jansel): add dead code elimination here + instructions[:] = prefix + instructions + + new_code = transform_code_object(code, update) + ContinueExecutionCache.generated_code_metadata[new_code] = meta + return new_code + + @staticmethod + def unreachable_codes(code_options) -> List[Instruction]: + """Codegen a `raise None` to make analysis work for unreachable code""" + return [ + create_instruction("LOAD_CONST", argval=None), + create_instruction("RAISE_VARARGS", arg=1), + ] + + @classmethod + def generate_based_on_original_code_object( + cls, code, lineno, offset: int, setup_fn_target_offsets: Tuple[int, ...], *args + ): + """ + This handles the case of generating a resume into code generated + to resume something else. We want to always generate starting + from the original code object so that if control flow paths + converge we only generated 1 resume function (rather than 2^n + resume functions). 
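+
+        For example (illustrative): if the original frame F breaks at offset A, and
+        the resume function generated for A later breaks at offset B, we translate
+        B back into an offset in F's code object and generate the second resume
+        function directly from F, rather than stacking it on the generated code.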
+ """ + + meta: ResumeFunctionMetadata = ContinueExecutionCache.generated_code_metadata[ + code + ] + new_offset = None + + def find_new_offset( + instructions: List[Instruction], code_options: Dict[str, Any] + ): + nonlocal new_offset + (target,) = (i for i in instructions if i.offset == offset) + # match the functions starting at the last instruction as we have added a prefix + (new_target,) = ( + i2 + for i1, i2 in zip(reversed(instructions), reversed(meta.instructions)) + if i1 is target + ) + assert target.opcode == new_target.opcode + new_offset = new_target.offset + + transform_code_object(code, find_new_offset) + + if sys.version_info >= (3, 11): + # setup_fn_target_offsets currently contains the target offset of + # each setup_fn, based on `code`. When we codegen the resume function + # based on the original code object, `meta.code`, the offsets in + # setup_fn_target_offsets must be based on `meta.code` instead. + if not meta.block_target_offset_remap: + block_target_offset_remap = meta.block_target_offset_remap = {} + + def remap_block_offsets( + instructions: List[Instruction], code_options: Dict[str, Any] + ): + # NOTE: each prefix block generates exactly one PUSH_EXC_INFO, + # so we can tell which block a prefix PUSH_EXC_INFO belongs to, + # by counting. Then we can use meta.prefix_block-target_offset_remap + # to determine where in the original code the PUSH_EXC_INFO offset + # replaced. + prefix_blocks: List[Instruction] = [] + for inst in instructions: + if len(prefix_blocks) == len( + meta.prefix_block_target_offset_remap + ): + break + if inst.opname == "PUSH_EXC_INFO": + prefix_blocks.append(inst) + + # offsets into prefix + for inst, o in zip( + prefix_blocks, meta.prefix_block_target_offset_remap + ): + block_target_offset_remap[cast(int, inst.offset)] = o + + # old bytecode targets are after the prefix PUSH_EXC_INFO's + old_start_offset = ( + cast(int, prefix_blocks[-1].offset) if prefix_blocks else -1 + ) + # offsets into old bytecode + old_inst_offsets = sorted( + n for n in setup_fn_target_offsets if n > old_start_offset + ) + targets = _filter_iter( + instructions, old_inst_offsets, lambda inst, o: inst.offset == o + ) + new_targets = _filter_iter( + zip(reversed(instructions), reversed(meta.instructions)), + targets, + lambda v1, v2: v1[0] is v2, + ) + for new, old in zip(new_targets, targets): + block_target_offset_remap[old.offset] = new[1].offset + + transform_code_object(code, remap_block_offsets) + + # if offset is not in setup_fn_target_offsets, it is an error + setup_fn_target_offsets = tuple( + meta.block_target_offset_remap[n] for n in setup_fn_target_offsets + ) + return ContinueExecutionCache.lookup( + meta.code, lineno, new_offset, setup_fn_target_offsets, *args + ) + + +""" +# partially finished support for with statements + +def convert_locals_to_cells( + instructions: List[Instruction], + code_options: Dict[str, Any]): + + code_options["co_cellvars"] = tuple( + var + for var in code_options["co_varnames"] + if var not in code_options["co_freevars"] + and not var.startswith("___stack") + ) + cell_and_free = code_options["co_cellvars"] + code_options["co_freevars"] + for inst in instructions: + if str(inst.argval).startswith("___stack"): + continue + elif inst.opname == "LOAD_FAST": + inst.opname = "LOAD_DEREF" + elif inst.opname == "STORE_FAST": + inst.opname = "STORE_DEREF" + elif inst.opname == "DELETE_FAST": + inst.opname = "DELETE_DEREF" + else: + continue + inst.opcode = dis.opmap[inst.opname] + assert inst.argval in cell_and_free, inst.argval 
+ inst.arg = cell_and_free.index(inst.argval) + +def patch_setup_with( + instructions: List[Instruction], + code_options: Dict[str, Any] +): + nonlocal need_skip + need_skip = True + target_index = next( + idx for idx, i in enumerate(instructions) if i.offset == offset + ) + assert instructions[target_index].opname == "SETUP_WITH" + convert_locals_to_cells(instructions, code_options) + + stack_depth_before = nstack + stack_effect(instructions[target_index].opcode, + instructions[target_index].arg) + + inside_with = [] + inside_with_resume_at = None + stack_depth = stack_depth_before + idx = target_index + 1 + for idx in range(idx, len(instructions)): + inst = instructions[idx] + if inst.opname == "BEGIN_FINALLY": + inside_with_resume_at = inst + break + elif inst.target is not None: + unimplemented("jump from with not supported") + elif inst.opname in ("BEGIN_FINALLY", "WITH_CLEANUP_START", "WITH_CLEANUP_FINISH", "END_FINALLY", + "POP_FINALLY", "POP_EXCEPT", + "POP_BLOCK", "END_ASYNC_FOR"): + unimplemented("block ops not supported") + inside_with.append(inst) + stack_depth += stack_effect(inst.opcode, inst.arg) + assert inside_with_resume_at + + instructions = [ + create_instruction("LOAD_FAST", f"___stack{i}") for i in range(nstack) + ] + [ + create_instruction("SETUP_WITH", target=instructions[target_index].target) + ... call the function ... + unpack_tuple + ] + [ + create_instruction("JUMP_ABSOLUTE", target=inside_with_resume_at) + ] +""" diff --git a/MLPY/Lib/site-packages/torch/_dynamo/side_effects.py b/MLPY/Lib/site-packages/torch/_dynamo/side_effects.py new file mode 100644 index 0000000000000000000000000000000000000000..f1de34e052142d61a91cb0c80dd92d58da99b85d --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/side_effects.py @@ -0,0 +1,542 @@ +import inspect +from typing import Any, Dict, List, Optional, Union + +import torch.nn + +from . import utils, variables +from .bytecode_transformation import ( + create_call_function, + create_call_method, + create_instruction, +) +from .codegen import PyCodegen +from .exc import unimplemented +from .source import LocalSource, Source +from .utils import nn_module_new, object_new +from .variables.base import ( + is_side_effect_safe, + MutableLocalBase, + MutableLocalSource, + VariableTracker, +) + + +class MutableSideEffects(MutableLocalBase): + """ + VariableTracker.mutable_local marker to indicate a list passed as + an input that if we mutate we need to re-apply those mutations after + the graph runs. + """ + + def __init__(self, source: Source, is_modified: bool = False): + super().__init__(MutableLocalSource.Existing) + self.source = source + self.is_modified = is_modified + + +class AttributeMutation(MutableLocalBase): + """ + VariableTracker.mutable_local marker to track changes to attributes + """ + + def __init__(self, typ: MutableLocalSource, source: Optional[Source]): + super().__init__(typ) + self.source = source + + +class AttributeMutationExisting(AttributeMutation): + def __init__(self, source: Source): + super().__init__(MutableLocalSource.Existing, source) + self.source = source + + +class AttributeMutationNew(AttributeMutation): + def __init__(self, source: Optional[Source], cls_source: Optional[Source]): + super().__init__(MutableLocalSource.Local, source) + self.cls_source = cls_source + + +class SideEffects: + """ + Track side effects (list mutation, setattr, etc) that need to be + applied after an FX graph is run. 
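+
+    For example (illustrative): if traced code calls `lst.append(x)` on a list that
+    existed before the frame was traced, the FX graph itself stays functional and
+    the mutation is re-applied to the real list by the generated bytecode after the
+    graph call (see codegen_update_mutated below).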
+ """ + + id_to_variable: Dict[int, VariableTracker] + store_attr_mutations: Dict[MutableLocalBase, Dict[str, VariableTracker]] + keepalive: List[Any] + + def __init__( + self, + id_to_variable=None, + store_attr_mutations=None, + keepalive=None, + save_for_backward=None, + tensor_hooks=None, + ): + super().__init__() + self.id_to_variable = id_to_variable or {} + self.store_attr_mutations = store_attr_mutations or {} + self.keepalive = keepalive or [] + self.save_for_backward = save_for_backward or [] + self.tensor_hooks = tensor_hooks or {} + + def __eq__(self, other: object) -> bool: + assert isinstance(other, SideEffects) + # NB: do NOT test keepalive + return ( + self.id_to_variable == other.id_to_variable + and self.store_attr_mutations == other.store_attr_mutations + and self.save_for_backward == other.save_for_backward + and self.tensor_hooks == other.tensor_hooks + ) + + def diff(self, other: "SideEffects") -> Optional[str]: + if self.id_to_variable != other.id_to_variable: + sk_itv = self.id_to_variable.keys() + ok_itv = other.id_to_variable.keys() + if sk_itv != ok_itv: + return f"id_to_variable keys: {sk_itv} != {ok_itv}" + # Feel free to augment this with more fancy diffing logic + # if needed for debugging + return "id_to_variable: unknown diff" + elif self.store_attr_mutations != other.store_attr_mutations: + sk_sam = self.store_attr_mutations.keys() + ok_sam = other.store_attr_mutations.keys() + if sk_sam != ok_sam: + return f"store_attr_mutations keys: {sk_sam} != {ok_sam}" + return "store_attr_mutations: unknown diff" + elif self.save_for_backward != other.save_for_backward: + return "save_for_backward" + elif self.tensor_hooks != other.tensor_hooks: + return "tensor_hooks" + else: + return None + + def clone(self): + """Create a shallow copy""" + return self.__class__( + id_to_variable=dict(self.id_to_variable), + store_attr_mutations={ + k: dict(v) for k, v in self.store_attr_mutations.items() + }, + keepalive=list(self.keepalive), + save_for_backward=self.save_for_backward, + tensor_hooks=self.tensor_hooks, + ) + + def apply(self, fn, cache=None, skip_fn=lambda _: False): + if cache is None: + cache = dict() + + self.id_to_variable = { + k: VariableTracker.apply(fn, v, cache, skip_fn) + for k, v in self.id_to_variable.items() + } + self.store_attr_mutations = { + k: VariableTracker.apply(fn, v, cache, skip_fn) + for k, v in self.store_attr_mutations.items() + } + self.save_for_backward = VariableTracker.apply( + fn, self.save_for_backward, cache, skip_fn + ) + self.tensor_hooks = VariableTracker.apply(fn, self.tensor_hooks, cache, skip_fn) + + def __contains__(self, item): + return id(item) in self.id_to_variable + + def __getitem__(self, item): + return self.id_to_variable[id(item)] + + def check_allowed_side_effect(self, item): + from torch._dynamo.variables.misc import AutogradFunctionContextVariable + + # People do things like self.dim = dim inside autograd.Function. + # These are benign. 
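+        # e.g. (illustrative sketch):
+        #     class MyFn(torch.autograd.Function):
+        #         @staticmethod
+        #         def forward(ctx, x, dim):
+        #             ctx.dim = dim  # benign attribute mutation on the context
+        #             return x.sum(dim)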
+ if isinstance(item, AutogradFunctionContextVariable): + return True + if not is_side_effect_safe(item.mutable_local): + unimplemented( + "HigherOrderOperator: Mutating a variable not in the current scope (SideEffects)" + ) + + def store_attr(self, item: VariableTracker, name: str, value: VariableTracker): + assert self.is_attribute_mutation(item) + self.check_allowed_side_effect(item) + if item.mutable_local not in self.store_attr_mutations: + self.store_attr_mutations[item.mutable_local] = {} + self.store_attr_mutations[item.mutable_local][name] = value + + def load_attr(self, item, name, deleted_ok=False): + assert self.is_attribute_mutation(item) + result = self.store_attr_mutations[item.mutable_local][name] + if not deleted_ok and isinstance(result, variables.DeletedVariable): + unimplemented("read deleted attribute") + return result + + def store_cell(self, cellvar, value): + assert isinstance(cellvar, variables.NewCellVariable) + assert isinstance(value, variables.VariableTracker) + self.store_attr(cellvar, "cell_contents", value) + + def load_cell(self, cellvar): + assert isinstance(cellvar, variables.NewCellVariable) + return self.load_attr(cellvar, "cell_contents") + + def load_global(self, gvar: VariableTracker, name: str): + assert isinstance(gvar, variables.VariableTracker) + return self.load_attr(gvar, name) + + def store_global(self, gvar: VariableTracker, name: str, value: VariableTracker): + assert isinstance(gvar, variables.VariableTracker) + assert isinstance(value, variables.VariableTracker) + self.store_attr(gvar, name, value) + + @staticmethod + def cls_supports_mutation_side_effects(cls): + return inspect.getattr_static(cls, "__setattr__", None) in ( + object.__setattr__, + torch.nn.Module.__setattr__, + ) + + def is_attribute_mutation(self, item): + return isinstance(item.mutable_local, AttributeMutation) + + def has_pending_mutation(self, item): + return self.is_attribute_mutation(item) and bool( + self.store_attr_mutations.get(item.mutable_local) + ) + + def is_modified(self, item): + if isinstance(item.mutable_local, AttributeMutationNew): + return True + if self.is_attribute_mutation(item): + return item.mutable_local in self.store_attr_mutations + return item.mutable_local.is_modified + + def _track_obj( + self, + item: Any, + variable: VariableTracker, + mutable_cls=MutableSideEffects, + ): + """Start tracking a new variable for mutation""" + assert variable.source is not None + variable.mutable_local = mutable_cls(variable.source) + self.id_to_variable[id(item)] = variable + self.keepalive.append(item) + return variable + + track_mutable = _track_obj + + def track_object_existing( + self, + item: Any, + variable: VariableTracker, + ): + return self._track_obj(item, variable, mutable_cls=AttributeMutationExisting) + + def track_object_new( + self, + cls_source: Source, + user_cls: Any, + variable_cls: Any, + options, + ): + if user_cls is torch.autograd.function.FunctionCtx: + obj = torch.autograd.Function() + elif issubclass(user_cls, torch.nn.Module): + obj = nn_module_new(user_cls) + else: + obj = object_new(user_cls) + variable = variable_cls( + obj, + mutable_local=AttributeMutationNew(None, cls_source), + **options, + ) + self.id_to_variable[id(obj)] = variable + self.keepalive.append(obj) + return variable + + def track_cell_new( + self, + ): + obj = object() + variable = variables.NewCellVariable( + mutable_local=AttributeMutationNew(None, None), + ) + self.id_to_variable[id(obj)] = variable + self.keepalive.append(obj) + return variable + + def 
track_cell_existing(self, source: Source, item: Any): + variable = variables.NewCellVariable( + mutable_local=AttributeMutationExisting(source), + ) + self.id_to_variable[id(item)] = variable + self.keepalive.append(item) + return variable + + def track_global_existing(self, source: Source, item: Any): + variable = variables.NewGlobalVariable( + mutable_local=AttributeMutationExisting(source), + ) + self.id_to_variable[id(item)] = variable + self.keepalive.append(item) + return variable + + def track_save_for_backward(self, ctx, args): + assert isinstance(ctx, variables.AutogradFunctionContextVariable) + self.save_for_backward.append((ctx, args)) + + def track_tensor_variables_from_runahead_side_effects(self, other): + # In higher order ops we want to keep track of tensors seen in the + # speculate_subgraph so that we don't lift them again as a new input in + # other speculate_subgraph or in the root tracer. + for other_item in other.keepalive: + other_id = id(other_item) + other_variable = other.id_to_variable[other_id] + if other_id not in self.id_to_variable and isinstance( + other_variable, variables.TensorVariable + ): + self.track_object_existing(other_item, other_variable) + + def prune_dead_object_new(self, tx): + live_new_objects = set() + skip_obj = None + + def visit(var: VariableTracker): + if ( + isinstance(var.mutable_local, AttributeMutationNew) + and var.mutable_local is not skip_obj + ): + live_new_objects.add(var.mutable_local) + return var + + def is_live(var: Union[MutableLocalBase, VariableTracker]): + if isinstance(var, AttributeMutationNew): + return var in live_new_objects + if isinstance(var, VariableTracker): + return is_live(var.mutable_local) + return True + + VariableTracker.apply(visit, (tx.stack, tx.symbolic_locals)) + for var in self.id_to_variable.values(): + if not isinstance(var.mutable_local, AttributeMutationNew): + VariableTracker.apply(visit, var) + + for skip_obj, setattrs in self.store_attr_mutations.items(): + VariableTracker.apply(visit, setattrs) + + self.id_to_variable = { + k: v for k, v in self.id_to_variable.items() if is_live(v) + } + self.store_attr_mutations = { + k: v for k, v in self.store_attr_mutations.items() if is_live(k) + } + + def mutation(self, var): + self.check_allowed_side_effect(var) + if isinstance(var.mutable_local, MutableSideEffects): + var.mutable_local = MutableSideEffects(var.mutable_local.source, True) + + def _get_modified_vars(self): + return [var for var in self.id_to_variable.values() if self.is_modified(var)] + + def codegen_save_tempvars(self, cg: PyCodegen): + for var in self._get_modified_vars(): + if isinstance( + var.mutable_local, (AttributeMutationExisting, AttributeMutationNew) + ) and isinstance(var, variables.NewCellVariable): + cg.load_import_from(utils.__name__, "make_cell") + cg.extend_output(create_call_function(0, True)) + cg.add_cache(var) + if isinstance(var.mutable_local, AttributeMutationNew): + var.mutable_local.source = LocalSource(cg.tempvars[var]) # type: ignore[attr-defined] + elif isinstance(var.mutable_local, AttributeMutationNew): + if isinstance(var, variables.AutogradFunctionContextVariable): + unimplemented("AutogradFunctionContextVariable escaped") + if "__call_nn_module_init" in self.store_attr_mutations.get( + var.mutable_local, {} + ): + assert isinstance(var, variables.UnspecializedNNModuleVariable) + cg.load_import_from(utils.__name__, "nn_module_new") + else: + cg.load_import_from(utils.__name__, "object_new") + cg(var.mutable_local.cls_source) + 
cg.extend_output(create_call_function(1, True)) + cg.add_cache(var) + var.mutable_local.source = LocalSource(cg.tempvars[var]) + elif var in cg.tempvars: + assert cg.tempvars.get(var) is None + # subsequent usage should point to the original variable + cg(var.mutable_local.source) + cg.add_cache(var) + + for ctx, args in self.save_for_backward: + cg(ctx.source) + cg.extend_output( + [create_instruction("LOAD_METHOD", argval="save_for_backward")] + ) + for arg in args: + cg(arg) + cg.extend_output( + [ + *create_call_method(len(args)), + create_instruction("POP_TOP"), + ] + ) + + def register_hook(self, tensor, hook, handle, name): + assert isinstance(tensor, variables.TensorVariable) + assert isinstance(hook, variables.VariableTracker) + assert ( + isinstance(handle, variables.RemovableHandleVariable) + and handle.mutable_local + ) + assert hasattr(torch.Tensor, name) + idx = len(self.tensor_hooks.keys()) + # duplicate index possible because of self.remove_hook() + while idx in self.tensor_hooks: + idx += 1 + self.tensor_hooks[idx] = (tensor, hook, handle, name) + assert not handle.idx + handle.idx = idx + + def remove_hook(self, idx): + del self.tensor_hooks[idx] + + def codegen_hooks(self, cg): + for ( + tensor, + hook, + handle, + name, + ) in self.tensor_hooks.values(): + # Note: [On tensor.register_hook] + # + # register_hook on a tensor, AKA backward hooks, have slightly nuanced differences in how they are implemented + # when it comes to hooks on objects with sources (inputs, params) vs objects without sources (intermediaries). + # + # For tensors with a source, we bypass direct inclusion of register_hook calls in the graph. + # Instead, these are tracked and stashed as a global variable, enabling their association with tensors in + # the residuals. During dynamo's frame creation, these hooks are invoked seamlessly on known reconstructible/fetch-able + # tensors. Because a source indicates knowledge of this object outside the torch compile region, and + # because we are running residuals firmly before .backward() can be run, it is sound to invoke + # `register_hook` on a known tensor. + # + # For tensors without a source, we support a limited subset of hooks. Global functions only, and + # compiled_autograd must be enabled or we will graph break. + # + # Handling the Handle: When a user retains the register_hook result in a handle, we intercept the + # STORE_FAST operation to record the user-designated local variable name. This ensures the reconstructed + # bytecode retains this name. If no handle is defined, we simply pop the generated value to keep the + # stack intact. + # + # Dynamo Tensor Hooks Workflow: + # - Functions passed to register_hook are lifted globally. + # - For tensors with sources: + # - In the "side_effects" phase of codegen, we iterate over tensors with hooks to: + # - Generate the tensor. + # - Issue a register_hook call on the tensor, linking to the globally stored function. + # - Incorporate a handle if one was established in the eager phase. + # - For tensors without sources: + # - We don't generate any instructions for registering a hook. + # - Handles from intermediary hooks are NYI. + # - We produce a call function that utilizes the trace_wrapped higher order op, closing over it. + # - We then manually insert the call function above into the graph. + # - The handle's exact user-specified name, "user_code_variable_name", is discerned and associated during STORE_FAST. 
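+            # Illustrative sketch (placeholder names, not the literal generated
+            # bytecode): for an input tensor `x` hooked via `h = x.register_hook(fn)`,
+            # the residual emitted below behaves like
+            #     handle = x.register_hook(lifted_fn)  # lifted_fn was stashed in globals
+            # and `handle` is cached so a user-kept handle reconstructs correctly.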
+ assert tensor.source, "Hooks on non input tensors NYI - should not get here" + cg(tensor) + cg.extend_output([cg.create_load_attr(name)]) + cg(hook) + cg.extend_output(create_call_function(1, True)) + + # Adding the handle to the cache means RemovableHandleVariable().reconstruct() will + # be associated with the return value of register_hook(). This consumes the top of stack. + cg.add_cache(handle) + + def codegen_update_mutated(self, cg: PyCodegen): + suffixes = [] + for var in self._get_modified_vars(): + if isinstance(var, variables.ListVariable): + # old[:] = new + cg(var, allow_cache=False) + cg(var.mutable_local.source) # type: ignore[attr-defined] + cg.extend_output( + [ + cg.create_load_const(None), + cg.create_load_const(None), + create_instruction("BUILD_SLICE", arg=2), + ] + ) + suffixes.append([create_instruction("STORE_SUBSCR")]) + elif isinstance(var, variables.ConstDictVariable): + cg.tx.output.update_co_names("clear") + cg.tx.output.update_co_names("update") + + cg(var.mutable_local.source) # type: ignore[attr-defined] + cg.extend_output([create_instruction("LOAD_METHOD", argval="update")]) + cg(var, allow_cache=False) + + cg(var.mutable_local.source) # type: ignore[attr-defined] + cg.extend_output([create_instruction("LOAD_METHOD", argval="clear")]) + + suffixes.append( + [ + *create_call_method(0), # clear + create_instruction("POP_TOP"), + *create_call_method(1), # update + create_instruction("POP_TOP"), + ] + ) + elif self.is_attribute_mutation(var): + for name, value in self.store_attr_mutations.get( + var.mutable_local, {} + ).items(): + if isinstance(var, variables.NewGlobalVariable): + cg.tx.output.update_co_names(name) + cg(value) + suffixes.append( + [create_instruction("STORE_GLOBAL", argval=name)] + ) + elif name == "__call_nn_module_init": + pass # handled in codegen_save_tempvars + elif isinstance(value, variables.DeletedVariable): + if isinstance( + var.mutable_local, AttributeMutationExisting + ) and hasattr(getattr(var, "value", None), name): + cg.tx.output.update_co_names(name) + cg(var.mutable_local.source) + suffixes.append( + [create_instruction("DELETE_ATTR", argval=name)] + ) + else: + cg.tx.output.update_co_names(name) + cg(value) + cg(var.mutable_local.source) + suffixes.append([create_instruction("STORE_ATTR", argval=name)]) + elif isinstance(var, variables.TupleIteratorVariable): + for _ in range(var.index): + cg.load_import_from(utils.__name__, "iter_next") + cg(var.mutable_local.source) # type: ignore[attr-defined] + cg.extend_output(create_call_function(1, True)) + cg.append_output(create_instruction("POP_TOP")) + else: + raise AssertionError(type(var)) + + # do all the actual mutations at the very end to handle dependencies + for suffix in reversed(suffixes): + cg.extend_output(suffix) + + def is_empty(self): + return not ( + any(map(self.is_modified, self.id_to_variable.values())) + or self.tensor_hooks + or self.save_for_backward + or self.tensor_hooks + ) + + def clear(self): + self.keepalive.clear() + self.id_to_variable.clear() diff --git a/MLPY/Lib/site-packages/torch/_dynamo/source.py b/MLPY/Lib/site-packages/torch/_dynamo/source.py new file mode 100644 index 0000000000000000000000000000000000000000..a6ca75c4eaa1f910ff8d9384074a873072c48c37 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/source.py @@ -0,0 +1,545 @@ +import collections +import dataclasses +import enum +from typing import Any, Optional, Union + +from torch._guards import ChainedSource, GuardSource, Source + +from . 
import utils +from .bytecode_transformation import create_call_function, create_instruction +from .utils import enum_repr + +# It shouldn't be supported to construct an NNModuleVariable inside an FSDP module, +# so those cases are omitted intentionally +_GUARD_SOURCE_NN_MODULE = { + GuardSource.LOCAL: GuardSource.LOCAL_NN_MODULE, + GuardSource.GLOBAL: GuardSource.GLOBAL_NN_MODULE, + GuardSource.LOCAL_NN_MODULE: GuardSource.LOCAL_NN_MODULE, + GuardSource.GLOBAL_NN_MODULE: GuardSource.GLOBAL_NN_MODULE, +} + +_GUARD_SOURCE_FSDP_MODULE = { + GuardSource.LOCAL: GuardSource.LOCAL_FSDP_MODULE, + GuardSource.GLOBAL: GuardSource.GLOBAL_FSDP_MODULE, + GuardSource.LOCAL_NN_MODULE: GuardSource.LOCAL_FSDP_MODULE, + GuardSource.GLOBAL_NN_MODULE: GuardSource.GLOBAL_FSDP_MODULE, + GuardSource.LOCAL_FSDP_MODULE: GuardSource.LOCAL_FSDP_MODULE, + GuardSource.GLOBAL_FSDP_MODULE: GuardSource.GLOBAL_FSDP_MODULE, +} + +_GUARD_SOURCE_NOT_NN_MODULE = { + GuardSource.LOCAL: GuardSource.LOCAL, + GuardSource.GLOBAL: GuardSource.GLOBAL, + GuardSource.LOCAL_NN_MODULE: GuardSource.LOCAL, + GuardSource.GLOBAL_NN_MODULE: GuardSource.GLOBAL, + GuardSource.LOCAL_FSDP_MODULE: GuardSource.LOCAL, + GuardSource.GLOBAL_FSDP_MODULE: GuardSource.GLOBAL, +} + + +def is_constant_source(source): + if isinstance(source, ConstantSource): + return True + try: + if source.guard_source() == GuardSource.CONSTANT: + return True + except NotImplementedError: + pass + + return False + + +def reconstruct_getitem( + source: Union["GetItemSource", "ODictGetItemSource"], codegen, index_is_slice +): + source.base.reconstruct(codegen) + if isinstance(source.index, Source): + source.index.reconstruct(codegen) + else: + if index_is_slice: + assert isinstance(source, GetItemSource) + codegen.append_output(codegen.create_load_const(source.unpack_slice())) + else: + codegen.append_output(codegen.create_load_const(source.index)) + + +@dataclasses.dataclass(frozen=True) +class LocalSource(Source): + local_name: str + cell_or_freevar: bool = False + + def reconstruct(self, codegen): + codegen.append_output(codegen.create_load(self.local_name)) + + def guard_source(self): + return GuardSource.LOCAL + + def name(self): + return f"L[{repr(self.local_name)}]" + + +@dataclasses.dataclass(frozen=True) +class SyntheticLocalSource(Source): + local_name: str + + def reconstruct(self, codegen): + codegen.append_output(codegen.create_load(self.local_name)) + + def guard_source(self): + return GuardSource.SYNTHETIC_LOCAL + + def name(self): + return f"SYNTHETIC_LOCAL[{self.local_name!r}]" + + +@dataclasses.dataclass(frozen=True) +class RandomValueSource(Source): + random_call_index: int + + def guard_source(self): + return GuardSource.RANDOM_VALUE + + def reconstruct(self, codegen): + codegen.append_output(codegen.create_load(codegen.tx.output.random_values_var)) + codegen.append_output(codegen.create_load_const(self.random_call_index)) + codegen.append_output(create_instruction("BINARY_SUBSCR")) + + def name(self): + return f"random_value_{self.random_call_index}" + + +@dataclasses.dataclass(frozen=True) +class GlobalSource(Source): + global_name: str + + def reconstruct(self, codegen): + codegen.append_output( + codegen.create_load_global(self.global_name, False, add=True) + ) + + def guard_source(self): + return GuardSource.GLOBAL + + def name(self): + return f"G[{repr(self.global_name)}]" + + +@dataclasses.dataclass(frozen=True) +class GlobalWeakRefSource(Source): + global_name: str + + def reconstruct(self, codegen): + codegen.append_output( + 
codegen.create_load_global(self.global_name, True, add=True) + ) + codegen.extend_output(create_call_function(0, False)) + + def guard_source(self): + return GuardSource.GLOBAL + + def name(self): + return f"G[{repr(self.global_name)}]()" + + +@dataclasses.dataclass(frozen=True) +class AttrSource(ChainedSource): + member: str + get_static: bool = False + + def __post_init__(self): + assert self.base, "Can't construct an AttrSource without a valid base source" + if "." in self.member: + member_parts = self.member.split(".") + object.__setattr__( + self, "base", AttrSource(self.base, ".".join(member_parts[:-1])) + ) + object.__setattr__(self, "member", member_parts[-1]) + + def reconstruct(self, codegen): + self.base.reconstruct(codegen) + codegen.extend_output(codegen.create_load_attrs(self.member)) + + def guard_source(self): + return self.base.guard_source() + + def name(self): + if self.get_static: + return f"inspect.getattr_static({self.base.name()}, {self.member!r})" + elif not self.member.isidentifier(): + return f"getattr({self.base.name()}, {self.member!r})" + return f"{self.base.name()}.{self.member}" + + +@dataclasses.dataclass(frozen=True) +class ParamBufferSource(AttrSource): + def guard_source(self): + return _GUARD_SOURCE_NN_MODULE[self.base.guard_source()] + + +# This source is intended to be used in places where a source is needed but it is expected +# that the symbol will be simplified out later on. Symbols with ephemeral sources are +# prioritized to be simplified out when e.g. compared against a symbol without an ephemeral +# source. Guarding on this source is an error. +# +# Example: During subclass view fake-ification, any close-over ViewFunc state should be +# symbolicized / fake-ified to avoid invalid specialization during view replay. This source +# is useful for symbols utilized in the middle of the view chain that are not expected to be +# present within the final view shape metadata. 
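+# For instance (illustrative): if a guard-relevant relation like s_ephemeral == s_other
+# arises, the ephemeral symbol is the one preferentially substituted away, so it never
+# shows up in the final guards (make_guard on this source raises).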
+@dataclasses.dataclass(frozen=True) +class EphemeralSource(Source): + desc: Optional[str] = None + + def guard_source(self): + return GuardSource.EPHEMERAL + + def name(self): + return f"" + + def make_guard(self): + raise NotImplementedError() + + def is_ephemeral(self): + return True + + +class TensorProperty(enum.Enum): + SIZE = 0 + STRIDE = 1 + STORAGE_OFFSET = 2 + + def method_name(self): + if self is TensorProperty.SIZE: + return "size" + elif self is TensorProperty.STRIDE: + return "stride" + elif self is TensorProperty.STORAGE_OFFSET: + return "storage_offset" + + +@dataclasses.dataclass(frozen=True) +class TensorPropertySource(ChainedSource): + prop: TensorProperty + idx: Optional[int] = None # None for STORAGE_OFFSET + + def __post_init__(self): + assert self.base is not None + if self.prop is TensorProperty.STORAGE_OFFSET: + assert self.idx is None + else: + assert self.idx is not None + + def reconstruct(self, codegen): + self.base.reconstruct(codegen) + codegen.append_output(codegen.create_load_attr(self.prop.method_name())) + if self.idx is not None: + codegen.append_output(codegen.create_load_const(self.idx)) + codegen.extend_output( + create_call_function(1 if self.idx is not None else 0, True) + ) + + def guard_source(self): + return self.base.guard_source() + + def name(self): + if self.prop is TensorProperty.SIZE: + return f"{self.base.name()}.size()[{self.idx}]" + elif self.prop is TensorProperty.STRIDE: + return f"{self.base.name()}.stride()[{self.idx}]" + elif self.prop is TensorProperty.STORAGE_OFFSET: + assert self.idx is None + return f"{self.base.name()}.storage_offset()" + else: + raise AssertionError(f"unhandled {self.prop}") + + +@dataclasses.dataclass(frozen=True) +class NegateSource(ChainedSource): + def __post_init__(self): + assert self.base is not None + + def reconstruct(self, codegen): + raise NotImplementedError() + + def guard_source(self): + return self.base.guard_source() + + def name(self): + # NB: use method call so that function stripping regexes work + return f"{self.base.name()}.__neg__()" + + +@dataclasses.dataclass(frozen=True) +class ConvertIntSource(ChainedSource): + def __post_init__(self): + assert self.base is not None + + def reconstruct(self, codegen): + self.base.reconstruct(codegen) + + def guard_source(self): + return self.base.guard_source() + + def name(self): + return f"cast_symbool_to_symint_guardless({self.base.name()})" + + +@dataclasses.dataclass(frozen=True) +class DefaultsSource(ChainedSource): + idx_key: Union[int, str] + is_kw: bool = False + field: str = dataclasses.field(init=False, repr=False, compare=False) + _name: str = dataclasses.field(init=False, repr=False, compare=False) + + def __post_init__(self): + assert ( + self.base + ), "Base must be a valid source in order to properly track and guard this Defaults to its origin." 
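+        # e.g. (illustrative): for `def f(a, b=3, *, k=5)`, the positional default of
+        # `b` is tracked via f.__defaults__[0] and the kw-only default of `k` via
+        # f.__kwdefaults__['k'].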
+ if self.is_kw: + assert isinstance(self.idx_key, str) + object.__setattr__(self, "field", "__kwdefaults__") + object.__setattr__( + self, "_name", f"{self.base.name()}.{self.field}['{self.idx_key}']" + ) + else: + assert isinstance(self.idx_key, int) + object.__setattr__(self, "field", "__defaults__") + object.__setattr__( + self, "_name", f"{self.base.name()}.{self.field}[{self.idx_key}]" + ) + + def reconstruct(self, codegen): + self.base.reconstruct(codegen) + codegen.extend_output(codegen.create_load_attrs(self.field)) + codegen.append_output(codegen.create_load_const(self.idx_key)) + codegen.append_output(create_instruction("BINARY_SUBSCR")) + + def guard_source(self): + return self.base.guard_source() + + def name(self): + return self._name + + +@dataclasses.dataclass(frozen=True) +class GetItemSource(ChainedSource): + index: Any + index_is_slice: bool = False + + def __post_init__(self): + assert self.base is not None + if isinstance(self.index, slice): + # store the hashable version of the slice so the whole GetItemSource is hashable + super().__setattr__("index", self.index.__reduce__()) + super().__setattr__("index_is_slice", True) + + def reconstruct(self, codegen): + reconstruct_getitem(self, codegen, index_is_slice=self.index_is_slice) + codegen.append_output(create_instruction("BINARY_SUBSCR")) + + def guard_source(self): + return self.base.guard_source() + + def unpack_slice(self): + assert self.index_is_slice + slice_class, slice_args = self.index + return slice_class(*slice_args) + + def name(self): + # Index can be of following types + # 1) ConstDictKeySource + # 2) enum.Enum + # 3) index is a slice - example 1:4 + # 4) index is a constant - example string, integer + if isinstance(self.index, Source): + if not isinstance(self.index, ConstDictKeySource): + raise ValueError( + "GetItemSource index must be a constant, enum or ConstDictKeySource" + ) + return f"{self.base.name()}[{self.index.name()}]" + elif self.index_is_slice: + return f"{self.base.name()}[{self.unpack_slice()!r}]" + elif isinstance(self.index, enum.Enum): + return f"{self.base.name()}[{enum_repr(self.index, self.guard_source().is_local())}]" + else: + return f"{self.base.name()}[{self.index!r}]" + + +@dataclasses.dataclass(frozen=True) +class ConstDictKeySource(GetItemSource): + def is_dict_key(self): + return True + + def reconstruct(self, codegen): + codegen.load_import_from(utils.__name__, "dict_keys_getitem") + self.base.reconstruct(codegen) + codegen.append_output(codegen.create_load_const(self.index)) + codegen.extend_output(create_call_function(2, True)) + + def name(self): + # The list creation will be CSE'd by PyExprCSEPass + return f"list({self.base.name()}.keys())[{self.index!r}]" + + +@dataclasses.dataclass(frozen=True) +class TupleIteratorGetItemSource(GetItemSource): + def reconstruct(self, codegen): + codegen.load_import_from(utils.__name__, "tuple_iterator_getitem") + self.base.reconstruct(codegen) + codegen.append_output(codegen.create_load_const(self.index)) + codegen.extend_output(create_call_function(2, True)) + + def name(self): + return f"___tuple_iterator_getitem({self.base.name()}, {self.index!r})" + + +@dataclasses.dataclass(frozen=True) +class TypeSource(ChainedSource): + def __post_init__(self): + assert self.base is not None + + def reconstruct(self, codegen): + codegen.load_import_from("builtins", "type") + self.base.reconstruct(codegen) + codegen.extend_output(create_call_function(1, True)) + + def guard_source(self): + return self.base.guard_source() + + def name(self): + 
return f"type({self.base.name()})" + + +@dataclasses.dataclass(frozen=True) +class ODictGetItemSource(ChainedSource): + index: Any + + def __post_init__(self): + assert self.base is not None + + def reconstruct(self, codegen): + codegen.append_output( + codegen._create_load_const(collections.OrderedDict.__getitem__) + ) + reconstruct_getitem(self, codegen, index_is_slice=False) + codegen.extend_output(create_call_function(2, True)) + + def guard_source(self): + return self.base.guard_source() + + def name(self): + if isinstance(self.index, type): + rep = f'__load_module("{self.index.__module__}").{self.index.__qualname__}' + return f"___odict_getitem({self.base.name()}, {rep})" + elif isinstance(self.index, Source): + return f"___odict_getitem({self.base.name()}, {self.index.name()})" + else: + return f"___odict_getitem({self.base.name()}, {self.index!r})" + + +@dataclasses.dataclass(frozen=True) +class NNModuleSource(ChainedSource): + def reconstruct(self, codegen): + self.base.reconstruct(codegen) + + def guard_source(self): + return _GUARD_SOURCE_NN_MODULE[self.base.guard_source()] + + def name(self): + return self.base.name() + + +@dataclasses.dataclass(frozen=True) +class NotNNModuleSource(NNModuleSource): + def guard_source(self): + return _GUARD_SOURCE_NOT_NN_MODULE[self.base.guard_source()] + + +@dataclasses.dataclass(frozen=True) +class FSDPNNModuleSource(NNModuleSource): + def guard_source(self): + return _GUARD_SOURCE_FSDP_MODULE[self.base.guard_source()] + + +@dataclasses.dataclass(frozen=True) +class GlobalStateSource(Source): + def name(self): + return "" + + def guard_source(self): + return GuardSource.GLOBAL + + +@dataclasses.dataclass(frozen=True) +class ConstantSource(Source): + source_name: str + + def reconstruct(self, codegen): + codegen.append_output( + codegen.create_load_global(self.source_name, False, add=False) + ) + + def guard_source(self): + return GuardSource.CONSTANT + + def name(self): + return self.source_name + + def make_guard(self, fn): + raise NotImplementedError() + + +@dataclasses.dataclass(frozen=True) +class NumpyTensorSource(ChainedSource): + def name(self) -> str: + return f"___from_numpy({self.base.name()})" + + def guard_source(self): + return self.base.guard_source() + + def reconstruct(self, codegen): + codegen.load_import_from("torch", "as_tensor") + self.base.reconstruct(codegen) + codegen.extend_output(create_call_function(1, True)) + + +# This is a synthetic source that is associated with the singleton +# shape env guard we always register for all frames. 
We get the actual +# guard contents from the ambient ShapeEnv +@dataclasses.dataclass(frozen=True) +class ShapeEnvSource(Source): + def name(self): + return "" + + def guard_source(self): + return GuardSource.SHAPE_ENV + + +@dataclasses.dataclass(frozen=True) +class BackwardStateSource(Source): + def name(self): + return "" + + def guard_source(self): + return GuardSource.BACKWARD_STATE + + +def is_from_local_source(source: Source, *, allow_cell_or_freevar=True): + if isinstance(source, ChainedSource): + return is_from_local_source( + source.base, allow_cell_or_freevar=allow_cell_or_freevar + ) + if not isinstance(source, LocalSource): + return False + if not allow_cell_or_freevar and source.cell_or_freevar: + return False + return True + + +# TODO: can probably write a generic "test this on everything in the chain" +# helper +def is_from_defaults(source: Source): + if isinstance(source, DefaultsSource): + return True + if isinstance(source, ChainedSource): + return is_from_defaults(source.base) + return False diff --git a/MLPY/Lib/site-packages/torch/_dynamo/symbolic_convert.py b/MLPY/Lib/site-packages/torch/_dynamo/symbolic_convert.py new file mode 100644 index 0000000000000000000000000000000000000000..1a624fd411db334d373c1340afb86db2255b5c0e --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/symbolic_convert.py @@ -0,0 +1,2603 @@ +import collections +import contextlib +import copy +import dataclasses +import dis +import functools +import importlib +import inspect +import itertools +import linecache +import logging +import operator +import sys +import textwrap +import threading +import traceback +import types +import typing +import weakref +from typing import Any, Dict, List, NamedTuple, Optional, Set, Tuple, Type +from unittest.mock import patch + +import torch +import torch._logging +from torch._guards import Checkpointable, tracing, TracingContext + +from . 
import config, exc, logging as torchdynamo_logging, trace_rules, variables +from .bytecode_analysis import ( + get_indexof, + JUMP_OPNAMES, + livevars_analysis, + propagate_line_nums, +) +from .bytecode_transformation import ( + cleaned_instructions, + create_call_function, + create_instruction, + create_jump_absolute, + Instruction, + is_generator, + unique_id, +) +from .code_context import code_context +from .codegen import PyCodegen +from .current_scope_id import current_scope_id +from .exc import ArgsMismatchError, BackendCompilerFailed, unimplemented, Unsupported +from .funcname_cache import get_funcname +from .guards import GuardBuilder, install_guard +from .output_graph import GraphCompileReason, OutputGraph, OutputGraphState +from .replay_record import DummyModule, ExecutionRecorder +from .resume_execution import ContinueExecutionCache, ReenterWith +from .source import ( + AttrSource, + GetItemSource, + GlobalSource, + GlobalWeakRefSource, + LocalSource, + Source, +) +from .trace_rules import is_builtin_constant, is_forbidden +from .utils import ( + counters, + get_fake_value, + get_instruction_source_311, + graph_break_dup_warning_checker, + istype, + LazyString, + proxy_args_kwargs, +) +from .variables.base import ( + _is_top_level_scope, + is_side_effect_safe, + MutableLocal, + typestr, + VariableTracker, +) +from .variables.builder import VariableBuilder, wrap_fx_proxy +from .variables.builtin import BuiltinVariable +from .variables.constant import ConstantVariable +from .variables.ctx_manager import ( + ContextWrappingVariable, + GenericContextWrappingVariable, + WithExitFunctionVariable, +) +from .variables.dicts import ConstDictVariable, SetVariable +from .variables.functions import ( + BaseUserFunctionVariable, + NestedUserFunctionVariable, + SkipFunctionVariable, + UserFunctionVariable, + UserMethodVariable, +) +from .variables.lists import ( + BaseListVariable, + ListIteratorVariable, + ListVariable, + SliceVariable, + TupleVariable, +) +from .variables.misc import ( + ClosureVariable, + GetAttrVariable, + InlinedClosureVariable, + NullVariable, + PythonModuleVariable, + UnknownVariable, +) +from .variables.nn_module import NNModuleVariable +from .variables.tensor import ( + supported_const_comparison_ops, + supported_tensor_comparison_ops, + SymNodeVariable, + TensorVariable, +) +from .variables.user_defined import ( + RemovableHandleVariable, + UserDefinedClassVariable, + UserDefinedObjectVariable, + UserDefinedVariable, +) + +log = logging.getLogger(__name__) +graph_break_log = torch._logging.getArtifactLogger(__name__, "graph_breaks") +trace_call_log = torch._logging.getArtifactLogger(__name__, "trace_call") +trace_source_log = torch._logging.getArtifactLogger(__name__, "trace_source") +tls = threading.local() + + +@dataclasses.dataclass +class SpeculationEntry: + filename: str + lineno: int + instruction_pointer: int + failed: bool = False + reason: Optional[GraphCompileReason] = None + + def fail_and_restart_analysis(self): + """ + Start tracing of the current frame over again, and don't take this branch. + """ + self.failed = True + raise exc.SpeculationRestartAnalysis() + + +@dataclasses.dataclass +class SpeculationLog: + """ + SpeculationLog replaces the prior copy_graphstate/restore_graphstate + checkpointing. Rather than saving/restoring state, we restart the + dynamo conversion process over from the beginning -- but when we + hit the start of the speculation that failed, we instead generate + a graph break. 
+ """ + + entries: List[SpeculationEntry] = dataclasses.field(default_factory=list) + index: int = 0 + + def restart(self): + self.index = 0 + + def clear(self): + self.entries.clear() + self.index = 0 + + def next(self, filename: str, lineno: int, instruction_pointer) -> SpeculationEntry: + """ + Lookup or create a SpeculationEntry() that is shared across + RestartAnalysis calls. Args are used only for debug checks. + """ + if len(self.entries) == self.index: + self.entries.append(SpeculationEntry(filename, lineno, instruction_pointer)) + entry = self.entries[self.index] + self.index += 1 + assert ( + entry.instruction_pointer == instruction_pointer + and entry.filename == filename + and entry.lineno == lineno + ), textwrap.dedent( + f""" + SpecuationLog diverged at {self.index} of {len(self.entries)}: + - Run1: {entry.filename}:{entry.lineno} (ip={entry.instruction_pointer}) + - Run2: {filename}:{lineno} (ip={instruction_pointer}) + Please submit a bug report. + """ + ) + return entry + + +@functools.lru_cache(None) +def _step_logger(): + return torchdynamo_logging.get_step_logger(log) + + +@dataclasses.dataclass +class BlockStackEntry: + target: Instruction + stack_index: Optional[int] = None + with_context: Optional[ContextWrappingVariable] = None + + def can_restore(self): + return self.with_context is not None + + def resume_fn(self): + assert self.stack_index is not None + if self.with_context and self.with_context.target_values: + return ReenterWith(self.stack_index, tuple(self.with_context.target_values)) + else: + return ReenterWith(self.stack_index) + + def exit(self, tx): + assert self.with_context is not None + return self.with_context.exit(tx) + + +class InstructionTranslatorGraphState(NamedTuple): + output: OutputGraphState + symbolic_locals: Dict[str, VariableTracker] + stack: List[VariableTracker] + block_stack: List[BlockStackEntry] + instruction_pointer: Optional[int] + current_instruction: Instruction + next_instruction: Optional[Instruction] + lineno: int + + def diff(self, other: "InstructionTranslatorGraphState") -> Optional[str]: + for k in self._fields: + if k == "output": + return self.output.diff(other.output, prefix=f"{k}.") + sv = getattr(self, k) + ov = getattr(other, k) + if sv != ov: + return f"{k} mismatch: {sv} != {ov}" + return None + + +def stack_op(fn: typing.Callable[..., object]): + nargs = len(inspect.signature(fn).parameters) + fn_var = BuiltinVariable(fn) + + @functools.wraps(fn) + def impl(self: "InstructionTranslatorBase", inst: Instruction): + self.push(fn_var.call_function(self, self.popn(nargs), {})) + + return impl + + +def _detect_and_normalize_assert_statement( + self: "InstructionTranslatorBase", + truth_fn: typing.Callable[[object], bool], + push: bool, +): + # Detect if this jump instruction is assert and normalize the assert + # by pushing dummy error message when nothing is given. 
+ # + # Python 3.9 assertion is in following format: + # 18 POP_JUMP_IF_TRUE 28 + # 20 LOAD_ASSERTION_ERROR + # 22 LOAD_CONST 3 ('Assert message') -> optional instruction + # 24 CALL_FUNCTION 1 -> optional instruction + # 26 RAISE_VARARGS + # + # Python 3.8 assertion is in following format: + # 18 POP_JUMP_IF_TRUE 28 + # 20 LOAD_GLOBAL 0 (Assertion type) + # 22 LOAD_CONST 3 ('Assert message') -> optional instruction + # 24 CALL_FUNCTION 1 -> optional instruction + # 26 RAISE_VARARGS 1 + + if (truth_fn is not operator.truth) or push: + return False + + assert isinstance(self.instruction_pointer, int) + current_instruction_pointer = self.instruction_pointer + inst = self.instructions[current_instruction_pointer] + # Detect LOAD_ASSERTION_ERROR or LOAD_GLOBAL 0 + if sys.version_info < (3, 9): + if inst.opname != "LOAD_GLOBAL" or inst.argval != "AssertionError": + return False + else: + if inst.opname != "LOAD_ASSERTION_ERROR": + return False + + current_instruction_pointer += 1 + + # Use dummy error message if its hard to extract + error_msg = "assertion error" + + inst = self.instructions[current_instruction_pointer] + # DETECT RAISE_VARARGS or LOAD CONST + if inst.opname == "LOAD_CONST": + if not isinstance(inst.argval, str): + return False + error_msg = inst.argval + + # if it is LOAD_CONSTANT, it must be followed by CALL_FUNCTION + # (PRECALL for Python 3.11+) + current_instruction_pointer += 1 + inst = self.instructions[current_instruction_pointer] + if inst.opname not in ("CALL_FUNCTION", "PRECALL"): + return False + + # for Python 3.11+, PRECALL should be followed by CALL, then RAISE_VARARGS + # for Python < 3.11, CALL_FUNCTION should be followed by RAISE_VARARGS + current_instruction_pointer += 1 + if inst.opname == "PRECALL": + current_instruction_pointer += 1 + inst = self.instructions[current_instruction_pointer] + + if inst.opname != "RAISE_VARARGS": + return False + + self.push(ConstantVariable.create(error_msg)) + + return True + + +def generic_jump(truth_fn: typing.Callable[[object], bool], push: bool): + def inner(self: "InstructionTranslatorBase", inst: Instruction): + value: VariableTracker = self.pop() + if ( + config.rewrite_assert_with_torch_assert + and _detect_and_normalize_assert_statement(self, truth_fn, push) + ): + error_msg: VariableTracker = self.pop() + # Skip over things like `assert True` + if value.is_python_constant() and bool(value.as_python_constant()): + self.jump(inst) + return + + # TODO maybe should respect DtoH sync intention of users later?? + # Manually insert torch._assert_async instead of python assert and jump over + # assert related instructions as we don't need them anymore. + + # if we see Tensor as assert statement, no need to call scalar_tensor + if isinstance(value, TensorVariable): + self.output.create_proxy( + "call_function", + torch._assert_async, + *proxy_args_kwargs((value, error_msg), {}), + ) + self.jump(inst) + return + + if isinstance(value, SymNodeVariable): + # if the assertion is normal shape expression. + # just install guard and bail out. + sym_expr = value.sym_num + if not isinstance(sym_expr, torch.SymBool): + sym_expr = sym_expr != 0 + + result = torch.fx.experimental.symbolic_shapes.expect_true(sym_expr) + if not result: + raise unimplemented( + "Assertion failed on symbolic shapes. Did you make sure eager mode succeeds?" 
+ ) + self.jump(inst) + return + + scalar_to_tensor_proxy = self.output.create_proxy( + "call_function", torch.scalar_tensor, *proxy_args_kwargs((value,), {}) + ) + + scalar_to_tensor = wrap_fx_proxy( + self, + scalar_to_tensor_proxy, + example_value=get_fake_value(scalar_to_tensor_proxy.node, self), + ) + + self.output.create_proxy( + "call_function", + torch._assert_async, + *proxy_args_kwargs((scalar_to_tensor, error_msg), {}), + ) + self.jump(inst) + return + + if value.is_python_constant(): + if truth_fn(value.as_python_constant()): + push and self.push(value) + self.jump(inst) + elif ( + isinstance(value, (TensorVariable)) and self.should_compile_partial_graph() + ): + # compile a partial subgraph prefix then jump into user code + if self.has_backedge(): + msg = ( + "Skipping frame because there is a graph break in a for/while loop\n" + f"{self.frame_summary()}" + ) + log.info(msg) + raise exc.SkipFrame(msg) + + self.push(value) + log.debug("generic_jump triggered compile") + self.output.compile_subgraph( + self, + reason=GraphCompileReason( + f"generic_jump {typestr(value)}", [self.frame_summary()] + ), + ) + self.pop() + + if_next = self.create_call_resume_at(self.next_instruction) + push and self.push(value) + if_jump = self.create_call_resume_at(inst.target) + + self.output.add_output_instructions( + [create_instruction(inst.opname, target=if_jump[0])] + if_next + if_jump + ) + elif isinstance(value, NNModuleVariable): + # Equivalent of "self.nn_module is not None" + mod = self.output.get_submodule(value.module_key) + if truth_fn(mod): + push and self.push(value) + self.jump(inst) + elif isinstance(value, UserDefinedObjectVariable): + x = value.var_getattr(self, "__bool__") + # if __bool__ is missing, trying __len__ to infer a truth value. + if isinstance(x, GetAttrVariable): + x = value.var_getattr(self, "__len__") + + # __bool__ or __len__ is function + if isinstance(x, UserMethodVariable): + result = x.call_function(self, [], {}) + if isinstance(result, ConstantVariable) and isinstance( + result.value, (bool, int) + ): + if truth_fn(result.value): + push and self.push(value) + self.jump(inst) + else: + unimplemented( + "generic_jump on UserDefined with __bool__ returning non-constant" + ) + # __bool__ or __len__ is non-function or not existed in the user defined object + else: + if truth_fn(True): + push and self.push(value) + self.jump(inst) + elif not isinstance(value, TensorVariable) and value.has_unpack_var_sequence( + self + ): + if truth_fn(len(value.unpack_var_sequence(self))): + push and self.push(value) + self.jump(inst) + elif isinstance(value, SymNodeVariable): + eval_result = value.evaluate_expr(self.output) + if truth_fn(eval_result): + push and self.push(value) + self.jump(inst) + elif isinstance(value, variables.BackwardHookVariable): + if truth_fn(True): + push and self.push(value) + self.jump(inst) + else: + from .source import is_constant_source + + if value.source is not None and is_constant_source(value.source): + if truth_fn(value.get_real_value()): # type: ignore[attr-defined] + push and self.push(value) + self.jump(inst) + else: + # TODO link the torch.cond doc later + raise exc.UserError( + exc.UserErrorType.DYNAMIC_CONTROL_FLOW, + "Dynamic control flow is not supported at the moment. 
Please use " + "functorch.experimental.control_flow.cond to explicitly capture the control flow.", + case_name="cond_operands", + ) + + return inner + + +explain = False + + +def break_graph_if_unsupported(*, push): + def decorator(inner_fn): + @functools.wraps(inner_fn) + def wrapper(self: "InstructionTranslatorBase", inst: Instruction): + speculation = self.speculate() + if speculation.failed: + assert speculation.reason is not None + return handle_graph_break(self, inst, speculation.reason) + try: + TracingContext.set_current_loc( + self.f_code.co_filename, self.lineno, self.f_code.co_name + ) + return inner_fn(self, inst) + except Unsupported as excp: + if self.generic_context_manager_depth > 0: + # We don't support graph break under GenericContextWrappingVariable, + # If there is, we roll back to the checkpoint and fall back. + excp.remove_from_stats() + unimplemented("Graph break under GenericContextWrappingVariable") + + if isinstance(excp, exc.UncapturedHigherOrderOpError): + raise + + if not self.should_compile_partial_graph(): + raise + + user_stack = excp.real_stack + # TODO: Also report the traceback from the parent frame + user_stack_formatted = "".join(traceback.format_list(user_stack)) + frame_loc = (user_stack[-1].filename, user_stack[-1].lineno) + # torch._dynamo.explain() formats this a little nicer, and presents a slightly + # more actionable user code pointer + if ( + graph_break_log.isEnabledFor(logging.DEBUG) + and not explain + and graph_break_dup_warning_checker.add(frame_loc) + ): + # This log line is exercised from + # python test/dynamo/test_exc.py -k test_graph_break_log + graph_break_log.debug( + "Graph break: from user code at:\n%s", + user_stack_formatted, + exc_info=True, + ) + else: + # This log line MUST NOT contain the string "Graph break", + # exercised by + # python test/dynamo/test_misc.py -k test_duplicate_graph_break_log + log.debug( + "Unsupported break in user code at %s:%s (details suppressed)", + *frame_loc, + ) + + if self.has_backedge(): + msg = ( + "Skipping frame because there is a graph break in a for/while loop\n" + f"{self.frame_summary()}" + ) + log.info(msg) + raise exc.SkipFrame(msg) from excp + + excp.remove_from_stats() + excp.add_to_stats("graph_break") + speculation.reason = GraphCompileReason(excp.msg, user_stack) + speculation.fail_and_restart_analysis() + + def handle_graph_break( + self: "InstructionTranslatorBase", + inst: Instruction, + reason: GraphCompileReason, + ): + self.output.compile_subgraph(self, reason=reason) + cg = PyCodegen(self) + cleanup: List[Instruction] = [] + # Reconstruct the context variables in the block stack + for b in self.block_stack: + assert b.with_context is not None + cg(b.with_context) + cg.extend_output(b.resume_fn().try_except(cg.code_options, cleanup)) + self.output.add_output_instructions(cg.get_instructions()) + del cg + + if sys.version_info >= (3, 11) and inst.opname == "CALL": + kw_names = ( + self.kw_names.as_python_constant() + if self.kw_names is not None + else () + ) + if len(kw_names) > 0: + self.output.add_output_instructions( + [create_instruction("KW_NAMES", argval=kw_names)] + ) + self.output.add_output_instructions( + create_call_function(inst.arg, False) + ) + else: + # copy instruction, but without exception table data + assert inst.target is None + inst_copy = copy.copy(inst) + inst_copy.exn_tab_entry = None + self.output.add_output_instructions([inst_copy]) + + self.output.add_output_instructions(cleanup) + + if sys.version_info >= (3, 11) and inst.opname == "CALL": + # 
stack effect for PRECALL + CALL is split between the two instructions + stack_effect = dis.stack_effect( + dis.opmap["PRECALL"], inst.arg + ) + dis.stack_effect(dis.opmap["CALL"], inst.arg) + else: + stack_effect = dis.stack_effect(inst.opcode, inst.arg) + self.popn(push - stack_effect) + + for _ in range(push): + self.push(UnknownVariable()) + self.output.add_output_instructions( + self.create_call_resume_at(self.next_instruction) + ) + + return wrapper + + return decorator + + +class InstructionTranslatorBase(Checkpointable[InstructionTranslatorGraphState]): + output: OutputGraph + symbolic_locals: Dict[str, VariableTracker] + symbolic_globals: Dict[str, VariableTracker] + stack: List[VariableTracker] + instruction_pointer: Optional[int] + current_instruction: Instruction + next_instruction: Optional[Instruction] + block_stack: List[BlockStackEntry] + lineno: int + kw_names: Optional[ConstantVariable] + accept_prefix_inst: bool + prefix_insts: List[Instruction] + inline_depth: int + inconsistent_side_effects: bool + current_speculation: Optional[SpeculationEntry] + + def mark_inconsistent_side_effects(self): + """ + InstructionTranslator has encountered instructions which may cause + dynamo to see a different version of history from eager + See: https://github.com/pytorch/pytorch/issues/110765 + """ + self.inconsistent_side_effects = True + + def has_backedge(self): + cur_offset = self.current_instruction.offset + assert self.instruction_pointer is not None + for inst in self.instructions[self.instruction_pointer :]: + if inst.opname in JUMP_OPNAMES: + jump_offset = inst.argval + if jump_offset < cur_offset: + return True + return False + + def cell_and_freevars(self): + if not hasattr(self, "_cell_and_freevars"): + self._cell_and_freevars = tuple( + self.code_options["co_cellvars"] or [] + ) + tuple(self.code_options["co_freevars"] or []) + return self._cell_and_freevars + + def prune_dead_locals(self): + reads = livevars_analysis(self.instructions, self.current_instruction) + # implicit use by super() + # reads = reads | {"__class__"} + # output variables? + reads = reads | set(self.cell_and_freevars()) + self.symbolic_locals = { + k: v for k, v in self.symbolic_locals.items() if k in reads + } + self.output.side_effects.prune_dead_object_new(self) + + def call_function( + self, + fn: VariableTracker, + args: List[VariableTracker], + kwargs: Dict[str, VariableTracker], + ): + assert isinstance(fn, VariableTracker) + assert isinstance(args, list) + assert isinstance(kwargs, dict) + assert all( + isinstance(x, VariableTracker) + for x in itertools.chain(args, kwargs.values()) + ) + inner_fn = None + if hasattr(fn, "value"): + inner_fn = fn.value + if hasattr(fn, "fn"): + inner_fn = fn.fn + if inner_fn and callable(inner_fn) and is_forbidden(inner_fn): + raise AssertionError(f"Attempt to trace forbidden callable {inner_fn}") + self.push(fn.call_function(self, args, kwargs)) + + def inline_user_function_return(self, fn, args, kwargs): + """ + A call to some user defined function by inlining it. 
+ """ + return InliningInstructionTranslator.inline_call(self, fn, args, kwargs) + + def get_line_of_code_header(self, lineno=None): + if lineno is None: + lineno = self.lineno + inline_depth_str = ( + f" (inline depth: {self.inline_depth})" if self.inline_depth > 0 else "" + ) + funcname = get_funcname(self.f_code.co_filename, lineno) + funcname_str = "" if funcname is None else f" ({funcname})" + return f"{self.f_code.co_filename}:{lineno} in {self.f_code.co_name}{funcname_str}{inline_depth_str}" + + def get_log_starts_line_log_str(self): + log_str = f"TRACE starts_line {self.get_line_of_code_header()}\n" + line = linecache.getline(self.f_code.co_filename, self.lineno).rstrip() + log_str += f" {line}" + return log_str + + def log_starts_line(self): + trace_source_log.debug("%s", LazyString(self.get_log_starts_line_log_str)) + + def step(self): + """Process exactly one instruction, return False we should exit""" + assert isinstance(self.instruction_pointer, int) + inst = self.instructions[self.instruction_pointer] + self.current_instruction = inst + self.instruction_pointer += 1 + if self.instruction_pointer < len(self.instructions): + self.next_instruction = self.instructions[self.instruction_pointer] + else: + self.instruction_pointer = None + self.next_instruction = None + if inst.starts_line and self.lineno != inst.starts_line: + self.lineno = inst.starts_line + self.log_starts_line() + + if ( + len(self.stack) == 0 + and self.should_compile_partial_graph() + and self.is_non_empty_graph() + ): + self.current_speculation = self.speculate() + if self.current_speculation.failed: + return self.step_graph_break(inst) + + log.debug("TRACE %s %s %s", inst.opname, inst.argval, self.stack) + + # 3.11 no longer uses a block stack, but we still keep track of one + # so that we know which contexts are currently active. + # For our purposes, all exception table entries with the same target + # are considered to be part of the same "block". + if sys.version_info >= (3, 11): + entry = inst.exn_tab_entry + if not ( + # still in the same block + self.block_stack + and entry + and self.block_stack[-1].target is entry.target + ): + if not entry: + # no longer in any block + # It is possible for NOPs to be between two instructions + # in the same block, but the NOPs are not covered by an + # exception table entry. In this case, assume that we + # are still in the same block. + if self.block_stack and inst.opname != "NOP": + # If we really escape from a block and the current + # instruction is not in another block, then there + # should be no other nested blocks that we are in. + assert len(self.block_stack) == 1 + self.block_stack.pop() + elif ( + # current instruction is in the previous block + len(self.block_stack) > 1 + and self.block_stack[-2].target is entry.target + ): + # exit the current block + self.block_stack.pop() + else: + # current instruction is in a new block + # push block to stack - note, BEFORE_WITH blocks won't + # be pushed here since BEFORE_WITH pushes the block, and + # the current instruction would be counted as being in that block. 
+ self.block_stack.append( + BlockStackEntry(entry.target, len(self.stack)) + ) + + try: + if not hasattr(self, inst.opname): + unimplemented(f"missing: {inst.opname}") + TracingContext.set_current_loc( + self.f_code.co_filename, self.lineno, self.f_code.co_name + ) + getattr(self, inst.opname)(inst) + + return inst.opname != "RETURN_VALUE" + except Unsupported: + if self.current_speculation is None: + log.debug("empty checkpoint") + raise + log.debug("step triggered compile", exc_info=True) + + self.current_speculation.fail_and_restart_analysis() + + def step_graph_break(self, continue_inst): + # generate code from checkpoint + assert not self.output.output_instructions + assert self.current_speculation is not None + self.output.compile_subgraph( + self, + partial_convert=True, + reason=GraphCompileReason("step_unsupported", [self.frame_summary()]), + ) + self.output.add_output_instructions( + [create_jump_absolute(continue_inst)] + self.instructions + ) + + def run_ctx_mgr(self): + # NB: Don't push the top level frame summary; set_current_loc will + # take care of it. However, DO make sure we attach real_stack to + # exceptions + return TracingContext.current_frame(None) + + def run(self): + with self.run_ctx_mgr(): + try: + self.output.push_tx(self) + while ( + self.instruction_pointer is not None + and not self.output.should_exit + and self.step() + ): + pass + except BackendCompilerFailed: + raise + except Exception as e: + if config.replay_record_enabled: + e.exec_record = self.exec_recorder.get_record() # type: ignore[attr-defined] + raise + finally: + self.output.pop_tx() + # Cleanup the outputGraph to delete the held tensors. We perform the + # cleanup only for InstructionTranslator and not + # InliningInstructionTranslator. The InliningInstructionTranslator + # mutates the output object and is restored to original state if + # there was an exception. 
+ if isinstance(self, InstructionTranslator): + self.output.cleanup() + + def push(self, val: Optional[VariableTracker]): + assert val is None or isinstance( + val, VariableTracker + ), f"push expects VariableTracker, got {typestr(val)}" + self.stack.append(val) # type: ignore[arg-type] + + def push_many(self, vals: List[VariableTracker]): + for val in vals: + self.push(val) + + def pop(self) -> VariableTracker: + return self.stack.pop() + + def popn(self, n: int) -> List[VariableTracker]: + assert n >= 0 + return list(reversed([self.pop() for _ in range(n)])) + + def LOAD_FAST(self, inst): + name = inst.argval + if name in self.f_locals and config.replay_record_enabled: + self.exec_recorder.add_local_var(name, self.f_locals[name]) + + if name.startswith(".") and name not in self.symbolic_locals: + # This happens in dict/list comprehensions + name = name.replace(".", "implicit") + assert name not in self.cell_and_freevars() + if name not in self.symbolic_locals: + unimplemented("undefined LOAD_FAST") + self.push(self.symbolic_locals[name]) + if name.startswith("___stack"): + self.symbolic_locals.pop(name) + + def LOAD_DEREF(self, inst): + assert inst.argval in self.cell_and_freevars() + + if inst.argval in self.f_locals and config.replay_record_enabled: + self.exec_recorder.add_local_var(inst.argval, self.f_locals[inst.argval]) + + if inst.argval not in self.symbolic_locals: + unimplemented(f"undefined LOAD_DEREF {inst.argval}") + self.push(self.symbolic_locals[inst.argval]) + + def STORE_FAST(self, inst): + loaded_vt = self.pop() + name = inst.argval + # Only rename at the top-level scope, this is to avoid the confusion between + # mutating a variable vs renaming it (e.g. a = b) during speculating a higher order op, + # where mutation is prohibited and it's difficult to differentiate it with renaming. 
+ if _is_top_level_scope(current_scope_id()): + loaded_vt = loaded_vt.rename(self, name) + self.symbolic_locals[name] = loaded_vt + + def DELETE_FAST(self, inst): + del self.symbolic_locals[inst.argval] + + STORE_DEREF = STORE_FAST + + def LOAD_CLOSURE(self, inst): + self.push(ClosureVariable(name=inst.argval)) + + def LOAD_CONST(self, inst): + # For empty tuples, create empty TupleVariable + if isinstance(inst.argval, tuple) and not inst.argval: + self.push(TupleVariable([])) + else: + self.push(ConstantVariable.create(value=inst.argval)) + + def get_global_source(self, name): + source: Source + if self.output.global_scope is self.f_globals: + source = GlobalSource(name) + else: + if "__name__" in self.f_globals: + source = AttrSource( + self.import_source(self.f_globals["__name__"]), name + ) + else: + mangled_name = self.output.install_global_by_id( + "___unnamed_scope", self.f_globals + ) + source = GetItemSource(GlobalSource(mangled_name), name) + return source + + def LOAD_GLOBAL(self, inst): + if sys.version_info >= (3, 11): + if inst.arg % 2: + self.PUSH_NULL(inst) + + name = inst.argval + + if config.replay_record_enabled: + if name in self.f_globals: + self.exec_recorder.add_global_var(name, self.f_globals[name]) + else: + assert name in self.f_builtins + self.exec_recorder.builtins[name] = self.f_builtins[name] + + if inst.argval == "AssertionError": + unimplemented("assert with non-string message") + + if name in self.symbolic_globals: + variable = self.output.side_effects[self.symbolic_globals[name]] + self.push(self.output.side_effects.load_global(variable, name)) + return + + try: + value = self.f_globals[name] + except KeyError: + return self.load_builtin(inst) + + source = self.get_global_source(name) + self.push(VariableBuilder(self, source)(value)) + + def STORE_GLOBAL(self, inst): + value = self.pop() + name = inst.argval + source = self.get_global_source(name) + if name not in self.symbolic_globals: + self.symbolic_globals[name] = object() # type: ignore[assignment] # sentinel object + variable = self.output.side_effects.track_global_existing( + source, self.symbolic_globals[name] + ) + if isinstance(value, RemovableHandleVariable): + unimplemented("Storing handles in globals - NYI") + self.output.side_effects.store_global(variable, name, value) + + def import_source(self, module_name): + """Create an alias to a module for use in guards""" + if "torch_package" in module_name: + value = torch.package.package_importer._package_imported_modules[ + module_name + ] + alias = ( + module_name.replace(">", "_").replace("<", "_").replace(".", "_dot_") + ) + else: + value = importlib.import_module(module_name) + alias = f"__import_{module_name.replace('.', '_dot_')}" + f_globals = self.output.global_scope + assert alias not in f_globals or f_globals[alias] is value + f_globals[alias] = value + self.output.update_co_names(alias) + return GlobalSource(alias) + + def resolve_name(self, name, package, level): + """ + Copied from the Cpython implementation of __import__ + Resolve a relative module name to an absolute one. 
+ https://github.com/python/cpython/blob/5a094f0255eea1db58fb2cf14c200971e64ec36e/Lib/importlib/_bootstrap.py#L902 + """ + bits = package.rsplit(".", level - 1) + if len(bits) < level: + raise ImportError("attempted relative import beyond top-level package") + base = bits[0] + return f"{base}.{name}" if name else base + + def calc_package(self): + """ + Copied from the Cpython implementation of __import__ + https://github.com/python/cpython/blob/5a094f0255eea1db58fb2cf14c200971e64ec36e/Lib/importlib/_bootstrap.py#L1090 + """ + package = self.f_globals.get("__package__") + spec = self.f_globals.get("__spec__") + if package is not None: + if spec is not None and package != spec.parent: + log.warning( + "__package__ != __spec__.parent (%r != %r)", + package, + spec.parent, + stacklevel=3, + ) + return package + elif spec is not None: + return spec.parent + else: + log.warning( + "can't resolve package from __spec__ or __package__, " + "falling back on __name__ and __path__", + stacklevel=3, + ) + package = self.f_globals["__name__"] + if "__path__" not in self.f_globals: + package = package.rpartition(".")[0] + return package + + def IMPORT_NAME(self, inst): + level, fromlist = self.popn(2) + level = level.as_python_constant() + fromlist = fromlist.as_python_constant() + module_name = inst.argval + + # Are we replaying? if so, load recorded module + recorded_name = ( + f"{ExecutionRecorder.LOCAL_MOD_PREFIX}_{level}_{fromlist}_{module_name}" + ) + if recorded_name in self.f_globals: + value = self.f_globals[recorded_name] + source = GlobalSource(recorded_name) + else: + value = __import__( + module_name, + fromlist=fromlist, + level=level, + globals=self.f_globals, + ) + + if level != 0: + pkg = self.calc_package() + module_name = self.resolve_name(module_name, pkg, level) + + # For __import__, when the name variable is of the form package.module, + # normally, the top-level package (the name up till the first dot) is + # returned, not the module named by module_name. However, when a + # non-empty fromlist argument is given, the module named by name is + # returned. Therefore, we set the source correctly here. 
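+            #
+            # Editorial illustration (not part of the upstream file): this is
+            # the documented behaviour of the builtin __import__, e.g.:
+            #
+            #     >>> __import__("json.decoder").__name__
+            #     'json'
+            #     >>> __import__("json.decoder", fromlist=["JSONDecoder"]).__name__
+            #     'json.decoder'
+            #
+            # hence the two different import_source() targets below.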
+ if not fromlist: + top_level_module_name = module_name.partition(".")[0] + source = self.import_source(top_level_module_name) + else: + source = self.import_source(module_name) + + if config.replay_record_enabled: + self.exec_recorder.add_local_mod(recorded_name, value) + + if istype(value, (types.ModuleType, DummyModule)): + self.push(PythonModuleVariable(value, source=source)) + else: + unimplemented(f"IMPORT_NAME {typestr(value)}") + + def IMPORT_FROM(self, inst): + self.DUP_TOP(inst) + self.LOAD_ATTR(inst) + + def load_builtin(self, inst): + if inst.argval not in self.f_builtins: + raise NameError(f"name '{inst.argval}' is not defined") + val = self.f_builtins[inst.argval] + + if callable(val): + self.push(VariableBuilder(self, GlobalSource(inst.argval))(val)) + else: + assert is_builtin_constant(val) + self.push(ConstantVariable.create(value=val)) + + def jump(self, inst): + self.instruction_pointer = self.indexof[inst.target] + + JUMP_FORWARD = jump + JUMP_ABSOLUTE = jump + + POP_JUMP_IF_FALSE = generic_jump(operator.not_, False) + POP_JUMP_IF_TRUE = generic_jump(operator.truth, False) + JUMP_IF_FALSE_OR_POP = generic_jump(operator.not_, True) + JUMP_IF_TRUE_OR_POP = generic_jump(operator.truth, True) + + def SETUP_LOOP(self, inst): + # only exists in python<=3.7 + self.block_stack.append(BlockStackEntry(inst.target)) + + def SETUP_EXCEPT(self, inst): + # only exists in python<=3.7 + self.block_stack.append(BlockStackEntry(inst.target)) + + def POP_BLOCK(self, inst): + self.block_stack.pop() + + def SETUP_WITH(self, inst): + self.setup_or_before_with(inst) + + def SETUP_FINALLY(self, inst): + self.block_stack.append(BlockStackEntry(inst.target)) + + def BEGIN_FINALLY(self, inst): + self.push(None) + + def WITH_CLEANUP_START(self, inst): + exit, exc = self.popn(2) + assert exc is None + self.push(exc) + self.push(exit.call_function(self, [ConstantVariable.create(None)] * 3, {})) + + def WITH_CLEANUP_FINISH(self, inst): + self.popn(2) + self.push(None) + + def CALL_FINALLY(self, inst): + """ + pushes the address of the next instruction onto the stack and increments + bytecode counter by delta + """ + # Python 3.8 only + assert self.next_instruction is not None + addr = self.indexof[self.next_instruction] + self.push(ConstantVariable.create(addr)) + self.instruction_pointer = self.indexof[inst.target] + + def END_FINALLY(self, inst): + # Python 3.8 only + # https://docs.python.org/3.8/library/dis.html#opcode-END_FINALLY + tos = self.pop() + if isinstance(tos, ConstantVariable): + self.instruction_pointer = tos.as_python_constant() + else: + pass + + def POP_FINALLY(self, inst): + # Python 3.8 only + preserve_tos = inst.argval + if preserve_tos: + tos = self.pop() + _ = self.pop() + if preserve_tos: + self.push(tos) # type: ignore[possibly-undefined] + + def FOR_ITER(self, inst): + it = self.pop().realize() + if isinstance(it, (variables.ListIteratorVariable, variables.IteratorVariable)): + try: + val, next_iter = it.next_variables(self) + self.push(next_iter) + self.push(val) + except StopIteration: + self.jump(inst) + else: + unimplemented(f"FOR_ITER {typestr(it)}") + + def COMPARE_OP(self, inst): + left, right = self.popn(2) + op = inst.argval + supported_any = dict( + itertools.chain( + supported_tensor_comparison_ops.items(), + supported_const_comparison_ops.items(), + ) + ) + if ( + isinstance( + left, + ( + TensorVariable, + SymNodeVariable, + NNModuleVariable, + BaseListVariable, + UserDefinedVariable, + BaseUserFunctionVariable, + ConstDictVariable, + ), + ) + and 
isinstance(right, ConstantVariable) + and right.value is None + and op in supported_const_comparison_ops + ): + # is None + self.push( + ConstantVariable.create( + supported_const_comparison_ops[op](object(), right.value) + ) + ) + + elif ( + left.is_python_constant() + and right.is_python_constant() + and op in supported_any + ): + # constant fold + self.push( + ConstantVariable.create( + supported_any[op]( + left.as_python_constant(), right.as_python_constant() + ), + ) + ) + elif op in ("in", "not in"): + self.push(right.call_method(self, "__contains__", [left], {})) + if op == "not in": + self.UNARY_NOT(inst) + else: + self.push( + BuiltinVariable(supported_any[op]).call_function( + self, [left, right], {} + ) + ) + + def GET_ITER(self, inst): + self.call_function(BuiltinVariable(iter), [self.pop()], {}) + + @break_graph_if_unsupported(push=1) + def CALL_FUNCTION(self, inst): + args = self.popn(inst.argval) + fn = self.pop() + self.call_function(fn, args, {}) + + @break_graph_if_unsupported(push=1) + def CALL_FUNCTION_EX(self, inst): + kwargsvars: VariableTracker + if inst.argval == 0: + kwargsvars = ConstDictVariable({}) + argsvars = self.pop() + elif inst.argval == 1: + kwargsvars = self.pop() + argsvars = self.pop() + else: + unimplemented("CALL_FUNCTION_EX") + fn = self.pop() + if sys.version_info >= (3, 11): + null = self.pop() + assert isinstance(null, NullVariable) + + if ( + isinstance(fn, GetAttrVariable) + and isinstance(fn.obj, TensorVariable) + and fn.name == "view" + and isinstance(argsvars, (ConstantVariable, TensorVariable)) + ): + # Hack to handle special case in some bert models. Converts + # x.view(*shape) into x.view(shape), which is correct for view() + # but not generally. See test_transpose_for_scores(). + argsvars = TupleVariable([argsvars]) + + if not isinstance( + argsvars, BaseListVariable + ) and argsvars.has_unpack_var_sequence(self): + argsvars = TupleVariable(argsvars.unpack_var_sequence(self)) + + if not isinstance(argsvars, BaseListVariable) or not isinstance( + kwargsvars, ConstDictVariable + ): + unimplemented(f"non-static call {typestr(argsvars)} {typestr(kwargsvars)}") + + # Map to a dictionary of str -> VariableTracker + kwargsvars = kwargsvars.keys_as_python_constant() + self.call_function(fn, argsvars.items, kwargsvars) + + @break_graph_if_unsupported(push=1) + def CALL_FUNCTION_KW(self, inst): + argnames = self.pop() + args = self.popn(inst.argval) + fn = self.pop() + assert isinstance(argnames, TupleVariable) and argnames.is_python_constant() + argnames = argnames.as_python_constant() + args, kwargs_list = args[: -len(argnames)], args[-len(argnames) :] + kwargs = dict(zip(argnames, kwargs_list)) + assert len(kwargs) == len(argnames) + self.call_function(fn, args, kwargs) + + def LOAD_METHOD_SUPER(self, inst): + self.CALL_FUNCTION(dataclasses.replace(inst, argval=2)) + arg = inst.argval[0] + argval = self.code_options["co_names"][arg] + if sys.version_info < (3, 11): + self.LOAD_ATTR(dataclasses.replace(inst, argval=argval)) + else: + self.LOAD_METHOD(dataclasses.replace(inst, argval=argval)) + + def LOAD_ATTR_SUPER(self, inst): + self.CALL_FUNCTION(dataclasses.replace(inst, argval=2)) + arg = inst.argval[0] + argval = self.code_options["co_names"][arg] + self.LOAD_ATTR(dataclasses.replace(inst, argval=argval)) + + def LOAD_METHOD(self, inst): + self.LOAD_ATTR(inst) + obj = self.pop() + if sys.version_info >= (3, 11): + # always follow the NULL + fn convention, since if obj + # is actually a method, self is already bound to it, so it + # doesn't 
need to be passed in as an arg. + self.PUSH_NULL(inst) + self.push(obj) + else: + self.push(obj) + self.push(None) + + def CALL_METHOD(self, inst): + args = self.popn(inst.argval) + dummy = self.pop() + assert dummy is None + fn = self.pop() + self.call_function(fn, args, {}) + + def LOAD_ATTR(self, inst): + obj = self.pop() + result = BuiltinVariable(getattr).call_function( + self, [obj, ConstantVariable.create(inst.argval)], {} + ) + self.push(result) + + def STORE_ATTR(self, inst): + speculation = self.speculate() + if speculation.failed: + return self.store_attr_graph_break(inst) + val, obj = self.popn(2) + + if isinstance(obj, NNModuleVariable): + # We don't allow side effects during export + # https://github.com/pytorch/torchdynamo/issues/1475 + assert ( + not self.export + ), f"Mutating module attribute {inst.argval} during export." + + try: + BuiltinVariable(setattr).call_function( + self, [obj, ConstantVariable.create(inst.argval), val], {} + ) + return + except Unsupported as e: + if not self.should_compile_partial_graph(): + raise + log.debug("STORE_ATTR triggered compile", exc_info=True) + e.remove_from_stats() + e.add_to_stats("graph_break") + speculation.fail_and_restart_analysis() + + def store_attr_graph_break(self, inst): + self.output.compile_subgraph( + self, reason=GraphCompileReason("store_attr", [self.frame_summary()]) + ) + self.output.add_output_instructions([copy.copy(inst)]) + self.popn(2) + self.output.add_output_instructions( + self.create_call_resume_at(self.next_instruction) + ) + + def DELETE_ATTR(self, inst): + obj = self.pop() + BuiltinVariable(delattr).call_function( + self, [obj, ConstantVariable.create(inst.argval)], {} + ) + + def create_call_resume_at(self, offset): + raise AssertionError( + f"create_call_resume_at not overridden by subclass {type(self)}" + ) + + def should_compile_partial_graph(self) -> bool: + raise AssertionError( + f"should_compile_partial_graph not overridden by subclass {type(self)}" + ) + + @break_graph_if_unsupported(push=0) + def STORE_SUBSCR(self, inst): + val, obj, key = self.popn(3) + result = obj.call_method(self, "__setitem__", [key, val], {}) + + def BUILD_TUPLE(self, inst): + items = self.popn(inst.argval) + self.push(TupleVariable(items)) + + def BUILD_SLICE(self, inst): + items = self.popn(inst.argval) + self.push(SliceVariable(items)) + + def BUILD_LIST(self, inst): + items = self.popn(inst.argval) + self.push(ListVariable(items, mutable_local=MutableLocal())) + + def BUILD_SET(self, inst): + if config.inject_BUILD_SET_unimplemented_TESTING_ONLY: + unimplemented("missing: BUILD_SET") + items = self.popn(inst.argval) + new_set = SetVariable(items, mutable_local=MutableLocal()) + self.push(new_set) + + def BUILD_LIST_UNPACK(self, inst, cls=ListVariable): + seqs = self.popn(inst.argval) + items = list() + for seq in seqs: + try: + items.extend(seq.unpack_var_sequence(self)) + except NotImplementedError: + unimplemented(f"BUILD_LIST_UNPACK {seq}") + self.push(cls(items, mutable_local=MutableLocal())) + + def BUILD_TUPLE_UNPACK(self, inst): + self.BUILD_LIST_UNPACK(inst, cls=TupleVariable) + + BUILD_TUPLE_UNPACK_WITH_CALL = BUILD_TUPLE_UNPACK + + def BUILD_MAP(self, inst): + items = self.popn(inst.argval * 2) + d = dict(zip(items[::2], items[1::2])) + self.push(ConstDictVariable(d, mutable_local=MutableLocal())) + + def BUILD_MAP_UNPACK(self, inst): + items = self.popn(inst.argval) + # ensure everything is a dict + items = [BuiltinVariable(dict).call_function(self, [x], {}) for x in items] + result = dict() + for x in 
items: + assert isinstance(x, ConstDictVariable) + result.update(x.items) + self.push( + ConstDictVariable( + result, + mutable_local=MutableLocal(), + ) + ) + + BUILD_MAP_UNPACK_WITH_CALL = BUILD_MAP_UNPACK + + def BUILD_CONST_KEY_MAP(self, inst): + keys = self.pop() + values = self.popn(inst.argval) + assert isinstance(keys, TupleVariable) + assert keys.is_python_constant() + + keys = keys.unpack_var_sequence(self) + assert len(keys) == len(values) + + self.push( + ConstDictVariable( + dict(zip(keys, values)), + mutable_local=MutableLocal(), + ) + ) + + def MAP_ADD(self, inst): + k, v = self.popn(2) + assert inst.argval > 0 + obj = self.stack[-inst.arg].realize() + assert isinstance(obj, ConstDictVariable) + obj.call_method(self, "__setitem__", (k, v), {}) # type: ignore[arg-type] + + def SET_ADD(self, inst): + v = self.pop() + assert inst.argval > 0 + obj = self.stack[-inst.arg] + assert isinstance(obj, SetVariable) + assert obj.mutable_local + return obj.call_method(self, "add", [v], {}) + + def LIST_APPEND(self, inst): + v = self.pop() + assert inst.argval > 0 + obj = self.stack[-inst.arg].realize() + assert isinstance(obj, ListVariable) + assert obj.mutable_local + self.output.side_effects.mutation(obj) + obj.items.append(v) + + def MAKE_FUNCTION(self, inst): + flags = inst.arg + old_stack = list(self.stack) + if sys.version_info < (3, 11): + fn_name = self.pop() + code = self.pop() + if sys.version_info >= (3, 11): + # MAKE_FUNCTION behavior actually changed in 3.11, see + # https://github.com/python/cpython/pull/93189/ + assert hasattr(code.value, "co_qualname") # type: ignore[attr-defined] + fn_name = ConstantVariable.create(value=code.value.co_qualname) # type: ignore[attr-defined] + defaults = None + closure = None + annotations = None + kwdefaults = None + + if flags & 0x08: + closure = self.pop() + if flags & 0x04: + annotations = self.pop() + if flags & 0x02: + kwdefaults = self.pop() + if flags & 0x01: + defaults = self.pop() + + self.push( + NestedUserFunctionVariable( + fn_name, + code, + self.f_globals, + defaults, + kwdefaults, + annotations, + closure, + closure_scope=self, + ) + ) + + def UNPACK_SEQUENCE(self, inst): + seq = self.pop() + if isinstance(seq, TensorVariable): + val = seq.unpack_var_sequence(self, idxes=range(inst.argval)) + elif isinstance(seq, GetAttrVariable) and isinstance(seq.obj, TensorVariable): + # x, y = a.shape + proxy = getattr(seq.obj.as_proxy(), seq.name) + val = [wrap_fx_proxy(self, proxy[i]) for i in range(inst.argval)] + elif seq.has_unpack_var_sequence(self): + val = seq.unpack_var_sequence(self) + else: + unimplemented(f"UNPACK_SEQUENCE {seq}") + if len(val) != inst.argval: + unimplemented("UNPACK_SEQUENCE length mismatch") + for i in reversed(val): + self.push(i) + + def UNPACK_EX(self, inst): + assert 0 <= inst.argval <= 0xFFFF + prefix = inst.argval & 0xFF # low byte + suffix = inst.argval >> 8 # high byte + seq = self.pop() + if seq.has_unpack_var_sequence(self): + vals = list(seq.unpack_var_sequence(self)) + assert len(vals) >= prefix + suffix + vals_prefix = vals[:prefix] + vals_list = vals[prefix : len(vals) - suffix] + vals_suffix = vals[len(vals) - suffix :] + for item in reversed(vals_suffix): + self.push(item) + self.push(TupleVariable(vals_list)) + for item in reversed(vals_prefix): + self.push(item) + else: + unimplemented(f"UNPACK_EX {seq}") + + def NOP(self, inst): + pass + + def POP_TOP(self, inst): + self.pop() + + def ROT_TWO(self, inst): + a = self.pop() + b = self.pop() + self.push(a) + self.push(b) + + def 
ROT_THREE(self, inst): + a = self.pop() + b = self.pop() + c = self.pop() + self.push(a) + self.push(c) + self.push(b) + + def ROT_FOUR(self, inst): + a = self.pop() + b = self.pop() + c = self.pop() + d = self.pop() + self.push(a) + self.push(d) + self.push(c) + self.push(b) + + def DUP_TOP(self, inst): + a = self.pop() + self.push(a) + self.push(a) + + def DUP_TOP_TWO(self, inst): + a = self.pop() + b = self.pop() + self.push(b) + self.push(a) + self.push(b) + self.push(a) + + def FORMAT_VALUE(self, inst): + flags = inst.arg + if (flags & 0x04) == 0x04: + fmt_spec = self.pop() + else: + fmt_spec = ConstantVariable.create("") + + value = self.pop() + if isinstance(value, SymNodeVariable): + value = ConstantVariable.create(str(value.sym_num)) + if (flags & 0x03) == 0x01: + value = BuiltinVariable(str).call_function(self, [value], {}) + elif (flags & 0x03) == 0x02: + value = BuiltinVariable(repr).call_function(self, [value], {}) + elif (flags & 0x03) == 0x03: + value = BuiltinVariable(ascii).call_function(self, [value], {}) + + fmt_var = ConstantVariable.create("{:" + fmt_spec.as_python_constant() + "}") + + self.call_function(BuiltinVariable(str.format), [fmt_var, value], {}) + + def BUILD_STRING(self, inst): + format_string_parts: List[str] = [] + args: List[VariableTracker] = [] + kwargs: Dict[str, VariableTracker] = {} + for part in self.popn(inst.arg): + if isinstance(part, ConstantVariable): + format_string_parts.append("{}") + args.append(part) + elif isinstance(part, variables.StringFormatVariable): + format_string_parts.append(part.format_string) + args.extend(part.sym_args) + if set(kwargs.keys()) & set(part.sym_kwargs.keys()): + unimplemented( + f"BUILD_STRING key conflict {kwargs} & {part.sym_kwargs}" + ) + kwargs.update(part.sym_kwargs) + else: + unimplemented(f"BUILD_STRING {part}") + self.push( + variables.StringFormatVariable.create( + "".join(format_string_parts), args, kwargs + ) + ) + + def IS_OP(self, inst): + assert inst.argval == 0 or inst.argval == 1 + if inst.argval == 0: + new_argval = "is" + else: + new_argval = "is not" + new_inst = create_instruction("COMPARE_OP", argval=new_argval) + self.COMPARE_OP(new_inst) + + def CONTAINS_OP(self, inst): + assert inst.argval == 0 or inst.argval == 1 + left, right = self.popn(2) + op = inst.argval + self.push(right.call_method(self, "__contains__", [left], {})) + if op == 1: + self.UNARY_NOT(inst) + + def LIST_EXTEND(self, inst): + v = self.pop() + assert inst.argval > 0 + obj = self.stack[-inst.arg] + assert isinstance(obj, ListVariable) + assert obj.mutable_local + obj.call_method(self, "extend", [v], {}) + + def LIST_TO_TUPLE(self, inst): + self.push(BuiltinVariable(tuple).call_function(self, [self.pop()], {})) + + def DICT_MERGE(self, inst): + v = self.pop() + assert inst.argval > 0 + obj = self.stack[-inst.arg].realize() + assert isinstance(obj, ConstDictVariable) + assert obj.mutable_local + obj.call_method(self, "update", [v], {}) + + DICT_UPDATE = DICT_MERGE + + def GEN_START(self, inst): + self.pop() + + def GET_LEN(self, inst): + tos = self.stack[-1] + if tos.is_python_constant(): + self.push(ConstantVariable.create(len(tos.as_python_constant()))) + else: + self.push(tos.call_method(self, "__len__", [], {})) + + def MATCH_MAPPING(self, inst): + tos = self.stack[-1] + assert isinstance(tos, ConstDictVariable) + if isinstance(tos.items, collections.abc.Mapping): + self.push(ConstantVariable.create(True)) + else: + self.push(ConstantVariable.create(False)) + + def MATCH_SEQUENCE(self, inst): + tos = self.stack[-1] + 
assert tos.is_python_constant() + tos_value = tos.as_python_constant() + if isinstance(tos_value, collections.abc.Sequence) and not isinstance( + tos_value, (str, bytes, bytearray) + ): + self.push(ConstantVariable.create(True)) + else: + self.push(ConstantVariable.create(False)) + + def MATCH_KEYS(self, inst): + tos = self.stack[-1] + tos1 = self.stack[-2] + assert isinstance(tos1, ConstDictVariable) + + if all(k in tos1 for k in tos): # type: ignore[attr-defined] + self.push(TupleVariable([tos1.getitem_const(k) for k in tos])) # type: ignore[attr-defined] + if sys.version_info < (3, 11): + self.push(ConstantVariable.create(True)) + else: + self.push(ConstantVariable.create(None)) + if sys.version_info < (3, 11): + self.push(ConstantVariable.create(False)) + + def LOAD_ASSERTION_ERROR(self, inst): + unimplemented("assert with non-string message") + + UNARY_POSITIVE = stack_op(operator.pos) + UNARY_NEGATIVE = stack_op(operator.neg) + UNARY_NOT = stack_op(operator.not_) + UNARY_INVERT = stack_op(operator.invert) + + BINARY_POWER = stack_op(operator.pow) + BINARY_MULTIPLY = stack_op(operator.mul) + BINARY_MATRIX_MULTIPLY = stack_op(operator.matmul) + BINARY_FLOOR_DIVIDE = stack_op(operator.floordiv) + BINARY_TRUE_DIVIDE = stack_op(operator.truediv) + BINARY_MODULO = stack_op(operator.mod) + BINARY_REMAINDER = stack_op(operator.mod) + BINARY_ADD = stack_op(operator.add) + BINARY_SUBTRACT = stack_op(operator.sub) + BINARY_SUBSCR = break_graph_if_unsupported(push=1)(stack_op(operator.getitem)) + BINARY_LSHIFT = stack_op(operator.lshift) + BINARY_RSHIFT = stack_op(operator.rshift) + BINARY_AND = stack_op(operator.and_) + BINARY_OR = stack_op(operator.or_) + BINARY_XOR = stack_op(operator.xor) + + INPLACE_POWER = stack_op(operator.ipow) + INPLACE_MULTIPLY = stack_op(operator.imul) + INPLACE_MATRIX_MULTIPLY = stack_op(operator.imatmul) + INPLACE_FLOOR_DIVIDE = stack_op(operator.ifloordiv) + INPLACE_TRUE_DIVIDE = stack_op(operator.itruediv) + INPLACE_MODULO = stack_op(operator.imod) + INPLACE_REMAINDER = stack_op(operator.imod) + INPLACE_ADD = stack_op(operator.iadd) + INPLACE_SUBTRACT = stack_op(operator.isub) + INPLACE_LSHIFT = stack_op(operator.ilshift) + INPLACE_RSHIFT = stack_op(operator.irshift) + INPLACE_AND = stack_op(operator.iand) + INPLACE_XOR = stack_op(operator.ixor) + INPLACE_OR = stack_op(operator.ior) + + # 3.11 opcodes + def RESUME(self, inst): + if inst.arg == 0: + self.append_prefix_inst(inst) + self.accept_prefix_inst = False + else: + assert not self.accept_prefix_inst + + def BINARY_OP(self, inst): + if sys.version_info >= (3, 11): + opname = dis._nb_ops[inst.arg][0][3:] # type: ignore[attr-defined] + if opname.startswith("INPLACE"): + return getattr(self, "INPLACE_" + opname[8:])(inst) + return getattr(self, "BINARY_" + opname)(inst) + else: + unimplemented("BINARY_OP requires Python 3.11+") + + def PRECALL(self, inst): + pass + + def KW_NAMES(self, inst): + kw_names = self.code_options["co_consts"][inst.arg] + assert isinstance(kw_names, tuple) + for name in kw_names: + assert isinstance(name, str) + assert self.kw_names is None + self.kw_names = ConstantVariable.create(value=kw_names) # type: ignore[assignment] + + def PUSH_NULL(self, inst): + self.push(NullVariable()) + + @break_graph_if_unsupported(push=1) + def CALL(self, inst): + # see https://docs.python.org/3.11/library/dis.html#opcode-CALL + # for convention + contents = self.popn(inst.arg + 2) + if isinstance(contents[0], NullVariable): + fn = contents[1] + args = [] + else: + fn = contents[0] + args = [contents[1]] 
+ kw_names = self.kw_names.value if self.kw_names else () + if kw_names: + args = args + contents[2 : -len(kw_names)] + kwargs_list = contents[-len(kw_names) :] + kwargs = dict(zip(kw_names, kwargs_list)) + assert len(kwargs) == len(kw_names) + else: + args = args + contents[2:] + kwargs = {} + self.call_function(fn, args, kwargs) + self.kw_names = None + + def COPY(self, inst): + self.push(self.stack[-inst.arg]) + + def SWAP(self, inst): + self.stack[-1], self.stack[-inst.arg] = self.stack[-inst.arg], self.stack[-1] + + JUMP_BACKWARD = jump + JUMP_BACKWARD_NO_INTERRUPT = jump + + POP_JUMP_FORWARD_IF_TRUE = generic_jump(operator.truth, False) + POP_JUMP_BACKWARD_IF_TRUE = generic_jump(operator.truth, False) + POP_JUMP_FORWARD_IF_FALSE = generic_jump(operator.not_, False) + POP_JUMP_BACKWARD_IF_FALSE = generic_jump(operator.not_, False) + + def CACHE(self, inst): + pass + + def BEFORE_WITH(self, inst): + self.setup_or_before_with(inst) + + def setup_or_before_with(self, inst): + ctx = self.pop() + if not isinstance(ctx, ContextWrappingVariable): + unimplemented(f"{inst.opname} {ctx}") + + if isinstance(ctx, GenericContextWrappingVariable): + self.generic_context_manager_depth += 1 + + exit = WithExitFunctionVariable( + ctx, + inst.target, + ) + if sys.version_info >= (3, 11): + # see create_call_resume_at for block stack details + assert self.next_instruction + assert self.next_instruction.exn_tab_entry + target = self.next_instruction.exn_tab_entry.target + else: + target = inst.target + if isinstance(self, InstructionTranslator): + self.block_stack.append(BlockStackEntry(target, len(self.stack), ctx)) + else: + self.block_stack.append(BlockStackEntry(target)) + + self.push(exit) + self.push(ctx.enter(self)) + + def append_prefix_inst(self, inst): + assert self.accept_prefix_inst + self.prefix_insts.append(inst) + + def MAKE_CELL(self, inst): + self.append_prefix_inst(inst) + + def COPY_FREE_VARS(self, inst): + self.append_prefix_inst(inst) + + def RETURN_GENERATOR(self, inst): + self.append_prefix_inst(inst) + + def copy_graphstate(self) -> InstructionTranslatorGraphState: + """Create a checkpoint of the current state by copying everything""" + return InstructionTranslatorGraphState( + self.output.copy_graphstate(), + dict(self.symbolic_locals), + list(self.stack), + list(self.block_stack), + self.instruction_pointer, + self.current_instruction, + self.next_instruction, + self.lineno, + ) + + def restore_graphstate(self, state: InstructionTranslatorGraphState): + """Restore a checkpoint created by self.copy_graphstate()""" + ( + output_state, + self.symbolic_locals, + self.stack, + self.block_stack, + self.instruction_pointer, + self.current_instruction, + self.next_instruction, + self.lineno, + ) = state + self.output.restore_graphstate(output_state) + + def is_non_empty_graph(self): + if self.output.count_calls() > 1: + # perf optimization only + self.is_non_empty_graph = lambda: True # type: ignore[method-assign] + return True + return False + + def format_frame_summary(self, additional_stack_frames=None): + if additional_stack_frames is None: + additional_stack_frames = [] + return "".join( + traceback.format_list( + [self.frame_summary()] + list(reversed(additional_stack_frames)) + ) + ) + + def frame_summary(self): + return traceback.FrameSummary( + getattr(self.f_code, "co_filename", ""), + self.lineno, + getattr(self.f_code, "co_name", ""), + lookup_line=False, + ) + + def store_global_weakref_by_id(self, prefix, value): + global_name = self.output.install_global_by_id(prefix, 
weakref.ref(value)) + install_guard( + GlobalWeakRefSource(global_name).make_guard(GuardBuilder.WEAKREF_ALIVE) + ) + return global_name + + @property + def fake_mode(self): + return self.output.tracing_context.fake_mode + + def find_symbolic_locals_name(self, tensor_variable): + for key, value in self.symbolic_locals.items(): + if value is tensor_variable: + return key + return None + + @contextlib.contextmanager + def strict_translation_mode(self): + self.strict_checks_enabled = True + try: + yield + finally: + self.strict_checks_enabled = False + + def speculate(self) -> SpeculationEntry: + return self.speculation_log.next( + self.f_code.co_filename, self.lineno, self.instruction_pointer + ) + + def __init__( + self, + output: OutputGraph, + instructions: List[Instruction], + f_locals: Dict[str, Any], + f_globals: Dict[str, Any], + f_builtins: Dict[str, Any], + code_options: Dict[str, Any], + symbolic_locals: Dict[str, VariableTracker], + symbolic_globals: Dict[str, VariableTracker], + f_code: types.CodeType, + export: bool, + inline_depth: int, + speculation_log: SpeculationLog, + ): + super().__init__() + self.speculation_log = speculation_log + + # Mutable state checkpointed by copy_graphstate() + self.output = output + self.symbolic_locals = symbolic_locals + self.symbolic_globals = symbolic_globals + self.stack = [] + self.instruction_pointer = 0 + self.current_instruction = create_instruction("NOP") + self.next_instruction = None + self.block_stack = [] + # states before SETUP_WITH for checkpointing and fallback + self.generic_context_manager_depth = 0 + self.lineno = code_options["co_firstlineno"] + self.kw_names = None + self.accept_prefix_inst = True + self.prefix_insts = [] + + # Properties of the input/output code + self.instructions: List[Instruction] = instructions + self.indexof: Dict[Instruction, int] = get_indexof(self.instructions) + self.f_locals: Dict[ + str, Any + ] = f_locals # needed for recording accessed locals for replay + self.f_globals: Dict[str, Any] = f_globals + self.f_builtins: Dict[str, Any] = f_builtins + self.code_options: Dict[str, Any] = code_options + self.f_code: types.CodeType = f_code + + # Execution record for replaying errors + self.exec_recorder = ExecutionRecorder(code=f_code, code_options=code_options) + # Stack of module being parsed, current nn.module is at the end of ordered dict. + # The first field of tuple is the fully qualified name of current module + # in original hierarchy. The second field is the type of current nn.module + self.nn_module_stack: Dict[str, Tuple[str, Type[Any]]] = {} + # Flag to indicate whether tracing is used for export. 
+ self.export = export + + self.current_speculation = None + + self.strict_checks_enabled = False + + if sys.version_info >= (3, 10): + from .resume_execution import ( + CO_ASYNC_GENERATOR, + CO_COROUTINE, + CO_GENERATOR, + CO_ITERABLE_COROUTINE, + ) + + if f_code.co_flags & ( + CO_GENERATOR | CO_COROUTINE | CO_ITERABLE_COROUTINE | CO_ASYNC_GENERATOR + ): + self.push(BuiltinVariable(None)) + + self.inline_depth = inline_depth + self.inconsistent_side_effects = False + linecache.lazycache(f_code.co_filename, f_globals) + self.log_starts_line() + + +class InstructionTranslator(InstructionTranslatorBase): + mutated_closure_cell_contents: Set[str] + + @staticmethod + def current_tx() -> "InstructionTranslator": + return tls.current_tx + + @contextlib.contextmanager + def set_current_tx(self): + prior = getattr(tls, "current_tx", None) + tls.current_tx = self + try: + yield + finally: + tls.current_tx = prior + + def __init__( + self, + instructions: List[Instruction], + f_code, + f_locals, + f_globals, + f_builtins, + code_options, + compiler_fn, + one_graph, + export, + export_constraints, + mutated_closure_cell_contents: Set[str], + frame_state, + speculation_log: SpeculationLog, + ): + _step_logger()( + logging.INFO, + f"torchdynamo start tracing {f_code.co_name} {code_options['co_filename']}:{code_options['co_firstlineno']}", + ) + super().__init__( + output=OutputGraph( + code_options, + compiler_fn, + self, + export, + export_constraints, + frame_state, + local_scope=f_locals, + global_scope=f_globals, + f_code=f_code, + ), + instructions=instructions, + f_locals=f_locals, + f_globals=f_globals, + f_builtins=f_builtins, + code_options=code_options, + symbolic_locals={}, # set below + # A global var is inserted only after a STORE_GLOBAL happens to it + symbolic_globals={}, + f_code=f_code, + export=export, + inline_depth=0, + speculation_log=speculation_log, + ) + + self._throw_if_in_functorch() + + # as soon as we create the tracing context we should keep it active, so any calls + # into dynamo apis can rely on finding it + with tracing(self.output.tracing_context), self.set_current_tx(): + self.one_graph: bool = one_graph + self.export = export + self.mutated_closure_cell_contents = mutated_closure_cell_contents + if self.export: + assert ( + self.one_graph + ), "Export without one graph - something has gone wrong." 
+ + vars = list(code_options["co_varnames"]) + cells_and_freevars = [x for x in self.cell_and_freevars() if x not in vars] + vars.extend(cells_and_freevars) + cells_and_freevars_set = set(cells_and_freevars) + + self.symbolic_locals = { + k: variables.LazyVariableTracker.create( + f_locals[k], + source=LocalSource(k, cell_or_freevar=k in cells_and_freevars_set), + ) + for k in vars + if k in f_locals + } + self.debug_locals: List[Tuple[VariableTracker, List[VariableTracker]]] = [] + if export: + # export gets confused if we never realize unused inputs + # in export mode just eagerly realize everything + self.symbolic_locals = VariableTracker.apply( + lambda x: x.realize(), self.symbolic_locals + ) + + self._freevars_ids = dict() + for name in self.code_options["co_freevars"]: + if name in f_locals: + self._freevars_ids[name] = id(f_locals[name]) + + def _throw_if_in_functorch(self): + # Fallback to eager in case of a graph break inside vmap + eager = torch._dynamo.lookup_backend("eager") + compiler_fn = inspect.getattr_static( + self.output.compiler_fn, "compiler_fn", self.output.compiler_fn + ) + ci = torch._C._functorch.peek_interpreter_stack() + forbidden_keys = ( + torch._C._functorch.TransformType.Vmap, + torch._C._functorch.TransformType.Grad, + ) + if ci is not None and ci.key() in forbidden_keys and compiler_fn is not eager: + # if it reaches here, it means Dynamo failed to inline a functorch function + name = ci.key().name.lower() + msg = f"torch.func.{name}(fn) requires the function to be inlined by dynamo" + unimplemented(msg) + + def get_example_value(self, source: Source): + if isinstance(source, LocalSource): + return self.f_locals[source.local_name] + if isinstance(source, GlobalSource): + return self.f_globals[source.global_name] + raise KeyError() + + def run(self): + super().run() + + def match_nested_cell(self, name, cell): + """Match a cell in this method to one in a function we are inlining""" + try: + value = cell.cell_contents + except ValueError: + return None + # TODO(jansel): check the id of the cell rather than the contents + if id(value) != self._freevars_ids.get(name): + return None + return self.symbolic_locals[name] + + def should_compile_partial_graph(self): + return ( + all(b.can_restore() for b in self.block_stack) + and not self.one_graph + and self.generic_context_manager_depth == 0 + ) + + def create_call_resume_at(self, inst): + self.instruction_pointer = None + + if inst.opname == "RETURN_VALUE": + return [create_instruction("RETURN_VALUE")] + + reads = livevars_analysis(self.instructions, inst) + argnames = tuple( + k + for k in self.symbolic_locals.keys() + if k in reads and k not in self.cell_and_freevars() + ) + + cg = PyCodegen(self) + + # Python does not allow null to be an arg to a function, so + # we remove nulls from the stack and restore them in the + # prologue of the resume function + + # sorted list of indices of nulls on the stack + null_idxes: List[int] = [] + if sys.version_info >= (3, 11): + # find indices of NullVariables + for i, var in enumerate(self.stack): + if isinstance(var, NullVariable): + null_idxes.append(i) + # generate bytecode to pop the nulls + null_cnt = 0 + for i, var in enumerate(reversed(self.stack)): + if isinstance(var, NullVariable): + for j in range(2, i + 2 - null_cnt): + cg.append_output(create_instruction("SWAP", arg=j)) + cg.extend_output(cg.pop_null()) + null_cnt += 1 + + # we popped all nulls from the stack at runtime, + # so we should not count NullVariables + stack_len = len(self.stack) - len(null_idxes) 
+ nargs = stack_len + len(argnames) + + name = unique_id(f"__resume_at_{inst.offset}") + + new_code: types.CodeType = ContinueExecutionCache.lookup( + self.f_code, + self.lineno, + inst.offset, + tuple(b.target.offset for b in self.block_stack), + stack_len, + argnames, + tuple(b.resume_fn() for b in self.block_stack), + tuple(null_idxes), + ) + + # Add original GraphModule context to the resume function to handle + # the case of a graph break while tracing a GraphModule + orig_graphmodule_maybe = code_context.get_context(self.f_code).get( + "orig_graphmodule", lambda: None + )() + if orig_graphmodule_maybe is not None: + code_context.get_context(new_code)["orig_graphmodule"] = weakref.ref( + orig_graphmodule_maybe + ) + + if new_code.co_freevars: + cg.make_function_with_closure(name, new_code, True, stack_len) + else: + # This is safe: we pre-generate a unique name + self.output.install_global_unsafe( + name, types.FunctionType(new_code, self.f_globals, name) + ) + cg.extend_output(cg.load_function_name(name, True, stack_len)) + + cg.extend_output([cg.create_load(k) for k in argnames]) + cg.extend_output(create_call_function(nargs, False)) + cg.append_output(create_instruction("RETURN_VALUE")) + return cg.get_instructions() + + def symbolic_locals_contain_module_class(self): + for v in self.symbolic_locals.values(): + if isinstance(v, UserDefinedClassVariable) and issubclass( + v.as_python_constant(), torch.nn.Module + ): + return True + return False + + def RETURN_VALUE(self, inst): + if ( + self.output.count_calls() == 0 + and not self.inconsistent_side_effects + and not self.symbolic_locals_contain_module_class() + and not self.export + ): + raise exc.SkipFrame("because no content in function call") + self.instruction_pointer = None + _step_logger()( + logging.INFO, + f"torchdynamo done tracing {self.f_code.co_name} (RETURN_VALUE)", + ) + log.debug("RETURN_VALUE triggered compile") + self.output.compile_subgraph( + self, + reason=GraphCompileReason( + "return_value", [self.frame_summary()], graph_break=False + ), + ) + self.output.add_output_instructions([create_instruction("RETURN_VALUE")]) + + +class InliningInstructionTranslator(InstructionTranslatorBase): + """Trace and inline a called method""" + + symbolic_result: Optional[TensorVariable] + + @classmethod + def inline_call(cls, parent, func, args, kwargs): + with patch.dict(counters, {"unimplemented": counters["inline_call"]}): + return cls.inline_call_(parent, func, args, kwargs) + + @staticmethod + def check_inlineable(func): + if func.has_self(): + unimplemented("inline with __self__") + + result = trace_rules.check_verbose(func, is_inlined_call=True) + if result.skipped: + from torch._dynamo.variables.misc import produce_trampoline_autograd_apply + + # _origin marks this as coming from an internal dynamo known function that is safe to + # trace through. 
+ if hasattr(getattr(func, "fn", None), "_origin") and func.fn._origin in [ + produce_trampoline_autograd_apply, + ]: + # Known sound + return trace_rules.SkipResult( + False, "allowlist in dynamo known function" + ) + fn_qualname = func.fn.__qualname__ if hasattr(func, "fn") else "" + unimplemented( + f"'inline in skipfiles: {fn_qualname} | {func.get_name()} {func.get_filename()}, {result.reason}'" + ) + + if isinstance(func, UserFunctionVariable) and inspect.getattr_static( + func.get_function(), "_torchdynamo_disable", False + ): + unimplemented( + f"call torch._dynamo.disable() wrapped function {func.get_function()}" + ) + else: + return result + + @staticmethod + def inline_call_( + parent, func: VariableTracker, args: List[VariableTracker], kwargs + ): + if isinstance(func, SkipFunctionVariable): + unimplemented("inline with functions in skip files") + assert isinstance( + func, + (UserFunctionVariable, NestedUserFunctionVariable), + ) + result = InliningInstructionTranslator.check_inlineable(func) + assert result.skipped is False + try: + sub_locals, closure_cells = func.bind_args(parent, args, kwargs) + except TypeError as e: + # Wrap the general TypeError during bind_args() to the internal ArgsMismatchError with detailed info + raise ArgsMismatchError( # noqa: TRY200 + "{reason}.\n func = {func}, args = {args}, kwargs = {kwargs}".format( + reason=str(e), + func=f"'{func.get_name()}' {func.get_filename()}:{func.get_code().co_firstlineno}", + args=[arg.python_type() for arg in args], + kwargs=kwargs, + ), + ) + + for v in itertools.chain(sub_locals.values(), closure_cells.values()): + if not isinstance(v, VariableTracker): + unimplemented(f"unconverted arg {v}") + + code: types.CodeType = func.get_code() + if code.co_name in ("__setitem__", "__setattr__") and not ( + args is not None + and len(args) > 0 + and isinstance(args[0], variables.CustomizedDictVariable) + ): + unimplemented(f"inline {code.co_name}") + + suffix = "" + # TODO: mlazos, add support for enabling multiple artifact logs + # with a single alias + if torch._logging._internal.log_state.is_artifact_enabled("output_code"): + suffix = f"\n{dis.Bytecode(code).dis()}" + if sys.version_info >= (3, 11): + cur_inst = parent.current_instruction + parent_code = parent.f_code + header = parent.get_line_of_code_header(lineno=cur_inst.positions.lineno) + + def get_trace_call_log_str(): + line = get_instruction_source_311(parent_code, cur_inst).rstrip() + return f"TRACE inlined call {code.co_name} from {header}\n{line}" + + trace_call_log.debug("%s", LazyString(get_trace_call_log_str)) + log.debug("INLINING %s%s, %s", code, suffix, result.reason) + + # Detect inline GraphModule calls in order to propagate node metadata, + # by checking if the first argument (self) is a variable tracking a GraphModule. + if args and isinstance(args[0], NNModuleVariable): + module = parent.output.get_submodule(args[0].module_key) + if isinstance(module, torch.fx.GraphModule): + # The inline call might not actually be a call to `forward`, + # but it is enough to add a context for `forward` in case it is called. 
+ code_context.get_context(module.forward.__code__)[ + "orig_graphmodule" + ] = weakref.ref(module) + + tracer: InliningInstructionTranslator + if is_generator(code): + tracer = InliningGeneratorInstructionTranslator( + parent, code, sub_locals, parent.symbolic_globals, closure_cells, func + ) + else: + tracer = InliningInstructionTranslator( + parent, code, sub_locals, parent.symbolic_globals, closure_cells, func + ) + + strict_ctx: Any = contextlib.nullcontext() + if parent.strict_checks_enabled: + strict_ctx = tracer.strict_translation_mode() + try: + with strict_ctx: + tracer.run() + except exc.SkipFrame as e: + msg = f"SKIPPED INLINING {code}: {e}" + log.debug(msg) + raise Unsupported(msg) from e + except Exception as e: + log.debug("FAILED INLINING %s", code) + raise + assert tracer.symbolic_result is not None + func.export_freevars(parent, tracer) + + if tracer.f_globals is parent.f_globals: + # Merge symbolic_globals back if parent and child are in the same namespace + parent.symbolic_globals.update(tracer.symbolic_globals) + + parent.inconsistent_side_effects |= tracer.inconsistent_side_effects + + log.debug("DONE INLINING %s", code) + + if is_generator(code): + assert isinstance(tracer, InliningGeneratorInstructionTranslator) + assert tracer.symbolic_result.as_python_constant() is None + return ListIteratorVariable( + tracer.generated_items, + mutable_local=MutableLocal(), + ) + else: + return tracer.symbolic_result + + def __init__( + self, + parent: InstructionTranslatorBase, + code: types.CodeType, + symbolic_locals: Dict[str, VariableTracker], + symbolic_globals: Dict[str, VariableTracker], + closure_cells: Dict[str, VariableTracker], + funcvar: BaseUserFunctionVariable, + ): + f_globals = funcvar.get_globals() # type: ignore[attr-defined] + f_builtins = f_globals["__builtins__"] + if not isinstance(f_builtins, dict): + f_builtins = f_builtins.__dict__ + instructions = cleaned_instructions(code) + propagate_line_nums(instructions) + super().__init__( + output=parent.output, + f_locals={}, + f_globals=f_globals, + f_builtins=f_builtins, + symbolic_locals=symbolic_locals, + symbolic_globals=symbolic_globals, + instructions=instructions, + code_options={k: getattr(code, k) for k in dir(code)}, + f_code=code, + export=parent.export, + inline_depth=parent.inline_depth + 1, + speculation_log=parent.speculation_log, + ) + self.parent = parent + self.symbolic_result = None + self.closure_cells = closure_cells + self.nn_module_stack = parent.nn_module_stack.copy() + + @property + def fake_mode(self): + return self.parent.fake_mode + + def run_ctx_mgr(self): + return TracingContext.current_frame(self.parent.frame_summary()) + + def STORE_DEREF(self, inst): + if inst.argval in self.closure_cells: + cell = self.closure_cells[inst.argval] + val = self.pop() + if isinstance(cell, ClosureVariable): + if not self.output.is_root_tracer(): + unimplemented( + "HigherOrderOperator: Mutating a variable not in the current scope (ClosureVariable)" + ) + self.output.root_tx.symbolic_locals[cell.name] = val + else: + self.output.side_effects.store_cell(cell, val) + else: + maybe_cell = self.symbolic_locals.get(inst.argval) + if isinstance( + maybe_cell, + variables.NewCellVariable, + ): + self.output.side_effects.store_cell( + self.symbolic_locals[inst.argval], self.pop() + ) + else: + if ( + maybe_cell is not None + and maybe_cell.source.name() + not in self.output.root_tx.mutated_closure_cell_contents + ): + # Why is the source name here unique? 
+ # mutated_closure_cell_contents is a per-frame + # concept, and sources identify, e.g., particular + # locals from the frame. If you had two locals, + # they'll get different source names, and therefore + # differ here. + self.output.root_tx.mutated_closure_cell_contents.add( + maybe_cell.source.name() + ) + raise exc.UnspecializeRestartAnalysis() + unimplemented("write to __closure__ while inlining") + + def LOAD_DEREF(self, inst): + if inst.argval in self.closure_cells: + cell = self.closure_cells[inst.argval] + if isinstance(cell, ClosureVariable): + self.push(self.output.root_tx.symbolic_locals[cell.name]) + else: + self.push(self.output.side_effects.load_cell(cell)) + else: + maybe_sym_local = self.symbolic_locals.get(inst.argval, None) + if isinstance(maybe_sym_local, variables.NewCellVariable): + self.push(self.output.side_effects.load_cell(maybe_sym_local)) + else: + super().LOAD_DEREF(inst) + + def LOAD_CLOSURE(self, inst): + assert inst.argval in self.cell_and_freevars() + if inst.argval in self.closure_cells: + self.push(self.closure_cells[inst.argval]) + else: + self.push(InlinedClosureVariable(name=inst.argval)) + + def check_replace_is_safe(self, oldvar): + if not is_side_effect_safe(oldvar.mutable_local): + unimplemented( + "HigherOrderOperator: Mutating a variable not in the current scope (replace_all)" + ) + + def should_compile_partial_graph(self): + return False # inlining functions is all-or-nothing + + def create_call_resume_at(self, offset): + unimplemented("cant resume while inlining") + + def RETURN_VALUE(self, inst): + self.symbolic_result = self.pop() # type: ignore[assignment] + self.instruction_pointer = None + + +class InliningGeneratorInstructionTranslator(InliningInstructionTranslator): + generated_items: List[VariableTracker] + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.generated_items = [] + + def YIELD_VALUE(self, inst: Instruction): + self.generated_items.append(self.pop()) + # TODO(jansel): figure out why this is needed, it isn't in the docs for YIELD_VALUE + self.push(ConstantVariable.create(None)) + + def GET_YIELD_FROM_ITER(self, inst): + tos = self.stack[-1] + if not isinstance(tos, ListIteratorVariable): + self.pop() + res = BuiltinVariable(iter).call_function(self, [tos], {}) + self.push(res) + return self.YIELD_FROM(inst) + + def YIELD_FROM(self, inst): + while True: + tos = self.stack[-1].realize() + if isinstance(tos, ConstantVariable) and tos.value is None: + self.pop() + return + if isinstance( + tos, (variables.ListIteratorVariable, variables.IteratorVariable) + ): + try: + val, next_iter = tos.next_variables(self) + self.push(val) + # TODO(voz): Unclear if we need the push None in YIELD_VALUE? + self.YIELD_VALUE(inst) + self.pop() + self.push(next_iter) + except StopIteration: + return + else: + unimplemented(f"YIELD_FROM {typestr(tos)}") + + def SEND(self, inst): + assert len(self.stack) >= 2 + val = self.pop() + tos = self.stack[-1] + if isinstance(tos, ListIteratorVariable): + if isinstance(val, ConstantVariable) and val.value is None: + self.push(val) + self.instruction_pointer = self.indexof[inst.target] + else: + # invoke send + # Unreachable code - if you hit this, you are implementing generator support and have + # lifted the `unimplemented("generator")` in frame conversion. 
This codepath handles + # subgenerator and lines up with this line in Python 3.11 + # https://github.com/python/cpython/blob/3.11/Python/ceval.c#L2597 + unimplemented("Unreachable sub-generator code") + else: + unimplemented(f"SEND {typestr(tos)}") diff --git a/MLPY/Lib/site-packages/torch/_dynamo/tensor_version_op.py b/MLPY/Lib/site-packages/torch/_dynamo/tensor_version_op.py new file mode 100644 index 0000000000000000000000000000000000000000..5c20e1cd5ff504e40a14eb599d11880e61e9f9c1 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/tensor_version_op.py @@ -0,0 +1,57 @@ +import torch +from torch._prims import _make_prim, RETURN_TYPE +from torch._subclasses import FakeTensorMode +from torch._subclasses.functional_tensor import FunctionalTensorMode + +_tensor_version = _make_prim( + schema="_tensor_version(Tensor self) -> SymInt", + return_type=RETURN_TYPE.NEW, + meta=torch.ops.aten._version.default, + impl_aten=torch.ops.aten._version.default, + doc="Tracable unbacked SymInt version of torch.Tensor._version", +) + + +@_tensor_version.py_impl(FakeTensorMode) +def _tensor_version_fake(self): + """ + The initial dynamo capture of _tensor_version + _unsafe_set_version_counter turns the + `._version` into an unbacked SymInt so that we don't need to specialize on the `._version` + of input tensors to the graph. + """ + return self.fake_mode.shape_env.create_unbacked_symint() + + +_unsafe_set_version_counter = _make_prim( + schema="_unsafe_set_version_counter(Tensor self, SymInt version) -> ()", + return_type=RETURN_TYPE.NEW, + meta=lambda self, version: None, + impl_aten=torch._C._autograd._unsafe_set_version_counter, + doc="Tracable+SymInt version of torch._C._autograd._unsafe_set_version_counter", +) +torch.fx.node.has_side_effect(_unsafe_set_version_counter) + + +""" +When we functionalize _tensor_version + _unsafe_set_version_counter, +the ops disappear from the traced graph. We run them eagerly on the +fake tensors used for tracing, in order to get past asserts that would +fail in autograd. + +Why is this ok? +1) Versions on functional tensors don't make any sense since you can't mutate a functional tensor. +2) The whole point of version munging is to trick autograd into doing what we want, and after + AotAtuograd there is no longer any need for these ops. + +Note this is similar to how no_grad is handled. +""" + + +@_tensor_version.py_impl(FunctionalTensorMode) +def _tensor_version_functional(self): + return self._version + + +@_unsafe_set_version_counter.py_impl(FunctionalTensorMode) +def _unsafe_set_version_counter_functional(self, version): + torch._C._autograd._unsafe_set_version_counter(self, version) diff --git a/MLPY/Lib/site-packages/torch/_dynamo/test_case.py b/MLPY/Lib/site-packages/torch/_dynamo/test_case.py new file mode 100644 index 0000000000000000000000000000000000000000..939d811ef3b5572b74d254f109d7733ba79bec74 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/test_case.py @@ -0,0 +1,78 @@ +import contextlib +import importlib +import logging +import sys + +import torch +import torch.testing +from torch.testing._internal.common_utils import ( # type: ignore[attr-defined] + IS_WINDOWS, + TEST_WITH_CROSSREF, + TEST_WITH_TORCHDYNAMO, + TestCase as TorchTestCase, +) + +from . 
import config, reset, utils + +log = logging.getLogger(__name__) + + +def run_tests(needs=()): + from torch.testing._internal.common_utils import run_tests + + if ( + TEST_WITH_TORCHDYNAMO + or IS_WINDOWS + or TEST_WITH_CROSSREF + or sys.version_info >= (3, 12) + ): + return # skip testing + + if isinstance(needs, str): + needs = (needs,) + for need in needs: + if need == "cuda" and not torch.cuda.is_available(): + return + else: + try: + importlib.import_module(need) + except ImportError: + return + run_tests() + + +class TestCase(TorchTestCase): + _exit_stack: contextlib.ExitStack + + @classmethod + def tearDownClass(cls): + cls._exit_stack.close() + super().tearDownClass() + + @classmethod + def setUpClass(cls): + super().setUpClass() + cls._exit_stack = contextlib.ExitStack() # type: ignore[attr-defined] + cls._exit_stack.enter_context( # type: ignore[attr-defined] + config.patch( + raise_on_ctx_manager_usage=True, + suppress_errors=False, + log_compilation_metrics=False, + ), + ) + + def setUp(self): + self._prior_is_grad_enabled = torch.is_grad_enabled() + super().setUp() + reset() + utils.counters.clear() + + def tearDown(self): + for k, v in utils.counters.items(): + print(k, v.most_common()) + reset() + utils.counters.clear() + super().tearDown() + if self._prior_is_grad_enabled is not torch.is_grad_enabled(): + log.warning("Running test changed grad mode") + torch.set_grad_enabled(self._prior_is_grad_enabled) diff --git a/MLPY/Lib/site-packages/torch/_dynamo/test_minifier_common.py b/MLPY/Lib/site-packages/torch/_dynamo/test_minifier_common.py new file mode 100644 index 0000000000000000000000000000000000000000..46e7a272ff888aded2dd3c49354cbefa82e7f5ad --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/test_minifier_common.py @@ -0,0 +1,244 @@ +import dataclasses +import io +import logging +import os +import re +import shutil +import subprocess +import sys +import tempfile +import traceback +from typing import Optional +from unittest.mock import patch + +import torch +import torch._dynamo +import torch._dynamo.test_case +from torch.utils._traceback import report_compile_source_on_error + + +@dataclasses.dataclass +class MinifierTestResult: + minifier_code: str + repro_code: str + + def _get_module(self, t): + match = re.search(r"class Repro\(torch\.nn\.Module\):\s+([ ].*\n| *\n)+", t) + assert match is not None, "failed to find module" + r = match.group(0) + r = re.sub(r"\s+$", "\n", r, flags=re.MULTILINE) + r = re.sub(r"\n{3,}", "\n\n", r) + return r.strip() + + def minifier_module(self): + return self._get_module(self.minifier_code) + + def repro_module(self): + return self._get_module(self.repro_code) + + +class MinifierTestBase(torch._dynamo.test_case.TestCase): + DEBUG_DIR = tempfile.mkdtemp() + + @classmethod + def setUpClass(cls): + super().setUpClass() + cls._exit_stack.enter_context( # type: ignore[attr-defined] + torch._dynamo.config.patch(debug_dir_root=cls.DEBUG_DIR) + ) + # These configurations make new process startup slower. Disable them + # for the minification tests to speed them up. 
+ cls._exit_stack.enter_context( # type: ignore[attr-defined] + torch._inductor.config.patch( + { + # https://github.com/pytorch/pytorch/issues/100376 + "pattern_matcher": False, + # multiprocess compilation takes a long time to warmup + "compile_threads": 1, + # https://github.com/pytorch/pytorch/issues/100378 + "cpp.vec_isa_ok": False, + } + ) + ) + + @classmethod + def tearDownClass(cls): + if os.getenv("PYTORCH_KEEP_TMPDIR", "0") != "1": + shutil.rmtree(cls.DEBUG_DIR) + else: + print(f"test_minifier_common tmpdir kept at: {cls.DEBUG_DIR}") + cls._exit_stack.close() # type: ignore[attr-defined] + + def _gen_codegen_fn_patch_code(self, device, bug_type): + assert bug_type in ("compile_error", "runtime_error", "accuracy") + return f"""\ +{torch._dynamo.config.codegen_config()} +{torch._inductor.config.codegen_config()} +torch._inductor.config.{"cpp" if device == "cpu" else "triton"}.inject_relu_bug_TESTING_ONLY = {bug_type!r} +""" + + def _maybe_subprocess_run(self, args, *, isolate, cwd=None): + if not isolate: + assert len(args) >= 2, args + assert args[0] == "python3", args + if args[1] == "-c": + assert len(args) == 3, args + code = args[2] + args = ["-c"] + else: + assert len(args) >= 2, args + with open(args[1]) as f: + code = f.read() + args = args[1:] + + # WARNING: This is not a perfect simulation of running + # the program out of tree. We only interpose on things we KNOW we + # need to handle for tests. If you need more stuff, you will + # need to augment this appropriately. + + # NB: Can't use save_config because that will omit some fields, + # but we must save and reset ALL fields + dynamo_config = torch._dynamo.config.shallow_copy_dict() + inductor_config = torch._inductor.config.shallow_copy_dict() + try: + stderr = io.StringIO() + log_handler = logging.StreamHandler(stderr) + log = logging.getLogger("torch._dynamo") + log.addHandler(log_handler) + try: + prev_cwd = os.getcwd() + if cwd is not None: + os.chdir(cwd) + with patch("sys.argv", args), report_compile_source_on_error(): + exec(code, {"__name__": "__main__", "__compile_source__": code}) + rc = 0 + except Exception: + rc = 1 + traceback.print_exc(file=stderr) + finally: + log.removeHandler(log_handler) + if cwd is not None: + os.chdir(prev_cwd) # type: ignore[possibly-undefined] + # Make sure we don't leave buggy compiled frames lying + # around + torch._dynamo.reset() + finally: + torch._dynamo.config.load_config(dynamo_config) + torch._inductor.config.load_config(inductor_config) + + # TODO: return a more appropriate data structure here + return subprocess.CompletedProcess( + args, + rc, + b"", + stderr.getvalue().encode("utf-8"), + ) + else: + return subprocess.run(args, capture_output=True, cwd=cwd, check=False) + + # Run `code` in a separate python process. + # Returns the completed process state and the directory containing the + # minifier launcher script, if `code` outputted it. 
+ def _run_test_code(self, code, *, isolate): + proc = self._maybe_subprocess_run( + ["python3", "-c", code], isolate=isolate, cwd=self.DEBUG_DIR + ) + + print("test stdout:", proc.stdout.decode("utf-8")) + print("test stderr:", proc.stderr.decode("utf-8")) + repro_dir_match = re.search( + r"(\S+)minifier_launcher.py", proc.stderr.decode("utf-8") + ) + if repro_dir_match is not None: + return proc, repro_dir_match.group(1) + return proc, None + + # Runs the minifier launcher script in `repro_dir` + def _run_minifier_launcher(self, repro_dir, isolate, *, minifier_args=()): + self.assertIsNotNone(repro_dir) + launch_file = os.path.join(repro_dir, "minifier_launcher.py") + with open(launch_file) as f: + launch_code = f.read() + self.assertTrue(os.path.exists(launch_file)) + + args = ["python3", launch_file, "minify", *minifier_args] + if not isolate: + args.append("--no-isolate") + launch_proc = self._maybe_subprocess_run(args, isolate=isolate, cwd=repro_dir) + print("minifier stdout:", launch_proc.stdout.decode("utf-8")) + stderr = launch_proc.stderr.decode("utf-8") + print("minifier stderr:", stderr) + self.assertNotIn("Input graph did not fail the tester", stderr) + + return launch_proc, launch_code + + # Runs the repro script in `repro_dir` + def _run_repro(self, repro_dir, *, isolate=True): + self.assertIsNotNone(repro_dir) + repro_file = os.path.join(repro_dir, "repro.py") + with open(repro_file) as f: + repro_code = f.read() + self.assertTrue(os.path.exists(repro_file)) + + repro_proc = self._maybe_subprocess_run( + ["python3", repro_file], isolate=isolate, cwd=repro_dir + ) + print("repro stdout:", repro_proc.stdout.decode("utf-8")) + print("repro stderr:", repro_proc.stderr.decode("utf-8")) + return repro_proc, repro_code + + # Template for testing code. + # `run_code` is the code to run for the test case. + # `patch_code` is the code to be patched in every generated file; usually + # just use this to turn on bugs via the config + def _gen_test_code(self, run_code, repro_after, repro_level): + return f"""\ +import torch +import torch._dynamo +{torch._dynamo.config.codegen_config()} +{torch._inductor.config.codegen_config()} +torch._dynamo.config.repro_after = "{repro_after}" +torch._dynamo.config.repro_level = {repro_level} +torch._dynamo.config.debug_dir_root = "{self.DEBUG_DIR}" +{run_code} +""" + + # Runs a full minifier test. + # Minifier tests generally consist of 3 stages: + # 1. Run the problematic code + # 2. Run the generated minifier launcher script + # 3. 
Run the generated repro script + # + # If possible, you should run the test with isolate=False; use + # isolate=True only if the bug you're testing would otherwise + # crash the process + def _run_full_test( + self, run_code, repro_after, expected_error, *, isolate, minifier_args=() + ) -> Optional[MinifierTestResult]: + if isolate: + repro_level = 3 + elif expected_error is None or expected_error == "AccuracyError": + repro_level = 4 + else: + repro_level = 2 + test_code = self._gen_test_code(run_code, repro_after, repro_level) + print("running test", file=sys.stderr) + test_proc, repro_dir = self._run_test_code(test_code, isolate=isolate) + if expected_error is None: + # Just check that there was no error + self.assertEqual(test_proc.returncode, 0) + self.assertIsNone(repro_dir) + return None + # NB: Intentionally do not test return code; we only care about + # actually generating the repro, we don't have to crash + self.assertIn(expected_error, test_proc.stderr.decode("utf-8")) + self.assertIsNotNone(repro_dir) + print("running minifier", file=sys.stderr) + minifier_proc, minifier_code = self._run_minifier_launcher( + repro_dir, isolate=isolate, minifier_args=minifier_args + ) + print("running repro", file=sys.stderr) + repro_proc, repro_code = self._run_repro(repro_dir, isolate=isolate) + self.assertIn(expected_error, repro_proc.stderr.decode("utf-8")) + self.assertNotEqual(repro_proc.returncode, 0) + return MinifierTestResult(minifier_code=minifier_code, repro_code=repro_code) diff --git a/MLPY/Lib/site-packages/torch/_dynamo/testing.py b/MLPY/Lib/site-packages/torch/_dynamo/testing.py new file mode 100644 index 0000000000000000000000000000000000000000..4d9fed83380a5fdab8a73ba1a38e13249dc83a23 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/testing.py @@ -0,0 +1,372 @@ +import contextlib +import dis +import functools +import logging +import os.path +import random +import re +import sys +import types +import unittest +from typing import List, Optional, Sequence, Union +from unittest.mock import patch + +np: Optional[types.ModuleType] = None +try: + import numpy as np +except ModuleNotFoundError: + np = None + +import torch +from torch import fx +from torch._dynamo.output_graph import OutputGraph + +from . import config, eval_frame, optimize_assert, reset +from .bytecode_transformation import ( + create_instruction, + debug_checks, + is_generator, + transform_code_object, +) +from .guards import CheckFunctionManager, GuardedCode +from .utils import same + +unsupported = eval_frame.unsupported +three = 3 + +log = logging.getLogger(__name__) + + +def clone_me(x): + if x is None: + return None + return x.detach().clone().requires_grad_(x.requires_grad) + + +def named_parameters_for_optimized_module(mod): + assert isinstance(mod, eval_frame.OptimizedModule) + return mod._orig_mod.named_parameters + + +def named_buffers_for_optimized_module(mod): + assert isinstance(mod, eval_frame.OptimizedModule) + return mod._orig_mod.named_buffers + + +def remove_optimized_module_prefix(name) -> str: + return re.sub(r"^_orig_mod[.]", "", name) + + +def collect_results(model, prediction, loss, example_inputs): + results = [] + results.append(prediction) + results.append(loss) + # if isinstance(loss, torch.Tensor) and loss.item() > 1: + # log.warning( + # f"High loss value alert - {loss:.2f}. Can result in unstable gradients." 
+ # ) + + grads = dict() + params = dict() + for name, param in model.named_parameters(): + if isinstance(model, eval_frame.OptimizedModule): + name = remove_optimized_module_prefix(name) + param_copy = param + grad = param.grad + # Treat None and zero grad as same + if param.grad is None: + grad = torch.zeros_like(param) + grads[name + ".grad"] = grad + params[name] = param_copy + results.append(grads) + results.append(params) + buffers = dict() + for name, buffer in model.named_buffers(): + if isinstance(model, eval_frame.OptimizedModule): + name = remove_optimized_module_prefix(name) + buffers[name] = buffer + results.append(buffers) + for example in example_inputs: + if isinstance(example, (tuple, list)): + for inp in example: + if isinstance(inp, torch.Tensor): + results.append(inp.grad) + else: + if isinstance(example, torch.Tensor): + results.append(example.grad) + return results + + +def requires_bwd_pass(out): + if isinstance(out, torch.Tensor): + return out.requires_grad + elif isinstance(out, (list, tuple)): + return any(requires_bwd_pass(x) for x in out) + elif out is None: + return False + elif isinstance(out, int): + return False + raise NotImplementedError("Don't know how to reduce", type(out)) + + +def reduce_to_scalar_loss(out): + """Reduce the output of a model to get scalar loss""" + if isinstance(out, torch.Tensor): + # Mean does not work on integer tensors + return out.sum() / out.numel() + elif isinstance(out, (list, tuple)): + return sum([reduce_to_scalar_loss(x) for x in out]) / len(out) + elif type(out).__name__ in ( + "MaskedLMOutput", + "Seq2SeqLMOutput", + "CausalLMOutputWithCrossAttentions", + ): + return reduce_to_scalar_loss(out.logits) + elif type(out).__name__ == "SquashedNormal": + return out.mean.sum() + elif isinstance(out, dict): + return sum([reduce_to_scalar_loss(value) for value in out.values()]) / len( + out.keys() + ) + raise NotImplementedError("Don't know how to reduce", type(out)) + + +def debug_dir() -> str: + path = os.path.join(os.path.dirname(__file__), "../debug") + if not os.path.exists(path): + os.mkdir(path) + return path + + +def debug_dump(name, code: types.CodeType, extra="") -> None: + with open(os.path.join(debug_dir(), name), "w") as fd: + fd.write( + f"{dis.Bytecode(code).info()}\n\n{dis.Bytecode(code).dis()}\n\n{extra}\n" + ) + + +def debug_insert_nops( + frame, cache_size, hooks, _, *, skip: int = 0 +) -> Optional[GuardedCode]: + """used to debug jump updates""" + + def insert_nops(instructions, code_options): + instructions.insert(0, create_instruction("NOP")) + instructions.insert(0, create_instruction("NOP")) + + if is_generator(frame.f_code): + return None + + debug_checks(frame.f_code) + code = transform_code_object(frame.f_code, insert_nops) + graph = OutputGraph( + code_options={}, + compiler_fn=None, + root_tx=None, + export=False, + export_constraints=None, + frame_state={"_id": 0}, + # TODO: shouldn't this be f_locals/f_globals from frame? 
+ local_scope=locals(), + global_scope=globals(), + f_code=frame.f_code, + ) + + return GuardedCode(code, CheckFunctionManager(graph).check_fn) + + +class CompileCounter: + def __init__(self): + self.frame_count = 0 + self.op_count = 0 + + def __call__(self, gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]): + self.frame_count += 1 + for node in gm.graph.nodes: + if "call" in node.op: + self.op_count += 1 + return gm.forward + + def clear(self): + self.frame_count = 0 + self.op_count = 0 + + +class CompileCounterWithBackend: + def __init__(self, backend): + self.frame_count = 0 + self.op_count = 0 + self.backend = backend + self.graphs = [] + + def __call__(self, gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]): + from .backends.registry import lookup_backend + + self.frame_count += 1 + for node in gm.graph.nodes: + if "call" in node.op: + self.op_count += 1 + self.graphs.append(gm) + return lookup_backend(self.backend)(gm, example_inputs) + + +# Equivalent to backend="eager", but also records graphs that +# we can assert on +class EagerAndRecordGraphs: + def __init__(self): + self.graphs = [] + + def __call__(self, gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor]): + self.graphs.append(gm) + return gm + + +def strip_comment(code) -> str: + code = str(code) + return re.sub(r"(?m)^ *#.*\n?", "", code) + + +def remove_trailing_space(code) -> str: + return "\n".join([line.rstrip() for line in code.split("\n")]) + + +def normalize_gm(gm_str) -> str: + # strip comments as comments have path to files which may differ from + # system to system. + return remove_trailing_space(strip_comment(gm_str)) + + +def standard_test( + self, + fn, + nargs, + expected_ops=None, + expected_ops_dynamic=None, + expected_frame_count=1, +): + if not config.assume_static_by_default and expected_ops_dynamic is not None: + expected_ops = expected_ops_dynamic + + actual = CompileCounter() + + args1 = [torch.randn(10, 10) for _ in range(nargs)] + args2 = [torch.randn(10, 10) for _ in range(nargs)] + correct1 = fn(*args1) + correct2 = fn(*args2) + reset() + opt_fn = optimize_assert(actual)(fn) + val1a = opt_fn(*args1) + val2a = opt_fn(*args2) + val1b = opt_fn(*args1) + val2b = opt_fn(*args2) + reset() + self.assertTrue(same(val1a, correct1)) + self.assertTrue(same(val1b, correct1)) + self.assertTrue(same(val2a, correct2)) + self.assertTrue(same(val2b, correct2)) + self.assertEqual(actual.frame_count, expected_frame_count) + if expected_ops is not None: + self.assertEqual(actual.op_count, expected_ops) + + +def dummy_fx_compile(gm: fx.GraphModule, example_inputs): + return gm.forward + + +def format_speedup(speedup, pvalue, is_correct=True, pvalue_threshold=0.1): + if not is_correct: + return "ERROR" + if pvalue > pvalue_threshold: + return f"{speedup:.3f}x SAME" + return f"{speedup:.3f}x p={pvalue:.2f}" + + +def rand_strided( + size: Sequence[int], + stride: Sequence[int], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + extra_size: int = 0, +): + needed_size = ( + sum((shape - 1) * stride for shape, stride in zip(size, stride)) + + 1 + + extra_size + ) + if dtype.is_floating_point: + buffer = torch.randn(needed_size, dtype=dtype, device=device) + else: + buffer = torch.zeros(size=[needed_size], dtype=dtype, device=device) + return torch.as_strided(buffer, size, stride) + + +def _make_fn_with_patches(fn, *patches): + @functools.wraps(fn) + def _fn(*args, **kwargs): + with contextlib.ExitStack() as stack: + for module, attr, val in patches: + 
stack.enter_context(patch.object(module, attr, val)) + + return fn(*args, **kwargs) + + return _fn + + +def make_test_cls_with_patches(cls, cls_prefix, fn_suffix, *patches, xfail_prop=None): + DummyTestClass = type(f"{cls_prefix}{cls.__name__}", cls.__bases__, {}) + DummyTestClass.__qualname__ = DummyTestClass.__name__ + + for name in dir(cls): + if name.startswith("test_"): + fn = getattr(cls, name) + if not callable(fn): + setattr(DummyTestClass, name, getattr(cls, name)) + continue + new_name = f"{name}{fn_suffix}" + new_fn = _make_fn_with_patches(fn, *patches) + new_fn.__name__ = new_name + if xfail_prop is not None and hasattr(fn, xfail_prop): + new_fn = unittest.expectedFailure(new_fn) + setattr(DummyTestClass, new_name, new_fn) + # NB: Doesn't handle slots correctly, but whatever + elif not hasattr(DummyTestClass, name): + setattr(DummyTestClass, name, getattr(cls, name)) + + return DummyTestClass + + +# test Python 3.11+ specific features +def skipIfNotPy311(fn): + if sys.version_info >= (3, 11): + return fn + return unittest.skip(fn) + + +# Controls tests generated in test/inductor/test_torchinductor_dynamic_shapes.py +# and test/dynamo/test_dynamic_shapes.py +def expectedFailureDynamic(fn): + fn._expected_failure_dynamic = True + return fn + + +# Controls tests generated in test/inductor/test_torchinductor_codegen_dynamic_shapes.py +def expectedFailureCodegenDynamic(fn): + fn._expected_failure_codegen_dynamic = True + return fn + + +# Controls test generated in test/inductor/test_cpp_wrapper.py +def expectedFailureDynamicWrapper(fn): + fn._expected_failure_dynamic_wrapper = True + return fn + + +def reset_rng_state(use_xla=False): + torch.manual_seed(1337) + random.seed(1337) + if np: + np.random.seed(1337) + if use_xla: + import torch_xla.core.xla_model as xm + + xm.set_rng_state(1337, str(xm.xla_device())) diff --git a/MLPY/Lib/site-packages/torch/_dynamo/trace_rules.py b/MLPY/Lib/site-packages/torch/_dynamo/trace_rules.py new file mode 100644 index 0000000000000000000000000000000000000000..aa3638568af181f1293c225f03b2a8aec3df99ba --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/trace_rules.py @@ -0,0 +1,3460 @@ +import _collections_abc +import _weakrefset +import abc +import builtins +import collections +import contextlib +import copy +import copyreg +import dataclasses +import enum +import functools +import importlib +import inspect +import itertools +import linecache +import logging +import multiprocessing +import operator +import os +import posixpath +import random +import re +import selectors +import signal +import sys +import tempfile +import threading +import tokenize +import traceback +import types +import typing +import unittest +import weakref +from collections import defaultdict +from typing import Any, Callable, cast, Dict, List, Optional, Set, Union + +np: Optional[types.ModuleType] = None +try: + import numpy as np +except ModuleNotFoundError: + pass + +import torch +import torch._inductor.test_operators +import torch.distributed +import torch.utils._content_store +from ..utils import _config_module +from .utils import getfile, hashable, NP_SUPPORTED_MODULES, unwrap_if_wrapper + +from .variables import ( + BuiltinVariable, + FunctorchHigherOrderVariable, + NestedUserFunctionVariable, + SkipFunctionVariable, + TorchInGraphFunctionVariable, + UserFunctionVariable, + UserMethodVariable, +) + +from .variables.base import VariableTracker + + +""" +Map of function objects to their tracing rules (Dynamo variables). 
+* TorchInGraphFunctionVariable: The functions should be put into the FX graph or can be constant folded. E.g., + - torch.add: should be put into the FX graph. + - torch.is_floating_point: constant folded. +* SkipFunctionVariable: The objects should be skipped from tracing. +* UserFunctionVariable: The functions should be inlined. + +For developers: If you add/remove a torch level API, it may trigger failures from +test/dynamo/test_trace_rules.py:test_torch_name_rule_map_updated. To fix the failures: +If you are adding a new torch level API or Dynamo implementation: +* Add the name with the corresponding tracing rule to this map + if you are adding a new in graph function or Dynamo implementation for an existing function. +* Remove the object name from test/dynamo/test_trace_rules.ignored_c_binding_in_graph_function_names if it's there. + +If you are removing an existing torch level API: +* Remove the entry represented the API from this map or test/dynamo/test_trace_rules.ignored_c_binding_in_graph_function_names + depends on where it is. + + +""" +manual_torch_name_rule_map = { + "torch.onnx.is_in_onnx_export": TorchInGraphFunctionVariable, + "torch.onnx.operators.shape_as_tensor": TorchInGraphFunctionVariable, + "torch.overrides.is_tensor_like": TorchInGraphFunctionVariable, + "torch.jit.is_scripting": TorchInGraphFunctionVariable, + "torch.jit.is_tracing": TorchInGraphFunctionVariable, + "torch.jit.annotate": TorchInGraphFunctionVariable, + "torch.distributed.is_available": TorchInGraphFunctionVariable, + "torch.distributed.is_initialized": TorchInGraphFunctionVariable, + "torch.distributed.get_rank": TorchInGraphFunctionVariable, + "torch.distributed.get_world_size": TorchInGraphFunctionVariable, + "torch.distributed._tensor.api.DTensor#from_local": TorchInGraphFunctionVariable, + "torch.distributed.distributed_c10d._get_group_size_by_name": TorchInGraphFunctionVariable, + "torch.distributed.distributed_c10d._resolve_group_name_by_ranks_and_tag": TorchInGraphFunctionVariable, + "torch.distributed.distributed_c10d._get_group_tag": TorchInGraphFunctionVariable, + "torch.distributed.distributed_c10d.get_process_group_ranks": TorchInGraphFunctionVariable, + "torch._utils.is_compiling": TorchInGraphFunctionVariable, + "torch.overrides.get_default_nowrap_functions": TorchInGraphFunctionVariable, + "torch.fx._symbolic_trace.is_fx_tracing": TorchInGraphFunctionVariable, + "torch._dynamo.external_utils.is_compiling": TorchInGraphFunctionVariable, + "torch.compiler.is_compiling": TorchInGraphFunctionVariable, + "torch.compiler.is_dynamo_compiling": TorchInGraphFunctionVariable, + "torch.autograd._profiler_enabled": SkipFunctionVariable, + # We graph break on RNG state setters or getters like + # `torch.get_rng_state` or `torch.set_rng_state`. These functions + # are not aten operations and therefore they are completely ignored + # by the AOT dispatcher. As a result, the AOT graph does not have + # these setter or getter functions, producing an incorrect graph + # when it comes to rng states. 
+ "torch.default_generator#get_state": SkipFunctionVariable, + "torch._C.Generator#get_state": SkipFunctionVariable, + "torch.get_rng_state": SkipFunctionVariable, + "torch.cuda.get_rng_state": SkipFunctionVariable, + "torch.default_generator#set_state": SkipFunctionVariable, + "torch._C.Generator#set_state": SkipFunctionVariable, + "torch.set_rng_state": SkipFunctionVariable, + "torch.cuda.set_rng_state": SkipFunctionVariable, + # https://github.com/pytorch/pytorch/issues/107187 + "torch.manual_seed": SkipFunctionVariable, + # https://github.com/pytorch/pytorch/issues/93501 + "torch.nn.utils.rnn.pack_padded_sequence": SkipFunctionVariable, + "torch.nn.Parameter": TorchInGraphFunctionVariable, + "torch._nested_tensor_from_mask": SkipFunctionVariable, + "torch._nested_from_padded": SkipFunctionVariable, + # symbol operators implemented in Python + "torch.sym_not": TorchInGraphFunctionVariable, + "torch.sym_float": TorchInGraphFunctionVariable, + "torch.sym_int": TorchInGraphFunctionVariable, + "torch.sym_max": TorchInGraphFunctionVariable, + "torch.sym_min": TorchInGraphFunctionVariable, + "torch.sym_sqrt": TorchInGraphFunctionVariable, + "torch.sym_ite": TorchInGraphFunctionVariable, + "torch.Tensor#_make_wrapper_subclass": SkipFunctionVariable, + "torch.Tensor#__init__": SkipFunctionVariable, + "torch.cuda.set_device": SkipFunctionVariable, + "torch.cuda.current_device": SkipFunctionVariable, + "torch._C.autocast_decrement_nesting": SkipFunctionVariable, + "torch._C.autocast_increment_nesting": SkipFunctionVariable, + "torch.autograd.grad": SkipFunctionVariable, + "torch._C.clear_autocast_cache": SkipFunctionVariable, + "torch.distributions.constraints.is_dependent": SkipFunctionVariable, + "torch.jit.isinstance": SkipFunctionVariable, + "torch._C.set_anomaly_enabled": SkipFunctionVariable, + "torch._C.set_autocast_cache_enabled": SkipFunctionVariable, + "torch._C.set_autocast_cpu_dtype": SkipFunctionVariable, + "torch._C.set_autocast_cpu_enabled": SkipFunctionVariable, + "torch._C.set_autocast_enabled": SkipFunctionVariable, + "torch._C.set_autocast_gpu_dtype": SkipFunctionVariable, + "torch._C.set_autocast_ipu_dtype": SkipFunctionVariable, + "torch._C.set_autocast_ipu_enabled": SkipFunctionVariable, + "torch._C.set_autocast_xla_dtype": SkipFunctionVariable, + "torch._C.set_autocast_xla_enabled": SkipFunctionVariable, + "torch.resize_as_": SkipFunctionVariable, + "torch.resize_as_sparse_": SkipFunctionVariable, + "torch.get_default_device": TorchInGraphFunctionVariable, + # functorch/vmap + "torch._functorch.vmap._check_int_or_none": UserFunctionVariable, + "torch._functorch.vmap._check_out_dims_is_int_or_int_pytree": UserFunctionVariable, + "torch._functorch.vmap._check_randomness_arg": UserFunctionVariable, + "torch._functorch.vmap._chunked_vmap": UserFunctionVariable, + "torch._functorch.vmap._concat_chunked_outputs": UserFunctionVariable, + "torch._functorch.vmap._create_batched_inputs": UserFunctionVariable, + "torch._functorch.vmap._flat_vmap": UserFunctionVariable, + "torch._functorch.vmap._flatten_chunks_output": UserFunctionVariable, + "torch._functorch.vmap._get_chunked_inputs": UserFunctionVariable, + "torch._functorch.vmap._get_name": UserFunctionVariable, + "torch._functorch.vmap._maybe_remove_batch_dim": UserFunctionVariable, + "torch._functorch.vmap._num_outputs": UserFunctionVariable, + "torch._functorch.vmap._process_batched_inputs": UserFunctionVariable, + "torch._functorch.vmap._unwrap_batched": UserFunctionVariable, + 
"torch._functorch.vmap._validate_and_get_batch_size": UserFunctionVariable, + "torch._functorch.vmap.doesnt_support_saved_tensors_hooks": UserFunctionVariable, + "torch._functorch.vmap.get_chunk_sizes": UserFunctionVariable, + # lazy_load_decompositions uses a lock that is not supported yet in dynamo + # "torch._functorch.vmap.lazy_load_decompositions": UserFunctionVariable, + "torch._functorch.vmap.restore_vmap": UserFunctionVariable, + "torch._functorch.apis.vmap": UserFunctionVariable, + "torch._functorch.vmap.unwrap_batched": UserFunctionVariable, + "torch._functorch.vmap.vmap_impl": FunctorchHigherOrderVariable, + "torch._functorch.vmap.wrap_batched": UserFunctionVariable, + # functorch/grad + "torch._functorch.eager_transforms.grad_impl": FunctorchHigherOrderVariable, + "torch._functorch.apis.grad_and_value": UserFunctionVariable, + "torch._functorch.eager_transforms._as_tuple": UserFunctionVariable, + "torch._functorch.eager_transforms._check_unique_non_empty": UserFunctionVariable, + "torch._functorch.eager_transforms._create_differentiable": UserFunctionVariable, + "torch._functorch.eager_transforms._slice_argnums": UserFunctionVariable, + "torch._functorch.eager_transforms._undo_create_differentiable": UserFunctionVariable, + "torch._functorch.eager_transforms._validate_and_wrap_argnum": UserFunctionVariable, + "torch._functorch.eager_transforms._validate_and_wrap_argnums": UserFunctionVariable, + "torch._functorch.eager_transforms._wrap_all_tensors": UserFunctionVariable, + "torch._functorch.eager_transforms._wrap_tensor_for_grad": UserFunctionVariable, + # functorch/jacrev + "torch._functorch.eager_transforms.jacrev": UserFunctionVariable, + "torch._functorch.eager_transforms.error_if_complex": UserFunctionVariable, + "torch._functorch.eager_transforms._chunked_standard_basis_for_": UserFunctionVariable, + "torch._functorch.eager_transforms._safe_zero_index": UserFunctionVariable, + # functorch/vjp + "torch._functorch.eager_transforms.vjp": UserFunctionVariable, + "torch._functorch.eager_transforms._vjp_with_argnums": UserFunctionVariable, + "torch._functorch.eager_transforms.assert_non_empty_tensor_output": UserFunctionVariable, + "torch._constrain_as_size": UserFunctionVariable, + "torch._constrain_as_value": UserFunctionVariable, + "torch._tensor._convert": UserFunctionVariable, + "torch.jit._unwrap_optional": UserFunctionVariable, + "torch.backends.mha.get_fastpath_enabled": UserFunctionVariable, + "torch._C._functorch._add_batch_dim": TorchInGraphFunctionVariable, + "torch._C._functorch._remove_batch_dim": TorchInGraphFunctionVariable, + "torch._C._functorch._wrap_for_grad": TorchInGraphFunctionVariable, + "torch._C._functorch._unwrap_for_grad": TorchInGraphFunctionVariable, + "torch._C._functorch.is_batchedtensor": TorchInGraphFunctionVariable, + "torch._dynamo.mark_static": UserFunctionVariable, + "torch.fx.experimental.symbolic_shapes.guard_size_oblivious": TorchInGraphFunctionVariable, + "torch.cuda._get_device_properties": TorchInGraphFunctionVariable, + "torch.utils.hooks.BackwardHook": TorchInGraphFunctionVariable, + "torch.sparse_bsc_tensor": SkipFunctionVariable, + "torch.sparse_bsr_tensor": SkipFunctionVariable, + "torch.sparse_csc_tensor": SkipFunctionVariable, + "torch.sparse_csr_tensor": SkipFunctionVariable, + "torch.sparse_compressed_tensor": SkipFunctionVariable, + "torch._C._autograd._unsafe_set_version_counter": TorchInGraphFunctionVariable, +} + + +# In graph functions (including constant folding) that are C bindings +torch_c_binding_in_graph_functions = 
dict.fromkeys( + [ + "math.acos", + "math.acosh", + "math.asin", + "math.asinh", + "math.atan", + "math.atan2", + "math.atanh", + "math.ceil", + "math.comb", + "math.copysign", + "math.cos", + "math.cosh", + "math.degrees", + "math.dist", + "math.erf", + "math.erfc", + "math.exp", + "math.expm1", + "math.fabs", + "math.factorial", + "math.floor", + "math.fmod", + "math.frexp", + "math.fsum", + "math.gamma", + "math.gcd", + "math.hypot", + "math.isclose", + "math.isfinite", + "math.isinf", + "math.isnan", + "math.isqrt", + "math.ldexp", + "math.lgamma", + "math.log", + "math.log10", + "math.log1p", + "math.log2", + "math.modf", + "math.nextafter", + "math.perm", + "math.pow", + "math.prod", + "math.radians", + "math.remainder", + "math.sin", + "math.sinh", + "math.tan", + "math.tanh", + "math.trunc", + "math.ulp", + "torch._adaptive_avg_pool2d", + "torch._adaptive_avg_pool3d", + "torch._add_batch_dim", + "torch._add_relu_", + "torch._add_relu", + "torch._addmm_activation", + "torch._aminmax", + "torch._amp_foreach_non_finite_check_and_unscale_", + "torch._amp_update_scale_", + "torch._assert_async", + "torch._assert_tensor_metadata", + "torch._batch_norm_impl_index", + "torch._C._activate_cuda_trace", + "torch._C._add_cached_tensor", + "torch._C._add_docstr", + "torch._C._are_functorch_transforms_active", + "torch._C._autograd_init", + "torch._C._awaitable_nowait", + "torch._C._awaitable_wait", + "torch._C._awaitable", + "torch._C._backport_for_mobile_from_buffer_to_buffer", + "torch._C._backport_for_mobile_from_buffer", + "torch._C._backport_for_mobile_to_buffer", + "torch._C._backport_for_mobile", + "torch._C._broadcast_coalesced", + "torch._C._broadcast_out", + "torch._C._broadcast", + "torch._C._c10d_init", + "torch._C._calculate_package_version_based_on_upgraders", + "torch._C._can_use_flash_attention", + "torch._C._can_use_mem_efficient_attention", + "torch._C._check_onnx_proto", + "torch._C._check_sparse_tensor_invariants", + "torch._C._collect_all", + "torch._C._commit_update", + "torch._C._compile_graph_to_code_table", + "torch._C._construct_CUDA_Tensor_From_Storage_And_Metadata", + "torch._C._construct_storage_from_data_pointer", + "torch._C._conv_determine_backend_memory_format", + "torch._C._cpu._is_cpu_support_vnni", + "torch._C._crash_if_aten_asan", + "torch._C._crash_if_csrc_asan", + "torch._C._crash_if_csrc_ubsan", + "torch._C._crash_if_debug_asserts_fail", + "torch._C._crash_if_vptr_ubsan", + "torch._C._create_function_from_graph", + "torch._C._create_function_from_trace_with_dict", + "torch._C._create_function_from_trace", + "torch._C._create_graph_by_tracing", + "torch._C._create_module_with_type", + "torch._C._create_object_with_type", + "torch._C._cuda_attach_out_of_memory_observer", + "torch._C._cuda_beginAllocateCurrentStreamToPool", + "torch._C._cuda_canDeviceAccessPeer", + "torch._C._cuda_changeCurrentAllocator", + "torch._C._cuda_checkPoolLiveAllocations", + "torch._C._cuda_clearCublasWorkspaces", + "torch._C._cuda_cudaCachingAllocator_raw_alloc", + "torch._C._cuda_cudaCachingAllocator_raw_delete", + "torch._C._cuda_cudaCachingAllocator_set_allocator_settings", + "torch._C._cuda_cudaHostAllocator", + "torch._C._cuda_customAllocator", + "torch._C._cuda_emptyCache", + "torch._C._cuda_endAllocateCurrentStreamToPool", + "torch._C._cuda_exchangeDevice", + "torch._C._cuda_get_conv_benchmark_empty_cache", + "torch._C._cuda_get_cudnn_benchmark_limit", + "torch._C._cuda_get_sync_debug_mode", + "torch._C._cuda_getAllocator", + "torch._C._cuda_getAllocatorBackend", + 
"torch._C._cuda_getArchFlags", + "torch._C._cuda_getCheckpointState", + "torch._C._cuda_getCompiledVersion", + "torch._C._cuda_getCurrentBlasHandle", + "torch._C._cuda_getCurrentRawStream", + "torch._C._cuda_getCurrentStream", + "torch._C._cuda_getDefaultStream", + "torch._C._cuda_getDevice", + "torch._C._cuda_getDeviceCount", + "torch._C._cuda_hasPrimaryContext", + "torch._C._cuda_init", + "torch._C._cuda_ipc_collect", + "torch._C._cuda_isCurrentStreamCapturing", + "torch._C._cuda_isHistoryEnabled", + "torch._C._cuda_isInBadFork", + "torch._C._cuda_jiterator_compile_and_launch_kernel", + "torch._C._cuda_lock_mutex", + "torch._C._cuda_maybeExchangeDevice", + "torch._C._cuda_memorySnapshot", + "torch._C._cuda_memoryStats", + "torch._C._cuda_record_memory_history_legacy", + "torch._C._cuda_record_memory_history", + "torch._C._cuda_releasePool", + "torch._C._cuda_resetAccumulatedMemoryStats", + "torch._C._cuda_resetPeakMemoryStats", + "torch._C._cuda_set_cudnn_benchmark_limit", + "torch._C._cuda_set_sync_debug_mode", + "torch._C._cuda_setCheckpointPoolState", + "torch._C._cuda_setDevice", + "torch._C._cuda_setMemoryFraction", + "torch._C._cuda_setStream", + "torch._C._cuda_sleep", + "torch._C._cuda_synchronize", + "torch._C._cuda_unlock_mutex", + "torch._C._cudnn_set_conv_benchmark_empty_cache", + "torch._C._cudnn.getCompileVersion", + "torch._C._cudnn.getRuntimeVersion", + "torch._C._cudnn.getVersionInt", + "torch._C._current_autograd_node", + "torch._C._current_graph_task_execution_order", + "torch._C._current_graph_task_id", + "torch._C._cxx_flags", + "torch._C._debug_get_fusion_group_inlining", + "torch._C._debug_only_are_vmap_fallback_warnings_enabled", + "torch._C._debug_only_display_vmap_fallback_warnings", + "torch._C._debug_set_autodiff_subgraph_inlining", + "torch._C._debug_set_fusion_group_inlining", + "torch._C._demangle", + "torch._C._disabled_torch_dispatch_impl", + "torch._C._disabled_torch_function_impl", + "torch._C._dispatch_call_boxed", + "torch._C._dispatch_check_all_invariants", + "torch._C._dispatch_check_invariants", + "torch._C._dispatch_dump_table", + "torch._C._dispatch_dump", + "torch._C._dispatch_find_dangling_impls", + "torch._C._dispatch_find_schema_or_throw", + "torch._C._dispatch_get_all_op_names", + "torch._C._dispatch_get_backend_keyset_from_autograd", + "torch._C._dispatch_get_registrations_for_dispatch_key", + "torch._C._dispatch_has_backend_fallback", + "torch._C._dispatch_has_computed_kernel_for_dispatch_key", + "torch._C._dispatch_has_kernel_for_any_dispatch_key", + "torch._C._dispatch_has_kernel_for_dispatch_key", + "torch._C._dispatch_has_kernel", + "torch._C._dispatch_is_alias_key", + "torch._C._dispatch_is_included_in_alias", + "torch._C._dispatch_is_main_interpreter", + "torch._C._dispatch_isTensorSubclassLike", + "torch._C._dispatch_key_for_device", + "torch._C._dispatch_key_name", + "torch._C._dispatch_key_parse", + "torch._C._dispatch_key_set", + "torch._C._dispatch_keys", + "torch._C._dispatch_keyset_full_after", + "torch._C._dispatch_keyset_full", + "torch._C._dispatch_keyset_to_string", + "torch._C._dispatch_library", + "torch._C._dispatch_num_backends", + "torch._C._dispatch_print_registrations_for_dispatch_key", + "torch._C._dispatch_pystub", + "torch._C._dispatch_set_report_error_callback", + "torch._C._dispatch_tls_is_dispatch_key_excluded", + "torch._C._dispatch_tls_is_dispatch_key_included", + "torch._C._dispatch_tls_local_exclude_set", + "torch._C._dispatch_tls_local_include_set", + "torch._C._dispatch_tls_set_dispatch_key_excluded", + 
"torch._C._dispatch_tls_set_dispatch_key_included", + "torch._C._dist_autograd_init", + "torch._C._dump_local_tls_set", + "torch._C._dump_upgraders_map", + "torch._C._enable_mobile_interface_call_export", + "torch._C._enter_dual_level", + "torch._C._error_if_any_worker_fails", + "torch._C._exit_dual_level", + "torch._C._export_operator_list", + "torch._C._export_opnames", + "torch._C._faulty_agent_init", + "torch._C._fft.fft_fft", + "torch._C._fft.fft_fft2", + "torch._C._fft.fft_fftfreq", + "torch._C._fft.fft_fftn", + "torch._C._fft.fft_fftshift", + "torch._C._fft.fft_hfft", + "torch._C._fft.fft_hfft2", + "torch._C._fft.fft_hfftn", + "torch._C._fft.fft_ifft", + "torch._C._fft.fft_ifft2", + "torch._C._fft.fft_ifftn", + "torch._C._fft.fft_ifftshift", + "torch._C._fft.fft_ihfft", + "torch._C._fft.fft_ihfft2", + "torch._C._fft.fft_ihfftn", + "torch._C._fft.fft_irfft", + "torch._C._fft.fft_irfft2", + "torch._C._fft.fft_irfftn", + "torch._C._fft.fft_rfft", + "torch._C._fft.fft_rfft2", + "torch._C._fft.fft_rfftfreq", + "torch._C._fft.fft_rfftn", + "torch._C._free_And_Remove_DeleterFn", + "torch._C._freeze_module", + "torch._C._from_dlpack", + "torch._C._functionality_to_backend_keys", + "torch._C._functionalization_reapply_views_tls", + "torch._C._fuse_to_static_module", + "torch._C._gather_out", + "torch._C._gather", + "torch._C._generate_upgraders_graph", + "torch._C._get_autograd_fallback_mode", + "torch._C._get_backcompat_broadcast_warn", + "torch._C._get_backcompat_keepdim_warn", + "torch._C._get_caught_jit_exception_class_name", + "torch._C._get_caught_jit_exception_original_msg", + "torch._C._get_constant_bool_symnode", + "torch._C._get_cpp_backtrace", + "torch._C._get_cpu_capability", + "torch._C._get_cublas_allow_bf16_reduced_precision_reduction", + "torch._C._get_cublas_allow_fp16_reduced_precision_reduction", + "torch._C._get_cublas_allow_tf32", + "torch._C._get_cudnn_allow_tf32", + "torch._C._get_cudnn_benchmark", + "torch._C._get_cudnn_deterministic", + "torch._C._get_cudnn_enabled", + "torch._C._get_custom_class_python_wrapper", + "torch._C._get_default_device", + "torch._C._get_deterministic_algorithms_warn_only", + "torch._C._get_deterministic_algorithms", + "torch._C._get_deterministic_fill_uninitialized_memory", + "torch._C._get_dispatch_mode", + "torch._C._get_dispatch_stack_at", + "torch._C._get_file_format", + "torch._C._get_flash_sdp_enabled", + "torch._C._get_float32_matmul_precision", + "torch._C._get_function_stack_at", + "torch._C._get_graph_executor_optimize", + "torch._C._get_linalg_preferred_backend", + "torch._C._get_math_sdp_enabled", + "torch._C._get_max_operator_version", + "torch._C._get_mem_efficient_sdp_enabled", + "torch._C._get_mkldnn_enabled", + "torch._C._get_cudnn_sdp_enabled", + "torch._C._set_sdp_use_cudnn", + "torch._C._get_mobile_model_contained_types_from_buffer", + "torch._C._get_mobile_model_contained_types", + "torch._C._get_model_bytecode_version_from_buffer", + "torch._C._get_model_bytecode_version", + "torch._C._get_model_extra_files_from_buffer", + "torch._C._get_model_extra_files", + "torch._C._get_model_ops_and_info_from_buffer", + "torch._C._get_model_ops_and_info", + "torch._C._get_module_info_from_flatbuffer", + "torch._C._get_nnpack_enabled", + "torch._C._get_obj_in_tls", + "torch._C._get_operation_overload", + "torch._C._get_operator_version_map", + "torch._C._get_privateuse1_backend_name", + "torch._C._get_qengine", + "torch._C._get_schema", + "torch._C._get_nested_int", + "torch._C._get_tensor_metadata", + 
"torch._C._get_tracing_state", + "torch._C._get_upgrader_ranges", + "torch._C._get_upgraders_entry_map", + "torch._C._get_upgraders_map_size", + "torch._C._get_value_trace", + "torch._C._get_version_calculator_flag", + "torch._C._get_warnAlways", + "torch._C._graph_pool_handle", + "torch._C._group_tensors_by_device_and_dtype", + "torch._C._hack_do_not_use_clone_module_with_class", + "torch._C._has_distributed", + "torch._C._has_Standard_Deleter", + "torch._C._has_storage", + "torch._C._has_tensorexpr_cpp_tests", + "torch._C._run_tensorexpr_cpp_tests", + "torch._C._has_torch_function_unary", + "torch._C._has_torch_function_variadic", + "torch._C._has_torch_function", + "torch._C._import_ir_module_from_package", + "torch._C._increment_version", + "torch._C._infer_size", + "torch._C._init_names", + "torch._C._initExtension", + "torch._C._is_alias_of", + "torch._C._is_any_autocast_enabled", + "torch._C._is_cached_tensor", + "torch._C._is_fwd_grad_enabled", + "torch._C._is_key_in_tls", + "torch._C._is_multithreading_enabled", + "torch._C._is_torch_function_enabled", + "torch._C._is_torch_function_mode_enabled", + "torch._C._is_tracing", + "torch._C._is_view_replay_enabled", + "torch._C._is_xnnpack_enabled", + "torch._C._itt.is_available", + "torch._C._itt.mark", + "torch._C._itt.rangePop", + "torch._C._itt.rangePush", + "torch._C._ivalue_debug_python_object", + "torch._C._ivalue_tags_match", + "torch._C._jit_assert_is_instance", + "torch._C._jit_can_fuse_on_cpu_legacy", + "torch._C._jit_can_fuse_on_cpu", + "torch._C._jit_can_fuse_on_gpu", + "torch._C._jit_cat_wo_conditionals", + "torch._C._jit_check_alias_annotation", + "torch._C._jit_clear_class_registry", + "torch._C._jit_debug_fuser_num_cached_kernel_specs", + "torch._C._jit_debug_module_iterators", + "torch._C._jit_decay_packed_param_input_types", + "torch._C._jit_decomposition_graph_for_node", + "torch._C._jit_differentiate", + "torch._C._jit_erase_non_input_shape_information", + "torch._C._jit_flatten", + "torch._C._jit_fuser_get_fused_kernel_code", + "torch._C._jit_get_all_schemas", + "torch._C._jit_get_custom_class_schemas", + "torch._C._jit_get_emit_hooks", + "torch._C._jit_get_inline_everything_mode", + "torch._C._jit_get_logging_option", + "torch._C._jit_get_num_profiled_runs", + "torch._C._jit_get_operation", + "torch._C._jit_get_schemas_for_operator", + "torch._C._jit_get_te_cuda_pointwise_block_count", + "torch._C._jit_get_te_cuda_pointwise_block_size", + "torch._C._jit_get_te_cuda_pointwise_loop_levels", + "torch._C._jit_get_te_generate_block_code", + "torch._C._jit_get_te_must_use_llvm_cpu", + "torch._C._jit_get_tracer_state_warn", + "torch._C._jit_has_cpp_tests", + "torch._C._jit_init", + "torch._C._jit_interpret_graph", + "torch._C._jit_is_onnx_log_enabled", + "torch._C._jit_is_script_object", + "torch._C._jit_llga_enabled", + "torch._C._jit_nvfuser_can_be_enabled", + "torch._C._jit_nvfuser_clear_comparison_callback", + "torch._C._jit_nvfuser_enabled", + "torch._C._jit_nvfuser_horizontal_mode", + "torch._C._jit_nvfuser_set_comparison_callback", + "torch._C._jit_nvfuser_single_node_mode", + "torch._C._jit_object_is_non_holding", + "torch._C._jit_onnx_convert_pattern_from_subblock", + "torch._C._jit_onnx_create_full_scope_name", + "torch._C._jit_onnx_list_model_parameters", + "torch._C._jit_onnx_log", + "torch._C._jit_opt_conditionals", + "torch._C._jit_override_can_fuse_on_cpu_legacy", + "torch._C._jit_override_can_fuse_on_cpu", + "torch._C._jit_override_can_fuse_on_gpu", + "torch._C._jit_pass_autocast", + 
"torch._C._jit_pass_batch_mm", + "torch._C._jit_pass_canonicalize_graph_fuser_ops", + "torch._C._jit_pass_canonicalize", + "torch._C._jit_pass_complete_shape_analysis", + "torch._C._jit_pass_concat_frozen_linear", + "torch._C._jit_pass_constant_loop_unrolling", + "torch._C._jit_pass_constant_pooling", + "torch._C._jit_pass_constant_propagation_immutable_types", + "torch._C._jit_pass_constant_propagation", + "torch._C._jit_pass_convert_frozen_ops_to_mkldnn", + "torch._C._jit_pass_create_autodiff_subgraphs", + "torch._C._jit_pass_create_functional_graphs", + "torch._C._jit_pass_cse", + "torch._C._jit_pass_custom_pattern_based_rewrite_graph", + "torch._C._jit_pass_custom_pattern_based_rewrite", + "torch._C._jit_pass_dbr_quant_remove_redundant_aliases", + "torch._C._jit_pass_dce_allow_deleting_nodes_with_side_effects", + "torch._C._jit_pass_dce", + "torch._C._jit_pass_decompose_ops", + "torch._C._jit_pass_dedup_module_uses", + "torch._C._jit_pass_erase_number_types", + "torch._C._jit_pass_erase_shape_information", + "torch._C._jit_pass_filter_non_tensor_arguments", + "torch._C._jit_pass_fixup_onnx_controlflow_node", + "torch._C._jit_pass_fold_convbn", + "torch._C._jit_pass_fold_frozen_conv_add_or_sub", + "torch._C._jit_pass_fold_frozen_conv_bn", + "torch._C._jit_pass_fold_frozen_conv_mul_or_div", + "torch._C._jit_pass_fold_frozen_linear_bn", + "torch._C._jit_pass_fold_prepacking_ops", + "torch._C._jit_pass_functional_to_inplace_activation", + "torch._C._jit_pass_fuse_add_relu", + "torch._C._jit_pass_fuse_addmm", + "torch._C._jit_pass_fuse_clamp_w_prepacked_linear_conv", + "torch._C._jit_pass_fuse_frozen_conv_add_relu", + "torch._C._jit_pass_fuse_linear", + "torch._C._jit_pass_fuse_quantized_add_relu", + "torch._C._jit_pass_fuse_tensorexprs", + "torch._C._jit_pass_fuse", + "torch._C._jit_pass_inline_fork_wait", + "torch._C._jit_pass_inline_functional_graphs", + "torch._C._jit_pass_inline", + "torch._C._jit_pass_inplace_to_functional_activation", + "torch._C._jit_pass_insert_observer_method_for_ondevice_ptq", + "torch._C._jit_pass_insert_observers", + "torch._C._jit_pass_insert_prepack_unpack", + "torch._C._jit_pass_insert_prepacked_ops", + "torch._C._jit_pass_insert_quant_dequant_for_ondevice_ptq", + "torch._C._jit_pass_insert_quant_dequant", + "torch._C._jit_pass_integer_value_refinement", + "torch._C._jit_pass_lint", + "torch._C._jit_pass_loop_unrolling", + "torch._C._jit_pass_lower_all_tuples", + "torch._C._jit_pass_lower_graph", + "torch._C._jit_pass_metal_fold_prepacking_ops", + "torch._C._jit_pass_metal_fuse_clamp_w_prepacked_conv", + "torch._C._jit_pass_metal_insert_prepacked_ops", + "torch._C._jit_pass_metal_optimize_for_mobile", + "torch._C._jit_pass_onnx_assign_output_shape", + "torch._C._jit_pass_onnx_assign_scoped_names_for_node_and_value", + "torch._C._jit_pass_onnx_autograd_function_process", + "torch._C._jit_pass_onnx_block", + "torch._C._jit_pass_onnx_cast_all_constant_to_floating", + "torch._C._jit_pass_onnx_clear_scope_records", + "torch._C._jit_pass_onnx_constant_fold", + "torch._C._jit_pass_onnx_deduplicate_initializers", + "torch._C._jit_pass_onnx_eliminate_unused_items", + "torch._C._jit_pass_onnx_eval_peephole", + "torch._C._jit_pass_onnx_function_extraction", + "torch._C._jit_pass_onnx_function_substitution", + "torch._C._jit_pass_onnx_graph_shape_type_inference", + "torch._C._jit_pass_onnx_lint", + "torch._C._jit_pass_onnx_node_shape_type_inference", + "torch._C._jit_pass_onnx_peephole", + "torch._C._jit_pass_onnx_preprocess_caffe2", + 
"torch._C._jit_pass_onnx_preprocess", + "torch._C._jit_pass_onnx_quantization_insert_permutes", + "torch._C._jit_pass_onnx_remove_inplace_ops_for_onnx", + "torch._C._jit_pass_onnx_remove_print", + "torch._C._jit_pass_onnx_scalar_type_analysis", + "torch._C._jit_pass_onnx_set_dynamic_input_shape", + "torch._C._jit_pass_onnx_track_scope_attributes", + "torch._C._jit_pass_onnx_unpack_quantized_weights", + "torch._C._jit_pass_onnx", + "torch._C._jit_pass_optimize_for_inference", + "torch._C._jit_pass_optimize_for_mobile", + "torch._C._jit_pass_optimize_frozen_graph", + "torch._C._jit_pass_pattern_based_rewrite", + "torch._C._jit_pass_peephole_list_idioms", + "torch._C._jit_pass_peephole", + "torch._C._jit_pass_prepare_division_for_onnx", + "torch._C._jit_pass_propagate_device", + "torch._C._jit_pass_propagate_dtype", + "torch._C._jit_pass_propagate_shapes_on_graph_and_build_compute", + "torch._C._jit_pass_propagate_shapes_on_graph", + "torch._C._jit_pass_quant_finalize_for_ondevice_ptq", + "torch._C._jit_pass_quant_finalize", + "torch._C._jit_pass_quant_fusion", + "torch._C._jit_pass_refine_integer_values", + "torch._C._jit_pass_refine_tuple_types", + "torch._C._jit_pass_remove_dropout", + "torch._C._jit_pass_remove_expands", + "torch._C._jit_pass_remove_inplace_ops", + "torch._C._jit_pass_remove_mutation", + "torch._C._jit_pass_replace_old_ops_with_upgraders", + "torch._C._jit_pass_replicate_dequantize", + "torch._C._jit_pass_run_decompositions", + "torch._C._jit_pass_specialize_autogradzero", + "torch._C._jit_pass_swap_functional_linear", + "torch._C._jit_pass_transform_conv1d_to_conv2d", + "torch._C._jit_pass_transpose_frozen_linear", + "torch._C._jit_pass_vulkan_fold_prepacking_ops", + "torch._C._jit_pass_vulkan_fuse_clamp_w_prepacked_conv", + "torch._C._jit_pass_vulkan_insert_prepacked_ops", + "torch._C._jit_pass_vulkan_optimize_for_mobile", + "torch._C._jit_register_decomposition_for_schema", + "torch._C._jit_register_shape_compute_graph_for_node", + "torch._C._jit_resolve_packet", + "torch._C._jit_run_cpp_tests", + "torch._C._jit_script_class_compile", + "torch._C._jit_script_compile_overload", + "torch._C._jit_script_compile", + "torch._C._jit_script_interface_compile", + "torch._C._jit_set_autocast_mode", + "torch._C._jit_set_bailout_depth", + "torch._C._jit_set_emit_hooks", + "torch._C._jit_set_fusion_strategy", + "torch._C._jit_set_inline_everything_mode", + "torch._C._jit_set_llga_enabled", + "torch._C._jit_set_logging_option", + "torch._C._jit_set_logging_stream", + "torch._C._jit_set_num_profiled_runs", + "torch._C._jit_set_nvfuser_enabled", + "torch._C._jit_set_nvfuser_guard_mode", + "torch._C._jit_set_nvfuser_horizontal_mode", + "torch._C._jit_set_nvfuser_single_node_mode", + "torch._C._jit_set_nvfuser_skip_node_kind", + "torch._C._jit_set_onnx_log_enabled", + "torch._C._jit_set_onnx_log_output_stream", + "torch._C._jit_set_profiling_executor", + "torch._C._jit_set_profiling_mode", + "torch._C._jit_set_symbolic_shapes_test_mode", + "torch._C._jit_set_te_cuda_pointwise_block_count", + "torch._C._jit_set_te_cuda_pointwise_block_size", + "torch._C._jit_set_te_cuda_pointwise_loop_levels", + "torch._C._jit_set_te_generate_block_code", + "torch._C._jit_set_te_must_use_llvm_cpu", + "torch._C._jit_set_texpr_dynamic_shape_enabled", + "torch._C._jit_set_texpr_fuser_enabled", + "torch._C._jit_set_texpr_reductions_enabled", + "torch._C._jit_set_tracer_state_warn", + "torch._C._jit_set_utf8_decoding_ignore", + "torch._C._jit_shape_compute_graph_for_node", + 
"torch._C._jit_symbolic_shapes_test_mode_enabled", + "torch._C._jit_texpr_dynamic_shape_enabled", + "torch._C._jit_texpr_fallback_allowed", + "torch._C._jit_texpr_fuser_enabled", + "torch._C._jit_texpr_reductions_enabled", + "torch._C._jit_texpr_set_fallback_allowed", + "torch._C._jit_to_backend_selective", + "torch._C._jit_to_backend", + "torch._C._jit_to_static_module", + "torch._C._jit_trace_graph", + "torch._C._jit_trace_module", + "torch._C._jit_tree_views.FalseLiteral", + "torch._C._jit_tree_views.NoneLiteral", + "torch._C._jit_tree_views.TrueLiteral", + "torch._C._jit_try_infer_type", + "torch._C._jit_unflatten", + "torch._C._last_executed_optimized_graph", + "torch._C._len_torch_dispatch_stack", + "torch._C._len_torch_function_stack", + "torch._C._linalg._linalg_eigvals", + "torch._C._linalg.linalg_cholesky_ex", + "torch._C._linalg.linalg_cholesky", + "torch._C._linalg.linalg_cond", + "torch._C._linalg.linalg_cross", + "torch._C._linalg.linalg_det", + "torch._C._linalg.linalg_diagonal", + "torch._C._linalg.linalg_eig", + "torch._C._linalg.linalg_eigh", + "torch._C._linalg.linalg_eigvals", + "torch._C._linalg.linalg_eigvalsh", + "torch._C._linalg.linalg_householder_product", + "torch._C._linalg.linalg_inv_ex", + "torch._C._linalg.linalg_inv", + "torch._C._linalg.linalg_ldl_factor_ex", + "torch._C._linalg.linalg_ldl_factor", + "torch._C._linalg.linalg_ldl_solve", + "torch._C._linalg.linalg_lstsq", + "torch._C._linalg.linalg_lu_factor_ex", + "torch._C._linalg.linalg_lu_factor", + "torch._C._linalg.linalg_lu_solve", + "torch._C._linalg.linalg_lu", + "torch._C._linalg.linalg_matmul", + "torch._C._linalg.linalg_matrix_exp", + "torch._C._linalg.linalg_matrix_norm", + "torch._C._linalg.linalg_matrix_power", + "torch._C._linalg.linalg_matrix_rank", + "torch._C._linalg.linalg_multi_dot", + "torch._C._linalg.linalg_norm", + "torch._C._linalg.linalg_pinv", + "torch._C._linalg.linalg_qr", + "torch._C._linalg.linalg_slogdet", + "torch._C._linalg.linalg_solve_ex", + "torch._C._linalg.linalg_solve_triangular", + "torch._C._linalg.linalg_solve", + "torch._C._linalg.linalg_svd", + "torch._C._linalg.linalg_svdvals", + "torch._C._linalg.linalg_tensorinv", + "torch._C._linalg.linalg_tensorsolve", + "torch._C._linalg.linalg_vander", + "torch._C._linalg.linalg_vecdot", + "torch._C._linalg.linalg_vector_norm", + "torch._C._llvm_enabled", + "torch._C._load_for_lite_interpreter_from_buffer", + "torch._C._load_for_lite_interpreter", + "torch._C._load_jit_module_from_bytes", + "torch._C._load_jit_module_from_file", + "torch._C._load_mobile_module_from_bytes", + "torch._C._load_mobile_module_from_file", + "torch._C._log_api_usage_metadata", + "torch._C._log_api_usage_once", + "torch._C._logging_set_logger", + "torch._C._meta_in_tls_dispatch_include", + "torch._C._mps_acquireEvent", + "torch._C._mps_currentAllocatedMemory", + "torch._C._mps_deviceSynchronize", + "torch._C._mps_driverAllocatedMemory", + "torch._C._mps_elapsedTimeOfEvents", + "torch._C._mps_emptyCache", + "torch._C._mps_get_default_generator", + "torch._C._mps_is_available", + "torch._C._mps_is_in_bad_fork", + "torch._C._mps_is_on_macos_13_or_newer", + "torch._C._mps_profilerStartTrace", + "torch._C._mps_profilerStopTrace", + "torch._C._mps_queryEvent", + "torch._C._mps_recordEvent", + "torch._C._mps_releaseEvent", + "torch._C._mps_setMemoryFraction", + "torch._C._mps_synchronizeEvent", + "torch._C._mps_waitForEvent", + "torch._C._multiprocessing_init", + "torch._C._nccl_all_gather", + "torch._C._nccl_all_reduce", + "torch._C._nccl_broadcast", + 
"torch._C._nccl_init_rank", + "torch._C._nccl_reduce_scatter", + "torch._C._nccl_reduce", + "torch._C._nccl_unique_id", + "torch._C._nccl_version_suffix", + "torch._C._nccl_version", + "torch._C._nested.nested_tensor", + "torch._C._nested.nested_to_padded_tensor", + "torch._C._new_symbolic_shape_symbol", + "torch._C._nn_module_to_mobile", + "torch._C._nn._conv_depthwise2d", + "torch._C._nn._pad_circular", + "torch._C._nn._pad_enum", + "torch._C._nn._parse_to", + "torch._C._nn._test_ambiguous_defaults", + "torch._C._nn._test_optional_filled_intlist", + "torch._C._nn._test_optional_floatlist", + "torch._C._nn._test_optional_intlist", + "torch._C._nn._test_string_default", + "torch._C._nn._test_warn_in_autograd", + "torch._C._nn._upsample_bicubic2d_aa", + "torch._C._nn._upsample_bilinear2d_aa", + "torch._C._nn._upsample_nearest_exact1d", + "torch._C._nn._upsample_nearest_exact2d", + "torch._C._nn._upsample_nearest_exact3d", + "torch._C._nn.adaptive_avg_pool2d", + "torch._C._nn.adaptive_avg_pool3d", + "torch._C._nn.adaptive_max_pool2d", + "torch._C._nn.adaptive_max_pool3d", + "torch._C._nn.avg_pool2d", + "torch._C._nn.avg_pool3d", + "torch._C._nn.binary_cross_entropy", + "torch._C._nn.col2im", + "torch._C._nn.conv_depthwise3d", + "torch._C._nn.cross_entropy_loss", + "torch._C._nn.elu_", + "torch._C._nn.elu", + "torch._C._nn.flatten_dense_tensors", + "torch._C._nn.fractional_max_pool2d", + "torch._C._nn.fractional_max_pool3d", + "torch._C._nn.gelu_", + "torch._C._nn.gelu", + "torch._C._nn.glu", + "torch._C._nn.hardsigmoid_", + "torch._C._nn.hardsigmoid", + "torch._C._nn.hardswish_", + "torch._C._nn.hardswish", + "torch._C._nn.hardtanh_", + "torch._C._nn.hardtanh", + "torch._C._nn.huber_loss", + "torch._C._nn.im2col", + "torch._C._nn.l1_loss", + "torch._C._nn.leaky_relu_", + "torch._C._nn.leaky_relu", + "torch._C._nn.linear", + "torch._C._nn.log_sigmoid", + "torch._C._nn.max_pool2d_with_indices", + "torch._C._nn.max_pool3d_with_indices", + "torch._C._nn.max_unpool2d", + "torch._C._nn.max_unpool3d", + "torch._C._nn.mish_", + "torch._C._nn.mish", + "torch._C._nn.mkldnn_linear", + "torch._C._nn.mkldnn_reorder_conv2d_weight", + "torch._C._nn.mkldnn_reorder_conv3d_weight", + "torch._C._nn.mse_loss", + "torch._C._nn.multi_margin_loss", + "torch._C._nn.multilabel_margin_loss", + "torch._C._nn.nll_loss_nd", + "torch._C._nn.nll_loss", + "torch._C._nn.nll_loss2d", + "torch._C._nn.one_hot", + "torch._C._nn.pad_sequence", + "torch._C._nn.pad", + "torch._C._nn.reflection_pad1d", + "torch._C._nn.reflection_pad2d", + "torch._C._nn.reflection_pad3d", + "torch._C._nn.relu6_", + "torch._C._nn.relu6", + "torch._C._nn.replication_pad1d", + "torch._C._nn.replication_pad2d", + "torch._C._nn.replication_pad3d", + "torch._C._nn.rrelu_with_noise_", + "torch._C._nn.rrelu_with_noise", + "torch._C._nn.scaled_dot_product_attention", + "torch._C._nn.silu_", + "torch._C._nn.silu", + "torch._C._nn.slow_conv_dilated2d", + "torch._C._nn.slow_conv_dilated3d", + "torch._C._nn.slow_conv_transpose2d", + "torch._C._nn.slow_conv_transpose3d", + "torch._C._nn.slow_conv3d", + "torch._C._nn.smooth_l1_loss", + "torch._C._nn.soft_margin_loss", + "torch._C._nn.softplus", + "torch._C._nn.softshrink", + "torch._C._nn.thnn_conv2d", + "torch._C._nn.unflatten_dense_tensors", + "torch._C._nn.upsample_bicubic2d", + "torch._C._nn.upsample_bilinear2d", + "torch._C._nn.upsample_linear1d", + "torch._C._nn.upsample_nearest1d", + "torch._C._nn.upsample_nearest2d", + "torch._C._nn.upsample_nearest3d", + "torch._C._nn.upsample_trilinear3d", + 
"torch._C._non_sym_sizes", + "torch._C._overlaps", + "torch._C._parallel_info", + "torch._C._parse_dispatch_key", + "torch._C._parse_source_def", + "torch._C._pop_torch_dispatch_stack", + "torch._C._pop_torch_function_stack", + "torch._C._propagate_and_assign_input_shapes", + "torch._C._propagate_shapes", + "torch._C._propagate_xla_data", + "torch._C._push_on_torch_dispatch_stack", + "torch._C._push_on_torch_function_stack", + "torch._C._quantize_ondevice_ptq_dynamic", + "torch._C._register_py_class_for_device", + "torch._C._remove_cached_tensor", + "torch._C._remove_worker_pids", + "torch._C._rename_privateuse1_backend", + "torch._C._replace_", + "torch._C._replace_overloaded_method_decl", + "torch._C._resolve_type_from_object", + "torch._C._resolve_type", + "torch._C._rocm_is_backward_pass", + "torch._C._rpc_init", + "torch._C._run_emit_module_hook", + "torch._C._save_jit_module_to_bytes", + "torch._C._save_jit_module", + "torch._C._save_mobile_module_to_bytes", + "torch._C._save_mobile_module", + "torch._C._save_parameters", + "torch._C._scatter_out", + "torch._C._scatter", + "torch._C._select_conv_backend", + "torch._C._set_autograd_fallback_mode", + "torch._C._set_backcompat_broadcast_warn", + "torch._C._set_backcompat_keepdim_warn", + "torch._C._set_cached_tensors_enabled", + "torch._C._set_check_sparse_tensor_invariants", + "torch._C._set_conj", + "torch._C._set_cublas_allow_bf16_reduced_precision_reduction", + "torch._C._set_cublas_allow_fp16_reduced_precision_reduction", + "torch._C._set_cublas_allow_tf32", + "torch._C._set_cudnn_allow_tf32", + "torch._C._set_cudnn_benchmark", + "torch._C._set_cudnn_deterministic", + "torch._C._set_cudnn_enabled", + "torch._C._set_default_dtype", + "torch._C._set_default_mobile_cpu_allocator", + "torch._C._set_default_tensor_type", + "torch._C._set_deterministic_algorithms", + "torch._C._set_deterministic_fill_uninitialized_memory", + "torch._C._set_dispatch_mode", + "torch._C._set_float32_matmul_precision", + "torch._C._set_fwd_grad_enabled", + "torch._C._set_grad_enabled", + "torch._C._set_graph_executor_optimize", + "torch._C._set_linalg_preferred_backend", + "torch._C._set_meta_in_tls_dispatch_include", + "torch._C._set_mkldnn_enabled", + "torch._C._set_multithreading_enabled", + "torch._C._set_neg", + "torch._C._set_nnpack_enabled", + "torch._C._set_print_stack_traces_on_fatal_signal", + "torch._C._set_qengine", + "torch._C._set_sdp_use_flash", + "torch._C._set_sdp_use_math", + "torch._C._set_sdp_use_mem_efficient", + "torch._C._set_should_use_format_with_string_table", + "torch._C._set_storage_access_error_msg", + "torch._C._set_tensor_metadata", + "torch._C._set_tracing_state", + "torch._C._set_value_trace", + "torch._C._set_view_replay_enabled", + "torch._C._set_warnAlways", + "torch._C._set_worker_pids", + "torch._C._set_worker_signal_handlers", + "torch._C._should_allow_numbers_as_tensors", + "torch._C._show_config", + "torch._C._sparse._sparse_addmm", + "torch._C._sparse._sparse_log_softmax", + "torch._C._sparse._sparse_mm_reduce_impl", + "torch._C._sparse._sparse_mm", + "torch._C._sparse._sparse_softmax", + "torch._C._sparse._spdiags", + "torch._C._sparse.sparse_sampled_addmm", + "torch._C._special.special_airy_ai", + "torch._C._special.special_bessel_j0", + "torch._C._special.special_bessel_j1", + "torch._C._special.special_bessel_y0", + "torch._C._special.special_bessel_y1", + "torch._C._special.special_chebyshev_polynomial_t", + "torch._C._special.special_chebyshev_polynomial_u", + "torch._C._special.special_chebyshev_polynomial_v", 
+ "torch._C._special.special_chebyshev_polynomial_w", + "torch._C._special.special_digamma", + "torch._C._special.special_entr", + "torch._C._special.special_erf", + "torch._C._special.special_erfc", + "torch._C._special.special_erfcx", + "torch._C._special.special_erfinv", + "torch._C._special.special_exp2", + "torch._C._special.special_expit", + "torch._C._special.special_expm1", + "torch._C._special.special_gammainc", + "torch._C._special.special_gammaincc", + "torch._C._special.special_gammaln", + "torch._C._special.special_hermite_polynomial_h", + "torch._C._special.special_hermite_polynomial_he", + "torch._C._special.special_i0", + "torch._C._special.special_i0e", + "torch._C._special.special_i1", + "torch._C._special.special_i1e", + "torch._C._special.special_laguerre_polynomial_l", + "torch._C._special.special_legendre_polynomial_p", + "torch._C._special.special_log_ndtr", + "torch._C._special.special_log_softmax", + "torch._C._special.special_log1p", + "torch._C._special.special_logit", + "torch._C._special.special_logsumexp", + "torch._C._special.special_modified_bessel_i0", + "torch._C._special.special_modified_bessel_i1", + "torch._C._special.special_modified_bessel_k0", + "torch._C._special.special_modified_bessel_k1", + "torch._C._special.special_multigammaln", + "torch._C._special.special_ndtr", + "torch._C._special.special_ndtri", + "torch._C._special.special_polygamma", + "torch._C._special.special_psi", + "torch._C._special.special_round", + "torch._C._special.special_scaled_modified_bessel_k0", + "torch._C._special.special_scaled_modified_bessel_k1", + "torch._C._special.special_shifted_chebyshev_polynomial_t", + "torch._C._special.special_shifted_chebyshev_polynomial_u", + "torch._C._special.special_shifted_chebyshev_polynomial_v", + "torch._C._special.special_shifted_chebyshev_polynomial_w", + "torch._C._special.special_sinc", + "torch._C._special.special_softmax", + "torch._C._special.special_spherical_bessel_j0", + "torch._C._special.special_xlog1py", + "torch._C._special.special_xlogy", + "torch._C._special.special_zeta", + "torch._C._stash_obj_in_tls", + "torch._C._storage_id", + "torch._C._storage_Use_Count", + "torch._C._supported_qengines", + "torch._C._te.abs", + "torch._C._te.acos", + "torch._C._te.annotate_input_shapes", + "torch._C._te.asin", + "torch._C._te.atan", + "torch._C._te.atan2", + "torch._C._te.ceil", + "torch._C._te.Compute", + "torch._C._te.Compute2", + "torch._C._te.construct_codegen", + "torch._C._te.cos", + "torch._C._te.cosh", + "torch._C._te.erf", + "torch._C._te.erfc", + "torch._C._te.exp", + "torch._C._te.expm1", + "torch._C._te.fixup_missing_shape_info", + "torch._C._te.floor", + "torch._C._te.fmod", + "torch._C._te.frac", + "torch._C._te.ifThenElse", + "torch._C._te.is_graph_compilable", + "torch._C._te.isnan", + "torch._C._te.lgamma", + "torch._C._te.log", + "torch._C._te.log10", + "torch._C._te.log1p", + "torch._C._te.log2", + "torch._C._te.lower", + "torch._C._te.make_shapes_symbolic", + "torch._C._te.pow", + "torch._C._te.Reduce", + "torch._C._te.remainder", + "torch._C._te.remove_graph_output", + "torch._C._te.remove_unused_self_argument", + "torch._C._te.replace_list_output_with_tuple", + "torch._C._te.round", + "torch._C._te.rsqrt", + "torch._C._te.sigmoid", + "torch._C._te.simplify", + "torch._C._te.sin", + "torch._C._te.sinh", + "torch._C._te.sqrt", + "torch._C._te.tan", + "torch._C._te.tanh", + "torch._C._te.trim_graph", + "torch._C._te.trunc", + "torch._C._tensor_impl_raw_handle", + 
"torch._C._test_only_add_entry_to_op_version_map", + "torch._C._test_only_populate_upgraders", + "torch._C._test_only_remove_entry_to_op_version_map", + "torch._C._test_only_remove_upgraders", + "torch._C._to_dlpack", + "torch._C._to_functionality_key", + "torch._C._tracer_set_force_outplace", + "torch._C._tracer_set_get_unique_name_fn", + "torch._C._tracer_warn_use_python", + "torch._C._unset_default_mobile_cpu_allocator", + "torch._C._unset_dispatch_mode", + "torch._C._valgrind_supported_platform", + "torch._C._valgrind_toggle_and_dump_stats", + "torch._C._valgrind_toggle", + "torch._C._verbose.mkl_set_verbose", + "torch._C._verbose.mkldnn_set_verbose", + "torch._C._vmapmode_decrement_nesting", + "torch._C._vmapmode_increment_nesting", + "torch._C._warn_deprecation", + "torch._C._warn", + "torch._C._will_engine_execute_node", + "torch._C._wrap_tensor_impl", + "torch._C.fork", + "torch._C.get_autocast_cpu_dtype", + "torch._C.get_autocast_gpu_dtype", + "torch._C.get_autocast_ipu_dtype", + "torch._C.get_autocast_xla_dtype", + "torch._C.get_default_dtype", + "torch._C.get_num_interop_threads", + "torch._C.get_num_threads", + "torch._C.import_ir_module_from_buffer", + "torch._C.import_ir_module", + "torch._C.init_num_threads", + "torch._C.is_anomaly_check_nan_enabled", + "torch._C.is_anomaly_enabled", + "torch._C.is_autocast_cache_enabled", + "torch._C.is_autocast_cpu_enabled", + "torch._C.is_autocast_enabled", + "torch._C.is_autocast_ipu_enabled", + "torch._C.is_autocast_xla_enabled", + "torch._C.is_grad_enabled", + "torch._C.is_inference_mode_enabled", + "torch._C.merge_type_from_type_comment", + "torch._C.parse_ir", + "torch._C.parse_schema", + "torch._C.parse_type_comment", + "torch._C.read_vitals", + "torch._C.set_flush_denormal", + "torch._C.set_num_interop_threads", + "torch._C.set_num_threads", + "torch._C.set_vital", + "torch._C.unify_type_list", + "torch._C.vitals_enabled", + "torch._C.wait", + "torch._cast_Byte", + "torch._cast_Char", + "torch._cast_Double", + "torch._cast_Float", + "torch._cast_Half", + "torch._cast_Int", + "torch._cast_Long", + "torch._cast_Short", + "torch._choose_qparams_per_tensor", + "torch._chunk_cat", + "torch._coalesce", + "torch._compute_linear_combination", + "torch._conj_copy", + "torch._conj_physical", + "torch._conj", + "torch._convert_indices_from_coo_to_csr", + "torch._convert_indices_from_csr_to_coo", + "torch._convert_weight_to_int4pack", + "torch._convolution_mode", + "torch._convolution", + "torch._copy_from_and_resize", + "torch._copy_from", + "torch._cslt_compress", + "torch._cslt_sparse_mm", + "torch._ctc_loss", + "torch._cudnn_ctc_loss", + "torch._cudnn_init_dropout_state", + "torch._cudnn_rnn_flatten_weight", + "torch._cudnn_rnn", + "torch._cufft_clear_plan_cache", + "torch._cufft_get_plan_cache_max_size", + "torch._cufft_get_plan_cache_size", + "torch._cufft_set_plan_cache_max_size", + "torch._cummax_helper", + "torch._cummin_helper", + "torch._debug_has_internal_overlap", + "torch._dim_arange", + "torch._dirichlet_grad", + "torch._disable_functionalization", + "torch._efficientzerotensor", + "torch._embedding_bag_forward_only", + "torch._embedding_bag", + "torch._empty_affine_quantized", + "torch._empty_per_channel_affine_quantized", + "torch._enable_functionalization", + "torch._euclidean_dist", + "torch._fake_quantize_learnable_per_channel_affine", + "torch._fake_quantize_learnable_per_tensor_affine", + "torch._fake_quantize_per_tensor_affine_cachemask_tensor_qparams", + "torch._fft_c2c", + "torch._fft_c2r", + "torch._fft_r2c", + 
"torch._fill_mem_eff_dropout_mask_", + "torch._foobar", + "torch._foreach_abs_", + "torch._foreach_abs", + "torch._foreach_acos_", + "torch._foreach_acos", + "torch._foreach_add_", + "torch._foreach_add", + "torch._foreach_addcdiv_", + "torch._foreach_addcdiv", + "torch._foreach_addcmul_", + "torch._foreach_addcmul", + "torch._foreach_asin_", + "torch._foreach_asin", + "torch._foreach_atan_", + "torch._foreach_atan", + "torch._foreach_ceil_", + "torch._foreach_ceil", + "torch._foreach_clamp_max_", + "torch._foreach_clamp_max", + "torch._foreach_clamp_min_", + "torch._foreach_clamp_min", + "torch._foreach_copy_", + "torch._foreach_cos_", + "torch._foreach_cos", + "torch._foreach_cosh_", + "torch._foreach_cosh", + "torch._foreach_div_", + "torch._foreach_div", + "torch._foreach_erf_", + "torch._foreach_erf", + "torch._foreach_erfc_", + "torch._foreach_erfc", + "torch._foreach_exp_", + "torch._foreach_exp", + "torch._foreach_expm1_", + "torch._foreach_expm1", + "torch._foreach_floor_", + "torch._foreach_floor", + "torch._foreach_frac_", + "torch._foreach_frac", + "torch._foreach_lerp_", + "torch._foreach_lerp", + "torch._foreach_lgamma_", + "torch._foreach_lgamma", + "torch._foreach_log_", + "torch._foreach_log", + "torch._foreach_log10_", + "torch._foreach_log10", + "torch._foreach_log1p_", + "torch._foreach_log1p", + "torch._foreach_log2_", + "torch._foreach_log2", + "torch._foreach_maximum_", + "torch._foreach_maximum", + "torch._foreach_minimum_", + "torch._foreach_minimum", + "torch._foreach_mul_", + "torch._foreach_mul", + "torch._foreach_neg_", + "torch._foreach_neg", + "torch._foreach_norm", + "torch._foreach_pow_", + "torch._foreach_pow", + "torch._foreach_reciprocal_", + "torch._foreach_reciprocal", + "torch._foreach_round_", + "torch._foreach_round", + "torch._foreach_sigmoid_", + "torch._foreach_sigmoid", + "torch._foreach_sign_", + "torch._foreach_sign", + "torch._foreach_sin_", + "torch._foreach_sin", + "torch._foreach_sinh_", + "torch._foreach_sinh", + "torch._foreach_sqrt_", + "torch._foreach_sqrt", + "torch._foreach_sub_", + "torch._foreach_sub", + "torch._foreach_tan_", + "torch._foreach_tan", + "torch._foreach_tanh_", + "torch._foreach_tanh", + "torch._foreach_trunc_", + "torch._foreach_trunc", + "torch._foreach_zero_", + "torch._freeze_functional_tensor", + "torch._from_functional_tensor", + "torch._functional_assert_async", + "torch._functional_sym_constrain_range_for_size", + "torch._functional_sym_constrain_range", + "torch._functionalize_are_all_mutations_hidden_from_autograd", + "torch._functionalize_commit_update", + "torch._functionalize_enable_reapply_views", + "torch._functionalize_has_data_mutation", + "torch._functionalize_has_metadata_mutation", + "torch._functionalize_is_multi_output_view", + "torch._functionalize_mark_mutation_hidden_from_autograd", + "torch._functionalize_replace", + "torch._functionalize_sync", + "torch._functionalize_was_storage_changed", + "torch._fused_adam_", + "torch._fused_adamw_", + "torch._fused_dropout", + "torch._fused_moving_avg_obs_fq_helper", + "torch._fused_sdp_choice", + "torch._fw_primal_copy", + "torch._grid_sampler_2d_cpu_fallback", + "torch._has_compatible_shallow_copy_type", + "torch._histogramdd_bin_edges", + "torch._histogramdd_from_bin_cts", + "torch._histogramdd_from_bin_tensors", + "torch._index_put_impl_", + "torch._indices_copy", + "torch._int_mm", + "torch._is_all_true", + "torch._is_any_true", + "torch._is_functional_tensor", + "torch._is_zerotensor", + "torch._linalg_check_errors", + "torch._linalg_det", + 
"torch._linalg_eigh", + "torch._linalg_slogdet", + "torch._linalg_solve_ex", + "torch._linalg_svd", + "torch._log_softmax_backward_data", + "torch._log_softmax", + "torch._logcumsumexp", + "torch._lstm_mps", + "torch._lu_with_info", + "torch._make_dep_token", + "torch._make_dual_copy", + "torch._make_dual", + "torch._make_per_channel_quantized_tensor", + "torch._make_per_tensor_quantized_tensor", + "torch._masked_scale", + "torch._masked_softmax", + "torch._mirror_autograd_meta_to", + "torch._mixed_dtypes_linear", + "torch._mkldnn_reshape", + "torch._mkldnn_transpose_", + "torch._mkldnn_transpose", + "torch._mps_convolution_transpose", + "torch._mps_convolution", + "torch._native_batch_norm_legit_no_training", + "torch._native_batch_norm_legit", + "torch._native_multi_head_attention", + "torch._neg_view_copy", + "torch._neg_view", + "torch._nested_from_padded_and_nested_example", + "torch._nested_tensor_from_mask_left_aligned", + "torch._nested_tensor_from_tensor_list", + "torch._nested_tensor_softmax_with_shape", + "torch._nested_view_from_buffer_copy", + "torch._nested_view_from_buffer", + "torch._nnpack_available", + "torch._nnpack_spatial_convolution", + "torch._pack_padded_sequence", + "torch._pad_packed_sequence", + "torch._pin_memory", + "torch._prelu_kernel", + "torch._propagate_xla_data", + "torch._remove_batch_dim", + "torch._reshape_alias_copy", + "torch._reshape_from_tensor", + "torch._resize_output_", + "torch._rowwise_prune", + "torch._sample_dirichlet", + "torch._saturate_weight_to_fp16", + "torch._scaled_dot_product_attention_math", + "torch._scaled_dot_product_efficient_attention", + "torch._scaled_dot_product_flash_attention", + "torch._scaled_dot_product_flash_attention_for_cpu", + "torch._scaled_dot_product_cudnn_attention", + "torch._scaled_mm", + "torch._shape_as_tensor", + "torch._sobol_engine_draw", + "torch._sobol_engine_ff_", + "torch._sobol_engine_initialize_state_", + "torch._sobol_engine_scramble_", + "torch._softmax_backward_data", + "torch._softmax", + "torch._sparse_broadcast_to_copy", + "torch._sparse_broadcast_to", + "torch._sparse_csr_prod", + "torch._sparse_csr_sum", + "torch._sparse_log_softmax_backward_data", + "torch._sparse_semi_structured_linear", + "torch._sparse_softmax_backward_data", + "torch._sparse_sparse_matmul", + "torch._sparse_sum", + "torch._stack", + "torch._standard_gamma_grad", + "torch._standard_gamma", + "torch._test_autograd_multiple_dispatch_view_copy", + "torch._test_autograd_multiple_dispatch_view", + "torch._test_autograd_multiple_dispatch", + "torch._test_check_tensor", + "torch._test_functorch_fallback", + "torch._test_serialization_subcmul", + "torch._to_cpu", + "torch._to_functional_tensor", + "torch._to_sparse_semi_structured", + "torch._transform_bias_rescale_qkv", + "torch._transformer_encoder_layer_fwd", + "torch._trilinear", + "torch._triton_multi_head_attention", + "torch._triton_scaled_dot_attention", + "torch._unique", + "torch._unique2", + "torch._unpack_dual", + "torch._unsafe_index_put", + "torch._unsafe_index", + "torch._use_cudnn_ctc_loss", + "torch._use_cudnn_rnn_flatten_weight", + "torch._values_copy", + "torch._weight_int4pack_mm", + "torch._weight_int8pack_mm", + "torch._weight_norm_interface", + "torch._weight_norm", + "torch.abs_", + "torch.abs", + "torch.absolute", + "torch.acos_", + "torch.acos", + "torch.acosh_", + "torch.acosh", + "torch.adaptive_avg_pool1d", + "torch.adaptive_max_pool1d", + "torch.add", + "torch.addbmm", + "torch.addcdiv", + "torch.addcmul", + "torch.addmm", + "torch.addmv_", + 
"torch.addmv", + "torch.addr", + "torch.adjoint", + "torch.affine_grid_generator", + "torch.alias_copy", + "torch.all", + "torch.allclose", + "torch.alpha_dropout_", + "torch.alpha_dropout", + "torch.amax", + "torch.amin", + "torch.aminmax", + "torch.angle", + "torch.any", + "torch.arange", + "torch.arccos_", + "torch.arccos", + "torch.arccosh_", + "torch.arccosh", + "torch.arcsin_", + "torch.arcsin", + "torch.arcsinh_", + "torch.arcsinh", + "torch.arctan_", + "torch.arctan", + "torch.arctan2", + "torch.arctanh_", + "torch.arctanh", + "torch.argmax", + "torch.argmin", + "torch.argsort", + "torch.argwhere", + "torch.as_strided_", + "torch.as_strided_copy", + "torch.as_strided_scatter", + "torch.as_strided", + "torch.as_tensor", + "torch.asarray", + "torch.asin_", + "torch.asin", + "torch.asinh_", + "torch.asinh", + "torch.atan_", + "torch.atan", + "torch.atan2", + "torch.atanh_", + "torch.atanh", + "torch.avg_pool1d", + "torch.baddbmm", + "torch.bartlett_window", + "torch.batch_norm_backward_elemt", + "torch.batch_norm_backward_reduce", + "torch.batch_norm_elemt", + "torch.batch_norm_gather_stats_with_counts", + "torch.batch_norm_gather_stats", + "torch.batch_norm_stats", + "torch.batch_norm_update_stats", + "torch.batch_norm", + "torch.bernoulli", + "torch.bilinear", + "torch.binary_cross_entropy_with_logits", + "torch.bincount", + "torch.binomial", + "torch.bitwise_and", + "torch.bitwise_left_shift", + "torch.bitwise_not", + "torch.bitwise_or", + "torch.bitwise_right_shift", + "torch.bitwise_xor", + "torch.blackman_window", + "torch.bmm", + "torch.broadcast_to", + "torch.bucketize", + "torch.can_cast", + "torch.cat", + "torch.ccol_indices_copy", + "torch.ceil_", + "torch.ceil", + "torch.celu_", + "torch.celu", + "torch.channel_shuffle", + "torch.cholesky_inverse", + "torch.cholesky_solve", + "torch.cholesky", + "torch.choose_qparams_optimized", + "torch.chunk", + "torch.clamp_", + "torch.clamp_max_", + "torch.clamp_max", + "torch.clamp_min_", + "torch.clamp_min", + "torch.clamp", + "torch.clip_", + "torch.clip", + "torch.clone", + "torch.col_indices_copy", + "torch.column_stack", + "torch.combinations", + "torch.complex", + "torch.concat", + "torch.concatenate", + "torch.conj_physical_", + "torch.conj_physical", + "torch.conj", + "torch.constant_pad_nd", + "torch.conv_tbc", + "torch.conv_transpose1d", + "torch.conv_transpose2d", + "torch.conv_transpose3d", + "torch.conv1d", + "torch.conv2d", + "torch.conv3d", + "torch.convolution", + "torch.copysign", + "torch.corrcoef", + "torch.cos_", + "torch.cos", + "torch.cosh_", + "torch.cosh", + "torch.cosine_embedding_loss", + "torch.cosine_similarity", + "torch.count_nonzero", + "torch.cov", + "torch.cross", + "torch.crow_indices_copy", + "torch.ctc_loss", + "torch.cudnn_affine_grid_generator", + "torch.cudnn_batch_norm", + "torch.cudnn_convolution_add_relu", + "torch.cudnn_convolution_relu", + "torch.cudnn_convolution_transpose", + "torch.cudnn_convolution", + "torch.cudnn_grid_sampler", + "torch.cudnn_is_acceptable", + "torch.cummax", + "torch.cummin", + "torch.cumprod", + "torch.cumsum", + "torch.cumulative_trapezoid", + "torch.deg2rad_", + "torch.deg2rad", + "torch.dequantize", + "torch.det", + "torch.detach_", + "torch.detach_copy", + "torch.detach", + "torch.diag_embed", + "torch.diag", + "torch.diagflat", + "torch.diagonal_copy", + "torch.diagonal_scatter", + "torch.diagonal", + "torch.diff", + "torch.digamma", + "torch.dist", + "torch.div", + "torch.divide", + "torch.dot", + "torch.dropout_", + "torch.dropout", + "torch.dsmm", + 
"torch.dsplit", + "torch.dstack", + "torch.embedding_bag", + "torch.embedding_renorm_", + "torch.embedding", + "torch.empty_like", + "torch.empty_permuted", + "torch.empty_quantized", + "torch.empty_strided", + "torch.empty", + "torch.eq", + "torch.equal", + "torch.erf_", + "torch.erf", + "torch.erfc_", + "torch.erfc", + "torch.erfinv", + "torch.exp_", + "torch.exp", + "torch.exp2_", + "torch.exp2", + "torch.expand_copy", + "torch.expm1_", + "torch.expm1", + "torch.eye", + "torch.fake_quantize_per_channel_affine", + "torch.fake_quantize_per_tensor_affine", + "torch.fbgemm_linear_fp16_weight_fp32_activation", + "torch.fbgemm_linear_fp16_weight", + "torch.fbgemm_linear_int8_weight_fp32_activation", + "torch.fbgemm_linear_int8_weight", + "torch.fbgemm_linear_quantize_weight", + "torch.fbgemm_pack_gemm_matrix_fp16", + "torch.fbgemm_pack_quantized_matrix", + "torch.feature_alpha_dropout_", + "torch.feature_alpha_dropout", + "torch.feature_dropout_", + "torch.feature_dropout", + "torch.fill_", + "torch.fill", + "torch.fix_", + "torch.fix", + "torch.flatten", + "torch.flip", + "torch.fliplr", + "torch.flipud", + "torch.float_power", + "torch.floor_", + "torch.floor_divide", + "torch.floor", + "torch.fmax", + "torch.fmin", + "torch.fmod", + "torch.frac_", + "torch.frac", + "torch.frexp", + "torch.frobenius_norm", + "torch.from_file", + "torch.from_numpy", + "torch.frombuffer", + "torch.full_like", + "torch.full", + "torch.fused_moving_avg_obs_fake_quant", + "torch.gather", + "torch.gcd_", + "torch.gcd", + "torch.ge", + "torch.geqrf", + "torch.ger", + "torch.get_device", + "torch.gradient", + "torch.greater_equal", + "torch.greater", + "torch.grid_sampler_2d", + "torch.grid_sampler_3d", + "torch.grid_sampler", + "torch.group_norm", + "torch.gru_cell", + "torch.gru", + "torch.gt", + "torch.hamming_window", + "torch.hann_window", + "torch.hardshrink", + "torch.heaviside", + "torch.hinge_embedding_loss", + "torch.histc", + "torch.histogram", + "torch.histogramdd", + "torch.hsmm", + "torch.hsplit", + "torch.hspmm", + "torch.hstack", + "torch.hypot", + "torch.i0_", + "torch.i0", + "torch.igamma", + "torch.igammac", + "torch.imag", + "torch.index_add", + "torch.index_copy", + "torch.index_fill", + "torch.index_put_", + "torch.index_put", + "torch.index_reduce", + "torch.index_select", + "torch.indices_copy", + "torch.inner", + "torch.instance_norm", + "torch.int_repr", + "torch.inverse", + "torch.is_complex", + "torch.is_conj", + "torch.is_distributed", + "torch.is_floating_point", + "torch.is_inference", + "torch.is_neg", + "torch.is_nonzero", + "torch.is_same_size", + "torch.is_signed", + "torch.is_vulkan_available", + "torch.isclose", + "torch.isfinite", + "torch.isin", + "torch.isinf", + "torch.isnan", + "torch.isneginf", + "torch.isposinf", + "torch.isreal", + "torch.istft", + "torch.kaiser_window", + "torch.kl_div", + "torch.kron", + "torch.kthvalue", + "torch.layer_norm", + "torch.lcm_", + "torch.lcm", + "torch.ldexp_", + "torch.ldexp", + "torch.le", + "torch.lerp", + "torch.less_equal", + "torch.less", + "torch.lgamma", + "torch.linspace", + "torch.log_", + "torch.log_softmax", + "torch.log", + "torch.log10_", + "torch.log10", + "torch.log1p_", + "torch.log1p", + "torch.log2_", + "torch.log2", + "torch.logaddexp", + "torch.logaddexp2", + "torch.logcumsumexp", + "torch.logdet", + "torch.logical_and", + "torch.logical_not", + "torch.logical_or", + "torch.logical_xor", + "torch.logit_", + "torch.logit", + "torch.logspace", + "torch.logsumexp", + "torch.lstm_cell", + "torch.lstm", + "torch.lt", + 
"torch.lu_solve", + "torch.lu_unpack", + "torch.margin_ranking_loss", + "torch.masked_fill", + "torch.masked_scatter", + "torch.masked_select", + "torch.matmul", + "torch.matrix_exp", + "torch.matrix_power", + "torch.max_pool1d_with_indices", + "torch.max_pool1d", + "torch.max_pool2d", + "torch.max_pool3d", + "torch.max", + "torch.maximum", + "torch.mean", + "torch.median", + "torch.min", + "torch.minimum", + "torch.miopen_batch_norm", + "torch.miopen_convolution_add_relu", + "torch.miopen_convolution_relu", + "torch.miopen_convolution_transpose", + "torch.miopen_convolution", + "torch.miopen_depthwise_convolution", + "torch.miopen_rnn", + "torch.mkldnn_adaptive_avg_pool2d", + "torch.mkldnn_convolution", + "torch.mkldnn_linear_backward_weights", + "torch.mkldnn_max_pool2d", + "torch.mkldnn_max_pool3d", + "torch.mkldnn_rnn_layer", + "torch.mm", + "torch.mode", + "torch.moveaxis", + "torch.movedim", + "torch.msort", + "torch.mul", + "torch.multinomial", + "torch.multiply", + "torch.mv", + "torch.mvlgamma", + "torch.nan_to_num_", + "torch.nan_to_num", + "torch.nanmean", + "torch.nanmedian", + "torch.nanquantile", + "torch.nansum", + "torch.narrow_copy", + "torch.narrow", + "torch.native_batch_norm", + "torch.native_channel_shuffle", + "torch.native_dropout", + "torch.native_group_norm", + "torch.native_layer_norm", + "torch.native_norm", + "torch.ne", + "torch.neg_", + "torch.neg", + "torch.negative_", + "torch.negative", + "torch.nextafter", + "torch.nonzero_static", + "torch.nonzero", + "torch.norm_except_dim", + "torch.normal", + "torch.not_equal", + "torch.nuclear_norm", + "torch.numel", + "torch.obj", + "torch.ones_like", + "torch.ones", + "torch.orgqr", + "torch.ormqr", + "torch.outer", + "torch.pairwise_distance", + "torch.pdist", + "torch.permute_copy", + "torch.permute", + "torch.pinverse", + "torch.pixel_shuffle", + "torch.pixel_unshuffle", + "torch.poisson_nll_loss", + "torch.poisson", + "torch.polar", + "torch.polygamma", + "torch.positive", + "torch.pow", + "torch.prelu", + "torch._print", + "torch.prod", + "torch.promote_types", + "torch.put", + "torch.q_per_channel_axis", + "torch.q_per_channel_scales", + "torch.q_per_channel_zero_points", + "torch.q_scale", + "torch.q_zero_point", + "torch.qr", + "torch.quantile", + "torch.quantize_per_channel", + "torch.quantize_per_tensor_dynamic", + "torch.quantize_per_tensor", + "torch.quantized_batch_norm", + "torch.quantized_gru_cell", + "torch.quantized_lstm_cell", + "torch.quantized_max_pool1d", + "torch.quantized_max_pool2d", + "torch.quantized_max_pool3d", + "torch.quantized_rnn_relu_cell", + "torch.quantized_rnn_tanh_cell", + "torch.rad2deg_", + "torch.rad2deg", + "torch.rand_like", + "torch.rand", + "torch.randint_like", + "torch.randint", + "torch.randn_like", + "torch.randn", + "torch.randperm", + "torch.range", + "torch.ravel", + "torch.real", + "torch.reciprocal_", + "torch.reciprocal", + "torch.relu_", + "torch.relu", + "torch.remainder", + "torch.renorm", + "torch.repeat_interleave", + "torch.reshape", + "torch.resolve_conj", + "torch.resolve_neg", + "torch.result_type", + "torch.rnn_relu_cell", + "torch.rnn_relu", + "torch.rnn_tanh_cell", + "torch.rnn_tanh", + "torch.roll", + "torch.rot90", + "torch.round_", + "torch.round", + "torch.row_indices_copy", + "torch.row_stack", + "torch.rrelu_", + "torch.rrelu", + "torch.rsqrt_", + "torch.rsqrt", + "torch.rsub", + "torch.saddmm", + "torch.scalar_tensor", + "torch.scatter_add", + "torch.scatter_reduce", + "torch.scatter", + "torch.searchsorted", + "torch.segment_reduce", + 
"torch.select_copy", + "torch.select_scatter", + "torch.select", + "torch.selu_", + "torch.selu", + "torch.sgn", + "torch.sigmoid_", + "torch.sigmoid", + "torch.sign", + "torch.signal.windows.windows.sqrt", + "torch.signbit", + "torch.sin_", + "torch.sin", + "torch.sinc_", + "torch.sinc", + "torch.sinh_", + "torch.sinh", + "torch.slice_copy", + "torch.slice_scatter", + "torch.slogdet", + "torch.smm", + "torch.softmax", + "torch.sort", + "torch.split_copy", + "torch.split_with_sizes_copy", + "torch.split_with_sizes", + "torch.spmm", + "torch.sqrt_", + "torch.sqrt", + "torch.square_", + "torch.square", + "torch.squeeze_copy", + "torch.squeeze", + "torch.sspaddmm", + "torch.stack", + "torch.std_mean", + "torch.std", + "torch.sub", + "torch.subtract", + "torch.sum", + "torch.svd", + "torch.swapaxes", + "torch.swapdims", + "torch.sym_constrain_range_for_size", + "torch.sym_constrain_range", + "torch.t_copy", + "torch.t", + "torch.take_along_dim", + "torch.take", + "torch.tan_", + "torch.tan", + "torch.tanh_", + "torch.tanh", + "torch.tensor_split", + "torch.tensor", + "torch.threshold_", + "torch.threshold", + "torch.tile", + "torch.topk", + "torch.trace", + "torch.transpose_copy", + "torch.transpose", + "torch.trapezoid", + "torch.trapz", + "torch.triangular_solve", + "torch.tril_indices", + "torch.tril", + "torch.triplet_margin_loss", + "torch.triu_indices", + "torch.triu", + "torch.true_divide", + "torch.trunc_", + "torch.trunc", + "torch.unbind_copy", + "torch.unbind", + "torch.unflatten", + "torch.unfold_copy", + "torch.unsafe_chunk", + "torch.unsafe_split_with_sizes", + "torch.unsafe_split", + "torch.unsqueeze_copy", + "torch.unsqueeze", + "torch.values_copy", + "torch.vander", + "torch.var_mean", + "torch.var", + "torch.vdot", + "torch.view_as_complex_copy", + "torch.view_as_complex", + "torch.view_as_real_copy", + "torch.view_as_real", + "torch.view_copy", + "torch.vsplit", + "torch.vstack", + "torch.where", + "torch.xlogy_", + "torch.xlogy", + "torch.zero_", + "torch.zeros", + "torch._fused_sgd_", + "torch.slice_inverse", + "torch._assert_scalar", + "torch._functional_assert_scalar", + ], + TorchInGraphFunctionVariable, +) + + +if sys.version_info >= (3, 9): + torch_c_binding_in_graph_functions["math.lcm"] = TorchInGraphFunctionVariable +if sys.version_info >= (3, 11): + torch_c_binding_in_graph_functions["math.exp2"] = TorchInGraphFunctionVariable + torch_c_binding_in_graph_functions["math.cbrt"] = TorchInGraphFunctionVariable + + +# In graph functions (including constant folding) that are not C bindings +torch_non_c_binding_in_graph_functions = dict.fromkeys( + [ + "torch.__future__.get_overwrite_module_params_on_conversion", + "torch.__future__.set_overwrite_module_params_on_conversion", + "torch.__getattr__", + "torch._assert", + "torch._check_index", + "torch._check_is_size", + "torch._check_not_implemented", + "torch._check_tensor_all_with", + "torch._check_tensor_all", + "torch._check_type", + "torch._check_value", + "torch._check_with", + "torch._check", + "torch._compile._disable_dynamo", + "torch._functorch.apis.chunk_vmap", + "torch._functorch.autograd_function.custom_function_call_functionalize", + "torch._functorch.autograd_function.custom_function_call_grad", + "torch._functorch.autograd_function.custom_function_call_vmap_generate_rule", + "torch._functorch.autograd_function.custom_function_call_vmap", + "torch._functorch.autograd_function.generate_single_level_function", + "torch._functorch.autograd_function.get_tangents_in_dims", + 
"torch._functorch.autograd_function.has_overriden_vmap_rule", + "torch._functorch.autograd_function.reductify_leaf", + "torch._functorch.autograd_function.reductify", + "torch._functorch.autograd_function.validate_vmap_returns_tuple_of_two_elements", + "torch._functorch.autograd_function.vmapify_autograd_function", + "torch._functorch.autograd_function.wrap_outputs_maintaining_identity", + "torch._functorch.batch_norm_replacement.batch_norm_without_running_stats", + "torch._functorch.batch_norm_replacement.replace_all_batch_norm_modules_", + "torch._functorch.deprecated.combine_state_for_ensemble", + "torch._functorch.deprecated.functionalize", + "torch._functorch.deprecated.get_warning", + "torch._functorch.deprecated.grad_and_value", + "torch._functorch.deprecated.hessian", + "torch._functorch.deprecated.jacfwd", + "torch._functorch.deprecated.jacrev", + "torch._functorch.deprecated.jvp", + "torch._functorch.deprecated.make_functional_with_buffers", + "torch._functorch.deprecated.make_functional", + "torch._functorch.deprecated.setup_docs", + "torch._functorch.deprecated.vjp", + "torch._functorch.deprecated.warn_deprecated", + "torch._functorch.eager_transforms._any_differentiable", + "torch._functorch.eager_transforms._autograd_grad", + "torch._functorch.eager_transforms._construct_standard_basis_for", + "torch._functorch.eager_transforms._vjp_treespec_compare", + "torch._functorch.eager_transforms._set_tensor_requires_grad", + "torch._functorch.eager_transforms._is_differentiable", + "torch._functorch.eager_transforms._jvp_with_argnums", + "torch._functorch.eager_transforms._maybe_unwrap_functional_tensor", + "torch._functorch.eager_transforms._maybe_wrap_functional_tensor", + "torch._functorch.eager_transforms._replace_args", + "torch._functorch.eager_transforms._unwrap_all_tensors_from_functional", + "torch._functorch.eager_transforms._wrap_all_tensors_to_functional", + "torch._functorch.eager_transforms.assert_flat_tuple_of_tensors", + "torch._functorch.eager_transforms.assert_non_empty_list_of_tensors", + "torch._functorch.eager_transforms.assert_output_is_tensor_or_tensors", + "torch._functorch.eager_transforms.functionalize", + "torch._functorch.eager_transforms.hessian", + "torch._functorch.eager_transforms.jacfwd", + "torch._functorch.eager_transforms.jvp", + "torch._functorch.eager_transforms.lazy_dynamo_disable", + "torch._functorch.eager_transforms.linearize", + "torch._functorch.eager_transforms.noop", + "torch._functorch.eager_transforms.safe_unflatten", + "torch._functorch.eager_transforms.safe_unpack_dual", + "torch._functorch.functional_call.construct_stacked_leaf", + "torch._functorch.functional_call.functional_call", + "torch._functorch.functional_call.stack_module_state", + "torch._functorch.pyfunctorch.coerce_cinterpreter", + "torch._functorch.pyfunctorch.dispatch_functorch", + "torch._functorch.pyfunctorch.nested", + "torch._functorch.pyfunctorch.retrieve_current_functorch_interpreter", + "torch._functorch.pyfunctorch.temporarily_pop_interpreter_stack", + "torch._functorch.utils.enable_single_level_autograd_function", + "torch._functorch.utils.exposed_in", + "torch._functorch.utils.unwrap_dead_wrappers", + "torch._functorch.vmap.lazy_load_decompositions", + "torch._guards.compile_context", + "torch._guards.detect_fake_mode", + "torch._guards.tracing", + "torch._higher_order_ops.map._has_potential_branch_input_alias", + "torch._higher_order_ops.map._has_potential_branch_input_mutation", + "torch._higher_order_ops.map._stack_pytree", + 
"torch._higher_order_ops.map._unstack_pytree", + "torch._higher_order_ops.map.create_fw_bw_graph", + "torch._higher_order_ops.map.map_autograd", + "torch._higher_order_ops.map.map_dense", + "torch._higher_order_ops.map.map_fake_tensor_mode", + "torch._higher_order_ops.map.map_functionalize", + "torch._higher_order_ops.map.map_proxy_torch_dispatch_mode", + "torch._higher_order_ops.map.map_wrapper", + "torch._higher_order_ops.map.trace_map", + "torch._higher_order_ops.out_dtype.elementwise_dtypes", + "torch._higher_order_ops.out_dtype.is_int_mm", + "torch._higher_order_ops.out_dtype.out_dtype_dense", + "torch._higher_order_ops.out_dtype.out_dtype_fake_tensor_mode", + "torch._higher_order_ops.out_dtype.out_dtype_fallback", + "torch._higher_order_ops.out_dtype.out_dtype_func", + "torch._higher_order_ops.out_dtype.out_dtype_proxy", + "torch._higher_order_ops.out_dtype.trace_out_dtype", + "torch._higher_order_ops.utils.autograd_not_implemented_inner", + "torch._higher_order_ops.utils.autograd_not_implemented", + "torch._linalg_utils._symeig", + "torch._linalg_utils.basis", + "torch._linalg_utils.bform", + "torch._linalg_utils.conjugate", + "torch._linalg_utils.eig", + "torch._linalg_utils.get_floating_dtype", + "torch._linalg_utils.is_sparse", + "torch._linalg_utils.lstsq", + "torch._linalg_utils.matmul", + "torch._linalg_utils.matrix_rank", + "torch._linalg_utils.qform", + "torch._linalg_utils.solve", + "torch._linalg_utils.symeig", + "torch._linalg_utils.transjugate", + "torch._linalg_utils.transpose", + "torch._load_global_deps", + "torch._lowrank._svd_lowrank", + "torch._lowrank.get_approximate_basis", + "torch._lowrank.pca_lowrank", + "torch._lowrank.svd_lowrank", + "torch._ops._compute_keyset", + "torch._ops._get_tensors", + "torch._ops._to_flat_tuple", + "torch._ops.add_cached_op", + "torch._ops.dl_open_guard", + "torch._ops.get_cached_ops", + "torch._ops.key_extractor", + "torch._ops.reset_cached_ops", + "torch._ops.resolve_key", + "torch._preload_cuda_deps", + "torch._register_device_module", + "torch._running_with_deploy", + "torch._utils._dummy_type", + "torch._weights_only_unpickler._get_allowed_globals", + "torch._weights_only_unpickler.load", + "torch.align_tensors", + "torch.amp.autocast_mode._enter_autocast", + "torch.amp.autocast_mode._exit_autocast", + "torch.amp.autocast_mode.autocast_decorator", + "torch.are_deterministic_algorithms_enabled", + "torch.atleast_1d", + "torch.atleast_2d", + "torch.atleast_3d", + "torch.autograd._calculate_shape", + "torch.autograd._is_checkpoint_valid", + "torch.autograd._make_grads", + "torch.autograd._register_py_tensor_class_for_device", + "torch.autograd._tensor_or_tensors_to_tuple", + "torch.autograd.backward", + "torch.autograd.forward_ad.enter_dual_level", + "torch.autograd.forward_ad.exit_dual_level", + "torch.autograd.forward_ad.make_dual", + "torch.autograd.forward_ad.unpack_dual", + "torch.autograd.function._iter_filter", + "torch.autograd.function._iter_jit_values", + "torch.autograd.function._iter_None_tensors", + "torch.autograd.function._iter_tensors_permissive", + "torch.autograd.function._iter_tensors", + "torch.autograd.function._jit_unwrap_structured", + "torch.autograd.function._map_tensor_data", + "torch.autograd.function._nested_map", + "torch.autograd.function._unflatten", + "torch.autograd.function.once_differentiable", + "torch.autograd.function.traceable", + "torch.autograd.functional._as_tuple_nocheck", + "torch.autograd.functional._as_tuple", + "torch.autograd.functional._autograd_grad", + 
"torch.autograd.functional._check_requires_grad", + "torch.autograd.functional._construct_standard_basis_for", + "torch.autograd.functional._fill_in_zeros", + "torch.autograd.functional._grad_postprocess", + "torch.autograd.functional._grad_preprocess", + "torch.autograd.functional._jacfwd", + "torch.autograd.functional._tuple_postprocess", + "torch.autograd.functional._validate_v", + "torch.autograd.functional.hessian", + "torch.autograd.functional.hvp", + "torch.autograd.functional.jacobian", + "torch.autograd.functional.jvp", + "torch.autograd.functional.vhp", + "torch.autograd.functional.vjp", + "torch.autograd.grad_mode._enter_inference_mode", + "torch.autograd.grad_mode._exit_inference_mode", + "torch.autograd.graph._get_sid", + "torch.autograd.graph._get_tid", + "torch.autograd.graph.allow_mutation_on_saved_tensors", + "torch.autograd.graph.get_gradient_edge", + "torch.autograd.graph.increment_version", + "torch.autograd.graph.register_multi_grad_hook", + "torch.autograd.variable", + "torch.backends.__allow_nonbracketed_mutation", + "torch.backends.cpu.get_cpu_capability", + "torch.backends.cuda.can_use_efficient_attention", + "torch.backends.cuda.can_use_flash_attention", + "torch.backends.cuda.enable_flash_sdp", + "torch.backends.cuda.enable_math_sdp", + "torch.backends.cuda.enable_mem_efficient_sdp", + "torch.backends.cuda.flash_sdp_enabled", + "torch.backends.cuda.is_built", + "torch.backends.cuda.math_sdp_enabled", + "torch.backends.cuda.mem_efficient_sdp_enabled", + "torch.backends.cuda.cudnn_sdp_enabled", + "torch.backends.cuda.enable_cudnn_sdp", + "torch.backends.cuda.preferred_linalg_library", + "torch.backends.cuda.sdp_kernel", + "torch.backends.cudnn._init", + "torch.backends.cudnn.flags", + "torch.backends.cudnn.is_acceptable", + "torch.backends.cudnn.is_available", + "torch.backends.cudnn.set_flags", + "torch.backends.cudnn.version", + "torch.backends.disable_global_flags", + "torch.backends.flags_frozen", + "torch.backends.mkl.is_available", + "torch.backends.mkldnn.flags", + "torch.backends.mkldnn.is_available", + "torch.backends.mkldnn.set_flags", + "torch.backends.mps._init", + "torch.backends.mps.is_available", + "torch.backends.mps.is_built", + "torch.backends.mps.is_macos13_or_newer", + "torch.backends.openmp.is_available", + "torch.backends.quantized._get_qengine_id", + "torch.backends.quantized._get_qengine_str", + "torch.block_diag", + "torch.broadcast_tensors", + "torch.cartesian_prod", + "torch.cdist", + "torch.chain_matmul", + "torch.compile", + "torch.compiled_with_cxx11_abi", + "torch.cpu._is_cpu_support_vnni", + "torch.cpu.current_device", + "torch.cpu.current_stream", + "torch.cpu.device_count", + "torch.cpu.is_available", + "torch.cpu.set_device", + "torch.cpu.stream", + "torch.cpu.synchronize", + "torch.cuda._check_capability", + "torch.cuda._check_cubins", + "torch.cuda._device_count_nvml", + "torch.cuda._get_device", + "torch.cuda._get_generator", + "torch.cuda._get_nvml_device_index", + "torch.cuda._get_pynvml_handler", + "torch.cuda._get_rng_state_offset", + "torch.cuda._is_compiled", + "torch.cuda._lazy_call", + "torch.cuda._lazy_init", + "torch.cuda._memory_viz._block_extra_legacy", + "torch.cuda._memory_viz._block_extra", + "torch.cuda._memory_viz._format_size", + "torch.cuda._memory_viz._format_viz", + "torch.cuda._memory_viz._frame_filter", + "torch.cuda._memory_viz._frame_fmt", + "torch.cuda._memory_viz._frames_fmt", + "torch.cuda._memory_viz._profile_to_snapshot", + "torch.cuda._memory_viz._report_free", + 
"torch.cuda._memory_viz._write_blocks", + "torch.cuda._memory_viz.calc_active", + "torch.cuda._memory_viz.compare", + "torch.cuda._memory_viz.format_flamegraph", + "torch.cuda._memory_viz.memory", + "torch.cuda._memory_viz.profile_plot", + "torch.cuda._memory_viz.segment_plot", + "torch.cuda._memory_viz.segments", + "torch.cuda._memory_viz.segsum", + "torch.cuda._memory_viz.trace_plot", + "torch.cuda._memory_viz.trace", + "torch.cuda._nvml_based_avail", + "torch.cuda._parse_visible_devices", + "torch.cuda._raw_device_count_nvml", + "torch.cuda._raw_device_uuid_nvml", + "torch.cuda._register_triton_kernels", + "torch.cuda._set_rng_state_offset", + "torch.cuda._set_stream_by_id", + "torch.cuda._sleep", + "torch.cuda._transform_uuid_to_ordinals", + "torch.cuda._utils._get_device_index", + "torch.cuda.amp.autocast_mode._cast", + "torch.cuda.amp.autocast_mode.custom_bwd", + "torch.cuda.amp.autocast_mode.custom_fwd", + "torch.cuda.amp.common.amp_definitely_not_available", + "torch.amp.grad_scaler._refresh_per_optimizer_state", + "torch.cuda.can_device_access_peer", + "torch.cuda.check_error", + "torch.cuda.clock_rate", + "torch.cuda.cudart", + "torch.cuda.current_blas_handle", + "torch.cuda.current_stream", + "torch.cuda.default_stream", + "torch.cuda.device_count", + "torch.cuda.get_arch_list", + "torch.cuda.get_device_capability", + "torch.cuda.get_device_name", + "torch.cuda.get_device_properties", + "torch.cuda.get_gencode_flags", + "torch.cuda.get_sync_debug_mode", + "torch.cuda.graphs.graph_pool_handle", + "torch.cuda.graphs.is_current_stream_capturing", + "torch.cuda.graphs.make_graphed_callables", + "torch.cuda.init", + "torch.cuda.ipc_collect", + "torch.cuda.is_available", + "torch.cuda.is_bf16_supported", + "torch.cuda.is_initialized", + "torch.cuda.jiterator._create_jit_fn", + "torch.cuda.jiterator._create_multi_output_jit_fn", + "torch.cuda.memory_usage", + "torch.cuda.memory._dump_snapshot", + "torch.cuda.memory._free_mutex", + "torch.cuda.memory._get_current_allocator", + "torch.cuda.memory._host_allocator", + "torch.cuda.memory._record_memory_history_impl", + "torch.cuda.memory._record_memory_history_legacy", + "torch.cuda.memory._record_memory_history", + "torch.cuda.memory._save_memory_usage", + "torch.cuda.memory._save_segment_usage", + "torch.cuda.memory._set_allocator_settings", + "torch.cuda.memory._snapshot", + "torch.cuda.memory.caching_allocator_alloc", + "torch.cuda.memory.caching_allocator_delete", + "torch.cuda.memory.change_current_allocator", + "torch.cuda.memory.empty_cache", + "torch.cuda.memory.get_allocator_backend", + "torch.cuda.memory.list_gpu_processes", + "torch.cuda.memory.max_memory_allocated", + "torch.cuda.memory.max_memory_cached", + "torch.cuda.memory.max_memory_reserved", + "torch.cuda.memory.mem_get_info", + "torch.cuda.memory.memory_allocated", + "torch.cuda.memory.memory_cached", + "torch.cuda.memory.memory_reserved", + "torch.cuda.memory.memory_snapshot", + "torch.cuda.memory.memory_stats_as_nested_dict", + "torch.cuda.memory.memory_stats", + "torch.cuda.memory.memory_summary", + "torch.cuda.memory.reset_accumulated_memory_stats", + "torch.cuda.memory.reset_max_memory_allocated", + "torch.cuda.memory.reset_max_memory_cached", + "torch.cuda.memory.reset_peak_memory_stats", + "torch.cuda.memory.set_per_process_memory_fraction", + "torch.cuda.nccl._check_sequence_type", + "torch.cuda.nccl.all_gather", + "torch.cuda.nccl.all_reduce", + "torch.cuda.nccl.broadcast", + "torch.cuda.nccl.init_rank", + "torch.cuda.nccl.is_available", + 
"torch.cuda.nccl.reduce_scatter", + "torch.cuda.nccl.reduce", + "torch.cuda.nccl.unique_id", + "torch.cuda.nccl.version", + "torch.cuda.nvtx.mark", + "torch.cuda.nvtx.range_end", + "torch.cuda.nvtx.range_pop", + "torch.cuda.nvtx.range_push", + "torch.cuda.nvtx.range_start", + "torch.cuda.nvtx.range", + "torch.cuda.power_draw", + "torch.cuda.profiler.init", + "torch.cuda.profiler.profile", + "torch.cuda.profiler.start", + "torch.cuda.profiler.stop", + "torch.cuda.random.get_rng_state_all", + "torch.cuda.random.initial_seed", + "torch.cuda.random.manual_seed_all", + "torch.cuda.random.manual_seed", + "torch.cuda.random.seed_all", + "torch.cuda.random.seed", + "torch.cuda.random.set_rng_state_all", + "torch.cuda.set_stream", + "torch.cuda.set_sync_debug_mode", + "torch.cuda.stream", + "torch.cuda.synchronize", + "torch.cuda.temperature", + "torch.cuda.utilization", + "torch.einsum", + "torch.functional._check_list_size", + "torch.functional._consecutive_return_counts", + "torch.functional._consecutive_return_inverse_false", + "torch.functional._consecutive_return_inverse_true", + "torch.functional._consecutive_return_inverse", + "torch.functional._consecutive_return_output", + "torch.functional._lu_impl", + "torch.functional._lu_no_infos", + "torch.functional._lu_with_infos", + "torch.functional._meshgrid", + "torch.functional._return_counts", + "torch.functional._return_inverse_false", + "torch.functional._return_inverse_true", + "torch.functional._return_inverse", + "torch.functional._return_output", + "torch.functional._unique_consecutive_impl", + "torch.functional._unique_impl", + "torch.functional._unravel_index", + "torch.functional.broadcast_shapes", + "torch.functional.lu", + "torch.functional.unique", + "torch.functional.unravel_index", + "torch.futures.collect_all", + "torch.futures.wait_all", + "torch.get_deterministic_debug_mode", + "torch.get_float32_matmul_precision", + "torch.is_deterministic_algorithms_warn_only_enabled", + "torch.is_storage", + "torch.is_tensor", + "torch.is_warn_always_enabled", + "torch.masked._ops._any", + "torch.masked._ops._apply_docstring_templates", + "torch.masked._ops._canonical_dim", + "torch.masked._ops._combine_input_and_mask", + "torch.masked._ops._generate_docstring", + "torch.masked._ops._input_mask", + "torch.masked._ops._output_mask", + "torch.masked._ops._reduction_identity", + "torch.masked._ops._sparse_coo_flatten_indices", + "torch.masked._ops._sparse_coo_scatter_reduction_helper", + "torch.masked._ops._sparse_coo_where", + "torch.masked._ops._sparse_csr_segment_reduction_helper", + "torch.masked._ops._sparse_csr_where", + "torch.masked._ops._std_var", + "torch.masked._ops._where", + "torch.masked._ops.amax", + "torch.masked._ops.amin", + "torch.masked._ops.argmax", + "torch.masked._ops.argmin", + "torch.masked._ops.corresponding_real_dtype", + "torch.masked._ops.cumprod", + "torch.masked._ops.cumsum", + "torch.masked._ops.log_softmax", + "torch.masked._ops.logaddexp", + "torch.masked._ops.logsumexp", + "torch.masked._ops.mean", + "torch.masked._ops.median", + "torch.masked._ops.norm", + "torch.masked._ops.normalize", + "torch.masked._ops.prod", + "torch.masked._ops.softmax", + "torch.masked._ops.softmin", + "torch.masked._ops.std", + "torch.masked._ops.sum", + "torch.masked._ops.var", + "torch.meshgrid", + "torch.mps._get_default_mps_generator", + "torch.mps.current_allocated_memory", + "torch.mps.driver_allocated_memory", + "torch.mps.empty_cache", + "torch.mps.get_rng_state", + "torch.mps.manual_seed", + "torch.mps.profiler.profile", + 
"torch.mps.profiler.start", + "torch.mps.profiler.stop", + "torch.mps.seed", + "torch.mps.set_per_process_memory_fraction", + "torch.mps.set_rng_state", + "torch.mps.synchronize", + "torch.nested._internal.nested_tensor.get_tensor_symint", + "torch.nested._internal.nested_tensor.is_expandable_to", + "torch.nested._internal.nested_tensor.jagged_from_list", + "torch.nested._internal.nested_tensor.jagged_from_tensor_and_lengths", + "torch.nested._internal.nested_tensor.nested_view_from_values_offsets", + "torch.nested._internal.nested_tensor.nested_view_from_values_offsets_lengths", + "torch.nested.as_nested_tensor", + "torch.nested.narrow", + "torch.nested.nested_tensor", + "torch.nn._reduction.get_enum", + "torch.nn._reduction.legacy_get_enum", + "torch.nn._reduction.legacy_get_string", + "torch.nn.factory_kwargs", + "torch.nn.functional._adaptive_max_pool1d", + "torch.nn.functional._adaptive_max_pool2d", + "torch.nn.functional._adaptive_max_pool3d", + "torch.nn.functional._canonical_mask", + "torch.nn.functional._fractional_max_pool2d", + "torch.nn.functional._fractional_max_pool3d", + "torch.nn.functional._get_softmax_dim", + "torch.nn.functional._in_projection_packed", + "torch.nn.functional._in_projection", + "torch.nn.functional._is_integer", + "torch.nn.functional._max_pool1d", + "torch.nn.functional._max_pool2d", + "torch.nn.functional._max_pool3d", + "torch.nn.functional._mha_shape_check", + "torch.nn.functional._no_grad_embedding_renorm_", + "torch.nn.functional._none_or_dtype", + "torch.nn.functional._threshold", + "torch.nn.functional._unpool_output_size", + "torch.nn.functional._verify_batch_size", + "torch.nn.functional._verify_spatial_size", + "torch.nn.functional.adaptive_avg_pool2d", + "torch.nn.functional.adaptive_avg_pool3d", + "torch.nn.functional.adaptive_max_pool1d_with_indices", + "torch.nn.functional.adaptive_max_pool1d", + "torch.nn.functional.adaptive_max_pool2d_with_indices", + "torch.nn.functional.adaptive_max_pool2d", + "torch.nn.functional.adaptive_max_pool3d_with_indices", + "torch.nn.functional.adaptive_max_pool3d", + "torch.nn.functional.affine_grid", + "torch.nn.functional.alpha_dropout", + "torch.nn.functional.assert_int_or_pair", + "torch.nn.functional.batch_norm", + "torch.nn.functional.binary_cross_entropy_with_logits", + "torch.nn.functional.binary_cross_entropy", + "torch.nn.functional.celu", + "torch.nn.functional.cosine_embedding_loss", + "torch.nn.functional.cross_entropy", + "torch.nn.functional.ctc_loss", + "torch.nn.functional.dropout", + "torch.nn.functional.dropout1d", + "torch.nn.functional.dropout2d", + "torch.nn.functional.dropout3d", + "torch.nn.functional.elu", + "torch.nn.functional.embedding_bag", + "torch.nn.functional.embedding", + "torch.nn.functional.feature_alpha_dropout", + "torch.nn.functional.fold", + "torch.nn.functional.fractional_max_pool2d_with_indices", + "torch.nn.functional.fractional_max_pool2d", + "torch.nn.functional.fractional_max_pool3d_with_indices", + "torch.nn.functional.fractional_max_pool3d", + "torch.nn.functional.gaussian_nll_loss", + "torch.nn.functional.glu", + "torch.nn.functional.grid_sample", + "torch.nn.functional.group_norm", + "torch.nn.functional.gumbel_softmax", + "torch.nn.functional.hardsigmoid", + "torch.nn.functional.hardswish", + "torch.nn.functional.hardtanh", + "torch.nn.functional.hinge_embedding_loss", + "torch.nn.functional.huber_loss", + "torch.nn.functional.instance_norm", + "torch.nn.functional.interpolate", + "torch.nn.functional.kl_div", + "torch.nn.functional.l1_loss", + 
"torch.nn.functional.layer_norm", + "torch.nn.functional.leaky_relu", + "torch.nn.functional.local_response_norm", + "torch.nn.functional.log_softmax", + "torch.nn.functional.lp_pool1d", + "torch.nn.functional.lp_pool2d", + "torch.nn.functional.margin_ranking_loss", + "torch.nn.functional.max_pool1d_with_indices", + "torch.nn.functional.max_pool1d", + "torch.nn.functional.max_pool2d_with_indices", + "torch.nn.functional.max_pool2d", + "torch.nn.functional.max_pool3d_with_indices", + "torch.nn.functional.max_pool3d", + "torch.nn.functional.max_unpool1d", + "torch.nn.functional.max_unpool2d", + "torch.nn.functional.max_unpool3d", + "torch.nn.functional.mish", + "torch.nn.functional.mse_loss", + "torch.nn.functional.multi_head_attention_forward", + "torch.nn.functional.multi_margin_loss", + "torch.nn.functional.multilabel_margin_loss", + "torch.nn.functional.multilabel_soft_margin_loss", + "torch.nn.functional.nll_loss", + "torch.nn.functional.normalize", + "torch.nn.functional.poisson_nll_loss", + "torch.nn.functional.relu", + "torch.nn.functional.relu6", + "torch.nn.functional.rrelu", + "torch.nn.functional.selu", + "torch.nn.functional.sigmoid", + "torch.nn.functional.silu", + "torch.nn.functional.smooth_l1_loss", + "torch.nn.functional.soft_margin_loss", + "torch.nn.functional.softmax", + "torch.nn.functional.softmin", + "torch.nn.functional.softsign", + "torch.nn.functional.tanh", + "torch.nn.functional.tanhshrink", + "torch.nn.functional.triplet_margin_loss", + "torch.nn.functional.unfold", + "torch.nn.functional.upsample_bilinear", + "torch.nn.functional.upsample_nearest", + "torch.nn.functional.upsample", + "torch.nn.grad._pair", + "torch.nn.grad._single", + "torch.nn.grad._triple", + "torch.nn.grad.conv1d_input", + "torch.nn.grad.conv1d_weight", + "torch.nn.grad.conv2d_input", + "torch.nn.grad.conv2d_weight", + "torch.nn.grad.conv3d_input", + "torch.nn.grad.conv3d_weight", + "torch.nn.modules.activation._arg_requires_grad", + "torch.nn.modules.activation._check_arg_device", + "torch.nn.modules.activation._is_make_fx_tracing", + "torch.nn.modules.container._addindent", + "torch.nn.modules.transformer._detect_is_causal_mask", + "torch.nn.modules.transformer._generate_square_subsequent_mask", + "torch.nn.modules.transformer._get_activation_fn", + "torch.nn.modules.transformer._get_clones", + "torch.nn.modules.transformer._get_seq_len", + "torch.nn.modules.utils._list_with_default", + "torch.nn.modules.utils._ntuple", + "torch.nn.modules.utils._quadruple", + "torch.nn.modules.utils._reverse_repeat_tuple", + "torch.nn.modules.utils.consume_prefix_in_state_dict_if_present", + "torch.nn.parameter.is_lazy", + "torch.norm", + "torch.quantization.default_eval_fn", + "torch.random._seed_custom_device", + "torch.random.fork_rng", + "torch.random.initial_seed", + "torch.random.seed", + "torch.return_types.pytree_register_structseq", + "torch.set_default_device", + "torch.set_default_dtype", + "torch.set_default_tensor_type", + "torch.set_deterministic_debug_mode", + "torch.set_float32_matmul_precision", + "torch.set_warn_always", + "torch.signal.windows.windows._add_docstr", + "torch.signal.windows.windows._window_function_checks", + "torch.signal.windows.windows.bartlett", + "torch.signal.windows.windows.blackman", + "torch.signal.windows.windows.cosine", + "torch.signal.windows.windows.exponential", + "torch.signal.windows.windows.gaussian", + "torch.signal.windows.windows.general_cosine", + "torch.signal.windows.windows.general_hamming", + "torch.signal.windows.windows.hamming", + 
"torch.signal.windows.windows.hann", + "torch.signal.windows.windows.kaiser", + "torch.signal.windows.windows.merge_dicts", + "torch.signal.windows.windows.nuttall", + "torch.signal.windows.windows.parse_kwargs", + "torch.sparse.semi_structured.to_sparse_semi_structured", + "torch.sparse.sum", + "torch.split", + "torch.stft", + "torch.sym_float", + "torch.sym_int", + "torch.sym_ite", + "torch.sym_max", + "torch.sym_min", + "torch.sym_not", + "torch.tensordot", + "torch.typename", + "torch.unique_consecutive", + "torch.use_deterministic_algorithms", + ], + TorchInGraphFunctionVariable, +) + + +torch_name_rule_map = [ + manual_torch_name_rule_map, + torch_c_binding_in_graph_functions, + torch_non_c_binding_in_graph_functions, +] + + +""" +Generate the torch object - Dynamo tracing rule (the wrapping variable) map. +""" + + +@functools.lru_cache(None) +def get_torch_obj_rule_map(): + d: Dict[Any, VariableTracker] = dict() + for m in torch_name_rule_map: + for k, v in m.items(): # type: ignore[attr-defined] + obj = load_object(k) + if obj is not None: + if obj in d and d[obj] != v: + raise AssertionError( + f"Duplicate torch object {obj} with different rules: {v}, {d[obj]}" + ) + else: + d[obj] = v + return d + + +def _load_obj_from_str(fully_qualified_name): + module, obj_name = fully_qualified_name.rsplit(".", maxsplit=1) + return getattr(importlib.import_module(module), obj_name) + + +""" +Load string represented torch objects. +""" + + +def load_object(name): + try: + x = name.split("#") + if len(x) == 2: + obj = _load_obj_from_str(x[0]) + val = getattr(obj, x[1]) + else: + assert len(x) == 1, f"Invalid obj name {name}" + val = _load_obj_from_str(x[0]) + val = unwrap_if_wrapper(val) + except (AttributeError, ImportError): + val = None + return val + + +""" +Get all torch.Tensor methods which are allowed to be in graph functions. +""" + + +@functools.lru_cache(None) +def get_tensor_method(): + s = set() + for name in dir(torch.Tensor): + method = getattr(torch.Tensor, name) + if isinstance( + method, (types.MethodDescriptorType, types.WrapperDescriptorType) + ): + s.add(method) + return frozenset(s) + + +""" +Return if a torch object is ATen op or torch.Tensor method. +""" + + +def is_aten_op_or_tensor_method(obj): + return obj in get_tensor_method() or isinstance( + obj, + (torch._ops.OpOverloadPacket, torch._ops.OpOverload), + ) + + +class FunctionIdSet: + """ + Track a set of `id()`s of objects which are either allowed or not + allowed to go into the generated FX graph. Use to test for torch.*, + numpy.*, builtins.*, etc. + + Support user modification to permit customization of what can be + added to the graph and what will cause a graph break. 
+ """ + + function_ids: Optional[Set[int]] = None + function_names: Optional[Dict[int, str]] = None + + def __init__(self, lazy_initializer: Callable[[], Union[Dict[int, str], Set[int]]]): + self.lazy_initializer = lazy_initializer + + def __call__(self): + if self.function_ids is None: + value = self.lazy_initializer() + if isinstance(value, dict): + self.function_ids = set(value.keys()) + self.function_names = value + else: + assert isinstance(value, set) + self.function_ids = value + return self.function_ids + + def get_name(self, idx: int, default: str): + self() # lazy init + assert self.function_names is not None + return self.function_names.get(idx, default) + + def add(self, idx: int): + function_ids = self() # lazy init + function_ids.add(idx) + + def remove(self, idx: int): + function_ids = self() + if idx in function_ids: + function_ids.remove(idx) + + def __contains__(self, idx: int): + return idx in self() + + +@FunctionIdSet +def _allowed_callable_ids() -> Dict[int, str]: + rv: Dict[int, str] = {} + return rv + + +@FunctionIdSet +def _disallowed_callable_ids() -> Dict[int, str]: + rv: Dict[int, str] = {} + return rv + + +@FunctionIdSet +def _builtin_function_ids() -> Dict[int, str]: + rv = { + id(v): f"builtins.{k}" + for k, v in builtins.__dict__.items() + if not k.startswith("_") and callable(v) + } + rv.update( + { + id(v): f"operator.{k}" + for k, v in operator.__dict__.items() + if not k.startswith("_") and callable(v) + } + ) + rv.update( + {id(v): f"functools.{v.__name__}" for v in (itertools.chain, itertools.islice)} + ) + rv.update( + { + id(cast): "typing.cast", + id(functools.reduce): "functools.reduce", + id(copy.deepcopy): "copy.deepcopy", + } + ) + return rv + + +@FunctionIdSet +def _numpy_function_ids() -> Dict[int, str]: + rv = dict() + for mod in NP_SUPPORTED_MODULES: + rv.update( + { + id(v): f"{mod.__name__}.{k}" + for k, v in mod.__dict__.items() + if callable(v) + and (getattr(v, "__module__", None) or mod.__name__) == mod.__name__ + } + ) + return rv + + +@FunctionIdSet +def _builtin_constant_ids() -> Dict[int, str]: + """ + Collects constant builtins by eliminating callable items. + """ + rv = { + id(v): f"builtins.{k}" + for k, v in builtins.__dict__.items() + if not k.startswith("_") and not callable(v) + } + return rv + + +_lazy_module_init: Dict[str, List[Callable[[], None]]] = defaultdict(list) + + +def add_module_init_func(name: str, init_func: Callable[[], None]) -> None: + """Register a module without eagerly importing it""" + # If the module is already imported, eagerly run init + assert "." 
not in name, f"Expected a root module name, but got {name}" + if name in sys.modules: + init_func() + + # Module is not yet imported, delay processing until needed + assert name not in _lazy_module_init + _lazy_module_init[name].append(init_func) + + +def _maybe_init_lazy_module(obj: object) -> None: + module = getattr(obj, "__module__", None) + if module is None: + return + + base_module = module.split(".")[0] + init_funcs = _lazy_module_init.pop(base_module, None) + if init_funcs is not None: + for fn in init_funcs: + fn() + + +def is_callable_allowed(obj) -> bool: + _maybe_init_lazy_module(obj) + return id(obj) in _allowed_callable_ids + + +def is_callable_disallowed(obj) -> bool: + _maybe_init_lazy_module(obj) + return id(obj) in _disallowed_callable_ids + + +def is_forbidden(obj) -> bool: + _maybe_init_lazy_module(obj) + return getattr(obj, "_dynamo_forbidden", False) + + +def is_builtin_callable(obj) -> bool: + return id(obj) in _builtin_function_ids + + +def is_builtin_constant(obj) -> bool: + return id(obj) in _builtin_constant_ids + + +def is_numpy(obj) -> bool: + if np is None: + return False + return isinstance(obj, (np.ndarray, np.generic)) or id(obj) in _numpy_function_ids + + +""" +A note on skip/inline rules: + +Dynamo consults this file to determine whether function should be inlined or skipped. + +A skip applies at the frame boundary, meaning dynamo either triggers a graph break +at the beginning of the frame or attempts to trace/inline the whole frame. When skipping +a frame, recursively called frames are still traced by dynamo unless also skipped. + +Skipfiles (skipped at the file level instead of function level) still apply on a +frame-by-frame boundary as dynamo traces, but apply to all functions in that file. + +@skip is a helper decorator that can be applied to your function to cause it to be +included here. + +Dynamo skip/inline rules & priorities are defined as follows: +* Inline is the default behavior and will be used unless explicitly skipped. +* Dynamo has two SKIPLIST: BUILTIN_SKIPLIST and THIRDPARTY_SKIPLIST. + * BUILTIN_SKIPLIST contains builtin python modules, such as abc, collections, etc. + * THIRDPARTY_SKIPLIST contains common third party libraries, such as numpy, pandas, etc. +* Functions in these two SKIPLISTs are always skipped, except: + * They have explicitly defined rule in `manual_torch_name_rule_map`; + * The corresponding python module has been put into MOD_INLINELIST. +* PyTorch(torch) is in the BUILTIN_SKIPLIST by default, but there are many cases + where we want inline the functions under torch namespace. + We should specify inline for the functions in `manual_torch_name_rule_map` or + put the corresponding python module into MOD_INLINELIST to make dynamo inline them. +* If you call functions under skipped modules/files, Dynamo will wrap these functions + as SkipFunctionVariable. There are a few functions(e.g, collections.OrderedDict) that + we have special handling at SkipFunctionVariable.call_function. + +Overall: *_INLINELIST has precedence over *_SKIPLIST has precedence over DEFAULT (inline) + +To figure out what the behavior is, check the following list in order: +* `manual_torch_name_rule_map` (Inline if YES) +* MOD_INLINELIST (Inline if YES) +* BUILTIN_SKIPLIST & THIRDPARTY_SKIPLIST (Skip if YES) +* Inline by default + +In general, if you want to force inline a function or module, please consider adding +the function's python module to MOD_INLINELIST first. 
+Use the `manual_torch_name_rule_map` only when there are other functions under the same module that +you don't want to inline them. +""" + + +BUILTIN_SKIPLIST = ( + abc, + collections, + contextlib, + copy, + copyreg, + dataclasses, + enum, + functools, + importlib, + inspect, + linecache, + logging, + multiprocessing, + operator, + os, + posixpath, + random, + re, + selectors, + signal, + tempfile, + threading, + tokenize, + torch, # torch/* is skipped by default unless specified in FUNC_INLINELIST or MOD_INLINELIST + traceback, + types, + typing, + unittest, + weakref, + _collections_abc, + _weakrefset, +) + +# third party libraries skiplist is defined by str, because users may not use these libraries. +# we should use lazy import & skip in the future. +THIRDPARTY_SKIPLIST = ( + "fx2trt_oss", + "hypothesis", + "networkx", + "numpy", + "omegaconf", + "onnx", + "onnxruntime", + "onnx_tf", + "pandas", + "sklearn", + "tabulate", + "tensorflow", + "tensorrt", + "torch2trt", + "tqdm", + "tree", + "tvm", + "xarray", +) + + +def _strip_init_py(s): + # TODO: Once we require py3.9 use removesuffix instead. + suffix = "__init__.py" + if s.endswith(suffix): + return s[: -len(suffix)] + else: + return s + + +def _module_dir(m: types.ModuleType): + # Protect against a module not exporting __file__ - this can happen for + # frozen modules, for example. + file = getattr(m, "__file__", None) + return file and _strip_init_py(file) + + +# These are legacy workarounds, don't add new modules to this list. +# Please use the MOD_INLINELIST instead to force inline functions under particular modules. +LEGACY_MOD_INLINELIST = { + "torch._dynamo.external_utils", + "torch._export.db.examples", + "torch._export.wrappers", + "torch._functorch.apis", + "torch._functorch.deprecated", + "torch._higher_order_ops.cond", + "torch.ao.quantization.pt2e.export_utils", + "torch.ao.quantization.pt2e.qat_utils", + "torch.ao.quantization.pt2e.representation.rewrite", + "torch.ao.quantization.pt2e.utils", + "torch.ao.quantization.quantizer.xnnpack_quantizer", + "torch.optim", +} + +if torch.distributed.is_available(): + LEGACY_MOD_INLINELIST |= { + "torch.distributed._tensor.api", + "torch.distributed._tensor.device_mesh", + "torch.distributed.device_mesh", + "torch.distributed.algorithms._checkpoint.checkpoint_wrapper", + "torch.distributed.tensor.parallel._data_parallel_utils", + "torch.distributed.tensor.parallel._utils", + "torch.distributed.tensor.parallel.style", + # we have to add replicate to LEGACY_MOD_INLINELIST to ensure + # the forward_hook won't be ignored. + "torch.distributed._composable.replicate", + } + + +# Force inline functions under these modules, even they are in *_SKIPLIST. +# We are using python module name instead of file or directory object to avoid circular dependency. +# Please keep this sorted alphabetically. 
+MOD_INLINELIST = { + "torch._refs", + "torch._prims", + "torch._decomp", + "torch._dynamo._trace_wrapped_higher_order_op", + "torch._dynamo.comptime", + "torch._dynamo.polyfill", + "torch._functorch.vmap", + "torch._functorch.eager_transforms", + "torch._inductor.test_operators", + "torch.amp.autocast_mode", + "torch.ao.nn", + "torch.autograd.function", + "torch.backends.cuda", + "torch.cuda.amp.autocast_mode", + "torch.distributions", + "torch.fx._pytree", + "torch.fx.passes.shape_prop", + "torch.nn", + "torch.random", + "torch.sparse", + "torch.testing", + "torch.testing._internal.hypothesis_utils", + "torch.utils._content_store", + "torch.utils._contextlib", + "torch.utils._foreach_utils", + "torch.utils._pytree", + "torch.utils.hooks", + "torch._tensor", + "torch._higher_order_ops.strict_mode", + "torch._higher_order_ops.while_loop", +} + + +if torch.distributed.is_available(): + MOD_INLINELIST.add("torch.distributed") + MOD_INLINELIST.add("torch.distributed._functional_collectives") + MOD_INLINELIST.add("torch.distributed._composable.replicate") + + +@functools.lru_cache(None) +def get_legacy_mod_inlinelist(): + inlinelist = set() + for m in LEGACY_MOD_INLINELIST: + inlinelist.add(_module_dir(torch) + m[len("torch.") :].replace(".", "/")) + return inlinelist + + +@functools.lru_cache(None) +def get_mod_inlinelist(): + inlinelist = set() + for m in MOD_INLINELIST: + inlinelist.add(_module_dir(torch) + m[len("torch.") :].replace(".", "/")) + return inlinelist + + +# skip some standard python builtin libs +SKIP_DIRS = [ + "", + _config_module.__file__, +] +SKIP_DIRS.extend(filter(None, (_module_dir(m) for m in BUILTIN_SKIPLIST))) + +SKIP_DIRS_RE = re.compile(r"match nothing^") + +is_fbcode = importlib.import_module("torch._inductor.config").is_fbcode() +# Skip fbcode paths(including torch.package paths) containing +# one of the following strings. 
+FBCODE_SKIP_DIRS = { + "torchrec/distributed", + "torchrec/fb/distributed", + "caffe2/torch/fb/sparsenn/pooled_embeddings_modules.py", +} +FBCODE_SKIP_DIRS_RE = re.compile(f".*({'|'.join(map(re.escape, FBCODE_SKIP_DIRS))})") + + +def _recompile_re(): + global SKIP_DIRS_RE + SKIP_DIRS_RE = re.compile(f"^({'|'.join(map(re.escape, SKIP_DIRS))})") + + +def add(import_name: str): + if isinstance(import_name, types.ModuleType): + return add(import_name.__name__) + assert isinstance(import_name, str) + from importlib.util import find_spec + + module_spec = find_spec(import_name) + if not module_spec: + return + origin = module_spec.origin + if origin is None: + return + global SKIP_DIRS_RE + SKIP_DIRS.append(_strip_init_py(origin)) + _recompile_re() + + +@dataclasses.dataclass +class SkipResult: + skipped: bool + reason: Optional[str] + + +def check_file(filename, is_inlined_call=False): + """Should skip this file?""" + if filename is None: + return SkipResult(True, "filename is None") + if any(filename.startswith(d) for d in get_legacy_mod_inlinelist()): + return SkipResult( + False, + "inlined according trace_rules.LEGACY_MOD_INLINELIST", + ) + if is_inlined_call and is_torch_inline_allowed(filename): + return SkipResult( + False, + "inlined according trace_rules.MOD_INLINELIST", + ) + if is_fbcode and bool(FBCODE_SKIP_DIRS_RE.match(filename)): + return SkipResult( + True, + "skipped according trace_rules.FBCODE_SKIP_DIRS", + ) + if bool(SKIP_DIRS_RE.match(filename)): + return SkipResult(True, "skipped according trace_rules.SKIP_DIRS") + else: + return SkipResult(False, "inlined by default") + + +@dataclasses.dataclass +class FunctionInfo: + py_obj: Optional[object] + name: Optional[str] + filename: str + code: Optional[types.CodeType] + + +""" +This is the main entry point to determine whether an object (function) should be inlined or skipped. +Let's illustrate the logic with an example: + @torch.compile + def f1(x, y): + ...... + f2(x, y) + ...... + + def f2(x, y): + ...... + f3(x, y) + ...... + + def f3(x, y): + ...... + +There are mainly three call sites of check/check_verbose: +* The compile region entrance (like function f1), the correspoinding code is located at eval_frame.py. +* When tracing the recursively called functions (like function f2 and f3). + * Dynamo decides inline/skip everytime it encounters a new recursively function call, and the call site + is in InliningInstructionTranslator.check_inlineable of symbolic_convert.py. + * If f2 is skipped by Dynamo, when evaluating the frame of f3, Dynamo need the inline/skip check again + and the call site is in catch_errors_wrapper.catch_errors of convert_frame.py. +* For global variables and function arguments, Dynamo needs to decide if they are wrapped as SkipFunctionVariable in builder.py. + +`is_inlined_call` is used to indicate if the current function call is inlined (f2 is inlined call if it passes check) +or not (f3 is not inlined call if f2 is skipped). Inside of the `check_verbose` function, there are more rules +to be checked if this `is_inlined_call`. +The reason to have this flag is that if the upper level function call (e.g, f2) is skipped, +we don't want to inline the lower level function call (e.g, f3) by default. 
+""" + + +def check_verbose(obj, is_inlined_call=False): + if isinstance( + obj, (UserFunctionVariable, UserMethodVariable, NestedUserFunctionVariable) + ): + try: + py_obj = obj.get_function() + except NotImplementedError: + py_obj = None + fi = FunctionInfo(py_obj, obj.get_name(), obj.get_filename(), obj.get_code()) + elif isinstance(obj, types.CodeType): + fi = FunctionInfo(None, obj.co_name, obj.co_filename, obj) + elif isinstance(obj, (types.FunctionType, types.MethodType)): + fi = FunctionInfo( + obj, obj.__name__, getfile(obj), obj.__code__ # type: ignore[union-attr] # FIXME Add MethodType.__code__ to typeshed + ) + else: + fi = FunctionInfo(obj, None, getfile(obj), None) + + # Consulte the central trace rules defined in torch._dynamo.trace_rules. + rule = torch._dynamo.trace_rules.lookup_inner( + fi.py_obj, fi.name, fi.filename, is_inlined_call + ) + if rule in [UserFunctionVariable, FunctorchHigherOrderVariable]: + return SkipResult( + False, + "inlined according trace_rules.lookup", + ) + else: + assert rule == SkipFunctionVariable, rule + return SkipResult( + True, + "skipped according trace_rules.lookup", + ) + + +def check(obj, is_inlined_call=False): + return check_verbose(obj, is_inlined_call).skipped + + +# skip common third party libs +for _name in THIRDPARTY_SKIPLIST: + add(_name) + +_recompile_re() + + +def is_torch_inline_allowed(filename): + return any(filename.startswith(d) for d in get_mod_inlinelist()) + + +@functools.lru_cache(None) +def dynamo_dir(): + import torch._dynamo + + return _module_dir(torch._dynamo) + + +def is_torch(filename): + if filename.startswith(dynamo_dir()): + return False + return filename.startswith(_module_dir(torch)) + + +""" +Main entry point for looking up the trace rule (the Dynamo variable) for a given callable object. +""" + + +def lookup_callable(obj): + if not hashable(obj): + return None + # Custom allow/disallow in graph takes precedence over the general lookup. + if is_callable_disallowed(obj): + return SkipFunctionVariable + if is_callable_allowed(obj): + return TorchInGraphFunctionVariable + if is_builtin_callable(obj): + return BuiltinVariable + + +""" +Main entry point for looking up the trace rule (the Dynamo variable) for a given function object. +E.g, the lookup result of `torch.sin` is `TorchInGraphFunctionVariable`. +""" + + +def lookup(obj): + return lookup_inner(obj) + + +def lookup_inner(obj, name=None, filename=None, is_direct_call=True): + # Step 1: lookup obj's tracing rule in `torch_name_rule_map`. + # The rules defined in `torch_name_rule_map` mainly includes two parts: + # - Manually defined rules for any functions. + # - The list of torch in graph functions. + if not hashable(obj): + return None + if obj is not None: + if is_aten_op_or_tensor_method(obj): + return TorchInGraphFunctionVariable + rule = get_torch_obj_rule_map().get(obj, None) + if rule is not None: + return rule + + # Step 2: lookup obj's tracing rule by function name. + if is_direct_call: + if name == "patched_init": + return SkipFunctionVariable + elif name == "__torch_function__": + return UserFunctionVariable + + # Step 3: lookup obj's tracing rule by filename. 
+ if filename is None: + filename = getfile(obj) + + if check_file(filename, is_direct_call).skipped: + return SkipFunctionVariable + else: + return UserFunctionVariable diff --git a/MLPY/Lib/site-packages/torch/_dynamo/types.py b/MLPY/Lib/site-packages/torch/_dynamo/types.py new file mode 100644 index 0000000000000000000000000000000000000000..b99182b472d457a0faa0a4cc06ef263b1a52aa83 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/types.py @@ -0,0 +1,99 @@ +import dataclasses +import sys +import types +from typing import Any, Callable, Dict, List, NamedTuple, Optional, Protocol, Union + +from typing_extensions import TypeAlias + + +if sys.version_info >= (3, 11): + from torch._C._dynamo import eval_frame + + DynamoFrameType: TypeAlias = eval_frame._PyInterpreterFrame +else: + DynamoFrameType: TypeAlias = types.FrameType + +import torch + +# This class has a `check_fn` field for the guard, +# and a `code` field for the code object. +CacheEntry = torch._C._dynamo.eval_frame._CacheEntry + +ExtraState = torch._C._dynamo.eval_frame._ExtraState + +# We use a dict to store additional data per frame. +FrameState = Dict[Any, Any] + + +class GuardFail(NamedTuple): + # A string repr of the piece of failed guard code we eval-ed + reason: str + # A code object where we failed a guard + orig_code: types.CodeType + + +class GuardFn(Protocol): + closure_vars: Dict[str, object] + args: List[str] + code_parts: List[str] + verbose_code_parts: List[str] + global_scope: Dict[str, object] + guard_fail_fn: Optional[Callable[[GuardFail], None]] + cache_entry: Optional[CacheEntry] + extra_state: Optional[ExtraState] + + # maps locals of user function to bool + def __call__(self, f_locals: Dict[str, object]) -> bool: + ... + + +@dataclasses.dataclass +class GuardedCode: + code: types.CodeType + check_fn: GuardFn + + +class DynamoCallbackFn(Protocol): + def __call__( + self, + frame: DynamoFrameType, + cache_entry: Optional[CacheEntry], + frame_state: FrameState, + ) -> Optional[GuardedCode]: + ... + + +DynamoCallback = Union[DynamoCallbackFn, None, bool] + + +class DynamoGuardHook(Protocol): + def __call__( + self, + guard_fn: GuardFn, + code: types.CodeType, + f_locals: Dict[str, object], + index: int, + last: bool, + ) -> None: + ... + + +class ProfilerStartHook(Protocol): + def __call__( + self, + name: str, + # TODO(whc) how do I annotate a _RecordFunction here? + ) -> Any: + ... + + +class ProfilerEndHook(Protocol): + def __call__(self, record: Any) -> None: + ... + + +class BytecodeHook(Protocol): + def __call__( + self, code: types.CodeType, new_code: types.CodeType + ) -> Optional[types.CodeType]: + ... 
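The hook types defined above (GuardFn, DynamoCallbackFn, DynamoGuardHook, BytecodeHook, ...) are typing Protocols, so they are satisfied structurally: any callable with a matching signature type-checks without subclassing. Below is a minimal sketch of a callable that satisfies BytecodeHook; the function name and body are illustrative assumptions, and no registration mechanism is shown:

import types
from typing import Optional

from torch._dynamo.types import BytecodeHook


def log_recompile(code: types.CodeType, new_code: types.CodeType) -> Optional[types.CodeType]:
    # Observational hook: report which code object was rewritten and return
    # None rather than substituting different bytecode.
    print(f"rewrote {code.co_name} ({code.co_filename}:{code.co_firstlineno})")
    return None


# Structural typing: a type checker accepts this assignment; nothing is enforced at runtime.
hook: BytecodeHook = log_recompile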
diff --git a/MLPY/Lib/site-packages/torch/_dynamo/utils.py b/MLPY/Lib/site-packages/torch/_dynamo/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..07da9c0262e61baa16db423a829aabe6921f4785 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/utils.py @@ -0,0 +1,2563 @@ +import atexit +import collections +import contextlib +import copy +import cProfile +import dataclasses +import datetime +import dis +import enum +import functools +import gc +import inspect +import itertools +import linecache +import logging +import math +import operator +import os +import pstats +import re +import subprocess +import sys +import textwrap +import threading +import time +import types +import typing +import weakref +from contextlib import contextmanager +from functools import lru_cache, wraps +from pathlib import Path +from types import MethodWrapperType +from typing import ( + Any, + Callable, + cast, + ClassVar, + Counter, + DefaultDict, + Deque, + Dict, + Iterator, + KeysView, + List, + Optional, + Set, + Tuple, + Type, + Union, + ValuesView, +) + +from ..utils.hooks import RemovableHandle + +try: + import numpy as np +except ModuleNotFoundError: + np = None # type: ignore[assignment] + +try: + import torch._logging + import torch._numpy as tnp + from torch._guards import detect_fake_mode # noqa: F401n + from torch._logging import LazyString + from . import config + + # NOTE: Make sure `NP_SUPPORTED_MODULES` and `NP_TO_TNP_MODULE` are in sync. + if np: + NP_SUPPORTED_MODULES: Tuple[types.ModuleType, ...] = ( + np, + np.fft, + np.linalg, + np.random, + ) + + NP_TO_TNP_MODULE = { + np: tnp, + np.fft: tnp.fft, + np.linalg: tnp.linalg, + np.random: tnp.random, + } + else: + NP_SUPPORTED_MODULES = tuple() + + NP_TO_TNP_MODULE = {} + from torch._subclasses.fake_tensor import FakeTensor, is_fake, maybe_get_fake_mode +except ImportError: + pass + +import importlib + +import torch +import torch._functorch.config +import torch.fx.experimental.symbolic_shapes +from torch import fx +from torch._dispatch.python import enable_python_dispatcher +from torch._utils_internal import log_compilation_event + +from torch.nn.modules.lazy import LazyModuleMixin +from torch.utils._pytree import tree_map_only + + +counters: DefaultDict[str, Counter[str]] = collections.defaultdict(collections.Counter) +optimus_scuba_log: Dict[str, Any] = {} +troubleshooting_url = "https://pytorch.org/docs/master/compile/troubleshooting.html" +nnmodule_doc_url = "https://pytorch.org/docs/master/compile/nn-module.html" +nnmodule_doc_url_msg = f"See {nnmodule_doc_url} for more information and limitations." 
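+# Illustrative note (an assumption, not part of the original module): `counters`
+# above is a two-level tally keyed by category and then by reason, so callers
+# elsewhere can do roughly:
+#     counters["graph_break"][reason] += 1
+#     counters["frames"]["total"] += 1
+# Only the Dict[str, Counter[str]] shape is fixed by the definition above; the
+# specific keys shown here are examples.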
+log = logging.getLogger(__name__) + +# profiling compilation time by function +compilation_time_metrics: Dict[str, List[float]] = {} + +# profiling compilation time by frame phase +frame_phase_timing: Dict[str, Dict[str, float]] = {} + +timer_counter = itertools.count() + + +def tabulate(rows, headers): + try: + import tabulate + + return tabulate.tabulate(rows, headers=headers) + except ImportError: + return "\n".join( + ", ".join(map(str, row)) for row in itertools.chain([headers], rows) + ) + + +def maybe_cprofile(func): + if config.cprofile: + return cprofile_wrapper(func) + return func + + +def cprofile_wrapper(func): + @wraps(func) + def profile_wrapper(*args, **kwargs): + global timer_counter + profile_cnt = next(timer_counter) + profile_path = Path(func.__name__ + f"{profile_cnt}.profile") + prof = cProfile.Profile() + prof.enable() + start_ts = time.time() + retval = prof.runcall(func, *args, **kwargs) + profile_latency = time.time() - start_ts + prof.disable() + print( + f"### Cprofile for {func.__name__} iter {profile_cnt} took {profile_latency:.3f} seconds ###" + ) + ps = pstats.Stats(prof) + prof.dump_stats(profile_path) + svg_path = profile_path.with_suffix(".svg") + try: + gprof2dot_process = subprocess.Popen( + [ + "gprof2dot", + "-f", + "pstats", + "--node-label=total-time-percentage", + "--node-label=self-time-percentage", + "--node-label=total-time", + str(profile_path), + ], + stdout=subprocess.PIPE, + ) + subprocess.check_call( + ["dot", "-Tsvg", "-o", str(svg_path)], + stdin=gprof2dot_process.stdout, + ) + print(f"Generated SVG from profile at {str(svg_path)}") + except FileNotFoundError: + print( + "Failed to generate SVG from profile -- dumping stats instead." + "Try installing gprof2dot and dot for a better visualization" + ) + ps.sort_stats(pstats.SortKey.TIME).print_stats(20) + ps.sort_stats(pstats.SortKey.CUMULATIVE).print_stats(20) + return retval + + return profile_wrapper + + +curr_frame = 0 + + +# Note: Called for you by dynamo - you almost never ever want to invoke this yourself. +def increment_frame(): + global curr_frame + curr_frame = curr_frame + 1 + + +# Note: Called for you by dynamo - you almost never ever want to invoke this yourself. +def reset_frame_count(): + global curr_frame + frame_phase_timing.clear() + compilation_time_metrics.clear() + curr_frame = 0 + + +op_count = 0 + + +def increment_op_count(cnt): + global op_count + op_count += cnt + + +# Print a report of time spent so far +# Ex: +# TIMING: +# entire_frame_compile:8.574629999999999 +# backend_compile:5.26806 +def print_time_report(): + total = 0.0 + total_by_key = {} + for timings in frame_phase_timing.values(): + for key, timing in timings.items(): + total += timing + if key not in total_by_key: + total_by_key[key] = timing + else: + total_by_key[key] += timing + + out = "TIMING:" + for key, value in total_by_key.items(): + out = f"{out} {key}:{round(value, 5)}" + + print(out) + + +# dynamo_timed API works as a function decorator +# By wrapping a function in dynamo_timed, we can store a record in compilation_time_metrics +# where the key is the functions name. +# For example: +# +# @dynamo_timed +# def _foo(...): +# +# Would show up as an entry in our timing dict: +# OrderedDict([('bar.._foo', [0.083690, 0.23949, 3.1425e-05])]) +# This is extremely useful for granular debugging. +# +# For a higher-level mode, pass a phase_name into dynamo_timed +# phase_names record an extra record into a separate compilation timing structure, +# one keyed on frame+name rather than function. 
+# The frame is incremented outside of this function, in def increment_frame() above. + + +def dynamo_timed(original_function=None, phase_name=None): + def dynamo_timed_inner(func): + if config.cprofile: + return func + + @wraps(func) + def time_wrapper(*args, **kwargs): + key = func.__qualname__ + if key not in compilation_time_metrics: + compilation_time_metrics[key] = [] + with torch.profiler.record_function(f"{key} (dynamo_timed)"): + t0 = time.time() + r = func(*args, **kwargs) + time_spent = time.time() - t0 + compilation_time_metrics[key].append(time_spent) + if phase_name: + frame_key = str(curr_frame) + if frame_key not in frame_phase_timing: + frame_phase_timing[frame_key] = {} + if phase_name not in frame_phase_timing[frame_key]: + frame_phase_timing[frame_key][phase_name] = time_spent + else: + frame_phase_timing[frame_key][phase_name] += time_spent + return r + + return time_wrapper + + if original_function: + return dynamo_timed_inner(original_function) + return dynamo_timed_inner + + +def compile_times(repr="str", aggregate=False): + """ + Get metrics about torchdynamo frontend/backend compilation times. + + Accumulates information from functions tagged with `@dynamo_timed`. + + repr='str' returns a printable string for user interaction, and 'csv' + returns headers, rows which can be logged for output + + aggregate causes values from multiple compilations (e.g. split graphs) + to be accumulated into one value. If false, expect more than one value + per metric. + """ + + def fmt_fn(values, item_fn=lambda x: x): + if aggregate: + return item_fn(sum(values)) + return ", ".join(map(item_fn, values)) + + if repr == "str": + rows = [ + (k, fmt_fn(compilation_time_metrics[k], item_fn=lambda x: f"{x:.4f}")) + for k in compilation_time_metrics + ] + out = "TorchDynamo compilation metrics:\n" + out += tabulate(rows, headers=("Function", "Runtimes (s)")) + return out + elif repr == "csv": + values = [ + fmt_fn(v, item_fn=lambda x: f"{x:.6f}") + for v in compilation_time_metrics.values() + ] + headers = list(compilation_time_metrics.keys()) + return headers, values + + +@atexit.register +def dump_compile_times(): + log.info(compile_times(repr="str", aggregate=True)) + + +tensortype_to_dtype = { + torch.FloatTensor: (torch.float32, torch.float), + torch.DoubleTensor: (torch.float64, torch.double), + torch.HalfTensor: (torch.float16, torch.half), + torch.BFloat16Tensor: (torch.bfloat16,), + torch.ByteTensor: (torch.uint8,), + torch.CharTensor: (torch.int8,), + torch.LongTensor: (torch.int64, torch.long), + torch.IntTensor: (torch.int32, torch.int), + torch.ShortTensor: (torch.int16, torch.short), + torch.BoolTensor: (torch.bool,), +} + + +class DuplicateWarningChecker: + def __init__(self, maxsize=4096): + self.maxsize = maxsize + self.reset() + + def reset(self): + self.set = collections.OrderedDict() + + def add(self, key): + if key in self.set: + self.set.move_to_end(key, last=True) + if not config.verbose: + return False + else: + self.set[key] = None + while len(self.set) > self.maxsize: + self.set.popitem(last=False) + return True + + +graph_break_dup_warning_checker = DuplicateWarningChecker() + + +def setup_compile_debug(): + compile_debug = os.environ.get("TORCH_COMPILE_DEBUG", "0") == "1" + + if compile_debug: + torch._logging.set_logs( + dynamo=logging.DEBUG, + aot=logging.DEBUG, + inductor=logging.DEBUG, + output_code=True, # this is off by default + ) + return add_file_handler() + + return contextlib.ExitStack() + + +def reset_graph_break_dup_checker(): + 
graph_break_dup_warning_checker.reset() + + +def add_file_handler(): + log_path = os.path.join(get_debug_dir(), "torchdynamo") + os.makedirs(log_path, exist_ok=True) + + log_file_handler = logging.FileHandler(os.path.join(log_path, "debug.log")) + logger = logging.getLogger("torch._dynamo") + logger.addHandler(log_file_handler) + + exitstack = contextlib.ExitStack() + exitstack.callback(lambda: logger.removeHandler(log_file_handler)) + return exitstack + + +def setup_log_file(): + exitstack = contextlib.ExitStack() + if config.log_file_name is not None: + log_file_handler = logging.FileHandler(config.log_file_name) + for logger in torch._logging._internal.get_loggers(): + logger.addHandler(log_file_handler) + exitstack.callback(lambda: logger.removeHandler(log_file_handler)) + return exitstack + + return exitstack + + +def gen_record_file_name(exc, code): + return f"{get_debug_dir()}/error_recordings/\ +{code.co_name}_{type(exc).__name__}_{code.co_firstlineno}.rec" + + +def write_record_to_file(filename, exec_record): + try: + if os.path.exists(filename): + log.warning( + "Unable to write execution record %s; file already exists.", filename + ) + else: + os.makedirs(os.path.dirname(filename), exist_ok=True) + with open(filename, "wb") as f: + exec_record.dump(f) + except Exception: + log.exception("Unable to write execution record %s", filename) + + +def count_calls(g: fx.Graph): + c = 0 + for n in g.nodes: + if "call" in n.op: + c += 1 + return c + + +def identity(x): + return x + + +def hashable(x): + try: + hash(x) + return True + except TypeError: + return False + # cannot hash writable memoryview object + except ValueError: + return False + + +def nothing(*args, **kwargs): + pass + + +class ExactWeakKeyDictionary: + """Similar to weakref.WeakKeyDictionary, but use `is`/`id` rather than `==` to compare equality""" + + def __init__(self): + self.values = dict() + self.refs = dict() + + def __getitem__(self, key): + return self.values[id(key)] + + def get(self, key, default=None): + return self.values.get(id(key), default) + + def __contains__(self, key): + return id(key) in self.values + + def __setitem__(self, key, value): + idx = id(key) + if idx not in self.refs: + self.refs[idx] = weakref.ref(key, lambda ref: self._remove_id(idx)) + self.values[idx] = value + + def _remove_id(self, idx): + if idx in self.values: + del self.values[idx] + if idx in self.refs: + del self.refs[idx] + + def clear(self): + self.refs.clear() + self.values.clear() + + +def istype(obj, allowed_types): + """isinstance() without subclasses""" + if isinstance(allowed_types, (tuple, list, set)): + return type(obj) in allowed_types + return type(obj) is allowed_types + + +def is_typing(value): + # _Final catches most of typing classes: + # - Any + # - Callable + # - Union + # ... + # + # NB: we intentionally ignore classes that inherit from Generic, since they + # can be used as both TypingVariable as well as UserDefinedClassVariable. 
+ return isinstance(value, typing._Final) or value is typing.Generic # type: ignore[attr-defined] + + +def is_numpy_int_type(value): + if not np: + return False + + return istype( + value, + ( + np.int8, + np.int16, + np.int32, + np.int64, + np.uint8, + np.uint16, + np.uint32, + np.uint64, + ), + ) + + +def is_numpy_float_type(value): + if not np: + return False + + return istype( + value, + ( + np.float16, + np.float32, + np.float64, + ), + ) + + +def is_function_or_wrapper(value): + return ( + is_function(value) + or isinstance(value, functools._lru_cache_wrapper) + and is_function(inspect.getattr_static(value, "__wrapped__")) + or isinstance(value, (torch._ops.OpOverloadPacket, torch._ops.OpOverload)) + ) + + +def is_function(value): + return isinstance( + value, + ( + types.FunctionType, + types.BuiltinFunctionType, + types.MethodDescriptorType, + types.WrapperDescriptorType, + torch.jit.ScriptFunction, + ), + ) + + +def unwrap_if_wrapper(fn): + return unwrap_with_attr_name_if_wrapper(fn)[0] + + +def unwrap_with_attr_name_if_wrapper(fn): + # unpack @functools.lru_cache wrapped function + if isinstance(fn, functools._lru_cache_wrapper): + fn = inspect.getattr_static(fn, "__wrapped__") + attr_name = "__wrapped__" + # unpack @torch._dynamo.optimize()(fn) wrapped function + elif is_function(fn) and inspect.getattr_static(fn, "_torchdynamo_inline", False): + fn = inspect.getattr_static(fn, "_torchdynamo_inline", fn) + attr_name = "_torchdynamo_inline" + # unpack torch.jit.script_if_tracing + elif is_function(fn) and inspect.getattr_static( + fn, "__script_if_tracing_wrapper", False + ): + fn = inspect.getattr_static(fn, "__original_fn", fn) + attr_name = "__original_fn" + else: + attr_name = None + return fn, attr_name + + +def is_numpy_ndarray(value): + if not np: + return False + + return istype(value, np.ndarray) + + +def istensor(obj): + """Check of obj is a tensor""" + tensor_list = ( + torch.Tensor, + torch.nn.Parameter, + *config.traceable_tensor_subclasses, + ) + tensor_list = tensor_list + (torch._subclasses.FakeTensor,) + return istype(obj, tensor_list) + + +def is_lazy_module(mod): + return isinstance(mod, LazyModuleMixin) + + +@functools.lru_cache(4096) +def print_once(*args): + print(*args) + + +def make_cell(val=None): + """Some black magic to create a cell object that usually only exists in a closure""" + x = val + + def f(): + return x + + assert f.__closure__ is not None and len(f.__closure__) == 1 + return f.__closure__[0] + + +def proxy_args_kwargs(args, kwargs): + try: + proxy_args = tuple(arg.as_proxy() for arg in args) + proxy_kwargs = {key: arg.as_proxy() for key, arg in kwargs.items()} + return proxy_args, proxy_kwargs + except NotImplementedError as e: + from .exc import unimplemented + from .variables.base import typestr + + raise unimplemented( + f"call_function args: {typestr(*args)} {typestr(*list(kwargs.values()))}" + ) from e + + +@dataclasses.dataclass +class CompilationMetrics: + frame_key: str + co_name: str + co_filename: str + co_firstlineno: int + cache_size: int + accumulated_cache_size: int + guard_count: Optional[int] + shape_env_guard_count: Optional[int] + graph_op_count: Optional[int] + graph_node_count: Optional[int] + graph_input_count: Optional[int] + start_time: float + entire_frame_compile_time_s: Optional[float] + backend_compile_time_s: Optional[float] + inductor_compile_time_s: Optional[float] + code_gen_time_s: Optional[float] + fail_type: Optional[str] + fail_reason: Optional[str] + fail_user_frame_filename: Optional[str] + 
fail_user_frame_lineno: Optional[int] + non_compliant_ops: Set[str] + compliant_custom_ops: Set[str] + + +DEFAULT_COMPILATION_METRICS_LIMIT = 64 + + +_compilation_metrics: Deque[CompilationMetrics] = collections.deque( + maxlen=DEFAULT_COMPILATION_METRICS_LIMIT +) + + +def record_compilation_metrics(compilation_metrics: CompilationMetrics): + global _compilation_metrics + _compilation_metrics.append(compilation_metrics) + if config.log_compilation_metrics: + log_compilation_event(compilation_metrics) + + +def set_compilation_metrics_limit(new_size: int) -> None: + global _compilation_metrics + while len(_compilation_metrics) > new_size: + _compilation_metrics.popleft() + new_deque = collections.deque(_compilation_metrics, maxlen=new_size) + _compilation_metrics = new_deque + + +def clear_compilation_metrics() -> None: + global _compilation_metrics + _compilation_metrics.clear() + + +def get_compilation_metrics() -> List[CompilationMetrics]: + return list(_compilation_metrics) + + +@dataclasses.dataclass +class CleanupHook: + """Remove a global variable when hook is called""" + + scope: Dict[str, Any] + name: str + + def __call__(self, *args): + CleanupManager.count -= 1 + del self.scope[self.name] + + @staticmethod + def create(scope, name, val): + assert name not in scope + CleanupManager.count += 1 + scope[name] = val + return CleanupHook(scope, name) + + +class CleanupManager(ExactWeakKeyDictionary): + count = 0 + instance: ClassVar["CleanupManager"] + + def _remove_id(self, idx): + for hook in self.values[idx]: + hook() + super()._remove_id(idx) + + +CleanupManager.instance = CleanupManager() + + +def clone_tensor(x): + """Clone the tensor and its gradient""" + y = x.clone().requires_grad_(x.requires_grad) + if x.is_leaf and x.grad is not None: + y.grad = x.grad.clone() + return y + + +def clone_input(x, *, dtype=None): + """copy while preserving strides""" + # TODO: this is questionable + if is_fake(x): + # this func fails on fake tensors in __torch_dispatch__ + return x + + def torch_clone(x): + y = torch.clone(x) + if x.is_leaf: + y.requires_grad_(x.requires_grad) + if x.is_leaf and x.grad is not None: + y.grad = clone_input(x.grad, dtype=dtype) + if hasattr(x, "_dynamo_dynamic_indices"): + y._dynamo_dynamic_indices = x._dynamo_dynamic_indices.copy() # type: ignore[attr-defined] + return y + + with torch.no_grad(): + if x.device.type == "xla": + # Access data_ptr() for a xla tensor will cause crash + return torch_clone(x) + + needed_size = sum( + (shape - 1) * stride for shape, stride in zip(x.size(), x.stride()) + ) + if x.is_quantized: + result = torch.empty_quantized((needed_size + 32,), x) + else: + result = torch.empty( + needed_size + 32, dtype=dtype or x.dtype, device=x.device + ) + cache_line_offset = ( + (x.data_ptr() - result.data_ptr()) % 32 + ) // x.element_size() + result.as_strided_(x.size(), x.stride(), cache_line_offset) + try: + result.copy_(x.clone()) + if x.is_leaf: + result.requires_grad_(x.requires_grad) + if x.is_leaf and x.grad is not None: + result.grad = clone_input(x.grad, dtype=dtype) + except RuntimeError: + # RuntimeError: unsupported operation: more than one element of the written-to + # tensor refers to a single memory location. Please clone() the tensor before + # performing the operation. 
+ return torch_clone(x) + if hasattr(x, "_dynamo_dynamic_indices"): + result._dynamo_dynamic_indices = x._dynamo_dynamic_indices.copy() # type: ignore[attr-defined] + return result + + +def clone_inputs(example_inputs): + res: Union[Dict[Any, Any], List[Any]] + if type(example_inputs) is dict: + res = dict(example_inputs) + for key, value in res.items(): + if isinstance(value, tuple): + res[key] = clone_inputs(value) + else: + assert isinstance(value, torch.Tensor), type(value) + res[key] = clone_input(value) + return res + + res = list(example_inputs) + for i in range(len(res)): + if isinstance(res[i], torch.Tensor): + res[i] = clone_input(res[i]) + return res + + +def skip_frame_if_in_functorch_mode(val: torch.Tensor): + try: + val.data_ptr() # will throw for functorch tensors + except RuntimeError as e: + from .exc import SkipFrame + + # This will be GradTrackingTensor/BatchedTensor/etc + functorch_subclass_name = re.sub(r"\(.*", "", repr(val)) + raise SkipFrame( + f"torch.compile cannot be run in context: {functorch_subclass_name}" + ) from e + + +@contextmanager +def preserve_rng_state(): + disable_functorch = torch._C._DisableFuncTorch + disable_current_modes = torch.utils._python_dispatch._disable_current_modes + with disable_current_modes(), disable_functorch(): + rng_state = torch.clone(torch.random.get_rng_state()) + skip_frame_if_in_functorch_mode(rng_state) + if torch.cuda.is_available(): + cuda_rng_state = torch.clone(torch.cuda.get_rng_state()) + try: + yield + finally: + with torch.utils._python_dispatch._disable_current_modes(): + torch.random.set_rng_state(rng_state) + if torch.cuda.is_available(): + torch.cuda.set_rng_state(cuda_rng_state) # type: ignore[possibly-undefined] + + +def is_jit_model(model0): + return isinstance( + model0, + ( + torch.jit._trace.TopLevelTracedModule, + torch.jit._script.RecursiveScriptModule, + torch.jit.ScriptFunction, + torch.jit.ScriptModule, + ), + ) + + +def torchscript(model, example_inputs, verbose=False): + if is_jit_model(model): + # already done? + return model + + try: + return torch.jit.trace(model, example_inputs) + except Exception: + try: + return torch.jit.script(model) + except Exception: + if verbose: + log.exception("jit error") + else: + log.error("Both torch.jit.trace and torch.jit.script failed") + return None + + +def getfile(obj): + try: + return inspect.getfile(obj) + except (TypeError, OSError): + return None + + +def is_namedtuple(obj): + """Test if an object is a namedtuple or a torch.return_types.* quasi-namedtuple""" + return is_namedtuple_cls(type(obj)) + + +def is_namedtuple_cls(cls): + """Test if an object is a namedtuple or a torch.return_types.* quasi-namedtuple""" + try: + if issubclass(cls, tuple): + bases = getattr(cls, "__bases__", []) or [None] + module = getattr(cls, "__module__", None) + return module == "torch.return_types" or ( + bases[0] is tuple and hasattr(cls, "_make") and hasattr(cls, "_fields") + ) + except TypeError: + pass + return False + + +@functools.lru_cache(1) +def namedtuple_fields(cls): + """Get the fields of a namedtuple or a torch.return_types.* quasi-namedtuple""" + if cls is slice: + return ["start", "stop", "step"] + + assert issubclass(cls, tuple) + if hasattr(cls, "_fields"): + # normal namedtuples + return cls._fields + + @dataclasses.dataclass + class Marker: + index: int + + # frustrating ones e.g. 
torch.return_types.max + assert cls.__module__ == "torch.return_types" + obj = cls(map(Marker, range(cls.n_fields))) + fields: List[Optional[str]] = [None] * cls.n_fields + for name in dir(obj): + if name[0] != "_" and isinstance(getattr(obj, name), Marker): + fields[getattr(obj, name).index] = name + return fields + + +def checkpoint_params(gm): + with torch.no_grad(): + rng_state = torch.clone(torch.random.get_rng_state()) + if torch.cuda.is_available(): + cuda_rng_state = torch.clone(torch.cuda.get_rng_state()) + saved_state = [] + for param in itertools.chain(gm.parameters(), gm.buffers()): + saved_state.append((param, param._version, torch.clone(param))) + + def restore(): + with torch.no_grad(): + torch.random.set_rng_state(rng_state) + if torch.cuda.is_available(): + torch.cuda.set_rng_state(cuda_rng_state) + for param, version, original_value in saved_state: + if param._version != version: + param.copy_(original_value) + + return restore + + +def timed(model, example_inputs, times=1): + if torch.cuda.is_available(): + synchronize = torch.cuda.synchronize + else: + synchronize = nothing + + synchronize() + gc.collect() + torch.manual_seed(1337) + t0 = time.perf_counter() + for _ in range(times): + result = model(*example_inputs) + synchronize() + t1 = time.perf_counter() + return result, t1 - t0 # type: ignore[possibly-undefined] + + +def check_is_cuda(gm, example_inputs): + return all(x.is_cuda for x in itertools.chain(example_inputs, gm.parameters(True))) + + +@lru_cache(32) +def rot_n_helper(n): + assert n > 1 + vars = [f"v{i}" for i in range(n)] + rotated = reversed(vars[-1:] + vars[:-1]) + fn = eval(f"lambda {','.join(vars)}: ({','.join(rotated)})") + fn.__name__ = f"rot_{n}_helper" + return fn + + +common_constant_types = { + int, + float, + complex, + bool, + str, + bytes, + type(None), + Ellipsis.__class__, + types.CodeType, + torch.device, + torch.dtype, + torch.memory_format, + torch.layout, +} + + +def is_safe_constant(v): + if istype(v, (tuple, frozenset)): + return all(map(is_safe_constant, v)) + return isinstance(v, (enum.Enum, type)) or istype( + v, + common_constant_types | {slice}, + ) + + +def specialize_symnode(arg): + from .variables import ConstantVariable, SymNodeVariable + + # Guard and specialize + if isinstance(arg, SymNodeVariable): + return ConstantVariable.create(arg.evaluate_expr()) + + return arg + + +def guard_if_dyn(arg): + from .variables import ConstantVariable + + arg = specialize_symnode(arg) + + if isinstance(arg, ConstantVariable): + return arg.as_python_constant() + + return arg + + +def check_constant_args(args, kwargs): + return all(x.is_python_constant() for x in itertools.chain(args, kwargs.values())) + + +def check_unspec_python_args(args, kwargs): + from .variables.constant import ConstantVariable + from .variables.tensor import UnspecializedPythonVariable + + unspec_count = 0 + for x in itertools.chain(args, kwargs.values()): + if isinstance(x, UnspecializedPythonVariable): + unspec_count += 1 + elif not isinstance(x, (UnspecializedPythonVariable, ConstantVariable)): + return False + else: + pass + + return unspec_count > 0 + + +def check_numpy_ndarray_args(args, kwargs): + from .variables.tensor import NumpyNdarrayVariable + + return any( + isinstance(x, NumpyNdarrayVariable) + for x in itertools.chain(args, kwargs.values()) + ) + + +dict_keys: Type[KeysView[Any]] = type(dict().keys()) +dict_values: Type[ValuesView[Any]] = type(dict().values()) +odict_values: Type[ValuesView[Any]] = type(collections.OrderedDict().values()) 
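+
+
+# A minimal illustrative sketch (illustrative only; this helper is not part of
+# the original module): it exercises `is_safe_constant` and
+# `common_constant_types` defined above, which accept immutable constants
+# (recursing into tuples/frozensets) and reject mutable containers.
+def _is_safe_constant_examples():
+    assert is_safe_constant(3.14)  # float is in common_constant_types
+    assert is_safe_constant(("a", 1, None, Ellipsis))  # recurses into tuples
+    assert is_safe_constant(slice(0, 2))  # slice is special-cased as safe
+    assert not is_safe_constant([1, 2, 3])  # lists are mutable, hence unsafe
+
+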
+tuple_iterator: Type[Iterator[Any]] = type(iter(tuple())) +tuple_iterator_len = tuple_iterator.__length_hint__ # type: ignore[attr-defined] +object_new = object.__new__ + + +def nn_module_new(cls): + obj = object_new(cls) + torch.nn.Module.__init__(obj) + return obj + + +def product(it): + return functools.reduce(operator.mul, it, 1) + + +def tuple_iterator_getitem(it, index): + _, (obj,), start = it.__reduce__() + return obj[start + index] + + +iter_next = next + + +def to_subclass(t, cls): + return t.as_subclass(cls) + + +def dict_keys_getitem(d, n): + return next(itertools.islice(iter(d), n, n + 1)) + + +def enum_repr(value, local): + # enum class can override __str__ method. Use __class__ and name attribute + # to extract the class name and key name. + name = value.__class__.__name__ + val = value.name + scope = "L" if local else "G" + local_name = f'{scope}["{name}"].{val}' + return local_name + + +def _get_fake_tensor(vt): + fake_tensor = vt.as_proxy().node.meta.get("example_value") + if not is_fake(fake_tensor): + from .exc import unimplemented + + unimplemented("Cannot check Tensor object identity without its fake value") + return fake_tensor + + +def iter_contains(items, search, tx, check_tensor_identity=False): + from .variables import ( + BuiltinVariable, + ConstantVariable, + TensorVariable, + VariableTracker, + ) + + if search.is_python_constant(): + found_const = any( + x.is_python_constant() + and x.as_python_constant() == search.as_python_constant() + for x in items + ) + return ConstantVariable.create(found_const) + + must_check_tensor_id = False + if check_tensor_identity and isinstance(search, TensorVariable): + must_check_tensor_id = True + # Match of Tensor means match of FakeTensor + search = _get_fake_tensor(search) + + found: Optional[VariableTracker] = None + for x in items: + if must_check_tensor_id: + if isinstance(x, TensorVariable): + if search is _get_fake_tensor(x): # Object equivalence + return ConstantVariable.create(True) + else: + check = BuiltinVariable(operator.eq).call_function(tx, [x, search], {}) + if found is None: + found = check + else: + found = BuiltinVariable(operator.or_).call_function( + tx, [check, found], {} + ) + if found is None: + found = ConstantVariable.create(False) + return found + + +def key_is_id(k): + """Returns whether it indexes dictionaries using its id""" + return isinstance(k, (torch.Tensor, torch.nn.Module, MethodWrapperType)) + + +def key_to_id(value): + return [id(k) if key_is_id(k) else k for k in value.keys()] + + +def const_repr(x, *, local) -> str: + from .trace_rules import is_builtin_callable + + if isinstance(x, (list, tuple)): + elems_repr = ",".join(const_repr(s, local=local) for s in x) + if isinstance(x, list): + return f"[{elems_repr}]" + else: + assert isinstance(x, tuple) + if len(x) == 1: + return f"({elems_repr},)" + else: + return f"({elems_repr})" + elif isinstance(x, enum.Enum): + # To workaround repr(Enum) returning invalid global reference before python 3.11 + # by calling enum_repr and removing quotes to render enum in guard code. + return enum_repr(x, local=local).replace("'", "") + elif is_builtin_callable(x): + return x.__name__ + elif isinstance(x, type): + + def fullname(o): + klass = o.__class__ + module = klass.__module__ + if module == "builtins": + return klass.__qualname__ # avoid outputs like 'builtins.str' + return module + "." 
+ klass.__qualname__ + + return fullname(x) + else: + return f"{x!r}" + + +def dict_keys_repr(const_keys, *, local) -> str: + keys_str = ",".join(const_repr(s, local=local) for s in const_keys) + return "[" + keys_str + "]" + + +GLOBAL_KEY_PREFIX = "__dict_key" + + +from torch._subclasses import UnsupportedFakeTensorException # noqa: F401 + + +def wrap_fake_exception(fn): + try: + return fn() + except UnsupportedFakeTensorException as e: + from .exc import unimplemented + + msg = f"Unsupported: {e.reason} with fake tensor propagation." + log.warning(msg) + raise unimplemented(msg) from e + + +def deepcopy_to_fake_tensor(obj, fake_mode): + with torch._subclasses.fake_tensor.FakeCopyMode(fake_mode): + return wrap_fake_exception(lambda: copy.deepcopy(obj)) + + +def rmse(ref, res): + """ + Calculate root mean squared error + """ + return torch.sqrt(torch.mean(torch.square(ref - res))) + + +def same( + ref, + res, + fp64_ref=None, + cos_similarity=False, + tol=1e-4, + equal_nan=False, + exact_dtype=True, + relax_numpy_equality=False, + ignore_non_fp=False, + log_error=log.error, +): + """Check correctness to see if ref and res match""" + if fp64_ref is None: + fp64_ref = ref + if isinstance(ref, (list, tuple, torch.nn.ParameterList, torch.Size)): + assert isinstance(res, (list, tuple)), f"type mismatch {type(ref)} {type(res)}" + if len(ref) != len(res): + log_error("Length mismatch") + return False + return len(ref) == len(res) and all( + same( + ai, + bi, + fp64_refi, + cos_similarity, + tol, + equal_nan, + exact_dtype, + relax_numpy_equality, + ignore_non_fp, + log_error=log_error, + ) + for ai, bi, fp64_refi in zip(ref, res, fp64_ref) + ) + elif isinstance(ref, dict): + assert isinstance(res, dict) + assert set(ref.keys()) == set( + res.keys() + ), f"keys mismatch {set(ref.keys())} == {set(res.keys())}" + for k in sorted(ref.keys()): + if not ( + same( + ref[k], + res[k], + fp64_ref[k], + cos_similarity=cos_similarity, + tol=tol, + equal_nan=equal_nan, + exact_dtype=exact_dtype, + relax_numpy_equality=relax_numpy_equality, + ignore_non_fp=ignore_non_fp, + log_error=log_error, + ) + ): + log_error("Accuracy failed for key name %s", k) + return False + return True + elif isinstance(ref, (torch.Tensor, float)): + assert not isinstance(ref, torch._subclasses.FakeTensor) + assert not isinstance(res, torch._subclasses.FakeTensor) + + def to_tensor(t): + return t if isinstance(t, torch.Tensor) else torch.tensor(t) + + ref, res, fp64_ref = (to_tensor(val) for val in (ref, res, fp64_ref)) + + if ref.is_sparse: + assert res.is_sparse + ref = ref.to_dense() + res = res.to_dense() + assert isinstance(res, torch.Tensor), f"type mismatch {type(ref)} {type(res)}" + if exact_dtype: + if ref.dtype != res.dtype: + log_error("dtype mismatch %s, %s", ref.dtype, res.dtype) + return False + if ref.dtype == torch.bool: + if ignore_non_fp: + return True + # triton stores bool as int8, so add this for more accurate checking + r = torch.allclose( + ref.to(dtype=torch.uint8), + res.to(dtype=torch.uint8), + atol=tol, + rtol=tol, + equal_nan=equal_nan, + ) + if not r: + log_error("Accuracy failed: uint8 tensor did not match") + return r + + if cos_similarity: + ref = ref.flatten().to(torch.float32) + res = res.flatten().to(torch.float32) + if torch.allclose(ref, res, atol=tol, rtol=tol, equal_nan=True): + # early exit that handles zero/nan better + # cosine_similarity(zeros(10), zeros(10), dim=0) is 0 + return True + score = torch.nn.functional.cosine_similarity(ref, res, dim=0, eps=1e-6) + if score < 0.99: + 
log.warning("Similarity score=%s", score.cpu().detach().item()) + return score >= 0.99 + else: + if not exact_dtype: + ref = ref.to(res.dtype) + + # First try usual allclose + if torch.allclose(ref, res, atol=tol, rtol=tol, equal_nan=equal_nan): + return True + + # Check error from fp64 version + if fp64_ref.dtype == torch.float64: + ref_error = rmse(fp64_ref, ref).item() + # ref unable to produce this with stable numerics in this precision, ignore + if math.isnan(ref_error): + log.warning( + "Found nan in reference. Consider running in higher precision." + ) + + res_error = rmse(fp64_ref, res).item() + + # In the case of using AMP (Automatic Mixed Precision), certain models have + # failed the benchmark's correctness check. However, the end-to-end model's + # accuracy when comparing AMP with FP32 is within a difference of less than 0.1%. + # Thus, it's possible that the correctness check failures for these models are + # false alarms. We use multiplier of 3 instead of 2 to avoid these false alarms. + multiplier = 3.0 if res.dtype == torch.bfloat16 else 2.0 + + if ( + fp64_ref.numel() < 1000 + or (ref.ndim == 4 and ref.shape[-1] == ref.shape[-2] == 1) + # large tol means a benchmark has been specified as REQUIRE_HIGHER_TOLERANCE + or tol >= 2 * 1e-2 + ): + # In the presence of noise, noise might dominate our error + # metric for smaller tensors. + # Similary, for 1x1 kernels, there seems to be high noise with amp. + multiplier = 3.0 + + passes_test = res_error <= (multiplier * ref_error + tol / 10.0) + if not passes_test: + log_error( + "RMSE (res-fp64): %.5f, (ref-fp64): %.5f and shape=%s", + res_error, + ref_error, + res.size(), + ) + # import pdb; pdb.set_trace() + return passes_test + + if ignore_non_fp: + return True + + log_error("Accuracy failed: allclose not within tol=%s", tol) + return False + elif isinstance(ref, (str, int, type(None), bool, torch.device)): + if ignore_non_fp: + return True + r = ref == res + if not r: + log_error("Accuracy failed (%s): %s != %s", type(ref), ref, res) + return r + elif is_numpy_int_type(ref) or is_numpy_float_type(ref): + if relax_numpy_equality and not ( + is_numpy_int_type(res) or is_numpy_float_type(res) + ): + ref = ref.item() + r = (type(ref) is type(res)) and (ref == res) + if not r: + log_error("Accuracy failed (numpy): %s != %s", ref, res) + return r + elif is_numpy_ndarray(ref): + return (type(ref) is type(res)) and same( + torch.as_tensor(ref), + torch.as_tensor(res), + fp64_ref, + cos_similarity=cos_similarity, + tol=tol, + equal_nan=equal_nan, + exact_dtype=exact_dtype, + relax_numpy_equality=relax_numpy_equality, + ignore_non_fp=ignore_non_fp, + log_error=log_error, + ) + elif type(ref).__name__ in ( + "MaskedLMOutput", + "Seq2SeqLMOutput", + "CausalLMOutputWithCrossAttentions", + "LongformerMaskedLMOutput", + "Instances", + "SquashedNormal", + "Boxes", + "Normal", + "TanhTransform", + "Foo", + "Variable", + ): + assert type(ref) is type(res) + return all( + same( + getattr(ref, key), + getattr(res, key), + getattr(fp64_ref, key), + cos_similarity=cos_similarity, + tol=tol, + equal_nan=equal_nan, + exact_dtype=exact_dtype, + relax_numpy_equality=relax_numpy_equality, + ignore_non_fp=ignore_non_fp, + log_error=log_error, + ) + for key in ref.__dict__.keys() + ) + else: + raise RuntimeError(f"unsupported type: {type(ref).__name__}") + + +def format_func_info(code): + short_filename = code.co_filename.split("/")[-1] + return f"'{code.co_name}' ({short_filename}:{code.co_firstlineno})" + + +@contextlib.contextmanager +def 
disable_cache_limit(): + prior = config.cache_size_limit + config.cache_size_limit = sys.maxsize + prior_acc_limit = config.accumulated_cache_size_limit + config.accumulated_cache_size_limit = sys.maxsize + + try: + yield + finally: + config.cache_size_limit = prior + config.accumulated_cache_size_limit = prior_acc_limit + + +# map from transformed code back to original user code +orig_code_map = ExactWeakKeyDictionary() + +# keep a record of code_obj -> list of guard failure reasons for logging +guard_failures: DefaultDict[Any, List[Any]] = collections.defaultdict(list) + +# Keep a record of graph break reasons for logging +graph_break_reasons: List["torch._dynamo.output_graph.GraphCompileReason"] = list() + +# keep record of compiled code, if we are in "error if recompile" +# to track code that dynamo has compiled previously +seen_code_map = ExactWeakKeyDictionary() + + +class CompileProfiler: + """Utility for profiling how and what dynamo would compile. + + Can be used for + * diagnosing recompilation issues + * determining an appropriate compile cache limit + * (TODO)confirming which functions got compiled/skipped + """ + + def __init__(self): + self.frame_count = 0 + self.op_count = 0 + self.backend_ctx_ctor = disable_cache_limit + + def __call__(self, gm: torch.fx.GraphModule, example_inputs): + self.frame_count += 1 + for node in gm.graph.nodes: + if "call" in node.op: + self.op_count += 1 + return gm.forward + + # no-op __enter__ and __exit__ to preserve BC + def __enter__(self): + return self + + def __exit__(self, typ, val, traceback): + pass + + def get_metrics(self): + return {"guard_failures": guard_failures} + + def report(self): + metrics = self.get_metrics() + gf = metrics["guard_failures"] + + def num_recompiles(code): + return len(gf[code]) + + def recompile_reasons(code): + return "\n".join([str(x) for x in gf[code]]) + + summarized_gf = [ + [format_func_info(code), num_recompiles(code), recompile_reasons(code)] + for code in gf + ] + + def graph_break_report(): + if "graph_break" in counters: + graph_breaks = counters["graph_break"] + return tabulate( + [[msg, graph_breaks[msg]] for msg in graph_breaks], + headers=["Graph Break Reason", "Count"], + ) + + def recompilation_report(): + if len(gf): + max_recompiles = max([num_recompiles(code) for code in gf]) + recomp_table = tabulate( + summarized_gf, + headers=["Function", "Recompiles", "Recompile Reasons"], + ) + return recomp_table + textwrap.dedent( + f""" + + Set torch._dynamo.config.cache_size_limit to {max_recompiles} to avoid being cache limited. + """ + ) + + report = textwrap.dedent( + """ + Torchdynamo Profiler Report + =========================== + + Graph Breaks + ------------ + Graph breaks happen when torchdynamo encounters code it can't safely trace. + If you want to find out why breaks are happening, check below for each break reason + You may gain additional insight by passing `fullgraph=True` to torch.compile, + to stop at the first break. + + """ + ) + report += graph_break_report() or "No graph breaks detected." + report += textwrap.dedent( + """ + + Recompilation + ------------- + These subgraphs were recompiled more than once due to guard failures + Guard failures indicate some condition assumed to be static by the tracer changed, + making it unsafe to reuse the compiled program. 
+ + """ + ) + report += recompilation_report() or "No recompilation detected.\n" + return report + + +# return same dir unless user changes config between calls +@functools.lru_cache(None) +def _get_debug_dir(root_dir): + dir_name = ( + "run_" + + datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f") + # use pid to avoid conflicts among ranks + + "-pid_" + + str(os.getpid()) + ) + return os.path.join(root_dir, dir_name) + + +def get_debug_dir(): + debug_root = config.debug_dir_root + return _get_debug_dir(debug_root) + + +def extract_fake_example_value(node, required=True): + if "example_value" in node.meta and is_fake(node.meta["example_value"]): + return node.meta["example_value"] + elif required: + from torch._dynamo.exc import unimplemented + + unimplemented("`FakeTensor` example value was required but not available") + else: + return None + + +def ensure_graph_fake(e, tx): + assert maybe_get_fake_mode(e) is tx.fake_mode + return e + + +def get_fake_values_from_nodes(tx, nodes, allow_non_graph_fake): + def visit(n: torch.fx.Node): + if n.op == "call_function" and "example_value" not in n.meta: + # fake tensor validity is checked inside get_fake_value using + # ensure_graph_fake + return get_fake_value(n, tx, allow_non_graph_fake) + + out = n.meta["example_value"] + if not allow_non_graph_fake and isinstance(out, torch.Tensor): + return ensure_graph_fake(out, tx) + return out + + return torch.fx.node.map_arg(nodes, visit) + + +def get_fake_value(node, tx, allow_non_graph_fake=False): + """ + Run the computation represented by `node` using fake tensors and return the result. + + allow_non_graph_fake: whether to allow the return result to be: + 1. non-fake or 2. fake that is not created by this instance of Dynamo. + If `True`, you must be prepared to deal with such return values, ideally + by further wrapping them as this graph's fakes. + """ + from torch.utils._sympy.value_ranges import ValueRangeError + from .exc import ( + TorchRuntimeError, + unimplemented, + Unsupported, + UserError, + UserErrorType, + ) + + op = node.op + + # FX Node should always return the same fake value + if "example_value" in node.meta and is_fake(node.meta["example_value"]): + return node.meta["example_value"] + + args, kwargs = get_fake_values_from_nodes( + tx, (node.args, node.kwargs), allow_non_graph_fake + ) + + nnmodule = None + if op == "call_method" and len(args) > 0 and isinstance(args[0], torch.nn.Module): + # If the first argument is nn.Module, should copy to fake mode. + args = (deepcopy_to_fake_tensor(args[0], tx.fake_mode),) + tuple(args[1:]) + + if op == "call_module": + nnmodule = tx.output.nn_modules[node.target] + + if is_lazy_module(nnmodule) and hasattr(nnmodule, "_initialize_hook"): + # In the case of a lazy module, we want to run + # the pre-hooks which initialize it. + # Afterwards, lazy module deletes its pre-hooks + # to avoid treating it as lazy on subsequent recompile. + nnmodule._infer_parameters(nnmodule, args) + + # no matter it's lazy module or not, we should copy to fake mode. 
+ nnmodule = deepcopy_to_fake_tensor(nnmodule, tx.fake_mode) + + try: + with tx.fake_mode, enable_python_dispatcher(): + ret_val = wrap_fake_exception( + lambda: run_node(tx.output, node, args, kwargs, nnmodule) + ) + except Unsupported: + raise + except RuntimeError as e: + cause: BaseException = e + if e.__cause__ is not None: + cause = e.__cause__ + + if isinstance( + cause, torch._subclasses.fake_tensor.DataDependentOutputException + ): + unimplemented( + f"data dependent operator: {cause.func}; " + "to enable, set torch._dynamo.config.capture_scalar_outputs = True" + ) + elif isinstance( + cause, torch._subclasses.fake_tensor.DynamicOutputShapeException + ): + unimplemented( + f"dynamic shape operator: {cause.func}; " + "to enable, set torch._dynamo.config.capture_dynamic_output_shape_ops = True" + ) + elif isinstance( + cause, torch._subclasses.fake_tensor.UnsupportedOperatorException + ): + op = cause.func + import_suggestion = "" + if isinstance(op, torch._ops.OpOverload): + maybe_pystub = torch._C._dispatch_pystub( + op._schema.name, op._schema.overload_name + ) + if maybe_pystub is not None: + module, ctx = maybe_pystub + import_suggestion = ( + f"It's possible that the support was implemented in " + f"module `{module}` and you may need to `import {module}`" + f"({ctx}), otherwise " + ) + unimplemented( + f"unsupported operator: {cause.func} ({import_suggestion}see " + "https://docs.google.com/document/d/1GgvOe7C8_NVOMLOCwDaYV1mXXyHMXY7ExoewHqooxrs/edit#heading=h.64r4npvq0w0" + " for how to fix)" + ) + elif isinstance( + cause, torch.fx.experimental.symbolic_shapes.GuardOnDataDependentSymNode + ): + raise UserError( # noqa: TRY200 + UserErrorType.CONSTRAINT_VIOLATION, + "Tried to use data-dependent value in the subsequent computation. " + "This can happen when we encounter unbounded dynamic value that is unknown during tracing time. " + "You will need to explicitly give hint to the compiler. Please take a look at " + f"constrain_as_value OR constrain_as_size APIs. {cause}", + case_name="constrain_as_size_example", + ) + elif isinstance(cause, ValueRangeError): + raise UserError(UserErrorType.CONSTRAINT_VIOLATION, e.args[0]) from e + raise TorchRuntimeError(str(e)).with_traceback(e.__traceback__) from None + + if not allow_non_graph_fake: + _ = tree_map_only( + torch.Tensor, functools.partial(ensure_graph_fake, tx=tx), ret_val + ) + return ret_val + + +_current_node = threading.local() + + +def get_current_node(): + return getattr(_current_node, "value", None) + + +@contextmanager +def set_current_node(node): + old = get_current_node() + _current_node.value = node + try: + yield + finally: + _current_node.value = old + + +def run_node(tracer, node, args, kwargs, nnmodule): + """ + Runs a given node, with the given args and kwargs. + + Behavior is dictated by a node's op. + + run_node is useful for extracting real values out of nodes. + See get_real_value for more info on common usage. + + Note: The tracer arg is only used for 'get_attr' ops + Note: The nnmodule arg is only used for 'call_module' ops + + Nodes that are not call_function, call_method, call_module, or get_attr will + raise an AssertionError. 
+ """ + op = node.op + + with set_current_node(node): + + def make_error_message(e): + return f"Failed running {op} {node.target}(*{args}, **{kwargs}):\n" + str(e) + + try: + if op == "call_function": + return node.target(*args, **kwargs) + elif op == "call_method": + return getattr(args[0], node.target)(*args[1:], **kwargs) + elif op == "call_module": + assert nnmodule is not None + return nnmodule(*args, **kwargs) + elif op == "get_attr": + return tracer.get_submodule(node.target) + elif op == "placeholder": + assert "example_value" in node.meta + return node.meta["example_value"] + + except (NotImplementedError, UnsupportedFakeTensorException) as e: + # NB: mimic how wrap_fake_exception does it + from .exc import unimplemented + + raise unimplemented(make_error_message(e)) from e + except Exception as e: + raise RuntimeError(make_error_message(e)).with_traceback( + e.__traceback__ + ) from e + + raise AssertionError(op) + + +def get_real_value(node, tracer): + """ + Run the actual computation represented by `node` and return the result. + This will execute any dependent nodes in the graph as well. + """ + from .exc import TorchRuntimeError + + cache = tracer.real_value_cache + if node in cache: + return cache[node] + + op = node.op + args, kwargs = torch.fx.node.map_arg( + (node.args, node.kwargs), + lambda n: get_real_value(n, tracer), + ) + + if op == "call_module": + nn_module = tracer.output_graph.nn_modules[node.target] + if not is_lazy_module(nn_module): + nn_module = copy.deepcopy(nn_module) + else: + # In the case of a lazy module, we want to run + # the pre-hooks which initialize it + nn_module(*args, **kwargs) + else: + nn_module = None + + try: + real_value = run_node(tracer, node, args, kwargs, nn_module) + cache[node] = real_value + except RuntimeError as e: + raise TorchRuntimeError(str(e)).with_traceback(e.__traceback__) from None + return real_value + + +def assert_no_fake_params_or_buffers(gm): + from torch._subclasses.fake_tensor import FakeTensorConfig + + def stack_or_hint(t): + if FakeTensorConfig.debug: + import traceback + + return f"FAKE TENSOR CREATION TRACEBACK: \n {traceback.format_list(t._debug_trace)}" + else: + return "Enable TORCH_FAKE_TENSOR_DEBUG=1 to get creation stack traces on fake tensors." + + for name, buffer in gm.named_buffers(): + assert not isinstance( + buffer, torch._subclasses.FakeTensor + ), f"Unexpected fake buffer {name} {stack_or_hint(buffer)}" + for name, param in gm.named_parameters(): + assert not isinstance( + param, torch._subclasses.FakeTensor + ), f"Unexpected fake param {name} {stack_or_hint(param)}" + + +def fqn(obj: Any): + """ + Returns the fully qualified name of the object. 
+ """ + return f"{obj.__module__}.{obj.__qualname__}" + + +def ifdynstaticdefault(count1, count2): + if torch._dynamo.config.assume_static_by_default: + return count1 + else: + return count2 + + +def import_submodule(mod: types.ModuleType): + """ + Ensure all the files in a given submodule are imported + """ + for filename in sorted(os.listdir(os.path.dirname(cast(str, mod.__file__)))): + if filename.endswith(".py") and filename[0] != "_": + importlib.import_module(f"{mod.__name__}.{filename[:-3]}") + + +def object_has_getattribute(value: Any): + try: + if isinstance( + inspect.getattr_static(type(value), "__getattribute__"), + types.FunctionType, + ): + return True + except AttributeError: + pass + return False + + +def get_custom_getattr(value: Any): + try: + getattr_fn = inspect.getattr_static(type(value), "__getattr__") + except AttributeError: + getattr_fn = None + if getattr_fn is torch.nn.Module.__getattr__: + # ignore this case of getattr + getattr_fn = None + return getattr_fn + + +class TensorStaticReason(enum.Enum): + PARAMETER = 2 + NOT_TENSOR = 4 + NN_MODULE_PROPERTY = 5 + + +def tensor_static_reason_to_message(reason: TensorStaticReason): + if reason == TensorStaticReason.PARAMETER: + return "mark_dynamic on parameter, parameters are always static today." + if reason == TensorStaticReason.NOT_TENSOR: + return "mark_dynamic on a non tensor, how did this happen?" + if reason == TensorStaticReason.NN_MODULE_PROPERTY: + return "tensor is static because it is nn module associated." + raise AssertionError(f"Illegal reason {reason}") + + +def tensor_always_has_static_shape( + tensor: Union[torch.Tensor, Any], + is_tensor: bool, + guard_source: "torch._guards.GuardSource", +) -> Tuple[bool, Optional[TensorStaticReason]]: + """ + Given a tensor, source, and is_tensor flag, determine if a shape should be static. + + Args: + tensor - the real tensor to evaluate, parameters force a static shape. + is_tensor - internal dynamo check, essentially "is_tensor": target_cls is TensorVariable, + tensors not in a TensorVariable for whatever reason are forced static. + + Returns a tuple, where the first element is the bool of whether or not this tensor should have a static shape. + The second element is a TensorStaticReason, useful for passing to tensor_static_reason_to_message if needed. 
+ """ + if guard_source.is_nn_module() and config.force_nn_module_property_static_shapes: + return True, TensorStaticReason.NN_MODULE_PROPERTY + if type(tensor) is torch.nn.Parameter and config.force_parameter_static_shapes: + return True, TensorStaticReason.PARAMETER + if not is_tensor: + return True, TensorStaticReason.NOT_TENSOR + return False, None + + +def lazy_format_graph_code(name, gm, maybe_id=None): + def format_name(): + if maybe_id is not None: + return f"{name} {maybe_id}" + else: + return name + + return LazyString( + lambda: _format_graph_code( + f"===== {format_name()} =====\n", + gm.forward.__code__.co_filename, + gm.print_readable(print_output=False), + ) + ) + + +def _format_graph_code(name, filename, graph_str): + return f"TRACED GRAPH\n {name} {filename} {graph_str}\n" + + +def lazy_format_graph_tabular(fn_name, gm): + def inner(): + try: + from tabulate import tabulate # TODO: Check that this is installed + except ImportError: + return ( + "Tabulate module missing, please install tabulate to log the graph in tabular format, logging code instead:\n" + + str(lazy_format_graph_code(fn_name, gm)) + ) + + node_specs = [ + [n.op, n.name, n.target, n.args, n.kwargs] for n in gm.graph.nodes + ] + graph_str = tabulate( + node_specs, headers=["opcode", "name", "target", "args", "kwargs"] + ) + return _format_graph_code(fn_name, gm.forward.__code__.co_filename, graph_str) + + return LazyString(inner) + + +def format_bytecode(prefix, name, filename, line_no, code): + return f"{prefix} {name} {filename} line {line_no} \n{dis.Bytecode(code).dis()}\n" + + +forward_hook_names = ["_forward_pre_hooks", "_forward_hooks"] +backward_hook_names = ["_backward_pre_hooks", "_backward_hooks"] +state_dict_hook_names = [ + "_state_dict_pre_hooks", + "_state_dict_hooks", + "_load_state_dict_pre_hooks", + "_load_state_dict_post_hooks", +] +all_hook_names = forward_hook_names + backward_hook_names + state_dict_hook_names + + +def nn_module_get_all_hooks( + mod, + check_forward_hooks=False, + check_backward_hooks=False, + check_state_dict_hooks=False, +): + reset_code = torch._C._dynamo.eval_frame.reset_code + """ + Sometimes its useful to differentiate between types of hooks such as forward/backward/pre + hooks executed during module.__call__, and state_dict hooks which are executed separately. + """ + hook_dicts_to_check = [] + check_all_hooks = ( + not check_forward_hooks + and not check_backward_hooks + and not check_state_dict_hooks + ) + if check_forward_hooks or check_all_hooks: + hook_dicts_to_check.extend(forward_hook_names) + if check_backward_hooks or check_all_hooks: + hook_dicts_to_check.extend(backward_hook_names) + if check_state_dict_hooks: + hook_dicts_to_check.extend(state_dict_hook_names) + + all_hooks = [] + for hook_dict_name in hook_dicts_to_check: + hooks = getattr(mod, hook_dict_name, []) + for hook_name in hooks: + hook = hooks[hook_name] + + all_hooks.append(hook) + return all_hooks + + +def nnmodule_has_hooks( + mod, + check_forward_hooks=False, + check_backward_hooks=False, + check_state_dict_hooks=False, +): + """ + Helper function to check if a module has any hooks attached to it. 
+    """
+    hooks = nn_module_get_all_hooks(
+        mod,
+        check_forward_hooks=check_forward_hooks,
+        check_backward_hooks=check_backward_hooks,
+        check_state_dict_hooks=check_state_dict_hooks,
+    )
+    return bool(hooks)
+
+
+def to_numpy_helper(value):
+    """Convert tensor and tnp.ndarray to numpy.ndarray."""
+    if is_fake(value):
+        return value
+    if isinstance(value, tnp.ndarray):
+        return to_numpy_helper(value.tensor)
+    elif isinstance(value, torch.Tensor):
+        return value.numpy(force=True)
+    elif isinstance(value, (tuple, list)):
+        return type(value)(to_numpy_helper(obj) for obj in value)
+    else:
+        return value
+
+
+def numpy_to_tensor(value):
+    """Convert tnp.ndarray to tensor, leave other types intact. If a list/tuple, loop through it to convert."""
+    assert np is not None
+    if isinstance(value, np.ndarray):
+        return torch.as_tensor(value)
+    if isinstance(value, tnp.ndarray):
+        return value.tensor
+    elif isinstance(value, (tuple, list)):
+        return type(value)(numpy_to_tensor(obj) for obj in value)
+    else:
+        return value
+
+
+class numpy_to_tensor_wrapper:
+    def __init__(self, f):
+        self.f = f
+        self.__name__ = "wrapped_" + self.f.__name__
+
+    def __repr__(self):
+        return f"<Wrapped function <original {self.f.__name__}>>"
+
+    def __call__(self, *args, **kwargs):
+        out = self.f(*args, **kwargs)
+        return numpy_to_tensor(out)
+
+
+def numpy_attr_wrapper(obj, name):
+    if isinstance(obj, tnp.ndarray):
+        out = getattr(obj, name)
+        return numpy_to_tensor(out)
+    elif isinstance(obj, torch.Tensor):
+        out = getattr(tnp.ndarray(obj), name)
+        return numpy_to_tensor(out)
+
+
+class numpy_method_wrapper:
+    """Convert obj from torch.Tensor to tnp.ndarray and call method. Then convert result back to torch.Tensor."""
+
+    def __init__(self, method: str):
+        self.method = method
+        self.__name__ = "wrapped_" + self.method
+
+    def __repr__(self):
+        return f"<Wrapped method <original {self.method}>>"
+
+    def __call__(self, *args, **kwargs):
+        obj = args[0]
+        if isinstance(obj, torch.Tensor):
+            obj = tnp.ndarray(obj)
+        method_callable = getattr(obj, self.method)
+        out = method_callable(*args[1:], **kwargs)
+        return numpy_to_tensor(out)
+
+
+class numpy_operator_wrapper:
+    """Implements dunder methods for tnp.ndarray via functions from the operator library"""
+
+    def __init__(self, op: Callable[..., Any]):
+        self.op = op
+        self.__name__ = f"wrapped_{op.__name__}"
+
+    def __repr__(self):
+        return f"<Wrapped operator <original {self.op.__name__}>>"
+
+    def __call__(self, *args, **kwargs):
+        assert not kwargs
+
+        args = (
+            tnp.ndarray(arg) if isinstance(arg, torch.Tensor) else arg for arg in args
+        )
+        out = self.op(*args)
+        return numpy_to_tensor(out)
+
+
+def defake(x):
+    if not isinstance(x, FakeTensor):
+        return x
+    size: "torch._prims_common.ShapeType"
+    stride: "torch._prims_common.StrideType"
+    if x._has_symbolic_sizes_strides:
+        size = []
+        for s in x.size():
+            if isinstance(s, torch.SymInt):
+                size.append(s.node.shape_env.size_hint(s.node.expr))
+            else:
+                size.append(s)
+        stride = []
+        for s in x.stride():
+            if isinstance(s, torch.SymInt):
+                stride.append(s.node.shape_env.size_hint(s.node.expr))
+            else:
+                stride.append(s)
+    else:
+        size = x.size()
+        stride = x.stride()
+    y = torch.empty_strided(
+        size,
+        stride,
+        dtype=x.dtype,
+        device=x.device,
+        requires_grad=x.requires_grad,
+    )
+    y.zero_()
+    return y
+
+
+def is_utils_checkpoint(obj):
+    # Lazy import to avoid circular dependencies
+    import torch.utils.checkpoint
+
+    return obj is torch.utils.checkpoint.checkpoint
+
+
+def build_checkpoint_variable(**options):
+    import torch._higher_order_ops.wrap as higher_order_ops
+    from .variables.higher_order_ops import TorchHigherOrderOperatorVariable
+ + # TODO - This is a temporary situation where we have two versions of + # checkpointing implementation. We will converge on one and remove the other. + activation_checkpoint_op: "torch._ops.HigherOrderOperator" = ( + higher_order_ops.tag_activation_checkpoint + ) + if torch._functorch.config.functionalize_rng_ops: + activation_checkpoint_op = higher_order_ops.wrap_activation_checkpoint + + return TorchHigherOrderOperatorVariable.make( + activation_checkpoint_op, + **options, + ) + + +def is_compile_supported(device_type): + from .eval_frame import is_dynamo_supported + + compile_supported = is_dynamo_supported() + if device_type == "cpu": + pass + elif device_type == "cuda" and compile_supported: + from torch.utils._triton import has_triton + + compile_supported = has_triton() + else: + compile_supported = False + return compile_supported + + +# The following 3.11 source code functions are adapted from +# https://github.com/python/cpython/blob/v3.11.4/Lib/traceback.py +# in order to output source code corresponding to bytecode in 3.11+. +# We need our own versions since we want to support multiline expressions. +def _fix_offset(str: str, offset: int) -> int: + """ + Convert byte offset `offset` of `str` into character offset. + Byte offset is used for 3.11+ instruction column data. + Takes things like unicode characters into consideration. + + Unchanged from CPython implementation. + """ + as_utf8 = str.encode("utf-8") + return len(as_utf8[:offset].decode("utf-8", errors="replace")) + + +@dataclasses.dataclass +class _Anchors: + # inclusive + left_end_lineno: int + left_end_offset: int + right_start_lineno: int + # exclusive + right_start_offset: int + + +def _extract_anchors_from_expr(segment: str) -> Optional[_Anchors]: + """ + Given source code `segment` corresponding to a bytecode + instruction, determine: + - for binary ops, the location of the binary op + - for indexing, the location of the brackets. + `segment` is expected to be a valid Python expression + """ + assert sys.version_info >= (3, 11) + + import ast + + try: + # Without brackets, `segment` is parsed as a statement. + # We expect an expression, so wrap `segment` in + # brackets to handle multi-line expressions. + tree = ast.parse("(\n" + segment + "\n)") + except SyntaxError: + return None + + if len(tree.body) != 1: + return None + + lines = segment.split("\n") + + # get character index given byte offset + def normalize(lineno, offset): + return _fix_offset(lines[lineno], offset) + + # Gets the next valid character index in `lines`, if + # the current location is not valid. Handles empty lines. + def next_valid_char(lineno, col): + while lineno < len(lines) and col >= len(lines[lineno]): + col = 0 + lineno += 1 + assert lineno < len(lines) and col < len(lines[lineno]) + return lineno, col + + # Get the next valid character index in `lines`. + def increment(lineno, col): + col += 1 + lineno, col = next_valid_char(lineno, col) + assert lineno < len(lines) and col < len(lines[lineno]) + return lineno, col + + # Get the next valid character at least on the next line + def nextline(lineno, col): + col = 0 + lineno += 1 + lineno, col = next_valid_char(lineno, col) + assert lineno < len(lines) and col < len(lines[lineno]) + return lineno, col + + statement = tree.body[0] + if isinstance(statement, ast.Expr): + expr = statement.value + if isinstance(expr, ast.BinOp): + # ast gives locations for BinOp subexpressions, e.g. 
+ # ( left_expr ) + ( right_expr ) + # left^^^^^ right^^^^^ + # -2 since end_lineno is 1-indexed and because we added an extra + # bracket to `segment` when calling ast.parse + cur_lineno = cast(int, expr.left.end_lineno) - 2 + cur_col = normalize(cur_lineno, expr.left.end_col_offset) + cur_lineno, cur_col = next_valid_char(cur_lineno, cur_col) + + # Heuristic to find the operator character. + # The original CPython implementation did not look for ), \, or #, + # leading to incorrect anchor location, e.g. + # (x) + (y) + # ~~^~~~~~~ + while (ch := lines[cur_lineno][cur_col]).isspace() or ch in ")\\#": + if ch in "\\#": + cur_lineno, cur_col = nextline(cur_lineno, cur_col) + else: + cur_lineno, cur_col = increment(cur_lineno, cur_col) + + # binary op is 1 or 2 characters long, on the same line + right_col = cur_col + 1 + if ( + right_col < len(lines[cur_lineno]) + and not (ch := lines[cur_lineno][right_col]).isspace() + and ch not in "\\#" + ): + right_col += 1 + # right_col can be invalid since it is exclusive + + return _Anchors(cur_lineno, cur_col, cur_lineno, right_col) + elif isinstance(expr, ast.Subscript): + # ast gives locations for value and slice subexpressions, e.g. + # ( value_expr ) [ slice_expr ] + # value^^^^^ slice^^^^^ + # subscript^^^^^^^^^^^^^^^^^^^^ + # find left bracket (first '[' after value) + left_lineno = cast(int, expr.value.end_lineno) - 2 + left_col = normalize(left_lineno, expr.value.end_col_offset) + left_lineno, left_col = next_valid_char(left_lineno, left_col) + while lines[left_lineno][left_col] != "[": + left_lineno, left_col = increment(left_lineno, left_col) + # find right bracket (final character of expression) + right_lineno = cast(int, expr.end_lineno) - 2 + right_col = normalize(right_lineno, expr.end_col_offset) + return _Anchors(left_lineno, left_col, right_lineno, right_col) + elif isinstance(expr, ast.Call): + # ( func_expr ) (args, kwargs) + # func^^^^^ + # call^^^^^^^^^^^^^^^^^^^^^^^^ + # find left bracket (first '(' after func) + left_lineno = cast(int, expr.func.end_lineno) - 2 + left_col = normalize(left_lineno, expr.func.end_col_offset) + left_lineno, left_col = next_valid_char(left_lineno, left_col) + while lines[left_lineno][left_col] != "(": + left_lineno, left_col = increment(left_lineno, left_col) + # find right bracket (final character of expression) + right_lineno = cast(int, expr.end_lineno) - 2 + right_col = normalize(right_lineno, expr.end_col_offset) + return _Anchors(left_lineno, left_col, right_lineno, right_col) + + return None + + +def get_instruction_source_311(code: types.CodeType, inst: dis.Instruction) -> str: + """ + Python 3.11+ only. Returns lines of source code (from code object `code`) + corresponding to `inst`'s location data, and underlines relevant code to `inst`. + + Example: CALL on `g`: + f(g( + ^^ + h(x))) + ^^^^^ + + We need our own implementation since `format_frame_summary` in + Python's `traceback` module doesn't handle multi-line expressions + (and their anchor extraction code is not completely correct). + """ + assert inst.positions is not None + if inst.positions.lineno is None: + return "" + # The rstrip + "\n" pattern is used throughout this function to handle + # linecache.getline errors. Error lines are treated as empty strings "", but we want + # to treat them as blank lines "\n". 
+ first_line = linecache.getline(code.co_filename, inst.positions.lineno).rstrip() + if inst.positions.end_lineno is None: + return first_line + if inst.positions.col_offset is None or inst.positions.end_col_offset is None: + return first_line + + # character index of the start of the instruction + start_offset = _fix_offset(first_line, inst.positions.col_offset) + # character index of the end of the instruction + # compute later since end may be a different line + end_offset = None + # expression corresponding to the instruction so we can get anchors + segment = "" + # underline markers to be printed - start with `~` marker and replace with `^` later + markers = [] + + # Compute segment and initial markers + if inst.positions.end_lineno == inst.positions.lineno: + end_offset = _fix_offset(first_line, inst.positions.end_col_offset) + segment = first_line[start_offset:end_offset] + markers.append(" " * start_offset + "~" * (end_offset - start_offset)) + else: + segment = first_line[start_offset:] + "\n" + markers.append(" " * start_offset + "~" * (len(first_line) - start_offset)) + last_line = linecache.getline( + code.co_filename, inst.positions.end_lineno + ).rstrip() + end_offset = _fix_offset(last_line, inst.positions.end_col_offset) + for lineno in range(inst.positions.lineno + 1, inst.positions.end_lineno): + line = linecache.getline(code.co_filename, lineno).rstrip() + segment += line + "\n" + # don't underline leading spaces + num_spaces = len(line) - len(line.lstrip()) + markers.append(" " * num_spaces + "~" * (len(line) - num_spaces)) + segment += last_line[:end_offset] + num_spaces = len(last_line) - len(last_line.lstrip()) + markers.append(" " * num_spaces + "~" * (end_offset - num_spaces)) + + anchors: Optional[_Anchors] = None + try: + anchors = _extract_anchors_from_expr(segment) + except AssertionError: + pass + + # replace `~` markers with `^` where necessary + if anchors is None: + markers = [marker.replace("~", "^") for marker in markers] + else: + # make markers mutable + mutable_markers: List[List[str]] = [list(marker) for marker in markers] + + # anchor positions do not take start_offset into account + if anchors.left_end_lineno == 0: + anchors.left_end_offset += start_offset + if anchors.right_start_lineno == 0: + anchors.right_start_offset += start_offset + + # Turn `~`` markers between anchors to `^` + for lineno in range(len(markers)): + for col in range(len(mutable_markers[lineno])): + if lineno < anchors.left_end_lineno: + continue + if lineno == anchors.left_end_lineno and col < anchors.left_end_offset: + continue + if ( + lineno == anchors.right_start_lineno + and col >= anchors.right_start_offset + ): + continue + if lineno > anchors.right_start_lineno: + continue + if mutable_markers[lineno][col] == "~": + mutable_markers[lineno][col] = "^" + + # make markers into strings again + markers = ["".join(marker) for marker in mutable_markers] + + result = "" + for i in range(len(markers)): + result += ( + linecache.getline(code.co_filename, inst.positions.lineno + i).rstrip() + + "\n" + ) + result += markers[i] + "\n" + return result + + +def get_static_address_type(t): + if isinstance(t, torch.Tensor): + return getattr(t, "_dynamo_static_input_type", None) + + return None + + +def is_rng_state_getter_or_setter(value): + getters = ( + # The following two functions are not identical, so don't remove anyone! 
+ torch._C.Generator.get_state, + torch.default_generator.get_state, + torch.get_rng_state, + torch.cuda.get_rng_state, + ) + setters = ( + torch._C.Generator.set_state, + torch.default_generator.set_state, + torch.set_rng_state, + torch.cuda.set_rng_state, + ) + return value in (*setters, *getters) + + +def is_tensor_base_attr_getter(value): + return ( + isinstance(value, types.MethodWrapperType) + and value.__name__ == "__get__" + and value.__self__.__objclass__ is torch._C._TensorBase # type: ignore[attr-defined] + ) + + +def is_torch_function_object(value): + return hasattr(value, "__torch_function__") + + +def has_torch_function(vt: "torch._dynamo.variables.base.VariableTracker") -> bool: + from torch._dynamo.variables import UserDefinedObjectVariable + from torch._dynamo.variables.torch_function import TensorWithTFOverrideVariable + + return isinstance(vt, TensorWithTFOverrideVariable) or ( + isinstance(vt, UserDefinedObjectVariable) + and hasattr(vt.value, "__torch_function__") + ) + + +# see note [Tensor Fakification and Symbol Caching] +def to_fake_tensor(t, fake_mode): + symbolic_context = None + source = None + if tracing_context := torch._guards.TracingContext.try_get(): + if t in tracing_context.tensor_to_context: + symbolic_context = tracing_context.tensor_to_context[t] + source = symbolic_context.tensor_source + + return fake_mode.from_tensor( + t, static_shapes=False, symbolic_context=symbolic_context, source=source + ) + + +def get_first_attr(obj, *attrs): + """ + Return the first available attribute or throw an exception if none is present. + """ + for attr in attrs: + if hasattr(obj, attr): + return getattr(obj, attr) + + raise AssertionError(f"{obj} does not has any of the attributes: {attrs}") + + +@contextlib.contextmanager +def maybe_enable_compiled_autograd(should_enable): + def compiler_fn(gm): + def inner_compiler(gm_, example_inputs_): + torch._dynamo.utils.counters["compiled_autograd"]["compiles"] += 1 + return torch._inductor.compile(gm_, example_inputs_) + + return torch.compile(gm, backend=inner_compiler, fullgraph=True, dynamic=True) + + if should_enable: + with torch._dynamo.compiled_autograd.enable(compiler_fn) as ctx: + yield ctx + else: + yield + + +def invalid_removeable_handle(): + # need a subclass so weakref works + class Invalid(dict): # type: ignore[type-arg] + pass + + return RemovableHandle(Invalid()) + + +# Returns a "proxy" (new object with the same class and dict) for (non-GraphModule) nn.Module's. +# Attribute changes to the original object/proxy will be reflected in the other. +# This is useful for cases where we want a keep-alive reference to a module without increasing +# its reference count. 
+def nn_module_proxy(mod): + if not isinstance(mod, torch.nn.Module): + return mod + if isinstance(mod, torch.fx.GraphModule): + # Dynamo-generated GM's shouldn't contain user-created GM's + return mod + proxy = mod.__class__.__new__(mod.__class__) + proxy.__dict__ = mod.__dict__ + return proxy diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/__init__.py b/MLPY/Lib/site-packages/torch/_dynamo/variables/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..89e0853c7ee0366809e82a6bf80fb7c718f748a0 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/variables/__init__.py @@ -0,0 +1,151 @@ +# mypy: ignore-errors + +from .base import VariableTracker +from .builtin import BuiltinVariable +from .constant import ConstantVariable, EnumVariable +from .ctx_manager import ( + ContextWrappingVariable, + DeterministicAlgorithmsVariable, + DisabledSavedTensorsHooksVariable, + GradIncrementNestingCtxManagerVariable, + GradInplaceRequiresGradCtxManagerVariable, + GradModeVariable, + InferenceModeVariable, + StreamContextVariable, + StreamVariable, + VmapIncrementNestingCtxManagerVariable, + WithExitFunctionVariable, +) +from .dicts import ( + ConstDictVariable, + CustomizedDictVariable, + DataClassVariable, + DefaultDictVariable, + SetVariable, +) +from .distributed import BackwardHookVariable +from .functions import ( + FunctoolsPartialVariable, + NestedUserFunctionVariable, + SkipFunctionVariable, + UserFunctionVariable, + UserMethodVariable, +) +from .higher_order_ops import ( + FunctorchHigherOrderVariable, + TorchHigherOrderOperatorVariable, +) +from .iter import ( + CountIteratorVariable, + CycleIteratorVariable, + IteratorVariable, + ItertoolsVariable, + RepeatIteratorVariable, +) +from .lazy import LazyVariableTracker +from .lists import ( + BaseListVariable, + ListIteratorVariable, + ListVariable, + NamedTupleVariable, + RangeVariable, + RestrictedListSubclassVariable, + SliceVariable, + TupleIteratorVariable, + TupleVariable, +) +from .misc import ( + AutogradFunctionContextVariable, + AutogradFunctionVariable, + ClosureVariable, + DeletedVariable, + GetAttrVariable, + InspectSignatureVariable, + LambdaVariable, + MethodWrapperVariable, + NewCellVariable, + NewGlobalVariable, + NumpyVariable, + PythonModuleVariable, + StringFormatVariable, + SuperVariable, + TypingVariable, + UnknownVariable, +) +from .nn_module import NNModuleVariable, UnspecializedNNModuleVariable +from .sdpa import SDPAParamsVariable +from .tensor import ( + FakeItemVariable, + NumpyNdarrayVariable, + SymNodeVariable, + TensorVariable, + UnspecializedPythonVariable, + UntypedStorageVariable, +) +from .torch import TorchCtxManagerClassVariable, TorchInGraphFunctionVariable +from .user_defined import ( + RemovableHandleVariable, + UserDefinedClassVariable, + UserDefinedObjectVariable, +) + +__all__ = [ + "AutogradFunctionContextVariable", + "AutogradFunctionVariable", + "BackwardHookVariable", + "BaseListVariable", + "BuiltinVariable", + "ClosureVariable", + "ConstantVariable", + "ConstDictVariable", + "ContextWrappingVariable", + "CountIteratorVariable", + "CustomizedDictVariable", + "CycleIteratorVariable", + "DataClassVariable", + "DefaultDictVariable", + "DeletedVariable", + "DeterministicAlgorithmsVariable", + "EnumVariable", + "FakeItemVariable", + "GetAttrVariable", + "GradModeVariable", + "InspectSignatureVariable", + "IteratorVariable", + "ItertoolsVariable", + "LambdaVariable", + "LazyVariableTracker", + "ListIteratorVariable", + "ListVariable", + "NamedTupleVariable", + 
"NestedUserFunctionVariable", + "NewCellVariable", + "NewGlobalVariable", + "NNModuleVariable", + "NumpyNdarrayVariable", + "NumpyVariable", + "PythonModuleVariable", + "RangeVariable", + "RemovableHandleVariable", + "RepeatIteratorVariable", + "RestrictedListSubclassVariable", + "SDPAParamsVariable", + "SkipFunctionVariable", + "SliceVariable", + "StringFormatVariable", + "SuperVariable", + "TensorVariable", + "TorchCtxManagerClassVariable", + "TorchInGraphFunctionVariable", + "TupleVariable", + "UnknownVariable", + "UnspecializedNNModuleVariable", + "UnspecializedPythonVariable", + "UntypedStorageVariable", + "UserDefinedClassVariable", + "UserDefinedObjectVariable", + "UserFunctionVariable", + "UserMethodVariable", + "VariableTracker", + "WithExitFunctionVariable", +] diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b7c33bf1053649edb407ba14dbb0668426acf35d Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/base.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/base.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e084b7472a6f0558dac4477445aa4b50522bb23e Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/base.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/builder.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/builder.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..470039655a5c1ebe9413c87e942e669b911be475 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/builder.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/builtin.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/builtin.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b5163e9d1e1a7a88851274fd482ac6d575e6264 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/builtin.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/constant.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/constant.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..525ef4a6399508a7262bc45b3b21410ad65c8a98 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/constant.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/ctx_manager.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/ctx_manager.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3075f3bbcbac44781c5c19b910e4a9f7eda28bd9 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/ctx_manager.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/dicts.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/dicts.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a0687cd3e0712fd3a2f7c7a87c5111575afbef68 Binary files /dev/null and 
b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/dicts.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/distributed.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/distributed.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6e79f54fdc8b89365afa17534868da8ab341ad8f Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/distributed.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/functions.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/functions.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1fbf35a7a02e46cdb98a3413d32c7311147c677d Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/functions.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/higher_order_ops.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/higher_order_ops.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1871d4873bd89ef3fa0a7799f86d278df189c3b5 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/higher_order_ops.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/iter.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/iter.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c3a2c8f5f6f9cfdeff64a1041eeb1ae760f1138b Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/iter.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/lazy.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/lazy.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b1e766dbd987a9de19fa8d2d0702d5c024c1237b Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/lazy.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/lists.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/lists.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df4e8f0a6a47caaf474131885557d6aec59db829 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/lists.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/misc.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/misc.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d693d5180318e148ffa6311533c0edb711e850c Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/misc.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/nn_module.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/nn_module.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..81c483f70811a7c8a4f368f0b2d8b84065d65d0a Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/nn_module.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/optimizer.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/optimizer.cpython-39.pyc new file 
mode 100644 index 0000000000000000000000000000000000000000..0ff635deec1661a7d0b2d6eb5c8f79850b090b89 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/optimizer.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/sdpa.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/sdpa.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..00e300696957ba17cc4a6309ff2cc7001aa55f4f Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/sdpa.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/tensor.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/tensor.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..92df6fe544a84ca6650bbab84704de865f81cb2b Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/tensor.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/torch.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/torch.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c6580e1cb9beb5a3de9c6b386886b7656a3cbedb Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/torch.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/torch_function.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/torch_function.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..772ae91d5f7afe8967f11e9aa77ffd70fe7ef5ca Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/torch_function.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/user_defined.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/user_defined.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0a5694184b0fb9d328aaa3886b6474aac562df8b Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_dynamo/variables/__pycache__/user_defined.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/base.py b/MLPY/Lib/site-packages/torch/_dynamo/variables/base.py new file mode 100644 index 0000000000000000000000000000000000000000..1771d6d7e64d427086be3943751180f5216f6ad9 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/variables/base.py @@ -0,0 +1,420 @@ +# mypy: ignore-errors + +import collections +from enum import Enum +from typing import Any, Callable, Dict, List + +from .. import variables +from ..current_scope_id import current_scope_id +from ..exc import unimplemented +from ..source import AttrSource, Source +from ..utils import identity, istype + + +class MutableLocalSource(Enum): + """ + If the VariableTracker.mutable_local represents a Variable that: + - already existed that Dynamo began tracking while introspection (Existing) + - is a new variable that is created during Dynamo introspection (Local) + """ + + Existing = 0 + Local = 1 + + +class ParentsTracker: + """ + This is a perf optimization to limit the number of objects we need to visit in tx.replace_all. + This must be a seperate object so that it is not cloned in apply. 
+ """ + + def __init__(self): + # logically this is a set, but we use a dict to ensure deterministic ordering + self.parents: Dict[ParentsTracker, bool] = dict() + + def add(self, parent): + self.parents[parent] = True + + def recursive_parents(self): + rv = dict(self.parents) + worklist = list(self.parents) + while worklist: + for parent in worklist.pop().parents: + if parent not in rv: + assert isinstance(parent, ParentsTracker) + rv[parent] = True + worklist.append(parent) + return rv.keys() + + +class MutableLocalBase: + """ + Base class for Variable.mutable_local + """ + + def __init__(self, typ: MutableLocalSource): + # In HigherOrderOperator tracing, we need to distinguish + # between MutableLocals inside the HigherOrderOperator and + # ones outside it. For example, it is not safe to mutate + # `a` in the following example because it was constructed + # in a different scope. + # + # def f(x): + # a = 1 + # def g(x): + # nonlocal a + # a = 2 + # return x + # return wrap(g, x) + a + # + # We use self.scope to distinguish this. + # scope == 0: The object was an existing variable + # scope == 1: The object was created while Dynamo + # was introspecting a function + # (and no HigherOrderOps were involved) + # scope >= 2: The object was created through + # Dynamo introspection of a HigherOrderOp. + # The exact number corresponds to the level + # of nested HigherOrderOps. + if typ is MutableLocalSource.Existing: + self.scope = 0 + elif typ is MutableLocalSource.Local: + self.scope = current_scope_id() + else: + unimplemented(f"Unsupported MutableLocalSource: {typ}") + + +class MutableLocal(MutableLocalBase): + """ + Marker used to indicate this (list, iter, etc) was constructed in + local scope and can be mutated safely in analysis without leaking + state. + """ + + def __init__(self): + super().__init__(MutableLocalSource.Local) + + def __hash__(self): + return id(self) + + def __eq__(self, other): + return self is other + + +def _is_top_level_scope(scope_id): + return scope_id == 1 + + +def is_side_effect_safe(m: MutableLocalBase): + scope_id = current_scope_id() + + # In the top-level scope (if no HigherOrderOperators are involved), + # we are allowed to modify variables created in this scope as well + # as existing variables. + if _is_top_level_scope(scope_id): + return True + # Otherwise, only allow local mutation of variables created in the current scope + return m.scope == scope_id + + +class VariableTrackerMeta(type): + def __call__(cls, *args, **kwargs): + """Call __post_init__""" + obj = type.__call__(cls, *args, **kwargs) + obj.__post_init__(*args, **kwargs) + return obj + + def __instancecheck__(cls, instance) -> bool: + """Make isinstance work with LazyVariableTracker""" + if type.__instancecheck__( + variables.LazyVariableTracker, instance + ) and cls not in ( + VariableTracker, + variables.LazyVariableTracker, + ): + instance = instance.realize() + return type.__instancecheck__(cls, instance) + + +class VariableTracker(metaclass=VariableTrackerMeta): + """ + Base class for tracked locals and stack values + + VariableTracker instances are immutable and should be copied in + order to change them. 
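# Minimal standalone sketch (hypothetical helper, not the real implementation)
# of the rule encoded by MutableLocalBase.scope and is_side_effect_safe above:
# mutation is allowed freely in the top-level traced scope, otherwise only on
# objects created in the scope currently being traced.
def _toy_is_side_effect_safe(obj_scope: int, current_scope: int) -> bool:
    if current_scope == 1:  # top level, no HigherOrderOperators involved
        return True
    return obj_scope == current_scope

assert _toy_is_side_effect_safe(obj_scope=0, current_scope=1)      # pre-existing object, top level
assert _toy_is_side_effect_safe(obj_scope=2, current_scope=2)      # created inside the same HOO body
assert not _toy_is_side_effect_safe(obj_scope=1, current_scope=2)  # captured from an outer scope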
+ """ + + # fields to leave unmodified in apply() + _nonvar_fields = { + "value", + "guards", + "source", + "mutable_local", + "parents_tracker", + "user_code_variable_name", + } + + def clone(self, **kwargs): + """Shallow copy with some (optional) changes""" + args = dict(self.__dict__) + args.update(kwargs) + return self.__class__(**args) + + @classmethod + def copy(cls, value): + """Deeper (but not full) copy, leaving FX and user objects alone""" + return cls.apply(identity, value) + + @classmethod + def apply( + cls, + fn: Callable[["VariableTracker"], "VariableTracker"], + value, + cache=None, + skip_fn=lambda _: False, # Whether we should skip applying to this var + ): + """ + Walk this object and call fn on all the VariableTracker + instances + """ + if cache is None: + cache = dict() + + idx = id(value) + if idx in cache: + return cache[idx][0] + + if isinstance(value, VariableTracker): + if not skip_fn(value): + + def update_object_dict(v): + changed = False + rv = v.__dict__ + for key in rv.keys(): + if key not in v._nonvar_fields: + prior = rv[key] + rv[key] = cls.apply(fn, prior, cache, skip_fn) + changed = changed or prior is not rv[key] + + return v + + value = value.unwrap() + was_realized = value.is_realized() + result = fn(update_object_dict(value)) + if not was_realized and value.is_realized(): + # running fn() resulted in value getting realized, + # which means we missed updating the contents of result + result = update_object_dict(result.unwrap()) + else: + result = fn(value) + if result is not None: + result = result.unwrap() + elif istype(value, list): + result = [cls.apply(fn, v, cache, skip_fn) for v in value] + elif istype(value, tuple): + result = tuple(cls.apply(fn, v, cache, skip_fn) for v in value) + elif istype(value, (dict, collections.OrderedDict)): + result = { + k: cls.apply(fn, v, cache, skip_fn) for k, v in list(value.items()) + } + else: + result = value + + # save `value` to keep it alive and ensure id() isn't reused + cache[idx] = (result, value) + return result + + def __repr__(self): + return f"{self.__class__.__name__}()" + + def python_type(self): + """ + Abstract method to be implemented by subclasses of VariableTracker. + + This method should return the type represented by the instance of the subclass. + The purpose is to provide a standardized way to retrieve the Python type information + of the variable being tracked. + + Returns: + type: The Python type (such as int, str, list, etc.) of the variable tracked by + the subclass. If the type cannot be determined or is not relevant, + leaving it undefined or invoking super() is always sound. + + Note: + This is an abstract method and may be overridden in subclasses. + + Example: + class SetVariable(VariableTracker): + def python_type(self): + return set + + Raises: + NotImplementedError: If the method is not implemented in a subclass. 
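# Standalone sketch (hypothetical helper, not the real VariableTracker.apply) of
# the traversal pattern apply() above uses: recurse through list/tuple/dict
# containers, transform the leaves, and cache results by id() so aliased
# sub-objects are processed exactly once.
def _map_leaves(fn, value, cache=None):
    if cache is None:
        cache = {}
    key = id(value)
    if key in cache:
        return cache[key][0]
    if isinstance(value, list):
        result = [_map_leaves(fn, v, cache) for v in value]
    elif isinstance(value, tuple):
        result = tuple(_map_leaves(fn, v, cache) for v in value)
    elif isinstance(value, dict):
        result = {k: _map_leaves(fn, v, cache) for k, v in value.items()}
    else:
        result = fn(value)
    cache[key] = (result, value)  # keep `value` alive so its id() is not reused
    return result

_shared = [1, 2]
_out = _map_leaves(lambda x: x + 1, {"a": _shared, "b": _shared})
assert _out == {"a": [2, 3], "b": [2, 3]}
assert _out["a"] is _out["b"]  # the aliased input list maps to a single output list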
+ """ + raise NotImplementedError(f"{self} has no type") + + def as_python_constant(self): + """For constants""" + raise NotImplementedError(f"{self} is not a constant") + + def guard_as_python_constant(self): + """Similar to as_python_constant(), but add ID_MATCH guards to try to force things to become constants""" + try: + return self.as_python_constant() + except NotImplementedError as e: + unimplemented(str(e)) + + def is_python_constant(self): + try: + self.as_python_constant() + return True + except NotImplementedError: + return False + + def make_guard(self, fn): + if self.source: + return self.source.make_guard(fn) + raise NotImplementedError() + + def const_getattr(self, tx, name: str) -> Any: + """getattr(self, name) returning a python constant""" + raise NotImplementedError() + + def var_getattr(self, tx, name: str) -> "VariableTracker": + """getattr(self, name) returning a new variable""" + value = self.const_getattr(tx, name) + if not variables.ConstantVariable.is_literal(value): + raise NotImplementedError() + source = None + if self.source: + source = AttrSource(self.source, name) + return variables.ConstantVariable.create(value, source=source) + + def is_proxy(self): + try: + self.as_proxy() + return True + except NotImplementedError: + return False + + def as_proxy(self): + raise NotImplementedError(str(self)) + + def maybe_fx_node(self): + try: + proxy = self.as_proxy() + import torch.fx + + if isinstance(proxy, torch.fx.Proxy): + return proxy.node + return None + except NotImplementedError: + return None + + def reconstruct(self, codegen): + raise NotImplementedError() + + def can_reconstruct(self, tx): + """If it is possible to reconstruct the Python object this + VariableTracker represents.""" + assert tx is tx.output.root_tx, "Only root tx can reconstruct" + try: + from ..codegen import PyCodegen + + cg = PyCodegen(tx) + self.reconstruct(cg) + return True + except NotImplementedError: + return False + + def unpack_var_sequence(self, tx) -> List["VariableTracker"]: + raise NotImplementedError() + + def has_unpack_var_sequence(self, tx) -> bool: + try: + self.unpack_var_sequence(tx) + return True + except NotImplementedError: + return False + + def inspect_parameter_names(self) -> List[str]: + unimplemented(f"inspect_parameter_names: {self}") + + def call_hasattr(self, tx, name: str) -> "VariableTracker": + unimplemented(f"hasattr {self.__class__.__name__} {name}") + + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + unimplemented(f"call_function {self} {args} {kwargs}") + + def call_method( + self, + tx, + name, + args: "List[VariableTracker]", + kwargs: "Dict[str, VariableTracker]", + ) -> "VariableTracker": + if name == "__len__" and self.has_unpack_var_sequence(tx): + assert not (args or kwargs) + return variables.ConstantVariable.create(len(self.unpack_var_sequence(tx))) + elif ( + name == "__getattr__" + and len(args) == 1 + and args[0].is_python_constant() + and not kwargs + ): + return self.var_getattr(tx, args[0].as_python_constant()) + raise unimplemented(f"call_method {self} {name} {args} {kwargs}") + + def rename(self, tx, name): + return self + + def realize(self) -> "VariableTracker": + """Used by LazyVariableTracker to build the real VariableTracker""" + return self + + def recursive_realize(self): + """Realize all objects under this""" + return VariableTracker.apply(lambda x: x.realize(), self) + + def unwrap(self) -> "VariableTracker": + """Used by LazyVariableTracker to 
return the real VariableTracker if it already exists""" + return self + + def is_realized(self): + """Used by LazyVariableTracker to indicate an unrealized node""" + return True + + def __init__( + self, + *, + source: Source = None, + mutable_local: MutableLocal = None, + parents_tracker: ParentsTracker = None, + ): + super().__init__() + self.source = source + self.mutable_local = mutable_local + self.parents_tracker = parents_tracker + + def __post_init__(self, *args, **kwargs): + if self.parents_tracker is None: + self.parents_tracker = ParentsTracker() + # visit children 1 level deep and ensure parent is set properly + VariableTracker.apply( + lambda node: node.parents_tracker.add(self.parents_tracker), + [v for k, v in self.__dict__.items() if k not in self._nonvar_fields], + skip_fn=lambda _: True, + ) + + +def typestr(*objs): + if len(objs) == 1: + (obj,) = objs + if isinstance(obj, VariableTracker): + return str(obj) + else: + return type(obj).__name__ + else: + return " ".join(map(typestr, objs)) diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/builder.py b/MLPY/Lib/site-packages/torch/_dynamo/variables/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..3d4c0f423413637b4916192cab40da6defd0eadc --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/variables/builder.py @@ -0,0 +1,1976 @@ +# mypy: ignore-errors + +import abc +import collections +import contextlib +import dataclasses +import enum +import functools +import inspect +import itertools +import logging +import operator +import re +import sys +import types +from typing import List, NamedTuple, Optional, Union + +from torch.utils._sympy.value_ranges import ValueRanges + +try: + import numpy as np +except ModuleNotFoundError: + np = None + +import torch + +from torch import SymInt +from torch._guards import GuardSource, TracingContext +from torch._ops import HigherOrderOperator +from torch._streambase import _EventBase, _StreamBase +from torch._subclasses.fake_tensor import FakeTensor, is_fake, maybe_get_fake_mode +from torch._subclasses.meta_utils import is_sparse_any +from torch.fx.experimental._backward_state import BackwardState +from torch.fx.experimental.symbolic_shapes import ( + _constrain_range_for_size, + DimDynamic, + RelaxedUnspecConstraint, + StatefulSymbolicContext, + SubclassSymbolicContext, + SymbolicContext, +) +from torch.fx.immutable_collections import immutable_list +from torch.utils._python_dispatch import is_traceable_wrapper_subclass +from torch.utils.weak import TensorWeakRef +from .. 
import config, mutation_guard, replay_record, trace_rules + +from ..device_interface import get_registered_device_interfaces +from ..exc import InternalTorchDynamoError, unimplemented +from ..guards import GuardBuilder, install_guard, make_dupe_guard +from ..side_effects import SideEffects +from ..source import ( + AttrSource, + ConstantSource, + ConstDictKeySource, + ConvertIntSource, + GetItemSource, + is_constant_source, + is_from_defaults, + LocalSource, + NumpyTensorSource, + RandomValueSource, + Source, + TupleIteratorGetItemSource, +) +from ..trace_rules import is_callable_allowed, is_numpy +from ..utils import ( + build_checkpoint_variable, + clone_input, + common_constant_types, + get_fake_value, + get_static_address_type, + is_function_or_wrapper, + is_namedtuple, + is_typing, + is_utils_checkpoint, + istype, + odict_values, + preserve_rng_state, + tensor_always_has_static_shape, + tuple_iterator, + tuple_iterator_getitem, + tuple_iterator_len, + unwrap_with_attr_name_if_wrapper, + wrap_fake_exception, +) + +from .base import MutableLocal, typestr, VariableTracker +from .constant import ConstantVariable, EnumVariable +from .ctx_manager import ( + AutocastModeVariable, + EventVariable, + NullContextVariable, + PreserveVersionContextVariable, + StreamContextVariable, + StreamVariable, +) +from .dicts import ( + ConstDictVariable, + DataClassVariable, + DefaultDictVariable, + HFPretrainedConfigVariable, + PythonSysModulesVariable, + SetVariable, +) +from .distributed import ( + DeviceMeshVariable, + PlacementClassVariable, + PlacementVariable, + ProcessGroupVariable, +) +from .functions import ( + CollectiveFunctionRewriteVariable, + FunctoolsPartialVariable, + TritonKernelVariable, + UserMethodVariable, +) +from .higher_order_ops import TorchHigherOrderOperatorVariable +from .iter import ItertoolsVariable +from .lazy import LazyVariableTracker +from .lists import ( + BaseListVariable, + ListVariable, + NamedTupleVariable, + RangeVariable, + RestrictedListSubclassVariable, + SizeVariable, + SliceVariable, + TupleIteratorVariable, + TupleVariable, +) +from .misc import ( + AutogradFunctionContextVariable, + AutogradFunctionVariable, + ComptimeVariable, + DebuggingVariable, + GetAttrVariable, + GetSetDescriptorVariable, + InspectSignatureVariable, + LambdaVariable, + MethodWrapperVariable, + NumpyVariable, + PythonModuleVariable, + SavedTensorBox, + TypingVariable, +) +from .nn_module import FSDPManagedNNModuleVariable, UnspecializedNNModuleVariable +from .optimizer import OptimizerVariable + +from .sdpa import SDPAParamsVariable +from .tensor import ( + NumpyNdarrayVariable, + SymNodeVariable, + TensorSubclassVariable, + TensorVariable, + UnspecializedPythonVariable, +) +from .torch import TorchCtxManagerClassVariable, TorchInGraphFunctionVariable +from .torch_function import build_torch_function_fn, TensorWithTFOverrideVariable +from .user_defined import ( + KeyedJaggedTensorVariable, + UserDefinedClassVariable, + UserDefinedObjectVariable, +) + + +log = logging.getLogger(__name__) + + +DimList = List + + +class _missing: + pass + + +@dataclasses.dataclass +class GraphArg: + source: Source + # TODO: storing a SymInt here but not a FakeTensor is a pretty strange + # thing to do. Probably should have example (which stores an int) and + # fake_example + _example: Union[TensorWeakRef, torch.SymInt] + is_unspecialized: bool + fake_tensor: Optional[torch._subclasses.fake_tensor.FakeTensor] + # UnspecializedPythonVariable often masquerades as a tensor. 
+ # We MUST NOT generate shape guard code + # that actually tries to access tensor properties on these values. + # is_tensor lets us tell if this graph arg actually is a tensor + # or not. + is_tensor: bool = True + # Sometimes, the Tensor we pass to example is freshly allocated (smh). + # Then we cannot only keep a weak reference to it. This lets you + # stash a strong reference too. + example_strong_ref: Optional[torch.Tensor] = None + + @property + def example(self): + if isinstance(self._example, TensorWeakRef): + r = self._example() + assert r is not None + return r + else: + return self._example + + def __post_init__(self): + if isinstance(self._example, torch.Tensor): + self._example = TensorWeakRef(self._example) + assert is_fake(self.fake_tensor) + + def reconstruct(self, codegen): + self.source.reconstruct(codegen) + + def erase(self): + self._example = None + self.example_strong_ref = None + + def __eq__(self, other): + return self.source.name() == other.source.name() + + +class BackwardStateGraphArg(GraphArg): + def __init__(self): + super().__init__( + source=None, + _example=BackwardState(), + is_unspecialized=False, + fake_tensor=None, + is_tensor=False, + ) + + def reconstruct(self, codegen): + assert codegen.tx.output.backward_state_var + codegen.load_import_from(BackwardState.__module__, "BackwardState") + codegen.call_function(0, True) + codegen.dup_top() + codegen.store(codegen.tx.output.backward_state_var) + + +@dataclasses.dataclass +class FrameStateSizeEntry: + scalar: Optional[int] + size: Optional[List[int]] + + +class VariableBuilder: + """Wrap a python value in a VariableTracker() instance""" + + def __init__( + self, + tx, + source: Source, + ): + assert ( + source is not None + ), "Consider SourcelessBuilder for ephemeral objects, usually objects created locally." + assert TracingContext.try_get() is not None, "Expected active TracingContext" + super().__init__() + self.tx = tx + self.source = source + self.name = source.name() + + def __call__(self, value): + if value in self.tx.output.side_effects: + side_effect_result = self.tx.output.side_effects[value] + dup_guard = make_dupe_guard(self.source, side_effect_result.source) + if dup_guard: + self.install_guards(dup_guard) + return side_effect_result + vt = self._wrap(value) + vt.source = self.source + if self._can_lift_attrs_to_inputs(vt): + vt = self.tx.output.side_effects.track_object_existing(value, vt) + return vt + + def _can_lift_attrs_to_inputs(self, vt): + if type(vt) in [ + TensorVariable, + TensorWithTFOverrideVariable, + UserDefinedObjectVariable, + NumpyNdarrayVariable, + ]: + return True + return False + + @staticmethod + @functools.lru_cache(None) + def _common_constants(): + return { + # We zero-one specialize shapes, so specialize these constants + # too + 0, + 1, + # NB: There used to be more constants here, but honestly it was + # pretty confusing. Note we specialize floats by default, and + # DON'T specialize ints by default. 
This all only matters with + # dynamic_shapes + } + + def get_source(self): + return self.source + + def install_guards(self, *guards): + source = self.get_source() + if ( + isinstance(source, ConstantSource) + or source.guard_source() == GuardSource.CONSTANT + ): + return None + install_guard(*[source.make_guard(guard) for guard in guards], skip=1) + return {} + + def set_source_and_track_mutable(self, value, var): + assert isinstance(var, VariableTracker) + var.source = self.source + return self.tx.output.side_effects.track_mutable(value, var) + + @classmethod + @functools.lru_cache(None) + def _type_dispatch(cls): + # NB: Careful not to close over self to avoid ref cycle from lru_cache + entries = [ + ( + ( + torch.Tensor, + torch.nn.Parameter, + torch._subclasses.FakeTensor, + torch._subclasses.functional_tensor.FunctionalTensor, + ), + cls.wrap_tensor, + ), + ( + (tuple, list, odict_values, collections.deque, torch.Size), + cls.wrap_listlike, + ), + (tuple_iterator, cls.wrap_tuple_iterator), + ((slice, range), cls.wrap_slice_range), + (tuple(common_constant_types), cls.wrap_literal), + ] + + if config.trace_numpy and np: + entries.append((np.ndarray, cls.wrap_numpy_ndarray)) + + result = {} + for ts, fn in entries: + for t in ts if isinstance(ts, tuple) else (ts,): + assert t not in result + result[t] = fn + + return result + + @classmethod + @functools.lru_cache(None) + def _id_dispatch(cls): + from ..comptime import comptime + + entries = [ + ( + inspect.signature, + lambda self, value: LambdaVariable( + InspectSignatureVariable.create, + source=self.source, + **self.install_guards(GuardBuilder.CLOSURE_MATCH), + ), + ), + (comptime, lambda self, value: ComptimeVariable()), + ( + dataclasses.fields, + lambda self, value: LambdaVariable( + _dataclasses_fields_lambda, + source=self.source, + **self.install_guards(GuardBuilder.FUNCTION_MATCH), + ), + ), + ] + + result = {} + for ts, fn in entries: + for t in ts if isinstance(ts, (tuple, list)) else (ts,): + assert t not in result + result[id(t)] = fn + + return result + + def _wrap(self, value): + # import here to avoid circular dependencies + from torch.utils._triton import has_triton + + if has_triton(): + from triton.runtime.autotuner import Autotuner + from triton.runtime.jit import JITFunction + else: + + class JITFunction: + pass + + class Autotuner: + pass + + # Handle exact type() match + type_dispatch = self._type_dispatch().get(type(value)) + if type_dispatch is not None: + return type_dispatch(self, value) + + # Handle exact id() match + id_dispatch = self._id_dispatch().get(id(value)) + if id_dispatch is not None: + return id_dispatch(self, value) + + # Note - There are some nested values where types mismatch! + # We want to get those out and wrap those. + value = inspect.getattr_static(value, "_torchdynamo_inline", value) + + # Everything else (NB: order matters!) + if is_traceable_wrapper_subclass(value) or istype( + value, config.traceable_tensor_subclasses + ): + return self.wrap_tensor(value) + elif is_namedtuple(value): + return self.wrap_listlike(value) + + elif value is torch.utils._pytree.SUPPORTED_NODES: + # For SUPPORTED_NODES, we guard on the dictionary version (PEP509) + # under the assumption that the values themselves don't change. 
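# Standalone sketch (hypothetical handlers, not the real wrap_* methods) of the
# exact-type dispatch used by _type_dispatch()/_wrap() above: build a dict keyed
# by concrete type once, route values by type(value), and fall back to the
# slower isinstance/special-case chain only when there is no exact match.
_TOY_DISPATCH = {
    int: lambda v: f"wrapped int {v}",
    list: lambda v: f"wrapped list of length {len(v)}",
}

def _toy_wrap(value):
    handler = _TOY_DISPATCH.get(type(value))  # exact type only, subclasses do not match
    if handler is not None:
        return handler(value)
    return f"fallback for {type(value).__name__}"

assert _toy_wrap(3) == "wrapped int 3"
assert _toy_wrap(True) == "fallback for bool"  # bool subclasses int but is not an exact match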
+ self.install_guards(GuardBuilder.DICT_VERSION) + result = { + ConstantVariable.create(k): UserDefinedObjectVariable( + v, + source=GetItemSource( + self.get_source(), ConstDictKeySource(self.get_source(), i) + ), + ) + for i, (k, v) in enumerate(value.items()) + } + return ConstDictVariable(result, type(value)) + elif value is sys.modules: + self.install_guards(GuardBuilder.FUNCTION_MATCH) + return PythonSysModulesVariable(source=self.source) + elif istype(value, (dict, collections.defaultdict, collections.OrderedDict)): + if not value and self.get_source().is_nn_module(): + # It is faster to guard on 'false' property than to guard + # on actual dict keys, but we can't do this fast guard in general because + # it omits a crucial type check that ensures the value is actually still a dict at runtime. + + # Why is this OK for (specialized) nnmodules? We set up a setattr hook + # to check for module property mutations, which does a reasonable, + # but not completely secure job ensuring a property wasn't changed. + self.install_guards(GuardBuilder.BOOL_FALSE) + else: + self.install_guards(GuardBuilder.DICT_LENGTH) + + # Optimisation for the common case strings, ints, etc + all_const = all(ConstantVariable.is_literal(k) for k in value.keys()) + if all_const: + self.install_guards(GuardBuilder.DICT_CONST_KEYS) + + # We need all the keys to be hashable. We do this within the + # _HashableTracker class in dicts.py + def build_key_value(i, k, v): + if all_const: + key = ConstantVariable.create(k) + source_key = k + else: + source_key = ConstDictKeySource(self.get_source(), i) + key = LazyVariableTracker.create(k, source_key) + + source_value = GetItemSource(self.get_source(), source_key) + value = LazyVariableTracker.create(v, source_value) + + return key, value + + result = dict( + build_key_value(i, k, v) for i, (k, v) in enumerate(value.items()) + ) + + if istype(value, collections.defaultdict): + factory_source = AttrSource(self.source, "default_factory") + result = DefaultDictVariable( + result, + type(value), + default_factory=VariableBuilder(self.tx, factory_source)( + value.default_factory + ), + source=self.source, + ) + else: + result = ConstDictVariable(result, type(value), source=self.source) + + return self.set_source_and_track_mutable(value, result) + elif isinstance(value, torch.nn.Module): + return self.wrap_module(value) + elif ConstantVariable.is_literal(value): # non-atomic literals + return self.wrap_literal(value) + elif istype(value, frozenset) and ( + ConstantVariable.is_literal(x) for x in value + ): + # For frozenset, we can guard by object ID instead of value + # equality, this allows us to handle non-literal values + self.install_guards(GuardBuilder.ID_MATCH) + return ConstantVariable.create(value=value, source=self.source) + elif isinstance(value, enum.Enum): + self.install_guards(GuardBuilder.ID_MATCH) + return EnumVariable(value=value, source=self.source) + elif DebuggingVariable.is_reorderable_logging_function(value): + # Put this above builtin_callable so that print() can be handled + # along with other builtin debugging functions + self.install_guards(GuardBuilder.BUILTIN_MATCH) + return DebuggingVariable(value, source=self.source) + elif is_utils_checkpoint(value): + return build_checkpoint_variable(source=self.source) + elif isinstance(value, functools.partial): + func_src = AttrSource(self.get_source(), "func") + func_obj = VariableBuilder(self.tx, func_src)(value.func) + + args = [] + args_source = AttrSource(self.get_source(), "args") + for i, arg in 
enumerate(value.args): + args.append( + VariableBuilder(self.tx, GetItemSource(args_source, i))(arg) + ) + + keywords = {} + keywords_source = AttrSource(self.get_source(), "keywords") + for k, v in value.keywords.items(): + if not ConstantVariable.is_literal(k): + unimplemented("functools.partial with non-literal keyword") + keywords[k] = VariableBuilder( + self.tx, GetItemSource(keywords_source, k) + )(v) + + install_guard( + self.get_source().make_guard(GuardBuilder.TYPE_MATCH), + keywords_source.make_guard(GuardBuilder.DICT_KEYS), + args_source.make_guard(GuardBuilder.SEQUENCE_LENGTH), + ) + return FunctoolsPartialVariable(func_obj, args, keywords) + elif is_typing(value): + # typing.List, typing.Mapping, etc. + self.install_guards(GuardBuilder.ID_MATCH) + return TypingVariable( + value, + source=self.source, + ) + elif np is not None and isinstance(value, np.generic): + # numpy array scalars: convert to 0D arrays + return self.wrap_numpy_ndarray(np.asarray(value)) + elif is_numpy(value): + assert np + self.install_guards( + GuardBuilder.FUNCTION_MATCH + if callable(value) + else GuardBuilder.TYPE_MATCH + ) + return NumpyVariable(value, source=self.source) + # NB: These can't be put in type_dispatch, they have to run later + elif CollectiveFunctionRewriteVariable.can_rewrite(value): + self.install_guards(GuardBuilder.FUNCTION_MATCH) + return CollectiveFunctionRewriteVariable.create( + self.tx, + value, + source=self.source, + ) + elif istype(value, torch.autograd.function.FunctionMeta): + self.install_guards(GuardBuilder.FUNCTION_MATCH) + return AutogradFunctionVariable( + value, + source=self.source, + ) + elif isinstance(value, torch.autograd.function.FunctionCtx): + saved_tensors_source = AttrSource(self.source, "saved_tensors") + install_guard( + self.source.make_guard(GuardBuilder.TYPE_MATCH), + saved_tensors_source.make_guard(GuardBuilder.SEQUENCE_LENGTH), + ) + saved_tensors = [ + VariableBuilder(self.tx, GetItemSource(saved_tensors_source, n))(v) + for n, v in enumerate(value.saved_tensors) + ] + return self.tx.output.side_effects.track_object_existing( + value, + AutogradFunctionContextVariable( + value, + source=self.source, + saved_tensors=SavedTensorBox(saved_tensors), + ), + ) + elif ( + isinstance(value, types.MethodType) + and istype( + getattr(value, "__self__", None), torch.autograd.function.FunctionMeta + ) + and getattr(value, "__name__", "") == "apply" + and value == getattr(value.__self__, "apply", None) + ): + # handle aliased autograd function `apply` calls + self.install_guards(GuardBuilder.FUNCTION_MATCH) + return GetAttrVariable( + AutogradFunctionVariable( + value.__self__, source=AttrSource(self.source, member="__self__") + ), + "apply", + ) + elif callable(value) and trace_rules.lookup_callable(value) is not None: + if is_callable_allowed(value): + self.tx.output.has_user_defined_allowed_in_graph = True + return trace_rules.lookup_callable(value).create_with_source( + value, source=self.source + ) + elif np and isinstance(value, np.number): + return self.wrap_unspecialized_primitive(value) + elif DataClassVariable.is_matching_object(value): + self.install_guards(GuardBuilder.TYPE_MATCH) + return DataClassVariable.wrap(self, value) + elif HFPretrainedConfigVariable.is_matching_object(value): + self.install_guards(GuardBuilder.TYPE_MATCH) + return HFPretrainedConfigVariable(value) + elif isinstance(value, HigherOrderOperator): + self.install_guards(GuardBuilder.TYPE_MATCH, GuardBuilder.NAME_MATCH) + return TorchHigherOrderOperatorVariable.make(value, 
source=self.source) + elif isinstance(value, torch.cuda.StreamContext): + self.install_guards(GuardBuilder.ID_MATCH) + stream_source = AttrSource(self.source, "stream") + stream_var = VariableBuilder(self.tx, stream_source)(value.stream) + return StreamContextVariable.create(self.tx, stream_var) + elif isinstance(value, _StreamBase): + self.install_guards(GuardBuilder.ID_MATCH) + return StreamVariable( + None, + value, + value.device, + source=self.source, + ) + elif isinstance(value, (torch._C._SDPAParams)): + self.install_guards(GuardBuilder.TYPE_MATCH) + return SDPAParamsVariable.create(self.tx, value, self.source) + elif isinstance(value, _EventBase): + self.install_guards(GuardBuilder.ID_MATCH) + return EventVariable( + None, + value, + source=self.source, + ) + elif ( + isinstance(value, torch._C._TensorMeta) + and value in config.traceable_tensor_subclasses + ): + return TensorSubclassVariable(value, source=self.source) + elif ( + istype(value, contextlib.nullcontext) + and inspect.getattr_static(value, "enter_result", None) is None + ): + self.install_guards(GuardBuilder.TYPE_MATCH) + return NullContextVariable(source=self.source) + elif KeyedJaggedTensorVariable.is_matching_object(value): + self.install_guards(GuardBuilder.TYPE_MATCH) + result = KeyedJaggedTensorVariable(value, source=self.source) + # TODO: this doing it manually is bad + return self.tx.output.side_effects.track_object_existing(value, result) + elif isinstance(value, torch.optim.Optimizer): + self.install_guards(GuardBuilder.TYPE_MATCH) + return OptimizerVariable(value, source=self.source) + elif ProcessGroupVariable.is_process_group(value): + self.install_guards(GuardBuilder.ID_MATCH) + return ProcessGroupVariable(value, source=self.source) + elif DeviceMeshVariable.is_device_mesh(value): + # TODO: see if we need to add custom guard instead of a simple ID_MATCH + self.install_guards(GuardBuilder.ID_MATCH) + return DeviceMeshVariable(value, source=self.source) + elif PlacementClassVariable.is_placement_type(value): + # TODO: see if we need to add custom guard instead of a simple ID_MATCH + self.install_guards(GuardBuilder.ID_MATCH) + return PlacementClassVariable(value, source=self.source) + elif PlacementVariable.is_placement(value): + # TODO: see if we need to add custom guard instead of a simple ID_MATCH + self.install_guards(GuardBuilder.ID_MATCH) + return PlacementVariable( + value, + source=self.source, + ) + elif istype(value, type) and value in itertools.__dict__.values(): + self.install_guards(GuardBuilder.FUNCTION_MATCH) + return ItertoolsVariable(value, source=self.source) + elif isinstance(value, torch.SymBool): + # Note: the idea here is to re-use the infra we've built for SymInt by simulating the + # user provided SymBool with a SymInt in dynamo. + + # Concretely, + # 1. We create a SymInt in dynamo's shape_env, whose source is constructed as ConvertIntSource(self.source). + # so that guards on the SymInts can be effectively applied on the original SymBool in user program. + # 2. We create a SymBool based on the SymInt in dynamo's ShapeEnv. Because the original user program + # depends on the value being a SymBool. This allows dynamo to interpret the user's program correctly. 
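# Conceptual, standalone sketch (hypothetical helper, no ShapeEnv involved) of
# the SymBool strategy described in the comment above: model the user's bool as
# an integer restricted to {0, 1}, do the symbolic reasoning and guarding on the
# integer, and hand the program the boolean view `i == 1`.
def _int_backed_bool(flag: bool):
    i = int(flag)        # stand-in for the SymInt allocated in the shape env
    return i, (i == 1)   # (integer to guard on, bool view seen by the user code)

assert _int_backed_bool(True) == (1, True)
assert _int_backed_bool(False) == (0, False)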
+ + value_hint = value.node.require_hint() + new_source = ConvertIntSource(self.source) + + new_symint = self.tx.output.shape_env.create_unspecified_symint_and_symbol( + int(value_hint), + new_source, + dynamic_dim=DimDynamic.DYNAMIC, + ) + + sym_node_proxy = self.tx.output.root_tracer.create_graph_input( + re.sub(r"[^a-zA-Z0-9]+", "_", self.name), + type(new_symint), + source=new_source, + ) + + sym_node_proxy.node.meta["grapharg"] = GraphArg( + new_source, + new_symint, + False, + None, + is_tensor=False, + example_strong_ref=new_symint, + ) + self.tx.output.bound_symbols.add(new_symint.node.expr) + self.tx.output.tracked_fakes.append( + TrackedFake(new_symint, new_source, None) + ) + return SymNodeVariable( + sym_node_proxy, + new_symint == 1, + ) + elif isinstance(value, (JITFunction, Autotuner)): + self.install_guards(GuardBuilder.ID_MATCH) + return TritonKernelVariable( + value, + None, # No kernel idx provided + None, # No grid provided + source=self.source, + ) + elif isinstance(value, torch.amp.autocast_mode.autocast): + self.install_guards(GuardBuilder.ID_MATCH) + return AutocastModeVariable( + target_values=[ + value.device, + value.fast_dtype, + value._enabled, + value._cache_enabled, + ], + source=self.source, + ) + elif TorchCtxManagerClassVariable.is_matching_cls(value): + self.install_guards(GuardBuilder.FUNCTION_MATCH) + return TorchCtxManagerClassVariable(value, source=self.source) + elif is_function_or_wrapper(value): + value, attr_name = unwrap_with_attr_name_if_wrapper(value) + # For these wrappers, Dynamo points to the wrapped function, + # so source needs to be updated as well. + if attr_name is not None: + self.source = AttrSource(self.source, attr_name) + return trace_rules.lookup(value).create_with_source( + value, source=self.source + ) + # Don't use istype, since some python modules are not subclasses of types.ModuleType directly. + # E.g, type(torch.ops) -> , + # type(torch.backends.cudnn) -> + elif isinstance(value, (types.ModuleType, replay_record.DummyModule)): + self.install_guards(GuardBuilder.FUNCTION_MATCH) + return PythonModuleVariable( + value, + source=self.source, + ) + elif isinstance(value, types.MethodType) and isinstance( + value.__self__, (torch.nn.Module, torch.utils._pytree.TreeSpec) + ): + # don't let MethodTypes fall through to UserDefinedObject, + # which doesn't support 'CALL_FUNCTION' + + # TODO(whc): Why do we limit this to methods on NNModules? + # I don't have a good reason for this, but it preserves the existing behavior + # for MBartForConditionalGeneration, which generates many graph breaks and OOMs otherwise. + # I suspect we probably want to relax this check and dig deeper there. + + # In order to construct a MethodVariable in Dynamo, we start with an actual method obj from python, + # but need to separately wrap its underlying `__func__` and its `self` argument. We wrap `self` here + # and then `__func__` gets wrapped inside UserMethodVariable. 
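# Standalone reminder (illustrative only, using public torch APIs) of the Python
# fact the comment above relies on: a bound method is just a function plus a
# captured `self`, so the two pieces can be wrapped separately.
import types
import torch

_mod = torch.nn.Linear(2, 2)
_bound = _mod.forward
assert isinstance(_bound, types.MethodType)
assert _bound.__self__ is _mod
assert _bound.__func__ is torch.nn.Linear.forward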
+ self_obj = VariableBuilder( + self.tx, source=AttrSource(self.source, "__self__") + )(value.__self__) + assert self_obj and isinstance( + self_obj, VariableTracker + ), "Failed to produce a valid self obj" + self.install_guards(GuardBuilder.FUNCTION_MATCH) + return UserMethodVariable( + value.__func__, + self_obj, + source=self.source, + ) + elif isinstance(value, types.GetSetDescriptorType): + self.install_guards(GuardBuilder.FUNCTION_MATCH) + return GetSetDescriptorVariable(value) + elif isinstance(value, types.MethodWrapperType): + self.install_guards(GuardBuilder.FUNCTION_MATCH) + return MethodWrapperVariable(value) + elif issubclass(type(value), type): + if value in (torch.utils.hooks.BackwardHook, torch.nn.Parameter): + # TODO(jansel): combine this case with the one above + return trace_rules.lookup(value).create_with_source( + value, source=self.source + ) + if value is torch.autograd._unsafe_preserve_version_counter: + self.install_guards(GuardBuilder.FUNCTION_MATCH) + return PreserveVersionContextVariable.constructor(self.tx) + # This is a userdefined class, so install an ID_MATCH even if its a + # global variable. + self.install_guards(GuardBuilder.ID_MATCH) + return UserDefinedClassVariable( + value, + source=self.source, + ) + elif RestrictedListSubclassVariable.is_matching_cls(type(value)): + self.install_guards(GuardBuilder.SEQUENCE_LENGTH) + return self.set_source_and_track_mutable( + value, + RestrictedListSubclassVariable( + [ + LazyVariableTracker.create( + value=value[i], source=GetItemSource(self.source, i) + ) + for i in range(len(value)) + ], + user_cls=type(value), + user_cls_source=AttrSource(self.source, "__class__"), + ), + ) + else: + self.install_guards(GuardBuilder.TYPE_MATCH) + result = UserDefinedObjectVariable(value, source=self.source) + if not SideEffects.cls_supports_mutation_side_effects(type(value)): + # don't allow STORE_ATTR mutation with custom __setattr__ + return result + return self.tx.output.side_effects.track_object_existing(value, result) + + def wrap_listlike(self, value: Union[tuple, list, odict_values, NamedTuple]): + if config.specialize_int and type(value) is torch.Size: + self.install_guards(GuardBuilder.CONSTANT_MATCH) + return ConstantVariable.create(value=value) + # One can index a tensor with a list/tuple. Therefore, we need to + # have a stricter match. 
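# Tiny standalone illustration (not part of the upstream file) of why the
# comment above asks for a stricter, length-aware guard: indexing a tensor with
# a Python list is length-sensitive, so a graph specialized for one list would
# be wrong for a list of a different length.
import torch

_t = torch.arange(6).reshape(2, 3)
assert _t[[0, 1]].shape == (2, 3)     # two rows selected
assert _t[[0, 1, 1]].shape == (3, 3)  # a longer index list changes the output shape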
+ self.install_guards(GuardBuilder.SEQUENCE_LENGTH) + + for item in value: + if item is value: + unimplemented("list elements are pointing to the list itself") + + output = [ + LazyVariableTracker.create(item, source=GetItemSource(self.get_source(), i)) + for i, item in enumerate(value) + ] + + result = BaseListVariable.cls_for_instance(value)( + output, mutable_local=MutableLocal() + ) + if istype(value, list): + return self.set_source_and_track_mutable(value, result) + return result + + def wrap_tuple_iterator(self, value: tuple_iterator): + self.install_guards(GuardBuilder.TUPLE_ITERATOR_LEN) + output = [ + VariableBuilder(self.tx, TupleIteratorGetItemSource(self.get_source(), i))( + tuple_iterator_getitem(value, i) + ) + for i in range(tuple_iterator_len(value)) + ] + result = TupleIteratorVariable( + output, mutable_local=MutableLocal(), source=self.source + ) + + return self.set_source_and_track_mutable(value, result) + + def wrap_slice_range(self, value: Union[slice, range]): + items = [ + VariableBuilder(self.tx, AttrSource(self.get_source(), k))( + getattr(value, k) + ) + for k in ("start", "stop", "step") + ] + self.install_guards(GuardBuilder.TYPE_MATCH) + if isinstance(value, slice): + return SliceVariable(items, source=self.source) + else: + return RangeVariable(items, source=self.source) + + def wrap_module(self, value: torch.nn.Module): + from ..eval_frame import OptimizedModule + + if istype(value, OptimizedModule): + self.install_guards(GuardBuilder.TYPE_MATCH) + self.source = AttrSource(self.source, "_orig_mod") + return self.wrap_module(value._orig_mod) + + if ( + isinstance(value, (torch.nn.RNN, torch.nn.GRU, torch.nn.LSTM)) + and not config.allow_rnn + ): + unimplemented("TorchDynamo purposely graph breaks on RNN, GRU, LSTMs") + if mutation_guard.is_dynamic_nn_module(value): + # created dynamically, don't specialize on it + self.install_guards(GuardBuilder.TYPE_MATCH) + result = UnspecializedNNModuleVariable(value, source=self.source) + if not SideEffects.cls_supports_mutation_side_effects(type(value)): + # don't allow STORE_ATTR mutation with custom __setattr__ + return result + return self.tx.output.side_effects.track_object_existing(value, result) + elif issubclass( + value.__class__, torch.nn.parallel.distributed.DistributedDataParallel + ): + self.install_guards(GuardBuilder.TYPE_MATCH) + return UnspecializedNNModuleVariable(value) + elif getattr(value, "_is_fsdp_managed_module", False): + # See note [Dynamo treats FSDP wrapped modules as UnspecializedNNModule] + # in fully_sharded_data_parallel.py for more information + + # we can't do this assert inside FSDP constructor, + # since we don't know yet whether dynamo will be used + assert getattr( + value, "_fsdp_use_orig_params", False + ), "Dynamo only supports FSDP with use_orig_params=True" + + # Note on FSDP guarding + # 1. We expect FSDP wrapping mutates an nn module irreversably (no way to de-wrap). + # 2. Eager FSDP already assumes (requires, but without enforcement) that users don't mutate their + # model parameters/structure after FSDP wrapping, because FSDP wouldn't notice or update its FlatParams. + # + # Due to (1), once we enter this path we expect not to go back nor have to guard on type + # or _is_fsdp_managed_module. + # + # TODO(whc) We could add a guard on the opposite case, where a user compiled/ran + # pre-FSDP-wrapped model, then wrapped, to ensure that we recompile with the FSDP handling. 
+ # + # Due to (2), we skip guards on inner contents of fsdp_managed modules, by using FSDPNNModuleSource as the + # guard source. This behavior is gated on config.skip_fsdp_guards. + # + # ID_MATCH is required to disambiguate cases as simple as a unit test that constructs 2 models and wraps + # them differently with different FSDP configs. (test_dynamo_distributed.py -k test_fsdp_aot_eager) + self.install_guards(GuardBuilder.TYPE_MATCH, GuardBuilder.ID_MATCH) + return FSDPManagedNNModuleVariable(value, source=self.get_source()) + else: + return self.tx.output.register_attr_or_module( + value, + self.name, + source=self.get_source(), + # Guards are added inside register_attr_or_module + ) + + def wrap_literal(self, value): + unspec = not config.specialize_int + if unspec and type(value) is int: + # unspecializing int by default, but still + # specialize for the following conditions + if not TracingContext.get().force_unspec_int_unbacked_size_like and ( + value in self._common_constants() + # Assume integers from global variables want to be specialized + or not self.source.guard_source().is_local() + # Assume that integers that came from NN modules want to be + # specialized (as we don't expect users to be changing the + # NN modules on the fly) + or self.source.guard_source().is_nn_module() + or is_from_defaults(self.source) + ): + self.install_guards(GuardBuilder.CONSTANT_MATCH) + return ConstantVariable.create(value=value, source=self.source) + else: + return self.wrap_unspecialized_primitive(value) + else: + self.install_guards(GuardBuilder.CONSTANT_MATCH) + return ConstantVariable.create(value=value) + + def assert_not_wrapped_by_this_graph(self, value: torch.Tensor): + if is_fake(value) and maybe_get_fake_mode(value) is self.tx.fake_mode: + raise InternalTorchDynamoError( + "Cannot wrap a Tensor that has already been", + "wrapped by this instance of Dynamo", + ) + + def wrap_tensor(self, value: torch.Tensor): + source = self.get_source() + + # We cannot already be tracking the tensor, which implies + # it would have already been wrapped + assert value not in self.tx.output.side_effects + + if ( + source.guard_source().is_nn_module() + or get_static_address_type(value) is not None + ) and not source.guard_source().is_fsdp_module(): + self.assert_not_wrapped_by_this_graph(value) + return self.tx.output.register_attr_or_module( + value, self.name, source=source + ) + + if is_constant_source(source): + self.assert_not_wrapped_by_this_graph(value) + return self.tx.output.register_attr_or_module( + value, + re.sub(r"[^a-zA-Z0-9]+", "_", self.name), + source=source, + # Guards are added inside register_attr_or_module + ) + + if type(value) in config.traceable_tensor_subclasses: + # Ordinarily, we would fakeify a tensor so that it can get dynamic + # shapes and be computed on without triggering actual operations. + # However, how can we fakeify a tensor subclass? Ordinary + # inheritance (nor multiple inheritance) won't work work. + # + # Instead, our plan is to *manually simulate* the tensor subclass + # inheriting from a fake tensor with dynamo. This means our + # data representation for a tensor subclass will be a fake tensor + # + tensor subclass type + any extra data the subclass may have + # been storing on the tensor. 
Because all Python accesses are + # mediated through TensorWithTFOverrideVariable, we can ensure + # that we dispatch differently, e.g., according to + # __torch_function__ + # + # To simplify things for now, the __dict__ tracking bits haven't + # been implemented yet, but they can be added into this design at + # a later point in time. + subclass_type = type(value) + else: + assert type(value) in ( + torch.Tensor, + torch.nn.Parameter, + torch._subclasses.fake_tensor.FakeTensor, + torch._subclasses.functional_tensor.FunctionalTensor, + ) or is_traceable_wrapper_subclass(value), type(value) + subclass_type = None + + # NB: this just says we accessed a tensor from the same source again + # (e.g., a tensor lives in a global foo, and we LOAD_GLOBAL it twice). + # This is distinct from two distinct sources mapping to the same + # Tensor (per id())! No guard is necessary here. See below for the + # other case. + is_duplicate_tensor = source in self.tx.output.input_source_to_var + if is_duplicate_tensor: + return self.tx.output.input_source_to_var[source] + + # By this point, we should have deduplicated all tensors + self.assert_not_wrapped_by_this_graph(value) + + # tx.output has multiple tracers if we're introspecting HigherOrderOperator. + # When we've discovered an untracked tensor, then we actually need + # to get Dynamo to track the tensor (which is what this function does) + # and put it as a graph input on the root tracer. Later on, + # if the input is actually used in the body of the HigherOrderOperator, + # then the relevant SubgraphTracer will lift it to being an input of + # the subgraph. + # See NOTE [HigherOrderOperator tracing design] for more details. + + tensor_proxy = self.tx.output.root_tracer.create_graph_input( + re.sub(r"[^a-zA-Z0-9]+", "_", self.name), type(value), source=source + ) + options = {} + if type(value) in config.traceable_tensor_subclasses: + options["torch_function_fn"] = build_torch_function_fn( + self.tx, value, self.source + ) + self.install_guards(GuardBuilder.TYPE_MATCH) + + if ( + isinstance(value, torch.Tensor) + and value.is_nested + and not isinstance(value, torch.nested._internal.nested_tensor.NestedTensor) + ): + unimplemented("torch.compile does not support strided NestedTensor") + + if is_sparse_any(value): + unimplemented( + f"torch.compile does not support sparse Tensor with {value.layout} layout" + ) + + tensor_variable = wrap_fx_proxy( + tx=self.tx, + proxy=tensor_proxy, + example_value=value, + subclass_type=subclass_type, + source=source, + **options, + ) + + self.install_guards( + functools.partial( + GuardBuilder.TENSOR_MATCH, + value=value + if isinstance(source, NumpyTensorSource) + else TensorWeakRef(value), + ) + ) + + # We install TYPE_MATCH guards for traceable wrapper subclass object, + # and recursively install corresponding guard for each inner attribute. 
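# Standalone illustration (not part of the upstream file) of why the
# TENSOR_MATCH guard installed above stores a TensorWeakRef rather than the
# tensor itself: the weak reference resolves while the user still holds the
# tensor, but does not keep it alive on its own.
import torch
from torch.utils.weak import TensorWeakRef

_t = torch.randn(2)
_ref = TensorWeakRef(_t)
assert _ref() is _t    # resolvable while the tensor is alive
del _t
assert _ref() is None  # the guard machinery alone does not extend the tensor's lifetime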
+ if is_traceable_wrapper_subclass(value): + self.install_guards(GuardBuilder.TYPE_MATCH) + attrs, _ = value.__tensor_flatten__() + for attr in attrs: + inner_value = getattr(value, attr) + inner_source = AttrSource(self.source, attr) + VariableBuilder(self.tx, inner_source)(inner_value).recursive_realize() + + self.tx.output.input_source_to_var[source] = tensor_variable + assert "tensor_dict" not in tensor_proxy.node.meta + tensor_proxy.node.meta["tensor_dict"] = value.__dict__.copy() + + # Note: this information is conveyed via subclass_type now + fake_tensor_value = tensor_variable.proxy.node.meta["example_value"] + if maybe_get_fake_mode(fake_tensor_value) is not self.tx.fake_mode: + raise InternalTorchDynamoError("Wrapped Tensor must be this graph's fake") + + grapharg = GraphArg(source, value, False, fake_tensor_value) + tensor_proxy.node.meta["grapharg"] = grapharg + self.tx.output.add_symbol_bindings(grapharg) + return tensor_variable + + def wrap_numpy_ndarray(self, value): + assert np is not None + assert isinstance(value, np.ndarray) + + source = NumpyTensorSource(self.get_source()) + + from torch._numpy import _util + + readonly = not value.flags.writeable + if readonly: + try: + value.flags.writeable = True + except ValueError: + # One can not easily make nditer elements writable, + # but warning is not the end of the world + assert isinstance(value.base, np.nditer) + pass + + try: + tensor_value = _util._try_convert_to_tensor(value) + if readonly: + from torch._prims_common import clone_preserve_strides + + tensor_value = clone_preserve_strides(tensor_value) + except NotImplementedError as e: + # failed to convert to tensor, graph break + unimplemented(str(e)) + + # We do this because we want the full behavior of guarding the numpy ndarray as if it were + # a tensor. It's a little annoying to make a VT to throw out, but there's so many side effects here + # that there's not another great way to do this atm. + # This creates the right graphargs, as well as registration for guards in tensor names and shape env. + VariableBuilder(self.tx, source)(tensor_value).recursive_realize() + proxy = self.tx.output.root_tracer.create_graph_input( + re.sub(r"[^a-zA-Z0-9]+", "_", self.name), type(tensor_value), source=source + ) + options = {"source": source} + numpy_ndarray_variable = wrap_fx_proxy_cls( + target_cls=NumpyNdarrayVariable, + tx=self.tx, + proxy=proxy, + example_value=tensor_value, + **options, + ) + + self.tx.output.input_source_to_var[source] = numpy_ndarray_variable + example_value = numpy_ndarray_variable.proxy.node.meta["example_value"] + + # is_unspecialized should be true because we are wrapping a np.ndarray as argument input, and it needs to be + # converted to a tensor. 
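+        # Rough sketch of the flow here (hypothetical input): an argument such as
+        # np.ones(3) has already been converted to a torch.Tensor above, registered
+        # as a graph input, and wrapped as a NumpyNdarrayVariable; the GraphArg
+        # below records that the original argument was an ndarray that needed this
+        # conversion.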
+ grapharg = GraphArg( + source, + tensor_value, + is_unspecialized=True, + fake_tensor=example_value, + is_tensor=True, + example_strong_ref=tensor_value, + ) + proxy.node.meta["grapharg"] = grapharg + + return numpy_ndarray_variable + + def wrap_unspecialized_primitive(self, value): + if self.name in self.tx.output.unspec_variable_map: + return self.tx.output.unspec_variable_map[self.name] + else: + shape_env = self.tx.output.shape_env + if TracingContext.get().force_unspec_int_unbacked_size_like and isinstance( + value, int + ): + wrapped_value = shape_env.create_unbacked_symint() + _constrain_range_for_size(wrapped_value) + self.tx.output.bound_symbols.add(wrapped_value.node.expr) + self.tx.output.tracked_fakes.append( + TrackedFake(wrapped_value, self.source, None) + ) + + # NB: We do not do float. For motivation, see + # https://docs.google.com/document/d/1INSCdYu1PxXcr43HrD82OudeEuS-qxQe1yZmLg2wy6A/edit + # but the general idea is that we generate kernels that can + # take unspecialized floats and use them in sizevar computation + elif ( + isinstance(value, int) + and not is_constant_source(self.get_source()) + and not isinstance(self.get_source(), RandomValueSource) + ): + if torch._dynamo.config.specialize_int: + # If specialize_int is False, also return + # a constant (but this should have been handled + # in the caller, TBH) + self.install_guards(GuardBuilder.CONSTANT_MATCH) + return ConstantVariable.create(value=value, source=self.source) + + name = self.source.name() + if name not in self.tx.output.frame_state: + # Note - this essentially means that if this name gets reused as a tensor, + # it will start fully dynamic. That should always be a safe option, and not awfully inefficient. + # Alternatively, if we want to improve pef here, we can add a third state of unset, but I am not + # sure that is necessary for now. 
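+                    # Sketch of the recompile behavior this frame state drives,
+                    # assuming default dynamo configs (hypothetical user code):
+                    #
+                    #     @torch.compile
+                    #     def f(x, n):
+                    #         return x + n
+                    #
+                    #     f(torch.ones(3), 5)  # first call: scalar recorded as 5
+                    #     f(torch.ones(3), 7)  # mismatch below clears scalar -> n becomes dynamic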
+ frame_state_entry = FrameStateSizeEntry(scalar=value, size=None) + else: + frame_state_entry = self.tx.output.frame_state[name] + if frame_state_entry.scalar != value: + log.debug( + "automatic dynamic int %s val %s != %s", + name, + value, + frame_state_entry.scalar, + ) + frame_state_entry.scalar = None + self.tx.output.frame_state[name] = frame_state_entry + + # TODO: This should be dynamic, as we in general do not + # know if bare integers are actually going to be sizevars + # and it is inappropriate to eagerly duck size them with + # real sizevars + if ( + config.automatic_dynamic_shapes and frame_state_entry.scalar is None + ) or not config.assume_static_by_default: + dynamic_dim = DimDynamic.DYNAMIC + else: # assume_static_by_default + # TODO: dynamic_dim = DimDynamic.STATIC should work but + # for some reason it doesn't + self.install_guards(GuardBuilder.CONSTANT_MATCH) + return ConstantVariable.create(value=value) + + wrapped_value = shape_env.create_unspecified_symint_and_symbol( + value, + source=self.source, + dynamic_dim=dynamic_dim, + ) + self.tx.output.bound_symbols.add(wrapped_value.node.expr) + + self.tx.output.tracked_fakes.append( + TrackedFake(wrapped_value, self.source, None) + ) + else: + wrapped_value = torch.tensor(value) + if not isinstance(self.get_source(), RandomValueSource): + install_guard(self.get_source().make_guard(GuardBuilder.TYPE_MATCH)) + options = {"source": self.get_source()} + if isinstance(wrapped_value, torch.Tensor): + options.update({"raw_value": value}) + + proxy = self.tx.output.root_tracer.create_graph_input( + re.sub(r"[^a-zA-Z0-9]+", "_", self.name), + type(wrapped_value), + source=self.get_source(), + ) + + unspec_var = wrap_fx_proxy_cls( + UnspecializedPythonVariable, + tx=self.tx, + proxy=proxy, + example_value=wrapped_value, + **options, + ) + self.tx.output.unspec_variable_map[self.name] = unspec_var + if not is_constant_source(self.get_source()): + if self.tx.export and not isinstance(self.get_source(), LocalSource): + raise AssertionError( + "Dynamo attempts to add additional input during export: value={}, source={}".format( + wrapped_value, self.get_source() + ) + ) + fake_tensor_value = None + if isinstance(unspec_var, ConstantVariable): + example_value = unspec_var.value + else: + example_value = unspec_var.proxy.node.meta["example_value"] + if is_fake(example_value): + fake_tensor_value = example_value + assert fake_tensor_value.fake_mode is self.tx.fake_mode, ( + f"fake mode ({fake_tensor_value.fake_mode}) from fake tensor metadata doesn't match mode" + "({self.tx.fake_mode}) from InstructionTranslator" + ) + + proxy.node.meta["grapharg"] = GraphArg( + self.get_source(), + wrapped_value, + isinstance(wrapped_value, torch.Tensor), + fake_tensor_value, + is_tensor=False, + example_strong_ref=wrapped_value, + ) + return unspec_var + + +def _dataclasses_fields_lambda(obj): + if isinstance(obj, UserDefinedObjectVariable): + value = obj.value + elif isinstance(obj, DataClassVariable): + value = obj.user_cls + else: + unimplemented(f"Dataclass fields handling fails for type {obj}") + items = [] + for field in dataclasses.fields(value): + source = None + if obj.source: + source = GetItemSource( + AttrSource(obj.source, "__dataclass_fields__"), field.name + ) + items.append(UserDefinedObjectVariable(field, source=source)) + return TupleVariable(items) + + +def wrap_fx_proxy(tx, proxy, example_value=None, subclass_type=None, **options): + kwargs = { + "tx": tx, + "proxy": proxy, + "example_value": example_value, + "subclass_type": 
subclass_type, + **options, + } + if subclass_type is None: + return wrap_fx_proxy_cls(target_cls=TensorVariable, **kwargs) + else: + result = wrap_fx_proxy_cls(target_cls=TensorWithTFOverrideVariable, **kwargs) + result.install_global(tx) + return result + + +# Note: Unfortunate split due to some gross classes existing that subclass TensorVariable +# Should be compositional instead +# +# This is a horribly complicated function that does too many things, to +# explain what it does, let's first talk about the classic usage wrap_fx_proxy +# for a TensorVariable. There are two primary modes of use: +# +# 1. Wrapping a pre-existing Tensor. In this case, example_value is set +# to the pre-existing Tensor. (Note that this example_value will NOT +# be the final example_value we put into node.meta['example_value'], +# instead it is converted into a fake tensor using +# wrap_to_fake_tensor_and_record and registered as a graph input.) +# +# 2. "Wrapping" the result of some Tensor operation Dynamo traced over. In +# this case, example_value is None (and we are going to figure it out +# ourselves using FakeTensors, via get_fake_value, which will run +# the operation represented by the (singular!) FX node referenced by +# the passed in proxy.) +# +# The expectation is you end up with a Tensor output, and everything is +# straightforwardly traced into the graph. +# +# In all cases, the returned `TensorVariable` subclass will have an `example_value` +# and that `example_value` must be a `FakeTensor` produced by the currently running +# instance of Dynamo. +# +# Upon closer inspection, you may notice that there are a slurry of non-Tensor +# output cases. What gives? Well, we sometimes trace operations into the +# graph that don't involve tensors. +# +# * Some operators return tuples; we need to recursively handle their +# contents +# +# * Some operators have side effects that will affect subsequent AOTAutograd +# tracing but don't otherwise return anything. +# +# * Some operators return symbolic ints/floats/bools which can go in the +# graph and be traced (but only if they're actually symbolic! If they're +# static you don't want to put them in the graph, which means you +# shouldn't call this function.) +# +# The common theme is that you only use this function WHEN YOU ARE TRACING +# SOMETHING INTO THE GRAPH. This is sort of obvious, because you can't call +# this function without a proxy. +def wrap_fx_proxy_cls( + target_cls, tx, proxy, example_value=None, subclass_type=None, **options +): + from ..symbolic_convert import InstructionTranslatorBase + + assert isinstance(tx, InstructionTranslatorBase) + if "guards" in options and options["guards"] is not None: + tx.output.guards.update(options["guards"]) + + assert "example_value" not in proxy.node.meta, f"{proxy.node.meta['example_value']}" + + initial_example_value = example_value + + def _clone_input(value): + if isinstance(value, torch.Tensor): + # tensor subclasses will not be converted to FakeTensors and need to be cloned + if not ( + isinstance(value, FakeTensor) + or ( + # Is functional tensor fakeified by this instance of Dynamo + torch._is_functional_tensor(value) + and maybe_get_fake_mode(value) is tx.fake_mode + ) + or value.is_nested + ): + # NB: ensure strides are preserved + value = clone_input(value) + + return value + + with preserve_rng_state(): + if example_value is None: + # only allow_non_graph_fake in this instance because we handle the non-fake + # cases properly below. 
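+            # Sketch of mode (2) from the note above: for a traced call such as a
+            # node with op "call_function" and target torch.add, get_fake_value
+            # re-runs that single node on the FakeTensors stored in its inputs'
+            # meta["example_value"], and the fake result becomes this node's
+            # example_value.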
+ example_value = get_fake_value(proxy.node, tx, allow_non_graph_fake=True) + + # Handle recursive calls here + elif maybe_get_fake_mode(example_value) is tx.fake_mode: + pass + + elif isinstance(example_value, torch.Tensor): + if tx.export: + # The legacy behavior for real value cache with subclasses was + # to perform a clone WITHOUT preserving the subclass. It's + # not entirely clear this is what you actually want though. + with torch._C.DisableTorchFunctionSubclass(): + proxy.tracer.real_value_cache[proxy.node] = _clone_input( + example_value + ) + # NB: If we're ignoring subclass, then the expectation is you will + # take the returned TensorVariable and wrap it into a more + # accurate TensorVariable that is able to track subclass-ness; + # otherwise this is wrong! + kwargs = { + "is_tensor": target_cls + in (TensorVariable, TensorWithTFOverrideVariable), + } + assert "source" in options and options["source"] is not None + kwargs["source"] = options["source"] + example_value = wrap_to_fake_tensor_and_record( + example_value, tx=tx, **kwargs + ) + if isinstance(example_value, torch.Tensor) and ( + maybe_get_fake_mode(example_value) is not tx.fake_mode + ): + raise InternalTorchDynamoError( + "`example_value` needs to be a `FakeTensor`" + f"wrapped by this instance of Dynamo. Found: {example_value}" + ) + + if isinstance(example_value, torch.Tensor): + is_parameter = isinstance(example_value, torch.nn.Parameter) + + # NB: In most (all?) cases, this does not actually do a clone. + # (WARNING: this means that if we mutate metadata on the fake + # tensor, the stored example value will update too!) + example_value = _clone_input(example_value) + proxy.node.meta["example_value"] = example_value + specialized_props = target_cls.specialize(example_value) + # TODO: not sure about this fake mode test + if ( + isinstance(example_value, torch._subclasses.fake_tensor.FakeTensor) + and example_value.fake_mode is tx.fake_mode + ): + tensor_type = subclass_type if subclass_type else torch.Tensor + specialized_props["class_type"] = ( + torch.nn.Parameter if is_parameter else tensor_type + ) + + options.update(specialized_props) + return target_cls(proxy, **options) + elif ( + hasattr(proxy.node.target, "__name__") + and proxy.node.target.__name__ == "set_state" + and isinstance(proxy.node.target.__self__, torch._C.Generator) + or proxy.node.target == torch.random.set_rng_state + ): + return TorchInGraphFunctionVariable(proxy.node.target) + elif ( + proxy.node.target == torch._C._DisableFuncTorch + or proxy.node.target == torch.cuda._is_in_bad_fork + ): + return UserDefinedObjectVariable(example_value) + elif istype(example_value, torch.Size) and all( + isinstance(x, int) for x in example_value + ): + sizes = [ConstantVariable.create(x) for x in example_value] + return SizeVariable(sizes, **options) + elif isinstance(example_value, (tuple, list)): + proxy.node.meta["example_value"] = example_value + unpacked = [] + for i, val in enumerate(example_value): + if val is None: + # nn.MultiheadAttention() can return None, see issue #175 + unpacked.append( + ConstantVariable.create(None, **options), + ) + else: + unpacked.append( + wrap_fx_proxy_cls( + target_cls, + tx, + proxy.tracer.create_proxy( + "call_function", operator.getitem, (proxy, i), {} + ), + example_value=val, + **options, + ) + ) + if isinstance(example_value, torch.Size): + # NB: Keep the old proxy around. 
See SizeVariable for an + # explanation why + return SizeVariable(unpacked, proxy, **options) + elif istype(example_value, tuple): + return TupleVariable(unpacked, **options) + elif istype(example_value, (list, immutable_list)): + return ListVariable(unpacked, mutable_local=MutableLocal(), **options) + else: + assert example_value.__class__.__module__ == "torch.return_types" or hasattr( + example_value, "_fields" + ), f"expected {example_value.__class__.__module__} == torch.return_types or named tuple but got {type(example_value)}" + return NamedTupleVariable(unpacked, example_value.__class__, **options) + elif example_value is None or proxy.node.target is torch.manual_seed: + return ConstantVariable.create(None, **options) + elif isinstance(example_value, (torch.SymInt, torch.SymFloat, torch.SymBool)): + proxy.node.meta["example_value"] = example_value + return SymNodeVariable(proxy, example_value, **options) + elif ( + inspect.isclass(proxy.node.target) + and issubclass(proxy.node.target, _StreamBase) + ) or proxy.node.target in [ + device_interface.current_stream + for _, device_interface in get_registered_device_interfaces() + ]: + proxy.node.meta["example_value"] = example_value + return StreamVariable(proxy, example_value, example_value.device, **options) + elif ( + inspect.isclass(proxy.node.target) and issubclass(proxy.node.target, _EventBase) + ) or proxy.node.target in [ + device_interface.Event + for _, device_interface in get_registered_device_interfaces() + ]: + proxy.node.meta["example_value"] = example_value + return EventVariable(proxy, example_value, **options) + elif proxy.node.target == "query" and proxy.node.op == "call_method": + proxy.node.meta["example_value"] = example_value + return ConstantVariable(example_value, **options) + elif ( + example_value is not None + and isinstance(example_value, _EventBase) + and proxy.node.target == "record_event" + and proxy.node.op == "call_method" + ): + proxy.node.meta["example_value"] = example_value + return EventVariable(proxy, example_value, **options) + elif isinstance(example_value, int) and proxy.node.target in [ + torch.sym_int, + getattr, + operator.getitem, + torch._utils._element_size, + torch.seed, + operator.mod, + torch._C._functorch._vmap_increment_nesting, + torch._C._functorch._vmap_decrement_nesting, + torch._functorch.vmap._validate_and_get_batch_size, + torch._C._functorch._grad_increment_nesting, + torch._C._functorch._grad_decrement_nesting, + # some mac builds are missing torch.distributed.get_rank() + getattr(torch.distributed, "get_rank", _missing), + getattr(torch.distributed, "get_world_size", _missing), + # This always wants to be in the graph, even if the constraint + # results in a constant int + torch._constrain_as_value, + torch._constrain_as_size, + ]: + proxy.node.meta["example_value"] = example_value + return ConstantVariable.create(example_value, **options) + elif isinstance(example_value, torch.backends.cuda.SDPAParams): + from .sdpa import SDPAParamsVariable + + proxy.node.meta["example_value"] = example_value + return SDPAParamsVariable(proxy, **options) + elif isinstance(example_value, bool) and proxy.node.target in [ + torch.backends.cuda.can_use_flash_attention, + torch.backends.cuda.can_use_efficient_attention, + ]: + proxy.node.meta["example_value"] = example_value + return ConstantVariable.create(example_value, **options) + else: + unimplemented( + "torch.* op returned non-Tensor " + + f"{typestr(example_value)} {proxy.node.op} {proxy.node.target}" + ) + + +# Tracks the sources of all 
fake tensors we wrap in Dynamo. +# Used by shape guard computation. +@dataclasses.dataclass +class TrackedFake: + fake: Union[FakeTensor, SymInt] + source: Source + # Is None when fake is SymInt + symbolic_context: Optional[SymbolicContext] + + def __hash__(self) -> int: + return hash((self.fake, self.source.name())) + + def __eq__(self, other: object) -> bool: + if isinstance(other, TrackedFake): + return self.fake is other.fake and self.source.name() == other.source.name() + return False + + +# Performs automatic dynamic dim determination. +# Returns a SymbolicContext +def _automatic_dynamic( + e, tx, source, static_shapes, outer_only=False +) -> SymbolicContext: + # strided NT not supported + if e.is_nested and not isinstance( + e, torch.nested._internal.nested_tensor.NestedTensor + ): + unimplemented("torch.compile does not support strided NestedTensor") + + name = source.name() + prior_policy = tx.output.tracing_context.tensor_to_context.get(e, None) + shape_env_to_source_to_symbol_cache = ( + prior_policy.shape_env_to_source_to_symbol_cache if prior_policy else None + ) + + # Get base context if the tensor is a view + view_base_context: Optional[SymbolicContext] = None + if e._is_view(): + base_source = AttrSource(source, "_base") + view_base_context = _automatic_dynamic(e._base, tx, base_source, static_shapes) + + if is_traceable_wrapper_subclass(e) and not outer_only: + # Get symbolic context for outer tensor + outer_context = _automatic_dynamic( + e, tx, source, static_shapes, outer_only=True + ) + + # Get symbolic contexts for inner tensors + attrs, _ = type(e).__tensor_flatten__(e) + inner_contexts = {} # mapping from attr -> symbolic context + for attr in attrs: + inner_tensor = getattr(e, attr) + inner_source = AttrSource(source, attr) + inner_context = _automatic_dynamic( + inner_tensor, tx, inner_source, static_shapes + ) + inner_contexts[attr] = inner_context + + return SubclassSymbolicContext( + dynamic_sizes=outer_context.dynamic_sizes, + constraint_sizes=outer_context.constraint_sizes, + view_base_context=view_base_context, + tensor_source=outer_context.tensor_source, + shape_env_to_source_to_symbol_cache=outer_context.shape_env_to_source_to_symbol_cache, + inner_contexts=inner_contexts, + ) + + if static_shapes: + return StatefulSymbolicContext( + dynamic_sizes=[DimDynamic.STATIC] * e.dim(), + constraint_sizes=[None] * e.dim(), + view_base_context=view_base_context, + tensor_source=source, + shape_env_to_source_to_symbol_cache=shape_env_to_source_to_symbol_cache, + ) + + # We preserve the dynamism of inputs. For example, when users call + # make_fx(torch.cond, tracing_mode="symbolic")(*args), inputs have SymInt sizes. + from torch.fx.experimental.symbolic_shapes import is_nested_int + + if any(isinstance(s, SymInt) and not is_nested_int(s) for s in e.size()): + return StatefulSymbolicContext( + dynamic_sizes=[ + DimDynamic.DYNAMIC if isinstance(s, SymInt) else DimDynamic.STATIC + for s in e.size() + ], + constraint_sizes=[None] * e.dim(), + view_base_context=view_base_context, + tensor_source=source, + shape_env_to_source_to_symbol_cache=shape_env_to_source_to_symbol_cache, + ) + + # Prep for automatic dynamic + frame_state_entry = None + if name not in tx.output.frame_state: + # If there is no entry for this source, add the tensor to frame state with its current static size. 
+ # E.g., {} -> {"x": [2, 4]} + frame_state_entry = FrameStateSizeEntry(None, None) + frame_state_entry.size = list(e.size()) + else: + frame_state_entry = tx.output.frame_state[name] + if frame_state_entry.size is not None: + if e.ndim != len(frame_state_entry.size): + # If there is already an entry, and the dim mismatches, replace the frame state entry with None. + # E.g. {"x": [2, 3, 4]} -> {"x": None} + log.debug( + "automatic dynamic %s dim %s != %s", + name, + e.ndim, + frame_state_entry.size, + ) + frame_state_entry.size = None + else: + # If there is already an entry, and the dim matches, for every size in the frame state which + # disagrees with the current static size, replace it with None. E.g., {"x": [2, 3]} -> {"x": [2, None]} + for i, dim in enumerate(frame_state_entry.size): + if dim is not None and e.size()[i] != dim: + log.debug( + "automatic dynamic %s size(%s) %s != %s", + name, + i, + e.size(i), + dim, + ) + frame_state_entry.size[i] = None + + # TODO: index export_constraints ahead of time so we don't have to + # do a linear scan every time here + t_id = id(e) + dim2constraint = {} + + def update_dim2constraint(dim, constraint_range, debug_name): + if dim in dim2constraint: + from torch.fx.experimental.symbolic_shapes import StrictMinMaxConstraint + + old_constraint_range, old_debug_name = dim2constraint[dim] + new_constraint_range = StrictMinMaxConstraint( + vr=constraint_range.vr & old_constraint_range.vr, + warn_only=False, + ) + # It is possible for (non-None) old_debug_name and debug_name to be different + # but this will only happen the corresponding Dims can be derived equal. + new_debug_name = old_debug_name or debug_name + dim2constraint[dim] = new_constraint_range, new_debug_name + else: + dim2constraint[dim] = constraint_range, debug_name + + if tx.output.export_constraints: + for constraint in tx.output.export_constraints: + if constraint.t_id == t_id: + update_dim2constraint( + constraint.dim, constraint.constraint_range, constraint.debug_name + ) + if constraint.shared is not None and constraint.shared.t_id == t_id: + # We process constraint ranges for each shared dimension separately + # so that we can directly check range constraint violations on them + # without looking up which other shared dimensions have this info. + # In other words, for this t_id, we will have processed all of its + # constraint ranges, no matter where / how they were specified, by + # by the end of this loop. 
+ update_dim2constraint( + constraint.shared.dim, + constraint.constraint_range, + constraint.debug_name, + ) + + dynamic_dims = [] + constraint_dims = [] + for i in range(e.dim()): + # NB: mark dynamic has precedence over static + marked_dynamic = i in getattr(e, "_dynamo_dynamic_indices", set()) + marked_weak_dynamic = i in getattr(e, "_dynamo_weak_dynamic_indices", set()) + marked_static = i in getattr(e, "_dynamo_static_indices", set()) + + # NB: both static and dynamic have precedence over + automatic_dynamic = config.automatic_dynamic_shapes and ( + frame_state_entry.size is None or frame_state_entry.size[i] is None + ) + + # Reflect the user directive in the frame_state + # For dynamic, apply None always + if frame_state_entry.size and marked_dynamic: + log.debug("automatic dynamic %s marked dynamic", name) + frame_state_entry.size[i] = None + + # We will process constraints first, as they will imply that we + # have a dynamic dimension + # Precedence: export constraints > eager constraints + constraint = dim2constraint.get(i) + if constraint is None: + if marked_dynamic and not config.allow_ignore_mark_dynamic: + if hasattr(e, "_dynamo_dynamic_range"): + dim_range = [ + dr for dr in e._dynamo_dynamic_range if dr.dim == i + ].pop() + if dim_range.min is None and dim_range.max is None: + constraint_dim = RelaxedUnspecConstraint(warn_only=False) + else: + from torch.fx.experimental.symbolic_shapes import ( + StrictMinMaxConstraint, + ) + + constraint_dim = StrictMinMaxConstraint( + vr=ValueRanges(lower=dim_range.min, upper=dim_range.max), + warn_only=False, + ) + else: + constraint_dim = RelaxedUnspecConstraint(warn_only=False) + + elif not marked_static and automatic_dynamic: + constraint_dim = RelaxedUnspecConstraint(warn_only=True) + else: + constraint_dim = None + else: + constraint_dim, debug_name = constraint + if debug_name is not None: + dim_name = f"{name}.size()[{i}]" + tx.output.shape_env.source_name_to_debug_name[dim_name] = debug_name + constraint_dims.append(constraint_dim) + + # Now, figure out if the dim is dynamic/duck/static + if ( + constraint_dim is not None + or marked_dynamic + or marked_weak_dynamic + or is_nested_int(e.shape[i]) + ): + # NB: We could assert static_shapes is False here, but it + # seems better to allow the user to override symbolic_context in this + # case + dynamic = DimDynamic.DYNAMIC + elif static_shapes or config.assume_static_by_default or marked_static: + dynamic = DimDynamic.STATIC + else: + dynamic = DimDynamic.DUCK + + dynamic_dims.append(dynamic) + + tx.output.frame_state[name] = frame_state_entry + + return StatefulSymbolicContext( + dynamic_sizes=dynamic_dims, + constraint_sizes=constraint_dims, + view_base_context=view_base_context, + tensor_source=source, + shape_env_to_source_to_symbol_cache=shape_env_to_source_to_symbol_cache, + ) + + +# See note [Tensor Fakification and Symbol Caching] +def wrap_to_fake_tensor_and_record( + e, tx, *, source: Optional[Source], is_tensor: bool, parent_context=None +): + if ( + type(e) in (torch.Tensor, torch.nn.Parameter, FakeTensor) + or isinstance(e, torch.Tensor) + or is_traceable_wrapper_subclass(e) + ): + assert source is not None + static_shapes, reason = tensor_always_has_static_shape( + e, is_tensor, guard_source=source.guard_source() + ) + + if not parent_context: + symbolic_context = _automatic_dynamic(e, tx, source, static_shapes) + else: + # Parent contexts are passed in when we are recursively creating + # fake tensors for subclasses. 
A better design would be not to create a + # parent/child relationship, but to recursively call _automatic_dynamic + # as we recursively call wrap_to_fake_tensor_and_record. This runs + # into bugs around how meta_utils knows and works to create fake tensors + # with tensor subclasses. Ideally, dynamo would drive both the recursive + # wrap_to_fake_tensor_and_record and _automatic_dynamic policy creation. + assert isinstance(source, AttrSource) + inner_context_name = source.member + symbolic_context = parent_context.inner_contexts[inner_context_name] + + log.debug( + "wrap_to_fake %s %s %s %s", + source.name(), + tuple(e.shape), + symbolic_context, + type(e), + ) + fake_e = wrap_fake_exception( + lambda: tx.fake_mode.from_tensor( + e, + source=source, + symbolic_context=symbolic_context, + ) + ) + + if is_traceable_wrapper_subclass(fake_e): + attrs, _ = fake_e.__tensor_flatten__() + for attr in attrs: + fake_inner = getattr(fake_e, attr) + inner = getattr(e, attr) + inner_source = AttrSource(source, attr) + wrap_to_fake_tensor_and_record( + inner, + tx, + source=inner_source, + is_tensor=isinstance(fake_inner, torch.Tensor), + parent_context=symbolic_context, + ) + + tx.output.tracing_context.tensor_to_context[e] = symbolic_context + tx.output.tensor_weakref_to_sizes_strides[e] = { + "size": fake_e.size(), + "stride": fake_e.stride(), + } + + if ( + is_tensor + and not (static_shapes and source.is_nn_module()) + and not is_constant_source(source) + ): + tx.output.tracked_fakes.append( + TrackedFake(fake_e, source, symbolic_context) + ) + tx.output.tracked_fakes_id_to_source[id(e)].append(source) + + return fake_e + else: + return e + + +class SourcelessBuilder: + """ + Like builder, but stateless and does not require a source. Useful for simple type->VT objects, or objects + that are being created/evaporated during inlining (ex: consider a locally made list of tensors we then iterate over + .), such a list should not show up as an artifact from inputs, nor in reconstruction, nor in the graph. However, + there may be reasons to represent it as a ListVariable internally. + + NOTE - Objects produced here are born UNGUARDED due to the nature of sources! + + NOTE - This class is very new! It will have some rough edges, but it was created to stem the bleeding of giant + if/else type->VariableTracker trees that were cropping up all over dynamo. + """ + + def __call__(self, tx, value) -> VariableTracker: + if isinstance(value, VariableTracker): + # This is always valid to call, and useful for recursive calls. + return value + if isinstance(value, dataclasses._HAS_DEFAULT_FACTORY_CLASS): + return UserDefinedObjectVariable(value) + if ConstantVariable.is_literal(value): + return SourcelessBuilder.wrap_constant_literal(value) + elif callable(value) and trace_rules.lookup_callable(value) is not None: + if is_callable_allowed(value): + self.tx.output.has_user_defined_allowed_in_graph = True + return trace_rules.lookup_callable(value)(value) + elif is_function_or_wrapper(value): + return trace_rules.lookup(value)(value) + elif isinstance(value, enum.Enum): + return EnumVariable(value) + elif isinstance(value, (type, abc.ABCMeta)): + return UserDefinedClassVariable(value) + elif isinstance(value, dict): + items = {self(tx, k): self(tx, v) for k, v in value.items()} + return ConstDictVariable(items, mutable_local=MutableLocal()) + elif isinstance(value, set): + # Nb. value is a set here so the iteration below is non-deterministic! 
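+            # E.g. (sketch): passing the literal set {1, 2.0} here yields a
+            # SetVariable of ConstantVariables, in arbitrary order and, as noted
+            # in the class docstring, without any guards since there is no Source.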
+ return SetVariable( + [self(tx, x) for x in value], mutable_local=MutableLocal() + ) + elif isinstance(value, (tuple, list)): + cls = BaseListVariable.cls_for(type(value)) + return cls([self(tx, x) for x in value], mutable_local=MutableLocal()) + elif isinstance(value, types.MethodWrapperType): + return MethodWrapperVariable(value) + elif PlacementVariable.is_placement(value): + return PlacementVariable(value) + elif DeviceMeshVariable.is_device_mesh(value): + return DeviceMeshVariable(value) + unimplemented(f"Unexpected type in sourceless builder {type(value)}") + + @staticmethod + def wrap_constant_literal(value): + assert ConstantVariable.is_literal(value) + return ConstantVariable.create(value=value) diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/builtin.py b/MLPY/Lib/site-packages/torch/_dynamo/variables/builtin.py new file mode 100644 index 0000000000000000000000000000000000000000..94ff064f65e9ad9c20011fa69e4d9e077a76dfbb --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/variables/builtin.py @@ -0,0 +1,1748 @@ +# mypy: ignore-errors + +import contextlib +import functools +import inspect +import itertools +import logging +import math +import operator +import types +from collections import defaultdict, OrderedDict +from typing import Dict, List + +import torch +from torch import sym_float, sym_int + +from .. import config, polyfill, variables +from ..exc import ( + AttributeMutationError, + unimplemented, + Unsupported, + UserError, + UserErrorType, +) +from ..guards import GuardBuilder, install_guard +from ..replay_record import DummyModule +from ..source import AttrSource, GetItemSource, is_constant_source, TypeSource +from ..utils import ( + check_constant_args, + check_numpy_ndarray_args, + check_unspec_python_args, + extract_fake_example_value, + get_fake_value, + guard_if_dyn, + istype, + numpy_operator_wrapper, + proxy_args_kwargs, + tensortype_to_dtype, +) +from .base import MutableLocal, typestr, VariableTracker +from .constant import ConstantVariable +from .ctx_manager import EventVariable, StreamVariable +from .dicts import ( + ConstDictVariable, + DefaultDictVariable, + DictView, + is_hashable, + SetVariable, +) +from .lists import ( + BaseListVariable, + ListIteratorVariable, + ListVariable, + SizeVariable, + TupleIteratorVariable, + TupleVariable, +) +from .tensor import ( + FakeItemVariable, + SymNodeVariable, + TensorVariable, + UnspecializedPythonVariable, +) +from .user_defined import UserDefinedVariable + +log = logging.getLogger(__name__) + + +IN_PLACE_DESUGARING_MAP = { + operator.iadd: operator.add, + operator.isub: operator.sub, + operator.imul: operator.mul, + operator.ifloordiv: operator.floordiv, + operator.itruediv: operator.truediv, + operator.imod: operator.mod, + operator.imatmul: operator.imatmul, + operator.ilshift: operator.lshift, + operator.irshift: operator.rshift, + operator.ipow: operator.pow, + operator.iand: operator.and_, + operator.ior: operator.or_, + operator.ixor: operator.xor, +} + + +def _polyfill_call_impl(name): + """Create a BuiltinVariable.call_{name} method that inlines through polyfill.{name}""" + + def call_fn(self, tx, *args, **kwargs): + return tx.inline_user_function_return( + variables.UserFunctionVariable(fn), args, kwargs + ) + + fn = getattr(polyfill, name) + call_fn.__name__ = f"call_{name}" + return call_fn + + +class BuiltinVariable(VariableTracker): + _SENTINEL = object() + + @classmethod + def create_with_source(cls, value, source): + install_guard(source.make_guard(GuardBuilder.BUILTIN_MATCH)) + 
return BuiltinVariable(value, source=source) + + @staticmethod + @functools.lru_cache(None) + def _constant_fold_functions(): + fns = { + abs, + all, + any, + bool, + callable, + chr, + divmod, + float, + getattr, + int, + len, + max, + min, + ord, + pow, + repr, + round, + str, + str.format, + sum, + type, + operator.abs, + operator.pos, + operator.neg, + operator.not_, + operator.truth, + operator.invert, + operator.pow, + operator.mul, + operator.matmul, + operator.floordiv, + operator.truediv, + operator.mod, + operator.add, + operator.sub, + operator.getitem, + operator.length_hint, + operator.lshift, + operator.rshift, + operator.and_, + operator.or_, + operator.xor, + operator.ipow, + operator.imul, + operator.imatmul, + operator.ifloordiv, + operator.itruediv, + operator.imod, + operator.iadd, + operator.isub, + operator.ilshift, + operator.irshift, + operator.iand, + operator.ixor, + operator.ior, + operator.index, + } + fns.update(x for x in math.__dict__.values() if isinstance(x, type(math.sqrt))) + return fns + + def can_constant_fold_through(self): + return self.fn in self._constant_fold_functions() + + @staticmethod + @functools.lru_cache(None) + def _fx_graph_functions(): + fns = { + operator.abs, + operator.pos, + operator.neg, + operator.not_, + operator.invert, + operator.pow, + operator.mul, + operator.matmul, + operator.floordiv, + operator.truediv, + operator.mod, + operator.add, + operator.lt, + operator.gt, + operator.ge, + operator.le, + operator.ne, + operator.eq, + operator.sub, + operator.getitem, + operator.length_hint, + operator.lshift, + operator.rshift, + operator.and_, + operator.or_, + operator.xor, + operator.ipow, + operator.imul, + operator.imatmul, + operator.ifloordiv, + operator.itruediv, + operator.imod, + operator.iadd, + operator.isub, + operator.ilshift, + operator.irshift, + operator.iand, + operator.ixor, + operator.ior, + } + return fns + + @staticmethod + @functools.lru_cache(None) + def _binops(): + # function -> ([forward name, reverse name, in-place name], in-place op) + fns = { + operator.add: (["__add__", "__radd__", "__iadd__"], operator.iadd), + operator.sub: (["__sub__", "__rsub__", "__isub__"], operator.isub), + operator.mul: (["__mul__", "__rmul__", "__imul__"], operator.imul), + operator.truediv: ( + ["__truediv__", "__rtruediv__", "__itruediv__"], + operator.itruediv, + ), + operator.floordiv: ( + ["__floordiv__", "__rfloordiv__", "__ifloordiv__"], + operator.ifloordiv, + ), + operator.mod: (["__mod__", "__rmod__", "__imod__"], operator.imod), + pow: (["__pow__", "__rpow__", "__ipow__"], operator.ipow), + operator.pow: (["__pow__", "__rpow__", "__ipow__"], operator.ipow), + operator.lshift: ( + ["__lshift__", "__rlshift__", "__ilshift__"], + operator.ilshift, + ), + operator.rshift: ( + ["__rshift__", "__rrshift__", "__irshift__"], + operator.irshift, + ), + # NB: The follow binary operators are not supported for now, since the + # corresponding magic methods aren't defined on SymInt / SymFloat: + # operator.matmul + # divmod + # operator.and_ + # operator.or_ + # operator.xor + } + return fns + + @staticmethod + @functools.lru_cache(None) + def _binop_handlers(): + # Multiple dispatch mechanism defining custom binop behavior for certain type + # combinations. Handlers are attempted in order, and will be used if the type checks + # match. 
They are expected to have the signature: + # fn(tx, arg0: VariableTracker, arg1: VariableTracker, options) -> VariableTracker + + # Override table contains: op_fn -> [list of handlers] + op_handlers = {} + for ( + op, + (magic_method_names, in_place_op), + ) in BuiltinVariable._binops().items(): + op_handlers[op] = [] + op_handlers[in_place_op] = [] + + forward_name, reverse_name, inplace_name = magic_method_names + + # User-defined args (highest precedence) + def user_defined_handler( + tx, + a, + b, + options, + forward_name=forward_name, + reverse_name=reverse_name, + ): + # Manually handle reversing logic if needed (e.g. call __radd__) + + # TODO: If we expand this to handle tensor args, we need to manually + # handle cases like this: + # + # class A(int): + # def __radd__(self, other): + # print("woof") + # torch.randn(3) + A(3) + # + # In this example, A.__radd__() is not called -> nothing is printed, because + # Tensor.__add__ only does a subtype test against int, ignoring the subclass. + # To be fully correct, we should not call A.__radd__() here, and there may be + # other cases to reason about and add exceptions for. + if isinstance(a, UserDefinedVariable): + return a.call_method(tx, forward_name, [b], {}) + else: + return b.call_method(tx, reverse_name, [a], {}) + + op_handlers[op].append( + ((UserDefinedVariable, VariableTracker), user_defined_handler) + ) + op_handlers[op].append( + ((VariableTracker, UserDefinedVariable), user_defined_handler) + ) + + def user_defined_inplace_handler( + tx, a, b, options, forward_name=inplace_name + ): + return a.call_method(tx, forward_name, [b], {}) + + op_handlers[in_place_op].append( + ((UserDefinedVariable, VariableTracker), user_defined_inplace_handler) + ) + op_handlers[in_place_op].append( + ((VariableTracker, UserDefinedVariable), user_defined_inplace_handler) + ) + + # Dynamic shape args + def dynamic_handler(tx, a, b, options, fn=op): + from .builder import wrap_fx_proxy + + return wrap_fx_proxy( + tx, + tx.output.create_proxy( + "call_function", fn, *proxy_args_kwargs([a, b], {}) + ), + **options, + ) + + op_handlers[op].append( + ((SymNodeVariable, VariableTracker), dynamic_handler) + ) + op_handlers[op].append( + ((VariableTracker, SymNodeVariable), dynamic_handler) + ) + + # NB: Prefer out-of-place op when calling in-place op to generate valid graph + op_handlers[in_place_op].append( + ((SymNodeVariable, VariableTracker), dynamic_handler) + ) + op_handlers[in_place_op].append( + ((VariableTracker, SymNodeVariable), dynamic_handler) + ) + + # Special cases - lower precedence but still prefer these over constant folding + + # List-like addition (e.g. [1, 2] + [3, 4]) + def tuple_add_handler(tx, a, b, options): + return TupleVariable(a.items + list(b.unpack_var_sequence(tx)), **options) + + def size_add_handler(tx, a, b, options): + return SizeVariable(a.items + list(b.unpack_var_sequence(tx)), **options) + + list_like_addition_handlers = [ + # NB: Prefer the tuple-specific logic over base logic because of + # some SizeVariable weirdness. Specifically, the tuple-specific logic + # drops the subclass type (e.g. SizeVariable) and returns TupleVariables. 
+ ( + (SizeVariable, SizeVariable), + size_add_handler, + ), + ( + (TupleVariable, TupleVariable), + tuple_add_handler, + ), + ( + (TupleVariable, ConstantVariable), + tuple_add_handler, + ), + ( + (ConstantVariable, TupleVariable), + lambda tx, a, b, options: TupleVariable( + list(a.unpack_var_sequence(tx)) + b.items, **options + ), + ), + ( + (BaseListVariable, BaseListVariable), + lambda tx, a, b, options: type(a)(a.items + b.items, **options), + ), + ] + op_handlers[operator.add].extend(list_like_addition_handlers) + + def list_iadd_handler(tx, a, b, _): + if not a.mutable_local or not b.has_unpack_var_sequence(tx): + # Handler doesn't apply + return None + + seq = b.unpack_var_sequence(tx) + tx.output.side_effects.mutation(a) + a.items.extend(seq) + return a + + list_like_iadd_handlers = [ + ( + (ListVariable, VariableTracker), + list_iadd_handler, + ), + ( + (TupleVariable, TupleVariable), + tuple_add_handler, + ), + ( + (TupleVariable, ConstantVariable), + tuple_add_handler, + ), + ] + op_handlers[operator.iadd].extend(list_like_iadd_handlers) + + # List-like expansion (e.g. [1, 2, 3] * 3) + def expand_list_like(tx, lst, const, options): + return lst.__class__( + items=lst.items * const.as_python_constant(), + mutable_local=MutableLocal(), + **options, + ) + + list_like_expansion_handlers = [ + ((ListVariable, ConstantVariable), expand_list_like), + ((TupleVariable, ConstantVariable), expand_list_like), + ( + (ConstantVariable, ListVariable), + lambda tx, a, b, options: expand_list_like(tx, b, a, options), + ), + ( + (ConstantVariable, TupleVariable), + lambda tx, a, b, options: expand_list_like(tx, b, a, options), + ), + ] + op_handlers[operator.mul].extend(list_like_expansion_handlers) + + return op_handlers + + @staticmethod + def _find_binop_handler(op, a, b): + handlers = BuiltinVariable._binop_handlers() + if op not in handlers: + return None + + # Return first handler that matches the type checks + for (type1, type2), handler in handlers[op]: + if isinstance(a, type1) and isinstance(b, type2): + return handler + + return None + + def can_insert_in_graph(self): + return self.fn in self._fx_graph_functions() + + def __init__(self, fn, **kwargs): + super().__init__(**kwargs) + self.fn = fn + + def __str__(self): + if self.fn is None: + name = "None" + else: + name = self.fn.__name__ + + return f"{self.__class__.__name__}({name})" + + def python_type(self): + return type(self.fn) + + def as_python_constant(self): + return self.fn + + def as_proxy(self): + DTYPE = { + bool: torch.bool, + int: torch.int64, + float: torch.float64, + } + if self.fn in DTYPE: + return DTYPE[self.fn] + return super().as_proxy() + + def reconstruct(self, codegen): + name = self.fn.__name__ + assert self.fn.__module__ == "builtins" + assert name not in codegen.tx.f_globals, "shadowed global" + codegen.append_output(codegen.create_load_global(name, False, add=True)) + + def constant_args(self, *args, **kwargs): + return check_constant_args(args, kwargs) + + def tensor_args(self, *args, **kwargs): + return any( + isinstance(i, variables.TensorVariable) + for i in itertools.chain(args, kwargs.values()) + ) and not any( + isinstance(i, variables.GetAttrVariable) + for i in itertools.chain(args, kwargs.values()) + ) + + def python_and_tensor_constant_only(self, *args, **kwargs): + tensor_args = [] + non_tensor_args = [] + for i in itertools.chain(args, kwargs.values()): + if isinstance(i, variables.TensorVariable): + tensor_args.append(i) + else: + non_tensor_args.append(i) + return all( + 
is_constant_source(t.source) if t.source is not None else False + for t in tensor_args + ) and self.constant_args(*non_tensor_args) + + def unspec_python_args(self, *args, **kwargs): + return check_unspec_python_args(args, kwargs) + + @staticmethod + def unwrap_unspec_args_kwargs(args, kwargs): + return [x.as_python_constant() for x in args], { + k: v.as_python_constant() for k, v in kwargs.items() + } + + def has_constant_handler(self, args, kwargs): + constant_args = check_constant_args(args, kwargs) + unspec_python_args = self.unspec_python_args(*args, **kwargs) + return self.can_constant_fold_through() and ( + constant_args or unspec_python_args + ) + + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + from . import UserFunctionVariable + from .builder import wrap_fx_proxy, wrap_fx_proxy_cls + + args = [v.realize() for v in args] + kwargs = {k: v.realize() for k, v in kwargs.items()} + assert isinstance(args, (list, tuple)) + assert isinstance(kwargs, dict) + tensor_args = self.tensor_args(*args, **kwargs) + + # args[0] is list and args[1] is unspec + if self.fn is operator.getitem and not isinstance( + args[0], variables.TensorVariable + ): + tensor_args = False + + if ( + self.can_insert_in_graph() + and tensor_args + and not ( + self.fn is operator.getitem + and isinstance(args[0], ConstDictVariable) + and isinstance(args[1], variables.TensorVariable) + ) + ): + try: + fn = self.fn + + # Constant fold for constant tensor and python constants + if tensor_args and self.python_and_tensor_constant_only( + *args, **kwargs + ): + from ..bytecode_transformation import unique_id + from .functions import invoke_and_store_as_constant + + return invoke_and_store_as_constant( + tx, fn, unique_id(fn.__name__), args, kwargs + ) + + if self.fn in IN_PLACE_DESUGARING_MAP and isinstance( + args[0], variables.ConstantVariable + ): + # In-place operators like += usually mustate tensor + # values, but in the edge case of immutable values they + # re-bind the variable. + # + # The easiest way to keep the graph consistent in this + # scenario is to de-sugar eagerly. + fn, args = IN_PLACE_DESUGARING_MAP[self.fn], [args[0], args[1]] + + if self.fn is operator.getitem and isinstance(args[1], SymNodeVariable): + # Standard indexing will force specialization due to + # __index__. 
Rewrite as a regular torch op which will + # trace fine + fn, args = torch.select, [ + args[0], + variables.ConstantVariable.create(0), + args[1], + ] + + # Interaction between ndarray and tensors: + # We prefer the tensor op whenever there are tensors involved + if check_numpy_ndarray_args(args, kwargs) and not any( + type(arg) == variables.TensorVariable for arg in args + ): + proxy = tx.output.create_proxy( + "call_function", + numpy_operator_wrapper(self.fn), + *proxy_args_kwargs(args, kwargs), + ) + + return wrap_fx_proxy_cls(variables.NumpyNdarrayVariable, tx, proxy) + + proxy = tx.output.create_proxy( + "call_function", + fn, + *proxy_args_kwargs(args, kwargs), + ) + if any(isinstance(arg, FakeItemVariable) for arg in args): + return wrap_fx_proxy_cls( + FakeItemVariable, + tx, + proxy, + ) + elif self.unspec_python_args(*args, **kwargs): + _args, _kwargs = self.unwrap_unspec_args_kwargs(args, kwargs) + raw_value = self.fn(*_args, **_kwargs) + + need_unwrap = any( + x.need_unwrap + for x in itertools.chain(args, kwargs.values()) + if isinstance(x, variables.UnspecializedPythonVariable) + ) + + return wrap_fx_proxy_cls( + UnspecializedPythonVariable, + tx, + proxy, + raw_value=raw_value, + need_unwrap=need_unwrap, + ) + elif all(isinstance(x, SymNodeVariable) for x in args): + return SymNodeVariable.create(tx, proxy, None) + else: + # Work around for vision_maskrcnn due to precision difference + # specialize the dividend when float divide by tensor + if self.fn is operator.truediv and isinstance( + args[0], variables.UnspecializedPythonVariable + ): + args[0] = args[0].convert_to_constant(tx) + return wrap_fx_proxy(tx, proxy) + + except NotImplementedError: + unimplemented(f"partial tensor op: {self} {args} {kwargs}") + + # Handle cases like int(torch.seed()) + # Also handle sym_float to sym_int cases + if self.fn in (int, float) and isinstance( + args[0], (SymNodeVariable, variables.TensorVariable) + ): + if isinstance(args[0], variables.TensorVariable): + item = args[0].call_method(tx, "item", [], {}) + else: + item = args[0] + fn_ = sym_int if self.fn is int else sym_float + out = wrap_fx_proxy( + tx=tx, + proxy=tx.output.create_proxy( + "call_function", + fn_, + (item.as_proxy(),), + {}, + ), + ) + return out + + # Handle `str` on a user defined function + if self.fn == str and args and isinstance(args[0], (UserFunctionVariable)): + return variables.ConstantVariable.create(value=str(args[0].fn)) + + # Handle binary ops (e.g. __add__ / __radd__, __iadd__, etc.) + # NB: Tensor args are handled above and not here + if len(kwargs) == 0 and len(args) == 2: + # Try to find a handler for the arg types; otherwise, fall through to constant handler + binop_handler = BuiltinVariable._find_binop_handler( + self.fn, args[0], args[1] + ) + if binop_handler: + res = binop_handler(tx, args[0], args[1], {}) + if res is not None: + return res + + handler = getattr(self, f"call_{self.fn.__name__}", None) + + if handler: + try: + result = handler(tx, *args, **kwargs) + if result is not None: + return result + except TypeError: + # Check if binding is bad. inspect signature bind is expensive. + # So check only when handler call fails. 
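+                # In other words (sketch): a TypeError caused by a bad arg count
+                # fails the bind below and, absent a constant handler, is reported
+                # via unimplemented(), while a TypeError raised inside a
+                # correctly-bound handler is re-raised unchanged.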
+ try: + inspect.signature(handler).bind(tx, *args, **kwargs) + except TypeError as e: + has_constant_handler = self.has_constant_handler(args, kwargs) + if not has_constant_handler: + log.warning( + "incorrect arg count %s %s and no constant handler", + handler, + e, + ) + unimplemented(f"invalid handler args {handler} {args} {kwargs}") + else: + raise + except Unsupported as exc: + has_constant_handler = self.has_constant_handler(args, kwargs) + if not has_constant_handler: + raise + # Actually, we will handle this just fine + exc.remove_from_stats() + + # NB: call to has_constant_handler is deliberately delayed post generic + # handler because has_constant_handler calls as_python_constant + # internally which realizes LazyVariableTracker for ConstantVariables, + # unnecessarily putting guards on objects which might not actually be used. + has_constant_handler = self.has_constant_handler(args, kwargs) + if has_constant_handler: + from .builder import SourcelessBuilder + + # constant fold + return SourcelessBuilder()( + tx, + self.as_python_constant()( + *[x.as_python_constant() for x in args], + **{k: v.as_python_constant() for k, v in kwargs.items()}, + ), + ) + + return super().call_function(tx, args, kwargs) + + def call_method( + self, + tx, + name, + args: "List[VariableTracker]", + kwargs: "Dict[str, VariableTracker]", + ) -> "VariableTracker": + if self.fn == dict and name == "fromkeys": + return BuiltinVariable.call_custom_dict_fromkeys(tx, dict, *args, **kwargs) + if self.fn == itertools.chain and name == "from_iterable": + assert len(args) == 1 + assert len(kwargs) == 0 + obj = args[0] + items = [] + for item in obj.unpack_var_sequence(tx): + items.extend(item.unpack_var_sequence(tx)) + return variables.TupleVariable(items) + + return super().call_method(tx, name, args, kwargs) + + def _call_min_max(self, tx, *args): + if len(args) == 1 and args[0].has_unpack_var_sequence(tx): + # expand iterable + items = args[0].unpack_var_sequence(tx) + return self._call_min_max_seq(tx, items) + elif len(args) == 2: + return self._call_min_max_binary(tx, args[0], args[1]) + elif len(args) > 2: + return self._call_min_max_seq(tx, args) + + def _call_min_max_seq(self, tx, items): + assert len(items) > 0 + if len(items) == 1: + return items[0] + + return functools.reduce(functools.partial(self._call_min_max_binary, tx), items) + + def _call_min_max_binary(self, tx, a, b): + if self.tensor_args(a, b): + if not isinstance(a, variables.TensorVariable): + a, b = b, a + assert isinstance(a, variables.TensorVariable) + + # result of an item call is a scalar convert to a tensor + if isinstance(a, FakeItemVariable): + a = variables.TorchInGraphFunctionVariable(torch.tensor).call_function( + tx, [a], {} + ) + + # Dynamic input does not get resolved, rather, gets stored as call_function + if isinstance(a, SymNodeVariable) or isinstance(b, SymNodeVariable): + from .builder import wrap_fx_proxy_cls + + return wrap_fx_proxy_cls( + type(a), + tx=tx, + proxy=tx.output.create_proxy( + "call_function", + self.fn, + *proxy_args_kwargs([a, b], {}), + ), + ) + + # convert min/max to torch ops + if b.is_python_constant(): + if isinstance(a, variables.NumpyNdarrayVariable): + import numpy as np + + fn = variables.NumpyVariable(np.clip) + else: + fn = variables.TorchInGraphFunctionVariable(torch.clamp) + kwargs = {"min": b} if (self.fn is max) else {"max": b} + result = fn.call_function(tx, [a], kwargs) + else: + if isinstance(a, variables.NumpyNdarrayVariable): + import numpy as np + + fn = {max: np.maximum, min: 
np.minimum}[self.fn] + fn = variables.NumpyVariable(fn) + else: + fn = {max: torch.maximum, min: torch.minimum}[self.fn] + fn = variables.TorchInGraphFunctionVariable(fn) + result = fn.call_function(tx, [a, b], {}) + + # return unspec if both a, b are unspec or const + if all( + isinstance( + i, + ( + variables.UnspecializedPythonVariable, + variables.ConstantVariable, + ), + ) + for i in [a, b] + ): + if any(isinstance(val, FakeItemVariable) for val in [a, b]): + return variables.FakeItemVariable.from_tensor_variable(result) + + if b.is_python_constant(): + raw_b = b.as_python_constant() + else: + raw_b = b.raw_value + if self.fn is max: + raw_res = max(a.raw_value, raw_b) + else: + raw_res = min(a.raw_value, raw_b) + + need_unwrap = any( + x.need_unwrap + for x in [a, b] + if isinstance(x, variables.UnspecializedPythonVariable) + ) + return variables.UnspecializedPythonVariable.from_tensor_variable( + result, raw_res, need_unwrap + ) + # otherwise return tensor + else: + return result + elif isinstance(a, SymNodeVariable) or isinstance(b, SymNodeVariable): + fn = torch.sym_max if self.fn is max else torch.sym_min + proxy = tx.output.create_proxy( + "call_function", fn, *proxy_args_kwargs([a, b], {}) + ) + return SymNodeVariable.create(tx, proxy, None) + + call_min = _call_min_max + call_max = _call_min_max + + def call_abs(self, tx, arg: "VariableTracker"): + # Call arg.__abs__() + abs_method = BuiltinVariable(getattr).call_function( + tx, [arg, ConstantVariable.create("__abs__")], {} + ) + return abs_method.call_function(tx, [], {}) + + def call_pos(self, tx, arg: "VariableTracker"): + # Call arg.__pos__() + pos_method = BuiltinVariable(getattr).call_function( + tx, [arg, ConstantVariable.create("__pos__")], {} + ) + return pos_method.call_function(tx, [], {}) + + def call_round(self, tx, arg, *args, **kwargs): + # Call arg.__round__() + round_method = BuiltinVariable(getattr).call_function( + tx, [arg, ConstantVariable.create("__round__")], {} + ) + return round_method.call_function(tx, args, kwargs) + + def call_range(self, tx, *args): + if self.unspec_python_args(*args) or self.constant_args(*args): + return variables.RangeVariable(args) + elif self._dynamic_args(*args): + args = [ + variables.ConstantVariable.create(guard_if_dyn(arg)) for arg in args + ] + return variables.RangeVariable(args) + # None no-ops this handler and lets the driving function proceed + return None + + def _dynamic_args(self, *args, **kwargs): + return any(isinstance(x, SymNodeVariable) for x in args) or any( + isinstance(x, SymNodeVariable) for x in kwargs.values() + ) + + def call_slice(self, tx, *args): + return variables.SliceVariable(args) + + def _dyn_proxy(self, tx, *args, **kwargs): + from .builder import wrap_fx_proxy + + return wrap_fx_proxy( + tx, + tx.output.create_proxy( + "call_function", self.fn, *proxy_args_kwargs(args, kwargs) + ), + ) + + def _call_iter_tuple_list(self, tx, obj=None, *args, **kwargs): + if self._dynamic_args(*args, **kwargs): + return self._dyn_proxy(tx, *args, **kwargs) + + if isinstance(obj, variables.IteratorVariable): + # For non-list iterators, we will guard on vars that + # determine the control flow + return obj + + cls = variables.BaseListVariable.cls_for(self.fn) + if obj is None: + return cls( + [], + mutable_local=MutableLocal(), + ) + elif obj.has_unpack_var_sequence(tx): + if obj.source and not is_constant_source(obj.source): + if isinstance(obj, TupleIteratorVariable): + install_guard( + obj.source.make_guard(GuardBuilder.TUPLE_ITERATOR_LEN) + ) + else: + 
install_guard(obj.source.make_guard(GuardBuilder.SEQUENCE_LENGTH)) + + return cls( + list(obj.unpack_var_sequence(tx)), + mutable_local=MutableLocal(), + ) + + def call_iter(self, tx, obj, *args, **kwargs): + # Handle the case where we are iterating over a tuple, list or iterator + ret = self._call_iter_tuple_list(tx, obj, *args, **kwargs) + + if ret is None: + # If the object doesn't implement a __iter__ method, it will be an error in eager mode when calling iter on it anyway. + # If the object implements a __iter__ method, inlining effectively forwards the call to another iter call + # (e.g. when __iter__ just returns iter(self.list)) or return a user-defined iterator. + return obj.call_method(tx, "__iter__", args, kwargs) + return ret + + call_tuple = _call_iter_tuple_list + call_list = _call_iter_tuple_list + + def call_callable(self, tx, arg): + from .functions import BaseUserFunctionVariable + + if isinstance( + arg, (variables.UserDefinedClassVariable, BaseUserFunctionVariable) + ): + return variables.ConstantVariable.create(True) + elif isinstance(arg, UserDefinedVariable): + return variables.ConstantVariable.create(callable(arg.value)) + elif isinstance(arg, (ConstantVariable, SymNodeVariable, TensorVariable)): + return variables.ConstantVariable.create(False) + + def call_cast(self, _, *args, **kwargs): + if len(args) == 2: + return args[1] + + unimplemented(f"unsupported args to builtin cast(): {args} {kwargs}") + + def call_dict(self, tx, *args, **kwargs): + return BuiltinVariable.call_custom_dict(tx, dict, *args, **kwargs) + + @staticmethod + def call_custom_dict(tx, user_cls, *args, **kwargs): + if not kwargs: + if not args: + args = ({},) + assert len(args) == 1 + arg = args[0] + if isinstance(arg, dict): + return ConstDictVariable(arg, user_cls, mutable_local=MutableLocal()) + elif isinstance(arg, variables.ConstDictVariable): + return arg.clone(user_cls=user_cls, mutable_local=MutableLocal()) + elif isinstance( + arg, + ( + ListVariable, + TupleVariable, + ListIteratorVariable, + ), + ): + items = dict( + x.unpack_var_sequence(tx) for x in arg.unpack_var_sequence(tx) + ) + return ConstDictVariable(items, user_cls, mutable_local=MutableLocal()) + elif not args and kwargs: + items = {ConstantVariable.create(k): v for k, v in kwargs.items()} + return variables.ConstDictVariable( + items, user_cls=user_cls, mutable_local=MutableLocal() + ) + unimplemented(f"{user_cls.__name__}(): {args} {kwargs}") + + @staticmethod + def call_custom_dict_fromkeys(tx, user_cls, *args, **kwargs): + assert user_cls in {dict, OrderedDict, defaultdict} + if kwargs: + # Only `OrderedDict.fromkeys` accepts `value` passed by keyword + assert user_cls is OrderedDict + assert len(args) == 1 and len(kwargs) == 1 and "value" in kwargs + args = (*args, kwargs.pop("value")) + if len(args) == 0: + raise UserError(TypeError, "fromkeys expected at least 1 argument, got 0") + if len(args) == 1: + args = (*args, ConstantVariable.create(None)) + assert len(args) == 2 + arg, value = args + DictVariableType = ( + ConstDictVariable if user_cls is not defaultdict else DefaultDictVariable + ) + + if isinstance(arg, dict): + arg = [ConstantVariable.create(k) for k in arg.keys()] + return DictVariableType( + dict.fromkeys(arg, value), user_cls, mutable_local=MutableLocal() + ) + elif arg.has_unpack_var_sequence(tx) and all( + is_hashable(v) for v in arg.unpack_var_sequence(tx) + ): + keys = arg.unpack_var_sequence(tx) + return DictVariableType( + dict.fromkeys(keys, value), user_cls, mutable_local=MutableLocal() + ) + 
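+        # Anything else (keys from a non-unpackable iterable, or unhashable
+        # key VariableTrackers) falls through to a graph break.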
unimplemented(f"{user_cls.__name__}.fromkeys(): {args} {kwargs}") + + def call_set(self, tx, *args, **kwargs): + # Can we merge this implementation and call_dict's one? + assert not kwargs + if not args: + return SetVariable([], mutable_local=MutableLocal()) + assert len(args) == 1 + arg = args[0] + if isinstance(arg, variables.SetVariable): + return arg.clone(mutable_local=MutableLocal()) + elif arg.has_unpack_var_sequence(tx): + items = arg.unpack_var_sequence(tx) + return SetVariable(items, mutable_local=MutableLocal()) + else: + unimplemented(f"set(): {args} {kwargs}") + + def call_zip(self, tx, *args, **kwargs): + if kwargs: + assert len(kwargs) == 1 and "strict" in kwargs + if all(x.has_unpack_var_sequence(tx) for x in args): + unpacked = [arg.unpack_var_sequence(tx) for arg in args] + if kwargs.pop("strict", False) and len(unpacked) > 0: + if not all(len(u) == len(unpacked[0]) for u in unpacked): + raise UserError( + ValueError, + "zip() has one argument of len differing from others", + ) + items = [variables.TupleVariable(list(item)) for item in zip(*unpacked)] + return variables.TupleVariable(items) + + def call_enumerate(self, tx, *args): + if len(args) == 1: + start = 0 + else: + assert len(args) == 2 + assert isinstance(args[1], variables.ConstantVariable) + start = args[1].as_python_constant() + if args[0].has_unpack_var_sequence(tx): + items = [ + variables.TupleVariable( + [variables.ConstantVariable.create(idx), var], + ) + for idx, var in enumerate(args[0].unpack_var_sequence(tx), start) + ] + return variables.TupleVariable(items) + + def call_len(self, tx, *args, **kwargs): + return args[0].call_method(tx, "__len__", args[1:], kwargs) + + def call_getitem(self, tx, *args, **kwargs): + return args[0].call_method(tx, "__getitem__", args[1:], kwargs) + + def call_isinstance(self, tx, arg, isinstance_type): + try: + arg_type = arg.python_type() + except NotImplementedError: + unimplemented( + f"isinstance({arg}, {isinstance_type}): can't determine type of {arg}" + ) + + isinstance_type = isinstance_type.as_python_constant() + + if isinstance(arg, variables.TensorVariable) and arg.dtype is not None: + + def _tensor_isinstance(tensor_var, tensor_type): + def check_type(ty): + if ty not in tensortype_to_dtype: + return issubclass(arg.python_type(), ty) + + dtypes = tensortype_to_dtype[ty] + return arg.dtype in dtypes + + if type(tensor_type) is tuple: + return any(check_type(ty) for ty in tensor_type) + else: + return check_type(tensor_type) + + return variables.ConstantVariable.create( + _tensor_isinstance(arg, isinstance_type) + ) + # UserDefinedObject with C extensions can have torch.Tensor attributes, + # so break graph. 
+ if isinstance(arg, variables.UserDefinedObjectVariable) and isinstance( + arg.value, types.MemberDescriptorType + ): + unimplemented( + f"isinstance called on UserDefinedClass {arg} {isinstance_type}" + ) + # handle __instancecheck__ defined in user class + if ( + isinstance(arg, variables.UserDefinedObjectVariable) + and "__instancecheck__" in isinstance_type.__class__.__dict__ + ): + return variables.ConstantVariable.create( + isinstance_type.__class__.__instancecheck__(isinstance_type, arg.value) + ) + + try: + val = issubclass(arg_type, isinstance_type) + except TypeError: + val = arg_type is isinstance_type + return variables.ConstantVariable.create(val) + + def call_issubclass(self, tx, left_ty, right_ty): + """Checks if first arg is subclass of right arg""" + left_ty = left_ty.as_python_constant() + right_ty = right_ty.as_python_constant() + + return variables.ConstantVariable(issubclass(left_ty, right_ty)) + + def call_super(self, tx, a, b): + return variables.SuperVariable(a, b) + + def call_next(self, tx, arg): + if isinstance( + arg, (variables.ListIteratorVariable, variables.IteratorVariable) + ): + val, next_iter = arg.next_variables(tx) + return val + elif isinstance(arg, variables.BaseListVariable): + return arg.items[0] + + def call_hasattr(self, tx, obj, attr): + if attr.is_python_constant(): + name = attr.as_python_constant() + return obj.call_hasattr(tx, name) + + def call_map(self, tx, fn, seq): + if seq.has_unpack_var_sequence(tx): + items = [fn.call_function(tx, [x], {}) for x in seq.unpack_var_sequence(tx)] + return variables.TupleVariable(items) + + def call_sum(self, tx, seq, start=_SENTINEL): + # Special case for sum on tuple of floats and ints + if isinstance(seq, (variables.ListVariable, variables.TupleVariable)) and all( + isinstance(x, variables.ConstantVariable) + and isinstance(x.value, (int, float)) + for x in seq.items + ): + if start is self._SENTINEL: + return variables.ConstantVariable.create( + sum(x.value for x in seq.items), + ) + if isinstance(start, variables.ConstantVariable) and isinstance( + start.value, (int, float) + ): + return variables.ConstantVariable.create( + sum((x.value for x in seq.items), start=start.value), + ) + if seq.has_unpack_var_sequence(tx): + if start is self._SENTINEL: + start = variables.ConstantVariable.create(0) + items = seq.unpack_var_sequence(tx) + return BuiltinVariable(functools.reduce).call_function( + tx, + [ + BuiltinVariable(operator.add), + variables.TupleVariable(items), + start, + ], + {}, + ) + + def call_reduce(self, tx, function, iterable, initial=_SENTINEL): + if iterable.has_unpack_var_sequence(tx): + items = iterable.unpack_var_sequence(tx) + if initial is self._SENTINEL: + value, items = items[0], items[1:] + else: + value = initial + for element in items: + value = function.call_function(tx, [value, element], {}) + return value + + def call_getattr( + self, tx, obj: VariableTracker, name_var: VariableTracker, default=None + ): + from .. import trace_rules + from . import ( + ConstantVariable, + GetAttrVariable, + PythonModuleVariable, + TorchInGraphFunctionVariable, + UserFunctionVariable, + ) + from .builder import SourcelessBuilder, VariableBuilder + + name = name_var.as_python_constant() + + if not name_var.is_python_constant(): + unimplemented("non-const getattr() name") + + if tx.output.side_effects.is_attribute_mutation(obj): + try: + # re-read a pending side effect? 
+ return tx.output.side_effects.load_attr(obj, name) + except KeyError: + pass + + if default is not None: + hasattr_var = self.call_hasattr(tx, obj, name_var) + assert hasattr_var.as_python_constant() in (True, False) + if not hasattr_var.as_python_constant(): + return default + + options = {} + if obj.source: + source = AttrSource(obj.source, name) + options["source"] = source + else: + source = None + + if name == "__bases__": + try: + value = obj.as_python_constant() + if isinstance(value, type): + bases = value.__bases__ + if source is not None: + tuple_args = [ + VariableBuilder(tx, GetItemSource(source, i))(b) + for i, b in enumerate(bases) + ] + else: + tuple_args = [SourcelessBuilder()(tx, b) for b in bases] + + return variables.TupleVariable(tuple_args, **options) + except NotImplementedError: + pass + + if isinstance(obj, variables.NNModuleVariable): + return obj.var_getattr(tx, name) + elif isinstance( + obj, + ( + variables.TensorVariable, + variables.NamedTupleVariable, + variables.ConstantVariable, + variables.UserDefinedClassVariable, + variables.UserDefinedObjectVariable, + ), + ): + try: + return obj.var_getattr(tx, name) + except NotImplementedError: + return GetAttrVariable(obj, name, **options) + elif isinstance(obj, TorchInGraphFunctionVariable): + # Get OpOverload from an OpOverloadPacket, e.g., torch.ops.aten.add.default. + member = getattr(obj.value, name) + if isinstance( + member, (torch._ops.OpOverloadPacket, torch._ops.OpOverload) + ) and trace_rules.is_aten_op_or_tensor_method(member): + return TorchInGraphFunctionVariable(member, **options) + elif isinstance(obj, (PythonModuleVariable, DummyModule)): + if obj.is_torch: + member = getattr(obj.value, name) + else: + member = obj.value.__dict__[name] + + if config.replay_record_enabled: + tx.exec_recorder.record_module_access(obj.value, name, member) + + if source is not None: + return VariableBuilder(tx, source)(member) + else: + return SourcelessBuilder()(tx, member) + elif istype(obj, UserFunctionVariable) and name in ("__name__", "__module__"): + return ConstantVariable.create(getattr(obj.fn, name)) + else: + try: + return obj.var_getattr(tx, name) + except NotImplementedError: + return GetAttrVariable(obj, name, **options) + + def call_setattr( + self, tx, obj: VariableTracker, name_var: VariableTracker, val: VariableTracker + ): + from .distributed import PlacementVariable + + if isinstance( + obj, + ( + variables.DataClassVariable, + variables.CustomizedDictVariable, + PlacementVariable, + ), + ): + return obj.call_method(tx, "__setattr__", [name_var, val], {}) + elif ( + tx.output.side_effects.is_attribute_mutation(obj) + and name_var.is_python_constant() + ): + name = name_var.as_python_constant() + if isinstance(obj, variables.TensorVariable): + from .builder import wrap_fx_proxy + + if name == "requires_grad": + # TODO(voz): Make it work properly + unimplemented( + "mutating requires_grad can introduce a new leaf from non-leaf or vice versa in " + "the middle of the graph, which aot_autograd does not currently know how to handle. " + ) + if name == "data": + # Remove the old reference in tracked fakes - if we don't do this + # new .data value size and shape differences will cause + # tracked fakes to produce incorrect guards. This is sound because the TensorVariable + # coming out of set_() below will be a new one, and get + # installed in tracked fakes. 
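+                    # Rough sketch of the rewrite performed below, for a
+                    # hypothetical `x.data = y` in user code:
+                    #   with no_grad(): x.set_(y); x._version -= 1
+                    # so autograd observes the same version counter it would
+                    # have seen for an eager `.data` assignment.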
+ to_remove = [] + for tf in tx.output.tracked_fakes: + if tf.source == obj.source: + to_remove.append(tf) + for tf in to_remove: + tx.output.tracked_fakes.remove(tf) + + # Step 1 - disable grads + with dynamo_disable_grad(tx), torch.no_grad(): + # Step 2 - call `set_` + out = wrap_fx_proxy( + tx, + tx.output.create_proxy( + "call_function", + torch.Tensor.set_, + *proxy_args_kwargs([obj, val], {}), + ), + ) + + # Step 3 - drop the version counter - this is a step required to get + # .data setting to play correctly with the autograd engine. + # Esentially, dynamo is trying to faithful preserve the (absurd) + # behavior of .data= from eager mode + def _lower_version_count_by_1(x): + version = x._version + if version > 0: + version = version - 1 + torch._C._autograd._unsafe_set_version_counter(x, version) + return x + + tx.output.create_proxy( + "call_function", + _lower_version_count_by_1, + (out.as_proxy(),), + {}, + ) + _lower_version_count_by_1(obj.as_proxy().node.meta["example_value"]) + # This handles options prop, guards and ends with a clone + # Step 4 - replace all reference to the current object with the new one + return out + + tx.output.side_effects.store_attr(obj, name, val) + return val + elif isinstance(obj, variables.UserDefinedObjectVariable): + unimplemented( + f"setattr(UserDefinedObjectVariable) {type(obj.value).__setattr__}" + ) + elif isinstance(obj, variables.NNModuleVariable): + if not tx.output.is_root_tracer(): + raise AttributeMutationError( + "Can't inplace modify module params/buffers inside HigherOrderOp" + ) + if name_var.is_python_constant() and isinstance( + val, variables.TensorVariable + ): + assigning_fake_val = get_fake_value(val.as_proxy().node, tx) + + try: + getattr_var = obj.var_getattr(tx, name_var.as_python_constant()) + except AttributeError: + getattr_var = None + + if isinstance(getattr_var, variables.TensorVariable): + # get_fake_val will get the same fake tensor + existing_fake_attr = get_fake_value(getattr_var.as_proxy().node, tx) + + # same tensor identiy, setattr is a no-op + mod_setattr = inspect.getattr_static(obj.module_type, "__setattr__") + if ( + existing_fake_attr is assigning_fake_val + and mod_setattr is torch.nn.Module.__setattr__ + ): + return getattr_var + + obj.convert_to_unspecialized(tx) + # FIXME (tmanlaibaatar) this is utter hack to unblock HuggingFace export + # Export generally doesn't want to allow mutations on objects directly, + # but we don't have good way to do this rn. For now, we make it an undefined + # behaviour and just set attributes directly on the PretrainedConfig object + # for now. 
+ elif isinstance(obj, variables.dicts.HFPretrainedConfigVariable) and tx.export: + if name_var.is_python_constant() and isinstance( + val, variables.ConstantVariable + ): + setattr( + obj.obj, name_var.as_python_constant(), val.as_python_constant() + ) + return ConstantVariable(None) + + def call_delattr(self, tx, obj: VariableTracker, name_var: VariableTracker): + return self.call_setattr(tx, obj, name_var, variables.DeletedVariable()) + + def call_type(self, tx, obj: VariableTracker): + from .builder import SourcelessBuilder, VariableBuilder + + try: + py_type = obj.python_type() + except NotImplementedError as error: + raise UserError( + UserErrorType.INVALID_INPUT, + str(error), + case_name="unknown_python_type", + ) from None + + if obj.source is None: + return SourcelessBuilder()(tx, py_type) + else: + return VariableBuilder(tx, TypeSource(obj.source))(py_type) + + def call_reversed(self, tx, obj: VariableTracker): + if obj.has_unpack_var_sequence(tx): + items = list(reversed(obj.unpack_var_sequence(tx))) + return variables.TupleVariable(items) + + def call_sorted(self, tx, obj: VariableTracker, **kwargs): + if ( + obj.has_unpack_var_sequence(tx) + and not isinstance(obj, variables.TensorVariable) + and all(x.is_python_constant() for x in obj.unpack_var_sequence(tx)) + ): + function = kwargs.pop("key", None) + reverse = kwargs.pop( + "reverse", ConstantVariable.create(False) + ).as_python_constant() + assert len(kwargs) == 0 + if function: + items = sorted( + obj.unpack_var_sequence(tx), + key=lambda x: function.call_function( + tx, [x], {} + ).as_python_constant(), + reverse=reverse, + ) + else: + items = sorted( + obj.unpack_var_sequence(tx), + key=lambda x: x.as_python_constant(), + reverse=reverse, + ) + return variables.ListVariable(items) + + def call_chain(self, tx, *args): + if all(obj.has_unpack_var_sequence(tx) for obj in args): + items = [] + for obj in args: + items.extend(obj.unpack_var_sequence(tx)) + return variables.TupleVariable(items) + + def call_islice(self, tx, iterable, *args): + if iterable.has_unpack_var_sequence(tx) and all( + x.is_python_constant() for x in args + ): + const_args = [x.as_python_constant() for x in args] + items = iterable.unpack_var_sequence(tx) + items = list(itertools.islice(items, *const_args)) + return variables.TupleVariable(items) + + # neg is a constant fold function, so we only get here if constant fold is not valid + def call_neg(self, tx, a): + if isinstance(a, SymNodeVariable): + return SymNodeVariable.create( + tx, + (operator.neg)(a.as_proxy()), + sym_num=None, + ) + # None no-ops this handler and lets the driving function proceed + return None + + def call_format(self, tx, _format_string, *args, **kwargs): + format_string = _format_string.as_python_constant() + return variables.StringFormatVariable.create(format_string, args, kwargs) + + def call_id(self, tx, *args): + if len(args) > 0 and isinstance(args[0], variables.NNModuleVariable): + nn_mod_variable = args[0] + mod = tx.output.get_submodule(nn_mod_variable.module_key) + return variables.ConstantVariable.create(id(mod)) + else: + unimplemented(f"call_id with args {args}") + + def call_deepcopy(self, tx, x): + unimplemented(f"copy.deepcopy {repr(x)}") + + def _comparison(self, tx, left, right): + """ + Used to implement comparison operators for different types. + For example, list1 < list2 is implemented differently from tensor1 < tensor2 + """ + from . 
import ( + BaseListVariable, + ConstantVariable, + NNModuleVariable, + TensorVariable, + UserDefinedObjectVariable, + UserFunctionVariable, + ) + from .lists import SizeVariable + from .tensor import ( + supported_const_comparison_ops, + supported_tensor_comparison_ops, + ) + + op = self.fn + + def _unimplemented(): + unimplemented(f"comparison {typestr(left)} {op} {typestr(right)}") + + if ( + all( + isinstance(x, (NNModuleVariable, ConstantVariable)) + for x in [left, right] + ) + and op in supported_const_comparison_ops.values() + ): + left = ( + tx.output.get_submodule(left.module_key) + if isinstance(left, NNModuleVariable) + else left.as_python_constant() + ) + right = ( + tx.output.get_submodule(right.module_key) + if isinstance(right, NNModuleVariable) + else right.as_python_constant() + ) + return ConstantVariable.create(op(left, right)) + + if isinstance(left, UserFunctionVariable): + if op not in supported_const_comparison_ops.values(): + _unimplemented() + if not isinstance(right, UserFunctionVariable): + _unimplemented() + return ConstantVariable.create(op(left.fn, right.fn)) + + # Note, we have a rare BaseListVariable subtype mismatch with valid comparison + # x = torch.randn([3, 3]) + # x.size() == (3, 3) # True + # (3, 3) == x.size() # True + if isinstance(left, (SizeVariable, TupleVariable)) and isinstance( + right, (TupleVariable, SizeVariable) + ): + return BaseListVariable.list_compare(tx, op, left, right) + + if isinstance(left, BaseListVariable): + if not type(left) == type(right): # Mismatch in BaseListVariable subclasses + _unimplemented() + return BaseListVariable.list_compare(tx, op, left, right) + + # If they implement set semantics (e.g. SetVariable or DictKeys) + if hasattr(left, "set_items") and hasattr(right, "set_items"): + return ConstantVariable.create(op(left.set_items, right.set_items)) + + if isinstance(left, TensorVariable) or isinstance(right, TensorVariable): + from .builder import wrap_fx_proxy_cls + + if op in [operator.is_, operator.is_not]: + is_result = ( + isinstance(left, TensorVariable) + and isinstance(right, TensorVariable) + and id(extract_fake_example_value(left.as_proxy().node)) + == id(extract_fake_example_value(right.as_proxy().node)) + ) + if op is operator.is_: + return ConstantVariable.create(is_result) + else: + return ConstantVariable.create(not is_result) + + if op not in supported_tensor_comparison_ops.values(): + _unimplemented() + if ( + isinstance(left, TensorVariable) + and isinstance(right, TensorVariable) + and (left.size and right.size) is not None + and left.size != right.size + ): + try: + torch.broadcast_shapes(left.size, right.size) + except RuntimeError: + # not broadcastable, can't be compared + _unimplemented() + tensor_cls = left if isinstance(left, TensorVariable) else right + proxy = tx.output.create_proxy( + "call_function", op, (left.as_proxy(), right.as_proxy()), {} + ) + return wrap_fx_proxy_cls( + type(tensor_cls), # handle Ndarrays and Tensors + tx, + proxy, + ) + + if isinstance(left, SymNodeVariable) or isinstance(right, SymNodeVariable): + if op not in supported_tensor_comparison_ops.values(): + _unimplemented() + + proxy = tx.output.create_proxy( + "call_function", op, (left.as_proxy(), right.as_proxy()), {} + ) + return SymNodeVariable.create( + tx, + proxy, + sym_num=None, + ) + + if isinstance(left, UserDefinedObjectVariable) and isinstance( + right, UserDefinedObjectVariable + ): + return ConstantVariable.create(op(left.value, right.value)) + + if isinstance(left, (StreamVariable, EventVariable)) 
or isinstance( + right, (StreamVariable, EventVariable) + ): + if type(left) == type(right) and op is operator.eq: + return ConstantVariable(op(left.value, right.value)) + + if isinstance(right, ConstantVariable) or isinstance( + left, ConstantVariable + ): + return ConstantVariable(op(left.value, right.value)) + + if op.__name__.startswith("is_"): + # If the two objects are of different type, we can safely return False and True for `is` and `is not`, respectively + if type(left) is not type(right): + return ConstantVariable.create(op.__name__ != "is_") + + if isinstance(left, BuiltinVariable) and isinstance(right, BuiltinVariable): + return ConstantVariable.create(op(left.fn, right.fn)) + + _unimplemented() + + def call_and_(self, tx, a, b): + # Rely on constant_handler + if isinstance(a, ConstantVariable) and isinstance(b, ConstantVariable): + return None + if isinstance(a, (SymNodeVariable, ConstantVariable)) and isinstance( + b, (SymNodeVariable, ConstantVariable) + ): + return SymNodeVariable.create( + tx, + tx.output.create_proxy( + "call_function", operator.and_, *proxy_args_kwargs([a, b], {}) + ), + sym_num=None, + ) + if hasattr(a, "set_items") and hasattr(b, "set_items"): + return SetVariable(list(a.set_items & b.set_items)) + # None no-ops this handler and lets the driving function proceed + + def call_or_(self, tx, a, b): + # Rely on constant_handler + if isinstance(a, ConstantVariable) and isinstance(b, ConstantVariable): + return None + if isinstance(a, (SymNodeVariable, ConstantVariable)) and isinstance( + b, (SymNodeVariable, ConstantVariable) + ): + return SymNodeVariable.create( + tx, + tx.output.create_proxy( + "call_function", operator.or_, *proxy_args_kwargs([a, b], {}) + ), + sym_num=None, + ) + if hasattr(a, "set_items") and hasattr(b, "set_items"): + return SetVariable(list(a.set_items | b.set_items)) + # None no-ops this handler and lets the driving function proceed + return None + + def call_not_(self, tx, a): + if isinstance(a, SymNodeVariable): + return SymNodeVariable.create( + tx, + tx.output.create_proxy( + "call_function", operator.not_, *proxy_args_kwargs([a], {}) + ), + sym_num=None, + ) + + # Unwrap the underlying ConstDictVariable + if isinstance(a, DictView): + a = a.dv_dict + if isinstance(a, (ListVariable, ConstDictVariable)): + return ConstantVariable.create(len(a.items) == 0) + + return None + + call_eq = _comparison + call_gt = _comparison + call_lt = _comparison + call_ge = _comparison + call_le = _comparison + call_ne = _comparison + call_is_ = _comparison + call_is_not = _comparison + + call_all = _polyfill_call_impl("all") + call_any = _polyfill_call_impl("any") + + +@contextlib.contextmanager +def dynamo_disable_grad(tx): + from . import GradModeVariable + + org_value = torch.is_grad_enabled() + gmv = GradModeVariable.create(tx, False) + try: + gmv.enter(tx) + yield + finally: + gmv.exit(tx) diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/constant.py b/MLPY/Lib/site-packages/torch/_dynamo/variables/constant.py new file mode 100644 index 0000000000000000000000000000000000000000..fe122599b3034a695c16dc7411a752b83bc81f68 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/variables/constant.py @@ -0,0 +1,213 @@ +# mypy: ignore-errors + +import operator +from typing import Dict, List + +import torch +from torch._dynamo.source import GetItemSource + +from .. 
import variables +from ..exc import unimplemented, UserError, UserErrorType +from ..guards import GuardBuilder, install_guard +from ..utils import common_constant_types, istype, np +from .base import typestr, VariableTracker + +_type_to_assert_reason = { + # NB - We CAN have ConstantVariable.create(set) because of how sets interact with guards. + # A locally created set should always become a SetVariable, as the items in the set will already either be sourced + # from somewhere else, or unsourced. An input set would imply sources derived from set contents. For example, an + # input list's contents will have a source like some_list[0], some_list[1][1], etc. For a set, arbitrary access is + # not possible. This is a solvable problem, but one we have not taken on yet. As such, input sets are not allowed to + # become SetVariables. The solution here is to create a ConstantSetVariable that is more like a ConstantVariable. + # As this does not exist, we cannot add sets to this invariant. + list: "List types must use ListVariable.", + dict: "Dict types must use ConstDictVariable.", + torch.Tensor: "Tensor types must use TensorVariable.", + torch.SymInt: "SymInts must use SymNodeVariable. " + "If the underlying value is static, we will create a ConstantVariable and specialize.", + torch.SymFloat: "SymInts must use SymNodeVariable", +} + + +class ConstantVariable(VariableTracker): + @staticmethod + def create(value, **kwargs) -> VariableTracker: + source = kwargs.get("source", None) + is_literal = ConstantVariable.is_literal(value) + if not is_literal: + for disallowed_type, reason in _type_to_assert_reason.items(): + assert not isinstance(value, disallowed_type), reason + + # Routing for list and tuple literals. + if is_literal and isinstance(value, (list, tuple)): + items = [] + for i, x in enumerate(value): + item_source = GetItemSource(source, i) if source else None + if item_source: + install_guard(item_source.make_guard(GuardBuilder.CONSTANT_MATCH)) + items.append( + ConstantVariable.create( + x, + source=item_source, + ) + ) + return variables.BaseListVariable.cls_for(type(value))(items, **kwargs) + + return ConstantVariable(value, **kwargs) + + def __init__(self, value, **kwargs): + super().__init__(**kwargs) + if not ConstantVariable.is_literal(value): + for disallowed_type, reason in _type_to_assert_reason.items(): + assert not isinstance(value, disallowed_type), reason + + assert not isinstance( + value, (list, tuple) + ), "ConstantVariable(list) is banned - please create a ListVariable(items)" + if np is not None and isinstance(value, np.number): + self.value = value.item() + else: + self.value = value + + def as_proxy(self): + return self.value + + def __str__(self): + return f"ConstantVariable({type(self.value).__name__}: {repr(self.value)})" + + def python_type(self): + return type(self.value) + + def as_python_constant(self): + return self.value + + @property + def items(self): + """ + Need this when adding a BaseListVariable and a ConstantVariable together. + Happens in detectron2. 
+ """ + return self.unpack_var_sequence(tx=None) + + def getitem_const(self, arg: VariableTracker): + return ConstantVariable.create( + self.value[arg.as_python_constant()], + ) + + @staticmethod + def is_literal(obj): + if type(obj) in common_constant_types: + return True + # The structure within is_literal get routed to variables.BaseListVariable + if type(obj) in (list, tuple, set, frozenset, torch.Size): + return all(ConstantVariable.is_literal(x) for x in obj) + return False + + def unpack_var_sequence(self, tx): + try: + return [ConstantVariable.create(x) for x in self.as_python_constant()] + except TypeError as e: + raise NotImplementedError from e + + def const_getattr(self, tx, name): + if isinstance(self.value, type): + raise UserError( + UserErrorType.ANTI_PATTERN, + "Can't access members of type(obj) for a generated custom object. " + "Please use __class__ instead", + case_name="type_reflection_method", + ) + member = getattr(self.value, name) + if callable(member): + raise NotImplementedError() + return member + + def call_method( + self, + tx, + name, + args: "List[VariableTracker]", + kwargs: "Dict[str, VariableTracker]", + ) -> "VariableTracker": + from .tensor import SymNodeVariable + + if name == "format" and istype(self.value, str): + return variables.BuiltinVariable(str.format).call_function( + tx, [self, *args], kwargs + ) + + if any(isinstance(x, SymNodeVariable) for x in args): + # Promote to SymNodeVariable for operations involving dynamic shapes. + return variables.SymNodeVariable(self.as_proxy(), self.value).call_method( + tx, name, args, kwargs + ) + + try: + const_args = [a.as_python_constant() for a in args] + const_kwargs = {k: v.as_python_constant() for k, v in kwargs.items()} + except NotImplementedError: + return super().call_method(tx, name, args, kwargs) + + def has_arith_binop(num_ty): + return ( + isinstance(self.value, num_ty) + and hasattr(operator, name) + and len(args) == 1 + and args[0].is_python_constant() + ) + + if isinstance(self.value, str) and name in str.__dict__.keys(): + method = getattr(self.value, name) + return ConstantVariable.create(method(*const_args, **const_kwargs)) + elif has_arith_binop(int) or has_arith_binop(float): + op = getattr(operator, name) + add_target = const_args[0] + if isinstance(add_target, (torch.SymInt, torch.SymFloat)): + from .tensor import SymNodeVariable + + # Addition between a non sym and sym makes a sym + # sym_num = tx.output.register_attr_or_module( + # add_target, f"sym_shape_{add_target}", source=None + # ) + proxy = tx.output.create_proxy( + "call_function", op, (self.value, add_target), {} + ) + return SymNodeVariable.create(tx, proxy, add_target) + return ConstantVariable.create(op(self.value, add_target)) + elif name == "__len__" and not (args or kwargs): + return ConstantVariable.create(len(self.value)) + elif name == "__contains__" and len(args) == 1 and args[0].is_python_constant(): + assert not kwargs + search = args[0].as_python_constant() + result = search in self.value + return ConstantVariable.create(result) + + unimplemented(f"const method call {typestr(self.value)}.{name}") + + def call_hasattr(self, tx, name: str) -> "VariableTracker": + result = hasattr(self.value, name) + return variables.ConstantVariable.create(result) + + +class EnumVariable(VariableTracker): + def __init__(self, value, **kwargs): + super().__init__(**kwargs) + self.value = value + + def as_proxy(self): + return self.value + + def __str__(self): + return f"EnumVariable({type(self.value)})" + + def python_type(self): + 
return type(self.value) + + def as_python_constant(self): + return self.value + + def const_getattr(self, tx, name): + member = getattr(self.value, name) + if callable(member): + raise NotImplementedError() + return member diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/ctx_manager.py b/MLPY/Lib/site-packages/torch/_dynamo/variables/ctx_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..d33a927f9b075702e4ea3331da0667a73f904373 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/variables/ctx_manager.py @@ -0,0 +1,825 @@ +# mypy: ignore-errors + +import dataclasses +import inspect +from typing import Callable, Dict, List, Optional + +import torch._C +from torch._guards import Guard + +from .. import variables +from ..bytecode_transformation import create_call_function, create_instruction +from ..device_interface import get_interface_for_device +from ..exc import unimplemented, Unsupported +from ..guards import GuardBuilder, install_guard +from ..source import AttrSource, GlobalStateSource +from .base import VariableTracker +from .functions import ( + NestedUserFunctionVariable, + UserFunctionVariable, + UserMethodVariable, + WrappedUserFunctionVariable, + WrappedUserMethodVariable, +) + + +@dataclasses.dataclass +class ContextMangerState: + """ + Mutating `self` in VariableTracker is not allowed because we copy + them. This is a mutable container pointed to by context managers + that won't get copied, so it is safe to mutate. + """ + + cleanup_fn: Optional[Callable] = None + proxy: Optional[torch.fx.Proxy] = None + + def cleanup(self): + if self.cleanup_fn is not None: + self.cleanup_fn() + self.cleanup_fn = None + + def cleanup_assert(self): + assert self.cleanup_fn, "multiple exits?" + self.cleanup() + + +class ContextWrappingVariable(VariableTracker): + _nonvar_fields = { + "cm_obj", + "target_values", + "initial_values", + "state", + *VariableTracker._nonvar_fields, + } + + def __init__(self, target_values, initial_values=None, *, state=None, **kwargs): + super().__init__(**kwargs) + self.target_values = target_values + self.initial_values = initial_values + self.state = ContextMangerState() if state is None else state + + def enter(self, tx): + self._call_func(tx, self.target_values) + self.set_cleanup_hook(tx) + return variables.ConstantVariable.create(None) + + def set_cleanup_hook(self, tx, fn=None): + if fn is None: + + def fn(): + self._call_func(tx, self.initial_values) + + self.state.cleanup_fn = fn + tx.output.add_cleanup_hook(self.state.cleanup) + + def exit(self, tx, *args): + self.state.cleanup_assert() + return variables.ConstantVariable.create(None) + + def reconstruct(self, codegen): + codegen( + AttrSource(codegen.tx.import_source(self.module_name()), self.fn_name()) + ) + + def module_name(self): + raise NotImplementedError("module_name called on base") + + def fn_name(self): + raise NotImplementedError("fn_name called on base") + + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + assert len(args) == 1 + if isinstance(args[0], NestedUserFunctionVariable): + args[0] = UserFunctionVariable(args[0].get_function()) + assert isinstance(args[0], (UserMethodVariable, UserFunctionVariable)) + + if isinstance(args[0], UserMethodVariable): + return WrappedUserMethodVariable(args[0], self) + + if isinstance(args[0], UserFunctionVariable): + return WrappedUserFunctionVariable(args[0], self) + + +class GenericContextWrappingVariable(ContextWrappingVariable): + 
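+    # Wraps an arbitrary user context-manager object (cm_obj); __enter__ and
+    # __exit__ are inlined as ordinary user method calls rather than getting
+    # special graph-level handling.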
def __init__(self, target_values, initial_values=None, *, cm_obj=None, **kwargs): + assert cm_obj is not None + super().__init__( + target_values=target_values, initial_values=initial_values, **kwargs + ) + self.cm_obj = cm_obj + + def enter(self, tx): + source = None if self.source is None else AttrSource(self.source, "__enter__") + try: + return variables.UserMethodVariable( + self.cm_obj.__enter__.__func__, + variables.UserDefinedObjectVariable(self.cm_obj), + source=source, + ).call_function(tx, [], {}) + except Unsupported as e: + raise unimplemented( + f"Unsupported context manager {self.cm_obj}'s __enter__ function" + ) from e + + def exit(self, tx, *args): + source = None if self.source is None else AttrSource(self.source, "__exit__") + try: + x = variables.UserMethodVariable( + self.cm_obj.__exit__.__func__, + variables.UserDefinedObjectVariable(self.cm_obj), + source=source, + ).call_function( + tx, + [ + variables.ConstantVariable.create(None), + variables.ConstantVariable.create(None), + variables.ConstantVariable.create(None), + ], + {}, + ) + except Unsupported as e: + raise unimplemented( + f"Unsupported context manager {self.cm_obj}'s __exit__ function" + ) from e + + tx.generic_context_manager_depth -= 1 + return x + + +class GradInplaceRequiresGradCtxManagerVariable(ContextWrappingVariable): + """represents torch grad requries grad""" + + @staticmethod + def create(tx, target_values, **kwargs): + return GradInplaceRequiresGradCtxManagerVariable( + target_values=target_values, + initial_values=None, + **kwargs, + ) + + def enter(self, tx): + [enabled] = self.target_values + self.prev_state = torch._C._functorch.get_inplace_requires_grad_allowed() + torch._C._functorch.set_inplace_requires_grad_allowed(enabled) + self.set_cleanup_hook( + tx, + lambda: torch._C._functorch.set_inplace_requires_grad_allowed( + self.prev_state + ), + ) + self.state.proxy = tx.output.create_node( + "call_function", + torch._C._functorch.set_inplace_requires_grad_allowed, + (enabled,), + {}, + ) + return variables.ConstantVariable.create(None) + + def exit(self, tx, *args): + self.state.cleanup() + tx.output.create_node( + "call_function", + torch._C._functorch.set_inplace_requires_grad_allowed, + (self.prev_state,), + {}, + ) + return variables.ConstantVariable.create(None) + + +class GradIncrementNestingCtxManagerVariable(ContextWrappingVariable): + """represents torch.func.grad increment/decrement nesting""" + + # A guard is needed as the grad level is baked into the torch FX graph + # This is fine if grad is only called from within the function + # being compiled. But the FX graph may be invalid in the case of a grad + # call from eager that calls the compiled function, as the grad levels + # may be different. 
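+    # (Roughly: FUNCTORCH_STACK_MATCH re-checks the functorch interpreter
+    # stack at call time and forces a recompile if it no longer matches what
+    # was seen during tracing.)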
+ _guards_singleton = Guard(GlobalStateSource(), GuardBuilder.FUNCTORCH_STACK_MATCH) + + @staticmethod + def create(tx, **kwargs): + var = GradIncrementNestingCtxManagerVariable( + target_values=None, + initial_values=None, + **kwargs, + ) + return var + + def enter(self, tx): + install_guard(self._guards_singleton) + grad_level = torch._C._functorch._grad_increment_nesting() + self.set_cleanup_hook(tx, lambda: torch._C._functorch._grad_decrement_nesting()) + self.state.proxy = tx.output.create_node( + "call_function", + torch._C._functorch._grad_increment_nesting, + (), + {}, + ) + return variables.ConstantVariable.create(grad_level) + + def exit(self, tx, *args): + self.state.cleanup() + tx.output.create_node( + "call_function", torch._C._functorch._grad_decrement_nesting, (), {} + ) + return variables.ConstantVariable.create(None) + + +class VmapIncrementNestingCtxManagerVariable(ContextWrappingVariable): + """represents torch VMap increment/decrement nesting""" + + # A guard is needed as the vmap level is baked into the torch FX graph + # generated. This is fine if vmap is only called from within the function + # being compiled. But the FX graph may be invalid in the case of a vmap + # call from eager that calls the compiled function, as the vmap levels + # may be different. + _guards_singleton = Guard(GlobalStateSource(), GuardBuilder.FUNCTORCH_STACK_MATCH) + + @staticmethod + def create(tx, target_values, **kwargs): + var = VmapIncrementNestingCtxManagerVariable( + target_values=target_values, + initial_values=None, + **kwargs, + ) + return var + + def enter(self, tx): + install_guard(self._guards_singleton) + batch_size, randomness = self.target_values + vmap_level = torch._C._functorch._vmap_increment_nesting(batch_size, randomness) + self.set_cleanup_hook(tx, lambda: torch._C._functorch._vmap_decrement_nesting()) + self.state.proxy = tx.output.create_node( + "call_function", + torch._C._functorch._vmap_increment_nesting, + (batch_size, randomness), + {}, + ) + return variables.ConstantVariable.create(vmap_level) + + def exit(self, tx, *args): + self.state.cleanup() + tx.output.create_node( + "call_function", torch._C._functorch._vmap_decrement_nesting, (), {} + ) + return variables.ConstantVariable.create(None) + + +class GradModeVariable(ContextWrappingVariable): + """represents torch.{no_grad,enable_grad,set_grad_mode}()""" + + _guards_singleton = Guard(GlobalStateSource(), GuardBuilder.GRAD_MODE) + + @staticmethod + def create(tx, target_value, initialized=False, **kwargs): + var = GradModeVariable( + target_values=[target_value], + initial_values=[torch.is_grad_enabled()], + **kwargs, + ) + if initialized: + var._call_func(tx, var.target_values) + return var + + def __init__(self, target_values, initial_values=None, initialized=True, **kwargs): + super().__init__( + target_values=target_values, initial_values=initial_values, **kwargs + ) + install_guard(self._guards_singleton) + + def enter(self, tx): + self._call_func(tx, self.target_values) + return variables.ConstantVariable.create(None) + + def exit(self, tx, *args): + self._call_func(tx, self.initial_values) + return variables.ConstantVariable.create(None) + + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ): + self._call_func(tx, self.initial_values) # undo eager initialization + return super().call_function(tx, args, kwargs) + + def _call_func(self, tx, values): + assert len(values) == 1 + value = values[0] + # Coalesce grad mode mutations + if 
torch.is_grad_enabled() != value: + tx.output.create_node( + "call_function", torch._C._set_grad_enabled, (value,), {} + ) + torch._C._set_grad_enabled(value) + + def module_name(self): + return "torch" + + def fn_name(self): + return "set_grad_enabled" + + +class InferenceModeVariable(ContextWrappingVariable): + @staticmethod + def create(tx, target_value, **kwargs): + var = InferenceModeVariable( + [target_value], initial_values=torch.is_inference_mode_enabled(), **kwargs + ) + return var + + def __init__( + self, + target_values, + initial_values=None, + **kwargs, + ): + if initial_values is None: + # This must be called here since function defaults are evaluated at import time + initial_values = torch.is_inference_mode_enabled() + super().__init__( + target_values=target_values, initial_values=initial_values, **kwargs + ) + self.target_values = target_values + + def exit(self, tx, *args): + self.state.cleanup_assert() + tx.output.create_node( + "call_function", + torch.autograd.grad_mode._exit_inference_mode, + (self.state.proxy,), + {}, + ) + + def enter(self, tx): + ctx = torch.autograd.grad_mode._enter_inference_mode(*self.target_values) + self.set_cleanup_hook( + tx, lambda: torch.autograd.grad_mode._exit_inference_mode(ctx) + ) + self.state.proxy = tx.output.create_node( + "call_function", + torch.autograd.grad_mode._enter_inference_mode, + (*self.target_values,), + {}, + ) + + def module_name(self): + return "torch" + + def fn_name(self): + return "inference_mode" + + +class TorchFunctionDisableVariable(ContextWrappingVariable): + """represents whether torch function overrides are enabled or not""" + + _guards_singleton = Guard(GlobalStateSource(), GuardBuilder.TORCH_FUNCTION_STATE) + + @staticmethod + def create(tx, **kwargs): + var = TorchFunctionDisableVariable( + target_values=[False], + initial_values=[tx.output.torch_function_enabled], + **kwargs, + ) + # mlazos: I think this is here to make sure we don't reinvoke on clone() + var._call_func(tx, [False]) + var.set_cleanup_hook(tx) + return var + + def __init__(self, target_values, initial_values=None, **kwargs): + super().__init__( + target_values=target_values, initial_values=initial_values, **kwargs + ) + install_guard(self._guards_singleton) + + def enter(self, tx): + return variables.ConstantVariable.create(None) + + def _call_func(self, tx, values): + assert len(values) == 1 + tx.output.set_torch_function_state(values[0]) + + +class DeterministicAlgorithmsVariable(ContextWrappingVariable): + """represents torch.{are_deterministic_algorithms_enabled,use_deterministic_algorithms}()""" + + _guards_singleton = Guard( + GlobalStateSource(), GuardBuilder.DETERMINISTIC_ALGORITHMS + ) + + @staticmethod + def create(tx, target_value, **kwargs): + var = DeterministicAlgorithmsVariable( + target_values=[target_value], + initial_values=[torch.are_deterministic_algorithms_enabled()], + **kwargs, + ) + var._call_func(tx, [target_value]) + var.set_cleanup_hook(tx) + return var + + def __init__(self, target_values, initial_values=None, **kwargs): + super().__init__( + target_values=target_values, initial_values=initial_values, **kwargs + ) + install_guard(self._guards_singleton) + + def enter(self, tx): + return variables.ConstantVariable.create(None) + + def _call_func(self, tx, values): + assert len(values) == 1 + value = values[0] + tx.output.create_node( + "call_function", torch._C._set_deterministic_algorithms, (value,), {} + ), + torch._C._set_deterministic_algorithms(value) + + def module_name(self): + return "torch" + + def 
fn_name(self): + return "use_deterministic_algorithms" + + +class DisabledSavedTensorsHooksVariable(ContextWrappingVariable): + """represents torch.autograd.graph.disable_saved_tensors_hook.""" + + @staticmethod + def create(tx, target_value, **kwargs): + var = DisabledSavedTensorsHooksVariable( + target_values=[target_value], + initial_values=[ + torch._C._autograd._saved_tensors_hooks_get_disabled_error_message() + ], + **kwargs, + ) + var._call_func(tx, [target_value]) + var.set_cleanup_hook(tx) + return var + + def __init__(self, target_values, initial_values=None, **kwargs): + super().__init__( + target_values=target_values, initial_values=initial_values, **kwargs + ) + + def enter(self, tx): + return variables.ConstantVariable.create(None) + + def _call_func(self, tx, values): + assert len(values) == 1 + value = values[0] + if value is not None: + # Disable `saved_tensors_hooks` with message (`value`) + # OR + # we are exiting this context and restoring the previous message. + tx.output.create_node( + "call_function", + torch._C._autograd._saved_tensors_hooks_disable, + (value,), + {}, + ) + torch._C._autograd._saved_tensors_hooks_disable(value) + else: + # We are exiting this context and if prev_message was None, we re-enable `saved_tensors_hooks`. + tx.output.create_node( + "call_function", torch._C._autograd._saved_tensors_hooks_enable, (), {} + ) + torch._C._autograd._saved_tensors_hooks_enable() + + def module_name(self): + return "torch.autograd.graph" + + def fn_name(self): + return "disable_saved_tensors_hooks" + + +class AutocastModeVariable(ContextWrappingVariable): + @staticmethod + def create(func, args, kwargs): + assert func in [ + torch.amp.autocast_mode.autocast, + torch.cuda.amp.autocast, + torch.cpu.amp.autocast, + ] + # device_type : str, + # dtype : Optional[_dtype] = None, + # enabled : bool = True, + # cache_enabled : Optional[bool] = None):cache_enabled + bound_args = inspect.signature(func).bind(*args, **kwargs) + bound_args.apply_defaults() + target_values = [] + kwargs.clear() + + for key in ["device_type", "dtype", "enabled", "cache_enabled"]: + if key == "device_type" and func in [ + torch.cuda.amp.autocast, + torch.cpu.amp.autocast, + ]: + arg = "cuda" if func is torch.cuda.amp.autocast else "cpu" + else: + arg = bound_args.arguments[key] + if isinstance(arg, VariableTracker): + target_values.append(arg.as_python_constant()) + else: + target_values.append(arg) + + var = AutocastModeVariable(target_values, initial_values=None, **kwargs) + return var + + def __init__(self, target_values, initial_values=None, **kwargs): + super().__init__( + target_values=target_values, initial_values=initial_values, **kwargs + ) + self.target_values = target_values + + def exit(self, tx, *args): + self.state.cleanup_assert() + tx.output.create_node( + "call_function", torch.amp._exit_autocast, (self.state.proxy,), {} + ) + + def enter(self, tx): + ctx = torch.amp._enter_autocast(*self.target_values) + self.set_cleanup_hook(tx, lambda: torch.amp._exit_autocast(ctx)) + self.state.proxy = tx.output.create_node( + "call_function", torch.amp._enter_autocast, (*self.target_values,), {} + ) + + def module_name(self): + return "torch.amp.autocast_mode" + + def fn_name(self): + return "autocast" + + +class NullContextVariable(ContextWrappingVariable): + """ + This class represents Python contextlib.nullcontext. + It's used as a placeholder for other context managers that Dynamo doesn't + support yet, e.g, torch.autograd.profiler.record_function. 
+ """ + + def __init__(self, target_values=None, **kwargs): + super().__init__(target_values=target_values, **kwargs) + + def enter(self, tx): + return variables.ConstantVariable.create(None) + + def exit(self, tx, *args): + return variables.ConstantVariable.create(None) + + def module_name(self): + return "contextlib" + + def fn_name(self): + return "nullcontext" + + +class StreamContextVariable(ContextWrappingVariable): + @staticmethod + def create(tx, target_value, **kwargs): + from .builder import wrap_fx_proxy_cls + + current_stream_method = get_interface_for_device( + target_value.device + ).current_stream + current_stream = wrap_fx_proxy_cls( + StreamVariable, + tx, + tx.output.create_proxy( + "call_function", + current_stream_method, + (None,), + {}, + ), + ) + return StreamContextVariable( + target_values=[target_value], + initial_values=[current_stream], + device=target_value.device, + **kwargs, + ) + + def __init__(self, target_values, device, initial_values=None, **kwargs): + super().__init__( + target_values=target_values, initial_values=initial_values, **kwargs + ) + self.device = device + self.set_stream = get_interface_for_device(self.device).set_stream + self.set_stream_id = get_interface_for_device(self.device)._set_stream_by_id + + def enter(self, tx): + # stream generated inside the traced function + if self.target_values[0].as_proxy() is not None: + tx.output.create_proxy( + "call_function", + self.set_stream, + (self.target_values[0].as_proxy(),), + {}, + ) + # stream passed from outside the traced function + else: + stream = self.target_values[0].value + tx.output.create_proxy( + "call_function", + self.set_stream_id, + (stream.stream_id, stream.device_index, stream.device_type), + {}, + ) + self.set_stream(self.target_values[0].value) + self.set_cleanup_hook(tx, lambda: self.set_stream(self.initial_values[0].value)) + + def exit(self, tx, *args): + tx.output.create_proxy( + "call_function", + self.set_stream, + (self.initial_values[0].as_proxy(),), + {}, + ) + self.state.cleanup_assert() + + +class PreserveVersionContextVariable(ContextWrappingVariable): + """ + Wraps torch.autograd._unsafe_preserve_version_counter + """ + + @staticmethod + def constructor(tx): + return variables.LambdaVariable( + lambda tensor: PreserveVersionContextVariable( + tensor, + tensor.var_getattr(tx, "_version"), + ) + ) + + def __init__(self, tensor, prev_version, **kwargs): + kwargs.setdefault("target_values", None) + super().__init__(**kwargs) + self.tensor = tensor + self.prev_version = prev_version + + def enter(self, tx): + pass + + def exit(self, tx, *args): + from ..tensor_version_op import _unsafe_set_version_counter + + return variables.TorchInGraphFunctionVariable( + _unsafe_set_version_counter + ).call_function(tx, [self.tensor, self.prev_version], {}) + + def reconstruct(self, codegen): + unimplemented( + "torch.autograd._unsafe_preserve_version_counter with graph break" + ) + + +class StreamVariable(VariableTracker): + def __init__(self, proxy, value, device, **kwargs): + if proxy is not None and "example_value" in proxy.node.meta: + assert proxy.node.meta["example_value"] == value + assert ( + value.device.type == device.type + ), "stream value is not equal to the passed device" + super().__init__(**kwargs) + self.proxy = proxy + self.value = value + self.device = device + + def call_method( + self, + tx, + name, + args: "List[VariableTracker]", + kwargs: "Dict[str, VariableTracker]", + ) -> "VariableTracker": + assert hasattr(self.value, name), f"no stream method found 
named {name}" + assert name in [ + "wait_stream", + "synchronize", + "query", + "record_event", + "wait_event", + ], f" unsupported stream method {name}" + + from ..utils import proxy_args_kwargs + from .builder import wrap_fx_proxy_cls + + if name in ("wait_stream", "synchronize", "wait_event"): + tx.output.create_proxy( + "call_method", name, *proxy_args_kwargs([self] + args, kwargs) + ) + return variables.ConstantVariable(None) + elif name == "query": + return wrap_fx_proxy_cls( + target_cls=variables.ConstantVariable, + tx=tx, + proxy=tx.output.create_proxy( + "call_method", name, *proxy_args_kwargs([self] + args, kwargs) + ), + ) + elif name == "record_event": + return wrap_fx_proxy_cls( + target_cls=EventVariable, + tx=tx, + proxy=tx.output.create_proxy( + "call_method", name, *proxy_args_kwargs([self] + args, kwargs) + ), + ) + else: + unimplemented(self.device + " stream method " + name + " unsupported") + + def as_proxy(self): + return self.proxy + + def reconstruct(self, codegen): + # If we got here, this stream is fully subsumed by the graph - this means it is + # not an input or global + assert not self.source + # Since we just proved that - for other such structures, like lists and dicts, reconstruction + # is fine and sound according to dynamo principles of treating collectives. However, + # streams are special in that we want to preserve the identity of the stream as the same as in the graph + # Normally, we would do this via codegen for the proxy mapping to an output - we cannot do this yet, as we do not + # yet have a plan for how we want to handle the case where the stream is used as an input or an output. Pending + # design, to unblock current work, we lift the stream into a global and then codegen bytecode to load it from there. + prefix = f"_stream_{self.device}" + name = codegen.tx.output.install_global_by_id(prefix, self.value) + codegen.append_output( + codegen.create_load_global(name, push_null=False, add=True) + ) + + +class EventVariable(VariableTracker): + def __init__(self, proxy, value, **kwargs): + if proxy is not None and "example_value" in proxy.node.meta: + assert proxy.node.meta["example_value"] == value + super().__init__(**kwargs) + self.proxy = proxy + self.value = value + + def call_method( + self, + tx, + name, + args: "List[VariableTracker]", + kwargs: "Dict[str, VariableTracker]", + ) -> "VariableTracker": + from ..utils import proxy_args_kwargs + from .builder import wrap_fx_proxy_cls + + if name in ("wait", "record", "synchronize"): + tx.output.create_proxy( + "call_method", name, *proxy_args_kwargs([self] + args, kwargs) + ) + return variables.ConstantVariable(None) + elif name == "query": + return wrap_fx_proxy_cls( + target_cls=variables.ConstantVariable, + tx=tx, + proxy=tx.output.create_proxy( + "call_method", name, *proxy_args_kwargs([self] + args, kwargs) + ), + ) + else: + unimplemented(f"event method {name} unsupported") + + def as_proxy(self): + return self.proxy + + +class WithExitFunctionVariable(VariableTracker): + def __init__(self, ctx: ContextWrappingVariable, target, **kwargs): + super().__init__(**kwargs) + assert isinstance(ctx, ContextWrappingVariable) + self.ctx = ctx + self.target = target + + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + assert not kwargs + return self.ctx.exit(tx, *args) + + def reconstruct(self, codegen): + # Note here we reconstruct the context manager rather than the + # exit function. 
The handler generated by BlockStackEntry + # will re-enter the context in the resume function. + codegen( + AttrSource( + codegen.tx.import_source(self.ctx.module_name()), self.ctx.fn_name() + ) + ) + + if codegen.tx.output.partial_convert: + codegen.extend_output( + [codegen.create_load_const(val) for val in self.ctx.target_values] + ) + codegen.extend_output( + create_call_function(len(self.ctx.target_values), True) + ) + codegen.append_output(create_instruction("SETUP_WITH", target=self.target)) + codegen.append_output(create_instruction("POP_TOP")) diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/dicts.py b/MLPY/Lib/site-packages/torch/_dynamo/variables/dicts.py new file mode 100644 index 0000000000000000000000000000000000000000..10bd5c9ad7a6ee2c64fe3f229f5a9166ba7eec5d --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/variables/dicts.py @@ -0,0 +1,897 @@ +# mypy: ignore-errors + +import collections +import dataclasses +import functools +import inspect +import sys +from typing import Dict, List, Optional + +from torch._subclasses.fake_tensor import is_fake + +from .. import variables +from ..bytecode_transformation import ( + create_call_function, + create_call_method, + create_instruction, +) +from ..eval_frame import skip_code + +from ..exc import unimplemented +from ..guards import GuardBuilder, install_guard +from ..source import AttrSource, GetItemSource +from ..utils import dict_keys, dict_values, istype, specialize_symnode +from .base import MutableLocal, VariableTracker +from .constant import ConstantVariable + +# [Adding a new supported class within the keys of ConstDictVarialble] +# - Add its tracker type to is_hashable +# - (perhaps) Define how it is compared in _HashableTracker._eq_impl + + +def is_hashable(x): + if isinstance(x, variables.TensorVariable): + # Tensors are hashable if they have an example_value (a fake tensor) + # Most VT's should have one. + # It'd be nice if at some point we could assert that they all have one + return x.as_proxy().node.meta.get("example_value") is not None + elif isinstance(x, variables.TupleVariable): + return all(is_hashable(e) for e in x.items) + else: + return isinstance( + x, + ( + variables.BuiltinVariable, + variables.SymNodeVariable, + variables.ConstantVariable, + variables.EnumVariable, + variables.user_defined.UserDefinedClassVariable, + variables.UserFunctionVariable, + variables.SkipFunctionVariable, + variables.misc.NumpyVariable, + variables.NNModuleVariable, + variables.MethodWrapperVariable, + variables.TorchInGraphFunctionVariable, + variables.TypingVariable, + variables.FunctoolsPartialVariable, + ), + ) + + +class ConstDictVariable(VariableTracker): + class _HashableTracker: + """ + Auxiliary opaque internal class that wraps a VariableTracker and makes it hashable + This should not be seen or touched by anything outside of ConstDictVariable and its children + Note that it's also fine to put VTs into dictionaries and sets, but doing so does not take into account aliasing + """ + + def __init__(self, vt): + # We specialize SymNodes + vt = specialize_symnode(vt) + # TODO Temorarily remove to figure out what keys are we breaking on + # and add proper support for them + if not is_hashable(vt): + unimplemented(f"Dict key of type {type(vt)}. 
Key: {vt}") + self.vt = vt + + @property + def underlying_value(self): + if isinstance(self.vt, variables.TensorVariable): + x = self.vt.as_proxy().node.meta["example_value"] + elif isinstance(self.vt, variables.TupleVariable): + Hashable = ConstDictVariable._HashableTracker + x = tuple(Hashable(e).underlying_value for e in self.vt.items) + elif isinstance(self.vt, variables.NNModuleVariable): + return self.vt.module + elif isinstance(self.vt, variables.UserFunctionVariable): + return self.vt.get_function() + else: + x = self.vt.as_python_constant() + return x + + def __hash__(self): + return hash(self.underlying_value) + + @staticmethod + def _eq_impl(a, b): + # TODO: Put this in utils and share it between variables/builtin.py and here + if type(a) != type(b): + return False + elif isinstance(a, tuple): + Hashable = ConstDictVariable._HashableTracker + return len(a) == len(b) and all( + Hashable._eq_impl(u, v) for u, v in zip(a, b) + ) + elif is_fake(a): + return a is b + else: + return a == b + + def __eq__(self, other: "ConstDictVariable._HashableTracker") -> bool: + Hashable = ConstDictVariable._HashableTracker + assert isinstance(other, Hashable) or ConstantVariable.is_literal( + other + ), type(other) + if isinstance(other, Hashable): + return Hashable._eq_impl(self.underlying_value, other.underlying_value) + + # constant + return Hashable._eq_impl(self.underlying_value, other) + + def __init__( + self, items: Dict[VariableTracker, VariableTracker], user_cls=dict, **kwargs + ): + super().__init__(**kwargs) + + Hashable = ConstDictVariable._HashableTracker + + # Keys will just be HashableTrackers when cloning, in any other case they'll be VariableTrackers + assert all( + isinstance(x, (VariableTracker, Hashable)) + and isinstance(v, VariableTracker) + for x, v in items.items() + ) + + def make_hashable(key): + return key if isinstance(key, Hashable) else Hashable(key) + + self.items = {make_hashable(x): v for x, v in items.items()} + self.user_cls = user_cls + + def as_proxy(self): + return {k.vt.as_proxy(): v.as_proxy() for k, v in self.items.items()} + + def as_python_constant(self): + return { + k.vt.as_python_constant(): v.as_python_constant() + for k, v in self.items.items() + } + + def keys_as_python_constant(self): + return {k.vt.as_python_constant(): v for k, v in self.items.items()} + + def python_type(self): + return self.user_cls + + def __contains__(self, vt): + assert isinstance(vt, VariableTracker) + Hashable = ConstDictVariable._HashableTracker + return is_hashable(vt) and Hashable(vt) in self.items + + def reconstruct(self, codegen): + # instructions to load collections.OrderedDict if necessary + if self.user_cls is collections.OrderedDict: + codegen.extend_output( + [ + codegen.create_load_python_module(collections, True), + codegen.create_load_attr("OrderedDict"), + ] + ) + # instructions to build the dict keys and values + for key, value in self.items.items(): + codegen(key.vt) + codegen(value) + # BUILD_MAP and calling collections.OrderedDict if necessary + if self.user_cls is collections.OrderedDict: + codegen.extend_output( + [ + create_instruction("BUILD_MAP", arg=len(self.items)), + *create_call_function(1, False), + ] + ) + # BUILD_MAP only if user_cls is dict + else: + codegen.append_output(create_instruction("BUILD_MAP", arg=len(self.items))) + + def getitem_const(self, arg: VariableTracker): + key = ConstDictVariable._HashableTracker(arg) + if key not in self.items: + raise KeyError(arg.value) + return self.items[key] + + def call_method( + self, + tx, + 
name, + args: "List[VariableTracker]", + kwargs: "Dict[str, VariableTracker]", + ) -> "VariableTracker": + from . import ( + BuiltinVariable, + ConstantVariable, + ListIteratorVariable, + ListVariable, + TupleVariable, + ) + + Hashable = ConstDictVariable._HashableTracker + + arg_hashable = args and is_hashable(args[0]) + + if name == "__getitem__": + assert len(args) == 1 + return self.getitem_const(args[0]) + elif name == "items": + assert not (args or kwargs) + return TupleVariable( + [TupleVariable([k.vt, v]) for k, v in self.items.items()] + ) + elif name == "keys": + assert not (args or kwargs) + return DictKeys(self) + elif name == "values": + assert not (args or kwargs) + return DictValues(self) + elif name == "copy": + assert not (args or kwargs) + return self.clone(items=self.items.copy(), mutable_local=MutableLocal()) + elif name == "__len__": + assert not (args or kwargs) + return ConstantVariable.create(len(self.items)) + elif name == "__setitem__" and arg_hashable and self.mutable_local: + assert not kwargs and len(args) == 2 + tx.output.side_effects.mutation(self) + self.items[Hashable(args[0])] = args[1] + return ConstantVariable.create(None) + elif name in ("pop", "get") and len(args) in (1, 2) and args[0] not in self: + # missing item, return the default value + if len(args) == 1: + return ConstantVariable(None) + else: + return args[1] + elif name == "pop" and arg_hashable and self.mutable_local: + tx.output.side_effects.mutation(self) + return self.items.pop(Hashable(args[0])) + elif name == "clear": + tx.output.side_effects.mutation(self) + self.items.clear() + return ConstantVariable.create(None) + elif ( + name == "update" + and len(args) == 1 + and isinstance( + args[0], + ( + ConstDictVariable, + ListVariable, + TupleVariable, + ListIteratorVariable, + ), + ) + and self.mutable_local + ): + tx.output.side_effects.mutation(self) + if isinstance(args[0], ConstDictVariable): + dict_vt = args[0] + else: + dict_vt = BuiltinVariable.call_custom_dict(tx, dict, args[0]) + self.items.update(dict_vt.items) + # Wrap strings + kwargs = { + Hashable(ConstantVariable.create(k)): v for k, v in kwargs.items() + } + self.items.update(kwargs) + return ConstantVariable.create(None) + elif name in ("get", "__getattr__") and args[0] in self: + return self.getitem_const(args[0]) + elif name == "__contains__" and len(args) == 1: + return ConstantVariable.create(args[0] in self) + else: + return super().call_method(tx, name, args, kwargs) + + def unpack_var_sequence(self, tx): + return [x.vt for x in self.items.keys()] + + +class DefaultDictVariable(ConstDictVariable): + def __init__(self, items, user_cls, default_factory=None, **kwargs): + super().__init__(items, user_cls, **kwargs) + assert user_cls is collections.defaultdict + self.default_factory = default_factory + + def is_python_constant(self): + # Return false for unsupported defaults. This ensures that a bad handler + # path is not taken in BuiltinVariable for getitem. 
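+        # Hedged illustration (not from the traced program): an empty
+        # collections.defaultdict(set) has a factory outside the supported
+        # [list, tuple, dict] defaults, so it is reported as non-constant here
+        # and BuiltinVariable's constant-dict getitem fast path is avoided.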
+ if self.default_factory not in [list, tuple, dict] and not self.items: + return False + return super().is_python_constant() + + @staticmethod + def is_supported_arg(arg): + if isinstance(arg, variables.BuiltinVariable): + return arg.fn in [list, tuple, dict] + else: + return isinstance(arg, variables.functions.BaseUserFunctionVariable) + + def call_method( + self, + tx, + name, + args: "List[VariableTracker]", + kwargs: "Dict[str, VariableTracker]", + ) -> "VariableTracker": + if name == "__getitem__": + assert len(args) == 1 + + if args[0] in self: + return self.getitem_const(args[0]) + else: + if self.default_factory is None: + raise KeyError(f"{args[0]}") + else: + default_var = self.default_factory.call_function(tx, [], {}) + super().call_method( + tx, "__setitem__", (args[0], default_var), kwargs + ) + return default_var + else: + return super().call_method(tx, name, args, kwargs) + + +class SetVariable(ConstDictVariable): + """We model a sets as dictonary with None values""" + + def __init__( + self, + items: List[VariableTracker], + **kwargs, + ): + items = dict.fromkeys(items, SetVariable._default_value()) + super().__init__(items, **kwargs) + + @property + def set_items(self): + return set(self.items.keys()) + + @staticmethod + def _default_value(): + # Variable to fill in he keys of the dictinary + return ConstantVariable.create(None) + + def as_proxy(self): + return {k.vt.as_proxy() for k in self.set_items} + + def python_type(self): + return set + + def as_python_constant(self): + return {k.vt.as_python_constant() for k in self.set_items} + + def reconstruct(self, codegen): + codegen.foreach([x.vt for x in self.set_items]) + codegen.append_output(create_instruction("BUILD_SET", arg=len(self.set_items))) + + def call_method( + self, + tx, + name, + args: List[VariableTracker], + kwargs: Dict[str, VariableTracker], + ) -> "VariableTracker": + # We foward the calls to the dictionary model + if name == "add": + assert not kwargs + assert len(args) == 1 + name = "__setitem__" + args = (args[0], SetVariable._default_value()) + elif name == "pop": + assert not kwargs + assert not args + # Choose an item at random and pop it via the Dict.pop method + result = self.set_items.pop().vt + super().call_method(tx, name, (result,), kwargs) + return result + return super().call_method(tx, name, args, kwargs) + + def getitem_const(self, arg: VariableTracker): + raise RuntimeError("Illegal to getitem on a set") + + +class DictView(VariableTracker): + """ + Models _PyDictViewObject + + This is an "abstract" class. 
Subclasses will override kv and the items method + """ + + kv: Optional[str] = None + + def __init__(self, dv_dict: ConstDictVariable, **kwargs): + super().__init__(**kwargs) + assert self.kv in ("keys", "values") + assert isinstance(dv_dict, ConstDictVariable) + self.dv_dict = dv_dict + + @property + def view_items(self): + return getattr(self.dv_dict.items, self.kv)() + + @property + def view_items_vt(self): + # Returns an iterable of the unpacked items + # Implement in the subclasses + raise NotImplementedError() + + def unpack_var_sequence(self, tx): + def unwrap(x): + return x.vt if self.kv == "keys" else x + + return [unwrap(x) for x in self.view_items] + + def reconstruct(self, codegen): + codegen(self.dv_dict) + codegen.extend_output( + [ + create_instruction("LOAD_METHOD", argval=self.kv), + *create_call_method(0), + ] + ) + + def call_method( + self, + tx, + name, + args: List["VariableTracker"], + kwargs: Dict[str, "VariableTracker"], + ) -> "VariableTracker": + if name == "__len__": + return self.dv_dict.call_method(tx, name, args, kwargs) + return super().call_method(tx, name, args, kwargs) + + +class DictKeys(DictView): + kv = "keys" + + @property + def set_items(self): + return set(self.view_items) + + @property + def view_items_vt(self): + # Returns an iterable of the unpacked items + return [x.vt for x in self.view_items] + + def python_type(self): + return dict_keys + + def call_method( + self, + tx, + name, + args: List["VariableTracker"], + kwargs: Dict[str, "VariableTracker"], + ) -> "VariableTracker": + if name == "__contains__": + return self.dv_dict.call_method(tx, name, args, kwargs) + return super().call_method(tx, name, args, kwargs) + + +class DictValues(DictView): + # DictValues is an iterable but cannot be compared. + kv = "values" + + @property + def view_items_vt(self): + return list(self.view_items) + + def python_type(self): + return dict_values + + +def _is_matching_transformers_cls(cls) -> bool: + mod = sys.modules.get("transformers.file_utils") + return mod is not None and issubclass(cls, mod.ModelOutput) + + +def _is_matching_diffusers_cls(cls) -> bool: + mod = sys.modules.get("diffusers.utils") + return mod is not None and issubclass(cls, mod.BaseOutput) + + +def _call_hasattr_customobj(self, tx, name: str) -> "VariableTracker": + """Shared method between DataClassVariable and CustomizedDictVariable where items are attrs""" + if name in self.items or hasattr(self.user_cls, name): + return ConstantVariable(True) + elif istype(self.mutable_local, MutableLocal) and self.source is None: + # Something created locally can't have any extra fields on it + return ConstantVariable(False) + elif self.mutable_local is None and self.source: + # Maybe add a guard + try: + example = tx.output.root_tx.get_example_value(self.source) + install_guard( + AttrSource(self.source, name).make_guard(GuardBuilder.HASATTR) + ) + return ConstantVariable(hasattr(example, name)) + except KeyError: + pass + unimplemented( + f"hasattr({self.__class__.__name__}, {name}) {self.mutable_local} {self.source}" + ) + + +class DataClassVariable(ConstDictVariable): + """ + This is a bit of a hack to deal with + transformers.file_utils.ModelOutput() from huggingface. + + ModelOutput causes trouble because it a a mix of a dataclass and a + OrderedDict and it calls super() methods implemented in C. 
+ """ + + # ModelOutput() excludes None, though generic datclasses don't + include_none = False + + @staticmethod + @functools.lru_cache(None) + def _patch_once(): + try: + from transformers.file_utils import ModelOutput + + for obj in ModelOutput.__dict__.values(): + if callable(obj): + skip_code(obj.__code__) + except ImportError: + pass + + try: + from diffusers.utils import BaseOutput + + for obj in BaseOutput.__dict__.values(): + if callable(obj): + skip_code(obj.__code__) + except ImportError: + pass + + @staticmethod + def is_matching_cls(cls): + return _is_matching_transformers_cls(cls) or _is_matching_diffusers_cls(cls) + + @classmethod + def is_matching_object(cls, obj): + return cls.is_matching_cls(type(obj)) + + @classmethod + def create(cls, user_cls, args, kwargs, options): + DataClassVariable._patch_once() + + skip_code(user_cls.__init__.__code__) + keys = [f.name for f in dataclasses.fields(user_cls)] + bound = inspect.signature(user_cls).bind(*args, **kwargs) + bound.apply_defaults() + assert set(bound.arguments.keys()) == set(keys) + items = {} + for key in keys: + val = bound.arguments[key] + key = ConstantVariable.create(key) + if isinstance(val, VariableTracker): + items[key] = val + else: + if cls.include_none: + assert variables.ConstantVariable.is_literal(val) + items[key] = variables.ConstantVariable.create(val) + else: + assert val is None, f"unexpected {val}" + + if len(items) == 1 and not isinstance(items[keys[0]], variables.TensorVariable): + unimplemented("DataClassVariable iterator constructor") + # TODO(jansel): implement unpacking logic in ModelOutput.__post_init__ + + return cls(items, user_cls, **options) + + @classmethod + def wrap(cls, builder, obj): + user_cls = type(obj) + keys = [f.name for f in dataclasses.fields(user_cls)] + + excluded = [] + items = {} + for key in keys: + # __init__ function of a dataclass might not have yet defined the key + if hasattr(obj, key): + val = getattr(obj, key) + var = builder.__class__( + tx=builder.tx, source=AttrSource(builder.source, key) + )(val) + if val is not None or cls.include_none: + key = ConstantVariable.create(key) + items[key] = var + else: + excluded.append(var) + return cls(items, user_cls) + + def __init__(self, items, user_cls, **options): + super().__init__(items, user_cls, **options) + assert self.is_matching_cls(user_cls) + + def as_proxy(self): + raise NotImplementedError() + + def reconstruct(self, codegen): + codegen.extend_output([codegen._create_load_const(self.user_cls)]) + # All the keys are just wrapped strings + d = self.keys_as_python_constant() + codegen.foreach(d.values()) + keys = tuple(d.keys()) + codegen.extend_output(codegen.create_call_function_kw(len(keys), keys, True)) + + def call_method( + self, + tx, + name, + args: "List[VariableTracker]", + kwargs: "Dict[str, VariableTracker]", + ) -> "VariableTracker": + if name == "__getitem__": + assert not kwargs and len(args) == 1 + val = args[0] + if val.python_type() == str: + return self.getitem_const(val) + else: + return self.call_method(tx, "to_tuple", [], {}).call_method( + tx, "__getitem__", args, kwargs + ) + elif name == "to_tuple": + assert not (args or kwargs) + return variables.TupleVariable(list(self.items.values())) + elif name == "__setattr__": + name = "__setitem__" + return super().call_method(tx, name, args, kwargs) + + def var_getattr(self, tx, name: str) -> "VariableTracker": + name_vt = ConstantVariable.create(name) + if name_vt in self: + return self.call_method(tx, "__getitem__", [name_vt], {}) + elif not 
self.include_none: + defaults = {f.name: f.default for f in dataclasses.fields(self.user_cls)} + if name in defaults: + assert variables.ConstantVariable.is_literal(defaults[name]) + return variables.ConstantVariable.create(defaults[name]) + super().var_getattr(tx, name) + + call_hasattr = _call_hasattr_customobj + + +class CustomizedDictVariable(ConstDictVariable): + @staticmethod + def is_matching_cls(cls): + # True if using default OrderedDict.__init__ and did not implement __post_init__ + if ( + issubclass(cls, collections.OrderedDict) + and cls.__init__ is collections.OrderedDict.__init__ + and not hasattr(cls, "__post_init__") + ): + return True + # hack for HF usecase: + # assume dataclass annotation for ModelOutput subclass + # assume self.create is AA to ModelOutput.__post_init__ + return _is_matching_transformers_cls(cls) or _is_matching_diffusers_cls(cls) + + @classmethod + def is_matching_object(cls, obj): + return cls.is_matching_cls(type(obj)) + + # called from user_defined.py + # when is_matching_cls(cls) is true + @classmethod + def create(cls, user_cls, args, kwargs, options): + # avoid tracing when returning ModelOutput from forward func + for attr_name in ("__init__", "__post_init__", "__setattr__", "__setitem__"): + if hasattr(user_cls, attr_name): + fn = getattr(user_cls, attr_name) + assert callable(fn), f"expect callable attr {attr_name}" + if hasattr(fn, "__code__"): + skip_code(fn.__code__) + + if dataclasses.is_dataclass(user_cls): + # @dataclass CustomDict(a=1, b=2) + bound = inspect.signature(user_cls).bind(*args, **kwargs) + bound.apply_defaults() + + def make_var(x): + if isinstance(x, VariableTracker): + return x + elif ConstantVariable.is_literal(x): + return ConstantVariable.create(x) + else: + unimplemented( + "expect VariableTracker or ConstantVariable.is_literal" + ) + + items = { + ConstantVariable.create(k): make_var(v) + for k, v in bound.arguments.items() + } + elif not args: + # CustomDict(a=1, b=2) in the general (non-dataclass) case. 
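+            # Sketch of the mapping built below (illustrative only): keyword
+            # names become ConstantVariable keys, so CustomDict(a=1, b=2) yields
+            # {ConstantVariable("a"): <VT for 1>, ConstantVariable("b"): <VT for 2>};
+            # the values are already VariableTrackers by the time we get here.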
+ items = {ConstantVariable.create(k): v for k, v in kwargs.items()} + elif len(args) == 1 and isinstance(args[0], ConstDictVariable) and not kwargs: + # CustomDict({'a': 1, 'b': 2}) + items = args[0].items + else: + unimplemented("custom dict init with args/kwargs unimplemented") + + return cls(items, user_cls, **options) + + # called from builder.py + @classmethod + def wrap(cls, builder, obj): + raise NotImplementedError() + + def __init__(self, items, user_cls, **options): + super().__init__(items, user_cls, **options) + assert self.is_matching_cls(user_cls) + + def as_proxy(self): + raise NotImplementedError() + + # 'RETURN_VALUE triggered compile' + # called from torch/_dynamo/codegen.py + def reconstruct(self, codegen): + codegen.extend_output([codegen._create_load_const(self.user_cls)]) + # All the keys are just wrapped strings + d = self.keys_as_python_constant() + codegen.foreach(d.values()) + keys = tuple(d.keys()) + codegen.extend_output(codegen.create_call_function_kw(len(keys), keys, True)) + + def call_method( + self, + tx, + name, + args: "List[VariableTracker]", + kwargs: "Dict[str, VariableTracker]", + ) -> "VariableTracker": + fn = getattr(self.user_cls, name) + source = None if self.source is None else AttrSource(self.source, name) + + if hasattr(fn, "__objclass__") and fn.__objclass__ in ( + dict, + collections.OrderedDict, + ): + # for python dict method without overridden + return super().call_method(tx, name, args, kwargs) + elif name in ("__getitem__", "to_tuple", "__setitem__", "__setattr__"): + # for user overridden method + return tx.inline_user_function_return( + variables.UserFunctionVariable(fn, source=source), + [self] + list(args), + kwargs, + ) + + unimplemented("custom dict: call_method unimplemented name=%s", name) + + def var_getattr(self, tx, name: str) -> "VariableTracker": + name_vt = ConstantVariable.create(name) + if name_vt in self: + return self.call_method(tx, "__getitem__", [name_vt], {}) + super().var_getattr(tx, name) + + call_hasattr = _call_hasattr_customobj + + +@functools.lru_cache(None) +def _install_PretrainedConfig_patch(): + import transformers + + # We need to monkeypatch transformers here, sadly. + # TODO(voz): Upstream to transformers lib + + def _dynamo_overriden_transformers_eq(self, other): + if not hasattr(other, "__dict__"): + return False + return self.__dict__ == other.__dict__ + + transformers.configuration_utils.PretrainedConfig.__eq__ = ( + _dynamo_overriden_transformers_eq + ) + + +class HFPretrainedConfigVariable(VariableTracker): + """ + Hack for HuggingFace PretrainedConfig + """ + + @staticmethod + def is_matching_cls(cls): + mod = sys.modules.get("transformers.configuration_utils") + is_match = mod is not None and issubclass(cls, mod.PretrainedConfig) + + # Lazily install monkeypatch the first time we see it in dynamo + if is_match: + _install_PretrainedConfig_patch() + return is_match + + @classmethod + def is_matching_object(cls, obj): + return cls.is_matching_cls(type(obj)) + + def __init__(self, obj, **kwargs): + super().__init__(**kwargs) + self.obj = obj + assert self.is_matching_cls(type(obj)) + + def var_getattr(self, tx, name: str) -> "VariableTracker": + from . import ConstantVariable + + return ConstantVariable.create(getattr(self.obj, name)) + + def call_hasattr(self, tx, name: str) -> "VariableTracker": + return variables.ConstantVariable.create(hasattr(self.obj, name)) + + +class PythonSysModulesVariable(VariableTracker): + """Special case for sys.modules. 
+ + Without this we will guard on the exact set of modules imported in the + lifetime of the python program. + """ + + def python_type(self): + return dict + + def reconstruct(self, codegen): + codegen.extend_output( + [ + codegen.create_load_python_module(sys, True), + codegen.create_load_attr("modules"), + ] + ) + + def call_method( + self, tx, name, args: List[VariableTracker], kwargs: Dict[str, VariableTracker] + ): + from .builder import VariableBuilder + + if name == "__getitem__": + return self.call_getitem(tx, *args, **kwargs) + elif name == "get": + return self.call_get(tx, *args, **kwargs) + elif name == "__contains__": + return self.call_contains(tx, *args, **kwargs) + + # Fallback to dict implementation + real_dict = VariableBuilder(tx, self.source)(sys.modules) + return real_dict.call_method(tx, name, args, kwargs) + + def _contains_helper(self, tx, key: VariableTracker): + k = key.as_python_constant() + has_key = k in sys.modules + install_guard( + self.make_guard( + functools.partial(GuardBuilder.DICT_CONTAINS, key=k, invert=not has_key) + ) + ) + return k, has_key + + def call_contains(self, tx, key: VariableTracker): + k, has_key = self._contains_helper(tx, key) + return ConstantVariable.create(value=has_key) + + def call_get( + self, tx, key: VariableTracker, default: Optional[VariableTracker] = None + ): + from .builder import VariableBuilder + + k, has_key = self._contains_helper(tx, key) + + if has_key: + return VariableBuilder( + tx, + GetItemSource(self.source, k), + )(sys.modules[k]) + + if default is not None: + return default + + return ConstantVariable.create(value=None) + + def call_getitem(self, tx, key: VariableTracker): + from .builder import VariableBuilder + + k, has_key = self._contains_helper(tx, key) + return VariableBuilder( + tx, + GetItemSource(self.source, k), + )(sys.modules[k]) diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/distributed.py b/MLPY/Lib/site-packages/torch/_dynamo/variables/distributed.py new file mode 100644 index 0000000000000000000000000000000000000000..066be295b93170c4f07a41be528650813649db43 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/variables/distributed.py @@ -0,0 +1,388 @@ +# mypy: ignore-errors +import functools +import inspect +from typing import Dict, List + +import torch +from ...fx.experimental._backward_state import BackwardState +from .. import compiled_autograd, variables +from .._trace_wrapped_higher_order_op import trace_wrapped +from ..exc import unimplemented +from ..external_utils import call_module_hooks_from_backward_state +from ..guards import GuardBuilder, install_guard +from ..source import AttrSource, GlobalSource +from ..utils import istype +from .base import VariableTracker +from .constant import ConstantVariable + + +class DistributedVariable(VariableTracker): + """ + The base distributed variable that encapsulates common methods + for the distributed objects (i.e. ProcessGroup, DeviceMesh, etc.). + Concrete distributed objects could inherit this class and add object + specific logic. + + i.e. It provides the check on the distributed package existance + and hold the tracking value for the corresponding distributed object. 
+ """ + + def __init__(self, value, **kwargs): + super().__init__(**kwargs) + if not DistributedVariable.is_available(): + unimplemented("torch.distributed package is not available!") + self.value = value + + def python_type(self): + return type(self.value) + + @staticmethod + def is_available(): + # check if the distributed package is available or not + return torch.distributed.is_available() + + +def is_from_local(value): + if not DistributedVariable.is_available(): + return False + from torch.distributed._tensor import DTensor + + return inspect.isfunction(value) and value is DTensor.from_local + + +def is_constant_pg_functions(value): + if not DistributedVariable.is_available(): + return False + + from torch.distributed.distributed_c10d import ( + _get_group_size_by_name, + _get_group_tag, + _rank_not_in_group, + _resolve_group_name_by_ranks_and_tag, + get_process_group_ranks, + ) + + constant_processgroup_functions = [ + _get_group_size_by_name, + _get_group_tag, + _rank_not_in_group, + get_process_group_ranks, + _resolve_group_name_by_ranks_and_tag, + ] + + return inspect.isfunction(value) and value in constant_processgroup_functions + + +class PlacementClassVariable(DistributedVariable): + @staticmethod + def is_placement_type(value): + # we can't rely on importing/accessing torch distributed, it is not always built. + if not DistributedVariable.is_available(): + return False + + from torch.distributed._tensor.placement_types import Placement + + return type(value) is type and issubclass(value, Placement) + + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + if ( + inspect.getattr_static(self.value, "__new__", None) in (object.__new__,) + and self.source + ): + # NOTE: we don't need to track mutations to the placement class as they + # suppose to be immutable. + new_obj = object.__new__(self.value) + var = PlacementVariable(new_obj) + if inspect.getattr_static(self.value, "__init__", None): + var.call_method(tx, "__init__", args, kwargs) + return var + + return super().call_function(tx, args, kwargs) + + +class PlacementVariable(DistributedVariable): + @staticmethod + def is_placement(value): + # we can't rely on importing/accessing torch distributed, it is not always built. + if not DistributedVariable.is_available(): + return False + + from torch.distributed._tensor.placement_types import Placement + + return isinstance(value, Placement) + + def as_python_constant(self): + return self.value + + def var_getattr(self, tx, name: str) -> VariableTracker: + if name == "dim": + return ConstantVariable.create(self.value.dim) + return super().var_getattr(tx, name) + + def call_method( + self, + tx, + name, + args: "List[VariableTracker]", + kwargs: "Dict[str, VariableTracker]", + ) -> "VariableTracker": + from . import ConstantVariable + + # Placement types dynamo tracking only allows following methods + # and __setattr__ is for case like `Shard(dim)` and methods. + # Methods in the list must satisfy: + # 1. Input arguments are constants and do not need to be guarded on; + # 2. Output is constant with respect to their inputs + constant_fold_functions = [ + "__init__", + "__setattr__", + "is_shard", + "is_partial", + "is_replicate", + ] + + if name in constant_fold_functions: + try: + value_type = type(self.value) + assert ( + inspect.getattr_static(value_type, "__getattr__", None) is None + ), "no custom getattr allowed!" 
+ method = inspect.getattr_static(value_type, name) + except AttributeError: + method = None + if method is object.__init__: + return ConstantVariable.create(None) + + args = [x.as_python_constant() for x in args] + kwargs = {k: v.as_python_constant() for k, v in kwargs.items()} + if name == "__setattr__": + method(self.value, *args, **kwargs) + return self + constant_val = method(self.value, *args, **kwargs) + return ConstantVariable.create(constant_val) + + return super().call_method(tx, name, args, kwargs) + + +class DeviceMeshVariable(DistributedVariable): + @staticmethod + def is_device_mesh(value): + # we can't rely on importing/accessing torch distributed, it is not always built. + if not DistributedVariable.is_available(): + return False + + from torch.distributed.device_mesh import DeviceMesh + + return istype(value, DeviceMesh) + + def as_python_constant(self): + return self.value + + def var_getattr(self, tx, name: str) -> VariableTracker: + if name == "ndim": + return ConstantVariable.create(self.value.ndim) + return super().var_getattr(tx, name) + + def call_method( + self, + tx, + name, + args: "List[VariableTracker]", + kwargs: "Dict[str, VariableTracker]", + ) -> "VariableTracker": + if name == "size": + const_args = [x.as_python_constant() for x in args] + const_kwargs = {k: v.as_python_constant() for k, v in kwargs.items()} + return ConstantVariable.create(self.value.size(*const_args, **const_kwargs)) + if name == "get_coordinate": + return ConstantVariable.create(self.value.get_coordinate()) + if name == "get_group": + return ConstantVariable.create(self.value.get_group()) + if name == "_get_or_create_default_group": + return ProcessGroupVariable(self.value._get_or_create_default_group()) + return super().call_method(tx, name, args, kwargs) + + +class ProcessGroupVariable(DistributedVariable): + """ + We don't want a ProcessGroup object to end up in our output graph. + + But it's common for dynamo to intercept a PG that is then used to get info like + rank() or world_size(), as well as passed to utility functions in distributed_c10d + which desugar it into plain types like a ranklist and tag. + + For convenience and proper guarding, we construct a variable type. + + TODO: make it possible to use ProcessGroupVariable as input to simple functions + like _expand_group without dynamo complaining about making a proxy for it. + It is not a tensor-like type, and we don't want a proxy- but dynamo assumes + torch library functions are dealing with tensor-like types and would have proxies + for their args. + TODO: should we make this inherit VT instead of UDOV? Do we want any of the default behaviors + or just graph-break whenever one of our special cases is not hit? + """ + + def as_python_constant(self): + return self.value + + def call_method( + self, + tx, + name, + args: "List[VariableTracker]", + kwargs: "Dict[str, VariableTracker]", + ) -> "VariableTracker": + if name == "rank": + return variables.ConstantVariable.create(self.value.rank()) + if name == "size": + return variables.ConstantVariable.create(self.value.size()) + + return super().call_method(tx, name, args, kwargs) + + def var_getattr(self, tx, name): + if name == "group_name": + return variables.ConstantVariable.create(self.value.group_name) + if name in ["rank", "size"]: + return variables.LambdaVariable( + lambda *args, **kwargs: self.call_method(tx, name, args, kwargs) + ) + # TODO should this just raise unimplemented? 
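+        # For illustration: reading pg.rank / pg.size as attributes returns the
+        # LambdaVariable above so the eventual call still constant-folds; any
+        # other attribute access currently falls through to the generic
+        # VariableTracker handling on the next line.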
+ return super().var_getattr(tx, name) + + @staticmethod + def is_process_group(value): + # we can't rely on importing/accessing torch distributed, it is not always built. + if not DistributedVariable.is_available(): + return False + from torch._C._distributed_c10d import ProcessGroup + from torch.testing._internal.distributed.fake_pg import FakeProcessGroup + + return istype(value, (ProcessGroup, FakeProcessGroup)) + + @staticmethod + def get_global_pg_variable(): + """ + Make a ProcessGroupVariable from torch.distributed.group.WORLD and + intall guards. + """ + import torch.distributed as dist + + source = AttrSource( + AttrSource( + base=AttrSource( + base=GlobalSource(global_name="torch"), + member="distributed", + get_static=False, + ), + member="group", + get_static=False, + ), + member="WORLD", + get_static=False, + ) + install_guard(source.make_guard(GuardBuilder.ID_MATCH)) + return ProcessGroupVariable( + dist.group.WORLD, + source=source, + ) + + +class BackwardHookVariable(VariableTracker): + """ + Handles torch.utils.hooks.BackwardHook for module-level backward + hooks. + """ + + @staticmethod + def create( + tx, + module: VariableTracker, + user_hooks: VariableTracker, + user_pre_hooks: VariableTracker, + ): + if not compiled_autograd.compiled_autograd_enabled: + unimplemented("module-level backwards hooks require compiled autograd") + + def _in_graph_bw_hooks(bw_state: BackwardState): + """ + Rather than installing the user hooks in the graph (which + don't survive AotAutograd), we install hooks that will call + trace_wrapped in the backward pass that CompiledAutograd + can turn into actual hook calls. + """ + return torch.utils.hooks.BackwardHook( + None, + ( + functools.partial( + trace_wrapped, + fn=call_module_hooks_from_backward_state, + bw_state=bw_state, + hooks_name=user_hooks_name, + module_name=module_name, + ), + ), + ( + functools.partial( + trace_wrapped, + fn=call_module_hooks_from_backward_state, + bw_state=bw_state, + hooks_name=user_pre_hooks_name, + module_name=module_name, + ), + ), + ) + + module_name, bw_state_proxy = tx.output.add_backward_state_hook(module) + user_pre_hooks_name, _ = tx.output.add_backward_state_hook(user_pre_hooks) + user_hooks_name, _ = tx.output.add_backward_state_hook(user_hooks) + proxy = tx.output.create_proxy( + "call_function", + _in_graph_bw_hooks, + (bw_state_proxy,), + {}, + ) + proxy.node.meta["example_value"] = torch.utils.hooks.BackwardHook(None, (), ()) + return BackwardHookVariable(proxy, module, user_hooks, user_pre_hooks) + + def __init__( + self, + proxy: torch.fx.Proxy, + module: VariableTracker, + user_hooks: VariableTracker, + user_pre_hooks: VariableTracker, + **options, + ): + super().__init__(**options) + self.proxy = proxy + self.module = module + self.user_hooks = user_hooks + self.user_pre_hooks = user_pre_hooks + + def as_proxy(self): + return self.proxy + + def call_method( + self, + tx, + name, + args: List[VariableTracker], + kwargs: Dict[str, VariableTracker], + ) -> VariableTracker: + if name in ("setup_input_hook", "setup_output_hook"): + return self._setup_hook(tx, name, *args, **kwargs) + return super().call_method(tx, name, args, kwargs) + + def _setup_hook(self, tx, hook_method_name, args): + from .builder import wrap_fx_proxy + + return wrap_fx_proxy( + tx, + tx.output.create_proxy( + "call_method", + hook_method_name, + (self.as_proxy(), args.as_proxy()), + {}, + ), + ) diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/functions.py 
b/MLPY/Lib/site-packages/torch/_dynamo/variables/functions.py new file mode 100644 index 0000000000000000000000000000000000000000..c5c7c64010e19361d2b596553e7ffd38985486a2 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/variables/functions.py @@ -0,0 +1,947 @@ +# mypy: ignore-errors + +import collections +import functools +import inspect +import itertools +import types +from typing import Dict, List, Optional, TYPE_CHECKING, Union + +import torch + +from .. import variables +from ..bytecode_transformation import create_call_function, create_rot_n +from ..exc import unimplemented, Unsupported +from ..guards import GuardBuilder, install_guard +from ..source import AttrSource, ConstantSource, DefaultsSource, GetItemSource +from ..utils import check_constant_args, get_first_attr, identity, istype, make_cell +from .base import MutableLocal, typestr, VariableTracker +from .constant import ConstantVariable +from .distributed import ProcessGroupVariable + +if TYPE_CHECKING: + from torch._guards import Source + + +def wrap_bound_arg(tx, val, source=None): + # Source propagation is best effort since not every object we encounter has a source to begin with. + if isinstance(val, VariableTracker): + return val + elif not source: + from torch._dynamo.variables.builder import SourcelessBuilder + + return SourcelessBuilder()(tx, val) + else: + # Create a lazy variable to avoid guarding on __defaults__ unless really + # needed. + return variables.LazyVariableTracker.create(val, source) + + +def wrap_args_kwargs(tx, result): + for k, v in list(result.items()): + if isinstance(v, (tuple, dict)): + # args/kwargs + result[k] = wrap_bound_arg(tx, v) + + +def init_cellvars(parent, result, code): + closure_cells = dict() + side_effects = parent.output.side_effects + + # for name in itertools.chain(code.co_cellvars, code.co_freevars): + for name in code.co_cellvars: + closure_cells[name] = side_effects.track_cell_new() + if name in result: + side_effects.store_cell(closure_cells[name], result.pop(name)) + + return closure_cells + + +def _create_nested_fn( + code, f_globals, name, defaults, closure, kwdefaults, annotations +): + from types import FunctionType + + func = FunctionType(code, f_globals, name, defaults, closure) + func.__kwdefaults__ = kwdefaults + + if isinstance(annotations, tuple): + from itertools import pairwise + + annotations = dict(pairwise(annotations)) + + # TypeError: __annotations__ must be set to a dict object + assert annotations is None or isinstance(annotations, dict) + func.__annotations__ = annotations + + return func + + +class BaseUserFunctionVariable(VariableTracker): + def get_filename(self): + return self.get_code().co_filename + + def get_name(self): + return self.get_code().co_name + + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + return tx.inline_user_function_return( + self, list(self.self_args()) + list(args), kwargs + ) + + def call_hasattr(self, tx, name: str) -> VariableTracker: + result = False + + try: + result = hasattr(self.get_function(), name) + except NotImplementedError: + if name == "__name__" and isinstance(self, NestedUserFunctionVariable): + result = True + return variables.ConstantVariable.create(result) + + def inspect_parameter_names(self): + return list(inspect.signature(self.get_function()).parameters) + + def closure_vars(self, tx): + return {} + + +class UserFunctionVariable(BaseUserFunctionVariable): + """Some unsupported user-defined global function""" + + 
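+    # Rough picture (illustrative, not exhaustive): this wraps a plain
+    # types.FunctionType such as
+    #     def scale(x, factor=2.0): return x * factor
+    # so Dynamo can inline it. bind_args() below maps call arguments,
+    # __defaults__, __kwdefaults__ and closure cells onto VariableTrackers
+    # before the body is inlined.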
@classmethod + def create_with_source(cls, value, source): + install_guard(source.make_guard(GuardBuilder.CLOSURE_MATCH)) + return cls( + value, + source=source, + ) + + def __init__(self, fn, is_constant=False, **kwargs): + super().__init__(**kwargs) + if getattr(fn, "_dynamo_marked_constant", False): + # This method should be treated as a constant for the purposes of compilation + self.is_constant = True + else: + self.is_constant = False + + assert isinstance( + fn, (types.FunctionType, torch.jit.ScriptFunction) + ), f"expected FunctionType found {typestr(fn)} {fn}" + # unpack @torch._dynamo.optimize()(fn) wrapped function + fn = inspect.getattr_static(fn, "_torchdynamo_inline", fn) + # unpack torch.jit.script_if_tracing + if inspect.getattr_static(fn, "__script_if_tracing_wrapper", False): + fn = inspect.getattr_static(fn, "__original_fn", fn) + self.fn: types.FunctionType = fn + + def as_python_constant(self): + if istype(self, UserFunctionVariable): + return self.fn + # subclasses (such as methods) usually aren't a constant + return super().as_python_constant() + + def self_args(self): + return [] + + def get_function(self): + return self.fn + + def get_code(self): + return self.fn.__code__ + + def python_type(self): + return types.FunctionType + + def has_self(self): + return getattr(self.fn, "__self__", None) is not None + + def get_globals(self): + return self.fn.__globals__ + + def bind_args(self, parent, args, kwargs): + assert not self.is_constant + tx = parent.output.root_tx + wrap = functools.partial(wrap_bound_arg, tx=tx) + + fn: types.FunctionType = self.fn + defaults = fn.__defaults__ or [] + defaults_sources = [ + None if self.source is None else DefaultsSource(self.source, idx) + for idx, _ in enumerate(defaults) + ] + fake_func = types.FunctionType( + fn.__code__, + fn.__globals__, + fn.__name__, + tuple( + [ + wrap(val=arg, source=source) + for arg, source in zip(defaults, defaults_sources) + ] + ), + fn.__closure__, + ) + if fn.__kwdefaults__: + kwdefaults_sources = { + k: None + if self.source is None + else DefaultsSource(self.source, k, is_kw=True) + for k in fn.__kwdefaults__ + } + fake_func.__kwdefaults__ = { + k: wrap(val=v, source=kwdefaults_sources[k]) + for k, v in fn.__kwdefaults__.items() + } + + bound = inspect.signature(fake_func).bind(*args, **kwargs) + bound.apply_defaults() + result = dict(bound.arguments.items()) + + wrap_args_kwargs(tx, result) + closure_cells = init_cellvars(parent, result, fn.__code__) + closure = self.fn.__closure__ or () + assert len(closure) == len(self.fn.__code__.co_freevars) + for idx, name, cell in zip( + itertools.count(), self.fn.__code__.co_freevars, closure + ): + if name == "__class__": + source = AttrSource(self.source, "__class__") if self.source else None + result[name] = variables.UserDefinedClassVariable( + cell.cell_contents, + source=source, + ) + else: + var = tx.match_nested_cell(name, cell) + if var is not None: + # optimization for cleaner codegen + result[name] = var + elif self.source: + from .builder import VariableBuilder + + side_effects = parent.output.side_effects + if cell in side_effects: + out = side_effects[cell] + else: + closure_cell = GetItemSource( + AttrSource(self.source, "__closure__"), idx + ) + closure_cell_contents = AttrSource( + closure_cell, "cell_contents" + ) + try: + contents_var = VariableBuilder( + parent, closure_cell_contents + )(cell.cell_contents) + except ValueError: + # Cell has not yet been assigned + contents_var = variables.DeletedVariable() + + if ( + 
closure_cell_contents.name() + not in tx.mutated_closure_cell_contents + ): + # Optimistically don't allocate the cell, to + # reduce the number of side effects. This is + # important for cond, as without it, any accesses + # to closures create side effects and cond doesn't + # support side effects. If we're wrong and this + # closure cell gets written to, we will restart + # the analysis with this cell's name in the + # mutated list here + result[name] = contents_var + continue + + # cells are written to with "cell_contents", + # so the source should just be the closure_cell, not its contents + out = side_effects.track_cell_existing(closure_cell, cell) + side_effects.store_cell( + out, + contents_var, + ) + + result[name] = out + + else: + from .builder import SourcelessBuilder + + result[name] = SourcelessBuilder()(tx, cell.cell_contents) + + return result, closure_cells + + def export_freevars(self, parent, child): + pass + + def call_hasattr(self, tx, name: str) -> VariableTracker: + result = hasattr(self.fn, name) + return variables.ConstantVariable.create(result) + + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + if self.is_constant: + return invoke_and_store_as_constant( + tx, self.fn, self.get_name(), args, kwargs + ) + + return super().call_function(tx, args, kwargs) + + +class UserMethodVariable(UserFunctionVariable): + """Some unsupported user-defined method""" + + def __init__(self, fn, obj, **kwargs): + super().__init__(fn=fn, **kwargs) + self.obj = obj + + def __str__(self): + return f"{self.__class__.__name__}({self.fn}, {self.obj})" + + def self_args(self): + return [self.obj] + + def python_type(self): + return types.MethodType + + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + # For nn.Module methods, redirecting to NNModuleVariable.call_method for optimized solution + # rather than simple inlining. E.g, putting `call_method` op in FX graph for `forward` method + # since we ensure `forward` of allowed modules can be traced by AOT safely. + # Note this is not only for allowed modules, as user customized modules can extend from + # allowed modules but using parent's `forward` method, which is also covered by this branch. + + # If we are tracing the higher order op, we want Dynamo to step inside + # the module call so that Dynamo can see the underlying parameters and + # buffers and raise them as inputs to the graph. The is_root_tracer + # check bypasses the if condition for non-root tracers and directly + # calls the super().call_function at the end, which is basically + # equivalent of inlining the method. 
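+        # Hedged example of the branch below: tracing mod.forward(x) where mod
+        # is an nn.Linear gives fn.__module__ == "torch.nn.modules.linear",
+        # which starts with "torch.nn.", so the call is redirected to
+        # NNModuleVariable.call_method instead of being inlined here.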
+ if tx.output.is_root_tracer() and isinstance( + self.obj, variables.NNModuleVariable + ): + module_attr = getattr(self.fn, "__module__", "") + if ( + module_attr is not None + and module_attr.startswith("torch.nn.") + or self.is_constant + ): + return self.obj.call_method( + tx, self.fn.__name__, args, kwargs, constant=self.is_constant + ) + return super().call_function(tx, args, kwargs) + + def inspect_parameter_names(self): + return super().inspect_parameter_names()[1:] + + +class WrappedUserMethodVariable(UserMethodVariable): + def __init__(self, wrapped, context, **kwargs): + kwargs.pop("fn", None) + kwargs.pop("obj", None) + super().__init__(wrapped.fn, wrapped.obj, **kwargs) + self.wrapped = wrapped + self.context = context + + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + self.context.enter(tx) + result = super().call_function(tx, args, kwargs) + self.context.exit(tx) + return result + + +class WrappedUserFunctionVariable(UserFunctionVariable): + def __init__(self, wrapped, context, **kwargs): + kwargs.pop("fn", None) + kwargs.pop("obj", None) + super().__init__(wrapped.fn, **kwargs) + self.wrapped = wrapped + self.context = context + + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + self.context.enter(tx) + result = super().call_function(tx, args, kwargs) + self.context.exit(tx) + return result + + +def invoke_and_store_as_constant(tx, fn, name, args, kwargs): + def convert(x): + if isinstance(x, variables.TensorVariable): + return x.get_real_value() + return x.as_python_constant() + + args = [convert(x) for x in args] + kwargs = {k: convert(v) for k, v in kwargs.items()} + res = fn(*args, **kwargs) + return tx.output.register_attr_or_module( + res, + name, + source=ConstantSource(name), + ) + + +class NestedUserFunctionVariable(BaseUserFunctionVariable): + _nonvar_fields = { + "closure_scope", + "f_globals", + *BaseUserFunctionVariable._nonvar_fields, + } + + def __init__( + self, + fn_name, + code, + f_globals, + defaults, + kwdefaults, + annotations, + closure, + closure_scope, + wrapped_reconstructible=None, + **kwargs, + ): + super().__init__(**kwargs) + assert isinstance(fn_name.as_python_constant(), str) + assert isinstance(code.as_python_constant(), types.CodeType) + assert isinstance(f_globals, dict) + self.fn_name = fn_name + self.code = code + self.f_globals = f_globals + self.defaults = defaults + self.kwdefaults = kwdefaults + self.annotations = annotations + self.closure = closure + if closure is None: + closure_scope = None + self.closure_scope = closure_scope + # Either a source or a VT with .can_reconstruct() == True + self.wrapped_reconstructible: Optional[ + Union[Source, VariableTracker] + ] = wrapped_reconstructible + + def self_args(self): + return [] + + def get_code(self): + return self.code.as_python_constant() + + def get_function(self): + if self.closure: + raise NotImplementedError() + func = types.FunctionType( + self.code.as_python_constant(), + self.f_globals, + self.fn_name.as_python_constant(), + ) + if self.defaults: + func.__defaults__ = self.defaults.as_python_constant() + if self.kwdefaults: + func.__kwdefaults__ = self.kwdefaults.as_python_constant() + if self.annotations: + annotations = self.annotations.as_python_constant() + if isinstance(annotations, tuple): + from itertools import pairwise + + annotations = dict(pairwise(annotations)) + + # TypeError: __annotations__ must be set 
to a dict object + assert isinstance(annotations, dict) + func.__annotations__ = annotations + return func + + def has_closure(self): + return self.closure is not None + + def has_self(self): + return False + + def get_globals(self): + return self.f_globals + + def bind_args(self, parent, args, kwargs): + from .misc import InlinedClosureVariable + + code = self.get_code() + func = types.FunctionType( + code, + self.f_globals, + self.fn_name.as_python_constant(), + tuple(self.defaults.items) if self.defaults else None, + tuple(make_cell(None) for _ in range(len(self.get_code().co_freevars))), + ) + if self.kwdefaults: + func.__kwdefaults__ = self.kwdefaults.keys_as_python_constant() + bound = inspect.signature(func).bind(*args, **kwargs) + bound.apply_defaults() + result = dict(bound.arguments.items()) + wrap_args_kwargs(parent.output.root_tx, result) + closure_cells = init_cellvars(parent, result, code) + + for idx, name in enumerate(code.co_freevars): + cell = self.closure.items[idx] + assert getattr(cell, name, name) == name + assert name not in result + if isinstance(cell, InlinedClosureVariable): + # InlinedClosureVariable's are created from LOAD_CLOSURE's from + # InliningInstructionTranslators when the variable name is not found in closure_cells. + # They should remain outside of closure_cells, so that our callee (the + # InliningInstructionTranslator that traces `func`) handles + # the cell correctly - that is, the cell's contents are treated as if they + # are local variables, like in UserFunctionVariable's bind_args for freevars. + cand = parent + while cand and name not in cand.symbolic_locals: + cand = cand.parent + if cand is None: + raise RuntimeError( + f"Couldn't find {name} in the symbolic_locals of the inline interpreter stack" + ) + result[name] = cand.symbolic_locals[name] + else: + closure_cells[name] = self.closure.items[idx] + + return result, closure_cells + + def export_freevars(self, parent, child): + code = self.get_code() + for var in code.co_freevars: + if var in child.symbolic_locals: + parent.symbolic_locals[var] = child.symbolic_locals[var] + + def reconstruct(self, codegen): + codegen.load_import_from(__name__, "_create_nested_fn") + codegen(self.code) + codegen.extend_output([codegen._create_load_const(self.f_globals)]) + codegen(ConstantVariable.create(self.code.value.co_name)) + + if self.defaults: + codegen(self.defaults) + else: + codegen.extend_output([codegen.create_load_const(None)]) + + if self.closure: + codegen(self.closure) + else: + codegen.extend_output([codegen.create_load_const(None)]) + + if self.kwdefaults: + codegen(self.kwdefaults) + else: + codegen.extend_output([codegen.create_load_const(None)]) + + if self.annotations: + try: + annotations = self.annotations.as_python_constant() + codegen.extend_output([codegen._create_load_const(annotations)]) + except NotImplementedError: + codegen(self.annotations) + else: + codegen.extend_output([codegen.create_load_const(None)]) + + codegen.extend_output(create_call_function(7, push_null=True)) + + if self.wrapped_reconstructible: + codegen.load_import_from("functools", "wraps") + codegen(self.wrapped_reconstructible) + codegen.extend_output(create_call_function(1, True)) + codegen.extend_output(create_rot_n(2)) + codegen.extend_output(create_call_function(1, True)) + + +class SkipFunctionVariable(VariableTracker): + def __init__(self, value, reason=None, **kwargs): + super().__init__(**kwargs) + self.value = value + self.reason = reason + + def python_type(self): + return type(self.value) + + def 
as_python_constant(self): + return self.value + + @classmethod + def create_with_source(cls, value, source): + install_guard(source.make_guard(GuardBuilder.FUNCTION_MATCH)) + return cls( + value, + source=source, + ) + + @staticmethod + @functools.lru_cache(None) + def fold_through_function_to_wrapper(): + return { + collections.namedtuple: variables.UserDefinedClassVariable, + } + + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + if inspect.getattr_static(self.value, "_torchdynamo_disable", False): + unimplemented(f"call torch._dynamo.disable() wrapped function {self.value}") + # Fold through the functions(e.g, collections.namedtuple) + # that inputs & outputs are all python constants + elif ( + self.value in self.fold_through_function_to_wrapper().keys() + and check_constant_args(args, kwargs) + ): + value = self.value( + *[x.as_python_constant() for x in args], + **{k: v.as_python_constant() for k, v in kwargs.items()}, + ) + return self.fold_through_function_to_wrapper().get(self.value)( + value, mutable_local=MutableLocal() + ) + elif ( + self.value is functools.wraps + and not kwargs + and len(args) == 1 + and ( + args[0].source is not None or args[0].can_reconstruct(tx.output.root_tx) + ) + ): + + def wraps(fn): + if isinstance(fn, variables.NestedUserFunctionVariable): + if args[0].source: + reconstructible = args[0].source + else: + reconstructible = args[0] + return fn.clone(wrapped_reconstructible=reconstructible) + unimplemented(f"functools.wraps({fn})") + + return variables.LambdaVariable(wraps) + else: + try: + path = inspect.getfile(self.value) + except TypeError: + path = f"Builtin {self.value.__name__}" + msg = f"'skip function {self.value.__qualname__} in file {path}'" + msg += f"', {self.reason}'" if self.reason else "" + unimplemented(msg) + + +def _traceable_collective_remaps(): + # We can't rely on importing from distributed, since it's not always built + if torch.distributed.is_available(): + from torch.distributed._functional_collectives import ( + traceable_collective_remaps, + ) + + return traceable_collective_remaps + return {} + + +def _traceable_collectives_source(tx, fn): + assert torch.distributed.is_available(), "Illegal invocation." + assert fn in _traceable_collective_remaps().values() + + inner_name = fn.__name__ + path_source = tx.import_source("torch.distributed._functional_collectives") + return AttrSource(path_source, inner_name) + + +class CollectiveFunctionRewriteVariable(UserFunctionVariable): + """ + Some of the torch.distributed.* collective APIs are possible to rewrite to 'traceable' collectives. + + This class provides both a way to check if a function is remappable, and perform the remapping. + + In the case that a function is 'remappable' but only for some combinations of call-time arguments, + we check the args at `call_function` time and fall back to graph-breaking if needed. This is no worse + than status-quo as we currently graph-break on all distributed.* collectives. 
+ """ + + def __init__(self, fn, *, replacement_var, **kwargs): + super().__init__(fn, **kwargs) + assert isinstance(replacement_var, UserFunctionVariable) + self.replacement_var = replacement_var + + @staticmethod + def create(tx, old_fn, source, **options): + new_fn, new_source = CollectiveFunctionRewriteVariable.rewrite(tx, old_fn) + return CollectiveFunctionRewriteVariable( + old_fn, + replacement_var=UserFunctionVariable(new_fn, source=new_source, **options), + source=source, + **options, + ) + + @staticmethod + def can_rewrite(variable): + return ( + inspect.isfunction(variable) and variable in _traceable_collective_remaps() + ) + + @staticmethod + def rewrite(tx, fn): + new_fn = _traceable_collective_remaps()[fn] + return new_fn, _traceable_collectives_source(tx, new_fn) + + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + # call_function must check any unsupported arguments and graph-break. + # It's safe to assume args/kwargs from orig_fn map 1:1 to args/kwargs of remapped_fn, + # since that's the contract for putting a mapping in `traceable_collective_remaps` + import torch.distributed as dist + from torch.distributed._functional_collectives import REDUCE_OP_TO_STR + + # Merge args into kwargs so positional and keyword args + # can be processed the same way. + signature = inspect.signature(self.fn) + kwargs = dict(signature.bind(*args, **kwargs).arguments) + args = () + + if "async_op" in kwargs and kwargs["async_op"].as_python_constant(): + unimplemented( + f"CollectiveFunctionRewriteVariable can't support async_op=True for {self.fn}" + ) + + if kwargs.get("group") is None or kwargs["group"].value is None: + kwargs["group"] = ProcessGroupVariable.get_global_pg_variable() + + if self.fn == dist.all_reduce: + reduce_op_var = kwargs.get("op") + reduce_op = ( + reduce_op_var.value + if reduce_op_var is not None + else signature.parameters["op"].default + ) + if reduce_op not in REDUCE_OP_TO_STR: + raise ValueError(f"Unsupported all_reduce op: {reduce_op}") + kwargs["op"] = variables.ConstantVariable.create( + REDUCE_OP_TO_STR[reduce_op] + ) + return self.replacement_var.call_function(tx, args, kwargs) + + +class FunctoolsPartialVariable(VariableTracker): + def __init__(self, func: VariableTracker, args, keywords, **kwargs): + super().__init__(**kwargs) + self.func = func + assert isinstance(args, list) + self.args = args + assert isinstance(keywords, dict) + self.keywords = keywords + + def reconstruct(self, codegen): + codegen.load_import_from("functools", "partial") + codegen(self.func) + if self.args: + codegen.foreach(self.args) + if not self.keywords: + codegen.extend_output(create_call_function(len(self.args) + 1, True)) + return + + codegen.foreach(self.keywords.values()) + keys = tuple(self.keywords.keys()) + codegen.extend_output( + codegen.create_call_function_kw(len(keys) + len(self.args) + 1, keys, True) + ) + + def get_function(self): + return self.as_python_constant() + + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + merged_args = self.args + args + merged_kwargs = {**self.keywords, **kwargs} + return self.func.call_function(tx, merged_args, merged_kwargs) + + def call_hasattr(self, tx, name: str) -> VariableTracker: + # functools.partial uses slots, so attributes are constant + return variables.ConstantVariable.create( + hasattr(functools.partial(identity), name) + ) + + def as_python_constant(self): + return 
functools.partial( + self.func.as_python_constant(), + *[arg.as_python_constant() for arg in self.args], + **{k: v.as_python_constant() for k, v in self.keywords.items()}, + ) + + def guard_as_python_constant(self): + """Similar to as_python_constant(), but add ID_MATCH guards to try to force things to become constants""" + return functools.partial( + self.func.guard_as_python_constant(), + *[v.guard_as_python_constant() for v in self.args], + **{k: v.guard_as_python_constant() for k, v in self.keywords.items()}, + ) + + +class TritonKernelVariable(VariableTracker): + def __init__(self, kernel, kernel_idx, grid, **kwargs): + from triton.runtime.autotuner import Autotuner + + from torch._higher_order_ops.triton_kernel_wrap import kernel_side_table + + super().__init__(**kwargs) + + assert kernel is not None + + self.kernel = kernel + self.kernel_idx = kernel_side_table.add_kernel(kernel) + + assert kernel_idx is None or self.kernel_idx == kernel_idx + + self.grid = grid + + if isinstance(kernel, Autotuner): + # We only support configs and keys arguments of triton.autotune + # Make sure other arguments are defaulted + defaults = inspect.signature(Autotuner.__init__).parameters + + # Newer version of triton change attribute name from warmup to num_warmup and rep to num_rep. + # The call to get_first_attr is to maintain backward-compatibility. + if ( + ( + "warmup" in defaults + and defaults["warmup"].default + != get_first_attr(kernel, "num_warmups", "warmup") + ) + or ( + "rep" in defaults + and defaults["rep"].default + != get_first_attr(kernel, "num_reps", "rep") + ) + or ( + "prune_configs_by" in defaults + and defaults["prune_configs_by"].default + != kernel.early_config_prune + ) + # Set via reset_to_zero argument + or len(kernel.reset_idx) != 0 + or len(kernel.restore_idx) != 0 + ): + raise Unsupported( + "Only configs and keys are supported for triton.autotune" + ) + + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + from triton.runtime.autotuner import Autotuner + + from .constant import ConstantVariable + from .dicts import ConstDictVariable + from .lists import BaseListVariable + + if self.grid is None: + raise Unsupported("Triton kernels should always be called with a grid") + + # Both for grid's meta as well as for the kernel, we need combined + # args and kwargs normalized + names = ( + variables.ConstantVariable.create(name) for name in self.kernel.arg_names + ) + kwargs = {variables.ConstantVariable.create(k): v for k, v in kwargs.items()} + normalized_args = {**dict(zip(names, args)), **kwargs} + + configs = ( + [config.kwargs for config in self.kernel.configs] + if isinstance(self.kernel, Autotuner) + else [{}] + ) + grids = [] + for config_args in configs: + # If the grid is a function, then lets execute it and convert it to + # a list + grid = self.grid + if isinstance(grid, (NestedUserFunctionVariable, UserFunctionVariable)): + # Populate the special "meta" argument to call the grid function + config_args = { + ConstantVariable.create(k): ConstantVariable.create(v) + for k, v in config_args.items() + } + meta = ConstDictVariable({**normalized_args, **config_args}, dict) + grid = grid.call_function(tx, [meta], {}) + + # Now, the grid must be a list either originally or through above + # modification + if isinstance(grid, BaseListVariable): + grids.append(grid.as_proxy()) + else: + unimplemented(f"grid for the triton kernel is {type(grid)}") + + for i in range(len(grids)): + if not 
isinstance(grids[i], tuple): + raise Unsupported("Only tuple grids are supported") + # inductor expects all grids to be 3-tuple so lets make it + if len(grids[i]) == 1: + grids[i] = (grids[i][0], 1, 1) + elif len(grids[i]) == 2: + grids[i] = (grids[i][0], grids[i][1], 1) + elif len(grids[i]) > 3: + raise Unsupported("Grid can have at most rank 3") + + assert len(grids) != 0 + if len(set(grids)) == 1: + # If there's only one unique grid, lets simplify + grids = [grids[0]] + + from torch._higher_order_ops.triton_kernel_wrap import ( + triton_kernel_wrapper_mutation, + ) + + # Combine args and kwargs and pass as a dict so that if user defined triton + # kernel uses variables as 'grid' or 'kernel', it does not conflict with + # parameters of the wrapper function + meta = ConstDictVariable(normalized_args, dict) + tx.output.create_proxy( + "call_function", + triton_kernel_wrapper_mutation, + (), + { + "kernel_idx": self.kernel_idx, + "grid": grids, + "kwargs": meta.as_proxy(), + }, + ) + + return variables.ConstantVariable( + None, + ) + + def call_method( + self, + tx, + name, + args: "List[VariableTracker]", + kwargs: "Dict[str, VariableTracker]", + ) -> "VariableTracker": + if name == "__getitem__": + # __getitem__ should only be called if we don't already have a grid + # Only grid needs to be passed + if self.grid is not None or len(args) != 1: + raise Unsupported( + "Triton kernels should be called with only a single grid" + ) + + return TritonKernelVariable( + kernel=self.kernel, + kernel_idx=self.kernel_idx, + grid=args[0], + ) + elif name == "run": + if "grid" not in kwargs: + raise Unsupported("Triton kernel requires to be called with a grid") + grid = kwargs.pop("grid") + kwargs.pop("warmup", None) + # rewrite kernel.run(*args, grid=grid) to kernel[grid](*args) + return TritonKernelVariable( + kernel=self.kernel, kernel_idx=self.kernel_idx, grid=grid + ).call_function(tx, args, kwargs) + + # Bail out to parent's implementation + return super().call_method(tx, name, args, kwargs) diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/higher_order_ops.py b/MLPY/Lib/site-packages/torch/_dynamo/variables/higher_order_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..010188362fff8be4e86b0643d74f4f1946dce2f0 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/variables/higher_order_ops.py @@ -0,0 +1,1660 @@ +# mypy: ignore-errors + +import contextlib +import functools +import logging +import types + +from typing import Dict, List, Optional + +import torch._C +import torch.fx +import torch.nn +import torch.onnx.operators +from torch._dynamo.utils import deepcopy_to_fake_tensor, get_fake_value, get_real_value +from torch._dynamo.variables.base import VariableTracker +from torch._dynamo.variables.builtin import BuiltinVariable +from torch._dynamo.variables.functions import UserFunctionVariable +from torch._dynamo.variables.tensor import SymNodeVariable +from torch._guards import Source +from torch.fx.passes.shape_prop import _extract_tensor_metadata +from torch.utils import _pytree as pytree + +from ..exc import ( + UncapturedHigherOrderOpError, + unimplemented, + Unsupported, + UserError, + UserErrorType, +) +from ..source import AttrSource, FSDPNNModuleSource, GetItemSource, NNModuleSource +from ..utils import proxy_args_kwargs +from .dicts import ConstDictVariable +from .lists import ListVariable, TupleVariable +from .nn_module import NNModuleVariable, UnspecializedNNModuleVariable + + +log = logging.getLogger(__name__) + + +def 
raise_hard_error_if_graph_break(reason): + def deco(fn): + @functools.wraps(fn) + def graph_break_as_hard_error(*args, **kwargs): + try: + return fn(*args, **kwargs) + except Unsupported as e: + msg = " Scroll up to find out what causes the graph break." + raise UncapturedHigherOrderOpError(reason + msg) from e + + return graph_break_as_hard_error + + return deco + + +@contextlib.contextmanager +def dynamo_enable_grad(tx, enable=True): + from . import GradModeVariable + + org_value = torch.is_grad_enabled() + try: + GradModeVariable.create(tx, enable, initialized=True) + yield + finally: + GradModeVariable.create(tx, org_value, initialized=True) + + +def only_consist_of(var, types, allow_none=False): + if isinstance(var, types): + return True + if allow_none and var.is_python_constant() and var.as_python_constant() is None: + return True + if isinstance(var, (TupleVariable, ListVariable)): + return all(only_consist_of(item, types, allow_none) for item in var.items) + if isinstance(var, ConstDictVariable): + return all( + only_consist_of(item, types, allow_none) for item in var.items.values() + ) + return False + + +# A more read-able syntax sugar for creating a UserFunctionVariable for f +# and run call_function on it. Make it return a function to preserve the calling +# convention of the original f. +def _make_inlined(tx, f): + assert callable(f), "Expect f to be a python callable." + + def inline_call(*args, **kwargs): + return UserFunctionVariable(f).call_function(tx, args, kwargs) + + return inline_call + + +def _call_function_and_unflatten_output(tx, fn, args, kwargs, ret_vt, ret_treespec): + from .builder import wrap_fx_proxy + + flat_example_value = pytree.tree_map_only( + torch.fx.Proxy, + lambda a: a.node.meta["example_value"], + ret_vt.as_proxy(), + ) + + # Store the invocation as a call + flat_variable = wrap_fx_proxy( + tx=tx, + proxy=tx.output.create_proxy( + "call_function", + fn, + args=args, + kwargs=kwargs, + ), + example_value=flat_example_value, + ) + + # Transform variable back into a list (previously made into a tuple by + # speculate_subgraph function) so as to respect the pytree API typing. + flat_list_variable = BuiltinVariable(list).call_function(tx, [flat_variable], {}) + return ( + _make_inlined(tx, pytree.tree_unflatten)(flat_list_variable, ret_treespec) + if ret_treespec + else flat_variable + ) + + +def _assert_tensors_nonaliasing(inputs, outputs): + input_tensor_ids = { + id(t) for t in pytree.tree_leaves(inputs) if isinstance(t, torch.Tensor) + } + output_tensor_ids = { + id(t) for t in pytree.tree_leaves(outputs) if isinstance(t, torch.Tensor) + } + assert input_tensor_ids.isdisjoint( + output_tensor_ids + ), "inputs to function body cannot alias outputs" + + +def validate_args_and_maybe_create_graph_inputs( + sub_args, + tracer, + tx, + set_subgraph_inputs, + description, +): + from . 
import AutogradFunctionContextVariable, ConstantVariable, EnumVariable + from .builder import wrap_fx_proxy_cls + + assert tracer.parent is not None + + if set_subgraph_inputs == "flatten_manual": + flat_args, tree_spec = _make_inlined(tx, pytree.tree_flatten)( + ListVariable(sub_args) + ).unpack_var_sequence(tx) + + flat_inputs = validate_args_and_maybe_create_graph_inputs( + flat_args.unpack_var_sequence(tx), + tracer, + tx, + set_subgraph_inputs="manual", + description=description, + ) + + return _make_inlined(tx, pytree.tree_unflatten)( + ListVariable(flat_inputs), tree_spec + ).unpack_var_sequence(tx) + else: + args = [] + for a in sub_args: + assert isinstance(a, VariableTracker) + if set_subgraph_inputs == "automatic": + args.append(a) + continue + + if isinstance(a, (ConstantVariable, EnumVariable)): + # This arg is not used in the body of the higher order op. + # Currently, this new input is added to make the calls + # happy, which expect a fixed number of arguments. In + # future, we can clean this up. + tracer.create_graph_input("const") + new_arg = a + # Weird special case, we probably want to delete it or fold it + # into the next case (of `a` being placeable into a graph) + elif isinstance(a, AutogradFunctionContextVariable): + tracer.create_graph_input(a.as_proxy().node.name) + new_arg = a + # If `a` can be put into a graph + elif a.maybe_fx_node() is not None: + node = a.maybe_fx_node() + new_proxy = tracer.create_graph_input(node.name) + example_value = ( + node.meta["example_value"] if "example_value" in node.meta else None + ) + new_arg = wrap_fx_proxy_cls( + target_cls=type(a), + tx=tx, + proxy=new_proxy, + example_value=example_value, + ) + # If `a` cannot be put into a graph + else: + # HOPs work much better if they use speculate_subgraph(set_subgraph_inputs="automatic"). + raise unimplemented( + f"{description} with body that accepts non-Tensors as input. " + f"Got: {a.python_type()}" + ) + args.append(new_arg) + return args + + +# This helper function is used to make sure two graphs share the same input signature. For example, +# in torch.cond, two branches might lift different set of tensors as inputs. This function helps to +# dedup the inputs and modify the graphs to take the same set of inputs. +def _merge_graph_inputs( + l_graph, l_lifted_freevars, l_name, r_graph, r_lifted_freevars, r_name +): + def dedup_and_sort_lifted_freevars(l_lifted_freevars, r_lifted_freevars): + # The nn module attributes are guaranteed to be registered into the top-level graph module during + # higher order op speculation. Therefore, get_attr nodes in two branches with the same + # target refer to the same attribute and we can safely deduplicate them with their target. + # + # Note: ideally, dynamo should just create a single proxy for the same attribute of a nn module. But + # true_branch and false_branch belong to two separate tracing contexts, they may register the same + # attribute to top level seperately. This creates two get_attr proxies for the same attribute + # that have different meta data such as stack_trace (one stack trace for the true_branch, + # and the other for false_branch). It seems better to discard the proxy explicitly in cond + # than make dynamo create a single proxy for the same get_attr target. 
+ def shared_getattrs(l_lifted_proxies, r_lifted_proxies): + true_targets = { + proxy.node.target: proxy + for proxy in l_lifted_proxies + if proxy.node.op == "get_attr" + } + l_shared_getattrs = {} + r_shared_getattrs = {} + + for false_proxy in r_lifted_proxies: + if ( + false_proxy.node.op == "get_attr" + and false_proxy.node.target in true_targets + ): + true_proxy = true_targets[false_proxy.node.target] + l_shared_getattrs[true_proxy] = true_proxy + r_shared_getattrs[false_proxy] = true_proxy + return l_shared_getattrs, r_shared_getattrs + + l_shared_getattrs, r_shared_getattrs = shared_getattrs( + l_lifted_freevars.keys(), r_lifted_freevars.keys() + ) + + l_shared_freevars = (l_lifted_freevars.keys() & r_lifted_freevars.keys()).union( + l_shared_getattrs.keys() + ) + r_shared_freevars = (l_lifted_freevars.keys() & r_lifted_freevars.keys()).union( + r_shared_getattrs.keys() + ) + unique_l_freevars = l_lifted_freevars.keys() - l_shared_freevars + unique_r_freevars = r_lifted_freevars.keys() - r_shared_freevars + + def _sort_by_name(vars): + return sorted(vars, key=lambda var: var.node.name) + + return ( + list(_sort_by_name(list(l_shared_freevars))), + list(_sort_by_name(list(r_shared_freevars))), + list(_sort_by_name(list(unique_l_freevars))), + list(_sort_by_name(list(unique_r_freevars))), + ) + + (l_shared, r_shared, unique_l, unique_r) = dedup_and_sort_lifted_freevars( + l_lifted_freevars, r_lifted_freevars + ) + + # Let's say we capture cond(pred, true_fn, false_fn, (x,)) + # With set_graph_input set to automatic, + # true_fn has lifted variables x, a, b, c + # false_fn has lifted variables x, a, b, d + # Then fixup_branch_inps make sure both branches have the same signature, i.e.: + # - true_fn(x, a, b, c_true_branch, d_false_branch) + # - false_fn(x, a, b, c_true_branch, d_false_branch) + # + # More formally, the signature has three parts in the following order: + # 1. used in both branches: x, a, b + # 2. only used in true branches: c, suffixed with _true_branch + # 3. only used in false branches: d, suffixed with _false_branch + # Within each part, we re-order the nodes by name to have a derterministic ordering for testing. + def fixup_branch_inps(graph, lifted_freevars, shared, unique_l, unique_r): + def _insert_or_replace_phs(new_args, name_suffix): + for arg in new_args: + new_ph = graph.placeholder(arg.node.name + name_suffix) + # Override with new_ph if there exists a old placeholder. + if arg in lifted_freevars: + old_ph = lifted_freevars[arg].node + old_ph.replace_all_uses_with(new_ph) + # replace_all_uses_with doesn't clean users. Clean it mannually so that we could erase it. + old_ph.users = {} + graph.erase_node(old_ph) + + first_not_ph_node = next( + node for node in graph.nodes if node.op != "placeholder" + ) + with graph.inserting_before(first_not_ph_node): + _insert_or_replace_phs(shared, "") + _insert_or_replace_phs(unique_l, "_" + l_name) + _insert_or_replace_phs(unique_r, "_" + r_name) + + fixup_branch_inps(l_graph, l_lifted_freevars, l_shared, unique_l, unique_r) + fixup_branch_inps(r_graph, r_lifted_freevars, r_shared, unique_l, unique_r) + return l_graph, r_graph, l_shared, r_shared, unique_l, unique_r + + +# See NOTE [HigherOrderOperator tracing design] for details of the design +def speculate_subgraph( + tx, + f, + sub_args, + sub_kwargs, + description, + *, + # source_target is the .value of HigherOrderOpVariable and is the + # target of the proxy that we created for the higherOrderOperator. 
+ source_target=None, + always_restore=False, + enable_grad=None, + # NOTE [argument `set_subgraph_inputs`] + # set_subgraph_inputs controls how to construct subgraphs' placeholders from sub_args. + # 1. if your HOP supports arbitrary inputs, use set_subgraph_inputs="automatic" (most recommended). + # 2. if your HOP supports only Tensor and symnode inputs, use set_subgraph_inputs="flatten_manual" (recommended). + # If sub_args contain Pytree structure (e.g. dict/list/tuple/set), the sub_args will be flattened first. + # Then the flattened args are manually set as subgraph's placeholders. + # 3. if your HOP must preserve inputs that are not tensor or symnode as placeholders e.g. AutogradFunctionContextVariable + # use set_subgraph_inputs="manual" (not recommended). We do not recommend it in general because it has the + # restriction that users need to manually control how to create placeholders and VariableTrackers for the args. + set_subgraph_inputs="automatic", + restore_side_effects=True, + should_flatten_outputs=False, + # Pass in an originating tracer - this is needed for preserving context + # across fwd-bwd for autograd.Function + tracer=None, +): + if sub_kwargs is None: + sub_kwargs = {} + + assert set_subgraph_inputs in { + "automatic", + "flatten_manual", + "manual", + }, "Please use one of the supported set_subgraph_inputs options." + + # See NOTE [Temporary argument `set_subgraph_inputs`] + if sub_kwargs and set_subgraph_inputs != "automatic": + unimplemented("Use `set_subgraph_inputs=automatic` when passing `sub_kwargs`.") + + try: + f, sub_args, sub_kwargs = VariableTracker.apply( + # ensure guards on args get installed in parent subgraph + lambda x: x.realize(), + (f, sub_args, sub_kwargs), + ) + + with tx.output.subtracer(source_target, tracer) as subtracer: + args = validate_args_and_maybe_create_graph_inputs( + sub_args, subtracer, tx, set_subgraph_inputs, description + ) + + validate_args_and_maybe_create_graph_inputs( + sub_kwargs.values(), + subtracer, + tx, + set_subgraph_inputs="automatic", + description=description, + ) + + autograd_ctx = ( + dynamo_enable_grad(tx, enable_grad) + if enable_grad is not None + else contextlib.nullcontext() + ) + + # For handling side effects, we can make an argument that we don't + # have to do anything here. The side effects infra does a good job + # of graph breaking if we mutate any nonlocal or global variable + # while subtracing. As a result if tracing succeeds, side effects + # data structure will only contain read-only data structures that + # are put there for tracking purposes. + # But on the other hand, there is an argument that if we ever write + # a new side effect in Dynamo which does not go through the side + # effect infra, we can end up in a bad state. + # Therefore we restore the side effects after tracing. The catch is + # that we have to specially handle tensor variables. If we have seen a + # nonlocal variable tensor during subtracing, we want to keep + # track of that tensor, so that later subtracing or the root tracer + # itself does not create a new proxy for the already observed tensor + # variable.
+ if restore_side_effects: + prev_side_effects = tx.output.side_effects.clone() + + with autograd_ctx: + output = f.call_function(tx, args, sub_kwargs) + + if restore_side_effects: + new_side_effects = tx.output.side_effects.clone() + prev_side_effects.track_tensor_variables_from_runahead_side_effects( + new_side_effects + ) + tx.output.side_effects = prev_side_effects + + treespec = None + if should_flatten_outputs: + # Flatten the speculated subgraph output. + output, treespec = _make_inlined(tx, pytree.tree_flatten)( + output + ).unpack_var_sequence(tx) + # Actually, transform the list (returned by flatten) into a tuple + # for dynamo consistency. + output = BuiltinVariable(tuple).call_function(tx, [output], {}) + + # Register output to graph + # Modeled off of compile_and_call_fx_graph + # TODO: support pytree output + # We check always_restore because we dont use the output or side effects of always_restore code, + # like bwd. + if always_restore: + # Nothing left to do here + return (output, treespec), tx.output.graph, subtracer.lifted_freevars + else: + from . import TensorVariable + + if not only_consist_of(output, TensorVariable, allow_none=True): + unimplemented( + "HigherOrderOperator body's output must consist of tensors only" + ) + + # The output proxies might not belong to this SubgraphTracer + # (if they are free variables that were never lifted) + # so lift them here. + output_proxies = output.as_proxy() + output_proxies = pytree.tree_map( + subtracer.maybe_lift_tracked_freevar_to_input, output_proxies + ) + + tx.output.create_node( + "output", + "output", + (subtracer.create_arg((output_proxies,))), + {}, + ) + graph = tx.output.graph + graph.lint() + lifted_freevars = subtracer.lifted_freevars + + return ( + (output, treespec), + graph, + lifted_freevars, + ) + + except Unsupported as ex: + f_name = f"{type(f).__name__}" + if isinstance(f, UserFunctionVariable): + f_name = f.get_name() + msg = ( + f"speculate_subgraph: while introspecting {description}, we were unable " + f"to trace function `{f_name}` into a single graph. This means " + f"that Dynamo was unable to prove safety for this API and will " + f"fall back to eager-mode PyTorch, which could lead to a slowdown." 
+ ) + log.info(msg) + log.info(ex) + raise ex + + +def make_attr(tx, name): + node = tx.output.create_proxy( + "get_attr", + name, + (), + {}, + ) + return node + + +def add_subgraph(tx, source, name, gm): + next_name = None + i = 0 + while not next_name: + candidate = f"{name}_{i}" + if candidate in tx.output.nn_modules: + i += 1 + else: + next_name = candidate + + gm.__name__ = next_name + if source.guard_source().is_fsdp_module(): + src = FSDPNNModuleSource(GetItemSource(source, next_name)) + else: + src = NNModuleSource(GetItemSource(source, next_name)) + gm.torchdynamo_force_dynamic = False + tx.output.register_attr_or_module(gm, next_name, source=src) + return next_name + + +class TorchHigherOrderOperatorVariable(VariableTracker): + def __init__(self, value, source: Optional[Source] = None, **kwargs): + super().__init__(**kwargs) + self.value = value + self.source = source + + @staticmethod + def make(value, source=None, **kwargs): + if value.__name__ == "cond": + return CondHigherOrderVariable(value, source, **kwargs) + elif value.__name__ == "while_loop": + return WhileLoopHigherOrderVariable(value, source, **kwargs) + elif value.__name__ in ("map", "map_impl"): + return MapHigherOrderVariable(value, source, **kwargs) + elif value.__name__ == "executorch_call_delegate": + return ExecutorchCallDelegateHigherOrderVariable(value, source, **kwargs) + elif value.__name__ == "out_dtype": + return OutDtypeHigherOrderVariable(value, source, **kwargs) + elif value is torch._functorch.eager_transforms.grad_impl: + return FunctorchGradHigherOrderVariable(value, source, **kwargs) + elif value.__name__ == "wrap": + return WrapHigherOrderVariable(value, source, **kwargs) + elif value.__name__ in ( + "wrap_activation_checkpoint", + "tag_activation_checkpoint", + ): + return CheckpointHigherOrderVariable(value, source, **kwargs) + elif value.__name__ == "_export_tracepoint": + return ExportTracepointHigherOrderVariable(value, source, **kwargs) + elif value.__name__ == "trace_wrapped": + return TraceWrappedHigherOrderOperatorVariable(value, source, **kwargs) + elif value.__name__ == "strict_mode": + return StrictModeHigherOrderVariable(value, source, **kwargs) + else: + unimplemented(f"HigherOrderOperator {value.__name__}") + + def call_function( + self, tx, args: List[VariableTracker], kwargs: Dict[str, VariableTracker] + ) -> VariableTracker: + unimplemented(f"HigherOrderOperator {self.value.__name__}") + + +class CondHigherOrderVariable(TorchHigherOrderOperatorVariable): + @raise_hard_error_if_graph_break( + reason="Cond doesn't work unless it is captured completely with torch.compile." + ) + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + from . 
import ( + ConstantVariable, + ListVariable, + NestedUserFunctionVariable, + TensorVariable, + UserFunctionVariable, + ) + + args, kwargs = VariableTracker.apply(lambda x: x.realize(), (args, kwargs)) + + for i, k in enumerate(["pred", "true_fn", "false_fn", "operands"]): + if v := kwargs.pop(k, None): + assert i == len( + args + ), "did not provide the right number of non-keyword args" + args.append(v) + + if kwargs: + unimplemented(f"torch.cond: Got unexpected kwargs: {list(kwargs.keys())}") + + # TODO(voz): Support fake tensor dispatch for recursive + # ops - see torch/dispatch/_dispatcher.py + if len(args) != 4: + unimplemented( + f"Expected 4 arguments but got {len(args)}.\n" + f"Usage: cond(pred, true_fn, false_fn, operands)", + ) + # predicate + if type(args[0]) not in (ConstantVariable, TensorVariable, SymNodeVariable): + unimplemented( + f"Expected pred to be bool or a boolean tensor with single " + f"item but got {str(type(args[0]))} " + f"with original python type {str(args[0].python_type())}.", + ) + + # operands + if not isinstance(args[3], (ListVariable, TupleVariable)): + unimplemented( + f"Expected a tuple but got {args[3].python_type()}", + ) + operands = args[3].unpack_var_sequence(tx) + if not only_consist_of(args[3], (TensorVariable,)): + unimplemented( + "Expect operands to be a tuple of pytrees that only consists of tensor leaves." + ) + + # branches + assert isinstance( + args[1], + ( + UserFunctionVariable, + NestedUserFunctionVariable, + NNModuleVariable, + UnspecializedNNModuleVariable, + ), + ), str( + type(args[1]) + ) # true_fn + + assert isinstance( + args[2], + ( + UserFunctionVariable, + NestedUserFunctionVariable, + NNModuleVariable, + UnspecializedNNModuleVariable, + ), + ), str( + type(args[2]) + ) # false_fn + + # Our strategy for tracing the true/false branches of cond + # are to checkpoint our graphstate, run the true branch, + # roll it back to the checkpoint, and run the false + # branch, and then merge the graphstates. Well, perhaps + # "merge" is too strong a word: we mostly assert that + # the resulting graphstates have to be the same. + # + # We only permit guards to diverge (we union the guards from + # both branches). In particular, this means that side + # effects are NOT permitted inside true/false branches; this + # would be difficult to implement, because of the path + # explosion problem. 
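+ # + # Illustrative call shape that can be captured here (matching the argument checks above): + # torch.cond(pred, true_fn, false_fn, (x,)) + # Each branch is traced below into its own subgraph; both results must share the + # same pytree structure and matching tensor metadata, otherwise we error out, since + # cond must be captured completely.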
+ + def speculate_branch(branch): + # NB: 0 is predicate + ix = 1 if branch else 2 + # TODO: Support kwargs + ( + (ret_val, ret_treespec), + ret_graph, + ret_lifted_freevars, + ) = speculate_subgraph( + tx, + args[ix], + operands, + {}, + "cond", + source_target=self.value, + should_flatten_outputs=True, + ) + + if not only_consist_of(ret_val, (TensorVariable,)): + unimplemented( + "Expected branches to return a possibly nested list/tuple/dict of tensors but it consists of non tensors.", + ) + return ret_val, ret_treespec, ret_graph, ret_lifted_freevars + + (true_r, true_treespec, true_graph, true_lifted_freevars) = speculate_branch( + True + ) + true_nn_modules = dict(tx.output.nn_modules) + + ( + false_r, + false_treespec, + false_graph, + false_lifted_freevars, + ) = speculate_branch(False) + false_nn_modules = dict(tx.output.nn_modules) + + same_treespec = _make_inlined(tx, pytree.TreeSpec.__eq__)( + true_treespec, false_treespec + ) + if not same_treespec.as_python_constant(): + unimplemented("Expected branches to return the same pytree structure.") + + def diff_meta(tensor_vars1, tensor_vars2): + assert all( + isinstance(var, TensorVariable) for var in tensor_vars1 + tensor_vars2 + ) + all_diffs = [] + for i, (var1, var2) in enumerate(zip(tensor_vars1, tensor_vars2)): + # We check the meta data associated with meta["example_value"] + meta1 = _extract_tensor_metadata( + var1.proxy.node.meta["example_value"], include_contiguity=False + ) + meta2 = _extract_tensor_metadata( + var2.proxy.node.meta["example_value"], include_contiguity=False + ) + if meta1 != meta2: + all_diffs.append((f"pair{i}:", meta1, meta2)) + return all_diffs + + if diffs := diff_meta( + true_r.unpack_var_sequence(tx), false_r.unpack_var_sequence(tx) + ): + unimplemented( + f"Expected branches to return tensors with same metadata. [(tensor_pair, difference)...]:{diffs}" + ) + + ( + true_graph, + false_graph, + true_shared, + false_shared, + unique_true, + unique_false, + ) = _merge_graph_inputs( + true_graph, + true_lifted_freevars, + "true_branch", + false_graph, + false_lifted_freevars, + "false_branch", + ) + + true_name = add_subgraph( + tx, + self.source, + "cond_true", + torch.fx.GraphModule(true_nn_modules, true_graph), + ) + false_name = add_subgraph( + tx, + self.source, + "cond_false", + torch.fx.GraphModule(false_nn_modules, false_graph), + ) + + true_node = make_attr(tx, true_name) + false_node = make_attr(tx, false_name) + + p_args = ( + args[0].as_proxy(), + true_node, + false_node, + # We pick true_shared but it shouldn't matter + true_shared + unique_true + unique_false, + ) + + return _call_function_and_unflatten_output( + tx, torch.ops.higher_order.cond, p_args, {}, true_r, true_treespec + ) + + +class WhileLoopHigherOrderVariable(TorchHigherOrderOperatorVariable): + @raise_hard_error_if_graph_break( + reason="while_loop doesn't work unless it is captured completely with torch.compile." + ) + def call_function( + self, tx, args: List[VariableTracker], kwargs: Dict[str, VariableTracker] + ) -> VariableTracker: + from . 
import NestedUserFunctionVariable, TensorVariable, UserFunctionVariable + + args, kwargs = VariableTracker.apply(lambda x: x.realize(), (args, kwargs)) + + for i, k in enumerate(["cond_fn", "body_fn", "operands"]): + if v := kwargs.pop(k, None): + assert i == len( + args + ), "did not provide the right number of non-keyword args" + args.append(v) + + if kwargs: + unimplemented( + f"torch.while_loop: Got unexpected kwargs: {list(kwargs.keys())}" + ) + + if len(args) != 3: + unimplemented( + f"Expected 3 arguments but got {len(args)}.\n" + f"Usage: while_loop(cond_fn, body_fn, operands)", + ) + + def _check_supported_callable(fn_var): + assert isinstance( + fn_var, + ( + UserFunctionVariable, + NestedUserFunctionVariable, + NNModuleVariable, + UnspecializedNNModuleVariable, + ), + ), str(type(fn_var)) + + _check_supported_callable(args[0]) + _check_supported_callable(args[1]) + + # operands + if not isinstance(args[2], (ListVariable, TupleVariable)): + unimplemented( + f"Expected a tuple but got {args[2].python_type()}", + ) + + operands = args[2].unpack_var_sequence(tx) + if not only_consist_of(args[2], (TensorVariable,)): + unimplemented( + "Expect operands to be a tuple of pytrees that only consists of tensor leaves." + ) + + ( + (cond_r, cond_treespec), + cond_graph, + cond_lifted_freevars, + ) = speculate_subgraph( + tx, args[0], operands, {}, "while_loop", source_target=self.value + ) + cond_nn_modules = dict(tx.output.nn_modules) + if not isinstance(cond_r, TensorVariable): + unimplemented( + f"Expected cond_fn to return a tensor but got {cond_r.python_type()}", + ) + + cond_r_meta = _extract_tensor_metadata( + cond_r.proxy.node.meta["example_value"], include_contiguity=False + ) + if not cond_r_meta.dtype == torch.bool or not cond_r_meta.shape == torch.Size( + [] + ): + unimplemented( + f"Expected cond_fn to return a tensor with shape (,) but got {cond_r_meta.shape}" + ) + + ( + (body_r, body_treespec), + body_graph, + body_lifted_freevars, + ) = speculate_subgraph( + tx, + args[1], + operands, + {}, + "while_loop", + source_target=self.value, + should_flatten_outputs=True, + ) + body_nn_modules = dict(tx.output.nn_modules) + + ( + cond_graph, + body_graph, + cond_shared, + body_shared, + cond_unique, + body_unique, + ) = _merge_graph_inputs( + cond_graph, + cond_lifted_freevars, + "cond_fn", + body_graph, + body_lifted_freevars, + "body_fn", + ) + # We pick cond_shared but it shouldn't matter + merged_input = tuple(cond_shared + cond_unique + body_unique) + + cond_name = add_subgraph( + tx, + self.source, + "cond_fn", + torch.fx.GraphModule(cond_nn_modules, cond_graph), + ) + body_name = add_subgraph( + tx, + self.source, + "body_fn", + torch.fx.GraphModule(body_nn_modules, body_graph), + ) + + cond_node = make_attr(tx, cond_name) + body_node = make_attr(tx, body_name) + + p_args = ( + cond_node, + body_node, + merged_input, + ) + + return _call_function_and_unflatten_output( + tx, torch.ops.higher_order.while_loop, p_args, {}, body_r, body_treespec + ) + + +def non_single_tensor_return_unsupported(api, ret): + from . import TensorVariable + + if not isinstance(ret, TensorVariable): + raise Unsupported( + f"{api} over function that returns something " f"other than one Tensor" + ) + + +class MapHigherOrderVariable(TorchHigherOrderOperatorVariable): + def call_function( + self, tx, args: List[VariableTracker], kwargs: Dict[str, VariableTracker] + ) -> VariableTracker: + from . 
import NestedUserFunctionVariable, TensorVariable, UserFunctionVariable + from .builder import wrap_fx_proxy_cls + + if len(kwargs) > 0: + unimplemented( + "torch.ops.higher_order.map: kwargs are not supported in the map operator." + ) + + assert type(args[0].realize()) in ( + UserFunctionVariable, + NestedUserFunctionVariable, + ) + assert type(args[1].realize()) is TensorVariable + + sample_shape = get_fake_value(args[1].as_proxy().node, tx).size() + + if len(sample_shape) < 1 or sample_shape[0] == 0: + unimplemented( + "map() operator doesn't support scalar or zero-sized tensors during tracing." + ) + + # To get the example output from map() we will need to provide at least one sample to + # the loop body. In our case we will always use xs[0], and our map() won't support zero + # sized tensor during tracing. + first_dim = wrap_fx_proxy_cls( + target_cls=TensorVariable, tx=tx, proxy=args[1].as_proxy()[0] + ) + + # TODO: Support kwargs + ( + (body_r, body_spec), + body_graph, + body_lifted_freevars, + ) = speculate_subgraph( + tx, + args[0], + [ + first_dim, + *args[2:], + ], + {}, + "torch.ops.higher_order.map", + source_target=self.value, + set_subgraph_inputs="flatten_manual", + should_flatten_outputs=True, + ) + + body_nn_modules = dict(tx.output.nn_modules) + + body_name = add_subgraph( + tx, + self.source, + "map_body", + torch.fx.GraphModule(body_nn_modules, body_graph), + ) + + body_node = make_attr(tx, body_name) + + p_args = ( + body_node, + [args[1].as_proxy()], + [arg.as_proxy() for arg in args[2:]] + list(body_lifted_freevars.keys()), + ) + return _call_function_and_unflatten_output( + tx, torch.ops.higher_order.map_impl, p_args, {}, body_r, body_spec + ) + + +class ExecutorchCallDelegateHigherOrderVariable(TorchHigherOrderOperatorVariable): + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + from .builder import wrap_fx_proxy + + # This is operator for delegation within Executorch which calls a + # specific function in the given lowered module with the given + # operators. The actual operator is defined in the Executorch codebase. + # This is a bad hierarchical violation since + # executorch_call_delegate sits at a higher level than dynamo, but + # there's no real solution to this issue yet. + if len(kwargs) > 0: + unimplemented( + "executorch_call_delegate: kwargs arguments were not enabled." + ) + lowered_module = tx.output.get_submodule(args[0].module_key) + + lowered_node = make_attr(tx, args[0].module_key) + + p_args = tuple(arg.as_proxy() for arg in args[1:]) + real_sub_args = pytree.tree_map_only( + torch.fx.Proxy, lambda a: get_real_value(a.node, tx.output), p_args + ) + + example_res = lowered_module.original_module.module()(*real_sub_args) + + # NOTE [Guaranteeing the 1-1 correspondence of FakeTensors and real tensors]: + # executorch modules promise not to alias inputs and outputs. + # Thus, output FakeTensors will correctly not alias input FakeTensors. 
+ _assert_tensors_nonaliasing(real_sub_args, example_res) + + example_value = deepcopy_to_fake_tensor(example_res, tx.fake_mode) + + p_args = (lowered_node,) + p_args + + # Store the invocation as a call + return wrap_fx_proxy( + tx=tx, + proxy=tx.output.create_proxy( + "call_function", + self.value, + args=tuple(p_args), + kwargs={}, + ), + example_value=example_value, + ) + + +class FunctorchGradHigherOrderVariable(TorchHigherOrderOperatorVariable): + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + from . import ConstantVariable + from .builder import wrap_fx_proxy + + # TODO: Support `fn` with kwargs. + if not torch._dynamo.config.capture_func_transforms: + unimplemented( + "torch.func.grad capture is disabled, " + "it can be turned on by setting " + "`torch._dynamo.config.capture_func_transforms=True`" + ) + # [NOTE] Here we are (roughly) modelling the following + # + # grad_fn = torch.func.grad(fn, argnums=.., has_aux=..) + # grad_output = grad_fn(x) + grad_args = (args[0], args[1], args[2]) + + # get arguments + func, argnums, has_aux = grad_args + kwargs = args[4].items + if len(kwargs) > 0: + # Since speculate_subgraph doesn't support kwargs, we can't handle this for now. + unimplemented( + "torch.func.grad: kwargs arguments are currently unsupported." + ) + + # Trace through the `func` + # NOTE [HACK: Enable autograd while tracing function] + # `torch.func.grad` should not be affected by `no_grad` outside of `grad`. + # So, we enable_grad right before the function to which `grad` is applied + # (the parts explicitly disabled with `no_grad` inside the function are still disabled). + # Eg. + # def f(x): + # with no_grad(): # This will disable grad tracking under it. + # y = x * 2 + # + # return x ** 2 - y # grad tracking should be enabled irrespective of outside `no_grad`. + # + # with no_grad(): # This will not disable grad tracking inside of grad(f). + # grad_o = torch.func.grad(f)(x) + # TODO: Support kwargs + (body_r, _), body_graph, body_lifted_freevars = speculate_subgraph( + tx, + func, + args[3].items, + {}, + "torch.func.grad", + source_target=self.value, + # See NOTE [HACK: Enable autograd while tracing function] + enable_grad=True, + set_subgraph_inputs="manual", + ) + + body_name = add_subgraph( + tx, + self.source, + "grad_body", + torch.fx.GraphModule(tx.output.nn_modules, body_graph), + ) + body_node = make_attr(tx, body_name) + grad_proxy_args = ( + body_node, + *(arg.as_proxy() for arg in grad_args[1:]), + ) + + # Model `grad_fn = grad(fn, *grad_args, **grad_kwargs)` + grad_fn = tx.output.create_proxy( + "call_function", + torch.func.grad, + args=tuple(grad_proxy_args), + kwargs={}, + name="grad_proxy", + ) + + # Pass lifted freevars to the call to `grad_fn` + args = args[3].items + grad_fn_args = tuple(arg.as_proxy() for arg in args) + tuple( + body_lifted_freevars + ) + + # Call grad_fn with inputs. + # grad_output = grad_fn(*grad_fn_args, **grad_fn_kwargs) + grad_output = grad_fn(*grad_fn_args) + + # `grad_fn(*grad_fn_args, **grad_fn_kwargs)` + # Output of grad_fn is + # For has_aux=False, Tuple[gradients of inputs indicated by argnums]. + # For has_aux=True, Tuple[Tuple[gradients of inputs indicated by argnums], aux values] + # NOTE: example_value should match `grad_output`. 
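+ # Illustration of the torch.func semantics modeled here: grad(f, argnums=(0, 1), has_aux=True)(x, y) + # evaluates to ((grad_x, grad_y), aux), while an int argnums yields a single gradient tensor; + # example_value below is built to mirror that nesting.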
+ def _from_args(idx): + return args[idx].as_proxy().node.meta["example_value"].contiguous() + + def to_python_ints(argnums): + if not isinstance(argnums, (ConstantVariable, TupleVariable)): + raise UserError( + UserErrorType.INVALID_INPUT, + f"argnums is expected to be int or tuple of ints. Got {argnums}.", + ) + + if isinstance(argnums, ConstantVariable): + if not isinstance(argnums.value, (int, tuple)): + raise UserError( + UserErrorType.INVALID_INPUT, + f"argnums is expected to be int or tuple of ints. Got {argnums}.", + ) + return argnums.value + else: + const_vars = argnums.unpack_var_sequence(tx) + if not all( + isinstance(var, ConstantVariable) and isinstance(var.value, int) + for var in const_vars + ): + raise UserError( + UserErrorType.INVALID_INPUT, + f"argnums is expected to contain int only. Got {const_vars}.", + ) + return tuple(var.value for var in const_vars) + + argnums_v = to_python_ints(argnums) + example_value = pytree.tree_map(_from_args, argnums_v) + + if has_aux.value: + # case : has_aux = True + # NOTE: Currently speculate subgraph allows body_r to be + # Tensor or Tuple/List of Tensor. + # Since `grad` expects output with has_aux + # to be (output, aux), only valid output currently is + # (output, some_tensor) + body_r_proxy = body_r.as_proxy() + aux = body_r_proxy[1].node.meta["example_value"] + example_value = (example_value, aux) + + fx_proxy = wrap_fx_proxy(tx=tx, proxy=grad_output, example_value=example_value) + + # Call contiguous on all the computed grads. + if not has_aux.value: + if isinstance(argnums_v, int): + return fx_proxy.call_method(tx, "contiguous", (), {}) + else: + grads = fx_proxy + items = [] + for idx in range(len(argnums_v)): + proxy = grads.call_method( + tx, "__getitem__", (ConstantVariable.create(idx),), {} + ).call_method(tx, "contiguous", (), {}) + items.append(proxy) + return TupleVariable(items) + else: # case: has_aux.value = True + # fx_proxy -> Tuple(grads, aux) + grads = fx_proxy.call_method( + tx, "__getitem__", (ConstantVariable.create(0),), {} + ) + aux = fx_proxy.call_method( + tx, "__getitem__", (ConstantVariable.create(1),), {} + ) + if isinstance(argnums_v, int): + return TupleVariable([grads.call_method(tx, "contiguous", (), {}), aux]) + else: + items = [] + for idx in range(len(argnums_v)): + proxy = grads.call_method( + tx, "__getitem__", (ConstantVariable.create(idx),), {} + ).call_method(tx, "contiguous", (), {}) + items.append(proxy) + return TupleVariable([TupleVariable(items), aux]) + + +class FunctorchHigherOrderVariable(UserFunctionVariable): + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + if not torch._dynamo.config.capture_func_transforms: + name = self.get_name() + assert name in ("grad_impl", "vmap_impl") + fn = name.split("_")[0] + unimplemented( + f"torch.func.{fn} capture is disabled, " + "it can be turned on by setting " + "`torch._dynamo.config.capture_func_transforms=True`" + ) + return super().call_function(tx, args, kwargs) + + +class WrapHigherOrderVariable(TorchHigherOrderOperatorVariable): + def create_wrapped_node(self, tx, args, kwargs, description): + # See NOTE [HigherOrderOperator tracing design] for more details + + ( + (body_r, treespec), + body_graph, + body_lifted_freevars, + ) = speculate_subgraph( + tx, + args[0], # function + [*args[1:]], + kwargs, + description, + source_target=self.value, + should_flatten_outputs=True, + ) + + body_gmod = torch.fx.GraphModule(tx.output.nn_modules, body_graph) + body_name = 
add_subgraph( + tx, + self.source, + "wrap_body", + body_gmod, + ) + + body_node = make_attr(tx, body_name) + + # Since, we call `speculate_subgraph` with `set_subgraph_inputs="automatic`, + # all the arguments are lifted. + lifted_args = tuple(arg for arg in body_lifted_freevars.keys()) + + proxy_args = (body_node,) + lifted_args + example_value = pytree.tree_map_only( + torch.fx.Proxy, + lambda a: a.node.meta["example_value"], + body_r.as_proxy(), + ) + + return proxy_args, {}, example_value, body_r, treespec, body_gmod + + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + # This flattens the kwargs into lifted args + p_args, p_kwargs, example_value, body_r, treespec, _ = self.create_wrapped_node( + tx, args, kwargs, "wrap" + ) + + if len(p_kwargs) > 0: + unimplemented("kwargs should have been flattened into lifted args") + + return _call_function_and_unflatten_output( + tx, self.value, tuple(p_args), p_kwargs, body_r, treespec + ) + + +class OutDtypeHigherOrderVariable(TorchHigherOrderOperatorVariable): + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + from .builder import wrap_fx_proxy + + if len(kwargs) > 0: + unimplemented("out_dtype does not handle kwargs") + + p_args = tuple(arg.as_proxy() for arg in args) + op = p_args[0] + output_dtype = p_args[1] + fake_sub_args = pytree.tree_map_only( + torch.fx.Proxy, lambda a: a.node.meta["example_value"], p_args[2:] + ) + # This is a simplified implementation of this operator just for tracing. + # Actual implementation may also first promote the arguments + example_value = op(*fake_sub_args).to(dtype=output_dtype) + + # Store the invocation as a call + return wrap_fx_proxy( + tx=tx, + proxy=tx.output.create_proxy( + "call_function", + self.value, + args=tuple(p_args), + kwargs={}, + ), + example_value=example_value, + ) + + +class StrictModeHigherOrderVariable(TorchHigherOrderOperatorVariable): + @raise_hard_error_if_graph_break( + reason="strict_mode HOO doesn't work unless it is captured completely with torch.compile." 
+ ) + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + from .builder import wrap_fx_proxy + + callable = args[0] + + unpacked_sequence = args[1].unpack_var_sequence(tx) + # TODO (tmanlaibaatar) support pytree here + for arg in unpacked_sequence: + if isinstance(arg, (ListVariable, TupleVariable, ConstDictVariable)): + unimplemented("strict_mode HOO only works for flat inputs for now") + + if kwargs: + unimplemented( + f"strict_mode HOO received unexpected kwargs: {list(kwargs.keys())}" + ) + + ( + (ret_val, ret_treespec), + ret_graph, + ret_lifted_freevars, + ) = speculate_subgraph( + tx, + args[0], + unpacked_sequence, + {}, + "strict_mode", + source_target=self.value, + should_flatten_outputs=True, + ) + + strict_mode_nn_modules = dict(tx.output.nn_modules) + + strict_mode_name = add_subgraph( + tx, + self.source, + "strict_mode_body", + torch.fx.GraphModule(strict_mode_nn_modules, ret_graph), + ) + + strict_mode_node = make_attr(tx, strict_mode_name) + p_args = ( + strict_mode_node, + tuple(arg for arg in ret_lifted_freevars.keys()), + ) + + flat_example_value = pytree.tree_map_only( + torch.fx.Proxy, + lambda a: a.node.meta["example_value"], + ret_val.as_proxy(), + ) + + # Store the invocation as a call + flat_variable = wrap_fx_proxy( + tx=tx, + proxy=tx.output.create_proxy( + "call_function", + torch.ops.higher_order.strict_mode, + args=tuple(p_args), + kwargs={}, + ), + example_value=flat_example_value, + ) + + return _call_function_and_unflatten_output( + tx, torch.ops.higher_order.strict_mode, p_args, {}, ret_val, ret_treespec + ) + + +class CheckpointHigherOrderVariable(WrapHigherOrderVariable): + def call_function( + self, tx, args: List[VariableTracker], kwargs: Dict[str, VariableTracker] + ) -> VariableTracker: + from torch._higher_order_ops.wrap import TagActivationCheckpoint + from torch.utils.checkpoint import noop_context_fn + from .builder import wrap_fx_proxy + + context_fn = None + if "context_fn" in kwargs and kwargs["context_fn"] != noop_context_fn: + ctx = kwargs.pop("context_fn") + if isinstance(ctx, torch._dynamo.variables.UserFunctionVariable): + context_fn = ctx.fn + elif isinstance( + ctx, torch._dynamo.variables.functions.FunctoolsPartialVariable + ): + context_fn = ctx.as_python_constant() + else: + raise NotImplementedError( + f"checkpoint not implemented for {type(ctx)} context_fn" + ) + + checkpoint_kwargs, gmod_kwargs = TagActivationCheckpoint.divide_kwargs(kwargs) + + # Here we use checkpoint_kwargs (and not gmod kwargs). gmod_kwargs are + # already flattened above and managed inside the fx graph. + ( + p_args, + _, + example_value, + body_r, + treespec, + checkpointed_gmod, + ) = self.create_wrapped_node( + tx, args, gmod_kwargs, "torch.utils.checkpoint.checkpoint" + ) + if context_fn is not None: + checkpointed_gmod.meta["_checkpoint_context_fn"] = context_fn + + _, checkpoint_kwargs = proxy_args_kwargs([], checkpoint_kwargs) + + # Store the invocation as a call + variable = wrap_fx_proxy( + tx=tx, + proxy=tx.output.create_proxy( + "call_function", + self.value, + args=tuple(p_args), + kwargs=checkpoint_kwargs, + ), + example_value=example_value, + ) + + if treespec is None: + return variable + + # Transform variable back into a list (previously made into a tuple by + # speculate_subgraph function) so as to respect the pytree API typing. 
+ variable = BuiltinVariable(list).call_function(tx, [variable], {}) + + return _make_inlined(tx, pytree.tree_unflatten)(variable, treespec) + + +class ExportTracepointHigherOrderVariable(TorchHigherOrderOperatorVariable): + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + from .builder import wrap_fx_proxy + + p_args = tuple(arg.as_proxy() for arg in args) + p_kwargs = {key: arg.as_proxy() for key, arg in kwargs.items()} + return wrap_fx_proxy( + tx=tx, + proxy=tx.output.create_proxy( + "call_function", + self.value, + args=p_args, + kwargs=p_kwargs, + ), + example_value=None, + ) + + +class TraceWrappedHigherOrderOperatorVariable(TorchHigherOrderOperatorVariable): + """ + Handles torch._dynamo._trace_wrapped_higher_order_op.inner_trace + by unwrapping the higher order op and inlining through it. This op + is created by dynamo to survive through AotAutograd, then unwrapped + here in the call to dynamo from compiled autograd. + """ + + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + kwargs = dict(kwargs) + fn = kwargs.pop("fn") + return fn.call_function(tx, args, kwargs) + + +class AutogradFunctionApplyVariable(VariableTracker): + def __init__(self, fwd_graph, bwd_graph, parent_source, **kwargs): + super().__init__(**kwargs) + self.fwd_graph = fwd_graph + self.bwd_graph = bwd_graph + self.parent_source = parent_source + + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + from . import ( + AutogradFunctionContextVariable, + UserDefinedClassVariable, + UserFunctionVariable, + UserMethodVariable, + ) + from .builder import wrap_fx_proxy + + """ + Consider the following: + class MySin(torch.autograd.Function): + @staticmethod + def forward(ctx, x): + ctx.save_for_backward(x) + return x.sin() + @staticmethod + def backward(ctx, grad): + x, = ctx.saved_tensors + return grad * x.cos() + We want the resulting graphs to look like: + def fwd(ctx, x): + # (output, saved tensors / attrs) + return (x.sin(), [x]) + # bwd(ctx, grad0, grad1, ..., gradn, *saved_tensors_or_attrs) + def bwd(ctx, grad, x): + return grad * x.cos() + To accomplish this, we're going to: + 1. Construct a ctx object + 2. (fwd_out, _), fwd_graph, fwd_freevars = speculate_subgraph on MySin.forward (manually_set_inputs=True) + 3. (bwd_out, _), bwd_graph, bwd_freevars = speculate_subgraph on MySin.backward, while manually setting + the ctx and grad inputs. + 4. Manually rewriting the fwd graph's output to be (output, stuff_that_gets_used in bwd_graph) + Getting from 3 to 4 is pretty elegant: stuff_that_gets_used in bwd graph is + just the bwd_freevars returned from speculate_subgraph, assuming MySin.backward + doesn't capture any arguments. + All these steps work if MySin.backward doesn't capture any values. This is a + limitation in general that we should check for. 
+ """ + + prev_side_effects = tx.output.side_effects.clone() + fwd_tracer = torch._dynamo.output_graph.SubgraphTracer( + tx.output, + parent=tx.output.current_tracer, + source_target="autograd.Function", + ) + + fwd_src = AttrSource(self.parent_source, member="forward") + ctx = AutogradFunctionContextVariable.create(tx) + if isinstance(self.fwd_graph, types.FunctionType): + fwd_fn = UserFunctionVariable(self.fwd_graph, source=fwd_src) + fwd_args = [ctx, *args] + elif isinstance(self.fwd_graph, types.MethodType): + fwd_fn = UserMethodVariable( + self.fwd_graph.__func__, + UserDefinedClassVariable(self.fwd_graph.__class__), + source=fwd_src, + ) + fwd_args = [fwd_fn.obj, ctx, *args] + else: + unimplemented("non-function or method") + + # Speculate subgraph on the fwd + (fwd_out, _), fwd_graph, fwd_freevars = speculate_subgraph( + tx, + fwd_fn, + fwd_args, + kwargs, + "autograd.Function", + set_subgraph_inputs="manual", + restore_side_effects=False, + tracer=fwd_tracer, + ) + + if fwd_freevars: + unimplemented("NYI") + + if ctx.mutable_local in tx.output.side_effects.store_attr_mutations: + if ( + "_materialize_non_diff_grads" + in tx.output.side_effects.store_attr_mutations[ctx.mutable_local] + ): + unimplemented("NYI") + + bwd_tracer = torch._dynamo.output_graph.SubgraphTracer( + tx.output, + parent=fwd_tracer, + source_target="autograd.Function", + ) + + # Speculate subgraph on the backward. We make the + # bwd tracer a child of the fwd tracer, because backward may rely on + # tensors/attrs created in the fwd tracer. + + from .lists import BaseListVariable + + if isinstance(fwd_out, BaseListVariable): + bwd_args = [ctx, *fwd_out.items] + else: + bwd_args = [ctx, fwd_out] + + bwd_src = AttrSource(self.parent_source, member="backward") + if isinstance(self.bwd_graph, types.FunctionType): + bwd_fn = UserFunctionVariable(self.bwd_graph, source=bwd_src) + elif isinstance(self.bwd_graph, types.MethodType): + bwd_fn = UserMethodVariable( + self.bwd_graph.__func__, + UserDefinedClassVariable(self.bwd_graph.__class__), + source=bwd_src, + ) + bwd_args = [bwd_fn.obj, *bwd_args] + else: + unimplemented("non-function or method") + + with tx.output.subtracer(fwd_fn, fwd_tracer), tx.strict_translation_mode(): + (bwd_out, _), bwd_graph, bwd_freevars = speculate_subgraph( + tx, + bwd_fn, + bwd_args, + kwargs, + "autograd.Function", + enable_grad=False, + set_subgraph_inputs="manual", + restore_side_effects=False, + tracer=bwd_tracer, + ) + + # TODO: assert that bwd_graph didn't capture values that were + # not created inside fwd_graph. 
+ + # TODO(oulgen): Ideally, we would not do a linear search for output + # node but as things currently are there could be nodes after the + # output node + # This is bug prone as if there's code after the output node, then + # graph.output will append the output at the very end + # This might be a behavior difference + + # Rewrite the output of fwd_graph to (output, stuff_necessary_for_bwd) + for node in fwd_graph.nodes: + if node.op == "output": + fwd_graph.erase_node(node) + break + + new_fwd_graph_outputs = (fwd_out.as_proxy(), list(bwd_freevars.keys())) + new_fwd_graph_outputs = pytree.tree_map(lambda x: x.node, new_fwd_graph_outputs) + fwd_graph.output(new_fwd_graph_outputs) + + # Store fwd_body + fwd_nn_modules = tx.copy_graphstate().output.nn_modules + fwd_name = add_subgraph( + tx, + fwd_src, + "fwd_body", + torch.fx.GraphModule(fwd_nn_modules.nn_modules, fwd_graph), + ) + + fwd_node = make_attr(tx, fwd_name) + + # Store bwd_body + bwd_nn_modules = tx.copy_graphstate().output.nn_modules + bwd_name = add_subgraph( + tx, + bwd_src, + "bwd_body", + torch.fx.GraphModule(bwd_nn_modules.nn_modules, bwd_graph), + ) + + bwd_node = make_attr(tx, bwd_name) + + tx.output.side_effects = prev_side_effects + + p_args = (fwd_node, bwd_node, *(arg.as_proxy() for arg in args)) + example_value = pytree.tree_map_only( + torch.fx.Proxy, + lambda a: a.node.meta["example_value"], + fwd_out.as_proxy(), + ) + + # Store the invocation as a call + from torch._functorch.autograd_function import autograd_function_apply + + return wrap_fx_proxy( + tx=tx, + proxy=tx.output.create_proxy( + "call_function", + autograd_function_apply, + args=p_args, + kwargs={}, + ), + example_value=example_value, + ) diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/iter.py b/MLPY/Lib/site-packages/torch/_dynamo/variables/iter.py new file mode 100644 index 0000000000000000000000000000000000000000..39968559a9caddb04ad42e221cf09a17f3b22e64 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/variables/iter.py @@ -0,0 +1,260 @@ +# mypy: ignore-errors + +MAX_CYCLE = 3000 + +import itertools +import operator + +from typing import Dict, List, Optional + +from .. 
import polyfill, variables +from ..exc import unimplemented + +from .base import MutableLocal, VariableTracker +from .constant import ConstantVariable + + +class ItertoolsVariable(VariableTracker): + def __init__(self, value, **kwargs): + super().__init__(**kwargs) + self.value = value + + def __repr__(self): + return f"ItertoolsVariable({self.value})" + + def python_type(self): + return type(self.value) + + def as_python_constant(self): + return self.value + + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + if ( + self.value is itertools.product + and not kwargs + and all(arg.has_unpack_var_sequence(tx) for arg in args) + ): + seqs = [arg.unpack_var_sequence(tx) for arg in args] + items = [] + for item in itertools.product(*seqs): + items.append(variables.TupleVariable(list(item))) + return variables.ListIteratorVariable(items, mutable_local=MutableLocal()) + elif ( + self.value is itertools.chain + and not kwargs + and all(arg.has_unpack_var_sequence(tx) for arg in args) + ): + seqs = [arg.unpack_var_sequence(tx) for arg in args] + items = list(itertools.chain.from_iterable(seqs)) + return variables.ListIteratorVariable(items, mutable_local=MutableLocal()) + elif self.value is itertools.accumulate: + from .builtin import BuiltinVariable + + if any(key not in ["initial", "func"] for key in kwargs.keys()): + unimplemented( + "Unsupported kwargs for itertools.accumulate: " + f"{','.join(set(kwargs.keys()) - {'initial', 'func'})}" + ) + + acc = kwargs.get("initial") + + if len(args) in [1, 2] and args[0].has_unpack_var_sequence(tx): + seq = args[0].unpack_var_sequence(tx) + + if "func" in kwargs and len(args) == 1: + func = kwargs["func"].call_function + elif len(args) == 2: + func = args[1].call_function + elif len(args) == 1: + # Default to operator.add + func = BuiltinVariable(operator.add).call_function + else: + unimplemented( + "itertools.accumulate can only accept one of: `func` kwarg, pos 2 arg" + ) + else: + unimplemented("Unsupported arguments for itertools.accumulate") + + items = [] + if acc is not None: + items.append(acc) + for item in seq: + if acc is None: + acc = item + else: + try: + acc = func(tx, [acc, item], {}) + except Exception: + raise unimplemented( # noqa: TRY200 + f"Unexpected failure in invoking function during accumulate. 
Failed running func {func}({item}{acc})" + ) + items.append(acc) + + return variables.ListIteratorVariable(items, mutable_local=MutableLocal()) + elif ( + self.value is itertools.combinations + and not kwargs + and len(args) == 2 + and args[0].has_unpack_var_sequence(tx) + and args[1].is_python_constant() + ): + iterable = args[0].unpack_var_sequence(tx) + r = args[1].as_python_constant() + + items = [] + for item in itertools.combinations(iterable, r): + items.append(variables.TupleVariable(list(item))) + return variables.ListIteratorVariable(items, mutable_local=MutableLocal()) + elif self.value is itertools.groupby: + if any(kw != "key" for kw in kwargs.keys()): + unimplemented( + "Unsupported kwargs for itertools.groupby: " + f"{','.join(set(kwargs.keys()) - {'key'})}" + ) + + def retrieve_const_key(key): + if isinstance(key, variables.SymNodeVariable): + return key.evaluate_expr() + elif isinstance(key, variables.ConstantVariable): + return key.as_python_constant() + else: + raise unimplemented( + "Unsupported key type for itertools.groupby: " + str(type(key)) + ) + + if len(args) == 1 and args[0].has_unpack_var_sequence(tx): + seq = args[0].unpack_var_sequence(tx) + keyfunc = ( + ( + lambda x: ( + retrieve_const_key( + kwargs.get("key").call_function(tx, [x], {}) + ) + ) + ) + if "key" in kwargs + else None + ) + else: + unimplemented("Unsupported arguments for itertools.groupby") + + result = [] + try: + for k, v in itertools.groupby(seq, key=keyfunc): + result.append( + variables.TupleVariable( + [ + variables.ConstantVariable.create(k) + if variables.ConstantVariable.is_literal(k) + else k, + variables.ListIteratorVariable( + list(v), mutable_local=MutableLocal() + ), + ], + mutable_local=MutableLocal(), + ) + ) + except Exception: + raise unimplemented( # noqa: TRY200 + "Unexpected failure when calling itertools.groupby" + ) + return variables.ListIteratorVariable(result, mutable_local=MutableLocal()) + elif self.value is itertools.repeat: + if len(args) < 2: + return variables.RepeatIteratorVariable( + *args, mutable_local=MutableLocal() + ) + + from .builder import SourcelessBuilder + + return tx.inline_user_function_return( + SourcelessBuilder()(tx, polyfill.repeat), args, kwargs + ) + elif self.value is itertools.count: + return variables.CountIteratorVariable(*args, mutable_local=MutableLocal()) + elif self.value is itertools.cycle: + return variables.CycleIteratorVariable(*args, mutable_local=MutableLocal()) + else: + return super().call_function(tx, args, kwargs) + + +class IteratorVariable(VariableTracker): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def next_variables(self, tx): + unimplemented("abstract method, must implement") + + +class RepeatIteratorVariable(IteratorVariable): + def __init__(self, item: VariableTracker, **kwargs): + super().__init__(**kwargs) + self.item = item + + # Repeat needs no mutation, clone self + def next_variables(self, tx): + return self.item, self + + +class CountIteratorVariable(IteratorVariable): + def __init__(self, item: int = 0, step: int = 1, **kwargs): + super().__init__(**kwargs) + if not isinstance(item, VariableTracker): + item = ConstantVariable.create(item) + if not isinstance(step, VariableTracker): + step = ConstantVariable.create(step) + self.item = item + self.step = step + + def next_variables(self, tx): + assert self.mutable_local + tx.output.side_effects.mutation(self) + next_item = self.item.call_method(tx, "__add__", [self.step], {}) + self.item = next_item + return self.item, self + + +class 
CycleIteratorVariable(IteratorVariable): + def __init__( + self, + iterator: IteratorVariable, + saved: List[VariableTracker] = None, + saved_index: int = 0, + item: Optional[VariableTracker] = None, + **kwargs, + ): + if saved is None: + saved = [] + super().__init__(**kwargs) + self.iterator = iterator + self.saved = saved + self.saved_index = saved_index + self.item = item + + def next_variables(self, tx): + assert self.mutable_local + + if self.iterator is not None: + try: + new_item, _ = self.iterator.next_variables(tx) + if len(self.saved) > MAX_CYCLE: + unimplemented( + "input iterator to itertools.cycle has too many items" + ) + tx.output.side_effects.mutation(self) + self.saved.append(new_item) + self.item = new_item + if self.item is None: + return self.next_variables(tx) + return self.item, self + except StopIteration: + self.iterator = None + return self.next_variables(tx) + elif len(self.saved) > 0: + tx.output.side_effects.mutation(self) + self.saved_index = (self.saved_index + 1) % len(self.saved) + return self.item, self + else: + raise StopIteration diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/lazy.py b/MLPY/Lib/site-packages/torch/_dynamo/variables/lazy.py new file mode 100644 index 0000000000000000000000000000000000000000..74f0b43475d62d72d46bdf60b6f97b5d1bb9526e --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/variables/lazy.py @@ -0,0 +1,106 @@ +# mypy: ignore-errors + +import functools +from typing import Optional + +from .base import VariableTracker + + +class LazyCache: + """Container to cache the real VariableTracker""" + + def __init__(self, value, source): + assert source + self.value = value + self.source = source + self.vt: Optional[VariableTracker] = None + + def realize(self, parents_tracker): + assert self.vt is None + from ..symbolic_convert import InstructionTranslator + from .builder import VariableBuilder + + tx = InstructionTranslator.current_tx() + self.vt = VariableBuilder(tx, self.source)(self.value) + self.vt.parents_tracker.add(parents_tracker) + del self.value + del self.source + + +class LazyVariableTracker(VariableTracker): + """ + A structure that defers the creation of the actual VariableTracker + for a given underlying value until it is accessed. + + The `realize` function invokes VariableBuilder to produce the real object. + Once a LazyVariableTracker has been realized, internal bookkeeping will + prevent double realization. + + This object should be utilized for processing containers, or objects that + reference other objects where we may not want to take on creating all the + VariableTrackers right away. 
+ """ + + _nonvar_fields = {"_cache", *VariableTracker._nonvar_fields} + + @staticmethod + def create(value, source, **options): + return LazyVariableTracker(LazyCache(value, source), source=source, **options) + + def __init__(self, _cache, **kwargs): + assert isinstance(_cache, LazyCache) + super().__init__(**kwargs) + self._cache = _cache + + def realize(self) -> VariableTracker: + """Force construction of the real VariableTracker""" + if self._cache.vt is None: + self._cache.realize(self.parents_tracker) + return self._cache.vt + + def unwrap(self): + """Return the real VariableTracker if it already exists""" + if self.is_realized(): + return self._cache.vt + return self + + def is_realized(self): + return self._cache.vt is not None + + def clone(self, **kwargs): + assert kwargs.get("_cache", self._cache) is self._cache + if kwargs.get("source", self.source) is not self.source: + self.realize() + return VariableTracker.clone(self.unwrap(), **kwargs) + + def __str__(self): + if self.is_realized(): + return self.unwrap().__str__() + return VariableTracker.__str__(self.unwrap()) + + def __getattr__(self, item): + return getattr(self.realize(), item) + + # most methods are auto-generated below, these are the ones we want to exclude + apply = VariableTracker.apply + copy = VariableTracker.copy + __post_init__ = VariableTracker.__post_init__ + __repr__ = VariableTracker.__repr__ + + +def _create_realize_and_forward(name): + @functools.wraps(getattr(VariableTracker, name)) + def realize_and_forward(self, *args, **kwargs): + return getattr(self.realize(), name)(*args, **kwargs) + + return realize_and_forward + + +def _populate(): + for name, value in VariableTracker.__dict__.items(): + if name not in LazyVariableTracker.__dict__: + if callable(value): + setattr(LazyVariableTracker, name, _create_realize_and_forward(name)) + + +_populate() diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/lists.py b/MLPY/Lib/site-packages/torch/_dynamo/variables/lists.py new file mode 100644 index 0000000000000000000000000000000000000000..ba727aa1e0bfffd773939ef63467b6f156b80538 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/variables/lists.py @@ -0,0 +1,811 @@ +# mypy: ignore-errors + +import collections +import functools +import inspect +import operator +import types +from typing import Dict, List, Optional + +import torch +import torch.fx +from ..._guards import Source + +from .. 
import polyfill, variables +from ..bytecode_transformation import create_call_function, create_instruction +from ..exc import unimplemented +from ..source import AttrSource, GetItemSource +from ..utils import ( + get_fake_value, + guard_if_dyn, + is_namedtuple, + istype, + iter_contains, + namedtuple_fields, + odict_values, +) +from .base import MutableLocal, VariableTracker +from .constant import ConstantVariable +from .functions import UserFunctionVariable, UserMethodVariable + + +class BaseListVariable(VariableTracker): + @staticmethod + def cls_for_instance(obj): + if is_namedtuple(obj): + return functools.partial(NamedTupleVariable, tuple_cls=type(obj)) + return BaseListVariable.cls_for(type(obj)) + + @staticmethod + def cls_for(obj): + return { + iter: ListIteratorVariable, + list: ListVariable, + slice: SliceVariable, + torch.Size: SizeVariable, + tuple: TupleVariable, + odict_values: ListVariable, + torch.nn.ParameterList: ListVariable, + torch.nn.ModuleList: ListVariable, + collections.deque: DequeVariable, + }[obj] + + def __init__( + self, + items: List[VariableTracker], + **kwargs, + ): + super().__init__(**kwargs) + assert isinstance(items, list) + assert all(isinstance(x, VariableTracker) for x in items) + self.items: List[VariableTracker] = items + + def _as_proxy(self): + return [x.as_proxy() for x in self.items] + + def modified(self, items, **kwargs): + return type(self)(items, **kwargs) + + @property + def value(self): + return self.as_python_constant() + + def as_python_constant(self): + return self.python_type()([x.as_python_constant() for x in self.items]) + + def as_proxy(self): + assert self.python_type() is not SizeVariable + return self.python_type()(self._as_proxy()) + + def getitem_const(self, arg: VariableTracker): + from .tensor import SymNodeVariable + + if isinstance(arg, SymNodeVariable): + index = arg.sym_num + else: + index = arg.as_python_constant() + + if isinstance(index, slice): + if self.source is not None: + return self.clone( + items=self.items[index], + source=GetItemSource(self.source, index), + mutable_local=MutableLocal() if self.mutable_local else None, + ) + else: + return self.clone( + items=self.items[index], + mutable_local=MutableLocal() if self.mutable_local else None, + ) + else: + assert isinstance(index, (int, torch.SymInt)) + return self.items[index] + + def unpack_var_sequence(self, tx): + return list(self.items) + + def call_method( + self, + tx, + name, + args: List["VariableTracker"], + kwargs: Dict[str, "VariableTracker"], + ) -> "VariableTracker": + if name == "__getitem__": + from .tensor import TensorVariable + + assert not kwargs and len(args) == 1 + if isinstance(args[0], TensorVariable): + value = get_fake_value(args[0].as_proxy().node, tx) + if value.constant is not None and value.constant.numel() == 1: + value = variables.ConstantVariable.create(value.constant.item()) + else: + unimplemented("__getitem__ with non-constant tensor") + else: + value = args[0] + return self.getitem_const(value) + elif name == "__contains__": + assert len(args) == 1 + assert not kwargs + return iter_contains(self.items, args[0], tx) + elif name == "index": + from .builder import SourcelessBuilder + + return tx.inline_user_function_return( + SourcelessBuilder()(tx, polyfill.index), [self] + list(args), kwargs + ) + + return super().call_method(tx, name, args, kwargs) + + @staticmethod + def list_compare(tx, op, left, right): + from .builtin import BuiltinVariable + + eq_result = BaseListVariable.list_eq(tx, left, right) + if op is operator.eq: 
+ return eq_result + elif op is operator.ne: + return BuiltinVariable(operator.not_).call_function(tx, [eq_result], {}) + else: + unimplemented(f"list_compare {left} {op} {right}") + + @staticmethod + def list_eq(tx, left, right): + from .builtin import BuiltinVariable + + # Most list-like variables implement comparison ops the same way, + # so they can re-use this helper. + # There are quirks though, like how `tuple([2]) == torch.Size([2])`, + # but `tuple([2]) != list([2])` + if len(left.items) != len(right.items): + return ConstantVariable.create(False) + if len(left.items) == 0: + return ConstantVariable.create(True) + + # Generic list comparison works by iterating over left aka self and right the compared-to list. + # If we hit here, their lengths are the same and they cannot be expressed as python constants. + # So, we iterate over the zipped list items. + comps = [] + for l, r in zip(left.items, right.items): + comp = BuiltinVariable(operator.eq).call_function(tx, [l, r], {}) + if comp.is_python_constant() and not comp.as_python_constant(): + # early exit in false case + return comp + comps.append(comp) + + return functools.reduce( + lambda a, b: BuiltinVariable(operator.and_).call_function(tx, [a, b], {}), + comps, + ) + + +class RangeVariable(BaseListVariable): + def __init__(self, items, **kwargs): + items_to_map = items + start = variables.ConstantVariable.create(0) + stop = None + step = variables.ConstantVariable.create(1) + + if len(items_to_map) == 1: + (stop,) = items_to_map + elif len(items_to_map) == 2: + start, stop = items_to_map + elif len(items_to_map) == 3: + start, stop, step = items_to_map + else: + raise AssertionError() + + assert stop is not None + super().__init__([start, stop, step], **kwargs) + + def python_type(self): + return range + + def as_python_constant(self): + return range(*[x.as_python_constant() for x in self.items]) + + def as_proxy(self): + return self.python_type()(*self._as_proxy()) + + def unpack_var_sequence(self, tx): + return [variables.ConstantVariable.create(x) for x in self.as_python_constant()] + + def reconstruct(self, codegen): + assert "range" not in codegen.tx.f_globals + codegen.append_output(codegen.create_load_python_module(range, True)) + codegen.foreach(self.items) + codegen.extend_output(create_call_function(3, False)) + + def var_getattr(self, tx, name): + fields = ["start", "stop", "step"] + if name not in fields: + unimplemented(f"range.{name}") + return self.items[fields.index(name)] + + +class CommonListMethodsVariable(BaseListVariable): + """ + Implement methods common to List and other List-like things + """ + + def call_method( + self, + tx, + name, + args: List["VariableTracker"], + kwargs: Dict[str, "VariableTracker"], + ) -> "VariableTracker": + if name == "append" and self.mutable_local: + assert not kwargs + (arg,) = args + tx.output.side_effects.mutation(self) + self.items.append(arg) + return ConstantVariable.create(None) + elif ( + name == "extend" + and self.mutable_local + and args + and args[0].has_unpack_var_sequence(tx) + ): + assert not kwargs + (arg,) = args + seq = arg.unpack_var_sequence(tx) + tx.output.side_effects.mutation(self) + self.items.extend(seq) + return ConstantVariable.create(None) + elif name == "insert" and self.mutable_local: + assert not kwargs + idx, value = args + const_idx = idx.as_python_constant() + tx.output.side_effects.mutation(self) + self.items.insert(const_idx, value) + return ConstantVariable.create(None) + elif name == "pop" and self.mutable_local: + assert not kwargs + 
tx.output.side_effects.mutation(self) + return self.items.pop(*[a.as_python_constant() for a in args]) + elif name == "clear" and self.mutable_local: + assert not kwargs and not args + tx.output.side_effects.mutation(self) + self.items.clear() + return ConstantVariable.create(None) + elif ( + name == "__setitem__" + and self.mutable_local + and args + and args[0].is_python_constant() + ): + assert not kwargs + key, value = args + tx.output.side_effects.mutation(self) + if isinstance(key, SliceVariable): + self.items[key.as_python_constant()] = list(value.items) + else: + self.items[key.as_python_constant()] = value + return ConstantVariable.create(None) + elif name == "copy": + # List copy() doesn't have args and kwargs + assert not kwargs + assert not args + items = list(self.items) + return self.modified(items, mutable_local=MutableLocal()) + else: + return super().call_method(tx, name, args, kwargs) + + +class ListVariable(CommonListMethodsVariable): + def python_type(self): + return list + + def reconstruct(self, codegen): + codegen.foreach(self.items) + codegen.append_output(create_instruction("BUILD_LIST", arg=len(self.items))) + + def call_method( + self, + tx, + name, + args: List["VariableTracker"], + kwargs: Dict[str, "VariableTracker"], + ) -> "VariableTracker": + if ( + name == "__setitem__" + and self.mutable_local + and args + and args[0].is_python_constant() + ): + assert not kwargs + key, value = args + tx.output.side_effects.mutation(self) + if isinstance(key, SliceVariable): + if not value.has_unpack_var_sequence(tx): + unimplemented( + f"Missing dynamo support for expanding {value} into a list for slice assignment." + ) + self.items[key.as_python_constant()] = value.unpack_var_sequence(tx) + else: + self.items[key.as_python_constant()] = value + return ConstantVariable.create(None) + else: + return super().call_method(tx, name, args, kwargs) + + def call_hasattr(self, tx, name: str) -> "VariableTracker": + if self.python_type() is not list: + return super().call_hasattr(tx, name) + return variables.ConstantVariable.create(hasattr([], name)) + + +class DequeVariable(CommonListMethodsVariable): + def python_type(self): + return collections.deque + + def reconstruct(self, codegen): + assert "deque" not in codegen.tx.f_globals + codegen.append_output( + codegen.create_load_python_module(collections.deque, True) + ) + codegen.foreach(self.items) + codegen.extend_output(create_call_function(len(self.items), False)) + + def call_method( + self, + tx, + name, + args: List["VariableTracker"], + kwargs: Dict[str, "VariableTracker"], + ) -> "VariableTracker": + if ( + name == "__setitem__" + and self.mutable_local + and args + and args[0].is_python_constant() + ): + assert not kwargs + key, value = args + assert key.is_python_constant() and isinstance( + key.as_python_constant(), int + ) + tx.output.side_effects.mutation(self) + self.items[key.as_python_constant()] = value + return ConstantVariable.create(None) + elif name == "extendleft" and self.mutable_local: + assert not kwargs + + (arg,) = args + prefix = arg.unpack_var_sequence(tx) + prefix.reverse() + tx.output.side_effects.mutation(self) + self.items = prefix + list(self.items) + return ConstantVariable.create(None) + elif name == "popleft" and self.mutable_local: + assert not args + assert not kwargs + item = self.items[0] + tx.output.side_effects.mutation(self) + self.items = self.items[1:] + return item + elif name == "appendleft" and self.mutable_local: + assert not kwargs + tx.output.side_effects.mutation(self) + 
self.items = [args[0]] + list(self.items) + return ConstantVariable.create(None) + else: + return super().call_method(tx, name, args, kwargs) + + +class TupleVariable(BaseListVariable): + def python_type(self): + return tuple + + def reconstruct(self, codegen): + codegen.foreach(self.items) + codegen.append_output(create_instruction("BUILD_TUPLE", arg=len(self.items))) + + def call_method( + self, + tx, + name, + args: List["VariableTracker"], + kwargs: Dict[str, "VariableTracker"], + ) -> "VariableTracker": + return super().call_method(tx, name, args, kwargs) + + def call_hasattr(self, tx, name: str) -> "VariableTracker": + if self.python_type() is not tuple: + return super().call_hasattr(tx, name) + return variables.ConstantVariable.create(hasattr((), name)) + + +class SizeVariable(TupleVariable): + """torch.Size(...)""" + + def __init__( + self, + items: List[VariableTracker], + proxy: Optional[torch.fx.Proxy] = None, + **kwargs, + ): + self.proxy = proxy + super().__init__(items, **kwargs) + + def python_type(self): + return torch.Size + + def as_proxy(self): + if self.proxy is not None: + return self.proxy + + # torch.Size needs special handling. Normally, we pun a list-like + # container to directly contain Proxy/Node objects from FX, and FX + # knows to look inside containers (via map_aggregate). But torch.Size + # is weird; although it subclasses from tuple, it doesn't allow + # members which aren't int-like (rejecting Proxy and Node). This + # means we can't use the normal representation trick + # torch.Size([proxy0, proxy1]). I looked into seeing if I could + # relax torch.Size in PyTorch proper, but if torch.Size constructor + # sees a type that it doesn't recognize, it will try to call + # __index__() on it, so there is no BC way to actually change this + # behavior (though it occurs to me that I could have just added a + # YOLO no checking alternate constructor.) + # + # To work around this problem, I represent a torch.Size proxy as + # a straight up proxy, that would have been constructed by taking + # the constituent proxies as arguments. This trick can be generally + # used for any construct that we need a proxy for but we can't + # directly represent as an aggregate; I don't see very many examples + # of this in torchdynamo though! + + # Look for a proxy. 
If there are none, do the legacy behavior + tracer = None + proxies = self._as_proxy() + for proxy in proxies: + if isinstance(proxy, torch.fx.Proxy): + tracer = proxy.tracer + break + + if tracer is None: + return torch.Size(proxies) + + proxy = tracer.create_proxy("call_function", torch.Size, (proxies,), {}) + proxy.node.meta["example_value"] = torch.Size( + [ + p.node.meta["example_value"] if not isinstance(p, int) else p + for p in proxies + ] + ) + return proxy + + def reconstruct(self, codegen): + codegen.load_import_from("torch", "Size") + codegen.foreach(self.items) + build_torch_size = [ + create_instruction("BUILD_TUPLE", arg=len(self.items)), + ] + create_call_function(1, True) + codegen.extend_output(build_torch_size) + + def unpack_var_sequence(self, tx): + return list(self.items) + + def numel(self, tx): + from .builtin import BuiltinVariable + from .tensor import SymNodeVariable + + const_result = 1 + sym_sizes = [] + + for v in self.items: + if isinstance(v, ConstantVariable): + const_result *= v.value + else: + assert isinstance(v, SymNodeVariable), type(v) + # Delay proxy calls until we know it will be necessary + sym_sizes.append(v) + + result = ConstantVariable.create(const_result) + if sym_sizes and const_result == 1: + # Skip multiplying by 1 + result, *sym_sizes = sym_sizes + + if not sym_sizes or const_result == 0: + return result + + mul = BuiltinVariable(operator.mul) + for v in sym_sizes: + result = mul.call_function(tx, [result, v], {}) + return result + + def call_method( + self, + tx, + name, + args: List["VariableTracker"], + kwargs: Dict[str, "VariableTracker"], + ) -> "VariableTracker": + if name == "__getitem__": + assert not kwargs and len(args) == 1 + out = self.get_item_dyn(tx, args[0]) + return out + elif name == "numel": + assert not args and not kwargs + return self.numel(tx) + + return super().call_method(tx, name, args, kwargs) + + def get_item_dyn(self, tx, arg: VariableTracker): + from .tensor import SymNodeVariable + + if isinstance(arg, SymNodeVariable): + index = arg.sym_num + else: + index = arg.as_python_constant() + if isinstance(index, slice): + return SizeVariable(self.items[index]) + else: + assert isinstance(index, (int, torch.SymInt)) + return self.items[index] + + +class NamedTupleVariable(TupleVariable): + def __init__(self, items, tuple_cls, **kwargs): + super().__init__(items, **kwargs) + self.tuple_cls = tuple_cls + + def python_type(self): + return self.tuple_cls + + def as_python_constant(self): + return self.python_type()(*[x.as_python_constant() for x in self.items]) + + def as_proxy(self): + assert self.python_type() is not SizeVariable + return self.python_type()(*self._as_proxy()) + + def reconstruct(self, codegen): + create_fn = getattr(self.tuple_cls, "_make", self.tuple_cls) + codegen.append_output(codegen._create_load_const(create_fn)) + codegen.foreach(self.items) + codegen.extend_output( + [ + create_instruction("BUILD_TUPLE", arg=len(self.items)), + ] + + create_call_function(1, True) + ) + + def var_getattr(self, tx, name): + def check_and_create_method(): + method = inspect.getattr_static(self.tuple_cls, name, None) + if isinstance(method, classmethod): + # We need the unbounded cls method to avoid the inline __self__ + return UserMethodVariable( + method.__func__, + variables.UserDefinedClassVariable(self.tuple_cls), + ) + elif isinstance(method, staticmethod): + return UserFunctionVariable(method.__func__) + elif inspect.isfunction(method): + return UserMethodVariable(method, self) + else: + return None + + 
fields = namedtuple_fields(self.tuple_cls) + if name not in fields: + method = check_and_create_method() + if not method: + super().var_getattr(tx, name) + return method + return self.items[fields.index(name)] + + def call_hasattr(self, tx, name: str) -> "VariableTracker": + fields = namedtuple_fields(self.tuple_cls) + return variables.ConstantVariable.create(name in fields) + + +class SliceVariable(BaseListVariable): + def __init__(self, items, **kwargs): + items_to_map = items + start, stop, step = [variables.ConstantVariable.create(None)] * 3 + + if len(items_to_map) == 1: + (stop,) = items_to_map + elif len(items_to_map) == 2: + start, stop = items_to_map + elif len(items_to_map) == 3: + start, stop, step = items_to_map + else: + raise AssertionError() + + if isinstance(start, variables.TensorVariable) or isinstance( + stop, variables.TensorVariable + ): + unimplemented("Dynamic slicing on data-dependent value is not supported") + + super().__init__([start, stop, step], **kwargs) + + def as_proxy(self): + return slice(*self._as_proxy()) + + def python_type(self): + return slice + + def as_python_constant(self): + return slice(*[guard_if_dyn(x) for x in self.items]) + + def reconstruct(self, codegen): + codegen.foreach(self.items) + codegen.append_output(create_instruction("BUILD_SLICE", arg=len(self.items))) + + def var_getattr(self, tx, name): + fields = ["start", "stop", "step"] + if name not in fields: + unimplemented(f"slice.{name}") + return self.items[fields.index(name)] + + +class ListIteratorVariable(VariableTracker): + def __init__(self, items, index: int = 0, **kwargs): + super().__init__(**kwargs) + assert isinstance(items, list) + # Removing this check as it slows things down too much + # https://github.com/pytorch/pytorch/pull/87533#issuecomment-1287574492 + + # assert all(isinstance(x, VariableTracker) for x in items) + self.items = items + self.index = index + + def __repr__(self): + return f"{self.__class__.__name__}(length={len(self.items)}, index={repr(self.index)})" + + def next_variables(self, tx): + assert self.mutable_local + old_index = self.index + if old_index >= len(self.items): + raise StopIteration() + tx.output.side_effects.mutation(self) + self.index += 1 + return self.items[old_index], self + + def call_method( + self, + tx, + name, + args: "List[VariableTracker]", + kwargs: "Dict[str, VariableTracker]", + ): + if name == "__contains__": + assert len(args) == 1 + assert not kwargs + return iter_contains(self.items[self.index :], args[0], tx) + + return super().call_method(tx, name, args, kwargs) + + def as_python_constant(self): + if self.index > 0: + raise NotImplementedError() + return iter([x.as_python_constant() for x in self.items]) + + def unpack_var_sequence(self, tx): + return list(self.items[self.index :]) + + def reconstruct(self, codegen): + remaining_items = self.items[self.index :] + codegen.foreach(remaining_items) + codegen.extend_output( + [ + create_instruction("BUILD_TUPLE", arg=len(remaining_items)), + create_instruction("GET_ITER"), + ] + ) + + +class TupleIteratorVariable(ListIteratorVariable): + pass + + +class RestrictedListSubclassVariable(ListVariable): + """ + This is a special case of UserDefinedObjectVariable where: + 1) The user subclasses list + 2) None of the list methods are overriden, merely some new methods are added + + In these cases, we can prevent graph breaks by not using the general + UserDefinedObjectVariable machinery and instead treating it like + a ListVariable. 
+ """ + + _nonvar_fields = {"user_cls", "user_cls_source", *ListVariable._nonvar_fields} + _allowed_names = { + "__call__", + "__module__", + "__dict__", + "__doc__", + "__name__", + "__qualname__", + } + _disallowed_names = { + "__getattribute__", + "__getattr__", + "__setattr__", + } + + @classmethod + def _is_non_conflicting_subclass( + cls, + user_cls: type, + python_cls: type, + ): + """Ensures user_cls inherits from python_cls (e.g. list) and does not override any methods on python_cls""" + if ( + not istype(user_cls, type) + or user_cls.__bases__ != (python_cls,) + or user_cls.__mro__ != (user_cls, python_cls, object) + ): + return False # not subclass + return not any( + hasattr(python_cls, name) or name in cls._disallowed_names + for name in set(user_cls.__dict__.keys()) - cls._allowed_names + ) + + @classmethod + def is_matching_cls(cls, user_cls: type): + return cls._is_non_conflicting_subclass(user_cls, list) + + def __init__(self, items, *, user_cls: type, user_cls_source: Source, **kwargs): + super().__init__(items=items, **kwargs) + self.user_cls = user_cls + self.user_cls_source = user_cls_source + assert istype(user_cls, type) + assert isinstance(user_cls_source, Source) + + def python_type(self): + return self.user_cls + + def as_proxy(self): + return [x.as_proxy() for x in self.items] + + def as_python_constant(self): + raise NotImplementedError() + + def is_python_constant(self): + return False + + @property + def value(self): + raise AttributeError("value") + + def modified(self, items, **kwargs): + return type(self)( + items, + user_cls=self.user_cls, + user_cls_source=self.user_cls_source, + **kwargs, + ) + + def reconstruct(self, codegen): + codegen(self.user_cls_source) + super().reconstruct(codegen) + codegen.extend_output(create_call_function(1, True)) + + def call_method( + self, + tx, + name, + args: List["VariableTracker"], + kwargs: Dict[str, "VariableTracker"], + ) -> "VariableTracker": + if name in self.user_cls.__dict__: + method = self.user_cls.__dict__[name] + if isinstance(method, types.FunctionType): + # inline the method + source = AttrSource(self.user_cls_source, name) + return UserMethodVariable(method, self, source=source).call_function( + tx, args, kwargs + ) + unimplemented( + f"RestrictedListSubclassVariable method {self.user_cls.__name__}.{name}" + ) + return super().call_method(tx, name, args, kwargs) + + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + return self.call_method(tx, "__call__", args, kwargs) diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/misc.py b/MLPY/Lib/site-packages/torch/_dynamo/variables/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..2a4012a88966c5b5121a51d594c4e21263d59c03 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/variables/misc.py @@ -0,0 +1,886 @@ +# mypy: ignore-errors + +import collections +import dataclasses +import functools +import inspect +import itertools +import sys +import types +from typing import Dict, List + +import torch._C +import torch._numpy as tnp +import torch.utils._pytree as pytree +from .. 
import config, variables +from ..bytecode_transformation import create_call_function, create_instruction +from ..exc import unimplemented +from ..guards import GuardBuilder, install_guard +from ..source import AttrSource, GetItemSource, ODictGetItemSource, TypeSource +from ..utils import ( + check_constant_args, + check_unspec_python_args, + identity, + is_tensor_base_attr_getter, + proxy_args_kwargs, +) +from .base import VariableTracker +from .functions import NestedUserFunctionVariable, UserFunctionVariable +from .user_defined import UserDefinedObjectVariable + + +class SuperVariable(VariableTracker): + def __init__(self, typevar, objvar=None, specialized=False, **kwargs): + super().__init__(**kwargs) + # typevar is the fist argument to super(). In the case where no argument + # is provided to super(), it is the __class__ object where + # the super() function is being called + self.typevar = typevar + # objvar here must be an instance or subtype of typevar. + # In the case where super() is called without arguments, it is the first argument + # to the current function where super() is called from (self for regular method, + # cls for a classmethod) + self.objvar = objvar + self.specialized = specialized # directly get attr from self.typevar if true + + def reconstruct(self, codegen): + codegen(variables.BuiltinVariable(super)) + codegen(self.typevar) + if self.objvar is not None: + codegen(self.objvar) + codegen.extend_output(create_call_function(2, True)) + else: + codegen.extend_output(create_call_function(1, True)) + + def _resolved_getattr_and_source(self, tx, name): + assert self.objvar, "1-arg super not implemented" + if self.specialized: + return getattr(self.typevar.as_python_constant(), name) + search_type = self.typevar.as_python_constant() + + # The rest of this function does two things: + # - Walk the mro to find where the attribute comes from to be + # able to provide accurate source + # - Call the getattr to get the object + + # Find the class object, where the function lives. + # When objvar is "self", use type(self), when objvar is "cls", use it as-is + type_to_use = self.objvar.python_type() + type_to_use_source = ( + TypeSource(self.objvar.source) if self.objvar.source else None + ) + if issubclass(type_to_use, type): + type_to_use = self.objvar.value + type_to_use_source = self.objvar.source + + source = None + if self.objvar.source is not None: + # Walk the mro tuple to find out the actual class where the + # attribute resides. + search_mro = type_to_use.__mro__ + start_index = search_mro.index(search_type) + 1 + for index in range(start_index, len(search_mro)): + if hasattr(search_mro[index], name): + # Equivalent of something like type(L['self']).__mro__[1].attr_name + source = AttrSource( + GetItemSource(AttrSource(type_to_use_source, "__mro__"), index), + name, + ) + break + + # TODO(jansel): there is a small chance this could trigger user code, prevent that + return getattr(super(search_type, type_to_use), name), source + + def var_getattr(self, tx, name: str) -> "VariableTracker": + # Check if getattr is a constant. If not, delay the actual work by + # wrapping the result in GetAttrVariable. Mostly super is called with a + # method, so most of the work is delayed to call_function. + # + # We could have just implemented a const_getattr. However, super is + # special when it comes to finding sources. Compared to other VTs, super + # requires the attr name to walk the mro and find the actual source (and + # not just AttrSource). 
+ value, source = self._resolved_getattr_and_source(self, name) + if not variables.ConstantVariable.is_literal(value): + return GetAttrVariable(self, name) + if source: + install_guard(source.make_guard(GuardBuilder.CONSTANT_MATCH)) + return variables.ConstantVariable.create(value, source=source) + return variables.ConstantVariable.create(value) + + def call_method( + self, + tx, + name, + args: "List[VariableTracker]", + kwargs: "Dict[str, VariableTracker]", + ) -> "VariableTracker": + inner_fn, source = self._resolved_getattr_and_source(self, name) + + if inner_fn is object.__init__: + return LambdaVariable(identity) + elif inner_fn is torch.nn.Module.__init__: + objvar = self.objvar + from ..side_effects import AttributeMutationNew + + if ( + isinstance(objvar, variables.UserDefinedObjectVariable) + and isinstance(objvar.mutable_local, AttributeMutationNew) + and not (args or kwargs) + ): + tx.output.side_effects.store_attr( + objvar, + "__call_nn_module_init", + variables.ConstantVariable.create(True), + ) + return variables.ConstantVariable.create(None) + else: + unimplemented("super() nn.Module.__init__") + elif isinstance(inner_fn, types.FunctionType): + return variables.UserFunctionVariable( + inner_fn, source=source + ).call_function(tx, [self.objvar] + args, kwargs) + elif isinstance(inner_fn, types.MethodType): + return variables.UserMethodVariable( + inner_fn.__func__, self.objvar, source=source + ).call_function(tx, args, kwargs) + elif ( + inner_fn is collections.OrderedDict.__getitem__ + and isinstance(self.objvar, variables.UserDefinedObjectVariable) + and self.objvar.source + and len(args) == 1 + and len(kwargs) == 0 + and args[0].is_python_constant() + ): + from .builder import VariableBuilder + + key = args[0].as_python_constant() + return VariableBuilder(tx, ODictGetItemSource(self.objvar.source, key))( + collections.OrderedDict.__getitem__(self.objvar.value, key) + ) + elif inner_fn in ( + collections.OrderedDict.__setitem__, + object.__setattr__, + ) and isinstance(self.objvar, variables.CustomizedDictVariable): + assert not kwargs and len(args) == 2 + return super(variables.CustomizedDictVariable, self.objvar).call_method( + tx, "__setitem__", args, kwargs + ) + else: + unimplemented(f"non-function or method super: {inner_fn}") + + +class UnknownVariable(VariableTracker): + """ + It could be anything! + """ + + +class DelayGraphBreakVariable(UnknownVariable): + """ + Used to insert a dummy variable in the stack to do the graph break at CALL_FUNCTION. 
+ """ + + +class ComptimeVariable(VariableTracker): + """ + This variable is special, it lets you execute arbitrary code at + Dynamo compile time + """ + + def reconstruct(self, codegen): + raise NotImplementedError("comptime is special form") + + def var_getattr(self, tx, name: str) -> "VariableTracker": + from ..comptime import comptime + + # To support the comptime.print_graph convenience accessors + from .functions import UserFunctionVariable + + return UserFunctionVariable( + getattr(comptime, name), source=AttrSource(self.source, name) + ) + + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + from ..comptime import ComptimeContext + + # TODO: support an expression form as well + + assert not kwargs + assert len(args) == 1 + fn = args[0] + if isinstance(fn, UserFunctionVariable): + fn.get_function()(ComptimeContext(tx)) + elif isinstance(fn, NestedUserFunctionVariable): + # We have to manually bind the freevars ourselves + code = fn.get_code() + assert not fn.closure, ( + "comptime function must not have free variables, " + f"but these variables were free: {code.co_freevars}" + ) + func = types.FunctionType( + code, + fn.f_globals, + fn.fn_name.as_python_constant(), + tuple(fn.defaults.items) if fn.defaults else None, + # We could automatically promote free variables into + # ComptimeVar but this is confusing if you access + # a free variable that we actually DO have the runtime + # value for + # tuple(make_cell(ComptimeVar(i)) for i in fn.closure.items) + tuple(), + ) + func(ComptimeContext(tx)) + else: + raise RuntimeError(f"unsupported argument to comptime: {type(fn)}") + + return variables.ConstantVariable.create(None) + + +class ClosureVariable(UnknownVariable): + def __init__(self, name, **kwargs): + super().__init__(**kwargs) + self.name = name + + def reconstruct(self, codegen): + codegen.append_output(codegen.create_load_closure(self.name)) + + +# closure variable created by an inlined function +class InlinedClosureVariable(UnknownVariable): + def __init__(self, name, **kwargs): + super().__init__(**kwargs) + self.name = name + + def reconstruct(self, codegen): + codegen.append_output(codegen.create_load_closure(self.name)) + + +class NewCellVariable(VariableTracker): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + +class NewGlobalVariable(VariableTracker): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + +class InspectSignatureVariable(VariableTracker): + """represents inspect.signature(...)""" + + @staticmethod + def create(callable, **kwargs): + if kwargs: + unimplemented(f"inspect.signature with {kwargs}") + return InspectSignatureVariable(callable) + + def __init__(self, inspected: VariableTracker, **kwargs): + super().__init__(**kwargs) + self.inspected = inspected + + def var_getattr(self, tx, name: str) -> "VariableTracker": + if name == "parameters": + return variables.ConstDictVariable( + { + variables.ConstantVariable.create(name): InspectParameterVariable() + for name in self.inspected.inspect_parameter_names() + }, + user_cls=dict, + ) + return super().var_getattr(tx, name) + + +class InspectParameterVariable(VariableTracker): + """This is not implemented, if used will graph break.""" + + pass + + +def produce_trampoline_autograd_apply(fn_cls): + def trampoline_autograd_apply(*args, **kwargs): + return fn_cls.apply(*args, **kwargs) + + trampoline_autograd_apply._origin = produce_trampoline_autograd_apply + return trampoline_autograd_apply + + +class 
AutogradFunctionVariable(VariableTracker): + """represents a torch.autograd.Function subclass""" + + def __init__(self, fn_cls, **kwargs): + super().__init__(**kwargs) + self.fn_cls = fn_cls + + def call_apply(self, tx, args, kwargs): + requires_grad = False + + def visit(node): + nonlocal requires_grad + if isinstance(node, variables.TensorVariable): + if node.requires_grad is not False: + requires_grad = True + if isinstance(node, variables.NNModuleVariable): + if node.is_training(tx): + requires_grad = True + return node + + VariableTracker.apply(visit, (args, kwargs)) + + if ( + requires_grad + and torch.is_grad_enabled() + and config.capture_autograd_function + ): + # Note - this is the same check used in autograd/function.py, except inverted. + # If we want to support functorch transforms here, we will need to enable this. + if ( + self.fn_cls.setup_context + != torch.autograd.function._SingleLevelFunction.setup_context + ): + unimplemented( + "NYI - autograd.Function with custom setup_context method" + ) + + vjp_fn = self.fn_cls.vjp # type: ignore[attr-defined] + if vjp_fn is not torch.autograd.Function.vjp: + unimplemented("NYI - User defind vjp") + + jvp_fn = self.fn_cls.jvp # type: ignore[attr-defined] + if jvp_fn is not torch.autograd.Function.jvp: + unimplemented("NYI - User defind jvp") + + from .higher_order_ops import AutogradFunctionApplyVariable + + source = self.source + if source is None: + source = AttrSource( + tx.import_source(self.fn_cls.__module__), self.fn_cls.__name__ + ) + + return AutogradFunctionApplyVariable( + self.fn_cls.forward, + self.fn_cls.backward, + source, + source=AttrSource(source, member="apply"), + ).call_function(tx, args, kwargs) + + if self.source: + source = AttrSource(self.source, "forward") + else: + source = None + + fn = self.fn_cls.forward + ctx = AutogradFunctionContextVariable.create(tx) + args = [ctx, *args] + if isinstance(fn, types.FunctionType): + return variables.UserFunctionVariable(fn, source=source).call_function( + tx, args, kwargs + ) + elif isinstance(fn, types.MethodType): + return variables.UserMethodVariable( + fn.__func__, + variables.UserDefinedClassVariable(self.fn_cls), + source=source, + ).call_function(tx, args, kwargs) + else: + unimplemented( + f"non-function or method in subclass of torch.autograd.Function: {fn}" + ) + + def call_function(self, tx, args, kwargs): + return AutogradFunctionVariable(self.fn_cls) + + def call_method( + self, + tx, + name, + args: "List[VariableTracker]", + kwargs: "Dict[str, VariableTracker]", + ): + from ..trace_rules import is_callable_allowed + from .builder import wrap_fx_proxy + + if name == "apply": + if is_callable_allowed(self.fn_cls): + trampoline_autograd_apply = produce_trampoline_autograd_apply( + self.fn_cls + ) + return wrap_fx_proxy( + tx=tx, + proxy=tx.output.create_proxy( + "call_function", + trampoline_autograd_apply, + *proxy_args_kwargs(args, kwargs), + ), + ) + else: + return self.call_apply(tx, args, kwargs) + + else: + unimplemented(f"Unsupported method: {name}") + + +@dataclasses.dataclass +class SavedTensorBox: + tensors: List[VariableTracker] = dataclasses.field(default_factory=list) + + +class AutogradFunctionContextVariable(UserDefinedObjectVariable): + """ + Tracks an autograd.Function() context using mutation tracking in side_effects.py + """ + + _nonvar_fields = { + "proxy", + "inference", + *UserDefinedObjectVariable._nonvar_fields, + } + + def __init__( + self, + value, + value_type=None, + inference=False, + proxy=None, + saved_tensors=None, + 
**kwargs, + ): + super().__init__(value=value, value_type=value_type, **kwargs) + self.inference = inference + self.proxy = proxy + self.saved_tensors = saved_tensors + + @staticmethod + def create(tx): + proxy = tx.output.create_proxy( + "call_function", torch.autograd.function.FunctionCtx, tuple(), {} + ) + out = tx.output.side_effects.track_object_new( + None, + torch.autograd.function.FunctionCtx, + functools.partial( + AutogradFunctionContextVariable, + inference=True, + proxy=proxy, + saved_tensors=SavedTensorBox(), + ), + {}, + ) + proxy.node.meta["example_value"] = out.value + return out + + def as_proxy(self): + if self.proxy is None: + unimplemented("proxy not set") + return self.proxy + + def call_method( + self, + tx, + name, + args: "List[VariableTracker]", + kwargs: "Dict[str, VariableTracker]", + ) -> "VariableTracker": + if name != "save_for_backward": + unimplemented(f"autograd.Function context method: {name}") + if self.saved_tensors is None: + unimplemented( + "save_for_backward only supported on a newly constructed FunctionCtx" + ) + + if not self.inference: + assert self.source and not kwargs + tx.output.side_effects.track_save_for_backward(self, args) + + # In eager mode, multiple calls to .save_for_backward() will overwrite previous calls. + if len(self.saved_tensors.tensors) > 0: + self.saved_tensors.tensors = [] + for arg in args: + self.saved_tensors.tensors.append(arg) + return variables.ConstantVariable.create(None) + + def var_getattr(self, tx, name): + if name == "save_for_backward": + return LambdaVariable( + lambda *args, **kwargs: self.call_method(tx, name, args, kwargs) + ) + if name == "saved_tensors" and self.saved_tensors is not None: + return variables.TupleVariable(list(self.saved_tensors.tensors)) + return super().var_getattr(tx, name) + + +class LambdaVariable(VariableTracker): + def __init__(self, fn, **kwargs): + super().__init__(**kwargs) + self.fn = fn + + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + return self.fn(*args, **kwargs) + + +class GetAttrVariable(VariableTracker): + def __init__(self, obj, name, **kwargs): + super().__init__(**kwargs) + assert isinstance(obj, VariableTracker) + assert isinstance(name, str) + self.obj = obj + self.name = name + + def __str__(self): + return f"{self.__class__.__name__}({self.obj}, {self.name})" + + @staticmethod + def create_getattr_proxy(base_proxy: torch.fx.Proxy, attr): + return getattr(base_proxy, attr) + + def as_proxy(self): + return GetAttrVariable.create_getattr_proxy(self.obj.as_proxy(), self.name) + + def const_getattr(self, tx, name): + if not isinstance(self.obj, variables.NNModuleVariable): + raise NotImplementedError() + step1 = tx.output.get_submodule(self.obj.module_key) + if self.name not in step1.__dict__: + raise NotImplementedError() + step2 = inspect.getattr_static(step1, self.name) + if name not in step2.__dict__: + raise NotImplementedError() + return inspect.getattr_static(step2, name) + + def reconstruct(self, codegen): + codegen(self.obj) + codegen.extend_output(codegen.create_load_attrs(self.name)) + + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + return self.obj.call_method(tx, self.name, args, kwargs) + + +class MethodWrapperVariable(VariableTracker): + def __init__(self, method_wrapper, **kwargs): + super().__init__(**kwargs) + self.method_wrapper = method_wrapper + + def call_function( + self, tx, args: 
"List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + if is_tensor_base_attr_getter(self.method_wrapper) and isinstance( + args[0], variables.TensorVariable + ): + assert len(args) == 1 and len(kwargs) == 0 + + return args[0].var_getattr(tx, self.method_wrapper.__self__.__name__) + + super().call_function(tx, args, kwargs) + + def is_python_constant(self): + return True + + def as_python_constant(self): + return self.method_wrapper + + +class GetSetDescriptorVariable(VariableTracker): + def __init__(self, desc, **kwargs): + super().__init__(**kwargs) + self.desc = desc + + def var_getattr(self, tx, name): + if name == "__get__" and self.source: + from .builder import VariableBuilder + + return VariableBuilder(tx, AttrSource(self.source, "__get__"))( + self.desc.__get__ + ) + else: + return super().var_getattr(tx, name) + + def is_python_constant(self): + return True + + def as_python_constant(self): + return self.desc + + +class PythonModuleVariable(VariableTracker): + def __init__(self, value: types.ModuleType, **kwargs): + super().__init__(**kwargs) + self.value = value + self.is_torch = self.value is torch or self.value.__name__.startswith("torch.") + + def python_type(self): + return types.ModuleType + + def as_python_constant(self): + return self.value + + def __repr__(self): + return f"PythonModuleVariable({self.value})" + + def call_hasattr(self, tx, name): + if self.is_torch: + result = hasattr(self.value, name) + return variables.ConstantVariable.create(result) + return super().call_hasattr(tx, name) + + +class TypingVariable(VariableTracker): + def __init__(self, value, **kwargs): + super().__init__(**kwargs) + self.value = value + + def call_method( + self, + tx, + name, + args: "List[VariableTracker]", + kwargs: "Dict[str, VariableTracker]", + ) -> "VariableTracker": + if name == "__getitem__" and len(args) == 1: + return variables.ConstantVariable.create( + self.value[args[0].as_python_constant()], + ) + unimplemented("typing") + + def python_type(self): + return type(self.value) + + def as_python_constant(self): + return self.value + + +@functools.lru_cache(maxsize=1) +def get_np_to_tnp_map(): + from ..utils import NP_TO_TNP_MODULE + + np_fn_to_tnp_fn = {} + + for np_mod, tnp_mod in NP_TO_TNP_MODULE.items(): + for fn_name, tnp_fn in tnp_mod.__dict__.items(): + if callable(tnp_fn): + # some internal details do leak from tnp + # which are not part of numpy API. + if np_fn := getattr(np_mod, fn_name, None): + np_fn_to_tnp_fn[np_fn] = tnp_fn + + return np_fn_to_tnp_fn + + +class NumpyVariable(VariableTracker): + """ + Wrapper around `numpy.*`. Currently, is able to trace a small subset of numpy functions as well as numpy dtypes. + """ + + constant_fold_functions = (tnp.issubdtype,) + + def __init__(self, value, **kwargs): + super().__init__(**kwargs) + self.value = value + + @classmethod + def can_constant_fold_through(cls, fn): + mod = fn.__module__.split(".") + assert len(mod) >= 2 and mod[:2] == ["torch", "_numpy"] + return fn in cls.constant_fold_functions + + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + if not config.trace_numpy: + unimplemented(f"numpy.{self.value}()") + + from ..utils import numpy_to_tensor_wrapper + + from .tensor import NumpyNdarrayVariable + + # lookup method name in tnp. Things like np.dtype(float) are not supported yet. + if self.value.__name__ == "dtype": + unimplemented( + f"numpy dtype function is not supported yet. 
Got type {type(self.value)}." + ) + else: # We are dealing with a callable. + func = get_np_to_tnp_map().get(self.value) + if func is None: + unimplemented( + f"Can't find numpy function {self.value} in torch._numpy. " + " Please file an issue to request support for this function." + ) + + if ( + func.__module__ == "torch._numpy.random" + and config.use_numpy_random_stream + ): + msg = f"delegate '{func.__qualname__}' to NumPy itself via " + msg += f"confg.use_numpy_random_stream={config.use_numpy_random_stream}" + unimplemented(msg) + + args, kwargs = NumpyNdarrayVariable.patch_args(func.__name__, args, kwargs) + + constant_args = check_constant_args(args, kwargs) + unspec_python_args = check_unspec_python_args(args, kwargs) + + if self.can_constant_fold_through(func) and ( + constant_args or unspec_python_args + ): + # constant fold + return variables.ConstantVariable.create( + self.as_python_constant()( + *[x.as_python_constant() for x in args], + **{k: v.as_python_constant() for k, v in kwargs.items()}, + ), + ) + + # TODO Add all the functions that go from constants to constants to can_constant_fold_through + proxy = tx.output.create_proxy( + "call_function", + numpy_to_tensor_wrapper(func), + *proxy_args_kwargs(args, kwargs), + ) + return NumpyNdarrayVariable.create(tx, proxy) + + def call_method( + self, + tx, + name, + args: "List[VariableTracker]", + kwargs: "Dict[str, VariableTracker]", + ) -> "VariableTracker": + unimplemented("numpy") + + def python_type(self): + return type(self.value) + + def as_python_constant(self): + return self.value + + def as_proxy(self): + if config.trace_numpy and isinstance(self.value, type): + # This handles numpy dtype attributes such as np.float32 + # We return a string as we don't want to serialize non-PyTorch objects in the output FX graph + # In torch/_numpy we normalize strings to their dtypes when the input is a dtype, as NumPy does + return self.value.__name__ + + return super().as_proxy() + + +# Used to keep track of NULLs pushed on the stack for Python 3.11 function calls +class NullVariable(VariableTracker): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def __str__(self): + return "NullVariable" + + def reconstruct(self, codegen): + if sys.version_info < (3, 11): + unimplemented("cannot reconstruct NullVariable in < Python 3.11") + codegen.append_output(create_instruction("PUSH_NULL")) + + +class DeletedVariable(VariableTracker): + """Marker used to implement delattr()""" + + +class StringFormatVariable(VariableTracker): + """ + Represents a call to str.format(), we delay calling format until after the graph. 
+ """ + + _nonvar_fields = {"format_string", *VariableTracker._nonvar_fields} + + @classmethod + def create(cls, format_string, sym_args, sym_kwargs): + if all( + x.is_python_constant() + for x in itertools.chain(sym_args, sym_kwargs.values()) + ): + return variables.ConstantVariable.create( + format_string.format( + *[v.as_python_constant() for v in sym_args], + **{k: v.as_python_constant() for k, v in sym_kwargs.items()}, + ) + ) + return cls(format_string, list(sym_args), dict(sym_kwargs)) + + def __init__(self, format_string, sym_args, sym_kwargs, **kwargs): + super().__init__(**kwargs) + assert isinstance(format_string, str) + self.format_string = format_string + self.sym_args = sym_args + self.sym_kwargs = sym_kwargs + + def __repr__(self): + return f"{self.__class__.__name__}({self.format_string!r}, {self.sym_args!r}, {self.sym_kwargs!r})" + + def reconstruct(self, codegen): + if sys.version_info >= (3, 11): + codegen.append_output(create_instruction("PUSH_NULL")) + codegen.append_output(codegen.create_load_const(self.format_string)) + codegen.append_output(codegen.create_load_attr("format")) + codegen(variables.TupleVariable(self.sym_args)) + kwargs = { + variables.ConstantVariable.create(k): v for k, v in self.sym_kwargs.items() + } + codegen(variables.ConstDictVariable(kwargs)) + codegen.append_output(create_instruction("CALL_FUNCTION_EX", arg=1)) + + +class DebuggingVariable(VariableTracker): + """ + Represents a call to a debugging function like print(), or something + registered to config.reorderable_logging_functions. + """ + + def __init__(self, value, **kwargs): + super().__init__(**kwargs) + self.value = value + + @staticmethod + def is_reorderable_logging_function(obj): + return ( + callable(obj) + and isinstance(obj, (types.FunctionType, types.BuiltinFunctionType)) + and obj in torch._dynamo.config.reorderable_logging_functions + ) + + def call_function(self, tx, args, kwargs): + if tx.export: + # For export cases, we can just make debugging functions no-ops + return + + if not self.can_reorder_logs(self.value, args, kwargs): + unimplemented( + f"Reordering debugging function {self.value} " + f"with inputs {args} {kwargs} is not yet implemented." + ) + + tx.debug_locals.append((self, list(args))) + + def reconstruct(self, codegen): + return self.source.reconstruct(codegen) + + @staticmethod + def can_reorder_logs(fn, args, kwargs) -> True: + """ + Run some additional checks for what sort of function calls can we + actually reorder. + """ + + allowed_input_types = ( + variables.TensorVariable, + variables.ConstantVariable, + StringFormatVariable, + ) + + flat_args = pytree.tree_leaves([args, kwargs]) + for arg in flat_args: + if not isinstance(arg, allowed_input_types): + return False + + return True diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/nn_module.py b/MLPY/Lib/site-packages/torch/_dynamo/variables/nn_module.py new file mode 100644 index 0000000000000000000000000000000000000000..9da90048ac33766ebbe0a42bed7391c793e89226 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/variables/nn_module.py @@ -0,0 +1,813 @@ +# mypy: ignore-errors + +import functools +import inspect +import itertools +import types +from contextlib import contextmanager, nullcontext +from typing import Any, Dict, List + +import torch.nn + +from .. 
import trace_rules, variables +from ..exc import unimplemented, UnspecializeRestartAnalysis, Unsupported +from ..guards import GuardBuilder, install_guard +from ..mutation_guard import GenerationTracker +from ..source import ( + AttrSource, + FSDPNNModuleSource, + GetItemSource, + NNModuleSource, + NotNNModuleSource, +) +from ..utils import ( + get_custom_getattr, + get_fake_value, + is_lazy_module, + is_namedtuple, + is_safe_constant, + istensor, + istype, + nnmodule_has_hooks, + object_has_getattribute, + proxy_args_kwargs, +) +from .base import MutableLocal, typestr, VariableTracker +from .functions import invoke_and_store_as_constant +from .lists import SliceVariable +from .user_defined import UserDefinedObjectVariable + + +def initialize_lazy_module(tx, mod, args, kwargs): + """ + Fairly coupled helper used by NNModuleVariable and UnspecializedNNModuleVariable. + + Used to cause lazy module to be initialized (and delete its init hook) before tracing. Especially + useful now that 'allowed' modules graph-break on hooks, calling this first ensures there is no hook + by the time we trace __call__ and thus no graph-break for lazy allowed modules. + """ + if hasattr(mod, "_initialize_hook"): + + def convert_to_fake(x): + if is_namedtuple(x): + return type(x)(*(convert_to_fake(elem) for elem in x)) + elif isinstance(x, dict): + return {k: convert_to_fake(v) for k, v in x.items()} + elif isinstance(x, (list, tuple, set)): + return type(x)(convert_to_fake(elem) for elem in x) + elif isinstance(x, torch.fx.Proxy): + return get_fake_value(x.node, tx) + else: + return x + + proxy_args, proxy_kwargs = proxy_args_kwargs(args, kwargs) + fake_args = [convert_to_fake(arg) for arg in proxy_args] + fake_kwargs = {k: convert_to_fake(v) for k, v in proxy_kwargs.items()} + mod._infer_parameters(mod, fake_args, fake_kwargs) + + +@contextmanager +def record_nn_module_stack(module_key: str, source, tx, mod: torch.nn.Module): + fully_qualified_name = source.name() + try: + tx.nn_module_stack[module_key] = (fully_qualified_name, type(mod)) + yield + finally: + del tx.nn_module_stack[module_key] + + +class NNModuleVariable(VariableTracker): + _nonvar_fields = {"module_type", "module_key", *VariableTracker._nonvar_fields} + + def __init__( + self, module_type: type, module_key: str, module: torch.nn.Module, **kwargs + ): + super().__init__(**kwargs) + self.module_type = module_type + self.module_key = module_key + self.module = module + assert self.source + + def python_type(self): + return self.module_type + + def _wrap_submodule(self, tx, source, submod, *key_extra, **options): + return + + def unpack_var_sequence(self, tx): + # implement list/iter/tuple/etc calls + base = tx.output.get_submodule(self.module_key) + if isinstance(base, torch.nn.ModuleDict): + result = [] + for name, submod in base.items(): + name_var = variables.ConstantVariable.create(name) + tx.output.register_attr_or_module( + submod, + self.module_key, + name, + source=NNModuleSource(GetItemSource(self.source, name)), + ) + result.append(name_var) + return result + + assert isinstance( + base, (torch.nn.ModuleList, torch.nn.ParameterList, torch.nn.Sequential) + ), typestr(base) + assert self.source + result = [] + for idx, submod in enumerate(base): + result.append( + tx.output.register_attr_or_module( + submod, + self.module_key, + idx, + source=NNModuleSource(GetItemSource(self.source, idx)), + ) + ) + return result + + def call_hasattr(self, tx, name: str) -> "VariableTracker": + mod = tx.output.get_submodule(self.module_key) + result = 
hasattr(mod, name) + install_guard( + NNModuleSource(AttrSource(self.source, name)).make_guard( + GuardBuilder.HASATTR + ) + ) + return variables.ConstantVariable.create(result) + + def is_training(self, tx): + mod = tx.output.get_submodule(self.module_key) + return getattr(mod, "training", False) + + def convert_to_unspecialized(self, tx): + """Restart analysis treating this module as an UnspecializedNNModuleVariable""" + mod = tx.output.get_submodule(self.module_key) + GenerationTracker.tag(mod) + + # Mark the class dynamic unless its module initialization + if tx.f_code.co_name != "__init__": + GenerationTracker.mark_class_dynamic(type(mod)) + raise UnspecializeRestartAnalysis() + + def _custom_getattr_fallback(self, base, tx, name, options): + """Check for a __getattr__ and handle it specially if it is implemented""" + if object_has_getattribute(base): + unimplemented("torch.nn.Module with a custom __getattribute__ defined") + + getattr_fn = get_custom_getattr(base) + if getattr_fn is None: + return None + + if not isinstance(getattr_fn, types.FunctionType): + unimplemented("torch.nn.Module with a non-function custom __getattr__") + + return variables.UserMethodVariable(getattr_fn, self, **options).call_function( + tx, [variables.ConstantVariable.create(name)], {} + ) + + def var_getattr(self, tx, name): + from .builder import VariableBuilder + + if self.source: + source = AttrSource(self.source, name) + else: + source = None + + base = tx.output.get_submodule(self.module_key) + base_dict = object.__getattribute__(base, "__dict__") + object_member = True + all_class_attribute_names = set() + for x in inspect.getmro(base.__class__): + all_class_attribute_names.update(x.__dict__.keys()) + + if not self.source: + unimplemented("GETATTR with no source") + + if name in base_dict: + subobj = base_dict[name] + elif ( + "_modules" in base_dict + and name in base_dict["_modules"] + and name not in all_class_attribute_names + ): + subobj = base_dict["_modules"][name] + elif "_parameters" in base_dict and name in base_dict["_parameters"]: + subobj = base_dict["_parameters"][name] + elif "_buffers" in base_dict and name in base_dict["_buffers"]: + subobj = base_dict["_buffers"][name] + else: + try: + subobj = inspect.getattr_static(base, name) + object_member = False + except AttributeError: + # see if we can fallback to __getattr__, which is not checked by getattr_static + result = self._custom_getattr_fallback( + base=base, tx=tx, name=name, options={"source": source} + ) + if result is not None: + return result + # if we can't find a __getattr__, just raise the AttributeError + raise + + if name == "__class__" and not object_member: + return variables.UserDefinedClassVariable(base.__class__, source=source) + + if object_member: + return VariableBuilder(tx, NNModuleSource(source))(subobj) + else: + if istype(subobj, property): + return variables.UserFunctionVariable( + subobj.fget, + source=source, + ).call_function(tx, [(self)], {}) + elif istype(subobj, classmethod): + return variables.UserMethodVariable( + subobj.__func__, + variables.UserDefinedObjectVariable(type(base)), + source=source, + ) + elif istype(subobj, staticmethod): + return variables.UserFunctionVariable( + subobj.__get__(base), source=source + ) + elif istype(subobj, types.FunctionType): + return variables.UserMethodVariable(subobj, self, source=source) + elif is_safe_constant(subobj) or istensor(subobj): + # Support possibly common cases of class members + return VariableBuilder(tx, NNModuleSource(source))(subobj) + else: + 
unimplemented(f"class property {typestr(base)} {typestr(subobj)}") + + return variables.GetAttrVariable(self, name, source=source) + + def call_function( + self, + tx, + args: "List[VariableTracker]", + kwargs: "Dict[str, VariableTracker]", + ) -> "VariableTracker": + mod = tx.output.get_submodule(self.module_key) + + with record_nn_module_stack(self.module_key, self.source, tx, mod): + is_lazy = is_lazy_module(mod) + if ( + isinstance(mod, torch.nn.Sequential) + and mod.__class__.forward is torch.nn.Sequential.forward + ): + if nnmodule_has_hooks(mod): + # We do not want to unroll sequential if it has hooks, since evaporating it + # will cause hooks to not fire! + # This terminates and restart the tracing process + self.convert_to_unspecialized(tx) + + # Unroll sequential + assert ( + not is_lazy + ), "Expected lazy sequential isn't a valid combination?" + assert not kwargs + (arg,) = args + # TODO: Use named_children when it supports remove_duplicate=False. + for child_name, submod in mod._modules.items(): + tx.call_function( + tx.output.register_attr_or_module( + submod, + self.module_key, + child_name, + source=NNModuleSource(AttrSource(self.source, child_name)), + ), + [arg], + {}, + ) + arg = tx.pop() + return arg + + if is_lazy: + # The module type will change after it is called + if mod.cls_to_become is not None: + self.module_type = mod.cls_to_become + + # The pre-hook runs to initialize the module shapes, then deletes itself. After this, + # the module is more or less not lazy and can be treated as a normal module regardless of + # is_allowed or other variations. + initialize_lazy_module(tx, mod, args, kwargs) + + # If we are tracing the higher order op, we want Dynamo to step + # inside the module call so that Dynamo can see the underlying + # parameters and buffers and raise them as inputs to the graph. + if tx.output.is_root_tracer() and mod.__module__.startswith( + ("torch.nn.", "torch.ao.") + ): + if nnmodule_has_hooks( + mod, check_forward_hooks=True, check_backward_hooks=True + ): + # End of fn, this bubbles up and restarts tracing. + self.convert_to_unspecialized(tx) + + from .builder import wrap_fx_proxy + + return wrap_fx_proxy( + tx=tx, + proxy=tx.output.create_proxy( + "call_module", + self.module_key, + *proxy_args_kwargs(args, kwargs), + ), + ) + else: + assert self.source, ( + "Must provide a valid source in order to inline, " + "since inlined function may have default args which must be guarded." + ) + if isinstance(mod, torch.fx.GraphModule): + # TODO: do we want to support __call__ for GM's? + # If so at least some changes are needed, we don't allow inlining + # the call_wrapped currently, and maybe other issues too + fn = mod.forward + else: + fn = mod._call_impl + fn_source = AttrSource(self.source, "__call__") + if istype(fn, types.MethodType): + fn = fn.__func__ + fn_source = AttrSource(fn_source, "__func__") + args = [self] + args + else: + assert istype(fn, types.FunctionType) + return tx.inline_user_function_return( + variables.UserFunctionVariable(fn, source=fn_source), + args, + kwargs, + ) + + def call_method( + self, + tx, + name, + args: "List[VariableTracker]", + kwargs: "Dict[str, VariableTracker]", + constant=False, + ) -> "VariableTracker": + from . import ConstantVariable, ListIteratorVariable, TupleVariable + + key = self.module_key + module = tx.output.get_submodule(key) + + def generic_call_method_helper(name): + # Helper function to put a `call_method` node in FX graph, + # with nn.Module as the first arg. 
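Editor's note: the helper above records the method invocation as a "call_method" node in the FX graph instead of inlining it. A minimal sketch of what such a node looks like when built by hand with the public torch.fx API (simplified: the method here is called on a plain tensor placeholder rather than on an nn.Module proxy, so this only illustrates the node kind, not Dynamo's exact construction):

import torch
import torch.fx as fx

g = fx.Graph()
x = g.placeholder("x")
y = g.call_method("relu", (x,))   # a call_method node, analogous in spirit to the helper above
g.output(y)

gm = fx.GraphModule(torch.nn.Module(), g)
print(gm.graph)                         # shows the call_method node
print(gm(torch.tensor([-1.0, 2.0])))    # tensor([0., 2.])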
+ mod_proxy = tx.output.create_proxy( + "get_attr", + self.module_key, + tuple(), + {}, + ) + mod_proxy.node.meta["example_value"] = module + + proxy_args, proxy_kwargs = proxy_args_kwargs(args, kwargs) + + from .builder import wrap_fx_proxy + + return wrap_fx_proxy( + tx=tx, + proxy=tx.output.create_proxy( + "call_method", + name, + args=(mod_proxy, *proxy_args), + kwargs=proxy_kwargs, + ), + ) + + if name in ["_call_impl", "_wrapped_call_impl"]: + # Example: `self.layer.__call__(x)` + # This is used for explicit calling `__call__` in a forward function. + # Dynamo inlines `__call__`, includes hooks. + return self.call_function(tx, args, kwargs) + elif name == "forward": + # Example: `self.layer.forward(x)` + # This is used for explicit calling `forward` in a forward function. + # Dynamo puts `call_method` node in FX, doesn't trigger hooks. + with record_nn_module_stack(self.module_key, self.source, tx, module): + return generic_call_method_helper(name) + + if name == "_check_input_dim" and trace_rules.is_torch_inline_allowed( + inspect.getfile(module.__class__._check_input_dim) + ): + return ConstantVariable.create(True) + + if name == "_get_item_by_idx": + assert args[1].is_python_constant() + assert isinstance(args[0], TupleVariable) + mod_var = args[0].items[args[1].value] + if isinstance(mod_var, UnspecializedNNModuleVariable): + return mod_var + key = mod_var.module_key + submod = tx.output.get_submodule(key) + return tx.output.register_attr_or_module( + submod, + key, + key, + source=NNModuleSource(GetItemSource(self.source, key)), + ) + + if constant: + fn = getattr(module, name) + name = f"{module.__class__.__name__}_{name}_result" + return invoke_and_store_as_constant(tx, fn, name, args, kwargs) + + def assert_all_args_kwargs_const(): + if not all( + x.is_python_constant() for x in itertools.chain(args, kwargs.values()) + ): + raise unimplemented(f"non-const NNModule method {name}") + + def get_kwargs(*names): + assert_all_args_kwargs_const() + fn = getattr(module, name) + bound_args = inspect.signature(fn).bind( + *([x.as_python_constant() for x in args]), + **{k: v.as_python_constant() for k, v in kwargs.items()}, + ) + bound_args.apply_defaults() + bound_args = bound_args.arguments + return {k: bound_args[k] for k in names} + + def wrap_values(items): + result = [] + for name, submod in items: + result.append( + tx.output.register_attr_or_module( + submod, + key, + name, + source=NNModuleSource(gen_source(self.source, name)), + ) + ) + return ListIteratorVariable(result, mutable_local=MutableLocal()) + + def named_embed(name, obj): + return TupleVariable( + [ + ConstantVariable.create(name), + tx.output.register_attr_or_module( + obj, + key, + name, + source=NNModuleSource(gen_source(self.source, name)), + ), + ] + ) + + def gen_source(source, name): + name_split = name.split(".") + if name_split[0] == "": + return source + while len(name_split) > 0: + x = name_split.pop(0) + source = AttrSource(source, x) + return source + + if name == "named_children": + assert not (args or kwargs) + result = [] + for name, submod in module.named_children(): + result.append(named_embed(name, submod)) + return ListIteratorVariable(result, mutable_local=MutableLocal()) + elif name == "named_parameters": + result = [] + for name, param in module.named_parameters( + **get_kwargs("prefix", "recurse") + ): + result.append(named_embed(name, param)) + return ListIteratorVariable(result, mutable_local=MutableLocal()) + elif name == "named_buffers": + result = [] + for name, buffer in 
module.named_buffers( + **get_kwargs("prefix", "recurse", "remove_duplicate") + ): + result.append(named_embed(name, buffer)) + return ListIteratorVariable(result, mutable_local=MutableLocal()) + elif name == "named_modules": + result = [] + for name, submod in module.named_modules( + **get_kwargs("memo", "prefix", "remove_duplicate") + ): + result.append(named_embed(name, submod)) + return ListIteratorVariable(result, mutable_local=MutableLocal()) + elif name == "children": + assert not (args or kwargs) + return wrap_values(module.named_children()) + elif name == "modules": + return wrap_values(module.named_modules()) + elif name == "parameters": + return wrap_values(module.named_parameters(**get_kwargs("recurse"))) + elif name == "buffers": + return wrap_values(module.named_buffers(**get_kwargs("recurse"))) + elif name == "keys": + assert not (args or kwargs) + result = [] + for name in module.keys(): + result.append(ConstantVariable.create(name)) + return ListIteratorVariable(result, mutable_local=MutableLocal()) + elif name == "values": + assert not (args or kwargs) + return wrap_values(module.items()) + elif name == "items": + assert not (args or kwargs) + result = [] + for name, submod in module.items(): + result.append(named_embed(name, submod)) + return ListIteratorVariable(result, mutable_local=MutableLocal()) + elif name == "__len__": + assert not (args or kwargs) + return ConstantVariable.create(len(module)) + elif ( + name == "__contains__" + and isinstance(module, (torch.nn.ModuleDict, torch.nn.ParameterDict)) + and args + and args[0].is_python_constant() + ): + return ConstantVariable.create( + args[0].as_python_constant() in module._modules + ) + elif name == "__getitem__": + assert not kwargs and len(args) == 1 + builtin_supported = ( + torch.nn.ModuleDict.__getitem__, + torch.nn.ModuleList.__getitem__, + torch.nn.ParameterDict.__getitem__, + torch.nn.ParameterList.__getitem__, + torch.nn.Sequential.__getitem__, + ) + + if type(module).__getitem__ not in builtin_supported: + assert isinstance(args[0], variables.ConstantVariable), typestr(args[0]) + key = args[0].as_python_constant() + assert isinstance(key, (str, int)) + fn = getattr(module, name).__func__ + + assert isinstance(fn, types.FunctionType) + + src = AttrSource(AttrSource(self.source, name), "__func__") + return tx.inline_user_function_return( + variables.UserFunctionVariable(fn, source=src), + [self] + list(args), + kwargs, + ) + + assert self.source + + if isinstance(args[0], SliceVariable): + # Build a TupleVariable of NNModules + result = [] + submods = [] + + # Turn the slice into the list of integers + keys = list(range(len(module)))[args[0].as_python_constant()] + for idx, submod in enumerate(module[args[0].as_python_constant()]): + key = keys[idx] + src = NNModuleSource(GetItemSource(self.source, key)) + result.append( + tx.output.register_attr_or_module( + submod, + key, + source=src, + ) + ) + submods.append(submod) + + new_module = torch.nn.Sequential(*submods) + new_module_variable = tx.output.register_attr_or_module( + new_module, + f"{self}.__getitem__(slice)", + source=NNModuleSource( + GetItemSource(self.source, args[0].as_python_constant()) + ), + ) + return new_module_variable + + from .tensor import SymNodeVariable + + if isinstance(args[0], SymNodeVariable): + key = args[0].evaluate_expr(tx.output) + else: + key = args[0].as_python_constant() + + submod = module[key] + return tx.output.register_attr_or_module( + submod, + self.module_key, + key, + 
source=NNModuleSource(GetItemSource(self.source, key)), + ) + elif ( + name == "_get_abs_string_index" + or ( + isinstance(module, torch.nn.modules.conv._ConvNd) + and name == "_conv_forward" + ) + or ( + isinstance(module, torch.nn.modules.conv._ConvTransposeNd) + and name == "_output_padding" + ) + ): + # Inline the function + fn = getattr(module, name).__func__ + fn_source = AttrSource(AttrSource(self.source, name), "__func__") + return tx.inline_user_function_return( + variables.UserFunctionVariable(fn, source=fn_source), + [self] + args, + kwargs, + ) + # A loose heuristic, but seems to be generally good before we drop into the + # manual handling of inputs + elif ( + name in module.__class__.__dict__ + and callable(module.__class__.__dict__[name]) + and all( + isinstance(x, variables.TensorVariable) + for x in itertools.chain(args, kwargs.values()) + ) + ): + return generic_call_method_helper(name) + else: + return super().call_method(tx, name, args, kwargs) + + +class UnspecializedNNModuleVariable(UserDefinedObjectVariable): + _nonvar_fields = {"value_type", *UserDefinedObjectVariable._nonvar_fields} + + """ + The above class will specialize on the id() of a module and place + parameters on the torch.fx.GraphModule. Giving one graph per + module instance. This version treats nn.Modules() like other user + defined objects and will pass parameters into the FX graph as inputs. + Giving one graph per module class. + """ + + def __init__(self, value, **kwargs): + if type(value) is torch.jit._script.RecursiveScriptModule: + raise Unsupported( + "ScriptModules aren't supported in UnspecializedNNModuleVariable" + " becuase their .forward function isn't a static member of their type" + ) + if "value_type" in kwargs: + lazy_value_to_become = getattr(kwargs["value_type"], "cls_to_become", None) + if type(value) is lazy_value_to_become: + # We may have cloned a variabletracker for a LazyModule earlier (e.g. tracking side-effects) + # and then later we called and mutated the LazyModule into a MaterializedModule. + # We do not do the mutation upon first seeing a LazyModule since we preserve eager semantics to only + # mutate upon first call, but this requires we update multiple copies of the VariableTracker post-mutation. 
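Editor's note: the lazy-module bookkeeping discussed above mirrors what happens in eager mode, where a lazy module initializes its parameters on the first call and then swaps its class to its cls_to_become. A small sketch using the public torch.nn.LazyLinear API:

import torch
import torch.nn as nn

lazy = nn.LazyLinear(out_features=4)
print(type(lazy).__name__)        # LazyLinear (parameters not yet materialized)

out = lazy(torch.randn(2, 3))     # first call infers in_features=3 and initializes parameters
print(type(lazy).__name__)        # Linear (the module became its cls_to_become)
print(out.shape)                  # torch.Size([2, 4])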
+ kwargs["value_type"] = type(value) + + super().__init__(value=value, **kwargs) + + @staticmethod + @functools.lru_cache(None) + def _nn_module_method_ids(): + return { + id(x.__code__) + for x in torch.nn.Module.__dict__.values() + if hasattr(x, "__code__") + } + + def unpack_var_sequence(self, tx): + from .builder import VariableBuilder + + try: + fn = inspect.getattr_static(self.value_type, "__iter__") + except AttributeError as e: + raise NotImplementedError from e + + if fn in ( + torch.nn.ModuleList.__iter__, + torch.nn.ParameterList.__iter__, + torch.nn.Sequential.__iter__, + ): + assert self.source + return [ + VariableBuilder(tx, source=GetItemSource(self.source, idx))(item) + for idx, item in enumerate(self.value) + ] + + return super().unpack_var_sequence(tx) + + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + mod = self.value + # see comment on lazy module handling in NNModuleVariable.call_function for context + if is_lazy_module(mod): + if mod.cls_to_become is not None: + self.value_type = mod.cls_to_become + initialize_lazy_module(tx, mod, args, kwargs) + name = "_call_impl" + fn = getattr(self.value_type, name) + if self.source: + source = AttrSource(AttrSource(self.source, "__class__"), name) + else: + source = None + + ctx = ( + record_nn_module_stack(str(id(mod)), self.source, tx, mod) + if self.source + else nullcontext() + ) + with ctx: + return variables.UserFunctionVariable(fn, source=source).call_function( + tx, [self] + list(args), kwargs + ) + + def call_method( + self, + tx, + name, + args: "List[VariableTracker]", + kwargs: "Dict[str, VariableTracker]", + ) -> "VariableTracker": + from .builder import VariableBuilder + + if name in ["_call_impl", "_wrapped_call_impl"]: + fn = getattr(self.value_type, name) + if self.source: + source = AttrSource(AttrSource(self.source, "__class__"), name) + else: + source = None + + return variables.UserFunctionVariable(fn, source=source).call_function( + tx, [self] + list(args), kwargs + ) + + if name not in getattr(self.value, "__dict__", {}): + try: + method = inspect.getattr_static(type(self.value), name) + except AttributeError: + method = None + + if method is torch.nn.Module.parameters: + assert not args or kwargs + if tx.output.side_effects.has_pending_mutation(self): + unimplemented("Module.parameters() with pending mutation") + install_guard( + self.source.make_guard(GuardBuilder.NN_MODULE_PARAM_NAMES) + ) + items = [] + for name, value in self.value.named_parameters(): + items.append( + VariableBuilder(tx, AttrSource(self.source, name))(value) + ) + return variables.ListIteratorVariable( + items, mutable_local=MutableLocal() + ) + elif isinstance(method, staticmethod): + source = AttrSource( + AttrSource(AttrSource(self.source, "__class__"), name), "__func__" + ) + return tx.inline_user_function_return( + variables.UserFunctionVariable(method.__func__, source=source), + args, + kwargs, + ) + + if id(method.__code__) in self._nn_module_method_ids(): + unimplemented(f"UnspecializedNNModuleVariable missing {name}") + + return super().call_method(tx, name, args, kwargs) + + +class FSDPManagedNNModuleVariable(UnspecializedNNModuleVariable): + """ + Tracing behavior: trace into submodules and treat them as Unspecialized, do not + register parameters to the top-level, treat them as function inputs. 
+ + Guards behavior: if 'skip_fsdp_guards', many guards that would be installed + by a vanilla UnspecializedNNModuleVariable are simply dropped, on the basis + that a user wrapping their model in FSDP(model) is already opting into a + requirement to not modify internal model state, which would already break FSDP without + compilation. + """ + + def __init__(self, value, **kwargs): + source = kwargs.get("source", None) + assert ( + source is not None + ), "FSDPManagedNNModule depends on having an accurate source to control guarding." + + super().__init__(value=value, **kwargs) + self.source = source + + @staticmethod + def _wrap_source(source): + if not isinstance(source, (FSDPNNModuleSource, NotNNModuleSource)): + if torch._dynamo.config.skip_fsdp_guards: + return FSDPNNModuleSource(source) + else: + # this makes us behave like a usual UnspecializedNNModuleVariable for guarding purposes + return NotNNModuleSource(source) + else: + return source + + def __setattr__(self, name: str, value: Any) -> None: + if name == "source": + value = FSDPManagedNNModuleVariable._wrap_source(value) + + return super().__setattr__(name, value) diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/optimizer.py b/MLPY/Lib/site-packages/torch/_dynamo/variables/optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..0fdfda8c3c0d532ab0073a83e41070a133d4fe31 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/variables/optimizer.py @@ -0,0 +1,230 @@ +# mypy: ignore-errors + +import weakref +from typing import Dict, List + +import torch + +from ..decorators import mark_static_address + +from ..guards import GuardBuilder, install_guard +from ..source import AttrSource, ConstDictKeySource, GetItemSource, GlobalWeakRefSource +from ..utils import GLOBAL_KEY_PREFIX + +from .base import VariableTracker +from .constant import ConstantVariable +from .dicts import ConstDictVariable +from .lists import ListVariable +from .misc import GetAttrVariable +from .user_defined import UserDefinedObjectVariable + + +class ArgMappingException(Exception): + pass + + +class GuardInstallException(Exception): + pass + + +class OptimizerVariable(UserDefinedObjectVariable): + def __init__( + self, + value, + grad_to_source=None, + static_tensor_names=None, + tensor_to_source=None, + **kwargs, + ): + super().__init__(value, **kwargs) + + for group in self.value.param_groups: + if "capturable" in group: + group["capturable"] = True + + for p in group["params"]: + mark_static_address(p, guard=False) + + self.grad_to_source = grad_to_source or {} + self.tensor_to_source = tensor_to_source or {} + self.static_tensor_names = static_tensor_names or set() + + def call_method( + self, + tx, + name, + args: "List[VariableTracker]", + kwargs: "Dict[str, VariableTracker]", + ) -> "VariableTracker": + """This is an optimization to avoid tracing the very slow initialization of the optimizer""" + if name == "_init_group": + try: + py_args, py_kwargs = self.get_python_args(*args, **kwargs) + ret_val = self.value._init_group(*py_args, **py_kwargs) + self.map_sources_and_install_guards(tx) + self.update_list_args(tx, args, kwargs, py_args, py_kwargs) + # stash a weak_ptr to optimizer to invalidate code + # if the optimizer object dies + mangled_name = f"__optimizer_{id(self.value)}" + tx.store_global_weakref_by_id(mangled_name, self.value) + self.create_finalizer(tx) + + # This is currently safe only because the only actual `ret_val`s returned + # by the `_init_group` of existing optimizers are properties that are invariant 
+ # to the input tensors (e.g. dtype, layout). Changing these would trigger a + # recompilation and hence never result in the wrong specialization of `ret_val`. + return ConstantVariable.create(ret_val) + except (ArgMappingException, GuardInstallException) as _: + # trace normally if we can't map args or install guards correctly + pass + + return super().call_method(tx, name, args, kwargs) + + def var_getattr(self, tx, name): + if name == "_init_group": + return GetAttrVariable(self, name) + + return super().var_getattr(tx, name) + + def get_python_args(self, *args, **kwargs): + """Get python values equivalent to the variable tracker args""" + + def map_arg(arg): + if isinstance(arg, ConstantVariable): + return arg.as_python_constant() + elif isinstance(arg, ListVariable) and not arg.items: + return [] + elif ( + isinstance(arg, ConstDictVariable) + and isinstance(arg.source, GetItemSource) + and isinstance(arg.source.base, AttrSource) + and arg.source.base.member == "param_groups" + ): + return self.value.param_groups[arg.source.index] + + raise ArgMappingException() + + new_args = [map_arg(arg) for arg in args] + new_kwargs = {k: map_arg(v) for k, v in kwargs.items()} + + return new_args, new_kwargs + + def map_sources_and_install_guards(self, tx): + self.grad_to_source = {} + self.tensor_to_source = {} + + from .builder import VariableBuilder + + param_groups_vt = VariableBuilder(tx, AttrSource(self.source, "param_groups"))( + self.value.param_groups + ).recursive_realize() + + for g_ind, (group, group_vt) in enumerate( + zip(self.value.param_groups, param_groups_vt.items) + ): + group_source = group_vt.source + params_vt = group_vt.getitem_const(ConstantVariable.create("params")) + for p_ind, (p, p_vt) in enumerate( + zip(group["params"], params_vt.unpack_var_sequence(tx)) + ): + param_source = p_vt.source + self.tensor_to_source[p] = param_source + grad_source = AttrSource( + param_source, + "grad", + ) + if p.grad is not None: + self.grad_to_source[p.grad] = grad_source + else: + install_guard(grad_source.make_guard(GuardBuilder.CONSTANT_MATCH)) + + # state guards take a long time to generate + # so we manually generate them here + state_source = AttrSource(self.source, "state") + install_guard(state_source.make_guard(GuardBuilder.DICT_KEYS)) + for idx, (p, value) in enumerate(self.value.state.items()): + tx.store_global_weakref_by_id(GLOBAL_KEY_PREFIX, p) + p_state_source = GetItemSource( + state_source, ConstDictKeySource(state_source, idx) + ) + install_guard(p_state_source.make_guard(GuardBuilder.DICT_KEYS)) + for k, v in value.items(): + if ( + isinstance(v, torch.Tensor) + and v not in self.grad_to_source + and v not in self.tensor_to_source + ): + self.tensor_to_source[v] = GetItemSource(p_state_source, k) + elif v is None or isinstance(v, (bool, int, float, str)): + install_guard( + GetItemSource(p_state_source, k).make_guard( + GuardBuilder.CONSTANT_MATCH + ) + ) + else: + raise GuardInstallException() + + def wrap_tensor(self, tx, tensor_value): + """Wrap state tensor in a TensorVariable""" + from .builder import VariableBuilder + + # If we have a source for a tensor already use it, + # if we have not seen a tensor before, stash and use a + # global weak ref source, since it must be an optimizer tensor + # that we have missed + + if tensor_value in self.tensor_to_source: + # mark these tensors as static for cudagraphs + mark_static_address(tensor_value, guard=False) + builder = VariableBuilder(tx, self.tensor_to_source[tensor_value]) + 
self.static_tensor_names.add(tx.output.module_key_name(builder.name)) + elif tensor_value in self.grad_to_source: + builder = VariableBuilder(tx, self.grad_to_source[tensor_value]) + else: + # mark these tensors as static for cudagraphs + mark_static_address(tensor_value, guard=False) + + global_name = tx.store_global_weakref_by_id(GLOBAL_KEY_PREFIX, tensor_value) + builder = VariableBuilder(tx, GlobalWeakRefSource(global_name)) + self.static_tensor_names.add(tx.output.module_key_name(builder.name)) + + result = builder(tensor_value) + return result + + def update_list_args(self, tx, args, kwargs, py_args, py_kwargs): + """Update the args and kwargs to the traced optimizer call""" + for arg, py_arg in zip(args, py_args): + if isinstance(arg, ListVariable): + assert isinstance( + py_arg, list + ), "py_arg should be a list in optimizer variable" + for i, val in enumerate(py_arg): + tx.output.side_effects.mutation(arg) + if isinstance(val, torch.Tensor): + arg.items.append(self.wrap_tensor(tx, val)) + else: + from .builder import SourcelessBuilder, VariableBuilder + + if arg.source: + arg.items.append( + VariableBuilder(tx, GetItemSource(arg.source, i))(val) + ) + else: + arg.items.append(SourcelessBuilder()(tx, val)) + + def create_finalizer(self, tx): + names_to_delete = self.static_tensor_names + value = self.value + tc = tx.output.tracing_context + + def init_finalizer(gm): + def clear_static_tensor_refs(): + for name in names_to_delete: + gm._buffers.pop(name, None) + gm._parameters.pop(name, None) + if tc.params_flat: + tc.params_flat.clear() + + weakref.finalize(value, clear_static_tensor_refs) + + tx.output.add_graph_finalizer(init_finalizer) diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/sdpa.py b/MLPY/Lib/site-packages/torch/_dynamo/variables/sdpa.py new file mode 100644 index 0000000000000000000000000000000000000000..9dced6918b945e51d0d44a6ece0308ccb6d0e7b6 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/variables/sdpa.py @@ -0,0 +1,84 @@ +# mypy: ignore-errors + +from inspect import getattr_static + +from ..bytecode_transformation import create_call_function +from ..exc import Unsupported +from .base import VariableTracker + + +class SDPAParamsVariable(VariableTracker): + """Represents the c++ params struct for scaled dot product attention. 
+ This is a read-only container.""" + + @staticmethod + def create(tx, value, source): + from torch.backends.cuda import SDPAParams + from ..source import AttrSource + from .builder import VariableBuilder + from .torch import TorchInGraphFunctionVariable + + query_var = VariableBuilder(tx, AttrSource(source, "query"))(value.query) + key_var = VariableBuilder(tx, AttrSource(source, "key"))(value.key) + value_var = VariableBuilder(tx, AttrSource(source, "value"))(value.value) + attn_mask_var = VariableBuilder(tx, AttrSource(source, "attn_mask"))( + value.attn_mask + ) + dropout_var = VariableBuilder(tx, AttrSource(source, "dropout"))(value.dropout) + is_causal_var = VariableBuilder(tx, AttrSource(source, "is_causal"))( + value.is_causal + ) + param_vars = [ + query_var, + key_var, + value_var, + attn_mask_var, + dropout_var, + is_causal_var, + ] + return TorchInGraphFunctionVariable(SDPAParams).call_function( + tx, param_vars, {} + ) + + def __init__(self, proxy, param_vars, **kwargs): + self.proxy = proxy + self.param_vars = param_vars + super().__init__(**kwargs) + + def reconstruct(self, codegen): + assert self.source is None + assert self.param_vars is not None + codegen.load_import_from("torch._C", "_SDPAParams") + codegen.foreach(self.param_vars) + codegen.extend_output(create_call_function(len(self.param_vars), True)) + + def as_proxy(self): + return self.proxy + + def var_getattr(self, tx, name: str) -> VariableTracker: + import torch._C + from ..source import AttrSource + from .builder import wrap_fx_proxy + from .misc import GetAttrVariable + + try: + getattr_static(torch._C._SDPAParams, name) + except AttributeError: + # Using raise from is too verbose here + raise Unsupported( # noqa: TRY200 + f"Unsupported torch._C._SDPAParams attribute {name}" + ) + + proxy = GetAttrVariable.create_getattr_proxy(self.as_proxy(), name) + if self.source is not None: + return wrap_fx_proxy( + tx=tx, proxy=proxy, source=AttrSource(self.source, name) + ) + else: + return wrap_fx_proxy(tx=tx, proxy=proxy) + + @staticmethod + def is_sdpa_params(value): + from torch.backends.cuda import SDPAParams + + return value is SDPAParams diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/tensor.py b/MLPY/Lib/site-packages/torch/_dynamo/variables/tensor.py new file mode 100644 index 0000000000000000000000000000000000000000..604cfd792e05aaf6f16b8984974b1af4f83e519a --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/variables/tensor.py @@ -0,0 +1,1189 @@ +# mypy: ignore-errors + +import functools + +import inspect +import operator +import types +from typing import Dict, List + +from torch.utils._python_dispatch import is_traceable_wrapper_subclass + +from ..bytecode_transformation import create_call_method +from ..external_utils import call_hook_from_backward_state + +try: + import numpy as np +except ModuleNotFoundError: + np = None + + +import sympy + +import torch._numpy as tnp + +import torch.fx +import torch.random +from torch._dynamo import compiled_autograd +from torch._subclasses.meta_utils import is_sparse_any + +from torch.fx.experimental.symbolic_shapes import ( + guard_scalar, + GuardOnDataDependentSymNode, + has_free_symbols, + is_symbolic, + SymTypes, +) + +from .. 
import config, variables +from .._trace_wrapped_higher_order_op import trace_wrapped + +from ..exc import unimplemented, UserError, UserErrorType +from ..guards import GuardBuilder, install_guard +from ..source import AttrSource +from ..utils import ( + fqn, + get_custom_getattr, + get_fake_value, + get_real_value, + guard_if_dyn, + object_has_getattribute, + product, + proxy_args_kwargs, + tensortype_to_dtype, +) +from .base import VariableTracker +from .constant import ConstantVariable +from .lists import SizeVariable + +supported_tensor_comparison_ops = { + ">": operator.gt, + "<": operator.lt, + ">=": operator.ge, + "<=": operator.le, + "==": operator.eq, + "!=": operator.ne, +} +supported_const_comparison_ops = { + "is": operator.is_, + "is not": operator.is_not, + "==": operator.eq, + "!=": operator.ne, +} + + +class TensorVariable(VariableTracker): + """A torch.Tensor input or an intermediate value in the FX graph""" + + _nonvar_fields = { + "proxy", + "dtype", + "device", + "layout", + "ndim", + "size", + "stride", + "requires_grad", + "is_quantized", + "is_contiguous", + "is_sparse", + "class_type", + "specialized_value", + *VariableTracker._nonvar_fields, + } + + def get_real_value(self): + """ + Get the actual value represented by this variable if computation is run + using the user-provided inputs. + NOTE: this runs actual tensor computation and may be + slow and memory-intensive. + """ + return get_real_value(self.proxy.node, self.proxy.tracer) + + def __init__( + self, + proxy: torch.fx.Proxy, + *, + dtype, + device, + layout, + ndim, + requires_grad, + is_quantized, + is_sparse, + class_type, + size=None, + stride=None, + is_contiguous=None, + **kwargs, + ): + super().__init__(**kwargs) + self.proxy = proxy + self.dtype = dtype + self.device = device + self.layout = layout + self.ndim = ndim + self.size = size + self.stride = stride + self.requires_grad = requires_grad + self.is_quantized = is_quantized + self.is_contiguous = is_contiguous + self.is_sparse = is_sparse + self.class_type = class_type + + def as_proxy(self): + return self.proxy + + def python_type(self): + return self.class_type + + @staticmethod + def specialize(value: torch.Tensor): + props = { + "dtype": value.dtype, + "device": value.device, + "layout": value.layout, + "ndim": int(value.ndim), + "requires_grad": value.requires_grad, + "is_quantized": value.is_quantized, + "is_sparse": value.is_sparse, + "class_type": type(value), + } + if is_sparse_any(value) and not has_free_symbols(value): + props["size"] = tuple( + [int(s) if is_symbolic(s) else s for s in value.size()] + ) + elif not has_free_symbols(value): + # this is a fully static shape, and the keys on props here inform specialization. + # We have to cast to int here, because these might get accessed as ConstantVariable, which has + # a strict no-symint policy. If we got here due to not having free symbols, this is a known constant + # already. We could remove the discrepancy here, by having ConstantVariable be more permissive for + # constant backed SymInts, but that assert being strict has led to some good signal in hunting bugs, and + # I'd like to keep it around for now. 
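Editor's note: the static properties that specialize() records correspond to metadata that is directly readable from an eager tensor. A plain-tensor sketch of the same information (editor's example, not the tracer code itself):

import torch

t = torch.randn(2, 3, 8, 8).contiguous(memory_format=torch.channels_last)
props = {
    "dtype": t.dtype,                  # torch.float32
    "device": t.device,                # cpu
    "layout": t.layout,                # torch.strided
    "ndim": t.ndim,                    # 4
    "requires_grad": t.requires_grad,  # False
    "size": tuple(t.size()),           # (2, 3, 8, 8)
    "stride": tuple(t.stride()),       # (192, 1, 24, 3) for channels_last
    "is_contiguous": t.is_contiguous(memory_format=torch.channels_last),  # True
}
print(props)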
+ props["size"] = tuple( + # the non is_symbolic case applies to the jagged layout + # NestedTensor case as singleton ints are not symbolic + [int(s) if is_symbolic(s) else s for s in value.size()] + ) + props["stride"] = tuple(value.stride()) + if torch._C._functorch.is_batchedtensor(value): + # Batched tensors does not support contiguity patterns, so + # we refrain from computing the `is_contiguous` property + props["is_contiguous"] = None + else: + props["is_contiguous"] = tuple( + [ + x + for x in torch._prims_common._memory_formats + if value.is_contiguous(memory_format=x) + ] + ) + return props + + def dynamic_getattr(self, tx, name): + fake_val = self.proxy.node.meta["example_value"] + # For getattrs on tensors without sources, + # we can do better than the default (creating a GetAttrVariable) + # if: + # (1) the tensor is a traceable tensor subclass + # (2) We are getattr'ing an inner tensor from that subclass + if not self.source and is_traceable_wrapper_subclass(fake_val): + fake_val = self.proxy.node.meta["example_value"] + attrs, ctx = fake_val.__tensor_flatten__() + proxy = getattr(self.as_proxy(), name) + example_value = getattr(fake_val, name) + if name in attrs: + # attrs returned from tensor_flatten are always tensors + assert isinstance(example_value, torch.Tensor) + from .builder import wrap_fx_proxy + + return wrap_fx_proxy(tx=tx, proxy=proxy, example_value=example_value) + # any other attributes on the subclass (that are not methods) + # are assumed to be constant metadata. + elif not callable(example_value): + from .builder import SourcelessBuilder + + return SourcelessBuilder()(tx, example_value) + + if not (self.source and self.source.subguards_allowed()): + raise NotImplementedError() + + # For local source, we associate the real value. We use this real value + # for implementing getattr fallthrough on the variable tracker base class. + + # Note - this scope construction is mirrored in guards + # A subsequent PR will introduce a util. + scope = {"L": tx.output.local_scope, "G": tx.output.global_scope} + try: + # We raise in case we get a typerror bug w/ SuperSource. + # SuperSource has bugs in it atm, and can produce code like + # eval("super(L['mod'].model.model.encoder.embed_positions.forward__class__, + # L['mod'].model.model.encoder.embed_positions)", scope) + # Which is incorrect, and violates the invariant that all sources should be eval()-able against the scope. + _input_associated_real_value = eval(self.source.name(), scope) + except Exception as exc: + raise NotImplementedError() from exc + + if _input_associated_real_value is None: + raise NotImplementedError() + + if object_has_getattribute(_input_associated_real_value): + raise NotImplementedError() + + if get_custom_getattr(_input_associated_real_value): + raise NotImplementedError() + + real_value = getattr(_input_associated_real_value, name) + if callable(real_value): + # Callables have more nuanced handling, and we should let the existing system delegate here. + # Raising was past behavior and so should always be sound to fall back. 
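Editor's note: the two bail-out checks above refuse to proceed when the underlying object customizes attribute lookup. Roughly, and only as an illustrative sketch (the real helpers live in torch._dynamo.utils and differ in detail, e.g. they special-case nn.Module), the checks amount to:

class Plain:
    attr = 1

class Dynamic:
    def __getattr__(self, name):
        return f"dynamic:{name}"

def has_custom_getattribute(obj) -> bool:
    # True only when the class overrides __getattribute__ itself
    return type(obj).__getattribute__ is not object.__getattribute__

def custom_getattr(obj):
    # A user-defined __getattr__ if one exists, else None
    return getattr(type(obj), "__getattr__", None)

print(has_custom_getattribute(Plain()), custom_getattr(Plain()))      # False None
print(has_custom_getattribute(Dynamic()), custom_getattr(Dynamic()))  # False <function Dynamic.__getattr__ ...>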
+ # Note - at a certain point we may want to handle + raise NotImplementedError() + + from ..guards import GuardBuilder + from .builder import VariableBuilder + + attr_source = AttrSource(self.source, name) + install_guard(attr_source.make_guard(GuardBuilder.HASATTR)) + return VariableBuilder(tx, attr_source)(real_value) + + def method_attr_ndim(self, tx): + if self.ndim is not None: + return ConstantVariable.create(self.ndim) + else: + return self.call_method(tx, "dim", [], {}) + + def method_attr_dtype(self, tx): + if self.dtype is not None: + return ConstantVariable.create(self.dtype) + + def method_attr_device(self, tx): + if self.device is not None: + return ConstantVariable.create(self.device) + + def method_attr_layout(self, tx): + if self.layout is not None: + return ConstantVariable.create(self.layout) + + def method_attr_is_cuda(self, tx): + if self.device is not None: + return ConstantVariable.create(self.device.type == "cuda") + + def method_attr_shape(self, tx): + if self.size is not None: + sizes = [variables.ConstantVariable.create(x) for x in self.size] + return SizeVariable(sizes) + else: + return self.call_method(tx, "size", [], {}) + + def method_attr_requires_grad(self, tx): + if self.requires_grad is not None: + return ConstantVariable.create(self.requires_grad) + + def method_attr_is_quantized(self, tx): + if self.is_quantized is not None: + return ConstantVariable.create(self.is_quantized) + + def method_attr_is_sparse(self, tx): + if self.is_sparse is not None: + return ConstantVariable.create(self.is_sparse) + + def method_attr_data(self, tx): + return self.call_method(tx, "detach", [], {}) + + def method_attr__version(self, tx): + from ..tensor_version_op import _tensor_version + + return variables.TorchInGraphFunctionVariable(_tensor_version).call_function( + tx, [self], {} + ) + + def var_getattr(self, tx, name): + from . import UserDefinedClassVariable + + if tx.strict_checks_enabled: + if name in self._strict_mode_banned_ops(): + unimplemented(f"Illegal getattr invocation {name} in strict mode") + + if name == "__class__": + return UserDefinedClassVariable(self.python_type()) + + handler = getattr(self, f"method_attr_{name}", None) + result = handler(tx) if handler is not None else None + + # Add a guard for type matching, these guards are checked before tensor guards + # In some cases, a . guard can be evaluated first, and break if + # is later changed to another type + if ( + result is not None + and self.source + and self.source.subguards_allowed() + and not ( + name not in ("grad", "requires_grad") and result.is_python_constant() + ) + ): + install_guard(self.make_guard(GuardBuilder.TYPE_MATCH)) + result.source = AttrSource(self.source, name) + + # It's hard to get inplace view (metadata mutation) on graph input work properly across + # dynamo/aot/inductor, just fall back. + if self.source is not None and hasattr(torch.ops.aten, name): + fn = getattr(torch.ops.aten, name) + if ( + hasattr(fn, "overloads") + and hasattr(fn, fn.overloads()[0]) + and torch.Tag.inplace_view in getattr(fn, fn.overloads()[0]).tags + ): + # Delay the graph break to the actual call of unsqueeze_/resize_/resize_as_ etc. + return variables.misc.DelayGraphBreakVariable( + source=AttrSource(self.source, name) + ) + + # For attributes (not methods) that were not caught in the special handling above, + # (e.g. tensor.real), we handle these generically, assuming that the output type is + # a tensor. 
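Editor's note: the generic handling that follows keys off CPython descriptor types: data-like tensor attributes are exposed as getset descriptors, while methods appear as method descriptors. A quick check in plain Python (editor's sketch):

import types
import torch

print(type(torch.Tensor.H))                                  # getset_descriptor (data-like attribute)
print(type(torch.Tensor.H) is types.GetSetDescriptorType)    # True
print(type(torch.Tensor.size))                               # method_descriptor (a method, not an attribute)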
+ if result is None and name != "grad": + + def try_generic_attr_handling(): + from .builder import wrap_fx_proxy + from .misc import GetAttrVariable + + try: + static_attr = inspect.getattr_static(torch.Tensor, name) + except AttributeError: + return None + + # Make sure this is an attribute, not a method. + # type(torch.Tensor.H) should be "getset_descriptor" + # This is a because of CPython implementation, see THPVariableType: + # these attributes are implemented under tp_getset, which appear + # as `getset_descriptor`s, (compared to, say, methods which appear + # as `method_descriptor`s) + if type(static_attr) != types.GetSetDescriptorType: + return None + + proxy = GetAttrVariable.create_getattr_proxy(self.as_proxy(), name) + if self.source is not None: + return wrap_fx_proxy( + tx=tx, proxy=proxy, source=AttrSource(self.source, name) + ) + else: + return wrap_fx_proxy(tx=tx, proxy=proxy) + + result = try_generic_attr_handling() + + if result is None: + result = self.dynamic_getattr(tx, name) + + if result is None: + raise NotImplementedError() + return result + + def has_unpack_var_sequence(self, tx): + return self.ndim > 0 + + def unpack_var_sequence(self, tx, idxes=None): + from .builder import wrap_fx_proxy_cls + + if idxes is None: + if self.size: + length = self.size[0] + else: + dyn_length = self.call_method( + tx, "size", [ConstantVariable.create(0)], {} + ) + # SymNodeVariable for symbolic sizes, ConstantVariable for constants OR values produced through + # symbolic_shapes, but that end up as int/sympy.Integer + assert isinstance(dyn_length, (SymNodeVariable, ConstantVariable)) + if isinstance(dyn_length, SymNodeVariable): + length = dyn_length.evaluate_expr(tx.output) + else: + length = dyn_length.value + idxes = range(length) + return [ + wrap_fx_proxy_cls(target_cls=type(self), tx=tx, proxy=self.as_proxy()[i]) + for i in idxes + ] + + def _strict_mode_banned_ops(self): + return torch._dynamo.config._autograd_backward_strict_mode_banned_ops + + def call_method( + self, + tx, + name, + args: "List[VariableTracker]", + kwargs: "Dict[str, VariableTracker]", + ) -> "VariableTracker": + if tx.strict_checks_enabled: + if name in self._strict_mode_banned_ops(): + unimplemented(f"Illegal method invocation {name} in strict mode") + + """ + Dispatch to a method-specific handler defined below. If the + handler returns None (or doesn't exist) we put the method call + in the graph. 
+ """ + try: + handler_method = getattr(self, f"method_{name}") + except AttributeError: + pass + else: + try: + result = handler_method(*args, **kwargs) + if result: + return result + except TypeError as e: + unimplemented(f"unhandled args for {name}: {e}") + + from .builder import wrap_fx_proxy + + return wrap_fx_proxy( + tx, + tx.output.create_proxy( + "call_method", + name, + *proxy_args_kwargs([self, *args], kwargs), + ), + ) + + def method_size(self, *args, **kwargs): + return self._method_size_stride("size", *args, **kwargs) + + def method_stride(self, *args, **kwargs): + return self._method_size_stride("stride", *args, **kwargs) + + def _method_size_stride(self, name, dim=None): + dim = guard_if_dyn(dim) + + def make_const_size_variable(x, **options): + return SizeVariable( + [ConstantVariable.create(y, **options) for y in x], **options + ) + + RetVariable = ( + make_const_size_variable if name == "size" else ConstantVariable.create + ) + + # Technically, this should not be necessary, but I'm including it + # for enhanced BC, in case example_value is sometimes not set + # (it really should always be set though!) + if (r := getattr(self, name)) is not None: + if dim is None: + return RetVariable(r) + else: + return ConstantVariable.create(r[dim]) + + # It might still be constant! Consult the fake tensor and see + if (fake := self.proxy.node.meta.get("example_value")) is not None: + if dim is None: + fake_r = getattr(fake, name)() + if not has_free_symbols(fake_r): + # int conversion for safety, in case a SymInt refined + # to constant + return RetVariable(tuple(int(r) for r in fake_r)) + else: + fake_r = getattr(fake, name)(dim) + if not has_free_symbols(fake_r): + return ConstantVariable.create(int(fake_r)) + + def method_numel(self): + if self.size is not None: + return ConstantVariable.create(product(self.size)) + + # It might still be constant! Consult the fake tensor and see + if (fake := self.proxy.node.meta.get("example_value")) is not None: + fake_r = fake.numel() + if not has_free_symbols(fake_r): + return ConstantVariable.create(int(fake_r)) + + method_nelement = method_numel + + def method_dim(self): + if self.ndim is not None: + return ConstantVariable.create(self.ndim) + + method_ndimension = method_dim + + def method_is_floating_point(self): + if self.dtype is not None: + return ConstantVariable.create(self.dtype.is_floating_point) + + def method_is_contiguous(self, memory_format=None): + memory_format = ( + memory_format.as_python_constant() + if memory_format is not None + else torch.contiguous_format + ) + if self.is_contiguous is not None: + return ConstantVariable.create(memory_format in self.is_contiguous) + elif (fake := self.proxy.node.meta.get("example_value")) is not None: + return ConstantVariable.create( + fake.is_contiguous(memory_format=memory_format) + ) + + def method_type(self, dtype=None, non_blocking=False, **kwargs): + if ( + dtype is None + and self.dtype is not None + and isinstance(self.device, torch.device) + ): + tensortype = next( + k for k, v in tensortype_to_dtype.items() if self.dtype in v + ) + if self.device.type == "cuda": + return ConstantVariable.create(f"torch.cuda.{tensortype.__name__}") + else: + return ConstantVariable.create(f"torch.{tensortype.__name__}") + elif ( + dtype is not None + and fqn(type(dtype.as_python_constant())) == "torch.tensortype" + ): + # torch.FloatTensor, etc. are all of type "torch.tensortype". + # torch.fx's tracer fails on these types, because it doesn't support arguments of torch.tensortype type. 
+ # So, we pass it in as a string (which is also supported, see above implementation for .type() with 0 args) + tensor_type = dtype.as_python_constant() + tensor_type_const = ConstantVariable.create(fqn(tensor_type)) + + from ..symbolic_convert import InstructionTranslator + from .builder import wrap_fx_proxy + + tx = InstructionTranslator.current_tx() + + if non_blocking: + kwargs = {"non_blocking": non_blocking, **kwargs} + + return wrap_fx_proxy( + tx, + tx.output.create_proxy( + "call_method", + "type", + *proxy_args_kwargs([self, tensor_type_const], kwargs), + ), + ) + + def method_as_subclass(self, cls): + if isinstance(cls, TensorSubclassVariable) and cls.source: + from ..symbolic_convert import InstructionTranslator + from .builder import VariableBuilder + from .torch_function import TensorWithTFOverrideVariable + + tx = InstructionTranslator.current_tx() + + # [Note: __torch_function__] coerce this tensor variable into a TensorWithTFOverrideVariable + # in eager, this is just a type change. This isn't sound if a __torch_function__ tensor subclass + # defines a constructor, but if only a __torch_function__ impl is defined, this is okay to call. + # It is up to the user whether this is correct behavior or not. + py_cls = cls.as_python_constant() + torch_fn = VariableBuilder( + tx, + AttrSource(AttrSource(cls.source, "__torch_function__"), "__func__"), + )(py_cls.__torch_function__.__func__) + + return TensorWithTFOverrideVariable.from_tensor_var( + tx, self, py_cls, torch_fn + ) + + def method_get_device(self): + if isinstance(self.device, torch.device): + index = self.device.index if self.device.type != "cpu" else -1 + return ConstantVariable.create(index) + + def method_element_size(self): + return ConstantVariable.create(self.dtype.itemsize) + + def method_numpy(self, *, force=False): + if not config.trace_numpy: + unimplemented("Tensor.numpy(). config.trace_numpy is False") + if not np: + unimplemented("Tensor.numpy(). NumPy is not available") + if self.layout != torch.strided: + raise TypeError( + f"can't convert {self.layout} layout tensor to numpy. Use Tensor.dense() first" + ) + from ..symbolic_convert import InstructionTranslator + + tx = InstructionTranslator.current_tx() + + # We don't check that the tensor is on CPU when force is False, as this + # allows us to execute NumPy code on CUDA. Same for requires_grad=True + if force and force.as_python_constant(): + # If the user set force=True we try to preserve the semantics (no gradients, move to CPU...) 
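Editor's note: in eager terms, force=True is what lets .numpy() succeed on tensors that would otherwise refuse the conversion. A small usage sketch (assumes NumPy is installed):

import torch

t = torch.arange(3.0, requires_grad=True)
# t.numpy() would raise here because the tensor requires grad;
# force=True detaches first (and would also move a GPU tensor to CPU).
arr = t.numpy(force=True)
print(arr)   # [0. 1. 2.]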
+ t = self.call_method(tx, "detach", [], {}) + proxy = tx.output.create_proxy("call_method", "cpu", (t.as_proxy(),), {}) + else: + # Hacky way to create a view of self that will be marked as NumpyNdarrayVariable + proxy = tx.output.create_proxy( + "call_method", "view_as", *proxy_args_kwargs([self, self], {}) + ) + return NumpyNdarrayVariable.create(tx, proxy) + + def method_tolist(self): + from ..symbolic_convert import InstructionTranslator + from .builder import SourcelessBuilder + + tx = InstructionTranslator.current_tx() + + def tolist(tensor, sub_proxy): + def wrap(i, sub_proxy): + return SymNodeVariable.create( + tx, + sub_proxy.item(), + sym_num=tx.output.shape_env.create_unbacked_symint(), + ) + + if tensor.dtype not in [ + torch.int8, + torch.int16, + torch.int32, + torch.int64, + ]: + unimplemented("Input tensor for tolist must be an integer tensor") + + if tensor.dim() == 0: + return wrap(tensor, sub_proxy) + + if tensor.dim() == 1: + return [wrap(val, sub_proxy[i]) for i, val in enumerate(tensor)] + + return [ + tolist(sub_tensor, sub_proxy=sub_proxy[i]) + for i, sub_tensor in enumerate(tensor) + ] + + tensor = self.as_proxy().node.meta["example_value"] + out = tolist(tensor, self.as_proxy()) + return SourcelessBuilder()(tx, out) + + def method_backward(self, *args, **kwargs): + unimplemented("Tensor.backward") + + def method_data_ptr(self, *args, **kwargs): + unimplemented("Tensor.data_ptr") + + def method_item(self, *args, **kwargs): + if not config.capture_scalar_outputs: + unimplemented("Tensor.item") + + def method___len__(self): + from ..symbolic_convert import InstructionTranslator + + tx = InstructionTranslator.current_tx() + return self.call_method(tx, "size", [ConstantVariable.create(0)], {}) + + def method___setitem__(self, key, value): + def has_bool_key(v): + if isinstance(v, TensorVariable): + return v.dtype in (torch.bool, torch.int8) + elif isinstance(v, variables.TupleVariable): + return any(has_bool_key(item) for item in v.items) + else: + return False + + if ( + has_bool_key(key) + and isinstance(value, TensorVariable) + and value.requires_grad + and torch.is_grad_enabled() + ): + unimplemented( + "boolean masking setitem backwards, see https://github.com/pytorch/pytorch/issues/114123" + ) + from ..symbolic_convert import InstructionTranslator + + tx = InstructionTranslator.current_tx() + tx.output.create_proxy( + "call_function", + operator.setitem, + *proxy_args_kwargs([self, key, value], {}), + ) + return ConstantVariable.create(None) + + def method_resize_(self, *args, **kwargs): + unimplemented("Tensor.resize_") + + def method_resize_as_(self, *args, **kwargs): + unimplemented("Tensor.resize_as_") + + def method_set_(self, *args, **kwargs): + if len(args) > 1: + # torch.Tensor.set_() has several overloads. + # aten::set_.source_Tensor(Tensor) gets special handling + # in AOTAutograd and functionalization, because it is the most common + # overload and is used by FSDP. + # graph-breaking on aten::set_source_Tensor_storage_offset for now, + # unless we find that we need to make it work. 
+ unimplemented("Tensor.set_.source_Tensor_storage_offset") + + def method_add_(self, other, *, alpha=None): + if alpha is not None: + from ..symbolic_convert import InstructionTranslator + + tx = InstructionTranslator.current_tx() + result = variables.TorchInGraphFunctionVariable(torch.mul).call_function( + tx, [other, alpha], {} + ) + return self.call_method(tx, "add_", [result], {}) + + def method_addcdiv_(self, tensor1, tensor2, *, value=None): + from ..symbolic_convert import InstructionTranslator + + tx = InstructionTranslator.current_tx() + if value is not None: + result = variables.TorchInGraphFunctionVariable(torch.div).call_function( + tx, [tensor1, tensor2], {} + ) + result = variables.TorchInGraphFunctionVariable(torch.mul).call_function( + tx, [result, value], {} + ) + return self.call_method(tx, "add_", [result], {}) + + def method___contains__(self, arg): + from ..symbolic_convert import InstructionTranslator + + tx = InstructionTranslator.current_tx() + + # Rewrite __contains__ here so that downstream passes can trace through + # without dealing with unbacked symbool. Roughly the code we translate is: + # def __contains__(self, x): + # return (x == self).any().item() + result = variables.TorchInGraphFunctionVariable(torch.eq).call_function( + tx, [self, arg], {} + ) + result = variables.TorchInGraphFunctionVariable(torch.any).call_function( + tx, [result], {} + ) + return result.call_method(tx, "item", [], {}) + + def method_redistribute(self, *args, **kwargs): + from ..symbolic_convert import InstructionTranslator + + tx = InstructionTranslator.current_tx() + # rewrite non-primitive args/kwargs to be included in the on-the-fly prim function + # and rewrite args to have only proxyable args, then insert call_function + args_as_value = [x.as_python_constant() for x in args] + kwargs_as_value = {k: v.as_python_constant() for k, v in kwargs.items()} + + def redistribute_fn_with_prim_types(x): + return x.redistribute(*args_as_value, **kwargs_as_value) + + # attach the same function name for better debugging + redistribute_fn_with_prim_types.__name__ = "prim_redistribute" + + from .builder import wrap_fx_proxy + + return wrap_fx_proxy( + tx=tx, + proxy=tx.output.create_proxy( + "call_function", + redistribute_fn_with_prim_types, + *proxy_args_kwargs([self], {}), + ), + ) + + def method_register_hook(self, *args, **kwargs): + return self._method_register_hook("register_hook", *args, **kwargs) + + def method_register_post_accumulate_grad_hook(self, *args, **kwargs): + return self._method_register_hook( + "register_post_accumulate_grad_hook", *args, **kwargs + ) + + def _method_register_hook(self, name: str, hook: VariableTracker): + # Note - do not arbitrarily add hooks here - make sure they match the same contract + # see [On tensor.register_hook] + from ..symbolic_convert import InstructionTranslator + + tx = InstructionTranslator.current_tx() + + if not self.source: + if not compiled_autograd.compiled_autograd_enabled: + # TODO(voz): + # We can relax this by speculating the callable and ensuring that it doesn't modify arbitrary + # python state. + # We *Must* be in compiled_autograd here because backward hooks can contain anything, and it is unsafe to run + # them in a compiled bwd without re-entering dynamo as compiled_autograd does. + # + # Discussion point 1 - Should we bypass this if nopython/fullgraph = True? + # No. Because this was going to be a graph break anyway - this check does not + # introduce new graph breaks where there were none. 
+ # + # Discussion point 2 - Should we defer this check to backwards? + # No. Because compiled autograd is not yet ready for prime time. As such, if we defer, a user + # would have no recourse - their forward traces just fine, but will fail at backwards unless + # compiled_autograd is enabled. If compiled_autograd fails (there are a lot of failures today) + # then they have nothing they can do except disable compile. + unimplemented( + "Compilation of intermediate hooks requires compiled autograd" + ) + + hook_name, bw_state_proxy = tx.output.add_backward_state_hook(hook) + + def _register_hook_trampoline(tensor, bw_state): + register_hook = getattr(tensor, name) + register_hook( + functools.partial( + trace_wrapped, + fn=call_hook_from_backward_state, + bw_state=bw_state, + hook_name=hook_name, + ) + ) + # TODO(jansel): returning None here is wrong, it should be + # RemovableHandle, but we need some extra work to support + # this properly. + return None + + from .builder import wrap_fx_proxy + + return wrap_fx_proxy( + tx, + tx.output.create_proxy( + "call_function", + _register_hook_trampoline, + (self.as_proxy(), bw_state_proxy), + {}, + ), + ) + + handle_variable = variables.RemovableHandleVariable( + mutable_local=variables.base.MutableLocal(), + ) + tx.output.side_effects.register_hook(self, hook, handle_variable, name) + return handle_variable + + def method_requires_grad_(self, requires_grad=True): + if requires_grad is not True: + requires_grad = requires_grad.as_python_constant() + + if self.as_proxy().node.meta["example_value"].requires_grad != requires_grad: + unimplemented("Tensor.requires_grad_") + else: + return self + + def method_new(self, *args, **kwargs): + # Convert x.new(torch.Size) into x.new_empty(torch.Size), + # as Tensor.new acts differently with a Size input versus a tuple input. + if (len(args) == 1 and isinstance(args[0], SizeVariable)) or ( + len(args) >= 1 + and all( + isinstance(a, ConstantVariable) and a.python_type() == int for a in args + ) + ): + from ..symbolic_convert import InstructionTranslator + + return self.call_method( + InstructionTranslator.current_tx(), "new_empty", args, kwargs + ) + + def method_untyped_storage(self): + return UntypedStorageVariable( + self, self.as_proxy().node.meta["example_value"].untyped_storage() + ) + + def rename(self, tx, name): + self.proxy.node._rename(name) + return super().rename(tx, name) + + +class SymNodeVariable(VariableTracker): + """ + Represents a symbolic size, e.g., as returned by tensor.size(0) + """ + + @classmethod + def create(cls, tx, proxy, sym_num, **options): + if "example_value" in proxy.node.meta: + assert proxy.node.meta["example_value"] == sym_num + if sym_num is None: + sym_num = get_fake_value(proxy.node, tx) + proxy.node.meta["example_value"] = sym_num + + if isinstance(sym_num, (sympy.Integer, int, bool)): + sym_num = int(sym_num) if isinstance(sym_num, sympy.Integer) else sym_num + return ConstantVariable.create(sym_num) + + return SymNodeVariable(proxy, sym_num, **options) + + def __init__(self, proxy, sym_num, **kwargs): + super().__init__(**kwargs) + self.proxy = proxy + # TODO: Should we allow non SymTypes here? 
Today it is allowed + self.sym_num = sym_num + + def python_type(self): + if isinstance(self.sym_num, SymTypes): + return self.sym_num.node.pytype + else: + return type(self.sym_num) + + def as_proxy(self): + return self.proxy + + def evaluate_expr(self, output_graph=None): + try: + return guard_scalar(self.sym_num) + except GuardOnDataDependentSymNode as e: + raise UserError( # noqa: TRY200 + UserErrorType.ANTI_PATTERN, + f"Consider annotating your code using torch._constrain_as_*(). {str(e)}", + case_name="constrain_as_size_example", + ) + + def call_method( + self, + tx, + name, + args: "List[VariableTracker]", + kwargs: "Dict[str, VariableTracker]", + ) -> "VariableTracker": + from .builder import wrap_fx_proxy + + return wrap_fx_proxy( + tx, + tx.output.create_proxy( + "call_method", + name, + *proxy_args_kwargs([self, *args], kwargs), + ), + ) + + +class NumpyNdarrayVariable(TensorVariable): + """ + Represents a np.ndarray, but backed by torch Tensor via torch._numpy.ndarray. + Use this for Tensor.numpy() call. + """ + + @staticmethod + def create(tx, proxy, **options): + from .builder import wrap_fx_proxy_cls + + return wrap_fx_proxy_cls( + target_cls=NumpyNdarrayVariable, + tx=tx, + proxy=proxy, + **options, + ) + + def var_getattr(self, tx, name): + # NB: This INTENTIONALLY does not call super(), because there is + # no intrinsic reason ndarray properties are related to Tensor + # properties. The inheritance here is for implementation sharing. + + from ..utils import numpy_attr_wrapper + from .builder import wrap_fx_proxy + + result = None + + example_value = self.as_proxy().node.meta["example_value"] + example_ndarray = tnp.ndarray(example_value) + + def insert_into_graph(): + return wrap_fx_proxy( + tx, + tx.output.create_proxy( + "call_function", numpy_attr_wrapper, (self.as_proxy(), name), {} + ), + ) + + if name in ["T", "real", "imag"]: + proxy = tx.output.create_proxy( + "call_function", + numpy_attr_wrapper, + (self.as_proxy(), name), + {}, + ) + result = NumpyNdarrayVariable.create(tx, proxy) + + # These are awkward to implement. The standard playbook for torch._numpy + # interop is to trace a call into the torch._numpy wrapper which works for + # Tensor operations. However, we don't want to do this for calls + # that don't return Tensors, because in those cases we may not want + # to trace the attribute access into the graph at all (it is sort + # of harmless to do so, because AOTAutograd will eliminate them, + # but it's best not to trace them in to begin with.) But in any + # case, tracing these into the graph is like trying to fit a square + # peg into a round hole; best not to do it. So instead we + # painstakingly implement these by hand + # + # NB: only ALWAYS specialized attributes can go here; notably, + # size/shape not allowed! 
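+        # (ndim and itemsize depend only on the array's rank and dtype, so baking them
+        # in as Python constants is safe; shape/size may carry symbolic dims and get
+        # the has_free_symbols treatment below.)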
+ elif name in ("ndim", "itemsize"): + return ConstantVariable.create(getattr(example_ndarray, name)) + elif name in ("shape", "stride"): + if not has_free_symbols(r := getattr(example_ndarray, name)): + return ConstantVariable.create(tuple(int(r) for r in r)) + return insert_into_graph() + elif name == "size": + if not has_free_symbols(r := example_ndarray.size): + return ConstantVariable.create(int(r)) + return insert_into_graph() + elif name in ["base", "flags", "dtype"]: + unimplemented(f"TODO: add support for ndarray.{name}") + elif name in ["__version__"]: + unimplemented("delegate np.__version__ to NumPy") + if result is None: + raise NotImplementedError() + return result + + @staticmethod + def patch_args(name, args, kwargs): + if name == "clip": + kwargs_rename = {"a_min": "min", "a_max": "max"} + kwargs = {kwargs_rename.get(k, k): v for k, v in kwargs.items()} + return args, kwargs + + def call_method( + self, + tx, + name, + args: "List[VariableTracker]", + kwargs: "Dict[str, VariableTracker]", + ) -> "VariableTracker": + from ..utils import numpy_method_wrapper + + args, kwargs = self.patch_args(name, args, kwargs) + + if name in ["__len__", "size", "tolist"]: + # delegate back to TensorVariable + return super().call_method(tx, name, args, kwargs) + if name == "tobytes": + unimplemented("tobytes is not modelled in torch._numpy") + proxy = tx.output.create_proxy( + "call_function", + numpy_method_wrapper(name), + *proxy_args_kwargs([self] + list(args), kwargs), + ) + return NumpyNdarrayVariable.create(tx, proxy) + + def python_type(self): + return np.ndarray + + +class UnspecializedPythonVariable(TensorVariable): + """ + This is a 1-element tensor represents unspecialized python float/int. + """ + + def __init__( + self, proxy: torch.fx.Proxy, *, raw_value=None, need_unwrap=True, **kwargs + ): + super().__init__(proxy, **kwargs) + self.raw_value = raw_value + self.need_unwrap = need_unwrap + + @classmethod + def from_tensor_variable(cls, tensor_variable, raw_value, need_unwrap=True): + # Convert a `TensorVariable` instance into an `UnspecializedPythonVariable` instance. + return UnspecializedPythonVariable( + **dict(tensor_variable.__dict__), + raw_value=raw_value, + need_unwrap=need_unwrap, + ) + + +class FakeItemVariable(TensorVariable): + """An unspecialized python variable which prevents access to the underlying raw value. 
+ This is needed if item is called on a FakeTensor.""" + + def __init__(self, proxy: torch.fx.Proxy, **kwargs): + need_unwrap = kwargs.pop("need_unwrap", False) + super().__init__(proxy, **kwargs) + self.need_unwrap = need_unwrap + + @classmethod + def from_tensor_variable(cls, tensor_variable): + return FakeItemVariable(**dict(tensor_variable.__dict__)) + + +class TensorSubclassVariable(VariableTracker): + def __init__(self, value, *args, **kwargs): + self.value = value + super().__init__(*args, **kwargs) + + def call_function( + self, tx, args: List[VariableTracker], kwargs: Dict[str, VariableTracker] + ) -> VariableTracker: + if len(args) == 1 and isinstance(args[0], TensorVariable): + from .builder import VariableBuilder + from .torch_function import TensorWithTFOverrideVariable + + torch_fn = VariableBuilder( + tx, AttrSource(self.source, "__torch_function__") + )(self.value.__torch_function__) + + return TensorWithTFOverrideVariable.from_tensor_var( + tx, args[0], self.value, torch_fn + ) + + return super().call_function(tx, args, kwargs) + + def as_python_constant(self): + return self.value + + def python_type(self): + return type(self.value) + + +class UntypedStorageVariable(VariableTracker): + _nonvar_fields = { + "example_value", + *VariableTracker._nonvar_fields, + } + + def __init__( + self, + from_tensor: TensorVariable, + example_value: torch.UntypedStorage, + **kwargs, + ): + super().__init__(**kwargs), + self.from_tensor = from_tensor + # Example_value will always have device="meta" + self.example_value = example_value + + def call_method( + self, + tx, + name, + args: List[VariableTracker], + kwargs: Dict[str, VariableTracker], + ) -> VariableTracker: + if name == "size": + assert not args + assert not kwargs + result = self.example_value.size() + if not has_free_symbols(result): + # avoid creating a node in the graph + return ConstantVariable.create(int(result)) + else: + from ..external_utils import untyped_storage_size + from .builder import wrap_fx_proxy + + return wrap_fx_proxy( + tx, + tx.output.create_proxy( + "call_function", + untyped_storage_size, + (self.from_tensor.as_proxy(),), + {}, + ), + ) + if name == "resize_" and len(args) == 1: + assert not kwargs + tx.output.create_proxy( + "call_function", + torch.ops.inductor.resize_storage_bytes_, + (self.from_tensor.as_proxy(), args[0].as_proxy()), + {}, + ) + return self + + return super().call_method(tx, name, args, kwargs) + + def reconstruct(self, codegen): + codegen(self.from_tensor) + codegen.append_output(codegen.create_load_method("untyped_storage")) + codegen.extend_output(create_call_method(0)) diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/torch.py b/MLPY/Lib/site-packages/torch/_dynamo/variables/torch.py new file mode 100644 index 0000000000000000000000000000000000000000..7ac021468c7d37479eeeb8e95dfb58fb628ceb78 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/variables/torch.py @@ -0,0 +1,823 @@ +# mypy: ignore-errors + +import inspect +import logging + +import math +import re +from typing import Dict, List + +import torch._C +import torch._refs +import torch.fx +import torch.nn +import torch.onnx.operators +from torch._logging import warning_once + +from torch._streambase import _StreamBase +from ..._guards import TracingContext +from .. 
import config, polyfill, variables +from ..codegen import PyCodegen +from ..device_interface import get_registered_device_interfaces +from ..exc import unimplemented +from ..guards import GuardBuilder, install_guard +from ..source import SyntheticLocalSource +from ..utils import ( + check_constant_args, + check_unspec_python_args, + guard_if_dyn, + has_torch_function, + hashable, + product, + proxy_args_kwargs, + unwrap_if_wrapper, +) +from .base import VariableTracker +from .ctx_manager import ( + AutocastModeVariable, + NullContextVariable, + TorchFunctionDisableVariable, +) +from .distributed import is_constant_pg_functions, is_from_local, ProcessGroupVariable +from .lists import ListVariable, TupleVariable +from .torch_function import can_dispatch_torch_function, dispatch_torch_function + +try: + import numpy as np +except ModuleNotFoundError: + np = None + +log = logging.getLogger(__name__) + +supported_ctx_manager_classes = { + torch.profiler.profiler.profile, + torch.autograd.profiler.profile, + torch.autograd.profiler.record_function, + torch._C.DisableTorchFunctionSubclass, + torch._functorch.vmap.vmap_increment_nesting, + torch._functorch.eager_transforms.grad_increment_nesting, + torch._functorch.eager_transforms.enable_inplace_requires_grad, + torch.amp.autocast_mode.autocast, + torch.autograd.grad_mode.enable_grad, + torch.autograd.grad_mode.inference_mode, + torch.autograd.grad_mode.no_grad, + torch.autograd.grad_mode.set_grad_enabled, + torch.autograd.graph.disable_saved_tensors_hooks, + torch.cpu.amp.autocast_mode.autocast, + torch.cuda.amp.autocast_mode.autocast, +} + + +REWRITE_OPS_TO_TENSOR_SIZE_METHOD = [ + torch.onnx.operators.shape_as_tensor, + torch._shape_as_tensor, +] + +constant_fold_functions = [ + torch._assert, + torch._utils._get_device_index, + torch._C._get_cublas_allow_tf32, + torch.cuda.get_device_properties, + torch.cuda.is_available, + torch.distributed.is_available, + torch.get_autocast_gpu_dtype, + torch.get_default_dtype, + torch.is_autocast_cache_enabled, + torch.is_autocast_cpu_enabled, + torch.is_autocast_enabled, + torch.is_complex, + torch.is_floating_point, + torch.nn.functional._Reduction.get_enum, + torch.promote_types, + torch._C._get_privateuse1_backend_name, +] + + +if torch.distributed.is_available(): + constant_fold_functions.extend( + [ + torch.distributed.is_initialized, + torch.distributed.get_rank, + torch.distributed.get_world_size, + ] + ) + + +tracing_state_functions = { + torch.jit.is_scripting: False, + torch.jit.is_tracing: False, + torch._C._get_tracing_state: None, + torch.fx._symbolic_trace.is_fx_tracing: False, + torch.onnx.is_in_onnx_export: False, + torch._dynamo.external_utils.is_compiling: True, + torch._utils.is_compiling: True, + torch.compiler.is_compiling: True, + torch.compiler.is_dynamo_compiling: True, +} + + +class BaseTorchVariable(VariableTracker): + """common base for all torch.* functions, classes, modules and other things""" + + @classmethod + def create_with_source(cls, value, source): + install_guard(source.make_guard(GuardBuilder.FUNCTION_MATCH)) + return cls( + value, + source=source, + ) + + def __init__(self, value, **kwargs): + super().__init__(**kwargs) + self.value = value + + def reconstruct(self, codegen): + try: + name = f"{self.value.__module__}.{self.value.__name__}" + except Exception: + name = f"torch_obj_{id(self.value)}" + unique_var_name = "__" + re.sub(r"[^a-zA-Z0-9_]+", "_", name) + codegen.extend_output( + codegen.setup_globally_cached(unique_var_name, self.value, False) + ) + + def 
as_proxy(self): + return self.value + + def python_type(self): + return type(self.value) + + def as_python_constant(self): + return self.value + + def call_hasattr(self, tx, name): + result = hasattr(self.value, name) + return variables.ConstantVariable.create(result) + + def can_constant_fold_through(self): + if self.value in constant_fold_functions: + return True + return getattr(self.value, "__module__", None) == "math" + + +class TorchCtxManagerClassVariable(BaseTorchVariable): + """Points to a context manager class in torch.* that dynamo has implementations""" + + def __repr__(self): + return f"TorchCtxManagerClassVariable({self.value})" + + @staticmethod + def is_matching_cls(value): + # Unwrap if it's a functools.lru_cache wrapper + value = unwrap_if_wrapper(value) + # We can't do isinstance(value, type) check because some ctx managers + # are implemented as a function decorated by contextlib.contextmanager, + # E.g., torch._functorch.vmap.vmap_increment_nesting. + return hashable(value) and value in supported_ctx_manager_classes + + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + from . import ( + DisabledSavedTensorsHooksVariable, + GradIncrementNestingCtxManagerVariable, + GradInplaceRequiresGradCtxManagerVariable, + GradModeVariable, + InferenceModeVariable, + StreamVariable, + VmapIncrementNestingCtxManagerVariable, + ) + + if self.value is torch.no_grad: + if len(args) == 1 and isinstance( + args[0], variables.functions.BaseUserFunctionVariable + ): + ctx = GradModeVariable.create(tx, False) + return ctx.call_function(tx, args, kwargs) + else: + return GradModeVariable.create(tx, False) + elif self.value is torch.enable_grad: + if len(args) == 1 and isinstance( + args[0], variables.functions.BaseUserFunctionVariable + ): + ctx = GradModeVariable.create(tx, True) + return ctx.call_function(tx, args, kwargs) + return GradModeVariable.create(tx, True) + elif self.value is torch.set_grad_enabled and len(args) == 1: + return GradModeVariable.create( + tx, args[0].as_python_constant(), initialized=True + ) + elif self.value is torch.inference_mode: + assert len(args) <= 1 and len(kwargs) == 0 + inf_mode = args[0].as_python_constant() if len(args) == 1 else True + return InferenceModeVariable.create(tx, inf_mode) + elif inspect.isclass(self.value) and issubclass(self.value, _StreamBase): + from torch._dynamo.variables.builder import wrap_fx_proxy_cls + + return wrap_fx_proxy_cls( + StreamVariable, + tx, + tx.output.create_proxy( + "call_function", + self.value, + (), + {}, + ), + ) + elif self.value in ( + torch.amp.autocast_mode.autocast, + torch.cuda.amp.autocast, + torch.cpu.amp.autocast, + ): + return AutocastModeVariable.create(self.value, args, kwargs) + elif self.value in ( + torch.profiler.profile, + torch.profiler.record_function, + torch.autograd.profiler.profile, + torch.autograd.profiler.record_function, + ): + warning_once(log, "Profiler function %s will be ignored", self.value) + return NullContextVariable() + elif self.value is torch._C.DisableTorchFunctionSubclass: + assert not (args or kwargs) + return TorchFunctionDisableVariable.create(tx) + elif self.value is torch._functorch.vmap.vmap_increment_nesting: + assert len(args) == 2 + return VmapIncrementNestingCtxManagerVariable.create( + tx, + [guard_if_dyn(x) for x in args], + ) + elif self.value is torch._functorch.eager_transforms.grad_increment_nesting: + assert len(args) == 0 + return GradIncrementNestingCtxManagerVariable.create(tx) 
+ elif ( + self.value is torch._functorch.eager_transforms.enable_inplace_requires_grad + ): + assert len(args) == 1 + return GradInplaceRequiresGradCtxManagerVariable.create( + tx, + [guard_if_dyn(x) for x in args], + ) + elif self.value is torch.autograd.graph.disable_saved_tensors_hooks: + assert len(args) == 1 + return DisabledSavedTensorsHooksVariable.create( + tx, args[0].as_python_constant() + ) + + +class TorchInGraphFunctionVariable(BaseTorchVariable): + """Points to a torch function/method that should be put in FX graph""" + + def __repr__(self): + return f"TorchInGraphFunctionVariable({self.value})" + + def get_function(self): + return self.value + + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + from . import ( + ConstantVariable, + DeterministicAlgorithmsVariable, + GradModeVariable, + SDPAParamsVariable, + StreamContextVariable, + SymNodeVariable, + TensorVariable, + UserDefinedObjectVariable, + ) + + from .builder import wrap_fx_proxy, wrap_fx_proxy_cls + + constant_args = check_constant_args(args, kwargs) + unspec_python_args = check_unspec_python_args(args, kwargs) + + if self.can_constant_fold_through() and (constant_args or unspec_python_args): + # constant fold + return ConstantVariable.create( + self.as_python_constant()( + *[x.as_python_constant() for x in args], + **{k: v.as_python_constant() for k, v in kwargs.items()}, + ), + ) + elif self.value in tracing_state_functions: + assert not args and not kwargs + # See: https://github.com/pytorch/pytorch/issues/110765 + if self.value in ( + torch._utils.is_compiling, + torch._dynamo.external_utils.is_compiling, + torch.compiler.is_compiling, + torch.compiler.is_dynamo_compiling, + ): + tx.mark_inconsistent_side_effects() + return ConstantVariable.create(tracing_state_functions[self.value]) + elif self.value is torch.overrides.get_default_nowrap_functions.__wrapped__: + # [Note: __torch_function__] we return empty here because we restrict + # the set of functions that we trace __torch_function__ on to + # functions outside of the actual set. 
Implementing this properly will require implementing + # some variable types to track and compare tensor getset descriptors + from .builder import SourcelessBuilder + + return SourcelessBuilder()( + tx, torch.overrides.get_default_nowrap_functions() + ) + elif self.value == torch.ops.inductor.accumulate_grad_.default: + from .builder import SourcelessBuilder + + return tx.inline_user_function_return( + SourcelessBuilder()(tx, polyfill.accumulate_grad), args, kwargs + ) + elif self.value == math.radians and not (constant_args or unspec_python_args): + # Use polyfill to convert math.radians(x) into math.pi * x / 180.0 + from .builder import SourcelessBuilder + + return tx.inline_user_function_return( + SourcelessBuilder()(tx, polyfill.radians), args, kwargs + ) + elif self.value in (torch.is_tensor, torch.overrides.is_tensor_like): + assert len(args) == 1 + if isinstance(args[0], TensorVariable) or ( + self.value is torch.overrides.is_tensor_like + and isinstance(args[0], UserDefinedObjectVariable) + and hasattr(args[0].value, "__torch_function__") + ): + return ConstantVariable.create(True) + else: + return ConstantVariable.create(False) + elif self.value in ( + torch.is_floating_point, + torch.is_complex, + ): + input_arg = None + if args: + input_arg = args[0] + else: + assert "input" in kwargs + input_arg = kwargs["input"] + if isinstance(input_arg, TensorVariable) and input_arg.dtype is not None: + if self.value is torch.is_floating_point: + return ConstantVariable.create(input_arg.dtype.is_floating_point) + elif self.value is torch.is_complex: + return ConstantVariable.create(input_arg.dtype.is_complex) + else: + raise AssertionError(f"calling {self.value}") + elif ( + self.value is torch.numel + and isinstance(args[0], TensorVariable) + and args[0].size is not None + ): + return ConstantVariable.create(product(args[0].size)) + elif self.value in REWRITE_OPS_TO_TENSOR_SIZE_METHOD: + assert len(args) == 1 + assert isinstance(args[0], TensorVariable) + return args[0].call_method(tx, "size", [], {}) + elif self.value in ( + torch.nn.modules.utils._single, + torch.nn.modules.utils._pair, + torch.nn.modules.utils._triple, + torch.nn.modules.utils._quadruple, + torch.nn.modules.utils._ntuple, + ): + return self._call_ntuple(tx, args, kwargs) + elif self.value is torch.is_grad_enabled: + assert not (args or kwargs) + install_guard(GradModeVariable._guards_singleton) + return ConstantVariable.create(torch.is_grad_enabled()) + elif self.value is torch.use_deterministic_algorithms and len(args) == 1: + return DeterministicAlgorithmsVariable.create( + tx, args[0].as_python_constant() + ) + elif self.value is torch.are_deterministic_algorithms_enabled: + assert not (args or kwargs) + install_guard(DeterministicAlgorithmsVariable._guards_singleton) + return ConstantVariable.create(torch.are_deterministic_algorithms_enabled()) + elif self.value is torch._C._is_torch_function_enabled: + assert not (args or kwargs) + install_guard(TorchFunctionDisableVariable._guards_singleton) + return ConstantVariable.create(tx.output.torch_function_enabled) + elif self.value in ( + torch.overrides.has_torch_function, + torch.overrides.has_torch_function_variadic, + torch.overrides.has_torch_function_unary, + ): + assert not kwargs + elems = ( + args[0].unpack_var_sequence(tx) + if len(args) == 1 and isinstance(args[0], TupleVariable) + else args + ) + return ConstantVariable.create( + any(has_torch_function(x) for x in elems), + ) + elif any( + self.value is method + for method in [ + device_interface.stream + for 
_, device_interface in get_registered_device_interfaces() + ] + ): + assert len(args) == 1 + return StreamContextVariable.create(tx, args[0]) + elif self.value is torch.from_numpy: + if not config.trace_numpy: + unimplemented("torch.from_numpy. config.trace_numpy is False") + if not np: + unimplemented("torch.from_numpy. NumPy is not available") + return wrap_fx_proxy_cls( + target_cls=TensorVariable, + tx=tx, + proxy=tx.output.create_proxy( + "call_function", + torch.as_tensor, + *proxy_args_kwargs(args, {}), + ), + example_value=None, + ) + elif can_dispatch_torch_function(tx, args, kwargs): + return dispatch_torch_function(tx, self, args, kwargs) + elif self.value is torch.jit.annotate: + assert len(args) == 2 + return args[1] + elif self.value is torch.backends.cudnn.is_acceptable: + # is_acceptable(tensor) returns true if + # (a) tensor dtype/device are supported by cudnn + # (b) cudnn is available + # (c) some initialization has completed + # technically, it depends on some global state from (c) (torch.backends.cudnn.__cudnn_version) + assert ( + len(args) == 1 or "tensor" in kwargs + ), "Expect 1 input to cudnn.is_acceptable" + tensor_variable = args[0] if len(args) > 0 else kwargs["tensor"] + assert isinstance( + tensor_variable, TensorVariable + ), "Expect input to cudnn.is_acceptable to be a tensor" + tensor_inp = torch.tensor( + 0, dtype=tensor_variable.dtype, device=tensor_variable.device + ) + return ConstantVariable.create( + torch.backends.cudnn.is_acceptable(tensor_inp) + ) + elif self.value is torch.utils.hooks.BackwardHook: + return variables.BackwardHookVariable.create(tx, *args, **kwargs) + elif self.value is torch.nn.Parameter: + return self.call_nn_parameter(tx, *args, **kwargs) + elif ( + self.value == torch.numel + and len(args) == 1 + and isinstance(args[0], TensorVariable) + and len(kwargs) == 0 + ): + # TODO(voz): This is rewritten as a call_method because + # torch.numel(x) w/ sym shapes raises a RuntimeError and x.numel() does not + return wrap_fx_proxy( + tx=tx, + proxy=tx.output.create_proxy( + "call_method", + "numel", + *proxy_args_kwargs(args, kwargs), + ), + ) + # TODO: These special cases shouldn't be necessary; we should + # generically support torch.ops that return int + elif ( + self.value in (torch.ops.aten.sym_size, torch.ops.aten.sym_size.int) + and len(args) == 2 + and len(kwargs) == 0 + and isinstance(args[0], TensorVariable) + ): + # we see this when retracing already traced code + return args[0].call_method(tx, "size", [args[1]], {}) + elif ( + self.value in (torch.ops.aten.sym_stride, torch.ops.aten.sym_stride.int) + and len(args) == 2 + and len(kwargs) == 0 + and isinstance(args[0], TensorVariable) + ): + return args[0].call_method(tx, "stride", [args[1]], {}) + elif ( + self.value == torch.addcdiv + and len(args) == 3 + and "value" in kwargs + and len(kwargs) == 1 + ): + # decompose addcdiv into constituent ops, prevents a graph break due to converting + # value to a scalar + result = TorchInGraphFunctionVariable(torch.div).call_function( + tx, args[1:], {} + ) + result = TorchInGraphFunctionVariable(torch.mul).call_function( + tx, [result, kwargs["value"]], {} + ) + return TorchInGraphFunctionVariable(torch.add).call_function( + tx, [args[0], result], {} + ) + elif ( + self.value is torch._assert + and len(args) >= 1 + and ( + (args[0].is_python_constant() and args[0].as_python_constant()) + or ( + isinstance(args[0], variables.SymNodeVariable) + and args[0].evaluate_expr() + ) + ) + ): + return ConstantVariable(None) + elif 
SDPAParamsVariable.is_sdpa_params(self.value): + return wrap_fx_proxy( + tx, + proxy=tx.output.create_proxy( + "call_function", + torch._C._SDPAParams, + *proxy_args_kwargs(args, kwargs), + ), + param_vars=args, + ) + elif is_constant_pg_functions(self.value): + # because the input is a "ProcessGroupVariable", we'll be guarding on its + # ID_MATCH based on how it was constructed. + + # We desugar it at trace-time into ranks by directly calling util + # bake the result into the trace + if len(args) == 1: + # group or group name + assert isinstance(args[0], (ProcessGroupVariable, ConstantVariable)) + elif len(args) == 2: + # ranks + tag + assert isinstance(args[0], ListVariable) and isinstance( + args[1], ConstantVariable + ) + else: + raise AssertionError( + f"Invalid group value ({args}) for constant pg " + f"function {self.value}" + ) + args_as_value = [arg.as_python_constant() for arg in args] + invocation_result = self.value(*args_as_value) + + # Note - while we *could* cook up sources around invocations, like a FunctionSource + # the space of invoking functions in the middle of the guard chain is very iffy. As such, + # guard propagation via options is the best we can do. + from .builder import SourcelessBuilder + + return SourcelessBuilder()(tx, invocation_result) + elif is_from_local(self.value): + # rewrite non-primitive args/kwargs to be included in the on-the-fly prim function + # and rewrite args to have only proxyable args, then insert call_function + args_as_value = [x.as_python_constant() for x in args[1:]] + kwargs_as_value = {k: v.as_python_constant() for k, v in kwargs.items()} + + def fn_with_prim_types(x): + return self.value(x, *args_as_value, **kwargs_as_value) + + # attach the same function name for better debugging + fn_with_prim_types.__name__ = "prim " + self.value.__name__ + + return wrap_fx_proxy( + tx=tx, + proxy=tx.output.create_proxy( + "call_function", + fn_with_prim_types, + *proxy_args_kwargs([args[0]], {}), + ), + ) + elif ( + self.value is torch.nested.nested_tensor + and kwargs.get("layout", torch.strided) == torch.strided + ): + raise unimplemented("torch.compile does not support strided NestedTensor") + elif self.value is torch.nn.functional.one_hot and ( + len(args) + len(kwargs) == 1 + or ( + len(args) == 2 + and args[1].is_python_constant() + and args[1].as_python_constant() == -1 + ) + ): + raise unimplemented( + "torch.nn.functional.one_hot with data-dependent output shape" + ) + elif ( + self.value is torch.fx.experimental.symbolic_shapes.guard_size_oblivious + and len(args) == 1 + and isinstance(args[0], SymNodeVariable) + ): + # TODO: this probably should be folded somewhere else but I'm not + # sure where + # TODO: some of the other symbolic_shapes special tools can also + # get this treatment too + (cond,) = args + return variables.ConstantVariable.create( + torch.fx.experimental.symbolic_shapes.guard_size_oblivious(cond.sym_num) + ) + elif self.value is torch._C._autograd._unsafe_set_version_counter: + from ..tensor_version_op import _unsafe_set_version_counter + + return TorchInGraphFunctionVariable( + _unsafe_set_version_counter + ).call_function(tx, args, kwargs) + else: + any_symints_or_symfloats = any(isinstance(x, SymNodeVariable) for x in args) + all_ints_or_floats = all( + isinstance(x, (variables.ConstantVariable, variables.SymNodeVariable)) + for x in args + ) + bin_ops = {"add", "sub", "mul", "div", "sqrt"} + if ( + getattr(self.value, "__module__", "") == "torch" + and self.value.__name__ in bin_ops + and any_symints_or_symfloats 
+ and all_ints_or_floats + ): + msg = f"""\ +Calling {str(self.value)} on only torch.SymInt arguments is not yet supported. +To support this behavior, we need to allow const-propping tensors that store symint data. +For now, dynamo will explicitly graph break when it encounters user code with this behavior. +""" + log.warning(msg) + raise unimplemented(msg) + + # TODO(voz): Replace w/ dynamic shape rewrite table. + # Ideally, we would be able to do this at ctor time, but alas we need a combination + # of value + args to determine this. + fn_ = self.value + if any(isinstance(x, SymNodeVariable) for x in args): + torch_sym_op = f"_sym_{self.value.__name__}" + if getattr(self.value, "__module__", None) == "math" and hasattr( + torch, torch_sym_op + ): + fn_ = getattr(torch, torch_sym_op) + + if fn_ is torch.tensor: + + def check_any_unspec(x): + # NB: This includes UnspecializedPythonVariable + if isinstance(x, (TensorVariable, SymNodeVariable)): + return True + elif isinstance(x, (ListVariable, TupleVariable)): + return any(check_any_unspec(y) for y in x.items) + # TODO: there maybe other recursive structures you need to + # check + else: + return False + + data_arg = None + if args: + data_arg = args[0] + elif "data" in kwargs: + data_arg = kwargs["data"] + + # NB: OK to pass torch.tensor(tensor), this will trace fine + if not isinstance(data_arg, TensorVariable) and check_any_unspec( + data_arg + ): + # This is slower and less canonical, so only use it if we + # have to + fn_ = torch._refs.tensor + + tensor_variable = wrap_fx_proxy( + tx=tx, + proxy=tx.output.create_proxy( + "call_function", + fn_, + *proxy_args_kwargs(args, kwargs), + ), + ) + + if ( + isinstance(tensor_variable, TensorVariable) + and "requires_grad" in kwargs + and kwargs["requires_grad"].as_python_constant() + ): + unimplemented( + """factory functions that return tensors that require grad are not supported. +Either create the tensor outside the compiled region, or do not set the tensor to require_grad""" + ) + + if "out" in kwargs and not ( + isinstance(kwargs["out"], variables.ConstantVariable) + and kwargs["out"].as_python_constant() is None + ): + # out variants of torch operators like torch.sort and + # torch.sigmoid mutate the tensors in the out field. Track such + # tensors and rewrite the symbolic locals. + if isinstance(tensor_variable, TupleVariable): + assert isinstance(kwargs["out"], (TupleVariable, ListVariable)) + output_tensor_names = [ + tx.find_symbolic_locals_name(x) for x in kwargs["out"].items + ] + for idx, name in enumerate(output_tensor_names): + if name in tx.symbolic_locals: + tx.symbolic_locals[name] = tensor_variable.items[idx] + for out_tensor, result_tensor in zip( + kwargs["out"].items, tensor_variable.items + ): + if ( + out_tensor.source + and out_tensor in tx.output.graphargs + and out_tensor.size != result_tensor.size + ): + # It's hard to get out variants with resizing on graph inputs work + # properly across dynamo/aot/inductor, just fall back. 
+ unimplemented("out variants with resizing on graph inputs") + elif isinstance(tensor_variable, TensorVariable): + assert isinstance(kwargs["out"], TensorVariable) + assert "example_value" in kwargs["out"].proxy.node.meta + fake_tensor = tensor_variable.proxy.node.meta["example_value"] + fake_out = kwargs["out"].proxy.node.meta["example_value"] + if ( + kwargs["out"].source + and kwargs["out"] in tx.output.graphargs + and fake_out.shape != fake_tensor.shape + ): + # It's hard to get out variants with resizing on graph inputs work + # properly across dynamo/aot/inductor, just fall back. + unimplemented("out variants with resizing on graph inputs") + if not torch._prims_common.is_contiguous(fake_out): + # It's difficult to handle strides correctly in functionalization + # when calling an out= op with a non-contiguous out argument + unimplemented( + "out= op was called where output tensor was non-contiguous" + ) + name = tx.find_symbolic_locals_name(kwargs["out"]) + if name in tx.symbolic_locals: + tx.symbolic_locals[name] = tensor_variable + else: + unimplemented(f"out variant of {type(kwargs['out'])}") + + return tensor_variable + + def _call_ntuple(self, tx, args, kwargs): + """inline behavior of torch.nn.modules.utils._ntuple""" + if self.value is torch.nn.modules.utils._ntuple: + count = args[0].as_python_constant() + else: + count = self.value.__closure__[0].cell_contents + assert isinstance(count, int) + assert not kwargs + + def handle_ntuple(value): + if value.has_unpack_var_sequence(tx): + return variables.TupleVariable( + list(value.unpack_var_sequence(tx)), + ) + elif value.is_python_constant(): + # constant prop through it + return variables.ConstantVariable.create( + torch.nn.modules.utils._ntuple(count)(value.as_python_constant()), + ) + else: + unimplemented(f"torch.nn.modules.utils._ntuple({value})") + + if self.value is torch.nn.modules.utils._ntuple: + return variables.LambdaVariable(handle_ntuple) + else: + return handle_ntuple(args[0]) + + @classmethod + def call_nn_parameter(cls, tx, data=None, requires_grad=True): + """A call to torch.nn.Parameter() gets lifted to before the graph""" + if isinstance(requires_grad, variables.VariableTracker): + try: + requires_grad = requires_grad.as_python_constant() + except NotImplementedError: + unimplemented("Parameter(requires_grad=...) not constant") + + if not isinstance(data, variables.TensorVariable): + unimplemented(f"Parameter(data={data}) not implemented") + + # this results in cleaner graphs, but only works for inputs + if data.source: + return cls._nn_param_via_prefix_insert(tx, data, requires_grad) + + unimplemented("Parameter() on non-input") + + @staticmethod + def _nn_param_via_prefix_insert(tx, data, requires_grad): + # Alternate version if we have a .source + from .builder import VariableBuilder + + varname = tx.output.new_var() + + # construct the nn.Parmeter before the graph save it to varname + cg = PyCodegen(tx) + cg.load_import_from("torch.nn", "Parameter") + cg(data.source) + cg(variables.ConstantVariable(requires_grad)) + cg.call_function(2, True) + cg.store(varname) + tx.output.pregraph_bytecode.extend(cg.get_instructions()) + + # add the newly constructed nn.Parameter as a graph input + source = SyntheticLocalSource(varname) + example_value = torch.nn.Parameter( + tx.output.example_value_from_input_node(data.as_proxy().node) + ) + result = VariableBuilder(tx, source)(example_value) + # No need to guard on this since we already guarded on `data`. 
+ # These guards would fail since varname doesn't exist until after the function starts + TracingContext.get().guards_context.dynamo_guards.remove_guards_with_source( + source + ) + return result diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/torch_function.py b/MLPY/Lib/site-packages/torch/_dynamo/variables/torch_function.py new file mode 100644 index 0000000000000000000000000000000000000000..857767346ca644638952231fd03dea0b1797f5c0 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/variables/torch_function.py @@ -0,0 +1,270 @@ +# mypy: ignore-errors + +import inspect +from typing import Dict, List + +import torch.utils._pytree as pytree + +from torch.overrides import _get_overloaded_args, get_default_nowrap_functions +from ..exc import unimplemented +from ..guards import GuardBuilder, install_guard +from ..source import AttrSource, GlobalSource +from ..utils import has_torch_function, is_tensor_base_attr_getter +from .base import VariableTracker +from .constant import ConstantVariable +from .lists import TupleVariable +from .tensor import TensorSubclassVariable, TensorVariable +from .user_defined import UserDefinedObjectVariable + + +# [Note: __torch_function__] This feature is a prototype and has some rough edges (contact mlazos with issues): +# At a high level, a torch function tensor subclass is represented as a TensorWithTFOverrideVariable, which dispatches +# __torch_function__ on attribute accesses, method calls, and torch API calls. +# The following is not supported: +# - triggering __torch_function__ on tensor subclass non-tensor custom attributes +# - graph breaking on mutating guardable tensor properties within a __torch_function__ context, this can cause +# excessive recompiles in certain degenerate cases +# - Matching the exact eager behavior of *ignoring* __torch_function__ objects in non-tensor argument positions of Torch API calls + +# The following is supported: +# - static method impls of __torch_function__ on custom objects; this will trigger on torch API calls with the object as +# any argument +# - triggering __torch_function__ on torch API calls with tensor subclass arguments +# - __torch_function__ calls on base tensor attribute access and method calls for tensor subclass instances +# - matches the dispatch ordering behavior of eager __torch_function__ with subclass/object argumnents in any argument position + +# See https://docs.google.com/document/d/1WBxBSvW3NXhRp9ncmtokJloMLCtF4AYNhJaffvHe8Kw/edit#heading=h.vacn73lozd9w +# for more information on the design. 
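+# As a rough illustration (not part of this module), the kind of subclass this file
+# handles is the classmethod-style override with the signature used throughout this
+# file; the class name below is hypothetical:
+#
+#     class LoggingTensor(torch.Tensor):
+#         @classmethod
+#         def __torch_function__(cls, func, types, args=(), kwargs=None):
+#             kwargs = kwargs or {}
+#             # inspect or rewrite the call here, then defer to the default behavior
+#             return super().__torch_function__(func, types, args, kwargs)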
+ +# To enable subclass behavior, add your tensor subclass type to traceable_tensor_subclasses in dynamo/config.py + + +banned_attrs = [ + fn.__self__.__name__ + for fn in get_default_nowrap_functions() + if is_tensor_base_attr_getter(fn) +] + + +def _get_subclass_type(var): + assert isinstance(var, (TensorWithTFOverrideVariable, UserDefinedObjectVariable)) + return var.python_type() + + +def _get_subclass_type_var(tx, var): + assert isinstance(var, (TensorWithTFOverrideVariable, UserDefinedObjectVariable)) + if isinstance(var, TensorWithTFOverrideVariable): + return var.class_type_var(tx) + elif isinstance(var, UserDefinedObjectVariable): + from .builder import SourcelessBuilder, VariableBuilder + + if var.source: + return VariableBuilder(tx, var.source)(var.python_type()) + else: + return SourcelessBuilder()(tx, var.python_type()) + + +def _is_attr_overidden(tx, var, name): + import torch + + overridden = False + try: + attr_val = inspect.getattr_static(var.python_type(), name) + overridden |= attr_val != getattr(torch.Tensor, name) + except AttributeError: + pass + + return overridden + + +def call_torch_function( + tx, torch_function_type, torch_function_var, fn, types, args, kwargs +): + from .builder import SourcelessBuilder + + # signature: + # def __torch_function__(cls, func, types, args=(), kwargs=None): + tf_args = ( + torch_function_type, + fn, + types, + SourcelessBuilder()(tx, tuple(args)), + SourcelessBuilder()(tx, kwargs), + ) + return tx.inline_user_function_return(torch_function_var, tf_args, {}) + + +def build_torch_function_fn(tx, value, source): + from .builder import SourcelessBuilder, VariableBuilder + + if source: + return VariableBuilder( + tx, + AttrSource(AttrSource(source, "__torch_function__"), "__func__"), + )(value.__torch_function__.__func__) + else: + return SourcelessBuilder()(tx, value.__torch_function__.__func__) + + +def can_dispatch_torch_function(tx, args, kwargs): + if tx.output.torch_function_enabled: + all_args = pytree.arg_tree_leaves(*args, **kwargs) + return any(has_torch_function(arg) for arg in all_args) + else: + return False + + +def dispatch_torch_function(tx, fn, args, kwargs): + """Gathers all args that are TensorWithTFOverrideVariable and dispatches based on the ordering in _get_overloaded_args""" + + all_args = pytree.arg_tree_leaves(*args, **kwargs) + overloaded_args = _get_overloaded_args( + [arg for arg in all_args if has_torch_function(arg)], + _get_subclass_type, + ) + + for arg in overloaded_args: + res = arg.call_torch_function( + tx, + fn, + TupleVariable([_get_subclass_type_var(tx, arg) for arg in overloaded_args]), + args, + kwargs, + ) + + if not (isinstance(res, ConstantVariable) and res.value is NotImplemented): + return res + + unimplemented( + f"All __torch_function__ overrides for call {fn} with args {args} and kwargs {kwargs} returned NotImplemented" + ) + + +class TensorWithTFOverrideVariable(TensorVariable): + """ + Represents a tensor subclass instance with a __torch_function__ override. 
+ """ + + def __init__(self, *args, **kwargs): + self.torch_function_fn = kwargs.pop("torch_function_fn") + super().__init__(*args, **kwargs) + + @classmethod + def from_tensor_var(cls, tx, tensor_var, class_type, torch_function_fn): + import torch + + kwargs = dict(tensor_var.__dict__) + assert ( + kwargs.pop("class_type") is torch.Tensor + ), "invalid class type in TensorWithTFOverrideVariable.from_tensor_var" + var = cls(torch_function_fn=torch_function_fn, class_type=class_type, **kwargs) + var.install_global(tx) + return var + + def install_global(self, tx): + # stash the subclass type to rewrap an output tensor if needed + # this is needed because the actual type needs to be available + # each time the compiled artifact is run and outputs a wrapped tensor. + if self.global_mangled_class_name(tx) not in tx.output.global_scope: + # Safe because global_mangled_class_name figures it out + tx.output.install_global_unsafe( + self.global_mangled_class_name(tx), self.class_type + ) + + def python_type(self): + return self.class_type + + def class_type_var(self, tx): + return TensorSubclassVariable( + self.class_type, source=GlobalSource(self.global_mangled_class_name(tx)) + ) + + def global_mangled_class_name(self, tx): + # The global_mangled_class_name should be different for different + # invocations of torch.compile. Otherwise, we can run into a situation + # where multiple torch.compile invocations re-use the same global name, + # but the global's lifetime is tied to the first invocation (and + # may be deleted when the first torch.compile invocation is deleted) + # We mangle it based off of the output_graph's id. + compile_id = tx.output.compile_id + return f"__subclass_{self.class_type.__name__}_{id(self.class_type)}_c{id}" + + def var_getattr(self, tx, name): + # [Note: __torch_function__] We currently only support attributes that are defined on + # base tensors, custom attribute accesses will graph break. + import torch + from .builder import SourcelessBuilder + + if name in banned_attrs or not hasattr(torch.Tensor, name): + unimplemented( + f"Accessing {name} on a tensor subclass with a __torch_function__ override is not supported" + ) + + if _is_attr_overidden(tx, self, name): + unimplemented( + f"Accessing overridden method/attribute {name} on a tensor" + " subclass with a __torch_function__ override is not supported" + ) + + if tx.output.torch_function_enabled: + if self.source: + install_guard( + AttrSource(AttrSource(self.source, "__class__"), name).make_guard( + GuardBuilder.FUNCTION_MATCH + ) + ) + get_fn = SourcelessBuilder()(tx, getattr(torch.Tensor, name).__get__) + + return self.call_torch_function( + tx, + get_fn, + TupleVariable([self.class_type_var(tx)]), + [self], + {}, + ) + else: + return super().var_getattr(tx, name) + + def call_torch_function(self, tx, fn, types, args, kwargs): + return call_torch_function( + tx, + self.class_type_var(tx), + self.torch_function_fn, + fn, + types, + args, + kwargs, + ) + + def call_method( + self, + tx, + name, + args: "List[VariableTracker]", + kwargs: "Dict[str, VariableTracker]", + ) -> "VariableTracker": + # This code block implements inlining the __torch_function__ override + # of `call_method`. 
+ if tx.output.torch_function_enabled: + import torch + from .builder import SourcelessBuilder, VariableBuilder + + if _is_attr_overidden(tx, self, name): + unimplemented( + f"Calling overridden method {name} on a tensor" + " subclass with a __torch_function__ override is not supported" + ) + + # [Note: __torch_function__] Currently we only support methods that are defined on tensor + # we will graph break in other cases this will need a bigger overhaul of extracting methods/comparing them for equality + # We've established with the above check that the method is not overridden, so we guard that the method is the same + # as the impl defined on tensor and retrieve it + if self.source: + func_var = VariableBuilder( + tx, AttrSource(AttrSource(self.source, "__class__"), name) + )(inspect.getattr_static(self.python_type(), name)) + else: + func_var = SourcelessBuilder()(tx, getattr(torch.Tensor, name)) + return dispatch_torch_function(tx, func_var, [self] + args, kwargs) + else: + return super().call_method(tx, name, args, kwargs) diff --git a/MLPY/Lib/site-packages/torch/_dynamo/variables/user_defined.py b/MLPY/Lib/site-packages/torch/_dynamo/variables/user_defined.py new file mode 100644 index 0000000000000000000000000000000000000000..e4f3bb3ff6926b5f17ca1420a378217dd54ef743 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_dynamo/variables/user_defined.py @@ -0,0 +1,946 @@ +# mypy: ignore-errors + +import collections +import contextlib +import functools +import importlib +import inspect +import itertools +import random +import sys +import threading +import types +from typing import Dict, List + +from ..bytecode_transformation import create_call_function + +try: + import numpy as np +except ModuleNotFoundError: + np = None + +try: + from torch.utils._cxx_pytree import PyTreeSpec +except ImportError: + PyTreeSpec = type(None) + +import torch._dynamo.config + +import torch.nn +from torch._guards import TracingContext + +from .. import variables +from ..exc import unimplemented +from ..guards import GuardBuilder, install_guard +from ..source import AttrSource, GetItemSource, ODictGetItemSource, RandomValueSource +from ..utils import ( + all_hook_names, + build_checkpoint_variable, + check_constant_args, + get_custom_getattr, + has_torch_function, + is_namedtuple_cls, + is_utils_checkpoint, + istype, + namedtuple_fields, + object_has_getattribute, + proxy_args_kwargs, + tensortype_to_dtype, +) +from .base import MutableLocal, VariableTracker +from .ctx_manager import GenericContextWrappingVariable, NullContextVariable +from .dicts import DefaultDictVariable + + +class UserDefinedVariable(VariableTracker): + pass + + +class UserDefinedClassVariable(UserDefinedVariable): + def __init__(self, value, **kwargs): + super().__init__(**kwargs) + self.value = value + + def as_python_constant(self): + return self.value + + def python_type(self): + return type(self.value) + + def as_proxy(self): + return self.value + + def __str__(self): + return f"UserDefinedClassVariable({self.value})" + + @staticmethod + @functools.lru_cache(None) + def _constant_fold_classes(): + return { + torch.device, + torch.finfo, + torch.iinfo, + torch.Size, + } + + @staticmethod + @functools.lru_cache(None) + def _in_graph_classes(): + return set(tensortype_to_dtype.keys()) | { + torch.Tensor, + torch.cuda.Stream, + torch.cuda.Event, + } + + def can_constant_fold_through(self): + return self.value in self._constant_fold_classes() + + def var_getattr(self, tx, name: str) -> "VariableTracker": + from .. 
import trace_rules + from . import ConstantVariable + from .builder import VariableBuilder + + if name == "__name__": + return ConstantVariable.create(self.value.__name__) + + source = AttrSource(self.source, name) if self.source is not None else None + try: + obj = inspect.getattr_static(self.value, name) + except AttributeError: + obj = None + + if isinstance(obj, staticmethod): + func = obj.__get__(self.value) + if source is not None: + return trace_rules.lookup(func).create_with_source(func, source=source) + else: + return trace_rules.lookup(func)(func) + elif isinstance(obj, classmethod): + return variables.UserMethodVariable(obj.__func__, self, source=source) + elif source and inspect.ismemberdescriptor(obj): + return VariableBuilder(tx, source)(obj.__get__(self.value)) + + # Special handling of collections.OrderedDict.fromkeys() + # Wrap it as GetAttrVariable(collections.OrderedDict, "fromkeys") to make it consistent with + # collections.defaultdict, and both will be handled at UserDefinedClassVariable.call_method(). + # Otherwise, it would be wrapped as UserDefinedObjectVariable(collections.OrderedDict.fromkeys), + # and we need duplicate code to handle both cases. + if self.value is collections.OrderedDict and name == "fromkeys": + return super().var_getattr(tx, name) + + if name in getattr(self.value, "__dict__", {}) or ( + self.value.__module__.startswith("torch.") + or self.value.__module__ == "torch" + ): + if source: + return VariableBuilder(tx, source)(obj) + elif ConstantVariable.is_literal(obj): + return ConstantVariable.create(obj) + + return super().var_getattr(tx, name) + + def _call_cross_entropy_loss(self, tx, args, kwargs): + """ + functional: input, target, weight=None, size_average=None, ignore_index=- 100, reduce=None, reduction='mean', + label_smoothing=0.0 + + non functional ctor: weight=None, size_average=None, ignore_index=- 100, reduce=None, reduction='mean', + label_smoothing=0.0 + + non functional loss call: input, target, optional_output + """ + from . 
import ConstantVariable + + def normalize_args( + weight=ConstantVariable.create(None), + size_average=ConstantVariable.create(None), + ignore_index=ConstantVariable.create(-100), + reduce=ConstantVariable.create(None), + reduction=ConstantVariable.create("mean"), + label_smoothing=ConstantVariable.create(0.0), + ): + return ( + weight, + size_average, + ignore_index, + reduce, + reduction, + label_smoothing, + ) + + ( + weight, + size_average, + ignore_index, + reduce_arg, + reduction, + label_smoothing, + ) = normalize_args(*args, **kwargs) + + def fake_cross_entropy_loss(input, target): + from .builder import wrap_fx_proxy + + return wrap_fx_proxy( + tx=tx, + proxy=tx.output.create_proxy( + "call_function", + torch.nn.functional.cross_entropy, + *proxy_args_kwargs( + [ + input, + target, + weight, + size_average, + ignore_index, + reduce_arg, + reduction, + label_smoothing, + ], + {}, + ), + ), + ) + + return variables.LambdaVariable(fake_cross_entropy_loss) + + def call_method( + self, + tx, + name, + args: "List[VariableTracker]", + kwargs: "Dict[str, VariableTracker]", + ) -> "VariableTracker": + if ( + name == "__subclasses__" + and len(args) == 0 + and not kwargs + and "__subclasses__" not in self.value.__dict__ + ): + options = {"mutable_local": MutableLocal()} + subs_as_vars: List[VariableTracker] = list() + for sub in self.value.__subclasses__(): + source = AttrSource(tx.import_source(sub.__module__), sub.__name__) + subs_as_vars.append( + variables.UserDefinedClassVariable(sub, source=source) + ) + + return variables.ListVariable(subs_as_vars, **options) + elif ( + self.value in {collections.OrderedDict, collections.defaultdict} + and name == "fromkeys" + ): + from .builtin import BuiltinVariable + + return BuiltinVariable.call_custom_dict_fromkeys( + tx, self.value, *args, **kwargs + ) + + return super().call_method(tx, name, args, kwargs) + + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + from ..side_effects import SideEffects + from .builder import SourcelessBuilder, wrap_fx_proxy + from .builtin import BuiltinVariable + + constant_args = check_constant_args(args, kwargs) + + if self.can_constant_fold_through() and constant_args: + # constant fold + return variables.ConstantVariable.create( + self.as_python_constant()( + *[x.as_python_constant() for x in args], + **{k: v.as_python_constant() for k, v in kwargs.items()}, + ), + ) + elif self.value is torch.nn.CrossEntropyLoss: + return self._call_cross_entropy_loss(tx, args, kwargs) + elif self.value is contextlib.nullcontext: + return NullContextVariable() + elif self.value is collections.OrderedDict: + return BuiltinVariable.call_custom_dict( + tx, collections.OrderedDict, *args, **kwargs + ) + elif ( + self.value is collections.defaultdict + and len(args) <= 1 + and DefaultDictVariable.is_supported_arg(args[0]) + ): + return DefaultDictVariable( + {}, + collections.defaultdict, + args[0], + mutable_local=MutableLocal(), + ) + elif self.value is collections.deque and not kwargs: + if len(args) == 0: + items = [] + elif len(args) == 1 and args[0].has_unpack_var_sequence(tx): + items = args[0].unpack_var_sequence(tx) + else: + unimplemented("deque() with more than 1 arg not supported") + return variables.lists.DequeVariable(items, mutable_local=MutableLocal()) + elif self.value is functools.partial: + if not args: + unimplemented("functools.partial malformed") + # The first arg, a callable (the ctor below will assert on types) + fn = args[0] + 
rest_args = args[1:] + # guards for the produced FunctoolsPartialVariable are installed in FunctoolsPartialVariable ctor from the + # args and keywords + return variables.functions.FunctoolsPartialVariable( + fn, args=rest_args, keywords=kwargs + ) + elif ( + issubclass(type(self.value), type) + and hasattr( + self.value, "__enter__" + ) # TODO(voz): These can invoke user code! + and hasattr( + self.value, "__exit__" + ) # TODO(voz): These can invoke user code! + and check_constant_args(args, kwargs) + and self.value.__init__ == object.__init__ + and len(kwargs) == 0 # TODO(ybliang): support kwargs + ): + unwrapped_args = [x.as_python_constant() for x in args] + return GenericContextWrappingVariable( + unwrapped_args, + cm_obj=self.value(*unwrapped_args), + ) + + elif is_namedtuple_cls(self.value): + fields = namedtuple_fields(self.value) + # check if this a quasi-namedtuple or a real one + if self.value.__module__ == "torch.return_types": + # create pseudo-defaults from values of the quasi-namedtuple + field_defaults = dict(zip(fields, args[0].items)) + else: + field_defaults = self.value._field_defaults + + items = list(args) + items.extend([None] * (len(fields) - len(items))) + + var_tracker_kwargs = {} + for field_name, var_tracker in zip(fields, items): + if var_tracker is None: + if field_name in kwargs: + field_var = kwargs[field_name] + else: + assert field_name in field_defaults + field_var = SourcelessBuilder()(tx, field_defaults[field_name]) + var_tracker_kwargs[field_name] = field_var + + for name, value in var_tracker_kwargs.items(): + assert name in fields + items[fields.index(name)] = value + + assert all(x is not None for x in items) + return variables.NamedTupleVariable(items, self.value) + elif ( + inspect.getattr_static(self.value, "__new__", None) in (object.__new__,) + and SideEffects.cls_supports_mutation_side_effects(self.value) + and self.source + ): + var = tx.output.side_effects.track_object_new( + self.source, + self.value, + variables.UnspecializedNNModuleVariable + if issubclass(self.value, torch.nn.Module) + else UserDefinedObjectVariable, + {}, + ) + if ( + inspect.getattr_static(self.value, "__init__", None) + is torch.nn.Module.__init__ + ): + tx.output.side_effects.store_attr( + var, + "__call_nn_module_init", + variables.ConstantVariable.create(True), + ) + return var + else: + var.call_method(tx, "__init__", args, kwargs) + return var + elif variables.CustomizedDictVariable.is_matching_cls(self.value): + options = {"mutable_local": MutableLocal()} + return variables.CustomizedDictVariable.create( + self.value, args, kwargs, options + ) + elif variables.DataClassVariable.is_matching_cls(self.value): + options = {"mutable_local": MutableLocal()} + return variables.DataClassVariable.create(self.value, args, kwargs, options) + elif ( + variables.RestrictedListSubclassVariable.is_matching_cls(self.value) + and self.source + ): + return variables.RestrictedListSubclassVariable( + variables.BuiltinVariable(list).call_function(tx, args, kwargs).items, + user_cls=self.value, + user_cls_source=self.source, + mutable_local=MutableLocal(), + ) + elif self.value in self._in_graph_classes(): + # torch.LongTensor cannot accept a list of FakeTensors. + # So we stack the list of FakeTensors instead. 
+ if ( + np + and self.value in tensortype_to_dtype + and len(args) == 1 + and isinstance(args[0], variables.ListVariable) + and len(args[0].items) > 1 + and all(isinstance(x, variables.TensorVariable) for x in args[0].items) + ): + # Stack FakeTensor + stacked = wrap_fx_proxy( + tx=tx, + proxy=tx.output.create_proxy( + "call_function", + torch.stack, + *proxy_args_kwargs(args, kwargs), + ), + ) + args = [stacked] + + tensor_variable = wrap_fx_proxy( + tx=tx, + proxy=tx.output.create_proxy( + "call_function", + self.value, + *proxy_args_kwargs(args, kwargs), + ), + ) + + return tensor_variable + + return super().call_function(tx, args, kwargs) + + def const_getattr(self, tx, name): + if name == "__name__": + return self.value.__name__ + return super().const_getattr(tx, name) + + +class UserDefinedObjectVariable(UserDefinedVariable): + """ + Mostly objects of defined type. Catch-all for something where we only know the type. + """ + + _nonvar_fields = {"value", "value_type", *UserDefinedVariable._nonvar_fields} + + def __init__(self, value, value_type=None, **kwargs): + super().__init__(**kwargs) + self.value = value + self.value_type = value_type or type(value) + assert type(value) is self.value_type + + def __str__(self): + inner = self.value_type.__name__ + if inner in [ + "builtin_function_or_method", + "getset_descriptor", + "method_descriptor", + "method", + ]: + inner = str(getattr(self.value, "__name__", None)) + return f"{self.__class__.__name__}({inner})" + + def python_type(self): + return self.value_type + + def guard_as_python_constant(self): + if self.source: + install_guard(self.source.make_guard(GuardBuilder.ID_MATCH)) + return self.value + return super().guard_as_python_constant() + + def torch_function_check(self): + assert has_torch_function( + self + ), f"calling torch function on object without __torch_function__ {self}" + + def get_torch_fn(self, tx): + self.torch_function_check() + from .torch_function import build_torch_function_fn + + return build_torch_function_fn(tx, self.value, self.source) + + def call_torch_function(self, tx, fn, types, args, kwargs): + self.torch_function_check() + + from .torch_function import _get_subclass_type_var, call_torch_function + + return call_torch_function( + tx, + _get_subclass_type_var(tx, self), + self.get_torch_fn(tx), + fn, + types, + args, + kwargs, + ) + + @staticmethod + @functools.lru_cache(None) + def _supported_random_functions(): + fns = { + random.random, + random.randint, + random.randrange, + random.uniform, + } + return fns + + def _maybe_get_baseclass_method(self, name): + if name not in getattr(self.value, "__dict__", {}): + try: + return inspect.getattr_static(type(self.value), name) + except AttributeError: + pass + return None + + def call_method( + self, + tx, + name, + args: "List[VariableTracker]", + kwargs: "Dict[str, VariableTracker]", + ) -> "VariableTracker": + from . 
import ( + BuiltinVariable, + ConstantVariable, + TupleVariable, + UserMethodVariable, + ) + + method = self._maybe_get_baseclass_method(name) + if method is not None: + if method is object.__init__: + return ConstantVariable.create(None) + + # [NOTE] OrderedDict, dict subtypes must always have source + # We cannot instantiate such subtypes in-graph due to builtin __new__ + if method is collections.OrderedDict.keys: + # subclass of OrderedDict + assert not (args or kwargs) + assert self.source # OrderedDict, dict subtypes must always have source + keys = list(self.value.keys()) + assert all(map(ConstantVariable.is_literal, keys)) + install_guard(self.source.make_guard(GuardBuilder.DICT_CONST_KEYS)) + return TupleVariable([ConstantVariable.create(k) for k in keys]) + + if ( + method in (collections.OrderedDict.__contains__, dict.__contains__) + and len(args) == 1 + and isinstance(args[0], (ConstantVariable, BuiltinVariable)) + and inspect.getattr_static(type(self.value), "keys") + in (collections.OrderedDict.keys, dict.keys) + ): + assert not kwargs + assert self.source # OrderedDict, dict subtypes must always have source + install_guard(self.source.make_guard(GuardBuilder.DICT_CONST_KEYS)) + return ConstantVariable.create( + args[0].as_python_constant() in self.value + ) + + if method is collections.OrderedDict.items and isinstance( + self.value, collections.OrderedDict + ): + assert self.source # OrderedDict, dict subtypes must always have source + assert not (args or kwargs) + items = [] + keys = self.call_method(tx, "keys", [], {}) + for key in keys.unpack_var_sequence(tx): + items.append( + TupleVariable( + [key, self.odict_getitem(tx, key)], + ) + ) + return TupleVariable(items) + + if method is collections.OrderedDict.__getitem__ and len(args) == 1: + assert not kwargs + assert self.source # OrderedDict, dict subtypes must always have source + return self.odict_getitem(tx, args[0]) + + # check for methods implemented in C++ + if isinstance(method, types.FunctionType): + source = ( + None + if self.source is None + else AttrSource(AttrSource(self.source, "__class__"), name) + ) + # TODO(jansel): add a guard to check for monkey patching? + return UserMethodVariable(method, self, source=source).call_function( + tx, args, kwargs + ) + + if method is list.__len__ and self.source and not (args or kwargs): + install_guard(self.source.make_guard(GuardBuilder.SEQUENCE_LENGTH)) + return ConstantVariable(len(self.value)) + + return super().call_method(tx, name, args, kwargs) + + def unpack_var_sequence(self, tx): + if ( + self.source + and self._maybe_get_baseclass_method("__iter__") is list.__iter__ + and self._maybe_get_baseclass_method("__len__") is list.__len__ + and self._maybe_get_baseclass_method("__getitem__") is list.__getitem__ + ): + install_guard(self.source.make_guard(GuardBuilder.SEQUENCE_LENGTH)) + return [ + variables.LazyVariableTracker.create( + self.value[k], + source=GetItemSource(self.source, k), + ) + for k in range(len(self.value)) + ] + return super().unpack_var_sequence(tx) + + def is_supported_random(self): + try: + return self.value in self._supported_random_functions() + except TypeError: + # TypeError: unhashable type + return False + + def call_function( + self, tx, args: "List[VariableTracker]", kwargs: "Dict[str, VariableTracker]" + ) -> "VariableTracker": + from .. 
import trace_rules + from .builder import VariableBuilder + + if ( + self.is_supported_random() + and all(k.is_python_constant() for k in args) + and all(v.is_python_constant() for v in kwargs.values()) + ): + args = [x.as_python_constant() for x in args] + kwargs = {k: v.as_python_constant() for k, v in kwargs.items()} + random_call_index = len(tx.output.random_calls) + example_value = self.value(*args, **kwargs) + source = RandomValueSource(random_call_index) + tx.output.random_calls.append((self.value, args, kwargs)) + return VariableBuilder(tx, source).wrap_unspecialized_primitive( + example_value + ) + elif istype(self.value, types.MethodType): + func = self.value.__func__ + obj = self.value.__self__ + if ( + func is torch.utils._contextlib._DecoratorContextManager.clone + and variables.TorchCtxManagerClassVariable.is_matching_cls( + obj.__class__ + ) + and not (args or kwargs) + ): + return variables.TorchCtxManagerClassVariable( + obj.__class__ + ).call_function(tx, args, kwargs) + + if ( + func is torch.autograd.grad_mode.inference_mode.clone + and obj.__class__ is torch.autograd.grad_mode.inference_mode + ): + # simulate the inference_mode.clone implementation + var = variables.ConstantVariable(obj.mode) + return variables.TorchCtxManagerClassVariable( + obj.__class__ + ).call_function(tx, [var], kwargs) + elif ( + istype(self.value, functools.partial) + and trace_rules.lookup(self.value.func) + == variables.TorchInGraphFunctionVariable + and all( + variables.ConstantVariable.is_literal(v) + for v in itertools.chain(self.value.args, self.value.keywords.values()) + ) + ): + if self.source: + install_guard( + AttrSource(self.source, "func").make_guard(GuardBuilder.ID_MATCH), + AttrSource(self.source, "args").make_guard( + GuardBuilder.CONSTANT_MATCH + ), + AttrSource(self.source, "keywords").make_guard( + GuardBuilder.CONSTANT_MATCH + ), + ) + + partial_args = [ + variables.ConstantVariable.create(v) for v in self.value.args + ] + partial_args.extend(args) + partial_kwargs = { + k: variables.ConstantVariable.create(v) + for k, v in self.value.keywords.items() + } + partial_kwargs.update(kwargs) + if is_utils_checkpoint(self.value.func): + return build_checkpoint_variable().call_function( + tx, partial_args, partial_kwargs + ) + return variables.TorchInGraphFunctionVariable( + self.value.func + ).call_function(tx, partial_args, partial_kwargs) + elif callable(self.value): + if self.source: + install_guard(self.source.make_guard(GuardBuilder.FUNCTION_MATCH)) + return self.call_method(tx, "__call__", args, kwargs) + + return super().call_function(tx, args, kwargs) + + def _check_for_getattribute(self): + if object_has_getattribute(self.value): + unimplemented("UserDefinedObjectVariable with custom __getattribute__") + + def _check_for_getattr(self): + return get_custom_getattr(self.value) + + def _getattr_static(self, name): + if ( + isinstance(self.value, (torch.nn.Module, PyTreeSpec)) + or "__slots__" in self.value.__class__.__dict__ + or type(self.value) == threading.local + ): + # getattr_static doesn't work on these + subobj = getattr(self.value, name) + else: + subobj = inspect.getattr_static(self.value, name) + return subobj + + def var_getattr(self, tx, name): + from .. import trace_rules + from . 
import ConstantVariable + from .builder import VariableBuilder + + value = self.value + source = AttrSource(self.source, name) if self.source else None + self._check_for_getattribute() + getattr_fn = self._check_for_getattr() + + class NO_SUCH_SUBOBJ: + pass + + try: + subobj = self._getattr_static(name) + except AttributeError: + subobj = NO_SUCH_SUBOBJ + if isinstance(getattr_fn, types.FunctionType): + return variables.UserMethodVariable( + getattr_fn, self, source=source + ).call_function(tx, [ConstantVariable.create(name)], {}) + elif getattr_fn is not None: + unimplemented("UserDefined with non-function __getattr__") + + if isinstance(subobj, property): + # Rewrite the source being explicit about reading it statically. + if self.source: + source = AttrSource(self.source, name, get_static=True) + source = AttrSource(source, "fget") + return variables.UserMethodVariable( + subobj.fget, self, source=source + ).call_function(tx, [], {}) + elif isinstance(subobj, torch.distributions.utils.lazy_property): + subobj_var = UserDefinedObjectVariable(subobj, source=source) + return variables.UserMethodVariable( + subobj.__get__.__func__, subobj_var, source=source + ).call_function(tx, [self], {}) + elif isinstance(subobj, staticmethod): + func = subobj.__get__(self.value) + if source is not None: + return trace_rules.lookup(func).create_with_source(func, source=source) + else: + return trace_rules.lookup(func)(func) + elif isinstance(subobj, classmethod): + return variables.UserMethodVariable( + subobj.__func__, self.var_getattr(tx, "__class__"), source=source + ) + elif isinstance(subobj, types.FunctionType) or ( + isinstance(subobj, types.MethodType) + and isinstance(self.value, torch.nn.Module) + ): + # Since we get subobj via self._getattr_static, which may not trigger dynamic lookup. + # Static lookup can't tell us it's a method or function correctly, + # so we trigger dynamic lookup here to get the correct type. 
+ dynamic_subobj = getattr(self.value, name) + + while dynamic_subobj is subobj and hasattr(subobj, "_torchdynamo_inline"): + subobj = subobj._torchdynamo_inline + dynamic_subobj = subobj + source = AttrSource(source, "_torchdynamo_inline") if source else None + + if isinstance(subobj, types.MethodType): + if dynamic_subobj.__self__ is not self.value: + unimplemented("__self__ mismatch for bound method") + func = subobj.__func__ + else: + assert isinstance(subobj, types.FunctionType) + func = subobj + + if inspect.ismethod(dynamic_subobj): + return variables.UserMethodVariable(func, self, source=source) + elif inspect.isfunction(dynamic_subobj): + if is_utils_checkpoint(func): + return build_checkpoint_variable(source=source) + elif source is not None: + return trace_rules.lookup(func).create_with_source( + func, source=source + ) + else: + return trace_rules.lookup(func)(func) + + if ( + name in getattr(value, "__dict__", {}) + or ConstantVariable.is_literal(subobj) + or isinstance( + subobj, + ( + torch.Tensor, + torch.nn.Module, + ), + ) + ): + if source: + return VariableBuilder(tx, source)(subobj) + elif ConstantVariable.is_literal(subobj): + return ConstantVariable.create(subobj) + + if ( + name not in getattr(value, "__dict__", {}) + and type(value).__module__.startswith("torch.") + and "torch.optim" not in type(value).__module__ + and not callable(value) + and not isinstance(subobj, types.MethodDescriptorType) + ): + if not source: + assert getattr( + importlib.import_module(type(value).__module__), + type(value).__name__, + ) is type(value) + source = AttrSource( + AttrSource( + tx.import_source(type(value).__module__), type(value).__name__ + ), + name, + ) + + return VariableBuilder(tx, source)(subobj) + options = {"source": source} + if isinstance( + subobj, + ( + torch.distributions.constraints._Interval, + torch.distributions.constraints._Real, + torch.distributions.constraints.Constraint, + ), + ): + return UserDefinedObjectVariable(subobj, **options) + elif isinstance(self.value, torch.nn.Module) and name in all_hook_names: + assert isinstance(subobj, collections.OrderedDict) + if not subobj: + return variables.ConstDictVariable( + subobj, collections.OrderedDict, **options + ) + + if name == "__class__": + return UserDefinedClassVariable(type(self.value), **options) + + return variables.GetAttrVariable(self, name, **options) + + def call_hasattr(self, tx, name: str) -> "VariableTracker": + if tx.output.side_effects.is_attribute_mutation(self): + try: + result = tx.output.side_effects.load_attr(self, name, deleted_ok=True) + return variables.ConstantVariable.create( + not isinstance(result, variables.DeletedVariable) + ) + except KeyError: + pass + if self.source: + install_guard( + AttrSource(self.source, name).make_guard(GuardBuilder.HASATTR) + ) + if self._check_for_getattribute() or self._check_for_getattr(): + unimplemented("hasattr with custom __getattr__") + + try: + self._getattr_static(name) + return variables.ConstantVariable.create(True) + except AttributeError: + return variables.ConstantVariable.create(False) + + def odict_getitem(self, tx, key): + from .builder import VariableBuilder + from .dicts import is_hashable + + # TODO this should probably be merged with the dict handling + + index = ( + key.source + if is_hashable(key) and key.source is not None + else key.as_python_constant() + ) + + return VariableBuilder( + tx, + ODictGetItemSource(self.source, index), + )(collections.OrderedDict.__getitem__(self.value, key.as_python_constant())) + + +class 
KeyedJaggedTensorVariable(UserDefinedObjectVariable): + @staticmethod + def is_matching_object(obj): + mod = sys.modules.get("torchrec.sparse.jagged_tensor") + return mod is not None and type(obj) is mod.KeyedJaggedTensor + + def __init__(self, value, **kwargs): + from torchrec.sparse.jagged_tensor import KeyedJaggedTensor + + assert type(value) is KeyedJaggedTensor + super().__init__(value, **kwargs) + + def var_getattr(self, tx, name): + if ( + torch._dynamo.config.force_unspec_int_unbacked_size_like_on_torchrec_kjt + and self.source is not None + and name in ("_length_per_key", "_offset_per_key") + ): + with TracingContext.patch(force_unspec_int_unbacked_size_like=True): + return super().var_getattr(tx, name) + return super().var_getattr(tx, name) + + +class RemovableHandleVariable(VariableTracker): + REMOVED = -1 + + def __init__( + self, + mutable_local=None, + # index of the registration in the side_effects owned register_hook/handle list, used during removal. + idx=None, + **kwargs, + ): + super().__init__(**kwargs) + self.mutable_local = mutable_local + self.idx = idx + + def call_method(self, tx, method_name, args, kwargs): + if method_name == "remove": + if self.idx != self.REMOVED: + tx.output.side_effects.remove_hook(self.idx) + self.idx = self.REMOVED + return variables.ConstantVariable.create(None) + super().call_method(tx, method_name, args, kwargs) + + def reconstruct(self, codegen): + if self.idx == self.REMOVED: + # Hook has already been removed, return a dummy handle + codegen.load_import_from("torch._dynamo.utils", "invalid_removeable_handle") + codegen.extend_output(create_call_function(0, True)) + return + # unreachable due to codegen.add_cache() when the hook is installed + super().reconstruct(codegen) diff --git a/MLPY/Lib/site-packages/torch/_export/__init__.py b/MLPY/Lib/site-packages/torch/_export/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..421614c6e2707e7e661d599fdd847fdc331aab30 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/__init__.py @@ -0,0 +1,406 @@ +import copy +import dataclasses +import functools +import io +import json +import os +import re +import sys +import types +import warnings +import weakref +import zipfile +from collections import OrderedDict +from contextlib import contextmanager + +from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from unittest.mock import patch + +import sympy + +import torch +import torch._dynamo +import torch.fx +import torch.utils._pytree as pytree + +from torch._decomp import core_aten_decompositions, get_decompositions +from torch._dispatch.python import enable_python_dispatcher +from torch._dynamo.exc import UserError, UserErrorType +from torch._dynamo.source import ConstantSource +from torch._export.passes.collect_tracepoints_pass import CollectTracepointsPass +from torch._functorch.aot_autograd import aot_export_module, GraphSignature +from torch._functorch.eager_transforms import functionalize +from torch._guards import detect_fake_mode +from torch._inductor import config +from torch._ops import OpOverload +from torch._subclasses.fake_tensor import FakeTensor, FakeTensorMode +from torch._subclasses.functional_tensor import FunctionalTensor +from torch._utils_internal import log_export_usage +from torch.export._tree_utils import reorder_kwargs +from torch.export._unlift import _create_stateful_graph_module +from torch.export.dynamic_shapes import ( + _process_constraints, + _process_dynamic_shapes, + Constraint, + dims, + dynamic_dim, +) +from 
torch.export.exported_program import ( + _disable_prexisiting_fake_mode, + ExportedProgram, + ModuleCallEntry, + ModuleCallSignature, +) +from torch.export.graph_signature import ( + _sig_to_specs, + ArgumentSpec, + ConstantArgument, + ExportGraphSignature, + InputKind, + InputSpec, + OutputKind, + OutputSpec, + SymIntArgument, + TensorArgument, +) +from torch.fx import traceback as fx_traceback +from torch.fx._compatibility import compatibility +from torch.fx.experimental.proxy_tensor import make_fx, maybe_disable_fake_tensor_mode +from torch.fx.experimental.symbolic_shapes import ( + ConstraintViolationError, + GuardOnDataDependentSymNode, + ShapeEnv, + StrictMinMaxConstraint, +) +from torch.fx.graph import _PyTreeCodeGen, _PyTreeInfo +from torch.utils._sympy.value_ranges import ValueRangeError, ValueRanges + +from .passes.add_runtime_assertions_for_constraints_pass import ( + _AddRuntimeAssertionsForInlineConstraintsPass, +) +from .wrappers import _wrap_submodules + + +@dataclasses.dataclass +class ExportDynamoConfig: + """ + Manage Export-specific configurations of Dynamo. + """ + allow_rnn: bool = True + + +@compatibility(is_backward_compatible=False) +def capture_pre_autograd_graph( + f: torch.nn.Module, + args: Tuple[Any], + kwargs: Optional[Dict[str, Any]] = None, + dynamic_shapes: Optional[Union[Dict[str, Any], Tuple[Any]]] = None, +) -> torch.nn.Module: + """ + A helper function that is intended to trace a module before any pre-autograd + decomposition is run. The produced module will be "non-functional" and + composed of aten operators. Later this API will be deleted in favor of more general + torch.export API. + + Args: + f: nn.Module to be traced + + args: example positional inputs. + + kwargs: optional example keyword inputs. + + dynamic_shapes: Should either be: + 1) a dict from argument names of ``f`` to their dynamic shape specifications, + 2) a tuple that specifies dynamic shape specifications for each input in original order. + If you are specifying dynamism on keyword args, you will need to pass them in the order that + is defined in the original function signature. + + The dynamic shape of a tensor argument can be specified as either + (1) a dict from dynamic dimension indices to :func:`Dim` types, where it is + not required to include static dimension indices in this dict, but when they are, + they should be mapped to None; or (2) a tuple / list of :func:`Dim` types or None, + where the :func:`Dim` types correspond to dynamic dimensions, and static dimensions + are denoted by None. Arguments that are dicts or tuples / lists of tensors are + recursively specified by using mappings or sequences of contained specifications. + + Returns: + An nn.Module containing the traced method. + + """ + from torch.export._trace import _convert_input_to_fake, DEFAULT_EXPORT_DYNAMO_CONFIG + from torch.export.dynamic_shapes import _process_dynamic_shapes + + log_export_usage(event="export.private_api", flags={"capture_pre_autograd_graph"}) + + assert isinstance(f, torch.nn.Module), "Expected an nn.Module instance." + + if kwargs is None: + kwargs = {} + + constraints = _process_dynamic_shapes(f, args, kwargs, dynamic_shapes) + + # Do not decompose dropout for exported models, because in eval mode the dropout + # op disappears from the graph, which makes it difficult to switch to train mode. + # See https://github.com/pytorch/pytorch/pull/115258#issuecomment-1900755832. 
+ decomp_table = { + op: op.decompose + for op in FunctionalTensor.maybe_aliasing_or_mutating_ops + if op != torch.ops.aten.dropout.default + } + with torch._dynamo.config.patch(dataclasses.asdict(DEFAULT_EXPORT_DYNAMO_CONFIG)): + m = torch._dynamo.export( + f, + constraints=constraints, + assume_static_by_default=True, + tracing_mode="symbolic", + decomposition_table=decomp_table, + pre_dispatch=True, + aten_graph=True, + _log_export_usage=False, + )( + *args, + **kwargs, + )[0] + + _, _, _, fake_mode = _convert_input_to_fake(m, args, kwargs) + + m.meta["inline_constraints"] = { + k: v + for k, v in fake_mode.shape_env.var_to_range.items() + if re.match(r"^[if]\d+$", str(k)) + } + + if isinstance(f, torch.nn.Module): + from torch.export._trace import _restore_state_dict + _restore_state_dict(f, m) + + flat_args, _ = pytree.tree_flatten((args, kwargs or {})) + range_constraints = _process_constraints(fake_mode, m, 0, flat_args) + + module = _create_stateful_graph_module( + m, + range_constraints=range_constraints, + ) + + error_message = \ + """ + Calling train() or eval() is not supported for exported models. + Alternatively, you may override these methods to do custom user behavior as follows: + + def _my_train(self, mode: bool = True): + ... + + def _my_eval(self): + ... + + model.train = types.MethodType(_my_train, model) + model.eval = types.MethodType(_my_eval, model) + """ + + def _train(self, mode: bool = True): + raise NotImplementedError(error_message) + + def _eval(self, mode: bool = True): + raise NotImplementedError(error_message) + + module.train = types.MethodType(_train, module) # type: ignore[method-assign] + module.eval = types.MethodType(_eval, module) # type: ignore[method-assign] + return module + + +def save( + ep: ExportedProgram, + f: Union[str, os.PathLike, io.BytesIO], + *, + extra_files: Optional[Dict[str, Any]] = None, + opset_version: Optional[Dict[str, int]] = None, +) -> None: + if not isinstance(ep, ExportedProgram): + raise TypeError(f"save() expects an ExportedProgram but got {type(ep)}") + + from .serde.serialize import serialize, SerializedArtifact + from .serde.schema import SCHEMA_VERSION + artifact: SerializedArtifact = serialize(ep, opset_version) + + if isinstance(f, (str, os.PathLike)): + f = os.fspath(f) + + with zipfile.ZipFile(f, 'w') as zipf: + # Save every field the SerializedArtifact to a file + assert isinstance(artifact.exported_program, bytes) + zipf.writestr("serialized_exported_program.json", artifact.exported_program) + zipf.writestr("serialized_state_dict.pt", artifact.state_dict) + zipf.writestr("serialized_constants.pt", artifact.constants) + + zipf.writestr('version', ".".join(map(str, SCHEMA_VERSION))) + + # Add extra files if provided + if extra_files: + for extra_file_name, content in extra_files.items(): + encoded_content = content.encode('utf-8') + zipf.writestr(f"extra_files/{extra_file_name}", encoded_content) + + +def load( + f: Union[str, os.PathLike, io.BytesIO], + *, + extra_files: Optional[Dict[str, Any]] = None, + expected_opset_version: Optional[Dict[str, int]] = None, +) -> ExportedProgram: + if isinstance(f, (str, os.PathLike)): + f = os.fspath(f) + + extra_files = extra_files or {} + + with zipfile.ZipFile(f, 'r') as zipf: + # Check the version + version = zipf.read('version').decode().split('.') + from .serde.schema import SCHEMA_VERSION + + assert len(version) == len(SCHEMA_VERSION) + if version[0] != str(SCHEMA_VERSION[0]): + raise RuntimeError( + f"Serialized version {version} does not match our current " + 
f"schema version {SCHEMA_VERSION}." + ) + + from .serde.serialize import deserialize, SerializedArtifact + + # Load serialized_ep and serialized_state_dict from the zip file + + serialized_exported_program: Optional[bytes] = None + serialized_state_dict: Optional[bytes] = None + serialized_constants: Optional[bytes] = None + + for file_info in zipf.infolist(): + file_content = zipf.read(file_info.filename) + + if file_info.filename == "serialized_exported_program.json": + serialized_exported_program = file_content + elif file_info.filename == "serialized_state_dict.json": + warnings.warn("This version of file is deprecated") + serialized_state_dict = file_content + elif file_info.filename == "serialized_constants.json": + warnings.warn("This version of file is deprecated") + serialized_constants = file_content + elif file_info.filename == "serialized_state_dict.pt": + serialized_state_dict = file_content + elif file_info.filename == "serialized_constants.pt": + serialized_constants = file_content + elif file_info.filename.startswith("extra_files"): + filename = file_info.filename.split("/", 1)[1] + extra_files[filename] = file_content.decode('utf-8') + + assert serialized_exported_program is not None + assert serialized_state_dict is not None + assert serialized_constants is not None + artifact: SerializedArtifact = SerializedArtifact( + serialized_exported_program, + serialized_state_dict, + serialized_constants, + ) + + # Deserialize ExportedProgram + ep = deserialize(artifact, expected_opset_version) + + return ep + + +def aot_compile( + f: Callable, + args: Tuple[Any], + kwargs: Optional[Dict[str, Any]] = None, + *, + dynamic_shapes: Optional[Dict[str, Any]] = None, + options: Optional[Dict[str, Any]] = None, + remove_runtime_assertions: bool = False, + disable_constraint_solver: bool = False, +) -> str: + """ + Note: this function is not stable yet + + Traces either an nn.Module's forward function or just a callable with PyTorch + operations inside, generates executable cpp code from the program, and returns + the path to the generated shared library + + Args: + f: the `nn.Module` or callable to trace. + + args: example positional inputs. + + kwargs: optional example keyword inputs. + + dynamic_shapes: Should either be: + 1) a dict from argument names of ``f`` to their dynamic shape specifications, + 2) a tuple that specifies dynamic shape specifications for each input in original order. + If you are specifying dynamism on keyword args, you will need to pass them in the order that + is defined in the original function signature. + + The dynamic shape of a tensor argument can be specified as either + (1) a dict from dynamic dimension indices to :func:`Dim` types, where it is + not required to include static dimension indices in this dict, but when they are, + they should be mapped to None; or (2) a tuple / list of :func:`Dim` types or None, + where the :func:`Dim` types correspond to dynamic dimensions, and static dimensions + are denoted by None. Arguments that are dicts or tuples / lists of tensors are + recursively specified by using mappings or sequences of contained specifications. + + options: A dictionary of options to control inductor + + disable_constraint_solver: Whether the dim constraint solver must be disabled. 
+ + Returns: + Path to the generated shared library + """ + from torch.export._trace import _export_to_torch_ir + from torch._inductor.decomposition import select_decomp_table + + constraints = _process_dynamic_shapes(f, args, kwargs, dynamic_shapes) + + if config.is_predispatch: + gm = torch.export._trace._export(f, args, kwargs, constraints, pre_dispatch=True).module() + else: + # We want to export to Torch IR here to utilize the pre_grad passes in + # inductor, which run on Torch IR. + gm = _export_to_torch_ir( + f, + args, + kwargs, + constraints, + disable_constraint_solver=disable_constraint_solver, + # Disabling this flag, because instead we can rely on the mapping + # dynamo_flat_name_to_original_fqn which is coming from Dynamo. + restore_fqn=False, + ) + flat_example_inputs = pytree.arg_tree_leaves(*args, **(kwargs or {})) + + with torch.no_grad(): + so_path = torch._inductor.aot_compile(gm, flat_example_inputs, options) # type: ignore[arg-type] + + return so_path + +def aot_load(so_path: str, device: str) -> Callable: + """ + Loads a shared library generated by aot_compile and returns a callable + + Args: + so_path: Path to the shared library + + Returns: + A callable + """ + if device == "cpu": + runner = torch._C._aoti.AOTIModelContainerRunnerCpu(so_path, 1) # type: ignore[call-arg] + elif device == "cuda" or device.startswith("cuda:"): + runner = torch._C._aoti.AOTIModelContainerRunnerCuda(so_path, 1, device) # type: ignore[assignment, call-arg] + else: + raise RuntimeError("Unsupported device " + device) + + def optimized(*args, **kwargs): + call_spec = runner.get_call_spec() # type: ignore[attr-defined] + in_spec = pytree.treespec_loads(call_spec[0]) + out_spec = pytree.treespec_loads(call_spec[1]) + flat_inputs = pytree.tree_flatten((args, reorder_kwargs(kwargs, in_spec)))[0] + flat_outputs = runner.run(flat_inputs) # type: ignore[attr-defined] + return pytree.tree_unflatten(flat_outputs, out_spec) + + return optimized diff --git a/MLPY/Lib/site-packages/torch/_export/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..47a873e020bcef25adda07cd8fe067e024c37627 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/__pycache__/error.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/__pycache__/error.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f3f52b4fece6c2f40f8867bf5c57a18d3bdb63c Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/__pycache__/error.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/__pycache__/exported_program.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/__pycache__/exported_program.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ff05a230c447af0434ca2940daf32afb067fc987 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/__pycache__/exported_program.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/__pycache__/non_strict_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/__pycache__/non_strict_utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e3abfbec9fcf44398b1a49abf1a99c6d4f6f6b16 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/__pycache__/non_strict_utils.cpython-39.pyc differ diff --git 
a/MLPY/Lib/site-packages/torch/_export/__pycache__/pass_base.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/__pycache__/pass_base.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..edcc8c3d545d734b12cab912584e891c2fbff8ca Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/__pycache__/pass_base.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/__pycache__/utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..93d51de431c6c9a0b8848d9dbf6608fbda7a73d3 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/__pycache__/utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/__pycache__/verifier.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/__pycache__/verifier.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aaca8665b1b7e4a39c4f67b1088ab85fe3f832e7 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/__pycache__/verifier.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/__pycache__/wrappers.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/__pycache__/wrappers.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b18aab158e50dea5f34efab89f7b8e162b8a643f Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/__pycache__/wrappers.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/__init__.py b/MLPY/Lib/site-packages/torch/_export/db/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b6272b424658450437a313fc71bedbce73da3205 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
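The torch/_export/__init__.py file added above defines save() and load(), which round-trip an ExportedProgram through a zip archive containing the serialized program, state dict, constants, a schema version stamp, and any user-supplied extra_files. The snippet below is a minimal usage sketch, not part of the patch: it assumes the public torch.export.export entry point from the same PyTorch release, and TinyModule is an illustrative name only.

import io

import torch
import torch._export


class TinyModule(torch.nn.Module):
    # Illustrative module only; any exportable nn.Module would do.
    def forward(self, x):
        return torch.relu(x) + 1.0


# torch.export.export comes from the public export API of the same release
# (it is not defined in this file); it produces the ExportedProgram that
# save()/load() round-trip through a zip archive.
ep = torch.export.export(TinyModule(), (torch.randn(4, 8),))

buf = io.BytesIO()
# save() writes serialized_exported_program.json, the state dict, constants,
# a schema version stamp, and any extra_files entries into one zip archive.
torch._export.save(ep, buf)

buf.seek(0)
reloaded = torch._export.load(buf)
print(reloaded.module()(torch.randn(4, 8)).shape)

The aot_compile()/aot_load() pair in the same file follows a similar shape: aot_compile() returns the path to a compiled shared library, and aot_load(so_path, "cpu") (or a "cuda" device string) wraps it in a callable that flattens inputs and unflattens outputs via the stored call spec.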
diff --git a/MLPY/Lib/site-packages/torch/_export/db/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3fb7e1b1135a74a20ff51df42755a6e22571b1f1 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/__pycache__/case.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/__pycache__/case.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a7cf69d8542739bbc1c3ca88b7bfb6e459d37cc8 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/__pycache__/case.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/__pycache__/gen_example.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/__pycache__/gen_example.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..017f6403ce886c3f68dc4eb7c3c6fc7f55ef6360 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/__pycache__/gen_example.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/__pycache__/logging.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/__pycache__/logging.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f6d44b63727c7f375f9b92f4b15c0b063a28c9e1 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/__pycache__/logging.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/case.py b/MLPY/Lib/site-packages/torch/_export/db/case.py new file mode 100644 index 0000000000000000000000000000000000000000..086d16b1a0c9d607b34c4f03c3b636956cdc29fe --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/case.py @@ -0,0 +1,188 @@ +import inspect +import re +import string +from dataclasses import dataclass, field +from enum import Enum +from typing import Any, Dict, List, Optional, Set, Tuple, Union +from types import ModuleType + +import torch + +_TAGS: Dict[str, Dict[str, Any]] = { + "torch": { + "cond": {}, + "dynamic-shape": {}, + "escape-hatch": {}, + "map": {}, + "dynamic-value": {}, + "operator": {}, + "mutation": {}, + }, + "python": { + "assert": {}, + "builtin": {}, + "closure": {}, + "context-manager": {}, + "control-flow": {}, + "data-structure": {}, + "standard-library": {}, + "object-model": {}, + }, +} + + +class SupportLevel(Enum): + """ + Indicates at what stage the feature + used in the example is handled in export. + """ + + SUPPORTED = 1 + NOT_SUPPORTED_YET = 0 + + +class ExportArgs: + __slots__ = ("args", "kwargs") + + def __init__(self, *args, **kwargs): + self.args = args + self.kwargs = kwargs + + +InputsType = Union[Tuple[Any, ...], ExportArgs] + + +def check_inputs_type(x): + if not isinstance(x, (ExportArgs, tuple)): + raise ValueError( + f"Expecting inputs type to be either a tuple, or ExportArgs, got: {type(x)}" + ) + + +def _validate_tag(tag: str): + parts = tag.split(".") + t = _TAGS + for part in parts: + assert set(part) <= set( + string.ascii_lowercase + "-" + ), f"Tag contains invalid characters: {part}" + if part in t: + t = t[part] + else: + raise ValueError(f"Tag {tag} is not found in registered tags.") + + +@dataclass(frozen=True) +class ExportCase: + example_inputs: InputsType + description: str # A description of the use case. + model: torch.nn.Module + name: str + extra_inputs: Optional[InputsType] = None # For testing graph generalization. 
+ # Tags associated with the use case. (e.g dynamic-shape, escape-hatch) + tags: Set[str] = field(default_factory=set) + support_level: SupportLevel = SupportLevel.SUPPORTED + dynamic_shapes: Optional[Dict[str, Any]] = None + + def __post_init__(self): + check_inputs_type(self.example_inputs) + if self.extra_inputs is not None: + check_inputs_type(self.extra_inputs) + + for tag in self.tags: + _validate_tag(tag) + + if not isinstance(self.description, str) or len(self.description) == 0: + raise ValueError(f'Invalid description: "{self.description}"') + + +_EXAMPLE_CASES: Dict[str, ExportCase] = {} +_MODULES: Set[ModuleType] = set() +_EXAMPLE_CONFLICT_CASES: Dict[str, List[ExportCase]] = {} +_EXAMPLE_REWRITE_CASES: Dict[str, List[ExportCase]] = {} + + +def register_db_case(case: ExportCase) -> None: + """ + Registers a user provided ExportCase into example bank. + """ + if case.name in _EXAMPLE_CASES: + if case.name not in _EXAMPLE_CONFLICT_CASES: + _EXAMPLE_CONFLICT_CASES[case.name] = [_EXAMPLE_CASES[case.name]] + _EXAMPLE_CONFLICT_CASES[case.name].append(case) + return + + _EXAMPLE_CASES[case.name] = case + + +def to_snake_case(name): + name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name) + return re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower() + + +def _make_export_case(m, name, configs): + if not issubclass(m, torch.nn.Module): + raise TypeError("Export case class should be a torch.nn.Module.") + m = m() + + if "description" not in configs: + # Fallback to docstring if description is missing. + assert ( + m.__doc__ is not None + ), f"Could not find description or docstring for export case: {m}" + configs = {**configs, "description": m.__doc__} + return ExportCase(**{**configs, "model": m, "name": name}) + + +def export_case(**kwargs): + """ + Decorator for registering a user provided case into example bank. + """ + + def wrapper(m): + configs = kwargs + module = inspect.getmodule(m) + if module in _MODULES: + raise RuntimeError("export_case should only be used once per example file.") + + assert module is not None + _MODULES.add(module) + normalized_name = to_snake_case(m.__name__) + module_name = module.__name__.split(".")[-1] + if module_name != normalized_name: + raise RuntimeError( + f'Module name "{module.__name__}" is inconsistent with exported program ' + + f'name "{m.__name__}". Please rename the module to "{normalized_name}".' 
+ ) + + case = _make_export_case(m, module_name, configs) + register_db_case(case) + return case + + return wrapper + + +def export_rewrite_case(**kwargs): + def wrapper(m): + configs = kwargs + + parent = configs.pop("parent") + assert isinstance(parent, ExportCase) + key = parent.name + if key not in _EXAMPLE_REWRITE_CASES: + _EXAMPLE_REWRITE_CASES[key] = [] + + configs["example_inputs"] = parent.example_inputs + case = _make_export_case(m, to_snake_case(m.__name__), configs) + _EXAMPLE_REWRITE_CASES[key].append(case) + return case + + return wrapper + + +def normalize_inputs(x: InputsType) -> ExportArgs: + if isinstance(x, tuple): + return ExportArgs(*x) + + assert isinstance(x, ExportArgs) + return x diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__init__.py b/MLPY/Lib/site-packages/torch/_export/db/examples/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c7085667b2a451ddcba198435be515d97eb2f3f0 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/__init__.py @@ -0,0 +1,52 @@ +import glob +import importlib +from os.path import basename, dirname, isfile, join + +import torch +from torch._export.db.case import ( + _EXAMPLE_CASES, + _EXAMPLE_CONFLICT_CASES, + _EXAMPLE_REWRITE_CASES, + SupportLevel, +) + + +modules = glob.glob(join(dirname(__file__), "*.py")) +__all__ = [ + basename(f)[:-3] for f in modules if isfile(f) and not f.endswith("__init__.py") +] + +# Import all module in the current directory. +from . import * # noqa: F403 + + +def all_examples(): + return _EXAMPLE_CASES + + +if len(_EXAMPLE_CONFLICT_CASES) > 0: + + def get_name(case): + model = case.model + if isinstance(model, torch.nn.Module): + model = type(model) + return model.__name__ + + msg = "Error on conflict export case name.\n" + for case_name, cases in _EXAMPLE_CONFLICT_CASES.items(): + msg += f"Case name {case_name} is associated with multiple cases:\n " + msg += f"[{','.join(map(get_name, cases))}]\n" + + raise RuntimeError(msg) + + +def filter_examples_by_support_level(support_level: SupportLevel): + return { + key: val + for key, val in all_examples().items() + if val.support_level == support_level + } + + +def get_rewrite_cases(case): + return _EXAMPLE_REWRITE_CASES.get(case.name, []) diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d213f48892a5362258d8f17152b5556ae7fe0868 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/assume_constant_result.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/assume_constant_result.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e56a8106b277a2ad8ff0612ba8fd151f876df81 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/assume_constant_result.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/autograd_function.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/autograd_function.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a77993402701d38184ac47e9be89aa4327fd00e8 Binary files /dev/null and 
b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/autograd_function.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/class_method.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/class_method.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7c6ea78164a18845c0cc28300d1e9cb2e4bd81c9 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/class_method.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/cond_branch_class_method.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/cond_branch_class_method.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3698f59ffd8fcabe91ba99906e1a3037fa04ff6c Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/cond_branch_class_method.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/cond_branch_nested_function.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/cond_branch_nested_function.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a59d2b46b7aa2858609ffbc2654578d2f0f3574f Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/cond_branch_nested_function.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/cond_branch_nonlocal_variables.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/cond_branch_nonlocal_variables.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..18f4280924ecaf5e943f9231f65a9cc467798835 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/cond_branch_nonlocal_variables.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/cond_closed_over_variable.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/cond_closed_over_variable.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4d09cb5ab75692b0f1c2caaa31e73feaf942735b Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/cond_closed_over_variable.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/cond_operands.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/cond_operands.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d0866761fd265fbb590d0bbd2dd0d777a6566926 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/cond_operands.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/cond_predicate.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/cond_predicate.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1102dd642ab2d978aedf6f15f6174b6f66efb3e3 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/cond_predicate.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/constrain_as_size_example.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/constrain_as_size_example.cpython-39.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..e8bfd05e301e55815e0a9ef9bf1b0dd23352be6a Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/constrain_as_size_example.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/constrain_as_value_example.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/constrain_as_value_example.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0812ec54b93d1532e48be2780e0a57a757034af8 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/constrain_as_value_example.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/decorator.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/decorator.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..950d387d7a4393942bb877661fa0d4ec7830ce50 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/decorator.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/dictionary.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/dictionary.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f583e3cba6fac03e245c4f783c2fc30e03879a7f Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/dictionary.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/dynamic_shape_assert.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/dynamic_shape_assert.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6f006003516ba2729ada4af41b63607bf01a048b Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/dynamic_shape_assert.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/dynamic_shape_constructor.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/dynamic_shape_constructor.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6dc0269859d6378d0e2b5ab893414b4dabc1b174 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/dynamic_shape_constructor.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/dynamic_shape_if_guard.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/dynamic_shape_if_guard.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1a816906161146bebd8b8bfd0a533bac91f69015 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/dynamic_shape_if_guard.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/dynamic_shape_map.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/dynamic_shape_map.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6aeaaa86ea3a64fcd77ad99557c5b7d50c233a8d Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/dynamic_shape_map.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/dynamic_shape_round.cpython-39.pyc 
b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/dynamic_shape_round.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5092b910a24fb8bdbc94c77068b2d7160a094b28 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/dynamic_shape_round.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/dynamic_shape_slicing.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/dynamic_shape_slicing.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b4c30b30e79e9a6434262151130c546510de453 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/dynamic_shape_slicing.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/dynamic_shape_view.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/dynamic_shape_view.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bffab1a442688cdf0893e8a97db107fd7d80b2a7 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/dynamic_shape_view.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/fn_with_kwargs.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/fn_with_kwargs.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e45755d4798633e39ffcbb760bc18d296d4f9c75 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/fn_with_kwargs.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/list_contains.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/list_contains.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..525103ae8bb92f52657fbee87c3180564fdef10f Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/list_contains.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/list_unpack.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/list_unpack.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0b6be96b3268c50aefbd19506e6a7c7d89ca2991 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/list_unpack.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/model_attr_mutation.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/model_attr_mutation.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9188363730330871f269050d93cc45f039cbf49c Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/model_attr_mutation.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/nested_function.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/nested_function.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f29fbb78380b9a20dae18f559b6e23e9c38e5d8 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/nested_function.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/null_context_manager.cpython-39.pyc 
b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/null_context_manager.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a05eea1696dc488b4e97377e1b9ba6ddd029f207 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/null_context_manager.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/optional_input.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/optional_input.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8c93e96b8263319c4d0e53f36ba2ee60544e328a Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/optional_input.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/pytree_flatten.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/pytree_flatten.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..386f484904f03b5813ab67f69001edd7cc2efc94 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/pytree_flatten.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/scalar_output.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/scalar_output.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8ec76cadfb60bd7b7e1b4462cdee34a2a7d5b845 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/scalar_output.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/specialized_attribute.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/specialized_attribute.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..93677443c666ea2e97d22ca4a4bb2429d5da95c7 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/specialized_attribute.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/static_for_loop.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/static_for_loop.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df6477633a29f491039b60f43ee1d762b9fb7f7f Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/static_for_loop.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/static_if.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/static_if.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..661cfcf37e9ebedf65eb778e9b37d53e5075349e Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/static_if.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/tensor_setattr.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/tensor_setattr.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..252b9158bd0f859293dd23f951c1a37ff3e8c847 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/tensor_setattr.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/torch_sym_min.cpython-39.pyc 
b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/torch_sym_min.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/type_reflection_method.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/type_reflection_method.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5fca22cab8edc29cb9d9657628d83c9f41efb362 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/type_reflection_method.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/user_input_mutation.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/user_input_mutation.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d5bfe0b0452c70d96a664419199f657f57938d9e Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/db/examples/__pycache__/user_input_mutation.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/assume_constant_result.py b/MLPY/Lib/site-packages/torch/_export/db/examples/assume_constant_result.py new file mode 100644 index 0000000000000000000000000000000000000000..a0d20d2ecf482c783f6278923595488c62ac3559 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/assume_constant_result.py @@ -0,0 +1,24 @@ +import torch +import torch._dynamo as torchdynamo + +from torch._export.db.case import export_case + + +@export_case( + example_inputs=(torch.ones(3, 2), torch.tensor(4)), + tags={"torch.escape-hatch"}, +) +class AssumeConstantResult(torch.nn.Module): + """ + Applying the `assume_constant_result` decorator to burn the result of non-traceable code into the graph as a constant. + """ + + def __init__(self): + super().__init__() + + @torchdynamo.assume_constant_result + def get_item(self, y): + return y.int().item() + + def forward(self, x, y): + return x[: self.get_item(y)] diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/autograd_function.py b/MLPY/Lib/site-packages/torch/_export/db/examples/autograd_function.py new file mode 100644 index 0000000000000000000000000000000000000000..a9093b370e85a10e808221b5b9d6341530af9c85 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/autograd_function.py @@ -0,0 +1,26 @@ +import torch + +from torch._export.db.case import export_case + + +class MyAutogradFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, x): + return x.clone() + + @staticmethod + def backward(ctx, grad_output): + return grad_output + 1 + + +@export_case( + example_inputs=(torch.randn(3, 2),), +) +class AutogradFunction(torch.nn.Module): + """ + TorchDynamo does not keep track of backward() on autograd functions. We recommend + using `allow_in_graph` to mitigate this problem.
+ """ + + def forward(self, x): + return MyAutogradFunction.apply(x) diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/class_method.py b/MLPY/Lib/site-packages/torch/_export/db/examples/class_method.py new file mode 100644 index 0000000000000000000000000000000000000000..aafe70b02cd5441fd27d87bb4f7d2fe1bcc73fbd --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/class_method.py @@ -0,0 +1,24 @@ +import torch + +from torch._export.db.case import export_case + + +@export_case( + example_inputs=(torch.ones(3, 4),), +) +class ClassMethod(torch.nn.Module): + """ + Class methods are inlined during tracing. + """ + + @classmethod + def method(cls, x): + return x + 1 + + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(4, 2) + + def forward(self, x): + x = self.linear(x) + return self.method(x) * self.__class__.method(x) * type(self).method(x) diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/cond_branch_class_method.py b/MLPY/Lib/site-packages/torch/_export/db/examples/cond_branch_class_method.py new file mode 100644 index 0000000000000000000000000000000000000000..450f08ff50be2b6a7651bf40526d9a236ecded01 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/cond_branch_class_method.py @@ -0,0 +1,46 @@ +import torch + +from torch._export.db.case import export_case +from functorch.experimental.control_flow import cond + + +class MySubModule(torch.nn.Module): + def foo(self, x): + return x.cos() + + def forward(self, x): + return self.foo(x) + + +@export_case( + example_inputs=(torch.ones(3),), + tags={ + "torch.cond", + "torch.dynamic-shape", + }, +) +class CondBranchClassMethod(torch.nn.Module): + """ + The branch functions (`true_fn` and `false_fn`) passed to cond() must follow these rules: + - both branches must take the same args, which must also match the branch args passed to cond. + - both branches must return a single tensor + - returned tensor must have the same tensor metadata, e.g. shape and dtype + - branch function can be free function, nested function, lambda, class methods + - branch function can not have closure variables + - no inplace mutations on inputs or global variables + + + This example demonstrates using class method in cond(). + + NOTE: If the `pred` is test on a dim with batch size < 2, it will be specialized. + """ + + def __init__(self): + super().__init__() + self.subm = MySubModule() + + def bar(self, x): + return x.sin() + + def forward(self, x): + return cond(x.shape[0] <= 2, self.subm.forward, self.bar, [x]) diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/cond_branch_nested_function.py b/MLPY/Lib/site-packages/torch/_export/db/examples/cond_branch_nested_function.py new file mode 100644 index 0000000000000000000000000000000000000000..f5e84f11edc7ccdbd07811f12e9c3b601bf0cf04 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/cond_branch_nested_function.py @@ -0,0 +1,44 @@ +import torch + +from torch._export.db.case import export_case +from functorch.experimental.control_flow import cond + + +@export_case( + example_inputs=(torch.ones(3),), + tags={ + "torch.cond", + "torch.dynamic-shape", + }, +) +class CondBranchNestedFunction(torch.nn.Module): + """ + The branch functions (`true_fn` and `false_fn`) passed to cond() must follow these rules: + - both branches must take the same args, which must also match the branch args passed to cond. + - both branches must return a single tensor + - returned tensor must have the same tensor metadata, e.g. 
shape and dtype + - branch function can be free function, nested function, lambda, class methods + - branch function can not have closure variables + - no inplace mutations on inputs or global variables + + This example demonstrates using nested function in cond(). + + NOTE: If the `pred` is test on a dim with batch size < 2, it will be specialized. + """ + def __init__(self): + super().__init__() + + def forward(self, x): + def true_fn(x): + def inner_true_fn(y): + return x + y + + return inner_true_fn(x) + + def false_fn(x): + def inner_false_fn(y): + return x - y + + return inner_false_fn(x) + + return cond(x.shape[0] < 10, true_fn, false_fn, [x]) diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/cond_branch_nonlocal_variables.py b/MLPY/Lib/site-packages/torch/_export/db/examples/cond_branch_nonlocal_variables.py new file mode 100644 index 0000000000000000000000000000000000000000..46070590037719de7047f36a56624c70f28f143b --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/cond_branch_nonlocal_variables.py @@ -0,0 +1,63 @@ +import torch + +from torch._export.db.case import export_case +from functorch.experimental.control_flow import cond + + +@export_case( + example_inputs=(torch.ones(6),), + tags={ + "torch.cond", + "torch.dynamic-shape", + }, +) +class CondBranchNonlocalVariables(torch.nn.Module): + """ + The branch functions (`true_fn` and `false_fn`) passed to cond() must follow these rules: + - both branches must take the same args, which must also match the branch args passed to cond. + - both branches must return a single tensor + - returned tensor must have the same tensor metadata, e.g. shape and dtype + - branch function can be free function, nested function, lambda, class methods + - branch function can not have closure variables + - no inplace mutations on inputs or global variables + + This example demonstrates how to rewrite code to avoid capturing closure variables in branch functions. + + The code below will not work because capturing closure variables is not supported. + ``` + my_tensor_var = x + 100 + my_primitive_var = 3.14 + + def true_fn(y): + nonlocal my_tensor_var, my_primitive_var + return y + my_tensor_var + my_primitive_var + + def false_fn(y): + nonlocal my_tensor_var, my_primitive_var + return y - my_tensor_var - my_primitive_var + + return cond(x.shape[0] > 5, true_fn, false_fn, [x]) + ``` + + NOTE: If the `pred` is test on a dim with batch size < 2, it will be specialized. 
+ """ + + def __init__(self): + super().__init__() + + def forward(self, x): + my_tensor_var = x + 100 + my_primitive_var = 3.14 + + def true_fn(x, y, z): + return x + y + z + + def false_fn(x, y, z): + return x - y - z + + return cond( + x.shape[0] > 5, + true_fn, + false_fn, + [x, my_tensor_var, torch.tensor(my_primitive_var)], + ) diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/cond_closed_over_variable.py b/MLPY/Lib/site-packages/torch/_export/db/examples/cond_closed_over_variable.py new file mode 100644 index 0000000000000000000000000000000000000000..e3745271bc911ffa9097bdfd6457fe1e8ba14c07 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/cond_closed_over_variable.py @@ -0,0 +1,23 @@ +import torch + +from torch._export.db.case import export_case +from functorch.experimental.control_flow import cond + + +@export_case( + example_inputs=(torch.tensor(True), torch.ones(3, 2)), + tags={"torch.cond", "python.closure"}, +) +class CondClosedOverVariable(torch.nn.Module): + """ + torch.cond() supports branches closed over arbitrary variables. + """ + + def forward(self, pred, x): + def true_fn(val): + return x * 2 + + def false_fn(val): + return x - 2 + + return cond(pred, true_fn, false_fn, [x + 1]) diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/cond_operands.py b/MLPY/Lib/site-packages/torch/_export/db/examples/cond_operands.py new file mode 100644 index 0000000000000000000000000000000000000000..d225c2a39e33ad50bf32ed9ddddd4350e3f51ea8 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/cond_operands.py @@ -0,0 +1,39 @@ +import torch + +from torch._export.db.case import export_case +from torch.export import Dim +from functorch.experimental.control_flow import cond + +x = torch.randn(3, 2) +y = torch.ones(2) +dim0_x = Dim("dim0_x") + +@export_case( + example_inputs=(x, y), + tags={ + "torch.cond", + "torch.dynamic-shape", + }, + extra_inputs=(torch.randn(2, 2), torch.ones(2)), + dynamic_shapes={"x": {0: dim0_x}, "y": None}, +) +class CondOperands(torch.nn.Module): + """ + The operands passed to cond() must be: + - a list of tensors + - match arguments of `true_fn` and `false_fn` + + NOTE: If the `pred` is test on a dim with batch size < 2, it will be specialized. + """ + + def __init__(self): + super().__init__() + + def forward(self, x, y): + def true_fn(x, y): + return x + y + + def false_fn(x, y): + return x - y + + return cond(x.shape[0] > 2, true_fn, false_fn, [x, y]) diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/cond_predicate.py b/MLPY/Lib/site-packages/torch/_export/db/examples/cond_predicate.py new file mode 100644 index 0000000000000000000000000000000000000000..df23cc4df7a52ba8c1fd8b1946b46d31e3d9d248 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/cond_predicate.py @@ -0,0 +1,29 @@ +import torch + +from torch._export.db.case import export_case +from functorch.experimental.control_flow import cond + + +@export_case( + example_inputs=(torch.ones(6, 4, 3),), + tags={ + "torch.cond", + "torch.dynamic-shape", + }, +) +class CondPredicate(torch.nn.Module): + """ + The conditional statement (aka predicate) passed to cond() must be one of the following: + - torch.Tensor with a single element + - boolean expression + + NOTE: If the `pred` is test on a dim with batch size < 2, it will be specialized. 
+ """ + + def __init__(self): + super().__init__() + + def forward(self, x): + pred = x.dim() > 2 and x.shape[2] > 10 + + return cond(pred, lambda x: x.cos(), lambda y: y.sin(), [x]) diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/constrain_as_size_example.py b/MLPY/Lib/site-packages/torch/_export/db/examples/constrain_as_size_example.py new file mode 100644 index 0000000000000000000000000000000000000000..f6274acf9dde24214fbf36ec3daad3a6cbf84c58 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/constrain_as_size_example.py @@ -0,0 +1,27 @@ +import torch + +from torch._export.db.case import export_case + + +@export_case( + example_inputs=(torch.tensor(4),), + tags={ + "torch.dynamic-value", + "torch.escape-hatch", + }, +) +class ConstrainAsSizeExample(torch.nn.Module): + """ + If the value is not known at tracing time, you can provide a hint so that we + can trace further. Please look at the constrain_as_value and constrain_as_size APIs. + constrain_as_size is used for values that NEED to be used for constructing + a tensor. + """ + + def __init__(self): + super().__init__() + + def forward(self, x): + a = x.item() + torch._constrain_as_size(a, min=0, max=5) + return torch.ones((a, 5)) diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/constrain_as_value_example.py b/MLPY/Lib/site-packages/torch/_export/db/examples/constrain_as_value_example.py new file mode 100644 index 0000000000000000000000000000000000000000..fa32144602c18d5b0456aa6b4f37e4f7457f0e0d --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/constrain_as_value_example.py @@ -0,0 +1,30 @@ +import torch + +from torch._export.db.case import export_case + + +@export_case( + example_inputs=(torch.tensor(4), torch.randn(5, 5)), + tags={ + "torch.dynamic-value", + "torch.escape-hatch", + }, +) +class ConstrainAsValueExample(torch.nn.Module): + """ + If the value is not known at tracing time, you can provide a hint so that we + can trace further. Please look at the constrain_as_value and constrain_as_size APIs. + constrain_as_value is used for values that don't need to be used for constructing + a tensor. + """ + + def __init__(self): + super().__init__() + + def forward(self, x, y): + a = x.item() + torch._constrain_as_value(a, min=0, max=5) + + if a < 6: + return y.sin() + return y.cos() diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/decorator.py b/MLPY/Lib/site-packages/torch/_export/db/examples/decorator.py new file mode 100644 index 0000000000000000000000000000000000000000..b9e56fb94535fa367864f2e74d74e736619d4be8 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/decorator.py @@ -0,0 +1,26 @@ +import functools + +import torch + +from torch._export.db.case import export_case + + +def test_decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + return func(*args, **kwargs) + 1 + + return wrapper + + +@export_case( + example_inputs=(torch.ones(3, 2), torch.ones(3, 2)), +) +class Decorator(torch.nn.Module): + """ + Decorator calls are inlined into the exported function during tracing.
+ """ + + @test_decorator + def forward(self, x, y): + return x + y diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/dictionary.py b/MLPY/Lib/site-packages/torch/_export/db/examples/dictionary.py new file mode 100644 index 0000000000000000000000000000000000000000..aa52d5a91519486d911e417f2cd95cd924381319 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/dictionary.py @@ -0,0 +1,21 @@ +import torch + +from torch._export.db.case import export_case + + +@export_case( + example_inputs=(torch.ones(3, 2), torch.tensor(4)), + tags={"python.data-structure"}, +) +class Dictionary(torch.nn.Module): + """ + Dictionary structures are inlined and flattened during tracing. + """ + def __init__(self): + super().__init__() + + def forward(self, x, y): + elements = {} + elements["x2"] = x * x + y = y * elements["x2"] + return {"y": y} diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/dynamic_shape_assert.py b/MLPY/Lib/site-packages/torch/_export/db/examples/dynamic_shape_assert.py new file mode 100644 index 0000000000000000000000000000000000000000..e7c79d37d413da7c22e60beb3cfed4b722810527 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/dynamic_shape_assert.py @@ -0,0 +1,22 @@ +import torch + +from torch._export.db.case import export_case + + +@export_case( + example_inputs=(torch.ones(3, 2),), + tags={"python.assert"}, +) +class DynamicShapeAssert(torch.nn.Module): + """ + A basic usage of python assertion. + """ + def __init__(self): + super().__init__() + + def forward(self, x): + # assertion with error message + assert x.shape[0] > 2, f"{x.shape[0]} is not greater than 2" + # assertion without error message + assert x.shape[0] > 1 + return x diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/dynamic_shape_constructor.py b/MLPY/Lib/site-packages/torch/_export/db/examples/dynamic_shape_constructor.py new file mode 100644 index 0000000000000000000000000000000000000000..28cc1eea66307807d2c277bad6667a589c4002c8 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/dynamic_shape_constructor.py @@ -0,0 +1,19 @@ +import torch + +from torch._export.db.case import export_case + + +@export_case( + example_inputs=(torch.ones(3, 2),), + tags={"torch.dynamic-shape"}, +) +class DynamicShapeConstructor(torch.nn.Module): + """ + Tensor constructors should be captured with dynamic shape inputs rather + than being baked in with static shape. + """ + def __init__(self): + super().__init__() + + def forward(self, x): + return torch.ones(x.shape[0] * 2) diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/dynamic_shape_if_guard.py b/MLPY/Lib/site-packages/torch/_export/db/examples/dynamic_shape_if_guard.py new file mode 100644 index 0000000000000000000000000000000000000000..977e5e95276297fe4dffd5b3f1a2d22603c3ebbd --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/dynamic_shape_if_guard.py @@ -0,0 +1,21 @@ +import torch + +from torch._export.db.case import export_case + + +@export_case( + example_inputs=(torch.ones(3, 2, 2),), + tags={"torch.dynamic-shape", "python.control-flow"}, +) +class DynamicShapeIfGuard(torch.nn.Module): + """ + An `if` statement with a backed dynamic shape predicate will be specialized into + one particular branch and generate a guard. However, export will fail if + the dimension is marked as dynamic by a higher-level API.
+ """ + + def forward(self, x): + if x.shape[0] == 3: + return x.cos() + + return x.sin() diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/dynamic_shape_map.py b/MLPY/Lib/site-packages/torch/_export/db/examples/dynamic_shape_map.py new file mode 100644 index 0000000000000000000000000000000000000000..3c09a72f528fd8c3003db9b24178f4430d2bf73b --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/dynamic_shape_map.py @@ -0,0 +1,23 @@ +import torch + +from torch._export.db.case import export_case +from functorch.experimental.control_flow import map + + +@export_case( + example_inputs=(torch.ones(3, 2), torch.ones(2)), + tags={"torch.dynamic-shape", "torch.map"}, +) +class DynamicShapeMap(torch.nn.Module): + """ + functorch map() maps a function over the first tensor dimension. + """ + + def __init__(self): + super().__init__() + + def forward(self, xs, y): + def body(x, y): + return x + y + + return map(body, xs, y) diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/dynamic_shape_round.py b/MLPY/Lib/site-packages/torch/_export/db/examples/dynamic_shape_round.py new file mode 100644 index 0000000000000000000000000000000000000000..df35d45193681199307d7b580fb99b8e7f8ba6b9 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/dynamic_shape_round.py @@ -0,0 +1,24 @@ +import torch + +from torch._export.db.case import export_case, SupportLevel +from torch.export import Dim + +x = torch.ones(3, 2) +dim0_x = Dim("dim0_x") + +@export_case( + example_inputs=(x,), + tags={"torch.dynamic-shape", "python.builtin"}, + support_level=SupportLevel.NOT_SUPPORTED_YET, + dynamic_shapes={"x": {0: dim0_x}}, +) +class DynamicShapeRound(torch.nn.Module): + """ + Calling round on dynamic shapes is not supported. + """ + + def __init__(self): + super().__init__() + + def forward(self, x): + return x[: round(x.shape[0] / 2)] diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/dynamic_shape_slicing.py b/MLPY/Lib/site-packages/torch/_export/db/examples/dynamic_shape_slicing.py new file mode 100644 index 0000000000000000000000000000000000000000..6b9b50b38ce63f68105a750cca0ebd7357c0239c --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/dynamic_shape_slicing.py @@ -0,0 +1,20 @@ +import torch + +from torch._export.db.case import export_case + + +@export_case( + example_inputs=(torch.ones(3, 2),), + tags={"torch.dynamic-shape"}, +) +class DynamicShapeSlicing(torch.nn.Module): + """ + Slices with dynamic shape arguments should be captured into the graph + rather than being baked in. + """ + + def __init__(self): + super().__init__() + + def forward(self, x): + return x[: x.shape[0] - 2, x.shape[1] - 1 :: 2] diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/dynamic_shape_view.py b/MLPY/Lib/site-packages/torch/_export/db/examples/dynamic_shape_view.py new file mode 100644 index 0000000000000000000000000000000000000000..c414df8c8dbadd772ffca59ab08c20392005fff2 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/dynamic_shape_view.py @@ -0,0 +1,22 @@ +import torch + +from torch._export.db.case import export_case + + +@export_case( + example_inputs=(torch.ones(10, 10),), + tags={"torch.dynamic-shape"}, +) +class DynamicShapeView(torch.nn.Module): + """ + Dynamic shapes should be propagated to view arguments instead of being + baked into the exported graph. 
+ """ + + def __init__(self): + super().__init__() + + def forward(self, x): + new_x_shape = x.size()[:-1] + (2, 5) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1) diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/fn_with_kwargs.py b/MLPY/Lib/site-packages/torch/_export/db/examples/fn_with_kwargs.py new file mode 100644 index 0000000000000000000000000000000000000000..0bbee5fc57cb18b6ee91a823d488e92e7a1d8b62 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/fn_with_kwargs.py @@ -0,0 +1,32 @@ +import torch + +from torch._export.db.case import export_case, ExportArgs, SupportLevel + + +@export_case( + example_inputs=ExportArgs( + torch.randn(4), + (torch.randn(4), torch.randn(4)), + *[torch.randn(4), torch.randn(4)], + mykw0=torch.randn(4), + input0=torch.randn(4), input1=torch.randn(4) + ), + tags={"python.data-structure"}, + support_level=SupportLevel.SUPPORTED, +) +class FnWithKwargs(torch.nn.Module): + """ + Keyword arguments are not supported at the moment. + """ + def __init__(self): + super().__init__() + + def forward(self, pos0, tuple0, *myargs, mykw0, **mykwargs): + out = pos0 + for arg in tuple0: + out = out * arg + for arg in myargs: + out = out * arg + out = out * mykw0 + out = out * mykwargs["input0"] * mykwargs["input1"] + return out diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/list_contains.py b/MLPY/Lib/site-packages/torch/_export/db/examples/list_contains.py new file mode 100644 index 0000000000000000000000000000000000000000..0c9eba71529240e13850d6bf033d0c6b550c0ab1 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/list_contains.py @@ -0,0 +1,21 @@ +import torch + +from torch._export.db.case import export_case + + +@export_case( + example_inputs=(torch.ones(3, 2),), + tags={"torch.dynamic-shape", "python.data-structure", "python.assert"}, +) +class ListContains(torch.nn.Module): + """ + List containment relation can be checked on a dynamic shape or constants. + """ + def __init__(self): + super().__init__() + + def forward(self, x): + assert x.size(-1) in [6, 2] + assert x.size(0) not in [4, 5, 6] + assert "monkey" not in ["cow", "pig"] + return x + x diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/list_unpack.py b/MLPY/Lib/site-packages/torch/_export/db/examples/list_unpack.py new file mode 100644 index 0000000000000000000000000000000000000000..d68c5cf0f2a917a794e54ea91103c168f3c85729 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/list_unpack.py @@ -0,0 +1,27 @@ +from typing import List + +import torch + +from torch._export.db.case import export_case + + +@export_case( + example_inputs=([torch.ones(3, 2), torch.tensor(4), torch.tensor(5)],), + tags={"python.control-flow", "python.data-structure"}, +) +class ListUnpack(torch.nn.Module): + """ + Lists are treated as static construct, therefore unpacking should be + erased after tracing. + """ + + def __init__(self): + super().__init__() + + def forward(self, args: List[torch.Tensor]): + """ + Lists are treated as static construct, therefore unpacking should be + erased after tracing. 
+ """ + x, *y = args + return x + y[0] diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/model_attr_mutation.py b/MLPY/Lib/site-packages/torch/_export/db/examples/model_attr_mutation.py new file mode 100644 index 0000000000000000000000000000000000000000..8aca91755613189e46988075a7dfcb1e9547c0d1 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/model_attr_mutation.py @@ -0,0 +1,25 @@ +import torch + +from torch._export.db.case import export_case, SupportLevel + + +@export_case( + example_inputs=(torch.ones(3, 2),), + tags={"python.object-model"}, + support_level=SupportLevel.NOT_SUPPORTED_YET, +) +class ModelAttrMutation(torch.nn.Module): + """ + Attribute mutation is not supported. + """ + + def __init__(self): + super().__init__() + self.attr_list = [torch.ones(3, 2), torch.ones(3, 2)] + + def recreate_list(self): + return [torch.zeros(3, 2), torch.zeros(3, 2)] + + def forward(self, x): + self.attr_list = self.recreate_list() + return x.sum() + self.attr_list[0].sum() diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/nested_function.py b/MLPY/Lib/site-packages/torch/_export/db/examples/nested_function.py new file mode 100644 index 0000000000000000000000000000000000000000..b7c6f90c86b1fb6905edc3874dc929011048a8cc --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/nested_function.py @@ -0,0 +1,27 @@ +import torch + +from torch._export.db.case import export_case + + +@export_case( + example_inputs=(torch.ones(3, 2), torch.ones(2)), + tags={"python.closure"}, +) +class NestedFunction(torch.nn.Module): + """ + Nested functions are traced through. Side effects on global captures + are not supported though. + """ + def __init__(self): + super().__init__() + + def forward(self, a, b): + x = a + b + z = a - b + + def closure(y): + nonlocal x + x += 1 + return x * y + z + + return closure(x) diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/null_context_manager.py b/MLPY/Lib/site-packages/torch/_export/db/examples/null_context_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..b9856a9f41d0eae30d070d38fb9eec67f92e0779 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/null_context_manager.py @@ -0,0 +1,26 @@ +import contextlib + +import torch + +from torch._export.db.case import export_case + + +@export_case( + example_inputs=(torch.ones(3, 2),), + tags={"python.context-manager"}, +) +class NullContextManager(torch.nn.Module): + """ + Null context manager in Python will be traced out. + """ + + def __init__(self): + super().__init__() + + def forward(self, x): + """ + Null context manager in Python will be traced out. 
+ """ + ctx = contextlib.nullcontext() + with ctx: + return x.sin() + x.cos() diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/optional_input.py b/MLPY/Lib/site-packages/torch/_export/db/examples/optional_input.py new file mode 100644 index 0000000000000000000000000000000000000000..2cbf1604c51e85029f7708312f7a5dcbbc0478f5 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/optional_input.py @@ -0,0 +1,19 @@ +import torch + +from torch._export.db.case import export_case, SupportLevel + + +@export_case( + example_inputs=(torch.randn(2, 3),), + tags={"python.object-model"}, + support_level=SupportLevel.NOT_SUPPORTED_YET, +) +class OptionalInput(torch.nn.Module): + """ + Tracing through optional input is not supported yet + """ + + def forward(self, x, y=torch.ones(2, 3)): + if y is not None: + return x + y + return x diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/pytree_flatten.py b/MLPY/Lib/site-packages/torch/_export/db/examples/pytree_flatten.py new file mode 100644 index 0000000000000000000000000000000000000000..efc565e7e507940f87da4dcb2290e17892f3050f --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/pytree_flatten.py @@ -0,0 +1,20 @@ +import torch + +from torch._export.db.case import export_case, SupportLevel +from torch.utils import _pytree as pytree + + +@export_case( + example_inputs=({1: torch.randn(3, 2), 2: torch.randn(3, 2)},), + support_level=SupportLevel.SUPPORTED, +) +class PytreeFlatten(torch.nn.Module): + """ + Pytree from PyTorch can be captured by TorchDynamo. + """ + def __init__(self): + super().__init__() + + def forward(self, x): + y, spec = pytree.tree_flatten(x) + return y[0] + 1 diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/scalar_output.py b/MLPY/Lib/site-packages/torch/_export/db/examples/scalar_output.py new file mode 100644 index 0000000000000000000000000000000000000000..eca92154efe844f824255995c6eab6071343012f --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/scalar_output.py @@ -0,0 +1,23 @@ +import torch + +from torch._export.db.case import export_case +from torch.export import Dim + +x = torch.ones(3, 2) +dim1_x = Dim("dim1_x") + +@export_case( + example_inputs=(x,), + tags={"torch.dynamic-shape"}, + dynamic_shapes={"x": {1: dim1_x}}, +) +class ScalarOutput(torch.nn.Module): + """ + Returning scalar values from the graph is supported, in addition to Tensor + outputs. Symbolic shapes are captured and rank is specialized. + """ + def __init__(self): + super().__init__() + + def forward(self, x): + return x.shape[1] + 1 diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/specialized_attribute.py b/MLPY/Lib/site-packages/torch/_export/db/examples/specialized_attribute.py new file mode 100644 index 0000000000000000000000000000000000000000..fcdcbaa1a2ee3f840c4f01c1d5d37e27cef52a10 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/specialized_attribute.py @@ -0,0 +1,29 @@ +from enum import Enum + +import torch + +from torch._export.db.case import export_case + + +class Animal(Enum): + COW = "moo" + + +@export_case( + example_inputs=(torch.ones(3, 2),), +) +class SpecializedAttribute(torch.nn.Module): + """ + Model attributes are specialized. 
+ """ + + def __init__(self): + super().__init__() + self.a = "moo" + self.b = 4 + + def forward(self, x): + if self.a == Animal.COW.value: + return x * x + self.b + else: + raise ValueError("bad") diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/static_for_loop.py b/MLPY/Lib/site-packages/torch/_export/db/examples/static_for_loop.py new file mode 100644 index 0000000000000000000000000000000000000000..cfdd70566f839315cdbe62022c63e08c6709add4 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/static_for_loop.py @@ -0,0 +1,22 @@ +import torch + +from torch._export.db.case import export_case + + +@export_case( + example_inputs=(torch.ones(3, 2),), + tags={"python.control-flow"}, +) +class StaticForLoop(torch.nn.Module): + """ + A for loop with constant number of iterations should be unrolled in the exported graph. + """ + + def __init__(self): + super().__init__() + + def forward(self, x): + ret = [] + for i in range(10): # constant + ret.append(i + x) + return ret diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/static_if.py b/MLPY/Lib/site-packages/torch/_export/db/examples/static_if.py new file mode 100644 index 0000000000000000000000000000000000000000..78b43cfd93d4a596c72a5a2d0003bfe0497416c2 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/static_if.py @@ -0,0 +1,23 @@ +import torch + +from torch._export.db.case import export_case + + +@export_case( + example_inputs=(torch.ones(3, 2, 2),), + tags={"python.control-flow"}, +) +class StaticIf(torch.nn.Module): + """ + `if` statement with static predicate value should be traced through with the + taken branch. + """ + + def __init__(self): + super().__init__() + + def forward(self, x): + if len(x.shape) == 3: + return x + torch.ones(1, 1, 1) + + return x diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/tensor_setattr.py b/MLPY/Lib/site-packages/torch/_export/db/examples/tensor_setattr.py new file mode 100644 index 0000000000000000000000000000000000000000..f6d86efe02decd7d7498d9fc82535a7464c3a0dc --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/tensor_setattr.py @@ -0,0 +1,17 @@ +import torch + +from torch._export.db.case import export_case, SupportLevel + + +@export_case( + example_inputs=(torch.randn(3, 2), "attr"), + tags={"python.builtin"}, + support_level=SupportLevel.SUPPORTED, +) +class TensorSetattr(torch.nn.Module): + """ + setattr() call onto tensors is not supported. + """ + def forward(self, x, attr): + setattr(x, attr, torch.randn(3, 2)) + return x + 4 diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/torch_sym_min.py b/MLPY/Lib/site-packages/torch/_export/db/examples/torch_sym_min.py new file mode 100644 index 0000000000000000000000000000000000000000..e79a22b66e522a0f88b837c2a9c6bc4d1ffc69f7 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/torch_sym_min.py @@ -0,0 +1,17 @@ +import torch + +from torch._export.db.case import export_case, SupportLevel + + +@export_case( + example_inputs=(torch.ones(3, 2),), + tags={"torch.operator"}, + support_level=SupportLevel.NOT_SUPPORTED_YET, +) +class TorchSymMin(torch.nn.Module): + """ + torch.sym_min operator is not supported in export. 
+ """ + + def forward(self, x): + return x.sum() + torch.sym_min(x.size(0), 100) diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/type_reflection_method.py b/MLPY/Lib/site-packages/torch/_export/db/examples/type_reflection_method.py new file mode 100644 index 0000000000000000000000000000000000000000..031328c7dc3afcdde43f40bf2dade03657172ee7 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/type_reflection_method.py @@ -0,0 +1,41 @@ +import torch + +from torch._export.db.case import export_case, SupportLevel, export_rewrite_case + + +class A: + @classmethod + def func(cls, x): + return 1 + x + + +@export_case( + example_inputs=(torch.ones(3, 4),), + tags={"python.builtin"}, + support_level=SupportLevel.SUPPORTED, +) +class TypeReflectionMethod(torch.nn.Module): + """ + type() calls on custom objects followed by attribute accesses are not allowed + due to its overly dynamic nature. + """ + + def __init__(self): + super().__init__() + + def forward(self, x): + a = A() + return type(a).func(x) + + +@export_rewrite_case(parent=TypeReflectionMethod) +class TypeReflectionMethodRewrite(torch.nn.Module): + """ + Custom object class methods will be inlined. + """ + + def __init__(self): + super().__init__() + + def forward(self, x): + return A.func(x) diff --git a/MLPY/Lib/site-packages/torch/_export/db/examples/user_input_mutation.py b/MLPY/Lib/site-packages/torch/_export/db/examples/user_input_mutation.py new file mode 100644 index 0000000000000000000000000000000000000000..43906a88b15e172edb7d5b917b8d9a37b515f11e --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/examples/user_input_mutation.py @@ -0,0 +1,18 @@ +import torch + +from torch._export.db.case import export_case, SupportLevel + + +@export_case( + example_inputs=(torch.ones(3, 2),), + tags={"torch.mutation"}, + support_level=SupportLevel.SUPPORTED, +) +class UserInputMutation(torch.nn.Module): + """ + Directly mutate user input in forward + """ + + def forward(self, x): + x.mul_(2) + return x.cos() diff --git a/MLPY/Lib/site-packages/torch/_export/db/gen_example.py b/MLPY/Lib/site-packages/torch/_export/db/gen_example.py new file mode 100644 index 0000000000000000000000000000000000000000..bcba6c92ef121ac11e77e657c58babeac4e79ad0 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/gen_example.py @@ -0,0 +1,28 @@ +import os +import sys + +import torch._export.db.examples as examples + +TEMPLATE = '''import torch + +from torch._export.db.case import export_case + + +@export_case( + example_inputs=(torch.randn(3, 2),), + tags={{}}, +) +def {case_name}(x): + """ + """ + + return +''' + +if __name__ == "__main__": + assert len(sys.argv) == 2 + root_dir = examples.__name__.replace(".", "/") + assert os.path.exists(root_dir) + with open(os.path.join(root_dir, sys.argv[1] + ".py"), "w") as f: + print("Writing to", f.name, "...") + f.write(TEMPLATE.format(case_name=sys.argv[1])) diff --git a/MLPY/Lib/site-packages/torch/_export/db/logging.py b/MLPY/Lib/site-packages/torch/_export/db/logging.py new file mode 100644 index 0000000000000000000000000000000000000000..44f68caff77429c97ce11153e0cf29a2550422a2 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/db/logging.py @@ -0,0 +1,2 @@ +def exportdb_error_message(case_name: str): + return "" diff --git a/MLPY/Lib/site-packages/torch/_export/error.py b/MLPY/Lib/site-packages/torch/_export/error.py new file mode 100644 index 0000000000000000000000000000000000000000..12d2e594c11d381912fc357ca1250f9ae151bffd --- /dev/null +++ 
b/MLPY/Lib/site-packages/torch/_export/error.py @@ -0,0 +1,56 @@ +from enum import Enum + + +class ExportErrorType(Enum): + # User providing invalid inputs to either tracer, or other public facing APIs + INVALID_INPUT_TYPE = 1 + + # User returning values from their models that we don’t support. + INVALID_OUTPUT_TYPE = 2 + + # Generated IR does not conform to Export IR Specification. + VIOLATION_OF_SPEC = 3 + + # User’s code contains types and functionalities we don’t support. + NOT_SUPPORTED = 4 + + # User's code didn't provide necessary details for us to successfully trace and export. + # For example, we use a lot of decorators and ask users to annotate their model. + MISSING_PROPERTY = 5 + + # User is using an API without proper initialization step. + UNINITIALIZED = 6 + + +def internal_assert(pred: bool, assert_msg: str) -> None: + """ + This is exir's custom assert method. It internally just throws InternalError. + Note that the sole purpose is to throw our own error while maintaining similar syntax + as python assert. + """ + + if not pred: + raise InternalError(assert_msg) + + +class InternalError(Exception): + """ + Raised when an internal invariance is violated in EXIR stack. + Should hint users to report a bug to dev and expose the original + error message. + """ + + def __init__(self, message: str) -> None: + super().__init__(message) + + +class ExportError(Exception): + """ + This type of exception is raised for errors that are directly caused by the user + code. In general, user errors happen during model authoring, tracing, using our public + facing APIs, and writing graph passes. + """ + + def __init__(self, error_code: ExportErrorType, message: str) -> None: + prefix = f"[{error_code}]: " + super().__init__(prefix + message) diff --git a/MLPY/Lib/site-packages/torch/_export/exported_program.py b/MLPY/Lib/site-packages/torch/_export/exported_program.py new file mode 100644 index 0000000000000000000000000000000000000000..cdd7788a3da1403fb6c302cad7d7510096b9f904 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/exported_program.py @@ -0,0 +1,50 @@ +import warnings + + +import torch +import torch.fx + + +# TODO(ycao): This is added to avoid breaking existing code temporarily. +# Remove when migration is done. +from torch.export.graph_signature import ( + ExportBackwardSignature, + ExportGraphSignature, +) + +from torch.export.exported_program import ( + ExportedProgram, + ModuleCallEntry, + ModuleCallSignature, +) + + + +__all__ = [ + "ExportBackwardSignature", + "ExportGraphSignature", + "ExportedProgram", + "ModuleCallEntry", + "ModuleCallSignature", +] + + +def _create_graph_module_for_export(root, graph): + try: + gm = torch.fx.GraphModule(root, graph) + except SyntaxError: + # If custom objects stored in memory are being used in the graph, + # the generated python code will result in a syntax error on the custom + # object, since it is unable to parse the in-memory object. However + # we can still run the graph eagerly through torch.fx.Interpreter, + # so we will bypass this error. + warnings.warn( + "Unable to execute the generated python source code from " + "the graph. The graph module will no longer be directly callable, " + "but you can still run the ExportedProgram, and if needed, you can " + "run the graph module eagerly using torch.fx.Interpreter." 
+ ) + gm = torch.fx.GraphModule(root, torch.fx.Graph()) + gm._graph = graph + + return gm diff --git a/MLPY/Lib/site-packages/torch/_export/non_strict_utils.py b/MLPY/Lib/site-packages/torch/_export/non_strict_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..84e4cf6a9e2fd60e59b2769572b40ca6f57b7539 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/non_strict_utils.py @@ -0,0 +1,258 @@ +import inspect +from collections import defaultdict +from typing import Any, Callable, Dict, List, Tuple, Union + +import torch +from torch._dynamo.source import ( + AttrSource, + GetItemSource, + LocalSource, + TensorProperty, + TensorPropertySource, +) +from torch._dynamo.variables.builder import TrackedFake +from torch._export.passes.add_runtime_assertions_for_constraints_pass import InputDim +from torch._guards import Source +from torch._subclasses.fake_tensor import FakeTensorMode +from torch.export import Constraint +from torch.export.graph_signature import CustomObjArgument +from torch.fx.experimental.symbolic_shapes import ( + ConstraintViolationError, + DimDynamic, + EqualityConstraint, + ShapeEnv, + StatelessSymbolicContext, +) +from torch.utils._pytree import ( + GetAttrKey, + KeyPath, + MappingKey, + SequenceKey, + tree_map_with_path, +) + + +def key_path_to_source(kp: KeyPath) -> Source: + """ + Given a key path, return the source for the key path. + """ + source: Source = LocalSource("args") + for k in kp: + if isinstance(k, SequenceKey): + source = GetItemSource(source, k.idx) + elif isinstance(k, MappingKey): + source = GetItemSource(source, k.key) + elif isinstance(k, GetAttrKey): + source = AttrSource(source, k.name) + else: + raise ValueError(f"Unknown KeyEntry {k}") + + return source + + +def _is_constant_argument(t): + return t is None or isinstance(t, (int, float, bool, str)) + + +def fakify( + mode: FakeTensorMode, + kp: KeyPath, + t: Any, + t_constraints: Dict[int, Dict[int, Constraint]], + sources: Dict[Tuple[int, int], List[Source]], +): + source = key_path_to_source(kp) + if _is_constant_argument(t) or isinstance(t, torch.ScriptObject): + return t + if not isinstance(t, torch.Tensor): + raise ValueError(f"Unsupported input type {type(t)}") + n_dims = len(t.shape) + symbolic_context = StatelessSymbolicContext( + dynamic_sizes=[DimDynamic.STATIC] * n_dims, + constraint_sizes=[None] * n_dims, + ) + t_id = id(t) + if t_id in t_constraints: + for i, constraint in t_constraints[t_id].items(): + symbolic_context.constraint_sizes[i] = constraint.constraint_range + symbolic_context.dynamic_sizes[i] = DimDynamic.DYNAMIC + src = TensorPropertySource(base=source, prop=TensorProperty.SIZE, idx=i) + sources[(t_id, i)].append(src) + mode.shape_env.source_name_to_debug_name[src.name()] = constraint.debug_name + fake = mode.from_tensor(t, source=source, symbolic_context=symbolic_context) + mode.shape_env.tracked_fakes.append(TrackedFake(fake, source, symbolic_context)) + return fake + + +def make_fake_params_buffers( + fake_mode: FakeTensorMode, + params_buffers: Dict[str, torch.Tensor], +) -> Dict[str, Union[torch.Tensor, torch.nn.Parameter]]: + faked_params_buffers = {} + for key, value in params_buffers.items(): + faked_params_buffers[key] = fake_mode.from_tensor(value, static_shapes=True) + return faked_params_buffers + + +def make_fake_inputs(nn_module, args, kwargs, constraints): + """ + Given an nn module, example inputs, and constraints, return a new fake mode, + fake inputs created in that mode whose dynamic shape dimensions are constrained + by the 
given ranges, and sources for pairs of dynamic shape dimensions that are + constrained to be equal. + """ + # TODO(avik): refactor Dynamo to avoid duplication of the following code + # between non-strict and strict. + # Specifically, here (non-strict) we do the following pre-tracing steps: + # - Fakify inputs. + # - Process input shape equalities. + # In strict, these steps are spread across multiple files: + # - output_graph.py fakifies inputs. + # - [post-tracing] guards.py processes input shape equalities. + + t_constraints: Dict[int, Dict[int, Constraint]] = defaultdict(dict) + for constraint in constraints: + t_constraints[constraint.t_id][constraint.dim] = constraint + if constraint.shared is not None: + t_constraints[constraint.shared.t_id][constraint.shared.dim] = constraint + + code = nn_module.forward.__code__ + co_fields = { + "co_name": code.co_name, + "co_filename": code.co_filename, + "co_firstlineno": code.co_firstlineno, + } + + fake_mode = FakeTensorMode( + shape_env=ShapeEnv(tracked_fakes=[], co_fields=co_fields), + allow_non_fake_inputs=True, + ) + if fake_mode.shape_env is None or fake_mode.shape_env.tracked_fakes is None: + raise ValueError( + "Detected fake_mode does not have a shape_env with tracked fakes. " + "If you constructed the module under a FakeTensorMode, " + "please initialize it like: FakeTensorMode(shape_env=ShapeEnv(tracked_fakes=[]))" + ) + + with fake_mode: + original_signature = inspect.signature(nn_module.forward) + sources: Dict[Tuple[int, int], List[Source]] = defaultdict(list) + fake_args, fake_kwargs = tree_map_with_path( + lambda kp, val: fakify(fake_mode, kp, val, t_constraints, sources), + (args, kwargs), + ) + + from sympy import Symbol + + source_pairs: List[Tuple[Source, Source]] = [] + derived_equalities: List[Tuple[Source, Union[Source, Symbol], Callable]] = [] + phantom_symbols: Dict[str, Symbol] = {} + for constraint in constraints: + torch.export.dynamic_shapes._process_equalities( + constraint, + lambda t_id, dim: sources[(t_id, dim)], + fake_mode.shape_env, + source_pairs, + derived_equalities, + phantom_symbols, + ) + + equalities_inputs = EqualityConstraint( + source_pairs=source_pairs, + derived_equalities=derived_equalities, + phantom_symbols=list(phantom_symbols.values()), + warn_only=False, + ) + return fake_mode, fake_args, fake_kwargs, equalities_inputs, original_signature + + +def make_constraints( + fake_mode, + equalities_inputs, + original_signature, + gm, +): + """ + Given a fake mode, sources pairs corresponding to equal dynamic shape dimensions, + and a graph module, produce guards on the fake mode's shape env (raising constraint + violations if any), solve (to suggest simplifications or fixes), and return the + resulting range constraints and equality constraints. + """ + # TODO(avik): refactor Dynamo to avoid duplication of the following code + # between non-strict and strict. + # Specifically, here (non-strict) we do the following post-tracing steps: + # - Produce guards. + # - Solve constraints. + # - Install shape metadata in IR. + # In strict, these steps are spread across multiple files: + # - guards.py produces guards. + # - eval_frame.py solves constraints + # - _trace.py installs shape metadata in IR. 
+ + shape_env = fake_mode.shape_env + placeholders = [tf.fake for tf in shape_env.tracked_fakes] + sources = [tf.source for tf in shape_env.tracked_fakes] + input_contexts = [tf.symbolic_context for tf in shape_env.tracked_fakes] + constraint_violation_error = None + try: + shape_env.produce_guards( + placeholders, + sources, + input_contexts=input_contexts, + equalities_inputs=equalities_inputs, + ignore_static=False, + ) + except ConstraintViolationError as e: + constraint_violation_error = e + + shape_env.frozen = True + dim_constraints = shape_env.dim_constraints + if dim_constraints is None: + # Expected when shape_env.produce_guards throws an early constraint violation error. + # There is nothing to solve for in this case. + # TODO(avik): Maybe record the constraint violation error instead and replay later? + assert constraint_violation_error + raise constraint_violation_error + dim_constraints.solve() + dim_constraints.remove_redundant_dynamic_results() + forced_specializations = dim_constraints.forced_specializations() + msg = dim_constraints.prettify_results( + original_signature, constraint_violation_error, forced_specializations + ) + if constraint_violation_error: + constraint_violation_error.args = (constraint_violation_error.args[0] + msg,) + elif forced_specializations: + constraint_violation_error = ConstraintViolationError(msg) + if constraint_violation_error: + raise constraint_violation_error + + range_constraints = {} + input_dims = defaultdict(list) + free_symbols = set() + for node in gm.graph.nodes: + if node.op != "placeholder": + continue + if _is_constant_argument(node.meta["val"]) or isinstance( + node.meta["val"], CustomObjArgument + ): + continue + for i, d in enumerate(node.meta["val"].shape): + if isinstance(d, torch.SymInt): + # Look up the range constraint for the symbol corresponding to this shape dimension + # and store it indexed by the symbolic expression corresponding to it. + # NOTE(avik): Use node._expr instead of node.expr for the lookup here because + # we want the symbol, not its replacement, which could be an expression. Maybe + # there's a better way to do this, e.g., by (re)computing value ranges for expressions? + range_constraints[d.node.expr] = shape_env.var_to_range[d.node._expr] + input_dims[d.node.expr].append(InputDim(input_name=node.name, dim=i)) + free_symbols.update(d.node.expr.free_symbols) + + for symbol in free_symbols: + if symbol not in range_constraints: + # Placeholders can have symbolic shapes that are derived expressions. + # The above code will record direct range constraints for them + # so that we can do runtime assertions. In addition, for serde checks + # we want to record range constraints for their root symbols. 
+ range_constraints[symbol] = shape_env.var_to_range[symbol] + + return range_constraints diff --git a/MLPY/Lib/site-packages/torch/_export/pass_base.py b/MLPY/Lib/site-packages/torch/_export/pass_base.py new file mode 100644 index 0000000000000000000000000000000000000000..23e187ee6b7213d4c722200bee34946ecc0025bf --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/pass_base.py @@ -0,0 +1,435 @@ +import operator +import traceback +import typing +from contextlib import nullcontext +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union + +import torch +from functorch.experimental.control_flow import _unstack_pytree +from torch import fx +from torch._dispatch.python import enable_python_dispatcher +from torch._export.pass_infra.node_metadata import NodeMetadata +from torch._export.pass_infra.proxy_value import ProxyValue +from torch._subclasses import FakeTensor, UnsupportedFakeTensorException +from torch._subclasses.fake_tensor import FakeTensorMode +from torch.fx import traceback as fx_traceback +from torch.fx.experimental.proxy_tensor import PythonKeyTracer +from torch.fx.graph import CodeGen +from torch.fx.passes.infra.pass_base import PassBase, PassResult +from torch.fx.passes.shape_prop import _extract_tensor_metadata, TensorMetadata +from torch.utils import _pytree as pytree + + +__all__ = ["_ExportPassBaseDeprecatedDoNotUse"] + + +Argument = Any +Value = Any +Fn = Callable[..., Any] +PassType = Callable[[torch.fx.GraphModule], Optional[PassResult]] + + +_TORCH_SYM_OPS: Set[Callable] = { + torch.sym_int, + torch.sym_ite, + torch.sym_max, + torch.sym_min, + torch.sym_not, + torch.sym_sqrt, +} + + +class ExportPassBaseError(RuntimeError): + pass + + +class _ExportPassBaseDeprecatedDoNotUse(PassBase): + """ + Interpreter-based pass class to help users maintain the IR spec while writing + transformations. 
+ """ + + @staticmethod + def _create_dummy_node_metadata(): + return NodeMetadata({"stack_trace": "".join(traceback.format_stack(limit=1))}) + + + class ExportTracer(PythonKeyTracer): + def __init__(self, callback: "_ExportPassBaseDeprecatedDoNotUse", codegen: CodeGen) -> None: + super().__init__() + self.callback = callback + self.root = torch.nn.Module() + self.graph = torch.fx.Graph() + self.graph.set_codegen(codegen) + self.tensor_attrs: Dict[str, torch.Tensor] = {} # type: ignore[assignment] + self.fake_tensor_mode: Optional[FakeTensorMode] = None + self.submodules: Dict[torch.nn.Module, str] = {} + + def trace(self) -> None: + raise ExportPassBaseError("ExportTracer doesn't support trace().") + + def create_arg(self, a: Argument) -> torch.fx.Node: + if isinstance(a, torch.nn.Module): + if a not in self.submodules: + name_submodule = f"submodule_{len(self.submodules)}" + self.root.add_module(name_submodule, a) + self.submodules[a] = name_submodule + elif isinstance(a, FakeTensor): + if not hasattr(a, "constant") or a.constant is None: + raise ExportPassBaseError(f"Cannot add {a} to graph.") + a = a.constant + node = super().create_arg(a) + if ( + isinstance(a, torch.Tensor) + and isinstance(node, torch.fx.Node) + and node.op == "get_attr" + ): + self.set_metadata(node, a) + self.callback.on_attr(ProxyValue(a, node)) + return node + + def set_metadata( + self, node: torch.fx.Node, value: Argument, + ) -> None: + # propagate the fake tensor or sym nodes + def make_val( + x: Argument, + ) -> Union[FakeTensor, torch.SymInt, torch.SymFloat, torch.SymBool, int, float, bool, str, None]: + if isinstance(x, FakeTensor): + return x + elif isinstance(x, torch.Tensor): + if x.is_quantized: + # TODO (tmanlaibaatar) properly support Quantized FakeTensor + x = torch.dequantize(x) + + try: + assert self.fake_tensor_mode is not None + # TODO we should allocate static shapes + # for param/buffer values + if isinstance(x, torch.nn.Parameter): + fake_tensor = self.fake_tensor_mode.from_tensor( + x, static_shapes=True + ) + else: + fake_tensor = self.fake_tensor_mode.from_tensor(x) + except UnsupportedFakeTensorException: + # TODO: This is just a workaround to get over the + # x.as_subclass error + print( + "Fakeifying a Tensor subclass is not supported \ + right now. Instead a TensorMetadata is used." 
+ ) + fake_tensor = None + return fake_tensor + elif isinstance(x, (torch.SymInt, torch.SymFloat, torch.SymBool, int, float, bool, str)): + return x + else: + return None + + node.meta["val"] = pytree.tree_map(make_val, value) + + # Set the tensor_metadata for values that do not have a corresponding FakeTensor + def make_tensor_meta(x: Argument) -> Optional[TensorMetadata]: + if not isinstance(x, FakeTensor) and isinstance(x, torch.Tensor): + if x.is_quantized: + # TODO (tmanlaibaatar) properly support Quantized FakeTensor + x = torch.dequantize(x) + + try: + assert self.fake_tensor_mode is not None + _ = self.fake_tensor_mode.from_tensor(x) + tensor_meta = None + except UnsupportedFakeTensorException: + # TODO: This is just a workaround to get over the + # x.as_subclass error + tensor_meta = _extract_tensor_metadata(x) + return tensor_meta + else: + return None + + node.meta["tensor_meta"] = pytree.tree_map(make_tensor_meta, value) + + class ExportInterpreter(fx.Interpreter): + def __init__(self, callback: "_ExportPassBaseDeprecatedDoNotUse", gm: fx.GraphModule) -> None: + super().__init__(gm) + self.callback = callback + self.node: torch.fx.Node = next(iter(gm.graph.nodes)) + + def placeholder( + self, + target: str, + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + ) -> ProxyValue: + arg = super().placeholder(target, args, kwargs) + return self.callback.placeholder(target, arg, NodeMetadata(self.node.meta)) + + def output( + self, + target: torch.fx.node.Target, + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + ) -> ProxyValue: + return self.callback.output(args[0], NodeMetadata(self.node.meta)).data + + def call_function( + self, + target: torch.fx.node.Target, + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + ) -> ProxyValue: + meta = NodeMetadata(self.node.meta) + + if target == operator.getitem: + value, key = args + return self.callback.call_getitem(value, key, meta) + elif getattr(target, "__module__", None) in {"_operator", "math"}: + assert callable(target) + return self.callback.call_sym(target, args, meta) + elif target in _TORCH_SYM_OPS: + assert callable(target) + return self.callback.call_sym(target, args, meta) + elif isinstance(target, (torch._ops.OpOverload, torch._ops.OpOverloadPacket)): + return self.callback.call_operator( + target, + args, + kwargs, + meta, + ) + elif target == torch.ops.higher_order.cond: + pred, true_fn, false_fn, inputs = args + return self.callback.call_cond(pred, true_fn, false_fn, inputs, meta) + elif target == torch.ops.higher_order.map_impl: + f, mapped_args, operands = args # type: ignore[assignment] + return self.callback.call_map(f, mapped_args, operands, meta) + # For other unregistered HigherOrderOps, just interpret them blindly + elif isinstance(target, torch._ops.HigherOrderOperator): + return self.callback._fx( + "call_function", + target, + args, + kwargs, + meta, + ) + else: + raise ExportPassBaseError(f"Unsupported target type: {target}") + + def get_attr( + self, target: str, args: Tuple[Argument, ...], kwargs: Dict[str, Argument] + ) -> Argument: + return super().get_attr(target, args, kwargs) + + def call_module( + self, + target: torch.fx.node.Target, + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + ) -> None: + raise ExportPassBaseError("call_module is not supported.") + + def call_method( + self, target: str, args: Tuple[Argument, ...], kwargs: Dict[str, Argument] + ) -> None: + raise ExportPassBaseError("call_method is not supported.") + + def run_node(self, n: 
torch.fx.Node) -> Argument: + self.node = n + self.callback.node_debug_str = n.format_node() + return super().run_node(n) + + def __init__(self) -> None: + self.interpreter = torch.fx.Interpreter( + torch.fx.GraphModule(torch.nn.Module(), torch.fx.Graph()) + ) + self.tracer = self.ExportTracer(self, CodeGen()) + self.fake_tensor_mode: Optional[FakeTensorMode] = None + self._initialized = True + self.node_debug_str: typing.Optional[str] = None + + def _fx( + self, + kind: str, + target: torch.fx.node.Target, + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + meta: NodeMetadata, + ) -> ProxyValue: + args_data, kwargs_data = pytree.tree_map_only( + ProxyValue, lambda x: x.data, (args, kwargs) + ) + res_data = getattr(self.interpreter, kind)(target, args_data, kwargs_data) + args_proxy, kwargs_proxy = pytree.tree_map_only( + ProxyValue, lambda x: x.proxy, (args, kwargs) + ) + + name = None + if isinstance(target, torch._ops.OpOverload): + name = self.tracer.graph._target_to_str(target.overloadpacket.__name__) + + res_proxy = self.tracer.create_proxy(kind, target, args_proxy, kwargs_proxy, name=name) + res_proxy.node.meta.update(meta.data) + self.tracer.set_metadata(res_proxy.node, res_data) + return ProxyValue(res_data, res_proxy) + + def inputs(self, graph_module: torch.fx.GraphModule) -> List[Argument]: + # TODO(angelayi): Update this with what we decide to do for metadata in + # the exported graph module + if (args := graph_module.meta.get("args", None)) is not None: + return list(args) + + def extract_input(node: torch.fx.Node) -> Optional[FakeTensor]: + if "val" in node.meta: + fake = node.meta["val"] + if hasattr(fake, "constant") and fake.constant is not None: + return fake.constant + return fake + elif tensor_meta := node.meta.get("tensor_meta"): + assert self.fake_tensor_mode is not None + return FakeTensor( + self.fake_tensor_mode, + torch.empty( + tensor_meta.shape, + dtype=tensor_meta.dtype, + device="meta", + requires_grad=tensor_meta.requires_grad, + memory_format=tensor_meta.memory_format, + ), + torch.device("cpu"), + ) + elif len(node.users) == 0: + return None + raise ExportPassBaseError( + f"Cannot construct an input for graph module: {graph_module}.", + ) + + return [ + extract_input(node) + for node in graph_module.graph.nodes + if node.op == "placeholder" + ] + + def on_attr(self, attr: ProxyValue) -> None: + pass + + def placeholder(self, name: str, arg: Argument, meta: NodeMetadata) -> ProxyValue: + arg_proxy = self.tracer.create_proxy("placeholder", name, (), {}) + arg_proxy.node.meta = meta.data + self.tracer.set_metadata(arg_proxy.node, arg) + return ProxyValue(arg, arg_proxy) + + def call_operator( + self, + op, + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + meta: NodeMetadata, + ) -> ProxyValue: + return self._fx("call_function", op, args, kwargs, meta) + + def call_sym( + self, + target: Fn, + args: Tuple[Argument, ...], + meta: NodeMetadata, + ) -> ProxyValue: + return self._fx("call_function", target, args, {}, meta) + + def call_cond( + self, + pred: ProxyValue, + true_fn: torch.fx.GraphModule, + false_fn: torch.fx.GraphModule, + inputs: List[Argument], + meta: NodeMetadata, + ) -> ProxyValue: + true_branch = self.call_submodule(true_fn, tuple(inputs)) + false_branch = self.call_submodule(false_fn, tuple(inputs)) + assert true_branch is not None + assert false_branch is not None + return self._fx( + "call_function", + torch.ops.higher_order.cond, + (pred, true_branch.graph_module, false_branch.graph_module, list(inputs)), + {}, + 
meta, + ) + + def call_map( + self, + f: torch.fx.GraphModule, + mapped_args: List[ProxyValue], + operands: List[ProxyValue], + meta: NodeMetadata, + ) -> ProxyValue: + xs = _unstack_pytree([arg.data for arg in mapped_args])[0] + f_branch = self.call_submodule(f, tuple(xs + [arg.data for arg in operands])) + assert f_branch is not None + return self._fx( + "call_function", + torch.ops.higher_order.map_impl, + (f_branch.graph_module, mapped_args, operands), + {}, + meta, + ) + + def call_getitem( + self, value: ProxyValue, key: int, meta: NodeMetadata + ) -> ProxyValue: + return self._fx("call_function", operator.getitem, (value, key), {}, meta) + + def output(self, results: List[Argument], meta: NodeMetadata) -> ProxyValue: + return self._fx("output", "output", (results,), {}, meta) + + def call_submodule( + self, graph_module: fx.GraphModule, inputs: Tuple[Argument, ...] + ) -> PassResult: + prev_tracer, self.tracer = self.tracer, self.ExportTracer( + self, graph_module.graph._codegen + ) + self.tracer.fake_tensor_mode = prev_tracer.fake_tensor_mode + interpreter = self.ExportInterpreter(self, graph_module) + prev_interpreter, self.interpreter = self.interpreter, torch.fx.Interpreter( + torch.fx.GraphModule(torch.nn.Module(), torch.fx.Graph()) + ) + inputs_data = pytree.tree_map_only(ProxyValue, lambda x: x.data, inputs) + with fx_traceback.preserve_node_meta(): + interpreter.run(*inputs_data) + + new_graph_module = torch.fx.GraphModule(self.tracer.root, self.tracer.graph) + + self.tracer = prev_tracer + self.interpreter = prev_interpreter + return PassResult( + new_graph_module, + True, + ) + + def call(self, graph_module: fx.GraphModule) -> PassResult: + if not getattr(self, "_initialized", False): + raise ExportPassBaseError( + "ExportPass is not initialized with __init__().", + ) + + inputs = self.inputs(graph_module) + + fake_tensor_mode = None + for i in inputs: + if isinstance(i, FakeTensor): + assert ( + fake_tensor_mode is None or fake_tensor_mode is i.fake_mode + ), "Multiple fake tensor mode detected." 
+ fake_tensor_mode = i.fake_mode + if fake_tensor_mode is None: + self.tracer.fake_tensor_mode = FakeTensorMode(allow_non_fake_inputs=True) + fake_tensor_mode = nullcontext() # type: ignore[assignment] + dispatcher_mode = nullcontext() # type: ignore[assignment] + else: + fake_tensor_mode.allow_non_fake_inputs = True + self.tracer.fake_tensor_mode = fake_tensor_mode + dispatcher_mode = enable_python_dispatcher() # type: ignore[assignment] + self.fake_tensor_mode = self.tracer.fake_tensor_mode + + with fake_tensor_mode, dispatcher_mode: # type: ignore[assignment, union-attr] + result = self.call_submodule(graph_module, tuple(inputs)) + + return result diff --git a/MLPY/Lib/site-packages/torch/_export/pass_infra/__init__.py b/MLPY/Lib/site-packages/torch/_export/pass_infra/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MLPY/Lib/site-packages/torch/_export/pass_infra/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/pass_infra/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5dcfaa11cf470664a75df9eb99cac9db6e8a4e4c Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/pass_infra/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/pass_infra/__pycache__/node_metadata.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/pass_infra/__pycache__/node_metadata.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..73550538234b4300e6936735a9c355bc833d0d26 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/pass_infra/__pycache__/node_metadata.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/pass_infra/__pycache__/proxy_value.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/pass_infra/__pycache__/proxy_value.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da2b6c2f0c5d4ec80ffe0310a4583192eed8b36c Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/pass_infra/__pycache__/proxy_value.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/pass_infra/node_metadata.py b/MLPY/Lib/site-packages/torch/_export/pass_infra/node_metadata.py new file mode 100644 index 0000000000000000000000000000000000000000..a83ea3bb9eedadc349f4bdba8cbbf22850bb5afc --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/pass_infra/node_metadata.py @@ -0,0 +1,32 @@ +from typing import Any, Dict, Set + + +NodeMetadataValue = Any + + +PROTECTED_KEYS: Set[str] = { + "val", + "stack_trace", + "nn_module_stack", + "debug_handle", + "tensor_meta", +} + + +class NodeMetadata: + def __init__(self, data: Dict[str, Any]) -> None: + self.data: Dict[str, Any] = data.copy() + + def __getitem__(self, key: str) -> NodeMetadataValue: + return self.data[key] + + def __setitem__(self, key: str, value: NodeMetadataValue) -> NodeMetadataValue: + if key in PROTECTED_KEYS: + raise RuntimeError(f"Could not override node key: {key}") + self.data[key] = value + + def __contains__(self, key: str) -> bool: + return key in self.data + + def copy(self) -> "NodeMetadata": + return NodeMetadata(self.data.copy()) diff --git a/MLPY/Lib/site-packages/torch/_export/pass_infra/proxy_value.py b/MLPY/Lib/site-packages/torch/_export/pass_infra/proxy_value.py new file mode 100644 index 0000000000000000000000000000000000000000..8f0b90c8ddc6f0b2bde286b14edaa610394c054a --- /dev/null +++ 
b/MLPY/Lib/site-packages/torch/_export/pass_infra/proxy_value.py @@ -0,0 +1,41 @@ +# pyre-strict +from typing import Union + +import torch + + +class ProxyValue: + # pyre-ignore + def __init__(self, data, proxy: Union[torch.fx.Proxy, torch.fx.Node]): + # pyre-ignore + self.data = data + self.proxy_or_node = proxy + + @property + def node(self) -> torch.fx.Node: + if isinstance(self.proxy_or_node, torch.fx.Node): + return self.proxy_or_node + assert isinstance(self.proxy_or_node, torch.fx.Proxy) + return self.proxy_or_node.node + + @property + def proxy(self) -> torch.fx.Proxy: + if not isinstance(self.proxy_or_node, torch.fx.Proxy): + raise RuntimeError( + f"ProxyValue doesn't have attached Proxy object. Node: {self.proxy_or_node.format_node()}" + ) + return self.proxy_or_node + + def to_tensor(self) -> torch.Tensor: + assert isinstance(self.data, torch.Tensor) + return self.data + + def is_tensor(self) -> bool: + return isinstance(self.data, torch.Tensor) + + # pyre-ignore + def __iter__(self): + yield from self.data + + def __bool__(self) -> bool: + return bool(self.data) diff --git a/MLPY/Lib/site-packages/torch/_export/passes/__init__.py b/MLPY/Lib/site-packages/torch/_export/passes/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b4ad040cae5672be1b58bfe523d4fb57e41d2344 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/passes/__init__.py @@ -0,0 +1 @@ +from .replace_view_ops_with_view_copy_ops_pass import ReplaceViewOpsWithViewCopyOpsPass diff --git a/MLPY/Lib/site-packages/torch/_export/passes/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/passes/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5a207bfb87ba5b3c9807d0be604f5f8e257f3479 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/passes/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/passes/__pycache__/add_runtime_assertions_for_constraints_pass.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/passes/__pycache__/add_runtime_assertions_for_constraints_pass.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee7284e2e6f899ad0095460dd0234407d140b997 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/passes/__pycache__/add_runtime_assertions_for_constraints_pass.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/passes/__pycache__/collect_tracepoints_pass.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/passes/__pycache__/collect_tracepoints_pass.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7206c4f192ecc2c1a008d964282738dad6cc9d56 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/passes/__pycache__/collect_tracepoints_pass.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/passes/__pycache__/functionalize_side_effectful_ops_pass.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/passes/__pycache__/functionalize_side_effectful_ops_pass.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6a10ecf49e7586a4fb78d706407d2b148d56b2ce Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/passes/__pycache__/functionalize_side_effectful_ops_pass.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/passes/__pycache__/lift_constants_pass.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/passes/__pycache__/lift_constants_pass.cpython-39.pyc new 
file mode 100644 index 0000000000000000000000000000000000000000..a5a81b99b27430b8ffa3b6963c410b15bf3c093b Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/passes/__pycache__/lift_constants_pass.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/passes/__pycache__/remove_runtime_assertions.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/passes/__pycache__/remove_runtime_assertions.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..65e76302967ab8d0c164923daf337c0b0680b32a Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/passes/__pycache__/remove_runtime_assertions.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/passes/__pycache__/replace_set_grad_with_hop_pass.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/passes/__pycache__/replace_set_grad_with_hop_pass.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c488b63a14a95f3f06f8ca1fc560c91d636c4d5c Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/passes/__pycache__/replace_set_grad_with_hop_pass.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/passes/__pycache__/replace_sym_size_ops_pass.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/passes/__pycache__/replace_sym_size_ops_pass.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4e2d36525e02ca7d32c59a820699a65fa837812e Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/passes/__pycache__/replace_sym_size_ops_pass.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/passes/__pycache__/replace_view_ops_with_view_copy_ops_pass.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/passes/__pycache__/replace_view_ops_with_view_copy_ops_pass.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b06419e739f00028c83fffe989390d3688a55cdb Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/passes/__pycache__/replace_view_ops_with_view_copy_ops_pass.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/passes/add_runtime_assertions_for_constraints_pass.py b/MLPY/Lib/site-packages/torch/_export/passes/add_runtime_assertions_for_constraints_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..c8457fb4d736163c9d16a83e4fcd2efd18149e19 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/passes/add_runtime_assertions_for_constraints_pass.py @@ -0,0 +1,231 @@ +import math +import operator +import traceback +from functools import partial +from typing import Callable, Dict, List, NamedTuple, Set + +import sympy + +import torch +import torch.fx +from torch._export.pass_base import _ExportPassBaseDeprecatedDoNotUse, ProxyValue, PassResult +from torch.utils._sympy.value_ranges import ValueRanges +from torch.fx.experimental.symbolic_shapes import free_unbacked_symbols + + +__all__ = ["InputDim"] + + +class InputDim(NamedTuple): + input_name: str + dim: int + + +def _convert_to_int(val): + # Convert simple sympy Integers into concrete int + if val == sympy.oo: + return math.inf + if val == -sympy.oo: + return -math.inf + if isinstance(val, sympy.Integer): + return int(val) + raise RuntimeError( + "Export constraints cannot be non-integer expressions" + ) + + +def _convert_range_to_int(range: ValueRanges): + assert isinstance(range, ValueRanges) + min_val = _convert_to_int(range.lower) + max_val = _convert_to_int(range.upper) + return min_val, max_val + + +class 
_AddRuntimeAssertionsForInlineConstraintsPass(_ExportPassBaseDeprecatedDoNotUse): + def __init__( + self, + range_constraints: Dict[sympy.Symbol, ValueRanges], + ): + super().__init__() + self.range_constraints: Dict[sympy.Symbol, ValueRanges] = range_constraints + self._asserts_generated_unbacked_symbols: Set[sympy.Symbol] = set() + self.counter = 0 + + def _assert_range_constraint(self, proxy, lower, upper, assert_msg): + if lower > -math.inf: + self._insert_assert_async(operator.ge, proxy, lower, assert_msg) + + if upper < math.inf: + self._insert_assert_async(operator.le, proxy, upper, assert_msg) + + def _insert_assert_async(self, operator, lower, upper, assert_msg): + """ + Inserts assert_async call_function nodes in the graph. This function is + called **during** the interpreter-based pass. + """ + self.counter += 1 + cmp = super().call_operator(operator, (lower, upper), {}, self._create_dummy_node_metadata()) + cmp_tensor = super().call_operator(torch.ops.aten.scalar_tensor.default, (cmp,), {}, self._create_dummy_node_metadata()) + super().call_operator( + torch.ops.aten._assert_async.msg, + (cmp_tensor, assert_msg), + {}, + self._create_dummy_node_metadata(), + ) + + def call_operator(self, op, args, kwargs, meta) -> ProxyValue: + ret = super().call_operator(op, args, kwargs, meta) + if "val" not in meta: + return ret + + val = meta["val"] + + # In general, we may have to deal the case such as: ret[1].shape[0]. + # We need first find out what symbols require assertion, then we need to follow the path + # from ret to the symbol, construct the proxies along the way and construct the messages + # piece-wise at the same time. + # + # We use post-order traversal to collect all the proxies callbacks needed, construct + # the error message callbacks, and at the top-level traversal tree we execute all the callbacks. + # We need the callbacks because, in order to call the function to create a proxy for shape[0], we + # need the proxy for shape, which further requires the proxy for ret[1], etc. + def add_assertions(val): + call_backs: List[Callable] = [] + messages: List[str] = [] + if isinstance(val, (torch.SymInt, torch.SymFloat, torch.SymBool)): + symbol = val.node.expr + if symbol in self.existing_inline_assertions: + return call_backs, messages + if isinstance(symbol, sympy.Symbol) and free_unbacked_symbols(symbol): + if symbol in self._asserts_generated_unbacked_symbols: + return call_backs, messages + # We only care about unbacked symints for these inline + # constraints, which are prefixed with 'u' + constraint = self.range_constraints[symbol] + min_val, max_val = _convert_range_to_int(constraint) + assert_msg = f" is outside of inline constraint [{min_val}, {max_val}]." 
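A torch-free toy version of the deferred-callback pattern described in the comment block above: callbacks and their messages are collected while walking a nested value and only executed back at the top level. Nested tuples stand in for operator outputs, and `print` stands in for proxy construction; every name here is illustrative.

```python
from functools import partial
from typing import Callable, List, Tuple


def collect(value, path: str = "ret") -> Tuple[List[Callable], List[str]]:
    # Recursive walk: gather (callback, message) pairs without running them yet.
    callbacks: List[Callable] = []
    messages: List[str] = []
    if isinstance(value, int):
        callbacks.append(partial(print, f"check {path}"))
        messages.append(f"{path} must satisfy its range constraint")
    elif isinstance(value, tuple):
        for i, item in enumerate(value):
            cbs, msgs = collect(item, f"{path}[{i}]")
            callbacks.extend(cbs)
            messages.extend(msgs)
    return callbacks, messages


cbs, msgs = collect((3, (4, 5)))
for cb, msg in zip(cbs, msgs):  # executed only at the top level of the traversal
    cb()
    print(msg)
```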
+ call_backs.append( + partial(self._assert_range_constraint, lower=min_val, upper=max_val) + ) + messages.append(assert_msg) + self._asserts_generated_unbacked_symbols.add(symbol) + + elif isinstance(val, torch.Tensor): + for i, sym in enumerate(val.shape): + cbs, msgs = add_assertions(sym) + for cb, msg in zip(cbs, msgs): + def sym_size_cb(proxy, assert_msg, dim): + dim_proxy = super( + _AddRuntimeAssertionsForInlineConstraintsPass, + self + ).call_operator( + torch.ops.aten.sym_size.int, + (proxy, dim), + {}, + self._create_dummy_node_metadata(), + ) + cb(proxy=dim_proxy, assert_msg=assert_msg) + call_backs.append(partial(sym_size_cb, dim=i)) + messages.append(f".shape[{i}]" + msg) + return call_backs, messages + + callbacks, messages = add_assertions(val) + for cb, msg in zip(callbacks, messages): + cb(proxy=ret, assert_msg=f"{ret.node}" + msg) + return ret + + def call(self, graph_module): + self.existing_inline_assertions = _get_existing_inline_assertions( + graph_module, self.range_constraints + ) + + # Add runtime asserts for inline constraints + val = super().call(graph_module) + + # Sometimes this pass would return a wrong graph where we have mismatched + # node names in signature. Before we fix it, let's just skip it. + if self.counter == 0 and type(self) is _AddRuntimeAssertionsForInlineConstraintsPass: + return PassResult(graph_module, False) + + # Populate the stack trace with dummy vals to respect IR + for node in val.graph_module.graph.nodes: + if not node.meta.get("stack_trace", None): + node.meta["stack_trace"] = "".join(traceback.format_stack(limit=1)) + + return PassResult(val.graph_module, val.modified) + + +def _get_existing_inline_assertions( + graph_module: torch.fx.GraphModule, + range_constraints: Dict[sympy.Symbol, ValueRanges], +) -> Dict[sympy.Symbol, ValueRanges]: + existing_inline_assertions: Dict[sympy.Symbol, ValueRanges] = {} + + for module in graph_module.modules(): + if not isinstance(module, torch.fx.GraphModule): + continue + + # Find all the existing inline assertions. They will look something like: + # %_local_scalar_dense = call_function[target=torch.ops.aten._local_scalar_dense.default](args = (%arg1_1,), kwargs = {}) + # %ge = call_function[target=operator.ge](args = (%_local_scalar_dense, 0), kwargs = {}) + # %scalar_tensor = call_function[target=torch.ops.aten.scalar_tensor.default](args = (%ge,), kwargs = {}) + # %_assert_async = call_function[target=torch.ops.aten._assert_async.msg](args = (%scalar_tensor, "..."), kwargs = {}) + for node in module.graph.nodes: + if node.target != torch.ops.aten._assert_async.msg: + continue + + scalar_tensor_arg = node.args[0] + if not ( + scalar_tensor_arg.op == "call_function" and + scalar_tensor_arg.target == torch.ops.aten.scalar_tensor.default + ): + continue + + compare_arg = scalar_tensor_arg.args[0] + if not ( + compare_arg.op == "call_function" and + compare_arg.target in (operator.le, operator.ge) and + len(compare_arg.args) == 2 + ): + continue + + compare_op = compare_arg.target + maybe_symint_arg, compare_int = compare_arg.args + + # x >= 0 will sometimes be canonicalized to -x <= 0, so in some + # cases the operation before the comparison is to multiply by -1. 
We + # can undo the canonicalization here + if ( + maybe_symint_arg.op == "call_function" and + maybe_symint_arg.target == operator.mul and + maybe_symint_arg.args[0] == -1 + ): + maybe_symint_arg = maybe_symint_arg.args[1] + compare_op = operator.ge + compare_int = -1 * compare_int + + if not ( + "val" in maybe_symint_arg.meta and + isinstance(maybe_symint_arg.meta["val"], torch.SymInt) + ): + continue + + symint = maybe_symint_arg.meta["val"].node.expr + if not isinstance(symint, sympy.Symbol): + continue + + if symint not in range_constraints: + raise RuntimeError(f"Unable to find symint {symint} in {range_constraints}") + + found_range = existing_inline_assertions.get(symint, ValueRanges(-math.inf, math.inf)) + + if compare_arg.target == operator.le: + existing_inline_assertions[symint] = ValueRanges( + lower=found_range.lower, upper=compare_int + ) + elif compare_arg.target == operator.ge: + existing_inline_assertions[symint] = ValueRanges( + lower=compare_int, upper=found_range.upper + ) + + return existing_inline_assertions diff --git a/MLPY/Lib/site-packages/torch/_export/passes/collect_tracepoints_pass.py b/MLPY/Lib/site-packages/torch/_export/passes/collect_tracepoints_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..102bb87d75441317caa7fa4e0d0ef0ee5c89668c --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/passes/collect_tracepoints_pass.py @@ -0,0 +1,66 @@ +import operator + +import torch + +from torch.export.exported_program import ConstantArgument, TensorArgument +from torch.fx.passes.infra.pass_base import PassBase, PassResult + +__all__ = ["CollectTracepointsPass"] + + +class CollectTracepointsPass(PassBase): + """ + Performs constant folding and constant propagation. + """ + + def __init__(self, specs, sig) -> None: + super().__init__() + self.specs = specs + self.sig = sig + + def call(self, gm): + def get_arg_spec(arg): + if isinstance(arg, torch.fx.Node): + if isinstance(arg.meta.get("val"), torch.Tensor): + return TensorArgument(name=arg.name) + else: + raise AssertionError( + "Symint input is not implemented yet for submodule call signature." 
+ ) + else: + return ConstantArgument(value=arg) + + for module in gm.modules(): + if not isinstance(module, torch.fx.GraphModule): + continue + for node in module.graph.nodes: + if node.op != "call_function": + continue + if node.target == torch.ops.higher_order._export_tracepoint: + for i, arg in enumerate(node.args): + kind = node.kwargs["kind"] + if kind == "module_call_inputs": + self.specs[node.kwargs["path"]].inputs.append( + get_arg_spec(arg) + ) + elif kind == "module_call_outputs": + self.specs[node.kwargs["path"]].outputs.append( + get_arg_spec(arg) + ) + else: + raise AssertionError(f"Unknown tracepoint kind: {kind}") + if isinstance(arg, torch.fx.Node): + for user in node.users: + assert user.op == "call_function" + assert user.target == operator.getitem + assert isinstance(user.args[1], int) + if user.args[1] == i: + user.replace_all_uses_with(arg) + self.sig.replace_all_uses(user.name, arg.name) + break + users = list(node.users) + for user in users: + assert len(user.users) == 0 + gm.graph.erase_node(user) + gm.graph.erase_node(node) + return PassResult(gm, True) diff --git a/MLPY/Lib/site-packages/torch/_export/passes/functionalize_side_effectful_ops_pass.py b/MLPY/Lib/site-packages/torch/_export/passes/functionalize_side_effectful_ops_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..1e5178bd6e3a7e6812a95409160728753a49a7c8 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/passes/functionalize_side_effectful_ops_pass.py @@ -0,0 +1,94 @@ +import copy +from typing import Dict, Optional, Tuple, List + +import torch +from torch._export.pass_base import _ExportPassBaseDeprecatedDoNotUse, PassResult, Argument +from torch._export.pass_infra.node_metadata import NodeMetadata +from torch._export.pass_infra.proxy_value import ProxyValue +from torch._ops import OpOverload + +aten = torch.ops.aten + +_NON_FUNCTIONAL_TO_FUNCTIONAL_SIDE_EFFECTFUL_FUNCS: Dict[OpOverload, OpOverload] = { + aten.sym_constrain_range.default: aten._functional_sym_constrain_range, + aten._assert_async.msg: aten._functional_assert_async.msg, +} + + +class _FunctionalizeSideEffectfulOpsPass(_ExportPassBaseDeprecatedDoNotUse): + """ + Functionalize ops with side effect in graph module by replacing the op with + functional version of it. A new dependency token (`dep_token`) will be + created and propagated through functional ops to output. + For example: + ``` + def f(x): + sym_constrain_range(x.shape[0], min=1, max=3) + return x.add(3) + ``` + Will be transformed to: + ``` + def f(x): + dep_token0 = _make_dep_token() + dep_token1 = _functional_sym_constrain_range( + x.shape[0], min=1, max=3, dep_token=dep_token0 + ) + + return x.add(3), dep_token1 + ``` + """ + + def __init__(self) -> None: + super().__init__() + self._dep_token: Optional[ProxyValue] = None + self._next_dep_token_index: Optional[int] = None + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + # Early return if no non-functional assertions. 
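The docstring above explains the `dep_token` chaining; the following torch-free sketch shows why threading such a token keeps side-effectful checks alive and correctly ordered in a purely functional pipeline. `functional_assert` is a made-up stand-in, not a torch operator.

```python
def functional_assert(cond: bool, msg: str, dep_token: int) -> int:
    # Consumes the previous token and returns a new one, so the check cannot be
    # dropped or reordered by a purely functional optimizer.
    if not cond:
        raise AssertionError(msg)
    return dep_token + 1


def f(x: int):
    token = 0                                                        # dep_token0
    token = functional_assert(1 <= x <= 3, "x out of range", token)  # dep_token1
    return x + 3, token                                              # token is part of the output


print(f(2))  # (5, 1)
```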
+ if not any( + n.target in _NON_FUNCTIONAL_TO_FUNCTIONAL_SIDE_EFFECTFUL_FUNCS + for n in graph_module.graph.nodes + ): + return PassResult(graph_module=graph_module, modified=False) + + gm = copy.deepcopy(graph_module) + self._dep_token = None + self._next_dep_token_index = None + return super().call(gm) + + def call_operator( + self, + op: OpOverload, + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + meta: NodeMetadata, + ) -> ProxyValue: + if op not in _NON_FUNCTIONAL_TO_FUNCTIONAL_SIDE_EFFECTFUL_FUNCS: + return super().call_operator(op, args, kwargs, meta) + + if self._dep_token is None: + self._dep_token = super().call_operator( + aten._make_dep_token, + args=(), + kwargs={}, + meta=self._create_dummy_node_metadata(), + ) + self._dep_token.node.name = "dep_token0" + self._next_dep_token_index = 1 + + self._dep_token = super().call_operator( + _NON_FUNCTIONAL_TO_FUNCTIONAL_SIDE_EFFECTFUL_FUNCS[op], + args=args, + kwargs={**kwargs, "dep_token": self._dep_token}, + meta=meta, + ) + assert self._next_dep_token_index is not None + self._dep_token.node.name = f"dep_token{self._next_dep_token_index}" + self._next_dep_token_index += 1 + + return self._dep_token + + def output(self, results: List[Argument], meta: NodeMetadata) -> ProxyValue: + assert self._dep_token is not None + + return super().output(results=(*results, self._dep_token), meta=meta) # type: ignore[arg-type] diff --git a/MLPY/Lib/site-packages/torch/_export/passes/lift_constants_pass.py b/MLPY/Lib/site-packages/torch/_export/passes/lift_constants_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..253746402d79f9339183ca966fdb75aef1c5c683 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/passes/lift_constants_pass.py @@ -0,0 +1,248 @@ +import collections +from typing import Any, Dict, Union + +import torch +from torch._export.verifier import SpecViolationError +from torch._guards import detect_fake_mode +from torch.export.exported_program import ( + ArgumentSpec, + CustomObjArgument, + ExportGraphSignature, + InputKind, + InputSpec, + TensorArgument, +) + + +class ConstantAttrMap(collections.abc.MutableMapping): + """A mapping class that understands how to use module constants (tensors and + ScriptObjects) as keys. We store tensors normally, but ScriptObjects are + stored by hash, because different torch.ScriptObjects can point to the same + underlying value (but we guarantee that they will `hash()` to the same value + if that's the case). + """ + + def __init__(self): + # Underlying dict that we use to implement this mapping. + self._constant_attrs: Dict[Union[int, torch.Tensor], Any] = {} + # Map from the hash(ScriptObject) to the ScriptObject itself. Used for + # APIs like `__iter__` that should look like they're returning the + # original ScriptObjects. 
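A simplified sketch of the store-by-hash trick described in the `ConstantAttrMap` docstring above, using a stand-in class instead of `torch.ScriptObject`; two wrappers aliasing the same underlying value hash to the same key, so they deduplicate to one entry. All names here are hypothetical.

```python
class FakeScriptObject:
    """Stand-in for a script object wrapper; aliases share the payload's hash."""

    def __init__(self, payload: str) -> None:
        self.payload = payload

    def __hash__(self) -> int:
        return hash(self.payload)


attrs = {}                                # maps hash(obj) -> fully qualified name
a = FakeScriptObject("custom_obj")
b = FakeScriptObject("custom_obj")        # different wrapper, same underlying value
attrs[hash(a)] = "submodule.attr_name"
print(attrs[hash(b)])                     # 'submodule.attr_name' -> deduplicated
```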
+ self._script_object_map: Dict[int, torch.ScriptObject] = {} + + def __getitem__(self, key: Union[torch.Tensor, torch.ScriptObject]) -> Any: + real_key = hash(key) if isinstance(key, torch.ScriptObject) else key + assert isinstance(real_key, (int, torch.Tensor)) + return self._constant_attrs[real_key] + + def __setitem__( + self, key: Union[torch.Tensor, torch.ScriptObject], value: Any + ) -> None: + if isinstance(key, torch.ScriptObject): + self._constant_attrs[hash(key)] = value + self._script_object_map[hash(key)] = key + elif isinstance(key, torch.Tensor): + self._constant_attrs[key] = value + else: + raise TypeError( + f"Expected key to be a tensor or ScriptObject, got {type(key)}" + ) + + def __delitem__(self, key): + real_key = hash(key) if isinstance(key, torch.ScriptObject) else key + + del self._constant_attrs[real_key] + + def __iter__(self): + for key in self._constant_attrs: + if isinstance(key, int): + yield self._script_object_map[key] + else: + yield key + + def __len__(self): + return len(self._constant_attrs) + + def __contains__(self, key: object) -> bool: + real_key = hash(key) if isinstance(key, torch.ScriptObject) else key + return real_key in self._constant_attrs + + +def get_constant_fqn(node: torch.fx.Node, constant_name: str) -> str: + # The FQN of the constant tensor in the state dict should + # correspond to the module where the constant tensor was + # originally used. + parent_fqn = list(node.meta["nn_module_stack"].values())[-1][0] + if len(parent_fqn) > 0: + return f"{parent_fqn}.{constant_name}" + else: + return constant_name + + +def lift_constants_pass( + gm: torch.fx.GraphModule, + graph_signature: ExportGraphSignature, + constant_attrs: ConstantAttrMap, +) -> Dict[str, Union[torch.Tensor, torch._C.ScriptObject]]: + """ + Takes a graph module, graph signature, and modifies them implace to lift any + constants (tensors or custom classes) as inputs to the graph. Returns a + dictionary of names to constants. + + Arguments: + gm (torch.fx.GraphModule): The graph module containing the graph and constants to lift. + graph_signature (ExportGraphSignature): This graph signature will be + mutated to add additional CONSTANT_TENSOR and CUSTOM_OBJ inputs. + constant_attrs (ConstantAttr): A mapping from a constant value to its + fully-qualified path in `gm`. This is used to maintain consistent + location of constants between the original module and the exported + version. + + Returns: + A dictionary of fqn => constant value. + """ + all_constants: Dict[str, Union[torch.Tensor, torch._C.ScriptObject]] = {} + + inputs = graph_signature.input_specs + num_custom_obj = sum( + input_specs.kind == InputKind.CUSTOM_OBJ for input_specs in inputs + ) + num_tensor_constants = sum( + input_specs.kind == InputKind.CONSTANT_TENSOR for input_specs in inputs + ) + + fake_mode = detect_fake_mode( + tuple(node.meta["val"] for node in gm.graph.nodes if node.op == "placeholder") + ) + + first_user_input_loc, first_user_input = 0, None + for node in gm.graph.nodes: + if node.op == "placeholder" and node.name in graph_signature.user_inputs: + first_user_input = node + break + first_user_input_loc += 1 + + lifted_objs = ConstantAttrMap() + for node in gm.graph.nodes: + if node.op == "get_attr": + constant_val = getattr(gm, node.target) + if constant_val in lifted_objs: + # We already lifted this constant elsewhere. Just rewrite uses + # of this get_attr to point to the already-existing placeholder + # node. 
+ const_placeholder_node = lifted_objs[constant_val] + node.replace_all_uses_with(const_placeholder_node) + gm.graph.erase_node(node) + continue + + # For ScriptObject and Tensor constants: + # First check if the constant was an attribute on some module by + # consulting `constant_attrs` map. If it is, use the fqn that keeps + # its location consistent with the eager module. + # + # If it's not in the `constant_attrs` map, that means it's an inline + # constant (e.g. x + torch.tensor(0)), and thus did not have a + # specific location in the eager module. In that case, just generate + # some name and attach it to the module in which it was used. + if isinstance(constant_val, torch.ScriptObject): + constant_kind = InputKind.CUSTOM_OBJ + constant_fqn = constant_attrs.get(constant_val) + if constant_fqn is not None: + _, _, constant_name = constant_fqn.rpartition(".") + else: + constant_name = f"_lifted_custom_obj{num_custom_obj}" + constant_fqn = get_constant_fqn(node, constant_name) + num_custom_obj += 1 + elif isinstance(constant_val, torch.Tensor): + constant_kind = InputKind.CONSTANT_TENSOR + constant_fqn = constant_attrs.get(constant_val) + if constant_fqn is not None: + _, _, constant_name = constant_fqn.rpartition(".") + else: + constant_name = f"_lifted_tensor_constant{num_tensor_constants}" + constant_fqn = get_constant_fqn(node, constant_name) + num_tensor_constants += 1 + elif isinstance(constant_val, torch.fx.GraphModule): + continue + elif "LoweredBackendModule" in type(constant_val).__name__: + continue + else: + raise SpecViolationError( + f"getattr node {node} referencing unsupported type {type(constant_val)}" + ) + + with gm.graph.inserting_before(first_user_input): + # Insert the constant node before the first user input + const_placeholder_node = gm.graph.placeholder(constant_name) + # match target name with its node name in case there is name collision + # and suffix is added to node name in fx + const_placeholder_node.target = const_placeholder_node.name + + for k, v in node.meta.items(): + const_placeholder_node.meta[k] = v + + input_spec_arg: ArgumentSpec + if isinstance(constant_val, torch.Tensor): + if fake_mode is not None: + const_placeholder_node.meta["val"] = fake_mode.from_tensor( + constant_val, static_shapes=True + ) + const_placeholder_node.meta["val"].constant = constant_val + else: + const_placeholder_node.meta["val"] = constant_val + input_spec_arg = TensorArgument(name=const_placeholder_node.name) + elif isinstance(constant_val, torch._C.ScriptObject): + class_fqn = constant_val._type().qualified_name() # type: ignore[attr-defined] + const_placeholder_node.meta["val"] = CustomObjArgument( + constant_fqn, class_fqn + ) + input_spec_arg = CustomObjArgument( + name=const_placeholder_node.name, class_fqn=class_fqn + ) + else: + raise SpecViolationError( + f"tried to lift unsupported type {type(constant_val)} from node {node.format_node()}" + ) + + lifted_objs[constant_val] = const_placeholder_node + node.replace_all_uses_with(const_placeholder_node) + gm.graph.erase_node(node) + + # Add the constant as a buffer to the graph signature + graph_signature.input_specs.insert( + first_user_input_loc, + InputSpec( + kind=constant_kind, + arg=input_spec_arg, + target=constant_fqn, + ), + ) + all_constants[constant_fqn] = constant_val + first_user_input_loc += 1 + + return all_constants + + +def rewrite_script_object_meta( + gm: torch.fx.GraphModule, +) -> Dict[str, Union[torch.Tensor, torch.ScriptObject]]: + """When tracing, we produce a graph with an actual 
ScriptObject in the + meta["val"]. Eventually we want to change this behavior, when FakeMode infra + for ScriptObjects lands. + + For now, we rewrie meta["val"] to be a placeholder CustomObjArgument + """ + constants: Dict[str, Union[torch.Tensor, torch._C.ScriptObject]] = {} + for node in gm.graph.nodes: + if "val" not in node.meta or not isinstance( + node.meta["val"], torch.ScriptObject + ): + continue + + old_meta = node.meta["val"] + class_fqn = old_meta._type().qualified_name() # type: ignore[attr-defined] + new_meta = CustomObjArgument(node.name, class_fqn) + constants[node.name] = old_meta + node.meta["val"] = new_meta + + return constants diff --git a/MLPY/Lib/site-packages/torch/_export/passes/remove_runtime_assertions.py b/MLPY/Lib/site-packages/torch/_export/passes/remove_runtime_assertions.py new file mode 100644 index 0000000000000000000000000000000000000000..350e9893991f577c28f97a5c6977019f943d61f1 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/passes/remove_runtime_assertions.py @@ -0,0 +1,26 @@ +import torch +from torch.fx.passes.infra.pass_base import PassBase, PassResult + + +class _RemoveRuntimeAssertionsPass(PassBase): + """ + Remove runtime assertions inserted by the + _AddRuntimeAssertionsForInlineConstraintsPass. + """ + + def call(self, graph_module) -> PassResult: + modified = False + for module in graph_module.modules(): + if not isinstance(module, torch.fx.GraphModule): + continue + for node in module.graph.nodes: + if node.target == torch.ops.aten._assert_async.msg: + assert_async_node = node + if len(assert_async_node.users) > 0: + continue + module.graph.erase_node(assert_async_node) + # the upstream scalar_tensor <- {le, ge} <- sym_size + # linear chain of nodes of nodes is removed by the + # downstream dead code elimination + modified = True + return PassResult(graph_module, modified) diff --git a/MLPY/Lib/site-packages/torch/_export/passes/replace_set_grad_with_hop_pass.py b/MLPY/Lib/site-packages/torch/_export/passes/replace_set_grad_with_hop_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..ba62a622ef499b075cd1f339e956ed1a522ecbc9 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/passes/replace_set_grad_with_hop_pass.py @@ -0,0 +1,141 @@ +import torch +from torch._higher_order_ops.wrap import wrap_with_set_grad_enabled + +from ..utils import ( + node_inline_, + node_replace_, + nodes_filter, + nodes_first, + nodes_map, + sequential_split, +) + + +def _is_set_grad_enabled_node(node: torch.fx.Node): + return ( + node + and node.op == "call_function" + and node.target == torch._C._set_grad_enabled + ) + + +def _is_set_grad_enabled_sub_mod(node: torch.fx.Node, omit_if_same_with_ambient=False): + if node.op == "call_module": + assert isinstance(node.target, str) + subgm = getattr(node.graph.owning_module, node.target) + first_non_ph = nodes_first( + subgm.graph.nodes, lambda node: node.op != "placeholder" + ) + if ( + first_non_ph + and first_non_ph.op == "call_function" + and first_non_ph.target == torch._C._set_grad_enabled + ): + return ( + first_non_ph.args[0] != torch.is_grad_enabled() + if omit_if_same_with_ambient + else True + ) + return False + + +def _replace_with_hop(node: torch.fx.Node): + assert node.op == "call_module" + graph: torch.fx.Graph = node.graph + gm: torch.fx.GraphModule = graph.owning_module + assert isinstance(node.target, str) + sub_gm = getattr(gm, node.target) + sub_graph = sub_gm.graph + set_grad_nodes = nodes_filter(sub_graph.nodes, _is_set_grad_enabled_node) + if len(set_grad_nodes) 
> 0: + assert len(set_grad_nodes) == 1 + set_grad_node = set_grad_nodes[0] + enable_grad_val = set_grad_node.args[0] + with graph.inserting_before(node): + get_attr_node = graph.get_attr(node.target) + output_node = next(iter(reversed(sub_gm.graph.nodes)), None) + if output_node is not None: + assert len(output_node.args) == 1 + output_args = output_node.args[0] + if isinstance(output_args, (tuple, list)): + call_func_node = graph.call_function( + wrap_with_set_grad_enabled, + (enable_grad_val, get_attr_node, *node.args), + {}, + ) + # Create the metadata + call_func_node.meta["val"] = tuple( + arg.meta["val"] for arg in output_args + ) + node_replace_(node, call_func_node, delete_old=True) + + # Rename the name of getitem nodes to the actual name of its contents + # for passing verifier and better readability, also propagate metadata + for get_item_node in call_func_node.users.keys(): + idx: int = get_item_node.args[1] + output_node = output_args[idx] + get_item_node._rename(output_node.name) + get_item_node.meta = output_node.meta + pass + + elif isinstance(output_args, torch.fx.Node): + call_func_node = graph.create_node( + "call_function", + wrap_with_set_grad_enabled, + (enable_grad_val, get_attr_node, *node.args), + {}, + output_args.name, + ) + call_func_node.meta = output_args.meta + node_replace_(node, call_func_node, delete_old=True) + else: + raise NotImplementedError( + f"repalce_set_grad_with_hop_pass doesnt' support output type {type(output_args)}" + ) + else: + raise NotImplementedError( + "Cannot replace a call_module with a hop if it has no output. This module will gets DCEed." + ) + sub_graph.erase_node(set_grad_node) + + +def _remove_set_grad_and_inline(node: torch.fx.Node): + assert node.op == "call_module" + graph: torch.fx.Graph = node.graph + gm: torch.fx.GraphModule = graph.owning_module + assert isinstance(node.target, str) + sub_gm = getattr(gm, node.target) + sub_graph = sub_gm.graph + nodes_map( + sub_graph.nodes, + lambda n: sub_graph.erase_node(n) if _is_set_grad_enabled_node(n) else n, + ) + node_inline_(node) + + +def replace_set_grad_with_hop_pass(gm: torch.fx.GraphModule): + # If there is no set_grad_enabled node, return the original graph module + need_replacing = False + for node in gm.graph.nodes: + if _is_set_grad_enabled_node(node): + need_replacing = True + + if not need_replacing: + return gm + + new_gm = sequential_split(gm, _is_set_grad_enabled_node) + + def _maybe_inline_or_replace_with_hop(node: torch.fx.Node): + if _is_set_grad_enabled_sub_mod(node, omit_if_same_with_ambient=True): + _replace_with_hop(node) + else: + _remove_set_grad_and_inline(node) + + nodes_map( + list(new_gm.graph.nodes), + lambda node: _maybe_inline_or_replace_with_hop(node) + if node.op == "call_module" + else node, + ) + new_gm.graph.lint() + return new_gm diff --git a/MLPY/Lib/site-packages/torch/_export/passes/replace_sym_size_ops_pass.py b/MLPY/Lib/site-packages/torch/_export/passes/replace_sym_size_ops_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..ef419ccf13ec12dd0f8356703a86c86541ff7649 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/passes/replace_sym_size_ops_pass.py @@ -0,0 +1,18 @@ +from typing import Dict + +import torch + +replacements: Dict[torch._ops.OpOverloadPacket, torch._ops.OpOverload] = { + torch.ops.aten.sym_size: torch.ops.aten.sym_size.int, + torch.ops.aten.sym_stride: torch.ops.aten.sym_stride.int, + torch.ops.aten.sym_numel: torch.ops.aten.sym_numel.default, +} + + +def _replace_sym_size_ops_pass(gm: 
torch.fx.GraphModule): + for module in gm.modules(): + if not isinstance(module, torch.fx.GraphModule): + continue + for node in module.graph.nodes: + if node.target in replacements: + node.target = replacements[node.target] diff --git a/MLPY/Lib/site-packages/torch/_export/passes/replace_view_ops_with_view_copy_ops_pass.py b/MLPY/Lib/site-packages/torch/_export/passes/replace_view_ops_with_view_copy_ops_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..e8d7ef8f62ffaf0c02e0f5fdc2de8f742b71b80f --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/passes/replace_view_ops_with_view_copy_ops_pass.py @@ -0,0 +1,71 @@ +from typing import Dict, Optional, Set + +import torch +from torch._ops import OpOverload, OpOverloadPacket, HigherOrderOperator +from torch._export.error import InternalError +from torch._export.pass_base import _ExportPassBaseDeprecatedDoNotUse + + +__all__ = ["ReplaceViewOpsWithViewCopyOpsPass"] + + +_NON_FUNCTIONAL_OPS_TO_FUNCTIONAL_OPS: Dict[OpOverload, OpOverload] = { + torch.ops.aten._unsafe_view.default: torch.ops.aten.view_copy.default, +} + +# TODO (tmanlaibaatar) remove this after https://github.com/pytorch/pytorch/pull/100749 +_BLACK_LISTED_OPS: Set[OpOverloadPacket] = { + torch.ops.aten.sym_size, + torch.ops.aten.sym_stride, + torch.ops.aten.sym_numel, +} + +def is_view_op(schema: torch._C.FunctionSchema) -> bool: + if len(schema.arguments) == 0: + return False + alias_info = schema.arguments[0].alias_info + return (alias_info is not None) and (not alias_info.is_write) + + +def get_view_copy_of_view_op(schema: torch._C.FunctionSchema) -> Optional[OpOverload]: + if is_view_op(schema) and schema.name.startswith("aten::"): + view_op_name = schema.name.split("::")[1] + view_op_overload = ( + schema.overload_name + if schema.overload_name != "" + else "default" + ) + view_copy_op_name = view_op_name + "_copy" + if not hasattr(torch.ops.aten, view_copy_op_name): + raise InternalError(f"{schema.name} is missing a view_copy variant") + + view_copy_op_overload_packet = getattr(torch.ops.aten, view_copy_op_name) + + if not hasattr(view_copy_op_overload_packet, view_op_overload): + raise InternalError(f"{schema.name} is missing a view_copy variant") + + return getattr(view_copy_op_overload_packet, view_op_overload) + + return None + + +class ReplaceViewOpsWithViewCopyOpsPass(_ExportPassBaseDeprecatedDoNotUse): + """ + Our backend expects pure functional operators. For efficiency + purposes, we keep view ops around while functionalizing the exported + program. This pass replaces view ops with view copy ops for backends that + need AOT memory planning. 
+ """ + def call_operator(self, op, args, kwargs, meta): + if op in _NON_FUNCTIONAL_OPS_TO_FUNCTIONAL_OPS: + return super().call_operator( + (_NON_FUNCTIONAL_OPS_TO_FUNCTIONAL_OPS[op]), args, kwargs, meta + ) + + if op in _BLACK_LISTED_OPS or isinstance(op, HigherOrderOperator): + return super().call_operator(op, args, kwargs, meta) + + if view_copy_op := get_view_copy_of_view_op(op._schema): + return super().call_operator(view_copy_op, args, kwargs, meta) + + return super().call_operator(op, args, kwargs, meta) diff --git a/MLPY/Lib/site-packages/torch/_export/serde/__init__.py b/MLPY/Lib/site-packages/torch/_export/serde/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MLPY/Lib/site-packages/torch/_export/serde/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/serde/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..04145db13a2851ad1182c4c119d5e94fd451b0f6 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/serde/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/serde/__pycache__/schema.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/serde/__pycache__/schema.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..62cb6611cdf644620503aa2d39bd454aaece2d74 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/serde/__pycache__/schema.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/serde/__pycache__/schema_check.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/serde/__pycache__/schema_check.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..59653df7695d0c15ed021f06f92350d9efa2c9b8 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/serde/__pycache__/schema_check.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/serde/__pycache__/serialize.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/serde/__pycache__/serialize.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2f067437eb665beed578023e9d73142a27727d4d Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/serde/__pycache__/serialize.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/serde/__pycache__/union.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/serde/__pycache__/union.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3891a74a84760fef9b69a38715b5ab32bb35f60e Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/serde/__pycache__/union.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/serde/__pycache__/upgrade.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_export/serde/__pycache__/upgrade.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..63055613e48eeb3cd0b0cf704f96116864aff249 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_export/serde/__pycache__/upgrade.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_export/serde/schema.py b/MLPY/Lib/site-packages/torch/_export/serde/schema.py new file mode 100644 index 0000000000000000000000000000000000000000..c67046d551664cc8c04bf2f02eafea2e4076a0d6 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/serde/schema.py @@ -0,0 +1,346 @@ +# NOTE: This is a placeholder for iterating on export serialization schema design. 
+# Anything is subject to change and no guarantee is provided at this point.
+
+from dataclasses import dataclass, field
+from enum import IntEnum
+from typing import Dict, List, Optional, Tuple
+
+from torch._export.serde.union import _Union
+
+# NOTE: Please update this value if any modifications are made to the schema
+SCHEMA_VERSION = (5, 1)
+TREESPEC_VERSION = 1
+
+
+class ScalarType(IntEnum):
+    UNKNOWN = 0
+    BYTE = 1
+    CHAR = 2
+    SHORT = 3
+    INT = 4
+    LONG = 5
+    HALF = 6
+    FLOAT = 7
+    DOUBLE = 8
+    COMPLEXHALF = 9
+    COMPLEXFLOAT = 10
+    COMPLEXDOUBLE = 11
+    BOOL = 12
+    BFLOAT16 = 13
+
+
+class Layout(IntEnum):
+    Unknown = 0
+    SparseCoo = 1
+    SparseCsr = 2
+    SparseCsc = 3
+    SparseBsr = 4
+    SparseBsc = 5
+    _mkldnn = 6
+    Strided = 7
+
+
+class MemoryFormat(IntEnum):
+    Unknown = 0
+    ContiguousFormat = 1
+    ChannelsLast = 2
+    ChannelsLast3d = 3
+    PreserveFormat = 4
+
+
+@dataclass
+class Device:
+    type: str
+    index: Optional[int] = None
+
+
+@dataclass(repr=False)
+class SymExprHint(_Union):
+    as_int: int
+    as_float: float
+    as_bool: bool
+
+
+# This is for storing the symbolic expressions behind symints/symfloats/symbools
+# For example, we can get something like
+# SymExpr(expr_str="s0 + s1", hint=SymExprHint(as_int=4))
+# if we also have the hint that s0 and s1 are both 2.
+@dataclass
+class SymExpr:
+    expr_str: str
+    hint: Optional[SymExprHint] = None
+
+
+@dataclass(repr=False)
+class SymInt(_Union):
+    as_expr: SymExpr
+    as_int: int
+
+
+@dataclass(repr=False)
+class SymBool(_Union):
+    as_expr: SymExpr
+    as_bool: bool
+
+
+@dataclass
+class TensorMeta:
+    dtype: ScalarType
+    sizes: List[SymInt]
+    requires_grad: bool
+    device: Device
+    strides: List[SymInt]
+    storage_offset: SymInt
+    layout: Layout
+
+
+# In most cases we will use the "as_name" field to store arguments which are
+# SymInts.
+# The "as_int" field is used in the case where we have a list containing a mix
+# of SymInt and ints (ex. [1, s0, ...]). We will serialize this type of list to
+# be List[SymIntArgument] and map the SymInts to the "as_name" field, and ints
+# to the "as_int" field.
+@dataclass(repr=False)
+class SymIntArgument(_Union):
+    as_name: str
+    as_int: int
+
+
+# In most cases we will use the "as_name" field to store arguments which are
+# SymBools.
+# The "as_bool" field is used in the case where we have a list containing a mix
+# of SymBool and bools (ex. [True, i0, ...]). We will serialize this type of list to
+# be List[SymBoolArgument] and map the SymBools to the "as_name" field, and bools
+# to the "as_bool" field.
+@dataclass(repr=False)
+class SymBoolArgument(_Union):
+    as_name: str
+    as_bool: bool
+
+
+@dataclass
+class TensorArgument:
+    name: str
+
+
+# This is used for storing the contents of a list which contains optional tensors
+# (Tensor?[], ex. [Tensor, None, ...]), where the list will be serialized to the
+# type List[OptionalTensorArgument], with tensor values serialized to the
+# "as_tensor" field, and None values serialized to the "as_none" field.
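+# For example, a Tensor?[] argument holding [t0, None, t1] is represented by three
+# OptionalTensorArgument entries: as_tensor="t0", as_none=(), as_tensor="t1"
+# (the "as_tensor" field stores the tensor's name, not the tensor itself).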
+@dataclass(repr=False) +class OptionalTensorArgument(_Union): + as_tensor: str + as_none: Tuple[()] + + +@dataclass +class GraphArgument: + name: str + graph: 'Graph' + + +@dataclass +class CustomObjArgument: + name: str + class_fqn: str + + +# This is actually a union type +@dataclass(repr=False) +class Argument(_Union): + as_none: Tuple[()] + as_tensor: TensorArgument + as_tensors: List[TensorArgument] + as_int: int + as_ints: List[int] + as_float: float + as_floats: List[float] + as_string: str + as_strings: List[str] + as_sym_int: SymIntArgument + as_sym_ints: List[SymIntArgument] + as_scalar_type: ScalarType + as_memory_format: MemoryFormat + as_layout: Layout + as_device: Device + as_bool: bool + as_bools: List[bool] + as_sym_bool: SymBoolArgument + as_sym_bools: List[SymBoolArgument] + as_graph: GraphArgument + as_optional_tensors: List[OptionalTensorArgument] + as_custom_obj: CustomObjArgument + as_operator: str + + +@dataclass +class NamedArgument: + # Argument name from the operator schema + name: str + arg: Argument + + +@dataclass +class Node: + target: str + inputs: List[NamedArgument] + outputs: List[Argument] + metadata: Dict[str, str] + + +@dataclass +class Graph: + inputs: List[Argument] + outputs: List[Argument] + nodes: List[Node] + tensor_values: Dict[str, TensorMeta] + sym_int_values: Dict[str, SymInt] + sym_bool_values: Dict[str, SymBool] + # This is for deserializing the submodule graphs from higher order ops + # (ex. cond, map) where single tensor returns will just return a single + # tensor, rather than following export schema and returning a singleton + # list. + is_single_tensor_return: bool = False + custom_obj_values: Dict[str, CustomObjArgument] = field(default_factory=dict) + + +@dataclass +class UserInputSpec: + # Actually, only tensors and SymInts are allowed here + arg: Argument + + +@dataclass +class InputToParameterSpec: + arg: TensorArgument + parameter_name: str + + +@dataclass +class InputToBufferSpec: + arg: TensorArgument + buffer_name: str + persistent: bool + + + +@dataclass +class InputToTensorConstantSpec: + arg: TensorArgument + tensor_constant_name: str + + +@dataclass +class InputToCustomObjSpec: + arg: CustomObjArgument + custom_obj_name: str + + +@dataclass(repr=False) +class InputSpec(_Union): + user_input: UserInputSpec + parameter: InputToParameterSpec + buffer: InputToBufferSpec + tensor_constant: InputToTensorConstantSpec + custom_obj: InputToCustomObjSpec + + +@dataclass +class UserOutputSpec: + arg: Argument + + +@dataclass +class LossOutputSpec: + arg: TensorArgument + + +@dataclass +class BufferMutationSpec: + arg: TensorArgument + buffer_name: str + + +@dataclass +class GradientToParameterSpec: + arg: TensorArgument + parameter_name: str + + +@dataclass +class GradientToUserInputSpec: + arg: TensorArgument + user_input_name: str + + +@dataclass +class UserInputMutationSpec: + arg: TensorArgument + user_input_name: str + + +@dataclass(repr=False) +class OutputSpec(_Union): + user_output: UserOutputSpec + loss_output: LossOutputSpec + buffer_mutation: BufferMutationSpec + gradient_to_parameter: GradientToParameterSpec + gradient_to_user_input: GradientToUserInputSpec + user_input_mutation: UserInputMutationSpec + + +@dataclass +class GraphSignature: + input_specs: List[InputSpec] + output_specs: List[OutputSpec] + + +@dataclass +class RangeConstraint: + min_val: int + max_val: int + + +@dataclass +class ModuleCallSignature: + inputs: List[Argument] + outputs: List[Argument] + + # These are serialized by calling 
pytree.treespec_loads + # And deserialized by calling pytree.treespec_dumps + in_spec: str + out_spec: str + + +@dataclass +class ModuleCallEntry: + fqn: str + signature: Optional[ModuleCallSignature] = None + + +@dataclass +class GraphModule: + graph: Graph + signature: GraphSignature + # This is used for unflattening, by tracking the calling structure of all of + # the modules in order to unflatten the modules back to the eager calling + # conventions. + module_call_graph: List[ModuleCallEntry] + + +# Invariant: Every time a change is made to the schema, one of the versions +# should be upadted. +@dataclass +class SchemaVersion: + major: int # Major version number is bumped every time a breaking change is made. + minor: int # Minor version number is bumped when a compatible change is made. + + +@dataclass +class ExportedProgram: + graph_module: GraphModule + # Key is the opset namespace (ex. aten), and value is the version number + opset_version: Dict[str, int] + range_constraints: Dict[str, RangeConstraint] + schema_version: SchemaVersion + dialect: str diff --git a/MLPY/Lib/site-packages/torch/_export/serde/schema.yaml b/MLPY/Lib/site-packages/torch/_export/serde/schema.yaml new file mode 100644 index 0000000000000000000000000000000000000000..23e12619579ce9fe18c298d4d21160b69f2ab33b --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/serde/schema.yaml @@ -0,0 +1,389 @@ +# @generated by update_schema.py +# checksum<<4c9986f3aba283b1746995fff8fe7005b370c7e288adec65c03030349a4bab60>> +Argument: + kind: union + fields: + as_none: + type: Tuple[()] + as_tensor: + type: TensorArgument + as_tensors: + type: List[TensorArgument] + as_int: + type: int + as_ints: + type: List[int] + as_float: + type: float + as_floats: + type: List[float] + as_string: + type: str + as_strings: + type: List[str] + as_sym_int: + type: SymIntArgument + as_sym_ints: + type: List[SymIntArgument] + as_scalar_type: + type: ScalarType + as_memory_format: + type: MemoryFormat + as_layout: + type: Layout + as_device: + type: Device + as_bool: + type: bool + as_bools: + type: List[bool] + as_sym_bool: + type: SymBoolArgument + as_sym_bools: + type: List[SymBoolArgument] + as_graph: + type: GraphArgument + as_optional_tensors: + type: List[OptionalTensorArgument] + as_custom_obj: + type: CustomObjArgument + as_operator: + type: str +BufferMutationSpec: + kind: struct + fields: + arg: + type: TensorArgument + buffer_name: + type: str +CustomObjArgument: + kind: struct + fields: + name: + type: str + class_fqn: + type: str +Device: + kind: struct + fields: + type: + type: str + index: + type: Optional[int] + default: None +ExportedProgram: + kind: struct + fields: + graph_module: + type: GraphModule + opset_version: + type: Dict[str, int] + range_constraints: + type: Dict[str, RangeConstraint] + schema_version: + type: SchemaVersion + dialect: + type: str +GradientToParameterSpec: + kind: struct + fields: + arg: + type: TensorArgument + parameter_name: + type: str +GradientToUserInputSpec: + kind: struct + fields: + arg: + type: TensorArgument + user_input_name: + type: str +Graph: + kind: struct + fields: + inputs: + type: List[Argument] + outputs: + type: List[Argument] + nodes: + type: List[Node] + tensor_values: + type: Dict[str, TensorMeta] + sym_int_values: + type: Dict[str, SymInt] + sym_bool_values: + type: Dict[str, SymBool] + is_single_tensor_return: + type: bool + default: 'False' + custom_obj_values: + type: Dict[str, CustomObjArgument] + default: '{}' +GraphArgument: + kind: struct + fields: + name: + type: 
str + graph: + type: Graph +GraphModule: + kind: struct + fields: + graph: + type: Graph + signature: + type: GraphSignature + module_call_graph: + type: List[ModuleCallEntry] +GraphSignature: + kind: struct + fields: + input_specs: + type: List[InputSpec] + output_specs: + type: List[OutputSpec] +InputSpec: + kind: union + fields: + user_input: + type: UserInputSpec + parameter: + type: InputToParameterSpec + buffer: + type: InputToBufferSpec + tensor_constant: + type: InputToTensorConstantSpec + custom_obj: + type: InputToCustomObjSpec +InputToBufferSpec: + kind: struct + fields: + arg: + type: TensorArgument + buffer_name: + type: str + persistent: + type: bool +InputToCustomObjSpec: + kind: struct + fields: + arg: + type: CustomObjArgument + custom_obj_name: + type: str +InputToParameterSpec: + kind: struct + fields: + arg: + type: TensorArgument + parameter_name: + type: str +InputToTensorConstantSpec: + kind: struct + fields: + arg: + type: TensorArgument + tensor_constant_name: + type: str +Layout: + kind: enum + fields: + Unknown: 0 + SparseCoo: 1 + SparseCsr: 2 + SparseCsc: 3 + SparseBsr: 4 + SparseBsc: 5 + _mkldnn: 6 + Strided: 7 +LossOutputSpec: + kind: struct + fields: + arg: + type: TensorArgument +MemoryFormat: + kind: enum + fields: + Unknown: 0 + ContiguousFormat: 1 + ChannelsLast: 2 + ChannelsLast3d: 3 + PreserveFormat: 4 +ModuleCallEntry: + kind: struct + fields: + fqn: + type: str + signature: + type: Optional[ModuleCallSignature] + default: None +ModuleCallSignature: + kind: struct + fields: + inputs: + type: List[Argument] + outputs: + type: List[Argument] + in_spec: + type: str + out_spec: + type: str +NamedArgument: + kind: struct + fields: + name: + type: str + arg: + type: Argument +Node: + kind: struct + fields: + target: + type: str + inputs: + type: List[NamedArgument] + outputs: + type: List[Argument] + metadata: + type: Dict[str, str] +OptionalTensorArgument: + kind: union + fields: + as_tensor: + type: str + as_none: + type: Tuple[()] +OutputSpec: + kind: union + fields: + user_output: + type: UserOutputSpec + loss_output: + type: LossOutputSpec + buffer_mutation: + type: BufferMutationSpec + gradient_to_parameter: + type: GradientToParameterSpec + gradient_to_user_input: + type: GradientToUserInputSpec + user_input_mutation: + type: UserInputMutationSpec +RangeConstraint: + kind: struct + fields: + min_val: + type: int + max_val: + type: int +ScalarType: + kind: enum + fields: + UNKNOWN: 0 + BYTE: 1 + CHAR: 2 + SHORT: 3 + INT: 4 + LONG: 5 + HALF: 6 + FLOAT: 7 + DOUBLE: 8 + COMPLEXHALF: 9 + COMPLEXFLOAT: 10 + COMPLEXDOUBLE: 11 + BOOL: 12 + BFLOAT16: 13 +SchemaVersion: + kind: struct + fields: + major: + type: int + minor: + type: int +SymBool: + kind: union + fields: + as_expr: + type: SymExpr + as_bool: + type: bool +SymBoolArgument: + kind: union + fields: + as_name: + type: str + as_bool: + type: bool +SymExpr: + kind: struct + fields: + expr_str: + type: str + hint: + type: Optional[SymExprHint] + default: None +SymExprHint: + kind: union + fields: + as_int: + type: int + as_float: + type: float + as_bool: + type: bool +SymInt: + kind: union + fields: + as_expr: + type: SymExpr + as_int: + type: int +SymIntArgument: + kind: union + fields: + as_name: + type: str + as_int: + type: int +TensorArgument: + kind: struct + fields: + name: + type: str +TensorMeta: + kind: struct + fields: + dtype: + type: ScalarType + sizes: + type: List[SymInt] + requires_grad: + type: bool + device: + type: Device + strides: + type: List[SymInt] + storage_offset: + type: 
SymInt + layout: + type: Layout +UserInputMutationSpec: + kind: struct + fields: + arg: + type: TensorArgument + user_input_name: + type: str +UserInputSpec: + kind: struct + fields: + arg: + type: Argument +UserOutputSpec: + kind: struct + fields: + arg: + type: Argument +SCHEMA_VERSION: +- 5 +- 1 +TREESPEC_VERSION: 1 diff --git a/MLPY/Lib/site-packages/torch/_export/serde/schema_check.py b/MLPY/Lib/site-packages/torch/_export/serde/schema_check.py new file mode 100644 index 0000000000000000000000000000000000000000..adee0a3f450412f1252ffeead9833c4945232017 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/serde/schema_check.py @@ -0,0 +1,285 @@ +import dataclasses +import hashlib +import re +import typing +from enum import IntEnum +from typing import Any, Dict, Optional, Union + +from torch._export.serde import schema +from torch._export.serde.union import _Union + + +class SchemaUpdateError(Exception): + pass + + +def _check(x, msg): + if not x: + raise SchemaUpdateError(msg) + + +def _staged_schema(): + ret: Dict[str, Any] = {} + defs = {} + + def _handle_aggregate(ty): + def dump_type(t): + if isinstance(t, type): + return t.__name__ + elif isinstance(t, str): + assert t in defs + return t + elif o := typing.get_origin(t): + # Lemme know if there's a better way to do this. + if o == list: + head = "List" + elif o == dict: + head = "Dict" + elif o == tuple: + if typing.get_args(t) == (): + return "Tuple[()]" + head = "Tuple" + elif o == Union: + args = typing.get_args(t) + assert len(args) == 2 and args[1] == type(None) + return f"Optional[{dump_type(args[0])}]" + else: + raise AssertionError(f"Type {t} is not supported in export schema.") + return ( + f"{head}[{', '.join([dump_type(x) for x in typing.get_args(t)])}]" + ) + elif t == (): + return "()" + else: + raise AssertionError(f"Type {t} is not supported in export schema.") + + def dump_field(f): + t = dump_type(f.type) + ret = {"type": t} + + value = dataclasses.MISSING + if f.default is not dataclasses.MISSING: + value = f.default + elif f.default_factory is not dataclasses.MISSING: + value = f.default_factory() + + if t.startswith("Optional[") and value is not None: + raise AssertionError( + f"Optional field {ty.__name__}.{f.name} must have default value to be None." 
+ ) + + if value is not dataclasses.MISSING: + default = str(value) + ret["default"] = default + return ret + + return {f.name: dump_field(f) for f in dataclasses.fields(ty)} + + def _handle_int_enum(name, ty): + ret[name] = {"kind": "enum", "fields": {x.name: x.value for x in ty}} + + def _handle_struct(name, ty): + ret[name] = {"kind": "struct", "fields": _handle_aggregate(ty)} + + def _handle_union(name, ty): + ret[name] = {"kind": "union", "fields": _handle_aggregate(ty)} + + for name in dir(schema): + if name.startswith("_"): + continue + + value = getattr(schema, name) + + if hasattr(value, "__module__") and value.__module__ != schema.__name__: + continue + + defs[name] = value + + for name, value in defs.items(): + if isinstance(value, type): + if issubclass(value, IntEnum): + _handle_int_enum(name, value) + elif dataclasses.is_dataclass(value): + if issubclass(value, _Union): + _handle_union(name, value) + else: + _handle_struct(name, value) + else: + raise AssertionError(f"Unknown schema type {name}: {value}") + elif isinstance(value, (int, tuple)): + assert name in ("SCHEMA_VERSION", "TREESPEC_VERSION") + else: + raise AssertionError(f"Unknown variable {name}: {value}") + + ret["SCHEMA_VERSION"] = list(defs["SCHEMA_VERSION"]) + assert all(x > 0 for x in ret["SCHEMA_VERSION"]) + ret["TREESPEC_VERSION"] = defs["TREESPEC_VERSION"] + assert ret["TREESPEC_VERSION"] > 0 + return ret + + +def _diff_schema(dst, src): + additions = {key: src[key] for key in src.keys() - dst.keys()} + subtractions = {key: dst[key] for key in dst.keys() - src.keys()} + + common_keys = src.keys() & dst.keys() + + versions = {"SCHEMA_VERSION", "TREESPEC_VERSION"} + common_keys -= versions + + for key in common_keys: + src_kind = src[key]["kind"] + src_fields = src[key]["fields"] + dst_kind = dst[key]["kind"] + dst_fields = dst[key]["fields"] + _check( + src_kind == dst_kind, + f"Type {key} changed kind from {dst_kind} to {src_kind}", + ) + assert isinstance(src_fields, dict) and isinstance(dst_fields, dict) + added_fields = { + key: src_fields[key] for key in src_fields.keys() - dst_fields.keys() + } + subtracted_fields = { + key: dst_fields[key] for key in dst_fields.keys() - src_fields.keys() + } + common_fields = src_fields.keys() & dst_fields.keys() + + for field in common_fields: + src_field = src_fields[field] + dst_field = dst_fields[field] + if src_kind == "struct": + _check( + src_field["type"] == dst_field["type"], + f"Type of the field {key}.{field} changed from {dst_field['type']} to {src_field['type']}", + ) + if "default" in src_field and "default" not in dst_field: + added_fields[field] = {} + added_fields[field]["default"] = src_field["default"] + if "default" not in src_field and "default" in dst_field: + subtracted_fields[field] = {} + subtracted_fields[field]["default"] = dst_field["default"] + elif src_kind == "enum": + _check( + src_field == dst_field, + f"Value of the enum field {key}.{field} changed from {dst_field} to {src_field}", + ) + elif src_kind == "union": + _check( + src_field["type"] == dst_field["type"], + f"Type of the field {key}.{field} changed from {dst_field['type']} to {src_field['type']}", + ) + else: + raise AssertionError(f"Unknown kind {src_kind}: {key}") + if len(added_fields) > 0: + assert key not in additions + additions[key] = {} + additions[key]["fields"] = added_fields + if len(subtracted_fields) > 0: + assert key not in subtractions + subtractions[key] = {} + subtractions[key]["fields"] = subtracted_fields + + return additions, subtractions + + +def 
_hash_schema(s): + return hashlib.sha256(repr(s).encode("utf-8")).hexdigest() + + +@dataclasses.dataclass +class _Commit: + result: Dict[str, Any] + checksum_result: str + path: str + additions: Dict[str, Any] + subtractions: Dict[str, Any] + base: Dict[str, Any] + checksum_base: Optional[str] + + +def update_schema(): + import importlib.resources + + if importlib.resources.is_resource(__package__, "schema.yaml"): + content = importlib.resources.read_text(__package__, "schema.yaml") + match = re.search("checksum<<([A-Fa-f0-9]{64})>>", content) + _check(match is not None, "checksum not found in schema.yaml") + assert match is not None + checksum_base = match.group(1) + from yaml import load, Loader + + dst = load(content, Loader=Loader) + assert isinstance(dst, dict) + else: + checksum_base = None + dst = {"SCHEMA_VERSION": None, "TREESPEC_VERSION": None} + + src = _staged_schema() + additions, subtractions = _diff_schema(dst, src) + return _Commit( + result=src, + checksum_result=_hash_schema(src), + path=__package__.replace(".", "/") + "/schema.yaml", + additions=additions, + subtractions=subtractions, + base=dst, + checksum_base=checksum_base, + ) + + +def check(commit: _Commit, force_unsafe: bool = False): + next_version = None + reason = "" + # Step 1: Detect major schema updates. + if len(commit.additions) > 0: + for k, v in commit.additions.items(): + if k not in commit.base: + continue + kind = commit.result[k]["kind"] + fields = v["fields"] + for f, d in fields.items(): + if "default" not in d and kind == "struct": + reason += ( + f"Field {k}.{f} is added to schema.py without a default value as an incomparible change " + + "which requires major version bump.\n" + ) + next_version = [commit.base["SCHEMA_VERSION"][0] + 1, 1] + + if len(commit.subtractions) > 0: + for k, v in commit.subtractions.items(): + if k not in commit.result: + continue + for f in v["fields"]: + reason = f"Field {k}.{f} is removed from schema.py as an incompatible change which requires major version bump.\n" + next_version = [commit.base["SCHEMA_VERSION"][0] + 1, 1] + + if force_unsafe: + reason += "--force-unsafe is used." + next_version = commit.result["SCHEMA_VERSION"] + else: + # Step 2: Detect minor schema updates. 
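+        # Additions and removals that were not already flagged as breaking in
+        # Step 1 are treated as compatible changes and only bump the minor version.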
+ if next_version is None and len(commit.additions) > 0: + for k, v in commit.additions.items(): + for f in v["fields"]: + reason += ( + f"Field {k}.{f} is added to schema.py as an compatible change " + + "which still requires minor version bump.\n" + ) + next_version = [ + commit.base["SCHEMA_VERSION"][0], + commit.base["SCHEMA_VERSION"][1] + 1, + ] + if next_version is None and len(commit.subtractions) > 0: + for k, v in commit.subtractions.items(): + for f in v["fields"]: + reason += ( + f"Field {k}.{f} is removed from schema.py as an compatible change " + + "which still requires minor version bump.\n" + ) + next_version = [ + commit.base["SCHEMA_VERSION"][0], + commit.base["SCHEMA_VERSION"][1] + 1, + ] + + return next_version, reason diff --git a/MLPY/Lib/site-packages/torch/_export/serde/serialize.py b/MLPY/Lib/site-packages/torch/_export/serde/serialize.py new file mode 100644 index 0000000000000000000000000000000000000000..35d49430baf4f7a79fa4853ed8f2ff71bbf53f06 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/serde/serialize.py @@ -0,0 +1,2434 @@ +import base64 +import copy +import dataclasses +import heapq +import inspect +import io +import json +import logging +import math +import operator +import typing +import copyreg + +from contextlib import contextmanager +from dataclasses import dataclass, field +from enum import Enum +from typing import ( + Any, + Callable, + cast, + Dict, + Iterator, + List, + Optional, + Set, + Tuple, + Union, +) + +import sympy + +import torch +import torch.export.exported_program as ep +from torch._export.serde.schema import SchemaVersion +from torch._export.verifier import load_verifier +from torch._subclasses.fake_tensor import FakeTensor, FakeTensorMode +from torch.fx.experimental import symbolic_shapes +from torch.utils import _pytree as pytree +from torch.utils._pytree import treespec_dumps, treespec_loads +from torch.utils._sympy.value_ranges import ValueRanges + +from .schema import ( # type: ignore[attr-defined] + Argument, + BufferMutationSpec, + CustomObjArgument, + Device, + ExportedProgram, + GradientToParameterSpec, + GradientToUserInputSpec, + Graph, + GraphArgument, + GraphModule, + GraphSignature, + InputSpec, + InputToBufferSpec, + InputToCustomObjSpec, + InputToParameterSpec, + InputToTensorConstantSpec, + Layout, + LossOutputSpec, + MemoryFormat, + ModuleCallEntry, + ModuleCallSignature, + NamedArgument, + Node, + OptionalTensorArgument, + OutputSpec, + RangeConstraint, + ScalarType, + SCHEMA_VERSION, + SymBool, + SymBoolArgument, + SymExpr, + SymExprHint, + SymInt, + SymIntArgument, + TensorArgument, + TensorMeta, + TREESPEC_VERSION, + UserInputMutationSpec, + UserInputSpec, + UserOutputSpec, +) +from .union import _Union + + +__all__ = [ + "serialize", + "GraphModuleSerializer", + "ExportedProgramSerializer", + "GraphModuleDeserializer", + "ExportedProgramDeserializer", +] + +from .upgrade import GraphModuleOpUpgrader + +log = logging.getLogger(__name__) + + +class SerializeError(RuntimeError): + pass + + +def _reverse_map(d: Dict[Any, Enum]): + return {v.value: k for k, v in d.items()} + + +MetaType = Union[FakeTensor, int, torch.SymInt, bool, torch.SymBool, ep.CustomObjArgument] + + +ST_DELIMITER = ";" + +_TORCH_TO_SERIALIZE_DTYPE = { + torch.uint8: ScalarType.BYTE, + torch.int8: ScalarType.CHAR, + torch.int16: ScalarType.SHORT, + torch.int32: ScalarType.INT, + torch.int64: ScalarType.LONG, + torch.float16: ScalarType.HALF, + torch.float32: ScalarType.FLOAT, + torch.float64: ScalarType.DOUBLE, + torch.complex32: 
ScalarType.COMPLEXHALF, + torch.complex64: ScalarType.COMPLEXFLOAT, + torch.complex128: ScalarType.COMPLEXDOUBLE, + torch.bool: ScalarType.BOOL, + torch.bfloat16: ScalarType.BFLOAT16 +} + + +_SERIALIZE_TO_TORCH_DTYPE = _reverse_map(_TORCH_TO_SERIALIZE_DTYPE) # type: ignore[arg-type] + + +_TORCH_TO_SERIALIZE_LAYOUT = { + torch.sparse_coo: Layout.SparseCoo, + torch.sparse_csr: Layout.SparseCsr, + torch.sparse_csc: Layout.SparseCsc, + torch.sparse_bsr: Layout.SparseBsr, + torch.sparse_bsc: Layout.SparseBsc, + torch._mkldnn: Layout._mkldnn, # type: ignore[attr-defined] + torch.strided: Layout.Strided, +} + + +_SERIALIZE_TO_TORCH_LAYOUT = _reverse_map(_TORCH_TO_SERIALIZE_LAYOUT) # type: ignore[arg-type] + + +_TORCH_TO_SERIALIZE_MEMORY_FORMAT = { + torch.contiguous_format: MemoryFormat.ContiguousFormat, + torch.channels_last: MemoryFormat.ChannelsLast, + torch.channels_last_3d: MemoryFormat.ChannelsLast3d, + torch.preserve_format: MemoryFormat.PreserveFormat, +} + + +_SERIALIZE_TO_TORCH_MEMORY_FORMAT = _reverse_map(_TORCH_TO_SERIALIZE_MEMORY_FORMAT) # type: ignore[arg-type] + + +_SYM_INT_OPS = { + operator.mul, + operator.add, + operator.sub, + operator.floordiv, + operator.mod, + torch.sym_int, + torch.sym_ite, + torch.sym_max, + torch.sym_min, + torch.sym_sqrt, +} + + +_SYM_BOOL_OPS = { + operator.eq, + operator.ne, + operator.le, + operator.ge, + operator.lt, + operator.gt, + torch.sym_not, +} + + +@dataclass +class SerializedArtifact: + exported_program: Union[ExportedProgram, bytes] + state_dict: bytes + constants: bytes + + +def deserialize_device(d: Device) -> torch.device: + if d.index is None: + return torch.device(type=d.type) # type: ignore[call-overload] + return torch.device(type=d.type, index=d.index) + + +def serialize_sym_int(s: Union[int, torch.SymInt]) -> SymInt: + if isinstance(s, (torch.SymInt, int)): + if symbolic_shapes.is_concrete_int(s): + return SymInt.create(as_int=int(s)) + else: + assert isinstance(s, torch.SymInt) + if s.node.hint is None: + return SymInt.create(as_expr=SymExpr(str(s))) + else: + return SymInt.create(as_expr=SymExpr(str(s), hint=SymExprHint.create(as_int=s.node.hint))) + else: + raise SerializeError( + f"SymInt should be either symbol or int, got `{s}` of type `{type(s)}`" + ) + + +def serialize_sym_bool(s: Union[bool, torch.SymBool]) -> SymBool: + if isinstance(s, (torch.SymBool, bool)): + if symbolic_shapes.is_concrete_bool(s): + return SymBool.create(as_bool=bool(s)) + else: + return SymBool.create(as_expr=SymExpr(expr_str=str(s))) + else: + raise SerializeError( + f"SymBool should be either symbol or bool, got `{s}` of type `{type(s)}`" + ) + + +def serialize_tensor_meta(t: torch.Tensor) -> TensorMeta: + """ + Extract a TensorMeta describing `t`. + """ + return TensorMeta( + dtype=_TORCH_TO_SERIALIZE_DTYPE[t.dtype], + sizes=[serialize_sym_int(s) for s in t.shape], + requires_grad=t.requires_grad, + device=Device(type=t.device.type, index=t.device.index), + strides=[serialize_sym_int(s) for s in t.stride()], + storage_offset=serialize_sym_int(0), # TODO needs to be fixed. 
+ layout=_TORCH_TO_SERIALIZE_LAYOUT[t.layout], + ) + + +_CURRENT_DESERIALIZER: Optional["GraphModuleDeserializer"] = None + + +def _reduce_fake_tensor(fake_tensor: FakeTensor): + is_parameter = isinstance(fake_tensor, torch.nn.Parameter) + tensor_meta = serialize_tensor_meta(fake_tensor) + tensor_meta_bytes = json.dumps(_dataclass_to_dict(tensor_meta), cls=EnumEncoder).encode("utf-8") + return _reconstruct_fake_tensor, (tensor_meta_bytes, is_parameter) + + +def _reconstruct_fake_tensor(serialized_tensor_meta: bytes, is_parameter: bool) -> FakeTensor: + # Deserialize the bytes into a TensorMeta + json_tensor_meta = json.loads(serialized_tensor_meta.decode("utf-8")) + tensor_meta = _dict_to_dataclass(TensorMeta, json_tensor_meta) + # Find the current fake mode + assert _CURRENT_DESERIALIZER is not None, "Need access to current deserializer state" + fake_tensor = _CURRENT_DESERIALIZER.deserialize_tensor_meta(tensor_meta) + if is_parameter: + fake_tensor = torch.nn.Parameter(fake_tensor) # type: ignore[assignment] + return fake_tensor + + +def serialize_torch_artifact(artifact: Dict[str, Any]) -> bytes: + assert FakeTensor not in copyreg.dispatch_table, "Refusing to stomp on existing FakeTensor reducer" + try: + copyreg.pickle(FakeTensor, _reduce_fake_tensor) + buffer = io.BytesIO() + # This is a workaround for backend's tensor deserialization problem: + # unpickleTensor() always create a tensor on the device where it was originally saved + # This behavior is bad for multi-gpu training, as we wish to directly load the tensor + # on the designated device. + # For now, we simply move the tensor to cpu before saving. + # TODO: this should be fixed by deserialization instead. + torch.save(artifact, buffer) + return buffer.getvalue() + finally: + del copyreg.dispatch_table[FakeTensor] + + +def deserialize_torch_artifact(serialized: bytes): + if len(serialized) == 0: + return {} + buffer = io.BytesIO(serialized) + buffer.seek(0) + artifact = torch.load(buffer) + assert isinstance(artifact, dict) + return artifact + + +def _sympy_int_to_int(val: sympy.Expr): + # Convert simple sympy Integers into concrete int + if val == sympy.oo: + return math.inf + if val == -sympy.oo: + return -math.inf + if isinstance(val, sympy.Integer): + return int(val) + raise RuntimeError( + "Export constraints cannot be non-integer expressions" + ) + + +def _int_to_sympy_int(val) -> sympy.Expr: + # Convert concrete int into simple sympy Integers + if val == math.inf: + return sympy.oo + if val == -math.inf: + return -sympy.oo + return sympy.Integer(val) + + +def serialize_range_constraints( + range_constraints: Dict[sympy.Symbol, ValueRanges] +) -> Dict[str, RangeConstraint]: + return { + str(k): RangeConstraint( + _sympy_int_to_int(v.lower), # type: ignore[arg-type] + _sympy_int_to_int(v.upper), # type: ignore[arg-type] + ) + for k, v in range_constraints.items() + } + + +def _is_single_tensor_return(target: torch._ops.OpOverload) -> bool: + returns = target._schema.returns + return len(returns) == 1 and isinstance(returns[0].real_type, torch.TensorType) + + +def _is_single_tensor_list_return(target: torch._ops.OpOverload) -> bool: + returns = target._schema.returns + if len(returns) != 1: + return False + return_type = returns[0].real_type + return isinstance(return_type, torch.ListType) and isinstance( + return_type.getElementType(), torch.TensorType + ) + + +@dataclass +class GraphState: + inputs: List[Argument] = field(default_factory=list) + outputs: List[Argument] = field(default_factory=list) + nodes: List[Node] = 
field(default_factory=list) + tensor_values: Dict[str, TensorMeta] = field(default_factory=dict) + sym_int_values: Dict[str, SymInt] = field(default_factory=dict) + sym_bool_values: Dict[str, SymBool] = field(default_factory=dict) + is_single_tensor_return: bool = False + custom_obj_values: Dict[str, CustomObjArgument] = field(default_factory=dict) + + +class GraphModuleSerializer: + def __init__( + self, + graph_signature: ep.ExportGraphSignature, + module_call_graph: List[ep.ModuleCallEntry] + ): + self.graph_state = GraphState() + self.graph_signature = graph_signature + self.module_call_graph = module_call_graph + self.custom_objs: Dict[str, torch._C.ScriptObject] = {} + + @contextmanager + def save_graph_state(self): + saved = self.graph_state + self.graph_state = GraphState() + try: + yield + finally: + self.graph_state = saved + + def handle_placeholder(self, node: torch.fx.Node): + assert node.op == "placeholder" + if isinstance(node.meta['val'], torch.Tensor): + graph_input = Argument.create(as_tensor=TensorArgument(name=node.name)) + self.graph_state.tensor_values[node.name] = serialize_tensor_meta(node.meta["val"]) + elif isinstance(node.meta['val'], torch.SymInt): + raise AssertionError("SymInt graph input is not implemented yet.") + elif isinstance(node.meta['val'], (int, bool, str, float, type(None))): + graph_input = self.serialize_input(node.meta['val']) + elif isinstance(node.meta['val'], ep.CustomObjArgument): + class_fqn = node.meta["val"].class_fqn + graph_input = Argument.create(as_custom_obj=CustomObjArgument(name=node.name, class_fqn=class_fqn)) + self.graph_state.custom_obj_values[node.name] = self.serialize_script_obj_meta(node.meta["val"]) + else: + raise AssertionError(f"Unimplemented graph input type: {node.meta['val']}") + self.graph_state.inputs.append(graph_input) + + def handle_output(self, node: torch.fx.Node): + assert node.op == "output" + assert len(node.args) == 1, "FX.Node's args should have one arg" + node_args = node.args[0] + if isinstance(node_args, torch.fx.Node): + # For singleton tensor returns + self.graph_state.is_single_tensor_return = True + self.graph_state.outputs = [self.serialize_input(node_args)] + else: + assert isinstance(node_args, (tuple, list)) + self.graph_state.outputs = [self.serialize_input(arg) for arg in node_args] + + def serialize_operator(self, target) -> str: + if isinstance(target, str): + return target + elif target.__module__.startswith("torch._ops"): + # TODO(zhxchen17) Maybe provide a function name helper in FX. + # From torch.fx.node._get_qualified_name + module = target.__module__.replace("torch._ops", "torch.ops") + return f"{module}.{target.__name__}" + else: # TODO(zhxchen17) Don't catch all here. 
+ return f"{target.__module__}.{target.__name__}" + + def handle_call_function(self, node: torch.fx.Node): + assert node.op == "call_function" + + # getitem has been handled in the producer node, skip it here + if node.target is operator.getitem: + return + + if node.target in _SYM_INT_OPS: + assert len(node.kwargs) == 0 + meta_val = node.meta["val"] + ex_node = Node( + target=self.serialize_operator(node.target), + inputs=self.serialize_sym_op_inputs(node.target, node.args), + outputs=[Argument.create(as_sym_int=self.serialize_sym_int_output(node.name, meta_val))], + metadata=self.serialize_metadata(node), + ) + elif node.target in _SYM_BOOL_OPS: + assert len(node.kwargs) == 0 + meta_val = node.meta["val"] + ex_node = Node( + target=self.serialize_operator(node.target), + inputs=self.serialize_sym_op_inputs(node.target, node.args), + outputs=[Argument.create(as_sym_bool=self.serialize_sym_bool_output(node.name, meta_val))], + metadata=self.serialize_metadata(node), + ) + elif isinstance(node.target, torch._ops.OpOverload): + ex_node = Node( + target=self.serialize_operator(node.target), + inputs=self.serialize_inputs(node.target, node.args, node.kwargs), + outputs=self.serialize_outputs(node), + # TODO: create a new tensor_values here, meta might have faketensor info + metadata=self.serialize_metadata(node), + ) + elif isinstance(node.target, torch._ops.HigherOrderOperator): + ex_node = Node( + target=self.serialize_operator(node.target), + inputs=self.serialize_hoo_inputs(node.args, node.kwargs), + outputs=self.serialize_hoo_outputs(node), + metadata=self.serialize_metadata(node), + ) + else: + raise SerializeError(f"Serializing {node.target} is not supported") + + self.graph_state.nodes.append(ex_node) + + def handle_get_attr(self, node): + pass + + def serialize_metadata(self, node: torch.fx.Node) -> Dict[str, str]: + ret = {} + if stack_trace := node.meta.get("stack_trace"): + ret["stack_trace"] = stack_trace + + if nn_module_stack := node.meta.get("nn_module_stack"): + def export_nn_module_stack(val): + assert isinstance(val, tuple) and len(val) == 2 + path, ty = val + + assert isinstance(path, str) + + # node.meta["nn_module_stack"] could have two forms: + # 1. (path: str, module_type: 'type'), e.g. + # ('', ) + # 2. (path: str, module_type: str), e.g. + # ('', 'sigmoid.inference.MySimpleModel') + # ExportedProgram directly produced by torch.export() has form 1 + # ExportedProgram deserialized from disk has form 2 + # TODO: This is not ideal, we should fix this. + if isinstance(ty, str): + normalized_ty = ty + else: + normalized_ty = ty.__module__ + "." 
+ ty.__qualname__ + + return path + "," + normalized_ty + + # Serialize to "key,orig_path,type_str" + nn_module_list = [ + f"{k},{export_nn_module_stack(v)}" + for k, v in nn_module_stack.items() + ] + ret["nn_module_stack"] = ST_DELIMITER.join(nn_module_list) + + if source_fn_st := node.meta.get("source_fn_stack"): + source_fn_list = [f"{source_fn[0]},{self.serialize_operator(source_fn[1])}" for source_fn in source_fn_st] + ret["source_fn_stack"] = ST_DELIMITER.join(source_fn_list) + + return ret + + def serialize_script_obj_meta(self, script_obj_meta: ep.CustomObjArgument) -> CustomObjArgument: + return CustomObjArgument( + name=script_obj_meta.name, + class_fqn=script_obj_meta.class_fqn, + ) + + def serialize_sym_op_inputs(self, op, args) -> List[NamedArgument]: + serialized_args = [] + args_names = inspect.signature(op).parameters.keys() + for args_name, arg in zip(args_names, args): + serialized_args.append( + NamedArgument(name=args_name, arg=self.serialize_input(arg)) + ) + return serialized_args + + def serialize_inputs( + self, target: torch._ops.OpOverload, args, kwargs=None + ) -> List[NamedArgument]: + assert isinstance(target, torch._ops.OpOverload) + kwargs = kwargs or {} + serialized_args = [] + for i, schema_arg in enumerate(target._schema.arguments): + if schema_arg.name in kwargs: + serialized_args.append( + NamedArgument( + name=schema_arg.name, + arg=self.serialize_input(kwargs[schema_arg.name]), + ) + ) + elif not schema_arg.kwarg_only and i < len(args): + serialized_args.append( + NamedArgument( + name=schema_arg.name, + arg=self.serialize_input(args[i]), + ) + ) + else: + # We intentionally don't serialize the missing arguments + # with default values + pass + + + return serialized_args + + def serialize_hoo_inputs(self, args, kwargs) -> List[NamedArgument]: + """ + For serializing HOO inputs since HOOs do not have a schema. 
+ """ + inputs = [ + NamedArgument( + name="", + arg=self.serialize_input(a), + ) for a in args + ] + inputs.extend([ + NamedArgument( + name=name, + arg=self.serialize_input(a) + ) for name, a in kwargs.items() + ]) + return inputs + + def is_sym_int_arg(self, arg) -> bool: + return isinstance(arg, int) or ( + isinstance(arg, torch.fx.Node) and arg.name in self.graph_state.sym_int_values + ) + + def is_sym_bool_arg(self, arg) -> bool: + return isinstance(arg, bool) or ( + isinstance(arg, torch.fx.Node) and arg.name in self.graph_state.sym_bool_values + ) + + def serialize_input(self, arg) -> Argument: + import torch._inductor.ir as inductor_ir + inductor_tensor_buffers = ( + inductor_ir.Buffer, + inductor_ir.ReinterpretView, + ) + + if isinstance(arg, torch.fx.Node): + if arg.op == "get_attr": + assert isinstance(arg.target, str) + attr = getattr(arg.graph.owning_module, arg.target) + + if isinstance(attr, torch.Tensor): + raise SerializeError("getattr nodes containing tensors should not appear in the graph") + elif isinstance(attr, torch.fx.GraphModule): + with self.save_graph_state(): + graph = self.serialize_graph(attr) + return Argument.create(as_graph=GraphArgument(name=arg.target, graph=graph)) + else: + raise SerializeError(f"Unsupported getattr attribute {arg.target} with type: {type(attr)}") + elif self.is_sym_int_arg(arg): + return Argument.create(as_sym_int=SymIntArgument.create(as_name=arg.name)) + elif self.is_sym_bool_arg(arg): + return Argument.create(as_sym_bool=SymBoolArgument.create(as_name=arg.name)) + else: + if isinstance(arg.meta["val"], ep.CustomObjArgument): + return Argument.create(as_custom_obj=CustomObjArgument(name=arg.name, class_fqn=arg.meta["val"].class_fqn)) + return Argument.create(as_tensor=TensorArgument(name=arg.name)) + elif isinstance(arg, inductor_tensor_buffers): + # Other branches are for arguments in fx node. + # This is a special branch for handling buffers (representing tensor arguments) + # for inductor's ExternalFallbackNode + # export_extern_kernel_node() is using this function to serialize arguments + arg_name = arg.get_name() + assert arg_name is not None, "Buffer must have valid name" + return Argument.create(as_tensor=TensorArgument(name=arg_name)) + elif isinstance(arg, torch.SymInt): + # This is a special branch for handling SymInt args in inductor's + # ExternalFallbackNode. + # For regular FX graph, SymInt arg should be a fx.Node with + # self.is_sym_int_arg(arg) being true + return Argument.create(as_sym_int=SymIntArgument.create(as_name=str(arg))) + elif isinstance(arg, bool): + return Argument.create(as_bool=arg) + elif isinstance(arg, str): + return Argument.create(as_string=arg) + elif isinstance(arg, int): + return Argument.create(as_int=arg) + elif isinstance(arg, float): + return Argument.create(as_float=arg) + elif arg is None: + return Argument.create(as_none=()) + elif isinstance(arg, (list, tuple)): + # Must check bool first, as bool is also treated as int + if all(isinstance(a, bool) for a in arg): + return Argument.create(as_bools=list(arg)) + elif all(isinstance(a, int) for a in arg): + return Argument.create(as_ints=list(arg)) + elif all(isinstance(a, float) for a in arg): + return Argument.create(as_floats=list(arg)) + elif all(isinstance(a, str) for a in arg): + return Argument.create(as_strings=list(arg)) + elif all(isinstance(a, torch.SymInt) for a in arg): + # This is a special branch for handling SymInt args in inductor's + # ExternalFallbackNode. 
+ # For regular FX graph, SymInt arg should be a fx.Node with + # self.is_sym_int_arg(arg) being true + return Argument.create( + as_sym_ints=[SymIntArgument.create(as_name=str(a)) for a in arg] + ) + elif all(self.is_sym_int_arg(a) for a in arg): + # list of sym_ints + values = [] + for a in arg: + if isinstance(a, torch.fx.Node): + values.append(SymIntArgument.create(as_name=a.name)) + elif isinstance(a, int): + values.append(SymIntArgument.create(as_int=a)) + return Argument.create(as_sym_ints=values) + elif all(self.is_sym_bool_arg(a) for a in arg): + # list of sym_bools + values = [] + for a in arg: + if isinstance(a, torch.fx.Node): + values.append(SymBoolArgument.create(as_name=a.name)) + elif isinstance(a, bool): + values.append(SymBoolArgument.create(as_bool=a)) + return Argument.create(as_sym_bools=values) + elif all(isinstance(a, torch.fx.Node) for a in arg): + # list of tensors + arguments = [] + for a in arg: + if a.op == "get_attr": + raise SerializeError("getattr nodes containing tensors should not appear in the graph") + arguments.append(TensorArgument(name=a.name)) + return Argument.create(as_tensors=arguments) + elif all(isinstance(a, (torch.fx.Node, type(None))) for a in arg): + # list of optional tensors + def serialize_optional_tensor_args(a): + if a is None: + return OptionalTensorArgument.create(as_none=()) + elif isinstance(a, torch.fx.Node): + return OptionalTensorArgument.create(as_tensor=a.name) + else: + raise SerializeError(f"Unsupported list/tuple argument: {a}") + return Argument.create( + as_optional_tensors=list(map(serialize_optional_tensor_args, arg)) + ) + elif all(isinstance(a, inductor_tensor_buffers) for a in arg): + # list of inductor buffers + return Argument.create( + as_tensors=[TensorArgument(name=a.get_name()) for a in arg], + ) + elif all(isinstance(a, (*inductor_tensor_buffers, type(None))) for a in arg): + # list of inductor buffers as optional tensors + def serialize_optional_tensor_args(a): + if a is None: + return OptionalTensorArgument.create(as_none=()) + elif isinstance(a, inductor_tensor_buffers): + return OptionalTensorArgument.create(as_tensor=a.get_name()) + else: + raise SerializeError(f"Unsupported list/tuple argument: {a}") + return Argument.create( + as_optional_tensors=list(map(serialize_optional_tensor_args, arg)) + ) + else: + raise SerializeError(f"Unsupported list/tuple argument type: {[type(a) for a in arg]}") + elif isinstance(arg, torch.dtype): + return Argument.create(as_scalar_type=_TORCH_TO_SERIALIZE_DTYPE[arg]) + elif isinstance(arg, torch.device): + return Argument.create(as_device=Device(type=arg.type, index=arg.index)) + elif isinstance(arg, torch.memory_format): + return Argument.create(as_memory_format=_TORCH_TO_SERIALIZE_MEMORY_FORMAT[arg]) + elif isinstance(arg, torch.layout): + return Argument.create(as_layout=_TORCH_TO_SERIALIZE_LAYOUT[arg]) + elif isinstance(arg, torch._C.ScriptObject): + if not ( + arg._has_method("__getstate__") and # type: ignore[attr-defined] + arg._has_method("__setstate__") # type: ignore[attr-defined] + ): + raise SerializeError( + f"Unable to serialize custom class {arg}. Please define " + "serialization methods via def_pickle()." + ) + # Custom objects through torchind are serializable with pickle, + # through implementing the .def_pickle function. This should result + # in the object containing a __getstate__ and __setstate__ + # serialize/deserialize function. 
+ custom_obj_name = f"_custom_obj_{len(self.custom_objs)}" + self.custom_objs[custom_obj_name] = arg + class_fqn = arg._type().qualified_name() # type: ignore[attr-defined] + return Argument.create(as_custom_obj=CustomObjArgument(custom_obj_name, class_fqn)) + elif isinstance(arg, torch._ops.OpOverload): + return Argument.create(as_operator=self.serialize_operator(arg)) + else: + raise SerializeError(f"Unsupported argument type: {type(arg)}") + + def serialize_tensor_output(self, name, meta_val) -> TensorArgument: + assert name not in self.graph_state.tensor_values + self.graph_state.tensor_values[name] = serialize_tensor_meta(meta_val) + return TensorArgument(name=name) + + def serialize_sym_int_output(self, name, meta_val) -> SymIntArgument: + assert name not in self.graph_state.sym_int_values + self.graph_state.sym_int_values[name] = serialize_sym_int(meta_val) + return SymIntArgument.create(as_name=name) + + def serialize_sym_bool_output(self, name, meta_val) -> SymIntArgument: + assert name not in self.graph_state.sym_bool_values + self.graph_state.sym_bool_values[name] = serialize_sym_bool(meta_val) + return SymBoolArgument.create(as_name=name) + + def serialize_input_spec(self, spec: ep.InputSpec) -> InputSpec: + if spec.kind == ep.InputKind.USER_INPUT: + return InputSpec.create( + user_input=UserInputSpec( + arg=self.serialize_argument_spec(spec.arg) + ) + ) + elif spec.kind == ep.InputKind.PARAMETER: + assert spec.target is not None + assert isinstance(spec.arg, ep.TensorArgument) + return InputSpec.create( + parameter=InputToParameterSpec( + arg=TensorArgument(name=spec.arg.name), + parameter_name=spec.target, + ) + ) + elif spec.kind == ep.InputKind.BUFFER: + assert spec.target is not None + assert isinstance(spec.arg, ep.TensorArgument) + assert spec.persistent is not None + return InputSpec.create( + buffer=InputToBufferSpec( + arg=TensorArgument(name=spec.arg.name), + buffer_name=spec.target, + persistent=spec.persistent, + ) + ) + elif spec.kind == ep.InputKind.CONSTANT_TENSOR: + assert spec.target is not None + assert isinstance(spec.arg, ep.TensorArgument) + return InputSpec.create( + tensor_constant=InputToTensorConstantSpec( + arg=TensorArgument(name=spec.arg.name), + tensor_constant_name=spec.target, + ) + ) + elif spec.kind == ep.InputKind.CUSTOM_OBJ: + assert spec.target is not None + assert isinstance(spec.arg, ep.CustomObjArgument) + return InputSpec.create( + custom_obj=InputToCustomObjSpec( + arg=CustomObjArgument(name=spec.arg.name, class_fqn=spec.arg.class_fqn), + custom_obj_name=spec.target, + ) + ) + else: + raise AssertionError(f"Unknown argument kind: {spec}") + + def serialize_output_spec(self, spec: ep.OutputSpec) -> OutputSpec: + if spec.kind == ep.OutputKind.USER_OUTPUT: + return OutputSpec.create( + user_output=UserOutputSpec( + arg=self.serialize_argument_spec(spec.arg) + ) + ) + elif spec.kind == ep.OutputKind.LOSS_OUTPUT: + assert isinstance(spec.arg, ep.TensorArgument) + return OutputSpec.create( + loss_output=LossOutputSpec( + arg=TensorArgument(name=spec.arg.name) + ) + ) + elif spec.kind == ep.OutputKind.BUFFER_MUTATION: + assert spec.target is not None + assert isinstance(spec.arg, ep.TensorArgument) + return OutputSpec.create( + buffer_mutation=BufferMutationSpec( + arg=TensorArgument(name=spec.arg.name), + buffer_name=spec.target, + ) + ) + elif spec.kind == ep.OutputKind.GRADIENT_TO_PARAMETER: + assert spec.target is not None + assert isinstance(spec.arg, ep.TensorArgument) + return OutputSpec.create( + 
gradient_to_parameter=GradientToParameterSpec( + arg=TensorArgument(name=spec.arg.name), + parameter_name=spec.target, + ) + ) + elif spec.kind == ep.OutputKind.GRADIENT_TO_USER_INPUT: + assert spec.target is not None + assert isinstance(spec.arg, ep.TensorArgument) + return OutputSpec.create( + gradient_to_user_input=GradientToUserInputSpec( + arg=TensorArgument(name=spec.arg.name), + user_input_name=spec.target, + ) + ) + elif spec.kind == ep.OutputKind.USER_INPUT_MUTATION: + assert spec.target is not None + assert isinstance(spec.arg, ep.TensorArgument) + return OutputSpec.create( + user_input_mutation=UserInputMutationSpec( + arg=TensorArgument(name=spec.arg.name), + user_input_name=spec.target, + ) + ) + else: + raise AssertionError(f"Unknown argument kind: {spec}") + + def serialize_signature(self, sig: ep.ExportGraphSignature) -> GraphSignature: + return GraphSignature( + input_specs=[self.serialize_input_spec(s) for s in sig.input_specs], + output_specs=[self.serialize_output_spec(s) for s in sig.output_specs], + ) + + def serialize_argument_spec(self, x: ep.ArgumentSpec) -> Argument: + if isinstance(x, ep.TensorArgument): + return Argument.create(as_tensor=TensorArgument(name=x.name)) + elif isinstance(x, ep.SymIntArgument): + return Argument.create(as_sym_int=SymIntArgument.create(as_name=x.name)) + elif isinstance(x, ep.ConstantArgument): + return self.serialize_input(x.value) + elif isinstance(x, ep.CustomObjArgument): + return Argument.create(as_custom_obj=CustomObjArgument(name=x.name, class_fqn=x.class_fqn)) + else: + raise AssertionError("TODO") + + def serialize_module_call_signature(self, module_call_signature: ep.ModuleCallSignature) -> ModuleCallSignature: + return ModuleCallSignature( + inputs=[self.serialize_argument_spec(x) for x in module_call_signature.inputs], + outputs=[self.serialize_argument_spec(x) for x in module_call_signature.outputs], + in_spec=treespec_dumps(module_call_signature.in_spec, TREESPEC_VERSION), + out_spec=treespec_dumps(module_call_signature.out_spec, TREESPEC_VERSION), + ) + + def serialize_module_call_graph(self, module_call_graph: List[ep.ModuleCallEntry]) -> List[ModuleCallEntry]: + return [ + ModuleCallEntry( + fqn=entry.fqn, + signature=self.serialize_module_call_signature(entry.signature) if entry.signature else None, + ) for entry in module_call_graph + ] + + def serialize_outputs(self, node: torch.fx.Node) -> List[Argument]: + """For a given node, return the dataclass representing its output values. + + [NOTE: Multiple outputs] We handle aggregates differently than FX. For + FX, it looks like: + + x = call_function("multiple_return", ...) + element0 = call_function(getitem, x, 0) + foo = call_function("use_output", element0) + + We do not want the intermediate `getitem` call, so our serialized thing looks like: + + element0, element1, element2 = call_function("multiple_return", ...) + foo = call_function("use_output", element0) + + We want names to be consistent across these two schemes, so that we can + mostly reuse the names coming from FX. This function computes a mapping from + the FX representation to our representation, preserving the names. 
+ """ + assert node.op == "call_function" and isinstance(node.target, torch._ops.OpOverload) + + assert isinstance(node.target, torch._ops.OpOverload) + returns = node.target._schema.returns + + if len(returns) == 0: + return [] + + meta_val = node.meta["val"] + + def output_node_at_index(node, index): + for user in node.users: + assert user.target is operator.getitem, f"{user} is not a getitem node" + if index == user.args[1]: + return user + return None + + # Check single value return + if _is_single_tensor_list_return(node.target): + # e.g "-> Tensor[]" + tensor_args = [] + for idx, meta in enumerate(meta_val): + user_node = output_node_at_index(node, idx) + name = ( + user_node.name + if user_node is not None + else f"{node.name}_unused_{idx}" + ) + tensor_args.append(self.serialize_tensor_output(name, meta)) + return [Argument.create(as_tensors=tensor_args)] + elif len(returns) == 1: + return [self.serialize_output(node.name, meta_val)] + + # There are a two possibilities at this point: + # - This operator returns a tuple of Tensors, e.g. "-> (Tensor, Tensor)" + # - This operator returns a tuple of mixed of Tensor and Tensors, e.g. "-> (Tensor, Tensor[])" + # + # Either way, start by gathering a list of TensorArguments with the correct names. + # For consistent naming with FX, consult the downstream `getitem` node and + # make sure our outputs have the same name. + + output_arguments = [] + for idx, (meta, return_schema) in enumerate(zip(meta_val, returns)): + if meta is None: + assert isinstance(return_schema.real_type, (torch.OptionalType, torch.TensorType)) + # When the return type is annoated as Tensor type, the op can also return an + # undefined Tensor which will be implicitly converted to None in Python. + output_arguments.append(Argument.create(as_none=())) + elif isinstance(meta, FakeTensor): + assert isinstance(return_schema.real_type, torch.TensorType) + user_node = output_node_at_index(node, idx) + name = ( + user_node.name + if user_node is not None + else f"{node.name}_unused_{idx}" + ) + output_arguments.append(self.serialize_output(name, meta)) + elif isinstance(meta, list): + # for List[Tensor] return type + assert isinstance( + return_schema.real_type, torch.ListType + ) and isinstance( + return_schema.real_type.getElementType(), torch.TensorType + ) + user_node = output_node_at_index(node, idx) + assert user_node is not None + + args = [] + for i, m in enumerate(meta): + if m is None: + continue + sub_user_node = output_node_at_index(user_node, i) + assert sub_user_node is not None, f"No user found at index {i}" + + args.append(self.serialize_tensor_output(sub_user_node.name, m)) + output_arguments.append(Argument.create(as_tensors=args)) + elif isinstance(meta, (int, SymInt)): + user_node = output_node_at_index(node, idx) + name = ( + user_node.name + if user_node is not None + else f"{node.name}_unused_{idx}" + ) + output_arguments.append(self.serialize_output(name, meta)) + else: + raise ValueError(f"Unhandled output type {type(meta)} from node {node.format_node()}") + + return output_arguments + + def serialize_hoo_outputs(self, node: torch.fx.Node) -> List[Argument]: + """ + For serializing HOO outputs since HOOs do not have a schema. + """ + meta_val = node.meta["val"] + + if isinstance(meta_val, tuple): + # Note: Since we don't have a schema, we just serialize all tuple + # outputs to be a list of values. Even if the output is supposed to + # be a tensor list (Tensor[]), we will serialize it to be a list of + # tensors (Tensor, Tensor, Tensor). 
An exception is that if there's + # a singleton tensor, we will serialize this to be a singleton + # tensor list so that the deserializer knows to insert getitem nodes. + + idx_to_name = {} + for user in node.users: + if user.target is not operator.getitem: + continue + idx_to_name[user.args[1]] = user.name + + for idx in range(len(meta_val)): + # FX does not emit a getitem node for any outputs that are unused. + # However, we need a name for them so that the number of outputs will + # correctly match the schema. Just assign a dummy name. + if idx not in idx_to_name: + idx_to_name[idx] = f"{node.name}_unused_{idx}" + + if len(meta_val) == 1: + tensors = [] + for i, v in enumerate(meta_val): + assert isinstance(v, torch.Tensor) + tensors.append(self.serialize_tensor_output(idx_to_name[i], v)) + return [Argument.create(as_tensors=tensors)] + + else: + return [ + self.serialize_output(idx_to_name[i], element_meta_val) + for i, element_meta_val in enumerate(meta_val) + ] + + else: + return [self.serialize_output(node.name, meta_val)] + + def serialize_output(self, name: str, meta_val: Any) -> Argument: + # Check single value return + if meta_val is None: + return Argument.create(as_none=()) + if isinstance(meta_val, torch.Tensor): + # e.g "-> Tensor" + return Argument.create(as_tensor=self.serialize_tensor_output(name, meta_val)) + elif isinstance(meta_val, (int, torch.SymInt)): + # e.g "-> SymInt" + return Argument.create(as_sym_int=self.serialize_sym_int_output(name, meta_val)) + elif isinstance(meta_val, torch.SymBool): + # e.g "-> SymBool" + return Argument.create(as_sym_bool=self.serialize_sym_bool_output(name, meta_val)) + + # list outputs should've been handled earlier + raise SerializeError(f"Unable to serialize output {meta_val}") + + def _handle_getitem_users(self, node: torch.fx.Node) -> List[TensorArgument]: + meta_val = node.meta["val"] + + idx_to_name = {} + for user in node.users: + assert user.target is operator.getitem, f"User node {user} of {node} is incorrect" + idx_to_name[user.args[1]] = user.name + + for idx, _ in enumerate(meta_val): + # FX does not emit a getitem node for any outputs that are unused. + # However, we need a name for them so that the number of outputs will + # correctly match the schema. Just assign a dummy name. 
+ if idx not in idx_to_name: + idx_to_name[idx] = f"{node.name}_unused_{idx}" + + arg_list = [] + for i, element_meta_val in enumerate(meta_val): + arg_list.append( + self.serialize_tensor_output(idx_to_name[i], element_meta_val) + ) + + return arg_list + + def serialize_graph(self, graph_module: torch.fx.GraphModule) -> Graph: + assert isinstance(graph_module, torch.fx.GraphModule) + for node in graph_module.graph.nodes: + try: + getattr(self, f"handle_{node.op}")(node) + except Exception as e: + raise SerializeError(f"Failed serializing node {node} in graph: {node.format_node()}") from e + + return Graph( + inputs=self.graph_state.inputs, + nodes=self.graph_state.nodes, + tensor_values=self.graph_state.tensor_values, + sym_int_values=self.graph_state.sym_int_values, + sym_bool_values=self.graph_state.sym_bool_values, + custom_obj_values=self.graph_state.custom_obj_values, + outputs=self.graph_state.outputs, + is_single_tensor_return=self.graph_state.is_single_tensor_return, + ) + + def serialize(self, graph_module: torch.fx.GraphModule) -> GraphModule: + graph = self.serialize_graph(graph_module) + + return GraphModule( + graph=graph, + signature=self.serialize_signature(self.graph_signature), + module_call_graph=self.serialize_module_call_graph(self.module_call_graph), + ) + + +class ExportedProgramSerializer: + def __init__(self, opset_version: Optional[Dict[str, int]] = None): + self.opset_version: Dict[str, int] = {} + if opset_version: + self.opset_version.update(opset_version) + if "aten" not in self.opset_version: + self.opset_version["aten"] = torch._C._get_max_operator_version() + + def serialize(self, exported_program: ep.ExportedProgram) -> SerializedArtifact: + """ + Args: + exported_program: Exported Program to serialize + """ + if type(self) == ExportedProgramSerializer: + exported_program._validate() + + gm_serializer = GraphModuleSerializer( + exported_program.graph_signature, + exported_program.module_call_graph + ) + serialized_graph_module = gm_serializer.serialize(exported_program.graph_module) + serialized_range_constraints = serialize_range_constraints(exported_program.range_constraints) + + # TODO: Directly serialize exported_program.constants once + # CustomClassHolders get stored in the ExportedProgram rather than in + # the graph + constants = {} + for n, c in gm_serializer.custom_objs.items(): + constants[n] = c + for n, t in exported_program.constants.items(): + assert n not in constants + constants[n] = t + + serialized_ep = ExportedProgram( + graph_module=serialized_graph_module, + opset_version=self.opset_version, + range_constraints=serialized_range_constraints, + schema_version=SchemaVersion( + major=SCHEMA_VERSION[0], + minor=SCHEMA_VERSION[1], + ), + dialect=exported_program.dialect, + ) + + # Test canonical form is well defined. 
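+        # The return value of canonicalize() is intentionally discarded: the
+        # call only checks that the serialized program can be canonicalized
+        # without raising.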
+ canonicalize(serialized_ep) + + return SerializedArtifact( + serialized_ep, + serialize_torch_artifact(exported_program.state_dict), + serialize_torch_artifact(constants), + ) + + +class GraphModuleDeserializer: + @dataclasses.dataclass + class Result: + graph_module: torch.fx.GraphModule + signature: ep.ExportGraphSignature + module_call_graph: List[ep.ModuleCallEntry] + names_to_symbols: Dict[str, sympy.Symbol] + state_dict: Dict[str, Union[torch.Tensor, torch.nn.Parameter]] + constants: Dict[str, Union[torch.Tensor, torch.ScriptObject]] + + def __init__(self): + self.serialized_name_to_node: Dict[str, torch.fx.Node] = {} + self.serialized_name_to_meta: Dict[str, MetaType] = {} + self.graph = torch.fx.Graph() + self.module = torch.nn.Module() + + @contextmanager + def save_graph_module(self) -> Iterator[None]: + saved = self.graph, self.module, self.serialized_name_to_node, self.serialized_name_to_meta + self.graph = torch.fx.Graph() + self.module = torch.nn.Module() + self.serialized_name_to_node = {} + self.serialized_name_to_meta = {} + try: + yield + finally: + self.graph, self.module, self.serialized_name_to_node, self.serialized_name_to_meta = saved + + def deserialize_operator(self, serialized_target: str): + if serialized_target.startswith("_operator"): # TODO(zhxchen17) Follow up on this. + module = operator + serialized_target_names = serialized_target.split(".")[1:] + elif serialized_target.startswith("torch"): + module = torch # type: ignore[misc] + serialized_target_names = serialized_target.split(".")[1:] + else: # TODO(zhxchen17) Don't catch all here. + return serialized_target + + target = module + for name in serialized_target_names: + if not hasattr(target, name): + return serialized_target + else: + target = getattr(target, name) + return target + + def deserialize_sym_int(self, s: SymInt) -> Union[int, torch.SymInt]: + val = s.value + if s.type == "as_expr": + if val.expr_str in self.symbol_name_to_symbol: + sym = self.symbol_name_to_symbol[val.expr_str] + else: + sym = sympy.sympify(val.expr_str, locals=self.symbol_name_to_symbol) + # NOTE(avik): Assumptions on symbols are not explicitly serialized. + # This seems dangerous: it might cause unknown differences in shape env behavior + # on deserialization? Probably deserves a follow-up. + + # Here we force symbols corresponding to SymInts to be at least integers. + # Otherwise some expressions that the shape env would otherwise evaluate to False, + # e.g., 2*s = 9, can have rational solutions, e.g., 9/2. + sym = sym.subs({s: sympy.Symbol(s.name, integer=True) for s in sym.free_symbols}) + if isinstance(sym, sympy.Symbol): + self.symbol_name_to_symbol[val.expr_str] = sym + + if vr := self.symbol_name_to_range.get(val.expr_str): + symbolic_shapes._constrain_symbol_range( + self.shape_env, + sym, + compiler_min=vr.lower, # type: ignore[arg-type] + compiler_max=vr.upper, # type: ignore[arg-type] + ) + else: + # Placeholders, in particular, can have shapes as symbolic expressions. + # We need to populate the shape env with the range constraints of their + # free symbols, otherwise evaluating such expressions will error. 
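+                    # e.g. a placeholder with shape (s0 + 1, 2*s1) introduces the
+                    # free symbols s0 and s1, each of which may have its own range.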
+ self.symbol_name_to_symbol[val.expr_str] = sym + free_symbols = sym.free_symbols + for s in free_symbols: + if s.name not in self.symbol_name_to_symbol: + self.symbol_name_to_symbol[s.name] = s + if vr := self.symbol_name_to_range.get(s.name): + symbolic_shapes._constrain_symbol_range( + self.shape_env, + s, + compiler_min=vr.lower, # type: ignore[arg-type] + compiler_max=vr.upper, # type: ignore[arg-type] + ) + + + if val.hint is None: + hint = None + else: + assert val.hint.type == "as_int" + hint = val.hint.value + + return self.shape_env.create_symintnode(sym, hint=hint) + elif s.type == "as_int": + assert isinstance(val, int) + return val + else: + raise SerializeError( + f"SymInt has invalid field type {s.type} with value {s.value}" + ) + + def deserialize_sym_bool(self, s: SymBool) -> Union[bool, torch.SymBool]: + val = s.value + if s.type == "as_expr": + expr = sympy.sympify(val.expr_str, locals=self.symbol_name_to_symbol) + return self.shape_env.create_symboolnode(expr) + elif s.type == "as_bool": + assert isinstance(val, bool) + return val + else: + raise SerializeError( + f"SymBool has invalid field type {s.type} with value {s.value}" + ) + + def deserialize_tensor_meta( + self, + tensor_meta: TensorMeta, + ) -> FakeTensor: + with self.fake_tensor_mode: + return cast( + FakeTensor, + torch.empty_strided( + tuple(self.deserialize_sym_int(val) for val in tensor_meta.sizes), # type: ignore[misc] + tuple(self.deserialize_sym_int(val) for val in tensor_meta.strides), # type: ignore[misc] + device=deserialize_device(tensor_meta.device), + dtype=_SERIALIZE_TO_TORCH_DTYPE[tensor_meta.dtype], + ), + ) + + def deserialize_script_obj_meta(self, script_obj_meta: CustomObjArgument) -> ep.CustomObjArgument: + return ep.CustomObjArgument( + name=script_obj_meta.name, + class_fqn=script_obj_meta.class_fqn, + ) + + def deserialize_graph_output(self, output) -> torch.fx.Node: + if output.type == "as_tensor": + return self.serialized_name_to_node[output.as_tensor.name] + elif output.type == "as_sym_int": + return self.serialized_name_to_node[output.as_sym_int.as_name] + elif output.type == "as_sym_bool": + return self.serialized_name_to_node[output.as_sym_bool.as_name] + else: + raise SerializeError(f"Unable to deserialize output node {output}") + + def deserialize_graph(self, serialized_graph: Graph) -> torch.fx.Graph: + # Handle the tensor metas. + for name, tensor_value in serialized_graph.tensor_values.items(): + meta_val = self.deserialize_tensor_meta(tensor_value) + self.serialized_name_to_meta[name] = meta_val + + for name, sym_int_value in serialized_graph.sym_int_values.items(): + self.serialized_name_to_meta[name] = self.deserialize_sym_int(sym_int_value) + + for name, sym_bool_value in serialized_graph.sym_bool_values.items(): + self.serialized_name_to_meta[name] = self.deserialize_sym_bool(sym_bool_value) + + for name, script_obj_meta in serialized_graph.custom_obj_values.items(): + self.serialized_name_to_meta[name] = self.deserialize_script_obj_meta(script_obj_meta) + + # Inputs: convert to placeholder nodes in FX. 
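+        # Tensor, SymInt and custom-object inputs already carry a serialized
+        # name, which is reused for the placeholder; constant inputs
+        # (int/float/bool/str/None) have no name of their own, so they get a
+        # positional "arg{i}" placeholder with the constant stored in meta["val"].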
+ for i, input_ in enumerate(serialized_graph.inputs): + if input_.type in ("as_tensor", "as_sym_int", "as_custom_obj"): + node_name = input_.value.name + placeholder_node = self.graph.placeholder(node_name) + self.sync_fx_node(node_name, placeholder_node) + elif input_.type in ("as_int", "as_float", "as_bool", "as_none", "as_string"): + node_name = f"arg{i}" + placeholder_node = self.graph.placeholder(node_name) + placeholder_node.meta["val"] = self.deserialize_input(input_) + else: + raise SerializeError(f"Invalid input type {input_}") + + # Nodes: convert to call_function nodes. + for serialized_node in serialized_graph.nodes: + try: + target = self.deserialize_operator(serialized_node.target) + self.deserialize_node(serialized_node, target) + + except Exception as e: + raise SerializeError(f"Failed deserializing node {serialized_node}") from e + + # Outputs: convert to a single `output` node. + outputs = [] + for output in serialized_graph.outputs: + outputs.append(self.deserialize_graph_output(output)) + + if serialized_graph.is_single_tensor_return: + assert len(outputs) == 1 + outputs = outputs[0] # type: ignore[assignment] + else: + outputs = tuple(outputs) # type: ignore[assignment] + + output_node = self.graph.output(outputs) + + if serialized_graph.is_single_tensor_return: + output_node.meta["val"] = output_node.args[0].meta["val"] + else: + output_node.meta["val"] = tuple( + arg.meta["val"] for arg in output_node.args[0] + ) + + return self.graph + + def deserialize_node(self, serialized_node: Node, target: Callable) -> None: + if target in _SYM_BOOL_OPS or target in _SYM_INT_OPS: + name = serialized_node.outputs[0].value.as_name + args = self.deserialize_sym_op_inputs(serialized_node.inputs) + + fx_node = self.graph.create_node("call_function", target, args, {}, name) + self.deserialize_sym_op_outputs(serialized_node, fx_node) + + elif isinstance(target, torch._ops.HigherOrderOperator): + args, kwargs = self.deserialize_hoo_inputs(serialized_node.inputs) + # If HOP returns a single tensor, name the + # newly-created node after it. This ensures that these tensor values + # have names that are consistent with serialized. + # + # HOPs don't have schema yet, just check the output lengths and as_tensor attribute + name = ( + serialized_node.outputs[0].as_tensor.name + if len(serialized_node.outputs) == 1 and hasattr(serialized_node.outputs[0], "as_tensor") + else None + ) + fx_node = self.graph.create_node( + "call_function", target, args, kwargs, name + ) + self.deserialize_outputs(serialized_node, fx_node) + fx_node.meta.update(self.deserialize_metadata(serialized_node.metadata)) + + elif isinstance(target, torch._ops.OpOverload): + # For convenience: if this node returns a single tensor, name the + # newly-created node after it. This ensures that these tensor values + # have names that are consistent with serialized. + name = ( + serialized_node.outputs[0].as_tensor.name + if _is_single_tensor_return(target) + else None # FX will generate a name for us. 
+ ) + args, kwargs = self.deserialize_inputs(target, serialized_node) + fx_node = self.graph.create_node("call_function", target, args, kwargs, name) + self.deserialize_outputs(serialized_node, fx_node) + else: + raise SerializeError(f"Unsupported target type for node {serialized_node}: {target}") + + fx_node.meta.update(self.deserialize_metadata(serialized_node.metadata)) + + def deserialize_input_spec(self, i: InputSpec) -> ep.InputSpec: + if i.type == "user_input": + return ep.InputSpec( + kind=ep.InputKind.USER_INPUT, + arg=self.deserialize_argument_spec(i.user_input.arg), + target=None + ) + elif i.type == "parameter": + return ep.InputSpec( + kind=ep.InputKind.PARAMETER, + arg=ep.TensorArgument(name=i.parameter.arg.name), + target=i.parameter.parameter_name, + ) + elif i.type == "buffer": + return ep.InputSpec( + kind=ep.InputKind.BUFFER, + arg=ep.TensorArgument(name=i.buffer.arg.name), + target=i.buffer.buffer_name, + persistent=i.buffer.persistent, + ) + elif i.type == "tensor_constant": + return ep.InputSpec( + kind=ep.InputKind.CONSTANT_TENSOR, + arg=ep.TensorArgument(name=i.tensor_constant.arg.name), + target=i.tensor_constant.tensor_constant_name, + ) + elif i.type == "custom_obj": + return ep.InputSpec( + kind=ep.InputKind.CUSTOM_OBJ, + arg=ep.CustomObjArgument(name=i.custom_obj.arg.name, class_fqn=i.custom_obj.arg.class_fqn), + target=i.custom_obj.custom_obj_name, + ) + else: + raise AssertionError(f"Unknown input spec {i}") + + def deserialize_output_spec(self, o: OutputSpec) -> ep.OutputSpec: + if o.type == "user_output": + return ep.OutputSpec( + kind=ep.OutputKind.USER_OUTPUT, + arg=self.deserialize_argument_spec(o.user_output.arg), + target=None, + ) + elif o.type == "loss_output": + return ep.OutputSpec( + kind=ep.OutputKind.LOSS_OUTPUT, + arg=ep.TensorArgument(name=o.loss_output.arg.name), + target=None, + ) + elif o.type == "buffer_mutation": + return ep.OutputSpec( + kind=ep.OutputKind.BUFFER_MUTATION, + arg=ep.TensorArgument(name=o.buffer_mutation.arg.name), + target=o.buffer_mutation.buffer_name + ) + elif o.type == "gradient_to_parameter": + return ep.OutputSpec( + kind=ep.OutputKind.GRADIENT_TO_PARAMETER, + arg=ep.TensorArgument(name=o.gradient_to_parameter.arg.name), + target=o.gradient_to_parameter.parameter_name + ) + elif o.type == "gradient_to_user_input": + return ep.OutputSpec( + kind=ep.OutputKind.GRADIENT_TO_USER_INPUT, + arg=ep.TensorArgument(name=o.gradient_to_user_input.arg.name), + target=o.gradient_to_user_input.user_input_name + ) + elif o.type == "user_input_mutation": + return ep.OutputSpec( + kind=ep.OutputKind.USER_INPUT_MUTATION, + arg=ep.TensorArgument(name=o.user_input_mutation.arg.name), + target=o.user_input_mutation.user_input_name + ) + else: + raise AssertionError(f"Unknown output spec {o}") + + def deserialize_signature(self, sig: GraphSignature) -> ep.ExportGraphSignature: + return ep.ExportGraphSignature( + input_specs=[self.deserialize_input_spec(i) for i in sig.input_specs], + output_specs=[self.deserialize_output_spec(o) for o in sig.output_specs] + ) + + def deserialize( + self, + serialized_graph_module: GraphModule, + serialized_state_dict: bytes, + constants: bytes, + symbol_name_to_range: Optional[Dict[str, symbolic_shapes.ValueRanges]] = None, + ) -> Result: + global _CURRENT_DESERIALIZER + assert _CURRENT_DESERIALIZER is None + _CURRENT_DESERIALIZER = self + try: + self.shape_env = symbolic_shapes.ShapeEnv(assume_static_by_default=True) + self.fake_tensor_mode = FakeTensorMode( + allow_fallback_kernels=False, + 
allow_non_fake_inputs=True, + shape_env=self.shape_env, + ) + self.symbol_name_to_symbol: Dict[str, sympy.Symbol] = {} + self.symbol_name_to_range = {} if symbol_name_to_range is None else symbol_name_to_range + self.signature = self.deserialize_signature(serialized_graph_module.signature) + self.constants = deserialize_torch_artifact(constants) + self.deserialize_graph(serialized_graph_module.graph) + + module_call_graph = self.deserialize_module_call_graph(serialized_graph_module.module_call_graph) + return GraphModuleDeserializer.Result( + graph_module=ep._create_graph_module_for_export(self.module, self.graph), + signature=self.signature, + module_call_graph=module_call_graph, + names_to_symbols=self.symbol_name_to_symbol, + state_dict=deserialize_torch_artifact(serialized_state_dict), + constants=self.constants, + ) + finally: + _CURRENT_DESERIALIZER = None + + def sync_fx_node(self, name: str, fx_node: torch.fx.Node): + if name in self.serialized_name_to_node: + raise SerializeError(f"Node {name} has already been deserialized before.") + self.serialized_name_to_node[name] = fx_node + assert "val" not in fx_node.meta + fx_node.meta["val"] = self.serialized_name_to_meta[name] + + def deserialize_sym_op_inputs(self, inputs): + return tuple(self.deserialize_input(input.arg) for input in inputs) + + def deserialize_inputs(self, target: torch._ops.OpOverload, serialized_node: Node): + schema_args = target._schema.arguments + actual_args = { + input.name: self.deserialize_input(input.arg) for input in serialized_node.inputs + } + args = [] + kwargs = {} + for schema_arg in schema_args: + is_positional = not schema_arg.has_default_value() and not schema_arg.kwarg_only + if is_positional: + args.append(actual_args[schema_arg.name]) + else: + if schema_arg.name in actual_args: + kwargs[schema_arg.name] = actual_args[schema_arg.name] + return tuple(args), kwargs + + def deserialize_hoo_inputs(self, inputs: List[NamedArgument]): + """ + For deserializing HOO inputs since HOOs do not have a schema. 
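+
+        Inputs that carry a name are passed on as keyword arguments; unnamed
+        inputs are passed positionally, in their serialized order.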
+ """ + args = [] + kwargs = {} + for input_ in inputs: + if input_.name != "": + kwargs[input_.name] = self.deserialize_input(input_.arg) + else: + args.append(self.deserialize_input(input_.arg)) + return (tuple(args), kwargs) + + def deserialize_input(self, inp: Argument) -> Any: + value = inp.value + typ_ = inp.type + if typ_ == "as_none": + # None should converted as None, but is encoded as bool in serialized + # Convert serialized object to torch equivalent + return None + elif typ_ == "as_tensor": + return self.serialized_name_to_node[inp.as_tensor.name] + elif typ_ == "as_scalar_type": + return _SERIALIZE_TO_TORCH_DTYPE[inp.as_scalar_type] + elif typ_ == "as_memory_format": + return _SERIALIZE_TO_TORCH_MEMORY_FORMAT[inp.as_memory_format] + elif typ_ == "as_layout": + return _SERIALIZE_TO_TORCH_LAYOUT[inp.as_layout] + elif typ_ == "as_graph": + assert isinstance(value, GraphArgument) + with self.save_graph_module(): + self.deserialize_graph(value.graph) + submodule = ep._create_graph_module_for_export(self.module, self.graph) + self.module.register_module(value.name, submodule) + return self.graph.create_node( + "get_attr", + value.name, + name=value.name, + ) + elif typ_ == "as_device": + return deserialize_device(inp.as_device) + elif typ_ == "as_int": + return inp.as_int + elif typ_ == "as_float": + return inp.as_float + elif typ_ == "as_bool": + return inp.as_bool + elif typ_ == "as_string": + return inp.as_string + elif typ_ == "as_sym_int": + return self.deserialize_sym_argument(inp.as_sym_int) + elif typ_ == "as_sym_bool": + return self.deserialize_sym_argument(inp.as_sym_bool) + elif isinstance(value, list): + if len(value) == 0: + return [] + elif typ_ == "as_tensors": + result = [] + for arg in value: + result.append(self.serialized_name_to_node[arg.name]) + return result + elif typ_ in ("as_ints", "as_floats", "as_bools", "as_strings"): + # convert from serialized.python.types.List to python list + return list(value) + elif typ_ in ("as_sym_ints", "as_sym_bools"): + return [self.deserialize_sym_argument(arg) for arg in value] + elif typ_ == "as_optional_tensors": + def deserialize_optional_tensor_args(a): + if a.type == "as_none": + return None + elif a.type == "as_tensor": + return self.serialized_name_to_node[a.value] + else: + raise SerializeError(f"Unhandled argument {inp}") + return list(map(deserialize_optional_tensor_args, value)) + else: + raise SerializeError(f"Unhandled argument {inp}") + elif typ_ == "as_custom_obj": + if inp.as_custom_obj.name in self.serialized_name_to_node: + # Custom object has been lifted as an input + return self.serialized_name_to_node[inp.as_custom_obj.name] + return self.constants[inp.as_custom_obj.name] + elif typ_ == "as_operator": + return self.deserialize_operator(inp.as_operator) + else: + raise SerializeError(f"Unhandled argument {inp}") + + def deserialize_sym_argument(self, sym_arg): + if isinstance(sym_arg, SymIntArgument): + if sym_arg.type == "as_int": + return sym_arg.as_int + elif sym_arg.type == "as_name": + return self.serialized_name_to_node[sym_arg.as_name] + elif isinstance(sym_arg, SymBoolArgument): + if sym_arg.type == "as_bool": + return sym_arg.as_bool + elif sym_arg.type == "as_name": + return self.serialized_name_to_node[sym_arg.as_name] + raise SerializeError(f"Unknown symbolic argument type: {sym_arg}") + + def deserialize_sym_op_outputs(self, serialized_node: Node, fx_node: torch.fx.Node): + self.sync_fx_node(serialized_node.outputs[0].value.as_name, fx_node) + + def deserialize_outputs(self, serialized_node: 
Node, fx_node: torch.fx.Node): + # Check single value return + if len(serialized_node.outputs) == 0: + return + if ( + len(serialized_node.outputs) == 1 + and serialized_node.outputs[0].type == "as_tensor" + ): + self.sync_fx_node(serialized_node.outputs[0].as_tensor.name, fx_node) + return + elif ( + len(serialized_node.outputs) == 1 and + isinstance(serialized_node.outputs[0].value, (SymIntArgument, SymBoolArgument)) + ): + self.sync_fx_node(serialized_node.outputs[0].value.as_name, fx_node) + return + + self.deserialize_multiple_outputs(serialized_node, fx_node) + + def deserialize_multiple_outputs(self, serialized_node: Node, fx_node: torch.fx.Node) -> None: + deserialized_metadata = self.deserialize_metadata(serialized_node.metadata) + + def generate_getitem(meta_val, fx_node: torch.fx.Node, arg: Union[TensorArgument, SymIntArgument], idx: int): + if isinstance(arg, TensorArgument): + name = arg.name + elif isinstance(arg, SymIntArgument): + name = arg.as_name + else: + raise AssertionError(f"generate_getitem got unknown argument type {type(arg)}") + individual_output = self.graph.create_node( + "call_function", + operator.getitem, + (fx_node, idx), + name=name, + ) + self.sync_fx_node(name, individual_output) + meta_val.append(self.serialized_name_to_meta[name]) + # The derived `getitem` nodes should have the same stacktrace as the + # original `fx_node` + individual_output.meta.update(deserialized_metadata) + + def generate_getitems(meta_val, fx_node: torch.fx.Node, args): + for idx, arg in enumerate(args): + if isinstance(arg, Argument): + arg = arg.value + if isinstance(arg, (TensorArgument, SymIntArgument)): + generate_getitem(meta_val, fx_node, arg, idx) + elif isinstance(arg, (list, tuple)): + list_output = self.graph.create_node( + "call_function", + operator.getitem, + (fx_node, idx), + ) + meta_val.append([]) + generate_getitems(meta_val[-1], list_output, arg) + list_output.meta.update(deserialized_metadata) + list_output.meta['val'] = meta_val[-1] + else: + raise NotImplementedError(f"Unimplemented node output type: {arg}") + + # Convert multiple return types to FX format. + # In FX, each node only returns one value. So in order to represent + # multiple return values, we have to emit a `getitem` node for each + # return value. 
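+        # Nested list outputs additionally get one `getitem` node for the list
+        # itself, whose elements are then unpacked recursively (see
+        # generate_getitems above).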
+ # This performs the inverse mapping of the `serialize_outputs` call in + # serialization, see [NOTE: Multiple outputs] + meta_val: List[Any] = [] + if len(serialized_node.outputs) == 1: + assert isinstance(serialized_node.outputs[0].value, list) + assert isinstance(serialized_node.outputs[0].value[0], TensorArgument) + generate_getitems(meta_val, fx_node, serialized_node.outputs[0].as_tensors) + else: + generate_getitems(meta_val, fx_node, serialized_node.outputs) + + # also update the metaval for `fx_node` to be a list(meta) + fx_node.meta["val"] = tuple(meta_val) + self.serialized_name_to_node[fx_node.name] = fx_node + + def deserialize_metadata(self, metadata: Dict[str, str]) -> Dict[str, Any]: + ret: Dict[str, Any] = {} + if stack_trace := metadata.get("stack_trace"): + ret["stack_trace"] = stack_trace + + def deserialize_meta_func(serialized_target: str): + module = None + if serialized_target.startswith("torch.nn"): + module = torch.nn + serialized_target_names = serialized_target.split(".")[2:] + elif serialized_target.startswith("torch"): + module = torch + serialized_target_names = serialized_target.split(".")[1:] + else: + return self.deserialize_operator(serialized_target) + + target = module + for name in serialized_target_names: + if not hasattr(target, name): + return serialized_target + else: + target = getattr(target, name) + return target + + if nn_module_stack_str := metadata.get("nn_module_stack"): + # Originally serialized to "key,orig_path,type_str" + def import_nn_module_stack(key, path, ty): + return key, (path, ty) + nn_module_stack = dict( + import_nn_module_stack(*item.split(",")) + for item in nn_module_stack_str.split(ST_DELIMITER) + ) + ret["nn_module_stack"] = nn_module_stack + + if source_fn_st_str := metadata.get("source_fn_stack"): + # Originally serializes to "fx_node_name,op_str" + source_fn_st = [] + for source_fn_str in source_fn_st_str.split(ST_DELIMITER): + name, target_str = source_fn_str.split(",") + source_fn_st.append((name, deserialize_meta_func(target_str))) + ret["source_fn_stack"] = source_fn_st + return ret + + def deserialize_argument_spec(self, x: Argument) -> ep.ArgumentSpec: + if x.type == "as_tensor": + return ep.TensorArgument(name=x.as_tensor.name) + elif x.type == "as_sym_int": + return ep.SymIntArgument(name=x.as_sym_int.as_name) + else: + return ep.ConstantArgument(value=self.deserialize_input(x)) + + def deserialize_module_call_signature(self, module_call_signature: ModuleCallSignature) -> ep.ModuleCallSignature: + return ep.ModuleCallSignature( + inputs=[self.deserialize_argument_spec(x) for x in module_call_signature.inputs], + outputs=[self.deserialize_argument_spec(x) for x in module_call_signature.outputs], + in_spec=treespec_loads(module_call_signature.in_spec), + out_spec=treespec_loads(module_call_signature.out_spec), + ) + + def deserialize_module_call_graph(self, module_call_graph: List[ModuleCallEntry]) -> List[ep.ModuleCallEntry]: + return [ + ep.ModuleCallEntry( + fqn=entry.fqn, + signature=self.deserialize_module_call_signature(entry.signature) if entry.signature else None, + ) for entry in module_call_graph + ] + + +class ExportedProgramDeserializer: + def __init__(self, expected_opset_version: Optional[Dict[str, int]] = None): + self.expected_opset_version: Dict[str, int] = {} + if expected_opset_version: + self.expected_opset_version.update(expected_opset_version) + if "aten" not in self.expected_opset_version: + self.expected_opset_version["aten"] = torch._C._get_max_operator_version() + + def 
deserialize_range_constraints( + self, + symbol_name_to_range: Dict[str, symbolic_shapes.ValueRanges], + symbol_name_to_symbol: Dict[str, sympy.Symbol], + ) -> Dict[sympy.Symbol, ValueRanges]: + range_constraints = {} + for k, v in symbol_name_to_range.items(): + if symbol := symbol_name_to_symbol.get(k): + range_constraints[symbol] = v # type: ignore[arg-type] + else: + log.warning(f"Symbol {k} did not appear in the graph that was deserialized") # noqa: G004 + return range_constraints + + def deserialize( + self, serialized_artifact: SerializedArtifact + ) -> ep.ExportedProgram: + assert isinstance(serialized_artifact.exported_program, ExportedProgram) + + if serialized_artifact.exported_program.schema_version.major != SCHEMA_VERSION[0]: + raise SerializeError( + f"Serialized schema version {serialized_artifact.exported_program.schema_version} " + f"does not match our current schema version {SCHEMA_VERSION}." + ) + + symbol_name_to_range = { + k: symbolic_shapes.ValueRanges(_int_to_sympy_int(v.min_val), _int_to_sympy_int(v.max_val)) + for k, v in serialized_artifact.exported_program.range_constraints.items() + } + res = ( + GraphModuleDeserializer() + .deserialize( + serialized_artifact.exported_program.graph_module, + serialized_artifact.state_dict, + serialized_artifact.constants, + symbol_name_to_range, + ) + ) + range_constraints = self.deserialize_range_constraints( + symbol_name_to_range, res.names_to_symbols, + ) + model_opset_version: Optional[Dict[str, int]] = serialized_artifact.exported_program.opset_version + self._validate_model_opset_version(model_opset_version) + + upgrader = GraphModuleOpUpgrader(self.expected_opset_version, model_opset_version) + + exported_program = ep.ExportedProgram( + root=res.graph_module, + graph=res.graph_module.graph, + graph_signature=res.signature, + state_dict=res.state_dict, # type: ignore[arg-type] + range_constraints=range_constraints, + module_call_graph=res.module_call_graph, + example_inputs=None, + verifier=load_verifier(serialized_artifact.exported_program.dialect), + constants=res.constants, + ) + return upgrader.upgrade(exported_program) + + def _validate_model_opset_version(self, model_opset_version: Optional[Dict[str, int]]): + """Compare model_opset_version with expected_opset_version and raise error if we can't resolve the version + difference. + E.g., model_opset_version = {"aten": 3, "custom": 4} + expected_opset_version = {"aten": 4, "custom": 4} + This means we can use an upgrader for ATen to reconcile the deserialized model. + + The logic of this method: + + For common op namespaces: + 1. if model version < expected version, this case can be handled by upgraders. + 2. if model version > expected version, we need downgraders but not implemented yet. + 3. if model version == expected version, we don't need extra handling. + + For op namespace only in model_opset_version, we should give a warning because it is missing from + expected_opset_version. 
+ """ + if not model_opset_version: + raise RuntimeError("Serialized model should have opset version.") + common_namespaces = {key for key in model_opset_version if key in self.expected_opset_version} + for namespace in common_namespaces: + assert ( + isinstance(model_version := model_opset_version[namespace], int) + ), f"model_opset_version value should be int, got {model_opset_version[namespace]}" + + assert ( + isinstance(compiler_version := self.expected_opset_version[namespace], int) + ), f"expected_opset_version value should be int, got {self.expected_opset_version[namespace]}" + + # TODO(larryliu0820): Add support for upgrader & downgrader + if model_version != compiler_version: + raise NotImplementedError( + f"Model opset version {model_opset_version} doesn't match to compiler opset version " + f"{self.expected_opset_version}! Upgrader/downgrader is not implemented yet." + ) + for namespace in model_opset_version: + if namespace in common_namespaces: + continue + log.warning("Compiler doesn't have a version table for op namespace: {ns}. ", extra={"ns": namespace}) + + +class EnumEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, Enum): + return obj.value + if isinstance(obj, bytes): + return base64.b64encode(obj).decode('utf-8') + return super().default(obj) + + +def _dataclass_to_dict(obj): + if isinstance(obj, _Union): + return {obj.type: _dataclass_to_dict(obj.value)} + elif dataclasses.is_dataclass(obj): + return { + f.name: _dataclass_to_dict(getattr(obj, f.name)) + for f in dataclasses.fields(obj) + if not (f.default is None and getattr(obj, f.name) is None) + } + elif isinstance(obj, list): + return [_dataclass_to_dict(x) for x in obj] + elif isinstance(obj, tuple): + return tuple(_dataclass_to_dict(x) for x in obj) + elif isinstance(obj, dict): + return {k: _dataclass_to_dict(v) for k, v in obj.items()} + else: + return obj + + +def serialize( + exported_program: ep.ExportedProgram, + opset_version: Optional[Dict[str, int]] = None, +) -> SerializedArtifact: + serialized_artifact = ( + ExportedProgramSerializer(opset_version).serialize(exported_program) + ) + assert isinstance(serialized_artifact.exported_program, ExportedProgram) + + + json_program = json.dumps( + _dataclass_to_dict(serialized_artifact.exported_program), cls=EnumEncoder + ) + json_bytes = json_program.encode('utf-8') + artifact = SerializedArtifact( + json_bytes, + serialized_artifact.state_dict, + serialized_artifact.constants + ) + return artifact + + +def _dict_to_dataclass(cls, data): + assert not isinstance(cls, str), f"Unresolved class type: '{cls}'." 
+ if typing.get_origin(cls) == typing.Union and type(None) in typing.get_args(cls): + if data is None: + return None + ty_args = typing.get_args(cls) + assert len(ty_args) == 2 + return _dict_to_dataclass(ty_args[0], data) + elif isinstance(cls, type) and issubclass(cls, _Union): + assert isinstance(data, dict) + assert len(data) == 1 + _type = next(iter(data.keys())) + _value = next(iter(data.values())) + assert isinstance(_type, str) + field_type = cls.__annotations__[_type] + return cls.create(**{_type: _dict_to_dataclass(field_type, _value)}) + elif dataclasses.is_dataclass(cls): + obj = cls(**data) # type: ignore[assignment] + type_hints = typing.get_type_hints(cls) + for f in dataclasses.fields(cls): + name = f.name + new_field_obj = _dict_to_dataclass(type_hints[name], getattr(obj, name)) + setattr(obj, name, new_field_obj) + return obj + elif isinstance(data, list): + if len(data) == 0: + return data + d_type = typing.get_args(cls)[0] + return [ + _dict_to_dataclass(d_type, d) + for d in data + ] + elif isinstance(data, dict): + v_type = typing.get_args(cls)[1] + return { + k: _dict_to_dataclass(v_type, v) + for k, v in data.items() + } + return data + + +def deserialize( + artifact: SerializedArtifact, + expected_opset_version: Optional[Dict[str, int]] = None, +) -> ep.ExportedProgram: + assert isinstance(artifact.exported_program, bytes) + exported_program_str = artifact.exported_program.decode('utf-8') + exported_program_dict = json.loads(exported_program_str) + serialized_exported_program = _dict_to_dataclass(ExportedProgram, exported_program_dict) + return ( + ExportedProgramDeserializer(expected_opset_version) + .deserialize( + SerializedArtifact( + serialized_exported_program, + artifact.state_dict, + artifact.constants + ) + ) + ) + + +def _canonicalize_graph(sorted_inputs, sorted_outputs, graph) -> Tuple[Graph, Dict[str, str]]: + def _get_argument(a: Argument): + if a.type == "as_none": + return None + elif a.type == "as_tensor": + return a.as_tensor + elif a.type == "as_tensors": + return a.as_tensors + elif a.type == "as_int": + return None + elif a.type == "as_ints": + return None + elif a.type == "as_float": + return None + elif a.type == "as_floats": + return None + elif a.type == "as_string": + return None + elif a.type == "as_strings": + return None + elif a.type == "as_sym_int": + return a.as_sym_int + elif a.type == "as_sym_ints": + return a.as_sym_ints + elif a.type == "as_scalar_type": + return None + elif a.type == "as_memory_format": + return None + elif a.type == "as_layout": + return None + elif a.type == "as_device": + return None + elif a.type == "as_bool": + return None + elif a.type == "as_bools": + return None + elif a.type == "as_sym_bool": + return a.as_sym_bool + elif a.type == "as_sym_bools": + return a.as_sym_bools + elif a.type == "as_graph": + return None + elif a.type == "as_optional_tensors": + return a.as_optional_tensors + elif a.type == "as_custom_obj": + return None + elif a.type == "as_operator": + return None + else: + raise AssertionError(f"Unknown input type to the ExportedProgram: {a}") + + # Stage 1: Reorder named items. 
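+    # Nodes are re-ordered with a deterministic topological sort: a node becomes
+    # a candidate once all of its inputs are defined, and ties are broken via a
+    # heap keyed on (target, argument ranks), so the result does not depend on
+    # the original value names.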
+ def for_args(f, a): + assert isinstance(a, Argument) + pytree.tree_map(f, _get_argument(a)) + + def sort_nodes(nodes): + @dataclass + class Edges: + outs: List[int] + ins: int + + graph_inputs: Set[str] = set() + def_table: Dict[str, int] = {} + edges: Dict[int, Edges] = {} + candidates: List[Tuple[str, List[Tuple[str, List[int]]], int]] = [] + rank: Dict[str, int] = {} + ret: List[Node] = [] + + def get_name(a) -> Optional[str]: + if a is None: + return None + if isinstance(a, TensorArgument): + return a.name + elif isinstance(a, (SymIntArgument, SymBoolArgument)): + if a.type == "as_name": + return a.as_name + elif a.type in ("as_int", "as_bool"): + return None + else: + raise AssertionError(f"Unknown argument type: {a}") + elif isinstance(a, OptionalTensorArgument): + if a.type == "as_tensor": + assert isinstance(a.as_tensor, str) + return a.as_tensor + elif a.type == "as_none": + return None + else: + raise AssertionError(f"Unknown optional tensor type: {a}") + else: + raise AssertionError(f"Unknown argument type: {a}") + + for i in sorted_inputs: + def add_input(a): + if s := get_name(a): + graph_inputs.add(s) + + for_args(add_input , i) + + for idx, node in enumerate(nodes): + def add_def(a): + if s := get_name(a): + assert s not in def_table + def_table[s] = idx + + for o in node.outputs: + for_args(add_def, o) + + edges[idx] = Edges([], 0) + + for idx, user in enumerate(nodes): + def add_edge(a): + if s := get_name(a): + if s not in def_table: + assert s in graph_inputs + return + src = def_table[s] + edges[src].outs.append(idx) + edges[idx].ins += 1 + + for i in user.inputs: + for_args(add_edge, i.arg) + + def add_rank(a): + if s := get_name(a): + assert s not in rank + rank[s] = len(rank) + + def get_rank(a): + if s := get_name(a): + return rank[s] + else: + return -1 + + for i in sorted_inputs: + for_args(add_rank, i) + + def add_candidate(idx: int): + def get_ranks(i): + ranks = [] + for_args(lambda x: ranks.append(get_rank(x)), i) + return ranks + node = nodes[idx] + args_rank = [(a.name, get_ranks(a.arg)) for a in node.inputs] + heapq.heappush(candidates, (node.target, args_rank, idx)) + + for idx, e in edges.items(): + if e.ins == 0: + add_candidate(idx) + + while len(candidates) > 0: + _, _, idx = heapq.heappop(candidates) + node = nodes[idx] + for o in node.outputs: + for_args(add_rank, o) + ret.append(node) + assert idx in edges + for user in edges[idx].outs: + e = edges[user] + assert e.ins > 0 + e.ins -= 1 + if e.ins == 0: + add_candidate(user) + edges[idx].outs.clear() + + return ret + + sorted_nodes = sort_nodes(graph.nodes) + assert len(sorted_nodes) == len(graph.nodes) + + # Stage 2: Rename nodes. 
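+    # Values are renamed to "_0", "_1", ... in definition order of the sorted
+    # graph; the resulting mapping is returned so callers (see canonicalize
+    # below) can update the surrounding graph signature to match.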
+ name_table: Dict[str, str] = {} + + def rename_def(a): + def _rename(arg_name, values): + new_name = f"_{len(name_table)}" + assert arg_name not in name_table + name_table[arg_name] = new_name + assert arg_name in values + values[new_name] = values.pop(arg_name) + return new_name + + if a is None: + return + if isinstance(a, TensorArgument): + a.name = _rename(a.name, graph.tensor_values) + elif isinstance(a, SymIntArgument): + if a.type == "as_name": + a.as_name = _rename(a.as_name, graph.sym_int_values) + elif isinstance(a, SymBoolArgument): + if a.type == "as_name": + a.as_name = _rename(a.as_name, graph.sym_bool_values) + else: + raise AssertionError(f"Unknown argument type: {a}") + + def replace_use(a): + if a is None: + return + if isinstance(a, TensorArgument): + a.name = name_table.get(a.name, a.name) + elif isinstance(a, SymIntArgument): + if a.type == "as_name": + a.as_name = name_table.get(a.as_name, a.as_name) + elif isinstance(a, SymBoolArgument): + if a.type == "as_name": + a.as_name = name_table.get(a.as_name, a.as_name) + elif isinstance(a, OptionalTensorArgument): + if a.type == "as_tensor": + assert isinstance(a.as_tensor, str) + a.as_tensor = name_table.get(a.as_tensor, a.as_tensor) + else: + raise AssertionError(f"Unknown argument type: {a}") + + for i in sorted_inputs: + for_args(rename_def, i) + + for n in sorted_nodes: + for o in n.outputs: + for_args(rename_def, o) + + for n in sorted_nodes: + for i in n.inputs: + for_args(replace_use, i.arg) + + for o in sorted_outputs: + for_args(replace_use, o) + + # Stage 3: Remove unstable fields. + for n in sorted_nodes: + n.metadata.clear() + + # Stage 4: Aggregate values. + sorted_tensor_values = dict(sorted(graph.tensor_values.items(), key=lambda x: x[0])) + sorted_sym_int_values = dict(sorted(graph.sym_int_values.items(), key=lambda x: x[0])) + sorted_sym_bool_values = dict(sorted(graph.sym_bool_values.items(), key=lambda x: x[0])) + + # Stage 5: Recurse in subgraphs. + counter = 0 + for node in sorted_nodes: + for i in node.inputs: + a = i.arg + if a.type == "as_graph": + a.as_graph.graph = _canonicalize_graph( + a.as_graph.graph.inputs, + a.as_graph.graph.outputs, + a.as_graph.graph + ) + a.as_graph.name = f"_g{counter}" + counter += 1 + + graph = Graph( + inputs=sorted_inputs, + outputs=sorted_outputs, + nodes=sorted_nodes, + tensor_values=sorted_tensor_values, + sym_int_values=sorted_sym_int_values, + sym_bool_values=sorted_sym_bool_values, + is_single_tensor_return=graph.is_single_tensor_return, + ) + return graph, name_table + + +def canonicalize(ep: ExportedProgram) -> ExportedProgram: + """ + Normalize a serialized ExportedProgram, so that different eager program which + shares the same semantics can get a single representation on disk. + + This function canonicalizes an ExportedProgram by: + + 1. Sorting nodes in topological order. + 2. Rename nodes to have unique names. + 3. Remove unstable fields. + 4. Aggregate the above program fields. + 5. Recurse in subgraphs. + + Args: + ep (ExportedProgram): The ExportedProgram to canonicalize. + + Returns: + ExportedProgram: The canonicalized exported program. 
+ """ + ep = copy.deepcopy(ep) + + opset_version = dict(sorted(ep.opset_version.items(), key=lambda x: x[0])) + range_constraints = dict(sorted(ep.range_constraints.items(), key=lambda x: x[0])) + module_call_graph = sorted(ep.graph_module.module_call_graph, key=lambda x: x.fqn) + signature = ep.graph_module.signature + graph = ep.graph_module.graph + + assert len(graph.inputs) == len(signature.input_specs) + assert len(graph.outputs) == len(signature.output_specs) + + def rank_input(inp) -> Tuple[int, Optional[str], int]: + idx, (arg, spec) = inp + assert isinstance(spec, InputSpec) + if spec.type == "user_input": + return 5, None, idx + elif spec.type == "parameter": + return 1, spec.parameter.parameter_name, idx + elif spec.type == "buffer": + return 2, spec.buffer.buffer_name, idx + elif spec.type == "tensor_constant": + return 3, spec.tensor_constant.tensor_constant_name, idx + elif spec.type == "custom_obj": + return 4, spec.custom_obj.custom_obj_name, idx + else: + raise AssertionError(f"Unknown input type: {spec}") + + def rank_output(out) -> Tuple[int, Optional[str], int]: + idx, (arg, spec) = out + assert isinstance(spec, OutputSpec) + if spec.type == "user_output": + return 3, None, idx + elif spec.type == "loss_output": + return 3, None, idx + elif spec.type == "buffer_mutation": + return 1, spec.buffer_mutation.buffer_name, idx + elif spec.type == "gradient_to_parameter": + return 4, spec.gradient_to_parameter.parameter_name, idx + elif spec.type == "gradient_to_user_input": + return 5, None, idx + elif spec.type == "user_input_mutation": + return 2, None, idx + else: + raise AssertionError(f"Unknown output type: {spec}") + + sorted_ins = sorted(enumerate(zip(graph.inputs, signature.input_specs)), key=rank_input) + sorted_inputs, input_specs = zip(*(i for idx, i in sorted_ins)) # type: ignore[assignment] + + sorted_outs = sorted(enumerate(zip(graph.outputs, signature.output_specs)), key=rank_output) + sorted_outputs, output_specs = zip(*(i for idx, i in sorted_outs)) # type: ignore[assignment] + + sorted_graph, replace_table = _canonicalize_graph(sorted_inputs, sorted_outputs, graph) + + def replace_input(inp): + assert isinstance(spec, InputSpec) + if spec.type == "user_input": + arg = spec.user_input.arg + if arg.type == "as_tensor": + t = arg.as_tensor + t.name = replace_table[t.name] + elif arg.type == "as_sym_int": + s = arg.as_sym_int + if s.type == "as_name": + s.as_name = replace_table[s.as_name] + elif s.type == "as_int": + pass + else: + raise AssertionError(f"Unknown sym_int type: {s}") + elif arg.type in ("as_none", "as_int", "as_float", "as_string", "as_custom_obj"): + return + else: + raise AssertionError(f"Unknown input type: {arg}") + elif spec.type == "parameter": + t = spec.parameter.arg + t.name = replace_table[t.name] + elif spec.type == "buffer": + t = spec.buffer.arg + t.name = replace_table[t.name] + elif spec.type == "tensor_constant": + t = spec.tensor_constant.arg + t.name = replace_table[t.name] + elif spec.type == "custom_obj": + return + else: + raise AssertionError(f"Unknown input type: {spec}") + + def replace_output(out): + assert isinstance(spec, OutputSpec) + if spec.type == "user_output": + arg = spec.user_output.arg + if arg.type == "as_tensor": + t = arg.as_tensor + t.name = replace_table[t.name] + elif arg.type == "as_sym_int": + s = arg.as_sym_int + if s.type == "as_name": + s.as_name = replace_table[s.as_name] + elif s.type == "as_int": + pass + else: + raise AssertionError(f"Unknown sym_int type: {s}") + elif arg.type in ("as_none", 
"as_int", "as_float", "as_string"): + return + else: + raise AssertionError(f"Unknown input type: {arg}") + elif spec.type == "loss_output": + t = spec.loss_output.arg + t.name = replace_table[t.name] + elif spec.type == "buffer_mutation": + t = spec.buffer_mutation.arg + t.name = replace_table[t.name] + elif spec.type == "gradient_to_parameter": + t = spec.gradient_to_parameter.arg + t.name = replace_table[t.name] + elif spec.type == "gradient_to_user_input": + g = spec.gradient_to_user_input + g.arg.name = replace_table[g.arg.name] + g.user_input_name = replace_table[g.user_input_name] + elif spec.type == "user_input_mutation": + u = spec.user_input_mutation + u.arg.name = replace_table[u.arg.name] + u.user_input_name = replace_table[u.user_input_name] + else: + raise AssertionError(f"Unknown output type: {spec}") + + for spec in input_specs: + replace_input(spec) + + for spec in output_specs: + replace_output(spec) + + return ExportedProgram( + graph_module=GraphModule( + graph=sorted_graph, + signature=GraphSignature( + input_specs=list(input_specs), + output_specs=list(output_specs), + ), + module_call_graph=module_call_graph, + ), + opset_version=opset_version, + range_constraints=range_constraints, + schema_version=ep.schema_version, + dialect=ep.dialect, + ) diff --git a/MLPY/Lib/site-packages/torch/_export/serde/union.py b/MLPY/Lib/site-packages/torch/_export/serde/union.py new file mode 100644 index 0000000000000000000000000000000000000000..57a47a712c2f971b9a474e230f8b54547e2acad3 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/serde/union.py @@ -0,0 +1,69 @@ +import functools +from dataclasses import fields +from typing import Hashable, Set + + +class _UnionTag(str): + _cls: Hashable + + @staticmethod + def create(t, cls): + tag = _UnionTag(t) + assert not hasattr(tag, "_cls") + tag._cls = cls + return tag + + def __eq__(self, cmp) -> bool: + assert isinstance(cmp, str) + other = str(cmp) + assert other in _get_field_names( + self._cls + ), f"{other} is not a valid tag for {self._cls}. Available tags: {_get_field_names(self._cls)}" + return str(self) == other + + def __hash__(self): + return hash(str(self)) + + +@functools.lru_cache(maxsize=None) +def _get_field_names(cls) -> Set[str]: + return {f.name for f in fields(cls)} + + +class _Union: + _type: _UnionTag + + @classmethod + def create(cls, **kwargs): + assert len(kwargs) == 1 + obj = cls(**{**{f.name: None for f in fields(cls)}, **kwargs}) # type: ignore[arg-type] + obj._type = _UnionTag.create(next(iter(kwargs.keys())), cls) + return obj + + def __post_init__(self): + assert not any(f.name in ("type", "_type", "create", "value") for f in fields(self)) # type: ignore[arg-type, misc] + + @property + def type(self) -> str: + try: + return self._type + except AttributeError as e: + raise RuntimeError( + f"Please use {type(self).__name__}.create to instantiate the union type." 
+ ) from e + + @property + def value(self): + return getattr(self, self.type) + + def __getattribute__(self, name): + attr = super().__getattribute__(name) + if attr is None and name in _get_field_names(type(self)) and name != self.type: # type: ignore[arg-type] + raise AttributeError(f"Field {name} is not set.") + return attr + + def __str__(self): + return self.__repr__() + + def __repr__(self): + return f"{type(self).__name__}({self.type}={getattr(self, self.type)})" diff --git a/MLPY/Lib/site-packages/torch/_export/serde/upgrade.py b/MLPY/Lib/site-packages/torch/_export/serde/upgrade.py new file mode 100644 index 0000000000000000000000000000000000000000..121edbe29b8aa20124a28109e35b6e991aa31b40 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/serde/upgrade.py @@ -0,0 +1,201 @@ +import logging +from collections import defaultdict +from typing import Tuple, Dict, Optional, List + +import torch +from torch.export import export +from torch._export.pass_base import _ExportPassBaseDeprecatedDoNotUse +from torch._export.pass_infra.node_metadata import NodeMetadata +from torch._export.pass_infra.proxy_value import ProxyValue +from torch._subclasses import FakeTensor +from torch.fx.node import Target, Argument +from torch.library import Library +from torch.utils._pytree import tree_unflatten +import torch._export.exported_program as ep +import re + +lib = Library("aten", "FRAGMENT") +impl_lib = Library("aten", "IMPL") + +log = logging.getLogger(__name__) + + +def get_target_version(versioned_upgrader_name: str) -> int: + """div_Scalar_0_3 is the name of the upgrader, meaning it applies to div.Scalar of version 0 to 3 and is + upgrading to version 4.""" + if not re.match("^.*_[0-9]+_[0-9]+$", versioned_upgrader_name): + raise RuntimeError(f"Upgrader name {versioned_upgrader_name} is invalid") + + return int(versioned_upgrader_name.split('_')[-1]) + 1 + + +def get_upgraders() -> Dict[str, Tuple[str, str]]: + """Getting upgraders entry map and operator version map and merge them into one dict.""" + upgraders = torch._C._get_upgraders_entry_map() + op_version_map = torch._C._get_operator_version_map() + output: Dict[str, Tuple[str, str]] = defaultdict(tuple) # type: ignore[arg-type] + for opname, entry_list in op_version_map.items(): + if not entry_list: + raise RuntimeError(f"Op version map has an empty entry for opname {opname}") + entry = entry_list[0] + old_schema = entry.old_schema + upgrader_name = entry.upgrader_name + upgrader_str = upgraders.get(upgrader_name, None) + if not upgrader_str: + raise RuntimeError(f"Can't find upgrader for op {opname} and upgrader name {upgrader_name}") + output[upgrader_name] = (old_schema, upgrader_str) + return output + + +class GraphModuleOpUpgrader: + """This upgrader is able to upgrade the old version of ops in a given GraphModule, if all upgraders are available. + To use it, retrieve upgraders from somewhere (TorchScript API or new API) and pass it into this upgrader. In + __init__() it does the following: + 1. parse the upgrader list and reorder for upgrading purpose. + 2. register old versions of operators as custom ops. + 3. prepare upgrader passes. + + In `upgrade()` API run these upgrader passes. 
+ + An example of op_upgraders input: + { + "aten::div__Scalar_0_3": ( # versioned op name + "div._Scalar(self: Tensor, other: Scalar)", # old schema + ''' + def div__Scalar_0_3(self: torch.Tensor, other) -> torch.Tensor: # upgrader in literal string + if (self.is_floating_point() or isinstance(other, float)): + return self.true_divide_(other) + return self.divide_(other, rounding_mode='trunc') + ''', + ), + }, + + Note that we require the upgrader function to be runnable in Python (which is a stricter requirement than the + original TorchScript upgrader). + """ + + class UpgraderPass(_ExportPassBaseDeprecatedDoNotUse): + def __init__(self, old_target: Target, new_target: Target): + super().__init__() + self.old_target = old_target + self.new_target = new_target + + def call_operator( + self, + op, + args: Tuple[Argument, ...], + kwargs: Dict[str, Argument], + meta: NodeMetadata, + ) -> ProxyValue: + if op == self.old_target: + return super().call_operator(self.new_target, args, kwargs, meta) + return super().call_operator(op, args, kwargs, meta) + + def __init__( + self, + compiler_opset_version: Optional[Dict[str, int]] = None, + model_opset_version: Optional[Dict[str, int]] = None, + op_upgraders: Optional[Dict[str, Tuple[str, str]]] = None, + ): + self.op_upgraders: Dict[str, Tuple[str, str]] = get_upgraders() if not op_upgraders else op_upgraders + self.compiler_opset_version = compiler_opset_version if compiler_opset_version else {} + self.model_opset_version = model_opset_version if model_opset_version else {} + self.upgrader_passes: List[GraphModuleOpUpgrader.UpgraderPass] = GraphModuleOpUpgrader._populate_passes( + self._parse_upgraders(self.op_upgraders)) + + def _parse_upgraders(self, op_upgraders: Optional[Dict[str, Tuple[str, str]]] = None) -> List[Tuple[str, str]]: + """Reorder op_upgraders by version number, return an ordered list of tuples, containing old op schema as well + as the upgrader function string literal.""" + # TODO(larryliu0820): Add support for custom ops + op_namespace = "aten" + if not op_upgraders or op_namespace not in self.model_opset_version or op_namespace not in self.compiler_opset_version: + return [] + model_ver = self.model_opset_version[op_namespace] + curr_ver = self.compiler_opset_version[op_namespace] + + # key is the target version. div__Scalar_0_3 should have a key of 4. + versioned_upgraders: Dict[int, Tuple[str, str]] = {get_target_version(name): v for name, v in + op_upgraders.items()} + target_upgraders: List[Tuple[str, str]] = [] + # we need all upgraders from model_ver + 1 to curr_ver, inclusively + for ver in range(model_ver + 1, curr_ver + 1): + if ver in versioned_upgraders: + target_upgraders.append(versioned_upgraders[ver]) + else: + # we may be able to get away with missing upgraders, if that operator is missing from given graph + # module. + log.warning("Missing an upgrader to upgrade to version {ver}.", extra={"ver": ver}) + + return target_upgraders + + @staticmethod + def _populate_passes(upgraders: List[Tuple[str, str]]) -> List[UpgraderPass]: + """Given a list of upgraders, loop through it from lower version to higher version and create passes for all + upgraders. se torch.Library API to register old ops. Op name will be + __. Register upgraders as CompositeImplicitAutograd kernels. For example: + + lib = Library("aten", "FRAGMENT") + lib.define(old_schema) + + impl_lib = Library("aten", "IMPL") + impl_lib.impl("div__Scalar_0_3", div__Scalar_0_3, "CompositeImplicitAutograd") + + @:var upgraders: a list of tuples. 
The first element of the tuple is the old schema and the second is the + upgrader function literal text. + @:return upgrader passes, order matters + """ + + upgrader_passes = [] + + def register_old_op(name: str, schema: str, impl_str: str): + """Registers an old version operator using impl_name as old op name.""" + lib.define(schema) + try: + exec(impl_str) + except Exception as e: + raise RuntimeError(f"Invalid upgrader string: {impl_str}") from e + impl_lib.impl(name, locals()[name], "CompositeImplicitAutograd") + + for (schema, upgrader_str) in upgraders: + upgrader_name = upgrader_str.split('(')[0].split(' ')[-1] + op_name = schema.split('(')[0].split("::")[-1] + schema = schema.replace(op_name, upgrader_name) + try: + register_old_op(name=upgrader_name, schema=schema, impl_str=upgrader_str) + except RuntimeError as e: + if "with the same name and overload name multiple times" in str(e): + print(f"Registering {upgrader_name} multiple times") + else: + raise RuntimeError from e + old_op_target = getattr(torch.ops.aten, upgrader_name).default + # for example, the operator instance of "aten::div" is torch.op.aten.div.default. We need to append the + # "default" at the end. + op_name, overload_name = (op_name, "default") if "." not in op_name else tuple(op_name.split(".")[:2]) + new_op_target = getattr(getattr(torch.ops.aten, op_name), overload_name) + # Note that the graph will have op names in the graph, but actually they are of old versions. + upgrader_passes.append( + GraphModuleOpUpgrader.UpgraderPass(old_target=new_op_target, new_target=old_op_target)) + + return upgrader_passes + + def upgrade(self, exported_program: ep.ExportedProgram) -> ep.ExportedProgram: + """Run each upgrader pass and then retrace to decompose it. Each upgrader pass replaces the old version of + operators with a custom operator. The custom operator contains a CompositeImplicitAutograd kernel (the + upgrading function itself). After retrace, this custom operator will be decomposed into the ops used in the + upgrader. After all passes are applied, the exported program will be upgraded to the target version.""" + if not self.upgrader_passes: + return exported_program + + args = [n.meta.get("val", None) for n in exported_program.graph.nodes if n.op == "placeholder"] + args_real_tensors = [torch.ones(tuple(arg.size()), dtype=arg.dtype) if isinstance(arg, FakeTensor) else arg for + arg in args] + assert exported_program.call_spec.in_spec is not None + args, kwargs = tree_unflatten(args_real_tensors, exported_program.call_spec.in_spec) + assert kwargs == {} + + for _pass in self.upgrader_passes: + upgraded_program = exported_program._transform_do_not_use(_pass) + # NB: we have to retrace the graph_module instead of ep because of some failure. 
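+            # Re-exporting decomposes the registered CompositeImplicitAutograd
+            # upgrader ops back into plain ATen ops at the new version.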
+ exported_program = export(upgraded_program.module(), args, kwargs) + + return exported_program diff --git a/MLPY/Lib/site-packages/torch/_export/utils.py b/MLPY/Lib/site-packages/torch/_export/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..65881b4dd625bc3a1c5b5e0de2dfb756ad70d560 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/utils.py @@ -0,0 +1,401 @@ +import dataclasses +import math +import operator +from typing import Any, Dict, Iterable, List, Optional, Tuple, Type + +import torch +from torch._subclasses.fake_tensor import FakeTensor + +from torch.export import ExportedProgram +from torch.utils._pytree import ( + _register_pytree_node, + Context, + FlattenFunc, + FromDumpableContextFn, + KeyPath, + keystr, + MappingKey, + SequenceKey, + ToDumpableContextFn, + UnflattenFunc, +) + + +def _check_input_constraints_for_graph( + input_placeholders: List[torch.fx.Node], flat_args_with_path, range_constraints +): + def get_keystr(key_path: KeyPath) -> str: + """For a given index into the flat_args, return a human readable string + describing how to access it, e.g. "*args["foo"][0].bar" + """ + # Prefix the keypath with "*args" or "**kwargs" to make it clearer where + # the arguments come from. Ultimately we ought to serialize the + # original arg names for the best error message here. + args_kwargs_key_path = key_path[0] + assert isinstance(args_kwargs_key_path, SequenceKey) + if args_kwargs_key_path.idx == 0: + return f"*args{keystr(key_path[1:])}" + else: + kwarg_key = key_path[1] + assert isinstance(kwarg_key, MappingKey) + name = str(kwarg_key)[1:-1] # get rid of the enclosed [] + return f"{name}{keystr(key_path[2:])}" + + import sympy + + from torch._export.passes.add_runtime_assertions_for_constraints_pass import ( + _convert_range_to_int, + ) + from torch.utils._sympy.solve import try_solve + + if len(flat_args_with_path) != len(input_placeholders): + raise RuntimeError( + "Unexpected number of inputs " + f"(expected {len(input_placeholders)}, got {len(flat_args_with_path)})" + ) + # NOTE: export already guarantees that the same symbol is used in metadata + # for all InputDims related by equality constraints, so we can just unify + # symbols with given input dimension values to check equality constraints. 
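+    # For example (hypothetical shapes): if two placeholders carry fake shapes (s0, 3)
+    # and (s0, 5) and the runtime args have shapes (4, 3) and (4, 5), checking the first
+    # input binds s0 -> 4 in unification_map, and the second input's dim 0 is then
+    # required to equal that binding.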
+ unification_map: "Dict[sympy.Symbol, Any]" = {} + for (key_path, arg), node in zip(flat_args_with_path, input_placeholders): + node_val = node.meta.get("val") + if isinstance(node_val, FakeTensor): + if not isinstance(arg, torch.Tensor): + raise RuntimeError( + f"Expected input at {get_keystr(key_path)} to be a tensor, but got {type(arg)}", + ) + + if len(node_val.shape) != len(arg.shape): + raise RuntimeError( + f"Unexpected number of dimensions in input at {get_keystr(key_path)}.shape " + f"(expected {node_val.shape}, got {arg.shape})" + ) + + for j, (arg_dim, node_dim) in enumerate(zip(arg.shape, node_val.shape)): + # TODO(avik): Assert the following property in the IR verifier: + # node_dim is either an int or a SymInt containing an int or a unary sympy.Expr + if ( + isinstance(node_dim, torch.SymInt) + and len(node_dim.node.expr.free_symbols) == 1 + ): + symbol = next(iter(node_dim.node.expr.free_symbols)) + if symbol in unification_map: + existing_dim = node_dim.node.expr.subs(unification_map) + if arg_dim != existing_dim: + raise RuntimeError( + f"Expected input at {get_keystr(key_path)}.shape[{j}] to be equal to " + f"{existing_dim}, but got {arg_dim}", + ) + else: + if ( + isinstance(arg_dim, torch.SymInt) + and not arg_dim.node.expr.is_number + ): + # This can happen when, say, arg is a fake tensor. + # We do not run checks on symbolic shapes of fake inputs as + # such checks can affect the shape env. + pass + else: + solution = try_solve( + sympy.Eq(node_dim.node.expr, arg_dim), symbol + ) + if solution is None: + raise RuntimeError( # noqa: TRY200 + f"Expected input {node.name}.shape[{j}] = {arg_dim} to be " + f"of the form {node_dim.node.expr}, where {symbol} is an integer" + ) + else: + unification_map[symbol] = int(solution[1]) + + if node_dim.node.expr in range_constraints: + min_val, max_val = _convert_range_to_int( + range_constraints[node_dim.node.expr] + ) + # NOTE: we allow dimensions to be 0/1 at runtime + if min_val > 2: + if arg_dim < min_val: + raise RuntimeError( + f"Expected input at {get_keystr(key_path)}.shape[{j}] to be >= " + f"{min_val}, but got {arg_dim}", + ) + if max_val < math.inf: + if arg_dim > max_val: + raise RuntimeError( + f"Expected input at {get_keystr(key_path)}.shape[{j}] to be <= " + f"{max_val}, but got {arg_dim}", + ) + else: + if arg_dim != node_dim: + raise RuntimeError( + f"Expected input at {get_keystr(key_path)}.shape[{j}] to be equal to " + f"{node_dim}, but got {arg_dim}", + ) + elif isinstance(node_val, (int, float, str)): + if type(arg) != type(node_val) or arg != node_val: + raise RuntimeError( + f"Expected input at {get_keystr(key_path)} to be equal to {node_val}, but got {arg}", + ) + + +def register_dataclass_as_pytree_node( + cls: Type[Any], + flatten_fn: Optional[FlattenFunc] = None, + unflatten_fn: Optional[UnflattenFunc] = None, + *, + serialized_type_name: Optional[str] = None, + to_dumpable_context: Optional[ToDumpableContextFn] = None, + from_dumpable_context: Optional[FromDumpableContextFn] = None, + return_none_fields: bool = False, +) -> None: + assert dataclasses.is_dataclass( + cls + ), f"Only dataclasses can be registered with this function: {cls}" + + def default_flatten_fn(obj: Any) -> Tuple[List[Any], Context]: + flattened = [] + flat_names = [] + none_names = [] + for f in dataclasses.fields(obj): + name, val = f.name, getattr(obj, f.name) + if val is not None or return_none_fields: + flattened.append(val) + flat_names.append(name) + else: + none_names.append(name) + return flattened, [flat_names, none_names] 
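+
+    # e.g. (hypothetical dataclass) for "class Point: x: Tensor; y: Optional[Tensor] = None",
+    # default_flatten_fn(Point(x=t)) returns ([t], [["x"], ["y"]]); "y" goes into none_names
+    # and is restored as None by default_unflatten_fn below, unless return_none_fields=True.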
+ + def default_unflatten_fn(values: Iterable[Any], context: Context) -> Any: + flat_names, none_names = context + return cls(**dict(zip(flat_names, values)), **dict.fromkeys(none_names)) + + flatten_fn = flatten_fn if flatten_fn is not None else default_flatten_fn + unflatten_fn = unflatten_fn if unflatten_fn is not None else default_unflatten_fn + + if (to_dumpable_context is None) ^ (from_dumpable_context is None): + raise ValueError( + f"Both to_dumpable_context and from_dumpable_context for {cls} must " + "be None or registered." + ) + + _register_pytree_node( + cls, + flatten_fn, + unflatten_fn, + serialized_type_name=serialized_type_name, + to_dumpable_context=to_dumpable_context, + from_dumpable_context=from_dumpable_context, + ) + + +def is_param(program: ExportedProgram, node: torch.fx.Node) -> bool: + """ + Checks if the given node is a parameter within the exported program + """ + + return node.name in program.graph_signature.inputs_to_parameters + + +def get_param( + program: ExportedProgram, + node: torch.fx.Node, +) -> Optional[torch.nn.Parameter]: + """ + Returns the parameter associated with the given node in the exported program. + Returns None if the node is not a parameter within the exported program + """ + + if is_param(program, node): + parameter_name = program.graph_signature.inputs_to_parameters[node.name] + return program.state_dict[parameter_name] + + return None + + +def is_buffer(program: ExportedProgram, node: torch.fx.Node) -> bool: + """ + Checks if the given node is a buffer within the exported program + """ + + return node.name in program.graph_signature.inputs_to_buffers + + +def get_buffer( + program: ExportedProgram, + node: torch.fx.Node, +) -> Optional[torch.Tensor]: + """ + Returns the buffer associated with the given node in the exported program. + Returns None if the node is not a buffer within the exported program + """ + + if is_buffer(program, node): + buffer_name = program.graph_signature.inputs_to_buffers[node.name] + if buffer_name in program.graph_signature.non_persistent_buffers: + return program.constants[buffer_name] + else: + return program.state_dict[buffer_name] + + return None + + +def is_lifted_tensor_constant( + program: ExportedProgram, + node: torch.fx.Node, +) -> bool: + """ + Checks if the given node is a lifted tensor constant within the exported program + """ + + return node.name in program.graph_signature.inputs_to_lifted_tensor_constants + + +def get_lifted_tensor_constant( + program: ExportedProgram, + node: torch.fx.Node, +) -> Optional[torch.Tensor]: + """ + Returns the lifted tensor constant associated with the given node in the exported program. + Returns None if the node is not a lifted tensor constant within the exported program + """ + + if is_lifted_tensor_constant(program, node): + lifted_tensor_name = program.graph_signature.inputs_to_lifted_tensor_constants[ + node.name + ] + return program.constants[lifted_tensor_name] + + return None + + +def sequential_split(gm: torch.fx.GraphModule, node_call_back) -> torch.fx.GraphModule: + """ + Splits the graph module into multiple submodules based on the node_call_back. + The node_call_back should return True if the node is a delimiter. Delimiter will be + the first node in the next submodule. 
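+
+    Example (hypothetical callback): node_call_back=lambda n: n.op == "call_function"
+    makes every call_function node the first node of a new partition, so each such op
+    ends up in its own submodule of the returned graph module.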
+ """ + from torch.fx.passes.split_module import split_module + + split_map = {} + split_id = 0 + for node in gm.graph.nodes: + if node_call_back(node): + split_id += 1 + split_map[node] = split_id + + new_gm = split_module( + gm, + gm, + lambda node: split_map[node], + keep_original_order=True, + keep_original_node_name=True, + ) + # Keep the codegen from original graph module to preserve e.g. pytree info. + new_gm.graph._codegen = gm.graph._codegen + new_gm.recompile() + return new_gm + + +def nodes_filter(nodes: List[torch.fx.Node], node_call_back) -> List[torch.fx.Node]: + """Returns the nodes that match the node_call_back as a list.""" + return [node for node in nodes if node_call_back(node)] + + +def nodes_first( + nodes: List[torch.fx.Node], node_call_back=None +) -> Optional[torch.fx.Node]: + """ + Returns the first node that matches the node_call_back. If no node matches, returns None. + When node_call_back is None, returns the first node in the node list. + """ + ret = nodes_filter(nodes, node_call_back if node_call_back else lambda node: True) + if len(ret) > 0: + return ret[0] + return None + + +def nodes_count(nodes: List[torch.fx.Node], node_call_back) -> int: + """Returns the number of nodes that match the node_call_back.""" + return len(nodes_filter(nodes, node_call_back)) + + +def nodes_map(nodes: List[torch.fx.Node], node_call_back) -> List[torch.fx.Node]: + """ + Sequentially visit the nodes list and invoke node_call_back on each element. + Returns the nodes list after the node_call_back is invoked on each element. + """ + for node in nodes: + node_call_back(node) + return nodes + + +def node_replace_( + old_node: torch.fx.Node, new_node: torch.fx.Node, delete_old: bool = False +) -> None: + """ + Replace all uses of old_node with new_node. + """ + old_node.replace_all_uses_with(new_node) + if delete_old: + old_node.users.clear() + old_node.graph.erase_node(old_node) + + +def node_inline_(call_mod_node: torch.fx.Node) -> None: + """ + Inline the submodule of the given node into the parent module. + Note: we only support the case where submodule takes tensors inputs. + """ + assert call_mod_node.op == "call_module" + gm = call_mod_node.graph.owning_module + + assert isinstance(call_mod_node.target, str) + sub_gm = getattr(gm, call_mod_node.target) + + phs = (node for node in sub_gm.graph.nodes if node.op == "placeholder") + body = ( + node for node in sub_gm.graph.nodes if node.op not in ("placeholder", "output") + ) + output = [node for node in sub_gm.graph.nodes if node.op == "output"] + + for ph, arg in zip(phs, call_mod_node.args): + assert isinstance(arg, torch.fx.Node) + node_replace_(ph, arg, delete_old=True) + + with gm.graph.inserting_before(call_mod_node): + for node in body: + new_node = gm.graph.node_copy(node) + node_replace_(node, new_node, delete_old=True) + + if len(output) > 0: + assert len(output) == 1 and len(output[0].args) == 1 + new_output = output[0].args[0] + + if isinstance(new_output, torch.fx.Node): + node_replace_(call_mod_node, new_output, delete_old=True) + elif isinstance(new_output, (list, tuple)): + # Inline the get_item calls for the output node. 
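+            # e.g. if the submodule returned (a, b), the parent graph accesses them as
+            # operator.getitem(call_mod_node, 0) and operator.getitem(call_mod_node, 1);
+            # each such getitem user is rewired below to the matching inlined output node
+            # (new_output[idx]) and the call_module node itself is erased.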
+ get_item_users = nodes_filter( + list(call_mod_node.users.keys()), + lambda node: node.op == "call_function" + and node.target == operator.getitem, + ) + # get_item_node.args[1] is the idx referring to new_output[idx] + nodes_map( + get_item_users, + lambda get_item_node: node_replace_( + get_item_node, + new_output[get_item_node.args[1]], + delete_old=True, + ), + ) + call_mod_node.graph.erase_node(call_mod_node) + else: + raise NotImplementedError( + f"Unsupported output type {type(new_output)}. Expect it to be a Node or a list/tuple of Nodes." + ) + else: + call_mod_node.graph.erase_node(call_mod_node) + + gm.delete_all_unused_submodules() + gm.recompile() + return gm diff --git a/MLPY/Lib/site-packages/torch/_export/verifier.py b/MLPY/Lib/site-packages/torch/_export/verifier.py new file mode 100644 index 0000000000000000000000000000000000000000..2f18f5f8d9b5abca6dae71b57caf1c3c72079c31 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/verifier.py @@ -0,0 +1,416 @@ +import inspect +import math +import operator +from collections.abc import Iterable +from typing import Any, Dict, final, List, Optional, Tuple, Type + +import torch +from torch._ops import HigherOrderOperator, OpOverload +from torch._subclasses.fake_tensor import FakeTensor +from torch.export.exported_program import ExportedProgram +from torch.export.graph_signature import ( + CustomObjArgument, + InputKind, + SymIntArgument, + TensorArgument, +) +from torch.fx import GraphModule +from torch.fx.experimental.symbolic_shapes import SymBool, SymFloat, SymInt + + +class SpecViolationError(Exception): + pass + + +def is_functional(op: OpOverload) -> bool: + return not op._schema.is_mutable + + +def _check_has_fake_tensor(node: torch.fx.Node) -> None: + # TODO(angelayi): remove this in favor of _check_val + return _check_val(node) + + +def _check_val(node: torch.fx.Node) -> None: + def _check_correct_val(val): + if val is None: + return True + elif isinstance(val, (int, bool, str, float)): + return True + elif isinstance(val, (torch.memory_format, torch.dtype, torch.device, torch.layout)): + return True + elif isinstance(val, (FakeTensor, torch.Tensor)): # TODO(zhxchen17) Remove Tensor. 
+ return True + elif isinstance(val, (SymInt, SymFloat, SymBool)): + return True + elif isinstance(val, CustomObjArgument): + return True + elif isinstance(val, Iterable): + return all(_check_correct_val(x) for x in val) + return False + + def _no_returns(op): + if not isinstance(op, OpOverload): + return False + return len(op._schema.returns) == 0 + + if "val" not in node.meta: + if node.op == "call_function" and _no_returns(node.target): + return + raise SpecViolationError(f"Node.meta {node.name} is missing val field.") + + val = node.meta["val"] + if not _check_correct_val(val): + raise SpecViolationError(f"Node.meta {node.name} has invalid val field {val}") + + +class _VerifierMeta(type): + _registry: Dict[str, Type['Verifier']] = {} + + def __new__(metacls, name, bases, attrs): + if bases: + if "check" in attrs or "_check_graph_module" in attrs: + raise SyntaxError("Overriding method check is not allowed.") + assert "dialect" in attrs and attrs["dialect"] != "ATEN" + else: + assert "check" in attrs + assert "_check_graph_module" in attrs + assert attrs["dialect"] == "ATEN" + + assert isinstance(attrs["dialect"], str) + ret = type.__new__(metacls, name, bases, attrs) + metacls._registry[attrs["dialect"]] = ret # type: ignore[assignment] + return ret + +def getattr_recursive(obj: Any, target: str) -> Any: + target_atoms = target.split('.') + attr_itr = obj + for i, atom in enumerate(target_atoms): + if not hasattr(attr_itr, atom): + raise RuntimeError(f"Node referenced nonexistent target {'.'.join(target_atoms[:i])}") + attr_itr = getattr(attr_itr, atom) + return attr_itr + + +class Verifier(metaclass=_VerifierMeta): + dialect = "ATEN" + + def allowed_builtin_ops(self) -> List: + return [ + operator.getitem, + operator.add, + operator.mul, + operator.sub, + operator.truediv, + operator.ge, + operator.le, + operator.gt, + operator.lt, + operator.eq, + operator.ne, + operator.floordiv, + operator.mod, + operator.and_, + operator.or_, + operator.not_, + operator.pow, + operator.neg, + operator.abs, + math.ceil, + math.floor, + ] + + def allowed_op_types(self) -> Tuple[Type[Any], ...]: + return (OpOverload, HigherOrderOperator) + + def allowed_getattr_types(self) -> Tuple[Type[Any], ...]: + return (torch.fx.GraphModule,) + + def check_valid_op(self, op): + pass + + def check_additional(self, gm: GraphModule) -> None: + """ + Additional checks that are specific to some dialects. + """ + pass + + @final + def check(self, ep: ExportedProgram) -> None: + self._check_graph_module(ep.graph_module) + _verify_exported_program_signature(ep) + + @final + def _check_graph_module(self, gm: torch.fx.GraphModule) -> None: + def _allowed_getattr_types() -> Tuple[Type[Any], ...]: + ret = self.allowed_getattr_types() + assert not any(t is object for t in ret) + return ret + + def _check_valid_op(op) -> None: + def _allowed_builtin_ops() -> List: + ret = self.allowed_builtin_ops() + assert all(inspect.isbuiltin(op) for op in ret) + return ret + + def _allowed_op_types() -> Tuple[Type[Any], ...]: + ret = self.allowed_op_types() + assert not any(t is object for t in ret) + return ret + + # TODO Remove this allowlist. + _allowed_torch_functions = ( + torch.autograd.grad_mode.set_grad_enabled, + torch.sym_int, + torch.sym_ite, + torch.sym_max, + torch.sym_min, + torch.sym_not, + torch.sym_sqrt, + # TODO (tmanlaibaatar) + # Predispatch export is able to contain autograd ops. 
+ # These will be modeled as HOO later + torch._C._set_grad_enabled + + ) + + if not isinstance(op, _allowed_op_types()): + if op not in _allowed_builtin_ops() and op not in _allowed_torch_functions: + raise SpecViolationError( + f"Operator '{op}' is not an allowed operator type: {_allowed_op_types()}\n" + f"Valid builtin ops: {_allowed_builtin_ops()}" + f"Valid torch functions: {_allowed_torch_functions}" + ) + + if isinstance(op, OpOverload): + # All ops functional + if not is_functional(op): + raise SpecViolationError( + f"operator '{op}' is not functional" + ) + self.check_valid_op(op) + + for mod in gm.modules(): + if not isinstance(mod, torch.fx.GraphModule): + continue + + mod.graph.lint() + for node in mod.graph.nodes: + # TODO(T140410192): should have fake tensor for all dialects + if node.op in {"call_module", "call_method"}: + raise SpecViolationError( + f"call_module is not valid: got a class '{node.target}' ", + ) + + elif node.op == "call_function": + _check_val(node) + + _check_valid_op(node.target) + + elif node.op == "get_attr": + if not isinstance(node.target, str): + raise SpecViolationError( + f"Expected get_attr target to be string, but got {type(node.target)}" + ) + + attr = getattr_recursive(mod, node.target) + if isinstance(attr, torch.nn.Module): + def _is_type(name, ty): + return isinstance(getattr(attr, name, None), ty) + if type(attr).__name__ == "LoweredBackendModule": + if _is_type("backend_id", str) \ + and _is_type("processed_bytes", bytes) \ + and _is_type("compile_specs", list) \ + and hasattr(attr, "original_module"): + continue + else: + backend_id = getattr(attr, "backend_id", None) + processed_bytes = getattr(attr, "processed_bytes", None) + compile_specs = getattr(attr, "compile_specs", None) + raise SpecViolationError( + f"Invalid get_attr type {type(attr)}. \n" + f"LoweredBackendModule fields: " + f"backend_id(str) : {type(backend_id)}, " + f"processed_bytes(bytes) : {type(processed_bytes)}, " + f"compile_specs(list) : {type(compile_specs)}" + ) + + if not isinstance(attr, _allowed_getattr_types()): + raise SpecViolationError( + f"Invalid get_attr type {type(attr)}. \n" + f"Valid get_attr types: {_allowed_getattr_types()}" + ) + + + elif node.op == "placeholder": + _check_val(node) + # TODO(zhxchen17) + # elif node.op == "output": + # _check_flattened_outputs() + + self.check_additional(gm) + + +def _verify_exported_program_signature(exported_program) -> None: + # Check ExportedProgram signature matches + gs = exported_program.graph_signature + + # Check every node in the signature exists in the graph + input_node_names = [node.name for node in exported_program.graph.nodes if node.op == "placeholder"] + + if len(input_node_names) != len(gs.input_specs): + raise SpecViolationError( + f"Number of graph inputs ({len(input_node_names)}) " + f"does not match number of inputs in the graph signature ({len(gs.user_inputs)})" + ) + + for input_spec, node in zip(gs.input_specs, input_node_names): + if isinstance(input_spec.arg, (TensorArgument, SymIntArgument)): + if input_spec.arg.name != node: + raise SpecViolationError( + f"Input spec name {input_spec.arg.name} does not match node name {node}" + ) + + if input_spec.kind == InputKind.USER_INPUT: + continue + + elif input_spec.kind == InputKind.PARAMETER: + if not isinstance(input_spec.arg, TensorArgument): + raise SpecViolationError( + f"Parameter {input_spec.name} is not a tensor argument. Found {input_spec.arg} instead." 
+ ) + if input_spec.target is None: + raise SpecViolationError( + f"InputSpec for {input_spec.name} has no target." + ) + + param = input_spec.target + if param not in exported_program.state_dict: + raise SpecViolationError( + f"Parameter {param} is not in the state dict." + ) + + if not isinstance(exported_program.state_dict[param], torch.nn.Parameter): + raise SpecViolationError( + f"State dict entry for parameter {param} is not an instance of torch.nn.Parameter." + ) + + elif input_spec.kind == InputKind.BUFFER: + if not isinstance(input_spec.arg, TensorArgument): + raise SpecViolationError( + f"Buffer {input_spec.name} is not a tensor argument. Found {input_spec.arg} instead." + ) + if input_spec.target is None: + raise SpecViolationError( + f"InputSpec for {input_spec.name} has no target." + ) + + buffer = input_spec.target + if input_spec.persistent is None: + raise SpecViolationError( + f"Buffer {buffer} is missing a persistence flag" + ) + + if input_spec.persistent is True and buffer not in exported_program.state_dict: + raise SpecViolationError( + f"Buffer {buffer} is not in the state dict." + ) + + if input_spec.persistent is False and buffer in exported_program.state_dict: + raise SpecViolationError( + f"Non-persistent buffer {buffer} is in the state dict, it should not be." + ) + elif input_spec.kind == InputKind.CONSTANT_TENSOR: + if not isinstance(input_spec.arg, TensorArgument): + raise SpecViolationError( + f"Constant tensor {input_spec.name} is not a tensor argument. Found {input_spec.arg} instead." + ) + if input_spec.target is None: + raise SpecViolationError( + f"InputSpec for {input_spec.name} has no target." + ) + + tensor_const = input_spec.target + if tensor_const not in exported_program.constants: + raise SpecViolationError( + f"Constant tensor {tensor_const} is not in the constants dictionary." + ) + elif input_spec.kind == InputKind.CUSTOM_OBJ: + if not isinstance(input_spec.arg, CustomObjArgument): + raise SpecViolationError( + f"Custom object {input_spec.name} is not a custom object argument. Found {input_spec.arg} instead." + ) + if input_spec.target is None: + raise SpecViolationError( + f"InputSpec for {input_spec.name} has no target." + ) + + custom_obj = input_spec.target + if custom_obj not in exported_program.constants: + raise SpecViolationError( + f"Custom object {custom_obj} is not in the constants dictionary." + ) + elif input_spec.kind == InputKind.TOKEN: + if not isinstance(input_spec.arg, TensorArgument): + raise SpecViolationError( + f"Constant tensor {input_spec.name} is not a tensor argument. Found {input_spec.arg} instead." + ) + else: + raise SpecViolationError( + f"Unknown InputKind {input_spec.kind}." + ) + + # Check outputs + output_node = list(exported_program.graph.nodes)[-1] + assert output_node.op == "output" + output_nodes = [ + arg.name if isinstance(arg, torch.fx.Node) else arg + for arg in output_node.args[0] + ] + + if len(output_nodes) != len(gs.output_specs): + raise SpecViolationError( + f"Number of output nodes {len(output_nodes)} is different " + "Than the number of outputs specified by the graph signature: \n" + f"Number of mutated buffers: {len(gs.buffers_to_mutate)}. \n" + f"Number of user outputs: {len(gs.user_outputs)}. 
\n" + ) + + num_tokens = len(gs.output_tokens) + end = len(gs.buffers_to_mutate) + len(gs.user_inputs_to_mutate) + num_tokens + mutate_nodes: List[str] = output_nodes[num_tokens:end] + user_output_nodes = output_nodes[end:end + len(gs.user_outputs)] + + for mutation_node in mutate_nodes: + if mutation_node in gs.buffers_to_mutate: + if gs.buffers_to_mutate[mutation_node] not in gs.buffers: + raise SpecViolationError( + f"Buffer output {mutation_node} does not point to a buffer that exists. \n" + f"Dict of buffers that are mutated, in order: {gs.buffers_to_mutate} \n" + f"Buffer nodes available: {gs.buffers} \n" + ) + elif mutation_node in gs.user_inputs_to_mutate: + if gs.user_inputs_to_mutate[mutation_node] not in gs.user_inputs: + raise SpecViolationError( + f"User input output {mutation_node} does not point to a user input that exists. \n" + f"Dict of user inputs that are mutated, in order: {gs.user_inputs_to_mutate} \n" + f"User input nodes available: {gs.user_inputs} \n") + else: + raise SpecViolationError( + f"Mutation node {mutation_node} is neither a buffer nor a user input. " + f"Buffers to mutate: {gs.buffers_to_mutate}, User inputs to mutate: {gs.user_inputs_to_mutate}" + ) + + for user_output_node, user_output_name in zip(user_output_nodes, gs.user_outputs): + if user_output_node != user_output_name: + raise SpecViolationError( + f"User output {user_output_node} is not in the correct " + "order or is not found in the " + f"exported program's user_output list: {gs.user_outputs}. " + ) + + +def load_verifier(dialect: str) -> Optional[Type[Verifier]]: + if dialect == "ATEN": + return _VerifierMeta._registry.get(dialect) + return _VerifierMeta._registry[dialect] diff --git a/MLPY/Lib/site-packages/torch/_export/wrappers.py b/MLPY/Lib/site-packages/torch/_export/wrappers.py new file mode 100644 index 0000000000000000000000000000000000000000..b54fab43f565586425aa20bc0d576a0b5d2304c7 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_export/wrappers.py @@ -0,0 +1,114 @@ +from contextlib import contextmanager + +import torch +import torch._custom_ops +from torch._C import DispatchKey +from torch._higher_order_ops.strict_mode import strict_mode +from torch._higher_order_ops.utils import autograd_not_implemented +from torch._ops import HigherOrderOperator +from torch._subclasses.fake_tensor import FakeTensorMode +from torch.fx.experimental.proxy_tensor import ProxyTorchDispatchMode, track_tensor_tree +from torch.utils import _pytree as pytree + + +_export_tracepoint = HigherOrderOperator("_export_tracepoint") + + +@_export_tracepoint.py_impl(ProxyTorchDispatchMode) +def export_tracepoint_dispatch_mode(mode, *args, **kwargs): + if not mode.enable_tracing: + return _export_tracepoint(*args, **kwargs) + p_args, p_kwargs = pytree.tree_map(mode.tracer.unwrap_proxy, (args, kwargs)) + proxy = mode.tracer.create_proxy( + "call_function", _export_tracepoint, p_args, p_kwargs + ) + return track_tensor_tree(args, proxy, constant=None, tracer=mode.tracer) + + +@_export_tracepoint.py_impl(FakeTensorMode) +def export_tracepoint_fake_tensor_mode(mode, *args, **kwargs): + with mode: + return args + + +@_export_tracepoint.py_functionalize_impl +def export_tracepoint_functional(ctx, *args, **kwargs): + unwrapped_args = ctx.unwrap_tensors(args) + unwrapped_kwargs = ctx.unwrap_tensors(kwargs) + + with ctx.redispatch_to_next(): + out = _export_tracepoint(*unwrapped_args, **unwrapped_kwargs) + return ctx.wrap_tensors(out) + + +_export_tracepoint.py_impl(DispatchKey.Autograd)( + 
autograd_not_implemented(_export_tracepoint, deferred_error=True) +) + + +@_export_tracepoint.py_impl(DispatchKey.CPU) +def export_tracepoint_cpu(*args, **kwargs): + return args + + +def _wrap_submodule(mod, path, module_call_specs): + assert isinstance(mod, torch.nn.Module) + assert path != "" + submodule = mod + for name in path.split("."): + if not hasattr(submodule, name): + raise RuntimeError(f"Couldn't find submodule at path {path}") + submodule = getattr(submodule, name) + + def update_module_call_signatures(path, in_spec, out_spec): + if path in module_call_specs: + assert module_call_specs[path]["in_spec"] == in_spec + assert module_call_specs[path]["out_spec"] == out_spec + module_call_specs[path] = {"in_spec": in_spec, "out_spec": out_spec} + + def check_flattened(flat_args): + for a in flat_args: + if not (isinstance(a, (torch.Tensor, str, int, float, bool)) or a is None): + raise AssertionError( + f"Only Tensors or scalars are supported as pytree flattened inputs, got: {a}" + ) + + def pre_hook(module, args, kwargs): + flat_args, in_spec = pytree.tree_flatten((args, kwargs)) + check_flattened(flat_args) + flat_args = _export_tracepoint(*flat_args, kind="module_call_inputs", path=path) + args, kwargs = pytree.tree_unflatten(flat_args, in_spec) + return args, kwargs + + def post_hook(module, args, kwargs, res): + _, in_spec = pytree.tree_flatten((args, kwargs)) + flat_res, out_spec = pytree.tree_flatten(res) + check_flattened(flat_res) + flat_res = _export_tracepoint(*flat_res, kind="module_call_outputs", path=path) + update_module_call_signatures(path, in_spec, out_spec) + return pytree.tree_unflatten(flat_res, out_spec) + + pre_handle = submodule.register_forward_pre_hook(pre_hook, with_kwargs=True) + post_handle = submodule.register_forward_hook(post_hook, with_kwargs=True) + return pre_handle, post_handle + + +@contextmanager +def _wrap_submodules(f, preserve_signature, module_call_signatures): + handles = [] + + try: + for path in preserve_signature: + handles.extend(_wrap_submodule(f, path, module_call_signatures)) + yield + finally: + for handle in handles: + handle.remove() + + +def _mark_strict_experimental(cls): + def call(self, *args): + return strict_mode(self, args) + + cls.__call__ = call + return cls diff --git a/MLPY/Lib/site-packages/torch/_functorch/__init__.py b/MLPY/Lib/site-packages/torch/_functorch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b6272b424658450437a313fc71bedbce73da3205 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
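For reference, the pre_hook/post_hook pattern used by _wrap_submodule above can be reproduced outside the export machinery. A minimal sketch, assuming hypothetical names (Inner, specs, the "inner" key) that are not part of the torch sources:

import torch
from torch.utils import _pytree as pytree

specs = {}  # path -> {"in_spec": ..., "out_spec": ...}, mirrors module_call_specs

class Inner(torch.nn.Module):
    def forward(self, x, *, scale=1.0):
        return x * scale

mod = Inner()

def pre_hook(module, args, kwargs):
    # Record how (args, kwargs) flatten before the submodule runs.
    _, in_spec = pytree.tree_flatten((args, kwargs))
    specs.setdefault("inner", {})["in_spec"] = in_spec
    return args, kwargs

def post_hook(module, args, kwargs, out):
    # Record how the output flattens after the submodule runs.
    _, out_spec = pytree.tree_flatten(out)
    specs.setdefault("inner", {})["out_spec"] = out_spec
    return out

pre = mod.register_forward_pre_hook(pre_hook, with_kwargs=True)
post = mod.register_forward_hook(post_hook, with_kwargs=True)
mod(torch.ones(2), scale=2.0)
pre.remove()
post.remove()

In _wrap_submodules, one such pair of handles is installed per preserved path and removed in the finally block.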
diff --git a/MLPY/Lib/site-packages/torch/_functorch/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b758d3e2d95c16826b4bf374a7b49d2f740049bf Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/__pycache__/aot_autograd.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/aot_autograd.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bb7dee310dc9193b98ee711d6ed526f3b746c40c Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/aot_autograd.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/__pycache__/apis.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/apis.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d3b439abd9c4148fd7ddc73a24776dcc0c53b1c1 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/apis.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/__pycache__/autograd_function.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/autograd_function.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..76a73f004dfb0a0f823eab9914c80edfd2424526 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/autograd_function.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/__pycache__/batch_norm_replacement.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/batch_norm_replacement.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da0a385c9315d4cb4a3ff907c08343649359bb69 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/batch_norm_replacement.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/__pycache__/benchmark_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/benchmark_utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..662290fc3a2a7f78d1eca7225e0132eeec81a002 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/benchmark_utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/__pycache__/compile_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/compile_utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0b945c861ce33fadad4918c4d23aeaf619d505ac Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/compile_utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/__pycache__/compilers.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/compilers.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ec07be0e799d7bc180b83c93b5ee3c71d0497555 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/compilers.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/__pycache__/config.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/config.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bf417d23ae5f99fca379e890c17744327054dbe3 Binary files /dev/null and 
b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/config.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/__pycache__/deprecated.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/deprecated.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..030f4e0673faa99a864f0f15f9c664fc7ec82942 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/deprecated.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/__pycache__/eager_transforms.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/eager_transforms.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b851d56544d09cfc20dc3356af8740b55d33fdc5 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/eager_transforms.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/__pycache__/functional_call.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/functional_call.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..db43ae021e74ea48dd2224405531a7a206712bda Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/functional_call.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/__pycache__/fx_minifier.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/fx_minifier.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..36b9c8d00a6ebd133686da291198e9a975289432 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/fx_minifier.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/__pycache__/make_functional.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/make_functional.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e1e1cd7fea0883a2b7f44d4f97740dd7dbb1d037 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/make_functional.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/__pycache__/partitioners.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/partitioners.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f83843bccac71106ad6e9974309907fcfe430566 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/partitioners.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/__pycache__/pyfunctorch.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/pyfunctorch.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..37c6e9c2b148fc73e937d10d966275366432f5f5 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/pyfunctorch.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/__pycache__/python_key.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/python_key.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ed260af0b0a3c1a88ae51660eb47131a1cfee183 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/python_key.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/__pycache__/pytree_hacks.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/pytree_hacks.cpython-39.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..c51d69d485a98437390f10ec3a8325411908d6e1 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/pytree_hacks.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/__pycache__/top_operators_github_usage.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/top_operators_github_usage.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ac175edc5e3155ebd92791f828614687fa4c1d0 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/top_operators_github_usage.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2a6b21a9cd7cc97191f85aa5d51da2918bbcf08a Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/__pycache__/vmap.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/vmap.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b3c5bdaaf58e96d3e33dfa126b81482f4361ec1d Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_functorch/__pycache__/vmap.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__init__.py b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b6272b424658450437a313fc71bedbce73da3205 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
diff --git a/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..755ea62025a77913b08d8087a21b0da20bf5b9cc Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/collect_metadata_analysis.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/collect_metadata_analysis.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cc0934c637a5247067b129c1e673b44a2f7896f1 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/collect_metadata_analysis.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/dispatch_and_compile_graph.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/dispatch_and_compile_graph.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9f50a36592b97bad1c4cab808d51032615fea3e4 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/dispatch_and_compile_graph.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/functional_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/functional_utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..94b1756759610e018a77beacccbb7df56fe8da31 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/functional_utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/input_output_analysis.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/input_output_analysis.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..062d7de61791353c23a4f6d333cbf3e287429787 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/input_output_analysis.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/jit_compile_runtime_wrappers.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/jit_compile_runtime_wrappers.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d5fca0b24baba8423826ea386e4b757b24198899 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/jit_compile_runtime_wrappers.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/logging_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/logging_utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b8d16b8e3ab858e4ba3bfeb3fbe2c0c713088c4a Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/logging_utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/runtime_wrappers.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/runtime_wrappers.cpython-39.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..9448ca347a82df52867fbbe47d1ce3c9ea8618a0 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/runtime_wrappers.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/schemas.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/schemas.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae39509cf9419b9b8642fda371a887d79455ba9e Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/schemas.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/subclass_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/subclass_utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c78339d055cccfa13264fad6e0f1805f5a5edaad Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/subclass_utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/traced_function_transforms.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/traced_function_transforms.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..95ccf683116ac78d922c6d27eccb56e0e97ff78b Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/traced_function_transforms.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..748b8621ba23f9c214e7e04328c5ee8b4a8e315e Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/__pycache__/utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/collect_metadata_analysis.py b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/collect_metadata_analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..c35aa2f6739c1d394afe041b9b95491852775f36 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/collect_metadata_analysis.py @@ -0,0 +1,626 @@ +""" +This module is one of the analysis modules - it takes as input a function or graph +and some preexisting properties, and returns some data that is useful for deciding +how to further proceed with compilation or construct runtime wrappers. + +In particular, the analysis here constructs view and mutation metadata from running +a functionalized version of the graph under compilation. 
+""" + +import collections +import logging +from functools import wraps +from typing import Callable, DefaultDict, Dict, List + +import torch +import torch.utils._pytree as pytree +from torch import Tensor +from torch._subclasses.functional_tensor import FunctionalTensor, FunctionalTensorMode +from torch._subclasses.meta_utils import safe_is_leaf +from torch.fx.experimental.symbolic_shapes import is_concrete_int +from torch.multiprocessing.reductions import StorageWeakRef +from torch.utils._python_dispatch import ( + is_traceable_wrapper_subclass, + transform_subclass, +) +from .functional_utils import ( + are_all_mutations_hidden_from_autograd, + are_all_mutations_under_no_grad_or_inference_mode, + from_fun, + has_data_mutation, + has_metadata_mutation, + has_same_metadata, + to_fun, +) +from .schemas import ( + InputAliasInfo, + MutationType, + OutputAliasInfo, + OutputType, + ViewAndMutationMeta, +) +from .subclass_utils import create_subclass_meta + +from .utils import _get_autocast_states, KNOWN_TYPES, strict_zip + +zip = strict_zip + +log = logging.getLogger(__name__) + + +# This is a version of functionalization that is specifically designed +# for the AOTAutograd use case. +# +# Unlike functorch's variant, this doesn't use the functorch level system, +# instead it directly uses PyTorch's conventional dispatcher to hit the +# functionalization key. In particular, this means that FunctionalTensorWrapper +# can have autograd data stored directly on it. +# +# In typical AOTAutograd usage, the dispatch key order will look like: +# +# Autograd - Functionalization ~~~~> Proxy Mode - Fake Tensor +# outer tensor inner tensor +# +# Returns: +# - ViewAndMutationMeta, telling us metadata about the inputs and outputs, and +# The list of outputs from the forward, but **only** the outputs that we need +# to pass in as tangents into the backward. +# Specifically, aliased outputs from the forward get regenerated, and don't participate +# in the compiled backward function. +def run_functionalized_fw_and_collect_metadata( + f, + *, + keep_input_mutations: bool, + # TODO: refactor to kill this flag + is_train: bool = False, + pre_dispatch: bool = False, +) -> Callable[..., ViewAndMutationMeta]: + memo: Dict[Tensor, Tensor] = {} + + def _to_fun(t): + if isinstance(t, Tensor): + if t in memo: + return memo[t] + r = to_fun(t) + memo[t] = r + return r + else: + return t + + @wraps(f) + def inner(*flat_args): + # This function is meant to be run with the forward, which expects a flat list of tensor/symint/other args. + assert all(isinstance(a, tuple(KNOWN_TYPES)) for a in flat_args) + + input_info: List[InputAliasInfo] = [] + output_info: List[OutputAliasInfo] = [] + + prior_grad_enabled = torch.is_grad_enabled() + prior_autocast_states = _get_autocast_states() + + # See Note [Disabling Functionalize TLS Above Python Functionalization] + disable_above = torch._C._ExcludeDispatchKeyGuard( + torch._C.DispatchKeySet(torch._C.DispatchKey.Functionalize) + ) + + # It doesn't matter if we run this under predispatch or not because it is + # only for figuring out metadata + mode = FunctionalTensorMode(_allow_token_discovery=True) + with disable_above, mode: + # precondition: The passed in function already handles unflattening inputs + flattening outputs + flat_f_args = pytree.tree_map(_to_fun, flat_args) + flat_f_outs = f(*flat_f_args) + + if prior_autocast_states != _get_autocast_states(): + raise RuntimeError( + "AOTAutograd does not support tracing graphs that mutate the autocast state. 
" + "Dynamo will only insert autocast context managers (e.g. with torch.autocast(..)) into the graph, " + "which will unwind all of their mutations to autocast state before the graph exits. " + "If you encounter this error while using torch.compile, please file a bug." + ) + + # Inspect the state of the input tensor functional wrapper to detect input mutation info + # If inp[i] has a metadata-only mutation, then maybe_inputs_with_mutated_metadata[i] contains the updated version + for i, (arg, f_arg) in enumerate(zip(flat_args, flat_f_args)): + # NB: Mutation of non-contiguous tensor subclass input can result in a mismatch in + # strides between the functionalized arg inner tensors and non-functionalized arg inner + # tensors. This is a problem as the inner tensor stride change may not be reflected + # correctly in the outer tensor, so disallow this for now. + mutates_data = has_data_mutation(f_arg) + if ( + mutates_data + and not arg.is_contiguous() + and is_traceable_wrapper_subclass(arg) + ): + raise RuntimeError( + "Mutations on non-contiguous inputs are currently not allowed on " + "tensor subclasses" + ) + + if not isinstance(arg, Tensor): + new_arg = arg + else: + new_arg = from_fun(f_arg) + mutates_metadata = has_metadata_mutation( + f_arg, arg, check_only_storage_mutation=False + ) + if mutates_metadata and is_traceable_wrapper_subclass(arg): + raise RuntimeError( + "Metadata mutations are currently not allowed on tensor subclasses" + ) + mutates_storage_metadata = has_metadata_mutation( + f_arg, arg, check_only_storage_mutation=True + ) + mutations_hidden_from_autograd = are_all_mutations_hidden_from_autograd( + f_arg + ) + mutations_under_no_grad_or_inference_mode = ( + mutates_data + and are_all_mutations_under_no_grad_or_inference_mode(f_arg) + ) + + # Here, we're saying that if an input experienced a set call, inp.set_(other), + # then we can effectively not have to worry about whether its data was mutated. + # There are 3 cases: + # (1) We mutate inp *after* the set_() call. other is a graph intermediate. + # In this case, we're not really mutating the input storage of "inp"; + # we're mutating the storage of an intermdiate value (other), + # and slamming that storage into the input tensor. So no data mutation is necessary. + # (2) We mutate inp *after* the set_() call. other is a graph *input*. + # In this case, the data mutation will be properly handled in the runtime + # epilogue during the processing of "other" + # (3) We mutate inp *before* the set_() call. + # This case is *not* currently handled. + # TODO: discuss this in the PR. Both supporting this, and detecting + erroring out, + # seem painful to get working. + if mutates_storage_metadata: + mutates_data = False + + requires_grad = isinstance(f_arg, torch.Tensor) and f_arg.requires_grad + + input_info.append( + InputAliasInfo( + is_leaf=isinstance(arg, Tensor) and safe_is_leaf(arg), + mutates_data=mutates_data, + mutates_metadata=mutates_metadata, + mutations_hidden_from_autograd=mutations_hidden_from_autograd, + mutates_storage_metadata=mutates_storage_metadata, + mutations_under_no_grad_or_inference_mode=mutations_under_no_grad_or_inference_mode, + requires_grad=requires_grad, + keep_input_mutations=keep_input_mutations, + ) + ) + + # If a function involves creating a tensor, and returning a view of it, such that its _base is the intermediate, + # We need to make sure our graph returns the _base as a graph output, and we manually recreate the view + # to return to the user. Why? 
The backend compiler is free to (incorrectly) not set requires_grad + # on the base tensor, but we are obligated to properly set requires-gradness on the real output. + + inp_storage_refs = { + StorageWeakRef(inpt.untyped_storage()): idx + for idx, inpt in enumerate(flat_f_args) + if isinstance(inpt, Tensor) + } + + # We need inp tensor id's to be able to tell if an outputs **are** inputs. + inp_tensor_ids = {id(inpt) for inpt in flat_f_args if isinstance(inpt, Tensor)} + # We need output tensor id's to tell if any output._base` attributes **are** other outputs. + # (This is also a dict because we need to know that output's index, so we can regenerate + # the alias from it). + out_tensor_ids = {id(o): i for i, o in enumerate(flat_f_outs)} + + # Keep track of which outputs alias other outputs + out_tensor_alias_counts: DefaultDict = collections.defaultdict(int) + # This tells us, for a given group of outputs that alias each other, + # whether they e.g. all came from an unbind call + num_aliased_tensors_that_are_multi_output_views: DefaultDict = ( + collections.defaultdict(int) + ) + out_storage_to_tensors: DefaultDict = collections.defaultdict(set) + curr_storage = None + for o in flat_f_outs: + if isinstance(o, torch.Tensor): + curr_storage = StorageWeakRef(o.untyped_storage()) + out_tensor_alias_counts[curr_storage] += 1 + # Note: [AOTAutograd: differentiable outputs that alias each other from a multi-output view call] + # This is an optimization on top of the "alias of intermediates" logic, + # which you can read more about under Note [AOT Autograd: outputs aliasing inputs or intermediates!] + # + # Before describing the optimization: this is important for AOTAutograd to have good + # perf around, multi-output views. HOWEVER: + # - There is a more generic change to AOTAutograd that we'd like to make, that subsumes this case, + # around using pre-dispatch tracing to partition out a graph so we can faithfully replay all + # views without having to regenerate them at runtime. + # - It's loosely described in this doc (more details will be added soon): + # https://docs.google.com/document/d/1DlfFq8TKbuAn2zyJxLfoW-X1qkkm5PLdHFtySo03QAk/edit + # - Once that change lands, we should just rip out this "optimization", since: + # (1) It will be fully unnecessary + # (2) Although it is only a few lines of code, it is a bit difficult to reason about + # its correctness with the autograd engine in all cases. + # + # + # What is this optimization? Consider the below case: + # def f(x): + # intermediate = x.mul(2) + # # x and intermediate here require grad + # o1, o2, ... o10 = intermediate.unbind(-1) + # return intermediate, o1, o2, ... o10 + # Now, the "intermediate base" handling in AOTAutograd implies that we must do the following: + # (1) return "intermediate as an extra output of the compiled graph + # (2) regenerate each aliased output off of "intermediate", **outside** of the autograd.Function. + # The reason AOTAutograd ordinarily does this is for safety: the autograd engine needs to know + # that o1 through o10 are all aliased, and if we blindly return o1 through o10 from the autograd.Function, + # this information will be hidden. + # In particular, mutating one alias might require autograd to update autograd metadata on the other aliases + # (like their grad_fn, for example, when the autograd engine needs to do view-replay). 
+ # + # However, intermediate_base logic can be bad for backward performance (we sometimes generate + # as_strided calls during the intermediate base logic, which can have a slow backward formula). + # Is it possible to find a set of conditions where it is **safe** to hide the output aliasing from autograd? + # + # For a set of outputs of the graph that alias each other, o_1...o_k, consider: + # (1) They came from the same multi-output view op, e.g. o_1, ..., o_k = intermediate.unbind(0) + # (2) If there are any other aliases of o_1 through o_k (in the example above, intermediate), + # **at most** 1 can escape from the graph (e.g. there is not some other graph input/output + # o_other, that aliases these outputs) + # (3) o_1...o_k all require_grad, they all share the same ._base, and their ._base requires grad. + # This condition is important because it's what causes slowness in the intermediate_base + # codepath of aot_autograd. Ordinarily, o_1...o_k would all get a grad_fn, and + # aot_autograd's view-replay might give each output an AsStridedBackward as its grad_fn. + # "K" AsStridedBackward calls will be *much* slower than a single UnbindBackward. + # In this setup, is it possible to mutate one of the outputs o_i in a way that would affect the autograd meta + # of the other aliases? + # + # Claim: No! Consider a few example (which I'm pretty sure cover all cases of mutation w.r.t. autograd): + # (a) What happens if we mutate any of o_1 through o_k directly? + # Autograd raises an error: + # "RuntimeError: Output 0 of UnbindBackward0 is a view and is being modified inplace. This view is + # the output of a function that returns multiple views. Such functions do not allow the output + # views to be modified inplace. You should replace the inplace operation by an out-of-place one." + # (b) What if we take a view of o_k and mutate it, o_k.view(o_k.shape).mul_(2)? + # Autograd raises the same error- the "multi-output-view"ness of an alias propagates to future views. + # (c) What if we mutate o_k under no_grad? + # Autograd raises the same error + # (d) What if we detach and mutate, e.g. o_k.detach().mul_(2)? + # Autograd allows this, *but* autograd updates all alias's grad_fn's to be error functions when accessed. + # Autograd raises the same error + # (e) What if we try to mutate another alias of o_1...o_k, that was **not** created from a multi-output view? + # We promised that there is at most **one** such alias, e.g. intermediate in the example above. + # You can mutate intermediate, but in eager mode this will change the grad_fn of o_1...o_k + # to be error fn's. + # Since intermediate was the *only* non-multi-output-alias, there are no other aliases + # of `intermediate` around that were produced by the compiled fn and have a valid grad_fn. + # + # Coming back to this optimization: + # Given that it is not possible for mutating one of these aliases to affect the autograd metadata of another alias + # without causing an error in eager mode, we will simple hide the aliasing from autograd during torch.compile + # if all of the above conditions are met. + # This has the slight downside that it's possible to write some "bad" code that autograd will raise an error on + # in eager but fail to during torch.compile, but it has the benefit that this code has much better performance. 
+ # NOTE: if and when we eventually update AOTAutograd to do the "view graph slicing" defined here: + # https://docs.google.com/document/d/1DlfFq8TKbuAn2zyJxLfoW-X1qkkm5PLdHFtySo03QAk/edit, + # then this optimization will probably matter less and might be ok to remove. + is_cur_tensor_multi_out_view = isinstance( + o, FunctionalTensor + ) and torch._functionalize_is_multi_output_view( # type: ignore[attr-defined] + o.elem + ) + if is_cur_tensor_multi_out_view: + num_aliased_tensors_that_are_multi_output_views[curr_storage] += 1 + out_storage_to_tensors[curr_storage].add(o) + + # maps the id of an intermediate base to its index in the output of the compiled forward + intermediate_base_tensor_id_to_output_idx: Dict[int, int] = {} + intermediate_bases: List[torch.Tensor] = [] + # Why Do We Care If Storage Changed? + # It's important to understand the implications of storage changes in complex scenarios. Take this example: + # + # def f(x): + # x_storage = x.untyped_storage() + # non_leaf_tensor = torch.ones(4, requires_grad=True).clone() + # + # # Using no_grad() and _unsafe_preserve_version_counter to simulate the .data = operation + # with torch.no_grad(), torch.autograd._unsafe_preserve_version_counter(x): + # x.set_(non_leaf_tensor.untyped_storage()) + # + # out = x.view(-1) + # + # # Restoring x to its original storage, again simulating .data = operation + # with torch.no_grad(), torch.autograd._unsafe_preserve_version_counter(x): + # x.set_(x_storage) + # + # return out + # + # In this scenario, 'x' and 'out' have different shapes and are stored at different memory addresses, aka no aliasing. + # However, due to how set_() and more specificlaly, set is functionalized, is defined to preserve eager semantics, + # the autograd engine mistakenly assumes that 'x' and 'out' are aliased, treating 'x' as 'out._base'. + # This misinterpretation leads to an 'alias_of_input' flag, causing an unnecessary as_strided() call to be generated, + # which could lead to issues later in the code. + for o in flat_f_outs: + functional_tensor_storage_changed = isinstance( + o, FunctionalTensor + ) and torch._functionalize_was_storage_changed( # type: ignore[attr-defined] + o.elem + ) + curr_storage = ( + None + if not isinstance(o, torch.Tensor) + else StorageWeakRef(o.untyped_storage()) + ) + outs_with_identical_metadata_that_require_grad = ( + [] + if not isinstance(o, Tensor) + else [ + curr + for curr in out_storage_to_tensors[curr_storage] + if has_same_metadata(o, curr) + and curr.requires_grad + and o is not curr + ] + ) + + # See Note [Accessing .grad_fn on FunctionalTensor] + # In-place operations on views will trigger a lazy rebase of the autograd graph; + # this runs during access to the .grad_fn. The rebase logic will invoke view ops + # on FunctionalTensors, so we must enable a FunctionalTensorMode here to ensure + # these op calls succeed. 
+ grad_fn = None + if isinstance(o, Tensor): + with FunctionalTensorMode(): + grad_fn = o.grad_fn + + is_result_of_custom_autograd_fn = False + # Need to check for both custom cpp (CppFunction) and python (BackwardCFunction) + # autograd fns + if type(grad_fn).__name__ == "CppFunction": + is_result_of_custom_autograd_fn = True + if isinstance(grad_fn, torch.autograd.function.BackwardCFunction): + is_result_of_custom_autograd_fn = True + + if not isinstance(o, Tensor): + output_type = OutputType.non_alias + base_idx = None + elif ( + curr_storage in inp_storage_refs + and grad_fn is not None + and is_result_of_custom_autograd_fn + ): + output_type = OutputType.custom_function_view + base_idx = None + elif ( + curr_storage in inp_storage_refs + and not functional_tensor_storage_changed + ): + base_idx = inp_storage_refs[curr_storage] + is_input_tensor = id(o) in inp_tensor_ids + num_aliased_outs = out_tensor_alias_counts[curr_storage] + num_multi_output_view_outs = ( + num_aliased_tensors_that_are_multi_output_views[curr_storage] + ) + num_aliased_outs_that_are_not_multi_output_views = ( + num_aliased_outs - num_multi_output_view_outs + ) + if ( + grad_fn is not None + and num_aliased_outs_that_are_not_multi_output_views == 0 + ): + # See Note: [AOTAutograd: differentiable outputs that alias each other from a multi-output view call] + # In particular, given: + # def f(x): + # return list(x.unbind(0)) + # The main reason we ordinarily try to regenerate these output aliases outside of the + # compiled autograd.Function is because if any of the outputs are later mutated, + # autograd needs to perform view-replay to regenerate them. + # However, autograd does not allow users to mutate multi-output views + # in any way that can change the autograd metadata of other aliases. + # So we hide this aliasing from autograd here. + log.debug( + "Encountered AOTAutograd case: differentiable outputs that \ +alias each other from a multi-output view call" + ) + output_type = OutputType.non_alias + elif is_input_tensor: + output_type = OutputType.is_input + else: + output_type = OutputType.alias_of_input + + # We only need to handle the intermediate base case when both + # the intermediate base and the output require gradients. + # See Note [AOT Autograd: outputs aliasing inputs or intermediates!] + elif o._base is not None and o.requires_grad and o._base.requires_grad: + num_aliased_outs = out_tensor_alias_counts[curr_storage] + num_multi_output_view_outs = ( + num_aliased_tensors_that_are_multi_output_views[curr_storage] + ) + num_aliased_outs_that_are_not_multi_output_views = ( + num_aliased_outs - num_multi_output_view_outs + ) + # Note: [AOTAutograd: differentiable outputs that alias each other from a multi-output view call] + if ( + out_tensor_alias_counts[curr_storage] == 1 + or num_aliased_outs_that_are_not_multi_output_views <= 1 + ): + # Note [Intermediate Bases Optimization] + # Normally if we have an output that aliases an intermediate, + # we need to add the extra "intermediate base" logic further down + # to prevent autograd from yelling at us if the user later tries to + # mutate that output. + # However, the common case here is if we have an output that aliases an intermediate, + # but doesn't alias any other outputs. + # In that case, autograd shouldn't have to worry about the aliasing at all + # (if that output is mutated, there are no other live aliases for autograd to worry about). + # The "intermediate bases" can hurt inductor perf by forcing more variables to become outputs. 
+ # So as an optimization, we won't do intermediate base handling in this case. + # Instead, we'll hide the aliasing from autograd using aten._unsafe_view(). + if ( + out_tensor_alias_counts[curr_storage] != 1 + and num_aliased_outs_that_are_not_multi_output_views <= 1 + ): + log.debug( + "Encountered AOTAutograd case: differentiable outputs that alias each other \ +from a multi-output view call" + ) + output_type = OutputType.unsafe_view_alias + base_idx = None + else: + # First, check if o's ._base is an existing output + maybe_existing_out_idx = out_tensor_ids.get(id(o._base), None) + if maybe_existing_out_idx is not None: + # Special case where the output is an alias of a graph intermediate, but that intermediate + # is itself also a user output. + output_type = ( + OutputType.alias_of_intermediate_base_is_user_output + ) + base_idx = maybe_existing_out_idx + else: + # Next, check if o's ._base is an intermediate base that we already returned + maybe_existing_base_output_idx = ( + intermediate_base_tensor_id_to_output_idx.get( + id(o._base), None + ) + ) + if maybe_existing_base_output_idx is not None: + output_type = OutputType.alias_of_intermediate + base_idx = maybe_existing_base_output_idx + else: + # Otherwise, take o._base and explicitly return it as an output in the compiled graph + new_out_idx = len(intermediate_bases) + base_idx = new_out_idx + # Indicate to the logic later on (when we trace the joint) + # that this particular output should get it's ._base appended to the forward graph outputs + output_type = ( + OutputType.alias_of_intermediate_save_as_output + ) + intermediate_base_tensor_id_to_output_idx[ + id(o._base) + ] = new_out_idx + intermediate_bases.append(o._base) + elif ( + # See https://github.com/pytorch/pytorch/issues/100348 for this case. + # This protects against the specific case where a user fn returns (output, output.detach()) + out_tensor_alias_counts[curr_storage] > 1 + and len(outs_with_identical_metadata_that_require_grad) > 0 + and not o.requires_grad + ): + assert len(outs_with_identical_metadata_that_require_grad) > 0 + # In theory we could use any of these tensors to regenerate the aliased outputs from, + # since they all alias each other and have identical metatadata + out_alias = outs_with_identical_metadata_that_require_grad[0] + existing_out_idx = out_tensor_ids[id(out_alias)] + output_type = OutputType.alias_of_intermediate_base_is_user_output + base_idx = existing_out_idx + else: + output_type = OutputType.non_alias + base_idx = None + + if isinstance(o, torch.Tensor): + dynamic_dims = { + i for i, s in enumerate(o.shape) if not is_concrete_int(s) + } + else: + dynamic_dims = None + out_info = OutputAliasInfo( + output_type=output_type, + raw_type=type(o), + base_idx=base_idx, + dynamic_dims=dynamic_dims, + requires_grad=isinstance(o, torch.Tensor) and o.requires_grad, + ) + output_info.append(out_info) + + # See Note [AOT Autograd: Views to avoid tangents aliasing inputs] + def view_avoid_dupes_with_primals(t): + if isinstance(t, Tensor) and is_traceable_wrapper_subclass(t): + return transform_subclass( + t, lambda _, inner_t: view_avoid_dupes_with_primals(inner_t) + ) + if isinstance(t, Tensor): + return t.view(t.shape) + return t + + # This analysis function returns *only* the outputs that are meant to be tangents to the backwards. 
+ # Anything that aliases (inputs returned in the fw due to metadata mutations, or outputs that alias inputs/intermediates) + # are *regenerated* later, and not used directly in the autograd graph + f_input_tangents = [ + inp + for inp, info in zip(flat_f_args, input_info) + if info.mutation_type == MutationType.MUTATED_OUT_GRAPH + and info.mutates_data + and info.requires_grad + ] + f_output_tangents = [ + o + for o, info in zip(flat_f_outs, output_info) + if info.output_type + in [ + OutputType.non_alias, + OutputType.unsafe_view_alias, + OutputType.custom_function_view, + ] + and issubclass(info.raw_type, torch.Tensor) + and info.requires_grad + ] + # intermediate bases are also included in the backward graph + f_tangents = f_input_tangents + f_output_tangents + intermediate_bases + traced_tangents = pytree.tree_map(from_fun, f_tangents) + traced_tangents = pytree.tree_map( + view_avoid_dupes_with_primals, traced_tangents + ) + user_outs = pytree.tree_map(from_fun, f_output_tangents) + + f_mutated_inputs = [ + inp + for inp, info in zip(flat_f_args, input_info) + if info.mutation_type == MutationType.MUTATED_OUT_GRAPH + ] + f_metadata_mutated_inputs = [ + inp for inp, info in zip(flat_f_args, input_info) if info.mutates_metadata + ] + # This logic (annoyingly) re-figures out exactly what the outputs to the compiled fw graph will be. + # When handling subclasses, we need info about **all** outputs of compiled forward graph, + # so we know precisely which graph outputs to wrap back into tensor subclasses + # Ideally we would refactor this so not have an is_train flag, and have the separate + # inference and training paths decide which inputs/output to ask for subclass info on. + # However, we currently stash indexing information on each SubclassMeta about its order + # in the graph outputs list. + f_fw_graph_outs = list(flat_f_outs) + if is_train or not keep_input_mutations: + f_fw_graph_outs = f_mutated_inputs + f_fw_graph_outs + else: + # even when "keep_input_mutations" is True, + # we never keep metadata-only mutations in the fw graph + f_fw_graph_outs = f_metadata_mutated_inputs + f_fw_graph_outs + if is_train: + f_fw_graph_outs = f_fw_graph_outs + intermediate_bases + fw_graph_outs = pytree.tree_map(from_fun, f_fw_graph_outs) + + grad_enabled_mutation = None + if torch.is_grad_enabled() != prior_grad_enabled: + grad_enabled_mutation = torch.is_grad_enabled() + torch.set_grad_enabled( + prior_grad_enabled + ) # Restore the prior state after tracing it + log.debug( + ( + "grad_mode mutation encountered in graph. 
" + "Will emit mutation epilogue, to set grad_mode=%s" + ), + grad_enabled_mutation, + ) + + metadata = ViewAndMutationMeta( + input_info=input_info, + output_info=output_info, + num_intermediate_bases=len(intermediate_bases), + keep_input_mutations=keep_input_mutations, + traced_tangents=traced_tangents, + subclass_inp_meta=create_subclass_meta(flat_args), + subclass_fw_graph_out_meta=create_subclass_meta(fw_graph_outs), + subclass_tangent_meta=create_subclass_meta(traced_tangents), + is_train=is_train, + grad_enabled_mutation=grad_enabled_mutation, + tokens=mode._tokens, + ) + return metadata + + return inner diff --git a/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..e38f5cab3dd8f0482feb364c6cf64995a197f6ce --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/dispatch_and_compile_graph.py @@ -0,0 +1,192 @@ +""" +This module dispatches the graphs to either the forward-only or joint compilation +pathways, taking into account the AOTConfig and the collected ViewAndMutationMetadata. +""" + +from typing import Any, Callable, List, Optional, Tuple, Union + +import torch +import torch.utils._pytree as pytree +import torch.utils.dlpack +from torch import Tensor +from torch._dispatch.python import enable_python_dispatcher +from torch._dynamo.utils import lazy_format_graph_code +from torch._logging import getArtifactLogger, trace_structured +from torch._subclasses.functional_tensor import FunctionalTensorMode +from torch.fx.experimental.proxy_tensor import make_fx + +from .functional_utils import ( + assert_functional_graph, + propagate_input_mutation_stacktraces, +) +from .schemas import AOTConfig, SubclassMeta, ViewAndMutationMeta +from .traced_function_transforms import ( + aot_dispatch_subclass, + create_functionalized_fn, + create_joint, + fn_input_mutations_to_outputs, + fn_prepped_for_autograd, +) + +aot_graphs_log = getArtifactLogger(__name__, "aot_graphs") + + +def _create_graph(f, args, *, aot_config: AOTConfig) -> torch.fx.GraphModule: + # FunctionalTensorMode must be enabled here. + # See Note [Accessing .grad_fn on FunctionalTensor] + with enable_python_dispatcher(), FunctionalTensorMode( + pre_dispatch=aot_config.pre_dispatch, export=aot_config.is_export + ): + fx_g = make_fx( + f, + decomposition_table=aot_config.decompositions, + record_module_stack=True, + pre_dispatch=aot_config.pre_dispatch, + )(*args) + + return fx_g + + +def aot_dispatch_base_graph( + flat_fn, + flat_args: List[Tensor], + aot_config: AOTConfig, + *, + fw_metadata: ViewAndMutationMeta, +) -> Union[Callable, Tuple[Callable, List[Any], Optional[SubclassMeta]]]: + # aot_dispatch_base requires functionalization, but doesn't need to handle as many cases as the autograd case. 
+ # The cases that aot_dispatch_base doesn't need to handle include: + # - outputs that are aliases of graph intermediates + # - outputs that are aliases of graph inputs + # While cases that it does need to handle include: + # - input mutations (including when inputs are aliases of each other) + # - input metadata mutations + fn_to_trace = fn_input_mutations_to_outputs( + flat_fn, + fw_metadata, + keep_data_input_mutations=aot_config.keep_inference_input_mutations, + ) + + fn_to_trace, updated_flat_args = create_functionalized_fn( + fn_to_trace, + flat_args, + meta=fw_metadata, + aot_config=aot_config, + trace_joint=False, + ) + + ( + fn_to_trace, + updated_flat_args_subclasses_desugared, + maybe_subclass_meta, + ) = aot_dispatch_subclass( + fn_to_trace, + updated_flat_args, + is_joint_structure=False, + meta=fw_metadata, + fw_only=flat_fn, + ) + + fw_module = _create_graph( + fn_to_trace, + updated_flat_args_subclasses_desugared, + aot_config=aot_config, + ) + + # As long as we opted to remove input mutations, then + # there should be *NO* mutating ops in the graph at this point. + copy_count = assert_functional_graph(fw_module.graph) + + fw_module.graph.eliminate_dead_code() + fw_module.recompile() + + copy_count2 = assert_functional_graph(fw_module.graph) + propagate_input_mutation_stacktraces(fw_module.graph) + + assert copy_count == copy_count2 + + if aot_config.enable_log: + aot_graphs_log.info( + "%s", lazy_format_graph_code("Forward graph", fw_module, aot_config.aot_id) + ) + trace_structured( + "aot_forward_graph", + payload_fn=lambda: fw_module.print_readable(print_output=False), + ) + + # TODO: should factor this into a separate function for export that always only returns just the graph. + if aot_config.is_export: + assert ( + maybe_subclass_meta is None + ), "aot_export_module does not support tensor subclass inputs for now." + return fw_module + return fw_module, list(updated_flat_args_subclasses_desugared), maybe_subclass_meta + + +# Has the precondition that there +# are no duplicate arguments in flat_args (e.g., the same Tensor +# object never shows up twice. However, two tensor inputs MAY alias +# the same storage, so long as they have separate TensorImpls.) +def aot_dispatch_autograd_graph( + flat_fn, + flat_args: List[Any], + aot_config: AOTConfig, + *, + fw_metadata: ViewAndMutationMeta, +) -> Union[Callable, Tuple[Callable, List[Any], Optional[SubclassMeta]]]: + # traced_tangents corresponds to the set of outputs in the traced forward that should get grad_outputs in the traced backward. + # It includes outputs of the original forward, *and* any updated inputs due to input mutations. + # However, it does *not* include any outputs that are aliases of inputs or intermediates, or any metadata-only input mutations. 
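+    # Hedged example (an assumption, for illustration only) of what ends up in traced_tangents:
+    #     def f(x):               # x: a non-leaf tensor that requires grad
+    #         x.mul_(2)           # data mutation on an input
+    #         return x + 1, x.view(-1)
+    # traced_tangents would roughly be [updated x, x + 1]; the x.view(-1) output aliases an
+    # input, so it is regenerated at runtime instead of receiving its own grad_output.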
+ traced_tangents = pytree.tree_map( + lambda x: x.detach().contiguous() if isinstance(x, Tensor) else x, + fw_metadata.traced_tangents, + ) + + joint_inputs = (flat_args, traced_tangents) + + fn_prepared_for_autograd = fn_prepped_for_autograd( + flat_fn, + fw_metadata, + ) + joint_fn_to_trace = create_joint(fn_prepared_for_autograd, aot_config=aot_config) + + joint_fn_to_trace, updated_joint_inputs = create_functionalized_fn( + joint_fn_to_trace, + joint_inputs, + meta=fw_metadata, + aot_config=aot_config, + trace_joint=True, + ) + + subclass_tracing_info = aot_dispatch_subclass( + joint_fn_to_trace, + updated_joint_inputs, + is_joint_structure=True, + meta=fw_metadata, + fw_only=flat_fn, + ) + + joint_fn_to_trace = subclass_tracing_info.plain_tensor_trace_fn + updated_joint_inputs = subclass_tracing_info.plain_tensor_args + maybe_subclass_meta = subclass_tracing_info.maybe_subclass_meta + + fx_g = _create_graph(joint_fn_to_trace, updated_joint_inputs, aot_config=aot_config) + + # There should be *NO* mutating ops in the graph at this point. + assert_functional_graph(fx_g.graph) + + # Redundant with the check above, but worth having in case tracing introduced + # a fake tensor. Unlikely. + # See Note: [Fake Modules and AOTAutograd] + torch._dynamo.utils.assert_no_fake_params_or_buffers(fx_g) + fx_g.graph.eliminate_dead_code() + fx_g.recompile() + # TODO: in AOTAutograd, we create metadata like _indices_of_inps_to_detach to detect + # when we need to manually detach() some inputs in the forward. + # Higher order ops might eventually need to do the same. + if aot_config.is_export: + assert ( + maybe_subclass_meta is None + ), "aot_export_module does not support tensor subclass inputs for now." + return fx_g + return fx_g, updated_joint_inputs, maybe_subclass_meta diff --git a/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/functional_utils.py b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/functional_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1124e4f3a1dfdb1719dcf0732763ee819c9b4c1d --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/functional_utils.py @@ -0,0 +1,370 @@ +""" +This file contains utilities related to functionalization in AOTAutograd: +1. converting to/from functional tensors +2. detecting Tensor mutations - both metadata and Tensor value +3. regenerating/replaying views from their base +4. checking if a graph is functional i.e. whether it contains any mutation ops +""" + +import torch +from torch import Tensor +from torch._subclasses.fake_tensor import FakeTensor +from torch._subclasses.functional_tensor import FunctionalTensor +from torch.fx.experimental.symbolic_shapes import definitely_true, sym_eq +from torch.multiprocessing.reductions import StorageWeakRef +from torch.utils._python_dispatch import ( + is_traceable_wrapper_subclass, + transform_subclass, +) + + +def to_fun(t): + if isinstance(t, Tensor): + if is_traceable_wrapper_subclass(t): + # See Note [Functionalization always runs last] + # This means that if we want to "functionalize" a subclass, we need to ensure that the functional wrapper + # goes at the bottom. 
+ # recurse here, so we can support nested wrapper subclasses + out = transform_subclass(t, lambda _, inner_t: to_fun(inner_t)) + torch._mirror_autograd_meta_to(t, out) # type: ignore[attr-defined] + return out + else: + return FunctionalTensor.to_functional(t) + else: + return t + + +def sync_functional_tensor(t): + if is_traceable_wrapper_subclass(t): + attrs, ctx = t.__tensor_flatten__() # type: ignore[attr-defined] + for attr in attrs: + sync_functional_tensor(getattr(t, attr)) + else: + torch._sync(t) + + +# When subclasses are involved, t here will usually look something like: +# SubclassA(SubclassB(FunctionalTensor(_to_fun_tensor(FakeTensor)))) +def from_fun(t): + if isinstance(t, Tensor) and is_traceable_wrapper_subclass(t): + # See Note [Functionalization always runs last] + # This means that if we want to "functionalize" a subclass, we need to ensure that the functional wrapper + # goes at the bottom. + # recurse here, so we can support nested wrapper subclasses + out = transform_subclass(t, lambda _, inner_t: from_fun(inner_t)) + torch._mirror_autograd_meta_to(t, out) # type: ignore[attr-defined] + return out + + if not isinstance(t, FunctionalTensor): + # quick sanity assert + if isinstance(t, torch.Tensor): + assert not torch._is_functional_tensor(t) # type: ignore[attr-defined] + return t + sync_functional_tensor(t) + return torch._from_functional_tensor(t.elem) + + +def is_fun(t): + if isinstance(t, Tensor) and is_traceable_wrapper_subclass(t): + # See Note [Functionalization always runs last] + # This means that if we want to "functionalize" a subclass, we need to ensure that the functional wrapper + # goes at the bottom. + # recurse here, so we can support nested wrapper subclasses + t_attrs, _ = t.__tensor_flatten__() # type: ignore[attr-defined] + t_inners = [getattr(t, attr) for attr in t_attrs] + any_fun = any(is_fun(x) for x in t_inners) + all_fun = all(is_fun(x) for x in t_inners) + assert any_fun == all_fun + return any_fun + + return isinstance(t, FunctionalTensor) + + +# t here is either +# (1) A FunctionalTensor(_to_functional_tensor(FakeTensor)) +# (2) A traceable tensor subclass that holds a FunctionalTensor +# (3) Not a tensor +def has_data_mutation(t): + if is_traceable_wrapper_subclass(t): + attrs, _ = t.__tensor_flatten__() + # A tensor subclass was updated if any of its inner elements were updated + return any(has_data_mutation(getattr(t, attr)) for attr in attrs) + else: + if isinstance(t, torch.Tensor): + assert isinstance(t, FunctionalTensor) + return torch._functionalize_has_data_mutation(t.elem) # type: ignore[attr-defined] + return False + + +def are_all_mutations_hidden_from_autograd(t): + if is_traceable_wrapper_subclass(t): + attrs, _ = t.__tensor_flatten__() + # If all inner elements are mutations hidden from autograd, then it is a mutation hidden from autograd. 
+ return all( + are_all_mutations_hidden_from_autograd(getattr(t, attr)) for attr in attrs + ) + elif isinstance(t, torch.Tensor): + assert isinstance(t, FunctionalTensor) + return torch._functionalize_are_all_mutations_hidden_from_autograd(t.elem) + else: + return False + + +def are_all_mutations_under_no_grad_or_inference_mode(t): + if is_traceable_wrapper_subclass(t): + attrs, _ = t.__tensor_flatten__() + return all( + are_all_mutations_under_no_grad_or_inference_mode(getattr(t, attr)) + for attr in attrs + ) + else: + assert isinstance(t, FunctionalTensor) + return torch._functionalize_are_all_mutations_under_no_grad_or_inference_mode( + t.elem + ) + + +# f_arg here is either +# (1) A FunctionalTensor(_to_functional_tensor(FakeTensor)) +# (2) A traceable tensor subclass that holds a FunctionalTensor +# (3) Not a tensor +# Assumption: arg promises to be the "original" tensor wrapped by f_arg +# Note: "storage mutations" coming from set_() are a type of metadata mutation. So: +# - check_only_storage_mutation=True: only return true if there was a storage mutation +# - check_only_storage_mutation=Flse: return true if there was any metadata mutation (including a storage mutation) +def has_metadata_mutation(f_arg, arg, *, check_only_storage_mutation: bool): + if is_traceable_wrapper_subclass(f_arg): + attrs, _ = f_arg.__tensor_flatten__() + # A tensor subclass was updated if any of its inner elements were updated + f_inner_ts = [getattr(f_arg, attr) for attr in attrs] + inner_ts = [getattr(arg, attr) for attr in attrs] + return any( + has_metadata_mutation( + f_inner_t, + inner_t, + check_only_storage_mutation=check_only_storage_mutation, + ) + for f_inner_t, inner_t in zip(f_inner_ts, inner_ts) + ) + else: + if not isinstance(f_arg, torch.Tensor): + assert not isinstance(arg, torch.Tensor) + return False + assert isinstance(f_arg, FunctionalTensor) + assert isinstance(arg, FakeTensor) + + arg_after = torch._from_functional_tensor(f_arg.elem) + # This is true if the current tensor experienced at least one set_() call + maybe_storage_changed = torch._functionalize_was_storage_changed(f_arg.elem) # type: ignore[attr-defined] + # However, multiple set_() calls can cancel out. So we also check whether the + # storage of the tensor has changed. + # Note: if an input experienced two set_() calls that cancel out, **and** + # it experiences an data mutation, we pessimistically think that the set_() + # call is necessary here. We could in theory fix this, but this will + # hopefully never happen in user code, and is not needed for fsdp. + same_storages = StorageWeakRef(arg.untyped_storage()) == StorageWeakRef( + arg_after.untyped_storage() + ) + has_storage_metadata_mutation = maybe_storage_changed and not same_storages + if check_only_storage_mutation: + return has_storage_metadata_mutation + + # storage metadata mutation is a type of metadata mutation, so return true if we saw one + if has_storage_metadata_mutation: + return True + + maybe_metadata_mutated = torch._functionalize_has_metadata_mutation(f_arg.elem) # type: ignore[attr-defined] + # This is true if the current tensor experienced at least one metadata mutation. + # So if false, we know there was no metadata mutation + if not maybe_metadata_mutated: + return False + + # However, multi metadata mutations can cancel out. + # So we also check if the concrete sizes/strides on the tensor have changed. 
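+        # Hedged illustration (assumed, not from the original source): two metadata mutations
+        # that cancel out, e.g.
+        #     a.transpose_(0, 1)
+        #     a.transpose_(0, 1)
+        # leave the sizes/strides/storage_offset unchanged, so the checks below report no
+        # metadata mutation even though a metadata mutation was recorded.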
+ same_sizes = arg.shape == arg_after.shape + same_strides = arg.stride() == arg_after.stride() + same_offsets = arg.storage_offset() == arg_after.storage_offset() + has_metadata_mutation_ = maybe_metadata_mutated and not ( + same_sizes and same_strides and same_offsets + ) + # We consider a tensor to have been metadata mutated if its storage was mutated through a set_() call. + return has_metadata_mutation_ + + +def gen_alias_from_base(aliased_base_tensor, target_meta_tensor, target_requires_grad): + # Try to do view-replay if possible. + # fall back to .as_strided() if we can't. + if target_meta_tensor._base is not None: + # The base that we want to replay our view off of might have a different shape than the view's original base. + b = target_meta_tensor._base + abt = aliased_base_tensor + # Don't unnecessarily call as_strided if nothing changed; as_strided's + # backward is poorly implemented and slow + if abt is not b and ( + abt.size() != b.size() + or abt.stride() != b.stride() + or abt.storage_offset() != b.storage_offset() + ): + reshaped_base_tensor = aliased_base_tensor.as_strided( + b.size(), b.stride(), b.storage_offset() + ) + else: + reshaped_base_tensor = aliased_base_tensor + out = target_meta_tensor._view_func(reshaped_base_tensor) + # This shape mismatch can happen due to a bug in inplace/view handling in autograd. + # Try putting a breakpoint here and running + # `test/functorch/test_aotdispatch TestAOTAutograd.test_output_all_alias_types` + # Also, https://github.com/pytorch/pytorch/issues/49825 + # + # As a stopgap, we'll fall back to as_strided. + if out is not None and out.shape == target_meta_tensor.shape: + if aliased_base_tensor.requires_grad and not target_requires_grad: + out = out.detach() + elif not aliased_base_tensor.requires_grad and target_requires_grad: + out.requires_grad_(True) + return out + size = target_meta_tensor.size() + stride = target_meta_tensor.stride() + storage_offset = target_meta_tensor.storage_offset() + if aliased_base_tensor.is_complex() and not target_meta_tensor.is_complex(): + aliased_out = torch.view_as_real(aliased_base_tensor).as_strided( + size, stride, storage_offset + ) + elif not aliased_base_tensor.is_complex() and target_meta_tensor.is_complex(): + aliased_out = torch.view_as_complex(aliased_base_tensor).as_strided( + size, stride, storage_offset + ) + else: + aliased_out = aliased_base_tensor.as_strided(size, stride, storage_offset) + # For outputs aliasing inputs, we need to check if the requires-gradness has changed. + if aliased_base_tensor.requires_grad and not target_requires_grad: + aliased_out = aliased_out.detach() + elif not aliased_base_tensor.requires_grad and target_requires_grad: + aliased_out.requires_grad_(True) + # For outputs aliasing inputs, we need to check if the dtype has changed. + # as_strided() is the "most generic" view, but it does not cover cross-dtype views + if aliased_out.dtype != target_meta_tensor.dtype: + aliased_out = aliased_out.view(target_meta_tensor.dtype) + return aliased_out + + +def has_same_metadata(t1, t2): + return ( + definitely_true(sym_eq(t1.size(), t2.size())) + and definitely_true(sym_eq(t1.stride(), t2.stride())) + and definitely_true(t1.storage_offset() == t2.storage_offset()) + and t1.is_conj() == t2.is_conj() + and t1.is_neg() == t2.is_neg() + ) + + +# new_arg and arg here are either: +# (1) both a FakeTensor +# (2) both a traceable tensor subclass that holds a FakeTensor +# Pre-condition: the two args are the "old" and "new" inputs from running functionalization. 
+# When we run functionalization and wrap our inputs into FunctionalTensors, +# we can detect whether or not an input was mutated by checking to see if the inner tensor has changed +# +# Normally it would be enough just to check if arg is new_arg, which is normally enough for functionalization +# to confirm that inputs were not mutated when running the user's model with functionalization on. +# But when we have subclass inputs, we can't rely on that: +# `from_fun(to_fun(x)) is x` will return False, because the call to `from_fun` constructs +# a brand new subclass instance: we are calling __tensor_unflatten__, and going +# from Subclass(FakeTensor) to Subclass(FunctionalTensor(FakeTensor)) +def was_tensor_updated(arg, new_arg): + if is_traceable_wrapper_subclass(arg): + assert is_traceable_wrapper_subclass(new_arg) + attrs, _ = arg.__tensor_flatten__() + new_attrs, _ = new_arg.__tensor_flatten__() + assert attrs == new_attrs + # A tensor subclass was updated if any of its inner elements were updated + return any( + was_tensor_updated(getattr(arg, attr), getattr(new_arg, attr)) + for attr in attrs + ) + else: + return arg is not new_arg + + +# new_arg and arg here are either: +# (1) both a FakeTensor +# (2) both a traceable tensor subclass that holds a FakeTensor +# Pre-condition: the two args are the "old" and "new" inputs from running functionalization. +# When we run functionalization and wrap our inputs into FunctionalTensors, +# we can detect whether or not an input was mutated by checking to see if the inner tensor has changed, +# but shares storage with the old input +def was_tensor_metadata_updated(arg, new_arg): + if is_traceable_wrapper_subclass(arg): + assert is_traceable_wrapper_subclass(new_arg) + attrs, _ = arg.__tensor_flatten__() + new_attrs, _ = new_arg.__tensor_flatten__() + assert attrs == new_attrs + # A tensor subclass was updated if any of its inner elements were updated + return any( + was_tensor_metadata_updated(getattr(arg, attr), getattr(new_arg, attr)) + for attr in attrs + ) + else: + return arg is not new_arg and StorageWeakRef( + arg.untyped_storage() + ) == StorageWeakRef(new_arg.untyped_storage()) + + +# Returns the number of detected copy_ +def assert_functional_graph(fx_g: torch.fx.Graph) -> int: + placeholders = set() + copy_count = 0 + # NB: It would also be nice to verify that the mutations all happen at the + # end, but we also do some administrative views after mutations so this + # isn't actually true. (TODO: Could this cause problems for Inductor?) 
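+    # Hedged sketch (assumption; node names and formatting are illustrative) of the only mutation this check accepts:
+    #     %arg0 : placeholder
+    #     %mul  : call_function[target=aten.mul.Tensor](%arg0, 2)
+    #     %copy : call_function[target=aten.copy_.default](%arg0, %mul)
+    # i.e. at most one copy_ per graph input, writing into a placeholder directly; any other op
+    # with a mutable schema trips the assert in the loop below.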
+ for n in fx_g.nodes: + if n.op == "placeholder": + placeholders.add(n) + if isinstance(n.target, torch._ops.OpOverload): + if n.target is torch.ops.aten.copy_.default: + suffix = True + # Can only copy_ into an input, and can only do so once + assert n.args[0] in placeholders + placeholders.remove(n.args[0]) + copy_count += 1 + else: + assert ( + not n.target._schema.is_mutable + ), f"aot_autograd expected to have an entirely functional graph, but found {n.format_node()}" + return copy_count + + +def propagate_input_mutation_stacktraces(fx_g: torch.fx.Graph) -> None: + placeholders = set() + for n in fx_g.nodes: + if n.op == "placeholder": + placeholders.add(n) + if isinstance(n.target, torch._ops.OpOverload): + if n.target is torch.ops.aten.copy_.default: + # Can only copy_ into an input, and can only do so once + assert n.args[0] in placeholders + placeholders.remove(n.args[0]) + copy_from_node = n.args[1] + # Pre-condition: every node has a "stack_trace" field in its meta, + # but copy_() nodes do not (since we manually added them during functionalization). + # Instead, we manually propagate here. + if "stack_trace" in copy_from_node.meta: + assert "stack_trace" not in n.meta, str(n) + n.meta["stack_trace"] = copy_from_node.meta["stack_trace"] + + +def _check_if_mutation_can_be_in_graph( + keep_input_mutations: bool, + mutates_data, + mutates_metadata, + mutations_hidden_from_autograd, + mutations_under_no_grad_or_inference_mode, + requires_grad, +): + if keep_input_mutations: + return mutates_data and ( + (not mutates_metadata and not requires_grad) + or mutations_hidden_from_autograd + or mutations_under_no_grad_or_inference_mode + ) + return False diff --git a/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/input_output_analysis.py b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/input_output_analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..fe926247820f2ccc9d9063e0e3bb671ac0ce1096 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/input_output_analysis.py @@ -0,0 +1,432 @@ +""" +This module is one of the analysis modules - it takes as input a function or graph +and some preexisting properties, and returns some data that is useful for deciding +how to further proceed with compilation or construct runtime wrappers. + +In particular, the following analyses are provided: +1. Refine the view and mutation metadata collected previously - removing duplicate + inputs or mapping views to their bases. +2. We also analyze the function signature for export graphs. 
+""" + +import itertools +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +import torch.utils._pytree as pytree +from torch import Tensor +from torch._subclasses.functional_tensor import FunctionalTensor +from torch.fx.experimental.symbolic_shapes import is_concrete_int +from .schemas import ( + BackwardSignature, + GraphSignature, + InputAliasInfo, + OutputAliasInfo, + OutputType, + ViewAndMutationMeta, +) +from .utils import strict_zip + +zip = strict_zip + + +def remove_dupe_metadata( + m: ViewAndMutationMeta, + keep_arg_mask: List[bool], + add_dupe_map: List[int], +) -> ViewAndMutationMeta: + assert len(m.input_info) == len(keep_arg_mask) + # Easy invariant: the first argument should never be a dupe (it will be kept) + assert len(keep_arg_mask) > 0 and keep_arg_mask[0] + + # Filter dupe'd mutated inputs out of traced_tangents + num_data_mutations = len([x for x in m.input_info if x.mutates_data]) + other_traced_tangents = m.traced_tangents[num_data_mutations:] + inp_traced_tangents = m.traced_tangents[:num_data_mutations] + filtered_inp_traced_tangents = [ + x + for i, x in enumerate(inp_traced_tangents) + if keep_arg_mask[m.mutated_inp_runtime_indices[i]] + ] + traced_tangents = filtered_inp_traced_tangents + other_traced_tangents + + return ViewAndMutationMeta( + input_info=[x for i, x in enumerate(m.input_info) if keep_arg_mask[i]], + # For outputs that are views of inputs, we store the index of the input that the output + # was generated from. Need to update that index to account for removed dupes. + output_info=[ + OutputAliasInfo( + output_type=o.output_type, + raw_type=o.raw_type, + dynamic_dims=o.dynamic_dims, + base_idx=None if o.base_idx is None else add_dupe_map[o.base_idx], + requires_grad=o.requires_grad, + ) + for o in m.output_info + ], + num_intermediate_bases=m.num_intermediate_bases, + keep_input_mutations=m.keep_input_mutations, + traced_tangents=traced_tangents, + # We are guaranteed not to get here, since dupes are not supported today with subclass inputs. + subclass_inp_meta=[], + subclass_fw_graph_out_meta=[], + subclass_tangent_meta=[], + is_train=m.is_train, + ) + + +# Given our ViewAndMutation metadata, this fn constructs a new set of metadata, +# after adding synthetic base arguments to the function. +# Most of the work in this fn is slogging through all of the metadata corresponding to inputs, +# and updating it with our synthetic base calling convention. +# +# When config.debug_assert is set, we automatically regenerate the metadata +# and compare it to this output for sanity. +# +# In addition to the updated metadata, also return the list of input indices +# that will need to be updated in the synthetic base epilogue + + +# Given our ViewAndMutation metadata, this fn constructs a new set of metadata, +# after adding synthetic base arguments to the function. +# Most of the work in this fn is slogging through all of the metadata corresponding to inputs, +# and updating it with our synthetic base calling convention. +# +# When config.debug_assert is set, we automatically regenerate the metadata +# and compare it to this output for sanity. 
+# +# In addition to the updated metadata, also return the list of input indices +# that will need to be updated in the synthetic base epilogue +def create_synthetic_base_metadata( + m: ViewAndMutationMeta, + # Maps each outer argument idx to its inner idx (or, if this outer arg is generated from a + # synthetic base, you get a tuple of (i, TensorMeta), telling you the base tensor idx, and view metadata) + synthetic_base_info: List[Union[int, Tuple[int, torch.Tensor]]], + outer_args: List[Any], + inner_args: List[Any], +) -> Tuple[ViewAndMutationMeta, List[int]]: + # maps inner arg indices to outer arg indices + synthetic_base_to_indices: Dict[int, List[int]] = {} + for inner_idx in range(len(inner_args)): + outer_aliased_indices_of_current_base_arg = [ + outer_idx + for outer_idx, inner_idx_or_tuple in enumerate(synthetic_base_info) + if (isinstance(inner_idx_or_tuple, int) and inner_idx_or_tuple == inner_idx) + or ( + isinstance(inner_idx_or_tuple, tuple) + and inner_idx_or_tuple[0] == inner_idx + ) + ] + synthetic_base_to_indices[inner_idx] = outer_aliased_indices_of_current_base_arg + + # given the requires_grad info on mutated inputs, + # generate the requires_grad info on those same mutated inputs, but after constructing synthetic bases. + input_infos = [] + for outer_indices in synthetic_base_to_indices.values(): + # leaf-ness should be all-or-nothing for aliased tensor. + # (aka if "a" and "b" are views, then a.is_leaf == b.is_leaf) + any_leaf = any(m.input_info[x].is_leaf for x in outer_indices) + all_leaf = all(m.input_info[x].is_leaf for x in outer_indices) + assert any_leaf == all_leaf + + mutates_data = ( + True + if len(outer_indices) > 1 + else m.input_info[outer_indices[0]].mutates_data + ) + mutates_metadata = ( + False + if len(outer_indices) > 1 + else m.input_info[outer_indices[0]].mutates_metadata + ) + requires_grad = any(m.input_info[x].requires_grad for x in outer_indices) + mutations_hidden_from_autograd = all( + m.input_info[x].mutations_hidden_from_autograd for x in outer_indices + ) + mutations_under_no_grad_or_inference_mode = all( + m.input_info[x].mutations_under_no_grad_or_inference_mode + for x in outer_indices + ) + + inpt_info = InputAliasInfo( + # If len(outer_indices) > 1, then this input is a synthetic base. + # The invariant is that to the rest of aot autograd, synthetic bases only show up if + # one of their aliases gets a data mutation. And if any of their aliases get metadata + # mutations, they will be hidden from the rest of aot autograd. 
+ mutates_data=mutates_data, + mutates_metadata=mutates_metadata, + mutations_hidden_from_autograd=all( + m.input_info[x].mutations_hidden_from_autograd for x in outer_indices + ), + mutates_storage_metadata=False + if len(outer_indices) > 1 + else m.input_info[outer_indices[0]].mutates_storage_metadata, + mutations_under_no_grad_or_inference_mode=mutations_under_no_grad_or_inference_mode, + is_leaf=any_leaf, + requires_grad=requires_grad, + keep_input_mutations=m.keep_input_mutations, + ) + input_infos.append(inpt_info) + + # Find any inputs that fulfill the following criteria: + # (1) They are part of a synthetic base (because they alias another input, + # and at least one input experiences a data mutation) + # (2) They experience a metadata mutation + outer_aliased_arg_idx_with_metadata_mutations = [ + outer_idx + for outer_idx, inpt_info in enumerate(m.input_info) + if inpt_info.mutates_metadata + and not isinstance(synthetic_base_info[outer_idx], int) + ] + + # grab the original requires grad info on the outputs, except the ones from the mutated inputs + input_metadata_output_info = [ + OutputAliasInfo( + output_type=OutputType.alias_of_input, + raw_type=FunctionalTensor, + dynamic_dims={ + i + for i, s in enumerate(outer_args[outer_idx].shape) + if not is_concrete_int(s) + }, + base_idx=synthetic_base_info[outer_idx][0], # type: ignore[index] + requires_grad=outer_args[outer_idx].requires_grad, + ) + for outer_idx in outer_aliased_arg_idx_with_metadata_mutations + ] + existing_output_infos = [] + for o in m.output_info: + new_base_idx = ( + None + if o.base_idx is None + else ( + synthetic_base_info[o.base_idx] + if isinstance(synthetic_base_info[o.base_idx], int) + else synthetic_base_info[o.base_idx][0] # type: ignore[index] + ) + ) + # If base_idx is changed for OutputType.is_input, we need to update the output type to reflect the change + new_output_type = ( + OutputType.alias_of_input + if o.output_type == OutputType.is_input and o.base_idx != new_base_idx + else o.output_type + ) + existing_output_infos.append( + OutputAliasInfo( + output_type=new_output_type, + raw_type=o.raw_type, + dynamic_dims=o.dynamic_dims, + # Map the input idx pre-synthetic-bases to the new idx post-synthetic-bases + base_idx=new_base_idx, # type: ignore[arg-type] + requires_grad=o.requires_grad, + ) + ) + + inner_mutated_tangents = [ + x + for inner_idx, x in enumerate(inner_args) + if input_infos[inner_idx].mutates_data and input_infos[inner_idx].requires_grad + ] + + output_info = existing_output_infos + input_metadata_output_info + # Regenerate traced tangents to include mutated inputs including synthetic bases + traced_tangents = ( + inner_mutated_tangents + m.traced_tangents[len(inner_mutated_tangents) :] + ) + + return ( + ViewAndMutationMeta( + input_info=input_infos, + output_info=output_info, + num_intermediate_bases=m.num_intermediate_bases, + keep_input_mutations=m.keep_input_mutations, + traced_tangents=traced_tangents, + # We are guaranteed not to get here, since synthetic_base codepaths are not supported today with subclass inputs. 
+ subclass_inp_meta=[], + subclass_fw_graph_out_meta=[], + subclass_tangent_meta=[], + is_train=m.is_train, + ), + outer_aliased_arg_idx_with_metadata_mutations, + ) + + +def _get_last_mem_address(x): + out = x.storage_offset() + for size, stride in zip(x.size(), x.stride()): + out += (size - 1) * stride + return out + + +# Assumption: x and y are known to share a storage, and we are trying to determine +# if their memory is actually completely disjoint, based on sizes/strides/storage_offset +def _tensors_definitely_do_not_overlap(x, y): + if x is y: + return False + if x.numel() == 0 or y.numel() == 0: + return True + + # Make x always on the left + if x.storage_offset() > y.storage_offset(): + x, y = y, x + # Short-circuit in the "obvious" overlapping case: both tensors are contiguous + if x.is_contiguous() and y.is_contiguous(): + if x.storage_offset() + x.numel() > y.storage_offset(): + # definitely overlap + return False + else: + # definitely no overlap + return True + + # Short-circuit: if last memory address of x is < start of y, then not overlapping. + x_last = _get_last_mem_address(x) + if x_last < y.storage_offset(): + return True + + if x.dim() == 2 and y.dim() == 2 and x.stride(1) == 1 and y.stride(1) == 1: + # This cases is needed for the shampoo optimizer. + # All tensors are 2d (non-contiguous), have the same outer stride, and have an inner stride of 1 + # (so rows are contiguous) + if x.stride(0) == y.stride(0): + offset_delta = y.storage_offset() - x.storage_offset() + if offset_delta < x.size(1): + # definitely overlaps (row 0 of y overlaps with row 0 of x) + # Example: + # base = torch.arange(32).reshape(4, 8) + # x = base.narrow(1, 0, 4) + # x: size=(4, 4), stride=(8, 1), offset=0 + # y = base.narrow(1, 3, 4) + # y: size=(4, 4), stride=(8, 1), offset=3 + return False + x_total_elems_covered = x.stride(0) * (x.size(0) - 1) + x.size(1) + if x_total_elems_covered <= offset_delta: + # definitely does not overlap (last byte of x is before start of y) + # Example: + # x: size=(4, 4), stride=(8, 1), offset=0 (last byte is 27) + # y: size=(4, 4), stride=(8, 1), offset=28 (start byte is 28) + return True + # At this point, we want to check if the 0th row of y + # overlaps with **some** row of x. + # We can check this by shifting y backward by the shared stride, repeatedly, + # until the first row of y is before the first row of x. + # Then we can check if these rows overlap. + # We can accomplish this by modding our offset by the stride. 
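+            # Hedged worked example (assumption), reusing the strides from the example below:
+            # with x.stride(0) == 9, x.size(1) == 4, y.size(1) == 4 and offset_delta == 22:
+            #     offset_delta_mod = 22 % 9 = 4
+            # so y's row 0 starts at column 4 of some row of x, and 4 + 4 <= 9 means it also
+            # ends before the next row of x begins: definitely no overlap.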
+ offset_delta_mod = offset_delta % x.stride(0) + # Example: + # 0 1 2 3 + # 9 10 11 12 + # 18 19 20 21 + # 27 28 29 30 + # x: size=(4, 4), stride=(9, 1), offset=0 + # y: size=(4, 4), stride=(9, 1), offset=22 (this would not overlap) + # y: size=(4, 4), stride=(9, 1), offset=23 (this would not overlap) + # y: size=(4, 4), stride=(9, 1), offset=24 (this would overlap) + # y: size=(4, 4), stride=(9, 1), offset=25 (this would overlap) + # If the interval [modded_offset, modded_offset + x_size] falls entirely + # without + if offset_delta_mod + y.size(1) <= x.stride(0): + return True + else: + return False + return False + + +def compute_overlapping_inputs(fwd_inputs, aliased_input_indices): + actual_aliased_indices = set() + for j in range(len(aliased_input_indices)): + for i in range(j): + i_ = aliased_input_indices[i] + j_ = aliased_input_indices[j] + if not _tensors_definitely_do_not_overlap(fwd_inputs[i_], fwd_inputs[j_]): + actual_aliased_indices.add(i_) + actual_aliased_indices.add(j_) + return actual_aliased_indices + + +def _graph_input_names(gm): + return [node.name for node in gm.graph.nodes if node.op == "placeholder"] + + +def _graph_output_names(gm): + output_node = next(iter(reversed(gm.graph.nodes))) + assert output_node.op == "output" and len(output_node.args) == 1 + return_args = output_node.args[0] + return [getattr(return_arg, "name", None) for return_arg in return_args] + + +def create_graph_signature( + fx_g: torch.fx.GraphModule, + fw_metadata: ViewAndMutationMeta, + in_spec: pytree.TreeSpec, + out_spec: pytree.TreeSpec, + *, + user_args_flat: List[Tensor], + params_and_buffers_flat: List[Tensor], + param_names: List[str], + buffer_names: List[str], + trace_joint: bool, + num_user_fw_outs: Optional[int], + loss_index: Optional[int], +) -> GraphSignature: + # Retrieve graph input names + graph_input_names = _graph_input_names(fx_g) + # Retrieve graph output names + graph_output_names = _graph_output_names(fx_g) + + num_params_buffers = len(param_names) + len(buffer_names) + num_tokens = len(fw_metadata.tokens) + # We have enough restrictions on the graph (no de-duping, synthetic bases, etc), + # Such that # graph inps = # user inps + # params + # buffers + num_user_args = len(graph_input_names) - num_params_buffers - num_tokens + + if trace_joint: + assert num_user_fw_outs is not None + num_fw_outs = num_user_fw_outs + fw_metadata.num_mutated_inp_runtime_indices + backward_output_names = graph_output_names[num_fw_outs:] + + grad_index = itertools.count(0) + gradients_to_parameters = { + backward_output_names[next(grad_index)]: param_names[i] + for i, param in enumerate(params_and_buffers_flat) + if param.requires_grad + } + + gradients_to_user_inputs = { + backward_output_names[next(grad_index)]: graph_input_names[ + i + len(params_and_buffers_flat) + ] + for i, user_input in enumerate(user_args_flat) + if user_input.requires_grad + } + + assert len(gradients_to_parameters) + len(gradients_to_user_inputs) == len( + backward_output_names + ) + + # Check that we have fully accounted for all graph outputs + backward_signature = BackwardSignature( + gradients_to_parameters, + gradients_to_user_inputs, + graph_output_names[loss_index], + ) + else: + backward_signature = None + num_user_fw_outs = ( + len(graph_output_names) + - fw_metadata.num_mutated_inp_runtime_indices + - num_tokens + ) + + return GraphSignature.from_tracing_metadata( + in_spec=in_spec, + out_spec=out_spec, + graph_input_names=graph_input_names, + graph_output_names=graph_output_names, + 
view_mutation_metadata=fw_metadata, + named_parameters=param_names, + named_buffers=buffer_names, + num_user_inputs=num_user_args, + num_user_outputs=num_user_fw_outs, + loss_index=loss_index, + backward_signature=backward_signature, + ) diff --git a/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py new file mode 100644 index 0000000000000000000000000000000000000000..1044e7757ceb1860c7ce71910f5fc7d158551040 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py @@ -0,0 +1,936 @@ +""" +These are the runtime wrappers that are associated with JIT-compiling. + +This includes the forward-only and joint JIT runtime wrappers. + +This module depends heavily on the runtime wrapper building blocks defined +in `runtime_wrappers`. +""" + +import logging +from contextlib import nullcontext +from functools import wraps +from typing import Any, List, Optional + +import torch +import torch.utils.dlpack +from torch import Tensor +from torch._dynamo.utils import lazy_format_graph_code +from torch._guards import detect_fake_mode, tracing, TracingContext +from torch._logging import getArtifactLogger, trace_structured +from torch._prims_common import CUDARngStateHelper +from torch._subclasses import FakeTensor +from torch.fx.experimental._backward_state import BackwardState +from torch.fx.experimental.proxy_tensor import is_sym_node +from torch.fx.experimental.symbolic_shapes import fx_placeholder_vals +from .. import config +from .dispatch_and_compile_graph import ( + aot_dispatch_autograd_graph, + aot_dispatch_base_graph, +) +from .logging_utils import describe_input, format_guard_bug_msg, track_graph_compiling + +from .runtime_wrappers import ( + aot_dispatch_subclass_wrapper, + create_runtime_wrapper, + functionalized_rng_runtime_epilogue, +) +from .schemas import ( + AOTConfig, + MutationType, + OutputType, + SubclassMeta, + TensorAlias, + ViewAndMutationMeta, +) +from .subclass_utils import ( + compute_inner_mutated_inp_indices_from_subclass_meta, + unwrap_tensor_subclasses, + wrap_tensor_subclasses, +) + +from .utils import ( + _get_symint_hints, + call_func_at_runtime_with_args, + make_boxed_func, + normalize_as_list, + strict_zip, +) + +zip = strict_zip + +log = logging.getLogger(__name__) +aot_joint_log = getArtifactLogger(__name__, "aot_joint_graph") +aot_graphs_log = getArtifactLogger(__name__, "aot_graphs") + +aten = torch.ops.aten + + +def _compute_output_meta_with_inductor_strides(fw_module, fwd_output_strides): + out = [n.meta["val"] for n in (list(fw_module.graph.nodes)[-1].args[0])] + # will only be set for inductor + if not fwd_output_strides: + return out + with TracingContext.get().fake_mode.shape_env.suppress_guards(): + for i in range(len(out)): + if not isinstance(out[i], Tensor): + continue + if all(s1 == s2 for s1, s2 in zip(out[i].stride(), fwd_output_strides[i])): + continue + out[i] = out[i].as_strided(out[i].shape, fwd_output_strides[i]) + return out + + +def aot_dispatch_base( + flat_fn, + flat_args: List[Tensor], + aot_config: AOTConfig, + *, + fw_metadata: ViewAndMutationMeta, +): + fw_module, updated_flat_args, maybe_subclass_meta = aot_dispatch_base_graph( # type: ignore[misc] + flat_fn, flat_args, aot_config, fw_metadata=fw_metadata + ) + + disable_amp = torch._C._is_any_autocast_enabled() + context = torch._C._DisableAutocast if disable_amp else nullcontext + fakified_out = None + + with context(), 
track_graph_compiling(aot_config, "inference"): + compiler = ( + aot_config.inference_compiler + if aot_config.inference_compiler is not None + else aot_config.fw_compiler + ) + if config.functionalize_rng_ops: + # Add the seed and offset as example inputs to pass to the compiler + fake_mode = detect_fake_mode() + seed, offset = CUDARngStateHelper.get_torch_state_as_tuple(fake_mode) + updated_flat_args.extend([seed, offset]) + + if tracing_context := torch._guards.TracingContext.try_get(): + tracing_context.fw_metadata = ( + fw_metadata + if maybe_subclass_meta is None + else maybe_subclass_meta.fw_metadata + ) + + with TracingContext.report_output_strides() as fwd_output_strides: + compiled_fw = compiler(fw_module, updated_flat_args) + + # see note: [Returning Fake Tensors on First AOT Autograd Call] + if tracing_context and tracing_context.fakify_first_call: + fakified_out = _compute_output_meta_with_inductor_strides( + fw_module, fwd_output_strides + ) + + # However, create_runtime_wrapper does not expect the rng offsets in the + # output. So, we have to create another wrapper and take out the offset. As + # a result, we have to account for not boxed_call compilers as well. + if not hasattr(compiled_fw, "_boxed_call"): + compiled_fw = make_boxed_func(compiled_fw) + + # Create a wrapper to set up the rng functionalize bits + @wraps(compiled_fw) + def rng_functionalization_wrapper(args): + # see note: [Returning Fake Tensors on First AOT Autograd Call] + nonlocal fakified_out + if fakified_out is not None: + out = fakified_out + fakified_out = None + return out + + # args is a list because compiled_fw is boxed_call + if fw_metadata.is_rng_op_functionalized: + # Add the seed and offset to args + seed, offset = CUDARngStateHelper.get_torch_state_as_tuple() + args.extend([seed, offset]) + out = compiled_fw(args) + out = functionalized_rng_runtime_epilogue(fw_metadata, out) + return out + else: + return compiled_fw(args) + + if maybe_subclass_meta is not None: + compiled_fw_func = aot_dispatch_subclass_wrapper( + rng_functionalization_wrapper, + subclass_metas=fw_metadata.subclass_fw_graph_out_meta, + num_fw_outs_saved_for_bw=None, + ) + else: + compiled_fw_func = rng_functionalization_wrapper + + if not hasattr(compiled_fw_func, "_boxed_call"): + compiled_fw_func = make_boxed_func(compiled_fw_func) + + compiled_fn = create_runtime_wrapper( + compiled_fw_func, + runtime_metadata=fw_metadata, + indices_of_inps_to_detach=[], + trace_joint=False, + keep_input_mutations=aot_config.keep_inference_input_mutations, + disable_amp=disable_amp, + ) + + return compiled_fn + + +def aot_dispatch_autograd( + flat_fn, + flat_args: List[Any], + aot_config: AOTConfig, + *, + fw_metadata: ViewAndMutationMeta, +): + fw_metadata.deterministic = torch.are_deterministic_algorithms_enabled() + fx_g, joint_inputs, maybe_subclass_meta = aot_dispatch_autograd_graph( # type: ignore[misc] + flat_fn, flat_args, aot_config, fw_metadata=fw_metadata + ) + + # Copied from aot_dispatch_autograd_graph. 
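+    # Hedged note (assumption, mirroring the inference path above): if autocast is active, the
+    # casts were already traced into the joint graph, so the compiled forward/backward are run
+    # with autocast disabled to avoid double-casting, e.g.
+    #     context = torch._C._DisableAutocast if disable_amp else nullcontext
+    #     with context():
+    #         ...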
+ disable_amp = torch._C._is_any_autocast_enabled() + + if aot_config.enable_log: + aot_joint_log.info( + "%s", lazy_format_graph_code("Joint graph", fx_g, aot_config.aot_id) + ) + trace_structured( + "aot_joint_graph", + payload_fn=lambda: fx_g.print_readable(print_output=False), # type: ignore[union-attr] + ) + + fakify_first_call = False + fakified_out = None + + with torch.no_grad(): + inner_meta = ( + fw_metadata + if maybe_subclass_meta is None + else maybe_subclass_meta.fw_metadata + ) + with track_graph_compiling(aot_config, "joint"): + # See Note: [Partitioner handling for Subclasses, Part 1] + # See Note: [Recomputing subclass mutation handling] + mutated_inp_runtime_indices = ( + compute_inner_mutated_inp_indices_from_subclass_meta( + fw_metadata, inner_meta + ) + ) + num_mutated_inp_runtime_indices = len(mutated_inp_runtime_indices) + num_inner_fwd_outputs = ( + num_mutated_inp_runtime_indices + + inner_meta.num_outputs + + inner_meta.num_intermediate_bases + + inner_meta.num_outputs_rng_offset + + len( + fw_metadata.tokens + ) # See Note [Side-Effectful Tokens in AOTAutograd] + ) + fw_module, bw_module = aot_config.partition_fn( + fx_g, joint_inputs, num_fwd_outputs=num_inner_fwd_outputs + ) + + fw_outs = next(n for n in fw_module.graph.nodes if n.op == "output").args[0] + # we only need to bookkeep the symints that are saved for bw, not any symints + # the user forward might have returned in its own output + fw_outs_saved_for_bw = fw_outs[num_inner_fwd_outputs:] + num_fw_outs_saved_for_bw = len(fw_outs_saved_for_bw) + symint_outs_saved_for_bw = [ + n for n in fw_outs_saved_for_bw if is_sym_node(n) + ] + fw_metadata.num_symints_saved_for_bw = len(symint_outs_saved_for_bw) + inner_meta.num_symints_saved_for_bw = len(symint_outs_saved_for_bw) + _num_symints_saved_for_bw = len(symint_outs_saved_for_bw) + + # Note [Detaching inputs that never need gradients] + # See https://github.com/pytorch/pytorch/issues/97745 + # Suppose we have a function like this that we want to compile: + # + # def f(x, y): + # return torch.mul(x, y.detach()) + # + # What gradients should we compute for x and y? + # By default, AOTAutograd will compute a gradient for **every** input that requires gradients, + # and so we'll compute: + # x_grad_input = y + # y_grad_input = None + # Does this preserve the semantics of eager mode? + # Unfortunately, no. + # Doing the above will cause autograd to **continue** to backprop the autograd tape + # that was generated from constructing y. + # + # This is **different** from what would have happened in eager mode. + # In eager mode, if we backprop through the output of this function, autograd will only traverse + # the bit of the autograd tape corresponding to "x". + # In particular, if a user had previously backpropped through y's autograd tape, + # And then they try to backprop through the output of the above function, + # then we'll hit the dreaded "Trying to backward through the graph a second time" error. + # + # You might think: If autograd sees that a gradient is None, shouldn't it stop early, + # instead of continuing the backprop through the ancestors of that node in the graph? + # + # Autograd has two passes: + # (1) a first pass that traverses the autograd graph and figures out which nodes need to be executed + # (2) a second pass that actually goes ahead and executes each node when it becomes ready, + # propagating gradients + # By the time we're executing a node and we see that it produces a None, the set of nodes to execute + # is already locked-in. 
+ # + # The fix: instead, we can recognize statically that the graph we're compiling will never contribute + # gradients to y, and prevent autograd from trying to traverse y's autograd tape at all. + # We can do this by manually detach'ing y before sending it through the `CompiledFunction`. + # + # Note that this solution is not bulletproof. + # It's possible to construct a case where eager may or may not have tried to autograd through y, + # depending on the actual grad_outputs that were passed in during the backward. + # There is no easy fix for this: the simplest fix would be to run with `retain_graph=True`, + # allowing autograd to re-use the graph. + # + # An example of this case is: + # def f(x): + # return x.detach() * 2, x * 3 + # If we were to only backprop through outs[0] in eager, we would stop at the detach() and never send a grad through x. + # But the custom autograd function doesn't know that: it will materialize zero grads for x * 3 + # and we will end up with a zero grad at x. + # If we later backprop through the second output, this will also require backprop'ing through x. + # Meaning we'll need to use `retain_graph=True` to be able to backprop through x the second time. + _indices_of_inps_to_detach = [] + bw_outs = next(n for n in bw_module.graph.nodes if n.op == "output").args[0] + + # TODO: we should apply the below "detach inputs if their gradients are statically known to be None" + # optimization even if we have subclass inputs/outputs (we do not handle this today). + # Computing which of our inputs get None gradients is a bit more complicated, + # if any of our inputs are subclasses. Why? + # (a) we need to make sure that we call .detach() on the input subclasses, since autograd sees subclasses. + # (b) The grad_outputs that we AOT computed in our backward graph are the desugared dense tensors, + # so we need to figure out which subclass fw inputs they map to. + if maybe_subclass_meta is None: + assert ( + len(bw_outs) + == len(fw_metadata.input_info) + inner_meta.num_outputs_rng_offset + ) + for i, (bw_out) in enumerate(bw_outs): + if bw_out is None: + _indices_of_inps_to_detach.append(i) + + if aot_config.enable_log: + aot_graphs_log.info( + "%s", + lazy_format_graph_code("Forward graph", fw_module, aot_config.aot_id), + ) + aot_graphs_log.info( + "%s", + lazy_format_graph_code("Backward graph", bw_module, aot_config.aot_id), + ) + trace_structured( + "aot_forward_graph", + payload_fn=lambda: fw_module.print_readable(print_output=False), + ) + trace_structured( + "aot_backward_graph", + payload_fn=lambda: bw_module.print_readable(print_output=False), + ) + + with track_graph_compiling(aot_config, "forward"): + # flat_args at this point might still be subclasses- + # make sure to pass the unwrapped fake tensors into the compiler!
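+            # (Editor's note, not part of the upstream file: joint_inputs is assumed
+            # to be the (primals, tangents) pair produced alongside the joint graph,
+            # so joint_inputs[0] below is the list of forward inputs - already
+            # desugared into plain fake tensors - used as example inputs for fw_compiler.)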
+ adjusted_flat_args = joint_inputs[0] + if config.functionalize_rng_ops: + # Update example inputs for the fw_compiler + fake_mode = detect_fake_mode() + seed, offset = CUDARngStateHelper.get_torch_state_as_tuple(fake_mode) + adjusted_flat_args.extend([seed, offset]) + # We are not clearing flat_args here because + # 1) There is a check in the debug compiler at the end + # 2) It does not matter as these are fake tensors + + if tracing_context := torch._guards.TracingContext.try_get(): + tracing_context.fw_metadata = inner_meta + + with TracingContext.report_output_strides() as fwd_output_strides: + compiled_fw_func = aot_config.fw_compiler(fw_module, adjusted_flat_args) + if not hasattr(compiled_fw_func, "_boxed_call"): + compiled_fw_func = make_boxed_func(compiled_fw_func) + + # see note: [Returning Fake Tensors on First AOT Autograd Call] + if tracing_context and tracing_context.fakify_first_call: + fakified_out = _compute_output_meta_with_inductor_strides( + fw_module, fwd_output_strides + ) + fakify_first_call = True + + if maybe_subclass_meta is not None: + # Why do we need to pass in num_fw_outs_saved_for_bw? + # See Note: [Partitioner handling for Subclasses, Part 2] + compiled_fw_func = aot_dispatch_subclass_wrapper( + compiled_fw_func, + subclass_metas=fw_metadata.subclass_fw_graph_out_meta, + num_fw_outs_saved_for_bw=num_fw_outs_saved_for_bw, + ) + if not hasattr(compiled_fw_func, "_boxed_call"): + compiled_fw_func = make_boxed_func(compiled_fw_func) + + # NB: It's important to compile backwards ahead of time, as this may + # add extra guards which we need to apply to the Dynamo cache at + # forwards + with track_graph_compiling(aot_config, "backward"): + placeholder_list = fx_placeholder_vals(bw_module) + + forward_saved_for_backwards_strides = None + if fwd_output_strides is not None: + forward_saved_for_backwards_strides = fwd_output_strides[ + inner_meta.tensors_saved_for_backwards_slice + ] + + # saved activations can have different stride to eager if + # the compiler does layout optimization. We should restride the + # tensor passed in for compiling the backward graph using the + # saved tensor's stride. + for i in range(len(placeholder_list)): + ph_arg = placeholder_list[i] + if not isinstance(ph_arg, torch.Tensor): + continue + + if forward_saved_for_backwards_strides is None: + continue + + real_stride = None + # Per all_args calling convention + j = i - len(symint_outs_saved_for_bw) + if 0 <= j < len(forward_saved_for_backwards_strides): + real_stride = forward_saved_for_backwards_strides[j] + if real_stride is None: + continue + + # Comparing ph_arg.stride() with real_stride directly may + # cause dynamic dimensions in ph_arg being specialized to static + # value. Using the hints to avoid that. + if _get_symint_hints(ph_arg.stride()) != real_stride: + # Note that here we use the stride of the real tensor to + # restride a FakeTensor. This does not cause trouble + # for dynamic shape since this code path only get + # executed if layout optimization is enabled. And we + # disable layout optimization for dynamic shape right + # now. + # + # A solution that decide stride order based on real + # tensor's stride and then apply that stride order to + # the FakeTensor does not work smoothly since some + # tensor's layout is not 'dense'. E.g. mixnet_l has a + # tensor with size [8, 64, 112, 112] and strides + # (2408448, 1, 21504, 192). The solution mentioned will + # decide a stride of (802816, 1, 7168, 64) for this + # tensor which is wrong. 
+ placeholder_list[i] = ph_arg.as_strided(ph_arg.size(), real_stride) + + compiled_bw_func = None + if len(symint_outs_saved_for_bw): + context = torch._C._DisableAutocast if disable_amp else nullcontext + with context(): + try: + compiled_bw_func = aot_config.bw_compiler( + bw_module, placeholder_list + ) + except Exception: + log.warning( + "failed to eagerly compile backwards for dynamic, suppressing in case backwards not needed", + exc_info=True, + ) + # Compiled autograd will run the bw_module in the backward pass, + # so recompilation needs to happen anyway if the backward pass is ever + # called. + # + # We do the GraphModule recompilation here because the lazy + # recompilation will cause issues in the backward pass + # with compiled autograd. + # + # Do the _LazyGraphModule.force_recompile here rather than when + # bw_module is first generated by the partitioner, because bw_module.recompile + # may be called in some code path later and cause _LazyGraphModule.forward + # to become the lazy version again. One example is when dynamic shapes are enabled + # upfront: the bw_compiler will be called above, which can cause extra + # graph module recompilation on bw_module. + if torch._dynamo.compiled_autograd.compiled_autograd_enabled_count: + from torch.fx._lazy_graph_module import _LazyGraphModule + + _LazyGraphModule.force_recompile(bw_module) + + saved_context = TracingContext.try_get() + + backward_state_indices = [ + idx for idx, x in enumerate(flat_args) if isinstance(x, BackwardState) + ] + assert len(backward_state_indices) <= 1 + + class CompiledFunction(torch.autograd.Function): + compiled_fw = compiled_fw_func + compiled_bw = compiled_bw_func + metadata: ViewAndMutationMeta = fw_metadata # type: ignore[assignment] + maybe_subclass_metadata: Optional[SubclassMeta] = maybe_subclass_meta + num_symints_saved_for_bw = _num_symints_saved_for_bw + _compiled_autograd_should_lift = False + _fakify_first_call = fakify_first_call + + @staticmethod + def _compiled_autograd_key(ctx): + return (ctx._autograd_function_id, *ctx.symints) + + @staticmethod + def forward(ctx, *deduped_flat_tensor_args): + args = deduped_flat_tensor_args + if backward_state_indices: + bw_state = args[backward_state_indices[0]] + assert isinstance(bw_state, BackwardState) + ctx._compiled_autograd_backward_state = bw_state + + marked_dirty_inps = [] + for i in fw_metadata.mutated_graph_handled_indices_seen_by_autograd: + arg = deduped_flat_tensor_args[i] + if not (arg.requires_grad and arg.is_leaf): # would error + ctx.mark_dirty(arg) + marked_dirty_inps.append(arg) + + if not CompiledFunction._fakify_first_call: + if CompiledFunction.metadata.is_rng_op_functionalized: + # Add the seed and offset to args + seed, offset = CUDARngStateHelper.get_torch_state_as_tuple() + args = (*args, seed, offset) + # There is a pretty complicated calling convention around what the compiled fw returns.
+ # The full list of outputs and their relative order is: + # (*tokens, *mutated_inputs, *fw_outs, *fw_intermediate_bases, *saved_tensors, *saved_symints) + # - Note that in the synthetic bases case, mutated_inputs will correspond to an updated version + # of the original view, and not the synthetic base + + fw_outs = call_func_at_runtime_with_args( + CompiledFunction.compiled_fw, + args, + disable_amp=disable_amp, + ) + else: + nonlocal fakified_out + assert fakified_out is not None + CompiledFunction._fakify_first_call = False + fw_outs = fakified_out + fakified_out = None + + num_outputs = CompiledFunction.metadata.num_outputs + num_outputs_aliased = CompiledFunction.metadata.num_outputs_aliased + num_mutated_runtime_inps = ( + CompiledFunction.metadata.num_mutated_inp_runtime_indices + ) + num_tokens = len(CompiledFunction.metadata.tokens) + num_forward_returns = CompiledFunction.metadata.num_forward_returns + num_forward = CompiledFunction.metadata.num_forward + + # Partitioners must put symint arguments at the end separate from tensor arguments + tensors_saved_for_backwards = fw_outs[ + CompiledFunction.metadata.tensors_saved_for_backwards_slice + ] + assert all(isinstance(x, torch.Tensor) for x in tensors_saved_for_backwards) + # See Note [Detaching saved tensors in AOTAutograd] + ctx.save_for_backward( + *( + x.detach() if x._is_view() else x + for x in tensors_saved_for_backwards + ) + ) + symint_outs = fw_outs[ + CompiledFunction.metadata.symints_saved_for_backwards_slice + ] + assert all( + isinstance(x, (int, float, torch.SymInt, torch.SymFloat)) + for x in symint_outs + ), str([type(x) for x in symint_outs]) + ctx.symints = symint_outs + + raw_returns = fw_outs[0 : num_forward_returns + num_tokens] + + # Wrap all autograd.Function.forward() outputs that are aliases + # so that autograd.Function doesn't treat them as tensors + if num_mutated_runtime_inps > 0: + for i, idx in enumerate( + CompiledFunction.metadata.mutated_inp_runtime_indices + ): + # We could make this faster by only looping over inputs with metadata-only mutations + # (instead of looping over inputs with either data or metadata mutations), but there shouldn't be many. + info = CompiledFunction.metadata.input_info[idx] + if info.mutates_metadata and not info.mutates_data: + raw_returns[i] = TensorAlias(raw_returns[i]) + + if config.debug_assert: + user_mutated_inputs_raw = raw_returns[0:num_mutated_runtime_inps] + mut_inp_infos = [ + x + for x in CompiledFunction.metadata.input_info + if x.mutates_data or x.mutates_metadata + ] + assert len(user_mutated_inputs_raw) == len(mut_inp_infos) + + if CompiledFunction.metadata.num_unsafe_view_outputs > 0: + for idx in CompiledFunction.metadata.unsafe_view_out_indices: + raw_return_idx = num_mutated_runtime_inps + idx + o = raw_returns[raw_return_idx] + raw_returns[raw_return_idx] = torch.ops.aten._unsafe_view( + o, o.shape + ) + + if num_outputs_aliased > 0: + for idx in CompiledFunction.metadata.aliased_out_indices: + raw_return_idx = num_mutated_runtime_inps + idx + raw_returns[raw_return_idx] = TensorAlias( + raw_returns[raw_return_idx] + ) + + if config.debug_assert: + intermediates_raw = raw_returns[ + num_mutated_runtime_inps + num_outputs : + ] + assert not any( + isinstance(x, TensorAlias) for x in intermediates_raw + ) + + # invariant: intermediate bases always require gradients, so we don't have to + # consider marking them as non-differentiable. 
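+            # (Editor's note, not part of the upstream file: ignoring effect tokens,
+            # raw_returns at this point is laid out as
+            #   [*mutated_inputs, *user_outputs, *intermediate_bases],
+            # which is why the slice below drops the trailing intermediate bases before
+            # deciding which returns to mark as non-differentiable.)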
+ raw_returns_not_including_intermediate_bases = raw_returns[ + : num_mutated_runtime_inps + num_outputs + ] + raw_returns_meta = [ + x + for x in CompiledFunction.metadata.input_info + if x.mutation_type == MutationType.MUTATED_OUT_GRAPH + ] + CompiledFunction.metadata.output_info + + fw_outs_not_requiring_grad = [ + x + for (i, x) in enumerate(raw_returns_not_including_intermediate_bases) + if isinstance(x, torch.Tensor) and not raw_returns_meta[i].requires_grad + ] + ctx.mark_non_differentiable(*fw_outs_not_requiring_grad) + ctx._materialize_non_diff_grads = False + + functionalized_rng_runtime_epilogue( + CompiledFunction.metadata, + fw_outs[num_forward_returns:num_forward], + return_new_outs=False, + ) + return tuple(raw_returns) + tuple(marked_dirty_inps) + + @staticmethod + def backward(ctx, *flat_args): + # Calling convention: we expect a grad_out passed to the backward: + # - for every output of the fw that does *not* alias an input or graph intermediate + # - for every updated_input generated by the fw that does *not* alias an input (aka only data-mutations) + # - for every graph intermediate that we need to use to generate an output later. + # The other outputs in the autograd.Function.forward that do *not* show up in the backward include: + # - outputs that alias inputs or graph intermediates + # - updated inputs due to metadata-only mutations. + # We need to return them in the forward, but ensure that they all do not get gradients in the backward, + # and we filter them out here before passing the remaining grad_outputs into the compiled backward. + num_intermediate_bases = CompiledFunction.metadata.num_intermediate_bases + num_graph_handled_inputs = ( + CompiledFunction.metadata.num_mutated_graph_handled_indices_seen_by_autograd + ) + num_mutated_runtime_inps = ( + CompiledFunction.metadata.num_mutated_inp_runtime_indices + ) + expected_grad_outs = ( + CompiledFunction.metadata.num_outputs + + num_mutated_runtime_inps + + num_intermediate_bases + ) + deterministic = CompiledFunction.metadata.deterministic + global_deterministic = torch.are_deterministic_algorithms_enabled() + if deterministic is not None: + torch._check( + not (not deterministic and global_deterministic), + lambda: ( + "This compiled backward function is being run with " + "torch.use_deterministic_algorithms(True), " + "but it was previously generated during the forward function while " + "torch.use_deterministic_algorithms(False) was set." 
+ ), + ) + + if num_graph_handled_inputs > 0: + flat_args = flat_args[:-num_graph_handled_inputs] + assert len(flat_args) == expected_grad_outs + out_info = CompiledFunction.metadata.output_info + + inp_tangents, out_tangents, intermediate_base_tangents = ( + flat_args[0:num_mutated_runtime_inps], + flat_args[ + num_mutated_runtime_inps : num_mutated_runtime_inps + + CompiledFunction.metadata.num_outputs + ], + flat_args[ + num_mutated_runtime_inps + CompiledFunction.metadata.num_outputs : + ], + ) + # input_info contains info on *every* input, + # but in the backward() we are only given grad outputs for every mutated input. + # We then need to filter out the grad outputs that correspond to metadata-only mutations or don't require grad + input_info = CompiledFunction.metadata.input_info + inp_tangents_filtered = [ + x + for x, info_idx in zip( + inp_tangents, CompiledFunction.metadata.mutated_inp_runtime_indices + ) + if input_info[info_idx].mutates_data + and input_info[info_idx].requires_grad + ] + # We also need to filter out grad outputs that correspond to outputs aliasing inputs/intermediates + out_tangents_filtered = [ + x + for x, info in zip(out_tangents, out_info) + if info.output_type + in [ + OutputType.non_alias, + OutputType.unsafe_view_alias, + OutputType.custom_function_view, + ] + and issubclass(info.raw_type, torch.Tensor) + and info.requires_grad + ] + # intermediate bases always require gradients, and always participate in the backward graph. + flat_bw_args_with_grads = [ + *inp_tangents_filtered, + *out_tangents_filtered, + *intermediate_base_tangents, + ] + num_flat_bw_args_with_grads = len(flat_bw_args_with_grads) + + # sanity asserts + # metadata_only_inps = [ + # x for x, info_idx in zip(inp_tangents, mutated_inp_indices) + # if not input_info[info_idx].mutates_data + # ] + # aliased_outputs = [ + # x for x, info in zip(out_tangents, out_info) if info.output_type != OutputType.non_alias] + # assert all(x is None for x in metadata_only_inps) + # assert all(x is None for x in aliased_outputs) + + rng_args = [] + if CompiledFunction.metadata.is_rng_op_functionalized: + # Add the seed and offset to args + rng_args = CUDARngStateHelper.get_torch_state_as_tuple() + + all_args = [ + *ctx.symints, + *ctx.saved_tensors, + *flat_bw_args_with_grads, + *rng_args, + ] + del flat_bw_args_with_grads + + tangents_start_idx = ( + len(all_args) - num_flat_bw_args_with_grads - len(rng_args) + ) + tangents_end_idx = len(all_args) - len(rng_args) + + # Note: [AOTAutograd Backward Guards] + # During AOTDispatch, we eagerly create and trace out a joint fw-bw graph. + # Doing so requires us to "guess" about some of the metadata of our grad_outputs. + # + # In particular: if an output to the forward is a plain tensor or a subclass, + # its corresponding grad_output in the backward **may or may not** be + # a plain tensor or a subclass. The main cases are: + # (1) If an output is a plain tensor, its grad_out will also be a plain tensor, + # *unless* the output is used in some subclass compute later in the forward graph, + # which will cause its grad_output to become a subclass + # (2) If an output is a subclass, its grad_out will also be a subclass, + # *unless* the output of the forward did not actually participate in the gradient computation, + # in which case autograd will insert a plain tensor of zeros for the grad_output. + # We could avoid this case with `torch.autograd.Function.set_materialize_grads`, + # although this is not turned on today in AOTAutograd and would require more work.
+ # + # Today, we make a guess on subclass-ness based on the above examples, + # and hard-error in the backward if we guessed wrong. + # + # In the future, we should add backward guards that would allow us to + # properly handle this case instead of erroring: we would need to retrace the backward graph, + # since we might produce an entirely different trace if our grad_outputs are subclass or not. + assert ( + len(CompiledFunction.metadata.output_types) + == num_flat_bw_args_with_grads + ) + grad_output_types = [ + type(x) for x in all_args[-num_flat_bw_args_with_grads:] + ] + # In general, we can add more asserts/guards here for when we partitioned + # with incorrect assumptions about the grad_outputs. + # Normalize FakeTensor -> torch.Tensor + # - during tracing our types are FakeTensor + # - at runtime in the backward our types are torch.Tensor... + # - unless we're running compiled backward, in which case they are also FakeTensor + grad_output_types_ = [ + torch.Tensor if x is FakeTensor else x for x in grad_output_types + ] + assert ( + grad_output_types_ == CompiledFunction.metadata.output_types + ), f"""\ +We incorrectly attempted to compile the backward with incorrect subclass metadata. +If you run into this error, please file an issue. +Expected grad_output types: {str(CompiledFunction.metadata.output_types)} +Got grad_output types: {str(grad_output_types)}""" + + # TODO: figure out how to refactor the backward properly so I can use aot_dispatch_subclass_wrapper() here. + if CompiledFunction.maybe_subclass_metadata is not None: + # Get the number of tangents after unwrapping + len_tangents = len( + unwrap_tensor_subclasses( + all_args[tangents_start_idx:tangents_end_idx], + is_joint_structure=False, + ) + ) + all_args = unwrap_tensor_subclasses(all_args, is_joint_structure=False) + tangents_start_idx = len(all_args) - len_tangents - len(rng_args) + tangents_end_idx = tangents_start_idx + len_tangents + + # Make the tangents contiguous. 
Note that we must do this after subclass desugaring + # because inputs to inductor have to be contiguous + all_args = [ + t.contiguous() + if ( + (tangents_start_idx <= i < tangents_end_idx) + and (not t.is_contiguous()) + ) + else t + for i, t in enumerate(all_args) + ] + + def call_compiled_backward(): + if ctx._is_compiled_autograd_tracing(): + # For compiled autograd, run raw FX graph so that it can be inlined into the larger graph + symints = ctx._get_compiled_autograd_symints() + assert len(symints) == len(ctx.symints) + all_args[: len(symints)] = symints + if backward_state_indices: + assert ctx._compiled_autograd_backward_state.proxy is not None + all_args.append(ctx._compiled_autograd_backward_state) + context = torch._C._DisableAutocast if disable_amp else nullcontext + with context(): + out = normalize_as_list(bw_module(*all_args)) + out = functionalized_rng_runtime_epilogue( + CompiledFunction.metadata, out + ) + return tuple(out) + assert ( + not backward_state_indices + ), "BackwardState requires CompiledAutograd" + ctx.maybe_clear_saved_tensors() + if CompiledFunction.compiled_bw is None: + context = torch._C._DisableAutocast if disable_amp else nullcontext + with tracing(saved_context), context(), track_graph_compiling( + aot_config, "backward" + ): + CompiledFunction.compiled_bw = aot_config.bw_compiler( + bw_module, placeholder_list + ) + + out = call_func_at_runtime_with_args( + CompiledFunction.compiled_bw, + all_args, + steal_args=True, + disable_amp=disable_amp, + ) + + out = functionalized_rng_runtime_epilogue( + CompiledFunction.metadata, out + ) + return tuple(out) + + if torch.is_grad_enabled() and any( + t.requires_grad for t in all_args if isinstance(t, torch.Tensor) + ): + # Ensure that the graph is connected, and error if double backward is performed. + # See comment for why once_differentiable is not sufficient: + # https://github.com/pytorch/pytorch/pull/92348/files#r1072962107 + class CompiledFunctionBackward(torch.autograd.Function): + # CompiledFunctionBackward is not yet supported in dynamo skipfiles + _compiled_autograd_should_lift = False + + @staticmethod + def forward(ctx, *unused_args): + outs = call_compiled_backward() + # TODO: figure out how to refactor the backward properly so I can use aot_dispatch_subclass_wrapper() here. + if CompiledFunction.maybe_subclass_metadata is not None: + assert ( + CompiledFunction.maybe_subclass_metadata.grad_input_metas + is not None + ) + outs_wrapped = wrap_tensor_subclasses( + outs, + subclass_metas=CompiledFunction.maybe_subclass_metadata.grad_input_metas, + ) + return outs_wrapped + return outs + + @staticmethod + def backward(ctx, *args): + raise RuntimeError( + "torch.compile with aot_autograd does not currently support double backward" + ) + + CompiledFunctionBackward._compiled_autograd_key = ( # type: ignore[method-assign] + CompiledFunction._compiled_autograd_key + ) + + # Pass args even though they're unused, so that the graph is built + out = CompiledFunctionBackward.apply(*all_args) + else: + out = call_compiled_backward() + + # TODO: figure out how to refactor the backward properly so I can use aot_dispatch_subclass_wrapper() here. 
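+                # (Editor's note, not part of the upstream file: the block below mirrors what
+                # aot_dispatch_subclass_wrapper does on the forward path - the compiled backward
+                # produces plain dense tensors, and grad_input_metas describes how to re-wrap
+                # them into the tensor-subclass structure expected by the caller.)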
+ if CompiledFunction.maybe_subclass_metadata is not None: + assert ( + CompiledFunction.maybe_subclass_metadata.grad_input_metas + is not None + ) + outs_wrapped = wrap_tensor_subclasses( + out, + subclass_metas=CompiledFunction.maybe_subclass_metadata.grad_input_metas, + ) + return outs_wrapped + return out + + compiled_function = create_runtime_wrapper( + CompiledFunction.apply, + runtime_metadata=fw_metadata, + indices_of_inps_to_detach=_indices_of_inps_to_detach, + trace_joint=True, + keep_input_mutations=aot_config.keep_inference_input_mutations, + disable_amp=disable_amp, + ) + + if not config.debug_assert: + return compiled_function + + flat_requires_grad = [ + a.requires_grad if isinstance(a, Tensor) else None for a in flat_args + ] + + @wraps(compiled_function) + def debug_compiled_function(*args): + # TODO: Check aliasing relationships + # TODO: Check strides for metadata mutation + # (NB: ideally, this logic is factored out of this function and + # you move these debug checks there) + + # Check requires grad. Bad case is when we compiled with + # requires_grad = False, but input requires_grad = True + # (vice versa is OK; we compute a gradient and then throw + # it away when it hits the input.) + for i, a in enumerate(args): + can_require_grad = flat_requires_grad[i] + if can_require_grad is None: + assert not isinstance(a, Tensor) + elif not can_require_grad: + assert not a.requires_grad, format_guard_bug_msg( + aot_config, + f"{describe_input(i, aot_config)} would not require grad", + ) + + return compiled_function(*args) + + return debug_compiled_function diff --git a/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/logging_utils.py b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/logging_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..01dfe5f031611bdf1c7a9737d1226a858ba27832 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/logging_utils.py @@ -0,0 +1,135 @@ +""" +Contains utils for logging in AOTAutograd, including managing the names of the graphs under +compilation, capturing user-friendly tracebacks, and debug messages. +""" + +import collections +from contextlib import contextmanager +from typing import List, Tuple + +import torch +import torch.fx.traceback as fx_traceback + +# This is a list since looking forward, we can have this arbitrarily nested. +graph_being_compiled: List[str] = [] +# TODO: It would be nice to reset the numbering every time aot_id goes +# up, but this is annoying to do right now (because we don't know if +# an aot_id will come back from the dead), so right now this also happens +# to be a globally unique number too (at the cost of wobbling if you change +# how the graphs compile) +nth_graph: int = 0 +model_name: str = "model" + + +def set_model_name(name): + global model_name + model_name = name + + +def get_aot_compilation_context() -> Tuple[List[str], str, int]: + return list(graph_being_compiled), model_name, nth_graph + + +def get_aot_graph_name() -> str: + """ + Returns the name of the graph being compiled. 
+ """ + global model_name, graph_being_compiled, nth_graph + return f"{model_name}__{'_'.join(graph_being_compiled)}_{nth_graph}" + + +get_graph_being_compiled = get_aot_graph_name + + +@contextmanager +def track_graph_compiling(aot_config, graph_name): + global graph_being_compiled + # TODO: Don't shove the aot_id in here; set it in the context + graph_being_compiled = [f"{aot_config.aot_id}_{graph_name}"] + try: + yield + finally: + global nth_graph + nth_graph += 1 + graph_being_compiled = [] + + +# Set up hooks so that during backward the fx's stack_trace is properly set +callback_set = False + + +def setup_stacktrace_preservation_hooks(roots: List): + def iter_graph(roots): + if not roots: + return + seen = set() + q = collections.deque() # type: ignore[var-annotated] + for node in roots: + if node is not None and node not in seen: + seen.add(node) + q.append(node) + + while q: + node = q.popleft() + for fn, _idx in node.next_functions: + if fn in seen or fn is None: + continue + seen.add(fn) + q.append(fn) + + yield node + + def get_callback(saved_stack_): + def callback(): + global callback_set + fx_traceback.set_stack_trace(saved_stack_) + callback_set = False + + return callback + + def get_prehook(stack_, seq_nr): + def prehook(grad_output): + global callback_set + + if not callback_set: + torch.autograd.variable.Variable._execution_engine.queue_callback( # type: ignore[attr-defined] + get_callback(fx_traceback.format_stack()) + ) + callback_set = True + + fx_traceback.set_stack_trace(stack_) + fx_traceback.set_grad_fn_seq_nr(seq_nr) + + return prehook + + def get_posthook(special_stack_, seq_nr): + def posthook(grad_input, grad_output): + fx_traceback.set_stack_trace(special_stack_) + fx_traceback.reset_grad_fn_seq_nr() + + return posthook + + for node in iter_graph(roots): + forward_node_stack = node.metadata.get("traceback_", []) + node.register_prehook(get_prehook(forward_node_stack, node._sequence_nr())) + + special_stack = forward_node_stack.copy() + special_stack.append( + "Gradient addition node due to multiple use of tensor around:" + ) + node.register_hook(get_posthook(special_stack, node._sequence_nr())) + + +def describe_input(i, aot_config): + if i < aot_config.num_params_buffers: + return f"parameter/buffer {i}" + else: + return f"input {i - aot_config.num_params_buffers}" + + +def format_guard_bug_msg(aot_config, expected): + return ( + f"At compilation time, graph {aot_config.aot_id} was compiled under the " + f"assumption that {expected}, but at runtime this was not the case. " + "This indicates a guard bug in AOTAutograd or Dynamo, please file a bug to PyTorch." + ) diff --git a/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py new file mode 100644 index 0000000000000000000000000000000000000000..84e8828661c87c739c4115090f062d61bf39f697 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/runtime_wrappers.py @@ -0,0 +1,1021 @@ +""" +This module defines runtime wrappers, which, based on previous analysis attempts to: +1. process the inputs and outputs +2. apply mutations +3. handle functionalized randomness +4. 
deduplicate inputs and consolidate views into their bases (see input_output_analysis) +""" + +import collections +import pprint +from functools import wraps +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import torch +import torch.utils.dlpack +from torch import Tensor +from torch._guards import DuplicateInputs, TracingContext +from torch._prims_common import CUDARngStateHelper +from torch.multiprocessing.reductions import StorageWeakRef +from .. import config +from .collect_metadata_analysis import run_functionalized_fw_and_collect_metadata + +from .functional_utils import gen_alias_from_base +from .input_output_analysis import ( + compute_overlapping_inputs, + create_synthetic_base_metadata, + remove_dupe_metadata, +) +from .logging_utils import describe_input, format_guard_bug_msg +from .schemas import ( + AOTConfig, + InputAliasInfo, + OutputType, + SubclassCreationMeta, + TensorAlias, + ViewAndMutationMeta, +) +from .subclass_utils import ( + requires_subclass_dispatch, + unwrap_tensor_subclasses, + wrap_tensor_subclasses, +) + +from .utils import ( + call_func_at_runtime_with_args, + make_boxed_func, + partial_flatten_asdict, + strict_zip, +) + + +zip = strict_zip + + +# The wrapper created by this function handles all of the runtime aliasing and mutation "epilogue" logic +# that needs to run after the compiled function. +# +# This function accepts a trace_joint flag, indicating whether or not we're generating the runtime +# epilogue for a forward-only inference graph, or for an autograd.Function.apply function. +# This is because there are some minor differences in how we treat these cases at runtime: +# - resize_() is currently handled in the inference case, but not fully handled in the autograd case. +# - the autograd cases inserts TensorAlias wrapper objects for outputs that alias inputs +def create_runtime_wrapper( + compiled_fn, + *, + runtime_metadata: ViewAndMutationMeta, + indices_of_inps_to_detach: List[int], + trace_joint: bool, + keep_input_mutations: bool, + disable_amp: bool, +): + num_tokens = len(runtime_metadata.tokens) + + if not hasattr(compiled_fn, "_boxed_call"): + compiled_fn = make_boxed_func(compiled_fn) + + def runtime_wrapper(*args): + # Pass in effect tokens (See Note [Side-Effectful Tokens in AOTAutograd]) + args = (*[torch.tensor([])] * num_tokens, *args) + + if trace_joint: + args_ = list(args) + # See Note [Detaching inputs that never need gradients] + for idx in indices_of_inps_to_detach: + if isinstance(args_[idx], torch.Tensor): + args_[idx] = args_[idx].detach() + with torch.autograd._force_original_view_tracking(True): + all_outs = call_func_at_runtime_with_args( + compiled_fn, + args_, + disable_amp=disable_amp, + ) + else: + # When we have an inference graph, we run with torch.no_grad. 
+ # It's possible to get an inference graph with inputs that require grad, + # in which case we want to make sure autograd is disabled + # (since e.g., inductor will generate aten.addmm.out calls which autograd will complain about) + if torch.is_grad_enabled(): + with torch.no_grad(): + all_outs = call_func_at_runtime_with_args( + compiled_fn, + args, + disable_amp=disable_amp, + ) + else: + all_outs = call_func_at_runtime_with_args( + compiled_fn, + args, + disable_amp=disable_amp, + ) + + num_mutated_runtime_inps = runtime_metadata.num_mutated_inp_runtime_indices + num_intermediate_bases = runtime_metadata.num_intermediate_bases + + if keep_input_mutations and trace_joint: + num_input_mutations_handled_by_autograd = ( + runtime_metadata.num_mutated_graph_handled_indices_seen_by_autograd + ) + # autograd.Function requires us to return the mutated inputs as extra outputs to the autograd.Function.forward + if num_input_mutations_handled_by_autograd > 0: + all_outs = all_outs[:-num_input_mutations_handled_by_autograd] + + assert ( + len(all_outs) + == num_mutated_runtime_inps + + runtime_metadata.num_outputs + + num_intermediate_bases + + num_tokens + ) + + # Toss out the effect tokens (See Note [Side-Effectful Tokens in AOTAutograd]) + all_outs = all_outs[num_tokens:] + + # Step 3: After running the compiled fw, apply updates to mutated inputs + num_mutations_to_apply = runtime_metadata.num_mutated_inp_runtime_indices + if num_mutations_to_apply > 0: + updated_inputs = all_outs[:num_mutations_to_apply] + fw_outs = all_outs[num_mutations_to_apply:] + + for i, inpt_idx in enumerate(runtime_metadata.mutated_inp_runtime_indices): + meta = runtime_metadata.input_info[inpt_idx] + if not meta.mutates_data and not meta.mutates_metadata: + continue + original_inpt = args[inpt_idx] + updated_inpt = updated_inputs[i] + if meta.mutates_storage_metadata: + # mutates_storage_metadata means our input saw an x.set_(y) call. + # What if x **also** saw a data and/or a metadata mutation? + # (1) If the [meta]data mutation occurred after the set_(), + # then there is no need to copy_() the data. + # When we perform x.set_(x_updated), we are guaranteed that + # x_updated already has the final version of the data/metadata + # (2) If a data mutation occurred before the set_(), + # this case seems very difficult to support. + # TODO: discuss on the PR and decide if we want to try to + # either support it, or detect and ban it. + if trace_joint: + assert isinstance(updated_inpt, TensorAlias) + updated_inpt = updated_inpt.alias + with torch.no_grad(): + original_inpt.set_(updated_inpt) + continue + if meta.mutates_metadata and not meta.mutates_data: + if trace_joint: + assert isinstance(updated_inpt, TensorAlias) + updated_inpt = updated_inpt.alias + # We need to grab the size/stride/storage_offset from the compiled forward, + # and use that to mutate the metadata of the input + original_inpt.as_strided_( + updated_inpt.size(), + updated_inpt.stride(), + updated_inpt.storage_offset(), + ) + else: + if meta.mutates_data and meta.mutates_metadata: + original_inpt.as_strided_( + updated_inpt.size(), + updated_inpt.stride(), + updated_inpt.storage_offset(), + ) + else: + assert meta.mutates_data + if meta.is_leaf and original_inpt.requires_grad: + # We can hit this situation in this case: + # def f(x): + # x.detach().mul_(2) + # return x + 1 + # AOTAutograd will see a mutation in the above case, and try to + # apply a copy_() here, in the epilogue.
+ # But if x required gradients, and is a leaf, then autograd + # will yell at us for trying to mutate it. + # However, it's only possible to end up in this scenario (like the above) + # if all of the mutations to the leaf input were non-autograd-tracking mutations + # (aka mutations under no_grad(), or on detached views). + # In that case, we fully want to hide the mutation from autograd, so detaching is ok. + original_inpt.detach().copy_(updated_inpt) + else: + original_inpt.copy_(updated_inpt) + else: + fw_outs = all_outs + + # Step 4: Manually regenerate any outputs that are aliased to inputs, instead of + # compiling them. + if runtime_metadata.num_outputs_aliased > 0: + # The compiled forward also returned intermediate bases. We don't want to return them to the user. + if runtime_metadata.num_intermediate_bases > 0: + fw_outs_no_intermediate_bases = fw_outs[ + : -runtime_metadata.num_intermediate_bases + ] + intermediate_bases = fw_outs[-runtime_metadata.num_intermediate_bases :] + else: + fw_outs_no_intermediate_bases = fw_outs + intermediate_bases = [] + + assert len(fw_outs_no_intermediate_bases) == len( + runtime_metadata.output_info + ) + fw_outs_including_aliases = [] + for i, (o, info) in enumerate( + zip(fw_outs_no_intermediate_bases, runtime_metadata.output_info) + ): + if info.output_type in [ + OutputType.non_alias, + OutputType.unsafe_view_alias, + OutputType.custom_function_view, + ]: + fw_outs_including_aliases.append(o) + continue + if trace_joint: + assert isinstance(o, TensorAlias) + o_ = o.alias + else: + o_ = o + + o_grad = runtime_metadata.output_info[i].requires_grad + if info.output_type == OutputType.alias_of_input: + aliased_base_tensor = args[info.base_idx] # type: ignore[index] + regenerated_out = gen_alias_from_base( + aliased_base_tensor, o_, o_grad + ) + fw_outs_including_aliases.append(regenerated_out) + continue + elif info.output_type == OutputType.is_input: + aliased_base_tensor = args[info.base_idx] # type: ignore[index] + regenerated_out = aliased_base_tensor + fw_outs_including_aliases.append(regenerated_out) + continue + elif info.output_type == OutputType.alias_of_intermediate: + base_tensor_list = intermediate_bases + elif ( + info.output_type == OutputType.alias_of_intermediate_save_as_output + ): + base_tensor_list = intermediate_bases + else: + assert ( + info.output_type + == OutputType.alias_of_intermediate_base_is_user_output + ) + base_tensor_list = fw_outs_no_intermediate_bases + aliased_base_tensor = base_tensor_list[info.base_idx] + # TODO: handle the custom autograd function case here. + # We need a way to check whether a tensor came from a custom autograd fn from python, + # AND a way to replay that custom view fn. 
+ regenerated_out = gen_alias_from_base(aliased_base_tensor, o_, o_grad) + fw_outs_including_aliases.append(regenerated_out) + ret_outs = fw_outs_including_aliases + else: + ret_outs = fw_outs + + if runtime_metadata.dynamic_outputs: + for t, o in zip(ret_outs, runtime_metadata.output_info): + if o.dynamic_dims is None: + continue + if hasattr(t, "_dynamo_weak_dynamic_indices"): + t._dynamo_weak_dynamic_indices |= o.dynamic_dims + else: + t._dynamo_weak_dynamic_indices = o.dynamic_dims.copy() + if runtime_metadata.grad_enabled_mutation is not None: + torch.set_grad_enabled(runtime_metadata.grad_enabled_mutation) + return ret_outs + + return runtime_wrapper + + +# Calling convention: If we are running functionalized RNG, then outs consists +# of (user_outs, rng_offset) +def functionalized_rng_runtime_epilogue( + metadata: ViewAndMutationMeta, outs, return_new_outs=True +): + if metadata.is_rng_op_functionalized: + assert metadata.num_outputs_rng_offset == 1 + new_rng_offset = outs[-1] + CUDARngStateHelper.set_new_offset(new_rng_offset) + if return_new_outs: + user_outs = outs[:-1] + return user_outs + else: + return None + return outs + + +# This wrapper handles the AOTDispatch runtime logic for tensor subclasses. +# At runtime, we have a compiled function that knows how to operate on the domain of DenseTensor -> DenseTensor, +# But the user might have passed us some tensor subclass inputs (or expect some subclass tensor outputs). +# This function handles the wrapping and unwrapping of tensor subclasses at runtime. +def aot_dispatch_subclass_wrapper( + runtime_fn: Callable, + *, + subclass_metas: List[Union[int, SubclassCreationMeta]], + num_fw_outs_saved_for_bw: Optional[int], +) -> Callable: + def inner_fn(args): + unwrapped_args = unwrap_tensor_subclasses(args, is_joint_structure=False) + # expectation: runtime_fn is a boxed fn + unwrapped_outs = runtime_fn(unwrapped_args) + wrapped_outs = wrap_tensor_subclasses( + unwrapped_outs, + subclass_metas=subclass_metas, + num_fw_outs_saved_for_bw=num_fw_outs_saved_for_bw, + is_runtime=True, + ) + return wrapped_outs + + # box it + inner_fn._boxed_call = True # type: ignore[attr-defined] + return inner_fn + + +# MOTIVATION: +# +# When tracing functions for future execution, one must be careful not to pass +# in the same input tensor multiple times (e.g., f(x, x), as this can result +# in graphs that are ONLY valid if you later pass a new tensor in exactly the +# same way (e.g., f(y, y)). (NB: we really mean duplicate; two distinct +# tensors that alias each other is a different situation that is covered by +# aot_dispatch_deduplicated_autograd). Here are two examples: +# +# (1) Suppose you have a function: +# +# def f(x, y): +# return x + y +# +# If you make_fx(f)(x, x), you will trace out: +# +# def f(x, y): +# return y + y +# +# Oops! +# +# (2) For most tensors x and y, you can compute f's gradient with respect to +# these to inputs by saying torch.autograd.grad(f(x, y), (x, y)). However, +# if x is y, you will trace out a program that gets incorrect gradients: +# +# >>> x = torch.randn(1, requires_grad=True) +# >>> torch.autograd.grad(x + x, (x, x)) +# (tensor([2.]), tensor([2.])) +# +# In other words, the gradient is double-counted. Deduplicating the arguments +# gives you an appropriate gradient: +# +# >>> y = torch.randn(1, requires_grad=True) +# >>> torch.autograd.grad(x + y, (x, y)) +# (tensor([1.]), tensor([1.])) +# +# HOW TO DEDUPLICATE: +# +# There are a few strategies, in order of preference: +# +# 1. 
For every duplicate argument to the function, detach it into +# a separate leaf tensor, so that it is no longer duplicated. +# +# PRO: The resulting compiled graph works for any configuration +# of duplicated arguments. +# +# CON: It does not (naively) work if you mutate the metadata of inputs: +# +# def f(x, y): +# x.transpose_(0, 1) +# y.transpose_(0, 2) +# +# x = torch.randn(2, 3, 4) +# f(x, x) +# +# The ordering of the transposes inside f dictates whether or not +# you get [4, 2, 3] or [3, 4, 2]. This means that you cannot precompute +# what metadata mutations should get applied to each input; you need to +# assume they aren't duplicates (what we do today) or preserve +# the original metadata mutations exactly in order, so that they work +# for any duplicate configuration. +# +# CON: It does not (naively) work if you mutate the data of inputs. +# In particular, leaf tensors that require grad cannot be mutated, +# this makes it impossible to differentiate with respect to the original +# base. +# +# 2. For every duplicate argument to the function, remove it, so it is +# no longer part of the "true" signature: +# +# PRO: Implemented naively, it still works for metadata/data mutation. +# +# CON: The resulting compiled graph is duplicate-specialized: it only +# works if future calls duplicate arguments in exactly the same way. +# Horribly, Dynamo doesn't guard on this at the moment. But even if +# it did, you could still end up recompiling a bunch of each duplicate. +# +# Our strategy is to do (1) if we can, and do (2) otherwise, erroring if +# Dynamo's guards are not enough. In practice, this seems to cover +# everything. +# +def aot_wrapper_dedupe( + flat_fn, + flat_args: List[Tensor], + aot_config: AOTConfig, + *, + compiler_fn, + fw_metadata, +): + # Use information about whether or not flat_fn mutates its arguments + # or not to handle dupe args + + # Strategy 1: For any input that is not mutated, we can leafify it if we + # need to remove a duplicate. + leaf_flat_args = [] + args_set = set() + ok = True + + for i, a in enumerate(flat_args): + if not isinstance(a, torch.Tensor): + leaf_flat_args.append(a) + elif a not in args_set: + args_set.add(a) + leaf_flat_args.append(a) + elif ( + not fw_metadata.input_info[i].mutates_data + and not fw_metadata.input_info[i].mutates_metadata + ): + leaf_flat_args.append(a.detach().requires_grad_(a.requires_grad)) + else: + ok = False + break + + if ok: + return compiler_fn(flat_fn, leaf_flat_args, aot_config, fw_metadata=fw_metadata) + + if requires_subclass_dispatch(leaf_flat_args, fw_metadata): + raise RuntimeError( + """\ +Encountered duplicate inputs that are mutated in the graph, but at least one input/output +to the graph is a tensor subclass. This is not supported today. You can try to +remove the aliasing yourself as a workaround, or otherwise file an issue on github.""" + ) + + # export path: ban duplicate inputs for now, add later if requested. + if aot_config.is_export: + raise RuntimeError( + f"""\ +Encountered duplicated inputs that are mutated in the graph you are trying to export. +This functionality is currently not supported. If needed, please file a github issue. + +fw_metadata={str(fw_metadata)} + """ + ) + + # Strategy 2: Duplicate specialize. 
+ # + # In Haskell types, suppose you have: + # + # add_dupe_args :: DedupedArgs -> Args + # remove_dupe_args :: Args -> DedupedArgs + # + # compiler_fn + # :: (DedupedArgs -> R) -> DedupedArgs -> AOTConfig -> (DedupedArgs -> R) + # deped_compiler_fn + # :: (Args -> R) -> Args -> AOTConfig -> (Args -> R) + # + # Then the code below can be written in point-free style as: + # + # deduped_compiler_fn f a c = + # compiler_fn (f . add_dupe_args) (remove_dupe_args a) c . remove_dupe_args + # + # Suppose you have: + # + # [a, b, a, c] + # + # We want: + # + # remove_dupe_args([a, b, a, c]) == [a, b, c] + # add_dupe_args([a, b, c]) == [a, b, a, c] + # + # This is done via (respectively): + # + # seen_args = {a: 0, b: 1, c: 2} + # enumerate(add_dupe_map) = [ # how to get args from the deduped list + # (0, 0), + # (1, 1), + # (2, 0), + # (3, 2), + # ] + # keep_arg_mask = [True, True, False, True] + + seen_args: Dict[Tensor, int] = {} + keep_arg_mask = [] + # Implicitly map duped arg position (list index) to de-duped arg position + add_dupe_map: List[int] = [] + duped_arg_len = len(flat_args) + + j = 0 # index into deduped_flat_args + for t in flat_args: + if isinstance(t, torch.Tensor): + if t in seen_args: + keep_arg_mask.append(False) + add_dupe_map.append(seen_args[t]) + continue + seen_args[t] = j + + keep_arg_mask.append(True) + add_dupe_map.append(j) + j += 1 + assert ( + len(add_dupe_map) == duped_arg_len + ), f"Expects add_dupe_map to have length {duped_arg_len} but got {len(add_dupe_map)}" + + # NB: Hot path, avoid set lookups here + # TODO: Can avoid the zip here too, probably + def remove_dupe_args(args): + return [t for t, keep in zip(args, keep_arg_mask) if keep] + + def add_dupe_args(args): + return [args[add_dupe_map[i]] for i in range(duped_arg_len)] + + deduped_flat_args = remove_dupe_args(flat_args) + + # Update our input metadata to remove duped input metadata. + updated_fw_metadata = remove_dupe_metadata(fw_metadata, keep_arg_mask, add_dupe_map) + + if ( + tracing_context := TracingContext.try_get() + and aot_config.aot_autograd_arg_pos_to_source + ): + # TODO(voz): This structure is 1:1, we could consider an alternate structure like + # kept_pos:[dupe_arg_pos], however, add_dupe_map is 1:1 so we would need a new structure there, + # which feels like needless complexity for a tiny bit of efficiency at this point. 
+ for dupe_arg_pos, (kept_pos, keep_arg) in enumerate( + zip(add_dupe_map, keep_arg_mask) + ): + if not keep_arg: + dupe_arg_source = aot_config.aot_autograd_arg_pos_to_source[ + dupe_arg_pos + ] + kept_arg_source = aot_config.aot_autograd_arg_pos_to_source[kept_pos] + tracing_context.guards_context.aotautograd_guards.append( # type: ignore[attr-defined] + DuplicateInputs(kept_arg_source, dupe_arg_source) + ) + + @wraps(flat_fn) + def wrapped_flat_fn(*args): + return flat_fn(*add_dupe_args(args)) + + if config.debug_assert: + ref_fw_metadata = run_functionalized_fw_and_collect_metadata( + wrapped_flat_fn, + keep_input_mutations=fw_metadata.keep_input_mutations, + is_train=fw_metadata.is_train, + )(*deduped_flat_args) + assert ( + ref_fw_metadata == updated_fw_metadata + ), f"ref_metadata={str(ref_fw_metadata)}, actual_metadata={str(updated_fw_metadata)}" + + compiled_fn = compiler_fn( + wrapped_flat_fn, deduped_flat_args, aot_config, fw_metadata=updated_fw_metadata + ) + + if not hasattr(compiled_fn, "_boxed_call"): + compiled_fn = make_boxed_func(compiled_fn) + + @wraps(compiled_fn) + def wrapped_compiled_fn(args): + deduped_args = remove_dupe_args(args) + args.clear() + return compiled_fn(deduped_args) + + wrapped_compiled_fn._boxed_call = True # type: ignore[attr-defined] + + # This can be uncommented when we properly guard for duplicates, + # but right now we must not do it. + # if not config.debug_assert: + # return wrapped_compiled_fn + + @wraps(wrapped_compiled_fn) + def debugged_compiled_fn(args): + # Test that the computed remove/add arg functions are an inverse + new_args = add_dupe_args(remove_dupe_args(args)) + seen: Dict[Any, None] = {} + for i, (x, y) in enumerate(zip(new_args, args)): + seen[y] = None + assert x is y, format_guard_bug_msg( + aot_config, + f"{describe_input(i, aot_config)} would be a duplicate of " + f"{describe_input(add_dupe_map[i], aot_config)}", + ) + # This is only an error if there is metadata mutation on both of + # the duped arguments; in this case, we need to know what order + # the metadata mutation applies in. You'll get the correct result + # otherwise, because a graph that assumes distinct inputs works if + # you dupe the inputs (the gradient contributions from each input + # will get summed up appropriately.) + # + # TODO: work out how to setup this assert correctly + """ + assert len(seen) == unique_args, format_guard_bug_msg(aot_config, + f"there would be {unique_args} distinct arguments" + ) + """ + return wrapped_compiled_fn(args) + + debugged_compiled_fn._boxed_call = True # type: ignore[attr-defined] + + return debugged_compiled_fn + + +# This layer handles the situation where you have two inputs that alias each other, +# and one of the inputs is mutated. +# We need to take special care to ensure that the mutation is applied to the other aliases in the graph. +# +# pre-condition: aot_wrapper_dedup has already run. +# (This function will in theory work if there are duplicate args. +# However, the synthetic base code path is a bit sub-optimal, and running with dupe'd inputs +# would cause us to hit that path more frequently). +def aot_wrapper_synthetic_base( + flat_fn, + flat_args: List[Tensor], + aot_config: AOTConfig, + *, + fw_metadata: ViewAndMutationMeta, + # Currently, the only reason we need to plumb this bool is because + # the synthetic base code prohibits more cases in the autograd case than the inference case. 
+ needs_autograd: bool, + compiler_fn, +): + is_inference = not needs_autograd + flat_args_with_synthetic_bases, synthetic_base_info = merge_view_inputs( + flat_args, + fw_metadata.input_info, + is_inference=is_inference, + ) + # Happy path: we don't need synthetic bases + if synthetic_base_info is None: + return compiler_fn(flat_fn, flat_args, aot_config, fw_metadata=fw_metadata) + + # export path: ban synthetic bases for now, add later if requested. + if requires_subclass_dispatch(flat_args, fw_metadata): + raise RuntimeError( + """\ +Encountered aliased inputs that are mutated in the graph, but at least one input/output +to the graph is a tensor subclass. This is not supported today. You can try to +remove the aliasing yourself as a workaround, or otherwise file an issue on github.""" + ) + + if aot_config.is_export: + raise RuntimeError( + f"""\ +Encountered aliased inputs that are mutated in the graph you are trying to export. +This functionality is currently not supported. If needed, please file a github issue. + +synthetic_base_info={str(synthetic_base_info)} + +fw_metadata={str(fw_metadata)} + """ + ) + + assert len(fw_metadata.input_info) == len(synthetic_base_info) + + # Update our forward metadata to take synthetic bases into account + ( + fw_metadata_updated, + aliased_arg_idx_with_metadata_mutations, + ) = create_synthetic_base_metadata( + fw_metadata, synthetic_base_info, flat_args, flat_args_with_synthetic_bases + ) + + num_aliased_args_with_metadata_mutations = len( + aliased_arg_idx_with_metadata_mutations + ) + + def _unpack_synthetic_bases(primals: Tuple[Any, ...]) -> List[Any]: + f_args_inner = [] + for inner_idx_or_tuple in synthetic_base_info: + if isinstance(inner_idx_or_tuple, int): + f_args_inner.append(primals[inner_idx_or_tuple]) + else: + inner_base_idx, view_tensor = inner_idx_or_tuple + base = primals[inner_base_idx] + view_arg = gen_alias_from_base( + base, view_tensor, view_tensor.requires_grad + ) + f_args_inner.append(view_arg) + return f_args_inner + + @wraps(flat_fn) + def wrapped_flat_fn(*args): + unpacked_args = _unpack_synthetic_bases(args) + # This is a bit subtle. The goal of this entire function (aot_dispatch_synthetic_bases) + # is to relieve the downstream logic from having to reason about mutations on inputs that alias + # each other, by replacing aliased inputs with a synthetic base. + # One area where this breaks down a bit however is if one of those aliased inputs + # experienced a metadata mutation. + # We are now obligated to reapply the metadata mutation directly to the user's input; + # it isn't enough to apply mutations back to the synthetic base in the downstream logic. + # + # The way we handle this is by pretending that those aliased inputs that experience metadata mutations + # are additional outputs in the user's forward function. + # The downstream logic will just treat these as "user outputs that alias inputs". + # However, we will manually grab them at runtime here, use them to reapply the metadata mutation + # to the user inputs, and not return them to the user. 
+ aliased_args_with_metadata_mutations = [ + x + for i, x in enumerate(unpacked_args) + if i in aliased_arg_idx_with_metadata_mutations + ] + if len(aliased_args_with_metadata_mutations) > 0: + return *(flat_fn(*unpacked_args)), *aliased_args_with_metadata_mutations + else: + return flat_fn(*unpacked_args) + + if config.debug_assert: + ref_fw_metadata = run_functionalized_fw_and_collect_metadata( + wrapped_flat_fn, + keep_input_mutations=fw_metadata.keep_input_mutations, + is_train=fw_metadata.is_train, + )(*flat_args_with_synthetic_bases) + assert ref_fw_metadata == fw_metadata_updated, ( + f"ref_metadata={pprint.pformat(partial_flatten_asdict(ref_fw_metadata))}, " + f"\nactual_metadata={pprint.pformat(partial_flatten_asdict(fw_metadata_updated))}" + ) + + compiled_fn = compiler_fn( + wrapped_flat_fn, + flat_args_with_synthetic_bases, + aot_config, + fw_metadata=fw_metadata_updated, + ) + + if not hasattr(compiled_fn, "_boxed_call"): + compiled_fn = make_boxed_func(compiled_fn) + + @wraps(compiled_fn) + def wrapped_compiled_fn(args): + args_with_synthetic_bases, synthetic_base_info = merge_view_inputs( + args, fw_metadata.input_info, is_inference=is_inference + ) + assert synthetic_base_info is not None + aliased_args_w_metadata_mutations = [ + args[i] for i in aliased_arg_idx_with_metadata_mutations + ] + args.clear() + outs = compiled_fn(args_with_synthetic_bases) + if num_aliased_args_with_metadata_mutations > 0: + # This code does not handle **all** input metadata mutations. + # Instead, it only handles metadata mutations on inputs that were converted into synthetic bases + # (which only happens if at least one aliased input experienced a data mutation). + # e.g: + # def f(a, b): + # a.mul_(2) + # b.t_(1, 0) + # f(x.view(2, 2), x.view(2, 2)) + mutated_metadata_inps = outs[-num_aliased_args_with_metadata_mutations:] + user_outs = outs[:-num_aliased_args_with_metadata_mutations] + for inp, mutated_inp in zip( + aliased_args_w_metadata_mutations, mutated_metadata_inps + ): + inp.as_strided_( + mutated_inp.size(), + mutated_inp.stride(), + mutated_inp.storage_offset(), + ) + return user_outs + return outs + + return wrapped_compiled_fn + + +# Note [Handling mutations on an input that aliases other inputs] +# The easiest example to show-case this edge case is here: +# +# def f(a, b): +# a.mul_(2) +# out = a + b +# return out +# b = torch.ones(...) +# a = b.view(-1) +# f(a, b) +# +# In this situation, if a and b happened to be aliased, we need to trace something different! +# Suppose we had b = a.view(-1) +# (In this case, that means that `a._base is b`) +# +# We need to ensure that the aliasing relationship between a and b is preserved. +# We do that detecting the specific situation above (mutate an input that aliases another input), +# and when we do that, we create a synthetic base argument. Then inside of the traced forward, +# we regenerate a and b off of that base. +# The complete example of the transformed function looks like this: +# +# // The traced forward takes in a synthetic base, and regenerates the aliased inputs as views +# // We could consider getting view-replay support here to minimize as_strided_scatter ops in the graph +# def traced_forward(base): +# a = base.as_strided(...) +# b = base.as_strided(...) +# a_updated = a.mul(2) +# base_updated = torch.as_strided_scatter(base, a_updated, ...) +# b_updated = base_updated.as_strided(...) 
+# out = a_updated + b_updated +# return a_updated, out +# +# def compiled_fn(a, b): +# // we detect that a is the "differentiable base" here +# base = a +# // In other situations, we might do either: +# // (1) a and b are both views off of some larger differentiable base +# // assert a._base is b._base and a._base is not None +# // base = a._base +# // (2) a and b both don't require gradients. Create a base from the storage +# // assert a._base is None and b._base is None +# // base = torch.Tensor(a.storage()) +# a_updated, out = traced_forward(base) +# a.copy_(a_updated) +# return out +# +# This function: +# (1) Merges input views into a synthetic base argument, when any of those input views are mutated +# (2) Returns metadata telling the autograd.Function how to modify their arguments properly, +# to respect the new calling convention. +# +# The calling convention is as follows. +# Any inputs that were originally views of one another get yanked, and replaced with a synthetic base. +# The argument list ordering goes [base1, ..., baseN], [arg1, ..., argN], +# Where the ordering of the bases is determined from the ordering of the original view args. +# baseA will come before baseB if the earliest original argument coming from baseA +# showed up earlier in the argument list than the earliest original argument coming from baseB. +# +# Example, given some tensors a, b, c, d +# call site: +# f(a, c.view(-1), b.view(-1), b, c, d) +# Modified argument list: +# c_base comes first because the first c view came earlier in arg list than the first b view +# a and d still show up in the modified arg list, but b and c don't- they're regenerated from their bases +# b_base = torch.Tensor(b.storage()) +# c_base = torch.Tensor(c.storage()) +# f(c_base, b_base, a, d) +def merge_view_inputs( + fwd_inputs: List[Any], + mutated_input_info: List[InputAliasInfo], + *, + # The autograd case currently has more restrictions than the inference case. + is_inference: bool, +) -> Tuple[List[Any], Optional[List[Union[int, Tuple[int, torch.Tensor]]]]]: + def _are_differentiable_views(view1, view2): + if view1 is view2: + return True + if view1._base is None and view2._base is None: + return False + if view1._base is view2._base or view1._base is view2 or view1 is view2._base: + return True + return False + + def _same_dtype_views(view1, view2): + if view1.dtype != view2.dtype: + return False + if view1._base is not None and view1.dtype != view1._base.dtype: + return False + if view2._base is not None and view2.dtype != view2._base.dtype: + return False + return True + + assert len(fwd_inputs) == len(mutated_input_info) + storage_ref_to_idx: Dict[StorageWeakRef, List[int]] = collections.defaultdict(list) + base_args = [] + other_args = [] + for i, inpt in enumerate(fwd_inputs): + if isinstance(inpt, Tensor): + storage_ref = StorageWeakRef(inpt.untyped_storage()) + storage_ref_to_idx[storage_ref].append(i) + else: + other_args.append(inpt) + # Note [Synthetic Base Info Metadata] + # This list contains metadata that tells you what the i'th argument in the inner calling convention should be. 
+ # It's either: + # - another int (corresponding to the index in the argument list of the element from the outer calling convention) + # - idx, view_tensor, where we can generate the new output with view_tensor._view_func(old_args[idx]) + # idx corresponds to which synthetic base from the outer calling context to view + inner_calling_convention_meta: Dict[int, Union[int, Tuple[int, torch.Tensor]]] = {} + for aliased_input_indices in storage_ref_to_idx.values(): + if len(aliased_input_indices) <= 1 or not any( + # We only care about mutations that affect all aliases, + # so metadata mutations on an input doesn't require us to do synthetic base handling. + mutated_input_info[inpt_idx].mutates_data + for inpt_idx in aliased_input_indices + ): + for curr_idx in aliased_input_indices: + other_args.append(fwd_inputs[curr_idx]) + continue + + # Here, we attempt to do a more complicated check to detect false aliasing + # (e.g. if all the tensors have the same storage, but don't actually overlap) + # In theory, we could have a large group of tensors that all share storages, where only *some* of them + # have overlapping memory. + # I don't bother with that case for now: here, we only bail out earlier if we detect that **every** pair + # of tensors in the current group that shares a storage is non-overlapping. + aliased_input_indices_no_false_sharing = compute_overlapping_inputs( + fwd_inputs, aliased_input_indices + ) + if len(aliased_input_indices_no_false_sharing) <= 1: + for curr_idx in aliased_input_indices: + other_args.append(fwd_inputs[curr_idx]) + continue + + # We detected an input that was mutated, AND aliases with another input. + # we need to replace this set of aliased inputs with a single synthetic base. + # For now, I'm banning a bunch of cases. We expect dynamo to properly detect these cases + # and error out. We can fix them later. + # These checks are transitive, so we don't need to check every pair. + for idx1, idx2 in zip( + aliased_input_indices, aliased_input_indices[1:], strict=False + ): + view1 = fwd_inputs[idx1] + view2 = fwd_inputs[idx2] + # The "inputs that are aliased but have different differentiable bases" case + # is more complicated and hopefully pretty rare. Not currently handled. + if not is_inference: + assert _are_differentiable_views( + view1, view2 + ), "aot_autograd() does not yet handle non-differentiable view input mutations." + # Regenerating views when reinterpreting complex / real tensors seems non-trivial, + # not handling for now + assert _same_dtype_views( + view1, view2 + ), "aot_autograd() does not yet handle input mutations on views with different dtypes." + non_none_bases = [ + fwd_inputs[i]._base + for i in aliased_input_indices + if fwd_inputs[i]._base is not None + ] + aliases_with_none_bases = [ + fwd_inputs[i] for i in aliased_input_indices if fwd_inputs[i]._base is None + ] + if len(non_none_bases) == 0: + # Case where none of the aliases have a ._base + # we generate a synthetic base without gradients, and generate views off of it + # We hit this case when we have input tensors to the graph that share a storage, + # but do not have a ._base field. + # Wondering when we hit this case? + # The _base field simply says that autograd knows about the aliasing relationship, + # but sometimes we create tensors which are aliased out of the same storage but guaranteed + # to be disjoint. 
In these cases, we will skip setting up the _base relationship + # for performance reasons (because the fact that the tensors share the same storage + # is unobservable unless you (1) do naughty things with resize_/as_strided + # or (2) look at the storage--as we are doing here.) + # One particular example of this is optimizer steps on the LSTM module: + # LSTM parameters are packed into a contiguous storage for efficiency reasons when + # calling cuDNN kernels, so when these parameters get passed to the optimizer we will + # find they share the same storage, but do not have _base set since they are all disjoint. + # + # NOTE: There is one case where this is unsafe: + # torch.Tensor(storage) will ALWAYS create a 1D tensor, which is not necessarily + # the same shape as the "actual" base that the tensor came from. + # For the most part this is fine, because we always use as_strided() + # to generate the original aliased inputs again. + # If we were to use view-replay though, this could cause the aliased views + # to have incorrect sizes. + example_idx = aliased_input_indices[0] + example_alias = fwd_inputs[example_idx] + # Note that this function is re-used at both trace time and runtime. + # At trace time, we're under a FakeMode so synthetic_base becomes a FakeTensor. + synthetic_base = torch.empty( + (0,), dtype=example_alias.dtype, device=example_alias.device + ) + # We don't actually have a convenient way of going from storage -> tensor, + # So using set_() here (we suffer some minor overhead, but this case is rare). + synthetic_base.set_(example_alias.untyped_storage()) + else: + # Case where all of the aliases require gradients, and have the same _base. + synthetic_base = non_none_bases[0] + for other_base in non_none_bases[1:]: + assert ( + other_base is synthetic_base + ), "aot_autograd() does not yet handle non-differentiable view input mutations." + for alias in aliases_with_none_bases: + assert ( + alias is synthetic_base + ), "aot_autograd() does not yet handle non-differentiable view input mutations." + base_args.append(synthetic_base) + for curr_view_idx in aliased_input_indices: + curr_view = fwd_inputs[curr_view_idx] + base_idx = len(base_args) - 1 + # We store just enough info here so that we can regenerate the view later. + # Regeneration: curr_view._view_func(args[base_idx]) + inner_calling_convention_meta[curr_view_idx] = (base_idx, curr_view) + if len(base_args) == 0: + assert len(other_args) == len(fwd_inputs) + # If no synthetic bases are necessary, just return the original inputs. + return fwd_inputs, None + else: + # Otherwise, return: + # (1) The new args according to the updated calling convention: (synthetic_bases, other_args) + # (2) Metadata telling functionalization how to generate the inner argument list given the outer calling convention. + # We post-process it into a list, where meta[i] tells you info about the i'th argument in the inner calling convention. 
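+        # A rough sketch (hypothetical call, not taken from a test): for
+        #   f(a, b.view(-1), b.view(-1))
+        # where at least one of the two views of b is data-mutated, we'd end up with
+        # base_args = [b] (the shared base of the two views) and other_args = [a],
+        # so the returned args are (b, a) and the post-processed meta looks like
+        #   [1, (0, first_view), (0, second_view)]
+        # i.e. original arg 0 (a) now lives at new index 1, while original args 1 and 2
+        # are regenerated as views of new arg 0 (the synthetic base).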
+ args_to_functionalization = base_args + other_args + arg_to_old_idx_map = {arg: i for (i, arg) in enumerate(fwd_inputs)} + for i, other_arg in enumerate(other_args): + new_idx = len(base_args) + i + old_idx = arg_to_old_idx_map[other_arg] + inner_calling_convention_meta[old_idx] = new_idx + # post process into a list + post_processed_calling_convention_meta: List[ + Union[int, Tuple[int, torch.Tensor]] + ] = [-1 for _ in range(len(inner_calling_convention_meta))] + for k, v in inner_calling_convention_meta.items(): + post_processed_calling_convention_meta[k] = v + # Quick assert: every argument in the inner calling convention should be accounted for. + for x in post_processed_calling_convention_meta: + assert x != -1 + return args_to_functionalization, post_processed_calling_convention_meta diff --git a/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/schemas.py b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/schemas.py new file mode 100644 index 0000000000000000000000000000000000000000..25a435f1fa6da6685d2909c9e93ecf2c392a7351 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/schemas.py @@ -0,0 +1,696 @@ +""" +The various dataclasses, Enums, namedtuples etc used in AOTAutograd. This includes +input/output types, metadata, config, function signatures etc. +""" + +import collections +import functools +from dataclasses import dataclass, field +from enum import Enum +from typing import Any, Callable, Dict, List, NewType, Optional, Set, Tuple, Union + +import torch +import torch.utils._pytree as pytree +from torch._guards import Source +from torch._subclasses import FakeTensor +from torch._subclasses.fake_tensor import is_fake + +from .. import config + +from .functional_utils import _check_if_mutation_can_be_in_graph +from .utils import strict_zip + +zip = strict_zip + +OutputType = Enum( + "OutputType", + ( + # output is not an alias + "non_alias", + # output aliases an input + "alias_of_input", + # output **is** an input tensor + "is_input", + # output has a ._base tensor, which is a graph intermediate. + # We need to return its ._base as a graph output, + # so its requires_grad info is populated correctly. + # Instructs the runtime code to regenerate the current output + # from a base tensor, graph_intermediates[base_idx] + "alias_of_intermediate_save_as_output", + # Same as above; but we don't need to explicitly add its ._base + # as a graph output, because it already **is** a graph output. + "alias_of_intermediate", + # Same as above; but the output's ._base is **already** a user output. + # Instructs the runtime code to regenerate the current output from + # a base tensor, user_outputs[base_idx] + "alias_of_intermediate_base_is_user_output", + # See Note [Intermediate Bases Optimization] + "unsafe_view_alias", + # output is an alias, but has a custom autograd.Function backward. + # In this case, we don't want to do view-replay, since we won't be able to replay the custom function. + # Instead, we'll treat this output "normally", and trace its backward into the graph. + "custom_function_view", + ), +) + + +# This class stores info about every user output. 
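+# A rough sketch (hypothetical function, not taken from this file): for
+#   def f(x): return x, x.view(-1), x + 1
+# the three outputs would be recorded as OutputType.is_input, OutputType.alias_of_input
+# and OutputType.non_alias respectively, with base_idx pointing back at input 0 for the
+# first two and left as None for the last.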
+@dataclass(frozen=True) +class OutputAliasInfo: + # Tells us if this output is: + # (1) a regular (non-aliased) output + # (2) an alias of a forward input + # (3) **is** a forward input (special case of "alias_of_input") + # (4) an alias of an intermediate (aka an alias of an output of the inner traced forward) + # (5) an alias of an intermediate, that explicitly requires returning the intermediate + # as a graph output + # (6) an alias of an intermediate, where that intermediate is also a user output + output_type: OutputType + # The raw type of the output (torch.Tensor, SymInt, etc) + raw_type: type + # If (1) above, then + # - base_idx is None + # If (2) or (3) above, then + # - Tells us that the base of this alias is user_fwd_input[base_idx] + # (This is an index into the inputs *before* we make synthetic bases) + # If (4) or (5) above, then + # - Tells us that the base of this alias is output_graph_intermediates[base_idx] + # here, this refers to the index of the *direct* traced + # If (6) above, then: + # - Tells us that the base of this alias is output_user_fwds[base_idx] + # here, this refers to the index of the *direct* traced + base_idx: Optional[int] + # If it is a Tensor, what the dynamic dims are (otherwise is None) + dynamic_dims: Optional[Set[int]] + # requires_grad + requires_grad: bool + + +class MutationType(Enum): + NOT_MUTATED = 1 + MUTATED_IN_GRAPH = 2 + MUTATED_OUT_GRAPH = 3 + + +# This class tells us info about user inputs. +@dataclass(frozen=True) +class InputAliasInfo: + is_leaf: bool + mutates_data: bool + mutates_metadata: bool + mutations_hidden_from_autograd: bool + mutations_under_no_grad_or_inference_mode: bool + mutates_storage_metadata: bool + requires_grad: bool + keep_input_mutations: bool + + def __post_init__(self): + if self.mutates_storage_metadata: + # For convenience, we guarantee that this is always true. + # In practice, If we call .set_(), then at runtime there is no need + # to additionally fix up the tensor metadata, since our runtime + # call to inp.set_(updated_inp) will already have the right metadata + assert self.mutates_metadata + + @functools.cached_property + def mutation_type(self) -> MutationType: + if (not self.mutates_data) and (not self.mutates_metadata): + return MutationType.NOT_MUTATED + + if _check_if_mutation_can_be_in_graph( + self.keep_input_mutations, + self.mutates_data, + self.mutates_metadata, + self.mutations_hidden_from_autograd, + self.mutations_under_no_grad_or_inference_mode, + self.requires_grad, + ): + return MutationType.MUTATED_IN_GRAPH + + return MutationType.MUTATED_OUT_GRAPH + + +@dataclass +class SubclassCreationMeta: + """ + Used for AOTDispatch. + This dataclass gives us the information we need to reconstruct a tensor subclass + from our flat inputs. + Why is this important? The graph that we'd like to trace out contains flat tensor inputs, + But the user's original model may have subclass inputs and outputs. + So we need to wrap/unwrap subclasses as necessary to translate between the user's + view (subclass inps/outs), and the backend compiler's view (graph with no subclass args). + + Complications arise mostly from the fact that a subclass can hold more than one inner tensor; + So for a given subclass input/output, we need to carefully track which indices map + to the subclass tensor in the corresponding "dense-tensor-only" graph. 
+ """ + + # In the inner graph that only takes in dense tensor inputs, + # this maps to the first index of "tensors that should go in this subclass wrapper" + flat_tensor_start_idx: int + # The number of tensors that live in this subclass wrapper + arg_count: int + # Stores the original subclass itself. + # This is needed because we need the autograd metadata on the original subclass + # (this is guaranteed to be a wrapper subclass that holds a fake tensor, + # so holding onto this at runtime shouldn't leak memory) + original_subclass: torch.Tensor + # meta and inner_keys are produced by the subclass's __tensor_flatten__. + # We need to keep them around along with outer_size / outer_stride to plumb them + # into __tensor_unflatten__. + meta: Any + inner_keys: List[Any] + outer_size: Tuple[int, ...] + outer_stride: Tuple[int, ...] + + def creation_fn(self, all_args, *, is_runtime: bool): + curr_args = all_args[ + self.flat_tensor_start_idx : self.flat_tensor_start_idx + self.arg_count + ] + assert len(curr_args) == len( + self.inner_keys + ), f"inner_keys: {str(self.inner_keys)}. len(curr_args): {len(curr_args)}" + # NB: Sometimes we have real inner tensors and symbolic metadata. + # TODO: Resolve this so we always have matching real / symbolic tensors / metadata. + out = type(self.original_subclass).__tensor_unflatten__( # type: ignore[attr-defined] + dict(zip(self.inner_keys, curr_args)), + self.meta, + self.outer_size, + self.outer_stride, + ) + if not is_runtime: + # After wrapping up the inner dense tensors into a subclass, we need to make sure that our new wrapper + # has correct autograd metadata, since we'll be tracing through the autograd engine with the subclass. + # We don't trace through the autograd engine at runtime though, so no need + # to compute this extra metadata then! + torch._mirror_autograd_meta_to(self.original_subclass, out) # type: ignore[attr-defined] + + return out + + def __post_init__(self): + # sanity assert to make sure we don't leak memory + assert is_fake(self.original_subclass) + + +# This class encapsulates all aliasing + mutation info we need about the forward graph +# See a more detailed overview of the edge case handling at +# https://docs.google.com/document/d/19UoIh_SVrMy_b2Sx5ZaeOJttm6P0Qmyss2rdBuyfoic/edit +@dataclass(eq=False) +class ViewAndMutationMeta: + # length = # user inputs + # This gives us info about every input, and what sort of mutation happened to it (if any) + input_info: List[InputAliasInfo] + + # length = # user outputs + # This gives us info about every output (mostly around whether it aliases other tensors) + output_info: List[OutputAliasInfo] + + # length = the number of intermediate bases appended as outputs to the end of the forward graph. + # Note: this is not necessarily the same thing as: + # len([x for x in output_info if x.output_type == OutputType.alias_of_intermediate]) + # Because outputs might share a ._base, or an output's ._base might itself be + # another user output (in both cases, we won't redundantly append bases to the end of the graph) + num_intermediate_bases: int + + # For inference only: instructs us to keep data-only input mutations directly in the graph + keep_input_mutations: bool + + # length = (# inputs w data mutations) + (# user outputs that are non_aliasing tensors) + # + (# intermediate bases) + # These are the FakeTensor (or potential SymInt) outputs that we traced from our + # metadata pass of the user's forward function. 
+ # Their only use today is to pass them as a best-guess for tangents when tracing the joint. + # Stashing them as part of our "metadata" makes it simpler if we want to run our analysis + # pass once, and re-use the output throughout AOTAutograd + traced_tangents: List[Any] + + # Each of these is a list telling us about subclasses for the inputs/outputs/grad_outs + # They are used throughout AOTDispatch to tell us how to generate a list of subclass tensors, + # Given a (potentially larger) list of plain torch tensors. + + # Taking subclass_inp_meta as an example: + # subclass_inp_meta[i] = j (an int) tells us: + # "The i'th user input is not a subclass, and corresponds to inputs[j] of the plain-tensor graph." + # subclass_inp_meta[i] = SubclassCreationMeta(flat_tensor_start_idx=3, arg_count=2) + # "The i'th user input is subclass holding two inner tensors, which are + # inputs[3] and inputs[4] of the plain-tensor graph". + + # length = # user inputs + subclass_inp_meta: List[Union[int, SubclassCreationMeta]] + # So, the full set of outputs to the forward graph looks something like: + # (*mutated_inps, *user_outs, *intermediate_bases, *saved_for_bw_tensors) + # where the first 3 of those 4 can be subclasses + # (but not saved_for_bw tensors, since these are internal to the compiler + # and not user visible, so there's no point in wrapping/unwrapping them at runtime). + # This list contains subclass information on all of the fw graph outputs + # except for saved_for_bw_tensors. + subclass_fw_graph_out_meta: List[Union[int, SubclassCreationMeta]] + # length = # backward graph inputs + subclass_tangent_meta: List[Union[int, SubclassCreationMeta]] + # TODO: we should kill this + # (need to default it to not break internal) + is_train: bool = False + + num_symints_saved_for_bw: Optional[int] = None + + # The grad_enabled mutation that will be emitted in the runtime_wrapper epilogue + # NOTE: AOTAutograd will assume that the ambient `is_grad_enabled` is the grad mode + # that is intended to be in effect prior to running the graph, in keeping with + # equivalence to eager mode. It is the responsibility of upstream graph acquisition + # to reset the grad mode to its pre-graph value prior to calling aot_autograd. + grad_enabled_mutation: Optional[bool] = None + + # Keeps track of whether `torch.use_deterministic_algorithms` was turned on + # when the forward was run. If deterministic mode was turned off during the + # forward, but is turned on during the backward call, then an error is + # raised + deterministic: Optional[bool] = None + + # Map of effect type (ex. _EffectType.ORDERED) to token. If there are + # side-effectful operators, FunctionalTensorMode will populate this + # dictionary telling us how many tokens we will need during tracing. + tokens: Dict[Any, torch.Tensor] = field(default_factory=dict) + + def __post_init__(self): + # pre-compute the indices of the inputs that are mutated. + # When keep_input_mutations is set, we don't need to worry about our epilogue + # handling data-only mutations, because we keep them directly in the graph. 
+ + mutated_inp_runtime_indices = [ + i + for i, m in enumerate(self.input_info) + if (m.mutation_type == MutationType.MUTATED_OUT_GRAPH) + ] + + mutated_graph_handled_indices = [ + i + for i, m in enumerate(self.input_info) + if m.mutation_type == MutationType.MUTATED_IN_GRAPH + ] + self.mutated_graph_handled_indices = mutated_graph_handled_indices + self.num_mutated_graph_handled_indices = len(self.mutated_graph_handled_indices) + + mutated_graph_handled_indices_seen_by_autograd = [ + i + for i in mutated_graph_handled_indices + if not self.input_info[i].mutations_hidden_from_autograd + ] + + self.mutated_graph_handled_indices_seen_by_autograd = ( + mutated_graph_handled_indices_seen_by_autograd + ) + self.num_mutated_graph_handled_indices_seen_by_autograd = len( + self.mutated_graph_handled_indices_seen_by_autograd + ) + + aliased_out_indices = [ + i + for i, m in enumerate(self.output_info) + if m.output_type + not in [ + OutputType.non_alias, + OutputType.unsafe_view_alias, + OutputType.custom_function_view, + ] + ] + unsafe_view_out_indices = [ + i + for i, m in enumerate(self.output_info) + if m.output_type is OutputType.unsafe_view_alias + ] + + # This is pre-computed in post_init for perf. + # It contains the index of every element + # of input_info that corresponds to a mutation (data or metadata or both) + self.mutated_inp_runtime_indices = mutated_inp_runtime_indices + self.num_mutated_inp_runtime_indices = len(self.mutated_inp_runtime_indices) + + # This is pre-computed for perf. + # It contains the index of every element + # of output_info that corresponds to an alias (either of an input or intermediate) + self.aliased_out_indices = aliased_out_indices + self.unsafe_view_out_indices = unsafe_view_out_indices + self.num_outputs = len(self.output_info) + self.num_outputs_non_aliased = len( + [ + x + for x in self.output_info + if x.output_type + in [ + OutputType.non_alias, + OutputType.unsafe_view_alias, + OutputType.custom_function_view, + ] + ] + ) + self.num_outputs_aliased_to_inputs = len( + [ + x + for x in self.output_info + if x.output_type + in [ + OutputType.alias_of_input, + OutputType.is_input, + ] + ] + ) + self.num_unsafe_view_outputs = len(self.unsafe_view_out_indices) + self.num_outputs_aliased_to_intermediates = len( + [ + x + for x in self.output_info + if x.output_type + in [ + OutputType.alias_of_intermediate, + OutputType.alias_of_intermediate_save_as_output, + OutputType.alias_of_intermediate_base_is_user_output, + ] + ] + ) + self.num_outputs_aliased = ( + self.num_outputs_aliased_to_inputs + + self.num_outputs_aliased_to_intermediates + ) + + self.dynamic_outputs = any(o.dynamic_dims for o in self.output_info) + # See Note: [AOTAutograd Backward Guards] + # This is pre-computed for fast asserts on the types of our grad_outputs in the backward. + # Eventually, we should kill this and replace with real backward guards. + # (we want to precompute the "runtime" types, so replace FakeTensor with torch.Tensor) + self.output_types = [ + torch.Tensor if isinstance(x, FakeTensor) else type(x) + for x in self.traced_tangents + ] + + self.is_rng_op_functionalized = config.functionalize_rng_ops + # All of the above metadata is collected by tracing the fw function. + # However, extra outputs for rng offsets behave differently. Both fwd + # and bwd graphs have their own outputs for the total consumed offsets. + # Unlike mutated inputs, we don't have to worry about sending the right + # set of tensors between fwd and bwd. 
Fwd and bwd offsets are + # independent and simpler to handle. Therefore, we track them + # separately. + self.num_outputs_rng_offset = 1 if self.is_rng_op_functionalized else 0 + + # Our forward() returns both (mutated_inputs, outputs, output_intermediate_bases, saved_tensors, saved_symints) + self.num_forward_returns = ( + self.num_mutated_inp_runtime_indices + + self.num_outputs + + self.num_intermediate_bases + ) + # In case of functionalization of rng ops, the fw_module returns one + # additional output for rng offset. This rng offset is used right + # away to advance the rng state, and is not passed on to the raw + # outputs. However, we need to know the exact boundary to identify + # which tensors to be saved for the bwd graph. num_forward captures + # this information. + self.num_forward = self.num_forward_returns + self.num_outputs_rng_offset + + @property + def tensors_saved_for_backwards_slice(self): + assert self.num_symints_saved_for_bw is not None + if self.num_symints_saved_for_bw > 0: + return slice(self.num_forward, -self.num_symints_saved_for_bw) + else: + return slice(self.num_forward, None) + + @property + def symints_saved_for_backwards_slice(self): + assert self.num_symints_saved_for_bw is not None + if self.num_symints_saved_for_bw > 0: + return slice(-self.num_symints_saved_for_bw, None) + else: + return slice(0, 0) # empty slice + + def __eq__(self, other): + if not isinstance(other, ViewAndMutationMeta): + return NotImplemented + return ( + self.input_info == other.input_info + and self.output_info == other.output_info + and self.num_intermediate_bases == other.num_intermediate_bases + and self.keep_input_mutations == other.keep_input_mutations + and self.is_rng_op_functionalized == other.is_rng_op_functionalized + and self.num_outputs_rng_offset == other.num_outputs_rng_offset + and len(self.traced_tangents) == len(other.traced_tangents) + and all( + x.shape == y.shape and x.dtype == y.dtype + for x, y, in zip(self.traced_tangents, other.traced_tangents) + ) + ) + + +@dataclass(eq=False) +class SubclassMeta: + # A copy of all forward metadata, but computed on the *dense* tensor forward (after desugaring subclasses) + # So for example, if the user had a model containing two `TwoTensor` inputs, + # Then `SubclassMeta.fw_metadata.input_infos` would have length 4 here. + fw_metadata: ViewAndMutationMeta + + # Note: [Computing Subclass Metadata about grad_inputs] + # Given a list of flattened, plain tensor grad_inputs, this tells us how to reconstruct the grad_input subclasses + # + # You might think: why not just assume that all grad_inputs will have the same subclass-ness as the original inputs? + # (AOTAutograd generally assumes other properties, e.g. that grad_outputs are contiguous) + # + # This doesn't really work though. take this example: + # + # def f(DoubleTensor, DenseTensor): + # return DoubleTensor * DenseTensor + # + # In the above example, the .grad field of *both* DoubleTensor and DenseTensor will be a DoubleTensor. + # When we trace out a joint fw-bw graph, we'll end up returning two subclasses for the two grad_inputs. + # This means that our backward graph will return 4 outputs (two dense tensors for each DoubleTensor grad_input) + # and we need to properly store the metadata that tells us how to turn these 4 outputs back into DoubleTensors. + # + # Note that this info **cannot** easily be figured out from ViewAndMutationMeta. + # We can only compute this info by tracing the entire joint and examining the grad_inputs that we computed. 
+ # + # See Note: [AOTAutograd Backward Guards] + # This will also eventually require us to install backward guards, + # in case we made incorrect assumptions about the subclass-ness of our grad_outputs + # + # Optional field because we don't compute for inference graphs + grad_input_metas: Optional[List[Union[int, SubclassCreationMeta]]] + + def __init__(self): + # The fields in this class get set after its construction. + pass + + +# This class exists because: +# - the autograd.Function.forward() in aot autograd returns outputs that might alias inputs +# - we only care about the metadata on those aliases, so we can regenerate them. +# We do not want them to participate in the autograd.Function. +# We do that by wrapping them in an opaque class, so the autograd.Function +# does not know to treat them as tensors. +@dataclass(frozen=True) +class TensorAlias: + alias: torch.Tensor + + +@dataclass +class BackwardSignature: + """ + Provides information about the backward section of an exported + joint forward-backward graph. + For a particular fx GraphModule, this class contains information on: + (1) A mapping from each gradient (backwards output) to the parameter + it corresponds to (forward input) + (2) A mapping from each gradient (backwards output) to the user input + it corresponds to (forward input) + (3) Which of the forward outputs corresponds to the loss, that we backprop on. + + Each string name is the `node.name` of the corresponding node in the fx graph. + """ + + gradients_to_parameters: Dict[str, str] + gradients_to_user_inputs: Dict[str, str] + loss_output: str + + +GraphOutputName = NewType("GraphOutputName", str) +GraphInputName = NewType("GraphInputName", str) +FQN = NewType("FQN", str) + + +@dataclass +class GraphSignature: + """ + Provides information about an exported module. + For a particular fx GraphModule, this class contains information on: + (1) Which graph inputs are parameters, buffers, or user inputs + (2) (for params/buffers) a mapping from the name of each graph argument + to its parameter/buffer FQN in the original nn.Module. + (3) If there are input mutations, these are represented as extra outputs + in the fx GraphModule. We provide a mapping from these + extra output names to the names of the actual inputs. + (4) The pytree metadata on how to flatten/unflatten inputs and outputs. + The corresponding FX GraphModule only accepts and returns + pytree-flattened inputs/outputs. + (5) (Optionally) if the FX is a joint forward-backward graph, we provide + a signature on the backward section of the joint graph. + """ + + parameters: List[FQN] + buffers: List[FQN] + + user_inputs: List[GraphInputName] + user_outputs: List[GraphOutputName] + inputs_to_parameters: Dict[GraphInputName, FQN] + inputs_to_buffers: Dict[GraphInputName, FQN] + + # If the user's module mutates a buffer, + # it's represented in the graph as an extra graph output. + # This dict is a mapping from + # "graph outputs that correspond to updated buffers" + # to the FQN names of those mutated buffers. 
+ buffers_to_mutate: Dict[GraphOutputName, FQN] + user_inputs_to_mutate: Dict[GraphOutputName, GraphInputName] + + in_spec: pytree.TreeSpec + out_spec: pytree.TreeSpec + + backward_signature: Optional[BackwardSignature] + + input_tokens: List[GraphInputName] + output_tokens: List[GraphOutputName] + + @classmethod + def from_tracing_metadata( + cls, + *, + in_spec: pytree.TreeSpec, + out_spec: pytree.TreeSpec, + graph_input_names: List[str], + graph_output_names: List[str], + view_mutation_metadata: ViewAndMutationMeta, + named_parameters: List[str], + named_buffers: List[str], + num_user_inputs: int, + num_user_outputs: int, + loss_index: Optional[int], + backward_signature: Optional[BackwardSignature], + ) -> "GraphSignature": + graph_inputs = graph_input_names + graph_outputs = graph_output_names + parameters = list(named_parameters) + buffers = list(named_buffers) + num_tokens = len(view_mutation_metadata.tokens) + + # Calling convention assumptions: + # (1) graph inputs = (input_tokens, params, buffers, user_inputs) + # (2) graph outputs = (output_tokens, mutated_inputs, user_outs, param_gradients) + # (If we are capturing an inference graph, this convention is identical + # except that param_gradients is empty) + # See Note [Side-Effectful Tokens in AOTAutograd] for information on tokens + + # Address input calling conventions: + start, stop = 0, num_tokens + input_tokens = graph_inputs[start:stop] + + start, stop = stop, stop + len(parameters) + inputs_to_parameters = dict(zip(graph_inputs[start:stop], parameters)) + + start, stop = stop, stop + len(buffers) + inputs_to_buffers = dict( + zip( + graph_inputs[start:stop], + buffers, + ) + ) + + start, stop = stop, stop + num_user_inputs + user_inputs = graph_inputs[start:stop] + + # We should've gone through all the inputs now + assert len(graph_inputs) - stop == 0 + + # Address output calling conventions: + start, stop = 0, num_tokens + output_tokens = graph_outputs[start:stop] + + names = [*input_tokens, *parameters, *buffers, *user_inputs] + mutations = [] + for idx, input_info in enumerate(view_mutation_metadata.input_info): + if input_info.mutates_data: + # Only buffers can be mutated, not parameters + assert idx >= len(parameters) + mutations.append(names[idx + num_tokens]) + + assert len(mutations) == view_mutation_metadata.num_mutated_inp_runtime_indices + + start, stop = ( + stop, + stop + view_mutation_metadata.num_mutated_inp_runtime_indices, + ) + outputs_to_mutations = dict(zip(graph_outputs[start:stop], mutations)) + + user_inputs_to_mutate = {} + buffers_to_mutate = {} + for output_name, mutation_name in outputs_to_mutations.items(): + if mutation_name in user_inputs: + user_inputs_to_mutate[output_name] = mutation_name + else: + assert mutation_name in buffers + buffers_to_mutate[output_name] = mutation_name + + start, stop = stop, stop + num_user_outputs + user_outputs = graph_outputs[start:stop] + + unused_outputs = len(graph_outputs) - stop + if backward_signature is not None: + unused_outputs -= len(backward_signature.gradients_to_parameters) + len( + backward_signature.gradients_to_user_inputs + ) + assert unused_outputs == 0 + + return GraphSignature( + parameters=parameters, # type: ignore[arg-type] + buffers=buffers, # type: ignore[arg-type] + user_inputs=user_inputs, # type: ignore[arg-type] + user_outputs=user_outputs, # type: ignore[arg-type] + inputs_to_buffers=inputs_to_buffers, # type: ignore[arg-type] + inputs_to_parameters=inputs_to_parameters, # type: ignore[arg-type] + 
user_inputs_to_mutate=user_inputs_to_mutate, + buffers_to_mutate=buffers_to_mutate, # type: ignore[arg-type] + in_spec=in_spec, + out_spec=out_spec, + backward_signature=backward_signature, + input_tokens=input_tokens, # type: ignore[arg-type] + output_tokens=output_tokens, # type: ignore[arg-type] + ) + + +@dataclass +class AOTConfig: + """ + Configuration for AOTDispatcher + """ + + fw_compiler: Callable + bw_compiler: Callable + partition_fn: Callable + decompositions: Dict[Callable, Callable] + num_params_buffers: int + aot_id: int + keep_inference_input_mutations: bool + is_export: bool = False + no_tangents: bool = False + dynamic_shapes: bool = False + aot_autograd_arg_pos_to_source: Optional[List[Source]] = None + inference_compiler: Optional[Callable] = None + enable_log: bool = True + # this is always false outside of export. + pre_dispatch: bool = False + + def __post_init__(self): + if self.pre_dispatch: + assert self.is_export, "Can only have pre_dispatch IR for export." + + +SubclassTracingInfo = collections.namedtuple( + "SubclassTracingInfo", + ["plain_tensor_trace_fn", "plain_tensor_args", "maybe_subclass_meta"], +) diff --git a/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/subclass_utils.py b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/subclass_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..563223c3dbbdce4bd1e476cb4bd6db9501a54004 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/subclass_utils.py @@ -0,0 +1,295 @@ +""" +This file contains utilities for tracing through __torch_dispatch__ based tensor subclasses and modes. +AOTAutograd's responsibility is to trace through all pytorch capabilities that live in the pytorch dispatcher, +and this includes tensor subclasses that implement __torch_dispatch__. +""" + +from typing import Any, List, Optional, Tuple, Union + +import torch.utils._pytree as pytree + +from torch import Tensor +from torch.utils._python_dispatch import is_traceable_wrapper_subclass + +from .schemas import MutationType, SubclassCreationMeta, ViewAndMutationMeta +from .utils import strict_zip + +zip = strict_zip + + +def requires_subclass_dispatch(args, fw_metadata: ViewAndMutationMeta) -> bool: + args_flattened = pytree.arg_tree_leaves(*args) + any_subclass_args = any( + is_traceable_wrapper_subclass(x) + for x in args_flattened + if isinstance(x, Tensor) + ) + from torch._functorch._aot_autograd.schemas import SubclassCreationMeta + + any_subclass_outputs = any( + type(x) is SubclassCreationMeta for x in fw_metadata.subclass_fw_graph_out_meta + ) + # This tells us whether or not we need to perform any unwrapping/wrapping of tensor subclasses at runtime. 
+ return any_subclass_args or any_subclass_outputs + + +# Given a flat list of arguments, some of which may be tensor subclasses, +# computes metadata about "how to reconstruct the current list of subclasses, +# if we were given their flattened dense tensors instead" +def create_subclass_meta( + curr_args: Union[List[Any], Tuple[Any, ...]], +) -> List[Union[int, SubclassCreationMeta]]: + idx = 0 + infos: List[Union[int, SubclassCreationMeta]] = [] + for a in curr_args: + if isinstance(a, Tensor) and is_traceable_wrapper_subclass(a): + attrs, meta = a.__tensor_flatten__() # type: ignore[attr-defined] + start_idx = idx + cnt = len(attrs) + curr_cnt = cnt + infos.append( + SubclassCreationMeta( + flat_tensor_start_idx=start_idx, + arg_count=curr_cnt, + original_subclass=a, + meta=meta, + inner_keys=attrs, + outer_size=a.shape, + outer_stride=a.stride(), + ) + ) + else: + infos.append(idx) + cnt = 1 + idx += cnt + return infos + + +# Output structure: +# - List[Tensor] if tracing an inference graph +# - Tuple[List[Tensor], List[Tensor]] if tracing a joint graph. +# This function effectively concats each inner list of subclass tensors +# into a (potentially longer) list of inner tensors. +# +# This function takes in a pytree of arguments and unwraps any tensor subclasses. +# Annoyingly, we can't use pytrees to perform the unwrapping, because unwrapping returns +# a list of tensors that we would then need to concat together. +# Instead, we specialize the logic for the inference vs. joint graph case. +# NOTE: this function is hot, since we unwrap tensor subclass inputs at runtime +def unwrap_tensor_subclasses(wrapped_args, *, is_joint_structure: bool): + def concat_inner_tensors_from_subclasses(xs): + xs_inner = [] + for x in xs: + if isinstance(x, Tensor) and is_traceable_wrapper_subclass(x): + attrs, _ = x.__tensor_flatten__() # type: ignore[attr-defined] + xs_inner += [getattr(x, attr) for attr in attrs] + else: + xs_inner += [x] + return xs_inner + + if is_joint_structure: + assert isinstance(wrapped_args, tuple) and len(wrapped_args) == 2 + assert isinstance(wrapped_args[0], (tuple, list)) and isinstance( + wrapped_args[1], (tuple, list) + ) + unwrapped_args_fw = concat_inner_tensors_from_subclasses(wrapped_args[0]) + unwrapped_args_tangents = concat_inner_tensors_from_subclasses(wrapped_args[1]) + unwrapped_args = (unwrapped_args_fw, unwrapped_args_tangents) + else: + assert isinstance(wrapped_args, (list, tuple)) + unwrapped_args_fw = concat_inner_tensors_from_subclasses(wrapped_args) + unwrapped_args = unwrapped_args_fw + return unwrapped_args + + +# Turns a flattened list of tensor arguments into (maybe) subclass tensors. +# This function is used both at trace time and runtime, so we have an is_runtime flag telling us which context we're in. 
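+# A rough sketch (using a hypothetical `TwoTensor` subclass that flattens into two inner
+# tensors, like the one mentioned in schemas.py): if the original args were
+#   [TwoTensor(a, b), plain]
+# then unwrap_tensor_subclasses produces [a, b, plain], create_subclass_meta produces
+#   [SubclassCreationMeta(flat_tensor_start_idx=0, arg_count=2, ...), 2]
+# and wrap_tensor_subclasses([a, b, plain], subclass_metas=...) rebuilds
+# [TwoTensor(a, b), plain] by invoking each SubclassCreationMeta's creation_fn.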
+def wrap_tensor_subclasses( + unwrapped_args: Union[Tuple[Any, ...], List[Any]], + *, + subclass_metas: List[Union[int, SubclassCreationMeta]], + num_fw_outs_saved_for_bw: Optional[int] = None, + is_runtime: bool = False, +) -> Tuple[Any, ...]: + wrapped_args = [] + num_args_tallied = 0 + for subclass_meta in subclass_metas: + if isinstance(subclass_meta, int): + wrapped_args.append(unwrapped_args[subclass_meta]) + num_args_tallied += 1 + else: + assert isinstance(subclass_meta, SubclassCreationMeta) + wrapped_args.append( + subclass_meta.creation_fn(unwrapped_args, is_runtime=is_runtime) + ) + num_args_tallied += subclass_meta.arg_count + + # Note: [Partitioner handling for Subclasses, Part 2] + # At the beginning of AOTAutograd, we collect metadata on the inputs and outputs of the user fw, + # to figure out which inputs/outputs are subclasses, and how to reconstruct the subclasses after flattening them. + # + # When this function is called at runtime in the forward, + # we have been passed a list of (flattened) dense-tensor fw-outs, and need to reconstruct any subclass fw outs. + # + # One reasonable question that you should ask: when should the dense_tensor -> subclass_tensor wrapping happen? + # Answer: we do it **inside of our compiled autograd.Function**. + # This seems like morally the right place: autograd happens above subclass desugaring, + # so autograd should see actual tensor subclasses at runtime, and not flattened dense tensors. + # + # This causes a tricky interaction though: when we run the min-cut partitioner to divvy up the joint graph + # into a forward and backward graph, we end up with some activations that show up as extra outputs + # in the compiled forward graph, that are **not** user outputs. + # These activations are not visible to the user, and so there's no need for us to wrap them back into subclasses. + # + # On top of that, when we first computed subclass metadata (in `run_functionalized_fw_and_collect_metadata`), + # we computed subclass metadata on every forward output, but this did **not** include activations + # created by the partitioner. + # as a result, `unwrapped_args` here will correspond to (*unwrapped_user_fw_outs, *activations), + # but `subclass_metas` will only correspond to subclass metatadata on `user_fw_outs`. + # We then need to make sure that we return (*wrapped_user_fw_outs, *activations). + if num_fw_outs_saved_for_bw is not None: + assert len(unwrapped_args) == num_args_tallied + num_fw_outs_saved_for_bw, ( + f"Expected the number actual unwrapped-subclass outputs {len(unwrapped_args)} to equal " + f"the number of args calculated from subclasses ({num_args_tallied}) plus the number of " + f"additional activations saved for the backward pass ({num_fw_outs_saved_for_bw})" + ) + activations = unwrapped_args[num_args_tallied:] + if isinstance(wrapped_args, tuple) and isinstance(activations, tuple): + return wrapped_args + activations + return tuple(list(wrapped_args) + list(activations)) + else: + assert len(unwrapped_args) == num_args_tallied + return tuple(wrapped_args) + + +# Given a bunch of "dense" tensor arguments, this function (potentially) wraps them into tensor subclasses. +# This function carefully handles the inference vs. 
joint cases: +# - when is_joint_structure is True, args is (primals, tangents) +# - when is_joint_structure is False, args is [*primals] +def wrap_tensor_subclasses_maybe_joint( + unwrapped_args, *, is_joint_structure: bool, meta: ViewAndMutationMeta +) -> Union[Tuple[Any, ...], List[Any]]: + # Since this function is re-used for both inference and joint graphs, + if is_joint_structure: + assert isinstance(unwrapped_args, tuple) and len(unwrapped_args) == 2 + assert isinstance(unwrapped_args[0], (tuple, list)) and isinstance( + unwrapped_args[1], (tuple, list) + ) + primals, tangents = unwrapped_args[0], unwrapped_args[1] + wrapped_primals = wrap_tensor_subclasses( + primals, subclass_metas=meta.subclass_inp_meta + ) + wrapped_tangents = wrap_tensor_subclasses( + tangents, subclass_metas=meta.subclass_tangent_meta + ) + return (wrapped_primals, wrapped_tangents) + else: + wrapped_args = wrap_tensor_subclasses( + unwrapped_args, subclass_metas=meta.subclass_inp_meta + ) + return wrapped_args + + +# TODO: UNUSED. delete? +def create_metadata_for_subclass(meta: ViewAndMutationMeta) -> ViewAndMutationMeta: + # input infos + input_info = [] + for inp, subclass_meta in zip(meta.input_info, meta.subclass_inp_meta): + num_inps = 1 if isinstance(subclass_meta, int) else subclass_meta.arg_count + for _ in range(num_inps): + input_info.append(inp) + + # output infos + output_info = [] + subclass_out_meta_user_outs_only = meta.subclass_fw_graph_out_meta[ + meta.num_mutated_inp_runtime_indices : + ] + if meta.num_intermediate_bases > 0: + subclass_out_meta_user_outs_only = subclass_out_meta_user_outs_only[ + : -meta.num_intermediate_bases + ] + # sanity assert + assert len(meta.output_info) == len(subclass_out_meta_user_outs_only) + # Assume that the information on the output is shared by all of its inner tensors. + for out, subclass_meta in zip(meta.output_info, subclass_out_meta_user_outs_only): + num_outs = 1 if isinstance(subclass_meta, int) else subclass_meta.arg_count + for _ in range(num_outs): + output_info.append(out) + + # A bit hacky, but we don't actually care about all of the metadata here. 
+ # This metadata is used **underneath** both autograd and subclass de-sugaring, + # So all we really care about is stuff like: + # - num inputs/outputs (needed by the partitioner) + # - input mutations (**not** used today, since we don't handle input mutations inside the subclass, + # although we should handle this eventually) + # TODO: add a test case to assert we error when this happens, instead of getting silent correctness + num_intermediate_bases = None + keep_input_mutations = meta.keep_input_mutations + traced_tangents = None + subclass_inp_meta = None + subclass_fw_graph_out_meta = None + subclass_tangent_meta = None + + metadata = ViewAndMutationMeta( + input_info=input_info, # type: ignore[arg-type] + output_info=output_info, # type: ignore[arg-type] + num_intermediate_bases=num_intermediate_bases, # type: ignore[arg-type] + keep_input_mutations=keep_input_mutations, # type: ignore[arg-type] + traced_tangents=traced_tangents, # type: ignore[arg-type] + subclass_inp_meta=subclass_inp_meta, # type: ignore[arg-type] + subclass_fw_graph_out_meta=subclass_fw_graph_out_meta, # type: ignore[arg-type] + subclass_tangent_meta=subclass_tangent_meta, # type: ignore[arg-type] + ) + return metadata + + +def compute_inner_mutated_inp_indices_from_subclass_meta( + fw_metadata: ViewAndMutationMeta, + inner_metadata: ViewAndMutationMeta, +) -> List[int]: + # Note: [Recomputing subclass mutation handling] + # + # Generally, if a subclass requires grad, its components will not require grad. + # But for the purposes of tracking returned tensors, we should treat those component + # tensors as if they require grad. + # + # For example, if the subclass tensor requires grad and will be mutated in a way that + # requires us to handle the mutation outside of the graph, we need to return it + # from the forward graph. The inner_meta data won't consider the component tensors + # as if they need to be returned, because they don't require grad; but really, we + # should handle those tensors the same way we handle the subclass tensor itself; i.e. + # if we'd include the subclass tensor as part of the outputs, then we should also + # include the component tensors. + # + # To do this, we patch num_mutated_inp_runtime_indices below by expanding the inputs + # from the outer subclass tensors and propagating + + updated_input_info = [] + inner_idx = 0 + if not fw_metadata.subclass_inp_meta: + # Sometimes we don't have subclass info, e.g. 
synthetic_base codepaths + return inner_metadata.mutated_inp_runtime_indices + assert len(fw_metadata.subclass_inp_meta) == len(fw_metadata.input_info) + for outer_idx, inp_meta in enumerate(fw_metadata.subclass_inp_meta): + if isinstance(inp_meta, int): + assert outer_idx < len(fw_metadata.input_info) + if inner_metadata is not None: + assert inner_idx < len(inner_metadata.input_info) + assert ( + inner_metadata.input_info[inner_idx] + == fw_metadata.input_info[outer_idx] + ) + updated_input_info.append(fw_metadata.input_info[outer_idx]) + inner_idx += 1 + else: + for _ in range(inp_meta.arg_count): + updated_input_info.append(fw_metadata.input_info[outer_idx]) + inner_idx += 1 + if inner_metadata is not None: + assert len(inner_metadata.input_info) == len(updated_input_info) + + return [ + i + for i, inp in enumerate(updated_input_info) + if inp.mutation_type == MutationType.MUTATED_OUT_GRAPH + ] diff --git a/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/traced_function_transforms.py b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/traced_function_transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..0d0834c398563ece8bc7ef46aa9dd16ddd1ba638 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/traced_function_transforms.py @@ -0,0 +1,698 @@ +""" +This module is responsible for transforming functions to be traced into a form +that is easier for the downstream infra (e.g. Autograd, FX, AOTAutograd analysis) +to handle. + +It does so by: +1. functionalization (including RNG functionalzation) +2. creating a joint graph when required +3. transforming mutations into extra outputs +4. dispatching subclasses +""" + +import warnings +from contextlib import nullcontext +from functools import wraps +from typing import Any, Callable, List, Tuple, Union +from unittest.mock import patch + +import torch +import torch.fx.traceback as fx_traceback +import torch.utils._pytree as pytree +from torch import Tensor +from torch._decomp.decompositions_for_rng import PhiloxStateTracker +from torch._guards import detect_fake_mode +from torch._prims_common import CUDARngStateHelper +from torch.fx.experimental.symbolic_shapes import definitely_false, sym_eq +from torch.nn.utils import stateless + +from .. import config +from .collect_metadata_analysis import run_functionalized_fw_and_collect_metadata +from .functional_utils import ( + from_fun, + has_data_mutation, + has_metadata_mutation, + is_fun, + sync_functional_tensor, + to_fun, +) +from .logging_utils import setup_stacktrace_preservation_hooks +from .schemas import ( + AOTConfig, + MutationType, + OutputType, + SubclassMeta, + SubclassTracingInfo, + ViewAndMutationMeta, +) +from .subclass_utils import ( + create_subclass_meta, + requires_subclass_dispatch, + unwrap_tensor_subclasses, + wrap_tensor_subclasses_maybe_joint, +) +from .utils import maybe_to_fresh_input + + +# This function returns a new function that returns mutated inputs as outputs. +# if keep_data_input_mutations is set, then we assume that data-only mutations +# will be left in the graph, and we only return metadata-mutated inputs as outputs. +def fn_input_mutations_to_outputs( + fn: Callable, + meta: ViewAndMutationMeta, + keep_data_input_mutations: bool, +) -> Any: + @wraps(fn) + def inner_fn(*args): + outs = fn(*args) + assert len(meta.output_info) == len(outs) + # The compiled fw will return mutated input tensors, *including* metadata-only mutation. 
+ # However, if keep_data_input_mutations is set, the compiled fw only needs to return metadata-mutated inputs. + # (because data-only input mutations are handled directly in the compiled graph) + mutated_inputs_to_return = [ + x for (i, x) in enumerate(args) if i in meta.mutated_inp_runtime_indices + ] + return *mutated_inputs_to_return, *outs + + return inner_fn + + +# This function takes in a fn with external aliasing and mutation, +# and returns a new fn with no external aliasing and mutation, +# as needed for autograd. +# The main transformations are: +# - Return mutated inputs as extra outputs +# - Clone mutated inputs that require gradients, +# because autograd will require us to pass the pre-mutated inputs into autograd.grad +# - Return intermediate bases of outputs as additional outputs, +# needed to appease autograd.Function +# The new function returns: +# (1) The updated outputs +# (2) A boolean mask of len(new_fn_outputs), +# that can be used to tell autograd.grad which outputs should get tangents +# if we trace the backward. +def fn_prepped_for_autograd( + fn: Callable, + meta: ViewAndMutationMeta, +) -> Any: + @wraps(fn) + def inner_fn(*args): + args_maybe_cloned = [ + maybe_to_fresh_input(i, t, meta) for i, t in enumerate(args) + ] + + outs = fn(*args_maybe_cloned) + assert isinstance(outs, (tuple, list)) + outs = list(outs) + assert len(meta.output_info) == len(outs) + + mutated_inputs_to_return = [ + x + for (i, x) in enumerate(args_maybe_cloned) + if i in meta.mutated_inp_runtime_indices + ] + + intermediate_bases = [] + for i, (o, info) in enumerate(zip(outs, meta.output_info)): + if info.output_type == OutputType.alias_of_intermediate_save_as_output: + intermediate_bases.append(o._base) + + assert meta.num_intermediate_bases == len(intermediate_bases) + + # the compiled forward should return (mutated_inputs, user_outs, intermediate_bases) + fw_outs_to_return = *mutated_inputs_to_return, *outs, *intermediate_bases + + # Also return a boolean mask specifying which outputs to this function will be used as tangents + mutated_inputs_grad_mask = [ + meta.input_info[meta.mutated_inp_runtime_indices[i]].mutates_data + and meta.input_info[meta.mutated_inp_runtime_indices[i]].requires_grad + for (i, x) in enumerate(mutated_inputs_to_return) + ] + + # Pass any (non-aliased) outputs in as tangents, since they'll be returned as outputs in the fw + # For outputs that are aliases of intermediates, we will have returned the output's _base as an output in the graph instead, + # which we *should* send to grad() + output_grad_mask = [ + meta.output_info[i].output_type + in [ + OutputType.non_alias, + OutputType.unsafe_view_alias, + OutputType.custom_function_view, + ] + # Also, only tensor outputs should participate in the backward + # (in particular, Symint outputs in the forward graph shouldn't get tangents) + and issubclass(meta.output_info[i].raw_type, Tensor) + and meta.output_info[i].requires_grad + for (i, x) in enumerate(outs) + ] + + intermediate_base_grad_mask = [True for _ in range(len(intermediate_bases))] + + out_grad_mask = ( + mutated_inputs_grad_mask + output_grad_mask + intermediate_base_grad_mask + ) + assert len(out_grad_mask) == len(fw_outs_to_return) + + # Take care to grab and sync the updated inputs from primals_after_cloning (the inputs we actually mutate!) 
+ # and not primals (the preserved inputs, pre-mutation, that we pass to grad()) + # This is annoying: our joint function needs to be aware of functionalization + # (syncing mutated inputs before calling autograd.grad()) + # In theory, we could make the autograd engine do this automatically, although that probably isn't any cleaner. + for arg in args_maybe_cloned: + if not isinstance(arg, Tensor): + continue + sync_functional_tensor(arg) + + return fw_outs_to_return, out_grad_mask + + return inner_fn + + +# Given a fn, computes the joint. +# NOTE: fn is expects the following behavior: +# (1) fn() needs to return a tuple of (outs, mask), +# where `mask` tells us which outputs are meant to have tangents. +# we don't know this info automatically, because we don't actually want to blindly +# compute tangents for every output that requires grad. +# Specifically, outputs that alias inputs won't participate in the backward and get tangents. +# (2) fn() cannot mutate any inputs that require gradient. +# otherwise, when we compute autograd.grad(), we will not take those input mutations into account +# (the way this is handled is that we ensure any inputs that normally get mutated are cloned first) +def create_joint(fn: Callable, *, aot_config: AOTConfig) -> Any: + def inner_fn(primals: List[Any], tangents: List[Any]): + outs, tangent_mask = fn(*primals) + assert len(tangent_mask) == len(outs) + outs_to_grad = [ + o for needs_tangent, o in zip(tangent_mask, outs) if needs_tangent + ] + assert len(outs_to_grad) == len(tangents) + + # Get the inputs that need gradients + grad_primals = [] + inputs_needs_grads = [] + # Note that we're not using primals here, + # being carefully not to pass any mutated inputs into autograd.grad() + for p in primals: + is_grad_tensor = isinstance(p, Tensor) and p.requires_grad + inputs_needs_grads.append(is_grad_tensor) + if is_grad_tensor: + grad_primals.append(p) + + # Get the outputs that need gradients + needed_outs = [] + needed_tangents = [] + for out, tangent in zip(outs_to_grad, tangents): + if isinstance(out, Tensor) and out.requires_grad: + # A bit sketchy, but fixes e.g. test_aot_autograd_exhaustive_matmul_cpu_float32 + # The issue is that we are sensitive to decomps that don't accurately maintain + # their output's _base.shape compared to eager mode, and this helps mitigate a bit. + # The not definitely_false is also sketchy; if unbacked + # symints are involved, we're just going to assume that the + # decomps setup the base shape correctly + needed_outs.append( + out + if not definitely_false(sym_eq(out.shape, tangent.shape)) + else out.view(tangent.shape) + ) + needed_tangents.append(tangent) + + setup_stacktrace_preservation_hooks([out.grad_fn for out in needed_outs]) + + if config.functionalize_rng_ops: + PhiloxStateTracker.mark_beginning_of_backward() + backward_out: Tuple[Tensor, ...] = tuple() + # Call the backwards pass + if grad_primals: + with fx_traceback.preserve_node_meta(): + # for full graph export, we always export a joint graph where we assume no tangents are needed. 
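+ # (With no_tangents, the exported joint is assumed to end in a single scalar output;
+ # the assert below checks for exactly one one-element tangent, and autograd.grad()
+ # is then called without explicit grad_outputs.)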
+ if aot_config.no_tangents: + assert len(needed_tangents) == 1 and needed_tangents[0].numel() == 1 + backward_out = torch.autograd.grad( + needed_outs, + grad_primals, + allow_unused=True, + ) + else: + backward_out = torch.autograd.grad( + needed_outs, + grad_primals, + grad_outputs=needed_tangents, + allow_unused=True, + ) + backward_out_iter = iter(backward_out) + return outs, [ + next(backward_out_iter) if i else None for i in inputs_needs_grads + ] + + def inner_fn_with_anomaly(*args): + with fx_traceback.preserve_node_meta(), warnings.catch_warnings(): + warnings.filterwarnings("ignore", "Anomaly Detection has been enabled.") + with torch.autograd.detect_anomaly(check_nan=False): + return inner_fn(*args) + + return inner_fn_with_anomaly + + +def create_functionalized_rng_ops_wrapper(func, args, trace_joint=True) -> Any: + # Functionalization of rng ops changes the calling convention of the joint graph. + # It goes from (primals, tangents) to (seed, offset, primals, tangents) + # At runtime, we pass on the current seed and offset. This is hidden from + # the user. + fake_mode = detect_fake_mode() + if fake_mode is None: + fake_mode = nullcontext() + + def override_get_rng_state(device: Union[int, str, torch.device] = "cuda"): + out = PhiloxStateTracker.get_state_as_tensor() + return out + + def override_set_rng_state(x, device: Union[int, str, torch.device] = "cuda"): + PhiloxStateTracker.set_state_from_tensor(x) + + def append_rng_offsets(args): + if trace_joint: + # args signature before: Tuple(fwd_outputs), Tuple(bwd_outputs) + # args signature after: Tuple(fwd_outputs, new_fwd_rng_offset), Tuple(bwd_offset, new_bwd_rng_offset) + return ( + (*args[0], PhiloxStateTracker.get_updated_fwd_offset()), + (*args[1], PhiloxStateTracker.get_updated_bwd_offset()), + ) + else: + # args signature before: Tuple(fwd_outputs) + # args signature after: Tuple(fwd_outputs, new_fwd_rng_offset) + return (*args, PhiloxStateTracker.get_updated_fwd_offset()) + + def traced_joint( + primals, tangents, fwd_seed, fwd_base_offset, bwd_seed, bwd_base_offset + ): + with patch("torch.cuda.get_rng_state", override_get_rng_state), patch( + "torch.cuda.set_rng_state", override_set_rng_state + ): + return append_rng_offsets(func(primals, tangents)) + + def traced_forward(*primals_fwd_seed_fwd_base_offset): + # The signature is (*primals, seed, offset) + with patch("torch.cuda.get_rng_state", override_get_rng_state), patch( + "torch.cuda.set_rng_state", override_set_rng_state + ): + return append_rng_offsets(func(*primals_fwd_seed_fwd_base_offset[:-2])) + + if trace_joint: + # Get the current seed and offset to setup tracing. + fwd_seed, fwd_base_offset = CUDARngStateHelper.get_torch_state_as_tuple( + fake_mode + ) + bwd_seed, bwd_base_offset = CUDARngStateHelper.get_torch_state_as_tuple( + fake_mode + ) + PhiloxStateTracker.record_state(fwd_seed, fwd_base_offset, "forward") + PhiloxStateTracker.record_state(bwd_seed, bwd_base_offset, "backward") + return traced_joint, ( + *args, + fwd_seed, + fwd_base_offset, + bwd_seed, + bwd_base_offset, + ) + else: + # Get the current seed and offset to setup tracing. + fwd_seed, fwd_base_offset = CUDARngStateHelper.get_torch_state_as_tuple( + fake_mode + ) + PhiloxStateTracker.record_state(fwd_seed, fwd_base_offset, "forward") + return traced_forward, (*args, fwd_seed, fwd_base_offset) + + +# This creates the final function that we want to trace using make_fx(), +# in both aot_dispatch_autograd and aot_dispatch_base. 
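+# (aot_dispatch_base traces an inference-only forward graph; aot_dispatch_autograd
+# traces the joint forward + backward graph.)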
+# Preconditions: +# - fn corresponds to the user's fw function +# - fn arguments have been flattened, duplicate arguments have been handled +# - In the returned function, the "primals" arguments *includes* synthetic bases. +# This function does the work of functionalizing the input function, +# and performing copy_() calls at the end of the function if `keep_input_mutations` is set. +# The function returned has signature that is either: +# (1) "traced_fn(primals: List[Any])" if trace_joint is False +# (2) "traced_fn(primals: List[Any], tangents: List[Any])" if trace_joint is True +# Returns a new (functionalized) function, and updated arguments to call it with. +def create_functionalized_fn( + fn, + args, + *, + meta: ViewAndMutationMeta, + aot_config: AOTConfig, + trace_joint: bool, +) -> Any: + @wraps(fn) + def _functionalized_f_helper(*args): + # See Note [Disabling Functionalize TLS Above Python Functionalization] + disable_above = torch._C._ExcludeDispatchKeyGuard( + torch._C.DispatchKeySet(torch._C.DispatchKey.Functionalize) + ) + + # See Note [Side-Effectful Tokens in AOTAutograd] + if trace_joint: + assert ( + isinstance(args, tuple) + and len(args) == 2 + and isinstance(args[0], (list, tuple)) + ) + tokens = args[0][: len(meta.tokens)] + actual_args = args[0][len(meta.tokens) :] + args = (actual_args, args[1]) + else: + tokens = args[: len(meta.tokens)] + args = args[len(meta.tokens) :] + assert all(token.numel() == 0 for token in tokens) + + with disable_above: + # Wrap inputs into functional wrappers + f_args = pytree.tree_map(to_fun, args) + f_tokens = pytree.tree_map(to_fun, tokens) + + # Populate the current FunctionalTensorMode with the tokens per + # operator. See Note [FunctionalTensorMode is Stateful] + functional_tensor_mode = ( + torch.utils._python_dispatch._detect_functional_mode() + ) + assert functional_tensor_mode is not None + for i, k in enumerate(meta.tokens.keys()): + functional_tensor_mode._tokens[k] = f_tokens[i] + + # Run the joint + f_outs = fn(*f_args) + + # Return both the tokens and the outputs + # See Note [Side-Effectful Tokens in AOTAutograd] + f_outs = (*functional_tensor_mode._tokens.values(), *f_outs) + + if trace_joint: + # We support a limited amount of mutation of graph inputs during the backward pass. + # (This is used e.g. by Float8, which needs to update buffers during the backward pass) + # Here, we perform extra checks for primals that were mutated in the **backward** + # We're doing the checks here instead of doing them with the rest of the input mutation handling because: + # - We need to detect inputs that were mutated in the backward **separately** from mutations that happened + # during the forward, because the handling is different: some input mutations from the the forward + # can be only handled in a fw-only runtime epilogue, and in theory if we wanted to handle those same + # types of mutations in the backward we would need a bw-only runtime epilogue. + # - We could in theory have our analysis pass differentiate mutations in the fw from mutations in + # the bw by running our analysis first on the fw-only graph, and then on the joint graph. This would + # require an extra round of tracing though, so it's more efficient to do in-line here. + assert ( + isinstance(args, tuple) + and len(args) == 2 + and isinstance(args[0], (list, tuple)) + ) + # Only look at mutations that happened to forward inputs (e.g. 
fw buffers that were saved for bw) + primals_before = args[0] + primals_after = pytree.tree_map(from_fun, f_args[0]) + for f_inpt, before, after, inpt_info in zip( + f_args[0], primals_before, primals_after, meta.input_info + ): + # Ban metadata mutations on fw inputs during the bw + if not inpt_info.mutates_metadata: + assert not has_metadata_mutation( + f_inpt, before, check_only_storage_mutation=False + ), "Found a graph input that had its metadata mutated in the backward. This is not supported" + # Allow data mutations on fw inputs during the bw, but only if they do not require grad + # So we can guarantee that we can keep the mutations in the graph + if has_data_mutation(f_inpt) and not inpt_info.mutates_data: + assert ( + not inpt_info.requires_grad + ), "Found a graph input that requires_grad and was mutated in the backward. This is not supported" + # Otherwise, put the mutation in the graph + before.copy_(after) + # Now that we covered mutations to *forward* inputs during the backward, + # we also need to cover mutations to *backward-only* inputs during the backward (e.g. mutation to a grad_out). + # Today, we will just error in all cases of this happening unless someone needs us to support it. + tangents_before = args[1] + tangents_after = pytree.tree_map(from_fun, f_args[1]) + for f_inpt, before, after in zip( + f_args[1], tangents_before, tangents_after + ): + assert not has_metadata_mutation( + f_inpt, before, check_only_storage_mutation=False + ) and not has_data_mutation( + f_inpt + ), "Found an input to the backward that was mutated during the backward pass. This is not supported" + + if aot_config.keep_inference_input_mutations: + # Note: This is a bit annoying. There's a layering issue here, where: + # (1) functionalization needs to operate on **synthetic base** inputs, before unpacking them into the "real" inputs. + # (2) For keep_input_mutations, we support tracing a call to copy_() directly on mutated inputs. + # However, we **only** want to support this for inputs that have data-only (and no metadata) mutations, + # because inductor (and backends in generally) would prefer not to see these (e.g. as_strided_(), resize_()). + # This makes it pretty difficult for this logic to operate on synthetic bases. + # (3) In addition, there are cases where it's significantly cheaper to perform the copy on the individual + # (unpacked) input aliases, instead of the synthetic base. + # Example case where (3) could be important: + # + # def f(x, y): + # x.mul_(2) + # y.mul_(3) + # return x, y + # a = torch.ones(1'000'000) + # x, y = out(a[0:9], a[1:10]) + # + # It would be much better to add copy_() calls into the graph for the two tiny slices, instead of materializing + # a giant "updated synthetic base" and copying into a's entire storage. + # + # For now, we are pessimistically not performing the optimization from (3); + # we will materialize an "updated" synthetic base, and copy it back to the synthetic input base. + # This allows us to factor aot autograd much more nicely, since only one area of the code needs to worry + # about synthetic bases. + for i, (inpt_old, inpt_f) in enumerate( + zip(args, f_args) if not trace_joint else zip(args[0], f_args[0]) + ): + if not isinstance(inpt_f, torch.Tensor): + continue + assert is_fun(inpt_f) + inpt_new = from_fun(inpt_f) + if meta.input_info[i].mutation_type == MutationType.MUTATED_IN_GRAPH: + # We found an input that had a (data-only) mutation. 
+ # Since keep_input_mutations is set, we need to faithfully apply a copy_() + # so the compiler will see the input mutation in the graph. + if meta.input_info[i].mutations_hidden_from_autograd: + # Hidden from autograd = run under no_grad, **and** don't bump VC + with torch.no_grad(), torch.autograd._unsafe_preserve_version_counter( + inpt_old + ): + inpt_old.copy_(inpt_new) + elif meta.input_info[i].mutations_under_no_grad_or_inference_mode: + # Under no_grad = run under no_grad (we still bump the VC though) + # (inference_mode will also bump the VC, as long as the tensor in question + # was created outside of inference_mode) + with torch.no_grad(): + inpt_old.copy_(inpt_new) + else: + inpt_old.copy_(inpt_new) + + # When an output tensor is a functionalized mutated input, and we + # were able to move the mutation in to the graph then we can return + # the mutated input directly. This prevents duplicating the + # tensors contents. + flat_outs, outs_spec = pytree.tree_flatten(f_outs) + flat_outs = [from_fun(o) for o in flat_outs] + num_outs = len(meta.output_info) + + for i, outp in enumerate(flat_outs[:num_outs]): + info = meta.output_info[i] + if info.output_type != OutputType.is_input: + continue + + assert info.base_idx is not None + if ( + meta.input_info[info.base_idx].mutation_type + == MutationType.MUTATED_IN_GRAPH + ): + flat_outs[i] = args[info.base_idx] + return pytree.tree_unflatten(flat_outs, outs_spec) + + return pytree.tree_map(from_fun, f_outs) + + # Kinda annoying, but needed to make sure that the fx graph we trace out has "primals" + # and "tangents" as its input names (which are special-cased by the partitioner) + # TODO (tmanlaibaatar) revisit this if we ever need to turn on non-strict joint graph export + def joint_helper(primals, tangents): + return _functionalized_f_helper(primals, tangents) + + helper = joint_helper if trace_joint else _functionalized_f_helper + if config.functionalize_rng_ops: + # Setup the wrapper for functionalization of rng ops + helper, args = create_functionalized_rng_ops_wrapper(helper, args, trace_joint) + + # Additionally pass in tokens as inputs + # See Note [Side-Effectful Tokens in AOTAutograd] + additional_token_inputs = [torch.tensor([])] * len(meta.tokens) + if trace_joint: + args = ([*additional_token_inputs, *args[0]], *args[1:]) + else: + args = [*additional_token_inputs, *args] + + return helper, args + + +# Given a function operating on Subclass -> Subclass, returns an function that operates on Tensor -> Tensor +# Also returns: +# - the new set of arguments to pass into this function (now that tensor subclasses have been eliminated) +# - the updated ViewAndMutationMeta for this dense -> dense function. +# The other important arguments are: +# - flat_fn_maybe_joint: when is_joint_structure=True, this is the joint fw-bw function. +# when is_joint_structure=False, this is just the forward function. +# - fw_only: this is *always* the forward-only function. +# Why do we need this? We need to collect updated ViewAndMutationMeta on our new dense -> dense functions. +# In particular, we need this to tell the partitioner how many dense forward outputs there are. 
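+# Illustrative example (using a hypothetical "TwoTensor" subclass that wraps two dense tensors a, b):
+#   user fn (subclass -> subclass):
+#       def f(x: TwoTensor) -> TwoTensor: ...
+#   traced fn after aot_dispatch_subclass (dense -> dense):
+#       def f_dense(a: Tensor, b: Tensor) -> Tuple[Tensor, Tensor]: ...
+#   The returned SubclassTracingInfo carries the unwrapped (dense) args, the dense trace fn,
+#   and the metadata needed to re-wrap (a, b) back into a TwoTensor at runtime.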
+def aot_dispatch_subclass( + flat_fn_maybe_joint, + args: List[Any], + *, + is_joint_structure: bool, + meta: ViewAndMutationMeta, + fw_only: Callable, +) -> SubclassTracingInfo: + # Skip logic if we don't need to trace through any subclasses + req_subclass_dispatch = requires_subclass_dispatch(args, meta) + if not req_subclass_dispatch: + return SubclassTracingInfo( + plain_tensor_trace_fn=flat_fn_maybe_joint, + plain_tensor_args=args, + maybe_subclass_meta=None, + ) + + # TODO: add subclass guards (later PR). + + # What's going on here? We need to compute subclass metadata about the outputs of the joint (grad_inputs). + # Annoying: we don't know the grad input metas until we're in the middle of tracing the joint, + # so we set it later, while we're tracing the joint (see inner_fn() below). + # Another option would be to run our run_functionalized_fw_and_collect_metadata() function + # directly on the joint, but this would hurt compile time (adding yet another pass through the joint). + subclass_meta = SubclassMeta() + + def inner_fn(fn, args, *, use_trace_joint: bool): + # Step 1: wrap tensor inputs into subclasses if necessary + all_args = wrap_tensor_subclasses_maybe_joint( + args, is_joint_structure=use_trace_joint, meta=meta + ) + + # Step 2: call the inner function, with our (maybe subclass) inputs + wrapped_outs = fn(*all_args) + + if use_trace_joint: + # See Note: [Computing Subclass Metadata about grad_inputs] + # We also stash subclass info on our grad_inputs, if we're tracing the joint. + nonlocal subclass_meta + assert isinstance(wrapped_outs, tuple) and len(wrapped_outs) == 2 + # Don't need fw outs since we already have subclass metadata on them + grad_inputs = wrapped_outs[1] + subclass_meta.grad_input_metas = create_subclass_meta(grad_inputs) + + # Step 3: Unwrap any subclass outputs back into dense tensors + unwrapped_outs = unwrap_tensor_subclasses( + wrapped_outs, is_joint_structure=use_trace_joint + ) + return unwrapped_outs + + def joint_fn(primals, tangents): + return inner_fn(flat_fn_maybe_joint, (primals, tangents), use_trace_joint=True) + + def fw_fn(*primals): + return inner_fn(flat_fn_maybe_joint, primals, use_trace_joint=False) + + def metadata_fn(*primals): + return inner_fn(fw_only, primals, use_trace_joint=False) + + args_unwrapped = unwrap_tensor_subclasses( + args, is_joint_structure=is_joint_structure + ) + + if is_joint_structure: + primals_unwrapped = args_unwrapped[0] + fn_to_trace = joint_fn + else: + primals_unwrapped = args_unwrapped + fn_to_trace = fw_fn + + # Note: [Partitioner handling for Subclasses, Part 1] + # The way the partitioner works is that: + # (1) we pass is a single graph containing the joint fw/bw, + # where the # of graph outputs corresponds to # fw_outputs + # grad_inputs + # (2) The partitioner accepts an arguments, num_fwd_outputs, + # and assumes that the first "num_fwd_outputs" graph outputs correspond + # to outputs of the forward graph. + # How do tensor subclasses enter the picture? + # the num_fwd_outputs in the final graph is actually non-trivial to compute, + # because it can be influenced by input mutations and intermediate bases. + # So we compute it by inspecting the current ViewAndMutationMeta object. + # However, the original ViewAndMutationMeta that we computed was created + # on the subclass -> subclass graph, + # which can have a different number of outputs than the dense -> dense graph. 
+ # That's why we createa a fresh metadata object on the dense -> dense function here, + # and plumb it back up to the partitioner. + # See Note: [Partitioner handling for Subclasses, Part 2] for more info. + meta_updated = run_functionalized_fw_and_collect_metadata( + metadata_fn, + keep_input_mutations=meta.keep_input_mutations, + is_train=meta.is_train, + )(*primals_unwrapped) + + subclass_meta.fw_metadata = meta_updated + + return SubclassTracingInfo( + plain_tensor_trace_fn=fn_to_trace, + plain_tensor_args=args_unwrapped, + maybe_subclass_meta=subclass_meta, + ) + + +class PropagateUnbackedSymInts(torch.fx.Interpreter): + def run_node(self, n: torch.fx.Node): + import sympy + + result = super().run_node(n) + # TODO: handle Tensor returns + if "example_value" in n.meta: + if isinstance(result, torch.SymInt) and isinstance( + result.node.expr, sympy.Symbol + ): + torch._check(result == n.meta["example_value"]) + + return result + + +def create_functional_call(mod, params_spec, params_len, store_orig_mod=False): + # Redundant with dynamo, but worth having in case this gets invoked elsewhere. + # https://github.com/pytorch/pytorch/issues/103569 + + def functional_call(*args, **kwargs): + with stateless._reparametrize_module( + mod, pytree.tree_unflatten(args[:params_len], params_spec) + ): + if isinstance(mod, torch.fx.GraphModule): + with fx_traceback.preserve_node_meta(), warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", "Anomaly Detection has been enabled." + ) + with torch.autograd.detect_anomaly(check_nan=False): + out = PropagateUnbackedSymInts(mod).run( + *args[params_len:], **kwargs + ) + else: + out = mod(*args[params_len:], **kwargs) + + if not isinstance(out, (tuple, list)): + raise RuntimeError( + "Graph output must be a tuple(). This is so that we can avoid " + "pytree processing of the outputs. Please change the module to " + "have tuple outputs or use aot_module instead." + ) + return out + + # Note [Preserving the nn module stack metadata during export non-strict mode] + # This path is currently only used by the non-strict export flow, + # where we cannot rely on dynamo to preserve nn stack metadata in our captured graph. + # Instead, we stash the original user nn module here, and rely on `make_fx` to grab + # this stashed module and use it to track nn module stack metadata + if store_orig_mod and not hasattr(functional_call, "_orig_mod"): + functional_call._orig_mod = mod # type: ignore[attr-defined] + + return functional_call diff --git a/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/utils.py b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e6c7858fd8ea9df1bb6aee96c11e505127815eb --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/_aot_autograd/utils.py @@ -0,0 +1,226 @@ +""" +Contains various utils for AOTAutograd, including those for handling collections. 
+""" + +import dataclasses +import warnings +from contextlib import nullcontext +from functools import wraps +from typing import Any, Callable, List, Optional, Tuple + +import torch +import torch.utils._pytree as pytree +from torch.fx.experimental._backward_state import BackwardState +from torch.fx.experimental.proxy_tensor import py_sym_types + +KNOWN_TYPES = [ + torch.Tensor, + BackwardState, + int, + str, + float, + bool, + type(None), + *py_sym_types, +] + +original_zip = zip + + +def strict_zip(*iterables, strict=True, **kwargs): + if not strict: + return original_zip(*iterables, **kwargs) + + shortest_length = min(len(it) for it in iterables) + for iterable in iterables: + if len(iterable) != shortest_length: + raise ValueError( + "The iterables have different lengths and strict mode is enabled." + ) + + return original_zip(*iterables, **kwargs) + + +def _get_symint_hints(exprs): + """ + Get the hints of a list/tuple of int/SymInt. + """ + if isinstance(exprs, (list, tuple)): + return type(exprs)(_get_symint_hints(e) for e in exprs) + elif isinstance(exprs, torch.SymInt): + return exprs.node.shape_env.size_hint(exprs.node.expr) + else: + return exprs + + +def partial_flatten_asdict(obj: Any) -> Any: + if dataclasses.is_dataclass(obj): + return { + field.name: getattr(obj, field.name) for field in dataclasses.fields(obj) + } + elif isinstance(obj, (list, tuple)): + return obj.__class__([partial_flatten_asdict(item) for item in obj]) + elif isinstance(obj, dict): + return {k: partial_flatten_asdict(v) for k, v in obj.items()} + else: + return obj + + +def normalize_as_list(x): + if isinstance(x, tuple): + return list(x) + elif isinstance(x, list): + return x + return [x] + + +def _get_autocast_states(): + return [ + torch.is_autocast_enabled(), + torch.is_autocast_cpu_enabled(), + torch.get_autocast_gpu_dtype(), + torch.get_autocast_cpu_dtype(), + torch.is_autocast_cache_enabled(), + ] + + +def make_boxed_func(f): + def g(args): + return f(*args) + + g._boxed_call = True # type: ignore[attr-defined] + return g + + +def make_boxed_compiler(compiler): + @wraps(compiler) + def f(fx_g, inps): + out_f = compiler(fx_g, inps) + fx_g = make_boxed_func(out_f) + return fx_g + + return f + + +def call_func_at_runtime_with_args(f, args, steal_args=False, disable_amp=False): + if not steal_args: + args = list(args) + assert isinstance(args, list) + + context = torch._C._DisableAutocast if disable_amp else nullcontext + with context(): + if hasattr(f, "_boxed_call"): + out = normalize_as_list(f(args)) + else: + # TODO: Please remove soon + # https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670 + warnings.warn( + "Your compiler for AOTAutograd is returning a function that doesn't take boxed arguments. " + "Please wrap it with functorch.compile.make_boxed_func or handle the boxed arguments yourself. " + "See https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670 for rationale." + ) + out = normalize_as_list(f(*args)) + return out + + +# Inspired by autodidax (thanks!) +class PytreeThunk: + spec: Optional[pytree.TreeSpec] = None + # These are some kinda dumb microoptimizations that save about 3-4 us of overhead. + is_simple: Optional[ + bool + ] = None # if the output spec is a tuple/list, we won't bother unflattening it. 
+ is_really_simple: Optional[bool] = None # if the output spec is a LeafSpec + + def set(self, spec: pytree.TreeSpec) -> None: + assert self.spec is None or self.spec == spec + assert spec is not None + self.spec: pytree.TreeSpec = spec + if self.spec.type in {tuple, list} and all( + child.is_leaf() for child in spec.children_specs + ): + self.is_simple = True + if self.spec.is_leaf(): + self.is_really_simple = True + + def unflatten(self, x: List[Any]) -> Any: + if self.is_really_simple: + return x[0] + if self.is_simple: + return x + assert self.spec is not None + return pytree.tree_unflatten(x, self.spec) + + +# Creates a function that returns flattened inputs and outputs +# Also returns the output tree spec, which is needed to recover the "unflattened" +# output tree structure later. +def create_tree_flattened_fn(fn, args, kwargs=None) -> Tuple[Callable, PytreeThunk]: + if kwargs is None: + kwargs = {} + # Save the args_spec for flat_tensor_args to unflatten while tracing + _, tensor_args_spec = pytree.tree_flatten((args, kwargs)) + out_spec = PytreeThunk() + + def flat_fn(*flat_args): + # The input are flattened tensor args. Prepare the args in the + # order that original function expects. Add static args as well. + # They will appear as tensor constants in the traced graph. + nonlocal out_spec + args, kwargs = pytree.tree_unflatten(flat_args, tensor_args_spec) + tree_out = fn(*args, **kwargs) + flat_out, spec = pytree.tree_flatten(tree_out) + for i in flat_out: + is_known_type = False + for j in KNOWN_TYPES: + if isinstance(i, j): + is_known_type = True + break + if not is_known_type: + raise RuntimeError( + f"Found {type(i)} in output, which is not a known type. " + "If this type holds tensors, you need to register a pytree for it. " + "See https://github.com/pytorch/functorch/issues/475 for a brief " + "explanation why. If you don't need to register a pytree, please " + "leave a comment explaining your use case and we'll make this more " + "ergonomic to deal with" + ) + out_spec.set(spec) + return flat_out + + # Can't use functools.wraps here because the wrapper has different + # calling convention + if hasattr(fn, "_orig_mod"): + flat_fn._orig_mod = fn._orig_mod # type: ignore[attr-defined] + + return flat_fn, out_spec + + +# This function takes in a tensor t, and returns one of t, t.view(), or t.clone(). +# When tracing the joint forward + backward, for any inputs in the graph that are mutated, +# we need to clone them first (and similarly for metadata-only mutations, we need to view them first). +# The idea is that when we trace the backward, we need to pass in the *original* primals +# to autograd.grad(), before they were mutated. +# Note: when we have synthetic base inputs, we need to clone them *before* creating views off of them. +# This means that "idx" here represents the index of the (potentially) synthetic base. +# What we need to do is: +# (1) map the current (post-synthetic-base calling convention) input argument index +# to int index pre-synthetic-base-calling-convention. +# (2) There could be multiple, if this index corresponds to a synthetic base +# that has multiple input aliases. +# (3) If any of those corresponding inputs get metadata mutations, then we clone the base. +def maybe_to_fresh_input(idx, t, meta): + if not isinstance(t, torch.Tensor): + return t + if idx in meta.mutated_inp_runtime_indices: + # We only need to bother cloning mutated inputs that participate in autograd. 
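+ # (Data mutations need a full clone() so autograd.grad() sees the pre-mutation values;
+ # metadata-only mutations only need the shallow .view() taken further below.)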
+ mutated_inp_idx = meta.mutated_inp_runtime_indices.index(idx) + if meta.input_info[idx].requires_grad and meta.input_info[idx].mutates_data: + # Make sure the primal we pass to autograd.grad() + # sees the tensor before the mutation + return t.clone() + if meta.input_info[idx] and meta.input_info[idx].mutates_metadata: + # Make sure the primal we pass to autograd.grad() + # sees the tensor before the metadata mutation + return t.view(t.shape) + return t diff --git a/MLPY/Lib/site-packages/torch/_functorch/aot_autograd.py b/MLPY/Lib/site-packages/torch/_functorch/aot_autograd.py new file mode 100644 index 0000000000000000000000000000000000000000..ec817eb7f1d6d1524e5d470b76df5fdc2fc9774a --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/aot_autograd.py @@ -0,0 +1,1246 @@ +# mypy: ignore-errors + +import itertools +from contextlib import nullcontext +from functools import partial, wraps +from typing import Any, Callable, Dict, List, Optional, Tuple +from unittest.mock import patch + +import torch +import torch.nn as nn +import torch.utils._pytree as pytree +import torch.utils.dlpack +from torch import Tensor +from torch._dispatch.python import enable_python_dispatcher +from torch._dynamo import compiled_autograd +from torch._dynamo.utils import dynamo_timed, preserve_rng_state +from torch._guards import detect_fake_mode +from torch._subclasses import FakeTensor, FakeTensorMode +from torch.fx.experimental.proxy_tensor import make_fx +from torch.fx.experimental.symbolic_shapes import ( + ShapeEnv +) +from torch.utils._python_dispatch import is_traceable_wrapper_subclass +from torch._decomp.decompositions_for_rng import PhiloxStateTracker, rng_decompositions +from . import config +from .partitioners import default_partition + +from ._aot_autograd.utils import ( # noqa: F401 + strict_zip, + _get_symint_hints, + KNOWN_TYPES, + partial_flatten_asdict, + normalize_as_list, + _get_autocast_states, + make_boxed_func, + make_boxed_compiler, + call_func_at_runtime_with_args, + create_tree_flattened_fn, + maybe_to_fresh_input, +) +from ._aot_autograd.logging_utils import ( # noqa: F401 + graph_being_compiled, + nth_graph, + model_name, + set_model_name, + get_aot_compilation_context, + get_aot_graph_name, + get_graph_being_compiled, + track_graph_compiling, + callback_set, + setup_stacktrace_preservation_hooks, + describe_input, + format_guard_bug_msg, +) +from ._aot_autograd.functional_utils import ( # noqa: F401 + is_fun, + to_fun, + from_fun, + sync_functional_tensor, + has_metadata_mutation, + has_data_mutation, + are_all_mutations_hidden_from_autograd, + are_all_mutations_under_no_grad_or_inference_mode, + gen_alias_from_base, + assert_functional_graph, + _check_if_mutation_can_be_in_graph, +) +from ._aot_autograd.schemas import ( # noqa: F401 + OutputType, + OutputAliasInfo, + MutationType, + InputAliasInfo, + SubclassCreationMeta, + ViewAndMutationMeta, + SubclassMeta, + TensorAlias, + BackwardSignature, + GraphOutputName, + GraphInputName, + FQN, + GraphSignature, + AOTConfig, +) +from ._aot_autograd.subclass_utils import ( # noqa: F401 + requires_subclass_dispatch, + unwrap_tensor_subclasses, + wrap_tensor_subclasses, + wrap_tensor_subclasses_maybe_joint, + create_metadata_for_subclass, +) +from ._aot_autograd.collect_metadata_analysis import ( # noqa: F401 + run_functionalized_fw_and_collect_metadata, +) +from ._aot_autograd.input_output_analysis import ( # noqa: F401 + remove_dupe_metadata, + create_synthetic_base_metadata, + _tensors_definitely_do_not_overlap, + 
compute_overlapping_inputs, + create_graph_signature, +) +from ._aot_autograd.traced_function_transforms import ( # noqa: F401 + fn_input_mutations_to_outputs, + fn_prepped_for_autograd, + create_functionalized_fn, + create_functionalized_rng_ops_wrapper, + aot_dispatch_subclass, + create_functional_call, + create_joint, +) +from ._aot_autograd.runtime_wrappers import ( # noqa: F401 + create_runtime_wrapper, + functionalized_rng_runtime_epilogue, + aot_dispatch_subclass_wrapper, + aot_wrapper_dedupe, + aot_wrapper_synthetic_base, + merge_view_inputs, +) +from ._aot_autograd.dispatch_and_compile_graph import ( # noqa: F401 + aot_dispatch_base_graph, + aot_dispatch_autograd_graph, +) +from ._aot_autograd.jit_compile_runtime_wrappers import ( # noqa: F401 + aot_dispatch_base, + aot_dispatch_autograd, +) + +zip = strict_zip + +# This global counter increments every time we compile a graph with +# AOTAutograd. You can use this to correlate runtime error messages +# with compile time (e.g., if you get an error at runtime saying +# compiled graph 3 failed, you can set a breakpoint at compile time +# for this graph number to investigate further at compile time.) +# +# NB: this is different from get_aot_compilation_context, which tracks +# each underlying graph that is compiled. In contrast, AOT_COUNTER +# corresponds to top-level invocations of aot_module/aot_function; +# one counter is allocated per entire compiled block (but this block +# may involve compiling multiple subgraphs; e.g., for forwards/backwards) +AOT_COUNTER = itertools.count() + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# AOT Autograd contains a pretty non-trivial amount of logic to handle edge cases around aliasing and mutation +# that are external to the graph (they show up as side effects in some way when you run the graph). +# +# Take a look at `test_aotdispatch.py TestAOTAutograd.test_input_mutation*` tests for some examples functions +# and what they're compiled graphs looks like. +# Below is a very long comment detailing several edge cases, and showing how AOT Autograd handles them. +# +# Note [AOT Autograd: input data mutations] +# +# If we compile a function that mutates inputs, then those input mutations are real side effects +# that a user expects to see after running the compiled graph. +# However, the graph that we want to send to a backend needs to be *entirely* functional. +# The way we reconcile this difference is that we remove the mutations completely from the graph that we compile +# but we update the graph to return (updated_inputs, user_outputs). +# In the epilogue that runs after the compiled graph is executed, we copy the updated inputs back to the originals. +# +# Example: original user code: +# def f(x): +# x.mul_(2) +# out = x.mul(3) +# return out +# +# After AOT Autograd compiles, we end up with a: +# (a) compiled graph +# (b) autograd.Function.forward() method, that executes the compiled graph +# (c) wrapper function, that calls the autograd.Function.forward() and performs the epilogue +# +# The output of (a, b, c) are all written below. +# +# def compiled_forward_graph(x): +# x_updated = x.mul(2) +# out = x_updated.mul(3) +# return x_updated, out +# +# # x_updated gets a gradient in the compiled backward +# def compiled_backward_graph(grad_x_updated, grad_out): +# grad_x = ... 
+# return grad_x +# +# def autograd.Function.forward(x): +# x_updated, out = compiled_forward_graph(x) +# return x_updated, out +# +# def compiled_wrapper(x): +# x_updated, out = autograd.Function.apply(x) +# x.copy_(x_updated) +# return out +# +# Another important thing to note is that updated inputs (due to data mutations) *do* participate +# in the compiled backward graph! Since the compiled forward graph gets N extra outputs +# (due to updated inputs showing up as graph outputs), +# The compiled backward gets an additional N inputs. +# That way, during the x.copy_(x_updated) bit in the epilogue, gradients will flow from the updated input +# back to the original input. + + +# Note [AOT Autograd: input metadata mutations] +# +# For the same reason as input mutations, we also don't put input metadata mutations in the graph. +# Instead, we return the updated version of the input (a view), and mutate the input's metadata outside of the graph +# +# Example: original user code: +# def f(x): +# x.t_() +# out = x.mul(3) +# return out +# +# AOT Autograd output (compiled graph, autograd.Function.forward(), wrapper function): +# def compiled_forward_graph(x): +# x_updated = x.t() +# out = x_updated.mul(3) +# return x_updated, out +# +# # x_updated does *not* get a gradient in the compiled backward +# def compiled_backward_graph(grad_out): +# grad_x = ... +# return grad_x +# +# def autograd.Function.forward(x): +# x_updated, out = compiled_forward_graph(x) +# return x_updated, out +# +# def compiled_wrapper(x): +# x_updated, out = autograd.Function.apply(x) +# x.as_strided_(x_updated) +# return out + + +# Note [AOT Autograd: outputs aliasing inputs or intermediates!] +# +# AOT Autograd needs special handling for outputs that alias graph inputs or intermediates! +# Why? +# (1) autograd.Function.forward() has a limitation, where views that returned in the forward cannot later be mutated. +# (2) views don't need to be compiled in the graph anyway - it's cheap to generate them outside of the compiled graph, +# in an epilogue. +# For outputs that alias inputs, we do the following: +# (a) *still* return the aliased output as a graph output +# (b) In the AOT Autograd wrapper/epilogue, we don't return that aliased output. Instead, we use it to regenerate the output. +# +# For outputs that alias *intermediates*, we do the following: +# (a) Return the output in the compiled forward, **and** return it's ._base (a graph intermediates) as an output in the forward +# (b) Use (output, graph_intermediate) to regenerate the alias, and return that to the user (instead of the compiled fw output). +# You might wonder why we return the aliased output directly in the graph (and making the graph compute it), +# only to not return it and instead generate a fresh alias off of the intermediate, +# instead of (say) just storing metadata about the size/stride of the output somewhere to generate the alias. There are two reasons: +# (1) Getting the actual alias tensor allows us to use view-replay to generate the alias, instead of an as_strided() call +# (2) Inductor (and other backends) are free to change the memory format of graph outputs, if it results in better performance. +# This can result in problems if a user later tries to .view() that output expecting it to have one set of strides, +# when it has a different set of strides. +# By including the view op directly in the graph, inductor takes that into account when deciding what memory format +# the graph intermediate should be. 
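+# (Illustrative sketch: if the user returns `out = intermediate.transpose(0, 1)`, the compiled
+# forward returns both `intermediate` and `out`, and the epilogue replays the transpose on
+# whatever tensor the backend produced for `intermediate`, rather than reconstructing `out`
+# with an as_strided() call.)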
+# +# Another important thing to note is how our traced backward() graph handles aliases. +# (this applies to outputs aliasing inputs, outputs aliasing intermediates, +# *and* updated inputs returned in the compiled forward due to metadata-only mutations). +# Any outputs that alias (either inputs or intermediates) do NOT participate in the compiled backward graph +# It would be wasteful to include them in the compiled backward(), because we regenerate them eagerly +# at the end of the forward. +# +# Example: original user code: +# def f(x): +# out1 = x.t() +# intermediate = x.mul(2) +# out2 = intermediate.view(-1) +# return out1, out2 +# +# AOT Autograd output (compiled graph, autograd.Function.forward(), wrapper function): +# def compiled_forward_graph(x): +# out1 = x.t() +# intermediate = x.mul(2) +# out2 = intermediate.view(-1) +# # the compiled graph also returns the intermediate +# return out1, out2, intermediate +# +# # intermediate gets a gradient in the compiled backward. +# # both output aliases (out1 and out2) do not. +# def compiled_backward_graph(grad_intermediate): +# grad_x = ... +# return grad_x +# +# def autograd.Function.forward(x): +# out1, out2, intermediate = compiled_forward_graph(x) +# return out1, out2, intermediate +# +# def compiled_wrapper(x): +# out1, out2, intermediate = autograd.Function.apply(x) +# # regenerate out1 from the input +# out1_regenerated = out1._view_func(x) +# # regenerate out1 from the intermediate +# out2_regenerated = out2._view_func(intermediate) +# return out1_regenerated, out2_regenerated + + +# Note [AOT Autograd: mutations to inputs that alias other inputs] +# +# Another edge case that is (only partially) handled today is when an input is mutated, but itself aliases another input. +# AOT Autograd needs to **ensure** that functionalization knows that the two inputs are aliased to each other. +# That way, when the aliased input is accessed later in the graph, functionalization knows to "update" the alias +# given the mutation that occurred. +# +# This is handled by updating the calling convention: we create a "synthetic base" that becomes a new input +# in the compiled function, and we regenerate the original (aliased) inputs directly off of the base +# inside of the compiled function. +# +# This logic is fully encapsulated in aot_wrapper_synthetic_base() +# +# Example: original user code: +# def f(x, x_view): +# x.mul_(2) +# out = x * x_view +# return out +# f(x, x.view(-1)) +# +# AOT Autograd output (compiled graph, autograd.Function.forward(), wrapper function): +# def compiled_forward_graph(base) +# x = generate_x(base) +# x_view = generate_x_view(base) +# x_updated = x.mul(2) +# x_view_updated = x_updated.view(-1) +# out = x_updated * x_view_updated +# return x_updated, out +# +# # The calling convention change from (aliases) -> (base) happens +# # *outside* of the autograd.Function.forward(). +# # That means the forward() only has 1 input (base), +# # and the backward() only has 1 output (grad_base) +# def compiled_backward_graph(grad_out): +# grad_base = ... +# return grad_base +# +# def autograd.Function.forward(base): +# x_updated, out = compiled_forward_graph(base) +# return x_updated, out +# +# # The compiled wrapper is where we create synthetic bases. +# # The info on which inputs are mutated is also tracked *before* synthetic base creation. 
+# def compiled_wrapper(x, x_view): +# base = merge_view_inputs(x, x_view) +# x_updated, out = autograd.Function.apply(base) +# # x and x_view are aliased in eager mode, so this mutation to x will automatically affect x_view. +# x.copy_(x_updated) +# return out + + +# Note [AOT Autograd: Views to avoid tangents aliasing inputs] +# +# We view every forward output when creating out tangent tensors to handle the problematic +# case in which a subclass does extra aliasing between graph outputs/inputs in a way that +# is not visible above the sublass. +# +# Ordinarily, when constructing the joint function that we want to trace in AOTAutograd, +# we're guaranteed that the tangent tensors that we pass +# into the joint are distinct tensors from the primals. This is because when +# decide which forward outputs to create tangents for, we only create tangents +# for forward outputs that are not aliases of inputs (See Note +# [AOT Autograd: outputs aliasing inputs or intermediates!]). +# +# However, when wrapper tensor subclasses enter the picture, it is possible +# to have an output of the forward that is a subclass that is not an +# input / alias of an input, but one of its inner tensors is an alias! +# NestedTensor is an example: Performing an out-of-place pointwise op on a +# NestedTensor constructs a fresh NestedTensor that holds onto the input's +# offsets tensor directly. +# +# Having tangent tensors that are the same as the (primal) forward inputs, +# can cause problems during tracing as make_fx() will specialize on our +# duplicate inputs: If we passed in the same tensor for primals_1 and +# tangents_1 during tracing, make_fx() will happily sub out all usages of +# tangents_1 with primals_1 in the graph, which is not what we want. +# +# To work around this, we view every forward output when creating out tangent +# tensors so that tangents can never be the same as forward inputs even if +# forward inputs alias forward outputs. + +# Note [Side-Effectful Tokens in AOTAutograd] +# +# We allow some some side-effectful operators in +# the post-AOTAutograd (functional) graph, such as prints and torchbind operations. +# To ensure that these side-effects are compatible to future graph passes that +# assume that the graph is functional, we will thread "effect tokens" to show +# data dependence between these side-effectful operators. Practically speaking, +# effect tokens are just dummy values (torch.tensor([])). The graph would look +# like the following: +# +# def gm(self, token0, reader): +# token1, frame = with_token(ordered_effect_op, (reader,), token0) +# frame = frame * 2 +# token2, frame2 = with_token(ordered_effect_op, (reader,), token1) +# frame2 = frame2 * 2 +# return token2, frame, frame2 +# +# We will pass the token as an input to the graph, thread it through +# side-effectful operators using the `with_effects` high order operator, and then +# return the updated token as an output. +# So the signature of the graph input would look something like +# (*tokens, *params_buffers, *user_inputs), and the signature of the graph +# output would look something like (*tokens, *outputs). 
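+# (At trace time the tokens are just placeholder `torch.tensor([])` values: create_functionalized_fn
+# in traced_function_transforms.py prepends one per known effect (`additional_token_inputs`), and the
+# functionalized wrapper returns the updated tokens as the leading outputs.)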
+ +# +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +aot_autograd_decompositions = {} + +@dynamo_timed +def create_aot_dispatcher_function( + flat_fn, flat_args: List[Any], aot_config: AOTConfig +): + """ + Traces the forward and backward graphs of the attr:`flat_fn` to generate a + joint graph. The joint graph is an Fx graph with Aten ops. Please refer to + the tracing mechanism to understand the graph capturing details. + + The joint graph is then passed through attr:`partition_fn` to isolate the + forward and backward portions, which are then respectively compiled via the + provided attr:`fw_compiler` and attr:`bw_compiler`. + + The resulting compiled forward and backward graphs are then wrapped up in a + ``torch.autograd.Function`` object. + + The calling convention here is that the first aot_config.num_params_buffers + inputs in flat_args are parameters and buffers, and the rest are inputs. + + We use this to assume that parameters/buffer's shapes don't change. + + Note: this function is used both by aot_function and aot_export (controlled by aot_config.is_export) + When aot_config.is_export is True, we return an FX graph + metadata + When aot_config.is_export is False, we return an ordinary runtime function + """ + + # This is the main entry point. + # TODO: Chillee argues that dynamo itself should pass in fake tensors to + # the list of arguments when compiling; at the moment we do not do this + + if aot_config.decompositions is None: + aot_config.decompositions = {} + + + aot_config.decompositions = { + **aot_autograd_decompositions, + **aot_config.decompositions, + } + + if config.functionalize_rng_ops: + # Update the decompositions with functionalized random decompositions + aot_config.decompositions = { + **rng_decompositions, + **aot_config.decompositions, + } + + # Check flat_args to see if they're already fake. If so, use that fake + # mode instead. + + fake_mode = detect_fake_mode(flat_args) + if fake_mode is None: + shape_env = ShapeEnv() if aot_config.dynamic_shapes else None + fake_mode = FakeTensorMode(shape_env=shape_env) + else: + shape_env = fake_mode.shape_env + + python_dispatcher_mode = ( + enable_python_dispatcher() if shape_env is not None else nullcontext() + ) + + with torch.autograd.set_multithreading_enabled( + False + ), preserve_rng_state(), fake_mode, python_dispatcher_mode, PhiloxStateTracker(): + + def process_inputs(flat_args): + def convert(idx, x): + if shape_env is not None: + from torch._dynamo.source import ConstantSource + if isinstance(x, int): + # We always specialize on scalar values in export. 
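+ # (In export we return the raw int below and specialize on it; otherwise the int is
+ # wrapped in a SymInt via shape_env.create_symintnode, so the compiled graph is not
+ # specialized to this particular scalar value.)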
+ if aot_config.is_export: + return x + source = ConstantSource(f"sym_{idx}") + return shape_env.create_symintnode( + shape_env.create_symbol(x, source), + hint=x, + source=source + ) + if not isinstance(x, torch.Tensor): + return x + if isinstance(x, FakeTensor): + assert x.fake_mode is fake_mode + return x + if is_traceable_wrapper_subclass(x): + attrs, _ = x.__tensor_flatten__() + if all(isinstance(getattr(x, attr), FakeTensor) for attr in attrs): + assert all(getattr(x, attr).fake_mode is fake_mode for attr in attrs) + return x + + + # see note [Tensor Fakification and Symbol Caching] + symbolic_context = None + source = None + if tracing_context := torch._guards.TracingContext.try_get(): + if x in tracing_context.tensor_to_context: + symbolic_context = tracing_context.tensor_to_context[x] + source = symbolic_context.tensor_source + if ( + idx < aot_config.num_params_buffers + and config.static_weight_shapes + and not symbolic_context + ): + # TODO: Ensure that this codepath is never exercised from + # Dynamo + return fake_mode.from_tensor(x, static_shapes=True) + + return fake_mode.from_tensor( + x, static_shapes=False, symbolic_context=symbolic_context, source=source + ) + return [convert(idx, x) for idx, x in enumerate(flat_args)] + + fake_flat_args = process_inputs(flat_args) + + needs_autograd = ( + any(x.requires_grad for x in fake_flat_args if isinstance(x, Tensor)) + and torch.is_grad_enabled() + ) + + with enable_python_dispatcher(): + # Patch set_rng_state as set_rng_state with fake tensors is + # nonsensical. This does not affect the collection of metadata. + with patch("torch.cuda.set_rng_state", lambda *args: None): + fw_metadata = run_functionalized_fw_and_collect_metadata( + flat_fn, + keep_input_mutations=aot_config.keep_inference_input_mutations, + is_train=needs_autograd, + pre_dispatch=aot_config.pre_dispatch, + )(*fake_flat_args) + + req_subclass_dispatch = requires_subclass_dispatch(fake_flat_args, fw_metadata) + + if needs_autograd and not any(x.requires_grad for x in fw_metadata.output_info): + # We realized that none of the outputs require grad, + # so we actually have an inference graph. + needs_autograd = False + # A bit silly: right now in the subclass codepath, our ViewAndMutationMeta + # changes depending on whether we pass in is_train / keep_input_mutations, + # so we're forced to recompute the metadata. + # TODO: refactor the subclass path of run_functionalized_fw_and_collect_metadata + # so that this is unnecessary. + if req_subclass_dispatch: + fw_metadata = run_functionalized_fw_and_collect_metadata( + flat_fn, + keep_input_mutations=aot_config.keep_inference_input_mutations and not needs_autograd, + is_train=needs_autograd, + pre_dispatch=aot_config.pre_dispatch, + )(*fake_flat_args) + else: + fw_metadata = ViewAndMutationMeta( + input_info=fw_metadata.input_info, + output_info=fw_metadata.output_info, + num_intermediate_bases=fw_metadata.num_intermediate_bases, + keep_input_mutations=aot_config.keep_inference_input_mutations and not needs_autograd, + traced_tangents=fw_metadata.traced_tangents, + subclass_inp_meta=fw_metadata.subclass_inp_meta, + subclass_fw_graph_out_meta=fw_metadata.subclass_fw_graph_out_meta, + subclass_tangent_meta=fw_metadata.subclass_tangent_meta, + is_train=needs_autograd, + ) + + + if fw_metadata.num_intermediate_bases > 0: + assert not req_subclass_dispatch, f"""\ +torch.compile is currently being used with tensor subclass inputs: +{','.join([str(type(x)) for x in fake_flat_args])}. 
We are attempting to a compile a graph with two graph outputs +that alias one another, which is currently unsupported in the subclass use case. If you run into this, +please file a github issue""" + + if aot_config.is_export: + # aot_export: ban input metadata mutations for now to keep shared code paths simpler. + # Keeping .resize_() in the graph will require some work + # Allowing it but keeping the graph functional will require some calling convention changes. + if len([x for x in fw_metadata.input_info if x.mutates_metadata]) != 0: + raise RuntimeError(f"""\ +Found an input that received a metadata mutation, through e.g. a call to `.resize_()` or `.transpose_()`. +This is currently banned in the aot_export workflow. If you need this functionality, please file a github issue. + +fw_metadata={str(fw_metadata)}""") + # In export, banning data mutations on inputs that require grad for now. + # This should be rare, and is tricky to get right. When we trace the backward, + # we currently trace with autograd.grad instead of .backward(), which makes it difficult + # to ensure that we run autograd all the way through the input **before** it saw the mutation. + if len([x for x in fw_metadata.input_info if x.requires_grad and x.mutates_data]) != 0: + raise RuntimeError(f"""\ +Found a graph input that requires gradients, and received a mutation. +This is currently banned in the aot_export workflow. If you need this functionality, please file a github issue. + +fw_metadata={str(fw_metadata)}""") + if req_subclass_dispatch: + raise RuntimeError("""\ +aot_export is not currently supported with traceable tensor subclass. +If you need this feature, please comment on """) + + # Need to decide on a strategy for functionalized RNG: toggling via global config seems bad, + # and turning it on will require a non-trivial calling convention change for any export runtime. + if config.functionalize_rng_ops: + raise RuntimeError("""\ +Functionalized RNG is not currently supported in the aot_export workflow. Please file a github issue, +or otherwise set torch._functorch.config.functionalize_rng_ops = False.""") + + # crappy version of dispatcher + # TODO: Do this properly + if needs_autograd: + # For now, aot_dispatch_autograd knows to explicitly return a graph + # when run with export, and an opaque callable otherwise. + # In theory we could factor these out, but I wanted to let the dust + # settle on how functionalized rng fits into export first. + compiler_fn = aot_dispatch_autograd_graph if aot_config.is_export else aot_dispatch_autograd + else: + # aot_dispatch_base_graph contains only the "graph bits", while aot_dispatch_base + # includes some extra work around handling a runtime epilogue. 
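+ # Summary of the four dispatch cases:
+ # export + autograd -> aot_dispatch_autograd_graph (returns the joint fx graph)
+ # export + inference -> aot_dispatch_base_graph (returns the inference-only fx graph)
+ # compile + autograd -> aot_dispatch_autograd (returns a runtime callable)
+ # compile + inference -> aot_dispatch_base (returns a runtime callable)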
+ compiler_fn = aot_dispatch_base_graph if aot_config.is_export else aot_dispatch_base + + compiler_fn = partial(aot_wrapper_synthetic_base, compiler_fn=compiler_fn, needs_autograd=needs_autograd) + compiler_fn = partial(aot_wrapper_dedupe, compiler_fn=compiler_fn) + # You can put more passes here + + compiled_fn = compiler_fn(flat_fn, fake_flat_args, aot_config, fw_metadata=fw_metadata) + if aot_config.is_export: + # During export, we don't get back a callable - we get back the raw fx graph + # (either a joint or an inference-only graph) + assert isinstance(compiled_fn, torch.fx.GraphModule) + return compiled_fn, fw_metadata + + if not hasattr(compiled_fn, "_boxed_call"): + compiled_fn = make_boxed_func(compiled_fn) + + return compiled_fn + + +def aot_function( + fn: Callable, + fw_compiler: Callable, + bw_compiler: Optional[Callable] = None, + partition_fn: Callable = default_partition, + decompositions: Optional[Dict] = None, + num_params_buffers: int = 0, + keep_inference_input_mutations: bool = False, + inference_compiler: Optional[Callable] = None, + *, + # Whether or not to trace with dynamic shapes + dynamic=False, + enable_log=True, +) -> Callable: + """ + Traces the forward and backward graph of :attr:`fn` using torch dispatch + mechanism, and then compiles the generated forward and backward graphs + through :attr:`fw_compiler` and :attr:`bw_compiler`. + + :func:`aot_function` traces the forward and backward graph ahead of time, + and generates a joint forward and backward graph. :attr:`partition_fn` is + then used to separate out forward and backward graphs. The partitioner + function can be used to perform optimizations such as recomputation. One can + set `decompositions` dictionary to decompose the operators into a sequence + of core or simpler operators supported by the backend compilers. + + .. warning:: + This API is experimental and likely to change. + + Args: + fn (Callable): A Python function that takes one ore more arguments. Must + return one or more Tensors. + fw_compiler (Callable): A Python function that accepts an Fx graph with + Aten ops and input args, and returns a Callable that semantically is + equivalent to the input Fx graph. + bw_compiler (Optional[Callable]): A Python function that accepts an + Fx graph with Aten ops and input args, and returns a Callable that + semantically is equivalent to the input Fx graph. Default: None + (when None, it defaults to the :attr:`fw_compiler`) + partition_fn (Callable): A Python function that takes a joint forward + and backward graph, and partitions it into separate forward and + backward graphs. + decompositions (Dict): A dictionary to define the decomposition of + larger Aten ops into simpler or core Aten ops. + inference_compiler (Optional[Callable]): A Python function that accepts an + Fx graph with Aten ops and input args, and returns a Callable that + semantically is equivalent to the input Fx graph. inference_compiler is invoked + if no autograd is needed. Default: None + (when None, it defaults to the :attr:`fw_compiler`) + Returns: + Returns a ``Callable`` that retains the eager behavior of the original + :attr:`fn`, but with forward and backward graph compiled via + :attr:`fw_compile` and :attr:`bw_compile`. + + A simple example usage of :func:`aot_function` is as follows. 
This example + will print the forward and backward graphs of the function ``fn`` + + >>> fn = lambda x : x.sin().cos() + >>> def print_compile_fn(fx_module, args): + >>> print(fx_module) + >>> return fx_module + >>> aot_fn = aot_function(fn, print_compile_fn) + >>> x = torch.randn(4, 5, requires_grad=True) + >>> aot_fn(x) + """ + + if bw_compiler is None: + bw_compiler = fw_compiler + if inference_compiler is None: + inference_compiler = fw_compiler + aot_config = AOTConfig( + fw_compiler=fw_compiler, + bw_compiler=bw_compiler, + inference_compiler=inference_compiler, + partition_fn=partition_fn, + decompositions=decompositions, + num_params_buffers=num_params_buffers, + aot_id=next(AOT_COUNTER), + keep_inference_input_mutations=keep_inference_input_mutations, + dynamic_shapes=dynamic, + aot_autograd_arg_pos_to_source=None, + is_export=False, + no_tangents=False, + enable_log=enable_log, + ) + cached_res = None + + @wraps(fn) + def returned_function(*args, **kwargs): + nonlocal cached_res + # Now flatten the tensor args + flat_args = pytree.arg_tree_leaves(*args, **kwargs) + + # Compile the function and save it in the cache + if cached_res is None: + flat_fn, out_spec = create_tree_flattened_fn(fn, args, kwargs) + + compiled_fn = create_aot_dispatcher_function( + flat_fn, + flat_args, + aot_config, + ) + cached_res = (compiled_fn, out_spec) + + cached_fn, out_spec = cached_res + out = cached_fn(flat_args) + return out_spec.unflatten(out) + + return returned_function + + +def aot_module(mod: nn.Module, *args, **kwargs) -> nn.Module: + """ + Traces the forward and backward graph of :attr:`mod` using torch dispatch + tracing mechanism. It is wrapper function, that underneath uses + :func:`aot_function` to perform tracing and compilation. + + :func:`aot_module` lifts the parameters and buffers of ``nn.Module`` as inputs + to a new callable which is then compiled through :func:`aot_function`. + + .. warning:: + This API is experimental and likely to change. + + Args: + mod (Callable): A ``nn.Module`` module. + args : args to be passed to :func:`aot_function` + kwargs : kwargs to be passed to :func:`aot_function` + + Returns: + Returns a ``nn.Module`` that retains the eager behavior of the original + :attr:`mod`, but with forward and backward graph compiled. + + """ + # See Note: [Fake Modules and AOTAutograd] + torch._dynamo.utils.assert_no_fake_params_or_buffers(mod) + + def functional_call(named_params, named_buffers, *args, **kwargs): + params_and_buffers = {**named_params, **named_buffers} + return torch.func.functional_call(mod, params_and_buffers, args, kwargs) + + named_params = dict(mod.named_parameters(remove_duplicate=False)) + named_buffers = dict(mod.named_buffers(remove_duplicate=False)) + num_params_buffers = len(named_params) + len(named_buffers) + compiled_f = aot_function( + functional_call, *args, num_params_buffers=num_params_buffers, **kwargs + ) + + class AOTModule(nn.Module): + def __init__(self): + super().__init__() + self.orig_module = mod + + def forward(self, *args, **kwargs): + return compiled_f( + named_params, + named_buffers, + *args, + **kwargs, + ) + + return AOTModule() + + +def aot_module_simplified( + mod: nn.Module, + args, + fw_compiler: Callable, + bw_compiler: Optional[Callable] = None, + partition_fn: Callable = default_partition, + decompositions: Optional[Dict] = None, + keep_inference_input_mutations=False, + inference_compiler: Optional[Callable] = None, +) -> nn.Module: + """ + This is the simplified or low overhead version of aot_module. 
For frontends + like TorchDynamo, the input functions/modules to AOT are static and have + unpacked inputs/outputs. This gives us an opportunity to remove the + (1) pytree overhead to parse inputs/outputs, + (2) AOT Autograd cache, + (3) Reading of params/buffers in every forward call + + :func:`aot_module_simplified` removes these overheads. + """ + params = { + **dict(mod.named_parameters(remove_duplicate=False)), + **dict(mod.named_buffers(remove_duplicate=False)), + } + params_flat, params_spec = pytree.tree_flatten(params) + params_flat = list(params_flat) + params_len = len(params_flat) + + functional_call = create_functional_call(mod, params_spec, params_len) + + if bw_compiler is None: + bw_compiler = fw_compiler + if inference_compiler is None: + inference_compiler = fw_compiler + + seen_sources = set() + + full_args = [] + # First, the params + full_args.extend(params_flat) + + if tracing_context := torch._guards.TracingContext.try_get(): + tracing_context.params_flat = params_flat + + aot_autograd_arg_pos_to_source = None + # Then, the params 1:1 mapped sources, if relevant. + if hasattr(mod, "_param_name_to_source"): + aot_autograd_arg_pos_to_source = [] + # We now know this came from dynamo, and (1) we care about guards, + # so setting up aot_autograd_arg_pos_to_source for downstream dedup guards + # can now be done safely. (2) Dynamo logic protects the 1:1 sizing below. + for name in params.keys(): + assert name in mod._param_name_to_source, f"{name} not found." + source = mod._param_name_to_source[name] + assert source not in seen_sources, source + seen_sources.add(source) + aot_autograd_arg_pos_to_source.append(source) + + # Next, the input args + full_args.extend(args) + + if hasattr(mod, "graph"): + # Non dynamo entrypoints can get to here... + for i, node in enumerate(mod.graph.nodes): + if node.op == "placeholder": + if hasattr(node, "_dynamo_source"): + # ... but not here! + if aot_autograd_arg_pos_to_source is None: + aot_autograd_arg_pos_to_source = [] + source = node._dynamo_source + assert source not in seen_sources, source + seen_sources.add(source) + aot_autograd_arg_pos_to_source.append(source) + + if aot_autograd_arg_pos_to_source is not None: + assert len(full_args) == len(aot_autograd_arg_pos_to_source) + + dynamic_shapes = False + for x in full_args: + if isinstance(x, FakeTensor): + dynamic_shapes = x.fake_mode.shape_env is not None + break + + aot_config = AOTConfig( + fw_compiler=fw_compiler, + bw_compiler=bw_compiler, + inference_compiler=inference_compiler, + partition_fn=partition_fn, + decompositions=decompositions, + num_params_buffers=params_len, + aot_id=next(AOT_COUNTER), + keep_inference_input_mutations=keep_inference_input_mutations, + dynamic_shapes=dynamic_shapes, + aot_autograd_arg_pos_to_source=aot_autograd_arg_pos_to_source, + is_export=False, + no_tangents=False, + ) + + with compiled_autograd.disable(): + compiled_fn = create_aot_dispatcher_function( + functional_call, + full_args, + aot_config, + ) + + # TODO: There is something deeply wrong here; compiled_fn running with + # the boxed calling convention, but aot_module_simplified somehow + # historically returned a function that was not the boxed calling + # convention. This should get fixed... 
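+    # For reference, a "boxed" callable takes a single list of arguments (and may mutate or
+    # clear that list), roughly:
+    #
+    #   args = [*params_flat, *runtime_args]
+    #   outs = compiled_fn(args)    # boxed: one list in
+    #
+    # while the `forward` wrapper below re-exposes the usual positional convention,
+    # e.g. `outs = forward(x)` for a single runtime input (here `x` is just a placeholder name).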
+ def forward(*runtime_args): + full_args = [] + full_args.extend(params_flat) + full_args.extend(runtime_args) + return compiled_fn(full_args) + + # Just for convenience + forward.zero_grad = mod.zero_grad + forward.named_parameters = mod.named_parameters + forward.named_buffers = mod.named_buffers + + return forward + + +def aot_export_module( + mod: nn.Module, + args, + *, + decompositions: Optional[Dict] = None, + # If true, we'll return a joint forward-backward graph, + # As well as metadata on the loss + gradients in the backward. + trace_joint: bool, + # If trace_joint is True, we expect your module to return a scalar loss. + # Your module can return multiple outputs, so you must specify which output the loss is. + output_loss_index: Optional[int] = None, + pre_dispatch: bool = False, + kwargs=None, +) -> Tuple[torch.fx.GraphModule, GraphSignature]: + """ + This function takes in a module, and returns: + (1) an FX graph that can be exported + (2) some metadata about the graph + + If `trace_joint=True` we will return a joint graph of the forward + backward. + + The traced FX graph will have the following properties compared to the original module: + (1) Inputs and outputs to the module will be pytree-flattened + (2) Parameters and buffers on the module will be lifted into graph inputs, + graph_inputs = (*parameters, *buffers, *user_inputs) + (3) The graph will be fully functionalized + (4) Any input mutations will be converted into additional outputs in the graph, + meaning whoever calls this graph is responsible for applying the mutations + back to the original inputs. + (5) If is_joint is provided the graph will return parameter gradients in addition to user outputs. + The graph output will look like: + graph_outputs = (*updated_inputs, *user_outputs, *param_gradients) + + There are also several restrictions on what modules can use this API. In particular: + (1) If trace_joint is specified, we expect the loss function to be **fused** + into the module forward. One of the outputs to the forward must be a scalar loss, + which is specified with `output_loss_index`. + All other outputs to the forward are presumed to not require gradients. + (2) This API cannot capture optimizers (although in theory we could build an API for this). + (3) Metadata mutations on params/buffers/inputs are banned. + (4) Data mutations on anything that requires gradients are banned (parameters) + (5) If an input is mutated, it is not allowed to alias any other inputs. + (6) Parameters must not be duplicated. + """ + if pre_dispatch and trace_joint: + raise RuntimeError("pre_dispatch is not supported when trace_joint is True.") + named_parameters = dict(mod.named_parameters(remove_duplicate=False)) + named_buffers = dict(mod.named_buffers(remove_duplicate=False)) + + params_and_buffers = { + **dict(named_parameters), + **dict(named_buffers), + } + params_and_buffers_flat, params_spec = pytree.tree_flatten(params_and_buffers) + params_and_buffers_flat = tuple(params_and_buffers_flat) + params_len = len(params_and_buffers_flat) + + kwargs = kwargs or {} + + functional_call = create_functional_call(mod, params_spec, params_len, store_orig_mod=True) + + num_fw_outs = None + + if trace_joint: + # This helper effectively just adds some extra asserts about what the backward will look like: + # Outputs must include a scalar loss, that we compute gradients w.r.t. + # We don't compute gradients w.r.t. anything else: so just in case we detach() + # and other output tensors. 
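+        # A rough usage sketch (assuming a module whose forward returns `(loss, logits)` and a
+        # single example input `x`; both names are placeholders, not part of this API):
+        #
+        #   gm, sig = aot_export_module(mod, (x,), trace_joint=True, output_loss_index=0)
+        #
+        # `gm` is then a joint forward+backward fx graph, and `sig` (a GraphSignature) records
+        # how graph inputs/outputs map to parameters, buffers, user inputs/outputs, and
+        # parameter gradients.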
+ def fn_to_trace(*args): + nonlocal num_fw_outs + out = functional_call(*args) + if output_loss_index is None: + raise RuntimeError("""\ +If trace_joint=Trueit is required that one of your forward outputs must be a scalar loss. +You must specify the which (index) output is the loss with output_loss_index.""") + if isinstance(out, (torch.Tensor)): + out = (out,) + if not isinstance(out, (tuple, list)): + raise RuntimeError(f"Expected forward output to be either a tensor or a list/tuple of tensors. found {type(out)}") + + for i, o in enumerate(out): + # We only want to create a backward graph w.r.t. the loss that the user passed in. + # This implies that every other output should not require gradients. + # Instead of making this an error (and forcing the user to detach all other outputs + # of their forward), + # we'll automatically detach them here. + if o.requires_grad and i != output_loss_index: + raise RuntimeError(f"""\ +Found an output of the forward that requires gradients, that was not the scalar loss. +We require all outputs to the forward that are not the scalar loss to not require gradient, +because we will only compute a backward graph against the scalar loss. +You can fix this by calling .detach() on each of your forward outputs that is not the loss. +You specified that output index {output_loss_index} is the loss, but we found that +the output at index {i} requires gradients.""") + out_loss = out[output_loss_index] + num_fw_outs = len(out) + if not out_loss.requires_grad: + raise RuntimeError(f"""\ +The output at index {output_loss_index} was marked as the loss, but it does not require gradients""") + if out_loss.numel() != 1: + raise RuntimeError(f"""\ +We require the output marked as the loss (at index {output_loss_index}) to be a scalar, but it has shape {out_loss.shape}""") + return out + ctx = nullcontext + else: + # Run under no_grad, so our tracing machinery only traces an inference graph. + ctx = torch.no_grad + fn_to_trace = functional_call + + full_args = [] + # First, the params + # NB: It is REQUIRED that parameters come first, Inductor infers "fixed" + # parameters by looking at the difference in parameter count outside + # and inside AOTAutograd, and assumes the prefix of arguments are fixed + # arguments + full_args.extend(params_and_buffers_flat) + # Next, the input args + full_args.extend(args) + + with ctx(): + fx_g, metadata, in_spec, out_spec = _aot_export_function( + fn_to_trace, + full_args, + decompositions=decompositions, + num_params_buffers=params_len, + no_tangents=True, + pre_dispatch=pre_dispatch, + kwargs=kwargs, + ) + if trace_joint: + def flattened_joint(*args): + # The idea here is that the joint graph that AOTAutograd creates has some strict properties: + # (1) It accepts two arguments (primals, tangents), and pytree_flattens them + # (2) It returns a tuple of (fw_outs, gradients) + # This is a very useful convention for anyone who wants to partition the joint graph + # into a separate forward and backward graph. + # However, + # (1) for people exporting a single joint graph, it would be preferable not to have + # any pytrees in the graph. + # (2) We are guaranteed in the aot_export_module case that the forward outputs a loss, + # and there are therefore no tangents that are needed to run the joint graph. + # (3) AOTAutograd creates a grad_input for every input in the forward, + # including None's for inputs that are not grad-requiring tensors. + # we don't want these in our export graph. 
+ # and there are therefore no tangents that are needed to run the joint graph. + # This function "fixes" both of the above by removing any tangent inputs, + # and removing pytrees from the original FX graph. + fake_tangents = [None for _ in range(metadata.num_outputs + metadata.num_mutated_inp_runtime_indices)] + fw_outs, gradients = fx_g(args, fake_tangents) + assert len(gradients) == len(args) + output_gradients = [] + for i, (a, grad) in enumerate(zip(args, gradients)): + if isinstance(a, torch.Tensor) and a.requires_grad: + assert grad is not None, """\ +Found a parameter that did not receive a gradient. +"This is most likely a bug, but if this needs to be supported please comment on this Github issue: +https://github.com/pytorch/pytorch/issues/101192 +""" + output_gradients.append(grad) + else: + assert grad is None + return *fw_outs, *output_gradients + fx_g = make_fx(flattened_joint)(*full_args) + + user_args_flat = pytree.arg_tree_leaves(*args, **kwargs) + return fx_g, create_graph_signature( + fx_g, + metadata, + in_spec, + out_spec, + user_args_flat=user_args_flat, + params_and_buffers_flat=params_and_buffers_flat, + param_names=list(named_parameters.keys()), + buffer_names=list(named_buffers.keys()), + trace_joint=trace_joint, + num_user_fw_outs=num_fw_outs, + loss_index=output_loss_index, + ) + +def aot_export_joint_simple( + func: Callable, + args, + *, + trace_joint: bool, + # It looks like the main consequence of this API is that for dynamic shapes, + # it will assume that parms/buffers are static. + # With the new inferred dynamic shapes API, maybe this doesn't matter? + num_params_buffers: int = 0, + decompositions: Optional[Dict] = None, +) -> torch.fx.GraphModule: + """ + A simplified version of export. Used by higher order operators. + + This function makes a high-level "no calling convention changes" guarantee: + - If no inputs require grad (so we export an inference graph), + there are *no* calling convention change between the exported graph, and "func". + - If at least one input requires grad (so we trace out and export a joint fw-bw graph), + Then if you were partition the graph into a separate forward and backward graph, + The forward graph will have no calling convention changes compared to "func". + + The above also relies on some strong restrictions around which functions this API accepts: + (1) `args` cannot contain any pytrees (they must have been pytree_flattened already) + (2) `func` cannot mutate any inputs + (3) The outputs of `func` cannot alias any inputs. + + Note: this function is only lightly tested today. It will probably be tested more heavily by higher order ops. + """ + if trace_joint: + ctx = nullcontext + else: + # Run under no_grad, so our tracing machinery only traces an inference graph. + ctx = torch.no_grad + + with ctx(): + fx_g, metadata, in_spec, out_spec = _aot_export_function( + func, + args, + decompositions=decompositions, + ) + in_spec, _kw_in_spec = in_spec.children_specs + # At this point, we can just directly return the (joint or inference graph) that we traced. + # First though: a bunch of assertions to make sure that our graph doesn't require + # any calling convention changes compared to the original function. + # These restrictions are *in addition to* the general restrictions on export. + + # No input mutations + if len([x for x in metadata.input_info if x.mutates_data or x.mutates_metadata]) != 0: + raise RuntimeError(f"aot_export_joint_simple does not support input mutations. 
{str(metadata)}") + # No output aliasing + if len([x for x in metadata.output_info if x.output_type != OutputType.non_alias]) != 0: + raise RuntimeError(f"aot_export_joint_simple does not support outputs that alias inputs. {str(metadata)}") + # No pytrees + if in_spec.is_leaf(): + raise RuntimeError(f"aot_export_joint_simple requires inputs to be a single list/tuple. in_spec={str(in_spec)}") + if not all(child.is_leaf() for child in in_spec.children_specs): + raise RuntimeError(f"aot_export_joint_simple requires individual inputs not to be pytrees. in_spec={str(in_spec)}") + if out_spec.is_leaf(): + raise RuntimeError(f"aot_export_joint_simple requires outputs to be a single list/tuple. out_spec={str(out_spec)}") + if not all(child.is_leaf() for child in out_spec.children_specs): + raise RuntimeError(f"aot_export_joint_simple requires individual outputs not to be pytrees. out_spec={str(out_spec)}") + # TODO: we might have to temporarily patch config.functionalize_rng + # so that it doesn't run when we're exporting a higher order op. + + if config.debug_assert: + # Smoke test that after partitioning, we can run the forward without any calling convention changes. + fw_module, bw_module = aot_config.default_partition( # noqa: F821 + fx_g, args, num_fwd_outputs=len(fw_metadata.output_infos) # noqa: F821 + ) + # Attempt to run the fw_module with the original user inputs + fake_mode = detect_fake_mode(args) + if fake_mode is None: + fake_mode = FakeTensorMode() + with fake_mode: + fw_module(*args) + return fx_g + +# Private for now because we aren't providing a contract on what to return +# for joint graphs (we could when there's a clearer use case) +# In the future, we may need to add more export API's that provide their own strong guarantees. +# This is meant as a general helper function for handling various export-y use cases. +def _aot_export_function( + func: Callable, + args, + *, + num_params_buffers: int = 0, + decompositions: Optional[Dict] = None, + # If we're exporting a joint graph and we don't want any tangent inputs in the graph + # (because we are backpropping through a scalar 1 loss), + # we need to explicitly specify not to include tangents in the graph. + # It's not enough just to check that our tangent is a scalar, since we also + # need to know if it is a 1 (no need to make it a graph input), or something else + # (requiring it to be a graph input). + # We don't know this info at trace time though, so we need to make it an explicit config. + no_tangents: bool = False, + pre_dispatch: bool = False, + kwargs=None, +) -> Tuple[torch.fx.GraphModule, ViewAndMutationMeta, pytree.TreeSpec, pytree.TreeSpec]: + kwargs = kwargs or {} + + flat_fn, out_spec = create_tree_flattened_fn(func, args, kwargs) + flat_args, in_spec = pytree.tree_flatten((args, kwargs)) + + dynamic_shapes = False + for x in flat_args: + if isinstance(x, FakeTensor): + dynamic_shapes = x.fake_mode.shape_env is not None + break + + # The export use case doesn't care about several bits of AOTConfig + # (1) compilers (we just export the graph) + # (2) partitioners (export is only full graph, user can partition themselves) + aot_config = AOTConfig( + fw_compiler=None, + bw_compiler=None, + inference_compiler=None, + partition_fn=None, + decompositions=decompositions, + num_params_buffers=num_params_buffers, + aot_id=next(AOT_COUNTER), + # For now there's no use case involving keeping input mutations in the graph + # (which we can only do in the inference case anyway). + # We can add this later if we need to. 
+ keep_inference_input_mutations=False, + dynamic_shapes=dynamic_shapes, + aot_autograd_arg_pos_to_source=None, + is_export=True, + no_tangents=no_tangents, + pre_dispatch=pre_dispatch, + ) + + fx_g, meta = create_aot_dispatcher_function( + flat_fn, + flat_args, + aot_config, + ) + return fx_g, meta, in_spec, out_spec.spec + + +compiled_function = aot_function +compiled_module = aot_module diff --git a/MLPY/Lib/site-packages/torch/_functorch/apis.py b/MLPY/Lib/site-packages/torch/_functorch/apis.py new file mode 100644 index 0000000000000000000000000000000000000000..17358cd7bf45b02eef863dfc286036b420f57eba --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/apis.py @@ -0,0 +1,401 @@ +# NOTE: We allow Dynamo to see this file (via torch/_dynamo/trace_rules.py) so that it can +# trace through functorch transforms. +# Currently, we can't allow Dynamo to see `eager_transforms.py`/`vmap.py` as that break a lot of thing +# and there isn't a mechanism to selectively expose only some functions (eg. grad) from a file +# to Dynamo. +from torch._functorch.vmap import (vmap_impl, _check_randomness_arg, + Callable, in_dims_t, out_dims_t, _check_out_dims_is_int_or_int_pytree, + _process_batched_inputs, _chunked_vmap) +from torch._functorch.utils import exposed_in, argnums_t +import functools + +# vmap(func)(inputs) wraps all Tensor inputs to be batched in BatchedTensors, +# sends those into func, and then unwraps the output BatchedTensors. Operations +# on BatchedTensors perform the batched operations that the user is asking for. +# +# vmap's randomness behavior differs from JAX's, which would require a PRNG key +# to be passed everywhere. + + +@exposed_in('torch.func') +def vmap( + func: Callable, + in_dims: in_dims_t = 0, + out_dims: out_dims_t = 0, + randomness: str = 'error', + *, + chunk_size=None) -> Callable: + """ + vmap is the vectorizing map; ``vmap(func)`` returns a new function that + maps ``func`` over some dimension of the inputs. Semantically, vmap + pushes the map into PyTorch operations called by ``func``, effectively + vectorizing those operations. + + vmap is useful for handling batch dimensions: one can write a function + ``func`` that runs on examples and then lift it to a function that can + take batches of examples with ``vmap(func)``. vmap can also be used to + compute batched gradients when composed with autograd. + + .. note:: + :func:`torch.vmap` is aliased to :func:`torch.func.vmap` for + convenience. Use whichever one you'd like. + + Args: + func (function): A Python function that takes one or more arguments. + Must return one or more Tensors. + in_dims (int or nested structure): Specifies which dimension of the + inputs should be mapped over. ``in_dims`` should have a + structure like the inputs. If the ``in_dim`` for a particular + input is None, then that indicates there is no map dimension. + Default: 0. + out_dims (int or Tuple[int]): Specifies where the mapped dimension + should appear in the outputs. If ``out_dims`` is a Tuple, then + it should have one element per output. Default: 0. + randomness (str): Specifies whether the randomness in this + vmap should be the same or different across batches. If 'different', + the randomness for each batch will be different. If 'same', the + randomness will be the same across batches. If 'error', any calls to + random functions will error. Default: 'error'. WARNING: this flag + only applies to random PyTorch operations and does not apply to + Python's random module or numpy randomness. 
+ chunk_size (None or int): If None (default), apply a single vmap over inputs. + If not None, then compute the vmap :attr:`chunk_size` samples at a time. + Note that :attr:`chunk_size=1` is equivalent to computing the vmap with a for-loop. + If you run into memory issues computing the vmap, please try a non-None chunk_size. + + Returns: + Returns a new "batched" function. It takes the same inputs as + ``func``, except each input has an extra dimension at the index + specified by ``in_dims``. It takes returns the same outputs as + ``func``, except each output has an extra dimension at the index + specified by ``out_dims``. + + .. warning: + :func:`vmap` works best with functional-style code. Please do not + perform any side-effects in ``func``, with the exception of + in-place PyTorch operations. Examples of side-effects include mutating + Python data structures and assigning values to variables not captured + in ``func``. + + One example of using :func:`vmap` is to compute batched dot products. PyTorch + doesn't provide a batched ``torch.dot`` API; instead of unsuccessfully + rummaging through docs, use :func:`vmap` to construct a new function. + + >>> torch.dot # [D], [D] -> [] + >>> batched_dot = torch.func.vmap(torch.dot) # [N, D], [N, D] -> [N] + >>> x, y = torch.randn(2, 5), torch.randn(2, 5) + >>> batched_dot(x, y) + + :func:`vmap` can be helpful in hiding batch dimensions, leading to a simpler + model authoring experience. + + >>> batch_size, feature_size = 3, 5 + >>> weights = torch.randn(feature_size, requires_grad=True) + >>> + >>> def model(feature_vec): + >>> # Very simple linear model with activation + >>> return feature_vec.dot(weights).relu() + >>> + >>> examples = torch.randn(batch_size, feature_size) + >>> result = torch.vmap(model)(examples) + + :func:`vmap` can also help vectorize computations that were previously difficult + or impossible to batch. One example is higher-order gradient computation. + The PyTorch autograd engine computes vjps (vector-Jacobian products). + Computing a full Jacobian matrix for some function f: R^N -> R^N usually + requires N calls to ``autograd.grad``, one per Jacobian row. Using :func:`vmap`, + we can vectorize the whole computation, computing the Jacobian in a single + call to ``autograd.grad``. 
+ + >>> # Setup + >>> N = 5 + >>> f = lambda x: x ** 2 + >>> x = torch.randn(N, requires_grad=True) + >>> y = f(x) + >>> I_N = torch.eye(N) + >>> + >>> # Sequential approach + >>> jacobian_rows = [torch.autograd.grad(y, x, v, retain_graph=True)[0] + >>> for v in I_N.unbind()] + >>> jacobian = torch.stack(jacobian_rows) + >>> + >>> # vectorized gradient computation + >>> def get_vjp(v): + >>> return torch.autograd.grad(y, x, v) + >>> jacobian = torch.vmap(get_vjp)(I_N) + + :func:`vmap` can also be nested, producing an output with multiple batched dimensions + + >>> torch.dot # [D], [D] -> [] + >>> batched_dot = torch.vmap(torch.vmap(torch.dot)) # [N1, N0, D], [N1, N0, D] -> [N1, N0] + >>> x, y = torch.randn(2, 3, 5), torch.randn(2, 3, 5) + >>> batched_dot(x, y) # tensor of size [2, 3] + + If the inputs are not batched along the first dimension, ``in_dims`` specifies + the dimension that each inputs are batched along as + + >>> torch.dot # [N], [N] -> [] + >>> batched_dot = torch.vmap(torch.dot, in_dims=1) # [N, D], [N, D] -> [D] + >>> x, y = torch.randn(2, 5), torch.randn(2, 5) + >>> batched_dot(x, y) # output is [5] instead of [2] if batched along the 0th dimension + + If there are multiple inputs each of which is batched along different dimensions, + ``in_dims`` must be a tuple with the batch dimension for each input as + + >>> torch.dot # [D], [D] -> [] + >>> batched_dot = torch.vmap(torch.dot, in_dims=(0, None)) # [N, D], [D] -> [N] + >>> x, y = torch.randn(2, 5), torch.randn(5) + >>> batched_dot(x, y) # second arg doesn't have a batch dim because in_dim[1] was None + + If the input is a Python struct, ``in_dims`` must be a tuple containing a struct + matching the shape of the input: + + >>> f = lambda dict: torch.dot(dict['x'], dict['y']) + >>> x, y = torch.randn(2, 5), torch.randn(5) + >>> input = {'x': x, 'y': y} + >>> batched_dot = torch.vmap(f, in_dims=({'x': 0, 'y': None},)) + >>> batched_dot(input) + + By default, the output is batched along the first dimension. However, it can be batched + along any dimension by using ``out_dims`` + + >>> f = lambda x: x ** 2 + >>> x = torch.randn(2, 5) + >>> batched_pow = torch.vmap(f, out_dims=1) + >>> batched_pow(x) # [5, 2] + + For any function that uses kwargs, the returned function will not batch the kwargs but will + accept kwargs + + >>> x = torch.randn([2, 5]) + >>> def fn(x, scale=4.): + >>> return x * scale + >>> + >>> batched_pow = torch.vmap(fn) + >>> assert torch.allclose(batched_pow(x), x * 4) + >>> batched_pow(x, scale=x) # scale is not batched, output has shape [2, 2, 5] + + .. note:: + vmap does not provide general autobatching or handle variable-length + sequences out of the box. + """ + _check_randomness_arg(randomness) + if not (chunk_size is None or chunk_size > 0): + raise ValueError(f"vmap: chunk_size should be None or greater than 0. (got {chunk_size})") + + # @functools.wraps(func) + def wrapped(*args, **kwargs): + return vmap_impl(func, in_dims, out_dims, randomness, chunk_size, *args, **kwargs) + + return wrapped + + +def chunk_vmap( + func: Callable, + in_dims: in_dims_t = 0, + out_dims: out_dims_t = 0, + randomness: str = 'error', + chunks=2) -> Callable: + """ + chunk_vmap is the vectorizing map (vmap) using chunks of input data. It is a mix of vmap (which vectorizes + everything) and map (which executes things sequentially). ``chunk_vmap`` vectorizes the input with number of + chunks at a time. For more details about vectorizing map, see :func:`vmap`. + + .. 
note:: + Please use :func:`vmap` with ``chunk_size`` argument instead of this API. + + Args: + func (function): A Python function that takes one or more arguments. + Must return one or more Tensors. + in_dims (int or nested structure): Specifies which dimension of the + inputs should be mapped over. ``in_dims`` should have a + structure like the inputs. If the ``in_dim`` for a particular + input is None, then that indicates there is no map dimension. + Default: 0. + out_dims (int or Tuple[int]): Specifies where the mapped dimension + should appear in the outputs. If ``out_dims`` is a Tuple, then + it should have one element per output. Default: 0. + randomness (str): Specifies whether the randomness in this + vmap should be the same or different across batches. If 'different', + the randomness for each batch will be different. If 'same', the + randomness will be the same across batches. If 'error', any calls to + random functions will error. Default: 'error'. WARNING: this flag + only applies to random PyTorch operations and does not apply to + Python's random module or numpy randomness. + chunks (int): Number of chunks to use to split the input data. Default is 2. + If equals to 1 then :func:`vmap` is called. + + Returns: + Returns a new "batched" function. It takes the same inputs as + ``func``, except each input has an extra dimension at the index + specified by ``in_dims``. It takes returns the same outputs as + ``func``, except each output has an extra dimension at the index + specified by ``out_dims``. + """ + _check_randomness_arg(randomness) + + if chunks == 1: + return vmap(func, in_dims=in_dims, out_dims=out_dims, randomness=randomness) + + def _get_chunk_flat_args(flat_args_, flat_in_dims_, chunks_): + flat_args_chunks = tuple( + t.chunk(chunks_, dim=in_dim) if in_dim is not None else [t, ] * chunks_ + for t, in_dim in zip(flat_args_, flat_in_dims_) + ) + # transpose chunk dim and flatten structure + # chunks_flat_args is a list of flatten args + chunks_flat_args = zip(*flat_args_chunks) + return chunks_flat_args + + @functools.wraps(func) + def wrapped_with_chunks(*args, **kwargs): + _check_out_dims_is_int_or_int_pytree(out_dims, func) + _, flat_in_dims, flat_args, args_spec = _process_batched_inputs(in_dims, args, func) + # Chunk flat arguments + chunks_flat_args = _get_chunk_flat_args(flat_args, flat_in_dims, chunks) + + # Apply vmap on chunks + return _chunked_vmap(func, flat_in_dims, chunks_flat_args, args_spec, out_dims, randomness, **kwargs) + + return wrapped_with_chunks + + +@exposed_in("torch.func") +def grad(func: Callable, argnums: argnums_t = 0, has_aux: bool = False) -> Callable: + """``grad`` operator helps computing gradients of ``func`` with respect to the + input(s) specified by ``argnums``. This operator can be nested to + compute higher-order gradients. + + Args: + func (Callable): A Python function that takes one or more arguments. + Must return a single-element Tensor. If specified ``has_aux`` equals ``True``, + function can return a tuple of single-element Tensor and other auxiliary objects: + ``(output, aux)``. + argnums (int or Tuple[int]): Specifies arguments to compute gradients with respect to. + ``argnums`` can be single integer or tuple of integers. Default: 0. + has_aux (bool): Flag indicating that ``func`` returns a tensor and other + auxiliary objects: ``(output, aux)``. Default: False. + + Returns: + Function to compute gradients with respect to its inputs. 
By default, the output of + the function is the gradient tensor(s) with respect to the first argument. + If specified ``has_aux`` equals ``True``, tuple of gradients and output auxiliary objects + is returned. If ``argnums`` is a tuple of integers, a tuple of output gradients with + respect to each ``argnums`` value is returned. + + Example of using ``grad``: + + >>> # xdoctest: +SKIP + >>> from torch.func import grad + >>> x = torch.randn([]) + >>> cos_x = grad(lambda x: torch.sin(x))(x) + >>> assert torch.allclose(cos_x, x.cos()) + >>> + >>> # Second-order gradients + >>> neg_sin_x = grad(grad(lambda x: torch.sin(x)))(x) + >>> assert torch.allclose(neg_sin_x, -x.sin()) + + When composed with ``vmap``, ``grad`` can be used to compute per-sample-gradients: + + >>> # xdoctest: +SKIP + >>> from torch.func import grad, vmap + >>> batch_size, feature_size = 3, 5 + >>> + >>> def model(weights, feature_vec): + >>> # Very simple linear model with activation + >>> assert feature_vec.dim() == 1 + >>> return feature_vec.dot(weights).relu() + >>> + >>> def compute_loss(weights, example, target): + >>> y = model(weights, example) + >>> return ((y - target) ** 2).mean() # MSELoss + >>> + >>> weights = torch.randn(feature_size, requires_grad=True) + >>> examples = torch.randn(batch_size, feature_size) + >>> targets = torch.randn(batch_size) + >>> inputs = (weights, examples, targets) + >>> grad_weight_per_example = vmap(grad(compute_loss), in_dims=(None, 0, 0))(*inputs) + + Example of using ``grad`` with ``has_aux`` and ``argnums``: + + >>> # xdoctest: +SKIP + >>> from torch.func import grad + >>> def my_loss_func(y, y_pred): + >>> loss_per_sample = (0.5 * y_pred - y) ** 2 + >>> loss = loss_per_sample.mean() + >>> return loss, (y_pred, loss_per_sample) + >>> + >>> fn = grad(my_loss_func, argnums=(0, 1), has_aux=True) + >>> y_true = torch.rand(4) + >>> y_preds = torch.rand(4, requires_grad=True) + >>> out = fn(y_true, y_preds) + >>> # > output is ((grads w.r.t y_true, grads w.r.t y_preds), (y_pred, loss_per_sample)) + + .. note:: + Using PyTorch ``torch.no_grad`` together with ``grad``. + + Case 1: Using ``torch.no_grad`` inside a function: + + >>> # xdoctest: +SKIP + >>> def f(x): + >>> with torch.no_grad(): + >>> c = x ** 2 + >>> return x - c + + In this case, ``grad(f)(x)`` will respect the inner ``torch.no_grad``. + + Case 2: Using ``grad`` inside ``torch.no_grad`` context manager: + + >>> # xdoctest: +SKIP + >>> with torch.no_grad(): + >>> grad(f)(x) + + In this case, ``grad`` will respect the inner ``torch.no_grad``, but not the + outer one. This is because ``grad`` is a "function transform": its result + should not depend on the result of a context manager outside of ``f``. + + """ + # To avoid cyclical dependency. + import torch._functorch.eager_transforms as eager_transforms + + @functools.wraps(func) + def wrapper(*args, **kwargs): + return eager_transforms.grad_impl(func, argnums, has_aux, args, kwargs) + return wrapper + + +@exposed_in("torch.func") +def grad_and_value(func: Callable, argnums: argnums_t = 0, has_aux: bool = False) -> Callable: + """ + Returns a function to compute a tuple of the gradient and primal, or + forward, computation. + + Args: + func (Callable): A Python function that takes one or more arguments. + Must return a single-element Tensor. If specified ``has_aux`` + equals ``True``, function can return a tuple of single-element + Tensor and other auxiliary objects: ``(output, aux)``. 
+ argnums (int or Tuple[int]): Specifies arguments to compute gradients + with respect to. ``argnums`` can be single integer or tuple of + integers. Default: 0. + has_aux (bool): Flag indicating that ``func`` returns a tensor and + other auxiliary objects: ``(output, aux)``. Default: False. + + Returns: + Function to compute a tuple of gradients with respect to its inputs + and the forward computation. By default, the output of the function is + a tuple of the gradient tensor(s) with respect to the first argument + and the primal computation. If specified ``has_aux`` equals + ``True``, tuple of gradients and tuple of the forward computation with + output auxiliary objects is returned. If ``argnums`` is a tuple of + integers, a tuple of a tuple of the output gradients with respect to + each ``argnums`` value and the forward computation is returned. + + See :func:`grad` for examples + """ + from torch._functorch import eager_transforms + + @functools.wraps(func) + def wrapper(*args, **kwargs): + return eager_transforms.grad_and_value_impl(func, argnums, has_aux, args, kwargs) + return wrapper diff --git a/MLPY/Lib/site-packages/torch/_functorch/autograd_function.py b/MLPY/Lib/site-packages/torch/_functorch/autograd_function.py new file mode 100644 index 0000000000000000000000000000000000000000..1fa7b3bc6dcff645c1d167765b3c4b6446b8a4c7 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/autograd_function.py @@ -0,0 +1,659 @@ +import torch +from torch._ops import HigherOrderOperator +from torch._C._functorch import TransformType +from torch._functorch.utils import enable_single_level_autograd_function +import torch.utils._pytree as pytree +from torch._C._functorch import ( + _wrap_for_grad, + _unwrap_for_grad, + current_level, +) +from torch._functorch.vmap import ( + wrap_batched, + unwrap_batched, + restore_vmap, + _add_batch_dim, +) +from torch._functorch.apis import vmap +from torch._functorch.vmap import _broadcast_to_and_flatten +from torch.autograd.forward_ad import _set_fwd_grad_enabled +from typing import Any, NamedTuple, Tuple + +# autograd.Function technically runs before the regular PyTorch dispatcher. +# This is how features like autocast and torch_dispatch (e.g. PythonTLSSnapshot) +# work with it. One day we might decide to change this, but until then, +# we need to give the illusion that autograd.Function runs before those things. +# +# We do this by using creating a custom HigherOrderOperator that only functorch +# dispatches specially. +class CustomFunctionHigherOrderOperator(HigherOrderOperator): + def __init__(self): + super().__init__('custom_function_call') + + def __call__(self, autograd_function, *args, **kwargs): + # When custom_function_call is done dispatching through functorch, + # it should just invoke the autograd.Function. This is consistent + # with the autograd.Function behavior of being invoked before the + # PyTorch dispatcher. + # + # This will lead us into trouble later down the line, but this is + # pre-existing. There is an invariant that a function traced by + # make_fx should have the same behavior when provided the same + # Tensor. However, make_fx sees autograd.Function as a composite + # (because autograd.Function happens before the Python dispatch key) + # and only traces the forward pass. 
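+        # In other words (a sketch, with `MyFn` and `x` standing in for any autograd.Function
+        # subclass and its input):
+        #
+        #   custom_function_call(MyFn, x)
+        #     -> dispatched through the functorch py_impl rules below when a transform
+        #        (grad/vmap/jvp/functionalize) is active
+        #     -> plain MyFn.apply(x) otherwise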
+ if torch._C._are_functorch_transforms_active(): + return super().__call__(autograd_function, *args, **kwargs) + return autograd_function.apply(*args, **kwargs) + + +# "custom_function_call" +# This is the mechanism for an autograd.Function that works with functorch transforms. +# It wraps an autograd.Function; interactions with functorch transforms are defined +# via PyDispatcher and HigherOrderOperator rather than through the traditional PyTorch +# dispatcher. +custom_function_call = CustomFunctionHigherOrderOperator() + + +# The grad rule for custom_function_call is to construct a new _SingleLevelFunction +# (autograd.Function that only works with a single layer (level) of functorch) that: +# - unwraps the inputs +# - redispatches to custom_function_call +# - wraps the outputs +# and whose backward pass calls the original autograd.Function's backward. +# +# Why do we need to redispatch to custom_function_call? +# ----------------------------------------------------- +# This is consistent with how ATen operators work with functorch's grad transform: +# they always redispatch to the original operator. +# Consider torch.sin, and let's say we do grad0(grad1(torch.sin))(x) +# +# grad1 will: +# - set up the autograd graph +# - unwrap the inputs +# - redispatch to at::sin (*) +# - rewrap the outputs on the return +# +# On the redispatch in (*), grad0 will: +# - set up the autograd graph +# - unwrap the inputs +# - redispatch to at::sin +# - rewrap the outputs on the return +# +# To "set up the autograd graph", we generate a _SingleLevelFunction +# and apply it. +@custom_function_call.py_impl(TransformType.Grad) +@custom_function_call.py_impl(TransformType.Jvp) +def custom_function_call_grad(interpreter, autograd_function, *operands): + Generated = generate_single_level_function(interpreter, autograd_function) + with enable_single_level_autograd_function(): + flat_out = Generated.apply(*operands) + return flat_out + + +def generate_single_level_function(interpreter, autograd_function): + level = interpreter.level() + + def forward(*operands): + unwrapped_operands = pytree.tree_map_only( + torch.Tensor, + lambda x: _unwrap_for_grad(x, level), + operands) + # Both enable_grad() and _set_fwd_grad_enabled() are necessary no matter + # the transform. _SingleLevelFunction will turn off both fwd and bwd + # gradient computation and we need to turn it back on here. + with torch.enable_grad(), _set_fwd_grad_enabled(True), interpreter.lower(): + unwrapped_output = custom_function_call(autograd_function, *unwrapped_operands) + + # See NOTE [mark_dirty object identity check] + def wrap_fn(output): + return _wrap_for_grad(output, level) + + return wrap_outputs_maintaining_identity( + unwrapped_output, + unwrapped_operands, + operands, + wrap_fn) + + def setup_context(ctx, inputs, output): + return autograd_function.setup_context(ctx, inputs, output) + + # backward is only used if the transform is TransformType.Grad + def backward(ctx, *grads): + result = autograd_function.backward(ctx, *grads) + return result + + # jvp is only used if the transform is TransformType.Jvp + def jvp(ctx, *tangents): + result = autograd_function.jvp(ctx, *tangents) + return result + + # This is the sequence of magic words to dynamically generate a Subclass with + # a given name. A Tensor's .grad_fn field has a class name that is the original + # autograd.Function's name + Backward, so we do this to generate some + # meaningful name. 
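+    # For example, wrapping a user-defined `MySin(torch.autograd.Function)` yields a subclass
+    # named "MySinGenerated", so tensors it produces report a grad_fn of "MySinGeneratedBackward"
+    # (illustrative names only).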
+ name = f'{autograd_function.__name__}Generated' + Generated = type( + name, + (torch.autograd.function._SingleLevelFunction,), + { + 'forward': staticmethod(forward), + 'backward': staticmethod(backward), + 'jvp': staticmethod(jvp), + 'setup_context': staticmethod(setup_context), + }, + ) + return Generated + +# wrap_outputs_maintaining_identity handles outputs from the vmap, +# backward (vjp), and jvp staticmethod. The way it distinguishes +# between the vmap case and the {backward, jvp} case is if the out_dims +# are specified or not. +# +# NB: we cannot use out_dims=None as the deciding factor. This because +# out_dims=None can still happen in the vmap staticmethod! What the +# user is saying in that case is that their output does not have a +# dimension that is being vmapped over, which is valid. +NO_OUT_DIMS = "not specified" + +# NOTE [mark_dirty object identity check] +# autograd.Function's ctx.mark_dirty expect a returned input +# to have the same object identity as the input. +# Mode-only functorch will greatly simplify this logic. +def wrap_outputs_maintaining_identity( + outputs, unwrapped_inputs, orig_inputs, wrap_fn, out_dims=NO_OUT_DIMS): + flat_unwrapped_inputs = pytree.arg_tree_leaves(*unwrapped_inputs) + flat_orig_inputs = pytree.arg_tree_leaves(*orig_inputs) + + unwrapped_input_to_orig_input = { + id(unwrapped): orig + for unwrapped, orig in zip(flat_unwrapped_inputs, flat_orig_inputs) + } + + flat_outputs, spec = pytree.tree_flatten(outputs) + result = [] + + out_dims_specified = out_dims != NO_OUT_DIMS + + if out_dims_specified: + flat_out_dims = _broadcast_to_and_flatten(out_dims, spec) + # _broadcast_to_and_flatten returns None if it is unable to broadcast. + # TODO: update following link from master to stable once that's out + if flat_out_dims is None: + raise RuntimeError( + f"The autograd.Function's vmap staticmethod returned an " + f"incompatible (output, out_dims) tuple. " + f"Expected out_dims={out_dims} " + f"to be compatible with the structure of `output`. " + f"out_dims has structure {pytree.tree_flatten(out_dims)[1]} " + f"but output has structure {spec}. " + f"For more details, please see " + f"https://pytorch.org/docs/master/notes/extending.func.html" + ) + + for i, output in enumerate(flat_outputs): + if not isinstance(output, torch.Tensor): + result.append(output) + continue + if id(output) in unwrapped_input_to_orig_input: + result.append(unwrapped_input_to_orig_input[id(output)]) + continue + if out_dims_specified: + result.append(wrap_fn(output, flat_out_dims[i])) # type: ignore[possibly-undefined, index] + else: + result.append(wrap_fn(output)) + + return pytree.tree_unflatten(result, spec) + + +# NOTE: [functorch vjp and autograd interaction] +# There's an edge case with the functorch vjp and autograd interaction +# that will eventually be fixed by mode-only functorch. +# The TL;DR is that there's no way to unwrap a dead GradTensorWrapper, +# so we (the framework) need to do it manually. Regular PyTorch operators +# automatically do so this is consistent. 
+# +# class MyExp(torch.autograd.Function): +# @staticmethod +# def forward(x): +# return x.exp() +# +# @staticmethod +# def setup_context(ctx, inputs, output): +# y = output +# ctx.save_for_backward(y) +# +# @staticmethod +# def backward(gy): +# y, = ctx.saved_tensors() +# return MyMul.apply(gy, y) +# +# x = torch.randn([], requires_grad=True) +# gy = torch.randn([], requires_grad=True) +# _, vjp_fn = vjp(MySin.apply, x) +# result = vjp_fn(gy) +# +# MyMul is an autograd.Function that is not shown here. +# It saves a `y` for backward (since gy requires grad). +# +# in vjp_fn(gy), we get: +# > MyMul.apply(gy, GradTensorWrapper(y, level=dead)) +# Because the y that is saved for backward by MyExp is a GradTensorWrapper +# but is now dead since we are outside the vjp context. +# +# PyTorch dispatcher operations, upon seeing a dead GradTensorWrapper, +# will automatically unwrap the GradTensorWrapper when applied. +# But since autograd.Function technically sits above the regular PyTorch +# dispatcher, it doesn't get this treatment. So we manually do +# the unwrapping to be consistent with regular PyTorch dispatcher operations. + + +class VmapInfo(NamedTuple): + batch_size: int + randomness: str + + +def has_overriden_vmap_rule(autograd_function): + return autograd_function.vmap is not torch.autograd.Function.vmap + + +def validate_vmap_returns_tuple_of_two_elements(result): + base_error_msg = ( + "Expected the vmap staticmethod to have two returns, an output " + "and out_dims with pytree structure compatible with the output. " + ) + if not isinstance(result, tuple): + raise RuntimeError(base_error_msg + f"Got a {type(result)} instead") + if not len(result) == 2: + raise RuntimeError(base_error_msg + f"Got {len(result)} returns instead") + +@custom_function_call.py_impl(TransformType.Vmap) +def custom_function_call_vmap(interpreter, autograd_function, *operands): + if autograd_function.generate_vmap_rule: + if has_overriden_vmap_rule(autograd_function): + # TODO: Update link to stable once that's out + # https://github.com/pytorch/pytorch/issues/92029 + raise RuntimeError( + f"You tried to vmap over {autograd_function.__name__}, but " + f"it has both generate_vmap_rule=True and an overriden vmap " + f"staticmethod. Please set generate_vmap_rule=False or delete " + f"the overriden vmap staticmethod to avoid ambiguity. " + f"For more details, please see " + f"https://pytorch.org/docs/master/notes/extending.func.html") + return custom_function_call_vmap_generate_rule(interpreter, autograd_function, *operands) + + if not has_overriden_vmap_rule(autograd_function): + # TODO: Update link to stable once that's out + # https://github.com/pytorch/pytorch/issues/92029 + raise RuntimeError( + f"You tried to vmap over {autograd_function.__name__}, but " + f"it does not have vmap support. Please override and implement the " + f"vmap staticmethod or set generate_vmap_rule=True. " + f"For more details, please see " + f"https://pytorch.org/docs/master/notes/extending.func.html") + + current_level = interpreter.level() + info = VmapInfo( + batch_size=interpreter.batch_size(), + randomness=interpreter.randomness(), + ) + unwrapped_operands, in_dims = unwrap_batched(operands, current_level) + + # If none of the tensors are batched at the current level, then we skip the + # current level. 
This saves the user from needing to handle this case in + # their vmap staticmethod (and is consistent with our C++ batching rule API) + if pytree.tree_all(lambda dim: dim is None, in_dims): + with interpreter.lower(): + return custom_function_call(autograd_function, *operands) + + with interpreter.lower(): + result = autograd_function.vmap(info, in_dims, *unwrapped_operands) + validate_vmap_returns_tuple_of_two_elements(result) + unwrapped_output, out_dims = result + + # See NOTE [mark_dirty object identity check] + def wrap_fn(output, out_dim): + return output if out_dim is None else _add_batch_dim(output, out_dim, current_level) + + return wrap_outputs_maintaining_identity( + unwrapped_output, + unwrapped_operands, + operands, + wrap_fn, + out_dims=out_dims) + + +def custom_function_call_vmap_generate_rule(interpreter, autograd_function, *operands): + unwrapped_operands, in_dims = unwrap_batched(operands, interpreter.level()) + vmapped_function, get_out_dims = vmapify_autograd_function( + autograd_function, in_dims, interpreter.batch_size(), interpreter.randomness()) + + with interpreter.lower(): + output = custom_function_call(vmapped_function, *unwrapped_operands) + + out_dims = get_out_dims() + return wrap_batched(output, out_dims, interpreter.level()) + + +@custom_function_call.py_impl(TransformType.Functionalize) +def custom_function_call_functionalize(interpreter, autograd_function, generate_vmap_rule, *operands): + raise RuntimeError("NYI: Functionalize rule for custom_function_call") + + +def vmapify_autograd_function(autograd_function, in_dims, batch_size, randomness): + # The following values are saved from the forward() and setup_context() + # and used in backward(). + # Why do we save the values out here instead of on the ctx object? + # - out_dims: There's no way to retrieve this from forward() + # - input_shapes, saved_tensors_bdims: I'm a bit scared of nesting + # vmap(vmap( but not completely sure if it is a problem. If we + # assigned those fields to the ctx object, the worry is that they + # get overwritten. + init_val = "not populated" + out_dims = init_val + input_shapes: Any = init_val + saved_tensors_bdims: Any = init_val + + def forward(*operands): + nonlocal out_dims + outputs, out_dims = restore_vmap( + autograd_function.forward, in_dims, batch_size, randomness)(*operands) + return outputs + + def setup_context(ctx, inputs, outputs): + input_shapes_ = None + saved_tensors_bdims_ = None + + def inner(inputs, outputs): + # wrapped_ctx.save_for_backward will: + # - unwrap batchedtensors into (tensor, bdim) + # - save_for_backward(*unwrapped_tensors) + # - assign the bdims to wrapped_ctx._pt_saved_tensors_bdims + wrapped_ctx = CtxCustomSave(ctx, current_level()) + autograd_function.setup_context(wrapped_ctx, inputs, outputs) + + # input_shapes are used for reductify later to reduce expanded gradients + # to the correct shape. + # See NOTE: [Why can't we rely on autograd to reduce expanded gradients?] + # for more details + nonlocal input_shapes_ + input_shapes_ = tuple(inp.shape if isinstance(inp, torch.Tensor) else None + for inp in inputs) + nonlocal saved_tensors_bdims_ + saved_tensors_bdims_ = wrapped_ctx._pt_saved_tensors_bdims + + # See NOTE: [Why do we need to run setup_context under a vmap?] 
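+        # Here restore_vmap(fn, in_dims, batch_size, randomness)(*args) behaves roughly like
+        # vmap(fn)(*args), except that it also returns the out_dims it inferred, i.e. a
+        # (outputs, out_dims) pair, which is why forward() above unpacks two values.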
+ restore_vmap( + inner, + (in_dims, out_dims), + batch_size, + randomness, + )(inputs, outputs) + + nonlocal input_shapes + input_shapes = input_shapes_ + nonlocal saved_tensors_bdims + saved_tensors_bdims = saved_tensors_bdims_ + + def jvp(ctx, *tangents): + assert out_dims != init_val + assert saved_tensors_bdims != init_val + + def jvp_no_context(saved_tensors, tangents): + wrapped_ctx = CtxWithSavedTensors(ctx, saved_tensors) + return autograd_function.jvp(wrapped_ctx, *tangents) + + tangent_in_dims = get_tangents_in_dims(in_dims, tangents) + out_tangents, out_tangents_dims = restore_vmap( + jvp_no_context, (saved_tensors_bdims, tangent_in_dims), batch_size, randomness)( + ctx.saved_tensors, tangents) + + result = reductify(out_tangents, out_tangents_dims, out_dims, batch_size) + return result + + def backward(ctx, *grad_outputs): + assert out_dims != init_val + assert input_shapes != init_val + assert saved_tensors_bdims != init_val + + def backward_no_context(inputs): + saved_tensors, grad_outputs = inputs + wrapped_ctx = CtxWithSavedTensors(ctx, saved_tensors) + return autograd_function.backward(wrapped_ctx, *grad_outputs) + + grad_ins, grad_ins_dims = restore_vmap( + backward_no_context, ((saved_tensors_bdims, out_dims),), batch_size, randomness)( + (ctx.saved_tensors, grad_outputs)) + result = reductify(grad_ins, grad_ins_dims, in_dims, batch_size, input_shapes) + return result + + name = f'Vmapped{autograd_function.__name__}' + Generated = type( + name, + (torch.autograd.Function,), + { + 'forward': staticmethod(forward), + 'backward': staticmethod(backward), + 'jvp': staticmethod(jvp), + 'setup_context': staticmethod(setup_context), + 'generate_vmap_rule': True + } + ) + + def get_out_dims(): + assert out_dims != init_val + return out_dims + + return Generated, get_out_dims + + +# tangents might be None, so we need to replace +# the corresponding in_dims with None. +def get_tangents_in_dims(input_dims, tangents): + flat_in_dims, spec = pytree.tree_flatten(input_dims) + flat_tangents = pytree.arg_tree_leaves(*tangents) + result = [None if tangent is None else in_dim + for in_dim, tangent in zip(flat_in_dims, flat_tangents)] + return pytree.tree_unflatten(result, spec) + + +# NOTE: [Why do we need to run setup_context under a vmap?] +# Consider the following autograd.Function +# +# class Sum(torch.autograd.Function): +# @staticmethod +# def forward(x): +# return x.sum() +# @staticmethod +# def setup_context(ctx, inputs, outputs): +# ctx.x_shape = inputs[0] +# @staticmethod +# def backward(ctx, gy): +# return gy.expand(ctx.x_shape) +# +# x = torch.randn(B, 4) +# in_dims = 0 +# vmap(Sum.apply, in_dims)(x) +# +# Let’s assume for a moment that we didn’t vmap setup_context in VmappedSum: +# +# class VmappedSum(torch.autograd.Function): +# @staticmethod +# def forward(x): +# return vmap(Sum.forward, in_dims)(x) +# +# @staticmethod +# def setup_context(ctx, inputs, outputs): +# Sum.setup_context(ctx, inputs, outputs) +# +# @staticmethod +# def backward(ctx, gy): +# def backward_no_context(gy): +# return gy.expand(ctx.x_shape) +# +# dims = (0,) +# gx = vmap(backward_no_context, dims)(gy) +# return gx +# +# We end up saving [B, 4] as x_shape. In the backward, gy has shape [B], +# and we’re doing: +# +# def backward_no_context(gy): +# return gy.expand([B, 4]) +# +# gx = vmap(backward_no_context, dims)(gy: "Tensor[B]") +# +# This gives us the wrong result (gx has shape [B, B, 4], but it should +# have shape [4]). 
Performing vmap over setup_context means the shape +# saved has shape [4] and leads to a correct result shape for gx. + +# Wraps a ctx object. Forwards all attr accesses to the underlying object +# except for the attrs in _pt_attrs +class WrappedCtx: + _pt_reserved_attrs: Tuple[str, ...] = ('_pt_reserved_attrs', '_pt_inner_ctx') + + def __init__(self, ctx): + if not isinstance(ctx, WrappedCtx): + reserved_attrs = type(self)._pt_reserved_attrs + for name in reserved_attrs: + if not hasattr(ctx, name): + continue + raise RuntimeError( + f'PyTorch reserves the {reserved_attrs} field on ctx. ' + 'Please name your fields on ctx something else to avoid name ' + 'collision.') + self._pt_inner_ctx = ctx + + def __getattr__(self, name): + return getattr(self._pt_inner_ctx, name) + + def __setattr__(self, name, value): + if name in type(self)._pt_reserved_attrs: + self.__dict__[name] = value + return + return setattr(self._pt_inner_ctx, name, value) + +# Wraps ctx to create a new ctx object that overrides saved_tensors. +class CtxWithSavedTensors(WrappedCtx): + _pt_reserved_attrs = ('_pt_new_saved_tensors', *WrappedCtx._pt_reserved_attrs) + + def __init__(self, ctx, new_saved_tensors): + super().__init__(ctx) + self._pt_new_saved_tensors = new_saved_tensors + + @property + def saved_tensors(self): + return self._pt_new_saved_tensors + +class CtxCustomSave(WrappedCtx): + _pt_reserved_attrs = ('_pt_saved_tensors_bdims', '_pt_current_level', + *WrappedCtx._pt_reserved_attrs) + + def __init__(self, ctx, current_level): + super().__init__(ctx) + self._pt_saved_tensors_bdims = () + self._pt_current_level = current_level + + def save_for_backward(self, *tensors): + unwrapped_tensors, bdims = unwrap_batched(tensors, self._pt_current_level) + self._pt_inner_ctx.save_for_backward(*unwrapped_tensors) + self._pt_saved_tensors_bdims = bdims + + def save_for_forward(self, *tensors): + unwrapped_tensors, bdims = unwrap_batched(tensors, self._pt_current_level) + self._pt_inner_ctx.save_for_forward(*unwrapped_tensors) + self._pt_saved_tensors_bdims = bdims + + +def reductify(grad_input, grad_input_bdim, input_bdim, batch_size, + target_shape_without_bdim_to_reduce_to=None): + if not isinstance(grad_input, tuple): + grad_input = (grad_input,) + if not isinstance(grad_input_bdim, tuple): + grad_input_bdim = (grad_input_bdim,) + if not isinstance(input_bdim, tuple): + input_bdim = (input_bdim,) + + if target_shape_without_bdim_to_reduce_to is None: + target_shape_without_bdim_to_reduce_to = len(grad_input) * (None,) + result = tuple( + reductify_leaf(gi, gi_bdim, i_bdim, batch_size, maybe_ishape) + for gi, gi_bdim, i_bdim, maybe_ishape in + zip(grad_input, grad_input_bdim, input_bdim, target_shape_without_bdim_to_reduce_to) + ) + return result + + +def reductify_leaf(grad_input, grad_input_bdim, input_bdim, batch_size, + target_shape_without_bdim_to_reduce_to=None): + if grad_input is None: + return None + + if grad_input_bdim is None and input_bdim is None: + return grad_input + + if grad_input_bdim is not None and input_bdim is None: + return grad_input.sum(grad_input_bdim) + + # NOTE: [Why can't we rely on autograd to reduce expanded gradients?] + # For reverse-mode AD, + # given a grad_input and input, it is valid for the user to return a + # grad_input that has a broadcasted shape when compared to the input. + # In this situation, autograd automatically reduces the grad_input to + # the shape of the input. + # + # However, when input_bdim is not None, we have problems. 
+ # + # [example 1] + # grad_input: Tensor[3, 4], input: Tensor[B, 4] + # We can expand grad_input to Tensor[B, 3, 4], but that isn't broadcastable + # from [B, 4]. + # + # [example 2] + # grad_input: Tensor[3, B, 4], input: Tensor[B, 4] + # We can swizzle grad_input to Tensor[B, 3, 4], but that isn't broadcastable + # from [B, 4]. + # + # This means that we need to also reduce the grad_input to the shape of the + # input. This behavior is controlled by the `target_shape_without_bdim_to_reduce_to` flag; + # if not-None then we do the reducing manually, otherwise, we do not do a reduction. + assert input_bdim is not None + + if grad_input_bdim is None: + grad_input = grad_input.unsqueeze(input_bdim) + new_shape = list(grad_input.shape) + new_shape[input_bdim] = batch_size + grad_input = grad_input.expand(new_shape) + grad_input_bdim = input_bdim + + if target_shape_without_bdim_to_reduce_to is not None: + return vmap(torch.Tensor.sum_to_size, in_dims=(grad_input_bdim, None), out_dims=input_bdim)( + grad_input, target_shape_without_bdim_to_reduce_to) + + if input_bdim != grad_input_bdim: + grad_input = grad_input.movedim(grad_input_bdim, input_bdim) + return grad_input + + +class AutogradFunctionApply(HigherOrderOperator): + def __init__(self): + super().__init__("autograd_function_apply") + + def __call__(self, fwd, bwd, *fwd_args): + saved_values = None + + class ApplyTemplate(torch.autograd.Function): + @staticmethod + def forward(ctx, *args): + nonlocal saved_values + output, saved_values = fwd(None, *args) + return output + + @staticmethod + def backward(ctx, *grad): + return bwd(None, *grad, *saved_values) + + return ApplyTemplate.apply(*fwd_args) + + +autograd_function_apply = AutogradFunctionApply() diff --git a/MLPY/Lib/site-packages/torch/_functorch/batch_norm_replacement.py b/MLPY/Lib/site-packages/torch/_functorch/batch_norm_replacement.py new file mode 100644 index 0000000000000000000000000000000000000000..d741bc215dbb3bdffbf6dbe486b38c5494476754 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/batch_norm_replacement.py @@ -0,0 +1,24 @@ +import torch.nn as nn +from torch._functorch.utils import exposed_in + + +def batch_norm_without_running_stats(module: nn.Module): + if isinstance(module, nn.modules.batchnorm._BatchNorm) and module.track_running_stats: + module.running_mean = None + module.running_var = None + module.num_batches_tracked = None + module.track_running_stats = False + + +@exposed_in("torch.func") +def replace_all_batch_norm_modules_(root: nn.Module) -> nn.Module: + """ + In place updates :attr:`root` by setting the ``running_mean`` and ``running_var`` to be None and + setting track_running_stats to be False for any nn.BatchNorm module in :attr:`root` + """ + # base case + batch_norm_without_running_stats(root) + + for obj in root.modules(): + batch_norm_without_running_stats(obj) + return root diff --git a/MLPY/Lib/site-packages/torch/_functorch/benchmark_utils.py b/MLPY/Lib/site-packages/torch/_functorch/benchmark_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..cac2701b70591b818743bcf1db72848274637c85 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/benchmark_utils.py @@ -0,0 +1,195 @@ +# mypy: ignore-errors + +import contextlib +import time +import os +import json + +import torch +from torch.profiler import profile, ProfilerActivity + + +def synchronize(): + pass + + +def dump_chrome_trace(f, input, trace_filename, optimize_ctx, activities, num_runs=1, + devices=None, kwargs_for_f=None, 
kwargs_for_profiler=None): + """ + Output the chrome trace of running f(input, **kwargs_for_f) with [optimize_ctx] + [num_runs] times to [trace_filename]. + + [activities] are the activities that the profiler will record, e.g. ProfilerActivity.CUDA. + Return total runtime without the profiler + + Outputs to trace_filename + """ + + if devices is None: + devices = ["cuda"] + + global synchronize + if devices != ["cpu"] and torch.cuda.is_available(): + synchronize = torch.cuda.synchronize + + if kwargs_for_f is None: + kwargs_for_f = {} + if kwargs_for_profiler is None: + kwargs_for_profiler = {} + + with optimize_ctx: + torch.manual_seed(1337) + for _ in range(5): # warmup runs + f(input, **kwargs_for_f) + synchronize() + torch.manual_seed(1337) + t0 = time.perf_counter() + for _ in range(num_runs): + f(input, **kwargs_for_f) + synchronize() + t1 = time.perf_counter() + timing = t1 - t0 + + with profile(activities=activities, **kwargs_for_profiler) as prof: + with optimize_ctx: + synchronize() + torch.manual_seed(1337) + for _ in range(num_runs): + f(input, **kwargs_for_f) + synchronize() + prof.export_chrome_trace(trace_filename) + + return timing + + +def get_chrome_trace_events(filename): + f = open(filename) + data = json.load(f) + events = data["traceEvents"] + return events + + +def is_gpu_compute_event(event): + global gpu_pids + return "pid" in event and event["pid"] in gpu_pids and "ph" in event and event["ph"] == "X" + + +def get_sorted_gpu_events(events): + sorted_gpu_events = [] + for event in events: + if not is_gpu_compute_event(event): + continue + sorted_gpu_events.append(event) + return sorted(sorted_gpu_events, key=lambda x: x["ts"]) + + +def get_duration(sorted_gpu_events): + if len(sorted_gpu_events) == 0: + return 0 + event = sorted_gpu_events[0] + current_end_time = event["ts"] + event["dur"] + total_duration = event["dur"] + for event in sorted_gpu_events[1:]: + start_time = max(event["ts"], current_end_time) + end_time = event["ts"] + event["dur"] + total_duration = total_duration + max(end_time - start_time, 0) + current_end_time = max(current_end_time, end_time) + return total_duration + + +def get_sorted_gpu_mm_conv_events(events): + def is_mm_conv_event(event): + return "name" in event and ("gemm" in event["name"] or "conv" in event["name"] + or "cutlass" in event["name"] or "wgrad" in event["name"]) + gpu_events = get_sorted_gpu_events(events) + sorted_events = [] + for event in gpu_events: + if not is_mm_conv_event(event): + continue + sorted_events.append(event) + return sorted_events + + +gpu_pids = [] + + +def compute_utilization(filename: str, total_length: float): + """ + Process the chrome traces outputs by the pytorch profiler to compute GPU Utilization + and percent of times spent on matmul and convolution + + Args: + filename(str): Name of chrome traces file produced by pytorch profiler + + total_length(float): total length of the process without profiler in second + + Return: + tuple: (GPU Utilization, percent of time spent on matmul and convolution) + """ + events = get_chrome_trace_events(filename) + + # get pids of GPU events + global gpu_pids + gpu_pids = [] + for event in events: + if "name" not in event: + continue + if event["name"] == 'process_labels' and "GPU" in event["args"]["labels"]: + gpu_pids.append(event["pid"]) + + total_length = total_length * 1e6 + sorted_gpu_events = get_sorted_gpu_events(events) + utilization = get_duration(sorted_gpu_events) / total_length + + sorted_gpu_mm_conv_events = get_sorted_gpu_mm_conv_events(events) + 
mm_conv_utilization = get_duration(sorted_gpu_mm_conv_events) / total_length + + return utilization, mm_conv_utilization + + +def benchmark_utilization(f, input, trace_folder, optimize_ctx=None, trace_file_name="tmp_chrome_trace", num_runs=1): + """ + Benchmark the GPU Utilization and percent of time spent on matmul and convolution operations of + running f(input, **kwargs_for_f) with [optimize_ctx] [num_runs] times. + It will produce a chrome trace file in trace_folder/trace_file_name.json + + Example: + + ``` + def f(a): + return a.sum() + a = torch.rand(2**20, device="cuda") + utilization, mm_conv_utilization = benchmark_utilization(f, a, "tmp", trace_file_name = "tmp_chrome_trace") + ``` + + Args: + f: function to benchmark + + input: input to :attr:`f` + + trace_folder: name of the folder to store the chrome trace + + optimize_ctx: the context in which f will run + + trace_file_name: name of the dumped chrome trace file, default to "tmp_chrome_trace" + + num_runs: number of times to run f, excluding the warm-up runs, default to 1. + + Return: + tuple: (GPU Utilization, percent of time spent on matmul and convolution) + + """ + isExist = os.path.exists(trace_folder) + if not isExist: + os.makedirs(trace_folder) + print("create folder " + trace_folder) + + if optimize_ctx is None: + optimize_ctx = contextlib.nullcontext() + + chrome_trace_file_name = os.path.join(trace_folder, trace_file_name + ".json") + total_length = dump_chrome_trace(f, input, chrome_trace_file_name, optimize_ctx, + [ProfilerActivity.CUDA], num_runs=num_runs, devices="cuda") + utilization, mm_conv_utilization = compute_utilization(chrome_trace_file_name, total_length) + + return utilization, mm_conv_utilization diff --git a/MLPY/Lib/site-packages/torch/_functorch/compile_utils.py b/MLPY/Lib/site-packages/torch/_functorch/compile_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e62358d67b8c01115c3163887216005a92e7f4d8 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/compile_utils.py @@ -0,0 +1,97 @@ +# mypy: ignore-errors + + +import torch +import torch.fx as fx +from torch.utils._pytree import tree_flatten +from torch.utils import _pytree as pytree + +aten = torch.ops.aten + + +def get_aten_target(node): + if hasattr(node.target, 'overloadpacket'): + return node.target.overloadpacket + return node.target + + +rand_ops = [aten.dropout, aten._fused_dropout, aten._standard_gamma, + aten.bernoulli, aten.multinomial, aten.native_dropout, + aten.normal, aten.poisson, aten.binomial, aten.rrelu, + aten.rand_like, aten.rand, aten.randint, aten.randn, aten.randperm] + + +# return a new copy of torch.fx.graph.Graph with CSE applied to the input graph +def fx_graph_cse(fx_g: torch.fx.graph.Graph): + new_graph = fx.Graph() + env = {} # map from node in the old graph to node in the new graph + hash_env = {} # map from hash to a node in the new graph + token_map = {} # map from hash to token + for n in fx_g.nodes: + # The placeholder, output, and get_attr nodes are copied to the new graph without change + # do not CSE away random operations + if n.op == 'placeholder' or n.op == 'output' or n.op == 'get_attr' or get_aten_target(n) in rand_ops: + new_node = new_graph.node_copy(n, lambda x: env[x]) + env[n] = new_node + else: # n.op == 'call_function', should never see n.op == 'call_module' or 'call_method' + # substitute args and kwargs members to their mapping in env if exists + # specs can be used to reconstruct nested list/dictionaries + def substitute(arg_list): + arg_list, spec = 
tree_flatten(arg_list) + for i in range(len(arg_list)): + v = arg_list[i] + if isinstance(v, torch.fx.node.Node) and v in env: + arg_list[i] = env[v] + if isinstance(v, (torch.SymBool, torch.SymInt, torch.SymFloat)): + arg_list[i] = v.node + return tuple(arg_list), spec + args, args_spec = substitute(n.args) + kwargs, kwargs_spec = substitute(n.kwargs) + + # each token corresponds to a unique node + # nodes with the same token can be substituted + token = {"target": n.target, "args": args, "args_spec": args_spec, + "kwargs": kwargs, "kwargs_spec": kwargs_spec} + + # hash substituted args to a number, do not hash specs because specs are not hashable + # We need to add type into hash to avoid situations like: + # hash((primals_2, 1.0)) == hash((primals_2, 1)) + hash_arg = hash((tuple((a, type(a)) for a in args), tuple((a, type(a)) for a in kwargs))) + hash_val = (n.target, hash_arg) + + # check if a node has a substitute and can be eliminated + hash_val_in_hash_env = hash_val in hash_env + if hash_val_in_hash_env and token_map[hash_val] == token: + env[n] = hash_env[hash_val] + continue + + new_node = new_graph.node_copy(n, lambda x: env[x]) + env[n] = new_node + if not hash_val_in_hash_env: + hash_env[hash_val] = new_node + token_map[hash_val] = token + + return new_graph + + +def strip_overloads(gm): + """ + Modifies the target of graph nodes in :attr:`gm` to strip overloads. + + Args: + gm(fx.GraphModule): The input Fx graph module to be modified + """ + for node in gm.graph.nodes: + if isinstance(node.target, torch._ops.OpOverload): + node.target = node.target.overloadpacket + gm.recompile() + + +def get_placeholders(graph): + return list(filter(lambda x: x.op == 'placeholder', graph.nodes)) + +def get_outputs(graph): + for node in graph.nodes: + if node.op == 'output': + return pytree.tree_leaves(node.args[0]) + raise AssertionError("No output node found") diff --git a/MLPY/Lib/site-packages/torch/_functorch/compilers.py b/MLPY/Lib/site-packages/torch/_functorch/compilers.py new file mode 100644 index 0000000000000000000000000000000000000000..409b1bbb8f0979c6a7e3d1756d5bb31a1cbdaa79 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/compilers.py @@ -0,0 +1,441 @@ +# mypy: ignore-errors + +import copy +import logging +import os +import pickle +import random +from contextlib import contextmanager +from functools import partial +from typing import Callable, Union +import sympy + +import torch +from torch import SymInt +import torch.fx as fx +import torch.nn as nn +from torch._decomp import get_decompositions +from torch.fx.experimental.symbolic_shapes import bind_symbols + +from .aot_autograd import aot_function, aot_module, make_boxed_compiler +from .compile_utils import strip_overloads +from .partitioners import ( + default_partition, + draw_graph, + min_cut_rematerialization_partition, +) +import torch.utils._pytree as pytree + + +log = logging.getLogger(__name__) + + +# These canonicalizations are needed here (and not decompositions), as the ops +# we're trying to canonicalize to CompositeImplicitAutograd. 
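+#
+# As a rough illustration (a sketch, nothing in this module executes it): a
+# traced node recorded as
+#
+#     call_function  target=torch.ops.aten._to_copy  kwargs={'dtype': torch.float16}
+#
+# has its target rewritten to torch.ops.aten.to by _canonicalize below.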
+def _canonicalize(fx_g): + for node in fx_g.graph.nodes: + if node.target == torch.ops.aten._to_copy: + node.target = torch.ops.aten.to + fx_g.recompile() + return fx_g + + +@contextmanager +def _disable_jit_autocast(): + old_jit_autocast_flag = torch._C._jit_set_autocast_mode(False) + try: + yield + finally: + torch._C._jit_set_autocast_mode(old_jit_autocast_flag) + + +@make_boxed_compiler +def ts_compile(fx_g: fx.GraphModule, inps) -> Callable: + """ + Compiles the :attr:`fx_g` with Torchscript compiler. + + .. warning:: + This API is experimental and likely to change. + + Args: + fx_g(fx.GraphModule): The input Fx graph module to be compiled. + + Returns: + Torch scripted model. + """ + + with _disable_jit_autocast(): + strip_overloads(fx_g) + + for node in fx_g.graph.nodes: + if ( + node.target == torch.ops.aten._to_copy + and len(node.args) == 1 + and len(node.kwargs) == 1 + and "dtype" in node.kwargs + ): + node.target = torch.ops.aten.to + + for node in fx_g.graph.nodes: + new_kwargs = {} + for k, v in node.kwargs.items(): + if isinstance(v, torch.device): + v = v.type + new_kwargs[k] = v + node.kwargs = new_kwargs + + fx_g.graph.lint() + + fx_g.recompile() + + f = torch.jit.script(fx_g) + + torch._C._jit_pass_remove_mutation(f.graph) + + f = torch.jit.freeze(f.eval()) + f = torch.jit.optimize_for_inference(f) + if not any(isinstance(t, torch._subclasses.FakeTensor) for t in inps): + f(*inps) + return f + + +def _draw_graph_compile(fx_g, _, name, clear_meta=True): + print(fx_g.code) + draw_graph(fx_g, name, clear_meta=clear_meta) + return fx_g + + +def draw_graph_compile(name): + return make_boxed_compiler( + partial(_draw_graph_compile, name=name) + ) + + +@make_boxed_compiler +def nop(fx_g: fx.GraphModule, _) -> Callable: + """ + Returns the :attr:`fx_g` Fx graph module as it is. This is a no-op compiler + and can be used to check accuracy. + + .. warning:: + This API is experimental and likely to change. + + """ + return fx_g + +class DebugInterpreter(fx.Interpreter): + def run(self, *args): + self.symbol_mapping = bind_symbols(self.module, *args) + super().run(*args) + + def run_node(self, n): + + def subst_symint(ni): + if not isinstance(ni, SymInt): + return ni + r = sympy.expand(ni.node.expr.xreplace(self.symbol_mapping)) + assert r.is_number, r + return int(r) + + def subst_symint_tuple(nis): + return tuple(subst_symint(ni) for ni in nis) + + def check_significant_strides(a, b): + if subst_symint(a.numel()) > 0: + for idx in range(a.ndim): + if subst_symint(a.stride(idx)) != b.stride(idx) and subst_symint(a.size(idx)) > 1: + return False + return True + + def check(nv, rv, desc): + assert callable(desc) + assert nv.dtype == rv.dtype, f"{desc()}: {nv.dtype} != {rv.dtype}" + assert subst_symint_tuple(nv.size()) == rv.size(), \ + f"{desc()}: {nv.size()} aka {subst_symint_tuple(nv.size())} != {rv.size()}" + same_strides = check_significant_strides(nv, rv) + assert same_strides, f"{desc()}: {nv.stride()} aka {subst_symint_tuple(nv.stride())} != {rv.stride()}" + + r = super().run_node(n) + if 'val' in n.meta: + n_vals, n_spec = pytree.tree_flatten(n.meta['val']) + r_vals, r_spec = pytree.tree_flatten(r) + # TODO: There is some sort of problem where we record that an + # operator returned a tuple/list, and then later it turns out the + # real version of the operator returned a list/tuple. Need to + # figure out what's actually going on here, the error itself is + # harmless enough as we only getitem out the outputs. 
+ # assert n_spec == r_spec, f"{n_spec} != {r_spec}" + assert len(n_vals) == len(r_vals), f"{len(n_vals)} != {len(r_vals)}" + for i, nv, rv in zip(range(len(n_vals)), n_vals, r_vals): + if not isinstance(rv, torch.Tensor): + continue + check(nv, rv, lambda: f"output {i} where {self.symbol_mapping}") + return r + + +@make_boxed_compiler +def debug_nop(fx_g: fx.GraphModule, _) -> Callable: + """ + Returns a (slow) interpreter over the FX graph module that also checks + various debugging properties (e.g., that tracing strides matched real + strides.) + """ + return DebugInterpreter(fx_g).run + +@make_boxed_compiler +def simple_ts_compile(fx_g, _): + strip_overloads(fx_g) + f = torch.jit.script(fx_g) + f = torch.jit.freeze(f.eval()) + return f + + +def nnc_jit(f): + return aot_function(f, simple_ts_compile) + + +aten = torch.ops.aten +default_decompositions = { + aten.detach, + aten.gelu_backward, + aten.leaky_relu_backward, + aten.sigmoid_backward, + aten.threshold_backward, + aten.hardtanh_backward, + aten.hardsigmoid_backward, + aten.hardswish_backward, + aten.tanh_backward, + aten.silu_backward, + aten.elu_backward, + aten.cudnn_batch_norm, + aten.cudnn_batch_norm_backward, + aten.masked_fill.Scalar, + aten.masked_fill.Tensor, + aten.elu, + aten.leaky_relu, + aten.hardtanh, + aten.hardswish, + aten.hardsigmoid, + aten.conj_physical, + aten.is_same_size, +} + +default_decompositions = get_decompositions(default_decompositions) + + +@make_boxed_compiler +def print_compile(fx_g, _): + print(fx_g.code) + return fx_g + + +def memory_efficient_fusion( + fn: Union[Callable, nn.Module], + **kwargs, +): + """ + Wrapper function over :func:`aot_function` and :func:`aot_module` to perform + memory efficient fusion. It uses the + :func:`min_cut_rematerialization_partition` partitioner to perform efficient + recomputation. It uses NVFuser to compile the generated forward and backward + graphs. + + .. warning:: + This API is experimental and likely to change. + + Args: + fn (Union[Callable, nn.Module]): A Python function or a ``nn.Module`` + that takes one ore more arguments. Must return one or more Tensors. + **kwargs: Any other overrides you want to make to the settings + + Returns: + Returns a ``Callable`` or ``nn.Module`` that retains the eager behavior + of the original :attr:`fn`, but whose forward and backward graphs have + gone through recomputation optimizations, and the graphs have been + compiled with nvfuser. 
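+
+    Example (a minimal sketch, not a tested doctest; assumes a CUDA device and
+    that the TorchScript nvfuser backend is available):
+
+    >>> # xdoctest: +SKIP
+    >>> def f(x):
+    >>>     return torch.nn.functional.gelu(x).cos().sum()
+    >>>
+    >>> fused_f = memory_efficient_fusion(f)
+    >>> x = torch.randn(2 ** 16, device="cuda", requires_grad=True)
+    >>> fused_f(x).backward()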
+ + """ + config = { + "fw_compiler": ts_compile, + "bw_compiler": ts_compile, + "partition_fn": min_cut_rematerialization_partition, + "decompositions": default_decompositions, + } + config.update(kwargs) + if isinstance(fn, torch.nn.Module): + return aot_module(fn, **config) + else: + return aot_function(fn, **config) + + +def debug_compile(fx_g, inps): + fx_g.to_folder("foo") + print( + f""" +############################################################## +# To minimize FX graph, copy and paste the below and run it # +############################################################## + +import torch +import torch.fx as fx +from functorch.compile import minifier, check_nvfuser_subprocess, check_nvfuser_correctness_subprocess + +inps = {[(i.shape, i.dtype) for i in inps]} +inps = [torch.ones(shape, dtype=dtype, device='cuda') for (shape, dtype) in inps] +from foo import FxModule +mod = FxModule().cuda() + +with torch.jit.fuser("fuser2"): + # check_nvfuser_subprocess can be replaced with check_nvfuser_correctness_subprocess + minifier(fx.symbolic_trace(mod), inps, check_nvfuser_subprocess) +""" + ) + from foo import FxModule + + FxModule().cuda()(*inps) + + return ts_compile(fx_g, inps) + + +graph_index = 0 + + +def get_inputs(input_data_path): + """ + Return a random input for the given inputs meta generated from _save_fx_default. + """ + inputs = [] + with (open(input_data_path, "rb")) as f: + inputs_meta = pickle.load(f) + inputs = [] + for meta in inputs_meta: + if len(meta) == 1: + type = meta + input = type(random.rand()) + else: + type, shape, stride, dtype, device = meta + if dtype in { + torch.int, + torch.int32, + torch.int64, + torch.bool, + torch.int, + torch.uint8, + int, + float, + }: + input = torch.randint(0, 1, shape, dtype=dtype, device=device) + else: + input = torch.rand(shape, dtype=dtype, device=device) + inputs.append(input) + return inputs + + +def _save_fx_default(current_name, folder_name, dump_example_input, gm, example_inputs): + """ + The forward, backward, and joint computation graph will be stored in + {folder_name}/{current_name}/{current_name}_forward_{graph_index}, + {folder_name}/{current_name}/{current_name}_backward_{graph_index}, and + {folder_name}/{current_name}/{current_name}_joint_{graph_index} respectively. + The input shape of the graphs will be stored in the .input files. + These files can be loaded with pickle, + and is a list of format (type, shape, stride, dtype, device). + In the case of type = int or float, it is just (type,). + For joint graph input, it is a nested list [[],[]] + where the two inner lists have the same format. + If dump_example_input is True, example_inputs will be stored in .pt file. 
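+    The .input files can also be fed to get_inputs() above to synthesize
+    random inputs matching the recorded shapes, dtypes and devices.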
+ Since each function might produce multiple graphs, + the graph_index is used to distinguish difference graphs + """ + from functorch.compile import aot_module_simplified + + def get_input_meta(args): + input_meta = [] + if len(args) > 0 and isinstance(args[0], tuple): # joint input + input_meta += get_input_meta(args[0]) + input_meta += get_input_meta(args[1]) + return input_meta + for arg in args: + if type(arg) == int or type(arg) == float: + input_meta.append((type(arg),)) + else: + input_meta.append( + (type(arg), arg.shape, arg.stride(), arg.dtype, arg.device) + ) + return input_meta + + def graph_saver_helper(gm_to_save, args, type_name): + global graph_index + if len(gm_to_save.graph.nodes) == 0: + log.log( + logging.WARNING, + "No nodes in graph {%s}_{%s}_{%s}.", + current_name, + type_name, + graph_index, + ) + return + + gm = copy.deepcopy(gm_to_save) + gm.graph.set_codegen(torch.fx.graph.CodeGen()) # remove codegen + gm.recompile() + + input_meta = get_input_meta(args) + + os.makedirs(f"{folder_name}/{current_name}", exist_ok=True) + gm.to_folder( + f"{folder_name}/{current_name}/{current_name}_{type_name}_{graph_index}" + ) + pickle.dump( + input_meta, + open( + f"{folder_name}/{current_name}/{current_name}_{type_name}_{graph_index}/{current_name}_{type_name}_{graph_index}.input", # noqa: B950 + "wb", + ), + ) # noqa: E501 + if dump_example_input: + torch.save( + args, + f"{folder_name}/{current_name}/{current_name}_{type_name}_{graph_index}/{current_name}_{type_name}_{graph_index}.pt", # noqa: B950 + ) # noqa: E501 + + def graph_saver_forward(gm, fw_args): + graph_saver_helper(gm, fw_args, "forward") + return gm + + def graph_saver_backward(gm, bw_args): + graph_saver_helper(gm, bw_args, "backward") + global graph_index + graph_index += 1 + return gm + + def graph_saver_joint(gm, joint_args): + graph_saver_helper(gm, joint_args, "joint") + return default_partition(gm, joint_args) + + return aot_module_simplified( + gm, + example_inputs, + fw_compiler=graph_saver_forward, + bw_compiler=graph_saver_backward, + partition_fn=graph_saver_joint, + decompositions=default_decompositions, + ) + + +# WARNING: This isn't tested anywhere!! +def graph_dumper_aot(current_name, folder_name, dump_example_input=False): + """ + Dump the forward, backward, and joint computation graph. + Example Usage: + save_fx_func = graph_dumper_aot(current_name, folder_name, dump_example_input = False) + optimize_ctx = torchdynamo.optimize( + save_fx_func + ) + with torch.enable_grad(): + with optimize_ctx: + result = forward_and_backward_pass(model, example_inputs) + """ + global graph_index + graph_index = 0 + return partial(_save_fx_default, current_name, folder_name, dump_example_input) diff --git a/MLPY/Lib/site-packages/torch/_functorch/config.py b/MLPY/Lib/site-packages/torch/_functorch/config.py new file mode 100644 index 0000000000000000000000000000000000000000..0e24bdbc7f253a5bc8cb02296098d046441501f7 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/config.py @@ -0,0 +1,48 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Global flags for aot autograd +""" +import os +import sys +from typing import TYPE_CHECKING + +# Converts torch rng ops to their functional philox rng equivalents. Note that +# we functionalize only CUDA rng ops today. 
+functionalize_rng_ops = False + +# can be useful for debugging if we are incorrectly creating meta fake tensors +fake_tensor_allow_meta = os.environ.get("FAKE_ALLOW_META", True) + +# Enables optional asserts in hotpath code to check for errors. If +# you are seeing weird accuracy problems, try turning this on. +# This is currently off by default as it will harm tracing time, +# but it is on by default for aot_eager. +debug_assert = False + +debug_partitioner = os.environ.get("AOT_PARTITIONER_DEBUG", False) + +static_weight_shapes = True + +# Applies CSE to the graph before partitioning +cse = True + +# Restricts the amount of computation AOTAutograd can do. +max_dist_from_bw = 3 + +# Enable aggressive_recomputation in the min-cut algorithm in partitioners to reduce +# memory usage with some penalty of performance. It allows more ops to be considered +# as recomputable except random ops and compute-intensive ops. +aggressive_recomputation = False + +if TYPE_CHECKING: + from torch.utils._config_typing import * # noqa: F401, F403 + +from torch.utils._config_module import install_config_module + +# adds patch, save_config, invalid config checks, etc +install_config_module(sys.modules[__name__]) diff --git a/MLPY/Lib/site-packages/torch/_functorch/deprecated.py b/MLPY/Lib/site-packages/torch/_functorch/deprecated.py new file mode 100644 index 0000000000000000000000000000000000000000..bf90a602b74d700be2723ca9d731c6ac49d2eebb --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/deprecated.py @@ -0,0 +1,125 @@ +import torch._functorch.apis as apis +import torch._functorch.eager_transforms as _impl +import torch._functorch.make_functional as _nn_impl +from torch._functorch.vmap import in_dims_t, out_dims_t +from torch._functorch.eager_transforms import argnums_t +import torch.nn as nn +import textwrap +from typing import Any, Callable, Optional, Tuple, Union +import warnings + +""" +The APIs in this file are exposed as `functorch.*`. They are thin wrappers +around the torch.func.* APIs that have deprecation warnings -- we're trying +to move people to the torch.func.* equivalents. + +NB: We don't use *args, **kwargs in the signatures because that changes the +documentation. +""" + +def get_warning(api, new_api=None, replace_newlines=False): + if new_api is None: + new_api = f'torch.func.{api}' + warning = ( + f"We've integrated functorch into PyTorch. As the final step of the \n" + f"integration, functorch.{api} is deprecated as of PyTorch \n" + f"2.0 and will be deleted in a future version of PyTorch >= 2.3. \n" + f"Please use {new_api} instead; see the PyTorch 2.0 release notes \n" + f"and/or the torch.func migration guide for more details \n" + f"https://pytorch.org/docs/master/func.migrating.html" + ) + if replace_newlines: + warning = warning.replace("\n", "") + return warning + + +def warn_deprecated(api, new_api=None): + warning = get_warning(api, new_api, replace_newlines=True) + warnings.warn(warning, stacklevel=2) + + +def setup_docs(functorch_api, torch_func_api=None, new_api_name=None): + api_name = functorch_api.__name__ + if torch_func_api is None: + torch_func_api = getattr(_impl, api_name) + # See https://docs.python.org/3/using/cmdline.html#cmdoption-OO + if torch_func_api.__doc__ is None: + return + + warning = get_warning(api_name, new_api_name) + warning_note = "\n.. 
warning::\n\n" + textwrap.indent(warning, " ") + warning_note = textwrap.indent(warning_note, " ") + functorch_api.__doc__ = torch_func_api.__doc__ + warning_note + +def vmap( + func: Callable, + in_dims: in_dims_t = 0, + out_dims: out_dims_t = 0, + randomness: str = 'error', + *, + chunk_size=None) -> Callable: + warn_deprecated('vmap', 'torch.vmap') + return apis.vmap(func, in_dims, out_dims, randomness, chunk_size=chunk_size) + +def grad(func: Callable, argnums: argnums_t = 0, has_aux: bool = False) -> Callable: + warn_deprecated('grad') + return apis.grad(func, argnums, has_aux) + +def grad_and_value(func: Callable, argnums: argnums_t = 0, has_aux: bool = False) -> Callable: + warn_deprecated('grad_and_value') + return apis.grad_and_value(func, argnums, has_aux) + +def vjp(func: Callable, *primals, has_aux: bool = False): + warn_deprecated('vjp') + return _impl.vjp(func, *primals, has_aux=has_aux) + +def jvp(func: Callable, primals: Any, tangents: Any, *, strict: bool = False, has_aux: bool = False): + warn_deprecated('jvp') + return _impl.jvp(func, primals, tangents, strict=strict, has_aux=has_aux) + +def jacrev(func: Callable, argnums: Union[int, Tuple[int]] = 0, *, has_aux=False, + chunk_size: Optional[int] = None, + _preallocate_and_copy=False): + warn_deprecated('jacrev') + return _impl.jacrev(func, argnums, has_aux=has_aux, chunk_size=chunk_size, + _preallocate_and_copy=_preallocate_and_copy) + +def jacfwd(func: Callable, argnums: argnums_t = 0, has_aux: bool = False, *, randomness: str = "error"): + warn_deprecated('jacfwd') + return _impl.jacfwd(func, argnums, has_aux, randomness=randomness) + +def hessian(func, argnums=0): + warn_deprecated('hessian') + return _impl.hessian(func, argnums=argnums) + +def functionalize(func: Callable, *, remove: str = 'mutations') -> Callable: + warn_deprecated('functionalize') + return _impl.functionalize(func, remove=remove) + +def make_functional(model: nn.Module, disable_autograd_tracking: bool = False): + warn_deprecated('make_functional', 'torch.func.functional_call') + return _nn_impl.make_functional(model, disable_autograd_tracking) + +def make_functional_with_buffers(model: nn.Module, disable_autograd_tracking: bool = False): + warn_deprecated('make_functional_with_buffers', 'torch.func.functional_call') + return _nn_impl.make_functional_with_buffers(model, disable_autograd_tracking) + +def combine_state_for_ensemble(models): + warn_deprecated('combine_state_for_ensemble', 'torch.func.stack_module_state') + return _nn_impl.combine_state_for_ensemble(models) + +setup_docs(vmap, apis.vmap, 'torch.vmap') +setup_docs(grad, apis.grad) +setup_docs(grad_and_value, apis.grad_and_value) +setup_docs(vjp) +setup_docs(jvp) +setup_docs(jacrev) +setup_docs(jacfwd) +setup_docs(hessian) +setup_docs(functionalize) +setup_docs(make_functional, _nn_impl.make_functional, + 'torch.func.functional_call') +setup_docs(make_functional_with_buffers, _nn_impl.make_functional, + 'torch.func.functional_call') +setup_docs(combine_state_for_ensemble, _nn_impl.combine_state_for_ensemble, + 'torch.func.stack_module_state') diff --git a/MLPY/Lib/site-packages/torch/_functorch/eager_transforms.py b/MLPY/Lib/site-packages/torch/_functorch/eager_transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..ff9a3767e0751d9360f7e5cefacf790cbc715f0e --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/eager_transforms.py @@ -0,0 +1,1640 @@ +# mypy: ignore-errors + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Callable, Union, Tuple, List, Any, Optional +import torch +from functools import partial, wraps +import contextlib +from torch.utils._pytree import ( + tree_flatten, + tree_unflatten, + tree_map, + tree_map_only, + tree_map_, + treespec_pprint, +) +from torch.utils import _pytree as pytree +from torch.fx.experimental import const_fold +from torch.fx.experimental.proxy_tensor import make_fx +import torch.autograd.forward_ad as fwAD +from torch._subclasses.functional_tensor import FunctionalTensor + +from .vmap import doesnt_support_saved_tensors_hooks, get_chunk_sizes +from .apis import vmap + +from torch._C._functorch import ( + _wrap_for_grad, + _unwrap_for_grad, + _grad_increment_nesting, + _grad_decrement_nesting, + _jvp_increment_nesting, + _jvp_decrement_nesting, + _wrap_functional_tensor, + _unwrap_functional_tensor, + _func_decrement_nesting, + _func_increment_nesting, + _assert_wrapped_functional, + _propagate_functional_input_mutation, + set_inplace_requires_grad_allowed, + get_inplace_requires_grad_allowed, +) +from torch._functorch.utils import exposed_in, argnums_t + + +def lazy_dynamo_disable(func): + import torch._dynamo + return torch._dynamo.disable(func) + +@contextlib.contextmanager +def enable_inplace_requires_grad(enabled): + prev_state = get_inplace_requires_grad_allowed() + set_inplace_requires_grad_allowed(enabled) + try: + yield + finally: + set_inplace_requires_grad_allowed(prev_state) + + +def _vjp_treespec_compare(primals_out, cotangents): + # Revert this once #116264 gets fixed + _, primals_out_spec = tree_flatten(primals_out) + _, cotangents_spec = tree_flatten(cotangents) + # Dynamo fails to trace operator.ne below. To bypass this limitation, this + # function is not inlined. + if primals_out_spec != cotangents_spec: + raise RuntimeError( + f'Expected pytree structure of cotangents to be the same ' + f'as pytree structure of outputs to the function. 
' + f'cotangents: {treespec_pprint(cotangents_spec)}, ' + f'primal output: {treespec_pprint(primals_out_spec)}') + + +def _set_tensor_requires_grad(x): + # avoid graph-break on x.requires_grad_() + # https://github.com/pytorch/pytorch/pull/110053 + return x.requires_grad_() + +def _create_differentiable(inps, level=None): + def create_differentiable(x): + if isinstance(x, torch.Tensor): + with enable_inplace_requires_grad(True): + return _set_tensor_requires_grad(x) + raise ValueError(f'Thing passed to transform API must be Tensor, ' + f'got {type(x)}') + return tree_map(create_differentiable, inps) + + +def _undo_create_differentiable(inps, level=None): + def unwrap_tensors(x): + if isinstance(x, torch.Tensor): + return _unwrap_for_grad(x, level) + # TODO: Remove the following hack for namedtuples + if isinstance(x, tuple): + return tree_map(unwrap_tensors, tuple(x)) + + raise RuntimeError(f"Expected tensors, got unsupported type {type(x)}") + + return tree_map(unwrap_tensors, inps) + + +def _is_differentiable(maybe_tensor): + if not isinstance(maybe_tensor, torch.Tensor): + return False + return maybe_tensor.requires_grad + + +def _any_differentiable(tensor_or_tuple_of_tensors): + flat_args, _ = tree_unflatten(tensor_or_tuple_of_tensors) + return any(tuple(map(_is_differentiable, flat_args))) + + +def _wrap_tensor_for_grad(maybe_tensor, level): + if not isinstance(maybe_tensor, torch.Tensor): + return maybe_tensor + return _wrap_for_grad(maybe_tensor, level) + + +def _wrap_all_tensors(tensor_pytree, level): + return tree_map(partial(_wrap_tensor_for_grad, level=level), tensor_pytree) + + +def _as_tuple(val): + if isinstance(val, tuple): + return val + return (val,) + +# Version of autograd.grad that handles outputs that don't depend on inputs + + +def _autograd_grad(outputs, inputs, grad_outputs=None, retain_graph=False, create_graph=True): + if grad_outputs is None: + diff_outputs = tuple(out for out in outputs if out.requires_grad) + else: + result = tuple((out, go) for out, go in zip(outputs, grad_outputs) if out.requires_grad) + if len(result) == 0: + diff_outputs, grad_outputs = (), () + else: + diff_outputs, grad_outputs = zip(*result) + if len(diff_outputs) == 0: + return tuple(torch.zeros_like(inp) for inp in inputs) + grad_inputs = torch.autograd.grad(diff_outputs, inputs, grad_outputs, + retain_graph=retain_graph, + create_graph=create_graph, + allow_unused=True) + grad_inputs = tuple(torch.zeros_like(inp) if gi is None else gi + for gi, inp in zip(grad_inputs, inputs)) + return grad_inputs + +# NOTE [grad and vjp interaction with no_grad] +# +# def f(x): +# with torch.no_grad(): +# c = x ** 2 +# return x - c +# +# The thing to consider is if enable_grad is on/off before grad gets called. +# +# Case 1: enable_grad is on. +# grad(f)(x) +# In this case, `grad` should respect the inner torch.no_grad. +# +# Case 2: enable_grad is off +# with torch.no_grad(): +# grad(f)(x) +# In this case, `grad` should respect the inner torch.no_grad, but not the +# outer one. This is because `grad` is a "function transform": its result +# should not depend on the result of a context manager outside of `f`. +# +# This gives us the following desired behavior: +# - (nested) grad transforms must obey torch.no_grad inside them +# - (nested) grad transforms should not obey torch.no_grad outside them +# +# To achieve this behavior, upon entering grad/vjp: +# - we save the current ("previous") is_grad_enabled (*) +# - we unconditionally enable grad. 
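+#
+# A concrete illustration of the two cases above (values worked out by hand,
+# as a sketch), using the f defined at the top of this note:
+#
+#   x = torch.tensor(3.)
+#   grad(f)(x)                      # Case 1: tensor(1.), since c is constant
+#                                   # (if the inner no_grad were ignored, this
+#                                   # would be 1 - 2 * x = tensor(-5.))
+#   with torch.no_grad():
+#       grad(f)(x)                  # Case 2: also tensor(1.); the outer
+#                                   # no_grad does not disable the transform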
+# +# Inside DynamicLayerBackFallback, when we're temporarily popping `grad` layer +# off the stack: +# - if grad_mode is disabled, then we do nothing. (there is a torch.no_grad +# active, all subsequent grad transforms must obey it). +# - if grad_mode is enabled, and the previous is_grad_enabled (*) is False, +# then we temporarily restore the previous `is_grad_enabled`. This is +# because we're crossing the boundary from a `grad` outside the +# no_grad to a `grad` inside the no_grad. +# +# NB: vjp has some interesting behavior because the vjp's callable can be called +# under a different grad_mode than the forward computation... +# +# NB: forward-mode AD: forward-mode AD doesn't respect torch.no_grad, but +# it respects c10::AutoFwGradMode. We've implemented the same logic for +# our jvp transform (it will have special handling if FwGradMode is disabled). + + +# How do we increment and decrement the nesting? I don't think we can. +@exposed_in("torch.func") +def vjp(func: Callable, *primals, has_aux: bool = False): + """ + Standing for the vector-Jacobian product, returns a tuple containing the + results of ``func`` applied to ``primals`` and a function that, when + given ``cotangents``, computes the reverse-mode Jacobian of ``func`` with + respect to ``primals`` times ``cotangents``. + + Args: + func (Callable): A Python function that takes one or more arguments. Must + return one or more Tensors. + primals (Tensors): Positional arguments to ``func`` that must all be + Tensors. The returned function will also be computing the + derivative with respect to these arguments + has_aux (bool): Flag indicating that ``func`` returns a + ``(output, aux)`` tuple where the first element is the output of + the function to be differentiated and the second element is + other auxiliary objects that will not be differentiated. + Default: False. + + Returns: + Returns a ``(output, vjp_fn)`` tuple containing the output of ``func`` + applied to ``primals`` and a function that computes the vjp of + ``func`` with respect to all ``primals`` using the cotangents passed + to the returned function. If ``has_aux is True``, then instead returns a + ``(output, vjp_fn, aux)`` tuple. + The returned ``vjp_fn`` function will return a tuple of each VJP. 
+ + When used in simple cases, :func:`vjp` behaves the same as :func:`grad` + + >>> x = torch.randn([5]) + >>> f = lambda x: x.sin().sum() + >>> (_, vjpfunc) = torch.func.vjp(f, x) + >>> grad = vjpfunc(torch.tensor(1.))[0] + >>> assert torch.allclose(grad, torch.func.grad(f)(x)) + + However, :func:`vjp` can support functions with multiple outputs by + passing in the cotangents for each of the outputs + + >>> x = torch.randn([5]) + >>> f = lambda x: (x.sin(), x.cos()) + >>> (_, vjpfunc) = torch.func.vjp(f, x) + >>> vjps = vjpfunc((torch.ones([5]), torch.ones([5]))) + >>> assert torch.allclose(vjps[0], x.cos() + -x.sin()) + + :func:`vjp` can even support outputs being Python structs + + >>> x = torch.randn([5]) + >>> f = lambda x: {'first': x.sin(), 'second': x.cos()} + >>> (_, vjpfunc) = torch.func.vjp(f, x) + >>> cotangents = {'first': torch.ones([5]), 'second': torch.ones([5])} + >>> vjps = vjpfunc(cotangents) + >>> assert torch.allclose(vjps[0], x.cos() + -x.sin()) + + The function returned by :func:`vjp` will compute the partials with + respect to each of the ``primals`` + + >>> x, y = torch.randn([5, 4]), torch.randn([4, 5]) + >>> (_, vjpfunc) = torch.func.vjp(torch.matmul, x, y) + >>> cotangents = torch.randn([5, 5]) + >>> vjps = vjpfunc(cotangents) + >>> assert len(vjps) == 2 + >>> assert torch.allclose(vjps[0], torch.matmul(cotangents, y.transpose(0, 1))) + >>> assert torch.allclose(vjps[1], torch.matmul(x.transpose(0, 1), cotangents)) + + ``primals`` are the positional arguments for ``f``. All kwargs use their + default value + + >>> x = torch.randn([5]) + >>> def f(x, scale=4.): + >>> return x * scale + >>> + >>> (_, vjpfunc) = torch.func.vjp(f, x) + >>> vjps = vjpfunc(torch.ones_like(x)) + >>> assert torch.allclose(vjps[0], torch.full(x.shape, 4.)) + + .. note:: + Using PyTorch ``torch.no_grad`` together with ``vjp``. + Case 1: Using ``torch.no_grad`` inside a function: + + >>> def f(x): + >>> with torch.no_grad(): + >>> c = x ** 2 + >>> return x - c + + In this case, ``vjp(f)(x)`` will respect the inner ``torch.no_grad``. + + Case 2: Using ``vjp`` inside ``torch.no_grad`` context manager: + + >>> # xdoctest: +SKIP(failing) + >>> with torch.no_grad(): + >>> vjp(f)(x) + + In this case, ``vjp`` will respect the inner ``torch.no_grad``, but not the + outer one. This is because ``vjp`` is a "function transform": its result + should not depend on the result of a context manager outside of ``f``. + """ + return _vjp_with_argnums(func, *primals, has_aux=has_aux) + + +@contextlib.contextmanager +def grad_increment_nesting(): + try: + grad_level = _grad_increment_nesting() + yield grad_level + finally: + _grad_decrement_nesting() + + +@doesnt_support_saved_tensors_hooks +def _vjp_with_argnums(func: Callable, *primals, argnums: Optional[argnums_t] = None, has_aux: bool = False): + # This is the same function as vjp but also accepts an argnums argument + # All args are the same as vjp except for the added argument + # argnums (Optional[int or tuple[int]]): Optional, specifies the argument(s) to compute gradients with respect to. + # If None, computes the gradients with respect to all inputs (used for vjp). Default: None + # + # WARN: Users should NOT call this function directly and should just be calling vjp. + # It is only separated so that inputs passed to jacrev but not differentiated get the correct wrappers. 
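+    #
+    # For example (a sketch of the intended contract): with primals = (x, y)
+    # and argnums=1, only y is marked differentiable, and the returned vjp_fn
+    # produces just the VJP with respect to y; x is still wrapped for the
+    # grad transform but is treated as a constant.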
+ # + # NOTE: All error messages are produced as if vjp was being called, even if this was called by jacrev + # + # Returns the same two elements as :func:`vjp` but the function returned, vjp_fn, returns a tuple of VJPs + # for only the primal elements given by argnums. + with grad_increment_nesting() as level: + # See NOTE [grad and vjp interaction with no_grad] + with torch.enable_grad(): + primals = _wrap_all_tensors(primals, level) + # Note for the reviewer: This is extremely odd but it passes the + # assertion "len(self.block_stack) == 1" on symbolic_convert.py + # The equivalent "if argnums is None" fails for some reason + if not isinstance(argnums, int) and not argnums: + diff_primals = _create_differentiable(primals, level) + else: + diff_primals = _slice_argnums(primals, argnums, as_tuple=False) + tree_map_(partial(_create_differentiable, level=level), diff_primals) + primals_out = func(*primals) + + if has_aux: + if not (isinstance(primals_out, tuple) and len(primals_out) == 2): + raise RuntimeError( + "vjp(f, *primals): output of function f should be a tuple: (output, aux) " + "if has_aux is True" + ) + primals_out, aux = primals_out + aux = _undo_create_differentiable(aux, level) + + flat_primals_out, primals_out_spec = tree_flatten(primals_out) + assert_non_empty_tensor_output(flat_primals_out, 'vjp(f, *primals)') + flat_diff_primals, primals_spec = tree_flatten(diff_primals) + results = _undo_create_differentiable(primals_out, level) + + for primal_out in flat_primals_out: + assert isinstance(primal_out, torch.Tensor) + if primal_out.is_floating_point() or primal_out.is_complex(): + continue + raise RuntimeError("vjp(f, ...): All outputs of f must be " + "floating-point or complex Tensors, got Tensor " + f"with dtype {primal_out.dtype}") + + def wrapper(cotangents, retain_graph=True, create_graph=None): + if create_graph is None: + create_graph = torch.is_grad_enabled() + flat_cotangents, cotangents_spec = tree_flatten(cotangents) + _vjp_treespec_compare(primals_out, cotangents) + result = _autograd_grad(flat_primals_out, flat_diff_primals, flat_cotangents, + retain_graph=retain_graph, create_graph=create_graph) + return tree_unflatten(result, primals_spec) + + if has_aux: + return results, wrapper, aux + else: + return results, wrapper + + +def _safe_zero_index(x): + assert len(x) == 1 + return x[0] + +# jacrev and jacfwd don't support complex functions +# Helper function to throw appropriate error. +def error_if_complex(func_name, args, is_input): + flat_args = pytree.tree_leaves(args) + for idx, arg in enumerate(flat_args): + if isinstance(arg, torch.Tensor) and arg.dtype.is_complex: + input_or_output = ("inputs" if is_input else "outputs") + err_msg = (f"{func_name}: Expected all {input_or_output} " + f"to be real but received complex tensor at flattened input idx: {idx}") + raise RuntimeError(err_msg) + +@exposed_in("torch.func") +def jacrev(func: Callable, argnums: Union[int, Tuple[int]] = 0, *, has_aux=False, + chunk_size: Optional[int] = None, + _preallocate_and_copy=False): + """ + Computes the Jacobian of ``func`` with respect to the arg(s) at index + ``argnum`` using reverse mode autodiff + + .. note:: + Using :attr:`chunk_size=1` is equivalent to computing the jacobian + row-by-row with a for-loop i.e. the constraints of :func:`vmap` are + not applicable. 
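+
+        As a rough illustration of the trade-off: for an output with 1000
+        elements, ``chunk_size=100`` assembles the Jacobian from 10 vmapped
+        vjp calls over 100 basis vectors each, instead of one vmap over all
+        1000 at once.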
+ + Args: + func (function): A Python function that takes one or more arguments, + one of which must be a Tensor, and returns one or more Tensors + argnums (int or Tuple[int]): Optional, integer or tuple of integers, + saying which arguments to get the Jacobian with respect to. + Default: 0. + has_aux (bool): Flag indicating that ``func`` returns a + ``(output, aux)`` tuple where the first element is the output of + the function to be differentiated and the second element is + auxiliary objects that will not be differentiated. + Default: False. + chunk_size (None or int): If None (default), use the maximum chunk size + (equivalent to doing a single vmap over vjp to compute the jacobian). + If 1, then compute the jacobian row-by-row with a for-loop. + If not None, then compute the jacobian :attr:`chunk_size` rows at a time + (equivalent to doing multiple vmap over vjp). If you run into memory issues computing + the jacobian, please try to specify a non-None chunk_size. + + Returns: + Returns a function that takes in the same inputs as ``func`` and + returns the Jacobian of ``func`` with respect to the arg(s) at + ``argnums``. If ``has_aux is True``, then the returned function + instead returns a ``(jacobian, aux)`` tuple where ``jacobian`` + is the Jacobian and ``aux`` is auxiliary objects returned by ``func``. + + A basic usage with a pointwise, unary operation will give a diagonal array + as the Jacobian + + >>> from torch.func import jacrev + >>> x = torch.randn(5) + >>> jacobian = jacrev(torch.sin)(x) + >>> expected = torch.diag(torch.cos(x)) + >>> assert torch.allclose(jacobian, expected) + + If you would like to compute the output of the function as well as the + jacobian of the function, use the ``has_aux`` flag to return the output + as an auxiliary object: + + >>> from torch.func import jacrev + >>> x = torch.randn(5) + >>> + >>> def f(x): + >>> return x.sin() + >>> + >>> def g(x): + >>> result = f(x) + >>> return result, result + >>> + >>> jacobian_f, f_x = jacrev(g, has_aux=True)(x) + >>> assert torch.allclose(f_x, f(x)) + + :func:`jacrev` can be composed with vmap to produce batched + Jacobians: + + >>> from torch.func import jacrev, vmap + >>> x = torch.randn(64, 5) + >>> jacobian = vmap(jacrev(torch.sin))(x) + >>> assert jacobian.shape == (64, 5, 5) + + Additionally, :func:`jacrev` can be composed with itself to produce + Hessians + + >>> from torch.func import jacrev + >>> def f(x): + >>> return x.sin().sum() + >>> + >>> x = torch.randn(5) + >>> hessian = jacrev(jacrev(f))(x) + >>> assert torch.allclose(hessian, torch.diag(-x.sin())) + + By default, :func:`jacrev` computes the Jacobian with respect to the first + input. However, it can compute the Jacboian with respect to a different + argument by using ``argnums``: + + >>> from torch.func import jacrev + >>> def f(x, y): + >>> return x + y ** 2 + >>> + >>> x, y = torch.randn(5), torch.randn(5) + >>> jacobian = jacrev(f, argnums=1)(x, y) + >>> expected = torch.diag(2 * y) + >>> assert torch.allclose(jacobian, expected) + + Additionally, passing a tuple to ``argnums`` will compute the Jacobian + with respect to multiple arguments + + >>> from torch.func import jacrev + >>> def f(x, y): + >>> return x + y ** 2 + >>> + >>> x, y = torch.randn(5), torch.randn(5) + >>> jacobian = jacrev(f, argnums=(0, 1))(x, y) + >>> expectedX = torch.diag(torch.ones_like(x)) + >>> expectedY = torch.diag(2 * y) + >>> assert torch.allclose(jacobian[0], expectedX) + >>> assert torch.allclose(jacobian[1], expectedY) + + .. 
note:: + Using PyTorch ``torch.no_grad`` together with ``jacrev``. + Case 1: Using ``torch.no_grad`` inside a function: + + >>> def f(x): + >>> with torch.no_grad(): + >>> c = x ** 2 + >>> return x - c + + In this case, ``jacrev(f)(x)`` will respect the inner ``torch.no_grad``. + + Case 2: Using ``jacrev`` inside ``torch.no_grad`` context manager: + + >>> with torch.no_grad(): + >>> jacrev(f)(x) + + In this case, ``jacrev`` will respect the inner ``torch.no_grad``, but not the + outer one. This is because ``jacrev`` is a "function transform": its result + should not depend on the result of a context manager outside of ``f``. + """ + if not (chunk_size is None or chunk_size > 0): + raise ValueError("jacrev: `chunk_size` should be greater than 0.") + + @wraps(func) + def wrapper_fn(*args): + error_if_complex("jacrev", args, is_input=True) + vjp_out = _vjp_with_argnums(func, *args, argnums=argnums, has_aux=has_aux) + if has_aux: + output, vjp_fn, aux = vjp_out + else: + output, vjp_fn = vjp_out + + # See NOTE: [Computing jacobian with vmap and vjp for multiple outputs] + flat_output, output_spec = tree_flatten(output) + + error_if_complex("jacrev", flat_output, is_input=False) + + # NB: vjp already checks that all outputs are tensors + # Step 1: Construct grad_outputs by splitting the standard basis + flat_output_numels = tuple(out.numel() for out in flat_output) + + primals = _slice_argnums(args, argnums) + flat_primals, primals_spec = tree_flatten(primals) + + def compute_jacobian_stacked(): + # Helper function to compute chunked Jacobian + # The intermediate chunked calculation are only + # scoped at this function level. + chunked_results = [] + for flat_basis_chunk in _chunked_standard_basis_for_(flat_output, + flat_output_numels, + chunk_size=chunk_size): + if chunk_size == 1: + # sanity check. + for t in flat_basis_chunk: + assert t.size(0) == 1 + + flat_basis_chunk = tree_map(lambda t: torch.squeeze(t, 0), flat_basis_chunk) + + basis = tree_unflatten(flat_basis_chunk, output_spec) + + if chunk_size == 1: + # Behaviour with `chunk_size=1` is same as `for-loop` + # i.e. user shouldn't deal with the limitations of vmap. + chunked_result = vjp_fn(basis) + else: # chunk_size is None or chunk_size != 1 + chunked_result = vmap(vjp_fn)(basis) + + flat_results = pytree.tree_leaves(chunked_result) + + if chunk_size == 1: + flat_results = tree_map(lambda t: torch.unsqueeze(t, 0), flat_results) + + chunked_results.append(flat_results) + + if len(chunked_results) == 1: + # Short-circuit if we used a single chunk + return chunked_results[0] + + # Concatenate chunks. + flat_results = [] + # Iterate and concat the jacobians of different + # inputs. + for idx in range(len(flat_primals)): + r = tuple(r_[idx] for r_ in chunked_results) + flat_results.append(torch.cat(r, 0)) + + return flat_results + + def compute_jacobian_preallocate_and_copy(): + # Helper function to compute chunked Jacobian + # The intermediate chunked calculation are only + # scoped at this function level. + out_vec_size = sum(flat_output_numels) + + # Don't pre-allocate if we have a single chunk. + if not (chunk_size is None or chunk_size >= out_vec_size): + stacked_results = [primal.new_zeros(out_vec_size, *primal.shape) for primal in flat_primals] + + for idx, flat_basis_chunk in enumerate(_chunked_standard_basis_for_(flat_output, + flat_output_numels, + chunk_size=chunk_size)): + if chunk_size == 1: + # sanity check. 
+ for t in flat_basis_chunk: + assert t.size(0) == 1 + + flat_basis_chunk = [torch.squeeze(t, 0) for t in flat_basis_chunk] + + basis = tree_unflatten(flat_basis_chunk, output_spec) + + if chunk_size == 1: + # Behaviour with `chunk_size=1` is same as `for-loop` + # i.e. user shouldn't deal with the limitations of vmap. + chunked_result = vjp_fn(basis) + else: # chunk_size is None or chunk_size != 1 + chunked_result = vmap(vjp_fn)(basis) + + flat_results = pytree.tree_leaves(chunked_result) + + # Short-circuit if we have a single chunk. + if chunk_size is None or chunk_size >= out_vec_size: + if chunk_size == 1: # and out_vec_size == 1 + # Since we squeezed the output dim + flat_results = tree_map(lambda t: torch.unsqueeze(t, 0), flat_results) + return flat_results + + for r, sr in zip(flat_results, stacked_results): + sr[idx * chunk_size: (idx + 1) * chunk_size].copy_(r) + + return stacked_results + + if _preallocate_and_copy: + flat_jacobians_per_input = compute_jacobian_preallocate_and_copy() + else: + flat_jacobians_per_input = compute_jacobian_stacked() + + # Step 2: The returned jacobian is one big tensor per input. In this step, + # we split each Tensor by output. + flat_jacobians_per_input = [result.split(flat_output_numels, dim=0) for result in flat_jacobians_per_input] + flat_input_flat_output = [ + tuple(split.view(out.shape + primal.shape) + for split, out in zip(splits, flat_output)) + for splits, primal in zip(flat_jacobians_per_input, flat_primals) + ] + + # Step 3: Right now, `jacobian` is a List[List[Tensor]]. + # The outer List corresponds to the number of primals, + # the inner List corresponds to the number of outputs. + # We need to: + # a. Exchange the order of the outer List and inner List + # b. tree_unflatten the inner Lists (which correspond to the primals) + # c. handle the argnums=int case + # d. tree_unflatten the outer List (which corresponds to the outputs) + flat_output_flat_input = tuple(zip(*flat_input_flat_output)) + + flat_output_input = tuple(tree_unflatten(flat_input, primals_spec) + for flat_input in flat_output_flat_input) + + if isinstance(argnums, int): + flat_output_input = tuple(_safe_zero_index(flat_input) + for flat_input in flat_output_input) + output_input = tree_unflatten(flat_output_input, output_spec) + if has_aux: + return output_input, aux + return output_input + return wrapper_fn + +# NOTE: [Computing jacobian with vmap and vjp for multiple outputs] +# +# Let's consider f(x) = (x**2, x.sum()) and let x = torch.randn(3). +# It turns out we can compute the jacobian of this function with a single +# call to autograd.grad by using vmap over the correct grad_outputs. +# +# Firstly, one way to compute the jacobian is to stack x**2 and x.sum() +# into a 4D vector. E.g., use g(x) = torch.stack([x**2, x.sum()]) +# +# To get the first row of the jacobian, we call +# >>> autograd.grad(g(x), x, grad_outputs=torch.tensor([1, 0, 0, 0])) +# To get the 2nd row of the jacobian, we call +# >>> autograd.grad(g(x), x, grad_outputs=torch.tensor([0, 1, 0, 0])) +# and so on. +# +# Using vmap, we can vectorize all 4 of these computations into one by +# passing the standard basis for R^4 as the grad_output. +# vmap(partial(autograd.grad, g(x), x))(torch.eye(4)). +# +# Now, how do we compute the jacobian *without stacking the output*? +# We can just split the standard basis across the outputs. 
So to +# compute the jacobian of f(x), we'd use +# >>> autograd.grad(f(x), x, grad_outputs=_construct_standard_basis_for(...)) +# The grad_outputs looks like the following: +# ( torch.tensor([[1, 0, 0], +# [0, 1, 0], +# [0, 0, 1], +# [0, 0, 0]]), +# torch.tensor([[0], +# [0], +# [0], +# [1]]) ) +# +# But we're not done yet! +# >>> vmap(partial(autograd.grad(f(x), x, grad_outputs=...))) +# returns a Tensor of shape [4, 3]. We have to remember to split the +# jacobian of shape [4, 3] into two: +# - one of shape [3, 3] for the first output +# - one of shape [ 3] for the second output + + +def _chunked_standard_basis_for_(tensors, tensor_numels, chunk_size=None): + # This function: + # - constructs a N=sum(tensor_numels) standard basis. i.e. an NxN identity matrix. + # - Splits the identity matrix into chunks with each chunk size determined by `tensor_numels`. + # - Each chunk corresponds to one tensor. The chunk has the same dtype and + # device as the tensor + # + # For example, with tensor_numels = [1, 2, 1], this function returns: + # ( tensor([[1], tensor([[0, 0], tensor([[0], + # [0], [1, 0], [0], + # [0], [0, 1], [0], + # [0]]) , [0, 0]]) , [1]]) ) + # + # Precondition: tensor_numels == tuple(tensor.numel() for tensor in tensors) + # Precondition: tensors always has at least one element. + # + # See NOTE: [Computing jacobian with vmap and grad for multiple tensors] + # for context behind this function. + # NOTE: Argument `chunk_size` is used to generate chunked basis instead of + # one huge basis matrix. `chunk_size` dictates the maximum size of the + # basis matrix along dim=0. + assert len(tensors) == len(tensor_numels) + assert len(tensors) > 0 + assert chunk_size is None or chunk_size > 0 + total_numel = sum(tensor_numels) + if chunk_size and chunk_size < total_numel: + chunk_numels = get_chunk_sizes(total_numel, chunk_size) + else: # chunk_size is None or chunk_size >= total_numel + chunk_size = total_numel + chunk_numels = [total_numel] + + diag_start_indices = (0, *torch.tensor(tensor_numels).cumsum(dim=0)[:-1].neg().unbind()) + + for chunk_idx, total_numel in enumerate(chunk_numels): + chunks = tuple(tensor.new_zeros(total_numel, tensor_numel) + for tensor, tensor_numel in zip(tensors, tensor_numels)) + + for chunk, diag_start_idx in zip(chunks, diag_start_indices): + chunk.diagonal(diag_start_idx + chunk_idx * chunk_size).fill_(1) + chunks = tuple(chunk.view(total_numel, *tensor.shape) + for chunk, tensor in zip(chunks, tensors)) + yield chunks + +def _construct_standard_basis_for(tensors, tensor_numels): + for basis in _chunked_standard_basis_for_(tensors, tensor_numels, chunk_size=None): + return basis + + +def _validate_and_wrap_argnum(argnum, num_args): + if not isinstance(argnum, int): + raise RuntimeError(f'argnum must be int, got: {type(argnum)}') + if argnum >= 0 and argnum < num_args: + return argnum + if argnum < 0 and argnum >= -num_args: + return argnum + num_args + raise RuntimeError(f'Got argnum={argnum}, but only {num_args} positional inputs') + + +def _check_unique_non_empty(argnums): + if isinstance(argnums, tuple): + if len(argnums) == 0: + raise RuntimeError("argnums must be non-empty") + if len(set(argnums)) != len(argnums): + raise RuntimeError(f"argnums elements must be unique, got {argnums}") + + +def _replace_args(old_args, new_args, argnums): + if isinstance(argnums, int): + if len(new_args) != 1: + raise RuntimeError(f'new_args should be of size 1, was of size {len(new_args)}') + return tuple(new_args[0] if i == argnums else old_args[i] for i in 
range(len(old_args))) + if isinstance(argnums, tuple): + if len(new_args) != len(argnums): + raise RuntimeError( + "new_args should have the same size as argnums. " + f"Argnums size {len(argnums)}, new_args size {len(new_args)}") + + def get_right_elem(i): + return new_args[argnums.index(i)] if i in argnums else old_args[i] + + return tuple(get_right_elem(i) for i in range(len(old_args))) + raise RuntimeError(f'argnums must be int or Tuple[int, ...], got: {type(argnums)}') + + +def _validate_and_wrap_argnums(argnums, num_args): + if isinstance(argnums, int): + return _validate_and_wrap_argnum(argnums, num_args) + if isinstance(argnums, tuple): + return tuple(_validate_and_wrap_argnum(argnum, num_args) for argnum in argnums) + raise AssertionError("Should never get here") + + +def _slice_argnums(args, argnums, as_tuple=True): + if not isinstance(argnums, int) and not isinstance(argnums, tuple): + raise RuntimeError(f'argnums must be int or Tuple[int, ...], got: {type(argnums)}') + argnums = _validate_and_wrap_argnums(argnums, len(args)) + _check_unique_non_empty(argnums) + if isinstance(argnums, int): + if as_tuple: + return (args[argnums],) + else: + return args[argnums] + return tuple(args[i] for i in argnums) + + +JVP_NESTING = 0 + + +@contextlib.contextmanager +def noop(): + yield + + +def assert_flat_tuple_of_tensors(elts: Any, api: str, argname: str) -> None: + if not isinstance(elts, tuple): + raise RuntimeError( + f'{api}: Expected {argname} to be a tuple of Tensors, got {type(elts)}') + for elt in elts: + if isinstance(elt, torch.Tensor): + continue + raise RuntimeError( + f'{api}: Expected {argname} to be a tuple of Tensors, got ' + f'a tuple with an element of type {type(elt)}') + if len(elts) == 0: + raise RuntimeError( + f'{api}: Expected {argname} to be a non-empty tuple of Tensors.') + + +def assert_non_empty_tensor_output(output: List[Any], api: str) -> None: + if (len(output) == 1 and output[0] is None) or len(output) < 1: + raise RuntimeError( + f'{api}: Expected f to be a function that has non-empty output (got output = {output})' + ) + for o in output: + if not isinstance(o, torch.Tensor): + raise RuntimeError( + f'{api}: expected f(*primals) to return only tensors' + f', got unsupported type {type(o)}' + ) + + +def assert_output_is_tensor_or_tensors(output: Any, api: str) -> None: + if isinstance(output, torch.Tensor): + return + if not isinstance(output, tuple): + raise RuntimeError( + f'{api}: Expected output of f to be a Tensor or Tensors, got ' + f'{type(output)}') + if len(output) == 0: + raise RuntimeError( + f'{api}: Expected output of f to be a non-empty tuple of Tensors.') + for out in output: + if isinstance(out, torch.Tensor): + continue + raise RuntimeError( + f'{api}: Expected output of f to be a Tensor or Tensors, got ' + f'{type(out)} as an output') + + +def assert_non_empty_list_of_tensors(output: List[torch.Tensor], api: str, argname: str) -> None: + if len(output) == 0: + raise RuntimeError( + f'{api}: Expected {argname} to contain at least one Tensor.') + for out in output: + if isinstance(out, torch.Tensor): + continue + raise RuntimeError( + f'{api}: Expected {argname} to only contain Tensors, got ' + f'{type(out)}') + + +jvp_str = 'jvp(f, primals, tangents)' + + +def safe_unpack_dual(dual, strict): + if not isinstance(dual, torch.Tensor): + raise RuntimeError( + f'{jvp_str}: expected f(*args) to return only tensors' + f', got unsupported type {type(dual)}' + ) + + primal, tangent = fwAD.unpack_dual(dual) + if tangent is None: + if strict: + raise 
RuntimeError( + 'jvp(f, primals, tangents, strict=True): ' + 'The output of f is independent of ' + 'the inputs. This is not allowed with strict=True.') + tangent = torch.zeros_like(primal) + return primal, tangent + + +@exposed_in("torch.func") +def jvp(func: Callable, primals: Any, tangents: Any, *, strict: bool = False, has_aux: bool = False): + """ + Standing for the Jacobian-vector product, returns a tuple containing + the output of `func(*primals)` and the "Jacobian of ``func`` evaluated at + ``primals``" times ``tangents``. This is also known as forward-mode autodiff. + + Args: + func (function): A Python function that takes one or more arguments, + one of which must be a Tensor, and returns one or more Tensors + primals (Tensors): Positional arguments to ``func`` that must all be + Tensors. The returned function will also be computing the + derivative with respect to these arguments + tangents (Tensors): The "vector" for which Jacobian-vector-product is + computed. Must be the same structure and sizes as the inputs to + ``func``. + has_aux (bool): Flag indicating that ``func`` returns a + ``(output, aux)`` tuple where the first element is the output of + the function to be differentiated and the second element is + other auxiliary objects that will not be differentiated. + Default: False. + + Returns: + Returns a ``(output, jvp_out)`` tuple containing the output of ``func`` + evaluated at ``primals`` and the Jacobian-vector product. + If ``has_aux is True``, then instead returns a ``(output, jvp_out, aux)`` tuple. + + .. note:: + You may see this API error out with "forward-mode AD not implemented + for operator X". If so, please file a bug report and we will prioritize it. + + jvp is useful when you wish to compute gradients of a function R^1 -> R^N + + >>> from torch.func import jvp + >>> x = torch.randn([]) + >>> f = lambda x: x * torch.tensor([1., 2., 3]) + >>> value, grad = jvp(f, (x,), (torch.tensor(1.),)) + >>> assert torch.allclose(value, f(x)) + >>> assert torch.allclose(grad, torch.tensor([1., 2, 3])) + + :func:`jvp` can support functions with multiple inputs by passing in the + tangents for each of the inputs + + >>> from torch.func import jvp + >>> x = torch.randn(5) + >>> y = torch.randn(5) + >>> f = lambda x, y: (x * y) + >>> _, output = jvp(f, (x, y), (torch.ones(5), torch.ones(5))) + >>> assert torch.allclose(output, x + y) + + """ + + return _jvp_with_argnums(func, primals, tangents, argnums=None, strict=strict, has_aux=has_aux) + + +@doesnt_support_saved_tensors_hooks +def _jvp_with_argnums(func: Callable, primals: Any, tangents: Any, argnums: Optional[argnums_t], *, + strict: bool = False, has_aux: bool): + # This is the same function as jvp but also accepts an argnums argument + # Most args are the same as jvp except for the added argument + # argnums (Optional[int or tuple[int]]): Optional, specifies the argument(s) to compute gradients with respect to. + # If None, computes the gradients with respect to all inputs (used for jvp). Default: None + # Because of this, tangents must be of length argnums and matches up to the corresponding primal whose index is + # given by argnums + # + # WARN: Users should NOT call this function directly and should just be calling jvp. + # It is only separated so that inputs passed to jacfwd but not differentiated get the correct wrappers. 
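+    #
+    # (Illustrative example, not part of the original notes: with primals=(x, y) and
+    # argnums=1, `tangents` must be a 1-tuple whose single entry lines up with y in
+    # structure and shape.)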
+ # + # NOTE: All error messages are produced as if jvp was being called, even if this was called by jacfwd + # + # Returns the same two elements as :func:`jvp` but the returned tuple, ``jvp_out``, only has JVPs with respect to + # the primals given by argnums + if not isinstance(primals, tuple): + raise RuntimeError( + f'{jvp_str}: Expected primals to be a tuple. ' + f'E.g. it should be valid to call f(*primals).') + diff_args = primals if argnums is None else _slice_argnums(primals, argnums) + flat_primals, primals_spec = tree_flatten(diff_args) + flat_tangents, tangents_spec = tree_flatten(tangents) + if primals_spec != tangents_spec: + raise RuntimeError( + f'{jvp_str}: Expected primals and tangents to have the same python ' + f'structure. For example, if primals is a tuple of 3 tensors, ' + f'tangents also must be. Got primals with structure {primals_spec} ' + f'and tangents with structure {tangents_spec}') + assert_non_empty_list_of_tensors(flat_primals, jvp_str, 'primals') + assert_non_empty_list_of_tensors(flat_tangents, jvp_str, 'tangents') + + level = _jvp_increment_nesting() + try: + global JVP_NESTING + JVP_NESTING += 1 + with fwAD._set_fwd_grad_enabled(True): + ctx = fwAD.dual_level if JVP_NESTING == 1 else noop + with ctx(): + flat_duals = tuple(fwAD.make_dual(p, t) + for p, t in zip(flat_primals, flat_tangents)) + duals = tree_unflatten(flat_duals, primals_spec) + if argnums is not None: + primals = _wrap_all_tensors(primals, level) + duals = _replace_args(primals, duals, argnums) + result_duals = func(*duals) + if has_aux: + if not (isinstance(result_duals, tuple) and len(result_duals) == 2): + raise RuntimeError( + f"{jvp_str}: output of function f should be a tuple: (output, aux) " + "if has_aux is True" + ) + result_duals, aux = result_duals + aux = _undo_create_differentiable(aux, level) + + result_duals, spec = tree_flatten(result_duals) + assert_non_empty_tensor_output(result_duals, jvp_str) + + primals_out, tangents_out = \ + zip(*[safe_unpack_dual(dual, strict) for dual in result_duals]) + primals_out = tree_map( + partial(_undo_create_differentiable, level=level), primals_out) + tangents_out = tree_map( + partial(_undo_create_differentiable, level=level), tangents_out) + + primals_out_unflatten = tree_unflatten(primals_out, spec) + tangents_out_unflatten = tree_unflatten(tangents_out, spec) + if has_aux: + return primals_out_unflatten, tangents_out_unflatten, aux + + return primals_out_unflatten, tangents_out_unflatten + finally: + _jvp_decrement_nesting() + JVP_NESTING -= 1 + + +def safe_unflatten(tensor, dim, shape): + if len(shape) == 0: + assert tensor.shape[dim] == 1 + return tensor.squeeze(dim) + return tensor.unflatten(dim, shape) + + +@exposed_in("torch.func") +def jacfwd(func: Callable, argnums: argnums_t = 0, has_aux: bool = False, *, randomness: str = "error"): + """ + Computes the Jacobian of ``func`` with respect to the arg(s) at index + ``argnum`` using forward-mode autodiff + + Args: + func (function): A Python function that takes one or more arguments, + one of which must be a Tensor, and returns one or more Tensors + argnums (int or Tuple[int]): Optional, integer or tuple of integers, + saying which arguments to get the Jacobian with respect to. + Default: 0. + has_aux (bool): Flag indicating that ``func`` returns a + ``(output, aux)`` tuple where the first element is the output of + the function to be differentiated and the second element is + auxiliary objects that will not be differentiated. + Default: False. 
+        randomness(str): Flag indicating what type of randomness to use.
+            See :func:`vmap` for more detail. Allowed: "different", "same", "error".
+            Default: "error"
+
+    Returns:
+        Returns a function that takes in the same inputs as ``func`` and
+        returns the Jacobian of ``func`` with respect to the arg(s) at
+        ``argnums``. If ``has_aux is True``, then the returned function
+        instead returns a ``(jacobian, aux)`` tuple where ``jacobian``
+        is the Jacobian and ``aux`` is auxiliary objects returned by ``func``.
+
+    .. note::
+        You may see this API error out with "forward-mode AD not implemented
+        for operator X". If so, please file a bug report and we will prioritize it.
+        An alternative is to use :func:`jacrev`, which has better operator coverage.
+
+    A basic usage with a pointwise, unary operation will give a diagonal array
+    as the Jacobian
+
+        >>> from torch.func import jacfwd
+        >>> x = torch.randn(5)
+        >>> jacobian = jacfwd(torch.sin)(x)
+        >>> expected = torch.diag(torch.cos(x))
+        >>> assert torch.allclose(jacobian, expected)
+
+    :func:`jacfwd` can be composed with vmap to produce batched
+    Jacobians:
+
+        >>> from torch.func import jacfwd, vmap
+        >>> x = torch.randn(64, 5)
+        >>> jacobian = vmap(jacfwd(torch.sin))(x)
+        >>> assert jacobian.shape == (64, 5, 5)
+
+    If you would like to compute the output of the function as well as the
+    jacobian of the function, use the ``has_aux`` flag to return the output
+    as an auxiliary object:
+
+        >>> from torch.func import jacfwd
+        >>> x = torch.randn(5)
+        >>>
+        >>> def f(x):
+        >>>     return x.sin()
+        >>>
+        >>> def g(x):
+        >>>     result = f(x)
+        >>>     return result, result
+        >>>
+        >>> jacobian_f, f_x = jacfwd(g, has_aux=True)(x)
+        >>> assert torch.allclose(f_x, f(x))
+
+    Additionally, :func:`jacfwd` can be composed with itself or with :func:`jacrev`
+    to produce Hessians
+
+        >>> from torch.func import jacfwd, jacrev
+        >>> def f(x):
+        >>>     return x.sin().sum()
+        >>>
+        >>> x = torch.randn(5)
+        >>> hessian = jacfwd(jacrev(f))(x)
+        >>> assert torch.allclose(hessian, torch.diag(-x.sin()))
+
+    By default, :func:`jacfwd` computes the Jacobian with respect to the first
+    input. However, it can compute the Jacobian with respect to a different
+    argument by using ``argnums``:
+
+        >>> from torch.func import jacfwd
+        >>> def f(x, y):
+        >>>     return x + y ** 2
+        >>>
+        >>> x, y = torch.randn(5), torch.randn(5)
+        >>> jacobian = jacfwd(f, argnums=1)(x, y)
+        >>> expected = torch.diag(2 * y)
+        >>> assert torch.allclose(jacobian, expected)
+
+    Additionally, passing a tuple to ``argnums`` will compute the Jacobian
+    with respect to multiple arguments
+
+        >>> from torch.func import jacfwd
+        >>> def f(x, y):
+        >>>     return x + y ** 2
+        >>>
+        >>> x, y = torch.randn(5), torch.randn(5)
+        >>> jacobian = jacfwd(f, argnums=(0, 1))(x, y)
+        >>> expectedX = torch.diag(torch.ones_like(x))
+        >>> expectedY = torch.diag(2 * y)
+        >>> assert torch.allclose(jacobian[0], expectedX)
+        >>> assert torch.allclose(jacobian[1], expectedY)
+
+    """
+    @wraps(func)
+    def wrapper_fn(*args):
+        error_if_complex("jacfwd", args, is_input=True)
+        primals = args if argnums is None else _slice_argnums(args, argnums)
+        flat_primals, primals_spec = tree_flatten(primals)
+        flat_primals_numels = tuple(p.numel() for p in flat_primals)
+        flat_basis = _construct_standard_basis_for(flat_primals, flat_primals_numels)
+        basis = tree_unflatten(flat_basis, primals_spec)
+
+        def push_jvp(basis):
+            output = _jvp_with_argnums(func, args, basis, argnums=argnums, has_aux=has_aux)
+            # output[0] is the output of `func(*args)`
+            error_if_complex("jacfwd", output[0], is_input=False)
+            if has_aux:
+                _, jvp_out, aux = output
+                return jvp_out, aux
+            _, jvp_out = output
+            return jvp_out
+
+        results = vmap(push_jvp, randomness=randomness)(basis)
+        if has_aux:
+            results, aux = results
+            # aux is in the standard basis format, e.g. NxN matrix
+            # We need to fetch the first element as original `func` output
+            flat_aux, aux_spec = tree_flatten(aux)
+            flat_aux = [value[0] for value in flat_aux]
+            aux = tree_unflatten(flat_aux, aux_spec)
+
+        jac_outs, spec = tree_flatten(results)
+        # Most probably below output check can never raise an error
+        # as jvp should test the output before
+        # assert_non_empty_output(jac_outs, 'jacfwd(f, ...)(*args)')
+
+        jac_outs_ins = tuple(
+            tuple(
+                safe_unflatten(jac_out_in, -1, primal.shape)
+                for primal, jac_out_in in
+                zip(flat_primals, jac_out.movedim(0, -1).split(flat_primals_numels, dim=-1))
+            )
+            for jac_out in jac_outs
+        )
+        jac_outs_ins = tuple(tree_unflatten(jac_ins, primals_spec) for jac_ins in jac_outs_ins)
+
+        if isinstance(argnums, int):
+            jac_outs_ins = tuple(jac_ins[0] for jac_ins in jac_outs_ins)
+        if has_aux:
+            return tree_unflatten(jac_outs_ins, spec), aux
+        return tree_unflatten(jac_outs_ins, spec)
+    return wrapper_fn
+
+
+@exposed_in("torch.func")
+def hessian(func, argnums=0):
+    """
+    Computes the Hessian of ``func`` with respect to the arg(s) at index
+    ``argnum`` via a forward-over-reverse strategy.
+
+    The forward-over-reverse strategy (composing ``jacfwd(jacrev(func))``) is
+    a good default for good performance. It is possible to compute Hessians
+    through other compositions of :func:`jacfwd` and :func:`jacrev` like
+    ``jacfwd(jacfwd(func))`` or ``jacrev(jacrev(func))``.
+
+    Args:
+        func (function): A Python function that takes one or more arguments,
+            one of which must be a Tensor, and returns one or more Tensors
+        argnums (int or Tuple[int]): Optional, integer or tuple of integers,
+            saying which arguments to get the Hessian with respect to.
+            Default: 0.
+ + Returns: + Returns a function that takes in the same inputs as ``func`` and + returns the Hessian of ``func`` with respect to the arg(s) at + ``argnums``. + + .. note:: + You may see this API error out with "forward-mode AD not implemented + for operator X". If so, please file a bug report and we will prioritize it. + An alternative is to use ``jacrev(jacrev(func))``, which has better + operator coverage. + + A basic usage with a R^N -> R^1 function gives a N x N Hessian: + + >>> from torch.func import hessian + >>> def f(x): + >>> return x.sin().sum() + >>> + >>> x = torch.randn(5) + >>> hess = hessian(f)(x) # equivalent to jacfwd(jacrev(f))(x) + >>> assert torch.allclose(hess, torch.diag(-x.sin())) + + """ + return jacfwd(jacrev(func, argnums), argnums) + + +@doesnt_support_saved_tensors_hooks +def grad_and_value_impl(func, argnums, has_aux, args, kwargs) -> Callable: + with grad_increment_nesting() as level: + output, aux, grad_input = None, None, None + # See NOTE [grad and vjp interaction with no_grad] + with torch.enable_grad(): + args = _wrap_all_tensors(args, level) + kwargs = _wrap_all_tensors(kwargs, level) + diff_args = _slice_argnums(args, argnums, as_tuple=False) + tree_map_(partial(_create_differentiable, level=level), diff_args) + + output = func(*args, **kwargs) + if has_aux: + if not (isinstance(output, tuple) and len(output) == 2): + raise RuntimeError( + "grad_and_value(f)(*args): output of function f should be a tuple: (output, aux) " + "if has_aux is True" + ) + output, aux = output + + if not isinstance(output, torch.Tensor): + raise RuntimeError('grad_and_value(f)(*args): Expected f(*args) ' + f'to return a Tensor, got {type(output)}') + if output.dim() != 0: + raise RuntimeError('grad_and_value(f)(*args): Expected f(*args) ' + 'to return a scalar Tensor, got tensor with ' + f'{output.dim()} dims. 
Maybe you wanted to ' + 'use the vjp or jacrev APIs instead?') + + flat_diff_args, spec = tree_flatten(diff_args) + + # NB: need create_graph so that backward pass isn't run in no_grad mode + flat_outputs = _as_tuple(output) + flat_grad_input = _autograd_grad(flat_outputs, flat_diff_args, create_graph=True) + grad_input = tree_unflatten(flat_grad_input, spec) + + grad_input = _undo_create_differentiable(grad_input, level) + output = _undo_create_differentiable(output, level) + if has_aux: + aux = _undo_create_differentiable(aux, level) + + if has_aux: + return grad_input, (output, aux) + return grad_input, output + + +def grad_impl(func: Callable, argnums: argnums_t, has_aux: bool, args, kwargs): + results = grad_and_value_impl(func, argnums, has_aux, args, kwargs) + if has_aux: + grad, (_, aux) = results + return grad, aux + grad, _ = results + return grad + +def _maybe_wrap_functional_tensor(maybe_tensor, level, *, _python_functionalize: bool = False): + if not isinstance(maybe_tensor, torch.Tensor): + return maybe_tensor + wrapped = _wrap_functional_tensor(maybe_tensor, level) + _assert_wrapped_functional(maybe_tensor, wrapped) + if _python_functionalize: + out = FunctionalTensor(wrapped) + torch._mirror_autograd_meta_to(maybe_tensor, out) + return out + return wrapped + + +def _wrap_all_tensors_to_functional(tensor_pytree, level, *, _python_functionalize: bool = False): + return tree_map(partial(lambda x: _maybe_wrap_functional_tensor( + x, level, _python_functionalize=_python_functionalize)), tensor_pytree) + + +def _maybe_unwrap_functional_tensor(maybe_tensor, *, reapply_views: bool): + if not isinstance(maybe_tensor, torch.Tensor): + return maybe_tensor + if isinstance(maybe_tensor, FunctionalTensor): + maybe_tensor = maybe_tensor.elem + + if not torch._is_functional_tensor(maybe_tensor): + # If it's not a functional tensor, just return it. + # This can happen if we functionalize a fn that returns a global, + # which was never wrapped properly. + return maybe_tensor + # Sync any pending updates on the output tensor + torch._sync(maybe_tensor) + return _unwrap_functional_tensor(maybe_tensor, reapply_views) + + +def _unwrap_all_tensors_from_functional(tensor_pytree, *, reapply_views: bool): + return tree_map(lambda t: _maybe_unwrap_functional_tensor(t, reapply_views=reapply_views), tensor_pytree) + + +@exposed_in("torch.func") +def functionalize(func: Callable, *, remove: str = 'mutations') -> Callable: + """ + functionalize is a transform that can be used to remove (intermediate) + mutations and aliasing from a function, while preserving the function's + semantics. + + ``functionalize(func)`` returns a new function with the same semantics + as ``func``, but with all intermediate mutations removed. + Every inplace operation performed on an intermediate tensor: + ``intermediate.foo_()`` + gets replaced by its out-of-place equivalent: + ``intermediate_updated = intermediate.foo()``. + + functionalize is useful for shipping a pytorch program off to + backends or compilers that aren't able to easily represent + mutations or aliasing operators. + + Args: + func (Callable): A Python function that takes one or more arguments. + remove (str): An optional string argument, that takes on either + the value 'mutations' or 'mutations_and_views'. + If 'mutations' is passed in then all mutating operators + will be replaced with their non-mutating equivalents. + If 'mutations_and_views' is passed in, then additionally, all aliasing + operators will be replaced with their non-aliasing equivalents. 
+ Default: 'mutations'. + + Returns: + Returns a new "functionalized" function. It takes the same inputs as + ``func``, and has the same behavior, but any mutations + (and optionally aliasing) performed on intermediate tensors + in the function will be removed. + + functionalize will also remove mutations (and views) that were performed on function inputs. + However to preserve semantics, functionalize will "fix up" the mutations after + the transform has finished running, by detecting if any tensor inputs "should have" + been mutated, and copying the new data back to the inputs if necessary. + + + Example:: + + >>> # xdoctest: +SKIP + >>> import torch + >>> from torch.fx.experimental.proxy_tensor import make_fx + >>> from torch.func import functionalize + >>> + >>> # A function that uses mutations and views, but only on intermediate tensors. + >>> def f(a): + ... b = a + 1 + ... c = b.view(-1) + ... c.add_(1) + ... return b + ... + >>> inpt = torch.randn(2) + >>> + >>> out1 = f(inpt) + >>> out2 = functionalize(f)(inpt) + >>> + >>> # semantics are the same (outputs are equivalent) + >>> print(torch.allclose(out1, out2)) + True + >>> + >>> f_traced = make_fx(f)(inpt) + >>> f_no_mutations_traced = make_fx(functionalize(f))(inpt) + >>> f_no_mutations_and_views_traced = make_fx(functionalize(f, remove='mutations_and_views'))(inpt) + >>> + >>> print(f_traced.code) + + + + def forward(self, a_1): + add = torch.ops.aten.add(a_1, 1); a_1 = None + view = torch.ops.aten.view(add, [-1]) + add_ = torch.ops.aten.add_(view, 1); view = None + return add + + >>> print(f_no_mutations_traced.code) + + + + def forward(self, a_1): + add = torch.ops.aten.add(a_1, 1); a_1 = None + view = torch.ops.aten.view(add, [-1]); add = None + add_1 = torch.ops.aten.add(view, 1); view = None + view_1 = torch.ops.aten.view(add_1, [2]); add_1 = None + return view_1 + + >>> print(f_no_mutations_and_views_traced.code) + + + + def forward(self, a_1): + add = torch.ops.aten.add(a_1, 1); a_1 = None + view_copy = torch.ops.aten.view_copy(add, [-1]); add = None + add_1 = torch.ops.aten.add(view_copy, 1); view_copy = None + view_copy_1 = torch.ops.aten.view_copy(add_1, [2]); add_1 = None + return view_copy_1 + + + >>> # A function that mutates its input tensor + >>> def f(a): + ... b = a.view(-1) + ... b.add_(1) + ... return a + ... + >>> f_no_mutations_and_views_traced = make_fx(functionalize(f, remove='mutations_and_views'))(inpt) + >>> # + >>> # All mutations and views have been removed, + >>> # but there is an extra copy_ in the graph to correctly apply the mutation to the input + >>> # after the function has completed. + >>> print(f_no_mutations_and_views_traced.code) + + + + def forward(self, a_1): + view_copy = torch.ops.aten.view_copy(a_1, [-1]) + add = torch.ops.aten.add(view_copy, 1); view_copy = None + view_copy_1 = torch.ops.aten.view_copy(add, [2]); add = None + copy_ = torch.ops.aten.copy_(a_1, view_copy_1); a_1 = None + return view_copy_1 + + + There are a few "failure modes" for functionalize that are worth calling out: + (1) Like other torch.func transforms, `functionalize()` doesn't work with functions + that directly use `.backward()`. The same is true for torch.autograd.grad. + If you want to use autograd, you can compute gradients directly + with `functionalize(grad(f))`. + (2) Like other torch.func transforms, `functionalize()` doesn't work with global state. 
+        If you call `functionalize(f)` on a function that takes views / mutations of
+        non-local state, functionalization will simply no-op and pass the view/mutation
+        calls directly to the backend.
+        One way to work around this is to ensure that any non-local state creation
+        is wrapped into a larger function, which you then call functionalize on.
+    (3) `resize_()` has some limitations: functionalize will only work on programs
+        that use `resize_()` as long as the tensor being resized is not a view.
+    (4) `as_strided()` has some limitations: functionalize will not work on
+        `as_strided()` calls that result in tensors with overlapping memory.
+
+
+    Finally, a helpful mental model for understanding functionalization is that
+    most user pytorch programs are written with the public torch API.
+    When executed, torch operators are generally decomposed into
+    our internal C++ "ATen" API.
+    The logic for functionalization happens entirely at the level of ATen.
+    Functionalization knows how to take every aliasing operator in ATen,
+    and map it to its non-aliasing equivalent
+    (e.g. ``tensor.view({-1})`` -> ``at::view_copy(tensor, {-1})``),
+    and how to take every mutating operator in ATen,
+    and map it to its non-mutating equivalent
+    (e.g. ``tensor.add_(1)`` -> ``at::add(tensor, 1)``),
+    while tracking aliases and mutations out-of-line to know when to fix things up.
+    Information about which ATen operators are aliasing or mutating all comes from
+    https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/native_functions.yaml.
+    """
+    if remove == 'mutations':
+        reapply_views = True
+    elif remove == 'mutations_and_views':
+        reapply_views = False
+    else:
+        raise RuntimeError(
+            f"functionalize(f, remove='mutations'): received invalid argument for remove={remove}."
+            " Valid options are:\n"
+            " remove='mutations': all inplace and out= operators will be removed from the program, and replaced"
+            " with their out-of-place equivalents.\n"
+            " remove='mutations_and_views': In addition to the above, all aliasing operators {view} will be"
+            " replaced with their non-aliasing counterparts, {view}_copy.\n"
+        )
+
+    @doesnt_support_saved_tensors_hooks
+    @wraps(func)
+    def wrapped(*args, **kwargs):
+        try:
+            func_level = _func_increment_nesting(reapply_views)
+            func_args = _wrap_all_tensors_to_functional(args, func_level)
+            func_kwargs = _wrap_all_tensors_to_functional(kwargs, func_level)
+
+            flattened_unwrapped_args = pytree.arg_tree_leaves(*args)
+            flattened_wrapped_args = pytree.arg_tree_leaves(*func_args)
+            flattened_unwrapped_kwargs = pytree.arg_tree_leaves(**kwargs)
+            flattened_wrapped_kwargs = pytree.arg_tree_leaves(**func_kwargs)
+
+            func_outputs = func(*func_args, **func_kwargs)
+            outputs = _unwrap_all_tensors_from_functional(func_outputs, reapply_views=reapply_views)
+            flat_outputs, func_out_spec = tree_flatten(outputs)
+
+            for a in flattened_wrapped_args + flattened_wrapped_kwargs:
+                if isinstance(a, torch.Tensor):
+                    # Call sync_() on the inputs, to ensure that any pending mutations have been applied.
+                    torch._sync(a)
+
+            # And if any mutations were applied to the inputs, we need to propagate them back to the user.
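+            # (Illustrative sketch, not from the original comments: for a function like
+            # ``def f(a): a.add_(1)``, the functionalized call computed the add out-of-place,
+            # so the loops below copy the final values back into the caller's original
+            # tensors to preserve ``func``'s input-mutation semantics.)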
+            for unwrapped, wrapped in zip(flattened_unwrapped_args, flattened_wrapped_args):
+                if isinstance(unwrapped, torch.Tensor) and isinstance(wrapped, torch.Tensor):
+                    _propagate_functional_input_mutation(unwrapped, wrapped)
+            for unwrapped, wrapped in zip(flattened_unwrapped_kwargs, flattened_wrapped_kwargs):
+                if isinstance(unwrapped, torch.Tensor) and isinstance(wrapped, torch.Tensor):
+                    _propagate_functional_input_mutation(unwrapped, wrapped)
+
+            return outputs
+        finally:
+            _func_decrement_nesting()
+    return wrapped
+
+@exposed_in("torch.func")
+def linearize(func: Callable, *primals) -> Tuple[Any, Callable]:
+    '''
+    Returns the value of ``func`` at ``primals`` and linear approximation
+    at ``primals``.
+
+    Args:
+        func (Callable): A Python function that takes one or more arguments.
+        primals (Tensors): Positional arguments to ``func`` that must all be
+            Tensors. These are the values at which the function is linearly approximated.
+
+    Returns:
+        Returns a ``(output, jvp_fn)`` tuple containing the output of ``func``
+        applied to ``primals`` and a function that computes the jvp of
+        ``func`` evaluated at ``primals``.
+
+    linearize is useful if jvp is to be computed multiple times at ``primals``. However,
+    to achieve this, linearize saves intermediate computation and has higher memory requirements
+    than directly applying `jvp`. So, if all the ``tangents`` are known, it may be more efficient
+    to compute vmap(jvp) instead of using linearize.
+
+    .. note::
+        linearize evaluates ``func`` twice. Please file an issue for an implementation
+        with a single evaluation.
+
+    Example::
+        >>> import torch
+        >>> from torch.func import linearize
+        >>> def fn(x):
+        ...     return x.sin()
+        ...
+        >>> output, jvp_fn = linearize(fn, torch.zeros(3, 3))
+        >>> jvp_fn(torch.ones(3, 3))
+        tensor([[1., 1., 1.],
+                [1., 1., 1.],
+                [1., 1., 1.]])
+        >>>
+
+    '''
+    # Note: We evaluate `fn` twice.
+    # Once for returning the output and the other while
+    # tracing the graph.
+    # If this becomes a bottleneck, we should update
+    # make_fx such that it also returns the output.
+
+    output = func(*primals)
+    _, output_spec = tree_flatten(output)
+
+    flat_primals, primals_argspec = tree_flatten(primals)
+
+    # tangents for tracing
+    flat_tangents = tuple(p.new_empty(()).expand_as(p) for p in flat_primals)
+
+    # function to trace
+    def trace_fn(flat_tangents):
+        with fwAD.dual_level():
+            flat_duals = tuple(fwAD.make_dual(p, t) for p, t in zip(flat_primals, flat_tangents))
+            duals = tree_unflatten(flat_duals, primals_argspec)
+            output = func(*duals)
+            tangents = tree_map_only(torch.Tensor, lambda t: fwAD.unpack_dual(t)[1], output)
+
+        return tangents
+
+    jvp_graph = make_fx(trace_fn)(flat_tangents)
+    const_folded_jvp_graph = const_fold.split_const_subgraphs(jvp_graph)
+
+    # Hold only the meta-data regarding the primals.
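+    # (Recording shapes/devices/dtypes below, rather than the tensors themselves, lets the
+    # returned ``jvp_fn`` validate its tangents without keeping the primal tensors alive.)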
+ flat_primals_shape = tuple(p.shape for p in flat_primals) + flat_primals_device = tuple(p.device for p in flat_primals) + flat_primals_dtype = tuple(p.dtype for p in flat_primals) + + def forward_ad_checks(flat_tangents): + for idx, t in enumerate(flat_tangents): + if t.shape != flat_primals_shape[idx]: + msg = (f"tangent:{idx} with shape {t.shape} in flattened " + f"pytree doesn't match the shape {flat_primals_shape[idx]} " + "of the corresponding primal.") + raise RuntimeError(msg) + + if t.device != flat_primals_device[idx]: + msg = (f"tangent:{idx} with device {t.device} in flattened " + f"pytree doesn't match the device {flat_primals_device[idx]} " + "of the corresponding primal.") + raise RuntimeError(msg) + + if t.dtype != flat_primals_dtype[idx]: + msg = (f"tangent:{idx} with dtype {t.dtype} in flattened " + f"pytree doesn't match the dtype {flat_primals_dtype[idx]} " + "of the corresponding primal.") + raise RuntimeError(msg) + + # jvp_fn : callable to return + # It takes care of checking the argspec of tangents, + # calling the folded fx graph and unflattening fx graph output + def jvp_fn(*tangents): + flat_tangents, tangent_argspec = tree_flatten(tangents) + if tangent_argspec != primals_argspec: + raise RuntimeError(f"Expected the tangents {tangent_argspec} to have " + f"the same argspec as the primals {primals_argspec}") + + forward_ad_checks(flat_tangents) + + flat_output = const_folded_jvp_graph(*flat_tangents) + # const folded graph can return flat output, + # so transform output. + return tree_unflatten(flat_output, output_spec) + + return output, jvp_fn diff --git a/MLPY/Lib/site-packages/torch/_functorch/functional_call.py b/MLPY/Lib/site-packages/torch/_functorch/functional_call.py new file mode 100644 index 0000000000000000000000000000000000000000..16240d61ffd9b4c138f198adf6be6bddc890e651 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/functional_call.py @@ -0,0 +1,248 @@ +from collections import Counter +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union + +import torch +import torch.nn as nn +from torch import Tensor +from torch._functorch.utils import exposed_in + + +@exposed_in("torch.func") +def functional_call( + module: "torch.nn.Module", + parameter_and_buffer_dicts: Union[Dict[str, Tensor], Sequence[Dict[str, Tensor]]], + args: Union[Any, Tuple], + kwargs: Optional[Dict[str, Any]] = None, + *, + tie_weights: bool = True, + strict: bool = False, +): + r"""Performs a functional call on the module by replacing the module parameters + and buffers with the provided ones. + + .. note:: If the module has active parametrizations, passing a value in the + :attr:`parameter_and_buffer_dicts` argument with the name set to the regular parameter + name will completely disable the parametrization. + If you want to apply the parametrization function to the value passed + please set the key as ``{submodule_name}.parametrizations.{parameter_name}.original``. + + .. note:: If the module performs in-place operations on parameters/buffers, these will be reflected + in the ``parameter_and_buffer_dicts`` input. + + + Example:: + + >>> a = {'foo': torch.zeros(())} + >>> # xdoctest: +SKIP + >>> mod = Foo() # does self.foo = self.foo + 1 + >>> print(mod.foo) # tensor(0.) + >>> functional_call(mod, a, torch.ones(())) + >>> print(mod.foo) # tensor(0.) + >>> print(a['foo']) # tensor(1.) + + .. note:: If the module has tied weights, whether or not functional_call respects the tying is determined by the + tie_weights flag. 
+
+    Example::
+
+        >>> a = {'foo': torch.zeros(())}
+        >>> # xdoctest: +SKIP
+        >>> mod = Foo()  # has both self.foo and self.foo_tied which are tied. Returns x + self.foo + self.foo_tied
+        >>> print(mod.foo)  # tensor(1.)
+        >>> mod(torch.zeros(()))  # tensor(2.)
+        >>> functional_call(mod, a, torch.zeros(()))  # tensor(0.) since it will change self.foo_tied too
+        >>> functional_call(mod, a, torch.zeros(()), tie_weights=False)  # tensor(1.)--self.foo_tied is not updated
+        >>> new_a = {'foo': torch.zeros(()), 'foo_tied': torch.zeros(())}
+        >>> functional_call(mod, new_a, torch.zeros(()))  # tensor(0.)
+
+    An example of passing multiple dictionaries
+
+    .. code-block:: python
+
+        a = ({'weight': torch.ones(1, 1)}, {'buffer': torch.zeros(1)})  # two separate dictionaries
+        mod = nn.Bar(1, 1)  # return self.weight @ x + self.buffer
+        print(mod.weight)  # tensor(...)
+        print(mod.buffer)  # tensor(...)
+        x = torch.randn((1, 1))
+        print(x)
+        functional_call(mod, a, x)  # same as x
+        print(mod.weight)  # same as before functional_call
+
+    And here is an example of applying the grad transform over the parameters
+    of a model.
+
+    .. code-block:: python
+
+        import torch
+        import torch.nn as nn
+        from torch.func import functional_call, grad
+
+        x = torch.randn(4, 3)
+        t = torch.randn(4, 3)
+        model = nn.Linear(3, 3)
+
+        def compute_loss(params, x, t):
+            y = functional_call(model, params, x)
+            return nn.functional.mse_loss(y, t)
+
+        grad_weights = grad(compute_loss)(dict(model.named_parameters()), x, t)
+
+    .. note:: If the user does not need grad tracking outside of grad transforms, they can detach all of the
+        parameters for better performance and memory usage
+
+        Example::
+
+            >>> detached_params = {k: v.detach() for k, v in model.named_parameters()}
+            >>> grad_weights = grad(compute_loss)(detached_params, x, t)
+            >>> grad_weights.grad_fn  # None--it's not tracking gradients outside of grad
+
+    This means that the user cannot call ``grad_weights.backward()``. However, if they don't need autograd tracking
+    outside of the transforms, this will result in less memory usage and faster speeds.
+
+    Args:
+        module (torch.nn.Module): the module to call
+        parameter_and_buffer_dicts (Dict[str, Tensor] or tuple of Dict[str, Tensor]): the parameters that will be used in
+            the module call. If given a tuple of dictionaries, they must have distinct keys so that all dictionaries can
+            be used together
+        args (Any or tuple): arguments to be passed to the module call. If not a tuple, considered a single argument.
+        kwargs (dict): keyword arguments to be passed to the module call
+        tie_weights (bool, optional): If True, then parameters and buffers tied in the original model will be treated as
+            tied in the reparameterized version. Therefore, if True and different values are passed for the tied
+            parameters and buffers, it will error. If False, it will not respect the originally tied parameters and
+            buffers unless the values passed for both weights are the same. Default: True.
+        strict (bool, optional): If True, then the parameters and buffers passed in must match the parameters and
+            buffers in the original module. Therefore, if True and there are any missing or unexpected keys, it will
+            error. Default: False.
+
+    Returns:
+        Any: the result of calling ``module``.
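+
+    A minimal illustrative sketch (assuming a plain ``nn.Linear``; not part of the original docs):
+
+        >>> # xdoctest: +SKIP
+        >>> lin = torch.nn.Linear(3, 3)
+        >>> zeroed = {k: torch.zeros_like(v) for k, v in lin.state_dict().items()}
+        >>> out = functional_call(lin, zeroed, torch.randn(2, 3))
+        >>> # out is all zeros, since both weight and bias were replaced with zeros;
+        >>> # lin's own parameters are left untouched.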
+    """
+    if isinstance(parameter_and_buffer_dicts, dict):
+        parameters_and_buffers = parameter_and_buffer_dicts
+    elif isinstance(parameter_and_buffer_dicts, Sequence):
+        if not all(isinstance(d, dict) for d in parameter_and_buffer_dicts):
+            raise ValueError(
+                "Expected all elements of parameter_and_buffer_dicts to be dictionaries"
+            )
+        all_keys = [k for d in parameter_and_buffer_dicts for k in d.keys()]
+        repeated_keys = [key for key, n in Counter(all_keys).items() if n > 1]
+        if len(repeated_keys) > 0:
+            raise ValueError(
+                f"{repeated_keys} appeared in multiple dictionaries; behavior of functional call is ambiguous"
+            )
+        parameters_and_buffers = {
+            k: v for d in parameter_and_buffer_dicts for k, v in d.items()
+        }
+    else:
+        raise ValueError(
+            f"Expected parameter_and_buffer_dicts to be a dict, or a list/tuple of dicts, "
+            f"but got {type(parameter_and_buffer_dicts)}"
+        )
+
+    return nn.utils.stateless._functional_call(
+        module,
+        parameters_and_buffers,
+        args,
+        kwargs,
+        tie_weights=tie_weights,
+        strict=strict,
+    )
+
+
+@exposed_in("torch.func")
+def stack_module_state(
+    models: List[nn.Module],
+) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+    """stack_module_state(models) -> params, buffers
+
+    Prepares a list of torch.nn.Modules for ensembling with :func:`vmap`.
+
+    Given a list of ``M`` ``nn.Modules`` of the same class, returns two dictionaries
+    that stack all of their parameters and buffers together, indexed by name.
+    The stacked parameters are optimizable (i.e. they are new leaf nodes in the
+    autograd history that are unrelated to the original parameters and can be
+    passed directly to an optimizer).
+
+    Here's an example of how to ensemble over a very simple model:
+
+    .. code-block:: python
+
+        num_models = 5
+        batch_size = 64
+        in_features, out_features = 3, 3
+        models = [torch.nn.Linear(in_features, out_features) for i in range(num_models)]
+        data = torch.randn(batch_size, 3)
+
+        def wrapper(params, buffers, data):
+            return torch.func.functional_call(models[0], (params, buffers), data)
+
+        params, buffers = stack_module_state(models)
+        output = vmap(wrapper, (0, 0, None))(params, buffers, data)
+
+        assert output.shape == (num_models, batch_size, out_features)
+
+    When there are submodules, this follows state dict naming conventions
+
+    .. code-block:: python
+
+        import torch.nn as nn
+        class Foo(nn.Module):
+            def __init__(self, in_features, out_features):
+                super().__init__()
+                hidden = 4
+                self.l1 = nn.Linear(in_features, hidden)
+                self.l2 = nn.Linear(hidden, out_features)
+
+            def forward(self, x):
+                return self.l2(self.l1(x))
+
+        num_models = 5
+        in_features, out_features = 3, 3
+        models = [Foo(in_features, out_features) for i in range(num_models)]
+        params, buffers = stack_module_state(models)
+        print(list(params.keys()))  # "l1.weight", "l1.bias", "l2.weight", "l2.bias"
+
+    .. warning::
+        All of the modules being stacked together must be the same (except for
+        the values of their parameters/buffers). For example, they should be in the
+        same mode (training vs eval).
+    """
+    if len(models) == 0:
+        raise RuntimeError("stack_module_state: Expected at least one model, got 0.")
+    if not (all(m.training for m in models) or all(not m.training for m in models)):
+        raise RuntimeError(
+            "stack_module_state: Expected all models to have the same training/eval mode."
+        )
+    model0_typ = type(models[0])
+    if not all(type(m) == model0_typ for m in models):
+        raise RuntimeError(
+            "stack_module_state: Expected all models to be of the same class."
+ ) + all_params = [dict(model.named_parameters()) for model in models] + params = { + k: construct_stacked_leaf(tuple(params[k] for params in all_params), k) + for k in all_params[0] + } + all_buffers = [dict(model.named_buffers()) for model in models] + buffers = { + k: construct_stacked_leaf(tuple(buffers[k] for buffers in all_buffers), k) + for k in all_buffers[0] + } + + return params, buffers + + +def construct_stacked_leaf( + tensors: Union[Tuple[Tensor, ...], List[Tensor]], name: str +) -> Tensor: + all_requires_grad = all(t.requires_grad for t in tensors) + none_requires_grad = all(not t.requires_grad for t in tensors) + if not all_requires_grad and not none_requires_grad: + raise RuntimeError( + f"Expected {name} from each model to have the same .requires_grad" + ) + result = torch.stack(tensors) + if all_requires_grad: + result = result.detach().requires_grad_() + return result diff --git a/MLPY/Lib/site-packages/torch/_functorch/fx_minifier.py b/MLPY/Lib/site-packages/torch/_functorch/fx_minifier.py new file mode 100644 index 0000000000000000000000000000000000000000..7c0c3009de5b3af81b687259dfc9c2a9c53f0e84 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/fx_minifier.py @@ -0,0 +1,445 @@ +# mypy: ignore-errors + +import torch.fx as fx +import copy +import torch +import math +import sys +from typing import Callable, List +from functools import wraps, partial +from dataclasses import dataclass +from .compile_utils import get_placeholders, get_outputs +from torch.utils._content_store import ContentStoreWriter +from torch.hub import tqdm +from torch.multiprocessing.reductions import StorageWeakRef +import os + +is_tuple = object() + +@dataclass +class LoadTensorMeta: + size: List[int] + stride: List[int] + dtype: torch.dtype + device: torch.device + +class ConcreteProp(torch.fx.Interpreter): + def __init__(self, mod, *, writer=None, skip_offload=False): + super().__init__(mod) + self.writer = writer + self.skip_offload = skip_offload + self.seen_storages = set() + + def run_node(self, n): + self.pbar.update(1) + r = super().run_node(n) + name = n.name + + if isinstance(r, torch.Tensor): + if self.writer is None: + n.meta['concrete_value'] = r + else: + if StorageWeakRef(r.untyped_storage()) in self.seen_storages: + # Refuse to offload tensors which alias other live + # tensors, because this will violate operator contracts + n.meta['concrete_value'] = None + else: + if not self.skip_offload: + self.writer.write_tensor(os.path.join("eager", name), r) + n.meta['concrete_value'] = LoadTensorMeta( + r.size(), + r.stride(), + r.dtype, + r.device + ) + self.seen_storages.add(StorageWeakRef(r.untyped_storage())) + else: + n.meta['concrete_value'] = is_tuple + + return r + + def propagate(self, *args): + with tqdm( + desc="Saving intermediates for delta debugging", + total=len(self.module.graph.nodes), + disable=self.writer is None + ) as pbar: + self.pbar = pbar + r = super().run(*args) + if not self.skip_offload: + pbar.set_description("Saved! 
To skip next time, run with --skip-saving-eager-intermediates")
+        return r
+
+def is_load_tensor_node(node):
+    return node.op == 'call_function' and node.target is torch.ops.debugprims.load_tensor.default
+
+
+# inplace modifies node/inps
+def _convert_node_to_placeholder(graph, node, inps):
+    if node.op == 'output' or node.op == "placeholder":
+        return False
+
+    if is_load_tensor_node(node):
+        return False
+
+    concrete_val = node.meta.get('concrete_value', None)
+
+    if isinstance(concrete_val, torch.Tensor):
+        node.op = 'placeholder'
+        node.target = node.name
+        node.args = ()
+        node.kwargs = {}
+
+        inps.append(concrete_val)
+        return True
+
+    elif concrete_val is None:
+        return False
+
+    elif concrete_val is is_tuple:
+        r = False
+        for tuple_user in list(node.users):
+            r = _convert_node_to_placeholder(graph, tuple_user, inps) or r
+        # NB: We must not erase the node at this point, because
+        # we are iterating over the nodes and this would change
+        # the iteration order
+        # graph.erase_node(node)
+        return r
+
+    elif isinstance(concrete_val, LoadTensorMeta):
+        node.op = 'call_function'
+        node.target = torch.ops.debugprims.load_tensor.default
+        node.args = (os.path.join("eager", node.name), concrete_val.size, concrete_val.stride)
+        node.kwargs = {
+            'device': concrete_val.device,
+            'dtype': concrete_val.dtype,
+        }
+        return True
+
+    return False
+
+def create_minified_hlo_graph(minified_fx_graph, inputs):
+    """
+    Takes minified FX graph as primary input, and ports it to HLO via StableHLO
+    Provides minified HLO graph as output, and archives it to a local directory
+    """
+    hlo_dir = f"{os.getcwd()}/hlo_files"
+    os.makedirs(hlo_dir, exist_ok=True)
+
+    from torch_xla.stablehlo import save_torch_model_as_stablehlo
+    save_torch_model_as_stablehlo(minified_fx_graph, inputs, hlo_dir)
+
+def dump_state(fx_g, inps):
+    print(f"""
+# Working Repro with {len(fx_g.graph.nodes)} nodes
+inps = {[(i.shape, i.dtype, i.device.type) for i in inps]}
+inps = [torch.zeros(())] + [torch.ones(shape, dtype=dtype, device=device) for (shape, dtype, device) in inps]
+{fx_g.code}
+""")
+
+def is_power_of_two(n):
+    if n == 0:
+        return False
+    return (n & (n - 1)) == 0
+
+@dataclass
+class ReproState:
+    graph: fx.Graph
+    inps: List[torch.Tensor]
+
+    def __post_init__(self):
+        ph_nodes = get_placeholders(self.graph)
+        assert len(ph_nodes) == len(self.inps)
+
+def minifier(
+    fail_f: fx.GraphModule, inps, module_fails, dump_state: Callable = dump_state, *,
+    save_dir=None, offload_to_disk=False, skip_offload=False, skip_sanity=False,
+    max_granularity=None
+):
+    """
+    Minimizes an FX graph with given inputs, such that the resulting FX graph still returns True for module_fails.
+
+    Uses 2 main strategies:
+    1. Truncates suffix: Removes some suffix from the graph and sets a new output.
+    2. Delta Debugging: Tries replacing half of the graph with inputs. If that fails,
+        tries replacing a quarter of the graph, etc.
+
+    >>> # xdoctest: +SKIP(failing)
+    >>> failing_function = fx.symbolic_trace(f)
+    >>> minifier(failing_function, [torch.randn(5)], lambda fx_g, inps: fx_g(*inps))
+
+    note: module_fails returns True if it fails.
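+
+    A hedged sketch of a ``module_fails`` predicate (``compile_fn`` here is hypothetical):
+
+    >>> # xdoctest: +SKIP
+    >>> def module_fails(fx_g, inps):
+    ...     ref = fx_g(*inps)           # eager reference result
+    ...     res = compile_fn(fx_g)(*inps)  # result from the backend under test
+    ...     return not torch.allclose(ref, res)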
+ """ + assert isinstance(inps, (tuple, list)) + + failing_graph = fail_f.graph + cur_size = len(failing_graph.nodes) + + if max_granularity is not None and not is_power_of_two(max_granularity): + raise RuntimeError(f"max_granularity {max_granularity} not power of two") + + num_queries = 0 + + def deepcopy_fx_graph(fx_graph): + return fx.GraphModule(fail_f, copy.deepcopy(fx_graph)).graph + + + def graph_fails(graph, inps): + nonlocal num_queries + graph = copy.deepcopy(graph) + num_queries += 1 + mod = fx.GraphModule(fail_f, graph) + mod.graph.lint() + return module_fails(mod, inps) + + writer = None + if offload_to_disk: + writer = ContentStoreWriter(save_dir) + + ConcreteProp(fail_f, writer=writer, skip_offload=skip_offload).propagate(*inps) + if not skip_sanity and not graph_fails(failing_graph, inps): + raise RuntimeError("Input graph did not fail the tester") + print(f"Started off with {cur_size} nodes", file=sys.stderr) + + def _register_strategy(strategy: Callable, name: str): + @wraps(strategy) + def new_func(old_state: ReproState, granularity=1): + print(file=sys.stderr) + print( + f"Strategy: {name} (G: {granularity}) " + f"({len(old_state.graph.nodes)} nodes, {len(old_state.inps)} inputs)", + file=sys.stderr + ) + new_state = strategy(deepcopy_fx_graph(old_state.graph), list(old_state.inps), granularity) + if new_state is not None: + new_nodes = len(new_state.graph.nodes) + old_nodes = len(old_state.graph.nodes) + new_inps = len(new_state.inps) + old_inps = len(old_state.inps) + new_outs = len(get_outputs(new_state.graph)) + old_outs = len(get_outputs(old_state.graph)) + progress_made = False + if new_nodes < old_nodes: + progress_made = True + print(f"SUCCESS: Went from {old_nodes} to {new_nodes} nodes", file=sys.stderr) + if new_inps > old_inps: + progress_made = True + print(f"SUCCESS: Went from {old_inps} to {new_inps} inputs", file=sys.stderr) + if new_outs < old_outs: + progress_made = True + print(f"SUCCESS: Went from {old_outs} to {new_outs} outputs", file=sys.stderr) + + if not progress_made: + raise RuntimeError("Success raised but no progress made?") + + if not graph_fails(new_state.graph, new_state.inps): + print("WARNING: Something went wrong, not applying this minification", file=sys.stderr) + return None + return new_state + else: + print(f"FAIL: {name}", file=sys.stderr) + return None + + return new_func + + def register_strategy(name: str): + return partial(_register_strategy, name=name) + + @register_strategy("Truncate suffix") + def remove_suffix(cur_graph, cur_inps, granularity): + tested = set() + new_graph = fx.Graph() + env = {} + for idx, node in enumerate(cur_graph.nodes): + new_node = new_graph.node_copy(node, lambda x: env[x]) + if node.op not in ['placeholder', 'output']: + # If idx is divisible by (granularity * 2), it would have been checked already. 
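+                # (Illustrative: with granularity=2 this pass tests idx 2, 6, 10, ...;
+                # indices divisible by 4 were already hit by earlier, coarser passes.)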
+ if idx % granularity == 0 and (idx % (granularity * 2) != 0) and idx not in tested: + output_node = new_graph.output((new_node,)) + if len(new_graph.nodes) < len(cur_graph.nodes) and graph_fails(new_graph, cur_inps): + return ReproState(new_graph, cur_inps) + else: + tested.add(idx) + new_graph.erase_node(output_node) + env[node] = new_node + return None + + @register_strategy("Remove outputs") + def remove_outputs(cur_graph, cur_inps, granularity): + granularity = max(1, granularity // 2) + for idx, node in enumerate(cur_graph.nodes): + node.idx = idx + if node.op == 'output': + output = node + break + + if isinstance(output.args[0], fx.Node): + return None + + output_args = sorted(output.args[0], key=lambda x: x.idx if isinstance(x, fx.Node) else int(1e9)) + if len(output_args) == 1: + return None + + for idx in range(0, len(output_args), granularity): + output.args = (output_args[:idx] + output_args[idx + granularity:],) + if graph_fails(cur_graph, cur_inps): + return ReproState(cur_graph, cur_inps) + return None + + + def remove_unused_inputs_unchecked(cur_state: ReproState): + cur_graph = cur_state.graph + cur_inps = cur_state.inps + ph_nodes = get_placeholders(cur_graph) + assert len(ph_nodes) == len(cur_inps) + + new_inps = [] + for idx in range(len(ph_nodes)): + if len(ph_nodes[idx].users) == 0: + cur_graph.erase_node(ph_nodes[idx]) + else: + new_inps.append(cur_inps[idx]) + if len(new_inps) < len(cur_inps): + return ReproState(cur_graph, new_inps) + return None + + def remove_unused_inputs_checked(cur_state: ReproState): + new_state = remove_unused_inputs_unchecked(cur_state) + if new_state is not None and graph_fails(new_state.graph, new_state.inps): + return new_state + return None + + def _remove_unused_wrapper(cur_graph, cur_inps, granularity): + return remove_unused_inputs_checked(ReproState(cur_graph, cur_inps)) + + remove_unused_inputs = register_strategy("Remove unused inputs")(_remove_unused_wrapper) + + @register_strategy("Eliminate dead code") + def eliminate_dead_code(cur_graph, cur_inps, granularity): + if cur_graph.eliminate_dead_code() and graph_fails(cur_graph, cur_inps): + return ReproState(cur_graph, cur_inps) + return None + + + def _consolidate_placeholders(cur_graph, inps): + new_graph = fx.Graph() + env = {} + seen_non_placeholder = False + + # Move all placeholders to the front; also, if any load_tensor + # is at the front, convert it into an input (because it can be live + # all the time) + for node in cur_graph.nodes: + if node.op == 'placeholder': + new_node = new_graph.node_copy(node, lambda x: env[x]) + env[node] = new_node + elif not seen_non_placeholder and is_load_tensor_node(node): + new_node = new_graph.placeholder(node.name) + env[node] = new_node + inps.append(torch.ops.debugprims.load_tensor.default(*node.args, **node.kwargs)) + else: + seen_non_placeholder = True + + # Move everyone else + for node in cur_graph.nodes: + if node not in env: + new_node = new_graph.node_copy(node, lambda x: env[x]) + env[node] = new_node + return new_graph + + @register_strategy("Delta Debugging") + def delta_debugging(cur_graph: fx.Graph, cur_inps, granularity): + num_nodes = len(cur_graph.nodes) + for start_range in range(0, num_nodes, granularity): + is_removing = False + new_graph = deepcopy_fx_graph(cur_graph) + new_inps = cur_inps[:] + end_range = min(num_nodes, start_range + granularity) + for idx in range(start_range, end_range): + new_node = list(new_graph.nodes)[idx] + if _convert_node_to_placeholder(new_graph, new_node, new_inps): + is_removing = True 
+ if not is_removing: + continue + new_graph.eliminate_dead_code() + new_graph = _consolidate_placeholders(new_graph, new_inps) + new_state = remove_unused_inputs_unchecked(ReproState(new_graph, new_inps)) + if new_state is None: + new_state = ReproState(new_graph, new_inps) + if graph_fails(new_state.graph, new_state.inps): + return ReproState(new_state.graph, new_state.inps) + + return None + + @register_strategy("Consolidate Inputs") + def consolidate_inputs(cur_graph, cur_inps, granularity): + old_len = len(cur_inps) + cur_graph = _consolidate_placeholders(cur_graph, cur_inps) + if len(cur_inps) > old_len and graph_fails(cur_graph, cur_inps): + return ReproState(cur_graph, cur_inps) + return None + + failing_state = ReproState(failing_graph, inps) + + def try_granularity(failing_state, granularity, use_non_granular): + print(f"Trying granularity {granularity}", file=sys.stderr) + + strategies = [] + num_nodes = len(failing_state.graph.nodes) + num_outputs = len(get_outputs(failing_state.graph)) + if num_outputs > num_nodes // 2: + strategies += [remove_outputs] + + if use_non_granular: + strategies += [eliminate_dead_code, remove_unused_inputs, consolidate_inputs] + + strategies += [remove_suffix, delta_debugging] + + for strategy in strategies: + new_state = strategy(failing_state, granularity) + if new_state is not None: + return new_state + return None + + while True: + dump_state(fx.GraphModule(fail_f, failing_state.graph), failing_state.inps) + granularity = int(2**(math.floor(math.log2(len(failing_state.graph.nodes))))) + if max_granularity is not None: + granularity = min(max_granularity, granularity) + new_state = try_granularity(failing_state, granularity, use_non_granular=True) + if new_state is not None: + failing_state = new_state + continue + + granularity //= 2 + has_progress = False + while granularity >= 1: + new_state = try_granularity(failing_state, granularity, use_non_granular=False) + if new_state is not None: + failing_state = new_state + has_progress = True + break + granularity //= 2 + if has_progress: + continue + + new_state = remove_outputs(failing_state, 1) + if new_state is not None: + failing_state = new_state + continue + + break + + if not graph_fails(failing_state.graph, failing_state.inps): + raise RuntimeError("Uh oh, something went wrong :( Final graph is not failing") + + print(f"Made {num_queries} queries", file=sys.stderr) + failing_fx = fx.GraphModule(fail_f, failing_state.graph) + + # If XLA debugging environment is enabled, create minified HLO graph as well + if "XLA_HLO_DEBUG" in os.environ: + create_minified_hlo_graph(failing_fx, failing_state.inps) + + dump_state(failing_fx, failing_state.inps) + print("Wrote minimal repro out to repro.py", file=sys.stderr) + return failing_fx, failing_state.inps diff --git a/MLPY/Lib/site-packages/torch/_functorch/make_functional.py b/MLPY/Lib/site-packages/torch/_functorch/make_functional.py new file mode 100644 index 0000000000000000000000000000000000000000..2a393055e24fe28faeadb5cb72f72809dc6ae95d --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/make_functional.py @@ -0,0 +1,615 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import copy +from typing import ( + Any, + Callable, + Dict, + Iterable, + List, + NoReturn, + Sequence, + Tuple, + Type, + Union, +) + +import torch +import torch.nn as nn +from torch import Tensor +from torch.nn.utils._named_member_accessor import NamedMemberAccessor + +# Utilities to make nn.Module "functional" +# In particular the goal is to be able to provide a function that takes as input +# the parameters and evaluate the nn.Module using fixed inputs. + + +def raise_parameter_tying_error() -> NoReturn: + raise RuntimeError( + "make_functional(module): we don't yet support models that " + "do parameter tying (also sometimes known as weight sharing). " + "Please try to rewrite your model by replacing all instances of the " + "tied parameter with another and/or comment your support in " + "https://github.com/pytorch/functorch/issues/446" + ) + + +def create_names_map( + named_params: Union[Dict[str, Tensor], Iterable[Tuple[str, Tensor]]], + tied_named_params: Union[Dict[str, Tensor], Iterable[Tuple[str, Tensor]]], +) -> Dict[str, List[str]]: + """ + named_params is a dictionary of tensors: {'A': A, 'B': B} + tied_named_params is another dictionary of tensors {'A': A, 'B': B, 'B_tied': B} + with potentially tied (or 'duplicated') tensors + + This function creates a mapping from the names in named_params to the + names in tied_named_params: {'A': ['A'], 'B': ['B', 'B_tied']}. + """ + named_params = dict(named_params) + tied_named_params = dict(tied_named_params) + + tensors_dict_keys = set(named_params.keys()) + tied_tensors_dict_keys = set(tied_named_params.keys()) + assert tensors_dict_keys.issubset(tied_tensors_dict_keys) + + tensor_to_mapping: Dict[Tensor, Tuple[str, List[str]]] = {} + for key, tensor in named_params.items(): + tensor_to_mapping[tensor] = (key, []) + for key, tensor in tied_named_params.items(): + assert tensor in tensor_to_mapping + tensor_to_mapping[tensor][1].append(key) + return dict(tensor_to_mapping.values()) + + +def _extract_members( + mod: nn.Module, + named_members: Callable[..., Iterable[Tuple[str, Tensor]]], + subclass: Callable[[Tensor], Tensor], +) -> Tuple[Tuple[Tensor, ...], Tuple[str, ...], Dict[str, List[str]]]: + all_named_members = tuple(named_members(remove_duplicate=False)) + unique_named_members = tuple(named_members(remove_duplicate=True)) + names_map = create_names_map(unique_named_members, all_named_members) + + # Remove all the members in the model + memo = {} + accessor = NamedMemberAccessor(mod) + for name, p in all_named_members: + if p not in memo: + memo[p] = subclass(torch.empty_like(p, device="meta")) + replacement = memo[p] + accessor.set_tensor(name, replacement) + + if len(unique_named_members) == 0: + names, params = (), () + else: + names, params = zip(*unique_named_members) # type: ignore[assignment] + return params, names, names_map + + +def extract_weights( + mod: nn.Module, +) -> Tuple[Tuple[Tensor, ...], Tuple[str, ...], Dict[str, List[str]]]: + """ + This function removes all the Parameters from the model and + return them as a tuple as well as their original attribute names. + The weights must be re-loaded with `load_weights` before the model + can be used again. + Note that this function modifies the model in place and after this + call, mod.parameters() will be empty. 
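+
+    Illustrative sketch of the return convention (assuming a plain
+    ``nn.Linear(3, 3)``; the names shown are simply what ``named_parameters``
+    reports for that module):
+
+    >>> # xdoctest: +SKIP(illustrative)
+    >>> params, names, names_map = extract_weights(nn.Linear(3, 3))
+    >>> names
+    ('weight', 'bias')
+    >>> names_map
+    {'weight': ['weight'], 'bias': ['bias']}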
+ """ + return _extract_members(mod, mod.named_parameters, nn.Parameter) + + +def extract_buffers( + mod: nn.Module, +) -> Tuple[Tuple[Tensor, ...], Tuple[str, ...], Dict[str, List[str]]]: + return _extract_members(mod, mod.named_buffers, lambda x: x) + + +def load_weights( + mod: nn.Module, + names: Sequence[str], + params: Sequence[Tensor], + as_params: bool = False, +) -> None: + """ + Reload a set of weights so that `mod` can be used again to perform a forward pass. + Note that the `params` are regular Tensors (that can have history) and so are left + as Tensors. This means that mod.parameters() will still be empty after this call. + """ + accessor = NamedMemberAccessor(mod) + if as_params: + params = [nn.Parameter(p) for p in params] + accessor.set_tensors(names, params) + + +def _swap_state( + mod: nn.Module, names_map: Dict[str, List[str]], elems: Iterable[Tensor] +) -> List[Tensor]: + result: List[Tensor] = [] + accessor = NamedMemberAccessor(mod) + for (_, attr_names), elem in zip(names_map.items(), elems): + for i, attr_name in enumerate(attr_names): + if i == 0: + result.append(accessor.swap_tensor(attr_name, elem)) + else: + accessor.set_tensor(attr_name, elem) + return result + + +def load_buffers( + mod: nn.Module, + names: Sequence[str], + buffers: Sequence[Tensor], + as_params: bool = False, +) -> None: + accessor = NamedMemberAccessor(mod) + accessor.set_tensors(names, buffers) + + +def load_state( + model: nn.Module, + weights: Sequence[Tensor], + weight_names: Sequence[str], + buffers: Sequence[Tensor] = (), + buffer_names: Sequence[str] = (), +) -> nn.Module: + """load_state(model, weights, weight_names, buffers=(), buffer_names=()) -> model + + load_state takes `weights` and `buffers` and assigns them to the model. + This is the inverse operation of `make_functional_deprecated_v1`. + """ + assert len(weight_names) == len(weights) + load_weights(model, weight_names, weights) + if len(buffers) > 0: + assert len(buffer_names) == len(buffers) + load_buffers(model, buffer_names, buffers) + return model + + +def make_functional_deprecated_v1(model: nn.Module): + """make_functional_deprecated_v1(model) -> weights, func, weight_names + + Given an nn.Module, make_functional_deprecated_v1 extracts the state (weights) + and returns a functional version of the model, `func`. This makes + it so that it is possible use transforms over the parameters of + `model`. + + `func` can be invoked as follows: + ``` + x = torch.randn(4, 3) + model = nn.Linear(3, 3) + weights, func, _ = make_functional_deprecated_v1(model) + func(weights, (x,)) + ``` + + And here is an example of applying the grad transform: + ``` + x = torch.randn(4, 3) + model = nn.Linear(3, 3) + weights, _, func = make_functional_deprecated_v1(model) + grad_weights = grad(func)(weights, (x,)) + ``` + + To put the state back into a model, use `load_state`. + """ + buffers = list(model.buffers()) + if len(buffers) > 0: + raise RuntimeError( + "make_functional_deprecated_v1(model): `model` has buffers. Please use " + "make_functional_with_buffers_deprecated_v1(model) instead." 
+ ) + weights, descriptors, _ = extract_weights(model) + + def fun(weights, data): + mutable_model = copy.deepcopy(model) + load_weights(mutable_model, descriptors, weights) + return mutable_model(*data) + + return weights, fun, descriptors + + +def make_functional_with_buffers_deprecated_v1(model: nn.Module): + """make_functional_with_buffers_deprecated_v1(model) -> weights, buffers, func, weight_names, buffer_names + + Given an nn.Module, make_functional_with_buffers_deprecated_v1 extracts the state (weights and buffers) + and returns a functional version of the model, `func`. + + `func` can be invoked as follows: + ``` + x = torch.randn(4, 3) + model = nn.Linear(3, 3) + weights, buffers, func, _, _ = make_functional_with_buffers_deprecated_v1(model) + func(weights, buffers, (x,)) + ``` + + And here is an example of applying the grad transform: + ``` + x = torch.randn(4, 3) + model = nn.Linear(3, 3) + weights, buffers, func, _, _ = make_functional_with_buffers_deprecated_v1(model) + func(weights, buffers, (x,)) + grad_weights = grad(func)(weights, buffers, (x,)) + ``` + + To put the state back into a model, use `load_state`. + """ + weights, weight_descriptors, _ = extract_weights(model) + buffers, buf_descriptors, _ = extract_buffers(model) + + def fun(weights, buffers, data): + mutable_model = copy.deepcopy(model) + load_weights(mutable_model, weight_descriptors, weights) + load_buffers(mutable_model, buf_descriptors, buffers) + return mutable_model(*data) + + return weights, buffers, fun, weight_descriptors, buf_descriptors + + +class FunctionalModuleWithBuffers(nn.Module): + """ + This is the callable object returned by :func:`make_functional_with_buffers`. + """ + + def __init__( + self, + stateless_model: nn.Module, + param_names: Tuple[str, ...], + buffer_names: Tuple[str, ...], + param_names_map: Dict[str, List[str]], + buffer_names_map: Dict[str, List[str]], + ) -> None: + super().__init__() + self.stateless_model = stateless_model + self.param_names = param_names + self.buffer_names = buffer_names + + self.all_names_map = dict(param_names_map) + self.all_names_map.update(buffer_names_map) + + @staticmethod + def _create_from( + model: nn.Module, disable_autograd_tracking: bool = False + ) -> Tuple["FunctionalModuleWithBuffers", Tuple[Tensor, ...], Tuple[Tensor, ...]]: + # TODO: We don't need to copy the model to create a stateless copy + model_copy = copy.deepcopy(model) + params, param_names, param_names_map = extract_weights(model_copy) + buffers, buffer_names, buffer_names_map = extract_buffers(model_copy) + if disable_autograd_tracking: + for param in params: + param.requires_grad_(False) + return ( + FunctionalModuleWithBuffers( + model_copy, param_names, buffer_names, param_names_map, buffer_names_map + ), + params, + buffers, + ) + + def forward( + self, params: Iterable[Tensor], buffers: Iterable[Tensor], *args, **kwargs + ) -> Any: + # Temporarily load the state back onto self.stateless_model + old_state = _swap_state( + self.stateless_model, + self.all_names_map, + tuple(params) + tuple(buffers), + ) + try: + return self.stateless_model(*args, **kwargs) + finally: + # Remove the loaded state on self.stateless_model + _swap_state(self.stateless_model, self.all_names_map, old_state) + + +class FunctionalModule(nn.Module): + """ + This is the callable object returned by :func:`make_functional`. 
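+
+    Calling convention (a sketch; it mirrors ``forward`` below): given
+    ``func, params = make_functional(model)``, run ``out = func(params, *inputs)``,
+    where ``params`` is the tuple of tensors returned alongside this object.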
+ """ + + def __init__( + self, + stateless_model: nn.Module, + param_names: Tuple[str, ...], + names_map: Dict[str, List[str]], + ) -> None: + super().__init__() + self.stateless_model = stateless_model + self.param_names = param_names + self.names_map = names_map + + @staticmethod + def _create_from( + model: nn.Module, disable_autograd_tracking: bool = False + ) -> Tuple["FunctionalModule", Tuple[Tensor, ...]]: + # TODO: We don't need to copy the model to create a stateless copy + model_copy = copy.deepcopy(model) + params, param_names, names_map = extract_weights(model_copy) + if disable_autograd_tracking: + for param in params: + param.requires_grad_(False) + return FunctionalModule(model_copy, param_names, names_map), params + + def forward(self, params: Iterable[Tensor], *args, **kwargs) -> Any: + # Temporarily load the state back onto self.stateless_model + old_state = _swap_state(self.stateless_model, self.names_map, params) + try: + return self.stateless_model(*args, **kwargs) + finally: + # Remove the loaded state on self.stateless_model + _swap_state(self.stateless_model, self.names_map, old_state) + + +def make_functional( + model: nn.Module, disable_autograd_tracking: bool = False +) -> Tuple[FunctionalModule, Tuple[Tensor, ...]]: + """make_functional(model, disable_autograd_tracking=False) -> func, params + + Given a ``torch.nn.Module``, :func:`make_functional` extracts the state + (params) and returns a functional version of the model, ``func``. This + makes it so that it is possible use transforms over the parameters of + ``model``. + + ``func`` can be invoked as follows: + + .. code-block:: python + + import torch + import torch.nn as nn + from functorch import make_functional + + x = torch.randn(4, 3) + model = nn.Linear(3, 3) + func, params = make_functional(model) + func(params, x) + + And here is an example of applying the grad transform over the parameters + of a model. + + .. code-block:: python + + import torch + import torch.nn as nn + from functorch import make_functional, grad + + x = torch.randn(4, 3) + t = torch.randn(4, 3) + model = nn.Linear(3, 3) + func, params = make_functional(model) + + def compute_loss(params, x, t): + y = func(params, x) + return nn.functional.mse_loss(y, t) + + grad_weights = grad(compute_loss)(params, x, t) + + If the model has any buffers, please use :func:`make_functional_with_buffers` instead. + + Args: + model (torch.nn.Module): Input model. + disable_autograd_tracking (bool): Flag to disable gradients tracking for output parameters. + The returned params are unrelated to the set of params from the original model. If False (default), + the params will have ``requires_grad=True`` on them (aka they will be trackable with regular + PyTorch autograd), matching the requires_grad-ness of the params from the original model. + Otherwise, the returned params will have ``requires_grad=False``. Default, False. + If you plan on using regular PyTorch autograd (e.g., if you want to call ``.backward()`` or + ``torch.autograd.grad()``, then set ``disable_autograd_tracking=False``. + Otherwise, if you're only planning on using functorch's gradient transforms, + then please set ``disable_autograd_tracking=True`` to avoid unnecessarily tracking + history with PyTorch autograd. + + """ + buffers = list(model.buffers()) + if len(buffers) > 0: + raise RuntimeError( + "make_functional(model): `model` has buffers. Please use " + "make_functional_with_buffers(model) instead." 
+ ) + return FunctionalModule._create_from( + model, disable_autograd_tracking=disable_autograd_tracking + ) + + +def make_functional_with_buffers( + model: nn.Module, disable_autograd_tracking: bool = False +) -> Tuple[FunctionalModuleWithBuffers, Tuple[Tensor, ...], Tuple[Tensor, ...]]: + """make_functional_with_buffers(model, disable_autograd_tracking=False) -> func, params, buffers + + Given a ``torch.nn.Module``, make_functional_with_buffers extracts the + state (params and buffers) and returns a functional version of the model + ``func`` that can be invoked like a function. + + ``func`` can be invoked as follows: + + .. code-block:: python + + import torch + import torch.nn as nn + from functorch import make_functional_with_buffers + + x = torch.randn(4, 3) + model = nn.Linear(3, 3) + func, params, buffers = make_functional_with_buffers(model) + func(params, buffers, x) + + And here is an example of applying the grad transform over the parameters + of a model: + + .. code-block:: python + + import torch + import torch.nn as nn + from functorch import make_functional_with_buffers, grad + + x = torch.randn(4, 3) + t = torch.randn(4, 3) + model = nn.Linear(3, 3) + func, params, buffers = make_functional_with_buffers(model) + + def compute_loss(params, buffers, x, t): + y = func(params, buffers, x) + return nn.functional.mse_loss(y, t) + + grad_weights = grad(compute_loss)(params, buffers, x, t) + + Args: + model (torch.nn.Module): Input model. + disable_autograd_tracking (bool): Flag to disable gradients tracking for output parameters. + The returned params are unrelated to the set of params from the original model. If False (default), + the params will have ``requires_grad=True`` on them (aka they will be trackable with regular + PyTorch autograd), matching the requires_grad-ness of the params from the original model. + Otherwise, the returned params will have ``requires_grad=False``. Default, False. + If you plan on using regular PyTorch autograd (e.g., if you want to call ``.backward()`` or + ``torch.autograd.grad()``, then set ``disable_autograd_tracking=False``. + Otherwise, if you're only planning on using functorch's gradient transforms, + then please set ``disable_autograd_tracking=True`` to avoid unnecessarily tracking + history with PyTorch autograd. + + """ + return FunctionalModuleWithBuffers._create_from( + model, disable_autograd_tracking=disable_autograd_tracking + ) + + +def transpose_stack( + tuple_of_tuple_of_tensors: Tuple[Tuple[Tensor, ...], ...] +) -> Tuple[Tensor, ...]: + tuple_of_tuple_of_tensors = tuple(zip(*tuple_of_tuple_of_tensors)) + results = tuple( + torch.stack(shards).detach() for shards in tuple_of_tuple_of_tensors + ) + return results + + +def combine_state_for_ensemble( + models: Sequence[nn.Module], +) -> Tuple[FunctionalModuleWithBuffers, Tuple[Tensor, ...], Tuple[Tensor, ...]]: + """combine_state_for_ensemble(models) -> func, params, buffers + + Prepares a list of torch.nn.Modules for ensembling with :func:`vmap`. + + Given a list of ``M`` ``nn.Modules`` of the same class, stacks all of their + parameters and buffers together to make ``params`` and ``buffers``. + Each parameter and buffer in the result will have an additional dimension + of size ``M``. + + :func:`combine_state_for_ensemble` also returns ``func``, a functional + version of one of the models in :attr:`models`. 
One cannot directly run + ``func(params, buffers, *args, **kwargs)`` directly, you probably want to + use ``vmap(func, ...)(params, buffers, *args, **kwargs)`` + + Here's an example of how to ensemble over a very simple model: + + .. code-block:: python + + num_models = 5 + batch_size = 64 + in_features, out_features = 3, 3 + models = [torch.nn.Linear(in_features, out_features) for i in range(num_models)] + data = torch.randn(batch_size, 3) + + fmodel, params, buffers = combine_state_for_ensemble(models) + output = vmap(fmodel, (0, 0, None))(params, buffers, data) + + assert output.shape == (num_models, batch_size, out_features) + + .. warning:: + All of the modules being stacked together must be the same (except for + the values of their parameters/buffers). For example, they should be in the + same mode (training vs eval). + + This API is subject to change -- we're investigating better ways to + create ensembles and would love your feedback how to improve this. + """ + if len(models) == 0: + raise RuntimeError( + "combine_state_for_ensemble: Expected at least one model, got 0." + ) + if not (all(m.training for m in models) or all(not m.training for m in models)): + raise RuntimeError( + "combine_state_for_ensemble: Expected all models to " + "have the same training/eval mode." + ) + model0_typ = type(models[0]) + if not all(type(m) == model0_typ for m in models): + raise RuntimeError( + "combine_state_for_ensemble: Expected all models to be of the same class." + ) + funcs, params, buffers = zip( + *[make_functional_with_buffers(model) for model in models] + ) + params = transpose_stack(params) + buffers = transpose_stack(buffers) + return funcs[0], params, buffers + + +def functional_init( + model_class: Type[nn.Module], + ensemble_shape: Union[Tuple[()], Tuple[int]] = (), + device: torch.types.Device = "cpu", +): + def wrapped(*args, **kwargs): + if len(ensemble_shape) >= 2: + raise ValueError("NYI: ensemble_shape with more than 1 element") + if len(ensemble_shape) == 0: + model = model_class(*args, **kwargs).to(device) + return make_functional_deprecated_v1(model) + num_models = ensemble_shape[0] # type: ignore[misc] + if num_models <= 0: + raise ValueError(f"num_models {num_models} should be > 0") + # NB: Not very efficient, more of a POC + models = tuple( + model_class(*args, **kwargs).to(device) for _ in range(num_models) + ) + _, fn, names = make_functional_deprecated_v1(model_class(*args, **kwargs)) + weights = tuple(make_functional_deprecated_v1(model)[0] for model in models) + weights = tuple(zip(*weights)) + weights = tuple(torch.stack(shards).detach() for shards in weights) + return weights, fn, names + + return wrapped + + +def functional_init_with_buffers( + model_class: Type[nn.Module], + ensemble_shape: Union[Tuple[()], Tuple[int]] = (), + device: torch.types.Device = "cpu", +): + def wrapped(*args, **kwargs): + if len(ensemble_shape) >= 2: + raise ValueError("NYI: ensemble_shape with more than 1 element") + if len(ensemble_shape) == 0: + model = model_class(*args, **kwargs).to(device) + return make_functional_deprecated_v1(model) + num_models = ensemble_shape[0] # type: ignore[misc] + if num_models <= 0: + raise ValueError(f"num_models {num_models} should be > 0") + # NB: Not very efficient, more of a POC + models = tuple( + model_class(*args, **kwargs).to(device) for _ in range(num_models) + ) + ( + _, + _, + fn, + weight_names, + buffer_names, + ) = make_functional_with_buffers_deprecated_v1(model_class(*args, **kwargs)) + weights, buffers = zip( + *tuple( + 
make_functional_with_buffers_deprecated_v1(model)[:2] + for model in models + ) + ) + weights = tuple(zip(*weights)) + weights = tuple(torch.stack(shards).detach() for shards in weights) + buffers = tuple(zip(*buffers)) + buffers = tuple(torch.stack(shards).detach() for shards in buffers) + return weights, buffers, fn, weight_names, buffer_names + + return wrapped diff --git a/MLPY/Lib/site-packages/torch/_functorch/partitioners.py b/MLPY/Lib/site-packages/torch/_functorch/partitioners.py new file mode 100644 index 0000000000000000000000000000000000000000..e2bffcaca654eaafcfa15d934008f8b37e629f88 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/partitioners.py @@ -0,0 +1,981 @@ +# mypy: ignore-errors + +from torch.fx.experimental.proxy_tensor import is_sym_node, py_sym_types +from torch.fx.experimental.sym_node import magic_methods, method_to_operator +from torch.fx.experimental.symbolic_shapes import ( + hint_int, free_symbols, is_symbol_binding_fx_node, find_symbol_binding_fx_nodes +) +from torch.fx.experimental._backward_state import BackwardState +import torch +import torch.fx as fx +import operator +import math +import torch.utils._pytree as pytree +import copy +import os +import itertools +import sympy +from collections import defaultdict +from torch.fx.passes import graph_drawer +from typing import List, Optional, Set, Tuple, Union +from .compile_utils import fx_graph_cse, get_aten_target +from . import config +import functools + + +AOT_PARTITIONER_DEBUG = config.debug_partitioner + + +def must_recompute(node): + return node.meta.get("recompute", False) + +def has_recomputable_ops(fx_g): + found = False + for node in fx_g.graph.nodes: + if must_recompute(node): + return True + return False + +def has_recomputable_rng_ops(fx_g): + for node in fx_g.graph.nodes: + if must_recompute(node) and hasattr(node.target, "tags") and torch.Tag.nondeterministic_seeded in node.target.tags: + return True + return False + +def sym_node_size(node): + if isinstance(node.meta["val"], (torch.SymInt, torch.SymBool)): + return 1 + assert isinstance(node.meta["val"], torch.SymFloat) + return 4 + +class InvalidNodeBase: + def __repr__(self): + return "Invalid Node" + + +InvalidNode = InvalidNodeBase() + + +def _extract_graph_with_inputs_outputs(joint_graph, inputs, outputs): + """ + Given a graph, extracts out a subgraph that takes the specified nodes as + inputs and returns the specified outputs. + + This includes specifying non-placeholder nodes as inputs. + + The general strategy is to initialize all inputs with proxies as we + encounter them, and trace through the graph, only keeping values which take + in valid proxies. Then, all dead code is eliminated. 
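+
+    Informal example (node names are illustrative): for a chain ``a -> b -> c -> d``,
+    calling this with ``inputs=[b]`` and ``outputs=[d]`` produces a graph with a
+    single placeholder ``b`` and the ops producing ``c`` and ``d``; ``a`` and any
+    other nodes not needed to compute ``d`` from ``b`` are dead-code eliminated.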
+ """ + new_graph = fx.Graph() + env = {} + + # Add new placeholder nodes in the order specified by the inputs + for node in inputs: + new_node = new_graph.placeholder(node.name) + # Can't use node_copy here as we may be turning previous call_function into placeholders + new_node.meta = node.meta + env[node] = new_node + + for node in joint_graph.nodes: + if node in inputs: + continue + elif node.op == 'placeholder': + env[node] = InvalidNode + elif node.op == 'call_function': + all_args = pytree.arg_tree_leaves(*node.args, **node.kwargs) + all_args = [isinstance(env[x], InvalidNodeBase) for x in all_args if isinstance(x, fx.Node)] + if any(all_args): + env[node] = InvalidNode + continue + env[node] = new_graph.node_copy(node, lambda x: env[x]) + elif node.op == 'get_attr': + env[node] = new_graph.node_copy(node, lambda x: env[x]) + elif node.op == 'output': + pass + output_values = [] + for x in outputs: + if isinstance(x, fx.Node): + if x not in env: + raise RuntimeError(f"Node {x} couldn't be found in env") + assert not isinstance(env[x], InvalidNodeBase), f"Node {x} was invalid, but is output" + output_values.append(env[x]) + else: + output_values.append(x) + new_graph.output(output_values) + + new_graph.eliminate_dead_code() + new_graph.lint() + return new_graph + + +def _is_primal(node): + return ( + node.op == "placeholder" + and "tangents" not in node.target + and not _is_bwd_seed_offset(node) + and not _is_fwd_seed_offset(node) + ) + +def _is_tangent(node): + return node.op == "placeholder" and "tangents" in node.target + +def _is_bwd_seed_offset(node): + return node.op == "placeholder" and ("bwd_seed" in node.target or "bwd_base_offset" in node.target) + +def _is_fwd_seed_offset(node): + return node.op == "placeholder" and ("fwd_seed" in node.target or "fwd_base_offset" in node.target) + +def _is_backward_state(node): + return node.op == "placeholder" and isinstance(node.meta.get("val"), BackwardState) + + +def _extract_fwd_bwd_outputs(joint_module: fx.GraphModule, *, num_fwd_outputs): + outputs = pytree.arg_tree_leaves(*(node.args for node in joint_module.graph.nodes if node.op == 'output')) + fwd_outputs = outputs[:num_fwd_outputs] + bwd_outputs = outputs[num_fwd_outputs:] + return fwd_outputs, bwd_outputs + + +def _remove_by_name(saved_values, name): + for saved_value in saved_values: + if saved_value.name == name: + saved_values.remove(saved_value) + break + +def _placeholders(nodes): + # Avoid making an entire pass over the graph if we only care about the input placeholders + result = [] + for node in nodes: + if node.op == 'placeholder': + result.append(node) + else: + break # placeholders are all at the start of graph + return result + + +def _extract_fwd_bwd_modules(joint_module: fx.GraphModule, saved_values, saved_sym_nodes, *, num_fwd_outputs): + fwd_outputs, bwd_outputs = _extract_fwd_bwd_outputs(joint_module, num_fwd_outputs=num_fwd_outputs) + placeholders = _placeholders(joint_module.graph.nodes) + primal_inputs = [*filter(_is_primal, placeholders)] + tangent_inputs = [*filter(_is_tangent, placeholders)] + fwd_seed_offset_inputs = [*filter(_is_fwd_seed_offset, placeholders)] + bwd_seed_offset_inputs = [*filter(_is_bwd_seed_offset, placeholders)] + backward_state_inputs = [*filter(_is_backward_state, placeholders)] + + bwd_graph = _extract_graph_with_inputs_outputs( + joint_module.graph, + saved_sym_nodes + saved_values + tangent_inputs + bwd_seed_offset_inputs, + bwd_outputs + ) + + for node in _placeholders(bwd_graph.nodes): + assert node.op == 'placeholder' + # 
This is to filter out saved values that don't actually end up being used by the backwards pass + if not node.users: + _remove_by_name(saved_values, node.name) + _remove_by_name(saved_sym_nodes, node.name) + elif _is_backward_state(node): + # BackwardState is saved directly + _remove_by_name(saved_values, node.name) + assert backward_state_inputs + + + # Now that we have the finalized list of saved values, we need to ensure + # we propagate all symbols which are referenced by backwards inputs. + # These are not directly used in the graph but are required for downstream + # sizevar assignment + saved_symbols: Set[sympy.Symbol] = set() + saved_sym_nodes_binding = [] + saved_sym_nodes_derived = [] + + # Some symbols may already be bound in the directly saved_sym_nodes, + # keep track of them so we don't re-bind them + for node in saved_sym_nodes: + symbol = is_symbol_binding_fx_node(node) + if symbol: + saved_symbols.add(symbol) + saved_sym_nodes_binding.append(node) + else: + saved_sym_nodes_derived.append(node) + + # Now go through all of the prospective backward inputs and track any + # other symbols we need to bind + symbol_bindings = find_symbol_binding_fx_nodes(joint_module.graph) + for node in itertools.chain(saved_sym_nodes_derived, saved_values, tangent_inputs): + if "val" not in node.meta: + continue + new_symbols = free_symbols(node.meta["val"]) - saved_symbols + # NB: Deterministic order please! + for s in sorted(new_symbols, key=lambda s: s.name): + # NB: For well formed graphs, the symbol should always be present, + # but we also have ways to produce ill-formed graphs, e.g., direct + # make_fx usages, so don't choke in this case + if s not in symbol_bindings: + continue + saved_sym_nodes_binding.append(symbol_bindings[s]) + saved_symbols |= new_symbols + + + # Update saved_sym_nodes that are now reordered to have all bindings at + # front. This can also be used later on to figure out the position of saved + # sym nodes in the output of fwd graph. + saved_sym_nodes.clear() + saved_sym_nodes.extend(saved_sym_nodes_binding + saved_sym_nodes_derived) + + # Now, we re-generate the fwd/bwd graphs. + # NB: This might increase compilation time, but I doubt it matters + fwd_graph = _extract_graph_with_inputs_outputs( + joint_module.graph, + primal_inputs + fwd_seed_offset_inputs, + fwd_outputs + saved_values + saved_sym_nodes + ) + bwd_graph = _extract_graph_with_inputs_outputs( + joint_module.graph, + saved_sym_nodes + saved_values + tangent_inputs + bwd_seed_offset_inputs + backward_state_inputs, + bwd_outputs + ) + + fwd_module = fx._lazy_graph_module._make_graph_module(joint_module, fwd_graph) + bwd_module = fx._lazy_graph_module._make_graph_module(joint_module, bwd_graph) + return fwd_module, bwd_module + + +def default_partition( + joint_module: fx.GraphModule, _joint_inputs, *, num_fwd_outputs +) -> Tuple[fx.GraphModule, fx.GraphModule]: + """ + Partitions the :attr:`joint_module` in a manner that closely resembles the + behavior observed in the original ``.forward()`` and ``.backward()`` of the + callable, i.e., the resulting forward graph contains those operators that + are executed in the original ``.forward()`` callable passed to + :func:`aot_function`. + + The default partitioner collects the operators that are between the forward + inputs and the forward outputs. This helps in finding the tensors which have + to be stashed for the backward pass. These stashed tensors become the output + of the generated forward graph. 
The remaining operators are then placed in + the backward graph. + + .. warning:: + This API is experimental and likely to change. + + Args: + joint_module(fx.GraphModule): The joint forward and backward graph. This + is the result of AOT Autograd tracing. + + Returns: + Returns the generated forward and backward Fx graph modules. + """ + if has_recomputable_ops(joint_module): + return min_cut_rematerialization_partition(joint_module, _joint_inputs, num_fwd_outputs=num_fwd_outputs) + primal_inputs = list(filter(_is_primal, joint_module.graph.nodes)) + fwd_seed_offset_inputs = list(filter(_is_fwd_seed_offset, joint_module.graph.nodes)) + inputs = primal_inputs + fwd_seed_offset_inputs + fwd_outputs, bwd_outputs = _extract_fwd_bwd_outputs(joint_module, num_fwd_outputs=num_fwd_outputs) + forward_only_graph = _extract_graph_with_inputs_outputs(joint_module.graph, inputs, fwd_outputs) + forward_node_names = {node.name for node in forward_only_graph.nodes if node.op != 'output'} + saved_values = [] + saved_sym_nodes = [] + + for node in joint_module.graph.nodes: + if node.name not in forward_node_names: + continue + if is_sym_node(node): + # Symints must be kept separate from tensors so that PythonFunction only calls + # save_for_backward on tensors and stashes symints in autograd .ctx + saved_sym_nodes.append(node) + elif ( + 'tensor_meta' not in node.meta + and node.op == 'call_function' + ): + # Since we can't save tuple of tensor values, we need to flatten out what we're saving + users = node.users + assert all(user.target == operator.getitem for user in users) + saved_values.extend(users) + else: + backward_usages = [n for n in node.users if n.name not in forward_node_names] + if 'tensor_meta' in node.meta and all(is_sym_node(n) for n in backward_usages): + # If we have a tensor in the forward, where only its sizes/strides are needed in the backward, + # and not the actual tensor data, + # then it will be a lot cheaper to save only the sizes/strides, and not the actual tensor. + # + # Note that saving the tensor could also cause compilation problems: + # If the user mutated an input in the forward and uses its sizes/strides in the backward, + # then we would be obligated to clone the input before saving it to appease autograd. + # (This is how we originally found this bug). + saved_sym_nodes.extend(backward_usages) + else: + saved_values.append(node) + saved_values = list(dict.fromkeys(saved_values).keys()) + saved_sym_nodes = list(dict.fromkeys(saved_sym_nodes).keys()) + + return _extract_fwd_bwd_modules(joint_module, saved_values, saved_sym_nodes=saved_sym_nodes, num_fwd_outputs=num_fwd_outputs) + + +def _prod(x): + s = 1 + for i in x: + s *= i + return s + +def _tensor_nbytes(numel, dtype): + return numel * dtype.itemsize + +def _size_of(node: fx.Node) -> int: + if 'val' in node.meta: + val = node.meta['val'] + if isinstance(val, py_sym_types): + if isinstance(val, torch.SymInt): + return 1 + else: + return 999999 + # NB: The fallback values here are meaningless, maybe we should respect + # torch._inductor.config.unbacked_symint_fallback (but this is a + # layering violation) + elif isinstance(val, (list, tuple)): + return sum(_tensor_nbytes(hint_int(n.numel(), fallback=4098), n.dtype) for n in val if isinstance(n, torch.Tensor)) + elif isinstance(val, torch.Tensor): + return _tensor_nbytes(hint_int(val.numel(), fallback=4098), val.dtype) + + raise RuntimeError(f"Unknown metadata type {type(val)}") + + # Only needed since we don't always trace with fake tensors. 
+ if 'tensor_meta' in node.meta: + metadata = node.meta['tensor_meta'] + # TODO: What is to_size_hint suppose to be? + numel = _prod(map(to_size_hint, metadata.shape)) # noqa: F821 + dtype = metadata.dtype + else: + return 0 + + return _tensor_nbytes(numel, dtype) + + +# Used for some investigative purposes +def _count_ops(graph): + from collections import defaultdict + cnt = defaultdict(int) + for node in graph.nodes: + if node.op == 'call_function': + cnt[node.target.__name__] += 1 + print(sorted(cnt.items(), key=lambda x: x[1], reverse=True)) + + +@functools.lru_cache(None) +def pointwise_ops(): + ops = [] + for attr_name in dir(torch.ops.aten): + opoverloadpacket = getattr(torch.ops.aten, attr_name) + if not isinstance(opoverloadpacket, torch._ops.OpOverloadPacket): + continue + + for overload in opoverloadpacket.overloads(): + op_overload = getattr(opoverloadpacket, overload) + if torch.Tag.pointwise in op_overload.tags: + # currently aot autograd uses packet not overload + ops.append(opoverloadpacket) + break + + return ops + +def get_depth(node, depth_map): + if node in depth_map: + return depth_map[node] + + # Base case + if node.op == "placeholder": + depth_map[node] = 0 + return depth_map[node] + + # Handle output node + if node.op == "output": + args = node.args[0] + for arg in args: + if isinstance(arg, torch.fx.node.Node): + get_depth(arg, depth_map) + return + + # Get the depth of args and set the depth of this node + arg_depths = [get_depth(arg, depth_map) for arg in node.all_input_nodes if isinstance(arg, torch.fx.node.Node)] + # factory ops like full, rand might not have any input args + if len(arg_depths) == 0: + arg_depths = [0] + depth_map[node] = max(arg_depths) + 1 + return depth_map[node] + + +def sort_depths(args, depth_map): + arg_depths = {arg: depth_map[arg] for arg in args if isinstance(arg, torch.fx.node.Node)} + return sorted(arg_depths.items(), key=lambda x: x[1], reverse=True) + + +def reordering_to_mimic_autograd_engine(gm): + """ + This pass finds the first bwd node in the graph (by looking at users of + tangents) and then reorders the graph by walking from this node to all the + way to the end of the graph. At each op in this traveral, we insert this op + in a new graph and try to bring only the relevant subgraph from the other + non-bwd edges relevant for this op. This closely mimics the behavior of + autograd engine. + + Why is this pass required in the first place? + + This is an artifact of how partitioners work today. The starting point of + partitioner is a joint graph, which is fwd and then bwd graph. In the case + of checkpointing, we keep portions of fwd graph in their original place in + the joint graph, while obtaining a bwd graph. As a result, the resulting bwd + graph has copies of recomputed fwd subgraphs followed by the original bwd + graph. If we run this naively, this leads to bad memory footprint, because + the fwd subgraphs are live for way longer duration than necessary. This pass + reorders the operations such that we prioritize the ops for the original bwd + graph while only realizing those ops from the fwd graph that are necessary + at any given point in the graph. 
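+
+    Informally (a sketch of the effect, not additional behavior): if the bwd
+    portion of the graph is laid out as ``[recomputed fwd block][original bwd ops]``,
+    this pass emits the original bwd ops in order and pulls in each recomputed
+    fwd node only when its first bwd consumer is emitted, so recomputed
+    activations stay live for a much shorter window.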
+ """ + + new_graph = fx.Graph() + env = {} + + # Add new placeholder nodes in the order specified by the inputs + for node in gm.graph.nodes: + if node.op == "placeholder": + new_node = new_graph.placeholder(node.name) + # Can't use node_copy here as we may be turning previous call_function into placeholders + new_node.meta = node.meta + env[node] = new_node + + + order = {} + for idx, node in enumerate(gm.graph.nodes): + order[node] = idx + + # Populate depth for the nodes. Depth is the distance from the inputs. + depths = {} + output_node = next(node for node in gm.graph.nodes if node.op == "output") + get_depth(output_node, depths) + + def insert_node_in_graph(node): + if node in env: + return env[node] + + # Bias traversal towards the nodes that have higher depth - prioritizes + # critical path first. + for arg, _ in sort_depths(node.all_input_nodes, depths): + env[arg] = insert_node_in_graph(arg) + env[node] = new_graph.node_copy(node, lambda x: env[x]) + return env[node] + + # Find first bwd node in the graph + tangent_inputs = list(filter(_is_tangent, gm.graph.nodes)) + first_node_in_bwd = None + minimum_order = math.inf + for tangent in tangent_inputs: + for user in tangent.users: + if order[user] < minimum_order: + minimum_order = order[user] + first_node_in_bwd = user + assert first_node_in_bwd is not None + + # Build the graph op-by-op by starting from the node all the way to the end + for node in list(gm.graph.nodes)[order[first_node_in_bwd]:]: + insert_node_in_graph(node) + + # The output node is already built by the traversal. + new_gm = torch.fx.GraphModule(gm, new_graph) + return new_gm + + +def functionalize_rng_ops(joint_module, fw_module, bw_module, num_sym_nodes): + # During user-driven activation checkpointing, we have to ensure that a rng + # op in fwd yields the same output as the recomputed rng op in the bwd. To + # do this, we use functionalize wrappers to wrap the random ops and share + # rng state between the fwd and bwd graphs. + + # There are 3 main steps to do this + # Step 1 - Construct a mapping of rng node between the fwd and its counterpart in bwd. + # Step 2 - Modify the fwd pass such that + # 1) Replace rand with run_and_save_rng_state wrapper + # 2) Replace the users of the original op with the output[1] of this op. + # 3) Collect all the rng_state - output[0] of each op, and make them + # output nodes. Special care needs to be taken here because fwd outputs + # has symints at the very end. + # Step 3 - Modify the bwd pass such that + # 1) Add the input nodes just before the tangents for the stashed rng states + # 2) Replace rand with run_with_save_rng_state wrappers + # 3) Use the stashed states as inputs to these ops + + # Unique id to generate name + uid = itertools.count() + + def get_rng_ops(gmod): + random_nodes = {} + for node in gmod.graph.nodes: + if ( + node.op == "call_function" + and hasattr(node.target, "tags") + and torch.Tag.nondeterministic_seeded in node.target.tags + ): + random_nodes[node.name] = node + return random_nodes + + def get_device(node): + """ + Check the example value of the node outputs to find the device type. 
+ """ + if "val" not in node.meta: + return None + + candidates = node.meta["val"] + if not isinstance(candidates, tuple): + candidates = (candidates,) + + for candidate in candidates: + if isinstance(candidate, torch.Tensor): + if candidate.device.type == "cuda": + return "cuda" + + return "cpu" + + def get_sample_rng_state(device): + if device == "cuda": + return torch.cuda.get_rng_state() + return torch.get_rng_state() + + # Step 1 - Construct a mapping of rng node between the fwd and its counterpart in bwd. + joint_graph_rng_ops = get_rng_ops(joint_module) + fw_graph_rng_ops = get_rng_ops(fw_module) + bw_graph_rng_ops = get_rng_ops(bw_module) + recomputable_rng_ops_map = dict() + for node in joint_module.graph.nodes: + if ( + must_recompute(node) + and hasattr(node.target, "tags") + and torch.Tag.nondeterministic_seeded in node.target.tags + ): + base_node = joint_graph_rng_ops[node.name] + fw_node = fw_graph_rng_ops[node.name] + bw_node = bw_graph_rng_ops[node.name] + recomputable_rng_ops_map[base_node] = {"fwd": fw_node, "bwd": bw_node} + + run_and_save_rng = torch._prims.rng_prims.run_and_save_rng_state + run_with_rng_state = torch._prims.rng_prims.run_with_rng_state + + for node in bw_module.graph.nodes: + if node.op == "placeholder" and "tangent" in node.name: + bw_tangent_start_node = node + break + + + fw_rng_state_outputs = [] + for base_node, node_pair in recomputable_rng_ops_map.items(): + # Step 2 - Modify the fwd pass such that + fw_node = node_pair["fwd"] + bw_node = node_pair["bwd"] + fw_graph = fw_module.graph + with fw_graph.inserting_before(fw_node): + functional_fw_node = fw_graph.create_node( + "call_function", + run_and_save_rng, + args=(fw_node.target, *fw_node.args), + kwargs=fw_node.kwargs + ) + state = fw_graph.create_node("call_function", operator.getitem, args=(functional_fw_node, 0), kwargs={}) + rng_output = fw_graph.create_node("call_function", operator.getitem, args=(functional_fw_node, 1,), kwargs={}) + fw_node.replace_all_uses_with(rng_output) + fw_graph.erase_node(fw_node) + fw_rng_state_outputs.append(state) + + + # Step 3 - Modify the bwd pass such that + bw_graph = bw_module.graph + with bw_graph.inserting_before(bw_tangent_start_node): + state_name = f"rng_state_output_{next(uid)}" + bw_rng_state_node = bw_graph.placeholder(state_name) + bw_rng_state_node.meta["val"] = get_sample_rng_state(get_device(fw_node)) + + with bw_graph.inserting_before(bw_node): + rng_output = bw_graph.create_node( + "call_function", + run_with_rng_state, + args=(bw_rng_state_node, bw_node.target, *bw_node.args), + kwargs=bw_node.kwargs + ) + + bw_node.replace_all_uses_with(rng_output) + bw_graph.erase_node(bw_node) + + + # Add the rng states in the output of the fwd graph. AOT Autograd assumes + # that symints are at the end of forward graph outputs. So, insert the new + # rng states accordingly. + fw_output_node = next(node for node in fw_module.graph.nodes if node.op == "output") + fw_outputs = fw_output_node.args[0] + sym_node_start_idx = len(fw_outputs) - num_sym_nodes + outputs = fw_outputs[:sym_node_start_idx] + fw_rng_state_outputs + fw_outputs[sym_node_start_idx:] + fw_module.graph.output(outputs) + fw_module.graph.erase_node(fw_output_node) + fw_module.recompile() + bw_module.recompile() + return fw_module, bw_module + + +def cleanup_recompute_tags(joint_module): + """ + If there are two consecutive checkpointed blocks with no operator in + between, we would still want to stash the tensor at the boundary of + checkpointed blocks. 
The following pass makes the last output node + non-recomputable to allow for that. + """ + for node in joint_module.graph.nodes: + if must_recompute(node): + for user in node.users: + if must_recompute(user) and user.meta["recompute"] > node.meta["recompute"]: + node.meta["recompute"] = 0 + return joint_module + + +def min_cut_rematerialization_partition( + joint_module: fx.GraphModule, _joint_inputs, compiler="inductor", recomputable_ops=None, + *, num_fwd_outputs +) -> Tuple[fx.GraphModule, fx.GraphModule]: + """ + Partitions the joint graph such that the backward recomputes the forward. + Recomputing helps in trading off memory bandwidth with computation. + + To create the fwd and bwd graph, we copy the joint graph, manually set the + outputs to just original forward or backward outputs. And then we run the + resulting graphs through dead code elimination. + + .. warning:: + This API is experimental and likely to change. + + Args: + joint_module(fx.GraphModule): The joint forward and backward graph. This + is the result of AOT Autograd tracing. + _joint_inputs: The inputs to the joint graph. This is unused. + compiler: This option determines the default set of recomputable ops. + Currently, there are two options: ``nvfuser`` and ``inductor``. + recomputable_ops: This is an optional set of recomputable ops. If this + is not None, then this set of ops will be used instead of the + default set of ops. + num_fwd_outputs: The number of outputs from the forward graph. + + Returns: + Returns the generated forward and backward Fx graph modules. + """ + try: + import networkx as nx + except ImportError as e: + raise RuntimeError("Need networkx installed to perform smart recomputation " + "heuristics") from e + + joint_module.graph.eliminate_dead_code() + joint_module.recompile() + + fx_g = joint_module.graph + + # add the CSE pass + if config.cse: + cse_graph = fx_graph_cse(fx_g) + joint_module.graph = cse_graph + full_bw_graph = joint_module.graph + + graph_has_recomputable_ops = has_recomputable_ops(joint_module) + graph_has_recomputable_rng_ops = has_recomputable_rng_ops(joint_module) + if graph_has_recomputable_ops: + joint_module = cleanup_recompute_tags(joint_module) + + name_to_node = {} + for node in joint_module.graph.nodes: + name_to_node[node.name] = node + + def classify_nodes(joint_module): + required_bw_nodes = set() + for node in joint_module.graph.nodes: + if node.op == 'placeholder' and "tangents" in node.target: + required_bw_nodes.add(node) + if node in required_bw_nodes: + for user in node.users: + required_bw_nodes.add(user) + + primal_inputs = list(filter(_is_primal, joint_module.graph.nodes)) + fwd_seed_offset_inputs = list(filter(_is_fwd_seed_offset, joint_module.graph.nodes)) + inputs = primal_inputs + fwd_seed_offset_inputs + fwd_outputs, bwd_outputs = _extract_fwd_bwd_outputs(joint_module, num_fwd_outputs=num_fwd_outputs) + required_bw_nodes.update(o for o in bwd_outputs if o is not None) + forward_only_graph = _extract_graph_with_inputs_outputs(joint_module.graph, inputs, fwd_outputs) + required_fw_nodes = {name_to_node[node.name] for node in forward_only_graph.nodes + if node.op != 'output'} + unclaimed_nodes = {node for node in joint_module.graph.nodes + if node not in required_fw_nodes and node not in required_bw_nodes} + return fwd_outputs, required_fw_nodes, required_bw_nodes, unclaimed_nodes, inputs + + orig_fw_outputs, required_fw_nodes, required_bw_nodes, unclaimed_nodes, inputs = classify_nodes(joint_module) + + # networkx blows up on graphs with no required 
backward nodes + # Since there's nothing to partition anyway, and the default partitioner can "handle" + # this case, send our graph over to the default partitioner. + if len(required_bw_nodes) == 0: + return default_partition(joint_module, _joint_inputs, num_fwd_outputs=num_fwd_outputs) + + for node in reversed(joint_module.graph.nodes): + if node not in required_fw_nodes: + node.dist_from_bw = 0 + else: + node.dist_from_bw = int(1e9) + for user in node.users: + node.dist_from_bw = min(node.dist_from_bw, user.dist_from_bw + 1) + + aten = torch.ops.aten + prims = torch.ops.prims + + # compiler == "nvfuser" is the default set of recomputable ops + default_recomputable_ops = [aten.add, aten.sub, aten.div, aten.atan2, aten.mul, aten.max, aten.min, aten.pow, aten.remainder, aten.fmod, aten.__and__, aten.__or__, aten.__xor__, aten.__lshift__, aten.__rshift__, aten.eq, aten.ne, aten.ge, aten.gt, aten.le, aten.lt, aten.abs, aten.bitwise_not, aten.ceil, aten.floor, aten.frac, aten.neg, aten.relu, aten.round, aten.silu, aten.trunc, aten.log, aten.log10, aten.log1p, aten.log2, aten.lgamma, aten.exp, aten.expm1, aten.erf, aten.erfc, aten.cos, aten.acos, aten.cosh, aten.sin, aten.asin, aten.sinh, aten.tan, aten.atan, aten.tanh, aten.atanh, aten.sqrt, aten.rsqrt, aten.reciprocal, aten.sigmoid, aten.softplus, aten.threshold, aten.threshold_backward, aten.clamp, aten.where, aten.lerp, aten.addcmul, aten.gelu, aten.gelu_backward, aten.sum, aten.mean, aten._grad_sum_to_size, aten.sum_to_size, aten.amax, aten.to, aten.type_as, operator.getitem, aten.squeeze, aten.unsqueeze, aten.rsub, aten._to_copy] # noqa: E501,B950 + view_ops = [aten.squeeze, aten.unsqueeze, aten.alias] + if compiler == "inductor": + default_recomputable_ops += [prims.div, prims.convert_element_type, aten.clone, aten._to_copy, aten.full_like, prims.var, prims.sum, aten.var, aten.std, prims.broadcast_in_dim, aten.select, aten.permute, aten._unsafe_view, aten.view, aten.expand, aten.slice, aten.reshape, aten.broadcast_tensors, aten.scalar_tensor, aten.ones, aten.new_zeros, aten.lift_fresh_copy, aten.arange, aten.triu, aten.var_mean, aten.isinf, aten.any, aten.full, aten.as_strided, aten.zeros, aten.argmax, aten.maximum] # noqa: E501,B950 + view_ops += [aten.view, aten.slice, aten.permute, aten.t, prims.broadcast_in_dim, aten.expand, aten.as_strided] + # Natalia said that we should allow recomputing indexing :) + default_recomputable_ops += [aten.index] + default_recomputable_ops += view_ops + + default_recomputable_ops += pointwise_ops() + + default_recomputable_ops += [ + aten.zeros_like, + ] + + default_recomputable_ops += [ + method_to_operator(m) + for m in magic_methods + ] + + recomputable_ops = set(recomputable_ops) if recomputable_ops is not None else set(default_recomputable_ops) + + random_ops = [aten.native_dropout, aten.rand_like, aten.randn_like] + compute_intensive_ops = [aten.mm, aten.convolution, aten.convolution_backward, aten.bmm, aten.addmm, aten.upsample_bilinear2d, aten._softmax, aten._softmax_backward_data, aten.native_layer_norm, aten.native_layer_norm_backward, aten.native_batch_norm, aten.native_batch_norm_backward, aten._native_batch_norm_legit] # noqa: E501,B950 + + fusible_ops = recomputable_ops | set(random_ops) + if AOT_PARTITIONER_DEBUG: + joint_module_ops = { + str(node.target._overloadpacket) + for node in joint_module.graph.nodes + if node.op == "call_function" and hasattr(node.target, "_overloadpacket") + } + ops_ignored = joint_module_ops - {str(i) for i in recomputable_ops} + print("Ops banned from 
rematerialization: ", ops_ignored) + print() + + def is_materialized_backwards(node): + cur_nodes = {node} + while len(cur_nodes) > 0: + cur = cur_nodes.pop() + for user in cur.users: + if user not in required_fw_nodes and not is_fusible(cur, user): + return True + if user not in required_fw_nodes and get_aten_target(user) in view_ops: + cur_nodes.add(user) + + return False + + def ban_recomputation(node): + if "recompute" in node.meta: + return node.meta["recompute"] == 0 + elif config.aggressive_recomputation: + ignored_ops = random_ops + compute_intensive_ops + return (node.op == 'call_function' and get_aten_target(node) in ignored_ops) + else: + if node.op != 'call_function': + return False + if get_aten_target(node) not in recomputable_ops: + return True + if node.target == operator.getitem: + return False + if node.target in [aten.lift_fresh_copy.default, aten.lift_fresh.default]: + return False + + # If a node *must* be materialized in the backwards pass, then we + # should never recompute it. This is a pretty subtle point. In + # general, the assumption we make is that recomputing a node in the + # backwards pass is "free". However, if a node must be materialized + # in the backwards pass, then recomputing it is never free. + if is_materialized_backwards(node): + return True + + # Arbitrary hack that sometimes seems to help things. The above + # modification appears to have made this heuristic a lot less critical + # for performance. + # TODO: Investigate why this hack helps. + # TODO: Investigate the interaction with compiler assisted + # activation checkpointing. Removing the heuristic improves both + # memory footprint and speedup. + if not graph_has_recomputable_ops: + if compiler == "inductor" and node.dist_from_bw > config.max_dist_from_bw: + return True + # If the output of an op is 4x smaller (arbitrary choice), + # then we don't allow recomputation. + input_tensors_size = sum(_size_of(i) for i in node.args if isinstance(i, fx.Node)) + output_size = _size_of(node) + return (output_size * 4 < input_tensors_size) + + def is_fusible(a, b): + # We can perform "memory fusion" into a cat, but cat cannot be a + # producer to a fusion + if get_aten_target(b) == aten.cat: + return True + return get_aten_target(a) in fusible_ops and get_aten_target(b) in fusible_ops + + def is_materialized(node): + if node.op == 'placeholder': + return True + + return not all(is_fusible(node, user) for user in node.users) + + def get_node_weight(node) -> int: + mem_sz = _size_of(node) + + # Heuristic to bias towards nodes closer to the backwards pass + # Complete guess about current value + mem_sz = int(mem_sz * (1.1 ** max(min(node.dist_from_bw, 100), 1))) + # mem_sz = int(mem_sz + node.dist_from_bw) + + if is_materialized(node): + return mem_sz + else: + return mem_sz * 2 + + nx_graph = nx.DiGraph() + for node in full_bw_graph.nodes: + if node.op == 'output': + continue + + if node in required_bw_nodes: + if node not in inputs: + nx_graph.add_edge(node.name + "_in", "sink", capacity=math.inf) + continue + # If someone saves a input for backward as-is and backward + # returns that tensor as-is as a grad input, then the node x would + # be both a required_bw_node and an input. In this case we + # (1) connect x_in to to the source, (2) x_out to the sink, and + # (3) assign the proper weight to the x_in-x_out edge, so that + # x would be part of cut nodes. A case where this happens is if + # NestedTensor saves a offset tensor as part of the singleton int + # in sizes. 
+ nx_graph.add_edge(node.name + "_out", "sink", capacity=math.inf) + + if _is_primal(node) or _is_fwd_seed_offset(node): + nx_graph.add_edge("source", node.name + "_in", capacity=math.inf) + + # If a node can't be recomputed (too expensive or involves randomness), + # we prevent it from being recomputed by adding an inf edge to the source + # We only need to ban nodes in the fw pass, as those are the only ones that would be recomputed. + if ban_recomputation(node) and node in required_fw_nodes: + nx_graph.add_edge("source", node.name + "_in", capacity=math.inf) + + # Checks if a node is actually a tuple. Can be simplified to just an isinstance check if we always use faketensors. + is_non_tensor_node = (('val' not in node.meta and 'tensor_meta' not in node.meta) or + ('val' in node.meta and not isinstance(node.meta['val'], torch.Tensor))) + + if is_sym_node(node): + weight = sym_node_size(node) + elif is_non_tensor_node: + weight = 0 if isinstance(node.meta.get("val"), BackwardState) else math.inf + else: + weight = get_node_weight(node) + + # Creates the weights on the "node" edge + nx_graph.add_edge(node.name + "_in", node.name + "_out", capacity=weight) + for user in node.users: + nx_graph.add_edge(node.name + "_out", user.name + "_in", capacity=math.inf) + + try: + cut_value, partition = nx.minimum_cut(nx_graph, "source", "sink") + except Exception: + print('Failed to compute min-cut on following graph:') + print('\n'.join(nx.readwrite.edgelist.generate_edgelist(nx_graph))) + raise + + reachable, non_reachable = partition + cutset = set() + for u, nbrs in ((n, nx_graph[n]) for n in reachable): + cutset.update((u, v) for v in nbrs if v in non_reachable) + + cut_nodes = set() + for node_in, node_out in cutset: + assert node_in[:-3] == node_out[:-4] + node_name = node_in[:-3] + cut_nodes.add(node_name) + + # To make this stuff deterministic + node_idx = {node: idx for idx, node in enumerate(joint_module.graph.nodes)} + saved_values = sorted((name_to_node[node] for node in cut_nodes), key=lambda x: node_idx[x]) + # save_for_backward on tensors and stashes symints in autograd .ctx + saved_sym_nodes = list(filter(is_sym_node, saved_values)) + saved_values = list(filter(lambda n: not is_sym_node(n), saved_values)) + # NB: saved_sym_nodes will be mutated to reflect the actual saved symbols + fw_module, bw_module = _extract_fwd_bwd_modules( + joint_module, saved_values, saved_sym_nodes=saved_sym_nodes, num_fwd_outputs=num_fwd_outputs) + + if graph_has_recomputable_ops: + if graph_has_recomputable_rng_ops: + fw_module, bw_module = functionalize_rng_ops( + joint_module, fw_module, bw_module, len(saved_sym_nodes) + ) + bw_module = reordering_to_mimic_autograd_engine(bw_module) + + if AOT_PARTITIONER_DEBUG: + print("Theoretical Activations Stored: ", sum([_size_of(i) for i in saved_values]) / 1e9) + fw_module_nodes = {node.name for node in fw_module.graph.nodes if node.op == 'call_function'} + bw_module_nodes = {node.name for node in bw_module.graph.nodes if node.op == 'call_function'} + remat_nodes = fw_module_nodes & bw_module_nodes + + counts = defaultdict(int) + for node in fw_module.graph.nodes: + if node.name in remat_nodes and hasattr(node.target, '_overloadpacket'): + counts[str(node.target._overloadpacket)] += 1 + print(f"# remat/fw/bw: {len(remat_nodes)}/{len(fw_module_nodes)}/{len(bw_module_nodes)}") + print("Count of Ops Rematerialized: ", sorted(counts.items(), key=lambda x: x[1], reverse=True)) + return fw_module, bw_module + + +def draw_graph( + traced: torch.fx.GraphModule, + fname: 
str, + figname: str = "fx_graph", + clear_meta: bool = True, + prog: Union[str, List[str]] = None, + parse_stack_trace: bool = False, + dot_graph_shape: Optional[str] = None, +) -> None: + if clear_meta: + new_graph = copy.deepcopy(traced.graph) + traced = fx.GraphModule(traced, new_graph) + for node in traced.graph.nodes: + node.meta = {} + base, ext = os.path.splitext(fname) + if not ext: + ext = ".svg" + print(f"Writing FX graph to file: {base}{ext}") + g = graph_drawer.FxGraphDrawer( + traced, + figname, + parse_stack_trace=parse_stack_trace, + dot_graph_shape=dot_graph_shape, + ) + x = g.get_main_dot_graph() + write_method = getattr(x, "write_" + ext.lstrip(".")) + fname = f"{base}{ext}" + if prog is None: + write_method(fname) + else: + write_method(fname, prog=prog) + + +def draw_joint_graph( + graph: torch.fx.GraphModule, + joint_inputs, + file_name: str = "full_graph.png", + dot_graph_shape: Optional[str] = None, +): + draw_graph(graph, file_name, dot_graph_shape=dot_graph_shape) + return default_partition(graph, joint_inputs) diff --git a/MLPY/Lib/site-packages/torch/_functorch/pyfunctorch.py b/MLPY/Lib/site-packages/torch/_functorch/pyfunctorch.py new file mode 100644 index 0000000000000000000000000000000000000000..4ae0c7c353673d8156d77e70b0f81b1ce2eea83b --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/pyfunctorch.py @@ -0,0 +1,252 @@ +from abc import ABC, abstractmethod +import contextlib +from typing import Any, List, Tuple +import torch +import torch.utils._pytree as pytree +from torch._C._functorch import ( + TransformType, + RandomnessType, + CInterpreter, + CGradInterpreterPtr, + CFunctionalizeInterpreterPtr, + CVmapInterpreterPtr, + CJvpInterpreterPtr, + pop_dynamic_layer_stack, + push_dynamic_layer_stack, +) +from torch.autograd.forward_ad import _set_fwd_grad_enabled + +""" +This file contains the functorch integration with PyDispatcher. + +PyDispatcher does not understand functorch's DynamicLayerStack dispatching +logic because it is entirely implemented in C++ in the fallbacks for two +dispatch keys, FuncTorchDynamicLayer{Front, Back}Mode (PyDispatcher is unable +to directly reuse C++ boxed fallbacks). + +Instead of trying to hammer PyDispatcher into understanding those fallbacks, +we re-implement the logic of peeking the top of the stack for an interpreter, +selecting the interpreter to dispatch on, etc, in Python. This leads to a +simpler design. + +The main difference between C++ functorch and PyDispatcher's functorch logic +is that: +- C++ functorch needs to manually tweak dispatch keys to ping-pong between + DynamicLayerFrontMode and DynamicLayerBackMode. +- PyDispatcher's functorch logic pops an Interpreter from the top of the stack + and asks it to execute the rule associated with the Interpreter. + +In C++ we do the ping-pong because e.g. vmap rules are associated with the +batched DispatchKey, but in PyDispatcher we are able to avoid this by asking +the user to register a batching rule directly to a transform that an +interpreter then invokes. +""" + + +# FuncTorchInterpreter is the Python version of Interpreter (recall that +# the DynamicLayerStack is a stack of interpreters). +# It is a wrapper around the actual C++ Interpreter object. +# +# Keep the methods in sync with aten/src/ATen/functorch/Interpreter.h +class FuncTorchInterpreter(ABC): + def __init__(self, cptr: Any): + self._cptr = cptr + + # Process an operation. eg for vmap, this is invoking a batching rule. 
+ # Conceptually this is analogous to Interpreter::process in C++ + @abstractmethod + def process(self, op, args, kwargs): + pass + + # lower an operation from this Interpreter to the next Interpreter on the stack. + # Concretely, this involves temporarily popping the current Interpreter. + # Conceptually this is analogous to Interpreter::sendToNextInterpreter in C++ + def lower(self): + return temporarily_pop_interpreter_stack() + + def level(self): + return self._cptr.level() + + def key(self): + return self._cptr.key() + + def get_state(self): + raise NotImplementedError() + + def check_state(self, state): + return state == self.get_state() + + +@contextlib.contextmanager +def temporarily_pop_interpreter_stack(): + try: + saved = pop_dynamic_layer_stack() + yield + finally: + push_dynamic_layer_stack(saved) + + +class VmapInterpreter(FuncTorchInterpreter): + def __init__(self, cdata: CInterpreter): + assert cdata.key() == TransformType.Vmap + # NOTE: [Interpreter cdata vs cptr] + # cdata is a generic CInterpreter. We wrap it in a CVmapInterpreterPtr + # so that we can access methods specific to the vmap interpreter + self._cdata = cdata + self._cptr = CVmapInterpreterPtr(cdata) + + def process(self, op, args, kwargs): + kernel = op.functorch_table[TransformType.Vmap] + return kernel(self, *args, **kwargs) + + def batch_size(self): + return self._cptr.batchSize() + + def randomness(self): + typ = self._cptr.randomness() + if typ == RandomnessType.Error: + return "error" + elif typ == RandomnessType.Same: + return "same" + elif typ == RandomnessType.Different: + return "different" + raise RuntimeError(f"Unknown RandomnessType: {typ}") + + def get_state(self): + return (self.key().name, self.level(), self.randomness()) + + +@contextlib.contextmanager +def nested(*contexts): + with contextlib.ExitStack() as stack: + for ctx in contexts: + stack.enter_context(ctx) + yield contexts + + +class GradInterpreter(FuncTorchInterpreter): + def __init__(self, cdata: CInterpreter): + assert cdata.key() == TransformType.Grad + # See NOTE: [Interpreter cdata vs cptr] + self._cdata = cdata + self._cptr = CGradInterpreterPtr(cdata) + + def lift(self, args, kwargs): + args, kwargs = pytree.tree_map_only(torch.Tensor, self._cptr.lift, [args, kwargs]) + return args, kwargs + + def process(self, op, args, kwargs): + kernel = op.functorch_table[TransformType.Grad] + args, kwargs = self.lift(args, kwargs) + return kernel(self, *args, **kwargs) + + # GradInterpreter has custom lower because of the no_grad interaction + # See NOTE [grad and vjp interaction with no_grad] + # This logic is mirrored from C++ GradInterpreterPtr::sendToNextInterpreter + def lower(self): + prev_grad_mode = self.prev_grad_mode() + if not prev_grad_mode: + return nested(torch.no_grad(), super().lower()) + return super().lower() + + def prev_grad_mode(self): + return self._cptr.prevGradMode() + + def get_state(self): + return (self.key().name, self.level(), self.prev_grad_mode()) + + +class JvpInterpreter(FuncTorchInterpreter): + def __init__(self, cdata: CInterpreter): + assert cdata.key() == TransformType.Jvp + # See NOTE: [Interpreter cdata vs cptr] + self._cdata = cdata + self._cptr = CJvpInterpreterPtr(cdata) + + def lift(self, args, kwargs): + args, kwargs = pytree.tree_map_only(torch.Tensor, self._cptr.lift, [args, kwargs]) + return args, kwargs + + def process(self, op, args, kwargs): + kernel = op.functorch_table[TransformType.Jvp] + args, kwargs = self.lift(args, kwargs) + return kernel(self, *args, **kwargs) + + # Jvp has 
custom lower because of the no_fwd_grad interaction + # See NOTE [grad and vjp interaction with no_grad] for related info. + # This logic is mirrored from C++ JvpInterpreterPtr::sendToNextInterpreter + def lower(self): + prev_fwd_grad_mode = self.prev_fwd_grad_mode() + if not prev_fwd_grad_mode: + return nested(_set_fwd_grad_enabled(False), super().lower()) + return super().lower() + + def prev_fwd_grad_mode(self): + return self._cptr.prevFwdGradMode() + + +class FunctionalizeInterpreter(FuncTorchInterpreter): + def __init__(self, cdata: CInterpreter): + assert cdata.key() == TransformType.Functionalize + self._cdata = cdata + self._cptr = CFunctionalizeInterpreterPtr(cdata) + + def process(self, op, args, kwargs): + kernel = op.functorch_table[TransformType.Functionalize] + return kernel(self, *args, **kwargs) + + def functionalize_add_back_views(self): + return self._cptr.functionalizeAddBackViews() + + +def coerce_cinterpreter(cinterpreter: CInterpreter) -> FuncTorchInterpreter: + key = cinterpreter.key() + if key == TransformType.Grad: + return GradInterpreter(cinterpreter) + if key == TransformType.Vmap: + return VmapInterpreter(cinterpreter) + if key == TransformType.Jvp: + return JvpInterpreter(cinterpreter) + if key == TransformType.Functionalize: + return FunctionalizeInterpreter(cinterpreter) + raise RuntimeError(f"NYI: PyDispatcher has not implemented support for {key}") + + +def retrieve_current_functorch_interpreter() -> FuncTorchInterpreter: + interpreter = torch._C._functorch.peek_interpreter_stack() + assert interpreter is not None + return coerce_cinterpreter(interpreter) + + +def retrieve_all_functorch_interpreters() -> List[FuncTorchInterpreter]: + cis = torch._C._functorch.get_interpreter_stack() + if cis is None: + return [] + return [coerce_cinterpreter(ci) for ci in cis] + + +def compare_functorch_state(states: List[Tuple[Any, ...]]) -> bool: + # There are four possible cases covered here: + # 1. Current stack empty AND stack when generated not empty -> Invalidate + # 2. Current stack not empty AND stack when generated empty -> Invalidate + # 3. Current stack and generated stack empty -> Valid FX graph + # 4. Current stack and generated stack not empty -> Valid if both states match + peek = torch._C._functorch.peek_interpreter_stack() + if (peek is None and len(states) != 0) or (peek is not None and len(states) == 0): + return False + + cis = retrieve_all_functorch_interpreters() + return len(cis) == len(states) and \ + all(ci.check_state(state) for ci, state in zip(cis, states)) + + +def dispatch_functorch(op, args, kwargs): + interpreter = retrieve_current_functorch_interpreter() + # In traditional PyTorch operators, DispatchKey::FuncTorchTensorWrapper's + # unwrap_dead_tensors fallback handles unwrapping dead tensor wrappers. + # PyDispatcher sidesteps the PyTorch dispatcher when dealing with functorch + # transforms, so we manually unwrap the dead tensors here. + # This logic won't need to exist when we have mode-only functorch. + args, kwargs = pytree.tree_map_only( + torch.Tensor, torch._C._functorch.unwrap_if_dead, (args, kwargs)) + return interpreter.process(op, args, kwargs) diff --git a/MLPY/Lib/site-packages/torch/_functorch/python_key.py b/MLPY/Lib/site-packages/torch/_functorch/python_key.py new file mode 100644 index 0000000000000000000000000000000000000000..ee0f6c14124fa1d8a2bd59b198d38e9d4368f5ec --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/python_key.py @@ -0,0 +1,9 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +__all__ = ["make_fx", "dispatch_trace", "PythonKeyTracer", "pythonkey_decompose"] +from torch.fx.experimental.proxy_tensor import make_fx, dispatch_trace, PythonKeyTracer, decompose + +pythonkey_decompose = decompose diff --git a/MLPY/Lib/site-packages/torch/_functorch/pytree_hacks.py b/MLPY/Lib/site-packages/torch/_functorch/pytree_hacks.py new file mode 100644 index 0000000000000000000000000000000000000000..c016206a0267b555a39e2a441b308a3156a656c0 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/pytree_hacks.py @@ -0,0 +1,22 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import warnings + +# TODO: remove this file when the migration of the pytree utility is done +from torch.utils._pytree import tree_map_, treespec_pprint + + +__all__ = ["tree_map_", "treespec_pprint"] + + +with warnings.catch_warnings(): + warnings.simplefilter("always") + warnings.warn( + "torch._functorch.pytree_hacks is deprecated and will be removed in a future release. " + "Please use torch.utils._pytree instead.", + DeprecationWarning, + ) diff --git a/MLPY/Lib/site-packages/torch/_functorch/top_operators_github_usage.py b/MLPY/Lib/site-packages/torch/_functorch/top_operators_github_usage.py new file mode 100644 index 0000000000000000000000000000000000000000..ac5422f8ffe833bf2f7346eb67474cfeac99af0a --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/top_operators_github_usage.py @@ -0,0 +1,625 @@ +# mypy: ignore-errors + +""" +From https://docs.google.com/spreadsheets/d/12R3nCOLskxPYjjiNkdqy4OdQ65eQp_htebXGODsjSeA/edit#gid=0 +Try to keep this list in sync with that. 
+""" +top_torch = [ + ("t", 6837449), + ("tensor", 585786), + ("mode", 462182), + ("cat", 394818), + ("max", 368038), + ("zeros", 329495), + ("load", 327756), + ("no_grad", 294694), + ("save", 265130), + ("from_numpy", 243063), + ("manual_seed", 165044), + ("ones", 153696), + ("randn", 150796), + ("stack", 133358), + ("sum", 130772), + ("arange", 98087), + ("rand", 94715), + ("mean", 88546), + ("exp", 73883), + ("zeros_like", 72831), + ("min", 72248), + ("sigmoid", 66798), + ("log", 62135), + ("matmul", 47811), + ("clamp", 45304), + ("sqrt", 44911), + ("abs", 43535), + ("tanh", 42793), + ("empty", 40311), + ("argmax", 38435), + ("bmm", 33984), + ("pow", 33571), + ("norm", 31125), + ("mm", 30995), + ("is_tensor", 29546), + ("ones_like", 29512), + ("nonzero", 28681), + ("full", 28373), + ("unsqueeze", 27911), + ("where", 26585), + ("randperm", 26450), + ("eye", 24342), + ("mul", 23236), + ("topk", 22537), + ("as_tensor", 21967), + ("sort", 21412), + ("squeeze", 20863), + ("randint", 20771), + ("linspace", 20041), + ("add", 19201), + ("transpose", 18663), + ("split", 18325), + ("gather", 17904), + ("set_grad_enabled", 16013), + ("sin", 15669), + ("cos", 15562), + ("div", 15513), + ("index_select", 14866), + ("multinomial", 14331), + ("flatten", 14267), + ("isnan", 14170), + ("randn_like", 13096), + ("eq", 12680), + ("einsum", 12480), + ("round", 12367), + ("floor", 11628), + ("allclose", 11000), + ("reshape", 10605), + ("diag", 10167), + ("chunk", 9581), + ("std", 9379), + ("set_default_tensor_type", 9281), + ("triu", 8559), + ("meshgrid", 8292), + ("set_num_threads", 8126), + ("unique", 7964), + ("full_like", 7780), + ("tril", 7538), + ("dot", 7275), + ("sign", 6943), + ("equal", 6916), + ("normal", 6750), + ("cumsum", 6556), + ("dist", 6058), + ("isfinite", 6030), + ("gt", 5935), + ("set_printoptions", 5888), + ("range", 5491), + ("empty_like", 5351), + ("flip", 5342), + ("masked_select", 5341), + ("bernoulli", 5262), + ("atan", 5253), + ("var", 5247), + ("prod", 5200), + ("erf", 5088), + ("inverse", 5072), + ("addmm", 4854), + ("logsumexp", 4582), + ("fft", 4436), + ("lt", 4421), + ("log2", 4316), + ("enable_grad", 4238), + ("rand_like", 4187), + ("argsort", 3972), + ("seed", 3932), + ("mv", 3547), + ("ger", 3309), + ("ge", 3248), + ("atan2", 3210), + ("ceil", 3202), + ("ne", 3075), + ("bincount", 3063), + ("acos", 3055), + ("rsqrt", 3031), + ("svd", 3029), + ("numel", 3003), + ("log1p", 2840), + ("unbind", 2808), + ("le", 2714), + ("isinf", 2707), + ("cross", 2646), + ("set_default_dtype", 2536), + ("argmin", 2535), + ("sparse_coo_tensor", 2489), + ("log10", 2304), + ("kthvalue", 2192), + ("set_rng_state", 2158), + ("get_rng_state", 1996), + ("get_default_dtype", 1879), + ("det", 1868), + ("qr", 1864), + ("histc", 1852), + ("symeig", 1832), + ("trace", 1801), + ("median", 1795), + ("addcmul", 1751), + ("remainder", 1717), + ("baddbmm", 1693), + ("lgamma", 1665), + ("repeat_interleave", 1598), + ("fmod", 1576), + ("reciprocal", 1575), + ("tan", 1560), + ("initial_seed", 1532), + ("take", 1529), + ("stft", 1487), + ("get_num_threads", 1477), + ("real", 1459), + ("cholesky", 1406), + ("quantize_per_tensor", 1392), + ("diag_embed", 1364), + ("lerp", 1363), + ("asin", 1345), + ("eig", 1333), + ("trunc", 1290), + ("diagonal", 1287), + ("cosh", 1279), + ("rfft", 1269), + ("cumprod", 1260), + ("addr", 1211), + ("roll", 1198), + ("narrow", 1188), + ("digamma", 1172), + ("square", 1163), + ("sinh", 1131), + ("logspace", 1084), + ("broadcast_tensors", 1070), + ("irfft", 1013), + ("frac", 997), + 
("hann_window", 994), + ("solve", 989), + ("logdet", 977), + ("expm1", 968), + ("cdist", 946), + ("addmv", 903), + ("randint_like", 888), + ("tensordot", 888), + ("ifft", 877), + ("true_divide", 854), + ("erfinv", 830), + ("addcdiv", 819), + ("addbmm", 813), + ("renorm", 781), + ("pinverse", 753), + ("isclose", 740), + ("erfc", 729), + ("is_storage", 725), + ("triangular_solve", 723), + ("rot90", 709), + ("logical_not", 686), + ("geqrf", 681), + ("slogdet", 677), + ("lu", 665), + ("hamming_window", 659), + ("orgqr", 651), + ("ormqr", 622), + ("is_floating_point", 602), + ("diagflat", 562), + ("cholesky_solve", 559), + ("tril_indices", 552), + ("chain_matmul", 551), + ("triu_indices", 548), + ("angle", 522), + ("poisson", 505), + ("matrix_power", 485), + ("unique_consecutive", 471), + ("quantize_per_channel", 465), + ("std_mean", 458), + ("bartlett_window", 447), + ("var_mean", 428), + ("lstsq", 421), + ("logical_and", 419), + ("mvlgamma", 411), + ("blackman_window", 400), + ("bitwise_not", 395), + ("cholesky_inverse", 388), + ("as_strided", 384), + ("floor_divide", 353), + ("cartesian_prod", 321), + ("lu_solve", 317), + ("set_flush_denormal", 310), + ("empty_strided", 283), + ("logical_xor", 282), + ("polygamma", 282), + ("logical_or", 280), + ("set_num_interop_threads", 278), + ("combinations", 274), + ("trapz", 270), + ("matrix_rank", 260), + ("lu_unpack", 255), + ("result_type", 244), + ("conj", 231), + ("cummax", 230), + ("lobpcg", 229), + ("bitwise_xor", 217), + ("promote_types", 213), + ("get_num_interop_threads", 211), + ("cummin", 205), + ("bitwise_and", 198), + ("dequantize", 192), + ("bitwise_or", 191), + ("imag", 191), + ("can_cast", 184), + ("istft", 180), + ("compiled_with_cxx11_abi", 159), + ("is_complex", 151), + ("block_diag", 136), + ("pca_lowrank", 124), + ("absolute", 122), + ("svd_lowrank", 108), + ("neg", 2), +] + +top_nn_functional = [ + ("nn.functional.softmax", 10522), + ("nn.functional.relu", 8572), + ("nn.functional.interpolate", 7277), + ("nn.functional.pad", 5207), + ("nn.functional.log_softmax", 4699), + ("nn.functional.normalize", 2338), + ("nn.functional.cross_entropy", 2083), + ("nn.functional.grid_sample", 1970), + ("nn.functional.one_hot", 1967), + ("nn.functional.mse_loss", 1920), + ("nn.functional.conv2d", 1593), + ("nn.functional.dropout", 1516), + ("nn.functional.softplus", 1385), + ("nn.functional.sigmoid", 1128), + ("nn.functional.linear", 1036), + ("nn.functional.gelu", 930), + ("nn.functional.avg_pool2d", 899), + ("nn.functional.max_pool2d", 876), + ("nn.functional.nll_loss", 863), + ("nn.functional.embedding", 737), + ("nn.functional.tanh", 664), + ("nn.functional.leaky_relu", 640), + ("nn.functional.adaptive_avg_pool2d", 633), + ("nn.functional.cosine_similarity", 627), + ("nn.functional.unfold", 609), + ("nn.functional.conv1d", 596), + ("nn.functional.binary_cross_entropy_with_logits", 591), + ("nn.functional.l1_loss", 571), + ("nn.functional.binary_cross_entropy", 492), + ("nn.functional.elu", 416), + ("nn.functional.batch_norm", 413), + ("nn.functional.upsample", 413), + ("nn.functional.fold", 305), + ("nn.functional.affine_grid", 298), + ("nn.functional.max_pool1d", 297), + ("nn.functional.torch", 294), + ("nn.functional.threshold", 263), + ("nn.functional.smooth_l1_loss", 262), + ("nn.functional.pairwise_distance", 253), + ("nn.functional.logsigmoid", 243), + ("nn.functional.adaptive_max_pool2d", 235), + ("nn.functional.relu6", 213), + ("nn.functional.pixel_shuffle", 209), + ("nn.functional.avg_pool3d", 203), + ("nn.functional.bilinear", 
203), + ("nn.functional.conv_transpose2d", 201), + ("nn.functional.gumbel_softmax", 197), + ("nn.functional.max_unpool2d", 196), + ("nn.functional.kl_div", 191), + ("nn.functional.hardtanh", 189), + ("nn.functional.ctc_loss", 185), + ("nn.functional.layer_norm", 178), + ("nn.functional.conv3d", 172), + ("nn.functional.max_unpool3d", 167), + ("nn.functional.hardshrink", 165), + ("nn.functional.hardswish", 156), + ("nn.functional.selu", 156), + ("nn.functional.glu", 155), + ("nn.functional.assert_int_or_pair", 150), + ("nn.functional.hardsigmoid", 146), + ("nn.functional.upsample_bilinear", 146), + ("nn.functional.max_pool3d", 140), + ("nn.functional.adaptive_avg_pool3d", 139), + ("nn.functional.instance_norm", 124), + ("nn.functional.embedding_bag", 122), + ("nn.functional.upsample_nearest", 110), + ("nn.functional.avg_pool1d", 105), + ("nn.functional.prelu", 102), + ("nn.functional.celu", 92), + ("nn.functional.dropout2d", 86), + ("nn.functional.hinge_embedding_loss", 82), + ("nn.functional.softsign", 81), + ("nn.functional.max_unpool1d", 74), + ("nn.functional.silu", 74), + ("nn.functional.softshrink", 70), + ("nn.functional.leaky_relu_", 68), + ("nn.functional.softmin", 67), + ("nn.functional.channel_shuffle", 66), + ("nn.functional.multilabel_margin_loss", 66), + ("nn.functional.dropout3d", 65), + ("nn.functional.multi_margin_loss", 65), + ("nn.functional.lp_pool2d", 64), + ("nn.functional.conv_transpose1d", 62), + ("nn.functional.triplet_margin_loss", 62), + ("nn.functional.tanhshrink", 61), + ("nn.functional.adaptive_max_pool1d", 59), + ("nn.functional.cosine_embedding_loss", 58), + ("nn.functional.multi_head_attention_forward", 58), + ("nn.functional.max_pool1d_with_indices", 53), + ("nn.functional.poisson_nll_loss", 53), + ("nn.functional.margin_ranking_loss", 52), + ("nn.functional.soft_margin_loss", 52), + ("nn.functional.adaptive_max_pool3d", 51), + ("nn.functional.group_norm", 51), + ("nn.functional.local_response_norm", 51), + ("nn.functional.multilabel_soft_margin_loss", 51), + ("nn.functional.relu_", 50), + ("nn.functional.alpha_dropout", 49), + ("nn.functional.feature_alpha_dropout", 49), + ("nn.functional.lp_pool1d", 49), + ("nn.functional.adaptive_max_pool1d_with_indices", 48), + ("nn.functional.adaptive_max_pool2d_with_indices", 48), + ("nn.functional.adaptive_max_pool3d_with_indices", 48), + ("nn.functional.fractional_max_pool2d", 48), + ("nn.functional.fractional_max_pool2d_with_indices", 48), + ("nn.functional.fractional_max_pool3d", 48), + ("nn.functional.fractional_max_pool3d_with_indices", 48), + ("nn.functional.max_pool2d_with_indices", 48), + ("nn.functional.max_pool3d_with_indices", 48), + ("nn.functional.handle_torch_function", 47), + ("nn.functional.has_torch_function", 47), + ("nn.functional.adaptive_avg_pool1d", 43), + ("nn.functional.pdist", 43), + ("nn.functional.rrelu_", 37), + ("nn.functional.elu_", 34), + ("nn.functional.boolean_dispatch", 33), + ("nn.functional.hardtanh_", 26), + ("nn.functional.triplet_margin_with_distance_loss", 23), + ("nn.functional.selu_", 20), + ("nn.functional.pixel_unshuffle", 19), + ("nn.functional.conv_transpose3d", 18), + ("nn.functional.gaussian_nll_loss", 15), + ("nn.functional.has_torch_function_unary", 15), + ("nn.functional.has_torch_function_variadic", 15), + ("nn.functional.celu_", 13), + ("nn.functional.huber_loss", 7), + ("nn.functional.mish", 4), + ("nn.functional.threshold_", 3), + ("nn.functional.grad", 2), + ("nn.functional.conv_tbc", 1), + ("nn.functional.math", 1), +] + +top_nn_module = [ + ("nn.Module", 
927129, None), + ("nn.Linear", 530688, "nn.functional.linear"), + ("nn.Sequential", 384968, None), + ("nn.Conv2d", 383320, "nn.functional.conv2d"), + ("nn.ReLU", 318877, "nn.functional.relu"), + ("nn.BatchNorm2d", 233265, "nn.functional.batch_norm"), + ("nn.Dropout", 179268, "nn.functional.dropout"), + ("nn.ModuleList", 171225, None), + ("nn.Parameter", 153291, None), + ("nn.CrossEntropyLoss", 152696, "nn.functional.cross_entropy"), + ("nn.MaxPool2d", 138619, "nn.functional.max_pool2d"), + ("nn.Embedding", 111844, "nn.functional.embedding"), + ("nn.DataParallel", 104238, None), + ("nn.MSELoss", 82954, "nn.functional.mse_loss"), + ("nn.Sigmoid", 75810, "nn.functional.sigmoid"), + ("nn.LeakyReLU", 65632, "nn.functional.leaky_relu"), + ("nn.BatchNorm1d", 65374, "nn.functional.batch_norm"), + ("nn.Softmax", 65114, "nn.functional.softmax"), + ("nn.Tanh", 59445, "nn.functional.tanh"), + ("nn.AdaptiveAvgPool2d", 59071, "nn.functional.adaptive_avg_pool2d"), + ("nn.AvgPool2d", 58377, "nn.functional.avg_pool2d"), + ("nn.ConvTranspose2d", 57524, "nn.functional.conv_transpose2d"), + ("nn.LSTM", 57411, None), + ("nn.Conv1d", 41108, "nn.functional.conv1d"), + ("nn.LayerNorm", 36089, "nn.functional.layer_norm"), + ("nn.BCELoss", 34005, "nn.functional.binary_cross_entropy"), + ("nn.Upsample", 32527, "nn.functional.interpolate"), + ("nn.BCEWithLogitsLoss", 29944, "nn.functional.binary_cross_entropy_with_logits"), + ("nn.GRU", 25421, None), + ("nn.Dropout2d", 23512, "nn.functional.dropout2d"), + ("nn.LogSoftmax", 22897, "nn.functional.log_softmax"), + ("nn.L1Loss", 22778, "nn.functional.l1_loss"), + ("nn.GroupNorm", 22183, "nn.functional.group_norm"), + ("nn.NLLLoss", 21751, "nn.functional.nll_loss"), + ("nn.Conv3d", 20874, "nn.functional.conv3d"), + ("nn.Identity", 17911, None), + ("nn.InstanceNorm2d", 16426, "nn.functional.instance_norm"), + ("nn.BatchNorm3d", 16378, "nn.functional.batch_norm"), + ("nn.PReLU", 13472, "nn.functional.prelu"), + ("nn.ReLU6", 12622, "nn.functional.relu6"), + ("nn.ELU", 12508, "nn.functional.elu"), + ("nn.LSTMCell", 10885, None), + ("nn.Flatten", 10384, "torch.flatten"), + ("nn.ModuleDict", 10255, None), + ("nn.ReflectionPad2d", 9954, "nn.functional.pad"), + ("nn.MaxPool3d", 9526, "nn.functional.max_pool3d"), + ("nn.MaxPool1d", 9154, "nn.functional.max_pool1d"), + ("nn.RNN", 9154, None), + ("nn.ZeroPad2d", 8847, "nn.functional.pad"), + ("nn.ParameterList", 7702, None), + ("nn.SyncBatchNorm", 6814, None), + ("nn.PixelShuffle", 6571, "nn.functional.pixel_shuffle"), + ("nn.SmoothL1Loss", 6517, "nn.functional.smooth_l1_loss"), + ("nn.Hardswish", 6458, "nn.functional.hardswish"), + ("nn.AdaptiveMaxPool2d", 6071, "nn.functional.adaptive_max_pool2d"), + ("nn.SELU", 6043, "nn.functional.selu"), + ("nn.ConvTranspose3d", 6039, "nn.functional.conv_transpose3d"), + ("nn.GRUCell", 5840, None), + ("nn.ReplicationPad2d", 5600, "nn.functional.pad"), + ("nn.KLDivLoss", 5541, "nn.functional.kl_div"), + ("nn.ConvTranspose1d", 5183, "nn.functional.conv_transpose1d"), + ("nn.Softplus", 5120, "nn.functional.softplus"), + ("nn.SiLU", 4895, "nn.functional.silu"), + ("nn.AvgPool3d", 4523, "nn.functional.avg_pool3d"), + ("nn.CosineSimilarity", 4058, "nn.functional.cosine_similarity"), + ("nn.GELU", 3932, "nn.functional.gelu"), + ("nn.UpsamplingBilinear2d", 3673, "nn.functional.interpolate"), + ("nn.InstanceNorm1d", 3658, "nn.functional.instance_norm"), + ("nn.Transformer", 3604, None), + ("nn.MultiheadAttention", 3435, "nn.functional.multi_head_attention_forward"), + ("nn.AvgPool1d", 3195, 
"nn.functional.avg_pool1d"), + ("nn.Dropout3d", 2964, "nn.functional.dropout3d"), + ("nn.AdaptiveAvgPool3d", 2915, "nn.functional.adaptive_avg_pool3d"), + ("nn.InstanceNorm3d", 2893, "nn.functional.instance_norm"), + ("nn.Hardtanh", 2613, "nn.functional.hardtanh"), + ("nn.MarginRankingLoss", 2568, "nn.functional.margin_ranking_loss"), + ("nn.GLU", 2526, "nn.functional.glu"), + ("nn.AdaptiveAvgPool1d", 2481, "nn.functional.adaptive_avg_pool1d"), + ("nn.EmbeddingBag", 2344, "nn.functional.embedding_bag"), + ("nn.TransformerEncoderLayer", 2292, None), + ("nn.TransformerEncoder", 2091, None), + ("nn.MaxUnpool2d", 2031, "nn.functional.max_unpool2d"), + ("nn.UpsamplingNearest2d", 2004, "nn.functional.interpolate"), + ("nn.ConstantPad1d", 1904, "nn.functional.pad"), + ("nn.ConstantPad2d", 1791, "nn.functional.pad"), + ("nn.CTCLoss", 1789, "nn.functional.ctc_loss"), + ("nn.AdaptiveMaxPool1d", 1713, "nn.functional.adaptive_max_pool1d"), + ("nn.AdaptiveLogSoftmaxWithLoss", 1665, None), + ("nn.Bilinear", 1664, "nn.functional.bilinear"), + ("nn.RNNCell", 1653, None), + ("nn.MultiLabelSoftMarginLoss", 1624, "nn.functional.multilabel_soft_margin_loss"), + ("nn.Unfold", 1452, "nn.functional.unfold"), + ("nn.RReLU", 1431, "nn.functional.rrelu"), + ("nn.CosineEmbeddingLoss", 1357, "nn.functional.cosine_embedding_loss"), + ("nn.LocalResponseNorm", 1331, "nn.functional.local_response_norm"), + ("nn.Softmax2d", 1300, "nn.functional.softmax"), + ("nn.PairwiseDistance", 1241, "nn.functional.pairwise_distance"), + ("nn.LogSigmoid", 1235, "nn.functional.logsigmoid"), + ("nn.TripletMarginLoss", 1230, "nn.functional.triplet_margin_loss"), + ("nn.RNNBase", 1133, None), + ("nn.Threshold", 1043, "nn.functional.threshold"), + ("nn.AdaptiveMaxPool3d", 1025, "nn.functional.adaptive_max_pool3d"), + ("nn.CELU", 1018, "nn.functional.celu"), + ("nn.NLLLoss2d", 966, "nn.functional.nll_loss"), + ("nn.Softsign", 877, "nn.functional.softsign"), + ("nn.ReplicationPad1d", 862, "nn.functional.pad"), + ("nn.SoftMarginLoss", 856, "nn.functional.soft_margin_loss"), + ("nn.ParameterDict", 742, None), + ("nn.ReflectionPad1d", 731, "nn.functional.pad"), + ("nn.Softshrink", 713, "nn.functional.softshrink"), + ("nn.AlphaDropout", 710, "nn.functional.alpha_dropout"), + ("nn.Tanhshrink", 681, "nn.functional.tanhshrink"), + ("nn.PoissonNLLLoss", 676, "nn.functional.poisson_nll_loss"), + ("nn.MaxUnpool3d", 660, "nn.functional.max_unpool3d"), + ("nn.Fold", 630, "nn.functional.fold"), + ("nn.MultiMarginLoss", 622, "nn.functional.multi_margin_loss"), + ("nn.TransformerDecoderLayer", 614, None), + ("nn.TransformerDecoder", 607, None), + ("nn.Hardshrink", 592, "nn.functional.hardshrink"), + ("nn.ConstantPad3d", 582, "nn.functional.pad"), + ("nn.MultiLabelMarginLoss", 580, "nn.functional.multilabel_margin_loss"), + ("nn.LPPool2d", 550, "nn.functional.lp_pool2d"), + ("nn.Softmin", 537, "nn.functional.softmin"), + ("nn.MaxUnpool1d", 518, "nn.functional.max_unpool1d"), + ("nn.FractionalMaxPool2d", 484, "nn.functional.fractional_max_pool2d"), + ("nn.Hardsigmoid", 477, "nn.functional.hardsigmoid"), + ("nn.ReplicationPad3d", 470, "nn.functional.pad"), + ("nn.HingeEmbeddingLoss", 442, "nn.functional.hinge_embedding_loss"), + ("nn.LPPool1d", 386, "nn.functional.lp_pool1d"), + ("nn.FractionalMaxPool3d", 252, "nn.functional.fractional_max_pool3d"), + ("nn.Container", 217, None), + ("nn.Unflatten", 206, "nn.functional.unflatten"), + ("nn.FeatureAlphaDropout", 136, "nn.functional.feature_alpha_dropout"), + ("nn.TripletMarginWithDistanceLoss", 107, 
"nn.functional.triplet_margin_with_distance_loss"), + ("nn.ChannelShuffle", 90, "nn.functional.channel_shuffle"), + ("nn.RNNCellBase", 88, None), + ("nn.LazyLinear", 81, "nn.functional.linear"), + ("nn.UninitializedParameter", 60, None), + ("nn.CrossMapLRN2d", 59, None), + ("nn.GaussianNLLLoss", 55, "nn.functional.gaussian_nll_loss"), + ("nn.PixelUnshuffle", 45, "nn.functional.pixel_unshuffle"), + ("nn.Mish", 31, "nn.functional.mish"), + ("nn.ReflectionPad3d", 22, "nn.functional.pad"), + ("nn.HuberLoss", 18, "nn.functional.huber_loss"), + ("nn.LazyConv2d", 15, None), + ("nn.LazyConv1d", 9, None), + ("nn.LazyConv3d", 8, None), + ("nn.LazyConvTranspose1d", 8, None), + ("nn.LazyConvTranspose2d", 8, None), + ("nn.LazyConvTranspose3d", 8, None), + ("nn.LazyBatchNorm1d", 3, None), + ("nn.LazyBatchNorm2d", 3, None), + ("nn.LazyBatchNorm3d", 3, None), + ("nn.UninitializedBuffer", 3, None), +] + +# No rankings because these are a little hard to get rankings for +method_only_ops = [ + 'bfloat16', + 'bool', + 'byte', + 'char', + 'contiguous', + 'cpu', + 'cuda', + 'detach', + 'double', + 'expand', + 'expand_as', + 'float', + 'get_device', + 'half', + 'hardshrink', + 'index_add', + 'index_copy', + 'index_fill', + 'index_put', + 'int', + 'is_contiguous', + 'is_pinned', + 'is_set_to', + 'is_shared', + 'is_signed', + 'item', + 'long', + 'masked_scatter', + 'masked_fill', + 'narrow_copy', + 'numpy', + 'pin_memory', + 'repeat', + 'reshape_as', + 'select', + 'short', + 'storage_offset', + 'sum_to_size', + 'to', + 'to_mkldnn', + 'tolist', + 'type', + 'type_as', + 'unfold', + 'view', + 'view_as', +] + + +def get_nn_functional_top_list(): + top_nn_functional_ = dict(top_nn_functional) + for _, count, functional_name in top_nn_module: + if functional_name is None: + continue + if functional_name == 'torch.flatten': + continue + if functional_name not in top_nn_functional_: + top_nn_functional_[functional_name] = count + else: + top_nn_functional_[functional_name] += count + + top_nn_functional_ = list(top_nn_functional_.items()) + top_nn_functional_.sort(key=lambda x: x[1], reverse=True) + return top_nn_functional_ + + +usage_count = {} +for k, v in get_nn_functional_top_list(): + usage_count[k] = v +for k, v in top_torch: + usage_count[k] = v diff --git a/MLPY/Lib/site-packages/torch/_functorch/utils.py b/MLPY/Lib/site-packages/torch/_functorch/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..86e0f7d65aab4c49dc5c91f3d3cee0639a0a3b55 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/utils.py @@ -0,0 +1,41 @@ +import contextlib +import torch +from torch._C._functorch import ( + set_single_level_autograd_function_allowed, + get_single_level_autograd_function_allowed, + unwrap_if_dead, +) +from typing import Union, Tuple + +@contextlib.contextmanager +def enable_single_level_autograd_function(): + try: + prev_state = get_single_level_autograd_function_allowed() + set_single_level_autograd_function_allowed(True) + yield + finally: + set_single_level_autograd_function_allowed(prev_state) + +def unwrap_dead_wrappers(args): + # NB: doesn't use tree_map_only for performance reasons + result = tuple( + unwrap_if_dead(arg) if isinstance(arg, torch.Tensor) else arg + for arg in args + ) + return result + +# Allows one to expose an API in a private submodule publicly as per the definition +# in PyTorch's public api policy. +# +# It is a temporary solution while we figure out if it should be the long-term solution +# or if we should amend PyTorch's public api policy. 
The concern is that this approach +# may not be very robust because it's not clear what __module__ is used for. +# However, both numpy and jax overwrite the __module__ attribute of their APIs +# without problem, so it seems fine. +def exposed_in(module): + def wrapper(fn): + fn.__module__ = module + return fn + return wrapper + +argnums_t = Union[int, Tuple[int, ...]] diff --git a/MLPY/Lib/site-packages/torch/_functorch/vmap.py b/MLPY/Lib/site-packages/torch/_functorch/vmap.py new file mode 100644 index 0000000000000000000000000000000000000000..09339f809caf7a64348db8bde1ea174de13b2240 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_functorch/vmap.py @@ -0,0 +1,452 @@ +# mypy: ignore-errors + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import contextlib +import functools +import threading +from torch import Tensor +from typing import Any, Callable, Optional, Tuple, Union, List +from torch.utils._pytree import ( + tree_flatten, + tree_unflatten, + tree_map_, + _broadcast_to_and_flatten, + TreeSpec, +) +from functools import partial +import os +import itertools + +from torch._C._functorch import ( + _add_batch_dim, + _remove_batch_dim, + _vmap_decrement_nesting, + _vmap_increment_nesting, + is_batchedtensor, +) + +in_dims_t = Union[int, Tuple] +out_dims_t = Union[int, Tuple[int, ...]] + + +def doesnt_support_saved_tensors_hooks(f): + message = ( + "torch.func transforms don't yet support saved tensor hooks. " + "Please open an issue with your use case." + ) + + @functools.wraps(f) + def fn(*args, **kwargs): + with torch.autograd.graph.disable_saved_tensors_hooks(message): + return f(*args, **kwargs) + return fn + + +# Checks that all args-to-be-batched have the same batch dim size +def _validate_and_get_batch_size( + flat_in_dims: List[Optional[int]], + flat_args: List) -> int: + batch_sizes = [arg.size(in_dim) for in_dim, arg in zip(flat_in_dims, flat_args) + if in_dim is not None] + if len(batch_sizes) == 0: + raise ValueError('vmap: Expected at least one Tensor to vmap over') + if batch_sizes and any(size != batch_sizes[0] for size in batch_sizes): + raise ValueError( + f'vmap: Expected all tensors to have the same size in the mapped ' + f'dimension, got sizes {batch_sizes} for the mapped dimension') + return batch_sizes[0] + + +def _num_outputs(batched_outputs: Union[Tensor, Tuple[Tensor, ...]]) -> int: + if isinstance(batched_outputs, tuple): + return len(batched_outputs) + return 1 + +# If value is a tuple, check it has length `num_elements`. +# If value is not a tuple, make a tuple with `value` repeated `num_elements` times + + +def _as_tuple(value: Any, num_elements: int, error_message_lambda: Callable[[], str]) -> Tuple: + if not isinstance(value, tuple): + return (value,) * num_elements + if len(value) != num_elements: + raise ValueError(error_message_lambda()) + return value + + +def _process_batched_inputs( + in_dims: in_dims_t, args: Tuple, func: Callable +) -> Tuple[int, List[Any], List[Any], TreeSpec]: + if not isinstance(in_dims, int) and not isinstance(in_dims, tuple): + raise ValueError( + f'vmap({_get_name(func)}, in_dims={in_dims}, ...)(): ' + f'expected `in_dims` to be int or a (potentially nested) tuple ' + f'matching the structure of inputs, got: {type(in_dims)}.') + if len(args) == 0: + raise ValueError( + f'vmap({_get_name(func)})(): got no inputs. 
Maybe you forgot to add ' + f'inputs, or you are trying to vmap over a function with no inputs. ' + f'The latter is unsupported.') + + flat_args, args_spec = tree_flatten(args) + flat_in_dims = _broadcast_to_and_flatten(in_dims, args_spec) + if flat_in_dims is None: + raise ValueError( + f'vmap({_get_name(func)}, in_dims={in_dims}, ...)(): ' + f'in_dims is not compatible with the structure of `inputs`. ' + f'in_dims has structure {tree_flatten(in_dims)[1]} but inputs ' + f'has structure {args_spec}.') + + for i, (arg, in_dim) in enumerate(zip(flat_args, flat_in_dims)): + if not isinstance(in_dim, int) and in_dim is not None: + raise ValueError( + f'vmap({_get_name(func)}, in_dims={in_dims}, ...)(): ' + f'Got in_dim={in_dim} for an input but in_dim must be either ' + f'an integer dimension or None.') + if isinstance(in_dim, int) and not isinstance(arg, Tensor): + raise ValueError( + f'vmap({_get_name(func)}, in_dims={in_dims}, ...)(): ' + f'Got in_dim={in_dim} for an input but the input is of type ' + f'{type(arg)}. We cannot vmap over non-Tensor arguments, ' + f'please use None as the respective in_dim') + if in_dim is not None and (in_dim < -arg.dim() or in_dim >= arg.dim()): + raise ValueError( + f'vmap({_get_name(func)}, in_dims={in_dims}, ...)(): ' + f'Got in_dim={in_dim} for some input, but that input is a Tensor ' + f'of dimensionality {arg.dim()} so expected in_dim to satisfy ' + f'-{arg.dim()} <= in_dim < {arg.dim()}.') + if in_dim is not None and in_dim < 0: + flat_in_dims[i] = in_dim % arg.dim() + + return _validate_and_get_batch_size(flat_in_dims, flat_args), flat_in_dims, flat_args, args_spec + +# Creates BatchedTensors for every Tensor in arg that should be batched. +# Returns the (potentially) batched arguments and the batch_size. + + +def _create_batched_inputs( + flat_in_dims: List[Any], flat_args: List[Any], vmap_level: int, args_spec) -> Tuple: + # See NOTE [Ignored _remove_batch_dim, _add_batch_dim] + batched_inputs = [arg if in_dim is None else + _add_batch_dim(arg, in_dim, vmap_level) + for in_dim, arg in zip(flat_in_dims, flat_args)] + return tree_unflatten(batched_inputs, args_spec) + + +def _maybe_remove_batch_dim(name, batched_output, vmap_level, batch_size, out_dim): + + if out_dim is None: + if isinstance(batched_output, torch.Tensor) and is_batchedtensor(batched_output): + raise ValueError( + f'vmap({name}, ...): `{name}` can not return a ' + f'BatchedTensor when out_dim is None' + ) + return batched_output + + # out_dim is non None + if not isinstance(batched_output, torch.Tensor): + raise ValueError(f'vmap({name}, ...): `{name}` must only return ' + f'Tensors, got type {type(batched_output)}. ' + 'Did you mean to set out_dim= to None for output?') + + return _remove_batch_dim(batched_output, vmap_level, batch_size, out_dim) + + +# Undos the batching (and any batch dimensions) associated with the `vmap_level`. +def _unwrap_batched( + batched_outputs: Union[Tensor, Tuple[Tensor, ...]], + out_dims: out_dims_t, + vmap_level: int, batch_size: int, func: Callable) -> Tuple: + flat_batched_outputs, output_spec = tree_flatten(batched_outputs) + + def incompatible_error(): + raise ValueError( + f'vmap({_get_name(func)}, ..., out_dims={out_dims})(): ' + f'out_dims is not compatible with the structure of `outputs`. 
' + f'out_dims has structure {tree_flatten(out_dims)[1]} but outputs ' + f'has structure {output_spec}.') + + if isinstance(batched_outputs, torch.Tensor): + # Some weird edge case requires us to spell out the following + # see test_out_dims_edge_case + if isinstance(out_dims, int): + flat_out_dims = [out_dims] + elif isinstance(out_dims, tuple) and len(out_dims) == 1: + flat_out_dims = out_dims + elif out_dims is None: + flat_out_dims = [out_dims] + else: + incompatible_error() + else: + flat_out_dims = _broadcast_to_and_flatten(out_dims, output_spec) + if flat_out_dims is None: + incompatible_error() + + flat_outputs = [ + _maybe_remove_batch_dim(_get_name(func), batched_output, vmap_level, batch_size, out_dim) + for batched_output, out_dim in zip(flat_batched_outputs, flat_out_dims) + ] + return tree_unflatten(flat_outputs, output_spec) + + +def _check_int_or_none(x, func, out_dims): + if isinstance(x, int): + return + if x is None: + return + raise ValueError( + f'vmap({_get_name(func)}, ..., out_dims={out_dims}): `out_dims` must be ' + f'an int, None or a python collection of ints representing where in the outputs the ' + f'vmapped dimension should appear.') + + +def _check_out_dims_is_int_or_int_pytree(out_dims: out_dims_t, func: Callable) -> None: + if isinstance(out_dims, int): + return + tree_map_(partial(_check_int_or_none, func=func, out_dims=out_dims), out_dims) + + +def _get_name(func: Callable): + if hasattr(func, '__name__'): + return func.__name__ + + # Not all callables have __name__, in fact, only static functions/methods do. + # A callable created via functools.partial or an nn.Module, to name some + # examples, don't have a __name__. + return repr(func) + + +DECOMPOSITIONS_LOADED = False +DECOMPOSITIONS_LOCK = threading.Lock() +VMAP_DECOMPOSITIONS_LIB = None + +# torch.package, Python 3.11, and torch.jit-less environments are unhappy with +# decompositions. Only load them when needed if possible. +def lazy_load_decompositions(): + global DECOMPOSITIONS_LOADED + if DECOMPOSITIONS_LOADED: + return + + with DECOMPOSITIONS_LOCK: + if DECOMPOSITIONS_LOADED: + return + + if not (os.environ.get("PYTORCH_JIT", "1") == "1" and __debug__): + DECOMPOSITIONS_LOADED = True + return + + # use an alternate way to register an operator into the decomposition table + # _register_jit_decomposition doesn't work for some operators, e.g. 
addr, + # because the Tensor types generated cannot be unioned by torchscript + # decomp should be type OpOverload + global VMAP_DECOMPOSITIONS_LIB + VMAP_DECOMPOSITIONS_LIB = torch.library.Library("aten", "IMPL", "FuncTorchBatched") + + from torch._decomp import decomposition_table + + def _register_python_decomposition_vmap(decomp): + if decomp in decomposition_table: + VMAP_DECOMPOSITIONS_LIB.impl(decomp, decomposition_table[decomp]) + else: + raise RuntimeError(f"could not find decomposition for {decomp}") + + _register_python_decomposition_vmap(torch.ops.aten.mse_loss_backward.default) + _register_python_decomposition_vmap(torch.ops.aten.smooth_l1_loss_backward.default) + _register_python_decomposition_vmap(torch.ops.aten.huber_loss_backward.default) + _register_python_decomposition_vmap(torch.ops.aten.nll_loss_forward.default) + _register_python_decomposition_vmap(torch.ops.aten.nll_loss2d_forward.default) + _register_python_decomposition_vmap(torch.ops.aten.nll_loss_backward.default) + _register_python_decomposition_vmap(torch.ops.aten.nll_loss2d_backward.default) + _register_python_decomposition_vmap(torch.ops.aten.addr.default) + + DECOMPOSITIONS_LOADED = True + +def vmap_impl(func, in_dims, out_dims, randomness, chunk_size, *args, **kwargs): + lazy_load_decompositions() + _check_out_dims_is_int_or_int_pytree(out_dims, func) + batch_size, flat_in_dims, flat_args, args_spec = _process_batched_inputs(in_dims, args, func) + + if chunk_size is not None: + chunks_flat_args = _get_chunked_inputs(flat_args, flat_in_dims, batch_size, chunk_size) + return _chunked_vmap(func, flat_in_dims, chunks_flat_args, + args_spec, out_dims, randomness, **kwargs) + + # If chunk_size is not specified. + return _flat_vmap( + func, batch_size, flat_in_dims, flat_args, args_spec, out_dims, randomness, **kwargs + ) + +def get_chunk_sizes(total_elems, chunk_size): + n_chunks = n_chunks = total_elems // chunk_size + chunk_sizes = [chunk_size] * n_chunks + # remainder chunk + remainder = total_elems % chunk_size + if remainder != 0: + chunk_sizes.append(remainder) + return chunk_sizes + +def _get_chunked_inputs(flat_args, flat_in_dims, batch_size, chunk_size): + split_idxs = (batch_size,) + if chunk_size is not None: + chunk_sizes = get_chunk_sizes(batch_size, chunk_size) + split_idxs = tuple(itertools.accumulate(chunk_sizes)) + + flat_args_chunks = tuple( + t.tensor_split(split_idxs, dim=in_dim) if in_dim is not None else [t, ] * len(split_idxs) + for t, in_dim in zip(flat_args, flat_in_dims) + ) + + # transpose chunk dim and flatten structure + # chunks_flat_args is a list of flatten args + chunks_flat_args = zip(*flat_args_chunks) + return chunks_flat_args + + +def _flatten_chunks_output(chunks_output_): + # chunks_output is a list of chunked outputs + # flatten chunked outputs: + flat_chunks_output = [] + arg_spec = None + for output in chunks_output_: + flat_output, arg_specs = tree_flatten(output) + flat_chunks_output.append(flat_output) + if arg_spec is None: + arg_spec = arg_specs + + # transpose chunk dim and flatten structure + # flat_output_chunks is flat list of chunks + flat_output_chunks = list(zip(*flat_chunks_output)) + return flat_output_chunks, arg_spec + + +def _concat_chunked_outputs(out_dims, arg_spec, flat_output_chunks): + # concat chunks on out_dim + flat_out_dims = _broadcast_to_and_flatten(out_dims, arg_spec) + assert len(flat_out_dims) == len(flat_output_chunks) + flat_output = [] + for idx, out_dim in enumerate(flat_out_dims): + flat_output.append(torch.cat(flat_output_chunks[idx], 
dim=out_dim)) + # release tensors + flat_output_chunks[idx] = None + + return flat_output + + +# Applies vmap on chunked_input and returns concatenated output over the chunks. +def _chunked_vmap(func, flat_in_dims, chunks_flat_args, args_spec, out_dims, randomness, **kwargs): + + chunks_output = [] + rs = torch.get_rng_state() if randomness == "same" else None + for flat_args in chunks_flat_args: + batch_size = _validate_and_get_batch_size(flat_in_dims, flat_args) + + # The way we compute split the input in `_get_chunked_inputs`, + # we may get a tensor with `0` batch-size. We skip any computation + # in that case. + # Eg. + # >>> chunk_size = 1 + # >>> batch_size = 6 + # >>> t = torch.zeros(batch_size, 1) + # >>> t.tensor_split([1, 2, 3, 4, 5, 6]) + # (tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), tensor([[0.]]), + # tensor([[0.]]), tensor([[0.]]), tensor([], size=(0, 1))) + if batch_size == 0: + continue + + if rs is not None: + torch.set_rng_state(rs) + chunks_output.append( + _flat_vmap( + func, batch_size, flat_in_dims, flat_args, args_spec, out_dims, randomness, **kwargs + ) + ) + + flat_output_chunks, arg_spec = _flatten_chunks_output(chunks_output) + + # chunked output tensors are held by both `flat_output_chunks` and `chunks_output`. + # eagerly remove the reference from `chunks_output`. + del chunks_output + + # concat chunks on out_dim + flat_output = _concat_chunked_outputs(out_dims, arg_spec, flat_output_chunks) + + # finally unflatten the output + return tree_unflatten(flat_output, arg_spec) + + +# Vmap refactored helper functions: +def _check_randomness_arg(randomness): + if randomness not in ['error', 'different', 'same']: + raise RuntimeError(f"Only allowed values for randomness are 'error', 'different', or 'same'. Got {randomness}") + + +@contextlib.contextmanager +def vmap_increment_nesting(batch_size, randomness): + try: + vmap_level = _vmap_increment_nesting(batch_size, randomness) + yield vmap_level + finally: + _vmap_decrement_nesting() + + +@doesnt_support_saved_tensors_hooks +def _flat_vmap(func, batch_size, flat_in_dims, flat_args, args_spec, out_dims, randomness, **kwargs): + + with vmap_increment_nesting(batch_size, randomness) as vmap_level: + batched_inputs = _create_batched_inputs(flat_in_dims, flat_args, vmap_level, args_spec) + batched_outputs = func(*batched_inputs, **kwargs) + return _unwrap_batched(batched_outputs, out_dims, vmap_level, batch_size, func) + + +# `restore_vmap` is a private helper function. It is vmap but has the following +# differences: +# - instead of returning outputs, it returns an (outputs, out_dims) tuple. +# out_dims is a pytree of same shape as outputs and contains Optional[int] +# specifying where the vmapped dimension, if it exists, is in the corresponding output. +# - does no validation on in_dims or inputs (vmap expects at least one Tensor to be vmapped). +# restore_vmap allows for no inputs to have the vmap dimension +# - does no validation on outputs (vmap expects only Tensor outputs) +# restore_vmap allows for return of arbitrary outputs (not just Tensors) +# +# The TL;DR is that restore_vmap is more general than vmap and has a slightly +# different API. The relaxations are so that we can "pause" vmap in the middle +# of its execution and then "restore" it later (this is what we do in +# the generate_vmap_rule=True implementation of autograd.Function). 
+# +# restore_vmap can be technically used in the implementation of vmap, but doing +# that refactor is a bit technically challenging because: +# - vmap couples the tensor-wrapping code with error checking +# - vmap's tensor unwrapping code is in C++; we would need to rewrite part of it +# in python because it overlaps with unwrap_batched +@doesnt_support_saved_tensors_hooks +def restore_vmap(func, in_dims, batch_size, randomness): + def inner(*args, **kwargs): + with vmap_increment_nesting(batch_size, randomness) as vmap_level: + batched_inputs = wrap_batched(args, in_dims, vmap_level) + batched_outputs = func(*batched_inputs, **kwargs) + return unwrap_batched(batched_outputs, vmap_level) + return inner + + +def wrap_batched(args, bdims, level): + flat_args, spec = tree_flatten(args) + flat_bdims = _broadcast_to_and_flatten(bdims, spec) + assert flat_bdims is not None + result = _create_batched_inputs(flat_bdims, flat_args, level, spec) + return result + + +def unwrap_batched(args, level): + flat_args, spec = tree_flatten(args) + if len(flat_args) == 0: + return args, () + result = [torch._C._functorch._unwrap_batched(arg, level) if isinstance(arg, torch.Tensor) + else (arg, None) for arg in flat_args] + output, bdims = zip(*result) + return tree_unflatten(output, spec), tree_unflatten(bdims, spec) diff --git a/MLPY/Lib/site-packages/torch/_guards.py b/MLPY/Lib/site-packages/torch/_guards.py new file mode 100644 index 0000000000000000000000000000000000000000..d5d8cc55c0eec746678ae3fa922791676f1ae77b --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_guards.py @@ -0,0 +1,879 @@ +from __future__ import annotations + +import contextlib + +import dataclasses +import enum +import functools +import logging +import threading +import traceback +import unittest.mock +import weakref +from abc import ABC, abstractmethod +from contextlib import contextmanager +from typing import ( + Any, + Callable, + Dict, + Generic, + List, + NamedTuple, + Optional, + Set, + Tuple, + TYPE_CHECKING, + TypeVar, +) + +import torch +from torch.utils import _pytree as pytree +from torch.utils._traceback import CapturedTraceback +from torch.utils.weak import WeakTensorKeyDictionary + +log = logging.getLogger(__name__) + + +if TYPE_CHECKING: + # Import the following modules during type checking to enable code intelligence features, + # such as auto-completion in tools like pylance, even when these modules are not explicitly + # imported in user code. + + import sympy + + +""" +torch._guards is the definitional source of truth for general purpose guard structures. + +An important thing to keep in mind here is the preservation of layering. There should be no dynamo notions, +and no guard installation notions here. +""" + + +class CompileId(NamedTuple): + frame_id: int + # This id is per-frame, and counts how many times we've compiled this + # frame. This could have been a global id but having this be per-frame + # gives you a better intuitive sense for how many recompiles have occurred + # so far. 
+ frame_compile_id: int + # TODO: consider also tracking the recompilation count + + def __str__(self): + return f"{self.frame_id}/{self.frame_compile_id}" + + +class TraceId(NamedTuple): + compile_id: CompileId + # This starts off as 0, and every time we restart analysis it goes + # up by one + attempt: int + + def __str__(self): + if self.attempt == 0: + return str(self.compile_id) + else: + return f"{self.compile_id}_{self.attempt}" + + +class GuardSource(enum.Enum): + LOCAL = 0 + GLOBAL = 1 + LOCAL_NN_MODULE = 2 + GLOBAL_NN_MODULE = 3 + CONSTANT = 4 + RANDOM_VALUE = 5 + SHAPE_ENV = 6 + LOCAL_FSDP_MODULE = 7 + GLOBAL_FSDP_MODULE = 8 + BACKWARD_STATE = 9 + EPHEMERAL = 10 + SYNTHETIC_LOCAL = 11 + + def is_fsdp_module(self) -> bool: + return self in (GuardSource.GLOBAL_FSDP_MODULE, GuardSource.LOCAL_FSDP_MODULE) + + def is_nn_module(self) -> bool: + return ( + self + in ( + GuardSource.GLOBAL_NN_MODULE, + GuardSource.LOCAL_NN_MODULE, + ) + or self.is_fsdp_module() + ) + + def is_local(self): + return self in ( + GuardSource.LOCAL, + GuardSource.LOCAL_NN_MODULE, + GuardSource.LOCAL_FSDP_MODULE, + ) + + +""" +Base class for a "GuardBuilder" role. + +The GuardBuilderBase role is to represent a scope within which to build a guard. The name is a little +confusing, as its not a builder, but for the sake of avoiding a lot of renames and keeping the original reference +to torchdynamo's GuardBuilder. + +Note: create_fn is invoked with a GuardBuilderBase and a Guard. A GuardBuilder is chosen based +on GuardSource's select function. + +There is value in keeping this GuardBuilderBase empty to keep layering clean. +""" + + +class GuardBuilderBase: + pass + + +class ShapeGuard(NamedTuple): + expr: sympy.Expr + stack: CapturedTraceback + + +@dataclasses.dataclass +class Guard: + # originating_source is the source that called the make_guard method to + # construct this guard object. The property name specifies what exactly it + # is the guard is guarding on. The meaning of the name is dependent on the + # create_fn; you must look at the use-site inside create_fn to know what + # name means. + # + # That being said, although you might think this is just a "name", name is + # usually an arbitrary Python expression that will be evaluated with all + # globals (and locals, if you create a LOCAL guard) to extract the Python + # object that we want to perform guard tests on. This evaluation + # typically happens in GuardBuilder.eval. In these cases, name is + # typically produced by originating_source.name() (not to be confused with + # GuardSource - the property source). + # + # Occasionally, name is not a valid Python expression; sometimes + # it is meaningless. Example create_fns that are like this include + # GRAD_MODE and SHAPE_ENV. + originating_source: Source + create_fn: Callable[[GuardBuilderBase, Guard], None] + + # Export only. These values are written to at time of guard check_fn creation. 
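As a small illustration of the identifiers above (values chosen arbitrarily), the string forms encode "frame/compile" plus an attempt suffix only after a restart:

from torch._guards import CompileId, TraceId

cid = CompileId(frame_id=0, frame_compile_id=2)   # frame 0, recompile counter 2
assert str(cid) == "0/2"
assert str(TraceId(cid, attempt=0)) == "0/2"      # first attempt: no suffix
assert str(TraceId(cid, attempt=1)) == "0/2_1"    # analysis restarted once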
+ guard_types: Optional[List[str]] = None + code_list: Optional[List[str]] = None + obj_weakref: Optional[object] = None + guarded_class_weakref: Optional[type] = None + + stack: Optional[CapturedTraceback] = None + user_stack: Optional[traceback.StackSummary] = None + _hash: Optional[int] = None + + def __hash__(self): + if self._hash is None: + self._hash = hash((self.name, self.source, id(self.create_fn))) + return self._hash + + def sort_key(self): + return ( + self.source.value if self.source else -1, + len(self.name), + self.name, + self.inner_create_fn().__code__.co_firstlineno, + ) + + def __lt__(self, other): + return self.sort_key() < other.sort_key() + + def inner_create_fn(self): + if isinstance(self.create_fn, functools.partial): + return self.create_fn.func + else: + return self.create_fn + + @property + def name(self) -> str: + return self.originating_source.name() + + @property + def source(self) -> GuardSource: + return self.originating_source.guard_source() + + @staticmethod + def weakref_to_str(obj_weakref): + """ + This is a workaround of a Python weakref bug. + + `obj_weakref` is instance returned by `weakref.ref`, + `str(obj_weakref)` is buggy if the original obj overrides __getattr__, e.g: + + class MyConfig(dict): + def __getattr__(self, x): + return self[x] + + obj = MyConfig(offset=5) + obj_weakref = weakref.ref(obj) + str(obj_weakref) # raise error: KeyError: '__name__' + """ + if isinstance(obj_weakref, weakref.ReferenceType): + obj = obj_weakref() + if obj is not None: + return f"" + else: + return f"" + else: + return str(obj_weakref) + + def __repr__(self): + s = f""" + {self.source.name.lower() if self.source else ""} {repr(self.name)} {self.inner_create_fn().__name__} + {{ + 'guard_types': {self.guard_types}, + 'code': {self.code_list}, + 'obj_weakref': {self.weakref_to_str(self.obj_weakref)} + 'guarded_class': {self.guarded_class_weakref} + }} + """ + return s + + def __str__(self): + output = f"Name: {repr(self.name)}\n" + source = self.source.name.lower() if self.source else "" + output += f" Source: {source}\n" + output += f" Create Function: {self.inner_create_fn().__name__}\n" + output += f" Guard Types: {self.guard_types}\n" + output += f" Code List: {self.code_list}\n" + output += f" Object Weakref: {self.weakref_to_str(self.obj_weakref)}\n" + output += f" Guarded Class Weakref: {self.guarded_class_weakref}\n" + return output + + def create(self, builder: GuardBuilderBase): + try: + return self.create_fn(builder, self) + except Exception: + log.error("Error while creating guard:\n%s", str(self).rstrip()) + if self.stack: + log.error("Created at:\n%s", "".join(self.stack.format()[-4:]).rstrip()) + raise + + def is_nn_module(self): + return self.source.is_nn_module() + + def is_fsdp_module(self): + return self.source.is_fsdp_module() + + def is_local(self): + return self.source.is_local() + + def set_export_info(self, guard_type, guarded_class, code_list, obj_weakref): + if not self.guard_types: + self.guard_types = list() + + self.guard_types.append(guard_type) + + assert self.guarded_class_weakref in ( + guarded_class, + None, + ), "Guarded class id must be identical, or None" + self.guarded_class_weakref = guarded_class + + if not self.code_list: + self.code_list = code_list + else: + self.code_list.extend(code_list) + + assert self.obj_weakref in ( + obj_weakref, + None, + ), "Guarded object must be identical, or None" + self.obj_weakref = obj_weakref + + +T = TypeVar("T") + +""" +Parent structure for guard env expressions. 
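For illustration only, a toy Source subclass (hypothetical; the real subclasses live in torch._dynamo.source) showing how a Guard derives its name and source from originating_source:

from torch._guards import GuardSource, Source

class ToyLocalSource(Source):           # hypothetical stand-in for dynamo's LocalSource
    def name(self) -> str:
        return "L['x']"

    def guard_source(self) -> GuardSource:
        return GuardSource.LOCAL

def toy_check(builder, guard):          # stand-in create_fn; real ones are GuardBuilder methods
    pass

g = ToyLocalSource().make_guard(toy_check)
assert g.name == "L['x']"               # comes from originating_source.name()
assert g.source is GuardSource.LOCAL    # comes from originating_source.guard_source()
assert g.is_local() and not g.is_nn_module()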
+A GuardEnvExpr can have any subtype. +Note: All subtypes must be handled exhaustively in +torch._dynamo.guards._parse_guard_env_guards to avoid a RuntimeError. +""" + + +@dataclasses.dataclass +class GuardEnvExpr: + pass + + +""" +A class representing a pair of duplicate inputs. +input_pos_a and input_pos_b are input positions we have deduped. +""" + + +@dataclasses.dataclass +class DuplicateInputs(GuardEnvExpr): + input_source_a: Source + input_source_b: Source + + def __post_init__(self): + assert self.input_source_a != self.input_source_b + + +""" +Checkpointable is an interface for driving state snapshotting, left purposely vague for now. + +copy_graphstate() -> T, a somewhat legacy name, is expected to emit a snapshot of any type that +can also be taken in at restore_graphstate(T) calls. + +When to snapshot, is, at the moment, an implementation detail of upstream callers. Checkpointable +does not provide any garuantees around consistency, idempotency, or safety of calling its APIs, yet. + +In the future, it will have a closer coupling to a generic Checkpoint management system. +""" + + +class Checkpointable(ABC, Generic[T]): + @abstractmethod + def copy_graphstate(self) -> T: + ... + + @abstractmethod + def restore_graphstate(self, state: T): + ... + + +class GuardsCheckpointState: + """ + The GuardCheckpointState - it is the T of Checkpointable[T] for GuardsContext + """ + + dynamo_guards: Set[Guard] = set() + + def __init__(self, dynamo_guards): + self.dynamo_guards = dynamo_guards + + def diff(self, other): + """ + Produces a delta against another GuardsCheckpointState. + + Returns None if no delta is found, otherwise, return a set() of mismatched + Guard type objects. + """ + r = self.dynamo_guards.difference(other.dynamo_guards) + if len(r) == 0: + return None + return r + + def __eq__(self, other): + return self.diff(other) is None + + +class ModuleContextCheckpointState: + nn_modules: Dict[str, torch.nn.Module] = {} + + def __init__(self, nn_modules): + self.nn_modules = nn_modules + + def diff(self, other): + """ + Produces a delta against another ModuleContextCheckpointState. + + Returns None if no delta is found, otherwise, return a set() of mismatched + module key names. + """ + r = set(self.nn_modules.keys()).difference(set(other.nn_modules.keys())) + if len(r) == 0: + return None + return r + + def __eq__(self, other): + return self.diff(other) is None + + +class ModuleContext(Checkpointable[ModuleContextCheckpointState]): + def __init__(self): + self.nn_modules: Dict[str, Any] = {} + + def copy_graphstate(self): + return ModuleContextCheckpointState(dict(self.nn_modules)) + + def restore_graphstate(self, state): + assert isinstance(state, ModuleContextCheckpointState) + self.nn_modules = state.nn_modules + + +class GlobalContextCheckpointState: + global_state: Dict[str, Tuple[Callable, ...]] = {} + + def __init__(self, global_states): + self.global_state = global_states + + def diff(self, other): + """ + Produces a delta against another GlobalContextCheckpointState. + + Returns None if no delta is found, otherwise, return a set() of mismatched + global key names. + """ + r = set(self.global_state.keys()).difference(set(other.global_state.keys())) + if len(r) == 0: + return None + return r + + def __eq__(self, other): + return self.diff(other) is None + + +class GlobalContext(Checkpointable[GlobalContextCheckpointState]): + """ + This keeps track of the global torch state during tracing of a function. + For example, torch.is_grad_enabled. 
+ """ + + _supported_global_states = { + "grad_enabled", + "torch_function_enabled", + "autocast_enabled", + "autocast_cpu_enabled", + "autocast_gpu_dtype", + "autocast_cpu_dtype", + "autocast_cache_enabled", + } + + def __init__(self): + self.global_state: Dict[str, Tuple[Callable, ...]] = {} + + def copy_graphstate(self): + return GlobalContextCheckpointState(dict(self.global_state)) + + def restore_graphstate(self, state): + assert isinstance(state, GlobalContextCheckpointState) + self.global_state = state.global_state + assert ( + len(self.global_state) == len(self._supported_global_states) + and set(self.global_state.keys()) == self._supported_global_states + ), "Global state mismatch" + for func, args in self.global_state.values(): + func(args) + + +""" +A GuardsContext is a checkpointable representation of all the guards in the current tracing +context. It's lifecycle is bound 1:1 to the tracing context, and it should never be instantiated +directly outside of it. For passing around internal state representations of this object, +prefer to extract them with copy_graphstate to produce a GuardsCheckpointState. +""" + + +# Like a Set[Guard] but will record the user stack on all guards at the +# time they were installed at their destination +class GuardsSet: + def __init__(self, inner=None): + if inner is None: + inner = set() + self.inner = inner + + def __iter__(self): + return iter(self.inner) + + def __len__(self): + return len(self.inner) + + # Subtraction along with bool is typically used to determine the delta of + # added guards between checkpoints for higher order ops + def __sub__(self, other): + return GuardsSet(self.inner - other.inner) + + def __bool__(self): + return bool(self.inner) + + def add(self, guard: Guard, *, collect_debug_stack=True, skip=0): + if guard in self.inner: + return + if collect_debug_stack: + if guard.stack is None: + guard.stack = CapturedTraceback.extract(skip=1 + skip) + if guard.user_stack is None: + guard.user_stack = TracingContext.extract_stack() + self.inner.add(guard) + + def update(self, *others: Set[Guard]): + for o in others: + for g in o: + self.add(g, skip=1) + + def remove_guards_with_source(self, source): + """Delete all guards with a given source""" + self.inner = {g for g in self.inner if g.originating_source != source} + + +class GuardsContext(Checkpointable[GuardsCheckpointState]): + def __init__(self): + self.dynamo_guards: GuardsSet = GuardsSet() + self.aotautograd_guards: List[GuardEnvExpr] = [] + + def copy_graphstate(self): + return GuardsCheckpointState(set(self.dynamo_guards.inner)) + + def restore_graphstate(self, state): + # NB: "steals" the passed in state + assert isinstance(state, GuardsCheckpointState) + self.dynamo_guards = GuardsSet(state.dynamo_guards) + + +_TLS = threading.local() + +""" +TracingContext is the source of truth for all currently accumulated information +needed to trace. Its lifecycle is kept 1:1 when using TorchDynamo, but other systems +are open to managing their own TracingContext with that in mind. + +The purpose of TracingContext is not to be a dumping ground, or god object, but rather to avoid +having to plumb complex subsystems across multiple verticals. + +Ex: A common example is guard accumulation between dynamo, shape_env, aot_autograd, and inductor. +Accessing the current tracing context via +TracingContext.get() allows users to accumulate their own guards for processing, without needing to know how +to plumb objects back up to where frame interpretation happened. 
+ +Note that you can end up with multiple TracingContext for a single compilation +of a frame, as we reset the TracingContext whenever we restart analysis. +CompileContext is a more overarching context that encompasses multiple restarts. +""" + + +class CompileContext: + @staticmethod + def get() -> CompileContext: + assert _TLS.compile_context is not None + return _TLS.compile_context + + @staticmethod + def try_get() -> Optional[CompileContext]: + return getattr(_TLS, "compile_context", None) + + def __init__(self, compile_id): + assert compile_id is None or isinstance(compile_id, CompileId) + self.compile_id: Optional[CompileId] = compile_id + self.attempt = 0 + + @staticmethod + def current_compile_id(): + self = CompileContext.try_get() + if self is None: + return None + return self.compile_id + + @staticmethod + def current_trace_id(): + self = CompileContext.try_get() + if self is None: + return None + if self.compile_id is None: + return None + return TraceId(self.compile_id, self.attempt) + + +class TracingContext: + """ + Provides the currently installed TracingContext, or None. + + Note that it is a staticmethod, and invocations outside of `with tracing()` (see below), are valid but + will return None. + """ + + @staticmethod + def try_get() -> Optional[TracingContext]: + return getattr(_TLS, "tracing_context", None) + + @staticmethod + def get() -> TracingContext: + if ctx := TracingContext.try_get(): + return ctx + raise RuntimeError( + "TracingContext.get() must be called within an ongoing trace." + ) + + def __init__(self, fake_mode): + self.guards_context = GuardsContext() + self.module_context = ModuleContext() + self.global_context = GlobalContext() + self.fake_mode = fake_mode + self.frame_summary_stack = [] + # This is morally part of frame_summary_stack, but it is kept separate + # for clarity. As we process a frame, this variable gets updated + # to keep track of what line we are in the function. We make a + # function call, this gets cleared and the frame location is pushed + # to frame_summary_stack (prepping this variable for the inner frame's + # progress) + self.loc_in_frame = None + # this is only set after aot_autograd + self.fw_metadata = None + self.params_flat = None + # this is for extended return calling convention from backend + # compiler to aot_autograd + # Per output, what the compiler specified stride of the output is, + # or None if no stride is known. This is always the HINT, it + # is never a SymInt (it would be better if it was a SymInt, but + # I can't conveniently get this from Inductor atm. Also, be + # careful not to accidentally induce guards on the SymInt if + # you ever do change this in aot_autograd.py; you should check + # on permutations preferentially.) + self.output_strides: Optional[List[Optional[List[int]]]] = None + # When this is True, whenever we encounter an int in Dynamo tracing, + # we will (1) force unspec it and (2) force it as a size-like unbacked + # integer. This is currently used when processing certain lists of + # ints that are known to be size-like and may have 0/1 entries that we + # must not specialize on. 
+ self.force_unspec_int_unbacked_size_like = False + # See note [Tensor Fakification and Symbol Caching] + self.tensor_to_context = WeakTensorKeyDictionary() + + # If this true, Aot Autograd will return output Fake Tensors with appropiate + # meta on the first invocation + # see note: [Returning Fake Tensors on First AOT Autograd Call] + self.fakify_first_call = False + + def clear(self): + # Look at the note in output_graph.py in function `save_global_state` + # for the context on clearing global context. + self.global_context.global_state = {} + + @staticmethod + @contextmanager + def patch(**kwargs): + prior = {} + ctx = TracingContext.get() + + for key in kwargs.keys(): + # KeyError on invalid entry + prior[key] = getattr(ctx, key) + for key, val in kwargs.items(): + setattr(ctx, key, val) + try: + yield + finally: + for key, val in prior.items(): + setattr(ctx, key, val) + + @staticmethod + def extract_stack(): + self = TracingContext.try_get() + if self is None: + return traceback.StackSummary() + stack = self.frame_summary_stack + if self.loc_in_frame is not None: + stack = stack + [self.loc_in_frame] + return traceback.StackSummary.from_list(stack) + + # Call this when you want to call into some code that isn't necessarily + # associated with the current frame state + @staticmethod + @contextlib.contextmanager + def clear_frame(): + tc = TracingContext.get() + with unittest.mock.patch.object( + tc, "frame_summary_stack", [] + ), unittest.mock.patch.object(tc, "loc_in_frame", None): + try: + yield + except Exception as e: + # Prevent real_stack from getting attached + # + # The invariant is that if an Exception as real_stack, we've + # appropriately attached a user stack and we no longer need to + # attach anything. Because we cannot conveniently interpose + # when an exception is thrown, we instead interpose everywhere + # we set what the user stack is set (using the context + # manager). However, our compiler stack does "tail calls" + # (when it calls into user compiler), at which point the + # parent exception frames would incorrectly attach an + # incorrect frame. + # + # However, if, somehow, someone raised an exception with this + # scope that had a stack (for example, because they are + # restoring the user stack state appropriately as they process + # node by node), we should respect it. Thus, we cannot + # unconditionally set None. 
+ if not hasattr(e, "real_stack"): + e.real_stack = None # type: ignore[attr-defined] + raise + + @staticmethod + @contextlib.contextmanager + def current_frame(frame_summary): + # frame_summary can be None to solely take advantage of real_stack + # attachment to thrown exceptions + tc = TracingContext.get() + if frame_summary is not None: + tc.frame_summary_stack.append(frame_summary) + old = tc.loc_in_frame + tc.loc_in_frame = None + try: + yield + except Exception as e: + if not hasattr(e, "real_stack"): + e.real_stack = tc.extract_stack() # type: ignore[attr-defined] + raise + finally: + if frame_summary is not None: + tc.frame_summary_stack.pop() + tc.loc_in_frame = old + + @staticmethod + @contextlib.contextmanager + def report_output_strides(): + tc = TracingContext.try_get() + if tc is None: + yield None + return + old_output_strides = tc.output_strides + tc.output_strides = [] + try: + yield tc.output_strides + finally: + tc.output_strides = old_output_strides + + @staticmethod + def set_current_loc(filename, lineno, frame_name): + TracingContext.get().loc_in_frame = traceback.FrameSummary( + filename, lineno, frame_name + ) + + +@contextmanager +def compile_context(context: CompileContext): + old_context = getattr(_TLS, "compile_context", None) + _TLS.compile_context = context + try: + yield context + finally: + _TLS.compile_context = old_context + + +@contextmanager +def tracing(context: Optional[TracingContext]): + """ + This function installs the passed in tracing context as a dynamic scoped + global variable. + + Calls to TracingContext.get() while not under a `with tracing()` context + will return None. + """ + old_context = getattr(_TLS, "tracing_context", None) + _TLS.tracing_context = context + try: + yield context + except Exception as e: + if not hasattr(e, "real_stack") and context is not None: + e.real_stack = context.extract_stack() # type: ignore[attr-defined] + raise + finally: + if ( + context is not None + and context.fake_mode is not None + and context.fake_mode.shape_env is not None + ): + context.fake_mode.shape_env.cleanup() + _TLS.tracing_context = old_context + + +# Subclasses can be found in torch/_dynamo/source.py +# TODO(voz): Consider a toplevel torch/_source.py +@dataclasses.dataclass(frozen=True) +class Source: + def is_dict_key(self): + return False + + def is_ephemeral(self): + return False + + def reconstruct(self, codegen): + raise NotImplementedError() + + def guard_source(self) -> GuardSource: + raise NotImplementedError() + + def name(self) -> str: + raise NotImplementedError() + + def make_guard(self, fn) -> Guard: + if self.guard_source() is GuardSource.CONSTANT: + raise NotImplementedError() + return Guard(self, fn) + + def is_nn_module(self) -> bool: + return self.guard_source().is_nn_module() + + def subguards_allowed(self): + """True if you can guard on attributes of this""" + return self.guard_source() != GuardSource.SYNTHETIC_LOCAL + + +# Subclasses can be found in torch/_dynamo/source.py +@dataclasses.dataclass(frozen=True) +class ChainedSource(Source): + base: Source + + def is_dict_key(self): + # Recurse until you either hit a ConstDictKey or a Source + return self.base.is_dict_key() + + def is_ephemeral(self): + return self.base.is_ephemeral() + + +def detect_fake_mode(inputs: Any = None): + """ + Attempts to "detect" what the current fake mode is. If there is one ambiently + available from TracingContext, we preferentially use that. 
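A minimal sketch of installing a TracingContext for a block of work, assuming only the pieces defined in this file plus FakeTensorMode:

import torch
from torch._guards import TracingContext, tracing
from torch._subclasses.fake_tensor import FakeTensorMode

fake_mode = FakeTensorMode()
with tracing(TracingContext(fake_mode)):
    # Anywhere under this block, TracingContext.get() resolves to the same
    # object, e.g. to accumulate guards or read the ambient fake mode.
    assert TracingContext.get().fake_mode is fake_mode

# Outside the block, try_get() degrades gracefully to None.
assert TracingContext.try_get() is None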
Otherwise, we + heuristically detect the fake mode via the following sources, in order of + priority: + + - Currently active fake mode on stack + - Fake mode associated with passed in tensors (inputs does not + have to be flattened) + """ + from torch._subclasses.fake_tensor import FakeTensor, FakeTensorMode + + fake_modes = [] + + if context := TracingContext.try_get(): + fake_mode = context.fake_mode + if fake_mode is not None: + fake_modes.append((fake_mode, "tracing context", 0)) + + from torch.utils._python_dispatch import _get_current_dispatch_mode_stack + + for i, m in enumerate(reversed(_get_current_dispatch_mode_stack())): + if isinstance(m, FakeTensorMode): + fake_modes.append((m, "active fake mode", i)) + + flat_inputs = pytree.tree_leaves(inputs) + for i, flat_input in enumerate(flat_inputs): + if isinstance(flat_input, FakeTensor): + fake_modes.append((flat_input.fake_mode, "fake tensor input", i)) + + if fake_modes: + fake_mode, desc1, i1 = fake_modes[0] + for m, desc2, i2 in fake_modes[1:]: + assert fake_mode is m, ( + f"fake mode ({fake_mode}) from {desc1} {i1} doesn't match mode ({m}) from {desc2} {i2}\n\n" + f"fake mode from {desc1} {i1} allocated at:\n{fake_mode.stack}\n" + f"fake mode from {desc2} {i2} allocated at:\n{m.stack}" + ) + return fake_mode + else: + return None + + +def active_fake_mode(): + """ + Inspects the dispatch mode stack for an active fake mode and returns it. + Returns None if no fake mode is active. + """ + from torch._subclasses.fake_tensor import FakeTensorMode + from torch.utils._python_dispatch import _get_current_dispatch_mode_stack + + for _, m in enumerate(reversed(_get_current_dispatch_mode_stack())): + if isinstance(m, FakeTensorMode): + return m + + return None diff --git a/MLPY/Lib/site-packages/torch/_higher_order_ops/__init__.py b/MLPY/Lib/site-packages/torch/_higher_order_ops/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..495dfee3daef493bfc660f86ef35684ebe1ffb96 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_higher_order_ops/__init__.py @@ -0,0 +1 @@ +from .cond import cond diff --git a/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..95b44c3453ef0b5cf4ad49bed85131dd15ab6203 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/auto_functionalize.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/auto_functionalize.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0b3f0bc470864d4b958acdc3d6716da65e8b5dfd Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/auto_functionalize.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/cond.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/cond.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..50f32b9b59ecfea401079f63bc4e3aa78fb8396d Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/cond.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/effects.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/effects.cpython-39.pyc new file mode 100644 
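A small sketch of detect_fake_mode's input-based fallback (no ambient TracingContext and nothing on the dispatch-mode stack in this example):

import torch
from torch._guards import detect_fake_mode
from torch._subclasses.fake_tensor import FakeTensorMode

fake_mode = FakeTensorMode()
fake_x = fake_mode.from_tensor(torch.randn(2, 2))

# The mode is recovered from the fake tensor inputs themselves.
assert detect_fake_mode([fake_x]) is fake_mode
assert detect_fake_mode(torch.randn(2, 2)) is None   # plain tensors carry no fake mode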
index 0000000000000000000000000000000000000000..bd256f5f70859f52b6aa0b573e6136a4e900e0ea Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/effects.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/map.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/map.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..098c82dee9c4ffac5d979b07ff5c62ae03a0bead Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/map.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/out_dtype.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/out_dtype.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..72334da357e1f1478e870c87088beb05de6b5603 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/out_dtype.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/strict_mode.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/strict_mode.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..033f7d0f2433ed9feec15912f0428d517cc001fa Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/strict_mode.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/torchbind.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/torchbind.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..78fef3654e3168dbf745b265ee1fa52985afd724 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/torchbind.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/triton_kernel_wrap.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/triton_kernel_wrap.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eb51c4f6d573c9b037c463625617090f482e5e92 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/triton_kernel_wrap.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d2fb118c719a4ebe3e4ce939280978e1a37fed4c Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/while_loop.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/while_loop.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c81a8f0af7aad1e005863aff178a78c8841dc39e Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/while_loop.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/wrap.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/wrap.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..42a1d81ca48cd19b489e87fb36a777683f11d005 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_higher_order_ops/__pycache__/wrap.cpython-39.pyc differ diff --git 
a/MLPY/Lib/site-packages/torch/_higher_order_ops/auto_functionalize.py b/MLPY/Lib/site-packages/torch/_higher_order_ops/auto_functionalize.py new file mode 100644 index 0000000000000000000000000000000000000000..da7f441c309b379be7e309ad792c0256cec4df14 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_higher_order_ops/auto_functionalize.py @@ -0,0 +1,261 @@ +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +import torch.utils._pytree as pytree +from torch import Tensor +from torch._C import DispatchKey +from torch._ops import HigherOrderOperator +from torch._prims_common import clone_preserve_strides +from torch._subclasses.fake_tensor import FakeTensorMode +from torch.fx.experimental.proxy_tensor import ( + disable_proxy_modes_tracing, + ProxyTorchDispatchMode, + track_tensor_tree, +) + + +# NOTE: [auto-functionalizing custom ops] +# Users may wish to torch.compile custom ops that mutate their inputs. +# torch.compile will automatically support this op without anyone needing +# to provide a functionalization kernel for it. Here's how. +# +# Let's say we have a hypothetical mylib::sin_(Tensor(a!) x) -> () +# op. First, when FakeTensor sees this op: +# - If the schema says it returns nothing, we can generate a trivial +# FakeTensor rule for it (that returns nothing). +# - Otherwise, the user needs to provide a FakeTensor rule (abstract impl) +# +# Next, when Python FunctionalTensor sees the op, it will functionalize +# it by emitting a call to an auto_functionalize(op, ["x"], {"x": ...}) +# HOP and replacing the mutated inputs with corresponding outputs of this HOP. +# This HOP effectively runs the functional version of the op when +# called: it clones inputs that will be mutated, runs the op, and +# then returns (output, Tensors with the new values) + + +class AutoFunctionalized(HigherOrderOperator): + """auto_functionalized(_mutable_op, **kwargs) + + This HOP runs a "functional" version of _mutable_op. + + Concretely, it looks at all the arguments that are mutable through + _mutable_op's operator schema, clones those kwargs, runs + `out = _mutable_op(**kwargs)` with the cloned values, and then returns the + operator output concatenated with the cloned values that were mutated. + + We have some restrictions on `_mutable_op`. + See `can_auto_functionalize` for the restrictions. We can likely lift + many of these if users request it. + + The reason why _mutable_op is prefixed with an + underscore is to prevent collisions with kwarg names in **kwargs. + """ + + def __init__(self): + super().__init__("auto_functionalized") + + def __call__( + self, + _mutable_op: torch._ops.OpOverload, + **kwargs: Dict[str, Any], + ) -> Tuple[Any, Tuple[Tensor, ...]]: + assert can_auto_functionalize(_mutable_op) + assert isinstance(kwargs, dict) + return super().__call__(_mutable_op, **kwargs) + + +auto_functionalized = AutoFunctionalized() + + +def can_auto_functionalize(op: torch._ops.OperatorBase) -> bool: + if not isinstance(op, torch._ops.OpOverload): + return False + + if torch._library.utils.is_builtin(op): + # We control the built-ins. 
These may (in rare cases) + # do input metadata mutation (which we have banned on custom ops) + return False + schema = op._schema + if not schema.is_mutable: + return False + schema = op._schema + + for arg in schema.arguments: + if arg.alias_info is None: + continue + if not arg.alias_info.is_write: + continue + if type(arg.type) is torch.TensorType: + continue + if ( + type(arg.type) is torch.OptionalType + and type(arg.type.getElementType()) is torch.TensorType + ): + continue + # Not yet supported: other Tensor types. This includes things like + # Tensor[], Tensor?[], Tensor[]?. + return False + + # The returns must not alias anything + for ret in schema.returns: + if ret.alias_info is None and type(ret.type) is torch.TensorType: + continue + # Not yet supported: List[Tensor] return. + return False + return True + + +@auto_functionalized.py_impl(DispatchKey.CompositeExplicitAutograd) +def auto_functionalized_dense( + _mutable_op: torch._ops.OpOverload, + _only_clone_these_tensors: Optional[Tuple[str, ...]] = None, + **kwargs: Dict[str, Any], +) -> Tuple[Any, Tuple[Tensor, ...]]: + new_kwargs = dict(**kwargs) + result = [] + + _mutable_args_names = get_mutable_arg_names(_mutable_op) + for name in _mutable_args_names: + if ( + _only_clone_these_tensors is not None + and name not in _only_clone_these_tensors + ): + new_kwargs[name] = kwargs[name] + else: + new_kwargs[name] = ( + clone_preserve_strides(kwargs[name]) + if kwargs[name] is not None + else None + ) + result.append(new_kwargs[name]) + out = _mutable_op(**new_kwargs) + + if isinstance(out, tuple): + return (*out, *result) # type: ignore[return-value] + else: + return (out, *result) # type: ignore[return-value] + + +@auto_functionalized.py_impl(FakeTensorMode) +def auto_functionalized_fake( + mode, + _mutable_op: torch._ops.OpOverload, + **kwargs: Dict[str, Any], +) -> Tuple[Any, Tuple[Tensor, ...]]: + with mode: + result = auto_functionalized_dense(_mutable_op, **kwargs) + return result + + +@auto_functionalized.py_impl(ProxyTorchDispatchMode) +def auto_functionalized_proxy( + mode, + _mutable_op: torch._ops.OpOverload, + **kwargs: Dict[str, Any], +) -> Tuple[Any, Tuple[Tensor, ...]]: + if not mode.enable_tracing: + return auto_functionalized(_mutable_op, **kwargs) + + with disable_proxy_modes_tracing(): + out = auto_functionalized(_mutable_op, **kwargs) + + proxy_kwargs = pytree.tree_map(mode.tracer.unwrap_proxy, kwargs) + out_proxy = mode.tracer.create_proxy( + "call_function", + auto_functionalized, + (_mutable_op,), + proxy_kwargs, + ) + result = track_tensor_tree(out, out_proxy, constant=None, tracer=mode.tracer) + return result + + +auto_functionalized.fallthrough(DispatchKey.AutogradCPU) +auto_functionalized.fallthrough(DispatchKey.AutogradCUDA) + + +def get_mutable_arg_names(op: torch._ops.OpOverload) -> List[str]: + """ + Returns the list of argument names that get mutated according to the + schema. + """ + mutable_args_names = [ + arg.name + for arg in op._schema.arguments + if arg.alias_info is not None and arg.alias_info.is_write + ] + return mutable_args_names + + +def do_auto_functionalize( + op: torch._ops.OpOverload, args: Tuple[Any, ...], kwargs: Dict[str, Any] +) -> Any: + """Functionalizes a call to op(*args, **kwargs) by emitting a call to + `outs = auto_functionalized(op, normalized_kwargs)` + and replacing the mutated (args, kwargs) with the corresponding outputs. + + The normalized_kwargs are just the (args, kwargs), but all in kwarg form. + This makes handling easier for the auto_functionalized HOP. 
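As a hedged sketch of the kind of op this machinery targets, a throwaway mutable custom op (namespace and names invented for illustration) that passes can_auto_functionalize:

import torch
from torch._higher_order_ops.auto_functionalize import can_auto_functionalize

lib = torch.library.Library("mylib_demo", "FRAGMENT")   # illustrative namespace
lib.define("inc_(Tensor(a!) x) -> ()")                  # mutates x in place, returns nothing

def inc_impl(x):
    x.add_(1)

lib.impl("inc_", inc_impl, "CPU")

op = torch.ops.mylib_demo.inc_.default
# Mutable tensor argument, no aliasing returns, not a builtin op -> eligible
# to be rewritten as auto_functionalized(op, x=...) during functionalization.
assert can_auto_functionalize(op)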
+ """ + from torch._subclasses.functional_tensor import PythonFunctionalizeAPI + + ctx = PythonFunctionalizeAPI() + + # All of the (args, kwargs), but all as kwargs. The names for the + # args come from the schema. This makes it easier for us to work with them. + normalized_kwargs = {} + schema = op._schema + for idx, arg in enumerate(schema.arguments): + # NB: torch_dispatch kwargs are the args defined as kwarg-only in the schema + if arg.name in kwargs: + normalized_kwargs[arg.name] = kwargs[arg.name] + elif idx < len(args): + # if its out of bounds we don't need to do anything + # as it means the the optional arg was passed with its default + # value + normalized_kwargs[arg.name] = args[idx] + else: + normalized_kwargs[arg.name] = arg.default_value + + unwrapped_kwargs = ctx.unwrap_tensors(normalized_kwargs) # type: ignore[arg-type] + with ctx.redispatch_to_next(): + unwrapped_outs = auto_functionalized( + op, **unwrapped_kwargs # type: ignore[arg-type] + ) + + # List of the name of args that get mutated (according to the schema) + mutable_args_names = get_mutable_arg_names(op) + + unwrapped_actual_out: Union[Any, Tuple[Any]] = unwrapped_outs[ + : -len(mutable_args_names) + ] + unwrapped_mutable_out = unwrapped_outs[-len(mutable_args_names) :] + + if len(op._schema.returns) == 0: + assert unwrapped_actual_out[0] is None + unwrapped_actual_out = None + elif len(op._schema.returns) == 1: + assert len(unwrapped_actual_out) == 1 + unwrapped_actual_out = unwrapped_actual_out[0] + else: + assert len(unwrapped_actual_out) == len(op._schema.returns) + + for name, unwrapped_out in zip(mutable_args_names, unwrapped_mutable_out): + # Can be None if input was `Tensor(a!)?` + if unwrapped_out is None: + continue + assert isinstance(unwrapped_out, torch.Tensor) + orig_arg = normalized_kwargs[name] + ctx.replace(orig_arg, unwrapped_out) + ctx.commit_update(orig_arg) + ctx.sync(orig_arg) + + return ctx.wrap_tensors(unwrapped_actual_out) # type: ignore[arg-type] + + +@auto_functionalized.py_functionalize_impl +def auto_functionalized_func(ctx, _mutable_op, **kwargs): + unwrapped_kwargs = ctx.unwrap_tensors(kwargs) + with ctx.redispatch_to_next(): + result = auto_functionalized(_mutable_op, **unwrapped_kwargs) + return ctx.wrap_tensors(result) diff --git a/MLPY/Lib/site-packages/torch/_higher_order_ops/cond.py b/MLPY/Lib/site-packages/torch/_higher_order_ops/cond.py new file mode 100644 index 0000000000000000000000000000000000000000..000db491f82cecc0705d61b34e3e146563ec11c9 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_higher_order_ops/cond.py @@ -0,0 +1,349 @@ +import torch +import torch._subclasses.functional_tensor + +import torch.utils._pytree as pytree + +from torch._C import DispatchKey +from torch._C._functorch import ( + _add_batch_dim, + get_unwrapped, + is_batchedtensor, + maybe_get_bdim, +) +from torch._functorch.utils import exposed_in + +from torch._higher_order_ops.utils import ( + _has_potential_branch_input_alias, + _has_potential_branch_input_mutation, + _set_compilation_env, + autograd_not_implemented, + reenter_make_fx, + UnsupportedAliasMutationException, +) + +from torch._ops import HigherOrderOperator +from torch._subclasses.fake_tensor import FakeTensorMode +from torch.fx.experimental.proxy_tensor import ( + disable_proxy_modes_tracing, + ProxyTorchDispatchMode, + track_tensor_tree, +) +from torch.fx.passes.shape_prop import _extract_tensor_metadata +from torch.utils._python_dispatch import _get_current_dispatch_mode + + +@exposed_in("torch") +def cond(pred, true_fn, false_fn, 
operands): + r""" + Conditionally applies `true_fn` or `false_fn`. + + .. warning:: + `torch.cond` is a prototype feature in PyTorch. It has limited support for input and output types and + doesn't support training currently. Please look forward to a more stable implementation in a future version of PyTorch. + Read more about feature classification at: https://pytorch.org/blog/pytorch-feature-classification-changes/#prototype + + `cond` is structured control flow operator. That is, it is like a Python if-statement, + but has restrictions on `true_fn`, `false_fn`, and `operands` that enable it to be + capturable using torch.compile and torch.export. + + Assuming the constraints on `cond`'s arguments are met, `cond` is equivalent to the following:: + + def cond(pred, true_branch, false_branch, operands): + if pred: + return true_branch(*operands) + else: + return false_branch(*operands) + + Args: + pred (Union[bool, torch.Tensor]): A boolean expression or a tensor with one element, + indicating which branch function to apply. + + true_fn (Callable): A callable function (a -> b) that is within the + scope that is being traced. + + false_fn (Callable): A callable function (a -> b) that is within the + scope that is being traced. The true branch and false branch must + have consistent input and outputs, meaning the inputs have to be + the same, and the outputs have to be the same type and shape. + + operands (Tuple of possibly nested dict/list/tuple of torch.Tensor): A tuple of inputs to the true/false functions. + + Example:: + + def true_fn(x: torch.Tensor): + return x.cos() + def false_fn(x: torch.Tensor): + return x.sin() + return cond(x.shape[0] > 4, true_fn, false_fn, (x,)) + + Restrictions: + - The conditional statement (aka `pred`) must meet one of the following constraints: + + - It's a `torch.Tensor` with only one element, and torch.bool dtype + + - It's a boolean expression, e.g. `x.shape[0] > 10` or `x.dim() > 1 and x.shape[1] > 10` + + - The branch function (aka `true_fn`/`false_fn`) must meet all of the following constraints: + + - The function signature must match with operands. + + - The function must return a tensor with the same metadata, e.g. shape, + dtype, etc. + + - The function cannot have in-place mutations on inputs or global variables. + (Note: in-place tensor operations such as `add_` for intermediate results + are allowed in a branch) + + .. warning:: + Temporal Limitations: + + - `cond` only supports **inference** right now. Autograd will be supported in the future. + + - The **output** of branches must be a **single Tensor**. Pytree of tensors will be supported in the future. + + """ + + if torch.compiler.is_dynamo_compiling(): + return cond_op(pred, true_fn, false_fn, operands) + + def _validate_input(pred, true_fn, false_fn, operands): + if not isinstance(pred, (bool, torch.Tensor, torch.SymBool)): + raise RuntimeError(f"Expected pred to be bool or tensor, but got {pred}.") + + if isinstance(pred, torch.Tensor) and pred.numel() != 1: + raise RuntimeError( + f"Expected pred to be bool or single-element tensor, but got {pred}." + ) + + if not callable(true_fn) or not callable(false_fn): + raise RuntimeError("Expect both branches to be callbale.") + + if not isinstance(operands, (tuple, list)) or pytree.tree_any( + lambda t: not isinstance(t, torch.Tensor), operands + ): + raise RuntimeError( + "Expect operands to be a tuple of possibly nested dict/list/tuple that only" + f"consists of tensor leaves, but got {operands}." 
+ ) + + _validate_input(pred, true_fn, false_fn, operands) + + if not torch._dynamo.is_dynamo_supported(): + raise RuntimeError("torch.cond requires dynamo support.") + + with _set_compilation_env(): + with torch._dynamo.utils.disable_cache_limit(): + return torch.compile(cond_op, backend="eager", fullgraph=True)( + pred, true_fn, false_fn, operands + ) + + +""" +We're going to define a `cond_op` operation. +In order to do this, we need implementations for each of the dispatch keys. +""" +cond_op = HigherOrderOperator("cond") + + +def trace_cond(proxy_mode, func_overload, pred, true_fn, false_fn, operands): + assert isinstance( + operands, (list, tuple) + ), "Cond operands must be a list or tuple of tensors" + assert all( + isinstance(o, torch.Tensor) for o in operands + ), "Cond operands must be a list of tensors" + + pre_dispatch = getattr(proxy_mode, "pre_dispatch", False) + + with disable_proxy_modes_tracing(): + true_graph = reenter_make_fx(true_fn, pre_dispatch)(*operands) + false_graph = reenter_make_fx(false_fn, pre_dispatch)(*operands) + + true_outs = [] + false_outs = [] + for node in true_graph.graph.nodes: + if node.op == "output": + true_outs.extend(node.args) + + for node in false_graph.graph.nodes: + if node.op == "output": + false_outs.extend(node.args) + + flat_true_outs = pytree.arg_tree_leaves(*true_outs) + flat_false_outs = pytree.arg_tree_leaves(*false_outs) + if len(flat_true_outs) != len(flat_false_outs): + raise torch._dynamo.exc.CondOpArgsMismatchError( + f"Expected to return same number of outputs but got:" + f"\n {true_fn.__name__} returns {len(flat_true_outs)} item(s)" + f"\n {false_fn.__name__} returns {len(flat_false_outs)} item(s)" + ) + + for i in range(0, len(flat_true_outs)): + true_out = flat_true_outs[i] + false_out = flat_false_outs[i] + if true_out.meta["tensor_meta"] != false_out.meta["tensor_meta"]: + raise torch._dynamo.exc.CondOpArgsMismatchError( + f"Expected each tensor to have same metadata but got:" + f"\n {true_fn.__name__} returns {true_out.meta['tensor_meta']}" + f"\n {false_fn.__name__} returns {false_out.meta['tensor_meta']}" + ) + + # There are probably better ways - I know that create_arg has some self incrementing name + # magic to it, but since we explicitly have to get the name for register_module, + # I was not sure how to do that. This kinda simulates it. + next_name = None + i = 0 + while not next_name: + candidate = f"true_graph_{i}" + if hasattr(proxy_mode.tracer.root, candidate): + i += 1 + else: + next_name = candidate + + true_name = next_name + false_name = f"false_graph_{i}" + assert not hasattr(proxy_mode.tracer.root, false_name) + + proxy_mode.tracer.root.register_module(true_name, true_graph) + proxy_mode.tracer.root.register_module(false_name, false_graph) + + args = (pred, true_graph, false_graph, operands) + + proxy_args = pytree.tree_map(proxy_mode.tracer.unwrap_proxy, args) + + out_proxy = proxy_mode.tracer.create_proxy( + "call_function", func_overload, proxy_args, {}, name="conditional" + ) + + # At this point, we're *guaranteed* that whether an output came from the + # true or false branch is indistinguishable. So, as this is just for tracing + # purposes, choose the true branch. + + # TODO: Uhh.... it shouldn't matter, but changing this to true_fn results in + # a FakeTensorMode error : + # `Current active mode not registered` + # TODO Sometimes the operands are not completely FakeTensor, something seems went wrong in + # dynamo? 
Because of that it runs real computation sometimes and re-triggering downstream dispatch keys. + out = false_fn(*operands) + + return track_tensor_tree(out, out_proxy, constant=None, tracer=proxy_mode.tracer) + + +@cond_op.py_impl(DispatchKey.CompositeExplicitAutograd) +def cond_op_dense(pred, true_fn, false_fn, operands): + mode = _get_current_dispatch_mode() + assert mode is None, "Mode should never be enabled for CPU/CUDA key" + if pred: + return true_fn(*operands) + else: + return false_fn(*operands) + + +cond_op.py_impl(DispatchKey.Autograd)( + autograd_not_implemented(cond_op, deferred_error=True) +) + + +@cond_op.py_impl(ProxyTorchDispatchMode) +def inner(mode, pred, true_fn, false_fn, operands): + if mode.enable_tracing: + return trace_cond(mode, cond_op, pred, true_fn, false_fn, operands) + else: + return cond_op(pred, true_fn, false_fn, operands) + + +@cond_op.py_impl(FakeTensorMode) +def cond_fake_tensor_mode(mode, pred, true_fn, false_fn, operands): + with mode: + true_outs = true_fn(*operands) + flat_true_outs = pytree.tree_leaves(true_outs) + flat_false_outs = pytree.tree_leaves(false_fn(*operands)) + if len(flat_true_outs) != len(flat_false_outs): + raise RuntimeError("Unmatched number of outputs from cond() branches.") + + for true_out, false_out in zip(flat_true_outs, flat_false_outs): + true_meta = _extract_tensor_metadata(true_out) + false_meta = _extract_tensor_metadata(false_out) + if true_meta != false_meta: + raise torch._dynamo.exc.CondOpArgsMismatchError( + f"Expected each tensor to have same metadata but got:" + f"\n {true_fn.__name__} returns {true_meta}" + f"\n {false_fn.__name__} returns {false_meta}" + ) + return true_outs + + +@cond_op.py_functionalize_impl +def cond_func(ctx, pred, true_fn, false_fn, inputs): + unwrapped_inputs = ctx.unwrap_tensors(inputs) + unwrapped_pred = ctx.unwrap_tensors(pred) + with ctx.redispatch_to_next() as m: + functional_true = ctx.functionalize(true_fn) + functional_false = ctx.functionalize(false_fn) + pre_dispatch = hasattr(ctx, "mode") and ctx.mode.pre_dispatch + for branch in [functional_true, functional_false]: + if _has_potential_branch_input_mutation( + branch, unwrapped_inputs, pre_dispatch=pre_dispatch + ): + raise UnsupportedAliasMutationException( + "One of torch.cond branch might be modifying the input!" + ) + for branch in [true_fn, false_fn]: + if _has_potential_branch_input_alias( + branch, unwrapped_inputs, pre_dispatch=pre_dispatch + ): + raise UnsupportedAliasMutationException( + "One of torch.cond branch might be aliasing the input!" 
+ ) + + cond_return = cond_op( + unwrapped_pred, functional_true, functional_false, unwrapped_inputs + ) + return ctx.wrap_tensors(cond_return) + + +@cond_op.py_impl(torch._C._functorch.TransformType.Vmap) +def cond_batch_rule(interpreter, pred, true_fn, false_fn, inputs): + assert isinstance( + inputs, (list, tuple) + ), "Cond inputs must be a list or tuple of tensors" + assert all( + isinstance(i, torch.Tensor) for i in inputs + ), "Cond inputs must be a list of tensors" + + pred_ = get_unwrapped(pred) if is_batchedtensor(pred) else pred + + # unbatched tensors are not vmapped + tensors, in_dims = zip( + *[ + (get_unwrapped(t), maybe_get_bdim(t)) if is_batchedtensor(t) else (t, None) + for t in inputs + ] + ) + + if is_batchedtensor(pred): + # prepend "pred" and vmap everything + tensors = (pred_,) + tensors + in_dims = (0,) + in_dims + + def fn(p, *args): + t = true_fn(*args) + f = false_fn(*args) + return torch.where(p, t[0], f[0]) + + with interpreter.lower(): + result = torch.vmap(fn, in_dims=in_dims)(*tensors) + + else: + # predicate is known at this stage and it is a boolean expression or a + # tensor with one element. + true_fn = torch.vmap(true_fn, in_dims=in_dims) + false_fn = torch.vmap(false_fn, in_dims=in_dims) + + with interpreter.lower(): + result = cond_op(pred, true_fn, false_fn, tensors) + + if not isinstance(result, tuple): + result = (result,) + lvl = interpreter.level() + return tuple([_add_batch_dim(r, 0, lvl) for r in result]) diff --git a/MLPY/Lib/site-packages/torch/_higher_order_ops/effects.py b/MLPY/Lib/site-packages/torch/_higher_order_ops/effects.py new file mode 100644 index 0000000000000000000000000000000000000000..c47ba873970465be0acd7df86e5afc09773ec88b --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_higher_order_ops/effects.py @@ -0,0 +1,204 @@ +from enum import Enum +from typing import Any, Dict, Optional, Tuple + +import torch +import torch.utils._pytree as pytree +from torch._C import DispatchKey +from torch._ops import HigherOrderOperator +from torch._subclasses.fake_tensor import FakeTensorMode +from torch.fx.experimental.proxy_tensor import ( + disable_proxy_modes_tracing, + ProxyTorchDispatchMode, + track_tensor_tree, +) + + +class _EffectType(Enum): + ORDERED = "Ordered" + + +SIDE_EFFECTS: Dict[torch._ops.OpOverload, _EffectType] = { + torch.ops.aten._print.default: _EffectType.ORDERED, +} + + +class WithEffects(HigherOrderOperator): + """ + with_effects(token, op, args, kwargs) -> (new_token, op_results) + + This HOP helps ensure ordering between side effectful ops like prints or ops + using torchbind objects. This is needed to ensure a traced graph from + AOTAutograd is functional so that future optimization passes do not reorder + these operators. This is done through threading "effect tokens" through the + graph to enforce data dependence between side effectful ops. + + The tokens are basically dummy values (torch.tensor([])). We create a token + per "effect type", which are enumerated in the _EffectType enum. 
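A small illustration of how effect keys are derived for ordering purposes, using the one entry registered above (aten._print) and an ordinary pure op for contrast:

import torch
from torch._higher_order_ops.effects import _EffectType, get_effect_key, has_effects

print_op = torch.ops.aten._print.default
assert has_effects(print_op, ("hi",), {})
assert get_effect_key(print_op, ("hi",), {}) is _EffectType.ORDERED

# A pure op with no script-object arguments has no effect key and is not
# token-threaded.
add_op = torch.ops.aten.add.Tensor
assert get_effect_key(add_op, (torch.ones(1), torch.ones(1)), {}) is None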
+ """ + + def __init__(self): + super().__init__("with_effects") + + def __call__( + self, + token, + op: torch._ops.OpOverload, + *args: Tuple[Any, ...], + **kwargs: Dict[str, Any], + ) -> Tuple[Any, ...]: + assert isinstance(op, torch._ops.OpOverload) + assert not has_aliasing(op), "Ops with aliasing is not supported" + assert has_effects(op, args, kwargs) + assert isinstance(kwargs, dict) + return super().__call__(token, op, *args, **kwargs) + + +with_effects = WithEffects() + + +def has_aliasing(op: torch._ops.OpOverload): + for arg in op._schema.arguments: + if arg.alias_info is not None: + return True + for arg in op._schema.returns: + if arg.alias_info is not None: + return True + return False + + +def has_effects(op, args, kwargs) -> bool: + return ( + isinstance(op, torch._ops.OpOverload) + and not has_aliasing(op) + and get_effect_key(op, args, kwargs) is not None + ) + + +def get_effect_key(op, args, kwargs) -> Optional[_EffectType]: + if op in SIDE_EFFECTS: + return SIDE_EFFECTS[op] + + for arg in args: + if isinstance(arg, torch.ScriptObject): + return _EffectType.ORDERED + + return None + + +@with_effects.py_impl(DispatchKey.CompositeExplicitAutograd) +def with_effects_dense( + token: torch.Tensor, + op: torch._ops.OpOverload, + *args: Tuple[Any, ...], + **kwargs: Dict[str, Any], +) -> Tuple[torch.Tensor, ...]: + out = op(*args, **kwargs) + new_token = torch.tensor([]) + if isinstance(out, tuple): + return (new_token, *out) + return (new_token, out) + + +@with_effects.py_impl(FakeTensorMode) +def with_effects_fake( + mode, + token: torch.Tensor, + op: torch._ops.OpOverload, + *args: Tuple[Any, ...], + **kwargs: Dict[str, Any], +) -> Tuple[torch.Tensor, ...]: + with mode: + result = with_effects_dense(token, op, *args, **kwargs) + return result + + +@with_effects.py_impl(ProxyTorchDispatchMode) +def with_effects_proxy( + mode, + token: torch.Tensor, + op: torch._ops.OpOverload, + *args: Tuple[Any, ...], + **kwargs: Dict[str, Any], +) -> Tuple[torch.Tensor, ...]: + if not mode.enable_tracing: + return with_effects(token, op, *args, **kwargs) + + with disable_proxy_modes_tracing(): + out = with_effects(token, op, *args, **kwargs) + + proxy_token = mode.tracer.unwrap_proxy(token) + proxy_args = pytree.tree_map(mode.tracer.unwrap_proxy, args) + proxy_kwargs = pytree.tree_map(mode.tracer.unwrap_proxy, kwargs) + + out_proxy = mode.tracer.create_proxy( + "call_function", + with_effects, + (proxy_token, op, *proxy_args), + proxy_kwargs, + ) + result = track_tensor_tree(out, out_proxy, constant=None, tracer=mode.tracer) + return result + + +with_effects.fallthrough(DispatchKey.AutogradCPU) +with_effects.fallthrough(DispatchKey.AutogradCUDA) + + +def handle_effects( + allow_token_discovery: bool, + tokens: Dict[_EffectType, torch.Tensor], + op: torch._ops.OpOverload, + args: Tuple[Any, ...], + kwargs: Dict[str, Any], +) -> Any: + """ + Args: + allow_token_discovery: Whether or not we are discovering tokens. If this + is true, we will create a token for every side effect type seen that + does not have a token assigned yet. If this is false, the tokens + should've all been created ahead of time, so we will error if there is + no token mapping to every effect type. + + tokens: Map of effect type to tokens. This is to chain operators of the + same effects together so that they do not get reordered in later + optimization passes. + """ + + # Get a token. 
We can't do `tokens.get(op, torch.tensor([]))` because + # this will create an empty tensor during proxy mode tracing if the token + # doesn't exist. But the tokens should always exist during proxy mode tracing. + key = get_effect_key(op, args, kwargs) + assert key is not None + if key not in tokens: + assert allow_token_discovery, f"Could not find a token for effect {key}" + tokens[key] = torch.tensor([]) + token = tokens[key] + + from torch._subclasses.functional_tensor import PythonFunctionalizeAPI + + ctx = PythonFunctionalizeAPI() + + unwrapped_token = ctx.unwrap_tensors([token])[0] # type: ignore[arg-type] + unwrapped_args = ctx.unwrap_tensors(args) # type: ignore[arg-type] + unwrapped_kwargs = ctx.unwrap_tensors(kwargs) # type: ignore[arg-type] + with ctx.redispatch_to_next(): + (new_token, *unwrapped_outs) = with_effects( + unwrapped_token, op, *unwrapped_args, **unwrapped_kwargs # type: ignore[arg-type] + ) + + if len(op._schema.returns) == 0: + assert unwrapped_outs[0] is None + unwrapped_outs = None # type: ignore[assignment] + elif len(op._schema.returns) == 1: + assert len(unwrapped_outs) == 1 + unwrapped_outs = unwrapped_outs[0] + else: + assert len(unwrapped_outs) == len(op._schema.returns) + + # Add the newly created token into the tokens map for a following call to + # use this token. + wrapped_token = ctx.wrap_tensors(new_token) + assert isinstance(wrapped_token, torch.Tensor) + tokens[key] = wrapped_token + + return ctx.wrap_tensors(unwrapped_outs) # type: ignore[arg-type] diff --git a/MLPY/Lib/site-packages/torch/_higher_order_ops/map.py b/MLPY/Lib/site-packages/torch/_higher_order_ops/map.py new file mode 100644 index 0000000000000000000000000000000000000000..07f89ea23c90f41c02051eada86fc771f0bab221 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_higher_order_ops/map.py @@ -0,0 +1,358 @@ +import torch +import torch.utils._pytree as pytree +from torch._C import DispatchKey +from torch._dispatch.python import suspend_functionalization +from torch._functorch.aot_autograd import AOTConfig, create_joint, from_fun + +from torch._higher_order_ops.utils import ( + _has_potential_branch_input_alias, + _has_potential_branch_input_mutation, + reenter_make_fx, + UnsupportedAliasMutationException, +) +from torch._ops import HigherOrderOperator +from torch._subclasses.fake_tensor import FakeTensorMode +from torch._subclasses.functional_tensor import ( + disable_functional_mode, + FunctionalTensor, +) +from torch.fx.experimental.proxy_tensor import ( + disable_proxy_modes_tracing, + make_fx, + ProxyTorchDispatchMode, + track_tensor_tree, +) +from torch.multiprocessing.reductions import StorageWeakRef + + +# TODO: We add this to prevent dymamo from tracing into map_wrapper, +# remove the wrapper call when it's ready. +class MapWrapper(HigherOrderOperator): + def __call__(self, xs, *args): + return map_wrapper(xs, *args) + + +map = MapWrapper("map") +map_impl = HigherOrderOperator("map_impl") + +dummy_aot_config = AOTConfig( + fw_compiler=None, # type: ignore[arg-type] + bw_compiler=None, # type: ignore[arg-type] + partition_fn=None, # type: ignore[arg-type] + decompositions={}, + num_params_buffers=0, + aot_id=0, + keep_inference_input_mutations=False, +) + + +def create_fw_bw_graph(f, num_mapped_args, *args): + mapped_xs = args[:num_mapped_args] + pos_args = args[num_mapped_args:] + + # Note: We create "clean" environments for make_fx by suspending all dispatch keys + # between Autograd and Python key. 
Currently, we only suspend functionalization but more can be
+    # added when required. We will encounter two problems if we don't suspend functionalization:
+    #
+    # 1. make_fx fails to capture operations on input: the inputs are wrapped as _to_functional_tensor_wrapper,
+    # but they will be unwrapped before entering ProxyTorchDispatchMode as part of the dispatching.
+    # However, it's the outer wrapper that the tracer creates proxies for. This causes the tracer to fail to
+    # fetch the proxy for the inputs and to fail to capture any operations on them.
+    #
+    # 2. make_fx fails to capture output: the outputs after ProxyTorchDispatchMode are further
+    # wrapped as FunctionalTensorWrapper in the Functionalize key after return. However, the tracer
+    # only associates the inner tensor with a proxy in ProxyTorchDispatchMode. Therefore,
+    # when creating the output node, it fails to associate the wrapped tensor with its proxy.
+    # Instead, it will create a _tensor_constant as output.
+
+    with suspend_functionalization(), disable_functional_mode():
+        with disable_proxy_modes_tracing():
+
+            def _from_fun(t):
+                if isinstance(t, torch.Tensor):
+                    if t.dtype != torch.bool:
+                        return torch.empty_strided(
+                            t.size(),
+                            t.stride(),
+                            dtype=t.dtype,
+                            requires_grad=t.requires_grad,
+                        )
+                    else:
+                        # clone of a functional tensor produces a functional tensor,
+                        # but we want to avoid that, so we clone a non-functional version
+                        maybe_unfunc_t = t
+                        if isinstance(t, FunctionalTensor):
+                            torch._sync(t)
+                            maybe_unfunc_t = from_fun(t)
+                        elif torch._is_functional_tensor(t):
+                            # need to handle both types of functionalization here:
+                            # these are the tensors that came from the user,
+                            # which could be either FunctionalTensorWrapper or FunctionalTensor
+                            torch._sync(t)
+                            maybe_unfunc_t = torch._from_functional_tensor(t)
+                        return maybe_unfunc_t.clone()
+                return t
+
+            unwrapped_mapped_xs = pytree.tree_map(_from_fun, mapped_xs)
+            example_xs = _unstack_pytree(unwrapped_mapped_xs)[0]
+
+            example_pos_args = [
+                _from_fun(arg) if isinstance(arg, torch.Tensor) else arg
+                for arg in pos_args
+            ]
+            example_flat_out = pytree.tree_map(
+                _from_fun, f(*example_xs, *example_pos_args)
+            )
+            if any(
+                not isinstance(out, torch.Tensor)
+                for out in example_flat_out
+                if out is not None
+            ):
+                raise RuntimeError(
+                    "Expect outputs of map to only contain tensors or None. "
+                    f"Got types {[type(out) for out in example_flat_out]}."
+ ) + example_grad = [_from_fun(out) for out in example_flat_out] + + fw_graph = make_fx(f)(*example_xs, *example_pos_args) + + def joint_f(*example_args): + joint_mapped_args = example_args[:joint_num_mapped] + args = example_args[joint_num_mapped:] + + mapped_input = joint_mapped_args[:num_mapped_args] + mapped_grads = joint_mapped_args[num_mapped_args:] + + def fw_with_masks(*args): + fw_out = f(*args) + return fw_out, [ + True + if isinstance(ret, torch.Tensor) and ret.requires_grad + else False + for ret in fw_out + ] + + joint = create_joint(fw_with_masks, aot_config=dummy_aot_config) + _, grads = joint( + list(mapped_input) + list(args), + [ + grad + for grad in mapped_grads + if grad is not None and grad.requires_grad + ], + ) + + # In order to keep map functional for backward graph, + # we clone outputs that are aliasing inputs + input_storage = { + StorageWeakRef(arg._typed_storage()) + for arg in example_args + if isinstance(arg, torch.Tensor) + } + + def maybe_clone(t): + if ( + isinstance(t, torch.Tensor) + and StorageWeakRef(t._typed_storage()) in input_storage + ): + return t.clone() + return t + + return pytree.tree_map(maybe_clone, grads) + + joint_num_mapped = len(example_grad) + len(example_xs) + joint_graph = make_fx(joint_f)(*example_xs, *example_grad, *example_pos_args) + return fw_graph, joint_graph + + +def map_wrapper(f, xs, *args): + flat_xs, xs_spec = pytree.tree_flatten(xs) + if not all(isinstance(t, torch.Tensor) for t in flat_xs): + raise RuntimeError(f"Mapped xs can only consist of tensors. Got xs {flat_xs}.") + + num_mapped_args = len(flat_xs) + shapes = [xs.shape for xs in flat_xs] + leading_dim_size = shapes[0][0] + if leading_dim_size == 0: + raise RuntimeError("Leading dimensions of mapped xs cannot be 0.") + + if any(cur_shape[0] != leading_dim_size for cur_shape in shapes): + raise RuntimeError( + f"Leading dimensions of mapped xs must be consistent. Got shapes {shapes}." 
+ ) + + out_spec = None + + def flat_fn(*flat_args): + xs = pytree.tree_unflatten(list(flat_args[:num_mapped_args]), xs_spec) + unflattened_out = f(xs, *flat_args[num_mapped_args:]) + flat_out, tmp_out_spec = pytree.tree_flatten(unflattened_out) + + nonlocal out_spec + out_spec = tmp_out_spec + return flat_out + + return pytree.tree_unflatten( + map_impl(flat_fn, flat_xs, args), out_spec # type: ignore[arg-type] + ) + + +class MapAutogradOp(torch.autograd.Function): + @staticmethod + def forward(ctx, fw_graph, joint_graph, num_mapped_args, *flat_args): + ctx.save_for_backward(*flat_args) + ctx._joint_graph = joint_graph + ctx._num_mapped_args = num_mapped_args + with torch._C._AutoDispatchBelowAutograd(): + return ( + *map_impl( + fw_graph, flat_args[:num_mapped_args], flat_args[num_mapped_args:] + ), + ) + + @staticmethod + def backward(ctx, *flat_grads): + fw_args = ctx.saved_tensors + fw_mapped_args = fw_args[: ctx._num_mapped_args] + pos_args = fw_args[ctx._num_mapped_args :] + + grads = map_impl( + ctx._joint_graph, + fw_mapped_args + flat_grads, + pos_args, + ) + return None, None, None, *grads + + +def trace_map(proxy_mode, func_overload, f, xs, pos_args): + leading_dim_size = xs[0].shape[0] + + example_input = _unstack_pytree(xs)[0] + body_graph = f + + pre_dispatch = getattr(proxy_mode, "pre_dispatch", False) + body_graph = reenter_make_fx(body_graph, pre_dispatch)(*example_input, *pos_args) + + next_name = None + i = 0 + while not next_name: + candidate = f"body_graph_{i}" + if hasattr(proxy_mode.tracer.root, candidate): + i += 1 + else: + next_name = candidate + + proxy_mode.tracer.root.register_module(next_name, body_graph) + + with disable_proxy_modes_tracing(): + example_outs = body_graph(*example_input, *pos_args) + + def expand_tensor(t): + if isinstance(t, torch.Tensor): + return t.expand(leading_dim_size, *t.shape) + return t + + expanded_outs = pytree.tree_map(expand_tensor, example_outs) + + node_args = (body_graph, list(xs), list(pos_args)) + proxy_args = pytree.tree_map(proxy_mode.tracer.unwrap_proxy, node_args) + out_proxy = proxy_mode.tracer.create_proxy( + "call_function", func_overload, proxy_args, {}, name="map_impl" + ) + return track_tensor_tree( + expanded_outs, out_proxy, constant=None, tracer=proxy_mode.tracer + ) + + +def _unstack_pytree(xs): + flat_xs, inspec = pytree.tree_flatten(xs) + if not all(isinstance(xs, torch.Tensor) for xs in flat_xs): + raise RuntimeError(f"Leaves of xs must be Tensor {flat_xs}") + + if not all(xs.shape[0] == flat_xs[0].shape[0] for xs in flat_xs): + raise RuntimeError( + f"Leaves of xs must have same leading dimension size {[xs.shape for xs in flat_xs]}" + ) + + a = zip(*flat_xs) + + pytrees = [] + for tuple in a: + pytrees.append(pytree.tree_unflatten(tuple, inspec)) + return pytrees + + +def _stack_pytree(pytrees): + flat_out = [] + out_spec = None + for pt in pytrees: + flat_pt, out_spec = pytree.tree_flatten(pt) + flat_out.append(flat_pt) + assert out_spec is not None + b = zip(*flat_out) + stacked_out = [] + for leaves in b: + if all(isinstance(leaf, torch.Tensor) for leaf in leaves): + stacked_out.append(torch.stack(leaves)) + elif all(leaf is None for leaf in leaves): + # Backward graph can return None output when forward inputs doesn't require grad. + # When we eagerly execute backward graph, we need to call _stack_pytree on its output, + # therefore we need to deal with None output. 
+ stacked_out.append(None) # type: ignore[arg-type] + else: + raise RuntimeError(f"Cannot stack {leaves}.") + return pytree.tree_unflatten(stacked_out, out_spec) + + +@map_impl.py_impl(DispatchKey.CompositeExplicitAutograd) +def map_dense(f, xs, pos_args): + pytrees = [] + for inp in _unstack_pytree(xs): + pytrees.append(f(*inp, *pos_args)) + return _stack_pytree(pytrees) + + +@map_impl.py_impl(DispatchKey.Autograd) +def map_autograd(f, xs, pos_args): + num_mapped_args = len(xs) + fw_graph, bw_graph = create_fw_bw_graph(f, num_mapped_args, *xs, *pos_args) + flat_out = MapAutogradOp.apply(fw_graph, bw_graph, num_mapped_args, *xs, *pos_args) + return flat_out + + +@map_impl.py_impl(ProxyTorchDispatchMode) +def map_proxy_torch_dispatch_mode(mode, f, xs, args): + if mode.enable_tracing: + return trace_map(mode, map_impl, f, xs, args) + else: + return map_impl(f, xs, args) + + +@map_impl.py_impl(FakeTensorMode) +def map_fake_tensor_mode(mode, f, xs, args): + with mode: + return map_dense(f, xs, args) + + +@map_impl.py_functionalize_impl +def map_functionalize(ctx, f, xs, pos_args): + unwrapped_xs = ctx.unwrap_tensors(xs) + unwrapped_args = ctx.unwrap_tensors(pos_args) + wrapped_fn = ctx.functionalize(f) + + with ctx.redispatch_to_next(): + with disable_proxy_modes_tracing(): + example_inputs = (*_unstack_pytree(unwrapped_xs)[0], *unwrapped_args) + pre_dispatch = hasattr(ctx, "mode") and ctx.mode.pre_dispatch + if _has_potential_branch_input_mutation( + f, example_inputs, pre_dispatch=pre_dispatch + ): + raise UnsupportedAliasMutationException("torch.map is mutating the input!") + + if _has_potential_branch_input_alias( + f, example_inputs, pre_dispatch=pre_dispatch + ): + raise UnsupportedAliasMutationException("torch.map is aliasing the input!") + + map_return = map_impl(wrapped_fn, unwrapped_xs, unwrapped_args) + return ctx.wrap_tensors(map_return) diff --git a/MLPY/Lib/site-packages/torch/_higher_order_ops/out_dtype.py b/MLPY/Lib/site-packages/torch/_higher_order_ops/out_dtype.py new file mode 100644 index 0000000000000000000000000000000000000000..5f30903e02cfaea25620a189f6d681900aa32dc3 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_higher_order_ops/out_dtype.py @@ -0,0 +1,170 @@ + +import torch +import torch.utils._pytree as pytree +from torch.fx.experimental.proxy_tensor import ( + disable_proxy_modes_tracing, + ProxyTorchDispatchMode, + track_tensor_tree, + maybe_handle_decomp, +) +from torch._C import DispatchKey +from torch._ops import HigherOrderOperator +from torch._subclasses.fake_tensor import FakeTensorMode +from torch._prims_common import elementwise_dtypes, ELEMENTWISE_TYPE_PROMOTION_KIND +from torch._higher_order_ops.utils import autograd_not_implemented + +# TODO to figure out a more generic approach +ALLOWABLE_OPS = [ + torch.ops.aten.linear.default, + torch.ops.aten.mm.default, + torch.ops.aten.conv2d.default, + torch.ops.aten.convolution.default, + torch.ops.aten.mul.Tensor, + torch.ops.aten.mul.Scalar, + torch.ops.aten.div.Tensor, + torch.ops.aten.div.Scalar, +] + + +class OutDtypeOperator(HigherOrderOperator): + """ + The out_dtype operator takes an existing ATen functional operator, an + `out_dtype` argument, and arguments to the original operator, and executes + the original operator and returns a Tensor with the `out_dtype` precision. + This operator does not mandate a compute precision so it allows the + representation to not be opinionated about the exact implementation. + + The general implementation for all operators will be the following: + 1. 
Promote inputs dtypes based on default PyTorch dtype promotion rules,
+            using the dtypes of all input Tensors/Scalars and the `out_dtype`
+            argument.
+        2. Execute the operator
+        3. Cast the output to `out_dtype`
+    """
+
+
+    def __init__(self):
+        super().__init__("out_dtype")
+        # TODO(ydwu4): Subclassing HigherOrderOperator causes __module__ to
+        # become different (torch._higher_order_ops.out_dtype), which would result
+        # in torch.fx recording the op incorrectly in the graph.
+        self.__module__ = "torch.ops.higher_order"
+
+    def __call__(self, op, output_dtype, *args):
+        if not isinstance(op, torch._ops.OpOverload):
+            raise ValueError("out_dtype's first argument must be an OpOverload")
+        if op._schema.is_mutable:
+            raise ValueError("out_dtype's first argument needs to be a functional operator")
+        if not (
+            len(op._schema.returns) == 1 and
+            isinstance(op._schema.returns[0].type, torch.TensorType)
+        ):
+            raise ValueError(
+                "out_dtype can only be applied to ops that return a single tensor. "
+                f"Instead got {[r.type for r in op._schema.returns]}"
+            )
+
+        if op not in ALLOWABLE_OPS:
+            raise ValueError(
+                f"out_dtype only allows the following operators: {ALLOWABLE_OPS}."
+            )
+
+        res = super().__call__(op, output_dtype, *args)
+
+        return res
+
+
+out_dtype = OutDtypeOperator()
+
+def trace_out_dtype(proxy_mode, func_overload, op, output_dtype, *args):
+    # NB: Long-term we should put the decomposition logic into
+    # ProxyTorchDispatchMode so that people do not need to call maybe_handle_decomp
+    # in all HigherOrderOp proxy implementations.
+    r = maybe_handle_decomp(proxy_mode, func_overload, (op, output_dtype, *args), {})
+    if r is not NotImplemented:
+        return r
+
+    with disable_proxy_modes_tracing():
+        # This is a simplified implementation of this operator just for tracing.
+ # Actual implementation may also first promote the arguments + out = op(*args).to(dtype=output_dtype) + + node_args = (op, output_dtype, *args) + proxy_args = pytree.tree_map(proxy_mode.tracer.unwrap_proxy, node_args) + out_proxy = proxy_mode.tracer.create_proxy( + "call_function", func_overload, proxy_args, {}, name="out_dtype" + ) + return track_tensor_tree(out, out_proxy, constant=None, tracer=proxy_mode.tracer) + + +@out_dtype.py_impl(DispatchKey.CompositeExplicitAutograd) +def out_dtype_dense( + op: torch._ops.OpOverload, + output_dtype: torch.dtype, + *args +): + if is_int_mm(op, output_dtype, args): + return torch._int_mm(*args) + return out_dtype_fallback(op, output_dtype, *args) + + +def is_int_mm(op, output_dtype, args): + return ( + op == torch.ops.aten.mm.default and + output_dtype == torch.int32 and + len(args) == 2 and + args[0].dtype == torch.int8 and + args[1].dtype == torch.int8 and + args[0].is_cuda and + args[1].is_cuda + ) + + +def out_dtype_fallback(op, output_dtype, *args): + flat_inputs = pytree.arg_tree_leaves(*args) + [torch.ones(1, dtype=output_dtype)] + promote_dtype: torch.dtype = elementwise_dtypes( + *flat_inputs, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, + )[0] + + casted_args = pytree.tree_map_only( + torch.Tensor, lambda arg: arg.to(dtype=promote_dtype), args + ) + res = op(*casted_args).to(dtype=output_dtype) + return res + + +out_dtype.py_impl(DispatchKey.Autograd)(autograd_not_implemented(out_dtype, deferred_error=True)) + + +@out_dtype.py_impl(ProxyTorchDispatchMode) +def out_dtype_proxy( + mode: ProxyTorchDispatchMode, + op: torch._ops.OpOverload, + output_dtype: torch.dtype, + *args +): + if mode.enable_tracing: + return trace_out_dtype(mode, out_dtype, op, output_dtype, *args) + else: + return out_dtype(op, output_dtype, *args) + + +@out_dtype.py_impl(FakeTensorMode) +def out_dtype_fake_tensor_mode( + mode: FakeTensorMode, + op: torch._ops.OpOverload, + output_dtype: torch.dtype, + *args +): + with mode: + return out_dtype_dense(op, output_dtype, *args) + + +@out_dtype.py_functionalize_impl +def out_dtype_func(ctx, op, output_dtype, *args): + unwrapped_args = tuple(ctx.unwrap_tensors(arg) for arg in args) + + with ctx.redispatch_to_next(): + res = out_dtype(op, output_dtype, *unwrapped_args) + return ctx.wrap_tensors(res) diff --git a/MLPY/Lib/site-packages/torch/_higher_order_ops/strict_mode.py b/MLPY/Lib/site-packages/torch/_higher_order_ops/strict_mode.py new file mode 100644 index 0000000000000000000000000000000000000000..73e20dc817cefcb4ef0f91d4b72f127a738741ba --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_higher_order_ops/strict_mode.py @@ -0,0 +1,100 @@ +import torch +import torch._subclasses.functional_tensor + +import torch.utils._pytree as pytree + +from torch._C import DispatchKey +from torch._functorch.utils import exposed_in + +from torch._higher_order_ops.utils import _set_compilation_env, autograd_not_implemented +from torch._ops import HigherOrderOperator +from torch._subclasses.fake_tensor import FakeTensorMode +from torch.fx.experimental.proxy_tensor import ( + disable_proxy_modes_tracing, + make_fx, + ProxyTorchDispatchMode, + track_tensor_tree, +) +from torch.utils._python_dispatch import _get_current_dispatch_mode + + +@exposed_in("torch") +def strict_mode(callable, operands): + if torch.compiler.is_dynamo_compiling(): + return strict_mode_op(callable, operands) + + with _set_compilation_env(): + with torch._dynamo.utils.disable_cache_limit(): + return torch.compile(strict_mode_op, backend="eager", 
fullgraph=True)( + callable, operands + ) + + +strict_mode_op = HigherOrderOperator("strict_mode") + + +@strict_mode_op.py_impl(DispatchKey.CompositeExplicitAutograd) +def strict_mode_op_dense(callable, operands): + mode = _get_current_dispatch_mode() + assert mode is None, "Mode should never be enabled for CPU/CUDA key" + return callable(*operands) + + +strict_mode_op.py_impl(DispatchKey.Autograd)( + autograd_not_implemented(strict_mode_op, deferred_error=True) +) + + +@strict_mode_op.py_impl(ProxyTorchDispatchMode) +def inner(mode, callable, operands): + if mode.enable_tracing: + return trace_strict_mode(mode, strict_mode_op, callable, operands) + else: + return strict_mode_op(callable, operands) + + +def trace_strict_mode(mode, strict_mode_op, callable, operands): + pre_dispatch = getattr(mode, "pre_dispatch", False) + + with disable_proxy_modes_tracing(): + graph = make_fx(callable, pre_dispatch=pre_dispatch)(*operands) + + next_name = None + i = 0 + while not next_name: + candidate = f"strict_graph_{i}" + if hasattr(mode.tracer.root, candidate): + i += 1 + else: + next_name = candidate + + graph_name = next_name + mode.tracer.root.register_module(graph_name, graph) + + args = (graph, operands) + + proxy_args = pytree.tree_map(mode.tracer.unwrap_proxy, args) + + out_proxy = mode.tracer.create_proxy( + "call_function", strict_mode_op, proxy_args, {}, name="strict_mode" + ) + + out = graph(*operands) + return track_tensor_tree(out, out_proxy, constant=None, tracer=mode.tracer) + + +@strict_mode_op.py_impl(FakeTensorMode) +def strict_mode_fake_tensor_mode(mode, callable, operands): + with mode: + true_outs = callable(*operands) + return true_outs + + +@strict_mode_op.py_functionalize_impl +def strict_mode_func(ctx, callable, inputs): + unwrapped_inputs = ctx.unwrap_tensors(inputs) + with ctx.redispatch_to_next(): + functional_callable = ctx.functionalize(callable) + + cond_return = strict_mode_op(functional_callable, unwrapped_inputs) + return ctx.wrap_tensors(cond_return) diff --git a/MLPY/Lib/site-packages/torch/_higher_order_ops/torchbind.py b/MLPY/Lib/site-packages/torch/_higher_order_ops/torchbind.py new file mode 100644 index 0000000000000000000000000000000000000000..385054682e6c4188dccff26694bb29f512884a0b --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_higher_order_ops/torchbind.py @@ -0,0 +1,94 @@ +from contextlib import contextmanager + +import torch +from torch._C import DispatchKey # @manual +from torch._functorch._aot_autograd.utils import KNOWN_TYPES +from torch._higher_order_ops.utils import autograd_not_implemented +from torch._ops import HigherOrderOperator +from torch._subclasses.fake_tensor import FakeTensorMode +from torch.fx.experimental.proxy_tensor import ProxyTorchDispatchMode, track_tensor_tree +from torch.fx.node import has_side_effect +from torch.utils import _pytree as pytree + +# The call_torchbind operator represents a method invocation on a torchbind +# object. The calling convention is: +# call_torchbind(self: ScriptObject, method_name: str, *method_args, **method_kwargs) +# We do not expect users to write this operator directly. Instead it will be +# emitted by Dynamo when tracing encounters a torchbind object. +call_torchbind = HigherOrderOperator("call_torchbind") + +# Register this operator as side-effectful with FX. +# TODO: this is not really sufficient. 
While passes (hopefully) check +# Node.is_impure() and make good decisions, we also assume we can execute the +# graph as many times as we want without changing behavior, which is NOT true of +# ops that mutate torchbind object state. +has_side_effect(call_torchbind) + +_orig_scriptmethod_call = torch.ScriptMethod.__call__ + + +def torchbind_method_redispatch(self, *args, **kwargs): + if isinstance(self.raw_owner, torch.ScriptObject): + return call_torchbind(self.raw_owner, self.name, *args, **kwargs) + return _orig_scriptmethod_call(self, *args, **kwargs) + + +@contextmanager +def enable_torchbind_tracing(): + """Context manager that acts as a feature flag to enable torchbind tracing + behavior. Once torchbind tracing has been stabilized, we can remove this and + turn it always on. + """ + try: + KNOWN_TYPES.append(torch.ScriptObject) + torch.ScriptMethod.__call__ = torchbind_method_redispatch # type: ignore[method-assign] + yield + finally: + assert ( + KNOWN_TYPES.pop() is torch.ScriptObject + ), "Someone else messed with KNOWN_TYPES during tracing, exploding." + torch.ScriptMethod.__call__ = _orig_scriptmethod_call # type: ignore[method-assign] + + +@call_torchbind.py_impl(DispatchKey.CompositeExplicitAutograd) +def call_torchbind_impl(obj, method, *args, **kwargs): + return _orig_scriptmethod_call(getattr(obj, method), *args, **kwargs) + + +@call_torchbind.py_impl(ProxyTorchDispatchMode) +def inner(mode, *args, **kwargs): + if mode.enable_tracing: + proxy_args = pytree.tree_map(mode.tracer.unwrap_proxy, args) + proxy_kwargs = pytree.tree_map(mode.tracer.unwrap_proxy, kwargs) + + out_proxy = mode.tracer.create_proxy( + "call_function", + call_torchbind, + proxy_args, + proxy_kwargs, + ) + out = call_torchbind_impl(*args, **kwargs) + + return track_tensor_tree(out, out_proxy, constant=None, tracer=mode.tracer) + else: + return call_torchbind(*args, **kwargs) + + +# TODO: currently we just run the C++ implementation with fake tensors. +# But we should make it possible to register a fake torchbind implementation. 
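# An illustrative usage sketch of the calling convention documented at the top of
# this module. The torchbind class `my_namespace.MyStack` and its `push` method are
# hypothetical stand-ins; only the shape of the calls is taken from the code in
# this file.
import torch
from torch._higher_order_ops.torchbind import call_torchbind, enable_torchbind_tracing

stack = torch.classes.my_namespace.MyStack()  # assumes such a class was registered

with enable_torchbind_tracing():
    # While the context manager is active, ScriptMethod.__call__ is patched, so a
    # plain method call on a ScriptObject is rerouted through the HOP ...
    stack.push(torch.ones(3))
    # ... which is equivalent to invoking the operator explicitly:
    call_torchbind(stack, "push", torch.ones(3))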
+@call_torchbind.py_impl(FakeTensorMode) +def call_torchbind_fake(mode, *args, **kwargs): + with mode: + return call_torchbind_impl(*args, **kwargs) + + +call_torchbind.py_impl(DispatchKey.Autograd)( + autograd_not_implemented(call_torchbind, deferred_error=True) +) + + +@call_torchbind.py_functionalize_impl +def call_torchbind_func(ctx, *args, **kwargs): + args = ctx.unwrap_tensors(args) + with ctx.redispatch_to_next(): + return ctx.wrap_tensors(call_torchbind(*args, **kwargs)) diff --git a/MLPY/Lib/site-packages/torch/_higher_order_ops/triton_kernel_wrap.py b/MLPY/Lib/site-packages/torch/_higher_order_ops/triton_kernel_wrap.py new file mode 100644 index 0000000000000000000000000000000000000000..0a41e585079dcba91d0060eb6835e02fd4e801af --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_higher_order_ops/triton_kernel_wrap.py @@ -0,0 +1,842 @@ +import dataclasses +import logging +import threading +import warnings +from collections import defaultdict +from typing import Any, Dict, List, Optional, Union + +import torch.utils._pytree as pytree +from torch import Tensor +from torch._C import DispatchKey +from torch._ops import HigherOrderOperator +from torch._prims_common import clone_preserve_strides +from torch._subclasses.fake_tensor import FakeTensorMode +from torch.fx.experimental.proxy_tensor import ( + disable_proxy_modes_tracing, + ProxyTorchDispatchMode, + track_tensor_tree, +) + +log = logging.getLogger("torch._dynamo") + + +############################################################################### +# Kernel Side Table + + +# We cannot put Triton Kernels into the FX graph as the graph nodes +# do not support arbitrary functions. +# Use a side table. +# We use two dicts so that fetching both the kernel and id are O(1) +class KernelSideTable: + id_to_kernel: Dict[int, Any] = dict() + kernel_to_id: Dict[Any, int] = dict() + lock = threading.Lock() + + # Returns index on the table + def add_kernel(self, kernel) -> int: + with self.lock: + if kernel in self.kernel_to_id: + return self.kernel_to_id[kernel] + + idx = len(self.id_to_kernel) + self.id_to_kernel[idx] = kernel + self.kernel_to_id[kernel] = idx + return idx + + # Returns the triton kernel at the given index + def get_kernel(self, idx: int): + # No need to lock here as fetching from dict is atomic + assert idx in self.id_to_kernel + return self.id_to_kernel[idx] + + # Resets the table (only meant to be used in unit tests) + # This is only safe assuming single threaded execution + def reset_table(self) -> None: + self.id_to_kernel = dict() + self.kernel_to_id = dict() + + +kernel_side_table = KernelSideTable() + + +############################################################################### +# Mutation Tracker + + +@dataclasses.dataclass(frozen=True) +class Param: + idx: int + + +@dataclasses.dataclass(frozen=True) +class Intermediate: + idx: int + + def fake(self): + return self.idx < 0 + + +@dataclasses.dataclass(frozen=True) +class Op: + name: str + fn_call_name: Optional[str] + args: List[Union[Param, Intermediate]] + ret: Intermediate = dataclasses.field(repr=False) + + def __post_init__(self): + if self.name == "tt.call": + assert self.fn_call_name is not None + else: + assert self.fn_call_name is None + + +def generate_ttir(kernel, kwargs): + """ + Uses Triton's internal code generation to create TTIR + """ + from triton.compiler.compiler import ASTSource + from triton.runtime.autotuner import Autotuner + from triton.runtime.jit import JITFunction + + import torch + from torch._subclasses.fake_tensor import FakeTensor 
+ + if isinstance(kernel, Autotuner): + if len(kernel.configs) > 0: + # If we are autotuning, then it doesn't matter which version gets + # picked for tracing purposes, so lets pick the first one + kwargs = {**kwargs, **kernel.configs[0].kwargs} + kernel = kernel.fn + + assert isinstance(kernel, JITFunction) + + if len(kwargs) != len(kernel.arg_names): + raise Exception("Incorrect number of arguments passed to kernel") + + # Replace all SymExprs with a regular value for TTIR generation + # Replace all FakeTensor with real tensors + # These replacements are needed for triton's type, key and config functions + ordered_args: Dict[str, Any] = {} + for name in kernel.arg_names: + a = kwargs[name] + if isinstance(a, (torch.SymInt, torch.SymFloat, torch.SymBool)): + ordered_args[name] = 2 + elif isinstance(a, FakeTensor): + ordered_args[name] = torch.empty(2, dtype=a.dtype) + else: + ordered_args[name] = a + + ordered_tensor_names = [ + name for name, arg in ordered_args.items() if isinstance(arg, Tensor) + ] + specialization = kernel._get_config(*ordered_args.values()) + constants = { + i: arg + for i, arg in enumerate(ordered_args.values()) + if not isinstance(arg, Tensor) + } + + # Build kernel signature -- doesn't include constexpr arguments. + signature = { + i: kernel._type_of(kernel._key_of(arg)) + for i, arg in enumerate(ordered_args.values()) + if i not in kernel.constexprs + } + + def get_backend(): + from triton.compiler.backends.cuda import CUDABackend + from triton.runtime.driver import driver + + target = driver.get_current_target() + return CUDABackend(target) + + backend = get_backend() + + options = backend.parse_options(dict()) + # triton._C.libtriton.triton.ir.load_dialects(context) + # backend.load_dialects(context) + + src = ASTSource(kernel, signature, constants, specialization) + ttir_module = src.make_ir(options) + if not ttir_module.verify(): + raise Exception("Verification for TTIR module has failed") + + return ttir_module, ordered_tensor_names + + +def ttir_to_functions(ttir_module) -> Dict[str, Dict[Intermediate, List[Op]]]: + """ + Walk the `ttir_module` bottom up to mine the `functions` from + the structured MLIR entities representing the Triton kernel + (mlir::Operation, mlir::Block, mlir::Region). 
+ """ + functions: Dict[str, Dict[Intermediate, List[Op]]] = {} + + # block id --> op result (Intermediate) --> one or more ops + op_stack: Dict[int, Dict[Intermediate, List[Op]]] = defaultdict( + lambda: defaultdict(list) + ) + region_id_to_block_ids: Dict[int, List[int]] = defaultdict(list) + block_id_to_block_arg_ids: Dict[int, List[int]] = {} + replacements: Dict[int, Union[Intermediate, Param]] = {} + reindex_map: Dict[int, int] = {} + next_fake_intermediate = 0 + + def reindex(idx): + if idx not in reindex_map: + reindex_map[idx] = len(reindex_map) + return reindex_map[idx] + + def mlir_to_functions(op) -> None: + name: str = op.get_name() + if name == "builtin.module": + # this wraps all tt.func ops + return + + operand_ids: List[int] = [ + reindex(op.get_operand(i).id()) for i in range(op.get_num_operands()) + ] + result_ids: List[int] = [ + reindex(op.get_result(i).id()) for i in range(op.get_num_results()) + ] + + child_block_ids: List[int] = [] + for i in [op.get_region(i).id() for i in range(op.get_num_regions())]: + # as the walk is bottom-up, the region_id_to_block_ids[i] + # must be populated by the time we process the enclosing op + child_block_ids.extend(region_id_to_block_ids[i]) + + parent_block_id = -1 + parent_block = op.get_block() + if parent_block is not None: + parent_block_id = parent_block.id() + if parent_block_id not in block_id_to_block_arg_ids: + block_id_to_block_arg_ids[parent_block_id] = [] + for i in range(parent_block.get_num_arguments()): + block_id_to_block_arg_ids[parent_block_id].append( + reindex(parent_block.get_argument(i).id()), + ) + # the region info is collected via ops' parent blocks to be + # used later when the region's encloding op is traversed + parent_region = parent_block.get_parent() + if parent_region is not None: + region_id_to_block_ids[parent_region.id()].append(parent_block_id) + + nonlocal next_fake_intermediate + + if name == "tt.func": + # for function ops: gather and inline + # the ops from all child blocks + fn_ops = defaultdict(list) + for child_block_id in child_block_ids: + for result, block_fn_ops in op_stack.pop(child_block_id).items(): + for block_fn_op in block_fn_ops: + fn_ops[result].append(block_fn_op) + + # replace the corresponding Intermediates in the + # child op args with the function args (Params) + for i, idx in enumerate(block_id_to_block_arg_ids[child_block_ids[0]]): + replacements[idx] = Param(i) + + for fn_op_list in fn_ops.values(): + for fn_op in fn_op_list: + for i in range(len(fn_op.args)): + arg = fn_op.args[i] + if isinstance(arg, Intermediate) and arg.idx in replacements: + fn_op.args[i] = replacements[arg.idx] + + # next function capture starts + # with empty replacements + replacements.clear() + + fn_name = op.get_str_attr("sym_name") + functions[fn_name] = fn_ops + elif child_block_ids: + if name in ("scf.if", "scf.for", "scf.while"): + # for blocked control flow ops: inline the enclosed + # ops into the parent block + rewire the last op in + # each child block (yield) to return the scf result + yield_ops = [] + for block_id in child_block_ids: + # the block args used as operands of the ops in the block + # (and nested blocks inlined in the current block by now) + # are replaced by new fake Intermediates to avoid "this + # operand is not returned by anything other op in the fn" + # error in the downstream analysis + for idx in block_id_to_block_arg_ids[block_id]: + next_fake_intermediate -= 1 + replacements[idx] = Intermediate(next_fake_intermediate) + + if block_id in op_stack: + block_ops = 
op_stack.pop(block_id) + if not block_ops: + continue + last_ret, last_ops = block_ops.popitem() + if all(op.name == "scf.yield" for op in last_ops): + # if last_ops are scf.yield, treat them separately + yield_ops.extend(last_ops) + else: + # otherwise, return last_ops to the block + block_ops[last_ret] = last_ops + for op_result, child_ops in block_ops.items(): + op_stack[parent_block_id][op_result].extend(child_ops) + + scf_results = [Intermediate(idx) for idx in result_ids] + for scf_result in scf_results: + for yield_op in yield_ops: + op_stack[parent_block_id][scf_result].append(yield_op) + else: + # TODO(oulgen): add support for tt.reduce + raise Exception( + f"Unknown blocked function: {name}. Can't capture the TTIR." + ) + else: + callee = None + if name == "tt.call": + callee = op.get_flat_symbol_ref_attr("callee") + args: List[Union[Param, Intermediate]] = [ + Intermediate(operand) for operand in operand_ids + ] + block_ops = op_stack[parent_block_id] + if result_ids: + for result_id in result_ids: + res = Intermediate(result_id) + block_ops[res].append(Op(name, callee, args, res)) + else: + next_fake_intermediate -= 1 + fake_res = Intermediate(next_fake_intermediate) + block_ops[fake_res].append(Op(name, callee, args, fake_res)) + + ttir_module.walk(mlir_to_functions) + + return functions + + +def parse_ttir(ttir, kwargs): + """ + Given a Triton emitted TTIR text, this function lexes and parses the + code using a minimal grammar defined inside. During the lexing/parsing, + we drop any constant value and type information as they are not + necessary to us. + Being able to choose what we need makes this not a general purpose TTIR + parser which further makes parsing much simpler. + """ + # TODO(oulgen): + # - Support closures (e.g. "tt.reduce") + + try: + import lark # type: ignore[import-not-found] + from lark import Lark, Transformer, v_args + except ModuleNotFoundError: + warnings.warn( + "Using slow path for user-defined Triton kernels. `pip install lark` to fix this." + ) + raise + + # Ops looks like one of the following forms: + # + # %14 = tt.addptr %13, %4 : tensor<4x!tt.ptr>, tensor<4xi32> + # tt.store %14, %12, %5 {cache = 1 : i32, evict = 1 : i32} : tensor<4xf32> + # %15 = "tt.atomic_rmw"(%14, %12, %5) <{atomic_rmw_op = 5 : i32, scope = 1 : i32, sem = 4 : i32}> : (tensor<4x!tt.ptr>, tensor<4xf32>, tensor<4xi1>) -> tensor<4xf32> # noqa: B950 + grammar = """ + start: (module_block | loc_line)+ + + loc_line: "#loc" /.+/ NEWLINE + + module_block: "module" "{" func_block+ "}" LOC + + func_block: "tt.func" ("public"|"private") FN_NAME "(" /.+/ NEWLINE stmt* "}" LOC -> process_func + + ?stmt: op | if | for | while | condition_stmt | label_stmt | cf_stmt + + if: [assign_lhs "="] "scf.if" args rest stmt* "}" "else" "{" stmt* "}" LOC -> process_if + for: [assign_lhs "="] "scf.for" args rest stmt* "}" divisibility_annot? LOC -> process_for + while: [assign_lhs "="] "scf.while" args rest stmt* "}" "do" "{" stmt* "}" LOC -> process_while + + condition_stmt: "scf.condition" "(" arg ")" args rest + label_stmt: LABEL ":" "// pred:" LABEL + | LABEL "(" /.+/ NEWLINE + cf_stmt: "cf" "." NAME /.+/ NEWLINE + + op: OP_NAME LOC + | [assign_lhs "="] OP_NAME [FN_NAME] args rest? -> process_op + + ?rest: (":" | "{" | "\\"" | "->" | "<" | "=") /.+/ NEWLINE + divisibility_annot: "{" "tt.divisibility_arg1" /[^}]+/ "}" + + args: | "(" ")" | "("? arg ("," arg)* ")"? 
+ + ?arg: INTERMEDIATE + | INTERMEDIATE_CONSTANT + | CONSTANT + | PARAM + | "[" args "]" + | arg_with_index + + ?arg_with_index: arg "#" DIGIT+ + + ?assign_lhs: (INTERMEDIATE | INTERMEDIATE_CONSTANT) [":" DIGIT+] + + PARAM.5: "%arg" DIGIT+ + INTERMEDIATE.4: "%" DIGIT+ + INTERMEDIATE_CONSTANT.3: "%" NAME + CONSTANT: FLOAT | DIGIT+ | NAME ("<" DIGIT+ ">")? + LABEL: "^bb" DIGIT+ + + NAME: (LETTER | DIGIT | "_")+ + NON_CF_NAME: /(?!(cf))/ NAME + FN_NAME: "@" (NAME | ESCAPED_STRING) + OP_NAME: "\\""? NON_CF_NAME ("." NAME)+ "\\""? + + LOC.5: "loc(#loc" DIGIT* ")" + + %import common.LETTER + %import common.DIGIT + %import common.WS + %import common.NEWLINE + %import common.ESCAPED_STRING + %import common.FLOAT + %ignore WS + """ + + next_fake_intermediate = 0 + + def convert(token): + if isinstance(token, lark.tree.Tree): + if token.data == "args": + res = [] + for a in token.children: + c = convert(a) + if isinstance(c, list): + res.extend(c) + else: + res.append(c) + return res + elif token.data in {"assign_lhs", "arg_with_index"}: + # Drop length/index qualifier + return convert(token.children[0]) + else: + raise AssertionError(f"Tree node with {token.data}") + + if token is None or ( + isinstance(token, lark.lexer.Token) + and token.type in ("CONSTANT", "INTERMEDIATE_CONSTANT") + ): + nonlocal next_fake_intermediate + next_fake_intermediate -= 1 + return Intermediate(next_fake_intermediate) + + assert isinstance(token, lark.lexer.Token) + + if token.type == "INTERMEDIATE": + return Intermediate(int(token.value[len("%") :])) + if token.type == "PARAM": + return Param(int(token.value[len("%arg") :])) + + raise AssertionError(f"{type(token.type)} => {token.value} invalid") + + # In alternative representation, function names are quoted. + # It should be possible to move this into the grammar alltogether. 
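# To make the parser's target concrete, a sketch of what the analysis records for
# the first example statement quoted in the grammar comment above:
#
#     %14 = tt.addptr %13, %4 : tensor<4x!tt.ptr>, tensor<4xi32>
#
# Type annotations are consumed by the `rest` rule and constants are replaced by
# fresh fake Intermediates, so only the SSA value ids survive. This assumes the
# Param/Intermediate/Op dataclasses defined earlier in this file are in scope.
ret = Intermediate(14)
addptr = Op("tt.addptr", None, [Intermediate(13), Intermediate(4)], ret)
# Per-function map (result -> producing ops) that analyze_kernel_mutations
# later walks backwards from the store/atomic sinks.
ops = {ret: [addptr]}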
+ def convert_name(token): + if token is None: + return None + s = token.value + if len(s) > 2 and s[0] == '"' and s[-1] == '"': + return s[1:-1] + return s + + functions: Dict[str, Dict[Intermediate, List[Op]]] = {} + + def extend_dict_list(d1, d2): + for key, values in d2.items(): + d1[key].extend(values) + + @v_args(inline=True) + class TransformOps(Transformer): + def process_op(self, ret, op_name, fn_name, args, *rest): + return Op( + convert_name(op_name), + convert_name(fn_name), + convert(args), + convert(ret), + ) + + def process_func(self, name, _args, *stmts): + ops: Dict[Intermediate, List[Op]] = defaultdict(list) + for e in stmts: + if isinstance(e, Op): + ops[e.ret].append(e) + elif isinstance(e, dict): + extend_dict_list(ops, e) + functions[name.value] = ops + + def _process_scf(self, ret, stmts): + ret = convert(ret) + ops: Dict[Intermediate, List[Op]] = defaultdict(list) + for e in stmts: + if isinstance(e, Op): + if e.name == "scf.yield": + ops[ret].append(Op(e.name, None, e.args, ret)) + else: + ops[e.ret].append(e) + elif isinstance(e, dict): + extend_dict_list(ops, e) + return ops + + def process_if(self, ret, _args, _rest, *stmts): + return self._process_scf(ret, stmts) + + def process_for(self, ret, _args, _rest, *stmts): + return self._process_scf(ret, stmts) + + def process_while(self, ret, _args, _rest, *stmts): + return self._process_scf(ret, stmts) + + parser = Lark( + grammar, parser="lalr", maybe_placeholders=True, transformer=TransformOps() + ) + parser.parse(ttir) + return functions + + +class MemoizeWithCycleCheck: + def __init__(self, fn): + self.fn = fn + self.reset() + + def __call__(self, functions, fn_name, num_args): + key = (fn_name, num_args) + if key not in self.cache: + self.cache[key] = None + self.cache[key] = self.fn(functions, fn_name, num_args) + if self.cache[key] is None: + raise Exception("Recursion is not supported") + return self.cache[key] + + def reset(self): + self.cache = {} + + +@MemoizeWithCycleCheck +def analyze_kernel_mutations(functions, fn_name, num_args): + """ + Analyzes the graph to detect all sinks from a predefined list of sinks + by using triton's MemWrite trait list. NOTE: What if triton exposed this? + From each sink, it traverses the CFG backwards to identify all the input + pointers that are mutated. + """ + # Name of mutation op to mutated parameter indices + # List from Triton Github include/triton/Dialect/Triton/IR/TritonOps.td + # All the OPs that have MemWrite trait. + # What if Triton exposed this? 
+ MUTATION_OPS = {"tt.store": [0], "tt.atomic_cas": [0], "tt.atomic_rmw": [0]} + # Ops that we want to bail out on + UNKNOWN_OPS = {"tt.elementwise_inline_asm"} + + stack: List[Union[Param, Intermediate]] = [] + visited = set() + ops = functions[fn_name] + for op_list in ops.values(): + for op in op_list: + if op.name in UNKNOWN_OPS: + raise Exception( + f"ttir analysis hit an op we do not know how to analyze: {op.name}" + ) + + if op.name == "tt.call": + assert op.fn_call_name in functions + mutations = analyze_kernel_mutations( + functions, op.fn_call_name, len(op.args) + ) + stack.extend(arg for arg, mutated in zip(op.args, mutations) if mutated) + else: + for idx in MUTATION_OPS.get(op.name, []): + stack.append(op.args[idx]) + + # The following is an iterative DFS algorithm + mutated = [False] * num_args + while stack: + arg = stack.pop() + if arg in visited: + continue + + visited.add(arg) + + if isinstance(arg, Param): + if arg.idx >= num_args: + # This is an argument defined in the kernel, not passed in + continue + mutated[arg.idx] = True + elif isinstance(arg, Intermediate) and not arg.fake(): + for op in ops[arg]: + # Skip arguments to load + if op.name != "tt.load": + stack.extend(op.args) + return mutated + + +def identify_mutated_tensors(kernel, kwargs): + """ + Given a triton kernel and the arguments for this kernel, this function + 1) Retrieves the TTIR converted version of the kernel from Triton's API. + 2) Parses the TTIR and creates a control flow graph + 3) Analyzes the graph to detect all input tensor mutations + """ + + ttir_module = None + functions = None + try: + from torch._dynamo import config + + if not config.optimize_user_defined_triton_kernels: + raise Exception("optimize_user_defined_triton_kernels is False") + + ttir_module, ordered_tensor_names = generate_ttir(kernel, kwargs) + + # extract functions from TTIR + if hasattr(ttir_module, "walk"): + # use MLIR bindings exposed by Triton code + functions = ttir_to_functions(ttir_module) + else: + # parse string representation of Triton IR + functions = parse_ttir(str(ttir_module), kwargs) + + assert functions is not None + kernel_name = next(iter(functions.keys())) + # Triton codegen modifies the name + assert kernel.fn.__name__ in kernel_name + # Reset the cache between top level invocations + # The cache for analyze kernel mutations is mainly used for cycle + # detection, so each top level invocation needs a clean cache + analyze_kernel_mutations.reset() + mutations = analyze_kernel_mutations( + functions, kernel_name, len(ordered_tensor_names) + ) + + return [ + ordered_tensor_names[i] for i, mutated in enumerate(mutations) if mutated + ] + except Exception as e: + import traceback + + warnings.warn( + "Encountered an exception in identify_mutated_tensors, " + "assuming every input is mutated:\n" + "".join( + traceback.TracebackException.from_exception(e).format() # noqa: G001 + ) + ) + if ttir_module is not None: + log.debug("TTIR:\n%s", str(ttir_module)) + if functions is not None: + log.debug("functions:") + for name, fn in functions.items(): + log.debug("===\t%s\t===", name) + for ret, ops in fn.items(): + log.debug("%s\t=>\t%s", ret, ops) + return [key for key, value in kwargs.items() if isinstance(value, Tensor)] + + +############################################################################### +# Triton Kernel Wrappers + + +# Used for wrapping a Triton Kernel +class TritonKernelWrapperMutation(HigherOrderOperator): + def __init__(self): + super().__init__("triton_kernel_wrapper_mutation") + + 
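# A hedged usage sketch of the pieces defined around this point: a kernel is
# registered once in the module-level kernel_side_table (so FX graphs only carry
# an integer index), and the mutation wrapper instantiated just below is called
# with that index, a grid list, and all kernel arguments passed by name.
# `my_add_kernel`, its parameter names, and the grid are illustrative only.
import torch

idx = kernel_side_table.add_kernel(my_add_kernel)  # stable, deduplicated index
assert kernel_side_table.get_kernel(idx) is my_add_kernel

x = torch.randn(1024, device="cuda")
y = torch.empty_like(x)
triton_kernel_wrapper_mutation(
    kernel_idx=idx,
    grid=[(x.numel() // 128,)],  # a single grid is used directly as the launch grid
    kwargs={"in_ptr": x, "out_ptr": y, "n_elements": x.numel(), "BLOCK_SIZE": 128},
)
# y is mutated in place; the functional wrapper defined further below instead
# clones and returns the tensors reported as mutated by identify_mutated_tensors.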
+triton_kernel_wrapper_mutation = TritonKernelWrapperMutation() + + +# Used for wrapping a Triton Kernel in a functional manner +class TritonKernelWrapperFunctional(HigherOrderOperator): + def __init__(self): + super().__init__("triton_kernel_wrapper_functional") + + +triton_kernel_wrapper_functional = TritonKernelWrapperFunctional() + + +@triton_kernel_wrapper_mutation.py_impl(DispatchKey.CompositeExplicitAutograd) +def triton_kernel_wrapper_mutation_dense(*, kernel_idx, grid, kwargs): + from torch._inductor.codegen.wrapper import user_defined_kernel_grid_fn_code + + kernel = kernel_side_table.get_kernel(kernel_idx) + + if len(grid) == 1: + grid_fn = grid[0] + else: + fn_name, code = user_defined_kernel_grid_fn_code( + kernel.fn.__name__, kernel.configs, grid + ) + namespace: Dict[str, Any] = {} + exec(code, namespace) + grid_fn = namespace[fn_name] + + kernel[grid_fn](**kwargs) + + +@triton_kernel_wrapper_mutation.py_impl(FakeTensorMode) +def triton_kernel_wrapper_mutation_fake_tensor_mode(mode, *, kernel_idx, grid, kwargs): + with mode: + return None + + +def trace_triton_kernel_wrapper(proxy_mode, func_overload, node_args): + with disable_proxy_modes_tracing(): + out = func_overload(**node_args) + + proxy_args = pytree.tree_map(proxy_mode.tracer.unwrap_proxy, node_args) + out_proxy = proxy_mode.tracer.create_proxy( + "call_function", + func_overload, + (), + proxy_args, + name=func_overload.__name__ + "_proxy", + ) + return track_tensor_tree(out, out_proxy, constant=None, tracer=proxy_mode.tracer) + + +@triton_kernel_wrapper_mutation.py_impl(ProxyTorchDispatchMode) +def triton_kernel_wrapper_mutation_proxy_torch_dispatch_mode( + mode, *, kernel_idx, grid, kwargs +): + if mode.enable_tracing: + trace_triton_kernel_wrapper( + mode, + triton_kernel_wrapper_mutation, + {"kernel_idx": kernel_idx, "grid": grid, "kwargs": kwargs}, + ) + else: + triton_kernel_wrapper_mutation(kernel_idx=kernel_idx, grid=grid, kwargs=kwargs) + + return None + + +@triton_kernel_wrapper_mutation.py_functionalize_impl +def triton_kernel_wrapper_mutation_functionalize(ctx, kernel_idx, grid, kwargs): + unwrapped_kwargs = ctx.unwrap_tensors(kwargs) + kernel = kernel_side_table.get_kernel(kernel_idx) + # TODO(oulgen): Preexisting bug, if two kernel inputs are views of each + # other, and one gets mutated in kernel, and later another gets mutated, + # they are no longer equal. Fix this by graph breaking on this condition + # earlier in dynamo. 
+ tensors_to_clone = identify_mutated_tensors(kernel, unwrapped_kwargs) + with ctx.redispatch_to_next(): + unwrapped_outputs = triton_kernel_wrapper_functional( + kernel_idx=kernel_idx, + grid=grid, + kwargs=unwrapped_kwargs, + tensors_to_clone=tensors_to_clone, + ) + + assert set(unwrapped_outputs.keys()).issubset(set(kwargs.keys())) + for key, output_arg in unwrapped_outputs.items(): + if not isinstance(output_arg, Tensor): + continue + input_arg = kwargs[key] + assert isinstance(input_arg, Tensor) + + ctx.replace(input_arg, output_arg) + # indicate that above replace is hidden from autograd + ctx.mark_mutation_hidden_from_autograd(input_arg) + ctx.commit_update(input_arg) + ctx.sync(input_arg) + # sync calls replace_ under the hood, so again indicate that + # this indirect replace is hidden from autograd + ctx.mark_mutation_hidden_from_autograd(input_arg) + return None + + +@triton_kernel_wrapper_functional.py_impl(DispatchKey.CompositeExplicitAutograd) +def triton_kernel_wrapper_functional_dense( + *, kernel_idx, grid, kwargs, tensors_to_clone +): + # TODO(oulgen): For performance reasons, we want to ensure that these + # `clone_preserve_strides` calls are never executed at runtime + # (inductor should always optimize them away). + # Requires https://github.com/pytorch/pytorch/issues/109240 + kwargs = { + key: (clone_preserve_strides(val) if key in tensors_to_clone else val) + for key, val in kwargs.items() + } + triton_kernel_wrapper_mutation(kernel_idx=kernel_idx, grid=grid, kwargs=kwargs) + return {key: val for key, val in kwargs.items() if key in tensors_to_clone} + + +@triton_kernel_wrapper_functional.py_impl(FakeTensorMode) +def triton_kernel_wrapper_functional_fake_tensor_mode( + mode, *, kernel_idx, grid, kwargs, tensors_to_clone +): + # TODO(oulgen): For performance reasons, we want to ensure that these + # `clone_preserve_strides` calls are never executed at runtime + # (inductor should always optimize them away). 
+ # Requires https://github.com/pytorch/pytorch/issues/109240 + with mode: + return { + key: clone_preserve_strides(val) + for key, val in kwargs.items() + if key in tensors_to_clone + } + + +@triton_kernel_wrapper_functional.py_impl(ProxyTorchDispatchMode) +def triton_kernel_wrapper_functional_proxy_torch_dispatch_mode( + mode, *, kernel_idx, grid, kwargs, tensors_to_clone +): + if mode.enable_tracing: + return trace_triton_kernel_wrapper( + mode, + triton_kernel_wrapper_functional, + { + "kernel_idx": kernel_idx, + "grid": grid, + "kwargs": kwargs, + "tensors_to_clone": tensors_to_clone, + }, + ) + else: + return triton_kernel_wrapper_functional( + kernel_idx=kernel_idx, + grid=grid, + kwargs=kwargs, + tensors_to_clone=tensors_to_clone, + ) + + +@triton_kernel_wrapper_functional.py_functionalize_impl +def triton_kernel_wrapper_functional_functionalize( + ctx, kernel_idx, grid, kwargs, tensors_to_clone +): + unwrapped_kwargs = ctx.unwrap_tensors(kwargs) + with ctx.redispatch_to_next(): + outputs = triton_kernel_wrapper_functional( + kernel_idx=kernel_idx, + grid=grid, + kwargs=unwrapped_kwargs, + tensors_to_clone=tensors_to_clone, + ) + return ctx.wrap_tensors(outputs) + + +triton_kernel_wrapper_mutation.fallthrough(DispatchKey.PythonDispatcher) # type: ignore[attr-defined] +triton_kernel_wrapper_mutation.fallthrough(DispatchKey.PythonTLSSnapshot) # type: ignore[attr-defined] +triton_kernel_wrapper_mutation.fallthrough(DispatchKey.ADInplaceOrView) +triton_kernel_wrapper_mutation.fallthrough(DispatchKey.BackendSelect) +triton_kernel_wrapper_mutation.fallthrough(DispatchKey.AutocastCPU) # type: ignore[attr-defined] +triton_kernel_wrapper_mutation.fallthrough(DispatchKey.AutocastCUDA) # type: ignore[attr-defined] +triton_kernel_wrapper_mutation.fallthrough(DispatchKey.AutogradCUDA) +triton_kernel_wrapper_mutation.fallthrough(DispatchKey.AutogradCPU) + +triton_kernel_wrapper_functional.fallthrough(DispatchKey.PythonDispatcher) # type: ignore[attr-defined] +triton_kernel_wrapper_functional.fallthrough(DispatchKey.PythonTLSSnapshot) # type: ignore[attr-defined] +triton_kernel_wrapper_functional.fallthrough(DispatchKey.ADInplaceOrView) +triton_kernel_wrapper_functional.fallthrough(DispatchKey.BackendSelect) +triton_kernel_wrapper_functional.fallthrough(DispatchKey.AutocastCPU) # type: ignore[attr-defined] +triton_kernel_wrapper_functional.fallthrough(DispatchKey.AutocastCUDA) # type: ignore[attr-defined] +triton_kernel_wrapper_functional.fallthrough(DispatchKey.AutogradCUDA) +triton_kernel_wrapper_functional.fallthrough(DispatchKey.AutogradCUDA) +triton_kernel_wrapper_functional.fallthrough(DispatchKey.AutogradCPU) diff --git a/MLPY/Lib/site-packages/torch/_higher_order_ops/utils.py b/MLPY/Lib/site-packages/torch/_higher_order_ops/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..8d8673fa6e06981c5f1a2a1c303c0a789af64859 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_higher_order_ops/utils.py @@ -0,0 +1,183 @@ +from contextlib import contextmanager +from dataclasses import dataclass +from typing import Any, Callable + +import torch +import torch.fx.traceback as fx_traceback +import torch.utils._pytree as pytree +from torch._ops import HigherOrderOperator +from torch.fx.experimental.proxy_tensor import make_fx +from torch.multiprocessing.reductions import StorageWeakRef + + +@dataclass +class UnsupportedAliasMutationException(RuntimeError): + reason: str + + +def autograd_not_implemented_inner( + operator: HigherOrderOperator, delayed_error: bool, *args: Any, 
**kwargs: Any +) -> Any: + """If autograd is enabled and any of the arguments require grad this will either + raise an error or return a DelayedError depending on the value of delayed. + + Args: + operator: The HigherOrderOperator to call with the *args and **kwargs with + op_name: The name of the HigherOrderOperator + delayed_error: If True, return a DelayedError instead of raising an error + args: The flattened operands to the HigherOrderOperator + kwargs: The keyword arguments to the HigherOrderOperator + + Raises: + RuntimeError: If autograd is enabled and any of the arguments to the HigherOrderOperator + """ + with torch._C._AutoDispatchBelowAutograd(): + result = operator(*args, **kwargs) + flat_operands = pytree.arg_tree_leaves(*args) + if torch.is_grad_enabled() and any( + f.requires_grad for f in flat_operands if isinstance(f, torch.Tensor) + ): + if delayed_error: + err_fn = torch._C._functions.DelayedError( + f"Autograd not implemented for {str(operator)}", + 1, + ) + + def fake_requires_grad(tensor): + if torch.is_floating_point(tensor) or torch.is_complex(tensor): + tensor = tensor.detach() + tensor.requires_grad = True + return tensor + + return pytree.tree_map_only( + torch.Tensor, lambda x: err_fn(fake_requires_grad(x)), result + ) + else: + raise RuntimeError(f"Autograd not implemented for {str(operator)}") + return result + + +def autograd_not_implemented(op: HigherOrderOperator, deferred_error: bool) -> Callable: + def inner(*args, **kwargs): + return autograd_not_implemented_inner(op, deferred_error, *args, **kwargs) + + return inner + + +def _maybe_run_with_interpreter(fn): + maybe_interpreted_fn = fn + if isinstance(fn, torch.fx.GraphModule) and fx_traceback.has_preserved_node_meta(): + # Running graph with interpreter is needed for propagating the stack_trace + def graph_with_interpreter(*args): + with fx_traceback.preserve_node_meta(): + return torch.fx.Interpreter(fn).run(*args) + + maybe_interpreted_fn = graph_with_interpreter + return maybe_interpreted_fn + + +# We'll use the current decomposition table to make sure operators in subgraphs are +# decomposed properly. +# We also need to maybe run with interpreter for propagating stack_trace +def reenter_make_fx(fn, pre_dispatch=False): + decomp_table = torch.fx.experimental.proxy_tensor.CURRENT_DECOMPOSITION_TABLE + return make_fx( + _maybe_run_with_interpreter(fn), + decomposition_table=decomp_table, + pre_dispatch=pre_dispatch, + ) + + +@contextmanager +def _set_compilation_env(): + _old_is_tracing = torch.fx._symbolic_trace._is_fx_tracing_flag + try: + # We need to turn off the is_fx_tracing_flag. Remove this flag check from dyanmo + # once we are confident fx tracing works with dynamo. + torch.fx._symbolic_trace._is_fx_tracing_flag = False + yield + finally: + torch.fx._symbolic_trace._is_fx_tracing_flag = _old_is_tracing + + +def _has_potential_branch_input_mutation(branch, inputs, pre_dispatch=False): + """ + Dispatch-trace the branch with inputs and check if + producing graph has mutable op on the input. This is + bit restrictive as the branch must be traceable. 
+ """ + try: + gm = make_fx(branch, pre_dispatch=pre_dispatch)(*inputs) + except UnsupportedAliasMutationException: + # this can happen when nested cond_op is + # functionalized + return True + except Exception as e: + raise e + + def _detect_input_mutation(gm): + input_nodes = set() + for node in gm.graph.nodes: + if node.op == "placeholder": + input_nodes.add(node) + if node.op == "call_function": + target = node.target + if ( + isinstance(target, torch._ops.OpOverload) + and target._schema.is_mutable + ): + for arg in node.args: + if arg in input_nodes: + return True + + for _, module in gm.named_children(): + if isinstance(module, torch.fx.GraphModule): + if _detect_input_mutation(module): + return True + + return False + + return _detect_input_mutation(gm) + + +def _has_potential_branch_input_alias(branch, inputs, pre_dispatch=False): + """ + Dispatch-trace the branch with inputs and check if + producing graph has output aliasing the branch input. This is + bit restrictive as the branch must be traceable. + """ + try: + gm = make_fx(branch, pre_dispatch=pre_dispatch)(*inputs) + except UnsupportedAliasMutationException: + # this can happen when nested cond_op is + # functionalized + return True + except Exception as e: + raise e + + def _detect_input_alias(gm): + input_storages = set() + for node in gm.graph.nodes: + # We need to check existence of "val" because we reuse the logic here + # for map operator, where num_mapped_args is a scalar + # and doesn't have a "val" meta. + if node.op == "placeholder" and "val" in node.meta: + input_storages.add(StorageWeakRef(node.meta["val"]._typed_storage())) + if node.op == "output": + + def check_alias(out): + if out is not None and "val" in out.meta: + out_storage = StorageWeakRef(out.meta["val"]._typed_storage()) + return out_storage in input_storages + return False + + if any(pytree.tree_leaves(pytree.tree_map(check_alias, node.args))): + return True + + for _, module in gm.named_children(): + if isinstance(module, torch.fx.GraphModule) and _detect_input_alias(module): + return True + + return False + + return _detect_input_alias(gm) diff --git a/MLPY/Lib/site-packages/torch/_higher_order_ops/while_loop.py b/MLPY/Lib/site-packages/torch/_higher_order_ops/while_loop.py new file mode 100644 index 0000000000000000000000000000000000000000..441560ee191f2cbf83781f32950df697cab555f3 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_higher_order_ops/while_loop.py @@ -0,0 +1,232 @@ +import torch +import torch.utils._pytree as pytree + +from torch._C import DispatchKey + +from torch._higher_order_ops.utils import ( + _has_potential_branch_input_alias, + _has_potential_branch_input_mutation, + _set_compilation_env, + autograd_not_implemented, + reenter_make_fx, + UnsupportedAliasMutationException, +) +from torch._ops import HigherOrderOperator +from torch._subclasses.fake_tensor import FakeTensorMode +from torch.fx.experimental.proxy_tensor import ( + disable_proxy_modes_tracing, + ProxyTorchDispatchMode, + track_tensor_tree, +) + + +class WhileLoopOp(HigherOrderOperator): + def __call__(self, cond_fn, body_fn, operands): + if not isinstance(cond_fn, torch.fx.GraphModule) or not isinstance( + body_fn, torch.fx.GraphModule + ): + raise RuntimeError( + "cond_fn and body_fn must be torch.fx.GraphModule, got " + f"{type(cond_fn)} and {type(body_fn)}" + ) + if not isinstance(operands, tuple): + raise RuntimeError("operands must be a tuple, got " f"{type(operands)}") + if not all(isinstance(t, (torch.Tensor, int, float, bool)) for t in operands): + raise 
RuntimeError( + "operands must be a tuple of tensors, ints, floats, or bools, got " + f"{operands}" + ) + return super().__call__(cond_fn, body_fn, operands) + + +while_loop_op = HigherOrderOperator("while_loop") + + +def while_loop(cond_fn, body_fn, operands): + r""" + Run body_fn(*operands) while cond_fn(*operands) returns a True scalar tensor. Returns the output of body_fn or + initial operands. + + .. warning:: + `torch.while_loop` is a prototype feature in PyTorch. It has limited support for input and output types and + doesn't support training currently. Please look forward to a more stable implementation in a future version of PyTorch. + Read more about feature classification at: https://pytorch.org/blog/pytorch-feature-classification-changes/#prototype + + `while_loop` is a structured control flow operator. It preserves the loop semantic across the torch.compile and torch.export. + + `while_loop` is equivalent to the following: + + def while_loop(cond_fn, body_fn, operands): + val = operands + while cond_fn(*val): + val = body_fn(*val) + return val + + Args: + cond_fn (Callable): A callable function that returns a boolean Scalar tensor. + + body_fn (Callable): A callable function that takes the same inputs as `cond_fn` and returns a tuple of tensors + + operands (Tuple of possibly nested dict/list/tuple of tensors): A tuple of inputs to cond_fn and body_fn. It's also + the initial value of states that are carried across iterations. + + Example: + + def cond_fn(iter, x): + return iter.sum() < 10 + + def body_fn(iter, x): + return iter + 1, x.sin() + + while_loop(cond_fn, body_fn, (torch.zeros(1), torch.randn(3, 4))) + + Restrictions: + + - body_fn must return tensors with the same metadata (e.g.shape, dtype) as inputs. + + - body_fn and cond_fn must not in-place mutate the operands. A clone before the mutation is required. + + - body_fn and cond_fn must not mutate python varialbles (e.g. list/dict) created outside of the body_fn. + + - body_fn and cond_fn's output cannot aliase any of the inputs. A clone is required. + + .. warning:: + Temporal Limitations: + + - 'while_loop' only supports **inference** right now. Autograd will be supported in the future. + + """ + if torch.compiler.is_dynamo_compiling(): + return while_loop_op(cond_fn, body_fn, operands) + + def _validate_input(cond_fn, body_fn, operands): + if not callable(cond_fn) or not callable(body_fn): + raise RuntimeError("Expect cond_fn and body_fn to be callbale.") + + if not isinstance(operands, (tuple, list)) or pytree.tree_any( + lambda t: not isinstance(t, torch.Tensor), operands + ): + raise RuntimeError( + "Expect operands to be a tuple of possibly nested dict/list/tuple that only" + f"consists of tensor leaves, but got {operands}." 
+ ) + + _validate_input(cond_fn, body_fn, operands) + + with _set_compilation_env(), torch._dynamo.utils.disable_cache_limit(): + return torch.compile(while_loop_op, backend="eager", fullgraph=True)( + cond_fn, body_fn, operands + ) + + +@while_loop_op.py_impl(DispatchKey.CompositeExplicitAutograd) +def while_loop_dense(cond_fn, body_fn, operands): + init_val = operands + + def _is_boolean_scalar_tensor(pred): + return ( + isinstance(pred, torch.Tensor) + and pred.size() == torch.Size([]) + and pred.dtype == torch.bool + ) + + if not isinstance(operands, tuple): + raise RuntimeError(f"operands must be a tuple but got {type(operands)}") + + while pred := cond_fn(*init_val): + if not _is_boolean_scalar_tensor(pred): + raise RuntimeError( + f"cond_fn must return a boolean scalar tensor but got {pred}" + ) + out = body_fn(*init_val) + assert isinstance( + out, tuple + ), f"body_fn should return a tuple but got {type(out)}" + assert len(out) == len( + init_val + ), "body_fn should return the same number of elements as operands" + init_val = out + return init_val + + +while_loop_op.py_impl(DispatchKey.Autograd)( + autograd_not_implemented(while_loop_op, deferred_error=True) +) + + +@while_loop_op.py_impl(ProxyTorchDispatchMode) +def while_loop_tracing(mode, cond_fn, body_fn, operands): + def _trace_while_loop(proxy_mode, while_loop_op, cond_fn, body_fn, operands): + pre_dispatch = getattr(proxy_mode, "pre_dispatch", False) + with disable_proxy_modes_tracing(): + cond_graph = reenter_make_fx(cond_fn, pre_dispatch)(*operands) + body_graph = reenter_make_fx(body_fn, pre_dispatch)(*operands) + + next_name = None + i = 0 + while not next_name: + candidate = f"while_loop_cond_graph_{i}" + if hasattr(proxy_mode.tracer.root, candidate): + i += 1 + else: + next_name = candidate + cond_graph_name = next_name + body_graph_name = f"while_loop_body_graph_{i}" + assert not hasattr(proxy_mode.tracer.root, body_graph_name) + + proxy_mode.tracer.root.register_module(cond_graph_name, cond_graph) + proxy_mode.tracer.root.register_module(body_graph_name, body_graph) + + args = (cond_graph, body_graph, operands) + + proxy_args = pytree.tree_map(proxy_mode.tracer.unwrap_proxy, args) + + out_proxy = proxy_mode.tracer.create_proxy( + "call_function", while_loop_op, proxy_args, {}, name="while_loop" + ) + + # body_fn return output with the same pytree and tensor meta data as operands + # so we could just return the output after one iteration. + out = body_fn(*operands) + return track_tensor_tree( + out, out_proxy, constant=None, tracer=proxy_mode.tracer + ) + + if mode.enable_tracing: + return _trace_while_loop(mode, while_loop_op, cond_fn, body_fn, operands) + else: + return while_loop_op(cond_fn, body_fn, operands) + + +@while_loop_op.py_impl(FakeTensorMode) +def while_loop_fake_tensor_mode(mode, cond_fn, body_fn, operands): + return body_fn(*operands) + + +@while_loop_op.py_functionalize_impl +def while_loop_func(ctx, cond_fn, body_fn, operands): + unwrapped_operands = ctx.unwrap_tensors(operands) + with ctx.redispatch_to_next() as m: + functional_cond_fn = ctx.functionalize(cond_fn) + functional_body_fn = ctx.functionalize(body_fn) + pre_dispatch = hasattr(ctx, "mode") and ctx.mode.pre_dispatch + for fn, fn_name in [ + (functional_cond_fn, "cond_fn"), + (functional_body_fn, "body_fn"), + ]: + if _has_potential_branch_input_mutation( + fn, unwrapped_operands, pre_dispatch=pre_dispatch + ): + raise UnsupportedAliasMutationException( + f"torch.while_loop's {fn_name} might be modifying the input!" 
+ ) + + for fn in [functional_cond_fn, functional_body_fn]: + if _has_potential_branch_input_alias( + fn, unwrapped_operands, pre_dispatch=pre_dispatch + ): + raise UnsupportedAliasMutationException( + f"torch.while_loop's {fn_name} might be aliasing the input!" + ) + ret = while_loop_op(functional_cond_fn, functional_body_fn, unwrapped_operands) + return ctx.wrap_tensors(ret) diff --git a/MLPY/Lib/site-packages/torch/_higher_order_ops/wrap.py b/MLPY/Lib/site-packages/torch/_higher_order_ops/wrap.py new file mode 100644 index 0000000000000000000000000000000000000000..949c24f2abe4700a1d975f4def130f9fd04b2996 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_higher_order_ops/wrap.py @@ -0,0 +1,183 @@ +import inspect +import logging + +import torch +from torch._ops import HigherOrderOperator +from torch.utils.checkpoint import checkpoint, uid +import torch._dynamo.config + +log = logging.getLogger(__name__) + + + +# Used for testing the HigherOrderOperator mechanism +class Wrap(HigherOrderOperator): + def __init__(self): + super().__init__("wrap") + + def __call__(self, func, *args, **kwargs): + # Dynamo already traces the body of HigherOrderOp beforehand when it + # so no need to trace into it. + import torch._dynamo # noqa: F401 + from torch._dynamo import disable + + @disable + def wrapper(): + result = func(*args, **kwargs) + return result + + return wrapper() + +wrap = Wrap() + +class WrapWithSetGradEnabled(HigherOrderOperator): + def __init__(self): + super().__init__("wrap_with_set_grad_enabled") + + def __call__(self, enable_grad, wrapped_func, *args, **kwargs): + # Dynamo already traces the body of HigherOrderOp beforehand when it + # so no need to trace into it. + import torch._dynamo # noqa: F401 + from torch._dynamo import disable + + @disable + def wrapper(): + with torch.set_grad_enabled(enable_grad): + return wrapped_func(*args, **kwargs) + return wrapper() + +wrap_with_set_grad_enabled = WrapWithSetGradEnabled() + +class WrapActivationCheckpoint(HigherOrderOperator): + """ + This operator is used to wrap torch.utils.checkpoint. This avoids + TorchDynamo to look into saved tensor hooks and directly passes the control + to AOT Autograd, which is ok with tracing saved tensor hooks. As a result of + AOT tracing torch.utils.checkpoint code, we have a backward graph with + recomputed forward nodes. + + However, we might deprecate this operator soon. The difficulty arises in the + functionalization of rng ops. Today, there are two different + functionalization of rng ops - one at AOT autograd and other at Inductor. + And they are difficult to map to each other. The rng states also complicate + pattern matching in Inductor. Due to the ease of implementation, we are + currently inclined towards functionalization at Inductor level, which means + that duplication/recomputation is done as a compiler pass in the + partitioners. See TagActivationCheckpoint for more information. + """ + def __init__(self): + super().__init__("wrap_activation_checkpoint") + + def __call__(self, function, *args, **kwargs): + # use_reentrant is set to False because this op is going to be traced. + # And we ensure that AOT Autograd traces through the non reentrant + # version of checkpointing. + import torch.fx.traceback as fx_traceback + from torch.fx import Interpreter + kwargs["use_reentrant"] = False + kwargs["preserve_rng_state"] = False + # Using interpreter allows preservation of metadata through torch.compile stack. 
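# Editor's sketch (not part of the patch): plain eager use of the non-reentrant
# checkpoint API that WrapActivationCheckpoint routes through. `block` is a made-up
# function for illustration; its intermediate activations are recomputed during
# backward instead of being saved.
import torch
from torch.utils.checkpoint import checkpoint

def block(x):
    return torch.relu(x @ x.t())

x = torch.randn(8, 8, requires_grad=True)
y = checkpoint(block, x, use_reentrant=False)
y.sum().backward()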
+ with fx_traceback.preserve_node_meta(): + return checkpoint(Interpreter(function).run, *args, **kwargs) + +wrap_activation_checkpoint = WrapActivationCheckpoint() + +class TagActivationCheckpoint(HigherOrderOperator): + """ + This operator is supposed to be used only with torch.compile stack. This + accepts a Fx graph module which needs to be checkpointed. This operator adds + "recomputable" tag to the nodes of the Fx graph that should be recomputed. + + The goal is to: + 1. Avoid using Dynamo to trace through saved tensor hooks. + 2. For selective checkpointing case, let AOTAutograd trace through + saved tensor hooks but has special logic with TorchDispatchMode to override + the usual saved_tensor_hooks fn logic in order to tag the nodes. + 3. Rely on the partitioners to actually duplicate the nodes. + This sits well in the torch.compile stack, because by the time graph + reaches partitioner, inductor has already run its functionalization of rng + ops (by setting fixed seed for each random op, see `replace_random_passes`). + Therefore, the duplication of nodes, by design, respects the rng states in + the forward and recomputed forward in backward. + """ + + def __init__(self): + super().__init__("tag_activation_checkpoint") + + @staticmethod + def divide_kwargs(kwargs): + """ + checkpoint fn can have mixed kwargs between checkpointed fn and + checkpoint fn itself. For example + >> def gn(x, y, z=None): + >> a = torch.matmul(x, y) + >> if z is not None: + >> return torch.matmul(a, z) + >> return a + >> def fn(x, y, z): + >> return torch.cos(checkpoint(gn, x, y, use_reentrant=False, z=z)) + In the above case, z belongs to checkpointed function gn, but + use_reentrant belongs to the checkpoint function. This function splits + the kwargs into checkpoint_kwargs and gmod_kwargs (or + checkpointed_fn_kwargs). + We do sorting to ensure same graph from run to run for better + debuggability. It is not required for correctness. + """ + ckpt_signature = inspect.signature(checkpoint) + checkpoint_keys = set() + for name in ckpt_signature.parameters: + if name in ("function", "args", "kwargs"): + continue + checkpoint_keys.add(name) + + # `preserve_rng_state` is not a regular kwarg + checkpoint_keys.add("preserve_rng_state") + + checkpoint_kwargs = {name: kwargs[name] for name in kwargs.keys() if name in checkpoint_keys} + gmod_kwargs = {name: kwargs[name] for name in kwargs.keys() if name not in checkpoint_keys} + return checkpoint_kwargs, gmod_kwargs + + def tag_nodes(self, gmod): + unique_graph_id = next(uid) + for node in gmod.graph.nodes: + if node.op in ("call_function", "call_method", "call_module"): + node.meta["recompute"] = unique_graph_id + return gmod + + def __call__(self, gmod, *args, **kwargs): + import torch.fx.traceback as fx_traceback + from torch.fx import Interpreter + if "_checkpoint_context_fn" in gmod.meta: + assert torch._dynamo.config._experimental_support_context_fn_in_torch_utils_checkpoint, \ + "Passing context_fn to torch.utils.checkpoint is currently not supported under torch.compile" + log.warning(""" +Detected that context_fn is passed to torch.utils.checkpoint under torch.compile. +Please make sure the checkpointed region does not contain in-place ops (e.g. torch.relu_). +""") + # use_reentrant is set to False because this op is going to be traced. + # And we ensure that AOT Autograd traces through the non reentrant + # version of checkpointing. 
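# Editor's sketch (not part of the patch): the kwargs split performed by divide_kwargs
# above, shown standalone -- names appearing in torch.utils.checkpoint.checkpoint's
# signature go to the checkpoint call, everything else to the checkpointed function.
# `mixed` is a hypothetical kwargs dict.
import inspect
from torch.utils.checkpoint import checkpoint

ckpt_names = {
    name
    for name in inspect.signature(checkpoint).parameters
    if name not in ("function", "args", "kwargs")
} | {"preserve_rng_state"}

mixed = {"use_reentrant": False, "z": 3}
checkpoint_kwargs = {k: v for k, v in mixed.items() if k in ckpt_names}
fn_kwargs = {k: v for k, v in mixed.items() if k not in ckpt_names}
# Expected split: {'use_reentrant': False} vs. {'z': 3}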
+ kwargs["use_reentrant"] = False + # preserve_rng_state is set to False because we want to prevent AOTAutograd from tracing through + # `torch.random.fork_rng` op (which is not supported yet under CUDA). + # This doesn't mean that we don't preserve RNG state. Instead, we will always preserve RNG state + # regardless of this flag (by doing RNG functionalization via `replace_random_passes` in Inductor + # instead of in AOTAutograd). + kwargs["preserve_rng_state"] = False + kwargs["context_fn"] = gmod.meta["_checkpoint_context_fn"] + # We first tag all nodes as "recompute" in this graph, and then we undo the "recompute" tag + # for specific nodes in _CachingTorchDispatchMode in torch/utils/checkpoint.py. + gmod = self.tag_nodes(gmod) + # Using interpreter allows preservation of metadata through torch.compile stack. + with fx_traceback.preserve_node_meta(): + return checkpoint(Interpreter(gmod).run, *args, **kwargs) + else: + gmod = self.tag_nodes(gmod) + # Using interpreter allows preservation of metadata through torch.compile stack. + # TODO: We want to use the same `checkpoint(Interpreter(gmod).run, *args, **kwargs)` here + # as the `context_fn != None` case, but that depends on in-place op support in TorchDispatchMode + torch.compile. + # (for details on in-place op issue, run `test_compile_selective_checkpoint_inplace_op` unit test) + with fx_traceback.preserve_node_meta(): + return Interpreter(gmod).run(*args) + +tag_activation_checkpoint = TagActivationCheckpoint() diff --git a/MLPY/Lib/site-packages/torch/_inductor/__init__.py b/MLPY/Lib/site-packages/torch/_inductor/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..29d0966ca407eee3107feb137924ca246c8bf8d0 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/__init__.py @@ -0,0 +1,150 @@ +from typing import Any, Dict, List, Optional + +import torch.fx +import torch.utils._pytree as pytree + +__all__ = ["compile", "list_mode_options", "list_options", "cudagraph_mark_step_begin"] + + +def compile( + gm: torch.fx.GraphModule, + example_inputs: List[torch.Tensor], + options: Optional[Dict[str, Any]] = None, +): + """ + Compile a given FX graph with TorchInductor. This allows compiling + FX graphs captured without using TorchDynamo. + + Args: + gm: The FX graph to compile. + example_inputs: List of tensor inputs. + options: Optional dict of config options. See `torch._inductor.config`. + + Returns: + Callable with same behavior as gm but faster. + """ + from .compile_fx import compile_fx + + return compile_fx(gm, example_inputs, config_patches=options) + + +def aot_compile( + gm: torch.fx.GraphModule, + example_inputs: List[torch.Tensor], + options: Optional[Dict[str, Any]] = None, +) -> str: + """ + Ahead-of-time compile a given FX graph with TorchInductor into a shared library. + + Args: + gm: The FX graph to compile. + example_inputs: List of tensor inputs. + options: Optional dict of config options. See `torch._inductor.config`. 
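# Editor's sketch (not part of the patch): compiling a standalone FX graph with the
# `compile` entry point defined above, using make_fx to capture the graph without
# TorchDynamo. Requires a working Inductor toolchain at runtime; `f` is a made-up
# function and the calling convention of the result is assumed to match the graph's.
import torch
import torch._inductor
from torch.fx.experimental.proxy_tensor import make_fx

def f(x):
    return torch.sin(x) + 1

example = torch.randn(4)
gm = make_fx(f)(example)
compiled = torch._inductor.compile(gm, [example])
out = compiled(torch.randn(4))  # expected to behave like gm, potentially faster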
+ + Returns: + Path to the generated shared library + """ + from .compile_fx import compile_fx_aot + + # We will serialize the pytree info into the .so as constant strings + in_spec = None + out_spec = None + if isinstance(gm.graph._codegen, torch.fx.graph._PyTreeCodeGen): + codegen = gm.graph._codegen + gm.graph._codegen = torch.fx.graph.CodeGen() + gm.recompile() + + if codegen.pytree_info.in_spec is not None: + in_spec = codegen.pytree_info.in_spec + if codegen.pytree_info.out_spec is not None: + out_spec = codegen.pytree_info.out_spec + + else: + if hasattr(gm, "_in_spec"): + in_spec = gm._in_spec + if hasattr(gm, "_out_spec"): + out_spec = gm._out_spec + + serialized_in_spec = pytree.treespec_dumps(in_spec) if in_spec is not None else "" + serialized_out_spec = ( + pytree.treespec_dumps(out_spec) if out_spec is not None else "" + ) + + options = ( + { + "aot_inductor.serialized_in_spec": serialized_in_spec, + "aot_inductor.serialized_out_spec": serialized_out_spec, + } + if options is None + else { + **options, + "aot_inductor.serialized_in_spec": serialized_in_spec, + "aot_inductor.serialized_out_spec": serialized_out_spec, + } + ) + + return compile_fx_aot( + gm, + example_inputs, + config_patches=options, + ) + + +def list_mode_options( + mode: Optional[str] = None, dynamic: Optional[bool] = None +) -> Dict[str, Any]: + r"""Returns a dictionary describing the optimizations that each of the available + modes passed to `torch.compile()` performs. + + Args: + mode (str, optional): The mode to return the optimizations for. + If None, returns optimizations for all modes + dynamic (bool, optional): Whether dynamic shape is enabled. + + Example:: + >>> torch._inductor.list_mode_options() + """ + + mode_options: Dict[str, Dict[str, bool]] = { + "default": {}, + # enable cudagraphs + "reduce-overhead": { + "triton.cudagraphs": True, + }, + # enable max-autotune + "max-autotune-no-cudagraphs": { + "max_autotune": True, + }, + # enable max-autotune + # enable cudagraphs + "max-autotune": { + "max_autotune": True, + "triton.cudagraphs": True, + }, + } + return mode_options[mode] if mode else mode_options # type: ignore[return-value] + + +def list_options() -> List[str]: + r"""Returns a dictionary describing the optimizations and debug configurations + that are available to `torch.compile()`. + + The options are documented in `torch._inductor.config`. + + Example:: + + >>> torch._inductor.list_options() + """ + + from torch._inductor import config + + current_config: Dict[str, Any] = config.shallow_copy_dict() + + return list(current_config.keys()) + + +def cudagraph_mark_step_begin(): + "Indicates that a new iteration of inference or training is about to begin." 
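# Editor's sketch (not part of the patch): the mode table above is what torch.compile's
# `mode=` strings expand to, and the same dictionaries can be passed explicitly via
# `options=`. Illustrative only.
import torch
import torch._inductor

opts = torch._inductor.list_mode_options("max-autotune")
# expected: {'max_autotune': True, 'triton.cudagraphs': True}
fn = torch.compile(lambda x: x * 2, options=opts)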
+ from .cudagraph_trees import mark_step_begin + + mark_step_begin() diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6e2b7ec8f37182fb250dbf3e4587b6704e4bf17d Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/autotune_process.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/autotune_process.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0dfb6b968369008e03efba6b0ca1a1fe2960bb98 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/autotune_process.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/bounds.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/bounds.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..179dd21e5c284562023d272eaa40fddf635818e8 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/bounds.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/codecache.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/codecache.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7c583798d146f856bfecebc2ae7616888c7ab189 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/codecache.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/comm_analysis.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/comm_analysis.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..02221c291e1257e245ba2eed3b59e5e9d741db48 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/comm_analysis.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/comms.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/comms.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2fb6a2898d345f539d56b6b31ec947c5f571cef0 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/comms.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/compile_fx.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/compile_fx.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3ed23f9890aec154f3baee41a09470f3055d13d Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/compile_fx.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/config.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/config.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e997f070d47106ff1da57fba32988c8df33167a Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/config.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/constant_folding.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/constant_folding.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a040f1f5a7abe6a598ac127fb9bdebdd9ff7e616 Binary files /dev/null and 
b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/constant_folding.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/coordinate_descent_tuner.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/coordinate_descent_tuner.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f5ef0cafbc661712b55743fe0594e417054e6b16 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/coordinate_descent_tuner.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/cudagraph_trees.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/cudagraph_trees.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9dec02e720073645cbd8923d621cd894542471ba Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/cudagraph_trees.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/cudagraph_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/cudagraph_utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c5de6b439f891a6a01b0062a89fc49c3ea4d819d Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/cudagraph_utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/debug.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/debug.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b2da29903ee89721451d88323f07d06a3dad925b Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/debug.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/decomposition.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/decomposition.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d7e4a82e50a000f075a5fdc2d9ba735eed10994f Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/decomposition.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/dependencies.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/dependencies.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd3a676b9d12e5933ef7b3c3d9a6a71578de690b Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/dependencies.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/exc.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/exc.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d5558306c4a66b4524af71882f03ec8a3882ef38 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/exc.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/freezing.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/freezing.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5dcc3d749c8f13d332c1e87e215ede1f67ff9677 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/freezing.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/fx_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/fx_utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..378403e528e52d88d8f2b7bebd311d4f1df7031f Binary 
files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/fx_utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/graph.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/graph.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8e421cb07dd72e689707bbd300fc4d4126908d87 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/graph.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/hooks.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/hooks.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..68f461be153419be45bda309e40d6d21b227be6e Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/hooks.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/index_propagation.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/index_propagation.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..339304400bd2b5eb368c4352657f8ad5e008c607 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/index_propagation.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/inductor_prims.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/inductor_prims.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..54e719af4d20b3bdc98c5c7af6c091b4727ed1d5 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/inductor_prims.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/ir.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/ir.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7aa0f1fd2f7d32cc7fd71c056ece6f85883f3f0b Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/ir.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/lowering.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/lowering.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d48384d16d681589d091555c9e2a4ae5146e6ffc Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/lowering.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/metrics.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/metrics.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91b0adcb1e9c505739472e75ac7d1d0c5ed57832 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/metrics.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/ops_handler.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/ops_handler.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7b4b8056f00d5d6aede3597bb20b5c20c497fc53 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/ops_handler.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/optimize_indexing.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/optimize_indexing.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f76c9b94117b7ba712465957ceaea6b75a0026c1 Binary files /dev/null and 
b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/optimize_indexing.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/pattern_matcher.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/pattern_matcher.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c35ef5d63bca08a05f5a9ae675ffd88f1b7863f9 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/pattern_matcher.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/quantized_lowerings.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/quantized_lowerings.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2935c40933a47db275600f656d3434e622de6372 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/quantized_lowerings.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/scheduler.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/scheduler.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4346f248be45913fefccf40a96d5ade9cd6df848 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/scheduler.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/select_algorithm.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/select_algorithm.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fe3b62fa63180ae3e5e76aa29515f76049a6a217 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/select_algorithm.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/sizevars.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/sizevars.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c00066a2b1d1ef038a1981ae830429c258cb0e69 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/sizevars.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/test_case.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/test_case.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1a5893a8cfe488a90f9ca95bfb09c360e0bfde9e Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/test_case.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/test_operators.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/test_operators.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ed2266fbc463da832d2a87cdb1ca910e0604d3b Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/test_operators.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/triton_helpers.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/triton_helpers.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d5a4778bc17b3187d522f31511b6cd83295abba6 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/triton_helpers.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/triton_heuristics.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/triton_heuristics.cpython-39.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..088c6234fd9d34579889ab66defb15660a4a98d1 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/triton_heuristics.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2bd1425fc743095ca9c80886571f7425ff4d47de Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/virtualized.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/virtualized.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6658f177060f9d2ed34bda0db17be0611b62b63 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/virtualized.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/__pycache__/wrapper_benchmark.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/wrapper_benchmark.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2c93490f3f0057705110ec80787430a0eb9d4310 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/__pycache__/wrapper_benchmark.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/autotune_process.py b/MLPY/Lib/site-packages/torch/_inductor/autotune_process.py new file mode 100644 index 0000000000000000000000000000000000000000..6a3bf9fcf23f95982bc9904304dede907d0d6513 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/autotune_process.py @@ -0,0 +1,656 @@ +from __future__ import annotations + +import contextlib +import dataclasses +import functools +import logging +import os +import queue +import time +import warnings +from concurrent.futures import ThreadPoolExecutor +from ctypes import byref, c_size_t, c_void_p +from multiprocessing.process import BaseProcess +from multiprocessing.queues import Queue +from typing import ( + Any, + Callable, + Dict, + Iterable, + List, + Optional, + Sequence, + TYPE_CHECKING, + Union, +) + +import torch +from torch import multiprocessing +from torch._dynamo.testing import rand_strided + +from torch._inductor import ir +from torch._inductor.codecache import CUDACodeCache, DLLWrapper, PyCodeCache + +if TYPE_CHECKING: + from torch._inductor.select_algorithm import TritonTemplateCaller + +from . import config +from .utils import do_bench +from .virtualized import V + +CUDA_VISIBLE_DEVICES = "CUDA_VISIBLE_DEVICES" +EXIT_HANDLER_REGISTERED = False + +log = logging.getLogger(__name__) + + +# Used to synchronize between parent and child processes +class Ping: + pass + + +class Pong: + pass + + +@contextlib.contextmanager +def set_cuda_visible_device(device: Optional[int]): + """ + Context manager to set the CUDA_VISIBLE_DEVICES environment variable to the + specified single device. If device is None, don't manipulate the environment. + """ + if device is None: + yield + return + + current = os.environ.get(CUDA_VISIBLE_DEVICES) + os.environ[CUDA_VISIBLE_DEVICES] = str(device) + try: + yield + finally: + if current is None: + del os.environ[CUDA_VISIBLE_DEVICES] + else: + os.environ[CUDA_VISIBLE_DEVICES] = current + + +@dataclasses.dataclass +class TuningProcess: + """ + Abstraction for launching a helper process to benchmark kernels. 
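# Editor's sketch (not part of the patch): a stripped-down version of the
# request/response pattern TuningProcess uses -- a "spawn" child process reads work
# items from one queue and writes results to another, with None as the shutdown
# sentinel. `_worker` and the squaring stand in for running a benchmark request.
import torch.multiprocessing as mp

def _worker(req, resp):
    while True:
        item = req.get()
        if item is None:           # sentinel: terminate
            break
        resp.put(item * item)      # stand-in for "run the benchmark"

if __name__ == "__main__":
    ctx = mp.get_context("spawn")  # the CUDA runtime does not survive fork
    req, resp = ctx.Queue(), ctx.Queue()
    p = ctx.Process(target=_worker, args=(req, resp))
    p.start()
    req.put(3)
    print(resp.get())              # 9
    req.put(None)
    p.join()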
Spawns + the parent process and uses multiprocessing queues to send benchmark + requests and return results. + """ + + device: Optional[int] = None + process: Optional[BaseProcess] = None + request_queue: Optional[Queue[Any]] = None + response_queue: Optional[Queue[Any]] = None + + @staticmethod + def process_main( + request_queue: Queue[Any], + response_queue: Queue[Any], + ) -> None: + """ + Entry point for the child process. + """ + log.debug( + "Entering TuningProcess child. Visible devices = %s", + os.environ.get(CUDA_VISIBLE_DEVICES), + ) + try: + TuningProcess.workloop(request_queue, response_queue) + except Exception as ex: + log.exception("Exception in TuningProcess: %s", ex) + + @staticmethod + def workloop(request_queue: Queue[Any], response_queue: Queue[Any]) -> None: + """ + Work loop for the benchmarking subprocess. + """ + while True: + obj = request_queue.get() + + if obj is None: + break # None is a sentinel for the child to terminate + elif isinstance(obj, Ping): + response_queue.put(Pong()) + elif isinstance(obj, BenchmarkRequest): + response_queue.put(obj.benchmark()) + else: + raise RuntimeError(f"Invalid request type {type(obj)}") + + def valid(self) -> bool: + """ + True if the sub-process has been initialized. + """ + return ( + self.process is not None + and self.request_queue is not None + and self.response_queue is not None + ) + + def clear(self) -> None: + """ + Reset to an uninitialized state. + """ + self.process = self.request_queue = self.response_queue = None + + def initialize(self) -> None: + """ + Create child process, request/response queues, and do the warm up. + Set the environment to make only the provided GPU device visible + to the process. + """ + if self.valid(): + return + + # cuda runtime does not work with "fork", use "spawn" to start processes. + ctx = multiprocessing.get_context("spawn") + self.request_queue = ctx.Queue() + self.response_queue = ctx.Queue() + + self.process = ctx.Process( + target=self.process_main, + args=( + self.request_queue, + self.response_queue, + ), + ) + assert self.process is not None + with set_cuda_visible_device(self.device): + self.process.start() + + def put(self, obj: Any) -> None: + """ + Push a work item to the child process. + """ + # In case of a prior crash, ensure the subprocess is running + self.initialize() + assert self.request_queue is not None + self.request_queue.put(obj) + + def get(self) -> Any: + """ + Get a response from the child process. + """ + assert self.process is not None + assert self.response_queue is not None + while True: + try: + return self.response_queue.get(timeout=1.0) + except queue.Empty: + status = self.process.exitcode + if status is None: + # child process is still running + continue + # child process crashed + self.clear() + raise + + def terminate(self) -> None: + """ + Signal the child process to terminate. + """ + if self.valid(): + assert self.process is not None + assert self.request_queue is not None + self.request_queue.put(None) + + def wait(self) -> None: + """ + Wait for the child process to exit. + """ + if self.process is not None: + self.process.join() + self.clear() + + +@dataclasses.dataclass +class TuningProcessPool: + """ + Maintains a pool of TuningProcesses to benchmark kernels in parallel + across devices. By default, we create one TuningProcess per device and + set the sub-process environment to make only that device visible. 
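# Editor's sketch (not part of the patch): the pool pattern described here, shown
# standalone -- a ThreadPoolExecutor sized to the number of workers, where each thread
# checks a worker handle out of a queue, uses it, and returns it. All names are
# illustrative.
import queue
from concurrent.futures import ThreadPoolExecutor

workers = queue.Queue()
for dev in (0, 1):                 # hypothetical device ids
    workers.put(f"worker-{dev}")

def run(job):
    w = workers.get()              # block until a worker is free
    try:
        return f"{job} ran on {w}" # stand-in for sending the benchmark request
    finally:
        workers.put(w)

with ThreadPoolExecutor(max_workers=2) as ex:
    results = list(ex.map(run, ["job-a", "job-b", "job-c"]))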
+ """ + + processes: Optional[queue.Queue[TuningProcess]] = None + executor: Optional[ThreadPoolExecutor] = None + + def initialize(self) -> None: + """ + Start the child processes. + """ + assert (self.processes is None) == (self.executor is None) + if self.processes is not None: + return + + devices = self.get_device_list() + log.debug("Sub-process autotune device list: %s", devices) + + # Launch the child processes and push a msg to "warm up" + self.processes = queue.Queue() + for device in devices: + p = TuningProcess(device=device) + p.initialize() + p.put(Ping()) + self.processes.put(p) + + # Wait for the initialization to finish + for p in self.processes.queue: + assert isinstance(p.get(), Pong) + + # Use a thread pool to manage distributing work to the subprocesses. + # Threads block on an available process, so it makes sense to match + # the number of threads with the number of devices. + self.executor = ThreadPoolExecutor(max_workers=len(devices)) + + # Register the exit handler for the parent process so it will terminate + # the child processes. + global EXIT_HANDLER_REGISTERED + if not EXIT_HANDLER_REGISTERED: + EXIT_HANDLER_REGISTERED = True + import atexit + + atexit.register(self.terminate) + + def get_device_list(self) -> Sequence[Optional[int]]: + """ + Gather the list of devices to be used in the pool. + """ + if not config.autotune_multi_device: + # Don't use multiple devices + return [None] + + count = torch.cuda.device_count() + + # If the user specified the visible devices in the env, use those. + if CUDA_VISIBLE_DEVICES in os.environ: + devices = [int(d) for d in os.environ[CUDA_VISIBLE_DEVICES].split(",")] + assert len(devices) <= count + return devices + + return list(range(count)) + + def terminate(self) -> None: + """ + Signal all child processes to terminate. + """ + if self.executor is not None: + self.executor.shutdown() + self.executor = None + + if self.processes is not None: + for p in self.processes.queue: + p.terminate() + for p in self.processes.queue: + p.wait() + self.processes = None + + def target(self, choice: TritonTemplateCaller) -> float: + """ + Entry point for the thread-pool helper threads: Wait for an open TuningProcess, + remove it from the queue, execute the benchmark in that subprocess, and return + the TuningProcess to the queue. + """ + assert choice.bmreq is not None + assert self.processes is not None + + process = self.processes.get() + process.put(choice.bmreq) + try: + return process.get() + except queue.Empty: + warnings.warn( + f"Failed to benchmark choice '{choice}'. It will be ignored. " + "Please debug the root cause in case the choice can bring perf gains." + ) + # set to INF so this choice will be ignored + return float("inf") + finally: + self.processes.put(process) + + def benchmark( + self, + choices: List[TritonTemplateCaller], + ) -> Dict[TritonTemplateCaller, float]: + """ + Benchmark each choice in a separate process. + """ + assert self.processes is not None, "Tuning process pool is not initialized" + assert self.executor is not None + + results = {} + + # Use a ThreadExecutorPool to spread the work across the subprocesses and + # to grab subprocesses as soon as they're free. 
+ for choice, result in zip(choices, self.executor.map(self.target, choices)): + results[choice] = result + + return results + + +tuning_pool = TuningProcessPool() + + +LayoutOrBuffer = Union[ir.Layout, ir.Buffer] + + +@dataclasses.dataclass +class TensorMeta: + device: torch.device + dtype: torch.dtype + sizes: torch._prims_common.ShapeType + strides: torch._prims_common.StrideType + offset: int + + @classmethod + def from_irnodes( + cls, irnodes: Union[LayoutOrBuffer, Sequence[LayoutOrBuffer]] + ) -> Union[TensorMeta, List[TensorMeta]]: + if isinstance(irnodes, Sequence): + result: List[Any] = [cls.from_irnodes(x) for x in irnodes] + assert all(isinstance(x, TensorMeta) for x in result) + return result + + node = irnodes + if isinstance(node, ir.Layout): + node = ir.Buffer("fake", node) + + dtype = node.get_dtype() + assert dtype is not None + + return TensorMeta( + device=node.get_device(), + dtype=dtype, + sizes=V.graph.sizevars.size_hints( + node.get_size(), + fallback=config.unbacked_symint_fallback, + ), + strides=V.graph.sizevars.size_hints( + node.get_stride(), + fallback=config.unbacked_symint_fallback, + ), + offset=V.graph.sizevars.size_hint( + node.get_layout().offset, + fallback=config.unbacked_symint_fallback, + ), + ) + + def to_tensor(self) -> torch.Tensor: + return rand_strided( + self.sizes, + self.strides, + device=self.device, + dtype=self.dtype, + extra_size=self.offset, + ) + + +@dataclasses.dataclass +class BenchmarkRequest: + """ + Only handle triton template benchmark for now. The extern kernel benchmark + can be done inside the same process since they usually don't cause crash. + + Important: Instances of this class and subclasses have to be serializable + across process boundaries. Do not put CUDA Tensors in here! + """ + + def __init__( + self, + kernel_name: str, + input_tensor_meta: Union[TensorMeta, List[TensorMeta]], + output_tensor_meta: Union[TensorMeta, List[TensorMeta]], + extra_args: Iterable[Any], + ): + # the kernel name defined in the module + self.kernel_name = kernel_name + + if isinstance(input_tensor_meta, TensorMeta): + input_tensor_meta = [input_tensor_meta] + self.input_tensor_meta = input_tensor_meta + + if isinstance(output_tensor_meta, (tuple, list)): + assert len(output_tensor_meta) == 1 + output_tensor_meta = output_tensor_meta[0] + self.output_tensor_meta = output_tensor_meta + + self.extra_args = extra_args + + def make_run_fn( + self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor + ) -> Callable[[], None]: + raise NotImplementedError() + + def cleanup_run_fn(self) -> None: + pass + + def benchmark( + self, + *input_tensors: torch.Tensor, + output_tensor: Optional[torch.Tensor] = None, + ) -> float: + debug = log.isEnabledFor(logging.DEBUG) + if debug: + start_ts = time.time() + + # create args and out tensor + if output_tensor is None: + assert len(input_tensors) == 0 + input_tensors = tuple(x.to_tensor() for x in self.input_tensor_meta) + output_tensor = self.output_tensor_meta.to_tensor() + + if debug: + create_tensor_elapse = time.time() - start_ts # type: ignore[possibly-undefined] + start_ts = time.time() + + fn = self.make_run_fn(*input_tensors, output_tensor=output_tensor) + + if debug: + load_elapse = time.time() - start_ts # type: ignore[possibly-undefined] + start_ts = time.time() + + out = do_bench(fn) + torch.cuda.synchronize() # shake out any CUDA errors + + if debug: + bench_elapse = time.time() - start_ts # type: ignore[possibly-undefined] + log.debug( + "InChildProcess %s: load %f, create tensor %f, bench 
%f", + str(self), + load_elapse, # type: ignore[possibly-undefined] + create_tensor_elapse, # type: ignore[possibly-undefined] + bench_elapse, + ) + self.cleanup_run_fn() + return out + + +class TestBenchmarkRequest(BenchmarkRequest): + """ + Supports unit testing. Defined in this file so that the TuningProcess + sub-process knows how to unpickle these objects. + """ + + def __init__(self, value: Optional[float] = None) -> None: + self.value = value + + def benchmark( + self, *input_tensors: torch.Tensor, output_tensor: Optional[torch.Tensor] = None + ) -> float: + if self.value is None: + raise Exception("Failed to run") + return self.value + + +class TritonBenchmarkRequest(BenchmarkRequest): + # Important: Instances of this class have to be serializable + # across process boundaries. Do not put CUDA Tensors in here! + + def __init__( + self, + kernel_name: str, + input_tensor_meta: Union[TensorMeta, List[TensorMeta]], + output_tensor_meta: Union[TensorMeta, List[TensorMeta]], + extra_args: Iterable[Any], + module_path: str, # the path of the module defining the triton kernel + module_cache_key: str, + grid: List[int], + num_stages: int, + num_warps: int, + matrix_instr_nonkdim: int = 0, # only used for hip to choose the shape of mfma instruction. + ): + super().__init__(kernel_name, input_tensor_meta, output_tensor_meta, extra_args) + self.module_path = module_path + self.module_cache_key = module_cache_key + self.grid = grid + self.num_stages = num_stages + self.num_warps = num_warps + self.matrix_instr_nonkdim = matrix_instr_nonkdim + + def make_run_fn( + self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor + ) -> Callable[[], None]: + mod = PyCodeCache.load_by_key_path(self.module_cache_key, self.module_path) + log.debug( + "benchmark module key: %s, path: %s", + self.module_cache_key, + self.module_path, + ) + + run_method = getattr(mod, self.kernel_name).run + extra_args = list(self.extra_args) + + # Newer version of triton add warmup argument to JITFunction.run. + # This code handles backward-compatibility. + warmup_arg = {} + import inspect + + if "warmup" in inspect.signature(run_method).parameters: + warmup_arg["warmup"] = False + + if torch.version.hip and self.matrix_instr_nonkdim != 0: + return functools.partial( + run_method, + *input_tensors, + output_tensor, + *self.extra_args, + grid=self.grid, + **warmup_arg, + num_stages=self.num_stages, + num_warps=self.num_warps, + matrix_instr_nonkdim=self.matrix_instr_nonkdim, + ) + else: + return functools.partial( + run_method, + *input_tensors, + output_tensor, + *self.extra_args, + grid=self.grid, + **warmup_arg, + num_stages=self.num_stages, + num_warps=self.num_warps, + ) + + def __str__(self) -> str: + return f"{self.kernel_name=}, {self.module_path=}, {self.module_cache_key=}" + + +class CUDABenchmarkRequest(BenchmarkRequest): + # Important: Instances of this class have to be serializable + # across process boundaries. Do not put CUDA Tensors in here! 
+ + def __init__( + self, + kernel_name: str, + input_tensor_meta: Union[TensorMeta, List[TensorMeta]], + output_tensor_meta: Union[TensorMeta, List[TensorMeta]], + extra_args: Iterable[Any], + source_code: str, + ): + super().__init__(kernel_name, input_tensor_meta, output_tensor_meta, extra_args) + self.source_code = source_code + self.workspace_size: int = 0 + self.workspace: Optional[torch.Tensor] = None + self.DLL: Optional[DLLWrapper] = None + self.hash_key: str = "" + self.source_file: str = "" + self.hash_key, self.source_file = CUDACodeCache.write(self.source_code, "so") + + def precompile(self): + # Prepopulate CUDACodeCache + # may happen in separate Threadpool + log.debug("Precompiling %s", self) + CUDACodeCache.load(self.source_code, "so") + log.debug("Done precompiling %s", self) + + def make_run_fn( + self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor + ) -> Callable[[], None]: + self.DLL, self.hash_key, self.source_file = CUDACodeCache.load( + self.source_code, "so" + ) + args = [ + c_void_p(tensor.data_ptr()) + for tensor in list(input_tensors) + [output_tensor] + ] + log.debug( + "make_run_fn: self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%s", + self.kernel_name, + self.source_file, + self.hash_key, + self.DLL, + args, + self.extra_args, + ) + run_method = getattr(self.DLL, self.kernel_name) + stream_ptr = c_void_p(torch.cuda.current_stream().cuda_stream) + + # Retrieve workspace_size and initialize workspace. + c_workspace_size = c_size_t() + run_method( + *args, # input ptrs and output ptrs + *self.extra_args, + byref( + c_workspace_size + ), # set workspace size ptr to retrieve workspace size + None, # null workspace ptr + stream_ptr, + ) + self.workspace_size = c_workspace_size.value + # TODO: Support non-zero workspace_size. + assert self.workspace_size == 0, ( + "Things need to be fixed to support non-zero workspace_size: " + "1) max autotune cache needs to store workspace size; " + "2) memory allocation needs to allocate / deallocate workspace correctly; " + ) + + # Generate partial function. + return functools.partial( + run_method, + *args, + *self.extra_args, + None, # null workspace size ptr + None, # set workspace ptr, TODO: update it to a real ptr if workspace_size > 0 + stream_ptr, + ) + + def cleanup_run_fn(self) -> None: + if self.DLL is not None: + self.DLL.close() + self.workspace = None + + def __str__(self) -> str: + return f"{self.kernel_name=}, {self.source_file=}, {self.hash_key=}" + + +def benchmark_in_sub_process( + choices: List[TritonTemplateCaller], +) -> Dict[TritonTemplateCaller, float]: + """ + Do benchmarking in a subprocess and return the perf number (latency). 
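# Editor's sketch (not part of the patch): how tensors are handed to the compiled
# kernel in CUDABenchmarkRequest.make_run_fn above -- raw data pointers wrapped as
# ctypes void pointers, plus an out-parameter for the workspace size. No real kernel
# is invoked here; the commented call shape is an approximation.
from ctypes import byref, c_size_t, c_void_p
import torch

t = torch.arange(4, dtype=torch.float32)
arg = c_void_p(t.data_ptr())   # what the DLL entry point receives
workspace_size = c_size_t(0)
# a real launch would look roughly like:
#   run_method(arg, ..., byref(workspace_size), None, stream_ptr)
print(arg, workspace_size.value)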
+ """ + return tuning_pool.benchmark(choices) diff --git a/MLPY/Lib/site-packages/torch/_inductor/bounds.py b/MLPY/Lib/site-packages/torch/_inductor/bounds.py new file mode 100644 index 0000000000000000000000000000000000000000..9b52e523e99ffbe5cd89eee073825c6d0c3c65a1 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/bounds.py @@ -0,0 +1,124 @@ +import operator +from functools import partial +from typing import Any, Callable, Dict + +from sympy import Expr + +import torch +from torch.utils._sympy.value_ranges import bound_sympy, ValueRangeAnalysis, ValueRanges +from .ir import InterpreterShim, LoopBody, LoopBodyBlock +from .utils import cache_on_self, dominated_nodes +from .virtualized import V + + +class BoundVars: + """ + Performs Value Range Analysis on LoopBody's fx graph by calling BoundVars.run() + It exposes the ranges of the nodes in the `bounds` variable + + Note. A current limitation of this analysis is that it just works on a per-loop basis. + We should be able to propagate the bounds between across the whole graph. This may benefit + the case a bounded variable is returned by a kernel and fed into another. + """ + + def __init__(self, loop_body: LoopBody) -> None: + self.loop_body = loop_body + self.replacement_vals = { + k: ValueRanges[Expr](0, v - 1) + if (isinstance(v, int) or v.is_number) + else bound_sympy(v) + for k, v in loop_body.var_ranges.items() + } + # avoid computing these values, pessimistically assume that they are unbounded + self.unbounded_vars = dominated_nodes( + node + for node in self.loop_body.get_nodes() + if node.target in ["load", "reduction", operator.getitem] + or "masked_subblock" in node.target + ) + # To access this variable call `get_bounds()` + self._bounds: Dict[torch.fx.Node, ValueRanges[Expr]] = {} + + @cache_on_self + def get_bounds(self) -> Dict[torch.fx.Node, ValueRanges[Expr]]: + submodules = self.swap_submodules(self.loop_body.submodules) + + # Initialize the environment with the unbounded variables + for node in self.unbounded_vars: + # we need to evaluate masked_subblock to recurse, and we need to set indirect values + if not isinstance(node.target, str) or ( + "masked_subblock" not in node.target + and "set_indirect" not in node.target + ): + self._bounds[node] = ValueRanges[Expr].unknown() + + with V.set_ops_handler(ValueRangeAnalysis()): + interpreter = InterpreterShim(self.loop_body.root_block.graph, submodules) + interpreter.run(V.get_ops_handler(), initial_env=self._bounds) + return self._bounds + + def swap_submodules( + self, submodules: Dict[str, Callable[..., Any]] + ) -> Dict[str, Callable[..., ValueRanges[Expr]]]: + result: Dict[str, Callable[..., ValueRanges[Expr]]] = {} + for key in submodules.keys(): + if key == "get_index": + result[key] = self.get_index + elif "masked_subblock" in key: + subblock = self.loop_body.subblocks[key] + # The result within the lambda will reference to the final + # set of modules at the end of the for-loop as it stores a reference to it + + # bind subblock in a function because python lambdas close over by reference + # moving the lambda out of make_fn would close over the reference to subblock, + # so all lambdas would have the same subblock reference that is the final + # subblock in the loop + def make_fn(subblock): + return lambda mask, value: self.masked_subblock( + subblock, self._bounds, mask, value, result + ) + + result[key] = make_fn(subblock) + + elif "set_indirect" in key: + idx = int(key[len("set_indirect") :]) + var = self.loop_body.indirect_vars[idx] + indirect = 
partial(self.set_indirect, var) + result[key] = indirect + else: + assert "scan" in key + result[key] = submodules[key] + + return result + + def masked_subblock( + self, + subblock: LoopBodyBlock, + env: Dict[torch.fx.Node, ValueRanges[Expr]], + mask: Any, + value: Any, + submodules: Dict[str, Callable[..., Any]], + ) -> ValueRanges[Expr]: + interp = InterpreterShim(subblock.graph, submodules) + interp.run(V.get_ops_handler(), initial_env=env) + output = [node for node in subblock.graph.nodes if node.target == "output"] + assert len(output) == 1 + # dont bother unioning with value since the load from buffer will be + # pessimistically assumed to be inf anyway + return interp.env[output[0]] + + def set_indirect(self, old: Expr, new: ValueRanges[Expr]) -> ValueRanges[Expr]: + assert isinstance(new, ValueRanges) + self.replacement_vals[old] = new + return new + + def get_index(self, name: Expr) -> ValueRanges[Expr]: + expr = self.loop_body.indexing_exprs[name] + bound = self.replacement_vals.get(expr) + if bound is None: + bound = bound_sympy(expr, self.replacement_vals) + # The following assertion is true at the time of this writing + # We don't assert is as to not execute bound_sympy when bound is not None + # assert bound is None or bound == bound_sympy(expr, self.replacement_vals) + self.replacement_vals[name] = bound + return bound diff --git a/MLPY/Lib/site-packages/torch/_inductor/codecache.py b/MLPY/Lib/site-packages/torch/_inductor/codecache.py new file mode 100644 index 0000000000000000000000000000000000000000..7b477691fe5971afe19c9f7945def2a5276c4d37 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/codecache.py @@ -0,0 +1,2727 @@ +from __future__ import annotations + +import base64 +import copyreg +import dataclasses +import functools +import hashlib +import importlib +import io +import json +import logging +import multiprocessing +import os +import pathlib +import pickle +import pkgutil +import platform +import re +import shlex +import shutil +import signal +import subprocess +import sys +import sysconfig +import tempfile +import textwrap +import threading +import warnings +import weakref +from bisect import bisect_right +from concurrent.futures import Future, ProcessPoolExecutor, ThreadPoolExecutor +from copy import copy +from ctypes import c_void_p, cdll, CDLL +from functools import partial +from pathlib import Path +from threading import Thread +from time import sleep, time +from types import ModuleType +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TYPE_CHECKING, Union + +import torch + +from torch._dynamo.device_interface import ( + get_interface_for_device, + get_registered_device_interfaces, +) +from torch._dynamo.utils import counters, dynamo_timed +from torch._inductor import config, exc, metrics +from torch._inductor.codegen.cuda import cuda_env +from torch._inductor.utils import cache_dir, developer_warning, is_linux +from torch._subclasses.fake_tensor import ( + extract_tensor_metadata, + FakeTensor, + TensorMetadata, +) +from torch.fx.experimental.symbolic_shapes import has_hint, hint_int, ShapeEnv + +if TYPE_CHECKING: + from torch._inductor.graph import GraphLowering + from torch._inductor.select_algorithm import ChoiceCaller + +from torch.hub import _Faketqdm, tqdm + +_HERE = os.path.abspath(__file__) +_TORCH_PATH = os.path.dirname(os.path.dirname(_HERE)) +_LINKER_SCRIPT = os.path.join(_TORCH_PATH, "_inductor/script.ld") + +if config.is_fbcode(): + from triton.fb import build_paths + from triton.fb.build import _run_build_command + + 
from torch._inductor.fb.utils import ( + log_global_cache_errors, + log_global_cache_stats, + log_global_cache_vals, + use_global_cache, + ) +else: + + def log_global_cache_errors(*args, **kwargs): + pass + + def log_global_cache_stats(*args, **kwargs): + pass + + def log_global_cache_vals(*args, **kwargs): + pass + + def use_global_cache() -> bool: + return False + + +LOCK_TIMEOUT = 600 + +# timing metrics for time spent in the compilation +_cumulative_compile_time = 0.0 +_t0: Optional[float] = None + + +def _compile_start() -> None: + global _t0 + if _t0 is None: + _t0 = time() + + +def _compile_end() -> None: + global _cumulative_compile_time, _t0 + if _t0 is not None: + t1 = time() + _cumulative_compile_time += t1 - _t0 + _t0 = None + # print("CUMULATIVE COMPILE TIME", _cumulative_compile_time) + + +log = logging.getLogger(__name__) + + +def cpp_wrapper_cache_dir(name: str) -> str: + cu_str = ( + "cpu" + if torch.version.cuda is None + else f'cu{torch.version.cuda.replace(".", "")}' + ) + python_version = f"py{sys.version_info.major}{sys.version_info.minor}" + build_folder = f"{python_version}_{cu_str}" + + cpp_wrapper_dir = os.path.join(cache_dir(), build_folder) + cpp_wrapper_build_directory = os.path.join(cpp_wrapper_dir, name) + os.makedirs(cpp_wrapper_build_directory, exist_ok=True) + return cpp_wrapper_build_directory + + +def get_cpp_wrapper_cubin_path_name(): + return "cubin_path" if torch.version.hip is None else "hsaco_path" + + +class CacheBase: + @staticmethod + @functools.lru_cache(None) + def get_system() -> Dict[str, Any]: + try: + import triton + + triton_version = triton.__version__ + except ModuleNotFoundError: + triton_version = None + + try: + system: Dict[str, Any] = { + "device": { + "name": torch.cuda.get_device_properties( + torch.cuda.current_device() + ).name, + }, + "version": { + "cuda": torch.version.cuda, + "triton": triton_version, + }, + } + except (AssertionError, RuntimeError): + # If cuda is not installed, none of the above config is relevant. 
+ system = {} + + system["hash"] = hashlib.sha256( + json.dumps(system, sort_keys=True).encode("utf-8") + ).hexdigest() + + return system + + @staticmethod + @functools.lru_cache(None) + def get_local_cache_path() -> Path: + return Path(os.path.join(cache_dir(), "cache", CacheBase.get_system()["hash"])) + + @staticmethod + @functools.lru_cache(None) + def get_global_cache_path() -> Optional[Path]: + return ( + Path(os.path.join(config.global_cache_dir, CacheBase.get_system()["hash"])) + if config.global_cache_dir is not None + else None + ) + + def __init__(self) -> None: + if not torch.cuda.is_available(): + return + + self.system = CacheBase.get_system() + + self.local_cache_path = CacheBase.get_local_cache_path() + self.global_cache_path = CacheBase.get_global_cache_path() + + def get_local_cache(self) -> Dict[str, Any]: + if not self.local_cache_path.is_file(): + return {} + with open(self.local_cache_path) as local_cache_fp: + local_cache = json.load(local_cache_fp) + return local_cache["cache"] + + def update_local_cache(self, local_cache: Dict[str, Any]) -> None: + if not os.path.exists(self.local_cache_path.parent): + os.makedirs(self.local_cache_path.parent, exist_ok=True) + + write_atomic( + str(self.local_cache_path), + json.dumps({"system": self.system, "cache": local_cache}, indent=4), + ) + + +class LocalCache(CacheBase): + def lookup(self, *keys: str) -> Optional[Dict[str, Any]]: + cache = self.get_local_cache() + + sub_cache = cache + for key in keys: + if key in cache: + sub_cache = cache[key] + else: + return None + + return sub_cache + + def set_value(self, *keys: str, value: Any) -> None: + cache = self.get_local_cache() + + sub_cache = cache + for key in keys[0:-1]: + sub_cache.setdefault(key, {}) + sub_cache = sub_cache[key] + sub_cache[keys[-1]] = value + + self.update_local_cache(cache) + + +class PersistentCache(CacheBase): + @functools.lru_cache(None) + def get_global_cache(self): + if self.global_cache_path is None or not self.global_cache_path.is_file(): + return {} + with open(self.global_cache_path) as global_cache_fp: + global_cache = json.load(global_cache_fp) + return global_cache["cache"] + + def lookup( + self, + choices: List[ChoiceCaller], + op: str, + inputs: str, + benchmark: Callable[[Any], Dict[ChoiceCaller, float]], + ) -> Dict[ChoiceCaller, float]: + """ + Check to see if we have benchmarked the given choice callers. For each + choice caller: + + 1. Check global_cache[op][inputs][choice][precision], return benchmark if cached. + 2. Check local_cache[op][inputs][choice][precision], return benchmark if cached. + 3. + a. `max_autotune_gemm=True`: benchmark the choice, update + local_cache[op][inputs][choice], and return the benchmark. + b. `max_autotune_gemm=False`: don't benchmark the choice, return nothing. 
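+
+        Note that the returned timings may be partial: with
+        `max_autotune_gemm=False`, or when only the global cache is consulted,
+        choices that were never benchmarked are simply absent from the result.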
+ """ + precision = torch.get_float32_matmul_precision() + + log_stats = partial(log_global_cache_stats, self.system, op, inputs, precision) + log_vals = partial(log_global_cache_vals, self.system, op, inputs, precision) + log_errors = partial( + log_global_cache_errors, self.system, op, inputs, precision + ) + timings = {} + + def check_cache(cache, callback=None) -> bool: + """Check if `cache` contains data for all the choices""" + hit = True + for choice in choices: + choice_hash = choice.hash_key() + if choice_hash in cache.get(op, {}).get(inputs, {}).get(precision, {}): + # cache hit + timings[choice] = cache[op][inputs][precision][choice_hash] + else: + # cache miss + hit = False + break + if callback: + callback(cached=hit) + return hit + + if config.max_autotune or config.max_autotune_gemm: + local_cache = self.get_local_cache() + # check local cache first since it is data specific to the current machine + if not check_cache(local_cache) and not ( + use_global_cache() + and check_cache(self.get_global_cache(), callback=log_stats) + ): + try: + # re-benchmark everything to try to get consistent numbers from the same machine + timings = benchmark(choices) + assert all(choice in timings for choice in choices) + local_cache.setdefault(op, {}) + local_cache[op].setdefault(inputs, {}).setdefault(precision, {}) + for choice, timing in timings.items(): + local_cache[op][inputs][precision][choice.hash_key()] = timing + except RuntimeError as e: + # catch and log autotuning failures + log_errors(e) + raise e + + self.update_local_cache(local_cache) + + timings_to_log = { + choice.hash_key(): timings[choice] for choice in choices + } + log_vals(timings_to_log) + elif use_global_cache(): + # only check global cache, not local one + check_cache(self.get_global_cache(), callback=log_stats) + # may have a partial cache hit, where not everything is benchmarked + + return timings + + +def get_lock_dir() -> str: + lock_dir = os.path.join(cache_dir(), "locks") + if not os.path.exists(lock_dir): + os.makedirs(lock_dir, exist_ok=True) + return lock_dir + + +def sha256_hash(data: bytes) -> str: + # [:51] to strip off the "Q====" suffix common to every hash value. + return base64.b32encode(hashlib.sha256(data).digest())[:51].decode("utf-8").lower() + + +def code_hash(code: Union[str, bytes], extra: str = ""): + hashing_str = code if isinstance(code, bytes) else code.encode("utf-8") + if extra != "": + hashing_str = hashing_str + b"||" + extra.encode("utf-8") + return "c" + sha256_hash(hashing_str) + + +def get_path( + basename: str, extension: str, specified_dir: str = "" +) -> Tuple[str, str, str]: + if specified_dir: + if os.path.isabs(specified_dir): + subdir = specified_dir + else: + subdir = os.path.join(cache_dir(), specified_dir) + else: + subdir = os.path.join(cache_dir(), basename[1:3]) + path = os.path.join(subdir, f"{basename}.{extension}") + return basename, subdir, path + + +def get_hash(content: Union[str, bytes], extra: str = "", hash_type: str = "code"): + if hash_type == "code": + return code_hash(content, extra) + if hash_type in ["cubin", "hsaco"]: + return code_hash(repr(content)) + raise AssertionError(f"Unknown hash type {hash_type}") + + +def write( + content: Union[str, bytes], + extension: str, + extra: str = "", + hash_type: str = "code", + specified_dir: str = "", +) -> Tuple[str, str]: + # use striped content to compute hash so we don't end up with different + # hashes just because the content begins/ends with differnet number of + # spaces. 
+ key: str = get_hash(content.strip(), extra, hash_type) + basename, subdir, path = get_path(key, extension, specified_dir) + if not os.path.exists(subdir): + os.makedirs(subdir, exist_ok=True) + if not os.path.exists(path): + write_atomic(path, content) + return basename, path + + +def write_atomic(path: str, content: Union[str, bytes]) -> None: + # Write into temporary file first to avoid conflicts between threads + # Avoid using a named temporary file, as those have restricted permissions + assert isinstance( + content, (str, bytes) + ), "Only strings and byte arrays can be saved in the cache" + path = pathlib.Path(path) + tmp_path = path.parent / f".{os.getpid()}.{threading.get_ident()}.tmp" + write_mode = "w" if isinstance(content, str) else "wb" + with tmp_path.open(write_mode) as f: + f.write(content) + tmp_path.rename(path) + + +@dataclasses.dataclass +class TensorMetadataAndValues: + """ + TensorMetadata plus the elements as a list of raw values. + Used for hashing inlined constants. + """ + + tensor_metadata: TensorMetadata + values: List[Any] + + +def _ident(x: Any) -> Any: + return x + + +def _reduce_fake_tensor(t): + """ + See FxGraphCachePickler. Custom reducer to pickle FakeTensors. + """ + metadata = extract_tensor_metadata(t) + return (_ident, (metadata,)) + + +def _reduce_tensor(t): + """ + See FxGraphCachePickler. Custom reducer to pickle Tensors. + """ + if t.is_mkldnn: + # TODO: These tensors don't currently pickle, so we can't cache a + # compiled graph containing them. Just fail now. If mkldnn tensors + # get pickling support, we can remove this. + raise BypassFxGraphCache() + + # If we see tensors, we know they're constants stored as attributes on + # the GraphModule. See tensor lowering; small constants are inlined. If + # we see a small tensor, therefore, no reference will ultimately remain + # in the generated code. So we need to include its value in the cache key. + # Large constants are effectively treated as inputs and we consider only + # their metadata. + metadata = extract_tensor_metadata(t) + if len(t.shape) == 0 or torch._inductor.graph.GraphLowering.can_inline_constant(t): + return (_ident, (TensorMetadataAndValues(metadata, t.tolist()),)) + else: + return (_ident, (metadata,)) + + +def _reduce_symint(s): + """ + See FxGraphCachePickler. Custom reducer to pickle SymInts. + """ + # For hashing purposes, we only care about the name of the symbol and + # not the backed value. We evaluate guards stored with a cached graph + # to ensure a cached entity with SymInt args is safe to reuse. + return (_ident, (str(s),)) + + +class FxGraphCachePickler(pickle.Pickler): + """ + Custom pickler to customize the pickling of some objects (Tensors), only for the + purpose of computing a hash for keying into the FxGraphCache. Tensors contain + objects that don't pickle and/or vary between runs, and we want to capture the + data that allow us to compute a stable, but safe hash. + """ + + dispatch_table = copyreg.dispatch_table.copy() + dispatch_table[FakeTensor] = _reduce_fake_tensor + dispatch_table[torch.Tensor] = _reduce_tensor + dispatch_table[torch.SymInt] = _reduce_symint + + @staticmethod + def dumps(obj) -> bytes: + """ + Pickle an object using the FxGraphCachePickler. + """ + with io.BytesIO() as stream: + pickler = FxGraphCachePickler(stream) + pickler.dump(obj) + return stream.getvalue() + + @staticmethod + def get_hash(obj: Any) -> str: + """ + Serialize an object using the FxGraphCachePickler and return a hash + of the pickled object. 
+ """ + serialized_data = FxGraphCachePickler.dumps(obj) + return sha256_hash(serialized_data) + + +@functools.lru_cache(None) +def get_inductor_code_hash() -> bytes: + """ + Compute a hash of all inductor code modules. Used by the FxGraph cache + so any inductor code changes would result in new cache keys. + """ + inductor_root = os.path.dirname(__file__) + + contents: Dict[str, bytes] = {} + for lib in pkgutil.iter_modules([inductor_root]): + spec = lib.module_finder.find_spec(lib.name, None) + assert spec is not None + module = spec.origin + assert module is not None + with open(module, "rb") as f: + contents[module] = f.read() + + return hashlib.sha256(pickle.dumps(contents)).digest() + + +@dataclasses.dataclass +class OrderedSetHolder: + """ + See FxGraphHashDetails. Holds a sorted list to support stable hashing + of set kwargs. + """ + + items: List[Any] + + +class BypassFxGraphCache(Exception): + """ + Exception to indicate that the FxGraphCache should be bypassed. + """ + + pass + + +class FxGraphHashDetails: + """ + Object to capture all the details for a compiled FX graph relevant to computing + a safe and stable cache key. + """ + + # Excluded kwargs param that are not stable between runs + EXCLUDED_KWARGS = ["graph_id"] + + def __init__( + self, + gm: torch.fx.GraphModule, + example_inputs: List[torch.Tensor], + fx_kwargs: Dict[str, Any], + ): + self.gm = gm + self.example_inputs = example_inputs + + # Order kwargs so hashing is stable to changes in kwarg order. + self.fx_kwargs = {} + for k in sorted(fx_kwargs): + if k not in self.EXCLUDED_KWARGS: + if type(fx_kwargs[k]) is set: + # Special case to handle set params. Python sets can't be + # ordered, so sort the elements and store them in a proxy. + self.fx_kwargs[k] = OrderedSetHolder(sorted(fx_kwargs[k])) + else: + self.fx_kwargs[k] = fx_kwargs[k] + + # 'Deterministic algorithms' can affect codegen via lowering to cuda kernels. + self.deterministic_algorithms_settings = ( + torch.are_deterministic_algorithms_enabled(), + torch.is_deterministic_algorithms_warn_only_enabled(), + torch.utils.deterministic.fill_uninitialized_memory, # type: ignore[attr-defined] + ) + + # Global settings affecting matmul codegen. + self.cuda_matmul_settings = ( + torch.backends.cuda.matmul.allow_tf32, + torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction, + torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction, + ) + + # Also hash on various system info (including the triton compiler version). + self.torch_version = torch.__version__ + self.system_info = CacheBase.get_system() + + # And the inductor configuration and code. + self.inductor_code_hash = get_inductor_code_hash() + try: + self.inductor_config = config.save_config() + except TypeError as e: + # Some configs options are callables, e.g., post_grad_custom_pre_pass, + # and may not pickle. + log.debug("Can't pickle inductor config: %s", e) + raise BypassFxGraphCache() from e + + def debug_str(self) -> str: + """ + Get a printable string describing in more detail all the attributes + comprising this object. Useful for debugging when one graph hashes + to a different value than another. 
+ """ + + def get_str(obj) -> str: + if isinstance(obj, torch.Tensor): + return str(extract_tensor_metadata(obj)) + elif isinstance(obj, bytes): + return "" + else: + return str(obj) + + lines = [] + for attr, obj in vars(self).items(): + if isinstance(obj, list): + for ii in range(len(obj)): + h = FxGraphCachePickler.get_hash(obj[ii]) + lines.append(f"[{h}] {attr}[{ii}]: {get_str(obj[ii])}") + elif isinstance(obj, dict): + for k, v in obj.items(): + h = FxGraphCachePickler.get_hash(v) + lines.append(f"[{h}] {attr}[{k}]: {get_str(v)}") + else: + h = FxGraphCachePickler.get_hash(obj) + lines.append(f"[{h}] {attr}: {get_str(obj)}") + return "\n".join(lines) + + +def compiled_fx_graph_hash( + gm: torch.fx.GraphModule, + example_inputs: List[torch.Tensor], + fx_kwargs: Dict[str, Any], +) -> str: + """ + Generate a unique hash of the FX graph for caching. + """ + details = FxGraphHashDetails(gm, example_inputs, fx_kwargs) + # The prefix distinguishes among the other kinds of objects we + # cache in this module. + key = "f" + FxGraphCachePickler.get_hash(details) + log.debug("FX graph cache hash details for key %s:\n%s", key, details.debug_str()) + return key + + +class FxGraphCache: + """ + Supports caching and reusing compiled Fx graphs. + + The overall strategy is as follows: + - This cache stores entries on disk. When saving an entry, we can't + serialize callables (that could be C++, Triton, etc.), so we serialize + their own disk cache location. We then recreate the compiled artifact + after fetching from disk. + - For indexing the cache, we gather the fields relevant to identifying an + FxGraph (the graph module, graph inputs, system settings etc.) into an + FxGraphCacheDetails object, pickle it, and compute a hash for the key. + See FxGraphCachePickler. + - Among the metadata we store, we also include a guards expression that's + appropriate for validating any symbols for Tensor arguments that have + symbolic bounds. On cache lookup then, we evaluate those guards in the + current context to validate that a cached entry can be served. + - A given graph could have multiple compiled versions, corresponding to + different sets of guards. Therefore, we store cache entries in the form: + // + - On lookup, we compute the key from the graph details, iterate over all + leaf files in the corresponding subdirectory, deserialize the entry, and + evaluate its guards expression. If the evaluation succeeds, we have a + cache hit. If it fails, we compile the graph and store a new entry. + - Finally, on a cache hit, we need to make sure any guards that would + have been created during compilation are added to the current context. + """ + + # TODO(masnesral): Investigate whether it's beneficial to store compiled graphs + # in an in-memory cache after loading from disk. + @staticmethod + def _get_tmp_dir() -> str: + """ + Get the toplevel temporary directory for storing compiled graphs. + """ + return os.path.join(cache_dir(), "fxgraph") + + @staticmethod + def _get_tmp_dir_for_key(key: str) -> str: + """ + Return the disk location for a given cache key. + """ + return os.path.join(FxGraphCache._get_tmp_dir(), key[1:3], key) + + @staticmethod + def _filter_symints(inputs: List[Any]) -> List[torch.SymInt]: + """ + Get the SymInt objects from the input list. + """ + return [s for s in inputs if isinstance(s, torch.SymInt)] + + @staticmethod + def _get_shape_env() -> Optional[ShapeEnv]: + """ + Helper to get the shape env from the tracing context. 
+ """ + ctx = torch._guards.TracingContext.try_get() + if not ctx: + return None + return ctx.fake_mode.shape_env + + @staticmethod + def _lookup_graph( + key: str, + example_inputs: List[torch.Tensor], + ) -> Optional[CompiledFxGraph]: + """ + Lookup a compiled graph in the cache by key. On a hit, return the + deserialized CompiledFxGraph object. On a miss, return None. + """ + subdir = FxGraphCache._get_tmp_dir_for_key(key) + if not os.path.exists(subdir): + return None + + shape_env = FxGraphCache._get_shape_env() + assert shape_env is not None + + # Iterate over any entries in the subdir for this key and evaluate + # their guards to determine whether there's a hit. + graph = None + + for path in sorted(os.listdir(subdir)): + with open(os.path.join(subdir, path), "rb") as f: + candidate: CompiledFxGraph = pickle.load(f) + + guards_expr = candidate.guards_expr + if not guards_expr: + # No guards to evaluate, so this is a hit. + graph = candidate + break + + # Evaluate the guard expression in the current context. + symints = FxGraphCache._filter_symints(example_inputs) + + # If there's not a cache hit, we don't want the evaluation to + # affect the current env, e.g., cause the creation of new guards, + # so we evaluate with the hints instead of the symbols. + assert all(has_hint(s) for s in symints) + hints = [hint_int(s) for s in symints] + hit = bool(shape_env.evaluate_guards_expression(guards_expr, hints)) + log.debug( + "fx graph cache key %s evaluating guards for %s with values %s => %s", + key, + guards_expr, + hints, + hit, + ) + if hit: + # Now re-evaluate with the symints to add any guards to the current env. + check = bool(shape_env.evaluate_guards_expression(guards_expr, symints)) + assert check is True + log.debug( + "fx graph cache key %s post-load guards: %s", key, shape_env.guards + ) + graph = candidate + break + + # Increment the cached metrics by the amounts recorded when the FX + # graph was compiled for this cache entry. Pretending these counters + # were incremented normally is useful for testing with the cache enabled. + if graph is not None: + metrics.CachedMetricsHelper.apply_deltas(graph.metrics_deltas) + + return graph + + @staticmethod + def _save_graph( + key: str, compiled_graph: CompiledFxGraph, example_inputs: List[torch.Tensor] + ): + """ + Store a serialized CompiledFxGraph on disk. + """ + disk_compiled_graph = copy(compiled_graph) + # Important as compiled models are not pickleable: + disk_compiled_graph.compiled_artifact = None + + # Before serializing, compute the guard expression that will be used to + # ensure that a CompiledFxGraph is valid when loaded from the cache. It's + # sufficient to consider only the SymInt args to the fx graph since the + # Tensor shapes are already captured in the hash for the cache key. Any + # Tensor arg with a symbolic shape will have a SymInt arg for the graph. + shape_env = FxGraphCache._get_shape_env() + assert shape_env is not None + symints = FxGraphCache._filter_symints(example_inputs) + disk_compiled_graph.guards_expr = shape_env.produce_guards_expression(symints) + + try: + content = pickle.dumps(disk_compiled_graph) + except Exception as e: + log.debug("fx graph cache unable to serialize compiled graph: %s", e) + counters["inductor"]["fxgraph_cache_pickle_error"] += 1 + return + + subdir = FxGraphCache._get_tmp_dir_for_key(key) + if not os.path.exists(subdir): + os.makedirs(subdir, exist_ok=True) + + # Use a hash of the serialized CompiledFxGraph to get a unique file + # name. 
The specific name doesn't matter since a lookup involves + # iterating over all entries in the parent subdir. + path = os.path.join(subdir, sha256_hash(content)) + write_atomic(path, content) + + @staticmethod + def _check_can_cache(): + """ + Check some conditions that would preclude caching and raise BypassFxGraphCache + to bypass in case caching is not possible. + """ + if config.freezing or config.aot_inductor.use_runtime_constant_folding: + # Freezing can embed constants that wouldn't be static across runs. + raise BypassFxGraphCache() + + if FxGraphCache._get_shape_env() is None: + # The treatment of guards in the caching implementation requires that + # we have a shape env. + log.debug("fx graph cache no shape env") + raise BypassFxGraphCache() + + @staticmethod + def load( + compile_fx_fn: Callable[..., Any], + gm: torch.fx.GraphModule, + example_inputs: List[torch.Tensor], + fx_kwargs: Dict[str, Any], + ): + """ + Load a compiled graph from the cache. If a cached entry does not exist, + compile the graph and save it to the cache. + """ + from filelock import FileLock + + compiled_graph = None + try: + FxGraphCache._check_can_cache() + key = compiled_fx_graph_hash(gm, example_inputs, fx_kwargs) + + lock_path = os.path.join(get_lock_dir(), key + ".lock") + with FileLock(lock_path, timeout=LOCK_TIMEOUT): + compiled_graph = FxGraphCache._lookup_graph(key, example_inputs) + if compiled_graph is None: + log.debug("fx graph cache miss for key %s", key) + counters["inductor"]["fxgraph_cache_miss"] += 1 + compiled_graph = compile_fx_fn(gm, example_inputs, **fx_kwargs) + FxGraphCache._save_graph(key, compiled_graph, example_inputs) + else: + log.debug("fx graph cache hit for key %s", key) + counters["inductor"]["fxgraph_cache_hit"] += 1 + except BypassFxGraphCache: + counters["inductor"]["fxgraph_cache_bypass"] += 1 + + if not compiled_graph: + compiled_graph = compile_fx_fn(gm, example_inputs, **fx_kwargs) + + return compiled_graph + + @staticmethod + def clear(): + """ + Clear out the on-disk cache. + """ + try: + shutil.rmtree(FxGraphCache._get_tmp_dir()) + except FileNotFoundError: + pass + + +@dataclasses.dataclass +class CompiledFxGraph: + """ + Class holding a compiled FX graph. This is the object serialized on disk + to support FxGraph caching. + """ + + compiled_artifact: Optional[Callable[..., Any]] + current_callable: Optional[Callable[..., Any]] + cache_key: Optional[str] + artifact_path: Optional[str] + cache_linemap: Optional[List[Tuple[int, str]]] + device_types: Set[str] + device_idxs: Set[int] + mutated_inputs: Set[str] + mutated_input_idxs: Set[int] + constants: Dict[str, torch.Tensor] + output_strides: Optional[List[Optional[Tuple[int, ...]]]] + disabled_cudagraphs_reason: Optional[str] + metrics_deltas: metrics.CachedMetricsDeltas + # This is a string representation of an expression we serialize + # with the object so the guards can be evaluated in a different + # context in order to verify the validity of serving a cached + # fx graph. 
The expression must be generated by: + # ShapeEnv.produce_guards_expression() + guards_expr: Optional[str] + + _boxed_call: Optional[bool] = None + + def __init__( + self, + compiled_artifact: Optional[Callable[..., Any]], + graph: GraphLowering, + output_strides: List[Optional[Tuple[int, ...]]], + disabled_cudagraphs_reason: Optional[str], + metrics_deltas: metrics.CachedMetricsDeltas, + ): + self.compiled_artifact = compiled_artifact + self.current_callable = None + self.cache_key = graph.cache_key + self.artifact_path = graph.cache_path + self.cache_linemap = graph.cache_linemap + self.device_types = graph.device_types + self.device_idxs = graph.device_idxs + self.mutated_inputs = graph.mutated_inputs + self.mutated_input_idxs = set(graph.mutated_input_idxs) + self.constants = graph.constants + self.output_strides = output_strides + self.disabled_cudagraphs_reason = disabled_cudagraphs_reason + self.metrics_deltas = metrics_deltas + self.guards_expr = None + + def __call__(self, inputs: List[Any]) -> Any: + return self.get_current_callable()(inputs) + + def get_current_callable(self) -> Callable[..., Any]: + if self.current_callable is None: + # This prevents a circular reference that makes CompiledFxGraph + # get stuck without getting garbage collected + return functools.partial(_run_from_cache, weakref.proxy(self)) + else: + return self.current_callable + + +def _run_from_cache(compiled_graph: CompiledFxGraph, inputs: List[Any]) -> Any: + # We can't really serialize callables that may be C++/Triton/etc., + # so we serialize their disk cache location instead + # TODO: When making an API that can save compiled models e2e to disk + # this will need to be better + if compiled_graph.compiled_artifact is None: + from .codecache import PyCodeCache + + assert compiled_graph.cache_key + assert compiled_graph.artifact_path + compiled_graph.compiled_artifact = PyCodeCache.load_by_key_path( + compiled_graph.cache_key, + compiled_graph.artifact_path, + compiled_graph.cache_linemap, + compiled_graph.constants, + ).call + + return compiled_graph.compiled_artifact(inputs) + + +def cpp_compiler() -> str: + if config.is_fbcode(): + return build_paths.cc() + if isinstance(config.cpp.cxx, (list, tuple)): + search = tuple(config.cpp.cxx) + else: + search = (config.cpp.cxx,) + return cpp_compiler_search(search) + + +@functools.lru_cache(1) +def cpp_compiler_search(search: str) -> str: + for cxx in search: + try: + if cxx is None: + # gxx package is only available for Linux + # according to https://anaconda.org/conda-forge/gxx/ + if sys.platform != "linux": + continue + # Do not install GXX by default + if not os.getenv("TORCH_INDUCTOR_INSTALL_GXX"): + continue + from filelock import FileLock + + lock_dir = get_lock_dir() + lock = FileLock( + os.path.join(lock_dir, "g++.lock"), timeout=LOCK_TIMEOUT + ) + with lock: + cxx = install_gcc_via_conda() + subprocess.check_output([cxx, "--version"]) + return cxx + except (subprocess.SubprocessError, FileNotFoundError, ImportError): + continue + raise exc.InvalidCxxCompiler() + + +def install_gcc_via_conda() -> str: + """On older systems, this is a quick way to get a modern compiler""" + prefix = os.path.join(cache_dir(), "gcc") + cxx_path = os.path.join(prefix, "bin", "g++") + if not os.path.exists(cxx_path): + log.info("Downloading GCC via conda") + conda = os.environ.get("CONDA_EXE", "conda") + if conda is None: + conda = shutil.which("conda") + if conda is not None: + subprocess.check_call( + [ + conda, + "create", + f"--prefix={prefix}", + 
"--channel=conda-forge", + "--quiet", + "-y", + "python=3.8", + "gxx", + ], + stdout=subprocess.PIPE, + ) + return cxx_path + + +def is_gcc() -> bool: + return bool(re.search(r"(gcc|g\+\+)", cpp_compiler())) + + +def is_clang() -> bool: + return bool(re.search(r"(clang|clang\+\+)", cpp_compiler())) + + +@functools.lru_cache(None) +def is_apple_clang() -> bool: + cxx = cpp_compiler() + version_string = subprocess.check_output([cxx, "--version"]).decode("utf8") + return "Apple" in version_string.splitlines()[0] + + +class VecISA: + _bit_width: int + _macro: str + _arch_flags: str + _dtype_nelements: Dict[torch.dtype, int] + + # Note [Checking for Vectorized Support in Inductor] + # TorchInductor CPU vectorization reuses PyTorch vectorization utility functions + # Hence, TorchInductor would depend on Sleef* to accelerate mathematical functions + # like exp, pow, sin, cos and etc. + # But PyTorch and TorchInductor might use different compilers to build code. If + # PyTorch uses gcc-7/g++-7 to build the release package, the libtorch_cpu.so + # will not expose the Sleef* AVX512 symbols since gcc-7/g++-7 cannot pass + # avx512 check in CMake - FindAVX.cmake. But TorchInductor install the latest + # gcc/g++ compiler by default while it could support the AVX512 compilation. + # Therefore, there would be a conflict sleef version between PyTorch and + # TorchInductor. Hence, we dry-compile the following code to check whether current + # HW platform and PyTorch both could support AVX512 or AVX2. And suppose ARM + # also needs the logic + # In fbcode however, we are using the same compiler for pytorch and for inductor codegen, + # making the runtime check unnecessary. + _avx_code = """ +#if defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_ZVECTOR) +#include +#include +#endif + +__attribute__((aligned(64))) float in_out_ptr0[16] = {0.0}; + +extern "C" void __avx_chk_kernel() { + auto tmp0 = at::vec::Vectorized(1); + auto tmp1 = tmp0.exp(); + tmp1.store(in_out_ptr0); +} +""" # noqa: B950 + + _avx_py_load = """ +import torch +from ctypes import cdll +cdll.LoadLibrary("__lib_path__") +""" + + def bit_width(self) -> int: + return self._bit_width + + def nelements(self, dtype: torch.dtype = torch.float) -> int: + return self._dtype_nelements[dtype] + + def build_macro(self) -> str: + return self._macro + + def build_arch_flags(self) -> str: + return self._arch_flags + + def __hash__(self) -> int: + return hash(str(self)) + + @functools.lru_cache(None) + def __bool__(self) -> bool: + if config.cpp.vec_isa_ok is not None: + return config.cpp.vec_isa_ok + + if config.is_fbcode(): + return True + + key, input_path = write(VecISA._avx_code, "cpp") + from filelock import FileLock + + lock_dir = get_lock_dir() + lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT) + with lock: + output_path = input_path[:-3] + "so" + build_cmd = shlex.split( + cpp_compile_command( + input_path, output_path, warning_all=False, vec_isa=self + ) + ) + try: + # Check build result + compile_file(input_path, output_path, build_cmd) + subprocess.check_call( + [ + sys.executable, + "-c", + VecISA._avx_py_load.replace("__lib_path__", output_path), + ], + stderr=subprocess.DEVNULL, + env={**os.environ, "PYTHONPATH": ":".join(sys.path)}, + ) + except Exception as e: + return False + + return True + + +@dataclasses.dataclass +class VecAVX512(VecISA): + _bit_width = 512 + _macro = "-DCPU_CAPABILITY_AVX512" + _arch_flags = "-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma" + 
_dtype_nelements = {torch.float: 16, torch.bfloat16: 32, torch.float16: 32} + + def __str__(self) -> str: + return "avx512" + + __hash__: Callable[[VecISA], Any] = VecISA.__hash__ + + +@dataclasses.dataclass +class VecAVX2(VecISA): + _bit_width = 256 + _macro = "-DCPU_CAPABILITY_AVX2" + _arch_flags = "-mavx2 -mfma" + _dtype_nelements = {torch.float: 8, torch.bfloat16: 16, torch.float16: 16} + + def __str__(self) -> str: + return "avx2" + + __hash__: Callable[[VecISA], Any] = VecISA.__hash__ + + +@dataclasses.dataclass +class VecZVECTOR(VecISA): + _bit_width = 256 + _macro = "-DCPU_CAPABILITY_ZVECTOR -DCPU_CAPABILITY=ZVECTOR -DHAVE_ZVECTOR_CPU_DEFINITION" + _arch_flags = "-mvx -mzvector" + _dtype_nelements = {torch.float: 8, torch.bfloat16: 16, torch.float16: 16} + + def __str__(self) -> str: + return "zvector" + + __hash__: Callable[[VecISA], Any] = VecISA.__hash__ + + +class InvalidVecISA(VecISA): + _bit_width = 0 + _macro = "" + _arch_flags = "" + _dtype_nelements = {} + + def __str__(self) -> str: + return "INVALID_VEC_ISA" + + def __bool__(self) -> bool: # type: ignore[override] + return False + + __hash__: Callable[[VecISA], Any] = VecISA.__hash__ + + +invalid_vec_isa = InvalidVecISA() +supported_vec_isa_list = [VecAVX512(), VecAVX2()] + + +# Cache the cpuinfo to avoid I/O overhead. Meanwhile, the cpuinfo content +# might have too much redundant content that is useless for ISA check. Hence, +# we only cache some key isa information. +@functools.lru_cache(None) +def valid_vec_isa_list() -> List[VecISA]: + if sys.platform != "linux": + return [] + + if platform.machine() == "s390x": + return [VecZVECTOR()] + + isa_list = [] + with open("/proc/cpuinfo") as _cpu_info: + _cpu_info_content = _cpu_info.read() + for isa in supported_vec_isa_list: + if str(isa) in _cpu_info_content and isa: + isa_list.append(isa) + return isa_list + + +def pick_vec_isa() -> VecISA: + if config.is_fbcode(): + return VecAVX2() + + _valid_vec_isa_list: List[VecISA] = valid_vec_isa_list() + if not _valid_vec_isa_list: + return invalid_vec_isa + + # If the simdlen is None, it indicates determin the vectorization length automatically + if config.cpp.simdlen is None: + assert _valid_vec_isa_list + return _valid_vec_isa_list[0] + + for isa in _valid_vec_isa_list: + if config.cpp.simdlen == isa.bit_width(): + return isa + + return invalid_vec_isa + + +def get_compile_only(compile_only: bool = True) -> str: + return "-c" if compile_only else "" + + +def get_shared(shared: bool = True, compile_only: bool = False) -> str: + if not shared: + return "" + if compile_only: + return "-fPIC" + if platform.system() == "Darwin" and "clang" in cpp_compiler(): + # This causes undefined symbols to behave the same as linux + return "-shared -fPIC -undefined dynamic_lookup" + else: + return "-shared -fPIC" + + +def get_warning_all_flag(warning_all: bool = True) -> str: + return "-Wall" if warning_all else "" + + +def get_glibcxx_abi_build_flags() -> str: + return "-D_GLIBCXX_USE_CXX11_ABI=" + str(int(torch._C._GLIBCXX_USE_CXX11_ABI)) + + +def cpp_flags() -> str: + flags = ["-std=c++17", "-Wno-unused-variable", "-Wno-unknown-pragmas"] + if is_clang(): + flags.append("-Werror=ignored-optimization-argument") + return " ".join(flags) + + +def cpp_wrapper_flags() -> str: + return "-DTORCH_INDUCTOR_CPP_WRAPPER" + + +def optimization_flags() -> str: + base_flags = "-O0 -g" if config.aot_inductor.debug_compile else "-O3 -DNDEBUG" + base_flags += " -ffast-math -fno-finite-math-only" + if not config.cpp.enable_unsafe_math_opt_flag: + base_flags 
+= " -fno-unsafe-math-optimizations" + if not config.cpp.enable_floating_point_contract_flag: + base_flags += " -ffp-contract=off" + + if config.is_fbcode(): + # FIXME: passing `-fopenmp` adds libgomp.so to the generated shared library's dependencies. + # This causes `ldopen` to fail in fbcode, because libgomp does not exist in the default paths. + # We will fix it later by exposing the lib path. + return base_flags + + if sys.platform == "darwin": + # Per https://mac.r-project.org/openmp/ right way to pass `openmp` flags to MacOS is via `-Xclang` + # Also, `-march=native` is unrecognized option on M1 + base_flags += " -Xclang" + else: + if platform.machine() == "ppc64le": + base_flags += " -mcpu=native" + else: + base_flags += " -march=native" + + # Internal cannot find libgomp.so + if not config.is_fbcode(): + base_flags += " -fopenmp" + return base_flags + + +def use_custom_generated_macros() -> str: + return "-D C10_USING_CUSTOM_GENERATED_MACROS" + + +def use_fb_internal_macros() -> str: + if config.is_fbcode(): + openmp_lib = build_paths.openmp_lib() + preprocessor_flags = " ".join( + ( + "-D C10_USE_GLOG", + "-D C10_USE_MINIMAL_GLOG", + "-D C10_DISABLE_TENSORIMPL_EXTENSIBILITY", + ) + ) + return f"-Wp,-fopenmp {openmp_lib} {preprocessor_flags}" + else: + return "" + + +def use_standard_sys_dir_headers() -> str: + if config.is_fbcode(): + return "-nostdinc" + else: + return "" + + +@functools.lru_cache(None) +def is_conda_llvm_openmp_installed() -> bool: + try: + command = "conda list llvm-openmp --json" + output = subprocess.check_output(command.split()).decode("utf8") + return len(json.loads(output)) > 0 + except subprocess.SubprocessError: + return False + + +@functools.lru_cache(None) +def homebrew_libomp() -> Tuple[bool, str]: + try: + # check if `brew` is installed + subprocess.check_output(["which", "brew"]) + # get the location of `libomp` if it is installed + # this is the location that `libomp` **would** be installed + # see https://github.com/Homebrew/brew/issues/10261#issuecomment-756563567 for details + libomp_path = ( + subprocess.check_output(["brew", "--prefix", "libomp"]) + .decode("utf8") + .strip() + ) + # check if `libomp` is installed + omp_available = os.path.exists(libomp_path) + return omp_available, libomp_path + except subprocess.SubprocessError: + return False, "" + + +def get_include_and_linking_paths( + include_pytorch: bool = False, + vec_isa: VecISA = invalid_vec_isa, + cuda: bool = False, + aot_mode: bool = False, +) -> Tuple[List[str], str, str, str, str]: + if ( + config.is_fbcode() + and "CUDA_HOME" not in os.environ + and "CUDA_PATH" not in os.environ + ): + os.environ["CUDA_HOME"] = os.path.dirname(build_paths.cuda()) + from torch.utils import cpp_extension + + macros = "" + build_arch_flags = "" + if sys.platform == "linux" and ( + include_pytorch + or vec_isa != invalid_vec_isa + or cuda + or config.cpp.enable_kernel_profile + ): + # Note - We include pytorch only on linux right now. There is more work + # to do to enable OMP build on darwin where PyTorch is built with IOMP + # and we need a way to link to what PyTorch links. + ipaths = cpp_extension.include_paths(cuda) + [sysconfig.get_path("include")] + lpaths = cpp_extension.library_paths(cuda) + [ + sysconfig.get_config_var("LIBDIR") + ] + + libs = [] + + # No need to manually specify libraries in fbcode. 
+ if not config.is_fbcode(): + libs += ["torch", "torch_cpu"] + libs += ["gomp"] + if not aot_mode: + libs += ["torch_python"] + else: + # internal remote execution is able to find omp, but not gomp + libs += ["omp"] + if aot_mode: + ipaths += [os.path.dirname(cpp_prefix_path())] + if cuda: + # This is a special treatment for Meta internal cuda-12 where all libs + # are in lib/cuda-12 and lib/cuda-12/stubs + for i, path in enumerate(lpaths): + if path.startswith( + os.environ["CUDA_HOME"] + ) and not os.path.exists(f"{path}/libcudart_static.a"): + for root, dirs, files in os.walk(path): + if "libcudart_static.a" in files: + lpaths[i] = os.path.join(path, root) + lpaths.append(os.path.join(lpaths[i], "stubs")) + break + macros = vec_isa.build_macro() + if macros: + if config.is_fbcode() and vec_isa != invalid_vec_isa: + cap = str(vec_isa).upper() + macros = " ".join( + [ + vec_isa.build_arch_flags(), + f"-D CPU_CAPABILITY={cap}", + f"-D CPU_CAPABILITY_{cap}", + f"-D HAVE_{cap}_CPU_DEFINITION", + ] + ) + + if cuda: + if macros is None: + macros = "" + macros += " -D USE_ROCM" if torch.version.hip else " -D USE_CUDA" + + if cuda: + if torch.version.hip is not None: + libs += ["c10_hip", "torch_hip"] + macros += " -D __HIP_PLATFORM_AMD__" + else: + if config.is_fbcode(): + libs += ["cuda"] + else: + libs += ["c10_cuda", "cuda", "torch_cuda"] + build_arch_flags = vec_isa.build_arch_flags() + else: + # Note - this is effectively a header only inclusion. Usage of some header files may result in + # symbol not found, if those header files require a library. + # For those cases, include the lpath and libs command as we do for pytorch above. + # This approach allows us to only pay for what we use. + ipaths = cpp_extension.include_paths(cuda) + [sysconfig.get_path("include")] + if aot_mode: + ipaths += [os.path.dirname(cpp_prefix_path())] + lpaths = [] + if sys.platform == "darwin": + # only Apple builtin compilers (Apple Clang++) require openmp + omp_available = not is_apple_clang() + + # check the `OMP_PREFIX` environment first + if os.getenv("OMP_PREFIX") is not None: + header_path = os.path.join(os.getenv("OMP_PREFIX"), "include", "omp.h") # type: ignore[arg-type] + valid_env = os.path.exists(header_path) + if valid_env: + ipaths.append(os.path.join(os.getenv("OMP_PREFIX"), "include")) # type: ignore[arg-type] + lpaths.append(os.path.join(os.getenv("OMP_PREFIX"), "lib")) # type: ignore[arg-type] + else: + warnings.warn("environment variable `OMP_PREFIX` is invalid.") + omp_available = omp_available or valid_env + + libs = [] if omp_available else ["omp"] + + # prefer to use openmp from `conda install llvm-openmp` + if not omp_available and os.getenv("CONDA_PREFIX") is not None: + omp_available = is_conda_llvm_openmp_installed() + if omp_available: + conda_lib_path = os.path.join(os.getenv("CONDA_PREFIX"), "lib") # type: ignore[arg-type] + ipaths.append(os.path.join(os.getenv("CONDA_PREFIX"), "include")) # type: ignore[arg-type] + lpaths.append(conda_lib_path) + # Prefer Intel OpenMP on x86 machine + if os.uname().machine == "x86_64" and os.path.exists( + os.path.join(conda_lib_path, "libiomp5.dylib") + ): + libs = ["iomp5"] + + # next, try to use openmp from `brew install libomp` + if not omp_available: + omp_available, libomp_path = homebrew_libomp() + if omp_available: + ipaths.append(os.path.join(libomp_path, "include")) + lpaths.append(os.path.join(libomp_path, "lib")) + + # if openmp is still not available, we let the compiler to have a try, + # and raise error together with instructions at 
compilation error later + else: + libs = ["omp"] if config.is_fbcode() else ["gomp"] + + # Unconditionally import c10 for non-abi-compatible mode to use TORCH_CHECK - See PyTorch #108690 + if not config.abi_compatible: + libs += ["c10"] + lpaths += [cpp_extension.TORCH_LIB_PATH] + + # third party libs + if config.is_fbcode(): + ipaths.append(build_paths.sleef()) + ipaths.append(build_paths.openmp()) + ipaths.append(build_paths.cc_include()) + ipaths.append(build_paths.libgcc()) + ipaths.append(build_paths.libgcc_arch()) + ipaths.append(build_paths.libgcc_backward()) + ipaths.append(build_paths.glibc()) + ipaths.append(build_paths.linux_kernel()) + ipaths.append(build_paths.cuda()) + # We also need to bundle includes with absolute paths into a remote directory + # (later on, we copy the include paths from cpp_extensions into our remote dir) + ipaths.append("include") + + static_link_libs = [] + if aot_mode and cuda and config.is_fbcode(): + # For Meta internal cuda-12, it is recommended to static link cudart + static_link_libs = ["-Wl,-Bstatic", "-lcudart_static", "-Wl,-Bdynamic"] + + lpaths_str = " ".join(["-L" + p for p in lpaths]) + libs_str = " ".join(static_link_libs + ["-l" + p for p in libs]) + return ipaths, lpaths_str, libs_str, macros, build_arch_flags + + +def cpp_compile_command( + input: Union[str, List[str]], + output: str, + warning_all: bool = True, + shared: bool = True, + include_pytorch: bool = False, + vec_isa: VecISA = invalid_vec_isa, + cuda: bool = False, + aot_mode: bool = False, + compile_only: bool = False, + use_absolute_path: bool = False, +) -> str: + ipaths, lpaths, libs, macros, build_arch_flags = get_include_and_linking_paths( + include_pytorch, vec_isa, cuda, aot_mode + ) + if isinstance(input, str): + input = [input] + ipaths_str = " ".join(["-I" + p for p in ipaths]) + clang_flags = "" + if config.is_fbcode(): + if aot_mode and not use_absolute_path: + inp_name = input + out_name = output + linker_script = _LINKER_SCRIPT + else: + # We need to copy any absolute-path torch includes + inp_name = [os.path.basename(i) for i in input] + out_name = os.path.basename(output) + linker_script = os.path.basename(_LINKER_SCRIPT) + assert is_clang() + # Use clang runtime instead of libgcc + clang_flags += " --rtlib=compiler-rt" + clang_flags += " -fuse-ld=lld" + clang_flags += f" -Wl,--script={linker_script}" + linker_paths = "-B" + build_paths.glibc_lib() + linker_paths += " -L" + build_paths.glibc_lib() + else: + inp_name = input + out_name = output + linker_paths = "" # let the compiler pick + if compile_only: + libs, lpaths = "", "" + inp_name_str = " ".join(inp_name) + return re.sub( + r"[ \n]+", + " ", + f""" + {cpp_compiler()} {inp_name_str} {get_shared(shared, compile_only)} + {get_warning_all_flag(warning_all)} {cpp_flags()} + {get_glibcxx_abi_build_flags()} + {ipaths_str} {lpaths} {libs} {build_arch_flags} + {macros} {linker_paths} {clang_flags} + {optimization_flags()} + {use_custom_generated_macros()} + {use_fb_internal_macros()} + {use_standard_sys_dir_headers()} + {get_compile_only(compile_only)} + -o {out_name} + """, + ).strip() + + +def run_command_and_check(cmd: str): + cmd = shlex.split(cmd) + try: + subprocess.check_call(cmd) + except subprocess.CalledProcessError as e: + raise exc.CppCompileError(cmd, e.output) from e + + +@functools.lru_cache(None) +def split_aot_inductor_output_path(path: str) -> Tuple[str, str]: + """Returns the path where the AOT Inductor compiled kernels are stored.""" + if path.endswith(".so"): + return os.path.split(path) + 
else: + return path, "" + + +class CudaKernelParamCache: + cache: Dict[str, Dict[str, str]] = dict() + clear = staticmethod(cache.clear) + + @classmethod + def set(cls, key: str, params: Dict[str, str], cubin: str) -> None: + bin_type = "cubin" if torch.version.hip is None else "hsaco" + _, path = write( + cubin, + bin_type, + hash_type=bin_type, + specified_dir=split_aot_inductor_output_path( + config.aot_inductor.output_path + )[0], + ) + + params[get_cpp_wrapper_cubin_path_name()] = path + + cls.cache[key] = params + + @classmethod + def get(cls, key: str) -> Optional[Dict[str, str]]: + return cls.cache.get(key, None) + + @classmethod + def get_keys(cls): + return cls.cache.keys() + + +class AotCodeCompiler: + @classmethod + def compile( + cls, + graph: GraphLowering, + source_code: str, + serialized_extern_kernel_nodes: Optional[str], + cuda: bool, + ) -> str: + picked_vec_isa = pick_vec_isa() + cpp_command = repr( + cpp_compile_command( + "i", "o", vec_isa=picked_vec_isa, cuda=cuda, aot_mode=graph.aot_mode + ) + ) + fbcode_aot_cpu_re = False + use_absolute_path = False + if config.is_fbcode(): + ld_command = build_paths.ld() + if not cuda and graph.aot_mode: # Meta internal AOTInductor CPU + objcopy_command = build_paths.objcopy_fallback() + fbcode_aot_cpu_re = True + use_absolute_path = True + else: + objcopy_command = build_paths.objcopy() + else: + ld_command = "ld" + objcopy_command = "objcopy" + + ( + specified_output_path, + specified_so_name, + ) = split_aot_inductor_output_path(config.aot_inductor.output_path) + key, input_path = write( + source_code, + "cpp", + extra=cpp_command, + specified_dir=specified_output_path, + ) + + def _compile_consts_linux(consts: bytes) -> str: + _, consts_path = write( + consts, + "bin", + specified_dir=specified_output_path, + ) + + consts_o = os.path.splitext(consts_path)[0] + ".o" + if fbcode_aot_cpu_re: + cmd = f"{ld_command} -r -b binary -o {os.path.basename(consts_o)} {os.path.basename(consts_path)}" + compile_file(consts_path, consts_o, cmd.split()) + os.chmod(consts_o, 0o644) + else: + cmd = f"{ld_command} -r -b binary -o {consts_o} {consts_path}" + run_command_and_check(cmd) + log.debug("aot constant binary command: %s", cmd) + + cmd = ( + f"{objcopy_command} --rename-section" + " .data=.lrodata,alloc,load,readonly,data,contents" + f" {consts_o} {consts_o}" + ) + log.debug("aot constant obj command: %s", cmd) + run_command_and_check(cmd) + + cmd = f"rm {consts_path}" + log.debug("aot constant bin removal command: %s", cmd) + run_command_and_check(cmd) + + if fbcode_aot_cpu_re: + body = re.sub(r"[\W]", "_", os.path.basename(consts_path)) + else: + body = re.sub(r"[\W]", "_", consts_path) + + symbol_list = [] + symbol_list.append( + f"{objcopy_command} --redefine-sym _binary_{body}_start=_binary_constants_bin_start {consts_o}" + ) + symbol_list.append( + f"{objcopy_command} --redefine-sym _binary_{body}_size=_binary_constants_bin_size {consts_o}" + ) + symbol_list.append( + f"{objcopy_command} --redefine-sym _binary_{body}_end=_binary_constants_bin_end {consts_o}" + ) + log.debug("aot constant binary redefine symbol: %s", " ".join(symbol_list)) + for cmd in symbol_list: + run_command_and_check(cmd) + return consts_o + + def _compile_consts_darwin(consts: bytes) -> str: + is_large_consts = len(consts) > 1024 + consts_asm = "\t.section\t__TEXT,__const\n" + consts_asm += "\t.globl\t__binary_constants_bin_start\n" + consts_asm += "__binary_constants_bin_start:\n" + if not is_large_consts: + for c in consts: + consts_asm += f"\t.byte {c}\n" + 
# Add one element even if constants are empty + # Otherwise assembler will not put them in data section + if not consts: + consts_asm += "\t.space 1\n" + else: + consts_asm += "\t.quad 0x1234567899abcdef\n" + consts_asm += f"\t.space {len(consts) - 8}\n" + consts_asm += ".globl\t__binary_constants_bin_end\n" + consts_asm += "__binary_constants_bin_end:\n" + _, consts_path = write( + consts_asm, + "S", + specified_dir=specified_output_path, + ) + consts_o = os.path.splitext(consts_path)[0] + ".o" + cmd = f"{cpp_compiler()} -c -o {consts_o} {consts_path}" + run_command_and_check(cmd) + if is_large_consts: + with open(consts_o, "r+b") as f: + f.seek(0) + hdr = f.read(1024) + # Search for magic number and write the actual data over it + start_idx = hdr.find(b"\xef\xcd\xab\x99\x78\x56\x34\x12") + assert start_idx != -1 + f.seek(start_idx) + pos = 0 + while pos < len(consts): + rc = f.write(consts[pos:]) + pos += rc + return consts_o + + from filelock import FileLock + + lock_dir = get_lock_dir() + lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT) + with lock: + # Currently, this only support serializing extern nodes in fbcode + # Eventually, we should also have a serializer for OSS. + if config.is_fbcode() and serialized_extern_kernel_nodes: + output_json = os.path.splitext(input_path)[0] + ".json" + with open(output_json, "w") as f: + f.write(serialized_extern_kernel_nodes) + + output_so = ( + config.aot_inductor.output_path + if specified_so_name + else os.path.splitext(input_path)[0] + ".so" + ) + + output_o = os.path.splitext(input_path)[0] + ".o" + cmd = cpp_compile_command( + input=input_path, + output=output_o, + vec_isa=picked_vec_isa, + cuda=cuda, + aot_mode=graph.aot_mode, + compile_only=True, + use_absolute_path=use_absolute_path, + ) + log.debug("aot compilation command: %s", cmd) + if fbcode_aot_cpu_re: + compile_file(input_path, output_o, cmd.split()) + os.chmod(output_o, 0o644) + else: + run_command_and_check(cmd) + + def _to_bytes(t: torch.Tensor) -> bytes: + # This serializes the tensor's untyped_storage to bytes by accessing + # the raw data of the underlying structure. + import ctypes + + if t.numel() == 0: + return b"" + + t_cpu = t.untyped_storage().cpu() + raw_array = ctypes.cast( + t_cpu.data_ptr(), + ctypes.POINTER(ctypes.c_ubyte * t_cpu.nbytes()), + ) + + return bytes(raw_array.contents) + + aot_constants = b"".join( + _to_bytes(tensor) + for name, tensor in graph.constants.items() + if name not in graph.folded_constants + ) + consts_o = { + "linux": _compile_consts_linux, + "darwin": _compile_consts_darwin, + }[sys.platform](aot_constants) + + cmd = cpp_compile_command( + input=[output_o, consts_o], + output=output_so, + vec_isa=picked_vec_isa, + cuda=cuda, + aot_mode=graph.aot_mode, + use_absolute_path=use_absolute_path, + ) + log.debug("aot linkage command: %s", cmd) + if fbcode_aot_cpu_re: + compile_file([output_o, consts_o], output_so, cmd.split()) + os.chmod(output_so, 0o755) + else: + run_command_and_check(cmd) + + return output_so + + +# Putting this fn in cpp.py (unfortunately) causes a deadlock, which is why it's in codecache.py. +# Why? importing from cpp.py invokes codecache.pick_vec_isa(), which takes out a lock. +# Cycle goes: +# - CppCodeCache.load() +# - pick_vec_isa() +# - valid_vec_isa_list() +# - VecISA.__bool__() <-- takes out a lock +# - compile_file() <-- imports cpp_prefix_path from cpp, which causes us to try to take out the same lock. 
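+# Illustrative sketch (editor's addition, not part of the upstream file): most
+# caches in this module share one content-addressed pattern -- hash the source
+# text, derive a path under cache_dir() from the hash, and guard the one-time
+# build with a FileLock. The helper below is a hypothetical, minimal rendering
+# of that pattern; `build_artifact` stands in for whatever compile step a
+# concrete cache (e.g. CppCodeCache) actually runs, and nothing here calls it.
+def _toy_content_addressed_build(
+    source: str, build_artifact: Callable[[str, str], None]
+) -> str:
+    from filelock import FileLock
+
+    key = hashlib.sha256(source.encode("utf-8")).hexdigest()
+    out_path = os.path.join(cache_dir(), key[:2], f"{key}.so")
+    os.makedirs(os.path.dirname(out_path), exist_ok=True)
+    with FileLock(out_path + ".lock", timeout=LOCK_TIMEOUT):
+        if not os.path.exists(out_path):
+            build_artifact(source, out_path)  # compile exactly once per key
+    return out_path
+
+
+# (The deadlock note above refers to cpp_prefix_path(), defined next.)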
+@functools.lru_cache +def cpp_prefix_path() -> str: + path = Path(__file__).parent / "codegen/cpp_prefix.h" + with path.open() as f: + content = f.read() + _, filename = write( + content, + "h", + ) + return filename + + +def cpp_prefix() -> str: + filename = cpp_prefix_path() + if config.is_fbcode(): + # We need relative paths, since we bundle up + # everything that we compile into a folder for remote compilation. + return f'#include "{os.path.basename(filename)}"' + else: + return f'#include "{filename}"' + + +# Given a path to an input cpp file and an output path, +# Attempts to compile the file, storing the output in "output_path" +@dynamo_timed +def compile_file( + input_path: Union[str, List[str]], output_path: str, cmd: List[str] +) -> None: + input_paths = [input_path] if isinstance(input_path, str) else input_path + input_files = [ + os.path.basename(ip) if config.is_fbcode() else ip for ip in input_paths + ] + try: + if config.is_fbcode(): + # Need to copy our header into the same folder as the sourcecode. + header_path = cpp_prefix_path() + header_name = os.path.basename(header_path) + output_name = os.path.basename(output_path) + # When we build remotely, we need to make sure to carefully copy any files + # that are required during the compilation process into our build directly. + # This is where all of the ATen/c10/Torch includes come from. + torch_includes_path = os.path.join(_TORCH_PATH, "include") + with tempfile.TemporaryDirectory() as tmp_dir: + # Copy everything to tmp compilation folder + shutil.copy(header_path, os.path.join(tmp_dir, header_name)) + shutil.copy(_LINKER_SCRIPT, os.path.join(tmp_dir, "script.ld")) + for p, f in zip(input_paths, input_files): + shutil.copy(p, os.path.join(tmp_dir, f)) + dest_include_path = os.path.join(tmp_dir, "include") + shutil.copytree(torch_includes_path, dest_include_path) + # Run the build + output_file_path = _run_build_command(cmd, tmp_dir, output_name) + # Copy output from the build + if os.path.exists(output_path): + os.remove(output_path) + shutil.copy(output_file_path, output_path) + else: + subprocess.check_output(cmd, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as e: + output = e.output.decode("utf-8") + openmp_problem = "'omp.h' file not found" in output or "libomp" in output + if openmp_problem and sys.platform == "darwin": + instruction = ( + "\n\nOpenMP support not found. Please try one of the following solutions:\n" + "(1) Set the `CXX` environment variable to a compiler other than Apple clang++/g++ " + "that has builtin OpenMP support;\n" + "(2) install OpenMP via conda: `conda install llvm-openmp`;\n" + "(3) install libomp via brew: `brew install libomp`;\n" + "(4) manually setup OpenMP and set the `OMP_PREFIX` environment variable to point to a path" + " with `include/omp.h` under it." 
+ ) + output += instruction + raise exc.CppCompileError(cmd, output) from e + + +_libgomp: Optional[CDLL] = None + + +class CppCodeCache: + cache: Dict[str, Union[CDLL, ModuleType]] = {} + clear = staticmethod(cache.clear) + cpp_compile_command_flags: Dict[str, Any] = {} + + @staticmethod + def _load_library_inner(path: str, key: str) -> Union[CDLL, ModuleType]: + return cdll.LoadLibrary(path) + + @classmethod + def _load_library(cls, path: str, key: str) -> Union[CDLL, ModuleType]: + try: + return cls._load_library_inner(path, key) + except (ImportError, OSError) as e: + if "gomp" in str(e) and os.path.exists("/usr/lib64/libgomp.so.1"): + # hacky workaround for fbcode/buck + global _libgomp + _libgomp = cdll.LoadLibrary("/usr/lib64/libgomp.so.1") + return cls._load_library_inner(path, key) + if "failed to map segment from shared object" in str(e): + raise OSError( + f"{e}. The most common reason this may occur is if the {tempfile.gettempdir()} folder " + "is mounted with noexec (e.g., by default Docker mounts tmp file systems " + f"as noexec). Please remount {tempfile.gettempdir()} with exec enabled, or set another " + "temporary directory with TORCHINDUCTOR_CACHE_DIR environment variable." + ) from e + raise + + @classmethod + def load(cls, source_code: str, cuda: bool = False) -> Union[CDLL, ModuleType]: + cls.cpp_compile_command_flags.update({"cuda": cuda}) + picked_vec_isa = pick_vec_isa() + cpp_command = repr( + cpp_compile_command( + "i", "o", vec_isa=picked_vec_isa, **cls.cpp_compile_command_flags + ) + ) + key, input_path = write(source_code, "cpp", extra=cpp_command) + if key not in cls.cache: + from filelock import FileLock + + lock_dir = get_lock_dir() + lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT) + with lock: + output_path = input_path[:-3] + "so" + if not os.path.exists(output_path): + cmd = shlex.split( + cpp_compile_command( + input=input_path, + output=output_path, + vec_isa=picked_vec_isa, + **cls.cpp_compile_command_flags, + ) + ) + compile_file(input_path, output_path, cmd) + cls.cache[key] = cls._load_library(output_path, key) + cls.cache[key].key = key # type: ignore[union-attr] + + return cls.cache[key] + + +# Customized Python binding for cpp kernels +class CppPythonBindingsCodeCache(CppCodeCache): + cache: Dict[str, Union[CDLL, ModuleType]] = {} + clear = staticmethod(cache.clear) + cpp_compile_command_flags = { + # kernels have no dependency on libtorch + "include_pytorch": False, + "shared": True, + } + entry_function = "kernel" + call_entry_function = "kernel(%s);Py_RETURN_NONE;" + extra_parse_arg = "" + suffix_template = textwrap.dedent( + """ + // Python bindings to call %s(): + #define PY_SSIZE_T_CLEAN + #include + #include + #include + + // This is defined in guards.cpp so we don't need to import PyTorch headers that are slooow. + // We manually link it below to workaround issues with fbcode build. 
+ static void* (*_torchinductor_pyobject_tensor_data_ptr)(PyObject* obj); + + template <typename T> static inline T parse_arg(PyObject* args, size_t n) { + static_assert(std::is_pointer<T>::value, "arg type must be pointer or long"); + return static_cast<T>(_torchinductor_pyobject_tensor_data_ptr(PyTuple_GET_ITEM(args, n))); + } + template <> inline long parse_arg<long>(PyObject* args, size_t n) { + auto result = PyLong_AsSsize_t(PyTuple_GET_ITEM(args, n)); + if(result == -1 && PyErr_Occurred()) + [[unlikely]] throw std::runtime_error("expected int arg"); + return result; + } + + %s + + static PyObject* %s_py(PyObject* self, PyObject* args) { + try { + if(!PyTuple_CheckExact(args)) + [[unlikely]] throw std::runtime_error("tuple args required"); + if(PyTuple_GET_SIZE(args) != %s) + [[unlikely]] throw std::runtime_error("requires %s args"); + %s + } catch(std::exception const& e) { + PyErr_SetString(PyExc_RuntimeError, e.what()); + return nullptr; + } catch(...) { + PyErr_SetString(PyExc_RuntimeError, "unhandled error"); + return nullptr; + } + } + + static PyMethodDef py_methods[] = { + {"%s", %s_py, METH_VARARGS, ""}, + {NULL, NULL, 0, NULL}}; + + static struct PyModuleDef py_module = + {PyModuleDef_HEAD_INIT, "%s", NULL, -1, py_methods}; + + PyMODINIT_FUNC PyInit_%s(void) { + const char* str_addr = std::getenv("_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTR"); + if(!str_addr) { + PyErr_SetString(PyExc_RuntimeError, "_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTR must be set"); + return nullptr; + } + std::istringstream iss(str_addr); + uintptr_t addr = 0; + iss >> addr; + _torchinductor_pyobject_tensor_data_ptr = + reinterpret_cast<decltype(_torchinductor_pyobject_tensor_data_ptr)>(addr); + return PyModule_Create(&py_module); + } + """ + ) + + @classmethod + def _load_library_inner(cls, path: str, key: str) -> ModuleType: + os.environ["_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTR"] = str( + torch._C._dynamo.guards._torchinductor_pyobject_tensor_data_ptr # type: ignore[attr-defined] + ) + return importlib.machinery.ExtensionFileLoader( + f"{key}.{cls.entry_function}", path + ).load_module() # type: ignore[call-arg] + + @classmethod + def load_pybinding( + cls, + argtypes: List[str], + source_code: str, + cuda: bool = False, + num_outputs: int = -1, + ) -> Any: + """ + Wrap a C++ function in fast Python bindings. + + Args: + argtypes: The types of args to ENTRY_FUNCTION(), e.g.
["float*", "long"] + source_code: C++ source code containing a ENTRY_FUNCTION() function + + Returns: + A python version of ENTRY_FUNCTION() + """ + parseargs = ", ".join( + f"parse_arg<{argtype.replace('const ', '')}>(args, {n})" + for n, argtype in enumerate(argtypes) + ) + suffix = cls.suffix_template % ( + cls.entry_function, + cls.extra_parse_arg % num_outputs if cls.extra_parse_arg else "", + cls.entry_function, + len(argtypes), + len(argtypes), + cls.call_entry_function % parseargs, + cls.entry_function, + cls.entry_function, + cls.entry_function, + cls.entry_function, + ) + result = cls.load(source_code + suffix, cuda) + assert isinstance(result, ModuleType) + return getattr(result, cls.entry_function) + + +class CppWrapperCodeCache(CppPythonBindingsCodeCache): + cache: Dict[str, Union[CDLL, ModuleType]] = {} + clear = staticmethod(cache.clear) + cpp_compile_command_flags = { + "include_pytorch": True, + "shared": True, + } + entry_function = "inductor_entry_cpp" + call_entry_function = "return THPVariable_WrapList(inductor_entry_cpp(%s));" + extra_parse_arg = textwrap.dedent( + """ + #include + #include + + template <> inline std::vector parse_arg>(PyObject* args, size_t n) { + return THPVariable_UnpackList(PyTuple_GET_ITEM(args, n)); + } + + std::vector inductor_entry_cpp(std::vector&& inputs) { + auto input_handles = unsafe_alloc_new_handles_from_tensors(inputs); + // For outputs, we only allocate a vector to hold returned tensor handles, + // not allocating the actual output tensor storage here + std::vector output_handles(%s); + + try { + inductor_entry_impl(input_handles.data(), output_handles.data()); + } catch(std::exception const& e) { + PyErr_SetString(PyExc_RuntimeError, e.what()); + return {}; + } catch(...) { + PyErr_SetString(PyExc_RuntimeError, "unhandled error"); + return {}; + } + + return alloc_tensors_by_stealing_from_handles(output_handles.data(), output_handles.size()); + } + """ + ) + + +class PyCodeCache: + cache: Dict[str, ModuleType] = dict() + linemaps: Dict[str, List[Tuple[Any, ...]]] = dict() + clear = staticmethod(cache.clear) + + @classmethod + def write(cls, source_code: str, extra: str = "") -> Tuple[str, str]: + return write(source_code, "py", extra=extra) + + @classmethod + def load( + cls, + source_code: str, + extra: str = "", + linemap: Optional[List[Tuple[int, str]]] = None, + attrs: Optional[Dict[str, Any]] = None, + ) -> ModuleType: + key, path = write(source_code, "py", extra=extra) + return cls.load_by_key_path(key, path, linemap, attrs) + + @classmethod + def load_by_key_path( + cls, + key: str, + path: str, + linemap: Optional[List[Tuple[int, str]]] = None, + attrs: Optional[Dict[str, Any]] = None, + ) -> ModuleType: + if linemap is None: + linemap = [] + if key not in cls.cache: + with open(path) as f: + try: + code = compile(f.read(), path, "exec") + except Exception as e: + raise RuntimeError( + f"Failed to import {path}\n{type(e).__name__}: {e}" + ) from None + mod = ModuleType(f"{__name__}.{key}") + mod.__file__ = path + mod.key = key # type: ignore[attr-defined] + exec(code, mod.__dict__, mod.__dict__) + sys.modules[mod.__name__] = mod + # another thread might set this first + cls.cache.setdefault(key, mod) + # unzip into separate lines/nodes lists + cls.linemaps[path] = list(zip(*linemap)) + + if attrs is not None: + for k, v in attrs.items(): + setattr(mod, k, v) + + return cls.cache[key] + + @classmethod + @functools.lru_cache(None) + def stack_frames_for_code( + cls, path: str, lineno: int + ) -> Optional[List[Dict[str, Any]]]: 
+ if path not in cls.linemaps: + return None + # [(starting_line, ), ...] + lines, nodes = cls.linemaps[path] + p = bisect_right(lines, lineno) + if p == 0: + return None + entry = nodes[p - 1] + if not entry: + return None + + def parse_stack_trace(stack_trace: str) -> List[Dict[str, Any]]: + # ideally fx stores stack traces as data rather than a string + # but this is not along a performance critical path + regex = r'File "(.+)", line (\d+), in (.+)\n' + matches = re.findall(regex, stack_trace) + return [ + {"filename": f, "line": int(l), "name": n} + for f, l, n in reversed(matches) + ] + + return parse_stack_trace(entry) + + +class TritonCodeCache: + @classmethod + def load(cls, kernel_name: str, source_code: str) -> ModuleType: + mod = PyCodeCache.load(source_code) + return getattr(mod, kernel_name) + + +def _cuda_compiler() -> Optional[str]: + if cuda_env.nvcc_exist(config.cuda.cuda_cxx): + return config.cuda.cuda_cxx + if cuda_env.nvcc_exist(os.getenv("CUDACXX")): + return os.getenv("CUDACXX", "") + if cuda_env.nvcc_exist(os.getenv("CUDA_HOME")): + return os.path.join(os.getenv("CUDA_HOME", ""), "bin/nvcc") + return "nvcc" + + +def _cutlass_include_paths() -> List[str]: + cutlass_path = config.cuda.cutlass_dir + return [ + os.path.join(cutlass_path, "include"), + os.path.join(cutlass_path, "tools/library/include"), + os.path.join(cutlass_path, "tools/library/src"), + os.path.join(cutlass_path, "tools/util/include"), + ] + + +def _cuda_lib_options() -> List[str]: + from torch.utils import cpp_extension + + extra_ldflags: List[str] = [] + if is_linux(): + extra_lib_dir = "lib64" + if not os.path.exists( + cpp_extension._join_cuda_home(extra_lib_dir) + ) and os.path.exists(cpp_extension._join_cuda_home("lib")): + # 64-bit CUDA may be installed in "lib" + # Note that it's also possible both don't exist (see _find_cuda_home) - in that case we stay with "lib64" + extra_lib_dir = "lib" + extra_ldflags.append(f"-L{cpp_extension._join_cuda_home(extra_lib_dir)}") + extra_ldflags.append( + f'-L{cpp_extension._join_cuda_home(extra_lib_dir, "stubs")}' + ) + extra_ldflags.append("-lcuda") + extra_ldflags.append("-lcudart") + else: + raise NotImplementedError( + "Unsupported env, failed to find cuda libs! Currently only Linux is supported." + ) + return extra_ldflags + + +def _nvcc_host_compiler_options() -> List[str]: + return [ + "-fPIC", + "-fno-strict-aliasing", + "-fvisibility=hidden", + "-Wconversion", + ] + + +def _nvcc_compiler_options() -> List[str]: + arch = cuda_env.get_cuda_arch() + if arch == "90": + # Required by cutlass compilation. + arch = "90a" + code = [f"sm_{arch}", f"compute_{arch}"] + if config.cuda.enable_cuda_lto: + code += [f"lto_{arch}"] + options = [ + "-t=0", + "-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1", + "-w", + f"-gencode=arch=compute_{arch},code=[{','.join(code)}]", + config.cuda.compile_opt_level, + "-std=c++17", + "--expt-relaxed-constexpr", + "-DNDEBUG", + ] + if config.cuda.enable_debug_info: + options.extend(["-lineinfo", "-g", "-DCUTLASS_DEBUG_TRACE_LEVEL=1"]) + if config.cuda.enable_ptxas_info: + options.extend( + [ + "--keep", # Keep the intermediate files for debugging (including ptx, sass, cubin etc.) + "--ptxas-options=--warn-on-local-memory-usage", # warn us if local memory is used in CUDA Kernels + "--ptxas-options=--warn-on-spills", # warn us if register spilling happens in CUDA Kernels + "--resource-usage", # Report on CUDA resource usage (shared mem, registers etc.) 
+ "--source-in-ptx", + ] + ) # Annotate the ptx file with source information + if config.cuda.use_fast_math: + options.extend( + [ + "--use_fast_math", + "-DCUTLASS_USE_TANH_FOR_SIGMOID=1", + ] + ) + return options + + +def cuda_compile_command( + src_files: List[str], + dst_file: str, + dst_file_ext: str, +) -> str: + include_paths = _cutlass_include_paths() + cuda_lib_options = _cuda_lib_options() + nvcc_host_compiler_options = _nvcc_host_compiler_options() + nvcc_compiler_options = _nvcc_compiler_options() + options = ( + nvcc_compiler_options + + [ + f"-Xcompiler {opt}" if "=" in opt else f"-Xcompiler={opt}" + for opt in nvcc_host_compiler_options + ] + + ["-I" + path for path in include_paths] + + cuda_lib_options + ) + src_file = " ".join(src_files) + res = "" + if dst_file_ext == "o": + res = f"{_cuda_compiler()} {' '.join(options)} -c -o {dst_file} {src_file}" + elif dst_file_ext == "so": + options.append("-shared") + res = f"{_cuda_compiler()} {' '.join(options)} -o {dst_file} {src_file}" + else: + raise NotImplementedError(f"Unsupported output file suffix {dst_file_ext}!") + log.debug("CUDA command: %s", res) + return res + + +class DLLWrapper: + """A wrapper for a dynamic library.""" + + def __init__( + self, + lib_path: str, + ): + self.lib_path = lib_path + self.DLL = cdll.LoadLibrary(lib_path) + self.is_open = True + + def close(self): + if self.is_open: + self._dlclose() + self.is_open = False + + def _dlclose(self): + f_dlclose = None + + if is_linux(): + syms = CDLL(None) + if not hasattr(syms, "dlclose"): + # Apline Linux + syms = CDLL("libc.so") + + if hasattr(syms, "dlclose"): + f_dlclose = syms.dlclose + else: + raise NotImplementedError("Unsupported env, failed to do dlclose!") + + if f_dlclose is not None: + f_dlclose.argtypes = [c_void_p] + f_dlclose(self.DLL._handle) + else: + log.warning( + "dll unloading function was not found, library may not be unloaded properly!" + ) + + def __getattr__(self, name): + if not self.is_open: + raise RuntimeError(f"Cannot use closed DLL library: {self.lib_path}") + + method = getattr(self.DLL, name) + + def _wrapped_func(*args): + err = method(*args) + if err: + raise RuntimeError(f"Error in function: {method.__name__}") + + return _wrapped_func + + def __enter__(self): + return self + + def __exit__(self, *args): + self.close() + + def __del__(self): + self.close() + + +class CUDACodeCache: + @dataclasses.dataclass + class CacheEntry: + input_path: str + output_path: str + + cache: Dict[str, CacheEntry] = dict() + clear = staticmethod(cache.clear) + _SOURCE_CODE_SUFFIX = "cu" + + @classmethod + def write(cls, source_code, dst_file_ext) -> Tuple[str, str]: + """ + Writes source code into a file with dst_file_ext as the file extension. + Returns the hash key of source code, and the path to the file. + """ + + cuda_command = repr( + cuda_compile_command(["dummy_input"], "dummy_output", dst_file_ext) + ) + key, input_path = write( + source_code, cls._SOURCE_CODE_SUFFIX, extra=cuda_command + ) + return key, input_path + + @classmethod + def compile(cls, source_code, dst_file_ext) -> Tuple[str, str, str]: + """ + Compiles CUDA source_code into a file with dst_file_ext extension. 
+ Returns a tuple of dst_file_path, hash_key, source_code_path + """ + + key, input_path = cls.write(source_code, dst_file_ext) + if key not in cls.cache: + from filelock import FileLock + + lock_dir = get_lock_dir() + lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT) + with lock: + output_path = input_path[: -len(cls._SOURCE_CODE_SUFFIX)] + dst_file_ext + if not os.path.exists(output_path): + cmd = cuda_compile_command( + [input_path], output_path, dst_file_ext + ).split(" ") + try: + subprocess.check_output( + cmd, stderr=subprocess.STDOUT, env=os.environ + ) + except subprocess.CalledProcessError as error: + raise exc.CUDACompileError(cmd, error.output) from error + cls.cache[key] = CUDACodeCache.CacheEntry(input_path, output_path) + + return (cls.cache[key].output_path, key, input_path) + + @classmethod + def load(cls, source_code, dst_file_ext) -> Tuple[DLLWrapper, str, str]: + """ + Compiles source code and loads the generated .so file. + Returns a tuple of DLLWrapper, hash_key, source_code_path + """ + + if dst_file_ext != "so": + raise RuntimeError( + f"Only support loading a .so file for now. " + f"Requested file extension: {dst_file_ext}. Source code: {source_code}" + ) + dst_file_path, hash_key, source_code_path = cls.compile( + source_code, dst_file_ext + ) + return (DLLWrapper(dst_file_path), hash_key, source_code_path) + + +def caching_device_properties(): + for _, device_interface in get_registered_device_interfaces(): + if device_interface.is_available(): + device_interface.Worker.get_device_properties() + + +def _set_triton_ptxas_path() -> None: + if os.environ.get("TRITON_PTXAS_PATH") is not None: + return + ptxas_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "bin", "ptxas") + ) + if not os.path.exists(ptxas_path): + return + if os.path.isfile(ptxas_path) and os.access(ptxas_path, os.X_OK): + os.environ["TRITON_PTXAS_PATH"] = ptxas_path + else: + warnings.warn(f"{ptxas_path} exists but is not an executable") + + +def _worker_compile( + kernel_name: str, source_code: str, cc: int, device: torch.device +) -> None: + device_interface = get_interface_for_device(device.type) + device_interface.Worker.set_device(device.index) + kernel = TritonCodeCache.load(kernel_name, source_code) + kernel.precompile(warm_cache_only_with_cc=cc) + + +def _load_kernel(kernel_name: str, source_code: str) -> ModuleType: + _set_triton_ptxas_path() + kernel = TritonCodeCache.load(kernel_name, source_code) + kernel.precompile() + return kernel + + +class TritonFuture: + kernel: ModuleType + + def __init__( + self, + kernel_name: str, + source_code: str, + future: Future[Any], + ) -> None: + self.kernel_name = kernel_name + self.source_code = source_code + self.future = future + + # @dynamo_utils.dynamo_timed + def result(self) -> ModuleType: + t0 = time() + if hasattr(self, "kernel"): + return self.kernel + # If the worker failed this will throw an exception. + self.future.result() + kernel = self.kernel = _load_kernel(self.kernel_name, self.source_code) + latency = time() - t0 + if latency > 50: + developer_warning( + f"Detected long compilation time of {latency} seconds for kernel name {self.kernel_name}" + ) + developer_warning(self.source_code) + del self.kernel_name, self.source_code, self.future + return kernel + + +# If this process dies abnormally (e.g. segfault) +# it will not shut down the workers. Instead +# the workers will have their parent reassigned to the +# init process. 
This launches a separate thread to +# watch for the worker getting reassigned, +# and cleans it up in this case. +# +# This function cannot be an inner function since otherwise mp_context="spawn" would +# not work for ProcessPoolExecutor since inner functions cannot be pickled. +def _async_compile_initializer(orig_ppid) -> None: + def run() -> None: + while True: + sleep(1) + if orig_ppid != os.getppid(): + os.kill(os.getpid(), signal.SIGKILL) + + global _watchdog_thread + _watchdog_thread = Thread(target=run, daemon=True) + _watchdog_thread.start() + # Ignore Ctrl-C (i.e. SIGINT) sent to pool workers to avoid meaningless log spam. + signal.signal(signal.SIGINT, signal.SIG_IGN) + + +_watchdog_thread: Optional[Thread] = None + +# Used to keep track of all process pools invoked so far. +_pool_set: Set[ProcessPoolExecutor] = set() + + +def shutdown_compile_workers() -> None: + """Shut down all outstanding compile-worker pools.""" + global _pool_set + for pool in _pool_set: + pool.shutdown() + _pool_set.clear() + + +class AsyncCompile: + def __init__(self) -> None: + pass + + @staticmethod + @functools.lru_cache(1) + def pool() -> ThreadPoolExecutor: + assert config.compile_threads > 1 + return ThreadPoolExecutor(config.compile_threads) + + @staticmethod + @functools.lru_cache(1) + def process_pool() -> ProcessPoolExecutor: + # ensure properties have been calculated before processes + # are forked + caching_device_properties() + assert config.compile_threads > 1 + orig_ppid = os.getpid() + + ctx = multiprocessing.get_context(config.worker_start_method) + pool = ProcessPoolExecutor( + config.compile_threads, + mp_context=ctx, + initializer=partial(_async_compile_initializer, orig_ppid), + ) + + global _pool_set + _pool_set.add(pool) + + # when this pool is created in a subprocess object, the normal exit handler + # doesn't run, and we need to register our own handler. + # exitpriority has to be high, because another one of the finalizers will + # kill the worker thread that sends the shutdown message to the workers... + multiprocessing.util.Finalize(None, pool.shutdown, exitpriority=sys.maxsize) + return pool + + @classmethod + def warm_pool(cls) -> None: + if config.compile_threads <= 1: + return + _compile_start() + pool = cls.process_pool() + + # We have to fork processes for compiler workers, but the more memory and other resources that are loaded, the + # slower the os.fork time is, quite drastically. It also holds the GIL so we can't put it on another thread. + + # Examples: + # A simple x + x + x script: 10ms seconds in the middle of the program, 2ms at startup + # tf_efficientnet_b0 benchmark: 50ms! in the middle of the program , 3ms at startup + + # So we want to start the workers early when it is still cheap, and also to allow the workers to get + # ready before we have work for them. + + # ProcessPoolExecutor also does not launch the workers until it finds a point when all the workers are idle. + # But if we waited until then fork time will be long and we will be waiting for the processes to initialize. + + # We force them to start here with some YOLOing of the internal methods. 
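The same warm-start idea can be shown with a plain `ProcessPoolExecutor`, independent of Inductor's pool: submitting (and waiting on) trivial tasks right after construction makes the worker processes spawn while the parent is still small, so later real work does not pay the inflated fork cost. This is only an illustrative sketch using the public `concurrent.futures` API; the method continues below by poking the executor's private startup hooks directly.

```python
# Standalone sketch of warm-starting a process pool early; this uses only the
# public concurrent.futures API, unlike the private-method approach below.
from concurrent.futures import ProcessPoolExecutor


def _noop() -> None:
    return None


def warm_started_pool(num_workers: int = 4) -> ProcessPoolExecutor:
    pool = ProcessPoolExecutor(num_workers)
    # Submitting and waiting on trivial tasks makes the executor spawn its
    # worker processes now, while forking is still cheap.
    for fut in [pool.submit(_noop) for _ in range(num_workers)]:
        fut.result()
    return pool


if __name__ == "__main__":
    pool = warm_started_pool(2)
    print(pool.submit(sum, [1, 2, 3]).result())  # workers are already running
    pool.shutdown()
```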
+ if hasattr(pool, "_start_queue_management_thread"): + pool._start_queue_management_thread() + else: + for _ in range(config.compile_threads): + pool._adjust_process_count() + if hasattr(pool, "_start_executor_manager_thread"): + pool._start_executor_manager_thread() + _compile_end() + + @classmethod + def submit(cls, task: Callable[..., Any]) -> Any: + if config.compile_threads <= 1: + return task() + return cls.pool().submit(task) + + @classmethod + def map(cls, fn: Callable[..., Any], seq: List[Any]) -> List[Any]: + if config.compile_threads <= 1 or len(seq) <= 1: + return list(map(fn, seq)) + return [t.result() for t in [cls.pool().submit(fn, x) for x in seq]] + + def triton( + self, kernel_name: str, source_code: str, device_str: str = "cuda" + ) -> Union[TritonFuture, ModuleType]: + _compile_start() + + if config.compile_threads > 1: + device_interface = get_interface_for_device(device_str) + device = torch.device(device_str, device_interface.current_device()) + cc = device_interface.get_compute_capability(device) + future = self.process_pool().submit( + _worker_compile, kernel_name, source_code, cc, device + ) + return TritonFuture(kernel_name, source_code, future) + else: + return _load_kernel(kernel_name, source_code) + + def multi_kernel(self, *args, **kwargs) -> ModuleType: + """ + Async compile the python shim for multi-kernel. + """ + + def task(): + from torch._inductor.codegen.multi_kernel import MultiKernelCall + + return MultiKernelCall(*args, **kwargs) + + return self.submit(task) + + def cpp(self, source_code: str) -> ModuleType: + def task(): + return CppCodeCache.load(source_code).kernel + + return self.submit(task) + + def cpp_pybinding(self, argtypes: List[str], source_code: str) -> ModuleType: + return self.submit( + functools.partial( + CppPythonBindingsCodeCache.load_pybinding, argtypes, source_code + ) + ) + + def cuda(self, source_code, dst_file_ext): + def task(): + return CUDACodeCache.load(source_code, dst_file_ext)[0] + + return self.submit(task) + + def wait(self, scope: Dict[str, Any]) -> None: + num_kernels = len( + [ + value + for key, value in scope.items() + if isinstance(value, (Future, TritonFuture)) + ] + ) + pbar = tqdm( + total=num_kernels, + desc="Inductor Compilation", + disable=config.disable_progress, + delay=0, + ) + if config.compile_threads > 1: + for key, result in scope.items(): + if config.verbose_progress and not isinstance(pbar, _Faketqdm): + pbar.set_postfix_str(key) + if isinstance(result, (Future, TritonFuture)): + scope[key] = result.result() + pbar.update(1) + + _compile_end() + + +if os.environ.get("TORCH_TNT_IN_USE", "0") == "1": + # When TorchTNT is used, calling warm_pool() here will cause the + # compile workers created not being able to be shut down inside + # shutdown_compile_workers(). This may cause significant QPS drop. 
+ log.info("Do not call AsyncCompile.warm_pool() because TorchTNT is in use.") +else: + AsyncCompile.warm_pool() diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/__init__.py b/MLPY/Lib/site-packages/torch/_inductor/codegen/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f28ee719d08e4899440b3c58de77e45fad9741ae Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/common.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/common.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..157be12cca2e1adfe810231c509aebd50d0ccae3 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/common.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/cpp.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/cpp.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e1d507d9746e255ff60d7cc3befa56709975b4e6 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/cpp.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/cpp_wrapper_cpu.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/cpp_wrapper_cpu.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..81dfe7c70d342ca6064ef9224c83f1c488e85bed Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/cpp_wrapper_cpu.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/cpp_wrapper_cuda.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/cpp_wrapper_cuda.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..776ebba7288dc168e98adbb78fdc49e0af1fe1b4 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/cpp_wrapper_cuda.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/cuda_combined_scheduling.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/cuda_combined_scheduling.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..be6c870c52e1a0237b3123c49fa937e808eb2c83 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/cuda_combined_scheduling.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/memory_planning.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/memory_planning.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b6bee7f4a75a6c50ffa74c1850ae7de9e8f3657d Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/memory_planning.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/multi_kernel.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/multi_kernel.cpython-39.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..fa308694c34e7785120e8f2a80f33d8af2721130 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/multi_kernel.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/triton.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/triton.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..27237f95eaa88518ac8f9f1e0b77e7ebfab0be8f Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/triton.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/triton_foreach.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/triton_foreach.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45caa2e6f8b232c041c9ed35d47011aed4befd6e Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/triton_foreach.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/triton_split_scan.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/triton_split_scan.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..25da24eed95159e51ba2bb1ae14df8326763d743 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/triton_split_scan.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/triton_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/triton_utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5f8cf841a22d6afa04bac20220b3344c0f691c92 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/triton_utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/wrapper.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/wrapper.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a646ae6fc021badf00a4dd1cb43cbf29f3daef8a Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/codegen/__pycache__/wrapper.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/aoti_runtime/implementation.cpp b/MLPY/Lib/site-packages/torch/_inductor/codegen/aoti_runtime/implementation.cpp new file mode 100644 index 0000000000000000000000000000000000000000..09ba9a2733120fd3fc5680cbfac412e3d314ff4e --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/codegen/aoti_runtime/implementation.cpp @@ -0,0 +1,87 @@ +// NOTE: Like interface.cpp, this file will be copied into AOTInductor +// generated output. This file is intended to keep implementation +// details separate from the implementation of the AOTI public +// interface. Note also that #includes should go into interface.cpp +// for simplicity of maintenance. 
+ +namespace torch { +namespace aot_inductor { +template +void convert_output_to_handle( + const ArrayRefTensor& output, + AtenTensorHandle& handle) { + handle = output.expensiveCopyToTensor(); +} + +template +void convert_outputs_to_handles_helper( + const std::tuple...>& outputs, + AtenTensorHandle* output_handles, + std::index_sequence) { + (convert_output_to_handle(std::get(outputs), output_handles[Is]), ...); +} +template +void convert_outputs_to_handles( + const std::tuple...>& outputs, + AtenTensorHandle* output_handles) { + convert_outputs_to_handles_helper( + outputs, output_handles, std::make_index_sequence()); +} + +template +void convert_handle_to_arrayref_tensor( + AtenTensorHandle handle, + ArrayRefTensor& input) { + void* data_ptr; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_data_ptr(handle, &data_ptr)); + int64_t dim; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_dim(handle, &dim)); + int64_t numel; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_numel(handle, &numel)); + int64_t* sizes; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_sizes(handle, &sizes)); + int64_t* strides; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_strides(handle, &strides)); + int32_t dtype; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_dtype(handle, &dtype)); + int32_t device_type; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_device_type(handle, &device_type)); + int32_t device_index; + AOTI_TORCH_ERROR_CODE_CHECK( + aoti_torch_get_device_index(handle, &device_index)); + + input = ArrayRefTensor( + MiniArrayRef(reinterpret_cast(data_ptr), numel), + MiniArrayRef(sizes, dim), + MiniArrayRef(strides, dim), + device_type, + device_index); +} + +template +void convert_handles_to_inputs_helper( + AtenTensorHandle* input_handles, + std::tuple...>& inputs, + std::index_sequence) { + (convert_handle_to_arrayref_tensor(input_handles[Is], std::get(inputs)), + ...); +} + +template +void convert_handles_to_inputs( + AtenTensorHandle* input_handles, + std::tuple...>& inputs) { + convert_handles_to_inputs_helper( + input_handles, inputs, std::make_index_sequence()); +} + +template +void assert_numel(const ArrayRefTensor& tensor, int64_t numel) { + if (tensor.numel() != numel) { + std::stringstream err; + err << "incorrect numel for input tensor. expected " << numel << ", got " << tensor.numel(); + throw std::runtime_error(err.str()); + } +} +} // namespace aot_inductor +} // namespace torch diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/aoti_runtime/interface.cpp b/MLPY/Lib/site-packages/torch/_inductor/codegen/aoti_runtime/interface.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d88881f3afcaf758442b202224f88bae6d47afe8 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/codegen/aoti_runtime/interface.cpp @@ -0,0 +1,354 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define CONVERT_EXCEPTION_TO_ERROR_CODE(...) \ + try { \ + __VA_ARGS__ \ + } catch (const std::exception& e) { \ + std::cerr << "Error: " << e.what() << std::endl; \ + return AOTI_RUNTIME_FAILURE; \ + } catch (...) { \ + std::cerr << "Unknown exception occurred." 
<< std::endl; \ + return AOTI_RUNTIME_FAILURE; \ + } \ + return AOTI_RUNTIME_SUCCESS; + +#define AOTI_VECTOR_SIZE_CHECK(actual_size, expected_size, name) \ + do { \ + AOTI_RUNTIME_CHECK( \ + actual_size == expected_size, \ + "expected " + std::string(name) + " vector size to be " + \ + std::to_string(expected_size) + ", but got " + \ + std::to_string(actual_size)); \ + } while (0) + +// AOTInductor uses at::addmm_out, which doesn't supports +// arguments that requires gradient. For this reason, we +// enforce no_grad context for run APIs. +// +// A RAII, thread local (!) guard that enables or disables grad mode upon +// construction, and sets it back to the original value upon destruction. +struct AOTINoGradGuard { + AOTINoGradGuard() : prev_mode(aoti_torch_grad_mode_is_enabled()) { + aoti_torch_grad_mode_set_enabled(false); + } + ~AOTINoGradGuard() { + aoti_torch_grad_mode_set_enabled(prev_mode); + } + bool prev_mode; +}; + +extern "C" { + +AOTIRuntimeError AOTInductorModelContainerCreate( + AOTInductorModelContainerHandle* container_handle, + size_t num_models, + bool is_cpu, + const char* cubin_dir) { + return AOTInductorModelContainerCreateWithDevice( + container_handle, + num_models, + is_cpu ? "cpu" : "cuda", + cubin_dir); +} + +AOTIRuntimeError AOTInductorModelContainerCreateWithDevice( + AOTInductorModelContainerHandle* container_handle, + size_t num_models, + const char* device_str, + const char* cubin_dir) { + if (num_models == 0) { + std::cerr << "Error: num_models must be positive, but got 0" << std::endl; + return AOTI_RUNTIME_FAILURE; + } + CONVERT_EXCEPTION_TO_ERROR_CODE({ + std::optional cubin_dir_opt; + if (cubin_dir != nullptr) { + cubin_dir_opt.emplace(cubin_dir); + } + auto* container = new torch::aot_inductor::AOTInductorModelContainer( + num_models, std::string(device_str), cubin_dir_opt); + *container_handle = + reinterpret_cast(container); + }) +} + +AOTIRuntimeError AOTInductorModelContainerDelete( + AOTInductorModelContainerHandle container_handle) { + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto* container = + reinterpret_cast( + container_handle); + delete container; + }); +} + +AOTIRuntimeError AOTInductorModelContainerRun( + AOTInductorModelContainerHandle container_handle, + AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles + // are stolen; the array itself is borrowed + size_t num_inputs, + AtenTensorHandle* + output_handles, // array for writing output AtenTensorHandle; handles + // will be stolen by the caller; the array itself is + // borrowed + size_t num_outputs, + AOTInductorStreamHandle stream_handle, + AOTIProxyExecutorHandle proxy_executor_handle) { + auto* container = + reinterpret_cast( + container_handle); + AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); + AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); + + auto stream = + reinterpret_cast(stream_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + AOTINoGradGuard guard; + container->run( + input_handles, output_handles, stream, proxy_executor_handle); + }) +} + +AOTIRuntimeError AOTInductorModelContainerGetNumConstants( + AOTInductorModelContainerHandle container_handle, + size_t* num_constants) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *num_constants = container->num_constants(); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantName( + AOTInductorModelContainerHandle container_handle, + size_t idx, + const char** name) { + auto* container = + 
reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *name = container->constant_name(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantOriginalFQN( + AOTInductorModelContainerHandle container_handle, + size_t idx, + const char** original_fqn) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *original_fqn = container->constant_original_fqn(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantFromFolded( + AOTInductorModelContainerHandle container_handle, + size_t idx, + bool* from_folded) { + auto* container = + reinterpret_cast(container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ *from_folded = container->constant_from_folded(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantDtype( + AOTInductorModelContainerHandle container_handle, + size_t idx, + int32_t* dtype) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *dtype = container->constant_dtype(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerUpdateConstantBuffer( + AOTInductorModelContainerHandle container_handle, + AOTInductorConstantMapHandle constant_map_handle, + bool use_inactive, + bool validate_full_update) { + auto* container = + reinterpret_cast( + container_handle); + auto input_map = reinterpret_cast*>(constant_map_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + container->update_constant_buffer( + *input_map, use_inactive, validate_full_update); + }) +} + +AOTIRuntimeError AOTInductorModelContainerUpdateInactiveConstantBuffer( + AOTInductorModelContainerHandle container_handle, + AOTInductorConstantMapHandle constant_map_handle) { + return AOTInductorModelContainerUpdateConstantBuffer(container_handle, + constant_map_handle, + /*use_inactive*/ true, + /*validate_full_update*/ true); +} + +AOTIRuntimeError AOTInductorModelContainerRunConstantFolding( + AOTInductorModelContainerHandle container_handle, + bool use_inactive, + AOTInductorStreamHandle stream_handle, + AOTIProxyExecutorHandle proxy_executor_handle) { + auto* container = + reinterpret_cast( + container_handle); + auto stream = + reinterpret_cast(stream_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + AOTINoGradGuard guard; + container->run_const_fold(use_inactive, stream, proxy_executor_handle); + }) +} + +AOTIRuntimeError AOTInductorModelContainerSwapConstantBuffer( + AOTInductorModelContainerHandle container_handle) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + container->swap_constant_buffer(); + }) +} + +AOTIRuntimeError AOTInductorModelContainerGetNumInputs( + AOTInductorModelContainerHandle container_handle, + size_t* ret_num_inputs) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *ret_num_inputs = container->num_inputs(); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetInputName( + AOTInductorModelContainerHandle container_handle, + size_t input_idx, + const char** ret_input_names) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *ret_input_names = container->input_name(input_idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetNumOutputs( + AOTInductorModelContainerHandle container_handle, + size_t* ret_num_outputs) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *ret_num_outputs = container->num_outputs(); }) +} + +AOTIRuntimeError 
AOTInductorModelContainerGetOutputName( + AOTInductorModelContainerHandle container_handle, + size_t output_idx, + const char** ret_output_names) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *ret_output_names = container->output_name(output_idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetCallSpec( + AOTInductorModelContainerHandle container_handle, + const char** in_spec, + const char** out_spec) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + *in_spec = container->get_in_spec(); + *out_spec = container->get_out_spec(); + }) +} + +AOTIRuntimeError AOTInductorModelCreate( + AOTInductorModelHandle* model_handle, + AOTInductorConstantMapHandle constant_map_handle){ + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto constant_map = std::make_shared(); + auto constant_array = std::make_shared>(); + auto input_map = reinterpret_cast*>(constant_map_handle); + + auto model = new torch::aot_inductor::AOTInductorModel( + constant_map, + constant_array, + "cpu", // device_str is hardcoded, as AOTInductorModelCreate is only use for CPU models + "" + ); + + if (input_map) { + for (auto const& kv : *input_map) { + constant_map->emplace(kv.first, kv.second); + } + } else { + model->load_constants(); + } + + *model_handle = reinterpret_cast(model); + })} + +AOTIRuntimeError AOTInductorModelRun( + AOTInductorModelHandle model_handle, + AtenTensorHandle* input_handles, + AtenTensorHandle* output_handles) { + auto model = + reinterpret_cast(model_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + AOTINoGradGuard guard; + model->run_impl( + input_handles, + output_handles, + (torch::aot_inductor::DeviceStreamType) nullptr, + nullptr); + }) +} + +AOTIRuntimeError AOTInductorModelDelete(AOTInductorModelHandle model_handle){ + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto model = reinterpret_cast( + model_handle); + delete model; + })} + +AOTIRuntimeError AOTInductorModelGetNumOutputs( + AOTInductorModelHandle model_handle, + size_t* ret_num_outputs) { + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto model = reinterpret_cast(model_handle); + *ret_num_outputs = model->num_outputs(); + }) +} + +AOTIRuntimeError AOTInductorModelUpdateConstantsMap( + AOTInductorModelHandle model_handle, + AOTInductorConstantMapHandle constant_map_handle) { + auto model = + reinterpret_cast(model_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto constant_map = std::make_shared(); + auto input_map = + reinterpret_cast*>( + constant_map_handle); + + for (auto const& kv : *input_map) { + constant_map->emplace(kv.first, kv.second); + } + model->update_constants_map(std::move(constant_map)); + }) +} + +} // extern "C" diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/common.py b/MLPY/Lib/site-packages/torch/_inductor/codegen/common.py new file mode 100644 index 0000000000000000000000000000000000000000..843bbd64fc74a7efeea3c8cb0f0adb80049b6e27 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/codegen/common.py @@ -0,0 +1,1755 @@ +import contextlib +import dataclasses +import functools +import itertools +import logging +import operator +import re +from itertools import chain +from typing import ( + Any, + Callable, + ClassVar, + Dict, + List, + NamedTuple, + Optional, + Set, + Tuple, + TYPE_CHECKING, + Union, +) + +import sympy +from sympy.printing.printer import Printer + +import torch +import torch.fx +from torch._prims_common import ELEMENTWISE_TYPE_PROMOTION_KIND +from torch.utils import _pytree as pytree +from 
torch.utils._sympy.value_ranges import ValueRanges + +from .. import config, metrics +from ..utils import ( + DeferredLineBase, + do_bench, + free_symbol_startswith, + IndentedBuffer, + sympy_dot, + sympy_index_symbol, + sympy_subs, + unique, +) +from ..virtualized import ops, OpsHandler, OpsValue, ReductionType, StoreMode, V + +if TYPE_CHECKING: + from ..ir import TensorBox + +schedule_log = torch._logging.getArtifactLogger(__name__, "schedule") + + +def data_type_logger(msg): + if schedule_log.isEnabledFor(logging.DEBUG): + schedule_log.debug("Data type propagation: %s", msg) + + +@dataclasses.dataclass +class WorkspaceArg: + """A temporary buffer used for a single kernel, then discarded. + + Not registered as a traditional buffer since there are no users, + so it would be dead code eliminated. + """ + + nbytes: sympy.Expr + zero_fill: bool + + +@dataclasses.dataclass +class TensorArg: + name: str + buffer: str + dtype: torch.dtype + offset: sympy.Expr = sympy.Integer(0) + + +@dataclasses.dataclass +class SizeArg: + name: str + expr: sympy.Expr + + +@dataclasses.dataclass +class DeviceCodegen: + scheduling: type + wrapper_codegen: type + + +KernelArgType = Union[WorkspaceArg, TensorArg, SizeArg] + +device_codegens: Dict[str, DeviceCodegen] = {} + + +class DeviceOpOverrides: + def import_get_raw_stream_as(self, name): + raise NotImplementedError() + + def set_device(self, device_idx): + raise NotImplementedError() + + def synchronize(self): + raise NotImplementedError() + + def device_guard(self, device_idx): + raise NotImplementedError() + + +device_op_overrides_dict: Dict[str, DeviceOpOverrides] = {} + + +# The code generated by Inductor consists of two main parts: kernel code and wrapper code. +# For any new backend looking to integrate with Inductor, customization of these two main +# parts are necessary to generate its specific code. +# +# Kernel code generation is determined by different Scheduling. Consequently, a new +# backend needs to provide a custom Scheduling for its unique kernel code generation. Currently, +# CppScheduling and TritonScheduling serve the C++/OpenMP and Triton backends, respectively. +# +# For the Wrapper, Inductor provides a WrapperCodeGen class to generate the Python wrapper code +# that bridges kernels. This allows out-of-tree backends to inherit from WrapperCodeGen, +# and override specific member functions to create backend-specific Python wrapper code. +# +# Other classes, such as CppKernel and TritonKernel, used for code generation, typically form part +# of the logic for either Scheduling or WrapperCodeGen. So the Scheduling and WrapperCodeGen interfaces +# provide flexibility to the backend. A backend can choose to implement these classes from scratch, +# or reuse them by extending and overriding as necessary. And Inductor provides the registration API, +# register_backend_for_device, to equip a new backend at runtime. +# +# Intel has developed a new backend on top of Triton to support Intel GPUs, leveraging these interfaces. 
+# This backend can be used as a reference: +# https://github.com/intel/intel-extension-for-pytorch/blob/5dcc9d57e5422cf295e1a1ee97896d6b6a554a85/intel_extension_for_pytorch/_inductor/__init__.py#L9 +def register_backend_for_device( + device: str, device_scheduling: type, device_wrapper_codegen: type +): + device_codegens[device] = DeviceCodegen(device_scheduling, device_wrapper_codegen) + + +def get_scheduling_for_device(device: str): + return device_codegens[device].scheduling if device in device_codegens else None + + +def get_wrapper_codegen_for_device(device: str): + return ( + device_codegens[device].wrapper_codegen if device in device_codegens else None + ) + + +def index_prevent_reordering(index: List[sympy.Expr], index_vars, sizes): + from ..ir import FlexibleLayout + + # added contiguous index prevents reordering + return [*index, sympy_dot(index_vars, FlexibleLayout.contiguous_strides(sizes))] + + +def register_device_op_overrides(device: str, device_op_overrides: DeviceOpOverrides): + device_op_overrides_dict[device] = device_op_overrides + + +def get_device_op_overrides(device: str): + assert isinstance(device, str) + + if not device_op_overrides_dict.keys(): + from .cuda import device_op_overrides # noqa: F401 + + if device in device_op_overrides_dict.keys(): + return device_op_overrides_dict[device] + + return DeviceOpOverrides() + + +@functools.lru_cache(None) +def boolean_ops(): + return ( + "is_inf", + "is_nan", + "bitwise_xor", + "logical_not", + "signbit", + "le", + "lt", + "ge", + "gt", + "eq", + "ne", + ) + + +DTYPE_TO_COMPUTATION_DTYPE = { + torch.bfloat16: torch.float, + torch.float16: torch.float, + **{ + dtype: dtype + for dtype in [ + torch.bool, + torch.float32, + torch.float64, + torch.int8, + torch.int16, + torch.int32, + torch.int64, + torch.uint8, + torch.uint16, + torch.uint32, + torch.uint64, + ] + }, +} + + +class DataTypePropagation: + def __init__(self, body) -> None: + self.body = body + self.graphs: Dict[Union[Callable[..., Any], str], Any] = { + "root": body.root_block.graph + } + for k, v in body.subblocks.items(): + self.graphs[k] = v.graph + + def deduce_node_dtype_by_inputs(self, node: torch.fx.Node): + inputs = node.all_input_nodes + input_nodes = [ + n for n in inputs if isinstance(n, torch.fx.Node) and n.op != "placeholder" + ] + if len(input_nodes) == 0: + return None + + all_input_nodes_propogated = all( + OptimizationContext.key in n.meta + and n.meta[OptimizationContext.key].dtype is not None + for n in input_nodes + ) + if not all_input_nodes_propogated: + return None + + return functools.reduce( + torch.promote_types, + [n.meta[OptimizationContext.key].dtype for n in input_nodes], + ) + + def deduce_node_dtype_by_subgraph(self, node: torch.fx.Node): + sub_graph = self.graphs[node.target] + dtype = self.propagate_graph(sub_graph) + assert dtype + return dtype + + def deduce_node_dtype(self, node: torch.fx.Node): + if node.target in boolean_ops(): + return torch.bool + + if node.op == "placeholder": + return None + + if node.target == "output": + # we can infer output node if it only have 1 arg + if len(node.args) != 1: + return None + + if node.target in ( + "to_dtype", + "index_expr", + ): + return node.args[-1] + + if node.target in ( + "rand", + "randn", + ): + return torch.float + + if node.target in ( + "get_index", + "index_expr", + ): + return torch.int64 + + if node.target in ( + "load", + "store", + "store_reduction", + ): + buf_name = node.args[1] + return V.graph.get_dtype(buf_name) # type: ignore[arg-type] + + if node.target 
== operator.getitem: + return self.deduce_node_dtype(node.args[0]) # type: ignore[arg-type] + + assert isinstance(node.target, str) + + if node.target == "reduction": + return node.args[1] + + if node.target == "constant": + return DTYPE_TO_COMPUTATION_DTYPE[node.args[-1]] # type: ignore[index] + + if node.target.startswith("masked_subblock"): + return self.deduce_node_dtype_by_subgraph(node) + + return self.deduce_node_dtype_by_inputs(node) + + def propagate_graph(self, graph: torch.fx.Graph): + assert graph.nodes + graph_dtype = None + # For masked_subblock, we use output's dtype to represent + # the dtype of this subgraph. For other cases, graph_dtype + # might be None + for node in graph.nodes: + if OptimizationContext.key in node.meta: + opt_ctx = node.meta[OptimizationContext.key] + else: + opt_ctx = OptimizationContext() + + opt_ctx.dtype = self.deduce_node_dtype(node) + node.meta[OptimizationContext.key] = opt_ctx + if node.target == "output": + graph_dtype = opt_ctx.dtype + return graph_dtype + + def propagate(self): + self.propagate_graph(self.graphs["root"]) + + @classmethod + def propagate_loopbody(cls, body): + return cls(body).propagate() + + @classmethod + def propagate_scheduler_node(cls, node): + from ..ir import LoopBody + from ..scheduler import SchedulerNode + + assert isinstance(node, SchedulerNode) + assert isinstance(node._body, LoopBody) + DataTypePropagation.propagate_loopbody(node._body) + + +class ExprPrinter(Printer): + @staticmethod + def paren(string): + def all_in_parens(string): + if string[0] != "(" or len(string) < 2: + return False + count = 1 + for i, char in enumerate(string[1:]): + if char == "(": + count += 1 + elif char == ")": + count -= 1 + if count == 0 and i != len(string) - 2: + return False + assert count == 0 + return True + + if ( + isinstance(string, CSEVariable) + or re.match(r"^[a-z0-9_.]+$", string, re.I) + or re.match(r"^\([^)]*\)$", string, re.I) + or string == "" + ): + return string + # don't put extra parens for strings that are already wrapped in parens + if all_in_parens(string): + return string + return f"({string})" + + def _print_Infinity(self, expr): + return "math.inf" + + def _print_NegativeInfinity(self, expr): + return "-math.inf" + + def _print_Relational(self, expr): + return f" {expr.rel_op} ".join(map(self.paren, map(self._print, expr.args))) + + def _print_Mul(self, expr): + return "*".join(map(self.paren, map(self._print, expr.args))) + + def _print_Add(self, expr): + return " + ".join(map(self.paren, map(self._print, expr.args))) + + def _print_Mod(self, expr): + return " % ".join(map(self.paren, map(self._print, expr.args))) + + def _print_FloorDiv(self, expr): + raise NotImplementedError(f"_print_FloorDiv not implemented for {type(self)}") + + def _print_CleanDiv(self, expr): + return self._print_FloorDiv(expr) + + def _print_GreaterThan(self, expr): + # GreaterThan: >= + # StrictlyGreaterThan: > + # Go figure... 
+ return " >= ".join(map(self.paren, map(self._print, expr.args))) + + def _print_align(self, expr): + assert len(expr.args) == 1 + return f"align({self._print(expr.args[0])})" + + +class PythonPrinter(ExprPrinter): + def _print_ModularIndexing(self, expr): + x, div, mod = expr.args + x = self.paren(self.doprint(x)) + div = self.paren(self.doprint(div)) + mod = self.paren(self.doprint(mod)) + if div != "1": + x = f"({x} // {div})" + return f"{x} % {mod}" + + def _print_FloorDiv(self, expr): + x, div = expr.args + x = self.paren(self.doprint(x)) + div = self.paren(self.doprint(div)) + return f"({x} // {div})" + + def _helper_sqrt(self, expr): + return f"math.sqrt({self._print(expr)})" + + def _print_Pow(self, expr): + # Pow() confuses triton + base, exp = expr.args + # NB: Remember this is sizevar computation! You don't typically + # expect to have to do floating point computation including exponents + # in sizevar compute. Instead of adding support for floating + # point pow, you should make upstream retranslate the Sympy expression + # into Tensor expressions earlier and do that instead. + if exp == 0.5: + return self._helper_sqrt(base) + elif exp == -0.5: + return "1/" + self._helper_sqrt(base) + base = self._print(base) + assert exp == int(exp), exp + exp = int(exp) + if exp > 0: + return "*".join([self.paren(base)] * exp) + elif exp < 0: + return "1/" + self.paren("*".join([self.paren(base)] * abs(exp))) + else: # exp == 0 + return "1" + + def _print_floor(self, expr): + assert len(expr.args) == 1 + return f"math.floor({self._print(expr.args[0])})" + + def _print_ceiling(self, expr): + assert len(expr.args) == 1 + return f"math.ceil({self._print(expr.args[0])})" + + def _print_Abs(self, expr): + assert len(expr.args) == 1 + return f"abs({self._print(expr.args[0])})" + + def _print_Max(self, expr): + assert len(expr.args) >= 2 + return f"max({', '.join(map(self._print, expr.args))})" + + def _print_Min(self, expr): + assert len(expr.args) >= 2 + return f"min({', '.join(map(self._print, expr.args))})" + + def _print_cos(self, expr): + assert len(expr.args) == 1 + return f"math.cos({self._print(expr.args[0])})" + + def _print_cosh(self, expr): + assert len(expr.args) == 1 + return f"math.cosh({self._print(expr.args[0])})" + + def _print_acos(self, expr): + assert len(expr.args) == 1 + return f"math.acos({self._print(expr.args[0])})" + + def _print_sin(self, expr): + assert len(expr.args) == 1 + return f"math.sin({self._print(expr.args[0])})" + + def _print_sinh(self, expr): + assert len(expr.args) == 1 + return f"math.sinh({self._print(expr.args[0])})" + + def _print_asin(self, expr): + assert len(expr.args) == 1 + return f"math.asin({self._print(expr.args[0])})" + + def _print_tan(self, expr): + assert len(expr.args) == 1 + return f"math.tan({self._print(expr.args[0])})" + + def _print_tanh(self, expr): + assert len(expr.args) == 1 + return f"math.tanh({self._print(expr.args[0])})" + + def _print_atan(self, expr): + assert len(expr.args) == 1 + return f"math.atan({self._print(expr.args[0])})" + + def _print_Round(self, expr): + assert len(expr.args) == 1 + return f"round({self._print(expr.args[0])})" + + def _print_RoundDecimal(self, expr): + assert len(expr.args) == 2 + number, ndigits = expr.args + assert isinstance(ndigits, sympy.Integer) + return f"round({self._print(number)}, {ndigits})" + + +class OpOverrides: + def __init__(self, parent): + super().__init__() + self._parent = parent + + def __getattr__(self, item): + return getattr(self._parent, item) + + @staticmethod + def 
identity(value): + # used to trigger cse + return value + + @staticmethod + def constant(value, dtype): + return repr(value) + + @staticmethod + def reciprocal(x): + return ops.truediv("1", x) + + @staticmethod + def square(x): + return ops.mul(x, x) + + @staticmethod + def bitwise_not(x): + return f"~{ExprPrinter.paren(x)}" + + @staticmethod + def logical_not(a): + return f"{ExprPrinter.paren(a)} == 0" + + @staticmethod + def bitwise_and(x, y): + return f"{ExprPrinter.paren(x)} & {ExprPrinter.paren(y)}" + + @staticmethod + def bitwise_or(x, y): + return f"{ExprPrinter.paren(x)} | {ExprPrinter.paren(y)}" + + @staticmethod + def bitwise_xor(x, y): + return f"{ExprPrinter.paren(x)} ^ {ExprPrinter.paren(y)}" + + @staticmethod + def bitwise_left_shift(x, y): + return f"{ExprPrinter.paren(x)} << {ExprPrinter.paren(y)}" + + @staticmethod + def bitwise_right_shift(x, y): + return f"{ExprPrinter.paren(x)} >> {ExprPrinter.paren(y)}" + + @staticmethod + def remainder(a, b): + r = ops.mod(a, b) + return ops.where(f"(({r} != 0) & (({r} < 0) != ({b} < 0)))", ops.add(r, b), r) + + @staticmethod + def load_seed(name, offset): + return ops.load(name, sympy.Integer(offset)) + + @classmethod + def _initialize_pointwise_overrides(cls, target): + assert target in {"triton", "cpp", "cppvec"}, target + + def pointwise_factory_1(impl): + def func(x): + return impl.format(x=x) + + return func + + def pointwise_factory_2(impl): + def func(x, y): + return impl.format(x=x, y=y) + + return func + + for funcname, data in pointwise_overrides_data.items(): + impl = getattr(data, target) + if isinstance(impl, str): + nof_args = 2 if "{y}" in impl else 1 + # extend the following dictionary with factory + # functions for a specific number of arguments as + # needed: + factory = {1: pointwise_factory_1, 2: pointwise_factory_2}[nof_args] + setattr(cls, funcname, staticmethod(factory(impl))) + + +@dataclasses.dataclass +class OverridesData: + name: str + cpp: str + triton: Optional[str] = None # None when not impl in libdevice/triton + cppvec: Optional[str] = None # None when not impl in aten/.../vec + type_promotion_kind: ELEMENTWISE_TYPE_PROMOTION_KIND = ( + ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT + ) + + +pointwise_overrides_data: Dict[str, OverridesData] = dict( + airy_ai=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="airy_ai_forward({x})", + name="special_airy_ai", + ), + bessel_j0=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="bessel_j0_forward({x})", + triton="libdevice.j0({x})", + name="special_bessel_j0", + ), + bessel_j1=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="bessel_j1_forward({x})", + triton="libdevice.j1({x})", + name="special_bessel_j1", + ), + bessel_y0=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="bessel_y0_forward({x})", + triton="libdevice.y0({x})", + name="special_bessel_y0", + ), + bessel_y1=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="bessel_y1_forward({x})", + triton="libdevice.y1({x})", + name="special_bessel_y1", + ), + digamma=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="calc_digamma({x})", + cppvec="{x}.digamma()", + name="digamma", + ), + # no cpp nor triton implementation for entr, it is defined as decomposition + # erf, erfc + erfcx=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + 
cpp="calc_erfcx({x})", + triton="libdevice.erfcx({x})", + name="special_erfcx", + ), + # erfinv, exp2, expit, gammaln + igamma=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="calc_igamma({x}, {y})", + name="igamma", + ), + igammac=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="calc_igammac({x}, {y})", + name="igammac", + ), + gammainc=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="calc_igamma({x}, {y})", + name="special_gammainc", + ), + gammaincc=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="calc_igammac({x}, {y})", + name="special_gammaincc", + ), + i0=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="calc_i0({x})", + triton="libdevice.cyl_bessel_i0({x})", + cppvec="{x}.i0()", + name="i0", + ), + i0e=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="calc_i0e({x})", + cppvec="{x}.i0e()", + name="special_i0e", + ), + i1=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="calc_i1({x})", + triton="libdevice.cyl_bessel_i1({x})", + name="special_i1", + ), + i1e=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="calc_i1e({x})", + name="special_i1e", + ), + log_ndtr=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="calc_log_ndtr({x})", + name="special_log_ndtr", + ), + # logit + modified_bessel_i0=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="modified_bessel_i0_forward({x})", + triton="libdevice.cyl_bessel_i0({x})", + name="special_modified_bessel_i0", + ), + modified_bessel_i1=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="modified_bessel_i1_forward({x})", + triton="libdevice.cyl_bessel_i1({x})", + name="special_modified_bessel_i1", + ), + modified_bessel_k0=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="modified_bessel_k0_forward({x})", + name="special_modified_bessel_k0", + ), + modified_bessel_k1=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="modified_bessel_k1_forward({x})", + name="special_modified_bessel_k1", + ), + # multigamma + ndtr=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="calc_ndtr({x})", + name="special_ndtr", + ), + ndtri=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="calc_ndtri({x})", + name="special_ndtri", + ), + polygamma=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="calc_polygamma({y}, {x})", + name="polygamma", + ), + # psi - alias to digamma + # round + scaled_modified_bessel_k0=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="scaled_modified_bessel_k0_forward({x})", + name="special_scaled_modified_bessel_k0", + ), + scaled_modified_bessel_k1=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="scaled_modified_bessel_k1_forward({x})", + name="special_scaled_modified_bessel_k1", + ), + # sinc + spherical_bessel_j0=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="spherical_bessel_j0_forward({x})", + name="special_spherical_bessel_j0", + ), + zeta=OverridesData( + 
type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="zeta({x}, {y})", + name="special_zeta", + ), + chebyshev_polynomial_t=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="chebyshev_polynomial_t_forward({x}, {y})", + name="special_chebyshev_polynomial_t", + ), + chebyshev_polynomial_u=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="chebyshev_polynomial_u_forward({x}, {y})", + name="special_chebyshev_polynomial_u", + ), + chebyshev_polynomial_v=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="chebyshev_polynomial_v_forward({x}, {y})", + name="special_chebyshev_polynomial_v", + ), + chebyshev_polynomial_w=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="chebyshev_polynomial_w_forward({x}, {y})", + name="special_chebyshev_polynomial_w", + ), + legendre_polynomial_p=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="legendre_polynomial_p_forward({x}, {y})", + name="special_legendre_polynomial_p", + ), + shifted_chebyshev_polynomial_t=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="shifted_chebyshev_polynomial_t_forward({x}, {y})", + name="special_shifted_chebyshev_polynomial_t", + ), + shifted_chebyshev_polynomial_u=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="shifted_chebyshev_polynomial_u_forward({x}, {y})", + name="special_shifted_chebyshev_polynomial_u", + ), + shifted_chebyshev_polynomial_v=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="shifted_chebyshev_polynomial_v_forward({x}, {y})", + name="special_shifted_chebyshev_polynomial_v", + ), + shifted_chebyshev_polynomial_w=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="shifted_chebyshev_polynomial_w_forward({x}, {y})", + name="special_shifted_chebyshev_polynomial_w", + ), + hermite_polynomial_h=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="hermite_polynomial_h_forward({x}, {y})", + name="special_hermite_polynomial_h", + ), + hermite_polynomial_he=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="hermite_polynomial_he_forward({x}, {y})", + name="special_hermite_polynomial_he", + ), + laguerre_polynomial_l=OverridesData( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + cpp="laguerre_polynomial_l_forward({x}, {y})", + name="special_laguerre_polynomial_l", + ), +) + + +# Use mypy to check protocol implemented correctly +def _typecheck_OpOverrides(h: OpOverrides) -> OpsHandler[str]: + return h + + +class DeferredLine(DeferredLineBase): + """A line that can be 'unwritten' by adding name to V.graph.removed_buffers""" + + def __init__(self, name, line): + super().__init__(line) + self.name = name + assert not isinstance(line, DeferredLineBase) + + def __call__(self): + if all( + self.name not in x + for x in ( + V.graph.removed_buffers, + V.kernel.removed_buffers, + V.graph.inplaced_to_remove, + V.kernel.inplaced_to_remove, + ) + ): + return self.line + return None + + def _new_line(self, line): + return DeferredLine(self.name, line) + + +class BracesBuffer(IndentedBuffer): + def indent(self, offset=1): + @contextlib.contextmanager + def ctx(): + for _ in range(offset): + self.writeline("{") + self._indent += 1 + for _ in range(-offset): + self._indent -= 
1 + self.writeline("}") + yield + for _ in range(-offset): + self.writeline("{") + self._indent += 1 + for _ in range(offset): + self._indent -= 1 + self.writeline("}") + + return ctx() + + +class InplacedBuffer(NamedTuple): + inner_name: str + other_names: List[str] + + +class KernelArgs: + @staticmethod + def _lookup(prefix, odict, name): + assert isinstance(name, (str, sympy.Symbol)) + if name not in odict: + odict[name] = f"{prefix}{len(odict)}" + return odict[name] + + def __init__(self, sizevars=None): + self.input_buffers = dict() + self.output_buffers = dict() + self.inplace_buffers = dict() + self.sizevars = sizevars or dict() + self.workspace_arg = None + + def __repr__(self): + return "KernelArgs({})".format( + ", ".join( + map( + repr, + [ + self.input_buffers, + self.output_buffers, + self.inplace_buffers, + self.sizevars, + ], + ) + ) + ) + + def _buffer_is_marked_removed(self, name): + return isinstance(name, str) and name.startswith("REMOVED") + + def input(self, name): + if V.graph.scheduler: + name = V.graph.scheduler.mutation_real_name.get(name, name) + assert name not in V.graph.removed_buffers, name + if name in self.output_buffers: + return self.output_buffers[name] + if name in self.inplace_buffers: + return self.inplace_buffers[name].inner_name + if name.startswith("seed"): + return self._lookup("seed", self.input_buffers, name) + return self._lookup("in_ptr", self.input_buffers, name) + + def output(self, name): + if V.graph.scheduler: + name = V.graph.scheduler.mutation_real_name.get(name, name) + assert name not in V.graph.removed_buffers, name + if name in self.inplace_buffers: + return self.inplace_buffers[name].inner_name + return self._lookup("out_ptr", self.output_buffers, name) + + def make_inplace(self, input_name, output_name): + assert output_name not in self.inplace_buffers + if input_name in self.inplace_buffers: + buf = self.inplace_buffers[input_name] + buf.other_names.append(output_name) + self.inplace_buffers[output_name] = buf + else: + buf = InplacedBuffer( + f"in_out_ptr{len(unique(self.inplace_buffers.values()))}", + [input_name, output_name], + ) + self.inplace_buffers[input_name] = buf + self.inplace_buffers[output_name] = buf + + def workspace(self, nbytes: sympy.Expr, zero_fill: bool): + if self.workspace_arg is None: + self.workspace_arg = WorkspaceArg(nbytes, zero_fill) + return "ws_ptr", 0 + + offset = self.workspace_arg.nbytes + zero_fill = zero_fill or self.workspace_arg.zero_fill + self.workspace_arg = WorkspaceArg(offset + nbytes, zero_fill) + return "ws_ptr", offset + + def seed_offset(self, name, value): + if value in self.sizevars: + return self.sizevars[value] + if name in self.sizevars.values(): + name = ( + f"{name}{sum(1 for v in self.sizevars.values() if v.startswith(name))}" + ) + self.sizevars[value] = name + return name + + def size(self, name): + if str(name) == "seed": + self.sizevars["seed"] = "seed" + return "seed" + return self._lookup("ks", self.sizevars, name) + + def call_names(self): + return chain( + self.input_buffers.keys(), self.output_buffers.keys(), self.sizevars.keys() + ) + + def wrap_ptr_arg(self, buf, dtype): + return buf + + def wrap_size_arg(self, size): + return str(size) + + def cpp_argdefs(self): + from .cpp import DTYPE_TO_CPP, INDEX_TYPE + + call_args = [] + arg_defs = [] + arg_types = [] + for inplaced in unique(self.inplace_buffers.values()): + if self._buffer_is_marked_removed(inplaced): + continue + outer = inplaced.other_names[-1] + inner = inplaced.inner_name + dtype = 
V.graph.get_dtype(outer) + cpp_dtype = DTYPE_TO_CPP[dtype] + arg_defs.append(f"{cpp_dtype}* {inner}") + call_args.append(self.wrap_ptr_arg(outer, dtype)) + arg_types.append(f"{cpp_dtype}*") + for outer, inner in self.input_buffers.items(): + if outer in self.inplace_buffers: + continue + dtype = V.graph.get_dtype(outer) + cpp_dtype = DTYPE_TO_CPP[dtype] + arg_defs.append(f"const {cpp_dtype}* {inner}") + call_args.append(self.wrap_ptr_arg(outer, dtype)) + arg_types.append(f"const {cpp_dtype}*") + for outer, inner in self.output_buffers.items(): + if outer in self.inplace_buffers or self._buffer_is_marked_removed(inner): + continue + dtype = V.graph.get_dtype(outer) + cpp_dtype = DTYPE_TO_CPP[dtype] + arg_defs.append(f"{cpp_dtype}* {inner}") + call_args.append(self.wrap_ptr_arg(outer, dtype)) + arg_types.append(f"{cpp_dtype}*") + for outer, inner in self.sizevars.items(): + arg_defs.append(f"const {INDEX_TYPE} {inner}") + call_args.append(self.wrap_size_arg(outer)) + arg_types.append(f"const {INDEX_TYPE}") + if V.graph.wrapper_code: + V.graph.wrapper_code.ensure_size_computed(outer) + assert self.workspace_arg is None, "Workspace not supported on CPU " + return arg_defs, call_args, arg_types + + def python_argdefs(self): + arg_defs = [] + call_args = [] + precompile_args: List[Union[TensorArg, SizeArg, WorkspaceArg]] = [] + for inplaced in unique(self.inplace_buffers.values()): + if self._buffer_is_marked_removed(inplaced): + continue + arg_defs.append(inplaced.inner_name) + call_args.append(inplaced.other_names[-1]) + precompile_args.append( + TensorArg( + name=inplaced.inner_name, + buffer=inplaced.other_names[-1], + dtype=V.graph.get_dtype(inplaced.other_names[-1]), + ) + ) + for outer, inner in chain( + self.input_buffers.items(), self.output_buffers.items() + ): + if outer in self.inplace_buffers or self._buffer_is_marked_removed(inner): + continue + arg_defs.append(inner) + call_args.append(outer) + precompile_args.append( + TensorArg( + name=inner, + buffer=outer, + dtype=V.graph.get_dtype(outer), + ) + ) + for outer, inner in self.sizevars.items(): + arg_defs.append(inner) + call_args.append(outer) + precompile_args.append(SizeArg(inner, outer)) + if V.graph.wrapper_code: + V.graph.wrapper_code.ensure_size_computed(outer) + if self.workspace_arg is not None: + arg_defs.append("ws_ptr") + call_args.append("workspace") + precompile_args.append(self.workspace_arg) + + return arg_defs, call_args, precompile_args + + def aliases(self): + for inplaced in unique(self.inplace_buffers.values()): + if self._buffer_is_marked_removed(inplaced): + continue + for other in inplaced.other_names: + if ( + other in V.graph.inplaced_to_remove + or other in V.kernel.inplaced_to_remove + ): + continue + if other in self.input_buffers: + yield self.input_buffers[other], inplaced.inner_name + if other in self.output_buffers: + yield self.output_buffers[other], inplaced.inner_name + + def is_removed(self, name): + def _is_removed(name, buffers): + return name not in buffers or self._buffer_is_marked_removed(buffers[name]) + + return _is_removed(name, self.output_buffers) and _is_removed( + name, self.inplace_buffers + ) + + # Includes inplace buffers, excludes removed buffers. Essentially, + # after you do a call into this kernel, which buffers actually contain + # updated data? Modeled off of python_argdefs. 
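+    # E.g. (illustrative): if "buf1" was written in place over "buf0" (sharing in_out_ptr0) and
+    # "buf2" is a plain output, this returns {"buf1", "buf2"}: the newest name of each inplace
+    # group plus every non-removed output buffer.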
+ def live_output_buffers(self): + live_outs = set() + for inplaced in unique(self.inplace_buffers.values()): + if self._buffer_is_marked_removed(inplaced): + continue + live_outs.add(inplaced.other_names[-1]) + for outer, inner in self.output_buffers.items(): + if outer in self.inplace_buffers or self._buffer_is_marked_removed(inner): + continue + live_outs.add(outer) + return live_outs + + +class CSEVariable: + """A CSEVariable is just a name for an expression but it is useful to be able to annotate them on a backend dependent basis. + To do so, the backends can simply overload `Kernel.create_cse_var` + The "CSEVariable.update_on_args" method gives you a hook for annotations + See example of TritonCSEVariable in triton.py + """ + + def __init__(self, name, bounds: ValueRanges[Any]): + assert isinstance(bounds, ValueRanges) + self.name = name + self.bounds = bounds + + def __str__(self): + return self.name + + def __hash__(self) -> int: + return hash(self.name) + + def __eq__(self, other) -> bool: + return type(other) == type(self) and other.name == self.name + + def update_on_args(self, name, args, kwargs): + pass + + +class CppWrapperKernelArgs(KernelArgs): + def wrap_ptr_arg(self, buf, dtype): + from .cpp import DTYPE_TO_CPP + + if config.abi_compatible: + # In the abi_compatible model, we just return the buf here. + # We will form correct call args later in wrapper.generate_kernel_all. + return buf + else: + return f"({DTYPE_TO_CPP[dtype]}*)({buf}.data_ptr())" + + def wrap_size_arg(self, size): + return f"{size}" + + +class CSE: + """Common subexpression elimination""" + + def __init__( + self, + prefix="", + suffix="", + name_prefix="tmp", + iter_buffers=None, + store_cache=None, + reduction_cache=None, + varname_map=None, + ): + self.prefix = prefix + self.suffix = suffix + self.cache = {} + self.name_prefix = name_prefix + self.store_cache = store_cache or {} + self.reduction_cache = reduction_cache or {} + self.iter_buffer_ids = iter_buffers or itertools.count() + self.invalidated_stores = set() + self.varname_map = varname_map or {} + + def invalidate(self, keep_vars: Set[str]): + for name, tmp in list(self.store_cache.items()): + if tmp not in keep_vars: + del self.store_cache[name] + self.invalidated_stores.add(name) + self.cache = {k: v for k, v in self.cache.items() if v in keep_vars} + + def clone(self): + # Note(fdrocha): reduction_cache is not being cloned, not sure if this is intentional + return CSE( + prefix=self.prefix, + suffix=self.suffix, + name_prefix=self.name_prefix, + iter_buffers=self.iter_buffer_ids, + store_cache=self.store_cache, + varname_map=self.varname_map, + ) + + def generate( + self, + buffer: IndentedBuffer, + expr: Union[str, CSEVariable, OpsValue, IndentedBuffer], + *, + bounds: ValueRanges[Any] = ValueRanges.unknown(), + write=True, + assignment=True, + ) -> CSEVariable: + if isinstance(expr, OpsValue): + expr = expr.value + + assert isinstance(expr, (str, CSEVariable, IndentedBuffer)), type(expr) + assert write or assignment + if isinstance(expr, CSEVariable): + # If the expressions were always created with all the information, we could + # assert expr.bounds == bounds, but sometimes the expression is created + # with the loose ValueRanges.unknown(), so we need to tighten the bounds + expr.bounds = expr.bounds.tighten(bounds) + return expr + cache_key = expr.getvalue() if isinstance(expr, IndentedBuffer) else expr + var = self.cache.get(cache_key, None) + if not var: + var = self.newvar(bounds) if assignment else None + self.cache[cache_key] = 
var + if write: + if V.kernel.current_node: + V.kernel.current_node.codegen_originating_info( + buffer, only_once=True + ) + if isinstance(expr, IndentedBuffer): + if assignment: + buffer.writeline(f"{self.prefix}{var} =") + buffer.splice(expr) + buffer.writeline(self.suffix) + else: + if assignment: + line = f"{self.prefix}{var} = {expr}{self.suffix}" + else: + line = f"{expr}{self.suffix}" + buffer.writeline(line) + else: + var.bounds = var.bounds.tighten(bounds) + + return var + + def newvar(self, bounds: ValueRanges[Any] = ValueRanges.unknown()) -> CSEVariable: + var_name = f"{self.name_prefix}{next(self.iter_buffer_ids)}" + var = V.kernel.create_cse_var(var_name, bounds) + self.varname_map[var_name] = var + return var + + +class IndirectAssertLine(DeferredLineBase): + def __init__(self, line, assert_fn, var, mask, size_map): + self.var = var + self.mask = mask + self.line = line + self.assert_fn = assert_fn + self.size_map = size_map + + def __call__(self): + size, size_str = self.size_map[(self.var, self.mask)] + + # We assert if we've not been able to prove the bound + assert_min = (self.var.bounds.lower >= 0) != sympy.true + assert_max = (self.var.bounds.upper < size) != sympy.true + + # FooBar interview question + if not (assert_min or assert_max): + return None + elif assert_min and assert_max: + # The conditions need to be in parens because of Python's operator precedence. + # It'd be less error-prone to use and/or/not, which is suported by triton + cond = f"(0 <= {self.var}) & ({self.var} < {size_str})" + cond_print = f"0 <= {self.var} < {size_str}" + elif assert_min: + cond = f"0 <= {self.var}" + cond_print = cond + else: + assert assert_max + cond = f"{self.var} < {size_str}" + cond_print = cond + + if self.mask: + cond = f"({cond}) | ~{self.mask}" + return self.line.format( + assert_fn=self.assert_fn, cond=cond, cond_print=cond_print + ) + + def _new_line(self, line): + return IndirectAssertLine( + line, self.assert_fn, self.var, self.mask, self.size_map + ) + + +class CodeGen: + def __init__(self): + super().__init__() + self.exit_stack = contextlib.ExitStack() + + def __enter__(self): + self.exit_stack.__enter__() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.exit_stack.__exit__(exc_type, exc_val, exc_tb) + + +class Kernel(CodeGen): + newvar_prefix = "" + suffix = "" + overrides: Optional[Callable[[OpsHandler[Any]], OpsHandler[Any]]] = None + # TODO: these look dead, but with all the getattr it's hard to tell... 
+ load_format: None = None + store_format: None = None + + def __init__(self, args=None, increase_kernel_count=True): + super().__init__() + if increase_kernel_count: + metrics.generated_kernel_count += 1 + self.args = args or KernelArgs() + self.loads = IndentedBuffer() + self.compute = IndentedBuffer() + self.stores = IndentedBuffer() + self.cse: CSE = CSE(self.newvar_prefix, self.suffix) + self.must_keep_buffers = set() + self.store_buffer_names = set() + self._load_mask = None + # set in set_current_node + self.current_node = None + self.node_to_bounds: Optional[Dict[torch.fx.Node, ValueRanges[Any]]] = None + # Upper bounds for indirect_indexing and their str representation + # NB: None, None is never stored in map, but it is the assumed + # "not set" value for the dict + self.indirect_max_sizes: Dict[ + Tuple[CSEVariable, str], Union[Tuple[sympy.Expr, str], Tuple[None, None]] + ] = {} + + self.removed_buffers = set() + self.inplaced_to_remove = set() + + # key: the buffer to write + # value: the buffer to read and whose memory can be reused for + # the buffer specified by key + self.inplace_update_buffers = dict() + # Set minimum number of elements processed per thread. + self.min_elem_per_thread = 1 + self.kernel_name = None + + @contextlib.contextmanager + def set_current_node(self, node): + prior = self.current_node + self.current_node = node + self.node_to_bounds = node._body.bounds().get_bounds() + try: + yield + finally: + self.current_node = prior + + @contextlib.contextmanager + def swap_buffers(self, lb, cb=None, sb=None): + if cb is None: + cb = lb + loads = self.loads + compute = self.compute + stores = self.stores + cse = self.cse + self.loads = lb + self.compute = cb + self.stores = sb + self.cse = cse.clone() + try: + yield + finally: + self.loads = loads + self.compute = compute + self.stores = stores + self.cse = cse + + def load(self, name: str, index: sympy.Expr) -> CSEVariable: + raise NotImplementedError() + + def indirect_load(self, name: str, index: sympy.Expr): + """A load the depends on an index we have read""" + prior = self.loads + try: + # put the load in the compute section as it might have deps + self.loads = self.compute + return self.load(name, index) + finally: + self.loads = prior + + def store_reduction(self, name: str, index: sympy.Expr, value: CSEVariable): + raise NotImplementedError() + + def store( + self, name: str, index: sympy.Expr, value: CSEVariable, mode: StoreMode = None + ) -> None: + raise NotImplementedError() + + def reduction( + self, + dtype: torch.dtype, + src_dtype: torch.dtype, + reduction_type: ReductionType, + value: Union[CSEVariable, Tuple[CSEVariable, ...]], + ) -> Union[CSEVariable, Tuple[CSEVariable, ...]]: + raise NotImplementedError() + + def scan( + self, + dtype: torch.dtype, + combine_fn: Callable[[CSEVariable, CSEVariable], CSEVariable], + value: CSEVariable, + init: int, + ) -> CSEVariable: + raise NotImplementedError() + + def bucketize( + self, + values: CSEVariable, + offsets_name: str, + offsets_size: sympy.Expr, + indexing_dtype: torch.dtype, + right: bool, + ) -> CSEVariable: + """ + See [Note: Inductor bucketize op] + """ + raise NotImplementedError() + + @property + def assert_function(self) -> str: + raise NotImplementedError() + + def index_to_str(self, index: sympy.Expr) -> str: + raise NotImplementedError() + + def __enter__(self): + # TODO: hoist this to top level + class CSEProxy: + self.name = "CSEProxy" + + @staticmethod + def __getattr__(name: str) -> Callable[..., CSEVariable]: # type: ignore[misc] 
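+            # Ops without an explicit handler below are forwarded to the parent ops handler; each
+            # result is then run through the CSE cache (do_cse) and annotated with value-range bounds.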
+ def inner(*args, **kwargs): + # TritonTemplateKernel has no current_node + buf_bounds = ValueRanges.unknown() + if hasattr(V.interpreter, "current_node"): + fx_node = V.interpreter.current_node + assert isinstance(self.node_to_bounds, dict) + buf_bounds = self.node_to_bounds.get( + fx_node, ValueRanges.unknown() + ) + + value = getattr(parent_handler, name)(*args, **kwargs) # type: ignore[has-type] + + def do_cse(v): + csevar = self.cse.generate(self.compute, v, bounds=buf_bounds) + csevar.update_on_args(name, args, kwargs) + return csevar + + return pytree.tree_map(do_cse, value) + + return inner + + @staticmethod + def indirect_indexing( + var: CSEVariable, size: sympy.Expr, check: bool = True + ): + # Skip CSE since this doesn't return an expression + + if var.bounds.lower < 0: # type: ignore[operator] + new_bounds = ValueRanges.unknown() + if var.bounds != ValueRanges.unknown() and isinstance( + size, sympy.Number + ): + # Take the negative part of the bound and add size to it + # Then take union of that and the positive part + # This is a tighter bound than that of a generic ops.where, as we have info on the cond + neg = var.bounds & ValueRanges(-sympy.oo, -1) + new_bounds = ValueRanges(neg.lower + size, neg.upper + size) + # We don't have a good way of representing the empty range + if var.bounds.upper >= 0: # type: ignore[operator] + pos = var.bounds & ValueRanges(0, sympy.oo) + new_bounds = new_bounds | pos + + stm = ops.add(var, self.rename_indexing(size)) + # Mixed negative and non-negative + if var.bounds.upper >= 0: # type: ignore[operator] + lt = ops.lt(var, "0") + stm = ops.where(lt, stm, var) + new_var = self.cse.generate(self.compute, stm, bounds=new_bounds) + + new_var.update_on_args("index_wrap", (var,), {}) + var = new_var + + if self.generate_assert(check): + mask = self.load_mask(var) + + # An assertion line may have been written already, if so just + # update the max size. 
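+                    # (keep the tighter bound: the smaller of the previously recorded size and the new one)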
+ map_key = (var, mask) + existing_size, _ = self.indirect_max_sizes.get( + map_key, (None, None) + ) + if existing_size is not None: + size = sympy.Min(size, existing_size) + else: + line = ( + '{assert_fn}({cond}, "index out of bounds: {cond_print}")' + ) + self.compute.writeline( + IndirectAssertLine( + line, + self.assert_function, + var, + mask, + self.indirect_max_sizes, + ) + ) + + self.indirect_max_sizes[map_key] = (size, self.index_to_str(size)) + return sympy_index_symbol(str(var)) + + @staticmethod + def load(name: str, index: sympy.Expr) -> CSEVariable: + if name in self.cse.invalidated_stores: + # A load from an invalidated store requires us to + # keep the actual buffer around + V.kernel.must_keep_buffers.add(name) + if free_symbol_startswith(index, "tmp"): + return self.indirect_load(name, index) + store_cache = self.cse.store_cache + if name in store_cache: + return store_cache[name] + return self.load(name, index) + + @staticmethod + def store( + name: str, index: sympy.Expr, value: CSEVariable, mode: StoreMode = None + ) -> None: + self.store_buffer_names.add(name) + if mode is None: + self.cse.store_cache[name] = value + if self.current_node: + for other_name in self.current_node.get_mutations(): + self.cse.store_cache[other_name] = value + if name not in V.graph.removed_buffers: + return self.store(name, index, value, mode=mode) + else: + return None # type: ignore[return-value] + + @staticmethod + def store_reduction(name: str, index: sympy.Expr, value: CSEVariable): + self.store_buffer_names.add(name) + self.cse.store_cache[name] = value + if self.current_node: + for other_name in self.current_node.get_mutations(): + self.cse.store_cache[other_name] = value + + if name not in V.graph.removed_buffers: + return self.store_reduction(name, index, value) + + @staticmethod + def reduction( + dtype: torch.dtype, + src_dtype: torch.dtype, + reduction_type: ReductionType, + value: Union[CSEVariable, Tuple[CSEVariable, ...]], + ) -> Union[CSEVariable, Tuple[CSEVariable, ...]]: + return self.reduction(dtype, src_dtype, reduction_type, value) + + @staticmethod + def scan( + dtype: torch.dtype, + combine_fn: Callable[[CSEVariable, CSEVariable], CSEVariable], + value: CSEVariable, + init: int, + ) -> CSEVariable: + return self.scan(dtype, combine_fn, value, init) + + @staticmethod + def bucketize( + values: CSEVariable, + offsets_name: str, + offsets_size: sympy.Expr, + indexing_dtype: torch.dtype, + right: bool, + ) -> CSEVariable: + """ + [Note: Inductor bucketize op] + + Given values (tensor) and offsets_name (reference to the name of a 1D + tensor), calculate the bucket that each value belongs to. + + e.g. for values [-1, 0, 1, 2, 3, 4, 5, 9], offsets [0, 4, 4, 8], right=True + return = [ 0, 1, 1, 1, 1, 3, 3, 4]. + + When right == False, bucket i refers to range (offsets[i], offsets[i+1]]. + When right == True, bucket i refers to range [offsets[i], offsets[i+1]). + + Offsets must be non-decreasing or the result is undefined. 
+ """ + return self.bucketize( + values, offsets_name, offsets_size, indexing_dtype, right + ) + + # Use mypy to check protocol implemented correctly + def _typecheck_CSEProxy(h: CSEProxy) -> OpsHandler[CSEVariable]: + return h + + super().__enter__() + assert self.overrides + parent_handler = self.overrides(V.get_ops_handler()) + self.exit_stack.enter_context(V.set_ops_handler(CSEProxy())) + self.exit_stack.enter_context(V.set_kernel_handler(self)) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """ + Note that V.graph.scheduler can be None when codegening triton template + kernels. + """ + if V.graph.scheduler: + V.graph.scheduler.remove_kernel_local_buffers() + super().__exit__(exc_type, exc_val, exc_tb) + + def generate_assert(self, check): + return (check or config.debug_index_asserts) and config.assert_indirect_indexing + + def load_mask(self, var) -> str: + # only the triton kernel requires mask + return "" + + def rename_indexing(self, index) -> sympy.Expr: + # adds the necessary kernel args for index expressions + # and renames variables in index expressions to kernel arg names + if isinstance(index, (list, tuple)): + return [self.rename_indexing(x) for x in index] # type: ignore[return-value] + index = V.graph.sizevars.simplify(index) + sorted_symbols = sorted(index.free_symbols, key=lambda s: s.name) + replacements = { + x: self.args.size(x) + for x in sorted_symbols + if x.name.startswith(("s", "u", "ps")) + or (x.name.startswith("i") and not x.name.startswith("idx")) + } + return sympy_subs(index, replacements) + + def create_cse_var(self, *args, **kwargs): + return CSEVariable(*args, **kwargs) + + +@dataclasses.dataclass +class OptimizationContext: + key: ClassVar[str] = "opt_ctx" + + # Load value as mask + is_load_as_mask: bool = False + + dtype: Optional[torch.dtype] = None + ops_name: str = "" + + # Load uint8/int8 value as float32 + is_load_int8_as_float: bool = False + + +@functools.lru_cache(None) +def jinja2_env(): + try: + import jinja2 + + return jinja2.Environment( + undefined=jinja2.StrictUndefined, + ) + except ImportError: + return None + + +PrimitiveInfoType = Union[int, float, bool, str, List[Union[int, str, float, bool]]] + + +class ChoiceCaller: + """ + Represents a possible choice used in autotune_process.py. + During autotuning, self.benchmark() is first called to get benchmark result, + and if this choice is selected, self.output_node() is called to get the output_node. + + Children classes: TritonTemplateCaller, CUDATemplateCaller. + """ + + def __init__(self, name, input_nodes, layout): + super().__init__() + self.name = name + self.layout = layout + self.input_nodes = input_nodes + + def benchmark(self, *args, out) -> float: + algo = self.to_callable() + return do_bench(lambda: algo(*args, out=out)) + + def call_name(self) -> str: + raise NotImplementedError() + + def to_callable(self): + raise NotImplementedError() + + def hash_key(self) -> str: + raise NotImplementedError() + + def output_node(self) -> "TensorBox": + raise NotImplementedError() + + def info_dict(self) -> Dict[str, Union[PrimitiveInfoType, List[PrimitiveInfoType]]]: + """Information returned here is logged to the autotune log file when that is enabled.""" + return {} + + +class KernelTemplate: + """ + Base class for defining kernel templates. 
+ + Children classes: TritonTemplate, CUDATemplate + """ + + @staticmethod + def _template_from_string(source): + env = jinja2_env() + if env is not None: + return env.from_string(source) + return None + + @staticmethod + def _fake_get_dtype(fake_out): + _get_dtype_real = V.graph.get_dtype + + def get_dtype(name): + if name == fake_out.get_name(): + return fake_out.get_dtype() + return _get_dtype_real(name) + + return get_dtype + + def __init__(self, name: str): + self.name = name + + def maybe_append_choice(self, choices, **kwargs): + """ + Maybe generates a new ChoiceCaller and appends it into existing choices. + + choices: A list of ChoiceCallers. + kwargs: Additional kwargs to be passed to self.generate() to generate a new ChoiceCaller. + """ + + try: + choices.append(self.generate(**kwargs)) + except NotImplementedError: + pass + + def generate(self, **kwargs) -> ChoiceCaller: + """ + Generates a ChoiceCaller instance from the given arguments. + """ + + raise NotImplementedError() diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/cpp.py b/MLPY/Lib/site-packages/torch/_inductor/codegen/cpp.py new file mode 100644 index 0000000000000000000000000000000000000000..1bb2282bc9cf036c8734294c9c24a533f756fb23 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/codegen/cpp.py @@ -0,0 +1,4038 @@ +import contextlib +import dataclasses +import functools +import itertools +import logging +import math +import re +import sys +from copy import copy, deepcopy +from enum import Enum +from typing import Any, Dict, List, Optional, Set, Tuple, Union + +import sympy + +import torch +import torch.fx +from torch._inductor import dependencies +from torch._inductor.ir import StorageBox, TensorBox +from torch._prims_common import is_float_dtype +from torch.utils import _pytree as pytree +from torch.utils._sympy.functions import FloorDiv, ModularIndexing +from torch.utils._sympy.value_ranges import bound_sympy, ValueRanges + +from .. 
import codecache, config, ir, metrics +from ..codegen.wrapper import WrapperCodeGen +from ..optimize_indexing import range_expressable_in_32_bits +from ..scheduler import ( + BaseScheduling, + ForeachKernelSchedulerNode, + FusedSchedulerNode, + SchedulerNode, +) +from ..utils import ( + cache_on_self, + get_fused_kernel_name, + is_welford_reduction, + parallel_num_threads, + sympy_index_symbol, + sympy_product, + sympy_subs, +) + +from ..virtualized import ops, OpsValue, V +from .common import ( + BracesBuffer, + CppWrapperKernelArgs, + CSE, + CSEVariable, + DataTypePropagation, + DeferredLine, + DTYPE_TO_COMPUTATION_DTYPE, + ExprPrinter, + IndentedBuffer, + Kernel, + KernelArgs, + OpOverrides, + OptimizationContext, +) + +schedule_log = torch._logging.getArtifactLogger(__name__, "schedule") + +DTYPE_TO_CPP = { + torch.float32: "float", + torch.float64: "double", + torch.float16: "half", + torch.int64: "long", + torch.int32: "int", + torch.int16: "short", + torch.int8: "signed char", + torch.uint64: "unsigned long", + torch.uint32: "unsigned int", + torch.uint16: "unsigned short", + torch.uint8: "unsigned char", + torch.uint32: "unsigned int", + torch.uint64: "unsigned long", + torch.bool: "bool", + torch.bfloat16: "bfloat16", + torch.complex64: "complex64", + torch.float8_e4m3fn: "float8_e4m3fn", + torch.float8_e5m2: "float8_e5m2", +} + +DTYPE_TO_ATEN = { + torch.float32: "at::kFloat", + torch.float64: "at::kDouble", + torch.float16: "at::kHalf", + torch.int64: "at::kLong", + torch.int32: "at::kInt", + torch.int16: "at::kShort", + torch.int8: "at::kChar", + torch.uint64: "at::kUInt64", + torch.uint32: "at::kUInt32", + torch.uint16: "at::kUInt16", + torch.uint8: "at::kByte", + torch.uint32: "at::kUInt32", + torch.uint64: "at::kUInt64", + torch.bool: "at::kBool", + torch.bfloat16: "at::kBFloat16", + torch.complex32: "at::kComplexHalf", + torch.complex64: "at::kComplexFloat", + torch.complex128: "at::kComplexDouble", + torch.float8_e4m3fn: "at::kFloat8_e4m3fn", + torch.float8_e5m2: "at::kFloat8_e5m2", + torch.float8_e4m3fnuz: "at::kFloat8_e4m3fnuz", + torch.float8_e5m2fnuz: "at::kFloat8_e5m2fnuz", +} + +DEVICE_TO_ATEN = { + "cpu": "at::kCPU", + "cuda": "at::kCUDA", +} + +INDEX_TYPE = "long" + +NATIVE_OMP_RTYPES = {"+", "*", "^", "||", "min", "max"} +RTYPE_TO_CPP = { + "sum": "+", + "prod": "*", + "xor_sum": "^", + "min": "min", + "max": "max", + "argmin": "argmin", + "argmax": "argmax", + "any": "||", + "welford_reduce": "welford", + "welford_combine": "welford", +} +VECTORIZABLE_RTYPES = { + "max", + "min", + "sum", + "prod", + "xor_sum", + "welford_reduce", + "welford_combine", +} + +PYTHON_TO_CPP = { + "Tensor": "at::Tensor", + "int": "long", + "float": "double", + "bool": "bool", + "str": "std::string", + "ScalarType": "c10::ScalarType", + "MemoryFormat": "at::MemoryFormat", + "Layout": "at::Layout", + "Device": "at::Device", + "number": "at::Scalar", +} + +CONTAINER_PYTHON_TO_CPP = { + "List": "std::vector", + "Optional": "c10::optional", +} + +DTYPE_LOWP_FP = [ + torch.bfloat16, + torch.float16, +] + + +def value_to_cpp(value, cpp_type): + if value == float("-inf"): + return f"-std::numeric_limits<{cpp_type}>::infinity()" + elif value == float("inf"): + return f"std::numeric_limits<{cpp_type}>::infinity()" + elif isinstance(value, bool): + return f"static_cast<{cpp_type}>({str(value).lower()})" + elif math.isnan(value): + return f"std::numeric_limits<{cpp_type}>::quiet_NaN()" + else: + return f"static_cast<{cpp_type}>({repr(value)})" + + +def reduction_init(reduction_type, dtype): + if 
dtype in DTYPE_LOWP_FP: + # Since load promotes all half-precision inputs to float, the initial + # constant for reduction must be promoted as well + dtype = torch.float32 + if reduction_type in ("xor_sum", "sum", "any"): + return 0 + if reduction_type == "prod": + return 1 + if reduction_type in {"max", "argmax"}: + return ( + f"-std::numeric_limits<{DTYPE_TO_CPP[dtype]}>::infinity()" + if is_float_dtype(dtype) + else f"std::numeric_limits<{DTYPE_TO_CPP[dtype]}>::min()" + ) + if reduction_type in {"min", "argmin"}: + return ( + f"std::numeric_limits<{DTYPE_TO_CPP[dtype]}>::infinity()" + if is_float_dtype(dtype) + else f"std::numeric_limits<{DTYPE_TO_CPP[dtype]}>::max()" + ) + if is_welford_reduction(reduction_type): + return f"Welford<{DTYPE_TO_CPP[dtype]}>()" + raise AssertionError(reduction_type) + + +def reduction_acc_type(reduction_type, dtype): + assert reduction_type not in {"argmin", "argmax"} + scalar_type = DTYPE_TO_CPP[DTYPE_TO_COMPUTATION_DTYPE[dtype]] + if is_welford_reduction(reduction_type): + return f"Welford<{scalar_type}>" + + return scalar_type + + +def reduction_combine(reduction_type, var, next_value): + if reduction_type == "sum": + return f"{var} + {next_value}" + if reduction_type == "prod": + return f"{var} * {next_value}" + if reduction_type == "xor_sum": + return f"{var} ^ {next_value}" + if reduction_type == "any": + return f"{var} || {next_value}" + if reduction_type in ("min", "max"): + return f"{reduction_type}_propagate_nan({var}, {next_value})" + if reduction_type == "welford_reduce": + return f"welford_combine({var}, {next_value})" + if reduction_type == "welford_combine": + if isinstance(next_value, tuple): + mean, m2, weight = next_value + else: + mean, m2, weight = reduction_project(reduction_type, next_value) + return f"welford_combine({var}, {{{mean}, {m2}, {weight}}})" + raise AssertionError(reduction_type) + + +def reduction_project(reduction_type, acc): + if is_welford_reduction(reduction_type): + return f"{acc}.mean", f"{acc}.m2", f"{acc}.weight" + elif reduction_type in {"argmin", "argmax"}: + return f"{acc}.index" + return acc + + +def is_to_lowp_dtype(expr): + to_exprs = ["cvt_fp32_to_lowp_fp", "c10::convert"] + if any(to_expr in expr for to_expr in to_exprs): + if "half" in expr: + return torch.half + if "bfloat16" in expr: + return torch.bfloat16 + return None + + +def get_lowp_to_fp32_expr(lowp_var, src_dtype, kernel): + if isinstance(kernel, CppVecKernel): + return f"cvt_lowp_fp_to_fp32<{DTYPE_TO_CPP[src_dtype]}>({lowp_var})" + else: + assert isinstance(kernel, CppKernel) + return f"c10::convert({lowp_var})" + + +index_value_name_counter = 1 + + +def argmax_argmin_prefix(reduction_type, src_dtype, tmpvar): + global index_value_name_counter + struct_name = f"IndexValue_{index_value_name_counter}" + index_value_name_counter += 1 + + # A small annoyance, due to it being a little cumbersome to just throw {} into strings + prefix = [ + f"struct {struct_name} {{size_t index; {DTYPE_TO_CPP[src_dtype]} value;}};", + f"{struct_name} {tmpvar}{{0, {reduction_init(reduction_type, src_dtype)}}};", + ] + + if reduction_type in ["argmax", "argmin"]: + compare_op = "greater_or_nan" if reduction_type == "argmax" else "less_or_nan" + prefix.extend( + [ + "#if !defined(__clang_major__) || __clang_major__ > 9", + f"#pragma omp declare reduction({reduction_type} : {struct_name} :\\", + f" omp_out = {compare_op}(omp_in.value, omp_out.value, omp_in.index, omp_out.index) ? 
omp_in : omp_out)\\", + f"\tinitializer(omp_priv = {{0, {reduction_init(reduction_type, src_dtype)}}})", + "#endif", + ] + ) + + return prefix + + +@functools.lru_cache +def stride_at(index: sympy.Expr, var: sympy.Symbol): + replacement = {var: var + 1} + new_index = sympy_subs(index, replacement) # type: ignore[arg-type] + return sympy.simplify(new_index - index) + + +@functools.lru_cache +def simplify_index_in_vec_range(index: sympy.Expr, var: sympy.Expr, vec_length: int): + """ + Simplifies the index expression within the range of a vectorized loop. + Given a vectorized loop variable `var` in the range of a loop with `vec_length`, + this function transforms the `index` into an equivalent form. It handles + simplifications for cases where `var` can be expressed as `vec_length * a + b`, + where `b` ranges from 0 to `vec_length - 1`. The function reduces occurrences + of `FloorDiv` and `ModularIndexing` in the `index` with best-effort optimizations. + + NOTE: + The simplified index expression is intended for analysis purposes only, not + for code generation. It replaces `FloorDiv` and `ModularIndexing` with free variables + which are not dependent on the loop variable `var` in the vectorized range. Check + https://github.com/pytorch/pytorch/pull/117221#discussion_r1449746217 for more details. + + Examples: + 1. If `var` is `x3` and `vec_length` is 16, and `x3 = 16*a + b`, then + `FloorDiv(x3, div)` or `ModularIndexing(x3, div, mod)` becomes a free variable + when `div` is divisible by 16. + 2. `ModularIndexing(x3, 1, mod)` can be simplified to `x3 + c` where `c` is a free + variable when `mod` is divisible by 16. + """ + + div_freevar_id = 0 + mod_freevar_id = 0 + + def visit_indexing_div(divisor): + nonlocal div_freevar_id + result = FloorDiv(var, divisor) + if sympy.gcd(divisor, vec_length) == vec_length: + result = sympy.Symbol(f"{var}_div_c{div_freevar_id}") + div_freevar_id += 1 + return result + + def visit_modular_indexing(divisor, modulus): + nonlocal mod_freevar_id + result = ModularIndexing(var, divisor, modulus) + if sympy.gcd(divisor, vec_length) == vec_length: + result = sympy.Symbol(f"{var}_mod_c{mod_freevar_id}") + mod_freevar_id += 1 + elif divisor == 1 and sympy.gcd(modulus, vec_length) == vec_length: + result = var + sympy.Symbol(f"{var}_mod_c{mod_freevar_id}") + mod_freevar_id += 1 + return result + + original_index = index + + div = sympy.Wild("divisor") + if index.has(FloorDiv): + index = index.replace(FloorDiv(var, div), visit_indexing_div) + + mod = sympy.Wild("modulus") + if index.has(ModularIndexing): + index = index.replace(ModularIndexing(var, div, mod), visit_modular_indexing) + + index = sympy.simplify(index) + if index != original_index: + return simplify_index_in_vec_range(index, var, vec_length) + + return index + + +@functools.lru_cache +def stride_at_vec_range(index: sympy.Expr, var: sympy.Symbol, vec_length: int): + index_vec_simplified = simplify_index_in_vec_range(index, var, vec_length) + return stride_at(index_vec_simplified, var) + + +class CppPrinter(ExprPrinter): + def _print_Integer(self, expr): + return f"{int(expr)}L" + + def _print_Where(self, expr): + c = self.paren(self.doprint(expr.args[0])) + p = self.paren(self.doprint(expr.args[1])) + q = self.paren(self.doprint(expr.args[2])) + return f"{c} ? 
{p} : {q}" + + def _print_ModularIndexing(self, expr): + x, div, mod = expr.args + x = self.paren(self.doprint(x)) + if div != 1: + div = self.paren(self.doprint(div)) + if expr.is_integer: + x = f"c10::div_floor_integer({x}, {div})" + else: + x = f"c10::div_floor_floating(static_cast({x}), static_cast({div}))" + mod = self.paren(self.doprint(mod)) + return f"static_cast<{INDEX_TYPE}>({x}) % static_cast<{INDEX_TYPE}>({mod})" + + def _print_FloorDiv(self, expr): + x, div = expr.args + x = self.paren(self.doprint(x)) + div = self.paren(self.doprint(div)) + if expr.is_integer: + return f"c10::div_floor_integer({x}, {div})" + return f"c10::div_floor_floating(static_cast({x}), static_cast({div}))" + + def _print_floor(self, expr): + assert len(expr.args) == 1 + r = f"std::floor({self._print(expr.args[0])})" + return f"static_cast<{INDEX_TYPE}>({r})" if expr.is_integer else r + + def _print_Pow(self, expr): + # Uses float constants to perform FP div + base, exp = expr.args + base = self._print(base) + + if exp == 0.5 or exp == -0.5: + return f"std::sqrt({base})" if exp == 0.5 else f"1.0/std::sqrt({base})" + assert exp.is_integer + exp = int(exp) + if exp > 0: + r = "*".join([self.paren(base)] * exp) + elif exp < 0: + r = "1.0/" + self.paren("*".join([self.paren(base)] * abs(exp))) + else: # exp == 0 + r = "1.0" + + return f"static_cast<{INDEX_TYPE}>({r})" if expr.is_integer else r + + def _print_Rational(self, expr): + # Uses float constants to perform FP div + if expr.q == 1: + r = f"{expr.p}" + else: + r = f"{expr.p}.0/{expr.q}.0" + return f"static_cast<{INDEX_TYPE}>({r})" if expr.is_integer else r + + def _print_ceiling(self, expr): + assert len(expr.args) == 1 + r = f"std::ceil({self._print(expr.args[0])})" + return f"static_cast<{INDEX_TYPE}>({r})" if expr.is_integer else r + + def _print_Min(self, expr): + args = [self._print(a) for a in expr.args] + if len(args) == 2: + return f"std::min({args[0]}, {args[1]})" + else: + # Initializer list overload + il = "{" + ", ".join(args) + "}" + return f"std::min({il})" + + def _print_Max(self, expr): + args = [self._print(a) for a in expr.args] + if len(args) == 2: + return f"std::max({args[0]}, {args[1]})" + else: + # Initializer list overload + il = "{" + ", ".join(args) + "}" + return f"std::max({il})" + + def _print_Abs(self, expr): + assert len(expr.args) == 1 + return f"std::abs({self._print(expr.args[0])})" + + def _print_cos(self, expr): + assert len(expr.args) == 1 + return f"std::cos({self._print(expr.args[0])})" + + def _print_cosh(self, expr): + assert len(expr.args) == 1 + return f"std::cosh({self._print(expr.args[0])})" + + def _print_acos(self, expr): + assert len(expr.args) == 1 + return f"std::acos({self._print(expr.args[0])})" + + def _print_sin(self, expr): + assert len(expr.args) == 1 + return f"std::sin({self._print(expr.args[0])})" + + def _print_sinh(self, expr): + assert len(expr.args) == 1 + return f"std::sinh({self._print(expr.args[0])})" + + def _print_asin(self, expr): + assert len(expr.args) == 1 + return f"std::asin({self._print(expr.args[0])})" + + def _print_tan(self, expr): + assert len(expr.args) == 1 + return f"std::tan({self._print(expr.args[0])})" + + def _print_tanh(self, expr): + assert len(expr.args) == 1 + return f"std::tanh({self._print(expr.args[0])})" + + def _print_atan(self, expr): + assert len(expr.args) == 1 + return f"std::atan({self._print(expr.args[0])})" + + def _print_Round(self, expr): + assert len(expr.args) == 1 + return f"std::lrint({self._print(expr.args[0])})" + + def 
_print_RoundDecimal(self, expr): + assert len(expr.args) == 2 + number, ndigits = expr.args + if number.is_integer: + # ndigits < 0 should have been filtered by the sympy function + assert ndigits < 0 + raise ValueError( + f"For integer inputs, only non-negative ndigits are currently supported, but got {ndigits}." + ) + return f"static_cast(std::nearbyint(1e{ndigits} * {self.paren(self._print(number))}) * 1e{-ndigits})" + + +# A function to print, useful for printing sympy symbols. +cexpr = CppPrinter().doprint + + +def cexpr_index(index): + return f"static_cast<{INDEX_TYPE}>({cexpr(index)})" + + +class RecordOptimizationContext: + def __init__(self, func_name: str = ""): + self.func_name = func_name + self.current_node: Optional[torch.fx.Node] = None + self.opt_ctx: Optional[OptimizationContext] = None + + def __enter__(self): + assert V.interpreter + assert V.interpreter.current_node + + self.current_node = V.interpreter.current_node + assert self.current_node is not None + if OptimizationContext.key in self.current_node.meta: + self.opt_ctx = self.current_node.meta[OptimizationContext.key] + else: + self.opt_ctx = OptimizationContext() + assert self.opt_ctx is not None + self.opt_ctx.ops_name = self.func_name + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + assert self.current_node + assert self.opt_ctx + self.current_node.meta[OptimizationContext.key] = self.opt_ctx + + def get_opt_ctx(self): + return self.opt_ctx + + def get_fx_node(self): + assert self.current_node + return self.current_node + + +def get_opt_ctx(node: torch.fx.Node) -> OptimizationContext: + return node.meta.get(OptimizationContext.key, None) + + +def get_current_node_opt_ctx() -> OptimizationContext: + assert V.interpreter.current_node + return get_opt_ctx(V.interpreter.current_node) + + +class CppVecUnsupportedError(Exception): + pass + + +class CppCSEVariable(CSEVariable): + def __init__(self, name, bounds: ValueRanges[Any]): + super().__init__(name, bounds) + self.is_vec = False + self.dtype: Optional[torch.dtype] = None + self.dependent_itervars: Set[sympy.Symbol] = set() + + def update_on_args(self, name, args, kwargs): + if name == "load": + # args[1] is index + self._set_dependent_itervars(args[1]) + else: + # propagate relevant itervars and is_vec from args + self.dependent_itervars.update( + *[ + arg.dependent_itervars + for arg in args + if isinstance(arg, CppCSEVariable) + ] + ) + if name == "index_expr": + self._set_dependent_itervars(args[0]) + if any(arg.is_vec for arg in args if isinstance(arg, CppCSEVariable)): + self.is_vec = True + # NOTE [dtype of CppCSEVariable] + # Deciding dtype according to the current optimization context is not + # always accurate since the dtypes are initialized during dtype propagation + # at the beginning of the codegen. It is possible that some ops are invoked + # during the codegen of the current op and take different dtypes from the + # current op. + # TODO(jgong5): A more accurate way of deciding the dtype of the variables is to + # propagate the dtypes here inside `update_on_args`. + if ( + hasattr(V.interpreter, "current_node") + and get_current_node_opt_ctx() is not None + ): + self.dtype = get_current_node_opt_ctx().dtype + + def _set_dependent_itervars(self, index: sympy.Expr): + """ + Set the relevant itervars for this variable based on the `index` expression. + This includes the itervars directly used in the `index` as well as relevant itervars + of other cse variables used in the `index`. 
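+        E.g. (illustrative): for an index like `x0 + 32*tmp3`, the kernel itervar `x0` is added
+        directly and the itervars that the cse variable `tmp3` depends on are merged in.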
+ """ + for s in index.free_symbols: + if s in V.kernel.itervars: + self.dependent_itervars.add(s) # type: ignore[arg-type] + elif s.name in V.kernel.cse.varname_map: # type: ignore[attr-defined] + self.dependent_itervars.update( + V.kernel.cse.varname_map[s.name].dependent_itervars # type: ignore[attr-defined] + ) + + def depends_on(self, itervar: sympy.Symbol): + return itervar in self.dependent_itervars + + +class CppOverrides(OpOverrides): + """Map element-wise ops to C++""" + + @staticmethod + def add(a, b): + return f"decltype({a})({a} + {b})" + + @staticmethod + def sub(a, b): + return f"decltype({a})({a} - {b})" + + @staticmethod + def mul(a, b): + return f"decltype({a})({a} * {b})" + + @staticmethod + def to_dtype(x, dtype, src_dtype=None): + assert dtype in DTYPE_TO_CPP, f"{dtype} missing from {__name__}.DTYPE_TO_CPP" + return f"c10::convert<{DTYPE_TO_CPP[dtype]}>({x})" + + @staticmethod + def to_dtype_bitcast(x, dtype, src_dtype): + assert dtype in DTYPE_TO_CPP, f"{dtype} missing from {__name__}.DTYPE_TO_CPP" + if src_dtype in (torch.float16, torch.bfloat16): + # c10::bit_cast requires the source and target have the bitwidth. + # Because the input tensor's dtype could be promoted, e.g. from float16 to + # float, we have to cast the tensor to its original source dtype before + # invoking bit_cast. We also need to convert the bit-casted tensor + # back to float to make sure we keep using higher precision values + # for the rest of the computation. + cast_x = f"c10::convert<{DTYPE_TO_CPP[src_dtype]}>({x})" + cast_x = f"c10::bit_cast<{DTYPE_TO_CPP[dtype]}>({cast_x})" + return f"c10::convert<{DTYPE_TO_CPP[torch.float32]}>({cast_x})" + else: + return f"c10::bit_cast<{DTYPE_TO_CPP[dtype]}>({x})" + + @staticmethod + def abs(x): + return f"std::abs({x})" + + @staticmethod + def sin(x): + return f"std::sin({x})" + + @staticmethod + def cos(x): + return f"std::cos({x})" + + @staticmethod + def neg(x): + return f"decltype({x})(-{x})" + + @staticmethod + def exp(x): + # return f"Sleef_expf_u10({x})" + return f"std::exp({x})" + + @staticmethod + def exp2(x): + return f"std::exp2({x})" + + @staticmethod + def expm1(x): + return f"std::expm1({x})" + + @staticmethod + def erf(x): + return f"std::erf({x})" + + @staticmethod + def erfc(x): + return f"std::erfc({x})" + + @staticmethod + def erfinv(x): + return f"calc_erfinv({x})" + + @staticmethod + def sqrt(x): + return f"std::sqrt({x})" + + @staticmethod + def rsqrt(x): + return f"1 / std::sqrt({x})" + + @staticmethod + def log1p(x): + bug = config.cpp.inject_log1p_bug_TESTING_ONLY + if bug == "accuracy": + return f"{x} + decltype({x})(1)" + elif bug is None: + return f"std::log1p({x})" + else: + raise AssertionError( + f"unrecognized config cpp.inject_log1p_bug_TESTING_ONLY = {bug!r}" + ) + + @staticmethod + def tan(x): + return f"std::tan({x})" + + @staticmethod + def tanh(x): + return f"std::tanh({x})" + + @staticmethod + def signbit(x): + return f"std::signbit({x})" + + @staticmethod + def pow(a, b): + return f"std::pow({a}, {b})" + + @staticmethod + def log(x): + return f"std::log({x})" + + @staticmethod + def round(x): + return f"std::nearbyint({x})" + + @staticmethod + def floor(x): + return f"std::floor({x})" + + @staticmethod + def floordiv(a, b): + # a and b are integer type + quot = f"{a} / {b}" + rem = f"{a} % {b}" + return f"(({a} < 0) != ({b} < 0) ? ({rem} != 0 ? 
{quot} - 1 : {quot}) : {quot})" + + @staticmethod + def ceil(x): + return f"std::ceil({x})" + + @staticmethod + def trunc(x): + return f"std::trunc({x})" + + @staticmethod + def truncdiv(a, b): + # a and b are integer type + return f"{a} / {b}" + + @staticmethod + def fmod(a, b): + return f"std::fmod({a}, {b})" + + @staticmethod + def isinf(x): + return f"std::isinf({x})" + + @staticmethod + def isnan(x): + return f"std::isnan({x})" + + @staticmethod + def lgamma(x): + return f"std::lgamma({x})" + + @staticmethod + def acos(x): + return f"std::acos({x})" + + @staticmethod + def acosh(x): + return f"std::acosh({x})" + + @staticmethod + def cosh(x): + return f"std::cosh({x})" + + @staticmethod + def sinh(x): + return f"std::sinh({x})" + + @staticmethod + def asin(x): + return f"std::asin({x})" + + @staticmethod + def asinh(x): + return f"std::asinh({x})" + + @staticmethod + def atan2(x, y): + return f"std::atan2({x}, {y})" + + @staticmethod + def atan(x): + return f"std::atan({x})" + + @staticmethod + def atanh(x): + return f"std::atanh({x})" + + @staticmethod + def copysign(x, y): + return f"std::copysign({x}, {y})" + + @staticmethod + def frexp(x): + cache_keys = f"frexp({x})[0]", f"frexp({x})[1]" + if all(cache_key in V.kernel.cse.cache for cache_key in cache_keys): + return tuple(V.kernel.cse.cache[cache_key] for cache_key in cache_keys) + + code = BracesBuffer() + exponent = V.kernel.cse.newvar() + mantissa = V.kernel.cse.newvar() + code.writeline(f"int32_t {exponent};") + code.writeline(f"auto {mantissa} = std::frexp({x}, &{exponent});") + V.kernel.compute.splice(code) + cse_vars = (mantissa, exponent) + for cache_key, cse_var in zip(cache_keys, cse_vars): + V.kernel.cse.cache[cache_key] = cse_var + return mantissa, exponent + + @staticmethod + def hypot(x, y): + return f"std::hypot({x}, {y})" + + @staticmethod + def log10(x): + return f"std::log10({x})" + + @staticmethod + def nextafter(x, y): + return f"std::nextafter({x}, {y})" + + @staticmethod + def relu(x): + bug = config.cpp.inject_relu_bug_TESTING_ONLY + if bug == "compile_error": + return "compile error!" + elif bug == "runtime_error": + return f"{x}; throw 1" + elif bug == "accuracy": + return f"{x} + decltype({x})(1)" + elif bug is None: + return f"std::max({x}, decltype({x})(0))" + else: + raise AssertionError( + f"unrecognized config cpp.inject_relu_bug_TESTING_ONLY = {bug!r}" + ) + + @staticmethod + def minimum(a, b): + return f"min_propagate_nan({a}, {b})" + + @staticmethod + def maximum(a, b): + return f"max_propagate_nan({a}, {b})" + + @staticmethod + def where(a, b, c): + return f"{a} ? 
{b} : {c}" + + @staticmethod + def mod(a, b): + return f"mod({a}, {b})" + + @staticmethod + def constant(val, dtype): + opt_ctx: OptimizationContext = get_current_node_opt_ctx() + assert opt_ctx and opt_ctx.dtype is not None + dtype = opt_ctx.dtype + if dtype in DTYPE_LOWP_FP: + # Since load promotes all half-precision inputs to float, constants + # must be promoted as well + dtype = torch.float32 + return value_to_cpp(val, DTYPE_TO_CPP[dtype]) + + @staticmethod + def index_expr(expr, dtype): + opt_ctx: OptimizationContext = get_current_node_opt_ctx() + assert opt_ctx and opt_ctx.dtype is not None + dtype = opt_ctx.dtype + return ops.to_dtype(cexpr(V.kernel.rename_indexing(expr)), dtype) + + @staticmethod + def masked(mask, body, other): + code = BracesBuffer() + + # Write masked operation into a lambda + body_var = V.kernel.cse.newvar() + code.writeline(f"auto {body_var} = [&]") + with V.kernel.swap_buffers(code), code.indent(): + result = body() + code.writeline(f"return {result};") + code.writeline(";") + V.kernel.compute.splice(code) + + # Use the lambda's return type as the type of other + other_code = value_to_cpp(other, f"decltype({body_var}())") + return f"{mask} ? {body_var}() : {other_code}" + + @staticmethod + def logical_and(a, b): + return f"{a} && {b}" + + @staticmethod + def logical_not(a): + return f"!{a}" + + @staticmethod + def logical_or(a, b): + return f"{a} || {b}" + + @staticmethod + def logical_xor(a, b): + return f"{a} != {b}" + + @staticmethod + def bitwise_and(a, b): + return f"decltype({a})({a} & {b})" + + @staticmethod + def bitwise_not(a): + return f"decltype({a})(~{a})" + + @staticmethod + def bitwise_or(a, b): + return f"decltype({a})({a} | {b})" + + @staticmethod + def bitwise_xor(a, b): + return f"decltype({a})({a} ^ {b})" + + @staticmethod + def bitwise_left_shift(a, b): + return f"decltype({a})({a} << {b})" + + @staticmethod + def bitwise_right_shift(a, b): + return f"decltype({a})({a} >> {b})" + + @staticmethod + def rand(seed: sympy.Expr, offset: sympy.Expr): + return f"normalized_rand_cpu({seed}, {offset})" + + @staticmethod + def randn(seed: sympy.Expr, offset: sympy.Expr): + return f"randn_cpu({seed}, {offset})" + + @staticmethod + def randint64(seed: sympy.Expr, offset: sympy.Expr, low, high): + return f"randint64_cpu({seed}, {offset}, {low}, {high})" + + @staticmethod + def sigmoid(x): + return f"decltype({x})(1) / (decltype({x})(1) + std::exp(-{x}))" + + @staticmethod + def sign(x): + code = BracesBuffer() + scalar_zero = f"decltype({x})(0)" + scalar_one = f"decltype({x})(1)" + code.writeline("[&]()") + with code.indent(): + code.writeline(f"auto left = {x} > 0 ? {scalar_one} : {scalar_zero};") + code.writeline(f"auto right = {x} < 0 ? {scalar_one} : {scalar_zero};") + code.writeline("return left - right;") + code.writeline("()") + return code + + +CppOverrides._initialize_pointwise_overrides("cpp") + + +class CppVecOverrides(CppOverrides): + """Map element-wise ops to aten vectorization C++""" + + def __new__(cls, *args, **kargs): + self = super().__new__(cls) + + def wrap(func): + # `CppVecKernel` generates both scalar ops and vector ops according to + # whether the inputs are scalars or vectors while all ops in `CppVecOverrides` + # (except for some ops explained below) assume the inputs are vectors. We wrap the ops in + # `CppVecOverrides` to broadcast scalar inputs to vectors if needed or fallback to + # `CppOverrides` when all inputs are scalars. 
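# The dispatch rule described above can be modeled in isolation. The sketch
# below is illustrative only and does not use the real Inductor classes:
# `Val`, `broadcast`, and the two add implementations are hypothetical
# stand-ins for CppCSEVariable, CppVecKernel.broadcast, and the scalar/vector
# override tables.
from dataclasses import dataclass

@dataclass
class Val:
    expr: str
    is_vec: bool = False

def broadcast(v: Val) -> Val:
    # model of the scalar-to-vector broadcast: splat the scalar across lanes
    return Val(f"Vectorized({v.expr})", is_vec=True)

def scalar_add(a: Val, b: Val) -> Val:
    return Val(f"decltype({a.expr})({a.expr} + {b.expr})")

def vector_add(a: Val, b: Val) -> Val:
    return Val(f"{a.expr} + {b.expr}", is_vec=True)

def add(a: Val, b: Val) -> Val:
    args = [a, b]
    if any(v.is_vec for v in args):
        # mixed scalar/vector inputs: broadcast the scalars, use the vector op
        args = [v if v.is_vec else broadcast(v) for v in args]
        return vector_add(*args)
    # all-scalar inputs: fall back to the scalar override
    return scalar_add(a, b)

print(add(Val("tmp0", is_vec=True), Val("3")).expr)  # tmp0 + Vectorized(3)
print(add(Val("3"), Val("4")).expr)                  # decltype(3)(3 + 4)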
+ # + # Notes on ops handled separately in their own functions: + # `ops.masked`: + # needs recursive handling of masked body. + # `ops.index_expr`: + # needs to further analyze the dependency of the index expression on + # the tiling itervar. + def wrapper(*args, **kwargs): + scalars = [ + arg + for arg in args + if isinstance(arg, CppCSEVariable) and not arg.is_vec + ] + vectors = [ + arg + for arg in args + if isinstance(arg, CppCSEVariable) and arg.is_vec + ] + new_args = list(args) + if scalars and vectors: + # broadcast scalar args to vector if needed + new_args = [] + vec_dtype = vectors[0].dtype + for arg in args: + if isinstance(arg, CppCSEVariable) and not arg.is_vec: + assert isinstance(V.kernel, CppVecKernel) + # align scalar data type to the vector for binary ops + if len(args) == 2 and arg.dtype != vec_dtype: + arg = ops.to_dtype(arg, vec_dtype) + arg = arg.value if isinstance(arg, OpsValue) else arg + # See NOTE [dtype of CppCSEVariable]: we have to fix arg.dtype since + # the dtype from optimization context could be wrong. + assert isinstance(arg, CppCSEVariable) + arg.dtype = vec_dtype + new_arg = V.kernel.broadcast(arg) + new_args.append(new_arg) + else: + new_args.append(arg) + if vectors: + return func(*new_args, **kwargs) + else: + # fallback to scalar ops + scalar_ops = super(CppVecOverrides, self) + scalar_func = getattr( + scalar_ops, func.__name__, scalar_ops.__getattr__(func.__name__) # type: ignore[attr-defined] + ) + assert scalar_func is not None + return scalar_func(*args, **kwargs) + + return wrapper + + for name, method in vars(CppVecOverrides).items(): + if getattr(method, "__class__", None) == staticmethod and name not in [ + "masked", + "index_expr", + ]: + setattr(self, name, wrap(method.__func__)) + return self + + @staticmethod + def add(a, b): + return f"{a} + {b}" + + @staticmethod + def sub(a, b): + return f"{a} - {b}" + + @staticmethod + def mul(a, b): + return f"{a} * {b}" + + @staticmethod + def truediv(a, b): + return f"{a} / {b}" + + @staticmethod + def abs(x): + return f"{x}.abs()" + + @staticmethod + def sin(x): + return f"{x}.sin()" + + @staticmethod + def cos(x): + return f"{x}.cos()" + + @staticmethod + def exp(x): + return f"{x}.exp()" + + @staticmethod + def exp2(x): + return f"{x}.exp2()" + + @staticmethod + def expm1(x): + # decompose for a better performance + vec_one = f"decltype({x})(1)" + return f"{x}.exp() - {vec_one}" + + @staticmethod + def erf(x): + return f"{x}.erf()" + + @staticmethod + def erfc(x): + return f"{x}.erfc()" + + @staticmethod + def erfinv(x): + return f"{x}.erfinv()" + + @staticmethod + def sqrt(x): + return f"{x}.sqrt()" + + @staticmethod + def eq(x, y): + return f"to_float_mask({x} == {y})" + + @staticmethod + def ne(x, y): + return f"to_float_mask({x} != {y})" + + @staticmethod + def lt(x, y): + return f"to_float_mask({x} < {y})" + + @staticmethod + def gt(x, y): + return f"to_float_mask({x} > {y})" + + @staticmethod + def le(x, y): + return f"to_float_mask({x} <= {y})" + + @staticmethod + def ge(x, y): + return f"to_float_mask({x} >= {y})" + + @staticmethod + def and_(x, y): + return f"{x} & {y}" + + @staticmethod + def rsqrt(x): + return f"{x}.rsqrt()" + + @staticmethod + def pow(a, b): + return f"{a}.pow({b})" + + @staticmethod + def log(x): + return f"{x}.log()" + + @staticmethod + def round(x): + return f"{x}.round()" + + @staticmethod + def floor(x): + return f"{x}.floor()" + + @staticmethod + def ceil(x): + return f"{x}.ceil()" + + @staticmethod + def trunc(x): + return f"{x}.trunc()" + + @staticmethod 
+ def fmod(a, b): + return f"{a}.fmod({b})" + + @staticmethod + def lgamma(x): + return f"{x}.lgamma()" + + @staticmethod + def logical_and(a, b): + return f"({a} != 0) & ({b} != 0)" + + @staticmethod + def logical_not(a): + return f"{a} == 0" + + @staticmethod + def logical_or(a, b): + return f"({a} != 0) | ({b} != 0)" + + @staticmethod + def logical_xor(a, b): + return f"({a} != 0) ^ ({b} != 0)" + + @staticmethod + def tan(a): + return f"{a}.tan()" + + @staticmethod + def tanh(a): + vec_one = f"decltype({a})(1)" + vec_two = f"decltype({a})(2)" + vec_minus_two = f"decltype({a})(-2)" + return f"{vec_two} / ({vec_one} + ({vec_minus_two} * {a}).exp()) - {vec_one}" + + @staticmethod + def reciprocal(a): + return f"{a}.reciprocal()" + + @staticmethod + def atan(x): + return f"{x}.atan()" + + @staticmethod + def acos(x): + return f"{x}.acos()" + + @staticmethod + def asin(x): + return f"{x}.asin()" + + @staticmethod + def cosh(x): + return f"{x}.cosh()" + + @staticmethod + def sinh(x): + return f"{x}.sinh()" + + @staticmethod + def log10(x): + return f"{x}.log10()" + + @staticmethod + def nextafter(x): + return f"{x}.nextafter()" + + @staticmethod + def copysign(a, b): + return f"{a}.copysign({b})" + + @staticmethod + def atan2(a, b): + return f"{a}.atan2({b})" + + @staticmethod + def hypot(a, b): + return f"{a}.hypot({b})" + + @staticmethod + def atanh(x): + # For real x, atanh(x) = 1/2 * log((1+x)/(1-x)) + vec_one = f"decltype({x})(1)" + vec_one_half = f"decltype({x})(0.5)" + return f"{vec_one_half} * (({vec_one} + {x})/({vec_one} - {x})).log()" + + @staticmethod + def asinh(x): + # For real x, asinh(x) = log(x + sqrt(1 + x**2)) + vec_one = f"decltype({x})(1)" + return f"({x} + ({vec_one} + {x}*{x}).sqrt()).log()" + + @staticmethod + def acosh(x): + return f"{x}.acosh()" + + @staticmethod + def relu(x): + bug = config.cpp.inject_relu_bug_TESTING_ONLY + if bug == "compile_error": + return "compile error!" 
+ elif bug == "runtime_error": + return f"{x}; throw 1" + elif bug == "accuracy": + return f"{x} + decltype({x})(1)" + elif bug is None: + return f"at::vec::clamp_min({x}, decltype({x})(0))" + else: + raise AssertionError( + f"unrecognized config cpp.inject_relu_bug_TESTING_ONLY = {bug!r}" + ) + + # TODO: this seems to be dead + @staticmethod + def sigmoid(x): + return f"decltype({x})(1)/(decltype({x})(1) + {x}.neg().exp())" + + @staticmethod + def neg(x): + return f"{x}.neg()" + + @staticmethod + def floordiv(a, b): + # a and b are integer type + _t = f"decltype({a})" + quot = f"{a} / {b}" + has_rem = f"({a} % {b} != {_t}(0))" + is_neg = f"(({a} < {_t}(0)) != ({b} < {_t}(0)))" + return f"{_t}::blendv({quot}, {quot} - {_t}(1), {has_rem} & {is_neg})" + + @staticmethod + def truncdiv(a, b): + # a and b are integer type + return f"{a} / {b}" + + @staticmethod + def minimum(a, b): + return f"at::vec::minimum({a}, {b})" + + @staticmethod + def maximum(a, b): + return f"at::vec::maximum({a}, {b})" + + @staticmethod + def square(a): + return f"{a} * {a}" + + @staticmethod + def where(a, b, c): + assert isinstance(b, CppCSEVariable) + if b.dtype != torch.float: + raise CppVecUnsupportedError( + "where with non-float tensor is not supported in vectorized codegen" + ) + return f"decltype({b})::blendv({c}, {b}, {a})" + + @staticmethod + def sign(x): + code = BracesBuffer() + vec_zero = f"decltype({x})(0)" + vec_one = f"decltype({x})(1)" + blendv_l = f"decltype({x})::blendv({vec_zero}, {vec_one}, {vec_zero} < {x})" + blendv_r = f"decltype({x})::blendv({vec_zero}, {vec_one}, {x} < {vec_zero})" + code.writeline("[&]()") + with code.indent(): + code.writeline(f"auto left = {blendv_l};") + code.writeline(f"auto right = {blendv_r};") + code.writeline("return left - right;") + code.writeline("()") + return code + + @staticmethod + def to_dtype(x, dtype, src_dtype=None): + assert dtype in [ + torch.bool, + torch.float, + torch.bfloat16, + torch.float16, + torch.uint8, + torch.int8, + torch.int32, + torch.int64, + ], f"{__name__} does not support {dtype}" + node: torch.fx.Node = V.interpreter.current_node + assert node and isinstance(node, torch.fx.Node) + opt_ctx_x = get_opt_ctx(node.args[1]) + assert opt_ctx_x + if opt_ctx_x.dtype in (torch.float, torch.float32) and dtype == torch.bool: + return f"vec_convert_to_mask({x})" + if opt_ctx_x.dtype == torch.bool and dtype in (torch.float, torch.float32): + return f"mask_convert_to_float({x})" + if opt_ctx_x.dtype == torch.bool and dtype in DTYPE_LOWP_FP: + return f"mask_convert_to_lowp<{DTYPE_TO_CPP[dtype]}>({x})" + if opt_ctx_x.dtype == torch.bool and dtype == torch.int64: + return f"mask_convert_to_int64({x})" + if opt_ctx_x.dtype in (torch.float, torch.float32) and dtype in DTYPE_LOWP_FP: + return f"cvt_fp32_to_lowp_fp<{DTYPE_TO_CPP[dtype]}>({x})" + if opt_ctx_x.dtype in DTYPE_LOWP_FP and dtype in (torch.float, torch.float32): + return f"cvt_lowp_fp_to_fp32<{DTYPE_TO_CPP[opt_ctx_x.dtype]}>({x})" + if opt_ctx_x.dtype in (torch.uint8, torch.int8) and dtype in ( + torch.float, + torch.float32, + ): + # Note: this function only convert inputs number of elements equal to at::vec::Vectorized.size() + return f"at::vec::convert_int8_to_float({x})" + if opt_ctx_x.dtype in (torch.float, torch.float32) and dtype in ( + torch.uint8, + torch.int8, + ): + # if we already handle the saturation previously. + # * Pattern match of quantization op in the loop body. + # * Skip the explicit saturation and clamp inside at::vec::convert_float_to_int8. 
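# For reference, the saturation that the comment above assumes has already been
# handled by the matched quantization pattern corresponds roughly to the
# following eager-mode computation (an illustrative sketch; the scale and
# zero_point values are made up, and this is not the Inductor lowering itself):
import torch

def quant_to_uint8(x: torch.Tensor, scale: float, zero_point: int) -> torch.Tensor:
    q = torch.round(x / scale) + zero_point
    # clamp to the uint8 range before the narrowing conversion
    return q.clamp(0, 255).to(torch.uint8)

print(quant_to_uint8(torch.tensor([-1.0, 0.0, 100.0]), scale=0.1, zero_point=128))
# tensor([118, 128, 255], dtype=torch.uint8)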
+ return f"at::vec::convert_float_to_int8<{DTYPE_TO_CPP[dtype]}>({x})" + if opt_ctx_x.dtype == torch.int32 and dtype == torch.float: + return f"at::vec::convert_to_fp_of_same_size({x})" + if opt_ctx_x.dtype == torch.float and dtype == torch.int32: + return f"at::vec::convert_to_int_of_same_size({x})" + if opt_ctx_x.dtype == torch.int64 and dtype == torch.float: + return f"cvt_int64_to_fp32({x})" + if opt_ctx_x.dtype == torch.float and dtype == torch.int64: + return f"cvt_fp32_to_int64({x})" + if opt_ctx_x.dtype == torch.int32 and dtype == torch.int64: + return f"cvt_int32_to_int64({x})" + if opt_ctx_x.dtype == torch.int64 and dtype == torch.int32: + return f"cvt_int64_to_int32({x})" + # TODO(jgong5): support conversion for other types + # currently we only allow load/store torch.uint8 and handle conversion there + return f"({x})" + + @staticmethod + def log1p(x): + bug = config.cpp.inject_log1p_bug_TESTING_ONLY + if bug == "accuracy": + return f"{x} + decltype({x})(1)" + elif bug is None: + return f"{x}.log1p()" + else: + raise AssertionError( + f"unrecognized config cpp.inject_log1p_bug_TESTING_ONLY = {bug!r}" + ) + + @staticmethod + def masked(mask, body, other): + assert isinstance(V.kernel, CppVecKernel) + code = BracesBuffer() + var = V.kernel.cse.newvar() + with V.kernel.masked(mask) as new_mask: + code.writeline(f"auto {var} = [&]") + with V.kernel.swap_buffers(code), code.indent(): + result = body() + code.writeline(f"return {result};") + code.writeline(";") + V.kernel.compute.splice(code) + + body_code = f"{var}()" + body_code_vec = ( + body_code + if result.is_vec + else f"{V.kernel._get_vec_type(torch.float)}({body_code})" + ) + other_code = value_to_cpp(other, "float") + other_code_vec = f"{V.kernel._get_vec_type(torch.float)}({other_code})" + assert isinstance(new_mask, CppCSEVariable), new_mask + if new_mask.is_vec or result.is_vec: + if result.dtype != torch.float: + raise CppVecUnsupportedError( + "masked with non-float tensor is not supported in vectorized codegen" + ) + type = f"decltype({body_code_vec})" + float_mask = f"to_float_mask({new_mask})" + code = BracesBuffer() + code.writeline("[&]") + with V.kernel.swap_buffers(code), code.indent(): + code.writeline(f"if (all_zero({float_mask}))") + with code.indent(): + code.writeline(f"return {other_code_vec};") + code.writeline("else") + with code.indent(): + code.writeline( + f"return {type}::blendv({other_code_vec}, {body_code_vec}, {float_mask});" + ) + code.writeline("()") + csevar = V.kernel.cse.generate( + V.kernel.compute, + code, + ) + else: + csevar = V.kernel.cse.generate( + V.kernel.compute, f"{mask} ? {body_code} : {other_code}" + ) + # `result` is explicitly added to the args for correct propagation + # of relevant itervars and vectorization status. 
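# The lambda-plus-blendv pattern emitted above has the same semantics as this
# tensor-level model (a minimal sketch, not the generated C++): short-circuit
# to `other` when every mask lane is zero, otherwise blend per lane.
import torch

def masked_model(mask: torch.Tensor, body_value: torch.Tensor, other: float) -> torch.Tensor:
    if not mask.any():  # models the all_zero(float_mask) fast path
        return torch.full_like(body_value, other)
    # models blendv(other_vec, body_vec, float_mask)
    return torch.where(mask, body_value, torch.full_like(body_value, other))

mask = torch.tensor([True, False, True, False])
body_value = torch.tensor([1.0, 2.0, 3.0, 4.0])
print(masked_model(mask, body_value, -1.0))  # tensor([ 1., -1.,  3., -1.])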
+ csevar.update_on_args("masked", (mask, body, other, result), {}) + return csevar + + @staticmethod + def index_expr(expr, dtype): + opt_ctx: OptimizationContext = get_current_node_opt_ctx() + assert opt_ctx and opt_ctx.dtype is not None + dtype = opt_ctx.dtype + assert dtype == torch.int32 + assert isinstance(V.kernel, CppVecKernel) + index = V.kernel.rename_indexing(expr) + tiling_var = V.kernel.itervars[V.kernel.tiling_idx] + stride = stride_at_vec_range(index, tiling_var, V.kernel.tiling_factor) + if stride.is_number and not V.kernel.index_indirect_depends_on( + index, tiling_var + ): + if stride == 0: + return CppOverrides.index_expr(expr, dtype) + value = ops.to_dtype(cexpr(index), dtype) + if isinstance(value, OpsValue): + value = value.value + csevar = V.kernel.arange(value, stride) + else: + csevar = V.kernel.load_non_contiguous(None, index, dtype, V.kernel.compute) + csevar.update_on_args("index_expr", (expr, dtype), {}) + return csevar + + +CppVecOverrides._initialize_pointwise_overrides("cppvec") + + +class CppTile2DOverrides(CppVecOverrides): + @staticmethod + def index_expr(expr, dtype): + assert isinstance(V.kernel, CppTile2DKernel) + expr = V.kernel.transform_indexing(expr) + return CppVecOverrides.index_expr(expr, dtype) + + +class CppKernel(Kernel): + overrides = CppOverrides # type: ignore[assignment] + sexpr = cexpr + newvar_prefix = "auto " + suffix = ";" + + def __init__(self, args, num_threads): + super().__init__(args) + self.call_ranges: Optional[Tuple[sympy.Expr, ...]] = None + self.ranges: List[sympy.Expr] = [] + self.itervars: List[sympy.Symbol] = [] + self.reduction_depth = None + self.reduction_prefix = IndentedBuffer() + self.reduction_suffix = IndentedBuffer() + self.reduction_var_map = {} + self.reduction_cse = CSE(self.newvar_prefix, self.suffix, name_prefix="tmp_acc") + self.preloads = IndentedBuffer() + self.poststores = IndentedBuffer() + self.num_threads = num_threads # num_threads the kernel specialized for + self.reduction_omp_dec: Dict[Tuple[str, str], str] = {} + + @contextlib.contextmanager + def masked(self, mask): + """Context manager to add an additional mask to loads and stores.""" + prior = self._load_mask + if prior: + mask = ops.and_(mask, prior) + if isinstance(mask, OpsValue): + mask = mask.value + assert isinstance(mask, CppCSEVariable) + # see NOTE [dtype of CppCSEVariable] + # mask's dtype should be bool + mask.dtype = torch.bool + + self._load_mask = mask + try: + yield mask + finally: + self._load_mask = prior + + def cache_fp32_cse_var_before_lowp_store(self, var_to_store): + """ + https://github.com/pytorch/pytorch/issues/115260 + For FusedSchedulerNode[node1, node2], the node2 loads what node1 stores and the buffer is + in low-precision floating point data type. When the output of node1 also serves as the output of the + kernel, the result of nodes would be different from the case when output of node1 is not the output + of the kernel (where we don't need to insert `to_dtype` for legalization). To address the problem, on + storing the lowp node1 output, we also add the inverse dtype conversion to high precision data type + to the cse cache. + + Example (pseudo code): + node1_output = ... + node1_output_lowp = to_dtype(node1_output, dtype=torch.bfloat16) + store(buf, node1_output_lowp) + node2_input_lowp = load(buf) + node2_input = to_dtype(node2_input_lowp, dtype=torch.float) + + Without cse cache trick: + node1_output = ... 
+ node1_output_lowp = to_dtype(node1_output, dtype=torch.bfloat16) + store(buf, node1_output_lowp) + node2_input_lowp = node_output_lowp # hit store cache + node2_input = to_dtype(node2_input_lowp, dtype=torch.float) + + With cse cache trick: + node1_output = ... + node1_output_lowp = to_dtype(node1_output, dtype=torch.bfloat16) + # also add `to_dtype(node1_input_lowp, dtype=torch.float)` -> `node1_output` to cse cache + store(buf, node1_output_lowp) + node2_input_lowp = node_output_lowp # hit store cache + node2_input = node1_output # hit cse cache + """ + + if var_to_store.dtype not in DTYPE_LOWP_FP: + # only need to cache fp32 cse var while var_to_store is lowp data + return + + def find_fp32_var(var, cache): + fp32_cse_var = None + fp32_cse_var_name = None + lowp_dtype = None + for expr, cse_var in cache.items(): + if cse_var == var: + lowp_dtype = is_to_lowp_dtype(expr) + if lowp_dtype: + m = re.search(r"tmp\d+", expr) + assert m + fp32_cse_var_name = m.group() + if fp32_cse_var_name: + for cse_var in cache.values(): + if cse_var.name == fp32_cse_var_name: + fp32_cse_var = cse_var + break + assert fp32_cse_var is not None + return fp32_cse_var, lowp_dtype + + fp32_var, lowp_dtype = find_fp32_var(var_to_store, self.cse.cache) + if fp32_var: + self.cse.cache[ + get_lowp_to_fp32_expr(var_to_store, lowp_dtype, self) + ] = fp32_var + + def scale_index_with_offset( + self, index: sympy.Expr, scale=1, itervar_idx=-1, offset=0 + ): + var = self.itervars[itervar_idx] + replacement = {var: var * scale + offset} + new_index = sympy_subs(index, replacement) + return new_index + + def index_to_str(self, index: sympy.Expr) -> str: + """ + Convert an index expr to a string that can be used in cpp code. + e.g. a sympy expression "s2" may actually appear as "ks1" in the cpp kernel. + """ + return cexpr(self.rename_indexing(index)) + + def index_indirect_depends_on(self, index: sympy.Expr, itervar: sympy.Symbol): + """ + Check if an index has free symbol CppCSEVariable that depends on `itervar`. + """ + return any( + self.cse.varname_map[s.name].depends_on(itervar) # type: ignore[attr-defined] + for s in index.free_symbols + if s.name in self.cse.varname_map # type: ignore[attr-defined] + and isinstance(self.cse.varname_map[s.name], CppCSEVariable) # type: ignore[attr-defined] + ) + + def index_depends_on(self, index: sympy.Expr, itervar: sympy.Symbol): + return itervar in index.free_symbols or self.index_indirect_depends_on( + index, itervar + ) + + def load(self, name: str, index: sympy.Expr): + var = self.args.input(name) + index = self.rename_indexing(index) + line = f"{var}[{cexpr_index(index)}]" + if V.graph.get_dtype(name) in [torch.float16]: + line = f"static_cast({line})" + csevar = self.cse.generate(self.loads, line) + csevar.update_on_args("load", (name, index), {}) + return csevar + + def store(self, name, index, value, mode=None): + assert "buf" in name + var = self.args.output(name) + self.cache_fp32_cse_var_before_lowp_store(value) + index = self.rename_indexing(index) + if mode is None: + line = f"{var}[{cexpr_index(index)}] = {value};" + elif mode == "atomic_add": + if not config.cpp.dynamic_threads and self.num_threads == 1: + line = f"{var}[{cexpr_index(index)}] += {value};" + else: + dtype = V.graph.get_dtype(name) + # mirroring static_cast(...) 
in load: + value = f"static_cast<{DTYPE_TO_CPP[dtype]}>({value})" + line = f"atomic_add(&{var}[{cexpr_index(index)}], {value});" + else: + raise NotImplementedError(f"store mode={mode}") + self.stores.writeline(DeferredLine(name, line)) + + def reduction(self, dtype, src_dtype, reduction_type, value): + argmax_or_argmin = reduction_type in {"argmax", "argmin"} + + reduction_key = src_dtype, reduction_type, value + if reduction_key in self.reduction_cse.reduction_cache: + return self.reduction_cse.reduction_cache[reduction_key] + + acc = self.reduction_cse.generate( + self.loads, f"reduction {reduction_key}", write=False + ) + self.reduction_var_map[acc] = reduction_type + if argmax_or_argmin: + self.reduction_prefix.writelines( + argmax_argmin_prefix(reduction_type, src_dtype, acc) + ) + compare_op = ( + "greater_or_nan" if reduction_type == "argmax" else "less_or_nan" + ) + assert self.reduction_depth is not None + index = self.itervars[self.reduction_depth] + for i in range(self.reduction_depth + 1, len(self.itervars)): + index = index * self.ranges[i] + self.itervars[i] + self.stores.writelines( + [ + f"if(!({compare_op}({acc}.value, {value}, {acc}.index, {cexpr_index(index)}))) {{", + f" {acc}.index = {cexpr_index(index)}; {acc}.value = {value};", + "}", + ], + ) + else: + acc_type = reduction_acc_type(reduction_type, dtype) + + if (reduction_type, acc_type) not in self.reduction_omp_dec: + if RTYPE_TO_CPP[reduction_type] not in NATIVE_OMP_RTYPES: + # Scalar reduction for other reductions are declared by default + self.reduction_prefix.splice( + f"""\ + #pragma omp declare reduction(\ + {RTYPE_TO_CPP[reduction_type]}:{acc_type}:\ + omp_out = {reduction_combine(reduction_type, "omp_out", "omp_in")}) \ + initializer(omp_priv={{{reduction_init(reduction_type, dtype)}}}) + """ + ) + self.reduction_omp_dec[reduction_type, acc_type] = RTYPE_TO_CPP[ + reduction_type + ] + + self.reduction_prefix.writeline( + f"{acc_type} {acc} = {reduction_init(reduction_type, dtype)};" + ) + self.stores.writeline( + f"{acc} = {reduction_combine(reduction_type, acc, value)};" + ) + + result = reduction_project(reduction_type, acc) + self.reduction_cse.reduction_cache[reduction_key] = result + return result + + def store_reduction(self, name, index, value): + index = self.rename_indexing(index) + var = self.args.output(name) + self.reduction_suffix.writeline( + DeferredLine(name, f"{var}[{cexpr_index(index)}] = {value};") + ) + + def set_ranges(self, lengths, reduction_lengths): + if self.call_ranges: + assert self.call_ranges == tuple(lengths) + tuple( + reduction_lengths + ), f"{self.call_ranges} == {tuple(lengths)} + {tuple(reduction_lengths)}" + assert self.reduction_depth == len(lengths) + else: + self.call_ranges = tuple(lengths) + tuple(reduction_lengths) + self.ranges = [self.rename_indexing(x) for x in self.call_ranges] + self.itervars = [ + sympy_index_symbol(f"x{n}") for n in range(len(self.ranges)) + ] + self.reduction_depth = len(lengths) + return ( + self.itervars[: self.reduction_depth], + self.itervars[self.reduction_depth :], + ) + + def size_hint(self): + return V.graph.sizevars.size_hint( + sympy_product(self.call_ranges), fallback=8192 + ) + + def codegen_loops_impl(self, loop_nest, code, worksharing): + threads = parallel_num_threads() + assert self.call_ranges is not None + par_depth = self.decide_parallel_depth( + self.call_ranges[: loop_nest.max_parallel_depth()], threads + ) + with contextlib.ExitStack() as stack: + if par_depth: + if loop_nest.is_reduction_only(): + # need to close 
the worksharing scope to define reduction vars outside it + worksharing.close() + else: + worksharing.parallel(threads) + loop_nest.mark_parallel(par_depth) + elif threads > 1: + if worksharing.single(): + stack.enter_context(code.indent()) + + def gen_kernel(kernel): + with contextlib.ExitStack() as stack: + assert kernel + if hasattr(kernel, "codegen_inner_loops"): + code.splice(kernel.preloads) + kernel.codegen_inner_loops(code) + stack.enter_context(code.indent()) + code.splice(kernel.loads) + code.splice(kernel.compute) + code.splice(kernel.stores) + if hasattr(kernel, "codegen_inner_loops"): + code.splice(kernel.poststores) + + def get_reduction_code_buffer(loops, is_suffix=True): + for loop in loops: + for kernel in loop.get_kernels(): + if is_suffix: + return kernel.reduction_suffix + else: + return kernel.reduction_prefix + return None + + def gen_loops(loops: List[LoopLevel], in_reduction=False): + with contextlib.ExitStack() as stack_outer: + if loops: + loop = loops[0] + if loop.is_reduction() and not in_reduction: + reduction_prefix = get_reduction_code_buffer( + loops, is_suffix=False + ) + if reduction_prefix: + stack_outer.enter_context(code.indent()) + code.splice(reduction_prefix) + if loop_nest.is_reduction_only() and loop.parallel: + worksharing.parallel(threads) + + for loop in loops: + gen_loop(loop, in_reduction) + + if loops: + loop = loops[0] + if loop_nest.is_reduction_only() and loop.parallel: + worksharing.close() + if loop.is_reduction() and not in_reduction: + code.splice( + get_reduction_code_buffer(loops, is_suffix=True) + ) + + def gen_loop(loop: LoopLevel, in_reduction=False): + with contextlib.ExitStack() as stack: + loop_lines = loop.lines() + if loop_lines is None: + return + code.writelines(loop_lines) + stack.enter_context(code.indent()) + # generate inner loops or loop body + if loop.inner: + gen_loops(loop.inner, loop.is_reduction()) + else: + kernels = loop.get_kernels() + assert len(kernels) == 1 + gen_kernel(kernels[0]) + + stack.enter_context(code.indent()) + if loop_nest.root: + gen_loops(loop_nest.root) + else: + gen_kernel(loop_nest.kernel) + + def codegen_loops(self, code, worksharing): + loop_nest = LoopNestWithSplit.build(self) + self.codegen_loops_impl(loop_nest, code, worksharing) + + @property + def assert_function(self) -> str: + if V.graph.aot_mode: + return "AOTI_TORCH_CHECK" + else: + return "TORCH_CHECK" + + def decide_parallel_depth(self, ranges, threads): + seq = self.size_hint() + par = 1 + depth = 0 + for expr in ranges: + hint = V.graph.sizevars.size_hint(expr, fallback=8192) + if par >= 2 * threads or par == threads: + break + if seq // threads < config.cpp.min_chunk_size: + # not enough work + break + depth += 1 + par *= hint + seq /= hint + # if we assume thread number is dynamic, make sure we + # have at least one parallel scope and let OMP runtime + # to manage the serial vs. parallel. 
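# The parallel-depth heuristic above can be modeled standalone. In this sketch
# the thread count, loop-range hints, and min_chunk_size are plain parameters
# with illustrative values rather than the real config/sizevars lookups:
def model_parallel_depth(range_hints, threads, min_chunk_size=4096):
    seq = 1
    for hint in range_hints:
        seq *= hint
    par, depth = 1, 0
    for hint in range_hints:
        if par >= 2 * threads or par == threads:
            break  # enough parallelism gathered from the outer loops
        if seq // threads < min_chunk_size:
            break  # per-thread work would be too small to amortize
        depth += 1
        par *= hint
        seq /= hint
    return depth

print(model_parallel_depth([64, 128, 512], threads=16))  # 1: only the outer loop is parallel
print(model_parallel_depth([4, 8], threads=16))          # 0: not enough work per thread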
+ if config.cpp.dynamic_threads and depth == 0 and len(ranges) > 0: + depth = 1 + return depth + + @contextlib.contextmanager + def write_to_suffix(self): + prior = (self.loads, self.compute, self.stores, self.cse) + self.loads = IndentedBuffer() + self.compute = IndentedBuffer() + self.stores = IndentedBuffer() + self.cse = self.cse.clone() + yield + self.reduction_suffix.splice(self.loads) + self.reduction_suffix.splice(self.compute) + self.reduction_suffix.splice(self.stores) + (self.loads, self.compute, self.stores, self.cse) = prior + + def create_cse_var(self, *args, **kwargs): + return CppCSEVariable(*args, **kwargs) + + +class CppVecKernel(CppKernel): + overrides = CppVecOverrides # type: ignore[assignment] + + def __init__( + self, + args, + num_threads, + tiling_factor=0, + tiling_idx=-1, + tiling_dtype=torch.float, + ): + super().__init__(args, num_threads) + self.vec_isa = codecache.pick_vec_isa() + assert self.vec_isa + if tiling_factor == 0: + tiling_factor = self.vec_isa.nelements(dtype=tiling_dtype) + self.tiling_factor = tiling_factor + self.tiling_idx = tiling_idx + + def _get_num_vectors(self, dtype: torch.dtype) -> int: + num_vectors = math.ceil( + self.tiling_factor * dtype.itemsize * 8 / self.vec_isa.bit_width() + ) + assert num_vectors >= 1 + return num_vectors + + def _get_vec_type(self, dtype: torch.dtype) -> str: + num_vectors = self._get_num_vectors(dtype) + if num_vectors == 1: + return f"at::vec::Vectorized<{DTYPE_TO_CPP[dtype]}>" + else: + return f"at::vec::VectorizedN<{DTYPE_TO_CPP[dtype]},{num_vectors}>" + + def _get_vec_load_line( + self, + var: str, + index: sympy.Expr, + dtype: torch.dtype, + load_mask: Optional[CppCSEVariable] = None, + ): + """ + Get a load line str that loads a vector from `var` at `index` of type `dtype`. + If `load_mask` is not None, we do a masked load accordingly. + Notes on the `dtype`: + 1. We always load `self.tiling_factor` number of elements regardless of the `dtype`. + It means we load half of the vector lanes for 16-bit data types and quarter of the + vector lanes for 8-bit data types. + 2. `torch.bool` and `torch.uint8` could mean masks and we load them as float mask vectors. + """ + opt_ctx: OptimizationContext = get_current_node_opt_ctx() + assert opt_ctx is not None + load_mask_str = f"to_float_mask({load_mask})" if load_mask else None + loadbuf = f"{var} + {cexpr_index(index)}" if index != 0 else var + if dtype in (torch.uint8, torch.int8) and opt_ctx.is_load_int8_as_float: + assert self._get_num_vectors(torch.uint8) == 1 + line = ( + f"masked_load({loadbuf}, {load_mask_str})" + if load_mask_str + else f"at::vec::Vectorized<{DTYPE_TO_CPP[dtype]}>::loadu_one_fourth({loadbuf})" + ) + elif opt_ctx.is_load_as_mask: + line = f"flag_to_float_vec({loadbuf})" + elif dtype in DTYPE_LOWP_FP: + line = ( + f"masked_load({loadbuf}, {load_mask_str})" + if load_mask_str + else f"{self._get_vec_type(dtype)}::loadu({loadbuf}, {self.tiling_factor})" + ) + else: + line = ( + f"masked_load({loadbuf}, {load_mask_str})" + if load_mask_str + else f"{self._get_vec_type(dtype)}::loadu({loadbuf})" + ) + return line + + def load_non_contiguous( + self, + var: Optional[str], + index: sympy.Expr, + dtype: torch.dtype, + buffer: Optional[IndentedBuffer] = None, + ) -> CppCSEVariable: + """ + Load a vector in a non-contiguous way. The vector is initialized from an array that is + filled in an inner loop over the tiling factor. + :param var: buffer to load from, i.e. `var[transformed(index)]`. If None, we load the index + as index expression, i.e. 
`transformed(index)`. + :param index: index into the `var` or the index expression by its own if `var` is None. + The `index` could contain indirect indexing or the tiling itervar. When used in + the inner loop, the index is transformed as follows: + 1. the index is linearized along the tiling dim. + 2. the indirect indexing vector variables are transformed into arrays over the tiling dim. + :param dtype: data type of `var` or `index` if `var` is None. + :param buffer: the code buffer to write the generated code to. If None, we write to `self.loads`. + :return: a CppCSEVariable that represents the loaded vector. + """ + if buffer is None: + buffer = self.loads + + def get_result_size(dtype: torch.dtype) -> int: + if dtype.itemsize < 4: + return self.tiling_factor * (4 // dtype.itemsize) + else: + return self.tiling_factor + + def vec_to_array(vec_var: CppCSEVariable) -> CppCSEVariable: + assert vec_var.is_vec + code = BracesBuffer() + code.writeline("[&]") + with self.swap_buffers(code), code.indent(): + vec_dtype = vec_var.dtype + assert vec_dtype is not None + if vec_dtype == torch.bool: + vec_dtype = torch.float + result_size = get_result_size(vec_dtype) + code.writeline( + f"__at_align__ std::array<{DTYPE_TO_CPP[vec_dtype]}, {result_size}> tmpbuf;" + ) + line = f"{vec_var}.store(tmpbuf.data());" + code.writeline(line) + code.writeline("return tmpbuf;") + code.writeline("()") + csevar = self.cse.generate(buffer, code) + assert isinstance(csevar, CppCSEVariable) + return csevar + + opt_ctx: OptimizationContext = get_current_node_opt_ctx() + assert opt_ctx is not None + is_mask = opt_ctx.is_load_as_mask + code = BracesBuffer() + code.writeline("[&]") + with self.swap_buffers(code), code.indent(): + result_type = "float" if is_mask else f"{DTYPE_TO_CPP[dtype]}" + result_size = get_result_size(dtype) + result_declare = ( + f"__at_align__ std::array<{result_type}, {result_size}> tmpbuf;" + ) + code.writeline(result_declare) + itervar_inner = sympy_index_symbol( + f"{self.itervars[self.tiling_idx]}_inner" + ) + replacements = {} + for indirect_var in ( + self.cse.varname_map[s.name] # type: ignore[attr-defined] + for s in index.free_symbols + if s.name.startswith("tmp") # type: ignore[attr-defined] + ): + assert isinstance(indirect_var, CppCSEVariable) + if indirect_var.is_vec: + array_var = vec_to_array(indirect_var) + replacements[indirect_var] = f"{array_var}[{itervar_inner}]" + load_mask = None + if self._load_mask is not None: + assert isinstance(self._load_mask, CppCSEVariable), self._load_mask + if self._load_mask.is_vec: + load_mask = ( + f"vector_lane_mask_check({self._load_mask}, {itervar_inner})" + ) + else: + load_mask = f"{self._load_mask} != 0" + index = sympy_subs(index, replacements) # type: ignore[arg-type] + index = self.scale_index_with_offset( + index, itervar_idx=self.tiling_idx, offset=itervar_inner + ) + if codecache.is_gcc(): + code.writeline(f"#pragma GCC unroll {self.tiling_factor}") + else: + code.writeline(f"#pragma unroll {self.tiling_factor}") + code.writeline( + f"for (long {itervar_inner} = 0; {itervar_inner} < {self.tiling_factor}; {itervar_inner}++)" + ) + with code.indent(), contextlib.ExitStack() as stack: + rhs = ( + f"{var}[{cexpr_index(index)}]" + if var is not None + else f"{cexpr_index(index)}" + ) + if is_mask: + rhs = f"flag_to_float_scalar({rhs})" + if load_mask: + code.writeline(f"if ({load_mask})") + stack.enter_context(code.indent()) + code.writeline(f"tmpbuf[{itervar_inner}] = {rhs};") + load_line = self._get_vec_load_line("tmpbuf.data()", 0, 
dtype) # type: ignore[arg-type] + code.writeline(f"return {load_line};") + code.writeline("()") + csevar = self.cse.generate(buffer, code) + assert isinstance(csevar, CppCSEVariable) + csevar.is_vec = True + return csevar + + def load(self, name: str, index: sympy.Expr): + opt_ctx: OptimizationContext = get_current_node_opt_ctx() + var = self.args.input(name) + index = self.rename_indexing(index) + dtype = V.graph.get_dtype(name) + tiling_var = self.itervars[self.tiling_idx] + stride = stride_at_vec_range(index, tiling_var, self.tiling_factor) + if stride == 0: + # load scalar and lazily broadcast it on demand + return super().load(name, index) + non_contiguous = stride != 1 or self.index_indirect_depends_on( + index, tiling_var + ) + if non_contiguous: + csevar = self.load_non_contiguous(var, index, dtype) + else: + line = self._get_vec_load_line(var, index, dtype, self._load_mask) + csevar = self.cse.generate(self.loads, line) # type: ignore[assignment] + assert isinstance(csevar, CppCSEVariable) + csevar.update_on_args("load", (name, index), {}) + csevar.is_vec = True + return csevar + + def _get_vec_store_line( + self, + value: Union[str, CppCSEVariable], + var: str, + index: sympy.Expr, + dtype: torch.dtype, + ): + """ + Get a store line str that stores `value` into `var` at `index` of `dtype`. + :param value: Vectorized type templaterized on `dtype`. + :param var: buffer to store into. + :index: index into the `var`. + """ + # when value's type is str (e.g., welford reduction), caller should make sure + # it is a vector + assert isinstance(value, str) or ( + isinstance(value, CppCSEVariable) and value.is_vec + ), value + tiling_var = self.itervars[self.tiling_idx] + assert index.has(tiling_var), f"index: {index}, tiling_var: {tiling_var}" + var_expr = f"{var} + {cexpr_index(index)}" + stride = stride_at_vec_range(index, tiling_var, self.tiling_factor) + non_contiguous = stride != 1 or self.index_indirect_depends_on( + index, tiling_var + ) + if non_contiguous: + var_expr = "tmpbuf" + if dtype == torch.float: + line = f"{value}.store({var_expr});" + else: + line = f"{value}.store({var_expr}, {self.tiling_factor});" + if non_contiguous: + inner = sympy_index_symbol(f"{tiling_var}_inner") + new_index = self.scale_index_with_offset( + index, itervar_idx=self.tiling_idx, offset=inner + ) + tmp_bufsize = ( + f"{self.tiling_factor}*sizeof(float)/sizeof({DTYPE_TO_CPP[dtype]})" + ) + line = ( + f"{{ __at_align__ {DTYPE_TO_CPP[dtype]} tmpbuf[{tmp_bufsize}]; {line} " + f"for (long {inner} = 0; {inner} < {self.tiling_factor}; {inner}++) " + f"{var}[{cexpr_index(new_index)}] = tmpbuf[{inner}]; }}" + ) + return line + + def store(self, name, index, value, mode=None): + assert "buf" in name + assert mode is None + assert isinstance(value, CppCSEVariable), value + if not value.is_vec: + # this happens when we store a scalar into a vectorized buffer like "fill" + value = self.broadcast(value) + opt_ctx: OptimizationContext = get_current_node_opt_ctx() + var = self.args.output(name) + self.cache_fp32_cse_var_before_lowp_store(value) + index = self.rename_indexing(index) + self.stores.writeline( + DeferredLine( + name, + self._get_vec_store_line(value, var, index, V.graph.get_dtype(name)), + ) + ) + + def reduction(self, dtype, src_dtype, reduction_type, value): + assert reduction_type in { + "max", + "min", + "sum", + "prod", + "xor_sum", + "welford_reduce", + "welford_combine", + } + assert dtype == src_dtype + assert dtype in [torch.float, torch.int64] + assert isinstance(value, CppCSEVariable), 
value + + if not value.is_vec: + value = self.broadcast(value) + + acc_type = reduction_acc_type(reduction_type, dtype) + acc_type_vec = self.reduction_acc_type_vec(reduction_type, dtype) + + if (reduction_type, acc_type) not in self.reduction_omp_dec: + if RTYPE_TO_CPP[reduction_type] not in NATIVE_OMP_RTYPES: + # Scalar reduction for other reductions are declared by default + self.reduction_prefix.splice( + f"""\ +#pragma omp declare reduction(\ +{RTYPE_TO_CPP[reduction_type]}:{acc_type}:\ +omp_out = {reduction_combine(reduction_type, "omp_out", "omp_in")}) \ +initializer(omp_priv={{{reduction_init(reduction_type, dtype)}}}) + """ + ) + self.reduction_omp_dec[reduction_type, acc_type] = RTYPE_TO_CPP[ + reduction_type + ] + + if (reduction_type, acc_type_vec) not in self.reduction_omp_dec: + self.reduction_prefix.splice( + f"""\ +#pragma omp declare reduction(\ +{RTYPE_TO_CPP[reduction_type]}:{acc_type_vec}:\ +omp_out = {self.reduction_combine_vec(reduction_type, "omp_out", "omp_in")}) \ +initializer(omp_priv={{{self.reduction_init_vec(reduction_type, dtype)}}}) + """ + ) + self.reduction_omp_dec[reduction_type, acc_type_vec] = RTYPE_TO_CPP[ + reduction_type + ] + + reduction_key = src_dtype, reduction_type, value + if reduction_key in self.reduction_cse.reduction_cache: + return self.reduction_cse.reduction_cache[reduction_key] + + acc = self.reduction_cse.generate( + self.loads, f"reduction {reduction_key}", write=False + ) + acc_vec = f"{acc}_vec" + + self.reduction_var_map[acc_vec] = reduction_type + self.reduction_prefix.writeline( + f"{acc_type} {acc} = {reduction_init(reduction_type, dtype)};" + ) + self.reduction_prefix.writeline( + f"{acc_type_vec} {acc_vec} = {self.reduction_init_vec(reduction_type, dtype)};" + ) + self.stores.writeline( + f"{acc_vec} = {self.reduction_combine_vec(reduction_type, acc_vec, value)};" + ) + + tmpvar: Union[str, CSEVariable] + if self.tiling_idx >= self.reduction_depth: + # Horizontal reduction + if is_welford_reduction(reduction_type): + assert ( + self._get_num_vectors(dtype) == 1 + ), "Welford reduction does not support VectorizedN (N>1)" + next_value = f"welford_vec_reduce_all({acc_vec})" + else: + reduce_all_body = ( + "{ return " + + self.reduction_combine_vec(reduction_type, "x", "y") + + "; }" + ) + vec = f"at::vec::Vectorized<{DTYPE_TO_CPP[dtype]}>" + vec_reduce_all_func = f"at::vec::vec_reduce_all<{DTYPE_TO_CPP[dtype]}>" + next_value = f"{vec_reduce_all_func}([]({vec}& x, {vec}& y) {reduce_all_body}, {acc_vec})" + + self.reduction_suffix.writeline( + f"{acc} = {reduction_combine(reduction_type, acc, next_value)};" + ) + tmpvar = acc + else: + tmpvar = acc_vec + + result = reduction_project(reduction_type, tmpvar) + self.reduction_cse.reduction_cache[reduction_key] = result + return result + + def store_reduction(self, name, index, value): + index = self.rename_indexing(index) + var = self.args.output(name) + out_dtype = V.graph.get_dtype(name) + # Only float reductions are vectorized currently + dtype = torch.float + if self.tiling_idx >= self.reduction_depth: + # Horizontal reduction + self.reduction_suffix.writeline( + DeferredLine( + name, + f"{var}[{cexpr_index(index)}] = static_cast<{DTYPE_TO_CPP[out_dtype]}>({value});", + ) + ) + else: + # Vertical reduction + store_lines = [] + if out_dtype != dtype: + if out_dtype in DTYPE_LOWP_FP and dtype == torch.float: + _lowp_fp_tmpvar_vec = f"{DTYPE_TO_CPP[out_dtype]}_{value}" + store_lines = [ + DeferredLine( + name, + f"auto {_lowp_fp_tmpvar_vec} = 
cvt_fp32_to_lowp_fp<{DTYPE_TO_CPP[out_dtype]}>({value});", + ) + ] + value = _lowp_fp_tmpvar_vec + else: + raise AssertionError( + f"Unsupported reduction type from {dtype} to {out_dtype}" + ) + store_lines += [ + DeferredLine( + name, + self._get_vec_store_line(value, var, index, out_dtype), + ) + ] + self.reduction_suffix.writelines(store_lines) + + def broadcast(self, scalar_var: CppCSEVariable) -> CppCSEVariable: + assert not scalar_var.is_vec + if scalar_var.dtype == torch.bool: + vec_var = self.cse.generate( + self.compute, f"to_float_mask({scalar_var.name})" + ) + else: + assert scalar_var.dtype is not None + vec_var = self.cse.generate( + self.compute, + f"{self._get_vec_type(scalar_var.dtype)}({scalar_var.name})", + ) + assert isinstance(vec_var, CppCSEVariable) + vec_var.dtype = scalar_var.dtype + vec_var.dependent_itervars = scalar_var.dependent_itervars + vec_var.is_vec = True + return vec_var + + def arange( + self, index: Union[sympy.Expr, CppCSEVariable], stride: sympy.Symbol + ) -> CppCSEVariable: + if isinstance(index, sympy.Expr): + index = cexpr(index) + else: + assert isinstance(index, CppCSEVariable) + assert not index.is_vec + csevar = self.cse.generate( + self.compute, + f"{self._get_vec_type(torch.int32)}::arange({index}, {stride})", + ) + assert isinstance(csevar, CppCSEVariable) + csevar.dtype = torch.int32 + csevar.is_vec = True + return csevar + + def reduction_init_vec(self, reduction_type, dtype): + scalar_type = DTYPE_TO_COMPUTATION_DTYPE[dtype] + vec_type = self._get_vec_type(scalar_type) + + if is_welford_reduction(reduction_type): + return f"Welford<{vec_type}>()" + + scalar_init = reduction_init(reduction_type, dtype) + return f"{vec_type}({scalar_init})" + + def reduction_acc_type_vec(self, reduction_type, dtype): + assert reduction_type not in {"argmin", "argmax"} + scalar_type = DTYPE_TO_COMPUTATION_DTYPE[dtype] + vec_type = self._get_vec_type(scalar_type) + if is_welford_reduction(reduction_type): + return f"Welford<{vec_type}>" + + return vec_type + + def reduction_combine_vec(self, reduction_type, var, next_value): + if reduction_type == "max": + return f"at::vec::maximum({var}, {next_value})" + elif reduction_type == "min": + return f"at::vec::minimum({var}, {next_value})" + elif reduction_type == "sum": + return f"{var} + {next_value}" + elif reduction_type == "prod": + return f"{var} * {next_value}" + elif reduction_type == "xor_sum": + return f"{var} ^ {next_value}" + elif reduction_type == "welford_reduce": + return f"welford_combine({var}, {next_value})" + elif reduction_type == "welford_combine": + if isinstance(next_value, tuple): + # When reading a value from Inductor IR we have a tuple of variable names + mean, m2, weight = next_value + else: + # When combining intermediate accumulators we have a Welford struct + mean, m2, weight = reduction_project(reduction_type, next_value) + return f"welford_combine({var}, {{{mean}, {m2}, {weight}}})" + else: + raise NotImplementedError() + + +class CppTile2DKernel(CppVecKernel): + """ + A vector kernel that handles the 2d tiles with the tile size defined in `tiling_factor` on + the inner-most loop level and one of the outer loop level (`outer_tiling_idx`). When the data + tile is accessed in a contiguous way from the outer loop axis, a transposition is applied on the + tile to make the access contiguous from the inner-most loop axis. Then, the same vectorization + logic from its parent `CppVecKernel` is leveraged for load/store/compute. 
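# The tile-and-transpose scheme described here can be modeled at tensor level
# (a minimal sketch, not the generated C++): preload a TILE x TILE block whose
# contiguous axis is the outer loop variable, transpose it so the inner loop
# walks contiguous memory, run the elementwise compute, then store through a
# transposed poststore.
import torch

TILE = 16
x = torch.arange(64 * 64, dtype=torch.float32).reshape(64, 64)
out = torch.empty(64, 64)

for i in range(0, 64, TILE):
    for j in range(0, 64, TILE):
        # preload: transpose the tile so the inner-most loop is contiguous
        tile = x[i : i + TILE, j : j + TILE].t().contiguous()
        tile = tile * 2.0  # stand-in for the vectorized inner-loop compute
        # poststore: write the tile back through the transposed layout
        out[j : j + TILE, i : i + TILE] = tile

assert torch.equal(out, (x * 2.0).t())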
The transposed tile load + and store are generated into kernel.preloads and kernel.poststores buffers. + + The loop structure looks like below: + for ... + for i_outer ... + for ... + for inner_most ... + // generated by CppTile2DKernel + float tmp0[16*16]; at::vec::transpose_mxn<...>(tmp0, in_ptr0 + ..., ...); // into kernel.preloads + float tmp1[16*16]; // into kernel.preloads + for i_inner ... { // the kernel inner loop + vectorized loads/compute/stores (e.g., load tmp0, store tmp1) // into kernel.loads/compute/stores + } + at::vec::transpose_mxn(out_ptr0 + ..., tmp1, ...) // into kernel.poststores + for inner_most ... (tail) + // generated by CppVecKernel + ... + for i_outer ... (tail) + for ... + for ... + // generated by CppKernel + ... + """ + + overrides = CppTile2DOverrides # type: ignore[assignment] + + def __init__(self, args, num_threads, tiling_factor, tiling_indices, tiling_dtype): + super().__init__( + args, num_threads, tiling_factor, tiling_indices[1], tiling_dtype + ) + self.tiling_indices = tiling_indices + + def inner_itervar(self): + return sympy_index_symbol(f"{self.itervars[self.outer_idx]}_inner") + + def need_vec_transpose(self, index): + outer_var = self.itervars[self.outer_idx] + inner_var = self.itervars[self.tiling_idx] + outer_stride = stride_at_vec_range(index, outer_var, self.tiling_factor) + inner_stride = stride_at_vec_range(index, inner_var, self.tiling_factor) + return ( + self._load_mask is None # TODO: support transposition with mask + and outer_stride == 1 + and index.has(inner_var) + and not inner_stride.has(inner_var) + and not inner_stride.has(outer_var) + ) + + def gen_transposed_tile_load_store(self, name, var, index, is_store): + # transposed tile load/store outside the kernel inner loop + dtype = V.graph.get_dtype(name) + factor = self.tiling_factor + src = f"{var} + {cexpr_index(index)}" + dst = "__place_holder__" + ld_src = f"{cexpr_index(stride_at_vec_range(index, self.itervars[self.tiling_idx], self.tiling_factor))}" + ld_dst = f"{factor}" + if is_store: + src, dst = dst, src + ld_src, ld_dst = ld_dst, ld_src + + need_define = True + load_or_store = f"at::vec::transpose_mxn<{DTYPE_TO_CPP[dtype]},{factor},{factor}>({src}, {ld_src}, {dst}, {ld_dst});" + if is_store: + tile_var = self.cse.newvar() + elif load_or_store not in self.cse.cache: + tile_var = self.cse.generate(self.preloads, load_or_store, write=False) + else: + need_define = False + tile_var = self.cse.cache[load_or_store] + + if need_define: + define_line = f"{DTYPE_TO_CPP[dtype]} {tile_var}[{factor}*{factor}] __attribute__ ((aligned ({factor})));" + self.preloads.writeline(define_line) + + load_or_store = load_or_store.replace("__place_holder__", str(tile_var)) + if is_store: + self.poststores.writeline(DeferredLine(name, load_or_store)) + else: + self.preloads.writeline(load_or_store) + + return tile_var + + def load(self, name: str, index: sympy.Expr): + opt_ctx: OptimizationContext = get_current_node_opt_ctx() + var = self.args.input(name) + index = self.rename_indexing(index) + + inner = self.inner_itervar() + if self.need_vec_transpose(index): + tile_var = self.gen_transposed_tile_load_store( + name, var, index, is_store=False + ) + # vector load inside the kernel inner loop + loadbuf = f"{tile_var} + {cexpr_index(inner * self.tiling_factor)}" + dtype = V.graph.get_dtype(name) + line = self._get_vec_load_line(loadbuf, 0, dtype) # type: ignore[arg-type] + csevar = self.cse.generate(self.loads, line) + csevar.update_on_args("load", (name, index), {}) + assert 
isinstance(csevar, CppCSEVariable) + csevar.is_vec = True + return csevar + else: + new_index = self.transform_indexing(index) + return super().load(name, new_index) + + def store(self, name, index, value, mode=None): + assert "buf" in name + opt_ctx: OptimizationContext = get_current_node_opt_ctx() + var = self.args.output(name) + + inner = self.inner_itervar() + index = self.rename_indexing(index) + assert mode is None + if self.need_vec_transpose(index): + tile_var = self.gen_transposed_tile_load_store( + name, var, index, is_store=True + ) + # vector store inside the kernel inner loop + storebuf = f"{tile_var} + {cexpr_index(inner * self.tiling_factor)}" + if V.graph.get_dtype(name) in DTYPE_LOWP_FP: + line = f"{value}.store({storebuf}, {self.tiling_factor});" + elif V.graph.get_dtype(name) in (torch.uint8, torch.int8): + line = f"{value}.store({storebuf}, {self.tiling_factor});" + else: + line = f"{value}.store({storebuf});" + self.stores.writeline(DeferredLine(name, line)) + else: + new_index = self.transform_indexing(index) + super().store(name, new_index, value, mode) + + def codegen_inner_loops(self, code): + inner = self.inner_itervar() + code.writeline( + f"for (long {inner} = 0; {inner} < {self.tiling_factor}; {inner}++)" + ) + + def set_ranges(self, group, reduction_group): + vars = super().set_ranges(group, reduction_group) + # do vertical reduction as the tail loop + self.outer_idx, self.tiling_idx = ( + self.tiling_indices + if self.tiling_indices[1] < self.reduction_depth + else reversed(self.tiling_indices) + ) + return vars + + def transform_indexing(self, index: sympy.Expr) -> sympy.Expr: + return self.scale_index_with_offset( + index, + itervar_idx=self.outer_idx, + offset=self.inner_itervar(), + ) + + +class CppVecKernelChecker(CppVecKernel): + def __init__(self, args, num_threads, tiling_factor, tiling_idx=-1): + super().__init__(args, num_threads, tiling_factor, tiling_idx) + + # Since this kernel is only for checker but does not generate any + # code, so we need to decrease the kernel count. + metrics.generated_kernel_count -= 1 + + # Used to record the graph wrapper code as the wrapper_code status could be + # changed during graph run. + self._orig_wrapper_code = None + + self.simd_vec = True + + self.fast_vec_list = [] + for k, v in CppVecOverrides.__dict__.items(): + if isinstance(v, staticmethod): + self.fast_vec_list.append(k) + self.exit_stack = contextlib.ExitStack() + + # Cache all the load result + self.load_supported_dtypes: List[torch.dtype] = [ + torch.float, + torch.bfloat16, + torch.float16, + torch.bool, + torch.uint8, + torch.int8, + torch.int32, + torch.int64, + ] + self.store_supported_dtypes: List[torch.dtype] = [ + torch.float, + torch.bfloat16, + torch.float16, + torch.uint8, + torch.int8, + torch.int32, + torch.int64, + ] + # Cache the dtypes of the store operation. 
If the store is mixing dtypes, the + # vectorization would not support it as it is hard to determine the vec dtype + self.store_dtypes: List[torch.dtype] = [] + # The dtype is used for vectorization + self.vec_dtype: torch.dtype = torch.float32 + + def disable_vec(self, msg=None): + if schedule_log.isEnabledFor(logging.DEBUG): + schedule_log.debug("Disabled vectorization: %s", msg) + self.simd_vec = False + + def is_mask(self, name: str, users: Dict[torch.fx.Node, None]): + load_type = V.graph.get_dtype(name) + if load_type == torch.bool: + return all(user.target in ("where", "masked") for user in users.keys()) + elif load_type in (torch.uint8, torch.int8): + """ + If the load value is torch.uint8/int8, then we only support the loaded + value is as the mask. + """ + if not all( + user.target == "to_dtype" and user.args[-1] == torch.bool + for user in users.keys() + ): + return False + + for to_dtype_node in users.keys(): + assert to_dtype_node.target == "to_dtype" + if not all( + user.target in ("where", "masked") + for user in to_dtype_node.users.keys() + ): + return False + return True + else: + return False + + def is_load_int8_as_float(self, name: str, users: Dict[torch.fx.Node, None]): + """ + Check: + 1. load_type is torch.uint8 or torch.int8 + 2. has 1 user node of target to_dtype + 3. dtype of to_dtype is torch.float + """ + load_type = V.graph.get_dtype(name) + if load_type not in (torch.uint8, torch.int8): + return False + if len(users) == 1: + user = next(iter(users)) + if (user.target == "to_dtype") and (user.args[-1] == torch.float): + return True + return False + return False + + def can_store_fp32_as_int8(self, store_var: str, value_node: torch.fx.Node): + """ + Check: + 1. store_type is torch.uint8/torch.int8 + 2. value_node is of target to_dtype + 3. 
dtype of to_dtype node is torch.uint8/torch.int8 + """ + store_type = V.graph.get_dtype(store_var) + if store_type not in (torch.uint8, torch.int8): + return False + if value_node.target == "to_dtype" and value_node.args[-1] in ( + torch.uint8, + torch.int8, + ): + return True + + return False + + def is_load_integer_scalar_tensor(self, name: str, index: sympy.Expr): + load_dtype = V.graph.get_dtype(name) + buffer = V.graph.get_buffer(name) + return ( + load_dtype in [torch.int32, torch.int64] + and isinstance(buffer, TensorBox) + and isinstance(buffer.data, StorageBox) + and (len(buffer.data.layout.size) == 0) + and (index == 0) + ) + + def load(self, name: str, index: sympy.Expr): + with RecordOptimizationContext(__name__) as node_ctx: + load_dtype = V.graph.get_dtype(name) + opt_ctx: OptimizationContext = node_ctx.get_opt_ctx() + assert opt_ctx + opt_ctx.dtype = load_dtype + opt_ctx.is_load_as_mask = self.is_mask(name, node_ctx.get_fx_node().users) + opt_ctx.is_load_int8_as_float = self.is_load_int8_as_float( + name, node_ctx.get_fx_node().users + ) + + var = self.cse.newvar() + + if len(self.itervars) == 0: + self.disable_vec("not a loop") + return var + + if load_dtype in (torch.bool, torch.uint8, torch.int8) and not ( + opt_ctx.is_load_as_mask or opt_ctx.is_load_int8_as_float + ): + if not opt_ctx.is_load_as_mask: + self.disable_vec(f"{load_dtype} not loaded as mask") + elif not opt_ctx.is_load_int8_as_float: + self.disable_vec(f"{load_dtype} not loaded as float") + return var + + if ( + (load_dtype not in self.load_supported_dtypes) + and not self.is_load_integer_scalar_tensor(name, index) + and index.has(self.itervars[self.tiling_idx]) + ): + self.disable_vec(f"{load_dtype} not supported by load") + return var + + return var + + def store(self, name, index, value, mode=None): + with RecordOptimizationContext(__name__) as node_ctx: + if len(self.itervars) == 0: + self.disable_vec("not a loop") + return self.simd_vec + + store_dtype = V.graph.get_dtype(name) + + opt_ctx: OptimizationContext = node_ctx.get_opt_ctx() + assert opt_ctx + opt_ctx.dtype = store_dtype + + store_dtype = torch.float if store_dtype == torch.float32 else store_dtype + self.store_dtypes.append(store_dtype) + if store_dtype not in self.store_supported_dtypes: + self.disable_vec(f"{store_dtype} not supported by store") + return self.simd_vec + + if store_dtype in (torch.uint8, torch.int8): + value_node = node_ctx.get_fx_node().all_input_nodes[-1] + if not self.can_store_fp32_as_int8(name, value_node): + self.disable_vec("not support store float32 as uint8/int8") + return self.simd_vec + + assert "buf" in name + index = self.rename_indexing(index) + + if mode: + self.disable_vec(f"store mode: {mode}") + return self.simd_vec + + if index.is_number: + self.disable_vec(f"constant store index: {index}") + return self.simd_vec + + def reduction(self, dtype, src_dtype, reduction_type, value): + if ( + (dtype == torch.float and src_dtype == torch.float) + or (dtype == torch.int64 and src_dtype == torch.int64) + and reduction_type in VECTORIZABLE_RTYPES + ): + pass + else: + self.disable_vec( + f"reduction: dtype {dtype}, src_dtype {src_dtype}, reduction_type {reduction_type}" + ) + if is_welford_reduction(reduction_type): + return tuple([self.simd_vec] * 3) + return self.simd_vec + + def store_reduction(self, name, index, value): + return self.simd_vec + + def is_supported_cmp(self, node: torch.fx.Node): + def get_node_dtype(node): + if type(node) == torch.fx.Node: + opt_ctx: OptimizationContext = 
get_current_node_opt_ctx() + return opt_ctx.dtype if opt_ctx else None + else: + return None + + def get_cmp_dtypes(node: torch.fx.Node): + return get_node_dtype(node.args[-2]), get_node_dtype(node.args[-1]) + + assert len(node.args) >= 2 + # cmp(x, y): y is a magic value like x >= 1 + if type(node.args[-1]) in [int, float]: + return True + # cmp(x, y): x is a magic value like 1 >= y + if type(node.args[-2]) in [int, float]: + return False + + left_dtype, right_dtype = get_cmp_dtypes(node) + if left_dtype is None or right_dtype is None: + # TODO(Eikan): To record, deduce and propagate the data type of every expression. + return True + else: + return left_dtype == right_dtype + + def __exit__(self, exc_type, exc_val, exc_tb): + assert self._orig_wrapper_code is not None + # Restore the wrapper_code + V.graph.wrapper_code = self._orig_wrapper_code + self.exit_stack.__exit__(exc_type, exc_val, exc_tb) + + def __enter__(self): + # Record the graph wrapper code. The wrapper_code status could be + # changed during graph run. Regarding this checker, we also need to + # run the graph but we don't expect to change any status that would + # impact the code generation. Hence, we record the graph wrapper code + # and replace it with a dummy wrapper_code and then restore to the + # original one as long as the checker is finished. + self._orig_wrapper_code = V.graph.wrapper_code + V.graph.wrapper_code = WrapperCodeGen() + + parent_handler = V.MockHandler() + + class VecCheckerProxy: + bin_cmp_ops = ["eq", "ne", "le", "ge", "lt", "gt"] + + @staticmethod + def _bin_cmp_op(x, y): + current_node: torch.fx.Node = V.interpreter.current_node + if not self.is_supported_cmp(current_node): + self.disable_vec(f"binary comparison op: {current_node}") + return self.simd_vec + + @staticmethod + def __getattr__(name): # type: ignore[misc] + def inner(*args, **kwargs): + if name in VecCheckerProxy.bin_cmp_ops: + return VecCheckerProxy._bin_cmp_op(args, kwargs) + + if name not in self.fast_vec_list: + self.disable_vec(f"op: {name}") + + parent_val = getattr(parent_handler, name)(*args, **kwargs) + return pytree.tree_map(lambda _: self.simd_vec, parent_val) + + return inner + + @staticmethod + def load(name: str, index: sympy.Expr): + return self.load(name, index) + + @staticmethod + def store(name, index, value, mode=None): + return self.store(name, index, value, mode=mode) + + @staticmethod + def reduction(dtype, src_dtype, reduction_type, value): + return self.reduction(dtype, src_dtype, reduction_type, value) + + @staticmethod + def store_reduction(name, index, value): + return self.store_reduction(name, index, value) + + @staticmethod + def constant(val, dtype): + with RecordOptimizationContext(__name__) as node_ctx: + opt_ctx: OptimizationContext = node_ctx.get_opt_ctx() + assert opt_ctx + # VecKernel override dtype for constant + # Vectorization only support int32/fp32 now + # So if dtype = int64/fp64, we will cast it to int32/fp32 if possible + i32_iinfo = torch.iinfo(torch.int32) + if ( + dtype == torch.int64 + and val <= i32_iinfo.max + and val >= i32_iinfo.min + ): + opt_ctx.dtype = torch.int32 + + f32_iinfo = torch.finfo(torch.float32) + if dtype == torch.double: + if ( + (val <= f32_iinfo.max and val >= f32_iinfo.min) + or (val == torch.inf) + or (val == -torch.inf) + ): + opt_ctx.dtype = torch.float32 + + supported_dtypes = [ + torch.float32, + torch.int32, + torch.int64, + torch.bfloat16, + torch.float16, + torch.bool, + ] + + if opt_ctx.dtype not in supported_dtypes or ( + opt_ctx.dtype == torch.int32 + 
and not all( + user.target in VecCheckerProxy.bin_cmp_ops + for user in node_ctx.current_node.users + ) + ): + self.disable_vec(f"constant dtype: {opt_ctx.dtype}") + return val + + @staticmethod + def index_expr(expr, dtype): + assert len(self.ranges) == len(self.itervars) + if not len(self.ranges) or not all( + not isinstance(range, sympy.Expr) or sympy.simplify(range).is_number + for range in self.ranges + ): + # if the range value is sympy.Expr, we might could not deduce the accurate loop interval. + self.disable_vec(f"index_expr: {expr}, dtype {dtype}") + return self.cse.newvar() + + def can_use_int32(): + free_symbols = list(expr.free_symbols) + sizes = { + k: v + for k, v in zip(self.itervars, self.ranges) + if k in free_symbols + } + # Trivial case: Range empty + if any(v == 0 for v in sizes.values()): + return True + + vars_ranges = {k: ValueRanges(0, v - 1) for k, v in sizes.items()} + if not vars_ranges or len(vars_ranges) != len(free_symbols): + i32_iinfo = torch.iinfo(torch.int32) + return ( + expr.is_number + and expr <= i32_iinfo.max + and expr >= i32_iinfo.min + ) + expr_ranges = bound_sympy(expr, vars_ranges) + if math.isinf(expr_ranges.lower) or math.isinf(expr_ranges.upper): # type: ignore[arg-type] + return False + # If something takes the values 0..7, we will compare in the loop + # x < 8. As such, for the loop not to overflow in the last iteration, we want + # to check that expr_ranges.upper + 1 is representable as well + return range_expressable_in_32_bits( + ValueRanges( + int(expr_ranges.lower), int(expr_ranges.upper) + 1 # type: ignore[arg-type] + ) + ) + + with RecordOptimizationContext(__name__) as node_ctx: + assert len(self.ranges) == len(self.itervars) + opt_ctx: OptimizationContext = node_ctx.get_opt_ctx() + assert opt_ctx + if ( + dtype == torch.int64 + and can_use_int32() + and all( + user.target in VecCheckerProxy.bin_cmp_ops + for user in node_ctx.current_node.users + ) + ): + opt_ctx.dtype = torch.int32 + else: + opt_ctx.dtype = dtype + self.disable_vec(f"index_expr: {expr}, dtype {dtype}") + + tmp_var = self.cse.newvar() + return tmp_var + + @staticmethod + def indirect_indexing(index_var, size, check=True): + return sympy_index_symbol(str(index_var)) + + @staticmethod + def masked(mask, body, other): + body() + return self.cse.newvar() + + @staticmethod + def to_dtype(x, dtype, src_dtype=None): + with RecordOptimizationContext(__name__) as node_ctx: + opt_ctx: OptimizationContext = node_ctx.get_opt_ctx() + assert opt_ctx + opt_ctx.dtype = dtype + + cur_node = node_ctx.get_fx_node() + input_value: torch.fx.Node = cur_node.all_input_nodes[1] + if dtype == torch.float: + if input_value.target in [ + "load", + ]: + # Support masked_load for BF16/FP16. Because the legalization will + # insert to_dtype to convert the BF16/FP16 input to FP32. 
+ dtype = ( + V.graph.get_dtype(input_value.args[1]) # type: ignore[arg-type] + if input_value.target == "load" + else input_value.args[-1] + ) + if dtype in [ + torch.float16, + torch.bfloat16, + torch.float, + torch.float64, + torch.uint8, + torch.int8, + torch.int32, + torch.int64, + ]: + # Convert from dtype to torch.float + pass + else: + self.disable_vec(f"to_dtype: dtype {dtype}") + elif dtype in DTYPE_LOWP_FP: + if not all(usr.target == "store" for usr in cur_node.users): + self.disable_vec( + "to_dtype: bfloat16/float16 expecting users are all stores" + ) + return x + + store_names = [usr.args[1] for usr in cur_node.users] + if not all( + V.graph.get_dtype(name) in [dtype] for name in store_names + ): + self.disable_vec( + "to_dtype: expecting all stores into bfloat16 or float16" + ) + return x + elif dtype == torch.bool: + pass + elif dtype in (torch.uint8, torch.int8): + # Only allow below 2 cases: + # Case 1: to_int8 and store which corresponding to the single quant node + # at last of fusion pattern. + is_to_int8_and_store = all( + usr.target in ["store"] for usr in cur_node.users + ) + # Case 2: to_int8 and to_float which corresponding to pair of quant/dequant node + # at middle of fusion pattern. + is_to_int8_and_to_float = all( + ( + usr.target in ["to_dtype"] + and usr.args[2] == torch.float32 + ) + for usr in cur_node.users + ) + if not (is_to_int8_and_store or is_to_int8_and_to_float): + self.disable_vec(f"to_dtype: dtype {dtype}") + elif dtype in [torch.int64, torch.int32]: + pass + else: + self.disable_vec(f"to_dtype: dtype {dtype}") + return x + + self.exit_stack.enter_context(V.set_ops_handler(VecCheckerProxy())) + self.exit_stack.enter_context(V.set_kernel_handler(self)) + return self + + +class CppKernelProxy(CppKernel): + def __init__(self, kernel_group): + super().__init__(kernel_group.args, kernel_group.ws.num_threads) + self.kernel_group = kernel_group + self.loop_nest = None + self.call_ranges = None + self.picked_vec_isa: codecache.VecISA = codecache.pick_vec_isa() + + def data_type_propagation(self, nodes): + for _node in nodes: + assert isinstance(_node, SchedulerNode) + DataTypePropagation.propagate_scheduler_node(_node) + + # Check if all the nodes of a given fx graph can support BF16/FP16 + def is_lowp_fp_scheduler(self, scheduler_node: SchedulerNode): + if not isinstance(scheduler_node._body, ir.LoopBody): + return True + + _lowp_fp_type: Optional[torch.dtype] = None + + # Propagate the dtype to check if all the fx node is bf16/fp16 + DataTypePropagation.propagate_scheduler_node(scheduler_node) + + sub_blocks = [scheduler_node._body.root_block] + list( + scheduler_node._body.subblocks.values() + ) + for sub_block in sub_blocks: + for _node in sub_block.graph.nodes: + # TODO(Eikan): Regarding get_index and index_expr, we should conclude the + # the data type as well. 
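The constant() and index_expr() handlers above only keep an int64 value in 64 bits when they must: if the value range of the expression over the iteration space, including the `upper + 1` used by the loop-bound comparison, fits in 32 bits, the dtype is narrowed to int32. Below is a minimal sketch of that range test, assuming affine index expressions with non-negative coefficients (the real check bounds arbitrary expressions with bound_sympy over ValueRanges); the helper names are illustrative only.

```python
import sympy

# Toy stand-in for the int32-narrowing test above. We assume an affine
# expression with non-negative coefficients, so its bounds are attained at the
# endpoints of each iteration range.
INT32_MIN, INT32_MAX = -(2**31), 2**31 - 1

def affine_bounds(expr, sizes):
    lo = int(expr.subs({v: 0 for v in sizes}))
    hi = int(expr.subs({v: s - 1 for v, s in sizes.items()}))
    return lo, hi

def expressable_in_32_bits(expr, sizes):
    lo, hi = affine_bounds(expr, sizes)
    # hi + 1 mirrors the "x < upper + 1" loop-bound comparison noted above
    return INT32_MIN <= lo and hi + 1 <= INT32_MAX

d0, d1 = sympy.symbols("d0 d1", integer=True, nonnegative=True)
print(expressable_in_32_bits(1024 * d0 + d1, {d0: 4096, d1: 1024}))     # True
print(expressable_in_32_bits(2**22 * d0 + d1, {d0: 2**10, d1: 2**22}))  # False
```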
+ if _node.op == "placeholder" or _node.target in ( + "get_index", + "index_expr", + ): + continue + + # Fast path if all operations can support bf16/fp16 without converting to fp32 + if _node.target not in [ + "load", + "store", + "abs", + "neg", + "output", + ]: + return False + + if hasattr(_node, "meta") and _node.meta: + assert OptimizationContext.key in _node.meta + opt_ctx: OptimizationContext = _node.meta[OptimizationContext.key] + if not opt_ctx.dtype or opt_ctx.dtype not in DTYPE_LOWP_FP: + return False + if _lowp_fp_type: + assert ( + _lowp_fp_type == opt_ctx.dtype + ), "scheduler node do not support bf16/fp16 mix" + else: + _lowp_fp_type = opt_ctx.dtype + else: + return False + + scheduler_node._lowp_fp_type = _lowp_fp_type # type: ignore[attr-defined] + return True + + def legalize_lowp_fp_dtype(self, nodes): + def add_to_dtype(sub_graph: torch.fx.Graph): + def is_lowp_fp_load(node: torch.fx.Node): + if node.target not in ["load"]: + return False + assert len(node.args) == 3 + load_dtype = V.graph.get_dtype(node.args[1]) # type: ignore[arg-type] + return load_dtype in DTYPE_LOWP_FP + + def is_lowp_fp_store(node: torch.fx.Node): + if node.target != "store": + return False + _, store_var, _, _, _ = node.args + store_dtype = V.graph.get_dtype(store_var) # type: ignore[arg-type] + return store_dtype in DTYPE_LOWP_FP + + sub_graph_nodes = list(sub_graph.nodes) + to_lowp_fp_legalized_nodes = [] + for _node in sub_graph_nodes: + if is_lowp_fp_load(_node): + # No need to promote to float if all users are direct stores + if all(user.target == "store" for user in _node.users): + continue + ops = _node.args[0] + with sub_graph.inserting_after(_node): + to_type_node = sub_graph.call_method( + "to_dtype", args=(ops, _node, torch.float) + ) + to_type_node_args = to_type_node.args + _node.replace_all_uses_with(to_type_node) + to_type_node.args = to_type_node_args + metrics.cpp_to_dtype_count += 1 + elif is_lowp_fp_store(_node): + ops, name, _, value_var, _ = _node.args + # No need to promote to float if it is a user of a load which are all directly stored + if value_var.target == "load" and all( + user.target == "store" for user in value_var.users + ): + continue + dtype = V.graph.get_dtype(name) + with sub_graph.inserting_before(_node): + to_type_node = sub_graph.call_method( + "to_dtype", args=(ops, value_var, dtype) + ) + _node.replace_input_with(value_var, to_type_node) + metrics.cpp_to_dtype_count += 1 + elif _node.target == "reduction": + ( + ops, + dtype, + src_dtype, + reduction_type, + value, + ) = _node.args + if src_dtype in DTYPE_LOWP_FP: + # Since we always convert the load/store value to float if the tensor is bfloat16/float16. + # Therefore, the reduction should never work with bfloat16/float16 value. Hence, we update + # the bfloat16/float16 reduction by + # 1) updating the src_dtype to float + # and 2) updating the dtype to float if it is bfloat16/float16. + assert dtype in [ + torch.float, + torch.bfloat16, + torch.float16, + torch.int64, + ] + _node.args = ( + ops, + torch.float if dtype in DTYPE_LOWP_FP else dtype, + torch.float, + reduction_type, + value, + ) + elif _node.target == "to_dtype" and _node.args[-1] in DTYPE_LOWP_FP: + (ops, x, _) = _node.args + # The legalization always loads the BF16/FP16 tensor as FP32 for computation + # and converts back to BF16/FP16 after the computation. + # Hence, there should be no computation w/ BF16/FP16. + # Therefore, we update the to_dtype by replacing the bf16/fp16 dtype with fp32. 
+ # Save the legalized to_dtype node for the elimination(eliminate_to_dtype step): + # 1) Eliminate the redundant to_dtype node if we have a pattern as follows: + # graph(): + # %lowp_fp_legalized = call_method[target=to_dtype](args = (%ops, %input, torch.float)) + # %to_dtype2 = call_method[target=to_dtype](args = (%ops, %lowp_fp_legalized, torch.bfloat16/float16)) + # Regarding the first to_dtype, it is redundant because + # the second to_type also converts to the torch.bfloat16/torch.float16. + # Hence, we remove the first to_type. + to_lowp_fp_legalized_nodes.append(_node) + _node.args = (ops, x, torch.float) + else: + pass + + def eliminate_to_dtype(sub_graph: torch.fx.Graph): + def _eliminate_duplicate_to_node(sub_graph: torch.fx.Graph): + # Eliminate the redundant to_dtype node. Let's consider a pattern as follows: + # graph(): + # %to_dtype1 = call_method[target=to_dtype](args = (%ops, %input, torch.float), kwargs = {}) + # %to_dtype2 = call_method[target=to_dtype](args = (%ops, %to_dtype1, torch.float), kwargs = {}) + # Regarding the first to_dtype, it is redundant because the second to_type also converts to the + # torch.float. Hence, we remove the first to_type + def _used_by_to(to_node: torch.fx.Node): + return all(usr.target == "to_dtype" for usr in to_node.users) + + all_to_nodes = [ + node for node in sub_graph.nodes if node.target == "to_dtype" + ] + all_to_nodes_and_users = [ + {node: node.users} for node in all_to_nodes if _used_by_to(node) + ] + for node_users in all_to_nodes_and_users: + for node, users in node_users.items(): + if node in sub_graph.nodes and ( + all(usr.args[-1] == node.args[-1] for usr in users) + or ( + node in to_lowp_fp_legalized_nodes + and all( + usr.args[-1] in DTYPE_LOWP_FP for usr in users + ) + ) + ): + val_node = node.all_input_nodes[-1] + node.replace_all_uses_with(val_node) + sub_graph.erase_node(node) + + # For debug mode, the graph of LoopBody will attach a new GraphModule as + # owning_module for debugging while the release mode will not. The lint will + # check whether the graph has owning_module to decide if it needs to check + # call_module. LoopBody might contain get_index as a module call. But it + # is just a function. Hence, it cannot pass the lint check for debug mode. + # We bypass the check if the owning_module is None. Eventually, we should call + # get_index via call_function but not call_module. 
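eliminate_to_dtype above drops a to_dtype node whose consumers all convert to the same dtype again, so only the last conversion survives. The snippet below is not the inductor pass (which rewrites call_method("to_dtype") nodes inside ir.LoopBody sub-graphs); it is a minimal torch.fx illustration of the same peephole applied to ordinary Tensor.to calls, with made-up variable names.

```python
import torch
import torch.fx as fx

def f(x):
    # two back-to-back conversions to the same dtype: the first one is redundant
    return x.to(torch.float32).to(torch.float32) + 1

gm = fx.symbolic_trace(f)
for node in list(gm.graph.nodes):
    is_to = node.op == "call_method" and node.target == "to"
    if is_to and node.users and all(
        u.op == "call_method" and u.target == "to" and u.args[1:] == node.args[1:]
        for u in node.users
    ):
        # every consumer re-converts to the same dtype, so this node is dead
        node.replace_all_uses_with(node.args[0])
        gm.graph.erase_node(node)

gm.graph.lint()
gm.recompile()
print(gm.code)  # the surviving graph keeps a single .to(torch.float32)
```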
+ if sub_graph.owning_module is None: + sub_graph.lint() + + _eliminate_duplicate_to_node(sub_graph) + + eliminate_to_dtype(sub_graph) + + def _legalize_lowp_fp(loop_body: ir.LoopBody): + sub_blocks = [loop_body.root_block] + list(loop_body.subblocks.values()) + for sub_block in sub_blocks: + add_to_dtype(sub_block.graph) + + if all( + isinstance(_node, SchedulerNode) and self.is_lowp_fp_scheduler(_node) + for _node in nodes + ): + # Mark the load node to load bf16/fp16 + for _node in nodes: + sub_blocks = [_node._body.root_block] + list( + _node._body.subblocks.values() + ) + for sub_block in sub_blocks: + for fx_node in sub_block.graph.nodes: + if fx_node.target in ["load", "store"]: + assert fx_node.meta + assert OptimizationContext.key in fx_node.meta + opt_ctx: OptimizationContext = fx_node.meta[ + OptimizationContext.key + ] + assert opt_ctx.dtype in DTYPE_LOWP_FP + + # Bypass the legalization as the kernel can run with bf16/fp16 directly + return + + for _node in nodes: + assert isinstance(_node, SchedulerNode) + assert isinstance(_node._body, ir.LoopBody) + node: SchedulerNode = _node + + def is_memory_copy_scheduler_node(node: SchedulerNode): + op_counts = node.read_writes.op_counts + return ( + len(op_counts) == 2 and "load" in op_counts and "store" in op_counts + ) + + should_legalize = not is_memory_copy_scheduler_node(node) + if should_legalize: + body: ir.LoopBody = node._body + _legalize_lowp_fp(body) + + def codegen_nodes(self, nodes: List[SchedulerNode]): + # Legalize BF16 node by adding to_dtype explicitly + self.legalize_lowp_fp_dtype(nodes) + self.data_type_propagation(nodes) + + assert len(nodes) >= 1 + first_node = nodes[0] + vec_dtype = ( + first_node._lowp_fp_type # type: ignore[attr-defined] + if all( + hasattr(_node, "_lowp_fp_type") + and _node._lowp_fp_type == first_node._lowp_fp_type # type: ignore[attr-defined] + for _node in nodes + ) + else torch.float + ) + + kernel_group = self.kernel_group + _, (group, reduction_group) = max( + nodes, key=lambda x: int(x.is_reduction()) + ).group + + self.set_ranges(group, reduction_group) + + def codegen_kernel(cls, *args): + with kernel_group.new_kernel(cls, *args) as kernel: + # Ugly hack to maintain the metrics kernel count since + # we only count in CppKernelProxy, not those contained in it + metrics.generated_kernel_count -= 1 + + run(kernel) + return kernel + + def run(kernel): + vars, reduction_vars = kernel.set_ranges(group, reduction_group) + in_suffix = False + for node in nodes: + if node.group[1] in [ + (group, reduction_group), + (group + reduction_group, ()), + ]: + assert not in_suffix + node.run(vars, reduction_vars) + else: + in_suffix = True + assert node.group[1] == ( + group, + (), + ), f"unexpected group: {node.group[1]} != {group}, {reduction_group}" + # we can fuse in some extra pointwise into the suffix + with kernel.write_to_suffix(): + node.run(vars, ()) + + scalar_kernel = codegen_kernel(CppKernel) + V.graph.removed_buffers |= scalar_kernel.removed_buffers + V.graph.inplaced_to_remove |= scalar_kernel.inplaced_to_remove + self.loop_nest = LoopNestWithSplit.build(scalar_kernel) + + if not self.picked_vec_isa: + return + + def select_tiling_indices(tiling_factor): + all_index = [] + for node in nodes: + rw = dependencies.extract_read_writes(node._body, *node._sizes) + all_index += [dep.index for dep in itertools.chain(rw.reads, rw.writes)] + contig_vars = set() + contig_vars_list = [] + non_contig_stride_const = set() + non_contig_stride_other = set() + for index in all_index: + for var in 
index.free_symbols: + if not re.search(r"^d\d+$", var.name): + continue + stride = stride_at_vec_range(index, var, tiling_factor) + if stride == 0: + continue + elif stride == 1: + contig_vars.add(int(var.name[1:])) + contig_vars_list.append(int(var.name[1:])) + elif all(s.name.startswith("s") for s in stride.free_symbols): + non_contig_stride_const.add(int(var.name[1:])) + else: + non_contig_stride_other.add(int(var.name[1:])) + contig_only = ( + contig_vars - non_contig_stride_const - non_contig_stride_other + ) + if len(contig_vars) == 0: + # no contiguous vars + return [len(self.itervars) - 1] + if contig_only: + return sorted(contig_only)[-1:] + contig_and_const_stride = ( + contig_vars & non_contig_stride_const + ) - non_contig_stride_other + contig_vars_sorted = sorted(contig_vars) + if ( + len(contig_vars_sorted) == 2 + and contig_vars_sorted[-1] in contig_and_const_stride + and contig_vars_sorted[-1] == len(self.itervars) - 1 + ): + return contig_vars_sorted + return sorted(contig_vars_sorted, key=contig_vars_list.count)[-1:] + + def select_tiling(dtype: torch.dtype = torch.float): + # TODO(jgong5): support alternative tiling factors and data types + tiling_factor = self.picked_vec_isa.nelements(dtype=dtype) + tiling_indices = select_tiling_indices(tiling_factor) + if tiling_indices: + could_vec = True + for tiling_indice in tiling_indices: + with CppVecKernelChecker( + deepcopy(self.kernel_group.args), + parallel_num_threads(), + tiling_factor, + tiling_indice, + ) as vec_checker: + run(vec_checker) + could_vec = could_vec and vec_checker.simd_vec + if not could_vec: + break + if could_vec: + if len(tiling_indices) == 1: + return [tiling_factor], tiling_indices + if len(tiling_indices) == 2: + return [tiling_factor, tiling_factor], tiling_indices + return [], [] + + # Kernels share the same global contexts like V.graph.wrapper_code, V.kernel.args. + # But the generated scalar kernel has updated these global contexts. Hence, the other kernels + # should not do this again to avoid context conflict. By now, we only control the + # config.inplace_buffers. In the future, we could maintain more contexts. + with torch._inductor.config.patch(inplace_buffers=False): + tiling_factors, tiling_indices = select_tiling(vec_dtype) + assert len(tiling_factors) == len(tiling_indices) + try: + if len(tiling_indices) == 1: + vec_kernel = codegen_kernel( + CppVecKernel, tiling_factors[0], tiling_indices[0], vec_dtype + ) + metrics.generated_cpp_vec_kernel_count += 1 + main_loop, tail_loop = self.loop_nest.split_with_tiling( + tiling_indices[0], factor=tiling_factors[0] + ) + main_loop.set_kernel(vec_kernel) + tail_loop.set_kernel(scalar_kernel) + main_loop.simd_vec = True + tail_loop.simd_omp = True + # We chop the loop into two cubes by the nelements - main loop and tail loop. + # Regarding the main loop, it is straightforward that it could be vectorized with + # nelements. But for the tail loop, it still could be vectorized. For example, + # if the nelements is 8(256bits), then the tail loop still could be vectorized + # as 4(128bits). 
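select_tiling above validates each candidate index with CppVecKernelChecker, and split_with_tiling (defined on LoopLevel/LoopNestWithSplit further below) then chops the loop at the largest multiple of the tiling factor: the main loop steps by the full vector width, while the tail loop walks the remainder and, as the comment notes, can still use half-width vectors. A small arithmetic sketch, with plain range objects standing in for LoopLevels:

```python
# Illustrative only: mirrors the offset/steps arithmetic of split_with_tiling(),
# not the LoopLevel data structure itself.
def split_with_tiling(size: int, factor: int):
    main_end = (size // factor) * factor   # FloorDiv(size, factor) * factor
    main = range(0, main_end, factor)      # vectorized: `factor` lanes per iteration
    tail = range(main_end, size)           # remainder, scalar or half-width vector
    return main, tail

main, tail = split_with_tiling(size=70, factor=16)
print(list(main))  # [0, 16, 32, 48] -> four full 16-lane iterations
print(list(tail))  # [64, 65, 66, 67, 68, 69] -> six leftover elements
```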
+ tail_loop.simd_nelements = tiling_factors[0] // 2 + elif len(tiling_indices) == 2: + assert ( + tiling_indices[1] == len(self.itervars) - 1 + and tiling_factors[0] == tiling_factors[1] + ) + tile2d_kernel = codegen_kernel( + CppTile2DKernel, tiling_factors[0], tiling_indices, vec_dtype + ) + vec_kernel = codegen_kernel( + CppVecKernel, tiling_factors[0], tiling_indices[0], vec_dtype + ) + metrics.generated_cpp_vec_kernel_count += 2 + outer_main_loop, outer_tail_loop = self.loop_nest.split_with_tiling( + tiling_indices[0], factor=tiling_factors[0] + ) + outer_tail_loop.set_kernel(scalar_kernel) + ( + inner_main_loop, + inner_tail_loop, + ) = outer_main_loop.split_with_tiling( + tiling_indices[1] - tiling_indices[0], factor=tiling_factors[0] + ) + inner_main_loop.set_kernel(tile2d_kernel) + inner_tail_loop.set_kernel(vec_kernel) + except CppVecUnsupportedError as e: + if schedule_log.isEnabledFor(logging.DEBUG): + schedule_log.debug("Disabled vectorization: %s", e) + + def codegen_loops(self, code, worksharing): + self.codegen_loops_impl(self.loop_nest, code, worksharing) + + +class ReasonFusedNodes(Enum): + SAME_VARS_REDUCE = "same_vars_reduce" + COMPATIBLE_REDUCTION = "compatible_reduction" + COMPATIBLE_RANGES_NO_REDUCTION = "compatible_ranges_no_reduction" + + +class CppScheduling(BaseScheduling): + # ctypes limits the number of args to 1024, refer to: + # https://github.com/python/cpython/commit/a285af7e626d1b81cf09f8b2bf7656f100bc1237 + # We set a conservative threshold here. + MAX_FUSED_KERNEL_ARGS_NUM = 500 + + def __init__(self, scheduler): + self.scheduler = scheduler + self.get_kernel_group() + self._ready_to_flush = False + + def _set_flush_status(self, status: bool): + self._ready_to_flush = status + + def group_fn(self, sizes): + return tuple(tuple(map(V.graph.sizevars.simplify, s)) for s in sizes) + + def get_kernel_group(self): + from .cpp_wrapper_cpu import CppWrapperCpu + + self.kernel_group: Union[CppWrapperKernelGroup, KernelGroup] + if isinstance(V.graph.wrapper_code, CppWrapperCpu): + self.kernel_group = CppWrapperKernelGroup() + else: + self.kernel_group = KernelGroup() + + def fuse(self, node1, node2): + if node1.is_foreach() or node2.is_foreach(): + return ForeachKernelSchedulerNode.fuse(node1, node2) + else: + if ( + self._why_fuse_nodes(node1, node2) + == ReasonFusedNodes.COMPATIBLE_RANGES_NO_REDUCTION + ): + assert isinstance(node1, (SchedulerNode, FusedSchedulerNode)) + assert isinstance(node2, (SchedulerNode, FusedSchedulerNode)) + + _, (vars1, reduce1) = node1.group + _, (vars2, reduce2) = node2.group + assert reduce1 == () and reduce2 == (), (reduce1, reduce2) + + def get_indexing_ranges_exprs(node): + if isinstance(node, FusedSchedulerNode): + assert len(node.snodes) > 0 + return get_indexing_ranges_exprs(node.snodes[0]) + else: + assert isinstance(node, SchedulerNode) + comp_buffer = node.node + assert isinstance(comp_buffer, ir.ComputedBuffer) + _, body, _ = comp_buffer.get_default_sizes_body() + return body.var_ranges, list(body.indexing_exprs.values()) + + node_to_recomp = node1 if len(vars1) < len(vars2) else node2 + assert isinstance(node_to_recomp, SchedulerNode) + + ref_node = node2 if len(vars1) < len(vars2) else node1 + + extra_indexing_constraints = get_indexing_ranges_exprs(ref_node) + + node_to_recomp.recompute_size_and_body( + extra_indexing_constraints=extra_indexing_constraints + ) + + _, (vars1, _) = node1.group + _, (vars2, _) = node2.group + assert vars1 == vars2, (vars1, vars2) + + return FusedSchedulerNode.fuse(node1, node2) + + def 
_why_fuse_nodes(self, node1, node2) -> Optional[ReasonFusedNodes]: + _, (vars1, reduce1) = node1.group + _, (vars2, reduce2) = node2.group + + if vars1 == vars2 and reduce1 == reduce2: + return ReasonFusedNodes.SAME_VARS_REDUCE + if reduce1 == () and vars1 == vars2 + reduce2: + return ReasonFusedNodes.COMPATIBLE_REDUCTION + if self._can_fuse_nodes_with_compatible_ranges(node1, node2): + return ReasonFusedNodes.COMPATIBLE_RANGES_NO_REDUCTION + # TODO(jansel): allow fusion pointwise (vars1, ()) suffix? + return None + + def _can_fuse_nodes_with_compatible_ranges(self, node1, node2): + # Here we try to fuse SchedulerNode/FusedSchedulerNode with compatible ranges + # e.g. (s0, s1, s2) and (s0 * s1 * s2) + _, (vars1, reduce1) = node1.group + _, (vars2, reduce2) = node2.group + + c1 = reduce1 == () and reduce2 == () + c2 = math.prod(vars1) == math.prod(vars2) + c3 = len(vars1) == 1 or len(vars2) == 1 + if not (c1 and c2 and c3): + return False + + node_to_recomp = node1 if len(vars1) < len(vars2) else node2 + ref_node = node2 if len(vars1) < len(vars2) else node1 + + # We can not recompute sizes and body for nodes other than SchedulerNode + # TODO: we can extend fusion support with compatible ranges for FusedSchedulerNode + if isinstance(node_to_recomp, FusedSchedulerNode): + return False + + def get_buffer(node): + if isinstance(node, FusedSchedulerNode): + assert len(node.snodes) > 0 + # use the last scheduler node from the list as it has the most + # relevant indexing expressions + return get_buffer(node.snodes[-1]) + else: + assert isinstance(node, SchedulerNode) + return node.node + + ref_node_buffer = get_buffer(ref_node) + if isinstance(ref_node_buffer, ir.TemplateBuffer): + return False + + assert isinstance(ref_node_buffer, ir.ComputedBuffer) + + # It may happen that node1 and node2 compatible number of elements + # but different original ranges, for example: + # {d0: s0, d1: s1, d2: s2} vs {d0: s0*s1*s2} + # See https://github.com/pytorch/pytorch/pull/120077/files#r1500427848 for more details + # TODO: we can fix if it allows us to CSE at least one of the variables + var_ranges1 = ref_node_buffer.get_read_writes().var_ranges + var_ranges2 = node_to_recomp.node.get_read_writes().var_ranges + if var_ranges1 != var_ranges2: + return False + + return True + + def _can_fuse_horizontal_impl(self, node1, node2): + assert isinstance(node1, (FusedSchedulerNode, SchedulerNode)) + assert isinstance(node2, (FusedSchedulerNode, SchedulerNode)) + return self._why_fuse_nodes(node1, node2) is not None + + def can_fuse_horizontal(self, node1, node2): + if ( + len(node1.get_nodes()) + len(node2.get_nodes()) + > config.cpp.max_horizontal_fusion_size + ): + return False + + return self._can_fuse_horizontal_impl(node1, node2) + + def can_fuse_vertical(self, node1, node2): + return self._can_fuse_horizontal_impl(node1, node2) and not node1.is_reduction() + + def codegen_nodes(self, nodes: List[SchedulerNode]): + """ + Turn an set of pre-fused nodes into a C++ kernel. 
+ """ + kernel_group = self.kernel_group + + cpp_kernel_proxy = CppKernelProxy(kernel_group) + cpp_kernel_proxy.codegen_nodes(nodes) + + kernel_group.finalize_kernel(cpp_kernel_proxy, nodes) + + args_num = self._get_scheduled_num_args() + if args_num > CppScheduling.MAX_FUSED_KERNEL_ARGS_NUM: + self._set_flush_status(True) + + def _get_scheduled_num_args(self): + return self.kernel_group.get_num_args() + + def ready_to_flush(self): + return self._ready_to_flush + + def codegen_sync(self): + pass + + def flush(self): + self.kernel_group.codegen_define_and_call(V.graph.wrapper_code) + self.get_kernel_group() + self._set_flush_status(False) + + +class KernelGroup: + def __init__(self): + super().__init__() + self.args = KernelArgs() + self.loops_code = BracesBuffer() + self.ws = WorkSharing(self.loops_code) + self.stack = contextlib.ExitStack() + self.stack.enter_context(self.ws) + self.scheduled_nodes = [] + + def new_kernel(self, cls, *args): + return cls(self.args, parallel_num_threads(), *args) + + def finalize_kernel(self, new_kernel, nodes): + self.scheduled_nodes += nodes + code = self.loops_code + ws = self.ws + new_kernel.codegen_loops(code, ws) + + def get_num_args(self): + arg_defs, call_args, arg_types = self.args.cpp_argdefs() + args_num = len(arg_defs) + return args_num + + def codegen_define_and_call(self, wrapper): + self.stack.close() + if not self.scheduled_nodes: + return + + fused_name = ( + get_fused_kernel_name(self.scheduled_nodes, config.cpp.descriptive_names) + if config.cpp.descriptive_names + else "" + ) + kernel_name = "_".join(["cpp", fused_name, wrapper.next_kernel_suffix()]) + arg_defs, call_args, arg_types = self.args.cpp_argdefs() + arg_defs = ",\n".ljust(25).join(arg_defs) + code = BracesBuffer() + # TODO: support kernel profile on other platforms + enable_kernel_profile = ( + config.cpp.enable_kernel_profile and sys.platform == "linux" + ) + if enable_kernel_profile: + code.writelines(["#include "]) + kernel_decl_name = kernel_name if V.graph.cpp_wrapper else "kernel" + code.writeline(codecache.cpp_prefix()) + + code.writeline(f'extern "C" void {kernel_decl_name}({arg_defs})') + with code.indent(): + if enable_kernel_profile: + graph_id = V.graph.graph_id + prefix = "graph_" + str(graph_id) + "_" if graph_id is not None else "" + code.writelines( + [ + f'RECORD_FUNCTION("{prefix + kernel_name}", c10::ArrayRef({{}}));' + ] + ) + for old, new in self.args.aliases(): + code.writeline(f"auto {old} = {new};") + code.splice(self.loops_code) + + codecache_def = IndentedBuffer() + if not V.graph.cpp_wrapper: + codecache_def.writeline(f"async_compile.cpp_pybinding({arg_types!r}, '''") + codecache_def.splice(code) + if not V.graph.cpp_wrapper: + codecache_def.writeline("''')") + + codecache_str = codecache_def.getvalue() + # TODO(voz): Ostensibly, we should not need this. But there are cases where C++ codegen does + # not use BracesBuffer, so we have no good indicator of a C++ buffer atm. 
+ codecache_str = codecache_str.replace("#pragma CMT", "//") + wrapper.define_kernel(kernel_name, codecache_str, cuda=False) + # generate the code to call this + wrapper.generate_kernel_call( + kernel_name, call_args, cuda=False, arg_types=arg_types + ) + + +class CppWrapperKernelGroup(KernelGroup): + def __init__(self): + super().__init__() + self.args = CppWrapperKernelArgs() + + +class WorkSharing: + def __init__(self, code): + self.code = code + self.in_parallel = False + self.num_threads = None + self.stack = contextlib.ExitStack() + + def parallel(self, threads): + if self.in_parallel and threads != self.num_threads: + # wrong number of threads + self.close() + if not self.in_parallel: + self.num_threads = threads + self.in_parallel = True + if config.cpp.dynamic_threads: + self.code.writeline("#pragma omp parallel") + else: + self.code.writeline(f"#pragma omp parallel num_threads({threads})") + self.stack.enter_context(self.code.indent()) + + def single(self): + if self.in_parallel: + self.code.writeline("#pragma omp single") + return self.in_parallel + + def close(self): + self.stack.close() + self.in_parallel = False + + def __enter__(self): + self.stack.__enter__() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.stack.__exit__(exc_type, exc_val, exc_tb) + + +@dataclasses.dataclass +class LoopLevel: + var: Optional[sympy.Expr] = None + size: Optional[sympy.Expr] = None + offset: sympy.Expr = sympy.Integer(0) + steps: sympy.Expr = sympy.Integer(1) + parallel: int = 0 + simd_omp: bool = False + simd_vec: bool = False + collapsed: bool = False + reduction_var_map: Optional[Dict[str, str]] = None + parent: Optional["LoopLevel"] = None + # the next inner level of the loop, empty if it is inner-most + # contains >1 LoopLevel if the inner level of loop is split + inner: List["LoopLevel"] = dataclasses.field(default_factory=list) + # kernel assigned to this loop level, only valid when it is a leaf + kernel: Optional[CppKernel] = None + + def __post_init__(self): + # Regarding the C++/OpenMP backend, `codecache.pick_vec_isa()` to check + # vectorization ISA is a time-consuming and one-shot operation. It leads + # to taking a longer time to import `codegen.cpp` package because the + # `LoopLevel` of the package is decorated by `@dataclasses.dataclass` while + # the decorator will invoke `codecache.pick_vec_isa()` to initialize the + # `simd_nelements` of the `LoopLevel`. It might introduce additional compilation + # overhead to the Triton backend. Therefore, we moved the `simd_nelements` to + # `__post_init__` + picked_vec_isa: codecache.VecISA = codecache.pick_vec_isa() + self.simd_nelements: int = picked_vec_isa.nelements() if picked_vec_isa else 0 + + def get_kernels(self) -> List[CppKernel]: + """Get all kernel objects under this loop level""" + if self.kernel: + return [self.kernel] + kernels = [] + for loop in self.inner: + kernels += loop.get_kernels() + return kernels + + def set_kernel(self, kernel: CppKernel): + """ + Set the kernel under this loop level. No split is allowed under + this loop level. 
+ """ + if not self.inner: + self.kernel = kernel + loop: Optional[LoopLevel] = self + assert loop is not None + if loop.is_reduction(): + loop.reduction_var_map = kernel.reduction_var_map.copy() + loop = loop.parent + while loop is not None and loop.is_reduction(): + assert loop.reduction_var_map is not None + loop.reduction_var_map.update(kernel.reduction_var_map) + loop = loop.parent + return + assert len(self.inner) == 1 + self.inner[0].set_kernel(kernel) + + def get_loops_at(self, depth) -> List["LoopLevel"]: + if depth == 0: + return [self] + else: + loops = [] + for loop in self.inner: + loops += loop.get_loops_at(depth - 1) + return loops + + def is_reduction(self): + return bool(self.reduction_var_map) + + def split_with_tiling(self, depth, factor): + def clone_inner(): + inner = [] + if self.inner: + for loop in self.inner: + inner.append(loop.clone()) + return inner + + def do_split_with_tiling(): + sympy_factor = sympy.Integer(factor) + + offset = FloorDiv(self.size, sympy_factor) * sympy_factor + main_loop = LoopLevel(self.var, offset) + main_loop.steps = sympy_factor + main_loop.parallel = self.parallel + main_loop.collapsed = False + main_loop.reduction_var_map = self.reduction_var_map + main_loop.inner = clone_inner() + if main_loop.inner: + for loop in main_loop.inner: + loop.parent = main_loop + + tail_loop = LoopLevel(self.var, self.size) + tail_loop.offset = offset + tail_loop.parallel = self.parallel + tail_loop.collapsed = False + tail_loop.reduction_var_map = self.reduction_var_map + tail_loop.inner = clone_inner() + if tail_loop.inner: + for loop in tail_loop.inner: + loop.parent = tail_loop + + return main_loop, tail_loop + + if depth == 0: + main_loop, tail_loop = do_split_with_tiling() + parent = self.parent + if parent: + parent.inner = [main_loop, tail_loop] + main_loop.parent = parent + tail_loop.parent = parent + return main_loop, tail_loop + else: + assert len(self.inner) == 1 + return self.inner[0].split_with_tiling(depth - 1, factor) + + def clone(self): + loop = copy(self) + loop.inner = [] + if self.inner: + for inner_loop in self.inner: + inner_loop_clone = inner_loop.clone() + inner_loop_clone.parent = loop + loop.inner.append(inner_loop_clone) + loop.kernel = deepcopy(self.kernel) + return loop + + def lines(self): + offset_expr = cexpr_index(self.offset) + size_expr = cexpr_index(self.size) + if config.cpp.no_redundant_loops and offset_expr == size_expr: + return None + if self.reduction_var_map: + reduction = " " + " ".join( + f"reduction({RTYPE_TO_CPP[rtype]}:{var})" + for var, rtype in self.reduction_var_map.items() + ) + else: + reduction = "" + simd = ( + f"simd simdlen({self.simd_nelements}) " + if self.simd_omp and self.simd_nelements > 1 + else "" + ) + if self.parallel: + # TODO(jansel): look into chunk size and other schedules + line1 = f"#pragma omp for{reduction} " + if self.parallel > 1: + line1 += f" collapse({self.parallel})" + if self.simd_omp: + line1 = line1.replace(" for ", f" for {simd}") + elif self.simd_vec: + line1 = "" + elif self.simd_omp: + line1 = f"#pragma omp {simd}{reduction}" + elif not self.reduction_var_map and codecache.is_gcc(): + line1 = "#pragma GCC ivdep" + else: + line1 = "" + offset_str = f"{INDEX_TYPE} {self.var}={offset_expr}" + size_str = f"{self.var}<{size_expr}" + steps_str = f"{self.var}+={cexpr_index(self.steps)}" + line2 = f"for({offset_str}; {size_str}; {steps_str})" + if self.collapsed or not line1: + return [line2] + return [line1, line2] + + +@dataclasses.dataclass +class LoopNestWithSplit: + """ + 
A loop-nest like structure but with some loop level split along + the loop range into the main tiling loop and the tail. It is built + with the `build` method as a loop nest and then split with + `split_with_tiling` at some depth. + + A typical case is for vectorization where we typically split at the inner-most + loop level. A more complicated case is 2D tiling where we split at + both inner-most and outer levels. + """ + + root: Optional[List[LoopLevel]] = None + kernel: Optional[CppKernel] = None + + @staticmethod + def build(kernel: CppKernel): + """Build a LoopNest with the given `kernel` as the leaf""" + itervars = kernel.itervars + ranges = kernel.ranges + reduction_depth = kernel.reduction_depth + assert reduction_depth is not None + + root: List[LoopLevel] = [] + levels: List[LoopLevel] = root + loop: Optional[LoopLevel] = None + for loop_idx, (var, size) in enumerate(zip(itervars, ranges)): + loop = LoopLevel(var, size, parent=loop) + if loop_idx >= reduction_depth: + loop.reduction_var_map = kernel.reduction_var_map.copy() + levels.append(loop) + levels = loop.inner + loop_nest = LoopNestWithSplit(root) + if loop: + loop.kernel = kernel + else: + loop_nest.kernel = kernel + return loop_nest + + def __bool__(self): + return bool(self.root) + + def get_loops_at(self, depth) -> List[LoopLevel]: + """Get all the loop levels at the given `depth` (most outer loop has depth 0)""" + loops: List[LoopLevel] = [] + assert self.root is not None + for loop in self.root: + loops += loop.get_loops_at(depth) + return loops + + @cache_on_self + def max_parallel_depth(self): + """ + Maximal allowed depth for parallelism: + 1) Levels without splitting and + 2) All reduction or non-reduction levels + When the loop is split at the top level, the max depth is 1. + """ + max_depth = 0 + assert self.root is not None + loops = self.root + if len(loops) > 1: + return 1 + is_reduction = loops[0].is_reduction() if loops else False + while len(loops) == 1 and loops[0].is_reduction() == is_reduction: + max_depth += 1 + loops = loops[0].inner + return max_depth + + def is_reduction_only(self): + """ + Whether all the loops are for reduction. Reduction loops + are always the inner most ones. + """ + return ( + self.root is not None and len(self.root) > 0 and self.root[0].is_reduction() + ) + + def mark_parallel(self, par_depth): + assert ( + par_depth <= self.max_parallel_depth() + ), "Parallel depth cannot exceed the maximal allowed parallel depth" + assert self.root is not None + loops = self.root + for loop in loops: + loop.parallel = par_depth + for i in range(1, par_depth): + loops = loops[0].inner + loops[0].collapsed = True + + def split_with_tiling(self, depth, factor): + """ + Split the loop into main and tail loops at given `depth` so that the range + of the main loop has range `floor_div(range, factor) * factor` and + the tail loop handles the remainder. The main loop is tiled + according to the `factor`. 
+ """ + loops = self.get_loops_at(depth) + assert len(loops) == 1 + split_loops = loops[0].split_with_tiling(0, factor) + if depth == 0: + self.root = split_loops + return split_loops diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/cpp_prefix.h b/MLPY/Lib/site-packages/torch/_inductor/codegen/cpp_prefix.h new file mode 100644 index 0000000000000000000000000000000000000000..bfd9a7add180e3d8759ebe170027ee4106e5890e --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/codegen/cpp_prefix.h @@ -0,0 +1,595 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#if defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_ZVECTOR) +#define INDUCTOR_USE_VECTOR_TYPES() 1 +#else +#define INDUCTOR_USE_VECTOR_TYPES() 0 +#endif + +#if INDUCTOR_USE_VECTOR_TYPES() +#include +#include +#include +#endif + +typedef at::Half half; +typedef at::BFloat16 bfloat16; + +typedef at::Float8_e4m3fn float8_e4m3fn; +typedef at::Float8_e5m2 float8_e5m2; + +template +struct Welford { + T mean = T(0); + T m2 = T(0); + T weight = T(0); +}; + + +template +struct IsVecType: std::false_type {}; + +#if INDUCTOR_USE_VECTOR_TYPES() +template +struct IsVecType>: std::true_type {}; +#endif + +template +Welford welford_combine(const Welford &a, const Welford &b) { + if constexpr (!IsVecType::value) { + if (a.weight == 0) { + return b; + } + if (b.weight == 0) { + return a; + } + } + auto delta = b.mean - a.mean; + auto new_weight = a.weight + b.weight; + auto wb_over_w = b.weight / new_weight; + if constexpr (IsVecType::value) { + // Guard against division by zero + wb_over_w = T::blendv(wb_over_w, T(0), new_weight == T(0)); + } + auto result = Welford{ + a.mean + delta * wb_over_w, + a.m2 + b.m2 + delta * delta * a.weight * wb_over_w, + new_weight + }; + return result; +} + +template +Welford welford_combine(const Welford &acc, T data) { + // Add a single data point + auto delta = data - acc.mean; + auto new_weight = acc.weight + T(1); + auto new_mean = acc.mean + delta / new_weight; + auto new_delta = data - new_mean; + auto result = Welford{ + new_mean, + acc.m2 + delta * new_delta, + new_weight + }; + return result; +} + +// Refer to https://github.com/pytorch/pytorch/blob/b5b36cf0c4e1958f1ff25120f5d4beeef3288187/ +// aten/src/ATen/native/SharedReduceOps.h#L419-L445 +template +inline bool greater_or_nan(scalar_t a, scalar_t b, int64_t idx_a, int64_t idx_b) { + // If (a == b), then choose the one with lower idx, else max(a, b) + if (at::_isnan(a)) { + if (at::_isnan(b)) { + return idx_a < idx_b; + } + return true; + } + return (a == b) ? idx_a < idx_b : (a > b); +} + +template +inline bool less_or_nan(scalar_t a, scalar_t b, int64_t idx_a, int64_t idx_b) { + // If (a == b), then choose the one with lower idx, else min(a, b) + if (at::_isnan(a)) { + if (at::_isnan(b)) { + return idx_a < idx_b; + } + return true; + } + return (a == b) ? 
idx_a < idx_b : (a < b); +} + +#if INDUCTOR_USE_VECTOR_TYPES() +template +inline at::vec::Vectorized vec_shuffle_down(at::vec::Vectorized x, size_t n) { + using Vec = at::vec::Vectorized; + alignas(alignof(Vec)) scalar_t array[Vec::size()]; + x.store(array); + for (size_t i = 0; i + n < Vec::size(); i += 2 * n) { + array[i] = array[i + n]; + } + return Vec::loadu(array); +} + +#ifdef CPU_CAPABILITY_AVX2 +inline at::vec::Vectorized vec_shuffle_down(at::vec::Vectorized x, size_t n) { + using vec_t = at::vec::Vectorized; +#define SHUFFLE_MASK(z, y, x, w) ((z << 6) | (y << 4) | (x << 2) | w) + switch (n) { + case 1: + return vec_t(_mm256_permute_ps(x, SHUFFLE_MASK(1, 1, 3, 3))); + case 2: + return vec_t(_mm256_permute_ps(x, SHUFFLE_MASK(2, 2, 2, 2))); + case 4: + return vec_t(_mm256_permute2f128_ps(x, x, SHUFFLE_MASK(1, 1, 1, 1))); + } + TORCH_CHECK(false, "Unhandled vec_shuffle_down value ", n); +} +#endif + +template +Welford welford_vec_reduce_all(Welford> acc) { + using Vec = at::vec::Vectorized; + for (size_t n = 1; n < Vec::size(); n *= 2) { + auto shuffled = Welford{ + vec_shuffle_down(acc.mean, n), + vec_shuffle_down(acc.m2, n), + vec_shuffle_down(acc.weight, n) + }; + acc = welford_combine(acc, shuffled); + } + + Welford result; + alignas(alignof(Vec)) scalar_t array[Vec::size()]; + acc.mean.store(array); + result.mean = array[0]; + + acc.m2.store(array); + result.m2 = array[0]; + + acc.weight.store(array); + result.weight = array[0]; + + return result; +} +#endif + + +template inline typename std::common_type::type mod(T a, U b) { return a % b; } +template <> inline float mod(float a, float b) { return std::fmod(a, b); } +template <> inline double mod(double a, double b) { return std::fmod(a, b); } + +template +inline scalar_t max_propagate_nan(scalar_t a, scalar_t b) { + if (at::_isnan(a)) { + return a; + } + return a > b ? a : b; +} + +template +inline scalar_t min_propagate_nan(scalar_t a, scalar_t b) { + if (at::_isnan(a)) { + return a; + } + return a < b ? 
a : b; +} + +constexpr float uint32_to_uniform_float(uint32_t value) { + // maximum value such that `MAX_INT * scale < 1.0` (with float rounding) + constexpr float scale = 4.6566127342e-10; + return static_cast(value & 0x7FFFFFFF) * scale; +} + +float normalized_rand_cpu(uint32_t seed, uint32_t offset) { + return uint32_to_uniform_float(at::Philox4_32(seed, 0, offset)()); +} + +float randn_cpu(uint32_t seed, uint32_t offset) { + at::Philox4_32 engine(seed, 0, offset); + return engine.randn(10); +} + +int64_t randint64_cpu(uint32_t seed, uint32_t offset, int64_t low, int64_t high) { + auto gen = at::Philox4_32(seed, 0, offset); + uint64_t r0 = gen(); + uint64_t r1 = gen(); + uint64_t result = r0 | (r1 << 32); + return static_cast(result % (high - low)) + low; +} + +template struct AsIntegerType { typedef T type; }; +template <> struct AsIntegerType { typedef uint32_t type; }; +template <> struct AsIntegerType { typedef uint64_t type; }; +template <> struct AsIntegerType { typedef uint16_t type; }; + +template +typename std::enable_if::value, T>::type +inline fetch_value(volatile T *addr) { + return *addr; +} + +template +typename std::enable_if::value, T>::type +inline fetch_value(volatile T *addr) { + return T(addr->x, T::from_bits()); +} + +template +typename std::enable_if::value>::type +atomic_add(volatile T *addr, T offset) { + typedef typename AsIntegerType::type alt_type; + + static_assert(sizeof(std::atomic) == sizeof(T), + "std::atomic issue"); + + alt_type expected; + + alt_type desired; + + std::atomic *atomic_addr = (std::atomic *)addr; + do { + T val = fetch_value(addr); + reinterpret_cast(&expected)[0] = val; + reinterpret_cast(&desired)[0] = val + offset; + } while (!atomic_addr->compare_exchange_weak(expected, desired, + std::memory_order_relaxed)); +} + +// Since C++20 float is supported by fetch_add, but the performance may not +// better than compare_exchange_weak, which can be checked by microbenchmark +// inductor_cpu_atomic.py +template +typename std::enable_if::value>::type +atomic_add(volatile T *addr, T offset) { + static_assert(sizeof(std::atomic) == sizeof(T), + "std::atomic issue"); + std::atomic *atomic_addr = (std::atomic *)addr; + atomic_addr->fetch_add(offset, std::memory_order_relaxed); +} + +// This function is used to convert bool or uint8 to float mask for +// vectorization. The caller needs to make sure the src represents TRUE/FALSE +// correctly. +template +inline float flag_to_float_scalar(T src) { + float ret; + *(uint32_t*)(&ret) = src ? 
0xFFFFFFFF : 0; + return ret; +} + +#if defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_ZVECTOR) + +inline at::vec::Vectorized masked_load(const float* src, at::vec::Vectorized mask) { +# if defined(CPU_CAPABILITY_AVX512) + at::vec::Vectorized zero_vec(0); + auto all_ones = _mm512_set1_epi32(0xFFFFFFFF); + auto mmask = _mm512_cmp_epi32_mask(_mm512_castps_si512(mask), all_ones, _MM_CMPINT_EQ); + return _mm512_mask_loadu_ps(zero_vec, mmask, src); +# elif defined(CPU_CAPABILITY_AVX2) + auto all_ones = _mm256_set1_epi32(0xFFFFFFFF); + auto mmask = _mm256_cmpeq_epi32(_mm256_castps_si256(mask), all_ones); + return _mm256_maskload_ps(src, mmask); +# elif defined(CPU_CAPABILITY_ZVECTOR) + auto result = at::vec::Vectorized::loadu(src); + return (result & mask); +# else +# error Unsupported vectorization CPU capability +# endif +} + +template +typename std::enable_if::value || std::is_same::value, at::vec::Vectorized>::type +inline masked_load(const T* src, at::vec::Vectorized mask) { +# if defined(CPU_CAPABILITY_AVX512) + auto all_ones = _mm512_set1_epi32(0xFFFFFFFF); + auto mmask = _mm512_cmp_epi32_mask(_mm512_castps_si512(mask), all_ones, _MM_CMPINT_EQ); + auto zero = _mm256_set1_epi16(0); + auto temp = _mm256_mask_loadu_epi16(zero, mmask, src); + return _mm512_inserti32x8(_mm512_castsi256_si512(temp), zero, 1); +# elif defined(CPU_CAPABILITY_AVX2) + auto all_ones = _mm256_set1_epi32(0xFFFFFFFF); + auto mmask_vec = _mm256_cmpeq_epi32(_mm256_castps_si256(mask), all_ones); + __at_align__ uint32_t mmask[8]; + _mm256_storeu_si256(reinterpret_cast<__m256i*>(mmask), mmask_vec); + __at_align__ uint16_t result[16]; + for (auto i = 0; i < 8; i++) { + result[i] = mmask[i] == 0xFFFFFFFF ? src[i].x: uint16_t(0); + } + return at::vec::Vectorized::loadu(result); +# elif defined(CPU_CAPABILITY_ZVECTOR) + auto result = at::vec::Vectorized::loadu(src, 8); + uint32_t maskdata[8] = { 0 }; + uint16_t maskdata_dest[16] = { 0 }; + mask.store(maskdata); + for (auto i = 0; i < 8; i++) { + maskdata_dest[i] = (maskdata[i] == 0xFFFFFFFF) ? 0xFFFF: 0; + } + auto maskvector = at::vec::Vectorized::loadu(maskdata_dest); + return (result & maskvector); +# else +# error Unsupported vectorization CPU capability +# endif +} + +template +typename std::enable_if::value || std::is_same::value, at::vec::Vectorized>::type +inline masked_load(const T* src, at::vec::Vectorized mask) { +# if defined(CPU_CAPABILITY_AVX512) + auto all_ones = _mm512_set1_epi32(0xFFFFFFFF); + auto mmask = _mm512_cmp_epi32_mask(_mm512_castps_si512(mask), all_ones, _MM_CMPINT_EQ); + auto zero = _mm_set1_epi8(0); + auto temp = _mm_mask_loadu_epi8(zero, mmask, src); + return _mm512_inserti64x2(_mm512_set1_epi32(0), temp, 0); +# elif defined(CPU_CAPABILITY_AVX2) + auto all_ones = _mm256_set1_epi32(0xFFFFFFFF); + auto mmask_vec = _mm256_cmpeq_epi32(_mm256_castps_si256(mask), all_ones); + __at_align__ uint32_t mmask[8]; + _mm256_storeu_si256(reinterpret_cast<__m256i*>(mmask), mmask_vec); + __at_align__ T result[32]; + for (auto i = 0; i < 8; i++) { + result[i] = mmask[i] == 0xFFFFFFFF ? src[i]: T(0); + } + return at::vec::Vectorized::loadu(result); +# elif defined(CPU_CAPABILITY_ZVECTOR) + auto result = at::vec::Vectorized::loadu(src, 8); + uint32_t maskdata[8]; + T maskdata_dest[32] = { 0 }; + mask.store(maskdata); + for (auto i = 0; i < 8; i++) { + maskdata_dest[i] = (maskdata[i] == 0xFFFFFFFF) ? 
0xFF: 0; + } + auto maskvector = at::vec::Vectorized::loadu(maskdata_dest); + return (result & maskvector); +# else +# error Unsupported vectorization CPU capability +# endif +} + +template +inline at::vec::Vectorized flag_to_float_vec(const T* src) { + __at_align__ float dst_tmp[at::vec::Vectorized::size()]; + #pragma unroll + for (int64_t i = 0; i < at::vec::Vectorized::size(); i++) { + dst_tmp[i] = flag_to_float_scalar(src[i]); + } + return at::vec::Vectorized::loadu(dst_tmp); +} + +template +inline at::vec::Vectorized cvt_lowp_fp_to_fp32( + at::vec::Vectorized src) { + at::vec::Vectorized res_vec1(0); + at::vec::Vectorized res_vec2(0); + std::tie(res_vec1, res_vec2) = at::vec::convert_to_float(src); + return res_vec1; +} + +template +inline at::vec::Vectorized cvt_fp32_to_lowp_fp( + at::vec::Vectorized src) { + return at::vec::convert_from_float(src, src); +} + +inline at::vec::Vectorized mask_convert_to_float(at::vec::Vectorized src) { + auto zeros = at::vec::Vectorized(0); + auto ones = at::vec::Vectorized(1); + return at::vec::Vectorized::blendv(zeros, ones, src); +} + +template +inline +typename std::enable_if::value || std::is_same::value, at::vec::Vectorized>::type +mask_convert_to_lowp(at::vec::Vectorized src) { + auto fp_vec = mask_convert_to_float(src); + return cvt_fp32_to_lowp_fp(fp_vec); +} + +template +inline at::vec::Vectorized vec_convert_to_mask(at::vec::Vectorized src) { + assert( + at::vec::Vectorized::size() == at::vec::Vectorized::size()); + at::vec::Vectorized res_vec(0); + __at_align__ float dst_tmp[at::vec::Vectorized::size()]; + __at_align__ SRC src_tmp[at::vec::Vectorized::size()]; + src.store(src_tmp); + +#pragma unroll + for (int i = 0; i < at::vec::Vectorized::size(); i++) { + *(uint32_t*)(dst_tmp + i) = src_tmp[i] ? 0xFFFFFFFF : 0; + } + + return res_vec.loadu(dst_tmp); +} + +template +inline at::vec::Vectorized to_float_mask(at::vec::Vectorized src) { + return vec_convert_to_mask(src); +} + +#if defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2) +template <> +inline at::vec::Vectorized to_float_mask(at::vec::Vectorized src) { +#if defined(CPU_CAPABILITY_AVX2) + return at::vec::Vectorized(_mm256_castsi256_ps(src)); +#else + return at::vec::Vectorized(_mm512_castsi512_ps(src)); +#endif +} +#endif + +template <> +inline at::vec::Vectorized to_float_mask(at::vec::Vectorized src) { + return src; +} + +inline at::vec::Vectorized to_float_mask(int src) { + union { + float fmask; + uint32_t imask; + } mask; + mask.imask = src ? 
0xFFFFFFFF : 0; + return at::vec::Vectorized(mask.fmask); +} + +inline bool all_zero(at::vec::Vectorized src) { +# if defined(CPU_CAPABILITY_AVX512) + auto src_int = _mm512_castps_si512(src); + __mmask16 mask = _mm512_test_epi32_mask(src_int, src_int); + return mask == 0; +# elif defined(CPU_CAPABILITY_AVX2) + return _mm256_testz_ps(src, src); +# else + __at_align__ int mask[at::vec::Vectorized::size()]; + src.store(mask); + for (int i = 0; i < at::vec::Vectorized::size(); i++) { + if (mask[i] != 0) { + return false; + } + } + return true; +# endif +} + +inline bool vector_lane_mask_check(at::vec::Vectorized src, int lane) { +# if defined(CPU_CAPABILITY_AVX512) + return _mm512_movepi32_mask(_mm512_castps_si512(src)) & (1 << lane); +# elif defined(CPU_CAPABILITY_AVX2) + return _mm256_movemask_ps(src) & (1 << lane); +# else + __at_align__ int mask[at::vec::Vectorized::size()]; + src.store(mask); + return mask[lane] != 0; +# endif +} + +inline at::vec::Vectorized cvt_int64_to_fp32(at::vec::VectorizedN src) { +# if defined(CPU_CAPABILITY_AVX512) + auto low = _mm512_cvtepi64_ps(src[0]); + auto high = _mm512_cvtepi64_ps(src[1]); + return _mm512_insertf32x8(_mm512_castps256_ps512(low), high, 1); +# elif defined(CPU_CAPABILITY_AVX2) + auto low_double = at::vec::convert_to_fp_of_same_size(src[0]); + auto low = _mm256_cvtpd_ps(low_double); + auto high_double = at::vec::convert_to_fp_of_same_size(src[1]); + auto high = _mm256_cvtpd_ps(high_double); + return _mm256_insertf128_ps(_mm256_castps128_ps256(low), high, 1); +# else + constexpr int float_vec_size = at::vec::Vectorized::size(); + constexpr int int64_vec_size = at::vec::Vectorized::size(); + __at_align__ float result[float_vec_size]; + __at_align__ int64_t src_buf[int64_vec_size]; + for (int i = 0; i < 2; i++) { + src[i].store(src_buf + i * int64_vec_size); + for (int j = 0; j < int64_vec_size; j++) { + result[i * int64_vec_size + j] = static_cast(src_buf[i * int64_vec_size + j]); + } + } + return at::vec::Vectorized::loadu(result); +# endif +} + +inline at::vec::VectorizedN cvt_fp32_to_int64(at::vec::Vectorized src) { + at::vec::VectorizedN result; +# if defined(CPU_CAPABILITY_AVX512) + result[0] = _mm512_cvt_roundps_epi64(_mm512_castps512_ps256(src), _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); + result[1] = _mm512_cvt_roundps_epi64(_mm512_extractf32x8_ps(src, 1), _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); +# elif defined(CPU_CAPABILITY_AVX2) + auto int32_vec = at::vec::convert_to_int_of_same_size(src); + result[0] = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(int32_vec)); + result[1] = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(int32_vec, 1)); +# else + constexpr int float_vec_size = at::vec::Vectorized::size(); + constexpr int int64_vec_size = at::vec::Vectorized::size(); + __at_align__ float src_buf[float_vec_size]; + __at_align__ int64_t result_buf[int64_vec_size]; + src.store(src_buf); + for (int i = 0; i < 2; i++) { + for (int j = 0; j < int64_vec_size; j++) { + result_buf[j] = static_cast(src_buf[i * int64_vec_size + j]); + } + result[i] = at::vec::Vectorized::loadu(result_buf); + } +# endif + return result; +} + +inline at::vec::Vectorized cvt_int64_to_int32(at::vec::VectorizedN src) { +# if defined(CPU_CAPABILITY_AVX512) + auto low = _mm512_cvtepi64_epi32(src[0]); + auto high = _mm512_cvtepi64_epi32(src[1]); + return _mm512_inserti32x8(_mm512_castsi256_si512(low), high, 1); +# elif defined(CPU_CAPABILITY_AVX2) + auto low = _mm256_shuffle_epi32(src[0], _MM_SHUFFLE(2, 0, 2, 0)); + auto high = _mm256_shuffle_epi32(src[1], _MM_SHUFFLE(2, 0, 
2, 0)); + auto low_perm = _mm256_permute4x64_epi64(low, _MM_SHUFFLE(3, 1, 2, 0)); + auto high_perm = _mm256_permute4x64_epi64(high, _MM_SHUFFLE(3, 1, 2, 0)); + return _mm256_blend_epi32(low_perm, high_perm, 0xF0); +# else + constexpr int int32_vec_size = at::vec::Vectorized::size(); + constexpr int int64_vec_size = at::vec::Vectorized::size(); + __at_align__ int32_t result[int32_vec_size]; + __at_align__ int64_t src_buf[int64_vec_size]; + for (int i = 0; i < 2; i++) { + src[i].store(src_buf + i * int64_vec_size); + for (int j = 0; j < int64_vec_size; j++) { + result[i * int64_vec_size + j] = static_cast(src_buf[i * int64_vec_size + j]); + } + } + return at::vec::Vectorized::loadu(result); +# endif +} + +inline at::vec::VectorizedN cvt_int32_to_int64(at::vec::Vectorized src) { + at::vec::VectorizedN result; +# if defined(CPU_CAPABILITY_AVX512) + result[0] = _mm512_cvtepi32_epi64(_mm512_castsi512_si256(src)); + result[1] = _mm512_cvtepi32_epi64(_mm512_extracti32x8_epi32(src, 1)); +# elif defined(CPU_CAPABILITY_AVX2) + result[0] = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(src)); + result[1] = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(src, 1)); +#else + constexpr int int32_vec_size = at::vec::Vectorized::size(); + constexpr int int64_vec_size = at::vec::Vectorized::size(); + __at_align__ int32_t src_buf[int32_vec_size]; + __at_align__ int64_t result_buf[int64_vec_size]; + src.store(src_buf); + for (int i = 0; i < 2; i++) { + for (int j = 0; j < int64_vec_size; j++) { + result_buf[j] = static_cast(src_buf[i * int64_vec_size + j]); + } + result[i] = at::vec::Vectorized::loadu(result_buf); + } +# endif + return result; +} + +inline at::vec::VectorizedN mask_convert_to_int64(at::vec::Vectorized src) { + return cvt_fp32_to_int64(mask_convert_to_float(src)); +} + +inline at::vec::Vectorized to_float_mask(at::vec::VectorizedN src) { + return to_float_mask(cvt_int64_to_int32(src)); +} + +#endif diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/cpp_wrapper_cpu.py b/MLPY/Lib/site-packages/torch/_inductor/codegen/cpp_wrapper_cpu.py new file mode 100644 index 0000000000000000000000000000000000000000..e97b538b3693f910d6630dbf2171807d37cd98b4 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/codegen/cpp_wrapper_cpu.py @@ -0,0 +1,1851 @@ +import functools +import os +import sys +from itertools import count +from typing import List, Optional, Tuple + +import sympy +from sympy import Expr + +import torch +import torch._ops +from .. 
import config, ir + +from ..codecache import CudaKernelParamCache +from ..utils import cache_on_self, sympy_product +from ..virtualized import V +from .common import IndentedBuffer +from .wrapper import EnterSubgraphLine, ExitSubgraphLine, pexpr, WrapperCodeGen + + +class CppWrapperCpu(WrapperCodeGen): + """ + Generates cpp wrapper for running on CPU and calls cpp kernels + """ + + def __init__(self): + if not hasattr(self, "device"): + self.device = "cpu" + super().__init__() + self.declare = "auto " + self.declare_maybe_reference = "decltype(auto) " + self.ending = ";" + self.open_bracket = "{" + self.closed_bracket = "}" + self.comment = "//" + self.namespace = "at::" + self.none_str = "nullptr" if config.abi_compatible else "at::Tensor()" + self.extern_call_ops = set() + self.size = "sizes()" + self.stride = "strides()" + self.cuda = False + self.supports_intermediate_hooks = False + self.outputs_need_copy = set() + self.kernel_callsite_id = count() + self.int_array_id = count() # for int array local variable declarations + self.declared_int_array_vars = set() + self.tmp_tensor_id = count() # for tmp tensor local variable declarations + self.arg_var_id = count() + self.used_cached_devices = set() + self.used_cached_dtypes = set() + self.cached_output_id = count() + self.scalar_to_tensor_id = count() + + from .cpp import cexpr, CppPrinter + + self.expr_printer = cexpr + + # CppPrinter sometimes calls at::native functions which causes problems in + # the ABI-compatible mode. Currently we are hitting this problem when codegen + # Grid computation expressions, but we my need to fix other size computation + # as well. + class GridExprCppPrinter(CppPrinter): + def _print_FloorDiv(self, expr): + x, div = expr.args + x = self.paren(self.doprint(x)) + div = self.paren(self.doprint(div)) + assert expr.is_integer, "Expect integers in GridExprPrinter" + return f"({x}/{div})" + + self.grid_expr_printer = GridExprCppPrinter().doprint + + def generate_kernel_call( + self, + name, + call_args, + grid=None, + device_index=None, + cuda=True, + triton=True, + arg_types=None, + grid_fn: str = "grid", + triton_meta=None, + ): + """ + Generates kernel call code. + + cuda: Defines whether the backend is GPU. Otherwise the backend is CPU. + + triton: Defines whether the GPU backend uses Triton for codegen. + Otherwise it uses the CUDA language for codegen. + Only valid when cuda == True. + """ + if cuda: + return super().generate_kernel_call( + name, + call_args, + grid, + device_index, + cuda, + triton, + arg_types, + grid_fn, + ) + else: + if config.abi_compatible: + assert arg_types is not None and len(call_args) == len( + arg_types + ), "Mismatch call_args and arg_types in generate_kernel_call" + new_args = [] + for idx, arg in enumerate(call_args): + if "*" in arg_types[idx]: + var_name = f"var_{next(self.arg_var_id)}" + self.writeline( + f"auto* {var_name} = get_data_ptr_wrapper({arg});" + ) + new_args.append(f"({arg_types[idx]})({var_name})") + else: + # arg is a scalar + new_args.append(arg) + self.writeline(self.wrap_kernel_call(name, new_args)) + else: + self.writeline(self.wrap_kernel_call(name, call_args)) + + def write_constant(self, name, hashed): + # include a hash so our code cache gives different constants different files + self.header.writeline(f"// {name} {hashed}") + + def write_header(self): + if V.graph.is_const_graph: + # We do not write header for constant graph, it will be written by main module. 
+ return + + if V.graph.aot_mode: + for header_cpp_file in ("interface.cpp", "implementation.cpp"): + with open( + os.path.join( + os.path.dirname(__file__), "aoti_runtime", header_cpp_file + ) + ) as f: + self.header.splice(f.read()) + else: + self.header.splice( + """ + import torch + from torch._inductor.codecache import CppWrapperCodeCache + + cpp_wrapper_src = ( + ''' + """ + ) + + if config.abi_compatible: + if config.c_shim_version == "1": + self.header.splice("#include ") + else: + self.header.splice( + f"#include " + ) + self.header.splice( + """ + #include + #include + #include + """ + ) + if V.graph.aot_mode: + self.header.splice( + """ + #include + """ + ) + else: + self.header.splice( + """ + #include + #include + #include + #include + #include + #include + #include + #include + + #define reinterpret_tensor torch::inductor::_reinterpret_tensor + #define alloc_from_pool torch::inductor::_alloc_from_pool + """ + ) + + self.header.splice("#include ") + + if not V.graph.aot_mode: + self.header.splice( + """ + #include + + using namespace torch::aot_inductor; + """ + ) + + from .memory_planning import ALIGN_BYTES + + # Round up to the nearest multiple of ALIGN_BYTES + # ALIGN_BYTES must be a power of 2 + self.header.splice( + f""" + [[maybe_unused]] static int64_t align(int64_t nbytes) {{ + return (nbytes + {ALIGN_BYTES} - 1) & -{ALIGN_BYTES}; + }} + """ + ) + + def mark_output_type(self): + # mark output type to unwrap tensor back to python scalar + from ..ir import ShapeAsConstantBuffer + + output_is_tensor = dict() + for idx, x in enumerate(V.graph.graph_outputs): + if isinstance(x, ShapeAsConstantBuffer): + output_is_tensor[idx] = False + else: + output_is_tensor[idx] = True + + self.output_is_tensor = output_is_tensor + + def write_prefix(self): + if V.graph.is_const_graph: + # We do not write prefix for constant graph, it will be written by main module. 
+ return + + if V.graph.aot_mode: + self.prefix.writeline("namespace torch {") + self.prefix.writeline("namespace aot_inductor {") + + def write_input_output_info( + self, + info_kind: str, + idx: int, + name: str, + ): + self.prefix.writeline(f"""{info_kind}[{idx}].name = "{name}";""") + + @staticmethod + def get_input_cpp_type(input): + assert config.use_minimal_arrayref_interface + from .cpp import DTYPE_TO_CPP + + if isinstance(input, sympy.Expr): + from ..graph import may_get_constant_buffer_dtype + + dtype = may_get_constant_buffer_dtype(input) + assert dtype is not None, f"Failed to get the dtype of sympy.Expr: {input}" + return DTYPE_TO_CPP[dtype] + return f"ArrayRefTensor<{DTYPE_TO_CPP[input.get_dtype()]}>" + + def write_wrapper_decl(self): + inputs_len = len(V.graph.graph_inputs.keys()) + if V.graph.aot_mode: + if config.use_minimal_arrayref_interface and not V.graph.is_const_graph: + from .cpp import DTYPE_TO_CPP + + input_cpp_types = ", ".join( + f"{CppWrapperCpu.get_input_cpp_type(x)}" + for x in V.graph.graph_inputs.values() + ) + + output_arrayref_types = ", ".join( + f"ArrayRefTensor<{DTYPE_TO_CPP[x.get_dtype()]}>" + for x in V.graph.graph_outputs + ) + + self.prefix.splice( + f""" + using AOTInductorModelInputs = std::tuple<{input_cpp_types}>; + using AOTInductorModelOutputs = std::tuple<{output_arrayref_types}>; + """ + ) + + if V.graph.const_module: + self.header.splice(V.graph.const_module.wrapper_code.header) + self.prefix.splice(V.graph.const_code) + + if V.graph.is_const_graph: + self.prefix.splice( + """ + void AOTInductorModel::_const_run_impl( + std::vector& output_handles, + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor + ) { + """ + ) + else: + if not config.aot_inductor.use_runtime_constant_folding: + # If we do not split the constant graph, we'll just create + # an empty implementation when wrapping the main module. + self.prefix.splice( + """ + void AOTInductorModel::_const_run_impl( + std::vector& output_handles, + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor + ) {} + + """ + ) + + run_impl_proto = """ + void AOTInductorModel::run_impl( + AtenTensorHandle* + input_handles, // array of input AtenTensorHandle; handles + // are stolen; the array itself is borrowed + AtenTensorHandle* + output_handles, // array for writing output AtenTensorHandle; handles + // will be stolen by the caller; the array itself is + // borrowed + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor + ) { + """ + if config.use_minimal_arrayref_interface: + self.prefix.splice( + """ + template <> + AOTInductorModelOutputs AOTInductorModel::run_impl_minimal_arrayref_interface< + AOTInductorModelInputs, AOTInductorModelOutputs>( + const AOTInductorModelInputs& inputs, + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor + ) { + """ + ) + self.suffix.splice(run_impl_proto) + self.suffix.splice( + """ + AOTInductorModelInputs inputs; + convert_handles_to_inputs(input_handles, inputs); + auto outputs = run_impl_minimal_arrayref_interface( + inputs, stream, proxy_executor); + // NOTE: outputs is full of ArrayRef to thread_local storage. If in the future we need this + // interface to perform well for a DSO using the minimal arrayref interface, all we need + // to do is provide ThreadLocalCachedTensor for each one! 
+ convert_outputs_to_handles(outputs, output_handles); + } + """ + ) + + self.suffix.splice( + """ + extern "C" AOTIRuntimeError AOTInductorModelRunMinimalArrayrefInterface( + AOTInductorModelHandle model_handle, + const AOTInductorModelInputs& inputs, + AOTInductorModelOutputs& outputs) { + auto model = reinterpret_cast(model_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + outputs = model->run_impl_minimal_arrayref_interface( + inputs, + (torch::aot_inductor::DeviceStreamType)nullptr, + nullptr); + }) + } + """ + ) + else: + self.prefix.splice(run_impl_proto) + else: + self.prefix.splice( + """ + void inductor_entry_impl( + AtenTensorHandle* + input_handles, // array of input AtenTensorHandle; handles + // are stolen; the array itself is borrowed + AtenTensorHandle* + output_handles // array for writing output AtenTensorHandle; handles + // will be stolen by the caller; the array itself is + // borrowed) + ) { + """ + ) + with self.prefix.indent(): + # assign inputs and outputs in both cases so the later codegen can be simplified + if not config.use_minimal_arrayref_interface: + if not V.graph.is_const_graph: + if V.graph.aot_mode: + num_args = len(V.graph.graph_inputs) + else: + # Weights are promoted in the JIT mode + num_args = len(V.graph.graph_inputs) + len(V.graph.constants) + self.prefix.splice( + """ + pybind11::gil_scoped_release release; + """ + ) + + if config.abi_compatible: + self.prefix.splice( + f""" + auto inputs = steal_from_raw_handles_to_raii_handles(input_handles, {num_args}); + """ + ) + else: + # This looks dumb, but can avoid creating two versions of code in the AOTInductor runtime. + self.prefix.splice( + f""" + auto inputs = alloc_tensors_by_stealing_from_handles(input_handles, {num_args}); + """ + ) + + if inputs_len != 0: + for idx, input_key in enumerate(V.graph.graph_inputs.keys()): + if config.use_minimal_arrayref_interface: + self.prefix.writeline( + f"auto {input_key} = std::get<{idx}>(inputs);" + ) + continue + # unwrap input tensor back to scalar + if isinstance(V.graph.graph_inputs[input_key], sympy.Expr): + from ..graph import may_get_constant_buffer_dtype + from .cpp import DTYPE_TO_CPP + + dtype = may_get_constant_buffer_dtype( + V.graph.graph_inputs[input_key] + ) + assert ( + dtype is not None + ), "Fails to get the dtype of the sympy.Expr" + cpp_dtype = DTYPE_TO_CPP[dtype] + if config.abi_compatible: + self.prefix.writeline(f"{cpp_dtype} {input_key};") + dtype_str = str(dtype).split(".")[-1] + self.prefix.writeline( + f"aoti_torch_item_{dtype_str}(inputs[{idx}], &{input_key});" + ) + else: + self.prefix.writeline( + f"{cpp_dtype} {input_key} = inputs[{idx}].item<{cpp_dtype}>();" + ) + else: + self.prefix.writeline( + f"auto {input_key} = std::move(inputs[{idx}]);" + ) + + assert all( + isinstance(v, torch.Tensor) for v in list(V.graph.constants.values()) + ), "Expect all constants to be Tensor" + for idx, constants_key in enumerate(V.graph.constants.keys()): + if V.graph.aot_mode: + # Weights are stored in constants_ and owned by RAIIAtenTensorHandle there. + # Don't call std::move here because it will cause constants_ to lose the ownership. 
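+ # For example, with a hypothetical constant named "L__self___weight" at index 0,
+ # the ABI-compatible branch below emits
+ #   auto L__self___weight = constants_->at(0);
+ # while the non-ABI branch dereferences the handle instead:
+ #   auto L__self___weight = *tensor_handle_to_tensor_pointer(constants_->at(0));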
+ if config.abi_compatible: + self.prefix.writeline( + f"""auto {constants_key} = constants_->at({idx});""" + ) + else: + self.prefix.writeline( + f"auto {constants_key} = *tensor_handle_to_tensor_pointer(" + + f"""constants_->at({idx}));""" + ) + else: + # Append constants as inputs to the graph + constants_idx = inputs_len + idx + self.prefix.writeline( + f"auto {constants_key} = inputs[{constants_idx}];" + ) + + self.codegen_inputs(self.prefix, V.graph.graph_inputs) + + if V.graph.aot_mode: + if not V.graph.is_const_graph: + if config.use_minimal_arrayref_interface: + # TODO: input shape checking for regular tensor interface as well? + self.codegen_input_numel_asserts() + else: + self.prefix.writeline("inputs.clear();") + self.prefix.writeline( + "auto& kernels = static_cast(*this->kernels_.get());" + ) + + def codegen_input_numel_asserts(self): + for name, buf in V.graph.graph_inputs.items(): + if isinstance(buf, sympy.Expr): + continue + + # comparing strides for 0 size tensor is tricky. Ignore them for now. + if sympy_product(buf.get_size()) == 0: + continue + numel = buf.get_numel() + self.prefix.writeline(f"assert_numel({name}, {numel});") + + def codegen_input_size_var_decl(self, code: IndentedBuffer, name): + if config.abi_compatible: + code.writeline(f"int64_t* {name}_size;") + code.writeline( + f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_sizes({name}, &{name}_size));" + ) + else: + super().codegen_input_size_var_decl(code, name) + + def codegen_input_stride_var_decl(self, code: IndentedBuffer, name): + if config.abi_compatible: + code.writeline(f"int64_t* {name}_stride;") + code.writeline( + f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_strides({name}, &{name}_stride));" + ) + else: + super().codegen_input_stride_var_decl(code, name) + + def codegen_model_kernels(self): + self.prefix.writeline("namespace {") + self.prefix.writeline( + "class AOTInductorModelKernels : public AOTInductorModelKernelsBase {" + ) + self.prefix.writeline(" public:") + declare_kernel = set(self.src_to_kernel.values()) + declare_kernel.update( + entry[0] for entry in self.user_defined_kernel_cache.values() + ) + if V.graph.const_module: + declare_kernel.update( + V.graph.const_module.wrapper_code.src_to_kernel.values() + ) + for kernel in declare_kernel: + self.prefix.writeline(f" CUfunction {kernel}{{nullptr}};") + self.prefix.writeline("};") + self.prefix.writeline("} // namespace") + + def codegen_model_constructor(self): + """ + // Generated code example + AOTInductorModel::AOTInductorModel() + : AOTInductorModelBase(4, 1) { + inputs_info_[0].name = "input0"; + inputs_info_[0].dtype = "torch.float16"; + ... + constants_info_[0].name = "L__self___weight"; + constants_info_[0].dtype = at::kFloat; + constants_info_[0].offset = 0; + constants_info_[0].data_size = 8192; + constants_info_[0].shape = {64, 32}; + constants_info_[0].stride = {32, 1}; + ... 
+ outputs_info_[0].name = "output0"; + outputs_info_[0].dtype = "torch.float16"; + } + """ + + num_inputs = len(V.graph.graph_inputs) + num_outputs = len(V.graph.graph_outputs) + num_constants = len(V.graph.constants) + self.prefix.splice( + f""" + AOTInductorModel::AOTInductorModel(std::shared_ptr constants_map, + std::shared_ptr> constants_array, + const std::string& device_str, + std::optional cubin_dir) + : AOTInductorModelBase({num_inputs}, {num_outputs}, {num_constants}, device_str, cubin_dir) {{ + """ + ) + + with self.prefix.indent(): + for idx, (name, inp) in enumerate(V.graph.graph_inputs.items()): + assert not isinstance( + inp, sympy.Expr + ), f"input {name=} cannot be symbolic" + self.write_input_output_info("inputs_info_", idx, name) + + for idx, (name, tensor) in enumerate(V.graph.constants.items()): + assert isinstance(tensor, torch.Tensor) + self.prefix.writeline(f"""constants_info_[{idx}].name = "{name}";""") + self.prefix.writeline( + f"constants_info_[{idx}].dtype = static_cast({self.codegen_dtype(tensor.dtype)});" + ) + self.prefix.writeline( + f"constants_info_[{idx}].offset = {tensor.storage_offset()};" + ) + self.prefix.writeline( + f"constants_info_[{idx}].data_size = {tensor.untyped_storage().nbytes()};" + ) + from_folded = "true" if name in V.graph.folded_constants else "false" + self.prefix.writeline( + f"constants_info_[{idx}].from_folded = {from_folded};" + ) + + size_str = ", ".join([str(s) for s in tensor.size()]) + self.prefix.writeline(f"constants_info_[{idx}].shape = {{{size_str}}};") + + stride_str = ", ".join([str(s) for s in tensor.stride()]) + self.prefix.writeline( + f"constants_info_[{idx}].stride = {{{stride_str}}};" + ) + if name in V.graph.dynamo_flat_name_to_original_fqn: + original_fqn = V.graph.dynamo_flat_name_to_original_fqn.get( + name, name + ) + elif name in V.graph.allocated_constant_name: + original_fqn = V.graph.allocated_constant_name[name] + else: + raise AssertionError("original_fqn must be set for constant") + self.prefix.writeline( + f"""constants_info_[{idx}].original_fqn = "{original_fqn}";""" + ) + self.prefix.writeline("update_constants_map(std::move(constants_map));") + self.prefix.writeline("update_constants_array(std::move(constants_array));") + + def escape_string(x): + return ( + x.replace("\\", "\\\\") + .replace('"', '\\"') + .replace("\n", "\\n") + .replace("\t", "\\t") + ) + + self.prefix.writeline( + f'in_spec_ = "{escape_string(config.aot_inductor.serialized_in_spec)}";' + ) + self.prefix.writeline( + f'out_spec_ = "{escape_string(config.aot_inductor.serialized_out_spec)}";' + ) + + for idx, output in enumerate(V.graph.graph_outputs): + assert not isinstance( + output, sympy.Expr + ), f"output {name=} cannot be symbolic" + name = f"output{idx}" + self.write_input_output_info("outputs_info_", idx, name) + + self.prefix.writeline( + "this->kernels_ = std::make_unique();" + ) + + self.prefix.writeline("}") + + def codegen_const_run_driver(self): + """ + // Generated code example + std::unordered_map AOTInductorModel::const_run_impl( + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor, + bool initialization + ) { + std::unordered_map folded_constants_map; + std::vector output_handles; + // build up output_handles over here. 
+ _const_run_impl(output_handles, stream, proxy_executor); + // build up folded_constants_map + return folded_constants_map; + } + """ + + self.prefix.splice( + """ + std::unordered_map AOTInductorModel::const_run_impl( + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor, + bool initialization + ) { + """ + ) + if not config.aot_inductor.use_runtime_constant_folding: + self.prefix.splice( + """ + if (!initialization) { + std::cerr << "[WARNING] Calling constant_folding in model, but compiled with config: " + << "aot_inductor.use_runtime_constant_folding=False\\n"; + } + return {}; + } + """ + ) + return + + with self.prefix.indent(): + # This is a mapping to the index of constant folding graph's output + const_index_mapping: List[Optional[Tuple[int, str]]] = [None] * len( + V.graph.const_output_index + ) + for idx, (name, _) in enumerate(V.graph.constants.items()): + if name in V.graph.const_output_index: + const_index_mapping[V.graph.const_output_index[name]] = (idx, name) # type: ignore[call-overload] + assert ( + None not in const_index_mapping + ), "Not all constant gets mapped for constant folding graph." + + self.prefix.writeline( + f""" + std::unordered_map folded_constants_map; + folded_constants_map.reserve({len(const_index_mapping)}); + std::vector output_handles({len(const_index_mapping)}); + """ + ) + + self.prefix.splice( + """ + // The below assignment of output_handles to constants is not used directly. + // It's only used to memo the correspondence of handle and constants. + """ + ) + + for output_idx, (const_idx, _) in enumerate(const_index_mapping): # type: ignore[misc] + self.prefix.writeline( + f"output_handles[{output_idx}] = constants_->at({const_idx});" + ) + + self.prefix.writeline( + "_const_run_impl(output_handles, stream, proxy_executor);" + ) + + for output_idx, (_, const_name) in enumerate(const_index_mapping): # type: ignore[misc] + self.prefix.writeline( + f'folded_constants_map["{const_name}"] = output_handles[{output_idx}];' + ) + self.prefix.writeline("return folded_constants_map;") + + self.prefix.writeline("}") + + def generate(self, is_inference): + if V.graph.aot_mode and not V.graph.is_const_graph: + self.codegen_model_kernels() + self.codegen_model_constructor() + self.codegen_const_run_driver() + self.write_wrapper_decl() + return super().generate(is_inference) + + def finalize_prefix(self): + cached_dtypes_buffer = IndentedBuffer() + if config.abi_compatible: + for dtype in self.used_cached_dtypes: + cached_dtypes_buffer.writeline(f"CACHE_TORCH_DTYPE({dtype});") + for device in self.used_cached_devices: + cached_dtypes_buffer.writeline(f"CACHE_TORCH_DEVICE({device});") + cached_dtypes_buffer.splice(self.prefix) + self.prefix = cached_dtypes_buffer + + def define_kernel( + self, name: str, kernel: str, metadata: Optional[str] = None, cuda=False + ): + self.header.splice(f"\n{kernel}\n") + + def codegen_scalar_to_tensor(self, output: str): + name = f"scalar_to_tensor_{next(self.scalar_to_tensor_id)}" + self.wrapper_call.writeline( + f"RAIIAtenTensorHandle {name} = scalar_to_tensor_handle({output});" + ) + return name + + @cache_on_self + def get_output_refs(self): + return [ + f"torch::tensor({x.codegen_reference(self.wrapper_call)})" + if isinstance(x, ir.ShapeAsConstantBuffer) and not config.abi_compatible + else x.codegen_reference(self.wrapper_call) + for x in V.graph.graph_outputs + ] + + def generate_return(self, output_refs): + cst_names = V.graph.constants.keys() + arr_iface = ( + not V.graph.is_const_graph and 
config.use_minimal_arrayref_interface + ) # For brevity. + + def use_thread_local_cached_output_tensor(idx, output): + cached_output_name = f"cached_output_{next(self.cached_output_id)}" + cache_type = "Array" if arr_iface else "Tensor" + self.wrapper_call.writeline( + f"thread_local ThreadLocalCachedOutput{cache_type}> " + f"{cached_output_name}({output});" + ) + if arr_iface: + self.wrapper_call.writeline( + f"{cached_output_name}.copy_data_from({output});" + ) + output_entry = f"std::get<{idx}>(output_arrayref_tensors)" + element_type = f"std::decay_t" + self.wrapper_call.writeline( + f"{output_entry} = {cached_output_name}.arrayref_tensor<{element_type}>();" + ) + else: + self.wrapper_call.writeline( + f"{cached_output_name}.copy_data_from({output});" + ) + self.wrapper_call.writeline( + f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_new_uninitialized_tensor(&output_handles[{idx}]));" + ) + self.wrapper_call.writeline( + f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_assign_tensors({cached_output_name}.tensor(), " + f"output_handles[{idx}]));" + ) + + if arr_iface: + self.wrapper_call.writeline( + "AOTInductorModelOutputs output_arrayref_tensors;" + ) + for idx, output in enumerate(output_refs): + if config.abi_compatible: + output_buffer = V.graph.graph_outputs[idx] + if isinstance(output_buffer, ir.ShapeAsConstantBuffer): + # Need to wrap scalar into tensor as the main function returns a vector of tensors + output_tensor = self.codegen_scalar_to_tensor(output) + self.wrapper_call.writeline( + f"output_handles[{idx}] = {output_tensor}.release();" + ) + continue + + output_is_tensor_handle_expr = ( + f"std::is_same_v," + "RAIIAtenTensorHandle> || " + f"std::is_same_v," + "AtenTensorHandle> || " + f"std::is_same_v," + "ConstantHandle>" + ) + self.wrapper_call.writeline( + f"if constexpr ({output_is_tensor_handle_expr}) {{" + ) + with self.wrapper_call.indent(): + if arr_iface: + cached_output_name = ( + f"cached_output_{next(self.cached_output_id)}" + ) + output_value_type = f"std::decay_t(output_arrayref_tensors).data()[0])>" + self.wrapper_call.writeline( + f"thread_local RAIIAtenTensorHandle {cached_output_name};" + ) + if output in cst_names: + # NOTE(return_constant): In some rare cases where we return + # a constant, we have to return a copy of this constant, + # because (1) constants are not owned by the Model instance + # (2) constants remain the same cross inference runs, + # assuming they are not updated at runtime Basically, we + # cannot release or transfer the ownership of any original + # constant to the user. + self.wrapper_call.writeline( + f"AtenTensorHandle {cached_output_name}_tmp;" + ) + self.wrapper_call.writeline( + f"aoti_torch_clone({output}, &{cached_output_name}_tmp);" + ) + self.wrapper_call.writeline( + f"{cached_output_name} = {cached_output_name}_tmp;" + ) + else: + self.wrapper_call.writeline( + f"{cached_output_name} = {output}.release();" + ) + self.wrapper_call.writeline( + f"convert_handle_to_arrayref_tensor({cached_output_name}, " + f"std::get<{idx}>(output_arrayref_tensors));" + ) + else: + if output in cst_names: + # See NOTE(return_constant) above. 
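+ # Constants are not owned by the model instance, so hand the caller a fresh
+ # clone here; non-constant outputs can simply release their handle into
+ # output_handles (see the else branch below).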
+ self.wrapper_call.writeline( + f"aoti_torch_clone({output}, &output_handles[{idx}]);" + ) + else: + self.wrapper_call.writeline( + f"output_handles[{idx}] = {output}.release();" + ) + self.wrapper_call.writeline("} else {") + with self.wrapper_call.indent(): + use_thread_local_cached_output_tensor(idx, output) + self.wrapper_call.writeline("}") + + else: + assert ( + not arr_iface + ), "minimal ArrayRef interface is only supported in ABI-compatible mode" + if output in cst_names: + output_expr = f"{output}.clone()" + # See NOTE(return_constant) above. + else: + output_expr = output + self.wrapper_call.writeline( + f"output_handles[{idx}] = reinterpret_cast(" + + f"new at::Tensor({output_expr}));" + ) + if arr_iface: + self.wrapper_call.writeline("return output_arrayref_tensors;") + + def generate_before_suffix(self, result): + if not V.graph.is_const_graph: + if V.graph.aot_mode: + result.writeline("} // AOTInductorModel::run_impl") + else: + result.writeline("} // inductor_entry_impl") + + def generate_end(self, result): + if V.graph.aot_mode: + if V.graph.is_const_graph: + result.writeline("} // AOTInductorModel::_const_run_impl") + else: + result.writeline("} // namespace aot_inductor") + result.writeline("} // namespace torch") + return + + result.writeline("'''\n)") + result.splice( + f""" + inductor_entry = CppWrapperCodeCache.load_pybinding( + ["std::vector"], cpp_wrapper_src, {self.cuda}, {len(V.graph.graph_outputs)}) + """ + ) + + # unwrap output tensor back to python scalar + if all(x for x in self.output_is_tensor.values()): + # If no ShapeAsConstantBuffer in the output, directly return the output as tensors + return_str = "return f(args_tensor)" + else: + outputs = [ + f"outputs[{i}]" if self.output_is_tensor[i] else f"outputs[{i}].item()" + for i in range(len(V.graph.graph_outputs)) + ] + outputs_str = f"[{', '.join(outputs)}]" + return_str = f""" + outputs = f(args_tensor) + return {outputs_str} + """ + + args_str = "args_tensor = [arg if isinstance(arg, torch.Tensor) else torch.tensor(arg) for arg in args]" + if V.graph.constants: + # Append constants to the input args for cpp wrapper. + # Python wrapper directly gets the value inside the wrapper call + # as a global variable passed when calling exec(code, mod.__dict__, mod.__dict__). + # For cpp wrapper, we need to pass this python value to the inductor_entry_impl function explicitly. + assert all( + isinstance(v, torch.Tensor) for v in list(V.graph.constants.values()) + ), "Expect all constants to be Tensor" + constants_str = f"[{', '.join(V.graph.constants.keys())}]" + args_str += f""" + constants_tensor = {constants_str} + args_tensor.extend(constants_tensor) + """ + + # Wrap the func to support setting result._boxed_call = True + result.splice( + f""" + def _wrap_func(f): + def g(args): + {args_str} + {return_str} + return g + call = _wrap_func(inductor_entry) + """ + ) + + def generate_c_shim_extern_kernel_call(self, kernel, args): + # In the abi_compatible mode, we call fallback aten ops through a C shim layer + self.allow_stack_allocation = False + kernel_tokens = kernel.split("::") + kernel_suffix = kernel_tokens[-1] + if kernel_suffix == "call": + kernel_suffix = kernel_tokens[-2] + if config.c_shim_version == "1": + shim_fn = f"aoti_torch_{kernel_suffix}" + else: + shim_fn = f"aoti_torch_{self.device}_{kernel_suffix}" + + # HACK: val_to_arg_str jams multiple arguments together using a comma. 
If that + # ever breaks, it needs to be reworked to be able to return multiple arguments, + # and the split-on-comma code here needs to be removed. + wrapped_args = [] + for x in args: + pieces = x.split(", ") + for piece in pieces: + # We only really *need* convert_arrayref_tensor_to_tensor for + # ArrayRefTensors. The code flowing into here uses `0` for nullptr, + # which convert_arrayref_tensor_to_tensor would blindly coerce to int, + # so just avoid wrapping integers. + if not piece.isdigit(): + piece = f"convert_arrayref_tensor_to_tensor({piece})" + wrapped_args.append(piece) + self.writeline( + f"AOTI_TORCH_ERROR_CODE_CHECK({shim_fn}({', '.join(wrapped_args)}));" + ) + + def generate_c_shim_extern_kernel_alloc(self, extern_kernel, args): + # registered output buffer name + name = extern_kernel.name + output_handle_name = f"{name}_handle" + self.writeline(f"AtenTensorHandle {output_handle_name};") + output_arg = f"&{output_handle_name}" + self.generate_c_shim_extern_kernel_call( + extern_kernel.get_kernel_name(), args + [output_arg] + ) + self.writeline(f"RAIIAtenTensorHandle {name}({output_handle_name});") + + def generate_extern_kernel_alloc(self, extern_kernel, args): + if config.abi_compatible: + self.generate_c_shim_extern_kernel_alloc(extern_kernel, args) + else: + super().generate_extern_kernel_alloc(extern_kernel, args) + + def generate_c_shim_fallback_kernel(self, fallback_kernel, args): + output_args = [] + output_raii_handles = [] + output_name_base = fallback_kernel.get_name() + for idx, output in enumerate(fallback_kernel.outputs): + if isinstance(output, ir.MultiOutput): + name = f"{output.get_name()}" + output_handle_name = f"{name}_handle" + if output.indices: + assert ( + output.indices[0][1] == idx + ), f"expected {output.indices[0][1]=} == {idx=} for {output_name_base=}" + self.writeline(f"AtenTensorHandle {output_handle_name};") + output_args.append(f"&{output_handle_name}") + output_raii_handles.append( + f"RAIIAtenTensorHandle {name}({output_handle_name});" + ) + elif isinstance(output, int): + output_name = f"{output_name_base}_{idx}" + self.writeline(f"int64_t {output_name} = {output};") + output_args.append(f"&{output_name}") + elif output is None: + output_args.append("nullptr") + else: + raise NotImplementedError("unsupported type of {output=}") + args = args + output_args + assert ( + fallback_kernel.abi_compatible_kernel is not None + ), f"abi_compatible_kernel is None for {fallback_kernel.python_kernel_name=}" + self.generate_c_shim_extern_kernel_call( + fallback_kernel.abi_compatible_kernel, args + ) + for raii_handle in output_raii_handles: + self.writeline(raii_handle) + + def generate_fallback_kernel(self, fallback_kernel, args): + if config.abi_compatible: + self.generate_c_shim_fallback_kernel(fallback_kernel, args) + else: + super().generate_fallback_kernel(fallback_kernel, args) + + def generate_extern_kernel_out(self, output_view, codegen_reference, args, kernel): + if output_view: + output_as_strided = f"{output_view.codegen_reference()}" + output_name = f"{output_view.get_name()}_as_strided" + self.writeline(f"auto {output_name} = {output_as_strided};") + + args.insert(0, output_name) + else: + args.insert(0, f"{codegen_reference}") + + if config.abi_compatible: + self.generate_c_shim_extern_kernel_call(kernel, args) + else: + self.writeline(self.wrap_kernel_call(kernel, args)) + + def generate_user_defined_triton_kernel( + self, kernel_name, grid, configs, args, triton_meta + ): + assert len(grid) != 0 + if len(grid) == 1: + grid_decision = 
grid[0] + else: + meta = CudaKernelParamCache.get(kernel_name) + assert meta is not None + grid_decision = None + for i, c in enumerate(configs): + if all(arg == meta["meta"][key] for key, arg in c.kwargs.items()): + grid_decision = grid[i] + break + assert grid_decision is not None + + self.generate_kernel_call( + kernel_name, + args, + grid=grid_decision, + device_index=V.graph.scheduler.current_device.index, + cuda=True, + triton=True, + triton_meta=triton_meta, + ) + + def generate_scatter_fallback( + self, output, inputs, kernel, python_kernel_name, src_is_tensor, reduce, kwargs + ): + # TODO: support other overload for cpp wrapper and remove the below assertions + if config.abi_compatible: + # call the ABI shim function instead of the ATen one + kernel = kernel.replace("at::", "aoti_torch_") + line = f"{kernel}({output}, {','.join(map(str, inputs))}" + if python_kernel_name == "aten.scatter_": + if src_is_tensor: + if reduce: + line += f", {V.graph.wrapper_code.val_to_arg_str(reduce)}" + else: + assert ( + reduce is None + ), "Expect reduce to be None for aten.scatter_ with scalar src" + else: + line += f", {','.join(kwargs)}" + line += f"){self.ending}" + self.writeline(line) + + def generate_index_put_fallback(self, kernel, x, indices, values, accumulate): + if V.graph.aot_mode and V.graph.cpp_wrapper and config.abi_compatible: + # See the comment in codegen_reinterpret_view about why having something like + # RAIIAtenTensorHandle(tmp_tensor_handle_2) in a tmp array can cause the correponding + # tensor prematurely deallocated, thus this std::vector().data() trick here. + indices_str = ( + f"std::vector{{{', '.join(indices)}}}.data()" + ) + args = [x, indices_str, str(len(indices)), values, accumulate] + else: + indices_str = ( + f"{self.open_bracket}{', '.join(indices)}{self.closed_bracket}" + ) + args = [x, indices_str, values, accumulate] + + args.insert(0, x) # set x as the output tensor, this fallback mutates x. + self.writeline(self.wrap_kernel_call(kernel, args)) + + def add_benchmark_harness(self, output): + if V.graph.aot_mode: + return + super().add_benchmark_harness(output) + + def codegen_sizevar(self, x: Expr) -> str: + return self.expr_printer(V.graph.sizevars.simplify(x)) + + def codegen_tuple_access(self, basename: str, name: str, index: str) -> str: + if config.abi_compatible: + # in the abi_compatible mode, outputs are returned via arguments + return name + else: + return f"std::get<{index}>({basename})" + + def codegen_shape_tuple(self, shape: Tuple[Expr, ...]) -> str: + parts = list(map(self.codegen_sizevar, shape)) + if len(parts) == 0: + return "{}" + if len(parts) == 1: + return f"{{{parts[0]}, }}" + return f"{{{', '.join(parts)}}}" + + def codegen_dynamic_scalar(self, node): + from .cpp import DTYPE_TO_ATEN, DTYPE_TO_CPP + + (data,) = (t.codegen_reference() for t in node.inputs) + if config.abi_compatible: + dtype = node.inputs[0].get_dtype() + dtype_str = str(dtype).split(".")[-1] + self.writeline(f"{DTYPE_TO_CPP[dtype]} {node.sym};") + self.writeline(f"aoti_torch_item_{dtype_str}({data}, &{node.sym});") + # record in unbacked_symbol_decls so we won't generate a declaration of the symbol again + self.unbacked_symbol_decls.add(str(node.sym)) + else: + if node.is_bool: + self.writeline(f"bool {node.sym} = {data}.item() ? 
1 : 0;") + else: + convert_type = DTYPE_TO_ATEN[node.inputs[0].get_dtype()].replace( + "at::k", "to" + ) + self.writeline(f"auto {node.sym} = {data}.item().{convert_type}();") + + def can_stack_allocate_buffer(self, buffer): + return ( + self.allow_stack_allocation + and buffer.get_device().type == "cpu" + and self.can_prove_buffer_has_static_shape(buffer) + and ir.is_contiguous_strides_for_shape( + buffer.get_stride(), buffer.get_size() + ) + ) + + def make_buffer_free(self, buffer): + return ( + "" + if isinstance(buffer.get_layout(), ir.MultiOutputLayout) + or (V.graph.aot_mode and buffer.get_name() in self.stack_allocated_buffers) + or ( + config.use_minimal_arrayref_interface + and V.graph.aot_mode + and buffer.get_name() in V.graph.graph_inputs + ) + else f"{buffer.get_name()}.reset();" + ) + + def make_free_by_names(self, names_to_del: List[str]): + return " ".join(f"{name}.reset();" for name in names_to_del) + + def codegen_exact_buffer_reuse(self, old_name: str, new_name: str, del_line: str): + if config.abi_compatible: + return f"auto {new_name} = std::move({old_name}); // reuse" + else: + return super().codegen_exact_buffer_reuse(old_name, new_name, del_line) + + def generate_profiler_mark_wrapper_call(self, stack): + self.wrapper_call.writeline( + 'RECORD_FUNCTION("inductor_wrapper_call", c10::ArrayRef());' + ) + + def write_triton_header_once(self): + pass + + def generate_start_graph(self): + pass + + def generate_end_graph(self): + pass + + def generate_inf_and_nan_checker(self, nodes): + for buf in nodes.get_names(): + # TODO: Add buf name directly into check_inf_and_nan. + self.writeline( + f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_check_inf_and_nan({buf}));" + ) + + def codegen_device(self, device): + if config.abi_compatible: + self.used_cached_devices.add(device.type) + return f"cached_torch_device_type_{device.type},{device.index if device.index else 0}" + else: + from .cpp import DEVICE_TO_ATEN + + return ( + f"c10::Device({DEVICE_TO_ATEN[device.type]}, {device.index})" + if device.index is not None + else f"{DEVICE_TO_ATEN[device.type]}" + ) + + def codegen_dtype(self, dtype): + if config.abi_compatible: + dtype_str = str(dtype).split(".")[-1] + self.used_cached_dtypes.add(dtype_str) + return f"cached_torch_dtype_{dtype_str}" + else: + from .cpp import DTYPE_TO_ATEN + + return DTYPE_TO_ATEN[dtype] + + @functools.lru_cache(None) + def codegen_int_array_var( + self, + int_array: str, + writer=None, + known_statically=False, + graph=None, # for per-graph caching + ): + # Because the memory planning is done in two passes (see the implementation + # of self.generate), the writeline behavior is different in the two passes. 
+ # As a result, the emitted int array declarations may appear in a later + # position of the generated code, so the second pass codegen should not + # reuse int array declarations generated in the first pass + if writer is None: + # The first pass codegen uses `self` as the writer + writer = self + + var = f"int_array_{next(self.int_array_id)}" + if var not in self.declared_int_array_vars: + self.declared_int_array_vars.add(var) + if known_statically: + writer.writeline(f"static constexpr int64_t {var}[] = {int_array};") + else: + writer.writeline(f"int64_t {var}[] = {int_array};") + return var + + def make_buffer_allocation(self, buffer): + return self.make_allocation( + buffer.get_name(), + buffer.get_device(), + buffer.get_dtype(), + buffer.get_size(), + buffer.get_stride(), + buffer if self.can_stack_allocate_buffer(buffer) else None, + ) + + def make_allocation( + self, name, device, dtype, shape, stride, buffer_if_can_stack_allocate=None + ): + orig_stride = stride + device_str = self.codegen_device(device) + dtype_code = self.codegen_dtype(dtype) + size = self.codegen_shape_tuple(shape) + stride = self.codegen_shape_tuple(orig_stride) + if config.abi_compatible: + size_array_var = self.codegen_int_array_var( + size, + self.wrapper_call, + known_statically=self.is_statically_known_list_of_ints(shape), + graph=self.get_codegened_graph(), + ) + stride_array_var = self.codegen_int_array_var( + stride, + self.wrapper_call, + known_statically=self.is_statically_known_list_of_ints(orig_stride), + graph=self.get_codegened_graph(), + ) + device_type, device_id = device_str.split(",") + device_idx = "this->device_idx_" if V.graph.aot_mode else device_id + if buffer_if_can_stack_allocate is not None: + from .cpp import DTYPE_TO_CPP + + self.stack_allocated_buffers[name] = buffer_if_can_stack_allocate + cpp_type = DTYPE_TO_CPP[dtype] + numel = buffer_if_can_stack_allocate.get_numel() + # Note: we don't zero storage because empty_strided doesn't zero either. 
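+ # For example, for a hypothetical contiguous float CPU buffer "buf0" with 64
+ # elements, this path emits roughly
+ #   float buf0_storage[64];
+ #   ArrayRefTensor<float> buf0(buf0_storage, int_array_0, int_array_1,
+ #                              cached_torch_device_type_cpu, this->device_idx_);
+ # where int_array_0 / int_array_1 are the size and stride arrays declared by
+ # codegen_int_array_var above.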
+ self.wrapper_call.writeline(f"{cpp_type} {name}_storage[{numel}];") + args = [ + f"{name}_storage", + size_array_var, + stride_array_var, + device_type, + device_idx, + ] + return f"ArrayRefTensor<{cpp_type}> {name}({', '.join(args)});" + + args = [ + str(len(shape)), + size_array_var, + stride_array_var, + dtype_code, + device_type, + device_idx, + f"&{name}_handle", + ] + + self.wrapper_call.writeline(f"AtenTensorHandle {name}_handle;") + self.wrapper_call.writeline( + f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided({', '.join(args)}));" + ) + + return f"RAIIAtenTensorHandle {name}({name}_handle);" + + if V.graph.aot_mode and device_str.startswith("c10::Device("): + tensor_device = f"{device_str.split(',')[0]}, this->device_idx_)" + else: + tensor_device = device_str + + if device.type == "cpu": + return f"at::Tensor {name} = at::detail::empty_strided_cpu({size}, {stride}, {dtype_code});" + if device.type == "cuda": + return ( + f"at::Tensor {name} = at::detail::empty_strided_cuda(" + f"{size}, {stride}, {dtype_code}, c10::DeviceType::CUDA);" + ) + return ( + f"{self.declare}{name} = {self.namespace}empty_strided(" + f"{size}, {stride}, at::TensorOptions({tensor_device}).dtype({dtype_code})){self.ending}" + ) + + def codegen_alloc_from_pool(self, name, offset, dtype, shape, stride) -> str: + if config.abi_compatible: + size = self.codegen_shape_tuple(shape) + stride = self.codegen_shape_tuple(stride) + tmp_name = f"tmp_tensor_handle_{next(self.tmp_tensor_id)}" + args = [ + name, + pexpr(offset), # bytes not numel + self.codegen_dtype(dtype), + str(len(shape)), + self.codegen_int_array_var( + size, self.wrapper_call, graph=self.get_codegened_graph() + ), + self.codegen_int_array_var( + stride, self.wrapper_call, graph=self.get_codegened_graph() + ), + f"&{tmp_name}", + ] + self.wrapper_call.writeline(f"AtenTensorHandle {tmp_name};") + self.wrapper_call.writeline( + f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch__alloc_from_pool({', '.join(args)}));" + ) + return f"RAIIAtenTensorHandle({tmp_name})" + + return "alloc_from_pool({})".format( + ", ".join( + [ + name, + pexpr(offset), # bytes not numel + self.codegen_dtype(dtype), + self.codegen_shape_tuple(shape), + self.codegen_shape_tuple(stride), + ] + ) + ) + + def codegen_reinterpret_view( + self, data, size_list, stride_list, offset, writer + ) -> str: + dim = str(len(size_list)) + size = self.codegen_shape_tuple(size_list) + stride = self.codegen_shape_tuple(stride_list) + offset = self.codegen_sizevar(offset) + + if config.abi_compatible: + tmp_name = f"tmp_tensor_handle_{next(self.tmp_tensor_id)}" + # Because the memory planning is done in two passes (see the implementation + # of self.generate), the writeline behavior is different in the two passes. 
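+ # As in codegen_int_array_var above, the first-pass codegen uses `self` as the
+ # writer when no explicit writer is passed in.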
+ if writer is None: + writer = self + + args = [ + f"{data.get_name()}", + dim, + self.codegen_int_array_var( + size, + writer, + known_statically=self.is_statically_known_list_of_ints(size_list), + graph=self.get_codegened_graph(), + ), + self.codegen_int_array_var( + stride, + writer, + known_statically=self.is_statically_known_list_of_ints(stride_list), + graph=self.get_codegened_graph(), + ), + offset, + ] + + def gen_reinterpret_call(writer, args): + writer.writeline( + f"auto {tmp_name} = reinterpret_tensor_wrapper({', '.join(args)});" + ) + + if ( + self.can_stack_allocate_buffer(data) + and self.is_statically_known_list_of_ints(size_list) + and self.is_statically_known_list_of_ints(stride_list) + and ir.is_contiguous_strides_for_shape(stride_list, size_list) + ): + gen_reinterpret_call(writer, args) + return tmp_name + + gen_reinterpret_call(writer, args) + + # NB, the return handle here represents a temporary tensor, which will be automatically + # released. + # Here's a sample usage in the cpp wrapper code: + # ``` + # aoti_torch_addmm_out( + # buf1, + # arg1_1, + # RAIIAtenTensorHandle(tmp_tensor_handle_0), + # buf0, + # 1L, + # 1L)); + # ``` + # RAIIAtenTensorHandle(tmp_tensor_handle_0) will be released after the call to addmm_out. + # This could be problematic when it's used in a different pattern, for example: + # ```` + # AtenTensorHandle tensor_args[] = {RAIIAtenTensorHandle(tmp_tensor_handle_2), buf5, buf6}; + # aoti_torch_proxy_executor_call_function(..., tensor_args); + # ```` + # RAIIAtenTensorHandle(tmp_tensor_handle_2) will be invalid when it's used in the latter + # kernel call. + # + # This is solved by updating the proxy_executor invocation to + # ``` + # aoti_torch_proxy_executor_call_function(..., + # std::vector{ + # RAIIAtenTensorHandle(tmp_tensor_handle_2), buf5, buf6 + # }.data() + # ); + # ``` + return f"wrap_with_raii_handle_if_needed({tmp_name})" + else: + args = [data.get_name(), size, stride, offset] + return f"reinterpret_tensor({', '.join(args)})" + + def codegen_device_copy(self, src, dst): + if config.abi_compatible: + self.writeline( + f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_tensor_copy_(expensive_copy_to_tensor_if_needed({src}), {dst}));" + ) + else: + self.writeline(f"{dst}.copy_({src});") + + def codegen_multi_output(self, name, value): + # in the abi_compatible mode, outputs are retrieved by passing + # output pointers, so we skip its codegen here. + if not config.abi_compatible: + super().codegen_multi_output(name, value) + + def codegen_subgraph_prefix(self, subgraph, outer_inputs, outer_outputs): + for inner_input, outer_input in zip(subgraph.graph.graph_inputs, outer_inputs): + if config.abi_compatible: + # in ABI-compatible mode, we copy the underlying at::Tensor of the conditional + # input (outer_input) into another at::Tensor to be used as a subgraph input + # (inner_input) in the nested scope. we can't std::move here, as the codegened + # outer input may be an expression / rvalue (e.g., reinterpret_view(x)), so we + # can't necessarily std::move it back to the origin (x). 
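+ # For example, for a hypothetical subgraph input "buf1" fed by outer input
+ # "arg0_1", the lines below emit
+ #   AtenTensorHandle buf1_handle;
+ #   AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_assign_tensors_out(arg0_1, &buf1_handle));
+ #   RAIIAtenTensorHandle buf1(buf1_handle);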
+ self.writeline(f"AtenTensorHandle {inner_input}_handle;") + self.writeline( + f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_assign_tensors_out({outer_input}, &{inner_input}_handle));" + ) + self.writeline( + f"RAIIAtenTensorHandle {inner_input}({inner_input}_handle);" + ) + else: + self.writeline( + f"{self.declare}{inner_input} = {outer_input}{self.ending}" + ) + + def codegen_subgraph_suffix(self, subgraph, outer_inputs, outer_outputs): + for inner_output, outer_output in zip( + subgraph.graph.graph_outputs, outer_outputs + ): + src = inner_output.codegen_reference() + if config.abi_compatible: + # in ABI-compatible mode, we need to std::move subgraph output (inner_output) + # to the conditional output (outer_output), as RAIIAtenTensorHandle's copy + # constructor is deleted. + src = f"std::move({src})" + self.writeline(f"{outer_output} = {src}{self.ending}") + + def codegen_conditional(self, conditional): + name = conditional.get_name() + outer_inputs = [f"{buf.codegen_reference()}" for buf in conditional.operands] + if config.abi_compatible: + outer_outputs = [] + for out in conditional.outputs: + # in ABI-compatible mode, ir.MultiOutput is not codegened, + # hence pre-declare output variables directly and separately + self.writeline(f"RAIIAtenTensorHandle {out.get_name()};") + outer_outputs.append(out.get_name()) + predicate = f"{conditional.predicate.get_name()}_scalar" + self.writeline(f"bool {predicate};") + # in ABI-compatible mode, we need to use the ABI shim function + # to extract a C++ bool from the unrelying scalar bool Tensor + self.writeline( + f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_item_bool({conditional.predicate.codegen_reference()}, &{predicate}));" + ) + else: + # in non-ABI-compatible mode, we can codegen the conditional outputs + # as array of at::Tensor instances, as the ir.MultiOutput is codegened + outer_outputs = [f"{name}[{i}]" for i in range(len(conditional.outputs))] + self.writeline(f"at::Tensor {name}[{len(conditional.outputs)}];") + predicate = f"{conditional.predicate.codegen_reference()}.item()" + + self.writeline(f"if ({predicate}) {{") + self.writeline(EnterSubgraphLine(self, conditional.true_subgraph.graph)) + self.codegen_subgraph(conditional.true_subgraph, outer_inputs, outer_outputs) + self.writeline(ExitSubgraphLine(self)) + self.writeline("} else {") + self.writeline(EnterSubgraphLine(self, conditional.false_subgraph.graph)) + self.codegen_subgraph(conditional.false_subgraph, outer_inputs, outer_outputs) + self.writeline(ExitSubgraphLine(self)) + self.writeline("}") + + def generate_extern_kernel_args_decl_if_needed( + self, op_overload, raw_args, output_args + ): + arg_types = [x.real_type for x in op_overload._schema.arguments] + return_types = [x.type for x in op_overload._schema.returns] + + new_tensor_args = [] + new_int_args = [] + + def fill_args(arg, arg_type): + static_arg_types = ( + torch.FloatType, + torch.BoolType, + torch.StringType, + torch.Type, + torch.DeviceObjType, + ) + inductor_tensor_buffers = ( + ir.Buffer, + ir.ReinterpretView, + ) + + if isinstance(arg_type, torch.TensorType): + assert isinstance(arg, inductor_tensor_buffers), f"got {type(arg)}" + new_tensor_args.append(f"{arg.codegen_reference()}") + elif isinstance(arg_type, torch.IntType): + # int + new_int_args.append(str(arg)) + elif isinstance(arg_type, torch.SymIntType): + # SymInt + expr = arg.node.expr if isinstance(arg, torch.SymInt) else arg + new_int_args.append(self.expr_printer(expr)) + elif isinstance(arg_type, torch.NumberType): + # Scalar of type int + 
assert isinstance(arg, (int, float, bool)) + # Only treat int Scalar as dynamic + if isinstance(arg, int): + new_int_args.append(str(arg)) + elif isinstance(arg_type, torch.ListType): + assert isinstance(arg, (list, tuple)) + + # List[Tensor] + if isinstance(arg_type.getElementType(), torch.TensorType): + new_tensor_args.extend([f"{a.codegen_reference()}" for a in arg]) + # List[Optional[Tensor]] + elif isinstance( + arg_type.getElementType(), torch.OptionalType + ) and isinstance( + arg_type.getElementType().getElementType(), torch.TensorType + ): + new_tensor_args.extend( + [f"{a.codegen_reference()}" for a in arg if a is not None] + ) + # List[int] + elif isinstance(arg_type.getElementType(), torch.IntType): + new_int_args.extend([str(a) for a in arg]) + # List[SymInt] + elif isinstance(arg_type.getElementType(), torch.SymIntType): + expressions = [ + a.node.expr if isinstance(a, torch.SymInt) else a for a in arg + ] + new_int_args.extend( + [self.expr_printer(expr) for expr in expressions] + ) + # List[Scalar] + elif isinstance(arg_type.getElementType(), torch.NumberType): + # Only treat int Scalar as dynamic + is_int_type = [isinstance(a, int) for a in arg] + if any(is_int_type): + assert all( + is_int_type + ), "AOTInductor only supports int scalars of the same type" + new_int_args.extend([str(a) for a in arg]) + else: + assert isinstance( + arg_type.getElementType(), static_arg_types # type: ignore[arg-type] + ), f"Fall through arguments must be one of static_arg_types, got {type(arg_type)}" + else: + assert isinstance( + arg_type, static_arg_types # type: ignore[arg-type] + ), f"Fall through arguments must be one of static_arg_types, got {type(arg_type)}" + + for arg, arg_type in zip(raw_args, arg_types): + if arg is not None: + if isinstance(arg_type, torch.OptionalType): + fill_args(arg, arg_type.getElementType()) + else: + fill_args(arg, arg_type) + + def fill_output_arg(arg, return_type): + if isinstance(return_type, torch.TensorType): + self.writeline(f"AtenTensorHandle {arg}_handle; // output buffer") + self.writeline( + f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_new_uninitialized_tensor(&{arg}_handle));" + ) + self.writeline(f"RAIIAtenTensorHandle {arg}({arg}_handle);") + new_tensor_args.append(f"{arg}") + elif isinstance(return_type, torch.SymIntType): + raise NotImplementedError("NYI support for return type: SymInt") + elif isinstance(return_type, torch.ListType) and isinstance( + return_type.getElementType(), torch.SymIntType + ): + raise NotImplementedError("NYI support for return type: List[SymInt]") + else: + raise AssertionError(f"Unsupported return type found: {return_type}") + + # TODO: Only support tensor(s) returns for now, SymInt is not implemented yet + for return_type in return_types: + if isinstance(return_type, (torch.TensorType)): + pass + elif isinstance(return_type, torch.OptionalType): + assert isinstance(return_type.getElementType(), torch.TensorType) + elif isinstance(return_type, torch.ListType): + assert isinstance(return_type.getElementType(), torch.TensorType) + else: + raise NotImplementedError( + f"return type {return_type} is not yet supported." 
+ ) + + for output_arg in output_args: + assert output_arg is not None, "Optional return types are not yet supported" + if isinstance(output_arg, (list, tuple)): + for out in output_arg: + fill_output_arg(out, torch.TensorType.get()) + else: + fill_output_arg(output_arg, torch.TensorType.get()) + + return new_tensor_args, new_int_args + + def generate_extern_kernel_alloc_and_find_schema_if_needed( + self, + name, + kernel, + codegen_args, + cpp_op_schema, + cpp_kernel_key, + cpp_kernel_overload_name="", + op_overload=None, + raw_args=None, + outputs=None, + ): + if config.is_fbcode(): + assert op_overload is not None + assert raw_args is not None + assert outputs is not None + + return self.generate_extern_kernel_alloc_and_find_schema_if_needed_fbcode( + name, + cpp_kernel_key, + op_overload, + raw_args, + outputs, + ) + else: + return self.generate_extern_kernel_alloc_and_find_schema_if_needed_oss( + name, + kernel, + codegen_args, + cpp_op_schema, + cpp_kernel_key, + cpp_kernel_overload_name, + ) + + def generate_extern_kernel_alloc_and_find_schema_if_needed_oss( + self, + name, + kernel, + codegen_args, + cpp_op_schema, + cpp_kernel_key, + cpp_kernel_overload_name="", + ): + if cpp_kernel_key not in self.extern_call_ops: + self.writeline( + f"static auto op_{cpp_kernel_key} = c10::Dispatcher::singleton()" + ) + self.writeline( + f'\t.findSchemaOrThrow("{kernel}", "{cpp_kernel_overload_name}")' + ) + self.writeline(f"\t.typed<{cpp_op_schema}>();") + self.extern_call_ops.add(cpp_kernel_key) + + self.writeline( + f"auto {name} = op_{cpp_kernel_key}.call({', '.join(codegen_args)});" + ) + + def generate_extern_kernel_alloc_and_find_schema_if_needed_fbcode( + self, + name, + cpp_kernel_key, + op_overload, + raw_args, # contains both args and flatten kwargs + outputs, + ): + def extract_output_name(out): + assert out is not None, "None, i.e. 
optional output is not supported" + if isinstance(out, ir.MultiOutput): + return out.get_name() + elif isinstance(out, (list, tuple)): + return type(out)(extract_output_name(o) for o in out) + else: + raise AssertionError(f"Unexpected output: {type(out)}") + + # output_args has the same pytree structure as outputs + output_args = extract_output_name(outputs) + if isinstance(output_args, str): + output_args = [output_args] + + ( + tensor_call_args, + int_call_args, + ) = self.generate_extern_kernel_args_decl_if_needed( + op_overload, raw_args, output_args + ) + + tensor_call_args_str = ", ".join(tensor_call_args) + int_call_args_str = ", ".join(int_call_args) + + extern_kernel_node_index = len(V.graph.extern_kernel_nodes) - 1 + + self.writeline( + f"aoti_torch_proxy_executor_call_function(proxy_executor, " + f"{extern_kernel_node_index}, " + f"{len(int_call_args)}, " + f"std::vector{{{int_call_args_str}}}.data(), " + f"{len(tensor_call_args)}, " + f"std::vector{{{tensor_call_args_str}}}.data());" + ) + + self.extern_call_ops.add(cpp_kernel_key) + + def generate_reset_kernel_saved_flags(self): + pass + + def generate_save_uncompiled_kernels(self): + pass + + def val_to_cpp_arg_str(self, type_, val, is_legacy_abi) -> str: + if ( + config.abi_compatible + and not is_legacy_abi + and isinstance(type_, torch.OptionalType) + ): + if val is None: + return "0" # nullptr is not available in C + if not isinstance(type_.getElementType(), torch.TensorType): + var_name = f"var_{next(self.arg_var_id)}" + self.writeline(f"auto {var_name} = {self.val_to_arg_str(val)};") + return f"&{var_name}" + elif config.c_shim_version == "2": + # Similar to other data type, use pointer to denote optional tensor arg in v2 C shim + base_handle = self.val_to_arg_str(val) + if "wrap_with_raii_handle_if_needed" in base_handle: + # wrap_with_raii_handle_if_needed creates a temp RAIIAtenTensorHandle, so we need to + # explicitly store it. Otherwise, it will be destroyed before the fallback kernel call. + tmp_var_name = f"var_{next(self.arg_var_id)}" + self.writeline( + f"RAIIAtenTensorHandle {tmp_var_name} = {base_handle};" + ) + base_handle = tmp_var_name + var_name = f"var_{next(self.arg_var_id)}" + self.writeline(f"AtenTensorHandle {var_name} = {base_handle}.get();") + return f"&{var_name}" + + return self.val_to_arg_str(val) + + def val_to_arg_str(self, val) -> str: + if val is None: + # When None is passed as an argument, it represents an optional that does not contain a value. + if config.abi_compatible: + return "0" # nullptr is not available in C + return "c10::nullopt" + elif isinstance(val, bool): + if config.abi_compatible: + return "1" if val else "0" + else: + return "true" if val else "false" + elif isinstance(val, int): + # uint64_t is long on Linux, but long long on MacOS + return f"{val}LL" if sys.platform == "darwin" else f"{val}L" + elif isinstance(val, str): + return f'"{val}"' + elif isinstance( + val, (ir.Buffer, ir.ReinterpretView, ir.StorageBox, ir.TensorBox) + ): + return val.codegen_reference() + elif isinstance(val, torch.device): + return self.codegen_device(val) + elif isinstance(val, torch.dtype): + return self.codegen_dtype(val) + elif isinstance(val, float) and val in [float("inf"), float("-inf")]: + if val == float("inf"): + return "std::numeric_limits::infinity()" + else: + return "-std::numeric_limits::infinity()" + elif isinstance(val, (list, tuple)): + # FIXME handle embedded optional types? 
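+            # Illustrative example of the two branches below: in non-ABI mode a Python
+            # list is rendered as a C++ brace-initializer, e.g.
+            #     val_to_arg_str([2, 3])  ->  "{2L, 3L}"   (on Linux)
+            # while in ABI-compatible mode the literal is lowered to an int-array
+            # variable plus an explicit length, because std::vector cannot be used there.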
+ result = f"{{{', '.join(self.val_to_arg_str(x) for x in val)}}}" + if config.abi_compatible: + static = self.is_statically_known_list_of_ints(val) + # Need to pass the array length because we can't use std::vector + int_var_array = self.codegen_int_array_var( + result, + known_statically=static, + graph=self.get_codegened_graph(), + ) + return f"{int_var_array}, {len(val)}" + else: + return result + else: + return repr(val) diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/cpp_wrapper_cuda.py b/MLPY/Lib/site-packages/torch/_inductor/codegen/cpp_wrapper_cuda.py new file mode 100644 index 0000000000000000000000000000000000000000..36bff25c66c371bc8c64a1ca785bcee7c573b4ee --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/codegen/cpp_wrapper_cuda.py @@ -0,0 +1,328 @@ +import functools +import os +from itertools import chain, count +from typing import Any, List, Optional, TYPE_CHECKING + +import sympy + +from torch._inductor.codecache import get_cpp_wrapper_cubin_path_name + +from .. import config +from ..codecache import CudaKernelParamCache +from ..triton_heuristics import grid as default_grid +from ..virtualized import V +from .cpp_wrapper_cpu import CppWrapperCpu +from .wrapper import SymbolicCallArg + +if TYPE_CHECKING: + from ..graph import GraphLowering + + +def is_int(s: str) -> bool: + # Cpp code gen adds L at the end of ints + # Lets remove it for checking whether we have an int or not + if s and s[-1] == "L": + s = s[:-1] + try: + int(s) + except ValueError: + return False + except TypeError: + return False + return True + + +def is_float(s: str) -> bool: + try: + float(s) + except ValueError: + return False + return True + + +class CppWrapperCuda(CppWrapperCpu): + """ + Generates cpp wrapper for running on GPU and calls CUDA kernels + """ + + def __init__(self): + self.device = "cuda" + super().__init__() + self.grid_id = count() + self.cuda = True + + def write_header(self): + if V.graph.is_const_graph: + # We do not write header for constant graph, it will be written by main module. 
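+            # (For non-constant graphs, the remainder of this method emits additional
+            # includes plus the CUDA_DRIVER_CHECK macro and the Grid / loadKernel /
+            # launchKernel helpers used by the generated wrapper.)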
+ return + + super().write_header() + + self.header.splice("#include ") + if config.abi_compatible: + self.header.splice( + "#include " + ) + else: + self.header.splice( + """ + #include + #include + #include + """ + ) + + self.header.splice( + """ + #define CUDA_DRIVER_CHECK(EXPR) \\ + do { \\ + CUresult code = EXPR; \\ + const char *msg; \\ + cuGetErrorString(code, &msg); \\ + if (code != CUDA_SUCCESS) { \\ + throw std::runtime_error( \\ + std::string("CUDA driver error: ") + \\ + std::string(msg)); \\ + } \\ + } while (0); + + namespace { + + struct Grid { + Grid(uint32_t x, uint32_t y, uint32_t z) + : grid_x(x), grid_y(y), grid_z(z) {} + uint32_t grid_x; + uint32_t grid_y; + uint32_t grid_z; + + bool is_non_zero() { + return grid_x > 0 && grid_y > 0 && grid_z > 0; + } + }; + + } // anonymous namespace + + static inline CUfunction loadKernel( + std::string filePath, + const std::string &funcName, + uint32_t sharedMemBytes, + const std::optional &cubinDir = std::nullopt) { + if (cubinDir) { + std::filesystem::path p1{*cubinDir}; + std::filesystem::path p2{filePath}; + filePath = (p1 / p2.filename()).string(); + } + + CUmodule mod; + CUfunction func; + CUDA_DRIVER_CHECK(cuModuleLoad(&mod, filePath.c_str())); + CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str())); + if (sharedMemBytes > 0) { + CUDA_DRIVER_CHECK(cuFuncSetAttribute( + func, + CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, + sharedMemBytes + )) + } + return func; + } + + static inline void launchKernel( + CUfunction func, + uint32_t gridX, + uint32_t gridY, + uint32_t gridZ, + uint32_t numWarps, + uint32_t sharedMemBytes, + void* args[], + cudaStream_t stream) { + CUDA_DRIVER_CHECK(cuLaunchKernel( + func, gridX, gridY, gridZ, 32*numWarps, 1, 1, sharedMemBytes, stream, args, nullptr + )); + } + """ + ) + + def write_get_raw_stream(self, index, graph=None): + name = f"stream{index}" + self.writeline(f"cudaStream_t {name};") + self.writeline( + f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_current_cuda_stream({index}, (void**)&{name}));" + ) + return name + + def define_kernel( + self, name: str, kernel: str, metadata: Optional[str] = None, cuda=True + ): + if not cuda: + return super().define_kernel(name, kernel, metadata, cuda) + + def generate(self, is_inference): + self.prefix.writeline("\n") + if not V.graph.aot_mode: + for kernel in chain( + self.src_to_kernel.values(), + [entry[0] for entry in self.user_defined_kernel_cache.values()], + ): + self.prefix.writeline(f"static CUfunction {kernel} = nullptr;") + self.prefix.writeline("\n") + return super().generate(is_inference) + + @functools.lru_cache(None) + def generate_load_kernel_once( + self, + name: str, + mangled_name: str, + cubin_path: str, + shared_mem: int, + graph: "GraphLowering", # for per-graph caching + ): + if V.graph.aot_mode: + self.writeline(f"if (kernels.{name} == nullptr) {{") + self.writeline( + f""" kernels.{name} = loadKernel("{cubin_path}", "{mangled_name}", {shared_mem}, this->cubin_dir_);""" + ) + self.writeline("}") + else: + self.writeline(f"if ({name} == nullptr) {{") + self.writeline( + f""" {name} = loadKernel("{cubin_path}", "{mangled_name}", {shared_mem});""" + ) + self.writeline("}") + + def generate_args_decl(self, call_args): + dynamic_symbols = V.graph.sizevars.free_symbols() + # TODO: only works for constant now, need type info + new_args = [] + for arg in call_args: + var_name = f"var_{next(self.arg_var_id)}" + if isinstance(arg, (sympy.Integer, sympy.Symbol, SymbolicCallArg)): + self.writeline(f"auto {var_name} = 
{arg};") + elif isinstance(arg, sympy.Expr): + self.writeline(f"auto {var_name} = {self.expr_printer(arg)};") + elif is_int(arg): + self.writeline(f"int {var_name} = {arg};") + elif is_float(arg): + self.writeline(f"float {var_name} = {arg};") + elif any(str(arg) == s.name for s in dynamic_symbols): + self.writeline(f"auto {var_name} = {arg};") + elif arg == "nullptr": + self.writeline(f"auto {var_name} = nullptr;") + elif arg == "c10::nullopt": + self.writeline(f"auto {var_name} = c10::nullopt;") + else: + if config.abi_compatible: + self.writeline(f"CUdeviceptr {var_name};") + self.writeline( + f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_data_ptr({arg}, reinterpret_cast(&{var_name})));" + ) + else: + self.writeline( + f"CUdeviceptr {var_name} = reinterpret_cast({arg}.data_ptr());" + ) + new_args.append(f"&{var_name}") + + return ", ".join(new_args) + + def generate_default_grid(self, name: str, grid: List[Any], cuda: bool = True): + """ + Generate grid configs for launching a CUDA kernel using the grid + function from triton_heuristics. + """ + if not cuda: + return grid + assert isinstance(grid, list), f"expected {grid=} to be a list" + grid = [e.inner_expr if isinstance(e, SymbolicCallArg) else e for e in grid] + grid_fn = default_grid(*grid) + params = CudaKernelParamCache.get(name) + assert ( + params is not None + ), f"cuda kernel parameters for {name} should already exist at this moment, only found {CudaKernelParamCache.get_keys()}" + block_cfg = { + "XBLOCK": params["x_block"], + "YBLOCK": params["y_block"], + "ZBLOCK": params["z_block"], + } + return grid_fn(block_cfg) + + def generate_kernel_call( + self, + name, + call_args, + grid=None, + device_index=None, + cuda=True, + triton=True, + arg_types=None, + grid_fn: str = "grid", + triton_meta=None, + ): + if not cuda: + # Even in CppWrapperCuda, we may see cpp kernels + return super().generate_kernel_call( + name, call_args, grid, device_index, cuda, triton, arg_types + ) + + params = CudaKernelParamCache.get(name) + assert ( + params is not None + ), f"cuda kernel parameters for {name} should already exist at this moment" + mangled_name = params.get("mangled_name", None) + assert mangled_name is not None, "missing mangled_name" + cubin_path = params.get(get_cpp_wrapper_cubin_path_name(), None) + assert cubin_path is not None and os.path.exists( + cubin_path + ), f"cubin file should already exist at this moment: {cubin_path}" + shared_mem = params.get("shared_mem", 0) + + self.generate_load_kernel_once( + name, mangled_name, cubin_path, shared_mem, V.graph + ) + + # args with value 1 are added into equal_to_1 and constants + # in triton_meta (in the Python codegen) which makes them + # inlined in the PTX and compiled CUBIN + if ( + triton_meta is not None + and "configs" in triton_meta + and triton_meta["configs"] + ): + equal_to_1 = triton_meta["configs"][0].equal_to_1 + call_args = [arg for i, arg in enumerate(call_args) if i not in equal_to_1] + + call_args = self.generate_args_decl(call_args) + kernel_args_var = f"kernel_args_var_{next(self.kernel_callsite_id)}" + self.writeline(f"void* {kernel_args_var}[] = {{{call_args}}};") + stream = ( + "stream" + if V.graph.aot_mode + else self.write_get_raw_stream(device_index, V.graph) + ) + grid_name = f"{name}_grid_{next(self.grid_id)}" + assert isinstance( + grid, (list, tuple) + ), f"expected grid to be a list or tuple but got: {grid=}" + + grid = [V.graph.sizevars.simplify(item) for item in grid] + grid_uses_symbolic_shapes = any(item.free_symbols for item in grid) + grid_args 
= [self.grid_expr_printer(item) for item in grid] + grid_args_str = ", ".join(grid_args) + self.writeline(f"Grid {grid_name} = Grid({grid_args_str});") + + if grid_uses_symbolic_shapes: + self.writeline(f"if ({grid_name}.is_non_zero()) {{") + kernel_var_name = f"kernels.{name}" if V.graph.aot_mode else name + self.writeline( + "launchKernel({}, {}, {}, {}, {}, {}, {}, {});".format( + kernel_var_name, + f"{grid_name}.grid_x", + f"{grid_name}.grid_y", + f"{grid_name}.grid_z", + params["num_warps"], + params["shared_mem"], + kernel_args_var, + stream, + ) + ) + if grid_uses_symbolic_shapes: + self.writeline("}") diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/__init__.py b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eb5020127ee427fac425d3ff076ebb2103061f1e Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/__pycache__/cuda_cpp_scheduling.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/__pycache__/cuda_cpp_scheduling.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c349ec0e541d7a736f3605d410525d864bcba4c9 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/__pycache__/cuda_cpp_scheduling.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/__pycache__/cuda_env.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/__pycache__/cuda_env.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df49d81bcc89546ef4edb54efd7be42cd9b16221 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/__pycache__/cuda_env.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/__pycache__/cuda_kernel.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/__pycache__/cuda_kernel.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..430ad5ffcaf314bd398beaf363d83bbe4a62dc17 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/__pycache__/cuda_kernel.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/__pycache__/cuda_template.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/__pycache__/cuda_template.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0ed9580d1a13b2dfd235582418bb21a84a6baf51 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/__pycache__/cuda_template.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/__pycache__/cutlass_epilogue_gen.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/__pycache__/cutlass_epilogue_gen.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f29bdf2dcc3cd64e186a377dca599a9eb6298ae8 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/__pycache__/cutlass_epilogue_gen.cpython-39.pyc differ diff --git 
a/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/__pycache__/cutlass_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/__pycache__/cutlass_utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d848ca1f89b3ce1350b08b2fcc86bff3d93d0f60 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/__pycache__/cutlass_utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/__pycache__/device_op_overrides.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/__pycache__/device_op_overrides.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..41251202dae9c99699ec7411515fd369fd560239 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/__pycache__/device_op_overrides.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/__pycache__/gemm_template.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/__pycache__/gemm_template.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..896c08d4bf424aeceba88c2e6be8f615e784ea2c Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/__pycache__/gemm_template.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/cuda_cpp_scheduling.py b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/cuda_cpp_scheduling.py new file mode 100644 index 0000000000000000000000000000000000000000..3417527a59d4a32ed1b1eeb0b8de3e2111319e5b --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/cuda_cpp_scheduling.py @@ -0,0 +1,212 @@ +import logging +from typing import cast, List + +from ...._dynamo.utils import counters + +from ... import config, ir +from ...codecache import code_hash, get_path +from ...ir import ComputedBuffer, CUDATemplateBuffer, Pointwise +from ...scheduler import ( + BaseSchedulerNode, + BaseScheduling, + FusedSchedulerNode, + Scheduler, + SchedulerNode, +) +from ...utils import get_fused_kernel_name, get_kernel_metadata, sympy_product +from ...virtualized import V +from ..common import IndentedBuffer + +from .cutlass_epilogue_gen import CUTLASSEVTOpNotImplementedError + +log = logging.getLogger(__name__) + + +class CUDACPPScheduling(BaseScheduling): + """ + Partial Scheduling implementation for CUDA C++ Kernels. + This class is intended to be used in combination with TritonScheduling, + and delegated to by CUDACombinedScheduling. + + It handles fusion decisions and CUDA C++ specific template code generation. + """ + + def __init__(self, scheduler: Scheduler): + super().__init__() + self.scheduler = scheduler + + def group_fn(self, sizes): + return tuple(V.graph.sizevars.simplify(sympy_product(s)) for s in sizes) + + def is_cuda_cpp_template(self, node: BaseSchedulerNode) -> bool: + return isinstance(node, SchedulerNode) and isinstance( + node.node, CUDATemplateBuffer + ) + + def is_cuda_cpp_fused_template(self, node: BaseSchedulerNode) -> bool: + return isinstance(node, FusedSchedulerNode) and self.is_cuda_cpp_template( + node.get_template_node() + ) + + def _can_fuse_epilogue_impl( + self, + cuda_template_buffer: CUDATemplateBuffer, + epilogue_nodes: List[ir.IRNode], + additional_node: ir.IRNode, + ) -> bool: + """ + Check if the given node can be fused with the epilogue. At the moment, Kernels + support fusion with Pointwise operations, wrapped in (named) ComputedBuffer nodes. 
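+        In addition, the candidate node must read the template output (or, if epilogue
+        nodes were already fused, the most recently fused one), must share the template
+        buffer's layout, and must be expressible as a CUTLASS EVT expression; these are
+        exactly the checks performed below.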
+ + Args: + cuda_template_buffer : A CUDATemplateBuffer object representing the CUDA template and it's result buffer + epilogue_nodes : List[ir.Buffer]: The list of already fused epilogue nodes. + additional_node: The ir.Buffer node to be checked if it can be fused with the epilogue. + Returns: + - bool: True if the given node can be fused with the epilogue, False otherwise. + + """ + if not isinstance(cuda_template_buffer, CUDATemplateBuffer): + return False + if not cuda_template_buffer.template.can_fuse_epilogue: + # The used GEMM op does not support fusing epilogues + return False + if not isinstance(additional_node, ComputedBuffer): + return False + if not isinstance(additional_node.data, Pointwise): + return False + # We can fuse a Pointwise op that depends on the last fused epilogue node + # if any. If there is no epilogue node yet, it needs to depend on the template + # node + node_name = additional_node.get_computed_buffer_name() + if node_name is None: + return False + + if len(epilogue_nodes) == 0: + if cuda_template_buffer.name not in additional_node.get_read_names(): + return False + else: + last_epilogue_node = epilogue_nodes[-1] + assert isinstance(last_epilogue_node, ir.ComputedBuffer) # for mypy + last_epilogue_name = ( + last_epilogue_node.name + if last_epilogue_node.name is not None + else last_epilogue_node.data.name # type: ignore[attr-defined] + ) + if last_epilogue_name not in additional_node.get_read_names(): + return False + if additional_node.layout != cuda_template_buffer.layout: + return False + try: + from torch._inductor.codegen.cuda.cutlass_epilogue_gen import ( + CutlassEVTEpilogueArgumentFormatter, + CutlassEVTEpilogueTypeFormatter, + ) + + CutlassEVTEpilogueTypeFormatter.ir_to_evt_string( + cast(str, cuda_template_buffer.name), "anything", [additional_node] + ) + CutlassEVTEpilogueArgumentFormatter.ir_to_evt_argument_string( + cast(str, cuda_template_buffer.name), [additional_node] + ) + except CUTLASSEVTOpNotImplementedError as e: + not_implemented_op = str(e) + if not_implemented_op.startswith("_op_"): + not_implemented_op = not_implemented_op[4:] + log.warning( + f"Cannot fuse epilogue node {additional_node} into {cuda_template_buffer.name}, likely due to unsupported operation: {not_implemented_op}" # noqa: G004, B950 + ) + return False + else: + # Likely due to unsupported dtype. + log.warning( + f"Cannot fuse epilogue node {additional_node} into {cuda_template_buffer.name}. 
Reason: {not_implemented_op}" # noqa: G004, B950 + ) + return False + return True + + @staticmethod + def _unwrap_epilogue_nodes(fused_node: FusedSchedulerNode) -> List[ir.IRNode]: + nodes = fused_node.get_nodes() + template_node = fused_node.get_template_node() + nodes.remove(template_node) + return [n.node for n in nodes] + + def can_fuse_vertical( + self, node1: BaseSchedulerNode, node2: BaseSchedulerNode + ) -> bool: + if self.is_cuda_cpp_template(node1) and isinstance(node2, SchedulerNode): + return self._can_fuse_epilogue_impl( + cast(CUDATemplateBuffer, node1.node), [], node2.node + ) + elif self.is_cuda_cpp_fused_template(node1) and isinstance( + node2, SchedulerNode + ): + fnode1 = cast(FusedSchedulerNode, node1) + return self._can_fuse_epilogue_impl( + fnode1.get_template_node().node, + self._unwrap_epilogue_nodes(fnode1), + node2.node, + ) + return False + + def define_kernel(self, src_code: str, node_schedule) -> str: + wrapper = V.graph.wrapper_code + if src_code in wrapper.src_to_kernel: + kernel_name = wrapper.src_to_kernel[src_code] + else: + fused_name = ( + get_fused_kernel_name(node_schedule, config.triton.descriptive_names) + if config.triton.descriptive_names + else "" + ) + kernel_name = "_".join(["cuda", fused_name, wrapper.next_kernel_suffix()]) + # use the original src_code as the key + wrapper.src_to_kernel[src_code] = kernel_name + src_code = src_code.replace("KERNEL_NAME", kernel_name) + + _, _, kernel_path = get_path(code_hash(src_code), "py") + + compile_wrapper = IndentedBuffer() + compile_wrapper.writeline("async_compile.cuda(r'''") + compile_wrapper.splice(src_code, strip=True) + compile_wrapper.writeline("''', 'so')") + + metadata_comment = f"# kernel path: {kernel_path}" + origins, detailed_origins = get_kernel_metadata(node_schedule, wrapper) + metadata_comment += "\n" + origins + "\n" + detailed_origins + wrapper.define_kernel( + kernel_name, compile_wrapper.getvalue(), metadata_comment + ) + return kernel_name + + def codegen_template( + self, template_node: BaseSchedulerNode, epilogue_nodes: List[SchedulerNode] + ): + """ + Codegen a CUDA template, possibly with fused epilogues + """ + counters["inductor"]["cuda_epilogue_fusion_counter"] += len(epilogue_nodes) + assert self.is_cuda_cpp_template( + template_node + ), "Template node passed to CUDAScheduler.codegen_template must be a SchedulerNode that wraps a CUDATemplateBuffer" + template_node = cast(SchedulerNode, template_node) + _, (numel, rnumel) = template_node.group + assert rnumel == 1 + ctb: CUDATemplateBuffer = cast(CUDATemplateBuffer, template_node.node) + epilogue_ir_nodes: List[ir.Buffer] = [n.node for n in epilogue_nodes] + assert all( + isinstance(n, ir.ComputedBuffer) for n in epilogue_ir_nodes + ), "Epilogue nodes must all be instances of ir.ComputedBuffer" + kernel, render = ctb.make_kernel_render(ctb, epilogue_nodes=epilogue_ir_nodes) + with kernel: + for node in [template_node, *epilogue_nodes]: + node.mark_run() + src_code = render() + + with V.set_kernel_handler(kernel): + node_schedule = [template_node, *epilogue_nodes] + kernel_name = self.define_kernel(src_code, node_schedule) + kernel.call_kernel(kernel_name, ctb, epilogue_ir_nodes) + V.graph.removed_buffers |= kernel.removed_buffers + self.scheduler.free_buffers() diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/cuda_env.py b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/cuda_env.py new file mode 100644 index 0000000000000000000000000000000000000000..581291f3e8e34105ed80b7b7865ffbaa9e962a6a --- /dev/null 
+++ b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/cuda_env.py @@ -0,0 +1,45 @@ +import functools +import logging +from typing import Optional + +import torch + +from ... import config + +log = logging.getLogger(__name__) + + +def get_cuda_arch() -> Optional[str]: + try: + cuda_arch = config.cuda.arch + if cuda_arch is None: + # Get Compute Capability of the first Visible device + major, minor = torch.cuda.get_device_capability(0) + return str(major * 10 + minor) + return str(cuda_arch) + except Exception as e: + log.error("Error getting cuda arch: %s", e) + return None + + +def get_cuda_version() -> Optional[str]: + try: + cuda_version = config.cuda.version + if cuda_version is None: + cuda_version = torch.version.cuda + return cuda_version + except Exception as e: + log.error("Error getting cuda version: %s", e) + return None + + +@functools.lru_cache(None) +def nvcc_exist(nvcc_path: str = "nvcc") -> bool: + if nvcc_path is None: + return False + import subprocess + + res = subprocess.call( + ["which", nvcc_path], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL + ) + return res == 0 diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/cuda_kernel.py b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/cuda_kernel.py new file mode 100644 index 0000000000000000000000000000000000000000..330b5c279a1c390eb8e6f96b6de61c3c64449b06 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/cuda_kernel.py @@ -0,0 +1,374 @@ +import logging +from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING, Union + +from ... import ir +from ...autotune_process import CUDABenchmarkRequest +from ...ir import Buffer, CUDATemplateBuffer, IRNode, Layout, TensorBox +from ...select_algorithm import ChoiceCaller +from ...utils import sympy_product +from ...virtualized import V + +from ..common import IndentedBuffer, Kernel, OpOverrides, PrimitiveInfoType +from ..cpp import CppPrinter, DTYPE_TO_CPP + +if TYPE_CHECKING: + from torch._inductor.codegen.cuda.cuda_template import CUDATemplate + +log = logging.getLogger(__name__) + +cexpr = CppPrinter().doprint + + +def _normalize_idx(index: int, total_length: int) -> int: + return index if index >= 0 else index + total_length + + +class CUDAKernel(Kernel): + """ + Baseclass for CUDA / Cutlass based Kernels + """ + + overrides = OpOverrides # type: ignore[assignment] + + +class CUDATemplateKernel(CUDAKernel): + """ + Template kernels defined by CUDA / Cutlass in C++. + """ + + _EXTRA_CPP_ARGS = "size_t* workspace_size, uint8_t* workspace, cudaStream_t stream" + + def __init__(self, kernel_name): + """ + Initializes a new instance of the CUDATemplateKernel class. + + Args: + kernel_name (str): The name of the kernel. + """ + super().__init__() + self.kernel_name = kernel_name + # Mapping from arg name to IRNode. + self.named_nodes: Dict[str, IRNode] = {} + + def arg_name(self, node: IRNode) -> Optional[str]: + """ + Returns arg name of a given input or output node. + """ + if node is None: + return None + return {**self.args.input_buffers, **self.args.output_buffers}.get( + node.get_name(), None + ) + + def check_not_null(self, node: IRNode) -> str: + """ + Generates code to check that a node is not null. 
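+        The emitted C++ only throws (std::runtime_error) when the argument pointer is
+        null while its computed element count is greater than zero, so a null pointer is
+        tolerated for zero-sized inputs.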
+ """ + + if node is None: + return "" + + size_str = self.size(node, 0, -1) + name_str = self.arg_name(node) + if name_str is None: + return "" + + res = IndentedBuffer(initial_indent=2) + res.tabwidth = 1 + res.splice( + f""" + {{ + if (!{name_str}) {{ + int64_t {name_str}_size = {size_str}; + if ({name_str}_size > 0) {{ + throw std::runtime_error("input {name_str} is null but size is not 0!"); + }} + }} + }} + """ + ) + return res.getvalue() + + def def_kernel( + self, + inputs: List[IRNode], + outputs: List[IRNode], + names_str: str = "", + input_reorder: Optional[List[int]] = None, + ) -> str: + """ + Hook called from template code to generate function definition and + needed args. + + Args: + inputs: List of input IRNodes + outputs: List of output IRNodes + names_str: Comma separated list of input + output argument names. + input_reorder: The actual order of input nodes. + e.g. The template might have input argument defined as [X, W, Bias], + and the actual input passed into this template could be [Bias, X, W]. + In this case, the `input_reorder` would be [2, 0, 1]. + """ + + names = [x.strip() for x in names_str.strip().split(",")] + if len(inputs) + len(outputs) != len(names): + raise RuntimeError( + f"{len(inputs) + len(outputs)=} != {len(names)=}, {inputs=}, {outputs=}, {names=}" + ) + + if input_reorder is not None: + assert len(inputs) == len(input_reorder) + else: + input_reorder = list(range(len(inputs))) + + for idx in input_reorder: + name = names[idx] + node = inputs[idx] + if node is not None: + self.named_nodes[name] = node + self.args.input_buffers[node.get_name()] = name + + for name, node in zip(names[len(inputs) : len(inputs) + len(outputs)], outputs): + if node is not None: + self.named_nodes[name] = node + self.args.output_buffers[node.get_name()] = name + + arg_defs, *_ = self.args.cpp_argdefs() + return f"PT_EXPORT int {self.kernel_name}({', '.join(arg_defs)}, {self._EXTRA_CPP_ARGS})" + + def call_kernel( + self, name: str, node: "CUDATemplateBuffer", epilogue_nodes: List[ir.Buffer] # type: ignore[name-defined] + ) -> None: + """ + Generates code to call the kernel through V.graph.wrapper_code. + used from within torch._inductor.wrapper.WrapperCodeGen + + name: Name of kernel function. + node: The CUDATemplateBuffer node which contains information about the kernel, it's fused epilogue nodes + as well as all required inputs and outputs. + """ + wrapper = V.graph.wrapper_code + _, call_args, _ = self.args.python_argdefs() + # dynamo wraps unspec variable as 0d CPU tensor, need convert to scalar + for i in range(len(call_args)): + if V.graph.is_unspec_arg(call_args[i]): + call_args[i] = call_args[i] + ".item()" + else: + call_args[i] = f"c_void_p({call_args[i]}.data_ptr())" + + # workspace_size ptr is NULL to mark this call is not intended for retrieving workspace_size. + # workspace_size should have already been retrieved prior to this call. + call_args.append("None") + + if node.get_workspace_size() > 0: + call_args.append(f"c_void_p({node.get_name()}_workspace.data_ptr())") + else: + call_args.append("None") + + wrapper.generate_kernel_call( + name, + call_args, + device_index=V.graph.scheduler.current_device.index, + cuda=True, + triton=False, + ) + + def dtype(self, node: IRNode) -> Optional[str]: + """ + Generates code which represents dtype of a given node. + """ + + if node is None: + return "void" + return DTYPE_TO_CPP.get(node.get_layout().dtype) + + def offset(self, node: IRNode) -> str: + """ + Generates code which represents offset of a given node. 
+ """ + + if node is None: + return "0" + return str(node.get_layout().offset) + + def ptr(self, node: IRNode) -> str: + """ + Generates code which represents pointer of a given node. + """ + + if node is None: + return "nullptr" + arg_name = self.arg_name(node) + if arg_name is None: + return "nullptr" + offset = self.offset(node) + return arg_name if offset == "0" else f"{arg_name} + {offset}" + + def size( + self, + node: IRNode, + start_index: int, + end_index: Optional[int] = None, + default_value: int = 0, + ) -> str: + """ + Hook called from template code to get the size of an arg. + Generates code which represents size of a given node in [start_index, end_index). + If node is None, returns default_value. + + TODO: Will add needed args to pass it in if it is dynamic. + """ + + if node is None: + return str(default_value) + + start_index = _normalize_idx(start_index, len(node.get_size())) + if end_index is None: + end_index = start_index + end_index = _normalize_idx(end_index, len(node.get_size())) + + sizes = node.get_size()[start_index : end_index + 1] + if len(sizes) == 0: + return str(default_value) + + val = sympy_product(sizes) + return cexpr(self.rename_indexing(val)) + + def stride(self, node: IRNode, index: int, default_value: int = 0) -> str: + """ + Hook called from template code to get the stride of an arg. + Generates code which represents stride of a given node at index. + If node is None, returns default_value. + + TODO: Will add needed args to pass it in if it is dynamic. + """ + + if node is None: + return str(default_value) + + index = _normalize_idx(index, len(node.get_size())) + if index < 0: + return str(default_value) + + stride = node.get_stride()[index] + return cexpr(self.rename_indexing(stride)) + + def row_or_column_stride(self, node: IRNode, default_value: int = 0) -> str: + """ + Hook called from template code to get the row or column stride of an arg. + This is required by some CUTLASS 2.X APIs. + If the node is in row_major, it returns stride[-2]. + If the node is in column_major, it returns stride[-1]. + + TODO: Will add needed args to pass it in if it is dynamic. + """ + + if node is None or len(node.get_stride()) < 2: + return str(default_value) + + stride0 = node.get_stride()[-1] + stride1 = node.get_stride()[-2] + if stride0 == 1: + return cexpr(self.rename_indexing(stride1)) + elif stride1 == 1: + return cexpr(self.rename_indexing(stride0)) + else: + raise RuntimeError( + f"At least 1 stride should be 1. Strides: {node.get_stride()=}" + ) + + +class CUDATemplateCaller(ChoiceCaller): + """ + CUDATemplateCaller + + This class represents a caller for CUDA template kernels. It is a subclass of ChoiceCaller. + Attributes: + name (str): The name of the caller. + category (str): The category of the caller. + bmreq (CUDABenchmarkRequest): The benchmark request for the caller. + template_buffer (CUDATemplateBuffer): The template buffer for the caller. 
+ """ + + def __init__( + self, + name: str, + category: str, + input_nodes: List[Buffer], + layout: Layout, + make_kernel_render: Callable[[CUDATemplateBuffer, Optional[List[IRNode]]], str], + bmreq: CUDABenchmarkRequest, + template: "CUDATemplate", # type: ignore[name-defined] + info_kwargs: Optional[Dict[str, Union[PrimitiveInfoType, List[PrimitiveInfoType]]]], # type: ignore[type-arg] + ): + super().__init__(name, input_nodes, layout) + self.category = category + self.make_kernel_render = make_kernel_render + self.bmreq = bmreq + self.template = template + self.info_kwargs = info_kwargs + + def precompile(self) -> None: + assert self.bmreq is not None + self.bmreq.precompile() + + def benchmark(self, *args, out) -> float: + assert self.bmreq is not None + return self.bmreq.benchmark( + *args, output_tensor=out + ) # @TODO: Hack for ensuring that Cutlass Kernel is preferred + + def __str__(self): + return f"CUDATemplateCaller(source_file={self.bmreq.source_file})" + + def call_name(self) -> str: + return f"cuda_template_kernels.{self.name}" + + def hash_key(self) -> str: + return "-".join( + [ + self.category, + self.bmreq.hash_key, + ] + ) + + def info_dict(self) -> Dict[str, Union[PrimitiveInfoType, List[PrimitiveInfoType]]]: + """Information returned here is logged to the autotune log file when that is enabled.""" + if self.info_kwargs is not None and "op" in self.info_kwargs: + op: Any = self.info_kwargs["op"] + epilogue_node_names: List[str] = [ + getattr(en, "name", "no_name") + for en in self.info_kwargs.get("epilogue_nodes", []) # type: ignore[union-attr] + ] + epilogue_node_strs: List[str] = [ + str(en) for en in self.info_kwargs.get("epilogue_nodes", []) # type: ignore[union-attr] + ] + return { + "backend": "CUDA", + "op_type": type(op).__name__, + "op_conf_name": str(op.configuration_name()), + "op_arch": str(op.arch), + "tile_shape": str(op.tile_description.tile_shape), + "epilogue_schedule": str(op.epilogue_schedule), + "kernel_schedule": str(op.kernel_schedule), + "element_accumulator": str(op.accumulator_type()), + "op_name": str(op.procedural_name()), + "epilogue_node_names": epilogue_node_names, # type: ignore[dict-item] + "epilogue_node_strs": epilogue_node_strs, # type: ignore[dict-item] + "instruction_shape": str( + op.tile_description.math_instruction.instruction_shape + ), + } + else: + return {"backend": "CUDA", "op_type": "unknown"} + + def output_node(self) -> TensorBox: + return TensorBox.create( + CUDATemplateBuffer( + layout=self.layout, + inputs=self.input_nodes, + make_kernel_render=self.make_kernel_render, + workspace_size=self.bmreq.workspace_size, + template=self.template, + ) + ) diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/cuda_template.py b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/cuda_template.py new file mode 100644 index 0000000000000000000000000000000000000000..1cd653556e996229a4700652c67b23aa1338df69 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/cuda_template.py @@ -0,0 +1,242 @@ +import functools +import itertools +import logging +from typing import List, Optional +from unittest.mock import patch + +import sympy + +import torch +from ...autotune_process import CUDABenchmarkRequest, TensorMeta +from ...ir import Buffer, CUDATemplateBuffer, IRNode, Layout + +from ...utils import IndentedBuffer, unique +from ...virtualized import V +from ..common import KernelTemplate +from .cuda_kernel import CUDATemplateCaller, CUDATemplateKernel + +log = logging.getLogger(__name__) + + +class 
CUDATemplate(KernelTemplate): + index_counter = itertools.count() + + def __init__( + self, + name: str, + input_nodes: List[Buffer], + layout: Layout, + input_reorder: Optional[List[int]] = None, + ): + """ + + Baseclass for CUDA C++ Templates, derived from KernelTemplate. Not to be instantiated directly. + + Args: + name (str): The name of the CUDATemplate object. + input_nodes (List[IRNode]): A list of input IRNodes. + layout (Layout): The layout of the output buffer / tensor. + input_reorder (Optional[List[int]]): An optional list that specifies the order of the input nodes. + + """ + super().__init__(name) + self.input_nodes = input_nodes + self.output_node: Buffer = Buffer("buf_out", layout) + self.input_reorder = input_reorder + self.layout = layout + + def generate( # type: ignore[override] + self, + **kwargs, + ) -> CUDATemplateCaller: + """ + Generates the CUDA template caller object for the given GEMM template and operation. This CUDATemplateCaller + may be used to call and benchmark the generated CUDA kernel in a standalone manner to enable Autotuning. + + Args: + kwargs: Additional keyword arguments. + + Returns: + A CUDATemplateCaller object representing the generated CUDA template caller. + """ + kernel_name = f"cuda_{self.name}" + with patch.object( + V.graph, "get_dtype", self._fake_get_dtype(self.output_node) + ), CUDATemplateKernel( + kernel_name=kernel_name, + ) as kernel: + code = self.render(kernel=kernel, **kwargs) + _, call_args, _ = kernel.args.python_argdefs() + log.debug("Generated Code:\n%s", code) + log.debug( + "Args: cpp_argdefs: %s, python_argdefs: %s", + kernel.args.cpp_argdefs(), + kernel.args.python_argdefs(), + ) + + input_reorder = ( + self.input_reorder + if self.input_reorder is not None + else list(range(len(self.input_nodes))) + ) + expected_args = list( + unique(self.input_nodes[idx].get_name() for idx in input_reorder) + ) + expected_args.extend([self.output_node.get_name()]) + assert list(call_args)[: len(expected_args)] == expected_args, ( + call_args, + expected_args, + ) + extra_args = V.graph.sizevars.size_hints( + map(sympy.expand, call_args[len(expected_args) :]) + ) + + kernel_hash_name = f"cuda_{self.name}_{next(self.index_counter)}" + + # create the BenchmarkRequest + bmreq = CUDABenchmarkRequest( + kernel_name=kernel_name, + input_tensor_meta=TensorMeta.from_irnodes(self.input_nodes), + output_tensor_meta=TensorMeta.from_irnodes(self.output_node), + extra_args=extra_args, + source_code=code, + ) + + def make_kernel_render( + template_node: CUDATemplateBuffer, + epilogue_nodes: Optional[List[IRNode]] = None, + ): + kernel = CUDATemplateKernel( + kernel_name="KERNEL_NAME", + ) + render = functools.partial( + self.render, + kernel=kernel, + template_buffer_node=template_node, + epilogue_nodes=epilogue_nodes, + **kwargs, # includes "op" argument in case of CUTLASSGemmTemplate + ) + return kernel, render + + return CUDATemplateCaller( + kernel_hash_name, + self.name, + self.input_nodes, + self.output_node.get_layout(), + make_kernel_render, + bmreq, + self, + kwargs, + ) + + def header(self) -> IndentedBuffer: + res = IndentedBuffer() + res.splice( + """ + #include + #include + #include + #include + #include + """ + ) + return res + + def globals(self) -> IndentedBuffer: + res = IndentedBuffer() + res.splice( + """ + // We compile all models with -fvisibility=hidden. Any symbols that need to be + // exposed in the final shared library must be declared with PT_EXPORT to make + // them visible. 
+ #ifdef __GNUC__ // Applies to any compiler with GNU extensions (clang and g++) + #define PT_EXPORT __attribute__((__visibility__("default"))) + #else + #ifdef _WIN32 + #define PT_EXPORT __declspec(dllexport) + #else + #define PT_EXPORT + #endif + #endif + using bfloat16 = nv_bfloat16; + """ + ) + return res + + def render(self, **kwargs) -> str: + raise NotImplementedError + + +class CUTLASSTemplate(CUDATemplate): + """ + CUTLASSTemplate is a class that provides a template for generating CUTLASS Templates. Used as a baseclass for the + CUTLASSGemmTemplate, providing functionality that might also be relevant for non-GEMM CUTLASS Kernels. + """ + + def header(self) -> IndentedBuffer: + res = super().header() + res.splice( + """ + #include "cute/tensor.hpp" + #include "cutlass/cutlass.h" + #include "cutlass/numeric_types.h" + #include "cutlass/tensor_ref.h" + #include "cutlass/util/host_tensor.h" + #include "cutlass/util/reference/host/tensor_fill.h" + #include "cutlass/util/reference/device/tensor_fill.h" + #include "cutlass/util/device_memory.h" + """ + ) + return res + + def globals(self) -> IndentedBuffer: + res = super().globals() + res.splice( + """ + using namespace cute; + #define CUTLASS_CHECK(status) \\ + { \\ + cutlass::Status error = status; \\ + if (error != cutlass::Status::kSuccess) { \\ + auto msg = std::string("[") + __FILE__ + "] Got cutlass error: " + \\ + cutlassGetStatusString(error) + " at: " + std::to_string(__LINE__); \\ + throw std::runtime_error(msg); \\ + } \\ + } + + // Used as pass-through functor in EVT just for type casting / rounding + template + struct identity_op { + CUTLASS_HOST_DEVICE + T operator()(T val) const { return val; } + }; + + """ + ) + return res + + def cute_int(self, int_str: str, var_name: str) -> str: + res = "" + if int_str in {"1", "1L"}: + res = "cute::Int<1>{}" + else: + res = int_str + + return f"{res} /* {var_name} */" + + _DTYPE_TO_CUTLASS = { + torch.float32: "float", + torch.float64: "double", + torch.float16: "cutlass::half_t", + torch.int32: "int", + torch.int8: "int8_t", + torch.uint8: "uint8_t", + torch.bool: "bool", + torch.bfloat16: "cutlass::bfloat16_t", + } + + def cutlass_type_cast(self, node: IRNode, ptr: str) -> str: + if node is None: + return ptr + else: + return f"({self._DTYPE_TO_CUTLASS.get(node.get_dtype())}*)({ptr})" diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/cutlass_epilogue_gen.py b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/cutlass_epilogue_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..dd42711d6235bf4f0c294371abe588c20b78aa48 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/cutlass_epilogue_gen.py @@ -0,0 +1,360 @@ +from typing import Dict, List +from unittest.mock import patch + +import sympy + +import torch._inductor.virtualized as virtualized +from torch._inductor.ir import ComputedBuffer, FlexibleLayout, IRNode, Pointwise +from torch._inductor.utils import IndentedBuffer, sympy_str + + +# Used as a magic string to indicate an unsupported sympy expression +# became part of generated C++ code. +_MAGIC_SYMPY_ERROR_STRING = "[!sympy: unsupported expr!]" + + +def _arg_str(a): + if isinstance(a, sympy.Expr): + # If this return value containting the _MAGIC_SYMPY_ERROR_STRING + # is used as part of the final generated C++ code, + # a CUTLASSEVTOpNotImplementedError is raised to indicate that + # the op could not be converted to a valid EVT expression. 
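+        # Embedding the sentinel (rather than raising immediately) lets rendering
+        # continue; ir_to_evt_string / ir_to_evt_argument_string scan the final string
+        # and raise CUTLASSEVTOpNotImplementedError if the sentinel is still present.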
+ return f"{_MAGIC_SYMPY_ERROR_STRING}('{sympy_str(a)}')" + return str(a) + + +class CUTLASSEVTOpNotImplementedError(NotImplementedError): + pass + + +class CutlassEVTEpilogueTypeFormatter: + """ + Codegen class, which provides an entry point to generate + Cutlass "Epilogue Visitor Tree" (EVT) functor declarations. + + See https://github.com/NVIDIA/cutlass/tree/main/examples/49_hopper_gemm_with_collective_builder + for more about EVTs and how they are declared and used to generate. + + Notes: + * Used by CUTLASSGemmTemplate. + * This class should not be instantiated by users, it is intended to be used + by calling CutlassEVTEpilogueTypeFormatter.ir_to_evt_string(...) + which instantiates this class as an ops handler for virtualized.V.ops.[op-name] + * Extend this with more _op_ nodes to add support for new pointwise operations. + + + """ + + def __init__(self, accumulator_node_name, evt_type_name): + """ + + Initialize an instance of CutlassEVTEpilogueTypeFormatter. + + Parameters: + - accumulator_node_name (str): The name of the output Buffer for the GEMM operation in the original (unfused) + IR graph. + - evt_type_name (str): The output name of the EVT type we are generating. + + """ + self.accumulator_node_name = accumulator_node_name + self.output = IndentedBuffer(0) + self.var_counter = 0 + self.evt_type_name = evt_type_name + self.aliases = dict() + + @staticmethod + def ir_to_evt_string( + template_output_node_name: str, + evt_type_name: str, + epilogue_nodes: List[IRNode], + ): + """ + Formats IR nodes into a string representation compatible with Cutlass EVT format. + + Args: + template_output_node_name (str): The name of the template output node. + evt_type_name (str): The name of the EVT type. + epilogue_nodes (List[IRNode]): A list of IR nodes representing the epilogue nodes. As of now, these must be + ComputedBuffer nodes wrapping Pointwise nodes. + + Returns: + A string representation of the IR nodes formatted according to the Cutlass EVT format. + """ + formatter = CutlassEVTEpilogueTypeFormatter( + template_output_node_name, evt_type_name + ) + + with virtualized.V.set_ops_handler(formatter), patch.object( + FlexibleLayout, "allow_indexing", True + ): + for node in epilogue_nodes: + if isinstance(node, ComputedBuffer): + pnode = node.data + else: + raise RuntimeError( + "Epilogue nodes must be Pointwise nodes, wrapped in a named ComputedBuffer" + ) + assert isinstance(pnode, Pointwise) + index = pnode._index(pnode.ranges) + result = pnode.inner_fn(index) + # each epilogue node results in a single "using" statement and may refer to the previous steps by name + formatter.aliases[node.name] = result + res = formatter.getvalue(result) # type: ignore[possibly-undefined] + if _MAGIC_SYMPY_ERROR_STRING in res: + raise CUTLASSEVTOpNotImplementedError( + "sympy / indexing expressions not yet supported in EVT fusion" + ) + else: + return res + + def __getattr__(self, name): + """ + Resolve V.ops. calls, after this instance has been installed as V.ops handler. 
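+        Each resolved op writes a `using EVT_expr_<n> = ...;` line to the output buffer
+        and returns that alias name, so later ops can refer to earlier sub-expressions.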
+ """ + + def inner(*args, **kwargs): + fargs = [_arg_str(a) for a in args] + fkwargs = {key: _arg_str(a) for key, a in kwargs.items()} + fn = getattr(self, f"_op_{name}") + line = fn(*fargs, **fkwargs) + self.var_counter += 1 + varname = f"EVT_expr_{self.var_counter}" + # replace line with a new variable name + self.output.writeline(f"using {varname} = {line};") + return varname + + if name.startswith("_"): + raise CUTLASSEVTOpNotImplementedError(name) + if hasattr(self, f"_op_{name}"): + return inner + else: + raise CUTLASSEVTOpNotImplementedError(name) + + def _op_load(self, name, index_expr): + # Load an input to an operation. Might be the output of the matmul, the result + # of a previous epilogue node, a constant or (TODO) an auxiliary input. + if name == self.accumulator_node_name: + return f"cutlass::epilogue::fusion::Sm90AccFetch /* :={name} (matmul output in accumulator) */" + elif name in self.aliases: + return self.aliases[name] + else: + # return f"cutlass::epilogue::fusion::Sm90SrcFetch /* :={name} */" + raise CUTLASSEVTOpNotImplementedError( + f"Operand {name} not found. Auxiliary inputs not supported yet." + ) + + def _op_constant(self, value, dtype): + # Load a constant + if str(dtype) in ("torch.float16", "torch.float32"): + return f"cutlass::epilogue::fusion::Sm90ScalarBroadcast /* value={value}, dtype={dtype} */" + else: + raise CUTLASSEVTOpNotImplementedError( + f"Unsupported dtype for constant: {dtype}" + ) + + def _cutlass_binary_functional_op(self, op, a, b): + # Perform a named operation on two inputs + # see https://github.com/NVIDIA/cutlass/blob/6407bcdf0a24097b7b016ee105937693c62f9923/include/cutlass/functional.h for ops + return f"cutlass::epilogue::fusion::Sm90EVT,{a},{b}>" # noqa: B950 + + def _convert_to_output_dtype(self, a): + # Convert the final output to the dtype of the output buffer + return f"cutlass::epilogue::fusion::Sm90EVT,{a}>" # noqa: B950 + + def _op_to_dtype(self, a, *args, **kwargs): + # no-op in our case, since we convert to the output dtype at the end and convert everything to the accumulator + # dtype. + # Is is asserted ( and ascertained during can_fuse decision ) that the dtype remains compatible + # throughout the fusion chain. + return a # noqa: B950 + + def _op_mul(self, a, b): + return self._cutlass_binary_functional_op("multiplies", a, b) + + def _op_div(self, a, b): + return self._cutlass_binary_functional_op("divides", a, b) + + def _op_truediv(self, a, b): + return self._cutlass_binary_functional_op("divides", a, b) + + def _op_ge(self, a, b): + return self._cutlass_binary_functional_op("greater_equal", a, b) + + def _op_add(self, a, b): + return self._cutlass_binary_functional_op("plus", a, b) + + def _op_sub(self, a, b): + return self._cutlass_binary_functional_op("minus", a, b) + + def _op_minimum(self, a, b): + return self._cutlass_binary_functional_op("minimum", a, b) + + def _op_maximum(self, a, b): + return self._cutlass_binary_functional_op("maximum", a, b) + + def _op_relu(self, a): + const_zero = self._op_constant(0.0, "torch.float32") + return f"cutlass::epilogue::fusion::Sm90EVT,{a}, {const_zero}>" # noqa: B950 + + def reduction(self, dtype, src_dtype, reduction_type, value): + raise CUTLASSEVTOpNotImplementedError() + + # Add more ops here... 
+ def getvalue(self, result) -> str: + # Return final result + dtype_converted_expr = self._convert_to_output_dtype( + f"EVT_expr_{self.var_counter}" + ) + self.output.writeline(f"using {self.evt_type_name} = {dtype_converted_expr};") + return self.output.getvalue() + + +class CutlassEVTEpilogueArgumentFormatter: + """ + Codegen class, which provides an entry point to generate + Cutlass "Epilogue Visitor Tree" (EVT) Argument initializers + + See https://github.com/NVIDIA/cutlass/tree/main/examples/49_hopper_gemm_with_collective_builder + for more about EVTs and how they are declared and used to generate. + + Notes: + * Used by CUTLASSGemmTemplate. + * This class should not be instantiated by users, it is intended to be used + by calling CutlassEVTEpilogueArgumentFormatter.ir_to_evt_argument_string(...) + which instantiates this class as an ops handler for virtualized.V.ops.[op-name] + * Extend this with more _op_ nodes to add support for new pointwise operations. + + + """ + + def __init__(self, accumulator_node_name: str): + """ + + Initializes a CutlassEVTEpilogueArgumentFormatter object. Do not instantiate directly. + Use the CutlassEVTEpilogueArgumentFormatter.ir_to_evt_argument_string static method. + + Args: + accumulator_node_name (str): The name of the accumulator node which should contain + the Matmul result before fusion according to the IR graph. + """ + self.accumulator_node_name: str = accumulator_node_name # + self.output: IndentedBuffer = IndentedBuffer(0) # The output buffer for codegen + self.var_counter: int = ( + 0 # used to generate variable names, incremented for each new variable + ) + self.aliases: Dict[str, str] = dict() # Aliases for subexpression functors + + @staticmethod + def ir_to_evt_argument_string( + template_output_node_name: str, + epilogue_nodes: List[IRNode], + ) -> str: + formatter = CutlassEVTEpilogueArgumentFormatter( + template_output_node_name, + ) + + with virtualized.V.set_ops_handler(formatter), patch.object( + FlexibleLayout, "allow_indexing", True + ): + for node in epilogue_nodes: + assert isinstance(node, ComputedBuffer) + pnode = node.data + assert isinstance(pnode, Pointwise) + index = pnode._index(pnode.ranges) + result = pnode.inner_fn(index) + # each epilogue node results in a single "using" statement and may refer to the previous steps by name + if node.name is not None: + formatter.aliases[node.name] = result + + res: str = formatter.getvalue(result) # type: ignore[possibly-undefined] + if _MAGIC_SYMPY_ERROR_STRING in res: + raise CUTLASSEVTOpNotImplementedError( + "sympy / indexing expressions not yet supported in EVT fusion" + ) + else: + return res + + def __getattr__(self, name): + def inner(*args, **kwargs): + fargs = [_arg_str(a) for a in args] + fkwargs = {key: _arg_str(a) for key, a in kwargs.items()} + fn = getattr(self, f"_op_{name}") + line = fn(*fargs, **fkwargs) + return line + + if name.startswith("_"): + raise CUTLASSEVTOpNotImplementedError(name) + + if hasattr(self, f"_op_{name}"): + return inner + else: + raise CUTLASSEVTOpNotImplementedError(name) + + def _op_load(self, name, index_expr): + if name == self.accumulator_node_name: + return "{}" + elif name in self.aliases: + return self.aliases[name] + else: + raise CUTLASSEVTOpNotImplementedError( + f"Operand {name} not found. Auxiliary inputs not supported yet." 
+ ) + + def _op_constant(self, value, dtype): + if str(dtype) in ("torch.float16", "torch.float32"): + return "{ static_cast(" + str(value) + ") }" + else: + raise CUTLASSEVTOpNotImplementedError( + f"Unsupported dtype for constant: {dtype}" + ) + + def _cutlass_binary_functional_op(self, op, a, b): + return f"{{ /*{op}: */ {a}, {b} }}" + + def _op_mul(self, a, b): + return self._cutlass_binary_functional_op("multiplies", a, b) + + def _op_div(self, a, b): + return self._cutlass_binary_functional_op("divides", a, b) + + def _op_truediv(self, a, b): + return self._cutlass_binary_functional_op("divides", a, b) + + def _op_ge(self, a, b): + return self._cutlass_binary_functional_op("greater_equal", a, b) + + def _op_add(self, a, b): + return self._cutlass_binary_functional_op("plus", a, b) + + def _op_sub(self, a, b): + return self._cutlass_binary_functional_op("minus", a, b) + + def _op_minimum(self, a, b): + return self._cutlass_binary_functional_op("minimum", a, b) + + def _op_maximum(self, a, b): + return self._cutlass_binary_functional_op("maximum", a, b) + + def _op_relu(self, a): + const_zero = self._op_constant(0.0, "torch.float32") + return "{" + str(a) + ", " + const_zero + "}" + + def _op_to_dtype(self, a, dtype, src_dtype=None): + # Is is asserted ( and ascertained during can_fuse decision ) that the dtype remains compatible + # throughout the fusion chain. + assert dtype in ( + "torch.float32", + "torch.float16", + ), f"Unsupported dtype: {dtype}" + assert src_dtype in ( + None, + "torch.float32", + "torch.float16", + ), f"Unsupported source dtype: {src_dtype}" + return a + + def reduction(self, dtype, src_dtype, reduction_type, value): + raise CUTLASSEVTOpNotImplementedError() + + def getvalue(self, result) -> str: + return "{" + str(result) + "}" diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/cutlass_lib_extensions/__init__.py b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/cutlass_lib_extensions/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/cutlass_lib_extensions/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/cutlass_lib_extensions/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..20eb4b3f94a273ba03ec722adde8da8c71143887 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/cutlass_lib_extensions/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/cutlass_lib_extensions/__pycache__/gemm_operation_extensions.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/cutlass_lib_extensions/__pycache__/gemm_operation_extensions.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d9cbbc92f6252e3ee83340027a7575908875afe Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/cutlass_lib_extensions/__pycache__/gemm_operation_extensions.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/cutlass_lib_extensions/gemm_operation_extensions.py b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/cutlass_lib_extensions/gemm_operation_extensions.py new file mode 100644 index 0000000000000000000000000000000000000000..4a34bd7e9d3a68ee694f5f0170056dc9ef67d7e3 --- /dev/null +++ 
b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/cutlass_lib_extensions/gemm_operation_extensions.py @@ -0,0 +1,186 @@ +from ..cutlass_utils import try_import_cutlass + +if try_import_cutlass(): + import enum + + from cutlass_library.library import * # noqa: F401, F403 + from cutlass_library.gemm_operation import * # noqa: F401, F403 + + # copied / modified from original at + # https://github.com/NVIDIA/cutlass/blob/8783c41851cd3582490e04e69e0cd756a8c1db7f/tools/library/scripts/gemm_operation.py#L658 + # to support EVT similar to + # https://github.com/NVIDIA/cutlass/blob/8783c41851cd3582490e04e69e0cd756a8c1db7f/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu#L315C69-L315C69 # noqa: B950 + class EmitGemmUniversal3xInstanceWithEVT: + """Responsible for emitting a CUTLASS 3.x template definition""" + + def __init__(self, operation_suffix=""): + self.operation_suffix = operation_suffix + self.includes = [ + "cutlass/cutlass.h", + "cutlass/gemm/gemm.h", + "cutlass/numeric_types.h", + "cutlass/gemm/kernel/gemm_universal.hpp", + "cutlass/gemm/collective/collective_builder.hpp", + "cutlass/epilogue/collective/collective_builder.hpp", + ] + self.builtin_epilogue_functor_template = """ + ${epilogue_functor}< + ${element_c}, + ${epilogue_vector_length}, + ${element_accumulator}, + ${element_epilogue} + > + """ + self.gemm_template = """ + using EpilogueScheduleType = ${epilogue_schedule}; + static_assert(cute::is_same_v || + cute::is_same_v, + "Epilogue visitor trees are currently only supported by the TMA warp-specialized epilogue"); + static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest; + using ElementAcc = ${element_accumulator}; + using ElementD = ${element_d}; + ${epilogue_functor}; + using ${operation_name}_epilogue = + typename cutlass::epilogue::collective::CollectiveBuilder< + ${arch}, ${opcode_class}, + cute::Shape, + cute::Shape, + cutlass::epilogue::collective::EpilogueTileAuto, + ${element_accumulator}, ${element_epilogue}, + ${element_c}, ${layout_c}, ${align_c}, + ${element_d}, ${layout_d}, ${align_d}, + EpilogueScheduleType, + ${operation_name}_epilogue_functor + >::CollectiveOp; + + using ${operation_name}_mainloop = + typename cutlass::gemm::collective::CollectiveBuilder< + ${arch}, ${opcode_class}, + ${element_a}, ${layout_a}, ${align_a}, + ${element_b}, ${layout_b}, ${align_b}, + ${element_accumulator}, + cute::Shape, + cute::Shape, + ${stages}, + ${kernel_schedule} + >::CollectiveOp; + + // Gemm operator ${operation_name} + using ${operation_name}_base = cutlass::gemm::kernel::GemmUniversal< + cute::Shape, + ${operation_name}_mainloop, + ${operation_name}_epilogue, + ${tile_scheduler}>; + + // Define named type + struct ${operation_name} : + public ${operation_name}_base { }; + + """ + + # + def instance_template(self): + return """ + ${compile_guard_start} + using GemmKernel = cutlass::gemm::device::GemmUniversalAdapter<${operation_name}>; + manifest.append( + new ${gemm_kind}("${operation_name}")); + ${compile_guard_end} + """ + + # + def emit(self, operation): + tile_shape = operation.tile_description.tile_shape + warp_count = operation.tile_description.warp_count + # stage count set to zero indicates builder automatic stage selection + if operation.tile_description.stages > 0: + stage_count_string = f"cutlass::gemm::collective::StageCount<{str(operation.tile_description.stages)}>" + else: + stage_count_string = f"cutlass::gemm::collective::StageCountAutoCarveout" # noqa: B950 + warp_shape = [tile_shape[idx] // 
warp_count[idx] for idx in range(3)] + + ( + instance_layout_A, + instance_layout_B, + instance_layout_C, + instance_layout_D, + ) = ( + operation.A.layout, + operation.B.layout, + operation.C.layout, + operation.D.layout, + ) + + # 3.0 profiler integration only supports trivial epilogues for now + epilogue_vector_length = 1 + + # Support built-in epilogue functors or user-defined functions + if isinstance(operation.epilogue_functor, enum.Enum): + values = { + "epilogue_vector_length": str(epilogue_vector_length), + "element_epilogue": str(DataTypeTag[operation.element_epilogue]), # type: ignore[name-defined] + "epilogue_functor": EpilogueFunctorTag[operation.epilogue_functor], # type: ignore[name-defined] + } + epilogue_functor = SubstituteTemplate( # type: ignore[name-defined] + self.builtin_epilogue_functor_template, values + ) + + elif callable(operation.epilogue_functor): + epilogue_functor = operation.epilogue_functor( + operation.procedural_name() + "_epilogue_functor" + ) + else: + epilogue_functor = str(operation.epilogue_functor) + # + + values = { + "operation_name": operation.procedural_name(), + "operation_suffix": self.operation_suffix, + "element_a": DataTypeTag[operation.A.element], # type: ignore[name-defined] + "layout_a": LayoutTag[instance_layout_A], # type: ignore[name-defined] + "element_b": DataTypeTag[operation.B.element], # type: ignore[name-defined] + "layout_b": LayoutTag[instance_layout_B], # type: ignore[name-defined] + "element_c": DataTypeTag[operation.C.element], # type: ignore[name-defined] + "layout_c": LayoutTag[instance_layout_C], # type: ignore[name-defined] + "element_d": DataTypeTag[operation.D.element], # type: ignore[name-defined] + "layout_d": LayoutTag[instance_layout_D], # type: ignore[name-defined] + "element_accumulator": DataTypeTag[operation.accumulator_type()], # type: ignore[name-defined] + "opcode_class": OpcodeClassTag[operation.tile_description.math_instruction.opcode_class], # type: ignore[name-defined] # noqa: B950 + "arch": "cutlass::arch::Sm%d" % operation.arch, + "tile_shape_m": str(operation.tile_description.tile_shape[0]), + "tile_shape_n": str(operation.tile_description.tile_shape[1]), + "tile_shape_k": str(operation.tile_description.tile_shape[2]), + "cluster_m": str(operation.tile_description.cluster_shape[0]), + "cluster_n": str(operation.tile_description.cluster_shape[1]), + "cluster_k": str(operation.tile_description.cluster_shape[2]), + "warp_shape_m": str(warp_shape[0]), + "warp_shape_n": str(warp_shape[1]), + "warp_shape_k": str(warp_shape[2]), + "instruction_shape_m": str( + operation.tile_description.math_instruction.instruction_shape[0] + ), + "instruction_shape_n": str( + operation.tile_description.math_instruction.instruction_shape[1] + ), + "instruction_shape_k": str( + operation.tile_description.math_instruction.instruction_shape[2] + ), + "kernel_schedule": str(KernelScheduleTag[operation.kernel_schedule]), # type: ignore[name-defined] + "epilogue_schedule": str(EpilogueScheduleTag[operation.epilogue_schedule]), # type: ignore[name-defined] + "epilogue_functor": epilogue_functor, + "stages": stage_count_string, + "align_a": str(operation.A.alignment), + "align_b": str(operation.B.alignment), + "align_c": str(operation.C.alignment), + "align_d": str(operation.C.alignment), + "transform_a": ComplexTransformTag[operation.A.complex_transform], # type: ignore[name-defined] + "transform_b": ComplexTransformTag[operation.B.complex_transform], # type: ignore[name-defined] + "math_operation": MathOperationTag[ # type: 
ignore[name-defined] + operation.tile_description.math_instruction.math_operation + ], + "epilogue_vector_length": str(epilogue_vector_length), + "element_epilogue": str(DataTypeTag[operation.element_epilogue]), # type: ignore[name-defined] + "tile_scheduler": str(TileSchedulerTag[operation.tile_scheduler]), # type: ignore[name-defined] + } + + return SubstituteTemplate(self.gemm_template, values) # type: ignore[name-defined] diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/cutlass_utils.py b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/cutlass_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1fa1bdc941a45d5a7c7e009886182a0f68d11e2a --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/cutlass_utils.py @@ -0,0 +1,258 @@ +import functools +import logging +import os +import sys +from dataclasses import dataclass +from typing import Any, List, Optional + +import sympy + +import torch + +from ...codecache import cache_dir +from ...config import cuda as inductor_cuda_config +from ...ir import Layout +from .cuda_env import get_cuda_arch, get_cuda_version + +log = logging.getLogger(__name__) + + +def _rename_cutlass_import(content: str, cutlass_modules: List[str]) -> str: + for cutlass_module in cutlass_modules: + content = content.replace( + f"from {cutlass_module} import ", + f"from cutlass_library.{cutlass_module} import ", + ) + return content + + +def _gen_cutlass_file( + file_name: str, cutlass_modules: List[str], src_dir: str, dst_dir: str +) -> None: + orig_full_path = os.path.abspath(os.path.join(src_dir, file_name)) + text = "" + with open(orig_full_path) as f: + text = f.read() + text = _rename_cutlass_import(text, cutlass_modules) + dst_full_path = os.path.abspath( + os.path.join( + dst_dir, + file_name, + ) + ) + with open(dst_full_path, "w") as f: + f.write(text) + + +@functools.lru_cache(None) +def try_import_cutlass() -> bool: + # Copy CUTLASS python scripts to a temp dir and add the temp dir to Python search path. + # This is a temporary hack to avoid CUTLASS module naming conflicts. + # TODO(ipiszy): remove this hack when CUTLASS solves Python scripts packaging structure issues. + + cutlass_py_full_path = os.path.abspath( + os.path.join(inductor_cuda_config.cutlass_dir, "python/cutlass_library") + ) + tmp_cutlass_py_full_path = os.path.abspath( + os.path.join(cache_dir(), "torch_cutlass_library") + ) + dst_link = os.path.join(tmp_cutlass_py_full_path, "cutlass_library") + + if os.path.isdir(cutlass_py_full_path): + if tmp_cutlass_py_full_path not in sys.path: + if os.path.exists(dst_link): + assert os.path.islink( + dst_link + ), f"{dst_link} is not a symlink. Try to remove {dst_link} manually and try again." 
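+                # Editor's note (illustrative; concrete paths depend on the local
+                # configuration): the assert above and the one below verify that the
+                # cached entry is the expected symlink, i.e. roughly
+                #   <cache_dir()>/torch_cutlass_library/cutlass_library
+                #       -> <inductor_cuda_config.cutlass_dir>/python/cutlass_library
+                # Appending <cache_dir()>/torch_cutlass_library to sys.path then lets
+                # "import cutlass_library.<module>" resolve against that checkout.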
+ assert os.path.realpath(os.readlink(dst_link)) == os.path.realpath( + cutlass_py_full_path + ), f"Symlink at {dst_link} does not point to {cutlass_py_full_path}" + else: + os.makedirs(tmp_cutlass_py_full_path, exist_ok=True) + os.symlink(cutlass_py_full_path, dst_link) + sys.path.append(tmp_cutlass_py_full_path) + try: + import cutlass_library.generator # noqa: F401 + import cutlass_library.library # noqa: F401 + import cutlass_library.manifest # noqa: F401 + + return True + + except ImportError as e: + log.debug( + "Failed to import CUTLASS packages: %s, ignoring the CUTLASS backend.", + str(e), + ) + else: + log.debug( + "Failed to import CUTLASS packages: CUTLASS repo does not exist: %s", + cutlass_py_full_path, + ) + return False + + +def _normalize_cuda_arch(arch: str) -> str: + if int(arch) >= 90: + return "90" + elif int(arch) >= 80: + return "80" + elif int(arch) >= 75: + return "75" + elif int(arch) >= 70: + return "70" + else: + raise NotImplementedError(f"Unsupported cuda arch: {arch}") + + +@dataclass +class CUTLASSArgs: + """ + CUTLASS args used to initialize a CUTLASS Manifest. + """ + + architectures: Optional[str] = None + cuda_version: Optional[str] = None + + operations = "all" + build_dir = "" + curr_build_dir = "" + generator_target = "" + kernels = "all" + ignore_kernels = "" + # TODO: these three look dead? + kernel_filter_file: None = None + selected_kernel_list: None = None + interface_dir: None = None + filter_by_cc = True + disable_full_archs_compilation = False + + def __post_init__(self): + if self.architectures is None or self.cuda_version is None: + raise RuntimeError( + f"{self.architectures=} or {self.cuda_version=} is None!" + ) + self.architectures = _normalize_cuda_arch(self.architectures) + + +@functools.lru_cache(None) +def _gen_ops_cached(arch, version) -> List[Any]: + # Note: Cache needs to be specific for cuda architecture and version + + # Import cutlass python scripts. + assert try_import_cutlass() + import cutlass_library.generator as cutlass_generator + import cutlass_library.manifest as cutlass_manifest + + if arch is None or version is None: + log.error( + "Cannot detect cuda arch %s or cuda version %s. " + "Will discard all cutlass ops. " + "Please consider setting _inductor.cuda.arch and _inductor.cuda.version configs.", + arch, + version, + ) + return list() + arch = _normalize_cuda_arch(arch) + args = CUTLASSArgs(architectures=arch, cuda_version=version) + manifest = cutlass_manifest.Manifest(args) + + if arch == "90": + cutlass_generator.GenerateSM90(manifest, args.cuda_version) + cutlass_generator.GenerateSM80(manifest, args.cuda_version) + else: + try: + func = getattr(cutlass_generator, "GenerateSM" + arch) + func(manifest, args.cuda_version) + except AttributeError as e: + raise NotImplementedError( + "Arch " + arch + " is not supported by current cutlass lib." + ) from e + return manifest.operations + + +def gen_ops() -> List[Any]: + """ + Generates all supported CUTLASS operations. + """ + arch = get_cuda_arch() + version = get_cuda_version() + return _gen_ops_cached(arch, version) + + +def dtype_match( + torch_dtype: Optional[torch.dtype], + cutlass_dtype: "cutlass_library.library.DataType", # type: ignore[name-defined] # noqa: F821 +) -> bool: + # Import cutlass python scripts. 
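+    # Editor's note (illustrative): filter_op() in gemm_template.py uses this
+    # predicate to match candidate op operands against node dtypes, e.g.
+    #
+    #   >>> dtype_match(torch.half, cutlass_library.library.DataType.f16)
+    #   True
+    #   >>> dtype_match(torch.float, cutlass_library.library.DataType.tf32)
+    #   True   # fp32 inputs are also allowed to run on tf32 tensor-core kernels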
+ assert try_import_cutlass() + import cutlass_library + + if torch_dtype == torch.float: + return ( + cutlass_dtype == cutlass_library.library.DataType.f32 + or cutlass_dtype == cutlass_library.library.DataType.tf32 + ) + elif torch_dtype == torch.half: + return cutlass_dtype == cutlass_library.library.DataType.f16 + elif torch_dtype == torch.bfloat16: + return cutlass_dtype == cutlass_library.library.DataType.bf16 + else: + return False + + +def get_accumulator_dtype( + input_torch_dtypes: List[torch.dtype], +) -> Optional[torch.dtype]: + """ + Given a list of input torch dtypes, returns the inferred accumulator torch dtype. + """ + + if len(input_torch_dtypes) == 0: + return None + torch_dtype = input_torch_dtypes[0] + for dtype in input_torch_dtypes[1:]: + if torch_dtype != dtype: + raise RuntimeError(f"Unmatched input dtypes: {torch_dtype=}, {dtype=}") + if torch_dtype == torch.half: + if torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction: + return torch_dtype + else: + return torch.float + if torch_dtype in {torch.bfloat16, torch.float}: + return torch.float + raise NotImplementedError(f"Unsupported data type: {input_torch_dtypes=}") + + +def get_alignments(torch_dtype: torch.dtype) -> List[int]: + """ + Returns all possible valid CUTLASS alignments in terms of the number of elements for a given dtype. + CUTLASS gemm / conv SM80 APIs support 16 bytes max alignment, and 2 bytes min alignment. + """ + + if torch_dtype in (torch.half, torch.bfloat16): + return [8, 4, 2, 1] + elif torch_dtype == torch.float: + return [4, 2, 1] + else: + raise NotImplementedError(f"unsupported {torch_dtype=} for alignments") + + +def get_max_alignment(inductor_layout: Layout) -> int: + """ + Returns the max alignment (in terms of number of elements) for a given Inductor Layout. 
+ """ + + dtype = inductor_layout.dtype + size = inductor_layout.size + offset = inductor_layout.offset + + def is_static_int(number): + return isinstance(number, (int, sympy.Integer)) + + if is_static_int(size[-1]) and is_static_int(offset): + alignments = get_alignments(dtype) + for alignment in alignments: + if int(size[-1]) % alignment == 0 and int(offset) % alignment == 0: + return alignment + + return 1 diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/device_op_overrides.py b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/device_op_overrides.py new file mode 100644 index 0000000000000000000000000000000000000000..579f340c9af17a598401be2b3906f6741ecd94e4 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/device_op_overrides.py @@ -0,0 +1,18 @@ +from ..common import DeviceOpOverrides, register_device_op_overrides + + +class CUDADeviceOpOverrides(DeviceOpOverrides): + def import_get_raw_stream_as(self, name): + return f"from torch._C import _cuda_getCurrentRawStream as {name}" + + def set_device(self, device_idx): + return f"torch.cuda.set_device({device_idx})" + + def synchronize(self): + return "torch.cuda.synchronize()" + + def device_guard(self, device_idx): + return f"torch.cuda._DeviceGuard({device_idx})" + + +register_device_op_overrides("cuda", CUDADeviceOpOverrides()) diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/gemm_template.py b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/gemm_template.py new file mode 100644 index 0000000000000000000000000000000000000000..ea022d4d7019ceb4250841a4ccba385abc660bd6 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda/gemm_template.py @@ -0,0 +1,706 @@ +import copy +import logging +import re +from typing import cast, Dict, List, Optional, Tuple + +from ...config import cuda as inductor_cuda_config +from ...ir import Buffer, CUDATemplateBuffer, FixedLayout, IRNode, Layout +from ..common import IndentedBuffer + +from . import cutlass_utils +from .cuda_kernel import CUDATemplateKernel +from .cuda_template import CUTLASSTemplate +from .cutlass_epilogue_gen import ( + CutlassEVTEpilogueArgumentFormatter, + CutlassEVTEpilogueTypeFormatter, +) + +log = logging.getLogger(__name__) + +GEMM_TEMPLATE = r""" +{{template.header().getvalue()}} +{{template.globals().getvalue()}} +{{instance_definition}} +// When workspace_size is not a nullptr, populates requested workspace_size and returns. +// Otherwise, computes the Gemm kernel using the given workspace ptr. 
+extern "C" { +{{kernel.def_kernel(inputs=[X, W, Bias], outputs=[Y], names_str="X, W, Bias, Y", input_reorder=input_reorder)}} { + try { + {{kernel.check_not_null(X)}} + {{kernel.check_not_null(W)}} + {{kernel.check_not_null(Bias)}} + {{kernel.check_not_null(Y)}} + int64_t B = {{kernel.size(Y, 0, -3, default_value=1)}}; + int64_t M = {{kernel.size(X, -2)}}; + int64_t K = {{kernel.size(X, -1)}}; + int64_t N = {{kernel.size(W, -1)}}; + using ElementComputeEpilogue = {{instance_type}}::ElementAccumulator; + using coord_t = cutlass::gemm::GemmCoord::Index; + {{instance_type}}::Arguments arguments; + {{template.render_gemm_arguments(argument_template, epilogue_template, should_swap_xw, + X, W, Bias, Y, alpha, beta, kernel, epilogue_args)}} + {{instance_type}} gemm_op; + if (workspace_size) { + *workspace_size = gemm_op.get_workspace_size(arguments); + return 0; + } + { + auto status = gemm_op.can_implement(arguments); + CUTLASS_CHECK(status); + } + { + auto status = gemm_op.initialize(arguments, workspace, stream); + CUTLASS_CHECK(status); + } + { + auto status = gemm_op(stream); + CUTLASS_CHECK(status); + } + } + catch (std::exception& e) { + std::cerr << "Runtime error: " << e.what() << std::endl; + return -1; + } + catch (...) { + return -1; + } + return 0; +} +} +""" + + +GEMM_ARGS_CUTLASS_2X = r""" + int64_t batch_stride_x = {{kernel.stride(X, -3)}}; + int64_t row_stride_x = {{kernel.row_or_column_stride(X)}}; + int64_t batch_stride_w = {{kernel.stride(W, -3)}}; + int64_t row_stride_w = {{kernel.row_or_column_stride(W)}}; + int64_t batch_stride_bias = {{kernel.stride(Bias, -3)}}; + int64_t row_stride_bias = {{kernel.row_or_column_stride(Bias)}}; + int64_t batch_stride_y = {{kernel.stride(Y, -3)}}; + int64_t row_stride_y = {{kernel.row_or_column_stride(Y)}}; + // Initialize GemmUniversalInstance arguments. + arguments = { + {{template.gemm_mode()}}, // GemmUniversalMode mode + { + static_cast(M), + static_cast(N), + static_cast(K) + }, // GemmCoord problem_size + {{split_k if split_k > 1 else 'B'}}, // int batch_count + {ElementComputeEpilogue({{alpha}}), ElementComputeEpilogue({{beta}})}, // typename EpilogueOutputOp::Params epilogue + {{template.cutlass_type_cast(X, kernel.ptr(X))}}, // void const * ptr_A + {{template.cutlass_type_cast(W, kernel.ptr(W))}}, // void const * ptr_B + {{template.cutlass_type_cast(Bias, kernel.ptr(Bias))}}, // void const * ptr_C + {{template.cutlass_type_cast(Y, kernel.ptr(Y))}}, // void * ptr_D + batch_stride_x, // int64_t batch_stride_A + batch_stride_w, // int64_t batch_stride_B + batch_stride_bias, // int64_t batch_stride_C + batch_stride_y, // int64_t batch_stride_D + row_stride_x, // typename LayoutA::Stride::LongIndex lda + row_stride_w, // typename LayoutB::Stride::LongIndex ldb + row_stride_bias, // typename LayoutC::Stride::LongIndex ldc + row_stride_y, // typename LayoutC::Stride::LongIndex ldd + }; +""" + + +GEMM_ARGS_CUTLASS_3X = r""" + // Initialize GemmUniversal3xInstance arguments. 
+ arguments = { + {{template.gemm_mode()}}, // GemmUniversalMode mode + { + static_cast({{M}}), + static_cast({{N}}), + static_cast(K), + static_cast(B) + }, // ProblemShape problem_shape + { + {{template.cutlass_type_cast(X, kernel.ptr(X))}}, // ElementA const* ptr_A + { + {{template.cute_int(kernel.stride(X, -2), "stride_x0")}}, + {{template.cute_int(kernel.stride(X, -1), "stride_x1")}}, + {{template.cute_int(kernel.stride(X, -3), "batch_stride_x")}} + }, // StrideA dA + {{template.cutlass_type_cast(W, kernel.ptr(W))}}, // ElementB const* ptr_B + { + {{template.cute_int(kernel.stride(W, -1), "stride_w1")}}, + {{template.cute_int(kernel.stride(W, -2), "stride_w0")}}, + {{template.cute_int(kernel.stride(W, -3), "batch_stride_w")}} + }, // StrideB dB + }, // MainloopArguments mainloop + {{epilogue_arguments}} + }; +""" + +GEMM_ARGS_CUTLASS_3X_EPILOGUE = r""" + // see https://tinyurl.com/4rk89z48 + { + {{epilogue_args}}, // thread, typename FusionCallbacks::Arguments ( EVT ) or ThreadEpilogueOp::Params (non-EVT ) + {{template.cutlass_type_cast(Bias, kernel.ptr(Bias))}}, // ElementC const* ptr_C + { + {{template.cute_int(kernel.stride(Bias, -2, 1), "stride_bias0")}}, + {{template.cute_int(kernel.stride(Bias, -1, 1), "stride_bias1")}}, + {{template.cute_int(kernel.stride(Bias, -3), "batch_stride_bias")}} + }, // StrideC dC + {{template.cutlass_type_cast(Y, kernel.ptr(Y))}}, // ElementD const* ptr_D + { + {{template.cute_int(kernel.stride(Y, -2), "stride_y0")}}, + {{template.cute_int(kernel.stride(Y, -1), "stride_y1")}}, + {{template.cute_int(kernel.stride(Y, -3), "batch_stride_y")}} + }, // StrideD dD + }, // EpilogueArguments epilogue +""" + + +class CUTLASSGemmTemplate(CUTLASSTemplate): + """ + CUTLASS GEMM template, which is used to generate CUTLASS GEMM kernels + including those which allow flexible fusions with epilogues. + """ + + def __init__( + self, + input_nodes: List[Buffer], + layout: Layout, + alpha: float, + beta: float, + input_reorder: Optional[List[int]] = None, + can_fuse_epilogue: Optional[bool] = None, + ): + """ + Args: + input_nodes: input nodes of the kernel + layout: layout of the output node + alpha: alpha value of the GEMM operation + beta: beta value of the GEMM operation + input_reorder: reorder of the input nodes + can_fuse_epilogue: If set to True, will only list and use operators capable of flexible epilogue fusions. + If False, it will not use those. If None, both may be listed, but it will not allow fusions. 
+ Defaults to None + """ + super().__init__("cutlass_gemm", input_nodes, layout, input_reorder) + self.alpha = alpha + self.beta = beta + self.can_fuse_epilogue = can_fuse_epilogue + + @staticmethod + def add_cutlass_gemm_choices( + choices, + layout, + input_nodes, + alpha=1, + beta=0, + input_reorder=None, + fuseable=True, + non_fuseable=True, + ): + if non_fuseable: + if fuseable: + # list both fuseable and non-fuseable ops, and treat them all as non-fuseable + can_fuse_epilogue = False + else: + can_fuse_epilogue = None + + cutlass_template = CUTLASSGemmTemplate( + input_nodes, + layout, + alpha=alpha, + beta=beta, + input_reorder=input_reorder, + can_fuse_epilogue=can_fuse_epilogue, + ) + ops = cutlass_template.gen_ops() + for op in ops: + cutlass_template.maybe_append_choice( + choices, + op=op, + ) + else: + ops = [] + if fuseable: + cutlass_template_evt = CUTLASSGemmTemplate( + input_nodes, + layout, + alpha=alpha, + beta=beta, + input_reorder=input_reorder, + can_fuse_epilogue=True, + ) + # This will list only ops capable of EVT fusion + ops_evt = cutlass_template_evt.gen_ops() + for op in ops_evt: + cutlass_template_evt.maybe_append_choice( + choices, + op=op, + ) + else: + ops_evt = [] + log.debug( + "Added %d cutlass gemm configs and %d fuseable gemm configs.", + len(ops), + len(ops_evt), + ) + + def header(self) -> IndentedBuffer: + res = super().header() + res.splice( + """ + #include "cutlass/gemm/gemm.h" + #include "cutlass/gemm/device/gemm_universal.h" + #include "cutlass/gemm/device/gemm_universal_adapter.h" + #include "cutlass/gemm/kernel/gemm_universal.hpp" + #include "cutlass/gemm/collective/collective_builder.hpp" + #include "cutlass/epilogue/collective/collective_builder.hpp" + #include "cutlass/epilogue/collective/default_epilogue.hpp" + #include "cutlass/epilogue/thread/linear_combination.h" + #include "cutlass/gemm/dispatch_policy.hpp" + #include "cutlass/gemm/kernel/tile_scheduler.hpp" + #include "cutlass/util/distribution.h" + #include "cutlass/util/packed_stride.hpp" + #include "cutlass/util/tensor_view_io.h" + """ + ) + return res + + @staticmethod + def cutlass_layout(torch_layout) -> "Optional[cutlass_lib.LayoutType]": # type: ignore[name-defined] # noqa: F821 + assert cutlass_utils.try_import_cutlass() + import cutlass_library.library as cutlass_lib + + if torch_layout.stride[-1] == 1: + return cutlass_lib.LayoutType.RowMajor + elif torch_layout.stride[-2] == 1: + return cutlass_lib.LayoutType.ColumnMajor + else: + return None + + @staticmethod + def flip_cutlass_layout( + cutlass_layout: "cutlass_lib.LayoutType", # type: ignore[name-defined] # noqa: F821 + ) -> "cutlass_lib.LayoutType": # type: ignore[name-defined] # noqa: F821 + assert cutlass_utils.try_import_cutlass() + import cutlass_library.library as cutlass_lib + + if cutlass_layout == cutlass_lib.LayoutType.RowMajor: + return cutlass_lib.LayoutType.ColumnMajor + else: + return cutlass_lib.LayoutType.RowMajor + + @staticmethod + def layout_match(torch_layout, cutlass_layout) -> bool: + return CUTLASSGemmTemplate.cutlass_layout(torch_layout) == cutlass_layout + + @staticmethod + def set_alignment(torch_layout, op_element) -> bool: + alignment = cutlass_utils.get_max_alignment(torch_layout) + if alignment < op_element.alignment: + return False + else: + op_element.alignment = alignment + return True + + @staticmethod + def has_tma_epilogue(op) -> bool: + assert cutlass_utils.try_import_cutlass() + import cutlass_library.library as cutlass_lib + + result = False + if op.gemm_kind == 
cutlass_lib.GemmKind.Universal3x: + epilogue_schedule_str = str(op.epilogue_schedule).split(".")[-1] + result = epilogue_schedule_str.lower().startswith("tma") + return result + + @staticmethod + def supports_evt(op: "cutlass_library.gemm_op.GemmOperation") -> bool: # type: ignore[name-defined] # noqa: F821 + """ + returns True if the op is capable of flexible epilogue fusions + using epilogue visitor trees. + + See https://github.com/NVIDIA/cutlass/blob/e01b9b5029b7caca5a43c29f7d2714d7cf1dcae8/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu#L283-L285 # noqa: B950 + """ + assert cutlass_utils.try_import_cutlass() + import cutlass_library.library as cutlass_lib + + if op.gemm_kind != cutlass_lib.GemmKind.Universal3x: + return False + if op.epilogue_schedule not in ( + cutlass_lib.EpilogueScheduleType.TmaWarpSpecialized, + cutlass_lib.EpilogueScheduleType.TmaWarpSpecializedCooperative, + ): + return False + + return True + + def render_evt_epilogue_declaration( + self, + template_output_node_name: str, + evt_type_name: str, + epilogue_nodes: List[IRNode], + ) -> str: + """Generates the epilogue for the EVT epilogue fusion""" + return CutlassEVTEpilogueTypeFormatter.ir_to_evt_string( + template_output_node_name, evt_type_name, epilogue_nodes + ) + + def define_gemm_instance( + self, + op: "cutlass_library.gemm_op.GemmOperation", # type: ignore[name-defined] # noqa: F821 + output_buffer_name: str, + epilogue_nodes: Optional[List[IRNode]] = None, + ) -> Tuple[str, str]: + assert cutlass_utils.try_import_cutlass() + import cutlass_library.gemm_operation as cutlass_gemm_op + import cutlass_library.library as cutlass_lib + + from torch._inductor.codegen.cuda.cutlass_lib_extensions.gemm_operation_extensions import ( + EmitGemmUniversal3xInstanceWithEVT, + ) + + if op.gemm_kind == cutlass_lib.GemmKind.Universal3x: + if epilogue_nodes is not None and len(epilogue_nodes) > 0: + emitter = EmitGemmUniversal3xInstanceWithEVT() + op.epilogue_functor = lambda epilogue_functor_type_name: self.render_evt_epilogue_declaration( + output_buffer_name, epilogue_functor_type_name, epilogue_nodes + ) + else: + emitter = cutlass_gemm_op.EmitGemmUniversal3xInstance() + op_def = emitter.emit(op) + pattern = re.compile(r"\s*struct\s(.*?)\s:") + decl = [line for line in op_def.split("\n") if "struct " in line][-1] + else: + if epilogue_nodes is not None and len(epilogue_nodes) > 0: + raise RuntimeError( + "EVT epilogue fusion is not supported for Cutlass 2.x ops." + ) + emitter = cutlass_gemm_op.EmitGemmInstance() + op_def = emitter.emit(op) + op_def = op_def.replace( + "cutlass::gemm::device::Gemm", "cutlass::gemm::device::GemmUniversal" + ) + op_def = op_def.replace("false,", "") + pattern = re.compile(r"\s*using\s(.*?)\s=") + decl = op_def.split("\n")[2] + match = pattern.match(decl) + if match is None: + raise RuntimeError("Invalid Gemm config: \n" + op_def) + op_type = match.groups()[0] + if op.gemm_kind == cutlass_lib.GemmKind.Universal3x: + op_def += f"\n using {op_type}_device_type = cutlass::gemm::device::GemmUniversalAdapter<{op_type}>;\n" + op_type = f"{op_type}_device_type" + return op_def, op_type + + @staticmethod + def should_swap_XW( + bias: IRNode, + beta: float, + ) -> bool: + return True + + # TODO(ipiszy): Check whether it's necessary to swap X/W. 
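+    # Editor's note (illustrative, not part of the upstream file): the swap is
+    # mathematically safe because Y = X @ W can equally be computed as
+    # Y^T = W^T @ X^T with every operand viewed through transposed strides.
+    # swap_XW() below exchanges A/B and flips the operand layouts, while
+    # render_gemm_arguments() transposes the strides of X, W, Bias and Y and
+    # swaps M/N when should_swap_xw is set, so the TMA epilogue sees the bias
+    # in its preferred column-major form.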
+ # strides = bias.get_stride() + # if strides[-1] != 1: + # return True + # for stride in strides[:-1]: + # if stride != 0: + # return True + # return False + + @staticmethod + def swap_XW( + op: "cutlass_library.gemm_op.GemmOperation", # type: ignore[name-defined] # noqa: F821 + ) -> "cutlass_library.gemm_op.GemmOperation": # type: ignore[name-defined] # noqa: F821 + # Swap X and W in GemmOperation. + new_op = copy.deepcopy(op) + new_op.A.layout = CUTLASSGemmTemplate.flip_cutlass_layout(new_op.A.layout) + new_op.B.layout = CUTLASSGemmTemplate.flip_cutlass_layout(new_op.B.layout) + new_op.A, new_op.B = new_op.B, new_op.A + new_op.C.layout = CUTLASSGemmTemplate.flip_cutlass_layout(new_op.C.layout) + new_op.D.layout = CUTLASSGemmTemplate.flip_cutlass_layout(new_op.D.layout) + return new_op + + def filter_op( + self, + op: "cutlass_library.gemm_op.GemmOperation", # type: ignore[name-defined] # noqa: F821 + ) -> "cutlass_library.gemm_op.GemmOperation": # type: ignore[name-defined] # noqa: F821 + assert cutlass_utils.try_import_cutlass() + import cutlass_library.library as cutlass_lib + + # Skip simt kernels + if ( + op.tile_description.math_instruction.opcode_class + == cutlass_lib.OpcodeClass.Simt + ): + return None + + # Only keep GemmUniversal kernels + if op.gemm_kind not in { + cutlass_lib.GemmKind.Universal, + cutlass_lib.GemmKind.Universal3x, + }: + return None + # Filter ops by dtypes. + X = self.input_nodes[0] + W = self.input_nodes[1] + accumulator_torch_dtype = cutlass_utils.get_accumulator_dtype( + [X.get_dtype(), W.get_dtype()], + ) + if not ( + cutlass_utils.dtype_match(X.get_dtype(), op.A.element) + and cutlass_utils.dtype_match(W.get_dtype(), op.B.element) + and cutlass_utils.dtype_match( + self.output_node.get_layout().dtype, op.C.element + ) + and cutlass_utils.dtype_match( + accumulator_torch_dtype, op.accumulator_type() + ) + ): + return None + + # Filter ops by input layouts. + if not ( + self.layout_match(X.get_layout(), op.A.layout) + and self.layout_match(W.get_layout(), op.B.layout) + ): + return None + + # Update op. + op = copy.deepcopy(op) + + # Set output layout. + op.D.layout = CUTLASSGemmTemplate.cutlass_layout(self.output_node.get_layout()) + + # Filter ops by alignments and set alignments. + if not ( + self.set_alignment(X.get_layout(), op.A) + and self.set_alignment(W.get_layout(), op.B) + and self.set_alignment(self.output_node.get_layout(), op.D) + ): + return None + + # Set epilogue. + # TODO: update epilogue functor according to epilogues. + op.element_epilogue = op.accumulator_type() + + # Set bias layout and alignment. 
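+        # Editor's note (illustrative summary of the branch below): CUTLASS 2.x
+        # kernels require the bias layout to match the output layout, whereas
+        # 3.x (Universal3x) kernels accept an independent bias layout; when no
+        # bias input is present, 3.x ops set ElementC to void and 2.x ops reuse
+        # the output layout for C.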
+ if len(self.input_nodes) >= 3 and self.input_nodes[2] is not None: + Bias = self.input_nodes[2] + bias_layout = CUTLASSGemmTemplate.cutlass_layout(Bias.get_layout()) + if op.gemm_kind != cutlass_lib.GemmKind.Universal3x: + if bias_layout != op.D.layout: + # For cutlass2, bias and output layout must match + return None + else: + op.C.layout = bias_layout + if not self.set_alignment(Bias.get_layout(), op.C): + return None + else: + if op.gemm_kind == cutlass_lib.GemmKind.Universal3x: + op.C.element = cutlass_lib.DataType.void + else: + op.C.layout = op.D.layout + supports_evt: bool = self.supports_evt(op) + if (self.can_fuse_epilogue is not None) and ( + self.can_fuse_epilogue != supports_evt + ): + return None + if inductor_cuda_config.cutlass_only_evt_capable_ops and not supports_evt: + return None + return op + + def gen_ops(self) -> "List[cutlass_gemm_op.GemmOperation]": # type: ignore[name-defined] # noqa: F821 + assert cutlass_utils.try_import_cutlass() + import cutlass_library.gemm_operation as cutlass_gemm_op + import cutlass_library.library as cutlass_lib + + ops = cutlass_utils.gen_ops()[cutlass_lib.OperationKind.Gemm] + res: Dict[str, cutlass_gemm_op.GemmOperation] = dict() + num_3x_ops = 0 + num_2x_ops = 0 + for op_dict in ops.values(): + for op_list in op_dict.values(): + for op in op_list: + assert isinstance(op, cutlass_gemm_op.GemmOperation) + filter_res = self.filter_op(op) + if ( + filter_res is not None + and res.get(filter_res.configuration_name(), None) is None + ): + res[filter_res.configuration_name()] = filter_res + for op in res.values(): + if op.gemm_kind == cutlass_lib.GemmKind.Universal3x: + num_3x_ops += 1 + else: + num_2x_ops += 1 + log.debug( + "Got cutlass configs: total number of ops: %d, " + "total number of 3x ops: %d, total number of 2x ops: %d", + len(res), + num_3x_ops, + num_2x_ops, + ) + return list(res.values())[: inductor_cuda_config.cutlass_max_profiling_configs] + + def gemm_mode(self) -> str: + sizes = self.output_node.get_size() + if len(sizes) > 2: + return "cutlass::gemm::GemmUniversalMode::kBatched" + else: + return "cutlass::gemm::GemmUniversalMode::kGemm" + + def render_gemm_arguments( + self, + argument_template: str, + epilogue_template: str, + should_swap_xw: bool, + X: IRNode, + W: IRNode, + Bias: IRNode, + Y: IRNode, + alpha: float, + beta: float, + kernel: CUDATemplateKernel, + epilogue_args, + ) -> str: + options = dict( + alpha=self.alpha, + beta=self.beta, + X=X, + W=W, + Y=Y, + Bias=Bias, + template=self, + kernel=kernel, + M="M", + N="N", + epilogue_args=epilogue_args, + ) + + if epilogue_template is not None: + if should_swap_xw: + # Swap + def clone_with_transposed_stride(node: IRNode) -> IRNode: + old_layout = node.get_layout() + new_stride = list(old_layout.stride) + new_stride[-2], new_stride[-1] = new_stride[-1], new_stride[-2] + new_layout = FixedLayout( + old_layout.device, + old_layout.dtype, + list(old_layout.size), + new_stride, + old_layout.offset, + ) + return Buffer(node.get_name(), new_layout) + + new_X = clone_with_transposed_stride(X) + new_W = clone_with_transposed_stride(W) + new_Bias = clone_with_transposed_stride(Bias) + new_Y = clone_with_transposed_stride(Y) + options["X"], options["W"], options["Bias"], options["Y"] = ( + new_W, + new_X, + new_Bias, + new_Y, + ) + options["M"], options["N"] = "N", "M" + + epilogue_arguments = self._template_from_string(epilogue_template).render( + **options + ) + arguments = self._template_from_string(argument_template).render( + epilogue_arguments=epilogue_arguments, 
**options + ) + else: + arguments = self._template_from_string(GEMM_ARGS_CUTLASS_2X).render( + split_k=1, **options + ) + return arguments + + def render( # type: ignore[override] + self, + kernel: CUDATemplateKernel, + op: "cutlass_gemm_op.GemmOperation" = None, # type: ignore[name-defined] # noqa: F821 + template_buffer_node: Optional[CUDATemplateBuffer] = None, + epilogue_nodes: Optional[List[IRNode]] = None, + **kwargs, + ) -> str: + if epilogue_nodes is not None and len(epilogue_nodes) > 0: + assert self.can_fuse_epilogue and CUTLASSGemmTemplate.supports_evt( + op + ), "op does not support EVT epilogue fusion" + assert ( + template_buffer_node is not None + ), "Template node is required for epilogue fusion" + assert isinstance( + template_buffer_node, CUDATemplateBuffer + ), f"Template node has to be a CUDATemplateBuffer, is type {type(template_buffer_node)}" + assert ( + template_buffer_node.name is not None + ), "Output node has to be a Buffer with a name" + # This is the name of the output of the Matmul, before epilogues are applied. + # it is not necessarily materialized in global memory if we have an epilogue + + template_output_node_name = ( + template_buffer_node.name if template_buffer_node is not None else None + ) + + assert cutlass_utils.try_import_cutlass() + import cutlass_library.gemm_operation as cutlass_gemm_op + import cutlass_library.library as cutlass_lib + + assert isinstance( + op, cutlass_gemm_op.GemmOperation + ), "op argument is required and has to be an instance of GemmOperation" + if template_buffer_node is not None: + self.output_node = template_buffer_node + if epilogue_nodes is not None and len(epilogue_nodes) > 0: + self.output_node = cast(Buffer, epilogue_nodes[-1]) + + assert len(self.input_nodes) >= 2 and self.output_node is not None + X, W = self.input_nodes[0], self.input_nodes[1] + Y = self.output_node + Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] + + epilogue_template: Optional[str] = None + should_swap_xw: bool = False + epilogue_args = f"{{ElementComputeEpilogue({self.alpha}), ElementComputeEpilogue({self.beta})}}" + if op.gemm_kind == cutlass_lib.GemmKind.Universal3x: + if Bias is not None and self.has_tma_epilogue(op): + if self.should_swap_XW(Bias, self.beta): + # TMA epilogue requires bias vector in column major to get best perf. + op = self.swap_XW(op) + should_swap_xw = True + if epilogue_nodes is not None and len(epilogue_nodes) > 0: + epilogue_args = ( + CutlassEVTEpilogueArgumentFormatter.ir_to_evt_argument_string( + cast(str, template_output_node_name), epilogue_nodes + ) + ) + epilogue_template = GEMM_ARGS_CUTLASS_3X_EPILOGUE + argument_template = GEMM_ARGS_CUTLASS_3X + else: + # TODO: Support split_k. 
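+            # Editor's note (illustrative): for CUTLASS 2.x GemmUniversal ops the
+            # epilogue stays a plain linear combination, so epilogue_args keeps the
+            # "{alpha, beta}" initializer built above and the kernel arguments are
+            # rendered from GEMM_ARGS_CUTLASS_2X with split_k currently fixed to 1.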
+ argument_template = GEMM_ARGS_CUTLASS_2X + + instance_definition, instance_type = self.define_gemm_instance( + op, cast(str, template_output_node_name), epilogue_nodes + ) + options = dict( + alpha=self.alpha, + beta=self.beta, + X=X, + W=W, + Y=Y, + Bias=Bias, + epilogue_template=epilogue_template, + argument_template=argument_template, + should_swap_xw=should_swap_xw, + template=self, + kernel=kernel, + instance_definition=instance_definition, + instance_type=instance_type, + input_reorder=self.input_reorder, + epilogue_args=epilogue_args, + ) + res = self._template_from_string(GEMM_TEMPLATE).render(**options) + return res diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda_combined_scheduling.py b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda_combined_scheduling.py new file mode 100644 index 0000000000000000000000000000000000000000..ed88fc3a9d824779eb783ccddcdab6ffc557e5b9 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/codegen/cuda_combined_scheduling.py @@ -0,0 +1,75 @@ +from typing import List + +from ..scheduler import BaseSchedulerNode, BaseScheduling, Scheduler, SchedulerNode +from .cuda.cuda_cpp_scheduling import CUDACPPScheduling + +from .triton import TritonScheduling + + +class CUDACombinedScheduling(BaseScheduling): + """ + Scheduler for CUDA Kernels, which delegates calls as appropriate + to the CUDA-C++ and Triton Schedulers, which both work for CUDA devices + and use a unified-wrapper for codegen. + + If Scheduling code needs to be specialized for the case of mixed Triton / CUDA C++ code, + this would also be the place to do it. + """ + + def __init__(self, scheduler: Scheduler): + super().__init__() + self._scheduler = scheduler + self._triton_scheduling = TritonScheduling(scheduler) + self._cuda_cpp_scheduling = CUDACPPScheduling(scheduler) + + def choose_node_backend(self, node: BaseSchedulerNode) -> BaseScheduling: + if self._cuda_cpp_scheduling.is_cuda_cpp_template( + node + ) or self._cuda_cpp_scheduling.is_cuda_cpp_fused_template(node): + return self._cuda_cpp_scheduling + return self._triton_scheduling + + def can_fuse_vertical(self, node1: BaseSchedulerNode, node2: BaseSchedulerNode): + if self._cuda_cpp_scheduling.can_fuse_vertical(node1, node2): + return True + return self._triton_scheduling.can_fuse_vertical(node1, node2) + + def can_fuse_horizontal(self, node1: BaseSchedulerNode, node2: BaseSchedulerNode): + for node in (node1, node2): + if self._cuda_cpp_scheduling.is_cuda_cpp_template( + node + ) or self._cuda_cpp_scheduling.is_cuda_cpp_fused_template(node): + return self._cuda_cpp_scheduling.can_fuse_horizontal( + node1, node2 + ) # always False at the moment + return self._triton_scheduling.can_fuse_horizontal(node1, node2) + + def group_fn(self, sizes): + return self._triton_scheduling.group_fn(sizes) + + def codegen_template( + self, template_node: SchedulerNode, epilogue_nodes: List[SchedulerNode] + ): + if self._cuda_cpp_scheduling.is_cuda_cpp_template(template_node): + return self._cuda_cpp_scheduling.codegen_template( + template_node, epilogue_nodes + ) + else: + return self._triton_scheduling.codegen_template( + template_node, epilogue_nodes + ) + + def codegen_nodes(self, nodes: List[SchedulerNode]): + return self._triton_scheduling.codegen_nodes(nodes) + + def codegen_sync(self): + return self._triton_scheduling.codegen_sync() + + def flush(self): + return self._triton_scheduling.flush() + + def codegen_foreach(self, *args, **kwargs): + return self._triton_scheduling.codegen_foreach(*args, **kwargs) + + def 
benchmark_fused_nodes(self, nodes): + return self._triton_scheduling.benchmark_fused_nodes(nodes) diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/memory_planning.py b/MLPY/Lib/site-packages/torch/_inductor/codegen/memory_planning.py new file mode 100644 index 0000000000000000000000000000000000000000..6f921c527e99409679e714aafe6ec14758b31f7f --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/codegen/memory_planning.py @@ -0,0 +1,799 @@ +from __future__ import annotations + +import collections +import dataclasses +import itertools +import pprint +from typing import Any, Dict, Iterable, List, Optional, Protocol + +import sympy + +import torch +from .. import config, ir +from ..utils import cache_on_self, CachedMethod, IndentedBuffer +from ..virtualized import V + +from .wrapper import ( + AllocateLine, + FreeIfNotReusedLine, + MemoryPlanningLine, + NullLine, + ReuseLine, +) + + +ALIGN_BYTES = 64 +assert (ALIGN_BYTES & (ALIGN_BYTES - 1)) == 0 and ALIGN_BYTES >= 8, "must be power of 2" + + +def _align(nbytes): + """Round up to the nearest multiple of ALIGN_BYTES""" + return (nbytes + ALIGN_BYTES - 1) & -ALIGN_BYTES + + +def _is_aligned(v: sympy.Expr): + """v can be statically proven to be a multiple of ALIGN_BYTES""" + if isinstance(v, (sympy.Add, sympy.Max)): + return all(map(_is_aligned, v.args)) + return isinstance(v, align) or sympy.gcd(v, ALIGN_BYTES) == ALIGN_BYTES + + +class align(sympy.Function): + """Symbolically round up to the nearest multiple of ALIGN_BYTES""" + + nargs = (1,) + is_integer = True + + @classmethod + def eval(cls, value): + if isinstance(value, (int, sympy.Integer)): + return _align(int(value)) + if _is_aligned(value): + return value + + +@dataclasses.dataclass +class LiveRange: + """ + A range where a given tensor is live. Begin and end are both counters + representing points in the program of grouped memory operations. + Begin is inclusive, end is exclusive. + + Invariant: begin <= end + """ + + begin: float # int | ±inf + end: float # int | ±inf + + def contains(self, other: LiveRange): + """Is other entirely within self""" + return self.begin <= other.begin and other.end <= self.end + + def join(self, other: LiveRange): + """Combine two ranges using a union operation""" + return LiveRange(min(self.begin, other.begin), max(self.end, other.end)) + + def __len__(self): + return self.end - self.begin + + +class LiveRanges: + """ + A collection of LiveRange regions, allowing for non-contiguous + live regions. 
+ + Invariant: LiveRanges.ranges is in sorted order and non-overlapping + """ + + def __init__(self, ranges: Iterable[LiveRange]): + ranges = [*sorted(ranges, key=lambda x: x.begin)] + self.ranges = ranges[:1] + for r in ranges[1:]: + assert self.ranges[-1].begin <= r.begin + if self.ranges[-1].end >= r.begin: + self.ranges[-1] = LiveRange.join(self.ranges[-1], r) + else: + self.ranges.append(r) + + def overlaps(self, other: LiveRanges): + """Check if any pair of ranges in self and other overlap""" + left = collections.deque(self.ranges) + right = collections.deque(other.ranges) + while left and right: + if left[0].begin > right[0].begin: + left, right = right, left + assert left[0].begin <= right[0].begin + if left[0].end > right[0].begin: + return True + left.popleft() + return False + + @property + def begin(self): + return self.ranges[0].begin + + @property + def end(self): + return self.ranges[-1].end + + def __repr__(self): + return f"{self.__class__.__name__}([{', '.join(map(repr, self.ranges))}])" + + +class AllocationTreeNode: + """ + Abstract base class for nodes in allocation pool. + """ + + def allocate(self, block: Allocation, is_last: bool) -> bool: + """ + Try to assign block to a memory location in this bool. Return True if + an assignment was made. + """ + return False + + def get_live_ranges(self) -> LiveRanges: + """Aggregate LiveRanges for all objects below this in tree""" + raise NotImplementedError() + + def get_size_hint(self) -> int: + """Number of bytes used for example inputs""" + raise NotImplementedError() + + def get_symbolic_size(self) -> sympy.Expr: + """Number of bytes needed at runtime""" + raise NotImplementedError() + + def finalize(self, pool, offset) -> AllocationTreeNode: + """Called after all allocations have been made""" + return self + + def is_empty(self): + return False + + +@dataclasses.dataclass +class Allocation(AllocationTreeNode): + """ + Represents memory allocated to a given node in the allocation pool. + """ + + node: ir.Buffer + live_range: LiveRange + size_hint: int + symbolic_size: sympy.Expr + allocated: bool = False + pool: Optional[AllocationPool] = None + offset: Optional[sympy.Expr] = None + + @property + def device(self): + return self.node.get_device() + + def get_live_ranges(self): + return LiveRanges([self.live_range]) + + def get_size_hint(self): + return self.size_hint + + def get_symbolic_size(self): + return self.symbolic_size + + def mark_allocated(self): + assert not self.allocated + self.allocated = True + + def finalize(self, pool, offset): + assert self.pool is None and self.offset is None + self.pool = pool + self.offset = offset + return self + + def codegen_alloc_from_pool(self, wrapper): + assert self.pool + node = self.node + shape = tuple(node.get_size()) + stride = tuple(node.get_stride()) + return wrapper.codegen_alloc_from_pool( + self.pool.name, self.offset, node.get_dtype(), shape, stride + ) + + def __repr__(self): + return ( + f"{self.__class__.__name__}(" + f"node={self.node.get_name()}, " + f"live_range={self.live_range}, " + f"size_hint={self.size_hint}, " + f"symbolic_size={self.symbolic_size}, " + f"pool={self.pool.name if self.pool else None}, " + f"offset={self.offset})" + ) + + +@dataclasses.dataclass +class Empty(AllocationTreeNode): + """ + Placeholder to represent empty space in the allocation pool. + Only exists to get the size_hint correct in parent nodes. 
+ """ + + size_hint: int + + def get_live_ranges(self): + return LiveRanges([]) + + def get_size_hint(self): + return self.size_hint + + def get_symbolic_size(self): + return 0 + + def is_empty(self): + return True + + +class MemorySplitProtocol(Protocol): + get_live_ranges: CachedMethod[[], LiveRanges] + get_size_hint: CachedMethod[[], int] + get_symbolic_size: CachedMethod[[], sympy.Expr] + + def _allocate(self, block: Allocation, is_last: bool) -> bool: + ... + + +class ClearCacheOnAllocateMixin(MemorySplitProtocol): + """ + Helper to assist in caching get_live_ranges, get_size_hint, and + get_symbolic_size. + """ + + def allocate(self, block: Allocation, is_last: bool): + is_allocated = self._allocate(block, is_last) + if is_allocated: + self.clear_cache() + return is_allocated + + def clear_cache(self): + self.get_live_ranges.clear_cache(self) + self.get_size_hint.clear_cache(self) + self.get_symbolic_size.clear_cache(self) + + +@dataclasses.dataclass +class TemporalSplit(ClearCacheOnAllocateMixin, AllocationTreeNode): + """ + Contains a list of allocations not overlapping in LiveRanges. + + Invariant: no pair (a,b) in self.allocations will have: + a.get_live_ranges().overlaps(b.get_live_ranges()) + """ + + allocations: List[AllocationTreeNode] + + def _allocate(self, block: Allocation, is_last: bool): + slot_size = self.get_size_hint() + block_size = block.get_size_hint() + if not is_last and block_size > slot_size: + return False # doesn't fit + + block_live = block.get_live_ranges() + overlapping = [ + s for s in self.allocations if s.get_live_ranges().overlaps(block_live) + ] + if len(overlapping) > 1: + # TODO(jansel): we could try harder here by merging overlapping in space + return False + elif len(overlapping) == 1: + return overlapping[0].allocate(block, is_last) + else: + block.mark_allocated() + + if len(self.allocations) == 1 and isinstance(self.allocations[-1], Empty): + self.allocations.pop() + + if slot_size == block_size: + # perfect fit + self.allocations.append(block) + elif slot_size > block_size: + self.allocations.append( + SpatialSplit.create(block, slot_size - block_size) + ) + else: # grow this allocation + assert is_last + self.allocations = [ + *( + SpatialSplit.create(a, block_size - slot_size) + for a in self.allocations + ), + block, + ] + return True + + @cache_on_self + def get_live_ranges(self) -> LiveRanges: + return LiveRanges( + itertools.chain.from_iterable( + x.get_live_ranges().ranges for x in self.allocations + ) + ) + + @cache_on_self + def get_size_hint(self) -> int: + if not self.allocations: + return 0 + return max(x.get_size_hint() for x in self.allocations) + + @cache_on_self + def get_symbolic_size(self) -> sympy.Expr: + if not self.allocations: + return 0 # type: ignore[return-value] + return sympy.Max(*[x.get_symbolic_size() for x in self.allocations]) + + def is_empty(self): + return len(self.allocations) == 1 and self.allocations[0].is_empty() + + def finalize(self, pool, offset): + self.allocations = [block.finalize(pool, offset) for block in self.allocations] + self.clear_cache() + if len(self.allocations) == 1: + return self.allocations[0] + return self + + +@dataclasses.dataclass +class SpatialSplit(ClearCacheOnAllocateMixin, AllocationTreeNode): + """ + Contains two allocations, left and right, that do not overlap in space. + Right will be allocated immediately after left in memory. 
+ """ + + left: TemporalSplit + right: TemporalSplit + + @staticmethod + def create(left, extra_space): + assert isinstance(left, AllocationTreeNode) + assert isinstance(extra_space, int) and extra_space >= 1 + return SpatialSplit(TemporalSplit([left]), TemporalSplit([Empty(extra_space)])) + + def _allocate(self, block: Allocation, is_last: bool): + return self.left.allocate(block, False) or self.right.allocate(block, is_last) + + @cache_on_self + def get_live_ranges(self): + return LiveRanges( + itertools.chain( + self.left.get_live_ranges().ranges, self.right.get_live_ranges().ranges + ) + ) + + @cache_on_self + def get_size_hint(self) -> int: + return _align(self.left.get_size_hint()) + self.right.get_size_hint() + + @cache_on_self + def get_symbolic_size(self) -> sympy.Expr: + return align(self.left.get_symbolic_size()) + self.right.get_symbolic_size() + + def finalize(self, pool, offset): + self.left = self.left.finalize(pool, offset) + self.right = self.right.finalize( + pool, offset + align(self.left.get_symbolic_size()) + ) + self.clear_cache() + if self.right.is_empty(): + return self.left + return self + + +@dataclasses.dataclass +class AllocationPool: + """ + Represents a pool of allocations that will be generated by a single + call to torch.empty. + """ + + device: torch.device + root: TemporalSplit + can_expand: bool = True + restrict_live_range: Optional[LiveRange] = None + name: Optional[str] = None + names_to_del: List[str] = dataclasses.field(default_factory=list) + creation_cache: Dict[str, str] = dataclasses.field(default_factory=dict) + + def allocate(self, block: Allocation, is_last: bool): + if self.restrict_live_range and not self.restrict_live_range.contains( + block.live_range + ): + return False + + is_last = self.can_expand and is_last + if self.root.allocate(block, is_last): + return True + + if is_last: + return self.allocate_at_end(block) + + return False + + def allocate_at_end(self, block): + block.mark_allocated() + self.root = TemporalSplit([SpatialSplit(self.root, TemporalSplit([block]))]) + return True + + def finalize(self, name): + assert not self.name + self.name = name + self.names_to_del.append(name) + self.root.finalize(self, 0) + + def codegen_create(self, wrapper, code: IndentedBuffer): + assert self.name + nbytes = self.root.get_symbolic_size() + for block in self.root.allocations: + if isinstance(block, Allocation) and nbytes == block.get_symbolic_size(): + # optimization: fuse first allocation and pool creation + node = block.node + code.writeline( + wrapper.make_allocation( + self.name, + device=self.device, + dtype=node.get_dtype(), + shape=tuple(node.get_size()), + stride=tuple(node.get_stride()), + ) + ) + self.creation_cache[block.codegen_alloc_from_pool(wrapper)] = self.name + return + else: + code.writeline( + wrapper.make_allocation( + self.name, + device=self.device, + dtype=torch.uint8, + shape=(nbytes,), + stride=(1,), + ) + ) + + def codegen_destroy(self, wrapper, code: IndentedBuffer): + code.writeline(wrapper.make_free_by_names(self.names_to_del)) + + def __eq__(self, other): + return self is other + + def __hash__(self): + return id(self) + + +@dataclasses.dataclass +class AllocationPools: + """ + Collection of many AllocationPool objects grouped by device. 
+ """ + + device_to_pools: Dict[torch.device, List[AllocationPool]] = dataclasses.field( + default_factory=dict + ) + + def get_pools(self, block): + if block.device not in self.device_to_pools: + self.device_to_pools[block.device] = [] + return self.device_to_pools[block.device] + + def allocate(self, block: Allocation): + pools = self.get_pools(block) + + for pool in pools: + if pool.allocate(block, is_last=pool is pools[-1]): + return + + # everything is full, make a new pool + pools.append( + AllocationPool( + block.device, + TemporalSplit([block]), + can_expand=config.memory_pool != "none", + ) + ) + block.mark_allocated() + + def allocate_output(self, block: Allocation): + """Outputs get different pools so memory gets freed properly""" + pools = self.get_pools(block) + if pools and config.memory_pool in ("outputs", "combined"): + pools[-1].allocate_at_end(block) + else: + # create a new pool + block.mark_allocated() + pools.append( + AllocationPool( + block.device, + TemporalSplit([block]), + can_expand=config.memory_pool == "combined", + ) + ) + + def finalize(self): + """Called at the end of allocation process""" + for i, pool in enumerate( + itertools.chain.from_iterable(self.device_to_pools.values()) + ): + pool.finalize(f"pool{i}") + + def pprint(self): + for pool in itertools.chain.from_iterable(self.device_to_pools.values()): + print() + print(pool.name) + print(pool.root.get_live_ranges()) + pprint.pprint(pool.root) + + +class BufferGroup: + """ + Due to inplace reuse an allocated buffer can have many names. + This tracks these collections of buffers sharing underlying memory. + """ + + def __init__(self, node: ir.Buffer): + self.node = node + self.names = [node.get_name()] + self.is_output = False + self.allocation: Optional[Allocation] = None + self.live_range = LiveRange(float("inf"), -float("inf")) + + def update_usage(self, timestep: int): + """Expand self.live_range to include timestep""" + self.live_range = LiveRange( + min(timestep, self.live_range.begin), + max(timestep, self.live_range.end), + ) + + def sym_nbytes(self): + return self.node.get_layout().storage_size() * self.node.get_dtype().itemsize + + def make_allocation(self): + assert not self.allocation, "multiple allocations" + assert isinstance(self.live_range.begin, int), "live ranges not computed" + nbytes = self.sym_nbytes() + # For now, fallback value will be used if we encounter an unbacked SymInt. The longer-term plan is to have + # size_hint() use better heuristics for unbackeds, at which point the fallback value will be ignored. 
+ size_hint = V.graph.sizevars.size_hint(nbytes, fallback=64) + self.allocation = Allocation( + self.node, + self.live_range, + size_hint=size_hint, + symbolic_size=nbytes, + ) + + def __repr__(self): + return ( + f"{self.__class__.__name__}({self.names!r}, is_output={self.is_output}, " + f"live_range={self.live_range}" + ) + + +@dataclasses.dataclass +class PoolMemoryPlanningLine(MemoryPlanningLine): + """Abstract base class for {Alloc,Dealloc}FromPoolLine""" + + group: BufferGroup + timestep: Optional[int] = None + + @property + def node(self): + return self.group.node + + +@dataclasses.dataclass +class AllocFromPoolLine(PoolMemoryPlanningLine): + """Similar to AllocationLine, but takes memory from a pool""" + + is_first_pool_usage: bool = False + + def codegen(self, code: IndentedBuffer): + allocation = self.group.allocation + assert allocation and allocation.pool + pool = allocation.pool + name = self.node.get_name() + + if self.is_first_pool_usage: + pool.codegen_create(self.wrapper, code) + + pool.names_to_del.extend(self.group.names) + alloc_from_pool = allocation.codegen_alloc_from_pool(self.wrapper) + if alloc_from_pool in pool.creation_cache: + code.writeline( + self.wrapper.make_tensor_alias( + name, pool.creation_cache[alloc_from_pool], "alloc" + ) + ) + else: + pool.creation_cache[alloc_from_pool] = name + code.writeline( + f"{self.wrapper.declare}{name} = {alloc_from_pool}{self.wrapper.ending}" + ) + + +@dataclasses.dataclass +class DeallocFromPoolLine(PoolMemoryPlanningLine): + """Similar to FreeIfNotReusedLine, but takes memory from a pool""" + + is_last_pool_usage: bool = False + + def codegen(self, code: IndentedBuffer): + if self.is_last_pool_usage: + assert self.group.allocation and self.group.allocation.pool + self.group.allocation.pool.codegen_destroy(self.wrapper, code) + + +@dataclasses.dataclass +class MemoryPlanner: + """ + Coordination object to run memory planning passes during wrapper + codegen. + """ + + wrapper: Any + pools: AllocationPools = dataclasses.field(default_factory=AllocationPools) + buffer_groups: Optional[List[BufferGroup]] = None + + def plan(self, lines: List[Any]) -> List[Any]: + """Call all the memory planning passes in sequence""" + lines = [*lines] + self.drop_removed_buffers(lines) + self.convert_to_pool_lines(lines) + self.compute_live_ranges(lines) + self.allocate_groups() + self.mark_first_last_usage(lines) + return lines + + def drop_removed_buffers(self, lines): + """ + Replace any memory planning lines in V.graph.removed_buffers with NullLine + """ + # drop any removed buffers + for i, line in enumerate(lines): + if isinstance(line, (AllocateLine, FreeIfNotReusedLine, ReuseLine)): + if line.node.get_name() in V.graph.removed_buffers: + lines[i] = NullLine(self.wrapper) + + def compute_buffer_groups(self, lines): + """ + Populates self.buffer_groups with BufferGroup objects that join + allocations with common storage (due to inplace reuse) into a + single object. 
+ """ + name_to_group = {} + for line in lines: + if isinstance(line, AllocateLine): + name = line.node.get_name() + assert name not in name_to_group + name_to_group[name] = BufferGroup(line.node) + elif isinstance(line, ReuseLine): + old_name = line.node.get_name() + new_name = line.reused_as.get_name() + assert new_name not in name_to_group + # TODO(jansel): we should support reusing buffers created via ExternKernelAlloc + if old_name in name_to_group: + name_to_group[old_name].names.append(new_name) + name_to_group[new_name] = name_to_group[old_name] + + outputs = set(V.graph.get_output_names()) + unique_groups = [*{id(g): g for g in name_to_group.values()}.values()] + for group in unique_groups: + group.is_output = any(x in outputs for x in group.names) + + assert self.buffer_groups is None + self.buffer_groups = unique_groups + return name_to_group + + def convert_to_pool_lines(self, lines): + """ + Convert AllocateLine/FreeIfNotReusedLine/ReuseLine into their + pool-based counterparts. + """ + name_to_group = self.compute_buffer_groups(lines) + for i, line in enumerate(lines): + if isinstance(line, AllocateLine): + if line.node.get_name() in name_to_group: + lines[i] = AllocFromPoolLine( + self.wrapper, name_to_group[line.node.get_name()] + ) + elif isinstance(line, FreeIfNotReusedLine): + assert not line.is_reused + if line.node.get_name() in name_to_group: + lines[i] = DeallocFromPoolLine( + self.wrapper, name_to_group[line.node.get_name()] + ) + elif isinstance(line, ReuseLine): + if line.node.get_name() in name_to_group: + line.delete_old = False + + def compute_live_ranges(self, lines): + """Populate every BufferGroup.live_ranges field based on first/last usage""" + timestep = 0 + worklist = collections.deque(lines) + while worklist: + if isinstance(worklist[0], MemoryPlanningLine): + timestep += 1 + while worklist and isinstance(worklist[0], MemoryPlanningLine): + line = worklist.popleft() + if isinstance(line, PoolMemoryPlanningLine): + line.group.update_usage(timestep) + line.timestep = timestep + else: + worklist.popleft() + + timestep += 1 + assert self.buffer_groups is not None + for group in self.buffer_groups: + if group.is_output: + group.update_usage(timestep) + + def allocate_groups(self): + """ + Assign every allocation to a specific location in a specific AllocationPool. + """ + assert config.memory_pool in ("none", "intermediates", "outputs", "combined") + assert self.buffer_groups is not None + + for group in self.buffer_groups: + group.make_allocation() + + outputs: List[Allocation] = [] + intermediates: List[Allocation] = [] + for group in self.buffer_groups: + assert group.allocation + if group.is_output and config.memory_pool != "combined": + outputs.append(group.allocation) + else: + intermediates.append(group.allocation) + + for block in sorted( + outputs, + key=lambda x: ( + x.size_hint, + -len(x.live_range), + ), + ): + self.pools.allocate_output(block) + + for block in sorted( + intermediates, + key=lambda x: ( + -x.size_hint, + -len(x.live_range), + ), + ): + self.pools.allocate(block) + + self.pools.finalize() + + def mark_first_last_usage(self, lines): + """ + Populate the AllocFromPoolLine.is_first_pool_usage and + DeallocFromPoolLine.is_last_pool_usage fields so that pools + are created/destroyed. 
+ """ + seen = set() + for line in lines: + if isinstance(line, AllocFromPoolLine): + assert line.group.allocation + pool = line.group.allocation.pool + assert pool is not None + if pool not in seen: + line.is_first_pool_usage = True + seen.add(pool) + + seen = set() + for line in reversed(lines): + if isinstance(line, DeallocFromPoolLine): + assert line.group.allocation + pool = line.group.allocation.pool + assert pool is not None + if pool not in seen: + line.is_last_pool_usage = ( + pool.root.get_live_ranges().end <= line.timestep + ) + seen.add(pool) diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/multi_kernel.py b/MLPY/Lib/site-packages/torch/_inductor/codegen/multi_kernel.py new file mode 100644 index 0000000000000000000000000000000000000000..d788076470203ff068b5b5557dc921f44a4781b8 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/codegen/multi_kernel.py @@ -0,0 +1,413 @@ +import logging +import os +from typing import Any, List + +from torch._inductor.metrics import get_metric_table, is_metric_table_enabled + +from .. import config +from ..codecache import PyCodeCache, TritonFuture +from ..utils import cache_on_self, do_bench +from ..virtualized import V +from .common import TensorArg + +log = logging.getLogger(__name__) + + +def get_kernel_argdefs(kernel): + arg_defs, _, _ = kernel.args.python_argdefs() + return arg_defs + + +def _get_all_args(args_list): + all_args = max(args_list, key=len)[:] + for args in args_list: + assert set(args).issubset(set(all_args)), f"{args} v.s. {all_args}" + + return all_args + + +def get_all_kernel_argdefs(kernels): + """ + The logic here must match with `get_all_call_args`. + """ + argdefs_list = [get_kernel_argdefs(kernel) for kernel in kernels] + + return _get_all_args(argdefs_list) + + +def get_all_call_args(call_args_list): + """ + Passed in the call_args for each subkernel and return the call_args for the + combined multi-kernel. + + Note an algorithm as follows does not always work: + ``` + all_call_args: Dict[ + Any, None + ] = {} # use a dict rather than set to maintain insertion order + for call_args in call_args_list: + all_call_args.update({arg: None for arg in call_args}) + + all_call_args = list(all_call_args.keys()) + ``` + It will fail if any kernel has the same argument passed in multiple times. + Check test_pass_same_arg_multi_times in test_multi_kernel.py + + Instead, we pick the longest call args and assert that otehr call args are + a subset of it. + """ + return _get_all_args(call_args_list) + + +def get_numel_argdefs(kernel): + numel_argdefs = [] + for tree in kernel.range_trees: + if tree.prefix != "r" or kernel.inside_reduction: + numel_argdefs.append(f"{tree.prefix}numel") + + return numel_argdefs + + +class MultiKernelState: + """ + Maintain state of multi-kernel compilation so we don't define duplicated + multi-kernel for the same set of sub-kernels. + + V.graph.wrapper_code has a reference to MultiKernelState instance. + """ + + def __init__(self): + self.subkernel_to_kernel_name = {} + + def define_kernel(self, kernels): + """ + Previously we name the multi kernel as "multi_kernel_{kernel_names[0]}". + This has some minor issue. + + E.g. for persistent reduction https://gist.github.com/shunting314/39e7c00ff8bb2055942ed5a3255d61ca , + there are 2 flavors of non-persistent reduction: + https://gist.github.com/shunting314/056d43d35907e87efb883970b35c17d4 + and + https://gist.github.com/shunting314/02ee753b65c513c54e695626afe682bd + + The only different is cache eviction policy. 
+ + We should name the multi-kernel differently in these 2 cases. + """ + kernel_names = tuple(k.kernel_name for k in kernels) + if kernel_names in self.subkernel_to_kernel_name: + return self.subkernel_to_kernel_name[kernel_names] + + # name the multi kernel based on the first kernel + multi_kernel_name = f"multi_kernel_{len(self.subkernel_to_kernel_name)}" + self.subkernel_to_kernel_name[kernel_names] = multi_kernel_name + + if V.graph.cpp_wrapper: + # we should not generate any python code for multi-kernel during + # the second pass of cpp-wrapper. + return multi_kernel_name + + wrapper = V.graph.wrapper_code + + kernel_call_def_code = "\n".join( + [ + f""" + def call{idx}(need_clone_args=False): + args = [{', '.join(get_kernel_argdefs(kernels[idx]))}] + if need_clone_args: + args, _ = multi_kernel_call.kernels[{idx}].clone_args(*args) + multi_kernel_call.kernels[{idx}].run(*args, {', '.join(get_numel_argdefs(kernels[idx]))}, grid=grid, stream=stream) + """.format( + idx + ).strip( + "\n" + ) + for idx in range(len(kernels)) + ] + ) + + # add subkernel src code hashes to the multi-kernel source code so changing a + # subkernel implementation will result in a differnt py file for + # multi-kernel. This makes cache implementation straightforward since + # we can decide cache file name based on multi-kernel py file name + # directly. + # + # Without the hash added for subkernels, the cache file may be shared by + # different subkernels which is incorrect. + subkernel_hashes = "\n".join( + f"# subkernel{i} code hash: {kernel.code_hash}" + for i, kernel in enumerate(kernels) + ) + + src_code = f""" +{subkernel_hashes} +def run(multi_kernel_call, {', '.join(get_all_kernel_argdefs(kernels))}, {', '.join(get_numel_argdefs(kernels[0]))}, grid, stream): +{kernel_call_def_code} + multi_kernel_call.run_with_argless_kernels([call0, call1]) + """ # noqa: B950 line too long + wrapper.header.splice( + f""" + {multi_kernel_name} = async_compile.multi_kernel({multi_kernel_name!r}, [ + {", ".join(kernel_names)}, + ], + ''' + """ + ) + wrapper.header.splice(src_code) + wrapper.header.splice( + """ + ''' + ) + """ + ) + + return multi_kernel_name + + +class MultiKernel: + """ + This class maintains the compile time state for multi kernels. + + Assume we do codegen for a MultiKernel encapsulating kernel1 and kernel2. + The generated definition for the multi-kernel will looks like: + ``` + multi_kernel_kernel1 = MultiKernelCall([kernel1, kernel2], multi_kernel_definition_code) + ``` + + Here is an concrete example: https://gist.github.com/shunting314/d9f3fb6bc6cee3dbae005825ca196d39 + """ + + def __init__(self, kernels): + assert len(kernels) >= 2 + + self.kernels = kernels + self.kernel_name = V.graph.wrapper_code.multi_kernel_state.define_kernel( + kernels + ) + + # need this since some code in inductor check if the kernel object has an args + # attribute to decide if it's a non-null kernel. + self.args = object() + + def call_kernel(self, kernel_name): + """ + Collect the union of arguments from all subkernels as the arguments + for the multi-kernel. 
+ """ + assert kernel_name == self.kernel_name + call_args_list = [kernel.get_call_args() for kernel in self.kernels] + + all_call_args = get_all_call_args(call_args_list) + grid: List[Any] = [] + + if V.graph.cpp_wrapper: + # for the second pass of cpp-wrapper codegen, we should call + # the fast kernel directly + picked_kernel = MultiKernelCall.lookup_choice(kernel_name) + kernel_name = self.kernels[picked_kernel].kernel_name + final_call_args = call_args_list[picked_kernel] + else: + final_call_args = all_call_args + + # numels for all subkernels should be the same. Use kernels[0] here + self.kernels[0].add_numel_to_call_args_and_grid( + kernel_name, final_call_args, grid + ) + + grid = V.graph.wrapper_code.generate_default_grid(kernel_name, grid) + + V.graph.wrapper_code.generate_kernel_call( + kernel_name, + final_call_args, + grid, + V.graph.scheduler.current_device.index, + ) + + def codegen_nan_check(self): + wrapper = V.graph.wrapper_code + seen = set() + for k in self.kernels: + _, call_args, arg_types = k.args.python_argdefs() + for arg, arg_type in zip(call_args, arg_types): + if arg in seen: + continue + seen.add(arg) + if isinstance(arg_type, TensorArg): + line = f"assert not {arg}.isnan().any().item()" + wrapper.writeline(line) + line = f"assert not {arg}.isinf().any().item()" + wrapper.writeline(line) + + @property + def removed_buffers(self): + return set.intersection(*[k.removed_buffers for k in self.kernels]) + + @property + def inplaced_to_remove(self): + return set.intersection(*[k.inplaced_to_remove for k in self.kernels]) + + @property + @cache_on_self + def inplace_update_buffers(self): + """ + Make sure all kernels have the same inplace update mappings. + """ + for k in self.kernels[1:]: + assert k.inplace_update_buffers == self.kernels[0].inplace_update_buffers + return self.kernels[0].inplace_update_buffers + + def warn_mix_layout(self, kernel_name: str): + pass + + +class MultiKernelCall: + """ + This class is called at run time to actually run the kernel + """ + + def __init__(self, multi_kernel_name, kernels, src_code): + assert len(kernels) >= 2 + self._kernels = kernels + self.multi_kernel_name = multi_kernel_name + + self._run = PyCodeCache.load(src_code).run + self.disable_cache = os.environ.get( + "TORCHINDUCTOR_DISABLE_MULTI_KERNEL_CACHE" + ) == "1" or is_metric_table_enabled("persistent_red_perf") + + self.picked_kernel = None + if config.triton.multi_kernel > 1: + # manually force a subkernel to ease perf testing + picked_by_config = config.triton.multi_kernel - 2 + assert picked_by_config < len(self._kernels) + self.picked_kernel = picked_by_config + elif not self.disable_cache: + self.load_cache() + + self._recorded = False + + def cache_file_path(self): + py_file_path = self._run.__globals__["__file__"] + return os.path.splitext(py_file_path)[0] + ".picked_kernel" + + def load_cache(self): + assert self.picked_kernel is None + path = self.cache_file_path() + if os.path.exists(path): + with open(path) as fd: + self.picked_kernel = int(fd.read()) + assert self.picked_kernel >= 0 and self.picked_kernel < len( + self._kernels + ) + log.debug( + "Load picked kernel %d from cache file %s", self.picked_kernel, path + ) + + def store_cache(self): + assert self.picked_kernel is not None + path = self.cache_file_path() + with open(path, "w") as fd: + fd.write(str(self.picked_kernel)) + log.debug("Store picked kernel %d to cache file %s", self.picked_kernel, path) + + @property + def kernels(self): + """ + Read results from future. 
+
+        This should be called after parallel compilation is done.
+        If you call this before compilation is done,
+        it may slow down the parallel compilation.
+        """
+        for i, kernel in enumerate(self._kernels):
+            if isinstance(kernel, TritonFuture):
+                self._kernels[i] = kernel.result()
+
+        return self._kernels
+
+    def run(self, *args, **kwargs):
+        self._run(self, *args, **kwargs)
+
+    @staticmethod
+    def benchmark_sub_kernels(kernel_calls):
+        """
+        Benchmark all the sub-kernels and return the execution time
+        (in milliseconds) for each of them.
+
+        Unit tests may mock this method to force a specific kernel to
+        be picked.
+        """
+        return [
+            do_bench(lambda: kernel_call(True), rep=40, fast_flush=True)
+            for kernel_call in kernel_calls
+        ]
+
+    # record_choice and lookup_choice are helper functions for cpp-wrapper
+    # codegen. The first pass uses record_choice to store the choice and
+    # the second pass looks it up by calling lookup_choice.
+    #
+    # An alternative that reuses the multi-kernel cache does not work well,
+    # since during codegen of the second pass it's very hard to know the
+    # path to the cache file. Also, reading the cache file requires some IO,
+    # which can be slower.
+    @staticmethod
+    def record_choice(multi_kernel_name, choice):
+        """
+        Record the multi-kernel choice during the first pass of cpp-wrapper
+        codegen so the second pass can look it up.
+
+        This does nothing if it is not called during codegen.
+        """
+        from torch._inductor.graph import GraphLowering
+
+        if not isinstance(V.graph, GraphLowering):
+            return
+
+        if not V.graph.record_multi_kernel_choice:
+            return
+
+        V.graph.multi_kernel_to_choice[multi_kernel_name] = choice
+
+    @staticmethod
+    def lookup_choice(multi_kernel_name):
+        # this should always be done during cpp-wrapper codegen
+        assert V.graph.record_multi_kernel_choice
+        # there should be no miss
+        return V.graph.multi_kernel_to_choice[multi_kernel_name]
+
+    def run_with_argless_kernels(self, kernel_calls):
+        if self.picked_kernel is None:
+            timings = self.benchmark_sub_kernels(kernel_calls)
+            self.picked_kernel = timings.index(min(timings))
+            k0 = self.kernels[0]
+            log.debug(
+                "pick %dth sub-kernel in %s. Size hints %s. Reduction hint %s.
Timings %s", + self.picked_kernel, + [k.inductor_meta.get("kernel_name") for k in self.kernels], + k0.size_hints, + k0.inductor_meta.get("reduction_hint"), + timings, + ) + + def get_kernel_path(k): + return k.fn.fn.__code__.co_filename + + get_metric_table("persistent_red_perf").add_row( + lambda: { + "kernel1_name": get_kernel_path(self.kernels[0]), + "kernel2_name": get_kernel_path(self.kernels[1]), + "kernel1_latency": timings[0], + "kernel2_latency": timings[1], + "size_hints": k0.size_hints, + "reduction_hint": k0.inductor_meta.get("reduction_hint"), + "speedup": timings[1] / timings[0], + } + ) + + if not self.disable_cache: + self.store_cache() + + if not self._recorded: + self._recorded = True + self.record_choice(self.multi_kernel_name, self.picked_kernel) + kernel_calls[self.picked_kernel]() diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/triton.py b/MLPY/Lib/site-packages/torch/_inductor/codegen/triton.py new file mode 100644 index 0000000000000000000000000000000000000000..87f61b0710f81fdb64bd80cc0da8b9fbfb77c530 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/codegen/triton.py @@ -0,0 +1,3931 @@ +from __future__ import annotations + +import collections +import contextlib +import dataclasses +import functools +import itertools +import logging +import math +import operator +import os +import textwrap +from functools import lru_cache +from typing import ( + Any, + Callable, + cast, + Counter, + DefaultDict, + Dict, + Iterable, + List, + Optional, + Set, + Tuple, + Union, +) + +import sympy + +import torch +import torch._logging + +from torch._inductor.metrics import is_metric_table_enabled, log_kernel_metadata +from torch._prims_common import is_integer_dtype +from torch.utils._sympy.functions import FloorDiv, ModularIndexing +from torch.utils._sympy.value_ranges import ValueRanges +from torch.utils._triton import has_triton_package + +from ..._dynamo.utils import counters +from .. import config, ir, scheduler +from ..codecache import code_hash, get_path, PyCodeCache +from ..dependencies import Dep, MemoryDep, StarDep, WeakDep +from ..ir import IRNode, ReductionHint, TritonTemplateBuffer +from ..optimize_indexing import indexing_dtype_strength_reduction +from ..scheduler import BaseSchedulerNode, BaseScheduling, WhyNoFuse +from ..triton_heuristics import AutotuneHint +from ..utils import ( + cache_on_self, + do_bench, + get_dtype_size, + get_fused_kernel_name, + get_kernel_metadata, + get_max_y_grid, + green_text, + is_welford_reduction, + next_power_of_2, + Placeholder, + sympy_dot, + sympy_index_symbol, + sympy_product, + sympy_subs, + unique, + yellow_text, +) +from ..virtualized import _ops as ops, OpsHandler, ReductionType, StoreMode, V +from ..wrapper_benchmark import get_kernel_category_by_source_code +from .common import ( + CSE, + CSEVariable, + DeferredLine, + free_symbol_startswith, + IndentedBuffer, + index_prevent_reordering, + Kernel, + OpOverrides, + PythonPrinter, + SizeArg, + TensorArg, +) +from .multi_kernel import MultiKernel +from .triton_utils import config_of, signature_of, signature_to_meta + +log = logging.getLogger(__name__) +perf_hint_log = torch._logging.getArtifactLogger(__name__, "perf_hints") +schedule_log = torch._logging.getArtifactLogger(__name__, "schedule") +fusion_log = torch._logging.getArtifactLogger(__name__, "fusion") + + +@lru_cache(None) +def gen_attr_descriptor_import(): + """ + import AttrsDescriptor if the triton version is new enough to have this + class defined. 
+ """ + if not has_triton_package(): + return "" + + import triton.compiler.compiler + + if hasattr(triton.compiler.compiler, "AttrsDescriptor"): + return "from triton.compiler.compiler import AttrsDescriptor" + else: + return "" + + +@lru_cache(None) +def gen_common_triton_imports(): + imports = IndentedBuffer() + imports.splice( + """ + import triton + import triton.language as tl + """ + ) + if attr_desc := gen_attr_descriptor_import(): + imports.writeline(attr_desc) + + imports.splice( + """ + from torch._inductor import triton_helpers, triton_heuristics + from torch._inductor.ir import ReductionHint, TileHint + from torch._inductor.triton_helpers import libdevice, math as tl_math + from torch._inductor.triton_heuristics import AutotuneHint + from torch._inductor.utils import instance_descriptor + """ + ) + return imports.getvalue() + + +@dataclasses.dataclass +class IndexingOptions: + index_str: str + mask_vars: Set[sympy.Symbol] + mask_str: str + expand_str: Optional[str] + _has_rindex: bool + + def has_mask(self): + return bool(self.mask_vars) + + def has_rindex(self): + return self._has_rindex + + def has_tmpmask(self): + return "tmp" in self.mask_str + + def has_rmask(self): + return "rmask" in self.mask_str + + +@dataclasses.dataclass +class BlockPtrOptions: + constant_offset: sympy.Expr + shape: List[sympy.Expr] + strides: List[sympy.Expr] + block_shape: List[str] + order: List[int] + offsets: List[str] + mask_vars: Set[sympy.Symbol] + reshape_suffix: List[str] + + @staticmethod + def create( + strides: List[sympy.Expr], + constant_offset: sympy.Expr, + range_trees: List[IterationRangesEntry], + mask_vars: Set[sympy.Symbol], + ) -> BlockPtrOptions: + """Helper to create a BlockPtrOptions instance""" + block_shape = [f"{t.prefix.upper()}BLOCK" for t in range_trees] + reshape_suffix = [*block_shape] + + broadcasting_dim = [s == 0 for s in strides] + for i, is_broadcasting in enumerate(broadcasting_dim): + if is_broadcasting: + # drop any stride==0 dimensions for performance + reshape_suffix[i] = "1" + + if V.kernel.no_x_dim: + assert range_trees[0].prefix == "x" + reshape_suffix.pop(0) + + if ( + not V.kernel.inside_reduction + and len(strides) == len(V.kernel.numels) - 1 + and V.kernel.numels[-1] != 1 + ): + # Need to expand rank by 1 to match rank when self.inside_reduction=True + reshape_suffix.append("1") + + def filter(it): + """Removes any broadcasting dims from a given sequence""" + assert len(it) == len(broadcasting_dim) + return [ + item + for item, is_broadcasting in zip(it, broadcasting_dim) + if not is_broadcasting + ] + + return BlockPtrOptions( + constant_offset=V.graph.sizevars.lookup_precomputed_size(constant_offset), + shape=[ + V.graph.sizevars.lookup_precomputed_size(t.numel) + for t in filter(range_trees) + ], + strides=[*map(V.graph.sizevars.lookup_precomputed_size, filter(strides))], + block_shape=filter(block_shape), + order=V.graph.sizevars.guarded_order(filter(strides)), + offsets=filter([f"{t.prefix}offset" for t in range_trees]), + mask_vars=mask_vars, + reshape_suffix=reshape_suffix, + ) + + def format(self, name: str, roffset=True) -> str: + """ + Codegen a call to tl.make_block_ptr() + + Args: + name: variable name for pointer + roffset: should roffset be included in offsets=..., for use with tl.advance() + + Returns: + "tl.make_block_ptr(...)" + """ + f = V.kernel.index_to_str + offsets = [*self.offsets] + if not roffset: + offsets[offsets.index("roffset")] = "0" + args = [ + f"{name} + ({f(self.constant_offset)})" + if self.constant_offset != 0 + 
else name, + f"shape={f(self.shape)}", + f"strides={f(self.strides)}", + f"block_shape={f(self.block_shape)}", + f"order={f(self.order)}", + f"offsets={f(offsets)}", + ] + return f"tl.make_block_ptr({', '.join(args)})" + + @cache_on_self + def boundary_check(self) -> List[int]: + """List of indices to pass to tl.load(boundary_check=...)""" + check = [] + for i in range(len(self.shape)): + if ( + self.block_shape[i] != "1" + and not V.graph.sizevars.statically_known_equals(self.strides[i], 0) # type: ignore[arg-type] + and not V.graph.sizevars.statically_known_multiple_of( + self.shape[i], + config.triton.max_block[self.block_shape[i][0]], # type: ignore[arg-type] + ) + and not (V.kernel.no_x_dim and self.block_shape[i] == "XBLOCK") + ): + check.append(i) + return check + + def advance_roffset(self): + """Codegen string to pass to tl.advance(name, ...)""" + advance = ["0"] * len(self.shape) + advance[self.offsets.index("roffset")] = "RBLOCK" + return V.kernel.index_to_str(advance) + + def has_rindex(self): + return "RBLOCK" in self.block_shape + + def has_rmask(self): + return self.has_rindex() + + def has_tmpmask(self): + return False # block_ptr can't do indirect indexing + + def has_mask(self): + return bool(self.boundary_check()) + + +def triton_reshape(value: str, old_shape: List[str], new_shape: List[str]): + """Workaround https://github.com/openai/triton/issues/2836""" + assert isinstance(old_shape, list) and isinstance(new_shape, list) + if old_shape == new_shape: + return value + if [s for s in new_shape if s != "1"] != old_shape: + return f"tl.reshape({value}, [{', '.join(new_shape)}])" + # rewrite to [:, None] syntax, which is less buggy + idx = 0 + expand = [] + for size in new_shape: + if idx < len(old_shape) and size == old_shape[idx]: + expand.append(":") + idx += 1 + else: + assert size == "1" + expand.append("None") + assert idx == len(old_shape) + return f"{value}[{', '.join(expand)}]" + + +class TritonPrinter(PythonPrinter): + def _print_floor(self, expr): + assert len(expr.args) == 1 + return ( + f"libdevice.floor({self._print(expr.args[0])}).to({V.kernel.index_dtype})" + ) + + def _print_ceiling(self, expr): + assert len(expr.args) == 1 + return f"libdevice.ceil({self._print(expr.args[0])}).to({V.kernel.index_dtype})" + + def _helper_sqrt(self, expr): + return f"libdevice.sqrt({self._print(expr)}.to(tl.float32))" + + def _print_Where(self, expr): + c = self.doprint(expr.args[0]) + p = self.doprint(expr.args[1]) + q = self.doprint(expr.args[2]) + return f"tl.where({c}, {p}, {q})" + + def _print_Min(self, expr): + nargs = len(expr.args) + if len(expr.args) == 1: + return self._print(expr.args[0]) + + mid = len(expr.args) // 2 + a = self._print(sympy.Min(*expr.args[:mid])) + b = self._print(sympy.Min(*expr.args[mid:])) + return f"tl.minimum({a}, {b})" + + def _print_Max(self, expr): + nargs = len(expr.args) + if len(expr.args) == 1: + return self._print(expr.args[0]) + + mid = len(expr.args) // 2 + a = self._print(sympy.Max(*expr.args[:mid])) + b = self._print(sympy.Max(*expr.args[mid:])) + + return f"tl.maximum({a}, {b})" + + def _print_Abs(self, expr): + assert len(expr.args) == 1 + return f"tl_math.abs({self._print(expr.args[0])})" + + def _print_cos(self, expr): + assert len(expr.args) == 1 + return f"libdevice.cos(({self._print(expr.args[0])}).to(tl.float32))" + + def _print_cosh(self, expr): + assert len(expr.args) == 1 + return f"libdevice.cosh(({self._print(expr.args[0])}).to(tl.float32))" + + def _print_acos(self, expr): + assert len(expr.args) == 1 + return 
f"libdevice.acos(({self._print(expr.args[0])}).to(tl.float32))" + + def _print_sin(self, expr): + assert len(expr.args) == 1 + return f"libdevice.sin(({self._print(expr.args[0])}).to(tl.float32))" + + def _print_sinh(self, expr): + assert len(expr.args) == 1 + return f"libdevice.sinh(({self._print(expr.args[0])}).to(tl.float32))" + + def _print_asin(self, expr): + assert len(expr.args) == 1 + return f"libdevice.asin(({self._print(expr.args[0])}).to(tl.float32))" + + def _print_tan(self, expr): + assert len(expr.args) == 1 + return f"libdevice.tan(({self._print(expr.args[0])}).to(tl.float32))" + + def _print_tanh(self, expr): + assert len(expr.args) == 1 + return f"libdevice.tanh(({self._print(expr.args[0])}).to(tl.float32))" + + def _print_atan(self, expr): + assert len(expr.args) == 1 + return f"libdevice.atan(({self._print(expr.args[0])}).to(tl.float32))" + + def _print_FloorDiv(self, expr): + if expr.is_integer: + return super()._print_FloorDiv(expr) + + x, div = expr.args + x = self.paren(self.doprint(x)) + div = self.paren(self.doprint(div)) + return f"libdevice.floor({x} / {div}).to({V.kernel.index_dtype})" + + def _print_Round(self, expr): + assert len(expr.args) == 1 + return ( + f"libdevice.llrint({self._print(expr.args[0])}).to({V.kernel.index_dtype})" + ) + + def _print_RoundDecimal(self, expr): + assert len(expr.args) == 2 + number, ndigits = expr.args + if number.is_integer: + # ndigits < 0 should have been filtered by the sympy function + assert ndigits < 0 + raise ValueError( + f"For integer inputs, only non-negative ndigits are currently supported, but got {ndigits}." + ) + return f"libdevice.nearbyint(1e{ndigits} * {self.paren(self._print(number))}) * 1e{-ndigits}" + + +texpr = TritonPrinter().doprint +pexpr = PythonPrinter().doprint + + +def triton_compute_type(dtype): + triton_type_name = str(dtype).split(".")[-1] + if triton_type_name == "bool": + triton_type_name = "int1" + elif triton_type_name in ("float16", "bfloat16"): + # float16 math is done in float32 inside the kernel + triton_type_name = "float32" + elif triton_type_name == "float8_e4m3fn": + triton_type_name = "float8e4nv" + elif triton_type_name == "float8_e5m2": + triton_type_name = "float8e5" + elif triton_type_name == "float8_e4m3fnuz": + triton_type_name = "float8e4b8" + elif triton_type_name == "float8_e5m2": + triton_type_name = "float8e5b16" + return f"tl.{triton_type_name}" + + +def triton_store_type(dtype): + triton_type_name = str(dtype).split(".")[-1] + if triton_type_name == "bool": + triton_type_name = "int8" + elif triton_type_name == "float8_e4m3fn": + triton_type_name = "float8e4nv" + elif triton_type_name == "float8_e5m2": + triton_type_name = "float8e5" + return f"tl.{triton_type_name}" + + +def triton_acc_type(dtype): + if is_integer_dtype(dtype) and dtype.is_signed: + nbits = 64 if dtype == torch.int64 else 32 + return f"tl.int{nbits}" + return triton_compute_type(dtype) + + +def triton_constant(value): + if value == float("inf"): + return 'float("inf")' + elif value == float("-inf"): + return 'float("-inf")' + elif math.isnan(value): + return 'float("nan")' + return repr(value) + + +class TritonCSEVariable(CSEVariable): + def __init__(self, name, bounds: ValueRanges[Any]): + super().__init__(name, bounds) + # We'll use this to track which masks the variable needs when used for indirect indexing + self.mask_vars: Set[str] = set() + + def update_on_args(self, name, args, kwargs): + # When making a variable that is going to be used in indirect indexing + # if a where clause is used it 
should mean that the result is always a + # valid index, so you shouldn't include any of the dependent variables + # in the resulting load mask + if name == "where": + return + for arg in args: + if isinstance(arg, TritonCSEVariable): + self.mask_vars.update(arg.mask_vars) + elif isinstance(arg, sympy.Symbol) and arg.name[0] in "xyr": + # most of the time index vars don't need masks associated with them + # however, when index vars are used to compute indices for indirect reads + # those reads should subsequently be masked, + self.mask_vars.update({f"{arg.name[0]}mask"}) + + def __repr__(self): + return f"TritonCSEVariable(name={self.name})" + + +class TritonOverrides(OpOverrides): + """Map element-wise ops to Triton""" + + @staticmethod + def to_dtype(x, dtype: torch.dtype, src_dtype: Optional[torch.dtype] = None): + def _get_min_elements_per_thread( + src_dtype: torch.dtype, dst_dtype: torch.dtype + ) -> int: + if src_dtype == dst_dtype: + # No data type conversion is needed. No requirements on min_elem_per_thread. + return 0 + + # fp8 data type conversions has min_elem_per_thread requirements. + # Refer to Triton implementations here: + # https://github.com/openai/triton/blob/10f59d8ce04052521c1bc0cb3a3f8b98918fc7e3/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp#L10. + fp8_dtypes = { + torch.float8_e4m3fn, + torch.float8_e5m2, + } + # Triton doesn't support type conversions between fp8_e4m3 and fp8_e5m2. + assert not ( + src_dtype in fp8_dtypes + and dst_dtype in fp8_dtypes + and src_dtype != dst_dtype + ), "Conversions between float8_e5m2 and float8_e4m3fn is not supported!" + if src_dtype == torch.float8_e5m2 or dst_dtype == torch.float8_e5m2: + return 4 + if src_dtype == torch.float8_e4m3fn or dst_dtype == torch.float8_e4m3fn: + return 2 + # No requirements on min_elem_per_thread. + return 0 + + if src_dtype is not None: + # Both dtype and src_dtype are set. This is used by torch to(dtype=dtype). + # It takes the maximum min_elem_per_thread if there are multiple fp8 conversions + # in the same kernel. + V.kernel.min_elem_per_thread = max( + _get_min_elements_per_thread(src_dtype, dtype), + V.kernel.min_elem_per_thread, + ) + + if dtype == torch.bool: + return f"({x} != 0)" + elif dtype == torch.uint8: + # to work around llvm uint conversion semantics + # that produces 0's for negative values + return f"{x}.to(tl.int8).to(tl.uint8)" + return f"{x}.to({triton_compute_type(dtype)})" + + @staticmethod + def to_dtype_bitcast(x, dtype: torch.dtype, src_dtype: torch.dtype): + triton_dtype = triton_compute_type(dtype) + # We may promote float16 or bfloat16 to float32 and cause the + # bitwidth of dtype to be different from the input tensor (i.e. float32). + # In such as case, we will have to convert the input tensor to + # its src_type, perform bitcast, and then convert the bit-casted + # tensor back to float to ensure we use values with the right precision. 
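+        # Illustrative example (hypothetical dtypes): for src_dtype=torch.float16
+        # and dtype=torch.int16 this emits
+        # x.to(tl.float16).to(tl.int16, bitcast=True).to(tl.float32)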
+ if src_dtype in (torch.float16, torch.bfloat16): + triton_src_dtype = str(src_dtype).split(".")[-1] + cast_x = f"{x}.to(tl.{triton_src_dtype})" + cast_x = f"{cast_x}.to({triton_dtype}, bitcast=True)" + return f"{cast_x}.to(tl.float32)" + else: + return f"{x}.to({triton_dtype}, bitcast=True)" + + @staticmethod + def _shaped_constant(value, dtype, shape): + type_ = torch._prims_common.dtype_to_type(dtype) + triton_val = triton_constant(type_(value)) + triton_type = triton_compute_type(dtype) + + if triton_type == "tl.float32": + # Float constants are always f32 in triton + return triton_val + + # NOTE: We use a tensor here in order to get the expected type. + # Otherwise, e.g. float64 constants would be trunctated to float32. + return f"tl.full({shape}, {triton_val}, {triton_type})" + + @classmethod + def constant(cls, value, dtype): + return cls._shaped_constant(value, dtype, shape=[]) + + @staticmethod + def abs(x): + return f"tl_math.abs({x})" + + @staticmethod + def libdevice_abs(x): + return f"libdevice.abs({x})" + + @staticmethod + def exp(x): + return f"tl_math.exp({x})" + + @staticmethod + def libdevice_exp(x): + return f"libdevice.exp({x})" + + @staticmethod + def exp2(x): + return f"libdevice.exp2({x})" + + @staticmethod + def expm1(x): + return f"libdevice.expm1({x})" + + @staticmethod + def sqrt(x): + return f"libdevice.sqrt({x})" + + @staticmethod + def libdevice_sqrt(x): + return f"libdevice.sqrt({x})" + + @staticmethod + def relu(x): + bug = config.triton.inject_relu_bug_TESTING_ONLY + if bug == "compile_error": + return "compile error!" + elif bug == "runtime_error": + # NB: this only triggers runtime error as long as input + # is not all zero + return f'triton_helpers.device_assert_then({x} == 0, "injected assert fail", {x})' + elif bug == "accuracy": + return f"{x} + 1" + elif bug is None: + return ops.maximum("0", x) + else: + raise AssertionError( + f"unrecognized config triton.inject_relu_bug_TESTING_ONLY = {bug!r}" + ) + + @staticmethod + def minimum(a, b): + return f"triton_helpers.minimum({a}, {b})" + + @staticmethod + def maximum(a, b): + return f"triton_helpers.maximum({a}, {b})" + + @staticmethod + def where(a, b, c): + return f"tl.where({a}, {b}, {c})" + + @staticmethod + def cos(x): + return f"tl_math.cos({x})" + + @staticmethod + def libdevice_cos(x): + return f"libdevice.cos({x})" + + @staticmethod + def sin(x): + return f"tl_math.sin({x})" + + @staticmethod + def libdevice_sin(x): + return f"libdevice.sin({x})" + + @classmethod + def index_expr(cls, expr, dtype): + raise NotImplementedError("ops.index_expr not implemented outside a kernel") + + @staticmethod + def masked(mask, body, other): + raise NotImplementedError("ops.masked not implemented outside a kernel") + + @staticmethod + def lgamma(x): + return f"libdevice.lgamma({x})" + + @staticmethod + def erf(x): + return f"libdevice.erf({x})" + + @staticmethod + def cosh(x): + return f"libdevice.cosh({x})" + + @staticmethod + def sinh(x): + return f"libdevice.sinh({x})" + + @staticmethod + def acos(x): + return f"libdevice.acos({x})" + + @staticmethod + def acosh(x): + return f"libdevice.acosh({x})" + + @staticmethod + def asin(x): + return f"libdevice.asin({x})" + + @staticmethod + def asinh(x): + return f"libdevice.asinh({x})" + + @staticmethod + def atan2(x, y): + return f"libdevice.atan2({x}, {y})" + + @staticmethod + def atan(x): + return f"libdevice.atan({x})" + + @staticmethod + def atanh(x): + return f"libdevice.atanh({x})" + + @staticmethod + def copysign(x, y): + return f"libdevice.copysign({x}, 
{y})" + + @staticmethod + def erfc(x): + return f"libdevice.erfc({x})" + + @staticmethod + def erfinv(x): + return f"libdevice.erfinv({x})" + + @staticmethod + def hypot(x, y): + return f"libdevice.hypot({x}, {y})" + + @staticmethod + def log10(x): + return f"libdevice.log10({x})" + + @staticmethod + def nextafter(x, y): + return f"libdevice.nextafter({x}, {y})" + + @staticmethod + def logical_and(a, b): + return f"{a} & {b}" + + @staticmethod + def logical_not(a): + return f"{a} == 0" + + @staticmethod + def logical_or(a, b): + return f"{a} | {b}" + + @staticmethod + def logical_xor(a, b): + return f"({a} ^ {b})" + + @staticmethod + def bitwise_and(a, b): + return f"{a} & {b}" + + @staticmethod + def bitwise_not(a): + return f"~{a}" + + @staticmethod + def bitwise_or(a, b): + return f"{a} | {b}" + + @staticmethod + def bitwise_xor(a, b): + return f"{a} ^ {b}" + + @staticmethod + def bitwise_left_shift(a, b): + return f"{a} << {b}" + + @staticmethod + def bitwise_right_shift(a, b): + return f"{a} >> {b}" + + @staticmethod + def rand(seed, offset): + offset = f"({offset}).to(tl.uint32)" + return f"tl.rand({seed}, {offset})" + + @staticmethod + def randn(seed, offset): + offset = f"({offset}).to(tl.uint32)" + return f"tl.randn({seed}, {offset})" + + @staticmethod + def randint64(seed, offset, low, high): + offset = f"({offset}).to(tl.uint32)" + return f"triton_helpers.randint64({seed}, {offset}, {low}, {high})" + + @staticmethod + def load_seed(name, offset): + raise NotImplementedError("ops.load_seed not implemented outside a kernel") + + @staticmethod + def rsqrt(x): + return f"libdevice.rsqrt({x})" + + @staticmethod + def log1p(x): + return f"libdevice.log1p({x})" + + @staticmethod + def tan(x): + return f"libdevice.tan({x})" + + @staticmethod + def tanh(x): + return f"libdevice.tanh({x})" + + @staticmethod + def sigmoid(x): + return f"tl.sigmoid({x})" + + @staticmethod + def libdevice_sigmoid(x): + return f"1/(1 + libdevice.exp(-({x})))" + + @staticmethod + def signbit(x): + # XX: This is wrong for the value -0.0 in floating point + return f"libdevice.signbit({x}) if ({x}).dtype is tl.float32 else {x} < 0" + + @staticmethod + def fmod(a, b): + return f"libdevice.fmod({a}, {b})" + + @staticmethod + def pow(a, b): + return f"libdevice.pow({a}, {b})" + + @staticmethod + def log(x): + return f"tl_math.log({x})" + + @staticmethod + def libdevice_log(x): + return f"libdevice.log({x})" + + @staticmethod + def isinf(x): + return f"libdevice.isinf({x}).to(tl.int1)" + + @staticmethod + def isnan(x): + return f"libdevice.isnan({x}).to(tl.int1)" + + @staticmethod + def round(x): + return f"libdevice.nearbyint({x})" + + @staticmethod + def floor(x): + return f"libdevice.floor({x})" + + @staticmethod + def floordiv(a, b): + # See the comment in lowering.div_mode. a and b are integer type. + # Similar to div_floor_kernel_cuda in pytorch core. + # Notice that // in triton behaves as truncdiv instead of floordiv + quot = f"{a} // {b}" + rem = f"{a} % {b}" + return f"tl.where(({a} < 0) != ({b} < 0), tl.where({rem} != 0, {quot} - 1, {quot}), {quot})" + + @staticmethod + def sign(x): + def to_int(s): + return f"{s}.to(tl.int8)" + + left = to_int(ops.lt("0", x)) + right = to_int(ops.lt(x, "0")) + sub = ops.sub(left, right) + return f"{sub}.to({x}.dtype)" + + @staticmethod + def trunc(x): + return f"libdevice.trunc({x})" + + @staticmethod + def truncdiv(a, b): + # See the comment in lowering.div_mode. a and b are integer type. 
+ # Notice that // in triton behaves as truncdiv instead of floordiv + return f"{a} // {b}" + + @staticmethod + def ceil(x): + return f"libdevice.ceil({x})" + + +TritonOverrides._initialize_pointwise_overrides("triton") + + +# Use mypy to check protocol implemented correctly +def _typecheck_TritonOverrides(h: TritonOverrides) -> OpsHandler[str]: + return h + + +class TritonKernelOverrides(TritonOverrides): + """Map element-wise ops to Triton within a TritonKernel + + Unlike TritonOverrides, these assume the code is going to be inserted into + the body of the main triton kernel and so it may use indexing and mask + variables which are assumed to already be defined in the current scope. + """ + + @classmethod + def constant(cls, value, dtype): + # NOTE: Cannot use shape=[] as it's not supported by triton-rocm + # We could use shape=[1] instead but starting with the correct + # ndim avoids extra `tt.expand_dim` ops appearing in the triton IR. + ndim = V.kernel.triton_tensor_ndim() + shape = [1] * ndim + return cls._shaped_constant(value, dtype, shape=shape) + + @classmethod + def index_expr(cls, expr, dtype): + indexing = V.kernel.indexing(expr, block_ptr=False) + assert isinstance(indexing, IndexingOptions) + # This is called from CSEProxy.__getattr__, so we'll set the bounds there + var = V.kernel.cse.generate(V.kernel.compute, indexing.index_str) + + if dtype not in {torch.int32, torch.int64}: + var = V.kernel.cse.generate(V.kernel.compute, cls.to_dtype(var, dtype)) + var.mask_vars = indexing.mask_vars + return var + + @staticmethod + def masked(mask, body, other): + with V.kernel.mask_loads(mask) as new_mask: + result = body() + + # Take dtype from result to prevent accidental promotion + other = V.kernel.cse.generate( + V.kernel.compute, + f"tl.full({result}.shape, {triton_constant(other)}, {result}.dtype)", + ) + return ops.where(new_mask, result, other) + + @staticmethod + def load_seed(name, offset): + var = V.kernel.args.input(name) + return ( + f"tl.load({var} + {V.kernel.args.seed_offset('load_seed_offset', offset)})" + ) + + @staticmethod + def frexp(x): + cache_key = f"frexp({x})" + if cache_key in V.kernel.cse.cache: + return V.kernel.cse.cache[cache_key] + + mantissa = V.kernel.cse.newvar() + exponent = V.kernel.cse.newvar() + V.kernel.compute.writeline( + f"{mantissa}, {exponent} = triton_helpers.frexp({x})" + ) + V.kernel.cse.cache[cache_key] = (mantissa, exponent) + return (mantissa, exponent) + + +# Use mypy to check protocol implemented correctly +def _typecheck_TritonKernelOverrides(h: TritonKernelOverrides) -> OpsHandler[str]: + return h + + +@dataclasses.dataclass +class IterationRanges: + """ + Each range tree represents multiple sets of iteration indexing + in a single tiled dimension in the output kernel. + + If you have two loops ranges one (4, 3, 2) and another (4, 6), + then the range tree will be: + 4 (i0) + 3 (i1) 6 (i3) + 2 (i2) + Where i0 is shared between both loops, but then the split into + different indexing vars. All loop ranges must iterate over + the same number of elements. 
+ """ + + def __init__( + self, + name: str, + var_list: List[sympy.Symbol], + var_ranges: Dict[sympy.Symbol, sympy.Expr], + numel: sympy.Expr, + prefix: str, + *, + kernel: TritonKernel, + divisor=sympy.Integer(1), + length=sympy.Integer(1), + root: IterationRangesRoot, + ): + super().__init__() + self.name = name + self.var_list = var_list + self.var_ranges = var_ranges + self.numel = numel + self.prefix = prefix + self.divisor = divisor + self.length = length + self.kernel = kernel + self.root = root + + def symbol(self): + return sympy_index_symbol(self.name) + + +class IterationRangesRoot(IterationRanges): + def __init__( + self, + name: str, + numel: sympy.Expr, + prefix: str, + index: int, + kernel: TritonKernel, + pid_cache=None, + *, + is_loop: bool, + tensor_dim: Optional[int], + grid_dim: Optional[int], + ): + if pid_cache is None: + pid_cache = {} + super().__init__( + name=name, + var_list=[], + var_ranges={}, + numel=numel, + prefix=prefix, + kernel=kernel, + root=self, + ) + self.index = index + # Store all the nodes in one flat list + self.nodes: Dict[sympy.Expr, IterationRangesEntry] = {} + # This is for re-ordering program ID in triton mm template + # pid_cache["tl.program_id(0)"] = pid_m + self.pid_cache: Dict[str, str] = pid_cache + + # True if the dimension is implemented as a single program looping over + # the full dimension (currently only used for non-persistent reduction) + assert not is_loop or (prefix == "r" and grid_dim is None) + self.is_loop = is_loop + # Index of corresponding dimension on triton tensors + self.tensor_dim = tensor_dim + # Index of corresponding dimension in the triton grid + self.grid_dim = grid_dim + + def __repr__(self): + return f"IterationRangesRoot({self.name!r}, {self.numel}, ...)" + + def cache_clear(self): + for node in self.nodes.values(): + node.cache_clear() + + def lookup(self, divisor, length): + """ + Lookup a given RangeTreeEntry, creating it if needed + """ + if V.graph.sizevars.statically_known_equals(divisor * length, self.numel): + expr = FloorDiv(sympy_index_symbol(f"{self.prefix}index"), divisor) + else: + expr = ModularIndexing( + sympy_index_symbol(f"{self.prefix}index"), divisor, length + ) + + if expr not in self.nodes: + node = IterationRangesEntry( + f"{self.prefix}{next(V.kernel.iter_vars_count)}", + divisor, + length, + expr, + self, + ) + V.kernel.range_tree_nodes[node.symbol()] = node + self.var_list.append(node.symbol()) + self.var_ranges[node.symbol()] = length + self.nodes[expr] = node + return self.nodes[expr] + + def construct_entries(self, lengths: List[sympy.Expr]): + divisor = sympy.Integer(1) + itervars = [] + for length in reversed(lengths): + itervars.append(self.lookup(divisor, length)) + divisor = divisor * length + return list(reversed(itervars)) + + def construct(self, lengths: List[sympy.Expr]): + return [e.symbol() for e in self.construct_entries(lengths)] + + def vars_and_sizes(self, index: sympy.Expr): + """Figure out vars from this tree used in index""" + nodes = [V.kernel.range_tree_nodes.get(s) for s in index.free_symbols] + nodes = [n for n in nodes if n and n.prefix == self.prefix] + nodes.sort(key=lambda x: V.graph.sizevars.size_hint(x.divisor)) + divisor = sympy.Integer(1) + index_vars = [] + sizes = [] + + def add(node): + nonlocal divisor + index_vars.append(node.symbol()) + sizes.append(node.length) + divisor = divisor * node.length + + for node in nodes: + if not V.graph.sizevars.statically_known_equals(node.divisor, divisor): + # fill in unused index var + add(self.lookup(divisor, 
FloorDiv(node.divisor, divisor))) + divisor = node.divisor + add(node) + if not V.graph.sizevars.statically_known_equals(self.numel, divisor): + # fill in unused index var + add(self.lookup(divisor, FloorDiv(self.numel, divisor))) + + return list(reversed(index_vars)), list(reversed(sizes)) + + def ranges_code(self): + assert self.tensor_dim is not None + size = self.kernel.indexing_size_str(self.tensor_dim) + index_dtype = self.kernel.index_dtype + convert = f".to({index_dtype})" if index_dtype != "tl.int32" else "" + return f"tl.arange(0, {self.prefix.upper()}BLOCK){size}{convert}" + + def scalar_code(self, value): + index_dtype = self.kernel.index_dtype + ndim = self.kernel.triton_tensor_ndim() + size = [1] * ndim + return f"tl.full({size}, {value}, {index_dtype})" + + def get_pid(self): + assert self.grid_dim is not None + key = f"tl.program_id({self.grid_dim})" + # y_grid has a limit, so express it in terms of y and z in case of overflow. + # z grid is only exercised when max_tiles == 3 (off by default). + if ( + self.grid_dim == 1 + and config.triton.max_tiles <= 2 + and not (isinstance(self.numel, int) and self.numel <= get_max_y_grid()) + ): + key = f"{key} * (tl.program_id({self.grid_dim + 1}) + 1)" + pid = self.pid_cache.get(key, key) + if self.kernel.index_dtype != "tl.int32": + return f"{pid}.to({self.kernel.index_dtype})" + return pid + + def codegen_header(self, code): + x = self.prefix + if self.is_loop: + code.writeline(f"{self.name} = {x}offset + {x}base") + elif self.grid_dim is None: + # no need to "{x}offset = " + code.writeline(f"{self.name} = {self.ranges_code()}") + code.writeline(f"{x}offset = 0") + else: + if self.tensor_dim is not None: + line = f"{x}offset + {self.ranges_code()}" + else: + line = self.scalar_code(f"{x}offset") + code.writelines( + [ + f"{x}offset = {self.get_pid()} * {x.upper()}BLOCK", + f"{self.name} = {line}", + ] + ) + code.writeline(f"{x}mask = {self.name} < {x}numel") + + +class IterationRangesEntry(IterationRanges): + def __init__( + self, + name: str, + divisor: sympy.Expr, + length: sympy.Expr, + expr: sympy.Expr, + parent: IterationRanges, + ): + super().__init__( + name=name, + numel=parent.numel / length, + var_list=parent.var_list, + var_ranges=parent.var_ranges, + prefix=parent.prefix, + divisor=divisor, + length=length, + kernel=parent.kernel, + root=parent.root, + ) + self.parent = parent + self.codegen = functools.lru_cache(None)(self._codegen) + self.expr = expr + + def __repr__(self): + return f"IterationRangesEntry({self.name}, {self.divisor}, {self.length}, {self.expr}, {self.var_ranges})" + + def set_name(self, name): + self.codegen = lambda: name # type: ignore[assignment] + self.codegen.cache_clear = lambda: None # type: ignore[method-assign] + self.name = name + + def cache_clear(self): + self.codegen.cache_clear() + + def writeline(self, line): + if self.root.is_loop: + V.kernel.indexing_code.writeline(line) + else: + # lift non-reduction stores outside loop + V.kernel.body.writeline(line) + + def _codegen(self): + self.writeline(f"{self.name} = " + texpr(V.kernel.rename_indexing(self.expr))) + return self.name + + def precomputed_args(self): + # for dynamic shapes, find parts of indexing expressions that have to be precomputed + precomputed_args: List[sympy.Expr] = [] + if isinstance(self.expr, sympy.Symbol): + return precomputed_args + assert isinstance(self.expr, (FloorDiv, ModularIndexing)), type(self.expr) + for arg in self.expr.args[1:]: + if not isinstance(arg, (sympy.Integer, sympy.Symbol)): + symbols = 
arg.free_symbols + if len(symbols) > 0 and all(s.name.startswith("s") for s in symbols): + precomputed_args.append(arg) + return precomputed_args + + def __hash__(self): + return hash(self.name) + + def __eq__(self, other): + return self.name == other.name + + +class HelperFunctions: + """An ordered set of helper functions.""" + + _templates_seen: Dict[str, str] # Template code to function name + finalized_helpers: List[str] + + def __init__(self): + self._templates_seen = {} + self.finalized_helpers = [] + + def add(self, template_code: str) -> str: + """This accepts a function definition with the function name + left as a format specifier e.g. + + @triton.jit + def {name}(arg0, arg1): + return arg0 + arg1 + + We add the templated code to the function set and return the name + assigned to that function. + + """ + existing_name = self._templates_seen.get(template_code) + if existing_name is not None: + # Don't duplicate existing helpers + return existing_name + + name = f"_triton_helper_fn{len(self.finalized_helpers)}" + self._templates_seen[template_code] = name + self.finalized_helpers.append(template_code.format(name=name)) + return name + + def __iter__(self): + return iter(self.finalized_helpers) + + def __getitem__(self, idx): + return self.finalized_helpers[idx] + + +class TritonKernel(Kernel): + overrides = TritonKernelOverrides # type: ignore[assignment] + sexpr = pexpr + + helper_functions: HelperFunctions + + def __init__( + self, + *groups, + index_dtype: str, + mutations: Optional[Set[str]] = None, + pid_cache=None, + reduction_hint=ReductionHint.DEFAULT, + min_elem_per_thread=0, + disable_persistent_reduction=False, + ): + if pid_cache is None: + pid_cache = {} + super().__init__() + self.numels = [V.graph.sizevars.simplify(s) for s in groups] + self.mutations: Set[str] = mutations if mutations is not None else set() + self.range_trees: List[IterationRangesRoot] = [] + self.range_tree_nodes: Dict[sympy.Symbol, IterationRangesEntry] = {} + self.iter_vars_count = itertools.count() + self.inside_reduction = self.numels[-1] != 1 + self.body = IndentedBuffer() + self.indexing_code = IndentedBuffer() + self.suffix: IndentedBuffer = IndentedBuffer() # type: ignore[assignment] + self.outside_loop_vars: Set[Any] = set() + self.reduction_hint = reduction_hint + self.index_dtype: str = index_dtype + self.min_elem_per_thread = min_elem_per_thread + self.last_usage: Set[str] = set() + self.block_ptr_id = itertools.count() + # buffer accesses in the kernel + self.buf_accesses: DefaultDict[str, List[Dep]] = collections.defaultdict(list) + + self.persistent_reduction: bool = ( + not disable_persistent_reduction + ) and self.should_use_persistent_reduction() + self.no_x_dim = ( + self.reduction_hint == ReductionHint.INNER + and self.persistent_reduction + and len(self.numels) == 2 + and self.numels[-1] >= 256 + ) + self.initialize_range_tree(pid_cache) + + self.helper_functions = HelperFunctions() + + # A set of autotuning hints to pass as part of triton_meta + self.autotune_hints: Set[AutotuneHint] = set() + + # define this in a closure to make cache local to object + @functools.lru_cache(None) + def simplify_indexing(index: sympy.Expr): + index = V.graph.sizevars.simplify_with_ranges(index, self.var_ranges()) + for tree in self.range_trees: + index = self.combine_contiguous_dims(index, tree) + return index + + self.simplify_indexing = simplify_indexing + self.code_hash = None + self.triton_meta: Optional[Dict[str, object]] = None + + def need_numel_args(self): + r""" + Indicate whether we 
need provide numel as arguments for the generated + kernel calls in the benchmark. + + Should be true for pointwise/reduction kernels but false for triton + matmul kernels. + """ + return True + + def should_use_persistent_reduction(self) -> bool: + """ + Heuristic to set self.persistent_reduction and add guards + if needed. + """ + if not (self.inside_reduction and config.triton.persistent_reductions): + return False + threshold = { + ReductionHint.INNER: 1024, + }.get(self.reduction_hint, 64) + + # If multi_kernel is enabled, we do more aggressive persistent reduction. + # This may result in some persisent reductions slower than the + # corresponding non-persistent reductions. MultiKernel will do benchmarking + # to pick the faster one. + if config.triton.multi_kernel: + threshold *= 16 + last_numel = self.numels[-1] + if not isinstance(last_numel, (int, sympy.Integer)): + # Not static + return False + hint = V.graph.sizevars.size_hint(last_numel) + if hint > threshold: + return False + # will need to recompile if we cross a larger power of 2 boundary + V.graph.sizevars.guard_leq(self.numels[-1], next_power_of_2(hint)) # type: ignore[arg-type] + return True + + def set_last_usage(self, nodes): + if not self.inside_reduction or self.persistent_reduction: + return + self.last_usage = set( + itertools.chain.from_iterable( + n.last_usage for n in nodes if n is not EnableReduction + ) + ) + + def initialize_range_tree(self, pid_cache): + no_r_dim = not self.inside_reduction or self.numels[-1] == 1 + + prefixes = "zyxr" + active_prefixes = prefixes[-len(self.numels) :] + + grid_dims = "xyz" + if self.no_x_dim: + tensor_dims = "r" + elif no_r_dim: + tensor_dims = "xyz" + else: + tensor_dims = "xyzr" + + tensor_dims = "".join(p for p in tensor_dims if p in active_prefixes) + + for i, prefix in enumerate(active_prefixes): + is_reduction = prefix == "r" + tensor_dim = tensor_dims.find(prefix) if prefix in tensor_dims else None + grid_dim = None if is_reduction else grid_dims.find(prefix) + index = i if grid_dim is None else grid_dim + self.range_trees.append( + IterationRangesRoot( + f"{prefix}index", + self.numels[i], + prefix, + index, + self, + pid_cache=pid_cache, + is_loop=is_reduction and not self.persistent_reduction, + tensor_dim=tensor_dim, + grid_dim=grid_dim, + ) + ) + for tree in self.range_trees: + # reduction indexing goes inside a loop + if not tree.is_loop: + tree.codegen_header(self.body) + if self.inside_reduction and self.range_trees[-1].is_loop: + # workaround for this issue: + # https://gist.github.com/jansel/6527126f781559095c5531f98a4235a7 + self.body.writeline(f"rbase = {self.range_trees[-1].ranges_code()}") + + def disable_reduction(self): + should_flush = self.range_trees[-1].is_loop + + @contextlib.contextmanager + def ctx(): + if self.numels[-1] == 1: + assert not self.inside_reduction + yield + return + if should_flush: + # calling codegen_body() will flush all the pending buffers + # and write out a reduction loop + self.codegen_body() + self.inside_reduction = False + try: + yield + if should_flush: + # flush out any code before opening the next loop + self.codegen_body() + finally: + self.inside_reduction = True + + return ctx() + + def set_ranges(self, *lengths): + assert len(lengths) == len(self.range_trees) + return [ + ranges.construct(length) + for length, ranges in zip(lengths, self.range_trees) + ] + + @staticmethod + def _split_iteration_ranges( + groups: Iterable[sympy.Expr], lengths: List[List[sympy.Expr]] + ): + sv = V.graph.sizevars + new_ranges: 
List[List[sympy.Expr]] = [[] for _ in groups] + remaining = [sv.simplify(g) for g in groups] + var_count = itertools.count() + + def add_range(i, expr): + expr = sv.simplify(expr) + if not sv.statically_known_multiple_of(remaining[i], expr): + raise CantSplit() + # guard on the last item out + remaining[i] = FloorDiv(remaining[i], expr) + new_ranges[i].append(expr) + return next(var_count) + + def make_combined(size, idx1, idx2): + def getter(flat_vars): + return size * flat_vars[idx1] + flat_vars[idx2] + + return getter + + return_getters_groups = [] + current_group = 0 + for length_group in lengths: + return_getters = [] + for size in length_group: + if sv.statically_known_equals(size, 1): # type: ignore[arg-type] + return_getters.append(lambda _: sympy.Integer(0)) + continue + + while ( + current_group < len(remaining) + and sv.size_hint(remaining[current_group]) == 1 + ): + # scroll to next group with remaining elements + current_group += 1 + + if sv.size_hint(size) > sv.size_hint(remaining[current_group]): + # need to break size in two + if not sv.statically_known_multiple_of( + size, remaining[current_group] + ): + raise CantSplit() + size1 = remaining[current_group] + size2 = FloorDiv(size, remaining[current_group]) + return_getters.append( + make_combined( + size2, + add_range(current_group, size1), + add_range(current_group + 1, size2), + ) + ) + else: + return_getters.append( + operator.itemgetter(add_range(current_group, size)) + ) + return_getters_groups.append(return_getters) + + assert all( + V.graph.sizevars.size_hint(s) == 1 for s in remaining + ), f"failed to set ranges {remaining} {lengths}" + + return new_ranges, return_getters_groups + + @classmethod + def is_compatible( + cls, groups: Iterable[sympy.Expr], lengths: List[List[sympy.Expr]] + ): + try: + cls._split_iteration_ranges(groups, lengths) + return True + except CantSplit: + return False + + def split_and_set_ranges(self, lengths: List[List[sympy.Expr]]): + """ + We may want to fuse `for i0 in s0*s1` into a tiled kernel with groups (s0, s1). + + To do this we need to split up the iteration space of i0 into something like: + for i1 in s0: + for i2 in s1: + i0 = i1*s1 + i2 + .... + + This function matches and resplits lengths to the groups of + this kernel to enable tiled + non-tiled fusions. + """ + groups = [rt.numel for rt in self.range_trees] + if not self.inside_reduction: + groups[-1] = sympy.Integer(1) + + if len(lengths) == len(self.range_trees) and all( + V.graph.sizevars.simplify(sympy_product(x) - g) == 0 + for x, g in zip(lengths, groups) + ): + return self.set_ranges(*lengths) + + new_ranges, return_getters_groups = self._split_iteration_ranges( + groups, lengths + ) + itervars = list(itertools.chain.from_iterable(self.set_ranges(*new_ranges))) + return [[fn(itervars) for fn in fns] for fns in return_getters_groups] + + def is_indirect_indexing(self, index: sympy.Expr): + # tmpX means indirect indexing + return free_symbol_startswith(index, "tmp") + + def is_broadcasted(self, index: sympy.Expr): + # Note. This may not be correct when there is indirect indexing + if self.is_indirect_indexing(index): + return False + + index_numels = [1] * len(self.numels) + for symbol in index.free_symbols: + if symbol not in self.range_tree_nodes: + # Non-iterated variables, e.g. 
strides + continue + entry = self.range_tree_nodes[symbol] # type: ignore[index] + assert isinstance(entry.parent, IterationRangesRoot) + index_numels[entry.parent.index] *= entry.length + + # If the index variables only iterate over a subset of the kernel + # numels, then it must be broadcasted. + simplify = V.graph.sizevars.simplify + return any( + simplify(idx_range) != simplify(iter_range) # type: ignore[arg-type] + for idx_range, iter_range in zip(index_numels, self.numels) + ) + + def combine_contiguous_dims(self, index: sympy.Expr, tree: IterationRangesRoot): + """ + More aggressive simplification to merge contiguous dims + """ + if isinstance(index, (sympy.Integer, sympy.Symbol)): + return index + index_vars, sizes = tree.vars_and_sizes(index) + if len(sizes) <= 1: + return index + new_sizes, reindex, prune = V.graph.sizevars._simplify_loops( + index_vars, sizes, index_prevent_reordering([index], index_vars, sizes) + ) + if new_sizes == sizes: + return index + new_index_vars = tree.construct(new_sizes) + new_index = sympy_subs(index, dict(zip(index_vars, reindex(new_index_vars)))) + return new_index + + def index_to_str(self, index: sympy.Expr) -> str: + """ + Convert an index expr to a string that can be used in triton code. + e.g. a sympy expression "s2" may actually appear as "ks1" in the triton kernel. + + Index expressions often need to be passed in as arguments to the triton kernel. + Rename_indexing and codegen_indexing keep track of the needed indices and add + new parameters to the function signature. + """ + if isinstance(index, list): + return f"[{', '.join(map(self.index_to_str, index))}]" + return texpr(self.rename_indexing(self.codegen_indexing(index))) + + def indexing( + self, + index: sympy.Expr, + *, + copy_shape=None, + dense_indexing=False, + override_mask=None, + block_ptr=False, + ) -> Union[IndexingOptions, BlockPtrOptions]: + """ + Compute the index and mask to pass to tl.load() or tl.store() + """ + index = self.simplify_indexing(index) + index = sympy_subs(index, V.graph.sizevars.precomputed_replacements) + # if simple replacements didn't get rid of floor/ceil, try full subs + if len(index.atoms(sympy.floor)) or len(index.atoms(sympy.ceiling)): + index = index.subs(V.graph.sizevars.precomputed_replacements) + # last resort, if no range vars are in the expr, hoist it + # TODO instead of trying to blindly find complicated exprs, we should hoist the + # inputs/outputs sizes and strides, but at the time indexing is generated + # kernel inputs and outputs are not set yet, we'd need a deeper refactor + # to do it this way + + if len(index.atoms(sympy.ceiling)): + for a in index.atoms(sympy.ceiling): + # for nested exprs, atoms yields top level first (?) 
+ # so if everything goes fine, lower level replacements will come up empty + symbols = a.free_symbols + if len(symbols) > 0 and all( + s.name.startswith("s") or s.name.startswith("ps") for s in symbols + ): + replacements = {a: V.graph.sizevars.lookup_precomputed_size(a)} + index = sympy_subs(index, replacements) + + index = self.simplify_indexing(index) + index_vars = index.free_symbols + has_rindex = False + + mask_vars: Set[str] = set() + for var in index_vars: + assert isinstance(var, sympy.Symbol) + has_rindex = has_rindex or var.name.startswith("r") + if override_mask: + pass + elif var.name.startswith("tmp"): + # indirect indexing + cse_var = self.cse.varname_map[var.name] + mask_vars.update(cse_var.mask_vars) + elif var.name.startswith(("s", "ps", "i", "u")): + pass + else: + # var is one of xN, yN or rN + assert var.name[0] in "xyr", var.name + mask_vars.add(f"{var.name[0]}mask") + + need_dense = ( + config.triton.dense_indexing + or dense_indexing + or self._load_mask is not None + ) and index != 0 + + have_dense = True + have_loop_vars = False + dense_mask_vars = set() + + for tree in self.active_range_trees(): + if index_vars.intersection(tree.var_list): + have_loop_vars = True + else: + have_dense = False + dense_mask_vars.add(f"{tree.prefix}mask") + + if ( + block_ptr + and config.triton.use_block_ptr + and not override_mask + and not self._load_mask + and len(mask_vars - dense_mask_vars) == 0 + and not self.is_indirect_indexing(index) + and have_loop_vars + # workaround https://github.com/openai/triton/issues/2821 + and self.index_dtype == "tl.int32" + ): + index_relative_to_xyr_index = sympy_subs( + index, {v: t.expr for v, t in self.range_tree_nodes.items()} + ) + range_trees = self.active_range_trees(reorder=True) + symbols = [t.symbol() for t in range_trees] + strides = [sympy.Wild(f"stride_{s}", exclude=symbols) for s in symbols] + offset = sympy.Wild("_offset", exclude=symbols) + m = index_relative_to_xyr_index.match(sympy_dot(symbols, strides) + offset) + # TODO(jansel): it is sometimes possible to do higher dimensional block_ptrs with + # a tl.reshape the correct block. We will miss these cases today. 
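+                # For example, an index such as 768*xindex + rindex (relative to the
+                # active range trees) matches with strides [768, 1] and offset 0, so
+                # the access can be expressed through a block pointer instead of
+                # explicit pointer arithmetic plus a mask (illustrative case only).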
+ if m: + self.filter_masks(mask_vars) + return BlockPtrOptions.create( + [m[s] for s in strides], + m[offset], + range_trees, + mask_vars, # type: ignore[arg-type] + ) + + expand_str = None + index_str = self.index_to_str(index) + if isinstance(index, sympy.Integer): + expand_str = f"{copy_shape}.shape" if copy_shape else self.dense_size_str() + index_str = f"tl.full({expand_str}, {index_str}, tl.int32)" + return IndexingOptions(index_str, set(), "None", expand_str, has_rindex) + + if need_dense and not have_dense: + expand_str = f"{copy_shape}.shape" if copy_shape else self.dense_size_str() + index_str = f"tl.broadcast_to({index_str}, {expand_str})" + mask_vars = dense_mask_vars + elif not have_loop_vars and copy_shape: + index_str = f"tl.broadcast_to({index_str}, {copy_shape}.shape)" + mask_vars = dense_mask_vars + + if override_mask: + mask_vars = {override_mask} + + if self._load_mask: + mask_vars.add(self._load_mask) + + self.filter_masks(mask_vars) + + mask_str = " & ".join(sorted(map(str, mask_vars))) if mask_vars else "None" + return IndexingOptions(index_str, mask_vars, mask_str, expand_str, has_rindex) # type: ignore[arg-type] + + def active_range_trees(self, reorder=False): + trees = [ + t for t in self.range_trees if t.prefix != "r" or self.inside_reduction + ] + if reorder and len(trees) > 1: + count = sum(t.prefix in "xyz" for t in trees) + assert "".join(t.prefix for t in trees[:count]) == "zyx"[-count:], [ + t.prefix for t in trees[:count] + ] + trees[:count] = reversed(trees[:count]) + return trees + + def filter_masks(self, mask_vars): + for tree in self.range_trees: + # Masks are superfluous if we only have one element + if V.graph.sizevars.statically_known_equals(tree.numel, 1): # type: ignore[arg-type] + mask_vars.discard(f"{tree.prefix}mask") + continue + # Masks are superfluous if numel is a multiple of BLOCK + # (We use the fact that BLOCK is required by triton to be a power of 2) + if tree.prefix.upper() not in config.triton.max_block: + continue + max_block = config.triton.max_block[tree.prefix.upper()] + # Optional optimization: if block divides numel exactly, we will + # never need to do a masked load to handle stragglers at the end. + # It's faster to avoid masking at all. But it is sound to always + # mask. 
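+            # For example, if the configured max XBLOCK were 1024 and xnumel were
+            # 4096, every candidate power-of-2 XBLOCK divides xnumel evenly, so
+            # xmask can be dropped without risking out-of-bounds accesses.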
+ if V.graph.sizevars.statically_known_multiple_of(tree.numel, max_block): # type: ignore[arg-type] + mask_vars.discard(f"{tree.prefix}mask") + + def var_ranges(self): + return dict( + itertools.chain.from_iterable( + tree.var_ranges.items() for tree in self.range_trees + ) + ) + + def codegen_indexing(self, expr: sympy.Expr): + expr = V.graph.sizevars.simplify_with_ranges(expr, self.var_ranges()) + for sym in sorted(expr.free_symbols, key=str): + if sym in self.range_tree_nodes: + # if indexing expression is complicated, we precompute it on the host side + # and send the result as a kernel argument + replacements = {} + for ps in self.range_tree_nodes[sym].precomputed_args(): # type: ignore[index] + replacements[ps] = V.graph.sizevars.lookup_precomputed_size(ps) + if len(replacements) > 0: + self.range_tree_nodes[sym].expr = sympy_subs( # type: ignore[index] + self.range_tree_nodes[sym].expr, replacements # type: ignore[index] + ) + self.range_tree_nodes[sym].codegen() # type: ignore[index] + return expr + + @contextlib.contextmanager + def mask_loads(self, mask): + """Context manager to add an additional mask to tl.load/store""" + prior = self._load_mask + if prior: + mask = self.cse.generate(self.compute, f"{mask} & {prior}") + + self._load_mask = mask + try: + # TODO(jansel): do we need a reshape here? + yield mask + finally: + self._load_mask = prior + + def generate_assert(self, check): + return torch.version.hip is None and super().generate_assert(check) + + def load_mask(self, var): + mask = "" + mask_vars = set(var.mask_vars) + if self._load_mask: + mask_vars.add(self._load_mask) + + if mask_vars: + mask = ( + f"{next(iter(mask_vars))}" + if len(mask_vars) == 1 + else f"({' & '.join(str(v) for v in mask_vars)})" + ) + return mask + + @property + def assert_function(self) -> str: + return "tl.device_assert" + + def get_strides_of_load(self, index: sympy.Expr): + """ + This gets the stride of the index for each of the tiling variables + (technically, it does it at index 0) + + For example, if + xindex = x0 + 512*x1 + 1024*r0 + x0 = (xindex//512) + x1 = (xindex % 512) + r0 = rindex // 1024 + + this function would return + {xindex: 512, rindex: 1024} + """ + index_to_tile_indexes = {k: v.expr for k, v in self.range_tree_nodes.items()} + index_in_tile_vars = sympy_subs(index, index_to_tile_indexes) # type: ignore[arg-type] + strides = {} + for range_tree in self.range_trees: + s = sympy_index_symbol(range_tree.name) + strides[s] = sympy_subs(index_in_tile_vars, {s: 1}) - sympy_subs( + index_in_tile_vars, {s: 0} + ) + return strides + + def codegen_block_ptr( + self, name: str, var: str, indexing: BlockPtrOptions, other="" + ) -> Tuple[str, Optional[DeferredLine], str]: + advance_block_ptr = None + check = indexing.boundary_check() + if not check: + # workaround https://github.com/openai/triton/issues/2813 + other = "" + elif other: + assert other == ", other=0.0" + other = f", boundary_check={check!r}, padding_option='zero'" + else: + other = f", boundary_check={check!r}" + if ( + self.inside_reduction + and self.range_trees[-1].is_loop + and indexing.has_rindex() + ): + block_ptr = f"block_ptr{next(self.block_ptr_id)}" + self.body.writeline( + DeferredLine( + name, f"{block_ptr} = {indexing.format(var, roffset=False)}" + ) + ) + advance_block_ptr = DeferredLine( + name, + f"{block_ptr} = tl.advance({block_ptr}, {indexing.advance_roffset()})", + ) + else: + block_ptr = indexing.format(var) + return block_ptr, advance_block_ptr, other + + def codegen_block_ptr_store_line(self, name, 
indexing, block_ptr, value, other=""): + # broadcasting is not implicit for block_ptrs + value = ( + f"tl.broadcast_to({value}, {self.index_to_str(indexing.reshape_suffix)})" + ) + # drop any extra size=1 dimensions + value = triton_reshape(value, indexing.reshape_suffix, indexing.block_shape) + # workaround https://github.com/openai/triton/issues/2814 + value = f"{value}.to({triton_store_type(V.graph.get_dtype(name))})" + return f"tl.store({block_ptr}, {value}{other})" + + def load(self, name: str, index: sympy.Expr): + var = self.args.input(name) + indirect_indexing = self.is_indirect_indexing(index) + original_index = index + indexing = self.indexing(index, block_ptr=True) + has_rindex = indexing.has_rindex() + has_tmpmask = indexing.has_tmpmask() + + # Keep the variable in cache if were going to reuse it. Equiv., if any of the following hold + # 1) We are doing broadcasting + # 2) It is a non-coalesced load. The intuition is that if it's + # non-coalesced, we will likely load each element multiple times in + # practice. + # 3) It will be used later and it won't be CSE'd. Equiv., if all the following hold + # 3.1) We are in a reduction loop + # 3.2) Its not its last use + # 3.3) This load will not be lifted to the body + # + is_coalesced = any( + i == 1 for i in self.get_strides_of_load(original_index).values() + ) + if self.is_broadcasted(original_index): + ep = ", eviction_policy='evict_last'" + elif not is_coalesced: + ep = ", eviction_policy='evict_last'" + elif self.inside_reduction and self.range_trees[-1].is_loop: + if name in self.args.inplace_buffers: + names = set(self.args.inplace_buffers[name].other_names) + else: + names = {name} + last_use = len(names & self.last_usage) > 0 + evict_last = not last_use and (has_rindex or indirect_indexing) + if evict_last: + ep = ", eviction_policy='evict_last'" + else: + ep = ", eviction_policy='evict_first'" + else: + ep = "" + # "other" below is a workaround for https://github.com/openai/triton/issues/737 + # for bool, even though it's likely subject to the same bug, setting `other` leads + # to LLVM errors so we are skipping it for now + if ( + (has_tmpmask or has_rindex) + and V.graph.get_dtype(name) != torch.bool + and indexing.has_mask() + ): + other = ", other=0.0" + else: + other = "" + + advance_block_ptr = None + append_broadcast = None + if V.graph.is_unspec_arg(name): + line = var + else: + if isinstance(indexing, BlockPtrOptions): + block_ptr, advance_block_ptr, other = self.codegen_block_ptr( + name, var, indexing, other + ) + line = f"tl.load({block_ptr}{other}{ep})" + # add needed size=1 dimensions + line = triton_reshape( + line, indexing.block_shape, indexing.reshape_suffix + ) + elif isinstance(original_index, sympy.Integer): + line = f"tl.load({var} + ({original_index}))" + append_broadcast = indexing.expand_str + else: + line = f"tl.load({var} + ({indexing.index_str}), {indexing.mask_str}{ep}{other})" + + dtype = V.graph.get_dtype(name) + if dtype in (torch.float16, torch.bfloat16): + line += ".to(tl.float32)" + if dtype == torch.bool and torch.version.hip is None: + # Workaround for https://github.com/openai/triton/issues/2151 + # tl.load returns int8 when loading from pointer to int1 + # NOTE: Currently causes hangs on bool UTs for ROCm + line += ".to(tl.int1)" + + if has_tmpmask: + # Masked loads must come after the mask is computed + load_buffer = self.compute + elif ( + self.inside_reduction + and self.range_trees[-1].is_loop + and not indirect_indexing + and not has_rindex + ): + # can lift a common load outside 
of reduction loop + # One exception is when this is an indirect_load. + load_buffer = self.body + else: + load_buffer = self.loads + + result_var = self.cse.generate(load_buffer, line) + assert isinstance(result_var, TritonCSEVariable) + result_var.mask_vars = indexing.mask_vars # type: ignore[assignment] + + if append_broadcast: + line = f"tl.broadcast_to({result_var}, {append_broadcast})" + result_var = self.cse.generate(load_buffer, line) + + if advance_block_ptr: + load_buffer.writeline(advance_block_ptr) + + if not self.inside_reduction or (not indexing.has_rmask() and not has_rindex): + self.outside_loop_vars.add(result_var) + + return result_var + + def store( + self, name: str, index: sympy.Expr, value: CSEVariable, mode: StoreMode = None + ) -> None: + var = self.args.output(name) + original_index = index + indexing = self.indexing(index, dense_indexing=True, block_ptr=mode is None) + + # Guard against write-after-read corruption in triton. + # See # https://github.com/openai/triton/issues/1615 + # This triton bug means that a load which is broadcasted over multiple + # warps may see the result of a store that happens later in the triton + # program. The workaround is to add a barrier before storing, which + # enforces that all warps have already read the data. + is_inplace = name in self.args.inplace_buffers + is_broadcasted = self.is_broadcasted(original_index) + if is_inplace and is_broadcasted: + self.stores.writeline(DeferredLine(name, "tl.debug_barrier()")) + + advance_block_ptr = None + if isinstance(indexing, BlockPtrOptions): + block_ptr, advance_block_ptr, other = self.codegen_block_ptr( + name, var, indexing + ) + # block_ptr stores don't do implicit casting + line = self.codegen_block_ptr_store_line( + name, indexing, block_ptr, value, other + ) + elif mode is None: + line = f"tl.store({var} + ({indexing.index_str}), {value}, {indexing.mask_str})" + elif mode == "atomic_add": + line = f"tl.atomic_add({var} + ({indexing.index_str}), {value}, {indexing.mask_str})" + else: + raise NotImplementedError(f"store mode={mode}") + self.stores.writeline(DeferredLine(name, line)) + if advance_block_ptr: + self.stores.writeline(advance_block_ptr) + + if not self.inside_reduction: + self.outside_loop_vars.add(value) + + def bucketize( + self, + values: CSEVariable, + offsets_name: str, + offsets_size: sympy.Expr, + indexing_dtype: torch.dtype, + right: bool, + ) -> CSEVariable: + """ + See [Note: Inductor bucketize op] + """ + + # Triton performance for bucketize_binary_search is much better when the number + # of threads equals the number of elements. + # If we're trying to use a bucketize kernel, we should make sure that an + # autotuning config with num_elements_per_warp=32 exists. 
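+        # The helper emitted below is assumed to do a per-element binary search of
+        # `values` against the sorted `offsets` boundaries (roughly torch.bucketize
+        # semantics), with `right` selecting which side is taken on an exact
+        # boundary match.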
+ self.autotune_hints.add(AutotuneHint.ELEMENTS_PER_WARP_32) + + offsets_ptr = self.args.input(offsets_name) + block_size = self.dense_size_str() + offsets_size_str = self.index_to_str(offsets_size) + + if indexing_dtype == torch.int32: + triton_dtype = "tl.int32" + elif indexing_dtype == torch.int64: + triton_dtype = "tl.int64" + else: + raise NotImplementedError( + "Bucketize only supports indexing with int32 and int64" + ) + + result = self.cse.generate( + self.compute, + f"triton_helpers.bucketize_binary_search({values}, {offsets_ptr}, {triton_dtype}, {right}, {offsets_size_str}, {block_size})", # noqa: B950 line too long + ) + + return result + + def reduction_resize(self, value): + ndims = self.triton_tensor_ndim() + if ndims == 1: + return f"triton_helpers.promote_to_tensor({value})" + + sizes = [":"] * ndims + sizes[-1] = "None" + return f"{value}[{', '.join(sizes)}]" + + @staticmethod + def _map_tuple_or_scalar(fn, value): + if isinstance(value, tuple): + return tuple(map(fn, value)) + return fn(value) + + def reduction( + self, + dtype: torch.dtype, + src_dtype: torch.dtype, + reduction_type: ReductionType, + value: Union[CSEVariable, Tuple[CSEVariable, ...]], + ) -> Union[CSEVariable, Tuple[CSEVariable, ...]]: + assert self.inside_reduction + masks = {f"{tree.prefix}mask" for tree in self.range_trees} + self.filter_masks(masks) + masks = sorted(masks) + if self._load_mask: + masks.append(self._load_mask) + reduction_range_prefix = self.range_trees[-1].prefix + + # Say we have + # tmp0 = ops.constant(1, torch.int64) + # tmp1 = ops.reduction(torch.int64, torch.int64, "sum", tmp0) + # tmp0 in the triton code is either a scalar, or single-element tensor + # so if we emit tl.sum directly, it will only give 1 instead of RBLOCK * 1 + # To avoid this, we broadcast to the expected shape first. 
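+        # e.g. broadcasting the scalar 1 to [XBLOCK, RBLOCK] first means the later
+        # tl.sum over the reduction dim yields RBLOCK (one contribution per
+        # reduction element) instead of 1.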
+ dense_size_str = self.dense_size_str() + value = self._map_tuple_or_scalar( + lambda v: self.cse.generate( + self.compute, f"tl.broadcast_to({v}, {dense_size_str})" + ), + value, + ) + + dim: int + root_op: str + + def final_reduction(value): + use_helper = reduction_type in {"any", "max", "min", "prod"} + module = "triton_helpers" if use_helper else "tl" + if reduction_type in {"max", "min"}: + return self.reduction_resize( + f"{module}.{reduction_type}2({value}, {dim})" + ) + return self.reduction_resize(f"{module}.{reduction_type}({value}, {dim})") + + def final_argreduce(buffer, result_var, value, index): + buffer.splice( + f"""\ + _, {result_var}_tmp = triton_helpers.{root_op}_with_index({value}, {index}, {dim}) + {result_var} = {self.reduction_resize(f'{result_var}_tmp')} + """ + ) + + cache_key = (src_dtype, reduction_type, value) + if cache_key in self.cse.reduction_cache: + return self.cse.reduction_cache[cache_key] + + dim = self.triton_tensor_ndim() - 1 + acc_type = triton_acc_type(src_dtype) + result_var: Any = self.cse.newvar() + result_var.mask_vars = {var for var in masks if var[0] != "r"} + cond = " & ".join(masks) + + def where_cond(tval, fval): + if not cond: + return tval + return TritonKernelOverrides.where(cond, tval, fval) + + if self.persistent_reduction: + default = ir.Reduction.default_value(reduction_type, src_dtype) + default = self._map_tuple_or_scalar(triton_constant, default) + + def _mask_value(value, default): + return self.cse.generate(self.compute, where_cond(value, default)) + + if isinstance(value, tuple): + masked_value = [_mask_value(v, d) for v, d in zip(value, default)] + else: + masked_value = _mask_value(value, default) + + if reduction_type in {"argmax", "argmin"}: + accumulator_index = str( + self.cse.generate( + self.compute, + f"tl.broadcast_to({reduction_range_prefix}index, {masked_value}.shape)", + ) + ) + root_op = {"argmax": "max", "argmin": "min"}[reduction_type] + final_argreduce( + self.compute, result_var, masked_value, accumulator_index + ) + elif reduction_type == "welford_reduce": + # For persistent reductions, don't bother with + # welford's algorithm since it uses more registers, and + # taking two reductions doesn't increase memory usage. 
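+            # Instead, take two plain "sum" reductions: mean = sum(x) / rnumel and
+            # m2 = sum((x - mean) ** 2), producing the same (mean, m2, weight)
+            # triple with weight == rnumel.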
+ sum_ = ops.reduction(dtype, dtype, "sum", value) + self.inside_reduction = False + rnumel = ops.index_expr(self.numels[-1], dtype) + mean = ops.truediv(sum_, rnumel) + + self.inside_reduction = True + dx = ops.sub(value, mean) + dx2 = ops.mul(dx, dx) + m2 = ops.reduction(dtype, dtype, "sum", dx2) + result_var = (mean, m2, rnumel) + elif reduction_type == "welford_combine": + mean, m2, weight = masked_value + welford = f"triton_helpers.welford({mean}, {m2}, {weight}, {dim})" + mean, m2, weight = (self.cse.newvar() for _ in range(3)) + self.compute.writeline(f"{mean}, {m2}, {weight} = {welford}") + + result_var = tuple( + self.cse.generate(self.compute, self.reduction_resize(var_name)) + for var_name in (mean, m2, weight) + ) + else: + result_var = self.cse.generate( + self.compute, final_reduction(masked_value) + ) + else: + accumulator = f"_{result_var}" + default = ir.Reduction.default_accumulator(reduction_type, src_dtype) + default = self._map_tuple_or_scalar(triton_constant, default) + if not isinstance(default, tuple): + self.body.writeline( + f"{accumulator} = tl.full({self.dense_size_str()}, {default}, {acc_type})" + ) + + if reduction_type in {"argmax", "argmin"}: + accumulator_index = f"_{result_var}_index" + long_max = torch.iinfo(torch.int64).max + self.body.writeline( + f"{accumulator_index} = tl.full({self.dense_size_str()}, {long_max}, tl.int64)" + ) + root_op = {"argmax": "max", "argmin": "min"}[reduction_type] + + self.compute.splice( + f"""\ + {accumulator}_next, {accumulator_index}_next = triton_helpers.{root_op}imum_with_index( + {accumulator}, {accumulator_index}, {value}, {reduction_range_prefix}index + ) + {accumulator} = {where_cond(f'{accumulator}_next', accumulator)} + {accumulator_index} = {where_cond(f'{accumulator_index}_next', accumulator_index)} + """ + ) + final_argreduce(self.suffix, result_var, accumulator, accumulator_index) + elif is_welford_reduction(reduction_type): + accumulator = f"{result_var}_mean" + accumulator_m2 = f"{result_var}_m2" + accumulator_weight = f"{result_var}_weight" + self.body.writeline( + f"{accumulator} = tl.zeros({self.dense_size_str()}, {acc_type})" + ) + self.body.writeline( + f"{accumulator_m2} = tl.zeros({self.dense_size_str()}, {acc_type})" + ) + self.body.writeline( + f"{accumulator_weight} = tl.zeros({self.dense_size_str()}, {acc_type})" + ) + + if reduction_type == "welford_combine": + mean, m2, weight = value + self.compute.splice( + f"""\ + {accumulator}_next, {accumulator_m2}_next, {accumulator_weight}_next = triton_helpers.welford_combine( + {accumulator}, {accumulator_m2}, {accumulator_weight}, + {mean}, {m2}, {weight} + ) + """ + ) + else: + assert reduction_type == "welford_reduce" + self.compute.splice( + f"""\ + {accumulator}_next, {accumulator_m2}_next, {accumulator_weight}_next = triton_helpers.welford_reduce( + {value}, {accumulator}, {accumulator_m2}, {accumulator_weight}, roffset == 0 + ) + """ + ) + + self.compute.splice( + f"""\ + {accumulator} = {where_cond(f'{accumulator}_next', accumulator)} + {accumulator_m2} = {where_cond(f'{accumulator_m2}_next', accumulator_m2)} + {accumulator_weight} = {where_cond(f'{accumulator_weight}_next', accumulator_weight)} + """ + ) + + result_mean = result_var + result_m2 = self.cse.newvar() + result_weight = self.cse.newvar() + self.suffix.splice( + f"""\ + {result_mean}_tmp, {result_m2}_tmp, {result_weight}_tmp = triton_helpers.welford( + {accumulator}, {accumulator_m2}, {accumulator_weight}, {dim} + ) + {result_mean} = {self.reduction_resize(f'{result_mean}_tmp')} + 
{result_m2} = {self.reduction_resize(f'{result_m2}_tmp')} + {result_weight} = {self.reduction_resize(f'{result_weight}_tmp')} + """ + ) + result_var = result_mean, result_m2, result_weight + else: + combine_fn = ir.get_reduction_combine_fn(reduction_type, src_dtype) + updated = combine_fn(accumulator, value) + self.compute.writeline( + f"{accumulator} = {where_cond(updated, accumulator)}" + ) + + if src_dtype == torch.bool: + # This is only really used for aten.any. It changes the + # final reduction of a non-persistent reduction from + # tmp5 = triton_helpers.max(_tmp5, 1)[:, None] + # to + # tmp5 = triton_helpers.max(_tmp5.to(tl.int8), 1)[:, None].to(tl.int1) + # which is needed because tl.reduce doesn't support tl.int1 + accumulator = f"{accumulator}.to(tl.int8)" + result_type = triton_compute_type(dtype) + self.suffix.writeline( + f"{result_var} = {final_reduction(accumulator)}.to({result_type})" + ) + else: + self.suffix.writeline( + f"{result_var} = {final_reduction(accumulator)}" + ) + + self.cse.reduction_cache[cache_key] = result_var + + if isinstance(result_var, tuple): + self.outside_loop_vars |= set(result_var) + else: + self.outside_loop_vars.add(result_var) + + return result_var + + def store_reduction(self, name: str, index: sympy.Expr, value: CSEVariable): + assert self.inside_reduction + self.inside_reduction = False + indexing = self.indexing(index, block_ptr=True) + self.inside_reduction = True + var = self.args.output(name) + + if isinstance(indexing, BlockPtrOptions): + self.suffix.writeline( + DeferredLine( + name, + self.codegen_block_ptr_store_line( + name, + indexing, + indexing.format(var), + value, + f", boundary_check={indexing.boundary_check()!r}", + ), + ) + ) + else: + assert isinstance(indexing, IndexingOptions) + self.suffix.writeline( + DeferredLine( + name, + f"tl.store({var} + ({indexing.index_str}), {value}, {indexing.mask_str})", + ) + ) + + def _lift_helper(self, fn, num_args) -> str: + # Lift IR function into a triton function in the global namespace + helper = IndentedBuffer() + helper.writeline("@triton.jit") + args = [f"arg{n}" for n in range(num_args)] + signature = ", ".join(args) + helper.writeline(f"def {{name}}({signature}):") + + cse = CSE(prefix="", suffix="") + overrides = TritonOverrides(V.MockHandler()) + + class CSEProxy: + def __getattr__(self, name: str) -> Callable[..., CSEVariable]: + def inner(*args, **kwargs): + return cse.generate( + helper, + getattr(overrides, name)(*args, **kwargs), + ) + + return inner + + with helper.indent(), V.set_ops_handler(CSEProxy()): + outputs = fn(*args) + helper.writeline(f"return {outputs}") + + return self.helper_functions.add(helper.getvalue()) + + def scan( + self, + dtype: torch.dtype, + combine_fn: Callable[[CSEVariable, CSEVariable], CSEVariable], + value: CSEVariable, + init: int, + ) -> CSEVariable: + assert self.inside_reduction + masks = {f"{tree.prefix}mask" for tree in self.range_trees} + self.filter_masks(masks) + masks = sorted(masks) + if self._load_mask: + masks.append(self._load_mask) + reduction_range_prefix = self.range_trees[-1].prefix + + value = self.cse.generate( + self.compute, f"tl.broadcast_to({value}, {self.dense_size_str()})" + ) + + default = triton_constant(init) + dim = self.triton_tensor_ndim() - 1 + acc_type = triton_acc_type(dtype) + cond = " & ".join(masks) + + combine_helper_fn = self._lift_helper(combine_fn, 2) + + def where_cond(value): + if not cond: + return value + default_tensor = self.cse.generate( + self.body, + f"tl.full({[1] * 
self.triton_tensor_ndim()}, {default}, {triton_compute_type(dtype)})", + ) + return self.cse.generate( + self.compute, f"tl.where({cond}, {value}, {default_tensor})" + ) + + if self.persistent_reduction: + masked_value = where_cond(value) + result_var = self.cse.generate( + self.compute, + f"tl.associative_scan({masked_value}, {dim}, {combine_helper_fn})", + ) + else: + accumulator = self.cse.newvar() + reduced_size = self.dense_size_list() + reduced_size[-1] = "1" + reduced_size = f"[{', '.join(reduced_size)}]" + + self.body.writeline( + f"{accumulator} = tl.full({reduced_size}, {default}, {acc_type})" + ) + + masked_value = where_cond(value) + partial_reduce = self.cse.generate( + self.compute, + self.reduction_resize( + f"tl.reduce({value}, {dim}, {combine_helper_fn})" + ), + ) + acc_next = combine_fn(accumulator, partial_reduce) + partial_scan = self.cse.generate( + self.compute, + f"tl.associative_scan({masked_value}, {dim}, {combine_helper_fn})", + ) + result_var = self.cse.generate( + self.compute, combine_fn(accumulator, partial_scan) + ) + self.compute.writeline(f"{accumulator} = {acc_next}") + + result_var.mask_vars = masks # type: ignore[attr-defined] + return result_var + + def codegen_body(self): + """ + Concat output code from index_code, loads, compute, stores, + suffix into self.body. + + For pointwise kernels, this is called just once at the end. + + For reduction kernels, this generates a loop over the reduction + axis. + """ + if not ( + self.indexing_code + or self.loads + or self.stores + or self.compute + or self.suffix + ): + return + + if self.inside_reduction and self.range_trees[-1].is_loop: + self.body.writeline("for roffset in range(0, rnumel, RBLOCK):") + with self.body.indent(): + # last range tree is always reduction + self.range_trees[-1].codegen_header(self.body) + self.body.splice(self.indexing_code) + self.body.splice(self.loads) + self.body.splice(self.compute) + self.body.splice(self.stores) + + # invalidate any caches that came from inside the reduction loop + self.cse.invalidate(self.outside_loop_vars) + self.range_trees[-1].cache_clear() + else: + self.body.splice(self.indexing_code) + self.body.splice(self.loads) + self.body.splice(self.compute) + self.body.splice(self.stores) + self.body.splice(self.suffix) + self.indexing_code.clear() + self.loads.clear() + self.compute.clear() + self.stores.clear() + self.suffix.clear() + + def codegen_kernel_benchmark(self, num_gb, grid=None): + result = IndentedBuffer() + argdefs, call_args, signature = self.args.python_argdefs() + + result.writelines(["", "", "def get_args():"]) + with result.indent(): + name_cnt = itertools.count() + var_names = [] + for arg_name, arg_sig in zip(call_args, signature): + var_name = f"arg_{next(name_cnt)}" + buf = V.graph.get_buffer(arg_name) + if buf: + result.writeline( + f"{var_name} = rand_strided({V.graph.sizevars.size_hints(buf.get_size())}, {V.graph.sizevars.size_hints(buf.get_stride())}, device='{buf.get_device()}', dtype={buf.get_dtype()})" # noqa: B950 line too long + ) + elif arg_name in V.graph.constants: + # note that random seed is put in V.graph.constants + const_tensor = V.graph.constants[arg_name] + result.writeline( + f"{var_name} = rand_strided({V.graph.sizevars.size_hints(const_tensor.size())}, {V.graph.sizevars.size_hints(const_tensor.stride())}, device='{const_tensor.device}', dtype={const_tensor.dtype})" # type: ignore[arg-type] # noqa: B950 line too long + ) + elif isinstance(arg_sig, SizeArg): + symval_hint = V.graph.sizevars.size_hint(arg_sig.expr) + + 
# Force the seed_offset to be 0 so calls to the same kernel + # using different seed offset will have the same benchmark harness. + # We can dedup kernel definitions in this case. + if "seed_offset" in arg_sig.name: + symval_hint = 0 + result.writeline(f"{var_name} = {symval_hint}") + else: + raise KeyError( + f"Don't find the buffer or const tensor for {arg_name}" + ) + var_names.append(var_name) + result.writeline(f"return {', '.join(var_names)},") + + result.writelines(["\n", "\n", "def call(args):"]) + if grid is None: + grid = [] + extra_args = [] + extra_args_str = None + for tree in self.active_range_trees(): + expr = pexpr(V.graph.sizevars.size_hint(tree.numel)) + extra_args.append(expr) + if tree.prefix != "r": + grid.append(expr) + if self.need_numel_args(): + extra_args_str = ", ".join(map(str, extra_args)) + ", " + else: + extra_args_str = "" + grid_arg = f"{extra_args_str}grid=grid({', '.join(grid)})" + else: + grid_arg = f"grid={grid}" + index = V.graph.scheduler.current_device.index + with result.indent(): + result.writeline(f"with {V.graph.device_ops.device_guard(index)}:") + with result.indent(): + result.writeline( + V.graph.device_ops.set_device(index) + ) # no-op to ensure context + stream_name = f"stream{index}" + result.writeline(f"{stream_name} = get_raw_stream({index})") + result.writeline( + f"{str(Placeholder.KERNEL_NAME)}.run(*args, {grid_arg}, stream={stream_name})" + ) + + # benchmark all configs + result.writelines(["\n", "\n", "def benchmark_all_configs(args):"]) + with result.indent(): + result.writeline(f"with {V.graph.device_ops.device_guard(index)}:") + with result.indent(): + result.writeline( + V.graph.device_ops.set_device(index) + ) # no-op to ensure context + result.writeline( + f"return {str(Placeholder.KERNEL_NAME)}.benchmark_all_configs(*args, {grid_arg})" + ) + + result.writelines(["\n", "\n", "if __name__ == '__main__':"]) + with result.indent(): + result.writeline("from triton.testing import do_bench") + result.writeline("") + + result.writeline("args = get_args()") + result.writeline( + "ms = do_bench(lambda: call(args), rep=40, fast_flush=True)" + ) + result.writeline(f"num_gb = {num_gb}") + result.writeline("gb_per_s = num_gb / (ms / 1e3)") + result.writeline( + 'print(f"{ms:.3f}ms {num_gb:.3f}GB {gb_per_s:.2f}GB/s")' + ) + + return result + + def imports_for_benchmark_kernel(self): + return textwrap.dedent( + """ + from torch._dynamo.testing import rand_strided + {} + import torch + from torch._inductor.triton_heuristics import grid, split_scan_grid + """.format( + V.graph.device_ops.import_get_raw_stream_as("get_raw_stream") + ) + ) + + def estimate_kernel_num_bytes(self): + """ + Try the best to estimate the total size (in bytes) of the + kernel's inputs and outputs, which is used for estimating the memory + throughput of this kernel. This information is used for checking how + far we are from the peak memory bandwidth. It's important that + we want to avoid overestimating the sizes of the inputs and outputs, + because it can wrongfully give us a very large memory traffic value, + which may be even larger than the theoretical bandwidth and thus + become very misleading. This is particularly problematic for cases + where we slice some inputs. In those cases, we should only count + the size of the "slices" instead of the original inputs, because + only the slices contribute to the real memory traffic. 
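+
+        For example (illustrative numbers): a pointwise kernel that reads one
+        1M-element fp32 buffer and writes another is counted as roughly
+        2 * 1e6 * 4 bytes, and an in-place argument is counted twice because
+        it is both read and written.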
+ """ + nbytes = [] + ninplace_args = len(unique(self.args.inplace_buffers.values())) + _, call_args, _ = self.args.python_argdefs() + + # For pointwise and reduction kernels, this is the upper-bound numels + # for the output buffer. + # FIXME: This is not exactly right for cases like below: + # def foo(tensor0, tensor1): + # x0 = narrow(tensor0) + # return cat(x0, tensor1) + # For this example, we will end up overestimate the size for the + # slice s0. Potentially, we could have precise inputs information + # if we maintained the original inputs of the Pointwise kernel created + # for the "cat". However, I think it might be a bit overwhelming that + # we add such complexity only for handling some particular cases for + # benchmarking. + out_numel = V.graph.sizevars.size_hint(sympy_product(self.numels)) + for i, arg in enumerate(call_args): + # "buf" may be narrowed. In this case, the number of memory accesses + # should be estimated based on the reinterpreted layout. + # On the other hand, buf may be broadcasted. In this case, + # counting the size of the underline storage would give us + # a better estimation in terms of memory accesses. + if arg not in self.buf_accesses: + nbytes.append(0) + continue + arg_numel = V.graph.get_numel(arg) + buf_size = V.graph.sizevars.size_hint(arg_numel) + if buf_size > out_numel: + # This arg points to a buf that has been sliced. + # We need to count each individual slice to have + # a better estimation. + indices: Set[Any] = set() + no_index_dep_count = 0 + for dep in self.buf_accesses[arg]: + if isinstance(dep, (StarDep, WeakDep)): + indices.add(f"no_index_dep_{no_index_dep_count}") + no_index_dep_count += 1 + else: + indices.add(dep.index) + numel = len(indices) * out_numel + else: + numel = buf_size + dtype = V.graph.get_dtype(arg) + dtype_size = get_dtype_size(dtype) + nbytes.append(numel * dtype_size * (1 + int(i < ninplace_args))) + return sum(nbytes) + + def _get_heuristic(self): + if self.persistent_reduction: + assert self.inside_reduction + return "persistent_reduction" + elif self.inside_reduction: + return "reduction" + return "pointwise" + + def codegen_kernel(self, name=None): + code = IndentedBuffer() + + size_hints = [] + for numel in self.numels: + numel_hint = V.graph.sizevars.symbolic_hint(numel) + if not isinstance(numel_hint, (int, sympy.Integer)): + # This default heuristic hint was picked carefully: it is + # large, to ensure that we don't shrink the block size (since + # if you don't have many elements, it'd be wasteful to pick a + # large block size). Since we don't know how many elements we + # might have, we should be OK with some inefficiency to make + # sure we handle the large case well. 8192 is the largest + # block size we support, so we pick that. + # + # If we have a better hint for unbacked SymInts (e.g., because + # a user told us, or we are tracking upper bounds) we could + # use that here. 
+ size_hint = 8192 + else: + size_hint = next_power_of_2(int(numel_hint)) + size_hints.append(size_hint) + + if not self.inside_reduction: + size_hints.pop() + + heuristics = self._get_heuristic() + + if name is None: + code.splice(gen_common_triton_imports()) + + if config.benchmark_kernel: + code.splice(self.imports_for_benchmark_kernel()) + + argdefs, _, signature = self.args.python_argdefs() + # maps actual expression to SizeArg if it is in sizevars replacements + for i, arg in enumerate(signature): + if isinstance(arg, SizeArg): + # mypy is unhappy about the sympy.Expr + # type for the key of the dict below + symbol = cast(sympy.Symbol, arg.expr) + if symbol in V.graph.sizevars.inv_precomputed_replacements: + signature[i] = SizeArg( + arg.name, V.graph.sizevars.inv_precomputed_replacements[symbol] + ) + + mutated_args = set() + for mutation in self.mutations: + if mutation in self.args.input_buffers: + mutated_args.add(self.args.input_buffers[mutation]) + if ( + mutation in self.args.inplace_buffers + and mutation not in V.graph.removed_buffers + and mutation not in self.removed_buffers + ): + mutated_args.add(self.args.inplace_buffers[mutation].inner_name) + if mutation in self.args.output_buffers: + mutated_args.add(self.args.output_buffers[mutation]) + mutated_args = sorted(mutated_args) + + triton_meta_signature = signature_to_meta( + signature, size_dtype=self.index_dtype + ) + triton_meta = { + "signature": triton_meta_signature, + "device": V.graph.scheduler.current_device.index, + "device_type": V.graph.scheduler.current_device.type, + "constants": {}, + } + + inductor_meta = { + "autotune_hints": set(self.autotune_hints), + "kernel_name": str(Placeholder.DESCRIPTIVE_NAME), + "mutated_arg_names": mutated_args, + "no_x_dim": self.no_x_dim, + "backend_hash": torch.utils._triton.triton_hash_with_backend(), + } + num_gb = None + if config.benchmark_kernel or config.profile_bandwidth: + num_gb = self.estimate_kernel_num_bytes() / 1e9 + inductor_meta["kernel_num_gb"] = num_gb + + for tree in self.active_range_trees(): + sizearg = SizeArg(f"{tree.prefix}numel", tree.numel) + signature.append(sizearg) + triton_meta_signature[len(argdefs)] = signature_of( + sizearg, size_dtype=self.index_dtype + ) + argdefs.append(f"{tree.prefix}numel") + # constexpr version causes issues, see + # https://github.com/pytorch/torchdynamo/pull/1362 + # triton_meta["constants"][len(argdefs)] = V.graph.sizevars.size_hint( + # tree.numel + # ) + # argdefs.append(f"{tree.prefix}numel: tl.constexpr") + triton_meta["configs"] = [config_of(signature)] + + # Triton compiler includes equal_to_1 args into constants even + # when they are not constexpr. otherwise there may be a segfault + # during launching the Inductor-compiled Triton kernel. 
+ # https://github.com/pytorch/pytorch/issues/120478#issuecomment-1962822307 + # https://github.com/openai/triton/blob/231efe9ed2d200be0f69a07c298e4342b08efe3d/python/triton/runtime/jit.py#L384 + for arg_num in triton_meta["configs"][0].equal_to_1: # type: ignore[index] + triton_meta["constants"][arg_num] = 1 # type: ignore[index] + + self.triton_meta = triton_meta + + for tree in self.range_trees: + if tree.prefix == "r" and self.persistent_reduction: + # RBLOCK for persistent_reduction is defined in codegen_static_numels + continue + if tree.tensor_dim is None: + continue + argdefs.append(f"{tree.prefix.upper()}BLOCK : tl.constexpr") + + self.codegen_body() + + for helper in self.helper_functions: + code.writeline("") + code.splice(helper) + + if self.inside_reduction: + reduction_hint = self.reduction_hint + heuristics_line = f""" + @triton_heuristics.{heuristics}( + size_hints={size_hints!r}, + reduction_hint={reduction_hint}, + filename=__file__, + triton_meta={triton_meta!r}, + inductor_meta={inductor_meta!r} + ) + @triton.jit + """ + else: + tile_hint = "" + if len(size_hints) == 2: + if len(signature) == 4: # input, output and 2 args + tile_hint = "tile_hint=TileHint.SQUARE," + else: + tile_hint = "tile_hint=TileHint.DEFAULT," + heuristics_line = f""" + @triton_heuristics.{heuristics}( + size_hints={size_hints!r}, {tile_hint} + filename=__file__, + triton_meta={triton_meta!r}, + inductor_meta={inductor_meta!r}, + min_elem_per_thread={self.min_elem_per_thread} + ) + @triton.jit + """ + code.splice(heuristics_line) + code.writeline( + f"def {name or str(Placeholder.KERNEL_NAME)}({', '.join(argdefs)}):" + ) + with code.indent(): + self.codegen_static_numels(code) + for old, new in self.args.aliases(): + code.writeline(f"{old} = {new}") + code.splice(self.body) + + if config.benchmark_kernel: + code.splice(self.codegen_kernel_benchmark(num_gb)) + + return code.getvalue() + + def codegen_static_numels(self, code): + """ + We get a small speedup from hard coding numels if they are static. + + This code stomps on the passed-in values by writing an constant to the top of the kernel. + + In a kernel like: + def KERNEL_NAME(in_ptr0, in_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr): + + We would add + xnumel = 4096 + rnumel = 768 + + After the signature, before the kernel code, if we decided to make these static. As its hardcoded, it becomes + a better signal to triton on how to unroll and do some static indexing. So, it's not so much that downstream + knows that its a static numel, as that you just plop a constant into the kernel. 
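+
+        For persistent reductions, RBLOCK itself is also pinned here (e.g.
+        rnumel = 768 yields RBLOCK: tl.constexpr = 1024 via next_power_of_2),
+        which is why the r tree is skipped when emitting the constexpr argdefs
+        in codegen_kernel.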
+ """ + for tree in self.range_trees: + if tree.prefix != "r" or self.inside_reduction: + simplified_tree_numel = V.graph.sizevars.simplify(tree.numel) + if isinstance(simplified_tree_numel, (sympy.Integer, int)): + code.writeline(f"{tree.prefix}numel = {int(simplified_tree_numel)}") + + if tree.prefix == "r" and self.persistent_reduction: + simplified_tree_numel = V.graph.sizevars.simplify(tree.numel) + if isinstance(simplified_tree_numel, (sympy.Integer, int)): + val = int(simplified_tree_numel) + else: + continue + val = next_power_of_2(val) + code.writeline(f"RBLOCK: tl.constexpr = {val}") + + if tree.prefix == "x" and self.no_x_dim: + code.writeline("XBLOCK: tl.constexpr = 1") + + def triton_tensor_ndim(self): + return sum(int(tree.tensor_dim is not None) for tree in self.range_trees) + + def indexing_size_str(self, i): + sizes = ["None"] * self.triton_tensor_ndim() + sizes[i] = ":" + return f"[{', '.join(sizes)}]" + + def dense_size_list(self) -> List[str]: + sizes = ["1"] * self.triton_tensor_ndim() + for tree in self.range_trees: + if tree.tensor_dim is None: + continue + + if tree.prefix != "r" or self.inside_reduction: + sizes[tree.tensor_dim] = f"{tree.prefix.upper()}BLOCK" + return sizes + + def dense_size_str(self): + sizes = self.dense_size_list() + return f"[{', '.join(sizes)}]" + + def _get_grid_fn(self): + return "grid" + + def add_numel_to_call_args_and_grid(self, name, call_args, grid): + # TODO(jansel): if there are constants, we shouldn't bother passing them as args + for tree in self.range_trees: + if isinstance(tree.numel, (sympy.Integer, sympy.Symbol)): + expr = tree.numel + else: + expr = V.graph.wrapper_code.generate_numel_expr(name, tree) + + if tree.prefix != "r" or self.inside_reduction: + call_args.append(expr) + if tree.grid_dim is not None: + grid.append(expr) + + def get_call_args(self): + _, call_args, _ = self.args.python_argdefs() + # dynamo wraps unspec variable as 0d CPU tensor, need convert to scalar + for i in range(len(call_args)): + if V.graph.is_unspec_arg(call_args[i]): + call_args[i] = call_args[i] + ".item()" + + return call_args + + def call_kernel(self, name: str, node: Optional[IRNode] = None): + wrapper = V.graph.wrapper_code + call_args = self.get_call_args() + grid: List[Any] = [] + self.add_numel_to_call_args_and_grid(name, call_args, grid) + current_device = V.graph.scheduler.current_device + + if self.args.workspace_arg is not None: + ws = self.args.workspace_arg + wrapper.generate_workspace_allocation( + ws.nbytes, current_device, ws.zero_fill + ) + + grid = wrapper.generate_default_grid(name, grid) + wrapper.generate_kernel_call( + name, + call_args, + grid, + current_device.index, + cuda=True, + triton=True, + grid_fn=self._get_grid_fn(), + triton_meta=self.triton_meta, + ) + + if self.args.workspace_arg is not None: + wrapper.writeline(wrapper.make_free_by_names(["workspace"])) + + def codegen_nan_check(self): + wrapper = V.graph.wrapper_code + _, call_args, arg_types = self.args.python_argdefs() + for arg, arg_type in zip(call_args, arg_types): + if isinstance(arg_type, TensorArg): + line = f"assert not {arg}.isnan().any().item()" + wrapper.writeline(line) + line = f"assert not {arg}.isinf().any().item()" + wrapper.writeline(line) + + def warn_mix_layout(self, kernel_name): + """ + Print message if the kernel have mixed layout inputs. + Only care about 4D tensor for now. 
+ """ + if ( + len(self.args.input_buffers) == 1 + and len(self.args.output_buffers) == 1 + and len(self.args.inplace_buffers) == 0 + ): + # even if input buffer and output buffer have different layout, + # this can be a layout conversion kernel. No need to warn for + # the mix layouts. + return + + argdefs, call_args, signature = self.args.python_argdefs() + uniform_stride_order = None + for arg_name in call_args: + buf = V.graph.get_buffer(arg_name) + if buf and len(buf.layout.size) == 4: + # ignore the tensor if only 1 dimension is non-zero + if len([x for x in buf.layout.size if x == 1]) == 3: + continue + stride_order = ir.get_stride_order(buf.layout.stride) + if uniform_stride_order is None: + uniform_stride_order = stride_order + elif uniform_stride_order != stride_order: + msg = yellow_text( + f"Expected stride order {uniform_stride_order}, but found stride order" + + f" {stride_order} for kernel {kernel_name}" + ) + log.warning(msg) + + stride_order_list = [ + ir.get_stride_order(V.graph.get_buffer(name).layout.stride) + if V.graph.get_buffer(name) + else None + for name in call_args + ] + size_list = [ + V.graph.get_buffer(name).layout.size + if V.graph.get_buffer(name) + else None + for name in call_args + ] + source_list = [ + "GraphInput" + if name in V.graph.graph_inputs + else "IntermediateBuffer" + if name in V.graph.name_to_buffer + else None + for name in call_args + ] + + msg = yellow_text( + f" param names {argdefs}\n buf names {call_args}\n strides {stride_order_list}" + + f"\n sizes {size_list}\n sources {source_list}\n" + ) + log.warning(msg) + return + msg = green_text( + f"All the inputs for the triton kernel {kernel_name} have uniform layout" + ) + log.warning(msg) + + def create_cse_var(self, *args, **kwargs): + return TritonCSEVariable(*args, **kwargs) + + +class TritonScheduling(BaseScheduling): + def __init__(self, scheduler): + self.scheduler = scheduler + + def group_fn(self, sizes): + return tuple(V.graph.sizevars.simplify(sympy_product(s)) for s in sizes) + + def can_fuse(self, node1, node2): + """ + Hook called by Scheduler to determine if the Triton backend + can fuse node1 and node2. These nodes might already be + FusedSchedulerNodes. + """ + if isinstance(node1, scheduler.ForeachKernelSchedulerNode) or isinstance( + node2, scheduler.ForeachKernelSchedulerNode + ): + return scheduler.ForeachKernelSchedulerNode.can_fuse(node1, node2) + + _, (numel1, rnumel1) = node1.group + _, (numel2, rnumel2) = node2.group + why = WhyNoFuse(node1, node2) + + if node1.is_split_scan() and not node2.is_split_scan(): + if node2.is_reduction(): + why("Split scan cannot fuse with reductions") + elif node2.is_split_scan() and not node1.is_split_scan(): + if node1.is_reduction(): + why("Split scan cannot fuse with reductions") + + if node1.is_reduction() and node2.is_reduction(): + reduction_can_fuse = numel1 == numel2 and rnumel1 == rnumel2 + if not reduction_can_fuse: + why( + "numel/rnumel mismatch (reduce) (%s, %s), (%s, %s)", + numel1, + numel2, + rnumel1, + rnumel2, + ) + return reduction_can_fuse + + if not node1.is_reduction() and not node2.is_reduction(): + if not (numel1 == numel2 and rnumel1 == rnumel2): + why( + "numel/rnumel mismatch (non-reduce) (%s, %s), (%s, %s)", + numel1, + numel2, + rnumel1, + rnumel2, + ) + return False + + if node1.is_template(): + # Only allow fusion for TritonTemplates for now. + # Fusion for CUDATemplates are not supported. 
+ is_triton_template = isinstance(node1.node, TritonTemplateBuffer) + if not is_triton_template: + why("node1 is not TritonTemplateBuffer") + return is_triton_template + + # check for a bad combined tiling + tiling1 = self.select_tiling(node1.get_nodes(), numel1, rnumel1) + tiling2 = self.select_tiling(node2.get_nodes(), numel1, rnumel1) + tiling3 = self.select_tiling( + node1.get_nodes() + node2.get_nodes(), numel1, rnumel1 + ) + if config.triton.tiling_prevents_pointwise_fusion: + cond = True + if len(tiling1) > 2: + if len(tiling2) > 2: + cond = tiling1 == tiling2 == tiling3 + else: + cond = tiling1 == tiling3 + elif len(tiling2) > 2: + cond = tiling2 == tiling3 + if not cond: + why( + "tiling mismatch (%s, %s, %s)", + tiling1, + tiling2, + tiling3, + ) + return False + + return True + + if not node1.is_reduction() and node2.is_reduction(): + assert rnumel1 == 1 and rnumel2 != 1 + if numel1 == numel2 * rnumel2: + if not all( + TritonKernel.is_compatible((numel2, rnumel2), n.get_ranges()) + for n in node1.get_nodes() + ): + why("nodes numel/rnumel incompatibility") + return False + if ( + config.triton.tiling_prevents_reduction_fusion + and not node1.is_template() + ): + is_reduction_tiling_valid = self.select_tiling( + node1.get_nodes(), numel1 + ) in ( + (numel1, 1), + (numel2, rnumel2, 1), + ) + if not is_reduction_tiling_valid: + why("invalid tiling for reduction") + return is_reduction_tiling_valid + return True + + if numel1 != numel2: + why("nodes numel incompatibility") + return numel1 == numel2 + + assert node1.is_reduction() and not node2.is_reduction() + # swap args to hit the case above + return self.can_fuse_horizontal(node2, node1) + + can_fuse_vertical = can_fuse + can_fuse_horizontal = can_fuse + + def generate_node_schedule(self, nodes, numel, rnumel): + node_schedule: List[Any] = [] + current_loop_writes: Set[str] = set() + + # Writes with a reduced shape, meaning they are only present once the + # reduction loop has ended + current_loop_reduced_writes = set() + current_loop_has_writes = False + done = set() + + def fits_in_main_body(n): + _, (node_numel, node_rnumel) = n.group + return (node_numel == numel and node_rnumel == rnumel) or ( + node_numel == numel * rnumel and node_rnumel == 1 + ) + + def fits_outside_reduction(n): + _, (node_numel, node_rnumel) = n.group + return node_numel == numel and node_rnumel == 1 and rnumel != 1 + + def schedule_node_in_loop(n): + nonlocal current_loop_has_writes + done.add(n) + node_schedule.append(n) + current_loop_has_writes = True + # A scan is modelled as a reduction in the scheduler but has a + # full sized output that can be used inside the loop body + if ( + n.is_reduction() + and isinstance(n, scheduler.SchedulerNode) + and isinstance(n.node, ir.ComputedBuffer) + and not isinstance(n.node.data, ir.Scan) + ): + current_loop_reduced_writes.add(n.get_name()) + + @contextlib.contextmanager + def end_current_reduction_loop(): + nonlocal current_loop_has_writes + if current_loop_has_writes: + # flush out any other runnable nodes to reduce number of loops + for other_node in nodes[index + 1 :]: + if ( + node not in done + and fits_in_main_body(other_node) + and not (current_loop_reduced_writes & other_node.ancestors) + ): + schedule_node_in_loop(node) + + if node_schedule and node_schedule[-1] is EnableReduction: + node_schedule.pop() + else: + node_schedule.append(DisableReduction) + yield + node_schedule.append(EnableReduction) + current_loop_reduced_writes.clear() + current_loop_has_writes = False + + for index, node in 
enumerate(nodes): + if node in done: + continue + done.add(node) + + def requires_closing_previous_reduction(node, node_schedule): + if rnumel == 1: + return False + if not current_loop_reduced_writes & node.ancestors: + return False + assert node_schedule and not isinstance( + node_schedule[-1], (EnableReduction, DisableReduction) + ) + return bool(current_loop_reduced_writes) + + if fits_in_main_body(node): + if requires_closing_previous_reduction(node, node_schedule): + with end_current_reduction_loop(): + pass # need to start a new reduction loop + + schedule_node_in_loop(node) + elif fits_outside_reduction(node): + with end_current_reduction_loop(): + node_schedule.append(node) + else: + raise NotImplementedError( + f"unexpected group: ({numel}, {rnumel}) != {node.group[1]}" + ) + + return node_schedule + + def codegen_nodes(self, nodes: List[scheduler.SchedulerNode]): + """ + Given a set of pre-fused nodes, generate a Triton kernel. + """ + _, (numel, rnumel) = max(nodes, key=lambda x: int(x.is_reduction())).group + + node_schedule = self.generate_node_schedule(nodes, numel, rnumel) + buf_accesses = collections.defaultdict(list) + for node in nodes: + for access in node.read_writes.reads | node.read_writes.writes: + buf_accesses[access.name].append(access) + + schedule_log.debug("Schedule:\n %s", node_schedule) + + return self.codegen_node_schedule(node_schedule, buf_accesses, numel, rnumel) + + @staticmethod + def reduction_hint(node): + assert node.is_reduction() + if all( + dep.is_contiguous() + for dep in itertools.chain(node.read_writes.reads, node.read_writes.writes) + ): + return ReductionHint.INNER + else: + return node.node.data.reduction_hint + + @staticmethod + def can_use_32bit_indexing( + numel: sympy.Expr, buffers: Iterable[Union[ir.Buffer, ir.TensorBox]] + ) -> bool: + int_max = torch.iinfo(torch.int32).max + size_hint = V.graph.sizevars.size_hint + has_hint = V.graph.sizevars.shape_env.has_hint + + def within_32bit(e): + # Allow for unhinted e as long as we can still statically prove + # (e.g., via ValueRanges) that it is still in bounds + if V.graph.sizevars.is_expr_static_and_true(e <= int_max): + return True + # Otherwise, the hint MUST exist and be in range + return has_hint(e) and size_hint(e) <= int_max + + if not within_32bit(numel): + return False + + # Any use of a MultiOutputLayout will create a buffer with a + # Layout whose sizes are accounted for + buf_sizes = [ + buf.get_layout().storage_size() + for buf in buffers + if not isinstance(buf.get_layout(), ir.MultiOutputLayout) + ] + + if not all(within_32bit(size) for size in buf_sizes): + return False + + # Only install guards for 32-bit indexing as there is no correctness + # issue with using 64-bit for everything + V.graph.sizevars.guard_leq(numel, int_max) # type: ignore[arg-type] + for size in buf_sizes: + V.graph.sizevars.guard_leq(size, int_max) # type: ignore[arg-type] + return True + + @staticmethod + def select_index_dtype(node_schedule, numel, reduction_numel): + # Gather all used buffer names + buffer_names = set() + for node in node_schedule: + if not isinstance(node, scheduler.BaseSchedulerNode): + continue + + buffer_names.update(node.get_names()) + buffer_names.update(node.used_buffer_names()) + + # Get buffers objects + def _get_buffer(name: str) -> Union[ir.Buffer, ir.TensorBox]: + if name in V.graph.name_to_buffer: + return V.graph.name_to_buffer[name] + elif name in V.graph.graph_inputs: + return V.graph.graph_inputs[name] + elif name in V.graph.constants: + data = 
V.graph.constants[name] + return ir.ConstantBuffer( + name, + ir.FixedLayout( + data.device, data.dtype, *V.graph.static_sizes_strides(data) + ), + ) + raise RuntimeError(f"Failed to find buffer matching name {name}") + + buffers = [_get_buffer(name) for name in buffer_names] + + # In theory we can separately check xnumel and rnumel are <= int_max + # but some indexers do use the full linear index so we need to be + # conservative here. + total_numel = numel * reduction_numel + + if TritonScheduling.can_use_32bit_indexing(total_numel, buffers): + return "tl.int32" + return "tl.int64" + + def get_kernel_args(self, node_schedule, numel, reduction_numel): + reductions = list( + filter( + lambda n: n not in (EnableReduction, DisableReduction) + and n.is_reduction(), + node_schedule, + ) + ) + if len(reductions) > 0: + hints = [self.reduction_hint(n) for n in reductions] + if hints.count(hints[0]) == len(hints): + reduction_hint_val = hints[0] + else: + reduction_hint_val = ReductionHint.DEFAULT + else: + reduction_hint_val = ReductionHint.DEFAULT + + mutations = set() + for node in node_schedule: + if hasattr(node, "get_mutations"): + mutations.update(node.get_mutations()) + + index_dtype = self.select_index_dtype(node_schedule, numel, reduction_numel) + + return reduction_hint_val, mutations, index_dtype + + def codegen_comment(self, node_schedule): + wrapper = V.graph.wrapper_code + origins, detailed_origins = get_kernel_metadata(node_schedule, wrapper) + if origins: + wrapper.writeline(origins) + + if config.debug_fusion: + from torch._inductor.scheduler import ( + BaseSchedulerNode, + ForeachKernelSchedulerNode, + ) + + if not any( + isinstance(n, ForeachKernelSchedulerNode) for n in node_schedule + ): + # We probably should look what are the nodes inside a foreach + # schedule node + node_names = [ + n.get_name() + for n in node_schedule + if isinstance(n, BaseSchedulerNode) + ] + wrapper.writeline( + f"{wrapper.comment} Fused node name list: {', '.join(node_names)}" + ) + + def codegen_node_schedule( + self, node_schedule, buf_accesses, numel, reduction_numel + ): + from torch._inductor.codegen.triton_split_scan import TritonSplitScanKernel + + tiled_groups = self.select_tiling(node_schedule, numel, reduction_numel) + reduction_hint_val, mutations, index_dtype = self.get_kernel_args( + node_schedule, numel, reduction_numel + ) + + is_split_scan = any( + isinstance(node, BaseSchedulerNode) and node.is_split_scan() + for node in node_schedule + ) + kernel_type = TritonSplitScanKernel if is_split_scan else TritonKernel + kernel_args = tiled_groups + kernel_kwargs = { + "reduction_hint": reduction_hint_val, + "mutations": mutations, + "index_dtype": index_dtype, + } + kernel = kernel_type( + *kernel_args, + **kernel_kwargs, + ) + kernel.buf_accesses = buf_accesses + + self.codegen_node_schedule_with_kernel(node_schedule, kernel) + + with V.set_kernel_handler(kernel): + src_code = kernel.codegen_kernel() + + kernel_name = self.define_kernel(src_code, node_schedule) + log.debug("Generating kernel code with kernel_name: %s", kernel_name) + kernel.kernel_name = kernel_name + kernel.code_hash = code_hash(src_code) + + if kernel.persistent_reduction and config.triton.multi_kernel: + kernel2 = TritonKernel( + *kernel_args, + **kernel_kwargs, + disable_persistent_reduction=True, + ) + self.codegen_node_schedule_with_kernel(node_schedule, kernel2) + with V.set_kernel_handler(kernel2): + src_code2 = kernel2.codegen_kernel() + kernel_name2 = self.define_kernel(src_code2, node_schedule) + 
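+            # Keep both variants (persistent and looped) so the multi-kernel
+            # dispatcher can pick the faster one at run time.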
kernel2.kernel_name = kernel_name2 + kernel2.code_hash = code_hash(src_code2) + + final_kernel = MultiKernel([kernel, kernel2]) + else: + final_kernel = kernel # type: ignore[assignment] + + with V.set_kernel_handler(final_kernel): + for node in node_schedule: + if node not in (EnableReduction, DisableReduction): + node.mark_run() + + self.codegen_comment(node_schedule) + final_kernel.call_kernel(final_kernel.kernel_name) + if config.nan_asserts: + final_kernel.codegen_nan_check() + if config.warn_mix_layout: + final_kernel.warn_mix_layout(kernel_name) + + V.graph.removed_buffers |= final_kernel.removed_buffers + V.graph.inplaced_to_remove |= final_kernel.inplaced_to_remove + + if ( + V.graph.wrapper_code.supports_intermediate_hooks + and config.generate_intermediate_hooks + ): + # Not every node in the schedule will actually be live on output; + # we can't check dead buffers. + live_outs = kernel.args.live_output_buffers() + for node in node_schedule: + if not isinstance(node, scheduler.BaseSchedulerNode): + continue + name = node.get_name() + if name not in live_outs: + continue + origin_node = node.node.get_origin_node() + if origin_node is not None: + counters["inductor"]["intermediate_hooks"] += 1 + V.graph.wrapper_code.writeline( + f"run_intermediate_hooks({origin_node.name!r}, {name})" + ) + + self.scheduler.free_buffers() + + def codegen_node_schedule_with_kernel(self, node_schedule, kernel): + def current_reduction_nodes(nodes): + return itertools.takewhile(lambda n: n is not DisableReduction, nodes) + + with kernel: + stack = contextlib.ExitStack() + kernel.set_last_usage(current_reduction_nodes(node_schedule)) + + for node in node_schedule: + if node not in (EnableReduction, DisableReduction): + node.decide_inplace_update() + for i, node in enumerate(node_schedule): + if node is DisableReduction: + stack.enter_context(kernel.disable_reduction()) + elif node is EnableReduction: + stack.close() + kernel.set_last_usage(current_reduction_nodes(node_schedule[i:])) + else: + # TODO - use split ranges ? + indexing_dtype_strength_reduction(node._body) + index_vars = kernel.split_and_set_ranges(node.get_ranges()) + node.codegen(index_vars) + + def define_kernel(self, src_code, node_schedule): + wrapper = V.graph.wrapper_code + if src_code in wrapper.src_to_kernel: + kernel_name = wrapper.src_to_kernel[src_code] + else: + fused_name = ( + get_fused_kernel_name(node_schedule, config.triton.descriptive_names) + if config.triton.descriptive_names + else "" + ) + kernel_category = get_kernel_category_by_source_code(src_code)[:3] + kernel_name = "_".join( + ["triton", kernel_category, fused_name, wrapper.next_kernel_suffix()] + ) + # use the original src_code as the key + wrapper.src_to_kernel[src_code] = kernel_name + subs_name = kernel_name if config.triton.unique_kernel_names else "triton_" + + # DESCRIPTIVE_NAME is used for profiling purposes; it shows the full kernel name + # even when unique_kernel_names is turned off. Meanwhile, KERNEL_NAME is sometimes set + # to "triton_" to maximize caching opportunities (when unique_kernel_names = False). + src_code = src_code.replace(str(Placeholder.DESCRIPTIVE_NAME), kernel_name) + src_code = src_code.replace(str(Placeholder.KERNEL_NAME), subs_name) + + # TODO(voz): Ostensibly, we should not need this. But there are cases where C++ codegen does + # not use BracesBuffer, so we have no good indicator of a C++ buffer atm. 
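+        # Rewrite any "#pragma CMT" markers as plain "#" comments so the emitted
+        # source stays valid Python.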
+ src_code = src_code.replace("#pragma CMT", "#") + + basename, _, kernel_path = get_path(code_hash(src_code.strip()), "py") + + compile_wrapper = IndentedBuffer() + compile_wrapper.writeline(f"async_compile.triton({subs_name!r}, '''") + compile_wrapper.splice(src_code, strip=True) + compile_wrapper.writeline( + f"''', device_str='{V.graph.scheduler.current_device.type}')" + ) + + metadata_comment = f"# kernel path: {kernel_path}" + origins, detailed_origins = get_kernel_metadata(node_schedule, wrapper) + metadata_comment += "\n" + origins + "\n" + detailed_origins + wrapper.define_kernel( + kernel_name, compile_wrapper.getvalue(), metadata_comment + ) + + # log kernel metadata for offline analysis. + # E.g. one can find all unaligned inner reduction and check if + # padding helps with the perf kernel by kernel. + if is_metric_table_enabled("kernel_metadata"): + log_kernel_metadata(kernel_name, kernel_path, src_code) + + return kernel_name + + def codegen_template( + self, template_node, epilogue_nodes, only_gen_src_code=False + ) -> Optional[str]: + """ + Codegen a triton template + + If `only_gen_src_code` the src code will be returned instead of codegen'd into the wrapper + """ + _, (numel, rnumel) = template_node.group + assert rnumel == 1 + kernel, render = template_node.node.make_kernel_render(template_node.node) + with kernel: + if not only_gen_src_code: + for node in [template_node, *epilogue_nodes]: + node.mark_run() + partial_code = render() + for node in epilogue_nodes: + node.codegen(kernel.split_and_set_ranges(node.get_ranges())) + + # finalize must be called after adding epilogue above + with V.set_kernel_handler(kernel): + # TODO: Maybe unify CUDATemplateKernel to also use PartialRender for flexible epilogue fusion. + src_code = ( + partial_code + if isinstance(partial_code, str) + else partial_code.finalize() + ) + node_schedule = [template_node, *epilogue_nodes] + + if config.benchmark_kernel: + num_gb = kernel.estimate_kernel_num_bytes() / 1e9 + grid_args = V.graph.sizevars.size_hints(kernel.call_sizes) + assert kernel.meta is not None, "meta is None" + grid = kernel.grid_fn(*grid_args, kernel.meta) + src_code = ( + f"{kernel.imports_for_benchmark_kernel()}\n" + f"{src_code}\n" + f"{kernel.codegen_kernel_benchmark(num_gb, grid).getvalue()}" + ) + + if only_gen_src_code: + return src_code + + kernel_name = self.define_kernel(src_code, node_schedule) + + self.codegen_comment(node_schedule) + kernel.call_kernel(kernel_name, template_node.node) + V.graph.removed_buffers |= kernel.removed_buffers + V.graph.inplaced_to_remove |= kernel.inplaced_to_remove + self.scheduler.free_buffers() + return None + + def codegen_sync(self): + V.graph.wrapper_code.writeline(V.graph.device_ops.synchronize()) + + def codegen_foreach(self, foreach_node): + from .triton_foreach import ForeachKernel + + for partitions_with_metadata in ForeachKernel.horizontal_partition( + foreach_node.get_subkernel_nodes(), self + ): + kernel = ForeachKernel() + for nodes, tiled_groups, numel, rnumel in partitions_with_metadata: + node_schedule = self.generate_node_schedule(nodes, numel, rnumel) + ( + reduction_hint_val, + mutations, + index_dtype, + ) = self.get_kernel_args(node_schedule, numel, rnumel) + + subkernel = kernel.create_sub_kernel( + *tiled_groups, + reduction_hint=reduction_hint_val, + mutations=mutations, + index_dtype=index_dtype, + ) + + self.codegen_node_schedule_with_kernel( + node_schedule, + subkernel, + ) + + with V.set_kernel_handler(subkernel): + for node in node_schedule: + if node not in 
(EnableReduction, DisableReduction): + node.mark_run() + V.graph.removed_buffers |= subkernel.removed_buffers + V.graph.inplaced_to_remove |= subkernel.inplaced_to_remove + + src_code = kernel.codegen_kernel() + kernel_name = self.define_kernel(src_code, [foreach_node]) + self.codegen_comment([foreach_node]) + kernel.call_kernel(V.graph.wrapper_code, kernel_name) + + self.scheduler.free_buffers() + + @staticmethod + @functools.lru_cache(32) + def candidate_tilings(node): + ranges, reduction_ranges = node.get_ranges() + if len(ranges) <= 1: + return () + + rw = node.pointwise_read_writes() + assert len(rw.range_vars) == len(ranges) + + # isinstance(dep, MemoryDep): this filters out StarDeps. StarDeps refer to reads + # that need to access the entire tensor; they don't contribute read indexing + # information (and practically, they don't have dep.index so they can't be used + # for stride_hints below + dep_sources = [rw.reads, rw.writes] + assert all( + isinstance(dep, (MemoryDep, StarDep)) + for dep in itertools.chain.from_iterable(dep_sources) + ) + deps = [ + dep + for dep in itertools.chain.from_iterable(dep_sources) + if dep.name not in V.graph.removed_buffers and isinstance(dep, MemoryDep) + ] + write_names = {dep.name for dep in rw.writes} + + tilings: List[CandidateTiling] = [] + + for dep in deps: + strides = V.graph.sizevars.stride_hints(dep.index, rw.range_vars) + assert len(strides) == len(ranges) + try: + split = strides.index(1) + 1 + if split == len(ranges): + continue + if all(s == 0 for s in strides[split:]): + # if this is a broadcasted tensor and all dimensions after split are broadcast, + # this is not a real split + continue + + except ValueError: + continue + tiled_groups = ( + V.graph.sizevars.simplify(sympy_product(ranges[:split])), + V.graph.sizevars.simplify(sympy_product(ranges[split:])), + ) + # score by number of elements + score = V.graph.sizevars.size_hint( + sympy_product( + size for size, stride in zip(ranges, strides) if stride != 0 + ) + ) + if dep.name in write_names: + # ngimel said contiguous writes is more important than reads + score *= 2 + if CandidateTiling.is_good_size(tiled_groups[0]): + score *= 2 + if CandidateTiling.is_good_size(tiled_groups[1]): + score *= 2 + + if ( + V.graph.sizevars.size_hint( + score - sympy_product(itertools.chain(ranges, reduction_ranges)) + ) + >= 0 + ): + tilings.append(CandidateTiling(tiled_groups, score, dep.name)) + return tilings + + @classmethod + def select_tiling(cls, node_schedule, numel, reduction_numel=sympy.Integer(1)): + """ + Heuristics to decide how to tile kernels. + Currently, we tile based on stride-1 dimensions. + + Returns: + `(tile1, tile2, reduction_numel)` s.t. `tile1 * tile2 == numel` + + """ + if reduction_numel != 1 or config.triton.max_tiles <= 1: + # TODO(jansel): should we tile reductions? 
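+            # Reductions (and max_tiles <= 1) keep the flat (numel, reduction_numel)
+            # blocking; the hint below only fires when a candidate pointwise tiling
+            # exists, i.e. the reduction likely runs over non-contiguous dims.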
+ # do perf hint here if stride-1 dim is not being reduced + if perf_hint_log.level <= logging.WARNING: + for node in EnableReduction.filter(node_schedule): + if len(cls.candidate_tilings(node)) > 0: + perf_hint_log.info("reduction over non-contiguous dims") + break + return (numel, reduction_numel) + + seen_names = set() + candidate_tiles: Counter[Any] = collections.Counter() + for node in EnableReduction.filter(node_schedule): + for tiling in cls.candidate_tilings(node): + if tiling.name in seen_names: + continue + seen_names.add(tiling.name) + candidate_tiles[tiling.tiling] += tiling.score + + ranked_tilings = [tiling for tiling, score in candidate_tiles.most_common()] + + if config.triton.max_tiles >= 3: + # Consider adding a third dimension of tiling, but only + # when a1 is a multiple of b1; otherwise, you have a lot + # of stragglers which is annoying to generate code for. + # + # NB: More than three max tiles is not enabled by default. + + # Add one 3D tiling choice + for i in range(1, len(ranked_tilings)): + a0, a1 = ranked_tilings[0] + b0, b1 = ranked_tilings[i] + if V.graph.sizevars.size_hint(a1 - b1) == 0: + continue + if V.graph.sizevars.size_hint(a1 - b1) < 0: + # swap so a0 is bigger + a0, a1 = ranked_tilings[i] + b0, b1 = ranked_tilings[0] + assert V.graph.sizevars.size_hint(a1 - b1) > 0 + if V.graph.sizevars.statically_known_multiple_of(a1, b1): + tiling = (a0, FloorDiv(a1, b1), b1) + ranked_tilings = [tiling] + ranked_tilings + break # only 1 choice for now + + if len(ranked_tilings) > 1: + perf_hint_log.info("possibly bad tiling: %s", ranked_tilings) + + for tiled_groups in ranked_tilings: + new_groups = (*tiled_groups, reduction_numel) + if all( + TritonKernel.is_compatible(new_groups, node.get_ranges()) + for node in node_schedule + if isinstance(node, scheduler.SchedulerNode) + ): + return new_groups + + return (numel, reduction_numel) + + def flush(self): + pass + + def ready_to_flush(self) -> bool: + return False + + def benchmark_fused_nodes(self, nodes): + # empty last_usage. May cause more aggressive 'evict_last'. Should be fine. 
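+        # Rough flow: generate the kernel source, load it via PyCodeCache, time it
+        # with do_bench, and cache the measured ms next to the generated file
+        # (*.kernel_perf).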
+ for n in nodes: + n.last_usage = set() + + if not nodes[0].is_template(): + _, (numel, rnumel) = max(nodes, key=lambda x: int(x.is_reduction())).group + node_schedule = self.generate_node_schedule(nodes, numel, rnumel) + + tiled_groups = self.select_tiling(node_schedule, numel, rnumel) + reduction_hint_val, mutations, index_dtype = self.get_kernel_args( + node_schedule, numel, rnumel + ) + + kernel = TritonKernel( + *tiled_groups, + reduction_hint=reduction_hint_val, + mutations=mutations, + index_dtype=index_dtype, + ) + + self.codegen_node_schedule_with_kernel(node_schedule, kernel) + with config.patch("benchmark_kernel", True), V.set_kernel_handler(kernel): + src_code = kernel.codegen_kernel() + else: + template_node = nodes[0] + epilogue_nodes = nodes[1:] + + with config.patch("benchmark_kernel", True): + src_code = self.codegen_template( + template_node, epilogue_nodes, only_gen_src_code=True + ) + + src_code = src_code.replace(str(Placeholder.KERNEL_NAME), "triton_") + mod = PyCodeCache.load(src_code) + + def cache_file_path(): + assert mod.__file__ is not None + return os.path.splitext(mod.__file__)[0] + ".kernel_perf" + + def load_cache(): + path = cache_file_path() + if os.path.exists(path): + with open(path) as fd: + return float(fd.read()) + return None + + def store_cache(): + path = cache_file_path() + with open(path, "w") as fd: + fd.write(str(ms)) + + log.debug( + "kernel src code for %s written to: %s", + {n.get_name() for n in nodes}, + mod.__file__, + ) + ms = load_cache() + if ms is not None: + return ms, mod.__file__ + + args = mod.get_args() + call = mod.call + wrapped_jit_function = mod.triton_ + + # call once to trigger the compilation + call(wrapped_jit_function.clone_args(*args)[0]) + + launchers = wrapped_jit_function.launchers + assert len(launchers) == 1 + if launchers[0].n_spills > 0: + # skip benchmarking the kernel if there are register spills + ms = float("inf") + else: + # We have to clone the inplace updated arguments to avoid earlier calls + # generating out of range indices for later calls. + ms = do_bench(lambda: call(wrapped_jit_function.clone_args(*args)[0])) + + log.debug( + "The fused kernel for %s took %.3f ms to run", + {n.get_name() for n in nodes}, + ms, + ) + store_cache() + return ms, mod.__file__ + + +@dataclasses.dataclass +class CandidateTiling: + tiling: Tuple[sympy.Expr, sympy.Expr] + score: int # higher is better + name: Optional[str] = None + + @staticmethod + def is_good_size(s): + """Somewhat arbitrary heuristic used to boost scores for some sizes""" + s = V.graph.sizevars.size_hint(s) + return s >= 32 and (s % 32 == 0) + + +class DisableReduction: + """ + Marker to invoke `kernel.disable_reduction()`. This closes a + reduction loop and allows for pointwise ops to occur on the output + of a reduction. + """ + + +class EnableReduction: + """ + Marker to end a DisableReduction block. + """ + + @staticmethod + def filter(node_schedule): + """ + Get the nodes from node_schedule skipping those in a + DisableReduction block. 
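+        Used by select_tiling so that nodes outside the main reduction loop
+        do not influence the chosen tiling.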
+ """ + disabled = False + for node in node_schedule: + if node in (EnableReduction, DisableReduction): + # Don't tile stuff outside the main reduction loop + disabled = node is DisableReduction + elif disabled: + pass + else: + yield node + + +class CantSplit(Exception): + pass diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/triton_foreach.py b/MLPY/Lib/site-packages/torch/_inductor/codegen/triton_foreach.py new file mode 100644 index 0000000000000000000000000000000000000000..8698731a6ce13b28932a6ccda50c38e71ba0f0b4 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/codegen/triton_foreach.py @@ -0,0 +1,250 @@ +import itertools +from collections import defaultdict +from dataclasses import dataclass +from typing import Dict, List, Tuple + +from sympy import Integer + +import torch + +from .. import metrics +from ..scheduler import SchedulerNode +from ..utils import ceildiv, Placeholder +from ..virtualized import V +from .common import IndentedBuffer, Kernel +from .triton import gen_common_triton_imports, TritonKernel +from .triton_utils import config_of, signature_to_meta + + +@dataclass +class PartitionState: + partitions: List[ + List[Tuple[List[SchedulerNode], Tuple[Integer, ...], Integer, Integer]] + ] + cur_partition: List[ + Tuple[List[SchedulerNode], Tuple[Integer, ...], Integer, Integer] + ] + cur_count: int + + def finalize(self): + if self.cur_partition: + self.partitions.append(self.cur_partition) + + +class ForeachKernel(Kernel): + MAX_NUM_ARGS = 250 # number where I would no longer get triton errors + + @staticmethod + def _update_partition(partition_state, node_rw_count, node_info): + if partition_state.cur_count + node_rw_count > ForeachKernel.MAX_NUM_ARGS: + partition_state.partitions.append(partition_state.cur_partition) + partition_state.cur_partition = [node_info] + partition_state.cur_count = node_rw_count + else: + partition_state.cur_count += node_rw_count + partition_state.cur_partition.append(node_info) + + @staticmethod + def horizontal_partition(subkernel_nodes, triton_scheduling): + """Generates a list of lists of node info tuples which consist of (fused_nodes, tiling, numel, rnumel) + for each subkernel node where each sublist is guaranteed to not exceed CUDA limits for number of args + (read/writes) and to have the same 2D or 1D blocking strategy.""" + assert len(subkernel_nodes) >= 1 + + partition_state_1d = PartitionState([], [], 0) + yelem_to_partition_state_2d: Dict[Integer, PartitionState] = defaultdict( + lambda: PartitionState([], [], 0) + ) + + for node in subkernel_nodes: + fused_nodes = node.get_nodes() + _, (numel, rnumel) = max( + fused_nodes, key=lambda x: int(x.is_reduction()) + ).group + tiled_groups = triton_scheduling.select_tiling(fused_nodes, numel, rnumel) + node_info = fused_nodes, tiled_groups, numel, rnumel + + read_writes = node.read_writes + read_write_count = len(read_writes.reads) + len(read_writes.writes) + + if tiled_groups[1] == 1: + ForeachKernel._update_partition( + partition_state_1d, read_write_count, node_info + ) + else: + y_elem = tiled_groups[0] + partition_state_2d = yelem_to_partition_state_2d[y_elem] + ForeachKernel._update_partition( + partition_state_2d, read_write_count, node_info + ) + + partition_state_1d.finalize() + all_partitions = partition_state_1d.partitions + for partition_state_2d in yelem_to_partition_state_2d.values(): + partition_state_2d.finalize() + all_partitions.extend(partition_state_2d.partitions) + + return all_partitions + + def __init__(self): + super().__init__() + 
self.blocking_2d = False + self.block_size_1d = 1024 # Try tuning this value + self.block_size_2d = 32 + self.num_warps = 8 + self.sub_kernels = [] + self.iter_vars_count = itertools.count() + self.x_block_count = 0 + self.y_block_count = 0 + + def get_block_size(self): + if self.blocking_2d: + return self.block_size_2d + else: + return self.block_size_1d + + @staticmethod + def codegen_pid_offsets(code, block_count, lower_bound, prefix): + if block_count == 0: + code.splice(f"{prefix}pid_offset = {prefix}pid") + else: + code.splice(f"{prefix}pid_offset = {prefix}pid - {lower_bound}") + + def codegen_pid_range(self, code, x_elems): + num_x_blocks = ceildiv(x_elems, self.get_block_size()) + upper_bound_x_pid = self.x_block_count + num_x_blocks + lower_bound_x_pid = self.x_block_count + + if self.x_block_count == 0: + cond = "if" + else: + cond = "elif" + + x_pid_bounds_check = ( + f"xpid >= {lower_bound_x_pid} and xpid < {upper_bound_x_pid}" + ) + code.splice(f"{cond} {x_pid_bounds_check}:") + + with code.indent(): + ForeachKernel.codegen_pid_offsets( + code, num_x_blocks, lower_bound_x_pid, "x" + ) + self.x_block_count += num_x_blocks + + def create_sub_kernel(self, *groups, index_dtype, mutations, reduction_hint): + sub_kernel = TritonKernel( + *groups, + index_dtype=index_dtype, + mutations=mutations, + pid_cache={ + "tl.program_id(0)": "xpid_offset", + "tl.program_id(1)": "ypid", + }, + reduction_hint=reduction_hint, + ) + if self.blocking_2d: + assert len(groups) == 3 + + self.blocking_2d |= groups[1] != 1 and len(groups) == 3 + metrics.generated_kernel_count -= 1 + sub_kernel.args = self.args + sub_kernel.iter_vars_count = self.iter_vars_count + sub_kernel.cse.iter_buffer_ids = self.cse.iter_buffer_ids + self.sub_kernels.append(sub_kernel) + return sub_kernel + + def jit_lines(self): + can_use_32bit = all(k.index_dtype == "tl.int32" for k in self.sub_kernels) + size_dtype = "tl.int32" if can_use_32bit else "tl.int64" + _, _, signature = self.args.python_argdefs() + triton_meta = { + "signature": signature_to_meta(signature, size_dtype=size_dtype), + "device": V.graph.scheduler.current_device.index, + "device_type": V.graph.scheduler.current_device.type, + "constants": {}, + } + triton_meta["configs"] = [config_of(signature)] + inductor_meta = { + "kernel_name": str(Placeholder.DESCRIPTIVE_NAME), + "backend_hash": torch.utils._triton.triton_hash_with_backend(), + } + return f""" + @triton_heuristics.foreach( + num_warps={self.num_warps}, + triton_meta={triton_meta!r}, + inductor_meta={inductor_meta!r}, + ) + @triton.jit + """ + + def grid(self): + return ( + self.x_block_count, + ceildiv(int(self.sub_kernels[0].numels[0]), self.block_size_2d) + if self.blocking_2d + else 1, + 1, + ) + + def codegen_kernel(self, name=None): + code = IndentedBuffer() + + code.splice(gen_common_triton_imports()) + argdefs, _, _ = self.args.python_argdefs() + code.splice(self.jit_lines()) + code.writeline( + f"def {name or str(Placeholder.KERNEL_NAME)}({', '.join(argdefs)}):" + ) + + with code.indent(): + code.splice("xpid = tl.program_id(0)") + if self.blocking_2d: + code.splice("ypid = tl.program_id(1)") + code.splice(f"XBLOCK: tl.constexpr = {self.block_size_2d}") + code.splice(f"YBLOCK: tl.constexpr = {self.block_size_2d}") + else: + code.splice(f"XBLOCK: tl.constexpr = {self.block_size_1d}") + + for sub_kernel in self.sub_kernels: + assert len(sub_kernel.numels) <= 3 + # TODO mlazos: support dynamic shapes + numel_ind = 0 if not self.blocking_2d else 1 + self.codegen_pid_range(code, 
int(sub_kernel.numels[numel_ind])) + with code.indent(): + if self.blocking_2d: + code.splice(f"ynumel = {sub_kernel.numels[0]}") + code.splice(f"xnumel = {sub_kernel.numels[1]}") + else: + code.splice(f"xnumel = {sub_kernel.numels[0]}") + + sub_kernel.codegen_body() + code.splice(sub_kernel.body) + + code.splice("else:") + with code.indent(): + code.splice("pass") + + return code.getvalue() + + def call_kernel(self, code, name: str): + _, call_args, _ = self.args.python_argdefs() + # dynamo wraps unspec variable as 0d CPU tensor, need convert to scalar + for i in range(len(call_args)): + if V.graph.is_unspec_arg(call_args[i]): + call_args[i] = call_args[i] + ".item()" + if V.graph.cpp_wrapper: + V.graph.wrapper_code.generate_kernel_call( + name, + call_args, + device_index=V.graph.scheduler.current_device.index, + grid=self.grid(), + ) + else: + # TODO: refactor generate_kernel_call + call_args_str = ", ".join(call_args) + stream_name = code.write_get_raw_stream( + V.graph.scheduler.current_device.index + ) + code.writeline( + f"{name}.run({call_args_str}, grid=({self.grid()}), stream={stream_name})" + ) diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/triton_split_scan.py b/MLPY/Lib/site-packages/torch/_inductor/codegen/triton_split_scan.py new file mode 100644 index 0000000000000000000000000000000000000000..d6d935dc196d0c23e358df469b241dfbc089d700 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/codegen/triton_split_scan.py @@ -0,0 +1,180 @@ +import functools + +from typing import Optional, Set + +from torch._inductor import config, ir + +from torch._inductor.codegen.triton import ( + IterationRangesRoot, + triton_compute_type, + TritonKernel, + TritonKernelOverrides, +) + +from torch._prims_common import prod + +from torch.utils._sympy.functions import CeilDiv + + +class TritonSplitScanKernel(TritonKernel): + """Generates a triton kernel that supports ops.scan calls while also splitting + the reduction dimension over multiple triton programs. + + For this kernel, loop numels will always take the form ``(xdim, rdim)`` + and the grid has the shape ``(CeilDiv(rdim, RBLOCK), xdim)``. Communication + between blocks occurs within a global memory workspace buffer, which + must be zero-filled before launching the kernel. + + Note that generation for ``ops.reduction`` is not supported. 
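+    Roughly: each block reduces its chunk, publishes the partial aggregate to the
+    workspace, then combines the published prefixes of earlier blocks (decoupled
+    look-back) before finishing its local scan.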
+ + For details of the communication strategy, see + https://research.nvidia.com/publication/2016-03_single-pass-parallel-prefix-scan-decoupled-look-back + + """ + + def __init__( + self, + *groups, + index_dtype: str, + mutations: Optional[Set[str]] = None, + reduction_hint=ir.ReductionHint.DEFAULT, + min_elem_per_thread=0, + ): + super().__init__( + *groups, + index_dtype=index_dtype, + mutations=mutations, + pid_cache=None, + reduction_hint=reduction_hint, + min_elem_per_thread=min_elem_per_thread, + ) + self.no_x_dim = True + + def initialize_range_tree(self, pid_cache): + prefixes = "yxr" + assert len(self.numels) <= len( + prefixes + ), "z dimension not supported for split scan" + active_prefixes = prefixes[len(prefixes) - len(self.numels) :] + + grid_dims = "rxy" + for numel, prefix in zip(self.numels, active_prefixes): + is_reduction = prefix == "r" + tensor_dim = 0 if is_reduction else None + grid_dim = grid_dims.find(prefix) + self.range_trees.append( + IterationRangesRoot( + f"{prefix}index", + numel, + prefix, + grid_dim, + self, + pid_cache=pid_cache, + is_loop=False, + tensor_dim=tensor_dim, + grid_dim=grid_dim, + ) + ) + for tree in self.range_trees: + tree.codegen_header(self.body) + + def reduction(self, dtype, src_dtype, reduction_type, value): + raise NotImplementedError("NYI TritonSplitDimKernel reductions") + + def scan(self, dtype, combine_fn, value, init): + import triton.language as tl + + compute_type = triton_compute_type(dtype) + compute_type_triton = getattr(tl, compute_type[3:]) + + element_nbits = compute_type_triton.primitive_bitwidth + + scratch_type = "tl.uint32" if element_nbits <= 16 else "tl.uint64" + scratch_type_triton = getattr(tl, scratch_type[3:]) + scratch_elems_per_block = 3 if element_nbits == 64 else 1 + scratch_nbytes_per_block = scratch_elems_per_block * ( + scratch_type_triton.primitive_bitwidth // 8 + ) + + cse_load = functools.partial(self.cse.generate, self.loads) + cse_compute = functools.partial(self.cse.generate, self.compute) + + assert len(self.numels) == 2, "Unexpected tiling" + min_rblock = config.triton.min_split_scan_rblock + max_blocks = prod(self.numels[:-1]) * CeilDiv(self.numels[-1], min_rblock) + nbytes = scratch_nbytes_per_block * max_blocks + scratch_base, offset = self.args.workspace(nbytes=nbytes, zero_fill=True) + if offset != 0: + scratch_base = cse_load(f"{scratch_base} + {self.index_to_str(offset)}") + runtime_rblocks = cse_load(f"tl.num_programs({self.range_trees[-1].index})") + scratch_base = cse_load( + f"{scratch_base}.to(tl.pointer_type({scratch_type})) + xoffset * " + f"{scratch_elems_per_block} * {runtime_rblocks}" + ) + + masks = {f"{tree.prefix}mask" for tree in self.range_trees} + self.filter_masks(masks) + masks = sorted(masks) + if self._load_mask: + masks.append(self._load_mask) + + value = cse_compute(f"{value}.to({compute_type})") + value = cse_compute(f"tl.broadcast_to({value}, {self.dense_size_str()})") + init = cse_compute(f"tl.full([], {init}, {compute_type})") + if masks: + cond = " & ".join(masks) + masked_value = cse_compute(TritonKernelOverrides.where(cond, value, init)) + else: + masked_value = value + + combine_helper_fn = self._lift_helper(combine_fn, 2) + dim = self.triton_tensor_ndim() - 1 + assert dim == 0, "" + + block_sum = cse_compute( + f"tl.reduce({masked_value}, {dim}, {combine_helper_fn})" + ) + exclusive_prefix = self.cse.newvar() + if element_nbits == 64: + self.compute.splice( + f""" + {exclusive_prefix} = triton_helpers.exclusive_scan_decoupled_lookback_64( + {scratch_base}, + 
{block_sum}, + {self.range_trees[-1].get_pid()}, + {combine_helper_fn}, + {init}, + ) + """, + strip=True, + ) + + else: + assert element_nbits <= 32 + value_as_uint_dtype = f"tl.uint{element_nbits}" + + self.compute.splice( + f""" + {exclusive_prefix} = triton_helpers.exclusive_scan_decoupled_lookback( + {scratch_base}, + {block_sum}, + {self.range_trees[-1].get_pid()}, + {combine_helper_fn}, + {init}, + DTYPE_VALUE_AS_UINT={value_as_uint_dtype}, + DTYPE_PACK={scratch_type}, + ) + """, + strip=True, + ) + # Compute final cumsum + block_scan = cse_compute( + f"tl.associative_scan({masked_value}, {dim}, {combine_helper_fn})" + ) + return cse_compute(f"{combine_helper_fn}({exclusive_prefix}, {block_scan})") + + def _get_heuristic(self): + return "split_scan" + + def _get_grid_fn(self): + return "split_scan_grid" diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/triton_utils.py b/MLPY/Lib/site-packages/torch/_inductor/codegen/triton_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0c12fded8a2f0b0d602fc9365bc3abb527540640 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/codegen/triton_utils.py @@ -0,0 +1,130 @@ +from typing import Any, Dict, List, Optional + +import torch + +from .. import config +from ..utils import _type_of, instance_descriptor +from ..virtualized import V +from .common import KernelArgType, SizeArg, TensorArg, WorkspaceArg + + +def signature_of(arg: KernelArgType, *, size_dtype: str) -> str: + if isinstance(arg, TensorArg): + # TODO: Remove fp8 special handling when Triton supports PyTorch fp8 dtypes. + # Related PR: https://github.com/openai/triton/pull/2279/ + if arg.dtype == torch.float8_e4m3fn: + tye = "*fp8e4nv" + elif arg.dtype == torch.float8_e5m2: + tye = "*fp8e5" + elif arg.dtype == torch.float8_e4m3fnuz: + tye = "*fp8e4b8" + elif arg.dtype == torch.float8_e5m2fnuz: + tye = "*fp8e5b16" + else: + tye = _type_of(arg.dtype) + if V.graph.is_unspec_arg(arg.buffer): + # had unwrapped 0d tensor as scalar + new_tye = tye.lstrip("*") + if new_tye in ["fp16", "bf16"]: + return "fp32" + else: + return new_tye + else: + return tye + if isinstance(arg, SizeArg): + if arg.expr is None: + # From triton/runtime/jit.py + # `None` is nullptr. Implicitly convert to *i8. 
+ return "*i8" + elif isinstance(arg.expr, float): + return "fp32" + if size_dtype == "tl.int32": + return "i32" + elif size_dtype == "tl.int64": + return "i64" + else: + raise NotImplementedError(f"unhandled size_dtype {size_dtype}") + if isinstance(arg, WorkspaceArg): + return "*i8" + raise NotImplementedError(f"unhandled {type(arg)}: {arg}") + + +def signature_to_meta( + signature: List[KernelArgType], + *, + size_dtype: str, + indices: Optional[List[int]] = None, +) -> Dict[int, str]: + if indices is None: + indices = list(range(len(signature))) + return { + i: signature_of(arg, size_dtype=size_dtype) + for i, arg in zip(indices, signature) + } + + +def config_of( + args: List[KernelArgType], + *, + indices: Optional[List[int]] = None, +) -> Any: + if indices is None: + indices = list(range(len(args))) + + def is_aligned(x: KernelArgType, alignment: int, include_tensor: bool) -> bool: + """ + Roughly follow triton code here: + https://github.com/openai/triton/blob/5282ed890d453e10b9ee30076ef89115dd197761/python/triton/runtime/jit.py#L208-L222 + """ + if isinstance(x, TensorArg): + if include_tensor: + offset_aligned = V.graph.sizevars.statically_known_multiple_of( + x.offset * x.dtype.itemsize, alignment # type: ignore[arg-type] + ) + return offset_aligned and not V.graph.scheduler.is_unaligned_buffer( + x.buffer + ) + else: + return False + if isinstance(x, SizeArg): + # TODO(voz): These are kinda redundant, if we can solve out statically_known_multiple_of with + # _maybe_evaluate_static... + if x.name.startswith("load_seed_offset"): + return False + if x.expr is None: + return False + if isinstance(x.expr, float): + return False + return V.graph.sizevars.statically_known_multiple_of(x.expr, alignment) # type: ignore[arg-type] + if isinstance(x, WorkspaceArg): + return V.graph.sizevars.statically_known_multiple_of(x.nbytes, alignment) # type: ignore[arg-type] + raise NotImplementedError(f"unhandled {type(x)}: {x}") + + if config.triton.divisible_by_16: + divisible_by_16 = tuple( + i + for i, arg in zip(indices, args) + if is_aligned(arg, alignment=16, include_tensor=True) + ) + else: + divisible_by_16 = () + divisible_by_8 = tuple( + i + for i, arg in zip(indices, args) + if is_aligned(arg, alignment=8, include_tensor=False) + ) + + equal_to_1 = tuple( + i + for i, arg in zip(indices, args) + if isinstance(arg, SizeArg) + and arg.expr is not None + and V.graph.sizevars.statically_known_equals(arg.expr, 1) # type: ignore[arg-type] + ) + # ids_of_folded_args is set from equal_to_1 + # and None args by the Triton compiler + ids_of_folded_args = tuple(equal_to_1) + + return instance_descriptor( + divisible_by_16, equal_to_1, ids_of_folded_args, divisible_by_8 + ) diff --git a/MLPY/Lib/site-packages/torch/_inductor/codegen/wrapper.py b/MLPY/Lib/site-packages/torch/_inductor/codegen/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..afb4976ade34e58600dcabe6e5ba5ff24e3e3fff --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/codegen/wrapper.py @@ -0,0 +1,1543 @@ +import collections +import contextlib +import dataclasses +import functools +import inspect +import operator +import re +from itertools import count +from typing import ( + Any, + Callable, + Dict, + Iterator, + List, + Optional, + Set, + Tuple, + TYPE_CHECKING, + Union, +) + +import sympy +from sympy import Expr + +import torch +import torch._ops +from torch._dynamo.utils import counters, dynamo_timed + +from torch._inductor.codegen.multi_kernel import MultiKernelState +from 
torch.fx.experimental.symbolic_shapes import SymTypes +from torch.fx.node import _get_qualified_name +from torch.utils._sympy.singleton_int import SingletonInt + +from .. import codecache, config, ir +from ..ir import ReinterpretView +from ..utils import ( + cache_on_self, + get_benchmark_name, + LineContext, + sympy_product, + sympy_str, +) +from ..virtualized import V +from .common import CodeGen, DeferredLine, IndentedBuffer, PythonPrinter +from .triton_utils import config_of, signature_to_meta + +if TYPE_CHECKING: + import triton + + from ..graph import GraphLowering + + +pexpr = PythonPrinter().doprint + + +ReuseKey = Tuple[torch.device, torch.dtype, str] + + +def buffer_reuse_key(node: ir.Buffer) -> ReuseKey: + return ( + node.get_device(), + node.get_dtype(), + # NB: this is symbolic so that we don't try to reuse a buffer + # for s0 for s1, just because they happen to share the same + # size hint + sympy_str(V.graph.sizevars.simplify(node.layout.storage_size())), + ) + + +def convert_arg_type(arg: torch.Argument) -> str: + from .cpp import CONTAINER_PYTHON_TO_CPP, PYTHON_TO_CPP + + # use x.real_type instead of x.type so that we get ScalarType instead of int + python_type = repr(arg.real_type) # type: ignore[attr-defined] + + if python_type == "Tensor": + # Conversions rules follow https://github.com/pytorch/pytorch/tree/main/aten/src/ATen/native#func + if arg.alias_info is not None and arg.alias_info.is_write: + return f"at::{python_type}&" + else: + return f"at::{python_type} const&" + + if python_type in PYTHON_TO_CPP: + cpp_type = PYTHON_TO_CPP[python_type] + return cpp_type + + # Convert args of container types e.g. Optional[*] + for py_container, cpp_container in CONTAINER_PYTHON_TO_CPP.items(): + container_match = re.findall(py_container + r"\[([a-zA-Z_]+)]", python_type) + if len(container_match) == 1: + contained_type = container_match[0] + assert ( + contained_type in PYTHON_TO_CPP + ), f"unsupported {py_container} type in convert_arg_type: {contained_type}" + cpp_contained_type = PYTHON_TO_CPP[contained_type] + return f"{cpp_container}<{cpp_contained_type}>" + + raise AssertionError(f"unsupport python_type: {python_type}") + + +def convert_return_type(ret: torch.Argument) -> str: + # use x.real_type instead of x.type so that we get ScalarType instead of int + python_type = repr(ret.real_type) # type: ignore[attr-defined] + python_to_cpp = { + "Tensor": "at::Tensor", + "List[Tensor]": "std::vector", + } + + cpp_type = python_to_cpp.get(python_type, None) + assert cpp_type is not None, f"NYI return type: {python_type}" + # An output aliasing an input is returned by reference only when it's a + # Tensor, not when it's a Tensor[]. For example, aten.split.Tensor's output + # aliases the input tensor, but the op returns a vector by value. 
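+    # e.g. an in-place overload whose return aliases its input maps to
+    # "at::Tensor&" below, while list-of-Tensor returns stay by value.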
+ if python_type == "Tensor" and ret.alias_info is not None: + cpp_type += "&" + return cpp_type + + +def get_cpp_op_schema(kernel: torch._ops.OpOverload) -> str: + args = kernel._schema.arguments + returns = kernel._schema.returns + + num_returns = len(returns) + assert num_returns > 0, "must have at least one return value" + + if num_returns == 1: + cpp_return_value = convert_return_type(returns[0]) + elif num_returns > 1: + tuple_returns = ", ".join([convert_return_type(r) for r in returns]) + cpp_return_value = f"std::tuple<{tuple_returns}>" + + cpp_arg_type = [f"{convert_arg_type(arg)} {arg.name}" for arg in args] + return f"{cpp_return_value}({', '.join(cpp_arg_type)})" # type: ignore[possibly-undefined] + + +# TODO: Move to a well known place +TritonMetaParams = Dict[str, int] +TritonGrid = Union[ + Tuple[Union[int, sympy.Expr], ...], Callable[[TritonMetaParams], Tuple[int, ...]] +] + + +def user_defined_kernel_grid_fn_code( + name: str, + configs: List["triton.Config"], + grids: List[TritonGrid], + wrapper: Optional["WrapperCodeGen"] = None, +) -> Tuple[str, str]: + output = IndentedBuffer() + + def _convert_to_sympy_expr(item: Union[int, sympy.Expr]) -> sympy.Expr: + return item if isinstance(item, sympy.Expr) else sympy.Integer(item) + + def determine_grid(grid: TritonGrid): + if wrapper is None or callable(grid): + # return as-is when used in eager mode or when grid is callable + return grid + # Grid contains ints/Expr, so utilize wrapper's expr printer for codegen + sympy_grid = tuple(_convert_to_sympy_expr(g) for g in grid) + return wrapper.codegen_shape_tuple(sympy_grid) + + fn_name = f"grid_wrapper_for_{name}" + output.writeline(f"def {fn_name}(meta):") + with output.indent(): + if len(grids) == 1: + grid = determine_grid(grids[0]) + output.writeline(f"return {grid}") + else: + assert len(grids) > 1 + assert len(grids) == len(configs) + seen = set() + for grid, c in zip(grids, configs): + guards = [f"meta['{name}'] == {val}" for name, val in c.kwargs.items()] + guards = " and ".join(guards) + grid = determine_grid(grid) + statement = f"if {guards}: return {grid}" + if statement in seen: + continue + seen.add(statement) + output.writeline(statement) + + return fn_name, output.getvalue() + + +@dataclasses.dataclass +class SymbolicCallArg: + inner: str + # the original symbolic expression represented by inner + inner_expr: sympy.Expr + + def __str__(self): + return str(self.inner) + + +# Default thread stack sizes vary by platform: +# - Linux: 8 MB +# - macOS: 512 KB +# - Windows: 1 MB +# Just pick something comfortably smaller than the smallest for now. 
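+# (1024 * 100 bytes = 100 KiB, comfortably below even the 512 KB macOS default.)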
+MAX_STACK_ALLOCATION_SIZE = 1024 * 100 + + +class MemoryPlanningState: + def __init__(self): + super().__init__() + self.reuse_pool: Dict[ + ReuseKey, List[FreeIfNotReusedLine] + ] = collections.defaultdict(list) + self.total_allocated_buffer_size: int = 0 + + def __contains__(self, key: ReuseKey) -> bool: + return bool(self.reuse_pool.get(key, None)) + + def pop(self, key: ReuseKey) -> "FreeIfNotReusedLine": + item = self.reuse_pool[key].pop() + assert not item.is_reused + return item + + def push(self, key: ReuseKey, item: "FreeIfNotReusedLine") -> None: + assert not item.is_reused + self.reuse_pool[key].append(item) + + +class WrapperLine: + pass + + +@dataclasses.dataclass +class EnterSubgraphLine(WrapperLine): + wrapper: "WrapperCodeGen" + graph: "GraphLowering" + + def codegen(self, code: IndentedBuffer) -> None: + self.wrapper.push_codegened_graph(self.graph) + code.do_indent() + + +@dataclasses.dataclass +class ExitSubgraphLine(WrapperLine): + wrapper: "WrapperCodeGen" + + def codegen(self, code: IndentedBuffer) -> None: + self.wrapper.pop_codegened_graph() + code.do_unindent() + + +@dataclasses.dataclass +class EnterDeviceContextManagerLine(WrapperLine): + device_idx: int + last_seen_device_guard_index: Optional[int] + + def codegen(self, code: IndentedBuffer) -> None: + if V.graph.cpp_wrapper: + code.writeline("\n") + if V.graph.aot_mode: + # In AOT mode, we have a stream provided as a param. A stream is + # associated with a device, so we never expect the device to change. + # CUDAStreamGuard sets the stream and the device. + if self.last_seen_device_guard_index is None: + if config.abi_compatible: + code.writeline( + "AOTICudaStreamGuard stream_guard(stream, this->device_idx_);" + ) + else: + code.writeline( + "at::cuda::CUDAStreamGuard stream_guard(" + + "at::cuda::getStreamFromExternal(stream, this->device_idx_));" + ) + else: + assert ( + self.last_seen_device_guard_index == self.device_idx + ), "AOTInductor only supports running on one CUDA device" + else: + if self.last_seen_device_guard_index is None: + code.writeline( + f"AOTICudaGuard device_guard({self.device_idx});" + if config.abi_compatible + else f"at::cuda::CUDAGuard device_guard({self.device_idx});" + ) + else: + code.writeline(f"device_guard.set_index({self.device_idx});") + else: + # Note _DeviceGuard has less overhead than device, but only accepts + # integers + code.writeline(f"with {V.graph.device_ops.device_guard(self.device_idx)}:") + code.do_indent() + code.writeline(V.graph.device_ops.set_device(self.device_idx)) + + +class ExitDeviceContextManagerLine(WrapperLine): + def codegen(self, code: IndentedBuffer) -> None: + if not V.graph.cpp_wrapper: + code.do_unindent() + + +@dataclasses.dataclass +class MemoryPlanningLine(WrapperLine): + wrapper: "WrapperCodeGen" + + def plan(self, state: MemoryPlanningState) -> "MemoryPlanningLine": + """First pass to find reuse""" + return self + + def codegen(self, code: IndentedBuffer) -> None: + """Second pass to output code""" + pass + + def __str__(self) -> str: + """ + Emits a string representation that fits on one line. 
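+        e.g. ``AllocateLine(node=buf0)``.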
+ """ + args: List[str] = [] + for field in dataclasses.fields(self): + if field.name == "wrapper": + continue + val = getattr(self, field.name) + args.append( + f"{field.name}={val.get_name() if field.type is ir.Buffer else val}" + ) + return f"{type(self).__name__}({', '.join(args)})" + + +@dataclasses.dataclass +class AllocateLine(MemoryPlanningLine): + node: ir.Buffer + + def plan(self, state: MemoryPlanningState) -> MemoryPlanningLine: + if self.node.get_name() in V.graph.removed_buffers: + return NullLine(self.wrapper) + + # try to reuse a recently freed buffer + key = buffer_reuse_key(self.node) + if config.allow_buffer_reuse and key in state: + free_line = state.pop(key) + free_line.is_reused = True + return ReuseLine(self.wrapper, free_line.node, self.node) + + if self.node.get_device().type == "cpu": + static_shape = self.wrapper.static_shape_for_buffer_or_none(self.node) + if static_shape is not None: + state.total_allocated_buffer_size += int( + functools.reduce(operator.mul, static_shape, 1) + ) + + return self + + def codegen(self, code: IndentedBuffer) -> None: + assert self.node.get_name() not in V.graph.removed_buffers + line = self.wrapper.make_buffer_allocation(self.node) + code.writeline(line) + + +@dataclasses.dataclass +class FreeIfNotReusedLine(MemoryPlanningLine): + node: ir.Buffer + is_reused: bool = False + + def plan(self, state: MemoryPlanningState) -> MemoryPlanningLine: + if isinstance(self.node.layout, (ir.AliasedLayout, ir.MultiOutputLayout)): + return self + assert not self.is_reused + if self.node.get_name() in V.graph.removed_buffers: + return NullLine(self.wrapper) + if config.allow_buffer_reuse: + state.push(buffer_reuse_key(self.node), self) + return self + + def codegen(self, code: IndentedBuffer) -> None: + assert self.node.get_name() not in V.graph.removed_buffers + if not self.is_reused: + code.writeline(self.wrapper.make_buffer_free(self.node)) + + +@dataclasses.dataclass +class ReuseLine(MemoryPlanningLine): + node: ir.Buffer + reused_as: ir.Buffer + delete_old: bool = True + + def plan(self, state: MemoryPlanningState) -> MemoryPlanningLine: + if self.node.get_name() in V.graph.removed_buffers: + assert self.reused_as.get_name() in V.graph.removed_buffers + return NullLine(self.wrapper) + assert self.reused_as.get_name() not in V.graph.removed_buffers + return self + + def codegen(self, code: IndentedBuffer) -> None: + assert self.node.get_name() not in V.graph.removed_buffers + assert self.reused_as.get_name() not in V.graph.removed_buffers + code.writeline( + self.wrapper.make_buffer_reuse(self.node, self.reused_as, self.delete_old) + ) + + +class NullLine(MemoryPlanningLine): + pass + + +BufferName = str + + +class WrapperCodeGen(CodeGen): + """ + Generate outer wrapper in Python that calls the kernels. 
+ """ + + def __init__(self): + super().__init__() + self._names_iter: Iterator[int] = count() + self.header = IndentedBuffer() + self.prefix = IndentedBuffer() + self.suffix = IndentedBuffer() + self.wrapper_call = IndentedBuffer() + # If the generated source code is exactly the same, reuse the + # pre-existing kernel for it + self.src_to_kernel: Dict[str, str] = {} + self.kernel_numel_expr: Set[Tuple[str, "GraphLowering"]] = set() + self.lines: List[Union[MemoryPlanningLine, LineContext]] = [] + self.declare = "" + self.declare_maybe_reference = "" + self.ending = "" + self.open_bracket = "[" + self.closed_bracket = "]" + self.comment = "#" + self.namespace = "" + self.none_str = "None" + self.size = "size()" + self.stride = "stride()" + self.last_seen_device_guard_index: Optional[int] = None + self.supports_intermediate_hooks = True + self.expr_printer = pexpr + self.user_defined_kernel_cache: Dict[Tuple[Any, ...], Tuple[str, Any]] = {} + self.unbacked_symbol_decls: Set[str] = set() # str of sympy.Symbol + self.allow_stack_allocation: Optional[bool] = None + self.stack_allocated_buffers: Dict[BufferName, ir.Buffer] = {} + self.computed_sizes: Set[sympy.Symbol] = set() + + # this is used for tracking which GraphLowering instance---parent graph + # or (nested) subgraph---is currently codegened; the primary use case is + # including the graph instance into a cache key to avoid cross-graph + # caching during lowering of nested subgraphs + self.codegened_graph_stack = [V.graph] + + self.write_header() + self.write_prefix() + + if not V.graph.aot_mode: + for name, hashed in V.graph.constant_reprs.items(): + # include a hash so our code cache puts different constants into different files + self.write_constant(name, hashed) + + self.allocated: Set[BufferName] = set() + self.freed: Set[BufferName] = set() + + # maps from reusing buffer to reused buffer + self.reuses: Dict[BufferName, BufferName] = dict() + + self.write_get_raw_stream = functools.lru_cache(None)( # type: ignore[assignment] + self.write_get_raw_stream + ) + + @functools.lru_cache(None) + def add_import_once(line: str) -> None: + self.header.writeline(line) + + self.add_import_once = add_import_once + self._metas: Dict[str, str] = {} + self.multi_kernel_state = MultiKernelState() + + def write_constant(self, name: str, hashed: str) -> None: + self.header.writeline(f"{name} = None # {hashed}") + + def write_header(self) -> None: + self.header.splice( + f""" + from ctypes import c_void_p, c_long + import torch + import math + import random + import os + import tempfile + from math import inf, nan + from torch._inductor.hooks import run_intermediate_hooks + from torch._inductor.utils import maybe_profile + from torch._inductor.codegen.memory_planning import _align as align + + from torch import device, empty_strided + from {codecache.__name__} import AsyncCompile + from torch._inductor.select_algorithm import extern_kernels + from torch._inductor.codegen.multi_kernel import MultiKernelCall + + aten = torch.ops.aten + inductor_ops = torch.ops.inductor + assert_size_stride = torch._C._dynamo.guards.assert_size_stride + empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu + empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda + alloc_from_pool = torch.ops.inductor._alloc_from_pool + reinterpret_tensor = torch.ops.inductor._reinterpret_tensor + async_compile = AsyncCompile() + + """ + ) + + @cache_on_self + def write_triton_header_once(self) -> None: + self.header.splice( + """ + import triton + import 
triton.language as tl + from torch._inductor.triton_heuristics import grid, split_scan_grid, start_graph, end_graph + {} + """.format( + V.graph.device_ops.import_get_raw_stream_as("get_raw_stream") + ) + ) + + def add_meta_once(self, meta: TritonMetaParams) -> str: + meta = repr(meta) + if meta not in self._metas: + var = f"meta{len(self._metas)}" + self._metas[meta] = var + self.header.writeline(f"{var} = {meta}") + return self._metas[meta] + + @cache_on_self + def get_output_refs(self) -> List[str]: + return [x.codegen_reference(self.wrapper_call) for x in V.graph.graph_outputs] + + def mark_output_type(self) -> None: + return + + def codegen_input_size_asserts(self) -> None: + for name, buf in V.graph.graph_inputs.items(): + if isinstance(buf, sympy.Expr): + continue + + # comparing strides for 0 size tensor is tricky. Ignore them for now. + if sympy_product(buf.get_size()) == 0: + continue + size = self.codegen_shape_tuple(buf.get_size()) + stride = self.codegen_shape_tuple(buf.get_stride()) + self.prefix.writeline(f"assert_size_stride({name}, {size}, {stride})") + + def codegen_input_nan_asserts(self) -> None: + self.prefix.writeline("# make sure graph inputs are not nan/inf") + for name, buf in V.graph.graph_inputs.items(): + if isinstance(buf, sympy.Expr): + continue + + line = f"assert not {name}.isnan().any().item()" + self.prefix.writeline(line) + line = f"assert not {name}.isinf().any().item()" + self.prefix.writeline(line) + + def write_prefix(self) -> None: + self.prefix.splice( + """ + + async_compile.wait(globals()) + del async_compile + + def call(args): + """ + ) + with self.prefix.indent(): + if config.triton.debug_sync_graph: + self.prefix.writeline(V.graph.device_ops.synchronize()) + if V.graph.graph_inputs: + lhs = ", ".join(V.graph.graph_input_names) + if len(V.graph.graph_input_names) == 1: + lhs += "," + self.prefix.writeline(f"{lhs} = args") + self.prefix.writeline("args.clear()") + + self.codegen_inputs(self.prefix, V.graph.graph_inputs) + if config.size_asserts: + self.codegen_input_size_asserts() + if config.nan_asserts: + self.codegen_input_nan_asserts() + + # this function (and below) takes a graph as input so + # that stream caching happens per graph instance. this + # is important for nested subgraph codegening. 
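+    # (editor's note) in the default Python wrapper this emits a line of the
+    # form `stream0 = get_raw_stream(0)`; the result is cached per device index
+    # and per graph instance via the functools.lru_cache wrapping in __init__.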
+ def write_get_raw_stream(self, device_idx: int, graph=None) -> str: + self.write_triton_header_once() + name = f"stream{device_idx}" + self.writeline(f"{name} = get_raw_stream({device_idx})") + return name + + def get_codegened_graph(self): + return self.codegened_graph_stack[-1] + + def push_codegened_graph(self, graph): + self.codegened_graph_stack.append(graph) + + def pop_codegened_graph(self): + return self.codegened_graph_stack.pop() + + def next_kernel_suffix(self) -> str: + return f"{next(self._names_iter)}" + + def codegen_device_guard_enter(self, device_idx: int) -> None: + self.writeline( + EnterDeviceContextManagerLine(device_idx, self.last_seen_device_guard_index) + ) + self.last_seen_device_guard_index = device_idx + + def codegen_device_guard_exit(self) -> None: + self.writeline(ExitDeviceContextManagerLine()) + + def generate_return(self, output_refs: List[str]) -> None: + if output_refs: + self.wrapper_call.writeline("return (" + ", ".join(output_refs) + ", )") + else: + self.wrapper_call.writeline("return ()") + + def generate_before_suffix(self, result: IndentedBuffer) -> None: + return + + def generate_end(self, result: IndentedBuffer) -> None: + return + + def generate_fallback_kernel(self, fallback_kernel, args): + self.generate_extern_kernel_alloc(fallback_kernel, args) + + def generate_extern_kernel_alloc(self, extern_kernel, args): + output_name = extern_kernel.get_name() + origin_node = extern_kernel.get_origin_node() + kernel_name = extern_kernel.get_kernel_name() + ending = self.ending + if config.memory_planning and "view_as_complex" in kernel_name: + # view operation fallbacks cause issues since inductor + # doesn't know the memory is still needed and might reuse it. + ending = f".clone(){ending}" + self.writeline( + f"{self.declare}{output_name} = {kernel_name}({', '.join(args)}){ending}" + ) + if ( + self.supports_intermediate_hooks + and config.generate_intermediate_hooks + and origin_node is not None + ): + counters["inductor"]["intermediate_hooks"] += 1 + self.writeline( + f"run_intermediate_hooks({origin_node.name!r}, {output_name})" + ) + + def generate_extern_kernel_out(self, output_view, codegen_reference, args, kernel): + if output_view: + args.append(f"out={output_view.codegen_reference()}") + else: + args.append(f"out={codegen_reference}") + self.writeline(f"{kernel}({', '.join(args)})") + + def generate_user_defined_triton_kernel( + self, kernel_name, grid, configs, args, triton_meta + ): + grid, code = user_defined_kernel_grid_fn_code( + kernel_name, configs, grid, wrapper=self + ) + # Must happen after free symbols are already codegened + # Emit the grid wrapper function right before the call + for line in code.split("\n"): + self.writeline(line) + + stream_name = self.write_get_raw_stream( + V.graph.scheduler.current_device.index, V.graph + ) + self.writeline( + f"{kernel_name}.run({', '.join(args)}, grid={grid}, stream={stream_name})" + ) + + def generate_scatter_fallback( + self, output, inputs, kernel, python_kernel_name, src_is_tensor, reduce, kwargs + ): + line = f"{kernel}({','.join(map(str, inputs))}" + if kernel == "aten.scatter_": + if reduce: + line += f", reduce={repr(reduce)}" + else: + line += ", ".join([""] + kwargs) + line += f"){self.ending}" + self.writeline(line) + + def generate_index_put_fallback(self, kernel, x, indices, values, accumulate): + indices_str = f"{self.open_bracket}{', '.join(indices)}{self.closed_bracket}" + args = [x, indices_str, values, accumulate] + self.writeline(self.wrap_kernel_call(kernel, args)) + + 
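+
+    # Editor's note (illustrative sketch, not upstream code): the generate_*
+    # helpers above and below write plain Python source into the wrapper's
+    # call() body. The emitted code typically looks like the lines sketched
+    # here; the buffer/argument names (buf0, arg0_1, stream0) and the fused
+    # Triton kernel name are hypothetical and depend on the lowered graph.
+    #
+    #   buf0 = empty_strided_cuda((64, 64), (64, 1), torch.float32)
+    #   extern_kernels.mm(arg0_1, arg1_1, out=buf0)
+    #   stream0 = get_raw_stream(0)
+    #   triton_poi_fused_relu_0.run(buf0, 4096, grid=grid(4096), stream=stream0)
+    #   del arg0_1, arg1_1
+    #   return (buf0, )
+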
def generate_extern_kernel_alloc_and_find_schema_if_needed( + self, + name, + kernel, + codegen_args, + cpp_op_schema, + cpp_kernel_key, + cpp_kernel_overload_name="", + op_overload=None, + raw_args=None, + outputs=None, + ): + self.writeline(f"{name} = {kernel}({', '.join(codegen_args)})") + + def generate_inf_and_nan_checker(self, node): + # TODO: Add check for python too. + pass + + @dynamo_timed + def generate(self, is_inference): + if config.profile_bandwidth: + self.write_triton_header_once() + result = IndentedBuffer() + result.splice(self.header) + + with contextlib.ExitStack() as stack: + stack.enter_context(self.wrapper_call.indent()) + if config.profiler_mark_wrapper_call: + self.generate_profiler_mark_wrapper_call(stack) + if config.profile_bandwidth: + self.generate_start_graph() + + # We disable planning during training because it presently increases peak memory consumption. + if is_inference and config.memory_planning: + self.memory_plan() + # TODO: integrate memory planning & stack allocation? + self.allow_stack_allocation = False + else: + self.memory_plan_reuse() + + if config.triton.store_cubin: + self.generate_reset_kernel_saved_flags() + + for line in self.lines: + if isinstance(line, WrapperLine): + line.codegen(self.wrapper_call) + else: + self.wrapper_call.writeline(line) + + output_refs = self.get_output_refs() + self.mark_output_type() + if config.triton.debug_sync_graph: + self.wrapper_call.writeline(V.graph.device_ops.synchronize()) + + if config.profile_bandwidth: + self.generate_end_graph() + + if config.triton.store_cubin: + self.generate_save_uncompiled_kernels() + + self.generate_return(output_refs) + + self.finalize_prefix() + result.splice(self.prefix) + + with result.indent(): + result.splice(self.wrapper_call) + + self.generate_before_suffix(result) + result.splice(self.suffix) + + self.generate_end(result) + + self.add_benchmark_harness(result) + + return result.getvaluewithlinemap() + + def memory_plan(self): + from .memory_planning import MemoryPlanner + + self.lines = MemoryPlanner(self).plan(self.lines) + + def memory_plan_reuse(self): + out_names = V.graph.get_output_names() + + while ( + self.lines + and isinstance(self.lines[-1], MemoryPlanningLine) + # TODO: this seems legit, NullLine has no node + and self.lines[-1].node.name not in out_names # type: ignore[attr-defined] + ): + # these lines will be pointless + self.lines.pop() + + # codegen allocations in two passes + planning_states = [MemoryPlanningState()] + past_planning_states = [] + for i in range(len(self.lines)): + line = self.lines[i] + if isinstance(line, MemoryPlanningLine): + self.lines[i] = line.plan(planning_states[-1]) + elif isinstance(line, EnterSubgraphLine): + planning_states.append(MemoryPlanningState()) + elif isinstance(line, ExitSubgraphLine): + past_planning_states.append(planning_states.pop()) + past_planning_states.append(planning_states.pop()) + assert len(planning_states) == 0 + + # conservatively use the sum of all allocated buffer sizes + # in potentially nested scopes as the total allocated size + total_allocated_buffer_size = sum( + s.total_allocated_buffer_size for s in past_planning_states + ) + + self.allow_stack_allocation = ( + self.allow_stack_allocation is not False + and config.allow_stack_allocation + and total_allocated_buffer_size <= MAX_STACK_ALLOCATION_SIZE + ) + + def codegen_input_size_var_decl(self, code: IndentedBuffer, name): + code.writeline(f"{self.declare}{name}_size = {name}.{self.size}{self.ending}") + + def 
codegen_input_stride_var_decl(self, code: IndentedBuffer, name): + code.writeline( + f"{self.declare}{name}_stride = {name}.{self.stride}{self.ending}" + ) + + def codegen_inputs( + self, code: IndentedBuffer, graph_inputs: Dict[str, ir.TensorBox] + ): + """Assign all symbolic shapes to locals""" + + @functools.lru_cache(None) + def sizeof(name): + self.codegen_input_size_var_decl(code, name) + return f"{name}_size" + + @functools.lru_cache(None) + def strideof(name): + self.codegen_input_stride_var_decl(code, name) + return f"{name}_stride" + + # Assign all symbolic shapes needed to local variables + needed = V.graph.sizevars.free_symbols() + + def is_expr(x): + return isinstance(x[1], sympy.Expr) + + graph_inputs_expr = list(filter(is_expr, graph_inputs.items())) + graph_inputs_tensors = list( + filter(lambda x: not is_expr(x), graph_inputs.items()) + ) + + for name, shape in graph_inputs_expr: + shape = V.graph.sizevars.simplify(shape) # type: ignore[arg-type] + if shape in needed: + needed.remove(shape) # type: ignore[arg-type] + code.writeline(f"{self.declare}{shape} = {name}{self.ending}") + + for name, value in graph_inputs_tensors: + shapes = value.get_size() + for dim, shape in enumerate(shapes): + shape = V.graph.sizevars.simplify(shape) # type: ignore[arg-type] + if shape in needed: + needed.remove(shape) # type: ignore[arg-type] + code.writeline( + f"{self.declare}{shape} = {sizeof(name)}[{dim}]{self.ending}" + ) + + for name, value in graph_inputs_tensors: + shapes = value.get_stride() + for dim, shape in enumerate(shapes): + shape = V.graph.sizevars.simplify(shape) # type: ignore[arg-type] + if shape in needed: + needed.remove(shape) # type: ignore[arg-type] + code.writeline( + f"{self.declare}{shape} = {strideof(name)}[{dim}]{self.ending}" + ) + + def ensure_size_computed(self, sym: sympy.Symbol): + if isinstance(sym, sympy.Symbol) and sym.name.startswith("ps"): + if sym in self.computed_sizes: + return + self.computed_sizes.add(sym) + expr = V.graph.sizevars.inv_precomputed_replacements[sym] + self.writeline( + f"{self.declare}{sym} = {self.expr_printer(expr)}{self.ending}" + ) + + def finalize_prefix(self): + pass + + def codegen_python_sizevar(self, x: Expr) -> str: + return pexpr(V.graph.sizevars.simplify(x)) + + def codegen_sizevar(self, x: Expr) -> str: + return self.codegen_python_sizevar(x) + + def codegen_tuple_access(self, basename: str, name: str, index: str) -> str: + return f"{basename}[{index}]" + + def codegen_python_shape_tuple(self, shape: Tuple[Expr, ...]) -> str: + parts = list(map(self.codegen_python_sizevar, shape)) + if len(parts) == 0: + return "()" + if len(parts) == 1: + return f"({parts[0]}, )" + return f"({', '.join(parts)})" + + def codegen_shape_tuple(self, shape: Tuple[Expr, ...]) -> str: + return self.codegen_python_shape_tuple(shape) + + def codegen_alloc_from_pool(self, name, offset, dtype, shape, stride) -> str: + return "alloc_from_pool({})".format( + ", ".join( + [ + name, + pexpr(offset), # bytes not numel + str(dtype), + self.codegen_shape_tuple(shape), + self.codegen_shape_tuple(stride), + ] + ) + ) + + def codegen_reinterpret_view(self, data, size, stride, offset, writer) -> str: + size = self.codegen_shape_tuple(size) + stride = self.codegen_shape_tuple(stride) + offset = self.codegen_sizevar(offset) + return f"reinterpret_tensor({data.get_name()}, {size}, {stride}, {offset})" + + def codegen_device_copy(self, src, dst): + self.writeline(f"{dst}.copy_({src})") + + def codegen_multi_output(self, name, value): + 
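+        # (editor's note) in the Python wrapper, declare/ending are empty
+        # strings, so this emits e.g. `buf2 = buf1[0]` to bind one output of a
+        # multi-output op to its own name; C++ wrapper subclasses override
+        # declare/ending to turn the same template into a declaration/statement.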
self.writeline(f"{self.declare}{name} = {value}{self.ending}") + + def codegen_dynamic_scalar(self, node): + (data,) = (t.codegen_reference() for t in node.inputs) + if node.is_bool: + self.writeline(f"{node.sym} = 1 if {data}.item() else 0") + else: + self.writeline(f"{node.sym} = {data}.item()") + # No one should ever use this buffer, but for uniformity + # define the variable and assign it None + self.writeline(f"{node.get_name()} = None") + + def benchmark_compiled_module(self, output): + def add_fake_input(name, shape, stride, device, dtype): + output.writeline( + f"{name} = rand_strided(" + f"{self.codegen_python_shape_tuple(shape)}, " + f"{self.codegen_python_shape_tuple(stride)}, " + f"device='{device}', dtype={dtype})" + ) + + def add_expr_input(name, val): + output.writeline(f"{name} = {val}") + + output.writelines( + ["", "", "def benchmark_compiled_module(times=10, repeat=10):"] + ) + with output.indent(): + output.splice( + """ + from torch._dynamo.testing import rand_strided + from torch._inductor.utils import print_performance + """, + strip=True, + ) + + for name, value in V.graph.constants.items(): + # all the constants are global variables, that's why we need + # these 'global var_name' lines + output.writeline(f"global {name}") + add_fake_input( + name, value.size(), value.stride(), value.device, value.dtype + ) + + for name, value in V.graph.graph_inputs.items(): + if isinstance(value, sympy.Symbol) and isinstance( + V.graph.sizevars.var_to_val.get(value, None), SingletonInt + ): + # Inductor should only work with dense -> dense graph, and + # SingletonInts belong to metadata that should only live on + # the subclass. + continue + if isinstance(value, sympy.Expr): # Don't need to add symbolic + add_expr_input(name, V.graph.sizevars.size_hint(value)) + else: + shape = [V.graph.sizevars.size_hint(x) for x in value.get_size()] + stride = [V.graph.sizevars.size_hint(x) for x in value.get_stride()] + add_fake_input( + name, shape, stride, value.get_device(), value.get_dtype() + ) + + call_str = f"call([{', '.join(V.graph.graph_inputs.keys())}])" + output.writeline(f"fn = lambda: {call_str}") + output.writeline("return print_performance(fn, times=times, repeat=repeat)") + + def add_benchmark_harness(self, output): + """ + Append a benchmark harness to generated code for debugging + """ + if not config.benchmark_harness: + return + + self.benchmark_compiled_module(output) + + output.writelines(["", "", 'if __name__ == "__main__":']) + with output.indent(): + output.writelines( + [ + "from torch._inductor.wrapper_benchmark import compiled_module_main", + f"compiled_module_main('{get_benchmark_name()}', benchmark_compiled_module)", + ] + ) + + def define_kernel( + self, name: str, kernel: str, metadata: Optional[str] = None, cuda=True + ): + metadata_comment = f"{metadata}\n" if metadata else "" + self.header.splice(f"\n\n{metadata_comment}{name} = {kernel}") + + def define_user_defined_triton_kernel(self, kernel, configs, kwargs): + original_name = kernel.__name__ + + from .common import KernelArgType, SizeArg, TensorArg + + signature: List[KernelArgType] = [] + constants: Dict[int, Any] = {} + non_constant_indices = [] + equal_to_1_arg_idx: List[int] = [] + for idx, key in enumerate(kernel.arg_names): + if key not in kwargs: + continue + arg = kwargs[key] + if idx in kernel.constexprs: + constants[idx] = arg + else: + non_constant_indices.append(idx) + if isinstance(arg, ir.Buffer): + signature.append( + TensorArg( + name=key, + buffer=arg.get_name(), + dtype=arg.get_dtype(), 
+ ) + ) + elif isinstance(arg, ir.ReinterpretView): + # for ReinterpretView we use the underlying + # buffer name and note the (possibly non-zero) + # offset relative to the underlying buffer + signature.append( + TensorArg( + name=key, + buffer=arg.data.get_name(), + dtype=arg.get_dtype(), + offset=arg.layout.offset, + ) + ) + else: + signature.append(SizeArg(key, arg)) + if arg is not None and V.graph.sizevars.statically_known_equals(arg, 1): # type: ignore[arg-type] + equal_to_1_arg_idx.append(idx) + index_dtype = "tl.int32" + triton_meta = { + "signature": signature_to_meta( + signature, + size_dtype=index_dtype, + indices=non_constant_indices, + ), + "device": V.graph.scheduler.current_device.index, + "device_type": V.graph.scheduler.current_device.type, + # Triton compiler includes equal_to_1 args into constants even + # when they are not constexpr. otherwise there may be a segfault + # during launching the Inductor-compiled Triton kernel. + # TODO(aakhundov): add None args to constants, too. currently, this + # causes CUDA errors in test_aot_inductor.test_triton_kernel_with_none_input. + # https://github.com/pytorch/pytorch/issues/120478#issuecomment-1962822307 + # https://github.com/openai/triton/blob/231efe9ed2d200be0f69a07c298e4342b08efe3d/python/triton/runtime/jit.py#L384 + "constants": { + **constants, + **{idx: 1 for idx in equal_to_1_arg_idx}, + }, + "configs": [ + config_of( + signature, + indices=non_constant_indices, + ) + ], + } + + # Distinguish between different functions using function id + cache_key: List[Any] = [id(kernel.fn)] + if len(configs) > 0: + for arg in kwargs.values(): + # We need to key on non tensor arg only in autotune mode + if not isinstance(arg, (ir.Buffer, ir.ReinterpretView)): + cache_key.append(arg) + cache_key.append(str(triton_meta)) + cache_key = tuple(cache_key) + + if cache_key in self.user_defined_kernel_cache: + return self.user_defined_kernel_cache[cache_key] + + name = f"{original_name}_{len(self.user_defined_kernel_cache)}" + # Add to the cache for the next use + self.user_defined_kernel_cache[cache_key] = (name, triton_meta) + + compile_wrapper = IndentedBuffer() + compile_wrapper.writeline(f"async_compile.triton({original_name!r}, '''") + + from .triton import gen_common_triton_imports + + compile_wrapper.splice(gen_common_triton_imports()) + + inductor_meta = { + "kernel_name": name, + "backend_hash": torch.utils._triton.triton_hash_with_backend(), + } + + configs = [ + { + "kwargs": config.kwargs, + "num_warps": config.num_warps, + "num_stages": config.num_stages, + } + for config in configs + ] + + compile_wrapper.splice( + f""" + @triton_heuristics.user_autotune( + configs={configs!r}, + inductor_meta={inductor_meta!r}, + triton_meta={triton_meta!r}, + filename=__file__, + custom_kernel=True, + ) + @triton.jit + """ + ) + compile_wrapper.splice(kernel.src, strip=True) + + # Also include any possible kernel being called indirectly + from triton import JITFunction + + symbols_included = {original_name} + + def traverse(cur_kernel): + for symbol_name in cur_kernel.fn.__code__.co_names: + if symbol_name in symbols_included: + continue + if symbol_name in cur_kernel.fn.__globals__: + symbol = cur_kernel.fn.__globals__[symbol_name] + if isinstance(symbol, JITFunction): + compile_wrapper.newline() + compile_wrapper.writeline("@triton.jit") + compile_wrapper.splice(symbol.src, strip=True) + symbols_included.add(symbol_name) + traverse(symbol) + elif isinstance(symbol, (int, str, bool)): + compile_wrapper.newline() + 
compile_wrapper.writeline(f"{symbol_name} = {symbol!r}") + symbols_included.add(symbol_name) + + traverse(kernel) + + compile_wrapper.writeline( + f"''', device_str='{V.graph.scheduler.current_device.type}')" + ) + _, lineno = inspect.getsourcelines(kernel.fn) + srcfile = inspect.getsourcefile(kernel.fn) + metadata = f"# Original path: {srcfile}:{lineno}" + self.define_kernel( + name, + compile_wrapper.getvalue(), + metadata, + ) + return name, triton_meta + + def generate_numel_expr(self, kernel_name: str, tree): + expr = f"{kernel_name}_{tree.prefix}numel" + if (expr, V.graph) not in self.kernel_numel_expr: + # declare expr once in each graph (scope) + self.kernel_numel_expr.add((expr, V.graph)) + self.writeline( + f"{self.declare}{expr} = {self.expr_printer(tree.numel)}{self.ending}" + ) + else: + self.writeline(f"{expr} = {self.expr_printer(tree.numel)}{self.ending}") + # We can get symbolic expressions here, like s0*64 + # It is fine to have them here, but we need to handle them correctly as their own type + # This is tricky to do, so we wrap in a custom type, distinct from scalars, but also from sympy* + # scalars as well. + # This is handled in `generate_args_decl` which has a correct comment of: TODO: only works for + # constant now, need type info. I agree, this needs type info, and while this is not true type info + # it suffices as a type hint for the purposes of producing the correct code for this type. + return SymbolicCallArg(expr, tree.numel) + + def generate_workspace_allocation(self, nbytes, device, zero_fill): + line = self.make_allocation( + "workspace", device, torch.uint8, shape=(nbytes,), stride=(1,) + ) + self.writeline(line) + if zero_fill: + self.writeline(f"workspace.zero_(){self.ending}") + + def wrap_kernel_call(self, name, call_args): + return f"{name}({', '.join(call_args)}){self.ending}" + + def generate_profiler_mark_wrapper_call(self, stack): + self.wrapper_call.writeline("from torch.profiler import record_function") + self.wrapper_call.writeline( + f"with record_function('graph_{V.graph.graph_id}_inductor_wrapper_call'):" + ) + stack.enter_context(self.wrapper_call.indent()) + + def generate_start_graph(self): + self.wrapper_call.writeline("start_graph()") + + def generate_end_graph(self): + self.wrapper_call.writeline("end_graph()") + + def generate_reset_kernel_saved_flags(self): + self.wrapper_call.splice( + """ + for kernel in globals().values(): + if isinstance(kernel, torch._inductor.triton_heuristics.CachingAutotuner): + kernel.cuda_kernel_saved = False + """ + ) + + def generate_save_uncompiled_kernels(self): + """ + Precompile and save the CUBINs of the Triton kernels that haven't + been precompiled and saved as a side effect of running the generated + JIT model (Python wrapper). This can happen when the model contains + control flow: only one pass through the control flow operators covers + the kernels that are saved, the remaining kernels are not launched, + hence not saved. The main purpose of this codegen is to compile and + save the Triton kernels outside the active control flow path for + subsequent AOTInductor code generation and compilation. 
+ """ + self.wrapper_call.splice( + """ + for kernel in globals().values(): + if isinstance(kernel, torch._inductor.triton_heuristics.CachingAutotuner): + if not kernel.cuda_kernel_saved: + if len(kernel.launchers) == 0: + kernel.precompile() + kernel.save_cuda_kernel( + grid=(0, 0, 0), # use dummy grid + stream="stream", # use dummy stream + launcher=kernel.launchers[0], + ) + """ + ) + + def generate_default_grid(self, name: str, grid_args: List[Any]): + return grid_args + + def generate_kernel_call( + self, + name, + call_args, + grid=None, + device_index=None, + cuda=True, + triton=True, + arg_types=None, + grid_fn: str = "grid", + triton_meta=None, + ): + """ + Generates kernel call code. + + cuda: Defines whether the backend is GPU. Otherwise the backend is CPU. + + triton: Defines whether the GPU backend uses Triton for codegen. + Otherwise it uses the CUDA language for codegen. + Only valid when cuda == True. + """ + if cuda: + call_args_str = ", ".join(pexpr(item) for item in call_args) + stream_name = self.write_get_raw_stream( + V.graph.scheduler.current_device.index, V.graph + ) + if triton: + grid_str = ", ".join(pexpr(item) for item in grid) + grid_str = f"{grid_fn}({grid_str})" + self.writeline( + f"{name}.run({call_args_str}, grid={grid_str}, stream={stream_name})" + ) + else: + stream_ptr = f"c_void_p({stream_name})" + self.writeline(f"{name}.{name}({call_args_str}, {stream_ptr})") + else: + self.writeline(self.wrap_kernel_call(name, call_args)) + + def writeline(self, line): + self.lines.append(line) + + def enter_context(self, ctx): + self.lines.append(LineContext(ctx)) + + def val_to_cpp_arg_str(self, type_, val, is_legacy_abi) -> str: + raise NotImplementedError() + + def val_to_arg_str(self, s): + if isinstance(s, SymTypes): + return pexpr(sympy.expand(repr(s))) + elif isinstance(s, sympy.Expr): + return pexpr(s) + elif isinstance(s, (tuple, list)): + + @dataclasses.dataclass + class Shim: + ref: Any + + def __repr__(self): + return self.ref + + return repr(type(s)(Shim(self.val_to_arg_str(a)) for a in s)) + elif isinstance(s, torch._ops.OpOverload): + return _get_qualified_name(s) + elif isinstance(s, (ir.Buffer, ReinterpretView)): + return s.codegen_reference() + else: + return repr(s) + + # The following methods are for memory management + def make_buffer_allocation(self, buffer): + device = buffer.get_device() + dtype = buffer.get_dtype() + shape = tuple(buffer.get_size()) + stride = tuple(buffer.get_stride()) + return self.make_allocation(buffer.get_name(), device, dtype, shape, stride) + + def make_allocation(self, name, device, dtype, shape, stride): + if device.type in ("cpu", "cuda"): + # optimized path for faster allocations, saving ~2us versus the stuff below + return ( + f"{name} = empty_strided_{device.type}(" + f"{self.codegen_shape_tuple(shape)}, " + f"{self.codegen_shape_tuple(stride)}, " + f"{dtype})" + ) + # all other devices: + return ( + f"{name} = empty_strided(" + f"{self.codegen_shape_tuple(shape)}, " + f"{self.codegen_shape_tuple(stride)}, " + f"device='{device.type}', dtype={dtype})" + ) + + def make_tensor_alias(self, new_name, old_name, comment=""): + return f"{self.declare}{new_name} = {old_name}{self.ending} {self.comment} {comment}" + + def make_buffer_free(self, buffer): + return f"del {buffer.get_name()}" + + def make_free_by_names(self, names_to_del: List[str]): + return f"del {', '.join(name for name in names_to_del)}" + + def codegen_exact_buffer_reuse(self, old_name: str, new_name: str, del_line: str): + return 
f"{self.declare_maybe_reference}{new_name} = {old_name}{del_line}{self.ending} {self.comment} reuse" + + def make_buffer_reuse(self, old, new, delete_old: bool): + assert old.get_dtype() == new.get_dtype() + old_name = old.get_name() + new_name = new.get_name() + del_line = ";" + if old_name not in V.graph.get_output_names() and delete_old: + del_line = f"; {self.make_buffer_free(old)}" + + if old.get_size() == new.get_size() and old.get_stride() == new.get_stride(): + if old_name in self.stack_allocated_buffers: + self.stack_allocated_buffers[new_name] = new + return self.codegen_exact_buffer_reuse(old_name, new_name, del_line) + + reinterpret_view = self.codegen_reinterpret_view( + old, new.get_size(), new.get_stride(), 0, self.wrapper_call + ) + if reinterpret_view in self.stack_allocated_buffers: + self.stack_allocated_buffers[new_name] = new + return f"{self.declare_maybe_reference}{new_name} = {reinterpret_view}{del_line} {self.comment} reuse" + + def codegen_deferred_allocation(self, name, layout): + self.writeline( + DeferredLine( + name, + f"{self.declare_maybe_reference}{name} = {layout.view.codegen_reference()}{self.ending} " + f"{self.comment} alias", + ) + ) + + def codegen_allocation(self, buffer): + assert ( + buffer.get_workspace_size() == 0 + ), "Only support zero workspace size for now!" + + name = buffer.get_name() + + if name in V.graph.removed_buffers or name in self.allocated: + return + self.allocated.add(name) + if isinstance( + buffer, + (ir.ExternKernelAlloc, ir.MultiOutput), + ): + return + + layout = buffer.get_layout() + if isinstance(layout, ir.MutationLayout): + return + if isinstance(layout, ir.AliasedLayout): + assert isinstance( + layout.view, ir.ReinterpretView + ), f"unexpected {type(layout.view)}: {layout.view}" + self.codegen_allocation(layout.view.data) + self.codegen_deferred_allocation(name, layout) + return + + self.writeline(AllocateLine(self, buffer)) + + def codegen_free(self, buffer): + assert ( + buffer.get_workspace_size() == 0 + ), "Only support zero workspace size for now!" + + name = buffer.get_name() + + # can be freed but not reused + if isinstance(buffer, ir.InputBuffer): + self.writeline(self.make_buffer_free(buffer)) + return + + if not self.can_reuse(buffer): + return + self.freed.add(name) + + self.writeline(FreeIfNotReusedLine(self, buffer)) + + def can_reuse(self, input_buffer, output_buffer=None): + name = input_buffer.get_name() + if ( + name in V.graph.removed_buffers + or name in V.graph.graph_inputs + or name in V.graph.constants + or name in V.graph.never_reuse_buffers + or name in self.freed + ): + return False + + return True + + def did_reuse(self, buffer, reused_buffer): + # Check whether a given buffer was reused by a possible reuser in the wrapper codegen + # Can be consulted from inside ir codegen, e.g. 
to determine whether a copy is needed + return ( + buffer.get_name() in self.reuses + and self.reuses[buffer.get_name()] == reused_buffer.get_name() + ) + + def codegen_inplace_reuse(self, input_buffer, output_buffer): + assert buffer_reuse_key(input_buffer) == buffer_reuse_key(output_buffer) + self.codegen_allocation(input_buffer) + self.freed.add(input_buffer.get_name()) + self.allocated.add(output_buffer.get_name()) + self.reuses[output_buffer.get_name()] = input_buffer.get_name() + self.writeline(ReuseLine(self, input_buffer, output_buffer)) + + def codegen_unbacked_symbol_decl(self, symbol): + name = str(symbol) + if name in self.unbacked_symbol_decls: + return name + else: + # When in CppWrapperCpu, we should only generate the declaration once + self.unbacked_symbol_decls.add(name) + return self.declare + name + + def codegen_subgraph_prefix(self, subgraph, outer_inputs, outer_outputs): + for inner_input, outer_input in zip(subgraph.graph.graph_inputs, outer_inputs): + self.writeline(f"{self.declare}{inner_input} = {outer_input}{self.ending}") + + def codegen_subgraph_suffix(self, subgraph, outer_inputs, outer_outputs): + for inner_output, outer_output in zip( + subgraph.graph.graph_outputs, outer_outputs + ): + self.writeline( + f"{outer_output} = {inner_output.codegen_reference()}{self.ending}" + ) + + def codegen_subgraph(self, subgraph, outer_inputs, outer_outputs): + try: + self.push_codegened_graph(subgraph.graph) + self.writeline(f"{self.comment} subgraph: {subgraph.name}") + self.codegen_subgraph_prefix(subgraph, outer_inputs, outer_outputs) + parent_graph = V.graph + with V.set_graph_handler(subgraph.graph): + subgraph.graph.codegen_subgraph( + parent_graph=parent_graph, + ) + self.codegen_subgraph_suffix(subgraph, outer_inputs, outer_outputs) + finally: + self.pop_codegened_graph() + + def codegen_conditional(self, conditional): + name = conditional.get_name() + outer_inputs = [buf.codegen_reference() for buf in conditional.operands] + outer_outputs = [f"{name}[{i}]" for i in range(len(conditional.outputs))] + + self.writeline(f"{name} = [None] * {len(conditional.outputs)}") + self.writeline(f"if {conditional.predicate.codegen_reference()}.item():") + self.writeline(EnterSubgraphLine(self, conditional.true_subgraph.graph)) + self.codegen_subgraph(conditional.true_subgraph, outer_inputs, outer_outputs) + self.writeline(ExitSubgraphLine(self)) + self.writeline("else:") + self.writeline(EnterSubgraphLine(self, conditional.false_subgraph.graph)) + self.codegen_subgraph(conditional.false_subgraph, outer_inputs, outer_outputs) + self.writeline(ExitSubgraphLine(self)) + + @staticmethod + def statically_known_int_or_none(x): + try: + val = V.graph._shape_env._maybe_evaluate_static(x) + return int(x) + except Exception: + return None + + @staticmethod + def statically_known_list_of_ints_or_none(lst): + result = [] + for x in lst: + num = WrapperCodeGen.statically_known_int_or_none(x) + if num is None: + return None + result.append(num) + return result + + @staticmethod + def is_statically_known_list_of_ints(lst): + return WrapperCodeGen.statically_known_list_of_ints_or_none(lst) is not None + + @staticmethod + def static_shape_for_buffer_or_none(buffer): + return WrapperCodeGen.statically_known_list_of_ints_or_none(buffer.get_size()) + + @staticmethod + def can_prove_buffer_has_static_shape(buffer): + return WrapperCodeGen.static_shape_for_buffer_or_none(buffer) is not None diff --git a/MLPY/Lib/site-packages/torch/_inductor/comm_analysis.py 
b/MLPY/Lib/site-packages/torch/_inductor/comm_analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..0f7f1d8f336bbc980afb949ee06492cfdcc04635 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/comm_analysis.py @@ -0,0 +1,273 @@ +import math +from enum import IntEnum + +import sympy + +import torch +from . import ir + +from .utils import get_dtype_size, sympy_product +from .virtualized import V + + +class NCCL_COLL(IntEnum): + ALL_REDUCE = 0 + ALL_GATHER = 1 + REDUCE_SCATTER = 2 + + +class NVIDIA_GPU_TYPE(IntEnum): + VOLTA = 0 + AMPERE = 1 + HOPPER = 2 + + +def get_gpu_type() -> NVIDIA_GPU_TYPE: + gpu_info = torch.utils.collect_env.get_gpu_info(torch.utils.collect_env.run) or "" + if "V100" in gpu_info: + return NVIDIA_GPU_TYPE.VOLTA + elif "A100" in gpu_info: + return NVIDIA_GPU_TYPE.AMPERE + elif "H100" in gpu_info: + return NVIDIA_GPU_TYPE.HOPPER + else: + # for other gpu types, assume Ampere + return NVIDIA_GPU_TYPE.AMPERE + + +def get_collective_type(node: ir.IRNode) -> NCCL_COLL: + if isinstance(node, ir._CollectiveKernel): + kernel_name = node.python_kernel_name + assert kernel_name is not None + if "all_reduce" in kernel_name: + return NCCL_COLL.ALL_REDUCE + elif "all_gather" in kernel_name: + return NCCL_COLL.ALL_GATHER + elif "reduce_scatter" in kernel_name: + return NCCL_COLL.REDUCE_SCATTER + else: + raise Exception(f"Unsupported collective kernel: {kernel_name}") + + if isinstance(node, (ir.AllReduce, ir.AllReduceCoalesced)): + return NCCL_COLL.ALL_REDUCE + elif isinstance(node, (ir.AllGatherIntoTensor, ir.AllGatherIntoTensorCoalesced)): + return NCCL_COLL.ALL_GATHER + elif isinstance(node, (ir.ReduceScatterTensor, ir.ReduceScatterTensorCoalesced)): + return NCCL_COLL.REDUCE_SCATTER + else: + raise Exception(f"Unsupported collective type: {node}") + + +def get_collective_input_size_bytes(node: ir.IRNode) -> int: + sz_bytes = 0 + for inp in node.inputs: # type: ignore[attr-defined] + shape = inp.layout.size + numel = sympy_product(inp.layout.size) + if isinstance(numel, sympy.Integer): + # For ease of testing + numel = int(numel) + else: + numel = V.graph.sizevars.size_hint(numel) + sz_bytes += numel * get_dtype_size(inp.layout.dtype) + return sz_bytes + + +def get_collective_group_size(node: ir.IRNode) -> int: + if type(node) == ir._CollectiveKernel: + from torch.distributed.distributed_c10d import _get_group_size_by_name + + return _get_group_size_by_name(node.constant_args[-1]) + elif isinstance(node, ir.CollectiveKernel): + return node.constant_args[2] # type: ignore[attr-defined] + else: + raise TypeError(f"Unsupported collective type: {node}") + + +#################################################################################################################### +# The following code and constants are adapted from https://github.com/NVIDIA/nccl/blob/master/src/graph/tuning.cc # +#################################################################################################################### + + +class NCCL_HW(IntEnum): + NVLINK = 0 + PCI = 1 + NET = 2 + + +class NCCL_ALGO(IntEnum): + TREE = 0 + RING = 1 + + +class NCCL_PROTO(IntEnum): + # The ordering and enum values here matches original in + # https://github.com/NVIDIA/nccl/blob/0b083e52096c387bad7a5c5c65b26a9dca54de8c/src/include/devcomm.h#L28 + # For difference between these protocols, see https://github.com/NVIDIA/nccl/issues/281#issuecomment-571816990 + LL = 0 # Low-latency + # LL128 = 1 # Low-latency 128-byte + # SIMPLE = 2 + + +# Latencies in us +# len(NCCL_ALGO) x 
len(NCCL_PROTO) +# NOTE: use array instead of tensor to prevent incompatibility with fake mode +baseLat = [ + # Tree + [ + 6.8, # LL + ], + # Ring + [ + 6.6, # LL + ], +] + +# Latencies in us +# len(NCCL_HW) x len(NCCL_ALGO) x len(NCCL_PROTO) +hwLat = [ + # NVLINK + [ + [0.6], # Tree (LL) + [0.6], # Ring (LL) + ], + # PCI + [ + [1.0], # Tree (LL) + [1.0], # Ring (LL) + ], + # NET + [ + [5.0], # Tree (LL) + [2.7], # Ring (LL) + ], +] + + +# LL128 max BW per channel +llMaxBws = [ + # Volta-N1/Intel-N2/Intel-N4 + [ + 39.0, + 39.0, + 20.4, + ], + # Ampere-N1/AMD-N2/AMD-N4 + [ + 87.7, + 22.5, # avg of ring & tree + 19.0, + ], + # Hopper-N1/AMD-N2/AMD-N4 + [ + 87.7, + 22.5, # avg of ring & tree + 19.0, + ], +] + + +def estimate_nccl_collective_runtime(node: ir.IRNode) -> float: + """ + Returns estimated NCCL collective runtime in nanoseconds (ns). + + The following heuristics are copied from https://github.com/NVIDIA/nccl/blob/master/src/graph/tuning.cc. + We aim to estimate the runtime as accurately as possible. + + Assumptions: + - only ring algorithm (NCCL_ALGO_RING) is used + - only Low-Latency protocol (NCCL_PROTO_LL) is used, i.e. Simple or LL128 is not used + - 8 gpus per node # TODO: Need to find a way to get accurate "gpus per node" and "# nodes" info. + - collective is one of: allreduce, reducescatter, allgather + """ + tensor_storage_size_bytes = get_collective_input_size_bytes(node) + # Convert bytes to GB + tensor_storage_size_GB = tensor_storage_size_bytes / 1024 / 1024 / 1024 + + # Currently assumes each node has 8 gpus. And when >1 node is used, assumes each node uses all 8 gpus. + # TODO: Need to find a way to get accurate "gpus per node" and "# nodes" info. + num_gpus_per_node = 8 + group_size = get_collective_group_size(node) + nNodes = math.ceil(group_size / num_gpus_per_node) + nRanks = group_size # this is total # of gpus globally that participate in this collective op + + if nRanks <= 1: + return 0 + + # Assumes ring algorithm + nccl_algo = NCCL_ALGO.RING + nccl_proto = NCCL_PROTO.LL + coll = get_collective_type(node) + + # =============== bandwidth computation =============== + # First compute bandwidth in GB/s; then at the end, convert it to GB/ns + + bwIntra = torch._inductor.config.intra_node_bw + bwInter = torch._inductor.config.inter_node_bw + + compCapIndex = get_gpu_type() + index2 = nNodes - 1 if nNodes <= 2 else 2 + # LL: for single node, we look at GPU type; for multi-node, we look at CPU type + index1 = compCapIndex if nNodes == 1 else 0 + llMaxBw = llMaxBws[index1][index2] + + # NOTE: each step of ring algorithm is synchronized, + # and is bottlenecked by the slowest link which is the inter-node interconnect. + # hence when nNodes >= 2, bw is inter-node bandwidth. + # NOTE: the original code in https://github.com/NVIDIA/nccl/blob/master/src/graph/tuning.cc + # have this as `if nNodes <= 2` which seems wrong. Corrected it here. 
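+    # (editor's note, illustrative) in the single-node case below, busBw works
+    # out to nChannels * bwIntra = 2 * bwIntra before the refinements that
+    # follow; a multi-node job uses the slower inter-node bandwidth (bwInter)
+    # instead, since each synchronized ring step is bottlenecked by the
+    # slowest link.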
+ bw = bwIntra if nNodes == 1 else bwInter + nChannels = 2 # Assume # channels is 2 + busBw = nChannels * bw + + # Various model refinements + busBw = min( + llMaxBw, + busBw + * (1.0 / 4.0 if (nNodes > 1 or coll == NCCL_COLL.ALL_REDUCE) else 1.0 / 3.0), + ) + + if coll == NCCL_COLL.ALL_REDUCE: + nsteps = 2 * (nRanks - 1) + elif coll in (NCCL_COLL.REDUCE_SCATTER, NCCL_COLL.ALL_GATHER): + nsteps = nRanks - 1 + + # Convert bus BW to algorithm BW (tensor bytes / algoBW = actual execution time) + ratio = (1.0 * nRanks) / nsteps # type: ignore[possibly-undefined] + bandwidth = busBw * ratio + # Convert GB/s to GB/ns + bandwidth_GB_per_ns = bandwidth / 1e9 + + # =============== latency computation =============== + intraHw = NCCL_HW.NVLINK + hw = intraHw if nNodes == 1 else NCCL_HW.NET + + if coll == NCCL_COLL.ALL_REDUCE: + if nNodes > 1: + nInterSteps = 2 * nNodes + else: + nInterSteps = 0 + elif coll in (NCCL_COLL.REDUCE_SCATTER, NCCL_COLL.ALL_GATHER): + nInterSteps = nNodes - 1 + + # First compute latency in us; then at the end, convert it to ns + latency = baseLat[nccl_algo][nccl_proto] + intraLat = hwLat[intraHw][nccl_algo][nccl_proto] + interLat = hwLat[NCCL_HW.NET][nccl_algo][nccl_proto] + + # Inter-node rings still have to launch nsteps * net overhead. + netOverhead = 0.0 + if nNodes > 1: + netOverhead = 1.0 # getNetOverhead(comm); + intraLat = max(intraLat, netOverhead) + latency += (nsteps - nInterSteps) * intraLat + nInterSteps * interLat # type: ignore[possibly-undefined] + # Convert us to ns + latency_ns = latency * 1e3 + + # =============== final result =============== + transport_ns = tensor_storage_size_GB / bandwidth_GB_per_ns + return transport_ns + latency_ns + + +################################################################################################################ +# The above code and constants are adapted from https://github.com/NVIDIA/nccl/blob/master/src/graph/tuning.cc # +################################################################################################################ diff --git a/MLPY/Lib/site-packages/torch/_inductor/comms.py b/MLPY/Lib/site-packages/torch/_inductor/comms.py new file mode 100644 index 0000000000000000000000000000000000000000..9974fa428976b5b3283f828fec1b2c75534672ea --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/comms.py @@ -0,0 +1,363 @@ +# pyre-strict + +from typing import List + +import torch + +from . import config, ir, scheduler +from .dependencies import WeakDep +from .utils import tuple_sorted + +overlap_log = torch._logging.getArtifactLogger(__name__, "overlap") + + +def sink_waits( + snodes: List["scheduler.BaseSchedulerNode"], +) -> List["scheduler.BaseSchedulerNode"]: + """ + Greedily moves waits as late as possible (i.e. until we reach a use). Optimal in terms of + communication overlap. + """ + new_order = [] + cur_waits = set() + for snode in snodes: + if isinstance(snode.node, ir.Wait): + cur_waits.add(snode) + else: + for wait in tuple_sorted(cur_waits): + if snode in wait.node_users: + new_order.append(wait) + cur_waits.remove(wait) + new_order.append(snode) + new_order.extend(tuple_sorted(cur_waits)) + return new_order + + +def raise_comms( + snodes: List["scheduler.BaseSchedulerNode"], +) -> List["scheduler.BaseSchedulerNode"]: + """ + Greedily moves comms as early as possible (i.e. until we reach an input). + Optimal in terms of communication overlap. + + TODO: We might want to adjust this in the future to account for memory limitations. + e.g. 
when we are compiling FSDP, this heuristics will cause the all-gathers to be prefetched as soon as possible, + which is the beginning of the forwards pass. We'll have to either do a special pass for FSDP, + or we'll want to redo this pass with memory considerations so we handle the FSDP case in a general way. + """ + new_order_reversed: List["scheduler.BaseSchedulerNode"] = [] + cur_comms: List["scheduler.BaseSchedulerNode"] = [] + for snode in reversed(snodes): + if isinstance(snode.node, ir.CollectiveKernel): + cur_comms.append(snode) + else: + for comm in cur_comms: + assert len(comm.inverse_users) > 0 + while len(cur_comms) > 0 and any( + snode in comm.inverse_users for comm in cur_comms + ): + comm = cur_comms.pop(0) + new_order_reversed.append(comm) + new_order_reversed.append(snode) + assert len(cur_comms) <= 1 + new_order_reversed.extend(tuple_sorted(cur_comms)) + return new_order_reversed[::-1] + + +def get_ancestors(node): + ancestors = set() + cur_nodes = [node] + while len(cur_nodes) > 0: + new_nodes = [] + for node in cur_nodes: + for inp in node.inverse_users: + if inp not in ancestors: + ancestors.add(inp) + new_nodes.append(inp) + cur_nodes = new_nodes + return ancestors + + +def get_descendants(node): + descendants = set() + cur_nodes = [node] + while len(cur_nodes) > 0: + new_nodes = [] + for node in cur_nodes: + for inp in node.node_users: + if inp not in descendants: + descendants.add(inp) + new_nodes.append(inp) + cur_nodes = new_nodes + return descendants + + +def decide_global_ordering_of_comms(nodes: List["scheduler.BaseSchedulerNode"]): + """ + Decide global ordering of comms, by just enforcing the ordering that's in the input graph + (might not be the same ordering as the eager mode program). + TODO: Come up with a better approach + """ + comm_nodes = [n for n in nodes if isinstance(n.node, ir.CollectiveKernel)] + for i in range(1, len(comm_nodes)): + # Enforce ordering by making previous comm a `WeakDep` dependency of the next comm + comm_nodes[i].add_fake_dep(WeakDep(comm_nodes[i - 1].get_name())) + + +def assert_no_comm_nodes(snodes: List["scheduler.BaseSchedulerNode"]) -> None: + assert not any(isinstance(snode.node, ir.CollectiveKernel) for snode in snodes) + + +def estimate_op_runtime(snode: "scheduler.BaseSchedulerNode") -> float: + """ + Returns estimated op runtime in nanoseconds (ns) + """ + if config.estimate_op_runtime == "default": + runtime = snode.get_estimated_runtime() + else: + assert callable(config.estimate_op_runtime) + runtime = config.estimate_op_runtime(snode) + return runtime + + +def reorder_compute_for_overlap( + snodes: List["scheduler.BaseSchedulerNode"], +) -> List["scheduler.BaseSchedulerNode"]: + """ + Decides a global ordering of all compute and communication nodes, + assuming that we already have a global ordering of communication nodes. + + Overall scheduling procedure is: + Step 1: Given that we've currently scheduled comm N, we now schedule all compute nodes + that are required for comm N + 1 but do not depend on comm N, to run at the same time with comm N. + Step 2: If all those compute nodes are sufficient to overlap comm N, we're done. + Otherwise, we now need to look elsewhere to find compute that overlaps with comm N. + We prioritize compute nodes that are needed sooner. + Step 3: We schedule the compute nodes dependent on comm N and required for comm N + 1. + Step 4: We schedule comm N + 1. + Repeat this for subsequent comm nodes. 
+ """ + final_order = [] + + comm_nodes = [] + for snode in snodes: + if isinstance(snode.node, ir.CollectiveKernel): + comm_nodes.append(snode) + if len(comm_nodes) == 0: + # if there is no comm nodes, return the current order + return snodes + + comm_ancestors = {node: get_ancestors(node) for node in comm_nodes} + comm_descendants = {node: get_descendants(node) for node in comm_nodes} + + indeg = dict.fromkeys(snodes, 0) + for snode in snodes: + for user in snode.node_users: + if user in indeg: + indeg[user] += 1 + ready_to_schedule_nodes = {node for node in snodes if indeg[node] == 0} + + unscheduled_nodes = set() + unscheduled_nodes = set(snodes) + + def schedule_node(snode): + """ + Schedule a single node. + """ + assert snode in unscheduled_nodes + assert snode in ready_to_schedule_nodes + ready_to_schedule_nodes.remove(snode) + unscheduled_nodes.remove(snode) + final_order.append(snode) + for user in tuple_sorted(snode.node_users): + if user in indeg: + indeg[user] -= 1 + if indeg[user] == 0: + ready_to_schedule_nodes.add(user) + + def schedule_nodes(snodes): + """ + Schedules all nodes in `snodes` in an arbitrary topologically valid order. + """ + all_nodes = set(snodes) + assert all(node in unscheduled_nodes for node in all_nodes) + while len(all_nodes) > 0: + # NOTE: since model graph is always a DAG and does not have circular dependency inside, + # there should be at least one node that is a "free node" (i.e. indeg == 0), + # hence infinite loop is not possible. But we check here just to be safe. + progress = False + for node in tuple_sorted(all_nodes): + if node in ready_to_schedule_nodes: + schedule_node(node) + all_nodes.remove(node) + progress = True + if not progress: + raise Exception( + "Unable to find a free node (indeg == 0). This is an impossible state to reach. " + "Please report a bug to PyTorch." + ) + + # First, schedule all compute nodes that are required by first comm node, + # as well as the first comm node itself. + assert len(comm_nodes) > 0 + schedule_nodes( + list(comm_ancestors[comm_nodes[0]]) + [comm_nodes[0]], + ) + + rolled_over_compute_cost = 0 + for idx in range(1, len(comm_ancestors)): + # Step 1: Given that we've currently scheduled comm `idx-1`, we now schedule + # all compute nodes that are required for comm `idx` but do not depend on comm `idx-1`, + # to run at the same time with comm `idx-1`. + needed_by_next_comm_and_ready_compute_nodes = unscheduled_nodes & ( + comm_ancestors[comm_nodes[idx]] - comm_descendants[comm_nodes[idx - 1]] + ) + assert_no_comm_nodes(needed_by_next_comm_and_ready_compute_nodes) + + total_compute_runtime_cost = rolled_over_compute_cost + sum( + [ + estimate_op_runtime(node) + for node in needed_by_next_comm_and_ready_compute_nodes + ] + ) + prev_comm_runtime_cost = estimate_op_runtime(comm_nodes[idx - 1]) + schedule_nodes(tuple_sorted(needed_by_next_comm_and_ready_compute_nodes)) + + # Step 2: If all those compute nodes are sufficient to overlap comm `idx-1`, we're done. + # Otherwise, we now need to look elsewhere to find compute that overlaps with comm `idx`. + # We prioritize compute nodes that are needed sooner. + step1_runtime_cost = total_compute_runtime_cost + if step1_runtime_cost >= prev_comm_runtime_cost: + pass + else: + # Find all ready to schedule compute nodes that do not depend on comm `idx-1`. 
+ ready_to_schedule_compute_nodes = tuple_sorted( + ready_to_schedule_nodes - comm_descendants[comm_nodes[idx - 1]] + ) + assert_no_comm_nodes(ready_to_schedule_compute_nodes) + + def earliest_comm_descendant(node): + for idx in range(len(comm_nodes)): + if node in comm_ancestors[comm_nodes[idx]]: + return idx + return len(comm_nodes) + + # Prioritize compute nodes that are needed sooner. + ready_to_schedule_compute_nodes = sorted( + ready_to_schedule_compute_nodes, key=earliest_comm_descendant + ) + + for snode in ready_to_schedule_compute_nodes: + if total_compute_runtime_cost >= prev_comm_runtime_cost: + # If accumulated compute runtime cost is greater than comm `idx-1` runtime cost, + # it means we have maximized overlap for comm `idx-1`, and hence we stop looking + # for more compute to schedule. + break + compute_runtime_cost = estimate_op_runtime(snode) + # If we're not able to leverage more than half of this + # node's compute to overlap, we skip it. + # TODO: Smarter heuristics here + if ( + prev_comm_runtime_cost - total_compute_runtime_cost + ) <= compute_runtime_cost / 2: + continue + schedule_node(snode) + total_compute_runtime_cost += compute_runtime_cost + rollable_compute_cost = total_compute_runtime_cost - step1_runtime_cost + + # Step 3: We schedule the compute nodes dependent on comm `idx-1` and required for comm `idx`. + needed_by_next_comm_nodes = unscheduled_nodes & comm_ancestors[comm_nodes[idx]] + schedule_nodes(list(needed_by_next_comm_nodes)) + + # Step 4: We schedule comm `idx`. + schedule_nodes([comm_nodes[idx]]) + + is_prev_comm_blocking_next_comm = len(needed_by_next_comm_nodes) > 0 + # The idea here is that if there are no compute nodes from Step 3 + # (i.e. if prev comm is not blocking next comm), we can roll over the compute nodes + # in Step 2 to overlap with the next comm, since they're not required to finish + # before the next comm starts. + if is_prev_comm_blocking_next_comm: + rolled_over_compute_cost = 0 + else: + rolled_over_compute_cost = rollable_compute_cost # type: ignore[assignment] + + schedule_nodes(unscheduled_nodes) + return final_order + + +def node_summary(snode): + detail = "" + if isinstance(snode.node, ir.ExternKernelOut): + detail = f" ({snode.node.python_kernel_name})" + out_tensor_info = "" + if ( + hasattr(snode.node, "layout") + and hasattr(snode.node.layout, "size") + and hasattr(snode.node.layout, "stride") + ): + out_tensor_info = ( + f" (size={snode.node.layout.size}, stride={snode.node.layout.stride})" + ) + node_name = "" + if hasattr(snode.node, "name"): + node_name = snode.node.name + return f"{snode.node.__class__.__name__}{detail}{out_tensor_info} ({node_name})" + + +def visualize_overlap(order): + total_est_runtime: float = 0.0 + cur_comm_node = None + for snode in order: + if cur_comm_node is None: + if isinstance(snode.node, ir.CollectiveKernel): + total_est_runtime += estimate_op_runtime(snode) + cur_comm_node = snode.node + elif isinstance(snode.node, ir.Wait): + raise Exception( + "Wait is not expected when there is no collective running" + ) + else: # exposed compute op + total_est_runtime += estimate_op_runtime(snode) + overlap_log.debug(f"{node_summary(snode)}") # noqa: G004 + else: # cur_comm_node is not None + if isinstance(snode.node, ir.CollectiveKernel): + raise Exception( + "Found two collectives running at the same time. 
" + "`visualize_overlap` needs to be updated to handle this case" + ) + elif isinstance(snode.node, ir.Wait): # end of this comm op + overlap_log.debug(f"{node_summary(snode)}") # noqa: G004 + cur_comm_node = None + else: # overlapped compute op + overlap_log.debug(f"| {node_summary(snode)}") # noqa: G004 + overlap_log.debug( + f"Est. runtime (ms): {total_est_runtime / 1000 / 1000}" # noqa: G004 + ) + + +def reorder_compute_and_comm_for_overlap( + snodes: List["scheduler.BaseSchedulerNode"], +) -> List["scheduler.BaseSchedulerNode"]: + order = snodes + for p in config.reorder_for_compute_comm_overlap_passes: + if isinstance(p, str) and p in globals(): + p = globals()[p] # it is a builtin pass + if torch.distributed.get_rank() == 0: + overlap_log.debug( + f"==== Visualize overlap before reordering pass {p} ====" # noqa: G004 + ) + try: + visualize_overlap(order) + except Exception as e: + overlap_log.debug(str(e)) + order = p(order) # type: ignore[operator] + if torch.distributed.get_rank() == 0: + overlap_log.debug( + f"==== Visualize overlap after reordering pass {p} ====" # noqa: G004 + ) + try: + visualize_overlap(order) + except Exception as e: + overlap_log.debug(str(e)) + return order diff --git a/MLPY/Lib/site-packages/torch/_inductor/compile_fx.py b/MLPY/Lib/site-packages/torch/_inductor/compile_fx.py new file mode 100644 index 0000000000000000000000000000000000000000..a1f86d481e322d8a28fe8312d2b891fdae93124c --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/compile_fx.py @@ -0,0 +1,1451 @@ +import contextlib +import functools +import logging +import os +import sys +import time +import warnings +from itertools import count + +from typing import ( + Any, + Callable, + Dict, + FrozenSet, + List, + Optional, + Sequence, + Tuple, + Union, +) +from unittest import mock + +from functorch.compile import min_cut_rematerialization_partition + +import torch.fx +import torch.utils._pytree as pytree +from torch._dynamo import ( + compiled_autograd, + config as dynamo_config, + logging as dynamo_logging, + utils as dynamo_utils, +) +from torch._dynamo.utils import ( + counters, + detect_fake_mode, + lazy_format_graph_code, + optimus_scuba_log, +) +from torch._functorch.aot_autograd import aot_export_module, make_boxed_func +from torch._inductor.codecache import code_hash, CompiledFxGraph, FxGraphCache +from torch._inductor.cudagraph_utils import BoxedDeviceIndex + +from torch._inductor.debug import save_args_for_compile_fx_inner +from torch._inductor.utils import BoxedBool, count_tangents +from torch._logging import trace_structured +from torch._ops import OpOverload +from torch._subclasses.fake_tensor import FakeTensor +from torch._utils_internal import signpost_event +from torch.fx.passes.fake_tensor_prop import FakeTensorProp + +from .._dynamo.backends.common import aot_autograd +from ..fx._lazy_graph_module import _use_lazy_graph_module # type: ignore[attr-defined] +from ..fx.graph import _PyTreeCodeGen +from . 
import config, metrics +from .debug import DebugContext +from .decomposition import select_decomp_table +from .fx_passes.joint_graph import joint_graph_passes +from .fx_passes.post_grad import post_grad_passes, view_to_reshape +from .fx_passes.pre_grad import pre_grad_passes +from .graph import GraphLowering +from .ir import ExternKernelNode +from .utils import get_dtype_size, has_incompatible_cudagraph_ops, output_node +from .virtualized import V + +if config.is_fbcode(): + from torch._inductor.fb.utils import time_and_log +else: + # no-op decorator + def time_and_log(attr: str, extra_loggings: Optional[Dict[str, str]] = None): + return dynamo_utils.identity + + +log = logging.getLogger(__name__) +perf_hint_log = torch._logging.getArtifactLogger(__name__, "perf_hints") +post_grad_graphs_log = torch._logging.getArtifactLogger(__name__, "post_grad_graphs") +ALIGNMENT = 16 + + +# copy_ fails when trying to write to tensors with memory overlap, +# for expanded dimensions (a dimension which used to have size 1 -> ?) +# we can select one element from that dimension and write to it +# to achieve writing to all values of that dimension of the input tensor +def get_expanded_dims(t): + if not isinstance(t, torch.Tensor): + return None + return [i for i in range(t.ndim) if t.stride(i) == 0 and t.size(i) != 1] + + +def index_expanded_dims(t: torch.Tensor, expanded_dims: List[int]) -> torch.Tensor: + for expanded_dim in expanded_dims: + t = torch.ops.aten.slice(t, expanded_dim, 0, 1) + return t + + +def complex_memory_overlap(t: torch.Tensor) -> bool: + # if torch._debug_has_internal_overlap thinks this tensor potentially has + # memory overlap internally, let's dig deeper to find out whether it's true. + t = index_expanded_dims(t, get_expanded_dims(t)) + if torch._debug_has_internal_overlap(t) != 0: + strides = t.stride() + sizes = t.shape + indices = list(range(len(strides))) + indices = [x for _, x in sorted(zip(strides, indices))] + for i in range(len(strides)): + prev_stride = 1 if i == 0 else strides[indices[i - 1]] + prev_size = 1 if i == 0 else sizes[indices[i - 1]] + if strides[indices[i]] < prev_stride * prev_size: + return True + return False + + +@functools.lru_cache(None) +def _step_logger(): + return dynamo_logging.get_step_logger(log) + + +@functools.lru_cache(None) +def _warn_tf32_disabled(): + if ( + torch.cuda.is_available() + and not torch.backends.cuda.matmul.allow_tf32 + and torch.cuda.get_device_capability() >= (8, 0) + ): + warnings.warn( + "TensorFloat32 tensor cores for float32 matrix multiplication available but not enabled. " + "Consider setting `torch.set_float32_matmul_precision('high')` for better performance." 
+ ) + + +def _unlift_graph(mod, gm, graph_signature): + from torch.export.unflatten import _assign_attr, _AttrKind + + state_dict = {} + for name, param in mod.named_parameters(remove_duplicate=False): + state_dict[name] = param + _assign_attr( + param, + gm, + name, + attr_kind=_AttrKind.PARAMETER, + ) + for name, buffer in mod.named_buffers(remove_duplicate=False): + state_dict[name] = buffer + _assign_attr( + buffer, + gm, + name, + attr_kind=_AttrKind.BUFFER, + ) + + placeholder_nodes = [node for node in gm.graph.nodes if node.op == "placeholder"] + lifted_inputs = [] + for node in placeholder_nodes: + node_name = node.name + if node_name in graph_signature.inputs_to_parameters: + lifted_inputs.append(graph_signature.inputs_to_parameters[node_name]) + elif node_name in graph_signature.inputs_to_buffers: + lifted_inputs.append(graph_signature.inputs_to_buffers[node_name]) + else: + assert node_name in graph_signature.user_inputs + lifted_inputs.append(None) + + from torch.export._unlift import _unlift + + outputs = list(gm.graph.nodes)[-1].args[0] + mutated_outputs = [] + for out in outputs: + if out in graph_signature.buffers_to_mutate: + mutated_outputs.append(graph_signature.buffers_to_mutate[out.name]) + else: + mutated_outputs.append(None) + + unlifted_gm = _unlift( + gm, + lifted_inputs, + mutated_outputs, + pytree.LeafSpec(), + None, + state_dict, + {}, + ) + return unlifted_gm + + +def _get_subgraph_names(gm): + for node in gm.graph.nodes: + if node.target == torch.ops.higher_order.cond: + true_subgraph_name = node.args[1].name + false_subgraph_name = node.args[2].name + yield true_subgraph_name + yield false_subgraph_name + + +def _recursive_pre_grad_passes(gm, example_inputs): + for subgraph_name in _get_subgraph_names(gm): + subgraph = getattr(gm, subgraph_name) + # as we don't have recursive example inputs, passing None here + new_subgraph = _recursive_pre_grad_passes(subgraph, example_inputs=None) + setattr(gm, subgraph_name, new_subgraph) + return pre_grad_passes(gm, example_inputs) + + +def _recursive_joint_graph_passes(gm): + for subgraph_name in _get_subgraph_names(gm): + subgraph = getattr(gm, subgraph_name) + _recursive_joint_graph_passes(subgraph) + joint_graph_passes(gm) + + +def _recursive_post_grad_passes(gm, is_inference: bool = False): + for subgraph_name in _get_subgraph_names(gm): + subgraph = getattr(gm, subgraph_name) + _recursive_post_grad_passes(subgraph, is_inference) + post_grad_passes(gm, is_inference) + + +def split_const_gm( + gm: torch.fx.GraphModule, +) -> Tuple[torch.fx.GraphModule, Dict[str, int]]: + """ + This function takes an GraphModule input "gm". + The gm will be split into 2 components, + 1) const_gm, which consists the subgraph of gm that can be constant folded. + 2) gm (being inplace modified,) which returns the graph after constant folding. + + const_output_index is a mapping of corresponding node name from gm to the + output index of const_gm. 
+ Returns (const_gm, const_output_index) + """ + from torch._inductor.constant_folding import ( + CONST_MODULE_TAG, + META_TAG, + MODULE_TAG, + replace_node_with_constant, + run_and_get_constant_graph, + ) + + const_gm = run_and_get_constant_graph(gm) + const_result = const_gm() + + const_outputs = { + x.name: idx for idx, x in enumerate(tuple(const_gm.graph.nodes)[-1].args[0]) + } + + to_erase_node = [] + to_replace_node = [] + const_output_index = {} + for node in gm.graph.nodes: + if node.name in const_outputs: + to_replace_node.append(node) + elif node.meta[META_TAG] == CONST_MODULE_TAG: + to_erase_node.append(node) + + for node in to_replace_node: + new_const_name = "_FOLDED_CONST_" + node.name + replace_node_with_constant( + gm, + node, + const_result[const_outputs[node.name]], + new_const_name, + ) + const_output_index[new_const_name] = const_outputs[node.name] + for node in to_erase_node[::-1]: + if node.users: + for n in node.users: + assert n.meta[META_TAG] == MODULE_TAG, f"node: {node} user not empty." + else: + gm.graph.erase_node(node) + gm.recompile() + + return const_gm, const_output_index + + +def is_tf32_warning_applicable(gm: torch.fx.GraphModule): + aten = torch.ops.aten + tf32_ops = { + aten.mm.default, + aten.addmm.default, + aten.bmm.default, + aten.baddbmm.default, + } + for node in gm.graph.nodes: + if ( + node.op == "call_function" + and node.target in tf32_ops + and isinstance(node.meta.get("val", None), torch.Tensor) + and node.meta["val"].dtype == torch.float32 + and node.meta["val"].device.type == "cuda" + ): + return True + return False + + +@DebugContext.wrap +def count_bytes_inner( + gm: torch.fx.GraphModule, + example_inputs: List[torch.Tensor], + num_fixed: int = 0, + **kwargs, +): + shape_env = _shape_env_from_inputs(example_inputs) + fake_mode = fake_tensor_prop(gm, example_inputs) + + with V.set_fake_mode(fake_mode): + _recursive_post_grad_passes(gm, False) + + graph = GraphLowering(gm, shape_env=shape_env, num_static_inputs=num_fixed) + with V.set_graph_handler(graph), V.set_real_inputs(example_inputs): + graph.run(*example_inputs) + num_bytes, nodes_num_elem, node_runtimes = graph.count_bytes() + metrics.num_bytes_accessed += num_bytes + metrics.nodes_num_elem += nodes_num_elem + metrics.node_runtimes += node_runtimes + return make_boxed_func(gm.forward) + + +def fake_tensor_prop( + gm: torch.fx.GraphModule, + example_inputs: List[torch.Tensor], + force_allow_non_fake_inputs: bool = False, +): + """ + If we can not detect fake mode from the context of inputs, create one. + + The created fake mode will be returned. 
+ """ + fake_mode = detect_fake_mode(example_inputs) + if not fake_mode: + fake_mode = torch._subclasses.FakeTensorMode(allow_non_fake_inputs=True) + FakeTensorProp(gm, mode=fake_mode).propagate(*example_inputs) + else: + ctx = ( + contextlib.nullcontext() + if not force_allow_non_fake_inputs + else mock.patch.object(fake_mode, "allow_non_fake_inputs", True) + ) + with ctx: # type: ignore[attr-defined] + FakeTensorProp(gm, mode=fake_mode).propagate_dont_convert_inputs( + *example_inputs + ) + + return fake_mode + + +# pass config dict back to user +def get_patched_config_dict(config_patches=None) -> Dict[str, Any]: + with config.patch(config_patches): + return config.get_config_copy() + + +@DebugContext.wrap +@torch.utils._python_dispatch._disable_current_modes() +@time_and_log( + attr="compilation time (in seconds)", + extra_loggings={"config_dict": str(get_patched_config_dict())}, +) +# Need this decorator for compile_fx_inner even if we already have one for +# compile_fx. The reason is the compilation for backward graph may happen after +# compile_fx return and we may want to use the _LazyGraphModule for compiling +# the backward graph as well. +@_use_lazy_graph_module(dynamo_config.use_lazy_graph_module) +@dynamo_utils.dynamo_timed(phase_name="inductor_compile") +def compile_fx_inner( + gm: torch.fx.GraphModule, + example_inputs: List[torch.Tensor], + cudagraphs: Optional[BoxedBool] = None, + num_fixed: int = 0, + is_backward: bool = False, + graph_id: Optional[int] = None, + cpp_wrapper: bool = False, + aot_mode: bool = False, + is_inference: bool = False, + boxed_forward_device_index: Optional[BoxedDeviceIndex] = None, + user_visible_outputs: FrozenSet[str] = frozenset(), + layout_opt: Optional[bool] = None, + extern_node_serializer: Optional[Callable[[List[ExternKernelNode]], Any]] = None, +) -> Union[CompiledFxGraph, str]: + """ + Inductor API that compiles a single graph. + + If you change the argument list for this function, make sure you + also update the call to save_args_for_compile_fx_inner below accordingly. + """ + if dynamo_utils.count_calls(gm.graph) == 0 and not aot_mode: + # trigger the real recompilation for _LazyGraphModule before returning + # the forward method. 
+ from torch.fx._lazy_graph_module import _LazyGraphModule + + _LazyGraphModule.force_recompile(gm) + return make_boxed_func(gm.forward) + + assert isinstance( + next(iter(reversed(gm.graph.nodes))).args[0], (tuple, list) + ), f"inductor can only compile FX graphs which return a tuple/list, but got {gm.graph}" + + if config.save_args: + save_args_for_compile_fx_inner( + gm, + example_inputs, + cudagraphs=cudagraphs, + num_fixed=num_fixed, + is_backward=is_backward, + graph_id=graph_id, + cpp_wrapper=cpp_wrapper, + aot_mode=aot_mode, + is_inference=is_inference, + boxed_forward_device_index=boxed_forward_device_index, + user_visible_outputs=user_visible_outputs, + layout_opt=layout_opt, + ) + + if cudagraphs is None: + cudagraphs = BoxedBool(config.triton.cudagraphs) + + # Inputs to fx_codegen_and_compile + # Anything that affects codegen should go here, so if the signature + # of fx_codegen_and_compile changes, the dict should be updated accordingly + graph_kwargs = { + "cudagraphs": cudagraphs, + "num_fixed": num_fixed, + "is_backward": is_backward, + "graph_id": graph_id, + "cpp_wrapper": cpp_wrapper, + "aot_mode": aot_mode, + "is_inference": is_inference, + "user_visible_outputs": user_visible_outputs, + "layout_opt": layout_opt, + "extern_node_serializer": extern_node_serializer, + } + + start = time.time() + + if config.fx_graph_cache and not aot_mode: + compiled_graph = FxGraphCache.load( + fx_codegen_and_compile, gm, example_inputs, graph_kwargs + ) + else: + compiled_graph = fx_codegen_and_compile( + gm, example_inputs, **graph_kwargs # type: ignore[arg-type] + ) + + log.debug("FX codegen and compilation took %.3fs", time.time() - start) + + # check cudagraph disabling reasons from inductor lowering + if cudagraphs and compiled_graph.disabled_cudagraphs_reason: + perf_hint_log.warning( + "skipping cudagraphs due to %s", compiled_graph.disabled_cudagraphs_reason + ) + BoxedBool.disable(cudagraphs) + + # Return the output strides to the caller via TracingContext + context = torch._guards.TracingContext.try_get() + if context is not None and context.output_strides is not None: + assert len(context.output_strides) == 0 + context.output_strides.extend(compiled_graph.output_strides) + + if aot_mode: + return compiled_graph + + if cudagraphs: + # output args are tuple of first argument + output = output_node(gm) + assert len(output.args) == 1 + stack_traces = [ + (arg.stack_trace if isinstance(arg, torch.fx.node.Node) else None) + for arg in output.args[0] + ] + + complex_memory_overlap_inputs = any( + complex_memory_overlap(t) + for t in example_inputs + if isinstance(t, torch.Tensor) + ) + + from torch._inductor.cudagraph_utils import check_for_mutation + + has_mutation_str = check_for_mutation(gm, compiled_graph, num_fixed) + has_mutation = has_mutation_str is not None + + if has_mutation: + compiled_graph.disabled_cudagraphs_reason = has_mutation_str + + cudagraph_tests = [ + (not has_mutation, "mutated inputs"), + (not has_incompatible_cudagraph_ops(gm), "incompatible ops"), + (not complex_memory_overlap_inputs, "complex memory overlap"), + ( + all( + isinstance(t, (torch.Tensor, torch.SymInt)) for t in example_inputs + ), + "non-Tensor inputs", + ), + ] + cudagraph_fail_reasons = [s for b, s in cudagraph_tests if not b] + + if not cudagraph_fail_reasons: + if not config.triton.cudagraph_trees: + # Force specialize all inputs so that CUDA graphs will work + for t in example_inputs: + if isinstance(t, torch.SymInt): + int(t) # guard + + if ( + boxed_forward_device_index is not None + 
and not is_inference + and not is_backward + ): + boxed_forward_device_index.set(next(iter(compiled_graph.device_idxs))) + + compiled_graph.current_callable = cudagraphify( + compiled_graph.get_current_callable(), + example_inputs, + static_input_idxs=range(num_fixed), + device_index=next(iter(compiled_graph.device_idxs)), + stack_traces=stack_traces, + is_backward=is_backward, + is_inference=is_inference, + constants=tuple(compiled_graph.constants.values()), + ) + else: + BoxedBool.disable(cudagraphs) + + # See [Backward Generation Handling] + # if cudagraph'd the forward and set the device, we need to let the cudagraph manager + # know we are we running the backward even if we will not run it in cudagraphs + if is_backward and config.triton.cudagraph_trees: + assert boxed_forward_device_index is not None + assert boxed_forward_device_index.value is not None + compiled_graph_callable = compiled_graph.get_current_callable() + + manager = torch._inductor.cudagraph_trees.get_manager( + boxed_forward_device_index.value, create_if_none_exists=False + ) + # should already exist from forward + assert manager is not None + + def compiled_artifact(new_inputs): + manager.set_to_running_backward() + return compiled_graph_callable(new_inputs) + + compiled_graph.current_callable = compiled_artifact + + if "cuda" in compiled_graph.device_types: + # prefer better disable_cudagraphs_reason bc stack trace + # TODO: migrate all disable reasons to stack trace, refactor + if compiled_graph.disabled_cudagraphs_reason: + perf_hint_log.warning(compiled_graph.disabled_cudagraphs_reason) + else: + perf_hint_log.warning( + "skipping cudagraphs due to %s", cudagraph_fail_reasons + ) + + # cudagraphs does its own aligning of inputs + if not cudagraphs: + new_callable = align_inputs( + compiled_graph.get_current_callable(), example_inputs, range(num_fixed) + ) + if new_callable is not compiled_graph.get_current_callable(): + compiled_graph.current_callable = new_callable + + _step_logger()( + logging.INFO, + "torchinductor done compiling " + f"{'BACKWARDS' if is_backward else 'FORWARDS'} " + f"graph {graph_id}", + ) + + # aot autograd needs to know to pass in inputs as a list + compiled_graph._boxed_call = True + return compiled_graph + + +def fx_codegen_and_compile( + gm: torch.fx.GraphModule, + example_inputs: List[torch.Tensor], + cudagraphs: Optional[BoxedBool] = None, + num_fixed: int = 0, + is_backward: bool = False, + graph_id: Optional[int] = None, + cpp_wrapper: bool = False, + aot_mode: bool = False, + is_inference: bool = False, + user_visible_outputs: FrozenSet[str] = frozenset(), + layout_opt: Optional[bool] = None, + extern_node_serializer: Optional[Callable[[List[ExternKernelNode]], Any]] = None, +) -> Union[CompiledFxGraph, str]: + if is_tf32_warning_applicable(gm): + _warn_tf32_disabled() + + # lift the maximum depth of the Python interpreter stack + # to adapt large/deep models + sys.setrecursionlimit(max(sys.getrecursionlimit(), 2000)) + + _step_logger()( + logging.INFO, + "torchinductor compiling " + f"{'BACKWARDS' if is_backward else 'FORWARDS'} " + f"graph {graph_id}", + ) + V.debug.fx_graph(gm, example_inputs) + # TODO: Should we actually dump this? It should be redundant with the aot + # structured logs... + # trace_structured("inductor_input_graph", payload_fn=lambda: gm.print_readable(print_output=False)) + + shape_env = _shape_env_from_inputs(example_inputs) + + # Convert view to reshape in the graph. This is necessary primarily for + # layout optimization. 
Do it unconditionally for uniformity. + # + # It's needed because when we do layout optimization, an contiguous tensor + # in eager mode may becomes a channels last tensor. A view op previously + # can be applied to the contiguous tensor may not be able to be applied + # on the channels tensor any more. An error like + # RuntimeError: view size is not compatible with input tensor's size and stride + # (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead. + # will be printed. + # + # Replace view op to reshape op in this case. + # As an example, timm_resnest/botnet26t_256/convnext_base etc. will fail if we don't do this. + # + # Also this has to be done before FakeTensorProp below to avoid the failed + # .view() call. + view_to_reshape(gm) + + # It is safe to run FakeTensorProp under no_grad because by the time + # we're in inductor, we assume that AOTAutograd has already "taken care" + # of autograd, so there should be no more autograd-related API's in the + # graph. + with torch.no_grad(): + fake_mode = fake_tensor_prop(gm, example_inputs) + + # pattern matcher passes might not preserve striding information + # on node.meta["val"]. if in the future we rely on these being + # correct we will need to fix. + + with V.set_fake_mode(fake_mode): + # has some issues with memory in training + _recursive_post_grad_passes(gm, is_inference=is_inference) + V.debug.fx_graph_transformed(gm, example_inputs) + post_grad_graphs_log.debug("%s", lazy_format_graph_code("AFTER POST GRAD", gm)) + trace_structured( + "inductor_post_grad_graph", + payload_fn=lambda: gm.print_readable(print_output=False), + ) + optimus_scuba_log["inductor_post_grad"] = counters["inductor"] + signpost_event( + "optimus", + "compile_fx.post_grad_passes", + optimus_scuba_log, + ) + + with V.set_fake_mode(fake_mode): + const_output_index = None + const_graph = None + const_code = None + + if aot_mode and config.aot_inductor.use_runtime_constant_folding: + const_gm, const_output_index = split_const_gm(gm) + + const_graph = GraphLowering( + const_gm, + example_inputs=[], + shape_env=shape_env, + num_static_inputs=num_fixed, + graph_id=graph_id, + cpp_wrapper=cpp_wrapper, + aot_mode=aot_mode, + user_visible_outputs=user_visible_outputs, + extern_node_serializer=extern_node_serializer, + is_inference=is_inference, + is_const_graph=True, + ) + with V.set_graph_handler(const_graph): + assert cpp_wrapper, "AOT mode only supports C++ wrapper" + const_graph.run() + + const_code, _ = const_graph.codegen_with_cpp_wrapper() + + graph = GraphLowering( + gm, + # example_inputs will be used by AOTInductor to dry-run the generated code for Triton kernel tuning. + # For the forward pass, we have the real inputs to be used as example_inputs. For the backward pass, + # we currently use fake tensors and defake them later. 
+ example_inputs=example_inputs, + shape_env=shape_env, + num_static_inputs=num_fixed, + graph_id=graph_id, + cpp_wrapper=cpp_wrapper, + aot_mode=aot_mode, + user_visible_outputs=user_visible_outputs, + extern_node_serializer=extern_node_serializer, + is_inference=is_inference, + const_output_index=const_output_index, + const_code=const_code, + const_module=const_graph, + ) + with V.set_graph_handler(graph): + graph.run(*example_inputs) + output_strides: List[Optional[Tuple[int, ...]]] = [] + if graph.graph_outputs is not None: + # We'll put the output strides in the compiled graph so we + # can later return them to the caller via TracingContext + for out in graph.graph_outputs: + if hasattr(out, "layout"): + output_strides.append( + tuple( + V.graph.sizevars.size_hint(s) for s in out.layout.stride + ) + ) + else: + output_strides.append(None) + + metrics_helper = metrics.CachedMetricsHelper() + compiled_fn = graph.compile_to_fn() + + if V.aot_compilation is True: + return compiled_fn + + if cudagraphs and not V.graph.disable_cudagraphs_reason: + from torch._inductor.cudagraph_utils import ( + check_lowering_disable_cudagraph, + ) + + V.graph.disable_cudagraphs_reason = check_lowering_disable_cudagraph( + V.graph.device_node_mapping + ) + + compiled_graph = CompiledFxGraph( + compiled_fn, + graph, + output_strides, + V.graph.disable_cudagraphs_reason, + metrics_helper.get_deltas(), + ) + + return compiled_graph + + +def clone_preserve_strides(x: torch.Tensor): + needed_size = ( + sum((shape - 1) * stride for shape, stride in zip(x.size(), x.stride())) + 1 + ) + buffer = torch.as_strided(x, (needed_size,), (1,)).clone() + return torch.as_strided(buffer, x.size(), x.stride()) + + +def copy_misaligned_inputs( + new_inputs: List[torch.Tensor], check_inputs_idxs: Sequence[int] +) -> None: + for i in check_inputs_idxs: + if new_inputs[i].data_ptr() % ALIGNMENT: + new_inputs[i] = clone_preserve_strides(new_inputs[i]) + + +def get_input_idxs_to_check( + inputs: Union[List[torch.Tensor], Sequence[int]], + static_input_idxs: Sequence[int], +) -> Sequence[int]: + def is_aligned(storage_offset, dtype): + return (storage_offset * get_dtype_size(dtype)) % ALIGNMENT == 0 + + ids_to_check = [] + for i, input in enumerate(inputs): + if ( + isinstance(input, torch.Tensor) + and ( + i not in static_input_idxs + or not is_aligned(input.storage_offset(), input.dtype) + ) + and input.device.type == "cuda" + ): + ids_to_check.append(i) + return ids_to_check + + +def align_inputs_from_check_idxs( + model: Callable[[List[torch.Tensor]], Any], inputs_to_check: Sequence[int] +): + if len(inputs_to_check) == 0: + return model + + def run(new_inputs): + copy_misaligned_inputs(new_inputs, inputs_to_check) + return model(new_inputs) + + return run + + +def align_inputs( + model: Callable[[List[torch.Tensor]], Any], + inputs: List[torch.Tensor], + static_input_idxs: Sequence[int] = (), +): + inputs_to_check = get_input_idxs_to_check(inputs, static_input_idxs) + return align_inputs_from_check_idxs(model, inputs_to_check) + + +@dynamo_utils.dynamo_timed +def cudagraphify( + model: torch.fx.GraphModule, + inputs: List[torch.Tensor], + static_input_idxs: Sequence[int] = (), + *, + device_index: int, + stack_traces: List[Optional[str]], + is_backward: bool, + is_inference: bool, + constants: Tuple[torch.Tensor, ...] 
= (), +): + from torch._inductor.cudagraph_trees import ( + cudagraphify_impl as new_cudagraphify_impl, + ) + + cudagraphify_fn: Callable[..., Any] + if config.triton.cudagraph_trees: + cudagraphify_fn = functools.partial( + new_cudagraphify_impl, + device_index=device_index, + stack_traces=stack_traces, + is_backward=is_backward, + is_inference=is_inference, + constants=constants, + ) + else: + cudagraphify_fn = cudagraphify_impl + + # if using fake tensors, defer cudagraphs until we get real inputs at runtime + if not any(isinstance(inp, FakeTensor) for inp in inputs): + return cudagraphify_fn(model, inputs, static_input_idxs) + + compiled_fn = None + + def run(new_inputs): + nonlocal compiled_fn + if compiled_fn is None: + with dynamo_utils.preserve_rng_state(): + compiled_fn = cudagraphify_fn(model, new_inputs, static_input_idxs) + return compiled_fn(new_inputs) + + return run + + +def remove_unaligned_input_idxs( + inputs: Union[List[torch.Tensor], Sequence[int]], + static_input_idxs: Sequence[int], +): + """ + We require all inputs to be aligned, so introduce a copy for any + that aren't. + """ + aligned_static_input_idxs = [] + for idx, input in zip(static_input_idxs, inputs): + if isinstance(input, torch.Tensor) and (input.data_ptr() % ALIGNMENT) == 0: + aligned_static_input_idxs.append(idx) + if len(aligned_static_input_idxs) != len(static_input_idxs): + return aligned_static_input_idxs + return static_input_idxs + + +def static_input(x: torch.Tensor): + """ + Copy and input while preserving strides + """ + # TODO(jansel): figure out why this version doesn't work: + # return torch.empty_strided(x.size(), x.stride(), dtype=x.dtype, device=x.device) + needed_size = ( + sum((shape - 1) * stride for shape, stride in zip(x.size(), x.stride())) + 1 + ) + buffer = torch.empty(needed_size, dtype=x.dtype, device=x.device) + return torch.as_strided(buffer, x.size(), x.stride()) + + +def index_expanded_dims_and_copy_( + dst: torch.Tensor, + src: torch.Tensor, + expanded_dims: List[int], +): + "Index into expanded dimensions of both dst and src then copy_" + dst = index_expanded_dims(dst, expanded_dims) + src = index_expanded_dims(src, expanded_dims) + dst.copy_(src) + + +def cudagraphify_impl( + model: torch.fx.GraphModule, + inputs: List[torch.Tensor], + static_input_idxs: Sequence[int] = (), +): + """ + Assumes inputs[static_input_idxs[i]] are always the same memory address + """ + check_input_idxs = get_input_idxs_to_check(inputs, static_input_idxs) + static_input_idxs = remove_unaligned_input_idxs(inputs, static_input_idxs) + copy_misaligned_inputs(inputs, check_input_idxs) + + assert isinstance(inputs, list) + + inps_expanded_dims = [ + get_expanded_dims(x) if idx not in static_input_idxs else [] + for idx, x in enumerate(inputs) + ] + + # allocate static tensor inputs + static_inputs = [ + x + if not isinstance(x, torch.Tensor) + else static_input(x) + if idx not in static_input_idxs + else x.detach() + for idx, x in enumerate(inputs) + ] + + # copy over input values for fresh allocations + for idx, (x, expanded_dims) in enumerate(zip(inputs, inps_expanded_dims)): + if isinstance(x, torch.Tensor) and idx not in static_input_idxs: + index_expanded_dims_and_copy_(static_inputs[idx], x, expanded_dims) + + # warmup + torch.cuda.synchronize() + stream = torch.cuda.Stream() + stream.wait_stream(torch.cuda.current_stream()) + # copy static_inputs because it will be cleared in model + with torch.cuda.stream(stream): + model(list(static_inputs)) + stream.synchronize() + 
torch.cuda.current_stream().wait_stream(stream) + torch.cuda.synchronize() + + # record + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph, stream=stream, capture_error_mode="thread_local"): + static_outputs = model(list(static_inputs)) + if not isinstance(static_outputs, (list, tuple)): + static_outputs = (static_outputs,) + + if config.size_asserts: + + def run(new_inputs): + assert len(static_inputs) == len(new_inputs) + for idx, (dst, src, expanded_dims) in enumerate( + zip(static_inputs, new_inputs, inps_expanded_dims) + ): + if not isinstance(dst, torch.Tensor): + pass + elif idx in static_input_idxs: + assert dst.data_ptr() == src.data_ptr() + else: + # TODO - could make one single op of multiple slices + # and avoid dispatch. + # Could also pre-index the `dst` tensors + index_expanded_dims_and_copy_(dst, src, expanded_dims) + new_inputs.clear() + graph.replay() + return static_outputs + + else: + copy_indices = [ + idx for idx in range(len(static_inputs)) if idx not in static_input_idxs + ] + + def run(new_inputs): + for idx in copy_indices: + expanded_dims = inps_expanded_dims[idx] + index_expanded_dims_and_copy_( + static_inputs[idx], new_inputs[idx], expanded_dims + ) + new_inputs.clear() + graph.replay() + return static_outputs + + return align_inputs_from_check_idxs(run, check_input_idxs) + + +def compile_fx_aot( + model_: torch.fx.GraphModule, + example_inputs_: List[torch.Tensor], + inner_compile: Callable[..., Any] = compile_fx_inner, + config_patches: Optional[Dict[str, Any]] = None, +): + config_patches: Dict[str, Any] = ( + {"cpp_wrapper": True} + if config_patches is None + else {**config_patches, "cpp_wrapper": True} + ) + if ( + "aot_inductor.output_path" not in config_patches + and not config.aot_inductor.output_path + ): + config_patches = { + **config_patches, + "aot_inductor.output_path": code_hash(model_.code), + } + + extern_node_serializer = config_patches.pop("extern_node_serializer", None) + with V.set_aot_compilation(True): + compiled_lib_path = compile_fx( + model_, + example_inputs_, + inner_compile=functools.partial( + inner_compile, + aot_mode=True, + extern_node_serializer=extern_node_serializer, + ), + config_patches=config_patches, + ) + assert os.path.exists( + compiled_lib_path + ), f"AOTInductor compiled library does not exist at {compiled_lib_path}" + return compiled_lib_path + + +_graph_counter = count(0) + + +def fw_compiler_freezing( + aot_autograd_model: torch.fx.GraphModule, + aot_example_inputs: List[torch.Tensor], + dynamo_model: torch.fx.GraphModule, + num_example_inputs: int, + inner_compile: Callable[..., Any], + cudagraphs: BoxedBool, + graph_id: int, + forward_device: BoxedDeviceIndex, +): + from torch._inductor.freezing import convert_conv_weights_to_channels_last, freeze + + # partition_fn won't be called + _recursive_joint_graph_passes(aot_autograd_model) + + layout_opt = GraphLowering.decide_layout_opt(aot_autograd_model, is_inference=True) + if layout_opt: + # make sure meta['val'] is properly setup + fake_tensor_prop(aot_autograd_model, aot_example_inputs, True) + convert_conv_weights_to_channels_last(aot_autograd_model) + + opt_model, preserved_arg_indices = freeze( + dynamo_model, + aot_autograd_model, + aot_example_inputs, # type: ignore[arg-type] + ) + + aot_example_inputs = [aot_example_inputs[ind] for ind in preserved_arg_indices] + num_fixed = len(preserved_arg_indices) - num_example_inputs + + fake_mode = detect_fake_mode(aot_example_inputs) + + # for freezing, all graph outputs should be user visible + *_, 
model_outputs_node = opt_model.graph.nodes + model_outputs = model_outputs_node.args[0] + user_visible_outputs = [ + n.name for n in model_outputs if isinstance(n, torch.fx.Node) + ] + + # constant params will be real tensors, not fake + tracing_context = torch._guards.TracingContext.try_get() + if tracing_context is not None: + params_flat = tracing_context.params_flat + assert params_flat is not None + for i in range(len(params_flat)): + if i not in preserved_arg_indices: + params_flat[i] = None + + with mock.patch.object(fake_mode, "allow_non_fake_inputs", True): + optimized_function = inner_compile( + opt_model, + aot_example_inputs, + num_fixed=num_fixed, + cudagraphs=cudagraphs, + graph_id=graph_id, + is_inference=True, + boxed_forward_device_index=forward_device, + layout_opt=layout_opt, + user_visible_outputs=user_visible_outputs, + ) + + # aot_inductor codegens a call that takes in just the inputs, so we don't return a wrapper + # that drops constant-ified params + if V.aot_compilation is True: + return optimized_function + + def wrapper(args): + args_new = [args[i] for i in preserved_arg_indices] + args.clear() + return optimized_function(args_new) + + wrapper._boxed_call = True # type: ignore[attr-defined] + + return wrapper + + +@_use_lazy_graph_module(dynamo_config.use_lazy_graph_module) +def compile_fx( + model_: torch.fx.GraphModule, + example_inputs_: List[torch.Tensor], + inner_compile: Callable[..., Any] = compile_fx_inner, + config_patches: Optional[Dict[str, Any]] = None, + decompositions: Optional[Dict[OpOverload, Callable[..., Any]]] = None, +): + """Main entrypoint to a compile given FX graph""" + if config_patches: + with config.patch(config_patches): + return compile_fx( + model_, + example_inputs_, + # need extra layer of patching as backwards is compiled out of scope + inner_compile=config.patch(config_patches)(inner_compile), + decompositions=decompositions, + ) + + if config.cpp_wrapper: + with config.patch( + { + "cpp_wrapper": False, + "triton.autotune_cublasLt": False, + "triton.cudagraphs": False, + "triton.store_cubin": True, + } + ), V.set_real_inputs(example_inputs_): + inputs_ = example_inputs_ + if isinstance(model_, torch.fx.GraphModule): + fake_inputs = [ + node.meta.get("val") + for node in model_.graph.nodes + if node.op == "placeholder" + ] + if all(v is not None for v in fake_inputs): + # Validate devices before switching to fake tensors. + for idx, fi, i in zip(count(), fake_inputs, inputs_): + if fi.device != i.device: + raise ValueError( + f"Device mismatch between fake input and example input at position #{idx}: " + f"{fi.device} vs {i.device}. If the model was exported via torch.export(), " + "make sure torch.export() and torch.aot_compile() run on the same device." 
+ ) + inputs_ = fake_inputs + return compile_fx( + model_, + inputs_, + inner_compile=functools.partial(inner_compile, cpp_wrapper=True), + decompositions=decompositions, + ) + + recursive_compile_fx = functools.partial( + compile_fx, + inner_compile=inner_compile, + decompositions=decompositions, + ) + + if not graph_returns_tuple(model_): + return make_graph_return_tuple( + model_, + example_inputs_, + recursive_compile_fx, + ) + + if isinstance(model_, torch.fx.GraphModule): + if isinstance(model_.graph._codegen, _PyTreeCodeGen): + # this graph is the result of dynamo.export() + return handle_dynamo_export_graph( + model_, + example_inputs_, + recursive_compile_fx, + ) + + model_ = _recursive_pre_grad_passes(model_, example_inputs_) + optimus_scuba_log["inductor_pre_grad"] = counters["inductor"] + signpost_event( + "optimus", + "compile_fx.pre_grad_passes", + optimus_scuba_log, + ) + + if any(isinstance(x, (list, tuple, dict)) for x in example_inputs_): + return flatten_graph_inputs( + model_, + example_inputs_, + recursive_compile_fx, + ) + + assert not config._raise_error_for_testing + num_example_inputs = len(example_inputs_) + cudagraphs = BoxedBool(config.triton.cudagraphs) + forward_device = BoxedDeviceIndex(None) + + graph_id = next(_graph_counter) + + decompositions = ( + decompositions if decompositions is not None else select_decomp_table() + ) + + @dynamo_utils.dynamo_timed + def fw_compiler_base( + model: torch.fx.GraphModule, + example_inputs: List[torch.Tensor], + is_inference: bool, + ): + if is_inference: + # partition_fn won't be called + _recursive_joint_graph_passes(model) + + fixed = torch._inductor.utils.num_fw_fixed_arguments( + num_example_inputs, len(example_inputs) + ) + user_visible_outputs = set() + + if config.keep_output_stride: + *_, model_outputs_node = model.graph.nodes + assert model_outputs_node.op == "output" + model_outputs = pytree.arg_tree_leaves(*model_outputs_node.args) + num_model_outputs = len(model_outputs) + + context = torch._guards.TracingContext.try_get() + # See Note [User Outputs in the inductor graph] + if context is not None and context.fw_metadata and not is_inference: + original_output_start_index = ( + context.fw_metadata.num_mutated_inp_runtime_indices + ) + else: + original_output_start_index = 0 + + if isinstance(model_, torch.fx.GraphModule): + *_, orig_model_outputs_node = model_.graph.nodes + assert orig_model_outputs_node.op == "output" + orig_model_outputs, _ = pytree.tree_flatten( + orig_model_outputs_node.args + ) + num_orig_model_outputs = len(orig_model_outputs) + else: + num_orig_model_outputs = num_model_outputs + + assert num_orig_model_outputs <= num_model_outputs + + # Note [User Outputs in the inductor graph] + # We makes the following assumption + # For inference + # len(orig_model_outputs) == len(model_outputs) + # For training + # len(orig_model_outputs) <= len(model_outputs) + # During training, most of the time the model_outputs starts with + # original module's outputs followed by saved activations. + # But this can be not true if the model have inplace updated tensors. + # AOTAutograd will make those tensors being returned before the original + # module's output. + # To make things safe, we'll use original_output_start_index field + # set by AOTAutograd to decide where the original module outputs start. + orig_output_end_idx = original_output_start_index + num_orig_model_outputs + # Sanity chec: we are about to splice out the "user" outputs from the full set + # of "graph" outputs. 
Make sure we're within bounds. + assert orig_output_end_idx <= num_model_outputs + + user_visible_outputs = { + n.name + for n in model_outputs[original_output_start_index:orig_output_end_idx] + if isinstance(n, torch.fx.Node) + } + + return inner_compile( + model, + example_inputs, + num_fixed=fixed, + cudagraphs=cudagraphs, + graph_id=graph_id, + is_inference=is_inference, + boxed_forward_device_index=forward_device, + user_visible_outputs=user_visible_outputs, + ) + + fw_compiler = functools.partial(fw_compiler_base, is_inference=False) + + if config.freezing and not torch.is_grad_enabled(): + inference_compiler = functools.partial( + fw_compiler_freezing, + dynamo_model=model_, + num_example_inputs=num_example_inputs, + inner_compile=inner_compile, + cudagraphs=cudagraphs, + graph_id=graph_id, + forward_device=forward_device, + ) + else: + inference_compiler = functools.partial(fw_compiler_base, is_inference=True) + + def partition_fn(graph, joint_inputs, **kwargs): + _recursive_joint_graph_passes(graph) + return min_cut_rematerialization_partition( + graph, joint_inputs, **kwargs, compiler="inductor" + ) + + @dynamo_utils.dynamo_timed + @dynamo_utils.maybe_cprofile + def bw_compiler(model: torch.fx.GraphModule, example_inputs: List[torch.Tensor]): + fixed = count_tangents(model) + return inner_compile( + model, + example_inputs, + num_fixed=fixed, + cudagraphs=cudagraphs, + is_backward=True, + graph_id=graph_id, + boxed_forward_device_index=forward_device, + ) + + # TODO: can add logging before/after the call to create_aot_dispatcher_function + # in torch._functorch/aot_autograd.py::aot_module_simplified::aot_function_simplified::new_func + # once torchdynamo is merged into pytorch + + fake_mode = detect_fake_mode(example_inputs_) or torch._subclasses.FakeTensorMode( + allow_non_fake_inputs=True + ) + tracing_context = ( + torch._guards.TracingContext.try_get() + or torch._guards.TracingContext(fake_mode) + ) + + if V.aot_compilation is True: + gm, graph_signature = aot_export_module( + model_, example_inputs_, trace_joint=False, decompositions=decompositions + ) + unlifted_gm = _unlift_graph(model_, gm, graph_signature) + if "dynamo_flat_name_to_original_fqn" in model_.meta: + unlifted_gm.meta["dynamo_flat_name_to_original_fqn"] = model_.meta[ + "dynamo_flat_name_to_original_fqn" + ] + with V.set_fake_mode(fake_mode), compiled_autograd.disable(): + return inference_compiler(unlifted_gm, example_inputs_) + + with V.set_fake_mode(fake_mode), torch._guards.tracing( + tracing_context + ), compiled_autograd.disable(): + return aot_autograd( + fw_compiler=fw_compiler, + bw_compiler=bw_compiler, + inference_compiler=inference_compiler, + decompositions=decompositions, + partition_fn=partition_fn, + keep_inference_input_mutations=True, + )(model_, example_inputs_) + + +def _shape_env_from_inputs(inputs: List[torch.Tensor]): + shape_env = None + fake_mode = detect_fake_mode(inputs) + + # TODO(voz): It would be nice to enable this assert, but there are lots of tests that + # pass in real inputs for now. + # if len(inputs) > 0: + # assert fake_mode is not None, breakpoint() + + if fake_mode is not None: + return fake_mode.shape_env + + # When there are no tensor inputs, get shape_env from the first SymInt. + for input in inputs: + if isinstance(input, torch.SymInt): + return input.node.shape_env + + # TODO(voz): Should we always have one anyway? 
+ return None + + +def graph_returns_tuple(gm: torch.fx.GraphModule): + """True if a FX graph returns a tuple""" + if not isinstance(gm, torch.fx.GraphModule): + return True # can't check this, assume true + (rv,) = output_node(gm).args + if isinstance(rv, (list, tuple)): + return True + if ( + isinstance(rv, torch.fx.node.Node) + and hasattr(rv.target, "_schema") + and len(rv.target._schema.returns) > 1 + and all(str(ret.type) == "Tensor" for ret in rv.target._schema.returns) + ): + # for graphs whose result is one node with multiple outputs + return True + return False + + +def make_graph_return_tuple( + gm: torch.fx.GraphModule, + inputs: List[torch.Tensor], + compile_gm: Callable[..., Any], +): + """ + Mutate gm so it returns a tuple. This is only needed for graphs + not created by torchdynamo that return non-tuples. + """ + node = output_node(gm) + (rv,) = node.args + rv, spec = pytree.tree_flatten(rv) + with gm.graph.inserting_before(node): + gm.graph.output(rv) + gm.graph.erase_node(node) + assert graph_returns_tuple(gm) + + compiled_fn = compile_gm(gm, inputs) + + @functools.wraps(compiled_fn) + def wrapper(*args, **kwargs): + return pytree.tree_unflatten(compiled_fn(*args, **kwargs), spec) + + return wrapper + + +def flatten_graph_inputs(gm: torch.fx.GraphModule, inputs, compile_gm): + """ + Mutate inputs so that they are flat and wrap gm such that it + accepts those inputs. This is only needed for graphs not created + by torchdynamo that take bumpy inputs. + """ + inputs, spec = pytree.tree_flatten(inputs) + + class GmWrapper(torch.nn.Module): + def __init__(self): + super().__init__() + self.gm = gm + + def forward(self, *args): + args: List[Any] = list(args) + return self.gm(*pytree.tree_unflatten(args, spec)) + + compiled_fn = compile_gm(GmWrapper(), inputs) + + @functools.wraps(compiled_fn) + def wrapper(*args): + # note this doesn't check the spec, assuming it is the same + return compiled_fn(*pytree.arg_tree_leaves(*args)) + + return wrapper + + +def handle_dynamo_export_graph( + gm: torch.fx.GraphModule, + inputs: List[torch.Tensor], + compile_gm: Callable[..., Any], +): + """ + `torch._dynamo.export` embeds pytrees in the FX graph codegen object, + convert that to a normal FX graph so inductor can compile it. 
+ """ + codegen = gm.graph._codegen + gm.graph._codegen = torch.fx.graph.CodeGen() + gm.recompile() + + compiled_fn = compile_gm(gm, codegen.process_inputs(*inputs)) + + @functools.wraps(compiled_fn) + def wrapper(*args): + return codegen.process_outputs(compiled_fn(*codegen.process_inputs(*args))) + + return wrapper diff --git a/MLPY/Lib/site-packages/torch/_inductor/config.py b/MLPY/Lib/site-packages/torch/_inductor/config.py new file mode 100644 index 0000000000000000000000000000000000000000..845c9ad6bae9e55dc247e857d57672327bc5ff0e --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/config.py @@ -0,0 +1,752 @@ +import os # noqa: C101 +import sys +from typing import Any, Callable, Dict, Optional, TYPE_CHECKING + +import torch + + +def is_fbcode(): + return not hasattr(torch.version, "git_version") + + +# add some debug printouts +debug = False + +# add inf and NaN checkers +debug_check_inf_and_nan = False + +# Whether to disable a progress bar for autotuning +disable_progress = True + +# Whether to enable printing the source code for each future +verbose_progress = False + +# use fx aot graph codegen cache +fx_graph_cache = os.environ.get("TORCHINDUCTOR_FX_GRAPH_CACHE") == "1" + +# use cpp wrapper instead of python wrapper +cpp_wrapper = os.environ.get("TORCHINDUCTOR_CPP_WRAPPER", "0") == "1" + +# codegen cpp wrapper code in an ABI compatible mode +abi_compatible = ( + os.environ.get("TORCHINDUCTOR_ABI_COMPATIBLE", "1" if is_fbcode() else "0") == "1" +) + +c_shim_version = os.environ.get( + "TORCHINDUCTOR_C_SHIM_VERSION", "1" if is_fbcode() else "2" +) + +# dead code elimination +dce = False + +# assume weight tensors are fixed size +static_weight_shapes = True + +# put correctness assertions in generated code +size_asserts = os.environ.get("TORCHINDUCTOR_SIZE_ASSERTS", "1") == "1" +nan_asserts = os.environ.get("TORCHINDUCTOR_NAN_ASSERTS") == "1" + +# enable loop reordering based on input orders +pick_loop_orders = True + +# reuse a kernel input as the output +inplace_buffers = True + +# reuse a buffer for an unrelated purpose +allow_buffer_reuse = True + +# Enable pooled allocations for non-output tensors +memory_planning = os.environ.get("TORCHINDUCTOR_MEMORY_PLANNING", "0") == "1" + +# How to organize memory under memory_planning=True: +# - "none": do not try to pool storage, just reuse +# - "intermediates": all non-outputs share storage, outputs each get unique storage +# - "outputs": two pools, one for intermediates (freed on return) and one for outputs +# - "combined": a single pool for both intermediates and outputs +memory_pool = os.environ.get("TORCHINDUCTOR_MEMORY_POOL", "intermediates") + +# codegen benchmark harness +benchmark_harness = True + +# fuse pointwise into templates +epilogue_fusion = True + +# do epilogue fusions before other fusions +epilogue_fusion_first = False + +# enable pattern match+replace optimizations +pattern_matcher = True + +# register custom graph optimization pass hook. so far, pre/post passes are +# only applied before/after pattern_matcher in post_grad_passes. +# +# def my_custom_pre_pass(graph: torch.fx.graph.Graph): +# # my custom graph optimization pass +# ... +# +# def my_custom_post_pass(graph: torch.fx.graph.Graph): +# # my custom graph optimization pass +# ... 
+# +# torch._inductor.config.post_grad_custom_pre_pass = my_custom_pre_pass +# torch._inductor.config.post_grad_custom_post_pass = my_custom_post_pass +post_grad_custom_pre_pass: Optional[Callable[[torch.fx.graph.Graph], None]] = None +post_grad_custom_post_pass: Optional[Callable[[torch.fx.graph.Graph], None]] = None + +# Registers a custom pregrad pass. Note that the pre-grad IR is 1. +# non-functional, 2. non-normalized, and 3. prone to change. Ideally we should +# use post-grad passes. +pre_grad_custom_pass: Optional[Callable[[torch.fx.graph.Graph], None]] = None + +# Optimize away split cat patterns (Experimental) +split_cat_fx_passes = True + +# Optimize conv-batchnorm if batchnorm is in eval mode. Slightly reduces numerical stability. +efficient_conv_bn_eval_fx_passes = False + +# Enable predispatch aten IR for export +is_predispatch = False + +# Deprecated +group_fusion = False + +# Deprecated +batch_fusion = True + +# Pre grad group/batch fusion and options in order, set to empty dict to disable fusion. +# Call `torch._inductor.fx_passes.group_batch_fusion.list_group_batch_fusions()` to see available fusions. +pre_grad_fusion_options: Dict[str, Dict[str, Any]] = { + "batch_linear": {}, + "batch_linear_lhs": {}, + "batch_layernorm": {}, + "batch_tanh": {}, + "batch_relu": {}, + "batch_sigmoid": {}, +} + +# Post grad group/batch fusion and options, set to empty dict to disable fusion. +# Call `torch._inductor.fx_passes.group_batch_fusion.list_group_batch_fusions(False)` to see available fusions. +post_grad_fusion_options: Dict[str, Dict[str, Any]] = {} + +# enable reordering pass for improving memory locality +reorder_for_locality = True + +# Scale down RBLOCK for better occupancy +dynamic_scale_rblock = os.environ.get("TORCHINDUCTOR_DYNAMIC_SCALE_RBLOCK", "1") == "1" + +# this forces fusion for int_mm with mul. Needed when you want to avoid realizing the int32 +# but the mul gets fused with other pointwise ops instead. +force_fuse_int_mm_with_mul = False + +# for pattern torch.mm(a, b.to(dtype)) with cuda tensors, +# enable torch._inductor.kernel.mm.tuned_mixed_mm fused kernel. +# Autotune will compare perf with normal cast->then->mm option +use_mixed_mm = False + +# enable runtime numeric check for pre/post grad fx passes +# floating point provides limited accuracy (about 7 decimal digits for single precision +# floating point numbers,about 16 decimal digits for double precision floating point numbers) +# according to PyTorch documentation. +# https://pytorch.org/docs/stable/notes/numerical_accuracy.html#batched-computations-or-slice-computations +fx_passes_numeric_check: Dict[str, Any] = { + "pre_grad": False, + "precision": 1e-4, + "num_iterations": 1, + "requires_optimizer": True, +} + +# for pattern torch.mm(a, b.to(dtype)) with cuda tensors, always use +# torch._inductor.kernel.mm.tuned_mixed_mm's fused kernel. +# Autotune will not compare with normal cast->then->mm option. 
+# (if force_mixed_mm is true, the use_mixed_mm flag will be ignored) +force_mixed_mm = False + +# enable reordering pass for increasing overlap between compute and communication +reorder_for_compute_comm_overlap = False + +# passes (in execution order) for increasing overlap between compute and communication +# for built-in passes, use string name; for user-defined passes, pass in the function handle +reorder_for_compute_comm_overlap_passes = [ + "reorder_compute_for_overlap", + "sink_waits", + "raise_comms", +] + +# runtime estimation function for ops +# for built-in estimation function, pass in "default"; for user-defined estimation function, pass in the function handle +estimate_op_runtime = "default" + +# unit: GB/s, uni-directional P2P bandwidth per card +# default value is NVLink +intra_node_bw = 300 + +# unit: GB/s, uni-directional P2P bandwidth per node +# default value is InfiniBand +inter_node_bw = 25 + +# enable slow autotuning passes to select algorithms +max_autotune = os.environ.get("TORCHINDUCTOR_MAX_AUTOTUNE") == "1" + +# enable slow autotuning passes to select pointwise/reductions algorithms +max_autotune_pointwise = os.environ.get("TORCHINDUCTOR_MAX_AUTOTUNE_POINTWISE") == "1" + +# enable slow autotuning passes to select gemm algorithms +max_autotune_gemm = os.environ.get("TORCHINDUCTOR_MAX_AUTOTUNE_GEMM") == "1" + +# enable autotune local cache +use_autotune_local_cache = True + +# enable autotune remote cache +use_autotune_remote_cache = ( + os.environ.get("TORCH_INDUCTOR_AUTOTUNE_REMOTE_CACHE") == "1" +) + +# force cublas and triton to use the same precision; cublas supports TF32 for matmul operations +# when m, n, k are multiples of 16, 16, 8, whereas triton supports TF32 for matmul operations +# for any combinations of m, n, k, regardless of their alignment. setting this flag will ensure +# that triton does not use TF32 wherever cublas would not use TF32 +force_same_precision = ( + True if is_fbcode() else os.environ.get("TORCHINDUCTOR_FORCE_SAME_PRECISION") == "1" +) +# Specify candidate backends for gemm autotune. +# Possible choices are combinations of: ATen, Triton, CUTLASS. +# ATen: default Pytorch ATen kernels. +# Triton: Triton templates defined in torch inductor. +# CUTLASS: Cutlass templates and kernels. 
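+# Illustrative sketch (not executed here): the backend list can be narrowed or
+# widened either through the environment variable read just below, e.g.
+#   TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_BACKENDS=ATEN,TRITON,CUTLASS
+# or by assigning to this module before compiling, mirroring the
+# post_grad_custom_pre_pass example earlier in this file:
+#   torch._inductor.config.max_autotune_gemm_backends = "ATEN,TRITON"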
+max_autotune_gemm_backends = os.environ.get( + "TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_BACKENDS", "ATEN,TRITON" +).upper() + +# the value used as a fallback for the unbacked SymInts +# that can appear in the input shapes (e.g., in autotuning) +unbacked_symint_fallback = 8192 + +# enable searching global and local cache regardless of `max_autotune` +search_autotune_cache = os.environ.get("TORCHINDUCTOR_SEARCH_AUTOTUNE_CACHE") == "1" + +save_args = os.environ.get("TORCHINDUCTOR_SAVE_ARGS") == "1" + +# We will disable creating subprocess for autotuning if this is False +autotune_in_subproc = os.environ.get("TORCHINDUCTOR_AUTOTUNE_IN_SUBPROC") == "1" + +# If autotuning in subprocess, whether to use multiple devices +autotune_multi_device = os.environ.get("TORCHINDUCTOR_AUTOTUNE_MULTI_DEVICE") == "1" + +coordinate_descent_tuning = ( + os.environ.get("TORCHINDUCTOR_COORDINATE_DESCENT_TUNING") == "1" +) +coordinate_descent_check_all_directions = ( + os.environ.get("TORCHINDUCTOR_COORDINATE_DESCENT_CHECK_ALL_DIRECTIONS") == "1" +) +coordinate_descent_search_radius = int( + os.environ.get("TORCHINDUCTOR_COORDINATE_DESCENT_RADIUS", "1") +) + +# Disabled by default on ROCm, opt-in if model utilises NHWC convolutions +layout_opt_default = "1" if not torch.version.hip else "0" +layout_optimization = ( + os.environ.get("TORCHINDUCTOR_LAYOUT_OPTIMIZATION", layout_opt_default) == "1" +) + +force_layout_optimization = os.environ.get("TORCHINDUCTOR_FORCE_LAYOUT_OPT", "0") == "1" + + +# Whether to keep the output strides the same as eager after layout optimization. +keep_output_stride = os.environ.get("TORCHINDUCTOR_KEEP_OUTPUT_STRIDE", "1") == "1" + +# Enabling this will let compiler print warning messages if a generated triton +# kernel has inputs with mixed layouts. This is helpful for perf debugging +# since kernel with mixed layout inputs may run much slower then one whose inputs +# have uniform layouts. +warn_mix_layout = os.environ.get("TORCHINDUCTOR_WARN_MIX_LAYOUT") == "1" + +# control store vs recompute heuristic +# For fanouts, rematerialization can lead to exponential blowup. So, have +# smaller threshold +realize_reads_threshold = 4 +realize_opcount_threshold = 30 + +# Threshold to prevent excessive accumulation of ops in one buffer during lowering +realize_acc_reads_threshold = 8 + +# fallback to eager for random/dropout, this is slow but useful for debugging +fallback_random = False + +# automatically create fallbacks when encountering an unhandled op +implicit_fallbacks = True + +# fuse even in cases without common reads +aggressive_fusion = False + +# For each fused kernel in the wrapper, comment with the nodes that get fused. +# Useful for debugging fusion. 
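+# A minimal usage sketch (the environment variables are the ones read immediately
+# below; "train.py" is a placeholder for any torch.compile workload):
+#   TORCHINDUCTOR_DEBUG_FUSION=1 TORCHINDUCTOR_BENCHMARK_FUSION=1 python train.py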
+debug_fusion = os.environ.get("TORCHINDUCTOR_DEBUG_FUSION") == "1" +benchmark_fusion = os.environ.get("TORCHINDUCTOR_BENCHMARK_FUSION") == "1" +enabled_metric_tables = os.environ.get("TORCHINDUCTOR_ENABLED_METRIC_TABLES", "") + +# how many nodes to allow into a single fusion +max_fusion_size = 64 + +# max number of inputs to generate cat as a pointwise op with masked laods +max_pointwise_cat_inputs = 8 + +# replace small reductions with pointwise, disable with `= 1` +unroll_reductions_threshold = 8 + +# Add extra comments to output code (causes compile cache misses) +comment_origin = False + +# Convert 1x1 convs into matmuls +conv_1x1_as_mm = False + +# Enable split reductions for better utilization when the dimension +# being reduced over is large (by splitting it) +split_reductions = True + +benchmark_kernel = os.environ.get("TORCHINDUCTOR_BENCHMARK_KERNEL", "0") == "1" + +# Enable constant and index_expr folding +constant_and_index_propagation = True + +# we always add constants into graph.constants without +# performing any constant-inlining optimization +always_keep_tensor_constants = False + +# assert that indirect indexing does not read / write out of bounds +assert_indirect_indexing = True + +# constant folding on the joint graph +joint_graph_constant_folding = True + +# Enable indirect_indexing asserts for decompositions and lowerings +debug_index_asserts = False + +# warnings intended for PyTorch developers, disable for point releases +is_nightly_or_source = "dev" in torch.__version__ or "git" in torch.__version__ +developer_warnings = is_fbcode() or is_nightly_or_source + +# The multiprocessing start method to use for inductor workers in the codecache. +# TODO: fork is not safe in a multithreaded environment, we should evaluate changing +# the default to spawn. +worker_start_method = "fork" + + +def decide_compile_threads(): + """ + Here are the precedence to decide compile_threads + 1. User can override it by TORCHINDUCTOR_COMPILE_THREADS. One may want to disable async compiling by + setting this to 1 to make pdb happy. + 2. Set to 1 if it's win32 platform or it's a fbcode build + 3. 
decide by the number of CPU cores + """ + if "TORCHINDUCTOR_COMPILE_THREADS" in os.environ: + return int(os.environ["TORCHINDUCTOR_COMPILE_THREADS"]) + elif sys.platform == "win32" or is_fbcode(): + return 1 + else: + cpu_count = ( + len(os.sched_getaffinity(0)) + if hasattr(os, "sched_getaffinity") + else os.cpu_count() + ) + assert cpu_count + return min(32, cpu_count) + + +compile_threads = decide_compile_threads() + +# gemm autotuning global cache dir +if is_fbcode(): + from libfb.py import parutil + + try: + if __package__: + global_cache_dir = parutil.get_dir_path( + os.path.join(__package__.replace(".", os.sep), "fb/cache") + ) + else: + global_cache_dir = parutil.get_dir_path("fb/cache") + except ValueError: + global_cache_dir = None +else: + global_cache_dir = None + +# If kernel is fused, the name is generated from the origin node op names +# for larger kernels limit this +kernel_name_max_ops = 10 + +# Pad input tensors of matmul/bmm/addmm to leverage Tensor Cores in NVIDIA GPUs +shape_padding = os.environ.get("TORCHINDUCTOR_SHAPE_PADDING", "1") == "1" + +# Fx-based linear/matmul/bmm + permute/transpose vertical fusion +permute_fusion = os.environ.get("TORCHINDUCTOR_PERMUTE_FUSION", "0") == "1" + +# Mark the wrapper call in PyTorch profiler +profiler_mark_wrapper_call = False + +# Generate hook calls to torch._inductor.hooks.run_intermediate_hooks for +# every intermediate for which we can correlate it with an intermediate +# from the original FX graph +generate_intermediate_hooks = False + +# Populate traceback field on IRNode; good for debugging why origin_node is +# not populated, or finding out where an IRNode was constructed +debug_ir_traceback = False + +# used for debugging to make sure config is properly set +_raise_error_for_testing = False + +_profile_var = os.environ.get("TORCHINDUCTOR_PROFILE", "") +profile_bandwidth = _profile_var != "" +profile_bandwidth_regex = "" if _profile_var == "1" else _profile_var +# Specify a file where we print out the profiling results. +# None means we do not dump results to a file. +profile_bandwidth_output = os.environ.get("TORCHINDUCTOR_PROFILE_OUTPUT", None) + +# TODO: remove later +disable_cpp_codegen = False + + +# Freezing will attempt to inline weights as constants in optimization +# and run constant folding and other optimizations on them. After freezing, weights +# can no longer be updated. +freezing: bool = os.environ.get("TORCHINDUCTOR_FREEZING", "0") == "1" + +# Make freezing invalidate the eager Parameters of nn modules, to avoid memory overhead +# of potentially keeping multiple copies of weights. +freezing_discard_parameters: bool = False + +# Kill switch for allowing temporary tensors to be allocated as stack arrays. Tests +# should be run with this flag both on and off to make sure we have coverage. +allow_stack_allocation: bool = ( + os.environ.get("TORCHINDUCTOR_STACK_ALLOCATION", "1") == "1" +) + +# Enables an alternate DSO interface (the "minimal ArrayRef interface") intended +# to maximize performance for use cases that it can accommodate at the expense of +# generality. In brief: +# - inputs and outputs are ArrayRefTensor (note that strides are required, but the +# tensor must be contiguous) +# - constant handling is unchanged because it is not a per-inference-iteration bottleneck +# +# When the DSO is generated in this mode, the usual interface will also be supported, +# but performance for that interface may be degraded. 
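+# A hedged sketch of opting in, using the config_patches argument of
+# compile_fx_aot defined in compile_fx.py within this same diff ("gm" and
+# "example_inputs" are placeholders for the caller's graph module and inputs):
+#   from torch._inductor.compile_fx import compile_fx_aot
+#   lib_path = compile_fx_aot(
+#       gm,
+#       example_inputs,
+#       config_patches={"use_minimal_arrayref_interface": True},
+#   )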
+use_minimal_arrayref_interface: bool = False + +# decompose some memory bound matmul/bmm to mul +decompose_mem_bound_mm: bool = False + + +# config specific to codegen/cpp.py +class cpp: + # set to torch.get_num_threads() + threads = -1 + + # Do not generate loops when the condition doesn't hold, like: + # for(long i0=4096; i0<4096; i0+=1) + no_redundant_loops = True + + # Assume number of threads is dynamic, don't specialize thread number. + # Kernels don't recompile on thread number changes with this flag on. + # For single-threaded workload, turning it on would incur a slight + # performance degradation. + dynamic_threads = False + + simdlen: Optional[int] = None + min_chunk_size = 4096 + cxx = ( + None, # download gcc12 from conda-forge if conda is installed + # "g++-12", + # "g++-11", + # "g++-10", + # "clang++", + os.environ.get("CXX", "clang++" if sys.platform == "darwin" else "g++"), + # "g++.par", + ) + # Allow kernel performance profiling via PyTorch profiler + enable_kernel_profile = False + + # enable weight prepacking to get a better performance; may lead to large memory footprint + weight_prepack = True + + # Inject a bug into our relu implementation; useful for testing our repro + # extraction and minification functionality. + # Valid values: "compile_error", "runtime_error", "accuracy" + inject_relu_bug_TESTING_ONLY: Optional[str] = None + inject_log1p_bug_TESTING_ONLY: Optional[str] = None + + # If None, autodetect whether or not AVX512/AVX2 can be used. Otherwise, + # force usage as specified, without testing. + vec_isa_ok: Optional[bool] = None + + # similar to config.triton.descriptive_names + descriptive_names = "original_aten" + + # how many nodes to allow into a single horizontal fusion + max_horizontal_fusion_size = 16 + + # Make scatter_reduce fallback when reduce is sum to avoid performance regression + # using atomic_add. + fallback_scatter_reduce_sum = True + + # Use funsafe-math-optimizations when compiling + enable_unsafe_math_opt_flag = False + + # Use ffp-contract when compiling + enable_floating_point_contract_flag = False + + +# config specific to codegen/triton.py +class triton: + # Use cudagraphs on output code + cudagraphs = False + + # Use cudagraph trees for memory pooling if `cudagraphs` is True + cudagraph_trees = True + + # assertions not on the fast path, steady state + slow_path_cudagraph_asserts = True + + # TODO - need to debug why this prevents cleanup + cudagraph_trees_history_recording = False + + # assertions on the fast path + fast_path_cudagraph_asserts = False + + # skip warmup for cudagraph trees + skip_cudagraph_warmup = False + + # Synchronize before and after every compiled graph. + debug_sync_graph = False + + # Synchronize after every kernel launch, to help pinpoint bugs + debug_sync_kernel = False + + # Always load full blocks (rather than broadcasting inside the block) + dense_indexing = False + + # limit tiling dimensions + max_tiles = 2 + + # use triton.autotune for pointwise ops with complex layouts + # this should only be disabled for debugging/testing + autotune_pointwise = True + + # max autotune gemm with cublasLt + autotune_cublasLt = True + + # should we stop a fusion to allow better tiling? + tiling_prevents_pointwise_fusion = True + tiling_prevents_reduction_fusion = True + + # should we give different names to kernels + # Note: This is orthogonal to descriptive_names - this is deciding whether + # our triton kernel names should all be `triton_` (to maximize caching) or + # whether they should be unique. 
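+    # Illustrative usage sketch (the env vars are the ones read below and in the
+    # trace class; "train.py" is a placeholder for any torch.compile workload):
+    #   TORCHINDUCTOR_UNIQUE_KERNEL_NAMES=1 TORCH_COMPILE_DEBUG=1 python train.py
+    # which makes individual kernels easier to tell apart in the debug trace.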
+ unique_kernel_names = os.environ.get("TORCHINDUCTOR_UNIQUE_KERNEL_NAMES") == "1" + + # should we put op names in kernel names + # False: No special names (just triton__1, triton__2, etc.) + # "torch": Maps to the fx op in the Dynamo graph (module name, method name, etc.) + # "original_aten": Maps to the highest-level aten op (i.e. pre-decompositions) + # "inductor_node": Maps to the node name in the FX graph passed to Inductor + descriptive_names = "original_aten" + + # use alternate codegen for smaller reductions + persistent_reductions = ( + os.environ.get("TORCHINDUCTOR_PERSISTENT_REDUCTIONS", "1") == "1" + ) + + # 0/False: disable + # 1/True: enable, use tuning to pick between different subkernels + # 2: enable, force using persistent reduction (for debugging) + # 3: enable, force using non-persistent reduction (for debugging) + multi_kernel = int(os.environ.get("TORCHINDUCTOR_MULTI_KERNEL", "0")) + + # hint to Triton when arguments are divisible by 16 + divisible_by_16 = True + + # theses are not enforced, but they are used by asserts in triton_heuristics.py + # NOTE: mobilevit_s in timm_models required X to be set to the higher value 2048 + + # Max RBLOCK will be large for multi-kernel since we do more aggressive + # persistent reduction. + max_block = { + "X": 2048, + "Y": 1024, + "Z": 1024, + "R": 4096 * (16 if multi_kernel else 1), + } + + # Minimum RBLOCK to be used for a TritonSplitScanKernel + # NOTE: This also indirectly controls the size of workspace buffer required + min_split_scan_rblock = 256 + + # Store the generated cubin files for cpp wrapper code to load + store_cubin = False + + # the max number of spills we allow for the configs we benchmark. + # Setting this to 0 means we skip a config if it spills even a single + # register. + # Setting it to a larger value allows a config spilling a small amount + # of registers being benchmarked. + # + # NOTE: triton will always report >0 register spills for kernels using sin/cos. + # (check this issue https://github.com/openai/triton/issues/1756 ) + # So far we see a fixed 8 spilled registers for kernels using sin/cos. + # Raise the threshold to 16 to be safe. + # We should revisit this once we understand more of the source of register spills. + spill_threshold: int = 16 + + # Generate code containing the newer tl.make_block_ptr() API for loads/store + use_block_ptr = False + + # Inject a bug into our relu implementation; useful for testing our repro + # extraction and minification functionality. + # Valid values: "compile_error", "runtime_error", "accuracy" + inject_relu_bug_TESTING_ONLY: Optional[str] = None + + +class aot_inductor: + # AOTInductor output path + # If an absolute path is specified, the generated lib files will be stored under the directory; + # If a relative path is specified, it will be used as a subdirectory under the default caching path; + # If not specified, a temp directory will be created under the default caching path. + # If the specified path contains something like "model.so", the sub-string will be used + # to name the generated library. + output_path = "" + + debug_compile = os.environ.get("AOT_INDUCTOR_DEBUG_COMPILE", "0") == "1" + + # Serialized tree spec for flattening inputs + serialized_in_spec = "" + + # Serialized tree spec for flattening outputs + serialized_out_spec = "" + + # flag to decide whether to create a submodule for constant graph. + use_runtime_constant_folding: bool = False + + +class cuda: + # CUDA arch to use for CUDA template kernel compilation. + # e.g. 
"70", "75", "80", "90", etc. + # When arch is None, Inductor uses torch.cuda.get_device_capability(0). + arch: Optional[str] = None + + # CUDA version to use for CUDA template kernel compilation. + # e.g. "11.4", "12.1", etc. + # When version is None, Inductor uses torch.version.cuda. + version: Optional[str] = None + + # Optimization level for the host compiler. + compile_opt_level = "-O1" + + # Whether to enable device LTO (link-time-optimization). + enable_cuda_lto = False + + # Whether to keep intermediate files dring compilation. + enable_ptxas_info = False + + # Whether to enable debug info, e.g. line number, cutlass debug info. + enable_debug_info = False + + # Whether to use fast math. + use_fast_math = False + + # Path to the CUTLASS repo root directory. + # The default path only works under PyTorch local development environment. + cutlass_dir = os.environ.get( + "TORCHINDUCTOR_CUTLASS_DIR", + os.path.abspath( + os.path.join(os.path.dirname(torch.__file__), "../third_party/cutlass/") + ), + ) + + # Configures the maximum number of CUTLASS configs to profile in max_autotune. + # By default it's None, so that all CUTLASS configs are tuned. + # This is mainly used to reduce test time in CI. + cutlass_max_profiling_configs: Optional[int] = None + + # Path to CUDA NVCC. + # NVCC search order: + # 1) cuda_cxx set in this config + # 2)CUDACXX environment variable + # 3)CUDA_HOME environment variable + # 4) default system search PATH. + cuda_cxx: Optional[str] = None + + # If set to True, it will ensure that only GEMM ops capable of + # epilogue fusion via CUTLASS Epilogue Visitor Trees ( EVT ) + # are enabled for the CUTLASS backend. + cutlass_only_evt_capable_ops: bool = False + + +# create a directory containing lots of debug information +class trace: + # master switch for all debugging flags below + enabled = os.environ.get("TORCH_COMPILE_DEBUG", "0") == "1" + + # Save debug information to a temporary directory + # If not specified, a temp directory will be created by system + debug_dir: Optional[str] = None + + # Save python logger call >=logging.DEBUG + debug_log = False + + # Save python logger call >=logging.INFO + info_log = False + + # Save input FX graph (post decomps, pre optimization) + fx_graph = True + + # Save FX graph after transformations + fx_graph_transformed = True + + # Save TorchInductor IR before fusion pass + ir_pre_fusion = True + + # Save TorchInductor IR after fusion pass + ir_post_fusion = True + + # Copy generated code to trace dir + output_code = True + + # SVG figure showing post-fusion graph + graph_diagram = os.environ.get("INDUCTOR_POST_FUSION_SVG", "0") == "1" + + # SVG figure showing fx with fusion + draw_orig_fx_graph = os.environ.get("INDUCTOR_ORIG_FX_SVG", "0") == "1" + + # We draw our fx graphs with the "record" shape attribute by default. + # Sometimes, when the graph is very complex, we may hit dot errors like below: + # "flat edge between adjacent nodes one of which has a record shape - + # replace records with HTML-like labels" + # and thus fail to generate a graph. So, let's give the user an option + # to specify the shape attribute for the dot graph. For example, passing + # INDUCTOR_DOT_GRAPH_SHAPE_SVG = "none" would let us generate HTML-like lables + # to workaround the above failure. 
+ dot_graph_shape = os.environ.get("INDUCTOR_DOT_GRAPH_SHAPE_SVG", None) + + # Store cProfile (see snakeviz to view) + compile_profile = False + + # Upload the .tar.gz file + # Needs to be overriden based on specific environment needs + upload_tar: Optional[Callable[[str], None]] = None + + log_autotuning_results: bool = False + + +_save_config_ignore = { + # workaround: "Can't pickle " + "trace.upload_tar", +} + +if TYPE_CHECKING: + from torch.utils._config_typing import * # noqa: F401, F403 + +from torch.utils._config_module import install_config_module + +# adds patch, save_config, etc +install_config_module(sys.modules[__name__]) diff --git a/MLPY/Lib/site-packages/torch/_inductor/constant_folding.py b/MLPY/Lib/site-packages/torch/_inductor/constant_folding.py new file mode 100644 index 0000000000000000000000000000000000000000..31b0a71b8008bd317e56c6864c1ad65901da8fe5 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/constant_folding.py @@ -0,0 +1,264 @@ +import collections +from typing import Any, Callable, Dict, Optional + +import torch +import torch.utils._pytree as pytree + +aten = torch.ops.aten + +# We would like to split modules into two subgraphs for runtime weight updates to work correctly. +# The use case and more information could be found at: +# https://docs.google.com/document/d/1inZC-8KarJ6gKB7G9egmYLx1V_dKX_apxon0w4zPC0Q/edit?usp=sharing +META_TAG = "MODULE_TYPE" +MODULE_TAG = "_MAIN_MODULE" +CONST_MODULE_TAG = "_CONST_MODULE" + + +def replace_node_with_constant(gm, node, constant, name=None): + g = gm.graph + + if name: + qualname = name + else: + if not hasattr(gm, "_frozen_param_count"): + gm._frozen_param_count = 0 + i = gm._frozen_param_count + + while True: + qualname = f"_frozen_param{i}" + if not hasattr(gm, qualname): + break + i += 1 + + gm._frozen_param_count = i + 1 + + with g.inserting_before(node): + new_input_node = g.create_node("get_attr", qualname, (), {}) + node.replace_all_uses_with(new_input_node) + new_input_node.meta.update(node.meta) + g.erase_node(node) + + # needed to suppress `does not reference an nn.Module, nn.Parameter, or buffer` warning + gm.register_buffer(qualname, constant) + setattr(gm, qualname, constant) + + +class ConstantFolder(torch.fx.Interpreter): + def __init__( + self, + gm, + skip_constructors=False, + ): + super().__init__(gm) + self.node_replacements: Dict[torch.fx.Node, Any] = {} + self.replaced_uses: Dict[torch.fx.Node, int] = collections.Counter() + self.unknown_value = object() + self.skip_constructors: bool = skip_constructors + + # overwrite this to deallocate env values if their only remaining use + # is the output + self.user_to_last_uses = self.node_to_last_non_output_use() + + def is_impure(self, node: torch.fx.node.Node): + if node.target in [ + torch.ops.quantized_decomposed.dequantize_per_channel.default, + torch.ops.quantized_decomposed.dequantize_per_tensor.default, + torch.ops.quantized_decomposed.dequantize_per_tensor.tensor, + ]: + # For the pattern fp32_weight -> q -> dq + # We only folding fp32_weight -> q + # int8_weight and leave dq in graph to be fused + return True + return False + + def node_to_last_non_output_use(self): + last_non_output_use = collections.defaultdict(list) + seen_uses = set() + output_node = next(iter(reversed(self.module.graph.nodes))) + + for node in reversed(self.module.graph.nodes): + if node.target == "output": + continue + + def add_use(inp): + if inp in seen_uses: + return + + seen_uses.add(inp) + last_non_output_use[node].append(inp) + + 
pytree.tree_map_only(torch.fx.Node, add_use, (node.args, node.kwargs)) + + # if this node is only used in output, we want to gc it right away + if len(node.users) == 1 and output_node in node.users: + last_non_output_use[node].append(node) + + return last_non_output_use + + def run_node(self, node): + if node.target == "output": + # because we remove nodes from env on last non output use, + # re-define them now or we'll get error in interpreter + def set_env(arg): + self.env[arg] = self.unknown_value + + pytree.tree_map_only(torch.fx.Node, set_env, node.args) + return super().run_node(node) + + args, kwargs = self.fetch_args_kwargs_from_env(node) + flattened_inputs = pytree.arg_tree_leaves(*args, **kwargs) + + if self.unknown_value in flattened_inputs: + return self.unknown_value + + # TODO - fix errors with this + if ( + node.op == "call_function" + and node.target == aten._efficientzerotensor.default + ): + return self.unknown_value + + # TODO - constant folding triton kernel returns the inputs -- fix this + if ( + node.op == "call_function" + and node.name == "triton_kernel_wrapper_functional_proxy" + ): + return self.unknown_value + + # skip constructors, since inductor generates optimal code for them already + # and turning into tensor would result in an additional global memory read + # TODO - more complicated strategy + if ( + self.skip_constructors + and node.op != "get_attr" + and not any(isinstance(e, torch.Tensor) for e in flattened_inputs) + ): + return self.unknown_value + + # All mutations should either be removed or on inputs which we did not make constant + if ( + isinstance(node.target, torch._ops.OpOverload) + and torch.Tag.nondeterministic_seeded in node.target.tags + ): + return self.unknown_value + + out = super().run_node(node) + + if node.op != "get_attr" and isinstance(out, torch.Tensor): + if not self.insertable_tensor_check(out): + return out + + if self.is_impure(node): + return self.unknown_value + + self.add_node_replacement(node, out) + + flattened_node_inps = pytree.arg_tree_leaves(*node.args, **node.kwargs) + + for n in flattened_node_inps: + if not isinstance(n, torch.fx.Node): + continue + + self.replaced_uses[n] += 1 + + for to_delete in self.user_to_last_uses.get(node, []): + if self.replaced_uses[to_delete] == len(to_delete.users): + self.node_replacements.pop(to_delete, None) + + return out + + def insertable_tensor_check(self, tensor: torch.Tensor) -> bool: + return True + + def add_node_replacement(self, node: torch.fx.Node, tensor: torch.Tensor) -> None: + self.node_replacements[node] = tensor + + def run(self): + env = {} + for n in self.module.graph.nodes: + if n.op == "placeholder": + env[n] = self.unknown_value + return super().run(initial_env=env) + + +@torch.utils._python_dispatch._disable_current_modes() +def constant_fold(gm, constraint_fn: Optional[Callable[[torch.fx.Node], bool]] = None): + cf = ConstantFolder(gm, skip_constructors=True) + cf.run() + + for node, constant in cf.node_replacements.items(): + if constraint_fn is not None and not constraint_fn(node): + continue + replace_node_with_constant(gm, node, constant) + + erased_params = [] + for node in gm.graph.nodes: + if node.op == "get_attr" and len(node.users) == 0: + if hasattr(gm, node.target): + delattr(gm, node.target) + erased_params.append(node) + + for node in erased_params: + gm.graph.erase_node(node) + + gm.graph.eliminate_dead_code() + gm.graph.lint() + gm.recompile() + + +@torch.utils._python_dispatch._disable_current_modes() +def constant_graph_tag(gm: 
torch.fx.GraphModule): + cf = ConstantFolder(gm, skip_constructors=True) + cf.run() + + for node in gm.graph.nodes: + if ( + node.op == "get_attr" + or node in cf.node_replacements + or node in cf.replaced_uses + ): + node.meta[META_TAG] = CONST_MODULE_TAG + else: + node.meta[META_TAG] = MODULE_TAG + + +def run_and_get_constant_graph(gm: torch.fx.GraphModule) -> torch.fx.GraphModule: + """ + Construct a GraphModule which corresponds to the part which could be + constant folded in provided gm. + """ + + constant_graph_tag(gm) + # We rewrite the tags, if it's a constant being directly consumed, without + # any folding opportunity, we keep it in main gm. + for node in gm.graph.nodes: + if node.op == "get_attr": + used_to_fold = False + for u in node.users: + if u.meta[META_TAG] == CONST_MODULE_TAG: + used_to_fold = True + break + if not used_to_fold: + node.meta[META_TAG] = MODULE_TAG + + new_graph = torch.fx.Graph() + + node_remapping: Dict[torch.fx.Node, torch.fx.Node] = {} + output_nodes = [] + for node in gm.graph.nodes: + if node.meta[META_TAG] == MODULE_TAG: + continue + + new_node = new_graph.node_copy(node, lambda x: node_remapping[x]) + node_remapping[node] = new_node + + for user in node.users: + if user.meta[META_TAG] == MODULE_TAG: + output_nodes.append(new_node) + break + + new_graph.output(tuple(output_nodes)) + new_graph.lint() + new_gm = torch.fx.GraphModule(gm, new_graph) + + return new_gm diff --git a/MLPY/Lib/site-packages/torch/_inductor/coordinate_descent_tuner.py b/MLPY/Lib/site-packages/torch/_inductor/coordinate_descent_tuner.py new file mode 100644 index 0000000000000000000000000000000000000000..ba64b1df60fe71faf548fbf00e56fa7c53bb6907 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/coordinate_descent_tuner.py @@ -0,0 +1,315 @@ +import copy +import itertools +import logging +from typing import Callable, Optional + +from torch.utils._triton import has_triton +from .utils import red_text, triton_config_to_hashable + +if has_triton(): + import triton +else: + triton = None + +from . import config as inductor_config + +log = logging.getLogger(__name__) + + +def get_field(config, name): + if name == "num_warps": + return config.num_warps + elif name == "num_stages": + return config.num_stages + else: + return config.kwargs.get(name, None) + + +def set_field(config, name, value): + if name == "num_warps": + config.num_warps = value + elif name == "num_stages": + config.num_stages = value + else: + config.kwargs[name] = value + + +class CoordescTuner: + """ + The coordinate descent tuner. Tune one field/coordinate at a time. + + TODO will it be necessary to tune multiple fields simultaneously. + + + TODO: what if both increasing and decreasing a field can improve perf. + i.e., there are multiple local optima.. 
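+
+    Illustrative sketch of one descent step (the concrete numbers are only an
+    example): starting from a config with XBLOCK=64, the tuner benchmarks the
+    power-of-two neighbours XBLOCK=32 and XBLOCK=128 (see get_neighbour_values);
+    for num_stages the step is +/-1 rather than halving/doubling. Whenever a
+    neighbour beats the incumbent by more than the 0.1% threshold in
+    has_improvement, it becomes the new best config, and the outer loop in
+    autotune() repeats until no tunable field improves further.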
+ """ + + def __init__(self, is_mm=False, name="unknown", size_hints=None): + self.is_mm = is_mm # we will tune num_stages for mm + self.cached_benchmark_results = {} + self.name = name + self.size_hints = size_hints + + def get_xmax(self): + xmax = inductor_config.triton.max_block["X"] + if self.size_hints and len(self.size_hints) > 0: + xmax = min(xmax, self.size_hints[0]) + return xmax + + def get_ymax(self): + ymax = inductor_config.triton.max_block["Y"] + if self.size_hints and len(self.size_hints) > 1: + ymax = min(ymax, self.size_hints[1]) + return ymax + + def get_zmax(self): + zmax = inductor_config.triton.max_block["Z"] + if self.size_hints and len(self.size_hints) > 2: + zmax = min(zmax, self.size_hints[2]) + return zmax + + def get_rmax(self): + if self.size_hints and len(self.size_hints) > 0: + return self.size_hints[-1] # the last one is for reduction + else: + # large enough. We should not pick this large RBLOCK anyway + return 2**30 + + def get_warpsmax(self): + # Currently, CUDA has a maximum of 1024 threads, so 32 is the max + # number of warps. + return 1024 // 32 + + def cache_benchmark_result(self, config, timing): + self.cached_benchmark_results[triton_config_to_hashable(config)] = timing + + def lookup_in_cache(self, config): + return self.cached_benchmark_results.get(triton_config_to_hashable(config)) + + def call_func(self, func, config): + found = self.lookup_in_cache(config) + if found is not None: + log.debug(" CACHED") + return found + timing = func(config) + self.cache_benchmark_result(config, timing) + return timing + + @property + def tunable_fields(self): + out = [ + "XBLOCK", + "YBLOCK", + "ZBLOCK", + # NOTE: we should not tune RBLOCK for persistent reduction. + # We rely on the fact that persistent reduction's triton.Config + # does not have the RBLOCK field to guarantee that. + "RBLOCK", + # the following 3 are for mm + "BLOCK_M", + "BLOCK_N", + "BLOCK_K", + "num_warps", + ] + if self.is_mm: + out.append("num_stages") + + return out + + def value_too_large(self, name, val): + if name == "XBLOCK": + return val > self.get_xmax() + if name == "YBLOCK": + return val > self.get_ymax() + if name == "ZBLOCK": + return val > self.get_zmax() + if name == "RBLOCK": + return val > self.get_rmax() + if name == "num_warps": + return val > self.get_warpsmax() + + return False + + def get_neighbour_values(self, name, orig_val, radius=1, include_self=False): + """ + Get neighbour values in 'radius' steps. The original value is not + returned as it's own neighbour. + """ + assert radius >= 1 + + def update(cur_val, inc=True): + if name == "num_stages": + if inc: + return cur_val + 1 + else: + return cur_val - 1 + else: + if inc: + return cur_val * 2 + else: + return cur_val // 2 + + out = [] + # increment loop + cur_val = orig_val + for _ in range(radius): + cur_val = update(cur_val, True) + if self.value_too_large(name, cur_val): + break + out.append(cur_val) + + # decrement loop + cur_val = orig_val + for _ in range(radius): + cur_val = update(cur_val, False) + if cur_val <= 0: + break + out.append(cur_val) + + if include_self: + out.append(orig_val) + return out + + @staticmethod + def has_improvement(baseline, test): + threshold = 0.001 # 0.1% + return test is not None and test < baseline * (1 - threshold) + + def check_all_tuning_directions( + self, + func: Callable[["triton.Config"], float], + best_config, + best_timing, + ): + """ + Check all directions. We only do this once the regular coordinate + descent tuning find no better choices any more. 
+ We only have a few tunable fields, so this should be fine. + """ + candidate_values_list = [] + effective_fields = [] + for field in self.tunable_fields: + old_value = get_field(best_config, field) + if old_value is None: + continue + candidate_values = self.get_neighbour_values( + field, + old_value, + radius=inductor_config.coordinate_descent_search_radius, + include_self=True, + ) + candidate_values_list.append(candidate_values) + effective_fields.append(field) + + choices = itertools.product(*candidate_values_list) + improved = False + for choice in choices: + assert len(choice) == len(effective_fields) + candidate_config = copy.deepcopy(best_config) + for new_val, field in zip(choice, effective_fields): + set_field(candidate_config, field, new_val) + cmp_res, candidate_timing = self.compare_config( + func, candidate_config, best_config, best_timing + ) + if cmp_res: + improved = True + best_config = candidate_config + best_timing = candidate_timing + + return improved, best_config, best_timing + + def compare_config(self, func, candidate_config, best_config, best_timing): + """ + Check if candidate_config is better than best_config. + + Return a touple of (compare_result, candidate_timing). + compare_result is true iff candidate_config is better. + """ + log.debug("Try config %s", candidate_config) + try: + candidate_timing = self.call_func(func, candidate_config) + except Exception as e: + log.debug("Got exception %s", e) + return False, float("inf") + + if self.has_improvement(best_timing, candidate_timing): + log.debug( + "Tune from %s %f -> %s %f", + best_config, + best_timing, + candidate_config, + candidate_timing, + ) + + return True, candidate_timing + return False, candidate_timing + + def autotune( + self, + func: Callable[["triton.Config"], float], + baseline_config: "triton.Config", + baseline_timing: Optional[float] = None, + ) -> "triton.Config": + if baseline_timing is None: + baseline_timing = self.call_func(func, baseline_config) + + log.debug("= Do coordinate descent tuning for %s =", self.name) + log.debug( + "Baseline Config %s, baseline timing %f", baseline_config, baseline_timing + ) + improved = True + best_config = baseline_config + best_timing = baseline_timing + tunable_fields = self.tunable_fields + + while improved: + improved = False + + for name in tunable_fields: + cur_val = get_field(best_config, name) + # some kernel don't have RBLOCK/YBLOCK/ZBLOCK. So cur_val may be None + if cur_val is None: + continue + + # It's possible that candidate_values is empty. + # E.g., if XBLOCK is 1 initially and size_hint for x is also 1. + # We would not try either larger or smaller XBLOCK in this case. + candidate_values = self.get_neighbour_values(name, cur_val) + + for next_val in candidate_values: + candidate_config = copy.deepcopy(best_config) + set_field(candidate_config, name, next_val) + + cmp_res, candidate_timing = self.compare_config( + func, candidate_config, best_config, best_timing + ) + if cmp_res: + improved = True + best_config, best_timing = candidate_config, candidate_timing + + if not improved and inductor_config.coordinate_descent_check_all_directions: + old_best_timing = best_timing + improved, best_config, best_timing = self.check_all_tuning_directions( + func, best_config, best_timing + ) + + if improved: + msg = red_text( + "Coordinate descend tuning found improvement of %.3fx by looking in all directions." 
+ ) + log.debug( + msg, + old_best_timing / best_timing, + ) + + log.debug( + "Improve from %s %f -> %s %f, %.3fx", + baseline_config, + baseline_timing, + best_config, + best_timing, + baseline_timing / best_timing, + ) + + return best_config diff --git a/MLPY/Lib/site-packages/torch/_inductor/cudagraph_trees.py b/MLPY/Lib/site-packages/torch/_inductor/cudagraph_trees.py new file mode 100644 index 0000000000000000000000000000000000000000..f567001e9fbffa8987de7141a124dac3da3d621a --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/cudagraph_trees.py @@ -0,0 +1,2159 @@ +""" +CUDA graph trees are a safety abstraction over CUDAGraphs, similar to make_graph_callables, +which share the same memory pool. Sharing a memory pool is an extremely +important optimization when chaining multiple CUDA graphs together, as it +prevents you from needing to copy intermediate tensors from one graph to the +next, and reduces overall memory usage by allowing dead memory from the first +pool to be reused in the second. + +The standard graph/make_graph_callables support sharing memory pool, but +with a lot of caveats. CUDA graph trees remove these restrictions: + +* Previously, if you recorded graphs A, B, you had to replay A, B in that + order. With CUDA graph trees, after replaying A, you can change your + mind and record/replay a different graph B'; we will support efficient + execution of both A, B and A, B', using only max(mem(A, B), mem(A, B')). In + other words: we support arbitrary trees of CUDA graph operations, not just + sequences (this is why this feature is called CUDA graph trees.) + +* Previously, if you executed graph A, some non-CUDA graph code, and then + graph B, after executing graph B, it was not safe to retain any references + to intermediates produced by A. With CUDA graph trees, we track if any +outputs of graph A are still live by the time graph B is run, and make + sure graph B doesn't clobber there memory when reusing the CUDA graphs + pool. You'll get a separate recording of B depending on what tensors + stay live or dead. + +CUDA graph trees are flexible enough to be used in Dynamo across graph breaks, +which is their primary use case. + +The ability to switch from replay to record is fairly nontrivial: remember that +when you replay a CUDA graph, you only replay CUDA operations; no CPU side state +is updated. In particular, the CPU-side book-keeping for the allocator is not +reconstructed. However, to record a new child CUDA graph, we must restore this +book-keeping. This is what checkpoint pool state is used for. 
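+
+A hedged illustration of the resulting tree shape (reusing the A/B/B' names from
+above):
+
+    A -- B      # one recorded child of A
+    A -- B'     # another child, recorded after a later replay of A
+
+Both children are recorded into A's memory pool, so peak usage stays at
+max(mem(A, B), mem(A, B')).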
+""" + +from __future__ import annotations + +import contextlib +import dataclasses +import functools +import gc +import itertools +import operator +import sys +import threading +import traceback +import warnings +import weakref +from collections import defaultdict + +from enum import auto, Enum +from typing import ( + Any, + Callable, + cast, + Dict, + Iterator, + List, + Optional, + Sequence, + Set, + Tuple, + Union, +) + +import torch.fx +from torch import Tensor +from torch._dynamo.mutation_guard import GenerationTracker +from torch._dynamo.utils import preserve_rng_state +from torch._inductor.compile_fx import ( + align_inputs_from_check_idxs, + copy_misaligned_inputs, + get_expanded_dims, + get_input_idxs_to_check, + index_expanded_dims, + remove_unaligned_input_idxs, + static_input, +) +from torch.multiprocessing.reductions import StorageWeakRef +from torch.storage import UntypedStorage +from torch.types import _bool +from torch.utils import _pytree as pytree +from torch.utils.weak import TensorWeakRef + +StorageWeakRefPointer = int +StorageDataPtr = int +NBytes = int + +if torch.backends.cuda.is_built(): + from torch._C import ( + _cuda_CUDAAllocator_AllocatorState as AllocatorState, + _set_cached_tensors_enabled as _set_cached_tensors_enabled, + ) +else: + + class AllocatorState: # type: ignore[no-redef] + pass + + def _set_cached_tensors_enabled(enabled: _bool) -> None: + pass + + +log = torch._logging.getArtifactLogger(__name__, "cudagraphs") + + +from . import config + + +@dataclasses.dataclass(frozen=True) +class GraphID: + "Unique counter of a cuda graph recording" + id: int + + +@dataclasses.dataclass(frozen=True) +class FunctionID: + "Unique counter of a function wrapped in cudagraphify_impl" + id: int + + +@dataclasses.dataclass(frozen=True) +class WrappedFunction: + """ + Represents a function that you want to record for CUDA graph replay, + with a little more metadata so we can identify if we have an applicable + CUDA graph in our CUDA graph tree for it. + """ + + model: Callable[..., Any] + static_input_idxs: Sequence[int] + id: FunctionID + constants: Tuple[torch.Tensor, ...] + + +def clear_cublass_cache(): + """ + Cublas keeps a persistent workspace allocation for running matmuls. This poses a problem for + doing warmup within a CUDAGraph private pool because we do not want persistent allocations from + one one run to the next. When we begin a new run of a cudagraphs path (generation), all tensors + from the previous generation are freed. This frees them the memory pool, but not elsewhere. + A tensor in the cublas workspace would continue to be in use the workspace but would also get allocated + in the next run. The memory would be in use in two places. + + To solve this, we clear cublas caches before and after warming up or recording. If a workspace is required + it will be allocated to the cudagraph private pool and accounted for in the allocator for the duration of the + program. There is no overhead to this on replay since cudagraphs removes allocation overhead. 
+ """ + torch._C._cuda_clearCublasWorkspaces() + + +@contextlib.contextmanager +def clear_cublas_manager(): + "Context manager around clearing cublas caches that will clear on enter and exit" + clear_cublass_cache() + try: + yield + finally: + clear_cublass_cache() + + +@contextlib.contextmanager +def disable_conv_cache_emptying(): + prev = torch._C._cuda_get_conv_benchmark_empty_cache() + torch._C._cudnn_set_conv_benchmark_empty_cache(False) + try: + yield + finally: + torch._C._cudnn_set_conv_benchmark_empty_cache(prev) + + +@contextlib.contextmanager +def enable_history_recording(): + "Turns on history recording in the CUDA Caching Allocator" + enabled = torch._C._cuda_isHistoryEnabled() + try: + if not enabled: + torch.cuda.memory._record_memory_history() + yield + finally: + if not enabled: + torch.cuda.memory._record_memory_history(None) + + +def get_history_recording(): + # TODO - remove, prevents cleanup + if not config.triton.cudagraph_trees_history_recording: + return contextlib.nullcontext() + return enable_history_recording() + + +class TreeManagerContainer: + """ + Manages the lifetime of the tree manager. Like `PrivatePool` in cuda caching allocator, + the tree and its corresponding memory pool should be kept alive as long as any outstanding + graph or tensor which is an output of a graph remains alive. + + There is a single tree manager container per device. + + The lifecycle of a tree_manager is: + - Is constructed, no graph, no fns, no tensors + - Tree manager is fetched, resulting in tree manager being allocated + - We generate a bunch of functions, calling add_strong_reference + - These functions die, calling finalize_reference + - When all the functions die, we finalize_tree_manager. + + TODO: in the future, we would like to do the following once storage weak refs land + - We look for all the live storages and add references to THOSE + - We count as storages die + - All the storages are dead, we deallocate the tree manager + """ + + def __init__(self, device_index): + # This class keeps a strong reference to tree_manager, + # but upon all other strong references to the tree_manager will reset it to None. + # We need a strong reference so that we can still access its attributes upon cleanup. + self.tree_manager: Optional[CUDAGraphTreeManager] = None + + # Number of outstanding references to the current tree manager + self.live_cudagraphify_fns = 0 + + self.device_index = device_index + + # Following two objects are only set in the case that Tensor outputs outlive + # the cudagraphify_fns. Reference to the Graph is needed to keep the private pool from + # deallocation. + self.live_storages_count = 0 + self.graph: Optional[torch.cuda.CUDAGraph] = None + + self.lock = threading.Lock() + + def _finalize_tensor(self): + with self.lock: + self.live_storages_count -= 1 + if self.live_storages_count == 0: + self.graph = None + + # manager was used again after existing cleanup, + # we shouldnt set it to None + if self.live_cudagraphify_fns == 0: + self.tree_manager = None + + def finalize_cudagraphify_fn(self): + with self.lock: + self.live_cudagraphify_fns -= 1 + if self.live_cudagraphify_fns == 0: + self._finalize_tree_manager() + + def _finalize_tree_manager(self): + assert self.lock.locked() + self.tree_manager = None + + # TODO - when issue #91395 is landed, we can set a weakref on + # storages and trigger a deallocation when all outputs of the + # cudagraph are dead. 
+ + # live_storages = list( + # tree_manager.live_cudagraph_pool_storages_in_curr_execution() + # ) + + # # Maintain reference to graph to keep tensors alive + # assert len(tree_manager.roots) > 0, "expected at least one use" + # root = next(tree_manager.get_roots()) + # self.graph = root.graph + # seen_storages = set() + # for stor in live_storages: + # if stor in seen_storages: + # continue + # seen_storages.add(stor) + # self.live_storages_count += 1 + # . weakref.finalize(stor, self._finalize_tensor) + + def add_strong_reference(self, fn: Callable[..., Any]): + with self.lock: + self.live_cudagraphify_fns += 1 + + weakref.finalize(fn, self.finalize_cudagraphify_fn) + + def get_tree_manager(self) -> CUDAGraphTreeManager: + with self.lock: + if self.tree_manager is None: + self.tree_manager = CUDAGraphTreeManager(self.device_index) + return self.tree_manager + + +local = threading.local() + +# one tree manager per device +local.tree_manager_containers = {} +local.tree_manager_locks = defaultdict(threading.Lock) + + +# only incremented by user call of mark_step_begin +class MarkStepBox: + mark_step_counter = 0 + + +# We need to register this as an object that will be copied over as TLS when new +# threads are created in autograd +torch._C._stash_obj_in_tls("tree_manager_containers", local.tree_manager_containers) +torch._C._stash_obj_in_tls("tree_manager_locks", local.tree_manager_locks) + + +def mark_step_begin(): + "Indicates that a new iteration of inference or training is about to begin." + + # iterate down to distinguish from GenerationTracking counter + MarkStepBox.mark_step_counter -= 1 + + +def reset_cudagraph_trees(): + "Clear all cudagraph trees" + # see shutdown below for why this is necessary + container_dict = get_obj(local, "tree_manager_containers") + locks_dict = get_obj(local, "tree_manager_locks") + for device, lock in locks_dict.items(): + with lock: + container = container_dict.get(device) + if not container or not container.tree_manager: + continue + + container.tree_manager.shutdown() + + _set_cached_tensors_enabled(False) + container_dict.clear() + + MarkStepBox.mark_step_counter = 0 + + +def get_obj(local, attr_name): + if hasattr(local, attr_name): + return getattr(local, attr_name) + else: + assert torch._C._is_key_in_tls(attr_name) + return torch._C._get_obj_in_tls(attr_name) + + +def get_container(device_index: int): + container_dict = get_obj(local, "tree_manager_containers") + lock = get_obj(local, "tree_manager_locks")[device_index] + + with lock: + if device_index not in container_dict: + container_dict[device_index] = TreeManagerContainer(device_index) + + return container_dict[device_index] + + +def get_manager( + device_index: int, create_if_none_exists=True +) -> Optional[CUDAGraphTreeManager]: + if create_if_none_exists: + return get_container(device_index).get_tree_manager() + return get_container(device_index).tree_manager + + +def cudagraphify_impl(model, inputs, static_input_idxs, *args, **kwargs): + fn_cache: Dict[Tuple[int, ...], Callable[..., Any]] = {} + + # Detect int inputs: we need to index on these + int_key = [i for i, v in enumerate(inputs) if isinstance(v, int)] + get_ints: Any = operator.itemgetter(*int_key) if int_key else lambda _: None + + del inputs + + def deferred_cudagraphify(inputs): + int_key = get_ints(inputs) + fn = fn_cache.get(int_key) + if fn is not None: + return fn(inputs) + + if int_key is None: + log.info("recording cudagraph tree for graph without symints") + else: + log.info("recording cudagraph tree for symint key 
%s", int_key) + + # first get indices we need to check to align, then update our static inputs, + # and finally copy + check_input_idxs = get_input_idxs_to_check(inputs, static_input_idxs) + new_static_input_idxs = remove_unaligned_input_idxs(inputs, static_input_idxs) + copy_misaligned_inputs(inputs, check_input_idxs) + + fn, out = cudagraphify(model, inputs, new_static_input_idxs, *args, **kwargs) + fn = align_inputs_from_check_idxs(fn, inputs_to_check=check_input_idxs) + fn_cache[int_key] = fn + + return out + + return deferred_cudagraphify + + +def cudagraphify( + model, + inputs, + static_input_idxs=(), + *, + device_index: int, + is_backward: bool, + is_inference: bool, + stack_traces: Optional[StackTraces] = None, + constants: Tuple[torch.Tensor, ...] = (), +): + manager = get_container(device_index).get_tree_manager() + assert not (is_backward and is_inference) + mode = ( + CompilationMode.BACKWARD + if is_backward + else (CompilationMode.INFERENCE if is_inference else CompilationMode.FORWARD) + ) + + return manager.add_function( + model, + inputs, + static_input_idxs, + stack_traces, + mode, + constants, + ) + + +class StorageWeakRefWrapper: + """ + Wrapper around a storage weak ref. Will deallocate it upon expiration if invoked. + """ + + __slots__ = ["ref", "_data_ptr", "extra_ref_check"] + + storage_ref: Optional[StorageWeakRef] + + def __init__( + self, + inp: Union[Tensor, UntypedStorage], + extra_ref_check: Optional[Callable[[], None]] = None, + ): + """ + extra_ref_check is an additional check we need to run to check if the + weak ref has expired. in checking storage use count we assume extra_ref_check + will hold an additional reference to the storage. + """ + if isinstance(inp, Tensor): + stor = inp.untyped_storage() + else: + assert isinstance(inp, UntypedStorage) + stor = inp + self.ref = StorageWeakRef(stor) + self._data_ptr = stor.data_ptr() + self.extra_ref_check = extra_ref_check + + @classmethod + def from_weakref_and_data_ptr(cls, cdata, data_ptr, extra_ref_check=None): + instance = cls.__new__(cls) + instance._data_ptr = data_ptr + instance.ref = StorageWeakRef.from_weakref(cdata) + instance.extra_ref_check = extra_ref_check + return instance + + def __call__(self) -> Optional[StorageWeakRefPointer]: + if self.expired(): + return None + + return self.ref.cdata + + def swap_weakref(self, cdata): + self.ref.__del__() + self.ref.cdata = cdata + + def data_ptr(self) -> int: + "NB: returns the data ptr even if the storage has expired" + return self._data_ptr + + def remove_extra_reference(self): + self.extra_ref_check = None + + def expired(self): + if self.extra_ref_check is not None and not self.extra_ref_check(): + return False + + # if extra_ref_check is not None we expect an additional reference + stor_count = torch._C._storage_Use_Count(self.ref.cdata) + return (stor_count - (self.extra_ref_check is not None)) == 0 + + def __repr__(self): + if self.ref is None or self.ref.expired(): + return f"StorageWeakRefWrapper to {self.data_ptr()}; dead" + else: + return f"StorageWeakRefWrapper to {self.data_ptr()}; alive" + + +def is_live(weak_ref: Optional[StorageWeakRefWrapper]) -> bool: + return maybe_deref(weak_ref) is not None + + +def maybe_deref( + weak_ref: Optional[StorageWeakRefWrapper], +) -> Optional[Tuple[StorageWeakRefPointer, int]]: + if weak_ref is None: + return None + r = weak_ref() + if r is None: + return None + # NB: r.data_ptr() does not necessarily equal weak_ref.data_ptr() + return r, weak_ref.data_ptr() + + +@contextlib.contextmanager +def 
_use_cuda_memory_pool_manager(device, mem_pool, stream): + """ + Context manager to use cuda graph pool for new allocations. If you use this manager + all cudagraph tensors in use should be reflected in the allocator or they will be overwritten. + existing_graph should already have been used in a capture, and the mem_pool must already exist, + because this manager will not preserve a reference to the pool which keeps it alive. + """ + torch.cuda.synchronize() + stream.wait_stream(torch.cuda.current_stream()) + + with torch.cuda.stream(stream), torch.device(device): + torch._C._cuda_beginAllocateCurrentStreamToPool(device, mem_pool) + try: + yield + finally: + torch._C._cuda_endAllocateCurrentStreamToPool(device, mem_pool) + torch._C._cuda_releasePool(device, mem_pool) + + torch.cuda.current_stream().wait_stream(stream) + + +def map_to_ref(t: Optional[Tensor]) -> Optional[StorageWeakRefWrapper]: + if not isinstance(t, torch.Tensor): + assert t is None + return None + return StorageWeakRefWrapper(t) + + +# A path index of (depth, offset) indices into a graph that is `depth`` number of nodes from the root +# at graph output offset +PathOutputIndex = Tuple[int, int] + +# For each node in the path, for each output, is the output alive +PathLiveness = List[List[bool]] + +StackTraces = List[Optional[str]] + + +class CUDAWarmupNode: + """ + Simplified Wrapper around A CUDA Model that wraps outputs in storage refs and exposes + apis to get the live storages in the current chain of warmup. + + A CUDAWarmupNode may have either CUDAGraphNode or CUDAWarmupNode as a parent, but may only have + CUDAWarmupNode as children, because we cannot record or execute with tensors which do not have stable + memory addresses. + + CUDAWarmupNode and CUDAGraphNode have a number of differences that make it easier to use separate classes. + - Much of the CUDAGraphNode logic & initialization is based on the tensor properties of first recording. In the + first instance of warmup, these are not finalized yet. + - All Inputs to the RecordedFunction must be copied over to the cuda graph memory pool, this is unnecessary in warmup. + - CUDAWarmup is only used once and so does not need to optimize as much bookkeeping. It is much simpler. + + NB: this class and CUDAGraphNode need to expose `path_live_weakrefs`, `all_outputs_are_dead`, and + `self.outputs_weakrefs`, `stack_traces`, and `tensor_weakrefs` for compatibility. + """ + + def __init__( + self, + wrapped_function: WrappedFunction, + parent, + cuda_graphs_pool: Tuple[int, int], + existing_cuda_graph: Optional[torch.cuda.CUDAGraph], + device_index: int, + stack_traces: Optional[StackTraces], + stream: torch.cuda.Stream, + already_warm: bool, + ): + self.wrapped_function = wrapped_function + self.parent = parent + self.cuda_graphs_pool = cuda_graphs_pool + self.outputs_weakrefs: List[Optional[StorageWeakRefWrapper]] = [] + self.tensor_weakrefs: List[Optional[TensorWeakRef]] = [] + self.existing_cuda_graph = existing_cuda_graph + self.has_run = False + self.device_index = device_index + self.stack_traces = stack_traces + self.stream = stream + self.already_warm = already_warm + + def run(self, new_inputs): + assert not self.has_run, "Wrapped function should never be run twice" + + # See: output_is_alias_of_persistent_static_inputs below. We should only be returning freshly created + # storages in path_live_weakrefs. 
+ existing_path_data_ptrs = { + t.data_ptr() for t in self.path_live_weakrefs() if t() + } + + def get_non_cudagraph_inps(): + non_cudagraph_inps = set() + for t in itertools.chain(new_inputs, self.wrapped_function.constants): + if ( + isinstance(t, torch.Tensor) + and t.untyped_storage().data_ptr() not in existing_path_data_ptrs + ): + non_cudagraph_inps.add(t.untyped_storage().data_ptr()) + return non_cudagraph_inps + + non_cudagraph_inps = get_non_cudagraph_inps() + + if config.triton.slow_path_cudagraph_asserts and not self.already_warm: + refs = list(self.path_live_weakrefs()) + check_memory_pool(self.device_index, self.cuda_graphs_pool, refs) + + with torch.cuda.device( + self.device_index + ), disable_conv_cache_emptying(), clear_cublas_manager(), _use_cuda_memory_pool_manager( + self.device_index, self.cuda_graphs_pool, self.stream + ), get_history_recording(): + out = self.wrapped_function.model(new_inputs) + + assert len(new_inputs) == 0 + + # sdpa returns cpu tensors when not recording cuda graph + def add_ref(o): + return ( + o is not None + and isinstance(o, torch.Tensor) + and o.is_cuda + and o.untyped_storage().data_ptr() not in non_cudagraph_inps + and o.untyped_storage().data_ptr() != 0 + ) + + self.outputs_weakrefs.extend( + [map_to_ref(o) if add_ref(o) else None for o in out] + ) + self.tensor_weakrefs.extend( + [TensorWeakRef(o) if add_ref(o) else None for o in out] + ) + + if config.triton.slow_path_cudagraph_asserts and not self.already_warm: + out_refs = self.path_live_weakrefs() + new_storages = [ + t for t in out_refs if t.data_ptr() not in non_cudagraph_inps + ] + check_memory_pool(self.device_index, self.cuda_graphs_pool, new_storages) + + return out + + @property + def _path_from_root(self): + nodes = [] + node = self + while node: + nodes.append(node) + node = node.parent + + yield from reversed(nodes) + + def path_live_weakrefs(self) -> Iterator[StorageWeakRefWrapper]: + "Returns all live storages weakrefs that created by nodes in this path" + for node in self._path_from_root: + for output in node.outputs_weakrefs: + if is_live(output): + yield output + + def all_outputs_are_dead(self): + return not list(self.path_live_weakrefs()) + + +# Aliases for List that say what the indices denote +InputList = List # input indexes +OutputList = List # output indexes +LevelList = List # levels (distance from root of tree) + + +class OutputAliasInfo: + pass + + +class _UnaliasedStorage(OutputAliasInfo): + "Singleton to mark that the graph output constructs a new alias or is None" + pass + + +UnaliasedStorage = _UnaliasedStorage() + + +class AliasesPriorGraphOutput(OutputAliasInfo): + "Marks that the graph output aliases an output of a prior graph" + __slots__ = ["index"] + + index: PathOutputIndex + + def __init__(self, index: PathOutputIndex): + assert isinstance(index, tuple) + self.index = index + + +class AliasesNewOutput(OutputAliasInfo): + "Marks that the graph output aliases an index in the new, returned outputs" + + __slots__ = ["index"] + + index: int + + def __init__(self, index): + assert isinstance(index, int) + self.index = index + + +class CUDAGraphNode: + """ + A single recording of a function into a CUDA Graph. Recordings of CUDA Graphs share a single memory pool + and are structured into a tree, where there is a single recording that can precede it (parent) and multiple + subsequent recordings that may follow (children). 
A node will have no parent if it is the first recording + in a tree; i.e., when it is first recorded, there are no live tensors from a previous recording which + would force a dependency. + + On first recording, all of the live tensors in the current CUDA Graph Node path will be + reflected in the corresponding private pool. On subsequent executions, the caching allocator + is unaffected when the graph is replayed. + + In order to support recording a subsequent cuda graph recording after execution of this graph, + we checkpoint the state of the memory pool so that it may later be resumed. + + WrappedFunction should have already been warmed up prior to invocation. + + See [setCheckpointPoolState] for further explanation, as well as + https://user-images.githubusercontent.com/13564/222815509-374f3400-f83d-4f7d-8fa6-4a092b3250bb.png + """ + + def __init__( + self, + wrapped_function: WrappedFunction, + id: GraphID, + parent: Optional[CUDAGraphNode], + inputs: List[Tensor], + cuda_graphs_pool: Tuple[int, int], + device_index: int, + stack_traces: Optional[StackTraces], + stream: torch.cuda.Stream, + ): + assert isinstance(inputs, (list, tuple)) + + self.wrapped_function = wrapped_function + self.id = id + self.device = device_index + self.stack_traces = stack_traces + self.stream = stream + + # if this is a root parent will be None. use weakref to prevent reference cycle + self._parent = weakref.ref(parent) if parent is not None else None + # reference to the shared memory pool for the entire cuda graphs tree + self.cuda_graphs_pool = cuda_graphs_pool + + # A single wrapped function may be recorded multiple times if memory patterns or + # invariants change from one execution to the next + self.children: Dict[FunctionID, List[CUDAGraphNode]] = defaultdict(list) + + # StorageWeakRef maintains whether the Storage C++ object remains allocated, + # not whether the corresponding memory has been deallocated. In order + # to use them to track memory deallocations we must maintain a single StorageWeakRef + # for all Storages that reference that memory (even if we are constructing Storages + # that do not have a deallocator function). We maintain one single storage_cache + # as we execute any tree path. When we retrieve a storage from the cache we + # check that it is still alive, and we hash based on observed recording data ptr + # and storage cdata. 
+ + # we preserve a single reference to executed outputs that is then referenced + # in children to avoid children having to chase parent pointers in the hot path + # DO NOT reassign output_weakrefs, only call `clear()` + # Path is a series of nodes from root to the current node + self.outputs_weakrefs: OutputList[Optional[StorageWeakRefWrapper]] = [] + self.path_weakrefs: LevelList[OutputList[Optional[StorageWeakRefWrapper]]] = [ + node.outputs_weakrefs for node in self._path_from_root + ] + self.path_stacktraces: LevelList[StackTraces] = [ + node.stack_traces for node in self._path_from_root + ] + self.tensor_weakrefs: OutputList[Optional[TensorWeakRef]] = [] + + # tensors which are outputs of previous graphs in the tree + self.cudagraph_managed_idxs: List[int] = [ + idx + for idx, t in enumerate(inputs) + if isinstance(t, torch.Tensor) and self._is_cuda_graph_recorded_tensor(t) + ] + + self.static_input_idxs: List[int] = list( + set(wrapped_function.static_input_idxs) | set(self.cudagraph_managed_idxs) + ) + + self.static_input_data_ptrs: InputList[Optional[int]] = [ + ( + inputs[i].data_ptr() + if isinstance(inputs[i], torch.Tensor) and i in self.static_input_idxs + else None + ) + for i in range(len(inputs)) + ] + + # When we checkpoint, and free generations, we will be manually freeing the outputs + # of CUDAGraphNodes. We should not be freeing parameters, not do we need to account for + # their liveness (they are static), so we need to compute which outputs are aliases of + # parameters. Some static inputs are saved tensors from the forward that die in the backward. + # Their locations are static but lifetimes are not. We only include the persistent static + # data ptrs below because the non persistent data ptrs may be outputs of this record and + # fresh allocations. + + # precompute expanded dims to avoid computing in the hot path + self.expanded_dims: List[List[int]] = [ + get_expanded_dims(x) + if isinstance(x, torch.Tensor) and idx not in self.static_input_idxs + else [] + for idx, x in enumerate(inputs) + ] + + # For each node in path, which outputs were observed to be live + # before invoking graph recording, and after graph recording + self.recorded_liveness_before_graph: LevelList[OutputList[bool]] = [] + self.recorded_liveness_after_graph: LevelList[OutputList[bool]] = [] + + # List of Tuples of (depth, output_index) that index into node at depth + # number of nodes from root and output_index of outputs. Will index into + # path_weakrefs. + self.expected_dead_indices_before_graph: List[PathOutputIndex] = [] + self.expected_dead_indices_after_graph: List[PathOutputIndex] = [] + + # all live indices after graph recording + self.live_indices_after_graph: List[PathOutputIndex] = [] + + if self.parent is not None: + previous_liveness = self.parent.recorded_liveness_after_graph + curr_liveness = self._get_liveness(self.path_weakrefs) + + different_indices = self._get_different_indices( + previous_liveness, curr_liveness + ) + + self.recorded_liveness_before_graph = curr_liveness + self.expected_dead_indices_before_graph = different_indices + + recording_inputs = self._allocate_and_copy_recording_inputs(inputs) + # recording inputs will copy over memory, so we can free non recording inputs + inputs.clear() + del inputs + + # graph used for recording model invocation + self.graph: Optional[torch.cuda.CUDAGraph] = torch.cuda.CUDAGraph() + + # we allocate non-static inputs within the same memory pool as the CUDAGraph + # which we will record the model with. 
For memory efficiency, it is important + # to reclaim the input memory when the inputs are no longer live. To accomplish this, + # we reconstruct tensors at the correct data pointers of our inputs which are + # non owning and do not prevent deallocation. On subsequent executions, input values + # will be copied over to these tensors. + self.reconstructed_inputs: InputList[Union[Tensor, int]] = [ + self._reconstruct_from_tensor_metadata(self._tensor_metadata(x)) + if isinstance(x, torch.Tensor) + else x + for x in recording_inputs + ] + + # DO THE RECORDING!!! + # We record the CUDA graph in the constructor of CUDAGraphNode, which + # gives you what the CPU side compute of the function would do. We + # don't throw the recording outputs away: their memory is + # correctly accounted for in the CUDAGraphs caching allocator. This + # means on the very FIRST run of the CUDA graph node, we can directly + # do more recording, because we have a valid caching allocator state. + # NB: This relies on run() being called immediately after the + # constructor, otherwise this optimization would not be valid. + + # initialized below in _record + + self.checkpointed_caching_state: Optional[AllocatorState] = None + + # Output Storage Alias information, can be: + # - A new, unaliased storage, or the output is None + # - An alias of an output of a prior graph + # - An alias of an output already created in the reconstructed outputs + # This is None if the output in question is an int + self.output_storage_alias: OutputList[Optional[OutputAliasInfo]] = [] + + # is the output Storage unaliased in subsequent outputs, of all subsequent paths + # if it is, we cached the output tensor and adjust storage liveness tracking to also + # check if the output tensor does not have an additional python reference. + # If a descendent node discovers it has an alias of a prior output, then the output + # will no longer be cached in the ancestor. + # The large majority of tensors are unaliased, and preserving aliased output tensors would add + # significant additional complexity with marginal gains + # The cached tensor outputs are added on the first execution, and cleared whenever we need + # to do subsequent recording + self.unaliased_in_all_paths: OutputList[bool] = [] + self.cached_tensor_outputs: OutputList[Optional[Tensor]] = [] + + # if an output aliases a static, persistent input then the corresponding Tensor will + # be set here. These are different than cached tensors, because they are tensors that + # are aliases of parameters that are always live. + self.static_output_tensors: OutputList[Optional[Tensor]] = [] + + # Cleared after recording + self.recording_outputs: Optional[ + OutputList[Union[torch.Tensor, int]] + ] = self._record(wrapped_function.model, recording_inputs) + self.outputs_metadata: OutputList[Union[Dict[str, Any], int, None]] = [] + + # As with inputs, we do not want to keep the outputs permanently alive because that would prevent + # their memory being reclaimed in subsequent cuda graph recordings. We record the tensor metadata + # needed to reconstruct instead. 
+ assert self.recording_outputs is not None + for out in self.recording_outputs: + if isinstance(out, torch.Tensor): + self.outputs_metadata.append( + self._tensor_metadata(out, ignore_storage_offset=False) + ) + else: + assert isinstance(out, (int, type(None))), type(out) + self.outputs_metadata.append(out) + + self.graph.replay() + + def _copy_input(self, idx, dst, src): + expanded_dims = self.expanded_dims[idx] + dst = index_expanded_dims(dst, expanded_dims) + src = index_expanded_dims(src, expanded_dims) + # TODO - one jit kernel across multiple inputs + dst.copy_(src) + + def run_first_inputs(self, new_inputs): + if config.triton.fast_path_cudagraph_asserts: + self.debug_check_invariants_before_invocation() + + # graph is already invoked in the __init__ + # inputs are copied over in _allocate_recording_inputs and subsequently cleared + assert len(new_inputs) == 0 + outputs = self.recording_outputs + self.recording_outputs = None + return outputs + + def run(self, new_inputs): + if config.triton.fast_path_cudagraph_asserts: + self.debug_check_invariants_before_invocation() + + assert len(self.static_input_data_ptrs) == len(new_inputs) + # NB: this ranges over non-static inputs too + for idx, data_ptr in enumerate(self.static_input_data_ptrs): + if idx in self.cudagraph_managed_idxs: + continue + if not isinstance(new_inputs[idx], torch.Tensor): + pass + elif data_ptr is not None: + # static input, e.g., parameter + assert data_ptr == new_inputs[idx].data_ptr() + else: + # non-static input, need to copy it into CUDA graph + dst = self.reconstructed_inputs[idx] + src = new_inputs[idx] + self._copy_input(idx, dst, src) + + new_inputs.clear() + self.run_graph() + + outputs = self.reconstruct_outputs() + self.debug_check_invariants_after_invocation() + + return outputs + + def reconstruct_outputs(self): + "Reconstruct output tensors according to their saved metadata and alias information" + + # Cached tensors will not yet be set on the first execution + # They are also cleared in checkpointing, so if we checkpoint this node + # and then execute it again we will need to repopulate cached tensors + if not self.cached_tensor_outputs: + self._initialize_cached_tensors() + + outputs: List[Optional[Union[int, torch.Tensor]]] = [] + + for i, (storage_info, metadata) in enumerate( + zip(self.output_storage_alias, self.outputs_metadata) + ): + if not isinstance(metadata, dict): # tensor metadata + assert isinstance(metadata, (int, type(None))) + outputs.append(metadata) + continue + + cached_t = self.cached_tensor_outputs[i] + if cached_t is not None: + # No need to update weakrefs, already correctly initialized + outputs.append(cached_t) + continue + + static_t = self.static_output_tensors[i] + if static_t is not None: + assert self.outputs_weakrefs[i] is None + outputs.append(static_t) + continue + + storage = self.prepare_alias_info_for_tensor_construction( + storage_info, metadata + ) + + if isinstance(storage, UntypedStorage) or storage is None: + out = self._reconstruct_from_tensor_metadata(metadata, storage) + else: + assert isinstance(storage, int) + out = self._reconstruct_from_tensor_metadata( + metadata, cast(torch.Tensor, outputs[storage]).untyped_storage() + ) + + outputs.append(out) + w = self.outputs_weakrefs[i] + assert w is not None + w.swap_weakref(out.untyped_storage()._weak_ref()) + + return outputs + + def prepare_alias_info_for_tensor_construction( + self, + out_alias_info: Optional[OutputAliasInfo], + metadata: Union[Dict[str, Any], int, None], + ) -> Union[UntypedStorage, 
None, int]: + if ( + isinstance(metadata, (int, type(None))) + or out_alias_info is UnaliasedStorage + ): + return None + + if isinstance(out_alias_info, AliasesPriorGraphOutput): + depth, existing_output_index = out_alias_info.index + ref = self.path_weakrefs[depth][existing_output_index] + assert ref is not None + return torch.UntypedStorage._new_with_weak_ptr(ref()) + + assert isinstance(out_alias_info, AliasesNewOutput) + return out_alias_info.index + + def prepare_storages_for_construction( + self, + ) -> List[Union[UntypedStorage, None, int]]: + output_storages = [] + for output_storage_alias, metadata in zip( + self.output_storage_alias, self.outputs_metadata + ): + output_storages.append( + self.prepare_alias_info_for_tensor_construction( + output_storage_alias, metadata + ) + ) + + return output_storages + + def run_graph(self): + assert self.graph is not None + self.graph.replay() + + def all_outputs_are_dead(self): + "All outputs of the path from this node to its root are dead" + for depth, output_index in self.live_indices_after_graph: + if is_live(self.path_weakrefs[depth][output_index]): + return False + return True + + def _record(self, model, inputs): + "Record the model" + + def static_input_iter(): + for i in self.wrapped_function.static_input_idxs: + if isinstance( + inputs[i], torch.Tensor + ) and not self._is_cuda_graph_recorded_tensor(inputs[i]): + yield inputs[i] + + # see: output_is_alias_of_persistent_static_inputs above + static_input_persistent_storage_ptrs: Dict[int, StorageWeakRefWrapper] = { + inp.untyped_storage().data_ptr(): StorageWeakRefWrapper(inp) + for inp in itertools.chain( + static_input_iter(), self.wrapped_function.constants + ) + } + + if config.triton.slow_path_cudagraph_asserts: + # need to use parent live weakrefs because live_indices isnt set yet + memory = ( + [] if self.parent is None else list(self.parent.path_live_weakrefs()) + ) + memory += [ + StorageWeakRefWrapper(elem) + for i, elem in enumerate(inputs) + if isinstance(elem, torch.Tensor) + and i not in self.wrapped_function.static_input_idxs + and elem.untyped_storage().data_ptr() != 0 + ] + check_memory_pool(self.device, self.cuda_graphs_pool, memory) + + with preserve_rng_state(), torch.cuda.device( + self.device + ), clear_cublas_manager(), torch.cuda.graph( + self.graph, + stream=self.stream, + pool=self.cuda_graphs_pool, + capture_error_mode="thread_local", + ), get_history_recording(): + static_outputs = model(inputs) + + # running model should reclaim memory + assert len(inputs) == 0 + + if not isinstance(static_outputs, (list, tuple)): + static_outputs = (static_outputs,) + + self._add_first_outputs(static_outputs, static_input_persistent_storage_ptrs) + + return static_outputs + + def _add_first_outputs( + self, + outputs, + static_input_persistent_storage_ptrs: Dict[int, StorageWeakRefWrapper], + ): + "Add the outputs from the first invocation of the node and set up metadata" + + # getting liveness before we have added the outputs to path, so the length + # of the two lists is equal + prev_liveness = self.recorded_liveness_before_graph + curr_liveness = self._get_liveness(self.path_weakrefs) + + delta = self._get_different_indices(prev_liveness, curr_liveness) + self.expected_dead_indices_after_graph = delta + + assert len(self.outputs_weakrefs) == 0 + # index from data pointer to index in outputs + output_new_storages_index: Dict[StorageDataPtr, int] = {} + + self.unaliased_in_all_paths = [False for _ in range(len(outputs))] + self.static_output_tensors = [None for _ in 
range(len(outputs))] + + for i, o in enumerate(outputs): + if o is None or not isinstance(o, torch.Tensor): + self.output_storage_alias.append(UnaliasedStorage) + continue + + torch._check( + o.is_cuda or o.untyped_storage().data_ptr() == 0, + lambda: ( + "Expected all cuda outputs in cuda graph recording. Non cuda output " + f"from {self.stack_traces[i] if self.stack_traces else '(unknown)'}" + ), + ), + + ref = static_input_persistent_storage_ptrs.get( + o.untyped_storage().data_ptr(), None + ) + # also treat empty storages as static outputs because we do not need to manage their lifetime + # and they should not participate in checkpointing + is_empty_storage = o.untyped_storage().data_ptr() == 0 + if (ref and ref() is not None) or is_empty_storage: + self.output_storage_alias.append(None) + self.static_output_tensors[i] = o + continue + + path_ref = self._is_alias_of_live_recorded_tensor(o) + if path_ref is not None: + self._mark_prior_graph_output_as_aliased(path_ref) + self.output_storage_alias.append(AliasesPriorGraphOutput(path_ref)) + continue + + if o.untyped_storage().data_ptr() in output_new_storages_index: + index = output_new_storages_index[o.untyped_storage().data_ptr()] + self.unaliased_in_all_paths[index] = False + self.output_storage_alias.append(AliasesNewOutput(index)) + continue + + output_new_storages_index[o.untyped_storage().data_ptr()] = i + self.output_storage_alias.append(UnaliasedStorage) + self.unaliased_in_all_paths[i] = True + + if self.stack_traces is None: + self.stack_traces = [None for _ in range(len(outputs))] + else: + assert len(self.stack_traces) == len( + outputs + ), "Wrong number of stack traces passed in" + + assert not self.outputs_weakrefs + for out, static_output_tensor in zip(outputs, self.static_output_tensors): + if not isinstance(out, torch.Tensor) or static_output_tensor is not None: + self.outputs_weakrefs.append(None) + self.tensor_weakrefs.append(None) + else: + self.outputs_weakrefs.append(StorageWeakRefWrapper(out)) + self.tensor_weakrefs.append(TensorWeakRef(out)) + + self.recorded_liveness_after_graph = self._get_liveness(self.path_weakrefs) + self.checkpointed_caching_state = torch._C._cuda_getCheckpointState( + self.device, self.cuda_graphs_pool + ) + + # now, get liveness with outputs added + for depth in range(len(self.path_weakrefs)): + for output_index in range(len(self.path_weakrefs[depth])): + if is_live(self.path_weakrefs[depth][output_index]): + self.live_indices_after_graph.append((depth, output_index)) + + self.debug_check_invariants_after_invocation() + if config.triton.slow_path_cudagraph_asserts: + check_memory_pool( + self.device, self.cuda_graphs_pool, list(self.path_live_weakrefs()) + ) + + def _mark_prior_graph_output_as_aliased(self, index: PathOutputIndex): + "Remove a graph output from the unaliased, cached tensors in an ancestor node" + depth, output_index = index + node = list(self._path_from_root)[depth] + node.unaliased_in_all_paths[output_index] = False + x = self.path_weakrefs[depth][output_index] + assert x is not None + x.remove_extra_reference() + + def _initialize_cached_tensors(self): + # we should not be clearing output_weakrefs, and they should be set in the first + # record run + assert len(self.outputs_weakrefs) == len(self.outputs_metadata) + + for i, (storage_info, metadata, make_cached) in enumerate( + zip( + self.output_storage_alias, + self.outputs_metadata, + self.unaliased_in_all_paths, + ) + ): + if not make_cached: + self.cached_tensor_outputs.append(None) + continue + + assert 
storage_info is UnaliasedStorage + assert isinstance(metadata, dict) + s = self.create_storage(metadata) + out = self._reconstruct_from_tensor_metadata(metadata, storage=s) + + # XXX: let autograd know that there will be an additional reference to the tensor + # that can be ignored when deciding whether to do gradient buffer inplacing. + # Otherwise, inplacing could differ between tracing and subsequent execution. + # For some models we tested this led to inputs no longer being in cudagraph pools, + # leading to spurious re-recordings. + # It also tells AMP cache that even though the tensor impls cannot be cached + # in dtype conversions. + + torch._C._add_cached_tensor(out) + + self_ref = weakref.ref(self) + + # one reference in our array, and calling sys.getrefcount bumps the refcount by one + def check_refcount(i): + self_loc = self_ref() + if self_loc is None: + return False + return self_loc.get_output_refcount(i) == 2 + + check = functools.partial(check_refcount, i=i) + + self.outputs_weakrefs[i] = StorageWeakRefWrapper(out, extra_ref_check=check) + self.cached_tensor_outputs.append(out) + + def get_output_refcount(self, index): + return sys.getrefcount(self.cached_tensor_outputs[index]) + + @property + def parent(self): + "unwraps the weakref to _parent" + return self._parent() if self._parent is not None else None + + @property + def _path_to_root(self): + "Returns all nodes in the path starting at self and ending at root" + node = self + while node: + yield node + node = node.parent + + @property + def _path_from_root(self): + "Returns all nodes in the path starting at the root and ending at self" + nodes = reversed(list(self._path_to_root)) + yield from nodes + + def _is_cuda_graph_recorded_tensor(self, t: torch.Tensor): + "Is this tensor an output of a node in this path" + for output_refs in self.path_weakrefs: + for storage_weak_ref in output_refs: + if storage_weak_ref is None: + continue + # don't need to check liveness of storage since the cuda graph managed + # memory is never released. + data_ptr = storage_weak_ref.data_ptr() + if t.untyped_storage().data_ptr() == data_ptr: + return True + + return False + + def _is_alias_of_live_recorded_tensor( + self, t: torch.Tensor + ) -> Optional[PathOutputIndex]: + for depth, output_refs in enumerate(self.path_weakrefs): + for output_index, storage_ref in enumerate(output_refs): + if (storage_and_ptr := maybe_deref(storage_ref)) is not None: + storage, ptr = storage_and_ptr + if ptr == t.untyped_storage().data_ptr(): + return (depth, output_index) + + return None + + @staticmethod + def _check_liveness( + indices: List[PathOutputIndex], + output_refs: List[List[Optional[StorageWeakRefWrapper]]], + ): + "Check that all of the indices specified are dead references" + for depth, output_index in indices: + w = output_refs[depth][output_index] + assert w is not None + if w() is not None: + return False + return True + + def add_child(self, function_id: FunctionID, node: CUDAGraphNode): + "Adds node as a a child of self" + self.children[function_id].append(node) + + @staticmethod + def _get_different_indices( + prev: List[List[bool]], curr: List[List[bool]] + ) -> List[PathOutputIndex]: + "Find indices where the two lists differ." 
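+ # e.g. prev=[[True, True]], curr=[[True, False]] -> [(0, 1)], i.e. output 1 of node 0 + # was live before and has since died.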
+ dead_indices = [] + assert len(prev) <= len(curr) + for i, (outputs1, outputs2) in enumerate(zip(prev, curr)): + assert len(outputs1) == len(outputs2) + for j, (output1, output2) in enumerate(zip(outputs1, outputs2)): + if output1 != output2: + dead_indices.append((i, j)) + + return dead_indices + + @staticmethod + def _get_liveness( + weakrefs: List[List[Optional[StorageWeakRefWrapper]]], + ) -> List[List[bool]]: + "Maps weakrefs to true if the reference is alive and false otherwise" + if len(weakrefs) == 0: + return [] + + return [pytree.tree_map(is_live, outputs) for outputs in weakrefs] + + def debug_assert_invariants( + self, expected_liveness: List[List[bool]], newly_dead: List[PathOutputIndex] + ): + if not config.triton.fast_path_cudagraph_asserts: + return + + for i, node in enumerate(self._path_from_root): + assert self.path_weakrefs[i] is node.outputs_weakrefs + + nodes = list(self._path_from_root) + + live_blocks = get_block_addrs(self.cuda_graphs_pool) + + live_storage_data_ptrs = set() + live_storage_weak_ptrs = set() + + for depth, outputs_liveness in enumerate(expected_liveness): + for output_idx, output_liveness in enumerate(outputs_liveness): + # tensor can die early, but it can't be alive when it should be dead + w = self.path_weakrefs[depth][output_idx] + if (stor_weak_ptr_and_data_ptr := maybe_deref(w)) is not None: + assert output_liveness + stor_weak_ptr, stor_data_ptr = stor_weak_ptr_and_data_ptr + assert (stor_data_ptr in live_storage_data_ptrs) == ( + stor_weak_ptr in live_storage_weak_ptrs + ) + live_storage_data_ptrs.add(stor_data_ptr) + live_storage_weak_ptrs.add(stor_weak_ptr) + + is_persistent_alias = ( + nodes[depth].static_output_tensors[output_idx] is not None + ) + + if is_persistent_alias: + assert stor_data_ptr not in live_blocks + + for depth, output_index in newly_dead: + assert not is_live(self.path_weakrefs[depth][output_index]) + + def debug_check_invariants_before_invocation(self): + self.debug_assert_invariants( + self.recorded_liveness_before_graph, self.expected_dead_indices_before_graph + ) + + def debug_check_invariants_after_invocation(self): + self.debug_assert_invariants( + self.recorded_liveness_before_graph, self.expected_dead_indices_after_graph + ) + + def data_ptrs_dead_since_invocation(self) -> List[int]: + """ + Since this node was invoked, return data ptrs of all tensor outputs that have died + in the current executing tree path. 
+ """ + curr_liveness = self._get_liveness(self.path_weakrefs) + _get_different_indices = self._get_different_indices( + self.recorded_liveness_after_graph, curr_liveness + ) + + path = list(self._path_from_root) + ptrs_to_deallocate = [] + for depth, output_index in _get_different_indices: + ptrs_to_deallocate.append( + path[depth].outputs_metadata[output_index]["data_ptr"] + ) + + return ptrs_to_deallocate + + def path_live_weakrefs(self) -> Iterator[StorageWeakRefWrapper]: + for i, j in self.live_indices_after_graph: + out = self.path_weakrefs[i][j] + if out is not None and is_live(out): + yield out + + def remove_node_cached_tensors(self): + for t in self.cached_tensor_outputs: + if t is not None: + torch._C._remove_cached_tensor(t) + self.cached_tensor_outputs.clear() + + for i, unaliased in enumerate(self.unaliased_in_all_paths): + if unaliased: + n = self.outputs_weakrefs[i] + assert n is not None + n.remove_extra_reference() + + def remove_path_cached_tensors(self): + for node in self._path_from_root: + node.remove_node_cached_tensors() + + def clear_path_state(self): + "Clear the path state in this current executing node" + # this doesnt actually do anything right now, leaving it as placeholder + pass + + @staticmethod + def _tensor_metadata(x, ignore_storage_offset=True): + assert isinstance(x, torch.Tensor) + # We ignore the storage offset for inputs, but not for outputs + # TODO: - should we make the storage resizable ? + return { + "nbytes": x.untyped_storage().nbytes(), + "data_ptr": x.untyped_storage().data_ptr(), + "size": x.shape, + "stride": x.stride(), + "dtype": x.dtype, + "device": x.device, + "storage_offset": x.storage_offset() if not ignore_storage_offset else 0, + } + + def _reconstruct_from_tensor_metadata( + self, metadata: Dict[str, Any], storage=None + ) -> Tensor: + s = self.create_storage(metadata) if storage is None else storage + return torch._C._construct_CUDA_Tensor_From_Storage_And_Metadata(metadata, s) + + def create_storage(self, metadata): + return torch._C._construct_storage_from_data_pointer( + metadata["data_ptr"], metadata["device"], metadata["nbytes"] + ) + + def _allocate_and_copy_recording_inputs( + self, inputs + ) -> List[Union[torch.Tensor, int]]: + """ + Allocate inputs for non static, non cudagraph managraphed managed tensors in the memory pool + and copy over the tensor values. + """ + + torch.cuda.synchronize() + self.stream.wait_stream(torch.cuda.current_stream()) + recording_inputs: List[Union[Tensor, int]] = [] + + with warnings.catch_warnings(record=True), torch.cuda.device( + self.device + ), _use_cuda_memory_pool_manager( + self.device, + mem_pool=self.cuda_graphs_pool, + stream=self.stream, + ): + for i, inp in enumerate(inputs): + if not isinstance(inp, torch.Tensor): + assert isinstance(inp, int) + recording_inputs.append(inp) + elif i not in self.static_input_idxs: + # static_input does an allocation! + recording_inputs.append(static_input(inp)) + # copy over and clear non recording input + self._copy_input(i, recording_inputs[-1], inp) + inputs[i] = None + del inp + else: + recording_inputs.append(inp) + + return recording_inputs + + def check_invariants(self, inputs: List[Tensor]) -> bool: + """ + Checks if this node can be run. The same pattern of tensor liveness and tensors + managed in the cudagraph private pool must remain stable. 
+ """ + + # previously managed data pointers remain stable + for idx in self.cudagraph_managed_idxs: + if inputs[idx].data_ptr() != self.static_input_data_ptrs[idx]: + return False + + if not self._check_liveness( + self.expected_dead_indices_before_graph, self.path_weakrefs + ): + return False + + # the cudagraph managed tensors which died upon recording must also die upon + # this invocation. it is too late to check after we've replayed the graph, + # because we would have already written over their memory. + for idx in self.cudagraph_managed_idxs: + inputs[idx] = None # type: ignore[call-overload] + + torch._check( + self._check_liveness( + self.expected_dead_indices_after_graph, self.path_weakrefs + ), + lambda: "TODO: graph recording observed an input tensor deallocate during graph " + " recording that did not occur during replay. Please file an issue.", + ) + return True + + def num_descendants(self) -> int: + "Total number of descendents of this node" + num_desc = 0 + for children in self.children.values(): + for child in children: + num_desc += 1 + num_desc += child.num_descendants() + return num_desc + + +def get_cudagraph_segments(pool_id): + segments = torch.cuda.memory_snapshot() + return [segment for segment in segments if segment["segment_pool_id"] == pool_id] + + +def get_block_addrs(pool_id, live_only=True): + blocks = [] + + for segment in get_cudagraph_segments(pool_id): + addr = segment["address"] + for block in segment["blocks"]: + if block["state"] == "active_allocated" or not live_only: + blocks.append(addr) + + addr += block["size"] + + return blocks + + +def format_tb(frames): + formatted_traceback = [] + + for entry in frames: + formatted_traceback.append( + traceback.FrameSummary(entry["filename"], entry["line"], entry["name"]) + ) + + return "".join(traceback.format_list(formatted_traceback)) + + +def check_memory_pool(device, pool_id, live_storages_ptrs: List[StorageWeakRefWrapper]): + assert all( + isinstance(elem, StorageWeakRefWrapper) for elem in live_storages_ptrs + ) # noqa: C419 + unique_storages = {stor.data_ptr() for stor in live_storages_ptrs if stor()} + + # check if there is a divergence first, then do the expensive snapshot call after + # we know it will error + if torch._C._cuda_checkPoolLiveAllocations(device, pool_id, unique_storages): + return + + # at this point we are past the fast-path. 
we have seen rare cases where a tensor is dead, + # but hasn't been gc'd yet, and gives a false positive for allocated_not_in_live_storages + gc.collect() + + segments = get_cudagraph_segments(pool_id) + + allocated_not_in_live_storages = {} + + for segment in segments: + addr = segment["address"] + for block in segment["blocks"]: + if block["state"] == "active_allocated": + if addr not in unique_storages: + allocated_not_in_live_storages[addr] = block + else: + unique_storages.remove(addr) + + addr += block["size"] + + torch._check( + len(unique_storages) == 0, + lambda: f"These storage data ptrs are not allocated in pool {pool_id} but should be {unique_storages}", + ) + + # a non-empty dict means there are live blocks in the pool not accounted for by any output + if allocated_not_in_live_storages: + formatted = [] + for dp, block in allocated_not_in_live_storages.items(): + trace = format_tb(block.get("frames", [])) + formatted.append(f"Data Pointer: {dp}, history: \n{trace}") + formatted_s = "\n".join(formatted) + msg = ( + f"These live storage data ptrs are in the cudagraph pool but not " + f"accounted for as an output of cudagraph trees: \n\n{formatted_s}" + ) + raise RuntimeError(msg) + + +class ExecutionState(Enum): + """ + Represents the state of the CUDAGraph Tree. Will be NONE if there is no current live memory allocated + in the cuda graph pool. Otherwise will reflect the state of the most recently executed node. + """ + + NONE = auto() + WARMUP = auto() + RECORDING = auto() + EXECUTION = auto() + + +class CompilationMode(Enum): + FORWARD = auto() + BACKWARD = auto() + INFERENCE = auto() + + +class CUDAGraphTreeManager: + """ + Groups individual recordings or executions of cuda graphs into a tree of recordings, + checks required invariants, and manages warmups of graphs. + + When graphs are recorded in the same tree, it enforces subsequent execution + to follow the same order and have the same output tensor lifespans. To remove + unnecessary coupling of cuda graphs (and additional imposed invariants), + the tree manager will end a currently recording tree whenever it is valid - when + the memory pool no longer has any live allocations. + + We ignore outputs from a previous generation that correspond to prior model outputs. + Currently this is hardcoded to `GenerationTracker.generation`, tracked in torch dynamo. + # TODO: make generation increment configurable, warn on overwrite. + + We run graph warmups in the cudagraph memory pool and return the result on the first invocation + of a function. For many models it is important to reclaim activations as you run the backward. + If we were to warm up the model and keep an extra copy of the inputs around to subsequently + use for recording, we would incur a memory penalty. Additionally, if we are partway through training + the model and need to recompile, memory will be allocated to the cuda graph pool, so we run this + warmup run in the cuda graph memory pool. As with recording, warmup needs the state of live tensors + to be accurately reflected, so we checkpoint the allocator state if we need to warm up following graph + replay. + """ + + def __init__(self, device_index: int): + # roots are functions which have no dependencies on another node. I.e., + # when they are first invoked, none of their inputs are outputs + # of another node, nor are there any live outputs of another node whose + # liveness would create a dependency.
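+ # Illustrative example (roughly): if fn_b is invoked while outputs of fn_a's node are still + # live, fn_b's recording becomes a child of fn_a's node; once the cudagraph pool has no live + # allocations, the next invocation starts a new root.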
+ self.roots: Dict[FunctionID, List[CUDAGraphNode]] = defaultdict(list) + + # mapping from function id to wrapped function + self.ids_to_funcs: Dict[FunctionID, WrappedFunction] = {} + + self.ids_to_stack_traces: Dict[FunctionID, StackTraces] = {} + + self.warmed_up_functions: Set[FunctionID] = set() + # if we fail to increment generation, and are stuck warming up, + # only warn on each function once + self.warned_functions: Set[FunctionID] = set() + torch._C._set_cached_tensors_enabled(True) + + # NB: cuda caching allocator will remember the stream a segment is allocated to + # and only allocate that segment to the same stream. we need to use a single stream + # for all allocations to the memory pool, otherwise the allocations to separate streams + # will not be reused; separate recordings would use the same memory pool, but not + # the same memory. + + with torch.cuda.device(device_index): + torch.cuda.synchronize() + self.stream = torch.cuda.Stream() + self.stream.wait_stream(torch.cuda.current_stream()) + + # Keeps Memory Pool Alive + self.graph: Optional[torch.cuda.CUDAGraph] = torch.cuda.CUDAGraph() + self.cuda_graphs_thread_pool = torch.cuda.graph_pool_handle() + + with warnings.catch_warnings(record=True), torch.cuda.graph( + self.graph, + pool=self.cuda_graphs_thread_pool, + stream=self.stream, + capture_error_mode="thread_local", + ): + pass + + self.graph_counter = itertools.count(0) + self.func_counter = itertools.count(0) + + # whether the current node is in a state of warmup, recording, or execution. If + # there is no current node the state will be ExecutionState.NONE. + self.path_state = ExecutionState.NONE + self.device_index = device_index + + # the most recently invoked cudagraph wrapping of a function. Will be None + # when there is no output from a previous recording or execution whose memory + # we need to respect in the cuda caching allocator. If the generation has been incremented, + # this will also be None, as we ignore those allocations. + self.current_node: Optional[CUDAGraphNode] = None + + # current generation of cudagraph invocations. when torch.compile is run + # we increment the current generation. We are willing to ignore live outputs + # of a previous generation when checking liveness. + self.current_gen: int = -1 + + # number of instances we are in execution and failed to match to an + # existing child + self.debug_fail_counter = 0 + # number of instances we had to checkpoint the function + self.debug_checkpointing_counter = 0 + + self.id_to_mode: Dict[FunctionID, CompilationMode] = {} + + # Note: [Backward Generation Handling] + # We generally perform a sequence of forward executions followed by backward executions. + # If multiple torch.compile wrapped forwards are executed with their backwards pending, + # we should not disregard the outputs from a prior torch.compile since the entire training + # loop hasn't completed. Occasionally, a backward pass corresponding to a forward pass may + # not be executed, so we cannot wait for all pending backwards to have been invoked. + # Instead we wait for a single backward + # invocation. Triggering a backward pass typically doesn't lead to another torch.compile + # invocation, making it less likely for the generation to increase between multiple + # backward calls. The following use case is covered by this approach: + # mod1 = torch.compile(...) + # mod2 = torch.compile(...)
+ # mod2(mod1(x)).sum().backward() + + self.running_forwards_with_pending_backwards = False + + def run(self, new_inputs: List[Tensor], function_id: FunctionID): + assert self.graph is not None, "Running CUDAGraph after shutdown" + out = self._run(new_inputs, function_id) + + # The forwards are only pending following invocation, not before + mode = self.id_to_mode[function_id] + if mode == CompilationMode.FORWARD: + self.running_forwards_with_pending_backwards = True + elif mode == CompilationMode.BACKWARD: + self.running_forwards_with_pending_backwards = False + + return out + + def set_to_running_backward(self): + self.running_forwards_with_pending_backwards = False + + def _run(self, new_inputs: List[Tensor], function_id: FunctionID): + # we will try to end the current execution lazily, since + # we dont want to do unnecessary checking of the existing outputs + # on the hot path, but both recording and warmup only happen once + # so we check up front + if self.in_recording: + self.try_end_curr_recording(function_id) + + if self.in_warmup: + self.try_end_curr_warmup(function_id) + + # warming up a function and subsequentally recording may use different memory addresses + # because both depend on the state of the caching allocator. if we warm up graph A, + # then warm up graph B and make more allocations, the subsequent recording of A will not + # necessarily use the same addresses as in the warm up. Thus any warm up of a node can only + # be followed by warm up runs. + if ( + not ( + function_id in self.warmed_up_functions + or config.triton.skip_cudagraph_warmup + ) + ) or self.in_warmup: + # If we are in the middle of executing cuda graphs, then we need to checkpoint memory state. + # Both Recording and Warmup will be reflected in the allocator and dont need changes + if self.path_state == ExecutionState.EXECUTION: + self.apply_checkpoint_execution_state_in_allocator() + + return self.run_eager(new_inputs, function_id) + + child_nodes = ( + self.roots if self.current_node is None else self.current_node.children + ) + + if not self.in_recording: + for child in child_nodes[function_id]: + # here we are checking memory consistency between recording and execution, + # as well as things like stability of tensor locations, etc + # and other + if child.check_invariants(new_inputs): + return self.execute_node(child, new_inputs) + + # now that we know the new function can't be run as a child of the + # current node, if it is a root, try to end the current execution. + # as noted above, we want to do this lazily to avoid having to + # check all existing outputs + if self.current_node is not None and function_id in self.roots: + self.try_end_curr_execution() + + # run again to hit the root matching case which must succeed + if self.current_node is None: + return self.run(new_inputs, function_id) + + # at this point, we necessarily will do a new recording + self.debug_fail_counter += 1 + + self.try_end_curr_execution() + if self.current_node is not None: + self.apply_checkpoint_execution_state_in_allocator() + + # now, we are in a recording state ! + return self.record_function(new_inputs, function_id) + + def shutdown(self): + """ + Remove all cached tensors in all nodes. Because cached tensors can hold gradients which in turn + might reference a backward which invokes a CUDA Graph Node, we have to manually clear them on shutdown + to avoid a reference cycle. 
+ """ + nodes = [] + for roots in self.roots.values(): + nodes.extend(roots) + + while nodes: + node = nodes.pop() + for children in node.children.values(): + nodes.extend(children) + node.remove_node_cached_tensors() + node.graph = None + + self.graph = None + self.roots = None # type: ignore[assignment] + self.current_node = None + + def record_function(self, new_inputs, function_id) -> List[Optional[Tensor]]: + graph_id = self.new_graph_id() + log.debug( + "Recording function %d of graph recording id %d", + function_id.id, + graph_id.id, + ) + torch.cuda.synchronize() + node = CUDAGraphNode( + self.ids_to_funcs[function_id], + graph_id, + self.current_node, + new_inputs, + self.cuda_graphs_thread_pool, + self.device_index, + self.ids_to_stack_traces[function_id], + self.stream, + ) + if self.current_node is None: + self.roots[function_id].append(node) + else: + self.current_node.add_child(function_id, node) + self.current_node = node + self.path_state = ExecutionState.RECORDING + self.update_generation() + torch.cuda.synchronize() + return node.run_first_inputs(new_inputs) + + def execute_node(self, node: CUDAGraphNode, new_inputs) -> List[Optional[Tensor]]: + self.current_node = node + self.path_state = ExecutionState.EXECUTION + self.update_generation() + return node.run(new_inputs) + + def run_eager(self, new_inputs, function_id: FunctionID): + # this is only stored on current node, because when we start a new path, + # we will deallocate it + already_warm = function_id in self.warmed_up_functions + if not already_warm: + log.debug("Running warmup of function %d", function_id.id) + else: + log.debug( + "Running eager of function %d because ancestor needed to warm up", + function_id.id, + ) + self.warmed_up_functions.add(function_id) + node = CUDAWarmupNode( + self.ids_to_funcs[function_id], + self.current_node, + self.cuda_graphs_thread_pool, + self.graph, + self.device_index, + self.ids_to_stack_traces[function_id], + self.stream, + already_warm, + ) + self.current_node = node + self.path_state = ExecutionState.WARMUP + self.update_generation() + return node.run(new_inputs) + + def new_graph_id(self) -> GraphID: + return GraphID(next(self.graph_counter)) + + def new_func_id(self) -> FunctionID: + return FunctionID(next(self.func_counter)) + + def add_function( + self, + model, + inputs, + static_input_idxs, + stack_traces, + mode, + constants, + ) -> Tuple[Callable[..., Any], List[Optional[Tensor]]]: + id = self.new_func_id() + self.ids_to_stack_traces[id] = stack_traces + self.ids_to_funcs[id] = WrappedFunction( + model, + static_input_idxs, + id, + tuple(t for t in constants if isinstance(t, torch.Tensor) and t.is_cuda), + ) + self.id_to_mode[id] = mode + fn = functools.partial(self.run, function_id=id) + + # container needs to set clean up when fn dies + get_container(self.device_index).add_strong_reference(fn) + return fn, fn(inputs) + + @property + def in_recording(self): + return self.path_state == ExecutionState.RECORDING + + @property + def in_warmup(self): + return self.path_state == ExecutionState.WARMUP + + def get_roots(self) -> Iterator[CUDAGraphNode]: + for nodes in self.roots.values(): + yield from nodes + + @property + def current_node(self): + return self._current_node + + @current_node.setter + def current_node(self, value): + self._current_node = value + if value is None: + self.path_state = ExecutionState.NONE + + def update_generation(self): + self.current_gen = self.get_curr_generation() + + @staticmethod + def get_curr_generation() -> int: + if 
MarkStepBox.mark_step_counter != 0: + return MarkStepBox.mark_step_counter + + return GenerationTracker.generation + + @staticmethod + def user_invoked_mark_step(): + return MarkStepBox.mark_step_counter != 0 + + def can_start_new_generation(self) -> bool: + if not self.in_new_torch_compile_invocation(): + return False + + if self.user_invoked_mark_step(): + return True + + return not self.running_forwards_with_pending_backwards + + def in_new_torch_compile_invocation(self): + return self.current_gen != self.get_curr_generation() + + def try_end_curr_recording(self, function_id: FunctionID) -> None: + """ + Check if the current recording can be terminated, either because all outputs of the + previously recorded node are dead or because it was executed in a different + generation. Will set current_node to None and in_recording to False if successful. + """ + assert self.in_recording + assert self.current_node is not None + + # multiple invocations, allow overwriting the previous generation + if self.can_start_new_generation(): + self.dealloc_current_path_weakrefs() + self.clear_current_path_state_and_set_to_none() + return + + if self.current_node.all_outputs_are_dead(): + self.clear_current_path_state_and_set_to_none() + return + + self.check_warn_on_unable_to_start_executing(function_id) + + def try_end_curr_execution(self) -> None: + """ + Check if the current executing node can be terminated, either because all outputs of the + previously executed node are dead or because it was executed in a different generation. + Will set current_node to None if successful. + """ + + assert not self.in_recording + if self.current_node is None: + return + + if self.can_start_new_generation(): + self.clear_current_path_state_and_set_to_none() + return + + if self.current_node.all_outputs_are_dead(): + self.clear_current_path_state_and_set_to_none() + + def try_end_curr_warmup(self, function_id: FunctionID): + if self.can_start_new_generation(): + self.dealloc_current_path_weakrefs() + self.current_node = None + return + + if self.current_node.all_outputs_are_dead(): + self.current_node = None + return + + self.check_warn_on_unable_to_start_executing(function_id) + + def check_warn_on_unable_to_start_executing(self, function_id: FunctionID): + "Warn if we in a potential loop where we are unable to hit fast path" + if ( + function_id in self.warned_functions + or not self.in_new_torch_compile_invocation() + ): + return + + existing_nodes = [ + node + for node in self.current_node._path_from_root + if node.wrapped_function.id == function_id + ] + + if len(existing_nodes) <= 1: + return + + # repeated same pattern + parents = { + n.parent.wrapped_function.id + for n in itertools.chain(existing_nodes, (self.current_node,)) + if n.parent is not None + } + if len(parents) == len(existing_nodes): + return + + self.warned_functions.add(function_id) + warnings.warn( + "Unable to hit fast path of CUDAGraphs because of pending, uninvoked backwards. " + "Consider running with torch.no_grad() or using torch.compiler.cudagraph_mark_step_begin() " + "before each model invocation" + ) + + def dealloc_current_path_weakrefs(self): + # TODO: we could also allow the these weak refs to continue to be allocated, + # but that adds some complications. 
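+ # What follows: walk every node on the current path, attach an informative access-error message + # to any output tensor that is still alive in user code, then free each live cudagraph-pool + # storage exactly once (outputs may alias, so deduplicate by data_ptr).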
+ for node in self.current_node._path_from_root: + assert len(node.tensor_weakrefs) == len(node.stack_traces) + for t, stack_trace in zip(node.tensor_weakrefs, node.stack_traces): + ten = None if t is None else t() + if ten is None: + continue + + stack_trace = ( + stack_trace.strip() + if stack_trace + else "[Could not find stack trace]" + ) + msg = ( + "Error: accessing tensor output of CUDAGraphs that has been overwritten by a subsequent run. " + f"Stack trace: {stack_trace}. " + "To prevent overwriting, clone the tensor outside of torch.compile() " + "or call torch.compiler.cudagraph_mark_step_begin() before each model invocation." + ) + torch._C._set_storage_access_error_msg(ten, msg) + + deleted = set() + for storage_ref in self.current_node.path_live_weakrefs(): + if storage_ref() and storage_ref.data_ptr() not in deleted: + deleted.add(storage_ref.data_ptr()) + torch._C._free_And_Remove_DeleterFn(storage_ref()) + + def clear_current_path_state_and_set_to_none(self): + self.current_node.clear_path_state() + self.current_node = None + + def apply_checkpoint_execution_state_in_allocator(self): + """ + Checkpoint the current execution state in the caching allocator so that + additional cudagraph recordings can be made respecting existent live storages. + """ + self.debug_checkpointing_counter += 1 + log.debug( + "Checkpointing cuda caching allocator state. Number of checkpoints %d", + self.debug_checkpointing_counter, + ) + + state = self.current_node.checkpointed_caching_state + device = self.current_node.device + assert state is not None and device is not None + + # currently we deallocate on instead of allowing stale recordings + stale_storages: List[int] = [] + + # remove cached tensors, otherwise they would prevent memory from being + # reclaimed in subsequent recordings + self.current_node.remove_path_cached_tensors() + live_storages_wrappers = list(self.current_node.path_live_weakrefs()) + + live_storages_weak_refs = [t() for t in live_storages_wrappers] + ptrs_to_deallocate = self.current_node.data_ptrs_dead_since_invocation() + torch._C._cuda_setCheckpointPoolState( + device, state, stale_storages, live_storages_weak_refs + ) + + # NB: deduplicate aliased outputs + for ptr in set(ptrs_to_deallocate): + torch._C._cuda_cudaCachingAllocator_raw_delete(ptr) + + # Now the live blocks should be exactly equal to the live storages in private pool + if config.triton.slow_path_cudagraph_asserts: + check_memory_pool( + self.device_index, self.cuda_graphs_thread_pool, live_storages_wrappers + ) + for wrapper in live_storages_wrappers: + assert wrapper() + assert torch._C._has_Standard_Deleter(wrapper()) + assert wrapper.data_ptr() not in ptrs_to_deallocate + + def live_cudagraph_pool_storages_in_curr_execution( + self, + ) -> List[StorageWeakRefPointer]: + if self.current_node is None: + return [] + # explicitly ignoring previous recorded outputs from past path + return [t() for t in self.current_node.path_live_weakrefs()] diff --git a/MLPY/Lib/site-packages/torch/_inductor/cudagraph_utils.py b/MLPY/Lib/site-packages/torch/_inductor/cudagraph_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..94c38b6ff4cd4cf7cf1de809daffd69e17f3752c --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/cudagraph_utils.py @@ -0,0 +1,105 @@ +import dataclasses +from typing import Dict, Iterable, Optional + +import torch +from torch._inductor.codecache import CompiledFxGraph + + +def get_mutating_use_stack_trace(placeholder_node: torch.fx.Node) -> Optional[str]: + # reinplaced 
uses might have a single, non-copy_ use + if len(placeholder_node.users) == 1: + return next(iter(placeholder_node.users)).meta.get("stack_trace", None) + + for use in placeholder_node.users: + if use.target == torch.ops.aten.copy_.default: + if stack_trace := use.meta.get("stack_trace", None): + return stack_trace + + return None + + +def format_default_skip_message(reason: str) -> str: + return f"skipping cudagraphs due to {reason}" + + +def get_mutation_stack_trace( + gm: torch.fx.GraphModule, mutation_indices: Iterable[int] +) -> str: + stack_trace: Optional[str] = "" + placeholders = [node for node in gm.graph.nodes if node.op == "placeholder"] + + for idx in mutation_indices: + placeholder = placeholders[idx] + if stack_trace := get_mutating_use_stack_trace(placeholder): + break + + if stack_trace: + msg = f"skipping cudagraphs due to mutation on input. Found from : \n {stack_trace}" + return msg + + return format_default_skip_message("mutated inputs") + + +def check_for_mutation( + gm: torch.fx.GraphModule, compiled_graph: CompiledFxGraph, num_fixed: int +) -> Optional[str]: + default_msg = format_default_skip_message("mutated inputs") + + # doesnt work for non-trees because the warmup run would apply mutation twice + if torch._inductor.config.triton.cudagraph_trees: + # checking if mutation is only on parameters/static inputs + mutation_indices = [ + idx for idx in compiled_graph.mutated_input_idxs if idx >= num_fixed + ] + has_mutation = len(mutation_indices) != 0 + if not has_mutation: + return None + + return get_mutation_stack_trace(gm, mutation_indices) + + else: + has_mutation = len(compiled_graph.mutated_inputs) != 0 + return None if not has_mutation else default_msg + + +def get_use_stack_trace(node) -> Optional[str]: + for use in node.users: + if stack_trace := use.meta.get("stack_trace", None): + return stack_trace + return None + + +def check_multiple_devices_or_any_cpu_nodes( + device_node_mapping: Dict[torch.device, torch.fx.Node] +) -> Optional[str]: + if cpu_node := device_node_mapping.get(torch.device("cpu")): + if stack_trace := get_use_stack_trace(cpu_node): + return format_default_skip_message( + f"cpu device. 
Found from : \n {stack_trace}" + ) + + return format_default_skip_message("cpu device") + + if ( + len(device_node_mapping) == 1 + and next(iter(device_node_mapping.keys())).type == "cuda" + ): + return None + + keys_repr = (repr(key) for key in device_node_mapping.keys()) + return format_default_skip_message(f"multiple devices: {', '.join(keys_repr)}") + + +def check_lowering_disable_cudagraph( + device_node_mapping: Dict[torch.device, torch.fx.Node] +): + return check_multiple_devices_or_any_cpu_nodes(device_node_mapping) + + +@dataclasses.dataclass +class BoxedDeviceIndex: + value: Optional[int] + + def set(self, device_idx: Optional[int]): + assert device_idx is None or isinstance(device_idx, int) + self.value = device_idx diff --git a/MLPY/Lib/site-packages/torch/_inductor/debug.py b/MLPY/Lib/site-packages/torch/_inductor/debug.py new file mode 100644 index 0000000000000000000000000000000000000000..72fa404a54a5d8e4d27ddee8d9c53ab311153a59 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/debug.py @@ -0,0 +1,655 @@ +import collections +import contextlib +import cProfile +import dataclasses +import functools +import itertools +import logging +import os +import os.path +import pickle +import pstats +import shutil +import subprocess +from typing import Any, Dict, List, Optional +from unittest.mock import patch + +from functorch.compile import draw_graph, get_aot_graph_name, get_graph_being_compiled + +import torch +from torch import fx as fx + +from torch._dynamo.repro.after_aot import save_graph_repro, wrap_compiler_debug +from torch._dynamo.utils import get_debug_dir +from torch.fx.graph_module import GraphModule +from torch.fx.passes.shape_prop import _extract_tensor_metadata, TensorMetadata +from torch.fx.passes.tools_common import legalize_graph +from torch.utils._pytree import tree_map + +from . import config, ir # noqa: F811, this is needed +from .scheduler import ( + BaseSchedulerNode, + FusedSchedulerNode, + NopKernelSchedulerNode, + OutputNode, + SchedulerNode, +) +from .virtualized import V + +log = logging.getLogger(__name__) + +SchedulerNodeList = List[Any] +BufMeta = collections.namedtuple("BufMeta", ["name", "n_origin"]) +GRAPHVIZ_COMMAND_SCALABLE = ["dot", "-Gnslimit=2", "-Gnslimit1=2", "-Gmaxiter=5000"] + + +@functools.lru_cache(None) +def has_dot() -> bool: + try: + subprocess.check_output(["which", "dot"], stderr=subprocess.PIPE) + return True + except subprocess.SubprocessError: + return False + + +def draw_buffers(nodes: List[BaseSchedulerNode], print_graph=False, fname=None): + """ + Draw a graph in fname.svg. 
+ """ + if not has_dot(): + log.warning("draw_buffers() requires `graphviz` package") + return + + if fname is None: + fname = get_graph_being_compiled() + + graph = create_fx_from_snodes(nodes) + + for node in graph.nodes: + if "fusion_meta" not in node.meta: + continue + group = node.meta["fusion_meta"].group + if isinstance(group, tuple): + if isinstance(group[1], int): + group = (group[1],) + else: + group = group[1] + + # gather meta data + dtype = None + if isinstance(node, ir.ComputedBuffer): + dtype = node.data.dtype + + metadata = TensorMetadata(group, dtype, None, None, None, None, None) # type: ignore[arg-type] + node.meta["tensor_meta"] = metadata + + if print_graph: + print(graph) + + gm = GraphModule({}, graph) + legalize_graph(gm) + gm.graph.lint() + draw_graph( + gm, fname, clear_meta=False, dot_graph_shape=config.trace.dot_graph_shape + ) + + +def create_fx_from_snodes(snodes: List[BaseSchedulerNode]) -> fx.Graph: + """ + Creates a FX Graph from a list of SchedulerNode objects. + """ + + def get_fake_func(name): + def func1(*args): + return 0 + + func1.__name__ = name + return func1 + + FusionMeta = collections.namedtuple("FusionMeta", ["group", "snode", "type"]) + + buf_to_fx_node = {} + graph = torch.fx.Graph() + first_node = None + + outputs = [] + group: Any = None + # create call_function node for each Buffer and Kernel + for snode in snodes: + if snode.is_extern(): + node_type = "extern" + group = node_type + elif snode.is_template(): + node_type = "template" + group = node_type + elif isinstance(snode, NopKernelSchedulerNode): + node_type = "nop" + group = node_type + elif isinstance(snode, SchedulerNode): + node_type = "compute" + group = snode.group + elif isinstance(snode, FusedSchedulerNode): + node_type = "fused" + group = snode.group + else: + raise RuntimeError("Unknown node type") + + fused_name = torch._inductor.utils.get_fused_kernel_name( + snode.get_nodes(), "original_aten" + ) + func_name = f"{node_type}: {fused_name}" + node_func = get_fake_func(func_name) + kwargs = {} + if hasattr(snode, "get_device"): + kwargs = {"device": snode.get_device()} + fx_node = graph.call_function(node_func, args=(), kwargs=kwargs) + + def in_output(snode): + if isinstance(snode, FusedSchedulerNode): + return any(in_output(x) for x in snode.snodes) + return any(isinstance(user.node, OutputNode) for user in snode.users) + + if in_output(snode): + outputs.append(fx_node) + name = snode.get_name() + fx_node.name = name + + fx_node.meta["fusion_meta"] = FusionMeta(group, snode, node_type) + + if isinstance(snode, FusedSchedulerNode): + for x in snode.snodes: + buf_to_fx_node[x.get_name()] = fx_node + buf_to_fx_node[name] = fx_node + + if first_node is None: + first_node = fx_node + + # create edges between nodes + for snode in snodes: + name = snode.get_name() + deps = snode.read_writes.reads + + fx_node = buf_to_fx_node[name] + new_args = [] + for dep in deps: + if dep.name in buf_to_fx_node: + dep_node = buf_to_fx_node[dep.name] + else: + with graph.inserting_before(first_node): + dep_node = graph.placeholder(dep.name) + buf_to_fx_node[dep.name] = dep_node + new_args.append(dep_node) + + fx_node.args = tuple(new_args) + + graph.output(outputs[0] if len(outputs) == 1 else tuple(outputs)) + return graph + + +def update_orig_fx_node_name_to_buf_name( + nodes: SchedulerNodeList, + node_name_to_buf_name: Dict[str, str], + parent_buf_name: Optional[str] = None, + n_origins: int = 0, +): + if nodes is None: + return + for node in nodes: + # for FusedSchedulerNode, traverse 
recursively into get_nodes() + buf_name = node.get_name() + children_nodes = node.get_nodes() + if children_nodes is not None and len(children_nodes) > 1: + update_orig_fx_node_name_to_buf_name( + children_nodes, + node_name_to_buf_name, + buf_name if parent_buf_name is None else parent_buf_name, + ) + continue + else: + assert len(children_nodes) == 1 and children_nodes[0] == node + + ir_node = node.node + if ir_node is None or ir_node.origins is None: + continue + for origin in ir_node.origins: + node_name = origin.name + # when buf1 and buf2 both have origin=node1 + # we draw node1 according to buf1 + if node_name not in node_name_to_buf_name: + node_name_to_buf_name[node_name] = ( + buf_name if parent_buf_name is None else parent_buf_name + ) + + +def get_node_name_to_buf_meta(node_name_to_buf_name: Dict[str, str]): + buf_name_to_n_node = {} + for node_name, buf_name in node_name_to_buf_name.items(): + if buf_name not in buf_name_to_n_node: + buf_name_to_n_node[buf_name] = {node_name} + else: + buf_name_to_n_node[buf_name].add(node_name) + + node_name_to_buf_meta = {} + for node_name, buf_name in node_name_to_buf_name.items(): + n_node = len(buf_name_to_n_node[buf_name]) + node_name_to_buf_meta[node_name] = BufMeta(buf_name, n_node) + return node_name_to_buf_meta + + +def annotate_orig_fx_with_snodes( + gm: torch.fx.GraphModule, snodes: SchedulerNodeList +) -> None: + """ + Creates a FX Graph from a list of SchedulerNode objects. + """ + node_name_to_buf_name: Dict[str, str] = {} + update_orig_fx_node_name_to_buf_name(snodes, node_name_to_buf_name) + if node_name_to_buf_name is None: + return + node_name_to_buf_meta = get_node_name_to_buf_meta(node_name_to_buf_name) + for node in gm.graph.nodes: + if node.name in node_name_to_buf_meta: + node.meta["buf_meta"] = node_name_to_buf_meta.get(node.name) + + +@contextlib.contextmanager +def enable_aot_logging(): + compile_debug = os.environ.get("TORCH_COMPILE_DEBUG", "0") == "1" + + import torch._functorch.aot_autograd + + log = logging.getLogger(torch._functorch.aot_autograd.__name__) + + stack = contextlib.ExitStack() + if not compile_debug: + try: + yield + finally: + stack.close() + return + + # Enable all graphs to be logged to a file by setting the flags to True + # and the log level of the file logger to DEBUG + stack.enter_context(patch("functorch.compile.config.debug_partitioner", True)) + + path = os.path.join(get_debug_dir(), "torchinductor") + os.makedirs(path, exist_ok=True) + + fh = logging.FileHandler( + os.path.join( + path, + f"aot_{get_aot_graph_name()}_debug.log", + ) + ) + fh.setLevel(logging.DEBUG) + fh.setFormatter( + logging.Formatter("[%(filename)s:%(lineno)d %(levelname)s] %(message)s") + ) + log.addHandler(fh) + try: + yield + finally: + log.removeHandler(fh) + stack.close() + + +class DebugContext: + _counter = itertools.count() + + @staticmethod + def wrap(fn): + @functools.wraps(fn) + def inner(*args, **kwargs): + with DebugContext(): + return fn(*args, **kwargs) + + return wrap_compiler_debug(inner, compiler_name="inductor") + + @staticmethod + def create_debug_dir(folder_name: str) -> Optional[str]: + debug_dir = config.trace.debug_dir or get_debug_dir() + for n in DebugContext._counter: + dirname = os.path.join( + debug_dir, + "torchinductor", + f"{folder_name}.{n}", + ) + if not os.path.exists(dirname): + os.makedirs(dirname) + return dirname + return None + + def __init__(self): + self._prof = None + self._path = None + self._stack = contextlib.ExitStack() + + def copy(self, new_path: str): + if not 
self._path: + return + assert new_path.endswith(".debug"), new_path + if os.path.exists(new_path): + shutil.rmtree(new_path) + try: + shutil.copytree(self._path, new_path) + self._path = new_path + except OSError: + log.warning( + "Failed to copy debug files from %s to %s", self._path, new_path + ) + pass + + def fopen(self, filename: str, write_mode: str = "w", *args, **kwargs): + assert self._path + return open(os.path.join(self._path, filename), write_mode, *args, **kwargs) + + @contextlib.contextmanager + def fopen_context(self, filename: str, write_mode: str = "w", *args, **kwargs): + assert self._path + with open(os.path.join(self._path, filename), write_mode, *args, **kwargs) as f: + yield f + + def filename(self, suffix: str): + assert self._path + return os.path.join(self._path, suffix) + + def upload_tar(self): + if config.trace.upload_tar is not None: + import tarfile + + assert self._path + tar_file = os.path.join( + self._path, f"{os.path.basename(self._path)}.tar.gz" + ) + with tarfile.open(tar_file, "w:gz") as tar: + tar.add(self._path, arcname=os.path.basename(self._path)) + config.trace.upload_tar(tar_file) + + def __enter__(self): + if config.debug: + log = logging.getLogger("torch._dynamo") + prev_level = log.level + log.setLevel(logging.DEBUG) + + def reset_log_level(level): + log.setLevel(level) + + self._stack.callback(reset_log_level, prev_level) + + self._stack.enter_context(V.set_debug_handler(self)) + + if not config.trace.enabled: + return + + self._path = self.create_debug_dir(get_aot_graph_name()) + + if config.trace.debug_log: + self._setup_log_capture("debug.log", logging.DEBUG) + if config.trace.info_log: + self._setup_log_capture("info.log", logging.INFO) + if config.trace.compile_profile: + self._prof = cProfile.Profile() + self._prof.enable() + + def _setup_log_capture(self, filename: str, level: int): + log = logging.getLogger("torch._inductor") + fd = self._stack.enter_context(self.fopen(filename)) + ch = logging.StreamHandler(fd) + ch.setLevel(level) + ch.setFormatter( + logging.Formatter("[%(filename)s:%(lineno)d %(levelname)s] %(message)s") + ) + log.addHandler(ch) + log.setLevel(min(log.level, level)) + self._stack.callback(log.removeHandler, ch) + + def __exit__(self, exc_type, exc_val, exc_tb): + if self._prof: + self._prof.disable() + self._save_profile_data() + + if self._path: + self.upload_tar() + log.warning("%s debug trace: %s", get_graph_being_compiled(), self._path) + self._stack.close() + + def _save_profile_data(self): + assert self._prof + self._prof.dump_stats(self.filename("compile.prof")) + with self.fopen("compile.stats") as fd: + stats = pstats.Stats(self._prof, stream=fd) + stats.strip_dirs() + stats.sort_stats("cumtime") + stats.print_stats(100) + stats.sort_stats("tottime") + stats.print_stats(100) + + def __getattr__(self, name): + if config.trace.enabled and getattr(config.trace, name): + try: + return getattr(DebugFormatter(self), name) + except Exception: + log.warning("Ignoring exception in debug code", exc_info=True) + else: + + def ignored(*args, **kwargs): + pass + + return ignored + + +class DebugFormatter: + def __init__(self, handler): + self.fopen = handler.fopen + self.fopen_context = handler.fopen_context + self.filename = handler.filename + self.handler = handler + + def fx_graph(self, gm: torch.fx.GraphModule, inputs: List[torch.Tensor]): + with self.fopen("fx_graph_runnable.py") as fd: + save_graph_repro(fd, gm, inputs, "inductor") + + with self.fopen("fx_graph_readable.py") as fd: + 
fd.write(gm.print_readable(print_output=False)) + + def fx_graph_transformed( + self, gm: torch.fx.GraphModule, inputs: List[torch.Tensor] + ): + with self.fopen("fx_graph_transformed.py") as fd: + fd.write(gm.print_readable(print_output=False)) + + def ir_pre_fusion(self, nodes: SchedulerNodeList): + self._write_ir("ir_pre_fusion.txt", nodes) + + def ir_post_fusion(self, nodes: SchedulerNodeList): + self._write_ir("ir_post_fusion.txt", nodes) + + def _write_ir(self, filename: str, nodes: SchedulerNodeList): + with self.fopen(filename) as fd: + log.info("Writing debug ir to %s", fd.name) + for node in nodes: + fd.write(node.debug_str()) + fd.write("\n\n\n") + + def graph_diagram(self, nodes: SchedulerNodeList): + draw_buffers(nodes, fname=self.filename("graph_diagram.svg")) + + def draw_orig_fx_graph(self, gm: torch.fx.GraphModule, nodes: SchedulerNodeList): + annotate_orig_fx_with_snodes(gm, nodes) + draw_graph( + gm, + fname=self.filename("orig_fx_graph_diagram.svg"), + clear_meta=False, + prog=GRAPHVIZ_COMMAND_SCALABLE, + parse_stack_trace=True, + dot_graph_shape=config.trace.dot_graph_shape, + ) + + def output_code(self, filename): + shutil.copy(filename, self.filename("output_code.py")) + + def log_autotuning_results( + self, + name: str, + input_nodes: List[ir.IRNode], + timings: Dict["ChoiceCaller", float], # type: ignore[name-defined] # noqa: F821 + elapse: float, + ): + import json + + from .ir import FixedLayout + + def build_node_info(node: ir.IRNode): + if hasattr(node, "name"): + node_name = node.name + else: + node_name = "" + node_info = { + "name": node_name, + "type": type(node).__name__, + } + try: + layout = node.get_layout() + if isinstance(layout, FixedLayout): + offset = 0 + try: + offset = int(layout.offset) + except Exception: + try: + offset = V.graph.sizevars.size_hint( + layout.offset, fallback=0 + ) + except Exception: + pass + static_layout = FixedLayout( + layout.device, + dtype=layout.dtype, + size=list(V.graph.sizevars.size_hints(layout.size)), + stride=list(V.graph.sizevars.size_hints(layout.stride)), + offset=offset, + ) + node_info["layout"] = str(static_layout) + else: + node_info["layout"] = str(node.get_layout()) + except Exception as e: + pass + try: + node_info["dtype"] = str(node.get_dtype()) + except Exception as e: + pass + try: + node_info["device"] = str(node.get_device()) + except Exception as e: + pass + try: + node_info["stride"] = str( + V.graph.sizevars.size_hints(node.get_stride()) + ) + except Exception as e: + pass + try: + node_info["size"] = str(V.graph.sizevars.size_hints(node.get_size())) + except Exception as e: + pass + try: + node_info["numel"] = str(V.graph.sizevars.size_hint(node.get_numel())) + except Exception as e: + pass + if hasattr(node, "data") and isinstance(node.data, ir.IRNode): + node_info["data"] = build_node_info(node.data) + return node_info + + general_properties = { + "op_name": name, + "cuda_device_name": torch.cuda.get_device_name(), + "cuda_device_count": torch.cuda.device_count(), + "input_nodes": [build_node_info(node) for node in input_nodes], + "autotuning_time": elapse, + } + with self.fopen_context( + "autotuning_result_json_list.txt", "at", encoding="utf-8" + ) as fd: + for caller, time in timings.items(): + info_dict = dict(caller.info_dict()) + info_dict.update(general_properties) + info_dict["benchmark_result"] = time + json.dump(info_dict, fd) + fd.write("\n") + + +@dataclasses.dataclass +class TensorMetadataHolder: + tensor_metadata: TensorMetadata + device: torch.device + + +save_args_cnt = 
itertools.count() + + +def save_args_for_compile_fx_inner(*args, **kwargs): + """ + This function is used to save arguments for a compile_fx_inner function call + to the file system. Later on one can replay the compile_fx_inner call + with the saved arguments using load_args_and_run_compile_fx_inner. + """ + + folder = "/tmp/inductor_saved_args" + if not os.path.exists(folder): + os.mkdir(folder) + + def handle_tensor(x): + """ + Pickle FakeTensor will result in error: + AttributeError: Can't pickle local object 'WeakValueDictionary.__init__..remove' + + Convert all Tensor to metadata. This may also makes pickle faster. + """ + if isinstance(x, torch.Tensor): + return TensorMetadataHolder(_extract_tensor_metadata(x), x.device) + else: + return x + + args_to_save, kwargs_to_save = tree_map(handle_tensor, (args, kwargs)) + + fn_name = "compile_fx_inner" + path = f"{folder}/{fn_name}_{next(save_args_cnt)}.pkl" + with open(path, "wb") as f: + pickle.dump((args_to_save, kwargs_to_save), f) + + if log.isEnabledFor(logging.DEBUG): + message = f""" +Arguments for a compile_fx_inner call is saved to {path}. To replay the call, +run the following: + +from torch._inductor.debug import load_args_and_run_compile_fx_inner +load_args_and_run_compile_fx_inner({path!r}) + """ + # call print rather than log.debug. log.debug will print message + # prefix for each line which makes the code snippet harder to be + # copied. + # Not a big deal since the code is already been guarded by checking + # the log level. + print(message) + + +def load_args_and_run_compile_fx_inner(path: str): + from torch._inductor.compile_fx import compile_fx_inner + + with open(path, "rb") as f: + args, kwargs = pickle.load(f) + + def handle_tensor(x): + if isinstance(x, TensorMetadataHolder): + return torch._dynamo.testing.rand_strided( + x.tensor_metadata.shape, + x.tensor_metadata.stride, + x.tensor_metadata.dtype, + x.device, + ) + else: + return x + + fake_mode = torch._subclasses.FakeTensorMode(allow_non_fake_inputs=True) + with fake_mode, config.patch("save_args", False): + args, kwargs = tree_map(handle_tensor, (args, kwargs)) + return compile_fx_inner(*args, **kwargs) diff --git a/MLPY/Lib/site-packages/torch/_inductor/decomposition.py b/MLPY/Lib/site-packages/torch/_inductor/decomposition.py new file mode 100644 index 0000000000000000000000000000000000000000..f7f5de93b07c209075921f3b940e66cb4ba08fb4 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/decomposition.py @@ -0,0 +1,678 @@ +import functools +import logging +import math +import sys +import typing +from typing import Optional + +import torch +import torch._decomp as decomp +import torch._prims_common as utils +import torch.ao.quantization.fx._decomposed +from torch._decomp import ( + core_aten_decompositions, + get_decompositions, + remove_decompositions, +) +from torch._decomp.decompositions import ( + _grid_sampler_2d as decomp_grid_sampler_2d, + pw_cast_for_opmath, +) +from torch._decomp.decompositions_for_rng import extra_random_decomps +from torch._higher_order_ops.out_dtype import out_dtype +from torch._prims_common import ( + elementwise_dtypes, + ELEMENTWISE_TYPE_PROMOTION_KIND, + type_to_dtype, +) + +from . 
import config, inductor_prims + +log = logging.getLogger(__name__) +aten = torch.ops.aten +prims = torch.ops.prims +quantized_decomposed = torch.ops.quantized_decomposed + +inductor_decompositions = get_decompositions( + [ + aten._adaptive_avg_pool2d_backward, + aten.arange, + aten.bitwise_and_, + aten.bitwise_or_, + aten.clamp_min_, + aten.dist, + aten.empty_like, + aten.flip, + aten.gelu, + aten.hardtanh, + aten.index_select, + aten.lcm, + aten.leaky_relu, + aten.linalg_vector_norm, + aten._log_softmax, + aten.max_pool2d_with_indices_backward, + aten._native_batch_norm_legit, + aten._native_batch_norm_legit_functional, + aten._native_batch_norm_legit_no_training, + aten.native_batch_norm, + aten.native_group_norm, + aten.native_layer_norm, + aten.nll_loss2d_backward, + aten._softmax, + aten.sin_, + aten.sqrt_, + out_dtype, + aten._to_copy, + aten.tril_indices, + aten.triu_indices, + aten.upsample_bilinear2d.vec, + ] +) +decompositions = {**core_aten_decompositions(), **inductor_decompositions} + +# Remove unwanted decompositions included via the core ATen decompositions from +# the Inductor decomp table. +decomps_to_exclude = [ + aten._unsafe_index, + aten._scaled_dot_product_flash_attention_for_cpu.default, # See comments in torch/_decomp/decompositions.py + aten.clamp_max, + aten.clamp_min, + aten.glu, # inductor lowers this directly + aten.split.Tensor, # inductor lowers this directly + aten.squeeze, # inductor lowers this directly + aten.sum, # inductor lowers this directly + aten.unbind, # inductor lowers this directly +] + +remove_decompositions(decompositions, decomps_to_exclude) + + +def register_decomposition(ops): + for op in [ops] if callable(ops) else ops: + if op in decompositions: + log.warning("duplicate decomp: %s", ops) + return decomp.register_decomposition(ops, decompositions) + + +# TODO: for now, inductor doesn't handle asserts +# because the condition is symbool -> tensor in the graph. +@register_decomposition([aten._assert_async.msg]) +def assert_async_msg_decomp(tensor, msg): + return + + +# Following `assert_async_msg_decomp` and implement as non-op. +@register_decomposition([aten._functional_assert_async.msg]) +def functional_assert_async_msg_decomp(tensor, msg): + return + + +@register_decomposition([aten.sym_constrain_range_for_size.default]) +def sym_constrain_range_for_size(symbol, *, min=None, max=None): + return + + +@register_decomposition([aten.clamp]) +@pw_cast_for_opmath +def clamp(x, min=None, max=None): + if min is not None: + x = x.clamp_min(min) + if max is not None: + x = x.clamp_max(max) + return x + + +@register_decomposition([aten.full]) +def full(size, fill_value, **kwargs): + dtype = kwargs.get("dtype") + if dtype is None: + kwargs["dtype"] = type_to_dtype(type(fill_value)) + return aten.full(size, fill_value, **kwargs) + return NotImplemented + + +# Not really sure how to put this into the main library. 
PrimTorch wants +# empty_permuted to go to the prim, and typically users don't really want +# to decompose to empty_strided (but inductor is OK with it, because we are +# cool with strides and everything goes to empty_strided) +@register_decomposition([aten.empty_permuted.default]) +def empty_permuted(size, physical_layout, **kwargs): + perm = [0] * len(size) + for p, l in enumerate(physical_layout): + perm[l] = p + return torch.empty([size[l] for l in physical_layout], **kwargs).permute(perm) + + +@register_decomposition([aten.convolution_backward]) +def convolution_backward( + grad_output, + input, + weight, + bias_sizes, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + output_mask, +): + if not output_mask[2] or grad_output.device.type != "cuda": + return NotImplemented + grad_bias = aten.sum(grad_output, [0] + list(range(2, grad_output.dim()))) + grad_inp, grad_weight, _ = aten.convolution_backward( + grad_output, + input, + weight, + bias_sizes, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + [output_mask[0], output_mask[1], False], + ) + return (grad_inp, grad_weight, grad_bias) + + +@register_decomposition([aten.log2]) +def log2(x): + return torch.log(x) * (1.0 / math.log(2.0)) + + +@register_decomposition([aten.round.decimals]) +def round_dec(x, decimals=0): + ten_pow_decimals = 10.0**decimals + return aten.round(x * ten_pow_decimals) * (1.0 / ten_pow_decimals) + + +@register_decomposition([aten.bmm]) +@pw_cast_for_opmath +def bmm(self, batch2): + if config.coordinate_descent_tuning: + if self.shape[1] == 1 or batch2.shape[2] == 1: + out = (self.unsqueeze(-1) * batch2.unsqueeze(1)).sum(dim=2) + return out + if self.device.type == "cpu": + if self.size(1) == 1 and batch2.size(-1) == 1: + return torch.sum( + self.squeeze(1) * batch2.squeeze(-1), dim=1, keepdim=True + ).unsqueeze(1) + return NotImplemented + + +@register_decomposition([aten.addmm]) +@pw_cast_for_opmath +def addmm(self, mat1, mat2, beta=1, alpha=1): + if self.device.type == "cpu": + if mat1.size(0) == 1 and mat2.size(-1) == 1: + out = torch.sum( + mat1.squeeze(0) * mat2.squeeze(-1), dim=0, keepdim=True + ).unsqueeze(0) + return alpha * out + beta * self + if mat1.size(0) == 1 and mat2.size(0) <= 16 and mat2.size(1) <= 16: + out = (mat1.T * mat2).sum(dim=0, keepdim=True) + return alpha * out + beta * self + return NotImplemented + + +@register_decomposition([aten.mm]) +@pw_cast_for_opmath +def mm(self, input2): + from torch.fx.experimental.symbolic_shapes import ( + definitely_true, + guard_size_oblivious, + ) + + # Our matrix vector multiplies only achieve peak bandwidth with coordinate descent tuning. 
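+    # (The coordinate-descent branch below rewrites a matrix-vector product as a
+    # broadcasted multiply followed by a sum over the shared dimension.)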
+ # todo: Look into why and fix it (hopefully) + if config.coordinate_descent_tuning: + if self.shape[0] == 1 or input2.shape[1] == 1: + return (self.unsqueeze(2) * input2.unsqueeze(0)).sum(dim=1) + if self.device.type == "cpu": + if ( + guard_size_oblivious(self.size(-1) == 1) + and guard_size_oblivious(self.size(0) > 0) + and guard_size_oblivious(input2.size(0) == 1) + and (self.dtype == input2.dtype) + and definitely_true((torch.numel(self) + torch.numel(input2)) <= 32) + ): + return torch.cat([self[i, :] * input2 for i in range(self.size(0))]) + if guard_size_oblivious(self.size(0) == 1) and guard_size_oblivious( + input2.size(-1) == 1 + ): + return torch.sum( + self.squeeze(0) * input2.squeeze(-1), dim=0, keepdim=True + ).unsqueeze(0) + return NotImplemented + + +# This pass does two things: +# - Eliminate cat when there is only one tensor input +# - Normalize cat calls, so that legacy empty 1-D tensors are removed (NB: we +# don't remove ALL empty tensors, only the naughty ones) +@register_decomposition([aten.cat.default]) +def cat(tensors, dim=0): + from torch.fx.experimental.symbolic_shapes import guard_size_oblivious + + def non_empty_tensor(x): + # For better or worse, this is a valid cat: + # + # torch.cat([torch.randn(2, 2, 4), torch.randn(0), torch.randn(3, 2, 4)]) + # + # We'd like to eliminate naughtiness like this for downstream passes + # like split_cat. The easiest way is to just drop such inputs + # (guarding that they are non-zero). + # + # Is it permissible for this filtering to be size-oblivious? A case + # where this could matter is cat([(2, 2), (u0,)], dim=0); if u0 + # happened to be zero, we would have liked to have filtered it out. + # But actually, the ONLY way this could have passed is if u0 == 0, + # so by the time we get here we have already installed a deferred + # runtime assert forcing u0 to be zero. So if this hasn't happened, + # we know that the unbacked SymInt has appropriate size and there are + # no problems. 
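+        # For example, cat([randn(2, 3), randn(0), randn(4, 3)]) drops the 1-D
+        # empty tensor and redispatches on the remaining two inputs.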
+ return len(x.shape) != 1 or guard_size_oblivious(x.shape[0] > 0) + + filtered_tensors = list(filter(non_empty_tensor, tensors)) + + if len(filtered_tensors) == 1: + return filtered_tensors[0].clone() + elif 1 < len(filtered_tensors) < len(tensors): + # on the first call, when we remove empty tensors, we redispatch recursively + return aten.cat.default(filtered_tensors, dim) + # when no 'filtering' has occurred, we raise to prevent infinite recursion (no more decomposition needed) + return NotImplemented + + +@register_decomposition([aten.angle]) +def angle(x): + if x.is_complex(): + return torch.where( + torch.isnan(x.real), float("nan"), torch.atan2(x.imag, x.real) + ) + + # when x is real number + # if x >= 0, return 0 + # if x < 0, return pi + # if x is nan, return nan + _, dtype = elementwise_dtypes( + x, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + ) + pi = torch.scalar_tensor(math.pi, dtype=dtype, device=x.device) + ret = torch.where(x < 0, pi, 0.0) + return torch.where(torch.isnan(x), float("nan"), ret) + + +@register_decomposition([aten.add]) +def add(x, y, *, alpha=None): + x_is_complex_tensor = torch.is_tensor(x) and x.is_complex() + y_is_complex_tensor = torch.is_tensor(y) and y.is_complex() + if not x_is_complex_tensor or not y_is_complex_tensor: + return NotImplemented + z = y + if alpha is not None: + z = alpha * y + complex_type = torch.promote_types(x.dtype, y.dtype) + return (x.view(x.real.dtype) + z.view(y.real.dtype)).view(complex_type) + + +@register_decomposition([aten.conj_physical]) +def conj_physical(self): + assert not self.is_complex(), "TODO: implement this" + return self + + +@register_decomposition([aten.lift, aten.detach_]) +def lift(self): + return self + + +@register_decomposition([aten.bernoulli.default]) +def bernoulli(self, *, generator=None): + assert generator is None + return (torch.rand_like(self, dtype=torch.float32) < self).to(self.dtype) + + +@register_decomposition([aten.fmin, prims.fmin]) +def fmin(self, other): + return torch.where(torch.isnan(other) | (other > self), self, other) + + +@register_decomposition([aten.fmax, prims.fmax]) +def fmax(self, other): + return torch.where(torch.isnan(other) | (other < self), self, other) + + +@register_decomposition(aten.amax) +def amax(self, dim=None, keepdim=False): + if self.dtype == torch.bool: + return torch.any(self, dim=dim, keepdim=keepdim) + return NotImplemented + + +@register_decomposition(aten.amin) +def amin(self, dim=None, keepdim=False): + if self.dtype == torch.bool: + return torch.all(self, dim=dim, keepdim=keepdim) + return NotImplemented + + +@register_decomposition([aten.narrow_copy]) +def narrow_copy(self, dim, start, length): + return torch.narrow(self, dim, start, length).clone() + + +@register_decomposition([aten.expand_copy]) +def expand_copy(self, size, *, implicit=False): + return aten.expand(self, size, implicit=implicit).clone() + + +@register_decomposition([aten.view_copy.default]) +def view_copy_default(self, size): + return aten.view(self, size).clone() + + +@register_decomposition([aten.view_copy.dtype]) +def view_copy_dtype(self, dtype): + return self.to(dtype).clone() + + +def get_like_layout( + tensor: torch.Tensor, memory_format: Optional[torch.memory_format] +) -> torch.memory_format: + # TODO: _to_copy tensor to stride permutation + if memory_format is torch.preserve_format or memory_format is None: + return utils.suggest_memory_format(tensor) + else: + return memory_format + + +@register_decomposition(aten.rand_like) +def rand_like(self, *, 
dtype=None, device=None, memory_format=None, **kwargs): + return torch.rand( + [*self.size()], + dtype=dtype or self.dtype, + device=device or self.device, + **kwargs, + ).to(memory_format=get_like_layout(self, memory_format)) + + +@register_decomposition(aten.randn_like) +def randn_like(self, *, dtype=None, device=None, memory_format=None, **kwargs): + return torch.randn( + [*self.size()], + dtype=dtype or self.dtype, + device=device or self.device, + **kwargs, + ).to(memory_format=get_like_layout(self, memory_format)) + + +@register_decomposition(aten.full_like) +def full_like( + self, + fill_value, + *, + dtype=None, + layout=None, + device=None, + pin_memory=False, + requires_grad=False, + memory_format=torch.preserve_format, +): + return torch.full( + [*self.size()], + fill_value, + dtype=dtype or self.dtype, + layout=layout or self.layout, + device=device or self.device, + requires_grad=requires_grad, + ).to(memory_format=get_like_layout(self, memory_format)) + + +@register_decomposition(aten.randint_like.default) +def randint_like(self, high, *, dtype=None, device=None, memory_format=None, **kwargs): + return aten.randint.low( + 0, + high, + [*self.size()], + dtype=dtype or self.dtype, + device=device or self.device, + **kwargs, + ).to(memory_format=get_like_layout(self, memory_format)) + + +@register_decomposition(aten.randint_like.low_dtype) +def randint_like_low( + self, low, high, *, dtype=None, device=None, memory_format=None, **kwargs +): + return aten.randint.low( + low, + high, + [*self.size()], + dtype=dtype or self.dtype, + device=device or self.device, + **kwargs, + ).to(memory_format=get_like_layout(self, memory_format)) + + +@register_decomposition(aten.randint.default) +def randint(high, size, **kwargs): + return aten.randint.low(0, high, size, **kwargs) + + +# The difference between quantize_per_tensor.default and quantize_per_tensor.tensor is +# scale and zero_point is scalar or scalar tensor +@register_decomposition(quantized_decomposed.quantize_per_tensor.default) +def quantize_per_tensor_default_decomp_impl( + input: torch.Tensor, + scale: float, + zero_point: int, + quant_min: int, + quant_max: int, + dtype: torch.dtype, +) -> torch.Tensor: + if input.dtype == torch.bfloat16: + input = input.to(torch.float32) + inv_scale = 1.0 / scale + return torch.clamp( + torch.round(input * inv_scale) + zero_point, quant_min, quant_max + ).to(dtype) + + +# The difference between dequantize_per_tensor.default and dequantize_per_tensor.tensor is +# scale and zero_point is scalar or scalar tensor +@register_decomposition(quantized_decomposed.dequantize_per_tensor.default) +def dequantize_per_tensor_default_decomp_impl( + input: torch.Tensor, + scale: float, + zero_point: int, + quant_min: int, + quant_max: int, + dtype: torch.dtype, +) -> torch.Tensor: + return (input.to(torch.float32) - zero_point) * scale + + +@register_decomposition(quantized_decomposed.quantize_per_tensor.tensor) +def quantize_per_tensor_tensor_decomp_impl( + input: torch.Tensor, + scale: torch.Tensor, + zero_point: torch.Tensor, + quant_min: int, + quant_max: int, + dtype: torch.dtype, +) -> torch.Tensor: + if input.dtype == torch.bfloat16: + input = input.to(torch.float32) + inv_scale = 1.0 / scale + return torch.clamp( + torch.round(input * inv_scale) + zero_point, quant_min, quant_max + ).to(dtype) + + +@register_decomposition(quantized_decomposed.dequantize_per_tensor.tensor) +def dequantize_per_tensor_tensor_decomp_impl( + input: torch.Tensor, + scale: torch.Tensor, + zero_point: torch.Tensor, + 
quant_min: int, + quant_max: int, + dtype: torch.dtype, +) -> torch.Tensor: + return (input.to(torch.float32) - zero_point.to(torch.int32)) * scale.to( + torch.float32 + ) + + +@register_decomposition(torch.ops.quantized.embedding_bag_byte_unpack) +def q_embedding_bag_byte_unpack_decomp(packed): + def bitcast_u8_to_f32(u8): + x, y, z, w = (u8[..., n].to(torch.int32) for n in (0, 1, 2, 3)) + if sys.byteorder == "little": + return (x + (y << 8) + (z << 16) + (w << 24)).view(torch.float32)[..., None] + else: + return ((x << 24) + (y << 16) + (z << 8) + w).view(torch.float32)[..., None] + + scales = bitcast_u8_to_f32(packed[..., -8:-4]) + offsets = bitcast_u8_to_f32(packed[..., -4:]) + return packed[..., :-8].to(torch.float32) * scales + offsets + + +@register_decomposition([aten.grid_sampler_2d]) +@pw_cast_for_opmath +def grid_sampler_2d( + a: torch.Tensor, + grid: torch.Tensor, + interpolation_mode: int = 0, + padding_mode: int = 0, + align_corners: bool = False, +) -> torch.Tensor: + # We do not expand the grid (_expand_grid=False) on cpu for performance reasons + # Experimenting locally it was found that compiled CUDA code is accelerated by ~5x + # and CPU code by ~2x on bicubic mode, if we expand the grid from (N, H, W, 2) into (N, C, H, W, 2) + # However, this leads to a slowdown around ~0.8x on CPU bilinear mode, channels first. + # Thus we apply this hack to not expand the grid for this case. + _expand_grid = not ( + a.device == torch.device("cpu") + and interpolation_mode == 0 + and a.is_contiguous(memory_format=torch.contiguous_format) + ) + + output = decomp_grid_sampler_2d( + a, + grid=grid, + interpolation_mode=interpolation_mode, + padding_mode=padding_mode, + align_corners=align_corners, + _expand_grid=_expand_grid, + ) + return output + + +@register_decomposition(aten._foreach_addcmul.Scalar) +def _foreach_addcmul_scalar(self, left_tensors, right_tensors, scalar=1): + return aten._foreach_add.List( + self, aten._foreach_mul.List(left_tensors, right_tensors), alpha=scalar + ) + + +@register_decomposition(aten._foreach_addcdiv.Scalar) +def _foreach_addcdiv_scalar(self, left_tensors, right_tensors, scalar=1): + return aten._foreach_add.List( + self, aten._foreach_div.List(left_tensors, right_tensors), alpha=scalar + ) + + +@register_decomposition(aten._foreach_lerp.Scalar) +def _foreach_lerp_scalar(start_tensors, end_tensors, weight): + return aten._foreach_add.List( + start_tensors, + aten._foreach_mul.Scalar( + aten._foreach_sub.List(end_tensors, start_tensors), weight + ), + ) + + +@aten.miopen_batch_norm.default.py_impl(torch._C.DispatchKey.Autograd) +@register_decomposition(aten.miopen_batch_norm) +def miopen_batch_norm( + input: torch.Tensor, + weight: torch.Tensor, + bias: typing.Optional[torch.Tensor], + running_mean: typing.Optional[torch.Tensor], + running_var: typing.Optional[torch.Tensor], + training: bool, + exponential_average_factor: float, + epsilon: float, +): + a, b, c = aten.native_batch_norm( + input, + weight, + bias, + running_mean, + running_var, + training, + exponential_average_factor, + epsilon, + ) + + if training: + return (a, b, c) + return ( + a, + weight.new_zeros((0,)), + weight.new_zeros((0,)), + ) + + +@functools.lru_cache(None) +def fast_random_decomps(): + return {**decompositions, **extra_random_decomps} + + +def select_decomp_table(): + """decomps can change based on config""" + if config.fallback_random: + return decompositions + return fast_random_decomps() + + +@register_decomposition(aten.masked_scatter) +def masked_scatter(self, mask, 
source): + if self.device.type == "cuda": + # This two-step algorithm is the same as eager CUDA, for eager CPU we + # use a 1-shot serial iteration. + self, mask = aten.broadcast_tensors([self, mask]) + source_idx = mask.reshape(-1).cumsum(0) - 1 + return inductor_prims.masked_scatter_with_index(self, mask, source_idx, source) + return NotImplemented + + +@register_decomposition(quantized_decomposed.choose_qparams.tensor) +def choose_qparams_tensor( + input: torch.Tensor, quant_min: int, quant_max: int, eps: float, dtype: torch.dtype +): + min_val, max_val = torch.aminmax(input) + scale = (max_val - min_val) / float(quant_max - quant_min) + scale = torch.max(scale, torch.Tensor([eps])) + zero_point = quant_min - torch.round(min_val / scale).to(torch.int) + zero_point = torch.clamp(zero_point, quant_min, quant_max) + return scale.to(torch.float64), zero_point.to(torch.int64) + + +@register_decomposition(aten.put) +def put(self, index, source, accumulate=False): + flattened = self.flatten() + flattened = torch.index_put( + flattened, [index], source.reshape(index.shape), accumulate + ) + return flattened.reshape(self.shape) + + +@register_decomposition(aten.put_) +def put_(self, index, source, accumulate=False): + out = aten.put(self, index, source, accumulate=accumulate) + return self.copy_(out) diff --git a/MLPY/Lib/site-packages/torch/_inductor/dependencies.py b/MLPY/Lib/site-packages/torch/_inductor/dependencies.py new file mode 100644 index 0000000000000000000000000000000000000000..7733ea29fb079ee35cd09aa2b94287cb94ec37a7 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/dependencies.py @@ -0,0 +1,506 @@ +import collections +import dataclasses +import itertools +import logging +import re +import typing +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union +from unittest.mock import patch + +import sympy + +import torch +from torch.fx.experimental.symbolic_shapes import free_unbacked_symbols + +from .codegen.common import index_prevent_reordering +from .utils import ( + get_dtype_size, + reduction_num_outputs, + sympy_index_symbol, + sympy_str, + sympy_subs, + VarRanges, +) +from .virtualized import OpsHandler, ReductionType, V + +log = logging.getLogger(__name__) +is_indirect = re.compile(r"indirect|tmp").search +Dep = Union["MemoryDep", "StarDep", "WeakDep"] + + +class MemoryDep(typing.NamedTuple): + name: str + index: sympy.Expr # type: ignore[assignment] + var_names: Tuple[sympy.Symbol, ...] + size: Tuple[sympy.Expr, ...] 
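+    # name: the buffer being read/written; index: the symbolic address expression;
+    # var_names/size: the iteration variables that may appear in `index` together
+    # with their ranges (see the `ranges` property below).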
+ + def __repr__(self): + return f"MemoryDep({self.name!r}, {self.index}, {self.ranges})" + + @property + def ranges(self) -> Dict[sympy.Symbol, sympy.Expr]: + """{c0: 128, c1: 512, ...}""" + return dict(zip(self.var_names, self.size)) + + def get_numel(self) -> sympy.Expr: + if self.is_indirect(): + numel = V.graph.get_numel(self.name) + else: + vars = set(self.index.free_symbols) + numel = sympy.Integer(1) + for var, size in zip(self.var_names, self.size): + if var in vars: + numel = numel * size + return numel + + def rename(self, renames: Dict[str, str]) -> "MemoryDep": + if self.name in renames: + return MemoryDep( + renames[self.name], self.index, var_names=self.var_names, size=self.size + ) + return self + + def numbytes_hint(self): + return V.graph.sizevars.size_hint(self.get_numel()) * get_dtype_size( + V.graph.get_dtype(self.name) + ) + + def has_unbacked_symbols(self): + return len(free_unbacked_symbols(self.get_numel())) > 0 + + def is_contiguous(self) -> bool: + return isinstance(self.index, sympy.Symbol) and self.index in self.var_names + + def is_scalar(self) -> bool: + if isinstance(self.index, sympy.Symbol): + return self.index not in self.var_names and not self.is_indirect() + return isinstance(self.index, (int, sympy.Integer)) + + def is_indirect(self) -> bool: + return any(is_indirect(v.name) for v in self.index.free_symbols) # type: ignore[attr-defined] + + +class StarDep(typing.NamedTuple): + # depends on the entire buffer + name: str + + @property + def index(self): + raise NotImplementedError("StarDep does not have an index") + + def get_numel(self) -> sympy.Expr: + return V.graph.get_numel(self.name) + + def rename(self, renames: Dict[str, str]) -> "StarDep": + if self.name in renames: + return StarDep(renames[self.name]) + return self + + def numbytes_hint(self): + return V.graph.sizevars.size_hint(self.get_numel()) * get_dtype_size( + V.graph.get_dtype(self.name) + ) + + def has_unbacked_symbols(self): + return len(free_unbacked_symbols(self.get_numel())) > 0 + + def is_contiguous(self) -> bool: + return False + + def is_scalar(self) -> bool: + return False + + def is_indirect(self) -> bool: + return False + + +# Used for tracking mutation ordering +# if A reads a buffer and B mutates it +# B must be ordered after A +# +# It is weak because if it turns out A's read is never used, we can still +# eliminate it +class WeakDep(typing.NamedTuple): + name: str + + @property + def index(self): + raise NotImplementedError("WeakDep does not have an index") + + def get_numel(self) -> sympy.Expr: + return sympy.Integer(1) + + def rename(self, renames: Dict[str, str]) -> "WeakDep": + if self.name in renames: + return WeakDep(renames[self.name]) + return self + + def numbytes_hint(self): + return 1 # Purely inserted for ordering, not an actual dep + + def has_unbacked_symbols(self): + return False + + def is_contiguous(self) -> bool: + return False + + +class IndexExprDep(typing.NamedTuple): + index: sympy.Expr # type: ignore[assignment] + var_names: Tuple[sympy.Symbol, ...] + size: Tuple[sympy.Expr, ...] 
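+
+# Illustrative sketch (not part of the dependency tracking itself): the dep types
+# above are plain value objects, so buffer renaming maps uniformly over all of
+# them, and MemoryDep can answer simple layout questions from its index alone.
+def _example_deps():  # hypothetical helper for illustration; never called
+    c0 = sympy.Symbol("c0")
+    dep = MemoryDep("buf0", c0, (c0,), (sympy.Integer(128),))
+    assert dep.is_contiguous() and dep.ranges == {c0: 128}
+    renames = {"buf0": "buf0_v2"}
+    return [d.rename(renames) for d in (dep, StarDep("buf0"), WeakDep("buf0"))]
+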
+ + +@dataclasses.dataclass +class ReadWrites: + reads: Set[Dep] + writes: Set[Dep] + index_exprs: Set[IndexExprDep] + range_vars: Optional[List[sympy.Expr]] = None + var_ranges: Optional[VarRanges] = None + op_counts: typing.Counter[str] = dataclasses.field( + default_factory=collections.Counter + ) + + def rename(self, renames: typing.Dict[str, str]) -> "ReadWrites": + return ReadWrites( + {dep.rename(renames) for dep in self.reads}, + {dep.rename(renames) for dep in self.writes}, + self.index_exprs, + self.range_vars, + self.var_ranges, + op_counts=self.op_counts, + ) + + def with_read(self, dep: Dep) -> "ReadWrites": + assert isinstance(dep, (WeakDep, StarDep)) + return ReadWrites( + set.union(self.reads, {dep}), + self.writes, + self.index_exprs, + self.range_vars, + self.var_ranges, + op_counts=self.op_counts, + ) + + def merge(self, other: "ReadWrites"): + reads = set.union(self.reads, other.reads) + writes = set.union(self.writes, other.writes) + index_exprs = set.union(self.index_exprs, other.index_exprs) + op_counts = collections.Counter(self.op_counts) + op_counts.update(other.op_counts) + return ReadWrites(reads - writes, writes, index_exprs, op_counts=op_counts) + + @staticmethod + def merge_list(read_writes: List["ReadWrites"]): + all_writes = set.union(*[rw.writes for rw in read_writes]) + all_reads = set.union(*[rw.reads for rw in read_writes]) - all_writes + all_index_exprs = set.union(*[rw.index_exprs for rw in read_writes]) + + op_counts: typing.Counter[Any] = collections.Counter() + for rw in read_writes: + op_counts.update(rw.op_counts) + + return ReadWrites(all_reads, all_writes, all_index_exprs, op_counts=op_counts) + + def remove_reads(self, rem_reads): + return ReadWrites( + self.reads - rem_reads, + self.writes, + self.index_exprs, + self.range_vars, + self.var_ranges, + op_counts=self.op_counts, + ) + + def reads_and_writes(self): + return itertools.chain(self.reads, self.writes) + + +class _RecordLoadStoreInner(V.MockHandler): # type: ignore[name-defined] + def __init__(self, var_ranges: VarRanges, normalize: bool): + super().__init__() + self._reads: Set[Dep] = set() + self._writes: Set[MemoryDep] = set() + self._index_exprs: Set[IndexExprDep] = set() + self._var_ranges: VarRanges = var_ranges + self._normalize: bool = normalize + + def canonicalize( + self, index: sympy.Expr + ) -> Tuple[sympy.Expr, Tuple[sympy.Symbol, ...], Tuple[sympy.Expr, ...]]: + if not self._normalize: + sizes = [V.graph.sizevars.simplify(x) for x in self._var_ranges.values()] + var_names = tuple( + k for k, v in zip(self._var_ranges.keys(), sizes) if v != 1 + ) + sizes = tuple(v for v in sizes if v != 1) + return index, var_names, sizes # type: ignore[return-value] + + # Try to further simplify the indexes even if simplify_loops didn't + # convert it to the simplest form because of the interference from + # different indexing formulas. 
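+        # The steps below: simplify each range, collapse loops that always move
+        # together, then renumber the surviving variables with the canonical "c"
+        # prefix so that equivalent indexing expressions compare equal (e.g.
+        # d0*512 + d1 over {d0: 128, d1: 512} can collapse to a single c0 of
+        # range 65536).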
+ free_symbols = index.free_symbols + var_ranges = { + k: V.graph.sizevars.simplify(v) + for k, v in self._var_ranges.items() + # TODO(jansel): explore this further normalization + # if k in free_symbols + } + index_vars = [*var_ranges.keys()] + sizes = tuple(var_ranges.values()) + new_sizes, reindex, prune = V.graph.sizevars._simplify_loops( + index_vars, + sizes, + index_prevent_reordering([index], index_vars, sizes), + ) + + # assign new variables each dimension to deal with numbering mismatches + # d0, d1, d2 could become d0, d2 -- which won't match d0, d1 + new_vars, add_var = var_builder(canonicalization_prefix()) + replacement = dict(zip(index_vars, reindex([add_var(x) for x in new_sizes]))) + index = sympy_subs(sympy.expand(index), replacement) + + new_vars = [*new_vars.keys()] + new_sizes = [*new_sizes] + free_symbols = index.free_symbols + while new_vars and new_vars[-1] not in free_symbols: + # Reduction has last (reduced) dim in its sizes, but + # downstream users won't. Normalize this away. + new_vars.pop() + new_sizes.pop() + return index, tuple(new_vars), tuple(new_sizes) # type: ignore[arg-type] + + def load(self, name: str, index: sympy.Expr) -> str: + self._reads.add(MemoryDep(name, *self.canonicalize(index))) + return f"load({name}, {sympy_str(index)})" + + def load_seed(self, name: str, index: int): + assert isinstance(index, int) + return self.load(name, sympy.Integer(index)) + + def store(self, name: str, index: sympy.Expr, value: str, mode=None) -> str: + self._writes.add(MemoryDep(name, *self.canonicalize(index))) + return f"store({name}, {sympy_str(index)}, {value}, {mode})" + + def store_reduction(self, name: str, index, value) -> str: + return self.store(name, index, f"store_reduction({value})") + + def index_expr(self, index: sympy.Expr, dtype) -> str: + self._index_exprs.add(IndexExprDep(*self.canonicalize(index))) + return f"index_expr({sympy_str(index)}, {dtype})" + + def bucketize( + self, + values, + offsets_name: str, + offsets_size: sympy.Expr, + indexing_dtype: torch.dtype, + right: bool, + ): + self._reads.add(StarDep(offsets_name)) + return f"bucketize({values}, {offsets_name}, {sympy_str(offsets_size)}, {indexing_dtype}, {right})" + + +class _OpCounter: + """Shim to count how many times each op is used""" + + def __init__(self, inner): + super().__init__() + self.parent_handler = inner + self._op_counts: typing.Counter[Any] = collections.Counter() + + def __getattr__(self, name): + self._op_counts[name] += 1 + return getattr(self.parent_handler, name) + + +class RecordLoadStore(V.KernelFormatterHandler): # type: ignore[name-defined] + def __init__(self, var_ranges: VarRanges, normalize: bool): + parent_handler = _RecordLoadStoreInner( + var_ranges=var_ranges, normalize=normalize + ) + parent_handler = _OpCounter(parent_handler) + super().__init__(parent_handler=parent_handler) + + +def var_builder(prefix: str) -> Tuple[VarRanges, Callable[[sympy.Expr], sympy.Symbol]]: + cnt = itertools.count() + var_ranges: VarRanges = dict() + + def add_var(length: sympy.Expr) -> sympy.Symbol: + v = sympy_index_symbol(f"{prefix}{next(cnt)}") + var_ranges[v] = length + return v + + return var_ranges, add_var + + +def index_vars_no_squeeze(*argsizes: Tuple[sympy.Expr, ...], prefix: str): + var_ranges, add_var = var_builder(prefix) + args: List[List[sympy.Symbol]] = [] + for size in argsizes: + args.append(list(map(add_var, size))) + return args, var_ranges + + +def index_vars_squeeze(*argsizes: Tuple[sympy.Expr, ...], prefix: str = "d"): + from .ir import SqueezeView + 
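+    # SqueezeView.squeezer drops size-1 dimensions; the returned reindex callable
+    # maps the squeezed variables back into the original positions so callers
+    # still receive one index expression per original dimension.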
+ var_ranges, add_var = var_builder(prefix) + args: List[List[sympy.Expr]] = [] + new_sizes: List[List[sympy.Expr]] = [] + for size in argsizes: + new_size, reindex = SqueezeView.squeezer(size) + new_sizes.append(new_size) + args.append(reindex(list(map(add_var, new_size)))) + return args, var_ranges + + +def extract_read_writes( + fn: Callable[..., Any], + *argsizes: Tuple[sympy.Expr, ...], + normalize: bool = False, + prefix: str = "d", +): + args, var_ranges = index_vars_squeeze(*argsizes, prefix=prefix) + rw = RecordLoadStore(var_ranges, normalize=normalize) + with V.set_ops_handler(rw): + fn(*args) + + if normalize: + range_vars = [] # Number of vars could differ due to normalization + else: + range_vars = list(itertools.chain.from_iterable(args)) + + inner = rw.parent_handler.parent_handler + return ReadWrites( + set(inner._reads), + set(inner._writes), + inner._index_exprs, + range_vars, + var_ranges, + rw.parent_handler._op_counts, + ) + + +def extract_input_node_reduction_ranges( + input_node: "torch._inductor.ir.TensorBox", +) -> Tuple[Optional[List[sympy.Expr]], Optional[List[sympy.Expr]]]: + """ + Returns the size and reduction size of all inputs, if the sizes and reduction_sizes (if exist) are all the same. + It's possible that a node has multiple inputs, some are Reduction nodes and others are Pointwise nodes. + In this case, reduction_sizes of the Reduction nodes need to be the same. + Otherwise returns (None, None). + """ + + from .ir import ComputedBuffer, Loops + + if isinstance(input_node.data, ComputedBuffer): + # Input node has already been realized. Return its size and reduction_size. + size = input_node.get_size() + reduction_size = input_node.get_reduction_size() + if len(reduction_size) > 0: + return (size, reduction_size) + else: + return (None, None) + + if not isinstance(input_node.data.data, Loops): # type: ignore[attr-defined] + # Other IRNodes do not have reduction_ranges. + return (None, None) + + # There is one issue: what if there are views / permutations between the input node and its dependent realized nodes? + # The current method still uses reduction ranges from the dependent realized node, which is not ideal. + # Is there a way to check whether there are permutations inbetween? 
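+    # Walk the read set breadth-first until realized ComputedBuffers with a
+    # reduction size are found; if any two of them disagree on size or
+    # reduction_size, give up and return (None, None).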
+ reads = input_node.get_reads() + reduction_size = None + size = None + while reduction_size is None and len(reads) > 0: + seen = set() + new_reads = [] + for read in reads: + if not isinstance(read, MemoryDep): + continue + if read.name in seen: + continue + seen.add(read.name) + buffer = V.graph.get_buffer(read.name) + if buffer is None: + continue + if ( + isinstance(buffer, ComputedBuffer) + and len(buffer.get_reduction_size()) > 0 + ): + if reduction_size is None: + reduction_size = buffer.get_reduction_size() + size = buffer.get_size() + elif ( + reduction_size != buffer.get_reduction_size() + or size != buffer.get_size() + ): + return (None, None) + else: + new_reads.extend(buffer.get_reads()) + if reads == new_reads: + return (size, reduction_size) + else: + reads = new_reads + return (size, reduction_size) + + +def canonicalization_prefix(): + return "c" + + +# ops handler which computes all the free unbacked symbols for an IR +class FreeUnbackedSymbolsOpsHandler: + symbols: Set[sympy.Symbol] + + def __init__(self): + self.symbols = set() + + def __getattr__(self, name: str) -> Callable[..., Any]: + def inner(*args, **kwargs): + for a in itertools.chain(args, kwargs.values()): + if isinstance(a, (sympy.Expr, sympy.logic.boolalg.Boolean)): + self.symbols |= free_unbacked_symbols(a) + + return inner + + def indirect_indexing(self, index_var, size, check=True) -> sympy.Symbol: + assert not isinstance(index_var, (sympy.Expr, sympy.logic.boolalg.Boolean)) + self.symbols |= free_unbacked_symbols(size) + return sympy_index_symbol(f"({str(index_var)})") + + def frexp(self, x): + return (None,) * 2 + + def reduction( + self, + dtype: torch.dtype, + src_dtype: torch.dtype, + reduction_type: ReductionType, + value: Union[None, Tuple[None, ...]], + ) -> Union[None, Tuple[None, ...]]: + num_values = reduction_num_outputs(reduction_type) + return (None,) * num_values if num_values > 1 else None + + +def _typecheck_FreeUnbackedSymbolsOpsHandler( + h: FreeUnbackedSymbolsOpsHandler, +) -> OpsHandler[None]: + return h + + +def extract_free_unbacked_symbols(fn: Callable[..., Any], index, rindex=None): + from .ir import FlexibleLayout + + args = [index, rindex] if rindex is not None else [index] + handler = FreeUnbackedSymbolsOpsHandler() + # NB: I cargo culted the allow_indexing patch here, I don't understand why + # people do this all over + with V.set_ops_handler(handler), patch.object( + FlexibleLayout, "allow_indexing", True + ): + fn(*args) + return handler.symbols diff --git a/MLPY/Lib/site-packages/torch/_inductor/exc.py b/MLPY/Lib/site-packages/torch/_inductor/exc.py new file mode 100644 index 0000000000000000000000000000000000000000..fdb73e3f21cee556757f4cc21a003d8c6583cf6d --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/exc.py @@ -0,0 +1,98 @@ +from __future__ import annotations + +import os +import tempfile +import textwrap +from functools import lru_cache + +if os.environ.get("TORCHINDUCTOR_WRITE_MISSING_OPS") == "1": + + @lru_cache(None) + def _record_missing_op(target): + with open(f"{tempfile.gettempdir()}/missing_ops.txt", "a") as fd: + fd.write(str(target) + "\n") + +else: + + def _record_missing_op(target): # type: ignore[misc] + pass + + +class OperatorIssue(RuntimeError): + @staticmethod + def operator_str(target, args, kwargs): + lines = [f"target: {target}"] + [ + f"args[{i}]: {arg}" for i, arg in enumerate(args) + ] + if kwargs: + lines.append(f"kwargs: {kwargs}") + return textwrap.indent("\n".join(lines), " ") + + +class 
MissingOperatorWithoutDecomp(OperatorIssue): + def __init__(self, target, args, kwargs): + _record_missing_op(target) + super().__init__(f"missing lowering\n{self.operator_str(target, args, kwargs)}") + + +class MissingOperatorWithDecomp(OperatorIssue): + def __init__(self, target, args, kwargs): + _record_missing_op(target) + super().__init__( + f"missing decomposition\n{self.operator_str(target, args, kwargs)}" + + textwrap.dedent( + f""" + + There is a decomposition available for {target} in + torch._decomp.get_decompositions(). Please add this operator to the + `decompositions` list in torch._inductor.decompositions + """ + ) + ) + + +class LoweringException(OperatorIssue): + def __init__(self, exc: Exception, target, args, kwargs): + super().__init__( + f"{type(exc).__name__}: {exc}\n{self.operator_str(target, args, kwargs)}" + ) + + +class InvalidCxxCompiler(RuntimeError): + def __init__(self): + from . import config + + super().__init__( + f"No working C++ compiler found in {config.__name__}.cpp.cxx: {config.cpp.cxx}" + ) + + +class CppWrapperCodeGenError(RuntimeError): + def __init__(self, msg: str): + super().__init__(f"C++ wrapper codegen error: {msg}") + + +class CppCompileError(RuntimeError): + def __init__(self, cmd: list[str], output: str): + if isinstance(output, bytes): + output = output.decode("utf-8") + + super().__init__( + textwrap.dedent( + """ + C++ compile error + + Command: + {cmd} + + Output: + {output} + """ + ) + .strip() + .format(cmd=" ".join(cmd), output=output) + ) + + +class CUDACompileError(CppCompileError): + pass diff --git a/MLPY/Lib/site-packages/torch/_inductor/freezing.py b/MLPY/Lib/site-packages/torch/_inductor/freezing.py new file mode 100644 index 0000000000000000000000000000000000000000..8de18a38e140187af47d0fe5781cbaa10a37e533 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/freezing.py @@ -0,0 +1,266 @@ +from __future__ import annotations + +import itertools +import logging + +import weakref +from typing import Any, List, Optional, Tuple + +import torch +import torch.utils._pytree as pytree +from torch._dynamo.utils import dynamo_timed, lazy_format_graph_code +from torch._functorch.aot_autograd import MutationType +from torch._functorch.compile_utils import fx_graph_cse +from torch._inductor.constant_folding import constant_fold, replace_node_with_constant + +from torch._inductor.fx_passes.freezing_patterns import freezing_passes +from torch._inductor.fx_passes.post_grad import view_to_reshape + +from . import config + +aten = torch.ops.aten +prims = torch.ops.prims + +log = logging.getLogger(__name__) + + +def replace_params_with_constants( + gm: torch.fx.GraphModule, + flat_params: list[Any], + fw_metadata: torch._functorch.aot_autograd.ViewAndMutationMeta, +) -> List[int]: + """ + Replaces the parameters of a PyTorch GraphModule with constants wherever possible. + Returns a list of indices representing the input parameters that were not converted to constants. 
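+    Parameters that are mutated inside the graph or whose storage is aliased by a
+    graph output are kept as runtime inputs rather than folded into constants.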
+ """ + params = [node for node in gm.graph.nodes if node.op == "placeholder"] + fake_inp_nodes = params[: len(params)] + preserved_arg_indices = [] + aliased_input_args = [ + out_info.base_idx + for out_info in fw_metadata.output_info + if out_info.base_idx is not None + ] + + # TODO (tmanlaibaatar) figure out why this is different + # from mutated_inp_runtime_indices + mutated_inps = [ + i + for i, m in enumerate(fw_metadata.input_info) + if m.mutation_type + in (MutationType.MUTATED_IN_GRAPH, MutationType.MUTATED_OUT_GRAPH) + ] + + for i, (real_input, node) in enumerate(zip(flat_params, fake_inp_nodes)): + if i in mutated_inps or i in aliased_input_args: + preserved_arg_indices.append(i) + continue + replace_node_with_constant(gm, node, real_input) + # add on non param inputs + preserved_arg_indices.extend(range(len(flat_params), len(params))) + # is this necessary ? + gm.recompile() + return preserved_arg_indices + + +def freeze( + dynamo_gm: torch.fx.GraphModule, + aot_autograd_gm: torch.fx.GraphModule, + example_inputs: List[torch._subclasses.FakeTensor], +) -> Tuple[torch.fx.GraphModule, List[int]]: + """ + Inlines parameters that are not mutated into constants and optimizes the graph through constant propagation + and other techniques. If enabled, the function also discards the original parameters of the module for memory efficiency. + + Assumes that this function is run in dynamo tracing post aot_autograd. + + Args: + dynamo_gm (torch.fx.GraphModule): The Dynamo constructed GraphModule. + aot_autograd_gm (torch.fx.GraphModule): The aot_autograd constructed GraphModule to be frozen. + example_inputs (List[torch.Tensor]): A list of example input tensors to be used in the freezing process. + + Returns: + Tuple[torch.fx.GraphModule, List[int]]: A tuple containing the frozen GraphModule and a list of indices + of the inputs that were preserved (not turned into constants). + """ + # We have convert conv's weight to channels last which may meet error for .view + # when doing fake_tensor_prop. So we need to convert view to reshape first. + # See the details in fx_codegen_and_compile of compile_fx.py. + view_to_reshape(aot_autograd_gm) + + if tracing_context := torch._guards.TracingContext.try_get(): + fw_metadata = tracing_context.fw_metadata + params_flat = tracing_context.params_flat + assert fw_metadata is not None and params_flat is not None + + preserved_arg_indices = replace_params_with_constants( + aot_autograd_gm, params_flat, fw_metadata + ) + else: + inputs = [ + node for node in aot_autograd_gm.graph.nodes if node.op == "placeholder" + ] + preserved_arg_indices = list(range(len(inputs))) + + # TODO - further restrict cse ? 
right now needed to dedup aliasing ops + cse_graph = fx_graph_cse(aot_autograd_gm.graph) + aot_autograd_gm.graph = cse_graph + aot_autograd_gm.recompile() + + aot_example_inputs = [example_inputs[ind] for ind in preserved_arg_indices] + freezing_passes(aot_autograd_gm, aot_example_inputs) + + constant_fold(aot_autograd_gm) + # invalidate nn Modules + if config.freezing_discard_parameters: + invalidate_eager_modules() + discard_traced_gm_params(dynamo_gm) + + log.debug("%s", lazy_format_graph_code("FROZEN GRAPH", aot_autograd_gm)) + + return aot_autograd_gm, preserved_arg_indices + + +class ErasedTensor(torch.Tensor): + @staticmethod + def __new__(cls, elem, name, owning_mod): + return super().__new__(cls, elem.to(device="meta")) + + def __init__(self, elem, name: Optional[str], mod): + self.erased_name = name + self.owning_mod_ref = weakref.ref(mod) + + @classmethod + def __torch_dispatch__(cls, func, types, args=(), kwargs=None): + erased_tensors = [ + e + for e in pytree.arg_tree_leaves(*args, **kwargs) + if isinstance(e, ErasedTensor) + ] + assert len(erased_tensors) > 0 + e = erased_tensors[0] + + raise RuntimeError( + f"Trying to run Pytorch Eager Module after Dynamo Freezing. " + "The original parameters have been discarded for memory efficiency. " + f"Found in op {func} for erased parameter {e.erased_name} of {e.owning_mod_ref()}" + ) + + +@torch.utils._python_dispatch._disable_current_modes() +def invalidate_eager_modules(): + for mod in torch._guards.TracingContext.get().module_context.nn_modules.values(): + if not isinstance(mod, torch.nn.Module): + continue + + for attr_name, tensor in list( + itertools.chain( + mod.named_parameters(recurse=False), mod.named_buffers(recurse=False) + ) + ): + with torch._dispatch.python.no_python_dispatcher(): + e_t = ErasedTensor(tensor, attr_name, mod) + if isinstance(tensor, torch.nn.Parameter): + e_t.requires_grad_(True) + e_t._is_param = True # type: ignore[attr-defined] + setattr(mod, attr_name, e_t) + + +@torch.utils._python_dispatch._disable_current_modes() +def discard_traced_gm_params(mod: torch.fx.GraphModule): + for attr_name, tensor in list( + itertools.chain( + mod.named_parameters(recurse=False), mod.named_buffers(recurse=False) + ) + ): + with torch._dispatch.python.no_python_dispatcher(): + e_t = ErasedTensor(tensor, attr_name, mod) + if isinstance(tensor, torch.nn.Parameter): + e_t.requires_grad_(True) + e_t._is_param = True # type: ignore[attr-defined] + setattr(mod, attr_name, e_t) + + +def enforce_output_layout(gm: torch.fx.GraphModule): + """ + Make sure the output node's layout does not change due to compiler optimizations + by adding aten.as_strided nodes with the expected strides. + + Only used for inference so we can assume all graph outputs are model outputs. + """ + *_, output_node = gm.graph.nodes + out_list = output_node.args[0] + with gm.graph.inserting_before(output_node): + for n in out_list: + if not isinstance( + n.meta["val"], torch.Tensor + ) or not torch._prims_common.is_non_overlapping_and_dense(n.meta["val"]): + continue + + # add a node to enforce eager layout + ft = n.meta["val"] + new_node = gm.graph.call_function( + prims.inductor_force_stride_order.default, (n, ft.stride()) + ) + + # can not call + # n.replace_all_uses_with(new_node) + # since it will replace the usage of n in new_node itself. 
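+            # replace_input_with rewrites only the output node's reference to n,
+            # which is the only use we want to redirect here.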
+ output_node.replace_input_with(n, new_node) + + gm.graph.lint() + gm.recompile() + + +def enforce_as_strided_input_layout(gm: torch.fx.GraphModule): + """ + Make sure the as_strided node's input's layout does not change due to compiler + optimizations, because the as_strided strides info depends on input tensor stride info. + """ + + as_strided_ops = [ + torch.ops.aten.as_strided.default, + torch.ops.aten.as_strided_.default, + torch.ops.aten.as_strided_scatter.default, + ] + strided_nodes = [n for n in gm.graph.nodes if n.target in as_strided_ops] + for n in strided_nodes: + with gm.graph.inserting_before(n): + # add a node to enforce eager layout + ft = n.args[0].meta["val"] + new_node = gm.graph.call_function( + prims.inductor_force_stride_order.default, (n.args[0], ft.stride()) + ) + n.replace_input_with(n.args[0], new_node) + + gm.graph.lint() + gm.recompile() + + +@dynamo_timed +def convert_conv_weights_to_channels_last(gm: torch.fx.GraphModule): + """ + Convert 4d convolution weight tensor to channels last format. + + This pass is performed before freezing so the added nodes can be constant + folded by freezing. + """ + convs = [n for n in gm.graph.nodes if n.target == aten.convolution.default] + for conv in convs: + weight_node = conv.args[1] + if len(weight_node.meta["val"].size()) != 4 or weight_node.meta[ + "val" + ].is_contiguous(memory_format=torch.channels_last): + # not a 4d tensor or already channels last, skip + continue + + with gm.graph.inserting_before(conv): + new_node = gm.graph.call_function( + aten.clone.default, + (weight_node,), + {"memory_format": torch.channels_last}, + ) + conv.replace_input_with(weight_node, new_node) + + enforce_as_strided_input_layout(gm) + enforce_output_layout(gm) diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__init__.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..69e044de5a3058c82c64b5292cf3fa706d1cce1e Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/binary_folding.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/binary_folding.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..924e386148a9938ba3b5f7034ff8a31f0f1ee9ce Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/binary_folding.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/decompose_mem_bound_mm.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/decompose_mem_bound_mm.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a48ddef1ef6c206faac222726830caf1ee79ac5b Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/decompose_mem_bound_mm.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/dedupe_symint_uses.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/dedupe_symint_uses.cpython-39.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..3a4e5bf3aa83f2607103270eda6aa1643a378de2 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/dedupe_symint_uses.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/efficient_conv_bn_eval.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/efficient_conv_bn_eval.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4cd2133ff8a669fb2af9ba4535feed0e5aac4604 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/efficient_conv_bn_eval.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/freezing_patterns.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/freezing_patterns.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4e80fe654a13df12421a0c59d54b1e4301388092 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/freezing_patterns.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/fuse_attention.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/fuse_attention.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e9e50be63a3df14bb9ac9165f6d858567b5a042d Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/fuse_attention.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/group_batch_fusion.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/group_batch_fusion.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..09fdb6f5771f0820bc5a00348e8f4d73514d38c9 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/group_batch_fusion.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/joint_graph.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/joint_graph.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..194c7707eab000aa10cb4e829ac4a87662f9d700 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/joint_graph.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/misc_patterns.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/misc_patterns.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..74b731fae158acbed29e2469d14a53c08870b4a4 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/misc_patterns.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/mkldnn_fusion.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/mkldnn_fusion.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..123054a5c65babbb349efcc567783f3e535c6d13 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/mkldnn_fusion.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/numeric_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/numeric_utils.cpython-39.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..e6d5670dd7b046ca3360dff168549191bdba8f0e Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/numeric_utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/pad_mm.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/pad_mm.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fbe921a293f754e0de2b57631f02882a6e4b4890 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/pad_mm.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/post_grad.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/post_grad.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6c9fb96f886c6a092ff2e9b97cbaf3f48d004d11 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/post_grad.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/pre_grad.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/pre_grad.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0185aa822214a328c054a19d2ad420cdb250c051 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/pre_grad.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/quantization.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/quantization.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..583ed21c481c06ed3d6f4ec3184969fd5a4340c0 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/quantization.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/reinplace.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/reinplace.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aa3e8ae176ccc7ff6575c268d8cc50bae4b54dd5 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/reinplace.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/replace_random.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/replace_random.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..006cb41e80248fd95827223b3226bfeb07080f1e Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/replace_random.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/split_cat.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/split_cat.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..31484150a6e7c7a20b4d6e5c598a03a679020d28 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/__pycache__/split_cat.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/binary_folding.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/binary_folding.py new file mode 100644 index 0000000000000000000000000000000000000000..0f088b3af0bb4d41257ce3836f8bf067373feaa4 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/binary_folding.py @@ -0,0 +1,277 @@ +import functools +import itertools 
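+
+# This pass folds an elementwise add/sub/mul/div with a constant tensor into the
+# weights/bias of the preceding convolution, so the pointwise op can disappear
+# after constant folding during freezing.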
+ +import torch +from ..._dynamo.utils import counters + +from ..pattern_matcher import Arg, CallFunction, KeywordArg +from .freezing_patterns import register_binary_folding_pattern + +aten = torch.ops.aten +prims = torch.ops.prims + + +def mark_mixed_dtype_conv(conv): + conv_dtype = conv.meta["val"].dtype + if conv_dtype not in (torch.float16, torch.bfloat16): + return + + if not len(conv.users) == 1: + return + + conv_user = next(iter(conv.users.keys())) + if not isinstance(conv_user.meta["val"], torch.Tensor): + return + + if not conv_user.meta["val"].dtype == torch.float32: + return + + while conv_user.target in _binary_ops: + if not len(conv_user.users) == 1: + return + + conv_user = next(iter(conv_user.users.keys())) + + if not ( + conv_user.target == prims.convert_element_type.default + and conv_user.args[1] == conv_dtype + ): + return + + conv.meta["_allow_conv_mixed_dtype_folding"] = conv_dtype + + +def mark_mixed_dtype_allowed_convs(gm): + """ + Mark convolutions which we will binary fold even with mixed precision constants. We constant fold in the higher precision + for better accuracy and then recover the original precision after. + """ + for node in gm.graph.nodes: + if node.target is aten.convolution.default: + mark_mixed_dtype_conv(node) + + +def recover_original_precision_folded_convs(gm): + """ + After binary folding conv weights and biases to a higher dtype, recover the original precision they were in. + """ + graph = gm.graph + convs = [node for node in graph.nodes if node.target is aten.convolution.default] + for node in convs: + orig_dtype = node.meta.get("_allow_conv_mixed_dtype_folding", None) + if orig_dtype is None: + continue + + with graph.inserting_before(node): + for idx in [1, 2]: + old_input = node.args[idx] + if old_input is None: + continue + + new_input = graph.create_node( + "call_function", + prims.convert_element_type.default, + (old_input, orig_dtype), + ) + node.replace_input_with(old_input, new_input) + + +_binary_ops = [aten.add.Tensor, aten.sub.Tensor, aten.mul.Tensor, aten.div.Tensor] + + +@functools.lru_cache(None) +def binary_folding_init(): + _conv_args = [Arg() for _ in range(9)] + _computation_ops = [aten.convolution.default] + _computation_calls = [CallFunction(aten.convolution.default, *_conv_args, _users=1)] + + """ + In order to fuse add/sub/mul/div with conv, the dimensions of its + constant tensor must satisfy the following: + - with resizing, broadcast to w/ weight/bias tensor shape + - broadcast to the conv output shape + It needs to have a shape that can resize to weight/bias + tensor shape because we need to run the op with the conv + weights/bias without changing their sizes. + It needs to broadcast to the conv output shape so that we do + accidentally change the shape of op output by pre-fusing it + compared to eager. + The only dimension value shared by weight/bias/conv output + is they all contain a dim with value = channels-out. In the + conv output tensor, this is in the second dimension, + so the pointwise op tensor may have a second dimension of + value == channels-out, but all the other dimensions have to be 1 + """ + + def _op_not_broadcasting_with_conv(weight_tensor, other_tensor): + # According to opDoesNotBroadCastWithConv of frozen_conv_folding.cpp + weight_shape = weight_tensor.shape + other_shape = other_tensor.shape + if len(weight_shape) < len(other_shape): + return False + if len(weight_shape) == len(other_shape) + 1: + # weight shape is [o, i, *], other_shape is [o, 1...]. 
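+            # e.g. weight [64, 3, 3, 3] with other [64, 1, 1] passes (its
+            # leading dim matches channels-out and every other dim is 1),
+            # while other [64, 3, 1] fails the check below.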
+ for i in reversed(range(len(other_shape))): + if i == 0 and weight_shape[0] == other_shape[i]: + continue + if other_shape[i] != 1: + return False + else: + # weight shape is [o, i, *], other_shape is [1, i, *] + for i in reversed(range(len(other_shape))): + if i == 1 and weight_shape[0] == other_shape[i]: + continue + if other_shape[i] != 1: + return False + return True + + def _check_conv_and_broadcast_op(conv_node, other): + # According to checkConvAndBroadcastingOpPreConditions of frozen_conv_folding.cpp. + # conv.weight + if conv_node.args[1].op != "get_attr": + return False + # conv.bias + if conv_node.args[1] is not None and conv_node.args[1].op != "get_attr": + return False + if ( + not isinstance(other, int) + and not isinstance(other, float) + and other.op != "get_attr" + ): + return False + + if not len(conv_node.args[1].users) == 1: + return False + + weight_meta_value = conv_node.args[1].meta.get("val") + if weight_meta_value is None: + return False + # Avoid fusing op that causes type promotion + # restricting to float avoids int/float difficulties with scalar overload + if not weight_meta_value.is_floating_point(): + return False + if isinstance(other, torch.fx.Node) and other.op == "get_attr": + other_meta_value = other.meta.get("val") + if not other_meta_value.is_floating_point(): + return False + if ( + torch.promote_types(other_meta_value.dtype, weight_meta_value.dtype) + != weight_meta_value.dtype + ): + if not conv_node.meta.get("_allow_conv_mixed_dtype_folding", False): + return False + + if ( + other_meta_value.dtype != torch.float + and weight_meta_value.dtype not in (torch.float16, torch.bfloat16) + ): + return False + + if not _op_not_broadcasting_with_conv(weight_meta_value, other_meta_value): + return False + else: + # TODO: support scalar case + return False + + return True + + def _is_foldable_pattern(match): + binary_node = match.output_node() + computation_node = binary_node.args[0] + other = binary_node.args[1] + if binary_node.args[0].target not in _computation_ops: + computation_node = binary_node.args[1] + other = binary_node.args[0] + if binary_node.args[0].target == aten.convolution.default: + return _check_conv_and_broadcast_op(computation_node, other) + + return False + + def resize_scalar_or_tensor_to_shape(graph, other, shape): + # TODO: support scalar case + if other.meta.get("val").numel() == 1: + # expand errors if the shape input has less # dims than the tensor input + res = graph.create_node( + "call_function", + aten.reshape.default, + (other, (1,)), + ) + res = graph.create_node( + "call_function", + aten.expand.default, + (res, shape), + ) + else: + res = graph.create_node( + "call_function", + aten.reshape.default, + (other, shape), + ) + return res + + def _create_new_conv_node(graph, conv_node, binary_node, other): + assert conv_node.target == aten.convolution.default + conv_args = list(conv_node.args) + weight_meta_value = conv_node.args[1].meta.get("val") + bias = conv_args[2] + if binary_node.target in [aten.add.Tensor, aten.sub.Tensor]: + other_reshape = resize_scalar_or_tensor_to_shape( + graph, other, (weight_meta_value.size(0),) + ) + new_bias = graph.create_node( + "call_function", + binary_node.target, + (0 if bias is None else bias, other_reshape), + ) + conv_args[2] = new_bias + else: + assert binary_node.target in [aten.mul.Tensor, aten.div.Tensor] + weight_broadcast_shape = [1 for _ in range(len(weight_meta_value.shape))] + weight_broadcast_shape[0] = weight_meta_value.size(0) + other_reshape1 = 
resize_scalar_or_tensor_to_shape( + graph, other, tuple(weight_broadcast_shape) + ) + new_weight = graph.create_node( + "call_function", binary_node.target, (conv_args[1], other_reshape1) + ) + new_weight.meta.update(conv_args[1].meta) + conv_args[1] = new_weight + if bias is not None: + other_reshape = resize_scalar_or_tensor_to_shape( + graph, other, (weight_meta_value.size(0),) + ) + new_bias = graph.create_node( + "call_function", binary_node.target, (bias, other_reshape) + ) + new_bias.meta.update(bias.meta) + conv_args[2] = new_bias + return graph.create_node("call_function", conv_node.target, tuple(conv_args)) + + for _computation_call, binary_op in itertools.product( + _computation_calls, _binary_ops + ): + + @register_binary_folding_pattern( + CallFunction(binary_op, _computation_call, KeywordArg("other")), + extra_check=_is_foldable_pattern, + ) + def folded_op(match, *args, **kwargs): + counters["inductor"]["binary_folding"] += 1 + other = kwargs.get("other") + binary_node = match.output_node() + computation_node = ( + binary_node.args[0] + if binary_node.args[0].target in _computation_ops + else binary_node.args[1] + ) + graph = match.graph + with graph.inserting_before(binary_node): + # TODO: support linear? + assert computation_node.target == aten.convolution.default + new_computation_node = _create_new_conv_node( + graph, computation_node, binary_node, other + ) + binary_node.replace_all_uses_with(new_computation_node) + new_computation_node.meta.update(computation_node.meta) + graph.erase_node(binary_node) + graph.erase_node(computation_node) diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/decompose_mem_bound_mm.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/decompose_mem_bound_mm.py new file mode 100644 index 0000000000000000000000000000000000000000..10d1aaf3c8bf6d66d44ff9bac13d6c3a6bb3d818 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/decompose_mem_bound_mm.py @@ -0,0 +1,221 @@ +import logging +from typing import List, Optional + +import torch +from torch import Tensor +from torch._dynamo.utils import counters +from torch._inductor import utils + +from ..pattern_matcher import ( + Arg, + CallFunction, + config_flag, + Ignored, + Match, + register_graph_pattern, +) +from .post_grad import decompose_mm_pass + +aten = torch.ops.aten +log = logging.getLogger(__name__) + +# TODO: need a better strategy for decomposing mm +MIN_FIRST_DIMENSION_DECOMPOSITION = 10240 +MAX_OTHER_DIMENSION_DECOMPOSITION = 32 + + +def check_device(a: Tensor, b: Tensor) -> bool: + return a.is_cuda and b.is_cuda + + +def should_decompose_common( + mat1: Tensor, mat2: Tensor, input: Optional[Tensor] = None +) -> bool: + return ( + torch._inductor.config.decompose_mem_bound_mm + and check_device(mat1, mat2) + and not utils.any_is_symbolic(mat1, mat2, input) + ) + + +def should_decompose_bmm(mat1, mat2) -> bool: + if is_node_meta_valid(mat1) and is_node_meta_valid(mat2): + mat1 = mat1.meta["val"] + mat2 = mat2.meta["val"] + else: + return False + if not should_decompose_common(mat1, mat2): + return False + else: + if len(mat1.shape) != 3 or len(mat2.shape) != 3: + return False + if mat1.shape[0] < MIN_FIRST_DIMENSION_DECOMPOSITION: + return False + # 2 of m, n, k must be <= MAX_OTHER_DIMENSION_DECOMPOSITION + if (mat1.shape[1] < MAX_OTHER_DIMENSION_DECOMPOSITION) + ( + mat1.shape[2] < MAX_OTHER_DIMENSION_DECOMPOSITION + ) + (mat2.shape[2] < MAX_OTHER_DIMENSION_DECOMPOSITION) < 2: + return False + return True + + +def should_decompose_mm(mat1, mat2) -> bool: + if 
is_node_meta_valid(mat1) and is_node_meta_valid(mat2): + mat1 = mat1.meta["val"] + mat2 = mat2.meta["val"] + else: + return False + return ( + should_decompose_common(mat1, mat2) + and len(mat1.shape) == 2 + and len(mat2.shape) == 2 + and mat1.shape[0] >= MIN_FIRST_DIMENSION_DECOMPOSITION + and mat2.shape[0] < MAX_OTHER_DIMENSION_DECOMPOSITION + and mat2.shape[1] < MAX_OTHER_DIMENSION_DECOMPOSITION + ) + + +def should_decompose_mmt(mat1, mat2) -> bool: + if is_node_meta_valid(mat1) and is_node_meta_valid(mat2): + mat1 = mat1.meta["val"] + mat2 = mat2.meta["val"] + else: + return False + return ( + should_decompose_common(mat1, mat2) + and len(mat1.shape) == 2 + and len(mat2.shape) == 2 + and mat1.shape[0] >= MIN_FIRST_DIMENSION_DECOMPOSITION + and mat1.shape[1] < MAX_OTHER_DIMENSION_DECOMPOSITION + and mat2.shape[1] < MAX_OTHER_DIMENSION_DECOMPOSITION + ) + + +def should_decompose_mm_largek(mat1, mat2) -> bool: + if is_node_meta_valid(mat1) and is_node_meta_valid(mat2): + mat1 = mat1.meta["val"] + mat2 = mat2.meta["val"] + else: + return False + return ( + should_decompose_common(mat1, mat2) + and len(mat1.shape) == 2 + and len(mat2.shape) == 2 + and mat1.shape[1] >= MIN_FIRST_DIMENSION_DECOMPOSITION + and mat1.shape[0] < MAX_OTHER_DIMENSION_DECOMPOSITION + and mat2.shape[1] < MAX_OTHER_DIMENSION_DECOMPOSITION + ) + + +def is_node_meta_valid(node: torch.fx.Node): + return "val" in node.meta + + +def print_decompose_pattern(match: Match, inputs: List[torch.fx.Node]): + node = match.nodes[-1] + log.debug( + "Decompose %s with input shape: %s", + node.target, + ", ".join( + str(input.meta["val"].shape) if "val" in input.meta else "None" + for input in inputs + ), + ) + + +@register_graph_pattern( + CallFunction(aten.bmm, Arg(), Arg()), + pass_dict=decompose_mm_pass, + extra_check=config_flag("decompose_mem_bound_mm"), +) +def decompose_bmm(match: Match, mat1: torch.fx.Node, mat2: torch.fx.Node): + def repl(mat1, mat2): + return torch.sum(mat1[:, :, :, None] * mat2[:, None, :, :], dim=-2) + + if should_decompose_bmm(mat1, mat2): + counters["inductor"]["decompose_bmm"] += 1 + match.replace_by_example(repl, [mat1, mat2]) + print_decompose_pattern(match, [mat1, mat2]) + return + + +@register_graph_pattern( + CallFunction(aten.addmm, Arg(), Arg(), Arg()), + pass_dict=decompose_mm_pass, + extra_check=config_flag("decompose_mem_bound_mm"), +) +def decompose_addmm( + match: Match, + mat1: torch.fx.Node, + mat2: torch.fx.Node, + mat3: torch.fx.Node, +): + def repl(mat1, mat2, mat3): + return torch.sum(mat2[:, :, None] * mat3[None, :, :], dim=-2) + mat1 + + if should_decompose_mm(mat2, mat3): + counters["inductor"]["decompose_addmm"] += 1 + match.replace_by_example(repl, [mat1, mat2, mat3]) + print_decompose_pattern(match, [mat1, mat2, mat3]) + return + + +@register_graph_pattern( + CallFunction(aten.mm, CallFunction(aten.permute, Arg(), Ignored()), Arg()), + pass_dict=decompose_mm_pass, + extra_check=config_flag("decompose_mem_bound_mm"), +) +def decompose_mmt( + match: Match, + mat1: torch.fx.Node, + mat2: torch.fx.Node, +): + def repl(mat1, mat2): + return torch.sum(mat1[:, :, None] * mat2[:, None, :], dim=0) + + if should_decompose_mmt(mat1, mat2): + counters["inductor"]["decompose_mmt"] += 1 + match.replace_by_example(repl, [mat1, mat2]) + print_decompose_pattern(match, [mat1, mat2]) + return + + +@register_graph_pattern( + CallFunction(aten.mm, Arg(), Arg()), + pass_dict=decompose_mm_pass, + extra_check=config_flag("decompose_mem_bound_mm"), +) +def decompose_mm( + match: Match, + mat1: 
torch.fx.Node, + mat2: torch.fx.Node, +): + def repl(mat1, mat2): + return torch.sum(mat1[:, :, None] * mat2[None, :, :], dim=-2) + + if should_decompose_mm(mat1, mat2): + counters["inductor"]["decompose_mm"] += 1 + match.replace_by_example(repl, [mat1, mat2]) + print_decompose_pattern(match, [mat1, mat2]) + return + + +@register_graph_pattern( + CallFunction(aten.mm, Arg(), Arg()), + pass_dict=decompose_mm_pass, + extra_check=config_flag("decompose_mem_bound_mm"), +) +def decompose_mm_large_k( + match: Match, + mat1: torch.fx.Node, + mat2: torch.fx.Node, +): + def repl(mat1, mat2): + mat1 = mat1.permute(1, 0) + return torch.sum(mat1[:, :, None] * mat2[:, None, :], dim=0) + + if should_decompose_mm_largek(mat1, mat2): + counters["inductor"]["decompose_mm_large_k"] += 1 + match.replace_by_example(repl, [mat1, mat2]) + print_decompose_pattern(match, [mat1, mat2]) + return diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/dedupe_symint_uses.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/dedupe_symint_uses.py new file mode 100644 index 0000000000000000000000000000000000000000..0df666affd75ab02b3b62c67d8fdd65bc3171fb5 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/dedupe_symint_uses.py @@ -0,0 +1,78 @@ +from dataclasses import dataclass +from typing import Union + +import torch +from torch.fx.experimental.proxy_tensor import py_sym_types, SymBool, SymFloat, SymInt + + +@dataclass +class _SymExprHash: + """ + Hash for a py_sym_types that will use the underlying sympy expression + """ + + sym_obj: Union[SymInt, SymFloat, SymBool] + + def __hash__(self) -> int: + return hash((type(self.sym_obj), self.sym_obj.node.expr)) + + def __eq__(self, value) -> bool: + if not isinstance(value, _SymExprHash): + return False + return self.sym_obj.node.expr == value.sym_obj.node.expr + + +class _SymHashingDict: + """ + Wrapper around a dictionary that will convert sym types to hash with _SymExprHash and reuse + existing sym proxies. + + SymPy hash is not always reliable so optimistically hash sympy expression, and if those fail, + fallback to symnodes. + """ + + def __init__(self): + self.sym_hash_dict = {} + + def __setitem__(self, key, value): + self.sym_hash_dict.__setitem__(self._wrap_to_sym_expr_hash(key), value) + + def __getitem__(self, key): + return self.sym_hash_dict[self._wrap_to_sym_expr_hash(key)] + + def __contains__(self, key): + return self._wrap_to_sym_expr_hash(key) in self.sym_hash_dict + + def get(self, key, default=None): + return self.sym_hash_dict.get(self._wrap_to_sym_expr_hash(key), default) + + def _wrap_to_sym_expr_hash(self, key): + return _SymExprHash(key) if isinstance(key, py_sym_types) else key + + +def dedupe_symints(graph: torch.fx.Graph): + """ + Dedupes sym ints in the graph to nodes are resolvable to symint graph inputs. + + We only dedupe from graph inputs to avoid adding a potential dependency in the forward + from the backward. 
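+    For example, a node that re-derives a symbolic expression already produced
+    by an earlier node has its uses redirected to that earlier node and is then
+    erased; non-placeholder nodes are only recorded as canonical when all of
+    their inputs are themselves resolvable from input symints.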
+ + """ + + sym_dict = _SymHashingDict() + resolvable_from_input_symints = set() + + for node in graph.nodes: + val = node.meta.get("val", None) + if val is None or not isinstance(val, py_sym_types): + continue + + if node.op == "placeholder": + resolvable_from_input_symints.add(node) + sym_dict[val] = node + elif existing_node := sym_dict.get(val): + node.replace_all_uses_with(existing_node) + graph.erase_node(node) + elif all(n in resolvable_from_input_symints for n in node.all_input_nodes): + sym_dict[val] = node + resolvable_from_input_symints.add(node) diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/efficient_conv_bn_eval.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/efficient_conv_bn_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..8712cf39a2a2e27b4e14c927324f3d2826a4e502 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/efficient_conv_bn_eval.py @@ -0,0 +1,157 @@ +import torch +import torch.nn as nn + +from torch._dynamo.utils import counters +from torch._inductor import config as inductor_config +from torch.func import functional_call + +from ..pattern_matcher import CallModuleVarArgs, Match, register_graph_pattern + +from .pre_grad import efficient_conv_bn_eval_pass + + +def efficient_conv_bn_eval( + bn: nn.modules.batchnorm._BatchNorm, conv: nn.modules.conv._ConvNd, x: torch.Tensor +): + """ + Implementation based on https://arxiv.org/abs/2305.11624 + "Tune-Mode ConvBN Blocks For Efficient Transfer Learning" + It leverages the associative law between convolution and affine transform, + i.e., normalize (weight conv feature) = (normalize weight) conv feature. + It works for Eval mode of ConvBN blocks during validation, and can be used + for **training** as well, but only if one sets `bn.training=False`. It + reduces memory footprint and computation cost, at the cost of slightly + reduced numerical stability. + Args: + bn (nn.modules.batchnorm._BatchNorm): a BatchNorm module. + conv (nn.modules.conv._ConvNd): a conv module + x (torch.Tensor): Input feature map. + """ + + assert bn.running_var is not None + + # These lines of code are designed to deal with various cases + # like bn without affine transform, and conv without bias + weight_on_the_fly = conv.weight + if conv.bias is not None: + bias_on_the_fly = conv.bias + else: + bias_on_the_fly = torch.zeros_like(bn.running_var) + + if bn.weight is not None: + bn_weight = bn.weight + else: + bn_weight = torch.ones_like(bn.running_var) + + if bn.bias is not None: + bn_bias = bn.bias + else: + bn_bias = torch.zeros_like(bn.running_var) + + # shape of [C_out, 1, 1, 1] in Conv2d + target_shape = [-1] + [1] * (conv.weight.ndim - 1) + if isinstance(conv, nn.modules.conv._ConvTransposeNd): + # for transposed conv, the C_out dimension should at index 1. 
+ target_shape[:2] = [target_shape[1], target_shape[0]] + weight_coeff = torch.rsqrt(bn.running_var + bn.eps).reshape(target_shape) + # shape of [C_out, 1, 1, 1] in Conv2d + coefff_on_the_fly = bn_weight.view_as(weight_coeff) * weight_coeff + + # shape of [C_out, C_in, k, k] in Conv2d + weight_on_the_fly = weight_on_the_fly * coefff_on_the_fly + # shape of [C_out] in Conv2d + bias_on_the_fly = bn_bias + coefff_on_the_fly.flatten() * ( + bias_on_the_fly - bn.running_mean + ) + + input = x + params = {"weight": weight_on_the_fly, "bias": bias_on_the_fly} + output = functional_call(conv, params, input) + return output + + +@register_graph_pattern( + CallModuleVarArgs( + [ + nn.modules.batchnorm._BatchNorm, + nn.BatchNorm1d, + nn.BatchNorm2d, + nn.BatchNorm3d, + nn.SyncBatchNorm, + ], + ), + pass_dict=efficient_conv_bn_eval_pass, + extra_check=lambda match: not inductor_config.freezing + and inductor_config.efficient_conv_bn_eval_fx_passes, +) +def efficient_conv_bn_eval_graph_transform(match: Match, *args, **kwargs): + # We matched a BN node + bn_node = match.nodes[0] + graph = match.graph + gm = graph.owning_module + bn_mod = getattr(gm, bn_node.target) # type: ignore[arg-type] + + # We can only use efficient conv-bn for eval mode with track_running_stats + if not bn_mod.track_running_stats or bn_mod.training: + return + + # Check if the input is Conv + if bn_node.args: + input_node = bn_node.args[0] + else: + input_node = bn_node.kwargs["input"] + if input_node.op != "call_module": # type: ignore[union-attr] + return + if not hasattr(gm, input_node.target): # type: ignore[arg-type, union-attr] + return + input_mod = getattr(gm, input_node.target) # type: ignore[arg-type, union-attr] + supported_convs = [ + nn.Linear, + nn.Conv1d, + nn.Conv2d, + nn.Conv3d, + nn.ConvTranspose1d, + nn.ConvTranspose2d, + nn.ConvTranspose3d, + ] + if not any(isinstance(input_mod, cls) for cls in supported_convs): + return + conv_node = input_node + # Output of conv is used by other nodes, cannot optimize + if len(conv_node.users) > 1: # type: ignore[union-attr] + return + + # Find a pair of conv and bn computation nodes to optimize. + counters["inductor"]["efficient_conv_bn_eval"] += 1 + + with graph.inserting_before(conv_node): + # create `get_attr` node to access modules + # note that we directly call `create_node` to fill the `name` + # argument. `graph.get_attr` and + # `graph.call_function` does not allow the `name` argument. 
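+            # (fetching the live modules via `get_attr` means the fused call
+            # sees the current parameters and running stats, rather than baking
+            # them into the graph as constants)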
+ conv_get_node = graph.create_node( + op="get_attr", target=conv_node.target, name="get_conv" # type: ignore[union-attr] + ) + bn_get_node = graph.create_node( + op="get_attr", target=bn_node.target, name="get_bn" + ) + if conv_node.args: # type: ignore[union-attr] + conv_input = conv_node.args[0] # type: ignore[union-attr] + else: + conv_input = conv_node.kwargs["input"] # type: ignore[union-attr] + # prepare args for the fused function + args = (bn_get_node, conv_get_node, conv_input) + # create a new node + new_node = graph.create_node( + op="call_function", + target=efficient_conv_bn_eval, + args=args, + name="efficient_conv_bn_eval", + ) + # this node replaces the original conv + bn, and therefore + # should replace the uses of bn_node + bn_node.replace_all_uses_with(new_node) + # take care of the deletion order: + # delete bn_node first, and then conv_node + graph.erase_node(bn_node) + graph.erase_node(conv_node) diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/freezing_patterns.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/freezing_patterns.py new file mode 100644 index 0000000000000000000000000000000000000000..02f679925c534798e4ee6a11ad25e2f256b80196 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/freezing_patterns.py @@ -0,0 +1,212 @@ +import functools + +import torch +from torch._inductor.compile_fx import fake_tensor_prop +from ..._dynamo.utils import counters + +from .. import config +from ..pattern_matcher import ( + _return_true, + CallFunction, + fwd_only, + Ignored, + init_once_fakemode, + KeywordArg, + Match, + PatternMatcherPass, + register_graph_pattern, + register_replacement, + stable_topological_sort, +) + +aten = torch.ops.aten + +# First pass_patterns[0] are applied, then [1], then [2] +pass_patterns = [ + PatternMatcherPass(), + PatternMatcherPass(), + PatternMatcherPass(), +] + +binary_folding_pass = PatternMatcherPass() + + +def freezing_passes(gm: torch.fx.GraphModule, aot_example_inputs): + """ + Passes that are applied to the graph to freeze pass. + """ + + from ..freezing import constant_fold + + lazy_init() + # We need a few rounds of binary folding to get rid of all the + # unnecessary nodes, but may need a good method to chose the rounds number. + # works like: conv+binary+binary. + binary_folding = counters["inductor"]["binary_folding"] + fake_tensor_prop(gm, aot_example_inputs, True) + + torch._inductor.fx_passes.binary_folding.mark_mixed_dtype_allowed_convs(gm) + for _ in range(4): + constant_fold(gm) + # Make sure meta['val'] is properly set for all nodes + fake_tensor_prop(gm, aot_example_inputs, True) + binary_folding_pass.apply(gm.graph) # type: ignore[arg-type] + # If we don't have binary folding, we don't need to run the pass again. + # TODO: remove the need to run fake_tensor_prop on the whole model. + if counters["inductor"]["binary_folding"] == binary_folding: + break + binary_folding = counters["inductor"]["binary_folding"] + + torch._inductor.fx_passes.binary_folding.recover_original_precision_folded_convs(gm) + + constant_fold(gm) + fake_tensor_prop(gm, aot_example_inputs, True) + + for pattern in pass_patterns: + pattern.apply(gm.graph) # type: ignore[arg-type] + + # The CPU weight packing always assume the conv's weight is channels last, + # So make sure the layout_optimization is on when doing it. 
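+    # (hence the gate on config.layout_optimization below, next to the mkldnn
+    # and weight_prepack checks)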
+ if ( + torch._C._has_mkldnn + and config.cpp.weight_prepack + and config.layout_optimization + ): + from .mkldnn_fusion import _eliminate_duplicate_packed_nodes + + _eliminate_duplicate_packed_nodes(gm) + + stable_topological_sort(gm.graph) + gm.recompile() + gm.graph.lint() + + +@init_once_fakemode +def lazy_init(): + if torch._C._has_mkldnn and config.cpp.weight_prepack: + from .mkldnn_fusion import _mkldnn_weight_pack_init + + _mkldnn_weight_pack_init() + + from .binary_folding import binary_folding_init + + addmm_patterns_init() + binary_folding_init() + + +def register_freezing_graph_pattern(pattern, extra_check=_return_true, pass_number=0): + return register_graph_pattern( + pattern, + extra_check=extra_check, + pass_dict=pass_patterns[pass_number], + ) + + +def register_binary_folding_pattern(pattern, extra_check=_return_true): + return register_graph_pattern( + pattern, + extra_check=extra_check, + pass_dict=binary_folding_pass, + ) + + +@functools.lru_cache(None) +def addmm_patterns_init(): + if torch.cuda.is_available(): + # workaround https://github.com/pytorch/pytorch/issues/97894 + device = "cuda" + else: + device = "cpu" + val = functools.partial(torch.empty, (10, 10), device=device, requires_grad=False) + + def check_concat_weights(match): + weights = [ + match.kwargs["w1"], + match.kwargs["w2"], + ] + if "w3" in match.kwargs: + weights.append(match.kwargs["w3"]) + + return all( + w.op == "get_attr" and w.meta["val"].shape == weights[0].meta["val"].shape + for w in weights + ) + + def matmul_fuse_pattern(inp, w1, w2, w3): + return (inp @ w1, inp @ w2, inp @ w3) + + def matmul_replacement(inp, w1, w2, w3): + cat_t = torch.cat((w1, w2, w3), dim=1) + mm = inp @ cat_t + return mm.chunk(3, dim=1) + + register_replacement( + matmul_fuse_pattern, + matmul_replacement, + [val(), val(), val(), val()], + fwd_only, + pass_patterns[0], + extra_check=check_concat_weights, + exclusive_arg_names=("w1", "w2", "w3"), + ) + + def matmul_fuse_pattern_two(inp, w1, w2): + return (inp @ w1, inp @ w2) + + def matmul_replacement_two(inp, w1, w2): + cat_t = torch.cat((w1, w2), dim=1) + mm = inp @ cat_t + return mm.chunk(2, dim=1) + + register_replacement( + matmul_fuse_pattern_two, + matmul_replacement_two, + [val(), val(), val()], + fwd_only, + pass_patterns[0], + extra_check=check_concat_weights, + exclusive_arg_names=("w1", "w2"), + ) + + def addmm_fuse_pattern_second(inp, w1, w2, w3, b1, b2, b3): + return ( + aten.addmm(b1, inp, w1), + aten.addmm(b2, inp, w2), + aten.addmm(b3, inp, w3), + ) + + def addmm_fuse_replacement_second(inp, w1, w2, w3, b1, b2, b3): + cat_w = torch.cat((w1, w2, w3), dim=1) + cat_b = torch.cat((b1, b2, b3)) + return aten.addmm(cat_b, inp, cat_w).chunk(3, dim=1) + + register_replacement( + addmm_fuse_pattern_second, + addmm_fuse_replacement_second, + [val() for _ in range(7)], + fwd_only, + pass_patterns[0], + extra_check=check_concat_weights, + exclusive_arg_names=("w1", "w2", "w3", "b1", "b2", "b3"), + ) + + +def same_dtype(match): + return match.output_node().args[0].meta["val"].dtype == match.kwargs["dtype"] + + +@register_graph_pattern( + CallFunction( + torch.ops.prims.convert_element_type.default, + Ignored(), + KeywordArg("dtype"), + ), + pass_dict=pass_patterns[0], + extra_check=same_dtype, +) +def unnecessary_dtype_convert(match: Match, **kwargs): + """Remove unnecessary dtype conversion op, probably left as a result of Conv-Bn folding""" + graph = match.graph + node = match.output_node() + node.replace_all_uses_with(node.args[0]) + graph.erase_node(node) diff 
--git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/fuse_attention.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/fuse_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..a47ec3558d114009135664e20025af2dd4cb32e7 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/fuse_attention.py @@ -0,0 +1,786 @@ +import functools +import inspect +import logging +import math + +import torch +from ..._dynamo.utils import counters +from ..pattern_matcher import ( + filter_nodes, + fwd_only, + joint_fwd_bwd, + register_replacement, +) + +log = logging.getLogger(__name__) +aten = torch.ops.aten + + +def _sfdp_pattern_1(query, key, value, inv_scale): + return ( + torch.matmul(query, key.transpose(-2, -1)) + .div(inv_scale) + .softmax(dim=-1) + .matmul(value) + ) + + +def _sfdp_replacement_1(query, key, value, inv_scale): + counters["inductor"]["fuse_attention"] += 1 + return aten.scaled_dot_product_attention( + query.contiguous(), + key.contiguous(), + value.contiguous(), + attn_mask=None, + dropout_p=0.0, + is_causal=False, + scale=1.0 / inv_scale, + ) + + +def _sfdp_pattern_2(query, key, value, scale_factor): + return ( + torch.matmul(query, key.transpose(-2, -1)) + .mul(scale_factor) + .softmax(dim=-1) + .matmul(value) + ) + + +def _sfdp_replacement_2(query, key, value, scale_factor): + counters["inductor"]["fuse_attention"] += 1 + return aten.scaled_dot_product_attention( + query.contiguous(), + key.contiguous(), + value.contiguous(), + attn_mask=None, + dropout_p=0.0, + is_causal=False, + scale=scale_factor, + ) + + +def _sfdp_pattern_3(query, key, value, inv_scale_factor, dropout_p): + return torch.nn.functional.dropout( + torch.matmul(query, key.transpose(-2, -1)) + .div(inv_scale_factor) + .softmax(dim=-1), + p=dropout_p, + ).matmul(value) + + +def _sfdp_replacement_3(query, key, value, inv_scale_factor, dropout_p): + counters["inductor"]["fuse_attention"] += 1 + return aten.scaled_dot_product_attention( + query.contiguous(), + key.contiguous(), + value.contiguous(), + attn_mask=None, + dropout_p=dropout_p, + is_causal=False, + scale=1.0 / inv_scale_factor, + ) + + +def _sfdp_pattern_4(query, key, value, scale_factor, dropout_p): + return torch.nn.functional.dropout( + torch.matmul(query, key.transpose(-2, -1)).mul(scale_factor).softmax(dim=-1), + p=dropout_p, + ).matmul(value) + + +def _sfdp_replacement_4(query, key, value, scale_factor, dropout_p): + counters["inductor"]["fuse_attention"] += 1 + return aten.scaled_dot_product_attention( + query.contiguous(), + key.contiguous(), + value.contiguous(), + attn_mask=None, + dropout_p=dropout_p, + is_causal=False, + scale=scale_factor, + ) + + +def _sfdp_pattern_5(query, key, value, attn_mask): + attn_weight = torch.softmax( + (query @ key.transpose(-2, -1) / math.sqrt(query.size(-1))) + attn_mask, dim=-1 + ) + # attn_weight = torch.dropout(attn_weight, dropout_p) + return attn_weight @ value + + +def _sfdp_replacement_5(query, key, value, attn_mask): + counters["inductor"]["fuse_attention"] += 1 + return aten.scaled_dot_product_attention( + query.contiguous(), + key.contiguous(), + value.contiguous(), + attn_mask=attn_mask.to(dtype=query.dtype), + dropout_p=0.0, + is_causal=False, + ) + + +def _sfdp_pattern_6(query, key, value, attn_mask, dropout_p): + attn_weight = torch.softmax( + (query @ key.transpose(-2, -1) / math.sqrt(query.size(-1))) + attn_mask, dim=-1 + ) + attn_weight = torch.dropout(attn_weight, dropout_p, True) + return attn_weight @ value + + +def _sfdp_replacement_6(query, key, 
value, attn_mask, dropout_p): + counters["inductor"]["fuse_attention"] += 1 + return aten.scaled_dot_product_attention( + query.contiguous(), + key.contiguous(), + value.contiguous(), + attn_mask=attn_mask.to(dtype=query.dtype), + dropout_p=dropout_p, + is_causal=False, + ) + + +def _sfdp_pattern_7(query, key, value, dropout_p): + # in real workloads inputs to matmul are permuted + # causing matmul to expand to a series of expand and clone calls + # we want the same to happen during pattern tracing + q = query.permute(0, 2, 1, 3) + k = key.permute(0, 2, 1, 3) + v = value.permute(0, 2, 1, 3) + div = q @ k.transpose(-2, -1) / math.sqrt(q.size(-1)) + div = div.to(torch.float32) + attn_weight = torch.softmax(div, dim=-1) + attn_weight = torch.dropout(attn_weight, dropout_p, True) + attn_weight = attn_weight.to(torch.float16) + return attn_weight @ v + + +def _sfdp_replacement_7(query, key, value, dropout_p): + # sdpa prefers inputs in permuted format + # it makes a copy to put them in this format + # if they aren't already + # to make replacement efficient ensure that inputs to sdpa + # are in required order + counters["inductor"]["fuse_attention"] += 1 + q = query.permute(0, 2, 1, 3) + k = key.permute(0, 2, 1, 3) + v = value.permute(0, 2, 1, 3) + return aten.scaled_dot_product_attention( + q, + k, + v, + attn_mask=None, # attn_mask, + dropout_p=dropout_p, + is_causal=False, + ) + + +def _sfdp_pattern_8(query, key, value): + # no dropout version of pattern 7 + q = query.permute(0, 2, 1, 3) + k = key.permute(0, 2, 1, 3) + v = value.permute(0, 2, 1, 3) + div = q @ k.transpose(-2, -1) / math.sqrt(q.size(-1)) + div = div.to(torch.float32) + attn_weight = torch.softmax(div, dim=-1) + attn_weight = attn_weight.to(torch.float16) + return attn_weight @ v + + +def _sfdp_replacement_8(query, key, value): + counters["inductor"]["fuse_attention"] += 1 + q = query.permute(0, 2, 1, 3) + k = key.permute(0, 2, 1, 3) + v = value.permute(0, 2, 1, 3) + return aten.scaled_dot_product_attention( + q, + k, + v, + attn_mask=None, # attn_mask, + dropout_p=0.0, + is_causal=False, + ) + + +def _sfdp_pattern_9(query, key, value, dropout_p): + q = query.permute(0, 2, 1, 3) + k = key.permute(0, 2, 1, 3) + v = value.permute(0, 2, 1, 3) + q = q / math.sqrt(q.size(-1)) + div = q @ k.transpose(-2, -1) + div = div.to(torch.float32) + attn_weight = torch.softmax(div, dim=-1) + attn_weight = torch.dropout(attn_weight, dropout_p, True) + attn_weight = attn_weight.to(torch.float16) + return attn_weight @ v + + +def _sfdp_replacement_9(query, key, value, dropout_p): + counters["inductor"]["fuse_attention"] += 1 + q = query.permute(0, 2, 1, 3) + k = key.permute(0, 2, 1, 3) + v = value.permute(0, 2, 1, 3) + return aten.scaled_dot_product_attention( + q, + k, + v, + attn_mask=None, # attn_mask, + dropout_p=dropout_p, + is_causal=False, + ) + + +def _sfdp_pattern_10(query, key, value): + # no dropout version of 9 + q = query.permute(0, 2, 1, 3) + k = key.permute(0, 2, 1, 3) + v = value.permute(0, 2, 1, 3) + q = q / math.sqrt(q.size(-1)) + div = q @ k.transpose(-2, -1) + div = div.to(torch.float32) + attn_weight = torch.softmax(div, dim=-1) + attn_weight = attn_weight.to(torch.float16) + return attn_weight @ v + + +def _sfdp_replacement_10(query, key, value): + counters["inductor"]["fuse_attention"] += 1 + q = query.permute(0, 2, 1, 3) + k = key.permute(0, 2, 1, 3) + v = value.permute(0, 2, 1, 3) + return aten.scaled_dot_product_attention( + q, + k, + v, + attn_mask=None, # attn_mask, + dropout_p=0.0, + is_causal=False, + ) + + +def 
_sfdp_pattern_11(query, key, value, inv_scale): + # Mainly for huggingface models + q = query.permute(0, 2, 1, 3) + k = key.permute(0, 2, 1, 3) + v = value.permute(0, 2, 1, 3) + return torch.matmul(q, k.transpose(-2, -1)).div(inv_scale).softmax(dim=-1).matmul(v) + + +def _sfdp_replacement_11(query, key, value, inv_scale): + counters["inductor"]["fuse_attention"] += 1 + return aten.scaled_dot_product_attention( + query.transpose(1, 2), + key.transpose(1, 2), + value.transpose(1, 2), + attn_mask=None, + dropout_p=0.0, + is_causal=False, + scale=1.0 / inv_scale, + ) + + +def _sfdp_pattern_12(query, key, value, inv_scale_factor, dropout_p): + q = query.permute(0, 2, 1, 3) + k = key.permute(0, 2, 1, 3) + v = value.permute(0, 2, 1, 3) + return torch.nn.functional.dropout( + torch.matmul(q, k.transpose(-2, -1)).div(inv_scale_factor).softmax(dim=-1), + p=dropout_p, + ).matmul(v) + + +def _sfdp_replacement_12(query, key, value, inv_scale_factor, dropout_p): + counters["inductor"]["fuse_attention"] += 1 + return aten.scaled_dot_product_attention( + query.transpose(1, 2), + key.transpose(1, 2), + value.transpose(1, 2), + attn_mask=None, + dropout_p=dropout_p, + is_causal=False, + scale=1.0 / inv_scale_factor, + ) + + +def _sfdp_pattern_13(query, key, value, dropout_p): + attn_weight = torch.bmm(query, key.transpose(1, 2)).softmax(dim=-1) + attn_weight = torch.nn.functional.dropout(attn_weight, p=dropout_p) + return torch.bmm(attn_weight, value) + + +def _sfdp_replacement_13(query, key, value, dropout_p): + counters["inductor"]["fuse_attention"] += 1 + return aten.scaled_dot_product_attention( + query.unsqueeze(0), + key.unsqueeze(0), + value.unsqueeze(0), + dropout_p=dropout_p, + scale=1.0, + ).squeeze(0) + + +def _sfdp_pattern_14(query, key, value, attn_mask, inv_scale): + # for BertLarge + # Permutations are needed to create clones in graph. + q = query.permute([0, 2, 1, 3]) + k = key.permute([0, 2, 1, 3]) + v = value.permute([0, 2, 1, 3]) + return ( + (torch.matmul(q, k.transpose(-2, -1)).div(inv_scale) + attn_mask) + .softmax(dim=-1) + .matmul(v) + ) + + +def _sfdp_replacement_14(query, key, value, attn_mask, inv_scale): + counters["inductor"]["fuse_attention"] += 1 + return aten.scaled_dot_product_attention( + query.transpose(1, 2), + key.transpose(1, 2), + value.transpose(1, 2), + attn_mask=attn_mask.to(dtype=query.dtype), + dropout_p=0.0, + is_causal=False, + scale=1.0 / inv_scale, + ) + + +def _sfdp_pattern_15(query, key, value, attn_mask, inv_scale): + # for DistilBert + # Permutations are needed to create clones in graph. 
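+    # (as noted for the earlier patterns: real workloads feed permuted q/k/v
+    # into matmul, which materializes expand/clone nodes, and tracing the
+    # pattern the same way keeps it in sync with what the matcher sees)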
+ q = query.permute([0, 2, 1, 3]) + k = key.permute([0, 2, 1, 3]) + v = value.permute([0, 2, 1, 3]) + bs = q.size(0) + k_len = k.size(-2) + scores = q @ k.transpose(-2, -1) + scores = scores.div(inv_scale) + fill_value = torch.full((), -float("inf"), dtype=query.dtype, device=query.device) + attn_mask = (attn_mask == 0).view((bs, 1, 1, k_len)).expand_as(scores) + return torch.softmax(scores.masked_fill(attn_mask, fill_value), dim=-1) @ v + + +def _sfdp_replacement_15(query, key, value, attn_mask, inv_scale): + counters["inductor"]["fuse_attention"] += 1 + bs = query.size(0) + n_head = query.size(2) + q_len = query.size(1) + k_len = key.size(1) + # do attn_mask->logical_not() in aten.scaled_dot_product_attention + attn_mask = ( + (attn_mask == 1).view((bs, 1, 1, k_len)).expand((bs, n_head, q_len, k_len)) + ) + return aten.scaled_dot_product_attention( + query.transpose(1, 2), + key.transpose(1, 2), + value.transpose(1, 2), + attn_mask=attn_mask.to(dtype=torch.bool), + dropout_p=0.0, + is_causal=False, + scale=1.0 / inv_scale, + ) + + +def _sfdp_pattern_16(query, key, value, attn_mask, inv_scale, dropout_p): + # for BertLarge with dropout + q = query.permute([0, 2, 1, 3]) + k = key.permute([0, 2, 1, 3]) + v = value.permute([0, 2, 1, 3]) + return ( + torch.nn.functional.dropout( + (torch.matmul(q, k.transpose(-2, -1)).div(inv_scale) + attn_mask).softmax( + dim=-1 + ), + dropout_p, + ) + .to(dtype=query.dtype) + .matmul(v) + ) + + +def _sfdp_replacement_16(query, key, value, attn_mask, inv_scale, dropout_p): + counters["inductor"]["fuse_attention"] += 1 + return aten.scaled_dot_product_attention( + query.transpose(1, 2), + key.transpose(1, 2), + value.transpose(1, 2), + attn_mask=attn_mask.to(dtype=query.dtype), + dropout_p=dropout_p, + is_causal=False, + scale=1.0 / inv_scale, + ) + + +def _sfdp_pattern_17(query, key, value, attn_mask, inv_scale, dropout_p): + # for DistilBert with dropout + q = query.permute([0, 2, 1, 3]) + k = key.permute([0, 2, 1, 3]) + v = value.permute([0, 2, 1, 3]) + bs = q.size(0) + k_len = k.size(-2) + scores = q @ k.transpose(-2, -1) + scores = scores.div(inv_scale) + fill_value = torch.full((), -float("inf"), dtype=query.dtype, device=query.device) + attn_mask = (attn_mask == 0).view((bs, 1, 1, k_len)).expand_as(scores) + return ( + torch.nn.functional.dropout( + torch.softmax(scores.masked_fill(attn_mask, fill_value), dim=-1), dropout_p + ) + @ v + ) + + +def _sfdp_replacement_17(query, key, value, attn_mask, inv_scale, dropout_p): + counters["inductor"]["fuse_attention"] += 1 + bs = query.size(0) + n_head = query.size(2) + q_len = query.size(1) + k_len = key.size(1) + # do attn_mask->logical_not() in aten.scaled_dot_product_attention + attn_mask = ( + (attn_mask == 1).view((bs, 1, 1, k_len)).expand((bs, n_head, q_len, k_len)) + ) + return aten.scaled_dot_product_attention( + query.transpose(1, 2), + key.transpose(1, 2), + value.transpose(1, 2), + attn_mask=attn_mask.to(dtype=torch.bool), + dropout_p=dropout_p, + is_causal=False, + scale=1.0 / inv_scale, + ) + + +def _sfdp_params_check(match): + assert all(k in match.kwargs for k in ("query", "key", "value")) + query = match.kwargs["query"].meta["val"] + key = match.kwargs["key"].meta["val"] + value = match.kwargs["value"].meta["val"] + if not (query.dtype == key.dtype == value.dtype) or not ( + query.device == key.device == value.device + ): + return False + add_mask_node = filter_nodes(match.nodes, aten.add.Tensor) + # Has attn_mask add. 
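+    # (patterns without an explicit mask produce no aten.add.Tensor node here,
+    # so they skip the mask checks entirely)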
+ if len(add_mask_node) > 0: + attn_mask_node = add_mask_node[0].args[1] + # attn_mask_node may be a float/int number. + if not hasattr(attn_mask_node, "meta"): + return False + attn_mask = attn_mask_node.meta["val"] # type: ignore[union-attr] + # Make sure attn_mask.dtype == query.dtype or attn_mask.dtype == torch.bool + # attn_mask.dtype == torch.float for models like albert. + if ( + not isinstance(attn_mask, torch.Tensor) + or not ( + attn_mask.dtype == query.dtype + or attn_mask.dtype == torch.bool + or attn_mask.dtype == torch.float + ) + or query.device != attn_mask.device + ): + return False + return True + + +def _sfdp_extra_check(scale_factor_op, disable_cuda=False): + def fn(match): + scale_factor_node = filter_nodes(match.nodes, scale_factor_op)[0] + # Note: args[1] of the scale_factor_node is always the scale_factor for the current patterns. + scale_factor = scale_factor_node.args[1] + # make sure the scale_factor a float/int. SymInt? + if not isinstance(scale_factor, (float, int)): + return False + if ( + disable_cuda + and "query" in match.kwargs + and "cuda" in str(match.kwargs["query"].meta["val"].device) + ): + return False + return _sfdp_params_check(match) + + return fn + + +def partialize_and_update_signature(func, **kwargs): + """ + Equivalent to functools.partial but also updates the signature on returned function + """ + original_sig = inspect.signature(func) + parameters = original_sig.parameters + + new_parameters = { + key: value for key, value in parameters.items() if key not in kwargs + } + new_sig = inspect.Signature(parameters=list(new_parameters.values())) + + partial_func = functools.partial(func, **kwargs) + + def wrapper(*args, **kwargs): + return partial_func(*args, **kwargs) + + wrapper.__signature__ = new_sig # type: ignore[attr-defined] + wrapper.__name__ = func.__name__ + + return wrapper + + +def _get_sfdp_patterns(): + from .joint_graph import patterns + + if torch.cuda.is_available(): + # workaround https://github.com/pytorch/pytorch/issues/97894 + device = "cuda" + else: + device = "cpu" + + # sizes/values don't actually matter for initial trace + # once we get a possible match we re-trace with the actual values and verify the match still holds + g_inp = functools.partial( + torch.empty, (2, 4, 8, 16), device=device, requires_grad=True + ) + # attn_mask + b_inp = functools.partial(torch.empty, (1, 1, 8, 8), device=device) + m_inp = functools.partial(torch.empty, (2, 1, 1, 4), device=device) + # inv_scale + c_inp = functools.partial(torch.tensor, 2.0, device=device) + # workaround https://github.com/pytorch/pytorch/issues/97894 + # 0.113377 is a "magic" value that lets us recover the lost input arg relationship + d = {"dropout_p": 0.113377} + + # we could also generate all these patterns in 3d.. TODO + g_3d_inp = functools.partial( + torch.empty, (1024, 128, 128), device=device, requires_grad=True + ) + + # reshape in matmul decomposition generates a clone when batch_size>1 due to the memory layout change. + # however when batch_size=1, reshape does not change the memory layout, so clone would not be generated. + # here we need to trace with input of batch_size=1 to generate a pattern graph without clone. 
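+    # (this is why some candidates below are registered twice, once with the
+    # regular g()/m() inputs and once with the *_bs1 variants)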
+ g_bs1_inp = functools.partial( + torch.empty, (1, 4, 8, 16), device=device, requires_grad=True + ) + m_bs1_inp = functools.partial(torch.empty, (1, 1, 1, 4), device=device) + + # softmax will generate a dtype conversion on inputs if they are in half, + # but will not in float, so we generate a pattern for both + for dtype in [torch.float, torch.half]: + g = functools.partial(g_inp, dtype=dtype) + b = functools.partial(b_inp, dtype=dtype) + m = functools.partial(m_inp, dtype=dtype) + m_float = functools.partial(m_inp, dtype=torch.float) + c = functools.partial(c_inp, dtype=dtype) + g_3d = functools.partial(g_3d_inp, dtype=dtype) + g_bs1 = functools.partial(g_bs1_inp, dtype=dtype) + m_bs1 = functools.partial(m_bs1_inp, dtype=dtype) + m_bs1_float = functools.partial(m_bs1_inp, dtype=torch.float) + + candidates = [ + ( + _sfdp_pattern_1, + _sfdp_replacement_1, + [g(), g(), g(), c()], + {}, + _sfdp_extra_check(aten.div.Tensor), + ), + ( + _sfdp_pattern_2, + _sfdp_replacement_2, + [g(), g(), g(), c()], + {}, + _sfdp_extra_check(aten.mul.Tensor), + ), + ( + _sfdp_pattern_3, + _sfdp_replacement_3, + [g(), g(), g(), c()], + d, + _sfdp_extra_check(aten.div.Tensor), + ), + ( + _sfdp_pattern_4, + _sfdp_replacement_4, + [g(), g(), g(), c()], + d, + _sfdp_extra_check(aten.mul.Tensor), + ), + ( + _sfdp_pattern_5, + _sfdp_replacement_5, + [g(), g(), g(), b()], + {}, + _sfdp_params_check, + ), + ( + _sfdp_pattern_6, + _sfdp_replacement_6, + [g(), g(), g(), b()], + d, + _sfdp_params_check, + ), + ( + _sfdp_pattern_7, + _sfdp_replacement_7, + [g(), g(), g()], + d, + _sfdp_params_check, + ), + ( + _sfdp_pattern_8, + _sfdp_replacement_8, + [g(), g(), g()], + {}, + _sfdp_params_check, + ), + ( + _sfdp_pattern_9, + _sfdp_replacement_9, + [g(), g(), g()], + d, + _sfdp_params_check, + ), + ( + _sfdp_pattern_10, + _sfdp_replacement_10, + [g(), g(), g()], + {}, + _sfdp_params_check, + ), + ( + _sfdp_pattern_11, + _sfdp_replacement_11, + [g(), g(), g(), c()], + {}, + _sfdp_extra_check(aten.div.Tensor), + ), + ( + _sfdp_pattern_12, + _sfdp_replacement_12, + [g(), g(), g(), c()], + d, + _sfdp_extra_check(aten.div.Tensor), + ), + ( + _sfdp_pattern_13, + _sfdp_replacement_13, + [g_3d(), g_3d(), g_3d()], + d, + _sfdp_params_check, + ), + ( + _sfdp_pattern_14, + _sfdp_replacement_14, + [g(), g(), g(), m(), c()], + {}, + _sfdp_extra_check(aten.div.Tensor), + ), + ( + _sfdp_pattern_15, + _sfdp_replacement_15, + [g(), g(), g(), m(), c()], + {}, + _sfdp_extra_check(aten.div.Tensor), + ), + # TODO: Enable CUDA after solving Bert accuracy issue of calling efficient attention + ( + _sfdp_pattern_16, + _sfdp_replacement_16, + [g(), g(), g(), m(), c()], + d, + _sfdp_extra_check(aten.div.Tensor, disable_cuda=True), + ), + ( + _sfdp_pattern_16, + _sfdp_replacement_16, + [g_bs1(), g_bs1(), g_bs1(), m_bs1(), c()], + d, + _sfdp_extra_check(aten.div.Tensor, disable_cuda=True), + ), + ( + _sfdp_pattern_17, + _sfdp_replacement_17, + [g(), g(), g(), m(), c()], + d, + _sfdp_extra_check(aten.div.Tensor), + ), + ] + mask_fp32_patterns = ["pattern_16"] + if dtype == torch.half: + # Add inputs of bf16 q/k/v and fp32 mask, for models like albert. 
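+            # (these reuse _sfdp_pattern_16 with a float32 mask; the registered
+            # pattern names pick up the "_mask_fp32" suffix further below)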
+ candidates.append( + ( + _sfdp_pattern_16, + _sfdp_replacement_16, + [g(), g(), g(), m_float(), c()], + d, + _sfdp_extra_check(aten.div.Tensor, disable_cuda=True), + ) + ) + candidates.append( + ( + _sfdp_pattern_16, + _sfdp_replacement_16, + [g_bs1(), g_bs1(), g_bs1(), m_bs1_float(), c()], + d, + _sfdp_extra_check(aten.div.Tensor, disable_cuda=True), + ) + ) + + for pattern, replacement, args, workaround, extra_check in candidates: + # XXX: when adding a new pattern, re-run `gen_attention_patterns` so the pattern + # gets serialized to a python file and does not require tracing at runtime. + assert isinstance(workaround, dict) + name = pattern.__name__ + + if dtype != torch.float: + name += "_half" + if ( + any(p in name for p in mask_fp32_patterns) + and args[3].dtype == torch.float32 + ): + name += "_mask_fp32" + if args[0].size(0) == 1: + name += "_bs1" + + training_name = name + "_training" + yield training_name, { + "search_fn": pattern, + "replace_fn": replacement, + "example_inputs": args, + "trace_fn": joint_fwd_bwd, + "pass_dicts": patterns, + "extra_check": extra_check, + "scalar_workaround": workaround, + } + + if workaround: + assert len(workaround) == 1 and "dropout_p" in workaround + # functools.partial insufficient because we look at signature downstream + pattern = partialize_and_update_signature(pattern, dropout_p=0.0) + replacement = partialize_and_update_signature( + replacement, dropout_p=0.0 + ) + workaround = {} + + inference_name = name + "_inference" + yield inference_name, { + "search_fn": pattern, + "replace_fn": replacement, + "example_inputs": args, + "trace_fn": fwd_only, + "pass_dicts": patterns, + "extra_check": extra_check, + "scalar_workaround": workaround, + } + + +@functools.lru_cache(None) +def _sfdp_init(): + from .serialized_patterns.central_index import get_serialized_pattern + + for key, register_replacement_kwargs in _get_sfdp_patterns(): + search_fn_pattern = get_serialized_pattern(key) + register_replacement( + **register_replacement_kwargs, search_fn_pattern=search_fn_pattern + ) diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/group_batch_fusion.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/group_batch_fusion.py new file mode 100644 index 0000000000000000000000000000000000000000..fbfb6aa4631d0fafd5c10f89a8bf00b17e6ea684 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/group_batch_fusion.py @@ -0,0 +1,1059 @@ +import collections +import logging +import operator +from collections import OrderedDict +from typing import ( + Any, + DefaultDict, + Deque, + Dict, + Iterable, + Iterator, + List, + Optional, + Set, + Tuple, +) + +import torch +from torch._dynamo.utils import counters + +from .. 
import config +from ..pattern_matcher import ( + CallFunctionVarArgs, + get_arg_value, + stable_topological_sort, +) + +try: + # importing this will register fbgemm lowerings for inductor + import deeplearning.fbgemm.fbgemm_gpu.fb.inductor_lowerings # noqa: F401 + + has_fbgemm = True +except Exception: + has_fbgemm = False + pass + +aten = torch.ops.aten + +log = logging.getLogger(__name__) + +MIN_FUSE_SET_SIZE = 5 +MAX_FUSE_SET_SIZE = 300 +MAX_FUSE_SEARCH_DEPTH = 5 +# The maximum tensor size that can go into the fusion group +MAX_FUSE_TENSOR_SIZE_GROUP_LINEAR = 4096 + +# exclude these nodes from BFS +# excluding get item improves optimizer compilation time by 60s +SEARCH_EXCLUSIONS = {operator.getitem} + + +default_graph_search_options = { + "min_fuse_set_size": MIN_FUSE_SET_SIZE, + "max_fuse_set_size": MAX_FUSE_SET_SIZE, + "max_fuse_search_depth": MAX_FUSE_SEARCH_DEPTH, + "max_fuse_tensor_size_group_linear": MAX_FUSE_TENSOR_SIZE_GROUP_LINEAR, +} + +graph_search_options = default_graph_search_options + + +def update_stack_example_value(node, metadata, dim=0, op=torch.stack): + """ + Update the example value of the node in the graph to enable followup split cat opt. + """ + if node is not None and hasattr(node, "meta"): + if op == torch.stack: + example_value = torch.stack(metadata, dim=dim) + elif op == torch.unbind: + example_value = torch.unbind(metadata, dim=dim) # type: ignore[assignment] + else: + return + node.meta["example_value"] = example_value + + +def update_pointwise_example_value(pointwise_node, input, other, op): + """ + Update the example value of the add node in the graph to enable followup split cat opt. + """ + if pointwise_node is not None and hasattr(pointwise_node, "meta"): + if op == torch.add: + example_value = torch.add(input, other) + elif op == torch.mul: + example_value = torch.mul(input, other) + else: + return + pointwise_node.meta["example_value"] = example_value + + +class GroupBatchFusionBase: + def __init__(self, **kwargs): + self.graph_search_options = kwargs.pop( + "graph_search_options", default_graph_search_options + ) + + def match(self, node): + raise NotImplementedError("match called on base") + + def fuse(self, graph, subset): + raise NotImplementedError("fuse called on base") + + +PRE_GRAD_FUSIONS: Dict[str, GroupBatchFusionBase] = dict() +POST_GRAD_FUSIONS: Dict[str, GroupBatchFusionBase] = dict() + + +def register_fusion(name: str, pre_grad=True): + def decorator(fusion_cls: GroupBatchFusionBase): + if pre_grad: + PRE_GRAD_FUSIONS[name] = fusion_cls + else: + POST_GRAD_FUSIONS[name] = fusion_cls + return fusion_cls + + return decorator + + +def list_group_batch_fusions(pre_grad=True) -> List[str]: + if pre_grad: + return list(PRE_GRAD_FUSIONS.keys()) + else: + return list(POST_GRAD_FUSIONS.keys()) + + +def decompose_stack(graph: torch.fx.GraphModule, input_tensors: List[Any]) -> Any: + unsqueezed_inputs = [] + for input_tensor in input_tensors: + unsqueezed_input = graph.call_function( + aten.unsqueeze, args=(input_tensor,), kwargs={"dim": 0} + ) + unsqueezed_inputs.append(unsqueezed_input) + stacked_inputs = graph.call_function( + aten.cat, args=(unsqueezed_inputs,), kwargs={"dim": 0} + ) + return stacked_inputs + + +class GroupFusion(GroupBatchFusionBase): + """ + Fuse ops in a group way, e.g, fuse mm/addmm of arbitrary input shapes with fbgemm.gmm. + """ + + pass + + +class BatchFusion(GroupBatchFusionBase): + """ + Fuse ops in a batch way, e.g, fuse mm/addmm of same input shapes with bmm. 
+ """ + + pass + + +class BatchPointwiseOpsFusionFactory(BatchFusion): + def __init__(self, op, **kwargs): + super().__init__(**kwargs) + self.op = op + + +@register_fusion("batch_linear_post_grad", pre_grad=False) +class PostGradBatchLinearFusion(BatchFusion): + """ + Fuse ops in a batch way in post grad (aten level). + """ + + def _addmm_node_can_be_fused(self, node: torch.fx.Node) -> bool: + return ( + node.kwargs.get("beta", 1.0) == 1.0 and node.kwargs.get("alpha", 1.0) == 1.0 # type: ignore[return-value] + ) + + def _is_input_2d(self, input: torch.fx.Node) -> bool: + input_shapes = input.meta["tensor_meta"].shape + return ( + len(input_shapes) == 2 + and isinstance(input_shapes[0], int) + and isinstance(input_shapes[1], int) + ) + + def match(self, node: torch.fx.Node) -> Optional[Tuple[str, int, int, int, bool]]: + if CallFunctionVarArgs(aten.mm).match(node): + input_m, weight_m = node.args + bias_m = None + + elif CallFunctionVarArgs(aten.addmm.default).match( + node + ) and self._addmm_node_can_be_fused(node): + bias_m, input_m, weight_m = node.args + else: + return None + + # only handle the cases where inputs are 2D tensors + if not self._is_input_2d(input_m) or not self._is_input_2d(weight_m): # type: ignore[arg-type] + return None + m, k = input_m.meta["tensor_meta"].shape # type: ignore[union-attr] + n = weight_m.meta["tensor_meta"].shape[1] # type: ignore[union-attr] + batch_key = ("batch_linear", m, k, n, bias_m is not None) + return batch_key + + def fuse(self, graph: torch.fx.GraphModule, subset: List[torch.fx.Node]): + batch_inputs = [] + batch_weights = [] + batch_biases = [] + batch_nodes = [] + + for node in subset: + if CallFunctionVarArgs(aten.addmm.default).match(node): + bias, input, weight = node.args + elif CallFunctionVarArgs(aten.mm.default).match(node): + input, weight = node.args + bias = None + batch_nodes.append(node) + batch_inputs.append(input) # type: ignore[possibly-undefined] + batch_weights.append(weight) # type: ignore[possibly-undefined] + batch_biases.append(bias) # type: ignore[possibly-undefined] + + with graph.inserting_before(subset[-1]): + fused_inputs = decompose_stack(graph, batch_inputs) + fused_weights = decompose_stack(graph, batch_weights) + fused_bmm = graph.call_function( + aten.bmm, + args=(fused_inputs, fused_weights), + ) + + for i, original_mm in enumerate(batch_nodes): + has_bias = False + with graph.inserting_after(fused_bmm): + new_mm = graph.call_function(aten.select, args=((fused_bmm, 0, i))) + if batch_biases[i]: + has_bias = True + new_bias_add = graph.call_function( + aten.add, args=((batch_biases[i], new_mm)) + ) + new_mm_cont = new_bias_add if has_bias else new_mm # type: ignore[possibly-undefined] + original_mm.replace_all_uses_with(new_mm_cont) + new_mm_cont.meta.update(original_mm.meta) + graph.erase_node(original_mm) + + +@register_fusion("group_linear", pre_grad=False) +class GroupLinearFusion(GroupFusion): + def _addmm_node_can_be_fused(self, node: torch.fx.Node): + input_shape = node.args[1].meta["tensor_meta"].shape # type: ignore[union-attr] + weight_shape = node.args[2].meta["tensor_meta"].shape # type: ignore[union-attr] + return ( + node.kwargs.get("beta", 1.0) == 1.0 + and node.kwargs.get("alpha", 1.0) == 1.0 + and len(input_shape) == 2 + and len(weight_shape) == 2 + and all(x % 2 == 0 for x in input_shape + weight_shape) + and all( + shape <= self.graph_search_options["max_fuse_tensor_size_group_linear"] + for shape in input_shape + weight_shape + ) + ) + + def _mm_node_can_be_fused(self, node: 
torch.fx.Node): + input_shape = node.args[0].meta["tensor_meta"].shape # type: ignore[union-attr] + weight_shape = node.args[1].meta["tensor_meta"].shape # type: ignore[union-attr] + return ( + len(input_shape) == 2 + and len(weight_shape) == 2 + and all(x % 2 == 0 for x in input_shape + weight_shape) + and all( + shape <= self.graph_search_options["max_fuse_tensor_size_group_linear"] + for shape in input_shape + weight_shape + ) + ) + + def match(self, node: torch.fx.Node) -> Optional[Tuple[str, bool]]: + if CallFunctionVarArgs(aten.mm.default).match( + node + ) and self._mm_node_can_be_fused(node): + group_key = ("group_linear", True) + elif CallFunctionVarArgs(aten.addmm.default).match( + node + ) and self._addmm_node_can_be_fused(node): + bias = node.args[0] + group_key = ("group_linear", bias is None) + else: + group_key = None + return group_key + + def fuse(self, graph: torch.fx.GraphModule, subset: List[torch.fx.Node]): + group_inputs = [] + group_weights = [] + group_biases = [] + group_nodes = [] + for node in subset: + if CallFunctionVarArgs(aten.addmm.default).match(node): + bias, input, weight = node.args + else: + assert CallFunctionVarArgs(aten.mm.default).match(node) + input, weight = node.args + bias = None + + group_nodes.append(node) + group_inputs.append(input) + group_weights.append(weight) + group_biases.append(bias) + + if all(bias is None for bias in group_biases): + group_biases = None # type: ignore[assignment] + group_biases: Optional[List[Any]] + + with graph.inserting_before(subset[0]): + fused_mm = graph.call_function( + torch.ops.fbgemm.gmm.default, + args=(group_inputs, group_weights, group_biases), + kwargs={"smart_fused": True}, + ) + + for i, original_mm in enumerate(group_nodes): + with graph.inserting_after(fused_mm): + new_mm = graph.call_function(operator.getitem, args=(fused_mm, i)) + original_mm.replace_all_uses_with(new_mm) + new_mm.meta.update(original_mm.meta) + graph.erase_node(original_mm) + + +class BatchPointwiseOpsPostGradFusion(BatchPointwiseOpsFusionFactory): + """ + Batch pointwise operator (e.g., add, mul) in post grad pass. 
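+    As the match/fuse methods below show, candidates are keyed on shape, dtypes, alpha
+    and rounding_mode; the fused form stacks both operand lists, applies the op once,
+    then selects each per-node result back out.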
+ """ + + def __init__(self, op, **kwargs): + super().__init__(op, **kwargs) + self.op = op + + def _pointwise_node_can_be_fused(self, node: torch.fx.Node): + # note: we only consider the case where the inputs are tensors + # for mixed precision training, we need to make sure the inputs + # of the aten.cat when do the stack should be the same dtype + # otherwise, the output of the aten.cat may be not the same as + # its inputs, and cause dtype not same error in mm or addmm + input, other = node.args + return ( + input.meta["tensor_meta"].shape == other.meta["tensor_meta"].shape # type: ignore[union-attr] + if hasattr(input, "meta") + and hasattr(other, "meta") + and "tensor_meta" in input.meta # type: ignore[union-attr] + and "tensor_meta" in other.meta # type: ignore[union-attr] + else False + ) + + def match(self, node: torch.fx.Node): + if CallFunctionVarArgs(self.op).match( + node + ) and self._pointwise_node_can_be_fused(node): + alpha = node.kwargs.get("alpha", 1.0) + rounding_mode = node.kwargs.get("rounding_mode", None) + input, other = node.args + shape = list(input.meta["tensor_meta"].shape) # type: ignore[union-attr] + group_key = ( + "batch_" + self.op.__name__.lower() + "_post_grad", + str(shape), + str(input.meta["tensor_meta"].dtype), # type: ignore[union-attr] + str(other.meta["tensor_meta"].dtype), # type: ignore[union-attr] + str(alpha), + str(rounding_mode), + ) + else: + group_key = None + return group_key + + def fuse(self, graph: torch.fx.GraphModule, subset: List[torch.fx.Node]): + batch_inputs, batch_others = [], [] + alpha = subset[0].kwargs.get("alpha", 1.0) + + for node in subset: + input, other = node.args + batch_inputs.append(input) + batch_others.append(other) + + with graph.inserting_before(subset[0]): + stack_inputs = decompose_stack(graph, batch_inputs) + stack_others = decompose_stack(graph, batch_others) + + batch_op = graph.call_function( + self.op, + args=(stack_inputs, stack_others), + kwargs={"alpha": alpha} if self.op == aten.add.Tensor else {}, + ) + for i, original_add in enumerate(subset): + with graph.inserting_after(batch_op): + new_add = graph.call_function( + torch.ops.aten.select, args=((batch_op, 0, i)) + ) + original_add.replace_all_uses_with(new_add) + new_add.meta.update(original_add.meta) + graph.erase_node(original_add) + + +@register_fusion("batch_linear_lhs") +class BatchLinearLHSFusion(BatchFusion): + """ + Batch linear left-hand side fusion. This pass tries to fuse the following patterns: + + torch.nn.functional.linear(x, w1), linear(x, w2),... * linear(x, wn) + -> torch.mm(x, torch.cat([w1, w2,... * wn]).transpose(0, 1)) + + We have a separate pass to eliminate contiguous transpose in a generic way. 
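+    With illustrative shapes: for x of shape [B, K] and weights w_i of shape [N_i, K],
+    the fused mm/addmm produces a [B, N_1 + ... + N_n] result, which is then split back
+    along dim 1 into the original per-linear outputs.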
+ """ + + def match(self, node: torch.fx.Node) -> Optional[Tuple[str, bool, Any]]: + if CallFunctionVarArgs(torch.nn.functional.linear).match( + node + ) and is_linear_node_can_be_fused(node): + input = get_arg_value(node, 0, "input") + bias = get_arg_value(node, 2, "bias") + group_key = ("batch_linear_lhs", bias is None, input) + else: + group_key = None + return group_key + + def fuse(self, graph: torch.fx.GraphModule, subset: List[torch.fx.Node]): + batch_nodes = [] + batch_input = None + batch_weights = [] + batch_biases = [] + split_sections = [] + for node in subset: + input = get_arg_value(node, 0, "input") + weight = get_arg_value(node, 1, "weight") + bias = get_arg_value(node, 2, "bias") + batch_nodes.append(node) + if batch_input is None: + batch_input = input + else: + assert batch_input is input + batch_weights.append(weight) + if bias: + batch_biases.append(bias) + split_sections.append(weight.meta["example_value"].shape[0]) + + with graph.inserting_before(subset[0]): + cat_weights = graph.call_function( + torch.cat, args=(batch_weights,), kwargs={"dim": 0} + ) + transposed_weights = graph.call_function( + torch.transpose, args=(cat_weights, 0, 1) + ) + if len(batch_biases) > 0: + cat_biases = graph.call_function( + torch.cat, args=(batch_biases,), kwargs={"dim": 0} + ) + fused_lhs = graph.call_function( + torch.addmm, + args=(cat_biases, batch_input, transposed_weights), + ) + else: + fused_lhs = graph.call_function( + torch.mm, + args=(batch_input, transposed_weights), + ) + fused_lhs_list = graph.call_function( + torch.split, args=(fused_lhs, split_sections), kwargs={"dim": 1} + ) + + for i, node in enumerate(batch_nodes): + with graph.inserting_after(fused_lhs_list): + new_node = graph.call_function( + operator.getitem, args=(fused_lhs_list, i) + ) + node.replace_all_uses_with(new_node) + new_node.meta.update(node.meta) + graph.erase_node(node) + + +def is_node_meta_valid(node: Optional[torch.fx.Node]): + if node is None: + return True + if "example_value" not in node.meta: + return False + return True + + +def is_linear_node_can_be_fused(node: torch.fx.Node): + input = get_arg_value(node, 0, "input") + weight = get_arg_value(node, 1, "weight") + return ( + is_node_meta_valid(node) + and is_node_meta_valid(input) + and is_node_meta_valid(weight) + and len(input.meta["example_value"].shape) == 2 + and len(weight.meta["example_value"].shape) == 2 + ) + + +@register_fusion("batch_linear") +class PreGradBatchLinearFusion(BatchFusion): + """ + Batch linear fusion in pre grad pass. 
+ Fuse linear with same size with torch.baddmm + """ + + def _getitem_args(self, getitem_node: torch.fx.Node): + if getitem_node.target != operator.__getitem__ or ( + getitem_node.op != "call_function" + ): + return None + return getitem_node.args[0] + + def match(self, node: torch.fx.Node): + if CallFunctionVarArgs(torch.nn.functional.linear).match( + node + ) and is_linear_node_can_be_fused(node): + input = get_arg_value(node, 0, "input") + weight = get_arg_value(node, 1, "weight") + bias = get_arg_value(node, 2, "bias") + group_key = ( + "batch_linear_pre_grad", + self._getitem_args(input), + str(input.meta["example_value"].shape), + str(weight.meta["example_value"].shape), + bias is None, + ) + else: + group_key = None + return group_key + + def fuse(self, graph: torch.fx.GraphModule, subset: List[torch.fx.Node]): + batch_nodes = [] + batch_inputs = [] + batch_weights = [] + batch_biases = [] + batch_inputs_metadata = [] + batch_weights_metadata = [] + batch_biases_metadata = [] + for node in subset: + batch_nodes.append(node) + input = get_arg_value(node, 0, "input") + batch_inputs.append(input) + batch_inputs_metadata.append(input.meta["example_value"]) + weight = get_arg_value(node, 1, "weight") + batch_weights.append(weight) + batch_weights_metadata.append(weight.meta["example_value"]) + bias = get_arg_value(node, 2, "bias") + batch_biases.append(bias) + if bias is not None and hasattr(bias, "meta"): + batch_biases_metadata.append(bias.meta["example_value"]) + + with graph.inserting_before(subset[0]): + stack_inputs = graph.call_function( + torch.stack, args=(batch_inputs,), kwargs={"dim": 0} + ) + update_stack_example_value(stack_inputs, batch_inputs_metadata) + stack_weights = graph.call_function( + torch.stack, args=(batch_weights,), kwargs={"dim": 0} + ) + update_stack_example_value(stack_weights, batch_weights_metadata) + transpose_weight = graph.call_function( + torch.transpose, args=(stack_weights, 1, 2) + ) + if all(bias is None for bias in batch_biases): + bmm = graph.call_function( + torch.bmm, + args=(stack_inputs, transpose_weight), + ) + else: + stack_biases = graph.call_function( + torch.stack, args=(batch_biases,), kwargs={"dim": 0} + ) + update_stack_example_value(stack_biases, batch_biases_metadata) + unsqueeze_biases = graph.call_function( + torch.unsqueeze, args=(stack_biases, 1) + ) + bmm = graph.call_function( + torch.baddbmm, + args=(unsqueeze_biases, stack_inputs, transpose_weight), + ) + + bmm = graph.call_function(torch.unbind, args=(bmm,), kwargs={"dim": 0}) + for i, linear in enumerate(batch_nodes): + with graph.inserting_after(bmm): + getitem = graph.call_function(operator.getitem, args=(bmm, i)) + linear.replace_all_uses_with(getitem) + getitem.meta.update(linear.meta) + graph.erase_node(linear) + + +@register_fusion("batch_layernorm") +class BatchLayernormFusion(BatchFusion): + """ + Batch layer norm fusion in pre grad pass + """ + + def match(self, node: torch.fx.Node): + if CallFunctionVarArgs(torch.nn.functional.layer_norm).match(node): + input = get_arg_value(node, 0, "input") + weight = get_arg_value(node, 2, "weight") + bias = get_arg_value(node, 3, "bias") + group_key = ( + ( + "batch_layernorm", + str(input.meta["example_value"].shape), + str(weight.meta["example_value"].shape) + if weight is not None + else "", + str(bias.meta["example_value"].shape) if bias is not None else "", + str(get_arg_value(node, 1, "normalized_shape")), + str(get_arg_value(node, 4, "eps")), + ) + if "example_value" in input.meta + and is_node_meta_valid(weight) + and 
is_node_meta_valid(bias) + else None + ) + else: + group_key = None + return group_key + + def fuse(self, graph: torch.fx.GraphModule, subset: List[torch.fx.Node]): + group_inputs = [] + group_shapes = [] + group_weights = [] + group_biases = [] + group_epss = [] + group_nodes = [] + group_inputs_metadata = [] + group_biases_metadata = [] + group_weights_metadata = [] + for node in subset: + group_nodes.append(node) + input = get_arg_value(node, 0, "input") + group_inputs.append(input) + group_inputs_metadata.append(input.meta["example_value"]) + group_shapes.append(get_arg_value(node, 1, "normalized_shape")) + weight = get_arg_value(node, 2, "weight") + group_weights.append(weight) + if weight is not None and hasattr(weight, "meta"): + group_weights_metadata.append(weight.meta["example_value"]) + bias = get_arg_value(node, 3, "bias") + group_biases.append(bias) + if bias is not None and hasattr(bias, "meta"): + group_biases_metadata.append(bias.meta["example_value"]) + eps = get_arg_value(node, 4, "eps") + if eps is None: + eps = 1e-5 + group_epss.append(eps) + stack_dim = -1 - len(group_shapes[-1]) + + if all(bias is None for bias in group_biases): + group_biases = None # type: ignore[assignment] + group_biases: Optional[List[Any]] + if all(weight is None for weight in group_weights): + group_weights = None # type: ignore[assignment] + group_weights: Optional[List[Any]] + assert all( + eps == group_epss[0] for eps in group_epss + ), "all epsilon values must be equal" + + with graph.inserting_before(subset[0]): + stack_input = graph.call_function( + torch.stack, args=(group_inputs,), kwargs={"dim": stack_dim} + ) + update_stack_example_value(stack_input, group_inputs_metadata, stack_dim) + if group_weights is not None: + stack_weight = graph.call_function( + torch.stack, args=(group_weights,), kwargs={"dim": 0} + ) + update_stack_example_value(stack_weight, group_weights_metadata) + else: + stack_weight = None + if group_biases is not None: + stack_bias = graph.call_function( + torch.stack, args=(group_biases,), kwargs={"dim": 0} + ) + update_stack_example_value(stack_bias, group_biases_metadata) + else: + stack_bias = None + + batch_layer_norm = graph.call_function( + torch.nn.functional.layer_norm, + args=(stack_input, group_shapes[-1]), + kwargs={"eps": group_epss[-1]}, + ) + batch_layer_norm.meta["example_value"] = stack_input.meta["example_value"] + + if group_weights is not None and group_biases is not None: + previous_batch_layer_norm_meta = batch_layer_norm.meta["example_value"] + batch_layer_norm = graph.call_function( + torch.mul, args=(stack_weight, batch_layer_norm) + ) + update_pointwise_example_value( + batch_layer_norm, + stack_weight.meta["example_value"], + previous_batch_layer_norm_meta, + torch.mul, + ) + previous_batch_layer_norm_meta = batch_layer_norm.meta["example_value"] + batch_layer_norm = graph.call_function( + torch.add, args=(stack_bias, batch_layer_norm) + ) + update_pointwise_example_value( + batch_layer_norm, + stack_bias.meta["example_value"], + previous_batch_layer_norm_meta, + torch.add, + ) + elif group_weights is not None and group_biases is None: + previous_batch_layer_norm_meta = batch_layer_norm.meta["example_value"] + batch_layer_norm = graph.call_function( + torch.mul, args=(stack_weight, batch_layer_norm) + ) + update_pointwise_example_value( + batch_layer_norm, + stack_weight.meta["example_value"], + previous_batch_layer_norm_meta, + torch.mul, + ) + elif group_weights is None and group_biases is not None: + previous_batch_layer_norm_meta = 
batch_layer_norm.meta["example_value"] + batch_layer_norm = graph.call_function( + torch.add, args=(stack_bias, batch_layer_norm) + ) + update_pointwise_example_value( + batch_layer_norm, + stack_bias.meta["example_value"], + previous_batch_layer_norm_meta, + torch.add, + ) + + batch_layer_norm_unbind = graph.call_function( + torch.unbind, + args=(batch_layer_norm,), + kwargs={"dim": stack_dim}, + ) + update_stack_example_value( + batch_layer_norm_unbind, + batch_layer_norm.meta["example_value"], + op=torch.unbind, + dim=stack_dim, + ) + + for i, node in enumerate(group_nodes): + with graph.inserting_after(batch_layer_norm_unbind): + new_node = graph.call_function( + operator.getitem, args=(batch_layer_norm_unbind, i) + ) + node.replace_all_uses_with(new_node) + new_node.meta.update(node.meta) + graph.erase_node(node) + + +class BatchPointwiseOpsPreGradFusion(BatchPointwiseOpsFusionFactory): + """ + Batch poinwise ops (e.g., sigmoid, relu, tanh) fusion in pre grad pass. + We fuse it in random place, and the introduced stack node may be merged in split cat. + """ + + def __init__(self, op, **kwargs): + super().__init__(op, **kwargs) + self.op = op + + def match(self, node: torch.fx.Node): + input = get_arg_value(node, 0, "input") + if CallFunctionVarArgs(self.op).match(node) and is_node_meta_valid(node): + # for relu op, we also use the inplace to construct the key + group_key = ( + "batch_" + self.op.__name__.lower() + "_pre_grad", + str(input.meta["example_value"].shape), + str(node.kwargs.get("inplace", False)), + ) + else: + group_key = None + return group_key + + def fuse(self, graph: torch.fx.GraphModule, subset: List[torch.fx.Node]): + batch_nodes = [] + batch_inputs = [] + batch_inputs_metadata = [] + + for node in subset: + batch_nodes.append(node) + input = get_arg_value(node, 0, "input") + batch_inputs.append(input) + batch_inputs_metadata.append(input.meta["example_value"]) + + with graph.inserting_before(subset[0]): + stack_inputs = graph.call_function( + torch.stack, args=(batch_inputs,), kwargs={"dim": 0} + ) + update_stack_example_value(stack_inputs, batch_inputs_metadata) + if self.op == torch.nn.functional.relu: + batch_op = graph.call_function( + self.op, + args=(stack_inputs,), + kwargs={"inplace": subset[0].kwargs.get("inplace", False)}, + ) + else: + batch_op = graph.call_function( + self.op, + args=(stack_inputs,), + ) + unbind_op = graph.call_function( + torch.unbind, args=(batch_op,), kwargs={"dim": 0} + ) + for i, node in enumerate(batch_nodes): + with graph.inserting_after(unbind_op): + getitem = graph.call_function(operator.getitem, args=(unbind_op, i)) + node.replace_all_uses_with(getitem) + getitem.meta.update(node.meta) + graph.erase_node(node) + + +@register_fusion("batch_tanh") +class BatchTanhPreGradFusion(BatchPointwiseOpsPreGradFusion): + def __init__(self, **kwargs): + super().__init__(torch.tanh, **kwargs) + + +@register_fusion("batch_sigmoid") +class BatchSigmoidPreGradFusion(BatchPointwiseOpsPreGradFusion): + def __init__(self, **kwargs): + super().__init__(torch.sigmoid, **kwargs) + + +@register_fusion("batch_relu") +class BatchReLuPreGradFusion(BatchPointwiseOpsPreGradFusion): + def __init__(self, **kwargs): + super().__init__(torch.nn.functional.relu, **kwargs) + + +@register_fusion("batch_aten_add", pre_grad=False) +class BatchAddPostGradFusion(BatchPointwiseOpsPostGradFusion): + def __init__(self, **kwargs): + super().__init__(aten.add.Tensor, **kwargs) + + +@register_fusion("batch_aten_sub", pre_grad=False) +class 
BatchSubPostGradFusion(BatchPointwiseOpsPostGradFusion): + def __init__(self, **kwargs): + super().__init__(aten.sub.Tensor, **kwargs) + + +@register_fusion("batch_aten_div", pre_grad=False) +class BatchDivPostGradFusion(BatchPointwiseOpsPostGradFusion): + def __init__(self, **kwargs): + super().__init__(aten.div.Tensor, **kwargs) + + +@register_fusion("batch_aten_mul", pre_grad=False) +class BatchMulPostGradFusion(BatchPointwiseOpsPostGradFusion): + def __init__(self, **kwargs): + super().__init__(aten.mul.Tensor, **kwargs) + + +class _OrderedSet: + def __init__(self, param=None): + if param: + self.rep = OrderedDict({k: None for k in param}) + else: + self.rep = OrderedDict() + + def __contains__(self, o): + return o in self.rep + + def __len__(self): + return self.rep.__len__() + + def append(self, o): + self.rep[o] = None + + def __iter__(self): + return self.rep.keys().__iter__() + + +def find_independent_subset_greedy( + node_list: Iterable[torch.fx.Node], + graph_search_options: Dict[str, Any], +) -> Iterator[Iterable[torch.fx.Node]]: + """ + Yields a list of subsets of `node_list` where no element in the subset + depends on any other element in the subset. This results in a set of + independent nodes which can be fused together. + + The order of `node_list` is preserved within each subset so we can benefit + from split-cat elimination in later passes. + + During iteration it is only safe to mutate the graph by changing the nodes + that have been returned. + + graph_search_options: + - min_fuse_set_size: Minimum size of the subset to consider. Subsets below + this size will be ignored. + - max_fuse_set_size: Maximum size of the subset to consider. Subsets will + be broken to be at most this size. + """ + + # Compute all the children of `node` which are members of + # `interesting_nodes`. + def find_dependent_nodes(node, interesting_nodes): + visited_node_set: Set[torch.fx.Node] = {node} + dep_set: Set[torch.fx.Node] = set() + + work = [node] + while work: + node = work.pop() + for input_node in node.all_input_nodes: + if input_node in interesting_nodes: + dep_set.add(input_node) + + if input_node not in visited_node_set: + visited_node_set.add(input_node) + work.append(input_node) + + return dep_set + + min_fuse_set_size = graph_search_options["min_fuse_set_size"] + max_fuse_set_size = graph_search_options["max_fuse_set_size"] + + # node_list needs to be a set because we only track the nodes that are left + # in it (and we want to do the `in` on a set, not a list). But we want to + # keep the correct order. + node_list = _OrderedSet(node_list) + + cache: Dict[torch.fx.Node, Set[torch.fx.Node]] = {} + while node_list: + subset: List[torch.fx.Node] = [] + subset_deps: Set[torch.fx.Node] = set() + + next_round_node_list = _OrderedSet() + for node in node_list: + if len(subset) >= max_fuse_set_size or node in subset_deps: + next_round_node_list.append(node) + continue + + dep_set = cache.pop(node, None) + if dep_set is None: + dep_set = find_dependent_nodes(node, node_list) + + if not dep_set.intersection(subset): + subset.append(node) + subset_deps.update(dep_set) + else: + next_round_node_list.append(node) + cache[node] = dep_set + + if len(subset) >= min_fuse_set_size: + # Careful here - the caller uses the subsets to fuse nodes together + # so we need to clear any cache entry that contains one of the + # returned nodes because the dependency list could be different + # (larger) after the merge. 
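+            # Keep only cache entries whose recorded dependencies are disjoint from
+            # the yielded subset; dropped entries are simply recomputed on a later
+            # round if their node is revisited.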
+ cache = {k: v for k, v in cache.items() if v.isdisjoint(subset)} + yield subset + + node_list = next_round_node_list + + +def get_fusion_candidates( + rule: GroupBatchFusionBase, root_node: torch.fx.Node, fused_set: Set[torch.fx.Node] +) -> DefaultDict[Any, List[torch.fx.Node]]: + """ + Search fusion candidates for a specific rule using BFS starting from the root node. + We only search the subgraph within graph_search_options["max_fuse_search_depth"]. + """ + q: Deque[Tuple[int, torch.fx.Node]] = collections.deque() + + candidate_dict: DefaultDict[Any, List[torch.fx.Node]] = collections.defaultdict( + list + ) + + if root_node.target in SEARCH_EXCLUSIONS: + return candidate_dict + + visited_set: Set[torch.fx.Node] = set() + + for next_node in root_node.all_input_nodes: + q.append((1, next_node)) + visited_set.add(next_node) + + while len(q) > 0: + depth, node = q.popleft() + + if node in fused_set: + continue + + key = rule.match(node) + if key is not None: + candidate_nodes = candidate_dict[key] + if node not in candidate_nodes: + candidate_nodes.append(node) + else: + if depth < rule.graph_search_options["max_fuse_search_depth"]: + for next_node in node.all_input_nodes: + if next_node not in visited_set: + visited_set.add(next_node) + q.append((depth + 1, next_node)) + + return candidate_dict + + +def apply_group_batch_fusion(graph: torch.fx.GraphModule, rule: GroupBatchFusionBase): + stable_topological_sort(graph) # type: ignore[arg-type] + fused_set: Set[torch.fx.Node] = set() + + for node in reversed(graph.nodes): + candidates = get_fusion_candidates(rule, node, fused_set) + + for key, candidate_nodes in candidates.items(): + if len(candidate_nodes) < rule.graph_search_options["min_fuse_set_size"]: + continue + + for subset in find_independent_subset_greedy( + candidate_nodes, rule.graph_search_options + ): + rule.fuse(graph, subset) + fused_set.update(subset) + if isinstance(rule, GroupFusion): + counters["inductor"]["group_fusion"] += 1 + elif isinstance(rule, BatchFusion): + counters["inductor"]["batch_fusion"] += 1 + else: + counters["inductor"]["unknown_group_batch_fusion"] += 1 + + log.debug( + f"{rule.__class__.__name__}: key = {key}; subset size = {len(list(subset))}" # noqa: G004 + ) + + +def generate_fusion_from_config(config_options: Dict[str, Any], pre_grad=True): + fusions: List[GroupBatchFusionBase] = [] + for name, options in config_options.items(): + fusion_cls = PRE_GRAD_FUSIONS[name] if pre_grad else POST_GRAD_FUSIONS[name] + _options = graph_search_options.copy() + _options.update(options) + fusions.append(fusion_cls(graph_search_options=_options)) # type: ignore[operator] + return fusions + + +def group_batch_fusion_passes(graph: torch.fx.Graph, pre_grad=True): + fusions: List[GroupBatchFusionBase] = [] + # we keep all current pre grad fusions to keep + # current implementation, will remove this later + if pre_grad: + fusions += generate_fusion_from_config( + config.pre_grad_fusion_options, pre_grad=True + ) + else: + fbgemm_fusion_keys = [ + x + for x in config.post_grad_fusion_options + if config.post_grad_fusion_options[x].get("require_fbgemm", False) + ] + fbgemm_fusions = { + fusion: config.post_grad_fusion_options[fusion] + for fusion in fbgemm_fusion_keys + } + non_fbgemm_fusions = { + fusion: config.post_grad_fusion_options[fusion] + for fusion in config.post_grad_fusion_options.keys() + if fusion not in fbgemm_fusion_keys + } + fusions += generate_fusion_from_config(non_fbgemm_fusions, pre_grad=False) + if has_fbgemm: + fusions += 
generate_fusion_from_config(fbgemm_fusions, pre_grad=False) + + for rule in fusions: + apply_group_batch_fusion(graph, rule) # type: ignore[arg-type] diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/joint_graph.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/joint_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..7a3e292db6922a21a9cfc0f73bb03508ad07f7b4 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/joint_graph.py @@ -0,0 +1,341 @@ +import logging +import typing +from collections import Counter +from typing import Dict, List, Set + +import torch +import torch._guards +from torch._inductor.constant_folding import ConstantFolder +from torch.multiprocessing.reductions import StorageWeakRef + +from .. import config +from ..pattern_matcher import ( + CallFunction, + init_once_fakemode, + KeywordArg, + Match, + PatternMatcherPass, + register_graph_pattern, + stable_topological_sort, +) +from .replace_random import replace_random_passes + +log = logging.getLogger(__name__) +patterns = PatternMatcherPass() + + +@init_once_fakemode +def lazy_init(): + from .fuse_attention import _sfdp_init + from .misc_patterns import _misc_patterns_init + from .pad_mm import _pad_mm_init + + _pad_mm_init() + _sfdp_init() + _misc_patterns_init() + + +@torch.utils._python_dispatch._disable_current_modes() +def remove_no_ops( + gm: torch.fx.GraphModule, zeros: Set[torch.fx.Node], ones: Set[torch.fx.Node] +): + "Removes no-ops: (+ 0, - 0, * 1, / 1)" + aten = torch.ops.aten + graph = gm.graph + + def fake_tensors_eq(t1, t2, fields=("shape", "dtype", "device")): + if any(not isinstance(t, torch.Tensor) for t in (t1, t2)): + return False + for field in fields: + if getattr(t1, field) != getattr(t2, field): + return False + return True + + def replace_no_op(node, replace_input_index): + replacement = node.args[replace_input_index] + + # https://github.com/pytorch/pytorch/issues/86128 causes + # non-Tensor inputs even for ops with only Tensor inputs. 
+ # TODO - decompose/type promote to avoid this + if not all(isinstance(arg, torch.fx.Node) for arg in node.args): + return + + if not fake_tensors_eq(node.meta["val"], replacement.meta["val"]): + if fake_tensors_eq( + node.meta["val"], + replacement.meta["val"], + ("shape", "device"), + ): + with graph.inserting_after(node): + replacement = graph.call_function( + torch.ops.prims.convert_element_type.default, + args=(replacement, node.meta["val"].dtype), + ) + else: + return + + node.replace_all_uses_with(replacement) + replacement.meta.update(node.meta) + graph.erase_node(node) + + for node in graph.nodes: + if node.op != "call_function": + continue + + # TODO handle Tensor-Scalar adds, it's a different schema + if node.target == aten.add.Tensor and len(node.args) == 2: + if ( + not any(e in zeros for e in node.args) + or node.kwargs.get("alpha", 1) != 1 + ): + continue + + replace_index = 1 if node.args[0] in zeros else 0 + replace_no_op(node, replace_index) + + elif node.target == aten.sub.Tensor and len(node.args) == 2: + if node.args[1] not in zeros or node.kwargs.get("alpha", 1) != 1: + continue + + replace_no_op(node, 0) + + elif node.target == aten.mul.Tensor and len(node.args) == 2: + if not any(e in ones for e in node.args): + continue + + replace_input_index = 1 if node.args[0] in ones else 0 + replace_no_op(node, replace_input_index) + + elif ( + node.target == aten.div.Tensor + and len(node.args) == 2 + and node.args[1] in ones + ): + replace_no_op(node, 0) + + +@torch.utils._python_dispatch._disable_current_modes() +def remove_redundant_views(gm: torch.fx.GraphModule): + """ + Removes redundant views by reusing existing ones. + """ + + # A dictionary mapping a tensor to all aliased views. + views: Dict[torch.fx.Node, Dict[torch.dtype, torch.fx.Node]] = {} + graph = gm.graph + + for node in graph.nodes: + if node.op != "call_function": + continue + + if node.target != torch.ops.aten.view.dtype: + continue + + src = node.args[0] + to_type = node.args[1] + existing_views = views.get(src) + is_needed = True + + if existing_views: + # Replace the view with the an existing view if available. + alias = existing_views.get(to_type) + if alias: + is_needed = False + node.replace_all_uses_with(alias) + alias.meta.update(node.meta) + graph.erase_node(node) + else: + from_type = src.meta["val"].dtype + existing_views = {from_type: src} + views[src] = existing_views + + if is_needed: + # Save the new alias but do not replace existing one. + existing_views.setdefault(to_type, node) + views[node] = existing_views + + # Clean up unused views. + while True: + unused_views = [alias for alias in views if not alias.users] + if len(unused_views) == 0: + break + for unused in unused_views: + views.pop(unused) + graph.erase_node(unused) + + +class UniformValueConstantFolder(ConstantFolder): + """ + Runs constant folding and replaces tensors that have a unifrom value + with a tensor constructor call: aten.full([shape], value, ...) 
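+    For example, a folded tensor that is 2.0 everywhere with shape [8, 8] can later be
+    re-materialized as aten.full([8, 8], 2.0, ...) rather than being kept as a baked-in
+    constant buffer.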
+ """ + + def __init__(self, gm, skip_constructors=False): + super().__init__(gm, skip_constructors) + self.node_storages_ptrs: Dict[torch.fx.Node, int] = {} + self.constant_data_ptrs: Dict[torch.fx.Node, StorageWeakRef] = {} + # we may constant fold a tensor which in the graph has a sym size + # see: [constant folding refining of symints] + self.node_replacements_shapes: Dict[torch.fx.Node, List[int]] = {} + + def insertable_tensor_check(self, t: torch.Tensor) -> bool: + # TODO - we could also Tensors which get replaced with arange here + return ( + t.numel() != 0 + and bool((t == t.flatten()[0]).all()) + and torch._C._has_storage(t) + and t.layout == torch.strided + ) + + def add_node_replacement(self, node: torch.fx.Node, tensor: torch.Tensor) -> None: + self.node_replacements[node] = tensor.flatten()[0].item() + self.constant_data_ptrs[node] = StorageWeakRef(tensor.untyped_storage()) + shape = list(tensor.shape) + assert all(type(dim) is int for dim in shape) + self.node_replacements_shapes[node] = shape + + +@torch.utils._python_dispatch._disable_current_modes() +def constant_fold_uniform_value(gm: torch.fx.GraphModule): + "Runs constant folding and replaces constants which can be constructed with a single `full` call. Calls into remove_no_ops." + aten = torch.ops.aten + + # Constant folding can leak memory, especially with repeated compilation, so we are only going to + # remove constants which can be replaced with a constructor. + cf = UniformValueConstantFolder(gm) + cf.run() + + node_replacements = cf.node_replacements + + # note: [constant folding refining of symints] + # constant folding will partially evaluate a graph such that values which have dependencies which + # are entirely known at compile time may also become compile time constants. in some cases, + # this will include symints which we had not yet previously deduced are guaranteed a + # constant value and is then deduced in constant folding. 
an example is: + # unbacked_symint_eq_11 = torch.full((), 11).item() + # torch.full((unbacked_symint_eq_11,), 0) + node_replacements_shapes = cf.node_replacements_shapes + + graph = gm.graph + + zeros = set() + ones = set() + + # Got failures in `test_is_set_to_cuda` if we change aliasing on constants, + # so just constant-ify if a Tensor is unaliased + constant_data_ptr_count: typing.Counter[StorageWeakRef] = Counter() + + for node in cf.node_replacements: + constant_data_ptr_count[cf.constant_data_ptrs[node]] += 1 + + for node, value in node_replacements.items(): + # we dont have a functional way right now of instantiating a non-contiguous tensor with full/zeros/ones right now + # hasn't shown up to be important yet + fake_tensor = node.meta["val"] + if not fake_tensor.is_contiguous(memory_format=torch.contiguous_format): + continue + + if constant_data_ptr_count[cf.constant_data_ptrs[node]] > 1: + continue + + with graph.inserting_after(node): + # the conversion from tensor and back to value can be lossy, just use the original full ctor value + if ( + node.op == "call_function" + and node.target == aten.full.default + and len(node.args) == 2 + ): + value = node.args[1] + + # refines symints, see [constant folding refining of symints] above + for runtime_size, compile_time_size in zip( + node_replacements_shapes[node], fake_tensor.shape + ): + torch._check(runtime_size == compile_time_size) + + # zeros, and ones just get traced into full, so we insert those + new_node = graph.call_function( + aten.full.default, + args=(node_replacements_shapes[node], value), + kwargs={ + "dtype": fake_tensor.dtype, + "layout": torch.strided, + "device": fake_tensor.device, + "pin_memory": False, + }, + ) + + new_node.meta.update(node.meta) + node.replace_all_uses_with(new_node) + graph.erase_node(node) + + if value == 0: + zeros.add(new_node) + elif value == 1: + ones.add(new_node) + + remove_no_ops(gm, zeros, ones) + remove_redundant_views(gm) + + +def joint_graph_passes(graph: torch.fx.GraphModule): + """ + Run FX transformations on the joint forwards+backwards graph. 
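+    Concretely: uniform-value constant folding (if enabled), the registered
+    pattern-matcher passes, and the random-op replacement passes (unless
+    config.fallback_random is set); if any matches were applied, the graph is
+    re-sorted, linted and recompiled.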
+ """ + lazy_init() + count = 0 + + if config.joint_graph_constant_folding: + constant_fold_uniform_value(graph) + + if config.pattern_matcher: + count += patterns.apply(graph.graph) # type: ignore[arg-type] + + if not config.fallback_random: + count += replace_random_passes(graph) + + if count: + stable_topological_sort(graph.graph) + graph.graph.lint() + graph.recompile() + return graph + + +@register_graph_pattern( + CallFunction( + torch.ops.prims.convert_element_type.default, + CallFunction( + torch.ops.prims.convert_element_type.default, + KeywordArg("arg"), + KeywordArg("dtype1"), + ), + KeywordArg("dtype2"), + ), + pass_dict=patterns, +) +def pointless_convert(match: Match, arg, dtype1: torch.dtype, dtype2: torch.dtype): + """Remove chain of dtype conversions often created by AMP""" + graph = match.graph + node = match.output_node() + allowed = {torch.float16, torch.bfloat16, torch.float32, torch.float64} + if dtype1 in allowed and dtype2 in allowed: + repl = graph.call_function( + torch.ops.prims.convert_element_type.default, (arg, dtype2) + ) + repl.meta.update(node.meta) + node.replace_all_uses_with(repl) + match.erase_nodes(graph) + + +@register_graph_pattern( + CallFunction(torch.ops.aten.view.default, KeywordArg("arg"), KeywordArg("size")), + pass_dict=patterns, +) +def pointless_view(match: Match, arg, size): + """Remove no-op view""" + graph = match.graph + node = match.output_node() + arg_size = list(node.args[0].meta["val"].shape) # type: ignore[union-attr] + if size == arg_size: + node.replace_all_uses_with(node.args[0]) + match.erase_nodes(graph) diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/misc_patterns.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/misc_patterns.py new file mode 100644 index 0000000000000000000000000000000000000000..e91fdd6611af037d7e855489d990ec31f0c490cf --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/misc_patterns.py @@ -0,0 +1,130 @@ +import functools + +from typing import Dict, Set, Tuple + +import torch +from torch._dynamo.utils import counters + +from torch._ops import OpOverload, OpOverloadPacket +from ..pattern_matcher import fwd_only, register_replacement + +aten = torch.ops.aten + + +@functools.lru_cache(None) +def _misc_patterns_init(): + from .joint_graph import patterns as joint_graph_patterns + from .post_grad import pass_patterns as post_grad_patterns_all + + post_grad_patterns = post_grad_patterns_all[1] # medium priority + + if torch.cuda.is_available(): + # workaround https://github.com/pytorch/pytorch/issues/97894 + device = "cuda" + else: + device = "cpu" + + # These patterns do 2 things + # 1. Since we know that index is completely unique, we can codegen it using + # stores instead of atomic adds, which is quite a bit faster. + # 2. 
Also, since we are guaranteed that they are completely within bounds, + # we can use unsafe indexing and skip debug asserts + def randperm_index_add_pattern(x, y): + index = torch.randperm(x.shape[0], device=x.device)[: y.shape[0]] + return torch.index_add(x, dim=0, source=y, index=index), index + + def randperm_index_add_replacement(x, y): + index = torch.randperm(x.shape[0], device=x.device)[: y.shape[0]] + return ( + torch.ops.aten._unsafe_index_put( + x, (index,), aten._unsafe_index(x, (index,)) + y, accumulate=False + ), + index, + ) + + register_replacement( + randperm_index_add_pattern, + randperm_index_add_replacement, + [torch.empty(4, 8, device=device), torch.empty(2, 8, device=device)], + fwd_only, + [post_grad_patterns, joint_graph_patterns], + ) + + def randperm_index_pattern(x, slice_shape): + index = torch.randperm(x.shape[0], device=x.device)[:slice_shape] + return torch.ops.aten.index(x, (index,)), index + + def randperm_index_replacement(x, slice_shape): + index = torch.randperm(x.shape[0], device=x.device)[:slice_shape] + return torch.ops.aten._unsafe_index(x, (index,)), index + + pattern = register_replacement( + randperm_index_pattern, + randperm_index_replacement, + [torch.empty(4, 8, device=device)], + fwd_only, + [post_grad_patterns, joint_graph_patterns], + scalar_workaround={"slice_shape": 42}, + ) + + +class NumpyCompatNormalization: + numpy_compat: Dict[str, Tuple[str, ...]] = { + "dim": ("axis",), + "keepdim": ("keepdims",), + "input": ("x", "a", "x1"), + "other": ("x2",), + } + inverse_mapping: Dict[str, str] + cache: Dict["torch.fx.graph.Target", Set[str]] + + def __init__(self): + self.cache = {} # callable -> tuple of replaceable args e.g. ["axis"] + self.inverse_mapping = {} + for actual_kwarg, numpy_kwargs in self.numpy_compat.items(): + for numpy_kwarg in numpy_kwargs: + assert numpy_kwarg not in self.inverse_mapping + self.inverse_mapping[numpy_kwarg] = actual_kwarg + + def __call__(self, graph: torch.fx.Graph): + for node in graph.nodes: + if node.op != "call_function": + continue + if isinstance(node.target, (OpOverload, OpOverloadPacket)): + # only applies to torch ops; e.g. torch.stack(axis=1) works, torch.ops.aten.stack(axis=1) doesn't. 
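+                # for the remaining torch-level ops, the code below rewrites
+                # numpy-style kwargs via numpy_compat, e.g. axis -> dim and
+                # keepdims -> keepdim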
+ continue + kwargs = node.kwargs + + if node.target in self.cache: + replaceable_kwargs = self.cache[node.target] + else: + signatures = torch.fx.operator_schemas.get_signature_for_torch_op( + node.target + ) + signatures = () if signatures is None else signatures + replaceable_kwargs = set() + for sig in signatures: + for param_name in sig.parameters.keys(): + if param_name in self.numpy_compat: + replaceable_kwargs.update(self.numpy_compat[param_name]) + + self.cache[node.target] = replaceable_kwargs + + if not replaceable_kwargs: + continue + + new_kwargs = {} + kwargs_changed = False + for k, v in kwargs.items(): + if k in replaceable_kwargs: + kwargs_changed = True + new_kwargs[self.inverse_mapping[k]] = v + else: + new_kwargs[k] = v + + if kwargs_changed: + node.kwargs = torch.fx.immutable_collections.immutable_dict(new_kwargs) + counters["inductor"]["numpy_compat_normalization"] += 1 + + +numpy_compat_normalization = NumpyCompatNormalization() diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/mkldnn_fusion.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/mkldnn_fusion.py new file mode 100644 index 0000000000000000000000000000000000000000..81da87fd503a8850d367ba9c7308e757ec976919 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/mkldnn_fusion.py @@ -0,0 +1,1204 @@ +import functools +import operator +from functools import reduce +from typing import Any, Tuple + +import torch + +from torch.fx.experimental.symbolic_shapes import has_free_symbols + +from .. import ir + +from ..lowering import lowerings as L +from ..pattern_matcher import ( + Arg, + CallFunction, + filter_nodes, + get_arg_value, + KeywordArg, + MULTIPLE, +) +from ..virtualized import ops +from .freezing_patterns import register_freezing_graph_pattern +from .post_grad import register_lowering_pattern +from .quantization import ( + _register_quantization_lowerings, + _register_quantization_weight_pack_pass, +) + +if torch._C._has_mkldnn: + aten = torch.ops.aten + mkldnn = torch.ops.mkldnn + prims = torch.ops.prims + + _conv_args = [Arg() for _ in range(10)] + _linear_args = [Arg() for _ in range(6)] + _conv_transpose_args = [Arg() for _ in range(11)] + + def _conv_call(users=1): + return CallFunction( + mkldnn._convolution_pointwise.default, *_conv_args, _users=users + ) + + def _linear_call(users=1): + return CallFunction( + mkldnn._linear_pointwise.default, *_linear_args, _users=users + ) + + def _conv_transpose_call(users=1): + return CallFunction( + mkldnn._convolution_transpose_pointwise.default, + *_conv_transpose_args, + _users=users, + ) + + def _to_float(input_call, users=1): + return CallFunction( + prims.convert_element_type.default, + input_call, + KeywordArg("to_float"), + _users=users, + ) + + def _to_bf16(input_call): + return CallFunction( + prims.convert_element_type.default, + input_call, + KeywordArg("to_bf16"), + _users=1, + ) + + def _to_fp16(input_call): + return CallFunction( + prims.convert_element_type.default, + input_call, + KeywordArg("to_fp16"), + _users=1, + ) + + def _unary_fusion_pattern(unary_fusion, call_fn, users, lowp_dtype): + # only insert to_dtype if lowp_dtype is True + computation_call = ( + _to_float(call_fn(), users=users) if lowp_dtype else call_fn(users=users) + ) + out = unary_fusion(computation_call) + if lowp_dtype == torch.bfloat16: + return _to_bf16(out) + elif lowp_dtype == torch.float16: + return _to_fp16(out) + else: + return out + + def _gelu_fusion_1(computation_call): + return CallFunction( + aten.mul, + CallFunction(aten.mul, 
computation_call, 0.5), + CallFunction( + aten.add, + CallFunction( + aten.erf, + CallFunction(aten.mul, computation_call, 0.7071067811865476), + ), + 1, + ), + ) + + def _gelu_fusion_2(computation_call): + return CallFunction( + aten.mul, + CallFunction(aten.mul, computation_call, 0.5), + CallFunction( + aten.add, + CallFunction( + aten.tanh, + CallFunction( + aten.mul, + CallFunction( + aten.add, + computation_call, + CallFunction( + aten.mul, + CallFunction( + aten.mul, + CallFunction( + aten.mul, computation_call, computation_call + ), + computation_call, + ), + 0.044715, + ), + ), + 0.7978845608028654, + ), + ), + 1, + ), + ) + + def _hardswish_fusion(computation_call): + return CallFunction( + aten.div, + CallFunction( + aten.mul, + computation_call, + CallFunction( + aten.clamp_max, + CallFunction( + aten.clamp_min, CallFunction(aten.add, computation_call, 3), 0 + ), + 6, + ), + ), + 6, + ) + + def _silu_fusion(computation_call): + return CallFunction( + aten.mul, computation_call, CallFunction(aten.sigmoid, computation_call) + ) + + def _hardsigmoid_fusion(computation_call): + return CallFunction( + aten.div, + CallFunction( + aten.clamp_max, + CallFunction( + aten.clamp_min, CallFunction(aten.add, computation_call, 3), 0 + ), + 6, + ), + 6, + ) + + def _leaky_relu_fusion(computation_call): + return CallFunction( + aten.where, + CallFunction(aten.gt, computation_call, 0), + computation_call, + CallFunction(aten.mul, computation_call, KeywordArg("negative_slope")), + ) + + def _hardtanh_fusion(computation_call): + return CallFunction( + aten.clamp_max, + CallFunction(aten.clamp_min, computation_call, KeywordArg("min_value")), + KeywordArg("max_value"), + ) + + def _combined_fusion(computation_call, elementwise_op): + return CallFunction(elementwise_op, computation_call) + + # binary_op(other, computation_op) + def _binary_fusion_v1(computation_call, binary_fn): + return CallFunction(binary_fn, KeywordArg("other"), computation_call) + + # binary_op(computation_op, other) + def _binary_fusion_v2(computation_call, binary_fn): + return CallFunction(binary_fn, computation_call, KeywordArg("other")) + + def _is_single_computation_op(computation_op): + def fn(match): + computation_nodes = filter_nodes(match.nodes, computation_op) + if len(computation_nodes) < 1: + return False + if any(n.args[-3] != "none" for n in computation_nodes): + return False + return True + + return fn + + def _is_valid_computation_unary_fusion(computation_op, lowp_dtype=None): + def fn(match): + matched = _is_single_computation_op(computation_op)(match) + computation_node = filter_nodes(match.nodes, computation_op)[0] + if lowp_dtype: + conversion_dtype_nodes = filter_nodes( + match.nodes, prims.convert_element_type.default + ) + if len(conversion_dtype_nodes) != 2: + return False + # fusion pattern is always in the form of computation_op + to_float32 + unary_op + to_bfloat16 + if computation_node == conversion_dtype_nodes[0].args[0]: + to_float = conversion_dtype_nodes[0].args[1] + to_lp = conversion_dtype_nodes[1].args[1] + else: + to_float = conversion_dtype_nodes[1].args[1] + to_lp = conversion_dtype_nodes[0].args[1] + matched = matched and to_float == torch.float and to_lp == lowp_dtype + return matched + + return fn + + def _register_unary_fusion_lowering( + pattern, unary_attr, computation_op, lowp_dtype=None + ): + @register_lowering_pattern( + pattern, + extra_check=_is_valid_computation_unary_fusion(computation_op, lowp_dtype), + ) + def fn(match, *args, **kwargs): + computation_args = list(args)[:-3] + 
[ + unary_attr.op_name, + unary_attr.scalars_attr, + unary_attr.algorithm_attr, + ] + return L[computation_op](*computation_args) + + return fn + + def _register_leaky_relu_fusion_lowering(pattern, computation_op, lowp_dtype=None): + @register_lowering_pattern( + pattern, extra_check=_is_single_computation_op(computation_op) + ) + def fn(match, *args, **kwargs): + negative_slope = kwargs.get("negative_slope") + if isinstance(negative_slope, ir.TensorBox): + matched = False + else: # inp is a Number + matched = True + if lowp_dtype: + dtype1 = kwargs.get("to_float") + dtype2 = ( + kwargs.get("to_bf16") + if lowp_dtype == torch.bfloat16 + else kwargs.get("to_fp16") + ) + matched = matched and dtype1 == torch.float and dtype2 == lowp_dtype + computation_args = list(args) + if matched: + computation_args = computation_args[:-3] + [ + "leaky_relu", + [negative_slope], + "", + ] + return L[computation_op](*computation_args) + else: + # computation_args += ["none", [], ""] + out = L[computation_op](*computation_args) + if lowp_dtype: + out = L[prims.convert_element_type.default](out, dtype=torch.float) + out = L[aten.where]( + L[aten.gt](out, 0), + out, + L[aten.mul](out, negative_slope), + ) + if lowp_dtype: + out = L[prims.convert_element_type.default](out, dtype=dtype2) # type: ignore[possibly-undefined] + return out + + return fn + + def _register_hardtanh_fusion_lowering(pattern, computation_op, lowp_dtype=None): + @register_lowering_pattern( + pattern, extra_check=_is_single_computation_op(computation_op) + ) + def fn(match, *args, **kwargs): + min_value = kwargs.get("min_value") + max_value = kwargs.get("max_value") + if isinstance(min_value, ir.TensorBox) or isinstance( + max_value, ir.TensorBox + ): + matched = False + else: # inp is a Number + assert max_value is not None + matched = min_value <= max_value + if lowp_dtype: + dtype1 = kwargs.get("to_float") + dtype2 = ( + kwargs.get("to_bf16") + if lowp_dtype == torch.bfloat16 + else kwargs.get("to_fp16") + ) + matched = matched and dtype1 == torch.float and dtype2 == lowp_dtype + computation_args = list(args) + if matched: + computation_args = computation_args[:-3] + [ + "hardtanh", + [min_value, max_value], + "", + ] + return L[computation_op](*computation_args) + else: + out = L[computation_op](*computation_args) + if lowp_dtype: + out = L[prims.convert_element_type.default](out, dtype=torch.float) + out = L[aten.clamp_max](L[aten.clamp_min](out, min_value), max_value) + if lowp_dtype: + out = L[prims.convert_element_type.default](out, dtype=dtype2) # type: ignore[possibly-undefined] + return out + + return fn + + _binary_attr = { + aten.add: "add", + ops.add: "add", + aten.sub: "sub", + ops.sub: "sub", + } + + def _is_valid_binary(match, fn): + binary_nodes = filter_nodes(match.nodes, fn) + if len(binary_nodes) < 1: + return False + + def get_meta_value(argument: torch.fx.node.Argument): + # Only torch.fx.Node is expected to have meta. + if isinstance(argument, torch.fx.Node): + return argument.meta.get("val", None) + return None + + if any( + not isinstance(get_meta_value(n.args[0]), torch.Tensor) + or not isinstance(get_meta_value(n.args[1]), torch.Tensor) + for n in binary_nodes + ): + return False + # check alpha is one. 
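+        # aten.add/aten.sub take an optional alpha that scales the second operand
+        # (a +/- alpha * b); the fused lowerings below assume the plain case, so
+        # reject any node that carries an explicit alpha other than 1.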
+ if any( + get_arg_value(n, 2, kwarg_name="alpha") != 1.0 + and get_arg_value(n, 2, kwarg_name="alpha") is not None + for n in binary_nodes + ): + return False + if any( + get_meta_value(n.args[0]).size() != get_meta_value(n.args[1]).size() + or get_meta_value(n.args[0]).device != get_meta_value(n.args[1]).device + or get_meta_value(n.args[0]).dtype != get_meta_value(n.args[1]).dtype + for n in binary_nodes + ): + return False + # check args[0] and args[1] is not same + if any(n.args[0] == n.args[1] for n in binary_nodes): + return False + return True + + def _is_valid_computation_binary(computation_op, binary_op, other_index=None): + def fn(match): + if not _is_single_computation_op(computation_op)(match): + return False + if not _is_valid_binary(match, binary_op): + return False + return True + + return fn + + def _get_remaining_users(extra_input_node, compute_node): + # Think about this pattern: + # ReLU + # / \ + # Conv1 + # / \ + # Conv2 + # \ / + # Add + # Although, the extra input node (ReLU) has more than 1 users: Conv1 and Add. + # The Conv1 is the ancestor node of the current compute node (Conv2). + # This indicates that the buffer of ReLU has completed all its usage, + # So we can safely make changes to it now by doing Conv2->Add inplace fusion. + # Take above case as example: + # * extra_input_node: ReLU + # * compute_node: Conv2 + # _get_remaining_users will return the users of extra_input_node which are not + # ancestor node of compute_node. + def _is_ancestor_node(_current_node, _ancestor_node): + # Check whether _ancestor_node is the ancestor node of _current_node + _node_list = [_current_node] + _visited_nodes = set() + while len(_node_list) != 0: + _current_node = _node_list.pop(0) + if _current_node not in _visited_nodes: + _visited_nodes.add(_current_node) + if _current_node == _ancestor_node: + return True + elif isinstance( + _current_node, torch.fx.Node + ) and _current_node.op not in ["placeholder", "output", "get_attr"]: + for input in _current_node.all_input_nodes: + _node_list.append(input) # noqa: PERF402 + return False + + return [ + user + for user in list(extra_input_node.users) + if not _is_ancestor_node(compute_node, user) + ] + + def _is_valid_computation_binary_inplace(computation_op, binary_op, other_index): + def fn(match): + if not _is_valid_computation_binary(computation_op, binary_op)(match): + return False + binary_nodes = filter_nodes(match.nodes, binary_op) + + def _get_compute_node(_binary_node, _other_index): + assert ( + len(_binary_node.all_input_nodes) == 2 + ), "Binary node should have 2 input nodes." 
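+                # of the two inputs, one is the computation op (conv/linear) output
+                # and the other is the extra tensor; return whichever is not at
+                # _other_index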
+ _compute_index = 1 if (_other_index == 0) else 0 + return _binary_node.args[_compute_index] + + def _other_input_not_inplaceable(_binary_node, _other_index): + _compute_node = _get_compute_node(_binary_node, _other_index) + return ( + len( + _get_remaining_users( + _binary_node.args[_other_index], _compute_node + ) + ) + > 1 + or _binary_node.args[_other_index] == _compute_node.args[0] + ) + + if any(_other_input_not_inplaceable(n, other_index) for n in binary_nodes): + return False + if any( + n.args[other_index].op in ["placeholder", "output"] + for n in binary_nodes + ): + return False + return True + + return fn + + def _register_binary_unary_fusion_lowering( + pattern, + computation_op, + binary_op, + fusion_op, + unary_attr=None, + ): + @register_lowering_pattern( + pattern, extra_check=_is_valid_computation_binary(computation_op, binary_op) + ) + def fn(match, *args, **kwargs): + other = kwargs.get("other") + assert isinstance(other, ir.TensorBox) + binary_attr = _binary_attr[binary_op] + args_list = list(args) + computation_args = [args_list[0], other] + args_list[1:-3] + [binary_attr] + if len(args_list) > 6: + if unary_attr is not None: + computation_args += [ + 1.0, + unary_attr.op_name, + unary_attr.scalars_attr, + unary_attr.algorithm_attr, + ] + else: + computation_args += [1.0, None, [], None] + return L[fusion_op](*computation_args) + + return fn + + def _can_be_inplace(_other): + if isinstance(_other.data, ir.View): + return _can_be_inplace(_other.data) + else: + return not ( + isinstance(_other.data, ir.ReinterpretView) + or isinstance( + _other.get_layout(), (ir.MutationLayout, ir.AliasedLayout) + ) + ) + + def _register_binary_unary_maybe_inplace_fusion_lowering( + pattern, + computation_op, + binary_op, + inplace_fusion_op, + outplace_fusion_op, + unary_attr=None, + other_index=None, + ): + @register_lowering_pattern( + pattern, + extra_check=_is_valid_computation_binary_inplace( + computation_op, binary_op, other_index + ), + ) + def fn(match, *args, **kwargs): + other = kwargs.get("other") + assert isinstance(other, ir.TensorBox) + binary_attr = _binary_attr[binary_op] + args_list = list(args) + computation_args = [args_list[0], other] + args_list[1:-3] + [binary_attr] + if len(args_list) > 6: + if unary_attr is not None: + computation_args += [ + 1.0, + unary_attr.op_name, + unary_attr.scalars_attr, + unary_attr.algorithm_attr, + ] + else: + computation_args += [1.0, None, [], None] + # Make sure the other is not an alias or mutation(fx side doesn't has such info). 
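+            # realize() forces `other` into a concrete buffer so _can_be_inplace can
+            # inspect its layout; views/aliases fall back to the out-of-place fusion op.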
+ other.realize() + if not _can_be_inplace(other): + return L[outplace_fusion_op](*computation_args) + return L[inplace_fusion_op](*computation_args) + + return fn + + computation_ops = [ + mkldnn._convolution_pointwise.default, + mkldnn._linear_pointwise.default, + mkldnn._convolution_transpose_pointwise.default, + ] + + class UnaryAttr: + def __init__(self, op_name: str, scalars_attr=None, algorithm_attr=None): + self.op_name = op_name + self.scalars_attr = scalars_attr if scalars_attr else [] + self.algorithm_attr = algorithm_attr if algorithm_attr else "" + + def _register_unary_fusion(): + computation_call_fns = [_conv_call, _linear_call, _conv_transpose_call] + + def _unary_fusion_patterns(lowp_dtype): + replacement_unary_fusion_patterns = { + UnaryAttr("gelu", algorithm_attr="tanh"): [ + _unary_fusion_pattern(_gelu_fusion_2, call_fn, 4, lowp_dtype) + for call_fn in computation_call_fns + ], + UnaryAttr("gelu", algorithm_attr="none"): [ + _unary_fusion_pattern(_gelu_fusion_1, call_fn, 2, lowp_dtype) + for call_fn in computation_call_fns + ], + UnaryAttr("hardswish"): [ + _unary_fusion_pattern(_hardswish_fusion, call_fn, 2, lowp_dtype) + for call_fn in computation_call_fns + ], + UnaryAttr("hardsigmoid"): [ + _unary_fusion_pattern(_hardsigmoid_fusion, call_fn, 1, lowp_dtype) + for call_fn in computation_call_fns + ], + UnaryAttr("swish"): [ + _unary_fusion_pattern(_silu_fusion, call_fn, 2, lowp_dtype) + for call_fn in computation_call_fns + ], + } + if not lowp_dtype: + call_user1 = [call_fn(users=1) for call_fn in computation_call_fns] + replacement_unary_fusion_patterns.update( + { + UnaryAttr("relu"): [ + _combined_fusion(u, aten.relu) for u in call_user1 + ], + UnaryAttr("sigmoid"): [ + _combined_fusion(u, aten.sigmoid) for u in call_user1 + ], + UnaryAttr("tanh"): [ + _combined_fusion(u, aten.tanh) for u in call_user1 + ], + } + ) + + return replacement_unary_fusion_patterns + + for lowp_dtype in [torch.bfloat16, torch.float16, None]: + replace_patterns = _unary_fusion_patterns(lowp_dtype) + for unary_attr, patterns in replace_patterns.items(): + _register_unary_fusion_lowering( + patterns[0], unary_attr, computation_ops[0], lowp_dtype + ) + _register_unary_fusion_lowering( + patterns[1], unary_attr, computation_ops[1], lowp_dtype + ) + _register_unary_fusion_lowering( + patterns[2], unary_attr, computation_ops[2], lowp_dtype + ) + _leaky_relu_patterns = [ + _unary_fusion_pattern(_leaky_relu_fusion, call_fn, 3, lowp_dtype) + for call_fn in computation_call_fns + ] + for pattern, computation_op in zip(_leaky_relu_patterns, computation_ops): + _register_leaky_relu_fusion_lowering( + pattern, computation_op, lowp_dtype + ) + hardtanh_patterns = [ + _unary_fusion_pattern(_hardtanh_fusion, call_fn, 1, lowp_dtype) + for call_fn in computation_call_fns + ] + for pattern, computation_op in zip(hardtanh_patterns, computation_ops): + _register_hardtanh_fusion_lowering(pattern, computation_op, lowp_dtype) + + def _register_inplace_fusion(): + binary_ops = [aten.add, ops.add] + inplace_fusion_op = mkldnn._convolution_pointwise_.binary + outplace_fusion_op = mkldnn._convolution_pointwise.binary + conv_call = _conv_call(users=1) + conv_op = computation_ops[0] + for binary_op in binary_ops: + binary_v1 = _binary_fusion_v1(conv_call, binary_op) + binary_unary_v1 = _combined_fusion(binary_v1, aten.relu) + _register_binary_unary_maybe_inplace_fusion_lowering( + binary_unary_v1, + conv_op, + binary_op, + inplace_fusion_op, + outplace_fusion_op, + other_index=0, + unary_attr=UnaryAttr("relu"), + ) + 
_register_binary_unary_maybe_inplace_fusion_lowering( + binary_v1, + conv_op, + binary_op, + inplace_fusion_op, + outplace_fusion_op, + other_index=0, + ) + binary_v2 = _binary_fusion_v2(conv_call, binary_op) + binary_unary_v2 = _combined_fusion(binary_v2, aten.relu) + _register_binary_unary_maybe_inplace_fusion_lowering( + binary_unary_v2, + conv_op, + binary_op, + inplace_fusion_op, + outplace_fusion_op, + other_index=1, + unary_attr=UnaryAttr("relu"), + ) + _register_binary_unary_maybe_inplace_fusion_lowering( + binary_v2, + conv_op, + binary_op, + inplace_fusion_op, + outplace_fusion_op, + other_index=1, + ) + + def _register_binary_fusion(): + binary_ops = [aten.add, ops.add, aten.sub, ops.sub] + fusion_ops = [ + mkldnn._convolution_pointwise.binary, + mkldnn._linear_pointwise.binary, + ] + _computation_user_1 = [_conv_call(users=1), _linear_call(users=1)] + for computation_call, computation_op, fusion_op in zip( + _computation_user_1, computation_ops[:-1], fusion_ops + ): + for binary_op in binary_ops: + pattern = _binary_fusion_v2(computation_call, binary_op) + _register_binary_unary_fusion_lowering( + pattern, computation_op, binary_op, fusion_op + ) + + for binary_op in [aten.add, ops.add]: + pattern = _binary_fusion_v1(computation_call, binary_op) + _register_binary_unary_fusion_lowering( + pattern, computation_op, binary_op, fusion_op + ) + + def _register_binary_unary_fusion(): + binary_ops = [aten.add, ops.add, aten.sub, ops.sub] + fusion_ops = [mkldnn._convolution_pointwise.binary] + _computation_user_1 = [_conv_call(users=1)] + for computation_call, computation_op, fusion_op in zip( + _computation_user_1, computation_ops[:-1], fusion_ops + ): + for binary_op in binary_ops: + pattern_v1 = _combined_fusion( + _binary_fusion_v2(computation_call, binary_op), aten.relu + ) + _register_binary_unary_fusion_lowering( + pattern_v1, + computation_op, + binary_op, + fusion_op, + unary_attr=UnaryAttr("relu"), + ) + for binary_op in [aten.add, ops.add]: + pattern_v2 = _combined_fusion( + _binary_fusion_v1(computation_call, binary_op), aten.relu + ) + _register_binary_unary_fusion_lowering( + pattern_v2, + computation_op, + binary_op, + fusion_op, + unary_attr=UnaryAttr("relu"), + ) + + def _recover_linear(): + # convert reshape+linear+reshape to a single linear for applying fusion path. 
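+        # Illustrative shapes for this rewrite (hypothetical, not traced from a
+        # real model): x [2, 16, 64] -reshape-> [32, 64] -linear-> [32, 128]
+        # -reshape-> [2, 16, 128]. Because the reshapes only flatten/unflatten
+        # the leading dims (2 * 16 == 32), the whole chain can be replaced by a
+        # single mkldnn._linear_pointwise call on the original 3-D input.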
+ @register_freezing_graph_pattern( + CallFunction( + aten.reshape.default, + CallFunction( + mkldnn._linear_pointwise.default, + CallFunction( + aten.reshape.default, + Arg(), + KeywordArg("reshape_1"), + _users=MULTIPLE, + ), + Arg(), + Arg(), + Arg(), + Arg(), + Arg(), + ), + KeywordArg("reshape_2"), + ), + pass_number=1, + ) + def reshape_linear_reshape_pattern(match, *args, **kwargs): + reshape_1 = kwargs.get("reshape_1") + reshape_2 = kwargs.get("reshape_2") + assert isinstance(reshape_1, list) + assert isinstance(reshape_2, list) + assert len(reshape_1) == 2 + dynamic_shapes = not all( + isinstance(x, int) for x in ([reshape_1[0]] + reshape_2[:-1]) + ) + + graph = match.graph + reshape_2_node = match.output_node() + linear_input_node = reshape_2_node.args[0].args[0].args[0] + # check linear's input's shape[:-1] == reshape_2[:-1] + # and check product(reshape_2[:-1]) == reshape_1[0] + if dynamic_shapes: + # TODO: Haozhe investigate how add guard here + return + else: + can_remove_reshape = linear_input_node.meta.get("val").shape[ + :-1 + ] == torch.Size(reshape_2[:-1]) + can_remove_reshape = can_remove_reshape and ( + reduce(operator.mul, reshape_2[:-1]) == reshape_1[0] + ) + + if can_remove_reshape: + repl = graph.call_function(mkldnn._linear_pointwise.default, args) + repl.meta.update(reshape_2_node.meta) + reshape_2_node.replace_all_uses_with(repl) + old_linear_node = reshape_2_node.args[0] + reshape_1_node = old_linear_node.args[0] + graph.erase_node(reshape_2_node) + graph.erase_node(old_linear_node) + if len(reshape_1_node.users) == 0: + graph.erase_node(reshape_1_node) + + def is_linear_add_bias(match): + add_node = match.output_node() + linear_node = add_node.args[0] + weight_meta = linear_node.args[1].meta.get("val") + bias_meta = add_node.args[1].meta.get("val") + if weight_meta is None or bias_meta is None: + return False + return ( + linear_node.args[2] is None + and bias_meta.dim() == 1 + and bias_meta.size(0) == weight_meta.size(0) + ) + + # convert linear+bias to a single linear for applying fusion path. 
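+        # Sketch of the match handled below (illustrative): aten.add.Tensor(
+        # mkldnn._linear_pointwise(x, w, None, ...), bias) with a 1-D bias of
+        # size w.size(0) is folded into mkldnn._linear_pointwise(x, w, bias, ...);
+        # is_linear_add_bias guards that the original linear had no bias.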
+ @register_freezing_graph_pattern( + CallFunction( + aten.add.Tensor, + CallFunction(mkldnn._linear_pointwise.default, *_linear_args), + Arg(), + ), + pass_number=1, + extra_check=is_linear_add_bias, + ) + def linear_bias_pattern(match, *args): + graph = match.graph + add_node = match.output_node() + linear_node = add_node.args[0] + new_args = list(linear_node.args) + new_args[2] = add_node.args[1] + repl = graph.call_function( + mkldnn._linear_pointwise.default, tuple(new_args) + ) + repl.meta.update(add_node.meta) + add_node.replace_all_uses_with(repl) + match.erase_nodes(graph) + + def _is_packable_mkldnn_rnn_layer(match): + lstm_node = match.output_node() + POS_WEIGHTS = [1, 2] + POS_INPUTS = [0, 5, 6] + POS_ARGS = POS_WEIGHTS + POS_INPUTS + # Weights should be Constant + if any( + lstm_node.args[POS_WEIGHT].op != "get_attr" for POS_WEIGHT in POS_WEIGHTS + ): + return False + + # Meta info for weights and inputs should be available + if any(lstm_node.args[POS_ARG].meta.get("val") is None for POS_ARG in POS_ARGS): + return False + + # Check device + if any( + lstm_node.args[POS_ARG].meta.get("val").device.type != "cpu" + for POS_ARG in POS_ARGS + ): + return False + + # Check dtype + if any( + lstm_node.args[POS_ARG].meta.get("val").dtype == torch.bfloat16 + and not mkldnn._is_mkldnn_bf16_supported() + for POS_ARG in POS_ARGS + ): + return False + if any( + lstm_node.args[POS_ARG].meta.get("val").dtype == torch.float16 + and not mkldnn._is_mkldnn_fp16_supported() + for POS_ARG in POS_ARGS + ): + return False + + return True + + def _is_packable_convolution(match): + """ + Check if the node is supported for MKLDNN convolution. + """ + conv_node = match.output_node() + input_meta_value = conv_node.args[0].meta.get("val") + weight_meta_value = conv_node.args[1].meta.get("val") + if input_meta_value is None or weight_meta_value is None: + return False + input_size = input_meta_value.shape + if conv_node.args[1].op != "get_attr": + return False + for meta_value in [input_meta_value, weight_meta_value]: + if ( + meta_value is None + or meta_value.device.type != "cpu" + or meta_value.dim() != 4 + ): + return False + if ( + input_meta_value.dtype == torch.bfloat16 + or weight_meta_value.dtype == torch.bfloat16 + ): + if not mkldnn._is_mkldnn_bf16_supported(): + return False + if ( + input_meta_value.dtype == torch.float16 + or weight_meta_value.dtype == torch.float16 + ): + if not mkldnn._is_mkldnn_fp16_supported(): + return False + is_transposed = conv_node.args[-3] + if is_transposed: + # TODO: Support dynamic shape case for MKLDNN conv transpose. + if has_free_symbols(input_size): + return False + groups = conv_node.args[-1] + in_channels = weight_meta_value.size(0) + # doesn't support group_depthwise_conv_transpose. + if groups > 1 and groups == in_channels: + return False + # Port from: aten/src/ATen/native/Convolution.cpp:is_output_padding_big + output_paddings = conv_node.args[-2] + strides = conv_node.args[3] + if any( + output_padding >= stride + for output_padding, stride in zip(output_paddings, strides) + ): + return False + return True + + def _is_packable_linear(match): + """ + Check if the node is supported for MKLDNN linear. 
+ """ + linear_node = match.output_node() + # weight_idx is 1 for aten.mm and is 2 for aten.addmm + weight_idx = 2 if linear_node.target == aten.addmm.default else 1 + if linear_node.args[weight_idx].op != "get_attr": + return False + input_meta_value = linear_node.args[weight_idx - 1].meta.get("val") + weight_meta_value = linear_node.args[weight_idx].meta.get("val") + if input_meta_value is None or weight_meta_value is None: + return False + batch_size = input_meta_value.shape[0] + is_lp_weight = weight_meta_value.dtype in ( + torch.bfloat16, + torch.float16, + ) + # on x86, for fp32, mkl should be enabled and batch_size should not be a free symbol. + # on aarch64, use mkldnn op for fp32 as well if acl is enabled + if ( + not is_lp_weight + and not mkldnn._is_mkldnn_acl_supported() + and ((not torch._C.has_mkl) or has_free_symbols(batch_size)) + ): + return False + for meta_value in [input_meta_value, weight_meta_value]: + if ( + meta_value is None + or meta_value.device.type != "cpu" + or meta_value.dim() != 2 + ): + return False + if weight_idx == 2: + bias_meta_value = linear_node.args[0].meta.get("val") + if ( + bias_meta_value is None + or meta_value.device.type != "cpu" + or bias_meta_value.dim() != 1 + or bias_meta_value.size(0) != weight_meta_value.size(1) + ): + return False + + if ( + input_meta_value.dtype == torch.bfloat16 + or weight_meta_value.dtype == torch.bfloat16 + ): + if not mkldnn._is_mkldnn_bf16_supported(): + return False + if ( + input_meta_value.dtype == torch.float16 + or weight_meta_value.dtype == torch.float16 + ): + if not mkldnn._is_mkldnn_fp16_supported(): + return False + return True + + _aten_conv_args = ( + Arg(), + Arg(), + Arg(), + Arg(), + Arg(), + Arg(), + KeywordArg("is_transposed"), + Arg(), + Arg(), + ) + + _aten_mkldnn_rnn_layer_args = ( + Arg(), # input + Arg(), # weight0 + Arg(), # weight1 + Arg(), # weight2 + Arg(), # weight3 + Arg(), # hx_ + Arg(), # cx_ + KeywordArg("reverse"), # reverse + Arg(), # batch_sizes + Arg(), # mode + Arg(), # hidden_size + Arg(), # num_layers + Arg(), # has_biases + Arg(), # bidirectional + Arg(), # batch_first + Arg(), # train + ) + + def _register_weight_pack_pass(): + @register_freezing_graph_pattern( + CallFunction(aten.convolution.default, *_aten_conv_args), + extra_check=_is_packable_convolution, + ) + def convolution(match, *args, **kwargs): + is_transposed = kwargs.get("is_transposed") + assert isinstance(is_transposed, bool) + graph = match.graph + conv_node = match.output_node() + input_size = conv_node.args[0].meta.get("val").shape + with graph.inserting_before(conv_node): + constant_args = [args[4], args[3], args[5], args[-1]] + packed_weight_op = mkldnn._reorder_convolution_weight + packed_conv_op = mkldnn._convolution_pointwise.default + if is_transposed: + constant_args.insert(1, args[-2]) # output_padding + packed_weight_op = mkldnn._reorder_convolution_transpose_weight + packed_conv_op = mkldnn._convolution_transpose_pointwise.default + if not has_free_symbols(input_size): + packed_weight_inputs = ( + (args[1],) + tuple(constant_args) + (input_size,) + ) + packed_weight_node = graph.create_node( + "call_function", packed_weight_op, args=packed_weight_inputs + ) + else: + assert not is_transposed + # For dynamic shape case, we need to pack weight in runtime. 
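+                    # i.e. the original weight node is passed through unchanged and
+                    # no ahead-of-time reorder node is inserted into the graph.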
+ packed_weight_node = args[1] + packed_conv_inputs = ( + (args[0], packed_weight_node, args[2]) + + tuple(constant_args) + + ("none", [], "") + ) + packed_conv_node = graph.create_node( + "call_function", packed_conv_op, tuple(packed_conv_inputs) + ) + conv_node.replace_all_uses_with(packed_conv_node) + packed_conv_node.meta.update(conv_node.meta) + graph.erase_node(conv_node) + + @register_freezing_graph_pattern( + CallFunction(aten.mkldnn_rnn_layer.default, *_aten_mkldnn_rnn_layer_args), + extra_check=_is_packable_mkldnn_rnn_layer, + ) + def mkldnn_rnn_layer(match, *args, **kwargs): + def get_item(graph, node, index): + return graph.call_function(operator.getitem, (node, index)) + + graph = match.graph + lstm_node = match.output_node() + input = args[0] + weight0, weight1 = args[1:3] + reverse = kwargs.get("reverse") + packed_lstm_op = aten.mkldnn_rnn_layer.default + hidden_size = args[9] + has_biases = args[11] + batch_first = args[13] + with graph.inserting_before(lstm_node): + packed_weight_op = mkldnn._reorder_mkldnn_rnn_layer_weight.default + packed_weight_inputs = ( + weight0, + weight1, + hidden_size, + reverse, + has_biases, + batch_first, + ) + packed_weight_node = graph.create_node( + "call_function", packed_weight_op, packed_weight_inputs, {}, "name" + ) + packed_weight_items = [ + get_item(graph, packed_weight_node, i) for i in range(2) + ] + pack_lstm_inputs = ( + args[0], + *packed_weight_items, + args[3], + args[4], + args[5], + args[6], + reverse, + *args[7:], + ) + + packed_lstm_node = graph.create_node( + "call_function", packed_lstm_op, args=pack_lstm_inputs + ) + lstm_node.replace_all_uses_with(packed_lstm_node) + packed_lstm_node.meta.update(lstm_node.meta) + graph.erase_node(lstm_node) + + @register_freezing_graph_pattern( + CallFunction(aten.addmm.default, Arg(), Arg(), Arg()), + extra_check=_is_packable_linear, + ) + @register_freezing_graph_pattern( + CallFunction(aten.mm.default, Arg(), Arg()), + extra_check=_is_packable_linear, + ) + def linear(match, *args, **kwargs): + graph = match.graph + linear_node = match.output_node() + input = args[0] if linear_node.target == aten.mm.default else args[1] + bias = None if linear_node.target == aten.mm.default else args[0] + weight = args[1] if linear_node.target == aten.mm.default else args[2] + with graph.inserting_before(linear_node): + transpose_weight_node = graph.create_node( + "call_function", aten.permute.default, (weight, (1, 0)) + ) + weight_dtype = weight.meta.get("val").dtype + is_lp_weight = weight_dtype in ( + torch.bfloat16, + torch.float16, + ) + batch_size = input.meta.get("val").shape[0] + if has_free_symbols(batch_size): + assert ( + is_lp_weight or mkldnn._is_mkldnn_acl_supported() + ), f"only bf16/fp16 weight prepacking supports dynamic shape inputs but got {weight_dtype}" + # For bfloat16 dynamic shape path, using input size hint to pack weight for a better performance. + packed_weight_inputs = ( + transpose_weight_node, + batch_size.node.shape_env.size_hint(batch_size.node.expr) + if has_free_symbols(batch_size) + else batch_size, + ) + packed_weight_op = ( + mkldnn._reorder_linear_weight + if (is_lp_weight or mkldnn._is_mkldnn_acl_supported()) + else torch.ops.mkl._mkl_reorder_linear_weight + ) + packed_weight_node = graph.create_node( + "call_function", packed_weight_op, args=packed_weight_inputs + ) + + packed_linear_inputs: Tuple[Any, ...] 
= (input, packed_weight_node) + if is_lp_weight or mkldnn._is_mkldnn_acl_supported(): + packed_linear_inputs += (bias, "none", [], "") + packed_linear_op = mkldnn._linear_pointwise.default + else: + packed_linear_inputs += (transpose_weight_node, bias, batch_size) + packed_linear_op = torch.ops.mkl._mkl_linear + packed_linear_node = graph.create_node( + "call_function", packed_linear_op, packed_linear_inputs + ) + linear_node.replace_all_uses_with(packed_linear_node) + packed_linear_node.meta.update(linear_node.meta) + graph.erase_node(linear_node) + + def _eliminate_duplicate_packed_nodes(gm): + """ + Combine packed weight nodes with the same inputs to reduce memory usage. + for example: + class Model(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(32, 32, bias=True) + + def forward(self, x): + return self.linear(self.linear(x)) + + the above's packed weight nodes are duplicate if two linear calls have same input size. + """ + if not (torch.backends.mkldnn.enabled and torch.backends.mkldnn.is_available()): + return gm + + packed_weight_ops = [ + torch._C._nn.mkldnn_reorder_conv2d_weight, + mkldnn._reorder_convolution_transpose_weight, + mkldnn._reorder_linear_weight, + mkldnn._reorder_mkldnn_rnn_layer_weight, + ] + if torch._C.has_mkl: + packed_weight_ops.append(torch.ops.mkl._mkl_reorder_linear_weight) + + for node in gm.graph.nodes: + if node.target in packed_weight_ops and len(node.args[0].users) > 1: + for user_node in list(node.args[0].users.keys()): + if ( + user_node.target == node.target + and user_node != node + and user_node.args == node.args + ): + user_node.replace_all_uses_with(node) + gm.graph.erase_node(user_node) + + @functools.lru_cache(None) + def _mkldnn_fusion_init(): + # TODO: aarch64: enable op fusion for acl once it supports fused operators. Disabling it for now. + # Otherwise even the matmul or innerproduct can not be accelerated with acl + if ( + torch.backends.mkldnn.enabled + and torch.backends.mkldnn.is_available() + and not torch.ops.mkldnn._is_mkldnn_acl_supported() + ): + _register_unary_fusion() + _register_inplace_fusion() + _register_binary_unary_fusion() + _register_binary_fusion() + _register_quantization_lowerings() + + @functools.lru_cache(None) + def _mkldnn_weight_pack_init(): + if torch.backends.mkldnn.enabled and torch.backends.mkldnn.is_available(): + _register_weight_pack_pass() + _recover_linear() + _register_quantization_weight_pack_pass() diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/numeric_utils.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/numeric_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e08cef8c47ed39a2cbac29b0bcc4aaaa479c37a0 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/numeric_utils.py @@ -0,0 +1,210 @@ +import gc +import logging +import os +import random +import traceback + +import numpy + +import torch +import torch.optim as optim + +from .. import config + +logger: logging.Logger = logging.getLogger(__name__) + +MAIN_RANDOM_SEED = 1337 + +# Set the CUBLAS_WORKSPACE_CONFIG environment variable +os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" + + +# If the two forward functions involve any non-deterministic operations, +# such as certain types of parallelism or asynchronous execution, +# this can also lead to different outputs. 
+def set_deterministic() -> None: + """Make torch manual seed deterministic.""" + + torch.manual_seed(MAIN_RANDOM_SEED) + random.seed(MAIN_RANDOM_SEED) + numpy.random.seed(MAIN_RANDOM_SEED) + torch.use_deterministic_algorithms(True) + + +def clean_memory() -> None: + """Clean memory to avoid OOM.""" + gc.collect() + torch.cuda.empty_cache() + + +# We compare the numerical results before and after pre/post grad fx passes +# transformation to make sure the numerical results are the same. +def compare_dict_tensors(dict_base, dict_control, precision): + if len(set(dict_base.keys())) != len(set(dict_control.keys())): + logger.warning("Mismatch keys found before and after pre/post grad fx passes.") + logger.debug("keys before pre/post grad fx passes %s", dict_base.keys()) + logger.debug("keys after pre/post grad fx passes %s", dict_control.keys()) + return False + is_allclose = True + for key in dict_base.keys(): + if key not in dict_control: + logger.warning( + "Mismatch parameter name %s does not exist after pre/post grad fx passes", + key, + ) + # Some parameters have `None`, and not every param has a valid .grad field, we skip them + if dict_base[key] is None or dict_control[key] is None: + continue + if not torch.allclose( + dict_base[key], + dict_control[key], + rtol=precision, + atol=precision, + equal_nan=True, + ): + logger.warning( + "Mismatch parameter values found before and after pre/post grad fx passes." + ) + logger.debug("value before pre/post grad fx passes %s", dict_base[key]) + logger.debug("value after pre/post grad fx passes %s", dict_control[key]) + is_allclose = False + return is_allclose + + +def compare_tuple_tensors(tuple_base, tuple_control, precision): + if len(tuple_base) != len(tuple_control): + logger.warning( + "Mismatch fw output length. before transformation: %s, after transformation: %s", + len(tuple_base), + len(tuple_control), + ) + return False + is_allclose = True + for i in range(len(tuple_base)): + # Some parameters have `None`, we skip them + if tuple_base[i] is None or tuple_control[i] is None: + continue + if not torch.allclose( + tuple_base[i], + tuple_control[i], + rtol=precision, + atol=precision, + equal_nan=True, + ): + logger.debug( + "forward output before pre/post grad fx passes %s", tuple_base[i] + ) + logger.debug( + "forward output after pre/post grad fx passes %s", tuple_control[i] + ) + is_allclose = False + return is_allclose + + +def compare_parameters(model_base, model_control, precision): + return compare_dict_tensors( + dict(model_base.named_parameters()), + dict(model_control.named_parameters()), + precision, + ) + + +def compare_forward_output(pred_base, pred_control, precision): + return compare_tuple_tensors( + pred_base, + pred_control, + precision, + ) + + +def compare_gradients(model_base, model_control, precision): + grad_base = {key: param.grad for key, param in model_base.named_parameters()} + grad_pt2 = {key: param.grad for key, param in model_control.named_parameters()} + return compare_dict_tensors( + grad_base, + grad_pt2, + precision, + ) + + +def run_model( + model_base, model_control, model_input, num_iterations=10, precision=1e-4 +): + clean_memory() + for i in range(num_iterations): + logger.info("start %s iteration", i) + set_deterministic() + pred_base = model_base(*model_input) + set_deterministic() + pred_control = model_control(*model_input) + + res = compare_parameters(model_base, model_control, precision) + logger.info("compare parameters. 
Numerical result : %s", res) + + res = compare_forward_output(pred_base, pred_control, precision) + logger.info("compare loss/predict. Numerical result : %s", res) + # tensor may not have a grad_fn + try: + _ = pred_base[0].sum().backward(retain_graph=True) + _ = pred_control[0].sum().backward(retain_graph=True) + res = compare_gradients(model_base, model_control, precision) + logger.info("compare param grad. Numerical result : %s", res) + except Exception as e: + logger.exception("Exception %s when compare gradients", e) + traceback.print_exc() + + if config.fx_passes_numeric_check["requires_optimizer"]: + try: + optimizer_base = optim.SGD( + [param for name, param in model_base.named_parameters()], lr=0.01 + ) + optimizer_base.step() + + optimizer_control = optim.SGD( + [param for name, param in model_control.named_parameters()], lr=0.01 + ) + optimizer_control.step() + + res = compare_parameters(model_base, model_control, precision) + logger.info( + "compare parameters with optimizer added. Numerical result : %s", + res, + ) + except Exception as e: + logger.exception( + "Exception %s when optimizer is added to check parameter names", e + ) + traceback.print_exc() + else: + logger.warning( + "no parameter with optimizer to compare with length %s before transformation" + " and the length %s after transformation", + len(dict(model_base.named_parameters())), + len(dict(model_control.named_parameters())), + ) + + +def numeric_check_if_enabled( + gm_before_fx_passes, + gm_after_fx_passes, + example_inputs, + num_iterations, + precision, +): + # need to topo-sort graphmodule before we run the model, + # otherwise it may fail as refer before def + # fail silently in order not to block the model run + try: + with torch.autograd.set_detect_anomaly(True): + run_model( + gm_before_fx_passes, + gm_after_fx_passes, + example_inputs, + num_iterations=num_iterations, + precision=precision, + ) + except Exception as e: + logger.warning( + "Runtime numeric check failed in pre grad fx passes with error: %s", e + ) + traceback.print_exc() diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/pad_mm.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/pad_mm.py new file mode 100644 index 0000000000000000000000000000000000000000..8e8d8b00a596ef6da021b14e4580bfcad192726e --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/pad_mm.py @@ -0,0 +1,567 @@ +import functools +from typing import List, Optional, Set, Union + +import torch +from torch import Tensor +from torch._inductor import utils +from torch._subclasses.fake_tensor import FakeTensor +from torch.utils._mode_utils import no_dispatch +from torch.utils._triton import has_triton + +from ..pattern_matcher import ( + fwd_only, + joint_fwd_bwd, + Match, + MatchContext, + register_replacement, +) +from ..utils import is_view + +aten = torch.ops.aten + + +# This flag is only used for testing purpose. +# Changing it to True will ignore comparing do_bench times +# between original pattern and padded one. 
+_skip_do_bench_times = False + + +def fetch_fake_tensors(match, kwarg_names) -> List[Tensor]: + kwargs = match.kwargs + return [kwargs[name].meta["val"] for name in kwarg_names] + + +def unwrap_fake_args(*arg_names): + def decorator(func): + def wrapper(match): + fake_tensors = fetch_fake_tensors(match, arg_names) + return func(*fake_tensors) + + return wrapper + + return decorator + + +def get_alignment_size(x: Tensor) -> int: + if x.dtype == torch.float16 or x.dtype == torch.half or x.dtype == torch.bfloat16: + return 8 + elif x.dtype == torch.float32 or x.dtype == torch.float: + return 4 + else: + return 0 + + +def check_device(a: Tensor, b: Tensor) -> bool: + return a.is_cuda and b.is_cuda + + +def check_dtype(a: Tensor, b: Tensor) -> bool: + return a.is_floating_point() and b.is_floating_point() + + +def _result_layout_affects_graph_output(match: Match) -> bool: + """ + Check if the matched GEMM operation potentially affects the graph output strides. + returns True if the matched op's output buffer does not pass through functions which certainly + redefine the memory layout before being part of the graph output. + """ + + if match.ctx is not None: + assert isinstance(match.ctx, MatchContext) + search_node: torch.fx.Node = match.output_node() + else: + return True + + assert search_node is not None + seen: Set[torch.fx.Node] = set() + + def find_output(node: torch.fx.Node, is_start_node=False): + if not isinstance(node, torch.fx.Node): + return False + if node in seen: + return False + seen.add(node) + if node.op == "output": + return True + if node.op != "call_function": + return False + if not is_start_node and ( + (not isinstance(node.target, torch._ops.OpOverload)) + or (not is_view(node.target)) + ): + return False + if node.users is not None and len(node.users) > 0: + for n in node.users: + if find_output(n): + return True + return False + + return find_output(search_node, True) + + +def should_pad_common( + mat1: Tensor, mat2: Tensor, input: Optional[Tensor] = None +) -> bool: + # It's fine we have symbolic shapes or strides as long as they + # have hints. Later, we will make sure we only pad non-symbolic dimensions. 
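+    # Illustrative example: a size like (s0, 4096) where s0 carries a hint is
+    # accepted, whereas a fully symbolic size such as (s0, s1) is rejected by
+    # valid_shape_and_stride below.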
+ def valid_shape_and_stride(t: Optional[Tensor]) -> bool: + if t is None: + return True + + symbolic_cnt = 0 + for x in t.size(): + if isinstance(x, int): + continue + elif utils.is_symbolic(x): + if not x.node.has_hint(): + return False + symbolic_cnt += 1 + else: + return False + # filter out cases where all dimentions are symbolic + if symbolic_cnt == len(t.size()): + return False + return all( + isinstance(x, int) or (utils.is_symbolic(x) and x.node.has_hint()) + for x in t.stride() + ) + + return ( + torch._inductor.config.shape_padding + and check_device(mat1, mat2) + and check_dtype(mat1, mat2) + and all(valid_shape_and_stride(t) for t in (mat1, mat2, input)) + ) + + +def get_padded_length(x: Union[int, torch.SymInt], alignment_size) -> int: + # we don't pad x if it is symbolic + if isinstance(x, torch.SymInt) or alignment_size == 0 or x % alignment_size == 0: + return 0 + return int((x // alignment_size + 1) * alignment_size) - x + + +def pad_dim(x: Tensor, padded_length: int, dim: int) -> Tensor: + if padded_length == 0: + return x + pad = x.new_zeros(*x.shape[:dim], padded_length, *x.shape[dim + 1 :]) + return torch.cat([x, pad], dim=dim) + + +def addmm_pattern( + input: Tensor, mat1: Tensor, mat2: Tensor, beta: float, alpha: float +) -> Tensor: + return aten.addmm(input, mat1, mat2, beta=beta, alpha=alpha) + + +def should_pad_addmm(match: Match) -> bool: + if ( + torch._inductor.config.keep_output_stride + and _result_layout_affects_graph_output(match) + ): + return False + mat1, mat2, input = fetch_fake_tensors(match, ("mat1", "mat2", "input")) + return should_pad_common(mat1, mat2, input) and should_pad_bench( + mat1, mat2, torch.ops.aten.addmm, input=input + ) + + +def addmm_replace( + input: Optional[Tensor], mat1: Tensor, mat2: Tensor, beta=1.0, alpha=1.0 +) -> Tensor: + m_padded_length = get_padded_length(mat1.shape[0], get_alignment_size(mat1)) + k_padded_length = get_padded_length(mat1.shape[1], get_alignment_size(mat1)) + n_padded_length = get_padded_length(mat2.shape[1], get_alignment_size(mat2)) + + if m_padded_length != 0 or k_padded_length != 0 or n_padded_length != 0: + return pad_addmm( + input, + mat1, + mat2, + m_padded_length, + k_padded_length, + n_padded_length, + beta, + alpha, + ) + + return aten.addmm(input, mat1, mat2, beta=beta, alpha=alpha) + + +def pad_addmm( + input: Optional[Tensor], + mat1: Tensor, + mat2: Tensor, + m_padded_length: int, + k_padded_length: int, + n_padded_length: int, + beta=1.0, + alpha=1.0, +): + # addmm decomp with padding will go through pad_addmm multiple times if multiple dimensions are needed to be padded + if k_padded_length != 0: + mat1 = pad_dim(mat1, k_padded_length, 1) + mat2 = pad_dim(mat2, k_padded_length, 0) + elif n_padded_length != 0: + mat2 = pad_dim(mat2, n_padded_length, 1) + elif m_padded_length != 0: + mat1 = pad_dim(mat1, m_padded_length, 0) + + # the add broadcasts, so we only pad if the dimension != 1 + if input is not None and k_padded_length == 0: + if n_padded_length != 0: + if input.dim() == 2 and input.shape[1] != 1: + input = pad_dim(input, n_padded_length, 1) + elif input.dim() == 1 and input.shape[0] != 1: + input = pad_dim(input, n_padded_length, 0) + elif m_padded_length != 0 and input.dim() == 2 and input.shape[0] != 1: + input = pad_dim(input, m_padded_length, 0) + + if k_padded_length != 0: + return addmm_replace(input, mat1, mat2, beta=beta, alpha=alpha) + elif n_padded_length != 0: + return addmm_replace(input, mat1, mat2, beta=beta, alpha=alpha)[ + :, :-n_padded_length + ] + else: + return 
addmm_replace(input, mat1, mat2, beta=beta, alpha=alpha)[ + :-m_padded_length, : + ] + + +def is_mm_compute_bound(M: int, K: int, N: int, dtype: torch.dtype) -> bool: + denominator = M * K + N * K + M * N + if denominator == 0: + return False + arithmetic_intensity = (M * N * K) / denominator + + # Fails with AMD + try: + machine_balance = ( + 1000 * utils.get_device_tflops(dtype) + ) / utils.get_gpu_dram_gbps() + except Exception: + return True + + # dram_gbps might be underestimating bandwidth because of cache. + # if we estimate machine balance too low we might miss some speedups, + # if we extimate too high there will be unnecessary compilation time increase. + # TODO - finetune coefficient here. As a reference point, Triton mm model assumes + # 80% of reads are in cache and cache is 4x faster than dram_gbps + machine_balance = machine_balance * 0.5 + + return arithmetic_intensity > machine_balance + + +@functools.lru_cache(None) +def get_pad_cache(): + return torch._inductor.codecache.LocalCache() + + +def get_cached_should_pad(key): + return get_pad_cache().lookup(key) + + +def set_cached_should_pad(key, value): + return get_pad_cache().set_value(key, value=value) + + +def should_pad_bench_key( + mat1: Tensor, mat2: Tensor, op, input: Optional[Tensor] = None +) -> str: + def tensor_key(t): + return (t.shape, t.stride(), t.dtype) + + tf32_key = ( + None if mat1.dtype != torch.float32 else torch.backends.cuda.matmul.allow_tf32 + ) + key = ( + tensor_key(mat1), + tensor_key(mat2), + op, + input if input is None else tensor_key(input), + tf32_key, + ) + + return str(key) + + +def should_pad_bench( + mat1: Tensor, mat2: Tensor, op, input: Optional[Tensor] = None +) -> bool: + if not has_triton(): + return False + + do_bench = functools.partial( + utils.do_bench, + warmup=5, + ) + + with no_dispatch(): + if op is torch.ops.aten.mm or op is torch.ops.aten.addmm: + m = mat1.shape[0] + k = mat1.shape[1] + n = mat2.shape[1] + + m_padded_length = get_padded_length(m, get_alignment_size(mat1)) + k_padded_length = get_padded_length(k, get_alignment_size(mat1)) + n_padded_length = get_padded_length(n, get_alignment_size(mat2)) + elif op is torch.ops.aten.bmm: + m = mat1.shape[1] + k = mat1.shape[2] + n = mat2.shape[2] + + m_padded_length = get_padded_length(m, get_alignment_size(mat1)) + k_padded_length = get_padded_length(k, get_alignment_size(mat1)) + n_padded_length = get_padded_length(n, get_alignment_size(mat2)) + else: + return False + + if m_padded_length == k_padded_length == n_padded_length == 0: + return False + + if not is_mm_compute_bound(m, k, n, mat1.dtype): + return False + + # We don't want to look up the cache for cases that are trivially false + # since it does file io + key = should_pad_bench_key(mat1, mat2, op, input) + + cached_pad = get_cached_should_pad(key) + if cached_pad is not None: + return cached_pad + + def realize_symbols(ds): + return [d if isinstance(d, int) else d.node.hint for d in ds] + + def realize_tensor(t): + if isinstance(t, FakeTensor): + size_hints = realize_symbols(t.size()) + stride_hint = realize_symbols(t.stride()) + real_size = ( + sum((d - 1) * s for d, s in zip(size_hints, stride_hint)) + 1 + ) + real_t = torch.randn(real_size, dtype=t.dtype, device=t.device) + return torch.as_strided(real_t, size_hints, stride_hint) + else: + return torch.randn_like(t) + + mat1 = realize_tensor(mat1) + mat2 = realize_tensor(mat2) + if op is torch.ops.aten.bmm or op is torch.ops.aten.mm: + ori_time = do_bench( + lambda: op(mat1, mat2), + ) + else: + if input is not 
None: + input = realize_tensor(input) + ori_time = do_bench( + lambda: op(input, mat1, mat2), + ) + + mat1_pad = torch.randn_like(mat1) + mat2_pad = torch.randn_like(mat2) + + if op is torch.ops.aten.addmm: + input_pad = None + if input is not None and input.is_cuda: + input_pad = torch.randn_like(input) + pad_time = do_bench( + lambda: pad_addmm( + input_pad, + mat1_pad, + mat2_pad, + m_padded_length, + k_padded_length, + n_padded_length, + ), + ) + elif op is torch.ops.aten.mm: + pad_time = do_bench( + lambda: pad_mm( + mat1_pad, + mat2_pad, + m_padded_length, + k_padded_length, + n_padded_length, + ), + ) + else: + pad_time = do_bench( + lambda: pad_bmm( + mat1_pad, + mat2_pad, + m_padded_length, + k_padded_length, + n_padded_length, + ), + ) + + # Shape padding introduces additional memory ops. Based on microbenchmarks, 1.1x represents a reasonable + # tradeoff between performance improvement from shape padding and overhead from additional memory ops + # TODO: Build a learned model which would be better than this heuristic + should_pad = _skip_do_bench_times or ori_time > pad_time * 1.1 + set_cached_should_pad(key, should_pad) + + return should_pad + + +def mm_pattern(mat1: Tensor, mat2: Tensor) -> Tensor: + return aten.mm(mat1, mat2) + + +def should_pad_mm(match: Match) -> bool: + if ( + torch._inductor.config.keep_output_stride + and _result_layout_affects_graph_output(match) + ): + return False + mat1, mat2 = fetch_fake_tensors(match, ("mat1", "mat2")) + return should_pad_common(mat1, mat2) and should_pad_bench( + mat1, mat2, torch.ops.aten.mm + ) + + +def mm_replace(mat1: Tensor, mat2: Tensor) -> Tensor: + m_padded_length = get_padded_length(mat1.shape[0], get_alignment_size(mat1)) + k_padded_length = get_padded_length(mat1.shape[1], get_alignment_size(mat1)) + n_padded_length = get_padded_length(mat2.shape[1], get_alignment_size(mat2)) + + return pad_mm(mat1, mat2, m_padded_length, k_padded_length, n_padded_length) + + +def pad_mm( + mat1: Tensor, + mat2: Tensor, + m_padded_length: int, + k_padded_length: int, + n_padded_length: int, +) -> Tensor: + # mm_replace will go through pad_mm multiple times if multiple dimensions are needed to be padded + if k_padded_length != 0: + mat1 = pad_dim(mat1, k_padded_length, 1) + mat2 = pad_dim(mat2, k_padded_length, 0) + return torch.ops.aten.mm(mat1, mat2) + elif n_padded_length != 0: + mat2 = pad_dim(mat2, n_padded_length, 1) + return torch.ops.aten.mm(mat1, mat2)[:, :-n_padded_length] + else: + mat1 = pad_dim(mat1, m_padded_length, 0) + return torch.ops.aten.mm(mat1, mat2)[:-m_padded_length, :] + + +def bmm_pattern(mat1: Tensor, mat2: Tensor) -> Tensor: + return aten.bmm(mat1, mat2) + + +def should_pad_bmm(match: Match) -> bool: + if ( + torch._inductor.config.keep_output_stride + and _result_layout_affects_graph_output(match) + ): + return False + mat1, mat2 = fetch_fake_tensors(match, ("mat1", "mat2")) + return should_pad_common(mat1, mat2) and should_pad_bench( + mat1, mat2, torch.ops.aten.bmm + ) + + +def bmm_replace(mat1: Tensor, mat2: Tensor) -> Tensor: + m_padded_length = get_padded_length(mat1.shape[1], get_alignment_size(mat1)) + k_padded_length = get_padded_length(mat1.shape[2], get_alignment_size(mat1)) + n_padded_length = get_padded_length(mat2.shape[2], get_alignment_size(mat2)) + + if m_padded_length != 0 or k_padded_length != 0 or n_padded_length != 0: + return pad_bmm(mat1, mat2, m_padded_length, k_padded_length, n_padded_length) + + return aten.bmm(mat1, mat2) + + +def pad_bmm( + mat1: Tensor, + mat2: Tensor, + 
m_padded_length: int, + k_padded_length: int, + n_padded_length: int, +) -> Tensor: + # bmm_replace will go through pad_bmm multiple times if multiple dimensions are needed to be padded + if k_padded_length != 0: + mat1 = pad_dim(mat1, k_padded_length, 2) + mat2 = pad_dim(mat2, k_padded_length, 1) + + return aten.bmm(mat1, mat2) + elif n_padded_length != 0: + mat2 = pad_dim(mat2, n_padded_length, 2) + return aten.bmm(mat1, mat2)[:, :, :-n_padded_length].contiguous() + else: + mat1 = pad_dim(mat1, m_padded_length, 1) + return aten.bmm(mat1, mat2)[:, :-m_padded_length, :].contiguous() + + +@functools.lru_cache(None) +def _pad_mm_init(): + from .joint_graph import patterns + + if torch.cuda.is_available(): + # workaround https://github.com/pytorch/pytorch/issues/97894 + device = "cuda" + else: + device = "cpu" + + # sizes/values dont actually matter for initial trace + # once we get a possible match we re-trace with the actual values and verify the match still holds + + dim2a = functools.partial(torch.empty, (4, 4), device=device, requires_grad=True) + dim2b = functools.partial(torch.empty, (4, 4), device=device, requires_grad=True) + + dim3a = functools.partial(torch.empty, (4, 4, 4), device=device, requires_grad=True) + dim3b = functools.partial(torch.empty, (4, 4, 4), device=device, requires_grad=True) + + dim1a = functools.partial(torch.empty, (4), device=device, requires_grad=True) + + # workaround https://github.com/pytorch/pytorch/issues/97894 + # 0.113377 is a "magic" value that lets us recover the lost input arg relationship + rep = {"beta": 0.213377, "alpha": 0.113377} + + for pattern, replacement, args, workaround, extra_check in [ + ( + mm_pattern, + mm_replace, + [dim2a(), dim2b()], + {}, + should_pad_mm, + ), + ( + bmm_pattern, + bmm_replace, + [dim3a(), dim3b()], + {}, + should_pad_bmm, + ), + ( + addmm_pattern, + addmm_replace, + [dim1a(), dim2a(), dim2b()], + rep, + should_pad_addmm, + ), + ]: + assert isinstance(workaround, dict) # mypy is unable to infer the type properly + register_replacement( + pattern, + replacement, + args, + joint_fwd_bwd, + patterns, + extra_check=extra_check, + scalar_workaround=workaround, + ) + register_replacement( + pattern, + replacement, + args, + fwd_only, + patterns, + extra_check=extra_check, + scalar_workaround=workaround, + ) diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/post_grad.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/post_grad.py new file mode 100644 index 0000000000000000000000000000000000000000..a10893ae1a574896c3e977710fb451dab2ea2b22 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/post_grad.py @@ -0,0 +1,1100 @@ +import copy +import functools +import itertools +import logging +import operator +from collections import Counter, defaultdict +from typing import Any, Dict, List, Optional, Set, Union + +from sympy import Expr + +import torch +import torch._inductor as inductor +import torch.utils._pytree as pytree +from torch import fx +from torch._decomp import register_decomposition +from torch._dynamo.utils import counters, optimus_scuba_log + +from torch._prims_common import is_boolean_dtype, is_expandable_to, is_integer_dtype + +from torch._utils_internal import upload_graph +from torch.fx.experimental.symbolic_shapes import statically_known_true, sym_eq + +from .. 
import config, ir, pattern_matcher +from ..fx_utils import FakeTensorUpdater, get_fake_args_kwargs, get_node_storage + +from ..lowering import lowerings as L +from ..pattern_matcher import ( + _return_true, + Arg, + CallFunction, + CallFunctionVarArgs, + filter_nodes, + get_arg_value, + get_mutation_region_id, + Ignored, + init_once_fakemode, + KeywordArg, + ListOf, + Match, + MULTIPLE, + PatternMatcherPass, + register_graph_pattern, + stable_topological_sort, +) +from ..utils import decode_device, is_pointwise_use +from ..virtualized import V +from .group_batch_fusion import group_batch_fusion_passes +from .reinplace import reinplace_inplaceable_ops + +log = logging.getLogger(__name__) +aten = torch.ops.aten +prims = torch.ops.prims + +# First pass_patterns[0] are applied, then [1], then [2] +pass_patterns = [ + PatternMatcherPass(), + PatternMatcherPass(), + PatternMatcherPass(), +] +# patterns applied only in inference +inference_patterns = PatternMatcherPass() +decompose_mm_pass = PatternMatcherPass() + + +def post_grad_passes(gm: torch.fx.GraphModule, is_inference: bool): + """ + Passes that run on after grad. This is called once on the forwards + graph and once on the backwards graph. + + The IR here has been normalized and functionalized. + """ + if config.dce: + # has some issues with mutation in inference mode + gm.graph.eliminate_dead_code() + + if is_inference and config.reorder_for_locality: + reorder_for_locality(gm.graph) + + fake_tensor_updater = FakeTensorUpdater(gm.graph) + + if config.post_grad_custom_pre_pass is not None: + config.post_grad_custom_pre_pass(gm.graph) + + if config.pattern_matcher: + lazy_init() + inductor_before_change = copy.deepcopy(counters["inductor"]) + group_batch_fusion_passes(gm.graph, pre_grad=False) + if counters["inductor"] != inductor_before_change: + optimus_scuba_log["group_batch_fusion_post_grad"] = upload_graph(gm.graph) + remove_noop_ops(gm.graph) + for patterns in pass_patterns: + patterns.apply(gm.graph) # type: ignore[arg-type] + if is_inference: + inference_patterns.apply(gm.graph) # type: ignore[arg-type] + decompose_mm_pass.apply(gm.graph) # type: ignore[arg-type] + + if config.post_grad_custom_post_pass is not None: + config.post_grad_custom_post_pass(gm.graph) + + stable_topological_sort(gm.graph) + + move_constructors_to_cuda(gm.graph) + + fake_tensor_updater.incremental_update() + + # Keep these last, since they introduces mutation. Look at + # ./fx_passes/README.md for a discussion of mutation invariants. + reinplace_inplaceable_ops(gm.graph) + decompose_auto_functionalized(gm.graph) + + gm.recompile() + gm.graph.lint() + + +@init_once_fakemode +def lazy_init(): + if torch._C._has_mkldnn: + from . import decompose_mem_bound_mm # noqa: F401 + from .mkldnn_fusion import _mkldnn_fusion_init + + _mkldnn_fusion_init() + + +def reorder_for_locality(graph: torch.fx.Graph): + def visit(other_node): + if ( + other_node.op == "call_function" + and other_node.target != operator.getitem + and all((n in seen_nodes) for n in other_node.users) + and get_mutation_region_id(graph, node) + == get_mutation_region_id(graph, other_node) + ): + # move node's producers right before it + node.prepend(other_node) + + seen_nodes = set() + + # only reorder nodes before the first copy_ in the graph. 
+ # copy_ will appear at the end of functionalized graphs when there is mutation on inputs, + # and this reordering doesnt work well with mutation + first_copy = next( + ( + node + for node in graph.nodes + if node.op == "call_function" + and node.target == torch.ops.aten.copy_.default + ), + None, + ) + past_mutating_epilogue = True if first_copy is None else False + + for node in reversed(graph.nodes): + seen_nodes.add(node) + if not past_mutating_epilogue: + past_mutating_epilogue = node is first_copy + continue + + torch.fx.map_arg((node.args, node.kwargs), visit) + + +def register_lowering_pattern(pattern, extra_check=_return_true, pass_number=1): + """ + Register an aten to inductor IR replacement pattern + """ + return pattern_matcher.register_lowering_pattern( + pattern, extra_check, pass_dict=pass_patterns[pass_number] + ) + + +################################################################################ +# Actual patterns below this point. +# Priority of patterns is: +# - later output nodes first +# - order patterns are defined in +################################################################################ + + +def is_valid_mm_plus_mm(match: Match): + *b1, m1, k1 = match.kwargs["mat1"].meta.get("tensor_meta").shape + *b2, k2, n1 = match.kwargs["mat2"].meta.get("tensor_meta").shape + if k1 != k2: + return False + + *b1, m2, k3 = match.kwargs["mat3"].meta.get("tensor_meta").shape + *b2, k4, n2 = match.kwargs["mat4"].meta.get("tensor_meta").shape + if k3 != k4: + return False + + if m1 != m2 or n1 != n2: + return False + + return True + + +@register_lowering_pattern( + CallFunction( + aten.add, + CallFunction(aten.mm, KeywordArg("mat1"), KeywordArg("mat2")), + CallFunction(aten.mm, KeywordArg("mat3"), KeywordArg("mat4")), + ), + extra_check=is_valid_mm_plus_mm, +) +def mm_plus_mm(match: Match, mat1, mat2, mat3, mat4): + return inductor.kernel.mm_plus_mm.tuned_mm_plus_mm(mat1, mat2, mat3, mat4) + + +def cuda_and_enabled_mixed_mm(match): + return (config.use_mixed_mm or config.force_mixed_mm) and getattr( + match.kwargs["mat1"].meta.get("val"), "is_cuda", False + ) + + +def cuda_and_enabled_mixed_mm_and_not_int8(match): + return ( + cuda_and_enabled_mixed_mm(match) + and getattr(match.kwargs["mat1"].meta.get("val"), "is_cuda", False) + and getattr(match.kwargs["mat2"].meta.get("val"), "dtype", torch.int8) + != torch.int8 + ) # bitshift numerics in triton and pytorch don't match for torch.int8 + + +""" + this is intended to be used to unpack a [K,N] int4 tensor from a [K/2, N] uint4x2 tensor + (where the int4 and uint4x2 are represented with int8 and uint8 respectively) + where every other row of the int4 is packed with the row above it as: + uint4x2[k,n] = (8+int4[2*k,n])+(8+int4[2*k+1,n])<<4 + + unpack formulas: + int4[2*k,n]=(uint4x2[k,n] & 0xF) - 8 + int4[2*k+1,n]=(uint4x2[k,n] >> 4) - 8 + + thus matching on unpack formula: + torch.mm(mat1, torch.cat((mat2 & 0xF, mat2>>4),1).reshape(mat2_mm_shape).to(mat2_dtype).sub(8)) + + note: although the unpack formula in pytorch and the triton kernel is designed for a uint8 mat2, the behavior + of the kernel matches the pytorch formula for all dtypes except torch.int8 + where the bitwise numerics in triton do not match those in pytorch. 
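+
+    worked example of the formulas above: packing the int4 pair (-3, 5) gives
+    uint4x2 = (8 + (-3)) + ((8 + 5) << 4) = 5 + 208 = 213; unpacking yields
+    (213 & 0xF) - 8 = -3 and (213 >> 4) - 8 = 5, recovering the original pair.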
+""" + + +@register_lowering_pattern( + CallFunction( + aten.mm.default, + KeywordArg("mat1"), + CallFunction( + aten.sub.Tensor, + CallFunction( + prims.convert_element_type.default, + CallFunction( + aten.reshape.default, + CallFunction( + aten.cat.default, + ListOf( + CallFunction( + aten.bitwise_and.Scalar, + KeywordArg("mat2"), + 0xF, + ), + CallFunction( + aten.__rshift__.Scalar, + KeywordArg("mat2"), + 4, + ), + ), + 1, + ), + KeywordArg("mat2_mm_shape"), + ), + KeywordArg("mat2_dtype"), + ), + 8, + ), + ), + extra_check=cuda_and_enabled_mixed_mm_and_not_int8, +) +def uint4x2_mixed_mm(match: Match, mat1, mat2, mat2_mm_shape, mat2_dtype): + return inductor.kernel.unpack_mixed_mm.tuned_uint4x2_mixed_mm( + mat1, mat2, mat2_mm_shape, mat2_dtype + ) + + +""" + torch.mm(mat1, mat2.to(mat2_dtype)) +""" + + +@register_lowering_pattern( + CallFunction( + aten.mm, + KeywordArg("mat1"), + CallFunction( + prims.convert_element_type.default, + KeywordArg("mat2"), + KeywordArg("mat2_dtype"), + ), + ), + extra_check=cuda_and_enabled_mixed_mm, +) +def mixed_mm(match: Match, mat1, mat2, mat2_dtype): + return inductor.kernel.mm.tuned_mixed_mm(mat1, mat2, mat2_dtype) + + +@register_graph_pattern( + CallFunction( + aten.cumsum.default, + CallFunction( + torch.ops.aten.full.default, + KeywordArg("shape"), + KeywordArg("fill_value"), + dtype=KeywordArg("dtype"), + layout=Ignored(), + device=KeywordArg("device"), + pin_memory=False, + _users=MULTIPLE, + ), + KeywordArg("dim"), + _users=MULTIPLE, + ), + pass_dict=pass_patterns[1], +) +def pointless_cumsum_replacement(match: Match, shape, fill_value, device, dtype, dim): + """Based on a pattern in OPTForCausalLM""" + + if is_integer_dtype(dtype) or is_boolean_dtype(dtype): + # cumsum promotes all integral types to int64 + dtype = torch.int64 + + def repl(*shape): + dim_size = shape[dim] + idx = torch.arange(1, dim_size + 1, device=device, dtype=dtype) + + inter_shape = [1] * len(shape) + inter_shape[dim] = dim_size + return (idx * fill_value).view(inter_shape).expand(shape) + + # only replace the output node, not all nodes + match.nodes = [match.output_node()] + with V.fake_mode: + match.replace_by_example(repl, list(shape)) + + +def shape_of_mm(a, b): + m, _ = a.get_size() + _, n = b.get_size() + return [m, n] + + +@register_lowering_pattern( + CallFunction(aten.cat, ListOf(CallFunction(aten.mm, Arg(), Arg())), Arg()), +) +def cat_mm(match, inputs, dim): + return cat_tuned_op(match, inputs, dim, op=L[aten.mm], shape_of=shape_of_mm) + + +@register_lowering_pattern( + CallFunction( + aten.cat, ListOf(CallFunction(aten.addmm, Arg(), Arg(), Arg())), Arg() + ), +) +def cat_addmm(match, inputs, dim): + def shape_of(bias, a, b): + m, _ = a.get_size() + _, n = b.get_size() + return [m, n] + + return cat_tuned_op(match, inputs, dim, op=L[aten.addmm], shape_of=shape_of) + + +def cat_tuned_op(match, inputs, dim, *, op, shape_of): + """ + Memory planning to remove cat. We can't use the stock memory + planner since autotuning matmuls needs to know the output layout. + """ + if len(inputs) == 1: + return op(*inputs[0]) + + # TODO(jansel): rewrite this as a bmm? 
+ if dim < 0: + dim += len(shape_of(*inputs[0])) + assert dim in (0, 1) + notdim = 1 - dim + + new_size: Optional[Union[List[Expr], List[int]]] = None + offsets_start = [] + offsets_end = [] + + # compute output sizes + for i in range(len(inputs)): + shape = shape_of(*inputs[i]) + if new_size is None: + new_size = shape + else: + new_size[notdim] = V.graph.sizevars.guard_equals( # type: ignore[call-overload] + shape[notdim], new_size[notdim] + ) + new_size[dim] += shape[dim] + offsets_start.append(new_size[dim] - shape[dim]) + offsets_end.append(new_size[dim]) + + assert new_size is not None + dtype = functools.reduce( + torch.promote_types, + [x.get_dtype() for x in itertools.chain.from_iterable(inputs)], + ) + device = inputs[0][0].get_device() + kernel = ir.ConcatKernel( + name=None, + layout=ir.FixedLayout(device, dtype, new_size), + inputs=[], + ) + kernel_tensor = ir.TensorBox.create(kernel) + + for i in range(len(inputs)): + dst = ir.SliceView.create(kernel_tensor, dim, offsets_start[i], offsets_end[i]) + src = op(*inputs[i], layout=dst.get_layout()).data.data + assert isinstance(src, (ir.ExternKernelOut, ir.TemplateBuffer)) + src.layout = ir.AliasedLayout(dst) + kernel.inputs.append(src) + + kernel.name = V.graph.register_buffer(kernel) + kernel.inputs = ir.ConcatKernel.unwrap_storage(kernel.inputs) + return kernel_tensor + + +_cat_1 = CallFunction(aten.cat, Arg(), 1, _users=2) + + +@register_lowering_pattern( + CallFunction( + aten.cat, + [ + _cat_1, + CallFunction( + aten.slice, + _cat_1, + 1, + 0, + KeywordArg("size"), + ), + ], + 1, + ) +) +def cat_slice_cat(match, cat_input, size, dim=1): + """ + This is an example of a more complex pattern where cat_1 is used + multiple times inside the pattern. We fold 2 calls to cat into one. + + Matches: + cat_1: f32[1024, 4077] = torch.ops.aten.cat.default([add_26, primals_217], 1) + slice_1: f32[1024, 4077] = torch.ops.aten.slice.Tensor(cat_1, 0, 0, 9223372036854775807) + slice_2: f32[1024, 19] = torch.ops.aten.slice.Tensor(slice_1, 1, 0, 19) + cat_2: f32[1024, 4096] = torch.ops.aten.cat.default([cat_1, slice_2], 1) + + + Rewrite to: + slice_2 = torch.ops.aten.slice.Tensor(add_26, 1, 0, 19) + cat_2 = torch.ops.aten.cat.default([add_26, primals_217, slice2], 1) + """ + first, *rest = cat_input + # Optimization is optional, because we can just not fold the cat + # size should be within first.get_size()[dim] such that the optimization is valid. + # For negative `end`, we currently fallback to not optimizing. 
+ if size >= 0 and V.graph.sizevars.statically_known_leq(size, first.get_size()[dim]): + # fold 2 cats into 1 cat + return L[aten.cat]( + [ + first, + *rest, + L[aten.slice](first, dim, 0, size), + ], + dim, + ) + else: + # don't expect to hit this case, just fall back + tmp = L[aten.cat](cat_input, dim) + return L[aten.cat]( + [ + tmp, + L[aten.slice](tmp, dim, 0, size), + ], + dim, + ) + + +def is_valid_splitwithsizes_cat(match): + split_nodes = filter_nodes(match.nodes, aten.split_with_sizes) + cat_nodes = filter_nodes(match.nodes, aten.cat) + get_item_nodes = filter_nodes(match.nodes, operator.getitem) + if len(split_nodes) != 1 or len(cat_nodes) != 1: + return False + split_node, cat_node = split_nodes[0], cat_nodes[0] + # The dim of split and cat should match for passthrough + if get_arg_value(split_node, 2, "dim") != get_arg_value(cat_node, 1, "dim"): + return False + get_item_args = { + get_arg_value(get_item_node, 1) for get_item_node in get_item_nodes + } + assert None not in get_item_args + split_sizes = get_arg_value(split_node, 1, "split_sizes") + # All parts of split should be included in the cat + if get_item_args != set(range(len(split_sizes))): + return False + # The order of get_item_args should same with cat_node used. + # For example, if the split_node like split_with_sizes(input, [2, 2, 3], 1), + # the cat node should be like cat([get_item(0), get_item(1), get_item(2)], 1). + cat_items_args_order = [ + get_arg_value(item_node, 1) for item_node in get_arg_value(cat_node, 0) + ] + if cat_items_args_order != list(range(len(split_sizes))): + return False + + return True + + +def same_meta(node1: torch.fx.Node, node2: torch.fx.Node): + """True if two nodes have the same metadata""" + val1 = node1.meta.get("val") + val2 = node2.meta.get("val") + return ( + val1 is not None + and val2 is not None + and statically_known_true(sym_eq(val1.size(), val2.size())) + and val1.layout == val2.layout + and val1.dtype == val2.dtype + and val1.device == val2.device + and ( + val1.layout != torch.strided + or statically_known_true(sym_eq(val1.stride(), val2.stride())) + ) + ) + + +noop_registry: Dict[Any, Any] = {} + + +def register_noop_decomp(targets, nop_arg=0): + def register_fun(cond): + register_decomposition(targets, registry=noop_registry, unsafe=True)( + (cond, nop_arg) + ) + return cond + + return register_fun + + +@register_noop_decomp(aten.slice) +def slice_noop(self, dim=0, start=None, end=None, step=1): + if start is None or end is None: + return False + if start == 0 and end >= 2**63 - 1 and step == 1: + return True + return False + + +@register_noop_decomp(aten.slice_scatter, 1) +def slice_scatter_noop(self, src, dim=0, start=None, end=None, step=1): + if start is None: + start = 0 + if end is None: + end = 2**63 - 1 + if start == 0 and end >= 2**63 - 1 and step == 1: + return True + return False + + +@register_noop_decomp(aten.repeat) +def repeat_noop(self, repeats): + return all(r == 1 for r in repeats) + + +@register_noop_decomp(aten.constant_pad_nd) +def constant_pad_nd(x, padding, fill_value=0): + return all(p == 0 for p in padding) + + +@register_noop_decomp(torch.ops.prims.convert_element_type) +def convert_element_type_noop(x, dtype: torch.dtype): + return x.dtype == dtype + + +@register_noop_decomp(torch.ops.prims.device_put) +def device_put_noop(x, device): + return x.device == decode_device(device) + + +@register_noop_decomp([aten.ceil, aten.floor, aten.round, aten.trunc]) +def int_noop(x): + return is_integer_dtype(x.dtype) + + +@register_noop_decomp([aten.pow]) 
+def pow_noop(a, b): + return isinstance(b, int) and b == 1 + + +@register_noop_decomp([aten.cat], lambda args: args[0][0]) +def cat_noop(inputs, dim=0): + return len(inputs) == 1 + + +@register_noop_decomp(aten.view) +def view_noop(arg, size): + return arg.shape == size + + +# Note, we also always have a check for identical metadata, which is why these +# are safe +@register_noop_decomp([aten.copy], nop_arg=1) +@register_noop_decomp([aten.alias, aten.clone]) +def true_noop(*args, **kwargs): + return True + + +def remove_noop_ops(graph: torch.fx.Graph): + """ + Removes both operations that are essentially aten.clone and operations that are essentially aten.alias from the graph. + """ + inputs = set() + input_storages = set() + output_storages = set() + + for node in graph.nodes: + if node.op == "placeholder": + inputs.add(node) + input_storages.add(get_node_storage(node)) + else: + break + + output_node = next(iter(reversed(graph.nodes))) + assert output_node.op == "output" + for out in output_node.args[0]: + if isinstance(out, torch.fx.Node): + output_storages.add(get_node_storage(out)) + + for node in graph.nodes: + if node.target in noop_registry: + cond, src_index = noop_registry[node.target] + if isinstance(src_index, int): + src = node.args[src_index] + else: + src = src_index(node.args) + if not isinstance(src, torch.fx.Node): + continue + # Don't introduce new aliasing between inputs and outputs. + # See fx_passes/README.md for a discussion of why this is + # necessary. + node_storage = get_node_storage(node) + src_storage = get_node_storage(src) + node_is_view = node_storage == src_storage + if ( + not node_is_view + and node_storage in output_storages + and (src_storage in input_storages or src_storage in output_storages) + ): + continue + + # Even if input and outputs are expected to alias, + # don't make "node is src" True + if ( + node_is_view + and node in output_node.args + and (src in inputs or src in output_node.args) + ): + continue + + is_valid, args, kwargs = get_fake_args_kwargs(node) + if not is_valid: + continue + if same_meta(node, src) and cond(*args, **kwargs): + node.replace_all_uses_with(src) + graph.erase_node(node) + + +def decompose_auto_functionalized(graph): + graph_pass = PatternMatcherPass() + + @register_graph_pattern( + CallFunctionVarArgs(torch.ops.higher_order.auto_functionalized), + pass_dict=graph_pass, + ) + def replacement(match: Match, *args, **kwargs): + from torch._higher_order_ops.auto_functionalize import auto_functionalized_dense + + only_clone_these_tensors = tuple( + match.nodes[0].meta.get("only_clone_these_tensors", []) + ) + + flat_args, spec = pytree.tree_flatten((args, kwargs)) + + # NB: we combine (args, kwargs) into flat args for replacing. + # This is replace_by_example uses make_fx which does not support + # tracing a function with kwargs. 
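+        # For example (hypothetical names), pytree.tree_flatten(((x,), {"out": y}))
+        # returns ([x, y], spec), and pytree.tree_unflatten([x, y], spec) rebuilds
+        # ((x,), {"out": y}) inside decomp below.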
+ def decomp(*flat_args): + args, kwargs = pytree.tree_unflatten(flat_args, spec) + return auto_functionalized_dense(*args, only_clone_these_tensors, **kwargs) + + with V.fake_mode: + match.replace_by_example(decomp, flat_args, run_dce=False) + + graph_pass.apply(graph) + for node in graph.nodes: + if node.target is torch.ops.higher_order.auto_functionalized: + raise AssertionError("auto_functionalized was not removed") + + +@register_lowering_pattern( + CallFunction( + aten.cat, + ListOf( + CallFunction( + operator.getitem, + CallFunction( + aten.split_with_sizes, + KeywordArg("input_"), + Ignored(), + Ignored(), + _users=MULTIPLE, + ), + Ignored(), + ), + ), + Ignored(), + ), + pass_number=2, + extra_check=is_valid_splitwithsizes_cat, +) +def splitwithsizes_cat_replace(match, input_): + return input_ + + +def is_valid_cat_splitwithsizes(match): + cat_nodes = filter_nodes(match.nodes, aten.cat) + split_nodes = filter_nodes(match.nodes, aten.split_with_sizes) + if len(split_nodes) != 1 or len(cat_nodes) != 1: + return False + split_node, cat_node = split_nodes[0], cat_nodes[0] + + # the cat node has other users: can't eliminate + if len(cat_node.users) > 1: + return False + + # the dim of the cat and split should match + dim = get_arg_value(split_node, 2, "dim") + if dim != get_arg_value(cat_node, 1, "dim"): + return False + + cat_inputs = list(get_arg_value(cat_node, 0)) + split_sizes = get_arg_value(split_node, 1, "split_sizes") + # the number of input tensors in cat and the + # length of the split sizes should match + if len(cat_inputs) != len(split_sizes): + return False + + for cat_input, split_size in zip(cat_inputs, split_sizes): + # each cat input tensor's size along dim + # should match the corresponding split size + if "val" not in cat_input.meta: + return False + cat_input_size = cat_input.meta["val"].size(dim) + if cat_input_size != split_size: + return False + + return True + + +@register_lowering_pattern( + CallFunction( + aten.split_with_sizes, + CallFunction( + aten.cat, + KeywordArg("input_"), + Ignored(), + _users=MULTIPLE, + ), + Ignored(), + Ignored(), + ), + pass_number=2, + extra_check=is_valid_cat_splitwithsizes, +) +def cat_splitwithsizes_replace(match, input_): + return input_ + + +def view_to_reshape(gm): + """ + Replace view ops in the GraphModule to reshape ops. 
+ """ + for nd in gm.graph.nodes: + if nd.target == torch.ops.aten.view.default: + nd.target = torch.ops.aten.reshape.default + + +def should_prefer_unfused_addmm(match): + inp = match.kwargs["inp"] + if not inp.meta["val"].is_cuda: + return False + + output = match.output_node() + return all(is_pointwise_use(use) for use in output.users) + + +@register_graph_pattern( + CallFunction(aten.addmm, KeywordArg("inp"), Arg(), Arg()), + pass_dict=pass_patterns[2], + extra_check=should_prefer_unfused_addmm, +) +def unfuse_bias_add_to_pointwise(match: Match, mat1, mat2, *, inp): + def repl(inp, x1, x2): + return x1 @ x2 + inp + + with V.fake_mode: + match.replace_by_example(repl, [inp, mat1, mat2]) + + +def is_valid_addmm_fusion(match): + mat1, mat2 = match.args + inp = match.kwargs["inp"] + + if not ( + isinstance(inp, torch.fx.Node) and isinstance(inp.meta["val"], torch.Tensor) + ): + return False # Input is a number + + in_shape = inp.meta["val"].shape + mm_shape = mat1.meta["val"].shape[0], mat2.meta["val"].shape[1] + matched = is_expandable_to(in_shape, mm_shape) + if not matched: + return False # Shape mismatch + + return not should_prefer_unfused_addmm(match) + + +@register_graph_pattern( + CallFunction( + aten.add, + CallFunction(aten.mm, Arg(), Arg()), + KeywordArg("inp"), + ), + pass_dict=pass_patterns[2], + extra_check=is_valid_addmm_fusion, +) +@register_graph_pattern( + CallFunction( + aten.add, + KeywordArg("inp"), + CallFunction(aten.mm, Arg(), Arg()), + ), + pass_dict=pass_patterns[2], + extra_check=is_valid_addmm_fusion, +) +def addmm(match, mat1, mat2, *, inp): + def repl(inp, mat1, mat2): + return aten.addmm(inp, mat1, mat2) + + with V.fake_mode: + match.replace_by_example(repl, [inp, mat1, mat2]) + + +def check_shape_cuda_and_fused_int_mm_mul_enabled(match): + return ( + config.force_fuse_int_mm_with_mul + and len(getattr(match.args[2].meta.get("val"), "shape", [])) == 2 + and getattr(match.args[2].meta.get("val"), "is_cuda", False) + ) + + +@register_lowering_pattern( + CallFunction( + prims.convert_element_type.default, + CallFunction( + aten.mul, + CallFunction( + aten._int_mm, + Arg(), + Arg(), + ), + Arg(), + ), + Arg(), + ), + check_shape_cuda_and_fused_int_mm_mul_enabled, +) +@register_lowering_pattern( + CallFunction( + aten.mul, + CallFunction( + aten._int_mm, + Arg(), + Arg(), + ), + Arg(), + ), + check_shape_cuda_and_fused_int_mm_mul_enabled, +) +def fused_int_mm_mul(match: Match, mat1, mat2, mat3, out_dtype=None): + return inductor.kernel.mm.tuned_fused_int_mm_mul(mat1, mat2, mat3, out_dtype) + + +class ConstructorMoverPass: + def __init__(self, target: str, allow_outputs: bool = False) -> None: + """ + Move constructors from cpu to the target_device. + + Sweeps through the module, looking for constructor nodes that can be moved + to the target_device. + + A constructor node can be moved to the target_device iff all of its users + can also be moved (tested by cannot_be_moved). Otherwise, all dependent + constructor nodes won't be moved. + + - target: target device type + - allow_outputs: allow outputs to be moved + """ + + self.target = target + self.allow_outputs = allow_outputs + + assert isinstance(target, str), ( + "target should be a string representing the device type. " + f"Got: {type(target).__name__}" + ) + + def allow_cpu_device(self, node: fx.Node) -> bool: + """ + Returns whether a node that returns a tensor on the target device may have + cpu tensors as input. 
+ """ + return node.target in ( + torch.ops.aten.index.Tensor, + torch.ops.aten.index_put.default, + torch.ops.aten.index_put_.default, + torch.ops.aten.copy.default, + torch.ops.aten.copy_.default, + torch.ops.aten.slice_scatter.default, + ) + + def cannot_be_moved(self, node: fx.Node) -> bool: + """ + Returns whether a node can be moved to the target device. + + If this function returns False, it means that this node and all of its users + won't be moved into the target device. + """ + if node.target == "output": + return not self.allow_outputs + + if not ( + isinstance(node.target, torch._ops.OpOverload) + and node.target.namespace in ("prims", "aten") + ): + return True + + return False + + def get_node_device(self, node: fx.Node) -> Optional[torch.device]: + """ + Get the device of a node. + """ + ten = node.meta.get("val") + return None if not isinstance(ten, torch.Tensor) else ten.device + + def get_cpu_indeg_count(self, graph: fx.Graph) -> Dict[fx.Node, int]: + """ + Get the number of cpu inputs to a node + """ + cpu_indeg: Dict[fx.Node, int] = Counter() + + for node in graph.nodes: + cpu_count = 0 + + def add_cpu_inp(node): + nonlocal cpu_count + device = self.get_node_device(node) + cpu_count += device is not None and device.type == "cpu" + + pytree.tree_map_only(fx.Node, add_cpu_inp, (node.args, node.kwargs)) + + if cpu_count: + cpu_indeg[node] = cpu_count + + return cpu_indeg + + def __call__(self, graph: fx.Graph) -> None: + target_devices = set() + constructors = [] + + for node in graph.nodes: + device = self.get_node_device(node) + if device and device.type == self.target: + target_devices.add(device) + + if not ( + isinstance(node.target, torch._ops.OpOverload) + and node.target.namespace in ("prims", "aten") + ): + continue + + if not torch._subclasses.fake_tensor._is_tensor_constructor(node.target): + continue + + if not node.kwargs.get("device") == torch.device("cpu"): + continue + + constructors.append(node) + + # not handling multiple target devices initially + if not constructors or len(target_devices) != 1: + return + + movable_constructors = self.find_movable_constructors(graph, constructors) + + for node in movable_constructors: + kwargs = node.kwargs.copy() + kwargs["device"] = next(iter(target_devices)) + node.kwargs = kwargs + + def find_movable_constructors( + self, graph: fx.Graph, constructors: List[fx.Node] + ) -> Set[fx.Node]: + """ + Starting from the cpu constructors, iterate through the graph and test that all of their + downstream uses can safely be moved to cpu. + """ + cpu_indeg: Dict[fx.Node, int] = self.get_cpu_indeg_count(graph) + + # which constructors cannot be moved to cuda + cannot_move_to_cuda: Set[fx.Node] = set() + + # For any node in the graph, which constructors does it have a dependency on + constructor_dependencies: Dict[fx.Node, Set[fx.Node]] = defaultdict(set) + + # if a cpu node has a dependency on two different cpu constructors, + # then if either constructor cannot be moved to cuda, the other cannot as well. 
+ # In this case any node with a dependency on one will have a dependency on the other + equal_constructor_sets: Dict[fx.Node, Set[fx.Node]] = { + c: {c} for c in constructors + } + + def make_dependencies_equivalent( + set1: Set[fx.Node], set2: Set[fx.Node] + ) -> Set[fx.Node]: + # could use union find but not worth complexity here + set1.update(set2) + for obj in set1: + equal_constructor_sets[obj] = set1 + return set1 + + queue: List[fx.Node] = list(constructors) + + for c in queue: + constructor_dependencies[c].add(c) + + while queue: + node = queue.pop() + dependencies = constructor_dependencies[node] + + for user in node.users: + if self.cannot_be_moved(user): + cannot_move_to_cuda.update(dependencies) + break + + # this node was used on a op which takes in multiple devices and output a cuda + # tensor. we can convert its cpu input to cuda without making further changes + node_device = self.get_node_device(user) + if ( + self.allow_cpu_device(user) + and node_device + and node_device.type == self.target + ): + del cpu_indeg[user] + else: + # otherwise, we should continue look at its downstream uses + cpu_indeg[user] -= 1 + if cpu_indeg[user] == 0: + del cpu_indeg[user] + queue.append(user) + + unioned_set = make_dependencies_equivalent( + dependencies, constructor_dependencies[user] + ) + constructor_dependencies[user] = unioned_set + + for node in cpu_indeg: + if constructor_dependencies[node]: + cannot_move_to_cuda.update(constructor_dependencies[node]) + + all_cannot_move_to_cuda = cannot_move_to_cuda.copy() + for constructor in cannot_move_to_cuda: + all_cannot_move_to_cuda.update(equal_constructor_sets[constructor]) + + return set(constructors) - all_cannot_move_to_cuda + + +def move_constructors_to_cuda(graph: fx.Graph) -> None: + """ + Moves intermediary tensors which are constructed on the cpu to cuda when safe + """ + ConstructorMoverPass("cuda")(graph) diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/pre_grad.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/pre_grad.py new file mode 100644 index 0000000000000000000000000000000000000000..bed3b3229003399fdcfd979bd7e0ce79ef3786fe --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/pre_grad.py @@ -0,0 +1,611 @@ +import copy +import logging +from typing import List, Optional + +import torch +import torch.nn as nn +from torch._dynamo.utils import counters, detect_fake_mode, optimus_scuba_log +from torch._utils_internal import upload_graph +from torch.fx.experimental.optimization import ( + matches_module_pattern, + replace_node_module, +) +from torch.fx.passes.shape_prop import ShapeProp +from torch.nn import functional as F +from torch.nn.utils.fusion import fuse_conv_bn_eval, fuse_conv_bn_weights + +from .. 
import config + +from ..fx_utils import matches_module_function_pattern +from ..pattern_matcher import ( + init_once_fakemode, + PatternMatcherPass, + stable_topological_sort, +) +from ..utils import is_cpu_device, pass_execution_and_save +from .group_batch_fusion import group_batch_fusion_passes +from .misc_patterns import numpy_compat_normalization + +log = logging.getLogger(__name__) + +normalization_pass = PatternMatcherPass( + prevent_match_across_mutations=True, pass_name="normalization_pass" +) +merge_splits_pass = PatternMatcherPass( + prevent_match_across_mutations=True, pass_name="merge_splits_pass" +) +split_cat_pass = PatternMatcherPass( + prevent_match_across_mutations=True, pass_name="split_cat_pass" +) +unbind_stack_pass = PatternMatcherPass( + prevent_match_across_mutations=True, pass_name="unbind_stack_pass" +) +efficient_conv_bn_eval_pass = PatternMatcherPass( + prevent_match_across_mutations=True, pass_name="efficient_conv_bn_eval_pass" +) +merge_getitem_cat_pass = PatternMatcherPass( + prevent_match_across_mutations=True, pass_name="merge_getitem_cat_pass" +) + +fuse_split_linear_add_pass = PatternMatcherPass( + prevent_match_across_mutations=True, + pass_name="fuse_split_linear_add_pass", +) +fuse_chunk_squeeze_cat_pass = PatternMatcherPass( + prevent_match_across_mutations=True, + pass_name="fuse_chunk_squeeze_cat_pass", +) +remove_reshape_pass = PatternMatcherPass( + prevent_match_across_mutations=True, + pass_name="remove_reshape_pass", +) + +# based on predispatch aten IR +normalization_pass_aten = PatternMatcherPass(prevent_match_across_mutations=True) +merge_splits_pass_aten = PatternMatcherPass(prevent_match_across_mutations=True) +split_cat_pass_aten = PatternMatcherPass(prevent_match_across_mutations=True) +unbind_stack_pass_aten = PatternMatcherPass(prevent_match_across_mutations=True) +merge_getitem_cat_pass_aten = PatternMatcherPass(prevent_match_across_mutations=True) + + +def fuse_parallel_linear_pass(graph): + return None + + +def remove_split_ops(graph, shape_prop): + return None + + +pattern_matcher_passes: List[PatternMatcherPass] = [ + normalization_pass, + merge_getitem_cat_pass, + merge_splits_pass, + split_cat_pass, + unbind_stack_pass, + efficient_conv_bn_eval_pass, +] +pattern_matcher_passes_aten: List[PatternMatcherPass] = [ + merge_getitem_cat_pass_aten, + merge_splits_pass_aten, + split_cat_pass_aten, + unbind_stack_pass_aten, +] + + +@init_once_fakemode +def lazy_init(): + from . import efficient_conv_bn_eval, split_cat # noqa: F401 # noqa: F401 + + if config.is_fbcode(): + from . import fb # type: ignore[attr-defined] # noqa: F401 + + +def pre_grad_passes(gm: torch.fx.GraphModule, example_inputs=None): + """ + Apply passes on the input FX graph using Torch IR. + + WARNING: + The IR before grad is not functional or normalized, so it is harder + to write passes on this IR. Passes must be safe with respect to + aliasing and mutation and need to handle all possible arg schemas. + + Consider adding a new pass to post_grad.py or joint_graph.py which + are after functionalization and normalization. 
+ """ + if config.pattern_matcher: + lazy_init() + if hasattr( + config, "fx_passes_numeric_check" + ) and config.fx_passes_numeric_check.get("pre_grad", False): + gm_before_fx_passes = gm.__copy__() + # explicitly run with predispatch atenIR based passes + if config.is_predispatch: + + def shape_prop(mod) -> None: + ShapeProp( + gm=mod, + fake_mode=detect_fake_mode(example_inputs), + ).propagate(*example_inputs) + + # normalization pass + pass_execution_and_save( + normalization_pass_aten.apply, + gm, + "[Pre grad(predispatch IR)]Apply normalization pass", + ) + pass_execution_and_save( + group_batch_fusion_passes, + gm, + "[Pre grad(predispatch IR)] Apply group_batch_fusion", + ) + pass_execution_and_save( + fuse_chunk_squeeze_cat_pass.apply, + gm, + "[Pre grad(predispatch IR)] Apply fuse_chunk_squeeze_cat_pass", + ) + pass_execution_and_save( + fuse_split_linear_add_pass.apply, + gm, + "[Pre grad(predispatch IR)] Apply fuse_split_linear_add_pass", + ) + + log.debug( + "[Pre grad(predispatch IR)]Before split cat in pre grad pass. graph: %s", + gm.graph, + ) + for ind, pattern_matcher_pass_aten in enumerate( + pattern_matcher_passes_aten + ): + pass_execution_and_save( + pattern_matcher_pass_aten.apply, + gm, + f"[Pre grad(predispatch IR)]Apply split_cat, index: {ind}", + ) + pass_execution_and_save( + remove_reshape_pass.apply, + gm, + "[Pre grad(predispatch IR)] Apply remove_reshape_pass", + ) + pass_execution_and_save( + fuse_parallel_linear_pass, + gm, + "[Pre grad(predispatch IR)] Apply fuse_parallel_linear_pass", + ) + pass_execution_and_save( + lambda graph: remove_split_ops(graph.owning_module, shape_prop), + gm, + "[Pre grad(predispatch IR)] Apply remove_split_ops", + ) + shape_prop(gm) + + else: + # We only log the graph with changes to avoid the excessive compilation time + # https://fb.workplace.com/groups/257735836456307/permalink/633533465543207/ + if example_inputs is not None: + gm = fuse_fx(gm, example_inputs) + numpy_compat_normalization(gm.graph) + inductor_before_change = copy.deepcopy(counters["inductor"]) + group_batch_fusion_passes(gm.graph, pre_grad=True) + if counters["inductor"] != inductor_before_change: + optimus_scuba_log["group_batch_fusion_pre_grad"] = upload_graph( + gm.graph + ) + for pattern_matcher_pass in pattern_matcher_passes: + inductor_before_change = copy.deepcopy(counters["inductor"]) + pattern_matcher_pass.apply(gm.graph) # type: ignore[arg-type] + if counters["inductor"] != inductor_before_change: + optimus_scuba_log[ + f"split_cat_pattern_{pattern_matcher_pass.pass_name}_pre_grad" + ] = upload_graph(gm.graph) + + if config.pre_grad_custom_pass is not None: + config.pre_grad_custom_pass(gm.graph) + stable_topological_sort(gm.graph) + gm.graph.lint() + gm.recompile() + + if ( + config.pattern_matcher + and hasattr(config, "fx_passes_numeric_check") + and config.fx_passes_numeric_check.get("pre_grad", False) + and example_inputs is not None + ): + from .numeric_utils import numeric_check_if_enabled + + gm_after_fx_passes = gm.__copy__() + numeric_check_if_enabled( + gm_before_fx_passes, # type: ignore[possibly-undefined] + gm_after_fx_passes, + example_inputs, + config.fx_passes_numeric_check.get("num_iterations", 1), + config.fx_passes_numeric_check.get("precision", 1e-4), + ) + + return gm + + +def fuse_fx(gm: torch.fx.GraphModule, example_inputs) -> torch.fx.GraphModule: + is_cpu = is_cpu_device(example_inputs) + + fake_mode = detect_fake_mode(example_inputs) + + gm = sink_cat_after_pointwise(gm) + if config.permute_fusion and not is_cpu: + # 
For linear permute fusion, we need to check input info to identify + # and perform proper permutation/transpose + ShapeProp(gm, fake_mode=fake_mode).propagate(*example_inputs) + gm = linear_permute_fusion(gm) + gm = permute_linear_fusion(gm) + gm = permute_matmul_fusion(gm) + + # make sure the autograd is disabled. + if torch.is_grad_enabled() or not is_cpu: + return gm + if config.freezing: + gm = remove_identity(gm) + gm = fuse_conv_bn(gm) + return gm + + +def fetch_attr(target: str, mod): + target_atoms = target.split(".") + attr_itr = mod + for i, atom in enumerate(target_atoms): + if not hasattr(attr_itr, atom): + raise RuntimeError( + f"Node referenced nonexistant target {'.'.join(target_atoms[:i])}" + ) + attr_itr = getattr(attr_itr, atom) + return attr_itr + + +def remove_identity(gm: torch.fx.GraphModule) -> torch.fx.GraphModule: + """ + Removes all identity layers from the module. + """ + + class IdentityRemover(torch.fx.Transformer): + def call_module(self, target, args, kwargs): + if isinstance(self.submodules[target], nn.Identity): + assert len(args) == 1 + return args[0] + else: + return super().call_module(target, args, kwargs) + + return IdentityRemover(gm).transform() + + +def fuse_conv_bn(gm: torch.fx.GraphModule, inplace=False) -> torch.fx.GraphModule: + """ + Fuses Convolution/BN layers for inference purposes. + """ + modules_patterns = [ + (torch.nn.Conv1d, torch.nn.BatchNorm1d), + (torch.nn.Conv2d, torch.nn.BatchNorm2d), + (torch.nn.Conv3d, torch.nn.BatchNorm3d), + ] + module_function_patterns = [ + (torch.nn.Conv1d, F.batch_norm), + (torch.nn.Conv2d, F.batch_norm), + (torch.nn.Conv3d, F.batch_norm), + ] + modules = dict(gm.named_modules()) + for pattern in modules_patterns: + for node in gm.graph.nodes: + if matches_module_pattern(pattern, node, modules): + if len(node.args[0].users) > 1: # Output of conv is used by other nodes + continue + conv = modules[node.args[0].target] + bn = modules[node.target] + eval_mode = all(not n.training for n in [conv, bn]) + if not eval_mode: + continue + if not bn.track_running_stats: + continue + fused_conv = fuse_conv_bn_eval(conv, bn) + replace_node_module(node.args[0], modules, fused_conv) + node.replace_all_uses_with(node.args[0]) + gm.graph.erase_node(node) + gm.graph.lint() + for pattern in module_function_patterns: + for node in gm.graph.nodes: + if matches_module_function_pattern(pattern, node, modules): + # TODO: support kwargs. 
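+ # F.batch_norm(input, running_mean, running_var, weight, bias, training,
+ # momentum, eps) takes 8 positional arguments; the code below reads
+ # training from args[5] and eps from args[7], so calls using kwargs are skipped.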
+ if len(node.args) != 8: + continue + conv = modules[node.args[0].target] + bn_training = node.args[5] + bn_eps = node.args[7] + if conv.training or bn_training: + continue + if type(bn_eps) is not float: + continue + bn_args_is_constant = all( + n.op == "get_attr" and len(n.users) == 1 for n in node.args[1:5] + ) + if not bn_args_is_constant: + continue + bn_running_mean = fetch_attr(node.args[1].target, gm) + bn_running_var = fetch_attr(node.args[2].target, gm) + bn_weight = fetch_attr(node.args[3].target, gm) + bn_bias = fetch_attr(node.args[4].target, gm) + if bn_running_mean is None or bn_running_var is None: + continue + fused_conv = copy.deepcopy(conv) + fused_conv.weight, fused_conv.bias = fuse_conv_bn_weights( + fused_conv.weight, + fused_conv.bias, + bn_running_mean, + bn_running_var, + bn_eps, + bn_weight, + bn_bias, + ) + replace_node_module(node.args[0], modules, fused_conv) + node.replace_all_uses_with(node.args[0]) + gm.graph.erase_node(node) + gm.graph.lint() + gm.recompile() + + return gm + + +class NormalizedLinearNode: + def __init__(self, node: torch.fx.Node) -> None: + assert node.op == "call_function" + assert node.target in [torch.nn.functional.linear] + self.node: torch.fx.Node = node + + def get_input(self) -> torch.fx.Node: + if len(self.node.args) > 0: + return self.node.args[0] # type: ignore[return-value] + else: + return self.node.kwargs["input"] # type: ignore[return-value] + + def get_weight(self) -> torch.fx.Node: + if len(self.node.args) > 1: + return self.node.args[1] # type: ignore[return-value] + else: + return self.node.kwargs["weight"] # type: ignore[return-value] + + def get_bias(self) -> torch.fx.Node: + if len(self.node.args) > 2: + return self.node.args[2] # type: ignore[return-value] + else: + return self.node.kwargs["bias"] if "bias" in self.node.kwargs else None # type: ignore[return-value] + + +class NormalizedMatmulNode: + def __init__(self, node: torch.fx.Node) -> None: + assert node.op == "call_function" + assert node.target in [torch.bmm, torch.matmul] + self.node: torch.fx.Node = node + + def get_input(self) -> torch.fx.Node: + if len(self.node.args) > 0: + return self.node.args[0] # type: ignore[return-value] + else: + return self.node.kwargs["input"] # type: ignore[return-value] + + def get_other(self) -> torch.fx.Node: + if len(self.node.args) > 1: + return self.node.args[1] # type: ignore[return-value] + else: + return self.node.kwargs["other"] # type: ignore[return-value] + + +def check_permute(node: torch.fx.Node) -> bool: + ranks = len(node.meta["tensor_meta"].shape) + if len(node.args) > 3: + permutation = [node.args[i] % ranks for i in range(1, ranks + 1)] # type: ignore[operator] + elif ( + "permutation" in node.kwargs + and node.kwargs["permutation"] is not None + and len(node.kwargs["permutation"]) > 2 # type: ignore[arg-type] + ): + permutation = [i % ranks for i in node.kwargs["permutation"]] # type: ignore[union-attr] + else: + return False + allowed_permutation = list(range(ranks)) + allowed_permutation[-1] = ranks - 2 + allowed_permutation[-2] = ranks - 1 + return permutation == allowed_permutation + + +def sink_cat_after_pointwise(module: torch.fx.GraphModule) -> torch.fx.GraphModule: + def one_user(node): + users = list(node.users) + return users[0] if len(users) == 1 else None + + def is_view(node): + view = {"view"} + return node.op == "call_method" and node.target in view + + def is_pointwise_unary(node): + pointwise = {torch.relu, torch.tanh, "relu", "tanh"} + return node.op in {"call_function", "call_method"} and 
node.target in pointwise + + g = module.graph + for node in g.nodes: + if node.op != "call_function" or node.target != torch.cat: + continue + + cat_or_view = node + while True: + user = one_user(cat_or_view) + if not user or not is_view(user): + break + cat_or_view = user + + if user and is_pointwise_unary(user): + with g.inserting_before(node): + + def cat_args(tensors, dim=0): + return tensors, dim + + tensors, dim = cat_args(*node.args, **node.kwargs) + new_tensors = [ + g.create_node(user.op, user.target, args=(arg,), kwargs=user.kwargs) + for arg in tensors + ] + new_cat = g.create_node( + "call_function", torch.cat, args=(new_tensors, dim) + ) + user.replace_all_uses_with(cat_or_view) + node.replace_all_uses_with(new_cat) + g.erase_node(user) + g.erase_node(node) + g.lint() + module.recompile() + return module + + +def linear_permute_fusion(module: torch.fx.GraphModule) -> torch.fx.GraphModule: + for node in module.graph.nodes: + if ( + node.op == "call_method" + and node.target == "permute" + and check_permute(node) + ): + if len(node.args) > 0: + input_node = node.args[0] + else: + input_node = node.kwargs["input"] + if ( + input_node.op == "call_function" + and input_node.target == torch.nn.functional.linear + ): + normalized = NormalizedLinearNode(input_node) + input = normalized.get_input() + weight = normalized.get_weight() + bias = normalized.get_bias() + with module.graph.inserting_before(node): + fused_node = module.graph.call_function( + linear_transpose, args=(input, weight, bias) + ) + node.replace_all_uses_with(fused_node) + module.graph.erase_node(node) + if len(input_node.users) == 0: + module.graph.erase_node(input_node) + + module.graph.lint() + module.recompile() + return module + + +# Y1 = X * W^T + bias +# Y2 = Y1.permute(0, 2, 1) +# ----> +# Y2 = (W * X^T + bias.unsqueeze(-1))^T +def linear_transpose( + input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] +) -> torch.Tensor: + if bias is None: + return torch.matmul(weight, input.transpose(-1, -2)) + return torch.matmul(weight, input.transpose(-1, -2)) + bias.unsqueeze(-1) + + +def permute_linear_fusion(module: torch.fx.GraphModule) -> torch.fx.GraphModule: + for node in module.graph.nodes: + if node.op == "call_function" and node.target == torch.nn.functional.linear: + if len(node.args) > 0: + input_node = node.args[0] + else: + input_node = node.kwargs["input"] + if ( + input_node.op == "call_method" + and input_node.target == "permute" + and check_permute(input_node) + ): + normalized = NormalizedLinearNode(node) + if len(input_node.args) > 0: + input = input_node.args[0] + else: + input = input_node.kwargs["input"] + weight = normalized.get_weight() + bias = normalized.get_bias() + with module.graph.inserting_before(node): + fused_node = module.graph.call_function( + transpose_linear, args=(input, weight, bias) + ) + node.replace_all_uses_with(fused_node) + module.graph.erase_node(node) + if len(input_node.users) == 0: + module.graph.erase_node(input_node) + + module.graph.lint() + module.recompile() + return module + + +def permute_matmul_fusion(module: torch.fx.GraphModule) -> torch.fx.GraphModule: + for node in module.graph.nodes: + if node.op == "call_function" and ( + node.target == torch.bmm or node.target == torch.matmul + ): + normalized = NormalizedMatmulNode(node) + input_A_node = normalized.get_input() + input_B_node = normalized.get_other() + input_A = input_A_node + input_B = input_B_node + Atrans = Btrans = False + if ( + input_A_node.op == "call_method" + and 
input_A_node.target == "permute" + and check_permute(input_A_node) + ): + Atrans = True + if len(input_A_node.args) > 0: + input_A = input_A_node.args[0] # type: ignore[assignment] + else: + input_A = input_A_node.kwargs["input"] # type: ignore[assignment] + + if ( + input_B_node.op == "call_method" + and input_B_node.target == "permute" + and check_permute(input_B_node) + ): + Btrans = True + if len(input_B_node.args) > 0: + input_B = input_B_node.args[0] # type: ignore[assignment] + else: + input_B = input_B_node.kwargs["input"] # type: ignore[assignment] + + if Atrans or Btrans: + with module.graph.inserting_before(node): + fused_node = module.graph.call_function( + transpose_matmul, + args=(input_A, input_B, Atrans, Btrans), + ) + node.replace_all_uses_with(fused_node) + module.graph.erase_node(node) + if Atrans and len(input_A_node.users) == 0: + module.graph.erase_node(input_A_node) + if Btrans and len(input_B_node.users) == 0: + module.graph.erase_node(input_B_node) + + module.graph.lint() + module.recompile() + return module + + +# X1 = X.permute(0, 2, 1) +# Y1 = X1 * W1^T + bias1 +# ----> +# Y2 = X1.transpose(-1, -2) * W1^T + bias1 +def transpose_linear( + input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] +) -> torch.Tensor: + if bias is None: + return torch.matmul(input.transpose(-1, -2), weight.t()) + return torch.matmul(input.transpose(-1, -2), weight.t()) + bias + + +def transpose_matmul( + A: torch.Tensor, B: torch.Tensor, Atrans: bool, Btrans: bool +) -> torch.Tensor: + if Atrans: + A = A.transpose(-1, -2) + if Btrans: + B = B.transpose(-1, -2) + return torch.matmul(A, B) diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/quantization.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/quantization.py new file mode 100644 index 0000000000000000000000000000000000000000..4af6622a719245a019f31cb95e14035bc619c68d --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/quantization.py @@ -0,0 +1,1980 @@ +import copy +import functools +import itertools +import math +import operator +from typing import Any, Tuple + +import torch +from torch._dynamo.utils import counters +from torch.fx.experimental.symbolic_shapes import has_free_symbols +from ..lowering import lowerings as L, require_channels_last +from ..pattern_matcher import Arg, CallFunction, filter_nodes, KeywordArg, ListOf, Match +from ..utils import pad_listlike +from .freezing_patterns import register_freezing_graph_pattern +from .post_grad import register_lowering_pattern + +aten = torch.ops.aten +prims = torch.ops.prims +quantized_decomposed = torch.ops.quantized_decomposed +quantized = torch.ops.quantized + +""" +The quantization.py file primarily incorporates passes related to quantization fusion +in inductor, includes: +1. Dequant Promotion; +2. Conv/GEMM weight prepack with oneDNN Library; +3. Conv/GEMM quantization fusion with output quant node (if have); +4. Other pointwise operators' quantization fusion like: qmaxpool2d, qcat and more; + +It also involves int8-mixed-fp32 and int8-mixed-bf16 quantization. The main difference +of patterns for int8-mixed-bf16, comparing with int8-mixed-fp32, is +1. There is to(dtype=torch.bfloat16) node at the inputs of activation and weight for Conv/GEMM. +2. There is to(dtype=torch.float32) node at the outputs of Conv/GEMM before inputs to next quant node. +Refer to: https://github.com/pytorch/pytorch/issues/111640 for detail design of int8-mixed-bf16 +quantization. 
+""" + + +def _may_generate_pattern_with_dtype_convert(pattern, dtype=Arg(), dtype_convert=True): + if dtype_convert: + return CallFunction( + prims.convert_element_type.default, + pattern, + dtype, + ) + else: + return pattern + + +def _may_generate_pattern_with_reshape(pattern, reshape_size=Arg(), with_reshape=True): + if with_reshape: + return CallFunction( + torch.ops.aten.reshape.default, + pattern, + reshape_size, + ) + else: + return pattern + + +def _generate_linear_t_pattern( + _dequant_per_channel_pattern, + dtype, +): + assert dtype in [torch.float32, torch.bfloat16] + t_pattern = CallFunction( + aten.permute.default, + _may_generate_pattern_with_dtype_convert( + _dequant_per_channel_pattern, + KeywordArg("autocast_wgt_dtype"), + dtype == torch.bfloat16, + ), + KeywordArg("permute_axes"), + ) + return t_pattern + + +""" +dequantize activation: + x = x.to(fp32) + x = x - zero_point + x = x * scale +""" +dequantize_per_tensor_activation_pattern = CallFunction( + aten.mul.Tensor, + CallFunction( + aten.sub.Tensor, + CallFunction( + prims.convert_element_type.default, + KeywordArg("x"), + KeywordArg("x_dq_dtype"), + ), + KeywordArg("x_zp"), + ), + KeywordArg("x_scale"), +) + +dequantize_per_channel_weight_pattern = CallFunction( + quantized_decomposed.dequantize_per_channel.default, + KeywordArg("q_weight"), + KeywordArg("w_scale"), + KeywordArg("w_zp"), + KeywordArg("w_axis"), + KeywordArg("w_quant_min"), + KeywordArg("w_quant_max"), + KeywordArg("w_dtype"), +) + +dequantize_per_channel_to_bf16_weight_pattern = ( + _may_generate_pattern_with_dtype_convert( + dequantize_per_channel_weight_pattern, + KeywordArg("autocast_wgt_dtype"), + ) +) + +dequantize_per_channel_clone_weight_pattern = CallFunction( + aten.clone.default, + dequantize_per_channel_weight_pattern, + memory_format=KeywordArg("memory_format"), +) + +dequantize_per_channel_to_bf16_clone_weight_pattern = CallFunction( + aten.clone.default, + dequantize_per_channel_to_bf16_weight_pattern, + memory_format=KeywordArg("memory_format"), +) + + +def get_dequantize_qconv_pt2e_pattern(users=1): + return CallFunction( + torch.ops.onednn.qconv2d_pointwise.default, + KeywordArg("x"), + KeywordArg("x_scale"), # x_scale + KeywordArg("x_zp"), # x_zp + KeywordArg("packed_weight"), # packed_weight + KeywordArg("w_scale"), # w_scale + KeywordArg("w_zp"), # w_zp + KeywordArg("b"), # bias + KeywordArg("stride"), + KeywordArg("padding"), + KeywordArg("dilation"), + KeywordArg("groups"), + KeywordArg("inv_output_scale"), # inv_output_scale = 1.0 + KeywordArg("output_zero_point"), # output_zero_point = 0 + KeywordArg("output_dtype"), # output_dtype = None + KeywordArg("attr"), # attr = "none" + Arg(), # scalars + Arg(), # algorithm + _users=users, + ) + + +def get_qlinear_pt2e_pattern(x_scale_zp_are_tensors): + qlinear_op = ( + torch.ops.onednn.qlinear_pointwise.tensor + if x_scale_zp_are_tensors + else torch.ops.onednn.qlinear_pointwise.default + ) + return CallFunction( + qlinear_op, + KeywordArg("x"), + KeywordArg("x_scale"), + KeywordArg("x_zp"), + KeywordArg("packed_weight"), + KeywordArg("w_scale"), + KeywordArg("w_zp"), + KeywordArg("b"), + KeywordArg("output_scale"), + KeywordArg("output_zero_point"), + KeywordArg("output_dtype"), + KeywordArg("postop_name"), + KeywordArg("postop_args"), + KeywordArg("postop_algorithm"), + ) + + +dequantize_accum_pattern = CallFunction( + aten.mul.Tensor, + CallFunction( + aten.sub.Tensor, + CallFunction( + prims.convert_element_type.default, + KeywordArg("accum"), + KeywordArg("accum_dq_dtype"), + ), 
+ KeywordArg("accum_zp"), + ), + KeywordArg("accum_scale"), +) + + +def generate_pattern_with_binary( + binary_post_op, + computation_call, + extra_input_pattern, + int8_mixed_bf16_with_inplace_add=False, +): + binary_pattern = CallFunction( + binary_post_op, + computation_call, + extra_input_pattern, + ) + return _may_generate_pattern_with_dtype_convert( + binary_pattern, + KeywordArg("convert_dtype_after_inplace_add"), + int8_mixed_bf16_with_inplace_add, + ) + + +def generate_pattern_with_unary(computation_call, unary_post_op): + if unary_post_op is not None: + if unary_post_op == aten.hardtanh.default: + return CallFunction( + aten.clamp_max, + CallFunction(aten.clamp_min, computation_call, KeywordArg("min_value")), + KeywordArg("max_value"), + ) + if unary_post_op == aten.hardswish.default: + return CallFunction( + aten.div, + CallFunction( + aten.mul, + computation_call, + CallFunction( + aten.clamp_max, + CallFunction( + aten.clamp_min, + CallFunction(aten.add, computation_call, 3), + 0, + ), + 6, + ), + ), + 6, + ) + else: + return CallFunction( + unary_post_op, + computation_call, + ) + return computation_call + + +def generate_pattern_with_output_quant(computation_call, dtype=torch.float32): + """ + quantize output: + output = round(output * o_inv_scale) + output = output + zero_point + output = clamp_min(output, 0) + output = clamp_max(output, 127) + output = output.to(uint8) + """ + assert dtype in [torch.float32, torch.bfloat16] + quantized_op_output_pattern_pt2e = CallFunction( + prims.convert_element_type.default, + CallFunction( + aten.clamp_max.default, + CallFunction( + aten.clamp_min.default, + CallFunction( + aten.add.Tensor, + CallFunction( + aten.round.default, + CallFunction( + aten.mul.Tensor, + _may_generate_pattern_with_dtype_convert( + computation_call, + KeywordArg("autocast_output_quant_dtype"), + dtype == torch.bfloat16, + ), + KeywordArg("o_inv_scale"), + ), + ), + KeywordArg("o_zp"), + ), + KeywordArg("o_qmin"), + ), + KeywordArg("o_qmax"), + ), + KeywordArg("o_dtype"), + ) + return quantized_op_output_pattern_pt2e + + +def _check_node_kwarg_arg_value(check_node, kwarg_name, args_index, expected_value): + if kwarg_name in check_node.kwargs: + actual_value = check_node.kwargs[kwarg_name] + return actual_value == expected_value + else: + assert len(check_node.args) >= (args_index + 1) + actual_value = check_node.args[args_index] + return actual_value == expected_value + + +def _is_valid_quantized_conv2d_optimization_pattern(output_dtype): + def fn(match): + if output_dtype is not None: + # Only keep matched pattern with same output_dtype + qconv_node_after_weight_prepack = filter_nodes( + match.nodes, torch.ops.onednn.qconv2d_pointwise + )[0] + return _check_node_kwarg_arg_value( + qconv_node_after_weight_prepack, "output_dtype", 13, output_dtype + ) + return True + + return fn + + +def _register_quantized_conv_lowering( + pattern, + pass_number, + computation_op, + output_dtype, + unary_attr, + original_pattern_output_dtype=torch.float32, +): + @register_lowering_pattern( + pattern, + extra_check=_is_valid_quantized_conv2d_optimization_pattern(output_dtype), + pass_number=pass_number, + ) + def qconv(match: Match, *args, **kwargs): + # Activation QParams + x, x_scale, x_zp = ( + kwargs["x"], + kwargs["x_scale"], + kwargs["x_zp"], + ) + # Weight QParams + packed_weight, w_scale, w_zp = ( + kwargs["packed_weight"], + kwargs["w_scale"], + kwargs["w_zp"], + ) + # Conv Params + b, stride, padding, dilation, groups = ( + kwargs["b"], + kwargs["stride"], + 
kwargs["padding"], + kwargs["dilation"], + kwargs["groups"], + ) + assert output_dtype in [None, torch.float32, torch.bfloat16] + # Output QParams + o_inv_scale = kwargs["o_inv_scale"] if output_dtype is None else 1.0 + o_zero_point = kwargs["o_zp"] if output_dtype is None else 0 + assert ( + kwargs["output_dtype"] is original_pattern_output_dtype + ) # Expected int8-in fp32-out qconv in weight prepack phase + assert ( + kwargs["attr"] == "none" + ) # Expected no post op fused in weight prepack phase + if unary_attr.op_name == "hardtanh": + min_value = kwargs.get("min_value") + max_value = kwargs.get("max_value") + unary_attr.scalars_attr = [min_value, max_value] + + computation_args = ( + x, + x_scale, + x_zp, + packed_weight, + w_scale, + w_zp, + b, + stride, + padding, + dilation, + groups, + o_inv_scale, + o_zero_point, + output_dtype, + unary_attr.op_name, + unary_attr.scalars_attr, + unary_attr.algorithm_attr, + ) + counters["inductor"]["qconv2d_unary_matcher_count"] += 1 + counters["inductor"]["qconv2d_unary_matcher_nodes"] += len(match.nodes) + return L[computation_op](*computation_args) + + return qconv + + +def _is_valid_quantized_linear_optimization_pattern(output_dtype): + def fn(match): + if output_dtype is not None: + # Only keep matched pattern with same output_dtype + qlinear_node_after_weight_prepack = filter_nodes( + match.nodes, torch.ops.onednn.qlinear_pointwise + )[0] + return _check_node_kwarg_arg_value( + qlinear_node_after_weight_prepack, "output_dtype", 9, output_dtype + ) + return True + + return fn + + +def _register_quantized_linear_lowering( + pattern, + pass_number, + computation_op, + output_dtype, + unary_attr, + original_pattern_output_dtype=torch.float32, +): + @register_lowering_pattern( + pattern, + extra_check=_is_valid_quantized_linear_optimization_pattern(output_dtype), + pass_number=pass_number, + ) + def qlinear(match: Match, *args, **kwargs): + # Activation QParams + x, x_scale, x_zp = ( + kwargs["x"], + kwargs["x_scale"], + kwargs["x_zp"], + ) + # Weight QParams + packed_weight, w_scale, w_zp = ( + kwargs["packed_weight"], + kwargs["w_scale"], + kwargs["w_zp"], + ) + + # bias + b = kwargs["b"] if "b" in kwargs else None + + # Output QParams + o_inv_scale = kwargs["o_inv_scale"] if output_dtype is None else 1.0 + o_zero_point = kwargs["o_zp"] if output_dtype is None else 0 + assert ( + kwargs["output_dtype"] is original_pattern_output_dtype + ) # Expected int8-in fp32/bf16-out qlinear in weight prepack phase + assert ( + kwargs["postop_name"] == "none" + ) # Expected no post op fused in weight prepack phase + + computation_args = ( + x, + x_scale, + x_zp, + packed_weight, + w_scale, + w_zp, + b, + o_inv_scale, + o_zero_point, + output_dtype, + unary_attr.op_name, + unary_attr.scalars_attr, + unary_attr.algorithm_attr, + ) + counters["inductor"]["qlinear_unary_matcher_count"] += 1 + counters["inductor"]["qlinear_unary_matcher_nodes"] += len(match.nodes) + return L[computation_op](*computation_args) + + return qlinear + + +def _is_valid_quantized_conv_binary_optimization_pattern(output_dtype): + # Check if it's a valid Conv Binary Pattern: + # * qconv2d_pointwise should only has one users + # * Extra input of binary node comes from dequant pattern + # * the two inputs of binary node should have attribute "meta" and should be tensors + # * the two inputs of binary node should have the same shape + # * All users of the extra input in this pattern should be + # ancestor nodes of the compute node, except for the binary node + # connected to the compute 
node. + def fn(match): + compute_node = filter_nodes(match.nodes, torch.ops.onednn.qconv2d_pointwise)[0] + # qconv2d_pointwise should only have one user + if len(compute_node.users) != 1: + return False + binary_node_inputs = next(iter(compute_node.users)).args + assert len(binary_node_inputs) == 2, "Expects binary node with 2 inputs" + if output_dtype is not None: + extra_input_of_binary_node = None + for arg in binary_node_inputs: + if arg != compute_node: + extra_input_of_binary_node = arg + break + assert extra_input_of_binary_node is not None + # Extra input of binary node comes from dequant pattern + if (not isinstance(extra_input_of_binary_node, torch.fx.Node)) or ( + extra_input_of_binary_node.target != aten.mul.Tensor + ): + return False + + # the two inputs of binary node should have attribute "meta" and should be tensors + if not ( + hasattr(binary_node_inputs[0], "meta") + and isinstance(binary_node_inputs[0].meta.get("val", None), torch.Tensor) # type: ignore[union-attr] + ) or not ( + hasattr(binary_node_inputs[1], "meta") + and isinstance(binary_node_inputs[1].meta.get("val", None), torch.Tensor) # type: ignore[union-attr] + ): + return False + # the two inputs of binary node should have the same shape + if ( + binary_node_inputs[0].meta["val"].size() # type: ignore[union-attr] + != binary_node_inputs[1].meta["val"].size() # type: ignore[union-attr] + ): + return False + + # All users of the extra input in this pattern should be + # ancestor nodes of the compute node, except for the binary node + # connected to the compute node. + + from .mkldnn_fusion import _get_remaining_users + + extra_input_of_pattern = ( + match.kwargs["accum"] + if output_dtype is None + else match.kwargs["accum_after_dequant"] + ) + if ( + len( + _get_remaining_users( + extra_input_of_pattern, + compute_node, + ) + ) + > 1 + or extra_input_of_pattern == compute_node.args[0] + ): + return False + return True + + return fn + + +def _register_quantized_conv_binary_lowering( + pattern, + pass_number, + computation_op, + output_dtype, + binary_unary_attr, +): + @register_lowering_pattern( + pattern, + extra_check=_is_valid_quantized_conv_binary_optimization_pattern(output_dtype), + pass_number=pass_number, + ) + def qconv_binary(match: Match, *args, **kwargs): + x, x_scale, x_zp = kwargs["x"], kwargs["x_scale"], kwargs["x_zp"] + accum = ( + kwargs["accum"] if output_dtype is None else kwargs["accum_after_dequant"] + ) + accum_scale = kwargs["accum_scale"] if output_dtype is None else 1.0 + accum_zp = kwargs["accum_zp"] if output_dtype is None else 0 + packed_weight, w_scale, w_zp = ( + kwargs["packed_weight"], + kwargs["w_scale"], + kwargs["w_zp"], + ) + b, stride, padding, dilation, groups = ( + kwargs["b"], + kwargs["stride"], + kwargs["padding"], + kwargs["dilation"], + kwargs["groups"], + ) + # Output QParams + o_inv_scale = kwargs["o_inv_scale"] if output_dtype is None else 1.0 + o_zero_point = kwargs["o_zp"] if output_dtype is None else 0 + + accum.realize() + from .mkldnn_fusion import _can_be_inplace + + assert _can_be_inplace( + accum + ), "QConv Binary Inplace Fusion requires accum is not an alias or mutation." 
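+ # The "sum" binary post op writes the convolution result into accum's
+ # buffer, which is why accum is realized above and must not be an alias
+ # of (or a mutation on) another tensor.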
+ + computation_args = ( + x, + x_scale, + x_zp, + accum, + accum_scale, + accum_zp, + packed_weight, + w_scale, + w_zp, + b, + stride, + padding, + dilation, + groups, + o_inv_scale, + o_zero_point, + output_dtype, + binary_unary_attr.binary_op_name, + binary_unary_attr.alpha, + binary_unary_attr.unary_op_name, + binary_unary_attr.scalars_attr, + binary_unary_attr.algorithm_attr, + ) + counters["inductor"]["qconv2d_binary_matcher_count"] += 1 + counters["inductor"]["qconv2d_binary_matcher_nodes"] += len(match.nodes) + return L[computation_op](*computation_args) + + return qconv_binary + + +def _register_quantization_unary_fusion(): + class UnaryAttr: + def __init__(self, op_name: str, scalars_attr=None, algorithm_attr=None): + self.op_name = op_name + self.scalars_attr = scalars_attr if scalars_attr else [] + self.algorithm_attr = algorithm_attr if algorithm_attr else "" + + for original_pattern_output_dtype in [torch.float32, torch.bfloat16]: + # QConv2d + # Priority 1 to match: QConv2d Unary pattern with int8 output + # If a pattern1 is a sub-set of pattern2, we should try to match pattern2 firstly. + # For example: pattern1 is qconv_fp32 -> relu, pattern2 is qconv_fp32 -> relu -> quant + conv_unary_replace_patterns = { + UnaryAttr("none", [], ""): generate_pattern_with_output_quant( + get_dequantize_qconv_pt2e_pattern(1), + dtype=original_pattern_output_dtype, + ), + UnaryAttr("relu", [], ""): generate_pattern_with_output_quant( + generate_pattern_with_unary( + get_dequantize_qconv_pt2e_pattern(1), aten.relu.default + ), + dtype=original_pattern_output_dtype, + ), + UnaryAttr("hardtanh", [], ""): generate_pattern_with_output_quant( + generate_pattern_with_unary( + get_dequantize_qconv_pt2e_pattern(1), aten.hardtanh.default + ), + dtype=original_pattern_output_dtype, + ), + UnaryAttr("hardswish", [], ""): generate_pattern_with_output_quant( + generate_pattern_with_unary( + get_dequantize_qconv_pt2e_pattern(2), aten.hardswish.default + ), + dtype=original_pattern_output_dtype, + ), + } + + for unary_attr, patterns in conv_unary_replace_patterns.items(): + # Register qconv2d pattern for ExternKernel Lowering + _register_quantized_conv_lowering( + patterns, + 1, # pass_number + torch.ops.onednn.qconv2d_pointwise, # computation_op + None, # output_dtype, None is the default value for int8 output + unary_attr, # unary_attr + original_pattern_output_dtype=original_pattern_output_dtype, + ) + + # Priority 2 to match: QConv2d Unary pattern with fp32/bfloat16 output + conv_unary_replace_float_out_patterns = { + UnaryAttr("relu", [], ""): generate_pattern_with_unary( + get_dequantize_qconv_pt2e_pattern(1), aten.relu.default + ), + UnaryAttr("hardtanh", [], ""): generate_pattern_with_unary( + get_dequantize_qconv_pt2e_pattern(1), aten.hardtanh.default + ), + UnaryAttr("hardswish", [], ""): generate_pattern_with_unary( + get_dequantize_qconv_pt2e_pattern(2), aten.hardswish.default + ), + } + + for unary_attr, patterns in conv_unary_replace_float_out_patterns.items(): + # Register qconv2d pattern for ExternKernel Lowering + _register_quantized_conv_lowering( + patterns, + 2, # pass_number + torch.ops.onednn.qconv2d_pointwise, # computation_op + original_pattern_output_dtype, # output_dtype + unary_attr, # unary_attr + original_pattern_output_dtype=original_pattern_output_dtype, + ) + + # QLinear + for x_scale_zp_are_tensors in (False, True): + qlinear_pattern = get_qlinear_pt2e_pattern(x_scale_zp_are_tensors) + # Priority 1 to match: QLinear Unary pattern with int8 output + 
linear_unary_replace_patterns = { + UnaryAttr("none", [], ""): generate_pattern_with_output_quant( + qlinear_pattern, + dtype=original_pattern_output_dtype, + ), + UnaryAttr("relu", [], ""): generate_pattern_with_output_quant( + generate_pattern_with_unary(qlinear_pattern, aten.relu.default), + dtype=original_pattern_output_dtype, + ), + } + + for unary_attr, patterns in linear_unary_replace_patterns.items(): + _register_quantized_linear_lowering( + patterns, + 1, # pass_number + torch.ops.onednn.qlinear_pointwise, # computation_op + None, # output_dtype + unary_attr, # unary_attr + original_pattern_output_dtype=original_pattern_output_dtype, + ) + + # Priority 2 to match: QLinear Unary pattern with FP32/BF16 output + linear_unary_replace_float_out_patterns = { + UnaryAttr("relu", [], ""): generate_pattern_with_unary( + qlinear_pattern, aten.relu.default + ), + } + + for unary_attr, patterns in linear_unary_replace_float_out_patterns.items(): + _register_quantized_linear_lowering( + patterns, + 2, # pass_number + torch.ops.onednn.qlinear_pointwise, # computation_op + original_pattern_output_dtype, # output_dtype + unary_attr, # unary_attr + original_pattern_output_dtype=original_pattern_output_dtype, + ) + + +def _register_quantization_binary_fusion(): + class BinaryUnaryAttr: + def __init__( + self, + binary_op_name: str, + alpha=None, + unary_op_name: str = "none", + scalars_attr=None, + algorithm_attr=None, + ): + self.binary_op_name = binary_op_name + self.alpha = alpha if alpha else 1.0 + self.unary_op_name = unary_op_name + self.scalars_attr = scalars_attr if scalars_attr else [] + self.algorithm_attr = algorithm_attr if algorithm_attr else "" + + for int8_mixed_bf16_with_inplace_add in [False, True]: + # Priority 1 to match: QConv2d Binary or Binary-Unary pattern with int8 output + binary_replace_patterns = { + BinaryUnaryAttr( + "sum", 1.0, "none", [], "" + ): generate_pattern_with_output_quant( + generate_pattern_with_binary( + aten.add.Tensor, + get_dequantize_qconv_pt2e_pattern(1), + dequantize_accum_pattern, + int8_mixed_bf16_with_inplace_add, + ), + dtype=torch.bfloat16 + if int8_mixed_bf16_with_inplace_add + else torch.float32, + ), + BinaryUnaryAttr( + "sum", 1.0, "relu", [], "" + ): generate_pattern_with_output_quant( + generate_pattern_with_unary( + generate_pattern_with_binary( + aten.add.Tensor, + get_dequantize_qconv_pt2e_pattern(1), + dequantize_accum_pattern, + int8_mixed_bf16_with_inplace_add, + ), + aten.relu.default, + ), + dtype=torch.bfloat16 + if int8_mixed_bf16_with_inplace_add + else torch.float32, + ), + } + + for binary_unary_attr, patterns in binary_replace_patterns.items(): + _register_quantized_conv_binary_lowering( + patterns, + 0, # pass_number + torch.ops.onednn.qconv2d_pointwise.binary, # computation_op + None, # output_dtype + binary_unary_attr, # binary_unary_attr + ) + + # Priority 2 to match: QConv2d Binary-Unary pattern with fp32/bfloat16 output + binary_replace_float_out_patterns = { + BinaryUnaryAttr("sum", 1.0, "relu", [], ""): generate_pattern_with_unary( + generate_pattern_with_binary( + aten.add.Tensor, + get_dequantize_qconv_pt2e_pattern(1), + KeywordArg("accum_after_dequant"), + int8_mixed_bf16_with_inplace_add, + ), + aten.relu.default, + ), + } + + for ( + binary_unary_attr, + patterns, + ) in binary_replace_float_out_patterns.items(): + if int8_mixed_bf16_with_inplace_add: + _register_quantized_conv_binary_lowering( + patterns, + 0, # pass_number + torch.ops.onednn.qconv2d_pointwise.binary, # computation_op + # Note that for 
int8-mixed-bf16 and non-inplace add, because we have + # q-dq inserted at extra input of add, so the non-inplace add has bf16 and fp32 inputs, + # the output dtype will be float32. + # For inplace add, there is a extra to_bf16 node at add output, so the fusion pattern has bfloat16 output. + torch.bfloat16, + binary_unary_attr, # binary_unary_attr + ) + else: + _register_quantized_conv_binary_lowering( + patterns, + 1, # pass_number + torch.ops.onednn.qconv2d_pointwise.binary, # computation_op + torch.float32, + binary_unary_attr, # binary_unary_attr + ) + + # Priority 3: QConv2d Binary pattern with fp32/bfloat16 output + binary_replace_float_out_patterns = { + BinaryUnaryAttr("sum", 1.0, "none", [], ""): generate_pattern_with_binary( + aten.add.Tensor, + get_dequantize_qconv_pt2e_pattern(1), + KeywordArg("accum_after_dequant"), + int8_mixed_bf16_with_inplace_add, + ), + } + + for ( + binary_unary_attr, + patterns, + ) in binary_replace_float_out_patterns.items(): + _register_quantized_conv_binary_lowering( + patterns, + 1 if int8_mixed_bf16_with_inplace_add else 2, # pass_number + torch.ops.onednn.qconv2d_pointwise.binary, # computation_op + # Same output dtype setting as conv-add-relu pattern + torch.bfloat16 if int8_mixed_bf16_with_inplace_add else torch.float32, + binary_unary_attr, # binary_unary_attr + ) + + +def _is_valid_quantized_maxpool2d_optimization_pattern(): + def fn(match): + # Only match the pattern which max_pool2d_with_indices returns value + # instead of indices. + get_item_node = filter_nodes(match.nodes, operator.getitem)[0] + return get_item_node.args[1] == 0 + + return fn + + +def _register_quantized_maxpool2d_lowering( + pattern, + computation_op, +): + @register_lowering_pattern( + pattern, + extra_check=_is_valid_quantized_maxpool2d_optimization_pattern(), + ) + def qmaxpool2d(match: Match, *args, **kwargs): + x = kwargs["x"] + kernel_size = kwargs["kernel_size"] + stride = kwargs["stride"] if ("stride" in kwargs) else None + padding = kwargs["padding"] if ("padding" in kwargs) else 0 + dilation = kwargs["dilation"] if ("dilation" in kwargs) else 1 + ceil_mode = kwargs["ceil_mode"] if ("ceil_mode" in kwargs) else False + + if padding == 0: + padding = [0, 0] + if dilation == 1: + dilation = [1, 1] + if not stride: + stride = kernel_size + kernel_size = pad_listlike(kernel_size, 2) + stride = pad_listlike(stride, 2) + padding = pad_listlike(padding, 2) + dilation = pad_listlike(dilation, 2) + + assert len(kernel_size) == 2 + assert len(stride) == 2 + assert len(padding) == 2 + assert len(dilation) == 2 + + computation_args = ( + x, + kernel_size, + stride, + padding, + dilation, + ceil_mode, + ) + computation_args, _ = require_channels_last(computation_op, *computation_args) + return L[computation_op](*computation_args) + + return qmaxpool2d + + +def _register_quantization_maxpool2d(): + # Currently, the default parameters are not in FX Graph generated by Dynamo export. + # So, if user defines nn.MaxPool2d with different assignment of default parameter, + # it will generate graph with different number of input nodes and hence + # different pattern to be matched. 
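+ # For example, nn.MaxPool2d(3, stride=2) exports with only kernel_size and
+ # stride, while nn.MaxPool2d(3, stride=2, padding=1) also passes padding,
+ # so each argument combination gets its own pattern below.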
+ # Refer to the issue: https://github.com/pytorch/pytorch/issues/105901 + max_pool2d_args_list = [ + [ + KeywordArg("stride"), + ], + [ + KeywordArg("stride"), + KeywordArg("padding"), + ], + [ + KeywordArg("stride"), + KeywordArg("padding"), + KeywordArg("dilation"), + ], + [ + KeywordArg("stride"), + KeywordArg("padding"), + KeywordArg("dilation"), + KeywordArg("ceil_mode"), + ], + ] + + for max_pool2d_args in max_pool2d_args_list: + dequantize_maxpool2d_pattern = CallFunction( + aten.max_pool2d_with_indices.default, + dequantize_per_tensor_activation_pattern, + KeywordArg("kernel_size"), + *max_pool2d_args, + ) + dequantize_maxpool2d_get_item_pattern = CallFunction( + operator.getitem, + dequantize_maxpool2d_pattern, + Arg(), + ) + _register_quantized_maxpool2d_lowering( + generate_pattern_with_output_quant(dequantize_maxpool2d_get_item_pattern), + quantized.max_pool2d.default, + ) + + +def _is_input_output_same_scale_zp(check_node): + def fn(match): + # Ensure all the inputs and output has same scale and zero point + # Step 1: Check inputs/output zero point + sub_nodes = filter_nodes(match.nodes, aten.sub.Tensor) + zero_points = [node.args[1] for node in sub_nodes] + add_nodes = filter_nodes(match.nodes, aten.add.Tensor) + assert len(add_nodes) == 1, "expect only 1 add node at output quant pattern" + zero_points.append(add_nodes[0].args[1]) + if not all(zero_point == zero_points[0] for zero_point in zero_points): + return False + + # Step 2: Check inputs/output scale + mul_nodes = filter_nodes(match.nodes, aten.mul.Tensor) + # We need to find mul node at output since the scale value is reciprocal to input scale. + # Mul node at output should connect to cat node directly. + scales = [ + ( + mul_node.args[1] + if mul_node.args[0].target is check_node # type: ignore[union-attr] + else 1.0 / mul_node.args[1] # type: ignore[operator] + ) + for mul_node in mul_nodes + ] + if not all(math.isclose(scale, scales[0], rel_tol=1e-5) for scale in scales): # type: ignore[arg-type] + return False + + return True + + return fn + + +def _register_quantized_cat_lowering( + pattern, + computation_op, +): + @register_lowering_pattern( + pattern, + extra_check=_is_input_output_same_scale_zp(aten.cat.default), + ) + def qcat(match: Match, inputs, dim, **kwargs): + # inputs is with format: [[x1, x1_dq_dtype, x1_zp, x1_scale], ...] 
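+ # Each entry is the captured argument list of one dequant sub-pattern, so
+ # input[0] below recovers the original uint8 tensor and the per-input
+ # scale/zero_point args are dropped in favor of a single quantized cat.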
+ uint8_inputs = [input[0] for input in inputs] + return L[computation_op](uint8_inputs, dim) + + return qcat + + +_raw_dequantize_per_tensor_activation_pattern = CallFunction( + aten.mul.Tensor, + CallFunction( + aten.sub.Tensor, + CallFunction( + prims.convert_element_type.default, + Arg(), + Arg(), + ), + Arg(), + ), + Arg(), +) + + +def _register_quantization_cat(): + dequantize_cat_pattern = CallFunction( + aten.cat.default, + ListOf(_raw_dequantize_per_tensor_activation_pattern), + KeywordArg("dim"), + ) + _register_quantized_cat_lowering( + generate_pattern_with_output_quant(dequantize_cat_pattern), + aten.cat, + ) + + +def _register_quantized_reshape_lowering( + pattern, + computation_op, +): + @register_lowering_pattern( + pattern, + extra_check=_is_input_output_same_scale_zp(aten.reshape.default), + ) + def qreshape(match: Match, *args, **kwargs): + qx = kwargs["x"] + shape = kwargs["shape"] + counters["inductor"]["qreshape_matcher_count"] += 1 + counters["inductor"]["qreshape_matcher_nodes"] += len(match.nodes) + return L[computation_op](qx, shape) + + return qreshape + + +def _register_quantization_reshape(): + dequantize_reshape_pattern = CallFunction( + torch.ops.aten.reshape.default, + dequantize_per_tensor_activation_pattern, + KeywordArg("shape"), + ) + _register_quantized_reshape_lowering( + generate_pattern_with_output_quant(dequantize_reshape_pattern), + aten.reshape, + ) + + +def _register_quantization_lowerings(): + _register_quantization_unary_fusion() + _register_quantization_binary_fusion() + _register_quantization_maxpool2d() + _register_quantization_cat() + _register_quantization_reshape() + + +def _is_valid_dequant_promotion_pattern(dtype=torch.float32): + def _inner(match): + assert dtype in [torch.float32, torch.bfloat16] + dequant_pattern_end_node = match.output_node() + if dequant_pattern_end_node.target not in [ + aten.mul.Tensor, + prims.convert_element_type.default, + aten.reshape.default, + ]: + return False + + if dequant_pattern_end_node.target is aten.reshape.default: + mul_node = ( + dequant_pattern_end_node.args[0] # pattern: linear <- reshape <- mul + if dtype == torch.float32 + else dequant_pattern_end_node.args[0].args[ + 0 + ] # pattern: linear <- reshape <- to_bf16 <- mul + ) + else: + mul_node = ( + dequant_pattern_end_node # pattern: linear <- mul + if dtype == torch.float32 + else dequant_pattern_end_node.args[ + 0 + ] # pattern: linear <- to_bf16 <- mul + ) + + sub_node = mul_node.args[0] + to_fp32_node = sub_node.args[0] + if ( + mul_node.target is aten.mul.Tensor + and sub_node.target is aten.sub.Tensor + and to_fp32_node.target is prims.convert_element_type.default + and len(list(dequant_pattern_end_node.users)) > 1 + ): + # If dequant pattern has more than 1 users, then do dequant promoted + return True + return False + + return _inner + + +def _register_dequant_promotion_pass(pattern, pass_number, dtype=torch.float32): + @register_freezing_graph_pattern( + pattern, + extra_check=_is_valid_dequant_promotion_pattern(dtype), + pass_number=pass_number, + ) + def dequant_promotion(match: Match, *args, **kwargs): + # Dequant_promotion will transform + # graph 1: + # quant + # + - - - | - - - + + # | dequant | + # | / \ | + # | node1 node2 | + # + - | - - - | - + + # quant quant + # into: + # graph 2: + # quant + # + - - / - \ - - + + # |dequant dequant| + # | | | | + # | node1 node2 | + # + - | - - - | - + + # quant quant + # In graph 1, the dequant node is shared by node1 and node2, + # as a result, neither node1 nor node2 could form an 
int8 + # fusion pattern. + # After this transformation, the graph 2 could hit the int8 + # fusion pattern: dequant-node-quant, respectively for + # node1 and node2. + assert dtype in [torch.float32, torch.bfloat16] + + def clone_to_new_node(graph, source_node, user_node): + # Clone the source_node to a new node + # Replace user_node's input from source_node to new_node + assert ( + source_node.op == "call_function" + ), "clone_to_new_node only support node.op call_function" + with graph.inserting_before(user_node): + new_node = graph.call_function( + source_node.target, + args=source_node.args, + kwargs=source_node.kwargs, + ) + new_node.meta = copy.copy(source_node.meta) + user_node.replace_input_with(source_node, new_node) + return new_node + + # Find the start node and end node of a dequant pattern + # * End node should be the match.output_node() + # * Start node should be the node of dtype convert to float32 + dequant_pattern_end_node = match.output_node() + assert dequant_pattern_end_node.target in [ + aten.mul.Tensor, + prims.convert_element_type.default, + aten.reshape.default, + ] + + # For a dequant pattern, we should expect see the node list as: + # * OPT(aten.reshape.default) + # * OPT(prims.convert_element_type.default) (to_bf16) + # * aten.mul + # * aten.sub + # * prims.convert_element_type.default (to_fp32) + def _find_first_node_in_dequant_pattern(_node): + if ( + _node.target is prims.convert_element_type.default + and _node.args[1] == torch.float32 + ): + # For a dequant pattern, we expect the start node is a to_fp32 node + return _node + else: + assert ( + len(_node.args) >= 1 + ), "In in dequant pattern, each node should have more than 1 arg." + return _find_first_node_in_dequant_pattern(_node.args[0]) + + dequant_pattern_start_node = _find_first_node_in_dequant_pattern( + dequant_pattern_end_node + ) + + # Clone the dequant pattern for each user node + graph = match.graph + user_node_list = list(dequant_pattern_end_node.users) + for user_node in user_node_list[1:]: + _source_node = dequant_pattern_end_node + _user_node = user_node + while _source_node != dequant_pattern_start_node.args[0]: + _user_node = clone_to_new_node(graph, _source_node, _user_node) + _source_node = _source_node.args[0] # type: ignore[assignment] + + counters["inductor"]["dequant_promotion_matcher_count"] += 1 + counters["inductor"]["dequant_promotion_matcher_nodes"] += len(match.nodes) + + +def _is_valid_dequant_conv2d_pattern(dtype): + def _inner(match): + # Here we do some further check to ensure: + # 1. It's a conv2d node with dim of 4, since we only support lowering of conv2d now. + # 2. The dequant pattern has only 1 user of conv2d node. + # If these conditions don't meet, we will not + # insert weight prepack node into the matched pattern. 
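+        # The activation dequant chain inspected below is
+        # to_fp32 (convert_element_type) -> sub (zero point) -> mul (scale),
+        # with an extra to_bf16 node between mul and the convolution when
+        # dtype is torch.bfloat16.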
+ conv_node = match.output_node() + assert conv_node.target is aten.convolution.default + input_meta_value = conv_node.args[0].meta.get("val") + weight_meta_value = conv_node.args[1].meta.get("val") + for meta_value in [input_meta_value, weight_meta_value]: + if ( + meta_value is None + or meta_value.device.type != "cpu" + or meta_value.dim() != 4 + ): + # Only support conv2d now + return False + + assert dtype in [torch.float32, torch.bfloat16] + if dtype == torch.float32: + mul_node = conv_node.args[0] + else: + convert_to_bf16 = conv_node.args[0] + mul_node = convert_to_bf16.args[0] + sub_node = mul_node.args[0] + to_fp32_node = sub_node.args[0] + + assert to_fp32_node.target is prims.convert_element_type.default + assert sub_node.target is aten.sub.Tensor + assert mul_node.target is aten.mul.Tensor + if ( + len(list(to_fp32_node.users)) != 1 + or len(list(sub_node.users)) != 1 + or len(list(mul_node.users)) != 1 + ): + # Ensure the dequant pattern only has 1 user + # since we will delete the dequant pattern here + return False + return True + + return _inner + + +def _register_qconv_weight_prepack_pass(pattern, pass_number, dtype=torch.float32): + @register_freezing_graph_pattern( + pattern, + extra_check=_is_valid_dequant_conv2d_pattern(dtype), + pass_number=pass_number, + ) + def qconv_weight_prepack(match: Match, *args, **kwargs): + """ + Match the pattern: + int8 activation + | + dequant_per_tensor + | + Conv2d <- optional(aten.clone.default) <- dequant_per_channel <- int8_weight + + Insert weight prepack node and change the pattern to: + int8 activation + | + onednn.qconv2d_pointwise <- onednn.qconv_prepack <- int8_weight + """ + assert dtype in [torch.float32, torch.bfloat16] + conv_node = match.output_node() + assert conv_node.target is aten.convolution.default + if dtype == torch.float32: + mul_node = conv_node.args[0] + else: + convert_to_bf16 = conv_node.args[0] + mul_node = convert_to_bf16.args[0] # type: ignore[union-attr] + sub_node = mul_node.args[0] # type: ignore[union-attr] + to_fp32_node = sub_node.args[0] # type: ignore[union-attr] + has_clone_to_channel_last_node_in_pattern = ( + conv_node.args[1].target is aten.clone.default # type: ignore[union-attr] + ) + clone_node = ( + conv_node.args[1] if has_clone_to_channel_last_node_in_pattern else None + ) + + if dtype == torch.float32: + dequant_per_channel = ( + clone_node.args[0] # type: ignore[union-attr] + if has_clone_to_channel_last_node_in_pattern + else conv_node.args[1] + ) + else: + weight_to_bf16_node = ( + clone_node.args[0] # type: ignore[union-attr] + if has_clone_to_channel_last_node_in_pattern + else conv_node.args[1] + ) + dequant_per_channel = weight_to_bf16_node.args[0] # type: ignore[union-attr] + + assert ( + dequant_per_channel.target # type: ignore[union-attr] + is quantized_decomposed.dequantize_per_channel.default + ) + + # Activation QParams + qx, x_zp, x_scale = ( + kwargs["x"], + kwargs["x_zp"], + kwargs["x_scale"], + ) + + # Weight QParams + qw, w_scale, w_zp = ( + kwargs["q_weight"], + kwargs["w_scale"], + kwargs["w_zp"], + ) + + # Conv Params + bias, stride, padding, dilation, groups = ( + kwargs["b"], + kwargs["stride"], + kwargs["padding"], + kwargs["dilation"], + kwargs["groups"], + ) + + x_shape = qx.meta.get("tensor_meta").shape + if has_free_symbols(x_shape): + # For dynamic shape case, we can't get activation shape ahead of runtime. 
+ x_shape = None + graph = match.graph + with graph.inserting_before(conv_node): + # Insert weight prepack node and the QConv node + packed_weight_inputs = ( + qw, + w_scale, + x_scale, + x_zp, + stride, + padding, + dilation, + groups, + x_shape, + ) + packed_weight_op = torch.ops.onednn.qconv_prepack + prepack_weight_node = graph.call_function( + packed_weight_op, args=packed_weight_inputs + ) + + new_args: Tuple[Any, ...] = ( + qx, + x_scale, + x_zp, + prepack_weight_node, + w_scale, + w_zp, + bias, + stride, + padding, + dilation, + groups, + 1.0, # inv_output_scale + 0, # output_zero_point + dtype, # output_dtype + "none", # attr + [], # scalars + "", # algorithm + ) + new_conv_node = graph.call_function( + torch.ops.onednn.qconv2d_pointwise.default, args=new_args + ) + conv_node.replace_all_uses_with(new_conv_node) + new_conv_node.meta.update(conv_node.meta) + + # Erase the original conv node + graph.erase_node(conv_node) + # Erase the dequant pattern + if dtype == torch.bfloat16: + graph.erase_node(convert_to_bf16) # type: ignore[possibly-undefined] + # Erase the dequant pattern + graph.erase_node(mul_node) + graph.erase_node(sub_node) + graph.erase_node(to_fp32_node) + # Erase the dequant per channel pattern + if clone_node is not None: + graph.erase_node(clone_node) + if dtype == torch.bfloat16: + graph.erase_node(weight_to_bf16_node) # type: ignore[possibly-undefined] + graph.erase_node(dequant_per_channel) + counters["inductor"]["qconv2d_weight_prepack_matcher_count"] += 1 + counters["inductor"]["qconv2d_weight_prepack_matcher_nodes"] += len( + match.nodes + ) + + +def _generate_dequant_convolution_node_pattern( + _dequant_per_channel_pattern, dtype=torch.float32 +): + assert dtype in [torch.float32, torch.bfloat16] + dequant_convolution_node_pattern = CallFunction( + aten.convolution.default, + _may_generate_pattern_with_dtype_convert( + dequantize_per_tensor_activation_pattern, + KeywordArg("autocast_act_dtype"), + dtype == torch.bfloat16, + ), + _dequant_per_channel_pattern, + KeywordArg("b"), + KeywordArg("stride"), + KeywordArg("padding"), + KeywordArg("dilation"), + KeywordArg("is_transposed"), + KeywordArg("out_padding"), + KeywordArg("groups"), + ) + return dequant_convolution_node_pattern + + +def _generate_qconv_weight_prepack_patterns(dtype=torch.float32): + assert dtype in [torch.float32, torch.bfloat16] + return ( + _generate_dequant_convolution_node_pattern( + dequantize_per_channel_weight_pattern + if dtype == torch.float32 + else dequantize_per_channel_to_bf16_weight_pattern, + dtype, + ), + # There is another pattern due to the pass of convert_conv_weights_to_channels_last + # https://github.com/pytorch/pytorch/blob/07107919297db3f8ab37f11c12666b6d6d5f692e/torch/_inductor/freezing.py#L338-L362. 
+ # Depend on some heuristics, it may or may not insert to(channel_last) node + # between convolution and dequant_per_channel node + _generate_dequant_convolution_node_pattern( + dequantize_per_channel_clone_weight_pattern + if dtype == torch.float32 + else dequantize_per_channel_to_bf16_clone_weight_pattern, + dtype, + ), + ) + + +def _get_linear_node(match, input_dim_exceeds_two, input_contiguous): + output_reshape_node = None + if input_dim_exceeds_two: + if input_contiguous: + output_reshape_node = match.output_node() + assert output_reshape_node.target is aten.reshape.default + linear_node = output_reshape_node.args[0] + else: + linear_nodes = filter_nodes(match.nodes, aten.bmm.default) + assert len(linear_nodes) == 1 + linear_node = linear_nodes[0] + else: + linear_node = match.output_node() + + assert linear_node.target in ( + aten.addmm.default, + aten.mm.default, + aten.bmm.default, + ) + return linear_node, output_reshape_node + + +def _get_linear_dq_mul_node( + linear_node, input_index, dtype, input_dim_exceeds_two, input_contiguous +): + act_reshape_node = None + activation_to_bf16_node = None + act_expand_node = None + if input_dim_exceeds_two: + if input_contiguous: + act_reshape_node = linear_node.args[input_index] + assert act_reshape_node.target is aten.reshape.default + if dtype == torch.float32: + # pattern: linear -> reshape -> mul + mul_node = act_reshape_node.args[0] + else: + # pattern: linear -> reshape -> to_bf16 -> mul + activation_to_bf16_node = act_reshape_node.args[0] + mul_node = activation_to_bf16_node.args[0] + else: + # bmm pattern decomposed from linear when input dim exceeds 2 and not contiguous + act_expand_node = linear_node.args[input_index] + assert act_expand_node.target is aten.expand.default + if dtype == torch.float32: + mul_node = act_expand_node.args[0] + else: + activation_to_bf16_node = act_expand_node.args[0] + mul_node = activation_to_bf16_node.args[0] + else: + if dtype == torch.float32: + # pattern: linear -> mul + mul_node = linear_node.args[input_index] + else: + # pattern: linear -> to_bf16 -> mul + activation_to_bf16_node = linear_node.args[input_index] + mul_node = activation_to_bf16_node.args[0] + return mul_node, act_reshape_node, activation_to_bf16_node, act_expand_node + + +def _is_valid_dequant_linear_pattern(dtype, input_dim_exceeds_two, input_contiguous): + def _inner(match): + # Check dequant pattern has only 1 user. 
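+        # The dequant chain walked below mirrors the conv case:
+        # to_fp32 (convert_element_type) -> sub -> mul, optionally followed by
+        # to_bf16 and a reshape/expand node depending on dtype and input shape
+        # (see _get_linear_dq_mul_node above).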
+ ( + linear_node, + _, + ) = _get_linear_node(match, input_dim_exceeds_two, input_contiguous) + + input_index = 1 if linear_node.target is aten.addmm.default else 0 + assert dtype in [torch.float32, torch.bfloat16] + + ( + mul_node, + _, + _, + _, + ) = _get_linear_dq_mul_node( + linear_node, input_index, dtype, input_dim_exceeds_two, input_contiguous + ) + + sub_node = mul_node.args[0] + to_fp32_node = sub_node.args[0] + + assert to_fp32_node.target is prims.convert_element_type.default + assert sub_node.target is aten.sub.Tensor + assert mul_node.target is aten.mul.Tensor + if ( + len(list(to_fp32_node.users)) != 1 + or len(list(sub_node.users)) != 1 + or len(list(mul_node.users)) != 1 + ): + # Ensure the dequant pattern only has 1 user + # since we will delete the dequant pattern here + return False + + # Extra check for bmm pattern + if input_dim_exceeds_two and not input_contiguous: + # Check for act + # Act expand size should be exactly same as act size + act_expand_size = match.kwargs["act_expand_size"] + act_node = match.kwargs["x"] + if not ( + hasattr(act_node, "meta") + and isinstance(act_node.meta.get("val", None), torch.Tensor) + and (act_node.meta["val"].size() == torch.Size(act_expand_size)) + ): + return False + + # Check for wgt + # wgt permute dims should be [1, 0] + wgt_permute_dims = match.kwargs["permute_axes"] + if wgt_permute_dims != [1, 0]: + return False + + # Check below wgt size items: + # wgt before expand should with dim 2 + # Expand size should with dim 3 + # Expand size[0] should same as act size[0] + # Expand size[1] should same as wgt size[1] + # Expand size[2] should same as wgt size[0] + qweight_node = match.kwargs["q_weight"] + wgt_expand_size = match.kwargs["wgt_expand_size"] + if not ( + hasattr(qweight_node, "meta") + and isinstance(qweight_node.meta.get("val", None), torch.Tensor) + and len(qweight_node.meta["val"].size()) == 2 + and len(wgt_expand_size) == 3 + and wgt_expand_size[0] == act_node.meta["val"].size()[0] + and wgt_expand_size[1] == qweight_node.meta["val"].size()[1] + and wgt_expand_size[2] == qweight_node.meta["val"].size()[0] + ): + return False + + return True + + return _inner + + +def _register_qlinear_weight_prepack_pass( + pattern, + pass_number, + dtype=torch.float32, + input_dim_exceeds_two=False, + input_contiguous=True, +): + @register_freezing_graph_pattern( + pattern, + extra_check=_is_valid_dequant_linear_pattern( + dtype, input_dim_exceeds_two, input_contiguous + ), + pass_number=pass_number, + ) + def qlinear_weight_prepack(match: Match, *args, **kwargs): + """ + Match the pattern: + int8 activation + | + dequant_per_tensor + | + mm/addmm <- t <- dequant_per_channel <- int8_weight + + Insert weight prepack node and change the pattern to: + int8 activation + | + onednn.qlinear_pointwise <- onednn.qlinear_prepack <- int8_weight + """ + assert dtype in [torch.float32, torch.bfloat16] + ( + linear_node, + output_reshape_node, + ) = _get_linear_node(match, input_dim_exceeds_two, input_contiguous) + input_index = 1 if linear_node.target is aten.addmm.default else 0 + weight_index = input_index + 1 + + ( + mul_node, + act_reshape_node, + activation_to_bf16_node, + act_expand_node, + ) = _get_linear_dq_mul_node( + linear_node, input_index, dtype, input_dim_exceeds_two, input_contiguous + ) + + sub_node = mul_node.args[0] + to_fp32_node = sub_node.args[0] + + if input_dim_exceeds_two and not input_contiguous: + wgt_expand_node = linear_node.args[weight_index] + assert wgt_expand_node.target is aten.expand.default + t_node = 
wgt_expand_node.args[0] + else: + t_node = linear_node.args[weight_index] + + if dtype == torch.float32: + dequant_per_channel = t_node.args[0] + else: + weight_to_bf16_node = t_node.args[0] + dequant_per_channel = weight_to_bf16_node.args[0] + assert ( + dequant_per_channel.target + is quantized_decomposed.dequantize_per_channel.default + ) + + # Activation QParams + qx, x_zp, x_scale = ( + kwargs["x"], + kwargs["x_zp"], + kwargs["x_scale"], + ) + + # Weight QParams + qw, w_scale, w_zp = ( + kwargs["q_weight"], + kwargs["w_scale"], + kwargs["w_zp"], + ) + + # Params + bias = kwargs["b"] if "b" in kwargs else None + + x_shape = qx.meta.get("tensor_meta").shape + if has_free_symbols(x_shape): + # For dynamic shape case, we can't get activation shape ahead of runtime. + x_shape = None + graph = match.graph + with graph.inserting_before(linear_node): + # Insert weight prepack node and the qlinear node + packed_weight_inputs = ( + qw, + x_shape, + ) + packed_weight_op = torch.ops.onednn.qlinear_prepack + prepack_weight_node = graph.call_function( + packed_weight_op, args=packed_weight_inputs + ) + + new_args: Tuple[Any, ...] = ( + qx, + x_scale, + x_zp, + prepack_weight_node, + w_scale, + w_zp, + bias, + 1.0, # output_scale + 0, # output_zero_point + dtype, # output_dtype + "none", # post op name + [], # post op args + "", # post op algorithm + ) + Node = torch.fx.node.Node + if isinstance(x_scale, Node) and isinstance(x_zp, Node): + new_linear_node = graph.call_function( + torch.ops.onednn.qlinear_pointwise.tensor, args=new_args + ) + else: + new_linear_node = graph.call_function( + torch.ops.onednn.qlinear_pointwise.default, args=new_args + ) + if input_dim_exceeds_two: + if input_contiguous: + output_reshape_node.replace_all_uses_with(new_linear_node) + new_linear_node.meta.update(output_reshape_node.meta) + else: + if bias: + output_add_node_for_bias = match.output_node() + assert output_add_node_for_bias.target is aten.add.Tensor + output_add_node_for_bias.replace_all_uses_with(new_linear_node) + new_linear_node.meta.update(output_add_node_for_bias.meta) + else: + linear_node.replace_all_uses_with(new_linear_node) + new_linear_node.meta.update(linear_node.meta) + else: + linear_node.replace_all_uses_with(new_linear_node) + new_linear_node.meta.update(linear_node.meta) + + # Erase the original linear node + if input_dim_exceeds_two: + if input_contiguous: + graph.erase_node(output_reshape_node) + elif not input_contiguous and bias: + graph.erase_node(output_add_node_for_bias) # type: ignore[possibly-undefined] + graph.erase_node(linear_node) + if input_dim_exceeds_two: + if input_contiguous: + graph.erase_node(act_reshape_node) + else: + graph.erase_node(act_expand_node) + graph.erase_node(wgt_expand_node) # type: ignore[possibly-undefined] + if dtype == torch.bfloat16: + graph.erase_node(activation_to_bf16_node) + # Erase the dequant pattern + graph.erase_node(mul_node) + graph.erase_node(sub_node) + graph.erase_node(to_fp32_node) + # Erase the dequant per channel pattern + graph.erase_node(t_node) + if dtype == torch.bfloat16: + graph.erase_node(weight_to_bf16_node) # type: ignore[possibly-undefined] + graph.erase_node(dequant_per_channel) + + counters["inductor"]["qlinear_weight_prepack_matcher_count"] += 1 + counters["inductor"]["qlinear_weight_prepack_matcher_nodes"] += len( + match.nodes + ) + + +def _generate_dequant_linear_node_pattern( + _dequant_per_channel_pattern, dtype=torch.float32, input_dim_exceeds_two=False +): + assert dtype in [torch.float32, torch.bfloat16] + t_pattern 
= _generate_linear_t_pattern(_dequant_per_channel_pattern, dtype) + dequant_linear_bias_pattern = _may_generate_pattern_with_reshape( + CallFunction( + aten.addmm.default, + KeywordArg("b"), + _may_generate_pattern_with_reshape( + _may_generate_pattern_with_dtype_convert( + dequantize_per_tensor_activation_pattern, + KeywordArg("autocast_act_dtype"), + dtype == torch.bfloat16, + ), + KeywordArg("act_reshape_size"), + input_dim_exceeds_two, + ), + t_pattern, + ), + KeywordArg("output_reshape_size"), + input_dim_exceeds_two, + ) + dequant_linear_no_bias_pattern = _may_generate_pattern_with_reshape( + CallFunction( + aten.mm.default, + _may_generate_pattern_with_reshape( + _may_generate_pattern_with_dtype_convert( + dequantize_per_tensor_activation_pattern, + KeywordArg("autocast_act_dtype"), + dtype == torch.bfloat16, + ), + KeywordArg("act_reshape_size"), + input_dim_exceeds_two, + ), + t_pattern, + ), + KeywordArg("output_reshape_size"), + input_dim_exceeds_two, + ) + return dequant_linear_bias_pattern, dequant_linear_no_bias_pattern + + +def _generate_dequant_bmm_node_pattern( + _dequant_per_channel_pattern, + dtype=torch.float32, + with_bias=False, +): + # When activation of linear dim exceed 2 and not contiguous + t_pattern = _generate_linear_t_pattern(_dequant_per_channel_pattern, dtype) + + assert dtype in [torch.float32, torch.bfloat16] + dequant_bmm_pattern = CallFunction( + aten.bmm.default, + CallFunction( + aten.expand.default, + _may_generate_pattern_with_dtype_convert( + dequantize_per_tensor_activation_pattern, + KeywordArg("autocast_act_dtype"), + dtype == torch.bfloat16, + ), + KeywordArg("act_expand_size"), + ), + CallFunction( + aten.expand.default, + t_pattern, + KeywordArg("wgt_expand_size"), + ), + ) + + def _generate_pattern_with_output_add(_dequant_bmm_pattern, _with_bias): + if _with_bias: + return CallFunction( + aten.add.Tensor, + _dequant_bmm_pattern, + KeywordArg("b"), + ) + else: + return _dequant_bmm_pattern + + return _generate_pattern_with_output_add(dequant_bmm_pattern, with_bias) + + +def _generate_qlinear_weight_prepack_patterns( + dtype=torch.float32, + input_dim_exceeds_two=False, + input_contiguous=True, + with_bias=False, +): + if input_dim_exceeds_two and not input_contiguous: + return _generate_dequant_bmm_node_pattern( + dequantize_per_channel_weight_pattern, + dtype, + with_bias, + ) + else: + return _generate_dequant_linear_node_pattern( + dequantize_per_channel_weight_pattern, dtype, input_dim_exceeds_two + ) + + +def _register_dequant_promotion(): + dequant_pattern_cases = itertools.product( + [torch.float32, torch.bfloat16], [True, False] + ) + for dtype, input_dim_exceeds_two in dequant_pattern_cases: + # 4 dequantization patterns will be matched based on the dtype and input dimension size. 
+ # Case 1: int8-mixed-fp32, input dim size is 2 + # Case 2: int8-mixed-fp32, input dim size exceeds 2 + # Case 3: int8-mixed-bf16, input dim size is 2 + # Case 4: int8-mixed-bf16, input dim size exceeds 2 + # quant + # + - - - - | - - - - + + # | dequant | + # | | | + # | OPT(to_bf16) | + # | | | + # | OPT(reshape) | + # | / \ | + # | node1 node2 | + # + - - | - - - | - - + + # OPT(reshape) OPT(reshape) + # + - - | - - - | - - + + # OPT(to_fp32) OPT(to_fp32) + # + - - | - - - | - - + + # quant quant + _register_dequant_promotion_pass( + _may_generate_pattern_with_reshape( + _may_generate_pattern_with_dtype_convert( + dequantize_per_tensor_activation_pattern, + KeywordArg("autocast_act_dtype"), + dtype == torch.bfloat16, + ), + KeywordArg("act_reshape_size"), + with_reshape=input_dim_exceeds_two, + ), + pass_number=0, + dtype=dtype, + ) # pass_number=0 to run before weight prepack + + +def _register_qconv_weight_prepack(): + for dtype in [torch.float32, torch.bfloat16]: + weight_prepack_patterns = _generate_qconv_weight_prepack_patterns(dtype) + for weight_prepack_pattern in weight_prepack_patterns: + # Register to pass_number 1, so we can do dequant promotion in pass_number 0. + _register_qconv_weight_prepack_pass( + weight_prepack_pattern, pass_number=1, dtype=dtype + ) + + +def _register_qlinear_weight_prepack(): + # 6 Linear related patterns will be matched based on the dtype, input dimension size and input contiguous. + # Then convert the pattern into a QLinear node with int8_fp32/bf16. + # Case 1: int8-mixed-fp32, input dim size is 2 + # Case 2: int8-mixed-fp32, input dim size exceeds 2 and contiguous + # Case 3: int8-mixed-bf16, input dim size is 2 + # Case 4: int8-mixed-bf16, input dim size exceeds 2 and contiguous + + # + - - - - | - - - - - - | - - - - - + + # | dq_per_tensor dq_per_channel | + # | | | | + # | OPT(to_bf16) OPT(to_bf16) | + # | | | | + # | OPT(reshape) permute | + # | \ / | + # | addmm/mm | + # | | | + # | OPT(reshape) | + + # Case 5: int8-mixed-fp32, input dim size exceeds 2 and not contiguous + # Case 6: int8-mixed-bf16, input dim size exceeds 2 and not contiguous + + # + - - - - | - - - - - - | - - - - - + + # | dq_per_tensor dq_per_channel | + # | | | | + # | OPT(to_bf16) OPT(to_bf16) | + # | | | | + # | expand permute | + # | \ | | + # | expand | + # | / | + # | bmm | + # | | | + # | OPT(add) | + + linear_weight_prepack_cases = itertools.product( + [torch.float32, torch.bfloat16], [True, False] + ) + + # Step 1: register patterns from mm and addmm + for dtype, input_dim_exceeds_two in linear_weight_prepack_cases: + weight_prepack_patterns = _generate_qlinear_weight_prepack_patterns( + dtype, input_dim_exceeds_two + ) + for weight_prepack_pattern in weight_prepack_patterns: + # Register to pass_number 1, so we can do dequant promotion in pass_number 0. 
+ _register_qlinear_weight_prepack_pass( + weight_prepack_pattern, + pass_number=1, + dtype=dtype, + input_dim_exceeds_two=input_dim_exceeds_two, + ) + + # Step 2: register patterns from bmm + # Linear might be decomposed into bmm when input dim exceeds 2 and not contiguous + # refer to: + # https://github.com/pytorch/pytorch/blob/ + # 80c07df659362a95da7cd4f3ec367abfdace38c4/torch/_decomp/decompositions.py#L3965-L3968 + # in this case, we can convert it back to qlinear + for dtype, with_bias in itertools.product( + [torch.float32, torch.bfloat16], [True, False] + ): + bmm_pattern = _generate_qlinear_weight_prepack_patterns( + dtype=dtype, + input_dim_exceeds_two=True, + input_contiguous=False, + with_bias=with_bias, + ) + _register_qlinear_weight_prepack_pass( + bmm_pattern, + pass_number=1 + if with_bias + else 2, # if with_bias, there is an output add, so we should try to match it firstly + dtype=dtype, + input_dim_exceeds_two=True, + input_contiguous=False, + ) + + +@functools.lru_cache(None) +def _register_quantization_weight_pack_pass(): + # Step 1: Dequant promotion for int8-mixed-fp32/bf16 + _register_dequant_promotion() + + # Step 2: QConv weight prepack + _register_qconv_weight_prepack() + + # Step 3: QLinear weight prepack + _register_qlinear_weight_prepack() diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/reinplace.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/reinplace.py new file mode 100644 index 0000000000000000000000000000000000000000..ab939087a72c8c0a33e402ba8a118a04402b4783 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/reinplace.py @@ -0,0 +1,537 @@ +import operator +from collections import defaultdict +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, Tuple + +import torch +from torch._higher_order_ops.triton_kernel_wrap import triton_kernel_wrapper_functional +from torch._inductor import inductor_prims +from torch._inductor.fx_utils import get_node_storage, is_node_realized +from torch._inductor.lowering import ( + inplaceable_foreach_ops as inplaceable_foreach_ops_lowerings, +) +from torch._inductor.virtualized import V +from torch.fx.immutable_collections import immutable_dict +from torch.fx.passes.reinplace import _is_view_op +from torch.utils import _pytree as pytree + +aten = torch.ops.aten + + +@dataclass(frozen=True) +class InplaceableOp: + inplace_op: Callable[..., Any] + mutated_arg: int + extra_check: Callable[[torch.fx.Node], bool] = lambda node: True + + +_SCATTER_OP_TO_VIEW = { + torch.ops.aten.diagonal_scatter.default: torch.ops.aten.diagonal.default, + torch.ops.aten.select_scatter.default: torch.ops.aten.select.int, + torch.ops.aten.slice_scatter.default: torch.ops.aten.slice.Tensor, + torch.ops.aten.as_strided_scatter.default: torch.ops.aten.as_strided.default, +} +_VIEW_OP_TO_SCATTER = {v: k for k, v in _SCATTER_OP_TO_VIEW.items()} + + +def graph_call_function(graph: torch.fx.Graph, fn, *args, **kwargs): + fake_args, fake_kwargs = pytree.tree_map( + lambda node: node.meta["val"] if isinstance(node, torch.fx.Node) else node, + (args, kwargs), + ) + with V.fake_mode: + fake_result = fn(*fake_args, **fake_kwargs) + + node = graph.call_function(fn, args, kwargs) + node.meta["val"] = fake_result + return node + + +@dataclass +class ViewOp: + target: torch._ops.OpOverload + args: Tuple[Any, ...] 
+ kwargs: Dict[str, Any] + + +def _inplace_generalized_scatter( + inp: torch.Tensor, src: torch.Tensor, view_ops: List[ViewOp] +) -> torch.Tensor: + tmp = inp + for view in view_ops: + fake_args, fake_kwargs = pytree.tree_map( + lambda node: node.meta["val"] if isinstance(node, torch.fx.Node) else node, + (view.args, view.kwargs), + ) + tmp = view.target(tmp, *fake_args, **fake_kwargs) + tmp.copy_(src) + return inp + + +def _generalized_scatter( + inp: torch.Tensor, src: torch.Tensor, view_ops: List[ViewOp] +) -> torch.Tensor: + out = inp.clone() + return _inplace_generalized_scatter(out, src, view_ops) + + +def _decompose_scatter_functional_helper( + graph: torch.fx.Graph, + inp: torch.Tensor, + src: torch.Tensor, + view_ops: List[ViewOp], +) -> torch.fx.Node: + view_op, view_ops_tail = view_ops[0], view_ops[1:] + + if view_ops_tail: + view = graph_call_function( + graph, view_op.target, inp, *view_op.args, **view_op.kwargs + ) + src = _decompose_scatter_functional_helper(graph, view, src, view_ops[1:]) # type: ignore[assignment] + + return graph_call_function( + graph, + _VIEW_OP_TO_SCATTER[view_op.target], + inp, + src, + *view_op.args, + **view_op.kwargs, + ) + + +def _decompose_scatter_functional( + graph: torch.fx.Graph, node: torch.fx.Node +) -> torch.fx.Node: + """Decompose _generalized_scatter to a sequence of view_scatter operations + + e.g. _generalized_scatter(inp, src, [(aten.slice, 0, 0, 10), (aten.slice, 1, 10, -10)]) + + will become + + view = aten.slice(inp, 0, 0, 10) + view_updated = aten.slice_scatter(view, src, 1, 10, -10) + inp_updated = aten.slice_scatter(inp, view_updated, 0, 0, 10) + """ + assert node.target is _generalized_scatter + inp, src, view_ops = node.args + return _decompose_scatter_functional_helper(graph, *node.args) # type: ignore[arg-type] + + +def _decompose_scatter_mutating( + graph: torch.fx.Graph, node: torch.fx.Node +) -> torch.fx.Node: + """Decompose _generalized_scatter using mutations + + e.g. _generalized_scatter(inp, src, [(aten.slice, 0, 0, 10), (aten.slice, 1, 10, -10)]) + + will become + + inp_updated = aten.clone(inp) + slice1 = aten.slice(inp_updated, 0, 0, 10) + slice2 = aten.slice(slice1, 1, 10, -10) + slice2.copy_(src) + + """ + assert node.target in (_generalized_scatter, _inplace_generalized_scatter) + inp, src, view_ops = node.args + assert not node.kwargs + + if node.target is _generalized_scatter: + inp = graph_call_function(graph, aten.clone, inp) + + tmp = inp + for view in view_ops: # type: ignore[union-attr] + tmp = graph_call_function(graph, view.target, tmp, *view.args, **view.kwargs) # type: ignore[union-attr] + + graph_call_function(graph, aten.copy_.default, tmp, src) + return inp # type: ignore[return-value] + + +# View ops whose view_scatter op is lowered into mutations anyway, +# so is never a pessimisation to decompose. +_ALWAYS_MUTATING_SCATTER_OPS = { + aten.as_strided.default, + aten.diagonal.default, +} + + +def scatter_always_uses_mutation(node: torch.fx.Node) -> bool: + _, _, view_ops = node.args + return any(view.target in _ALWAYS_MUTATING_SCATTER_OPS for view in view_ops) # type: ignore[union-attr] + + +def should_reinplace_scatter(node: torch.fx.Node) -> bool: + """Choose between mutating and functional scatter decompositions + + Reinplacing view scatter ops can be pessimising as it blocks fusion with the + input or output tensor computations. However, it is still profitable if the + input and output would have been realized anyway. 
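+    For example, if the scatter input is a graph input that is copied back into
+    (mutated) in the epilogue, both input and output are realized regardless, so
+    the mutating decomposition is preferred.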
+ + """ + inp, src, view_ops = node.args + + # Mutating scatter ops unconditionally realize input and output + if scatter_always_uses_mutation(node): + return True + + if is_node_realized(inp) and is_node_realized(node): # type: ignore[arg-type] + return True + + # If the output is copied back into the input, this forces both to be + # realized as the output is a user of the input + if inp.op == "placeholder" and any( # type: ignore[union-attr] + user.target is aten.copy_.default and user.args[0] is inp for user in node.users + ): + return True + + # Otherwise, assume fusions will make functional variants profitable + return False + + +def decompose_generalized_scatter(graph: torch.fx.Graph) -> None: + """Replace _generalized_scatter with normal aten ops""" + for node in graph.nodes: + if node.target not in (_generalized_scatter, _inplace_generalized_scatter): + continue + + use_mutation = ( + node.target is _inplace_generalized_scatter + or scatter_always_uses_mutation(node) + ) + + with graph.inserting_before(node): + if use_mutation: + new_node = _decompose_scatter_mutating(graph, node) + else: + new_node = _decompose_scatter_functional(graph, node) + + node.replace_all_uses_with(new_node) + graph.erase_node(node) + + +def canonicalize_view_scatter_ops(graph: torch.fx.Graph) -> None: + """ + This canonicalizes view scatter ops into a generalized form, defined as: + def scatter(inp, src, views): + tmp = inp.clone() + for view in views: + tmp = view(tmp) + tmp.copy_(src) + + We also fuse consecutive view scatter ops of the form + a = scatter(view2(self), src, [view1]) + b = scatter(self, a, [view2]) + which can be rewritten as + b = scatter(self, src, [view2, view1]) + a = view2(b) + + This is both more efficient as we only do a single scatter, and also + easier to reinplace since there is only one use of `self` + """ + + node_to_view_base: Dict[torch.fx.Node, torch.fx.Node] = {} + node_to_view_op: Dict[torch.fx.Node, List[ViewOp]] = defaultdict(list) + + def handle_views(node: torch.fx.Node): + inp = node.args[0] + node_to_view_base[node] = node_to_view_base.get(inp, inp) # type: ignore[arg-type] + node_to_view_op[node] = [ + *node_to_view_op[inp], # type: ignore[index] + ViewOp( + node.target, # type: ignore[arg-type] + args=node.args[1:], + kwargs=node.kwargs, + ), + ] + + def handle_view_scatter(node: torch.fx.Node): + assert len(node.args) >= 2 + inp, src = node.args[:2] + + scatter_view_op = ViewOp( + _SCATTER_OP_TO_VIEW[node.target], + args=node.args[2:], + kwargs=node.kwargs, + ) + + def can_fuse(): + if src.target is not _generalized_scatter: # type: ignore[union-attr] + return False + src_inp, src_src, src_scatter_view_op = src.args # type: ignore[union-attr] + + inp_base = node_to_view_base.get(inp, inp) # type: ignore[arg-type] + src_base = node_to_view_base.get(src_inp, src_inp) # type: ignore[arg-type] + return inp_base is src_base and node_to_view_op[src_inp] == [ # type: ignore[index] + *node_to_view_op[inp], # type: ignore[index] + scatter_view_op, + ] + + if not can_fuse(): + with graph.inserting_before(node): + new_node = graph_call_function( + graph, + _generalized_scatter, + inp, + src, + [scatter_view_op], + ) + node.replace_all_uses_with(new_node) + graph.erase_node(node) + return + + src_inp, src_src, src_scatter_view_op = src.args # type: ignore[union-attr] + with graph.inserting_before(src): + new_node = graph_call_function( + graph, + _generalized_scatter, + inp, + src_src, + [scatter_view_op, *src_scatter_view_op], # type: ignore[misc] + ) + 
node.replace_all_uses_with(new_node) + graph.erase_node(node) + + if src.users: # type: ignore[union-attr] + new_src = graph_call_function( + graph, + _SCATTER_OP_TO_VIEW[node.target], + new_node, + *node.args[2:], + **node.kwargs, + ) + + handle_views(new_src) + src.replace_all_uses_with(new_src) # type: ignore[union-attr] + + graph.erase_node(src) + + for node in graph.nodes: + if _is_view_op(node.target): + handle_views(node) + elif node.target in _SCATTER_OP_TO_VIEW: + handle_view_scatter(node) + + +inplaceable_ops = { + aten.index_put.default: InplaceableOp(aten.index_put_.default, 0), + aten._unsafe_index_put.default: InplaceableOp(inductor_prims._unsafe_index_put_, 0), + _generalized_scatter: InplaceableOp( + _inplace_generalized_scatter, + 0, + extra_check=should_reinplace_scatter, + ), +} + +try: + c10d_functional = torch.ops._c10d_functional + inplaceable_collective_ops = { + c10d_functional.all_reduce.default: InplaceableOp( + c10d_functional.all_reduce_.default, 0 + ), + c10d_functional.all_reduce_coalesced.default: InplaceableOp( + c10d_functional.all_reduce_coalesced_.default, 0 + ), + } + inplaceable_ops.update(inplaceable_collective_ops) +except AttributeError: + # _c10d_functional ops are only available when torch + # is built with USE_DISTRIBUTED=1. + pass + +inplaceable_foreach_ops: Dict[torch._ops.OpOverload, InplaceableOp] = {} +for outplace_op, inplace_op in inplaceable_foreach_ops_lowerings.items(): + inplaceable_foreach_ops[outplace_op] = InplaceableOp(inplace_op, 0) + + +inplaceable_triton_ops = {triton_kernel_wrapper_functional} + + +# Operators that don't depend on the tensor data +META_ONLY_OPS = { + aten.sym_size.int, + aten.sym_stride.int, + aten.sym_numel.default, + aten.sym_storage_offset.default, +} + + +def reinplace_inplaceable_ops_core(graph: torch.fx.Graph) -> None: + """ + Reinplaces in-placeable operations. + If there are no uses of a view of the mutated arg after the current node, + it is possible to inplace the op. + This above algorithm could be justified by observing side effects. While + we traverse the graph in forwards direction, only latter nodes could view + side effects of the current node. If the current node is not used later as + well as no view of this node is used later in the graph, then it is safe to + inplace as there would be no way to observe the side effects. + This condition is slightly different for graph inputs where they can only + be inplaced if the above condition is true and there's a copy_ in the + epilogue that signals that the caller wants to observe the mutation. 
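+    For example, a graph that computes out = aten.index_put(x, idx, v) on a graph
+    input x and ends with x.copy_(out) can be rewritten to call
+    aten.index_put_(x, idx, v) directly, with the trailing copy_ removed.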
+ """ + + copy_args_to_copy_nodes = {} + mutated_inputs = set() + storage_to_nodes = defaultdict(list) + node_order: Dict[Any, int] = {} + for i, node in enumerate(reversed(graph.nodes)): + node_order[node] = len(graph.nodes) - i - 1 + storage_to_nodes[get_node_storage(node)].append(node) + if node.target == aten.copy_.default and node.args[0].op == "placeholder": + dst = node.args[0] + src = node.args[1] + # If the target is a getitem and it indexes a possible clone, + # then skip over it + if src.target == operator.getitem and ( + ( + src.args[0].target == triton_kernel_wrapper_functional + and src.args[0].kwargs["kwargs"][src.args[1]] == node.args[0] + ) + or (src.args[0].target in inplaceable_foreach_ops) + or (src.args[0].target == torch.ops.higher_order.auto_functionalized) + ): + src = src.args[0] + + copy_args_to_copy_nodes[(dst, src)] = node + + mutated_inputs.add(node.args[0]) + + def any_use_of_views_after_node(node, shared_view_nodes, *, copy_node): + node_loc = node_order[node] + copy_node_loc = node_order[copy_node] if copy_node is not None else None + + def is_meta_only_user(node): + if _is_view_op(node.target): + return all(is_meta_only_user(u) for u in node.users) + return node.target in META_ONLY_OPS + + for view in shared_view_nodes: + for user in view.users: + user_loc = node_order[user] + # Skip all users before node + if user_loc <= node_loc: + continue + # Ignore uses after the copy_ epilogue node, where the input + # has already been mutated anyway + if copy_node_loc is not None and copy_node_loc <= user_loc: + continue + # Reinplacing does not change shape metadata + if is_meta_only_user(user): + continue + return True + return False + + def can_inplace(node, mutated_arg): + if isinstance(mutated_arg, (list, tuple)): + return all(can_inplace(node, arg) for arg in mutated_arg) + + if get_node_storage(mutated_arg) is None: + return False + shared_view_nodes = storage_to_nodes[get_node_storage(mutated_arg)] + if mutated_arg.op == "placeholder": + if not ( + copy_node := copy_args_to_copy_nodes.get((mutated_arg, node), False) + ): + return False + + if any_use_of_views_after_node( + node, shared_view_nodes, copy_node=copy_node + ): + return False + + return True + elif any(view.op == "placeholder" for view in shared_view_nodes): + # If mutated arg is view of any of the inputs of the graph, + # do not allow for inplacing. + # This would require more sophisticated algorithm to handle + return False + else: + return not any_use_of_views_after_node( + node, shared_view_nodes, copy_node=None + ) + + replace_dict: Dict[torch.fx.Node, torch.fx.Node] = {} + + def reinplace_and_refine_tensors_to_clone(old_tensors_to_clone, kwargs): + tensors_to_clone: List[str] = [] + for arg in old_tensors_to_clone: + assert arg in kwargs + mutated_arg = kwargs[arg] + if can_inplace(node, mutated_arg): + copy_node = copy_args_to_copy_nodes.get((mutated_arg, node)) + if copy_node is not None: + replace_dict[copy_node] = copy_node.args[0] + for user in node.users: + if user.target == operator.getitem and user.args[1] == arg: + replace_dict[user] = mutated_arg + else: + tensors_to_clone.append(arg) + return tensors_to_clone + + for node in graph.nodes: + if (inplaceable_op := inplaceable_ops.get(node.target, None)) is not None: + mutated_arg = node.args[inplaceable_op.mutated_arg] + if can_inplace(node, mutated_arg) and inplaceable_op.extra_check(node): + # TODO(yifu): this doesn't properly remove copy epilogues for + # ops that mutate multiple inputs. 
Need to revise the copy + # node tracking logic to support the case. + copy_node = copy_args_to_copy_nodes.get((mutated_arg, node)) + if copy_node is not None: + replace_dict[copy_node] = copy_node.args[0] + node.target = inplaceable_op.inplace_op + elif node.target == torch.ops.higher_order.auto_functionalized: + _mutable_op = node.args[0] + from torch._higher_order_ops.auto_functionalize import get_mutable_arg_names + + tensors_to_clone = get_mutable_arg_names(_mutable_op) + # Don't try to reinplace Optional[Tensor] args that are None. + tensors_to_clone = [ + t for t in tensors_to_clone if node.kwargs[t] is not None + ] + tensors_to_clone = reinplace_and_refine_tensors_to_clone( + tensors_to_clone, node.kwargs + ) + + # Stash the metadata. There is a pass later on where we decompose + # auto_functionalized into clones + a mutable op; this metadata + # tells the decomp to only clone the following inputs + node.meta["only_clone_these_tensors"] = tensors_to_clone + elif node.target in inplaceable_triton_ops: + # inplaceable_triton_ops take an additional argument called + # tensors_to_clone which contain a list of tensors to clone + # This pass iterates over them and sees which ones are safe + # to eliminate (i.e. no longer need the clones) + tensors_to_clone = reinplace_and_refine_tensors_to_clone( + node.kwargs["tensors_to_clone"], node.kwargs["kwargs"] + ) + + kwargs = dict(node.kwargs) + kwargs["tensors_to_clone"] = tensors_to_clone + node.kwargs = immutable_dict(kwargs) + elif ( + inplaceable_op := inplaceable_foreach_ops.get(node.target, None) + ) is not None: + mutated_args = node.args[inplaceable_op.mutated_arg] + + if not all((arg, node) in copy_args_to_copy_nodes for arg in mutated_args): + continue + + if can_inplace(node, mutated_args): + for arg in mutated_args: + copy_node = copy_args_to_copy_nodes[(arg, node)] + replace_dict[copy_node] = copy_node.args[0] + + node.target = inplaceable_op.inplace_op + for node, replacement in replace_dict.items(): + while replacement in replace_dict: + replacement = replace_dict[replacement] + replace_dict[node] = replacement + + node.replace_all_uses_with(replacement) + graph.erase_node(node) + + +def reinplace_inplaceable_ops(graph: torch.fx.Graph) -> None: + canonicalize_view_scatter_ops(graph) + reinplace_inplaceable_ops_core(graph) + decompose_generalized_scatter(graph) diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/replace_random.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/replace_random.py new file mode 100644 index 0000000000000000000000000000000000000000..d3bd47f93d3d8a655af9d6606dd3570ce58957b3 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/replace_random.py @@ -0,0 +1,139 @@ +import collections +import logging + +import torch + +from torch.fx.passes.shape_prop import _extract_tensor_metadata +from .. 
import config, inductor_prims +from ..pattern_matcher import ( + CallFunctionVarArgs, + Match, + PatternMatcherPass, + register_graph_pattern, +) +from ..virtualized import V + +log = logging.getLogger(__name__) +patterns = PatternMatcherPass() +aten = torch.ops.aten + + +def replace_random_passes(gm: torch.fx.GraphModule): + """Modify the given FX graph to use backend-native random ops""" + if config.fallback_random: + return 0 + + count = patterns.apply(gm) + count += fuse_seed_creation_pass(gm.graph) + + return count + + +def fuse_seed_creation_pass(graph: torch.fx.Graph): + """ + Horizontally fuse all the seed generation on each device + + a = inductor_seed(dev) + b = inductor_seed(dev) + + Becomes: + seeds = inductor_seeds(2, dev) + a = inductor_lookup_seed(seeds, 0) + b = inductor_lookup_seed(seeds, 1) + + We do this because seed creation is entirely launch overhead bound. + """ + device_seeds = collections.defaultdict(list) + for node in graph.nodes: + if CallFunctionVarArgs(inductor_prims.seed).match(node): + device_seeds[node.args[0]].append(node) + + if not device_seeds: + return 0 + + for device, seeds in device_seeds.items(): + with graph.inserting_before(seeds[0]): + combined = graph.call_function(inductor_prims.seeds, (len(seeds), device)) + with V.fake_mode: + combined.meta["val"] = torch.empty( + [len(seeds)], device=device, dtype=torch.int64 + ) + combined.meta["tensor_meta"] = _extract_tensor_metadata( + combined.meta["val"] + ) + + for idx, seed in enumerate(seeds): + with graph.inserting_before(seed): + new_seed = graph.call_function( + inductor_prims.lookup_seed, (combined, idx) + ) + seed.replace_all_uses_with(new_seed) + new_seed.meta.update(seed.meta) + graph.erase_node(seed) + + return len(device_seeds) + + +def default_kwargs(device): + return {} + + +def get_device(device): + if device is not None: + return device + return torch.empty([]).device # default device + + +@register_graph_pattern(CallFunctionVarArgs(aten.rand.default), pass_dict=patterns) +@register_graph_pattern(CallFunctionVarArgs(aten.rand.generator), pass_dict=patterns) +@register_graph_pattern(CallFunctionVarArgs(aten.randn.default), pass_dict=patterns) +@register_graph_pattern(CallFunctionVarArgs(aten.randn.generator), pass_dict=patterns) +def replace_random( + match: Match, + size, + *, + generator=None, + dtype=None, + device=None, + layout=None, + pin_memory=None, +): + if generator is not None: + return + + def replacement(size): + result = inductor_prims.random( + size, inductor_prims.seed(device), mode, **default_kwargs(device) + ) + if dtype is not None: + result = result.to(dtype) + return result + + mode = { + aten.rand: "rand", + aten.randn: "randn", + }[ + match.output_node().target.overloadpacket # type: ignore[union-attr] + ] # type: ignore[union-attr] + device = get_device(device) + match.replace_by_example(replacement, [size]) + + +@register_graph_pattern(CallFunctionVarArgs(aten.randint.low), pass_dict=patterns) +def replace_randint( + match: Match, + low, + high, + size, + *, + dtype=torch.int64, + device=None, + layout=None, + pin_memory=None, +): + def replacement(size): + result = inductor_prims.randint(low, high, size, inductor_prims.seed(device)) + return result.to(dtype) + + device = get_device(device) + match.replace_by_example(replacement, [size]) diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__init__.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d1951dcb0b2a98735214df0ece98a6aa8db20258
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_1.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_1.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d14aedbc748390cd63f71c0bcddd3bc54512fb16
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_1.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_10.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_10.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..118cae4b91c5c17f6172342f6e0841cc41d44728
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_10.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_11.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_11.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cc49d65fcac1cb24a7fc77bea4cc0f94009929dd
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_11.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_12.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_12.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e6eccbeec1b2fba1e2be3d52bad5ad0e6ca6fe5f
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_12.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_13.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_13.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0f55ad1b7e534ba4a302abc6f9e8b43c65de86e1
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_13.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_14.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_14.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5331184e8c5af2cbe8357224f862cd50ceaa005a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_14.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_15.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_15.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..97ff30d7a5e93a1432ba74e688e5ed3786a488f5
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_15.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_16.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_16.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1855f808287b6141c9a8acae6feb017620312865
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_16.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_17.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_17.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2285a440852168e2fcddd00de3971e29d0e21abd
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_17.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_2.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3db56343d61e311ef4a18e9d9d5d0d881eddf127
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_2.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_3.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_3.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7bc12d667d2b39ef1c75ab3cb2bdaee1d9116288
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_3.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_4.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_4.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..65488e3dd89c841169c5c1cf8a83b24ab0ceaf76
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_4.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_5.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_5.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e09a0a4b547d1acc8aeaebcb7b22a77a8193805d
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_5.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_6.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_6.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6b00473d6be5787f2db16951b631bfc9adcaeb8d
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_6.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_7.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_7.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..71929133477589e09a6f24a2b58f0e50ce83cd76
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_7.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_8.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_8.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f670d9e4fa2460b35cb25f745ec5793a7fa33b53
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_8.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_9.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_9.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6fc35b514d02f3e54c1e1a576f6d56c4101b7113
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/_sfdp_pattern_9.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/central_index.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/central_index.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9c65099dc221b736a973115b3fba5dedd49caf5a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/__pycache__/central_index.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_1.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_1.py
new file mode 100644
index 0000000000000000000000000000000000000000..7cabcce55572f2cf6c655bf9001dc9955aa962aa
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_1.py
@@ -0,0 +1,182 @@
+# mypy: ignore-errors
+
+# noqa: F401, E501
+# This is an auto-generated file. Please do not modify it by hand.
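# For orientation (a hedged sketch, not part of the generated pattern
# definitions; q, k, v, inv_scale and the 3-D shapes are illustrative): the
# expressions serialized in this module describe the aten graph that
# scaled-dot-product attention decomposes into, so the inductor pattern
# matcher can recognize the subgraph and swap in a fused kernel. Roughly:
#
#     scores = torch.bmm(q, k.transpose(-2, -1)) / inv_scale
#     scores = scores - scores.amax(dim=-1, keepdim=True)   # stable softmax
#     attn = scores.exp()
#     attn = attn / attn.sum(dim=-1, keepdim=True)
#     out = torch.bmm(attn, v)
#
# The _training variants below additionally capture the backward graph (the
# nodes fed by tangents_1), while the _inference variants match only the
# forward subgraph.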
+# To re-generate, run: +# cd ~/pytorch && python +# torchgen/fuse_attention_patterns/gen_attention_patterns.py + +import torch +import torch._inductor + +aten = torch.ops.aten +prims = torch.ops.prims + +from torch._inductor.pattern_matcher import ( + Arg, + CallFunction, + CallFunctionVarArgs, + CallMethod, + CallMethodVarArgs, + CallModule, + CallModuleVarArgs, + ExclusiveKeywordArg, + Ignored, + KeywordArg, + ListOf, + MultiOutputPattern, + PatternExpr, + RepeatedExpr, + _TargetArgsExpr, + _TargetExpr, + _TargetExprVarArgs, +) +expand_default = CallFunction(aten.expand.default, KeywordArg('query'), Ignored()) +view_default = CallFunction(aten.view.default, expand_default, Ignored(), _users=2) +permute_default = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default, Ignored()) +view_default_1 = CallFunction(aten.view.default, expand_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale'), _users=2) +amax_default = CallFunction(aten.amax.default, div_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, div_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList, _users=2) +expand_default_2 = CallFunction(aten.expand.default, div_Tensor_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +expand_default_3 = CallFunction(aten.expand.default, KeywordArg('value'), Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_1) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +alias_default = CallFunction(aten.alias.default, div_Tensor_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2, _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, view_default_7, alias_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor, Ignored(), True) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, alias_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor, mul_Tensor_1) +div_Tensor_2 = CallFunction(aten.div.Tensor, sub_Tensor_1, KeywordArg('inv_scale')) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_2 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_2) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = 
CallFunction(aten.bmm.default, permute_default_3, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_4 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_5 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_5, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +_sfdp_pattern_1_training = MultiOutputPattern([view_default_5, + view_default_9, + permute_default_4, + view_default_11, + None +]) + + +expand_default = CallFunction(aten.expand.default, KeywordArg('query'), Ignored()) +view_default = CallFunction(aten.view.default, expand_default, Ignored()) +permute_default = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default, Ignored()) +view_default_1 = CallFunction(aten.view.default, expand_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale'), _users=2) +amax_default = CallFunction(aten.amax.default, div_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, div_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +expand_default_2 = CallFunction(aten.expand.default, div_Tensor_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +expand_default_3 = CallFunction(aten.expand.default, KeywordArg('value'), Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_1_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) + + +expand_default = CallFunction(aten.expand.default, KeywordArg('query'), Ignored()) +view_default = CallFunction(aten.view.default, expand_default, Ignored(), _users=2) +permute_default = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default, Ignored()) +view_default_1 = CallFunction(aten.view.default, expand_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale')) +convert_element_type_default = CallFunction(prims.convert_element_type.default, div_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored(), _users=2) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default_1, Ignored()) +view_default_3 = 
CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +expand_default_3 = CallFunction(aten.expand.default, KeywordArg('value'), Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_1) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +convert_element_type_default_2 = CallFunction(prims.convert_element_type.default, view_default_7, Ignored()) +alias_default = CallFunction(aten.alias.default, convert_element_type_default_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2) +convert_element_type_default_3 = CallFunction(prims.convert_element_type.default, alias_default_3, Ignored(), _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, convert_element_type_default_2, convert_element_type_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor, Ignored(), True) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, convert_element_type_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor, mul_Tensor_1) +convert_element_type_default_4 = CallFunction(prims.convert_element_type.default, sub_Tensor_1, Ignored()) +div_Tensor_2 = CallFunction(aten.div.Tensor, convert_element_type_default_4, KeywordArg('inv_scale')) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_2 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_2) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_3, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_4 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_5 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_5, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +_sfdp_pattern_1_half_training = MultiOutputPattern([view_default_5, + view_default_9, + permute_default_4, + view_default_11, + None +]) + + +expand_default = CallFunction(aten.expand.default, KeywordArg('query'), Ignored()) +view_default = CallFunction(aten.view.default, expand_default, Ignored()) +permute_default = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default, Ignored()) +view_default_1 = CallFunction(aten.view.default, expand_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale')) +convert_element_type_default = 
CallFunction(prims.convert_element_type.default, div_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +expand_default_3 = CallFunction(aten.expand.default, KeywordArg('value'), Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_1_half_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_10.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_10.py new file mode 100644 index 0000000000000000000000000000000000000000..f7202fa6c6ac7a649c8ab03717782d7d85f62acc --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_10.py @@ -0,0 +1,213 @@ +# mypy: ignore-errors + +# noqa: F401, E501 +# This is an auto-generated file. Please do not modify it by hand. +# To re-generate, run: +# cd ~/pytorch && python +# torchgen/fuse_attention_patterns/gen_attention_patterns.py + +import torch +import torch._inductor + +aten = torch.ops.aten +prims = torch.ops.prims + +from torch._inductor.pattern_matcher import ( + Arg, + CallFunction, + CallFunctionVarArgs, + CallMethod, + CallMethodVarArgs, + CallModule, + CallModuleVarArgs, + ExclusiveKeywordArg, + Ignored, + KeywordArg, + ListOf, + MultiOutputPattern, + PatternExpr, + RepeatedExpr, + _TargetArgsExpr, + _TargetExpr, + _TargetExprVarArgs, +) +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, permute_default, Ignored()) +expand_default = CallFunction(aten.expand.default, div_Tensor, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, view_default_2, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, view_default_2, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = 
CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList, _users=2) +convert_element_type_default = CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_4 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_4) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, bmm_default_2, Ignored()) +view_default_7 = CallFunction(aten.view.default, convert_element_type_default_1, Ignored()) +convert_element_type_default_2 = CallFunction(prims.convert_element_type.default, view_default_7, Ignored()) +alias_default = CallFunction(aten.alias.default, div_Tensor_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2, _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, convert_element_type_default_2, alias_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor, Ignored(), True) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, alias_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor, mul_Tensor_1) +view_default_8 = CallFunction(aten.view.default, sub_Tensor_1, Ignored(), _users=2) +permute_default_5 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_5) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +div_Tensor_2 = CallFunction(aten.div.Tensor, view_default_9, Ignored()) +permute_default_6 = CallFunction(aten.permute.default, div_Tensor_2, Ignored()) +permute_default_7 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_7, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_8 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_9 = CallFunction(aten.permute.default, permute_default_8, Ignored()) +permute_default_10 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_10, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +permute_default_11 = CallFunction(aten.permute.default, view_default_11, Ignored()) +_sfdp_pattern_10_training = MultiOutputPattern([view_default_5, + permute_default_6, + permute_default_9, + permute_default_11 +]) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +div_Tensor = 
CallFunction(aten.div.Tensor, permute_default, Ignored()) +expand_default = CallFunction(aten.expand.default, div_Tensor, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored()) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, view_default_2, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, view_default_2, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default = CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_10_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, permute_default, Ignored()) +expand_default = CallFunction(aten.expand.default, div_Tensor, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +convert_element_type_default = CallFunction(prims.convert_element_type.default, view_default_2, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = 
CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList, _users=2) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_4 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_4) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +convert_element_type_default_2 = CallFunction(prims.convert_element_type.default, view_default_7, Ignored()) +alias_default = CallFunction(aten.alias.default, div_Tensor_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2, _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, convert_element_type_default_2, alias_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor, Ignored(), True) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, alias_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor, mul_Tensor_1) +convert_element_type_default_3 = CallFunction(prims.convert_element_type.default, sub_Tensor_1, Ignored()) +view_default_8 = CallFunction(aten.view.default, convert_element_type_default_3, Ignored(), _users=2) +permute_default_5 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_5) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +div_Tensor_2 = CallFunction(aten.div.Tensor, view_default_9, Ignored()) +permute_default_6 = CallFunction(aten.permute.default, div_Tensor_2, Ignored()) +permute_default_7 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_7, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_8 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_9 = CallFunction(aten.permute.default, permute_default_8, Ignored()) +permute_default_10 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_10, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +permute_default_11 = CallFunction(aten.permute.default, view_default_11, Ignored()) +_sfdp_pattern_10_half_training = MultiOutputPattern([view_default_5, + permute_default_6, + permute_default_9, + permute_default_11 +]) + + +permute_default = 
CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, permute_default, Ignored()) +expand_default = CallFunction(aten.expand.default, div_Tensor, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored()) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +convert_element_type_default = CallFunction(prims.convert_element_type.default, view_default_2, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_10_half_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_11.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_11.py new file mode 100644 index 0000000000000000000000000000000000000000..29f4f100f755b663e035c1927668b13cad3a1ef8 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_11.py @@ -0,0 +1,212 @@ +# mypy: ignore-errors + +# noqa: F401, E501 +# This is an auto-generated file. Please do not modify it by hand. 
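# A brief reading guide for the pattern DSL used below (a sketch based on
# torch._inductor.pattern_matcher; the example at the end is illustrative
# and not part of the generated file):
#
#   * CallFunction(op, *args, **kwargs) matches a graph node that calls op
#     and whose inputs in turn match args.
#   * KeywordArg('query') binds whatever appears at that position to the
#     name 'query' for later use.
#   * Ignored() matches any input without constraining it.
#   * _users=2 constrains how many consumers the matched node has.
#   * MultiOutputPattern([...]) groups the forward output with the gradient
#     outputs so a training pattern is matched as a whole.
#
#     # e.g. this would match `x * <anything>` and bind the first input:
#     # pat = CallFunction(aten.mul.Tensor, KeywordArg('x'), Ignored())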
+# To re-generate, run: +# cd ~/pytorch && python +# torchgen/fuse_attention_patterns/gen_attention_patterns.py + +import torch +import torch._inductor + +aten = torch.ops.aten +prims = torch.ops.prims + +from torch._inductor.pattern_matcher import ( + Arg, + CallFunction, + CallFunctionVarArgs, + CallMethod, + CallMethodVarArgs, + CallModule, + CallModuleVarArgs, + ExclusiveKeywordArg, + Ignored, + KeywordArg, + ListOf, + MultiOutputPattern, + PatternExpr, + RepeatedExpr, + _TargetArgsExpr, + _TargetExpr, + _TargetExprVarArgs, +) +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale'), _users=2) +amax_default = CallFunction(aten.amax.default, div_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, div_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList, _users=2) +expand_default_2 = CallFunction(aten.expand.default, div_Tensor_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_4 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_4) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +alias_default = CallFunction(aten.alias.default, div_Tensor_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2, _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, view_default_7, alias_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor, Ignored(), True) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, alias_default_3, sum_dim_IntList_1) +sub_Tensor_1 
= CallFunction(aten.sub.Tensor, mul_Tensor, mul_Tensor_1) +div_Tensor_2 = CallFunction(aten.div.Tensor, sub_Tensor_1, KeywordArg('inv_scale')) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_5 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_5) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_6 = CallFunction(aten.permute.default, view_default_9, Ignored()) +permute_default_7 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_7, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_8 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_9 = CallFunction(aten.permute.default, permute_default_8, Ignored()) +permute_default_10 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_10, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +permute_default_11 = CallFunction(aten.permute.default, view_default_11, Ignored()) +_sfdp_pattern_11_training = MultiOutputPattern([view_default_5, + permute_default_6, + permute_default_9, + permute_default_11, + None +]) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored()) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale'), _users=2) +amax_default = CallFunction(aten.amax.default, div_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, div_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +expand_default_2 = CallFunction(aten.expand.default, div_Tensor_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_11_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) + + +permute_default = 
CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale')) +convert_element_type_default = CallFunction(prims.convert_element_type.default, div_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored(), _users=2) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_4 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_4) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +convert_element_type_default_2 = CallFunction(prims.convert_element_type.default, view_default_7, Ignored()) +alias_default = CallFunction(aten.alias.default, convert_element_type_default_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2) +convert_element_type_default_3 = CallFunction(prims.convert_element_type.default, alias_default_3, Ignored(), _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, convert_element_type_default_2, convert_element_type_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor, Ignored(), True) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, convert_element_type_default_3, sum_dim_IntList_1) +sub_Tensor_1 = 
CallFunction(aten.sub.Tensor, mul_Tensor, mul_Tensor_1) +convert_element_type_default_4 = CallFunction(prims.convert_element_type.default, sub_Tensor_1, Ignored()) +div_Tensor_2 = CallFunction(aten.div.Tensor, convert_element_type_default_4, KeywordArg('inv_scale')) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_5 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_5) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_6 = CallFunction(aten.permute.default, view_default_9, Ignored()) +permute_default_7 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_7, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_8 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_9 = CallFunction(aten.permute.default, permute_default_8, Ignored()) +permute_default_10 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_10, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +permute_default_11 = CallFunction(aten.permute.default, view_default_11, Ignored()) +_sfdp_pattern_11_half_training = MultiOutputPattern([view_default_5, + permute_default_6, + permute_default_9, + permute_default_11, + None +]) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored()) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale')) +convert_element_type_default = CallFunction(prims.convert_element_type.default, div_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, 
permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_11_half_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_12.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_12.py new file mode 100644 index 0000000000000000000000000000000000000000..67030a6f9735a3e56814f5a1b86de7e4b0f85b2f --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_12.py @@ -0,0 +1,232 @@ +# mypy: ignore-errors + +# noqa: F401, E501 +# This is an auto-generated file. Please do not modify it by hand. +# To re-generate, run: +# cd ~/pytorch && python +# torchgen/fuse_attention_patterns/gen_attention_patterns.py + +import torch +import torch._inductor + +aten = torch.ops.aten +prims = torch.ops.prims + +from torch._inductor.pattern_matcher import ( + Arg, + CallFunction, + CallFunctionVarArgs, + CallMethod, + CallMethodVarArgs, + CallModule, + CallModuleVarArgs, + ExclusiveKeywordArg, + Ignored, + KeywordArg, + ListOf, + MultiOutputPattern, + PatternExpr, + RepeatedExpr, + _TargetArgsExpr, + _TargetExpr, + _TargetExprVarArgs, +) +rand_default = CallFunction(aten.rand.default, Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +gt_Scalar = CallFunction(aten.gt.Scalar, rand_default, KeywordArg('dropout_p'), _users=2) +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale_factor'), _users=2) +amax_default = CallFunction(aten.amax.default, div_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, div_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList, _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, gt_Scalar, div_Tensor_1) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, mul_Tensor, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, mul_Tensor_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = 
CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_4 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_4) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +convert_element_type_default = CallFunction(prims.convert_element_type.default, gt_Scalar, Ignored()) +mul_Tensor_2 = CallFunction(aten.mul.Tensor, convert_element_type_default, Ignored()) +mul_Tensor_3 = CallFunction(aten.mul.Tensor, view_default_7, mul_Tensor_2) +clone_default_3 = CallFunction(aten.clone.default, mul_Tensor_3, memory_format=torch.contiguous_format) +alias_default = CallFunction(aten.alias.default, div_Tensor_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2, _users=2) +mul_Tensor_4 = CallFunction(aten.mul.Tensor, clone_default_3, alias_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor_4, Ignored(), True) +mul_Tensor_5 = CallFunction(aten.mul.Tensor, alias_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor_4, mul_Tensor_5) +div_Tensor_2 = CallFunction(aten.div.Tensor, sub_Tensor_1, KeywordArg('inv_scale_factor')) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_5 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_5) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_6 = CallFunction(aten.permute.default, view_default_9, Ignored()) +permute_default_7 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_7, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_8 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_9 = CallFunction(aten.permute.default, permute_default_8, Ignored()) +permute_default_10 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_10, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +permute_default_11 = CallFunction(aten.permute.default, view_default_11, Ignored()) +_sfdp_pattern_12_training = MultiOutputPattern([view_default_5, + permute_default_6, + permute_default_9, + permute_default_11, + None, + None +]) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored()) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), 
Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale_factor'), _users=2) +amax_default = CallFunction(aten.amax.default, div_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, div_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +clone_default_2 = CallFunction(aten.clone.default, div_Tensor_1) +expand_default_2 = CallFunction(aten.expand.default, clone_default_2, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_3 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_3, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_12_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) + + +rand_default = CallFunction(aten.rand.default, Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +gt_Scalar = CallFunction(aten.gt.Scalar, rand_default, KeywordArg('dropout_p'), _users=2) +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale_factor')) +convert_element_type_default = CallFunction(prims.convert_element_type.default, div_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) 
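# In this reduced-precision ("half") variant the softmax above is wrapped in
# prims.convert_element_type nodes: the scaled logits are upcast (typically
# to fp32) for amax/exp/sum and cast back just below. The rand/gt.Scalar
# mask created at the top of the pattern is then multiplied in to model
# dropout on the attention weights. As a hedged eager-mode sketch (dropout_p
# and the shape of attn come from whatever graph is matched):
#
#     keep = torch.rand_like(attn) > dropout_p           # aten.gt.Scalar
#     attn = attn * keep * (1.0 / (1.0 - dropout_p))     # the two aten.mul.Tensor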
+convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored(), _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, gt_Scalar, convert_element_type_default_1) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, mul_Tensor, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, mul_Tensor_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_4 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_4) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +convert_element_type_default_2 = CallFunction(prims.convert_element_type.default, gt_Scalar, Ignored()) +mul_Tensor_2 = CallFunction(aten.mul.Tensor, convert_element_type_default_2, Ignored()) +mul_Tensor_3 = CallFunction(aten.mul.Tensor, view_default_7, mul_Tensor_2) +clone_default_3 = CallFunction(aten.clone.default, mul_Tensor_3, memory_format=torch.contiguous_format) +convert_element_type_default_3 = CallFunction(prims.convert_element_type.default, clone_default_3, Ignored()) +alias_default = CallFunction(aten.alias.default, convert_element_type_default_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2) +convert_element_type_default_4 = CallFunction(prims.convert_element_type.default, alias_default_3, Ignored(), _users=2) +mul_Tensor_4 = CallFunction(aten.mul.Tensor, convert_element_type_default_3, convert_element_type_default_4, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor_4, Ignored(), True) +mul_Tensor_5 = CallFunction(aten.mul.Tensor, convert_element_type_default_4, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor_4, mul_Tensor_5) +convert_element_type_default_5 = CallFunction(prims.convert_element_type.default, sub_Tensor_1, Ignored()) +div_Tensor_2 = CallFunction(aten.div.Tensor, convert_element_type_default_5, KeywordArg('inv_scale_factor')) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_5 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_5) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_6 = CallFunction(aten.permute.default, view_default_9, Ignored()) +permute_default_7 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_7, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_8 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_9 = 
CallFunction(aten.permute.default, permute_default_8, Ignored()) +permute_default_10 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_10, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +permute_default_11 = CallFunction(aten.permute.default, view_default_11, Ignored()) +_sfdp_pattern_12_half_training = MultiOutputPattern([view_default_5, + permute_default_6, + permute_default_9, + permute_default_11, + None, + None +]) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored()) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale_factor')) +convert_element_type_default = CallFunction(prims.convert_element_type.default, div_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, convert_element_type_default_1) +expand_default_2 = CallFunction(aten.expand.default, clone_default_2, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_3 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_3, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_12_half_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_13.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_13.py new file mode 100644 index 0000000000000000000000000000000000000000..20062a0d75db069326c85fc3a81a8b024a04abcf --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_13.py @@ -0,0 +1,142 @@ +# mypy: ignore-errors + +# noqa: F401, E501 +# This is an auto-generated file. Please do not modify it by hand. 
+# To re-generate, run: +# cd ~/pytorch && python +# torchgen/fuse_attention_patterns/gen_attention_patterns.py + +import torch +import torch._inductor + +aten = torch.ops.aten +prims = torch.ops.prims + +from torch._inductor.pattern_matcher import ( + Arg, + CallFunction, + CallFunctionVarArgs, + CallMethod, + CallMethodVarArgs, + CallModule, + CallModuleVarArgs, + ExclusiveKeywordArg, + Ignored, + KeywordArg, + ListOf, + MultiOutputPattern, + PatternExpr, + RepeatedExpr, + _TargetArgsExpr, + _TargetExpr, + _TargetExprVarArgs, +) +rand_default = CallFunction(aten.rand.default, Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +gt_Scalar = CallFunction(aten.gt.Scalar, rand_default, KeywordArg('dropout_p'), _users=2) +permute_default = CallFunction(aten.permute.default, KeywordArg('key'), Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, KeywordArg('query'), permute_default, _users=2) +amax_default = CallFunction(aten.amax.default, bmm_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, bmm_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList, _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, gt_Scalar, div_Tensor) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, mul_Tensor, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, mul_Tensor_1, KeywordArg('value')) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, KeywordArg('tangents_1'), permute_default_1) +convert_element_type_default = CallFunction(prims.convert_element_type.default, gt_Scalar, Ignored()) +mul_Tensor_2 = CallFunction(aten.mul.Tensor, convert_element_type_default, Ignored()) +mul_Tensor_3 = CallFunction(aten.mul.Tensor, bmm_default_2, mul_Tensor_2) +clone_default = CallFunction(aten.clone.default, mul_Tensor_3, memory_format=torch.contiguous_format) +alias_default = CallFunction(aten.alias.default, div_Tensor) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2, _users=2) +mul_Tensor_4 = CallFunction(aten.mul.Tensor, clone_default, alias_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor_4, Ignored(), True) +mul_Tensor_5 = CallFunction(aten.mul.Tensor, alias_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor_4, mul_Tensor_5, _users=2) +permute_default_2 = CallFunction(aten.permute.default, permute_default, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, sub_Tensor_1, permute_default_2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_3, sub_Tensor_1) +permute_default_4 = CallFunction(aten.permute.default, bmm_default_4, Ignored()) +permute_default_5 = CallFunction(aten.permute.default, mul_Tensor_1, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_5, KeywordArg('tangents_1')) +_sfdp_pattern_13_training = MultiOutputPattern([bmm_default_1, + bmm_default_3, + permute_default_4, + bmm_default_5, + None +]) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +bmm_default = 
CallFunction(aten.bmm.default, KeywordArg('query'), permute_default, _users=2) +amax_default = CallFunction(aten.amax.default, bmm_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, bmm_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +clone_default = CallFunction(aten.clone.default, div_Tensor) +_sfdp_pattern_13_inference = CallFunction(aten.bmm.default, clone_default, KeywordArg('value')) + + +rand_default = CallFunction(aten.rand.default, Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +gt_Scalar = CallFunction(aten.gt.Scalar, rand_default, KeywordArg('dropout_p'), _users=2) +permute_default = CallFunction(aten.permute.default, KeywordArg('key'), Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, KeywordArg('query'), permute_default) +convert_element_type_default = CallFunction(prims.convert_element_type.default, bmm_default, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor, Ignored(), _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, gt_Scalar, convert_element_type_default_1) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, mul_Tensor, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, mul_Tensor_1, KeywordArg('value')) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, KeywordArg('tangents_1'), permute_default_1) +convert_element_type_default_2 = CallFunction(prims.convert_element_type.default, gt_Scalar, Ignored()) +mul_Tensor_2 = CallFunction(aten.mul.Tensor, convert_element_type_default_2, Ignored()) +mul_Tensor_3 = CallFunction(aten.mul.Tensor, bmm_default_2, mul_Tensor_2) +clone_default = CallFunction(aten.clone.default, mul_Tensor_3, memory_format=torch.contiguous_format) +convert_element_type_default_3 = CallFunction(prims.convert_element_type.default, clone_default, Ignored()) +alias_default = CallFunction(aten.alias.default, convert_element_type_default_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2) +convert_element_type_default_4 = CallFunction(prims.convert_element_type.default, alias_default_3, Ignored(), _users=2) +mul_Tensor_4 = CallFunction(aten.mul.Tensor, convert_element_type_default_3, convert_element_type_default_4, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor_4, Ignored(), True) +mul_Tensor_5 = CallFunction(aten.mul.Tensor, convert_element_type_default_4, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor_4, mul_Tensor_5) +convert_element_type_default_5 = CallFunction(prims.convert_element_type.default, sub_Tensor_1, Ignored(), _users=2) +permute_default_2 = CallFunction(aten.permute.default, permute_default, Ignored()) +bmm_default_3 = 
CallFunction(aten.bmm.default, convert_element_type_default_5, permute_default_2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_3, convert_element_type_default_5) +permute_default_4 = CallFunction(aten.permute.default, bmm_default_4, Ignored()) +permute_default_5 = CallFunction(aten.permute.default, mul_Tensor_1, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_5, KeywordArg('tangents_1')) +_sfdp_pattern_13_half_training = MultiOutputPattern([bmm_default_1, + bmm_default_3, + permute_default_4, + bmm_default_5, + None +]) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +bmm_default = CallFunction(aten.bmm.default, KeywordArg('query'), permute_default) +convert_element_type_default = CallFunction(prims.convert_element_type.default, bmm_default, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor, Ignored()) +clone_default = CallFunction(aten.clone.default, convert_element_type_default_1) +_sfdp_pattern_13_half_inference = CallFunction(aten.bmm.default, clone_default, KeywordArg('value')) diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_14.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_14.py new file mode 100644 index 0000000000000000000000000000000000000000..c8e7045190031ebb1fd7f78ab3e0726fab2d062e --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_14.py @@ -0,0 +1,218 @@ +# mypy: ignore-errors + +# noqa: F401, E501 +# This is an auto-generated file. Please do not modify it by hand. 
+# To re-generate, run: +# cd ~/pytorch && python +# torchgen/fuse_attention_patterns/gen_attention_patterns.py + +import torch +import torch._inductor + +aten = torch.ops.aten +prims = torch.ops.prims + +from torch._inductor.pattern_matcher import ( + Arg, + CallFunction, + CallFunctionVarArgs, + CallMethod, + CallMethodVarArgs, + CallModule, + CallModuleVarArgs, + ExclusiveKeywordArg, + Ignored, + KeywordArg, + ListOf, + MultiOutputPattern, + PatternExpr, + RepeatedExpr, + _TargetArgsExpr, + _TargetExpr, + _TargetExprVarArgs, +) +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale')) +add_Tensor = CallFunction(aten.add.Tensor, div_Tensor, KeywordArg('attn_mask'), _users=2) +amax_default = CallFunction(aten.amax.default, add_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, add_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList, _users=2) +expand_default_2 = CallFunction(aten.expand.default, div_Tensor_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_4 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_4) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +alias_default = CallFunction(aten.alias.default, div_Tensor_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2, _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, view_default_7, alias_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor, Ignored(), True) +mul_Tensor_1 = 
CallFunction(aten.mul.Tensor, alias_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor, mul_Tensor_1) +div_Tensor_2 = CallFunction(aten.div.Tensor, sub_Tensor_1, KeywordArg('inv_scale')) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_5 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_5) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_6 = CallFunction(aten.permute.default, view_default_9, Ignored()) +permute_default_7 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_7, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_8 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_9 = CallFunction(aten.permute.default, permute_default_8, Ignored()) +permute_default_10 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_10, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +permute_default_11 = CallFunction(aten.permute.default, view_default_11, Ignored()) +_sfdp_pattern_14_training = MultiOutputPattern([view_default_5, + permute_default_6, + permute_default_9, + permute_default_11, + None, + None +]) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored()) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale')) +add_Tensor = CallFunction(aten.add.Tensor, div_Tensor, KeywordArg('attn_mask'), _users=2) +amax_default = CallFunction(aten.amax.default, add_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, add_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +expand_default_2 = CallFunction(aten.expand.default, div_Tensor_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored()) +bmm_default_1 = 
CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_14_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale')) +add_Tensor = CallFunction(aten.add.Tensor, div_Tensor, KeywordArg('attn_mask')) +convert_element_type_default = CallFunction(prims.convert_element_type.default, add_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored(), _users=2) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_4 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_4) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +convert_element_type_default_2 = CallFunction(prims.convert_element_type.default, view_default_7, Ignored()) +alias_default = CallFunction(aten.alias.default, convert_element_type_default_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2) +convert_element_type_default_3 = CallFunction(prims.convert_element_type.default, alias_default_3, Ignored(), _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, 
convert_element_type_default_2, convert_element_type_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor, Ignored(), True) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, convert_element_type_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor, mul_Tensor_1) +convert_element_type_default_4 = CallFunction(prims.convert_element_type.default, sub_Tensor_1, Ignored()) +div_Tensor_2 = CallFunction(aten.div.Tensor, convert_element_type_default_4, KeywordArg('inv_scale')) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_5 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_5) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_6 = CallFunction(aten.permute.default, view_default_9, Ignored()) +permute_default_7 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_7, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_8 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_9 = CallFunction(aten.permute.default, permute_default_8, Ignored()) +permute_default_10 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_10, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +permute_default_11 = CallFunction(aten.permute.default, view_default_11, Ignored()) +_sfdp_pattern_14_half_training = MultiOutputPattern([view_default_5, + permute_default_6, + permute_default_9, + permute_default_11, + None, + None +]) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored()) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale')) +add_Tensor = CallFunction(aten.add.Tensor, div_Tensor, KeywordArg('attn_mask')) +convert_element_type_default = CallFunction(prims.convert_element_type.default, add_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = 
CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_14_half_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_15.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_15.py new file mode 100644 index 0000000000000000000000000000000000000000..ffcc566249470c1fd15659cf8f407b915188affc --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_15.py @@ -0,0 +1,236 @@ +# mypy: ignore-errors + +# noqa: F401, E501 +# This is an auto-generated file. Please do not modify it by hand. +# To re-generate, run: +# cd ~/pytorch && python +# torchgen/fuse_attention_patterns/gen_attention_patterns.py + +import torch +import torch._inductor + +aten = torch.ops.aten +prims = torch.ops.prims + +from torch._inductor.pattern_matcher import ( + Arg, + CallFunction, + CallFunctionVarArgs, + CallMethod, + CallMethodVarArgs, + CallModule, + CallModuleVarArgs, + ExclusiveKeywordArg, + Ignored, + KeywordArg, + ListOf, + MultiOutputPattern, + PatternExpr, + RepeatedExpr, + _TargetArgsExpr, + _TargetExpr, + _TargetExprVarArgs, +) +eq_Scalar = CallFunction(aten.eq.Scalar, KeywordArg('attn_mask'), Ignored()) +expand_default = CallFunction(aten.expand.default, eq_Scalar, Ignored(), _users=2) +full_default = CallFunction(aten.full.default, [], Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_2, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale')) +where_self = CallFunction(aten.where.self, expand_default, full_default, div_Tensor, _users=2) +amax_default = CallFunction(aten.amax.default, where_self, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, where_self, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, 
exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList, _users=2) +expand_default_3 = CallFunction(aten.expand.default, div_Tensor_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_3, Ignored(), _users=2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_4 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_4, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +scalar_tensor_default = CallFunction(aten.scalar_tensor.default, Ignored(), dtype=Ignored(), layout=torch.strided, device=Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_4 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_4) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +alias_default = CallFunction(aten.alias.default, div_Tensor_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2, _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, view_default_7, alias_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor, Ignored(), True) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, alias_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor, mul_Tensor_1) +where_self_1 = CallFunction(aten.where.self, expand_default, scalar_tensor_default, sub_Tensor_1) +div_Tensor_2 = CallFunction(aten.div.Tensor, where_self_1, KeywordArg('inv_scale')) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_5 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_5) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_6 = CallFunction(aten.permute.default, view_default_9, Ignored()) +permute_default_7 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_7, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_8 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_9 = CallFunction(aten.permute.default, permute_default_8, Ignored()) +permute_default_10 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_10, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +permute_default_11 = CallFunction(aten.permute.default, view_default_11, Ignored()) +_sfdp_pattern_15_training = MultiOutputPattern([view_default_5, + permute_default_6, + permute_default_9, + permute_default_11, + None, + None +]) + + +eq_Scalar = CallFunction(aten.eq.Scalar, KeywordArg('attn_mask'), Ignored()) +view_default = CallFunction(aten.view.default, eq_Scalar, Ignored()) +expand_default = 
CallFunction(aten.expand.default, view_default, Ignored()) +full_default = CallFunction(aten.full.default, [], Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default, Ignored()) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_2, memory_format=torch.contiguous_format) +view_default_2 = CallFunction(aten.view.default, clone_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default_1, view_default_2) +view_default_3 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_3, KeywordArg('inv_scale')) +where_self = CallFunction(aten.where.self, expand_default, full_default, div_Tensor, _users=2) +amax_default = CallFunction(aten.amax.default, where_self, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, where_self, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +expand_default_3 = CallFunction(aten.expand.default, div_Tensor_1, Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_4 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_4, memory_format=torch.contiguous_format) +view_default_5 = CallFunction(aten.view.default, clone_default_2, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_4, view_default_5) +_sfdp_pattern_15_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) + + +eq_Scalar = CallFunction(aten.eq.Scalar, KeywordArg('attn_mask'), Ignored()) +expand_default = CallFunction(aten.expand.default, eq_Scalar, Ignored(), _users=2) +full_default = CallFunction(aten.full.default, [], Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_2, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = 
CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale')) +where_self = CallFunction(aten.where.self, expand_default, full_default, div_Tensor) +convert_element_type_default = CallFunction(prims.convert_element_type.default, where_self, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored(), _users=2) +expand_default_3 = CallFunction(aten.expand.default, convert_element_type_default_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_3, Ignored(), _users=2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_4 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_4, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +scalar_tensor_default = CallFunction(aten.scalar_tensor.default, Ignored(), dtype=Ignored(), layout=torch.strided, device=Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_4 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_4) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +convert_element_type_default_2 = CallFunction(prims.convert_element_type.default, view_default_7, Ignored()) +alias_default = CallFunction(aten.alias.default, convert_element_type_default_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2) +convert_element_type_default_3 = CallFunction(prims.convert_element_type.default, alias_default_3, Ignored(), _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, convert_element_type_default_2, convert_element_type_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor, Ignored(), True) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, convert_element_type_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor, mul_Tensor_1) +convert_element_type_default_4 = CallFunction(prims.convert_element_type.default, sub_Tensor_1, Ignored()) +where_self_1 = CallFunction(aten.where.self, expand_default, scalar_tensor_default, convert_element_type_default_4) +div_Tensor_2 = CallFunction(aten.div.Tensor, where_self_1, KeywordArg('inv_scale')) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_5 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_5) +view_default_9 = CallFunction(aten.view.default, 
bmm_default_3, Ignored()) +permute_default_6 = CallFunction(aten.permute.default, view_default_9, Ignored()) +permute_default_7 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_7, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_8 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_9 = CallFunction(aten.permute.default, permute_default_8, Ignored()) +permute_default_10 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_10, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +permute_default_11 = CallFunction(aten.permute.default, view_default_11, Ignored()) +_sfdp_pattern_15_half_training = MultiOutputPattern([view_default_5, + permute_default_6, + permute_default_9, + permute_default_11, + None, + None +]) + + +eq_Scalar = CallFunction(aten.eq.Scalar, KeywordArg('attn_mask'), Ignored()) +view_default = CallFunction(aten.view.default, eq_Scalar, Ignored()) +expand_default = CallFunction(aten.expand.default, view_default, Ignored()) +full_default = CallFunction(aten.full.default, [], Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default, Ignored()) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_2, memory_format=torch.contiguous_format) +view_default_2 = CallFunction(aten.view.default, clone_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default_1, view_default_2) +view_default_3 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_3, KeywordArg('inv_scale')) +where_self = CallFunction(aten.where.self, expand_default, full_default, div_Tensor) +convert_element_type_default = CallFunction(prims.convert_element_type.default, where_self, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored()) +expand_default_3 = CallFunction(aten.expand.default, convert_element_type_default_1, Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_4 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_4, memory_format=torch.contiguous_format) 
+view_default_5 = CallFunction(aten.view.default, clone_default_2, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_4, view_default_5) +_sfdp_pattern_15_half_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_16.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_16.py new file mode 100644 index 0000000000000000000000000000000000000000..9d44b570a0eb401069408d72947c6b9b9e5b36b2 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_16.py @@ -0,0 +1,635 @@ +# mypy: ignore-errors + +# noqa: F401, E501 +# This is an auto-generated file. Please do not modify it by hand. +# To re-generate, run: +# cd ~/pytorch && python +# torchgen/fuse_attention_patterns/gen_attention_patterns.py + +import torch +import torch._inductor + +aten = torch.ops.aten +prims = torch.ops.prims + +from torch._inductor.pattern_matcher import ( + Arg, + CallFunction, + CallFunctionVarArgs, + CallMethod, + CallMethodVarArgs, + CallModule, + CallModuleVarArgs, + ExclusiveKeywordArg, + Ignored, + KeywordArg, + ListOf, + MultiOutputPattern, + PatternExpr, + RepeatedExpr, + _TargetArgsExpr, + _TargetExpr, + _TargetExprVarArgs, +) +rand_default = CallFunction(aten.rand.default, Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +gt_Scalar = CallFunction(aten.gt.Scalar, rand_default, KeywordArg('dropout_p'), _users=2) +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale')) +add_Tensor = CallFunction(aten.add.Tensor, div_Tensor, KeywordArg('attn_mask'), _users=2) +amax_default = CallFunction(aten.amax.default, add_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, add_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList, _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, gt_Scalar, div_Tensor_1) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, mul_Tensor, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, mul_Tensor_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = 
CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_4 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_4) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +convert_element_type_default = CallFunction(prims.convert_element_type.default, gt_Scalar, Ignored()) +mul_Tensor_2 = CallFunction(aten.mul.Tensor, convert_element_type_default, Ignored()) +mul_Tensor_3 = CallFunction(aten.mul.Tensor, view_default_7, mul_Tensor_2) +clone_default_3 = CallFunction(aten.clone.default, mul_Tensor_3, memory_format=torch.contiguous_format) +alias_default = CallFunction(aten.alias.default, div_Tensor_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2, _users=2) +mul_Tensor_4 = CallFunction(aten.mul.Tensor, clone_default_3, alias_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor_4, Ignored(), True) +mul_Tensor_5 = CallFunction(aten.mul.Tensor, alias_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor_4, mul_Tensor_5) +div_Tensor_2 = CallFunction(aten.div.Tensor, sub_Tensor_1, KeywordArg('inv_scale')) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_5 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_5) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_6 = CallFunction(aten.permute.default, view_default_9, Ignored()) +permute_default_7 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_7, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_8 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_9 = CallFunction(aten.permute.default, permute_default_8, Ignored()) +permute_default_10 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_10, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +permute_default_11 = CallFunction(aten.permute.default, view_default_11, Ignored()) +_sfdp_pattern_16_training = MultiOutputPattern([view_default_5, + permute_default_6, + permute_default_9, + permute_default_11, + None, + None, + None +]) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored()) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, 
permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale')) +add_Tensor = CallFunction(aten.add.Tensor, div_Tensor, KeywordArg('attn_mask'), _users=2) +amax_default = CallFunction(aten.amax.default, add_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, add_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +clone_default_2 = CallFunction(aten.clone.default, div_Tensor_1) +expand_default_2 = CallFunction(aten.expand.default, clone_default_2, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_3 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_3, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_16_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) + + +rand_default = CallFunction(aten.rand.default, Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +gt_Scalar = CallFunction(aten.gt.Scalar, rand_default, KeywordArg('dropout_p'), _users=2) +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +view_default = CallFunction(aten.view.default, expand_default, Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +view_default_1 = CallFunction(aten.view.default, expand_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale')) +add_Tensor = CallFunction(aten.add.Tensor, div_Tensor, KeywordArg('attn_mask'), _users=2) +amax_default = CallFunction(aten.amax.default, add_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, add_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList, _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, gt_Scalar, div_Tensor_1) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, mul_Tensor, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, mul_Tensor_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, 
Ignored(), _users=2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_4 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_4) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +convert_element_type_default = CallFunction(prims.convert_element_type.default, gt_Scalar, Ignored()) +mul_Tensor_2 = CallFunction(aten.mul.Tensor, convert_element_type_default, Ignored()) +mul_Tensor_3 = CallFunction(aten.mul.Tensor, view_default_7, mul_Tensor_2) +clone_default = CallFunction(aten.clone.default, mul_Tensor_3, memory_format=torch.contiguous_format) +alias_default = CallFunction(aten.alias.default, div_Tensor_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2, _users=2) +mul_Tensor_4 = CallFunction(aten.mul.Tensor, clone_default, alias_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor_4, Ignored(), True) +mul_Tensor_5 = CallFunction(aten.mul.Tensor, alias_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor_4, mul_Tensor_5) +div_Tensor_2 = CallFunction(aten.div.Tensor, sub_Tensor_1, KeywordArg('inv_scale')) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_5 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_5) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_6 = CallFunction(aten.permute.default, view_default_9, Ignored()) +permute_default_7 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_7, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_8 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_9 = CallFunction(aten.permute.default, permute_default_8, Ignored()) +permute_default_10 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_10, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +permute_default_11 = CallFunction(aten.permute.default, view_default_11, Ignored()) +_sfdp_pattern_16_bs1_training = MultiOutputPattern([view_default_5, + permute_default_6, + permute_default_9, + permute_default_11, + None, + None, + None +]) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +view_default = CallFunction(aten.view.default, expand_default, Ignored()) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, 
permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +view_default_1 = CallFunction(aten.view.default, expand_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale')) +add_Tensor = CallFunction(aten.add.Tensor, div_Tensor, KeywordArg('attn_mask'), _users=2) +amax_default = CallFunction(aten.amax.default, add_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, add_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +clone_default = CallFunction(aten.clone.default, div_Tensor_1) +expand_default_2 = CallFunction(aten.expand.default, clone_default, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_16_bs1_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) + + +rand_default = CallFunction(aten.rand.default, Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +gt_Scalar = CallFunction(aten.gt.Scalar, rand_default, KeywordArg('dropout_p'), _users=2) +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale')) +add_Tensor = CallFunction(aten.add.Tensor, div_Tensor, KeywordArg('attn_mask')) +convert_element_type_default = CallFunction(prims.convert_element_type.default, add_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored(), _users=2) +mul_Tensor = 
CallFunction(aten.mul.Tensor, gt_Scalar, convert_element_type_default_1) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, mul_Tensor, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, mul_Tensor_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_4 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_4) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +convert_element_type_default_2 = CallFunction(prims.convert_element_type.default, gt_Scalar, Ignored()) +mul_Tensor_2 = CallFunction(aten.mul.Tensor, convert_element_type_default_2, Ignored()) +mul_Tensor_3 = CallFunction(aten.mul.Tensor, view_default_7, mul_Tensor_2) +clone_default_3 = CallFunction(aten.clone.default, mul_Tensor_3, memory_format=torch.contiguous_format) +convert_element_type_default_3 = CallFunction(prims.convert_element_type.default, clone_default_3, Ignored()) +alias_default = CallFunction(aten.alias.default, convert_element_type_default_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2) +convert_element_type_default_4 = CallFunction(prims.convert_element_type.default, alias_default_3, Ignored(), _users=2) +mul_Tensor_4 = CallFunction(aten.mul.Tensor, convert_element_type_default_3, convert_element_type_default_4, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor_4, Ignored(), True) +mul_Tensor_5 = CallFunction(aten.mul.Tensor, convert_element_type_default_4, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor_4, mul_Tensor_5) +convert_element_type_default_5 = CallFunction(prims.convert_element_type.default, sub_Tensor_1, Ignored()) +div_Tensor_2 = CallFunction(aten.div.Tensor, convert_element_type_default_5, KeywordArg('inv_scale')) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_5 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_5) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_6 = CallFunction(aten.permute.default, view_default_9, Ignored()) +permute_default_7 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_7, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_8 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_9 = CallFunction(aten.permute.default, permute_default_8, Ignored()) +permute_default_10 = CallFunction(aten.permute.default, view_default_3, 
Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_10, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +permute_default_11 = CallFunction(aten.permute.default, view_default_11, Ignored()) +_sfdp_pattern_16_half_training = MultiOutputPattern([view_default_5, + permute_default_6, + permute_default_9, + permute_default_11, + None, + None, + None +]) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored()) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale')) +add_Tensor = CallFunction(aten.add.Tensor, div_Tensor, KeywordArg('attn_mask')) +convert_element_type_default = CallFunction(prims.convert_element_type.default, add_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, convert_element_type_default_1) +expand_default_2 = CallFunction(aten.expand.default, clone_default_2, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_3 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_3, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_16_half_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) + + +rand_default = CallFunction(aten.rand.default, Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +gt_Scalar = CallFunction(aten.gt.Scalar, rand_default, KeywordArg('dropout_p'), _users=2) +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +view_default = CallFunction(aten.view.default, expand_default, Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, 
Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +view_default_1 = CallFunction(aten.view.default, expand_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale')) +add_Tensor = CallFunction(aten.add.Tensor, div_Tensor, KeywordArg('attn_mask')) +convert_element_type_default = CallFunction(prims.convert_element_type.default, add_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored(), _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, gt_Scalar, convert_element_type_default_1) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, mul_Tensor, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, mul_Tensor_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_4 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_4) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +convert_element_type_default_2 = CallFunction(prims.convert_element_type.default, gt_Scalar, Ignored()) +mul_Tensor_2 = CallFunction(aten.mul.Tensor, convert_element_type_default_2, Ignored()) +mul_Tensor_3 = CallFunction(aten.mul.Tensor, view_default_7, mul_Tensor_2) +clone_default = CallFunction(aten.clone.default, mul_Tensor_3, memory_format=torch.contiguous_format) +convert_element_type_default_3 = CallFunction(prims.convert_element_type.default, clone_default, Ignored()) +alias_default = CallFunction(aten.alias.default, convert_element_type_default_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2) +convert_element_type_default_4 = CallFunction(prims.convert_element_type.default, alias_default_3, Ignored(), _users=2) +mul_Tensor_4 = CallFunction(aten.mul.Tensor, convert_element_type_default_3, convert_element_type_default_4, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor_4, Ignored(), True) +mul_Tensor_5 = CallFunction(aten.mul.Tensor, convert_element_type_default_4, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor_4, mul_Tensor_5) +convert_element_type_default_5 = 
CallFunction(prims.convert_element_type.default, sub_Tensor_1, Ignored()) +div_Tensor_2 = CallFunction(aten.div.Tensor, convert_element_type_default_5, KeywordArg('inv_scale')) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_5 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_5) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_6 = CallFunction(aten.permute.default, view_default_9, Ignored()) +permute_default_7 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_7, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_8 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_9 = CallFunction(aten.permute.default, permute_default_8, Ignored()) +permute_default_10 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_10, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +permute_default_11 = CallFunction(aten.permute.default, view_default_11, Ignored()) +_sfdp_pattern_16_half_bs1_training = MultiOutputPattern([view_default_5, + permute_default_6, + permute_default_9, + permute_default_11, + None, + None, + None +]) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +view_default = CallFunction(aten.view.default, expand_default, Ignored()) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +view_default_1 = CallFunction(aten.view.default, expand_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale')) +add_Tensor = CallFunction(aten.add.Tensor, div_Tensor, KeywordArg('attn_mask')) +convert_element_type_default = CallFunction(prims.convert_element_type.default, add_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored()) +clone_default = CallFunction(aten.clone.default, convert_element_type_default_1) +expand_default_2 = CallFunction(aten.expand.default, clone_default, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored()) +bmm_default_1 = 
CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_16_half_bs1_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) + + +rand_default = CallFunction(aten.rand.default, Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +gt_Scalar = CallFunction(aten.gt.Scalar, rand_default, KeywordArg('dropout_p'), _users=2) +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale')) +add_Tensor = CallFunction(aten.add.Tensor, div_Tensor, KeywordArg('attn_mask'), _users=2) +amax_default = CallFunction(aten.amax.default, add_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, add_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList, _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, gt_Scalar, div_Tensor_1) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, mul_Tensor, Ignored()) +convert_element_type_default = CallFunction(prims.convert_element_type.default, mul_Tensor_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_4 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_4) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, view_default_7, Ignored()) +convert_element_type_default_2 = CallFunction(prims.convert_element_type.default, gt_Scalar, Ignored()) +mul_Tensor_2 = CallFunction(aten.mul.Tensor, convert_element_type_default_2, Ignored()) +mul_Tensor_3 = CallFunction(aten.mul.Tensor, 
convert_element_type_default_1, mul_Tensor_2) +clone_default_3 = CallFunction(aten.clone.default, mul_Tensor_3, memory_format=torch.contiguous_format) +alias_default = CallFunction(aten.alias.default, div_Tensor_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2, _users=2) +mul_Tensor_4 = CallFunction(aten.mul.Tensor, clone_default_3, alias_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor_4, Ignored(), True) +mul_Tensor_5 = CallFunction(aten.mul.Tensor, alias_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor_4, mul_Tensor_5) +convert_element_type_default_3 = CallFunction(prims.convert_element_type.default, sub_Tensor_1, Ignored()) +div_Tensor_2 = CallFunction(aten.div.Tensor, convert_element_type_default_3, KeywordArg('inv_scale')) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_5 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_5) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_6 = CallFunction(aten.permute.default, view_default_9, Ignored()) +permute_default_7 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_7, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_8 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_9 = CallFunction(aten.permute.default, permute_default_8, Ignored()) +permute_default_10 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_10, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +permute_default_11 = CallFunction(aten.permute.default, view_default_11, Ignored()) +_sfdp_pattern_16_half_mask_fp32_training = MultiOutputPattern([view_default_5, + permute_default_6, + permute_default_9, + permute_default_11, + None, + None, + None +]) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored()) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale')) +add_Tensor = CallFunction(aten.add.Tensor, div_Tensor, KeywordArg('attn_mask'), _users=2) +amax_default = CallFunction(aten.amax.default, add_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, add_Tensor, 
amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +clone_default_2 = CallFunction(aten.clone.default, div_Tensor_1) +convert_element_type_default = CallFunction(prims.convert_element_type.default, clone_default_2, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_3 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_3, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_16_half_mask_fp32_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) + + +rand_default = CallFunction(aten.rand.default, Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +gt_Scalar = CallFunction(aten.gt.Scalar, rand_default, KeywordArg('dropout_p'), _users=2) +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +view_default = CallFunction(aten.view.default, expand_default, Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +view_default_1 = CallFunction(aten.view.default, expand_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale')) +add_Tensor = CallFunction(aten.add.Tensor, div_Tensor, KeywordArg('attn_mask'), _users=2) +amax_default = CallFunction(aten.amax.default, add_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, add_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList, _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, gt_Scalar, div_Tensor_1) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, mul_Tensor, Ignored()) +convert_element_type_default = CallFunction(prims.convert_element_type.default, mul_Tensor_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, 
KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_4 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_4) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, view_default_7, Ignored()) +convert_element_type_default_2 = CallFunction(prims.convert_element_type.default, gt_Scalar, Ignored()) +mul_Tensor_2 = CallFunction(aten.mul.Tensor, convert_element_type_default_2, Ignored()) +mul_Tensor_3 = CallFunction(aten.mul.Tensor, convert_element_type_default_1, mul_Tensor_2) +clone_default = CallFunction(aten.clone.default, mul_Tensor_3, memory_format=torch.contiguous_format) +alias_default = CallFunction(aten.alias.default, div_Tensor_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2, _users=2) +mul_Tensor_4 = CallFunction(aten.mul.Tensor, clone_default, alias_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor_4, Ignored(), True) +mul_Tensor_5 = CallFunction(aten.mul.Tensor, alias_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor_4, mul_Tensor_5) +convert_element_type_default_3 = CallFunction(prims.convert_element_type.default, sub_Tensor_1, Ignored()) +div_Tensor_2 = CallFunction(aten.div.Tensor, convert_element_type_default_3, KeywordArg('inv_scale')) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_5 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_5) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_6 = CallFunction(aten.permute.default, view_default_9, Ignored()) +permute_default_7 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_7, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_8 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_9 = CallFunction(aten.permute.default, permute_default_8, Ignored()) +permute_default_10 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_10, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +permute_default_11 = CallFunction(aten.permute.default, view_default_11, Ignored()) +_sfdp_pattern_16_half_mask_fp32_bs1_training = MultiOutputPattern([view_default_5, + permute_default_6, + permute_default_9, + permute_default_11, + None, + None, + None +]) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +view_default = CallFunction(aten.view.default, expand_default, Ignored()) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +view_default_1 = CallFunction(aten.view.default, expand_default_1, Ignored()) +bmm_default = 
CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale')) +add_Tensor = CallFunction(aten.add.Tensor, div_Tensor, KeywordArg('attn_mask'), _users=2) +amax_default = CallFunction(aten.amax.default, add_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, add_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +clone_default = CallFunction(aten.clone.default, div_Tensor_1) +convert_element_type_default = CallFunction(prims.convert_element_type.default, clone_default, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_16_half_mask_fp32_bs1_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_17.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_17.py new file mode 100644 index 0000000000000000000000000000000000000000..a5fa9a19d545f959188bfac12e7e932e89e02ad2 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_17.py @@ -0,0 +1,256 @@ +# mypy: ignore-errors + +# noqa: F401, E501 +# This is an auto-generated file. Please do not modify it by hand. 
+# To re-generate, run: +# cd ~/pytorch && python +# torchgen/fuse_attention_patterns/gen_attention_patterns.py + +import torch +import torch._inductor + +aten = torch.ops.aten +prims = torch.ops.prims + +from torch._inductor.pattern_matcher import ( + Arg, + CallFunction, + CallFunctionVarArgs, + CallMethod, + CallMethodVarArgs, + CallModule, + CallModuleVarArgs, + ExclusiveKeywordArg, + Ignored, + KeywordArg, + ListOf, + MultiOutputPattern, + PatternExpr, + RepeatedExpr, + _TargetArgsExpr, + _TargetExpr, + _TargetExprVarArgs, +) +rand_default = CallFunction(aten.rand.default, Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +gt_Scalar = CallFunction(aten.gt.Scalar, rand_default, KeywordArg('dropout_p'), _users=2) +eq_Scalar = CallFunction(aten.eq.Scalar, KeywordArg('attn_mask'), Ignored()) +expand_default = CallFunction(aten.expand.default, eq_Scalar, Ignored(), _users=2) +full_default = CallFunction(aten.full.default, [], Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_2, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale')) +where_self = CallFunction(aten.where.self, expand_default, full_default, div_Tensor, _users=2) +amax_default = CallFunction(aten.amax.default, where_self, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, where_self, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList, _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, gt_Scalar, div_Tensor_1) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, mul_Tensor, Ignored()) +expand_default_3 = CallFunction(aten.expand.default, mul_Tensor_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_3, Ignored(), _users=2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_4 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_4, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +scalar_tensor_default = CallFunction(aten.scalar_tensor.default, Ignored(), dtype=Ignored(), layout=torch.strided, device=Ignored()) +view_default_6 = 
CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_4 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_4) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +convert_element_type_default = CallFunction(prims.convert_element_type.default, gt_Scalar, Ignored()) +mul_Tensor_2 = CallFunction(aten.mul.Tensor, convert_element_type_default, Ignored()) +mul_Tensor_3 = CallFunction(aten.mul.Tensor, view_default_7, mul_Tensor_2) +clone_default_3 = CallFunction(aten.clone.default, mul_Tensor_3, memory_format=torch.contiguous_format) +alias_default = CallFunction(aten.alias.default, div_Tensor_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2, _users=2) +mul_Tensor_4 = CallFunction(aten.mul.Tensor, clone_default_3, alias_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor_4, Ignored(), True) +mul_Tensor_5 = CallFunction(aten.mul.Tensor, alias_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor_4, mul_Tensor_5) +where_self_1 = CallFunction(aten.where.self, expand_default, scalar_tensor_default, sub_Tensor_1) +div_Tensor_2 = CallFunction(aten.div.Tensor, where_self_1, KeywordArg('inv_scale')) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_5 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_5) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_6 = CallFunction(aten.permute.default, view_default_9, Ignored()) +permute_default_7 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_7, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_8 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_9 = CallFunction(aten.permute.default, permute_default_8, Ignored()) +permute_default_10 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_10, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +permute_default_11 = CallFunction(aten.permute.default, view_default_11, Ignored()) +_sfdp_pattern_17_training = MultiOutputPattern([view_default_5, + permute_default_6, + permute_default_9, + permute_default_11, + None, + None, + None +]) + + +eq_Scalar = CallFunction(aten.eq.Scalar, KeywordArg('attn_mask'), Ignored()) +view_default = CallFunction(aten.view.default, eq_Scalar, Ignored()) +expand_default = CallFunction(aten.expand.default, view_default, Ignored()) +full_default = CallFunction(aten.full.default, [], Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default, Ignored()) +permute_default_1 = CallFunction(aten.permute.default, 
KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_2, memory_format=torch.contiguous_format) +view_default_2 = CallFunction(aten.view.default, clone_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default_1, view_default_2) +view_default_3 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_3, KeywordArg('inv_scale')) +where_self = CallFunction(aten.where.self, expand_default, full_default, div_Tensor, _users=2) +amax_default = CallFunction(aten.amax.default, where_self, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, where_self, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +clone_default_2 = CallFunction(aten.clone.default, div_Tensor_1) +expand_default_3 = CallFunction(aten.expand.default, clone_default_2, Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_4 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_3 = CallFunction(aten.clone.default, expand_default_4, memory_format=torch.contiguous_format) +view_default_5 = CallFunction(aten.view.default, clone_default_3, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_4, view_default_5) +_sfdp_pattern_17_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) + + +rand_default = CallFunction(aten.rand.default, Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +gt_Scalar = CallFunction(aten.gt.Scalar, rand_default, KeywordArg('dropout_p'), _users=2) +eq_Scalar = CallFunction(aten.eq.Scalar, KeywordArg('attn_mask'), Ignored()) +expand_default = CallFunction(aten.expand.default, eq_Scalar, Ignored(), _users=2) +full_default = CallFunction(aten.full.default, [], Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_2, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale')) +where_self = CallFunction(aten.where.self, expand_default, full_default, div_Tensor) +convert_element_type_default = CallFunction(prims.convert_element_type.default, 
where_self, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored(), _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, gt_Scalar, convert_element_type_default_1) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, mul_Tensor, Ignored()) +expand_default_3 = CallFunction(aten.expand.default, mul_Tensor_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_3, Ignored(), _users=2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_4 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_4, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +scalar_tensor_default = CallFunction(aten.scalar_tensor.default, Ignored(), dtype=Ignored(), layout=torch.strided, device=Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_4 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_4) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +convert_element_type_default_2 = CallFunction(prims.convert_element_type.default, gt_Scalar, Ignored()) +mul_Tensor_2 = CallFunction(aten.mul.Tensor, convert_element_type_default_2, Ignored()) +mul_Tensor_3 = CallFunction(aten.mul.Tensor, view_default_7, mul_Tensor_2) +clone_default_3 = CallFunction(aten.clone.default, mul_Tensor_3, memory_format=torch.contiguous_format) +convert_element_type_default_3 = CallFunction(prims.convert_element_type.default, clone_default_3, Ignored()) +alias_default = CallFunction(aten.alias.default, convert_element_type_default_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2) +convert_element_type_default_4 = CallFunction(prims.convert_element_type.default, alias_default_3, Ignored(), _users=2) +mul_Tensor_4 = CallFunction(aten.mul.Tensor, convert_element_type_default_3, convert_element_type_default_4, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor_4, Ignored(), True) +mul_Tensor_5 = CallFunction(aten.mul.Tensor, convert_element_type_default_4, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor_4, mul_Tensor_5) +convert_element_type_default_5 = CallFunction(prims.convert_element_type.default, sub_Tensor_1, Ignored()) +where_self_1 = CallFunction(aten.where.self, expand_default, scalar_tensor_default, convert_element_type_default_5) +div_Tensor_2 = CallFunction(aten.div.Tensor, where_self_1, KeywordArg('inv_scale')) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) 
+permute_default_5 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_5) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_6 = CallFunction(aten.permute.default, view_default_9, Ignored()) +permute_default_7 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_7, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_8 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_9 = CallFunction(aten.permute.default, permute_default_8, Ignored()) +permute_default_10 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_10, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +permute_default_11 = CallFunction(aten.permute.default, view_default_11, Ignored()) +_sfdp_pattern_17_half_training = MultiOutputPattern([view_default_5, + permute_default_6, + permute_default_9, + permute_default_11, + None, + None, + None +]) + + +eq_Scalar = CallFunction(aten.eq.Scalar, KeywordArg('attn_mask'), Ignored()) +view_default = CallFunction(aten.view.default, eq_Scalar, Ignored()) +expand_default = CallFunction(aten.expand.default, view_default, Ignored()) +full_default = CallFunction(aten.full.default, [], Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default, Ignored()) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_2, memory_format=torch.contiguous_format) +view_default_2 = CallFunction(aten.view.default, clone_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default_1, view_default_2) +view_default_3 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_3, KeywordArg('inv_scale')) +where_self = CallFunction(aten.where.self, expand_default, full_default, div_Tensor) +convert_element_type_default = CallFunction(prims.convert_element_type.default, where_self, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, convert_element_type_default_1) +expand_default_3 = CallFunction(aten.expand.default, clone_default_2, Ignored()) +view_default_4 = CallFunction(aten.view.default, 
expand_default_3, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_4 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_3 = CallFunction(aten.clone.default, expand_default_4, memory_format=torch.contiguous_format) +view_default_5 = CallFunction(aten.view.default, clone_default_3, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_4, view_default_5) +_sfdp_pattern_17_half_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_2.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_2.py new file mode 100644 index 0000000000000000000000000000000000000000..b2e7d69f6121a65907b068edbb5a0507b9c5046e --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_2.py @@ -0,0 +1,182 @@ +# mypy: ignore-errors + +# noqa: F401, E501 +# This is an auto-generated file. Please do not modify it by hand. +# To re-generate, run: +# cd ~/pytorch && python +# torchgen/fuse_attention_patterns/gen_attention_patterns.py + +import torch +import torch._inductor + +aten = torch.ops.aten +prims = torch.ops.prims + +from torch._inductor.pattern_matcher import ( + Arg, + CallFunction, + CallFunctionVarArgs, + CallMethod, + CallMethodVarArgs, + CallModule, + CallModuleVarArgs, + ExclusiveKeywordArg, + Ignored, + KeywordArg, + ListOf, + MultiOutputPattern, + PatternExpr, + RepeatedExpr, + _TargetArgsExpr, + _TargetExpr, + _TargetExprVarArgs, +) +expand_default = CallFunction(aten.expand.default, KeywordArg('query'), Ignored()) +view_default = CallFunction(aten.view.default, expand_default, Ignored(), _users=2) +permute_default = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default, Ignored()) +view_default_1 = CallFunction(aten.view.default, expand_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +mul_Tensor = CallFunction(aten.mul.Tensor, view_default_2, KeywordArg('scale_factor'), _users=2) +amax_default = CallFunction(aten.amax.default, mul_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, mul_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList, _users=2) +expand_default_2 = CallFunction(aten.expand.default, div_Tensor, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +expand_default_3 = CallFunction(aten.expand.default, KeywordArg('value'), Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_1) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +alias_default = 
CallFunction(aten.alias.default, div_Tensor) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2, _users=2) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, view_default_7, alias_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor_1, Ignored(), True) +mul_Tensor_2 = CallFunction(aten.mul.Tensor, alias_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor_1, mul_Tensor_2) +mul_Tensor_3 = CallFunction(aten.mul.Tensor, sub_Tensor_1, KeywordArg('scale_factor')) +view_default_8 = CallFunction(aten.view.default, mul_Tensor_3, Ignored(), _users=2) +permute_default_2 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_2) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_3, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_4 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_5 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_5, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +_sfdp_pattern_2_training = MultiOutputPattern([view_default_5, + view_default_9, + permute_default_4, + view_default_11, + None +]) + + +expand_default = CallFunction(aten.expand.default, KeywordArg('query'), Ignored()) +view_default = CallFunction(aten.view.default, expand_default, Ignored()) +permute_default = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default, Ignored()) +view_default_1 = CallFunction(aten.view.default, expand_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +mul_Tensor = CallFunction(aten.mul.Tensor, view_default_2, KeywordArg('scale_factor'), _users=2) +amax_default = CallFunction(aten.amax.default, mul_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, mul_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +expand_default_2 = CallFunction(aten.expand.default, div_Tensor, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +expand_default_3 = CallFunction(aten.expand.default, KeywordArg('value'), Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_2_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) + + +expand_default = CallFunction(aten.expand.default, KeywordArg('query'), Ignored()) +view_default = CallFunction(aten.view.default, expand_default, Ignored(), _users=2) +permute_default = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default, 
Ignored()) +view_default_1 = CallFunction(aten.view.default, expand_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +mul_Tensor = CallFunction(aten.mul.Tensor, view_default_2, KeywordArg('scale_factor')) +convert_element_type_default = CallFunction(prims.convert_element_type.default, mul_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor, Ignored(), _users=2) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +expand_default_3 = CallFunction(aten.expand.default, KeywordArg('value'), Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_1) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +convert_element_type_default_2 = CallFunction(prims.convert_element_type.default, view_default_7, Ignored()) +alias_default = CallFunction(aten.alias.default, convert_element_type_default_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2) +convert_element_type_default_3 = CallFunction(prims.convert_element_type.default, alias_default_3, Ignored(), _users=2) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, convert_element_type_default_2, convert_element_type_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor_1, Ignored(), True) +mul_Tensor_2 = CallFunction(aten.mul.Tensor, convert_element_type_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor_1, mul_Tensor_2) +convert_element_type_default_4 = CallFunction(prims.convert_element_type.default, sub_Tensor_1, Ignored()) +mul_Tensor_3 = CallFunction(aten.mul.Tensor, convert_element_type_default_4, KeywordArg('scale_factor')) +view_default_8 = CallFunction(aten.view.default, mul_Tensor_3, Ignored(), _users=2) +permute_default_2 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_2) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_3, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_4 = 
CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_5 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_5, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +_sfdp_pattern_2_half_training = MultiOutputPattern([view_default_5, + view_default_9, + permute_default_4, + view_default_11, + None +]) + + +expand_default = CallFunction(aten.expand.default, KeywordArg('query'), Ignored()) +view_default = CallFunction(aten.view.default, expand_default, Ignored()) +permute_default = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default, Ignored()) +view_default_1 = CallFunction(aten.view.default, expand_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +mul_Tensor = CallFunction(aten.mul.Tensor, view_default_2, KeywordArg('scale_factor')) +convert_element_type_default = CallFunction(prims.convert_element_type.default, mul_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +expand_default_3 = CallFunction(aten.expand.default, KeywordArg('value'), Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_2_half_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_3.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_3.py new file mode 100644 index 0000000000000000000000000000000000000000..895b921f9ee3f733f6e33befa0181848cebab503 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_3.py @@ -0,0 +1,202 @@ +# mypy: ignore-errors + +# noqa: F401, E501 +# This is an auto-generated file. Please do not modify it by hand. 
+# To re-generate, run: +# cd ~/pytorch && python +# torchgen/fuse_attention_patterns/gen_attention_patterns.py + +import torch +import torch._inductor + +aten = torch.ops.aten +prims = torch.ops.prims + +from torch._inductor.pattern_matcher import ( + Arg, + CallFunction, + CallFunctionVarArgs, + CallMethod, + CallMethodVarArgs, + CallModule, + CallModuleVarArgs, + ExclusiveKeywordArg, + Ignored, + KeywordArg, + ListOf, + MultiOutputPattern, + PatternExpr, + RepeatedExpr, + _TargetArgsExpr, + _TargetExpr, + _TargetExprVarArgs, +) +rand_default = CallFunction(aten.rand.default, Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +gt_Scalar = CallFunction(aten.gt.Scalar, rand_default, KeywordArg('dropout_p'), _users=2) +expand_default = CallFunction(aten.expand.default, KeywordArg('query'), Ignored()) +view_default = CallFunction(aten.view.default, expand_default, Ignored(), _users=2) +permute_default = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default, Ignored()) +view_default_1 = CallFunction(aten.view.default, expand_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale_factor'), _users=2) +amax_default = CallFunction(aten.amax.default, div_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, div_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList, _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, gt_Scalar, div_Tensor_1) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, mul_Tensor, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, mul_Tensor_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +expand_default_3 = CallFunction(aten.expand.default, KeywordArg('value'), Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_1) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +convert_element_type_default = CallFunction(prims.convert_element_type.default, gt_Scalar, Ignored()) +mul_Tensor_2 = CallFunction(aten.mul.Tensor, convert_element_type_default, Ignored()) +mul_Tensor_3 = CallFunction(aten.mul.Tensor, view_default_7, mul_Tensor_2) +clone_default = CallFunction(aten.clone.default, mul_Tensor_3, memory_format=torch.contiguous_format) +alias_default = CallFunction(aten.alias.default, div_Tensor_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2, _users=2) +mul_Tensor_4 = CallFunction(aten.mul.Tensor, clone_default, alias_default_3, _users=2) +sum_dim_IntList_1 = 
CallFunction(aten.sum.dim_IntList, mul_Tensor_4, Ignored(), True) +mul_Tensor_5 = CallFunction(aten.mul.Tensor, alias_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor_4, mul_Tensor_5) +div_Tensor_2 = CallFunction(aten.div.Tensor, sub_Tensor_1, KeywordArg('inv_scale_factor')) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_2 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_2) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_3, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_4 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_5 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_5, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +_sfdp_pattern_3_training = MultiOutputPattern([view_default_5, + view_default_9, + permute_default_4, + view_default_11, + None, + None +]) + + +expand_default = CallFunction(aten.expand.default, KeywordArg('query'), Ignored()) +view_default = CallFunction(aten.view.default, expand_default, Ignored()) +permute_default = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default, Ignored()) +view_default_1 = CallFunction(aten.view.default, expand_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale_factor'), _users=2) +amax_default = CallFunction(aten.amax.default, div_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, div_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +clone_default = CallFunction(aten.clone.default, div_Tensor_1) +expand_default_2 = CallFunction(aten.expand.default, clone_default, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +expand_default_3 = CallFunction(aten.expand.default, KeywordArg('value'), Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_3_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) + + +rand_default = CallFunction(aten.rand.default, Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +gt_Scalar = CallFunction(aten.gt.Scalar, rand_default, KeywordArg('dropout_p'), _users=2) +expand_default = CallFunction(aten.expand.default, KeywordArg('query'), Ignored()) +view_default = CallFunction(aten.view.default, expand_default, Ignored(), _users=2) +permute_default = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default, Ignored()) +view_default_1 = CallFunction(aten.view.default, expand_default_1, 
Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale_factor')) +convert_element_type_default = CallFunction(prims.convert_element_type.default, div_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored(), _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, gt_Scalar, convert_element_type_default_1) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, mul_Tensor, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, mul_Tensor_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +expand_default_3 = CallFunction(aten.expand.default, KeywordArg('value'), Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_1) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +convert_element_type_default_2 = CallFunction(prims.convert_element_type.default, gt_Scalar, Ignored()) +mul_Tensor_2 = CallFunction(aten.mul.Tensor, convert_element_type_default_2, Ignored()) +mul_Tensor_3 = CallFunction(aten.mul.Tensor, view_default_7, mul_Tensor_2) +clone_default = CallFunction(aten.clone.default, mul_Tensor_3, memory_format=torch.contiguous_format) +convert_element_type_default_3 = CallFunction(prims.convert_element_type.default, clone_default, Ignored()) +alias_default = CallFunction(aten.alias.default, convert_element_type_default_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2) +convert_element_type_default_4 = CallFunction(prims.convert_element_type.default, alias_default_3, Ignored(), _users=2) +mul_Tensor_4 = CallFunction(aten.mul.Tensor, convert_element_type_default_3, convert_element_type_default_4, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor_4, Ignored(), True) +mul_Tensor_5 = CallFunction(aten.mul.Tensor, convert_element_type_default_4, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor_4, mul_Tensor_5) +convert_element_type_default_5 = CallFunction(prims.convert_element_type.default, sub_Tensor_1, Ignored()) +div_Tensor_2 = CallFunction(aten.div.Tensor, convert_element_type_default_5, KeywordArg('inv_scale_factor')) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_2 = CallFunction(aten.permute.default, view_default_1, Ignored()) 
+bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_2) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_3, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_4 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_5 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_5, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +_sfdp_pattern_3_half_training = MultiOutputPattern([view_default_5, + view_default_9, + permute_default_4, + view_default_11, + None, + None +]) + + +expand_default = CallFunction(aten.expand.default, KeywordArg('query'), Ignored()) +view_default = CallFunction(aten.view.default, expand_default, Ignored()) +permute_default = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default, Ignored()) +view_default_1 = CallFunction(aten.view.default, expand_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, KeywordArg('inv_scale_factor')) +convert_element_type_default = CallFunction(prims.convert_element_type.default, div_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored()) +clone_default = CallFunction(aten.clone.default, convert_element_type_default_1) +expand_default_2 = CallFunction(aten.expand.default, clone_default, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +expand_default_3 = CallFunction(aten.expand.default, KeywordArg('value'), Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_3_half_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_4.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_4.py new file mode 100644 index 0000000000000000000000000000000000000000..3ff3b6644ab50271233860505beaafb2de6b9a13 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_4.py @@ -0,0 +1,202 @@ +# mypy: ignore-errors + +# noqa: F401, E501 +# This is an auto-generated file. Please do not modify it by hand. 
+# To re-generate, run: +# cd ~/pytorch && python +# torchgen/fuse_attention_patterns/gen_attention_patterns.py + +import torch +import torch._inductor + +aten = torch.ops.aten +prims = torch.ops.prims + +from torch._inductor.pattern_matcher import ( + Arg, + CallFunction, + CallFunctionVarArgs, + CallMethod, + CallMethodVarArgs, + CallModule, + CallModuleVarArgs, + ExclusiveKeywordArg, + Ignored, + KeywordArg, + ListOf, + MultiOutputPattern, + PatternExpr, + RepeatedExpr, + _TargetArgsExpr, + _TargetExpr, + _TargetExprVarArgs, +) +rand_default = CallFunction(aten.rand.default, Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +gt_Scalar = CallFunction(aten.gt.Scalar, rand_default, KeywordArg('dropout_p'), _users=2) +expand_default = CallFunction(aten.expand.default, KeywordArg('query'), Ignored()) +view_default = CallFunction(aten.view.default, expand_default, Ignored(), _users=2) +permute_default = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default, Ignored()) +view_default_1 = CallFunction(aten.view.default, expand_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +mul_Tensor = CallFunction(aten.mul.Tensor, view_default_2, KeywordArg('scale_factor'), _users=2) +amax_default = CallFunction(aten.amax.default, mul_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, mul_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList, _users=2) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, gt_Scalar, div_Tensor) +mul_Tensor_2 = CallFunction(aten.mul.Tensor, mul_Tensor_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, mul_Tensor_2, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +expand_default_3 = CallFunction(aten.expand.default, KeywordArg('value'), Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_1) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +convert_element_type_default = CallFunction(prims.convert_element_type.default, gt_Scalar, Ignored()) +mul_Tensor_3 = CallFunction(aten.mul.Tensor, convert_element_type_default, Ignored()) +mul_Tensor_4 = CallFunction(aten.mul.Tensor, view_default_7, mul_Tensor_3) +clone_default = CallFunction(aten.clone.default, mul_Tensor_4, memory_format=torch.contiguous_format) +alias_default = CallFunction(aten.alias.default, div_Tensor) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2, _users=2) +mul_Tensor_5 = CallFunction(aten.mul.Tensor, clone_default, alias_default_3, _users=2) +sum_dim_IntList_1 = 
CallFunction(aten.sum.dim_IntList, mul_Tensor_5, Ignored(), True) +mul_Tensor_6 = CallFunction(aten.mul.Tensor, alias_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor_5, mul_Tensor_6) +mul_Tensor_7 = CallFunction(aten.mul.Tensor, sub_Tensor_1, KeywordArg('scale_factor')) +view_default_8 = CallFunction(aten.view.default, mul_Tensor_7, Ignored(), _users=2) +permute_default_2 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_2) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_3, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_4 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_5 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_5, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +_sfdp_pattern_4_training = MultiOutputPattern([view_default_5, + view_default_9, + permute_default_4, + view_default_11, + None, + None +]) + + +expand_default = CallFunction(aten.expand.default, KeywordArg('query'), Ignored()) +view_default = CallFunction(aten.view.default, expand_default, Ignored()) +permute_default = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default, Ignored()) +view_default_1 = CallFunction(aten.view.default, expand_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +mul_Tensor = CallFunction(aten.mul.Tensor, view_default_2, KeywordArg('scale_factor'), _users=2) +amax_default = CallFunction(aten.amax.default, mul_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, mul_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +clone_default = CallFunction(aten.clone.default, div_Tensor) +expand_default_2 = CallFunction(aten.expand.default, clone_default, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +expand_default_3 = CallFunction(aten.expand.default, KeywordArg('value'), Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_4_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) + + +rand_default = CallFunction(aten.rand.default, Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +gt_Scalar = CallFunction(aten.gt.Scalar, rand_default, KeywordArg('dropout_p'), _users=2) +expand_default = CallFunction(aten.expand.default, KeywordArg('query'), Ignored()) +view_default = CallFunction(aten.view.default, expand_default, Ignored(), _users=2) +permute_default = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default, Ignored()) +view_default_1 = CallFunction(aten.view.default, expand_default_1, Ignored(), 
_users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +mul_Tensor = CallFunction(aten.mul.Tensor, view_default_2, KeywordArg('scale_factor')) +convert_element_type_default = CallFunction(prims.convert_element_type.default, mul_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor, Ignored(), _users=2) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, gt_Scalar, convert_element_type_default_1) +mul_Tensor_2 = CallFunction(aten.mul.Tensor, mul_Tensor_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, mul_Tensor_2, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +expand_default_3 = CallFunction(aten.expand.default, KeywordArg('value'), Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_1) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +convert_element_type_default_2 = CallFunction(prims.convert_element_type.default, gt_Scalar, Ignored()) +mul_Tensor_3 = CallFunction(aten.mul.Tensor, convert_element_type_default_2, Ignored()) +mul_Tensor_4 = CallFunction(aten.mul.Tensor, view_default_7, mul_Tensor_3) +clone_default = CallFunction(aten.clone.default, mul_Tensor_4, memory_format=torch.contiguous_format) +convert_element_type_default_3 = CallFunction(prims.convert_element_type.default, clone_default, Ignored()) +alias_default = CallFunction(aten.alias.default, convert_element_type_default_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2) +convert_element_type_default_4 = CallFunction(prims.convert_element_type.default, alias_default_3, Ignored(), _users=2) +mul_Tensor_5 = CallFunction(aten.mul.Tensor, convert_element_type_default_3, convert_element_type_default_4, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor_5, Ignored(), True) +mul_Tensor_6 = CallFunction(aten.mul.Tensor, convert_element_type_default_4, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor_5, mul_Tensor_6) +convert_element_type_default_5 = CallFunction(prims.convert_element_type.default, sub_Tensor_1, Ignored()) +mul_Tensor_7 = CallFunction(aten.mul.Tensor, convert_element_type_default_5, KeywordArg('scale_factor')) +view_default_8 = CallFunction(aten.view.default, mul_Tensor_7, Ignored(), _users=2) +permute_default_2 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = 
CallFunction(aten.bmm.default, view_default_8, permute_default_2) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_3, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_4 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_5 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_5, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +_sfdp_pattern_4_half_training = MultiOutputPattern([view_default_5, + view_default_9, + permute_default_4, + view_default_11, + None, + None +]) + + +expand_default = CallFunction(aten.expand.default, KeywordArg('query'), Ignored()) +view_default = CallFunction(aten.view.default, expand_default, Ignored()) +permute_default = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default, Ignored()) +view_default_1 = CallFunction(aten.view.default, expand_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +mul_Tensor = CallFunction(aten.mul.Tensor, view_default_2, KeywordArg('scale_factor')) +convert_element_type_default = CallFunction(prims.convert_element_type.default, mul_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor, Ignored()) +clone_default = CallFunction(aten.clone.default, convert_element_type_default_1) +expand_default_2 = CallFunction(aten.expand.default, clone_default, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +expand_default_3 = CallFunction(aten.expand.default, KeywordArg('value'), Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_4_half_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_5.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_5.py new file mode 100644 index 0000000000000000000000000000000000000000..6b1ca2ef9321d837deb360cc1d1d5e6dd9359781 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_5.py @@ -0,0 +1,186 @@ +# mypy: ignore-errors + +# noqa: F401, E501 +# This is an auto-generated file. Please do not modify it by hand. 
+# To re-generate, run: +# cd ~/pytorch && python +# torchgen/fuse_attention_patterns/gen_attention_patterns.py + +import torch +import torch._inductor + +aten = torch.ops.aten +prims = torch.ops.prims + +from torch._inductor.pattern_matcher import ( + Arg, + CallFunction, + CallFunctionVarArgs, + CallMethod, + CallMethodVarArgs, + CallModule, + CallModuleVarArgs, + ExclusiveKeywordArg, + Ignored, + KeywordArg, + ListOf, + MultiOutputPattern, + PatternExpr, + RepeatedExpr, + _TargetArgsExpr, + _TargetExpr, + _TargetExprVarArgs, +) +expand_default = CallFunction(aten.expand.default, KeywordArg('query'), Ignored()) +view_default = CallFunction(aten.view.default, expand_default, Ignored(), _users=2) +permute_default = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default, Ignored()) +view_default_1 = CallFunction(aten.view.default, expand_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, Ignored()) +add_Tensor = CallFunction(aten.add.Tensor, div_Tensor, KeywordArg('attn_mask'), _users=2) +amax_default = CallFunction(aten.amax.default, add_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, add_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList, _users=2) +expand_default_2 = CallFunction(aten.expand.default, div_Tensor_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +expand_default_3 = CallFunction(aten.expand.default, KeywordArg('value'), Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_1) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +alias_default = CallFunction(aten.alias.default, div_Tensor_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2, _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, view_default_7, alias_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor, Ignored(), True) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, alias_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor, mul_Tensor_1) +div_Tensor_2 = CallFunction(aten.div.Tensor, sub_Tensor_1, Ignored()) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_2 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_2) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, 
view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_3, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_4 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_5 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_5, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +_sfdp_pattern_5_training = MultiOutputPattern([view_default_5, + view_default_9, + permute_default_4, + view_default_11, + None +]) + + +expand_default = CallFunction(aten.expand.default, KeywordArg('query'), Ignored()) +view_default = CallFunction(aten.view.default, expand_default, Ignored()) +permute_default = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default, Ignored()) +view_default_1 = CallFunction(aten.view.default, expand_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, Ignored()) +add_Tensor = CallFunction(aten.add.Tensor, div_Tensor, KeywordArg('attn_mask'), _users=2) +amax_default = CallFunction(aten.amax.default, add_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, add_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +expand_default_2 = CallFunction(aten.expand.default, div_Tensor_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +expand_default_3 = CallFunction(aten.expand.default, KeywordArg('value'), Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_5_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) + + +expand_default = CallFunction(aten.expand.default, KeywordArg('query'), Ignored()) +view_default = CallFunction(aten.view.default, expand_default, Ignored(), _users=2) +permute_default = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default, Ignored()) +view_default_1 = CallFunction(aten.view.default, expand_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, Ignored()) +add_Tensor = CallFunction(aten.add.Tensor, div_Tensor, KeywordArg('attn_mask')) +convert_element_type_default = CallFunction(prims.convert_element_type.default, add_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = 
CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored(), _users=2) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +expand_default_3 = CallFunction(aten.expand.default, KeywordArg('value'), Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_1) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +convert_element_type_default_2 = CallFunction(prims.convert_element_type.default, view_default_7, Ignored()) +alias_default = CallFunction(aten.alias.default, convert_element_type_default_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2) +convert_element_type_default_3 = CallFunction(prims.convert_element_type.default, alias_default_3, Ignored(), _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, convert_element_type_default_2, convert_element_type_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor, Ignored(), True) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, convert_element_type_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor, mul_Tensor_1) +convert_element_type_default_4 = CallFunction(prims.convert_element_type.default, sub_Tensor_1, Ignored()) +div_Tensor_2 = CallFunction(aten.div.Tensor, convert_element_type_default_4, Ignored()) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_2 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_2) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_3, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_4 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_5 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_5, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +_sfdp_pattern_5_half_training = MultiOutputPattern([view_default_5, + view_default_9, + permute_default_4, + view_default_11, + None +]) + + +expand_default = CallFunction(aten.expand.default, KeywordArg('query'), Ignored()) +view_default = CallFunction(aten.view.default, expand_default, Ignored()) +permute_default = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default, Ignored()) +view_default_1 = CallFunction(aten.view.default, expand_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) 
+view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, Ignored()) +add_Tensor = CallFunction(aten.add.Tensor, div_Tensor, KeywordArg('attn_mask')) +convert_element_type_default = CallFunction(prims.convert_element_type.default, add_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +expand_default_3 = CallFunction(aten.expand.default, KeywordArg('value'), Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_5_half_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_6.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_6.py new file mode 100644 index 0000000000000000000000000000000000000000..0d3dff8e414a77b65a8d3679ce8ed0baf04a1cca --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_6.py @@ -0,0 +1,206 @@ +# mypy: ignore-errors + +# noqa: F401, E501 +# This is an auto-generated file. Please do not modify it by hand. 
+# To re-generate, run: +# cd ~/pytorch && python +# torchgen/fuse_attention_patterns/gen_attention_patterns.py + +import torch +import torch._inductor + +aten = torch.ops.aten +prims = torch.ops.prims + +from torch._inductor.pattern_matcher import ( + Arg, + CallFunction, + CallFunctionVarArgs, + CallMethod, + CallMethodVarArgs, + CallModule, + CallModuleVarArgs, + ExclusiveKeywordArg, + Ignored, + KeywordArg, + ListOf, + MultiOutputPattern, + PatternExpr, + RepeatedExpr, + _TargetArgsExpr, + _TargetExpr, + _TargetExprVarArgs, +) +rand_default = CallFunction(aten.rand.default, Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +gt_Scalar = CallFunction(aten.gt.Scalar, rand_default, KeywordArg('dropout_p'), _users=2) +expand_default = CallFunction(aten.expand.default, KeywordArg('query'), Ignored()) +view_default = CallFunction(aten.view.default, expand_default, Ignored(), _users=2) +permute_default = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default, Ignored()) +view_default_1 = CallFunction(aten.view.default, expand_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, Ignored()) +add_Tensor = CallFunction(aten.add.Tensor, div_Tensor, KeywordArg('attn_mask'), _users=2) +amax_default = CallFunction(aten.amax.default, add_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, add_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList, _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, gt_Scalar, div_Tensor_1) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, mul_Tensor, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, mul_Tensor_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +expand_default_3 = CallFunction(aten.expand.default, KeywordArg('value'), Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_1) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +convert_element_type_default = CallFunction(prims.convert_element_type.default, gt_Scalar, Ignored()) +mul_Tensor_2 = CallFunction(aten.mul.Tensor, convert_element_type_default, Ignored()) +mul_Tensor_3 = CallFunction(aten.mul.Tensor, view_default_7, mul_Tensor_2) +clone_default = CallFunction(aten.clone.default, mul_Tensor_3, memory_format=torch.contiguous_format) +alias_default = CallFunction(aten.alias.default, div_Tensor_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2, _users=2) +mul_Tensor_4 = CallFunction(aten.mul.Tensor, clone_default, 
alias_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor_4, Ignored(), True) +mul_Tensor_5 = CallFunction(aten.mul.Tensor, alias_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor_4, mul_Tensor_5) +div_Tensor_2 = CallFunction(aten.div.Tensor, sub_Tensor_1, Ignored()) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_2 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_2) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_3, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_4 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_5 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_5, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +_sfdp_pattern_6_training = MultiOutputPattern([view_default_5, + view_default_9, + permute_default_4, + view_default_11, + None, + None +]) + + +expand_default = CallFunction(aten.expand.default, KeywordArg('query'), Ignored()) +view_default = CallFunction(aten.view.default, expand_default, Ignored()) +permute_default = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default, Ignored()) +view_default_1 = CallFunction(aten.view.default, expand_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, Ignored()) +add_Tensor = CallFunction(aten.add.Tensor, div_Tensor, KeywordArg('attn_mask'), _users=2) +amax_default = CallFunction(aten.amax.default, add_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, add_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +clone_default = CallFunction(aten.clone.default, div_Tensor_1) +expand_default_2 = CallFunction(aten.expand.default, clone_default, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +expand_default_3 = CallFunction(aten.expand.default, KeywordArg('value'), Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_6_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) + + +rand_default = CallFunction(aten.rand.default, Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +gt_Scalar = CallFunction(aten.gt.Scalar, rand_default, KeywordArg('dropout_p'), _users=2) +expand_default = CallFunction(aten.expand.default, KeywordArg('query'), Ignored()) +view_default = CallFunction(aten.view.default, expand_default, Ignored(), _users=2) +permute_default = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +expand_default_1 = CallFunction(aten.expand.default, 
permute_default, Ignored()) +view_default_1 = CallFunction(aten.view.default, expand_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, Ignored()) +add_Tensor = CallFunction(aten.add.Tensor, div_Tensor, KeywordArg('attn_mask')) +convert_element_type_default = CallFunction(prims.convert_element_type.default, add_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored(), _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, gt_Scalar, convert_element_type_default_1) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, mul_Tensor, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, mul_Tensor_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +expand_default_3 = CallFunction(aten.expand.default, KeywordArg('value'), Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_1) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +convert_element_type_default_2 = CallFunction(prims.convert_element_type.default, gt_Scalar, Ignored()) +mul_Tensor_2 = CallFunction(aten.mul.Tensor, convert_element_type_default_2, Ignored()) +mul_Tensor_3 = CallFunction(aten.mul.Tensor, view_default_7, mul_Tensor_2) +clone_default = CallFunction(aten.clone.default, mul_Tensor_3, memory_format=torch.contiguous_format) +convert_element_type_default_3 = CallFunction(prims.convert_element_type.default, clone_default, Ignored()) +alias_default = CallFunction(aten.alias.default, convert_element_type_default_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2) +convert_element_type_default_4 = CallFunction(prims.convert_element_type.default, alias_default_3, Ignored(), _users=2) +mul_Tensor_4 = CallFunction(aten.mul.Tensor, convert_element_type_default_3, convert_element_type_default_4, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor_4, Ignored(), True) +mul_Tensor_5 = CallFunction(aten.mul.Tensor, convert_element_type_default_4, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor_4, mul_Tensor_5) +convert_element_type_default_5 = CallFunction(prims.convert_element_type.default, sub_Tensor_1, Ignored()) +div_Tensor_2 = CallFunction(aten.div.Tensor, convert_element_type_default_5, Ignored()) +view_default_8 = 
CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_2 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_2) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_3, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_4 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_5 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_5, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +_sfdp_pattern_6_half_training = MultiOutputPattern([view_default_5, + view_default_9, + permute_default_4, + view_default_11, + None, + None +]) + + +expand_default = CallFunction(aten.expand.default, KeywordArg('query'), Ignored()) +view_default = CallFunction(aten.view.default, expand_default, Ignored()) +permute_default = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default, Ignored()) +view_default_1 = CallFunction(aten.view.default, expand_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, Ignored()) +add_Tensor = CallFunction(aten.add.Tensor, div_Tensor, KeywordArg('attn_mask')) +convert_element_type_default = CallFunction(prims.convert_element_type.default, add_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored()) +clone_default = CallFunction(aten.clone.default, convert_element_type_default_1) +expand_default_2 = CallFunction(aten.expand.default, clone_default, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +expand_default_3 = CallFunction(aten.expand.default, KeywordArg('value'), Ignored()) +view_default_4 = CallFunction(aten.view.default, expand_default_3, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_6_half_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_7.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_7.py new file mode 100644 index 0000000000000000000000000000000000000000..a277750c844e1e0faeb485635a49b0d0643e7191 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_7.py @@ -0,0 +1,233 @@ +# mypy: ignore-errors + +# noqa: F401, E501 +# This is an auto-generated file. Please do not modify it by hand. 
+# To re-generate, run: +# cd ~/pytorch && python +# torchgen/fuse_attention_patterns/gen_attention_patterns.py + +import torch +import torch._inductor + +aten = torch.ops.aten +prims = torch.ops.prims + +from torch._inductor.pattern_matcher import ( + Arg, + CallFunction, + CallFunctionVarArgs, + CallMethod, + CallMethodVarArgs, + CallModule, + CallModuleVarArgs, + ExclusiveKeywordArg, + Ignored, + KeywordArg, + ListOf, + MultiOutputPattern, + PatternExpr, + RepeatedExpr, + _TargetArgsExpr, + _TargetExpr, + _TargetExprVarArgs, +) +rand_default = CallFunction(aten.rand.default, Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +gt_Scalar = CallFunction(aten.gt.Scalar, rand_default, KeywordArg('dropout_p'), _users=2) +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, div_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, div_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList, _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, gt_Scalar, div_Tensor_1) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, mul_Tensor, Ignored()) +convert_element_type_default = CallFunction(prims.convert_element_type.default, mul_Tensor_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_4 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_4) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, bmm_default_2, Ignored()) +view_default_7 = CallFunction(aten.view.default, 
convert_element_type_default_1, Ignored()) +convert_element_type_default_2 = CallFunction(prims.convert_element_type.default, view_default_7, Ignored()) +convert_element_type_default_3 = CallFunction(prims.convert_element_type.default, gt_Scalar, Ignored()) +mul_Tensor_2 = CallFunction(aten.mul.Tensor, convert_element_type_default_3, Ignored()) +mul_Tensor_3 = CallFunction(aten.mul.Tensor, convert_element_type_default_2, mul_Tensor_2) +clone_default_3 = CallFunction(aten.clone.default, mul_Tensor_3, memory_format=torch.contiguous_format) +alias_default = CallFunction(aten.alias.default, div_Tensor_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2, _users=2) +mul_Tensor_4 = CallFunction(aten.mul.Tensor, clone_default_3, alias_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor_4, Ignored(), True) +mul_Tensor_5 = CallFunction(aten.mul.Tensor, alias_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor_4, mul_Tensor_5) +div_Tensor_2 = CallFunction(aten.div.Tensor, sub_Tensor_1, Ignored()) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_5 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_5) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_6 = CallFunction(aten.permute.default, view_default_9, Ignored()) +permute_default_7 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_7, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_8 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_9 = CallFunction(aten.permute.default, permute_default_8, Ignored()) +permute_default_10 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_10, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +permute_default_11 = CallFunction(aten.permute.default, view_default_11, Ignored()) +_sfdp_pattern_7_training = MultiOutputPattern([view_default_5, + permute_default_6, + permute_default_9, + permute_default_11, + None +]) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored()) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, Ignored(), _users=2) +amax_default 
= CallFunction(aten.amax.default, div_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, div_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +clone_default_2 = CallFunction(aten.clone.default, div_Tensor_1) +convert_element_type_default = CallFunction(prims.convert_element_type.default, clone_default_2, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_3 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_3, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_7_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) + + +rand_default = CallFunction(aten.rand.default, Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +gt_Scalar = CallFunction(aten.gt.Scalar, rand_default, KeywordArg('dropout_p'), _users=2) +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, Ignored()) +convert_element_type_default = CallFunction(prims.convert_element_type.default, div_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList, _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, gt_Scalar, div_Tensor_1) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, mul_Tensor, Ignored()) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, mul_Tensor_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = 
CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_4 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_4) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +convert_element_type_default_2 = CallFunction(prims.convert_element_type.default, view_default_7, Ignored()) +convert_element_type_default_3 = CallFunction(prims.convert_element_type.default, gt_Scalar, Ignored()) +mul_Tensor_2 = CallFunction(aten.mul.Tensor, convert_element_type_default_3, Ignored()) +mul_Tensor_3 = CallFunction(aten.mul.Tensor, convert_element_type_default_2, mul_Tensor_2) +clone_default_3 = CallFunction(aten.clone.default, mul_Tensor_3, memory_format=torch.contiguous_format) +alias_default = CallFunction(aten.alias.default, div_Tensor_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2, _users=2) +mul_Tensor_4 = CallFunction(aten.mul.Tensor, clone_default_3, alias_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor_4, Ignored(), True) +mul_Tensor_5 = CallFunction(aten.mul.Tensor, alias_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor_4, mul_Tensor_5) +convert_element_type_default_4 = CallFunction(prims.convert_element_type.default, sub_Tensor_1, Ignored()) +div_Tensor_2 = CallFunction(aten.div.Tensor, convert_element_type_default_4, Ignored()) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_5 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_5) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_6 = CallFunction(aten.permute.default, view_default_9, Ignored()) +permute_default_7 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_7, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_8 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_9 = CallFunction(aten.permute.default, permute_default_8, Ignored()) +permute_default_10 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_10, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +permute_default_11 = CallFunction(aten.permute.default, view_default_11, Ignored()) +_sfdp_pattern_7_half_training = MultiOutputPattern([view_default_5, + permute_default_6, + permute_default_9, + permute_default_11, + None +]) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = 
CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored()) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, Ignored()) +convert_element_type_default = CallFunction(prims.convert_element_type.default, div_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +clone_default_2 = CallFunction(aten.clone.default, div_Tensor_1) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, clone_default_2, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_3 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_3, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_7_half_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_8.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_8.py new file mode 100644 index 0000000000000000000000000000000000000000..69eefd322686b5e80260250de7ff6057605eac24 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_8.py @@ -0,0 +1,213 @@ +# mypy: ignore-errors + +# noqa: F401, E501 +# This is an auto-generated file. Please do not modify it by hand. 
+# To re-generate, run: +# cd ~/pytorch && python +# torchgen/fuse_attention_patterns/gen_attention_patterns.py + +import torch +import torch._inductor + +aten = torch.ops.aten +prims = torch.ops.prims + +from torch._inductor.pattern_matcher import ( + Arg, + CallFunction, + CallFunctionVarArgs, + CallMethod, + CallMethodVarArgs, + CallModule, + CallModuleVarArgs, + ExclusiveKeywordArg, + Ignored, + KeywordArg, + ListOf, + MultiOutputPattern, + PatternExpr, + RepeatedExpr, + _TargetArgsExpr, + _TargetExpr, + _TargetExprVarArgs, +) +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, div_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, div_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList, _users=2) +convert_element_type_default = CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_4 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_4) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, bmm_default_2, Ignored()) +view_default_7 = CallFunction(aten.view.default, convert_element_type_default_1, Ignored()) +convert_element_type_default_2 = CallFunction(prims.convert_element_type.default, view_default_7, Ignored()) +alias_default = CallFunction(aten.alias.default, div_Tensor_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) 
+alias_default_3 = CallFunction(aten.alias.default, alias_default_2, _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, convert_element_type_default_2, alias_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor, Ignored(), True) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, alias_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor, mul_Tensor_1) +div_Tensor_2 = CallFunction(aten.div.Tensor, sub_Tensor_1, Ignored()) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_5 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_5) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_6 = CallFunction(aten.permute.default, view_default_9, Ignored()) +permute_default_7 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_7, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_8 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_9 = CallFunction(aten.permute.default, permute_default_8, Ignored()) +permute_default_10 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_10, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +permute_default_11 = CallFunction(aten.permute.default, view_default_11, Ignored()) +_sfdp_pattern_8_training = MultiOutputPattern([view_default_5, + permute_default_6, + permute_default_9, + permute_default_11 +]) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored()) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, div_Tensor, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, div_Tensor, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default = CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) 
+expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_8_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, Ignored()) +convert_element_type_default = CallFunction(prims.convert_element_type.default, div_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList, _users=2) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_4 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_4) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +convert_element_type_default_2 = CallFunction(prims.convert_element_type.default, view_default_7, Ignored()) +alias_default = CallFunction(aten.alias.default, div_Tensor_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2, _users=2) 
+mul_Tensor = CallFunction(aten.mul.Tensor, convert_element_type_default_2, alias_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor, Ignored(), True) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, alias_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor, mul_Tensor_1) +convert_element_type_default_3 = CallFunction(prims.convert_element_type.default, sub_Tensor_1, Ignored()) +div_Tensor_2 = CallFunction(aten.div.Tensor, convert_element_type_default_3, Ignored()) +view_default_8 = CallFunction(aten.view.default, div_Tensor_2, Ignored(), _users=2) +permute_default_5 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_5) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +permute_default_6 = CallFunction(aten.permute.default, view_default_9, Ignored()) +permute_default_7 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_7, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_8 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_9 = CallFunction(aten.permute.default, permute_default_8, Ignored()) +permute_default_10 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_10, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +permute_default_11 = CallFunction(aten.permute.default, view_default_11, Ignored()) +_sfdp_pattern_8_half_training = MultiOutputPattern([view_default_5, + permute_default_6, + permute_default_9, + permute_default_11 +]) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +expand_default = CallFunction(aten.expand.default, permute_default, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored()) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, view_default_2, Ignored()) +convert_element_type_default = CallFunction(prims.convert_element_type.default, div_Tensor, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, div_Tensor_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, 
convert_element_type_default_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_8_half_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_9.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_9.py new file mode 100644 index 0000000000000000000000000000000000000000..c0cdb933a8e9bca2c02a391917a634f10c40f287 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/_sfdp_pattern_9.py @@ -0,0 +1,233 @@ +# mypy: ignore-errors + +# noqa: F401, E501 +# This is an auto-generated file. Please do not modify it by hand. +# To re-generate, run: +# cd ~/pytorch && python +# torchgen/fuse_attention_patterns/gen_attention_patterns.py + +import torch +import torch._inductor + +aten = torch.ops.aten +prims = torch.ops.prims + +from torch._inductor.pattern_matcher import ( + Arg, + CallFunction, + CallFunctionVarArgs, + CallMethod, + CallMethodVarArgs, + CallModule, + CallModuleVarArgs, + ExclusiveKeywordArg, + Ignored, + KeywordArg, + ListOf, + MultiOutputPattern, + PatternExpr, + RepeatedExpr, + _TargetArgsExpr, + _TargetExpr, + _TargetExprVarArgs, +) +rand_default = CallFunction(aten.rand.default, Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +gt_Scalar = CallFunction(aten.gt.Scalar, rand_default, KeywordArg('dropout_p'), _users=2) +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, permute_default, Ignored()) +expand_default = CallFunction(aten.expand.default, div_Tensor, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored(), _users=2) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, view_default_2, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, view_default_2, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList, _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, gt_Scalar, div_Tensor_1) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, mul_Tensor, Ignored()) +convert_element_type_default = 
CallFunction(prims.convert_element_type.default, mul_Tensor_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_4 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_4) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, bmm_default_2, Ignored()) +view_default_7 = CallFunction(aten.view.default, convert_element_type_default_1, Ignored()) +convert_element_type_default_2 = CallFunction(prims.convert_element_type.default, view_default_7, Ignored()) +convert_element_type_default_3 = CallFunction(prims.convert_element_type.default, gt_Scalar, Ignored()) +mul_Tensor_2 = CallFunction(aten.mul.Tensor, convert_element_type_default_3, Ignored()) +mul_Tensor_3 = CallFunction(aten.mul.Tensor, convert_element_type_default_2, mul_Tensor_2) +clone_default_3 = CallFunction(aten.clone.default, mul_Tensor_3, memory_format=torch.contiguous_format) +alias_default = CallFunction(aten.alias.default, div_Tensor_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2, _users=2) +mul_Tensor_4 = CallFunction(aten.mul.Tensor, clone_default_3, alias_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor_4, Ignored(), True) +mul_Tensor_5 = CallFunction(aten.mul.Tensor, alias_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor_4, mul_Tensor_5) +view_default_8 = CallFunction(aten.view.default, sub_Tensor_1, Ignored(), _users=2) +permute_default_5 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_5) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +div_Tensor_2 = CallFunction(aten.div.Tensor, view_default_9, Ignored()) +permute_default_6 = CallFunction(aten.permute.default, div_Tensor_2, Ignored()) +permute_default_7 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_7, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_8 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_9 = CallFunction(aten.permute.default, permute_default_8, Ignored()) +permute_default_10 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_10, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +permute_default_11 = CallFunction(aten.permute.default, 
view_default_11, Ignored()) +_sfdp_pattern_9_training = MultiOutputPattern([view_default_5, + permute_default_6, + permute_default_9, + permute_default_11, + None +]) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, permute_default, Ignored()) +expand_default = CallFunction(aten.expand.default, div_Tensor, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored()) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, view_default_2, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, view_default_2, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +clone_default_2 = CallFunction(aten.clone.default, div_Tensor_1) +convert_element_type_default = CallFunction(prims.convert_element_type.default, clone_default_2, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_3 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_3, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_9_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) + + +rand_default = CallFunction(aten.rand.default, Ignored(), dtype=Ignored(), device=Ignored(), pin_memory=False) +gt_Scalar = CallFunction(aten.gt.Scalar, rand_default, KeywordArg('dropout_p'), _users=2) +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, permute_default, Ignored()) +expand_default = CallFunction(aten.expand.default, div_Tensor, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored(), _users=2) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored(), _users=2) +bmm_default = 
CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +convert_element_type_default = CallFunction(prims.convert_element_type.default, view_default_2, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList, _users=2) +mul_Tensor = CallFunction(aten.mul.Tensor, gt_Scalar, div_Tensor_1) +mul_Tensor_1 = CallFunction(aten.mul.Tensor, mul_Tensor, Ignored()) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, mul_Tensor_1, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored(), _users=2) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_2 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_2, Ignored(), _users=2) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +view_default_5 = CallFunction(aten.view.default, bmm_default_1, Ignored()) +view_default_6 = CallFunction(aten.view.default, KeywordArg('tangents_1'), Ignored(), _users=2) +permute_default_4 = CallFunction(aten.permute.default, view_default_4, Ignored()) +bmm_default_2 = CallFunction(aten.bmm.default, view_default_6, permute_default_4) +view_default_7 = CallFunction(aten.view.default, bmm_default_2, Ignored()) +convert_element_type_default_2 = CallFunction(prims.convert_element_type.default, view_default_7, Ignored()) +convert_element_type_default_3 = CallFunction(prims.convert_element_type.default, gt_Scalar, Ignored()) +mul_Tensor_2 = CallFunction(aten.mul.Tensor, convert_element_type_default_3, Ignored()) +mul_Tensor_3 = CallFunction(aten.mul.Tensor, convert_element_type_default_2, mul_Tensor_2) +clone_default_3 = CallFunction(aten.clone.default, mul_Tensor_3, memory_format=torch.contiguous_format) +alias_default = CallFunction(aten.alias.default, div_Tensor_1) +alias_default_1 = CallFunction(aten.alias.default, alias_default) +alias_default_2 = CallFunction(aten.alias.default, alias_default_1) +alias_default_3 = CallFunction(aten.alias.default, alias_default_2, _users=2) +mul_Tensor_4 = CallFunction(aten.mul.Tensor, clone_default_3, alias_default_3, _users=2) +sum_dim_IntList_1 = CallFunction(aten.sum.dim_IntList, mul_Tensor_4, Ignored(), True) +mul_Tensor_5 = CallFunction(aten.mul.Tensor, alias_default_3, sum_dim_IntList_1) +sub_Tensor_1 = CallFunction(aten.sub.Tensor, mul_Tensor_4, mul_Tensor_5) +convert_element_type_default_4 = CallFunction(prims.convert_element_type.default, sub_Tensor_1, Ignored()) +view_default_8 = CallFunction(aten.view.default, convert_element_type_default_4, Ignored(), _users=2) +permute_default_5 = CallFunction(aten.permute.default, view_default_1, Ignored()) +bmm_default_3 = CallFunction(aten.bmm.default, view_default_8, permute_default_5) +view_default_9 = CallFunction(aten.view.default, bmm_default_3, Ignored()) +div_Tensor_2 = 
CallFunction(aten.div.Tensor, view_default_9, Ignored()) +permute_default_6 = CallFunction(aten.permute.default, div_Tensor_2, Ignored()) +permute_default_7 = CallFunction(aten.permute.default, view_default, Ignored()) +bmm_default_4 = CallFunction(aten.bmm.default, permute_default_7, view_default_8) +view_default_10 = CallFunction(aten.view.default, bmm_default_4, Ignored()) +permute_default_8 = CallFunction(aten.permute.default, view_default_10, Ignored()) +permute_default_9 = CallFunction(aten.permute.default, permute_default_8, Ignored()) +permute_default_10 = CallFunction(aten.permute.default, view_default_3, Ignored()) +bmm_default_5 = CallFunction(aten.bmm.default, permute_default_10, view_default_6) +view_default_11 = CallFunction(aten.view.default, bmm_default_5, Ignored()) +permute_default_11 = CallFunction(aten.permute.default, view_default_11, Ignored()) +_sfdp_pattern_9_half_training = MultiOutputPattern([view_default_5, + permute_default_6, + permute_default_9, + permute_default_11, + None +]) + + +permute_default = CallFunction(aten.permute.default, KeywordArg('query'), Ignored()) +div_Tensor = CallFunction(aten.div.Tensor, permute_default, Ignored()) +expand_default = CallFunction(aten.expand.default, div_Tensor, Ignored()) +clone_default = CallFunction(aten.clone.default, expand_default, memory_format=torch.contiguous_format) +view_default = CallFunction(aten.view.default, clone_default, Ignored()) +permute_default_1 = CallFunction(aten.permute.default, KeywordArg('key'), Ignored()) +permute_default_2 = CallFunction(aten.permute.default, permute_default_1, Ignored()) +expand_default_1 = CallFunction(aten.expand.default, permute_default_2, Ignored()) +clone_default_1 = CallFunction(aten.clone.default, expand_default_1, memory_format=torch.contiguous_format) +view_default_1 = CallFunction(aten.view.default, clone_default_1, Ignored()) +bmm_default = CallFunction(aten.bmm.default, view_default, view_default_1) +view_default_2 = CallFunction(aten.view.default, bmm_default, Ignored()) +convert_element_type_default = CallFunction(prims.convert_element_type.default, view_default_2, Ignored(), _users=2) +amax_default = CallFunction(aten.amax.default, convert_element_type_default, Ignored(), True) +sub_Tensor = CallFunction(aten.sub.Tensor, convert_element_type_default, amax_default) +exp_default = CallFunction(aten.exp.default, sub_Tensor, _users=2) +sum_dim_IntList = CallFunction(aten.sum.dim_IntList, exp_default, Ignored(), True) +div_Tensor_1 = CallFunction(aten.div.Tensor, exp_default, sum_dim_IntList) +clone_default_2 = CallFunction(aten.clone.default, div_Tensor_1) +convert_element_type_default_1 = CallFunction(prims.convert_element_type.default, clone_default_2, Ignored()) +expand_default_2 = CallFunction(aten.expand.default, convert_element_type_default_1, Ignored()) +view_default_3 = CallFunction(aten.view.default, expand_default_2, Ignored()) +permute_default_3 = CallFunction(aten.permute.default, KeywordArg('value'), Ignored()) +expand_default_3 = CallFunction(aten.expand.default, permute_default_3, Ignored()) +clone_default_3 = CallFunction(aten.clone.default, expand_default_3, memory_format=torch.contiguous_format) +view_default_4 = CallFunction(aten.view.default, clone_default_3, Ignored()) +bmm_default_1 = CallFunction(aten.bmm.default, view_default_3, view_default_4) +_sfdp_pattern_9_half_inference = CallFunction(aten.view.default, bmm_default_1, Ignored()) diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/central_index.py 
b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/central_index.py new file mode 100644 index 0000000000000000000000000000000000000000..bd4a11ee604d7103f1669816c846012786b90ac6 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/serialized_patterns/central_index.py @@ -0,0 +1,114 @@ +# mypy: ignore-errors + +# This is an auto-generated file. Please do not modify it by hand. +# To re-generate, run: +# cd ~/pytorch && python +# torchgen/fuse_attention_patterns/gen_attention_patterns.py +from ._sfdp_pattern_1 import (_sfdp_pattern_1_training, _sfdp_pattern_1_inference, _sfdp_pattern_1_half_training, _sfdp_pattern_1_half_inference) +from ._sfdp_pattern_2 import (_sfdp_pattern_2_training, _sfdp_pattern_2_inference, _sfdp_pattern_2_half_training, _sfdp_pattern_2_half_inference) +from ._sfdp_pattern_3 import (_sfdp_pattern_3_training, _sfdp_pattern_3_inference, _sfdp_pattern_3_half_training, _sfdp_pattern_3_half_inference) +from ._sfdp_pattern_4 import (_sfdp_pattern_4_training, _sfdp_pattern_4_inference, _sfdp_pattern_4_half_training, _sfdp_pattern_4_half_inference) +from ._sfdp_pattern_5 import (_sfdp_pattern_5_training, _sfdp_pattern_5_inference, _sfdp_pattern_5_half_training, _sfdp_pattern_5_half_inference) +from ._sfdp_pattern_6 import (_sfdp_pattern_6_training, _sfdp_pattern_6_inference, _sfdp_pattern_6_half_training, _sfdp_pattern_6_half_inference) +from ._sfdp_pattern_7 import (_sfdp_pattern_7_training, _sfdp_pattern_7_inference, _sfdp_pattern_7_half_training, _sfdp_pattern_7_half_inference) +from ._sfdp_pattern_8 import (_sfdp_pattern_8_training, _sfdp_pattern_8_inference, _sfdp_pattern_8_half_training, _sfdp_pattern_8_half_inference) +from ._sfdp_pattern_9 import (_sfdp_pattern_9_training, _sfdp_pattern_9_inference, _sfdp_pattern_9_half_training, _sfdp_pattern_9_half_inference) +from ._sfdp_pattern_10 import (_sfdp_pattern_10_training, _sfdp_pattern_10_inference, _sfdp_pattern_10_half_training, _sfdp_pattern_10_half_inference) +from ._sfdp_pattern_11 import (_sfdp_pattern_11_training, _sfdp_pattern_11_inference, _sfdp_pattern_11_half_training, _sfdp_pattern_11_half_inference) +from ._sfdp_pattern_12 import (_sfdp_pattern_12_training, _sfdp_pattern_12_inference, _sfdp_pattern_12_half_training, _sfdp_pattern_12_half_inference) +from ._sfdp_pattern_13 import (_sfdp_pattern_13_training, _sfdp_pattern_13_inference, _sfdp_pattern_13_half_training, _sfdp_pattern_13_half_inference) +from ._sfdp_pattern_14 import (_sfdp_pattern_14_training, _sfdp_pattern_14_inference, _sfdp_pattern_14_half_training, _sfdp_pattern_14_half_inference) +from ._sfdp_pattern_15 import (_sfdp_pattern_15_training, _sfdp_pattern_15_inference, _sfdp_pattern_15_half_training, _sfdp_pattern_15_half_inference) +from ._sfdp_pattern_16 import (_sfdp_pattern_16_training, _sfdp_pattern_16_inference, _sfdp_pattern_16_bs1_training, _sfdp_pattern_16_bs1_inference, _sfdp_pattern_16_half_training, _sfdp_pattern_16_half_inference, _sfdp_pattern_16_half_bs1_training, _sfdp_pattern_16_half_bs1_inference, _sfdp_pattern_16_half_mask_fp32_training, _sfdp_pattern_16_half_mask_fp32_inference, _sfdp_pattern_16_half_mask_fp32_bs1_training, _sfdp_pattern_16_half_mask_fp32_bs1_inference) +from ._sfdp_pattern_17 import (_sfdp_pattern_17_training, _sfdp_pattern_17_inference, _sfdp_pattern_17_half_training, _sfdp_pattern_17_half_inference) + +central_index = { + '_sfdp_pattern_1_training': _sfdp_pattern_1_training, + '_sfdp_pattern_1_inference': _sfdp_pattern_1_inference, + '_sfdp_pattern_2_training': 
_sfdp_pattern_2_training, + '_sfdp_pattern_2_inference': _sfdp_pattern_2_inference, + '_sfdp_pattern_3_training': _sfdp_pattern_3_training, + '_sfdp_pattern_3_inference': _sfdp_pattern_3_inference, + '_sfdp_pattern_4_training': _sfdp_pattern_4_training, + '_sfdp_pattern_4_inference': _sfdp_pattern_4_inference, + '_sfdp_pattern_5_training': _sfdp_pattern_5_training, + '_sfdp_pattern_5_inference': _sfdp_pattern_5_inference, + '_sfdp_pattern_6_training': _sfdp_pattern_6_training, + '_sfdp_pattern_6_inference': _sfdp_pattern_6_inference, + '_sfdp_pattern_7_training': _sfdp_pattern_7_training, + '_sfdp_pattern_7_inference': _sfdp_pattern_7_inference, + '_sfdp_pattern_8_training': _sfdp_pattern_8_training, + '_sfdp_pattern_8_inference': _sfdp_pattern_8_inference, + '_sfdp_pattern_9_training': _sfdp_pattern_9_training, + '_sfdp_pattern_9_inference': _sfdp_pattern_9_inference, + '_sfdp_pattern_10_training': _sfdp_pattern_10_training, + '_sfdp_pattern_10_inference': _sfdp_pattern_10_inference, + '_sfdp_pattern_11_training': _sfdp_pattern_11_training, + '_sfdp_pattern_11_inference': _sfdp_pattern_11_inference, + '_sfdp_pattern_12_training': _sfdp_pattern_12_training, + '_sfdp_pattern_12_inference': _sfdp_pattern_12_inference, + '_sfdp_pattern_13_training': _sfdp_pattern_13_training, + '_sfdp_pattern_13_inference': _sfdp_pattern_13_inference, + '_sfdp_pattern_14_training': _sfdp_pattern_14_training, + '_sfdp_pattern_14_inference': _sfdp_pattern_14_inference, + '_sfdp_pattern_15_training': _sfdp_pattern_15_training, + '_sfdp_pattern_15_inference': _sfdp_pattern_15_inference, + '_sfdp_pattern_16_training': _sfdp_pattern_16_training, + '_sfdp_pattern_16_inference': _sfdp_pattern_16_inference, + '_sfdp_pattern_16_bs1_training': _sfdp_pattern_16_bs1_training, + '_sfdp_pattern_16_bs1_inference': _sfdp_pattern_16_bs1_inference, + '_sfdp_pattern_17_training': _sfdp_pattern_17_training, + '_sfdp_pattern_17_inference': _sfdp_pattern_17_inference, + '_sfdp_pattern_1_half_training': _sfdp_pattern_1_half_training, + '_sfdp_pattern_1_half_inference': _sfdp_pattern_1_half_inference, + '_sfdp_pattern_2_half_training': _sfdp_pattern_2_half_training, + '_sfdp_pattern_2_half_inference': _sfdp_pattern_2_half_inference, + '_sfdp_pattern_3_half_training': _sfdp_pattern_3_half_training, + '_sfdp_pattern_3_half_inference': _sfdp_pattern_3_half_inference, + '_sfdp_pattern_4_half_training': _sfdp_pattern_4_half_training, + '_sfdp_pattern_4_half_inference': _sfdp_pattern_4_half_inference, + '_sfdp_pattern_5_half_training': _sfdp_pattern_5_half_training, + '_sfdp_pattern_5_half_inference': _sfdp_pattern_5_half_inference, + '_sfdp_pattern_6_half_training': _sfdp_pattern_6_half_training, + '_sfdp_pattern_6_half_inference': _sfdp_pattern_6_half_inference, + '_sfdp_pattern_7_half_training': _sfdp_pattern_7_half_training, + '_sfdp_pattern_7_half_inference': _sfdp_pattern_7_half_inference, + '_sfdp_pattern_8_half_training': _sfdp_pattern_8_half_training, + '_sfdp_pattern_8_half_inference': _sfdp_pattern_8_half_inference, + '_sfdp_pattern_9_half_training': _sfdp_pattern_9_half_training, + '_sfdp_pattern_9_half_inference': _sfdp_pattern_9_half_inference, + '_sfdp_pattern_10_half_training': _sfdp_pattern_10_half_training, + '_sfdp_pattern_10_half_inference': _sfdp_pattern_10_half_inference, + '_sfdp_pattern_11_half_training': _sfdp_pattern_11_half_training, + '_sfdp_pattern_11_half_inference': _sfdp_pattern_11_half_inference, + '_sfdp_pattern_12_half_training': _sfdp_pattern_12_half_training, + '_sfdp_pattern_12_half_inference': 
_sfdp_pattern_12_half_inference, + '_sfdp_pattern_13_half_training': _sfdp_pattern_13_half_training, + '_sfdp_pattern_13_half_inference': _sfdp_pattern_13_half_inference, + '_sfdp_pattern_14_half_training': _sfdp_pattern_14_half_training, + '_sfdp_pattern_14_half_inference': _sfdp_pattern_14_half_inference, + '_sfdp_pattern_15_half_training': _sfdp_pattern_15_half_training, + '_sfdp_pattern_15_half_inference': _sfdp_pattern_15_half_inference, + '_sfdp_pattern_16_half_training': _sfdp_pattern_16_half_training, + '_sfdp_pattern_16_half_inference': _sfdp_pattern_16_half_inference, + '_sfdp_pattern_16_half_bs1_training': _sfdp_pattern_16_half_bs1_training, + '_sfdp_pattern_16_half_bs1_inference': _sfdp_pattern_16_half_bs1_inference, + '_sfdp_pattern_17_half_training': _sfdp_pattern_17_half_training, + '_sfdp_pattern_17_half_inference': _sfdp_pattern_17_half_inference, + '_sfdp_pattern_16_half_mask_fp32_training': _sfdp_pattern_16_half_mask_fp32_training, + '_sfdp_pattern_16_half_mask_fp32_inference': _sfdp_pattern_16_half_mask_fp32_inference, + '_sfdp_pattern_16_half_mask_fp32_bs1_training': _sfdp_pattern_16_half_mask_fp32_bs1_training, + '_sfdp_pattern_16_half_mask_fp32_bs1_inference': _sfdp_pattern_16_half_mask_fp32_bs1_inference, +} + + +def get_serialized_pattern(key): + import torch._inductor # noqa: F401 + from torch._inductor import config + if config.fallback_random: + return None + + # TODO - could add more validation that the same set of decomps used when + # tracing SDPA are also used in current context. softmax, dropout, etc + # decomp use is stable so not an issue in practice. + return central_index.get(key) diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_passes/split_cat.py b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/split_cat.py new file mode 100644 index 0000000000000000000000000000000000000000..5f02e1ec5d90523caa1b772c0a565a67c6f973dc --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_passes/split_cat.py @@ -0,0 +1,1537 @@ +import itertools +import logging +import operator +from typing import Any, Callable, List, Optional, Sequence, Set, Tuple, Union + +from typing_extensions import TypeAlias + +import torch +from torch._dynamo.utils import counters + +from ..pattern_matcher import ( + Arg, + CallFunction, + CallFunctionVarArgs, + CallMethodVarArgs, + config_flag, + FailedMatch, + get_arg_value, + Ignored, + KeywordArg, + ListOf, + Match, + MatchContext, + MULTIPLE, + PatternExpr, + register_graph_pattern, + RepeatedExpr, +) +from .group_batch_fusion import is_node_meta_valid +from .pre_grad import ( + merge_getitem_cat_pass, + merge_splits_pass, + normalization_pass, + split_cat_pass, + unbind_stack_pass, +) + +log = logging.getLogger(__name__) + +_Arguments: TypeAlias = Tuple[torch.fx.node.Argument, ...] +_TransformParam: TypeAlias = Tuple[ + Optional[_Arguments], + Optional[_Arguments], + Optional[_Arguments], + Optional[_Arguments], +] +_Range: TypeAlias = Tuple[int, int] + + +def _get_split_args_default(split_node): + input_kwarg = "tensor" + split_size_kwarg = "split_size_or_sections" + dim_kwarg = "dim" + default_dim_value = 0 + if split_node.op == "call_method": + split_size_kwarg = "split_size" + return ( + get_arg_value(split_node, 0, input_kwarg), + get_arg_value(split_node, 1, split_size_kwarg), + get_arg_value(split_node, 2, dim_kwarg) or default_dim_value, + ) + + +# noqa: W605 +# ############The pattern to be optimized is######### +# unbind (dim=0) +# / ... 
\ +# getitem getitem -> user=1 +# | | +# split split -> dim=1, user=1, split_section_size=1 +# | | +# getitem getitem -> user=1 +# \ / +# cat (dim=1) -> user=1 +# | + +# ################After transformation############# +# unbind (dim=0) +# / ... \ +# getitem getitem -> user=1 +# \ / +# cat (dim=1) -> user=1 +# | + + +def remove_split_with_size_one( + graph: torch.fx.Graph, + node: torch.fx.Node, + input: torch.fx.Node, +): + # find the grand children of the split_node + next_users = find_next_users(node) + user = next(iter(node.users.keys())) + # replace the users of grand child node with the input node + for next_user in next_users: + next_user.replace_input_with(user, input) + # erase the split node and its child + graph.erase_node(user) + graph.erase_node(node) + + counters["inductor"]["remove_split_with_size_one"] += 1 + + +def normalize_split_base( + match: Match, + _get_split_args: Callable[ + [torch.fx.Node], Tuple[Optional[torch.fx.Node], Optional[Any], Optional[int]] + ], +): + """ + Normalize split with split_size into split_with_sizes, so that we only deal with one type of split in + subsequent optimizations + """ + split_node = match.nodes[0] + graph = match.graph + split_input, split_size, split_dim = _get_split_args(split_node) + if split_input is None or split_dim is None or split_size is None: + log.debug("couldn't find split args") + return + if "example_value" not in split_node.meta: + log.debug("example value absent for node: %s", split_node) + return + assert isinstance(split_node.meta["example_value"], (list, tuple)) + split_sections = [t.size()[split_dim] for t in split_node.meta["example_value"]] + + if any(isinstance(section, torch.SymInt) for section in split_sections): + # TODO dynamic_shapes with assume_static_by_default=False fails while AOT Autograd tracing. 
+ return + # remove the dummy split whose split sections size is one + if len(split_sections) == 1: + remove_split_with_size_one(graph, split_node, split_input) + return + if split_dim < 0: # Normalize split dim + split_dim += split_input.meta["example_value"].dim() + with graph.inserting_after(split_node): + new_split_node = graph.call_function( + torch.split, + args=(split_input, split_sections), + kwargs={"dim": split_dim}, + ) + split_node.replace_all_uses_with(new_split_node) + new_split_node.meta.update(split_node.meta) + graph.erase_node(split_node) + counters["inductor"]["split_cat_norm"] += 1 + + +@register_graph_pattern( + CallFunctionVarArgs(torch.split, users=MULTIPLE), + pass_dict=normalization_pass, + extra_check=config_flag("split_cat_fx_passes"), +) +@register_graph_pattern( + CallMethodVarArgs("split", users=MULTIPLE), + pass_dict=normalization_pass, + extra_check=config_flag("split_cat_fx_passes"), +) +def normalize_split_default(match: Match, *args, **kwargs): + return normalize_split_base(match, _get_split_args_default) + + +@register_graph_pattern( + CallFunctionVarArgs(torch.unbind, users=MULTIPLE), + pass_dict=normalization_pass, + extra_check=config_flag("split_cat_fx_passes"), +) +@register_graph_pattern( + CallMethodVarArgs("unbind", users=MULTIPLE), + pass_dict=normalization_pass, + extra_check=config_flag("split_cat_fx_passes"), +) +def normalize_unbind_default(match: Match, *args, **kwargs): + node = match.nodes[0] + graph = match.graph + input = get_arg_value(node, 0, "input") + dim = get_arg_value(node, 1, "dim") + if dim is None: + axis = node.kwargs.get("axis") + if axis is not None: + dim = axis + else: + dim = 0 + if input is None: + log.debug("couldn't find unbind args") + return + if "example_value" not in input.meta: + log.debug("example value absent for node: %s", input) + return + ndim = input.meta["example_value"].ndim + if dim < 0: # Normalize unbind dim + dim += ndim + with graph.inserting_after(node): + new_node = graph.call_function( + torch.unbind, + args=(input,), + kwargs={"dim": dim}, + ) + node.replace_all_uses_with(new_node) + new_node.meta.update(node.meta) + graph.erase_node(node) + counters["inductor"]["split_cat_norm"] += 1 + + +@register_graph_pattern( + CallFunctionVarArgs(torch.cat, users=MULTIPLE), + pass_dict=normalization_pass, + extra_check=config_flag("split_cat_fx_passes"), +) +def normalize_cat_default(match: Match, *args, **kwargs): + from torch.fx.experimental.symbolic_shapes import guard_size_oblivious + + cat_node = match.nodes[0] + graph = match.graph + tensors = get_arg_value(cat_node, 0, "tensors") + cat_dim = get_arg_value(cat_node, 1, "dim") + if cat_dim is None: + cat_axis = cat_node.kwargs.get("axis") + if cat_axis is not None: + cat_dim = cat_axis + else: + cat_dim = 0 + if tensors is None or cat_dim is None: + log.debug("couldn't find cat args") + return + assert isinstance(tensors, (list, tuple)) + for tensor in itertools.chain([cat_node], tensors): + if "example_value" not in tensor.meta: + log.debug("example value absent for node: %s", tensor) + return + + ndim = cat_node.meta["example_value"].dim() + + def is_empty_tensor(x): + # special case where torch.cat supports cat'ing with an empty tensor + x_shape = x.meta["example_value"].shape + return len(x_shape) == 1 and guard_size_oblivious(x_shape[0] == 0) + + assert all( + ndim == x.meta["example_value"].dim() or is_empty_tensor(x) for x in tensors + ) + + if cat_dim < 0: # Normalize cat dim + cat_dim += ndim + + with graph.inserting_after(cat_node): + 
new_cat_node = graph.call_function( + torch.cat, + args=(tensors,), + kwargs={"dim": cat_dim}, + ) + cat_node.replace_all_uses_with(new_cat_node) + new_cat_node.meta.update(cat_node.meta) + graph.erase_node(cat_node) + counters["inductor"]["split_cat_norm"] += 1 + + +@register_graph_pattern( + CallFunctionVarArgs(torch.stack, users=MULTIPLE), + pass_dict=normalization_pass, + extra_check=config_flag("split_cat_fx_passes"), +) +def normalize_stack_default(match: Match, *args, **kwargs): + node = match.nodes[0] + graph = match.graph + tensors = get_arg_value(node, 0, "tensors") + dim = get_arg_value(node, 1, "dim") or 0 + if tensors is None or dim is None: + log.debug("couldn't find stack args") + return + assert isinstance(tensors, (list, tuple)) + + # A bug in pytorch, some nodes miss the example_value metadata + for tensor in itertools.chain([node], tensors): + if "example_value" not in tensor.meta: + log.debug("example value absent for node: %s", tensor) + return + + ndim = node.meta["example_value"].dim() + if dim < 0: # Normalize dim + dim += ndim + + with graph.inserting_after(node): + new_node = graph.call_function( + node.target, + args=(tensors,), + kwargs={"dim": dim}, + ) + node.replace_all_uses_with(new_node) + new_node.meta.update(node.meta) + graph.erase_node(node) + counters["inductor"]["split_cat_norm"] += 1 + + +def find_next_users(split_node: torch.fx.Node) -> List[torch.fx.Node]: + next_users = [] + for getitem_node in split_node.users.keys(): + for getitem_user in getitem_node.users.keys(): + if getitem_user not in next_users: + next_users.append(getitem_user) + return next_users + + +@register_graph_pattern( + CallMethodVarArgs("squeeze", users=MULTIPLE), + pass_dict=normalization_pass, + extra_check=config_flag("split_cat_fx_passes"), +) +def normalize_squeeze_default(match: Match, *args, **kwargs): + squeeze_node = match.nodes[0] + squeeze_input = get_arg_value(squeeze_node, 0) + + if "dim" in squeeze_node.kwargs: + assert len(squeeze_node.args) == 1 + dim = squeeze_node.kwargs["dim"] + elif len(squeeze_node.args) == 1: + # squeeze(Tensor) + dim = None + elif len(squeeze_node.args) == 2: + # squeeze(Tensor self, int dim) + # squeeze(Tensor self, int[] dim) + dim = squeeze_node.args[1] + else: + # squeeze(Tensor self, int[] dim) (called with varargs) + dim = squeeze_node.args[1:] + + if isinstance(dim, Sequence) and len(dim) == 1: + dim = dim[0] + + with match.graph.inserting_after(squeeze_node): + if dim is None: + new_squeeze_node = match.graph.call_function( + torch.squeeze, args=(squeeze_input,) + ) + else: + new_squeeze_node = match.graph.call_function( + torch.squeeze, args=(squeeze_input,), kwargs={"dim": dim} + ) + squeeze_node.replace_all_uses_with(new_squeeze_node) + match.graph.erase_node(squeeze_node) + + +class TorchSplit(CallFunction): + """ + Matches a call to torch.split if it is in a normalized form. Ensures that all users of + splits are unique getitems. 
+ """ + + def __init__(self, arg, sizes, func=torch.split): + # using KeywordArg("dim") for `dim` checks they all match + super().__init__(func, arg, sizes, _users=MULTIPLE, dim=KeywordArg("dim")) + + def _match(self, node: torch.fx.Node, ctx: MatchContext): + m = super()._match(node, ctx) + if not m: + return m + split_sections = node.args[1] + if not isinstance(split_sections, (list, tuple)): + return FailedMatch("split not normalized") + # check users are all unique getitems + seen_idxs = set() + for user in node.users: + if not CallFunction(operator.getitem, Arg(), Arg()).match(user): + # This should ideally never happen. Split user should always be a getitem + return FailedMatch(f"user of split not a getitem: {user}") + if not isinstance(user.args[1], int): + return FailedMatch("only integer getitems are handled") + if user.args[1] in seen_idxs: + return FailedMatch(f"duplicate getitem {user.args[1]}") + if user.args[-1] < 0: # type: ignore[operator] + # This shouldn't ideally happen as dynamo normalizes indexes to positive + return FailedMatch("negative index") + seen_idxs.add(user.args[1]) + return m + + +@register_graph_pattern( + TorchSplit( + CallFunction( + operator.getitem, + TorchSplit( + KeywordArg("first_split_input"), + KeywordArg("first_split_sections"), + ), + Ignored(), + ), + KeywordArg("next_split_sections"), + ), + pass_dict=merge_splits_pass, + extra_check=config_flag("split_cat_fx_passes"), +) +def merge_splits( + match: Match, + first_split_input: torch.fx.Node, + first_split_sections: List[int], + next_split_sections: List[int], + # Note: dim is implicitly passed by TorchSplit, as it internally uses a pattern with dim + dim: int, +): + node = match.output_node() + # it is possible that the split has no users, + # we check the corner case and skip the pattern + if len(node.users.keys()) == 0: + return + graph = match.graph + first_split = node.args[0].args[0] # type: ignore[union-attr] + next_split_index = node.args[0].args[1] # type: ignore[union-attr] + + new_split_sections = list(first_split_sections) + new_split_sections[next_split_index : next_split_index + 1] = next_split_sections # type: ignore[operator, misc] + + first_split_dim = first_split.kwargs["dim"] # type: ignore[union-attr] + + to_remove = [] + + with graph.inserting_before(first_split): + # Add the new split node + new_split = graph.call_function( + torch.split, + args=(first_split_input, new_split_sections), + kwargs={"dim": first_split_dim}, + ) + first_split_num_to_user = { + user.args[1]: user for user in first_split.users.keys() # type: ignore[union-attr] + } + + new_split_num = 0 + for split_num in range(len(first_split_sections)): + if split_num not in first_split_num_to_user: + new_split_num += 1 + continue + old_getitem = first_split_num_to_user[split_num] + if split_num != next_split_index: + old_getitem.update_arg(0, new_split) + old_getitem.update_arg(1, new_split_num) + new_split_num += 1 + else: + next_split_num_to_user = { + user.args[1]: user for user in node.users.keys() + } + # It is not necessary all getitems from the split node are used. + # We use the num of users to check the getitems to be merged. 
+ for next_split_num in range(len(node.users.keys())): + with graph.inserting_after(new_split): + new_getitem = graph.call_function( + operator.getitem, args=(new_split, new_split_num) + ) + new_split_num += 1 + next_getitem = next_split_num_to_user[next_split_num] + new_getitem.meta.update(next_getitem.meta) + next_getitem.replace_all_uses_with(new_getitem) + to_remove.append(next_getitem) + to_remove.append(node) + to_remove.append(old_getitem) + + to_remove.append(first_split) # type: ignore[arg-type] + for node in to_remove: + graph.erase_node(node) + + counters["inductor"]["consecutive_split_merged"] += 1 + + +class SplitCatSimplifier: + """ + Helper class to simplify split-cat pattern. In simple cases, both split and cat node can be removed in a "split->cat" + pattern. However, there are various cases where they can't and we need to simplify split/ add transforms before cat. + Some such cases are: + 1. Final node has additional args (not coming from the initial split) + 2. Shuffling of args between split/cat + 3. Some final nodes are non-(cat/stack) + 4. Split-dim != cat-dim (but equal split) + + Note that any combination of the above cases can happen. + + To deal with 1, 2, & 3 - we iterate over all users of split. And figure out common "ranges" that can be merged. + Then, we simplify the split accordingly. In the best case, split can be entirely removed. + + To deal with 4, we add some transformations (unflatten + movedim) (See `get_transform_params`). + + Finally, depending on final node being cat or stack, unsqueeze/flatten needs to be added. + + """ + + def simplify( + self, + graph: torch.fx.Graph, + split_node: torch.fx.Node, + split_sections: List[int], + ): + # Find the next users (i.e. users after the getitem) + next_users = find_next_users(split_node) + # Gather inputs of the next users. When inputs come from `split_node`, they are instead represented by + # a tuple indicating the split ranges. See `get_user_input_list` for more details + user_inputs_list = self.get_user_input_list(split_node, next_users) + # Simplify the split_sections based on user_inputs_list. In simpler cases, len(simplified_split_ranges) == 1 and + # we can simply replace the split node. Otherwise, we simplify it. + simplified_split_ranges = self.get_simplified_split_ranges( + split_sections, next_users, user_inputs_list + ) + if not simplified_split_ranges: # Simplification not possible + return + transform_params_list = self.get_transform_params( + split_node, next_users, user_inputs_list + ) + if not transform_params_list: + return + + # Start actual replacement + user_inputs_list_new = self.replace_split( + graph, split_node, split_sections, user_inputs_list, simplified_split_ranges + ) + self.replace_cat( + graph, split_node, next_users, user_inputs_list_new, transform_params_list # type: ignore[arg-type] + ) + self.erase_old_nodes(graph, split_node, next_users) # type: ignore[arg-type] + + def get_user_input_list( + self, split_node: torch.fx.Node, next_users: List[torch.fx.Node] + ) -> List[List[Union[torch.fx.Node, _Range]]]: + """ + Returns list of inputs to the following user nodes, in order. The outer list represents the user node. The inner + list represents the inputs to that particular node. 
This list can either contain + - a tuple representing the ranges of get_items that should go into the cat (closed interval) + - torch.fx.Node representing "other" inputs (which are not coming from our split) + """ + user_inputs_list: List[List[Union[torch.fx.Node, _Range]]] = [] + for user in next_users: + if user.target in {torch.cat, torch.stack}: + user_inputs_list.append(self.get_merged_user_inputs(split_node, user)) + else: + user_inputs_list.append(self.get_non_cat_node_input(split_node, user)) # type: ignore[arg-type] + return user_inputs_list + + def get_merged_user_inputs( + self, split_node: torch.fx.Node, cat_node: torch.fx.Node + ) -> List[Union[torch.fx.Node, _Range]]: + user_inputs = get_arg_value(cat_node, 0, "tensors") + simplified_user_inputs = [] + split_users = set(split_node.users.keys()) + for user_input in user_inputs: + if user_input not in split_users: + simplified_user_inputs.append(user_input) + else: + # Add which "getitem" cat depends on + simplified_user_inputs.append(user_input.args[1]) + return self.merge_consecutive_inputs(simplified_user_inputs) + + def get_non_cat_node_input( + self, split_node: torch.fx.Node, node: torch.fx.Node + ) -> List[_Range]: + """ + Get input for a non cat node in the same format as `get_merged_user_inputs` + """ + node_input = [] + split_users = set(split_node.users.keys()) + for node_arg in node.all_input_nodes: + if node_arg in split_users: + getitem_num = get_arg_value(node_arg, 1) + node_input.append((getitem_num, getitem_num)) + return node_input + + def merge_consecutive_inputs( + self, inputs: List[Union[torch.fx.Node, int]] + ) -> List[Union[torch.fx.Node, _Range]]: + """ + Merge consecutive inputs going into a user node. + + For e.g. + [arg0, 0, 1, 2, arg1] -> [arg0, (0, 2), arg1] + """ + merged_ranges = [] + cur_range = None + for input_ in inputs: + if isinstance(input_, int): + if not cur_range: + cur_range = [input_, input_] + elif input_ == cur_range[1] + 1: + cur_range[1] += 1 + else: + merged_ranges.append(tuple(cur_range)) + cur_range = [input_, input_] + else: + if cur_range: + merged_ranges.append(tuple(cur_range)) + cur_range = None + merged_ranges.append(input_) # type: ignore[arg-type] + if cur_range: + merged_ranges.append(tuple(cur_range)) + return merged_ranges # type: ignore[return-value] + + def get_simplified_split_ranges( + self, + split_sections, + next_users, + user_inputs_list: List[List[Union[torch.fx.Node, _Range]]], + ) -> Optional[List[_Range]]: + ranges = set() + for user_node, user_inputs in zip(next_users, user_inputs_list): + ranges |= { + user_input + for user_input in user_inputs + if isinstance(user_input, tuple) + } + cumulative_sizes = [0] + torch.cumsum(torch.tensor(split_sections), 0).tolist() + split_ranges = sorted( + [(cumulative_sizes[r[0]], cumulative_sizes[r[1] + 1]) for r in ranges] + ) + + if not self.has_non_overlapping_ranges( + split_ranges, + ): # This need not be a strict condition + # However, we keep it now for simplicity. 
+ return None + split_ranges = self.fill_gaps(split_ranges, 0, cumulative_sizes[-1]) + if len(split_sections) == len(split_ranges): # Simplification not possible + return None + counters["inductor"]["scmerge_split_sections_removed"] = len( + split_sections + ) - len(split_ranges) + return split_ranges + + def has_non_overlapping_ranges(self, ranges: List[_Range]) -> bool: + for range_, next_range in zip(ranges, ranges[1:]): + if range_[1] > next_range[0]: + return False + return True + + def fill_gaps(self, ranges: List[_Range], min_: int, max_: int) -> List[_Range]: + cur = min_ + filled_ranges = [] + for a, b in ranges: + if cur < a: + filled_ranges.append((cur, a)) + filled_ranges.append((a, b)) + cur = b + if filled_ranges[-1][1] < max_: + filled_ranges.append((filled_ranges[-1][1], max_)) + return filled_ranges + + def get_transform_params( + self, + split_node: torch.fx.Node, + next_users: List[torch.fx.Node], + user_inputs_list: List[List[Union[torch.fx.Node, _Range]]], + ) -> Optional[List[List[_TransformParam]]]: + """ + Figure out what transforms are needed for each input to each cat node. + + We replace a split node with an unflatten followed by a movedim + """ + split_dim = split_node.kwargs["dim"] + split_sections = split_node.args[1] + transform_params_list: List[List[_TransformParam]] = [] + + for user_node, user_inputs in zip(next_users, user_inputs_list): + if user_node.target not in {torch.cat, torch.stack}: + transform_params_list.append([]) + continue + + cat_dim = get_arg_value(user_node, 1, "dim") + transform_params: List[_TransformParam] = [] + for user_input in user_inputs: + if split_dim == cat_dim and user_node.target == torch.cat: + # No transform needed + transform_params.append((None, None, None, None)) + elif isinstance(user_input, tuple): # Split being simplified + # Verify equal split + subset_split_sections = split_sections[ # type: ignore[index] + user_input[0] : user_input[1] + 1 + ] + # All sections should be equal + if len(set(subset_split_sections)) != 1: + return None + + num_splits = len(subset_split_sections) + unflatten_params = (split_dim, (num_splits, -1)) + movedim_params = ( + (split_dim, cat_dim) if split_dim != cat_dim else None + ) + transform_params.append( + (unflatten_params, movedim_params, None, None) + ) + elif ( + user_node.target == torch.stack or split_dim != cat_dim + ): # We need to unsqueeze inputs not coming through split + transform_params.append((None, None, (cat_dim,), None)) + else: # Non-split inputs + transform_params.append((None, None, None, None)) + transform_params_list.append(transform_params) + return transform_params_list + + def replace_split( + self, + graph: torch.fx.Graph, + split_node: torch.fx.Node, + split_sections: List[int], + user_inputs_list: List[List[Union[torch.fx.Node, _Range]]], + split_ranges: List[_Range], + ) -> List[List[torch.fx.Node]]: + """ + Replace the split node. It can either remove the split node if len(split_ranges) == 1, or simplify it + into a split with lesser sections if len(split_ranges) > 1. + + Returns the new `user_inputs_list`, with tuples replaced with new getitems from the newer split node. 
+ """ + split_input = split_node.args[0] + split_dim = split_node.kwargs["dim"] + if len(split_ranges) == 1: # We can completely eliminate the split node + split_items = [split_input] + else: + with graph.inserting_after(split_node): + new_split = graph.call_function( + torch.split, + args=( + split_input, + [r[1] - r[0] for r in split_ranges], + ), + kwargs={"dim": split_dim}, + ) + new_split.meta.update(split_node.meta) + counters["inductor"]["scmerge_split_added"] += 1 + with graph.inserting_after(new_split): + split_items = [ + graph.call_function(operator.getitem, args=(new_split, i)) + for i in range(len(split_ranges)) + ] + # Now assign the right getitem to the right input + cumulative_sizes = [0] + torch.cumsum(torch.tensor(split_sections), 0).tolist() + new_user_inputs_list = [] + for user_inputs in user_inputs_list: + new_user_inputs = [] + for user_input in user_inputs: + if isinstance(user_input, tuple): + # Find the correct new getitem (present in split_items) + new_user_inputs.append( + split_items[ + split_ranges.index( + ( + cumulative_sizes[user_input[0]], + cumulative_sizes[user_input[1] + 1], + ) + ) + ] + ) + else: + new_user_inputs.append(user_input) + new_user_inputs_list.append(new_user_inputs) + return new_user_inputs_list # type: ignore[return-value] + + def replace_cat( + self, + graph: torch.fx.GraphModule, + split_node: torch.fx.Node, + next_users: List[torch.fx.Node], + user_inputs_list_new, + transform_params_list: List[List[_TransformParam]], + ): + split_dim = split_node.kwargs["dim"] + + split_users = split_node.users.keys() + new_cats = [] + for user_node, user_inputs_new, transform_params in zip( + next_users, user_inputs_list_new, transform_params_list + ): + if user_node.target not in {torch.cat, torch.stack}: + # Change the args and kwargs of non-cat/stack nodes. 
Replace old getitems (belonging to + # the original split node) with the newer getitems + next_cat_input = 0 + for input_node in user_node.all_input_nodes: + if input_node in split_users: + user_node.replace_input_with( + input_node, user_inputs_new[next_cat_input] + ) + next_cat_input += 1 + continue + + # Handle cat/stack user nodes + cat_dim = get_arg_value(user_node, 1, "dim") + user_inputs_new_transformed = [] + # For `unsqueeze` transform, we will combine consecutive inputs with the same unsqueeze params, and stack them + to_stack = [] + stack_dim = None + with graph.inserting_before(user_node): + for user_input_new, transform_param in zip( + user_inputs_new, transform_params + ): + # Apply transforms + ( + unflatten_params, + movedim_params, + unsqueeze_params, + flatten_params, + ) = transform_param + if unsqueeze_params and ( + stack_dim is None or stack_dim == unsqueeze_params[0] + ): + to_stack.append(user_input_new) + stack_dim = unsqueeze_params[0] + continue + elif to_stack: + stacked_input = graph.call_function( + torch.stack, args=(to_stack,), kwargs={"dim": stack_dim} + ) + to_stack = [] + stack_dim = None + user_inputs_new_transformed.append(stacked_input) + if unsqueeze_params: + to_stack.append(user_input_new) + stack_dim = unsqueeze_params[0] + continue + + if unflatten_params: + user_input_new = graph.call_function( + torch.unflatten, args=(user_input_new, *unflatten_params) + ) + if movedim_params: + user_input_new = graph.call_function( + torch.movedim, args=(user_input_new, *movedim_params) + ) + if flatten_params: + user_input_new = graph.call_function( + torch.flatten, args=(user_input_new, *flatten_params) + ) + user_inputs_new_transformed.append(user_input_new) + if to_stack: + stacked_input = graph.call_function( + torch.stack, args=(to_stack,), kwargs={"dim": stack_dim} + ) + user_inputs_new_transformed.append(stacked_input) + + with graph.inserting_after(user_node): + if len(user_inputs_new_transformed) > 1: + new_cat_node = graph.call_function( + torch.cat, + args=(user_inputs_new_transformed,), + kwargs={"dim": cat_dim}, + ) + new_cat_node.meta.update(user_node.meta) + counters["inductor"]["scmerge_cat_added"] += 1 + else: + new_cat_node = user_inputs_new_transformed[-1] + + if ( + user_node.target == torch.cat + and split_dim != cat_dim + and split_node.target == torch.split + ): + with graph.inserting_after(new_cat_node): + new_cat_node = graph.call_function( + torch.flatten, args=(new_cat_node, cat_dim, cat_dim + 1) + ) + user_node.replace_all_uses_with(new_cat_node) + new_cats.append(new_cat_node) + + def erase_old_nodes( + self, + graph: torch.fx.GraphModule, + split_node: torch.fx.Node, + next_users: List[torch.fx.Node], + ): + to_remove = [split_node] + counters["inductor"]["scmerge_split_removed"] += 1 + to_remove.extend(split_node.users.keys()) + for next_user in next_users: + if next_user.target not in {torch.cat, torch.stack}: + continue + counters["inductor"]["scmerge_cat_removed"] += 1 + to_remove.append(next_user) + for node in reversed(to_remove): + graph.erase_node(node) + + +class UnbindCatRemover(SplitCatSimplifier): + """ + Helper class to merge Unbind->Cat/Stack. Many of the cases are similar to SplitCatSimplifier. + + Unbind can't be simplified like splits. So, we can only remove the unbind node. Other than this, + other cases like multiple users, additional args, dim mismatch are similar to `SplitCatSimplifier`, + hence we extend that class. 
+ """ + + def remove_unbind( + self, + graph: torch.fx.Graph, + unbind_node: torch.fx.Node, + ): + num_unbind = ( # type: ignore[operator] + max(getitem_node.args[1] for getitem_node in unbind_node.users.keys()) + 1 # type: ignore[operator, union-attr, type-var] + ) + split_sections = [1 for _ in range(num_unbind)] # type: ignore[operator, arg-type] + + super().simplify(graph, unbind_node, split_sections) + + def get_simplified_split_ranges( + self, + split_sections: List[int], + next_users: List[torch.fx.Node], + user_inputs_list: List[List[Union[torch.fx.Node, _Range]]], + ) -> Optional[List[_Range]]: + simplified_split_ranges = super().get_simplified_split_ranges( + split_sections, next_users, user_inputs_list + ) + if not simplified_split_ranges or len(simplified_split_ranges) != 1: + return None + return simplified_split_ranges + + def get_transform_params( + self, + unbind_node: torch.fx.Node, + next_users: List[torch.fx.Node], + user_inputs_list: List[List[Union[torch.fx.Node, _Range]]], + ) -> Optional[List[List[_TransformParam]]]: + """ + Figure out what transforms are needed for each input to each cat node. + + Here is the rough transforms we apply: + + x -> unbind -> stack => x -> movedim + + x -> unbind -> cat => x -> movedim -> flatten + + When cat/stack nodes have additional args: + + addn ---| addn -> unsqueeze ---| + x -> unbind -> stack => x -> movedim -> cat + + addn ---| addn ---| + x -> unbind -> cat => x -> movedim -> flatten -> cat + + (Note application of these depends on the dims as well) + + + """ + split_dim = unbind_node.kwargs["dim"] + transform_params_list: List[List[_TransformParam]] = [] + for user_node, user_inputs in zip(next_users, user_inputs_list): + cat_dim = get_arg_value(user_node, 1, "dim") or 0 + transform_params: List[_TransformParam] = [] + for user_input in user_inputs: + if isinstance(user_input, tuple): + # User input is coming from unbind + movedim_params = ( + (split_dim, cat_dim) if split_dim != cat_dim else None + ) + flatten_params = None + if user_node.target == torch.cat: + flatten_params = (cat_dim, cat_dim + 1) + transform_params.append( + (None, movedim_params, None, flatten_params) + ) + elif ( + user_node.target == torch.stack + ): # We need to unsqueeze inputs not coming through unbind into cat + transform_params.append((None, None, (cat_dim,), None)) + else: # Non-unbind inputs + transform_params.append((None, None, None, None)) + transform_params_list.append(transform_params) + return transform_params_list + + +class GetItem(CallFunction): + def __init__(self, arg, index, _users=1): + super().__init__(operator.getitem, arg, index, _users=_users) + + def find_anchor_nodes(self, ctx: MatchContext, searched: Set[torch.fx.Node]): + # We generally match GetItem with arg being an Arg(). So, we never return the anchor + # nodes as the stored node in ctx.pattern_to_node is returned. 
Here we override find_anchor_nodes + # to not use ctx.pattern_to_node + for pattern in self.flat_args_kwargs[0]: + if isinstance(pattern, PatternExpr): + for other_node in pattern.find_anchor_nodes(ctx, searched): + if not isinstance(other_node, torch.fx.Node): + continue + for node in other_node.users: + if node not in searched: + if self._match_fns(node): + yield node + searched.add(node) + + +@register_graph_pattern( + RepeatedExpr( + CallFunction( + torch.squeeze, + GetItem( + TorchSplit( + KeywordArg("split_input"), + KeywordArg("split_sizes"), + ), + Ignored(), + ), + KeywordArg("dim"), + _users=MULTIPLE, + ), + ), + pass_dict=split_cat_pass, + extra_check=config_flag("split_cat_fx_passes"), +) +@register_graph_pattern( + RepeatedExpr( + CallFunction( + torch.squeeze, + GetItem( + TorchSplit( + KeywordArg("split_input"), + KeywordArg("split_sizes"), + ), + Ignored(), + ), + dim=KeywordArg("dim"), + _users=MULTIPLE, + ) + ), + pass_dict=split_cat_pass, + extra_check=config_flag("split_cat_fx_passes"), +) +def merge_split_squeeze( + match: Match, split_input: torch.fx.Node, split_sizes: List[int], dim: int +): + graph = match.graph + split = next(node for node in match.nodes if node.target == torch.split) + if not all(s == 1 for s in split_sizes): + return + if isinstance(dim, Sequence): + return + next_users = find_next_users(split) + if not all(node.target == torch.squeeze for node in next_users): + return + with graph.inserting_before(match.output_node()): + unbind = graph.call_function( + torch.unbind, args=(split_input,), kwargs={"dim": dim} + ) + for item_index, getitem_node in sorted( + [ + (getitem_node.args[1], getitem_node) + for getitem_node in split.users.keys() + ] + ): + squeeze = next(iter(getitem_node.users.keys())) + new_get_item = graph.call_function( + operator.getitem, args=(unbind, item_index) + ) + squeeze.replace_all_uses_with(new_get_item) + new_get_item.meta.update(squeeze.meta) + graph.erase_node(squeeze) + graph.erase_node(getitem_node) + graph.erase_node(split) + counters["inductor"]["split_squeeze_replaced"] += 1 + + +getitem_unbind = ListOf( + GetItem( + CallFunction( + torch.unbind, + KeywordArg("unbind_input"), + dim=KeywordArg("dim"), + _users=MULTIPLE, + ), + Ignored(), + _users=MULTIPLE, + ), + partial=True, +) + + +@register_graph_pattern( + CallFunction([torch.stack, torch.cat], getitem_unbind, Ignored(), _users=MULTIPLE), + pass_dict=unbind_stack_pass, + extra_check=config_flag("split_cat_fx_passes"), +) +@register_graph_pattern( + CallFunction( + [torch.stack, torch.cat], getitem_unbind, dim=Ignored(), _users=MULTIPLE + ), + pass_dict=unbind_stack_pass, + extra_check=config_flag("split_cat_fx_passes"), +) +@register_graph_pattern( + CallFunction( + [torch.stack, torch.cat], tensors=getitem_unbind, dim=Ignored(), _users=MULTIPLE + ), + pass_dict=unbind_stack_pass, + extra_check=config_flag("split_cat_fx_passes"), +) +def merge_unbind_stack(match: Match, unbind_input: torch.fx.Node, dim: int): + unbind_node = next(node for node in match.nodes if node.target == torch.unbind) + UnbindCatRemover().remove_unbind(match.graph, unbind_node) + + +getitem_split = ListOf( + CallFunction( + operator.getitem, + TorchSplit( + Ignored(), + KeywordArg("split_sections"), + ), + Ignored(), + _users=MULTIPLE, + ), + partial=True, +) + + +@register_graph_pattern( + CallFunction( + [torch.stack, torch.cat], + tensors=getitem_split, + dim=Ignored(), + _users=MULTIPLE, + ), + pass_dict=split_cat_pass, + extra_check=config_flag("split_cat_fx_passes"), +) 
+@register_graph_pattern( + CallFunction( + [torch.stack, torch.cat], + getitem_split, + dim=Ignored(), + _users=MULTIPLE, + ), + pass_dict=split_cat_pass, + extra_check=config_flag("split_cat_fx_passes"), +) +@register_graph_pattern( + CallFunction( + [torch.stack, torch.cat], + getitem_split, + Ignored(), + _users=MULTIPLE, + ), + pass_dict=split_cat_pass, + extra_check=config_flag("split_cat_fx_passes"), +) +def simplify_split_cat(match: Match, split_sections: List[int], dim: int): + if not isinstance(split_sections, (list, tuple)): # Unnormalized split + return + split_node = next(node for node in match.nodes if node.target == torch.split) + SplitCatSimplifier().simplify(match.graph, split_node, split_sections) + + +# noqa: W605 +# ############pattern to be optimized is######### + +# split_node(dim=1) +# / \ ... / \ +# getitem getitem getitem getitem -> user=1 +# \ / \ / +# cat (user=mul, dim=1) cat(user=mul, dim=1) +# | \ | \ + +# ################after transformation############# + +# split_node(dim=1) +# / ... \ +# getitem getitem +# | \ | \ + + +def has_same_parent_node(node: torch.fx.Node): + # the input nodes of the node should come from the same parent + prev_node = None + for getitem in node.args[0]: # type: ignore[union-attr] + if getitem.target != operator.getitem: # type: ignore[union-attr] + return False + if prev_node is None: + prev_node = getitem.args[0] # type: ignore[union-attr] + else: + if getitem.args[0] != prev_node: + return False + return True + + +def remove_zeros(split_sections: List[int]): + """ + Remove zeros from the list and get the index mapping dict from getitem + in split node to getitem in new split node + """ + new_split_sections, index_mapping = [], {} + idx = 0 + for i in range(len(split_sections)): + if split_sections[i] > 0: + new_split_sections.append(split_sections[i]) + index_mapping[i] = idx + idx += 1 + + return new_split_sections, index_mapping + + +def is_sorted_and_consecutive(arr: List[int]) -> bool: + # check if the array is sorted + if arr == sorted(arr): + # check if the differences between adjacent elements are all 1 + return all(x[1] - x[0] == 1 for x in zip(arr, arr[1:])) + else: + return False + + +def calculate_fused_tensor_size(split_node: torch.fx.Node, indices: List[int]) -> int: + """ + Calculate the fused tensor size in the indices + """ + fused_tensor_size = 0 + for i in range(len(split_node.args[1])): # type: ignore[arg-type] + if i in indices: + fused_tensor_size += split_node.args[1][i] # type: ignore[operator, assignment, index] + return fused_tensor_size + + +@register_graph_pattern( + CallFunction( + torch.cat, + getitem_split, + dim=Ignored(), + _users=MULTIPLE, + ), + pass_dict=merge_getitem_cat_pass, + extra_check=config_flag("split_cat_fx_passes"), +) +def merge_getitem_cat(match: Match, split_sections: List[int], dim: int): + if not isinstance(split_sections, (list, tuple)): # Unnormalized split + return + graph = match.graph + split_node = next(node for node in match.nodes if node.target == torch.split) + split_input, split_size, split_dim = _get_split_args_default(split_node) + # if the cat and split have different dims, return + # Find the next users (i.e. users after the getitem) + next_users = find_next_users(split_node) + # 'immutable_list' object does not support mutation. 
Create a new copy of it + split_sections = list(split_sections) + for cat_user in next_users: + if cat_user.target == torch.cat: + cat_dim = get_arg_value(cat_user, 1, "dim") + # check the all getitems in the cat_user from the same node + # check the input of the cat has all getitem from the split + # check all getitem only has one single user + if ( + split_dim != cat_dim + or not has_same_parent_node(cat_user) + or not all(len(arg.users) == 1 for arg in cat_user.args[0]) # type: ignore[union-attr] + ): + continue + # find the index of getitems to be cated/stacked + indices = [] + for arg in cat_user.args[0]: # type: ignore[union-attr] + indices.append(arg.args[1]) # type: ignore[union-attr] + # the gettitems to be merged must be consecutive, otherwise + # returned sliced tensor could be wrong + if not is_sorted_and_consecutive(indices): + continue + # update the arg of cat user, only keep the first getitem + cat_user.update_arg(0, cat_user.args[0][0]) # type: ignore[index] + # calculate the fused tensor sizes in the indices + fused_tensor_size = 0 + for i in range(len(split_node.args[1])): # type: ignore[arg-type] + if i in indices: + fused_tensor_size += split_node.args[1][i] # type: ignore[operator, assignment, index] + # update the split sections + split_sections[indices[0]] = calculate_fused_tensor_size( + split_node, indices + ) + # padding others with zeros to keep the same dict size + for i in indices[1:]: + split_sections[i] = 0 + # remove all unused indexes in the split_node + new_split_sections, index_mapping = remove_zeros(split_sections) + with graph.inserting_after(split_node): + new_split_node = graph.call_function( + torch.split, + args=(split_input, split_sections), + kwargs={"dim": split_dim}, + ) + split_node.replace_all_uses_with(new_split_node) + new_split_node.meta.update(split_node.meta) + # remove all unused getitem nodes + to_remove = [cat_user] + # dictionary keys changed during iteration + new_split_getitem_nodes = list(new_split_node.users.keys()) + for getitem_node in new_split_getitem_nodes: + if getitem_node.args[1] in indices[1:]: + to_remove.append(getitem_node) + # update meta data of getitem + elif getitem_node.args[1] == indices[0]: + cat_user.replace_all_uses_with(getitem_node) + getitem_node.meta.update(cat_user.meta) + else: + # update getitem index for new split node + getitem_node.update_arg(1, index_mapping[getitem_node.args[1]]) + graph.erase_node(split_node) + for getitem_node in to_remove: + graph.erase_node(getitem_node) + # update the split sections of new split node + new_split_node.update_arg(1, new_split_sections) + split_node = new_split_node + split_sections = new_split_sections + + counters["inductor"]["getitem_cat_merged"] += 1 + + +# ############pattern to be optimized is######### + +# split_node(dim=1) -> user=multiple +# / \ ... / \ +# getitem getitem getitem getitem -> user=multiple +# \ \ / \ +# other_op /cat(user=mul, dim=1) other_op +# | + +# ################after transformation############# + +# split_node(dim=1) -> -> user=multiple +# / \ ... 
/ \ +# getitem getitem getitem getitem -> user=multiple +# \ \ / \ +# other_op + + +@register_graph_pattern( + CallFunction( + torch.cat, + getitem_split, + dim=Ignored(), + _users=MULTIPLE, + ), + pass_dict=split_cat_pass, + extra_check=config_flag("split_cat_fx_passes"), +) +def mutate_cat_node(match: Match, split_sections: List[int], dim: int): + if not isinstance(split_sections, (list, tuple)): # Unnormalized split + return + graph = match.graph + split_node = next(node for node in match.nodes if node.target == torch.split) + split_input, split_size, split_dim = _get_split_args_default(split_node) + # if the cat and split have different dims, return + # Find the next users (i.e. users after the getitem) + next_users = find_next_users(split_node) + for cat_user in next_users: + if cat_user.target == torch.cat: + cat_dim = get_arg_value(cat_user, 1, "dim") or 0 + # check that all getitems in the cat_user from the same node + # check the input of the cat has all getitem from the split + if split_dim != cat_dim or not has_same_parent_node(cat_user): + continue + # find the index of getitems to be cat + indices, idx_to_getitem = [], {} + for getitem in cat_user.args[0]: # type: ignore[union-attr] + indices.append(getitem.args[1]) # type: ignore[union-attr] + idx_to_getitem[getitem.args[1]] = getitem # type: ignore[union-attr] + # the gettitems to be merged must be consecutive, otherwise + # returned sliced tensor could be wrong + if not is_sorted_and_consecutive(indices): + continue + # case 1: the cat uses all getitems from the split + if len(split_sections) == len(cat_user.args[0]): # type: ignore[arg-type] + # replace the users of the cat node to be the input of the split node + cat_user.replace_all_uses_with(split_node.args[0]) + # remove the cat node + graph.erase_node(cat_user) + counters["inductor"]["cat_mutated"] += 1 + # case 2: the cat uses some getitems from the split + elif is_node_meta_valid(split_node.args[0]): # type: ignore[arg-type] + # check the split dim, and construct the slice tuple + start_fused_size = calculate_fused_tensor_size( + split_node, list(range(indices[0])) + ) + end_fused_size = start_fused_size + calculate_fused_tensor_size( + split_node, indices + ) + slice_list = [] + for i in range(len(split_node.args[0].meta["example_value"].shape)): # type: ignore[union-attr] + if i != split_dim: + slice_list.append(slice(None, None, None)) + else: + slice_list.append(slice(start_fused_size, end_fused_size, None)) + with graph.inserting_after(split_node): + slice_node = graph.call_function( + operator.getitem, + args=(split_node.args[0], tuple(slice_list)), + ) + cat_user.replace_all_uses_with(slice_node) + slice_node.meta.update(cat_user.meta) + + # remove the cat node + graph.erase_node(cat_user) + counters["inductor"]["cat_mutated"] += 1 + + +# noqa: W605 +# ############The pattern to be optimized is######### +# split_node (dim=1) +# / ... \ ... / \ +# getitem getitem getitem getitem -> user=1 +# \ / +# stack (dim=0) -> user=1, getitems to be consecutive +# | +# tahn -> user=1 +# | +# unbind (dim=0) +# | + +# ################After transformation############# +# split_node (dim=1) +# / ... 
/ \ +# getitem getitem getitem -> user=1 +# | +# tahn +# | +# split +# | + + +@register_graph_pattern( + CallFunction( + torch.tanh, + CallFunction( + torch.stack, + getitem_split, + dim=Ignored(), + ), + ), + pass_dict=merge_getitem_cat_pass, + extra_check=config_flag("split_cat_fx_passes"), +) +@register_graph_pattern( + CallFunction( + torch.tanh, + CallFunction( + torch.stack, + tensors=getitem_split, + dim=Ignored(), + ), + ), + pass_dict=merge_getitem_cat_pass, + extra_check=config_flag("split_cat_fx_passes"), +) +@register_graph_pattern( + CallFunction( + torch.tanh, + CallFunction( + torch.stack, + getitem_split, + Ignored(), + ), + ), + pass_dict=merge_getitem_cat_pass, + extra_check=config_flag("split_cat_fx_passes"), +) +def merge_stack_tahn_unbind(match: Match, split_sections: List[int], dim: int): + if not isinstance(split_sections, (list, tuple)): # Unnormalized split + return + graph = match.graph + split_node = next(node for node in match.nodes if node.target == torch.split) + split_input, split_size, split_dim = _get_split_args_default(split_node) + # Find the next users (i.e. users after the getitem) + next_users = find_next_users(split_node) + # 'immutable_list' object does not support mutation. Create a new copy of it + split_sections = list(split_sections) + for user in next_users: + # stack user only has one user + if user.target == torch.stack: + stack_dim = get_arg_value(user, 1, "dim") or 0 + unbind_user = find_next_users(user)[0] + if unbind_user.target != torch.unbind: + continue + unbind_dim = get_arg_value(unbind_user, 1, "dim") or 0 + # stack and unbind should have the same dim + # check the all getitems in the user from the same node + # check all the getitems only has single user + if ( + stack_dim != unbind_dim + or not has_same_parent_node(user) + or not all(len(arg.users) == 1 for arg in user.args[0]) # type: ignore[union-attr] + ): + continue + # find the index of getitems to be stacked + indices = [] + split_sections_for_unbind = [] + for arg in user.args[0]: # type: ignore[union-attr] + indices.append(arg.args[1]) # type: ignore[union-attr] + split_sections_for_unbind.append(split_sections[arg.args[1]]) # type: ignore[union-attr] + # the gettitems to be merged must be consecutive, otherwise + # returned sliced tensor could be wrong + if not is_sorted_and_consecutive(indices): + continue + # update the arg of stack user, only keep the first getitem + user.update_arg(0, user.args[0][0]) # type: ignore[index] + # calculate the fused tensor sizes in the indices + fused_tensor_size = 0 + for i in range(len(split_node.args[1])): # type: ignore[arg-type] + if i in indices: + fused_tensor_size += split_node.args[1][i] # type: ignore[operator, index, assignment] + # update the split sections + split_sections[indices[0]] = calculate_fused_tensor_size( + split_node, indices + ) + # padding others with zeros to keep the same dict size + for i in indices[1:]: + split_sections[i] = 0 + # remove all unused indexes in the split_node + new_split_sections, index_mapping = remove_zeros(split_sections) + with graph.inserting_after(split_node): + new_split_node = graph.call_function( + torch.split, + args=(split_input, split_sections), + kwargs={"dim": split_dim}, + ) + replace_unbind_with_split = graph.call_function( + torch.split, + args=(unbind_user.args[0], split_sections_for_unbind), + kwargs={"dim": split_dim}, + ) + unbind_user.replace_all_uses_with(replace_unbind_with_split) + replace_unbind_with_split.meta.update(unbind_user.meta) + # remove getitem and split, 
stack + split_node.replace_all_uses_with(new_split_node) + new_split_node.meta.update(split_node.meta) + # remove all unused getitem nodes + to_remove = [unbind_user] + # dictionary keys changed during iteration + new_split_getitem_nodes = list(new_split_node.users.keys()) + for getitem_node in new_split_getitem_nodes: + if getitem_node.args[1] in indices[1:]: + to_remove.append(getitem_node) + # update meta data of getitem + elif getitem_node.args[1] == indices[0]: + user.replace_all_uses_with(getitem_node) + getitem_node.meta.update(user.meta) + else: + # update getitem index for new split node + getitem_node.update_arg(1, index_mapping[getitem_node.args[1]]) + graph.erase_node(split_node) + graph.erase_node(user) + for getitem_node in to_remove: + graph.erase_node(getitem_node) + # update the split sections of new split node + new_split_node.update_arg(1, new_split_sections) + split_node = new_split_node + split_sections = new_split_sections + + counters["inductor"]["stack_tahn_unbind_merged"] += 1 diff --git a/MLPY/Lib/site-packages/torch/_inductor/fx_utils.py b/MLPY/Lib/site-packages/torch/_inductor/fx_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3ed09b35b6fbba51eb61076045cd56371dd4b6de --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/fx_utils.py @@ -0,0 +1,220 @@ +import operator +from collections import defaultdict +from typing import Any, Callable, DefaultDict, Dict, Optional, Tuple, Type + +import torch +import torch.fx +from torch.fx.experimental.symbolic_shapes import statically_known_true, sym_eq +from torch.utils import _pytree as pytree +from torch.utils._pytree import tree_map +from .virtualized import V + + +# Check the pattern: (nn.module, F.function/torch.Tensor.method) matched. +# Works for length 2 patterns with 1 module and 1 function/method. +def matches_module_function_pattern( + pattern: Tuple[Type[torch.nn.modules.Module], Callable[..., Any]], + node: torch.fx.node.Node, + modules: Dict[str, torch.nn.modules.Module], +) -> bool: + if len(node.args) == 0: + return False + if not isinstance(node.args[0], torch.fx.Node) or not isinstance( + node, torch.fx.Node + ): + return False + # the first node is call_module + if node.args[0].op != "call_module": + return False + if not isinstance(node.args[0].target, str): + return False + if node.args[0].target not in modules: + return False + if type(modules[node.args[0].target]) is not pattern[0]: + return False + # the second node is call_function or call_method + if node.op != "call_function" and node.op != "call_method": + return False + if node.target != pattern[1]: + return False + # make sure node.args[0] output is only used by current node. + if len(node.args[0].users) > 1: + return False + return True + + +class FakeTensorUpdater: + """ + The main idea here is that it's difficult to maintain accurate fake + tensors (our primary form of metadata) for each node in our graph as we + transform it. + + The most reliable way to obtain this information is by rerunning + faketensor propagation. However, in general, faketensor propagation is + fairly expensive. So, instead we'd like to only rerun faketensor + propagation on nodes that have changed. + + In order to detect which nodes have changed, we first hash its node, + target, and argument lists (which are immutable in FX). + + Then, whenever we call incremental_update, we check which FX nodes have a + new hash, and recompute the faketensor metadata for that node. 
Then, we + continue to recursively compute the faketensors for all users until the + fake tensors stop changing. + """ + + def __init__(self, graph: torch.fx.Graph): + self.processed_hashes = set() + self.graph = graph + + for node in self.graph.nodes: + self.processed_hashes.add(self.hash_node(node)) + + def hash_node(self, node: torch.fx.Node): + # todo(chilli): Not a great hash function + return (node, node.target, id(node.args), id(node.kwargs)) + + def incremental_update(self): + processed = set() + existing_storages: DefaultDict[Optional[int], int] = defaultdict(int) + for node in self.graph.nodes: + existing_storages[get_node_storage(node)] += 1 + + def is_intlist_same(new, old): + return statically_known_true(sym_eq(new, old)) + + def is_fake_tensor_same(new, old): + if type(new) != type(old): + return False + if isinstance(new, (list, tuple)): + if len(new) != len(old): + return False + return all( + is_fake_tensor_same(new_i, old_i) for new_i, old_i in zip(new, old) + ) + assert isinstance(new, torch.Tensor) + if not is_intlist_same(new.shape, old.shape) or new.layout != old.layout: + return False + if new.layout == torch.strided and ( + not is_intlist_same(new.stride(), old.stride()) + or not statically_known_true( + new.storage_offset() == old.storage_offset() + ) + ): + return False + + if get_storage(new) == get_storage(old): + return True + + # This is the case where it returns a completely fresh storage that's used nowhere else. + if ( + existing_storages[get_storage(old)] == 1 + and get_storage(new) not in existing_storages + ): + return True + return False + + for node in self.graph.nodes: + if self.hash_node(node) in self.processed_hashes: + continue + + def is_aten_node(node): + return node.op == "call_function" and isinstance( + node.target, torch._ops.OpOverload + ) + + if not is_aten_node(node): + continue + + processing = [node] + while len(processing) > 0: + updating_node = processing.pop() + if updating_node in processed: + continue + if is_aten_node(updating_node): + continue + + is_valid, args, kwargs = get_fake_args_kwargs(updating_node) + if not is_valid: + continue + with V.fake_mode: + new_fake_tensor = updating_node.target(*args, **kwargs) + if "val" in updating_node.meta and is_fake_tensor_same( + new_fake_tensor, updating_node.meta["val"] + ): + continue + updating_node.meta["val"] = new_fake_tensor + + # todo(chilli): This code path is not exercised by our existing + # tests - add a test + existing_storages[get_node_storage(new_fake_tensor)] += 1 + processed.add(updating_node) + processing.extend(updating_node.users) + + self.processed_hashes.add(self.hash_node(updating_node)) + + +def get_storage(t: torch.Tensor) -> int: + return t.untyped_storage()._cdata + + +def get_node_storage(node: torch.fx.Node) -> Optional[int]: + if "val" not in node.meta: + return None + if not isinstance(node.meta["val"], torch.Tensor): + return None + if not torch._C._has_storage(node.meta["val"]): + return None + return get_storage(node.meta["val"]) + + +def get_fake(x): + if isinstance(x, torch.fx.Node): + if "val" not in x.meta: + return x + return x.meta["val"] + return x + + +def get_fake_args_kwargs(x: torch.fx.Node) -> Tuple[bool, Tuple[Any], Dict[str, Any]]: + """ + First value returns a boolean if any of the input nodes don't have a faketensor. 
+ """ + args, kwargs = tree_map(get_fake, (x.args, x.kwargs)) + if any( + isinstance(a, torch.fx.Node) for a in pytree.arg_tree_leaves(*args, **kwargs) + ): + return False, args, kwargs + return True, args, kwargs + + +def is_node_realized(node: torch.fx.Node) -> bool: + """Returns true if a node is always realized when lowered to inductor IR. + + NOTE: This may return some false negatives. e.g. it doesn't + handle buffers realized heuristically during lowering, or + buffers realized indirectly through view ops. + """ + from torch._inductor.lowering import fallbacks, needs_realized_inputs + + def is_buffer(node: torch.fx.Node) -> bool: + if node.op == "call_function" and node.target is operator.getitem: + # For nodes with multiple outputs, we get the fx graph: + # foo = torch.ops.aten.foo(...) + # getitem = foo[0] + # getitem_1 = foo[1] + # where we need to check if foo is a fallback kernel + return is_buffer(node.args[0]) # type: ignore[arg-type] + return node.op in ("placeholder", "output") or node.target in fallbacks + + if is_buffer(node): + return True + + def realizes_inputs(node: torch.fx.Node) -> bool: + return node.op == "output" or node.target in needs_realized_inputs + + if any(realizes_inputs(user) for user in node.users): + return True + + # Otherwise, assume node isn't realized + return False diff --git a/MLPY/Lib/site-packages/torch/_inductor/graph.py b/MLPY/Lib/site-packages/torch/_inductor/graph.py new file mode 100644 index 0000000000000000000000000000000000000000..ed21547c5da3cc2259c5026632440c169309a3a6 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/graph.py @@ -0,0 +1,1324 @@ +import itertools +import logging +import operator +import os +import re +import sys +import time +from collections import defaultdict +from contextlib import contextmanager +from typing import Any, Callable, DefaultDict, Dict, List, Optional, Set, Tuple + +import sympy + +import torch +import torch._logging +import torch.fx +from torch._decomp import get_decompositions +from torch._dynamo.utils import defake, dynamo_timed +from torch._logging import LazyString, trace_structured +from torch._subclasses.fake_tensor import FakeTensor +from torch.fx.experimental._backward_state import BackwardState +from torch.fx.experimental.sym_node import magic_methods, method_to_operator +from torch.fx.experimental.symbolic_shapes import has_free_symbols, ShapeEnv, SymTypes +from torch.utils._mode_utils import no_dispatch + +from . 
import config, ir +from .codegen.common import ( + DeviceOpOverrides, + get_device_op_overrides, + get_scheduling_for_device, + get_wrapper_codegen_for_device, + register_backend_for_device, +) +from .codegen.cpp_wrapper_cpu import CppWrapperCpu +from .codegen.cpp_wrapper_cuda import CppWrapperCuda +from .codegen.wrapper import WrapperCodeGen +from .exc import ( + CppWrapperCodeGenError, + LoweringException, + MissingOperatorWithDecomp, + MissingOperatorWithoutDecomp, +) +from .ir import ( + Constant, + FixedLayout, + InputBuffer, + Pointwise, + Reduction, + StorageBox, + TensorBox, +) +from .lowering import ( + constrain_to_fx_strides, + FALLBACK_ALLOW_LIST, + fallback_handler, + fallback_node_due_to_unsupported_type, + layout_constraints, + lowerings, + make_fallback, + needs_realized_inputs, + unsupported_output_tensor, +) +from .sizevars import SizeVarAllocator +from .utils import convert_shape_to_inductor, gather_origins, get_sympy_Expr_dtype +from .virtualized import V + +log = logging.getLogger(__name__) +perf_hint_log = torch._logging.getArtifactLogger(__name__, "perf_hints") +output_code_log = torch._logging.getArtifactLogger(__name__, "output_code") + + +if config.is_fbcode(): + from torch._inductor.fb.utils import log_module_code +else: + + def log_module_code(*args, **kwargs): + pass + + +def supported_dtype_of_cpp_wrapper(dtype, cuda): + supported_dtype = { + torch.float32, + torch.float64, + torch.int64, + torch.int32, + torch.int16, + torch.int8, + torch.uint8, + torch.bool, + torch.bfloat16, + torch.complex32, + torch.complex64, + torch.complex128, + torch.float16, + } + if cuda: + supported_dtype.add(torch.float8_e4m3fn) + supported_dtype.add(torch.float8_e5m2) + supported_dtype.add(torch.float8_e4m3fnuz) + supported_dtype.add(torch.float8_e5m2fnuz) + + return dtype in supported_dtype + + +def may_get_constant_buffer_dtype(constant_buffer): + assert isinstance( + constant_buffer, (sympy.Symbol, sympy.Expr, sympy.core.numbers.Integer) + ), "get_constant_buffer_dtype only supports input of sympy.Symbol, sympy.Expr or sympy.core.numbers.Integer" + if isinstance(constant_buffer, sympy.core.numbers.Integer): + return torch.int64 + + if isinstance(constant_buffer, sympy.Expr): + return get_sympy_Expr_dtype(constant_buffer) + + if constant_buffer.is_integer: + return torch.int64 + elif constant_buffer.is_float: + return torch.float32 + else: + return None + + +def is_magic_method(op): + magic_ops = {method_to_operator(m) for m in magic_methods} + return op in magic_ops + + +def getattr_recursive(obj, target): + target_atoms = target.split(".") + attr_itr = obj + for i, atom in enumerate(target_atoms): + if not hasattr(attr_itr, atom): + raise RuntimeError( + f"Node referenced nonexistent target {'.'.join(target_atoms[:i])}" + ) + attr_itr = getattr(attr_itr, atom) + return attr_itr + + +class GraphLowering(torch.fx.Interpreter): + graph_outputs: List[ir.IRNode] + + def symbolic_sizes_strides(self, ex: torch.Tensor): + """ + Support dynamic shapes and dynamic strides by assigning variables + to each dimension. We duck-shape tensors, so if two tensors + have the same size they get assigned the same symbolic variable. + """ + if self.reuse_shape_env: + return convert_shape_to_inductor(ex.size()), convert_shape_to_inductor( + ex.stride() + ) + else: + from torch._dynamo.source import ConstantSource + + # TODO: this should not be needed once #93059 lands + # https://github.com/pytorch/pytorch/pull/94031#discussion_r1096044816 + # TODO: make a dedicated UnknownSource for this? 
+ # NB: This is using the legacy default behavior from + # create_symbolic_sizes_strides_storage_offset but we hope we can + # just delete this entirely + source = ConstantSource( + f"__inductor_unknown_tensor_{len(self._shape_env.var_to_val)}" + ) + ( + size, + stride, + _, + ) = self._shape_env.create_symbolic_sizes_strides_storage_offset( + ex, + source, + ) + + size = [i.node.expr if isinstance(i, torch.SymInt) else i for i in size] + stride = [i.node.expr if isinstance(i, torch.SymInt) else i for i in stride] + return size, stride + + def static_sizes_strides(self, ex: torch.Tensor): + """ + Primarily used to weights + """ + size = [sympy.Integer(i) for i in ex.size()] + stride = [sympy.Integer(i) for i in ex.stride()] + return size, stride + + def init_backend_registration(self): + if get_scheduling_for_device("cpu") is None: + from .codegen.cpp import CppScheduling + + register_backend_for_device("cpu", CppScheduling, WrapperCodeGen) + + if get_scheduling_for_device("cuda") is None: + from .codegen.cuda_combined_scheduling import CUDACombinedScheduling + + # CUDACombinedScheduling combines Triton and CUDA C++ scheduling for CUDA devices via delegation + register_backend_for_device("cuda", CUDACombinedScheduling, WrapperCodeGen) + + def __init__( + self, + gm: torch.fx.GraphModule, + example_inputs: Optional[List[torch.Tensor]] = None, + shape_env=None, + num_static_inputs=None, + graph_id=None, + cpp_wrapper=False, + aot_mode=False, + user_visible_outputs=frozenset(), + layout_opt=None, + extern_node_serializer=None, + is_inference=False, + is_const_graph=False, + const_output_index=None, + const_code=None, + const_module=None, + name=None, + ): + super().__init__(gm) + + self.example_inputs = example_inputs + self.layout_opt = ( + layout_opt + if layout_opt is not None + else self.decide_layout_opt(gm, is_inference=is_inference) + ) + self.num_channels_last_conv = 0 + self.is_inference = is_inference + self.is_const_graph = is_const_graph + self.const_code = const_code + self.const_module = const_module + + self.extra_traceback = False # we do our own error wrapping + if shape_env is None: + shape_env = ShapeEnv() + self.reuse_shape_env = False + else: + self._shape_env = shape_env + self.reuse_shape_env = True + self._shape_env = shape_env + self.sizevars = SizeVarAllocator(shape_env) + self.graph_input_names: List[str] = [] + self.graph_inputs: Dict[str, TensorBox] = {} + self.graph_inputs_original: Dict[str, InputBuffer] = {} + self.device_types: Set[str] = ( + const_module.device_types if const_module else set() + ) + self.device_idxs: Set[int] = const_module.device_idxs if const_module else set() + self.cuda = False + self.buffers: List[ir.Buffer] = [] + self.const_output_index: Dict[str, int] = ( + const_output_index if const_output_index else {} + ) + self.folded_constants: Set[str] = ( + set(const_output_index.keys()) if const_output_index else set() + ) + self.constants: Dict[str, torch.Tensor] = ( + const_module.constants if const_module else {} + ) + self.constant_reprs: Dict[str, str] = {} + self.removed_buffers: Set[str] = set() + self.removed_inplace_buffers: Set[str] = set() + self.mutated_buffers: Set[str] = set() + self.never_reuse_buffers: Set[str] = set() + self.inplaced_to_remove: Set[str] = set() + self.device_ops: DeviceOpOverrides = None # type: ignore[assignment] + self.wrapper_code: WrapperCodeGen = None # type: ignore[assignment] + # See `ProxyExecutor Design Note` in ir.py for more details + self.extern_kernel_nodes: List[ir.ExternKernelNode] = [] + 
self.extern_node_serializer: Optional[ + Callable[[List[ir.ExternKernelNode]], Any] + ] = extern_node_serializer + self.current_node: torch.fx.Node = None # type: ignore[assignment] + self.num_static_inputs = num_static_inputs + self.lists: Dict[str, List[str]] = {} + self.mutated_inputs: Set[str] = set() + self.mutated_input_idxs: List[int] = [] + self.name_to_buffer: Dict[str, ir.Buffer] = {} + self.name_to_users: DefaultDict[str, List[ir.IRNode]] = defaultdict(list) + self.creation_time = time.time() + self.name = name + self.cpp_wrapper = cpp_wrapper + + # record multi_kernel choice for cpp_wrapper so the second pass knows + # which sub-kernel is picked. Copy cpp_wrapper to another variable + # since cpp_wrapper flag is set to false for the first pass of codegen. + self.record_multi_kernel_choice = cpp_wrapper + self.multi_kernel_to_choice: Dict[str, int] = {} + + self.aot_mode = aot_mode + self.graph_id = graph_id + self.scheduler: "torch._inductor.scheduler.Scheduler" = None # type: ignore[assignment] + self.nodes_prefer_channels_last = ( + self.find_nodes_prefer_channels_last() if self.layout_opt else set() + ) + self._warned_fallback = {"aten.convolution_backward"} + self.user_visible_outputs = user_visible_outputs + self.cache_key: str = "" # This is the cache key for the compiled artifact + self.cache_path: str = "" # This is the path in the filesystem where the compiled artifact is stored + self.cache_linemap: List[ + Tuple[int, str] + ] = ( + [] + ) # This is the linemap used by the profiler to mark custom compiled kernels getting run + # Used if lowering encounters cases where cudagraphs are not supported + self.disable_cudagraphs_reason: Optional[str] = None + + # only keeping one node per device for stack trace purposes + self.device_node_mapping: Dict[torch.device, torch.fx.Node] = {} + self.orig_gm: torch.fx.GraphModule = gm.__copy__() + self.dynamo_flat_name_to_original_fqn = self.module.meta.get( + "dynamo_flat_name_to_original_fqn", {} + ) + self.allocated_constant_name = ( + const_module.allocated_constant_name if const_module is not None else {} + ) + self.init_backend_registration() + + @staticmethod + def decide_layout_opt(gm, *, is_inference) -> bool: + """ + Decide if we should enable layout optimization for this graph based on + heuristics. + """ + if not config.layout_optimization: + return False + + if config.force_layout_optimization: + return True + + conv_nodes = [ + n for n in gm.graph.nodes if n.target == torch.ops.aten.convolution.default + ] + nconv = len(conv_nodes) + + if nconv == 0: + return False + + # For cpu backend and mkldnn enabled, we always use channels_last for better performance. + if ( + torch.backends.mkldnn.enabled + and torch.backends.mkldnn.is_available() + and all( + n.args[idx].meta["val"].device == torch.device("cpu") + for n in conv_nodes + for idx in [0, 1] + ) + ): + return True + + # Following models are skipped due to this: + # jx_nest_base + # volo_d1_224 + if len(list(gm.graph.nodes)) >= 300 * nconv: + log.debug("Skipped layout opt because only a few conv") + return False + + if any( + has_free_symbols(n.args[idx].meta["val"]) + for n in conv_nodes + for idx in [0, 1] + ): + log.debug( + "See perf regression with dynamic shape. 
Follow up in https://github.com/pytorch/pytorch/issues/102670" + ) + return False + + def is_grouped(n): + return n.args[-1] > 1 and n.args[1].meta["val"].size(1) > 1 + + def is_in_out_channel(n): + return ( + n.args[1].meta["val"].size(0) * 2 <= n.args[1].meta["val"].size(1) + and n.args[1].meta["val"].size(2) > 1 + ) + + def is_small_channel(n): + return ( + n.args[1].meta["val"].size(0) <= 64 + and n.args[1].meta["val"].size(1) <= 64 + ) + + # only grouped convolutions benchmarked as slower in conv samples for inference only + if is_inference: + from torch.utils.flop_counter import FlopCounterMode + + flop_counts: Dict[str, float] = defaultdict(float) + for node in conv_nodes: + success, args, kwargs = torch._inductor.fx_utils.get_fake_args_kwargs( + node + ) + + if success: + with FlopCounterMode(display=False) as flop_counter_mode: + with V.fake_mode: + node.target(*args, **kwargs) + + counted_flops = flop_counter_mode.get_total_flops() + if is_grouped(node): + node_type = "grouped" + elif is_small_channel(node): + node_type = "small" + elif is_in_out_channel(node): + node_type = "in_out" + else: + node_type = "default" + + flop_counts[node_type] += counted_flops + else: + log.debug("Conv inputs meta not found") + + # average benchmarked channels last speedup / slowdown, < 1 is speedup. + # taken from the set of convolution inputs in benchmarks/dynamo/microbenchmarks/operator_inp_logs/torchbench_train/ + # To regenerate these numbers follow https://gist.github.com/eellison/55d7a6ed6f39829d68ac56f95f4df5bb + GROUPED_MULTIPLIER = 1.358 + DEFAULT_MULTIPLIER = 0.823 + IN_OUT_MULTIPLIER = 0.725 + SMALL_MULTIPLIER = 0.783 + + total_flops = sum(flop_counts.values()) + # TODO - get different values per hardware + weighted_flops = ( + flop_counts["grouped"] * GROUPED_MULTIPLIER + + flop_counts["small"] * SMALL_MULTIPLIER + + flop_counts["in_out"] * IN_OUT_MULTIPLIER + + flop_counts["default"] * DEFAULT_MULTIPLIER + ) + do_layout_opt = weighted_flops <= total_flops + if not do_layout_opt: + log.debug( + "Skipped layout opt in inference because weighted flops indicate slowdown, default: %d, channels last: %d", + total_flops, + weighted_flops, + ) + return do_layout_opt + + # Channels last layout can dramatically hurt grouped conv perf. E.g. + # Conv with arguments like + # {"input_shape": [32, 224, 112, 112], "weight_shape": [224, 112, 3, 3], + # "stride": [2, 2], "padding": [1, 1], "groups": 2} + # slows down 31x using channels last.. + + # But a lot of timm models use depthwise separable convolution which will + # result in grouped convolution with in-channel size == 1. + # For those grouped convolution, channels last still helps a lot. + # E.g. + # Conv with arguments + # {"input_shape": [128, 58, 56, 56], "weight_shape": [58, 1, 3, 3], + # "stride": [2, 2], "padding": [1, 1], "groups": 58} + # get 1.86x speedup with channels last layout. + # + # The following heuristics skip using channels-last if the model contains + # grouped convolution with in-channels > 1. + if any(map(is_grouped, conv_nodes)): + log.debug( + "Skip layout opt because found grouped convolution with >1 in_channels!" + ) + return False + + # For some models that contain convolution with larger in-channel than out-channel, applying + # channels last hurts performance. 
+ # Following models are skipped due to this: + # - pytorch_unet + # - phlippe_densenet (slightly worse) + # - Background_Matting (1.22x -> 0.821x) + # - pytorch_CycleGAN_and_pix2pix (1.597x -> 1.294x) + if any(map(is_in_out_channel, conv_nodes)): + log.debug( + "Skip layout opt because some convolutions have smaller out_channel" + ) + return False + + # Following models are skipped due to this: + # - functorch_maml_omniglot + if all(map(is_small_channel, conv_nodes)): + log.debug("Skip layout opt because all convolution channels are too small") + return False + + return True + + def qualify_name(self, name: str) -> str: + """Prepend the given name with the graph name if any.""" + if self.name is not None: + return f"{self.name}_{name}" + return name + + def make_subgraph( + self, + gm: torch.fx.GraphModule, + example_inputs: List[torch.Tensor], + subgraph_name: str, + ) -> "GraphLowering": + """ + Make a subgraph of the current graph with all inherited + parts, except the graph module (`gm`) and `example_inputs`. + The subgraphs are lowered separately, but intended to be + inlined in the parent graph's codegening. Hence the need + for maintaining the same `shape_env` and other properties. + The subgraph name is qualified by the parent graph's name. + """ + return GraphLowering( + gm=gm, + example_inputs=example_inputs, + shape_env=self._shape_env, + cpp_wrapper=self.cpp_wrapper, + aot_mode=self.aot_mode, + extern_node_serializer=self.extern_node_serializer, + is_inference=self.is_inference, + name=self.qualify_name(subgraph_name), + ) + + def find_nodes_prefer_channels_last(self): + """ + The rule to decide if an node prefer channels last is simple. + 1. if it's input/output of a convolution + 2. if one of its user prefers channels last + + We have rule 1 because cudnn runs a faster convolution kernel for channels last inputs; + Rule 2 is also important. It makes sure that indirect inputs to convolution also prefers + channels last. + + Consider the scenario: conv -> batch-norm -> relu -> conv + Without rule 2, batch-norm output may use a contiguous layout. That will cause 2 extra copies: + 1. the output of batch-norm should be channels last initially since its input is a conv's output. + Forcing the batch-norm's output to be contiguous results in the first copy + 2. The second conv's input is initially contiguous. This layout is propagated from the batch-norm's output. + We need convert it to channels last layout which results in the second copy. + With rule 2, we makes sure all the tensors in the chain uses channels last layout. So both copies + can be saved. + """ + output_set = set() + for n in reversed(self.module.graph.nodes): + if n.target == torch.ops.aten.convolution.default: + output_set.add(n) + continue + + for user in n.users: + if user in output_set: + output_set.add(n) + break + + # need a second pass to add downstream nodes of those channel last nodes to the sets. + # This pass is especially needed to avoid mix-layout kernel inputs in backward pass. + # + # Let's say a conv-batchnorm 's output is passed to relu whose output is in turn returned + # from the fwd graph. Without this second pass, we will force relu's output to be contiguous. + # Then in the kernel in backward pass, the contiguous output of relu may be mix with other channels last + # tensors and passed to a kernel. + # + # This pass improve yolov3 training speedup from 1.116x (worse than disabling layout optimization speedup 1.196x) to 1.457x. 
+ # It also improves dla102 training speedup from 1.240x (worse than disabling layout optimization speedup 1.523x) to 1.835x . + # This also helps the following models: + # - res2net101_26w_4s + # - res2net50_14w_8s + # - sebotnet33ts_256 + for n in self.module.graph.nodes: + if n in output_set: + for child in n.users: + output_set.add(child) + + return output_set + + def warn_fallback(self, name): + if name not in self._warned_fallback: + self._warned_fallback.add(name) + perf_hint_log.info("Using FallbackKernel: %s", name) + + def add_device_info(self, device: torch.device): + self.device_types.add(device.type) + if device.index is not None: + self.device_idxs.add(device.index) + if V.graph.current_node and device not in self.device_node_mapping: + self.device_node_mapping[device] = V.graph.current_node + + @property + def fake_mode(self): + return V.fake_mode + + def get_buffer(self, buffer_name: str): + if buffer_name in self.name_to_buffer: + return self.name_to_buffer[buffer_name] + if buffer_name in self.graph_inputs: + return self.graph_inputs[buffer_name] + return None + + def get_dtype(self, buffer_name: str): + if buffer_name in self.constants: + return self.constants[buffer_name].dtype + if buffer_name in self.name_to_buffer: + return self.name_to_buffer[buffer_name].get_dtype() + if buffer_name in self.graph_inputs: + return self.graph_inputs[buffer_name].get_dtype() + m = re.match(r"(as_strided|reinterpret_tensor)\(([a-zA-Z0-9_]+),", buffer_name) + if m: + return self.get_dtype(m.group(1)) + raise KeyError(f"could not find {buffer_name}") + + def get_numel(self, buffer_name: str): + from .ir import MultiOutputLayout + + if buffer_name in self.constants: + return self.constants[buffer_name].numel() + if buffer_name in self.name_to_buffer: + buf = self.name_to_buffer[buffer_name] + if isinstance(getattr(buf, "layout", None), MultiOutputLayout): + return 1 + return buf.get_numel() + if buffer_name in self.graph_inputs: + return self.graph_inputs[buffer_name].get_numel() + raise KeyError(f"could not find {buffer_name}") + + @dynamo_timed + def run(self, *args): + return super().run(*args) + + def register_buffer(self, buffer: ir.Buffer): + name = self.qualify_name(f"buf{len(self.buffers)}") + self.buffers.append(buffer) + self.name_to_buffer[name] = buffer + # Skip empty CPU tensor so that CUDA graphs can succeed, see https://github.com/pytorch/pytorch/pull/114144 + if not isinstance(buffer, ir.ComputedBuffer) or not buffer.is_zero_elements(): + self.add_device_info(buffer.get_device()) + return name + + def register_list(self, buffer_names: List[str]): + name = self.qualify_name("list_" + "_".join(buffer_names)) + self.lists[name] = buffer_names + return name + + def register_users_of(self, node_output): + def register(value): + if isinstance(value, (list, tuple)): + for x in value: + register(x) + if isinstance(value, ir.IRNode): + if ( + not hasattr(value, "data") + or not isinstance(value.data, ir.IRNode) + or not ( + hasattr(value.data, "data") + and isinstance(value.data.data, ir.IRNode) + ) + ): + return + + for read_name in value.get_read_names(): + self.name_to_users[read_name].append(value) + + register(node_output) + + def mark_buffer_mutated(self, name: str): + """ + When a buffer is mutated we need to make sure all the reads to + the old version are realized before the mutation happens. 
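+        Concretely, the name is added to mutated_buffers and every IR node
+        recorded in name_to_users[name] is realized, so those pending reads
+        are not fused past the mutation.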
+ """ + assert isinstance(name, str) + self.mutated_buffers.add(name) + + if name not in self.name_to_users: + return + + for user in self.name_to_users[name]: + user.realize() + + def add_tensor_constant(self, data, name=None): + def allocate(name): + if not config.aot_inductor.use_runtime_constant_folding: + for constant_name, value in self.constants.items(): + if ( + not data.is_mkldnn + and data.size() == value.size() + and data.stride() == value.stride() + and data.dtype == value.dtype + and data.device == value.device + and torch.eq(data, value).all() + ): + return constant_name + + if name is None: + name = f"constant{len(self.constants)}" + if name[0].isdigit(): + name = f"constant_{name}" + name = self.qualify_name(name) + # We may generate a var name for each constant in the codegen. + # Let's only keep sane characters. + prefix = re.sub(r"[^a-zA-Z0-9_]", "_", name) + name = prefix + cnt = 0 + while name in self.constants: + name = f"{prefix}_{cnt}" + cnt += 1 + self.constants[name] = data + self.constant_reprs[name] = ( + f"{data.device!r} {data.dtype!r} " + f"{tuple(data.size())!r} {tuple(data.stride())!r} " + f"{hash(data):x}" + ) + return name + + new_name = allocate(name) + self.allocated_constant_name[new_name] = name + + return TensorBox.create( + ir.ConstantBuffer( + new_name, + FixedLayout(data.device, data.dtype, *self.static_sizes_strides(data)), + ) + ) + + def constant_name(self, name: str, device_override: Optional[torch.device]): + """ + We AOT copy constants to the devices they are needed on. + If device_override doesn't match the constant's device, then + copy it and return a different name. + """ + if self.constants[name].device == device_override or device_override is None: + return name + alt_name = f"{name}_{device_override.type}{device_override.index or 0}" + if alt_name not in self.constants: + self.constants[alt_name] = self.constants[name].to(device_override) + return alt_name + + def placeholder(self, target: str, args, kwargs): + example = super().placeholder(target, args, kwargs) + self.graph_input_names.append(target) + if isinstance(example, SymTypes): + expr = example.node.expr + self.graph_inputs[target] = expr + return expr + elif isinstance(example, (int, bool, float)): + expr = sympy.sympify(example) + self.graph_inputs[target] = expr + return expr + if isinstance(example, BackwardState): + # Ignored arg, must be unused + # Alternately we could filter this out in AotAutograd + return None + assert isinstance(example, torch.Tensor), example + # todo(chilli): We can remove the last check once we turn buffers into + # static shape tensors. That's a hack to workaround Inductor believing + # the buffer should be static but us passing in a fake tensor with + # symbolic shapes. 
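+        # Illustration (hypothetical shapes): a weight of size (64, 3) with no
+        # symbolic dims gets static sizes [Integer(64), Integer(3)] from
+        # static_sizes_strides(), while a dynamically shaped input may instead
+        # get [s0, 3] with s0 allocated by symbolic_sizes_strides() above.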
+ if not example._has_symbolic_sizes_strides: + # the first N inputs are weights + sizes, strides = self.static_sizes_strides(example) + else: + sizes, strides = self.symbolic_sizes_strides(example) + # TODO(jansel): handle input aliasing + target = self.qualify_name(target) + tensor = TensorBox.create( + InputBuffer( + target, + FixedLayout(example.device, example.dtype, sizes, strides), + ) + ) + self.graph_inputs[target] = tensor + self.graph_inputs_original[target] = tensor.data.data + self.add_device_info(example.device) + return tensor + + def call_function(self, target, args, kwargs): + if target is operator.getitem and isinstance(args[0], (list, tuple, dict)): + return super().call_function(target, args, kwargs) + + if hasattr(target, "_inductor_lowering_function"): + # passthrough lowerings from .pattern_matcher + return target(*args, **kwargs) + + def get_custom_op_layout_constraints(target, args, kwargs): + # Custom operations that require preserving stride order + # which run through implicit fallback must constrain their + # arguments' fx strides + layout_constraint = None + if torch._C.Tag.needs_fixed_stride_order in target.tags: + # We have to set the current args because call_function will immediately + # evaluate this lowering after creating the fallback, without evaluating + # the layout constraint + args, kwargs = constrain_to_fx_strides( + self.current_node, *args, **kwargs + ) + # Also register the layout constraint so when the fallback + # is used again, we can constrain the args to the same layout + layout_constraint = constrain_to_fx_strides + return layout_constraint, args, kwargs + + if target not in lowerings: + assert isinstance( + target, torch._ops.OpOverload + ), f"{target} is not an OpOverload" + base_name = target.name().split(".")[0] + if base_name in FALLBACK_ALLOW_LIST: + make_fallback(target) + elif config.implicit_fallbacks: + layout_constraint, args, kwargs = get_custom_op_layout_constraints( + target, args, kwargs + ) + error = ( + MissingOperatorWithDecomp + if get_decompositions([target]) + else MissingOperatorWithoutDecomp + ) + log.info( + "Creating implicit fallback for:\n%s", + error.operator_str(target, args, kwargs), + ) + make_fallback(target, layout_constraint) + + elif get_decompositions([target]): + # There isn't a good way to dynamically patch this in + # since AOT Autograd already ran. The error message tells + # the user how to fix it. + raise MissingOperatorWithDecomp(target, args, kwargs) + else: + raise MissingOperatorWithoutDecomp(target, args, kwargs) + + try: + log.debug(" via %s", lowerings[target]) + out = lowerings[target](*args, **kwargs) + return out + except Exception as e: + raise LoweringException(e, target, args, kwargs).with_traceback( + e.__traceback__ + ) from None + + @staticmethod + def can_inline_constant(t: torch.Tensor) -> bool: + """ + True if this is a small constant attr that will be inlined. 
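+        Currently this means a 1-D tensor with at most 8 elements.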
+ """ + return len(t.shape) == 1 and t.shape[0] <= 8 + + def get_attr(self, target, args, kwargs): + # this is a constant + value = getattr_recursive(self.module, target) + + if isinstance(value, torch.fx.GraphModule): + return ir.Subgraph(name=target, graph_module=value) + + if ( + config.aot_inductor.use_runtime_constant_folding + or config.always_keep_tensor_constants + or unsupported_output_tensor(value) + ): + return self.add_tensor_constant(value, target) + + with no_dispatch(): + if value.shape == (): + return Constant(value.item(), value.dtype, value.device) + if self.can_inline_constant(value): + # tensor lowering has constant inlining logic + from .lowering import tensor + + return tensor(value.tolist(), dtype=value.dtype, device=value.device) + + return self.add_tensor_constant(value, target) + + def call_module(self, target, args, kwargs): + raise AssertionError() + + def call_method(self, target, args, kwargs): + raise AssertionError() + + def output(self, target, args, kwargs): + result = super().output(target, args, kwargs) + assert isinstance(result, (tuple, list)), type(result) + assert all( + isinstance( + x, + ( + TensorBox, + ir.Constant, + type(None), + ir.ConstantBuffer, + sympy.Expr, + sympy.logic.boolalg.Boolean, + int, + ), + ) + for x in result + ), result + self.graph_outputs = [ir.ExternKernel.realize_input(x) for x in result] + value: ir.IRNode + for name, value in self.graph_inputs.items(): + assert isinstance( + value, (TensorBox, sympy.Expr) + ), f"Unsupported inductor graph input type: {type(value)}" + if not isinstance(value, TensorBox): + continue + value.realize() + assert isinstance(value, TensorBox) + value = value.data + assert isinstance(value, ir.StorageBox) + value_storage_box = value + value = value.data + if not isinstance(value, InputBuffer) or value.get_name() != name: + # one of our inputs was mutated, need to turn that into a copy + ir.MutationLayout.realize_into(value, self.graph_inputs_original[name]) + # replace output with mutated input + try: + ind = self.graph_outputs.index(value_storage_box) + self.graph_outputs[ind] = self.graph_inputs_original[name] + except ValueError: + pass + + self.finalize() + log.debug( + "Force channels last inputs for %d conv for the current graph with id %d", + self.num_channels_last_conv, + self.graph_id if self.graph_id is not None else -1, + ) + + def finalize(self): + for buf in self.buffers: + buf.decide_layout() + + @contextmanager + def set_current_node(self, node: torch.fx.Node): + old = self.current_node + try: + self.current_node = node + yield + finally: + self.current_node = old + + def run_node(self, n: torch.fx.Node): + def debug(msg): + log.debug("lowering %s %s", LazyString(n.format_node), msg) + + origins = {n} + if n.op == "call_function": + args, kwargs = self.fetch_args_kwargs_from_env(n) + origins |= gather_origins(args, kwargs) + with ir.IRNode.current_origins(origins), self.set_current_node( + n + ), V.set_current_node(n): + if ( + n.op == "call_function" + and n.target is not operator.getitem + and fallback_node_due_to_unsupported_type(n) + ): + debug("fallback_handler") + result = fallback_handler(n.target, add_to_fallback_set=False)( + *args, **kwargs # type: ignore[possibly-undefined] + ) + elif n.op == "call_function" and n.target in layout_constraints: + debug("layout_constraints") + args, kwargs = layout_constraints[n.target](n, *args, **kwargs) # type: ignore[index] + result = self.call_function(n.target, args, kwargs) + elif is_magic_method(n.target): + # TODO: this is sus, it 
probably should be handled in the + # lowerings themselves similarly to sym_size/sym-stride + debug("is_magic_method") + if isinstance(n.meta["val"], torch.SymInt): + result = n.meta["val"].node.expr + else: + result = super().run_node(n) + else: + debug("") + result = super().run_node(n) + + # require the same stride order for dense outputs, + # 1. user-land view() will not throw because inductor + # output different strides than eager + # long term the solution is to make view() always succeed + # with infallible strides. + # 2: as_strided ops, we need make sure its input has same size/stride with + # eager model to align with eager behavior. + as_strided_ops = [ + torch.ops.aten.as_strided.default, + torch.ops.aten.as_strided_.default, + torch.ops.aten.as_strided_scatter.default, + ] + is_output = any(user.op == "output" for user in n.users) + is_input_for_as_strided = any( + user.target in as_strided_ops for user in n.users + ) + if ( + is_output + and isinstance(result, TensorBox) + and isinstance(result.data, ir.BaseView) + ): + # Realize so that outputs are correctly aliased + result.realize() + + if (is_output or is_input_for_as_strided) and isinstance( + n.meta["val"], torch.Tensor + ): + strides = n.meta["val"].stride() + dense = torch._prims_common.is_non_overlapping_and_dense(n.meta["val"]) + # requiring a stride order for a non-dense output wouldn't + # recreate the same strides, and would fail with view, defer for now. + if dense and len(strides): + stride_order = ir.get_stride_order(strides) + if ( + len(result.get_size()) == 4 + and n in self.nodes_prefer_channels_last + and n.name not in self.user_visible_outputs + and not is_input_for_as_strided + ): + stride_order = ir.NHWC_STRIDE_ORDER + result = ir.ExternKernel.require_stride_order(result, stride_order) + + # Realize if (1) any user need inputs realized, or (2) there is + # already too many reads and rematerializing can be bad. + num_users = len(set(n.users)) + if num_users > 1 and isinstance(result, TensorBox): + for user in n.users: + if user.target in needs_realized_inputs: + result.realize_hint() + # This inclusion is somewhat controversial (from + # discussion between Horace, Natalia, and Elias). + # Currently, it's not very clear why this is helpful. + # The general idea here is that even though a node may + # have FlexibleLayout, we still often *treat* it as if + # it was contiguous. This appears to sometimes result in + # suboptimal behavior. + # + # When we do a better job selecting layout, we should + # revisit this. 
+ need_fixed_layout = [ + torch.ops.aten.convolution_backward.default, + torch.ops.aten.mm.default, + torch.ops.aten._int_mm.default, + ] + if not self.layout_opt: + need_fixed_layout.append(torch.ops.aten.convolution.default) + if torch._C._has_mkldnn: + need_fixed_layout += [ + torch.ops.mkldnn._convolution_pointwise.default, + torch.ops.mkldnn._convolution_pointwise.binary, + torch.ops.mkldnn._convolution_pointwise_.binary, + torch.ops.mkldnn._convolution_transpose_pointwise.default, + torch.ops.mkldnn._linear_pointwise.default, + torch.ops.mkldnn._linear_pointwise.binary, + torch.ops.aten.mkldnn_rnn_layer.default, + torch.ops.onednn.qconv2d_pointwise.default, + torch.ops.onednn.qconv2d_pointwise.binary, + torch.ops.onednn.qlinear_pointwise.default, + torch.ops.onednn.qlinear_pointwise.tensor, + ] + if torch._C.has_mkl: + need_fixed_layout += [torch.ops.mkl._mkl_linear.default] + if user.target in need_fixed_layout: + result = ir.ExternKernel.require_stride_order( + result, ir.get_stride_order(n.meta["val"].stride()) + ) + if user.op == "output": + if isinstance(result.data.data, (Pointwise, Reduction)): + result.realize() + + # TODO(jansel): introduce a store vs inline choice + result.mark_reuse(len(n.users)) + + # Realize if the IRNode already has accumulated lots of reads + if isinstance(result, TensorBox) and result.has_exceeded_max_reads(): + # Prevent excessive accumulation in a computed buffer, when + # there are multiple branches each with small number of memory + # reads, but they converge to a user. + result.realize_hint() + + # Realize if a Pointwise has too much stuff to be inlined. + # As this may cause RecursionError during Inductor's evaluation. + if isinstance(result, TensorBox) and isinstance(result.data, StorageBox): + curr = result.data.data + if isinstance(curr, Pointwise): + # Use inner fn as a rough proxy. Good enough. + if curr.has_large_inner_fn(): + result.realize() + + # This is not complete, but it doesn't have to be: origin_node + # tracking is best effort. The logic here critically relies on direct + # TensorBox -> StorageBox denoting a non-view; we don't bother trying + # to get views to work. Feel free to add any extra cases as needed. + # + # Note: we can't YOLO tree_map over this result, because if there are + # buffers or a view involved, we might not be able to validly assign + # the origin_node here. 
+ if isinstance(result, TensorBox) and isinstance(result.data, ir.StorageBox): + if isinstance(result.data.data, ir.Loops): + result.data.data.origin_node = n + elif isinstance(result.data.data, ir.Buffer): + result.data.data.origin_node = n + if isinstance(result.data.data, ir.ComputedBuffer) and isinstance( + result.data.data.data, ir.Loops + ): + result.data.data.data.origin_node = n + # Not really multi-output, can straightforwardly recurse in + elif ( + isinstance(result.data.data, ir.MultiOutput) + and not result.data.data.indices + ): + if isinstance(result.data.data.inputs[0], ir.Buffer): + result.data.data.inputs[0].origin_node = n + + self.register_users_of(result) + + return result + + def validate_can_generate_cpp_wrapper(self): + if config.disable_cpp_codegen: + raise CppWrapperCodeGenError("C++ codegen is disabled") + + if sys.platform not in ["linux", "darwin"]: + raise CppWrapperCodeGenError(f"Unsupported platform {sys.platform}") + + for value in self.graph_inputs.values(): + dtype = None + if isinstance(value, TensorBox): + dtype = value.get_dtype() + elif isinstance( + value, (sympy.Symbol, sympy.Expr, sympy.core.numbers.Integer) + ): + dtype = may_get_constant_buffer_dtype(value) + + if not supported_dtype_of_cpp_wrapper(dtype, self.cuda): + raise CppWrapperCodeGenError(f"Unsupported input dtype {dtype}") + + def init_wrapper_code(self): + self.cuda = "cuda" in self.device_types + if self.cpp_wrapper: + self.validate_can_generate_cpp_wrapper() + self.wrapper_code = CppWrapperCuda() if self.cuda else CppWrapperCpu() + else: + device_types = self.device_types.copy() + device_types.discard("cpu") + # TODO(Eikan): Only support mixing cpu and other device now. + assert len(device_types) <= 1, "Does not support mixing {}".format( + "+".join(device_types) + ) + only_cpu = len(device_types) == 0 + device_type = "cpu" if only_cpu else device_types.pop() + + self.device_ops = get_device_op_overrides(device_type) + wrapper_code_gen_cls = get_wrapper_codegen_for_device(device_type) + assert ( + wrapper_code_gen_cls is not None + ), f"Device {device_type} not supported" + self.wrapper_code = wrapper_code_gen_cls() + + if self.const_module: + # If we have const module, we could reuse the kernels + # This could avoid duplication and save time on doing recompilation (if Triton.) + self.wrapper_code._names_iter = self.const_module.wrapper_code._names_iter + self.wrapper_code.src_to_kernel = ( + self.const_module.wrapper_code.src_to_kernel + ) + + def codegen_with_cpp_wrapper(self): + """ + For CPU, the cpp wrapper codegen is done in one pass. + For GPU, the cpp wrapper codegen is done in two steps: JIT-compile the model with python + wrapper code and run it to generate autotuned kernel binaries in the first pass; and then + generate cpp wrapper code and compile it to a dynamic library in the second pass. 
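+
+        Roughly, the GPU path below amounts to:
+
+            self.cpp_wrapper = False
+            compiled = self.compile_to_module().call  # pass 1: python wrapper
+            compiled(real_inputs)                      # run once to autotune
+            self.cpp_wrapper = True
+            return self.codegen()                      # pass 2: cpp wrapper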
+ """ + if "cuda" in self.device_types: + # first pass + self.cpp_wrapper = False + compiled = self.compile_to_module().call + + def materialize(x): + if isinstance(x, (torch.SymInt, torch.SymFloat)): + # Need concrete value to run dynamic shapes and tune the result + return x.node.hint + elif isinstance(x, FakeTensor): + return defake(x) + else: + assert isinstance( + x, torch.Tensor + ), "Unknown type when creating real inputs" + str(type(x)) + return x + + if tracing_context := torch._guards.TracingContext.try_get(): + if tracing_context.output_strides: + tracing_context.output_strides.clear() + + params_flat = [ + param + for param in tracing_context.params_flat # type: ignore[union-attr] + if param is not None + ] + real_inputs = [ + materialize(x) for x in itertools.chain(params_flat, V.real_inputs) + ] + else: + real_inputs = [materialize(x) for x in V.real_inputs] + + with torch.utils._python_dispatch._disable_current_modes(): + assert self.example_inputs is not None + compiled(real_inputs) + del real_inputs + + # second pass + # TODO: reuse self.scheduler from the first pass to speed up the second pass + self.cpp_wrapper = True + self.removed_buffers.clear() + self.inplaced_to_remove.clear() + return self.codegen() + else: + # cpu + return self.codegen() + + def codegen(self): + from .scheduler import Scheduler + + self.init_wrapper_code() + + self.scheduler = Scheduler(self.buffers) + V.debug.draw_orig_fx_graph(self.orig_gm, self.scheduler.nodes) + + self.scheduler.codegen() + return self.wrapper_code.generate(self.is_inference) + + def codegen_subgraph(self, parent_graph): + """ + This is a more compact version of the `codegen()` above + where we codegen this graph as a subgraph of some parent + graph. The parent graph is passed as an argument: the + intention is to inline codegening of the subgraph in + the parent graph's wrapper code (including the generated + kerenls). The wrapper code is not finalized (via `.generate()` + call), as this will be done in the parent graph's `codegen()`. + """ + from .scheduler import Scheduler + + self.wrapper_code = parent_graph.wrapper_code + self.device_ops = parent_graph.device_ops + self.cpp_wrapper = parent_graph.cpp_wrapper + + self.scheduler = Scheduler(self.buffers) + self.scheduler.codegen() + + def count_bytes(self): + from .scheduler import Scheduler + + scheduler = Scheduler(self.buffers) + + total_bytes = 0 + node_counts = [] + node_runtimes = [] + for node in scheduler.nodes: + num_bytes = node.get_read_write_buffers_sizes() + total_bytes += num_bytes + node_counts.append((node, num_bytes // 4)) + node_runtimes.append((node, node.get_estimated_runtime())) + return total_bytes, node_counts, node_runtimes + + @dynamo_timed(phase_name="code_gen") + def compile_to_module(self): + from .codecache import PyCodeCache + + code, linemap = ( + self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen() + ) + linemap = [(line_no, node.stack_trace) for line_no, node in linemap] + key, path = PyCodeCache.write(code) + mod = PyCodeCache.load_by_key_path( + key, path, linemap=linemap, attrs=self.constants + ) + self.cache_key = key + self.cache_path = path + self.cache_linemap = linemap + + # Logged twice as per https://github.com/pytorch/pytorch/pull/99038#discussion_r1167826029 + # TODO. 
Revisit this once the logging API is more mature + assert mod.__file__ is not None + + log_module_code(mod.__file__) + log.debug("Output code written to: %s", mod.__file__) + output_code_log.debug("Output code: \n%s", code) + trace_structured( + "inductor_output_code", + lambda: {"filename": mod.__file__}, + payload_fn=lambda: code, + ) + output_code_log.info("Output code written to: %s", mod.__file__) + if config.benchmark_kernel: + print(f"Compiled module path: {mod.__file__}", file=sys.stderr) + V.debug.output_code(mod.__file__) + V.debug.copy(os.path.splitext(mod.__file__)[0] + ".debug") + return mod + + def compile_to_fn(self): + if self.aot_mode: + from .codecache import AotCodeCompiler + + assert self.cpp_wrapper, "AOT mode only supports C++ wrapper" + code, linemap = self.codegen_with_cpp_wrapper() + output_code_log.debug("Output code: \n%s", code) + + serialized_extern_kernel_nodes = None + if ( + config.is_fbcode() + and self.extern_kernel_nodes + and self.extern_node_serializer + ): + serialized_extern_kernel_nodes = self.extern_node_serializer( + self.extern_kernel_nodes + ) + output_code_log.debug( + "Serialized Extern Kernel Nodes: \n%s", + serialized_extern_kernel_nodes, + ) + + # Directly return the file path with the compiled code + return AotCodeCompiler.compile( + self, code, serialized_extern_kernel_nodes, cuda=self.cuda + ) + else: + return self.compile_to_module().call + + def get_output_names(self): + return [ + node.get_name() + for node in self.graph_outputs + if not isinstance(node, ir.NoneAsConstantBuffer) + and not isinstance(node, ir.ShapeAsConstantBuffer) + ] + + def is_unspec_arg(self, name: str): + # dynamo wraps unspec variable as 0d CPU tensor, + # need to convert to scalar during codegen (triton only) + return ( + name in self.graph_inputs.keys() + and self.graph_inputs[name].get_numel() == 1 + and self.graph_inputs[name].get_device().type == "cpu" + ) diff --git a/MLPY/Lib/site-packages/torch/_inductor/hooks.py b/MLPY/Lib/site-packages/torch/_inductor/hooks.py new file mode 100644 index 0000000000000000000000000000000000000000..761e5156553dc9aa71bb925972125bd8e0eda31c --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/hooks.py @@ -0,0 +1,28 @@ +import contextlib +from typing import Callable, List, TYPE_CHECKING + +if TYPE_CHECKING: + import torch + +# Executed in the order they're registered +INTERMEDIATE_HOOKS: List[Callable[[str, "torch.Tensor"], None]] = [] + + +@contextlib.contextmanager +def intermediate_hook(fn): + INTERMEDIATE_HOOKS.append(fn) + try: + yield + finally: + INTERMEDIATE_HOOKS.pop() + + +def run_intermediate_hooks(name, val): + global INTERMEDIATE_HOOKS + hooks = INTERMEDIATE_HOOKS + INTERMEDIATE_HOOKS = [] + try: + for hook in hooks: + hook(name, val) + finally: + INTERMEDIATE_HOOKS = hooks diff --git a/MLPY/Lib/site-packages/torch/_inductor/index_propagation.py b/MLPY/Lib/site-packages/torch/_inductor/index_propagation.py new file mode 100644 index 0000000000000000000000000000000000000000..a5f60972fe422c58cb52a778e9cd0ac2c7cb49ed --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/index_propagation.py @@ -0,0 +1,277 @@ +"""This file implements the IndexPropagation ops handler, which wraps an +underlying handler to add a limited form of constant propagation, as well as +propagation of sympy expressions downstream of ops.index_expr calls. 
+ +For example, say we have the IR: + + tmp0 = ops.index_expr(x, torch.int32) + tmp1 = ops.constant(2, torch.int32) + tmp2 = ops.mul(tmp0, tmp1) + tmp3 = ops.indirect_indexing(tmp2, x_size) + tmp4 = ops.load("buf0", tmp3) + +The underlying handler would just see: + + ops.load("buf0", x * 2) + +This is limited by the set of operators handled in the sympy expression +printers. So simple operations like minimum and maximum cannot be translated to +SymPy expressions yet, despite sympy.Min and sympy.Max existing. + +""" +import itertools +from dataclasses import dataclass +from typing import Any, Callable, Dict, Literal, Optional, overload, Tuple, Union + +import sympy + +from typing_extensions import TypeAlias + +import torch +from torch._prims_common import is_boolean_dtype, is_integer_dtype +from torch.utils._sympy.functions import FloorDiv, ModularIndexing, Where + + +@dataclass +class TypedExpr: + """A SymPy expression with associated type""" + + expr: sympy.Expr + dtype: torch.dtype + + +class SymPyOps: + """An ops handler where all IR values are SymPy expressions + + When a value cannot be represented as a SymPy expression, the method is + either not defined, or returns NotImplemented + + """ + + @staticmethod + def identity(value: Any) -> Any: + return value + + @staticmethod + def constant(value: Union[int, float, bool], dtype: torch.dtype) -> TypedExpr: + if is_boolean_dtype(dtype): + expr = sympy.Integer(bool(value)) + elif is_integer_dtype(dtype): + expr = sympy.Integer(int(value)) + else: + expr = sympy.Float(float(value)) + return TypedExpr(expr, dtype) + + @staticmethod + def index_expr(value: sympy.Expr, dtype: torch.dtype) -> Union[int, TypedExpr]: + if isinstance(value, int): + value = sympy.Integer(value) + return TypedExpr(value, dtype) + + @staticmethod + def to_dtype( + value: Any, dtype: torch.dtype, src_dtype: Optional[torch.dtype] = None + ) -> Union[int, TypedExpr]: + if isinstance(value.expr, (sympy.Integer, sympy.Float)): + return SymPyOps.constant(value.expr, dtype) + elif is_integer_dtype(dtype) and is_integer_dtype(value.dtype): + return SymPyOps.index_expr(value.expr, dtype) + else: + # TODO: Inductor doesn't handle floating point in sympy expressions well at the moment + return NotImplemented + + @staticmethod + def square(x: TypedExpr) -> TypedExpr: + return TypedExpr(x.expr * x.expr, x.dtype) + + @staticmethod + def add(x: TypedExpr, y: TypedExpr) -> TypedExpr: + result_type = torch.promote_types(x.dtype, y.dtype) + return TypedExpr(x.expr + y.expr, result_type) + + @staticmethod + def sub(x: TypedExpr, y: TypedExpr) -> TypedExpr: + result_type = torch.promote_types(x.dtype, y.dtype) + return TypedExpr(x.expr - y.expr, result_type) + + @staticmethod + def mul(x: TypedExpr, y: TypedExpr) -> TypedExpr: + result_type = torch.promote_types(x.dtype, y.dtype) + return TypedExpr(x.expr * y.expr, result_type) + + @staticmethod + def neg(x: TypedExpr) -> TypedExpr: + return TypedExpr(-x.expr, x.dtype) + + @staticmethod + def floordiv(x: TypedExpr, y: TypedExpr) -> TypedExpr: + result_type = torch.promote_types(x.dtype, y.dtype) + if not is_integer_dtype(result_type): + return NotImplemented + + return TypedExpr(FloorDiv(x.expr, y.expr), result_type) + + @staticmethod + def mod(x: TypedExpr, y: TypedExpr) -> Optional[TypedExpr]: + result_type = torch.promote_types(x.dtype, y.dtype) + if not is_integer_dtype(result_type): + return NotImplemented + + result_expr = ModularIndexing(x.expr, sympy.Integer(1), y.expr) + return TypedExpr(result_expr, result_type) + + 
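+    # Taken together, these handlers fold the example from the module
+    # docstring: index_expr(x, int32) -> TypedExpr(x, int32),
+    # constant(2, int32) -> TypedExpr(2, int32), and mul() combines them into
+    # TypedExpr(2*x, int32), so the wrapped handler ultimately just sees
+    # ops.load("buf0", x * 2).
+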
@staticmethod + def remainder(x: TypedExpr, y: TypedExpr) -> Optional[TypedExpr]: + result_type = torch.promote_types(x.dtype, y.dtype) + if not is_integer_dtype(result_type): + return NotImplemented + # In these cases, remainder in Python == remainder in C++, so this transformation + # is sound + if ( + x.expr.is_nonnegative is not None + and x.expr.is_nonnegative == y.expr.is_positive + ): + result_expr = ModularIndexing(x.expr, sympy.Integer(1), y.expr) + return TypedExpr(result_expr, result_type) + return NotImplemented + + @staticmethod + def minimum(x: TypedExpr, y: TypedExpr) -> TypedExpr: + result_type = torch.promote_types(x.dtype, y.dtype) + return TypedExpr(sympy.Min(x.expr, y.expr), result_type) + + @staticmethod + def maximum(x: TypedExpr, y: TypedExpr) -> TypedExpr: + result_type = torch.promote_types(x.dtype, y.dtype) + return TypedExpr(sympy.Max(x.expr, y.expr), result_type) + + +@dataclass +class IndexPropVar: + value: Any # Either an IR value, or TypedExpr if is_symbolic is true + is_symbolic: bool = False + + @staticmethod + def new_symbolic(expr: TypedExpr) -> "IndexPropVar": + return IndexPropVar(expr, is_symbolic=True) + + def __post_init__(self): + assert not self.is_symbolic or isinstance( + self.value, TypedExpr + ), "Symbolic IndexPropVar must contain a TypedExpr" + + +IndexPropResult: TypeAlias = Union[IndexPropVar, Tuple["IndexPropResult", ...]] + + +class IndexPropagation: + """Ops wrapper that tries to propagate constant and index_expr values through the computation. + + This aims to maximize the compile time simplification possible, and convert + indirect indexing from arange into normal static indexing. + + """ + + def __init__(self, inner: Any): + self._inner = inner + + def materialize_expr(self, expr: sympy.Expr, dtype: torch.dtype) -> Any: + # Construct a new constant/index_expr from the SymPy expression + if isinstance(expr, sympy.Integer): + return self._inner.constant(int(expr), dtype) + elif expr.is_number: + return self._inner.constant(float(expr), dtype) + return self._inner.index_expr(expr, dtype) + + def unwrap(self, a: Union[Any, IndexPropVar]) -> Any: + if isinstance(a, (list, tuple)): + return tuple(self.unwrap(v) for v in a) + + if not isinstance(a, IndexPropVar): + return a + + # Prefer the sympy representation if possible + if a.is_symbolic: + return self.materialize_expr(a.value.expr, a.value.dtype) + + return a.value + + def wrap(self, a) -> IndexPropResult: + if isinstance(a, (list, tuple)): + return tuple(self.wrap(v) for v in a) + return IndexPropVar(a) + + @overload + def fallback( + self, + name: Literal["indirect_indexing"], + args: Tuple[Any, ...], + kwargs: Dict[str, Any], + ) -> IndexPropVar: + ... + + @overload + def fallback( + self, name: str, args: Tuple[Any, ...], kwargs: Dict[str, Any] + ) -> IndexPropResult: + ... 
+ + def fallback( + self, name: str, args: Tuple[Any, ...], kwargs: Dict[str, Any] + ) -> IndexPropResult: + # Fallback to the wrapped handler + new_args = [self.unwrap(a) for a in args] + new_kwargs = {k: self.unwrap(v) for k, v in kwargs.items()} + return self.wrap(getattr(self._inner, name)(*new_args, **new_kwargs)) + + def propagate_sympy( + self, name: str, args: Tuple[Any, ...], kwargs: Dict[str, Any] + ) -> IndexPropResult: + # Build a new SymPy expression from this ops call + def unwrap(a: Union[Any, IndexPropVar]) -> Any: + if not isinstance(a, IndexPropVar): + return a + return a.value + + new_args = [unwrap(a) for a in args] + new_kwargs = {k: unwrap(v) for k, v in kwargs.items()} + new_expr = getattr(SymPyOps, name)(*new_args, **new_kwargs) + is_valid_expr = new_expr is not NotImplemented and ( + # Inductor doesn't expect floating point in sympy expressions, but + # allow floating point constants to be propagated + isinstance(new_expr.expr, sympy.Number) + or new_expr.expr.is_integer + ) + if not is_valid_expr: + return self.fallback(name, args, kwargs) + return IndexPropVar.new_symbolic(new_expr) + + def __getattr__(self, name: str) -> Callable[..., IndexPropResult]: + def inner(*args: Any, **kwargs: Any) -> IndexPropResult: + if not hasattr(SymPyOps, name): + return self.fallback(name, args, kwargs) + + var_arguments = [ + a + for a in itertools.chain(args, kwargs.values()) + if isinstance(a, IndexPropVar) + ] + if not all(v.is_symbolic for v in var_arguments): + return self.fallback(name, args, kwargs) + + return self.propagate_sympy(name, args, kwargs) + + return inner + + def indirect_indexing( + self, index: Union[Any, IndexPropVar], size: Any, check: bool = True + ) -> Any: + # nb. We do index + Where(...) rather than Where(idx >= 0, idx, idx + sz) because we don't have CSE + # for SymPy expressions, so we don't want to repeat idx too much + + # indirect_indexing returns a sympy value, so no need to wrap in IndexPropVar here + if isinstance(index, IndexPropVar) and index.is_symbolic: + # If we are turning a indirect indexing into direct, we need to wrap it. 
+ index = index.value.expr + return index + Where(index >= 0, 0, size) + return self.fallback("indirect_indexing", (index, size, check), {}).value diff --git a/MLPY/Lib/site-packages/torch/_inductor/inductor_prims.py b/MLPY/Lib/site-packages/torch/_inductor/inductor_prims.py new file mode 100644 index 0000000000000000000000000000000000000000..df99fb4b5ca4bdc132c8dde28b641e05b789c702 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/inductor_prims.py @@ -0,0 +1,90 @@ +from __future__ import annotations + +import logging +from typing import Optional, Sequence + +import torch +from torch import _prims, Tensor + +log = logging.getLogger(__name__) + + +def make_prim( + schema: str, + impl_aten, + return_type=_prims.RETURN_TYPE.NEW, + doc: str = "", + tags: Optional[Sequence[torch.Tag]] = None, +): + def meta(*args, **kwargs): + return _prims.TensorMeta(impl_aten(*args, **kwargs)) + + return _prims._make_prim( + schema=schema, + return_type=return_type, + meta=meta, + impl_aten=impl_aten, + doc=doc, + tags=tags, + ) + + +def eager_force_stride(input_tensor: Tensor, stride) -> Tensor: + if input_tensor.stride() == stride: + return input_tensor + new_tensor = input_tensor.clone().as_strided( + input_tensor.shape, + stride, + ) + new_tensor.copy_(input_tensor) + return new_tensor + + +# Custom prims used for handling randomness +seed = make_prim( + "inductor_seed(Device device) -> Tensor", + lambda device: torch.randint(2**63 - 1, [], device=device), + doc="create a fresh seed (one per call) for use with inductor_rand", + tags=(torch.Tag.nondeterministic_seeded,), +) +seeds = make_prim( + "inductor_seeds(int count, Device device) -> Tensor", + lambda count, device: torch.randint(2**63 - 1, [count], device=device), + doc="Horizontal fusion of many inductor_seed() calls", + tags=(torch.Tag.nondeterministic_seeded,), +) +lookup_seed = make_prim( + # if inductor_lookup_seed changes, update partitioners.py + "inductor_lookup_seed(Tensor seeds, int index) -> Tensor", + lambda seeds, index: seeds[index], + doc="Extract a single seed from the result of inductor_seeds()", +) +random = make_prim( + "inductor_random(SymInt[] size, Tensor seed, str mode) -> Tensor", + lambda size, seed, mode: getattr(torch, mode)(size, device=seed.device), + doc="torch.rand()/torch.randn() using backend-specific RNG that can be fused", +) +randint = make_prim( + "inductor_randint(SymInt low, SymInt high, SymInt[] size, Tensor seed) -> Tensor", + lambda low, high, size, seed: torch.randint(low, high, size, device=seed.device), + doc="torch.randint() using backend-specific RNG that can be fused", +) +force_stride_order = make_prim( + "inductor_force_stride_order(Tensor input, SymInt[] stride) -> Tensor", + eager_force_stride, + doc="Force the stride order for input tensor. No-op if the input tensor already has the stride. Do a copy otherwise", +) +masked_scatter_with_index = make_prim( + "inductor_masked_scatter_with_index(Tensor input, Tensor mask, Tensor source_idx, Tensor source) -> Tensor", + lambda input_tensor, mask, index, source: torch.masked_scatter( + input_tensor, mask, source + ), + doc="masked_scatter with precomputed indices", +) +_unsafe_index_put_ = make_prim( + "_unsafe_index_put_(Tensor(a!) 
self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor(a!)", + lambda self, indices, values, accumulate=False: torch.ops.aten.index_put_( + self, indices, values, accumulate + ), + doc="Unsafe index_put_ (doesn't issue device asserts)", +) diff --git a/MLPY/Lib/site-packages/torch/_inductor/ir.py b/MLPY/Lib/site-packages/torch/_inductor/ir.py new file mode 100644 index 0000000000000000000000000000000000000000..8e6d7a959f9c37ed1f4eba980203a81cc6378812 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/ir.py @@ -0,0 +1,8064 @@ +import collections +import contextlib +import dataclasses +import functools +import itertools +import logging +import re +import textwrap +import traceback +from contextlib import nullcontext +from enum import Enum +from functools import partial +from typing import ( + Any, + Callable, + ClassVar, + Dict, + Iterable, + List, + Optional, + Sequence, + Set, + Tuple, + TYPE_CHECKING, + Union, +) +from unittest.mock import patch + +import sympy +from sympy import Expr, Integer + +import torch._export.serde.schema as export_schema + +import torch._logging + +import torch.fx +import torch.utils._pytree as pytree +from torch._dynamo.device_interface import get_interface_for_device +from torch._dynamo.utils import identity +from torch._export.serde.serialize import GraphModuleSerializer +from torch._higher_order_ops.auto_functionalize import can_auto_functionalize +from torch._prims_common import ( + compute_required_storage_length, + is_boolean_dtype, + is_float_dtype, + make_channels_last_strides_for, + make_contiguous_strides_for, + StrideType, +) +from torch._subclasses.fake_tensor import get_schema_info +from torch.fx.experimental.symbolic_shapes import free_unbacked_symbols, SymTypes +from torch.utils._sympy.functions import CleanDiv, FloorDiv, ModularIndexing + +from . import config, dependencies +from .codegen.common import index_prevent_reordering +from .dependencies import ( + extract_free_unbacked_symbols, + extract_input_node_reduction_ranges, + extract_read_writes, + var_builder, +) +from .ops_handler import OpCounterCSE +from .utils import ( + argsort, + cache_on_self, + convert_shape_to_inductor, + convert_shape_to_symint, + developer_warning, + get_kernel_metadata, + is_dynamic, + pad_listlike, + sympy_dot, + sympy_index_symbol, + sympy_product, + sympy_subs, +) +from .virtualized import ops, V + +if TYPE_CHECKING: + from .graph import GraphLowering + +log = logging.getLogger(__name__) +indent = functools.partial(textwrap.indent, prefix=" ") +aten = torch.ops.aten + +""" [Note: Inductor IR] + +Inductor's IR is produced by executing 'lowering' code (see lowering.py). Each +lowering is registered to a particular aten operator, and expects inputs that +correspond to the aten schema. However, in place of torch Tensor inputs, lowerings +expect Inductor TensorBox inputs. + +TensorBox IR represents torch tensors. Tensors are sometimes single objects owning +storage, and sometimes views of another Tensor's storage. Mutating tensor operations +(such as add_()) affect the underlying storage and any associated views. Other operations +(such as .t_()) update metadata about the current view but don't modify the underlying storage. + +To model this in Inductor, the IR distinguishes between TensorBox, View, StorageBox and Buffer. + +TensorBox is the top level IR construct that any lowering should produce and maps to a torch.Tensor +output from an operation. 
But just as torch.Tensors take different forms, TensorBox IR can +reference View IR or directly reference StorageBox IRs. + +Some Inductor lowerings produce new sets of 'Box'es, while others (such as .t() or other view ops) +may take an existing TensorBox and point it to a new underlying View IR. + +Tensors that directly own storage are represented as a chain of: +TensorBox -> StorageBox -> Buffer +where Buffer is a simple (1D) allocation, and StorageBox introduces the concept of a Layout. + +If you mutate the data of such a tensor, we swing the StorageBox pointer to point to a new buffer +(leaving the old buffer unmodified and functionalizing the operation). + +Tensors backed by views add one more indirection to the IR. +TensorBox -> View -> StorageBox -> Buffer +In these cases, the underlying StorageBox/Buffer will be shared with the pre-view TensorBox. +""" + + +def validate_ir(node_or_nodes): + def _check_tensorbox(nodes): + # Could expand this to check deeper properties + # (e.g. TensorBox points to View or StorageBox) + if isinstance(nodes, (list, tuple)): + for node in nodes: + _check_tensorbox(node) + elif isinstance(nodes, dict): + for node in nodes.values(): + _check_tensorbox(node) + else: + assert isinstance( + nodes, + ( + torch._inductor.ir.ExpandView, + DynamicScalar, + AssertScalar, + TensorBox, + sympy.logic.boolalg.Boolean, + Expr, + ), + ), f"Found {type(nodes)}, which is not a supported top level IR node. See [Note: Inductor IR]" + + # Be picky about the accepted data structure (don't use pytree here) + _check_tensorbox(node_or_nodes) + + +def ops_wrapper(name): + assert isinstance(name, str) + + def fn(*args, **kwargs): + return getattr(ops, name)(*args, **kwargs) + + return fn + + +def inverse_reorder(order): + inv_order = dict(zip(order, range(len(order)))) + + def reindex(index): + assert len(index) == len(inv_order) + return [index[inv_order[i]] for i in range(len(index))] + + return reindex + + +def same_reorder(order): + def reindex(index): + assert len(index) == len(order) + return [index[order[i]] for i in range(len(index))] + + return reindex + + +def fuse_reindexing(reindex1, reindex2): + def reindex(index): + return reindex1(reindex2(index)) + + return reindex + + +NHWC_STRIDE_ORDER = [3, 0, 2, 1] + + +def stride_order2fill_order(order): + """ + Convert stride order to fill order + For channel last format, + stride order = [3, 0, 2, 1] and fill order = [1, 3, 2, 0] + """ + lookup = {pos: idx for idx, pos in enumerate(order)} + fill_order = [lookup[i] for i in range(len(order))] + return fill_order + + +def get_stride_order(seq: Sequence[int]) -> List[int]: + """ + Convert strides to stride order + """ + sorted_idx: List[int] = argsort(seq) + out = [0 for _ in range(len(seq))] + for i, elem in enumerate(sorted_idx): + out[elem] = i + return out + + +def ir_node_to_tensor(x, guard_shape=True): + if x is None: + return None + + shape_fn: Callable[[Expr], Union[int, Expr]] + if not guard_shape: + shape_fn = V.graph.sizevars.size_hint + else: + shape_fn = identity + size = [shape_fn(s) for s in x.get_size()] + stride: StrideType + if is_storage_and_layout(x): + stride = [shape_fn(s) for s in x.get_layout().stride] # type: ignore[misc] + else: + stride = make_contiguous_strides_for(size) # type: ignore[arg-type] + dtype = x.get_dtype() + device = x.get_device() + size = convert_shape_to_symint(size) + stride = convert_shape_to_symint(stride) + t = torch.empty_strided( + size=size, stride=stride, dtype=dtype, device=device + ).zero_() + return t + + +def 
may_convert_to_optional(value): + if isinstance(value, list) and not value: + # [None] makes sure the cpp wrapper codegen will generate something like + # {c10::nullopt} instead of {} + return [None] + return value + + +def get_device_type(x): + if getattr(x, "get_device", None): + return get_device_type(x.get_device()) + if isinstance(x, torch.device): + return x.type + return None + + +def is_triton(x): + return get_device_type(x) == "cuda" + + +def is_cpu(x): + return get_device_type(x) == "cpu" + + +class IRNode: + _current_origins: ClassVar[Set[Any]] = set() + + @staticmethod + @contextlib.contextmanager + def current_origins(origins: Set[torch.fx.Node]): + old = IRNode._current_origins + IRNode._current_origins = old | origins + try: + yield + finally: + IRNode._current_origins = old + + def __post_init__(self): + self.origins = set(self._current_origins) + self.traceback = traceback.format_stack() if config.debug_ir_traceback else None + + def get_traceback(self): + return self.traceback + + def common_repr(self): + origins = f"origins={getattr(self, 'origins', '')}" + if len(origins) > 64: + # this can get *very* long + origins = f"{origins[:61]}..." + return [origins] + + def str_helper(self, lines): + lines = lines + self.common_repr() + lines = indent(",\n".join(map(str, lines))) + return f"{type(self).__name__}(\n{lines}\n)" + + def is_user_of(self, name): + return name in self.get_read_names() + + @cache_on_self + def get_read_names(self): + return {dep.name for dep in self.get_reads()} + + def get_dtype(self): + return self.dtype + + def get_layout(self): + raise NotImplementedError(f"get_layout() is not implemented by {type(self)}!") + + def get_size(self): + raise NotImplementedError(f"get_size() is not implemented by {type(self)}!") + + def get_numel(self): + return sympy_product(self.get_size()) + + def is_zero_elements(self): + return V.graph.sizevars.is_expr_static_and_true(sympy.Eq(self.get_numel(), 0)) # type: ignore[arg-type] + + def realize(self): + """ + If the IRNode refers to data which has not been materialized (e.g., + it is a Pointwise/Reduction that could potentially have more + compute fused into it), realize the IRNode into physical memory, + ending the possibility of fusing into it, but allowing, e.g., multiple + users to access the data without having to recompute. + + Check StorageBox.realize for a particularly notable implementation. + + TODO(ezyang): I think, in principle, every IRNode should have an + implementation of this, and most of the time no-op is OK, but you + really do have to audit each IRNode for this, so for now, raise + an error if it's not implemented. Note that some code in graph.py + will catch this thrown error and suppress it with a warning. + """ + raise NotImplementedError(f"realize NYI on {type(self)}") + + def codegen_reference(self, writer=None): + raise NotImplementedError(f"codegen_reference NYI on {type(self)}") + + # The abstract method declarations below serve to convince mypy that all IRNode instances have these functions + # defined, while having no effect at runtime. We cannot create stub implementations here because other parts of + # the code dynamically check for defined attributes. 
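+    # Concrete subclasses seen below (e.g. Loops, BaseView, ReinterpretView) supply the
+    # subset of these attributes/methods that they actually support at runtime.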
+ get_device: Callable[[], torch.device] + dtype: torch.dtype + get_name: Callable[[], str] + get_reads: Callable[[], Any] + get_stride: Callable[[], Any] + get_storage_numel: Callable[[], Any] + has_exceeded_max_reads: Callable[[], bool] + make_loader: Callable[[], Callable[[Any], Any]] + make_indexer: Callable[[], Callable[[Any], Any]] + mark_reuse: Callable[[int], None] + realize_hint: Callable[[], None] + get_unbacked_symbol_uses: Callable[[], Set[sympy.Symbol]] + + +@dataclasses.dataclass +class Loops(IRNode): + device: torch.device + dtype: torch.dtype + inner_fn: Callable[..., Any] + ranges: List[Expr] + + def get_unbacked_symbol_uses(self) -> Set[sympy.Symbol]: + return set().union( + *(free_unbacked_symbols(e) for e in self.ranges), + self.inner_fn_free_unbacked_symbols(), + ) + + def __str__(self, names=("ranges",)): + return self.str_helper( + [ + f"'{self.device.type}'", + str(self.dtype), + self.inner_fn_str(), + ] + + [f"{name}={getattr(self, name)}" for name in names] + + [f"origin_node={self.origin_node!r}"] + ) + + def __post_init__(self): + super().__post_init__() + self.origin_node = None + + __repr__ = __str__ + + def get_device(self): + return self.device + + def get_origin_node(self): + return self.origin_node + + def get_size(self): + return self.ranges + + def get_pointwise_size(self): + return self.ranges + + def is_extern(self): + return False + + @classmethod + def create(cls, *args, **kwargs): + origin_node = kwargs.pop("origin_node", None) + tb = kwargs.pop("traceback", None) + r = cls(*args, **kwargs) + r.origin_node = origin_node + r.traceback = ( + tb or traceback.format_stack() if config.debug_ir_traceback else None + ) + return TensorBox.create(r) + + @staticmethod + def _index(ranges, prefix="i"): + return [ + sympy.Integer(0) if s == 1 else sympy_index_symbol(f"{prefix}{n}") + for n, s in enumerate(ranges) + ] + + @cache_on_self + def inner_fn_opcount(self): + from .ir import FlexibleLayout + + opcounter = OpCounterCSE(V.MockHandler()) + + with V.set_ops_handler(opcounter), patch.object( + FlexibleLayout, "allow_indexing", True + ): + result = self.inner_fn(*self.inner_fn_args()) + return opcounter.op_count + + def inner_fn_args(self): + return (self._index(self.ranges),) + + def inner_fn_str(self): + return V.KernelFormatterHandler.ir_to_string( + self.inner_fn, *self.inner_fn_args() + ) + + def has_large_inner_fn(self): + return self.inner_fn_opcount() > config.realize_opcount_threshold + + def inner_fn_free_unbacked_symbols(self): + index = self._index(self.ranges) + return extract_free_unbacked_symbols(self.inner_fn, index) + + def get_reads(self): + with patch.object(FlexibleLayout, "allow_indexing", True): + if self.get_reduction_type(): + return extract_read_writes( + self.make_loader(), + self.get_size(), + self.get_reduction_size(), + ).reads + else: + return extract_read_writes( + self.make_loader(), + self.get_size(), + ).reads + + def get_reduction_size(self): + raise NotImplementedError( + f"get_reduction_size() is not implemented by {type(self)}!" + ) + + def get_reduction_type(self): + raise NotImplementedError( + f"get_reduction_type() is not implemented by {type(self)}!" + ) + + def constant_to_device(self, device): + raise NotImplementedError( + f"constant_to_device() is not implemented by {type(self)}!" 
+ ) + + +def nop_loader_fn(idx, *, dtype): + if dtype.is_floating_point: + return ops.constant(float("nan"), dtype) + else: + return ops.constant(0, dtype) + + +class Pointwise(Loops): + def make_loader(self): + # Make zero-element loops into a no-op + if self.is_zero_elements(): + return partial(nop_loader_fn, dtype=self.dtype) + + return self.inner_fn + + def get_reduction_size(self): + return [] + + def get_reduction_type(self): + return None + + def store_output(self, output_name, indexer, vars): + loader = self.make_loader() + return ops.store(output_name, indexer(vars), loader(vars)) + + def constant_to_device(self, device): + """Move this to a given device. Requires that all reads are to constants.""" + loader = self.make_loader() + loader = patch.object(ConstantBuffer, "override_device", device)(loader) + return Pointwise(device, self.dtype, loader, self.ranges) + + +@dataclasses.dataclass +class Scatter(Pointwise): + output_indexer: Callable[[List[Expr]], Expr] + scatter_mode: Optional[str] = None + + def constant_to_device(self, device): + """Move this to a given device. Requires that all reads are to constants.""" + loader = self.make_loader() + loader = patch.object(ConstantBuffer, "override_device", device)(loader) + return Scatter( + device, + self.dtype, + loader, + self.ranges, + self.output_indexer, + self.scatter_mode, + ) + + def store_output(self, output_name, indexer, vars): + loader = self.make_loader() + return ops.store( + output_name, + indexer(self.output_indexer(vars)), + loader(vars), + mode=self.scatter_mode, + ) + + +class ReductionHint(Enum): + INNER = 0 + OUTER = 1 + OUTER_TINY = 2 + DEFAULT = 3 + + +class TileHint(Enum): + SQUARE = 0 + DEFAULT = 1 + + +REDUCTION_COMBINE_FN = { + "any": ops_wrapper("logical_or"), + "max": ops_wrapper("maximum"), + "min": ops_wrapper("minimum"), + "prod": ops_wrapper("mul"), + "sum": ops_wrapper("add"), + "xor_sum": ops_wrapper("bitwise_xor"), +} + + +def get_reduction_combine_fn(reduction_type, dtype): + if reduction_type in REDUCTION_COMBINE_FN: + combine_fn = REDUCTION_COMBINE_FN[reduction_type] + elif reduction_type in {"argmax", "argmin"}: + + def combine_fn(a, b): + a_value, a_index = a + b_value, b_index = b + + if reduction_type == "argmin": + mask = ops.lt(a_value, b_value) + else: + mask = ops.gt(a_value, b_value) + + equal = ops.eq(a_value, b_value) + if is_float_dtype(dtype): + a_isnan = ops.ne(a_value, a_value) + b_isnan = ops.ne(b_value, b_value) + mask = ops.logical_or(mask, ops.gt(a_isnan, b_isnan)) + equal = ops.logical_or(equal, ops.logical_and(a_isnan, b_isnan)) + + mask = ops.logical_or( + mask, ops.logical_and(equal, ops.lt(a_index, b_index)) + ) + return ( + ops.where(mask, a_value, b_value), + ops.where(mask, a_index, b_index), + ) + + elif reduction_type == "welford_combine": + + def combine_fn(a, b): + a_mean, a_m2, a_weight = a + b_mean, b_m2, b_weight = b + + delta = b_mean - a_mean + new_weight = a_weight + b_weight + w2_over_w = b_weight / new_weight + return ( + a_mean + delta * w2_over_w, + a_m2 + b_m2 + delta * delta * a_weight * w2_over_w, + new_weight, + ) + + else: + raise NotImplementedError(f"unknown reduction_type={reduction_type}") + + return combine_fn + + +@dataclasses.dataclass +class Reduction(Loops): + reduction_ranges: List[Expr] + reduction_type: str + # self.dtype represents the dst dtype + src_dtype: torch.dtype + reduction_hint: ReductionHint + + def __str__(self): + return Loops.__str__( # type: ignore[call-arg] + self, names=("ranges", "reduction_ranges", "reduction_type") + 
) + + def __repr__(self): + return self.__str__() + + def get_unbacked_symbol_uses(self) -> Set[sympy.Symbol]: + return super().get_unbacked_symbol_uses() | set().union( + *(free_unbacked_symbols(e) for e in self.reduction_ranges) + ) + + def get_reduction_size(self): + return self.reduction_ranges + + def get_reduction_type(self): + return self.reduction_type + + def store_reduction(self, output_name, indexer, vars, reduction_vars): + value = ops.reduction( + self.dtype, + self.src_dtype, + self.reduction_type, + self.inner_fn(vars, reduction_vars), + ) + return ops.store_reduction(output_name, indexer(vars), value) + + def index_length(self): + return len(self.ranges) + len(self.reduction_ranges) + + def inner_fn_args(self): + index = self._index(self.ranges) + rindex = self._index(self.reduction_ranges, "r") + return (index, rindex) + + def inner_fn_free_unbacked_symbols(self): + index = self._index(self.ranges) + rindex = self._index(self.reduction_ranges, "r") + return extract_free_unbacked_symbols(self.inner_fn, index, rindex) + + def constant_to_device(self, device): + """Move this to a given device. Requires that all reads are to constants.""" + loader = self.make_loader() + loader = patch.object(ConstantBuffer, "override_device", device)(loader) + return Reduction( + device, + self.dtype, + loader, + self.ranges, + self.reduction_ranges, + self.reduction_type, + self.src_dtype, + ReductionHint.DEFAULT, + ) + + @staticmethod + def num_splits( + device, + dst_dtype, + src_dtype, + inner_fn, + ranges, + reduction_ranges, + reduction_type, + reduction_numel, + input_node: Optional[IRNode] = None, + ): + def _is_static(x): + return isinstance(x, (int, sympy.Integer)) + + reduction_numel_hint = V.graph.sizevars.symbolic_hint(reduction_numel) + numel_hint = V.graph.sizevars.symbolic_hint(sympy_product(ranges)) + + should_split = ( + is_triton(device) + and reduction_type + not in { + "argmax", + "argmin", + } + and config.split_reductions + # We don't support unbacked symints + and _is_static(reduction_numel_hint) + and _is_static(numel_hint) + ) + if not should_split: + return ReductionHint.DEFAULT, 1 + + device_interface = get_interface_for_device(get_device_type(device)) + num_sm = device_interface.Worker.get_device_properties( + device + ).multi_processor_count + min_elements_per_thread = 32 + max_elements_per_thread = 512 + threads_per_sm = 2048 + min_elements_per_device = min_elements_per_thread * num_sm * threads_per_sm + max_elements_per_device = max_elements_per_thread * num_sm * threads_per_sm + + def inner_reduction_splits(reduction_numel_hint, numel_hint): + # do heuristics that's close to eager mode for split inner reduction + # we leak reduction autotune configs here, and will need to refactor to avoid this later + num_warps = 8 + num_threads = 32 * num_warps + if numel_hint >= 2 * num_sm: # don't split if there are enough outputs + return 1 + if reduction_numel_hint <= 8192: + return 1 + if reduction_numel_hint * numel_hint <= min_elements_per_device: + split_size = min_elements_per_thread + elif reduction_numel_hint * numel_hint < max_elements_per_device: + target_blocks = num_sm * threads_per_sm // (2 * num_threads) + blocks_per_output = (target_blocks + numel_hint - 1) // numel_hint + tmp_split_size = ( + reduction_numel_hint + num_threads * blocks_per_output - 1 + ) // (num_threads * blocks_per_output) + divisors = sympy.divisors(reduction_numel_hint) + closest = min(divisors, key=lambda x: abs(x - tmp_split_size)) + if abs(closest - tmp_split_size) < 30: + # prefer 
even splits, but never smalle than min_elements_per_thread + split_size = max(closest, min_elements_per_thread) + else: + split_size = tmp_split_size + else: + divisors = sympy.divisors(reduction_numel_hint) + closest = min(divisors, key=lambda x: abs(x - max_elements_per_thread)) + if abs(closest - max_elements_per_thread) < 50: + # prefer even splits + split_size = closest + else: + split_size = max_elements_per_thread + return (reduction_numel_hint + split_size * num_threads - 1) // ( + split_size * num_threads + ) + + def outer_reduction_splits(reduction_numel_hint, numel_hint): + # TODO the best heuristic currently has XBLOCK (corresponding to numel_hint) 128 + # extend to even smaller number of outputs + num_warps = 8 + num_threads = num_warps * 32 + rvals_per_thread = 4 # comes from heuristics, refactor to not leak here + xvals_per_block = 128 + xblocks = (numel_hint + xvals_per_block - 1) // xvals_per_block + if reduction_numel_hint * numel_hint < min_elements_per_device: + split_size = min_elements_per_thread + elif reduction_numel_hint * numel_hint < max_elements_per_device: + target_blocks = num_sm * threads_per_sm // (num_threads) + target_blocks = (target_blocks + xblocks - 1) // xblocks + tmp_split_size = ( + reduction_numel_hint + rvals_per_thread * target_blocks - 1 + ) // (rvals_per_thread * target_blocks) + divisors = sympy.divisors(reduction_numel_hint) + closest = min(divisors, key=lambda x: abs(x - tmp_split_size)) + if abs(tmp_split_size - closest) < 20: + split_size = max(closest, min_elements_per_thread) + else: + split_size = tmp_split_size + else: + divisors = sympy.divisors(reduction_numel_hint) + closest = min(divisors, key=lambda x: abs(x - max_elements_per_thread)) + if abs(closest - max_elements_per_thread) < 50: + # prefer even splits + split_size = closest + else: + split_size = max_elements_per_thread + + return (reduction_numel_hint + rvals_per_thread * split_size - 1) // ( + rvals_per_thread * split_size + ) + + # easy cases + if numel_hint == 1: + split = inner_reduction_splits(reduction_numel_hint, numel_hint) + if split == 1: + # No need to split. + return ReductionHint.INNER, split + if ( + len(ranges) == 0 + and input_node is not None + and isinstance(input_node, TensorBox) + ): + # Only handles the case where keep_dim = False. + # Otherwise, we need to propagate reduction dim info to the stage where + # the intermediate loader of the first Reduction is generated. + new_ranges, new_reduction_ranges = extract_input_node_reduction_ranges( + input_node + ) + if new_ranges is not None and new_reduction_ranges is not None: + extracted_numel_hint = V.graph.sizevars.symbolic_hint( + sympy_product(new_ranges + new_reduction_ranges) + ) + if reduction_numel_hint == extracted_numel_hint: + log.debug( + "Use previous IRNode's range and reduction_ranges instead of split. " + "current ranges: %s, current reduction ranges: %s, current split: %d, " + "new ranges: %s, new reduction ranges: %s", + ranges, + reduction_ranges, + split, + new_ranges, + new_reduction_ranges, + ) + # If the input_node or its dependent nodes are also Reduction nodes, + # use reduction_sizes of this node or its dependent nodes directly. 
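+                        # A split value of -1 is a sentinel: Reduction.create() routes it to
+                        # create_multilayer_existing_ranges(), which reuses the producer's ranges
+                        # instead of computing a fresh split.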
+ return ReductionHint.INNER, -1 + return ReductionHint.INNER, split + if ( + reduction_numel_hint <= min_elements_per_thread + or numel_hint >= num_sm * 2 * 32 + ): + return ReductionHint.DEFAULT, 1 + + r = Reduction( + device, + dst_dtype, + inner_fn, + ranges, + reduction_ranges, + reduction_type, + src_dtype, + ReductionHint.DEFAULT, + ) + + def get_read_indices(r): + cb = ComputedBuffer( + name=None, + layout=FlexibleLayout( + device=r.get_device(), + dtype=r.get_dtype(), + size=r.get_size(), + ), + data=r, + ) + read_writes = cb.get_read_writes() + # try finding the full size producer + # TODO this will fail for something like ((1, N) * (N, 1)).sum() + # this would also possibly be wrong for producers with the different contiguity but we hope those cases are rare + range_vars = [ + r + for r in read_writes.range_vars + if isinstance(r, sympy.Expr) and not isinstance(r, sympy.Number) + ] + indices = [] + changed = False + for md in sorted(read_writes.reads, key=lambda x: x.name): + if all(r in md.index.free_symbols for r in range_vars): + indices.append(md.index) + if md.name in V.graph.name_to_buffer: + buf = V.graph.name_to_buffer[md.name] + original_stride = buf.layout.stride + buf.decide_layout() + if buf.layout.stride != original_stride: + changed = True + return indices, changed + + indices, changed = get_read_indices(r) + if changed: + indices, _ = get_read_indices(r) + + if len(indices) == 0: + # TODO determine splits when all inputs are broadcast + return ReductionHint.DEFAULT, 1 + + (_, reduction_vars), ranges = dependencies.index_vars_squeeze( + r.get_size(), r.get_reduction_size() + ) + num_outer = 0 + num_inner = 0 + for i in indices: + i = V.graph.sizevars.simplify_with_ranges(i, ranges) + strides = V.graph.sizevars.stride_hints(i, reduction_vars, ranges.keys()) + outer = all(s > 1 for s in strides) + if outer: + num_outer += 1 + else: + num_inner += 1 + if num_inner > num_outer: + return ReductionHint.INNER, inner_reduction_splits( + reduction_numel_hint, numel_hint + ) + else: + return ReductionHint.OUTER, outer_reduction_splits( + reduction_numel_hint, numel_hint + ) + + @staticmethod + def _unroll_reduction_fn(inner_fn, reduction_ranges, reduction_type, src_dtype): + """Convert inner_fn from a reduction to an pointwise""" + reduction_ranges = [ + V.graph.sizevars.evaluate_static_shape(x) for x in reduction_ranges + ] + + combine_fn = get_reduction_combine_fn(reduction_type, src_dtype) + + def fn(index): + return functools.reduce( + combine_fn, + ( + value_fn(index, rindex) + for rindex in itertools.product( + *[range(x) for x in reduction_ranges] + ) + ), + ) + + if reduction_type in ("argmin", "argmax"): + flatten_index = FixedLayout( + None, # type: ignore[arg-type] + None, # type: ignore[arg-type] + reduction_ranges, + FlexibleLayout.contiguous_strides(reduction_ranges), + ).make_indexer() + + def value_fn(index, rindex): + rindex = [sympy.expand(i) for i in rindex] + return ( + inner_fn(index, rindex), + ops.index_expr(flatten_index(rindex), torch.int64), + ) + + return lambda index: fn(index)[1] + else: + value_fn = inner_fn + return fn + + @classmethod + def create( # type: ignore[override] + cls, + device: torch.device, + dst_dtype: torch.dtype, + src_dtype: torch.dtype, + inner_fn: Callable[..., Any], + ranges: List[Expr], + reduction_ranges: List[Expr], + reduction_type: str, + reduction_hint: ReductionHint = ReductionHint.DEFAULT, + input_node: Optional[IRNode] = None, + ): + reduction_numel = V.graph.sizevars.simplify(sympy_product(reduction_ranges)) + + 
if reduction_numel == 0: + # N.B. This is a hack to generate the literal of the given type + # Ideally, we should be fixing `def constant` in triton.py + # but it breaks due to hardcoded dtypes in other places + def py_cnst(val): + return ( + bool(val) + if dst_dtype == torch.bool + else float(val) + if dst_dtype.is_floating_point + else int(val) + ) + + rtypes_to_inits = { + "sum": py_cnst(0), + "xor_sum": py_cnst(0), + "prod": py_cnst(1), + "any": py_cnst(0), + # "all" is desugared to `!any(!val)` + } + + assert ( + reduction_type in rtypes_to_inits.keys() + ), f"{reduction_type} not supported for zero-dimension tensors!" + + def const_fn(index): + return ops.constant(rtypes_to_inits[reduction_type], dst_dtype) + + return Pointwise.create( + device=device, + dtype=src_dtype, + inner_fn=const_fn, + ranges=list(ranges), + ) + + if reduction_numel == 1: + # this reduction is actually a pointwise op + if reduction_type in ("argmin", "argmax"): + + def fn(index): + return ops.constant(0, dst_dtype) + + else: + + def fn(index): + reduction_index = [sympy.Integer(0) for _ in reduction_ranges] + return inner_fn(index, reduction_index) + + return Pointwise.create(device, dst_dtype, fn, ranges) + + if ( + isinstance(reduction_numel, sympy.Integer) + and V.graph.sizevars.size_hint(reduction_numel) + < config.unroll_reductions_threshold + and sympy_product(ranges) != 1 + ): + return Pointwise.create( + device, + dst_dtype, + cls._unroll_reduction_fn( + inner_fn, reduction_ranges, reduction_type, src_dtype + ), + ranges, + ) + + # triton doesn't support reduce to single element well, so break it up + hint, split = cls.num_splits( + device, + dst_dtype, + src_dtype, + inner_fn, + ranges, + reduction_ranges, + reduction_type, + reduction_numel, + input_node, + ) + # intermediate reduction in split can contain complex indexing, + # and num_splits will fail to correctly set the hint + # reuse the passed hint if available + if reduction_hint == ReductionHint.DEFAULT: + reduction_hint = hint + if split == -1: + assert input_node is not None + new_ranges, new_reduction_ranges = extract_input_node_reduction_ranges( + input_node # type: ignore[arg-type] + ) + assert new_ranges is not None + assert new_reduction_ranges is not None + return cls.create_multilayer_existing_ranges( + device, + dst_dtype, + src_dtype, + inner_fn, + ranges, + reduction_ranges, + new_ranges, + new_reduction_ranges, + reduction_type, + reduction_hint, + ) + elif split > 1: + # triton doesn't support reduce to single element well, so break it up + return cls.create_multilayer( + device, + dst_dtype, + src_dtype, + inner_fn, + ranges, + reduction_ranges, + reduction_type, + split, + reduction_hint, + ) + + return TensorBox.create( + Reduction( + device, + dst_dtype, + inner_fn, + ranges, + reduction_ranges, + reduction_type, + src_dtype, + reduction_hint, + ) + ) + + @staticmethod + def default_accumulator(reduction_type, dtype): + if reduction_type in {"max", "argmax"}: + if is_float_dtype(dtype): + return float("-inf") + elif is_boolean_dtype(dtype): + return 0 + else: + return torch.iinfo(dtype).min + if reduction_type in {"min", "argmin"}: + if is_float_dtype(dtype): + return float("inf") + elif is_boolean_dtype(dtype): + return 1 + else: + return torch.iinfo(dtype).max + + return { + "sum": 0, + "prod": 1, + "xor_sum": 0, + "any": 0, + "welford_reduce": (0, 0, 0), + "welford_combine": (0, 0, 0), + }[reduction_type] + + @staticmethod + def default_value(reduction_type, dtype): + if reduction_type == "welford_reduce": + return 0 + 
return Reduction.default_accumulator(reduction_type, dtype) + + @staticmethod + def _multilayer_second_step_hint( + split: int, numel_hint: int, reduction_hint: ReductionHint + ) -> ReductionHint: + if split == -1: + return reduction_hint + if split <= 512 and numel_hint <= 512 and reduction_hint == ReductionHint.OUTER: + return ReductionHint.OUTER_TINY + if ( + split <= 1024 + and numel_hint <= 256 + and reduction_hint == ReductionHint.OUTER + ): + return ReductionHint.OUTER_TINY + + return reduction_hint + + @classmethod + def _multilayer_wrap_loader( + cls, + loader, + reduction_ranges, + reduction_numel, + split, + block_size, + default, + ): + reindex = View.dynamic_reshape_indexer(reduction_ranges, [reduction_numel]) + need_mask = not V.graph.sizevars.is_expr_static_and_true( + sympy.Eq(reduction_numel % split, 0) # type: ignore[arg-type] + ) + + def wrapper_fn(index, reduction_index): + (reduction_index,) = reduction_index + *new_index, reduction_block = index + indices = block_size * reduction_block + reduction_index + + def body(): + return loader(new_index, reindex([indices])) + + if need_mask: + mask = ops.lt( + ops.index_expr(indices, torch.int32), + ops.index_expr(reduction_numel, torch.int32), + ) + return ops.masked(mask, body, default) + else: + return body() + + return wrapper_fn + + @classmethod + def _multilayer_wrap_loader_existing_ranges( + cls, + loader, + original_ranges, + original_reduction_ranges, + new_ranges, + new_reduction_ranges, + default, + ): + assert len(original_ranges) == 0, f"{original_ranges}= is not equal to []" + reindex = View.dynamic_reshape_indexer( + original_reduction_ranges, tuple(new_ranges) + tuple(new_reduction_ranges) + ) + + def wrapper_fn(index, reduction_index): + return loader([], reindex(tuple(index) + tuple(reduction_index))) + + return wrapper_fn + + @classmethod + def create_multilayer_helper( + cls, + device: torch.device, + dst_dtype: torch.dtype, + src_dtype: torch.dtype, + wrapper_fn: Callable[..., Any], + original_ranges: List[Expr], + original_reduction_ranges: List[Expr], + new_ranges: List[Expr], + new_reduction_ranges: List[Expr], + reduction_type: str, + split: int, + reduction_hint: ReductionHint, + ): + """ + Break a large reduction up into multiple smaller reductions + recursively + """ + # triton will automatically compute reductions in fp32 if reducing over fp16/bf16 + # within the kernel. 
keep the intermediate in fp32 so as to keep the whole reduction + # in fp32 and not reduce precision by breaking up the kernel into multiple layers + intermediate_dtype = ( + dst_dtype + if dst_dtype not in (torch.float16, torch.bfloat16) + else torch.float + ) + intermediate = Reduction.create( + device, + intermediate_dtype, + src_dtype, + wrapper_fn, + new_ranges, + new_reduction_ranges, + reduction_type, + reduction_hint, + ) + intermediate.realize() + intermediate_loader = intermediate.make_loader() + + def intermediate_fn(index, reduction_index): + return intermediate_loader([*index, *reduction_index]) + + numel_hint = V.graph.sizevars.size_hint(sympy_product(original_ranges)) + reduction_hint = cls._multilayer_second_step_hint( + split, numel_hint, reduction_hint + ) + + assert original_ranges == new_ranges[: len(original_ranges)] + return TensorBox.create( + Reduction( + device, + dst_dtype, + intermediate_fn, + original_ranges, + new_ranges[len(original_ranges) :], + reduction_type, + src_dtype, + reduction_hint, + ) + ) + + @classmethod + def create_multilayer( + cls, + device: torch.device, + dst_dtype: torch.dtype, + src_dtype: torch.dtype, + inner_fn: Callable[..., Any], + ranges: List[Expr], + reduction_ranges: List[Expr], + reduction_type: str, + split: int, + reduction_hint: ReductionHint, + ): + """ + Break a large reduction up into multiple smaller reductions + recursively + """ + # TODO(jansel): realize the reduction so we can do dynamic indexing + reduction_numel = sympy_product(reduction_ranges) + block_size = FloorDiv(reduction_numel + (split - 1), split) + default = cls.default_value(reduction_type, dst_dtype) + wrapper_fn = cls._multilayer_wrap_loader( + inner_fn, reduction_ranges, reduction_numel, split, block_size, default + ) + + return cls.create_multilayer_helper( + device, + dst_dtype, + src_dtype, + wrapper_fn, + ranges, + reduction_ranges, + [*ranges, split], # type: ignore[list-item] + [block_size], + reduction_type, + split, + reduction_hint, + ) + + @classmethod + def create_multilayer_existing_ranges( + cls, + device: torch.device, + dst_dtype: torch.dtype, + src_dtype: torch.dtype, + inner_fn: Callable[..., Any], + original_ranges: List[Expr], + original_reduction_ranges: List[Expr], + new_ranges: List[Expr], + new_reduction_ranges: List[Expr], + reduction_type: str, + reduction_hint: ReductionHint, + ): + """ + Break a large reduction up into multiple smaller reductions + recursively + """ + default = cls.default_value(reduction_type, dst_dtype) + wrapper_fn = cls._multilayer_wrap_loader_existing_ranges( + inner_fn, + original_ranges, + original_reduction_ranges, + new_ranges, + new_reduction_ranges, + default, + ) + return cls.create_multilayer_helper( + device, + dst_dtype, + src_dtype, + wrapper_fn, + original_ranges, + original_reduction_ranges, + new_ranges, + new_reduction_ranges, + reduction_type, + -1, + reduction_hint, + ) + + +def num_reduction_outputs(reduction_type): + return 3 if "welford" in reduction_type else 1 + + +class WelfordReduction(Reduction): + output_index: int + + def __init__( + self, + device, + dtype, + inner_fns, + ranges, + reduction_ranges, + reduction_type, + reduction_hint, + output_index, + ): + if len(inner_fns) == 1: + loader = inner_fns[0] + else: + + def loader(idx, reduction_idx): + return tuple(fn(idx, reduction_idx) for fn in inner_fns) + + super().__init__( + device, + dtype, + loader, + ranges, + reduction_ranges, + reduction_type, + dtype, + reduction_hint, + ) + self.output_index = output_index + + def 
store_reduction(self, output_name, indexer, vars, reduction_vars): + values = ops.reduction( + self.dtype, + self.src_dtype, + self.reduction_type, + self.inner_fn(vars, reduction_vars), + ) + value = values[self.output_index] + return ops.store_reduction(output_name, indexer(vars), value) + + @classmethod + def create( # type: ignore[override] + cls, + device: torch.device, + dtype: torch.dtype, + inner_fns: Sequence[Callable[..., Any]], + ranges: List[Expr], + reduction_ranges: List[Expr], + reduction_type: str, + reduction_hint: ReductionHint = ReductionHint.DEFAULT, + ): + assert reduction_type in {"welford_reduce", "welford_combine"} + + reduction_numel = V.graph.sizevars.simplify(sympy_product(reduction_ranges)) + + def const(val): + def inner_fn(idx): + return ops.constant( + val, + dtype, + ) + + return Pointwise.create( + device=device, + dtype=dtype, + inner_fn=inner_fn, + ranges=list(ranges), + ) + + if reduction_numel == 0: + mean = const(0) + m2 = const(0) + weight = const(0) + return mean, m2, weight + + if reduction_numel == 1: + + def copy(loader): + def inner_fn(idx): + reduction_index = [sympy.Integer(0) for _ in reduction_ranges] + return loader(idx, reduction_index) + + return Pointwise.create( + device=device, + dtype=dtype, + inner_fn=inner_fn, + ranges=list(ranges), + ) + + if reduction_type == "welford_reduce": + return copy(inner_fns[0]), const(0), const(1) + else: + return tuple(copy(fn) for fn in inner_fns) + + # TODO: Unrolled reduction + # if ( + # isinstance(reduction_numel, sympy.Integer) + # and V.graph.sizevars.size_hint(reduction_numel) + # < config.unroll_reductions_threshold + # and sympy_product(ranges) != 1 + # ): + # return Pointwise.create( + # device, + # dst_dtype, + # cls._unroll_reduction_fn( + # inner_fn, reduction_ranges, reduction_type, src_dtype + # ), + # ranges, + # ) + + # triton doesn't support reduce to single element well, so break it up + hint, split = Reduction.num_splits( + device, + dtype, + dtype, + inner_fns[0], + ranges, + reduction_ranges, + reduction_type=reduction_type, + reduction_numel=reduction_numel, + ) + # intermediate reduction in split can contain complex indexing, + # and num_splits will fail to correctly set the hint + # reuse the passed hint if available + if reduction_hint == ReductionHint.DEFAULT: + reduction_hint = hint + if split > 1: + # triton doesn't support reduce to single element well, so break it up + return cls.create_multilayer( + device, + dtype, + inner_fns, + ranges, + reduction_ranges, + reduction_type, + split, + reduction_hint, + ) + + results = [ + TensorBox.create( + WelfordReduction( + device, + dtype, + inner_fns, + ranges, + reduction_ranges, + reduction_type, + reduction_hint, + output_idx, + ) + ) + for output_idx in range(3) + ] + for t in results: + t.realize() + return results + + @staticmethod + def default_value(reduction_type, dtype): + return (0, 0, 0) + + @classmethod + def create_multilayer( # type: ignore[override] + cls, + device: torch.device, + dtype: torch.dtype, + inner_fns: Sequence[Callable[..., Any]], + ranges: List[Expr], + reduction_ranges: List[Expr], + reduction_type: str, + split: int, + reduction_hint: ReductionHint, + ): + """ + Break a large reduction up into multiple smaller reductions + recursively + """ + reduction_numel = sympy_product(reduction_ranges) + need_mask = not V.graph.sizevars.is_expr_static_and_true( + sympy.Eq(reduction_numel % split, 0) # type: ignore[arg-type] + ) + + if need_mask and reduction_type != "welford_combine": + # If we need mask, then 
"welford_reduce" doesn't work because + # masked inputs shouldn't count towards the welford weight + + def constant(idx, reduction_idx, value): + return ops.constant(value, dtype) + + return cls.create_multilayer( + device=device, + dtype=dtype, + inner_fns=( + inner_fns[0], + partial(constant, value=0), + partial(constant, value=1), + ), + ranges=ranges, + reduction_ranges=reduction_ranges, + reduction_type="welford_combine", + split=split, + reduction_hint=reduction_hint, + ) + + block_size = FloorDiv(reduction_numel + (split - 1), split) + intermediates = WelfordReduction.create( + device, + dtype, + tuple( + cls._multilayer_wrap_loader( + loader, + reduction_ranges, + reduction_numel, + split, + block_size, + default=0, + ) + for loader in inner_fns + ), + [*ranges, split], # type: ignore[list-item] + [block_size], + reduction_type, + reduction_hint, + ) + for i in intermediates: + i.realize() + + i_loaders = [i.make_loader() for i in intermediates] + + def intermediate_loader_fn(index, reduction_index, loader): + return loader([*index, *reduction_index]) + + numel_hint = V.graph.sizevars.size_hint(sympy_product(ranges)) + reduction_hint = cls._multilayer_second_step_hint( + split, numel_hint, reduction_hint + ) + return WelfordReduction.create( + device, + dtype, + tuple( + partial(intermediate_loader_fn, loader=i.make_loader()) + for i in intermediates + ), + ranges, + [split], # type: ignore[list-item] + # welford_reduce turns one input into three outputs, which are combined with welford_combine + "welford_combine", + reduction_hint, + ) + + +@dataclasses.dataclass +class Scan(Loops): + scan_ranges: List[Expr] + size: List[Expr] + combine_fn: Callable[..., Any] + reindex: Callable[[List[Expr], List[Expr]], List[Expr]] + reduction_hint: ReductionHint + init: int + + # HACK we mimick reduction + + def get_unbacked_symbol_uses(self) -> Set[sympy.Symbol]: + # TODO: Can combine_fn/reindex close over unbacked symbols? 
If so, we + # need to explicitly represent the closure so we can pull out unbacked + # symbols here + return ( + super().get_unbacked_symbol_uses() + | set().union(*(free_unbacked_symbols(e) for e in self.scan_ranges)) + | set().union(*(free_unbacked_symbols(e) for e in self.size)) + ) + + def __post_init__(self): + assert len(self.ranges) + len(self.scan_ranges) == len(self.size) + super().__post_init__() + + def store_reduction(self, output_name, indexer, vars, scan_vars): + idx = self.reindex(vars, scan_vars) + value = self.inner_fn(idx) + result = ops.scan(self.dtype, self.combine_fn, value, self.init) + return ops.store(output_name, indexer(idx), result) + + def get_reduction_type(self): + # return self.scan_op + return "custom" + + def get_reduction_size(self): + return self.scan_ranges + + def get_size(self): + return self.size + + def get_pointwise_size(self): + return self.ranges + + def index_length(self): + return len(self.ranges) + len(self.scan_ranges) + + def inner_fn_args(self): + index = self._index(self.ranges) + rindex = self._index(self.scan_ranges, "r") + idx = self.reindex(index, rindex) + return (idx,) + + def inner_fn_free_unbacked_symbols(self): + index = self._index(self.ranges) + rindex = self._index(self.scan_ranges, "r") + idx = self.reindex(index, rindex) + return extract_free_unbacked_symbols(self.inner_fn, idx) + + @classmethod + def create( + cls, + device: torch.device, + dtype: torch.dtype, + inner_fn: Callable[[List[Expr]], Any], + size: List[Expr], + axis: int, + combine_fn: Callable[..., Any], + init: Any, + reduction_hint: ReductionHint = ReductionHint.DEFAULT, + ) -> Optional["TensorBox"]: + pointwise_ranges = [*size[:axis], *size[axis + 1 :]] + scan_ranges = [size[axis]] + + if device.type != "cuda": + # TODO: CPU support + return None + + sizevars = V.graph.sizevars + scan_numel = sizevars.simplify(sympy_product(scan_ranges)) + + # Scan with a single element is just a copy + if sizevars.is_expr_static_and_true(sympy.Le(scan_numel, 1)): # type: ignore[arg-type] + return Pointwise.create( + device=device, + dtype=dtype, + inner_fn=inner_fn, + ranges=size, + ) + + reduction_hint, num_splits = cls.num_splits( + device=device, + dtype=dtype, + inner_fn=inner_fn, + axis=axis, + pointwise_ranges=pointwise_ranges, + scan_ranges=scan_ranges, + combine_fn=combine_fn, + scan_numel=scan_numel, + ) + scan_type = Scan if num_splits <= 1 else SplitScan + + if num_splits > 1 and torch.version.hip is not None: + # Fallback for split-scan on ROCm + return None + + def reindex(index, scan_index): + assert len(scan_index) == len(scan_ranges) + assert len(index) == len(pointwise_ranges) + return [*index[:axis], *scan_index, *index[axis:]] + + result = TensorBox.create( + scan_type( + device=device, + dtype=dtype, + inner_fn=inner_fn, + size=size, + ranges=pointwise_ranges, + scan_ranges=scan_ranges, + combine_fn=combine_fn, + reindex=reindex, + init=init, + reduction_hint=reduction_hint, + ) + ) + result.realize() + return result + + @classmethod + def num_splits( + cls, + device: torch.device, + dtype: torch.dtype, + inner_fn: Callable[[List[Expr]], Any], + axis: int, + pointwise_ranges: List[Expr], + scan_ranges: List[Expr], + combine_fn: Callable[..., Any], + scan_numel: Expr, + ): + # TODO: custom splitting heuristic for scan + def wrapper_fn(idx, reduction_idx): + return inner_fn([*idx[:axis], *reduction_idx, *idx[axis:]]) + + return Reduction.num_splits( + device=device, + dst_dtype=dtype, + src_dtype=dtype, + inner_fn=wrapper_fn, + ranges=pointwise_ranges, + 
reduction_ranges=scan_ranges, + reduction_type="sum", + reduction_numel=scan_numel, + ) + + +# This signifies a scan op that should go through TritonSplitScanKernel codgen on CUDA. +@dataclasses.dataclass +class SplitScan(Scan): + pass + + +def is_storage_and_layout(x): + try: + as_storage_and_layout(x, freeze=False) + return True + except NotImplementedError: + return False + + +def is_contiguous_storage_and_layout(x): + try: + buffer, layout = as_storage_and_layout(x, freeze=False) + return layout.is_contiguous() + except NotImplementedError: + return False + + +def as_storage_and_layout(x, freeze=True, want_contiguous=False, stride_order=None): + """Try to simplify x into a StorageBox and a Layout""" + if isinstance(x, TensorBox): + return as_storage_and_layout( + x.data, + freeze=freeze, + want_contiguous=want_contiguous, + stride_order=stride_order, + ) + if isinstance(x, StorageBox) and isinstance(x.data, Buffer): + if freeze: + if want_contiguous: + x.data.freeze_layout() + assert x.data.layout.is_contiguous() + elif stride_order is not None: + x.data.freeze_layout_with_stride_order(stride_order) + else: + x.data.decide_layout() + return x, x.data.layout + if isinstance(x, ReinterpretView): + # making the base of x contiguous or stride_ordered will not necessarily make + # the ReinterpretView either, so don't pass along those arguments + buffer, _ = as_storage_and_layout( + x.data, + freeze=freeze, + ) + return buffer, x.layout + raise NotImplementedError + + +as_contiguous_storage_and_layout = functools.partial( + as_storage_and_layout, want_contiguous=True +) + + +def is_stride_order_storage_and_layout(x, stride_order): + try: + buffer, layout = as_storage_and_layout(x, freeze=False) + return layout.is_stride_ordered(stride_order) + except NotImplementedError: + return False + + +@dataclasses.dataclass +class BaseView(IRNode): + data: IRNode + + def get_unbacked_symbol_uses(self): + return self.data.get_unbacked_symbol_uses() + + def make_reindexer(self): + raise NotImplementedError(f"make_reindexer NYI on {self}") + + def make_indexer(self): + inner = self.data.make_indexer() + reindex = self.make_reindexer() + + def indexer(idx): + return inner(reindex(idx)) + + return indexer + + def make_loader(self): + inner = self.data.make_loader() + reindex = self.make_reindexer() + + def loader(idx): + return inner(reindex(idx)) + + return loader + + @property + def dtype(self): + return self.data.dtype + + def get_layout(self): + return self.data.get_layout() + + def get_device(self): + return self.data.get_device() + + def get_origin_node(self): + return None + + def get_name(self): + return self.data.get_name() + + def get_pointwise_size(self): + return self.get_size() + + def mark_reuse(self, users): + return self.data.mark_reuse(users) + + def has_exceeded_max_reads(self): + return self.data.has_exceeded_max_reads() + + def realize(self): + return self.data.realize() + + def realize_hint(self): + return self.data.realize_hint() + + def get_storage_numel(self): + return self.data.get_storage_numel() + + def is_extern(self): + return self.data.is_extern() # type: ignore[attr-defined] + + def get_reads(self): + with patch.object(FlexibleLayout, "allow_indexing", True): + return extract_read_writes( + self.make_loader(), + self.get_size(), + ).reads + + def unwrap_view(self): + x: IRNode = self + while isinstance(x, BaseView): + x = x.data + return x + + def constant_to_device(self, device): + """Move this to a given device. 
Requires that all reads are to constants.""" + loader = self.make_loader() + loader = patch.object(ConstantBuffer, "override_device", device)(loader) + return Pointwise(device, self.get_dtype(), loader, self.get_size()) + + +@dataclasses.dataclass +class ExpandView(BaseView): + size: List[Expr] + + @staticmethod + def _normalize_size(x, new_size): + """Replace `-1` with correct sizes""" + new_size = list(map(sympy.expand, new_size)) + old_size = x.get_size() + old_size = [None] * (len(new_size) - len(old_size)) + list(old_size) + assert len(new_size) == len(old_size) + for i in range(len(new_size)): + if new_size[i] == -1: + assert old_size[i] is not None + new_size[i] = old_size[i] + elif old_size[i] is None or old_size[i] == 1: + pass + else: + # Expect broadcast compatibility + new_size[i] = V.graph.sizevars.expect_equals( + new_size[i], + old_size[i], + msg=f"Broadcast failed in ExpandView({x.get_size()}, {new_size}) on dimension {i}", + ) + return new_size + + @classmethod + def create(cls, x, new_size): + new_size = cls._normalize_size(x, new_size) + + if is_storage_and_layout(x): + storage, old_layout = as_storage_and_layout(x) + skip = len(new_size) - len(old_layout.size) + assert skip >= 0 + new_stride = [sympy.Integer(0)] * skip + for stride, size in zip(old_layout.stride, old_layout.size): + new_stride.append(stride if size != 1 else sympy.Integer(0)) + new_layout = FixedLayout( + old_layout.device, + old_layout.dtype, + list(new_size), + new_stride, + old_layout.offset, + ) + return ReinterpretView(storage, new_layout) + + return ExpandView(x, new_size) + + def get_size(self): + return self.size + + def make_reindexer(self): + target = self.get_size() + actual = self.data.get_size() + skip = len(target) - len(actual) + + def reindex(index): + index = list(index[skip:]) + assert len(index) == len(actual) + for i in range(len(actual)): + if actual[i] == 1: + # zero out broadcast dimension + index[i] = sympy.Integer(0) + return index + + return reindex + + +@dataclasses.dataclass +class PermuteView(BaseView): + dims: List[Expr] + + @classmethod + def create(cls, x, dims): + dims = cls._map_neg_dims(dims) + assert set(dims) == set(range(len(dims))) + + if is_storage_and_layout(x): + storage, old_layout = as_storage_and_layout(x) + new_layout = FixedLayout( + old_layout.device, + old_layout.dtype, + [old_layout.size[i] for i in dims], + [old_layout.stride[i] for i in dims], + old_layout.offset, + ) + return ReinterpretView(storage, new_layout) + + return PermuteView(x, dims) + + @classmethod + def _map_neg_dims(cls, dims): + return [dim if dim >= 0 else len(dims) + dim for dim in dims] + + def get_size(self): + assert set(self._map_neg_dims(self.dims)) == set(range(len(self.dims))) + size = self.data.get_size() + return [size[i] for i in self.dims] + + def make_reindexer(self): + inv = {j: i for i, j in enumerate(self.dims)} + inv = [inv[i] for i in range(len(self.dims))] # type: ignore[index] + assert set(inv) == set(range(len(self.dims))) + + def reindex(index): + return [index[i] for i in inv] + + return reindex + + +class SqueezeView(BaseView): + @classmethod + def create(cls, x, *, dim=None): + if is_storage_and_layout(x): + storage, old_layout = as_storage_and_layout(x) + new_size = [] + new_stride = [] + if dim is not None: + assert isinstance(dim, int), "expected integer dim argument" + assert 0 <= dim and dim < len(old_layout.size) + + for i, (size, stride) in enumerate(zip(old_layout.size, old_layout.stride)): + if dim is None: + if size != 1: + new_size.append(size) + 
new_stride.append(stride) + else: + if i != dim: + new_size.append(size) + new_stride.append(stride) + else: + assert size == 1, "expected squeezed size to be 1" + + new_layout = FixedLayout( + old_layout.device, + old_layout.dtype, + new_size, + new_stride, + old_layout.offset, + ) + return ReinterpretView(storage, new_layout) + + if dim is None: + # redirect to a generic view + return View.create(x, [s for s in x.get_size() if s != 1]) + else: + assert x.get_size()[dim] == 1 + return View.create(x, [s for i, s in enumerate(x.get_size()) if i != dim]) + + @staticmethod + def squeezer(size: Tuple[sympy.Expr, ...]): + new_size = [s for s in size if s != 1] + not_one = [i for i, s in enumerate(size) if s != 1] + length = len(size) + + def reindex(index: List[sympy.Expr]) -> Tuple[sympy.Expr, ...]: + assert len(index) == len(not_one), f"{index} {not_one}" + new_index = [sympy.Integer(0)] * length + for idx, s in zip(not_one, index): + new_index[idx] = s + return tuple(new_index) + + return new_size, reindex + + def __init__(self, data): + raise AssertionError("use SqueezeView.create()") + + +@dataclasses.dataclass +class GenericView(BaseView): + size: List[Expr] + reindex: Callable[..., Any] + + def make_reindexer(self): + return self.reindex + + def reindex_str(self): + index_old = [sympy_index_symbol(f"i{n}") for n in range(len(self.size))] + index_new = list(self.reindex(index_old)) + return f"lambda {', '.join(map(str, index_old))}: {index_new}" + + def __str__(self): + return self.str_helper( + [self.data, f"size={self.size}", f"reindex={self.reindex_str()}"] + ) + + __repr__ = __str__ + + @classmethod + def create(cls, x, new_size, reindex): + return cls(x, list(new_size), reindex) + + def get_size(self): + return self.size + + +@dataclasses.dataclass +class View(GenericView): + @staticmethod + def handle_negative_index(idx, size): + idx = sympy.expand(idx) + size = sympy.expand(size) + evaluate_expr = V.graph.sizevars.shape_env.evaluate_expr + if evaluate_expr(sympy.Lt(idx, 0)): + idx = idx + size + return idx + + @classmethod + def create(cls, x, new_size): + assert isinstance(new_size, (tuple, list)) + old_size, new_size = cls.resolve_negative_size(x.get_size(), new_size) + + # Skip pointless views + if V.graph.sizevars.statically_known_list_equals(old_size, new_size): + return x + + unbacked_symbols_in_sizes = False + if ( + len(free_unbacked_symbols(old_size)) > 0 + or len(free_unbacked_symbols(new_size)) > 0 + ): + unbacked_symbols_in_sizes = True + + if 0 in new_size: + + def fake_reindex(index): + return tuple([0] * len(old_size)) + + return cls(x, list(new_size), fake_reindex) + # TODO: a new class for FixedTransferLayout that output layout is constrained by input layout + elif is_contiguous_storage_and_layout(x) or unbacked_symbols_in_sizes: + if unbacked_symbols_in_sizes and (not is_contiguous_storage_and_layout(x)): + # realize x; otherwise, the dynamic_reshape_indexer below will fail + # due to the size_hint's inability to process unbacked SymInts + x = ExternKernel.realize_input(x) + + storage, old_layout = as_contiguous_storage_and_layout(x) + new_layout = FixedLayout( + old_layout.device, + old_layout.dtype, + new_size, + FlexibleLayout.contiguous_strides(new_size), + old_layout.offset, + ) + return ReinterpretView(storage, new_layout) + + reindex = cls.dynamic_reshape_indexer(old_size, new_size) + return cls(x, list(new_size), reindex) + + @staticmethod + def resolve_negative_size(old_size, new_size): + new_size = [V.graph.sizevars.simplify(x) for x in new_size] + 
old_size = [V.graph.sizevars.simplify(x) for x in old_size] + + new_size = list(new_size) + for i in range(len(new_size)): + if new_size[i] == -1: + new_size[i] = sympy.Integer(1) + new_size[i] = CleanDiv(sympy_product(old_size), sympy_product(new_size)) + break + + V.graph.sizevars.guard_equals(sympy_product(old_size), sympy_product(new_size)) + return old_size, new_size + + @classmethod + def dynamic_reshape_indexer(cls, old_size, new_size): + try: + reindex = cls._dynamic_reshape_indexer(old_size, new_size) + except (AssertionError, IndexError): + # optimistic algorithm failed, lets do a fallback + flat = [sympy_product(old_size)] + reindex1 = cls._dynamic_reshape_indexer(old_size, flat) + reindex2 = cls._dynamic_reshape_indexer(flat, new_size) + reindex = fuse_reindexing(reindex1, reindex2) + return reindex + + @staticmethod + def _dynamic_reshape_indexer(old_size, new_size): + """ + Perform a reshape entirely by modifying indexing math + """ + size_hint = V.graph.sizevars.size_hint + vars = [sympy_index_symbol(f"view{i}") for i in range(len(new_size))] + + stack_new = list(zip(vars, new_size)) + stack_old = list(old_size) + + view_expr = [] + while stack_new and stack_old: + size_old = stack_old.pop() + var, size_new = stack_new.pop() + if size_old == 1: + view_expr.append(sympy.Integer(0)) + stack_new.append((var, size_new)) # re-add + elif size_new == 1: + stack_old.append(size_old) # re-add + elif size_hint(size_new) == size_hint(size_old): + view_expr.append(var) + V.graph.sizevars.guard_equals(size_new, size_old) + elif size_hint(size_new) < size_hint(size_old): + while size_hint(size_new) < size_hint(size_old): + var2, size_new2 = stack_new.pop() + var = var2 * size_new + var + size_new = size_new * size_new2 + view_expr.append(var) + V.graph.sizevars.guard_equals(size_new, size_old) + elif size_hint(size_new) > size_hint(size_old): + divisor = sympy.Integer(1) + modulus = size_old + view_expr.append(ModularIndexing(var, divisor, modulus)) + divisor = divisor * modulus + while size_hint(size_new) > size_hint(size_old): + modulus = stack_old.pop() + view_expr.append(ModularIndexing(var, divisor, modulus)) + divisor = divisor * modulus + size_old = size_old * modulus + V.graph.sizevars.guard_equals(size_new, size_old) + else: + raise AssertionError() + + while stack_old: + size_old = stack_old.pop() + V.graph.sizevars.guard_equals(size_old, 1) # type: ignore[arg-type] + view_expr.append(sympy.Integer(0)) + + while stack_new: + var, size_new = stack_new.pop() + V.graph.sizevars.guard_equals(size_new, 1) # type: ignore[arg-type] + + view_expr.reverse() + assert len(view_expr) == len(old_size) + + def reindex(index): + assert len(index) == len(vars), (len(index), len(vars)) + replacements = dict(zip(vars, index)) + return tuple(sympy_subs(x, replacements) for x in view_expr) # type: ignore[arg-type] + + return reindex + + +@dataclasses.dataclass +class ReinterpretView(BaseView): + """Pretend our storage has a different layout""" + + layout: "Layout" + + def __post_init__(self): + super().__post_init__() + if isinstance(self.data, BaseView): + self.data = self.data.unwrap_view() + + def __str__(self): + return self.str_helper( + [ + self.data, + self.layout, + ] + ) + + __repr__ = __str__ + + def get_name(self): + return self.data.get_name() + + def get_device(self): + return self.layout.device + + def get_origin_node(self): + return None + + @property + def dtype(self): + return self.layout.dtype + + def get_size(self): + return list(self.layout.size) + + def get_stride(self): + 
return list(self.layout.stride) + + def make_loader(self): + def loader(index): + indexer = self.layout.make_indexer() + return ops.load(self.get_name(), indexer(index)) + + return loader + + def make_indexer(self): + return self.layout.make_indexer() + + def get_layout(self): + return self.layout + + def freeze_layout(self): + pass + + def get_unbacked_symbol_uses(self) -> Set[sympy.Symbol]: + return ( + free_unbacked_symbols(self.layout.size) + | free_unbacked_symbols(self.layout.stride) + | free_unbacked_symbols(self.layout.offset) + ) + + def codegen_reference(self, writer=None): + # reinterpret_tensor is similar to as_strided except: + # - offset is added to the existing offset (rather than replacing it) + # - view tracking is disabled similar to unsafe_view + return V.graph.wrapper_code.codegen_reinterpret_view( + self.data, + self.layout.size, + self.layout.stride, + self.layout.offset, + writer, + ) + + +class SliceView(View): + @classmethod + def normalize_start_end(cls, x, dim, start, end): + """ + Normalize start and end such that both are in the range + [0, x.get_size()[dim]] and start <= end. + """ + sizevars = V.graph.sizevars + dim_size = x.get_size()[dim] + + if any(free_unbacked_symbols(x) for x in (start, end, dim_size)): + + def clamp(x, lower, upper): + return sympy.Min(sympy.Max(x, lower), upper) + + else: + + def clamp(x, lower, upper): + return sizevars.evaluate_min(sizevars.evaluate_max(x, lower), upper) + + def clamp_wrap(val, lower, upper, default): + if val is None: + return default + val = cls.handle_negative_index(val, dim_size) + return clamp(val, lower, upper) + + start = clamp_wrap(start, 0, dim_size, 0) + end = clamp_wrap(end, start, dim_size, dim_size) + return start, end + + @classmethod + def create(cls, x, dim, start, end, step=1): + step = sympy.expand(step) + assert step > 0 + try: + if start == 0 and end >= 2**63 - 1 and step == 1: + return x + except TypeError: + pass + + sizevars = V.graph.sizevars + new_size = list(x.get_size()) + + start, end = cls.normalize_start_end(x, dim, start, end) + + new_size[dim] = FloorDiv(end - start + (step - 1), step) + + if is_storage_and_layout(x): + # Fast path + storage, old_layout = as_storage_and_layout(x) + new_stride = list(old_layout.stride) + new_stride[dim] = new_stride[dim] * step + new_layout = FixedLayout( + old_layout.device, + old_layout.dtype, + new_size, + new_stride, + old_layout.offset + old_layout.stride[dim] * start, + ) + return ReinterpretView(storage, new_layout) + + def reindex(index): + assert len(index) == len(new_size), f"wrong ndim {index} {new_size}" + index = list(index) + index[dim] = index[dim] * step + start + return index + + # redirect to a generic view + return SliceView(x, size=new_size, reindex=reindex) + + +class BaseConstant(IRNode): + dtype: torch.dtype + device: torch.device + + def get_size(self): + return () + + def get_device(self): + return self.device + + def get_origin_node(self): + return None + + def mark_reuse(self, users): + pass + + def has_exceeded_max_reads(self): + return False + + def get_reads(self): + return () + + def is_extern(self): + return False + + +@dataclasses.dataclass +class Constant(BaseConstant): + value: Any + dtype: torch.dtype + device: torch.device + + def make_loader(self): + def loader(index): + return ops.constant(self.value, self.dtype) + + return loader + + def realize(self): + pass + + def constant_to_device(self, device): + return Constant(self.value, self.dtype, device) + + +@dataclasses.dataclass +class 
IndexingConstant(BaseConstant): + index: Any + dtype: torch.dtype + device: torch.device + + def make_loader(self): + def loader(index): + return ops.index_expr(self.index, self.dtype) + + return loader + + def constant_to_device(self, device): + return IndexingConstant(self.index, self.dtype, device) + + +def is_contiguous_strides_for_shape(stride, shape): + return all( + size == 1 or left == right + for left, right, size in zip( + stride, FlexibleLayout.contiguous_strides(shape), shape + ) + ) + + +@dataclasses.dataclass +class Layout(IRNode): + def __init__( + self, + device: torch.device, + dtype: torch.dtype, + size: List[Expr], + stride: Optional[Sequence[Union[Expr, int]]], + offset: Expr = Integer(0), + ): + assert stride is None or len(size) == len( + stride + ), f"size={size}, stride={stride}" + self.device = device + self.dtype = dtype + assert all(isinstance(s, (Expr, int)) for s in size) + self.size = size + self._stride = stride + self.offset = offset + + @property + def stride(self): + return self._stride + + def __str__(self): + offset = "" + if self.offset != 0: + offset = f", offset={self.offset}" + return ( + f"{type(self).__name__}('{self.device.type}', {self.dtype}, " + f"size={self.size}, stride={self.stride}{offset})" + ) + + __repr__ = __str__ + + def is_contiguous(self): + return is_contiguous_strides_for_shape(self.stride, self.size) + + def is_channels_last_contiguous(self): + ndim = len(self.size) + if ndim not in [4, 5]: + return False + for left, right, size in zip( + self.stride, make_channels_last_strides_for(self.size), self.size # type: ignore[arg-type] + ): + if size != 1 and left != right: + return False + return True + + def is_transposed(self): + for left, right, size in zip( + self.stride, + reversed(FlexibleLayout.contiguous_strides(self.size)), + self.size, + ): + if size != 1 and left != right: + return False + return True + + def is_stride_ordered(self, order): + assert len(self.stride) == len(order) + + # ignore dimensions of size 1, they dont affect layout + non_1_indices = [ + i + for i, dim in enumerate(self.size) + if V.graph.sizevars.size_hint(dim, fallback=2) != 1 + ] + + stride = [self.stride[i] for i in non_1_indices] + order = [order[i] for i in non_1_indices] + + def sorted_indices(arr): + sorted_arr = sorted(arr) + return [sorted_arr.index(element) for element in arr] + + # since we may have removed dimensions, need to re-sort & re-index order + order = sorted_indices(order) + + # reorder the stride given order + stride_ordered = [-1] * len(order) + for i in range(len(order)): + stride_ordered[order[i]] = V.graph.sizevars.size_hint(stride[i]) + # check if it is in ascending order + for i in range(len(order) - 1): + if stride_ordered[i] > stride_ordered[i + 1]: + return False + return True + + def is_channels_last_stride_ordered(self): + # create channels_last order(NCHW, NCDHW, the C is the first order). 
+ order = [0] + list(reversed(range(1, len(self.stride) - 1))) + order = [len(order)] + order + return self.is_stride_ordered(order) + + def as_fixed(self): + return FixedLayout( + self.device, + self.dtype, + self.size, + self.stride, + self.offset, + ) + + def make_indexer(self): + assert ( + FlexibleLayout.allow_indexing + ), f"convert {type(self).__name__} to FixedLayout first" + return self.as_fixed().make_indexer() + + def __eq__(self, other) -> bool: + return ( + self.device == other.device + and self.dtype == other.dtype + and self.size == other.size + and self.stride == other.stride + and self.offset == other.offset + ) + + def storage_size(self) -> sympy.Expr: + return compute_required_storage_length(self.size, self.stride, self.offset) # type: ignore[arg-type, return-value] + + +class FixedLayout(Layout): + """A Tensor layout we cannot change""" + + def __init__( + self, + device: torch.device, + dtype: torch.dtype, + size: Union[List[Expr], List[int]], + stride: Optional[Sequence[Union[Expr, int]]] = None, + offset: Union[Expr, int] = Integer(0), + ): + if stride is None: + stride = FlexibleLayout.contiguous_strides(size) + super().__init__( + device, + dtype, + size, # type: ignore[arg-type] + stride, + offset, # type: ignore[arg-type] + ) + + def make_indexer(self): + """A closure containing math to read a given element""" + + def indexer(index): + assert len(index) == len(self.stride) == len(self.size) + result = self.offset + for idx, stride, sz in zip(index, self.stride, self.size): + if sz != 1: + result = result + idx * stride + return result + + return indexer + + +class FlexibleLayout(Layout): + """A Tensor layout we are allowed to change""" + + allow_indexing = False + + @staticmethod + def contiguous_strides(sizes): + if len(sizes) == 0: + return [] + reversed_strides = [sympy.Integer(1)] + for size in reversed(sizes[1:]): + reversed_strides.append(size * reversed_strides[-1]) + return list(reversed(reversed_strides)) + + @staticmethod + def fill_ordered(sizes, order): + """ + Create a stride based on the order the dimensions should be filled in. + + In this format, channels last would be: + [1, 3, 2, 0] + """ + assert set(range(len(sizes))) == set(order) + next_stride = sympy.Integer(1) + strides = [None] * len(order) + + for i in order: + strides[i] = next_stride + next_stride = next_stride * sizes[i] + return strides + + @staticmethod + def stride_ordered(sizes, order): + """ + Create a stride based on the sorted order of a permuted range. 
+ + In this format, channels last would be: + [3, 0, 2, 1] + """ + assert set(range(len(sizes))) == set(order) + fill_order = stride_order2fill_order(order) + return FlexibleLayout.fill_ordered(sizes, fill_order) + + @staticmethod + def same_ordered(sizes, stride): + """ + Create a stride that has the same stride order as given stride + + For example, if given stride is [1000, 1, 100, 10], + the fill order should be [1, 3, 2, 0] + """ + assert len(sizes) == len(stride) + stride = [V.graph.sizevars.size_hint(x) for x in stride] + fill_order = sorted(range(len(stride)), key=stride.__getitem__) + return FlexibleLayout.fill_ordered(sizes, fill_order) + + def as_stride_order(self, order): + return FixedLayout( + self.device, + self.dtype, + self.size, + self.stride_ordered(self.size, order), + self.offset, + ) + + def as_fill_order(self, order): + return FixedLayout( + self.device, + self.dtype, + self.size, + self.fill_ordered(self.size, order), + self.offset, + ) + + def as_same_order(self, stride): + return FixedLayout( + self.device, + self.dtype, + self.size, + self.same_ordered(self.size, stride), + self.offset, + ) + + def __init__(self, device, dtype, size, stride_order=None): + if stride_order: + strides = FlexibleLayout.fill_ordered(size, stride_order) + else: + strides = FlexibleLayout.contiguous_strides(size) + super().__init__(device, dtype, size, strides) + + +class AliasedLayout(Layout): + """Shares the same storage as another tensor""" + + def __init__(self, view: Union[BaseView, "TensorBox"]): + layout = view.get_layout() + super().__init__( + layout.device, + layout.dtype, + layout.size, + layout.stride, + ) + self.view = view + + def make_indexer(self): + return self.as_fixed().make_indexer() + + def maybe_guard_aligned(self): + offset = self.view.get_layout().offset + if offset == 0: + return True + from .compile_fx import ALIGNMENT + + return V.graph.sizevars.statically_known_multiple_of(offset, ALIGNMENT) # type: ignore[arg-type] + + +class NoneLayout(IRNode): + # This is janky, I figured out what fields to populate by just running + # the model I was interested in and adding properties/methods as needed. + # This doesn't inherit from Layout because Layout assumes you have stuff + # like sizes, but I don't really have anything here. 
+ # + # If you have an ir.Node with NoneLayout, you probably need to setup + # dependencies manually in scheduler + + def __init__(self, device): + self.device = device + self.size = [0] + self.stride = [0] + + def storage_size(self): + return 0 + + def as_fixed(self): + return self + + +class MutationLayout(Layout): + def __init__(self, target: IRNode): + super().__init__( + target.get_device(), + target.get_dtype(), + target.get_size(), + None, + ) + self.target = target + name = self.get_buffer().get_name() + V.graph.mark_buffer_mutated(name) + + @Layout.stride.getter # type: ignore[attr-defined] + def stride(self): + return self.real_layout().stride + + def storage_size(self) -> sympy.Expr: + return self.real_layout().storage_size() + + def get_buffer(self) -> "Buffer": + def unwrap_views(target): + if isinstance(target, MutationLayout): + return unwrap_views(target.target) + if isinstance(target, BaseView): + return unwrap_views(target.unwrap_view()) + if isinstance(target, MutableBox): + return unwrap_views(target.data) + return target + + result = unwrap_views(self.target) + assert isinstance(result, Buffer), "MutationLayout must refer to a buffer" + return result + + def real_layout(self): + return self.get_buffer().layout + + @classmethod + def realize_into(cls, src, dst, unsafe_alias=False): + dst.realize() + # NOTE: We must realize users of `dst` before we realize `src`, since + # realization order determines scheduling order. Otherwise, src's + # mutation would be scheduled before the existing users of dst! + V.graph.mark_buffer_mutated(dst.get_name()) + + if isinstance(src, TensorBox): + src = src.data + + # We copy the contents of src into dst. In most cases this should + # be fused into a single kernel by the scheduler. + # NOTE: We cannot change src's layout to mutate dst directly as this + # would alias src to dst, which is not correct as further mutations to + # dst would effect users of src. However if there are no more users of + # dst, we can alias src to dst. + src.realize_hint() + + if not unsafe_alias: + src = Pointwise.create( + device=src.get_device(), + dtype=src.get_dtype(), + inner_fn=src.make_loader(), + ranges=[ + V.graph.sizevars.guard_equals(a, b) + for a, b in zip(src.get_size(), dst.get_size()) + ], + ).data + + src.realize() + assert isinstance(src.data.layout, FlexibleLayout) + src.data.layout = MutationLayout(dst) + return src.data + + def as_fixed(self): + return self + + def make_indexer(self): + return self.target.make_indexer() + + +@dataclasses.dataclass +class Buffer(IRNode): + # Name is sometimes None; e.g., ForceInPlace, where there isn't + # a meaningful name + name: Optional[str] + layout: Layout + + # Multi-output buffers will define 'outputs: List[Buffer]'. Confusingly, + # MultiOutput does NOT define this! 
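+ #
+ # Editorial note (illustrative, not part of upstream torch): a Buffer's
+ # make_loader() simply composes its layout's indexer with ops.load, so a
+ # hypothetical FixedLayout with size=[2, 3], stride=[3, 1], offset=0 maps
+ # (i, j) -> 3*i + j and the loader emits ops.load(name, 3*i + j). The
+ # snippet below is an assumed usage sketch for illustration only:
+ #
+ #     layout = FixedLayout(torch.device("cpu"), torch.float32, [2, 3])
+ #     indexer = layout.make_indexer()
+ #     indexer([1, 2])  # -> 5  (offset 0 + 1*3 + 2*1)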
+ + def __post_init__(self): + super().__post_init__() + self.origin_node = None + + def make_indexer(self): + return self.layout.make_indexer() + + def get_name(self) -> str: + assert self.name + return self.name + + def get_device(self): + return self.layout.device + + def get_origin_node(self): + return self.origin_node + + @property + def dtype(self): + return getattr(self.layout, "dtype", None) + + def get_size(self): + return list(self.layout.size) + + def get_stride(self): + return list(self.layout.stride) + + def get_offset(self): + return self.layout.offset + + def get_layout(self): + return self.layout + + def get_storage_numel(self): + return self.get_numel() + + def is_extern(self): + return False + + def freeze_layout(self): + if not isinstance(self.layout, (MultiOutputLayout, AliasedLayout)): + self.layout = self.layout.as_fixed() + + def freeze_layout_with_stride_order(self, order): + assert isinstance(self.layout, FlexibleLayout) + self.layout = self.layout.as_stride_order(order) + + def freeze_layout_with_fill_order(self, order): + assert isinstance(self.layout, FlexibleLayout) + self.layout = self.layout.as_fill_order(order) + + def freeze_layout_with_same_order(self, stride): + assert isinstance(self.layout, FlexibleLayout) + self.layout = self.layout.as_same_order(stride) + + def is_zero_elements(self): + return V.graph.sizevars.is_expr_static_and_true(sympy.Eq(self.get_numel(), 0)) # type: ignore[arg-type] + + def make_loader(self): + # Loading from a zero-element buffer is a no-op + if self.is_zero_elements(): + return partial(nop_loader_fn, dtype=self.get_dtype()) + + def loader(index): + indexer = self.layout.make_indexer() + return ops.load(self.name, indexer(index)) + + return loader + + def is_no_op(self): + return False + + def codegen_reference(self, writer=None): + return self.get_name() + + def decide_layout(self): + pass + + def get_alias_names(self): + if isinstance(self.layout, AliasedLayout): + return [self.layout.view.get_name()] + return () + + def get_mutation_names(self): + if isinstance(self.layout, MutationLayout): + return [self.layout.target.get_name()] + return () + + def get_read_writes(self): + with patch.object(FlexibleLayout, "allow_indexing", True): + return extract_read_writes( + self.make_loader(), + self.get_size(), + ) + + def get_reads(self): + return self.get_read_writes().reads + + def get_unbacked_symbol_defs(self) -> Set[sympy.Symbol]: + """ + Returns the unbacked symbols which are defined by this IR node, + because this is a data-dependent IR node, or item() + """ + # So this is a little unusual. In principle, you could imagine + # defining a MultiOutputLayout buffer so that it DOES define + # unbacked symints. However, we can't easily tell what symints + # such a buffer defines, because MultiOutputLayout doesn't actually + # define any useful information about what it returns. + # + # An easier and better approach is to delay the symint allocation + # to the MultiOutput IR nodes, which are when we actually extract + # out the buffers and know what their sizes are. + # + # There are two subleties here: + # + # 1. Suppose you have a kernel that produces out1: (i0,), out2: (i0,) + # Both of these actually count as defs! The scheduler will just + # arbitrarily pick one of these as the canonical definer and + # ensure it stays live. It's not a big deal if we pick the + # wrong one because tuple accesses are cheap, and all this means + # is we accidentally keep a MultiOutput node live when it wasn't + # strictly necessary. + # + # 2. 
Suppose you have a MultiOutput buffer whose size is (i0,), but + # the MultiOutputLayout buffer it is projecting from isn't actually + # dynamic; it has i0 as one of the arguments. We cannot tell this + # directly from MultiOutput, we have to look at the input buffer's + # uses to work this out. No big deal. + if isinstance(self.layout, (NoneLayout, MultiOutputLayout)): + return set() + + # This kernel defines all unbacked symbols... that it didn't get in as + # arguments! + defs = ( + free_unbacked_symbols(self.get_size()) + | free_unbacked_symbols(self.get_stride()) + | free_unbacked_symbols(self.get_offset()) + ) + return defs - self.get_unbacked_symbol_uses() + + def get_unbacked_symbol_uses(self) -> Set[sympy.Symbol]: + """ + Returns the unbacked symbols which are required to be in scope in + order to successfully perform codegen for this buffer. For example, + a buffer that corresponds to an extern kernel call that takes i0 as + an argument would return {i0} here. This is used to generate necessary + dependencies that ensure we actually bind i0 in codegen before you + try to use it. + + Note that this is NOT transitive; in particular, if this buffer takes + in as input another buffer with dynamic shape (e.g., (i0,)), we will + not report it here, because you will already have a dependency + on that buffer, which will eventually have a dependency on i0 if + necessary. + """ + return set() + + def codegen_unbacked_symbol_defs(self, wrapper): + # NB: If it is possible for other ir node types to return unbacked + # symints, you need to make sure their codegen calls this method. + # Don't forget to update get_unbacked_symbol_defs too. + symbols_to_define = self.get_unbacked_symbol_defs() + for i, s in enumerate(self.get_size()): + if s in symbols_to_define: + wrapper.writeline( + f"{wrapper.codegen_unbacked_symbol_decl(s)} = {self.get_name()}.size({i}){wrapper.ending}" + ) + symbols_to_define.remove(s) + for i, s in enumerate(self.get_stride()): + if s in symbols_to_define: + wrapper.writeline( + f"{wrapper.codegen_unbacked_symbol_decl(s)} = {self.get_name()}.stride({i}){wrapper.ending}" + ) + symbols_to_define.remove(s) + if (s := self.get_offset()) in symbols_to_define: + wrapper.writeline( + f"{wrapper.codegen_unbacked_symbol_decl(s)} = {self.get_name()}.storage_offset(){wrapper.ending}" + ) + symbols_to_define.remove(s) + assert ( + not symbols_to_define + ), f"unbacked symint {s} not written out, check comment above" + + def realize(self): + pass + + def get_workspace_size(self): + """ + Gets extra global memory size needed by this buffer. + Some algorithms (e.g. group gemm) may require extra global memory in the generated code. + """ + return 0 + + def should_allocate(self): + # Returns False by default. 
+ return False + + +class InputBuffer(Buffer): + pass + + +class ConstantBuffer(InputBuffer): + override_device: Optional[torch.device] = None + + def make_loader(self): + def loader(index): + indexer = self.layout.make_indexer() + return ops.load( + V.graph.constant_name(self.get_name(), self.override_device), + indexer(index), + ) + + return loader + + def constant_to_device(self, device): + return ConstantBuffer( + V.graph.constant_name(self.get_name(), device), self.layout + ) + + +class NoneAsConstantBuffer(IRNode): + def get_unbacked_symbol_uses(self) -> Set[sympy.Symbol]: + return set() + + def codegen_reference(self, writer=None): + return V.graph.wrapper_code.none_str + + +class ShapeAsConstantBuffer(IRNode): + def __init__(self, shape): + super().__init__() + self.shape = shape + + def get_unbacked_symbol_uses(self) -> Set[sympy.Symbol]: + return free_unbacked_symbols(self.shape) + + def codegen_reference(self, writer=None): + return V.graph.wrapper_code.expr_printer(V.graph.sizevars.simplify(self.shape)) + + +@dataclasses.dataclass +class ComputedBuffer(Buffer): + data: Loops + + def get_computed_buffer_name(self): + """ + Returns self.name if it exists, otherwise returns the name of the data node if that exists. + If neither exist, returns None. + """ + if self.name is not None: + return self.name + if hasattr(self.data, "name"): + return self.data.name + return None + + @cache_on_self + def num_reads(self): + return len(self.get_read_writes().reads) + + def get_read_writes(self): + with patch.object(FlexibleLayout, "allow_indexing", True): + if self.data.get_reduction_type(): + return extract_read_writes( + self.get_store_function(), + self.data.get_pointwise_size(), + self.data.get_reduction_size(), + ) + else: + return extract_read_writes( + self.get_store_function(), + self.data.get_size(), + ) + + def get_unbacked_symbol_uses(self) -> Set[sympy.Symbol]: + # Ordinarily, we'd like to just peek at the arguments list, + # but ComputedBuffers have no argument list. + # + # Morally, this logic needs to be synchronized with the + # KernelArgs.size calls, which are responsible for making symbols make + # there way as kernel arguments (and it is precisely passing in one of + # those symbols that establishes a dependency). However, we haven't + # started codegen yet so we can't directly reuse that logic. + # + # For now, I'm just yoloing with the size of the buffer. Not sure if + # it is enough. + # + # One thing you might wonder is if this is enough for a ComputedBuffer + # denoting a reduction over i0. Empirically, it is enough, but for an + # unusual reason: we only need accurate dependencies for item() call, + # but it's impossible to end up with a reduction over i0 from an + # item() call without a regular non-reduction buffer first. 
+ return ( + free_unbacked_symbols(self.get_size()) + | free_unbacked_symbols(self.get_stride()) + | free_unbacked_symbols(self.get_offset()) + | self.data.get_unbacked_symbol_uses() + ) + + def make_loader(self): + # Inline constants and index_expressions + if ( + hasattr(self.data, "make_loader") + and self.name not in V.graph.mutated_buffers + and self.num_reads() == 0 + ): + # can be inlined + return self.data.make_loader() + return super().make_loader() + + def get_store_function(self): + indexer = self.layout.as_fixed().make_indexer() + if isinstance(self.data, (Reduction, Scan)): + return partial(self.data.store_reduction, self.name, indexer) + else: + assert isinstance(self.data, Pointwise) + return partial(self.data.store_output, self.name, indexer) + + def get_fill_order(self): + """ + If our layout is still flexible, try to determine the stride order based on stride orders of reads. + + TODO(jansel): A better algorithm here would look at downstream consumers of this + value and try to do global graph-level layout optimization. + This is also something just begging to be autotuned. + """ + if isinstance(self.layout, FlexibleLayout): + (index_vars, reduction_vars), _ = dependencies.index_vars_squeeze( + self.data.get_pointwise_size(), self.data.get_reduction_size() + ) + reads = self.get_read_writes().reads + reads_bufs = [ + V.graph.name_to_buffer[r.name] + if r.name in V.graph.name_to_buffer.keys() + else None + for r in reads + ] + # only consider reads to buffer of same size + # ignore StarDeps because they don't contribute stride information + assert all( + isinstance(r, (dependencies.StarDep, dependencies.MemoryDep)) + for r in reads + ) + reads = [ + sympy_subs( + r.index, {v: sympy.Integer(0) for v in reduction_vars if v != 0} + ) + for r in reads + if isinstance(r, dependencies.MemoryDep) + ] + + if reads: + if isinstance(self.data, Scan): + indices = self.data.reindex(index_vars, reduction_vars) + else: + indices = index_vars + stride_lengths = [ + V.graph.sizevars.stride_hints(expr, indices) for expr in reads # type: ignore[arg-type] + ] + from .scheduler import pick_loop_order + + return pick_loop_order(stride_lengths, self.get_size()) + + return None + + def decide_layout(self): + if isinstance(self.layout, FlexibleLayout): + order = self.get_fill_order() + if order: + self.freeze_layout_with_fill_order(order) + else: + self.freeze_layout() + + def get_default_sizes_body(self): + args, var_ranges = dependencies.index_vars_squeeze( + self.data.get_pointwise_size(), self.data.get_reduction_size(), prefix="q" + ) + with patch.object(ConstantBuffer, "override_device", self.get_device()): + body = LoopBody( + self.get_store_function(), + (args if self.get_reduction_type() else args[:1]), + var_ranges, + ) + index_vars = [] + reduce_vars: List[Any] = [] + index_size = [] + reduce_size = [] + for v, s in var_ranges.items(): + if v in args[0]: + assert not reduce_vars + index_vars.append(v) + index_size.append(s) + else: + assert v in args[1] + reduce_vars.append(v) + reduce_size.append(s) + return (index_size, reduce_size), body, (index_vars, reduce_vars) + + def simplify_and_reorder( + self, + extra_indexing_constraints: Optional[Tuple[Dict[Any, Any], List[Any]]] = None, + ): + """ + This is a main place where we do loop transformations in a + backend-agnostic way. 
+ + Here we: + 1) Remove any 1 dimensions + 2) Fuse contiguous dimensions together + 3) Reorder dimensions based on stride orders + + Optional argument extra_indexing_constraints can be used to append additional + indexing expressions to existing ones derived from buffer's body. This can be useful + to fuse scheduler nodes with compatible ranges, e.g. (s0*s1*...,) and (s0, s1, s2, ...) + on CPU by preventing indexing simplifications and obtaining index/reduce ranges for + the scheduler node compatible with other nodes. + """ + ( + (index_size, reduce_size), + body, + (index_vars, reduce_vars), + ) = self.get_default_sizes_body() + + index_formulas = [*body.indexing_exprs.values()] + if extra_indexing_constraints is not None: + assert ( + isinstance(extra_indexing_constraints, tuple) + and len(extra_indexing_constraints) == 2 + ) + extra_indexing_ranges, extra_indexing_expr = extra_indexing_constraints + assert isinstance(extra_indexing_ranges, dict) + assert isinstance(extra_indexing_expr, list) + assert all(isinstance(f, Expr) for f in extra_indexing_expr) + + expected_var_ranges = body.var_ranges + assert expected_var_ranges == extra_indexing_ranges, ( + expected_var_ranges, + extra_indexing_ranges, + ) + # remove already existing expressions + extra_indexing_expr = [ + e for e in extra_indexing_expr if e not in index_formulas + ] + index_formulas += extra_indexing_expr + + reads_bufs = [ + V.graph.name_to_buffer[reads_name] + if reads_name in V.graph.name_to_buffer.keys() + else None + for reads_name in body.reads_name2expr.keys() + ] + memory_addrs = [ + *body.reads_name2expr.values(), + *body.writes_name2expr.values(), + ] + + # the reordering_reindex in reads' simplify_reorder_and_tile + reordering_reindex = [same_reorder(range(len(index_vars)))] * len(memory_addrs) + for i, reads_buf in enumerate(reads_bufs): + if isinstance(reads_buf, ComputedBuffer) and hasattr( + reads_buf, "iter_reordering_reindex" + ): + reordering_reindex[i] = reads_buf.iter_reordering_reindex # type: ignore[has-type] + + def simplify_and_reorder(x_vars, support_vars, sizes, reordering_reindex=None): + sizes, reindex0, reindex1 = self._apply_loop_reordering( + x_vars, support_vars, sizes, memory_addrs, reordering_reindex + ) + # for NHWC: reindex0([0,1,2,3]) = [0,2,3,1], reindex1([0,1,2,3]) = [0,3,2,1] + x_vars = reindex0(x_vars) + sizes, reindex2, prune = V.graph.sizevars._simplify_loops( + x_vars, + sizes, + index_prevent_reordering(index_formulas, x_vars, sizes), + ) + x_vars = prune(x_vars) + # sizes, reindex1, prune = _simplify_loops(x_vars, sizes, index_formulas) + # x_vars = prune(x_vars) + # sizes, reindex2 = self._apply_loop_reordering(x_vars, sizes, memory_addrs) + reindex = fuse_reindexing(reindex1, reindex2) + return sizes, reindex, reindex1 + + support_vars = index_vars + reduce_vars + iter_ranges, iter_reindex, iter_reordering_reindex = simplify_and_reorder( + index_vars, support_vars, index_size, reordering_reindex + ) + reduce_ranges, reduce_reindex, _ = simplify_and_reorder( + reduce_vars, support_vars, reduce_size + ) + + # remember the reordering if not have loop collapse. 
+ if len(iter_ranges) == len(index_vars): + self.iter_reordering_reindex = iter_reordering_reindex + # retrace the loop body with simplification and reordering applied + (iter_vars, reduce_vars), var_ranges = dependencies.index_vars_no_squeeze( + iter_ranges, reduce_ranges, prefix="z" + ) + body = LoopBody( + body, [iter_reindex(iter_vars), reduce_reindex(reduce_vars)], var_ranges + ) + return (iter_ranges, reduce_ranges), body + + @staticmethod + def _apply_loop_reordering( + index_vars, + support_vars, + sizes, + memory_addrs, + reordering_reindex=None, + priority_idx=None, + ): + """ + Shuffle the order of loops around to hopefully improve performance. + """ + from .scheduler import pick_loop_order + + if priority_idx is None: + priority_idx = [] + + try: + strides = [ + V.graph.sizevars.stride_hints(expr, index_vars, support_vars) + for expr in memory_addrs + ] + assert len(strides) == len(memory_addrs) and len(strides[0]) == len( + index_vars + ) + # consider both layout(strides) and reordering(reordering_reindex) + if reordering_reindex is not None: + for i in range(len(memory_addrs)): + try: + strides[i] = reordering_reindex[i](strides[i]) + # if len(order) != len(strides), do not reorder + except AssertionError: + pass + order = list(reversed(pick_loop_order(strides, sizes, priority_idx))) + except Exception: + if config.debug: + log.warning( + "Did not simplify complex index:\n%s\n%s", + dict(zip(index_vars, sizes)), + memory_addrs, + ) + order = list(range(len(sizes))) + sizes = [sizes[i] for i in order] + return sizes, same_reorder(order), inverse_reorder(order) + + def get_reduction_size(self): + return self.data.get_reduction_size() + + def get_reduction_type(self): + return self.data.get_reduction_type() + + def is_no_op(self): + return self.data.is_zero_elements() + + def should_allocate(self): + return True + + def constant_to_device(self, device): + """Move this to a given device. Requires that all reads are to constants.""" + return self.data.constant_to_device(device) + + +class TemplateBuffer(Buffer): + """ + Represents a Triton (in the future other type) of template operator + that we can fuse an epilogue onto. 
+ """ + + def __init__(self, layout, inputs, make_kernel_render): + super().__init__(name=None, layout=layout) + self.inputs = InputsKernel.unwrap_storage(inputs) + self.make_kernel_render = make_kernel_render + self.name = V.graph.register_buffer(self) + + def get_read_writes(self): + return self.normalized_read_writes() + + def normalized_read_writes(self): + name = self.get_name() + indexer = self.layout.make_indexer() + + def dummy(index, rindex): + assert len(rindex) == 0 + return ops.store(name, indexer(index), "fake") + + deps = dependencies.extract_read_writes( + dummy, self.get_size(), (), normalize=True + ) + deps.reads = {dependencies.StarDep(x.get_name()) for x in self.inputs} + return deps + + def get_reduction_size(self): + return 1 + + def get_reduction_type(self): + return None + + def is_no_op(self): + return False + + def should_allocate(self): + return True + + def simplify_and_reorder( + self, + extra_indexing_constraints: Optional[Tuple[Dict[Any, Any], List[Any]]] = None, + ): + return ( + ( + self.get_size(), + (), + ), + None, + ) + + +class TritonTemplateBuffer(TemplateBuffer): + pass + + +class CUDATemplateBuffer(TemplateBuffer): + def __init__( + self, + layout, + inputs, + make_kernel_render, + workspace_size: int, + template: "CUDATemplate", # type: ignore[name-defined] # noqa: F821 + ): + super().__init__(layout, inputs, make_kernel_render) + # Global memory (in bytes) needed for this template. + self.workspace_size = workspace_size + self.template = template + + def get_workspace_size(self): + return self.workspace_size if self.workspace_size is not None else 0 + + +@dataclasses.dataclass +class InputsKernel(Buffer): + inputs: List[Buffer] + + def get_read_writes_input(self, x): + return dependencies.StarDep(x.get_name()) + + def get_read_writes(self): + star_dep = [] + for input in self.inputs: + if isinstance(input, list): + star_dep.extend([self.get_read_writes_input(x) for x in input]) + else: + star_dep.append(self.get_read_writes_input(input)) + + return dependencies.ReadWrites( + set(star_dep), + {dependencies.StarDep(self.get_name())}, + set(), + [], + None, + op_counts=collections.Counter(), + ) + + @classmethod + def unwrap_storage_for_input(cls, x): + if isinstance(x, TensorBox): + x = x.data + if isinstance(x, StorageBox): + x = x.data + if isinstance(x, BaseView) and not isinstance(x, ReinterpretView): + x = ExternKernel.realize_input(x) + if isinstance(x, TensorBox): + # when converting to ReinterpretView fails in the + # realize_input call above, the result will be wrapped + # into TensorBox / StorageBox pair as a result of the + # cls.copy_input call; so we should unwrap recursively + return cls.unwrap_storage_for_input(x) + assert isinstance(x, (Buffer, ReinterpretView)), x + return x + + @staticmethod + def unwrap_storage(inputs): + inputs_new = [] + for x in inputs: + if isinstance(x, list): + x = [InputsKernel.unwrap_storage_for_input(i) for i in x] + else: + x = InputsKernel.unwrap_storage_for_input(x) + inputs_new.append(x) + return inputs_new + + def is_extern(self): + return True + + +class NopKernel(InputsKernel): + def is_no_op(self): + return True + + +class ConcatKernel(NopKernel): + """ + There isn't actually a real kernel for concat, we just change the + storage for the upstream data. 
+ """ + + @classmethod + def create(cls, inputs, dim): + device = inputs[0].get_device() + dtype = inputs[0].get_dtype() + new_size = list(inputs[0].get_size()) + offsets_start = [0] + offsets_end = [new_size[dim]] + assert 0 <= dim < len(new_size) + for i in range(1, len(inputs)): + input_size = inputs[i].get_size() + offsets_start.append(new_size[dim]) + assert len(input_size) == len(new_size) + assert inputs[i].get_dtype() == dtype + assert inputs[i].get_device() == device + for j in range(len(new_size)): + if j == dim: + new_size[j] = new_size[j] + input_size[j] + else: + new_size[j] = V.graph.sizevars.guard_equals( + new_size[j], input_size[j] + ) + offsets_end.append(new_size[dim]) + + output_stride = FlexibleLayout.contiguous_strides(new_size) + # If any of the inputs is in CL format, use CL format for the output + for i in range(len(inputs)): + x = inputs[i] + if is_storage_and_layout(x): + layout = x.get_layout() + if ( + isinstance(layout, FixedLayout) + and layout.is_channels_last_contiguous() + ): + # use CL stride for the output + output_stride = make_channels_last_strides_for(new_size) + break + + concat_kernel = ConcatKernel( + name=None, + layout=FixedLayout( + device=device, + dtype=dtype, + size=new_size, + stride=output_stride, + ), + inputs=[], + ) + kernel = StorageBox(concat_kernel) + buffer_names = [] + for i in range(len(inputs)): + input_buffer = cls.realize_into( + inputs[i], + SliceView.create(kernel, dim, offsets_start[i], offsets_end[i]), + ) + concat_kernel.inputs.append(input_buffer) + + if isinstance(inputs[i].data, BaseView): + input_unwrapped = inputs[i].data.unwrap_view() + else: + input_unwrapped = inputs[i].data + + if ( + input_unwrapped.is_input_buffer() + and inputs[i].get_device().type == "cuda" + and not is_dynamic(input_buffer) + ): + buffer_names.append(input_buffer.get_name()) + + if len(buffer_names) > 1: + V.graph.register_list(buffer_names) + + concat_kernel.name = V.graph.register_buffer(concat_kernel) + concat_kernel.inputs = cls.unwrap_storage(concat_kernel.inputs) + + return kernel + + @classmethod + def can_realize_into_without_copy(cls, src): + if isinstance(src, TensorBox): + # unwrap a TensorBox + return cls.can_realize_into_without_copy(src.data) + + return isinstance(src.data.layout, FlexibleLayout) and not isinstance( + src.data, ExternKernelAlloc + ) + + @classmethod + def realize_into(cls, src, dst): + # Attempt to turn this into a ReinterpretView rather than assert. + # This has concessions around layout, as as_storage_and_layout + # can cause us to go from flexible to fixed layout. 
+ if not isinstance(dst, ReinterpretView): + if is_storage_and_layout(dst): + storage, layout = as_storage_and_layout(dst) + dst = ReinterpretView(storage, layout) + assert isinstance(dst, ReinterpretView), dst + if isinstance(src, TensorBox): + # unwrap a TensorBox + return cls.realize_into(src.data, dst) + if isinstance(src, StorageBox): + src.realize() + # ExternKernelAlloc has specific requirements for output layout, should create a copy + assert hasattr(src.data, "layout") + if cls.can_realize_into_without_copy(src): + src.data.layout = AliasedLayout(dst) + return src.data + # introduce a copy + pw = Pointwise.create( + device=src.get_device(), + dtype=src.get_dtype(), + inner_fn=src.make_loader(), + ranges=[ + V.graph.sizevars.guard_equals(a, b) + for a, b in zip(src.get_size(), dst.get_size()) + ], + ) + return cls.realize_into(pw, dst) + + def should_allocate(self): + return True + + +@dataclasses.dataclass +class ExternKernel(InputsKernel): + constant_args: Tuple[Any, ...] = () + kwargs: Dict[str, Any] = dataclasses.field(default_factory=dict) + output_view: Optional[ReinterpretView] = None + python_kernel_name: Optional[str] = None + cpp_kernel_name: Optional[str] = None + # FIXME: in some cases we sill need to explicitly pass in ordered_kwargs_for_cpp_kernel + # We shouldn't need to do this since the information can be retrieved from op_overload._schema. + ordered_kwargs_for_cpp_kernel: Iterable[str] = dataclasses.field( + default_factory=list + ) + op_overload: Optional[ + Union[torch._ops.OpOverload, torch._ops.HigherOrderOperator] + ] = None + arg_properties: Optional[List[Dict[str, Any]]] = None + kwarg_properties: Optional[Dict[str, Dict[str, Any]]] = None + + def __init__( + self, + name, + layout, + inputs, + constant_args=(), + kwargs=None, + output_view=None, + python_kernel_name=None, + cpp_kernel_name=None, + ordered_kwargs_for_cpp_kernel=(), + op_overload=None, + ): + super().__init__( + name, + layout, + inputs, + ) + self.constant_args = constant_args + self.kwargs = kwargs if kwargs else {} + self.output_view = output_view + self.python_kernel_name = python_kernel_name + self.cpp_kernel_name = cpp_kernel_name + self.ordered_kwargs_for_cpp_kernel = ordered_kwargs_for_cpp_kernel + self.op_overload = op_overload + self.collect_arg_kwarg_properties() + + def collect_arg_kwarg_properties(self): + # if self.op_overload is torch._ops.OpOverload, we can use its schema to collect additional + # information for args and kwargs, e.g. 
type and default value, to help with the cpp wrapper codegen + if ( + isinstance(self.op_overload, torch._ops.OpOverload) + and not self.ordered_kwargs_for_cpp_kernel + ): + self.ordered_kwargs_for_cpp_kernel = [ + x.name for x in self.op_overload._schema.arguments if x.kwarg_only + ] + self.arg_properties = ( + [ + { + "name": x.name, + "type": x.real_type, + "default_value": x.default_value, + } + for x in self.op_overload._schema.arguments + if not x.kwarg_only + ] + if isinstance(self.op_overload, torch._ops.OpOverload) + else [{} for i in range(len(self.inputs))] + ) + self.kwarg_properties = ( + { + x.name: {"type": x.real_type, "default_value": x.default_value} + for x in self.op_overload._schema.arguments + if x.kwarg_only + } + if isinstance(self.op_overload, torch._ops.OpOverload) + else {} + ) + + def decide_layout(self): + if isinstance(self.layout, FlexibleLayout): + self.apply_constraint() + self.freeze_layout() + + def codegen_comment(self, wrapper): + origin_str, detailed_origin_str = get_kernel_metadata(self, wrapper) + if origin_str: + wrapper.writeline(origin_str) + + def codegen(self, wrapper): + raise NotImplementedError() + + def get_kernel_name(self): + return self.cpp_kernel_name if V.graph.cpp_wrapper else self.python_kernel_name + + @staticmethod + def copy_input(x): + pw = Pointwise.create( + device=x.get_device(), + dtype=x.get_dtype(), + inner_fn=x.make_loader(), + ranges=x.get_size(), + origin_node=x.get_origin_node(), + traceback=x.get_traceback(), + ) + pw.realize() + return pw + + @classmethod + def process_kernel(cls, kernel, *args, **kwargs): + binded_args = {"args": args, "kwargs": kwargs} + + args_flat, args_spec = pytree.tree_flatten(binded_args) + + is_arg_tensor = [] + tensor_args = [] + non_tensor_args: List[Any] = [] + for arg in args_flat: + is_arg_tensor.append(isinstance(arg, IRNode)) + if is_arg_tensor[-1]: + tensor_args.append(arg) + else: + if isinstance(arg, sympy.Expr): + arg = V.graph.sizevars.shape_env.create_symintnode(arg, hint=None) + non_tensor_args.append(arg) + + def unflatten_args(new_tensor_args, new_non_tensor_args): + result = [] + it_tensors = iter(new_tensor_args) + it_non_tensors = iter(new_non_tensor_args) + for is_tensor in is_arg_tensor: + if is_tensor: + result.append(next(it_tensors)) + else: + result.append(next(it_non_tensors)) + r = pytree.tree_unflatten(result, args_spec) + return r.get("args", []), r.get("kwargs", {}) + + tensor_args = [cls.realize_input(x) for x in tensor_args] + + # freeze layout otherwise our output stride calculation might + # become incorrect + for x in tensor_args: + if is_storage_and_layout(x): + as_storage_and_layout(x, freeze=True) + + # We don't have generic shape formulas, so just burn in the + # shapes and run an example input. 
+ # TODO(jansel): replace this with dynamic shape formulas + example_args = [] + + # We need to retain the constant values of fake tensors that we originally + # propagated the graph with, because for some operators running without a + # constant would trigger an error / DataDependentException + for x in tensor_args: + if x.get_name() in V.graph.constants: + example_args.append(V.graph.constants[x.get_name()]) + else: + example_args.append(ir_node_to_tensor(x, guard_shape=True)) + + new_args, new_kwargs = unflatten_args(example_args, non_tensor_args) + example_output = kernel(*new_args, **new_kwargs) + + example_out_li = ( + [example_output] + if not isinstance(example_output, (list, tuple)) + else example_output + ) + for t in example_out_li: + if isinstance(t, torch.Tensor) and t.is_sparse: + msg = "sparsity not handled. Please file issue for sparse inference weights." + if stack_trace := V.graph.current_node.meta.get("stack_trace", None): + msg = f"{msg} Found from : \n {stack_trace}" + V.graph.disable_cudagraphs_reason = msg + + # TODO: Unconditionally do this, not just when example_output has + # unbacked symbols + if maybe_free_unbacked_symbols(example_output): + example_output = V.graph.current_node.meta["val"] + + return example_output, tensor_args, non_tensor_args, unflatten_args + + @classmethod + def convert_to_reinterpret_view(cls, x): + """ + In order to pass this to an extern kernel we need a + ReinterpretView not a View. This allows us to avoid some + unneeded copies. + """ + assert isinstance(x, BaseView) + if isinstance(x, ReinterpretView): + return x + + # NOTE: Don't use extract_read_writes here as it fails when + # make_loader() inlines the computation + x.unwrap_view().freeze_layout() + index_args, var_ranges = dependencies.index_vars_squeeze( + x.get_size(), prefix="r" + ) + range_vars = index_args[0] + index = x.make_indexer()(range_vars) + + index = V.graph.sizevars.simplify_with_ranges(index, var_ranges) + strides = V.graph.sizevars.stride_vars(index, range_vars) + offset = V.graph.sizevars.offset_var(index, range_vars) + expected = sympy_dot(range_vars, strides) + offset + + if index != expected: + log.debug( + "convert_to_reinterpret_view failed: stride=%s offset=%s index=%s", + strides, + offset, + index, + ) + raise NotImplementedError() + + return ReinterpretView( + data=x.data, + layout=FixedLayout( + device=x.get_device(), + dtype=x.get_dtype(), + size=x.get_size(), + stride=strides, + offset=offset, + ), + ) + + @classmethod + def realize_input(cls, x): + if x is None: + return NoneAsConstantBuffer() + if isinstance(x, (sympy.Expr, sympy.logic.boolalg.Boolean, int)): + return ShapeAsConstantBuffer(x) + if isinstance(x, Constant): + return V.graph.add_tensor_constant( + torch.tensor(x.value, dtype=x.get_dtype(), device=x.get_device()) + ) + if isinstance(x, ConstantBuffer): + return x + if isinstance(x, TensorBox): + return cls.realize_input(x.data) + if isinstance(x, ReinterpretView): + return ReinterpretView(cls.realize_input(x.data), x.get_layout()) + if isinstance(x, BaseView): + x.realize() + if is_storage_and_layout(x.unwrap_view()): + try: + return cls.convert_to_reinterpret_view(x) + except NotImplementedError: + pass + if isinstance(x, StorageBox): + # TODO(jansel): impose layout preference on realized buffer + x.realize() + return x + return cls.copy_input(x) + + @classmethod + def require_stride1(cls, x): + if is_storage_and_layout(x): + if len(x.get_stride()) == 0: + return x + for stride in x.get_stride(): + if stride == 1: + return x + return 
cls.copy_input(x) + + @classmethod + def require_stride_order(cls, x, order): + if x.get_numel() == 0: # Layout doesn't matter + return x + + # require x to have the layout as strided_ordered as order + if is_storage_and_layout(x): + while isinstance(x.get_layout(), AliasedLayout): + x = x.get_layout().view + if isinstance(x.get_layout(), FlexibleLayout): + # fix flexiblelayout to be FixedLayout with stride_order + as_storage_and_layout( + x, freeze=True, want_contiguous=False, stride_order=order + ) + return x + elif isinstance( + x.get_layout(), FixedLayout + ) and x.get_layout().is_stride_ordered(order): + return x + elif isinstance(x.get_layout(), MutationLayout): + if isinstance(x.get_layout().real_layout(), FlexibleLayout): + raise AssertionError( + "the MutationLayout's real layout shouldn't be FlexibleLayout" + ) + elif isinstance( + x.get_layout().real_layout(), FixedLayout + ) and x.get_layout().real_layout().is_stride_ordered(order): + return x + + # TODO - Storage to InputBuffer + if isinstance(x, InputBuffer) and x.get_layout().is_stride_ordered(order): + return x + if ( + isinstance(x, TensorBox) + and isinstance(x.data, BaseView) + and not isinstance(x.data, ReinterpretView) + and is_storage_and_layout(x.unwrap_view()) + and not isinstance(x.unwrap_view().data, ExternKernelAlloc) + ): + try: + x.data = cls.convert_to_reinterpret_view(x.data) + return cls.require_stride_order(x, order) + except NotImplementedError: + pass + x = cls.copy_input(x) + as_storage_and_layout(x, freeze=True, want_contiguous=False, stride_order=order) + assert is_stride_order_storage_and_layout(x, order) + return x + + @classmethod + def require_channels_last(cls, x): + return cls.require_stride_order(x, NHWC_STRIDE_ORDER) + + @classmethod + def require_contiguous(cls, x): + return cls.require_stride_order(x, list(reversed(range(len(x.get_size()))))) + + def apply_constraint(self): + pass + + def codegen_const_args(self): + return map(V.graph.wrapper_code.val_to_arg_str, self.constant_args) + + def codegen_args(self): + args = [] + for i, x in enumerate(self.inputs): + if isinstance(x, list): + names = [i.codegen_reference() for i in x] + codegen_reference = f'[{", ".join(names)}]' + args.append(codegen_reference) + else: + if V.graph.cpp_wrapper: + assert self.arg_properties and i < len( + self.arg_properties + ), "Invalid arg_properties accessing" + type_ = self.arg_properties[i].get("type") + args.append( + V.graph.wrapper_code.val_to_cpp_arg_str( # type: ignore[arg-type] + type_, x, self.is_legacy_abi_kernel() + ) + ) + else: + args.append(x.codegen_reference()) + args.extend(self.codegen_const_args()) + return args + + def get_kwargs_value(self, arg_name): + if arg_name in self.kwargs: + return self.kwargs.get(arg_name) + if self.kwarg_properties and self.kwarg_properties.get(arg_name): + return self.kwarg_properties.get(arg_name).get("default_value") # type: ignore[union-attr] + else: + raise AssertionError(f"{arg_name} not in self.kwarg_properties") + + def is_legacy_abi_kernel(self): + return False + + def codegen_kwargs(self): + if V.graph.cpp_wrapper: + kwargs = [] + for arg_name in self.ordered_kwargs_for_cpp_kernel: + v = self.get_kwargs_value(arg_name) + if isinstance(v, sympy.Expr): + kwargs.append(v) + else: + type_ = ( + self.kwarg_properties.get(arg_name).get("type") # type: ignore[union-attr] + if self.kwarg_properties and arg_name in self.kwarg_properties + else None + ) + kwargs.append( + V.graph.wrapper_code.val_to_cpp_arg_str( # type: ignore[arg-type] + type_, v, 
self.is_legacy_abi_kernel() + ) + ) + else: + kwargs = [ + f"{k}={V.graph.wrapper_code.val_to_arg_str(v)}" # type: ignore[misc] + for k, v in self.kwargs.items() + ] + return kwargs + + def codegen_size_asserts(self, wrapper): + if config.size_asserts and not V.graph.cpp_wrapper: + size = V.graph.wrapper_code.codegen_shape_tuple(self.get_size()) + stride = V.graph.wrapper_code.codegen_shape_tuple(self.get_stride()) + wrapper.writeline( + f"assert_size_stride({self.get_name()}, {size}, {stride})" + ) + + def get_group_stride(self): + """ + get output sizes and strides, for template_codegen + """ + _size = self.get_size() + _stride = self.get_stride() + # iter_ranges = _size of output tensor, reduce_range = [] because no reduction + return [_size, []], _stride + + def canonicalize(self): + """ + Manually get canonicalization of the output index + """ + # manually generate index formula for conv + sizevars = V.graph.sizevars + sizes = self.get_size() + strides = self.get_stride() + strides = [sizevars.size_hint(x) for x in strides] + index_vars = [sympy_index_symbol(f"d{i}") for i in range(len(sizes))] + # reorder index vars according to stride + index_order = sorted(range(len(strides)), key=strides.__getitem__, reverse=True) + lookup = {pos: idx for idx, pos in enumerate(index_order)} + order = [lookup[i] for i in range(len(lookup))] + index_vars = [index_vars[i] for i in order] + indexer = self.make_indexer() + index = indexer(index_vars) + + new_sizes, reindex, prune = V.graph.sizevars._simplify_loops( + index_vars, sizes, [index] + ) + + # assign new variables each dimension to deal with numbering mismatches + # d0, d1, d2 could become d0, d2 -- which won't match d0, d1 + _, add_var = var_builder("c") + replacement = dict(zip(index_vars, reindex([add_var(x) for x in new_sizes]))) + + index = sympy_subs(sympy.expand(index), replacement) # type: ignore[arg-type] + return index, tuple(new_sizes) + + def get_unbacked_symbol_uses(self) -> Set[sympy.Symbol]: + # NB: It's not necessary to check regular inputs as we automatically + # have dependencies on them + r = set() + for arg in self.constant_args: + r |= maybe_free_unbacked_symbols(arg) + for arg in self.kwargs.values(): + r |= maybe_free_unbacked_symbols(arg) + return r + + def __str__(self): + kernel_name = getattr(self, "python_kernel_name", None) + lines = [ + f"python_kernel_name={kernel_name!r}", + ] + lines += [ + f"{field.name}={getattr(self, field.name)}" + for field in dataclasses.fields(self) + ] + lines.append(f"origin_node={self.origin_node!r}") + return self.str_helper(lines) + + __repr__ = __str__ + + +@dataclasses.dataclass +class ExternKernelOut(ExternKernel): + def codegen(self, wrapper): + self.codegen_comment(wrapper) + args = [*self.codegen_args(), *self.codegen_kwargs()] + wrapper.generate_extern_kernel_out( + self.output_view, + self.codegen_reference(), + args, + self.get_kernel_name(), + ) + + def __init__( + self, + layout, + inputs, + constant_args=(), + kwargs=None, + output_view=None, + python_kernel_name=None, + cpp_kernel_name=None, + ordered_kwargs_for_cpp_kernel=(), + op_overload=None, + ): + super().__init__( + None, + layout, + self.unwrap_storage(inputs), + constant_args, + kwargs or {}, + None, + python_kernel_name, + cpp_kernel_name, + ordered_kwargs_for_cpp_kernel, + op_overload, + ) + self.name = V.graph.register_buffer(self) + + def should_allocate(self): + return True + + +class RandomSeeds(ExternKernelOut): + def __init__(self, count: int, device: torch.device): + limits = torch.iinfo(torch.int64) 
+ super().__init__( + layout=FixedLayout( + device=device, + dtype=torch.int64, + size=[count], + ), + inputs=[], + constant_args=[limits.min, limits.max, [count]], + python_kernel_name="aten.randint.low_out", + cpp_kernel_name="at::randint_out", + ) + + +class ExternKernelAlloc(ExternKernel): + def codegen(self, wrapper): + self.codegen_comment(wrapper) + args = [*self.codegen_args(), *self.codegen_kwargs()] + V.graph.wrapper_code.generate_extern_kernel_alloc(self, args) + if isinstance(self.layout, Layout): + self.codegen_size_asserts(wrapper) + + def __init__( + self, + layout, + inputs, + constant_args=(), + kwargs=None, + python_kernel_name=None, + cpp_kernel_name=None, + ordered_kwargs_for_cpp_kernel=(), + op_overload=None, + ): + super().__init__( + None, + layout, + self.unwrap_storage(inputs), + constant_args, + kwargs or {}, + None, + python_kernel_name, + cpp_kernel_name, + ordered_kwargs_for_cpp_kernel, + op_overload, + ) + self.name = V.graph.register_buffer(self) + + def should_allocate(self): + return False + + def apply_constraint(self): + raise NotImplementedError + + +class UserDefinedTritonKernel(ExternKernel): + def get_kernel_and_configs(self): + from triton.runtime.autotuner import Autotuner + + from torch._higher_order_ops.triton_kernel_wrap import kernel_side_table + + kernel = kernel_side_table.get_kernel(self.kernel_idx) + configs = [] + if isinstance(kernel, Autotuner): + configs = kernel.configs + kernel = kernel.fn + return kernel, configs + + def codegen(self, wrapper): + kernel, configs = self.get_kernel_and_configs() + + # Definition of kernel + new_name, triton_meta = wrapper.define_user_defined_triton_kernel( + kernel, configs, self.kwargs + ) + + args = self.codegen_kwargs() + if V.graph.cpp_wrapper: + # in C++ wrapper, we don't pass constexpr args, as they don't + # get added as parameters to the PTX code compiled from the + # user-defined Triton kernel (only non-constexpr args do) + args = [arg for i, arg in enumerate(args) if i not in kernel.constexprs] + + # Call to kernel + self.codegen_comment(wrapper) + wrapper.generate_user_defined_triton_kernel( + new_name, + self.grid, + configs, + args, + triton_meta, + ) + + def should_allocate(self): + return False + + def has_side_effects(self): + # UserDefinedTritonKernel does not return anything, but rather + # modifies input in place, do not let it get DCEd + return True + + def get_unbacked_symbol_defs(self) -> Set[sympy.Symbol]: + return set() + + def get_mutation_names(self): + return [] + + def __init__(self, *, kernel_idx, grid, kernel_args): + inputs = [] + kwargs = dict() + constant_args = [] + for k, v in kernel_args.items(): + if isinstance(v, TensorBox): + t = InputsKernel.unwrap_storage_for_input(self.realize_input(v)) + inputs.append(t) + kwargs[k] = t + else: + constant_args.append(v) + kwargs[k] = v + + assert len(inputs) != 0 + device = inputs[0].get_device() + + super().__init__( + None, + NoneLayout(device), # type: ignore[arg-type] + inputs, + tuple(constant_args), + kwargs, + ) + self.name = V.graph.register_buffer(self) + self.kernel_idx = kernel_idx + self.grid = grid + + kernel, _ = self.get_kernel_and_configs() + # If we are autotuning, not all arguments will be passed + self.ordered_kwargs_for_cpp_kernel = [ + arg for arg in kernel.arg_names if arg in kernel_args + ] + + mark_node_as_mutating( + self, *[a for a in kernel_args.values() if isinstance(a, TensorBox)] + ) + + def get_alias_names(self): + return [i.get_name() for i in self.inputs] + + +def 
mark_node_as_mutating(cur_buffer, *mutated_ops): + """ + Allows ops in mutated_ops to be marked as being mutated as well as + indicates to the scheduler that these ops depend on cur_buffer. + """ + for op in mutated_ops: + assert isinstance(op, IRNode), op + V.graph.mark_buffer_mutated(op.get_name()) + assert hasattr(op, "layout") + MutationOutput(op.layout, op, cur_buffer) + + +class MutationOutput(ExternKernel): + def get_mutation_names(self): + return [self.inputs[0].get_name()] + + def __init__(self, layout, input, parent): + super().__init__(None, layout, [input, parent], ()) + self.name = V.graph.register_buffer(self) + + def should_allocate(self): + return False + + def is_no_op(self): + return True + + def has_side_effects(self): + return True + + def get_alias_names(self): + return [self.inputs[0].get_name()] + + +class InplaceBernoulliFallback(ExternKernel): + """ + This needs to be a custom class to handle mutation properly + """ + + def codegen(self, wrapper): + (x,) = (t.codegen_reference() for t in self.inputs) + wrapper.writeline( + f"{self.get_kernel_name()}({x}, {', '.join(map(repr, self.constant_args))}){wrapper.ending}" + ) + + def should_allocate(self): + return False + + def get_mutation_names(self): + return [self.inputs[0].get_name()] + + def get_unbacked_symbol_defs(self) -> Set[sympy.Symbol]: + return set() + + def __init__(self, x, *constant_args): + super().__init__( + None, + NoneLayout(x.get_device()), # type: ignore[arg-type] + self.unwrap_storage([x]), + constant_args, + ) + self.name = V.graph.register_buffer(self) + self.python_kernel_name = "aten.bernoulli_" + self.cpp_kernel_name = ( + "aoti_torch_bernoulli_" + if config.abi_compatible + else "at::native::bernoulli_" + ) + mark_node_as_mutating(self, x) + + +# Used to deal with torch.complex types +class InplaceCopyFallback(ExternKernel): + """ + This needs to be a custom class to handle mutation properly + """ + + def codegen(self, wrapper): + (dst, src, non_blocking) = self.codegen_args() + wrapper.writeline( + f"{self.get_kernel_name()}({dst}, {src}, {non_blocking}){wrapper.ending}" + ) + + def should_allocate(self): + return False + + def get_mutation_names(self): + return [self.inputs[0].get_name()] + + def get_unbacked_symbol_defs(self) -> Set[sympy.Symbol]: + return set() + + def __init__( + self, + layout, + inputs, + constant_args, + ): + super().__init__( + None, + layout, + inputs, + constant_args, + python_kernel_name="aten.copy_", + cpp_kernel_name=( + "aoti_torch_copy_" if config.abi_compatible else "at::_ops::copy_::call" + ), + ) + self.name = V.graph.register_buffer(self) + + @classmethod + def create(cls, dst, src, non_blocking: bool = False): + inputs = [cls.realize_input(t) for t in [dst, src]] + constant_args = (non_blocking,) + result = InplaceCopyFallback( + NoneLayout(dst.get_device()), # type: ignore[arg-type] + inputs, + constant_args, + ) + mark_node_as_mutating(result, dst) + return result + + +class MutatingFirstArgExternKernel(ExternKernel): + """ + This needs to be a custom class to handle mutation properly + """ + + def codegen(self, wrapper): + argrefs = [ + *(t.codegen_reference() for t in self.inputs), + *map(repr, self.constant_args), + ] + wrapper.writeline( + f"{self.get_kernel_name()}({', '.join(argrefs)}){wrapper.ending}" + ) + + def should_allocate(self): + return False + + def get_mutation_names(self): + return [self.inputs[0].get_name()] + + def get_unbacked_symbol_defs(self) -> Set[sympy.Symbol]: + return set() + + def has_side_effects(self): + return True + + 
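+# Editorial note (illustrative sketch, not part of upstream torch): the
+# mutation-style fallbacks below all follow the same recipe -- NoneLayout so
+# no output buffer is allocated, register_buffer() for a name, and
+# mark_node_as_mutating() so the scheduler orders the op after the existing
+# users of the mutated input. A minimal hypothetical subclass of
+# MutatingFirstArgExternKernel (the class and kernel name here are only an
+# example, not something defined elsewhere in this file) would look like:
+#
+#     class ZeroFillFallback(MutatingFirstArgExternKernel):
+#         def __init__(self, x):
+#             super().__init__(
+#                 None,
+#                 NoneLayout(x.get_device()),
+#                 self.unwrap_storage([x]),
+#             )
+#             self.name = V.graph.register_buffer(self)
+#             self.python_kernel_name = "aten.zero_"
+#             mark_node_as_mutating(self, x)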
+class ResizeStorageBytes(MutatingFirstArgExternKernel): + def __init__(self, variable, new_size): + assert isinstance(new_size, int), "TODO: dynamic shapes" + super().__init__( + None, + NoneLayout(variable.get_device()), # type: ignore[arg-type] + self.unwrap_storage([variable]), + constant_args=(new_size,), + ) + V.graph.mark_buffer_mutated(variable.get_name()) + self.name = V.graph.register_buffer(self) + self.python_kernel_name = "inductor_ops.resize_storage_bytes_" + self.cpp_kernel_name = "torch::inductor::resize_storage_bytes_" + V.graph.never_reuse_buffers.add(variable.data.get_name()) + mark_node_as_mutating(self, variable) + + +class ScatterFallback(ExternKernel): + """ + This needs to be a custom class to handle mutation properly. + This class handles both aten.scatter_ and aten.scatter_reduce_. + It also handle the case `src` being a scalar properly. + """ + + def codegen(self, wrapper): + reduce = self.kwargs["reduce"] + if V.graph.cpp_wrapper: + # Follow aten/src/ATen/native/ReductionType.h:get_operator_enum + get_operator_enum = {"add": "sum", "multiply": "prod"} + if reduce in get_operator_enum: + reduce = get_operator_enum[reduce] + + if self.src_is_tensor: + (x, index, src) = (t.codegen_reference() for t in self.inputs) + else: + (x, index) = (t.codegen_reference() for t in self.inputs) + src = self.constant_args[1] + wrapper.generate_scatter_fallback( + x, + [x, self.constant_args[0], index, src], + self.get_kernel_name(), + self.python_kernel_name, + self.src_is_tensor, + reduce, + self.codegen_kwargs(), + ) + + def should_allocate(self): + return False + + def get_cpp_kernel(self): + reduce = self.kwargs["reduce"] + if self.python_kernel_name == "aten.scatter_": + if self.src_is_tensor: + kernel = ( + "at::scatter_out" if reduce is None else "at::scatter_reduce_out" + ) + else: + assert ( + reduce is None + ), "Expect reduce to be None for aten.scatter_ with scalar src" + kernel = "at::scatter_out" + else: + assert ( + reduce is not None + ), "Expect reduce to be not None for aten.scatter_reduce_" + kernel = "at::scatter_reduce_out" + return kernel + + def get_mutation_names(self): + return [self.inputs[0].get_name()] + + def get_unbacked_symbol_defs(self) -> Set[sympy.Symbol]: + return set() + + def __init__( + self, + op_overload, + python_kernel_name, + x, + dim: int, + index, + src, + *, + reduce: Optional[str] = None, + include_self: bool = True, + ): + assert python_kernel_name in {"aten.scatter_", "aten.scatter_reduce_"} + self.src_is_tensor = isinstance(src, TensorBox) + + constant_args: Tuple[Any, ...] 
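+        # `src` may be either a TensorBox or a plain Python scalar: in the
+        # tensor case it is realized and passed as a real input, while in the
+        # scalar case it is folded into constant_args alongside `dim`.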
+ if self.src_is_tensor: + tensors = [self.realize_input(t) for t in [x, index, src]] + constant_args = (dim,) + else: + tensors = [self.realize_input(t) for t in [x, index]] + constant_args = (dim, src) + + super().__init__( + None, + NoneLayout(x.get_device()), # type: ignore[arg-type] + self.unwrap_storage(tensors), + constant_args, + {"reduce": reduce, "include_self": include_self}, + python_kernel_name=python_kernel_name, + ordered_kwargs_for_cpp_kernel=["reduce", "include_self"], + op_overload=op_overload, + ) + self.cpp_kernel_name = self.get_cpp_kernel() + self.name = V.graph.register_buffer(self) + mark_node_as_mutating(self, x) + + +class IndexPutFallback(ExternKernel): + """ + This needs to be a custom class to handle mutation and indices properly + """ + + def codegen(self, wrapper): + (x, values, *valid_indices) = (t.codegen_reference() for t in self.inputs) + indices = [] + iter_valid_indices = iter(valid_indices) + for i, _ in enumerate(self.indices): + if self.indices[i] is not None: + indices.append(next(iter_valid_indices)) + else: + indices.append(V.graph.wrapper_code.none_str) + + wrapper.generate_index_put_fallback( + self.get_kernel_name(), x, indices, values, *self.codegen_const_args() + ) + + def should_allocate(self): + return False + + def get_mutation_names(self): + return [self.inputs[0].get_name()] + + def get_unbacked_symbol_defs(self) -> Set[sympy.Symbol]: + return set() + + def __init__(self, op_overload, x, indices, values, accumulate): + self.indices = indices + valid_indices = [i for i in indices if i is not None] + tensors = [self.realize_input(x) for x in [x, values, *valid_indices]] + cpp_kernel_name = ( + "aoti_torch_index_put_out" if config.abi_compatible else "at::index_put_out" + ) + super().__init__( + None, + NoneLayout(x.get_device()), # type: ignore[arg-type] + self.unwrap_storage(tensors), + (accumulate,), + python_kernel_name="aten.index_put_", + cpp_kernel_name=cpp_kernel_name, + op_overload=op_overload, + ) + self.name = V.graph.register_buffer(self) + mark_node_as_mutating(self, x) + + +class DeviceCopy(ExternKernelOut): + @classmethod + def create(cls, x, device): + if ( + not x.is_extern() + and all( + (r.name in V.graph.constants and isinstance(r, dependencies.MemoryDep)) + for r in x.get_reads() + ) + and not config.aot_inductor.use_runtime_constant_folding + ): + return x.constant_to_device(device) + + V.graph.add_device_info(device) + V.graph.add_device_info(x.get_device()) + + developer_warning("DeviceCopy in input program") + return DeviceCopy( + FlexibleLayout( + device=device, + dtype=x.get_dtype(), + size=x.get_size(), + ), + [cls.realize_input(x)], + ) + + def codegen(self, wrapper): + args = self.codegen_args() + assert len(args) == 1 + if self.output_view: + wrapper.codegen_device_copy(args[0], self.output_view.codegen_reference()) + else: + wrapper.codegen_device_copy(args[0], self.codegen_reference()) + + +class DynamicScalar(ExternKernel): + """ + The result of a call to aten._local_scalar_dense. + """ + + def get_reads(self): + return () + + def should_allocate(self): + return False + + # TODO: handle bools carefully + def __init__(self, sym, data): + data.realize() + super().__init__(None, NoneLayout(torch.device("cpu")), self.unwrap_storage([data])) # type: ignore[arg-type] + if isinstance(sym, sympy.Symbol): + self.sym = sym + self.is_bool = False + else: + # Special case for boolean. 
For Reasons(TM), we don't represent + # boolean variables directly in sympy; instead, we generate an + # indicator integer variable which we then convert to a boolean by + # testing i0 == 1. We have to identify the underlying indicator + # variable, and then bind i0 to the appropriate integer value + # based on the runtime boolean. + assert isinstance(sym, sympy.Eq), sym + assert isinstance(sym.args[0], sympy.Symbol), sym + assert sym.args[1] == 1, sym + self.sym = sym.args[0] + self.is_bool = True + + def get_unbacked_symbol_defs(self) -> Set[sympy.Symbol]: + return {self.sym} + + def codegen(self, wrapper): + wrapper.codegen_dynamic_scalar(self) + + +class AssertScalar(ExternKernel): + """ + The result of a call to aten._assert_scalar + """ + + def get_reads(self): + return () + + def should_allocate(self): + return False + + def __init__(self, scalar, msg): + super().__init__( + # Buffer(name, layotu) + None, + NoneLayout(torch.device("cpu")), # type: ignore[arg-type] + # InputsKernel(inputs) + [], + ) # type: ignore[arg-type] + self.scalar = scalar + self.msg = msg + + def has_side_effects(self): + return True + + def get_unbacked_symbol_uses(self): + return free_unbacked_symbols(self.scalar) + + def codegen(self, wrapper): + if V.graph.cpp_wrapper: + pass + else: + wrapper.writeline( + f"if not {V.graph.wrapper_code.codegen_python_sizevar(self.scalar)}:" + ) + wrapper.writeline(f" raise RuntimeError({repr(self.msg)})") + # No one should ever use this buffer, but for uniformity + # define the variable and assign it None + wrapper.writeline(f"{self.get_name()} = None") + + +@dataclasses.dataclass +class ExternKernelNode: + name: str + node: export_schema.Node + + +has_c_shim = { + aten._embedding_bag.default, + aten._fft_c2c.default, + aten._scaled_dot_product_efficient_attention.default, + aten._scaled_dot_product_flash_attention.default, + aten._scaled_mm.default, + aten.addmm.out, + aten.bmm.out, + aten.copy_.default, + aten.mm.out, + aten.repeat_interleave.Tensor, + aten.nonzero.default, + aten.view.dtype, + aten.view_as_real.default, +} + + +def get_aten_cpp_kernel_name(kernel): + # Calling with the default kernel name can lead to ambiguous behavior like the following example. + # repeat_interleave(const at::Tensor & repeats, c10::optional output_size=c10::nullopt) + # repeat_interleave(const at::Tensor & self, int64_t repeats, + # c10::optional dim=c10::nullopt, c10::optional output_size=c10::nullopt) + assert ( + isinstance(kernel, torch._ops.OpOverload) and kernel.namespace == "aten" + ), "Invalid aten kernel" + opname = ( + kernel.__name__.split(".")[0] + if kernel._overloadname == "default" + else kernel.__name__.replace(".", "_") + ) + return f"at::_ops::{opname}::call" + + +class FallbackKernel(ExternKernelAlloc): + args_default_value: List[Dict[str, Any]] + + def __init__( + self, + layout, + kernel, + tensor_args, + nontensor_args, + unflatten_args, + kwargs=None, + ): + super().__init__( + layout, + tuple(tensor_args), + tuple(nontensor_args), + op_overload=kernel, + ) + # We need output buffers for generating kernel arguments in the + # abi-compatible mode, where we retrieve outputs by pass each individual + # output through the abi-compatible interface. 
+ self.outputs: Sequence[Any] = [] + self.use_runtime_dispatch = False + self.abi_compatible_kernel = None + + assert isinstance( + kernel, + ( + torch._ops.OpOverload, + torch._ops.HigherOrderOperator, + ), + ), f"Fails to create FallbackKernel for {kernel}: {type(kernel)} not supported" + self.op_overload = kernel + + self.unflatten_args = unflatten_args + self.kwargs = {} if kwargs is None else kwargs + V.graph.warn_fallback(self.python_kernel_name) + + # args that are aliased + self.alias_names: List[str] = [] + # args that are mutated AND returned from the op + self.mutation_names: List[str] = [] + + if isinstance(self.op_overload, torch._ops.HigherOrderOperator): + # We assume here that HOPs with FallbackKernel are functional. + # This may not always be true! HOPs must individually opt-in to + # FallbackKernel, so please check this if you opt-in. + return + + if "_c10d_functional" in self.op_overload.name(): + # _c10d_functional kernels are lowered into _CollectiveKernel which + # derives from FallbackKernel for the cpp codegen. The kernels + # don't pass the can_auto_functionalize check, but their mutation + # is handled properly by _CollectiveKernel. + return + + schema = self.op_overload._schema + + # NOTE: [FallbackKernel supported operators] + # We only support three types of operators: + # - functional ops + # - view ops + # - inplace aten ops + # - mutating ops that are auto-functionalizable. That is, + # the operator may mutate any number of inputs, but its outputs + # may not alias any of the inputs. + # + # The unsupported cases usually do not show up here (because + # AOTAutograd functionalized them away); the only way for an in-place + # op to show up here is if a lowering or pass introduced it. + if torch._library.utils.mutates_and_returns_first_arg(self.op_overload): + self.mutation_names.append(tensor_args[0].get_name()) + return + + if schema.is_mutable and not can_auto_functionalize(kernel): + raise NotImplementedError( + f"NYI: Can't generate FallbackKernel for {kernel}" + ) + + schema_args = schema.arguments + args, kwargs = self.unflatten_args(self.inputs, self.constant_args) + + def handle_aliasing_and_mutation(info, arg): + # Assertions to make sure we didn't mismatch args + if isinstance(info.type, torch.ListType): + assert isinstance(arg, (list, tuple)) + is_optional_tensor = isinstance( + info.type, torch.OptionalType + ) and isinstance(info.type.getElementType(), torch.TensorType) + if is_optional_tensor or isinstance(info.type, torch.TensorType): + # PyTorch also accepts None and scalar types for args marked as "Tensor". + # We're not going to check all of them here. + assert not isinstance(arg, (tuple, list)) + + if arg is None: + return + if info.alias_info is None: + return + # can_auto_functionalize already filters out mutable List[Tensor]. + # We can support this in the future, but this is very uncommon. 
+ assert isinstance(info.type, torch.TensorType) or is_optional_tensor + self.alias_names.append(arg.get_name()) + if info.alias_info.is_write: + mark_node_as_mutating(self, arg) + + for info, arg in torch._library.utils.zip_schema(schema, args, kwargs): + handle_aliasing_and_mutation(info, arg) + + def set_cpp_kernel(self, kernel): + from .codegen.wrapper import get_cpp_op_schema + + assert ( + not kernel._schema.is_mutable + ), f"mutable {kernel.__name__} is not supported with cpp_wrapper" + + # These checks are here because ops that return aliasing tensors will + # return type Tensor& instead of Tensor, but codegen will always write + # type Tensor on the LHS. + def is_not_write(arg): + return arg.alias_info is None or not arg.alias_info.is_write + + assert all( + is_not_write(x) for x in kernel._schema.arguments + ), f"{kernel.__name__} with alias_info arguments is not supported with cpp_wrapper" + assert all( + is_not_write(x) for x in kernel._schema.returns + ), f"{kernel.__name__} with alias_info returns is not supported with cpp_wrapper" + + self.cpp_kernel_name = kernel._schema.name + self.cpp_kernel_overload_name = kernel._schema.overload_name + self.cpp_kernel_key = f"{self.cpp_kernel_name.replace('::', '_')}_{self.cpp_kernel_overload_name}" # type: ignore[union-attr] + + self.cpp_op_schema = get_cpp_op_schema(kernel) + self.init_args_default_value(kernel._schema) + + def is_legacy_abi_kernel(self): + return ( + config.c_shim_version == "1" + and "_scaled_dot_product_flash_attention" in str(self.python_kernel_name) + ) + + def init_args_default_value(self, schema): + self.args_default_value = [ + { + "name": x.name, + "type": x.real_type, + "value": x.default_value, + } + for x in schema.arguments + if not x.kwarg_only + ] + + def get_pos_arg_value(self, pos, kwargs): + # positional args may be provided in kwargs + pos_arg_name = self.args_default_value[pos]["name"] + if pos_arg_name in kwargs: + log.debug( + "Found argument %s with value %s from kwargs", + pos_arg_name, + kwargs[pos_arg_name], + ) + return kwargs[pos_arg_name] + + assert hasattr( + self, "args_default_value" + ), "self.args_default_value has to be provided" + assert pos < len( + self.args_default_value + ), f"expected the index {pos} to be smaller than len(self.args_default_value): {len(self.args_default_value)}" + arg_default_value = self.args_default_value[pos]["value"] + log.debug( + "Use default value %s for argument %s", arg_default_value, pos_arg_name + ) + return arg_default_value + + def codegen_args(self): + @dataclasses.dataclass + class Shim: + ref: Any + + def __repr__(self): + return self.ref + + tensor_args = [Shim(x.codegen_reference()) for x in self.inputs] + args, kwargs = self.unflatten_args(tensor_args, self.constant_args) + # Now we setup abi_compatible_kernel after self.python_kernel_name + # and kwargs are adjusted appropriately. 
+ # For sdpa, we need the v2 version since v1 didn't consider optional arg + # FIXME: no need to do this after we switch to the torchgen-ed C shim + self.abi_compatible_kernel = ( + f"{self.cpp_kernel_name}_v2" + if self.cpp_kernel_name in {"at::_scaled_dot_product_flash_attention"} + and config.c_shim_version == "1" + else self.cpp_kernel_name + ) + + if V.graph.cpp_wrapper and isinstance(self.op_overload, torch._ops.OpOverload): + args = [ + V.graph.wrapper_code.val_to_cpp_arg_str( + param.real_type, x, self.is_legacy_abi_kernel() + ) + for param, x in zip(self.op_overload._schema.arguments, args) + ] + else: + args = [V.graph.wrapper_code.val_to_arg_str(x) for x in args] + + # Previously, we want to maintain forward-compatibility by skipping + # default args in the serialized artifacts in fbcode. However, + # some of our shim interfaces require default values being set. + # Discussed with Sherlock offline and we decided to allow serializing + # default args into the C++ wrapper code for now. We will refine this + # part if we see real FC requirement. More details related to FC + # can be found at: + # https://docs.google.com/document/d/1FzWm-sHYwmRi3x_g036kOxd99KaYquUsA-L5JwOn8ys/edit?usp=sharing + if V.graph.cpp_wrapper and hasattr(self, "args_default_value"): + self.fill_non_provided_args(args, kwargs, convert_val_to_str=True) + + # let self.codegen_kwargs handle kwargs + self.kwargs.update(kwargs) + return args + + @staticmethod + def find_device(tensor_args, example_output): + if tensor_args: + return tensor_args[0].get_device() + if isinstance(example_output, torch.Tensor): + return example_output.device + if isinstance(example_output, (list, tuple)): + devices = {FallbackKernel.find_device(None, x) for x in example_output} + # Remove None + devices = [device for device in devices if device] + if len(devices) == 1: + return devices[0] + for device in devices: + if device.type == "cuda": + return device + return devices[0] + return None + + def has_side_effects(self): + if isinstance(self.op_overload, torch._ops.HigherOrderOperator): + return False + return get_schema_info(self.op_overload).is_mutable() + + def get_alias_names(self): + return self.alias_names + + def get_mutation_names(self): + assert len(self.mutation_names) <= 1 + return self.mutation_names + + def fill_non_provided_args(self, args, kwargs, convert_val_to_str=False): + assert isinstance(args, (list, tuple)) + if isinstance(args, tuple): + args = list(args) + assert hasattr(self, "args_default_value") + n_args = len(args) + n_pos_args = len(self.args_default_value) + # For cpp wrapper, if some positional args are not provided, we need to check + # if they're in the kwargs or use their default value + if n_args < n_pos_args: + log.debug( + "%s has %d unprovided positional arguments. " + "Will check if they are in the keyword arguments or will use default values.", + self.op_overload, + n_pos_args - n_args, + ) + pos_args = [ + self.get_pos_arg_value(i, kwargs) for i in range(n_args, n_pos_args) + ] + if convert_val_to_str: + pos_args = [V.graph.wrapper_code.val_to_arg_str(x) for x in pos_args] + args.extend(pos_args) + return args + + # ProxyExecutor Design Note + # We export the ExternFallbackNodes (for custom ops) into a serialized file + # and run it with a host side proxy executor to address the ABI problem + # This is currently only implemented for fbcode. Eventually, we will also make this work for OSS. 
+ # Detailed design doc can be found at + # https://docs.google.com/document/d/1wC4DOZFaYym2t1Esz0X5yxlLI3RDnSiyRbUus3bkJ64/edit?usp=sharing + def export_extern_kernel_node(self): + assert isinstance(self, FallbackKernel) + args, kwargs = self.unflatten_args(self.inputs, self.constant_args) + args = self.fill_non_provided_args(args, kwargs) + ordered_kwargs = [ + kwargs.get(key, None) for key in self.ordered_kwargs_for_cpp_kernel + ] + + serializer = GraphModuleSerializer(None, None) # type: ignore[arg-type] + named_arguments = serializer.serialize_inputs(self.op_overload, args, kwargs) # type: ignore[arg-type] + + # serialize_outputs + def handle_single_output(return_type, output): + if isinstance(return_type, torch.TensorType): + # For single Tensor + out = output + if isinstance(output, (list, tuple)): + assert len(output) == 1 + out = output[0] + return export_schema.Argument.create( + as_tensor=export_schema.TensorArgument(name=out.get_name()) + ) + elif isinstance(return_type, torch.ListType) and isinstance( + return_type.getElementType(), torch.TensorType + ): + # For single TensorList + return export_schema.Argument.create( + as_tensors=[ + export_schema.TensorArgument(name=out.get_name()) + for out in output + ] + ) + else: + raise RuntimeError(f"Unsupported return type {type(return_type)}") + + target = self.op_overload + returns = target._schema.returns # type: ignore[union-attr] + if len(returns) == 1: + return_type = returns[0].real_type + output_arguments = [handle_single_output(return_type, self.outputs)] + else: + # For tuple returns, e.g "-> (Tensor, Tensor)" or "-> (Tesnor, Tensor[])" + assert isinstance(self.outputs, tuple) + assert len(returns) == len(self.outputs) + output_arguments = [ + handle_single_output(return_schema.real_type, output) + for return_schema, output in zip(returns, self.outputs) + ] + + node = ExternKernelNode( + name=self.get_name(), + node=export_schema.Node( + target=self.op_overload.name(), # type: ignore[union-attr] + inputs=named_arguments, + outputs=output_arguments, + metadata={}, + ), + ) + + V.graph.extern_kernel_nodes.append(node) + + return [*args, *ordered_kwargs] + + def codegen(self, wrapper): + kernel = self.op_overload + if kernel.namespace == "aten": # type: ignore[union-attr] + # Aten Fallback Ops + assert isinstance(kernel, torch._ops.OpOverload) + if V.graph.cpp_wrapper: + if ( + config.is_fbcode() + and kernel not in has_c_shim + # C shim v2 is torchgen-ed, which should cover all aten ops. + # If you do hit a missed op, please update gen_aoti_c_shim.py. + and config.c_shim_version == "1" + ): + log.warning( + "%s is missing a c-shim implementation, using proxy executor as fallback", + kernel, + ) + self.use_runtime_dispatch = True + self.set_cpp_kernel(kernel) + else: + self.cpp_kernel_name = get_aten_cpp_kernel_name(kernel) + schema = kernel._schema + self.init_args_default_value(schema) + else: + self.python_kernel_name = str(kernel) + + elif isinstance(kernel, torch._ops.HigherOrderOperator): + self.python_kernel_name = f"torch.ops.higher_order.{kernel.__name__}" + else: + # For non-aten OpOverload, i.e. 
custom ops + if V.graph.cpp_wrapper: + self.use_runtime_dispatch = True + self.set_cpp_kernel(kernel) + else: + self.python_kernel_name = f"{kernel.__module__.replace('._ops.', '.ops.')}.{kernel.__name__}" # type: ignore[union-attr] + + if self.use_runtime_dispatch: + self.codegen_comment(wrapper) + + exported_args = None + args = None + if config.is_fbcode() and V.graph.cpp_wrapper: + exported_args = self.export_extern_kernel_node() + else: + args = [*self.codegen_args(), *self.codegen_kwargs()] + + wrapper.generate_extern_kernel_alloc_and_find_schema_if_needed( + self.get_name(), + self.get_kernel_name(), + args, + self.cpp_op_schema, + self.cpp_kernel_key, + self.cpp_kernel_overload_name, + self.op_overload, + exported_args, + self.outputs, + ) + else: + self.codegen_comment(wrapper) + args = [*self.codegen_args(), *self.codegen_kwargs()] + V.graph.wrapper_code.generate_fallback_kernel(self, args) + if isinstance(self.layout, Layout): + self.codegen_size_asserts(wrapper) + + @staticmethod + def tensor_to_layout(output: torch.Tensor): + return FixedLayout( + output.device, + output.dtype, + convert_shape_to_inductor(output.size()), + convert_shape_to_inductor(output.stride()), + ) + + @classmethod + def create(cls, kernel, *args, **kwargs): + fake_incorrect_kernels = (aten._fused_moving_avg_obs_fq_helper_functional,) + context = ( + V.graph.fake_mode if kernel not in fake_incorrect_kernels else nullcontext() + ) + with context: + ( + example_output, + tensor_args, + non_tensor_args, + unflatten_args, + ) = cls.process_kernel(kernel, *args, **kwargs) + + device = cls.find_device(tensor_args, example_output) + assert device, "Not sure where to find device info" + + packed = cls( + MultiOutputLayout(device), + kernel, + tensor_args, + non_tensor_args, + unflatten_args, + ) + + def generate_output(output, indices): + if isinstance(output, (list, tuple)): + return type(output)( + generate_output(output[i], indices + [(type(output), i)]) + for i in range(len(output)) + ) + elif isinstance(output, dict): + return { + key: generate_output(val, indices + [(type(output), key)]) + for key, val in output.items() + } + elif isinstance(output, torch.Tensor): + return MultiOutput( + cls.tensor_to_layout(output), + packed, + indices, + ) + elif isinstance(output, int): + return output + elif isinstance(output, torch.SymInt): + return output.node.expr + else: + assert ( + output is None + ), f"FallbackKernel output type {type(output)} is not supported" + return None + + outputs = generate_output(example_output, []) + if isinstance(outputs, (list, tuple, dict)): + packed.outputs = outputs # type: ignore[assignment] + else: + packed.outputs = [outputs] + return outputs + + def apply_constraint(self): + return super().apply_constraint() + + +@dataclasses.dataclass +class ComplexView(FallbackKernel): + """View a complex number as two dtyped numbers or vice versa""" + + def should_allocate(self): + return False + + def get_alias_names(self): + # Signal to codegen that our output buffer isn't safe to reuse + return [self.inputs[0].get_name()] + + def __init__( + self, + layout, + kernel, + tensor_args, + nontensor_args, + unflatten_args, + ): + super().__init__( + layout, + kernel, + tensor_args, + nontensor_args, + unflatten_args, + ) + + +@dataclasses.dataclass +class MultiOutputLayout(IRNode): + device: torch.device + + +class MultiOutput(ExternKernel): + # Given an input MultiOutputLayout buffer, indexes out an actual buffer + # from that result. 
This doesn't actually produce multiple outputs, + # that's MultiOutputLayout! + def codegen_list_tuple_access(self, basename, indices): + if len(indices) > 0: + itype, i = indices[0] + if itype == list: + return self.codegen_list_tuple_access(f"{basename}[{i}]", indices[1:]) + elif itype == tuple: + # cpp wrapper code needs to use std::get<> to access a tuple + tuple_access = V.graph.wrapper_code.codegen_tuple_access( + basename, self.get_name(), str(i) + ) + return self.codegen_list_tuple_access(tuple_access, indices[1:]) + elif itype == dict: + return self.codegen_list_tuple_access(f"{basename}['{i}']", indices[1:]) + else: + raise AssertionError("non supported index type") + else: + return basename + + def codegen(self, wrapper): + wrapper.codegen_multi_output( + self.get_name(), + self.codegen_list_tuple_access(self.inputs[0].get_name(), self.indices), + ) + self.codegen_unbacked_symbol_defs(wrapper) + + def __init__(self, layout, input, indices: List[Tuple[Any, ...]]): + super().__init__(None, layout, [input], ()) + self.name = V.graph.register_buffer(self) + self.indices = indices + + def get_unbacked_symbol_uses(self) -> Set[sympy.Symbol]: + return self.inputs[0].get_unbacked_symbol_uses() + + def should_allocate(self): + return False + + def get_alias_names(self): + return [ + inp.get_name() + for inp in self.inputs + if isinstance(inp, FallbackKernel) and len(inp.get_alias_names()) > 0 + ] + + +def _prepare_convolution_fusion_create( + cls, + x: "TensorBox", + weight: "TensorBox", + bias: "TensorBox", + padding: List[int], + stride: List[int], + dilation: List[int], + groups: int, + transposed: bool = False, + output_padding: Optional[List[int]] = None, +): + """ + This function is a helper function to prepare inputs, layout and constant args + for convolution post-op fusion's create function, including deciding the output + layout (channels first or channels last), realizing inputs and make them etc. The + function only supports the CPU device since conv post-op fusion kernel is only + supported on CPU right now. + """ + + # Port from aten/src/ATen/native/ConvUtils.h: _conv_input_size + def _conv_input_size( + output_size, weight_size, padding, output_padding, stride, dilation, groups + ): + assert len(output_size) == len(weight_size), "Expect input dim == weight dim" + dim = len(output_size) + assert dim > 2, "Expect input dim > 2" + + BATCH_DIM = 0 + WEIGHT_INPUT_CHANNELS_DIM = 1 + input_size = [] + input_size.append(output_size[BATCH_DIM]) + input_size.append(weight_size[WEIGHT_INPUT_CHANNELS_DIM] * groups) + for d in range(2, dim): + kernel = (weight_size[d] - 1) * dilation[d - 2] + 1 + input_size_d = ( + (output_size[d] - 1) * stride[d - 2] + - (padding[d - 2] * 2) + + kernel + + output_padding[d - 2] + ) + input_size.append(input_size_d) + return list(map(int, input_size)) + + # The size of prepacked_weight is the prepacked weight size of deconv: + # Groups > 1: [g*o, i/g, ...] + # Groups == 1: [o, i, ...] + # Returns original weight size in [i, o, ...] 
+ def _original_deconv_weight_size( + prepacked_weight, + groups, + ): + prepacked_weight_size = prepacked_weight.size() + dim = len(prepacked_weight_size) + assert dim > 2, "Expect weight dim > 2" + if groups > 1: + weight_size = [] + weight_size.append(prepacked_weight_size[1] * groups) + weight_size.append(prepacked_weight_size[0] / groups) + for d in range(2, dim): + weight_size.append(prepacked_weight_size[d]) + else: + weight_size = prepacked_weight.transpose(0, 1).size() + return weight_size + + x.realize() + weight.realize() + if bias is not None: + bias.realize() + with V.graph.fake_mode: + # TODO cleaned up the fake_tensor trace as Linear implementation + x_fake = ir_node_to_tensor(x, guard_shape=True) + weight_fake = ir_node_to_tensor(weight, guard_shape=True) + dims = len(x_fake.size()) - 2 + assert 0 < len(padding) <= dims + assert 0 < len(dilation) <= dims + assert 0 < len(stride) <= dims + padding = pad_listlike(padding, dims) + dilation = pad_listlike(dilation, dims) + stride = pad_listlike(stride, dims) + if output_padding is None: + output_padding = pad_listlike([0], dims) + else: + assert 0 < len(output_padding) <= dims + output_padding = pad_listlike(output_padding, dims) + assert isinstance(groups, int) + if transposed: + # When transposed, the size of the prepacked oneDNN weight is different + # from the PyTorch weight. We're not able to run aten conv with such + # size. We infer the output size from the input params here: + weight_size = _original_deconv_weight_size(weight_fake, groups) + input_size = x_fake.size() + output_size = _conv_input_size( + input_size, + weight_size, + padding, + output_padding, + stride, + dilation, + groups, + ) + else: + bias_fake = ( + ir_node_to_tensor(bias, guard_shape=True) if bias is not None else bias + ) + output = torch.ops.aten.convolution( + x_fake, + weight_fake, + bias_fake, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + ) + output_size = output.size() + + req_stride_order = [0] + list(reversed(range(1, len(stride) + 1))) + req_stride_order = [len(req_stride_order)] + req_stride_order + output_stride = make_channels_last_strides_for(output_size) + + x = cls.require_stride_order(x, req_stride_order) + assert x.get_device().type == "cpu" and weight.get_device().type == "cpu" + inputs = [x, weight] + + kernel_layout = FixedLayout( + x.get_device(), + x.get_dtype(), + convert_shape_to_inductor(output_size), + convert_shape_to_inductor(output_stride), + ) + constant_args = [padding, stride, dilation, groups] + if transposed: + constant_args.insert(1, output_padding) + + if bias is not None: + inputs.append(bias) + else: + constant_args.insert(0, bias) + return inputs, constant_args, kernel_layout, req_stride_order + + +def _prepare_linear_fusion_create( + cls, + x: "TensorBox", + weight: "TensorBox", + bias: "TensorBox", +): + """ + This function is a helper function to prepare inputs, layout and constant args + for linear post-op fusion's create function. The function only supports the CPU device + since linear post-op fusion kernel is only supported on CPU right now. + """ + x.realize() + weight.realize() + if bias is not None: + bias.realize() + + *m, _ = x.get_size() + # The weight has been transposed during the qlinear weight prepack process. 
+ # https://github.com/pytorch/pytorch/blob/4979f9c0d72490970e2019bb1d2284f83d93f76b/ + # aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp#L291 + _, oc = weight.get_size() + output_size = list(m) + [oc] + req_stride_order = list(reversed(range(len(x.get_size())))) + + x = cls.require_stride_order(x, req_stride_order) + assert x.get_device().type == "cpu" and weight.get_device().type == "cpu" + inputs = [x, weight] + + output_stride = make_contiguous_strides_for(output_size) + kernel_layout = FixedLayout( + x.get_device(), + x.get_dtype(), + output_size, + output_stride, + ) + constant_args: List[Any] = [] + + if bias is not None: + inputs.append(bias) + else: + constant_args.insert(0, bias) + return inputs, constant_args, kernel_layout, req_stride_order + + +class ConvolutionUnary(ExternKernelAlloc): + def __init__( + self, + layout, + inputs, + constant_args=(), + ): + super().__init__( + layout, + inputs, + constant_args, + None, + python_kernel_name="torch.ops.mkldnn._convolution_pointwise", + cpp_kernel_name="mkldnn::_convolution_pointwise", + ) + self.cpp_kernel_key = "convolution_pointwise" + self.cpp_op_schema = """ + at::Tensor( + const at::Tensor& input_t, + const at::Tensor& weight_t, + const c10::optional& bias_opt, + at::IntArrayRef padding, + at::IntArrayRef stride, + at::IntArrayRef dilation, + int64_t groups, + c10::string_view attr, + torch::List> scalars, + c10::optional algorithm)""" + + def codegen(self, wrapper): + wrapper.generate_extern_kernel_alloc_and_find_schema_if_needed( + self.get_name(), + self.get_kernel_name(), + self.codegen_args(), + self.cpp_op_schema, + self.cpp_kernel_key, + ) + if isinstance(self.layout, Layout): + self.codegen_size_asserts(wrapper) + + @classmethod + def create( + cls, + x: "TensorBox", + weight: "TensorBox", + bias: "TensorBox", + padding_: List[int], + stride_: List[int], + dilation_: List[int], + groups: int, + attr, + scalars: Optional[List[Any]], + algorithm, + ): + (inputs, constant_args, kernel_layout, _) = _prepare_convolution_fusion_create( + cls, x, weight, bias, padding_, stride_, dilation_, groups + ) + constant_args = constant_args + [ + attr, + may_convert_to_optional(scalars), + algorithm, + ] + return ConvolutionUnary( + layout=kernel_layout, + inputs=inputs, + constant_args=constant_args, + ) + + +class ConvolutionBinary(ExternKernelAlloc): + def __init__( + self, + layout, + inputs, + constant_args=(), + cpp_constant_args=(), + ): + super().__init__( + layout, + inputs, + constant_args, + None, + python_kernel_name="torch.ops.mkldnn._convolution_pointwise.binary", + cpp_kernel_name="mkldnn::_convolution_pointwise", + ) + self.cpp_kernel_overload_name = "binary" + self.cpp_kernel_key = "convolution_pointwise_binary" + self.cpp_op_schema = """ + at::Tensor( + const at::Tensor& input_t, + const at::Tensor& other_t, + const at::Tensor& weight_t, + const c10::optional& bias_opt, + at::IntArrayRef padding, + at::IntArrayRef stride, + at::IntArrayRef dilation, + int64_t groups, + c10::string_view binary_attr, + c10::optional alpha, + c10::optional unary_attr, + torch::List> unary_scalars, + c10::optional unary_algorithm)""" + self.cpp_constant_args = cpp_constant_args + + def codegen(self, wrapper): + wrapper.generate_extern_kernel_alloc_and_find_schema_if_needed( + self.get_name(), + self.get_kernel_name(), + self.codegen_args(), + self.cpp_op_schema, + self.cpp_kernel_key, + self.cpp_kernel_overload_name, + ) + if isinstance(self.layout, Layout): + self.codegen_size_asserts(wrapper) + + @classmethod + def create( + 
cls, + x: "TensorBox", + other: "TensorBox", + weight: "TensorBox", + bias: "TensorBox", + padding_: List[int], + stride_: List[int], + dilation_: List[int], + groups: int, + binary_attr: str, + binary_alpha: Optional[float], + unary_attr: Optional[str], + unary_scalars: Optional[List[Any]], + unary_algorithm: Optional[str], + ): + ( + inputs, + constant_args, + kernel_layout, + req_stride_order, + ) = _prepare_convolution_fusion_create( + cls, x, weight, bias, padding_, stride_, dilation_, groups + ) + other = cls.require_stride_order(other, req_stride_order) + inputs.insert(1, other) + constant_args = constant_args + [ + binary_attr, + binary_alpha, + unary_attr, + may_convert_to_optional(unary_scalars), + unary_algorithm, + ] + return ConvolutionBinary( + layout=kernel_layout, + inputs=inputs, + constant_args=constant_args, + ) + + +class ConvolutionBinaryInplace(ExternKernelAlloc): + def __init__( + self, + kernel_layout, + inputs, + constant_args=(), + ): + # Due to constrain of op.call, other (Tensor&) should be at input[0] + reordered_inputs = [inputs[1], inputs[0]] + inputs[2:] + + super().__init__( + kernel_layout, + reordered_inputs, + constant_args, + None, + python_kernel_name="torch.ops.mkldnn._convolution_pointwise_.binary", + cpp_kernel_name="mkldnn::_convolution_pointwise_", + ) + self.cpp_kernel_overload_name = "binary" + self.cpp_kernel_key = "convolution_pointwise_binary_" + # TODO: op.call: input[0] should be at::Tensor& + self.cpp_op_schema = """ + at::Tensor&( + at::Tensor& other_t, + const at::Tensor& input_t, + const at::Tensor& weight_t, + const c10::optional& bias_opt, + at::IntArrayRef padding, + at::IntArrayRef stride, + at::IntArrayRef dilation, + int64_t groups, + c10::string_view binary_attr, + c10::optional alpha, + c10::optional unary_attr, + torch::List> unary_scalars, + c10::optional unary_algorithm)""" + + def codegen(self, wrapper): + wrapper.generate_extern_kernel_alloc_and_find_schema_if_needed( + self.get_name(), + self.get_kernel_name(), + self.codegen_args(), + self.cpp_op_schema, + self.cpp_kernel_key, + self.cpp_kernel_overload_name, + ) + + def get_mutation_names(self): + return [self.inputs[0].get_name()] + + def get_unbacked_symbol_defs(self) -> Set[sympy.Symbol]: + return set() + + @classmethod + def create( + cls, + x: "TensorBox", + other: "TensorBox", + weight: "TensorBox", + bias: "TensorBox", + padding_: List[int], + stride_: List[int], + dilation_: List[int], + groups: int, + binary_attr: str, + binary_alpha: Optional[float], + unary_attr: Optional[str], + unary_scalars: Optional[List[Any]], + unary_algorithm: Optional[str], + ): + ( + inputs, + constant_args, + _, + req_stride_order, + ) = _prepare_convolution_fusion_create( + cls, x, weight, bias, padding_, stride_, dilation_, groups + ) + other = cls.require_stride_order(other, req_stride_order) + inputs.insert(1, other) + constant_args = constant_args + [ + binary_attr, + binary_alpha, + unary_attr, + may_convert_to_optional(unary_scalars), + unary_algorithm, + ] + packed = ConvolutionBinaryInplace( + kernel_layout=NoneLayout(inputs[1].get_device()), # type: ignore[arg-type] + inputs=inputs, + constant_args=constant_args, + ) + mark_node_as_mutating(packed, inputs[1]) + # This op mutates in place which means that the result is not the + # target but rather the input that is being mutated + # init reorders the inputs, so inputs[1] becomes packed.inputs[0] + return packed.inputs[0] + + +class MKLPackedLinear(ExternKernelAlloc): + def __init__( + self, + layout, + inputs, + 
constant_args=(), + ): + super().__init__( + layout, + inputs, + constant_args, + None, + python_kernel_name="torch.ops.mkl._mkl_linear", + cpp_kernel_name="mkl::_mkl_linear", + ) + self.cpp_kernel_key = "mkl_linear" + self.cpp_op_schema = """ + at::Tensor( + const at::Tensor& self, + const at::Tensor& mkl_weight_t, + const at::Tensor& origin_weight_t, + const c10::optional& bias_opt, + const int64_t prepack_batch_size)""" + + def codegen(self, wrapper): + wrapper.generate_extern_kernel_alloc_and_find_schema_if_needed( + self.get_name(), + self.get_kernel_name(), + self.codegen_args(), + self.cpp_op_schema, + self.cpp_kernel_key, + ) + + @classmethod + def create(cls, x, packed_w, orig_w, batch_size): + x = cls.require_stride1(cls.realize_input(x)) + orig_w = cls.require_stride1(cls.realize_input(orig_w)) + *m, _ = x.get_size() + oc, _ = orig_w.get_size() + output_size = list(m) + [oc] + output_stride = make_contiguous_strides_for(output_size) + inputs = [x, packed_w, orig_w] + constant_args = [None, batch_size] + + return MKLPackedLinear( + layout=FixedLayout( + x.get_device(), x.get_dtype(), output_size, output_stride + ), + inputs=inputs, + constant_args=constant_args, + ) + + +class LinearUnary(ExternKernelAlloc): + def __init__( + self, + layout, + inputs, + constant_args=(), + ): + super().__init__( + layout, + inputs, + constant_args, + None, + python_kernel_name="torch.ops.mkldnn._linear_pointwise", + cpp_kernel_name="mkldnn::_linear_pointwise", + ) + self.cpp_kernel_key = "linear_pointwise" + self.cpp_op_schema = """ + at::Tensor( + const at::Tensor& input_t, + const at::Tensor& weight_t, + const c10::optional& bias_opt, + c10::string_view attr, + torch::List> scalars, + c10::optional algorithm)""" + + def codegen(self, wrapper): + wrapper.generate_extern_kernel_alloc_and_find_schema_if_needed( + self.get_name(), + self.get_kernel_name(), + self.codegen_args(), + self.cpp_op_schema, + self.cpp_kernel_key, + ) + + @classmethod + def create(cls, x, w, b, attr, scalars, algorithm): + x = cls.require_contiguous(cls.realize_input(x)) + w = cls.require_contiguous(cls.realize_input(w)) + + *m, ic = x.get_size() + oc, ic = w.get_size() + inputs = [x, w] + constant_args = [attr, scalars if scalars else [-1], algorithm] + if b is not None: + b = cls.require_contiguous(cls.realize_input(b)) + inputs.append(b) + else: + constant_args.insert(0, None) + + return LinearUnary( + layout=FlexibleLayout( + device=x.get_device(), + dtype=x.get_dtype(), + size=list(m) + [oc], + ), + inputs=inputs, + constant_args=constant_args, + ) + + def apply_constraint(self): + pass + + +class LinearBinary(ExternKernelAlloc): + kernel = "torch.ops.mkldnn._linear_pointwise.binary" + + def __init__( + self, + layout, + inputs, + constant_args=(), + ): + super().__init__( + layout, + inputs, + constant_args, + None, + python_kernel_name="torch.ops.mkldnn._linear_pointwise.binary", + cpp_kernel_name="mkldnn::_linear_pointwise", + ) + self.cpp_kernel_overload_name = "binary" + self.cpp_kernel_key = "linear_pointwise_binary" + self.cpp_op_schema = """ + at::Tensor( + const at::Tensor& input_t, + const at::Tensor& other_t, + const at::Tensor& weight_t, + const c10::optional& bias_opt, + c10::string_view attr) + """ + + def codegen(self, wrapper): + wrapper.generate_extern_kernel_alloc_and_find_schema_if_needed( + self.get_name(), + self.get_kernel_name(), + self.codegen_args(), + self.cpp_op_schema, + self.cpp_kernel_key, + self.cpp_kernel_overload_name, + ) + + @classmethod + def create(cls, x, y, w, b, attr): + x = 
cls.require_contiguous(cls.realize_input(x)) + y = cls.require_contiguous(cls.realize_input(y)) + w = cls.require_contiguous(cls.realize_input(w)) + + *m, ic = x.get_size() + oc, ic = w.get_size() + + inputs = [x, y, w] + constant_args = [attr] + if b is not None: + b = cls.require_contiguous(cls.realize_input(b)) + inputs.append(b) + else: + constant_args.insert(0, b) + + return LinearBinary( + layout=FlexibleLayout( + device=x.get_device(), + dtype=x.get_dtype(), + size=list(m) + [oc], + ), + inputs=inputs, + constant_args=constant_args, + ) + + def apply_constraint(self): + pass + + +class ConvolutionTransposeUnary(ExternKernelAlloc): + def __init__( + self, + layout, + inputs, + constant_args=(), + ): + super().__init__( + layout, + inputs, + constant_args, + None, + python_kernel_name="torch.ops.mkldnn._convolution_transpose_pointwise", + cpp_kernel_name="mkldnn::_convolution_transpose_pointwise", + ) + self.cpp_kernel_key = "convolution_transpose_pointwise" + self.cpp_op_schema = """ + at::Tensor( + const at::Tensor& input_t, + const at::Tensor& weight_t, + const c10::optional& bias_opt, + at::IntArrayRef padding, + at::IntArrayRef output_padding, + at::IntArrayRef stride, + at::IntArrayRef dilation, + int64_t groups, + c10::string_view attr, + torch::List> scalars, + c10::optional algorithm)""" + + def codegen(self, wrapper): + wrapper.generate_extern_kernel_alloc_and_find_schema_if_needed( + self.get_name(), + self.get_kernel_name(), + self.codegen_args(), + self.cpp_op_schema, + self.cpp_kernel_key, + ) + + @classmethod + def create( + cls, + x: "TensorBox", + weight: "TensorBox", + bias: "TensorBox", + padding_: List[int], + output_padding_: List[int], + stride_: List[int], + dilation_: List[int], + groups_: int, + attr, + scalars: Optional[List[Any]], + algorithm, + ): + transposed = True + ( + inputs, + constant_args, + kernel_layout, + _, + ) = _prepare_convolution_fusion_create( + cls, + x, + weight, + bias, + padding_, + stride_, + dilation_, + groups_, + transposed, + output_padding_, + ) + constant_args = constant_args + [ + attr, + may_convert_to_optional(scalars), + algorithm, + ] + return ConvolutionTransposeUnary( + layout=kernel_layout, + inputs=inputs, + constant_args=constant_args, + ) + + +class MkldnnRnnLayer(ExternKernelAlloc): + def __init__( + self, + layout, + inputs, + constant_args=(), + ): + super().__init__( + layout, + inputs, + constant_args, + None, + python_kernel_name="aten.mkldnn_rnn_layer", + cpp_kernel_name="at::mkldnn_rnn_layer", + ) + + @classmethod + def create( + cls, + x: "TensorBox", + w0: "TensorBox", + w1: "TensorBox", + w2: "TensorBox", + w3: "TensorBox", + hx: "TensorBox", + cx: "TensorBox", + reverse: bool, + batch_sizes: List[int], + mode: int, + hidden_size: int, + num_layers: int, + has_biases: bool, + bidirectional: bool, + batch_first: bool, + train: bool, + ): + x = cls.require_stride1(cls.realize_input(x)) + # If batch_first, x has been permuted in lstm before entering the mkldnn_rnn_layer. + # Make sure x is contiguous in batch_first case. + x.freeze_layout() + w0 = cls.require_stride1(cls.realize_input(w0)) + w1 = cls.require_stride1(cls.realize_input(w1)) + w2 = cls.require_stride1(cls.realize_input(w2)) + w3 = cls.require_stride1(cls.realize_input(w3)) + hx = cls.require_stride1(cls.realize_input(hx)) + hx.freeze_layout() + cx = cls.require_stride1(cls.realize_input(cx)) + cx.freeze_layout() + + input_size = x.get_size() + assert len(input_size) == 3, "Expect lstm input to be 3D" + # batch_first is handled in the lstm OP. 
When entering + # rnn_layer here, we'll always have batch_first = False + seq_length, mini_batch, input_size = input_size + output_shape = [seq_length, mini_batch, hidden_size] + + hy_shape = hx.get_size() + cy_shape = cx.get_size() + + res: List[IRNode] = [] + + inputs = [x, w0, w1, w2, w3, hx, cx] + constant_args = [ + reverse, + batch_sizes, + mode, + hidden_size, + num_layers, + has_biases, + bidirectional, + batch_first, + train, + ] + + packed = MkldnnRnnLayer( + MultiOutputLayout(x.get_device()), + inputs=inputs, + constant_args=constant_args, + ) + + def get_strides_of_lstm_output(output_shape, batch_first): + assert len(output_shape) == 3, "Expect output_shape to be 3D" + return make_contiguous_strides_for(output_shape) + + output_sizes = [output_shape, hy_shape, cy_shape] + output_strides = [ + get_strides_of_lstm_output(output_shape, batch_first), + make_contiguous_strides_for(hy_shape), + make_contiguous_strides_for(cy_shape), + ] + output_ir = [ + MultiOutput( + FixedLayout( + x.get_device(), + x.get_dtype(), + output_size, + output_stride, + ), + packed, + [(tuple, i)], + ) + for i, (output_size, output_stride) in enumerate( + zip(output_sizes, output_strides) + ) + ] + + return output_ir + + +class QConvPointWisePT2E(ExternKernelAlloc): + def __init__( + self, + layout, + inputs, + constant_args=(), + ): + """ + if bias is not None + - inputs = [x, w, b, weight_scale, weight_zp] + - const_args is: [stride, padding, dilation, groups, x_scale, x_zp, o_inv_scale, o_zp, + fp32_output, unary_attr, unary_scalars, unary_algorithm] + else + - inputs = [x, w, weight_scale, weight_zp] + - const_args is: [bias, stride, padding, dilation, groups, x_scale, x_zp, o_inv_scale, o_zp, + fp32_output, unary_attr, unary_scalars, unary_algorithm] + """ + self.has_bias = len(inputs) == 5 + super().__init__( + layout, + inputs, + constant_args, + None, + python_kernel_name="torch.ops.onednn.qconv2d_pointwise", + cpp_kernel_name="onednn::qconv2d_pointwise", + ) + self.cpp_kernel_key = "qconv2d_pointwise" + self.cpp_op_schema = """ + at::Tensor( + at::Tensor act, + double act_scale, + int64_t act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + c10::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double inv_output_scale, + int64_t output_zero_point, + c10::optional output_dtype, + c10::string_view attr, + torch::List> scalars, + c10::optional algorithm)""" + + def codegen(self, wrapper): + # Parser the inputs and constant + args = [x.codegen_reference() for x in self.inputs] + const_args = [] + const_args.extend(self.codegen_const_args()) + + x = args[0] + packed_weight = args[1] + bias = args[2] if self.has_bias else const_args[0] + w_scale, w_zp = args[-2], args[-1] + ( + stride, + padding, + dilation, + groups, + x_scale, + x_zp, + o_inv_scale, + o_zp, + output_dtype, + unary_attr, + unary_scalars, + unary_algorithm, + ) = const_args[-12:] + + codegen_args = ( + x, + x_scale, + x_zp, + packed_weight, + w_scale, + w_zp, + bias, + stride, + padding, + dilation, + groups, + o_inv_scale, + o_zp, + output_dtype, + unary_attr, + unary_scalars, + unary_algorithm, + ) + wrapper.generate_extern_kernel_alloc_and_find_schema_if_needed( + self.get_name(), + self.get_kernel_name(), + codegen_args, + self.cpp_op_schema, + self.cpp_kernel_key, + ) + if isinstance(self.layout, Layout): + self.codegen_size_asserts(wrapper) + + @classmethod + def create( + cls, + x: "TensorBox", + x_scale: float, + x_zp: int, + 
weight: "TensorBox", # packed_weight + w_scale: "TensorBox", + w_zp: "TensorBox", + bias: "TensorBox", + stride_: List[int], + padding_: List[int], + dilation_: List[int], + groups: int, + o_inv_scale: float, + output_zero_point: int, + output_dtype, + unary_attr, + unary_scalars, + unary_algorithm, + ): + transposed = False + output_padding = None + (inputs, constant_args, kernel_layout, _) = _prepare_convolution_fusion_create( + cls, + x, + weight, + bias, + padding_, + stride_, + dilation_, + groups, + transposed, + output_padding, + ) + # swap padding and stride to align with functional conv arg order + if bias is None: + constant_args[1], constant_args[2] = constant_args[2], constant_args[1] + else: + constant_args[0], constant_args[1] = constant_args[1], constant_args[0] + + w_scale.realize() + w_zp.realize() + inputs = inputs + [w_scale, w_zp] + constant_args = constant_args + [ + x_scale, + x_zp, + o_inv_scale, + output_zero_point, + output_dtype, + unary_attr, + may_convert_to_optional(unary_scalars), + unary_algorithm, + ] + + if output_dtype is not None: + assert output_dtype in [torch.float32, torch.bfloat16] + # in _prepare_convolution_fusion_create, we use x.dtype (uint8) to create kernel_layout + # if we set output_dtype is not None, the output buf should be output_dtype instead of uint8. + kernel_layout.dtype = output_dtype + + return QConvPointWisePT2E( + layout=kernel_layout, + inputs=inputs, + constant_args=constant_args, + ) + + +class QConvPointWiseBinaryPT2E(ExternKernelAlloc): + def __init__( + self, + layout, + inputs, + constant_args=(), + ): + """ + Needs input/weight/output qparams + if bias is not None + - inputs = [x, w, b, accum, w_scale, w_zp] + - const_args = [stride, padding, dilation, groups, x_scale, x_zp, accum_scale, accum_zp, o_inv_scale, o_zp, + fp32_output, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm] + else + - inputs = [x, w, accum, w_scale, w_zp] + - const_args = const_args is: [bias, stride, padding, dilation, groups, x_scale, x_zp, accum_scale, + accum_zp, o_inv_scale, o_zp, fp32_output, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm] + """ + self.has_bias = len(inputs) == 6 + self.idx_for_inplace_sum = 3 if self.has_bias else 2 + super().__init__( + layout, + inputs, + constant_args, + None, + python_kernel_name="torch.ops.onednn.qconv2d_pointwise.binary", + cpp_kernel_name="onednn::qconv2d_pointwise", + ) + self.cpp_kernel_overload_name = "binary" + self.cpp_kernel_key = "qconv2d_pointwise_binary" + self.cpp_op_schema = """ + at::Tensor( + at::Tensor act, + double act_scale, + int64_t act_zero_point, + at::Tensor accum, + double accum_scale, + int64_t accum_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + c10::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double inv_output_scale, + int64_t output_zero_point, + c10::optional output_dtype, + c10::string_view binary_attr, + c10::optional alpha, + c10::optional attr, + torch::List> scalars, + c10::optional algorithm)""" + + def codegen(self, wrapper): + # Parser the inputs and constant + args = [x.codegen_reference() for x in self.inputs] + const_args = [] + const_args.extend(self.codegen_const_args()) + + x = args[0] + packed_weight = args[1] + bias = args[2] if self.has_bias else const_args[0] + accum, w_scale, w_zp = args[-3], args[-2], args[-1] + ( + stride, + padding, + dilation, + groups, + x_scale, + x_zp, + accum_scale, + accum_zp, + o_inv_scale, + 
o_zp, + output_dtype, + binary_attr, + alpha, + unary_attr, + unary_scalars, + unary_algorithm, + ) = const_args[-16:] + conv_args = ( + x, + x_scale, + x_zp, + accum, + accum_scale, + accum_zp, + packed_weight, + w_scale, + w_zp, + bias, + stride, + padding, + dilation, + groups, + o_inv_scale, + o_zp, + output_dtype, + binary_attr, + alpha, + unary_attr, + unary_scalars, + unary_algorithm, + ) + wrapper.generate_extern_kernel_alloc_and_find_schema_if_needed( + self.get_name(), + self.get_kernel_name(), + conv_args, + self.cpp_op_schema, + self.cpp_kernel_key, + self.cpp_kernel_overload_name, + ) + if isinstance(self.layout, Layout): + self.codegen_size_asserts(wrapper) + + def get_mutation_names(self): + return [self.inputs[self.idx_for_inplace_sum].get_name()] + + def get_unbacked_symbol_defs(self) -> Set[sympy.Symbol]: + return set() + + @classmethod + def create( + cls, + x: "TensorBox", + x_scale, + x_zp, + accum: "TensorBox", + accum_scale, + accum_zp, + weight: "TensorBox", # packed_weight + w_scale, + w_zp, + bias: "TensorBox", + stride_: List[int], + padding_: List[int], + dilation_: List[int], + groups: int, + o_inv_scale: "TensorBox", + output_zero_point: "TensorBox", + output_dtype, + binary_attr, + alpha, + unary_attr, + unary_scalars, + unary_algorithm, + ): + transposed = False + output_padding = None + ( + inputs, + constant_args, + kernel_layout, + req_stride_order, + ) = _prepare_convolution_fusion_create( + cls, + x, + weight, + bias, + padding_, + stride_, + dilation_, + groups, + transposed, + output_padding, + ) + + accum = cls.require_stride_order(accum, req_stride_order) + inputs.append(accum) + + # swap padding and stride to align with functional conv arg order + if bias is None: + constant_args[1], constant_args[2] = constant_args[2], constant_args[1] + else: + constant_args[0], constant_args[1] = constant_args[1], constant_args[0] + + w_scale.realize() + w_zp.realize() + inputs = inputs + [w_scale, w_zp] + constant_args = constant_args + [ + x_scale, + x_zp, + accum_scale, + accum_zp, + o_inv_scale, + output_zero_point, + output_dtype, + binary_attr, + alpha, + unary_attr, + may_convert_to_optional(unary_scalars), + unary_algorithm, + ] + + assert ( + binary_attr == "sum" + ), "For now, only post op sum is supported in QConvPointWiseBinaryPT2E." + + packed = QConvPointWiseBinaryPT2E( + layout=NoneLayout(accum.get_device()), + inputs=inputs, + constant_args=constant_args, + ) + mark_node_as_mutating(packed, accum) + + # Return accum since it has been inplace changed. 
+ return packed.inputs[packed.idx_for_inplace_sum] + + +class QLinearPointwisePT2E(ExternKernelAlloc): + def __init__( + self, + layout, + inputs, + constant_args=(), + has_bias=True, + x_scale_zp_are_tensors=False, + ): + """ + if bias is not None + - inputs = [x, w, b, weight_scale, weight_zp] + - const_args is: [x_scale, x_zp, o_inv_scale, o_zp, + fp32_output, unary_attr, unary_scalars, unary_algorithm] + else + - inputs = [x, w, weight_scale, weight_zp] + - const_args is: [bias, x_scale, x_zp, o_inv_scale, o_zp, + fp32_output, unary_attr, unary_scalars, unary_algorithm] + """ + self.has_bias = has_bias + self.x_scale_zp_are_tensors = x_scale_zp_are_tensors + super().__init__( + layout, + inputs, + constant_args, + None, + python_kernel_name=( + "torch.ops.onednn.qlinear_pointwise.tensor" + if x_scale_zp_are_tensors + else "torch.ops.onednn.qlinear_pointwise.default" + ), + cpp_kernel_name="onednn::qlinear_pointwise", + ) + self.cpp_kernel_overload_name = "tensor" if x_scale_zp_are_tensors else "" + self.cpp_kernel_key = "qlinear_pointwise" + x_scale_type_str, x_zp_type_str = ( + ("at::Tensor", "at::Tensor") + if x_scale_zp_are_tensors + else ("double", "int64_t") + ) + self.cpp_op_schema = f""" + at::Tensor( + at::Tensor act, + {x_scale_type_str} act_scale, + {x_zp_type_str} act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + c10::optional bias, + double inv_output_scale, + int64_t output_zero_point, + c10::optional output_dtype, + std::string post_op_name, + torch::List> post_op_args, + std::string post_op_algorithm)""" + + def codegen(self, wrapper): + # Parser the inputs and constant + args = [x.codegen_reference() for x in self.inputs] + const_args = [] + const_args.extend(self.codegen_const_args()) + + x = args[0] + packed_weight = args[1] + bias = args[2] if self.has_bias else const_args[0] + w_scale, w_zp = args[-2], args[-1] + if self.x_scale_zp_are_tensors: + assert len(args) >= 4 + x_scale, x_zp = args[-4], args[-3] + ( + o_inv_scale, + o_zp, + output_dtype, + unary_attr, + unary_scalars, + unary_algorithm, + ) = const_args[-6:] + else: + assert len(const_args) >= 8 + ( + x_scale, + x_zp, + o_inv_scale, + o_zp, + output_dtype, + unary_attr, + unary_scalars, + unary_algorithm, + ) = const_args[-8:] + + codegen_args = ( + x, + x_scale, + x_zp, + packed_weight, + w_scale, + w_zp, + bias, + o_inv_scale, + o_zp, + output_dtype, + unary_attr, + unary_scalars, + unary_algorithm, + ) + wrapper.generate_extern_kernel_alloc_and_find_schema_if_needed( + self.get_name(), + self.get_kernel_name(), + codegen_args, + self.cpp_op_schema, + self.cpp_kernel_key, + self.cpp_kernel_overload_name, + ) + if isinstance(self.layout, Layout): + self.codegen_size_asserts(wrapper) + + @classmethod + def create( + cls, + x: "TensorBox", + x_scale: float, + x_zp: int, + weight: "TensorBox", # packed_weight + w_scale: "TensorBox", + w_zp: "TensorBox", + bias: "TensorBox", + o_inv_scale: float, + output_zero_point: int, + output_dtype, + unary_attr, + unary_scalars, + unary_algorithm, + ): + (inputs, constant_args, kernel_layout, _) = _prepare_linear_fusion_create( + cls, + x, + weight, + bias, + ) + + if isinstance(x_scale, TensorBox) and isinstance(x_zp, TensorBox): + x_scale.realize() + x_zp.realize() + inputs = inputs + [x_scale, x_zp] + x_scale_zp_are_tensors = True + else: + assert isinstance(x_scale, float) and isinstance(x_zp, int) + constant_args = constant_args + [x_scale, x_zp] + x_scale_zp_are_tensors = False + w_scale.realize() + w_zp.realize() + 
inputs = inputs + [w_scale, w_zp] + constant_args = constant_args + [ + o_inv_scale, + output_zero_point, + output_dtype, + unary_attr, + may_convert_to_optional(unary_scalars), + unary_algorithm, + ] + + if output_dtype is not None: + assert output_dtype in [torch.float32, torch.bfloat16] + # in _prepare_linear_fusion_create, we use x.dtype (uint8) to create kernel_layout + # if we set fp32_output, the output buf should be dtype float32 instead of uint8. + kernel_layout.dtype = output_dtype + + return QLinearPointwisePT2E( + layout=kernel_layout, + inputs=inputs, + constant_args=constant_args, + has_bias=(bias is not None), + x_scale_zp_are_tensors=x_scale_zp_are_tensors, + ) + + +@dataclasses.dataclass +class MutableBox(IRNode): + """ + TensorBox / StorageBox allow in-place mutation of Tensors + """ + + data: IRNode + + def __getattr__(self, name): + fn = getattr(self.data, name) + if callable(fn): + return fn + raise AttributeError(f"{type(self.data).__name__}.{name} not callable") + + def realize(self): + return self.data.realize() + + def get_unbacked_symbol_uses(self) -> Set[sympy.Symbol]: + return self.data.get_unbacked_symbol_uses() + + def codegen_reference(self, writer=None): + return self.data.codegen_reference(writer) + + @property + def layout(self): + return self.data.layout # type: ignore[attr-defined] + + def get_layout(self): + return self.layout + + def get_size(self): + return self.data.get_size() + + @property + def dtype(self): + return self.data.dtype + + def __str__(self): + if isinstance(self.data, MutableBox): + line0 = f"{type(self).__name__}({type(self.data).__name__}(" + endl = "))" + inner = self.data.data + else: + line0 = f"{type(self).__name__}(" + inner = self.data + endl = ")" + + lines = [ + line0, + indent(str(inner)), + endl, + ] + return "\n".join(lines) + + __repr__ = __str__ + + +class TensorBox(MutableBox): + @staticmethod + def create(data): + return TensorBox(StorageBox(data)) + + +class StorageBox(MutableBox): + def is_input_buffer(self): + if isinstance(self.data, (InputBuffer, ReinterpretView)): + return self.data.get_name() in V.graph.graph_inputs + return False + + def realize(self): + if isinstance( + self.data, + ( + ComputedBuffer, + InputsKernel, + InputBuffer, + ReinterpretView, + TemplateBuffer, + ), + ): + return self.data.get_name() + assert isinstance(self.data, (Pointwise, Reduction, Scan)), type(self.data) + origin_node = self.data.get_origin_node() + traceback = self.data.get_traceback() + self.data = ComputedBuffer( + name=None, + layout=FlexibleLayout( + device=self.data.get_device(), + dtype=self.data.get_dtype(), + size=self.data.get_size(), + ), + data=self.data, + ) + self.data.name = V.graph.register_buffer(self.data) + self.data.origins = self.origins + self.data.origin_node = origin_node + self.data.traceback = traceback + return self.data.name + + def realize_hint(self): + """ + Called on buffers we expect to be forced to realize later. + """ + if ( + isinstance(self.data, (Pointwise, Reduction)) + and self.num_reads() > 1 + and self.is_pointwise_non_scalar_tensor_num_reads_larger_than_one() + ): + self.realize() + + def has_exceeded_max_reads(self): + return isinstance(self.data, Pointwise) and ( + self.num_reads() > config.realize_acc_reads_threshold + or self.has_large_inner_fn() + ) + + def mark_reuse(self, users): + """ + A heuristic to decide if we should realize a tensor + that is used multiple times. 
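+        Realizing converts the pending Pointwise/Reduction into a ComputedBuffer,
+        so the value is materialized once instead of being recomputed by each user.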
+        """
+
+        def should_realize_on_cpu(loops: Union[Pointwise, Reduction]):
+            """
+            The heuristic for realizing reused result of heavy ops on cpu
+            """
+            heavy_ops = ["exp"]  # a list of heavy ops
+            fn_str = loops.inner_fn_str()
+            return any((op + "(") in fn_str for op in heavy_ops)
+
+        if (
+            users > 1
+            and isinstance(self.data, (Pointwise, Reduction))
+            and (
+                self.num_reads() > config.realize_reads_threshold
+                or self.has_large_inner_fn()
+                or (is_cpu(self.data) and should_realize_on_cpu(self.data))
+            )
+        ):
+            self.realize()
+
+    @cache_on_self
+    def num_reads(self):
+        data = self.data
+        if isinstance(data, (InputsKernel, InputBuffer, ReinterpretView)):
+            return 1
+        if isinstance(data, ComputedBuffer):
+            read_writes = data.get_read_writes()
+        else:
+            assert isinstance(data, (Pointwise, Reduction)), type(data)
+            read_writes = ComputedBuffer(
+                name=None,
+                layout=FlexibleLayout(
+                    device=data.get_device(),
+                    dtype=data.get_dtype(),
+                    size=data.get_size(),
+                ),
+                data=data,
+            ).get_read_writes()
+        return len(read_writes.reads)
+
+    @cache_on_self
+    def is_pointwise_non_scalar_tensor_num_reads_larger_than_one(self):
+        # Skip the check for non Pointwise instances
+        return (
+            (sum(read.index != 0 for read in self.data.get_reads()) > 1)
+            if isinstance(self.data, Pointwise)
+            and all(
+                not isinstance(read, dependencies.StarDep)
+                for read in self.data.get_reads()
+            )
+            else True
+        )
+
+
+@dataclasses.dataclass
+class Subgraph(IRNode):
+    name: str
+    graph_module: torch.fx.GraphModule
+    graph: Optional["GraphLowering"] = None
+
+
+@dataclasses.dataclass
+class Conditional(ExternKernel):
+    predicate: Optional[DynamicScalar] = None
+    operands: Optional[List[TensorBox]] = None
+    true_subgraph: Optional[Subgraph] = None
+    false_subgraph: Optional[Subgraph] = None
+    outputs: Optional[List[MultiOutput]] = None
+
+    def __init__(
+        self,
+        predicate: DynamicScalar,
+        operands: List[TensorBox],
+        true_subgraph: Subgraph,
+        false_subgraph: Subgraph,
+        layout: MultiOutputLayout,
+    ):
+        self.predicate = predicate
+        self.operands = operands
+        self.true_subgraph = true_subgraph
+        self.false_subgraph = false_subgraph
+
+        super().__init__(
+            name=None,
+            layout=layout,  # type: ignore[arg-type]
+            inputs=[predicate, *operands],  # type: ignore[list-item]
+        )
+
+        self.name = V.graph.register_buffer(self)
+
+    @classmethod
+    def create(
+        cls,
+        predicate: TensorBox,
+        true_fn: Subgraph,
+        false_fn: Subgraph,
+        operands: List[TensorBox],
+    ):
+        predicate = cls.realize_input(predicate)
+        operands = [cls.realize_input(x) for x in operands]
+
+        fx_operands = V.graph.current_node.args[-1]
+        fake_operands = [x.meta["val"] for x in fx_operands]  # type: ignore[union-attr]
+
+        for subgraph in (true_fn, false_fn):
+            if subgraph.graph is None:
+                # create and lower subgraphs
+                subgraph.graph = V.graph.make_subgraph(
+                    gm=subgraph.graph_module,
+                    example_inputs=fake_operands,
+                    subgraph_name=subgraph.name,
+                )
+                with V.set_graph_handler(subgraph.graph):
+                    subgraph.graph.run(*fake_operands)
+
+        true_outputs = true_fn.graph.graph_outputs  # type: ignore[union-attr]
+        false_outputs = false_fn.graph.graph_outputs  # type: ignore[union-attr]
+
+        def _aliased_buffers(outputs):
+            buffers = [
+                output.unwrap_view() if isinstance(output, ReinterpretView) else output
+                for output in outputs
+            ]
+            # assuming the same buffer is represented by the same IRNode object
+            return len({id(buffer) for buffer in buffers}) < len(outputs)
+
+        for name, outputs in (("true_fn", true_outputs), ("false_fn", false_outputs)):
+            if 
_aliased_buffers(true_outputs): + raise AssertionError( + "Output aliasing is currently not supported in compiled torch.cond. " + f"The outputs of the {name} subgraph of torch.cond are aliased: {outputs}" + ) + + # make sure true and false outputs are structurally equivalent + assert len(true_outputs) == len(false_outputs), (true_outputs, false_outputs) + for i, (to, fo) in enumerate(zip(true_outputs, false_outputs)): + assert to.get_size() == fo.get_size(), (i, to, fo) + assert to.get_stride() == fo.get_stride(), (i, to, fo) + assert to.get_device() == fo.get_device(), (i, to, fo) + assert to.get_dtype() == fo.get_dtype(), (i, to, fo) + assert to.get_layout().offset == fo.get_layout().offset, (i, to, fo) + + conditional = Conditional( + predicate=predicate, + operands=operands, + true_subgraph=true_fn, + false_subgraph=false_fn, + # use predicate device for consistent codegen-ing + layout=MultiOutputLayout(predicate.get_device()), + ) + + outputs = [ + MultiOutput( + FixedLayout( + device=output.get_device(), + dtype=output.get_dtype(), + size=output.get_size(), + stride=output.get_stride(), + offset=output.get_layout().offset, + ), + conditional, + [(list, i)], + ) + # as the true and false outputs are equivalent, + # we can use either of them here as a "template" + for i, output in enumerate(true_outputs) + ] + + conditional.outputs = outputs + return outputs + + def codegen(self, wrapper): + wrapper.codegen_conditional(self) + + +class InterpreterShim(torch.fx.Interpreter): + @staticmethod + @functools.lru_cache(None) + def _dummy_gm(): + return torch.fx.symbolic_trace(identity) + + def __init__(self, graph, submodules): + # call super() with a placeholder to avoid constructing a + # GraphModule which is very expensive (it does codegen). + super().__init__(self._dummy_gm(), garbage_collect_values=False) + self.module = self # type: ignore[assignment] + self.graph = graph + self.submodules = submodules + self.extra_traceback = False + self.fetch_attr = submodules.__getitem__ + self.current_node = None + + def run_node(self, n: torch.fx.Node) -> Any: + self.current_node = n + return super().run_node(n) + + def run(self, *args, **kwargs): + with V.set_interpreter_handler(self): + return super().run(*args, **kwargs) + + +class LoopBody: + """ + Captures the body of a Loops subclass into an FX graph. Persists any + indexing simplifications and makes it easier to analyze loop bodies. 
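+    Captured indexing expressions are interned as index0, index1, ... and are
+    looked up at interpretation time through the "get_index" submodule.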
+ """ + + def __init__(self, fn, args, var_ranges): + super().__init__() + self.var_ranges = var_ranges + self.indexing_exprs = {} + self.indexing_exprs_name = {} + self.reads = [] + self.writes = [] + self.reads_name2expr = {} + self.writes_name2expr = {} + self.other = [] + self.submodules = {"get_index": self.get_index} + self.subblocks = {} + self.indirect_vars = [] + self.root_block = LoopBodyBlock(self, fn, args) + self.indexing = None + + @cache_on_self + def get_nodes(self): + all_graphs = itertools.chain( + (self.root_block.graph,), + (block.graph for block in self.subblocks.values()), + ) + return [node for graph in all_graphs for node in graph.nodes] + + @cache_on_self + def bounds(self): + # Doing a local import to avoid dumping all the code here + from .bounds import BoundVars + + return BoundVars(self) + + def debug_str(self): + lines = [f"var_ranges = {dict(self.var_ranges)}"] + lines.extend([f"{name} = {val}" for name, val in self.indexing_exprs.items()]) + lines.extend( + [ + block.debug_str(name) + for name, block in itertools.chain( + [("body", self.root_block)], self.subblocks.items() + ) + ] + ) + return "\n".join(lines) + + def add_index_expr(self, expr: sympy.Expr, category, buf_name): + getattr(self, category).append(expr) + if buf_name is not None: + getattr(self, f"{category}_name2expr")[buf_name] = expr + if expr not in self.indexing_exprs_name: + name = f"index{len(self.indexing_exprs)}" + self.indexing_exprs_name[expr] = name + self.indexing_exprs[name] = expr + return self.indexing_exprs_name[expr] + + def add_submodule(self, block, prefix): + """Not actually for nn.Modules, but subblocks in generated code are mapped to FX call_module opcodes""" + if prefix[-1].isnumeric() and prefix not in self.submodules: + name = prefix + else: + name = f"{prefix}{len(self.submodules)}" + self.submodules[name] = block + return name + + def add_indirect(self, size): + name = f"indirect{len(self.indirect_vars)}" + var = sympy_index_symbol(name) + self.indirect_vars.append(var) + return var + + def replace_indirect(self, old, new): + """Swap in a variable used in indirect indexing""" + if str(old) == str(new): + return + assert self.indexing is not None + self.indexing = {k: sympy_subs(v, {old: new}) for k, v in self.indexing.items()} + + def get_index(self, name): + assert self.indexing is not None + return self.indexing[name] + + def __call__(self, *indices): + index = list(itertools.chain.from_iterable(indices)) + assert len(index) == len(self.var_ranges), (index, self.var_ranges) + assert all(v not in self.var_ranges for v in index) + replacements = dict(zip(self.var_ranges.keys(), index)) + self.indexing = { + name: sympy_subs(expr, replacements) + for name, expr in self.indexing_exprs.items() + } + result = self.root_block() + self.indexing = None + return result + + +class LoopBodyBlock: + """ + Captures the body of a Loops subclass into an FX graph. + In normal cases there will be a 1:1 mapping between LoopBody and + LoopBodyBlock, hower in the case of ops.masked() the masked out + operations will manifest as an extra LoopBodyBlock. 
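+    Subblocks are registered via LoopBody.add_submodule and appear in the traced
+    FX graph as call_module nodes.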
+ """ + + def __init__(self, body: LoopBody, fn: Callable[..., Any], args: List[Any]): + self.body = body + + def add_index(expr, category, buf_name=None): + return tracer.create_proxy( + "call_module", + "get_index", + (self.body.add_index_expr(expr, category, buf_name),), + {}, + ) + + class CaptureIndexing(V.WrapperHandler): # type: ignore[name-defined] + self.name = "CaptureIndexing" + + def load(self, name: str, index: sympy.Expr): + index = add_index(index, "reads", name) + return self._inner.load(name, index) + + def store(self, name, index, value, mode=None): + index = add_index(index, "writes", name) + return self._inner.store(name, index, value, mode) + + def store_reduction(self, name, index, value): + index = add_index(index, "writes", name) + return self._inner.store_reduction(name, index, value) + + def reduction(self, dtype, src_dtype, reduction_type, value): + result = self._inner.reduction(dtype, src_dtype, reduction_type, value) + if "welford" in reduction_type: + return tuple(result[i] for i in range(3)) + return result + + def index_expr(self, index, dtype): + if isinstance(index, (int, sympy.Integer)): + return self._inner.constant(int(index), dtype) + index = add_index(index, "other") + return self._inner.index_expr(index, dtype) + + def bucketize( + self, + values, + offsets_name: str, + offsets_size: sympy.Expr, + indexing_dtype: torch.dtype, + right: bool, + ): + offsets_size = add_index(offsets_size, "other") + return self._inner.bucketize( + values, offsets_name, offsets_size, indexing_dtype, right + ) + + @staticmethod + def masked(mask_proxy, masked_body: Callable[..., Any], other_proxy): + """ + Recursively capture the masked out body in another LoopBodyBlock + """ + + subblock: LoopBodyBlock + + def shim(mask, other): + return V.ops.masked(mask, subblock, other) + + name = self.body.add_submodule(shim, "masked_subblock") + subblock = LoopBodyBlock(self.body, masked_body, []) + self.body.subblocks[name] = subblock + return tracer.create_proxy( + "call_module", name, (mask_proxy, other_proxy), {} + ) + + @staticmethod + def scan( + dtype_proxy, combine_fn: Callable[..., Any], value_proxy, init_proxy + ): + def shim(dtype, value, init): + return V.ops.scan(dtype, combine_fn, value, init) + + name = self.body.add_submodule(shim, "scan") + return tracer.create_proxy( + "call_module", name, (dtype_proxy, value_proxy, init_proxy), {} + ) + + def frexp(self, value_proxy): + result = self._inner.frexp(value_proxy) + # Proxies are iterable, but some methods expect tuples/lists + return (result[0], result[1]) + + @staticmethod + def indirect_indexing(index_proxy, size, check=True): + """ + Flow data from tensors into indexing formulas. + Introduce a call_module to update the indexing. 
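+                The emitted call_module invokes set_indirect, which substitutes the
+                fresh indirect variable into the captured indexing expressions via
+                LoopBody.replace_indirect.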
+ """ + + var = self.body.add_indirect(size) + + def set_indirect(new_var): + self.body.replace_indirect( + var, V.ops.indirect_indexing(new_var, size, check) + ) + + tracer.create_proxy( + "call_module", + self.body.add_submodule(set_indirect, f"set_{var}"), + (index_proxy,), + {}, + ) + return var + + @staticmethod + def output(result): + tracer.create_proxy("output", "output", (result,), {}) + + tracer = torch.fx.Tracer() + tracer.graph = torch.fx.Graph(tracer_cls=tracer.__class__) + proxy_ops = tracer.create_proxy("placeholder", "ops", (), {}) + + from .index_propagation import IndexPropagation + from .sizevars import SimplifyIndexing + + handler: Any = SimplifyIndexing( + CaptureIndexing(proxy_ops), self.body.var_ranges + ) + if config.constant_and_index_propagation: + handler = IndexPropagation(handler) + + with V.set_ops_handler(handler): + # This indirection is just a cute way to get IndexPropagation to + # unwrap the return value. + ops.output(fn(*args)) + self.graph = tracer.graph + + def __call__(self): + graph = self.graph + submodules = self.body.submodules + + return InterpreterShim(graph, submodules).run(V.get_ops_handler()) + + def debug_str(self, name="block"): + code = torch.fx.GraphModule(self.body.submodules, self.graph).code + return re.sub( + # strip `; del var0` suffixes to make output prettier + r";[^\n]*", + "", + code.strip().replace("def forward(", f"def {name}("), + ) + + +class Wait(ExternKernelAlloc): + """ + Wait should not be used by itself. It should always be constructed in tandem + with a collective op that produces a work to wait on. + """ + + def __init__( + self, + layout, + inputs, + constant_args=(), + ): + super().__init__(layout, inputs, constant_args) + + def should_allocate(self): + return False + + def codegen(self, wrapper): + from .codegen.wrapper import ReuseLine + + wrapper.add_import_once( + "from torch.distributed._functional_collectives_impl import _wait_tensor" + ) + (input_collective,) = (t.codegen_reference() for t in self.inputs) + wrapper.writeline(f"{input_collective} = _wait_tensor({input_collective})") + + # wait op still needs to produce a 'buffer' that represents the tensor output. + # this is a symbolic gesture, and it gets handled by WrapperCodegen. + # codegen outputs a '# reuse' line that assigns the input buffer here ('input_collective') + # to a new name (`self.get_name()`) and `del`s the old name. + wrapper.writeline(ReuseLine(wrapper, self.inputs[0], self, delete_old=False)) + + @classmethod + def create(cls, collective_op: "TensorBox"): + # TODO(whc) i'm not sure what's going on here, this probably means I missed something upstream + collective_op.decide_layout() + return Wait( + layout=AliasedLayout(collective_op), + inputs=[collective_op], + ) + + def get_alias_names(self): + # Signal to codegen that our output buffer isn't safe to reuse + return [self.inputs[0].codegen_reference()] + + def get_mutation_names(self): + # The generated `_wait_tensor` op mutates the input tensor + return [self.inputs[0].codegen_reference()] + + +class CollectiveKernel(ExternKernel): + """ + Each collective should follow the pattern: + - extend InPlaceCollectiveKernel or OutOfPlaceCollectiveKernel. 
+ - the kernel delegates into c10d processgroup, which returns a 'work' obj + - the work obj is registered via _register_tensor_work so it can be waited on later + """ + + def __init__(self, layout, inputs, constant_args): + super().__init__(None, layout, inputs, constant_args) + self.name = V.graph.register_buffer(self) + + def should_emit_register_tensor_work(self): + return True + + def should_emit_find_or_create_pg(self): + return True + + def codegen_collective(self, wrapper, output_name, input_names): + # factor so the boilerplate can be handled in CollectiveKernel.codegen + raise NotImplementedError("Must implement") + + def codegen_output(self, wrapper, output_name, input_names): + # factor so the boilerplate can be handled in CollectiveKernel.codegen + raise NotImplementedError("Must implement") + + @classmethod + def wrap_inputs_as_inplace(cls, inputs): + def wrap_input(var): + op = InPlaceHint( + FlexibleLayout(var.get_device(), var.get_dtype(), var.get_size()), var + ) + return TensorBox.create(op) + + return list(map(wrap_input, inputs)) + + def codegen(self, wrapper): + wrapper.add_import_once("import torch.distributed as dist") + wrapper.add_import_once("import torch.distributed.distributed_c10d as c10d") + wrapper.add_import_once( + "import torch.distributed._functional_collectives_impl as fun_col_impl" + ) + # extract references to our args in string form for codegen output + input_names = [t.codegen_reference() for t in self.inputs] + output_name = self.get_name() + tag, ranks, group_size = self.constant_args + + if self.should_emit_find_or_create_pg(): + # TODO: avoid more than one ref of the same pg (even though they are cached inside the api) + wrapper.writeline( + f"{output_name}_pg = c10d._find_or_create_pg_by_ranks_and_tag('{tag}', {ranks}, {group_size})" + ) + + self.codegen_output(wrapper, output_name, input_names) + self.codegen_collective(wrapper, output_name, input_names) + if self.should_emit_register_tensor_work(): + wrapper.writeline( + f"fun_col_impl._register_tensor_work({output_name}, {output_name}_work)" + ) + + +class InPlaceCollectiveKernel(CollectiveKernel): + """ + InPlaceCollectiveKernel are those with in-out arguments such as all_reduce. + Extend this kernel if your collective needs to modify its inputs in-place. + """ + + def __init__(self, layout, inputs, constant_args): + super().__init__(layout, inputs, constant_args) + + def should_allocate(self): + return False + + def has_side_effects(self): + return True + + def codegen_output(self, wrapper, output_name, input_names): + if len(input_names) > 1: + wrapper.writeline(f"{output_name} = [{','.join(input_names)}] ") + else: + wrapper.writeline(f"{output_name} = {input_names[0]}") + + +class OutOfPlaceCollectiveKernel(CollectiveKernel): + """ + OutOfPlaceCollectiveKernel are those that allocate their + outputs and leave their inputs inplace, such as all_gather. + """ + + def __init__(self, layout, inputs, outputs, constant_args): + super().__init__(layout, inputs + outputs, constant_args) + self.outputs = outputs + self.original_inputs = inputs + # NOTE: As seen in issue #108780, output buffers of out-of-place collectives + # could be incorrectly reused. As a safety measure, here we just ban the reuse of them. + # TODO: A better fix is to figure out how to propagate the aliases properly, + # so that the buffer is only reused after all its users have consumed it. 
+ for x in self.outputs: + V.graph.never_reuse_buffers.add(x.name) + + def should_allocate(self): + return False + + def has_side_effects(self): + return True + + def codegen_output(self, wrapper, output_name, input_names): + input_names = [t.codegen_reference() for t in self.original_inputs] + wrapper.writeline(f"{output_name}_inputs = [{','.join(input_names)}]") + wrapper.writeline(f"{output_name} = [{','.join(x.name for x in self.outputs)}]") + + @classmethod + def create_output_buffers(cls, inputs, size_cb=None): + outputs = [] + for input in inputs: + new_size = input.get_size() + if size_cb is not None: + size_cb(new_size) + # new_size[0] *= group_size + + buff = OutputBuffer( + layout=FlexibleLayout( + device=input.get_device(), + dtype=input.get_dtype(), + size=new_size, + ), + ) + outputs.append(buff) + return outputs + + @classmethod + def create_output_nodes(cls, coll, output_buffers): + return [ + MultiOutputNoSizeAssert( + out_t.layout, + coll, + f"[{i}]", + ) + for i, out_t in enumerate(output_buffers) + ] + + +class InPlaceHint(ExternKernel): + """ + Helper OP to encode an in/out argument that tries to make it inplace whenever possible. + Wrap the input of your inplace op to enable this behavior. + + The design is based on two key decisions: + - this node is responsible for allocating the in/out buffer used by the collective. + This is controlled by the ``should_allocate`` method that returns True here and + False for the collective node + - The scheduler special-case this node and enable it to reuse its input. + """ + + def codegen(self, wrapper): + input_name = self.inputs[0].codegen_reference() + output_name = self.get_name() + if not wrapper.did_reuse(self, self.inputs[0]): + wrapper.writeline(f"{output_name}.copy_({input_name}) #no reuse") + + def __init__(self, layout, input): + input = self.realize_input(input) + super().__init__(None, layout, self.unwrap_storage([input]), ()) + self.name = V.graph.register_buffer(self) + + def should_allocate(self): + return True + + +class OutputBuffer(ExternKernel): + """ + Represent the output buffer used by ops that require multiple of them + """ + + def __init__(self, layout): + super().__init__(name=None, layout=layout, inputs=[]) + self.name = V.graph.register_buffer(self) + + def should_allocate(self): + return True + + def codegen(self, wrapper): + wrapper.writeline(f"# collective out buffer {self.name}") + + +class MultiOutputNoSizeAssert(MultiOutput): + """ + Extract partial output from a multi-output OP. + Works like MultiOutput but doesn't assert size. This must be a property guaranteed by the op emitting this. 
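+    The ``index`` argument is a literal subscript string such as "[0]" that codegen
+    appends verbatim to the producing kernel's name.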
+ """ + + def __init__(self, layout, input, index): + super().__init__(layout, input, []) + self.index = index + + def codegen(self, wrapper): + wrapper.writeline( + f"{self.get_name()} = {self.inputs[0].get_name()}{self.index}" + ) + + +class Broadcast(InPlaceCollectiveKernel): + def __init__(self, layout, inputs, constant_args, src): + super().__init__(layout, inputs, constant_args) + self.src = src + + def get_mutation_names(self): + return [self.inputs[0].get_name()] + + def get_unbacked_symbol_defs(self) -> Set[sympy.Symbol]: + return set() + + @classmethod + def create( + cls, x: "TensorBox", src: int, tag: str, ranks: List[int], group_size: int + ): + inplace_inputs = cls.wrap_inputs_as_inplace([x]) + packed = Broadcast( + layout=NoneLayout(inplace_inputs[0].get_device()), # type: ignore[arg-type] + inputs=inplace_inputs, + constant_args=[tag, ranks, group_size], + src=src, + ) + mark_node_as_mutating(packed, inplace_inputs[0]) + return inplace_inputs[0] + + def codegen_collective(self, wrapper, output_name, input_names): + wrapper.writeline( + f"{output_name}_work = dist.broadcast(" + f"{output_name}, async_op=True, group={output_name}_pg, src={self.src})" + ) + + +class AllReduceCoalesced(InPlaceCollectiveKernel): + def __init__(self, layout, inputs, constant_args, reduce_op): + super().__init__(layout, inputs, constant_args) + self.reduce_op = reduce_op + + def should_allocate(self): + return False + + def get_mutation_names(self): + return [self.inputs[0].get_name()] + + def get_unbacked_symbol_defs(self) -> Set[sympy.Symbol]: + return set() + + @classmethod + def create( + cls, + inputs: List["TensorBox"], + reduce_op: str, + tag: str, + ranks: List[int], + group_size: int, + ): + inplace_inputs = cls.wrap_inputs_as_inplace(inputs) + packed = AllReduceCoalesced( + layout=NoneLayout(inplace_inputs[0].get_device()), # type: ignore[arg-type] + inputs=inplace_inputs, + constant_args=[tag, ranks, group_size], + reduce_op=reduce_op, + ) + mark_node_as_mutating(packed, inplace_inputs[0]) + return inplace_inputs + + def codegen_collective(self, wrapper, output_name, input_names): + wrapper.writeline( + f"{output_name}_work = dist.all_reduce_coalesced(" + f"{output_name}, " + f"op=fun_col_impl._str_to_reduce_op('{str(self.reduce_op)}'), " + f"group={output_name}_pg, " + "async_op=True)" + ) + + +class AllReduce(InPlaceCollectiveKernel): + def __init__(self, layout, inputs, constant_args, reduce_op): + super().__init__(layout, inputs, constant_args) + self.reduce_op = reduce_op + + def get_mutation_names(self): + return [self.inputs[0].get_name()] + + def get_unbacked_symbol_defs(self) -> Set[sympy.Symbol]: + return set() + + @classmethod + def create( + cls, x: "TensorBox", reduce_op: str, tag: str, ranks: List[int], group_size: int + ): + inplace_inputs = cls.wrap_inputs_as_inplace([x]) + + packed = AllReduce( + layout=NoneLayout(inplace_inputs[0].get_device()), # type: ignore[arg-type] + inputs=inplace_inputs, + constant_args=[tag, ranks, group_size], + reduce_op=reduce_op, + ) + mark_node_as_mutating(packed, inplace_inputs[0]) + return inplace_inputs[0] + + def codegen_collective(self, wrapper, output_name, input_names): + wrapper.writeline( + f"{output_name}_work = dist.all_reduce(" + f"{output_name}, async_op=True, group={output_name}_pg, op=fun_col_impl._str_to_reduce_op('{str(self.reduce_op)}'))" + ) + + +class AllGatherIntoTensor(OutOfPlaceCollectiveKernel): + def __init__(self, layout, inputs, outputs, constant_args): + super().__init__(layout, inputs, outputs, constant_args) + 
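+    # all_gather_into_tensor concatenates one shard per rank along dim 0, so
+    # create() sizes the output buffer group_size times larger in that dimension.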
+ @classmethod + def create(cls, x: "TensorBox", tag: str, ranks: List[int], group_size: int): + inputs = [cls.realize_input(x)] + + def compute_size(new_size): + new_size[0] *= group_size + + outputs = cls.create_output_buffers(inputs, compute_size) + + layout = MultiOutputLayout(inputs[0].get_device()) + + packed = AllGatherIntoTensor( + layout=layout, + inputs=inputs, + outputs=outputs, + constant_args=[tag, ranks, group_size], + ) + return cls.create_output_nodes(packed, outputs)[0] + + def codegen_collective(self, wrapper, output_name, input_names): + wrapper.writeline( + f"{output_name}_work = dist.all_gather_into_tensor(" + f"{output_name}[0], {output_name}_inputs[0], async_op=True, group={output_name}_pg)" + ) + + +class ReduceScatterTensor(OutOfPlaceCollectiveKernel): + def __init__(self, layout, inputs, outputs, constant_args, reduce_op): + super().__init__(layout, inputs, outputs, constant_args) + self.reduce_op = reduce_op + + @classmethod + def create( + cls, + x: "TensorBox", + reduce_op: str, + tag: str, + ranks: List[int], + group_size: int, + ): + inputs = [cls.realize_input(x)] + + def compute_size(new_size): + new_size[0] //= group_size + + outputs = cls.create_output_buffers(inputs, compute_size) + + layout = MultiOutputLayout(inputs[0].get_device()) + + packed = ReduceScatterTensor( + layout=layout, + inputs=inputs, + outputs=outputs, + constant_args=[tag, ranks, group_size], + reduce_op=reduce_op, + ) + return cls.create_output_nodes(packed, outputs)[0] + + def codegen_collective(self, wrapper, output_name, input_names): + wrapper.writeline( + f"{output_name}_work = dist.reduce_scatter_tensor(" + f"{output_name}[0], {output_name}_inputs[0], " + f"async_op=True, group={output_name}_pg, op=fun_col_impl._str_to_reduce_op('{str(self.reduce_op)}'))" + ) + + +class AllGatherIntoTensorCoalesced(OutOfPlaceCollectiveKernel): + def __init__(self, layout, inputs, outputs, constant_args): + super().__init__(layout, inputs, outputs, constant_args) + + @classmethod + def create( + cls, + inputs: List["TensorBox"], + tag: str, + ranks: List[int], + group_size: int, + ): + inputs = [cls.realize_input(x) for x in inputs] + + def compute_size(new_size): + new_size[0] *= group_size + + outputs = cls.create_output_buffers(inputs, compute_size) + + layout = MultiOutputLayout(inputs[0].get_device()) + + packed = AllGatherIntoTensorCoalesced( + layout=layout, + inputs=inputs, + outputs=outputs, + constant_args=[tag, ranks, group_size], + ) + + return outputs + # return cls.create_output_nodes(packed, outputs) + + def codegen_collective(self, wrapper, output_name, input_names): + wrapper.writeline( + f"{output_name}_work = fun_col_impl._all_gather_into_tensor_coalesced_fallback(" + f"output_tensors={output_name}, " + f"input_tensors={output_name}_inputs, " + f"group={output_name}_pg, " + "async_op=True)" + ) + + +class ReduceScatterTensorCoalesced(OutOfPlaceCollectiveKernel): + def __init__(self, layout, inputs, outputs, constant_args, reduce_op): + super().__init__(layout, inputs, outputs, constant_args) + self.reduce_op = reduce_op + + @classmethod + def create( + cls, + inputs: List["TensorBox"], + reduce_op: str, + tag: str, + ranks: List[int], + group_size: int, + ): + inputs = [cls.realize_input(x) for x in inputs] + + def compute_size(new_size): + new_size[0] //= group_size + + outputs = cls.create_output_buffers(inputs, compute_size) + + layout = MultiOutputLayout(inputs[0].get_device()) + + _ = ReduceScatterTensorCoalesced( + layout=layout, + inputs=inputs, + outputs=outputs, + 
constant_args=[tag, ranks, group_size], + reduce_op=reduce_op, + ) + + return outputs + + def codegen_collective(self, wrapper, output_name, input_names): + wrapper.writeline( + f"{output_name}_work = fun_col_impl._reduce_scatter_tensor_coalesced_fallback(" + f"output_tensors={output_name}, " + f"input_tensors={output_name}_inputs, " + f"op=fun_col_impl._str_to_reduce_op('{str(self.reduce_op)}'), " + f"group={output_name}_pg, " + "async_op=True)" + ) + + +# TODO(yifu): replace the CollectiveKernel IR hierarchy with _CollectiveKernel. +class _CollectiveKernel(FallbackKernel): + def should_allocate(self): + return False + + def has_side_effects(self): + return True + + # This is identical to FallbackKernel.set_cpp_kernel(), minus the + # part that checks against input aliasing and mutation. + def set_cpp_kernel(self, kernel): + from .codegen.wrapper import get_cpp_op_schema + + self.cpp_kernel_name = kernel._schema.name + self.cpp_kernel_overload_name = kernel._schema.overload_name + self.cpp_kernel_key = f"{self.cpp_kernel_name.replace('::', '_')}_{self.cpp_kernel_overload_name}" # type: ignore[union-attr] + + self.cpp_op_schema = get_cpp_op_schema(kernel) + self.ordered_kwargs_for_cpp_kernel = [ + x.name for x in kernel._schema.arguments if x.kwarg_only + ] + + # NOTE: [In-Place Collective Safety] + # Between the initiation and completion of an in-place collective, the + # input buffers are subject to both volatile reads and volatile writes. + # They must not be read, written to or reused by another kernel. To ensure + # the constraints, we model collective -> wait_tensor as as two-step + # mutation of the input buffers. + @classmethod + def create_inplace( + cls, kernel, inputs: Union[TensorBox, List[TensorBox]], *args, **kwargs + ) -> None: + cpp_kernel_name = kernel._name + python_kernel_name = cpp_kernel_name.replace("::", ".") + with V.graph.fake_mode: + ( + example_output, + tensor_args, + non_tensor_args, + unflatten_args, + ) = cls.process_kernel(kernel, inputs, *args, **kwargs) + for tensor_arg in tensor_args: + tensor_arg.realize() + + packed = cls( + NoneLayout(tensor_args[0].get_device()), + kernel, + tensor_args, + non_tensor_args, + unflatten_args, + ) + packed.cpp_kernel_name = cpp_kernel_name + packed.python_kernel_name = python_kernel_name + + def mark_mutation(x): + if isinstance(x.data, BaseView): + x = x.data.unwrap_view() + MutationOutput(x.layout, x, packed) + + pytree.tree_map(lambda inp: mark_mutation(inp), inputs) + + # NOTE: [Out-of-Place Collective Safety] + # Between the initiation and completion of an out-of-place collective: + # + # Input buffers: + # - Are subject to volatile reads + # - Can be read by another kernel + # - Must not be written to or reused by another kernel + # + # Output buffers: + # - Are subject to volatile writes + # - Must not be read, written to or reused by another kernel + # + # To ensure the safety of input buffers without sacrificing read + # availability, we add input buffers as read deps of wait_tensor kernels. + # + # To ensure the safety of output buffers, we model wait_tensor as a + # mutation to the output buffer. Note we also assumes the user program being + # correct and the output buffer is not consumed by kernels other than + # wait_tensor. + # + # TODO(yifu): add a pre-grad pass to validate the correctness of collective + # usage in the user program. 
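+    # Concretely (see _WaitKernel below): create_out_of_place builds the collective
+    # kernel, _WaitKernel.get_read_writes adds the collective's inputs as StarDep
+    # reads, and _WaitKernel.create_wait wraps the collective's output buffer in a
+    # MutationOutput so nothing else reuses it before the wait completes.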
+ @classmethod + def create_out_of_place( + cls, kernel, inputs: Union[TensorBox, List[TensorBox]], *args, **kwargs + ): + cpp_kernel_name = kernel._name + python_kernel_name = cpp_kernel_name.replace("::", ".") + with V.graph.fake_mode: + ( + example_output, + tensor_args, + non_tensor_args, + unflatten_args, + ) = cls.process_kernel(kernel, inputs, *args, **kwargs) + for tensor_arg in tensor_args: + tensor_arg.realize() + + if isinstance(example_output, list): + device = cls.find_device(tensor_args, example_output) + packed = cls( + MultiOutputLayout(device), + kernel, + tensor_args, + non_tensor_args, + unflatten_args, + ) + packed.cpp_kernel_name = cpp_kernel_name + packed.python_kernel_name = python_kernel_name + packed.outputs = [ + MultiOutput( + cls.tensor_to_layout(tensor), + packed, + [(list, i)], + ) + for i, tensor in enumerate(example_output) + ] + return packed.outputs + else: + packed = cls( + cls.tensor_to_layout(example_output), + kernel, + tensor_args, + non_tensor_args, + unflatten_args, + ) + packed.cpp_kernel_name = cpp_kernel_name + packed.python_kernel_name = python_kernel_name + packed.outputs = [packed] + return packed + + +class _WaitKernel(_CollectiveKernel): + def get_volatile_reads(self): + inp = self.inputs[0] + if isinstance(inp, _CollectiveKernel): + # Out-of-place single-output + return [inp.inputs[0]] + elif isinstance(inp, MultiOutput): + # This can be two things: + # 1. Out-of-place multi-output coll + # 2. In-place coll with inputs coming from another MultiOutput + coll = inp.inputs[0] + # Case 1 + if isinstance(coll, _CollectiveKernel): + _, idx = inp.indices[0] + return [coll.inputs[idx]] + # Case 2 + return [] + else: + # In-place requires no additional deps handling for volatile + # reads since the inputs are mutated. + return [] + + @classmethod + def create_wait(cls, kernel, inp: TensorBox) -> None: + with V.graph.fake_mode: + ( + example_output, + tensor_args, + non_tensor_args, + unflatten_args, + ) = cls.process_kernel(kernel, inp) + packed = cls( + NoneLayout(inp.get_device()), + kernel, + tensor_args, + non_tensor_args, + unflatten_args, + ) + if isinstance(inp.data, BaseView): + inp = inp.data.unwrap_view() + MutationOutput(inp.layout, inp, packed) + + def get_read_writes(self): + read_writes = super().get_read_writes() + # See [Out-of-Place Collective Safety]. 
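+        # Each volatile read becomes a StarDep, preventing the scheduler from
+        # reusing or overwriting the collective's input buffers before this wait.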
+ volatile_reads = self.get_volatile_reads() + for vr in volatile_reads: + read_writes.reads.add(dependencies.StarDep(vr.get_name())) + return read_writes + + +# NB: recursive structure here reflects val_to_arg_str, avoid +# calling free_unbacked_symbols on "exotic" types that don't get pexpr +# treatment +def maybe_free_unbacked_symbols(s): + if isinstance(s, (SymTypes, sympy.Expr)): + # This branch should be impossible in return position + return free_unbacked_symbols(s) + elif isinstance(s, (tuple, list)): + r = set() + for t in s: + r |= maybe_free_unbacked_symbols(t) + return r + elif isinstance(s, torch.Tensor): + # This branch is impossible in constant-args position + return free_unbacked_symbols(s) + else: + return set() + + +class AllToAllSingle(OutOfPlaceCollectiveKernel): + def __init__( + self, + layout, + inputs, + outputs, + constant_args, + output_split_sizes, + input_split_sizes, + ): + super().__init__(layout, inputs, outputs, constant_args) + self.output_split_sizes = output_split_sizes + self.input_split_sizes = input_split_sizes + + def get_unbacked_symbol_uses(self) -> Set[sympy.Symbol]: + r = set() + if self.output_split_sizes is not None: + r |= free_unbacked_symbols(self.output_split_sizes) + if self.input_split_sizes is not None: + r |= free_unbacked_symbols(self.input_split_sizes) + return r + + @classmethod + def create( + cls, + x: "TensorBox", + output_split_sizes: Optional[List[Expr]], + input_split_sizes: Optional[List[Expr]], + tag: str, + ranks: List[int], + group_size: int, + ): + inputs = [cls.realize_input(x)] + + def compute_size(new_size): + if output_split_sizes is not None: + new_size[0] = sum(output_split_sizes) + + outputs = cls.create_output_buffers(inputs, compute_size) + + layout = MultiOutputLayout(inputs[0].get_device()) + + packed = AllToAllSingle( + layout=layout, + inputs=inputs, + outputs=outputs, + constant_args=[tag, ranks, group_size], + output_split_sizes=output_split_sizes, + input_split_sizes=input_split_sizes, + ) + return cls.create_output_nodes(packed, outputs)[0] + + def codegen_collective(self, wrapper, output_name, input_names): + tag, ranks, group_size = self.constant_args + + # TODO: might be necessary to do some pretty printing on + # split sizes + wrapper.writeline( + f"{output_name}_work = dist.all_to_all_single(" + f"{output_name}[0], {output_name}_inputs[0], " + f"output_split_sizes={self.output_split_sizes}, " + f"input_split_sizes={self.input_split_sizes}, " + f"group={output_name}_pg, async_op=True)" + ) diff --git a/MLPY/Lib/site-packages/torch/_inductor/kernel/__init__.py b/MLPY/Lib/site-packages/torch/_inductor/kernel/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ba0fe9754f43c39cd3adc2187ccb68c1890605c8 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/kernel/__init__.py @@ -0,0 +1 @@ +from . 
import mm, mm_common, mm_plus_mm, unpack_mixed_mm diff --git a/MLPY/Lib/site-packages/torch/_inductor/kernel/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/kernel/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ecfbb31f76acf7f98eccde4f1522dcdf30ba925e Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/kernel/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/kernel/__pycache__/bmm.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/kernel/__pycache__/bmm.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8dfcc05734cb791b52ffd6afb2c0b3457ada52ef Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/kernel/__pycache__/bmm.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/kernel/__pycache__/conv.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/kernel/__pycache__/conv.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ce2e57376b2c52e5acf129ac4209964ed86aded1 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/kernel/__pycache__/conv.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/kernel/__pycache__/mm.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/kernel/__pycache__/mm.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9c0b6f01ec764f5bdbfbf46fdcb6e38611b0b859 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/kernel/__pycache__/mm.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/kernel/__pycache__/mm_common.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/kernel/__pycache__/mm_common.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..07d09e94e2b2b50f5a6334fe0ddcfd2554e195e5 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/kernel/__pycache__/mm_common.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/kernel/__pycache__/mm_plus_mm.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/kernel/__pycache__/mm_plus_mm.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d2613aafe4b2712785e4dd75043d223b54d17180 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/kernel/__pycache__/mm_plus_mm.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/kernel/__pycache__/unpack_mixed_mm.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_inductor/kernel/__pycache__/unpack_mixed_mm.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c704a30764f355123981e6c4fcd8a86ad0b1d8d4 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_inductor/kernel/__pycache__/unpack_mixed_mm.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_inductor/kernel/bmm.py b/MLPY/Lib/site-packages/torch/_inductor/kernel/bmm.py new file mode 100644 index 0000000000000000000000000000000000000000..a09c730afa6f9cc6b3b72552b737d328665e66fb --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/kernel/bmm.py @@ -0,0 +1,128 @@ +import torch + +from ..lowering import register_lowering +from ..select_algorithm import ( + autotune_select_algorithm, + ExternKernelChoice, + TritonTemplate, +) +from ..utils import ceildiv as cdiv, use_aten_gemm_kernels, use_triton_template + +from .mm_common import addmm_epilogue, mm_args, mm_configs, mm_options + +aten = torch.ops.aten + + 
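+# Launch grid for the bmm template: axis 0 tiles the M x N output, axis 1 is the
+# batch dimension (read as tl.program_id(1) inside the kernel source below).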
+def bmm_grid(b, m, n, meta): + return (cdiv(m, meta["BLOCK_M"]) * cdiv(n, meta["BLOCK_N"]), b, 1) + + +bmm_template = TritonTemplate( + name="bmm", + grid=bmm_grid, + source=r""" +{{def_kernel("A", "B")}} + M = {{size("A", -2)}} + N = {{size("B", -1)}} + K = {{size("A", -1)}} + + stride_aq = {{stride("A", 0)}} + stride_am = {{stride("A", 1)}} + stride_ak = {{stride("A", 2)}} + + stride_bq = {{stride("B", 0)}} + stride_bk = {{stride("B", 1)}} + stride_bn = {{stride("B", 2)}} + + # based on triton.ops.matmul + pid = tl.program_id(0) + grid_m = (M + BLOCK_M - 1) // BLOCK_M + grid_n = (N + BLOCK_N - 1) // BLOCK_N + + # re-order program ID for better L2 performance + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // (group_size) + + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + rk = tl.arange(0, BLOCK_K) + + idx_q = tl.program_id(1) # batch dimension for BMM + A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak + idx_q*stride_aq) + B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn + idx_q*stride_bq) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) + for k in range(K, 0, -BLOCK_K): + if EVEN_K: + a = tl.load(A) + b = tl.load(B) + else: + a = tl.load(A, mask=rk[None, :] < k, other=0.) + b = tl.load(B, mask=rk[:, None] < k, other=0.) + acc += tl.dot(a, b, allow_tf32=ALLOW_TF32) + A += BLOCK_K * stride_ak + B += BLOCK_K * stride_bk + + # rematerialize rm and rn to save registers + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + idx_q = tl.program_id(1) # batch dimension for BMM + idx_m = rm[:, None] + idx_n = rn[None, :] + mask = (idx_m < M) & (idx_n < N) + + # inductor generates a suffix + {{store_output(("idx_q", "idx_m", "idx_n"), "acc", "mask")}} +""", +) + +aten_bmm = ExternKernelChoice(torch.bmm, "at::bmm_out") +aten_baddbmm = ExternKernelChoice(torch.baddbmm, "at::baddbmm_out") + + +@register_lowering(aten.bmm) +def tuned_bmm(mat1, mat2, *, layout=None): + m, n, k, layout, mat1, mat2 = mm_args(mat1, mat2, layout=layout) + + # options to tune from + choices = [aten_bmm.bind((mat1, mat2), layout)] if use_aten_gemm_kernels() else [] + if use_triton_template(layout): + for config in mm_configs(m, n, k): + bmm_template.maybe_append_choice( + choices, + input_nodes=(mat1, mat2), + layout=layout, + **mm_options(config, m, n, k, layout), + ) + + return autotune_select_algorithm("bmm", choices, [mat1, mat2], layout) + + +# Don't register this since it is slower than decomposing it +# @register_lowering(aten.baddbmm) +def tuned_baddbmm(inp, mat1, mat2, *, alpha=1, beta=1, layout=None): + m, n, k, layout, mat1, mat2, inp = mm_args(mat1, mat2, inp, layout=layout) + + # options to tune from + choices = ( + [aten_baddbmm.bind((inp, mat1, mat2), layout, alpha=alpha, beta=beta)] + if use_aten_gemm_kernels() + else [] + ) + if use_triton_template(layout): + for config in mm_configs(m, n, k): + bmm_template.maybe_append_choice( + choices, + input_nodes=(inp, mat1, mat2), + layout=layout, + **mm_options(config, m, n, k, layout), + prefix_args=1, + epilogue_fn=addmm_epilogue(layout.dtype, alpha, beta), + ) + + return autotune_select_algorithm("baddbmm", choices, [inp, mat1, mat2], layout) diff --git 
a/MLPY/Lib/site-packages/torch/_inductor/kernel/conv.py b/MLPY/Lib/site-packages/torch/_inductor/kernel/conv.py new file mode 100644 index 0000000000000000000000000000000000000000..a4a419432cbe0f10a2ce9512e9fe3ccf9e569f4b --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/kernel/conv.py @@ -0,0 +1,495 @@ +from __future__ import annotations + +import functools +import logging +from typing import cast, List, Optional, Sequence, Tuple, TypedDict + +import torch +from .. import config, ir +from ..ir import TensorBox + +from ..lowering import ( + add_layout_constraint, + constrain_to_fx_strides, + lowerings as L, + register_lowering, +) +from ..select_algorithm import ( + autotune_select_algorithm, + ExternKernelChoice, + TritonTemplate, +) +from ..utils import ( + ceildiv, + is_ones, + is_zeros, + pad_listlike, + sympy_product, + use_triton_template, +) +from ..virtualized import V +from .mm_common import filtered_configs + +log = logging.getLogger(__name__) + + +aten = torch.ops.aten + + +def conv_grid(n, c, h, w, meta): + return ( + ceildiv(n * h * w, meta["BLOCK_M"]), + ceildiv(c, meta["BLOCK_N"]), + meta["GROUPS"], + ) + + +# List of dictionaries to store the kernel configs. Configs that evaluate to true +# will be utilised on the target platform +kernel_configs = [ + # "BLOCK_M", "BLOCK_N", "BLOCK_K", "num_stages", "num_warps" + {"config": (64, 256, 16, 2, 4), "cond": True}, + {"config": (256, 64, 16, 2, 4), "cond": True}, + {"config": (1024, 16, 16, 1, 8), "cond": True}, + {"config": (128, 128, 32, 2, 8), "cond": True}, + {"config": (64, 64, 32, 2, 4), "cond": True}, + {"config": (64, 256, 32, 2, 8), "cond": True}, + {"config": (256, 64, 32, 2, 8), "cond": True}, +] + +# Create filtered list of configs based on conv +platform_configs = tuple( + cast(Tuple[int, int, int, int, int], config["config"]) + for config in kernel_configs + if config["cond"] +) + +# On ROCm convert num_stages to 1 as pipelining provides no benefit +if torch.version.hip: + platform_configs = tuple( + (config[0], config[1], config[2], 1, config[4]) for config in platform_configs + ) + +conv_configs = functools.partial( + filtered_configs, + configs=platform_configs, +) + +LOOP_BODY = """ + idx_x_h = i - PADDING_H + idx_y_h * STRIDE_H + idx_x_w = j - PADDING_W + idx_y_w * STRIDE_W + idx_x_c = tl.arange(0, BLOCK_K) + k + + x_ptrs = x_base + ( + (idx_x_h * stride_xh)[:, None] + + (idx_x_w * stride_xw)[:, None] + + (idx_x_c * stride_xc)[None, :] + ) + mask_x = ( + (idx_n < BATCH)[:, None] + & (idx_x_h >= 0)[:, None] + & (idx_x_h < IN_H)[:, None] + & (idx_x_w >= 0)[:, None] + & (idx_x_w < IN_W)[:, None] + & (idx_x_c < GROUP_IN_C)[None, :] + ) + matrix_x = tl.load(x_ptrs, mask=mask_x, other=0.0) + + w_ptrs = w_base + ( + (idx_x_c * stride_wc_in)[:, None] + (i * stride_wh) + (j * stride_ww) + ) + mask_w = (idx_x_c[:, None] < GROUP_IN_C) & (idx_y_c[None, :] < GROUP_OUT_C) + matrix_w = tl.load(w_ptrs, mask=mask_w, other=0.0) + acc += tl.dot(matrix_x, matrix_w, allow_tf32=ALLOW_TF32) +""" + +""" +This is a relatively simple conv implementation that can likely be +improved. 
Many alternate conv versions can be found here: +https://github.com/pytorch/torchdynamo/pull/971 +""" +conv2d_template = TritonTemplate( + name="convolution", + grid=conv_grid, + source=r""" +{{def_kernel("X", "W")}} + # Tensor dimensions + BATCH = {{size("X", 0)}} + IN_C = {{size("X", 1)}} + IN_H = {{size("X", 2)}} + IN_W = {{size("X", 3)}} + OUT_C = {{size(None, 1)}} + OUT_H = {{size(None, 2)}} + OUT_W = {{size(None, 3)}} + + # Strides: + stride_xn = {{stride("X", 0)}} + stride_xc = {{stride("X", 1)}} + stride_xh = {{stride("X", 2)}} + stride_xw = {{stride("X", 3)}} + stride_wc_out = {{stride("W", 0)}} + stride_wc_in = {{stride("W", 1)}} + stride_wh = {{stride("W", 2)}} + stride_ww = {{stride("W", 3)}} + + nhw = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M) + idx_y_w = nhw % OUT_W + nh = nhw // OUT_W + idx_y_h = nh % OUT_H + idx_n = nh // OUT_H + idx_y_c = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N) + +{% if GROUPS == 1 %} + group = 0 + GROUP_IN_C = IN_C + GROUP_OUT_C = OUT_C +{% else %} + group = tl.program_id(2) + GROUP_IN_C = IN_C // GROUPS + GROUP_OUT_C = OUT_C // GROUPS +{% endif %} + + x_base = X + (group * stride_xc * GROUP_IN_C + idx_n * stride_xn)[:, None] + w_base = ( + W + (group * stride_wc_out * GROUP_OUT_C + idx_y_c * stride_wc_out)[None, :] + ) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + +{% if UNROLL %} +{% for i in range(KERNEL_H) %} +{% for j in range(KERNEL_W) %} + i = {{i}} + j = {{j}} + for k in range(0, GROUP_IN_C, BLOCK_K): + """ + + LOOP_BODY + + """ +{% endfor %} +{% endfor %} +{% else %} + # Could be simplified, but slightly slower: + # for i in range(KERNEL_H): + # for j in range(KERNEL_W): + # for k in range(0, GROUP_IN_C, BLOCK_K): + BLOCK_K_COUNT = (GROUP_IN_C + BLOCK_K - 1) // BLOCK_K + for ijk in range(KERNEL_H * KERNEL_W * BLOCK_K_COUNT): + k = (ijk % BLOCK_K_COUNT) * BLOCK_K + ij = ijk // BLOCK_K_COUNT + i = ij // KERNEL_W + j = ij % KERNEL_W + """ + + LOOP_BODY + + """ +{% endif %} + + mask = ( + (idx_n < BATCH)[:, None] + & (idx_y_h < OUT_H)[:, None] + & (idx_y_w < OUT_W)[:, None] + & (idx_y_c < GROUP_OUT_C)[None, :] + ) + idx_n = idx_n[:, None] + idx_c = idx_y_c[None, :] + group * GROUP_OUT_C + idx_h = idx_y_h[:, None] + idx_w = idx_y_w[:, None] + + # inductor generates a suffix + {{store_output(("idx_n", "idx_c", "idx_h", "idx_w"), "acc", "mask")}} +""", +) + +aten_convolution = ExternKernelChoice( + torch.convolution, + "at::convolution", + has_out_variant=False, + op_overload=aten.convolution.default, +) + + +def conv1x1_via_mm(x, w, *, out): + w = torch.squeeze(torch.squeeze(w, -1), -1) + return torch.matmul( + x.permute(0, 2, 3, 1), w.permute(1, 0), out=out.permute(0, 2, 3, 1) + ) + + +aten_conv1x1_via_mm = ExternKernelChoice(conv1x1_via_mm, None) + + +class ConvLayoutParams(TypedDict): + stride: tuple[int, ...] + padding: tuple[int, ...] + dilation: tuple[int, ...] + transposed: bool + output_padding: tuple[int, ...] 
+ groups: int + + +def conv_layout( + x: TensorBox, + weight: TensorBox, + bias: Optional[TensorBox], + stride: Sequence[int], + padding: tuple[int, ...], + dilation: tuple[int, ...], + transposed: bool, + output_padding: tuple[int, ...], + groups: int, +) -> ir.Layout: + """Determine output layout for a convolution""" + with V.graph.fake_mode: + output = torch.ops.aten.convolution( + ir.ir_node_to_tensor(x, guard_shape=True), + ir.ir_node_to_tensor(weight, guard_shape=True), + ir.ir_node_to_tensor(bias, guard_shape=True), + stride, + tuple(V.graph.sizevars.size_hint(p) for p in padding), # type: ignore[arg-type] + dilation, + transposed, + tuple(V.graph.sizevars.size_hint(p) for p in output_padding), # type: ignore[arg-type] + groups, + ) + sizes = ir.convert_shape_to_inductor(output.size()) + stride = ir.convert_shape_to_inductor(output.stride()) # type: ignore[assignment] + + return ir.FixedLayout( + x.get_device(), + x.get_dtype(), + sizes, + stride, + ) + + +def channels_last_order(rank): + order = list(reversed(range(rank))) + order.insert(1, order.pop(-1)) + return order + + +def convert_1x1_conv_to_mm(x, weight, bias): + # special case for 1x1 convolution, which is actually just a matmul + rank = len(weight.get_size()) + for _ in range(rank - 2): + weight = L[aten.squeeze](weight, dim=-1) + weight = L[aten.permute](weight, [1, 0]) + + if x.get_size()[0] != 1: + x = ir.ExternKernel.require_stride_order(x, channels_last_order(rank)) + else: + x.realize() + x.freeze_layout() + + x_permute = list(range(rank)) + x_permute.append(x_permute.pop(1)) + x = L[aten.permute](x, x_permute) + *sizes, in_chan = x.get_size() + x = L[aten.reshape](x, [sympy_product(sizes), in_chan]) + if bias is None: + result = L[aten.mm](x, weight) + else: + result = L[aten.addmm](bias, x, weight) + result = L[aten.reshape](result, [*sizes, -1]) + result_permute = list(range(rank)) + result_permute.insert(1, result_permute.pop(-1)) + return L[aten.permute](result, result_permute) + + +@register_lowering(aten.convolution) +def convolution( + x: TensorBox, + weight: TensorBox, + bias: TensorBox, + stride: List[int], + padding: List[int], + dilation: List[int], + transposed: bool, + output_padding: List[int], + groups: int, +): + stride = tuple(stride) + padding = tuple(padding) + dilation = tuple(dilation) + output_padding = tuple(output_padding) + if not isinstance(groups, int): + groups = V.graph.sizevars.evaluate_static_shape(groups) + assert isinstance(groups, int) + kwargs: ConvLayoutParams = { + "stride": stride, + "padding": padding, + "dilation": dilation, + "transposed": transposed, + "output_padding": output_padding, + "groups": groups, + } + + if len(x.get_size()) == len(weight.get_size()) - 1: + # add batch dimension to simplify rest of function + return L[aten.squeeze]( + convolution(L[aten.expand](x, [1, *x.get_size()]), weight, bias, **kwargs), + dim=0, + ) + + out_chan, in_chan, *kernel_shape = V.graph.sizevars.evaluate_static_shapes( + weight.get_size() + ) + ndim = len(kernel_shape) + stride = pad_listlike(stride, ndim) + padding = pad_listlike(padding, ndim) + dilation = pad_listlike(dilation, ndim) + output_padding = pad_listlike(output_padding, ndim) + + def channels_last_conv(): + if V.graph.layout_opt and ndim == 2: + return True + + layout = conv_layout(x, weight, None, **kwargs) + req_stride_order = ir.get_stride_order( + V.graph.sizevars.size_hints(layout.stride) + ) + return req_stride_order == ir.NHWC_STRIDE_ORDER + + autotuning_gemm = config.max_autotune or config.max_autotune_gemm + + 
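+    # Non-transposed 1x1 kernels with unit stride/dilation, zero padding and
+    # groups == 1 are rewritten as a plain matmul (convert_1x1_conv_to_mm) when
+    # conv_1x1_as_mm or GEMM autotuning is enabled below.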
if ( + (config.conv_1x1_as_mm or (autotuning_gemm and channels_last_conv())) + and is_ones(kernel_shape) + and is_ones(stride) + and is_zeros(padding) + and is_ones(dilation) + and not transposed + and is_zeros(output_padding) + and groups == 1 + ): + return convert_1x1_conv_to_mm(x, weight, bias) + + if bias is not None and ir.get_device_type(x) != "cpu": + # peel off the bias, cudnn is slower with it + result = convolution(x, weight, None, **kwargs) + return L[aten.add]( + result, L[aten.view](bias, [result.get_size()[1]] + ndim * [1]) + ) + + x.realize() + weight.realize() + + # ndim can be 1 for convolution in models such as demucs + # TODO: check if it's beneficial to convert Conv1d to Conv2d and then + # apply channels last. + if V.graph.layout_opt and ndim == 2: + V.graph.num_channels_last_conv += 1 + x = ir.ExternKernel.require_channels_last(x) + # TODO maybe we can convert weights to channels last just once before + # running the model. + weight = ir.ExternKernel.require_channels_last(weight) + layout = conv_layout(x, weight, None, **kwargs) + else: + layout = conv_layout(x, weight, None, **kwargs) + req_stride_order = ir.get_stride_order( + V.graph.sizevars.size_hints(layout.stride) + ) + x = ir.ExternKernel.require_stride_order(x, req_stride_order) + weight = ir.ExternKernel.require_stride_order(weight, req_stride_order) + + ordered_kwargs_for_cpp_kernel = [ + "stride", + "padding", + "dilation", + "transposed", + "output_padding", + "groups", + ] + if bias is None: + args = [x, weight] + kwargs["bias"] = None # type: ignore[typeddict-unknown-key] + ordered_kwargs_for_cpp_kernel.insert(0, "bias") + else: + args = [x, weight, bias] + bias.realize() + bias.freeze_layout() + V.graph.sizevars.evaluate_static_shapes(bias.get_size()) + choices = [ + aten_convolution.bind( + args, + layout, + ordered_kwargs_for_cpp_kernel, + **kwargs, + ) + ] + + if ( + use_triton_template(layout) + # templates only support these: + and ndim == 2 + and is_ones(dilation) + and not transposed + and is_zeros(output_padding) + # there are some odd models where this check fails (e.g. 
shufflenet_v2_x1_0) + and V.graph.sizevars.statically_known_equals(in_chan, x.get_size()[1]) # type: ignore[arg-type] + ): + if ( + is_ones(kernel_shape) + and is_ones(stride) + and is_zeros(padding) + and groups == 1 + ): + choices.append(aten_conv1x1_via_mm.bind(args, layout)) + + for cfg in conv_configs( + sympy_product([x.get_size()[0], *x.get_size()[2:]]), + out_chan, + in_chan, + ): + conv2d_template.maybe_append_choice( + choices, + input_nodes=(x, weight), + layout=layout, + KERNEL_H=kernel_shape[0], + KERNEL_W=kernel_shape[1], + STRIDE_H=stride[0], + STRIDE_W=stride[1], + PADDING_H=padding[0], + PADDING_W=padding[1], + GROUPS=groups, + # TODO(jansel): try unroll for bigger kernels once fixed: + # https://github.com/openai/triton/issues/1254 + UNROLL=is_ones(kernel_shape), + ALLOW_TF32=torch.backends.cudnn.allow_tf32, + num_stages=cfg.num_stages, + num_warps=cfg.num_warps, + **cfg.kwargs, + ) + + return autotune_select_algorithm("convolution", choices, args, layout) + + +@register_lowering(aten._convolution) +def _convolution( + x, + weight, + bias, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + benchmark, + deterministic, + cudnn_enabled, + allow_tf32, +): + return convolution( + x, weight, bias, stride, padding, dilation, transposed, output_padding, groups + ) + + +def constrain_conv_to_fx_strides(fx_node, *args, **kwargs): + assert fx_node.target == torch.ops.aten.convolution.default + if V.graph.layout_opt: + return args, kwargs + else: + return constrain_to_fx_strides(fx_node, *args, **kwargs) + + +add_layout_constraint(aten.convolution, constrain_conv_to_fx_strides) diff --git a/MLPY/Lib/site-packages/torch/_inductor/kernel/mm.py b/MLPY/Lib/site-packages/torch/_inductor/kernel/mm.py new file mode 100644 index 0000000000000000000000000000000000000000..f1375501d570c56e12983eceb06b0aac6b17e042 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/kernel/mm.py @@ -0,0 +1,312 @@ +import functools +import logging +from typing import Any, Dict, List, Optional + +import torch +from torch._inductor.virtualized import V +from .. 
import config as inductor_config +from ..codegen.cuda.gemm_template import CUTLASSGemmTemplate +from ..lowering import register_lowering +from ..select_algorithm import ( + autotune_select_algorithm, + ExternKernelChoice, + TritonTemplate, +) +from ..utils import ( + use_aten_gemm_kernels, + use_cutlass_template, + use_max_autotune, + use_triton_template, +) +from .mm_common import ( + addmm_epilogue, + int8_mm_configs, + mm_args, + mm_configs, + mm_grid, + mm_options, +) + +log = logging.getLogger(__name__) +aten = torch.ops.aten + +mm_template = TritonTemplate( + name="mm", + grid=mm_grid, + source=r""" +{{def_kernel("A", "B")}} + M = {{size("A", 0)}} + N = {{size("B", 1)}} + K = {{size("A", 1)}} + if M * N == 0: + # early exit due to zero-size input(s) + return + stride_am = {{stride("A", 0)}} + stride_ak = {{stride("A", 1)}} + stride_bk = {{stride("B", 0)}} + stride_bn = {{stride("B", 1)}} + + # based on triton.ops.matmul + pid = tl.program_id(0) + grid_m = (M + BLOCK_M - 1) // BLOCK_M + grid_n = (N + BLOCK_N - 1) // BLOCK_N + + # re-order program ID for better L2 performance + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // (group_size) + + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + rk = tl.arange(0, BLOCK_K) + A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) + B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) + for k in range(K, 0, -BLOCK_K): + if EVEN_K: + a = tl.load(A) + b = tl.load(B) + else: + a = tl.load(A, mask=rk[None, :] < k, other=0.) + b = tl.load(B, mask=rk[:, None] < k, other=0.) + if B_PROLOGUE_CAST_TYPE is not None: + b = b.to(B_PROLOGUE_CAST_TYPE) + acc += tl.dot(a, b, allow_tf32=ALLOW_TF32) + A += BLOCK_K * stride_ak + B += BLOCK_K * stride_bk + + # rematerialize rm and rn to save registers + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + idx_m = rm[:, None] + idx_n = rn[None, :] + mask = (idx_m < M) & (idx_n < N) + + # inductor generates a suffix + {{store_output(("idx_m", "idx_n"), "acc", "mask")}} +""", +) + +aten_mm = ExternKernelChoice(torch.mm, "at::mm_out") + + +aten_addmm = ExternKernelChoice( + torch.addmm, "at::addmm_out", op_overload=aten.addmm.default +) + +aten__int_mm = ExternKernelChoice(torch._int_mm, "at::_int_mm") + + +def _is_int8_mat(mat): + return mat.get_dtype() in (torch.int8, torch.uint8) + + +def bias_addmm(inp, mat1, mat2, *, out=None, alpha=1, beta=1): + """ + Giving torch.addmm a 1D tensor calls a different (faster) cublasLt + kernel under the hood. There are a few shapes where this is slower, + but they are rare. 
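+
+    Roughly, the dispatch below is:
+        inp.stride(0) == 0 or inp.size(0) == 1  ->  torch.addmm(inp[0], mat1, mat2, ...)
+        otherwise                               ->  torch.addmm(inp, mat1, mat2, ...)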
+ """ + if inp.stride(0) == 0 or inp.size(0) == 1: + return torch.addmm(inp[0], mat1, mat2, out=out, alpha=alpha, beta=beta) + return torch.addmm(inp, mat1, mat2, out=out, alpha=alpha, beta=beta) + + +aten_bias_addmm = ExternKernelChoice(bias_addmm, None) + + +@register_lowering(aten.mm, type_promotion_kind=None) +def tuned_mm(mat1, mat2, *, layout=None): + m, n, k, layout, mat1, mat2 = mm_args(mat1, mat2, layout=layout) + + # options to tune from + choices = [aten_mm.bind((mat1, mat2), layout)] if use_aten_gemm_kernels() else [] + + if m * n != 0 and use_triton_template(layout): + for config in mm_configs(m, n, k): + mm_template.maybe_append_choice( + choices, + input_nodes=(mat1, mat2), + layout=layout, + **mm_options(config, m, n, k, layout), + ) + + if m * n != 0 and use_cutlass_template(layout): + CUTLASSGemmTemplate.add_cutlass_gemm_choices( + choices, layout, [mat1, mat2], fuseable=True, non_fuseable=True + ) + + from torch._inductor.ir import FixedLayout, FlexibleLayout + + if ( + len(choices) == 1 + and use_aten_gemm_kernels() + and isinstance(layout, FixedLayout) + ): + # If we are not autotuning, we can swap to a FlexibleLayout + # in order to get fusion optimizations to kick in, e.g. ConcatFusion + layout = FlexibleLayout( + device=layout.device, dtype=layout.dtype, size=layout.size + ) + choices = [aten_mm.bind((mat1, mat2), layout)] + + return autotune_select_algorithm("mm", choices, [mat1, mat2], layout) + + +@register_lowering(aten._int_mm, type_promotion_kind=None) +def tuned_int_mm(mat1, mat2, *, layout=None): + m, n, k, layout, mat1, mat2 = mm_args( + mat1, mat2, layout=layout, out_dtype=torch.int32 + ) + choices = ( + [aten__int_mm.bind((mat1, mat2), layout)] if use_aten_gemm_kernels() else [] + ) + if m * n != 0 and use_triton_template(layout, enable_int32=True): + # TODO: Re-enable eager mode implementation once cuBLAS is fixed + choices = [] + for config in int8_mm_configs(m, n, k): + mm_template.maybe_append_choice( + choices, + input_nodes=(mat1, mat2), + layout=layout, + **mm_options(config, m, n, k, layout), + ) + return autotune_select_algorithm("int_mm", choices, [mat1, mat2], layout) + + +@register_lowering(aten.addmm, type_promotion_kind=None) +def tuned_addmm(inp, mat1, mat2, *, alpha=1, beta=1, layout=None): + m, n, k, layout, mat1, mat2, inp_expanded = mm_args(mat1, mat2, inp, layout=layout) + if m * n == 0 or not use_max_autotune(): + choices = ( + [ + aten_addmm.bind( + (inp, mat1, mat2), + layout, + alpha=alpha, + beta=beta, + ) + ] + if use_aten_gemm_kernels() + else [] + ) + return autotune_select_algorithm("addmm", choices, [inp, mat1, mat2], layout) + + choices = ( + [ + aten_addmm.bind( + (inp_expanded, mat1, mat2), + layout, + alpha=alpha, + beta=beta, + ) + ] + if use_aten_gemm_kernels() + else [] + ) + + if ( + use_aten_gemm_kernels() + and inp_expanded.get_stride()[0] == 0 + and inp_expanded.get_device().type == "cuda" + and inductor_config.triton.autotune_cublasLt + ): + # unexpand inp to make sure fused addmm from cublasLt is used + choices.insert( + 0, + aten_bias_addmm.bind( + (inp_expanded, mat1, mat2), layout, alpha=alpha, beta=beta + ), + ) + + if use_triton_template(layout): + for config in mm_configs(m, n, k): + mm_template.maybe_append_choice( + choices, + input_nodes=(inp_expanded, mat1, mat2), + layout=layout, + **mm_options(config, m, n, k, layout), + prefix_args=1, + epilogue_fn=addmm_epilogue(layout.dtype, alpha, beta), + ) + + if use_cutlass_template(layout): + CUTLASSGemmTemplate.add_cutlass_gemm_choices( + choices, + layout, + 
[mat1, mat2, inp_expanded], + alpha=alpha, + beta=beta, + input_reorder=[2, 0, 1], + fuseable=False, + ) + + return autotune_select_algorithm( + "addmm", choices, [inp_expanded, mat1, mat2], layout + ) + + +def fallback_mixed_mm(mat1, mat2, *, out): + return torch.mm(mat1, mat2.to(mat1.dtype), out=out) + + +aten_fallback_mixed_mm = ExternKernelChoice(fallback_mixed_mm, None) + + +@functools.lru_cache(None) +def _is_sm7x_or_older_gpu(index: Optional[int]) -> bool: + props = torch.cuda.get_device_properties(index or 0) + return props.major <= 7 + + +def tuned_mixed_mm(mat1, mat2, mat2_dtype): + m, n, k, layout, mat1, mat2 = mm_args(mat1, mat2, layout=None) + choices = [aten_fallback_mixed_mm.bind((mat1, mat2), layout)] + if ( + mat1.layout.dtype != torch.float32 and not mat2.layout.is_contiguous() + ) or _is_sm7x_or_older_gpu(layout.device.index): + # can't use triton kernel unless one of these is true or if running on v100 (numerical issues) + return autotune_select_algorithm("mixed_mm", choices, [mat1, mat2], layout) + if inductor_config.force_mixed_mm: + choices = [] + b_prologue_cast_type = f"tl.{mat2_dtype}".replace("torch.", "") + has_int8_tensor = _is_int8_mat(mat1) or _is_int8_mat(mat2) + for config in mm_configs(m, n, k, has_int8_tensor=has_int8_tensor): + mm_template.maybe_append_choice( + choices, + input_nodes=(mat1, mat2), + layout=layout, + **mm_options(config, m, n, k, layout, b_prologue_cast_type), + ) + return autotune_select_algorithm("mixed_mm", choices, [mat1, mat2], layout) + + +# This op is a special case of the int_mm op which we use based on the pattern +# _int_mm -> mul (defined in ../fx_passes/post_grad.py) in order to prevent +# realization of the int32 _int_mm output by forcing fusion with the mul op. +# This is only used when config.force_fuse_int_mm_with_mul = True +def tuned_fused_int_mm_mul(mat1, mat2, mat3, out_dtype, *, layout=None): + out_dtype = ( + torch.promote_types(mat3.get_dtype(), torch.int32) + if out_dtype is None + else out_dtype + ) + m, n, k, layout, mat1, mat2, mat3 = mm_args( + mat1, mat2, mat3, layout=layout, out_dtype=out_dtype + ) + choices: List[Dict[Any, Any]] = [] + for config in int8_mm_configs(m, n, k): + mm_template.maybe_append_choice( + choices, + input_nodes=(mat1, mat2, mat3), + layout=layout, + **dict(mm_options(config, m, n, k, layout), ACC_TYPE="tl.int32"), + suffix_args=1, + epilogue_fn=V.ops.mul, + ) + return autotune_select_algorithm("int_mm", choices, [mat1, mat2, mat3], layout) diff --git a/MLPY/Lib/site-packages/torch/_inductor/kernel/mm_common.py b/MLPY/Lib/site-packages/torch/_inductor/kernel/mm_common.py new file mode 100644 index 0000000000000000000000000000000000000000..0edc9b9b5ba645b236f9885d63a5f61be57cec31 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/kernel/mm_common.py @@ -0,0 +1,262 @@ +import functools +import logging +from typing import cast, List, Tuple + +import sympy + +import torch +from torch._inductor.select_algorithm import realize_inputs +from torch._inductor.virtualized import V + +from .. 
import config as inductor_config +from ..utils import ceildiv as cdiv, next_power_of_2 + +log = logging.getLogger(__name__) + + +def triton_config(num_stages, num_warps, **kwargs): + from triton import Config + + return Config(kwargs, num_stages=num_stages, num_warps=num_warps) + + +def filtered_configs( + m: int, + n: int, + k: int, + configs: List[Tuple[int, int, int, int, int]], + has_int8_tensor=False, +): + """Heuristic to shrink configs when they are bigger than the input size""" + + # According to https://github.com/openai/triton/issues/2156#issuecomment-1695897424 + # it's safer to use at least [32, 32] block size for int8/uint8 + # tensors + min_block_size = 32 if has_int8_tensor else 16 + m = max( + next_power_of_2( + V.graph.sizevars.size_hint( + m, fallback=torch._inductor.config.unbacked_symint_fallback # type: ignore[arg-type] + ) + ), + min_block_size, + ) + n = max( + next_power_of_2( + V.graph.sizevars.size_hint( + n, fallback=torch._inductor.config.unbacked_symint_fallback # type: ignore[arg-type] + ) + ), + min_block_size, + ) + k = max( + next_power_of_2( + V.graph.sizevars.size_hint( + k, fallback=torch._inductor.config.unbacked_symint_fallback # type: ignore[arg-type] + ) + ), + min_block_size, + ) + used = set() + for block_m, block_n, block_k, num_stages, num_warps in configs: + # shrink configs for small sizes + block_m = max(min(block_m, m), min_block_size) + block_n = max(min(block_n, n), min_block_size) + block_k = max(min(block_k, k), min_block_size) + # each warp computes 16x16 tile = 256 + num_warps = min(num_warps, block_m * block_n // 256) + if torch.version.hip: + for matrix_instr_nonkdim in [0, 16]: + if matrix_instr_nonkdim != 0 and ( + block_m % matrix_instr_nonkdim != 0 + or block_n % matrix_instr_nonkdim != 0 + ): + # block_m and block_n must be a multiple of matrix_instr_nonkdim + continue + if ( + block_m, + block_n, + block_k, + num_stages, + num_warps, + matrix_instr_nonkdim, + ) not in used: + used.add( + ( + block_m, + block_n, + block_k, + num_stages, + num_warps, + matrix_instr_nonkdim, + ) + ) + yield triton_config( + BLOCK_M=block_m, + BLOCK_N=block_n, + BLOCK_K=block_k, + num_stages=num_stages, + num_warps=num_warps, + matrix_instr_nonkdim=matrix_instr_nonkdim, + ) + else: + if (block_m, block_n, block_k, num_stages, num_warps, 0) not in used: + used.add((block_m, block_n, block_k, num_stages, num_warps, 0)) + yield triton_config( + BLOCK_M=block_m, + BLOCK_N=block_n, + BLOCK_K=block_k, + num_stages=num_stages, + num_warps=num_warps, + ) + + +# List of dictionaries to store the kernel configs. 
Configs that evaluate to true +# will be utilised on the target platform +mm_kernel_configs = [ + # "BLOCK_M", "BLOCK_N", "BLOCK_K", "num_stages", "num_warps" + {"config": (64, 64, 32, 2, 4), "cond": True}, + {"config": (64, 128, 32, 3, 4), "cond": True}, + {"config": (128, 64, 32, 3, 4), "cond": True}, + {"config": (64, 128, 32, 4, 8), "cond": True}, + {"config": (128, 64, 32, 4, 8), "cond": True}, + {"config": (64, 32, 32, 5, 8), "cond": True}, + {"config": (32, 64, 32, 5, 8), "cond": True}, + {"config": (128, 128, 32, 2, 8), "cond": True}, + {"config": (64, 64, 64, 3, 8), "cond": True}, + {"config": (32, 32, 128, 2, 4), "cond": torch.version.hip is None}, + {"config": (64, 64, 16, 2, 4), "cond": True}, + {"config": (32, 32, 16, 1, 2), "cond": True}, +] + +int8_mm_kernel_configs = [ + {"config": (64, 64, 32, 2, 4), "cond": True}, + {"config": (64, 128, 32, 3, 4), "cond": True}, + {"config": (128, 64, 32, 3, 4), "cond": True}, + {"config": (64, 128, 32, 4, 8), "cond": True}, + {"config": (128, 64, 32, 4, 8), "cond": True}, + {"config": (64, 32, 32, 5, 8), "cond": True}, + {"config": (32, 64, 32, 5, 8), "cond": True}, + {"config": (128, 128, 32, 2, 8), "cond": True}, + {"config": (64, 64, 64, 3, 8), "cond": True}, + # {"config": (32, 32, 128, 2, 4), "cond": True}, + # {"config": (64, 64, 16, 2, 4), "cond": True}, + # {"config": (32, 32, 16, 1, 2), "cond": True}, + {"config": (128, 256, 128, 3, 8), "cond": torch.version.hip is None}, + {"config": (256, 128, 128, 3, 8), "cond": torch.version.hip is None}, +] + +# Create filtered list of configs based on cond evaluation + + +mm_platform_configs = tuple( + cast(Tuple[int, int, int, int, int], config["config"]) + for config in mm_kernel_configs + if config["cond"] +) +int8_platform_configs = tuple( + cast(Tuple[int, int, int, int, int], config["config"]) + for config in int8_mm_kernel_configs + if config["cond"] +) + +# On ROCm convert num_stages to 1 as pipelining provides no benefit +if torch.version.hip: + mm_platform_configs = tuple( + (config[0], config[1], config[2], 1, config[4]) + for config in mm_platform_configs + ) + int8_platform_configs = tuple( + (config[0], config[1], config[2], 1, config[4]) + for config in mm_platform_configs + ) + +mm_configs = functools.partial( + filtered_configs, + configs=mm_platform_configs, +) + +int8_mm_configs = functools.partial( + filtered_configs, + configs=int8_platform_configs, +) + + +def mm_grid(m, n, meta): + """ + The CUDA grid size for matmul triton templates. + """ + return (cdiv(m, meta["BLOCK_M"]) * cdiv(n, meta["BLOCK_N"]), 1, 1) + + +def acc_type(dtype): + if dtype in (torch.float16, torch.bfloat16): + return "tl.float32" + return f"tl.{dtype}".replace("torch.", "") + + +def mm_options(config, sym_m, sym_n, sym_k, layout, b_prologue_cast_type=None): + """ + Common options to matmul triton templates. 
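+
+    Roughly: returns the kwargs consumed by the mm templates (GROUP_M, EVEN_K,
+    ALLOW_TF32, ACC_TYPE, B_PROLOGUE_CAST_TYPE) merged with the block sizes and
+    num_stages/num_warps of the chosen config.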
+ """ + even_k_symbolic = ( + # it isn't worth guarding on this + sympy.gcd(sym_k, config.kwargs["BLOCK_K"]) + == config.kwargs["BLOCK_K"] + ) + allow_tf32 = torch.backends.cuda.matmul.allow_tf32 and ( + not inductor_config.force_same_precision + or ((sym_m % 16) == 0 and (sym_n % 16) == 0 and (sym_k % 8) == 0) + ) + return dict( + GROUP_M=8, + EVEN_K=even_k_symbolic, + ALLOW_TF32=allow_tf32, + ACC_TYPE=acc_type(layout.dtype), + B_PROLOGUE_CAST_TYPE=b_prologue_cast_type, + num_stages=config.num_stages, + num_warps=config.num_warps, + **config.kwargs, + ) + + +def mm_args(mat1, mat2, *others, layout=None, out_dtype=None, use_4x2_dim=False): + """ + Common arg processing for mm,bmm,addmm,etc + """ + mat1, mat2 = realize_inputs(mat1, mat2) + *b1, m, k1 = mat1.get_size() + *b2, k2, n = mat2.get_size() + b = [V.graph.sizevars.guard_equals(a, b) for a, b in zip(b1, b2)] + if use_4x2_dim: + k2 = k2 * 2 + k = V.graph.sizevars.guard_equals(k1, k2) + if layout is None: + from torch._inductor.ir import FixedLayout + + if out_dtype is None: + out_dtype = mat1.get_dtype() + layout = FixedLayout( + mat1.get_device(), + out_dtype, + [*b, m, n], + ) + else: + assert out_dtype is None, "out_dtype is ignored if layout is specified." + + from ..lowering import expand + + others = [realize_inputs(expand(x, layout.size)) for x in others] + + return [m, n, k, layout, mat1, mat2, *others] + + +def addmm_epilogue(dtype, alpha, beta): + def epilogue(acc, bias): + if alpha != 1: + acc = V.ops.mul(acc, V.ops.constant(alpha, dtype)) + if beta != 1: + bias = V.ops.mul(bias, V.ops.constant(beta, dtype)) + return V.ops.add(acc, bias) + + return epilogue diff --git a/MLPY/Lib/site-packages/torch/_inductor/kernel/mm_plus_mm.py b/MLPY/Lib/site-packages/torch/_inductor/kernel/mm_plus_mm.py new file mode 100644 index 0000000000000000000000000000000000000000..8d547532c606aae2ade4058134207ad4f1bb5c51 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/kernel/mm_plus_mm.py @@ -0,0 +1,235 @@ +import functools + +import torch + +from ..lowering import lowerings +from ..select_algorithm import ( + autotune_select_algorithm, + ExternKernelChoice, + TritonTemplate, +) +from ..utils import use_aten_gemm_kernels, use_triton_template +from ..virtualized import V +from .mm_common import mm_args, mm_grid, mm_options + +aten = torch.ops.aten + +aten_mm_plus_mm = ExternKernelChoice( + torch.ops.inductor._mm_plus_mm, "torch::inductor::_mm_plus_mm" +) + +mm_plus_mm_template = TritonTemplate( + name="mm_plus_mm", + grid=mm_grid, + debug=False, + source=r""" +{{def_kernel("A", "B", "C", "D")}} + M = {{size("A", 0)}} + N = {{size("B", 1)}} + K1 = {{size("A", 1)}} + if M * N == 0: + # early exit due to zero-size input(s) + return + # K2 = {{size("C", 1)}} + stride_am = {{stride("A", 0)}} + stride_ak = {{stride("A", 1)}} + stride_bk = {{stride("B", 0)}} + stride_bn = {{stride("B", 1)}} + stride_cm = {{stride("C", 0)}} + stride_ck = {{stride("C", 1)}} + stride_dk = {{stride("D", 0)}} + stride_dn = {{stride("D", 1)}} + + # based on triton.ops.matmul + pid = tl.program_id(0) + grid_m = (M + BLOCK_M - 1) // BLOCK_M + grid_n = (N + BLOCK_N - 1) // BLOCK_N + + # re-order program ID for better L2 performance + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // (group_size) + + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + ram = tl.max_contiguous(tl.multiple_of(rm % M, 
BLOCK_M), BLOCK_M) + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + rk = tl.arange(0, BLOCK_K) + A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) + B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) + C = C + (ram[:, None] * stride_cm + rk[None, :] * stride_ck) + D = D + (rk[:, None] * stride_dk + rbn[None, :] * stride_dn) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) + for k1 in range(K1, 0, -BLOCK_K): + # First matmul with A @ B + if EVEN_K: + a = tl.load(A) + b = tl.load(B) + else: + a = tl.load(A, mask=rk[None, :] < k1, other=0.) + b = tl.load(B, mask=rk[:, None] < k1, other=0.) + acc += tl.dot(a, b, allow_tf32=ALLOW_TF32) + A += BLOCK_K * stride_ak + B += BLOCK_K * stride_bk + + for k2 in range(K1, 0, -BLOCK_K): + + # Second matmul with C @ D + if EVEN_K: + c = tl.load(C) + d = tl.load(D) + else: + c = tl.load(C, mask=rk[None, :] < k2, other=0.) + d = tl.load(D, mask=rk[:, None] < k2, other=0.) + acc += tl.dot(c, d, allow_tf32=ALLOW_TF32) + C += BLOCK_K * stride_ck + D += BLOCK_K * stride_dk + + + idx_m = rm[:, None] + idx_n = rn[None, :] + mask = (idx_m < M) & (idx_n < N) + + # inductor generates a suffix + {{store_output(("idx_m", "idx_n"), "acc", "mask")}} +""", +) + + +@functools.lru_cache(None) +def mm_configs(): + import triton + + # List of dictionaries to store the kernel configs. Configs that evaluate to true + # will be utilised on the target platform + mm_triton_configs = [ + { + "config": {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 32}, + "num_stages": 2, + "num_warps": 4, + "cond": True, + }, + { + "config": {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 32}, + "num_stages": 3, + "num_warps": 8, + "cond": True, + }, + { + "config": {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 32}, + "num_stages": 4, + "num_warps": 16, + "cond": True, + }, + { + "config": {"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 32}, + "num_stages": 4, + "num_warps": 8, + "cond": True, + }, + { + "config": {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32}, + "num_stages": 4, + "num_warps": 8, + "cond": True, + }, + { + "config": {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32}, + "num_stages": 1, + "num_warps": 8, + "cond": True, + }, + { + "config": {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 64}, + "num_stages": 1, + "num_warps": 8, + "cond": True, + }, + { + "config": {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 128}, + "num_stages": 1, + "num_warps": 8, + "cond": torch.version.hip is None, + }, + { + "config": {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 16}, + "num_stages": 2, + "num_warps": 4, + "cond": True, + }, + { + "config": {"BLOCK_M": 32, "BLOCK_N": 32, "BLOCK_K": 16}, + "num_stages": 1, + "num_warps": 2, + "cond": True, + }, + ] + + # Filter out configs in which cond evaluates to true + # On ROCm convert num_stages to 1 as pipelining provides no benefit + if torch.version.hip: + filtered_configs = [ + triton.Config(c["config"], num_stages=1, num_warps=c["num_warps"]) + for c in mm_triton_configs + if c["cond"] + ] + else: + filtered_configs = [ + triton.Config( + c["config"], num_stages=c["num_stages"], num_warps=c["num_warps"] + ) + for c in mm_triton_configs + if c["cond"] + ] + + return filtered_configs + + +def tuned_mm_plus_mm(mat1, mat2, mat3, mat4, *, layout=None): + """ + Computes mm(mat1, mat2) + mm(mat3, mat4) + """ + m1, n1, k1, layout1, mat1, mat2 = mm_args(mat1, mat2, layout=layout) + m2, n2, _, layout2, mat3, mat4 = mm_args(mat3, mat4, layout=layout) + # Optimization is optional, because we can always just not do the fusion + if ( + m1 * n1 == 0 + 
or m2 * n2 == 0 + or not V.graph.sizevars.statically_known_list_equals( + mat1.get_size(), mat3.get_size() + ) + or not V.graph.sizevars.statically_known_list_equals( + mat2.get_size(), mat4.get_size() + ) + ): + # TODO(jansel): support different K values when this is fixed: + # https://github.com/openai/triton/issues/967 + return lowerings[aten.add]( + lowerings[aten.mm](mat1, mat2), lowerings[aten.mm](mat3, mat4) + ) + + assert layout1 == layout2 + # options to tune from + choices = ( + [aten_mm_plus_mm.bind((mat1, mat2, mat3, mat4), layout1)] + if use_aten_gemm_kernels() + else [] + ) + if use_triton_template(layout1): + for config in mm_configs(): + # see https://github.com/openai/triton/issues/1298 + # BLOCK_K = K causes llvm error + if config.kwargs["BLOCK_K"] < k1: + mm_plus_mm_template.maybe_append_choice( + choices, + input_nodes=(mat1, mat2, mat3, mat4), + layout=layout1, + **mm_options(config, m1, n1, k1, layout1), + ) + + return autotune_select_algorithm( + "mm_plus_mm", choices, [mat1, mat2, mat3, mat4], layout1 + ) diff --git a/MLPY/Lib/site-packages/torch/_inductor/kernel/unpack_mixed_mm.py b/MLPY/Lib/site-packages/torch/_inductor/kernel/unpack_mixed_mm.py new file mode 100644 index 0000000000000000000000000000000000000000..bf03a9ffb0b5127cb480fccfa6255893f0efc40e --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/kernel/unpack_mixed_mm.py @@ -0,0 +1,82 @@ +import logging +from typing import List + +from ..select_algorithm import autotune_select_algorithm, ChoiceCaller, TritonTemplate +from .mm_common import mm_args, mm_configs, mm_grid, mm_options + +log = logging.getLogger(__name__) + +uint4x2_mixed_mm_template = TritonTemplate( + name="uint4x2_mixed_mm", + grid=mm_grid, + source=r""" +{{def_kernel("A", "B")}} + M = {{size("A", 0)}} + N = {{size("B", 1)}} + K = {{size("A", 1)}} + stride_am = {{stride("A", 0)}} + stride_ak = {{stride("A", 1)}} + stride_bk = {{stride("B", 0)}} + stride_bn = {{stride("B", 1)}} + + # based on triton.ops.matmul + pid = tl.program_id(0) + grid_m = (M + BLOCK_M - 1) // BLOCK_M + grid_n = (N + BLOCK_N - 1) // BLOCK_N + + # re-order program ID for better L2 performance + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // (group_size) + + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + rk = tl.arange(0, BLOCK_K) + A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) + B = B + (rk[:, None]//2 * stride_bk + rbn[None, :] * stride_bn) + b_shifts = 4*(rk%2) + b_subs = 8*(1-(rk%2)) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE) + for k in range(K, 0, -BLOCK_K): + if EVEN_K: + a = tl.load(A) + b = tl.load(B) + else: + a = tl.load(A, mask=rk[None, :] < k, other=0.) + b = tl.load(B, mask=rk[:, None] < k, other=0.) 
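+        # Unpack the int4 values: each byte of B packs two 4-bit entries, so
+        # even k indices shift by 0 (low nibble) and odd ones by 4 (high
+        # nibble); the 0xF mask keeps the nibble and subtracting 8 recenters
+        # the unsigned range [0, 15] onto the signed range [-8, 7].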
+ b = ((b >> b_shifts[:, None]) & 0xF) - 8 + b = b.to(B_PROLOGUE_CAST_TYPE) + acc += tl.dot(a, b, allow_tf32=ALLOW_TF32) + A += BLOCK_K * stride_ak + B += BLOCK_K//2 * stride_bk + + # rematerialize rm and rn to save registers + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + idx_m = rm[:, None] + idx_n = rn[None, :] + mask = (idx_m < M) & (idx_n < N) + + # inductor generates a suffix + {{store_output(("idx_m", "idx_n"), "acc", "mask")}} +""", +) + + +def tuned_uint4x2_mixed_mm(mat1, mat2, mat2_mm_shape, mat2_dtype): + m, n, k, layout, mat1, mat2 = mm_args(mat1, mat2, layout=None, use_4x2_dim=True) + choices: List[ChoiceCaller] = [] + b_prologue_cast_type = f"tl.{mat2_dtype}".replace("torch.", "") + for config in mm_configs(m, n, k): + uint4x2_mixed_mm_template.maybe_append_choice( + choices, + input_nodes=(mat1, mat2), + layout=layout, + **mm_options(config, m, n, k, layout, b_prologue_cast_type), + ) + return autotune_select_algorithm("uint4x2_mixed_mm", choices, [mat1, mat2], layout) diff --git a/MLPY/Lib/site-packages/torch/_inductor/lowering.py b/MLPY/Lib/site-packages/torch/_inductor/lowering.py new file mode 100644 index 0000000000000000000000000000000000000000..8987a72cabfde8f1c60d06acf4aaae184240a4af --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/lowering.py @@ -0,0 +1,6006 @@ +import functools +import itertools +import logging +import os +import warnings +from collections import defaultdict +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union + +import sympy + +import torch +import torch.ao.quantization.fx._decomposed +import torch.fx +import torch.utils._pytree as pytree +from torch._higher_order_ops.triton_kernel_wrap import ( + triton_kernel_wrapper_functional, + triton_kernel_wrapper_mutation, +) +from torch._prims_common import ( + canonicalize_dim, + canonicalize_dims, + check, + dtype_to_type, + elementwise_dtypes, + ELEMENTWISE_TYPE_PROMOTION_KIND, + get_computation_dtype, + is_boolean_dtype, + is_float_dtype, + is_integer_dtype, + Number, +) +from torch.fx.experimental.sym_node import magic_methods, method_to_operator +from torch.utils._sympy.functions import CeilDiv, FloorDiv, ModularIndexing +from .._dynamo.utils import import_submodule + +from . 
import config, inductor_prims, ir, test_operators # NOQA: F401 +from .decomposition import decompositions, get_decompositions +from .ir import ( + ExpandView, + IndexingConstant, + is_triton, + ops_wrapper, + PermuteView, + Pointwise, + Reduction, + SqueezeView, + TensorBox, + validate_ir, + View, +) +from .utils import ( + ceildiv, + decode_device, + is_dynamic, + is_pointwise_use, + pad_listlike, + parallel_num_threads, + sympy_product, +) +from .virtualized import ops, V + +log = logging.getLogger(__name__) +lowerings: Dict[torch._ops.OpOverload, Callable[..., Any]] = {} +layout_constraints: Dict[torch._ops.OpOverload, Callable[..., Any]] = {} +fallbacks: Set[torch._ops.OpOverload] = set() +aten = torch.ops.aten +tr_c10d = torch.ops.tr_c10d +prims = torch.ops.prims +needs_realized_inputs: Set[torch._ops.OpOverload] = set() +foreach_ops: Set[torch._ops.OpOverload] = set() +inplace_foreach_ops: Set[torch._ops.OpOverload] = set() +inplaceable_foreach_ops: Dict[torch._ops.OpOverload, torch._ops.OpOverload] = dict() +quantized_decomposed = torch.ops.quantized_decomposed + + +def assert_nyi(cond, msg): + if not cond: + raise NotImplementedError(f"inductor does not support {msg}") + + +def add_needs_realized_inputs(fn): + if isinstance(fn, (list, tuple, set)): + return [add_needs_realized_inputs(x) for x in fn] + needs_realized_inputs.add(fn) + if isinstance(fn, torch._ops.OpOverloadPacket): + for overload in fn.overloads(): + needs_realized_inputs.add(getattr(fn, overload)) + + +def add_layout_constraint(fn, constraint): + if isinstance(fn, torch._ops.OpOverloadPacket): + for overload in fn.overloads(): + layout_constraints[getattr(fn, overload)] = constraint + else: + layout_constraints[fn] = constraint + + +add_needs_realized_inputs( + [ + aten.as_strided, + aten.avg_pool2d, + aten.avg_pool2d_backward, + aten.bmm, + aten.convolution, + aten.convolution_backward, + aten.max_pool2d_with_indices, + aten.max_pool2d_with_indices_backward, + aten.mm, + aten.upsample_nearest2d, + aten._upsample_nearest_exact2d, + aten.upsample_bicubic2d, + aten._int_mm, + ] +) + +# TODO(jansel): ezyang says we won't need this in the future, try removing it +# based on https://github.com/pytorch/pytorch/blob/9e3eb329df8f701/c10/core/ScalarType.h#L28 +DTYPE_ID_LOOKUP = { + 0: torch.uint8, + 1: torch.int8, + 2: torch.int16, + 3: torch.int32, + 4: torch.int64, + 5: torch.float16, + 6: torch.float32, + 7: torch.float64, + 8: torch.complex32, + 9: torch.complex64, + 10: torch.complex32, + 11: torch.bool, + 15: torch.bfloat16, + # TODO(jansel): add quantized types? 
+ # _(c10::qint8, QInt8) /* 12 */ + # _(c10::quint8, QUInt8) /* 13 */ + # _(c10::qint32, QInt32) /* 14 */ + # _(c10::quint4x2, QUInt4x2) /* 16 */ + # _(c10::quint2x4, QUInt2x4) /* 17 */ +} + + +def decode_dtype(dtype: int): + if not isinstance(dtype, int): + return dtype + assert dtype in DTYPE_ID_LOOKUP, f"id {dtype} missing from DTYPE_ID_LOOKUP" + dtype = DTYPE_ID_LOOKUP[dtype] + return dtype + + +def is_integer_type(x): + if isinstance(x, TensorBox): + return is_integer_dtype(x.get_dtype()) or is_boolean_dtype(x.get_dtype()) + elif isinstance(x, sympy.Expr): + return x.is_integer is True # type: ignore[attr-defined] + else: + return isinstance(x, int) + + +def is_boolean_type(x): + if isinstance(x, TensorBox): + return is_boolean_dtype(x.get_dtype()) + else: + return isinstance(x, bool) + + +def get_promoted_dtype(*args, type_promotion_kind: ELEMENTWISE_TYPE_PROMOTION_KIND): + def construct_input(inp): + if isinstance(inp, (Number, sympy.Expr)): + return inp + else: + assert hasattr(inp, "get_dtype") + dim = len(inp.get_size()) + # construct a tmp tensor to feed into torch.result_type + return torch.zeros([1] * dim, dtype=inp.get_dtype()) + + inps = [construct_input(arg) for arg in args] + _, dtype = elementwise_dtypes(*inps, type_promotion_kind=type_promotion_kind) + return dtype + + +def get_overloads(aten_fn): + if not isinstance(aten_fn, (list, tuple)): + aten_fn = [aten_fn] + else: + aten_fn = list(aten_fn) + + for fn in list(aten_fn): + if isinstance(fn, torch._ops.OpOverloadPacket): + for overload in fn.overloads(): + other_fn = getattr(fn, overload) + if other_fn not in lowerings: + aten_fn.append(other_fn) + + return aten_fn + + +def transform_args(args, broadcast, type_promotion_kind, convert_input_to_bool): + indices = [i for i, x in enumerate(args) if isinstance(x, TensorBox)] + if (type_promotion_kind or convert_input_to_bool) and indices: + if convert_input_to_bool: + dtype = torch.bool + else: + # FIXME that's a crude approximation for promoting args + promoting_args = [ + a + for a in args + if isinstance(a, (Number, sympy.Expr)) or hasattr(a, "dtype") + ] + dtype = get_promoted_dtype( + *promoting_args, type_promotion_kind=type_promotion_kind + ) + + # sometimes args are an immutable list so we can't mutate them + def promote(arg): + if isinstance(arg, TensorBox): + return to_dtype(arg, dtype) + elif isinstance(arg, ir.Constant): + return ir.Constant(arg.value, dtype, args[indices[0]].get_device()) + else: + return arg + + args = [promote(a) for a in args] + if broadcast and indices: + for i, x in zip(indices, broadcast_tensors(*[args[i] for i in indices])): + args[i] = x + for i in range(len(args)): + if isinstance(args[i], ir.Constant): + args[i] = ExpandView.create(args[i], list(args[indices[0]].get_size())) + + return args + + +def _register_foreach_lowering(aten_fn, decomp_fn): + """ + Add a foreach lowering to lowerings dict. 
+ + Arguments: + aten_fn: torch.ops.aten.* fn we are lowering + decomp_fn: alternate implementation on our IR + broadcast: True to apply broadcasting to tensor inputs + type_promotion_kind: kind of type promotion applied to tensor inputs, `None` means no type promotion + convert_input_to_bool: some logical ops require inputs are converted to bool + """ + + @functools.wraps(decomp_fn) + def wrapped(*args, **kwargs): + assert len(args) <= 2 + out = decomp_fn(*args, **kwargs) + validate_ir(out) + return out + + aten_fns = get_overloads(aten_fn) + foreach_ops.update(aten_fns) + lowerings.update(dict.fromkeys(aten_fns, wrapped)) + return wrapped + + +def _register_lowering( + aten_fn, decomp_fn, broadcast, type_promotion_kind, convert_input_to_bool +): + """ + Add a lowering to lowerings dict + + Arguments: + aten_fn: torch.ops.aten.* fn we are lowering + decomp_fn: alternate implementation on our IR + broadcast: True to apply broadcasting to tensor inputs + type_promotion_kind: kind of type promotion applied to tensor inputs, `None` means no type promotion + convert_input_to_bool: some logical ops require inputs are converted to bool + """ + + @functools.wraps(decomp_fn) + def wrapped(*args, **kwargs): + args: Union[List[Any], Tuple[Any, ...], Dict[Any, Any]] = list(args) + unpacked = False + # TODO maybe we need to use pytrees here + if len(args) == 1 and isinstance(args[0], (list, tuple)): + unpacked = True + args = args[0] + + # explicitly assert for "out=" ops for better error messages + assert not any( + x == "out" for x in kwargs.keys() + ), "out= ops aren't yet supported" + # kwargs tensors not supported yet unless it's a fallback op + assert not any(isinstance(x, TensorBox) for x in kwargs.values()) or all( + fn in fallbacks for fn in aten_fn + ) + + args = transform_args( + args, broadcast, type_promotion_kind, convert_input_to_bool + ) + + if unpacked: + args = [args] + + out = decomp_fn(*args, **kwargs) + validate_ir(out) + + return out + + aten_fn = get_overloads(aten_fn) + + lowerings.update(dict.fromkeys(aten_fn, wrapped)) + return wrapped + + +def register_lowering( + aten_fn, + broadcast=False, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, + convert_input_to_bool=False, +): + """ + Shim to support decorator syntax. + """ + return functools.partial( + _register_lowering, + aten_fn, + broadcast=broadcast, + type_promotion_kind=type_promotion_kind, + convert_input_to_bool=convert_input_to_bool, + ) + + +def broadcast_symbolic_shapes(a, b): + """ + Broadcasting logic based on symbolic shapes. + + We give the shapes 0 and 1 concrete values, while all other shapes + are symbolic sympy formulas. 
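+
+    For example (sketch): broadcasting (s0, 1) with (1, s1) yields (s0, s1)
+    without adding guards, while broadcasting (s0,) with (s1,) installs a
+    guard that s0 == s1 and keeps the simpler expression.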
+ """ + output = [] + for x, y in itertools.zip_longest( + reversed(a), reversed(b), fillvalue=sympy.Integer(1) + ): + if y == 1: + output.append(x) + elif x == 1: + output.append(y) + else: + V.graph.sizevars.guard_equals(x, y) + if len(sympy.expand(y).free_symbols) < len(sympy.expand(x).free_symbols): + output.append(y) # prefer shorter formula + else: + output.append(x) + return tuple(reversed(output)) + + +def promote_constants(inputs, override_return_dtype=None, type_promotion_kind=None): + assert ( + override_return_dtype is None or type_promotion_kind is None + ), "only one of override_return_dtype or type_promotion_kind may be given" + + if override_return_dtype is None and type_promotion_kind is None: + type_promotion_kind = ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT + + if not any(isinstance(x, (sympy.Expr, int, float)) for x in inputs): + return inputs + if all(isinstance(x, (int, float, sympy.Expr)) for x in inputs): + dtype = override_return_dtype or get_promoted_dtype( + *inputs, type_promotion_kind=type_promotion_kind + ) + + def const_func(x): + if isinstance(x, sympy.Expr): + return ir.IndexingConstant(x, dtype, decode_device(None)) + else: + return ir.Constant(x, dtype, decode_device(None)) + + return [const_func(x) for x in inputs] + ex = next(x for x in inputs if isinstance(x, (TensorBox, ExpandView))) + out = [] + for x in inputs: + if isinstance(x, (int, float)): + out.append( + ExpandView.create( + ir.Constant(x, ex.get_dtype(), ex.get_device()), list(ex.get_size()) + ) + ) + elif isinstance(x, sympy.Expr): + out.append( + ExpandView.create( + IndexingConstant(x, ex.get_dtype(), ex.get_device()), + list(ex.get_size()), + ) + ) + else: + out.append(x) + + return out + + +def make_pointwise( + fn, + override_return_dtype=None, + override_device=None, + override_fn_when_input_bool=None, + override_fn_when_cuda_float64=None, + allow_alpha=False, + triton_fallback=None, +): + def inner(*inputs: List[TensorBox], alpha=None): + if triton_fallback is not None and any(map(is_triton, inputs)): + assert not allow_alpha # not implemented + return triton_fallback(*inputs) + + inputs = promote_constants(inputs, override_return_dtype) + if allow_alpha: + if alpha is not None and alpha != 1: + inputs = list(inputs) + inputs[-1] = mul(inputs[-1], alpha) + else: + assert alpha is None + loaders = [x.make_loader() for x in inputs] + ranges = inputs[0].get_size() + dtype = override_return_dtype or inputs[0].get_dtype() + is_cuda = decode_device(inputs[0].get_device()).type == "cuda" + + for other in inputs[1:]: + assert isinstance(other, ir.BaseConstant) or len(ranges) == len( + other.get_size() + ), f"ndim mismatch {fn} {ranges} {other.get_size()}" + + def inner_fn(index): + assert len(index) == len(ranges), f"wrong ndim {index} {ranges}" + if dtype == torch.bool and override_fn_when_input_bool is not None: + return override_fn_when_input_bool(*[load(index) for load in loaders]) + elif override_fn_when_cuda_float64 and is_cuda and dtype == torch.float64: + return override_fn_when_cuda_float64(*[load(index) for load in loaders]) + else: + return fn(*[load(index) for load in loaders]) + + if not override_device: + device = None + for i in inputs: + if i.get_device().type == "cuda": + device = i.get_device() + break + if not device: + device = inputs[0].get_device() + + device = override_device or device + + return Pointwise.create( + device=device, + dtype=dtype, + inner_fn=inner_fn, + ranges=ranges, + ) + + return inner + + +def make_foreach_pointwise(pw_fn, allow_alpha=False): + def 
inner(*inputs: List[List[TensorBox]], alpha=1): + # group by device, whether any of the inputs are dynamic, and whether their types match + # (proxy for type promotion) + def group_args(arg_pairs): + out = defaultdict(list) + for i, args in enumerate(arg_pairs): + use_foreach = not is_dynamic(*args) + device = None + for t in args: + if isinstance(t, TensorBox): + device = t.data.get_device() + break + assert ( + device is not None + ), "foreach op should have at least one tensor arg" + out[(device, use_foreach)].append((i, args)) + return out + + realize_outputs = ( + len(V.graph.current_node.users) == 0 + or V.graph.current_node.target in inplace_foreach_ops + ) + for node in V.graph.current_node.users: + for user in node.users: + if not (user.op == "call_function" and (user.target in foreach_ops)): + realize_outputs = True + + a_list_input = None + for input in inputs: + if isinstance(input, (list, tuple)): + a_list_input = input + break + assert ( + a_list_input is not None + ), "at least one input must be a list to a foreach op" + + # broadcast scalar inputs to match length of list inputs + broadcast_inputs = [] + for input in inputs: + if not isinstance(input, (list, tuple)): + broadcast_inputs.append([input] * len(a_list_input)) + else: + broadcast_inputs.append(input) + + groups = group_args(zip(*broadcast_inputs)) + + outputs = [None] * len(a_list_input) + for (device, use_foreach), group in groups.items(): + buffer_list = [] + for ( + output_ind, + args, + ) in group: + if allow_alpha: + output = pw_fn(*args, alpha=alpha) + else: + output = pw_fn(*args) + + outputs[output_ind] = output + + if device.type == "cuda" and use_foreach and realize_outputs: + buffer_list.append(output.realize()) + + if buffer_list: + V.graph.register_list(buffer_list) + + assert all(x is not None for x in outputs) + return outputs + + return inner + + +def to_dtype(x: TensorBox, dtype: torch.dtype, copy=False): + src_dtype = x.get_dtype() + if src_dtype == dtype: + return clone(x) if copy else x + + def _to_dtype(x): + return ops.to_dtype(x, dtype, src_dtype=src_dtype) + + return make_pointwise(_to_dtype, override_return_dtype=dtype)(x) + + +@register_lowering(prims.convert_element_type, type_promotion_kind=None) +def _convert_element_type(x: TensorBox, dtype: torch.dtype): + if dtype.is_complex or x.get_dtype().is_complex: + if x.get_size(): + # Decompose since aa aten fallback is more friendly for c++ codegen. + # This decompostion doesn't work for empty tensor, which needs more investigation. + dst = empty_like(x, dtype=dtype) + ir.InplaceCopyFallback.create(dst, x) + return dst + else: + return fallback_handler( + prims.convert_element_type.default, add_to_fallback_set=False + )(x, dtype) + return to_dtype(x, dtype, copy=True) + + +def to_dtype_bitcast(x: TensorBox, dtype: torch.dtype, *, copy=False): + x_dtype = x.get_dtype() + if x_dtype == dtype: + return clone(x) if copy else x + + def _get_primitive_bitwidth(dtype): + if dtype.is_floating_point: + return torch.finfo(dtype).bits + else: + return torch.iinfo(dtype).bits + + src_bits = _get_primitive_bitwidth(x_dtype) + dst_bits = _get_primitive_bitwidth(dtype) + if src_bits != dst_bits: + raise NotImplementedError( + f"bitcast {x_dtype} to different bitwidth type {dtype} is not supported yet." + ) + + def _to_dtype_bitcast(x): + # Because we may promote tensor type from float16 or bfloat16 + # to float, we will need to pass the original src dtype (i.e. 
x_dtype), + # which is used for correctly constructing type conversion before bitcast, + # which requires the bitwidth of the input tensor type is the same as the + # target type. + return ops.to_dtype_bitcast(x, dtype, x_dtype) + + return make_pointwise(_to_dtype_bitcast, override_return_dtype=dtype)(x) + + +@register_lowering(aten.view.dtype, type_promotion_kind=None) +def _view_dtype(x: TensorBox, dtype: torch.dtype): + if dtype.is_complex or x.get_dtype().is_complex: + return TensorBox.create( + ir.ComplexView.create(torch.ops.aten.view.dtype, x, dtype) + ) + return to_dtype_bitcast(x, dtype, copy=True) + + +def to_device(x: TensorBox, device: torch.device, *, copy=False): + device = decode_device(device) + if x.get_device() == device: + return clone(x) if copy else x + return TensorBox.create(ir.DeviceCopy.create(x, device)) + + +@register_lowering(prims.device_put, type_promotion_kind=None) +def _device_put(x: TensorBox, device: torch.device): + return to_device(x, device, copy=True) + + +def register_pointwise( + aten_fn, + name=None, + broadcast=True, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, + convert_input_to_bool=False, + override_return_dtype=None, + override_fn_when_input_bool=None, + allow_alpha=False, + use_libdevice_for_f64=False, + triton_fallback=None, +): + """A pointwise function that maps ops.{name} to inputs""" + name = name or aten_fn.__name__ + fn = ops_wrapper(name) + if use_libdevice_for_f64: + fn_libdevice = ops_wrapper("libdevice_" + name) + if override_fn_when_input_bool is not None: + override_fn_when_input_bool = ops_wrapper(override_fn_when_input_bool) + + fn = make_pointwise( + fn, + override_return_dtype=override_return_dtype, + override_fn_when_input_bool=override_fn_when_input_bool, + override_fn_when_cuda_float64=fn_libdevice if use_libdevice_for_f64 else None, # type: ignore[possibly-undefined] + allow_alpha=allow_alpha, + triton_fallback=triton_fallback, + ) + fn = register_lowering( + aten_fn, + broadcast=broadcast, + type_promotion_kind=type_promotion_kind, + convert_input_to_bool=convert_input_to_bool, + )(fn) + + if hasattr(prims, name): + register_lowering( + getattr(prims, name), + type_promotion_kind=None, + convert_input_to_bool=convert_input_to_bool, + )(fn) + return fn + + +def register_frexp(): + """A pointwise function that maps ops.frexp to inputs""" + name = "frexp" + frexp = ops_wrapper("frexp") + + def frexp0(*args, **kwargs): + return frexp(*args, **kwargs)[0] + + def frexp1(*args, **kwargs): + return frexp(*args, **kwargs)[1] + + pw_fns = [ + make_pointwise(frexp0), + make_pointwise(frexp1, override_return_dtype=torch.int32), + ] + + def fn(*args, **kwargs): + return pw_fns[0](*args, **kwargs), pw_fns[1](*args, **kwargs) + + fn = register_lowering( + aten.frexp, + )(fn) + + if hasattr(prims, name): + register_lowering( + getattr(prims, name), + type_promotion_kind=None, + )(fn) + return fn + + +register_frexp() + + +def register_foreach_pointwise( + aten_fn, + pointwise_lowering_fn, + allow_alpha=False, +): + fn = make_foreach_pointwise(pointwise_lowering_fn, allow_alpha=allow_alpha) + fn = _register_foreach_lowering(aten_fn, fn) + return fn + + +@register_lowering(aten.where, broadcast=False, type_promotion_kind=None) +def where(cond, a, b): + def fn(*args): + return ops.where(*args) + + if isinstance(a, (float, int)): + a = constant_like(a)(b) + if isinstance(b, (float, int)): + b = constant_like(b)(a) + + args = [cond, a, b] + dtype = get_promoted_dtype( + args[1], args[2], 
type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT + ) + indices = [i for i, x in enumerate(args) if isinstance(x, TensorBox)] + for i, x in zip(indices, broadcast_tensors(*[args[i] for i in indices])): + args[i] = x + for i in range(len(args)): + if isinstance(args[i], ir.Constant): + args[i] = ExpandView.create(args[i], list(args[indices[0]].get_size())) + return make_pointwise(fn, override_return_dtype=dtype)( + args[0], to_dtype(args[1], dtype), to_dtype(args[2], dtype) + ) + + +@register_lowering(aten.broadcast_tensors, broadcast=False, type_promotion_kind=None) +def broadcast_tensors(*inputs): + if len(inputs) == 1 and isinstance(inputs[0], (list, tuple)): + return broadcast_tensors(*inputs[0]) + target: List[sympy.Expr] = functools.reduce( + broadcast_symbolic_shapes, [x.get_size() for x in inputs], [] + ) + outputs = [] + for x in inputs: + sizes = x.get_size() + if len(sizes) != len(target) or any( + ((a == 1 and b != 1) or (a != 1 and b == 1)) for a, b in zip(sizes, target) + ): + x = expand(x, target) + outputs.append(x) + return outputs + + +@register_lowering([aten.alias, aten.detach, aten.detach_, aten.lift, prims.view_of]) +def nop(x): + return x # AOT autograd handles this for us + + +if hasattr(aten, "lift_fresh"): + register_lowering(aten.lift_fresh)(nop) + + +@register_lowering(aten.squeeze, type_promotion_kind=None) +def squeeze(x, dim=None): + assert isinstance(x, TensorBox) + if dim is None: + return TensorBox(SqueezeView.create(x.data)) + + dim = canonicalize_dims(len(x.get_size()), dim) + dims = set((dim,) if not isinstance(dim, tuple) else dim) + + new_shape = [] + for d, s in enumerate(x.get_size()): + if not (d in dims and V.graph.sizevars.evaluate_expr(sympy.Eq(s, 1))): + new_shape.append(s) + + # squeeze does nothing if the size isn't 1 + return view(x, new_shape) if new_shape != x.get_size() else x + + +@register_lowering(aten.squeeze_copy, type_promotion_kind=None) +def squeeze_copy(x, dim=None): + return clone(squeeze(x, dim)) + + +@register_lowering([aten.squeeze_]) +def squeeze_(x, dim=None): + val = squeeze(x, dim) + assert isinstance(x, TensorBox) + assert isinstance(val, TensorBox) + x.data = val.data + return x + + +@register_lowering(aten.isinf) +def isinf(x): + if is_integer_type(x): + return full_like(x, False, dtype=torch.bool) + fn = ops_wrapper("isinf") + return make_pointwise(fn, override_return_dtype=torch.bool)(x) + + +@register_lowering(aten.isnan) +def isnan(x): + if is_integer_type(x): + return full_like(x, False, dtype=torch.bool) + fn = ops_wrapper("isnan") + return make_pointwise(fn, override_return_dtype=torch.bool)(x) + + +@register_lowering(aten.ceil) +def ceil(x): + if is_integer_type(x): + return clone(x) + fn = ops_wrapper("ceil") + return make_pointwise(fn)(x) + + +@register_lowering(aten.floor) +def floor(x): + if is_integer_type(x): + return clone(x) + fn = ops_wrapper("floor") + return make_pointwise(fn)(x) + + +@register_lowering(aten.round.default) +def round(x): + if is_integer_type(x): + return clone(x) + else: + fn = ops_wrapper("round") + return make_pointwise(fn)(x) + + +@register_lowering(aten.trunc) +def trunc(x): + if is_integer_type(x): + return clone(x) + fn = ops_wrapper("trunc") + return make_pointwise(fn)(x) + + +@register_lowering(aten.expand, type_promotion_kind=None) +def expand(x, sizes): + (x,) = promote_constants([x]) + if isinstance(x, ir.BaseConstant): + return ExpandView.create(x, tuple(sizes)) + assert isinstance(x, TensorBox) + assert isinstance(sizes, (list, tuple)) + if tuple(x.get_size()) 
== tuple(sizes): + return x + + if not any(V.graph.sizevars.shape_env.is_unbacked_symint(s) for s in x.get_size()): + x_size_product = V.graph.sizevars.size_hint(sympy_product(x.get_size())) + # TODO: It would be better to realize the input if any of its sizes + # are unbacked, because typically the size will be non-zero. However, + # this cannot be done directly as below as we'll choke on the size_hint + # here + if x_size_product > 0 and not any( + V.graph.sizevars.shape_env.is_unbacked_symint(s) for s in sizes + ): + # maybe realize input before broadcasting it + x.mark_reuse( + V.graph.sizevars.size_hint(sympy_product(sizes)) // x_size_product + ) + return TensorBox(ExpandView.create(x.data, tuple(sizes))) + + +@register_lowering(prims.broadcast_in_dim, type_promotion_kind=None) +def broadcast_in_dim(a, shape, broadcast_dimensions): + s = list(shape) + for broadcast_dimension in broadcast_dimensions: + s[broadcast_dimension] = -1 + + v = a + for idx, x in enumerate(s): + if x != -1: + v = unsqueeze(v, idx) + + return expand(v, shape) + + +@register_lowering(aten.expand_as, type_promotion_kind=None) +def expand_as(x, y): + return expand(x, y.get_size()) + + +@register_lowering(aten.repeat) +def repeat(x, repeats): + old_size = list(x.get_size()) + if len(repeats) > len(old_size): + old_size = [sympy.Integer(1)] * (len(repeats) - len(old_size)) + old_size + x = view(x, list(old_size)) + assert len(repeats) == len(x.get_size()) + + new_size = list(x.get_size()) + + zero_tensor = False + for i in range(len(repeats)): + if repeats[i] == 0: + zero_tensor = True + new_size[i] = new_size[i] * repeats[i] + + if zero_tensor: + return empty(new_size, dtype=x.get_dtype(), device=x.get_device()) + if all((a == 1 or b == 1) for a, b in zip(repeats, old_size)): + return expand(x, new_size) + + x_loader: Callable[[Any], Any] + + def inner_fn(index): + assert len(index) == len(repeats) + index = list(index) + for i in range(len(repeats)): + if repeats[i] != 1: + if old_size[i] == 1: + index[i] = sympy.Integer(0) + else: + index[i] = ModularIndexing(index[i], 1, old_size[i]) + return x_loader(index) + + old_size_product = V.graph.sizevars.size_hint(sympy_product(old_size)) + if old_size_product > 0: + # maybe realize the input + x.mark_reuse( + V.graph.sizevars.size_hint(sympy_product(new_size)) // old_size_product + ) + + x_loader = x.make_loader() + return Pointwise.create( + device=x.get_device(), + dtype=x.get_dtype(), + inner_fn=inner_fn, + ranges=list(new_size), + ) + + +@register_lowering(aten._unsafe_view, type_promotion_kind=None) +@register_lowering(aten.view, type_promotion_kind=None) +@register_lowering(aten.reshape, type_promotion_kind=None) +def view(x, sizes): + assert isinstance(x, TensorBox) + assert isinstance(sizes, (list, tuple)) + return TensorBox(View.create(x.data, sizes)) + + +@register_lowering(aten.permute, type_promotion_kind=None) +def permute(x, dims): + assert isinstance(x, TensorBox) + assert isinstance(dims, (list, tuple)) + return TensorBox(PermuteView.create(x.data, tuple(dims))) + + +@register_lowering(aten.slice, type_promotion_kind=None) +def slice_(x, dim=0, start=0, end=2**63, step=1): + assert isinstance(x, TensorBox) + dim = _validate_dim(x, dim, 0) + dim_size = x.get_size()[dim] + return TensorBox(ir.SliceView.create(x.data, dim, start, end, step)) + + +@register_lowering(aten.as_strided, type_promotion_kind=None) +def as_strided(x, size, stride, storage_offset=None): + if isinstance(x, TensorBox) and isinstance(x.data, ir.BaseView): + # as_strided ignores views 
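+        # (as_strided is defined relative to the underlying storage, so unwrap
+        # back to the base buffer; it is then realized and reinterpreted with
+        # the requested size/stride via ir.ReinterpretView below)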
+ x = x.data.unwrap_view() + x.realize() + if not ir.is_storage_and_layout(x): + raise NotImplementedError(f"unrealized as_strided({x}, ...)") + storage, old_layout = ir.as_storage_and_layout(x) + new_layout = ir.FixedLayout( + old_layout.device, + old_layout.dtype, + [sympy.expand(s) for s in size], + [sympy.expand(s) for s in stride], + sympy.expand(storage_offset or 0), + ) + return TensorBox(ir.ReinterpretView(storage, new_layout)) + + +@register_lowering(aten.as_strided_, type_promotion_kind=None) +def as_strided_(x, size, stride, storage_offset=None): + assert isinstance(x, TensorBox) + x.data = as_strided(x, size, stride, storage_offset).data + return x + + +@register_lowering(aten.as_strided_copy, type_promotion_kind=None) +def as_strided_copy(x, size, stride, storage_offset=None): + result = as_strided(x, size, stride, storage_offset) + return clone(result) + + +def pointwise_cat(inputs, dim=0): + # (inclusive, exclusive) + inputs_ranges: List[Tuple[sympy.Expr, sympy.Expr]] = [] + prev_end = 0 + for inp in inputs: + inputs_ranges.append((prev_end, prev_end + inp.get_size()[dim])) # type: ignore[arg-type] + prev_end = inputs_ranges[-1][-1] # type: ignore[assignment] + + inputs_loaders = [inp.make_loader() for inp in inputs] + + def inner_fn(idx): + idx_dim = ops.index_expr(idx[dim], torch.int64) + + masks = [] + masked_loads = [] + for i in range(len(inputs)): + start = ( + ops.constant(0, torch.int64) + if i == 0 + else ops.index_expr(inputs_ranges[i][0], torch.int64) + ) + end = ops.index_expr(inputs_ranges[i][1], torch.int64) + + start_cond = ops.ge(idx_dim, start) + end_cond = ops.lt(idx_dim, end) + if i == 0: + mask = end_cond + elif i == len(inputs) - 1: + mask = start_cond + else: + mask = ops.and_(start_cond, end_cond) + + masks.append(mask) + idx_load = list(idx) + + # if we're concatting [4], [2] + # when we index the second tensor for 5 we want to index 5 - 4 + idx_load[dim] -= inputs_ranges[i][0] + + masked_loads.append( + ops.masked( + mask, + lambda: inputs_loaders[i](idx_load), + 0.0, # this value should be unused + ), + ) + + next_val = masked_loads[-1] + for i in range((len(inputs)) - 2, -1, -1): + next_val = ops.where( + masks[i], + masked_loads[i], + next_val, + ) + return next_val + + new_size = list(inputs[0].get_size()) + new_size[dim] = inputs_ranges[-1][-1] + + return Pointwise.create( + device=inputs[0].get_device(), + dtype=inputs[0].get_dtype(), + inner_fn=inner_fn, + ranges=new_size, + ) + + +@register_lowering(quantized_decomposed.quantize_per_channel, type_promotion_kind=None) +def quantized_decomposed_quantize_per_channel( + input: TensorBox, + scales: TensorBox, + zero_points: TensorBox, + axis: int, + quant_min: int, + quant_max: int, + dtype: torch.dtype, +) -> TensorBox: + assert len(scales.get_size()) == 1, "expect scales 1 dim" + assert len(zero_points.get_size()) == 1, "expect zero_points 1 dim" + + if input.get_dtype() == torch.bfloat16: + input = to_dtype(input, torch.float32) + assert ( + input.get_dtype() == torch.float32 + ), f"Expecting input to have dtype torch.float32, but got dtype: {input.get_dtype()}" + assert axis < len( + input.get_size() + ), f"Expecting axis to be < {len(input.get_size())}" + + input_loader = input.make_loader() + scales_loader = scales.make_loader() + zero_points_loader = zero_points.make_loader() + + def inner_fn(idx): + channel_idx = (idx[axis],) + + input = input_loader(idx) + scale = scales_loader(channel_idx) + zero_point = zero_points_loader(channel_idx) + qmin, qmax = _create_constants(quant_min, 
quant_max, dtype=torch.float32) + + if scales.dtype != torch.float32: + scale = ops.to_dtype(scale, torch.float32) + if zero_points.dtype != torch.int32: + zero_point = ops.to_dtype(zero_point, torch.int32) + inv_scale = ops.reciprocal(scale) + val = ops.round(input * inv_scale) + zero_point + clamped = ops.maximum(qmin, ops.minimum(qmax, val)) + return ops.to_dtype(clamped, dtype) + + return Pointwise.create( + device=input.get_device(), + dtype=dtype, + inner_fn=inner_fn, + ranges=input.get_size(), + ) + + +@register_lowering( + quantized_decomposed.dequantize_per_channel, type_promotion_kind=None +) +def quantized_decomposed_dequantize_per_channel( + input: TensorBox, + scales: TensorBox, + zero_points: TensorBox, + axis: int, + quant_min: int, + quant_max: int, + dtype: torch.dtype, +) -> TensorBox: + assert len(scales.get_size()) == 1, "expect scales 1 dim" + assert len(zero_points.get_size()) == 1, "expect zero_points 1 dim" + assert ( + input.get_dtype() == dtype + ), f"Expecting input to have dtype {dtype}, but got dtype: {input.get_dtype()}" + assert axis < len( + input.get_size() + ), f"Expecting axis to be < {len(input.get_size())}" + + input_loader = input.make_loader() + scales_loader = scales.make_loader() + zero_points_loader = zero_points.make_loader() + + def inner_fn(idx): + channel_idx = (idx[axis],) + + input = input_loader(idx) + scale = scales_loader(channel_idx) + zero_point = zero_points_loader(channel_idx) + + if scales.dtype != torch.float32: + scale = ops.to_dtype(scale, torch.float32) + if zero_points.dtype != torch.float32: + zero_point = ops.to_dtype(zero_point, torch.float32) + val = ops.sub(ops.to_dtype(input, torch.float32), zero_point) * scale + return val + + return Pointwise.create( + device=input.get_device(), + dtype=torch.float32, + inner_fn=inner_fn, + ranges=input.get_size(), + ) + + +@register_lowering(aten.cat) +def cat(inputs, dim=0): + if all(input.get_dtype() in [torch.int8, torch.uint8] for input in inputs): + # TODO Remove this fallback when we support vectorization + # code gen with uint8 data type directly. 
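# Illustrative aside, not part of the vendored file: the per-channel quantize and
# dequantize lowerings above compute, with one scale/zero_point per slice along
# `axis`, q = clamp(round(x / scale) + zero_point, quant_min, quant_max) and
# x_hat = (q - zero_point) * scale. A small eager sketch with arbitrary values:
import torch

x = torch.randn(4, 3)                                 # axis = 1: one scale per column
scales = torch.tensor([0.05, 0.10, 0.20])
zero_points = torch.tensor([0.0, 1.0, -2.0])
q = torch.clamp(torch.round(x / scales) + zero_points, -128, 127).to(torch.int8)
x_hat = (q.to(torch.float32) - zero_points) * scales  # dequantized approximation of x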
+ for input in inputs: + input.realize() + if all(len(input.get_size()) == 4 for input in inputs): + inputs, _ = require_channels_last(aten.cat, *inputs) + return fallback_handler(aten.cat.default)(inputs, dim) + + if len(inputs) == 1: + return clone(inputs[0]) + + dim = _validate_dim(inputs[0], dim, 0) + dtype = get_promoted_dtype( + *inputs, type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT + ) + inputs = [to_dtype(inp, dtype) for inp in inputs] + + def unwrap_tensor(x: Union[TensorBox, ir.StorageBox]) -> ir.IRNode: + if isinstance(x, TensorBox): + if isinstance(x.data, ir.BaseView): + return x.data.unwrap_view() + else: + return x.data + + if isinstance(x, ir.StorageBox): + return x.data + + return x + + def should_lower_cat_input(x) -> bool: + # Unrealized inputs will not be storage and layouts, and we dont want to realize + # them in case we want to fuse + if ir.is_storage_and_layout(x): + storage, _ = ir.as_storage_and_layout(x, freeze=False) + return not ir.ConcatKernel.can_realize_into_without_copy(storage) + + if isinstance(x, (TensorBox, ir.StorageBox)): + return should_lower_cat_input(unwrap_tensor(x)) + + if isinstance(x, ir.Pointwise): + return True + + return False + + def is_reduction(t): + return isinstance(t, ir.ComputedBuffer) and isinstance(t.data, ir.Reduction) + + def can_fuse_reduction(t): + if isinstance(t, (TensorBox, ir.StorageBox)): + return can_fuse_reduction(unwrap_tensor(t)) + return ( + is_reduction(t) + or isinstance(t, ir.Pointwise) + and any( + can_fuse_reduction(V.graph.get_buffer(read)) + for read in t.get_read_names() + ) + ) + + # fusing reducutions into computed concat buffer can cause regressions. + fusable_reduction = any(can_fuse_reduction(t) for t in inputs) + + # TODO: We observed negative performance impact of pointwise_cat optimization on CPU so disabled it. + # We will revisit this later after enabling vectorization on index_expr. 
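# Illustrative aside, not part of the vendored file: pointwise_cat above turns the
# concat into a single pointwise kernel by comparing each output index against the
# cumulative input ranges and loading the matching input with a shifted index.
# The same selection logic written with eager ops for two 1-D inputs:
import torch

a, b = torch.arange(4.0), torch.arange(2.0)
idx = torch.arange(a.numel() + b.numel())
in_a = idx < a.numel()
out = torch.where(
    in_a,
    a[idx.clamp(max=a.numel() - 1)],       # index clamped into a's range
    b[(idx - a.numel()).clamp(min=0)],     # index shifted into b's range
)
assert torch.equal(out, torch.cat([a, b]))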
+ if inputs[0].get_device().type == "cpu" or fusable_reduction: + return TensorBox(ir.ConcatKernel.create(inputs, dim)) + + def op_count(x): + if isinstance(x, (TensorBox, ir.StorageBox)): + return op_count(unwrap_tensor(x)) + + # this will correspond to a direct memory read + if not isinstance(x, ir.Pointwise): + return 0 + + count = x.inner_fn_opcount() + for read in x.get_read_names(): + count += op_count(V.graph.get_buffer(read)) + + return count + + # as of inputs increase, possibility for register spilling also increases + # past a certain threshold of inputs we only fuse if the if the input kernels + # are simple + # not sure if we want to expose to users via config since logic may change in future + MAX_COMPLEX_POINTWISE_CAT = 8 + MAX_SIMPLE_OP_COUNT = 2 + + if len(inputs) <= MAX_COMPLEX_POINTWISE_CAT or ( + (len(inputs) <= config.max_pointwise_cat_inputs) + and all(op_count(t) <= MAX_SIMPLE_OP_COUNT for t in inputs) + ): + pointwise_uses = all(is_pointwise_use(use) for use in V.current_node.users) + all_pointwise_inputs = all(should_lower_cat_input(inp) for inp in inputs) + any_pointwise_inputs = any(should_lower_cat_input(inp) for inp in inputs) + + if all_pointwise_inputs or (any_pointwise_inputs and pointwise_uses): + return pointwise_cat(inputs, dim) + + return TensorBox(ir.ConcatKernel.create(inputs, dim)) + + +@register_lowering(aten.diagonal, type_promotion_kind=None) +def diagonal(input, offset: int = 0, dim1: int = 0, dim2: int = 1): + original_shape = input.get_size() + num_dims = len(original_shape) + dim1 = canonicalize_dim(idx=dim1, rank=num_dims) + dim2 = canonicalize_dim(idx=dim2, rank=num_dims) + + check( + dim1 != dim2, lambda: f"diagonal dimensions cannot be identical {dim1}, {dim2}" + ) + + offset_negative = V.graph.sizevars.evaluate_expr(sympy.Lt(offset, 0)) + if offset_negative: + diag_size = max(min(original_shape[dim1] + offset, original_shape[dim2]), 0) + else: + diag_size = max(min(original_shape[dim1], original_shape[dim2] - offset), 0) + + base_idx = (0, 0) + if offset_negative: + base_idx = (-offset, 0) + else: + base_idx = (0, offset) + + sizes = [s for i, s in enumerate(original_shape) if i not in (dim1, dim2)] + sizes.append(diag_size) + + def reindexer(idx): + diag_idx = idx[-1] + original_idx = [0] * len(original_shape) + cur_dim = 0 + for d in range(num_dims): + if d == dim1: + original_idx[d] = diag_idx + base_idx[0] + elif d == dim2: + original_idx[d] = diag_idx + base_idx[1] + else: + original_idx[d] = idx[cur_dim] + cur_dim += 1 + + assert cur_dim == len(original_shape) - 2 + return original_idx + + return TensorBox(ir.GenericView.create(input, sizes, reindexer)) + + +@register_lowering(aten.diagonal_copy, type_promotion_kind=None) +def diagonal_copy(input, offset: int = 0, dim1: int = 0, dim2: int = 1): + return clone(diagonal(input, offset, dim1, dim2)) + + +@register_lowering(aten.diagonal_scatter, type_promotion_kind=None) +def diagonal_scatter(input, src, offset: int = 0, dim1: int = 0, dim2: int = 1): + output = clone(input) + target = diagonal(output, offset, dim1, dim2) + mutate_to(target, src) + return output + + +@register_lowering(aten.select, type_promotion_kind=None) +def select(x, dim, idx): + idx = View.handle_negative_index(idx, x.get_size()[dim]) + return squeeze(slice_(x, dim, idx, idx + 1), dim) + + +@register_lowering(aten.split, type_promotion_kind=None) +def split(x, sizes, dim=0): + dim = _validate_dim(x, dim, 0) + x_size = V.graph.sizevars.evaluate_static_shape(x.get_size()[dim]) + if isinstance(sizes, sympy.Expr): + # 
TODO: We don't have to guard on sizes per se, but the number + # of splits must stay constant + sizes = V.graph.sizevars.evaluate_static_shape(sizes) + if isinstance(sizes, (int, sympy.Integer)): + sizes = [sizes] * ((x_size + sizes - 1) // sizes) + result = [] + start = 0 + for size in sizes: + end = start + size + result.append(slice_(x, dim, start, end)) + start = end + return result + + +@register_lowering(aten.split_with_sizes, type_promotion_kind=None) +def split_with_sizes(x, sizes, dim=0): + return split(x, sizes, dim) + + +@register_lowering(aten.unbind, type_promotion_kind=None) +def unbind(x, dim=0): + dim = _validate_dim(x, dim, 0) + x_size = V.graph.sizevars.evaluate_static_shape(x.get_size()[dim]) + result = [] + for i in range(x_size): + result.append(select(x, dim, i)) + return result + + +@register_lowering(aten.unfold, type_promotion_kind=None) +def unfold(x, dimension, size, step): + sizes = x.get_size() + ndim = len(sizes) + dim = canonicalize_dim(ndim, dimension) + + if ndim == 0: + return slice_(unsqueeze(x, 0), end=size) + + dim_size = sizes[dim] + sizevars = V.graph.sizevars + sizevars.guard_leq(size, dim_size) + sizevars.guard_lt(0, step) # type: ignore[arg-type] + + new_dim_size = FloorDiv(dim_size - size, step) + 1 + if sizevars.size_hint(dim_size) > 0: + x.mark_reuse(sizevars.size_hint(CeilDiv(new_dim_size * size, dim_size))) + + out_size = [*sizes[:dim], new_dim_size, *sizes[dim + 1 :], size] + + def reindexer(idx): + dim_idx = idx[-1] + idx[dim] * step + return (*idx[:dim], dim_idx, *idx[dim + 1 : -1]) + + return TensorBox(ir.GenericView.create(x, out_size, reindexer)) + + +@register_lowering(aten.unsqueeze, type_promotion_kind=None) +def unsqueeze(x, dim): + dim = _validate_dim(x, dim, 1) + new_shape = list(x.get_size()) + new_shape.insert(dim, sympy.Integer(1)) + return view(x, new_shape) + + +@register_lowering(aten.unsqueeze_, type_promotion_kind=None) +def unsqueeze_(x, dim): + val = unsqueeze(x, dim) + assert isinstance(x, TensorBox) + assert isinstance(val, TensorBox) + x.data = val.data + return x + + +def _validate_dim(x, dim, offset=0): + assert isinstance(dim, int) + ndim = len(x.get_size()) + if dim < 0: + dim += ndim + offset + assert 0 <= dim < ndim + offset + return dim + + +@register_lowering(aten.glu) +def glu(x, dim=-1): + dim = _validate_dim(x, dim, 0) + # TODO: don't guard on static shape here + new_len = V.graph.sizevars.evaluate_static_shape(x.get_size()[dim]) // 2 + a = slice_(x, dim, 0, new_len) + b = slice_(x, dim, new_len, new_len * 2) + return mul(a, sigmoid(b)) + + +def register_onednn_fusion_ops(): + if torch._C._has_mkldnn: + cpu_needs_realized_inputs = [ + torch.ops.mkldnn._convolution_pointwise, + torch.ops.mkldnn._convolution_pointwise_, + torch.ops.mkldnn._convolution_transpose_pointwise, + torch.ops.mkldnn._linear_pointwise, + aten.mkldnn_rnn_layer.default, + torch.ops.onednn.qconv2d_pointwise, + ] + + @register_lowering(torch.ops.mkldnn._convolution_pointwise) + def convolution_unary( + x: TensorBox, + weight: TensorBox, + bias: TensorBox, + padding, + stride, + dilation, + groups, + attr, + scalars, + algorithm, + ): + return TensorBox.create( + ir.ConvolutionUnary.create( + x, + weight, + bias, + padding, + stride, + dilation, + groups, + attr, + scalars, + algorithm, + ) + ) + + @register_lowering(torch.ops.mkldnn._convolution_pointwise.binary) + def convolution_binary( + x: TensorBox, + other: TensorBox, + weight: TensorBox, + bias: TensorBox, + padding, + stride, + dilation, + groups, + binary_attr, + binary_alpha, + 
unary_attr, + unary_scalars, + unary_algorithm, + ): + return TensorBox.create( + ir.ConvolutionBinary.create( + x, + other, + weight, + bias, + padding, + stride, + dilation, + groups, + binary_attr, + binary_alpha, + unary_attr, + unary_scalars, + unary_algorithm, + ) + ) + + @register_lowering(torch.ops.mkldnn._convolution_pointwise_.binary) + def convolution_binary_inplace( + x: TensorBox, + other: TensorBox, + weight: TensorBox, + bias: TensorBox, + padding, + stride, + dilation, + groups, + binary_attr, + binary_alpha, + unary_attr, + unary_scalars, + unary_algorithm, + ): + return TensorBox.create( + ir.ConvolutionBinaryInplace.create( + x, + other, + weight, + bias, + padding, + stride, + dilation, + groups, + binary_attr, + binary_alpha, + unary_attr, + unary_scalars, + unary_algorithm, + ) + ) + + @register_lowering(torch.ops.mkldnn._linear_pointwise) + def linear_unary( + x: TensorBox, w: TensorBox, b: TensorBox, attr, scalars, algorithm + ): + return TensorBox.create( + ir.LinearUnary.create(x, w, b, attr, scalars, algorithm) + ) + + @register_lowering(torch.ops.mkldnn._linear_pointwise.binary) + def linear_binary(x: TensorBox, y: TensorBox, w: TensorBox, b: TensorBox, attr): + return TensorBox.create(ir.LinearBinary.create(x, y, w, b, attr)) + + @register_lowering(torch.ops.mkldnn._convolution_transpose_pointwise) + def convolution_transpose_unary( + x: TensorBox, + weight: TensorBox, + bias: TensorBox, + padding, + output_padding, + stride, + dilation, + groups, + attr, + scalars, + algorithm, + ): + return TensorBox.create( + ir.ConvolutionTransposeUnary.create( + x, + weight, + bias, + padding, + output_padding, + stride, + dilation, + groups, + attr, + scalars, + algorithm, + ) + ) + + @register_lowering(aten.mkldnn_rnn_layer.default) + def mkldnn_rnn_layer( + x: TensorBox, + w0: TensorBox, + w1: TensorBox, + w2: TensorBox, + w3: TensorBox, + hx: TensorBox, + cx: TensorBox, + reverse: bool, + batch_sizes: List[int], + mode: int, + hidden_size: int, + num_layers: int, + has_biases: bool, + bidirectional: bool, + batch_first: bool, + train: bool, + ): + return pytree.tree_map( + TensorBox.create, + ir.MkldnnRnnLayer.create( + x, + w0, + w1, + w2, + w3, + hx, + cx, + reverse, + batch_sizes, + mode, + hidden_size, + num_layers, + has_biases, + bidirectional, + batch_first, + train, + ), + ) + + @register_lowering(torch.ops.onednn.qconv2d_pointwise, type_promotion_kind=None) + def qconvolution_unary( + x: TensorBox, + x_scale, + x_zp, + packed_weight: TensorBox, + w_scale: TensorBox, + w_zp: TensorBox, + bias: TensorBox, + stride, + padding, + dilation, + groups, + o_inv_scale, + o_zero_point, + output_dtype, + attr, + scalars, + algorithm, + ): + return TensorBox.create( + ir.QConvPointWisePT2E.create( + x, + x_scale, + x_zp, + packed_weight, + w_scale, + w_zp, + bias, + stride, + padding, + dilation, + groups, + o_inv_scale, + o_zero_point, + output_dtype, + attr, + scalars, + algorithm, + ) + ) + + @register_lowering( + torch.ops.onednn.qconv2d_pointwise.binary, type_promotion_kind=None + ) + def qconvolution_binary( + x: TensorBox, + x_scale, + x_zp, + accum: TensorBox, + accum_scale, + accum_zp, + packed_weight: TensorBox, + w_scale: TensorBox, + w_zp: TensorBox, + bias: TensorBox, + stride, + padding, + dilation, + groups, + o_inv_scale, + o_zero_point, + output_dtype, + binary_attr, + alpha, + unary_attr, + unary_scalars, + unary_algorithmm, + ): + if ( + binary_attr == "sum" + and output_dtype in [torch.float32, torch.bfloat16] + and accum.get_dtype() in [torch.float32, 
torch.bfloat16] + and accum.get_dtype() != output_dtype + ): + # For int8-mixed-bf16 quantization and inplace add, + # there is case when accum dtype is float32 but output dtype is bfloat16. + # Since the accum will be inplaced changed with post op sum, + # we will do accum dtype convertion here. + accum = to_dtype(accum, output_dtype) + return TensorBox.create( + ir.QConvPointWiseBinaryPT2E.create( + x, + x_scale, + x_zp, + accum, + accum_scale, + accum_zp, + packed_weight, + w_scale, + w_zp, + bias, + stride, + padding, + dilation, + groups, + o_inv_scale, + o_zero_point, + output_dtype, + binary_attr, + alpha, + unary_attr, + unary_scalars, + unary_algorithmm, + ) + ) + + @register_lowering(torch.ops.onednn.qlinear_pointwise, type_promotion_kind=None) + def qlinear_unary( + x: TensorBox, + x_scale, + x_zp, + packed_weight: TensorBox, + w_scale: TensorBox, + w_zp: TensorBox, + bias: TensorBox, + o_inv_scale, + o_zero_point, + output_dtype, + attr, + scalars, + algorithm, + ): + return TensorBox.create( + ir.QLinearPointwisePT2E.create( + x, + x_scale, + x_zp, + packed_weight, + w_scale, + w_zp, + bias, + o_inv_scale, + o_zero_point, + output_dtype, + attr, + scalars, + algorithm, + ) + ) + + if torch._C.has_mkl: + cpu_needs_realized_inputs.append(torch.ops.mkl._mkl_linear) + + @register_lowering(torch.ops.mkl._mkl_linear) + def mkl_packed_linear( + x: TensorBox, + packed_w: TensorBox, + orig_w: TensorBox, + b: TensorBox, + batch_size, + ): + result = TensorBox.create( + ir.MKLPackedLinear.create(x, packed_w, orig_w, batch_size) + ) + if b is not None: + result = add(result, b) + return result + + add_needs_realized_inputs(cpu_needs_realized_inputs) + else: + pass + + +register_onednn_fusion_ops() + + +def fallback_handler(kernel, add_to_fallback_set=True): + if add_to_fallback_set: + fallbacks.add(kernel) + + def handler(*args, **kwargs): + return pytree.tree_map( + TensorBox.create, ir.FallbackKernel.create(kernel, *args, **kwargs) + ) + + return handler + + +@functools.lru_cache(None) +def _warn_complex_not_supported(): + warnings.warn( + "Torchinductor does not support code generation for complex operators. Performance may be worse than eager." + ) + + +# There are some types (CPU) which we accept as input but not as +# output. +def unsupported_input_tensor(t: torch._subclasses.FakeTensor, parent=None): + "Do not support reading or writing to this tensor" + if t.is_complex(): + # Complex views are supported with IR ComplexView + if parent and parent.target in ( + torch.ops.aten.view.dtype, + torch.ops.prims.convert_element_type.default, + ): + return False + _warn_complex_not_supported() + return True + return False + + +def unsupported_output_tensor(t: torch._subclasses.FakeTensor, parent=None): + "Do not support writing tensor but can read from it" + if unsupported_input_tensor(t, parent): + return True + return t.is_cpu and config.disable_cpp_codegen + + +def fallback_node_due_to_unsupported_type(node: torch.fx.Node, allow_cpu_inputs=True): + # Custom fallback lowering + if node.target is aten.view_as_complex.default: + return False + + # We should be able to remove this special case once `disable_cpp_codegen` is killed. 
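# Illustrative aside, not part of the vendored file: the skip check below walks every
# FakeTensor leaf in node.meta["val"] (which may be a nested structure) and falls
# back to eager when any leaf is unsupported, e.g. complex. A minimal sketch of the
# same traversal over an arbitrary pytree of tensors:
import torch
import torch.utils._pytree as pytree

meta_val = {"out": (torch.empty(2, dtype=torch.complex64), torch.empty(3))}
has_unsupported = any(
    isinstance(t, torch.Tensor) and t.is_complex()
    for t in pytree.tree_leaves(meta_val)
)
assert has_unsupported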
+ if node.target is aten.lift_fresh_copy.default: + return False + + def check_skip_condition(node, parent, is_output): + if not isinstance(node, torch.fx.Node): + return False + + if "val" not in node.meta: + return False + + for meta in pytree.tree_leaves(node.meta["val"]): + if not isinstance(meta, torch._subclasses.FakeTensor): + continue + + if is_output: + if unsupported_output_tensor(meta, parent): + return True + else: + if unsupported_input_tensor(meta, parent): + return True + + return False + + # only skip codegen if there is a cpu output, not input + for arg in pytree.arg_tree_leaves(*node.args, **node.kwargs): + if check_skip_condition(arg, node, is_output=False): + return True + + return check_skip_condition(node, node, is_output=True) + + +def make_fallback(op, layout_constraint=None, warn=True): + assert op not in decompositions, f"both a fallback and a decomp for same op: {op}" + if ( + warn + and bool(os.getenv("CI")) + and get_decompositions([op]) + # if fallback_random, we allow not decomposing random + and not ( + config.fallback_random + and op in torch._decomp.decompositions_for_rng.extra_random_decomps + ) + ): + # Note: 'warn' is holdover from when this was a warning, but for ops that previously + # set warn=False we do not want a CI error. + # Ignore the 'suppress errors' configs in CI, as this particular warning happens on startup anyway and is not + # likely to be triggered preferentially on one CI config over another. + if torch._dynamo.config.suppress_errors: + torch._dynamo.config.suppress_errors = False + log.warning( + "A make_fallback error occurred in suppress_errors config," + " and suppress_errors is being disabled to surface it." + ) + raise AssertionError( + f"make_fallback({op}): a decomposition exists, we should switch to it." + " To fix this error, either add a decomposition to core_aten_decompositions (preferred)" + " or inductor_decompositions, and delete the corresponding `make_fallback` line." + " Get help from the inductor team if unsure, don't pick arbitrarily to unblock yourself.", + ) + + def register_fallback(op_overload): + add_needs_realized_inputs(op_overload) + if layout_constraint is not None: + add_layout_constraint(op_overload, layout_constraint) + return register_lowering(op_overload, type_promotion_kind=None)( + fallback_handler(op_overload) + ) + + if isinstance(op, torch._ops.OpOverloadPacket): + for ol in op.overloads(): + op_overload = getattr(op, ol) + register_fallback(op_overload) + elif isinstance(op, (torch._ops.OpOverload, torch._ops.HigherOrderOperator)): + register_fallback(op) + else: + raise RuntimeError(f"Unsupported fallback {op} with type {type(op)}") + + +def philox_rand_offset(shape): + """ + TorchInductor offset calculation differs from PyTorch eager offset + calculation for random ops (tl.rand vs torch.rand). In future, we should + strive for same impl for tl.rand and torch.rand. + """ + numel = 1 + for s in shape: + numel = numel * s + return tensor(numel, dtype=torch.int64) + + +@register_lowering(torch.ops.rngprims.philox_rand, type_promotion_kind=None) +def philox_rand(size, seed, offset, stride, device, dtype): + # stride arg is optional and will be used in future for distributed random + # ops. Currently, its unused. + random_pos = ir.FixedLayout( + device, + dtype, + size, + ir.FlexibleLayout.contiguous_strides(size), + ).make_indexer() + seed_loader = seed.make_loader() + offset_loader = offset.make_loader() + + def inner_fn(index): + # Both seed and offset in the philox_rand op are tensors. 
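# Illustrative aside, not part of the vendored file: philox_rand_offset above returns
# the number of elements drawn, so each call advances the Philox counter by numel.
# A sketch of that bookkeeping with a hypothetical next_offset helper:
import torch

def next_offset(shape, current):
    numel = 1
    for s in shape:
        numel *= s
    return current + torch.tensor(numel, dtype=torch.int64)

offset = torch.tensor(0, dtype=torch.int64)
offset = next_offset((4, 3), offset)  # counter advanced by 12 random values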
+ # torch seed and offsets are of type int64, but tl.rand accepts int32 + seed_index_expr = ops.to_dtype(seed_loader([]), torch.int32) + offset_index_expr = ops.to_dtype(offset_loader([]), torch.int32) + # Get the offset'd position + rand_index_expr = ops.add( + ops.index_expr(random_pos(index), torch.int32), offset_index_expr + ) + result = ops.rand( + seed_index_expr, + rand_index_expr, + ) + return ops.to_dtype(result, dtype) + + random_values_node = Pointwise.create( + device=device, + dtype=dtype, + inner_fn=inner_fn, + ranges=list(size), + ) + + offset_node = philox_rand_offset(size) + return random_values_node, offset_node + + +@register_lowering(aten.native_dropout, type_promotion_kind=None) +def native_dropout(x, p, train): + if config.fallback_random: + return pytree.tree_map( + TensorBox.create, + ir.FallbackKernel.create(aten.native_dropout.default, x, p, train), + ) + else: + raise AssertionError("should be handled in replace_random.py") + + +@register_lowering(aten.bernoulli_, type_promotion_kind=None) +def bernoulli_(x, *args): + assert config.fallback_random or x.get_device() == torch.device( + "cpu" + ), "this should be handled in decomps unless config.fallback_random or the device is CPU" + x.realize() + ir.InplaceBernoulliFallback(x, *args) + return x + + +@register_lowering(aten.bernoulli.p, type_promotion_kind=None) +def bernoulli_p(x, *args): + assert config.fallback_random or x.get_device() == torch.device( + "cpu" + ), "this should be handled in decomps unless config.fallback_random or the device is CPU" + return bernoulli_(clone(x), *args) + + +# This shouldn't be called in general +@register_lowering(aten._foobar) +def _foobar(_): + raise AssertionError() + + +@functools.lru_cache(1) +def _warn_triton_random(salt): + log.info("using triton random, expect difference from eager") + + +def warn_triton_random(): + # only warn once per graph + _warn_triton_random(V.graph.creation_time) + + +fallback_rand_default = fallback_handler(aten.rand.default) +fallback_rand_generator = fallback_handler(aten.rand.generator) +fallback_randn_default = fallback_handler(aten.randn.default) +fallback_randn_generator = fallback_handler(aten.randn.generator) +make_fallback(aten.randint) + + +@register_lowering(aten.rand) +def rand(*args, **kwargs): + if kwargs.get("generator", None) is not None: + return fallback_rand_generator(*args, **kwargs) + elif config.fallback_random: + kwargs.pop("generator", None) + return fallback_rand_default(*args, **kwargs) + raise AssertionError("should have been handled in replace_random.py") + + +@register_lowering(aten.randn) +def randn(*args, **kwargs): + if kwargs.get("generator", None) is not None: + return fallback_randn_generator(*args, **kwargs) + elif config.fallback_random: + kwargs.pop("generator", None) + return fallback_randn_default(*args, **kwargs) + raise AssertionError("should have been handled in replace_random.py") + + +@register_lowering(inductor_prims.force_stride_order, type_promotion_kind=None) +def inductor_force_stride_order(input_tensor, stride): + stride_order = ir.get_stride_order(stride) + return ir.ExternKernel.require_stride_order(input_tensor, stride_order) + + +@register_lowering(inductor_prims.seed, type_promotion_kind=None) +def inductor_seed(device: torch.device): + raise AssertionError("should be handled in fuse_seed_creation_pass()") + + +@register_lowering(inductor_prims.seeds, type_promotion_kind=None) +def inductor_seeds(count, device): + warn_triton_random() + return TensorBox.create(ir.RandomSeeds(count, 
decode_device(device))) + + +@register_lowering(inductor_prims.lookup_seed, type_promotion_kind=None) +def inductor_lookup_seed(seeds, index): + def inner_fn(_): + return ops.load_seed(seeds.get_name(), index) + + return Pointwise.create( + device=seeds.get_device(), + dtype=seeds.get_dtype(), + inner_fn=inner_fn, + ranges=[], + ) + + +@register_lowering(inductor_prims.random, type_promotion_kind=None) +def inductor_random(size: List[int], seed: TensorBox, mode: str, *, offset: int = 0): + assert not config.fallback_random + assert mode in ("rand", "randn") + size = [*size] + dtype = torch.float32 + device = seed.get_device() + random_pos = ir.FixedLayout( + device, dtype, size, ir.FlexibleLayout.contiguous_strides(size), offset=offset + ).make_indexer() + seed_loader = seed.make_loader() + + def inner_fn(index): + return getattr(ops, mode)( + seed_loader([]), + ops.index_expr(random_pos(index), torch.int32), + ) + + result = Pointwise.create( + device=device, + dtype=dtype, + inner_fn=inner_fn, + ranges=[*size], + ) + result.realize() + return result + + +@register_lowering(inductor_prims.randint, type_promotion_kind=None) +def inductor_randint( + low: int, high: int, size: List[int], seed: TensorBox, *, offset: int = 0 +): + assert not config.fallback_random + size = [*size] + dtype = torch.int64 + device = seed.get_device() + random_pos = ir.FixedLayout( + device, dtype, size, ir.FlexibleLayout.contiguous_strides(size), offset=offset + ).make_indexer() + seed_loader = seed.make_loader() + + def inner_fn(index): + return ops.randint64( + seed_loader([]), + ops.index_expr(random_pos(index), torch.int32), + low, + high, + ) + + return Pointwise.create( + device=device, + dtype=dtype, + inner_fn=inner_fn, + ranges=[*size], + ) + + +@register_lowering(aten.bucketize, type_promotion_kind=None) +def bucketize( + input: TensorBox, + boundaries: TensorBox, + *, + out_int32: bool = False, + right: bool = False, +): + assert len(boundaries.get_size()) == 1 + + if not (is_triton(input) and is_triton(boundaries)): + return fallback_handler(aten.bucketize.Tensor, add_to_fallback_set=False)( + input, boundaries, out_int32=out_int32, right=right + ) + + # The entire boundaries tensor needs to be used by ops.bucketize, so we + # need to realize it into global memory; or in other words, we can't + # guarantee that boundaries.get_name() (used below) will exist unless + # we call boundaries.realize(). 
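# Illustrative aside, not part of the vendored file: ops.bucketize mirrors eager
# torch.bucketize, which binary-searches each value in the 1-D sorted boundaries
# tensor and returns its insertion index (right controls tie handling):
import torch

boundaries = torch.tensor([1.0, 3.0, 5.0])
vals = torch.tensor([0.5, 3.0, 6.0])
assert torch.bucketize(vals, boundaries).tolist() == [0, 1, 3]
assert torch.bucketize(vals, boundaries, right=True).tolist() == [0, 2, 3]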
+ boundaries.realize() + boundaries_size = boundaries.get_size()[0] + boundaries_loader = boundaries.make_loader() + device = input.get_device() + input_loader = input.make_loader() + + index_dtype = torch.int32 if out_int32 else torch.int64 + + def inner_fn(index): + val = input_loader(index) + indices = ops.bucketize( + val, + boundaries.get_name(), + boundaries_size, + index_dtype, + right, + ) + + return indices + + return Pointwise.create( + device=device, + dtype=index_dtype, + inner_fn=inner_fn, + ranges=input.get_size(), + ) + + +def require_dense(_, *args, **kwargs): + args, kwargs = pytree.tree_map_only( + ir.IRNode, ir.ExternKernel.require_stride1, (args, kwargs) + ) + return args, kwargs + + +def require_contiguous(_, *args, **kwargs): + args, kwargs = pytree.tree_map_only( + ir.IRNode, ir.ExternKernel.require_contiguous, (args, kwargs) + ) + return args, kwargs + + +def require_channels_last(_, *args, **kwargs): + args, kwargs = pytree.tree_map_only( + ir.IRNode, ir.ExternKernel.require_channels_last, (args, kwargs) + ) + return args, kwargs + + +def constrain_to_fx_strides(fx_node, *args, **kwargs): + def apply_constraint(arg, fx_arg): + if isinstance(arg, ir.IRNode): + stride_order = ir.get_stride_order(fx_arg.meta["val"].stride()) + return ir.ExternKernel.require_stride_order(arg, stride_order) + return arg + + args = tuple( + apply_constraint(arg, fx_arg) for arg, fx_arg in zip(args, fx_node.args) + ) + kwargs = {k: apply_constraint(v, fx_node.kwargs[k]) for k, v in kwargs.items()} + return args, kwargs + + +# TODO(jansel): we should implement decomps or lowerings for these +# https://github.com/pytorch/torchdynamo/issues/327 +FALLBACK_ALLOW_LIST = { + "torchvision::roi_align", +} + + +def sdpa_constraint(fx_node, *args, **kwargs): + # sdpa requires dense last dimension] + + def apply_constraint(arg, fx_arg): + if not isinstance(arg, ir.IRNode): + return arg + + meta_val = fx_arg.meta["val"] + if not meta_val.is_cuda: + return arg + + stride_order = ir.get_stride_order(meta_val.stride()) + if stride_order and stride_order[-1] != 0: + # contiguous stride order + stride_order = list(reversed(range(len(arg.get_size())))) + + # This is the minimum alignment required by SDPA kernels for attention_bias. 
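# Illustrative aside, not part of the vendored file: the constraint below accepts a
# realized tensor only if its last dimension is dense (stride 1) and every leading
# stride is a multiple of ALIGNMENT. A sketch of that check on an eager tensor,
# using a hypothetical helper and assuming ALIGNMENT = 8 as set below:
import torch

def is_aligned_for_sdpa(t, alignment=8):
    *lead, last = t.stride()
    return last == 1 and all(s % alignment == 0 for s in lead)

x = torch.empty(2, 4, 32, 64)   # contiguous strides (8192, 2048, 64, 1)
assert is_aligned_for_sdpa(x)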
+ # This value can be found in pytorch/aten/src/ATen/native/transformers/attention.cpp preprocess_mask + ALIGNMENT = 8 + + assert isinstance(arg, TensorBox) + if len(arg.get_size()) not in (3, 4): + return arg + + def is_aligned_realized_tensor(x): + aligned_strides = all( + (V.graph.sizevars.size_hint(x.get_stride()[i]) % ALIGNMENT) == 0 + for i in range(len(x.get_stride()) - 1) + ) + return ( + V.graph.sizevars.size_hint(x.get_stride()[-1]) + ) == 1 and aligned_strides + + try: + arg.get_stride() + if is_aligned_realized_tensor(arg): + return arg + except AttributeError: + pass + + def is_aligned(x): + return (V.graph.sizevars.size_hint(x.get_size()[-1]) % ALIGNMENT) == 0 + + if isinstance(arg.data, ir.BaseView): + if not is_aligned(arg): + if is_aligned(arg.unwrap_view()): + return arg + + return ir.ExternKernel.require_stride_order(arg, stride_order) + + args = tuple( + apply_constraint(arg, fx_arg) for arg, fx_arg in zip(args, fx_node.args) + ) + kwargs = {k: apply_constraint(v, fx_node.kwargs[k]) for k, v in kwargs.items()} + return args, kwargs + + +# WIP +make_fallback(aten.index_reduce) # @pearu +make_fallback(aten._adaptive_avg_pool3d) # @isuruf +make_fallback(aten.adaptive_max_pool3d) # @isuruf +make_fallback(aten.avg_pool3d) # @isuruf +make_fallback(aten.fractional_max_pool3d) # @isuruf +make_fallback(aten.max_pool3d_with_indices) # @isuruf (can this one be implemented?) +make_fallback(aten.cummax) # @isuruf +make_fallback(aten.cummin) # @isuruf + + +# 1) Easy +make_fallback(aten.uniform, warn=False) +make_fallback(aten.exponential.default, warn=False) # (fails accuracy on test_torch.py) +make_fallback(aten._pdist_forward) # Has decomp. Needs benchmarks +make_fallback(aten.soft_margin_loss_backward, warn=False) # py_impl? +make_fallback(aten.searchsorted) # bucketized is implemented (see eager impl) + + +# 1.5) Easy or Impossible +make_fallback(aten._cdist_forward) # p=2 should be feasible +make_fallback(aten._cdist_backward) +# See resize_storage_bytes +make_fallback(aten.resize) +make_fallback(aten.resize_) +make_fallback(aten.resize_as) +make_fallback(aten.resize_as_) + + +# 2) Medium +make_fallback(aten.max_unpool2d) +make_fallback(aten.max_unpool3d) +make_fallback(aten._trilinear) + + +# 3) Difficult +# Scans +# See the discussion at +# https://dev-discuss.pytorch.org/t/pytorch-sparse-gnn-compiler-rfc/1644/19 +make_fallback(aten.segment_reduce.default) +make_fallback(aten._segment_reduce_backward.default) + +# Histogram (need to implement Histogram IR) +make_fallback(aten.histc) +make_fallback(aten.histogram.bin_ct) +make_fallback(aten._histogramdd_bin_edges.default) +make_fallback(aten._histogramdd_from_bin_cts.default) + +# Need templated kernel +make_fallback(aten.addbmm) +make_fallback(aten.addmv, warn=False) +make_fallback(aten._addmm_activation, warn=False) + +# Need templated kernel. 
Probably impossible to write efficiently +make_fallback(aten.convolution_backward, constrain_to_fx_strides) +make_fallback(aten._cudnn_rnn, require_dense) +make_fallback(aten._cudnn_rnn_backward, require_contiguous) + +# Haven't checked but sound difficult / impossible +make_fallback(aten._embedding_bag, require_contiguous) +make_fallback(aten._embedding_bag_forward_only, require_contiguous) +make_fallback(aten._embedding_bag_dense_backward) +make_fallback(aten._embedding_bag_per_sample_weights_backward) +make_fallback(aten._embedding_bag_per_sample_weights_backward) +make_fallback(aten._fused_moving_avg_obs_fq_helper) +make_fallback(aten._fused_moving_avg_obs_fq_helper_functional) + + +# 4) Backwards (try py_impl'ing them) when fwd is written as a decomp +make_fallback(aten.avg_pool3d_backward) +make_fallback(aten.max_pool3d_with_indices_backward) +make_fallback(aten._adaptive_avg_pool2d_backward, require_dense) +make_fallback(aten._adaptive_avg_pool3d_backward) +make_fallback(aten.adaptive_max_pool2d_backward) +make_fallback(aten.adaptive_max_pool3d_backward) +make_fallback(aten.fractional_max_pool2d_backward) +make_fallback(aten.fractional_max_pool3d_backward) +make_fallback(aten.replication_pad1d_backward) +make_fallback(aten.replication_pad2d_backward) +make_fallback(aten.upsample_linear1d_backward) +make_fallback(aten.upsample_bicubic2d_backward, require_contiguous) +make_fallback(aten.upsample_trilinear3d_backward) +make_fallback(aten.grid_sampler_2d_backward, require_dense) +make_fallback(aten._pdist_backward) + + +# 5) Impossible (missing triton/CPU features) + +# Sorting / Sorting-like +make_fallback(aten.sort) +make_fallback(aten.sort.stable) +make_fallback(aten.kthvalue) +make_fallback(aten.topk) +make_fallback(aten.mode) +make_fallback(aten.median) +make_fallback(aten.nanmedian) +make_fallback(aten.randperm) + +# Linalg +make_fallback(aten._linalg_det) +make_fallback(aten.linalg_householder_product) +make_fallback(aten.linalg_inv_ex) +make_fallback(aten.linalg_ldl_factor_ex) +make_fallback(aten.linalg_ldl_solve) +make_fallback(aten.linalg_lu) +make_fallback(aten.linalg_lu_factor_ex) +make_fallback(aten.linalg_lu_solve) +make_fallback(aten.linalg_matrix_exp) +make_fallback(aten.linalg_qr) +make_fallback(aten._linalg_slogdet) +make_fallback(aten._linalg_solve_ex) +make_fallback(aten.linalg_solve_triangular) +make_fallback(aten._linalg_svd) +make_fallback(aten.lu_unpack) +make_fallback(aten.ormqr) +make_fallback(aten._linalg_check_errors) +make_fallback(aten.linalg_pinv.atol_rtol_tensor) +make_fallback(aten._linalg_eigh) +make_fallback(aten.triangular_solve) +make_fallback(aten.linalg_cholesky_ex) +make_fallback(aten.cholesky_inverse) +make_fallback(aten.cholesky_solve) +make_fallback(aten.geqrf) +make_fallback(aten._fft_r2c) # needs complex as well + +# Data dependent (are these necessary?) +make_fallback(aten.nonzero.default) + +# Misc +make_fallback(aten.gcd.default, warn=False) +make_fallback(aten._thnn_fused_lstm_cell, require_dense) +make_fallback(torch._prims.rng_prims.run_and_save_rng_state) +make_fallback(torch._prims.rng_prims.run_with_rng_state) + +# Implmented / Half implemented +# Scans. 
Implemented for CUDA, missing CPU +make_fallback(aten.masked_scatter) +make_fallback(aten.masked_scatter_backward) + +# Complex number support +make_fallback(aten.view_as_complex, require_contiguous) +make_fallback(aten.angle) # needs complex + +# Needs efficentzerotensor +make_fallback(aten._efficientzerotensor) + +# Needs Sparse +make_fallback(aten._sparse_coo_tensor_with_dims_and_tensors) +make_fallback(aten.to_sparse) +make_fallback(aten._to_sparse) + +# Needs dimname support +make_fallback(aten.zeros.names) + + +# 6) Pattern-matched +make_fallback( + aten._scaled_dot_product_efficient_attention.default, + sdpa_constraint, + warn=False, +) +make_fallback( + aten._scaled_dot_product_efficient_attention_backward.default, + sdpa_constraint, + warn=False, +) +make_fallback( + aten._scaled_dot_product_flash_attention.default, + sdpa_constraint, + warn=False, +) +make_fallback( + aten._scaled_dot_product_flash_attention_backward.default, + sdpa_constraint, + warn=False, +) +make_fallback( + aten._scaled_dot_product_flash_attention_for_cpu.default, + sdpa_constraint, + warn=False, +) +make_fallback( + aten._scaled_dot_product_flash_attention_for_cpu_backward.default, + sdpa_constraint, + warn=False, +) +make_fallback(aten._flash_attention_forward.default, sdpa_constraint) +make_fallback(aten._flash_attention_backward.default, sdpa_constraint) +make_fallback(aten._efficient_attention_forward.default, sdpa_constraint) +make_fallback(aten._efficient_attention_backward.default, sdpa_constraint) +make_fallback(aten._scaled_mm.default, constrain_to_fx_strides) + + +# Register with type_promotion_kind None. +# For example, fp16.copy_(fp32) should **not** promote the first input's dtype. +@register_lowering(aten.copy, type_promotion_kind=None) +def copy(self, src, non_blocking=False): + x = src + if self.get_device() != src.get_device(): + x = to_device(x, self.get_device()) + if self.get_dtype() != src.get_dtype(): + x = to_dtype(x, self.get_dtype()) + + if self.get_size() != src.get_size(): + out = expand(x, self.get_size()) + return clone(out) + return clone(x) + + +@register_lowering(aten.clone) +def clone(x, *, memory_format=None): + # TODO(jansel): memory format + return Pointwise.create( + device=x.get_device(), + dtype=x.get_dtype(), + inner_fn=x.make_loader(), + ranges=list(x.get_size()), + ) + + +def clone_preserve_reinterpret_view(x): + reinterpret_view_layouts = [] + if isinstance(x, TensorBox) and isinstance(x.data, ir.ReinterpretView): + x = x.data # unwrap TensorBox + while isinstance(x, ir.ReinterpretView): + reinterpret_view_layouts.append(x.get_layout()) + x = x.data + x = TensorBox(x) + + x = clone(x) + + if reinterpret_view_layouts: + x = x.data # unwrap TensorBox + for layout in reinterpret_view_layouts[::-1]: + x = ir.ReinterpretView(x, layout) + x = TensorBox(x) + + return x + + +if hasattr(aten, "lift_fresh_copy"): + register_lowering(aten.lift_fresh_copy)(clone) + + +@register_lowering(prims.iota) +def iota( + length, + *, + start, + step, + dtype, + device, + requires_grad, +): + def fn(index): + return ops.index_expr(step * index[0] + start, dtype=dtype) + + return Pointwise.create( + device=decode_device(device), + dtype=dtype, + inner_fn=fn, + ranges=[length], + ) + + +@register_lowering(aten.select_scatter, type_promotion_kind=None) +def select_scatter(x, src, dim: int, index: int): + assert x.get_dtype() == src.get_dtype() + x_loader = x.make_loader() + dim = _validate_dim(x, dim, 0) + if V.graph.sizevars.evaluate_expr(sympy.Lt(index, 0)): + index = index + 
x.get_size()[dim] + V.graph.sizevars.guard_leq(0, index) # type: ignore[arg-type] + V.graph.sizevars.guard_lt(index, x.get_size()[dim]) # type: ignore[arg-type] + src = expand(unsqueeze(src, dim), x.get_size()) + src_loader = src.make_loader() + + def inner_fn(idx): + return ops.where( + ops.eq( + ops.index_expr(idx[dim], torch.int32), + ops.index_expr(index, torch.int32), + ), + src_loader(idx), + x_loader(idx), + ) + + return Pointwise.create( + device=x.get_device(), + dtype=x.get_dtype(), + inner_fn=inner_fn, + ranges=list(x.get_size()), + ) + + +@register_lowering(aten.slice_scatter, type_promotion_kind=None) +def slice_scatter(x, src, dim=0, start=None, end=None, step=1): + assert x.get_dtype() == src.get_dtype() + x_loader = x.make_loader() + dim = _validate_dim(x, dim, 0) + dim_size = x.get_size()[dim] + + start, end = ir.SliceView.normalize_start_end(x, dim, start, end) + + src_size = list(x.get_size()) + src_size[dim] = FloorDiv(end - start + (step - 1), step) + src = expand(src, src_size) + src_loader = src.make_loader() + + def inner_fn(idx): + if start == 0 and end == dim_size and step == 1: + # selecting every element is the same as just src.clone() + return src_loader(idx) + + idx_dim = ops.index_expr(idx[dim], torch.int64) + src_idx = list(idx) + src_idx[dim] = FloorDiv(idx[dim] - start, step) + + mask = [] + if start != 0: + mask.append( + ops.ge( + idx_dim, + ops.index_expr(sympy.expand(start), torch.int64), + ) + ) + if end != dim_size: + mask.append( + ops.lt( + idx_dim, + ops.index_expr(sympy.expand(end), torch.int64), + ) + ) + if step != 1: + mask.append( + ops.eq( + ops.index_expr( + ModularIndexing(idx[dim] - start, 1, step), torch.int64 + ), + ops.constant(0, torch.torch.int64), + ) + ) + assert mask + mask = functools.reduce(ops.and_, mask) + src_val = ops.masked( + mask, + lambda: src_loader(src_idx), + 0 if is_integer_type(x) else 0.0, + ) + return ops.where( + mask, + src_val, + x_loader(idx), + ) + + return Pointwise.create( + device=x.get_device(), + dtype=x.get_dtype(), + inner_fn=inner_fn, + ranges=list(x.get_size()), + ) + + +def _unwrap(x): + if isinstance(x, (list, tuple)) and len(x) > 0: + return _unwrap(x[0]) + return x + + +@register_lowering([torch.tensor, aten.scalar_tensor]) +def tensor(data, *, dtype=None, device=None, layout=None, pin_memory=False): + assert_nyi(layout in (None, torch.strided), f"layout={layout}") + assert_nyi(not pin_memory, "pin_memory") + if isinstance(_unwrap(data), int): + dtype = dtype or torch.int64 + else: + dtype = dtype or torch.get_default_dtype() + + ranges: List[sympy.Expr] = [] + + if isinstance(data, sympy.Expr): + + def inner_fn(index): + return ops.index_expr(data, dtype) + + elif isinstance(data, (float, int)): + + def inner_fn(index): + return ops.constant(data, dtype) + + elif len(data) == 0 or isinstance(data[0], (float, int)) and len(data) <= 8: + # inline small tensors + ranges.append(sympy.Integer(len(data))) + + def inner_fn(index): + def binary_search(start, end): + assert start < end + if end - start == 1: + return ops.constant(data[start], dtype) + mid = (end - start) // 2 + start + return ops.where( + ops.lt( + ops.index_expr(index[0], torch.int64), + ops.constant(mid, torch.int64), + ), + binary_search(start, mid), + binary_search(mid, end), + ) + + if len(data) == 0: + return ops.constant(0, dtype) + return binary_search(0, len(data)) + + else: + return V.graph.add_tensor_constant( + torch.tensor(data, dtype=dtype, device=device) + ) + + return Pointwise.create( + device=decode_device(device), + 
dtype=dtype, + inner_fn=inner_fn, + ranges=ranges, + ) + + +@register_lowering(torch.as_tensor) +def as_tensor(data, dtype=None, device=None): + if isinstance(data, TensorBox): + if dtype is not None: + data = to_dtype(data, dtype) + if device is not None: + data = to_device(data, device) + return data + return tensor(data, dtype=dtype, device=device) + + +@register_lowering(torch.LongTensor) +def long_tensor(data): + return tensor(data, dtype=torch.int64) + + +@register_lowering(aten._local_scalar_dense) +def _local_scalar_dense(data): + # This is interesting! Most lowerings return tensors, so you can just + # return the buffer you allocated and it will get used (or not used, if + # it's dead.) But _local_scalar_dense (aka item) returns an int, + # not a Tensor, so you would have a type mismatch if you return a buffer; + # we are obligated to return a sympy expression instead. However, + # we need to actually codegen the .item() call somehow. We do this + # by registering a faux buffer for the DynamicScalar IR node, which is + # solely responsible for generating this .item(). The buffer is + # not used for anything (notice we discard it); at codegen time, + # the "buffer" just gets assigned None. + sym = V.graph.current_node.meta["val"].node.expr + buffer = ir.DynamicScalar(sym, data) + buffer.name = V.graph.register_buffer(buffer) + return sym + + +@register_lowering(aten._assert_scalar) +def _assert_scalar(data, msg): + buffer = ir.AssertScalar(data, msg) + # This buffer isn't used by anyone (it returns None), so we must explicitly register it + buffer.name = V.graph.register_buffer(buffer) + return buffer + + +def _full(fill_value, device, dtype, size): + value = fill_value + if not isinstance(fill_value, (int, float)) and hasattr(value, "value"): + value = value.value + + if isinstance(value, (int, float)): + + def inner_fn(index): + return ops.constant(value, dtype) + + elif isinstance(value, sympy.Expr): + + def inner_fn(index): + return ops.index_expr(value, dtype) + + else: + assert len(value.get_size()) == 0 + value_loader = value.make_loader() + + def inner_fn(index): + return value_loader([]) + + return Pointwise.create( + device=device, + dtype=dtype, + inner_fn=inner_fn, + ranges=list(size), + ) + + +@register_lowering(aten.full_like, type_promotion_kind=None) +def full_like(x, fill_value, **kwargs): + return create_tensor_like(tensor_constructor(fill_value))(x, **kwargs) + + +def tensor_constructor(fill_value): + # torch.zeros, torch.ones, etc + def inner( + *size, + names=None, + dtype=None, + device=None, + layout=None, + pin_memory=False, + memory_format=None, + ): + assert_nyi(names is None, "named tensors") + assert_nyi(layout in (None, torch.strided), f"layout={layout}") + assert_nyi(not pin_memory, "pin_memory") + device = decode_device(device) + dtype = dtype or torch.get_default_dtype() + if len(size) == 1 and isinstance(size[0], (list, tuple, torch.Size)): + size = tuple(size[0]) + # See https://github.com/pytorch/pytorch/issues/118102 + # All sizes at lowering time should be sympy.Symbol, not SymInt! 
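# Illustrative aside, not part of the vendored file: small constant lists (at most 8
# scalars) are inlined by the tensor lowering above as a binary search of nested
# where(index < mid, ...) selects instead of materializing a constant buffer. The
# same selection tree written with eager ops over an index tensor:
import torch

data = [3.0, 1.0, 4.0, 1.0, 5.0]

def select(index, start, end):
    if end - start == 1:
        return torch.full_like(index, data[start], dtype=torch.float32)
    mid = (end - start) // 2 + start
    return torch.where(index < mid, select(index, start, mid), select(index, mid, end))

idx = torch.arange(len(data))
assert torch.equal(select(idx, 0, len(data)), torch.tensor(data))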
+ for s in size: + assert not isinstance(s, torch.SymInt) + size = [sympy.expand(s) for s in size] + return _full(fill_value, device, dtype, size) + + return inner + + +@register_lowering([torch.empty, aten.empty]) +def empty( + *size, + names=None, + dtype=None, + layout=None, + device=None, + pin_memory=None, + memory_format=None, +): + assert_nyi(names is None, "named tensors") + device = decode_device(device) + if len(size) == 1 and isinstance(size[0], (list, tuple, torch.Size)): + size = tuple(size[0]) + return empty_strided( + size, None, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory + ) + + +def create_tensor_like(creation_fn): + """ + Shim to convert X_like(...) into X(...). For example zeros_like() into zeros(). + """ + + def _constant_like( + x, *, dtype=None, device=None, layout=None, pin_memory=False, memory_format=None + ): + assert_nyi(not pin_memory, "pin_memory") + assert_nyi(layout in (None, torch.strided), f"layout={layout}") + if dtype is None: + dtype = x.get_dtype() + else: + dtype = decode_dtype(dtype) + device = device or x.get_device() + size = list(x.get_size()) + return creation_fn( + size, dtype=dtype, device=device, layout=layout, pin_memory=pin_memory + ) + + return _constant_like + + +def constant_like(fill_value): + return create_tensor_like(tensor_constructor(fill_value)) + + +empty_like = register_lowering(aten.empty_like)(create_tensor_like(empty)) +ones_like = create_tensor_like(tensor_constructor(1)) +zeros_like = create_tensor_like(tensor_constructor(0)) + + +def new_constant(fill_value): + def _new_constant( + x, size, *, dtype=None, layout=None, device=None, pin_memory=None + ): + assert isinstance(size, (list, tuple)) + assert_nyi(not pin_memory, "pin_memory") + assert_nyi(layout in (None, torch.strided), f"layout={layout}") + dtype = decode_dtype(dtype) or x.get_dtype() + device = device or x.get_device() + size = [sympy.Integer(s) for s in size] + return _full(fill_value, device, dtype, size) + + return _new_constant + + +@register_lowering(aten.new_empty) +def new_empty(x, size, *, dtype=None, layout=None, device=None, pin_memory=None): + if dtype is None: + dtype = x.get_dtype() + if device is None: + device = x.get_device() + return empty_strided( + size, None, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory + ) + + +@register_lowering(aten.empty_strided) +def empty_strided( + size, stride, *, dtype=None, layout=None, device=None, pin_memory=None +): + assert isinstance(size, (list, tuple)) + assert isinstance(stride, (list, tuple, type(None))) + assert_nyi(not pin_memory, "pin_memory") + assert_nyi(layout in (None, torch.strided), f"layout={layout}") + dtype = decode_dtype(dtype) or torch.get_default_dtype() + device = device or torch.tensor(0.0).device + pointwise = _full(fill_value=0, device=device, dtype=dtype, size=size) + pointwise.realize() + buffer = pointwise.data.data + # explicitly set ranges to zeros in order to make a NopKernelSchedulerNode + buffer.data.ranges = [0] * len(size) + assert isinstance(buffer, ir.ComputedBuffer) + size = [sympy.expand(s) for s in size] + stride = ( + [sympy.expand(s) for s in stride] + if stride + else ir.FlexibleLayout.contiguous_strides(size) + ) + buffer.layout = ir.FixedLayout( + device=device, + dtype=dtype, + size=size, + stride=stride, + ) + return pointwise + + +@register_lowering(aten.new_empty_strided) +def new_empty_strided( + x, size, stride, *, dtype=None, layout=None, device=None, pin_memory=None +): + if dtype is None: + dtype = x.get_dtype() + if 
device is None: + device = x.get_device() + return empty_strided( + size, stride, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory + ) + + +@register_lowering(prims.copy_strided.default) +def copy_strided(x, stride): + stride = [V.graph.sizevars.size_hint(s) for s in stride] + stride_order = sorted(range(len(stride)), key=stride.__getitem__) + return ir.ExternKernel.require_stride_order(x, stride_order) + + +@register_lowering([torch.full, aten.full]) +def full(size, fill_value, **kwargs): + assert kwargs.get("dtype") is not None, "dtype should be handled by decomposition" + return tensor_constructor(fill_value)(size, **kwargs) + + +@register_lowering(aten.gather, type_promotion_kind=None) +def gather(x, dim, index, sparse_grad=False): + # sparse_grad doesn't affect forward computation, + # and backward tracing is taken care of by AOT Autograd + assert isinstance(x, TensorBox) + assert index.get_dtype() == torch.int64 + size = x.get_size() + offset = len(size) == 0 + dim = _validate_dim(x, dim, offset) + + x_loader = x.make_loader() + index_loader = index.make_loader() + + def fn(idx): + idx = list(idx) + if len(idx) != 0: + idx[dim] = ops.indirect_indexing(index_loader(idx), size[dim]) + return x_loader(idx) + + return Pointwise.create( + device=x.get_device(), + dtype=x.get_dtype(), + inner_fn=fn, + ranges=index.get_size(), + ) + + +@register_lowering(aten.embedding, type_promotion_kind=None) +def embedding(weight, indices, padding_idx=-1, scale_grad_by_freq=False, sparse=False): + assert not sparse + assert isinstance(weight, TensorBox) + assert isinstance(indices, TensorBox) + assert "int" in str(indices.get_dtype()) + + weight_loader = weight.make_loader() + indices_loader = indices.make_loader() + indices_ndim = len(indices.get_size()) + weight_size = weight.get_size() + new_size = [*indices.get_size(), *weight_size[1:]] + + def fn(idx): + assert len(idx) == len(new_size), f"{idx} != {new_size}" + var_index = indices_loader(idx[:indices_ndim]) + weight_idx = [ops.indirect_indexing(var_index, weight_size[0])] + [ + *idx[indices_ndim:] + ] + return weight_loader(weight_idx) + + return Pointwise.create( + device=weight.get_device(), + dtype=weight.get_dtype(), + inner_fn=fn, + ranges=new_size, + ) + + +def check_and_broadcast_indices(indices, device): + assert all( + i.get_dtype() in (torch.int64, torch.int32, torch.bool, torch.uint8) + for i in indices + if i is not None + ), f"indices must be int64, byte or bool. Got {[i.get_dtype() for i in indices if i is not None]}" + if any( + i.get_dtype() in (torch.bool, torch.uint8) for i in indices if i is not None + ): + raise NotImplementedError("Fallback for bool indices") + + valid_idxs = [i for i, x in enumerate(indices) if isinstance(x, TensorBox)] + assert len(valid_idxs) > 0, "requires at least 1 non-None index" + new_indices = [None] * len(indices) + for i, x in zip(valid_idxs, broadcast_tensors(*[indices[i] for i in valid_idxs])): + # Eager allows indices to be CPU tensor when running on CUDA + # FIXME: Calling to_device(x, device) should work but + # test_advancedindex_mixed_cpu_devices still fails + if x.get_device() != device: + raise NotImplementedError("Fallback when indices is on a different device") + new_indices[i] = x + return new_indices, valid_idxs + + +def index_output_size_and_inner_fn( + x_size, + indices, + tensor_indices, + tensor_size, + indices_loaders, + indexed_size, + x_loader, + check, +): + # Note that behavior of indexing differs when there are non consecutive + # tensors. 
In this case, the tensor index is pulled to the beginning. + # + # Suppose a = torch.arange(3 * 4 * 5 * 6 * 7).view(3, 4, 5, 6, 7) + # x = torch.tensor[1,2] + # Then, a[:,x,:,x,:] will have shape 2,3,5,7 as due to x,:,x then 2 will + # be pulled to the front. + non_consecutive_tensors = False + for previous, current in zip(tensor_indices, tensor_indices[1:]): + if current - previous != 1: + non_consecutive_tensors = True + + output_size = [x_size[i] for i, val in enumerate(indices) if val is None] + output_size = [*output_size, *x_size[len(output_size) + len(tensor_indices) :]] + + first_tensor_index = tensor_indices[0] + if non_consecutive_tensors: + output_size = tensor_size + output_size + else: + output_size = ( + output_size[:first_tensor_index] + + tensor_size + + output_size[first_tensor_index:] + ) + + def fn(idx): + assert len(idx) == len(output_size) + assert len(indices_loaders) == len(indexed_size) + + rank = len(tensor_size) + new_index = [] + first_tensor_index = tensor_indices[0] + start_offset = 0 if non_consecutive_tensors else first_tensor_index + next_idx = 0 + for i in range(tensor_indices[-1] + 1): + if i == start_offset: + next_idx += rank + if indices[i] is None: + assert next_idx < len(idx) + new_index.append(idx[next_idx]) + next_idx += 1 + else: + loader = indices_loaders[i] + assert loader is not None + size = indexed_size[i] + new_index.append( + ops.indirect_indexing( + loader(idx[start_offset : start_offset + rank]), + size, + check=check, + ) + ) + new_index = [ + *new_index, + *idx[next_idx:], + ] + return new_index if x_loader is None else x_loader(new_index) + + return output_size, fn + + +def index_impl(x, indices, check): + assert isinstance(indices, (list, tuple)) + x_loader = x.make_loader() + indices, tensor_indices = check_and_broadcast_indices(indices, x.get_device()) + assert len(tensor_indices) > 0, "Must have at least one valid idx" + + indices_loaders = [i.make_loader() if i is not None else None for i in indices] + # no guards on output size, all the guards are set in broadcast_tensors + + # We can use the first one since they are all required to be the same size + tensor_size = list(indices[tensor_indices[0]].get_size()) + + x_size = x.get_size() + + indexed_size = [x_size[i] for i in range(len(indices)) if indices[i] is not None] + if 0 in indexed_size and 0 not in tensor_size: + raise IndexError("index is out of bounds for dimension with size 0") + + indexed_size = [x_size[i] for i in range(len(indices))] + output_size, inner_fn = index_output_size_and_inner_fn( + x_size, + indices, + tensor_indices, + tensor_size, + indices_loaders, + indexed_size, + x_loader, + check=check, + ) + + return Pointwise.create( + device=x.get_device(), + dtype=x.get_dtype(), + inner_fn=inner_fn, + ranges=output_size, + ) + + +@register_lowering(aten.index, type_promotion_kind=None) +def index(x, indices): + try: + return index_impl(x, indices, check=True) + except NotImplementedError: + # Fallback to ATen for boolean indexing + x.realize() + return fallback_handler(aten.index.Tensor, add_to_fallback_set=False)( + x, indices + ) + + +@register_lowering(aten._unsafe_index, type_promotion_kind=None) +def _unsafe_index(x, indices): + return index_impl(x, indices, check=False) + + +# All the indexing decompositions are written in terms of index, index_put, and index_put_ +# We cannot have this lowering as a decomposition as it introduces +# mutation in the graph, which is bad for Aot Autograd. 
Aot Autograd runs dead +# code elimination and common subexpression elimination optimizations, which +# assume graphs to be side-effect free. More details at +# https://github.com/pytorch/torchdynamo/issues/1235 +# and +# https://github.com/pytorch/torchdynamo/issues/1863 +@register_lowering(aten.index_put) +def index_put(x, indices, values, accumulate=False): + return index_put_(clone(x), indices, values, accumulate) + + +@register_lowering(aten._unsafe_index_put) +def _unsafe_index_put(x, indices, values, accumulate=False): + return index_put_impl_(clone(x), indices, values, accumulate, check=False) + + +def index_put_as_masked_fill(self, indices, value, accumulate): + if value.get_device() != self.get_device(): + value = to_device(value, self.get_device()) + if accumulate: + value = add(self, value) + return mutate_to(self, where(indices[0], value, self)) + + +def index_put_fallback(self, indices, values, accumulate): + deterministic = torch.are_deterministic_algorithms_enabled() + if is_triton(values) and (accumulate or deterministic): + msg = ( + "index put with accumulate." + if not deterministic + else "deterministic index put." + ) + if stack_trace := V.graph.current_node.meta.get("stack_trace", None): + msg = f"{msg} Found from : \n {stack_trace}" + V.graph.disable_cudagraphs_reason = msg + + ir.IndexPutFallback(V.graph.current_node.target, self, indices, values, accumulate) + return self + + +@register_lowering(aten.index_put_, type_promotion_kind=None) +def index_put_(self, indices, values, accumulate=False): + return index_put_impl_(self, indices, values, accumulate, check=True) + + +@register_lowering(inductor_prims._unsafe_index_put_, type_promotion_kind=None) +def _unsafe_index_put_(self, indices, values, accumulate=False): + return index_put_impl_(self, indices, values, accumulate, check=False) + + +def needs_fallback_due_to_atomic_add_limitations(dtype): + # tl.atomic_add does NOT support the following types + return dtype in {torch.int64, torch.bool, torch.bfloat16} + + +def index_put_impl_(self, indices, values, accumulate, check): + # Dispatch to masked fill for single boolean index with single value + if ( + values.get_numel() == 1 + and len(indices) == 1 + and indices[0].get_dtype() in {torch.bool, torch.uint8} + ): + mask = indices[0] + for _ in range(len(mask.get_size()), len(self.get_size())): + mask = unsqueeze(mask, -1) + return index_put_as_masked_fill(self, [mask], values, accumulate) + + # Fallback in torch deterministic mode + if torch.are_deterministic_algorithms_enabled(): + return index_put_fallback(self, indices, values, accumulate) + + # Fallback if there is a boolean index + for index in indices: + if index is not None and index.get_dtype() in {torch.bool, torch.uint8}: + return index_put_fallback(self, indices, values, accumulate) + + x_size = self.get_size() + x_ndim = len(x_size) + + if accumulate and needs_fallback_due_to_atomic_add_limitations(self.get_dtype()): + # self is an scalar Tensor + if x_ndim == 0: + self = view(self, [1]) + self = index_put_fallback(self, indices, values, accumulate) + if x_ndim == 0: + self = view(self, []) + return self + + values = to_dtype(values, self.get_dtype()) + + try: + # Note that code will only get here when dtype is uint32 + indices, tensor_indices = check_and_broadcast_indices( + indices, self.get_device() + ) + except NotImplementedError: + return index_put_fallback(self, indices, values, accumulate) + + indices_loaders = [i.make_loader() if i is not None else None for i in indices] + + assert 
isinstance(self, TensorBox) + self.realize() + + # self is an scalar Tensor + if x_ndim == 0: + self = view(self, [1]) + + # We can use the first one since they are all required to be the same size + tensor_size = list(indices[tensor_indices[0]].get_size()) + indexed_size = [x_size[i] for i in range(len(indices))] + + expected_vals_size, inner_fn = index_output_size_and_inner_fn( + x_size, + indices, + tensor_indices, + tensor_size, + indices_loaders, + indexed_size, + None, + check=check, + ) + + values = expand(values, expected_vals_size) + # all guards are set above during broadcast_tensors and expand + + scatter = ir.Scatter( + device=self.get_device(), + dtype=self.get_dtype(), + inner_fn=values.make_loader(), + ranges=expected_vals_size, # iter_ranges, + output_indexer=inner_fn, + scatter_mode="atomic_add" if accumulate else None, + ) + buffer = ir.ComputedBuffer( + None, + ir.MutationLayout(self), + scatter, + ) + buffer.name = V.graph.register_buffer(buffer) + + if x_ndim == 0: + self = view(self, []) + return self + + +@register_lowering( + inductor_prims.masked_scatter_with_index, type_promotion_kind=None, broadcast=False +) +def masked_scatter_with_index(self, mask, source_idx, source): + self_flat, mask_flat, source_flat = (view(x, (-1,)) for x in (self, mask, source)) + + assert self.get_size() == mask.get_size() + assert mask.get_dtype() in {torch.bool, torch.uint8} + + self_loader = self_flat.make_loader() + mask_loader = mask_flat.make_loader() + source_idx_loader = source_idx.make_loader() + source_loader = source_flat.make_loader() + source_numel = source.get_numel() + + def inner_fn(idx): + self_val = self_loader(idx) + mask_val = ops.to_dtype(mask_loader(idx), torch.bool) + + def load_source_val(): + source_idx_val = source_idx_loader(idx) + i = ops.indirect_indexing(source_idx_val, source_numel) + return source_loader([i]) + + source_val = ops.masked(mask_val, load_source_val, 0) + return ops.where(mask_val, source_val, self_val) + + result_flat = Pointwise.create( + device=self.get_device(), + dtype=self.get_dtype(), + inner_fn=inner_fn, + ranges=self_flat.get_size(), + ) + return view(result_flat, self.get_size()) + + +@register_lowering(aten.as_strided_scatter, type_promotion_kind=None) +def as_strided_scatter(self, src, size, stride, storage_offset=None): + output = clone(self) + output_view = as_strided(output, size, stride, storage_offset) + copy_(output_view, src) + return output + + +@register_lowering(aten.scatter, type_promotion_kind=None) +def scatter(x, dim: int, index, src, **kwargs): + return scatter_(clone(x), dim, index, src, **kwargs) + + +def scatter_fallback( + fn, + self, + dim: int, + index, + src, + *, + reduce: Optional[str] = None, + include_self: bool = True, +): + reduce_ty = "add" if fn == "aten.scatter_" else "sum" + if ( + reduce not in {None, reduce_ty} + or ( + isinstance(src, TensorBox) + and src.get_device().type == torch.device("cuda").type + and needs_fallback_due_to_atomic_add_limitations(src.get_dtype()) + ) + or ( + fn == "aten.scatter_reduce_" + and reduce == "sum" + and isinstance(src, TensorBox) + and src.get_device() == torch.device("cpu") + and config.cpp.fallback_scatter_reduce_sum + and (config.cpp.dynamic_threads or parallel_num_threads() != 1) + ) + or (reduce == reduce_ty and self.get_dtype() in {torch.bool, torch.int64}) + or torch.are_deterministic_algorithms_enabled() + ): + ir.ScatterFallback( + V.graph.current_node.target, + fn, + self, + dim, + index, + src, + reduce=reduce, + include_self=include_self, + ) + return 
self + + return None + + +@register_lowering(aten.scatter_, type_promotion_kind=None) +def scatter_(self, dim: int, index, src, *, reduce: Optional[str] = None): + assert reduce in {None, "add", "multiply"} + + fallback_result = scatter_fallback( + "aten.scatter_", self, dim, index, src, reduce=reduce + ) + + if fallback_result: + return fallback_result + + if reduce == "add": + reduce = "sum" + elif reduce == "multiply": + reduce = "prod" + + return scatter_reduce_(self, dim, index, src, reduce) + + +@register_lowering(aten.scatter_add, type_promotion_kind=None) +def scatter_add(x, dim: int, index, src): + return scatter_add_(clone(x), dim, index, src) + + +@register_lowering(aten.scatter_add_, type_promotion_kind=None) +def scatter_add_(x, dim: int, index, src): + return scatter_reduce_(x, dim, index, src, "sum") + + +@register_lowering(aten.scatter_reduce, type_promotion_kind=None) +def scatter_reduce(x, dim: int, index, src, reduction_type, **kwargs): + return scatter_reduce_(clone(x), dim, index, src, reduction_type, **kwargs) + + +@register_lowering(aten.scatter_reduce_, type_promotion_kind=None) +def scatter_reduce_(self, dim: int, index, src, reduce, *, include_self: bool = True): + assert reduce in {None, "sum", "prod", "mean", "amax", "amin"} + + fallback_result = scatter_fallback( + "aten.scatter_reduce_", + self, + dim, + index, + src, + reduce=reduce, + include_self=include_self, + ) + + if fallback_result: + return fallback_result + + assert isinstance(self, TensorBox) + assert "int" in str(index.get_dtype()) + + ndim = len(self.get_size()) + if ndim == 0: + self = view(self, [1]) + + if isinstance(src, TensorBox) and len(src.get_size()) == 0: + src = view(src, [1]) + + if isinstance(index, TensorBox) and len(index.get_size()) == 0: + index = view(index, [1]) + + dim = _validate_dim(self, dim) + + self.realize() + index_loader = index.make_loader() + src_loader = src.make_loader() if isinstance(src, TensorBox) else None + + def output_indexer(idx): + # self is captured from the end of the function, so it may have 0 dim + shape = self.get_size() + ndim = len(shape) + indirect_idx = list(idx) + indirect_idx[dim] = ops.indirect_indexing( + index_loader(idx), 1 if ndim == 0 else shape[dim] + ) + return indirect_idx + + def fn(idx): + if src_loader: + return src_loader(idx) + else: + # src is a scalar + return ops.constant(src, self.get_dtype()) + + def backend_reduce_str(reduce): + if reduce == "sum": + return "atomic_add" + else: + # TODO: Need to support more reduction type + assert reduce is None + return None + + if not include_self: + # zero out the corresponding elements first + zero_out = ir.Scatter( + device=self.get_device(), + dtype=self.get_dtype(), + inner_fn=lambda index: ops.constant(0, self.get_dtype()), + ranges=index.get_size(), + output_indexer=output_indexer, + scatter_mode=None, + ) + buffer = ir.ComputedBuffer( + None, + ir.MutationLayout(self), + zero_out, + ) + buffer.name = V.graph.register_buffer(buffer) + + # self[index[i][j][k]][j][k] += src[i][j][k] # if dim == 0 + # self[i][index[i][j][k]][k] += src[i][j][k] # if dim == 1 + # self[i][j][index[i][j][k]] += src[i][j][k] # if dim == 2 + scatter = ir.Scatter( + device=self.get_device(), + dtype=self.get_dtype(), + inner_fn=fn, + ranges=index.get_size(), + output_indexer=output_indexer, + scatter_mode=backend_reduce_str(reduce), + ) + buffer = ir.ComputedBuffer( + None, + ir.MutationLayout(self), + scatter, + ) + buffer.name = V.graph.register_buffer(buffer) + + if ndim == 0: + self = view(self, []) + 
return self + + +def upsample_nearestnd( + x, + output_size, + scales_x: Tuple[Optional[float], ...], + n: int = 2, + exact: bool = False, +): + x.realize_hint() # elements are reused + x_loader = x.make_loader() + i_sizes = x.get_size()[-n:] + batch = x.get_size()[:-n] + i_sizes = [V.graph.sizevars.evaluate_static_shape(i) for i in i_sizes] + + assert len(scales_x) == n + o_sizes = output_size + + inv_scales = [i / o for i, o in zip(i_sizes, o_sizes)] + for i, scale in enumerate(scales_x): + if scale is not None: + inv_scales[i] = 1.0 / scale + + def scale_fn(x, scale, size): + # Nearest Exact: input_index = round(scale * (output_index + 0.5) - 0.5) + # = floor(scale * (output_index + 0.5)) + # Nearest: input_index = floor(scale * output_index) + x = ops.index_expr(x, torch.float32) + if exact: + x = ops.add(x, ops.constant(0.5, torch.float32)) + x = ops.mul(x, ops.constant(scale, torch.float32)) + x = ops.to_dtype(x, torch.int32) + return ops.indirect_indexing(x, size, check=False) + + def fn(idx): + x = idx[-n:] + b = idx[:-n] + return x_loader( + [*b, *[scale_fn(i, s, size) for i, s, size in zip(x, inv_scales, i_sizes)]] + ) + + return Pointwise.create( + device=x.get_device(), + dtype=x.get_dtype(), + inner_fn=fn, + ranges=[*batch, *o_sizes], + ) + + +@register_lowering(aten.upsample_nearest1d.default) +def upsample_nearest1d(x, output_size, scales: Optional[float] = None): + return upsample_nearestnd(x, output_size, (scales,), n=1) + + +@register_lowering(aten._upsample_nearest_exact1d.default) +def _upsample_nearest_exact1d(x, output_size, scales: Optional[float] = None): + return upsample_nearestnd(x, output_size, (scales,), n=1, exact=True) + + +@register_lowering(aten.upsample_nearest2d.default) +def upsample_nearest2d( + x, output_size, scales_h: Optional[float] = None, scales_w: Optional[float] = None +): + return upsample_nearestnd(x, output_size, (scales_h, scales_w), n=2) + + +@register_lowering(aten._upsample_nearest_exact2d.default) +def _upsample_nearest_exact2d( + x, output_size, scales_h: Optional[float] = None, scales_w: Optional[float] = None +): + return upsample_nearestnd(x, output_size, (scales_h, scales_w), n=2, exact=True) + + +@register_lowering(aten.upsample_nearest3d.default) +def upsample_nearest3d( + x, + output_size, + scales_d: Optional[float] = None, + scales_h: Optional[float] = None, + scales_w: Optional[float] = None, +): + return upsample_nearestnd(x, output_size, (scales_d, scales_h, scales_w), n=3) + + +@register_lowering(aten._upsample_nearest_exact3d.default) +def _upsample_nearest_exact3d( + x, + output_size, + scales_d: Optional[float] = None, + scales_h: Optional[float] = None, + scales_w: Optional[float] = None, +): + return upsample_nearestnd( + x, output_size, (scales_d, scales_h, scales_w), n=3, exact=True + ) + + +def _create_constants(*args, dtype): + return tuple(ops.constant(a, dtype) for a in args) + + +@register_lowering(aten.upsample_bicubic2d.default) +def upsample_bicubic2d_default( + x, + output_size, + align_corners: bool, + scales_h: Optional[float] = None, + scales_w: Optional[float] = None, +): + x.realize_hint() + x_loader = x.make_loader() + + N, C, iH, iW = x.get_size() + oH, oW = output_size + + iH = V.graph.sizevars.evaluate_static_shape(iH) + iW = V.graph.sizevars.evaluate_static_shape(iW) + + def get_int_dtype(maxval): + if maxval > torch.iinfo(torch.int32).max: + return torch.int64 + return torch.int32 + + def compute_scale(in_size, out_size, align_corners, scale=None): + if align_corners: + return (in_size - 1) / 
(out_size - 1) if out_size > 1 else 0 + else: + return 1 / scale if scale is not None and scale > 0 else in_size / out_size + + def compute_source_index(scale, dst_index, align_corners): + dst_index_ie = ops.index_expr(dst_index, torch.float32) + scale = ops.constant(scale, torch.float32) + if align_corners: + return ops.mul(scale, dst_index_ie) + else: + half = ops.constant(0.5, torch.float32) + return scale * (dst_index_ie + half) - half + + def cubic_convolution1(x, A): + _Ap2, _Ap3, _1 = _create_constants(A + 2, A + 3, 1, dtype=torch.float32) + return (_Ap2 * x - _Ap3) * x * x + _1 + + def cubic_convolution2(x, A): + _A, _4A, _5A, _8A = _create_constants( + A, 4 * A, 5 * A, 8 * A, dtype=torch.float32 + ) + return ((_A * x - _5A) * x + _8A) * x - _4A + + def get_cubic_upsample_coefficients(t): + A = -0.75 + _1 = ops.constant(1.0, torch.float32) + c0 = cubic_convolution2(ops.add(t, _1), A) + c1 = cubic_convolution1(t, A) + + x2 = ops.sub(_1, t) + c2 = cubic_convolution1(x2, A) + c3 = cubic_convolution2(ops.add(x2, _1), A) + return (c0, c1, c2, c3) + + def cubic_interp1d(xs, t): + cs = get_cubic_upsample_coefficients(t) + # dot product between xs and cs + return xs[0] * cs[0] + xs[1] * cs[1] + xs[2] * cs[2] + xs[3] * cs[3] + + height_scale = compute_scale(iH, oH, align_corners, scales_h) + width_scale = compute_scale(iW, oW, align_corners, scales_h) + + def clamp(v, min, max): + return ops.maximum(min, ops.minimum(max, v)) + + def fn(idx): + n, c, oy, ox = idx + + real_x = compute_source_index(width_scale, ox, align_corners) + in_x = ops.floor(real_x) + t_x = ops.sub(real_x, in_x) + + real_y = compute_source_index(height_scale, oy, align_corners) + in_y = ops.floor(real_y) + t_y = ops.sub(real_y, in_y) + + def load_bounded(fy, fx): + # TODO(Lezcano) Here we may not need to set-up a device_size + _0 = ops.constant(0, torch.int32) + iHm1 = ops.constant(iH - 1, torch.int32) + iWm1 = ops.constant(iW - 1, torch.int32) + iy = ops.indirect_indexing(clamp(fy, _0, iHm1), iH, check=False) + ix = ops.indirect_indexing(clamp(fx, _0, iWm1), iW, check=False) + return x_loader([n, c, iy, ix]) + + iy = ops.to_dtype(in_y, get_int_dtype(iH + 1)) + ix = ops.to_dtype(in_x, get_int_dtype(iW + 1)) + iys_ofs = tuple(ops.add(iy, ofs) for ofs in (-1, 0, 1, 2)) + ixs_ofs = tuple(ops.add(ix, ofs) for ofs in (-1, 0, 1, 2)) + + def get_x_interp(y): + coeffs_x = tuple(load_bounded(y, x) for x in ixs_ofs) + return cubic_interp1d(coeffs_x, t_x) + + coeffs_y = tuple(get_x_interp(y) for y in iys_ofs) + return cubic_interp1d(coeffs_y, t_y) + + return Pointwise.create( + device=x.get_device(), + dtype=x.get_dtype(), + inner_fn=fn, + ranges=[N, C, sympy.Integer(oH), sympy.Integer(oW)], + ) + + +@register_lowering(aten.reflection_pad1d_backward) +@register_lowering(aten.reflection_pad2d_backward) +@register_lowering(aten.reflection_pad3d_backward) +def _reflection_padnd_backward(grad_output, x, padding): + dim = len(padding) // 2 + + dhw = [h - 1 for h in x.get_size()[-dim:]] + grad_loader = grad_output.make_loader() + + padding_left = [padding[2 * (dim - 1 - i)] for i in range(dim)] + padding_right = [padding[2 * (dim - 1 - i) + 1] for i in range(dim)] + + def fn(idx): + b = idx[:-dim] + xyz = idx[-dim:] + + def load_from_output(x): + return grad_loader([*b, *x]) + + def index_range_condition(index_range): + i, lb, ub = index_range + i = ops.index_expr(i, torch.int32) + lb = ops.index_expr(lb, torch.int64) + ub = ops.index_expr(ub, torch.int64) + return ops.and_(ops.ge(i, lb), ops.le(i, ub)) + + # Areas after reflection: + # 
+ # top-left | top | top-right + # ----------------------------------------- + # left | center | right + # ----------------------------------------- + # bottom-left | bottom | bottom-right + # + # The center area is the original matrix. Other areas are reflections. + + center = [xyz[i] + padding_left[i] for i in range(dim)] + left_reflect = [padding_left[i] - xyz[i] for i in range(dim)] + right_reflect = [2 * dhw[i] + padding_left[i] - xyz[i] for i in range(dim)] + + # Accumulate gradients from different areas + # If some of the padding is negative, center load is not always valid + range_c = [ + (center[i], 0, dhw[i] + padding_left[i] + padding_right[i]) + for i in range(dim) + ] + cond = functools.reduce( + ops.and_, [index_range_condition(range_c[i]) for i in range(dim)] + ) + grad = ops.masked(cond, lambda: load_from_output(center), 0.0) + + def accumulate(grad, out, index_ranges): + # If the upper bound is less than the lower bound, we can get rid of one accumulation. + # This happens when the padding size is zero. + for i in range(dim): + upper_less_than_lower = index_ranges[i][2] < index_ranges[i][1] + if isinstance(upper_less_than_lower, bool) and upper_less_than_lower: + return grad + cond = functools.reduce( + ops.and_, + [index_range_condition(index_range) for index_range in index_ranges], + ) + g = ops.masked(cond, lambda: load_from_output(out), 0.0) + return ops.add(grad, g) + + for area in itertools.product(*[[-1, 0, 1] for _ in range(dim)]): + if area == tuple([0] * dim): + # center, this is already done. + continue + + outs = [] + index_ranges = [] + + for i in range(dim): + if area[i] == 0: + out = center[i] + index_range = range_c[i] + elif area[i] == -1: + out = left_reflect[i] + index_range = (xyz[i], 1, padding_left[i]) + elif area[i] == 1: + out = right_reflect[i] + index_range = (xyz[i], dhw[i] - padding_right[i], dhw[i] - 1) + + outs.append(out) # type: ignore[possibly-undefined] + index_ranges.append(index_range) # type: ignore[possibly-undefined] + + grad = accumulate(grad, outs, index_ranges) + + return grad + + return Pointwise.create( + device=grad_output.get_device(), + dtype=grad_output.get_dtype(), + inner_fn=fn, + ranges=list(x.get_size()), + ) + + +@register_lowering(prims.rev.default) +def rev(x, dims): + # note - dims pre-canonicalized + x_loader = x.make_loader() + sizes = x.get_size() + + def loader(idx): + idx = list(idx) + assert len(idx) == len(sizes) + for dim in dims: + idx[dim] = (sizes[dim] - 1) - idx[dim] + + return x_loader(idx) + + return Pointwise.create( + device=x.get_device(), + dtype=x.get_dtype(), + inner_fn=loader, + ranges=sizes, + ) + + +@register_lowering(aten.constant_pad_nd, type_promotion_kind=None) +def constant_pad_nd(x, padding, fill_value=0): + assert (len(padding) % 2) == 0 + if all(p == 0 for p in padding): + return clone(x) + + sizes = x.get_size() + + bounds = list(reversed(list(zip(padding[::2], padding[1::2])))) + n = len(sizes) - len(bounds) + + # if padding is a complicated expression, hoist it + bounds_precomp: List[Tuple[sympy.Symbol, Any]] = [] + for l, h in bounds: + bounds_precomp.append((V.graph.sizevars.lookup_precomputed_size(l), h)) # type: ignore[arg-type] + + output_size = list(sizes[:n]) + mask_sizes = [] + for (low, high), size in zip(bounds, sizes[n:]): + mask_sizes.append(size) + output_size.append(sympy.expand(size + low + high)) + assert len(output_size) == len(sizes) + fill_value = dtype_to_type(x.get_dtype())(fill_value) + + def mask(index): + mask = [] + for idx, (low, high), length in 
zip(index[n:], bounds, mask_sizes): + if low != 0: + mask.append(range_mask_low(idx, 0)) + if high != 0: + mask.append(range_mask_high(idx, length)) + mask = functools.reduce(ops.and_, mask) + return ops.masked(mask, lambda: x_loader(index), fill_value) + + def offset_fn(index): + new_index = list(index[:n]) + for idx, (low, high) in zip(index[n:], bounds_precomp): + new_index.append(idx - low) + assert len(new_index) == len(index) + return mask(new_index) + + x_loader = x.make_loader() + return Pointwise.create( + device=x.get_device(), + dtype=x.get_dtype(), + inner_fn=offset_fn, + ranges=output_size, + ) + + +def range_mask_low(i: sympy.Expr, low: Union[sympy.Expr, int]): + return ops.ge( + ops.index_expr(i, torch.int64), + ops.index_expr(sympy.Integer(low), torch.int64), + ) + + +def range_mask_high(i: sympy.Expr, high: sympy.Expr): + return ops.lt( + ops.index_expr(i, torch.int64), + ops.index_expr(high, torch.int64), + ) + + +def range_mask(i: sympy.Expr, high: sympy.Expr, low: sympy.Expr): + return ops.and_( + range_mask_low(i, low), + range_mask_high(i, high), + ) + + +def constant_boundary_condition_2d(x, fill_value, padding=None, pad_fill_value=1.0): + *_, h, w = x.get_size() + x_loader = x.make_loader() + padding_h = padding[0] if padding else 0 + padding_w = padding[1] if padding else 0 + + def load(index): + *prefix, ih, iw = index + + mask = ops.and_( + range_mask(ih, h + padding_h, -padding_h), + range_mask(iw, w + padding_w, -padding_w), + ) + return ( + ops.masked( + mask, + lambda: constant_boundary_condition_2d(x, pad_fill_value)( + [*prefix, ih, iw] + ), + fill_value, + ) + if padding + else ops.masked(mask, lambda: x_loader([*prefix, ih, iw]), fill_value) + ) + + return load + + +def pooling_size(x, i, kernel_size, stride, padding, ceil_mode): + x_out = FloorDiv( + x + 2 * padding[i] - (kernel_size[i] - 1) + (stride[i] - 1), stride[i] + ) + + if ceil_mode: + x_alt = FloorDiv( + x + 2 * padding[i] - (kernel_size[i] - 1) + 2 * (stride[i] - 1), stride[i] + ) + if V.graph.sizevars.size_hint((x_alt - 1) * stride[i] - x - padding[i]) >= 0: + # Sliding windows must start within the input or left padding + x_alt -= 1 # type: ignore[assignment] + V.graph.sizevars.guard_leq(0, x_alt * stride[i] - x - padding[i]) # type: ignore[arg-type] + if V.graph.sizevars.size_hint(x_out - x_alt) == 0: + # ceil mode is actually a no-op, lets guard on that + V.graph.sizevars.guard_equals(x_out, x_alt) + ceil_mode = False + else: + x_out = x_alt + return x_out, ceil_mode + + +fallback_max_pool2d_with_indices = fallback_handler( + aten.max_pool2d_with_indices.default, + add_to_fallback_set=False, +) + + +@register_lowering(aten.max_pool2d_with_indices, type_promotion_kind=None) +def max_pool2d_with_indices( + x, kernel_size, stride=None, padding=0, dilation=1, ceil_mode=False +): + if padding == 0: + padding = [0, 0] + if dilation == 1: + dilation = [1, 1] + if not stride: + stride = kernel_size + kernel_size = pad_listlike(kernel_size, 2) + stride = pad_listlike(stride, 2) + padding = pad_listlike(padding, 2) + dilation = pad_listlike(dilation, 2) + + assert isinstance(x, TensorBox) + assert len(kernel_size) == 2 + assert len(stride) == 2 + assert len(padding) == 2 + assert len(dilation) == 2 + assert len(x.get_size()) in (3, 4) + + x.realize_hint() + *batch, h, w = x.get_size() + + h_out, ceil_mode1 = pooling_size(h, 0, kernel_size, stride, padding, ceil_mode) + w_out, ceil_mode2 = pooling_size(w, 1, kernel_size, stride, padding, ceil_mode) + + if padding[0] or padding[1] or ceil_mode1 or 
ceil_mode2: + x_loader = constant_boundary_condition_2d(x, float("-inf")) + else: + x_loader = x.make_loader() + + new_size = list(batch) + [h_out, w_out] + window_size = kernel_size[0] * kernel_size[1] + + if window_size > 25 or any(d != 1 for d in dilation): + # Kernel size too big. Results in hard-to-optimize Triton code. Use fallback. + return fallback_max_pool2d_with_indices( + x, kernel_size, stride, padding, dilation, ceil_mode + ) + + def fn(idx, return_index): + *prefix, bh, bw = idx + maxval = None + maxindex = None + for ih, iw in itertools.product(range(kernel_size[0]), range(kernel_size[1])): + ih = bh * stride[0] + ih - padding[0] + iw = bw * stride[1] + iw - padding[1] + val = x_loader([*prefix, ih, iw]) + if return_index: + index = ops.index_expr(ih * w + iw, torch.int64) + if maxindex is None: + maxindex = index + else: + maxindex = ops.where(ops.gt(val, maxval), index, maxindex) + if maxval is None: + maxval = val + else: + maxval = ops.maximum(val, maxval) + if return_index: + return maxindex + else: + return maxval + + r1 = Pointwise.create( + device=x.get_device(), + dtype=x.get_dtype(), + inner_fn=functools.partial(fn, return_index=False), + ranges=new_size, + ) + r2 = Pointwise.create( + device=x.get_device(), + dtype=torch.int64, + inner_fn=functools.partial(fn, return_index=True), + ranges=new_size, + ) + # TODO(jansel): should we force these to be realized? + return r1, r2 + + +fallback_max_pool2d_with_indices_backward = fallback_handler( + aten.max_pool2d_with_indices_backward.default, + add_to_fallback_set=False, +) + + +@register_lowering(aten.max_pool2d_with_indices_backward, type_promotion_kind=None) +def max_pool2d_with_indices_backward( + grad_output, x, kernel_size, stride, padding, dilation, ceil_mode, indices +): + if padding == 0: + padding = [0, 0] + if dilation == 1: + dilation = [1, 1] + if not stride: + stride = kernel_size + + assert isinstance(x, TensorBox) + assert len(kernel_size) == 2 + assert len(stride) == 2 + assert len(padding) == 2 + assert len(dilation) == 2 + assert len(x.get_size()) in (3, 4) + + # we will read this many times, so make sure it is computed + grad_output.realize_hint() + try: + gO_stride = grad_output.get_stride() + except AttributeError: + # some classes don't have `get_stride` + # TODO will need a better way of determining if inputs are channels-last + gO_stride = None + if isinstance(x, TensorBox) and isinstance(x.data.data, Pointwise): # type: ignore[attr-defined] + data = x.data.data # type: ignore[attr-defined] + x_buffer = ir.ComputedBuffer( + name=None, + layout=ir.FlexibleLayout( + device=data.get_device(), + dtype=data.get_dtype(), + size=data.get_size(), + ), + data=data, + ) + x_buffer.decide_layout() + x_stride = x_buffer.get_stride() + else: + try: + x_stride = x.get_stride() + except AttributeError: + x_stride = None + + is_channels_last = (x_stride is not None and x_stride[1] == 1) or ( + gO_stride is not None and gO_stride[1] == 1 + ) + autotune = ( + config.coordinate_descent_tuning + or config.max_autotune + or config.max_autotune_pointwise + ) + if any(d != 1 for d in dilation) or (is_channels_last and not autotune): + # don't codegen channels-last when autotune is not enabled, it's very slow + return fallback_max_pool2d_with_indices_backward( + grad_output, x, kernel_size, stride, padding, dilation, ceil_mode, indices + ) + + indices.realize_hint() + + *batch, height, width = x.get_size() + *_, pooled_height, pooled_width = grad_output.get_size() + + indices_loader = indices.make_loader() + 
grad_loader = grad_output.make_loader() + new_size = list(x.get_size()) + + h_window_size = max( + [ + max(h // stride[0] - max(0, (h - kernel_size[0]) // stride[0]), 1) + for h in range(kernel_size[0] * 2) + ] + ) + w_window_size = max( + [ + max(w // stride[1] - max(0, (w - kernel_size[1]) // stride[1]), 1) + for w in range(kernel_size[1] * 2) + ] + ) + + window_size = h_window_size * w_window_size + + if window_size > 25: + # Kernel size too big. Results in hard-to-optimize Triton code. Use fallback. + return fallback_max_pool2d_with_indices_backward( + grad_output, x, kernel_size, stride, padding, dilation, ceil_mode, indices + ) + + indices_size = indices.get_size() + + def fn(idx): + *prefix, h, w = idx + index_test = ops.index_expr(h * width + w, torch.int32) + h = h + padding[0] + w = w + padding[1] + phstart = ops.index_expr( + FloorDiv(h - kernel_size[0] + stride[0], stride[0]), torch.int32 + ) + pwstart = ops.index_expr( + FloorDiv(w - kernel_size[1] + stride[1], stride[1]), torch.int32 + ) + phend = ops.index_expr(FloorDiv(h, stride[0]) + 1, torch.int32) + pwend = ops.index_expr(FloorDiv(w, stride[1]) + 1, torch.int32) + + phstart = ops.maximum(phstart, ops.constant(0, torch.int32)) + pwstart = ops.maximum(pwstart, ops.constant(0, torch.int32)) + phend = ops.minimum(phend, ops.index_expr(pooled_height, torch.int32)) + pwend = ops.minimum(pwend, ops.index_expr(pooled_width, torch.int32)) + + gradient = None + for ph_ in range(h_window_size): + for pw_ in range(w_window_size): + ph = ops.add(phstart, ops.constant(ph_, torch.int32)) + pw = ops.add(pwstart, ops.constant(pw_, torch.int32)) + grad_index = [ + *prefix, + ops.indirect_indexing( + ops.minimum(ph, ops.sub(phend, ops.constant(1, torch.int32))), + indices_size[-2], + check=False, + ), + ops.indirect_indexing( + ops.minimum(pw, ops.sub(pwend, ops.constant(1, torch.int32))), + indices_size[-1], + check=False, + ), + ] + + index_actual = indices_loader(grad_index) + grad_part = grad_loader(grad_index) + check = ops.eq(index_actual, index_test) + + if gradient is None: + # don't need mask for 0, 0 + gradient = ops.where( + check, grad_part, ops.constant(0.0, torch.float32) + ) + else: + mask = ops.and_( + ops.and_( + ops.lt(ph, phend), + ops.lt(pw, pwend), + ), + check, + ) + gradient = ops.where(mask, ops.add(gradient, grad_part), gradient) + assert gradient is not None + return gradient + + return Pointwise.create( + device=grad_output.get_device(), + dtype=grad_output.get_dtype(), + inner_fn=fn, + ranges=new_size, + ) + + +def pad_adaptive_loader(x, pad_val=0.0): + *_, h, w = x.get_size() + x_loader = x.make_loader() + + def load(prefix, increments, start_indices, end_indices): + ih, iw = increments + h_start_index, w_start_index = start_indices + h_end_index, w_end_index = end_indices + + mask = ops.and_( + ops.lt( + ops.index_expr(h_start_index + ih, torch.int64), + ops.index_expr(h_end_index, torch.int64), + ), + ops.lt( + ops.index_expr(w_start_index + iw, torch.int64), + ops.index_expr(w_end_index, torch.int64), + ), + ) + + return ops.masked( + mask, + lambda: x_loader([*prefix, h_start_index + ih, w_start_index + iw]), + pad_val, + ) + + return load + + +def _adaptive_pooling_idx_sum(kernel_maxes, start_index_fns, end_index_fns): + h_start_index_fn, w_start_index_fn = start_index_fns + h_end_index_fn, w_end_index_fn = end_index_fns + + def fn_sum(idx, loader): + *prefix, bh, bw = idx + + h_start_index = h_start_index_fn(bh) + h_end_index = h_end_index_fn(bh) + + w_start_index = w_start_index_fn(bw) + w_end_index = 
w_end_index_fn(bw) + + total = None + for ih, iw in itertools.product(range(kernel_maxes[0]), range(kernel_maxes[1])): + val = loader( + prefix, + [ih, iw], + [h_start_index, w_start_index], + [h_end_index, w_end_index], + ) + if total is None: + total = val + else: + total = ops.add(val, total) + return total + + return fn_sum + + +fallback_adaptive_avg_pool2d = fallback_handler( + aten._adaptive_avg_pool2d.default, add_to_fallback_set=False +) + + +@register_lowering(aten._adaptive_avg_pool2d) +def _adaptive_avg_pool2d(x, output_size): + assert isinstance(x, TensorBox) + assert len(output_size) == 2 + x.realize_hint() + + *batch, h_in, w_in = x.get_size() + + h_in = V.graph.sizevars.evaluate_static_shape(h_in) + w_in = V.graph.sizevars.evaluate_static_shape(w_in) + + h_out, w_out = output_size + + # no-op if the same input and output + if h_in == h_out and w_in == w_out: + return clone(x) + + if h_out == 0 or w_out == 0: + o_size = [*batch, h_out, w_out] + return empty(o_size, dtype=x.get_dtype(), device=x.get_device()) + if h_in % h_out == 0 and w_in % w_out == 0: + kernel_size = [h_in // h_out, w_in // w_out] + return avg_pool2d(x, kernel_size) + + h_kernel_max = ceildiv((h_in + h_out - 1), h_out) + w_kernel_max = ceildiv((w_in + w_out - 1), w_out) + + new_size = list(batch) + [h_out, w_out] + dtype = x.get_dtype() + + def start_index(index, out_dim, inp_dim): + return FloorDiv((index * inp_dim), out_dim) + + def end_index(index, out_dim, inp_dim): + return FloorDiv((index + 1) * inp_dim + out_dim - 1, out_dim) + + h_start_index = functools.partial(start_index, out_dim=h_out, inp_dim=h_in) + h_end_index = functools.partial(end_index, out_dim=h_out, inp_dim=h_in) + + w_start_index = functools.partial(start_index, out_dim=w_out, inp_dim=w_in) + w_end_index = functools.partial(end_index, out_dim=w_out, inp_dim=w_in) + + window_size = h_kernel_max * w_kernel_max + if window_size > 25: + # Kernel size too big. Results in hard-to-optimize Triton code. Use fallback. + return fallback_adaptive_avg_pool2d(x, output_size) + + fn_sum = _adaptive_pooling_idx_sum( + [h_kernel_max, w_kernel_max], + [h_start_index, w_start_index], + [h_end_index, w_end_index], + ) + + ones_loader = pad_adaptive_loader(ones_like(x)) + + def fn(idx): + return ops.truediv( + fn_sum(idx, pad_adaptive_loader(x)), fn_sum(idx, ones_loader) + ) + + rv = Pointwise.create( + device=x.get_device(), + dtype=dtype, + inner_fn=fn, + ranges=new_size, + ) + # TODO: should we force these to be realized? + return rv + + +def _adaptive_pooling_idx_max(kernel_maxes, in_sizes, out_sizes, return_index, loader): + # NOTE: There is some duplication between this and addaptive_avg_pool2d and max_pool2d + # Look into refactoring/deduplication after #116418 is merged. 
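+    # Adaptive pooling maps output index i along a dimension with input size
+    # in_sz and output size out_sz to the input window
+    #   [floor(i * in_sz / out_sz), ceil((i + 1) * in_sz / out_sz))
+    # e.g. in_sz=5, out_sz=3 gives the (possibly overlapping) windows
+    # [0, 2), [1, 4), [3, 5). start_index/end_index below express exactly this
+    # via FloorDiv, so the bounds remain valid sympy expressions for symbolic shapes.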
+ h_in, w_in = in_sizes + h_out, w_out = out_sizes + + def start_index(index, out_dim, inp_dim): + return FloorDiv((index * inp_dim), out_dim) + + def end_index(index, out_dim, inp_dim): + return FloorDiv((index + 1) * inp_dim + out_dim - 1, out_dim) + + h_start_index_fn = functools.partial(start_index, out_dim=h_out, inp_dim=h_in) + h_end_index_fn = functools.partial(end_index, out_dim=h_out, inp_dim=h_in) + w_start_index_fn = functools.partial(start_index, out_dim=w_out, inp_dim=w_in) + w_end_index_fn = functools.partial(end_index, out_dim=w_out, inp_dim=w_in) + + def fn_max(idx): + *prefix, bh, bw = idx + + h_start_index = h_start_index_fn(bh) + h_end_index = h_end_index_fn(bh) + + w_start_index = w_start_index_fn(bw) + w_end_index = w_end_index_fn(bw) + maxval = None + maxindex = None + for ih, iw in itertools.product(range(kernel_maxes[0]), range(kernel_maxes[1])): + val = loader( + prefix, + [ih, iw], + [h_start_index, w_start_index], + [h_end_index, w_end_index], + ) + index = ops.index_expr( + (h_start_index + ih) * w_in + w_start_index + iw, torch.int64 + ) + if return_index: + if maxindex is None: + maxindex = index + else: + maxindex = ops.where(ops.gt(val, maxval), index, maxindex) + if maxval is None: + maxval = val + else: + maxval = ops.maximum(val, maxval) + if return_index: + return maxindex + else: + return maxval + + return fn_max + + +fallback_adaptive_max_pool2d = fallback_handler( + aten.adaptive_max_pool2d.default, add_to_fallback_set=False +) + + +@register_lowering(aten.adaptive_max_pool2d) +def adaptive_max_pool2d(x, output_size): + assert isinstance(x, TensorBox) + assert len(output_size) == 2 + x.realize_hint() + + *batch, h_in, w_in = x.get_size() + + h_in = V.graph.sizevars.evaluate_static_shape(h_in) + w_in = V.graph.sizevars.evaluate_static_shape(w_in) + + h_out, w_out = output_size + + if h_out == 0 or w_out == 0: + o_size = [*batch, h_out, w_out] + return empty(o_size, dtype=x.get_dtype(), device=x.get_device()), empty( + o_size, dtype=torch.int64, device=x.get_device() + ) + if h_in % h_out == 0 and w_in % w_out == 0: + kernel_size = [h_in // h_out, w_in // w_out] + return max_pool2d_with_indices(x, kernel_size) + + h_kernel_max = ceildiv((h_in + h_out - 1), h_out) + w_kernel_max = ceildiv((w_in + w_out - 1), w_out) + + new_size = list(batch) + [h_out, w_out] + dtype = x.get_dtype() + + window_size = h_kernel_max * w_kernel_max + if window_size > 25: + # Kernel size too big. Results in hard-to-optimize Triton code. Use fallback. 
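+        # The window loops in _adaptive_pooling_idx_max are unrolled at lowering
+        # time (one load per (ih, iw) pair), so windows larger than 25 elements
+        # (i.e. beyond 5x5) would generate a very large pointwise body; fall back
+        # to the ATen kernel instead.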
+ return fallback_adaptive_max_pool2d(x, output_size) + + inner_func_max_val = _adaptive_pooling_idx_max( + kernel_maxes=[h_kernel_max, w_kernel_max], + in_sizes=[h_in, w_in], + out_sizes=[h_out, w_out], + return_index=False, + loader=pad_adaptive_loader(x, float("-inf")), + ) + + inner_func_max_idx = _adaptive_pooling_idx_max( + kernel_maxes=[h_kernel_max, w_kernel_max], + in_sizes=[h_in, w_in], + out_sizes=[h_out, w_out], + return_index=True, + loader=pad_adaptive_loader(x, float("-inf")), + ) + + rv = Pointwise.create( + device=x.get_device(), + dtype=dtype, + inner_fn=inner_func_max_val, + ranges=new_size, + ) + ri = Pointwise.create( + device=x.get_device(), + dtype=torch.int64, + inner_fn=inner_func_max_idx, + ranges=new_size, + ) + return rv, ri + + +fallback_fractional_max_pool2d = fallback_handler( + aten.fractional_max_pool2d.default, add_to_fallback_set=False +) + + +def _fractional_pooling_offsets(samples, in_sz, out_sz, kernel_sz, dim): + out_sz = out_sz[dim] + in_sz = in_sz[dim] + kernel_sz = kernel_sz[dim] + alpha = (in_sz - kernel_sz) / (out_sz - 1) + samples_loader = samples.make_loader() + + def load(prefix, i): + sample = samples_loader([*prefix, dim]) + i_expr = ops.index_expr(i, samples.get_dtype()) + alpha_expr = ops.index_expr(alpha, samples.get_dtype()) + seq_i = ops.floor((i_expr + sample) * alpha_expr) - ops.floor( + sample * alpha_expr + ) + seq_i = ops.to_dtype(seq_i, torch.int64) + + mask = ops.lt( + i_expr, + ops.index_expr(out_sz - 1, torch.int64), + ) + return ops.where(mask, seq_i, ops.index_expr(in_sz - kernel_sz, torch.int64)) + + return load + + +@register_lowering(aten.fractional_max_pool2d) +def fractional_max_pool2d(x, kernel_size, output_size, random_samples): + x.realize_hint() + *batch, inp_h, inp_w = x.get_size() + kernel_h, kernel_w = kernel_size + h_out, w_out = output_size + + if kernel_h * kernel_w >= 25: + return fallback_fractional_max_pool2d( + x, kernel_size, output_size, random_samples + ) + + gen_offsets_for_dim = functools.partial( + _fractional_pooling_offsets, + samples=random_samples, + in_sz=[inp_h, inp_w], + out_sz=output_size, + kernel_sz=kernel_size, + ) + + h_index_fn = gen_offsets_for_dim(dim=0) + w_index_fn = gen_offsets_for_dim(dim=1) + x_loader = x.make_loader() + + def fn(idx, return_index): + *prefix, bh, bw = idx + + h_start_index = ops.indirect_indexing(h_index_fn(prefix, bh), inp_h) + w_start_index = ops.indirect_indexing(w_index_fn(prefix, bw), inp_w) + + maxval = None + maxindex = None + for ih, iw in itertools.product(range(kernel_size[0]), range(kernel_size[1])): + val = x_loader([*prefix, h_start_index + ih, w_start_index + iw]) + if return_index: + index = ops.index_expr( + (h_start_index + ih) * inp_w + w_start_index + iw, torch.int64 + ) + if maxindex is None: + maxindex = index + else: + maxindex = ops.where( + ops.or_(ops.gt(val, maxval), ops.isnan(val)), index, maxindex + ) + if maxval is None: + maxval = val + else: + maxval = ops.maximum(val, maxval) + if return_index: + return maxindex + else: + return maxval + + new_size = list(batch) + [h_out, w_out] + rv = Pointwise.create( + device=x.get_device(), + dtype=x.get_dtype(), + inner_fn=functools.partial(fn, return_index=False), + ranges=new_size, + ) + + ri = Pointwise.create( + device=x.get_device(), + dtype=torch.int64, + inner_fn=functools.partial(fn, return_index=True), + ranges=new_size, + ) + return rv, ri + + +@register_lowering(aten.upsample_nearest2d_backward.default) +def upsample_nearest2d_backward( + x, output_size=None, input_size=None, 
scales_h=None, scales_w=None +): + x.realize_hint() + + *batch, inp_h, inp_w = x.get_size() + inp_h = V.graph.sizevars.evaluate_static_shape(inp_h) + inp_w = V.graph.sizevars.evaluate_static_shape(inp_w) + + *batch, out_h, out_w = input_size + + if inp_h % out_h == 0 and inp_w % out_w == 0: + return avg_pool2d(x, [inp_h // out_h, inp_w // out_w], divisor_override=1) + + h_kernel_max = ceildiv(inp_h, out_h) + w_kernel_max = ceildiv(inp_w, out_w) + + def start_index(index, out_dim, inp_dim): + return CeilDiv(index * inp_dim, out_dim) + + def end_index(index, out_dim, inp_dim): + return start_index((index + 1), out_dim, inp_dim) + + h_start_index = functools.partial(start_index, out_dim=out_h, inp_dim=inp_h) + h_end_index = functools.partial(end_index, out_dim=out_h, inp_dim=inp_h) + + w_start_index = functools.partial(start_index, out_dim=out_w, inp_dim=inp_w) + w_end_index = functools.partial(end_index, out_dim=out_w, inp_dim=inp_w) + + fn_sum = _adaptive_pooling_idx_sum( + [h_kernel_max, w_kernel_max], + [h_start_index, w_start_index], + [h_end_index, w_end_index], + ) + + def fn(idx): + return fn_sum(idx, pad_adaptive_loader(x)) + + rv = Pointwise.create( + device=x.get_device(), + dtype=x.get_dtype(), + inner_fn=fn, + ranges=list(input_size), + ) + + return rv + + +fallback_avg_pool2d = fallback_handler( + aten.avg_pool2d.default, add_to_fallback_set=False +) + + +@register_lowering(aten.avg_pool2d, type_promotion_kind=None) +def avg_pool2d( + x, + kernel_size, + stride=(), + padding=0, + ceil_mode=False, + count_include_pad=True, + divisor_override=None, +): + if not stride: + stride = kernel_size + if not padding: + padding = [0, 0] + kernel_size = pad_listlike(kernel_size, 2) + stride = pad_listlike(stride, 2) + padding = pad_listlike(padding, 2) + + assert isinstance(x, TensorBox) + assert len(kernel_size) == 2 + assert len(stride) == 2 + assert len(padding) == 2 + assert len(x.get_size()) in (3, 4) + + x.realize_hint() + *batch, h, w = x.get_size() + + h_out, ceil_mode1 = pooling_size(h, 0, kernel_size, stride, padding, ceil_mode) + w_out, ceil_mode2 = pooling_size(w, 1, kernel_size, stride, padding, ceil_mode) + + if padding[0] or padding[1] or ceil_mode1 or ceil_mode2: + x_loader = constant_boundary_condition_2d(x, 0.0) + had_padding = True + else: + x_loader = x.make_loader() + had_padding = False + + new_size = list(batch) + [h_out, w_out] + dtype = x.get_dtype() + + window_size = kernel_size[0] * kernel_size[1] + if window_size > 25: + # Kernel size too big. Results in hard-to-optimize Triton code. Use fallback. + return fallback_avg_pool2d( + x, + kernel_size, + stride, + padding, + ceil_mode, + count_include_pad, + divisor_override, + ) + + def fn_sum(idx, loader): + *prefix, bh, bw = idx + total = None + for ih, iw in itertools.product(range(kernel_size[0]), range(kernel_size[1])): + ih = bh * stride[0] + ih - padding[0] + iw = bw * stride[1] + iw - padding[1] + val = loader([*prefix, ih, iw]) + if total is None: + total = val + else: + total = ops.add(val, total) + return total + + if not had_padding or divisor_override: + if divisor_override: + scale = 1 / divisor_override + else: + scale = 1.0 / (kernel_size[0] * kernel_size[1]) + + def fn(idx): + return ops.mul(fn_sum(idx, x_loader), ops.constant(scale, dtype)) + + else: + ones_loader = constant_boundary_condition_2d( + ones_like(x), 0.0, padding if count_include_pad else None + ) + + def fn(idx): + # TODO(jansel): optimize to do `int(x 25: + # Kernel size too big. Results in hard-to-optimize Triton code. Use fallback. 
+ return fallback_avg_pool2d_backward( + grad_output, + x, + kernel_size, + stride, + padding, + ceil_mode, + count_include_pad, + divisor_override, + ) + + def compute_pool_size_without_padding(ph, pw): + """ + This computes the scaling factor that we will divide an element + by when `count_include_pad=False` + """ + stride_h = ops.constant(stride[0], torch.int32) + stride_w = ops.constant(stride[1], torch.int32) + pad_h = ops.constant(padding[0], torch.int32) + pad_w = ops.constant(padding[1], torch.int32) + kernel_h = ops.constant(kernel_size[0], torch.int32) + kernel_w = ops.constant(kernel_size[1], torch.int32) + hstart = ops.sub(ops.mul(ph, stride_h), pad_h) + wstart = ops.sub(ops.mul(pw, stride_w), pad_w) + hend = ops.minimum( + ops.add(hstart, kernel_h), + ops.add(ops.index_expr(height, torch.int32), pad_h), + ) + wend = ops.minimum( + ops.add(wstart, kernel_w), + ops.add(ops.index_expr(width, torch.int32), pad_w), + ) + hstart = ops.maximum(hstart, ops.constant(0, torch.int32)) + wstart = ops.maximum(wstart, ops.constant(0, torch.int32)) + hend = ops.minimum(hend, ops.index_expr(height, torch.int32)) + wend = ops.minimum(wend, ops.index_expr(width, torch.int32)) + divide_factor = ops.mul(ops.sub(hend, hstart), ops.sub(wend, wstart)) + return divide_factor + + def fn(idx): + *prefix, h, w = idx + h = h + padding[0] + w = w + padding[1] + phstart = ops.index_expr( + FloorDiv(h - kernel_size[0] + stride[0], stride[0]), torch.int32 + ) + pwstart = ops.index_expr( + FloorDiv(w - kernel_size[1] + stride[1], stride[1]), torch.int32 + ) + phend = ops.index_expr(FloorDiv(h, stride[0]) + 1, torch.int32) + pwend = ops.index_expr(FloorDiv(w, stride[1]) + 1, torch.int32) + + phstart = ops.maximum(phstart, ops.constant(0, torch.int32)) + pwstart = ops.maximum(pwstart, ops.constant(0, torch.int32)) + phend = ops.minimum(phend, ops.index_expr(pooled_height, torch.int32)) + pwend = ops.minimum(pwend, ops.index_expr(pooled_width, torch.int32)) + + gradient = None + for ph_ in range(h_window_size): + for pw_ in range(w_window_size): + ph = ops.add(phstart, ops.constant(ph_, torch.int32)) + pw = ops.add(pwstart, ops.constant(pw_, torch.int32)) + + if divisor_override is not None: + scale = divisor_override + elif count_include_pad or not had_padding: + scale = kernel_size[0] * kernel_size[1] + else: + scale = compute_pool_size_without_padding(ph, pw) + + part = ops.truediv( + grad_loader( + [ + *prefix, + ops.indirect_indexing( + ops.minimum( + ph, ops.sub(phend, ops.constant(1, torch.int32)) + ), + pooled_height, + check=False, + ), + ops.indirect_indexing( + ops.minimum( + pw, ops.sub(pwend, ops.constant(1, torch.int32)) + ), + pooled_width, + check=False, + ), + ] + ), + scale, + ) + + mask = ops.and_( + ops.lt(ph, phend), + ops.lt(pw, pwend), + ) + if gradient is None: + gradient = ops.where(mask, part, ops.constant(0.0, torch.float32)) + else: + gradient = ops.where(mask, ops.add(gradient, part), gradient) + assert gradient is not None + return gradient + + rv = Pointwise.create( + device=grad_output.get_device(), + dtype=dtype, + inner_fn=fn, + ranges=new_size, + ) + return rv + + +def _validate_reduction_axis(x, axis): + size = x.get_size() + if isinstance(axis, int): + axis = [axis] + elif not axis: + axis = range(len(size)) + if len(size) == 0: + assert tuple(axis) in [(), (0,), (-1,)], f"invalid axis: {axis}" + return [] + axis = list(axis) + for i in range(len(axis)): + if axis[i] < 0: + axis[i] += len(size) if len(size) else 1 + assert 0 <= axis[i] < len(size) or (len(size) == 0 and 
axis[i] == 0) + assert len(set(axis)) == len(axis), "reduction axis not unique" + return axis + + +def _make_reduction_inner(x, *, axis, keepdims, dtype, override_return_dtype): + if dtype is not None: + x = to_dtype(x, dtype) + size = x.get_size() + axis = set(_validate_reduction_axis(x, axis)) + + kept_sizes = [] + kept_idx = [] + reduced_sizes = [] + reduced_idx = [] + for i in range(len(size)): + if i in axis: + reduced_idx.append(i) + reduced_sizes.append(size[i]) + else: + kept_idx.append(i) + kept_sizes.append(size[i]) + + def loader(index, reduction_index): + assert len(reduction_index) == len(reduced_idx) + if keepdims: + assert len(index) == len(size) + index = [index[i] for i in kept_idx] + assert len(index) == len(kept_idx) + new_index = [None] * (len(index) + len(reduction_index)) + for idx, var in itertools.chain( + zip(kept_idx, index), zip(reduced_idx, reduction_index) + ): + new_index[idx] = var + return inner_loader(new_index) + + if keepdims: + new_size = list(size) + for i in reduced_idx: + new_size[i] = sympy.Integer(1) + else: + new_size = kept_sizes + + inner_loader = x.make_loader() + return dict( + device=x.get_device(), + dst_dtype=override_return_dtype or x.get_dtype(), + src_dtype=x.get_dtype(), + inner_fn=loader, + ranges=new_size, + reduction_ranges=reduced_sizes, + ) + + +def make_reduction(reduction_type: str, override_return_dtype=None): + def inner(x, axis=None, keepdims=False, *, dtype=None): + kwargs = _make_reduction_inner( + x, + axis=axis, + keepdims=keepdims, + dtype=dtype, + override_return_dtype=override_return_dtype, + ) + result = Reduction.create(reduction_type=reduction_type, input_node=x, **kwargs) + if isinstance( + result.data.data, Reduction + ): # Only realize if reduction isn't unrolled + result.realize() + return result + + return inner + + +def _make_scan_inner(x, *, axis, dtype): + if dtype is not None: + x = to_dtype(x, dtype) + size = x.get_size() + axis = _validate_dim(x, axis) + + return dict( + device=x.get_device(), + dtype=x.get_dtype(), + inner_fn=x.make_loader(), + size=x.get_size(), + axis=axis, + ) + + +@register_lowering(aten.mean) +def mean(x, axis=None, keepdim=False, *, dtype=None): + if dtype is not None: + x = to_dtype(x, dtype) + size = x.get_size() + axis = _validate_reduction_axis(x, axis) + # compute in higher-precision until end of mean lowering + output_dtype = x.get_dtype() + if output_dtype in (torch.float16, torch.bfloat16): + x = to_dtype(x, torch.float) + sum_result = sum_(x, axis, keepdim) + denom = sympy_product(size[i] for i in axis) + denom = ir.IndexingConstant(denom, x.get_dtype(), x.get_device()) + denom = ExpandView.create(denom, list(sum_result.get_size())) + return to_dtype(div(sum_result, denom), output_dtype) + + +def var_mean_sum_(x, axis, correction, keepdim, return_mean): + if correction is None: + correction = 1 + + size = x.get_size() + axis = _validate_reduction_axis(x, axis) + x_mean = mean(x, axis, keepdim=True) + if return_mean: + x_mean.realize() + + diffs = square(sub(x, x_mean)) + sum_result = sum_(diffs, axis, keepdim) + + denom = sympy_product(size[i] for i in axis) + if correction: + denom = sympy.Max(denom - correction, 0) + denom = ir.IndexingConstant(denom, x.get_dtype(), x.get_device()) + denom = ExpandView.create(denom, list(sum_result.get_size())) + x_var = div(sum_result, denom) + if not return_mean: + return (x_var,) + + x_mean = x_mean if keepdim else squeeze(x_mean, axis) + return x_var, x_mean + + +def use_two_step_variance(x, axis, keepdim): + # Instead of unrolling 
welford, just unroll the simpler two-step var + axis = _validate_reduction_axis(x, axis) + kwargs = _make_reduction_inner( + x, axis=axis, keepdims=keepdim, dtype=None, override_return_dtype=None + ) + + ranges = kwargs["ranges"] + reduction_numel = sympy_product(kwargs["reduction_ranges"]) + return ( + isinstance(reduction_numel, sympy.Integer) + and int(reduction_numel) < config.unroll_reductions_threshold + and sympy_product(ranges) != 1 + ) + + +def var_mean_welford_(x, axis, *, correction, keepdim, return_mean): + if correction is None: + correction = 1 + + kwargs = _make_reduction_inner( + x, axis=axis, keepdims=keepdim, dtype=None, override_return_dtype=None + ) + loader = kwargs.pop("inner_fn") + kwargs.pop("dst_dtype") + kwargs.pop("src_dtype") + + mean, m2, _ = ir.WelfordReduction.create( + inner_fns=(loader,), + reduction_type="welford_reduce", + dtype=x.get_dtype(), + **kwargs, + ) + m2.realize() + + dtype = x.get_dtype() + size = x.get_size() + axis = _validate_reduction_axis(x, axis) + rnumel = sympy_product(size[i] for i in axis) + + def get_constant_or_index_expr(x, dtype): + if isinstance(x, sympy.Expr) and not x.is_number: + return ops.to_dtype(ops.index_expr(x, torch.int64), dtype) + return ops.constant(x, dtype) + + def scale_fn(data): + c = get_constant_or_index_expr(correction, dtype) + N = get_constant_or_index_expr(rnumel, dtype) + zero = ops.constant(0, dtype) + return data / ops.maximum(zero, N - c) + + var = make_pointwise(scale_fn)(m2) + + if return_mean: + mean.realize() + return var, mean + return (var,) + + +def var_mean_helper_(x, *, axis, correction, keepdim, return_mean): + out_dtype = x.get_dtype() + compute_dtype = get_computation_dtype(out_dtype) + x = to_dtype(x, compute_dtype, copy=False) + kwargs = dict( + x=x, + axis=axis, + correction=correction, + keepdim=keepdim, + return_mean=return_mean, + ) + output = ( + var_mean_sum_(**kwargs) + if use_two_step_variance(x, axis=axis, keepdim=keepdim) + else var_mean_welford_(**kwargs) + ) + output = tuple(to_dtype(x, out_dtype, copy=False) for x in output) + return output[0] if not return_mean else output + + +@register_lowering([aten.var, prims.var]) +def var_(x, axis=None, *, correction=None, keepdim=False): + return var_mean_helper_( + x, axis=axis, correction=correction, keepdim=keepdim, return_mean=False + ) + + +@register_lowering(aten.var_mean) +def var_mean(x, axis=None, *, correction=None, keepdim=False): + return var_mean_helper_( + x, axis=axis, correction=correction, keepdim=keepdim, return_mean=True + ) + + +def pow_recursive(x, y, dtype): + if y < 0: + return pow_recursive(ops.reciprocal(x), -y, dtype) + if y == 0: + return ops.constant(1, dtype) + if y == 1: + return x + + result = pow_recursive(x, y // 2, dtype) + result = ops.mul(result, result) + if (y % 2) == 1: + result = ops.mul(result, x) + return result + + +@make_pointwise +def pow_native(a, b): + return ops.pow(a, b) + + +fallback_pow_tensor_tensor = fallback_handler( + aten.pow.Tensor_Tensor, add_to_fallback_set=False +) +fallback_pow_scalar = fallback_handler(aten.pow.Scalar, add_to_fallback_set=False) +fallback_pow_tensor_scalar = fallback_handler( + aten.pow.Tensor_Scalar, add_to_fallback_set=False +) + + +@register_lowering(aten.pow, broadcast=True) +def pow(a, b): + if isinstance(b, float) and b == int(b): + return pow(a, int(b)) + elif isinstance(b, float) and b == 0.5: + return sqrt(a) + elif isinstance(b, int) and b == 1: + return clone(a) + + # Type promotion ensures all tensor arguments have the same type + dtype = 
next(x.get_dtype() for x in (a, b) if isinstance(x, ir.TensorBox)) + is_integer_pow = is_integer_dtype(dtype) + + # Optimize away small fixed powers, or for integers avoid falling back to ATen + embed_exponent = isinstance(b, int) and ( + -32 < b < 32 or (is_integer_pow and b >= 0) + ) + if embed_exponent: + loader = a.make_loader() + + def fn(idx): + return pow_recursive(loader(idx), b, a.get_dtype()) + + return Pointwise.create( + device=a.get_device(), + dtype=a.get_dtype(), + inner_fn=fn, + ranges=a.get_size(), + ) + + if isinstance(a, Number): + if a == 1: + return full_like(b, 1) + if a == 2 and is_float_dtype(b.get_dtype()): + return exp2(b) + + if is_integer_pow: + # ops.pow doesn't work for integers + if isinstance(a, Number): + return fallback_pow_scalar(a, b) + elif isinstance(b, Number): + return fallback_pow_tensor_scalar(a, b) + else: + return fallback_pow_tensor_tensor(a, b) + + return pow_native(a, b) + + +def mutate_to(changed, val, unsafe_alias=False): + if isinstance(changed, TensorBox): + changed_data = changed.data + else: + changed_data = changed + if isinstance(val, TensorBox): + val = val.data + + if not isinstance(val, ir.StorageBox): + # introduce a copy to handle views + val = Pointwise.create( + device=changed.get_device(), + dtype=changed.get_dtype(), + inner_fn=val.make_loader(), + ranges=changed.get_size(), + ).data + assert isinstance(val, ir.StorageBox) + + if isinstance(changed_data, ir.StorageBox) and not ( + changed_data.is_input_buffer() or isinstance(changed_data.data, ir.NopKernel) + ): + # Fast path, just swing the data pointer + val.realize() + changed_data.data = val.data + return changed + + ir.MutationLayout.realize_into(val, changed_data, unsafe_alias=unsafe_alias) + return changed + + +@register_lowering(aten.fill_) +def fill_(x, fill_value): + return mutate_to(x, full_like(x, fill_value)) + + +@register_lowering(aten.copy_, type_promotion_kind=None) +def copy_(dst, src, non_blocking=False): + src = to_device(src, dst.get_device()) + src = to_dtype(src, dst.get_dtype()) + src = expand(src, dst.get_size()) + return mutate_to(dst, src) + + +@make_pointwise +def floordiv(a, b): + return ops.floordiv(a, b) + + +@make_pointwise +def truncdiv(a, b): + return ops.truncdiv(a, b) + + +@register_lowering(aten.div, broadcast=True) +def div_mode(a, b, rounding_mode=None): + both_integer = is_integer_type(a) and is_integer_type(b) + both_boolean = is_boolean_type(a) and is_boolean_type(b) + + # floordiv and truncdiv need special handling for integer tensors on Triton, + # see the discussion at https://github.com/openai/triton/issues/605 + if rounding_mode == "floor": + assert not both_boolean, "floordiv operands can not be boolean at the same time" + return floordiv(a, b) if both_integer else floor(div(a, b)) + if rounding_mode == "trunc": + assert not both_boolean, "truncdiv operands can not be boolean at the same time" + return truncdiv(a, b) if both_integer else trunc(div(a, b)) + return div(a, b) + + +@register_lowering([aten.mul], broadcast=True) +def mul(a, b): + both_bool = is_boolean_type(a) and is_boolean_type(b) + if both_bool: + return logical_and(a, b) + else: + fn = ops_wrapper(aten.mul.__name__) + return make_pointwise(fn)(a, b) + + +# NOTE: prims.div maps to a / b in C, so performs truncation division on +# integer inputs and true division for floating and complex inputs. 
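+# For example, with integer operands truncation division rounds toward zero, so
+# -7 / 2 yields -3 here, whereas aten.div with rounding_mode="floor" (handled by
+# div_mode above) yields -7 // 2 == -4; with floating-point operands this is
+# true division, so -7.0 / 2.0 == -3.5.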
+@register_lowering([prims.div], broadcast=True) +def div_prim(a, b): + is_integral = all(is_boolean_type(x) or is_integer_type(x) for x in [a, b]) + + if is_integral: + return truncdiv(a, b) + + def fn(*args): + return ops.truediv(*args) + + return make_pointwise(fn)(a, b) + + +@register_lowering( + [aten.true_divide, aten.div.Tensor], + broadcast=True, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) +def div(a, b): + a, b = promote_constants( + (a, b), type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT + ) + return div_prim(a, b) + + +@register_lowering([aten.fmod, prims.fmod], broadcast=True) +def fmod(a, b): + is_integral = is_boolean_type(a) or is_integer_type(a) + + if is_integral: + + def fn(a, b): + return ops.mod(a, b) + + else: + + def fn(a, b): + return ops.fmod(a, b) + + return make_pointwise(fn)(a, b) + + +@register_lowering(aten.rsqrt) +def rsqrt(x): + dtype = x.get_dtype() + if is_integer_dtype(dtype) or is_boolean_dtype(dtype): + x = to_dtype(x, torch.get_default_dtype()) + + def _rsqrt(x): + return ops.rsqrt(x) + + return make_pointwise(_rsqrt)(x) + + +@register_lowering([aten.sum, prims.sum]) +def sum_(x, axis=None, keepdims=False, *, dtype=None): + if ( + is_integer_dtype(x.get_dtype()) or is_boolean_dtype(x.get_dtype()) + ) and dtype is None: + dtype = torch.int64 + + fn = make_reduction("sum", override_return_dtype=dtype) + return fn(x, axis, keepdims, dtype=dtype) + + +fallback_cumsum = fallback_handler(aten.cumsum.default) +fallback_cumprod = fallback_handler(aten.cumprod.default) +fallback_logcumsumexp = fallback_handler(aten.logcumsumexp.default) + + +@register_lowering(aten.cumsum) +def cumsum(x, axis=None, dtype=None): + if ( + is_integer_dtype(x.get_dtype()) or is_boolean_dtype(x.get_dtype()) + ) and dtype is None: + dtype = torch.int64 + + if len(x.get_size()) == 0: + assert axis in [0, -1] + dtype = dtype or x.get_dtype() + return to_dtype(x, dtype, copy=True) + + kwargs = _make_scan_inner(x, axis=axis, dtype=dtype) + result = ir.Scan.create(**kwargs, combine_fn=ops.add, init=0) + if result is None: + return fallback_cumsum(x, dim=axis, dtype=dtype) + return result + + +@register_lowering(aten.cumprod) +def cumprod(x, axis=None, dtype=None): + if ( + is_integer_dtype(x.get_dtype()) or is_boolean_dtype(x.get_dtype()) + ) and dtype is None: + dtype = torch.int64 + + if len(x.get_size()) == 0: + assert axis in [0, -1] + dtype = dtype or x.get_dtype() + return to_dtype(x, dtype, copy=True) + + kwargs = _make_scan_inner(x, axis=axis, dtype=dtype) + result = ir.Scan.create(**kwargs, combine_fn=ops.mul, init=1) + if result is None: + return fallback_cumprod(x, dim=axis, dtype=dtype) + return result + + +@register_lowering(aten.logcumsumexp) +def logcumsumexp(x, dim): + def log_add_exp_helper(a, b): + min_v = ops.minimum(a, b) + max_v = ops.maximum(a, b) + mask = (min_v != max_v) | (~ops.isinf(min_v)) + return ops.where(mask, ops.log1p(ops.exp(min_v - max_v)) + max_v, a) + + dtype = x.get_dtype() + if len(x.get_size()) == 0: + assert dim in [0, -1] + return clone(x) + + kwargs = _make_scan_inner(x, axis=dim, dtype=dtype) + result = ir.Scan.create(**kwargs, combine_fn=log_add_exp_helper, init=float("-inf")) + if result is None: + return fallback_logcumsumexp(x, dim=dim) + return result + + +@register_lowering(aten.prod) +def prod(x, axis=None, keepdims=False, *, dtype=None): + if ( + is_integer_dtype(x.get_dtype()) or is_boolean_dtype(x.get_dtype()) + ) and dtype is None: + dtype = torch.int64 + + fn = make_reduction("prod", 
override_return_dtype=dtype) + return fn(x, axis, keepdims, dtype=dtype) + + +@register_lowering(aten.any) +def reduce_any(x, dim=None, keepdim=False): + x = to_dtype(x, torch.bool) + return make_reduction("any")(x, axis=dim, keepdims=keepdim) + + +@register_lowering(aten.max, type_promotion_kind=None) +def reduce_max(x, dim=None, keepdim=False): + if dim is not None: + return ( + reduce_amax(x, axis=dim, keepdims=keepdim), + reduce_argmax(x, axis=dim, keepdims=keepdim), + ) + + return reduce_amax(x, axis=None, keepdims=keepdim) + + +@register_lowering(aten.min, type_promotion_kind=None) +def reduce_min(x, dim=None, keepdim=False): + if dim is not None: + return ( + reduce_amin(x, axis=dim, keepdims=keepdim), + reduce_argmin(x, axis=dim, keepdims=keepdim), + ) + + return reduce_amin(x, axis=None, keepdims=keepdim) + + +register_lowering(prims.xor_sum)(make_reduction("xor_sum")) +reduce_amax = register_lowering(aten.amax)(make_reduction("max")) +reduce_amin = register_lowering(aten.amin)(make_reduction("min")) +reduce_argmax = register_lowering(aten.argmax)( + make_reduction("argmax", override_return_dtype=torch.int64) +) +reduce_argmin = register_lowering(aten.argmin)( + make_reduction("argmin", override_return_dtype=torch.int64) +) + +add = register_pointwise( + aten.add, allow_alpha=True, override_fn_when_input_bool="logical_or" +) + + +def register_pointwise_numeric(op, name=None, triton_fallback=None): + return register_pointwise( + op, + name=name, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + triton_fallback=triton_fallback, + ) + + +def register_pointwise_numeric_ldf64(op): + return register_pointwise( + op, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + use_libdevice_for_f64=True, + ) + + +exp = register_pointwise_numeric_ldf64(aten.exp) +exp2 = register_pointwise_numeric(aten.exp2) +expm1 = register_pointwise_numeric(aten.expm1) +relu = register_pointwise(aten.relu) +sigmoid = register_pointwise_numeric_ldf64(aten.sigmoid) +sqrt = register_pointwise_numeric_ldf64(aten.sqrt) +square = register_pointwise(aten.square) +sub = register_pointwise(aten.sub, allow_alpha=True) +register_pointwise_numeric_ldf64(aten.cos) +register_pointwise_numeric_ldf64(aten.sin) +abs = register_pointwise(aten.abs) +bitwise_and = register_pointwise(aten.bitwise_and) +bitwise_left_shift = register_pointwise(aten.bitwise_left_shift) +bitwise_not = register_pointwise( + aten.bitwise_not, override_fn_when_input_bool="logical_not" +) +bitwise_or = register_pointwise(aten.bitwise_or) +bitwise_right_shift = register_pointwise(aten.bitwise_right_shift) +bitwise_xor = register_pointwise(aten.bitwise_xor) +register_pointwise_numeric(aten.lgamma) +erf = register_pointwise_numeric(aten.erf) +register_lowering( + aten.special_erf, type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT +)(erf) + +register_pointwise_numeric(aten.log1p) +register_pointwise_numeric(aten.tan) +register_pointwise_numeric(aten.tanh) +register_pointwise_numeric_ldf64(aten.log) +logical_and = register_pointwise( + aten.logical_and, + type_promotion_kind=None, + convert_input_to_bool=True, + override_return_dtype=torch.bool, +) +logical_not = register_pointwise( + aten.logical_not, + type_promotion_kind=None, + convert_input_to_bool=True, + override_return_dtype=torch.bool, +) +logical_or = register_pointwise( + aten.logical_or, + type_promotion_kind=None, + convert_input_to_bool=True, + override_return_dtype=torch.bool, +) +logical_xor = register_pointwise( + aten.logical_xor, + 
type_promotion_kind=None, + convert_input_to_bool=True, + override_return_dtype=torch.bool, +) +maximum = register_pointwise(aten.maximum) +minimum = register_pointwise(aten.minimum) +register_lowering(aten.clamp_min)(maximum) +register_lowering(aten.clamp_max)(minimum) +neg = register_pointwise(aten.neg) +abs = register_pointwise(aten.abs) +reciprocal = register_pointwise_numeric(aten.reciprocal) +register_pointwise(aten.remainder) +sign = register_pointwise(aten.sign, override_fn_when_input_bool="identity") +register_pointwise(aten.ceil) +register_pointwise(aten.signbit, override_return_dtype=torch.bool) + +register_lowering(aten._neg_view)(neg) + +register_pointwise(aten.le, override_return_dtype=torch.bool) +register_pointwise(aten.lt, override_return_dtype=torch.bool) +register_pointwise(aten.ge, override_return_dtype=torch.bool) +gt = register_pointwise(aten.gt, override_return_dtype=torch.bool) +register_pointwise(aten.eq, override_return_dtype=torch.bool) +register_pointwise(aten.ne, override_return_dtype=torch.bool) + +register_pointwise_numeric(aten.cosh) +register_pointwise_numeric(aten.sinh) +register_pointwise_numeric(aten.acos) +register_pointwise_numeric(aten.acosh) +register_pointwise_numeric(aten.asin) +register_pointwise_numeric(aten.asinh) +register_pointwise_numeric(aten.atan2) +register_pointwise_numeric(aten.atan) +register_pointwise_numeric(aten.atanh) +register_pointwise_numeric(aten.copysign) +register_pointwise_numeric(aten.erfc) +register_pointwise_numeric(aten.erfinv) +register_pointwise_numeric(aten.hypot) +register_pointwise_numeric(aten.log10) +register_pointwise_numeric(aten.nextafter) + +from .codegen.common import pointwise_overrides_data + + +def _get_pointwise_overrides(ns, name): + data = pointwise_overrides_data[name] + op = getattr(ns, data.name, None) + if op is None: + return + + def make_triton_fallback(op): + if data.triton is None: + return fallback_handler(op) + + if isinstance(op, torch._ops.OpOverloadPacket): + for olname in op.overloads(): + ol = getattr(op, olname) + yield ol, data.type_promotion_kind, make_triton_fallback(ol) + else: + yield op, data.type_promotion_kind, make_triton_fallback(op) + + +for name in pointwise_overrides_data: + for op, type_promotion_kind, triton_fallback in _get_pointwise_overrides( + aten, name + ): + register_pointwise( + op, + name=name, + type_promotion_kind=type_promotion_kind, + triton_fallback=triton_fallback, + ) + + for op, type_promotion_kind, triton_fallback in _get_pointwise_overrides( + prims, name + ): + register_pointwise( + op, + name=name, + type_promotion_kind=type_promotion_kind, + triton_fallback=triton_fallback, + ) + + +foreach_add_list = register_foreach_pointwise( + aten._foreach_add.List, add, allow_alpha=True +) +foreach_add_scalar = register_foreach_pointwise( + aten._foreach_add.Scalar, add, allow_alpha=True +) +register_foreach_pointwise(aten._foreach_add.Tensor, add, allow_alpha=True) +foreach_mul_list = register_foreach_pointwise(aten._foreach_mul.List, mul) +foreach_mul_scalar = register_foreach_pointwise(aten._foreach_mul.Scalar, mul) +register_foreach_pointwise(aten._foreach_sub.List, sub) +register_foreach_pointwise(aten._foreach_sub.Scalar, sub) +register_foreach_pointwise(aten._foreach_neg.default, neg) +register_foreach_pointwise(aten._foreach_abs.default, abs) +register_foreach_pointwise(aten._foreach_pow.Scalar, pow) +register_foreach_pointwise(aten._foreach_pow.ScalarAndTensor, pow) +foreach_div_list = register_foreach_pointwise(aten._foreach_div.List, div) 
+foreach_div_scalar = register_foreach_pointwise(aten._foreach_div.Scalar, div) +register_foreach_pointwise(aten._foreach_sqrt, sqrt) +register_foreach_pointwise(aten._foreach_maximum.List, maximum) +register_foreach_pointwise(aten._foreach_maximum.Scalar, maximum) +register_foreach_pointwise(aten._foreach_minimum.List, minimum) +register_foreach_pointwise(aten._foreach_minimum.Scalar, minimum) +register_foreach_pointwise(aten._foreach_clamp_min.List, maximum) +register_foreach_pointwise(aten._foreach_clamp_min.Scalar, maximum) +register_foreach_pointwise(aten._foreach_clamp_max.List, minimum) +register_foreach_pointwise(aten._foreach_clamp_max.Scalar, minimum) +register_foreach_pointwise(aten._foreach_reciprocal, reciprocal) +register_foreach_pointwise(aten._foreach_sign, sign) +register_foreach_pointwise(aten._foreach_copy, copy) + + +# these are only encountered as outputs of the graph +# reinplacing epilogue copies improves compile time +# by removing extra buffers sent to the scheduler. +def register_foreach_inplace(aten_op, outplace_aten_op, outplace_op): + inplaceable_foreach_ops[outplace_aten_op] = aten_op + inplace_foreach_ops.add(aten_op) + + def fn(*args, **kwargs): + results = outplace_op(*args, **kwargs) + mut_results = [] + for arg, result in zip(args[0], results): + mut_results.append(mutate_to(arg, result, unsafe_alias=True)) + + return mut_results + + _register_foreach_lowering(aten_op, fn) + + +register_foreach_inplace( + aten._foreach_add_.List, aten._foreach_add.List, foreach_add_list +) +register_foreach_inplace( + aten._foreach_add_.Scalar, aten._foreach_add.Scalar, foreach_add_scalar +) +register_foreach_inplace( + aten._foreach_mul_.List, aten._foreach_mul.List, foreach_mul_list +) +register_foreach_inplace( + aten._foreach_mul_.Scalar, aten._foreach_mul.Scalar, foreach_mul_scalar +) +register_foreach_inplace( + aten._foreach_div_.List, aten._foreach_div.List, foreach_div_list +) +register_foreach_inplace( + aten._foreach_div_.Scalar, aten._foreach_div.Scalar, foreach_div_scalar +) + + +def register_inplace(aten_op, outplace_op): + @register_lowering(aten_op, type_promotion_kind=None) + def fn(*args, **kwargs): + result = outplace_op(*args, **kwargs) + result = to_dtype(result, args[0].get_dtype()) + return mutate_to(args[0], result) + + return fn + + +register_inplace(aten.add_, add) +register_inplace(aten.bitwise_and_, bitwise_and) +register_inplace(aten.bitwise_left_shift_, bitwise_left_shift) +register_inplace(aten.bitwise_not_, bitwise_not) +register_inplace(aten.bitwise_or_, bitwise_or) +register_inplace(aten.bitwise_right_shift_, bitwise_right_shift) +register_inplace(aten.bitwise_xor_, bitwise_xor) +register_inplace(aten.mul_, mul) +register_inplace(aten.div_.Tensor, div) +register_inplace(aten.div_.Tensor_mode, div_mode) +register_inplace(aten.logical_and_, logical_and) +register_inplace(aten.logical_not_, logical_not) +register_inplace(aten.logical_or_, logical_or) +register_inplace(aten.logical_xor_, logical_xor) +register_inplace(aten.sub_, sub) +register_inplace(aten.relu_, relu) +register_inplace(aten.sigmoid_, sigmoid) + + +register_lowering(aten.__and__)(bitwise_and) +register_lowering(aten.__lshift__)(bitwise_left_shift) +register_lowering(aten.__or__)(bitwise_or) +register_lowering(aten.__rshift__)(bitwise_right_shift) +register_lowering(aten.__xor__)(bitwise_xor) + +register_inplace(aten.__iand__, aten.__and__) +register_inplace(aten.__ilshift__, aten.__lshift__) +register_inplace(aten.__ior__, aten.__or__) +register_inplace(aten.__irshift__, 
aten.__rshift__) +register_inplace(aten.__ixor__, aten.__xor__) + + +@register_lowering(aten.sym_constrain_range) +def sym_constrain_range(a, min=None, max=None): + tracing_context = torch._guards.TracingContext.try_get() + assert ( + tracing_context is None or a in tracing_context.fake_mode.shape_env.var_to_range + ) + return a + + +@register_lowering(aten.sym_size.int) +def sym_size(a, dim): + val = V.graph.current_node.meta["val"] + # Note [Can val be an int?] + # ~~~~~~~~~~~~~~~~~~~~~~~~~ + # In principle, someone could construct an FX graph where + # a call to size/stride has a val that is a plain int (not + # SymInt). However, we will maintain the invariant that + # this is not possible: if you are constructing an FX graph + # where there is a call to size/stride that returns an + # int, but you KNOW that int must always be a constant, + # then you do not need trace that call at all (and just + # constant propagate the integer as is.) + assert isinstance(val, torch.SymInt) + return val.node.expr + + +@register_lowering(aten.sym_stride.int) +def sym_stride(a, dim): + val = V.graph.current_node.meta["val"] + # See Note [Can val be an int?] + assert isinstance(val, torch.SymInt) + return val.node.expr + + +@register_lowering(aten.sym_numel) +def sym_numel(a): + return a.get_numel() + + +for method, func in magic_methods.items(): + register_lowering(method_to_operator(method))(func) + + +@register_lowering(aten._foobar) +def foobar(self, *args, **kwargs): + raise NotImplementedError("Helpful for debugging") + + +@register_lowering(torch.ops._inductor_test.realize) +def _realize(x): + x.realize() + return clone(x) + + +@register_lowering(torch.ops.inductor.resize_storage_bytes_) +def resize_storage_bytes_(variable, new_size): + variable.realize() + ir.ResizeStorageBytes(variable, new_size) + return variable + + +from torch._higher_order_ops.auto_functionalize import auto_functionalized + +make_fallback(auto_functionalized) + + +@register_lowering(triton_kernel_wrapper_mutation) +def triton_kernel_wrap_(*, kernel_idx, grid, kwargs): + ir.UserDefinedTritonKernel(kernel_idx=kernel_idx, grid=grid, kernel_args=kwargs) + return {key: val for key, val in kwargs.items() if isinstance(val, TensorBox)} + + +@register_lowering(triton_kernel_wrapper_functional) +def triton_kernel_wrap(*, kernel_idx, grid, kwargs, tensors_to_clone): + new_kwargs = {} + for name, value in kwargs.items(): + if isinstance(value, ir.TensorBox): + x = value.data + has_non_rv_views = False + while isinstance(x, ir.BaseView): + if not isinstance(x, ir.ReinterpretView): + has_non_rv_views = True + break + x = x.data + if has_non_rv_views: + # we realize the inputs wrapped into any view which is not + # ReinterpretView to convert them into ReinterpretView during + # realization; all views being ReinterpretView is assumed by + # the downstream code (e.g., preserving ReinterpretView in + # cloning; layout should be available in mutation marking) + value = ir.TensorBox(ir.ExternKernel.realize_input(value)) + if name in tensors_to_clone: + value = clone_preserve_reinterpret_view(value) + new_kwargs[name] = value + + return triton_kernel_wrap_(kernel_idx=kernel_idx, grid=grid, kwargs=new_kwargs) + + +@register_lowering(torch.ops.higher_order.cond) +def cond(pred, true_fn, false_fn, operands): + if is_triton(pred) or any(map(is_triton, operands)): + msg = "control flow operator: torch.cond." 
+ if stack_trace := V.graph.current_node.meta.get("stack_trace", None): + msg = f"{msg} Found from : \n {stack_trace}" + V.graph.disable_cudagraphs_reason = msg + + result = ir.Conditional.create(pred, true_fn, false_fn, operands) + return list(map(TensorBox.create, result)) + + +try: + import torch.distributed._functional_collectives + + c10d_functional = torch.ops.c10d_functional + + @register_lowering(c10d_functional.wait_tensor) + def wait(input): + return TensorBox.create(ir.Wait.create(input)) + + @register_lowering(c10d_functional.broadcast) + def broadcast(input, src, tag, ranks, group_size): + return ir.Broadcast.create(input, src, tag, ranks, group_size) + + @register_lowering(c10d_functional.all_reduce) + def allreduce(input, reduce_op, tag, ranks, group_size): + return ir.AllReduce.create(input, reduce_op, tag, ranks, group_size) + + @register_lowering(c10d_functional.all_gather_into_tensor) + def all_gather_into_tensor(shard, tag, ranks, group_size): + return TensorBox.create( + ir.AllGatherIntoTensor.create( + ir.ExternKernel.require_contiguous(shard), tag, ranks, group_size + ) + ) + + @register_lowering(c10d_functional.reduce_scatter_tensor) + def reduce_scatter_tensor(input, reduce_op, tag, ranks, group_size): + return TensorBox.create( + ir.ReduceScatterTensor.create(input, reduce_op, tag, ranks, group_size) + ) + + @register_lowering(c10d_functional.all_reduce_coalesced) + def all_reduce_coalesced(input, reduce_op, tag, ranks, group_size): + return ir.AllReduceCoalesced.create(input, reduce_op, tag, ranks, group_size) + + @register_lowering(c10d_functional.all_gather_into_tensor_coalesced) + def all_gather_into_tensor_coalesced(self, tag, ranks, group_size): + result = ir.AllGatherIntoTensorCoalesced.create(self, tag, ranks, group_size) + return list(map(TensorBox.create, result)) + + @register_lowering(c10d_functional.reduce_scatter_tensor_coalesced) + def reduce_scatter_tensor_coalesced(self, reduceOp, tag, ranks, group_size): + result = ir.ReduceScatterTensorCoalesced.create( + self, reduceOp, tag, ranks, group_size + ) + return list(map(TensorBox.create, result)) + + @register_lowering(c10d_functional.all_to_all_single) + def all_to_all_single( + self, output_split_sizes, input_split_sizes, tag, ranks, group_size + ): + return TensorBox.create( + ir.AllToAllSingle.create( + self, output_split_sizes, input_split_sizes, tag, ranks, group_size + ) + ) + + _c10d_functional = torch.ops._c10d_functional + + @register_lowering(_c10d_functional.all_reduce) + def _all_reduce(inp, reduce_op, group_name): + inp = clone(inp) + ir._CollectiveKernel.create_inplace( + _c10d_functional.all_reduce_.default, inp, reduce_op, group_name + ) + return inp + + @register_lowering(_c10d_functional.all_reduce_) + def _all_reduce_(inp, reduce_op, group_name): + ir._CollectiveKernel.create_inplace( + _c10d_functional.all_reduce_.default, inp, reduce_op, group_name + ) + return inp + + @register_lowering(_c10d_functional.all_reduce_coalesced) + def _all_reduce_coalesced(inputs, reduce_op, group_name): + inputs = [clone(inp) for inp in inputs] + ir._CollectiveKernel.create_inplace( + _c10d_functional.all_reduce_coalesced_.default, + inputs, + reduce_op, + group_name, + ) + return inputs + + @register_lowering(_c10d_functional.all_reduce_coalesced_) + def _all_reduce_coalesced_(inputs, reduce_op, group_name): + ir._CollectiveKernel.create_inplace( + _c10d_functional.all_reduce_coalesced_.default, + inputs, + reduce_op, + group_name, + ) + return inputs + + 
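# --- Editor's note: illustrative sketch only, not part of the vendored file above.
# The _c10d_functional lowerings follow one pattern: the out-of-place variant
# clones its input and then reuses the in-place collective kernel on the
# clone, so only the in-place path needs a kernel implementation. The toy
# functions below (toy_all_reduce, toy_all_reduce_) are hypothetical stand-ins
# that mirror that flow, not the real ir._CollectiveKernel API.
import copy

def toy_all_reduce_(buf, op="sum"):
    # stand-in for ir._CollectiveKernel.create_inplace: mutates buf in place
    if op == "sum":
        buf[:] = [sum(buf)] * len(buf)
    return buf

def toy_all_reduce(buf, op="sum"):
    out = copy.copy(buf)      # the clone(inp) step in the out-of-place lowering
    toy_all_reduce_(out, op)  # reuse the in-place "kernel" on the clone
    return out

data = [1, 2, 3]
assert toy_all_reduce(data) == [6, 6, 6]
assert data == [1, 2, 3]  # the original buffer is left untouched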
@register_lowering(_c10d_functional.all_gather_into_tensor) + def _all_gather_into_tensor(inp, group_size, group_name): + return ir.TensorBox.create( + ir._CollectiveKernel.create_out_of_place( + _c10d_functional.all_gather_into_tensor.default, + inp, + group_size, + group_name, + ) + ) + + @register_lowering(_c10d_functional.all_gather_into_tensor_coalesced) + def _all_gather_into_tensor_coalesced(inputs, group_size, group_name): + return pytree.tree_map( + ir.TensorBox.create, + ir._CollectiveKernel.create_out_of_place( + _c10d_functional.all_gather_into_tensor_coalesced.default, + inputs, + group_size, + group_name, + ), + ) + + @register_lowering(_c10d_functional.reduce_scatter_tensor) + def _reduce_scatter_tensor(inp, reduce_op, group_size, group_name): + return ir.TensorBox.create( + ir._CollectiveKernel.create_out_of_place( + _c10d_functional.reduce_scatter_tensor.default, + inp, + reduce_op, + group_size, + group_name, + ) + ) + + @register_lowering(_c10d_functional.reduce_scatter_tensor_coalesced) + def _reduce_scatter_tensor_coalesced(inputs, reduce_op, group_size, group_name): + return pytree.tree_map( + ir.TensorBox.create, + ir._CollectiveKernel.create_out_of_place( + _c10d_functional.reduce_scatter_tensor_coalesced.default, + inputs, + reduce_op, + group_size, + group_name, + ), + ) + + @register_lowering(_c10d_functional.all_to_all_single) + def _all_to_all_single(inp, output_split_sizes, input_split_sizes, group_name): + return ir.TensorBox.create( + ir._CollectiveKernel.create_out_of_place( + _c10d_functional.all_to_all_single.default, + inp, + output_split_sizes, + input_split_sizes, + group_name, + ) + ) + + @register_lowering(_c10d_functional.broadcast) + def _broadcast(inp, src, group_name): + inp = clone(inp) + ir._CollectiveKernel.create_inplace( + _c10d_functional.broadcast_.default, inp, src, group_name + ) + return inp + + @register_lowering(_c10d_functional.broadcast_) + def _broadcast_(inp, src, group_name): + ir._CollectiveKernel.create_inplace( + _c10d_functional.broadcast_.default, inp, src, group_name + ) + return inp + + @register_lowering(_c10d_functional.wait_tensor) + def _wait_tensor(inp): + ir._WaitKernel.create_wait(_c10d_functional.wait_tensor.default, inp) + return inp + +except ImportError: + log.info( + "Inductor support for distributed collectives depends on building torch.distributed" + ) + +# populate lowerings defined in kernel/* +from . import kernel + +import_submodule(kernel) + +from . 
import quantized_lowerings + +quantized_lowerings.register_quantized_ops() diff --git a/MLPY/Lib/site-packages/torch/_inductor/metrics.py b/MLPY/Lib/site-packages/torch/_inductor/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..915b602d6f3b197cc297b7c1814855a023dfd3b4 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/metrics.py @@ -0,0 +1,419 @@ +from __future__ import annotations + +import csv +import inspect +import os +import re +from dataclasses import dataclass +from functools import lru_cache + +from typing import Dict, List, Set, Tuple, TYPE_CHECKING, Union + +from torch._inductor import config +from torch._inductor.utils import get_benchmark_name + +# Prevent circular import +if TYPE_CHECKING: + from torch._inductor.scheduler import ( + BaseSchedulerNode, + ExternKernelSchedulerNode, + NopKernelSchedulerNode, + SchedulerNode, + ) + +# counter for tracking how many kernels have been generated +generated_kernel_count = 0 +generated_cpp_vec_kernel_count = 0 +num_bytes_accessed = 0 +nodes_num_elem: List[ + Tuple[ + Union[NopKernelSchedulerNode, SchedulerNode, ExternKernelSchedulerNode], + int, + ] +] = [] +node_runtimes: List[Tuple[BaseSchedulerNode, float]] = [] + +# counters for tracking fusions +ir_nodes_pre_fusion = 0 + +# counters for tracking to_dtype inserted +cpp_to_dtype_count = 0 + +# counters for tracking cpp_wrapper disabled +disable_cpp_wrapper = 0 + + +# reset all counters +def reset(): + global generated_kernel_count + global generated_cpp_vec_kernel_count + global num_bytes_accessed, nodes_num_elem + global ir_nodes_pre_fusion + global cpp_to_dtype_count + global disable_cpp_wrapper + + generated_kernel_count = 0 + generated_cpp_vec_kernel_count = 0 + num_bytes_accessed = 0 + nodes_num_elem.clear() + node_runtimes.clear() + ir_nodes_pre_fusion = 0 + cpp_to_dtype_count = 0 + disable_cpp_wrapper = 0 + + +@dataclass +class CachedMetricsDeltas: + """ + The subset of metrics we want update across cache hits, e.g., the + FxGraphCache. + """ + + generated_kernel_count: int + generated_cpp_vec_kernel_count: int + ir_nodes_pre_fusion: int + cpp_to_dtype_count: int + + +class CachedMetricsHelper: + """ + A helper class to help calculate and apply counter deltas for those + metrics we want to save with cache entries (e.g., FxGraphCache) and + apply on a cache hit. 
+ """ + + def __init__(self): + global generated_kernel_count + global generated_cpp_vec_kernel_count + global ir_nodes_pre_fusion + global cpp_to_dtype_count + + self.generated_kernel_count = generated_kernel_count + self.generated_cpp_vec_kernel_count = generated_cpp_vec_kernel_count + self.ir_nodes_pre_fusion = ir_nodes_pre_fusion + self.cpp_to_dtype_count = cpp_to_dtype_count + + def get_deltas(self) -> CachedMetricsDeltas: + global generated_kernel_count + global generated_cpp_vec_kernel_count + global ir_nodes_pre_fusion + global cpp_to_dtype_count + + return CachedMetricsDeltas( + generated_kernel_count - self.generated_kernel_count, + generated_cpp_vec_kernel_count - self.generated_cpp_vec_kernel_count, + ir_nodes_pre_fusion - self.ir_nodes_pre_fusion, + cpp_to_dtype_count - self.cpp_to_dtype_count, + ) + + @staticmethod + def apply_deltas(delta: CachedMetricsDeltas): + global generated_kernel_count + global generated_cpp_vec_kernel_count + global ir_nodes_pre_fusion + global cpp_to_dtype_count + + generated_kernel_count += delta.generated_kernel_count + generated_cpp_vec_kernel_count += delta.generated_cpp_vec_kernel_count + ir_nodes_pre_fusion += delta.ir_nodes_pre_fusion + cpp_to_dtype_count += delta.cpp_to_dtype_count + + +REGISTERED_METRIC_TABLES: Dict[str, MetricTable] = {} + + +@dataclass +class MetricTable: + table_name: str + column_names: List[str] + + num_rows_added: int = 0 + + def add_row(self, row_fn): + if self.table_name not in enabled_metric_tables(): + return + + row_dict = row_fn() + assert len(self.column_names) == len( + row_dict + ), f"{len(self.column_names)} v.s. {len(row_dict)}" + assert set(self.column_names) == set( + row_dict.keys() + ), f"{set(self.column_names)} v.s. {set(row_dict.keys())}" + + row = [ + get_benchmark_name(), + ] + row += [row_dict[column_name] for column_name in self.column_names] + self._write_row(row) + + def output_filename(self): + return f"metric_table_{self.table_name}.csv" + + def write_header(self): + filename = self.output_filename() + with open(filename, "w") as fd: + writer = csv.writer(fd, lineterminator="\n") + writer.writerow(["model_name"] + self.column_names) + + def _write_row(self, row): + filename = self.output_filename() + if self.num_rows_added == 0 and not os.path.exists(filename): + self.write_header() + + self.num_rows_added += 1 + + for idx, orig_val in enumerate(row): + if isinstance(orig_val, float): + new_val = f"{orig_val:.6f}" + elif orig_val is None: + new_val = "" + else: + new_val = orig_val + row[idx] = new_val + + with open(filename, "a") as fd: + writer = csv.writer(fd, lineterminator="\n") + writer.writerow(row) + + @staticmethod + def register_table(name, column_names): + table = MetricTable(name, column_names) + REGISTERED_METRIC_TABLES[name] = table + + +MetricTable.register_table( + "slow_fusion", + [ + "kernel1_path", + "kernel1_latency", + "kernel2_path", + "kernel2_latency", + "fused_kernel_path", + "fused_kernel_latency", + "slow_down_ratio", + ], +) + +# track the fusion statistics for each graph +MetricTable.register_table( + "graph_stats", + [ + "graph_id", + "num_nodes_before_fusion", + "num_nodes_after_fusion", + ], +) + +# track the perf difference between persistent reduction and non-persistent +# reductions +MetricTable.register_table( + "persistent_red_perf", + [ + "kernel1_name", + "kernel2_name", + "kernel1_latency", + "kernel2_latency", + "size_hints", + "reduction_hint", + "speedup", + ], +) + +# Log metadata for pointwise/reduction kernels. 
E.g., model name, kernel path, numel, rnumel, reduction hint +MetricTable.register_table( + "kernel_metadata", + [ + "kernel_name", + "kernel_path", + "kernel_category", # pointwise/reduction/foreach etc. + "size_hints", + "reduction_hint", + "line_of_code", + "num_load", + "num_store", + "num_for_loop", + "num_atomic_add", + "num_args", + # xyz numel can be different to size_hints since size_hints are rounded + # up to the nearest power of 2. + # Inductor kernel will burn in the xyz numel in kernel code for static + # shape kernels. + # Logging them will be helpful to find unaligned shape for reduction + "xnumel", + "ynumel", + "rnumel", + "kernel_args_num_gb", + ], +) + + +def _parse_kernel_fn_code(kernel_module_code): + """ + The kernel_module_code is the python module that contains kernel function code. + kernel function is the proper triton kernel function annotated with + @triton.jit + """ + from .codecache import PyCodeCache + from .wrapper_benchmark import get_triton_kernel + + mod = PyCodeCache.load(kernel_module_code) + kernel = get_triton_kernel(mod) + # kernel is a CachingAutotune; kernel.fn is the JITFunction; + # kernel.fn.fn is the function being decorate by triton.jit + return inspect.getsource(kernel.fn.fn) + + +def _parse_kernel_line_of_code(proper_kernel_fn_code): + """ + Return the line of code for the kernel excluding the decorators. + """ + return len(proper_kernel_fn_code.splitlines()) + + +def _parse_size_hints(kernel_module_code, kernel_category): + if kernel_category == "foreach": + # foreach kernel does not have size_hints + return None + m = re.search(r"size_hints=(\[[0-9, ]*\]),", kernel_module_code) + assert m, "size_hints missing!" + return m.group(1) + + +def _parse_reduction_hint(kernel_category, kernel_module_code): + if kernel_category not in ("reduction", "persistent_reduction"): + return None + m = re.search(r"reduction_hint=ReductionHint\.(\w*),", kernel_module_code) + assert m, "reduction_hint not found in kernel source code!" + return m.group(1) + + +def _count_pattern(proper_kernel_fn_code, pattern): + return proper_kernel_fn_code.count(pattern) + + +def _count_args(proper_kernel_fn_code): + def_line = proper_kernel_fn_code.splitlines()[0] + assert def_line.startswith("def ") + start_idx = def_line.index("(") + end_idx = def_line.index("):") + decl_csv = def_line[start_idx + 1 : end_idx] + comps = decl_csv.split(",") + return len(comps) + + +def _parse_proper_kernel_fn_code(kernel_fn_code): + """ + Skip decorators. + """ + start_pos = kernel_fn_code.index("def ") + return kernel_fn_code[start_pos:] + + +def _parse_numel(proper_kernel_fn_code, numel_arg_name): + m = re.search(f"{numel_arg_name} = ([\\d]+)", proper_kernel_fn_code) + if m: + return int(m.group(1)) + else: + return None + + +def _parse_kernel_args_num_gb(kernel_fn_code, kernel_category): + """ + inductor meta looks like: + inductor_meta={... 'mutated_arg_names': [], 'no_x_dim': False, 'kernel_num_gb': 2.0}, + """ + m = re.search(r".kernel_num_gb.:\s*([0-9.]+)", kernel_fn_code) + if m: + return float(m.group(1)) + else: + """ + There are a few cases that kernel_num_gdb field can be missing: + 1. the field will be missing if config.benchmark_kernel and + config.profile_bandwidth are false + 2. even if config.benchmark_kernel or config.profile_bandwidth is true. + foreach kernel does not have kernel_num_gb field in the metadata + """ + return None + + +def log_kernel_metadata(kernel_name, kernel_path, kernel_module_code): + """ + An utility to log kernel metadata. 
We may parse metadata from kernel source code here. + + It's fine to parse the generated kernel code here since the logging is + disabled by default. It would hurt compilation time. + """ + from .wrapper_benchmark import get_kernel_category_by_source_code + + kernel_category = get_kernel_category_by_source_code(kernel_module_code) + reduction_hint = _parse_reduction_hint(kernel_category, kernel_module_code) + size_hints = _parse_size_hints(kernel_module_code, kernel_category) + kernel_fn_code = _parse_kernel_fn_code(kernel_module_code) + + proper_kernel_fn_code = _parse_proper_kernel_fn_code(kernel_fn_code) + + # the line of code excluding the decortors + kernel_line_of_code = _parse_kernel_line_of_code(proper_kernel_fn_code) + + get_metric_table("kernel_metadata").add_row( + lambda: { + "kernel_name": kernel_name, + "kernel_path": kernel_path, + "kernel_category": kernel_category, + "size_hints": size_hints, + "reduction_hint": reduction_hint, + "line_of_code": kernel_line_of_code, + "num_load": _count_pattern(proper_kernel_fn_code, "tl.load"), + "num_store": _count_pattern(proper_kernel_fn_code, "tl.store"), + "num_for_loop": _count_pattern(proper_kernel_fn_code, "for "), + "num_atomic_add": _count_pattern(proper_kernel_fn_code, "tl.atomic_add"), + "num_args": _count_args(proper_kernel_fn_code), + "xnumel": _parse_numel(proper_kernel_fn_code, "xnumel"), + "ynumel": _parse_numel(proper_kernel_fn_code, "ynumel"), + "rnumel": _parse_numel(proper_kernel_fn_code, "rnumel"), + "kernel_args_num_gb": _parse_kernel_args_num_gb( + kernel_fn_code, kernel_category + ), + } + ) + + +def purge_old_log_files(): + """ + Purge the old log file at the beginning when the benchmark script runs. + Should do it in the parent process rather than the child processes running + each individual model. 
+ """ + for name, table in REGISTERED_METRIC_TABLES.items(): + if name in enabled_metric_tables(): + filename = table.output_filename() + if os.path.exists(filename): + os.unlink(filename) + + table.write_header() + + +@lru_cache +def enabled_metric_tables() -> Set[str]: + config_str = config.enabled_metric_tables + + enabled = set() + for name in config_str.split(","): + name = name.strip() + if not name: + continue + assert ( + name in REGISTERED_METRIC_TABLES + ), f"Metric table name {name} is not registered" + enabled.add(name) + return enabled + + +def is_metric_table_enabled(name): + return name in enabled_metric_tables() + + +def get_metric_table(name): + assert name in REGISTERED_METRIC_TABLES, f"Metric table {name} is not defined" + return REGISTERED_METRIC_TABLES[name] diff --git a/MLPY/Lib/site-packages/torch/_inductor/ops_handler.py b/MLPY/Lib/site-packages/torch/_inductor/ops_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..a20e9848a3244a752ddff2e8aa258115ec6ddcc6 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/ops_handler.py @@ -0,0 +1,655 @@ +import itertools +from typing import Any, Callable, Generic, Literal, Optional, Tuple, TypeVar, Union +from unittest.mock import patch + +import sympy +from typing_extensions import Protocol + +import torch +import torch.utils._pytree as pytree +from torch.fx.graph import inplace_methods, magic_methods +from .utils import IndentedBuffer, reduction_num_outputs, sympy_index_symbol, sympy_str + +T = TypeVar("T") +StoreMode = Optional[Literal["atomic_add"]] +ReductionType = Literal[ + "argmax", + "argmin", + "welford_reduce", + "welford_combine", + "any", + "max", + "min", + "prod", + "sum", + "xor_sum", +] + + +def _arg_str(a) -> str: + if isinstance(a, sympy.Expr): + return sympy_str(a) + return str(a) + + +# NB: This is not done as a parent class, because our ops handlers +# implementations make heavy use of __getattr__ magic, and pre-existing +# stubs for methods would interfere with this mechanism. +# +# TODO: A superclass that does desugaring for operations like +# reciprocal/square might be useful. +class OpsHandler(Protocol[T]): + """ + Protocol describing the set of valid operations on ``torch._inductor.virtualized.ops``, + as well as the contract for op handlers. The type T signifies the domain + of the abstract analysis AKA what all of the functions return / take as arguments + anywhere compute occurs. + + While these operators are typically dtype polymorphic (e.g., you can use mul + on both integers and floats), they do NOT do promotion and usually return the + same dtype as the input. You are expected to have handled type promotion + during ATen decompositions. Most operators correspond exactly to pointwise + operations as defined by torch, so when in doubt about semantics, check the + corresponding torch documentation. These are all scalar operations (so they + are defined to operate on a single element at a time.) + + For convenience, many operators take a src_dtype which indicates what the dtype + of the input argument is. Although in principle this can be derived by an + analysis, providing this for ops where it is useful helps avoid having to repeatedly + recompute dtype in code generation. + + Note that this often describes a class of static methods, for stateless + ops handlers. 
+ + Handlers are often defined using ``__getattr__`` metaprogramming, which means + that you cannot declare that a type implements a protocol by inheriting from + it (as the type stubs count as attribute declarations and impede the getattr + magic method from being called). Instead, define a function that casts an + argument of your type to the protocol, which is sufficient to induce mypy to + test that the protocol is implemented correctly. Search for ``_typecheck_`` + in this file to see some examples. If you see an obscure error where a + class doesn't implement a Protocol, but mypy doesn't say why, check to see + that ``__getattr__`` is typed correctly (typically, it is not possible to + type ``__getattr__`` without typing it as ``Callable[..., Any]``) + """ + + def constant(self, value: Union[bool, float, int], dtype: torch.dtype) -> T: + """Produces a scalar constant of type dtype.""" + ... + + def load_seed(self, name: str, offset: T): + """Computes inductor_prims.lookup_seed.""" + ... + + def rand(self, seed: T, offset: T) -> T: + """Computes inductor_prims.random with mode="rand". offset has dtype int32.""" + ... + + def randn(self, seed: T, offset: T) -> T: + """Computes inductor_prims.random with mode="randn". offset has dtype int32.""" + ... + + def randint64(self, seed: T, offset: T, low: T, high: T) -> T: + """Computes inductor_prims.randint. offset has dtype int32.""" + ... + + def masked(self, mask: T, body: Callable[[], T], other: T) -> T: + """ + Computes body, but only perform loads/stores if the boolean mask + evaluates to true. For example, you would use this if you needed to + perform an indirect load that may not be valid on some elements; + without masking, invalid accesses can cause IMAs. When mask is true, + the result is the result of body; otherwise it is other. + + Contrast this with ops.where, which can multiplex between two values + that have been unconditionally computed. + """ + ... + + def where(self, condition: T, input: T, other: T) -> T: + """ + Computes torch.where: when condition is true, return input; otherwise return other. + """ + ... + + def index_expr(self, expr: sympy.Expr, dtype: torch.dtype) -> T: + """ + Converts a sympy expression into a scalar of type dtype. expr is typically + an indexing expression, thus the name; however, it can also be used in + non-indexing situations. + """ + ... + + def to_dtype( + self, x: T, dtype: torch.dtype, src_dtype: Optional[torch.dtype] = None + ) -> T: + """ + Convert x to dtype. src_dtype can be optionally set to specify what the original + dtype of x was, which can improve code generation (used by torch to(dtype=dtype)). + """ + ... + + def to_dtype_bitcast(self, x: T, dtype: torch.dtype, src_dtype: torch.dtype) -> T: + """ + Reinterpret cast x to dtype (reinterpreting the bits in memory as another dtype.) + src_dtype must be the original type of x. + """ + ... + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # These operations are only available in a "kernel" context. Check + # torch._inductor.codegen.common.CSEProxy for their typical implementation + # in op handler (routing to their respective implementations in the kernel + # handler) + # + # Importantly, inside a kernel, indexing and mask variables are available + # in scope, which are typically used by sympy.Expr indexing. + + def indirect_indexing( + self, x: T, size: sympy.Expr, check: bool = True + ) -> sympy.Expr: + """ + Convert an integral x into a sympy.Expr that can be subsequently used in + indexing computation. 
'size' represents an upper bound on the what valid + indexes can be; when 'check' is True, we check that the x is in bounds. + + NB: This is typically mandatory to implement for any analysis, because you + MUST return a valid sympy.Expr of some sort (even if it's a meaningless symbol). + """ + ... + + def load(self, name: str, index: sympy.Expr) -> T: + """ + Load from the memory location 'name', offset by some indexing expression 'index'. + """ + ... + + def store( + self, + name: str, + index: sympy.Expr, + value: T, + mode: StoreMode = None, + ) -> None: + """ + Store 'value' to the memory location 'name' offset by 'expr'. If + specified, 'mode' can require the store to be an atomic addition. + """ + ... + + # TODO: Better explain how the "collective" semantics of these ops; + # remember that the input value is a scalar, you can't reduce on it in the + # traditional sense! + def reduction( + self, + dtype: torch.dtype, + src_dtype: torch.dtype, + reduction_type: ReductionType, + value: T, + ) -> Union[T, Tuple[T, ...]]: + """ + Perform a 'reduction_type' reduction on 'value' of dtype 'src_dtype', + using 'dtype' as the accumulation dtype for the reduction. The result + is an intermediate computation which should be stored to the final + location using 'ops.store_reduction'. + + Valid reduction types are . For Welford reduction types, this + function returns multiple outputs; consult reduction_num_outputs to + determine the amount in metaprogramming applications. + """ + ... + + # TODO: in practice, this seems to actually return None, but not returning + # a T makes common __getattr__ idioms not type correctly. Figure out if + # this should be returning something. + def store_reduction(self, name: str, index: sympy.Expr, value: T) -> T: + """ + Store the fully accumulated result of 'reduction' to the memory + location 'name' offset by 'expr'. + """ + ... + + def scan( + self, dtype: torch.dtype, combine_fn: Callable[[T, T], T], value: T, init: int + ) -> T: + """ + Perform an associative scan on 'value'. + """ + # TODO: Improve the description with some pseudocode + ... + + def bucketize( + self, + values: T, + offsets_name: str, + offsets_size: sympy.Expr, + indexing_dtype: torch.dtype, + right: bool, + ) -> T: + # See [Note: Inductor bucketize op] + ... + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # The following ops have semantics that correspond exactly to the torch + # operation with the same corresponding name. + + def abs(self, x0: T) -> T: + ... + + def exp(self, x0: T) -> T: + ... + + def exp2(self, x0: T) -> T: + ... + + def expm1(self, x0: T) -> T: + ... + + def sqrt(self, x0: T) -> T: + ... + + def relu(self, x0: T) -> T: + ... + + def minimum(self, x0: T, x1: T) -> T: + ... + + def maximum(self, x0: T, x1: T) -> T: + ... + + def cos(self, x0: T) -> T: + ... + + def sin(self, x0: T) -> T: + ... + + def lgamma(self, x0: T) -> T: + ... + + def erf(self, x0: T) -> T: + ... + + def cosh(self, x0: T) -> T: + ... + + def sinh(self, x0: T) -> T: + ... + + def acos(self, x0: T) -> T: + ... + + def acosh(self, x0: T) -> T: + ... + + def asin(self, x0: T) -> T: + ... + + def asinh(self, x0: T) -> T: + ... + + def atan2(self, x0: T, x1: T) -> T: + ... + + def atan(self, x0: T) -> T: + ... + + def atanh(self, x0: T) -> T: + ... + + def copysign(self, x0: T, x1: T) -> T: + ... + + def erfc(self, x0: T) -> T: + ... + + def erfinv(self, x0: T) -> T: + ... + + def frexp(self, x0: T): + ... + + def hypot(self, x0: T, x1: T) -> T: + ... 
+ + def log10(self, x0: T) -> T: + ... + + def nextafter(self, x0: T, x1: T) -> T: + ... + + def logical_and(self, x0: T, x1: T) -> T: + ... + + def logical_not(self, x0: T) -> T: + ... + + def logical_or(self, x0: T, x1: T) -> T: + ... + + def logical_xor(self, x0: T, x1: T) -> T: + ... + + def bitwise_and(self, x0: T, x1: T) -> T: + ... + + def bitwise_not(self, x0: T) -> T: + ... + + def bitwise_or(self, x0: T, x1: T) -> T: + ... + + def bitwise_xor(self, x0: T, x1: T) -> T: + ... + + def bitwise_left_shift(self, x0: T, x1: T) -> T: + ... + + def bitwise_right_shift(self, x0: T, x1: T) -> T: + ... + + def rsqrt(self, x0: T) -> T: + ... + + def log1p(self, x0: T) -> T: + ... + + def tan(self, x0: T) -> T: + ... + + def tanh(self, x0: T) -> T: + ... + + def sigmoid(self, x0: T) -> T: + ... + + def signbit(self, x0: T) -> T: + ... + + def fmod(self, x0: T, x1: T) -> T: + ... + + def log(self, x0: T) -> T: + ... + + def isinf(self, x0: T) -> T: + ... + + def isnan(self, x0: T) -> T: + ... + + def round(self, x0: T) -> T: + ... + + def floor(self, x0: T) -> T: + ... + + def sign(self, x0: T) -> T: + ... + + def to_int(self, x0: T) -> T: + ... + + def trunc(self, x0: T) -> T: + ... + + def truncdiv(self, x0: T, x1: T) -> T: + ... + + def ceil(self, x0: T) -> T: + ... + + def neg(self, x0: T) -> T: + ... + + def reciprocal(self, x0: T) -> T: + ... + + def eq(self, x0: T, x1: T) -> T: + ... + + def ne(self, x0: T, x1: T) -> T: + ... + + def lt(self, x0: T, x1: T) -> T: + ... + + def gt(self, x0: T, x1: T) -> T: + ... + + def le(self, x0: T, x1: T) -> T: + ... + + def ge(self, x0: T, x1: T) -> T: + ... + + def add(self, x0: T, x1: T) -> T: + ... + + def sub(self, x0: T, x1: T) -> T: + ... + + def mul(self, x0: T, x1: T) -> T: + ... + + def floordiv(self, x0: T, x1: T) -> T: + ... + + def truediv(self, x0: T, x1: T) -> T: + ... + + def div(self, x0: T, x1: T) -> T: + ... + + def mod(self, x0: T, x1: T) -> T: + ... + + def pow(self, x0: T, x1: T) -> T: + ... + + def and_(self, x0: T, x1: T) -> T: + ... + + def or_(self, x0: T, x1: T) -> T: + ... + + def xor(self, x0: T, x1: T) -> T: + ... + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # In CUDA, optimized implementations of other mathematical operations are + # offered separately via libdevice for double precision computation (in + # Triton, these go to tl.math rather than tl). We lower to these + # operators when doing FP64 on CUDA. Note that some operators + # unconditional go to tl.math. + # + # TODO(ezyang): Is this really the best way to do this? What if we have + # abs internally route to tl.math automatically when given a double + # precision input? One reason is that when doing codegen, we often don't + # know what the dtype of the inputs are! (In principle we do know, but + # for many analyses it's not conveniently available.) + + def libdevice_abs(self, x0: T) -> T: + ... + + def libdevice_exp(self, x0: T) -> T: + ... + + def libdevice_sqrt(self, x0: T) -> T: + ... + + def libdevice_cos(self, x0: T) -> T: + ... + + def libdevice_sin(self, x0: T) -> T: + ... + + def libdevice_sigmoid(self, x0: T) -> T: + ... + + def libdevice_log(self, x0: T) -> T: + ... 
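# --- Editor's note: illustrative sketch only, not part of the vendored file above.
# As the OpsHandler docstring explains, concrete handlers are usually built with
# __getattr__ metaprogramming rather than by subclassing the Protocol, and a small
# "_typecheck_" cast function is what lets mypy verify protocol conformance.
# RecordingHandler below is a hypothetical minimal handler in that style
# (MockHandler, which follows, is the real in-tree analogue).
class RecordingHandler:
    def __init__(self):
        self.calls = []

    def __getattr__(self, name):
        # Every op (add, mul, load, ...) resolves to this generic handler,
        # which records the call and returns a fresh symbolic name.
        def inner(*args, **kwargs):
            rendered = ", ".join(map(str, args))
            self.calls.append(f"{name}({rendered})")
            return f"tmp{len(self.calls) - 1}"

        return inner


def _typecheck_RecordingHandler(h: "RecordingHandler") -> "OpsHandler[str]":
    # The cast alone is enough for mypy to check that the protocol is satisfied.
    return h


h = RecordingHandler()
result = h.add(h.constant(1.0, None), h.constant(2.0, None))  # dtype ignored by this toy
assert h.calls == ["constant(1.0, None)", "constant(2.0, None)", "add(tmp0, tmp1)"]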
+ + +class MockHandler: + def __getattr__(self, name): + if name == "name": + return "MockHandler" + + def inner(*args, **kwargs): + fargs = [_arg_str(a) for a in args] + fargs.extend(f"{k}={v}" for k, v in kwargs.items()) + return f"ops.{name}({', '.join(fargs)})" + + return inner + + @staticmethod + def masked(mask, body, other) -> str: + return f"ops.masked({mask}, {body()}, {other})" + + @staticmethod + def frexp(x): + return (f"ops.frexp({x})[0]", f"ops.frexp({x})[1]") + + @staticmethod + def indirect_indexing(index_var, size, check=True) -> sympy.Symbol: + return sympy_index_symbol(f"({str(index_var)})") + + @classmethod + def _init_cls(cls): + def make_handler(format_string): + @staticmethod # type: ignore[misc] + def inner(*args): + return format_string.format(*args) + + return inner + + for name, format_string in itertools.chain( + magic_methods.items(), inplace_methods.items() + ): + setattr(cls, name, make_handler(format_string)) + + +MockHandler._init_cls() + + +# Use mypy to check protocol implemented correctly +def _typecheck_MockHandler(h: MockHandler) -> OpsHandler[str]: + return h + + +class KernelFormatterHandler: + def __init__(self, parent_handler): + self.parent_handler = parent_handler + self.output = IndentedBuffer(1) + self.var_counter = itertools.count() + + @staticmethod + def ir_to_string(ir_fn, index, rindex=None) -> str: + from .ir import FlexibleLayout + from .virtualized import V + + args = [index, rindex] if rindex is not None else [index] + names = ["index", "rindex"] if rindex is not None else ["index"] + formatter = KernelFormatterHandler(MockHandler()) + + with formatter.output.indent(-1): + formatter.output.writeline(f"def inner_fn({', '.join(names)}):") + for name, arg in zip(names, args): + if arg: + lhs = ", ".join( + [ + str("_" if isinstance(v, (int, sympy.Integer)) else v) + for v in arg + ] + ) + formatter.output.writeline(f"{lhs} = {name}") + + with V.set_ops_handler(formatter), patch.object( + FlexibleLayout, "allow_indexing", True + ): + result = ir_fn(*args) + return formatter.getvalue(result) + + def __getattr__(self, name) -> Callable[..., Any]: + def inner(*args, **kwargs): + line = getattr(self.parent_handler, name)(*args, **kwargs) + if name == "indirect_indexing": + return line + + def write(line): + # replace line with a new variable name + varname = f"tmp{next(self.var_counter)}" + self.output.writeline(f"{varname} = {line}") + return varname + + return pytree.tree_map(write, line) + + return inner + + def reduction( + self, + dtype: torch.dtype, + src_dtype: torch.dtype, + reduction_type: ReductionType, + value: Union[str, Tuple[str, ...]], + ) -> Union[str, Tuple[str, ...]]: + line = self.parent_handler.reduction(dtype, src_dtype, reduction_type, value) + num_values = reduction_num_outputs(reduction_type) + varnames = [f"tmp{next(self.var_counter)}" for _ in range(num_values)] + self.output.writeline(f"{','.join(varnames)} = {line}") + return tuple(varnames) if num_values > 1 else varnames[0] + + def getvalue(self, result): + self.output.writeline(f"return {result}") + return self.output.getvalue() + + +# Use mypy to check protocol implemented correctly +def _typecheck_KernelFormatterHandler(h: KernelFormatterHandler) -> OpsHandler[str]: + return h + + +class WrapperHandler(Generic[T]): + def __init__(self, inner: OpsHandler[T]): + self._inner = inner + + def __getattr__(self, item): + return getattr(self._inner, item) + + +# Use mypy to check protocol implemented correctly +def _typecheck_WrapperHandler(h: WrapperHandler[T]) -> 
OpsHandler[T]: + return h + + +class OpCounterCSE: + """Shim to count how many ops are used""" + + def __init__(self, inner): + super().__init__() + self.parent_handler = inner + self.op_count = 0 + self.var_names = {} + + def __getattr__(self, name): + def inner(*args, **kwargs): + val = getattr(self.parent_handler, name)(*args, **kwargs) + if name == "indirect_indexing": + return val + + def count(val): + if val not in self.var_names: + varname = f"tmp{self.op_count}" + self.op_count += 1 + self.var_names[val] = varname + return varname + else: + return self.var_names[val] + + return pytree.tree_map(count, val) + + return inner + + +def _typecheck_OpCounterCSE(h: OpCounterCSE) -> OpsHandler[str]: + return h diff --git a/MLPY/Lib/site-packages/torch/_inductor/optimize_indexing.py b/MLPY/Lib/site-packages/torch/_inductor/optimize_indexing.py new file mode 100644 index 0000000000000000000000000000000000000000..b2438f68d16e42610a02062f67bcbfdf47564917 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/optimize_indexing.py @@ -0,0 +1,118 @@ +import math + +import sympy + +import torch +from torch.utils._sympy.value_ranges import ValueRanges +from .ir import LoopBody +from .utils import dominated_nodes + + +def val_expressable_in_32_bits(val): + if getattr(val, "is_Boolean", False): + return True + + if isinstance(val, sympy.Expr): + assert val.is_number + if val.is_Integer or val.is_Boolean: + val = int(val) + else: + val = float(val) + + # bound within mantissa + if isinstance(val, float): + return val <= (2**24) and val >= -(2**24) + + if isinstance(val, int): + iinfo = torch.iinfo(torch.int32) + return val <= iinfo.max and val >= iinfo.min + + raise Exception(f"Unexpected value {val}") + + +def range_expressable_in_32_bits(range): + return val_expressable_in_32_bits(range.lower) and val_expressable_in_32_bits( + range.upper + ) + + +def try_to_reduce_precision(node, bounds, indirect_vars, indices, replacement_vals): + # if a downstream use of a node explicitly converts to int32, or float16/float32/float64, + # then it's precision is set for that chain of uses, and we don't need to consider those + # dominated values + def skip_filter(node): + return node.target == "to_dtype" and node.args[2] in ( + torch.int32, + torch.float32, + torch.float64, + ) + + # TODO - there are dominated uses whose dtype does not depend on whether + # we reduce the precision here, e.g. add(int64, int64) one of the args can be reduced to + # int32 without changing the output precision of the node. this case hasn't shown up + for dominated in dominated_nodes([node], skip_filter): + if dominated.target in ["store", "output"]: + continue + + if isinstance(dominated.target, str) and "set_indirect" in dominated.target: + idx = int(dominated.target[len("set_indirect") :]) + indirect_var = indirect_vars[idx] + + # We check that we can compute all the indices it's involved in with int32 + for index, expr in indices.items(): + if indirect_var in expr.free_symbols: + index_val = replacement_vals[index] + + if math.isinf(index_val.lower) or math.isinf(index_val.upper): + return + + # all indices are integers, so make sure that we + # use the bounds of integers instead of floats. + # TODO - not sure if we should be doing int/float casts while tracing, + # might interfere with sympy. 
+ + index_val_int = ValueRanges[sympy.Expr]( + int(index_val.lower), int(index_val.upper) + ) + if not range_expressable_in_32_bits(index_val_int): + return + + if not range_expressable_in_32_bits(bounds[dominated]): + return + + args = list(node.args) + args[2] = torch.int32 + node.args = tuple(args) + + +def indexing_dtype_strength_reduction(loop_body: LoopBody): + """ + Performs Value Range Analysis on LoopBody's fx graph to reduce precision of + intermediaries from int64 to int32 + """ + bv = loop_body.bounds() + + int64_dtype_nodes = [ + node + for node in loop_body.get_nodes() + if ( + node.target == "to_dtype" + and node.args[2] == torch.int64 + and node not in bv.unbounded_vars + ) + ] + if not int64_dtype_nodes: + return + + bounds = bv.get_bounds() + + # TODO - if dominated node of one to_dtype is not expressible in int32, + # we should short circuit another to_dtype node if that node also dominates + for node in int64_dtype_nodes: + try_to_reduce_precision( + node, + bounds, + loop_body.indirect_vars, + loop_body.indexing_exprs, + bv.replacement_vals, + ) diff --git a/MLPY/Lib/site-packages/torch/_inductor/pattern_matcher.py b/MLPY/Lib/site-packages/torch/_inductor/pattern_matcher.py new file mode 100644 index 0000000000000000000000000000000000000000..016cf680973ba93cf07ff5f1638f1d816edd8bb7 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/pattern_matcher.py @@ -0,0 +1,1524 @@ +from __future__ import annotations + +import dataclasses +import functools +import inspect +import itertools +import logging +import operator +import os +import re +from collections import defaultdict +from typing import ( + Any, + Callable, + DefaultDict, + Dict, + Iterable, + List, + NoReturn, + Optional, + Set, + Union, +) + +from typing_extensions import TypeGuard + +import torch +import torch._guards +import torch.fx +import torch.utils._pytree as pytree +from torch._dispatch.python import enable_python_dispatcher +from torch._dynamo.utils import counters +from torch._prims_common import is_integer_dtype +from torch.fx import Node +from torch.fx.experimental.proxy_tensor import make_fx, maybe_disable_fake_tensor_mode +from torch.fx.experimental.symbolic_shapes import guard_size_oblivious +from torch.fx.immutable_collections import immutable_dict, immutable_list + +from .._functorch import config as functorch_config +from .._functorch.aot_autograd import aot_function, make_boxed_func +from .._functorch.partitioners import default_partition +from .._subclasses import FakeTensorMode +from ..fx import Transformer +from . import config +from .decomposition import select_decomp_table +from .lowering import fallback_node_due_to_unsupported_type + +log = logging.getLogger(__name__) +aten = torch.ops.aten +prims = torch.ops.prims + +Constant = Any +NodeOrConstant = Union[Constant, torch.fx.Node] + + +class Multiple: + pass + + +# Sentinel indicating multiple quantities can be matched +MULTIPLE = Multiple() + + +class Match: + """ + Represents a successfully matched pattern. 
+ """ + + def __init__(self, pattern: PatternExpr, args=None, kwargs=None): + super().__init__() + self.pattern = pattern + # The input nodes that must be passed in to the result + self.args = args or [] + self.kwargs = kwargs or {} + # The nodes matched in this expression + self.nodes: List[torch.fx.Node] = [] + # Mapping CallFunction to the node.target + self.targets: Dict[_TargetExpr, torch.fx.node.Target] = {} + self.ctx: Optional[MatchContext] = None + self.replacement_graph: Optional[torch.fx.Graph] = None + + @property + def graph(self) -> torch.fx.Graph: + assert self.ctx + return self.ctx.graph + + def extend(self, other: Match): + if self.kwargs: + for key in set(self.kwargs.keys()) & set(other.kwargs.keys()): + if self.kwargs[key] != other.kwargs[key]: + raise FailedMatch("kwarg mismatch: {}", key) + self.args.extend(other.args) + self.nodes.extend(other.nodes) + self.kwargs.update(other.kwargs) + self.targets.update(other.targets) + + def bundle(self) -> Match: + # Wrap args in an extra list + self.args = [tuple(self.args)] if self.args else [] + return self + + def __repr__(self): + return f"Match(..., {self.args}, {self.kwargs})" + + def erase_nodes(self, graph: torch.fx.Graph): + for n in reversed(self.nodes): + if not n._erased: + graph.erase_node(n) + + def output_nodes(self) -> List[Optional[torch.fx.Node]]: + assert self.ctx + return [ + (self.ctx.pattern_to_node[p] if p is not None else None) + for p in self.ctx.outputs + ] + + def output_node(self) -> torch.fx.Node: + return next(p for p in self.output_nodes() if p) + + def replace_with_graph(self, replacement_graph, args): + assert self.ctx + ReplacementPatternEntry.replace_with_graph( + self, self.ctx.graph, replacement_graph, args + ) + + def replace_by_example(self, replacement_fn, args, trace_fn=None, run_dce=True): + assert self.ctx + if trace_fn is None: + trace_fn = functools.partial(fwd_only, run_dce=run_dce) + replacement = trace_fn( + replacement_fn, torch.fx.map_arg(args, lambda arg: arg.meta["val"]) + ) + ReplacementPatternEntry.replace_with_graph( + self, + self.ctx.graph, + replacement, + args, + ) + + +class FailedMatch(RuntimeError): + def __init__(self, format_string, *args, **kwargs): + self.format_string = format_string + # We want to construct error messages lazily instead of eagerly, as + # constructing them eagerly can significantly worsen compile times. + if len(format_string) > 200: + raise RuntimeError( + f"Format string too long - use lazy construction of strings instead. Format string is\n {format_string}" + ) + self.args = args + self.kwargs = kwargs + + def __str__(self): + return self.format_string.format(*self.args, **self.kwargs) + + def __bool__(self): + return False + + +def is_match(m: Union[Match, FailedMatch]) -> TypeGuard[Match]: + """ + TypeGuards cannot act on `self`. Thus this function exists to let mypy + recognize FailedMatch.__bool__ as a TypeGuard. + """ + return bool(m) + + +class MatchContext: + """ + State needed while running PatternExpr._match(). 
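+
+    ``pattern_to_node`` memoizes sub-pattern matches, so a pattern object that
+    is reused in several places must bind to the same FX node each time.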
+ """ + + def __init__( + self, + outputs: List[Optional[PatternExpr]], + pattern_to_node: Optional[Dict[PatternExpr, Node]] = None, + *, + graph: torch.fx.Graph, + ): + self.outputs = outputs + self.pattern_to_node = {} if pattern_to_node is None else pattern_to_node + self.graph = graph + self.exclusive_node_set: List[NodeOrConstant] = [] + + def match(self, pattern, node): + """wrapper to check reused nodes in patterns""" + if pattern in self.pattern_to_node: + if self.pattern_to_node[pattern] == node: + return Match(pattern) # already checked this node + else: + return FailedMatch("repeated pattern differs") + m = pattern._match(node, self) + assert pattern not in self.pattern_to_node + self.pattern_to_node[pattern] = node if m else None + m.ctx = self + return m + + def filter_multi_user_patterns(self): + return { + pattern: node + for pattern, node in self.pattern_to_node.items() + if pattern.has_multiple_users() and node is not None + } + + +class PatternExpr: + """ + Base class for types of patterns + """ + + def _match( + self, node: torch.fx.Node, ctx: MatchContext + ) -> Union[Match, FailedMatch]: + raise NotImplementedError() + + def match(self, node: torch.fx.Node) -> Union[Match, FailedMatch]: + try: + return MatchContext([self], graph=node.graph).match(self, node) + except FailedMatch as e: + return e + + def has_multiple_users(self) -> bool: + return False + + def __repr__(self): + return self.__class__.__name__ + "()" + + def find_anchor_nodes(self, ctx: MatchContext, searched): + if self in ctx.pattern_to_node: + yield ctx.pattern_to_node[self] + + +class Arg(PatternExpr): + """ + Capture an arg which will become an input to the handler. Args are + passed in depth first order. + """ + + def _match(self, node: NodeOrConstant, ctx: MatchContext): + return Match(self, args=[node]) # matches anything + + +class Ignored(PatternExpr): + """ + Match an arg, but don't pass it to handler + """ + + def _match(self, node: NodeOrConstant, ctx: MatchContext): + return Match(self) # matches anything + + def __repr__(self): + return "*" + + def pretty_print(self, pp: PatternPrettyPrinter): + return "Ignored()" + + +class KeywordArg(PatternExpr): + """ + Capture a kwarg which will become an input to the handler. + """ + + def __init__(self, name: str): + super().__init__() + self.name = name + + def __repr__(self): + return f"KeywordArg({self.name!r})" + + def _match(self, node: NodeOrConstant, ctx: MatchContext): + return Match(self, kwargs={self.name: node}) # matches anything + + +class ExclusiveKeywordArg(PatternExpr): + """ + Capture a kwarg which will become an input to the handler. 
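+    Unlike KeywordArg, a given node may only be captured once per match
+    (tracked via ctx.exclusive_node_set).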
+ """ + + def __init__(self, name): + super().__init__() + self.name = name + + def __repr__(self): + return f"ExclusiveKeywordArg({self.name!r})" + + def _match(self, node: NodeOrConstant, ctx: MatchContext): + if node in ctx.exclusive_node_set: + return FailedMatch("exclusive arg appears twice") + + ctx.exclusive_node_set.append(node) + return Match(self, kwargs={self.name: node}) # matches anything + + +class _TargetExpr(PatternExpr): + """ + Base class for filtering match by node.target + """ + + op: Optional[str] = None + + def __init__(self, fns, users=1): + if not self.op: + raise NotImplementedError("Shouldn't directly use _BaseNodeMatch") + super().__init__() + fns = [fns] if callable(fns) or isinstance(fns, str) else list(fns) + for fn in list(fns): + if isinstance(fn, torch._ops.OpOverloadPacket): + fns.extend([getattr(fn, overload) for overload in fn.overloads()]) + + self.fns: List[Union[Callable[..., Any], str]] = fns + self.fns_set: Set[Union[Callable[..., Any], str]] = set(fns) + self.users: Union[int, Multiple] = users + + def fns_repr(self) -> str: + first_repr = self.fns[0] + if not isinstance(first_repr, str): + first_repr = first_repr.__name__ + + if len(self.fns) > 1: + return f"[{first_repr}, ...]" + elif self.fns[0] is getattr(torch, first_repr, None): + return f"torch.{first_repr}" + elif isinstance(self.fns[0], torch._ops.OpOverload): + return str(self.fns[0]) + else: + return first_repr + + def __repr__(self): + return f"{self.__class__.__name__}({self.fns_repr()})" + + def has_multiple_users(self) -> bool: + return isinstance(self.users, Multiple) or self.users > 1 + + def find_anchor_nodes(self, ctx: MatchContext, searched): + raise NotImplementedError() + + def _match_fns(self, node: torch.fx.Node): + return ( + isinstance(node, torch.fx.Node) + and node.op == self.op + and extract_target(node) in self.fns_set + ) + + def _match_users(self, node: torch.fx.Node, ctx: MatchContext): + return ( + self in ctx.outputs + or self.users is MULTIPLE + or len(node.users) == self.users + ) + + +class _TargetArgsExpr(_TargetExpr): + """ + Base class for filtering match by node.{target,args,kwargs} + """ + + def __init__(self, fns, *args, _users=1, **kwargs): + super().__init__(fns, _users) + self.args = tuple(args) + self.kwargs = dict(kwargs) + if any( + isinstance(x, (dict, list, tuple)) + for x in itertools.chain(args, kwargs.values()) + ): + self.flatten = self.pytree_flatten + else: + self.flatten = self.simple_flatten + self.flat_args_kwargs = self.flatten(self.args, self.kwargs) + + @staticmethod + def simple_flatten(args, kwargs: Dict[Any, Any]): + return (*args, *kwargs.values()), (len(args), *kwargs.keys()) + + @staticmethod + def pytree_flatten(args, kwargs: Dict[Any, Any]): + def norm_spec(s: pytree.TreeSpec): + if s.type is None: + return s + mapping = {immutable_list: list, tuple: list, immutable_dict: dict} + return pytree.TreeSpec( + mapping.get(s.type, s.type), + s.context, + list(map(norm_spec, s.children_specs)), + ) + + flat, spec = pytree.tree_flatten([args, kwargs]) + spec = norm_spec(spec) + return flat, spec + + def __repr__(self): + args = [ + self.fns_repr(), + *map(repr, self.args), + *[f"{k}={v}" for k, v in self.kwargs.items()], + ] + return f"{self.__class__.__name__}({', '.join(args)})" + + def pretty_print(self, pp: PatternPrettyPrinter): + args = [ + self.fns_repr(), + *(pp.pretty_print(x) for x in self.args), + *[f"{k}={pp.pretty_print(v)}" for k, v in self.kwargs.items()], + ] + if isinstance(self.users, Multiple): + 
args.append("_users=MULTIPLE") + elif self.users > 1: + args.append(f"_users={self.users}") + + joiner_str = ", " + return f"{self.__class__.__name__}({joiner_str.join(args)})" + + def _match(self, node: torch.fx.Node, ctx: MatchContext): + if not self._match_fns(node) or len(node.args) != len(self.args): + return FailedMatch("function_mismatch: node={}, pattern={}", node, self) + + if not self._match_users(node, ctx): + return FailedMatch("multiple_users {}", self) + + _args = node.args + _kwargs = node.kwargs + if len(_kwargs) < len(self.kwargs): + from torch.fx.operator_schemas import normalize_function + + normalized_args_and_kwargs = normalize_function( + node.target, node.args, node.kwargs + ) + + if normalized_args_and_kwargs is None: + return FailedMatch("function_mismatch: node={}, pattern={}", node, self) + else: + _args, _kwargs = normalized_args_and_kwargs + if len(_args) == len(self.args) and len(_kwargs) >= len(self.kwargs): + _kwargs = {i: _kwargs[i] for i in _kwargs if i in self.kwargs} + else: + return FailedMatch( + "function_mismatch: node={}, pattern={}", node, self + ) + else: + _kwargs = {i: _kwargs[i] for i in _kwargs if i in self.kwargs} + + node_items, node_spec = self.flatten(_args, _kwargs) + self_items, self_spec = self.flat_args_kwargs + if node_spec != self_spec: + return FailedMatch("args_structure {} {}", node_spec, self_spec) + assert len(node_items) == len(self_items) + + m = Match(self) + for i, pattern, child_node in zip(itertools.count(), self_items, node_items): + if isinstance(pattern, PatternExpr): + child_match = ctx.match(pattern, child_node) + if not child_match: + return child_match + m.extend(child_match) + elif isinstance(child_node, torch.fx.Node) or child_node != pattern: + return FailedMatch( + "constant_args: {} {!r}!={pattern!r}", node, child_node + ) + m.nodes.append(node) + m.targets[self] = node.target + return m + + def find_anchor_nodes(self, ctx: MatchContext, searched): + """ + This is used when we are matching a pattern with multiple outputs. + There is a partial match (stored in ctx) and we want to walk + this pattern to find a connection to an already-matched node. + + Yields candidate nodes that `self._match` might like. 
+ """ + if self in ctx.pattern_to_node: + yield ctx.pattern_to_node[self] + return + + for pattern in self.flat_args_kwargs[0]: + if isinstance(pattern, PatternExpr): + for other_node in pattern.find_anchor_nodes(ctx, searched): + if not isinstance(other_node, torch.fx.Node): + continue + for node in other_node.users: + if node not in searched: + if self._match_fns(node): + yield node + searched.add(node) + + +class CallFunction(_TargetArgsExpr): + """ + Matches a call_function node in the FX graphs: `fns[i](*args, **kwargs)` + """ + + op = "call_function" + + +class CallMethod(_TargetArgsExpr): + """ + Matches a call_method node in the FX graphs: `fns[i].method(*args, **kwargs)` + """ + + op = "call_method" + + +class CallModule(_TargetArgsExpr): + """ + Matches a call_module node in the FX graphs: `module(*args, **kwargs)` + """ + + op = "call_module" + + +class _TargetExprVarArgs(_TargetExpr): + """ + Matches a call_function node with any arguments which are passed into the pattern + """ + + def _match(self, node: torch.fx.Node, ctx: MatchContext): + if not self._match_fns(node): + return FailedMatch("function_mismatch") + + if not self._match_users(node, ctx): + return FailedMatch("multiple_users") + + m = Match(self) + m.nodes.append(node) + m.targets[self] = node.target + m.args.extend(node.args) + m.kwargs.update(node.kwargs) + return m + + +class CallFunctionVarArgs(_TargetExprVarArgs): + op = "call_function" + + +class CallMethodVarArgs(_TargetExprVarArgs): + op = "call_method" + + +class CallModuleVarArgs(_TargetExprVarArgs): + op = "call_module" + + +class ListOf(PatternExpr): + """ + Matches a repeated pattern + """ + + def __init__(self, pattern: PatternExpr, partial=False): + super().__init__() + assert isinstance(pattern, PatternExpr) + self.pattern = pattern + self.partial = partial + + def __repr__(self): + return f"{self.__class__.__name__}({self.pattern})" + + def _match(self, node: List[torch.fx.Node], ctx: MatchContext): # type: ignore[override] + if not isinstance(node, (list, tuple)) or len(node) == 0: + return FailedMatch("non_list") + m = Match(self) + # Propagating patterns with multiple users will ensure we don't revisit + # the same nodes + pattern_to_node = ctx.filter_multi_user_patterns() + matched = False + for i, child_node in enumerate(node): + child_ctx = MatchContext( + ctx.outputs, pattern_to_node, graph=child_node.graph + ) + child_match = child_ctx.match(self.pattern, child_node) + pattern_to_node = child_ctx.filter_multi_user_patterns() + if not child_match: + if not self.partial: + return FailedMatch("list[{}]: {}", i, child_match) + continue + matched = True + m.extend(child_match.bundle()) + if not matched: + return FailedMatch("list: no_match") + return m.bundle() + + +class MultiOutputPattern(PatternExpr): + def __init__(self, outputs): + super().__init__() + assert all(isinstance(x, (PatternExpr, type(None))) for x in outputs), outputs + self.outputs: List[Optional[PatternExpr]] = outputs + + @property + def fns(self): + assert self.outputs[0] and hasattr(self.outputs[0], "fns") + return self.outputs[0].fns + + def __repr__(self): + return f"{self.__class__.__name__}({self.outputs})" + + def pretty_print(self, pp: PatternPrettyPrinter): + args = [pp.pretty_print(x) for x in self.outputs] + joiner_str = f",\n{' '}" + str_out = f"{self.__class__.__name__}([{joiner_str.join(args)}" + str_out = f"{str_out}\n])" + return str_out + + def _match(self, node: torch.fx.Node, ctx: MatchContext): + m = ctx.match(self.outputs[0], node) + if not m: + return m 
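+        # The remaining outputs are located by walking users of nodes already
+        # bound in ctx (see _match_from_anchors below) rather than by scanning
+        # the whole graph again.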
+ + for pattern in self.outputs[1:]: + if pattern is None: + continue + child_match = self._match_from_anchors(pattern, ctx) + if not child_match: + return child_match + m.extend(child_match) + + return m + + def _match_from_anchors(self, pattern, ctx): + prior = dict(ctx.pattern_to_node) + m = FailedMatch("no anchor found") + for node in pattern.find_anchor_nodes(ctx, set()): + m = ctx.match(pattern, node) + if m: + return m + # revert any partial matches + ctx.pattern_to_node = dict(prior) + return m + + def match(self, node: torch.fx.Node) -> Union[Match, FailedMatch]: + try: + return MatchContext(self.outputs, graph=node.graph).match(self, node) + except FailedMatch as e: + return e + + +class RepeatedExpr(PatternExpr): + """ + Checks for a repeated pattern. Useful for repeated operations after a node such as `split` or `unbind` + """ + + def __init__(self, inner_pattern: PatternExpr): + super().__init__() + assert hasattr(inner_pattern, "fns") + self.inner_pattern = inner_pattern + + @property + def fns(self): + return self.inner_pattern.fns + + def _match(self, node: torch.fx.Node, ctx: MatchContext): + m = ctx.match(self.inner_pattern, node) + if not m: + return m + ctx.pattern_to_node.pop( + self.inner_pattern, + ) + # Check all anchor nodes match the pattern + for anchor_node in self.inner_pattern.find_anchor_nodes(ctx, set()): + anchor_m = MatchContext([self], graph=node.graph).match( + self.inner_pattern, anchor_node + ) + if not anchor_m: + return anchor_m + m.extend(anchor_m) + return m + + +class PatternPrettyPrinter: + """ + Serializes Patterns to executable python. + XXX: currently only used and tested for fuse attention patterns. May not cover + all patterns. + """ + + def __init__(self): + self.namespace = torch.fx.graph._Namespace() + self.memoized_objs_names: Dict[PatternExpr, str] = {} + self.memoized_objs_pp: Dict[PatternExpr, str] = {} + + @staticmethod + def run(obj: PatternExpr, output_name="output"): + """ + Serializes obj to python code with obj written out to `output_name` + """ + + pp = PatternPrettyPrinter() + assert hasattr(obj, "pretty_print") + out_str = obj.pretty_print(pp=pp) + + output = [] + for key in pp.memoized_objs_names: + output.append(f"{pp.memoized_objs_names[key]} = {pp.memoized_objs_pp[key]}") + + output.append(f"{output_name} = {out_str}") + + return "\n".join(output) + + def pretty_print(self, obj): + if isinstance(obj, _TargetArgsExpr): + if memoized_name := self.memoized_objs_names.get(obj): + return memoized_name + else: + return self.memoize(obj) + if hasattr(obj, "pretty_print"): + return obj.pretty_print(self) + + return repr(obj) + + def memoize(self, obj): + obj_str = obj.pretty_print(self) + obj_name = obj.fns_repr() + for prefix in ("aten.", "torch.", "prims."): + obj_name = obj_name.replace(prefix, "") + + tmp_name = self.namespace.create_name(obj_name, None) + self.memoized_objs_names[obj] = tmp_name + self.memoized_objs_pp[obj] = obj_str + return tmp_name + + +@dataclasses.dataclass +class PatternEntry: + pattern: PatternExpr + extra_check: Callable[[Match], bool] + + def apply(self, match: Match, graph: torch.fx.Graph, node: torch.fx.Node): + raise NotImplementedError() + + def register(self, pass_dicts, target=None, prepend=False): + if target is None: + assert hasattr(self.pattern, "fns") + for fn in self.pattern.fns: + self.register(pass_dicts, fn, prepend=prepend) + elif isinstance(pass_dicts, (dict, PatternMatcherPass)): + if prepend: + pass_dicts[target].insert(0, self) + else: + pass_dicts[target].append(self) + else: + 
for x in pass_dicts: + self.register(x, target, prepend=prepend) + + +@dataclasses.dataclass +class LoweringPatternEntry(PatternEntry): + handler: Callable[..., Any] + + def apply(self, match: Match, graph: torch.fx.Graph, node: torch.fx.Node): + handler = functools.wraps(self.handler)(functools.partial(self.handler, match)) + with graph.inserting_before(node): + replacement = graph.call_function(handler, tuple(match.args), match.kwargs) + replacement.meta.update(node.meta) + node.replace_all_uses_with(replacement) + assert match.nodes[-1] is node + match.erase_nodes(graph) + + +@dataclasses.dataclass +class GraphPatternEntry(PatternEntry): + """ + A pattern that runs a function on the FX graph + """ + + handler: Callable[..., Any] + + def apply(self, match: Match, graph: torch.fx.Graph, node: torch.fx.Node): + with graph.inserting_before(node): + self.handler(match, *match.args, **match.kwargs) + + +@dataclasses.dataclass +class ReplacementPatternEntry(PatternEntry): + normalize_args: Callable[..., List[Any]] + + @staticmethod + def replace_with_graph( + match: Match, + graph: torch.fx.Graph, + replacement_graph: torch.fx.Graph, + args: List[Any], + ): + output_nodes = match.output_nodes() + first_node = output_nodes[0] + + class Replacer(torch.fx.Interpreter): + call_method = None # type: ignore[assignment] + call_module = None # type: ignore[assignment] + get_attr = None # type: ignore[assignment] + + def run_node(self, node) -> Any: + if node.op in ("placeholder", "output"): + return super().run_node(node) + if node.op == "call_function": + target = node.target + args, kwargs = self.fetch_args_kwargs_from_env(node) + result = graph.call_function(target, args, kwargs) + if "val" in node.meta and "val" not in result.meta: + result.meta["val"] = node.meta["val"] + if isinstance(node.meta["val"], torch.Tensor): + assert "tensor_meta" in node.meta + result.meta["tensor_meta"] = node.meta["tensor_meta"] + return result + raise NotImplementedError(f"unhandled {node}") + + output_nodes = match.output_nodes() + + if len(output_nodes) == 1: + last_node = output_nodes[0] + else: + assert output_nodes[0] + nodes = list(output_nodes[0].graph.nodes) + indices = [ + (nodes.index(n), n) + for n in output_nodes + if isinstance(n, torch.fx.Node) + ] + last_node = min(indices, key=lambda tup: tup[0])[1] + + def percolate_tags(node, recompute_tag, input_stops): + queue = [node] + visited = set() + + while queue: + arg = queue.pop() + if ( + arg not in visited + and arg not in input_stops + and hasattr(arg, "meta") + ): + visited.add(arg) + arg.meta["recompute"] = recompute_tag + queue.extend(arg.all_input_nodes) + + with graph.inserting_before(last_node): + replacement = Replacer(replacement_graph).run(*args) + if isinstance(replacement, torch.fx.Node): + replacement = [replacement] + + def maybe_getitem(node): + if node.op != "call_function": + return None + if node.target != operator.getitem: + return None + assert len(node.args) == 2 + return node.args[1] + + def replace(old, new): + if old is None: + assert new is None + return + assert isinstance(old, torch.fx.Node) + if new is None: + old.replace_all_uses_with(None) + graph.erase_node(old) + return + if isinstance(new, torch.fx.Node): + if "val" not in new.meta: + new.meta.update(old.meta) + + # Preserve the recompute tags in the replacement graph. We + # look at the recompute tags of the original output node to + # propagate the tag from the output all the way to the input + # args (named as args in the replace_with_graph). 
+ # Note that this is best effort. Since patterns are from + # many to many, there is no easy way to correctly map the + # recomputable tags. It is possible in some scenarios that we + # incorrectly tag some nodes as recomputables. + if "recompute" in old.meta: + percolate_tags(new, old.meta["recompute"], args) + + old.replace_all_uses_with(new) + graph.erase_node(old) + return + + # `new` is not a node: it's a list of nodes. + # + # This happens when we want to replace a node that has a single + # packed return with multiple unpacked returns. We need to do + # some graph surgery here. + # + # Example: + # def original_graph(x): + # a = op(x) + # b = a[0] + # c = a[1] + # ... + # + # Assume that we want to replace op(x) with the graph + # def new_op(x): + # w = x + 1 + # z = x + 2 + # return (w, z) + # + # We need to replace `op` with the contents of `new_op`, + # and then rewrite a[0] to be w and a[1] to be z, as so: + # def new_graph(x): + # w = x + 1 + # z = x + 2 + # b = w + # c = z + # ... + old_uses = list(old.users.keys()) + for user in old_uses: + idx = maybe_getitem(user) + if idx is None: + raise AssertionError("can't handle") + replace(user, new[idx]) + graph.erase_node(old) + + if len(output_nodes) == len(replacement): + for old, new in zip(output_nodes, replacement): + replace(old, new) + else: + assert len(output_nodes) == 1 + replace(output_nodes[0], replacement) + + match.erase_nodes(graph) + + def apply(self, match: Match, graph: torch.fx.Graph, node: torch.fx.Node): + self.replace_with_graph( + match, + graph, + match.replacement_graph, # type: ignore[arg-type] + self.normalize_args(*match.args, **match.kwargs), + ) + + +def _return_true(match): + return True + + +def log_trace_failure(search_fn, e): + log.info( + "Replacement pattern %s failed to apply due to shape mismatch: %s", + search_fn.__name__, + e, + ) + + +def register_replacement( + search_fn, + replace_fn, + example_inputs: Iterable[Any], + trace_fn: Callable[[Callable[..., Any], Iterable[Any]], torch.fx.GraphModule], + pass_dicts, + extra_check=_return_true, + scalar_workaround=(), + exclusive_arg_names=(), + search_fn_pattern=None, +): + """ + Create a replacement rule based on example functions that get traced + to create patterns. This supports both training and inference when + run on a joint forward+backward graph. + + Args: + search_fn: traced to give original pattern + replace_fn: traced to give replacement graph + example_inputs: example inputs for initial trace + trace_fn: fwd_only or joint_fwd_bwd + pass_dict: dict of passes to register to + extra_check: additional check to run on match(using real shapes) + """ + argnames_static = [*inspect.signature(search_fn).parameters.keys()] + + def check_fn(match: Match): + """ + Often shapes get burned into the pattern, so our initial match ran with + `ignore_types=(int, ...)`. + + Recheck the match with the correct shapes. + """ + argnames = list(argnames_static) + for name in argnames: + if name not in match.kwargs: + raise RuntimeError( + f"Not all inputs to pattern found in match.kwargs. Perhaps one " + f"of the inputs is unused? 
argnames={argnames}, match.kwargs={match.kwargs}" + ) + + args = list( + torch.fx.map_arg( + [match.kwargs[name] for name in argnames], lambda n: n.meta["val"] + ) + ) + sym_args: List[torch.SymInt] = [] + with torch._dynamo.utils.detect_fake_mode(args): + for i, grad in enumerate(requires_grad): + if isinstance(args[i], torch.Tensor): + if grad and is_integer_dtype(args[i].dtype): + return False + + args[i] = torch.empty_strided( + args[i].size(), + args[i].stride(), + dtype=args[i].dtype, + device=args[i].device, + requires_grad=grad, + ) + for v in itertools.chain(args[i].shape, args[i].stride()): + if isinstance(v, torch.SymInt) and all( + guard_size_oblivious(v != a) for a in sym_args + ): + sym_args.append(v) + + if sym_args: + # AOT Autograd and make fx will dedupe symbolic shape size + # accesses of sym ints that appear as inputs + # We don't want the sym_size uses to interfere with pattern matching + # so we provide them as inputs. + # Later, when we actually do the replacement, the symbolic shape + # sizes will get re-traced and added to the graph. + + def search_fn_new(*args_new): + return search_fn(*args_new[len(args_new) - len(args) :]) + + try: + specific_graph = trace_fn(search_fn_new, sym_args + args) + except RuntimeError as e: + log_trace_failure(search_fn, e) + return False + + # correct argnames in the graph + sym_arg_names = [] + for i, placeholder in zip( + range(len(sym_args) + len(args)), + specific_graph.graph.nodes, + ): + if i < len(sym_args): + sym_arg_names.append(placeholder.target) + continue + + with specific_graph.graph.inserting_after(placeholder): + new_node = specific_graph.graph.placeholder( + argnames[i - len(sym_args)] + ) + new_node.target = new_node.name + placeholder.replace_all_uses_with(new_node) + specific_graph.graph.erase_node(placeholder) + + argnames = sym_arg_names + argnames + else: + try: + specific_graph = trace_fn(search_fn, args) + except RuntimeError as e: + log_trace_failure(search_fn, e) + return False + + specific_pattern = fx_to_pattern( + specific_graph, + argnames=argnames, + exclusive_arg_names=exclusive_arg_names, + scalar_workaround=scalar_workaround, + ) + specific_pattern_match = specific_pattern.match(match.output_nodes()[0]) # type: ignore[arg-type] + if specific_pattern_match and extra_check(specific_pattern_match): + # trace the pattern using the shapes from the user program + match.replacement_graph = trace_fn(replace_fn, args) # type: ignore[assignment] + return True + return False + + def normalize_args(**kwargs): + args = [] + for name in argnames_static: + args.append(kwargs.pop(name)) + for i in range(1, len(kwargs) + 1): + if f"tangents_{i}" not in kwargs: + break + args.append(kwargs.pop(f"tangents_{i}")) + assert not kwargs, f"leftover kwargs: {kwargs!r}" + return args + + if trace_fn is joint_fwd_bwd: + # If inference mode is enabled during compilation, assume that we don't + # want to match on any training graph patterns + if torch.is_inference_mode_enabled(): + return False + + # TODO: Revisit the functionalize_rng_ops for lowmem dropout + with functorch_config.patch(functionalize_rng_ops=False): + requires_grad: List[bool] = [ + isinstance(x, torch.Tensor) and x.requires_grad for x in example_inputs + ] + if search_fn_pattern is None: + pattern = gen_pattern( + search_fn, + example_inputs, + trace_fn, + scalar_workaround, + exclusive_arg_names, + ) + else: + pattern = search_fn_pattern + + pattern_repr = PatternPrettyPrinter.run(pattern) + assert pattern_repr not in _seen_patterns + 
_seen_patterns.add(pattern_repr)
+        pattern = ReplacementPatternEntry(
+            pattern=pattern,
+            extra_check=check_fn,
+            normalize_args=normalize_args,
+        )
+        pattern.register(pass_dicts)
+        return pattern.pattern
+
+
+@functorch_config.patch(functionalize_rng_ops=False)
+def gen_pattern(
+    search_fn, example_inputs, trace_fn, scalar_workaround=(), exclusive_arg_names=()
+) -> PatternExpr:
+    argnames = [*inspect.signature(search_fn).parameters.keys()]
+
+    if scalar_workaround == ():
+        scalar_workaround = {}
+    flat_inputs = []
+    input_idx = 0  # Positional arguments index
+
+    for argname in argnames:
+        if argname in scalar_workaround:
+            flat_inputs.append(scalar_workaround[argname])
+        else:
+            flat_inputs.append(example_inputs[input_idx])
+            input_idx += 1
+
+    search_gm = trace_fn(search_fn, flat_inputs)
+    return fx_to_pattern(
+        search_gm,
+        ignore_types=(int, float, list, torch.device, torch.dtype),
+        argnames=argnames,
+        scalar_workaround=scalar_workaround,
+        exclusive_arg_names=exclusive_arg_names,
+    )
+
+
+def register_lowering_pattern(
+    pattern: PatternExpr, extra_check=_return_true, *, pass_dict, prepend=False
+):
+    """
+    Register an aten to inductor IR replacement pattern. The decorated
+    function is saved and then called at lowering time, allowing direct
+    pattern-to-inductor-IR conversion.
+    """
+
+    def decorator(handler):
+        assert callable(handler)
+        LoweringPatternEntry(
+            pattern=pattern, extra_check=extra_check, handler=handler
+        ).register(pass_dict, prepend=prepend)
+        handler._inductor_lowering_function = True
+        return handler
+
+    return decorator
+
+
+def register_graph_pattern(
+    pattern: PatternExpr, extra_check=_return_true, *, pass_dict, prepend=False
+):
+    """
+    Register a pattern that runs a function on the FX graph, allowing
+    custom transformation code.
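+
+    Illustrative usage only (``my_patterns`` is a hypothetical
+    PatternMatcherPass, not something defined in this module):
+
+        @register_graph_pattern(
+            CallFunction(aten.add.Tensor, KeywordArg("x"), KeywordArg("y")),
+            pass_dict=my_patterns,
+        )
+        def handle_add(match, x, y):
+            ...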
+ """ + + def decorator(handler): + assert callable(handler) + GraphPatternEntry( + pattern=pattern, extra_check=extra_check, handler=handler + ).register(pass_dict, prepend=prepend) + return handler + + return decorator + + +def is_start_of_fx_graph(graph: torch.fx.Graph, node: torch.fx.Node) -> bool: + # first node in the graph + return node is next(iter(graph.nodes)) + + +# match: copy_, relu_, _set_grad_enabled, manual_seed, enter_functional_autocast, etc +_mutation_op_re = re.compile(r"_$|_[.]|(\b|_)(set|enter|exit|seed)(\b|_)") + + +def is_mutation_op(node: torch.fx.Node) -> bool: + if node.op == "call_function": + if _mutation_op_re.search(node.target.__name__): # type: ignore[union-attr] + return True + elif node.op == "call_method": + if _mutation_op_re.search(node.target): # type: ignore[union-attr, arg-type] + return True + return node.kwargs.get("out") is not None + + +def get_mutation_region_id(graph: torch.fx.Graph, node: torch.fx.Node) -> int: + n = node + while "mutation_region_id" not in n.meta and not is_start_of_fx_graph(graph, n): + n = n.prev + mutation_region_id = n.meta.get("mutation_region_id", 0) + while n is not node: + n = n.next + if is_mutation_op(n): + mutation_region_id += 1 + n.meta["mutation_region_id"] = mutation_region_id + return mutation_region_id + + +def should_compute_mutation_region_ids(graph: torch.fx.GraphModule) -> bool: + return "mutation_region_id" not in next(iter(graph.nodes)).meta + + +def compute_mutation_region_ids(graph: torch.fx.GraphModule): + mutation_region_id = 0 + for nd in graph.nodes: + if is_mutation_op(nd): + mutation_region_id += 1 + nd.meta["mutation_region_id"] = mutation_region_id + + +class PatternMatcherPass: + def __init__( + self, prevent_match_across_mutations=False, pass_name: Optional[str] = None + ): + super().__init__() + self.patterns: DefaultDict[ + torch.fx.node.Target, List[PatternEntry] + ] = defaultdict(list) + self.prevent_match_across_mutations = prevent_match_across_mutations + self.pass_name = pass_name + + def __getitem__(self, item: torch.fx.node.Target) -> List[PatternEntry]: + return self.patterns[item] + + def apply(self, graph: torch.fx.GraphModule) -> int: + if not self.patterns: + return 0 + if isinstance(graph, torch.fx.GraphModule): + graph = graph.graph + if self.prevent_match_across_mutations: + if should_compute_mutation_region_ids(graph): + compute_mutation_region_ids(graph) + get_mutation_region_id_partial = functools.partial( + get_mutation_region_id, graph + ) + count = 0 + for node in reversed(graph.nodes): + target = extract_target(node) + if ( + node.op in ["call_function", "call_method", "call_module"] + and target in self.patterns + ): + # conservatively not applying pattern for cpu input, + # since some of the patterns induce codegen and split nodes. 
+ # Note: we will only skip cpu compute if disable_cpp_codegen=True + if fallback_node_due_to_unsupported_type(node, allow_cpu_inputs=False): + continue + + for entry in self.patterns[target]: + if node._erased: + break + m = entry.pattern.match(node) + # pattern match crosses mutation barrier - discard + if ( + self.prevent_match_across_mutations + and is_match(m) + and len(set(map(get_mutation_region_id_partial, m.nodes))) != 1 # type: ignore[possibly-undefined] + ): + continue + if os.environ.get("TORCHINDUCTOR_PATTERN_MATCH_DEBUG") == node.name: + log.warning("%s%s %s %s", node, node.args, m, entry.pattern) + if is_match(m) and entry.extra_check(m): + count += 1 + entry.apply(m, graph, node) # type: ignore[arg-type] + counters["inductor"]["pattern_matcher_count"] += 1 + counters["inductor"]["pattern_matcher_nodes"] += len(m.nodes) + return count + + def clear(self): + self.patterns.clear() + + +def _not_implemented(*args, **kwargs) -> NoReturn: + raise NotImplementedError() + + +def fx_to_pattern( + gm, + ignore_types=(), + argnames=(), + scalar_workaround=(), + exclusive_arg_names=(), +) -> PatternExpr: + """ + Convert an FX graph into a PatternExpr. This is useful for simple + patterns that can only match single functions and fixed-length lists. + """ + # scalar_workaround is a hack to capture dropout_p + # see https://github.com/pytorch/pytorch/issues/97894 + scalar_workaround = scalar_workaround or {} + inv_scalar_workaround = {v: k for k, v in scalar_workaround.items()} + assert len(inv_scalar_workaround) == len(scalar_workaround) + + def process_arg(x): + if isinstance(x, (float, int)) and x in inv_scalar_workaround: + return KeywordArg(inv_scalar_workaround[x]) + if type(x) in ignore_types: + return Ignored() + if isinstance(x, list) and all(isinstance(y, Ignored) for y in x) and x: + return Ignored() + return x + + argnum = itertools.count() + + class Converter(torch.fx.Interpreter): + call_method = _not_implemented + call_module = _not_implemented + get_attr = _not_implemented + + def placeholder(self, target, args, kwargs): + n = next(argnum) + if n < len(argnames): + name = argnames[n] + elif argnames: + assert target.startswith("tangent") + name = target + else: + target = re.sub(r"_\d+$", "", target) # de-mangle arg name + name = target + if name in exclusive_arg_names: + return ExclusiveKeywordArg(name) + else: + return KeywordArg(name) + + def call_function(self, target, args, kwargs): + args, kwargs = pytree.tree_map(process_arg, (args, kwargs)) + if list in ignore_types: + # Handle a burned in tensor size which are now [Ignored(), Ignored(), ...] 
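+                # (tree_map above only rewrites leaves, so a list of Ignored()
+                # survives it; this second pass collapses such a list into a
+                # single Ignored())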
+ args = [process_arg(a) for a in args] + kwargs = {k: process_arg(a) for k, a in kwargs.items()} + return CallFunction(target, *args, **kwargs) + + def run_node(self, n): + rv = super().run_node(n) + if n.op == "output" and isinstance(rv, tuple): + assert len(rv) == len(n.args[0]) + for r, arg in zip(rv, n.args[0]): + r.users = len(arg.users) + else: + rv.users = len(n.users) + return rv + + pattern = Converter(gm).run() + if not isinstance(pattern, PatternExpr): + return MultiOutputPattern(pytree.tree_leaves(pattern)) + return pattern + + +@torch.no_grad() +def fwd_only(fn, args, *, run_dce=True) -> torch.fx.GraphModule: + """Build a normalized inference graph, for use with fx_to_pattern""" + # TODO - look into using aot autograd, asserting no mutating ops here + with enable_python_dispatcher(): + mode = ( + "real" if not torch._inductor.utils.any_is_symbolic(*args) else "symbolic" + ) + gm = make_fx(fn, select_decomp_table(), tracing_mode=mode)(*args) + if run_dce: + gm.graph.eliminate_dead_code() + gm.recompile() + return gm + + +@torch.enable_grad() +def joint_fwd_bwd(fn, args) -> torch.fx.GraphModule: + """Build a normalized training graph, for use with fx_to_pattern""" + gm: Optional[torch.fx.GraphModule] = None + + def record_joint_graph(joint_graph, inputs, **kwargs): + nonlocal gm + assert not gm + gm = clone_graph(joint_graph) + return default_partition(joint_graph, inputs, **kwargs) + + with torch._guards.tracing(None): + aot_function( + fn, + lambda g, i: make_boxed_func(g), + partition_fn=record_joint_graph, + decompositions=select_decomp_table(), + keep_inference_input_mutations=True, + enable_log=False, + )(*args) + assert gm + + from .fx_passes.joint_graph import pointless_view + + matcher_pass = PatternMatcherPass() + + pattern = CallFunction( + torch.ops.aten.view.default, KeywordArg("arg"), KeywordArg("size") + ) + GraphPatternEntry( + pattern=pattern, handler=pointless_view, extra_check=_return_true + ).register(matcher_pass.patterns) + matcher_pass.apply(gm.graph) # type: ignore[arg-type] + + # remove in/out specs + gm.graph._codegen = torch.fx.graph.CodeGen() + gm.graph.eliminate_dead_code() + gm.recompile() + return gm + + +def _args(n: torch.fx.Node) -> List[torch.fx.node.Argument]: + args: List[torch.fx.node.Argument] = list() + torch.fx.map_arg((n.args, n.kwargs), args.append) + return args + + +def stable_topological_sort(graph: torch.fx.Graph): + # Nodes are in exactly one of these three collections: + + # - Nodes in `pending` are waiting to be processed (in reverse order): + pending = list(reversed(graph.nodes)) + + # - Nodes in `ready` have been processed and are already in the correct + # order. + ready = set() + + # - `waiting` is a mapping from a dependency to nodes which depend on that + # dependency. + waiting = defaultdict(list) + + # The cursor indicates the last processed node so we can add new nodes + # after it. + cursor = None + while pending: + node = pending.pop() + waiting_for = [x for x in _args(node) if x not in ready] + if waiting_for: + # We have unprocessed input nodes. Might as well wait for the last + # arg so an already sorted list will only recheck this node once. + waiting[waiting_for[-1]].append(node) + else: + ready.add(node) + if cursor and cursor.next is not node: + cursor.append(node) + cursor = node + # Mark the nodes that have been waiting for this node to finish as + # ready to check again. 
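+            # (reversed() preserves their original relative order, since
+            # `pending` is consumed from the end)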
+ pending.extend(reversed(waiting.pop(node, ()))) + + assert not waiting and len(ready) == len(graph.nodes) + + +def init_once_fakemode(fn: Callable[..., Any]): + """Wrapper around lazy init functions in fx_passes/""" + + @functools.lru_cache(None) + @functools.wraps(fn) + def lazy_init(): + counters_ref = counters["inductor"].copy() + + with torch._guards.tracing( + None + ), maybe_disable_fake_tensor_mode(), FakeTensorMode(): + result = fn() + + # clear view matches encountered during tracing + counters["inductor"] = counters_ref + + return result + + return lazy_init + + +def config_flag(name): + """Function for extra_check to put pass behind a flag""" + + def flag_check(match): + return getattr(config, name) + + return flag_check + + +def clone_graph(input_graph: torch.fx.GraphModule) -> torch.fx.GraphModule: + class CopyGraph(Transformer): + def run_node(self, old_node): + new_node = super().run_node(old_node) + if isinstance(new_node, torch.fx.Proxy): + new_node.node.meta.update(old_node.meta) + new_node.node.name = self.new_graph._graph_namespace.create_name( + old_node.name, None + ) + return new_node + + return CopyGraph(input_graph).transform() + + +_seen_patterns: Set[str] = set() + + +def get_arg_value( + node: torch.fx.Node, arg_number: int, kwarg_name: Optional[str] = None +): + return ( + node.args[arg_number] + if len(node.args) > arg_number + else node.kwargs.get(kwarg_name) # type: ignore[arg-type] + ) + + +def filter_nodes(nodes: Iterable[torch.fx.Node], fn) -> List[torch.fx.Node]: + fns = [fn] + if isinstance(fn, torch._ops.OpOverloadPacket): + fns.extend([getattr(fn, overload) for overload in fn.overloads()]) + + return [node for node in nodes if node.target in fns] + + +def extract_target(node: Node): + """For call_function and call_method, we directly use the target function; + For call_module, the target is string, and we treat the module class + as a function. + """ + if node.op == "call_module": + return getattr(node.graph.owning_module, node.target).__class__ # type: ignore[arg-type] + return node.target diff --git a/MLPY/Lib/site-packages/torch/_inductor/quantized_lowerings.py b/MLPY/Lib/site-packages/torch/_inductor/quantized_lowerings.py new file mode 100644 index 0000000000000000000000000000000000000000..39d7c233d986c53aed900a66f32a10908cf2494e --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/quantized_lowerings.py @@ -0,0 +1,15 @@ +import torch + + +def register_quantized_ops(): + from . import lowering + + quantized = torch.ops.quantized + + lowering.add_needs_realized_inputs( + [ + quantized.max_pool2d, + ] + ) + + lowering.make_fallback(quantized.max_pool2d) diff --git a/MLPY/Lib/site-packages/torch/_inductor/scheduler.py b/MLPY/Lib/site-packages/torch/_inductor/scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..0484dad1fbf7e1ab2050069e083b21ea222bbcfb --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/scheduler.py @@ -0,0 +1,2445 @@ +import collections +import dataclasses +import functools +import itertools +import logging +import math +import operator +import os +import pprint +import textwrap +from typing import ( + Any, + Counter, + DefaultDict, + Dict, + Generic, + List, + Optional, + Sequence, + Set, + Tuple, + TypeVar, + Union, +) + +import sympy + +import torch +from torch._dynamo.utils import dynamo_timed +from torch._inductor.metrics import get_metric_table, is_metric_table_enabled +from torch.utils._triton import has_triton + +from . 
import comms, config, dependencies, ir, metrics +from .codegen.common import get_scheduling_for_device, Kernel +from .comm_analysis import estimate_nccl_collective_runtime +from .dependencies import Dep, MemoryDep, StarDep, WeakDep +from .ir import ComputedBuffer, MultiOutput, MultiOutputLayout +from .sizevars import SimplifyIndexing +from .utils import ( + cache_on_self, + cmp, + free_symbol_has, + get_device_tflops, + get_dtype_size, + get_gpu_dram_gbps, + green_text, + is_collective, + is_wait, + red_text, + sympy_product, +) +from .virtualized import V + + +log = logging.getLogger(__name__) +fusion_log = torch._logging.getArtifactLogger(__name__, "fusion") + + +class WhyNoFuse: + # TODO when we drop support for Python < 3.10, we can use + # @dataclass(slots=True) instead of manually specifying __slots__. + __slots__ = ["node1", "node2", "reason", "args"] + reason: str + args: Tuple[Any, ...] + + def __init__(self, node1: "BaseSchedulerNode", node2: "BaseSchedulerNode"): + self.node1 = node1 + self.node2 = node2 + + def __call__(self, reason, *args): + self.reason = reason + self.args = args + fusion_log.debug(self) + + def __str__(self): + return f"cannot fuse {self.node1.get_name()} with {self.node2.get_name()}: " + ( + self.reason % self.args + ) + + +def pformat(obj): + if isinstance(obj, set): + # pformat has trouble with sets of sympy exprs + obj = sorted(obj, key=str) + result = pprint.pformat(obj, indent=4) + if "\n" in result: + return f"\n{textwrap.indent(result, ' '*4)}" + return result + + +class OutputNode: + def __init__(self, dep): + self.unmet_dependencies = {dep} + self.inverse_users = [] + + def is_reduction(self): + return False + + def get_alias_names(self): + return () + + def get_name(self): + return "OUTPUT" + + __repr__ = get_name + + +def _prune_redundant_deps(node, name_to_fused_node): + """ + Prunes weakdeps intended for mutation ordering + on an upstream fused node if after fusion there is another dependency + on the fused upstream node, making the weakdep redundant + + In essence this enforces an ordering on fusions. As fusions occur, weakdeps will + be incrementally removed, enabling other fusions, ensuring they are fused in order. 
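+
+    For example (names are illustrative): if B carries a WeakDep on A purely
+    for mutation ordering and B is later fused with a node that reads A
+    directly, the WeakDep duplicates a real dependency and can be dropped.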
+ """ + name_to_dep_count: Counter[str] = collections.Counter() + + for dep in node.unmet_dependencies: + if not isinstance(dep, WeakDep): + name_to_dep_count[name_to_fused_node[dep.name].get_name()] += 1 + + def should_prune(dep): + if isinstance(dep, WeakDep): + is_redundant = ( + name_to_dep_count[name_to_fused_node[dep.name].get_name()] > 0 + ) + # These can occur because fused nodes always gather deps from their snodes + # If B has a weakdep on A + # B gets fused with C, then any time BC is fused, the weakdep will reappear + is_self_dep = name_to_fused_node[dep.name] == node + return is_redundant or is_self_dep + else: + return False + + deps_to_prune = {dep for dep in node.unmet_dependencies if should_prune(dep)} + + if deps_to_prune: + node.unmet_dependencies = node.unmet_dependencies - deps_to_prune + node.set_read_writes(node.read_writes.remove_reads(deps_to_prune)) + + +# TODO(xmfan): reuse an existing mapping for this if it exists, or formalize this into ir.py:ExternKernel +kernel_name_to_op = { + "extern_kernels.convolution": torch.ops.aten.convolution, + "extern_kernels.mm": torch.ops.aten.mm, + "extern_kernels.bmm": torch.ops.aten.bmm, + "extern_kernels.addmm": torch.ops.aten.addmm, +} + + +class BaseSchedulerNode: + def __init__(self, scheduler: "Scheduler", node: ir.Buffer): + self.scheduler: Scheduler = scheduler + self.node: ir.Buffer = node + self.users: List[NodeUser] = [] + self.inverse_users: List[BaseSchedulerNode] = [] + self.node_users: List[BaseSchedulerNode] = [] + self.set_read_writes(node.get_read_writes()) + self.ancestors: Set[str] = set() + self.min_order: int + self.max_order: int + self.last_usage: Set[ + str + ] = set() # buffers that won't be used after this kernel + self.written = False + + def __repr__(self): + return f"{type(self).__name__}(name={self.get_name()!r})" + + def debug_str(self) -> str: + """Longer form printout for trace logs""" + name = self.get_name() + lines = [ + f"{name}: {type(self).__name__}({type(getattr(self, 'node', None)).__name__})", + f"{name}.writes = {pformat(self.read_writes.writes)}", + f"{name}.unmet_dependencies = {pformat(self.unmet_dependencies)}", + f"{name}.met_dependencies = {pformat(self.read_writes.reads - self.unmet_dependencies)}", + f"{name}.users = {self.users}", + ] + try: + lines += [ + self.debug_str_extra(), + ] + except Exception: + log.warning("Ignoring error in debug_str()", exc_info=True) + + return "\n".join(lines).rstrip() + + def debug_str_extra(self) -> str: + return "" + + def log_details(self): + log.info( + "%s: unmet_dependencies = %s, writes = %s", + self, + self.unmet_dependencies, + self.read_writes.writes, + ) + + def update_mutated_names(self, renames: Dict[str, str]): + self.set_read_writes(self.read_writes.rename(renames)) + + def add_mutation_dep(self, dep): + self.set_read_writes(self.read_writes.with_read(dep)) + + def add_fake_dep(self, dep): + self.set_read_writes(self.read_writes.with_read(dep)) + + def set_users(self, users: List["NodeUser"]): + # deduplicate + result: Dict[int, NodeUser] = {} + for use in users: + if id(use.node) in result: + result[id(use.node)] = use.merge(result[id(use.node)]) + else: + result[id(use.node)] = use + self.users = list(result.values()) + + def set_last_usage( + self, future_used_buffers: Set[str], mutation_real_name: Dict[str, str] + ): + used_buffers = self.used_or_aliased_buffer_names() + used_buffers = {mutation_real_name.get(k, k) for k in used_buffers} + self.last_usage = used_buffers - future_used_buffers + + def get_aliases(self): + 
return self.node.get_alias_names() + + def get_mutations(self): + return self.node.get_mutation_names() + + def has_aliasing_or_mutation(self): + return bool(self.get_aliases() or self.get_mutations()) + + def set_read_writes(self, rw: dependencies.ReadWrites): + self.read_writes: dependencies.ReadWrites = rw + self.unmet_dependencies = self.read_writes.reads + self.prune_deps() + + def op_counts(self): + return self.read_writes.op_counts + + def used_buffer_names(self) -> Set[str]: + return { + dep.name + for dep in itertools.chain(self.read_writes.reads, self.read_writes.writes) + } + + def used_or_aliased_buffer_names(self) -> Set[str]: + used_names = set() + + for dep in itertools.chain(self.read_writes.reads, self.read_writes.writes): + used_names.add(dep.name) + if V.graph.name_to_buffer.get(dep.name): + layout = V.graph.name_to_buffer[dep.name].get_layout() + # needed to avoid deallocating aliased buffer + # if there are still uses of aliases ahead + if isinstance(layout, ir.AliasedLayout): + used_names.add(layout.view.data.get_name()) + return used_names + + def prune_deps(self): + self.unmet_dependencies = { + dep + for dep in self.unmet_dependencies + if dep.name not in self.scheduler.available_buffer_names + } + + def prune_weak_deps(self): + # Prune weak dependencies on buffers that have been removed + def should_prune(dep): + return isinstance(dep, WeakDep) and dep.name in V.graph.removed_buffers + + to_remove = {dep for dep in self.read_writes.reads if should_prune(dep)} + self.set_read_writes(self.read_writes.remove_reads(to_remove)) + + def prune_redundant_deps(self, name_to_fused_node): + _prune_redundant_deps(self, name_to_fused_node) + + def get_name(self) -> str: + return self.node.get_name() + + def get_first_name(self) -> str: + return self.get_name() + + def get_names(self) -> Set[str]: + return {self.get_name()} + + def get_nodes(self) -> Sequence["BaseSchedulerNode"]: + return [self] + + def get_device(self): + return self.node.get_device() + + def is_reduction(self): + return False + + def is_split_scan(self): + return False + + def is_template(self): + return False + + def is_extern(self): + return False + + def is_foreach(self): + return False + + def can_inplace(self, read_dep: dependencies.MemoryDep): + return False + + def has_side_effects(self): + return False + + def decide_inplace_update(self): + """ + Decide if there should be inplace updates for the node + and record the decision in the active kernel. + """ + if not self.node.should_allocate(): + return + + if isinstance(self, (SchedulerNode,)) and ( + self.node.get_alias_names() or self.node.get_mutation_names() + ): + return + + if ( + ( + isinstance(self, (SchedulerNode,)) + # o what have i done. 
lets make this an api + or ( + isinstance(self, ExternKernelSchedulerNode) + and isinstance(self.node, (ir.AllReduce, ir.InPlaceHint)) + ) + ) + and config.inplace_buffers + and ( + not isinstance(V.kernel, torch._inductor.codegen.triton.TritonKernel) + or getattr(V.kernel, "mutations", None) is not None + ) + ): + from .codegen.wrapper import buffer_reuse_key + + ordered_reads = sorted(self.read_writes.reads, key=lambda x: x.name) + + for read in ordered_reads: + input_node: Optional[ + BaseSchedulerNode + ] = self.scheduler.name_to_node.get(read.name) + if input_node and V.graph.wrapper_code.can_reuse(input_node, self): + assert input_node.users is not None + remaining_uses = [ + x + for x in input_node.users + if x.node.get_name() + not in self.scheduler.available_buffer_names + ] + if ( + len(remaining_uses) == 1 + and remaining_uses[0].can_inplace + and remaining_uses[0].node is self + and not isinstance( + input_node.node.get_layout(), + ( + ir.MultiOutputLayout, + ir.MutationLayout, + ir.AliasedLayout, + ), + ) + and not ( + isinstance( + input_node.node, (ir.FallbackKernel, ir.MultiOutput) + ) + and len(input_node.node.get_alias_names()) > 0 + ) + and buffer_reuse_key(input_node.node) + == buffer_reuse_key(self.node) + ): + # hacky check for if V.kernel is a real kernel or NullHandler + if hasattr(V.kernel, "args"): + # if there isn't a triton kernel, then we don't need to call triton-specific things. + # but TODO this might be a convenient place to signal to the Collective kernels to inplace + # (and, can we make "kernel" less generic of a name?) + V.kernel.args.make_inplace( + input_node.get_name(), self.get_name() + ) + # mutations not tracked in cpp kernels + if isinstance( + V.kernel, torch._inductor.codegen.triton.TritonKernel + ): + V.kernel.mutations.add(input_node.get_name()) + V.kernel.mutations.add(self.get_name()) + + # update last usage of reused node + self.last_usage.discard(input_node.get_name()) + + V.kernel.inplace_update_buffers[ + self.get_name() + ] = input_node.get_name() + break + + def allocate(self): + if not self.node.should_allocate(): + return + + if isinstance(self, (SchedulerNode,)) and ( + self.node.get_alias_names() or self.node.get_mutation_names() + ): + V.graph.wrapper_code.codegen_allocation(self.node) + return + + # hacky check for if V.kernel is a real kernel or NullHandler + if ( + hasattr(V.kernel, "args") + and self.get_name() in V.kernel.inplace_update_buffers + ): + V.graph.wrapper_code.codegen_inplace_reuse( + self.scheduler.name_to_node[ + V.kernel.inplace_update_buffers[self.get_name()] + ].node, + self.node, + ) + else: + V.graph.wrapper_code.codegen_allocation(self.node) + + def can_free(self): + # There's no real allocated buffer, no need to free it + if isinstance(self.node.layout, ir.NoneLayout): + return False + for use in self.users: + if isinstance(use.node, OutputNode): + return False + return True + + def codegen_originating_info(self, buffer, only_once=True): + if not config.comment_origin: + return + + if only_once and self.written: + return + origins = self.node.origins + out_lines = [] + + for o in origins: + if o.op == "output": + # These are boring and samey + continue + + out_lines.append("") + # TODO(voz): Should the pragma be constant somewhere? 
+ out_lines.append("#pragma CMT ORIGIN:") + op_info_str = f"#pragma CMT {o.op} {o.target}" + if "seq_nr" in o.meta: + op_info_str = op_info_str + f" seq_nr:{o.meta['seq_nr']}" + out_lines.append(op_info_str) + if "stack_trace" in o.meta: + stack_trace = f"{o.meta['stack_trace']}" + stack_trace_last_line = stack_trace.split("|")[-1] + out_lines.append( + "#pragma CMT " + + stack_trace_last_line.replace("{", "{{") + .replace("}", "}}") + .replace("\n", "\\") + ) + out_lines.append("#pragma CMT END ORIGIN") + out_lines.append("") + + if len(out_lines) == 0: + return + + # TODO(voz): Ostensibly, we should not need this. But there are cases where C++ codegen does + # not use BracesBuffer, so we have no good indicator of a C++ buffer atm. + buffer.writelines(out_lines) + self.written = True + + def get_read_write_buffers_sizes(self) -> int: + """ + Counting the number of bytes accessed for a kernel is + surprisingly tricky. In particular, there is a differentiation + between 'theoretical' memory accesses and practical memory + accesses. For example, a layernorm kernel may actually access an + input 3 times, but in theory, it only needs to access its input + once (and may be optimized to do so through say, persistent + reductions) + + Another example is that even though a buffer is passed in, we may + not access the entire buffer. This may occur if we are accessing + a slice of the buffer. Another tricky case is for indirect + indexing, where the amount of bytes accessed depends on the + values of the input. + + What this function aims to compute is the memory accesses for + worst-case inputs, best-case optimization. What this means is + that for each buffer we compute the amount of potential accesses in two ways and take the minimum. + + 1. Numel in ranges multiplied by number of deps the buffer has + 2. 
The buffer size + """ + if isinstance(self, NopKernelSchedulerNode): + return 0 + if isinstance(self, ExternKernelSchedulerNode) and isinstance( + self.node, MultiOutput + ): + return 0 + + if isinstance(self, SchedulerNode): + node_numel = V.graph.sizevars.size_hint( + sympy_product(self.get_ranges()[0]) + * sympy_product(self.get_ranges()[1]) + ) + else: + node_numel = int(1e9) + buf_accesses = collections.defaultdict(list) + for dep in self.read_writes.reads | self.read_writes.writes: + buf_accesses[dep.name].append(dep) + + reads = {dep.name for dep in self.read_writes.reads} + writes = {dep.name for dep in self.read_writes.writes} + + def is_materialized(buf, snodes): + users = self.scheduler.name_to_node[buf].users + buf_uses = {user.node for user in users} + return len(buf_uses - set(snodes)) > 0 + + if isinstance(self, FusedSchedulerNode): + removed_buffers = { + dep for dep in writes if not is_materialized(dep, self.snodes) + } + writes = writes - removed_buffers + reads = reads - removed_buffers + node_bytes = 0 + + for buf_name in reads | writes: + buf_accessed_elems = sum([node_numel for dep in buf_accesses[buf_name]]) + buf: Union[ir.Buffer, ir.TensorBox] + if buf_name in V.graph.name_to_buffer: + buf = V.graph.name_to_buffer[buf_name] + elif buf_name in V.graph.graph_inputs: + buf = V.graph.graph_inputs[buf_name] + else: + continue + + def get_buf_elems(buf): + return V.graph.sizevars.size_hint(sympy_product(buf.get_size())) + + # Kind of a lazy way to get the MultiOutput nodes corresponding to + # a MultiOutputLayout + if isinstance(buf.layout, MultiOutputLayout): + users = self.scheduler.name_to_node[buf.get_name()].users + buf_elems = sum(get_buf_elems(user.node.node) for user in users) + else: + buf_elems = get_buf_elems(buf) + + node_bytes += min(buf_elems, buf_accessed_elems) * get_dtype_size( + buf.get_dtype() + ) + + return node_bytes + + def get_estimated_runtime(self) -> float: + """ + Returns estimated op runtime in nanoseconds (ns) + """ + layout = None + dtype = None + if not hasattr(self, "node") or not self.node: + assert isinstance( + self, (FusedSchedulerNode, ForeachKernelSchedulerNode) + ), f"{type(self)=}" + assert self.snodes + if not self.snodes[0].node: + return 0 + layout = self.snodes[0].node.get_layout() + dtype = self.snodes[0].node.get_dtype() + else: + layout = self.node.get_layout() + dtype = self.node.get_dtype() + + if "cuda" != layout.device.type: + # default to no reordering based on runtime + return 0 + + # Collective kernels + if is_collective(self.node): + return estimate_nccl_collective_runtime(self.node) + elif is_wait(self.node): + # ir.Wait is only used for collective ops. + # The time needed for the collective op is already estimated and considered + # when we are processing the collective op IR node, so ir.Wait takes 0 time + # since it doesn't take extra time to get the result after the collective is completed. 
+ return 0 + + try: + gpu_memory_bandwidth = get_gpu_dram_gbps() + gpu_flops = get_device_tflops(dtype) * 10**12 + except Exception: + return 0 + + if isinstance(self, ExternKernelSchedulerNode): + assert isinstance(self.node, ir.ExternKernel), f"{type(self.node)=}" + op = kernel_name_to_op.get( + getattr(self.node, "python_kernel_name", ""), None + ) + + # if there is a resolved op, dry-run using fake mode and record flop count + if op is not None: + from torch._subclasses.fake_tensor import FakeTensorMode + from torch.utils.flop_counter import FlopCounterMode + + with FakeTensorMode(), FlopCounterMode( + display=False + ) as flop_counter_mode: + from .ir import ir_node_to_tensor + + fake_inputs = [ + ir_node_to_tensor(input, guard_shape=False) + for input in self.node.inputs + ] + cls = self.node.__class__ + cls.process_kernel(op, *fake_inputs, **self.node.kwargs) + + # TODO(xmfan): find a better heuristic to model FLOPS/latency relationship + factor = 1.0 + counted_flops = flop_counter_mode.get_total_flops() + counted_bytes = self.get_read_write_buffers_sizes() + compute_time = (factor * counted_flops / gpu_flops) * 1e9 + transfer_time = counted_bytes / gpu_memory_bandwidth + + # Return estimated runtime in nanoseconds + return max(compute_time, transfer_time) + + elif isinstance(self, FusedSchedulerNode) or isinstance( + self.node, ComputedBuffer + ): + # Return estimated runtime in nanoseconds (bytes / gbps) + return self.get_read_write_buffers_sizes() / gpu_memory_bandwidth + + return 0 + + +class ExternKernelSchedulerNode(BaseSchedulerNode): + def debug_str_extra(self) -> str: + return f"{self.get_name()}.node.kernel = {getattr(self.node, 'python_kernel_name', None)}" + + def is_extern(self): + return True + + def has_side_effects(self): + return hasattr(self.node, "has_side_effects") and self.node.has_side_effects() + + def can_inplace(self, read_dep: dependencies.MemoryDep): + if self.get_aliases() or self.is_template(): + return False + + if read_dep.name not in self.scheduler.name_to_node: + # don't allow reuse of an 'input' buffer, we don't own it + # (would this have been fixed if I tracked mutations properly above?) 
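+            # NOTE (editorial comment, not part of the upstream source): graph inputs
+            # are owned by the caller, so writing a kernel's output into one of them
+            # in place could clobber memory the user still references; the scheduler
+            # therefore only reuses buffers it created itself.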
+ return False + if not isinstance( + self.node, (torch._inductor.ir.AllReduce, torch._inductor.ir.InPlaceHint) + ): + # TODO make this a property of the IR + return False + + if len(self.read_writes.writes) == 1: + write_dep = next(iter(self.read_writes.writes)) + numel_diff = read_dep.get_numel() - write_dep.get_numel() + return V.graph.sizevars.simplify(numel_diff) == 0 + + return False + + +class NopKernelSchedulerNode(BaseSchedulerNode): + pass + + +class SchedulerNode(BaseSchedulerNode): + def __init__( + self, + scheduler: "Scheduler", + node: Union[ir.ComputedBuffer, ir.TemplateBuffer], + ): + super().__init__(scheduler, node) + self._compute_attrs() + + def _compute_attrs( + self, + extra_indexing_constraints: Optional[Tuple[Dict[Any, Any], List[Any]]] = None, + ): + assert isinstance(self.node, (ir.ComputedBuffer, ir.TemplateBuffer)) + self._sizes, self._body = self.node.simplify_and_reorder( + extra_indexing_constraints=extra_indexing_constraints + ) + + group_fn = self.scheduler.get_backend(self.node.get_device()).group_fn + self.group = (self.node.get_device(), group_fn(self._sizes)) + + if isinstance(self.node, ir.TemplateBuffer): + self.set_read_writes(self.node.normalized_read_writes()) + else: + self.set_read_writes( + dependencies.extract_read_writes( + self._body, *self._sizes, normalize=True + ) + ) + + def recompute_size_and_body( + self, extra_indexing_constraints: Tuple[Dict[Any, Any], List[Any]] + ): + self._compute_attrs(extra_indexing_constraints=extra_indexing_constraints) + + def debug_str_extra(self) -> str: + name = self.get_name() + lines = [ + f"{name}.group.device = {self.group[0]}", + f"{name}.group.iteration = {self.group[1]}", + f"{name}.sizes = {self._sizes}", + ] + if self.get_aliases(): + lines.append(f"{name}.aliases = {pformat(self.get_aliases())}") + if self.get_mutations(): + lines.append(f"{name}.mutations = {pformat(self.get_mutations())}") + if isinstance(self._body, ir.LoopBody): + lines.append(f"class {name}_loop_body:") + lines.append(textwrap.indent(self._body.debug_str(), " ")) + return "\n".join(lines) + + def get_ranges(self): + return self._sizes + + def is_reduction(self): + assert isinstance( + self.node, (ir.ComputedBuffer, ir.TemplateBuffer) + ), f"{type(self.node)=}" + return bool(self.node.get_reduction_type()) + + def is_split_scan(self): + assert isinstance( + self.node, (ir.ComputedBuffer, ir.TemplateBuffer) + ), f"{type(self.node)=}" + return isinstance(self.node, ir.ComputedBuffer) and isinstance( + self.node.data, ir.SplitScan + ) + + def is_template(self): + return isinstance(self.node, ir.TemplateBuffer) + + def get_template_node(self): + return self.node if self.is_template() else None + + def run(self, *index_vars): + self.decide_inplace_update() + self.mark_run() + self.codegen(index_vars) + + def mark_run(self): + self.allocate() + + def ranges_from_index_vars(self, index_vars): + sizes = self._sizes + assert sum(map(len, sizes)) == sum(map(len, index_vars)) + var_ranges = dict( + zip( + itertools.chain.from_iterable(index_vars), + itertools.chain.from_iterable(sizes), + ) + ) + return var_ranges + + def codegen(self, index_vars): + var_ranges = self.ranges_from_index_vars(index_vars) + try: + with V.set_ops_handler( + SimplifyIndexing(V.get_ops_handler(), var_ranges) + ), V.kernel.set_current_node(self): + self._body(*index_vars) + except Exception: + log.fatal("Error in codegen for %s", self.node) + raise + + def pointwise_read_writes(self): + """ + Get the memory dependencies in the non-reduction axis. 
+ """ + sizes, reduction_sizes = self._sizes + + def fn(index): + return self._body(index, [sympy.Integer(0) for _ in reduction_sizes]) + + return dependencies.extract_read_writes(fn, sizes) + + def can_inplace(self, read_dep: dependencies.MemoryDep): + if self.get_aliases() or self.is_template(): + return False + if len(self.read_writes.writes) == 1 and isinstance( + read_dep, dependencies.MemoryDep + ): + write_dep = next(iter(self.read_writes.writes)) + assert isinstance(write_dep, dependencies.MemoryDep), f"{type(write_dep)=}" + return read_dep.index == write_dep.index and read_dep.size == write_dep.size + return False + + @cache_on_self + def _get_atomic_add_buffers(self) -> Set[str]: + buffers_store_as_atomic_add = set() + if isinstance(self._body, ir.LoopBody): + for node in self._body.get_nodes(): + if ( + node.op == "call_method" + and node.target == "store" + and ( + ("mode" in node.kwargs and node.kwargs["mode"] == "atomic_add") + or (len(node.args) == 5 and node.args[4] == "atomic_add") + ) + ): + buffers_store_as_atomic_add.add( + node.kwargs["name"] + if "name" in node.kwargs + else (node.args[1] if len(node.args) >= 2 else "") + ) + return buffers_store_as_atomic_add + + def has_atomic_add(self, check_buf): + return check_buf in self._get_atomic_add_buffers() + + +class FusedSchedulerNode(BaseSchedulerNode): + """ + This is a "fake" scheduler node that represents a group of scheduler nodes + that are meant to be fused together. The way it does this is by maintaining + its unmet dependencies as the union of its constituent nodes. + """ + + @classmethod + def fuse(cls, node1: BaseSchedulerNode, node2: BaseSchedulerNode): + assert node1.scheduler is node2.scheduler + assert isinstance(node1, (SchedulerNode, FusedSchedulerNode)) and isinstance( + node2, (SchedulerNode, FusedSchedulerNode) + ) + return cls(node1.scheduler, list(node1.get_nodes()) + list(node2.get_nodes())) # type: ignore[arg-type] + + def __init__(self, scheduler: "Scheduler", snodes: List[SchedulerNode]): + # NB: No need to call super().__init__() because we don't need to re-use any of its logic. 
+ self.snodes = snodes + self.scheduler = scheduler + self.node: ir.Buffer = None # type: ignore[assignment] + self.users: List[NodeUser] = [] + self.inverse_users = [] + self.node_users = [] + self.group = max(snodes, key=lambda x: int(x.is_reduction())).group + self.ancestors = set.union( + *[x.ancestors for x in snodes if x.ancestors is not None] + ) + + self.set_read_writes( + dependencies.ReadWrites.merge_list([x.read_writes for x in snodes]) + ) + + self.unmet_dependencies = { + dep + for dep in set.union(*[x.unmet_dependencies for x in snodes]) + if dep.name not in self.get_names() + } - self.read_writes.writes + self.min_order = min([x.min_order for x in self.snodes]) + self.max_order = max([x.max_order for x in self.snodes]) + + @cache_on_self + def get_name(self) -> str: + return "_".join([x.get_name() for x in self.snodes]) + + def get_first_name(self) -> str: + return self.snodes[0].get_name() + + @cache_on_self + def get_names(self) -> Set[str]: + return set.union(*[x.get_names() for x in self.snodes]) + + def debug_str_extra(self) -> str: + lines = [ + f"{self.get_name()}.snodes[{i}] =\n{node.debug_str()}" + for i, node in enumerate(self.snodes) + ] + return textwrap.indent("\n".join(lines).rstrip(), " ") + + def set_last_usage( + self, future_used_buffers: Set[str], mutation_real_name: Dict[str, str] + ): + # Set self.last_usage using the global information + # This will be used for inter-kernel optimisations + super().set_last_usage(future_used_buffers, mutation_real_name) + # Set self.last_usage on the snodes + # This will be used for optimisations within the kernel + future_used_buffers: Set[str] = set() + for node in reversed(self.snodes): + node.set_last_usage(future_used_buffers, mutation_real_name) + future_used_buffers.update(node.last_usage) # type: ignore[arg-type] + + @cache_on_self + def used_buffer_names(self) -> Set[str]: + return set.union(*[x.used_buffer_names() for x in self.snodes]) + + @cache_on_self + def used_or_aliased_buffer_names(self) -> Set[str]: + return set.union(*[x.used_or_aliased_buffer_names() for x in self.snodes]) + + def get_nodes(self) -> List[SchedulerNode]: + return self.snodes + + def __repr__(self): + return f"{type(self).__name__}(nodes={self.get_name()})" + + @cache_on_self + def is_reduction(self): + return any(x.is_reduction() for x in self.snodes) + + @cache_on_self + def is_split_scan(self): + return any(x.is_split_scan() for x in self.snodes) + + @cache_on_self + def is_template(self): + return any(x.is_template() for x in self.snodes) + + @cache_on_self + def get_template_node(self): + for node in self.snodes: + if node.is_template(): + return node + return None + + def get_device(self): + return self.group[0] + + @cache_on_self + def has_aliasing_or_mutation(self): + return any(x.has_aliasing_or_mutation() for x in self.snodes) + + @cache_on_self + def op_counts(self): + op_counts: Counter[str] = collections.Counter() + for node in self.snodes: + op_counts.update(node.op_counts()) + return op_counts + + def has_atomic_add(self, check_buf): + return any( + ( + isinstance(sub_schedule_node1, SchedulerNode) + and sub_schedule_node1.has_atomic_add(check_buf) + ) + for sub_schedule_node1 in self.get_nodes() + ) + + # None of these need to be implemented, as a FusedSchedulerNode is just an + # abstraction for scheduling purposes + def update_mutated_names(self, renames: Dict[str, str]): + raise NotImplementedError + + def add_mutation_dep(self, name): + raise NotImplementedError + + def set_users(self, users: List["NodeUser"]): + 
raise NotImplementedError + + def get_aliases(self): + raise NotImplementedError + + def get_mutations(self): + raise NotImplementedError + + def can_inplace(self, read_dep: dependencies.MemoryDep): + raise NotImplementedError + + def allocate(self): + raise NotImplementedError + + def can_free(self): + raise NotImplementedError + + def debug_str(self) -> str: + """Longer form printout for trace logs""" + name = self.get_name() + node_typestr = ",".join(type(n).__name__ for n in self.snodes) + lines = [ + f"{name}: {type(self).__name__}({node_typestr})", + f"{name}.writes = {pformat(self.read_writes.writes)}", + f"{name}.unmet_dependencies = {pformat(self.unmet_dependencies)}", + f"{name}.met_dependencies = {pformat(self.read_writes.reads - self.unmet_dependencies)}", + f"{name}.users = {self.users}", + ] + try: + lines += [ + self.debug_str_extra(), + ] + except Exception: + log.warning("Ignoring error in debug_str()", exc_info=True) + + return "\n".join(lines).rstrip() + + +class ForeachKernelSchedulerNode(FusedSchedulerNode): + """Scheduler node which consists of a list of scheduler nodes that each operate on a + distinct tensor in a list of tensors.""" + + def get_consumer_subnode_for(self, producer): + if producer.get_name() in self.read_to_node: + return self.read_to_node[producer.get_name()] + + return None + + def get_producer_subnode_for(self, consumer): + for rd in consumer.read_writes.reads: + if rd.name in self.name_to_node: + return self.name_to_node[rd.name] + + return None + + @classmethod + def can_fuse(cls, producer, consumer): + why = WhyNoFuse(producer, consumer) + if producer.is_foreach() and consumer.is_foreach(): + foreach_match = len(producer.snodes) == len(consumer.snodes) + if not foreach_match: + why("foreach do not have same length") + return foreach_match and all( + producer.scheduler.can_fuse(l, r) + for l, r in zip(producer.snodes, consumer.snodes) + ) + elif consumer.is_foreach(): + consumer_subnode = consumer.get_consumer_subnode_for(producer) + if consumer_subnode is not None: + return consumer.scheduler.can_fuse(producer, consumer_subnode) + + why("candidate producer is not dep of any foreach consumer") + return False + + elif producer.is_foreach(): + producer_subnode = producer.get_producer_subnode_for(consumer) + if producer_subnode is not None: + return producer.scheduler.can_fuse(producer_subnode, consumer) + + why("candidate consumer has no dep in any foreach producer") + return False + + raise AssertionError( + "At least one node passed to ForeachKernelSchedulerNode.can_fuse should be a foreach node" + ) + + @classmethod + def fuse(cls, producer, consumer): + assert producer.is_foreach() or consumer.is_foreach() + prev_node_1 = None + prev_node_2 = None + if producer.is_foreach() and consumer.is_foreach(): + fused_nodes = [ + FusedSchedulerNode.fuse(l, r) + for l, r in zip(producer.snodes, consumer.snodes) + ] + elif producer.is_foreach(): + producer_subnode = producer.get_producer_subnode_for(consumer) + fused_nodes = [] + prev_node_1 = producer + prev_node_2 = None + for node in producer.snodes: + if node is producer_subnode: + new_node = FusedSchedulerNode.fuse(node, consumer) + prev_node_2 = new_node + fused_nodes.append(new_node) + else: + fused_nodes.append(node) + + elif consumer.is_foreach(): + consumer_subnode = consumer.get_consumer_subnode_for(producer) + fused_nodes = [] + prev_node_1 = consumer + prev_node_2 = None + + for node in consumer.snodes: + if node is consumer_subnode: + new_node = FusedSchedulerNode.fuse(producer, node) + 
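+                    # NOTE (editorial comment, not part of the upstream source):
+                    # prev_node_1/prev_node_2 record the pre-existing foreach node and
+                    # the freshly fused subnode so __init__ can take its incremental
+                    # path, merging just those two nodes' read_writes instead of
+                    # recomputing everything from all subnodes.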
prev_node_2 = new_node + fused_nodes.append(new_node) + else: + fused_nodes.append(node) + + return cls(producer.scheduler, fused_nodes, prev_node_1, prev_node_2) # type: ignore[possibly-undefined] + + def __init__( + self, + scheduler: "Scheduler", + nodes: List[SchedulerNode], + prev_node_1=None, + prev_node_2=None, + ): + self.read_to_node = {} + self.name_to_node = {} + + if prev_node_1 is None or prev_node_2 is None: + super().__init__(scheduler, nodes) + + for node in nodes: + for read in node.read_writes.reads: + self.read_to_node[read.name] = node + + for name in node.get_names(): + self.name_to_node[name] = node + else: + self.scheduler = scheduler + self.snodes = nodes + self.node: ir.Buffer = None # type: ignore[assignment] + self.users: List[NodeUser] = [] + + self.set_read_writes( + dependencies.ReadWrites.merge_list( + [prev_node_1.read_writes, prev_node_2.read_writes] + ) + ) + + self.unmet_dependencies = { + dep + for dep in set.union( + prev_node_1.unmet_dependencies, prev_node_2.unmet_dependencies + ) + if dep.name not in self.get_names() + } - self.read_writes.writes + + self.min_order = min([prev_node_1.min_order, prev_node_2.min_order]) + self.max_order = max([prev_node_1.max_order, prev_node_2.max_order]) + + foreach_node = prev_node_1 if prev_node_1.is_foreach() else prev_node_2 + other_node = prev_node_2 if prev_node_1.is_foreach() else prev_node_1 + + self.ancestors = foreach_node.ancestors + self.ancestors.update(other_node.ancestors) + + self.name_to_node = foreach_node.name_to_node + for name in other_node.get_names(): + self.name_to_node[name] = other_node + + self.group = (nodes[0].get_device(), "foreach") + + self.origins: Set[torch.fx.Node] = set() + + def mark_run(self): + raise NotImplementedError + + def codegen(self): + assert isinstance(self.node, ir.ComputedBuffer), f"{type(self.node)=}" + self.node.get_store_function()(self.node.make_loader()()) + + def can_free(self): + return NotImplementedError + + def is_foreach(self): + return True + + def get_subkernel_nodes(self): + """Returns a list of nodes which comprise the foreach kernel, operating on corresponding elements of our input lists. + These nodes may be vertically fused.""" + return list(self.snodes) + + def get_nodes(self): + """Returns all nodes contained in this kernel, unpacking fused nodes into their constituent scheduler nodes.""" + return list(itertools.chain.from_iterable(x.get_nodes() for x in self.snodes)) + + def get_first_name(self): + return self.snodes[0].get_first_name() + + def prune_redundant_deps(self, name_to_fused_node): + _prune_redundant_deps(self, name_to_fused_node) + + for node in self.snodes: + node.prune_redundant_deps(name_to_fused_node) + + +def pick_loop_order(stride_lengths, sizes, priority_idx=()): + """ + A heuristic to decide loop iteration orders. This has not been well + tuned and may be something we should autotune. 
+ """ + + @functools.cmp_to_key + def index_cmp(a, b): + if sizes[a] == 1 or sizes[b] == 1: + # 1-sizes don't matter, just move them to the end + return cmp(sizes[a] == 1, sizes[b] == 1) + + stride_len_a = [sl[a] for sl in stride_lengths] + stride_len_b = [sl[b] for sl in stride_lengths] + + # equivalent to + # np.logical_or(stride_lengths[:, b] == 0, stride_lengths[:, a] < stride_lengths[:, b]).all() + a_first = sum( + sl_b == 0 or sl_a < sl_b for sl_a, sl_b in zip(stride_len_a, stride_len_b) + ) + b_first = sum( + sl_a == 0 or sl_b < sl_a for sl_a, sl_b in zip(stride_len_a, stride_len_b) + ) + if a_first > b_first: + return -1 + if b_first > a_first: + return 1 + + # otherwise contiguous + return cmp(b, a) + + order = list(reversed(range(len(stride_lengths[0])))) + if len(priority_idx) > 0: + # if we have priority node, only use that node's order + stride_lengths = [stride_lengths[pi] for pi in priority_idx] + if config.pick_loop_orders: + order.sort(key=index_cmp) + return order + + +@dataclasses.dataclass +class NodeUser: + node: BaseSchedulerNode + can_inplace: bool = False + + # A weak user must be scheduled after a given node, but doesn't actually + # use the result + is_weak: bool = False + + def __hash__(self): + return hash((self.node.get_name(), self.can_inplace, self.is_weak)) + + def __eq__(self, other): + return ( + self.get_name() == other.get_name() + and self.can_inplace == other.can_inplace + and self.is_weak == other.is_weak + ) + + def get_name(self): + return self.node.get_name() + + def merge(self, other: "NodeUser") -> "NodeUser": + assert self.node is other.node + return NodeUser( + self.node, + self.can_inplace and other.can_inplace, + self.is_weak and other.is_weak, + ) + + +_post_grad_graph_counter = itertools.count() + + +class Scheduler: + @dynamo_timed + def __init__(self, nodes): + super().__init__() + self.backends = {} + self.fuse_cache = {} + self.post_grad_graph_id = next(_post_grad_graph_counter) + + self.nodes = [] + self.available_buffer_names = { + *V.graph.graph_inputs.keys(), + *V.graph.constants.keys(), + } + + self.nodes = [self.create_scheduler_node(n) for n in nodes] + + # some new constants could have been created above + self.available_buffer_names.update(V.graph.constants.keys()) + for node in self.nodes: + node.prune_deps() + + self.name_to_node: Dict[str, BaseSchedulerNode] = { + n.get_name(): n for n in self.nodes + } + self.name_to_fused_node: Dict[ + str, BaseSchedulerNode + ] = dict() # set in fuse_nodes() + + # mutation_real_name: Maps back to the original name for codegen + # Example: + # If you mutate buf0 inside of buf1's kernel, then: + # mutation_real_name = {"buf0" : "buf1"} + # all subsequent uses of buf0 become buf1's usage in dependency graph + self.mutation_real_name = {} + + # We handle mutation by renaming modified versions of the same + # buffer in the dependency graph to prevent cycles. 
+ # mutation_renames: tracks the current name for a given buffer + # (changed once per mutation) + # Example: + # If you mutate buf0 inside of buf1's kernel, then: + # mutation_renames = {"buf1" : "buf0"} + # in codegen we only use buf0, never buf1 + self.mutation_renames = {} + + self.compute_dependencies() + self.topological_sort_schedule() + self.dead_node_elimination() + if config.reorder_for_compute_comm_overlap: + comms.decide_global_ordering_of_comms(self.nodes) + self.compute_ancestors() + + metrics.ir_nodes_pre_fusion += len(self.nodes) + V.debug.ir_pre_fusion(self.nodes) + self.num_orig_nodes = len(self.nodes) + self.name_to_fused_node = {n.get_name(): n for n in self.nodes} + self.create_foreach_nodes() + self.topological_sort_schedule() + self.logged_slow_fusion = set() + self.fuse_nodes() + if config.reorder_for_compute_comm_overlap: + # Refresh node_users and inverse_users to reflect fused nodes + self.compute_node_users() + self.nodes = comms.reorder_compute_and_comm_for_overlap(self.nodes) + self.compute_last_usage() + V.debug.ir_post_fusion(self.nodes) + V.debug.graph_diagram(self.nodes) + self.debug_draw_graph() + + # used during codegen: + self.current_device: torch.device = None # type: ignore[assignment] + self.buffer_names_to_free = set() + + # fx graph node to the position it appears in the graph + # for debug attribution + self.origin_to_index = {} + + get_metric_table("graph_stats").add_row( + lambda: { + "graph_id": self.post_grad_graph_id, + "num_nodes_before_fusion": self.num_orig_nodes, + "num_nodes_after_fusion": len(self.nodes), + } + ) + + def debug_draw_graph(self): + """Generate an image of the graph for debugging""" + if os.environ.get("INDUCTOR_WRITE_SCHEDULER_GRAPH", None) == "1": + from .debug import draw_buffers + + draw_buffers(self.nodes, print_graph=True) + + def debug_print_nodes(self, label): + if log.isEnabledFor(logging.INFO): + log.info("%s:", label) + for node in self.nodes: + node.log_details() + + def create_scheduler_node(self, node): + assert ( + node.origins is not None + ), "All nodes passed to scheduling must have an origin" + if node.is_no_op(): + return NopKernelSchedulerNode(self, node) + elif isinstance(node, (ir.ComputedBuffer, ir.TemplateBuffer)): + return SchedulerNode(self, node) + elif isinstance(node, ir.ExternKernel): + return ExternKernelSchedulerNode(self, node) + else: + raise NotImplementedError(node) + + def create_foreach_nodes(self): + removed_node_names = set() + fe_nodes = [] + kept_node_names = self.name_to_fused_node.keys() + + for names in V.graph.lists.values(): + names = [ + name + for name in names + if name in kept_node_names + and not isinstance(self.name_to_node[name], NopKernelSchedulerNode) + ] + if not names: + # All nodes eliminated + continue + + removed_node_names.update(names) + snodes = [self.name_to_node[name] for name in names] + + fe_node = ForeachKernelSchedulerNode(self, snodes) # type: ignore[arg-type] + + fe_nodes.append(fe_node) + + for name in names: + self.name_to_fused_node[name] = fe_node + + self.nodes = [ + node for node in self.nodes if node.get_name() not in removed_node_names + ] + fe_nodes + + def compute_dependencies(self): + """ + Create dependency edges between nodes, handling aliasing and + mutation properly. + """ + + T = TypeVar("T") + + class DedupList(Generic[T]): + """ + This data structure behaves like a list except it makes sure the + elements remain unique. 
+ Normally one could use a set/dict for this purpose however + the list in question gets elements appended as it is being + iterated over which means that we need to keep the list + semantics. + """ + + def __init__(self, items=None, membership=None): + self.items = items or list() + self.membership = membership or set() + + def append(self, node_user: T) -> None: + if node_user in self.membership: + return + self.items.append(node_user) + self.membership.add(node_user) + + def __add__(self, other: "DedupList[T]") -> "DedupList[T]": + new_membership = set.union(self.membership, other.membership) + new_items = self.items + [ + x for x in other.items if x not in self.membership + ] + return DedupList(new_items, new_membership) + + name_to_users: DefaultDict[str, DedupList[NodeUser]] = collections.defaultdict( + DedupList + ) + + # handle aliasing by using python aliasing in name_to_users + # if foo aliases bar then we will make name_to_users["foo"] point + # to the same python list as name_to_users["bar"] + for node1 in self.nodes: + node1_name = node1.get_name() + for node2_name in node1.get_aliases(): + if node1_name in name_to_users and node2_name in name_to_users: + # merge the two + list1 = name_to_users[node1_name] + list2 = name_to_users[node2_name] + combined = list1 + list2 + for key in name_to_users.keys(): + if name_to_users[key] is list1 or name_to_users[key] is list2: + name_to_users[key] = combined + elif node1_name in name_to_users: + name_to_users[node2_name] = name_to_users[node1_name] + else: + name_to_users[node1_name] = name_to_users[node2_name] + + def rename(n): + if n in self.mutation_renames: + return rename(self.mutation_renames[n]) + return n + + def dep_closure(node_name): + reachable_names = {node_name} + node = self.name_to_node[node_name] + write_dep = next(iter(node.read_writes.writes)) + for read_dep in node.read_writes.reads: + if ( + read_dep.name in self.name_to_node + and isinstance(read_dep, dependencies.MemoryDep) + and isinstance(write_dep, dependencies.MemoryDep) + and read_dep.index == write_dep.index + and read_dep.size == write_dep.size + ): + reachable_names.update(dep_closure(read_dep.name)) + return reachable_names + + def add_user(used_by_name, user_node, can_inplace=False, is_weak=False): + name_to_users[rename(used_by_name)].append( + NodeUser(user_node, can_inplace, is_weak) + ) + + unbacked_symbol_to_origin_node = {} + + for node in self.nodes: + log.debug("scheduling %s", node.node) + + # unbacked symbols don't follow ordinary buffer dependencies, so + # we track their def/uses separately + unbacked_symbol_defs = sorted( + node.node.get_unbacked_symbol_defs(), key=lambda x: x.name + ) + for s in unbacked_symbol_defs: + assert isinstance(s, sympy.Symbol) + # Pick the first definer as canonical. There may be multiple + # because if a MultiOutputLayout buffer propagates an unbacked + # symint to multiple outputs, they will all claim to def it. 
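+                # NOTE (editorial comment, not part of the upstream source):
+                # hypothetical example: if buf3 is produced by a data-dependent op
+                # (say nonzero()) that defines an unbacked symint u0, buf3 is recorded
+                # as u0's origin here, and any later node whose
+                # get_unbacked_symbol_uses() contains u0 gains a StarDep("buf3") below,
+                # forcing it to be scheduled after buf3.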
+ if s not in unbacked_symbol_to_origin_node: + unbacked_symbol_to_origin_node[s] = node + + unbacked_symbol_uses = sorted( + node.node.get_unbacked_symbol_uses(), key=lambda x: x.name + ) + # if a kernel takes unbacked symints, register dependencies + for s in unbacked_symbol_uses: + assert ( + s in unbacked_symbol_to_origin_node + ), f"{s} not in {unbacked_symbol_to_origin_node}" + node.add_fake_dep(StarDep(unbacked_symbol_to_origin_node[s].get_name())) + + # a node will mutate either 0 or 1 buffers + assert len(node.get_mutations()) <= 1 + for alt_name in node.get_mutations(): + alt_name = rename(alt_name) + # this node must run after the prior writer + add_user(alt_name, node) + node.add_mutation_dep(StarDep(alt_name)) + for other_node in name_to_users[alt_name].items: + # this node must run after all prior readers + other_name = rename(other_node.get_name()) + known_dep_node_names = dep_closure(node.get_name()) + if other_name not in known_dep_node_names: + # If this node already directly or indirectly depends on other_node, + # we don't need to insert an extra dep. + node.add_mutation_dep(WeakDep(other_name)) + add_user(other_name, node, is_weak=True) + + # add normal non-mutation dependencies + for read in node.read_writes.reads: + is_weak = isinstance(read, WeakDep) + add_user(read.name, node, node.can_inplace(read), is_weak) + + node.update_mutated_names(self.mutation_renames) + + # update our renaming scheme for the next iteration + for alt_name in node.get_mutations(): + self.mutation_renames[rename(alt_name)] = node.get_name() + self.mutation_renames[alt_name] = node.get_name() + self.mutation_real_name[node.get_name()] = self.mutation_real_name.get( + alt_name, alt_name + ) + + # make sure outputs aren't dead-code-eliminated + for node_name in V.graph.get_output_names(): + log.debug("scheduling output %s", node_name) + add_user(node_name, OutputNode(StarDep(node_name))) + + # make sure unbacked symints aren't dead-code-eliminated + for node in V.graph.graph_outputs: + for s in node.get_unbacked_symbol_uses(): + assert ( + s in unbacked_symbol_to_origin_node + ), f"{s} not in {unbacked_symbol_to_origin_node.keys()}" + node_name = unbacked_symbol_to_origin_node[s].node.name + log.debug("scheduling output %s for unbacked symint %s", node_name, s) + add_user(node_name, OutputNode(StarDep(node_name))) + + # make sure input mutation isn't dead-code-eliminated + for name in self.mutation_renames: + if name in V.graph.graph_inputs: + add_user(name, OutputNode(StarDep(name))) + V.graph.mutated_inputs.add(name) + + inp_names = { + name: index for index, name in enumerate(V.graph.graph_inputs.keys()) + } + V.graph.mutated_input_idxs = [ + inp_names[name] for name in V.graph.mutated_inputs + ] + + # copy users information onto the nodes + for node in self.nodes: + node.set_users(name_to_users[node.get_name()].items) + + # populate inverse_users + for node in self.nodes: + for user in node.users: + user.node.inverse_users.append(node) + + def compute_node_users(self): + # set up buffer name to (fused)snode mapping + buf_to_snode = {} + for node in self.nodes: + if isinstance(node, FusedSchedulerNode): + for x in node.snodes: + buf_to_snode[x.get_name()] = node + buf_to_snode[node.get_name()] = node + + for node in self.nodes: + node.node_users = [] + node.inverse_users = [] + + # compute inverse_users + for node in self.nodes: + inverse_users = [] + for dep in node.unmet_dependencies: + assert dep.name in buf_to_snode + dep_node = buf_to_snode[dep.name] + inverse_users.append(dep_node) + 
node.inverse_users = inverse_users + + # compute node_users + # TODO: ideally, we should deduplicate .users and .node_users, + # but currently .users contains extra information that's difficult to + # extract into a standalone container. + node_to_users: Dict[BaseSchedulerNode, List[BaseSchedulerNode]] = {} + for node in self.nodes: + for inverse_user in node.inverse_users: + node_to_users.setdefault(inverse_user, []).append(node) + for node, users in node_to_users.items(): + node.node_users = users + + def dead_node_elimination(self): + """ + Remove any nodes without users + """ + again = True # repeat until a fixed point + while again: + updated_nodes = [] + for node in self.nodes: + + def can_eliminate_user(user: NodeUser): + return user.is_weak or user.get_name() in V.graph.removed_buffers + + can_eliminate = not node.has_side_effects() and all( + can_eliminate_user(u) for u in node.users + ) + + if not can_eliminate: + updated_nodes.append(node) + else: + # dead code + log.debug("removed dead node: %s", node.get_name()) + V.graph.removed_buffers.add(node.get_name()) + + again = len(self.nodes) > len(updated_nodes) + self.nodes = updated_nodes + + # Prune any WeakDeps no longer needed + for node in self.nodes: + node.prune_weak_deps() + + def topological_sort_schedule(self): + """ + Ensure self.nodes is in topologically sorted order + """ + seen: Set[ir.Buffer] = set() + name_to_node: Dict[str, ir.Buffer] = dict() + result: List[ir.Buffer] = [] + + def visit(n): + if n not in seen: + seen.add(n) + for dep in sorted(n.unmet_dependencies, key=lambda d: d.name): + visit(name_to_node[dep.name]) + result.append(n) + + for node in self.nodes: + for name in node.get_names(): + name_to_node[name] = node + for node in self.nodes: + visit(node) + self.nodes = result + + def compute_ancestors(self): + """ + Populate each node.ancestors + """ + # note self.nodes is topologically sorted + name_to_ancestors: Dict[str, Set[str]] = {} + for node in self.nodes: + ancestors = set() + for dep in node.unmet_dependencies: + ancestors.add(dep.name) + ancestors |= name_to_ancestors[dep.name] + name_to_ancestors[node.get_name()] = ancestors + node.ancestors = ancestors + + for order, node in enumerate(self.nodes): + node.min_order = order + node.max_order = order + + def fuse_nodes(self): + """ + Mutates self.nodes to combine nodes into FusedSchedulerNodes. + """ + for i in range(10): + old_len = len(self.nodes) + fusion_log.debug( + "===== attempting fusion (%d/10): %d nodes =====", i + 1, old_len + ) + self.fuse_nodes_once() + new_len = len(self.nodes) + fusion_log.debug( + "completed fusion round (%d/10): fused %d nodes into %d nodes\n", + i + 1, + old_len, + new_len, + ) + if new_len == old_len or new_len == 1: + fusion_log.debug("===== fusion complete (%d iterations) =====", i + 1) + break + + def benchmark_fused_nodes(self, nodes): + """ + Benchmark fused list of nodes and return the execution time + in milliseconds on randomly generated inputs. + """ + assert len(nodes) > 0 + device = nodes[0].get_device() + V.graph.scheduler = self + self.current_device = device + backend = self.get_backend(device) + return backend.benchmark_fused_nodes(nodes) + + def speedup_by_fusion(self, node1, node2): + """ + If config.benchmark_fusion is False, always return True. + Otherwise, return True if fusion can brings speedup. 
+ """ + if not config.benchmark_fusion: + return True + + if ( + node1.is_template() + and not isinstance(node1.get_template_node(), ir.TritonTemplateBuffer) + or node1.is_foreach() + or node2.is_foreach() + ): + # TODO support benchmarking epilogue fusion + return True + + node_list_1 = node1.get_nodes() + device = node_list_1[0].get_device() + + # don't support benchmark fusion for CPU right now. + if device.type == "cpu": + return True + + node_list_2 = node2.get_nodes() + node_list_fused = node_list_1 + node_list_2 + + # We can not accurately benchmark kernel using atomic_add + # due to how we generate random integer inputs. + # Skip benchmarking them by allowing fusion. + if any( + hasattr(n.node, "data") + and hasattr(n.node.data, "scatter_mode") + and n.node.data.scatter_mode == "atomic_add" + for n in node_list_fused + ): + return True + + from triton.compiler.errors import CompilationError + + why = WhyNoFuse(node1, node2) + + try: + ms1, path1 = self.benchmark_fused_nodes(node_list_1) + if math.isinf(ms1): + why("register spilling of the first kernel") + return False + ms2, path2 = self.benchmark_fused_nodes(node_list_2) + if math.isinf(ms2): + why("register spilling of the second kernel") + return False + ms_fused, path_fused = self.benchmark_fused_nodes(node_list_fused) + if math.isinf(ms_fused): + why("register spilling of the fused kernel") + return False + except CompilationError as e: + # workaround triton issue: https://github.com/openai/triton/issues/2151 + if "Loop-carried variable" in str(e): + return True # allow fusion + else: + raise + + if fusion_log.isEnabledFor(logging.DEBUG): + if ms_fused < ms1 + ms2: + fusion_log.debug( + "can fuse (benchmark): fusing %s with %s cause %sx speedup", + node1.get_names(), + node2.get_names(), + green_text(f"{(ms1 + ms2) / ms_fused:.3f}"), + ) + else: + fusion_log.debug( + "cannot fuse (benchmark): fusing %s with %s cause %sx slowdown", + node1.get_names(), + node2.get_names(), + red_text(f"{ms_fused / (ms1 + ms2):.3f}"), + ) + + if ( + is_metric_table_enabled("slow_fusion") + and ms_fused >= ms1 + ms2 + and (path1, path2) not in self.logged_slow_fusion + ): + self.logged_slow_fusion.add((path1, path2)) + get_metric_table("slow_fusion").add_row( + lambda: { + "kernel1_path": path1, + "kernel1_latency": ms1, + "kernel2_path": path2, + "kernel2_latency": ms2, + "fused_kernel_path": path_fused, + "fused_kernel_latency": ms_fused, + "slow_down_ratio": ms_fused / (ms1 + ms2), + } + ) + return ms_fused < ms1 + ms2 + + def fuse_nodes_once(self): + """ + Mutates self.nodes to combine nodes into FusedSchedulerNodes. 
+ + This relies on two key functions to control the logic: + - self.can_fuse(): checks if a fusion is legal + - self.score_fusion(): assigns priority to a given fusion + """ + fused_nodes = set(self.nodes) + for node1, node2 in self.get_possible_fusions(): + node1 = self.name_to_fused_node[node1.get_first_name()] + node2 = self.name_to_fused_node[node2.get_first_name()] + if self.can_fuse(node1, node2) and not self.will_fusion_create_cycle( + node1, node2 + ): + if not self.speedup_by_fusion(node1, node2): + continue + fusion_log.debug( + "fusing %s with %s", node1.get_name(), node2.get_name() + ) + + # above can_fuse asserts that node2 has the same device + device = node1.get_device() + node3 = self.get_backend(device).fuse(node1, node2) + fused_nodes.remove(node1) + fused_nodes.remove(node2) + fused_nodes.add(node3) + self.name_to_fused_node.update( + {n.get_name(): node3 for n in node3.get_nodes()} + ) + self.nodes = sorted(fused_nodes, key=lambda x: x.min_order) + self.topological_sort_schedule() + self.prune_redundant_deps() + + def prune_redundant_deps(self): + for node in self.nodes: + node.prune_redundant_deps(self.name_to_fused_node) + + def get_possible_fusions(self): + """ + Helper to find all legal fusion opportunities, sorted by self.score_fusion() + """ + possible_fusions = [] + seen = set() + + def check_all_pairs(nodes): + for node1_index, node1 in enumerate(nodes): + for node2 in nodes[node1_index + 1 :]: + key = (node1, node2) + if key in seen: + continue + seen.add(key) + + if self.can_fuse(node1, node2): + possible_fusions.append(key) + elif (node2.is_template() or node2.is_foreach()) and self.can_fuse( + node2, node1 + ): + # foreach fusions and epilogue fusions are order dependent + possible_fusions.append((node2, node1)) + + buffer_names_grouping = collections.defaultdict(list) + for node in self.nodes: + for buf in node.used_buffer_names(): + buffer_names_grouping[buf].append(node) + for node_grouping in buffer_names_grouping.values(): + check_all_pairs(node_grouping) + + if config.aggressive_fusion: + group_grouping = collections.defaultdict(list) + for node in self.nodes: + group = getattr(node, "group", None) + if group: + group_grouping[group].append(node) + for node_grouping in group_grouping.values(): + check_all_pairs(node_grouping) + + possible_fusions.sort(key=self.score_fusion_key, reverse=True) + fusion_log.debug("found %d possible fusions", len(possible_fusions)) + return possible_fusions + + def will_fusion_create_cycle(self, node1, node2): + """ + Finds whether there's a path from node1 to node2 (or vice-versa) + caused indirectly by other fusions. + """ + + def found_path(node): + # only fused nodes can introduce new ancestors. + if isinstance(node, FusedSchedulerNode) and node not in visited: + visited.add(node) + if node.get_names().issubset(combined_ancestors): + # All fusion outputs are in ancestors of node1 and node2, thus + # cannot introduce new path: + # + # 1. if output is neither descendent of node1 or node2, the + # output cannot introduce a path + # 2. due to [can_fuse]: if WLOG output is descendent of node1, it cannot be + # on path(node1->node2), hence it cannot be ancestor of node2 + # 3. 
due to [acyclic]: if WLOG output is descendent of node1, it cannot be + # ancestor of node1 + return False + else: + # continue DFS of new ancestors introduced by the fusion + return bool(combined_names & node.ancestors) or any( + found_path(self.name_to_fused_node[n]) + for n in node.ancestors - combined_ancestors + ) + return False + + visited = set() + combined_names = node1.get_names() | node2.get_names() + combined_ancestors = (node1.ancestors | node2.ancestors) - combined_names + cycle = any(found_path(self.name_to_fused_node[n]) for n in combined_ancestors) + if cycle: + WhyNoFuse(node1, node2)("will create cycle") + return cycle + + def can_fusion_increase_peak_memory( + self, node1: BaseSchedulerNode, node2: BaseSchedulerNode + ): + """ + This function prevents fusion for nodes that can increase memory + footprint. This problem is more common in horizontal fusion, where nodes + that are far apart in the original order get fused, lengthening the live + intervals of tensors. This is very evident in models with activation + checkpointing, where the recomputed nodes from different checkpointed + regions get fused and significantly increase the memory footprint. + + The current attempt is a quick, possibly hacky, heuristic to prevent the + fusion of nodes that are far away in the original order. + + A better but difficult to implement heurisitic would be to use live + intervals of the buffers, find region of peak pressure in the original + program and prevent fusion that crosses that peak region. We might need + special care or good approximation in this implementation, as fusion of + node changes live intervals, and re-computing live intervals and peak + memory after each fusion can introduce large compilation overhead. + """ + proximity_score = max( + abs(node1.min_order - node2.max_order), + abs(node2.min_order - node1.max_order), + ) + return proximity_score > 64 + + def can_fuse(self, node1: BaseSchedulerNode, node2: BaseSchedulerNode): + """ + Determine if it is possible to combine node1 and node2 into a + single fused node. + """ + + if node1 is node2: + return False + + why = WhyNoFuse(node1, node2) + + if ( + isinstance(node1, (ExternKernelSchedulerNode, NopKernelSchedulerNode)) + and not node1.is_template() + ): + why("node1 is extern or nop") + return False + if ( + isinstance(node2, (ExternKernelSchedulerNode, NopKernelSchedulerNode)) + and not node2.is_template() + ): + why("node2 is extern or nop") + return False + + if node2.get_names() & node1.ancestors: + why("node1 must go before node2") + return False + + if ( + isinstance(node1, (FusedSchedulerNode, SchedulerNode)) + and isinstance(node2, SchedulerNode) + and isinstance(node2._body, ir.LoopBody) + ): + # Fix issue: https://github.com/pytorch/pytorch/issues/108963 + # Check: + # If node2 reads a buf which is a mutation buf of node1(SchedulerNode) or among nodes in node1(FusedSchedulerNode), + # we will get the corresponding mutation buf and check if this mutation buf is stored by atomic_add mode. + # If True, we will disable the fusion of node1 and node2. 
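+            # NOTE (editorial comment, not part of the upstream source): hypothetical
+            # example: if node1 stores into buf0 with mode="atomic_add" and node2 reads
+            # a name that mutation_renames resolves to buf0, the check below refuses
+            # the fusion (see the issue referenced above for the motivating bug).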
+ if any( + ( + node2_used_buf in self.mutation_renames + and node1.has_atomic_add(self.mutation_renames[node2_used_buf]) + ) + for node2_used_buf in node2._body.reads_name2expr.keys() + ): + return False + + if node2.is_template(): + why("templates can only fuse epilogues") + return False + if node1.is_template() and ( + node2.has_aliasing_or_mutation() + or node2.is_reduction() + or not config.epilogue_fusion + ): + why("template epilogue not satisfied") + return False + + device = node1.get_device() + device2 = node2.get_device() + if device != device2: + why("device mismatch (%s vs %s)", device, device2) + return False + del device2 + + no_shared_data = self.score_fusion_memory(node1, node2) == 0 + if no_shared_data and ( + not config.aggressive_fusion or node1.is_reduction() or node2.is_reduction() + ): + why("no shared data") + return False # heuristic not needed for correctness + + if ( + not node1.is_foreach() + and not node2.is_foreach() + and len(node1.get_nodes()) + len(node2.get_nodes()) > config.max_fusion_size + ): + why("exceeds max fusion") + return False # heuristic not needed for correctness + + if node1.get_names() & node2.ancestors: + # node2 depends on node1 outputs + if not self.can_fuse_vertical(node1, node2): + return False + return self.get_backend(device).can_fuse_vertical(node1, node2) + else: # nodes don't depend on each other, but may have common reads + if self.can_fusion_increase_peak_memory(node1, node2): + why("will increase peak memory") + return False + return self.get_backend(device).can_fuse_horizontal(node1, node2) + + def can_fuse_vertical(self, node1, node2): + """ + Check if it is legal to fuse a consumer (node2) into a producer (node1). + + We can fuse them if all the reads of node2 either match + corresponding writes in node1, or are written by nodes that can + be scheduled before the fusion of node1 and node2. + + We also disable fusion of a write subsequent to a read if the reads + and writes do not align. + """ + node1_names = node1.get_names() + computed_deps = set() + why = WhyNoFuse(node1, node2) + + # StarDep doesn't match MemoryDep, different indices don't match + # However, broadcasting sometimes strips dimensions, and if that's the case + # we still can match unmet dep + # if there's indirect indexing, don't match it + def fusable_read_and_write(read: Dep, write: Dep): + return ( + self.mutation_renames.get(read.name, read.name) == write.name + and (isinstance(read, MemoryDep) and isinstance(write, MemoryDep)) + and not free_symbol_has(read.index, "tmp") + and not free_symbol_has(write.index, "tmp") + and read.index == write.index + and len(read.size) >= len(write.size) + and read.size[: len(write.size)] == write.size + ) + + for rd in node2.unmet_dependencies: + for cd in node1.read_writes.writes: + if fusable_read_and_write(rd, cd): + computed_deps.add(rd) + + remaining_deps = {dep.name for dep in node2.unmet_dependencies - computed_deps} + if remaining_deps & node1_names: + # MemoryDeps didn't match and read different locations of the same buffer. 
+ # Examples here include: + # - MemoryDep("foo", x) != MemoryDep("foo", x + 1) + # - MemoryDep("foo", x) != StarDep("foo") + why("memory deps did not match") + return False + for name in remaining_deps: + if node1_names & self.name_to_fused_node[name].ancestors: + why("intermediate nodes between node1 & node2") + return False + + # similar to can_inplace, if we are going to fuse a write subsequent to a read + # require that the indexing and size is the same + for write in node2.read_writes.writes: + for read in node1.read_writes.reads: + if write.name != self.mutation_renames.get(read.name, read.name): + continue + + # bail on StarDep + if not fusable_read_and_write(read=read, write=write): + why("fusing a write into a read with different indexing formula") + return False + + return True + + def score_fusion(self, node1: BaseSchedulerNode, node2: BaseSchedulerNode): + """ + Assign a score (higher comes first) to the fusion of node1 + and node2. When different fusions conflict with each other, + this is the way we decide what order to run them in. + + Our current score is based on: + - Estimate of the saved memory operations + - Fusions closer together in original order + """ + memory_score = self.score_fusion_memory(node1, node2) + proximity_score = -max( + abs(node1.min_order - node2.max_order), + abs(node2.min_order - node1.max_order), + ) + return ( + node1.is_template() == config.epilogue_fusion_first and memory_score > 0, + node1.is_reduction() == node2.is_reduction() and memory_score > 0, + memory_score, + proximity_score, + ) + + def score_fusion_memory(self, node1, node2): + """ + The first term in our fusion score that estimates number of saved memory operations. + """ + common_memory_deps = (node1.read_writes.reads | node1.read_writes.writes) & ( + node2.read_writes.reads | node2.read_writes.writes + ) + common_memory_deps = { + dep for dep in common_memory_deps if not dep.has_unbacked_symbols() + } + return sum(dep.numbytes_hint() for dep in common_memory_deps) + + def score_fusion_key(self, nodes): + """ + Shim for list.sort(key=...) + """ + node1, node2 = nodes + return self.score_fusion(node1, node2) + + def compute_last_usage(self): + """ + Populate node.last_usage recursively (also for the nodes within a FusedSchedulerNode) + """ + + future_used_buffers = set() + for node_name in V.graph.get_output_names(): + future_used_buffers.add(node_name) + + for node in reversed(self.nodes): + node.set_last_usage(future_used_buffers, self.mutation_real_name) + future_used_buffers.update(node.last_usage) + + def free_buffers(self): + """Free any buffers that are no longer needed""" + for name in sorted( + self.buffer_names_to_free + - V.graph.removed_buffers + - V.graph.wrapper_code.freed + ): + if name in self.name_to_node: + node = self.name_to_node[name] + if node.can_free(): + V.graph.wrapper_code.codegen_free(node.node) + elif name in V.graph.graph_inputs: + storage = V.graph.graph_inputs[name].data + assert isinstance(storage, ir.StorageBox) and storage.is_input_buffer() + V.graph.wrapper_code.codegen_free(storage.data) + + self.buffer_names_to_free.clear() + + def remove_kernel_local_buffers(self): + """ + Any buffers that are both created and have a last use in the + same kernel can be removed. 
+ """ + + # V.kernel.store_buffer_names should represent the set of nodes + # get fused + fused_node_names = V.kernel.store_buffer_names + names_to_remove = [] + for out_buf in V.kernel.store_buffer_names: + users = self.name_to_node[out_buf].users + assert users is not None + users = {user.get_name() for user in users if not user.is_weak} + if users.issubset(fused_node_names): + names_to_remove.append(out_buf) + + def remove_filter(n): + return ( + n not in V.kernel.must_keep_buffers + and n not in V.kernel.args.input_buffers + and n not in self.mutation_renames + and n not in self.mutation_real_name + ) + + names_to_remove = list(filter(remove_filter, names_to_remove)) + + for name in names_to_remove: + if name in V.kernel.args.inplace_buffers: + buf = V.kernel.args.inplace_buffers[name] + if isinstance(buf, str) and buf.startswith("REMOVED"): + continue + remove = all(n in names_to_remove for n in buf.other_names) + if remove: + self.remove_inplace_buffer(name) + V.kernel.inplaced_to_remove.add(name) + else: + self.remove_buffer(name) + + def remove_buffer(self, name): + # Assign a special value instead of deleting the entry + # because we still rely on output_buffers's length to + # generate unique arg name. + log.debug("remove_buffer(%r)", name) + V.kernel.args.output_buffers[name] = "REMOVED" + V.kernel.removed_buffers.add(name) + + def remove_inplace_buffer(self, name): + log.debug("removing_inplace_buffer(%r)", name) + inner_name = V.kernel.args.inplace_buffers[name].inner_name + V.kernel.args.inplace_buffers[name] = inner_name.replace( + "in_out_ptr", "REMOVED" + ) + V.kernel.removed_buffers.add(name) + + def flush(self): + for backend in self.backends.values(): + backend.flush() + self.free_buffers() + + def codegen_extern_call(self, scheduler_node: ExternKernelSchedulerNode): + assert isinstance(scheduler_node, ExternKernelSchedulerNode) + # 'decide_inplace_update' stores the inplace update decisions in + # the current kernel from where 'allocate' retrieve those decisions. + # We have to make sure there is a non-NULL kernel handler to store + # those inplace update decisions. + with V.set_kernel_handler(Kernel(increase_kernel_count=False)): + scheduler_node.decide_inplace_update() + scheduler_node.allocate() + node = scheduler_node.node + assert isinstance(node, ir.ExternKernel), f"{type(node)=}" + node.codegen(V.graph.wrapper_code) + self.free_buffers() + + def create_backend(self, device: torch.device): + assert ( + device.type != "cuda" or device.index is not None + ), f"{device} should have been normalized in lowering" + V.graph.add_device_info(device) + + device_scheduling = get_scheduling_for_device(device.type) + if device_scheduling is None: + raise RuntimeError(f"Unsupported device type: {device.type}") + + if device.type == "cuda" and not has_triton(): + device_props = torch.cuda.get_device_properties(device) + if device_props.major < 7: + raise RuntimeError( + f"Found {device_props.name} which is too old to be supported by the triton GPU compiler, which is used as the backend. Triton only supports devices of CUDA Capability >= 7.0, but your device is of CUDA capability {device_props.major}.{device_props.minor}" # noqa: B950 + ) + else: + raise RuntimeError( + "Cannot find a working triton installation. 
More information on installing Triton can be found at https://github.com/openai/triton" # noqa: B950 + ) + + return device_scheduling(self) + + def get_backend(self, device: torch.device): + if device not in self.backends: + self.backends[device] = self.create_backend(device) + return self.backends[device] + + def enter_context(self, node): + def get_order(n): + if n not in self.origin_to_index: + self.origin_to_index.update({n: i for i, n in enumerate(n.graph.nodes)}) + return self.origin_to_index[n] + + # Use a dict to have ordering + origins = { + (get_order(e), e): None for n in node.get_nodes() for e in n.node.origins + } + origins = list(origins.keys()) + if origins: + _, last = max(origins, key=operator.itemgetter(0)) + V.graph.wrapper_code.enter_context(last) + + @dynamo_timed + def codegen(self): + for node in self.nodes: + try: + log.debug( + "Generating code for node %s with estimated runtime %f", + node.get_name(), + node.get_estimated_runtime(), + ) + except Exception as e: + log.debug( + "Generating code for node %s with estimated runtime 0.0", + node.get_name(), + ) + + self.enter_context(node) + + if not isinstance(node, NopKernelSchedulerNode): + device = node.get_device() + if ( + device != self.current_device + or node.is_extern() + or node.is_template() + ): + self.flush() + if device != self.current_device: + if device.type == "cuda": + if self.current_device and self.current_device.type == "cuda": + V.graph.wrapper_code.codegen_device_guard_exit() + assert device.index is not None, "device should have an index" + V.graph.wrapper_code.codegen_device_guard_enter(device.index) + elif self.current_device and self.current_device.type == "cuda": + V.graph.wrapper_code.codegen_device_guard_exit() + self.current_device = device + + self.buffer_names_to_free.update(node.last_usage) + + if node.is_template(): + node, *epilogue = node.get_nodes() + self.get_backend(device).codegen_template(node, epilogue) # type: ignore[possibly-undefined] + elif node.is_extern(): + self.codegen_extern_call(node) + elif node.is_foreach(): + self.get_backend(device).codegen_foreach(node) # type: ignore[possibly-undefined] + elif isinstance(node, (FusedSchedulerNode, SchedulerNode)): + self.get_backend(device).codegen_nodes(node.get_nodes()) # type: ignore[possibly-undefined] + else: + assert isinstance(node, NopKernelSchedulerNode) + node.allocate() + + if config.debug_check_inf_and_nan: + V.graph.wrapper_code.generate_inf_and_nan_checker(node) + + if config.triton.debug_sync_kernel: + self.get_backend(device).codegen_sync() # type: ignore[possibly-undefined] + + self.available_buffer_names.update(node.get_names()) + + if not isinstance(node, NopKernelSchedulerNode): + device = node.get_device() + if self.get_backend(device).ready_to_flush(): + self.flush() + + if self.current_device and self.current_device.type == "cuda": + # exit the outermost CUDA device guard. this is + # important for nested indentation codegen-ing. 
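+            # NOTE (editorial comment, not part of the upstream source): this closes
+            # the device guard opened by codegen_device_guard_enter() when the first
+            # CUDA node was emitted, keeping the generated wrapper's indentation
+            # balanced before the final flush().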
+ V.graph.wrapper_code.codegen_device_guard_exit() + + self.flush() + + def is_unaligned_buffer(self, buf_name): + if buf_name in V.graph.graph_inputs or buf_name in V.graph.constants: + # all graph inputs or constants are assumed to be aligned + return False + node = self.name_to_node[buf_name] + layout = node.node.get_layout() + if isinstance(layout, ir.AliasedLayout): + return not layout.maybe_guard_aligned() + else: + return False + + +class BaseScheduling: + def can_fuse_vertical(self, node1: BaseSchedulerNode, node2: BaseSchedulerNode): + """ + Check whether node1 and node2 can be vertically fused or not. + """ + raise NotImplementedError() + + def can_fuse_horizontal(self, node1: BaseSchedulerNode, node2: BaseSchedulerNode): + """ + Check whether node1 and node2 can be horizontally fused or not. + """ + raise NotImplementedError() + + def fuse(self, node1: BaseSchedulerNode, node2: BaseSchedulerNode): + """ + Fuse two nodes + """ + if node1.is_foreach() or node2.is_foreach(): + return ForeachKernelSchedulerNode.fuse(node1, node2) + else: + return FusedSchedulerNode.fuse(node1, node2) + + def group_fn(self, sizes): + """ + Process the iteration sizes in case a transformation needs to be applied. + """ + raise NotImplementedError() + + def codegen_template( + self, template_node: SchedulerNode, epilogue_nodes: List[SchedulerNode] + ): + """ + Given a template node, generate a kernel. + + This function is only available for triton now. If the third-party backend behaves as a sub-class + of TritonScheduling, it can override it or reuse it. + """ + raise NotImplementedError() + + def codegen_nodes(self, nodes: List[SchedulerNode]): + """ + Generate a kernel given a list of pre-fused nodes. + """ + raise NotImplementedError() + + def codegen_sync(self): + """ + Generate synchronization code for the kernel. This method depends on the hardware characteristics. + """ + raise NotImplementedError() + + def ready_to_flush(self) -> bool: + """ + Check whether the backend is requesting the scheduler to flush the generated kernel. + If not supported, please return False. + """ + return False + + def flush(self): + """ + Flush the generated kernel and python wrapper code to the source code file. + """ + raise NotImplementedError() + + def benchmark_fused_nodes(self, nodes): + """ + Benchmark fused list of nodes and return the execution time + in milliseconds on randomly generated inputs. + """ + raise NotImplementedError() diff --git a/MLPY/Lib/site-packages/torch/_inductor/select_algorithm.py b/MLPY/Lib/site-packages/torch/_inductor/select_algorithm.py new file mode 100644 index 0000000000000000000000000000000000000000..5865c3d9d0b9a04b509f49e1f562296928352c05 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/select_algorithm.py @@ -0,0 +1,1156 @@ +import builtins +import functools +import inspect +import itertools +import logging +import operator +import sys +import textwrap +import time +from concurrent.futures import ThreadPoolExecutor +from io import StringIO + +from typing import Any, Callable, Dict, List, Optional, Union +from unittest.mock import patch + +import sympy + +import torch +from torch._dynamo.testing import rand_strided +from torch._dynamo.utils import counters, identity, preserve_rng_state + +from . 
import config, ir +from .autotune_process import TensorMeta, TritonBenchmarkRequest +from .codecache import code_hash, PersistentCache, PyCodeCache +from .codegen.common import ( + ChoiceCaller, + IndentedBuffer, + KernelTemplate, + PrimitiveInfoType, +) +from .codegen.triton import ( + gen_common_triton_imports, + texpr, + TritonKernel, + TritonPrinter, + TritonScheduling, +) +from .codegen.triton_utils import config_of, signature_to_meta +from .exc import CUDACompileError +from .utils import ( + do_bench, + get_dtype_size, + Placeholder, + sympy_dot, + sympy_product, + unique, +) +from .virtualized import V + +log = logging.getLogger(__name__) + +# correctness checks struggle with fp16/tf32 +VERIFY: Dict[str, Any] = dict() +PRINT_AUTOTUNE = True +DEBUG = False + + +class KernelNamespace: + pass + + +# these objects are imported from the generated wrapper code +extern_kernels = KernelNamespace() + + +class PartialRender: + """ + Some parts of a template need to be generated at the end, but + inserted into the template at the start. This allows doing a bunch + of replacements after the initial render. + """ + + def __init__(self, code, replacement_hooks): + super().__init__() + self.code = code + self.replacement_hooks = replacement_hooks + + def finalize(self): + code = self.code + assert code is not None, "can only be called once" + self.code = None + for key, fn in self.replacement_hooks.items(): + code = code.replace(key, fn()) + return code + + +class TritonTemplateKernel(TritonKernel): + def __init__( + self, + kernel_name, + input_nodes, + output_node, + defines, + num_stages, + num_warps, + grid_fn, + meta, + call_sizes, + use_jit=True, + prefix_args=0, + suffix_args=0, + epilogue_fn=identity, + *, + index_dtype, + ): + super().__init__( + sympy_product(output_node.get_size()), + sympy.Integer(1), + index_dtype=index_dtype, + ) + self.input_nodes = input_nodes + self.output_node = output_node + self.named_input_nodes = {} + self.defines = defines + self.kernel_name = kernel_name + self.template_mask = None + self.use_jit = use_jit + self.num_stages = num_stages + self.num_warps = num_warps + self.grid_fn = grid_fn + self.meta = meta + self.call_sizes = call_sizes + # for templates with fixed epilogues + self.prefix_args = prefix_args + self.suffix_args = suffix_args + self.epilogue_fn = epilogue_fn + self.render_hooks = dict() + self.triton_meta: Optional[Dict[str, object]] = None + + def need_numel_args(self): + return False + + def estimate_kernel_num_bytes(self): + """ + Estimate the total number of bytes this kernel takes. + For in/out nodes, sizes are counted twice: once for reading and + once for writing. 
+ """ + ninplace_args = len(unique(self.args.inplace_buffers.values())) + num_bytes = [] + for i, inp in enumerate(itertools.chain(self.input_nodes, (self.output_node,))): + size = V.graph.sizevars.size_hints(inp.get_size()) + numel = functools.reduce(operator.mul, size) + dtype_size = get_dtype_size(inp.get_dtype()) + num_bytes.append(numel * dtype_size * (1 + int(i < ninplace_args))) + return sum(num_bytes) + + def jit_lines(self): + if self.use_jit: + return "@triton.jit" + + argdefs, _, signature = self.args.python_argdefs() + triton_meta = { + "signature": signature_to_meta(signature, size_dtype=self.index_dtype), + "device": V.graph.scheduler.current_device.index, + "device_type": V.graph.scheduler.current_device.type, + "constants": {}, + } + triton_meta["configs"] = [config_of(signature)] + for arg_num in triton_meta["configs"][0].equal_to_1: # type: ignore[index] + triton_meta["constants"][arg_num] = 1 # type: ignore[index] + self.triton_meta = triton_meta + + inductor_meta = { + "kernel_name": str(Placeholder.DESCRIPTIVE_NAME), + "backend_hash": torch.utils._triton.triton_hash_with_backend(), + } + if config.profile_bandwidth or config.benchmark_kernel: + num_gb = self.estimate_kernel_num_bytes() / 1e9 + inductor_meta["kernel_num_gb"] = num_gb + return f""" + @triton_heuristics.template( + num_stages={self.num_stages}, + num_warps={self.num_warps}, + triton_meta={triton_meta!r}, + inductor_meta={inductor_meta!r}, + ) + @triton.jit + """ + + def def_kernel(self, *argnames): + """ + Hook called from template code to generate function def and + needed args. + """ + assert all(isinstance(x, str) for x in argnames) + renames = IndentedBuffer(initial_indent=1) + + named_args = self.input_nodes[ + self.prefix_args : len(self.input_nodes) - self.suffix_args + ] + + assert len(argnames) == len(named_args), ( + len(argnames), + len(named_args), + self.prefix_args, + len(self.input_nodes), + ) + + for input_node in self.input_nodes[: self.prefix_args]: + # get args in correct order + self.args.input(input_node.get_name()) + + for name, input_node in zip(argnames, named_args): + arg_name = f"arg_{name}" + self.named_input_nodes[name] = input_node + self.args.input_buffers[input_node.get_name()] = arg_name + + # The args may be duplicated, so renaming must be after args are de-duplicated. + for name in argnames: + input_node = self.named_input_nodes[name] + arg_name = self.args.input_buffers[input_node.get_name()] + if input_node.get_layout().offset == 0: + renames.writeline(f"{name} = {arg_name}") + else: + offset = texpr(self.rename_indexing(input_node.get_layout().offset)) + renames.writeline(f"{name} = {arg_name} + {offset}") + + for input_node in self.input_nodes[len(self.input_nodes) - self.suffix_args :]: + # get args in correct order + self.args.input(input_node.get_name()) + + def hook(): + # python_argdefs() cannot be run until after the rest of the template lazily adds more args + arg_defs, *_ = self.args.python_argdefs() + code = IndentedBuffer() + code.splice(gen_common_triton_imports()) + code.splice(self.jit_lines()) + code.writeline(f"def {self.kernel_name}({', '.join(arg_defs)}):") + with code.indent(): + code.splice(self.defines) + code.splice(renames.getvalue()) + return code.getvalue() + + assert "" not in self.render_hooks + self.render_hooks[""] = hook + return "" + + def size(self, name: str, index: int): + """ + Hook called from template code to get the size of an arg. + Will add needed args to pass it in if it is dynamic. 
+ """ + assert isinstance(index, int) + if name is None: + val = self.output_node.get_size()[index] + else: + assert isinstance(name, str) + val = self.named_input_nodes[name].get_size()[index] + return texpr(self.rename_indexing(val)) + + def stride(self, name, index): + """ + Hook called from template code to get the stride of an arg. + Will add needed args to pass it in if it is dynamic. + """ + assert isinstance(index, int) + if name is None: + val = self.output_node.get_stride()[index] + else: + assert isinstance(name, str) + val = self.named_input_nodes[name].get_stride()[index] + return texpr(self.rename_indexing(val)) + + def store_output(self, indices, val, mask): + """ + Hook called from template code to store the final output + (if the buffer hasn't been optimized away), then append any + epilogue fusions. + """ + assert isinstance(indices, (list, tuple)) + assert isinstance(val, str) + assert isinstance(mask, str) + assert self.template_mask is None + indices = list(map(TritonPrinter.paren, indices)) + index_symbols = [sympy.Symbol(x) for x in indices] + lengths = [V.graph.sizevars.simplify(s) for s in self.output_node.get_size()] + assert len(indices) == len(lengths) + + # glue to make generated code use same indexing from template + for name, range_tree_entry in zip( + indices, self.range_trees[0].construct_entries(lengths) + ): + range_tree_entry.set_name(name) + contiguous_index = sympy_dot( + ir.FlexibleLayout.contiguous_strides(lengths), index_symbols + ) + contiguous_index = self.rename_indexing(contiguous_index) + self.body.writeline("xindex = " + texpr(contiguous_index)) + self.range_trees[0].lookup(sympy.Integer(1), sympy_product(lengths)).set_name( + "xindex" + ) + self.template_mask = mask + self.template_indices = indices + output_index = self.output_node.get_layout().make_indexer()(index_symbols) + output_index = self.rename_indexing(output_index) + if output_index == contiguous_index: + output_index = sympy.Symbol("xindex") + + epilogue_args = [val] + for input_node in itertools.chain( + self.input_nodes[: self.prefix_args], + self.input_nodes[len(self.input_nodes) - self.suffix_args :], + ): + input_node.freeze_layout() + epilogue_args.append(input_node.make_loader()(index_symbols)) + + V.ops.store( + self.output_node.get_name(), + output_index, + self.epilogue_fn(*epilogue_args), + ) + self.codegen_body() + + def hook(): + # more stuff might have been added since the codegen_body above + self.codegen_body() + return textwrap.indent(self.body.getvalue(), " ").strip() + + assert "" not in self.render_hooks + self.render_hooks[""] = hook + return "" + + def render(self, template, kwargs): + return PartialRender( + template.render(**self.template_env(), **kwargs), + self.render_hooks, + ) + + def make_load(self, name, indices, mask): + """ + Optional helper called from template code to generate the code + needed to load from an tensor. + """ + assert isinstance(indices, (list, tuple)) + assert isinstance(name, str) + assert isinstance(mask, str) + stride = self.named_input_nodes[name].get_stride() + indices = list(map(TritonPrinter.paren, indices)) + assert len(indices) == len(stride) + index = " + ".join( + f"{texpr(self.rename_indexing(s))} * {i}" for s, i in zip(stride, indices) + ) + return f"tl.load({name} + ({index}), {mask})" + + def template_env(self): + """ + Generate the namespace visible in the template. 
+ """ + return { + fn.__name__: fn + for fn in [ + self.def_kernel, + self.size, + self.stride, + self.store_output, + self.make_load, + ] + } + + def indexing( + self, + index: sympy.Expr, + *, + dense_indexing=False, + copy_shape=None, + override_mask=None, + block_ptr=False, + ): + """ + Override the default indexing to use our custom mask and force + dense indexing. + """ + return super().indexing( + index, + dense_indexing=False, + copy_shape=self.template_mask, + override_mask=self.template_mask, + block_ptr=block_ptr, + ) + + def initialize_range_tree(self, pid_cache): + super().initialize_range_tree(pid_cache) + # ignore default codegen + self.body.clear() + self.indexing_code.clear() + + def call_kernel(self, name: str, node: Optional[ir.IRNode] = None): + wrapper = V.graph.wrapper_code + _, call_args, _ = self.args.python_argdefs() + call_args = [str(a) for a in call_args] + + for i in range(len(call_args)): + if V.graph.is_unspec_arg(call_args[i]): + call_args[i] = call_args[i] + ".item()" + if isinstance(call_args[i], sympy.Symbol): + call_args[i] = texpr(call_args[i]) + + if V.graph.cpp_wrapper: + # In the cpp_wrapper case, we have to compute CUDA launch grid at runtime + # if any dynamic dimension is involved. We rely on the Python version + # of the grid function to generate those grid configs, which may contain + # symbolic values. The wrapper will use cexpr to print out C++ code + # appropriately for the grid configs. + grid_args = [V.graph.sizevars.simplify(s) for s in self.call_sizes] + [ + self.meta + ] + grid = self.grid_fn(*grid_args) + + wrapper.generate_kernel_call( + name, + call_args, + device_index=V.graph.scheduler.current_device.index, + grid=grid, + triton_meta=self.triton_meta, + ) + else: + stream_name = wrapper.write_get_raw_stream( + V.graph.scheduler.current_device.index + ) + + wrapper.add_import_once(f"import {self.grid_fn.__module__}") + meta = wrapper.add_meta_once(self.meta) + + grid_call = [ + texpr(V.graph.sizevars.simplify(s)) for s in self.call_sizes + ] + [meta] + grid_call = f"{self.grid_fn.__module__}.{self.grid_fn.__name__}({', '.join(grid_call)})" + wrapper.writeline( + f"{name}.run({', '.join(call_args)}, grid={grid_call}, stream={stream_name})" + ) + + +@functools.lru_cache(None) +def _jinja2_env(): + try: + import jinja2 + + return jinja2.Environment( + undefined=jinja2.StrictUndefined, + ) + except ImportError: + return None + + +class TritonTemplate(KernelTemplate): + index_counter = itertools.count() + all_templates: Dict[str, "TritonTemplate"] = dict() + + def __init__(self, name: str, grid: Any, source: str, debug=False): + super().__init__(name) + self.grid = grid + self.template = self._template_from_string(source) + assert name not in self.all_templates, "duplicate template name" + self.all_templates[name] = self + self.debug = debug + + def generate( + self, + input_nodes, + layout, + num_stages, + num_warps, + prefix_args=0, + suffix_args=0, + epilogue_fn=identity, + **kwargs, + ): + assert self.template, "requires jinja2" + defines = StringIO() + for name, val in kwargs.items(): + defines.write(f" {name} : tl.constexpr = {val}\n") + defines = defines.getvalue() + + fake_out = ir.Buffer("buf_out", layout) + kernel_name = f"triton_{self.name}" + + numel = sympy_product(layout.size) + buffers = itertools.chain(input_nodes, (fake_out,)) + if not TritonScheduling.can_use_32bit_indexing(numel, buffers): + raise NotImplementedError( + "64-bit indexing is not yet implemented for triton templates" + ) + + kernel_options = dict( + 
input_nodes=input_nodes, + defines=defines, + num_stages=num_stages, + num_warps=num_warps, + grid_fn=self.grid, + meta=kwargs, + call_sizes=layout.size, + prefix_args=prefix_args, + suffix_args=suffix_args, + epilogue_fn=epilogue_fn, + index_dtype="tl.int32", + ) + with patch.object( + V.graph, "get_dtype", self._fake_get_dtype(fake_out) + ), TritonTemplateKernel( + kernel_name=kernel_name, + output_node=fake_out, + use_jit=True, + **kernel_options, + ) as kernel: + try: + code = kernel.render(self.template, kwargs).finalize() + except ZeroDivisionError: + # TODO(nmacchioni): fix sympy division by zero + return None + if self.debug: + print("Generated Code:\n", code) + extra = ( + "-".join( + [ + *[ + f"{kwarg}={repr(kwargs[kwarg])}" + for kwarg in sorted(kwargs.keys()) + ], + f"num_stages={num_stages}", + f"num_warps={num_warps}", + ] + ) + + "-" + ) + mod = PyCodeCache.load(code, extra) + _, call_args, _ = kernel.args.python_argdefs() + + expected_args = list(unique(x.get_name() for x in input_nodes)) + expected_args.extend([fake_out.get_name()]) + assert list(call_args)[: len(expected_args)] == expected_args, ( + call_args, + expected_args, + ) + extra_args = V.graph.sizevars.size_hints( + map(sympy.expand, call_args[len(expected_args) :]), + fallback=config.unbacked_symint_fallback, + ) + + kernel_hash_name = f"triton_{self.name}_{next(self.index_counter)}" + + def make_kernel_render(out_node): + kernel = TritonTemplateKernel( + kernel_name=str(Placeholder.KERNEL_NAME), + output_node=out_node, + use_jit=False, + **kernel_options, + ) + render = functools.partial( + kernel.render, + self.template, + kwargs, + ) + return kernel, render + + # create the BenchmarkRequest + assert mod.__file__ is not None + grid = self.grid( + *V.graph.sizevars.size_hints( + layout.size, + fallback=config.unbacked_symint_fallback, + ), + kwargs, + ) + bmreq = TritonBenchmarkRequest( + module_path=mod.__file__, + module_cache_key=mod.key, + kernel_name=kernel_name, + grid=grid, + extra_args=extra_args, + num_stages=num_stages, + num_warps=num_warps, + matrix_instr_nonkdim=kwargs.get("matrix_instr_nonkdim", 0), + input_tensor_meta=TensorMeta.from_irnodes(input_nodes), + output_tensor_meta=TensorMeta.from_irnodes(layout), + ) + + return TritonTemplateCaller( + kernel_hash_name, + input_nodes, + layout, + make_kernel_render, + extra.strip("-").replace("-", ", "), + bmreq, + log_info={ + "tile_shape": str( + ( + kwargs.get("BLOCK_M", -1), + kwargs.get("BLOCK_K", -1), + kwargs.get("BLOCK_N", -1), + ) + ), + "num_stages": num_stages, + "num_warps": num_warps, + "allow_tf32": str(kwargs.get("ALLOW_TF32", None)), + "acc_type": str(kwargs.get("ACC_TYPE", None)), + }, + ) + + +class ExternKernelChoice: + def __init__( + self, + kernel, + cpp_kernel=None, + *, + name=None, + has_out_variant=True, + op_overload=None, + use_fallback_kernel=False, + ): + super().__init__() + name = name or kernel.__name__ + assert callable(kernel) + assert not hasattr(extern_kernels, name), "duplicate extern kernel" + self.name = name + self.cpp_kernel_name = cpp_kernel + self.has_out_variant = has_out_variant + setattr(extern_kernels, name, kernel) + self.op_overload = op_overload + self.use_fallback_kernel = use_fallback_kernel + + def to_callable(self): + return getattr(extern_kernels, self.name) + + def call_name(self): + return f"extern_kernels.{self.name}" + + @functools.lru_cache(None) + def hash_key(self): + fn = self.to_callable() + parts = [ + self.name, + getattr(fn, "__name__", ""), + getattr(fn, "__module__", ""), + ] + try: 
+ parts.append(inspect.getsource(fn)) + except Exception: + pass + return code_hash("-".join(parts)) + + def bind( + self, + input_nodes, + layout, + ordered_kwargs_for_cpp_kernel=(), + **kwargs, + ): + self.ordered_kwargs_for_cpp_kernel = ordered_kwargs_for_cpp_kernel + return ExternKernelCaller( + self, input_nodes, layout, kwargs, has_out_variant=self.has_out_variant + ) + + +class TritonTemplateCaller(ChoiceCaller): + def __init__( + self, + name, + input_nodes, + layout, + make_kernel_render, + debug_extra, + bmreq, + log_info: Optional[ + Dict[str, Union[PrimitiveInfoType, List[PrimitiveInfoType]]] + ] = None, + ): + super().__init__(name, input_nodes, layout) + self.make_kernel_render = make_kernel_render + self.debug_extra = debug_extra + self.bmreq: TritonBenchmarkRequest = bmreq + if log_info is None: + log_info = {} + self.log_info: Dict[str, Any] = log_info + self.log_info.update( + { + "backend": "Triton", + "grid": str(self.bmreq.grid), + "num_stages": self.bmreq.num_stages, + "num_warps": self.bmreq.num_warps, + } + ) + + def benchmark(self, *args, out): + assert self.bmreq is not None + return self.bmreq.benchmark(*args, output_tensor=out) + + def __str__(self): + return f"TritonTemplateCaller({self.bmreq.module_path}, {self.debug_extra})" + + def call_name(self): + return f"template_kernels.{self.name}" + + def hash_key(self): + return "-".join( + [ + self.name.rsplit("_", 1)[0], + self.bmreq.module_cache_key, + ] + ) + + def output_node(self): + return ir.TensorBox.create( + ir.TritonTemplateBuffer( + layout=self.layout, + inputs=self.input_nodes, + make_kernel_render=self.make_kernel_render, + ) + ) + + def info_dict(self) -> Dict[str, Union[PrimitiveInfoType, List[PrimitiveInfoType]]]: + """Information returned here is logged to the autotune log file when that is enabled.""" + return self.log_info + + +class ExternKernelCaller(ChoiceCaller): + def __init__( + self, + choice: ExternKernelChoice, + input_nodes, + layout, + kwargs=None, + *, + has_out_variant=True, + ): + super().__init__(choice.name, input_nodes, layout) + self.choice = choice + self.kwargs = kwargs or {} + self.has_out_variant = has_out_variant + + def __str__(self): + return f"ExternKernelCaller({self.choice.call_name()})" + + def benchmark(self, *args, out): + if self.has_out_variant: + return super().benchmark(*args, out=out) + else: + algo = self.to_callable() + out_new = algo(*args) + torch._C._dynamo.guards.assert_size_stride( + out_new, tuple(out.size()), tuple(out.stride()) + ) + out.copy_(out_new) # for correctness checking + return do_bench(lambda: algo(*args)) + + def to_callable(self): + fn = self.choice.to_callable() + if self.kwargs: + return functools.partial(fn, **self.kwargs) + else: + return fn + + def hash_key(self): + return "-".join( + [ + self.choice.name, + *[ + f"{kwarg}={repr(self.kwargs[kwarg])}" + for kwarg in sorted(self.kwargs.keys()) + ], + self.choice.hash_key(), + ] + ) + + def output_node(self): + if config.abi_compatible and self.choice.use_fallback_kernel: + assert ( + self.choice.op_overload is not None + ), "Please provide an op_overload to use ir.FallbackKernel" + inner = ir.FallbackKernel.create( + self.choice.op_overload, *self.input_nodes, **self.kwargs + ) + else: + cls = ir.ExternKernelOut if self.has_out_variant else ir.ExternKernelAlloc + inner = cls( + layout=self.layout, + inputs=self.input_nodes, + python_kernel_name=self.choice.call_name(), + cpp_kernel_name=self.choice.cpp_kernel_name, + 
ordered_kwargs_for_cpp_kernel=self.choice.ordered_kwargs_for_cpp_kernel, + op_overload=self.choice.op_overload, + kwargs=self.kwargs, + ) + + return ir.TensorBox.create(inner) + + def info_dict(self) -> Dict[str, Union[PrimitiveInfoType, List[PrimitiveInfoType]]]: + """Information returned here is logged to the autotune log file when that is enabled.""" + return { + "backend": "extern", + "kernel_call_name": self.choice.call_name(), + } + + +class ErrorFromChoice(RuntimeError): + def __init__(self, msg, choice: ChoiceCaller, inputs_str): + msg += f"\nFrom choice {choice}\n{inputs_str}" + super().__init__(msg) + self.choice = choice + + +class AlgorithmSelectorCache(PersistentCache): + def __call__( + self, + name, + choices: List[ChoiceCaller], + input_nodes, + layout, + # optional dict mapping arg indices to the functions + # generating a torch.Tensor for that input from the + # corresponding ir.Buffer. if passed for a given + # arg, the function will be called instead of + # generating a random torch.Tensor for benchmarking. + input_gen_fns: Optional[Dict[int, Callable[[ir.Buffer], torch.Tensor]]] = None, + precompilation_timeout_seconds: int = 60 * 60, + ): + from .codegen.cuda.cuda_kernel import CUDATemplateCaller + + # TODO(nmacchioni): remove once CI tests are fixed + choices = [choice for choice in choices if choice is not None] + if len(choices) == 0: + raise RuntimeError( + "No choices to select, please consider adding ATEN into max_autotune_gemm_backends " + "config (defined in torch/_inductor/config.py) to allow at least one choice. " + ) + log.debug("Max autotune selects from %s choices.", str(len(choices))) + + if len(choices) == 1: + if not isinstance(choices[0], CUDATemplateCaller): + # CUDATemplateCaller still needs to go through autotuning process to retrieve workspace size. + return choices[0].output_node() + + @functools.lru_cache(None) + def make_benchmark_fn(): + return self.make_benchmark_fn(choices, input_nodes, layout, input_gen_fns) + + def precompile(choices): + if ( + precompilation_timeout_seconds is None + or precompilation_timeout_seconds <= 0 + ): + return + num_workers = min( + config.compile_threads, + torch.get_num_threads(), + len(choices), + ) + if num_workers <= 0: + return + log.info( + "Multithreaded precompilation for %d choices using %d worker threads", + len(choices), + num_workers, + ) + with ThreadPoolExecutor(max_workers=num_workers) as executor: + futures = executor.map( + lambda c: c.precompile(), + [c for c in choices if hasattr(c, "precompile")], + timeout=precompilation_timeout_seconds, + ) + try: + iterator = iter(futures) + while True: + try: + next(iterator) + except CUDACompileError: + log.error( # noqa: G201 + "CUDA Compilation error", exc_info=True + ) + except TimeoutError: + log.warning( + f"Precompilation timed out after {precompilation_timeout_seconds} seconds." # noqa: G004 + ) + except StopIteration: + pass + executor.shutdown(wait=True) + + def autotune(choices): + try: + precompile(choices) + except TimeoutError: + log.warning( + "Precompilation phase took longer than timeout allowed. 
Continuing" + ) + pass + return make_benchmark_fn()(choices) + + if config.autotune_in_subproc: + from .autotune_process import tuning_pool + + # do the optional warmup + tuning_pool.initialize() + + autotune_start_ts = time.time() + timings = self.lookup( + choices, + name, + repr([self.key_of(x) for x in input_nodes]), + autotune, + ) + autotune_elapse = time.time() - autotune_start_ts + if timings == {} or choices[0] not in timings: + return choices[0].output_node() + + if make_benchmark_fn.cache_info().currsize: + counters["inductor"]["select_algorithm_autotune"] += 1 + if ( + make_benchmark_fn.cache_info().currsize + or log.getEffectiveLevel() == logging.DEBUG + or config.trace.log_autotuning_results + ): + self.log_results(name, input_nodes, timings, autotune_elapse) + selected_choice = builtins.min(timings, key=timings.__getitem__).output_node() + log.debug("selected choice: %s", str(selected_choice)) + return selected_choice + + @classmethod + def make_benchmark_fn( + cls, + choices, + input_nodes, + layout, + input_gen_fns=None, + ): + if input_gen_fns is None: + input_gen_fns = {} + + # de-duplicate args + unique_example_inputs = { + x.get_name(): input_gen_fns.get(i, cls.benchmark_example_value)(x) + for i, x in enumerate(input_nodes) + } + example_inputs = list(unique_example_inputs.values()) + example_inputs_extern = [ + torch.as_strided( + unique_example_inputs[input_node.get_name()], + V.graph.sizevars.size_hints( + input_node.get_size(), + fallback=config.unbacked_symint_fallback, + ), + V.graph.sizevars.size_hints( + input_node.get_stride(), + fallback=config.unbacked_symint_fallback, + ), + V.graph.sizevars.size_hint( + input_node.get_layout().offset, + fallback=config.unbacked_symint_fallback, + ), + ) + for input_node in input_nodes + ] + + out = cls.benchmark_example_value(layout) + out_extern = torch.as_strided( + out, out.size(), out.stride(), V.graph.sizevars.size_hint(layout.offset) + ) + if VERIFY: + choices[0].benchmark(*example_inputs_extern, out=out_extern) + expected = out_extern.clone() + + if DEBUG: + print(f"{len(choices)} tuning requests:") + + def debug_str(): + def tensor_repr(x): + return ( + f"torch.empty_strided({tuple(x.size())!r}, {tuple(x.stride())!r}, " + f"dtype={x.dtype!r}, device={x.device.type!r})" + ) + + lines = [ + "inputs = [", + ] + for x in example_inputs: + lines.append(f" {tensor_repr(x)},") + lines += ["]", f"out = {tensor_repr(out)}", ""] + return "\n".join(lines) + + def benchmark_choice_in_current_process(choice): + out.zero_() + if isinstance(choice, ExternKernelCaller): + # aten kernels want the offset baked in for sliced tensors + result = choice.benchmark(*example_inputs_extern, out=out_extern) + else: + # triton templates want the base pointer for sliced tensors + result = choice.benchmark(*example_inputs, out=out) + if VERIFY: + torch.testing.assert_close(out_extern, expected, **VERIFY) + torch.cuda.synchronize() # shake out any CUDA errors + return result + + def benchmark_in_current_process(choices): + timings = {} + for choice in choices: + try: + timing = benchmark_choice_in_current_process(choice) + except CUDACompileError as e: + log.warning( + "CUDA compilation error: \n%s. 
\nIgnore this choice.", str(e) + ) + timing = float("inf") + except RuntimeError as e: + msg = str(e) + if "invalid argument" in msg: + msg += "\n\nThis may mean this GPU is too small for max_autotune mode.\n\n" + log.warning(msg) + timing = float("inf") + else: + if "illegal memory access" in msg: + msg += "\n\nEither error in template or triton bug.\n" + raise ErrorFromChoice(msg, choice, debug_str()) # noqa: TRY200 + except AssertionError as e: + raise AssertionError( # noqa: TRY200 + f"Incorrect result from choice {choice}\n\n{e}" + ) + + timings[choice] = timing + + return timings + + def benchmark_in_sub_process(choices): + from . import autotune_process + + # only benchmark triton kernel in sub process for now. + # ATen/Extern kernel are still benchmarked in the current process. + extern = [c for c in choices if isinstance(c, ExternKernelCaller)] + triton = [c for c in choices if not isinstance(c, ExternKernelCaller)] + + timings = benchmark_in_current_process(extern) + timings.update(autotune_process.benchmark_in_sub_process(triton)) + return timings + + benchmark = ( + benchmark_in_sub_process + if config.autotune_in_subproc + else benchmark_in_current_process + ) + + return benchmark + + @staticmethod + def log_results( + name: str, + input_nodes: List[ir.IRNode], + timings: Dict[ChoiceCaller, float], + elapse: float, + ): + V.debug.log_autotuning_results(name, input_nodes, timings, elapse) + if not (config.max_autotune or config.max_autotune_gemm) or not PRINT_AUTOTUNE: + return + sizes = ", ".join( + [ + "x".join( + map( + str, + V.graph.sizevars.size_hints( + n.get_size(), fallback=config.unbacked_symint_fallback + ), + ) + ) + for n in input_nodes + ] + ) + n = None if log.getEffectiveLevel() == logging.DEBUG else 10 + top_k = sorted(timings, key=timings.__getitem__)[:n] + best = top_k[0] + best_time = timings[best] + sys.stderr.write(f"AUTOTUNE {name}({sizes})\n") + for choice in top_k: + result = timings[choice] + if result: + sys.stderr.write( + f" {choice.name} {result:.4f} ms {best_time/result:.1%}\n" + ) + else: + sys.stderr.write( + f" {choice.name} {result:.4f} ms \n" + ) + + autotune_type_str = ( + "SubProcess" if config.autotune_in_subproc else "SingleProcess" + ) + sys.stderr.write(f"{autotune_type_str} AUTOTUNE takes {elapse:.4f} seconds\n") + + @staticmethod + def benchmark_example_value(node): + """ + Convert an ir.Buffer into a concrete torch.Tensor we can use for + benchmarking. + """ + if isinstance(node, ir.Layout): + node = ir.Buffer("fake", node) + # triton templates want the base tensor. + if isinstance(node, ir.BaseView): + node = node.unwrap_view() + # preserve rng states to avoid the rand_strided call below changes + # the rng states for the real model code. + with preserve_rng_state(): + return rand_strided( + V.graph.sizevars.size_hints( + node.get_size(), + fallback=config.unbacked_symint_fallback, + ), + V.graph.sizevars.size_hints( + node.get_stride(), + fallback=config.unbacked_symint_fallback, + ), + device=node.get_device(), + dtype=node.get_dtype(), + extra_size=node.layout.offset, + ) + + @staticmethod + def key_of(node): + """ + Extract the pieces of an ir.Buffer that we should invalidate cached + autotuning results on. 
+ """ + sizevars = V.graph.sizevars + return ( + node.get_device().type, + str(node.get_dtype()), + *sizevars.size_hints( + node.get_size(), + fallback=config.unbacked_symint_fallback, + ), + *sizevars.size_hints( + node.get_stride(), + fallback=config.unbacked_symint_fallback, + ), + sizevars.size_hint( + node.get_layout().offset, + fallback=config.unbacked_symint_fallback, + ), + ) + + +_ALGORITHM_SELECTOR_CACHE: Optional[AlgorithmSelectorCache] = None + + +def autotune_select_algorithm(*args, **kwargs): + global _ALGORITHM_SELECTOR_CACHE + if _ALGORITHM_SELECTOR_CACHE is None: + _ALGORITHM_SELECTOR_CACHE = AlgorithmSelectorCache() + return _ALGORITHM_SELECTOR_CACHE(*args, **kwargs) + + +def realize_inputs(*args): + if len(args) == 1: + return ir.ExternKernel.require_stride1(ir.ExternKernel.realize_input(args[0])) + return [realize_inputs(x) for x in args] + + +# ensure lowering is imported so that `extern_kernels.*` is populated +from . import lowering # noqa: F401 diff --git a/MLPY/Lib/site-packages/torch/_inductor/sizevars.py b/MLPY/Lib/site-packages/torch/_inductor/sizevars.py new file mode 100644 index 0000000000000000000000000000000000000000..bb0faf55efa24630c6de37e96304ece7ca3bcf8d --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/sizevars.py @@ -0,0 +1,643 @@ +import functools +import itertools +import logging +from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union + +import sympy +from sympy import Expr + +from torch.fx.experimental.symbolic_shapes import ShapeEnv +from torch.utils._sympy.functions import FloorDiv, ModularIndexing +from torch.utils._sympy.value_ranges import bound_sympy + +from .utils import sympy_index_symbol, sympy_subs, VarRanges +from .virtualized import V + +log = logging.getLogger(__name__) + + +# This class is a little awkward, because ShapeEnv is doing most of the heavy +# lifting and in some cases we should be directly passing through to ShapeEnv, +# but there is some extra inductor logic that needs to be handled here +class SizeVarAllocator: + def __init__(self, shape_env=None): + super().__init__() + if shape_env is None: + shape_env = ShapeEnv() + self.shape_env = shape_env + self.var_to_val = self.shape_env.var_to_val + self.replacements: Dict[sympy.Symbol, Expr] = self.shape_env.replacements + # Maps of dynamic sizes that have to be precomputed on the host to the kernel args. + # The basic idea is if we have some complicated sympy expression + # f(s0), we may choose to precompute it on the host and then replace + # all occurrences of that sympy expression with ps0, so that when we + # codegen we simply reference ps0 directly without repeating + # f(s0). Unlike regular size variables, ps variables cannot be + # guarded upon; so if we are asked to guard on a Sympy expression + # which potentially could have already had a precomputed replacement + # on it, we are obligated to invert the precomputed replacements + # (inv_precomputed_replacements). 
+ self.precomputed_replacements: Dict[Expr, sympy.Symbol] = dict() + self.inv_precomputed_replacements: Dict[sympy.Symbol, Expr] = dict() + self.stride_vars = self.make_stride_vars_cache() + self.simplify_with_ranges = self.make_simplify_with_ranges_cache() + self._simplify_loops = self.make_simplify_loops_cache() + + def simplify(self, expr: Expr): + return sympy.expand(expr).xreplace(self.replacements) + + def make_simplify_with_ranges_cache(self) -> Callable[[Expr, VarRanges], Expr]: + """ + self._simplify_with_ranges() can be expensive, cache its results + """ + cache: Dict[Tuple[Any, ...], Expr] = dict() + replacement_count = len(self.replacements) + + def simplify_with_ranges(expr: Expr, var_ranges: VarRanges) -> Expr: + nonlocal replacement_count + if replacement_count != len(self.replacements): + # new replacements invalidates cached results + cache.clear() + replacement_count = len(self.replacements) + key = (expr, *var_ranges.items()) + result = cache.get(key, None) + if result is None: + result = self._simplify_with_ranges(expr, var_ranges) + cache[key] = result + return result + + return simplify_with_ranges + + def make_simplify_loops_cache(self): + """ + self._simplify_with_ranges() can be expensive, cache its results + """ + cache: Dict[Tuple[Any, ...], Any] = dict() + replacement_count = len(self.replacements) + + def simplify_loops(index_vars, sizes, index_formulas): + nonlocal replacement_count + if replacement_count != len(self.replacements): + # new replacements invalidates cached results + cache.clear() + replacement_count = len(self.replacements) + key = (*index_vars, *sizes, *index_formulas) + result = cache.get(key, None) + if result is None: + result = self._simplify_loops_impl(index_vars, sizes, index_formulas) + cache[key] = result + return result + + return simplify_loops + + def _simplify_with_ranges(self, expr: Expr, var_ranges: VarRanges) -> Expr: + """ + Simplify indexing expression with knowledge of the ranges of + iteration variables. 
+ """ + + expr = join_dimensions(self.simplify(expr)) + original_expr = expr + + def remove_zero_terms(base, divisor): + """Symbols smaller than the divisor are zero""" + for v in base.free_symbols: + if v in var_ranges: + # var smaller than divisor can be removed + # if the rest is guaranteed to be multiple of divisor + rest = sympy.Wild("_rest", exclude=[v]) + m = base.match(v + rest) + if m and v not in m[rest].free_symbols: + gcd = sympy.gcd(m[rest], divisor) + if gcd == divisor: + if self.statically_known_leq(var_ranges[v], divisor): + base = m[rest] + return base + + def visit_indexing_div(base, divisor): + return FloorDiv(remove_zero_terms(base, divisor), divisor) + + def visit_modular_indexing(base, divisor, modulus): + base = remove_zero_terms(base, divisor) + base_pos = True + if isinstance(base, ModularIndexing): + # for modular indexing, biggest values from the ranges don't necessarily result in + # the biggest result, the biggest result is modulus - 1 + base_s = base.args[2] - 1 + elif not base.has(ModularIndexing): + # actual iteration range is to size-1 + iter_ranges_zero = {k: 0 for k, v in var_ranges.items()} + base_lowest = sympy_subs(base, iter_ranges_zero) + if self.statically_known_leq(0, base_lowest): # type: ignore[arg-type] + # can't replace with indexing div if base can be negative + base_pos = True + else: + base_pos = False + iter_ranges = {k: v - 1 for k, v in var_ranges.items()} + base_s = sympy_subs(base, iter_ranges) + else: + base_s = base + if self.statically_known_lt(base_s, modulus * divisor) and base_pos: + return FloorDiv(base, divisor) + return ModularIndexing(base, divisor, modulus) + + if expr.has(ModularIndexing): + expr = expr.replace( + ModularIndexing( + sympy.Wild("base"), + sympy.Wild("divisor"), + sympy.Wild("modulus"), + ), + visit_modular_indexing, + ) + + if expr.has(FloorDiv): + expr = expr.replace( + FloorDiv( + sympy.Wild("base"), + sympy.Wild("divisor"), + ), + visit_indexing_div, + ) + + if expr != original_expr: + return self._simplify_with_ranges(expr, var_ranges) + return expr + + def _simplify_loops_impl( + self, index_vars: List[sympy.Symbol], sizes, index_formulas + ): + """ + Try to remove as many axis from loop iterations as possible, by: + 1) removing size==1 dimensions + 2) fuse contiguous dimensions into a single loop + If channel_last = True, we will prevent the last dim fused with other dims + """ + sizes = list(map(self.simplify, sizes)) + + strides = [self.stride_vars(x, index_vars) for x in index_formulas] + assert len(sizes) == len(strides[0]), (len(sizes), len(strides[0])) + + for i in range(len(sizes)): + if sizes[i] == 1: + # remove dim + sizes[i] = None + + def can_merge_dims(a, b): + for k in range(len(strides)): + if self.simplify(strides[k][a] * sizes[a]) == self.simplify( + strides[k][b] + ): + # approximate test passed, try sound version + va = index_vars[a] + vb = index_vars[b] + v = sympy_index_symbol("_merge_tester") + expr1 = sympy_subs(index_formulas[k], {va: v * sizes[a], vb: 0}) + expr2 = sympy_subs(index_formulas[k], {va: 0, vb: v}) + if self.simplify(expr1) == self.simplify(expr2): + continue + return False + return True + + changed = True + while changed: + changed = False + for i, j in itertools.product( + reversed(range(len(sizes))), reversed(range(len(sizes))) + ): + if i == j or sizes[i] is None or sizes[j] is None: + continue + if can_merge_dims(i, j): + changed = True + sizes[i] = sizes[i] * sizes[j] + sizes[j] = None + + def reindex(index): + it = list(reversed(index)) + new_index = [] + for 
size in sizes: + if size is None: + new_index.append(sympy.Integer(0)) + else: + new_index.append(it.pop()) + assert not it + return new_index + + def prune(index): + assert len(index) == len(sizes) + return [i for i, s in zip(index, sizes) if s is not None] + + return [x for x in sizes if x is not None], reindex, prune + + # Note - [On Statically Known] + # + # The statically_known_* family of functions below replaces a prior system, called maybe_guard_*. The prior system + # operated by providing essentially a question, where the size hinted values were evaluated. If the condition was + # true, we add a guard and return True, otherwise, False. + # + # def maybe_guard_foo(args): + # if size_hinted_check(args): + # return False # No guard, no optim + # guard(args) # Make a guard + # return True # Safe to apply optimization + # + # The prior system incurred a guard, and green lit an optimization. + # + # The new system works in reverse - in the new system, if we know that the inputs are static, and evaluate the + # condition as true, we green light the optimization, and we do not incur a guard. If we cannot prove that, we + # return False. + # + # def maybe_guard_foo(args): + # if all_static(args): + # return True # Safe to apply optimization + # else: + # return False # No guard, no optim + + # See Note - [On Statically Known] + + def is_expr_static_and_true(self, expr: Union[Expr, int]) -> bool: + if expr in (True, False): + return bool(expr) + + try: + simplified = self.shape_env._maybe_evaluate_static(expr) + if simplified is not None: + return bool(simplified) + except Exception: + log.debug("Could not simplify %s", expr) + + return False + + def statically_known_equals(self, left: Expr, right: Expr) -> bool: + """ + Returns a bool indicating if it is sound to optimize as if left and right are equal. + """ + return self.is_expr_static_and_true(sympy.Eq(left, right)) # type: ignore[arg-type] + + # See Note - [On Statically Known] + def statically_known_list_equals(self, left: List[Expr], right: List[Expr]) -> bool: + """ + Returns a bool indicating if it is sound to optimize as if left and right lists are equal. + """ + if len(left) != len(right): + return False + if all(self.statically_known_equals(l, r) for l, r in zip(left, right)): + return True + return False + + # See Note - [On Statically Known] + def statically_known_leq(self, left: Expr, right: Expr) -> bool: + """ + Returns a bool indicating if it is sound to optimize as if left is less than or equal to right. + """ + expr = left <= right + return self.is_expr_static_and_true(expr) + + # See Note - [On Statically Known] + def statically_known_lt(self, left: Expr, right: Expr) -> bool: + """ + Returns a bool indicating if it is sound to optimize as if left is less than right. + """ + expr = left < right + return self.is_expr_static_and_true(expr) + + # See Note - [On Statically Known] + def statically_known_multiple_of(self, numerator: Expr, denominator: Expr) -> bool: + """ + Return a bool indicating if it is sound to optimize for the numerator being a multiple of the denominator. + """ + expr = sympy.Eq(numerator % denominator, 0) + return self.is_expr_static_and_true(expr) # type: ignore[arg-type] + + # The guard functions require you to ALREADY KNOW that a particular + # condition holds. If you don't know (you want to guard on an expression + # being a particular value, and then get access to that value), use + # the evaluate functions. 
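+    # For example (illustrative usage only): call guard_equals(a, b) when the
+    # equality is already known to hold and only needs to be recorded as a
+    # guard, but call evaluate_min(a, b) when you still need to learn which
+    # operand is smaller and want to guard on that answer.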
+ + def guard_equals(self, left: Expr, right: Expr) -> Expr: + if isinstance(left, Expr): + left = sympy_subs(left, self.inv_precomputed_replacements) # type: ignore[arg-type] + if isinstance(right, Expr): + right = sympy_subs(right, self.inv_precomputed_replacements) # type: ignore[arg-type] + assert self.shape_env.evaluate_expr(sympy.Eq(left, right)) + return left + + def guard_leq(self, left: Expr, right: Expr) -> None: + return self.guard_lt(left, right + 1) + + def guard_lt(self, left: Expr, right: Expr) -> None: + assert self.shape_env.evaluate_expr(sympy.Lt(left, right)) + + def expect_true(self, expr: Expr, *, msg: str) -> None: + expr = sympy_subs(expr, self.inv_precomputed_replacements) # type: ignore[arg-type] + self.shape_env.defer_runtime_assert(expr, msg, fx_node=None) + + def expect_equals(self, left: Expr, right: Expr, *, msg: str) -> Expr: + # Prefer returning the expression without unbacked symints + if self.shape_env.is_unbacked_symint(left): + self.expect_true(sympy.Eq(left, right), msg=msg) # type: ignore[arg-type] + return right + elif self.shape_env.is_unbacked_symint(right): + self.expect_true(sympy.Eq(left, right), msg=msg) # type: ignore[arg-type] + return left + else: + return self.guard_equals(left, right) + + def guarded_order(self, seq): + """ + Return the order of a sequence as a permutation of range(len(seq)) and guard on that order not changing. + Used for generating block_ptrs. + """ + seq = [*map(self.remove_precomputed_replacements, seq)] + seq = [(self.size_hint(var), orig_idx, var) for orig_idx, var in enumerate(seq)] + seq.sort() + order = [-1] * len(seq) + last_var = None + for new_index, (_, orig_index, var) in enumerate(seq): + order[orig_index] = new_index + if last_var is not None: + self.guard_leq(last_var, var) + last_var = var + return order + + # The evaluate functions evaluate some symbolic sympy expression + # (NB: not necessarily an Expr) and return what the concrete result + # is, guarding on the expression being that result + + # NB: write evaluate_expr(sympy.Lt(a, b)) rather than evaluate_expr(a < b) + # as this will ensure that you actually have a sympy'ified expression, + # and will prevent you from incorrectly writing evaluate_expr(a == b) + # which does the wrong thing if a or b is a sympy expression + def evaluate_expr(self, left: Union[Expr, sympy.logic.boolalg.Boolean]) -> bool: + assert isinstance(left, (Expr, sympy.logic.boolalg.Boolean)), type(left) + return self.shape_env.evaluate_expr(sympy.sympify(left)) + + def evaluate_min(self, left: Expr, right: Expr) -> Expr: + """return the smaller of left and right, and guard on that choice""" + lv = self.size_hint(left) + rv = self.size_hint(right) + if lv <= rv: + self.guard_leq(left, right) + return left + else: + self.guard_leq(right, left) + return right + + def evaluate_max(self, left: Expr, right: Expr) -> Expr: + """return the larger of left and right, and guard on that choice""" + # Always choose the opposite of eval min for consistency + # This means min(a, b) and max(a, b) produce the same guards + min_val = self.evaluate_min(left, right) + return right if min_val is left else left + + def evaluate_static_shape(self, left: Expr) -> int: + right = self.size_hint(left) + self.guard_equals(left, sympy.Integer(right)) + return int(right) + + def evaluate_static_shapes(self, left: List[Expr]) -> List[int]: + return [self.evaluate_static_shape(x) for x in left] + + def remove_precomputed_replacements(self, expr: Expr) -> Expr: + if any(s.name.startswith("ps") for s in 
expr.free_symbols): # type: ignore[attr-defined] + return sympy_subs(expr, self.inv_precomputed_replacements) # type: ignore[arg-type] + return expr + + def symbolic_hint(self, expr: Expr) -> Expr: + # Substitute all hints into expr, but leave unbacked symints alone + if not isinstance(expr, Expr): + assert isinstance(expr, int) + return expr + free_symbols = expr.free_symbols + if not free_symbols: + return int(expr) # type: ignore[return-value] + expr = self.remove_precomputed_replacements(expr) + return sympy_subs(expr, self.var_to_val) + + def size_hint(self, expr: Expr, *, fallback: Optional[int] = None) -> int: + out = self.symbolic_hint(expr) + if not isinstance(out, (int, sympy.Integer)) and fallback is not None: + # Use the provided heuristic fallback hint + sym_vrs = { + s: self.shape_env.var_to_range.get(s, None) for s in expr.free_symbols + } + if all(vr is not None for vr in sym_vrs.values()): + expr_vr = bound_sympy(expr, sym_vrs) # type: ignore[arg-type] + lower = self.size_hint(expr_vr.lower) # type: ignore[arg-type] + upper = self.size_hint(expr_vr.upper) # type: ignore[arg-type] + fallback = min(max(fallback, lower), upper) + return fallback + try: + return int(out) + except Exception: + log.debug("failed on: %s", out) + raise + + def size_hints( + self, + exprs: Iterable[Expr], + *, + fallback: Optional[int] = None, + ) -> Tuple[int, ...]: + return tuple(self.size_hint(x, fallback=fallback) for x in exprs) + + def _lru_cache(self, fn, maxsize=None): + """ + Wrapper around functools.lru_cache that clears when replacements + has been invalidated. + """ + fn_cache = functools.lru_cache(maxsize)(fn) + prior_len = len(self.replacements) + + @functools.wraps(fn) + def wrapper(*args, **kwargs): + nonlocal prior_len + if prior_len != len(self.replacements): + prior_len = len(self.replacements) + fn_cache.cache_clear() + return fn_cache(*args, **kwargs) + + return wrapper + + def make_stride_vars_cache(self): + cache = self._lru_cache(self._stride_vars) + + def stride_vars( + index: Expr, + vars: List[sympy.Symbol], + support_vars: Optional[List[sympy.Symbol]] = None, + ) -> List[Expr]: + if not support_vars: + support_vars = vars + return cache(index, tuple(vars), tuple(support_vars)) + + return stride_vars + + def _stride_vars( + self, index: Expr, vars: List[sympy.Symbol], support_vars: List[sympy.Symbol] + ) -> List[Expr]: + """Convert an indexing expression back into strides + + NOTE: This is only valid if the index is a standard strided offset + calculation. e.g. 10 * ModularIndexing(i0 + 1, 1, 2) would give a + stride of -10 because the index wraps around after the first element + + """ + strides = [] + index = self.simplify(index) + # remove any offset + index = index - sympy_subs( + index, {v: sympy.Integer(0) for v in support_vars if v != 0} + ) + for i in range(len(vars)): + # drop all the other dims + index_dim = sympy_subs( + index, + { + support_vars[j]: sympy.Integer(0) + for j in range(len(support_vars)) + if vars[i] != support_vars[j] and support_vars[j] != 0 + }, + ) + v = vars[i] + if v == 0: + strides.append(sympy.Integer(0)) + else: + # TODO(jansel): should we use sympy.diff here? 
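+                # The stride is recovered as a finite difference: substitute
+                # v=1 and v=0 into index_dim and subtract, so a plain strided
+                # index such as 10*i0 + i1 (an illustrative example) yields a
+                # stride of 10 with respect to i0.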
+ strides.append( + sympy_subs(index_dim, {v: sympy.Integer(1)}) + - sympy_subs(index_dim, {v: sympy.Integer(0)}) + ) + return strides + + def offset_var(self, index: Expr, vars: List[sympy.Symbol]) -> Expr: + """Extract offset part of an indexing expression""" + index = self.simplify(index) + return sympy_subs(index, {v: sympy.Integer(0) for v in vars if v != 0}) + + def stride_hints( + self, + index: Expr, + vars: List[sympy.Symbol], + support_vars: Optional[List[sympy.Symbol]] = None, + ) -> List[int]: + for v in index.free_symbols: + if v.name.startswith("indirect"): # type: ignore[attr-defined] + index = sympy_subs(index, {v: 0}) # type: ignore[dict-item] + result = [] + for s in self.stride_vars(index, vars, support_vars): + try: + result.append(self.size_hint(s)) + except TypeError: + result.append(0) + return result + + def stride_order(self, index: Expr, vars: List[sympy.Symbol]) -> List[int]: + strides = tuple(map(abs, self.stride_hints(index, vars))) + order = list(range(len(strides))) + order.sort(key=lambda x: (strides[x] == 0, strides[x])) + return order + + def lookup_precomputed_size(self, expr: Expr) -> Expr: + if ( + isinstance(expr, (int, sympy.Symbol, sympy.Number)) + or expr.is_number + or expr.is_symbol + ): + return expr + expr = self.remove_precomputed_replacements(expr) + if expr not in self.precomputed_replacements: + sym = sympy_index_symbol(f"ps{len(self.precomputed_replacements)}") + self.precomputed_replacements[expr] = sym + self.inv_precomputed_replacements[sym] = expr + return self.precomputed_replacements[expr] + + def free_symbols(self) -> Set[sympy.Symbol]: + return set(self.var_to_val.keys()) - set(self.replacements.keys()) + + +def join_dimensions(expr: Expr) -> Expr: + if not isinstance(expr, sympy.Add) or not expr.has(ModularIndexing): + return expr # fast exit path + return _join_dimensions_cached(expr) + + +@functools.lru_cache(256) +def _join_dimensions_cached(expr: Expr) -> Expr: + """ + ModularIndexing(i0, 1, 32) + 32 * ModularIndexing(i0, 32, 4) + becomes + ModularIndexing(i0, 1, 128) + ModularIndexing(i0, 1, 32) + 32 * FloorDiv(i0, 32) + becomes i0 + + + This type of pattern can come from view operations + """ + assert isinstance(expr, sympy.Add) + + scale = sympy.Wild("scale", exclude=[0]) + base = sympy.Wild("base") + divisor = sympy.Wild("divisor") + mod1 = sympy.Wild("modulus") + mod2 = sympy.Wild("modulus2") + for term1 in expr.args: + m1 = term1.match(scale * ModularIndexing(base, divisor, mod1)) + if m1: + for term2 in expr.args: + m2 = term2.match( + m1[scale] + * m1[mod1] + * ModularIndexing(m1[base], m1[divisor] * m1[mod1], mod2) + ) + if m2 and term1 != term2: + expr = join_dimensions( + expr + - term1 + - term2 + + m1[scale] + * ModularIndexing(m1[base], m1[divisor], m1[mod1] * m2[mod2]) + ) + return expr + for term1 in expr.args: + m1 = term1.match(scale * ModularIndexing(base, divisor, mod1)) + if m1: + for term2 in expr.args: + m2 = term2.match( + m1[scale] * m1[mod1] * FloorDiv(m1[base], m1[divisor] * m1[mod1]) + ) + if m2 is not None: # in case of success we get an empty dict here + expr = join_dimensions( + expr + - term1 + - term2 + + m1[scale] * FloorDiv(m1[base], m1[divisor]) + ) + return expr + return expr + + +class SimplifyIndexing(V.WrapperHandler): # type: ignore[name-defined] + """ + A wrapper around .virtualize.ops that uses var range information to + simplify ModularIndexing/FloorDiv. 
+ """ + + def __init__(self, inner, var_ranges: VarRanges): + super().__init__(inner) + self.name = "SimplifyIndexing" + self._simplify: Callable[ + [Expr], Expr + ] = lambda index: V.graph.sizevars.simplify_with_ranges(index, var_ranges) + + def load(self, name: str, index: sympy.Expr): + return self._inner.load(name, self._simplify(index)) + + def store(self, name, index, value, mode=None): + return self._inner.store(name, self._simplify(index), value, mode=mode) + + def store_reduction(self, name, index, value): + return self._inner.store_reduction(name, self._simplify(index), value) + + def index_expr(self, index, dtype): + return self._inner.index_expr(self._simplify(index), dtype) diff --git a/MLPY/Lib/site-packages/torch/_inductor/test_case.py b/MLPY/Lib/site-packages/torch/_inductor/test_case.py new file mode 100644 index 0000000000000000000000000000000000000000..545c33dc4b952d5352e61caee1bcc7429458e007 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/test_case.py @@ -0,0 +1,53 @@ +import contextlib +import tempfile +import unittest + +from torch._dynamo.test_case import ( + run_tests as dynamo_run_tests, + TestCase as DynamoTestCase, +) + +from torch._inductor import config + + +def run_tests(needs=()): + dynamo_run_tests(needs) + + +class TestCase(DynamoTestCase): + """ + A base TestCase for inductor tests. Enables FX graph caching and isolates + the cache directory for each test. + """ + + _stack: contextlib.ExitStack + + @classmethod + def setUpClass(cls): + super().setUpClass() + cls._stack = contextlib.ExitStack() + cls._stack.enter_context(config.patch({"fx_graph_cache": True})) + + @classmethod + def tearDownClass(cls): + super().tearDownClass() + cls._stack.close() + + def setUp(self): + super().setUp() + + # For all tests, mock the tmp directory populated by the inductor + # FxGraphCache, both for test isolation and to avoid filling disk. + self._inductor_cache_tmp_dir = tempfile.TemporaryDirectory() + self._inductor_cache_get_tmp_dir_patch = unittest.mock.patch( + "torch._inductor.codecache.FxGraphCache._get_tmp_dir" + ) + mock_get_dir = self._inductor_cache_get_tmp_dir_patch.start() + mock_get_dir.return_value = self._inductor_cache_tmp_dir.name + + def tearDown(self): + super().tearDown() + + # Clean up the FxGraphCache tmp dir. 
+ self._inductor_cache_get_tmp_dir_patch.stop() + self._inductor_cache_tmp_dir.cleanup() diff --git a/MLPY/Lib/site-packages/torch/_inductor/test_operators.py b/MLPY/Lib/site-packages/torch/_inductor/test_operators.py new file mode 100644 index 0000000000000000000000000000000000000000..9d41e9c1e6b41b27b468c8d48199c4bbfe792706 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/test_operators.py @@ -0,0 +1,24 @@ +import torch.library +from torch import Tensor +from torch.autograd import Function + +_test_lib_def = torch.library.Library("_inductor_test", "DEF") +_test_lib_def.define("realize(Tensor self) -> Tensor", tags=torch.Tag.pt2_compliant_tag) + +_test_lib_impl = torch.library.Library("_inductor_test", "IMPL") +for dispatch_key in ("CPU", "CUDA", "Meta"): + _test_lib_impl.impl("realize", lambda x: x.clone(), dispatch_key) + + +class Realize(Function): + @staticmethod + def forward(ctx, x): + return torch.ops._inductor_test.realize(x) + + @staticmethod + def backward(ctx, grad_output): + return grad_output + + +def realize(x: Tensor) -> Tensor: + return Realize.apply(x) diff --git a/MLPY/Lib/site-packages/torch/_inductor/triton_helpers.py b/MLPY/Lib/site-packages/torch/_inductor/triton_helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..3773267cd6a0c062037f60f9e9943873d1fddaf6 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/triton_helpers.py @@ -0,0 +1,344 @@ +import triton +import triton.language as tl + +# In the latest triton, math functions were shuffled around into different modules: +# https://github.com/openai/triton/pull/3172 +if hasattr(tl.extra.cuda, "libdevice"): + libdevice = tl.extra.cuda.libdevice + math = tl.math +else: + libdevice = tl.math + math = tl + + +@triton.jit +def promote_to_tensor(x): + # Addition promotes to tensor for us + return x + tl.zeros((1,), tl.int1) + + +@triton.jit +def is_floating(x): + return promote_to_tensor(x).dtype.is_floating() + + +@triton.jit +def _prod_accumulate(a, b): + return a * b + + +@triton.jit +def prod(input, axis): + return tl.reduce(input, axis, _prod_accumulate) + + +@triton.jit +def minimum(a, b): + mask = a < b + if is_floating(a): + mask |= a != a + return tl.where(mask, a, b) + + +@triton.jit +def maximum(a, b): + mask = a > b + if is_floating(a): + mask |= a != a + return tl.where(mask, a, b) + + +@triton.jit +def min2(a, dim): + return tl.reduce(a, dim, minimum) + + +@triton.jit +def max2(a, dim): + return tl.reduce(a, dim, maximum) + + +@triton.jit +def minimum_with_index(a_value, a_index, b_value, b_index): + mask = a_value < b_value + equal = a_value == b_value + if is_floating(a_value): + a_isnan = a_value != a_value + b_isnan = b_value != b_value + mask |= a_isnan and not b_isnan + # Consider NaNs as equal + equal |= a_isnan and b_isnan + + # Prefer lowest index if values are equal + mask |= equal & (a_index < b_index) + return tl.where(mask, a_value, b_value), tl.where(mask, a_index, b_index) + + +@triton.jit +def maximum_with_index(a_value, a_index, b_value, b_index): + mask = a_value > b_value + equal = a_value == b_value + if is_floating(a_value): + a_isnan = a_value != a_value + b_isnan = b_value != b_value + mask |= a_isnan and not b_isnan + # Consider NaNs as equal + equal |= a_isnan and b_isnan + + # Prefer lowest index if values are equal + mask |= equal & (a_index < b_index) + return tl.where(mask, a_value, b_value), tl.where(mask, a_index, b_index) + + +@triton.jit +def min_with_index(value, index, dim): + return tl.reduce((value, index), dim, 
minimum_with_index) + + +@triton.jit +def max_with_index(value, index, dim): + return tl.reduce((value, index), dim, maximum_with_index) + + +@triton.jit +def welford_reduce(value, mean, m2, weight, first_iteration): + if first_iteration: + new_weight = tl.full(weight.shape, 1, weight.dtype) + new_mean = value + new_m2 = tl.zeros_like(m2) + else: + delta = value - mean + new_weight = weight + 1 + new_mean = mean + delta / new_weight + new_m2 = m2 + delta * (value - new_mean) + return new_mean, new_m2, new_weight + + +@triton.jit +def welford_combine(mean_1, m2_1, weight_1, mean_2, m2_2, weight_2): + delta = mean_2 - mean_1 + new_weight = weight_1 + weight_2 + w2_over_w = tl.where(new_weight == 0.0, 0.0, weight_2 / new_weight) + return ( + mean_1 + delta * w2_over_w, + m2_1 + m2_2 + delta * delta * weight_1 * w2_over_w, + new_weight, + ) + + +@triton.jit +def welford(mean, m2, weight, dim): + return tl.reduce((mean, m2, weight), dim, welford_combine) + + +@triton.jit +def device_assert_then(cond, msg, r): + tl.device_assert(cond, msg) + return r + + +@triton.jit +def randint64(seed, offset, low, high): + r0, r1, r2, r3 = tl.randint4x(seed, offset) + r0 = r0.to(tl.uint64) + r1 = r1.to(tl.uint64) + result = r0 | (r1 << 32) + size = high - low + result = result % size.to(tl.uint64) + result = result.to(tl.int64) + low + return result + + +@triton.jit +def _any_combine(a, b): + return a | b + + +@triton.jit +def any(a, dim): + return tl.reduce(a, dim, _any_combine) + + +@triton.jit +def bucketize_binary_search( + values, # 1D tensor + offsets_ptr, + indexing_dtype, + right, # bool: if true, use intervals closed on the left; see [Note: Inductor bucketize op] + OFFSETS_SIZE: int, + BLOCK_SHAPE, # tuple/list of block shape +): + """ + See [Note: Inductor bucketize op] + """ + + low = tl.zeros(BLOCK_SHAPE, dtype=indexing_dtype) + high = tl.full(BLOCK_SHAPE, OFFSETS_SIZE, dtype=indexing_dtype) + + full_range = OFFSETS_SIZE + 1 + while full_range > 1: + mid = (high + low) // 2 + mask = mid < OFFSETS_SIZE + bucket_upper_bound = tl.load(offsets_ptr + mid, mask=mask) + if right: + is_above = values >= bucket_upper_bound + else: + is_above = values > bucket_upper_bound + + low = tl.where(is_above & mask, mid + 1, low) + high = tl.where(is_above, high, mid) + + full_range = (full_range + 1) // 2 + + return low + + +@triton.jit +def pack_value_flag( + value, + flag, + DTYPE_VALUE_AS_UINT: tl.constexpr, + DTYPE_PACK: tl.constexpr, +): + # Workaround for triton bug, tensor.to doesn't unwrap constexpr values + DTYPE_VALUE_AS_UINT = tl.core._constexpr_to_value(DTYPE_VALUE_AS_UINT) + bitwidth = DTYPE_VALUE_AS_UINT.primitive_bitwidth + uv = value.to(DTYPE_VALUE_AS_UINT, bitcast=True).to(DTYPE_PACK) + return flag.to(DTYPE_PACK) | (uv << bitwidth) + + +@triton.jit +def unpack_value( + pack, + DTYPE_VALUE, + DTYPE_VALUE_AS_UINT, +): + # Workaround for triton bug, tensor.to doesn't unwrap constexpr values + DTYPE_VALUE = tl.core._constexpr_to_value(DTYPE_VALUE) + DTYPE_VALUE_AS_UINT = tl.core._constexpr_to_value(DTYPE_VALUE_AS_UINT) + bitwidth = DTYPE_VALUE_AS_UINT.primitive_bitwidth + value_uint = (pack >> bitwidth).to(DTYPE_VALUE_AS_UINT) + return value_uint.to(DTYPE_VALUE, bitcast=True) + + +@triton.jit +def unpack_flag(pack, DTYPE_FLAG): + return pack.to(DTYPE_FLAG) + + +@triton.jit +def exclusive_scan_decoupled_lookback( + scratch_base, + block_value, + index, + combine_fn, + init, + DTYPE_VALUE_AS_UINT: tl.constexpr, + DTYPE_PACK: tl.constexpr, +): + """Compute exclusive scan of a scalar value between blocks + 
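+    That is, block ``index`` receives ``combine_fn`` folded over all earlier
+    blocks' values (``init`` for block 0); the block's own value is excluded.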
+ Ref: https://research.nvidia.com/publication/2016-03_single-pass-parallel-prefix-scan-decoupled-look-back + + scratch_base: Pointer to scratch space in global memory + block_value: Scalar value for this block + index: Scalar index of this block relative to the current scan + combine_fn: Function ``(value, value) -> value`` which is scanned over + init: Scalar value equal to the identiy of combine_fn + DTYPE_VALUE_AS_UINT: A tl.uint{n} type equal in size to ``block_value`` + DTYPE_PACK: Unsigned type twice the width of block_value + + NOTE: This function is limited to values which are 32-bits or less. + """ + DTYPE_VALUE = block_value.dtype + pack = pack_value_flag( + block_value, + tl.full(block_value.shape, 1, DTYPE_VALUE_AS_UINT), + DTYPE_VALUE_AS_UINT, + DTYPE_PACK, + ) + tl.atomic_xchg(scratch_base + index, pack, sem="relaxed") + + exclusive_prefix = init + test_target = index - 1 + while test_target >= 0: + # tl.atomic_load + flag = tl.full([], 0, DTYPE_VALUE_AS_UINT) + while flag == 0: + pack = tl.atomic_add(scratch_base + test_target, 0, sem="relaxed") + flag = unpack_flag(pack, DTYPE_VALUE_AS_UINT) + + value = unpack_value(pack, DTYPE_VALUE, DTYPE_VALUE_AS_UINT) + exclusive_prefix = combine_fn(value, exclusive_prefix) + + if flag == 2: + test_target = -1 + else: + test_target = test_target - 1 + + # Make inclusive block sum visible to other blocks + inclusive_prefix = combine_fn(exclusive_prefix, block_value) + pack = pack_value_flag( + inclusive_prefix, + tl.full([], 2, DTYPE_VALUE_AS_UINT), + DTYPE_VALUE_AS_UINT, + DTYPE_PACK, + ) + tl.atomic_xchg(scratch_base + index, pack, sem="relaxed") + return exclusive_prefix + + +@triton.jit +def exclusive_scan_decoupled_lookback_64( + scratch_base, block_value, index, combine_fn, init +): + """Compute exclusive scan of a scalar value between blocks + + Ref: https://research.nvidia.com/publication/2016-03_single-pass-parallel-prefix-scan-decoupled-look-back + + scratch_base: Pointer to scratch space in global memory + block_value: Scalar value for this block, must be 64-bits wide + index: Scalar index of this block relative to the current scan + combine_fn: Function ``(value, value) -> value`` which is scanned over + init: Scalar value equal to the identiy of combine_fn + """ + block_value_u64 = block_value.to(tl.uint64, bitcast=True) + tl.store(scratch_base + 3 * index + 1, block_value_u64) + tl.debug_barrier() + flag_one = tl.full([], 1, tl.uint64) + tl.atomic_xchg(scratch_base + 3 * index + 0, flag_one, sem="release") + + exclusive_prefix = init + test_target = index - 1 + while test_target >= 0: + flag = tl.full([], 0, tl.uint64) + while flag == 0: + flag = tl.atomic_add(scratch_base + 3 * test_target + 0, 0, sem="acquire") + + value_u64 = tl.load(scratch_base + 3 * test_target + flag.to(tl.int32)) + value = value_u64.to(block_value.dtype, bitcast=True) + exclusive_prefix = combine_fn(value, exclusive_prefix) + + if flag == 2: + test_target = -1 + else: + test_target = test_target - 1 + + # Make inclusive block sum visible to other blocks + inclusive_prefix = combine_fn(exclusive_prefix, block_value) + inclusive_prefix_u64 = inclusive_prefix.to(tl.uint64, bitcast=True) + tl.store(scratch_base + 3 * index + 2, inclusive_prefix_u64) + tl.debug_barrier() + flag_two = tl.full([], 2, tl.uint64) + tl.atomic_xchg(scratch_base + 3 * index + 0, flag_two, sem="release") + + return exclusive_prefix + + +@triton.jit +def frexp(x): + # TODO(isuruf): use inline_asm_elementwise here + y = libdevice.ilogb(x) + 1 + exponent = tl.where(x == 0, 0, y) + 
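+    # For nonzero x this matches frexp: x == mantissa * 2**exponent with |mantissa| in [0.5, 1).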
mantissa = tl.where(x == 0, 0, libdevice.ldexp(x, -y)) + return mantissa, exponent diff --git a/MLPY/Lib/site-packages/torch/_inductor/triton_heuristics.py b/MLPY/Lib/site-packages/torch/_inductor/triton_heuristics.py new file mode 100644 index 0000000000000000000000000000000000000000..a0589405e555857aeb14813dc96d66204fc08ddf --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/triton_heuristics.py @@ -0,0 +1,1527 @@ +import builtins +import copy +import functools +import hashlib +import inspect +import json +import logging +import math +import operator +import os +import os.path +import re +import threading +from enum import auto, Enum +from typing import Any, Callable, Dict, List, Optional, Set, Tuple + +import torch + +import torch.autograd.profiler as autograd_profiler +from torch._dynamo.device_interface import get_interface_for_device +from torch._dynamo.utils import dynamo_timed, get_first_attr +from torch.utils._triton import has_triton_package + +from . import config +from .codecache import cache_dir, CudaKernelParamCache +from .coordinate_descent_tuner import CoordescTuner + +from .ir import ReductionHint, TileHint +from .utils import ( + ceildiv, + conditional_product, + create_bandwidth_info_str, + do_bench, + get_max_y_grid, + get_num_bytes, + next_power_of_2, + triton_config_to_hashable, +) + + +log = logging.getLogger(__name__) + +if has_triton_package(): + import triton + from triton import Config + from triton.runtime.autotuner import OutOfResources + from triton.runtime.jit import KernelInterface + + try: + from triton.compiler.compiler import ASTSource + except ImportError: + ASTSource = None +else: + Config = object + triton = None + KernelInterface = object + OutOfResources = object + ASTSource = None + + +_NUM_THREADS_PER_WARP = 32 + + +class HeuristicType(Enum): + PERSISTENT_REDUCTION = auto() + POINTWISE = auto() + REDUCTION = auto() + SPLIT_SCAN = auto() + TEMPLATE = auto() + USER_AUTOTUNE = auto() + + +class AutotuneHint(Enum): + ELEMENTS_PER_WARP_32 = 0 + + # Triton codegen tries to codegen set of AutotuneHints. + # Enum.__repr__ looks like """ + # which isn't valid python. + # Enum.__str__ will just return "AutotuneHint.ELEMENTS_PER_WARP_32". + __repr__ = Enum.__str__ + + +def autotune_hints_to_configs( + hints: Set[AutotuneHint], size_hints, block_size: int +) -> List[Config]: + """ + AutotuneHints can be attached to the metadata of triton kernels for providing + suggestions about what to try for autotuning. One reason to do this is if there are + some configs that are only useful in specific scenarios, in which case we can avoid + wasting compile time on autotuning unless we know we are in one of those scenarios. + + Based on those hints, this function will generate a list of additional autotuning + configs to try. + """ + xyz_options: Tuple[Tuple[int, Optional[int], Optional[int]], ...] 
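+    # Each xyz tuple selected below becomes one extra candidate Config, built with num_elements_per_warp=32.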
+ configs = [] + + for hint in hints: + if hint == AutotuneHint.ELEMENTS_PER_WARP_32: + if len(size_hints) == 1: + xyz_options = ((block_size // 4, None, None),) + elif len(size_hints) == 2: + xyz_options = ((block_size // 4, 1, None), (1, block_size // 4, None)) + elif len(size_hints) == 3: + xyz_options = ( + (block_size // 4, 1, 1), + (1, block_size // 4, 1), + (1, 1, block_size // 4), + ) + for xyz in xyz_options: + configs.append( + triton_config( + size_hints, + *xyz, + num_elements_per_warp=32, + ) + ) + + return configs + + +def disable_pointwise_autotuning(): + # Autotuning can give different benchmarking results from run to run, and + # therefore we disable autotuning when use_deterministic flag is on. + if torch.are_deterministic_algorithms_enabled(): + return True + return not config.triton.autotune_pointwise + + +class CachingAutotuner(KernelInterface): + """ + Simplified version of Triton autotuner that has no invalidation + key and caches the best config to disk to improve cold start times. + Unlike the main triton Autotuner, this version can precompile all + configs, and does not rely on the Triton JIT. + """ + + def __init__( + self, + fn, + triton_meta, # passed directly to triton + configs, + save_cache_hook, + mutated_arg_names, + heuristic_type, + size_hints=None, + inductor_meta=None, # metadata not relevant to triton + custom_kernel=False, # whether the kernel is inductor-generated or custom + ): + super().__init__() + + assert len(configs) > 0, "Non-empty TritonConfig list required for compiling" + self.fn = fn + self.triton_meta = triton_meta + self.inductor_meta = {} if inductor_meta is None else inductor_meta + self.save_cache_hook = save_cache_hook + self.mutated_arg_names = mutated_arg_names + self.configs = configs + self.heuristic_type = heuristic_type + self.custom_kernel = custom_kernel + self.cuda_kernel_saved = False + + # Align the default design that default as cuda + self.device_type = ( + triton_meta["device_type"] if "device_type" in triton_meta else "cuda" + ) + self.gpu_device = get_interface_for_device(self.device_type) + + if log.isEnabledFor(logging.DEBUG): + log.debug( + "CachingAutotuner gets %d configs for %s", + len(self.configs), + self.fn.__name__, + ) + for c in self.configs: + log.debug(c) + + self.launchers = [] + self.lock = threading.Lock() + if os.getenv("TRITON_CACHE_DIR") is None: + os.environ["TRITON_CACHE_DIR"] = os.path.join( + cache_dir(), + "triton", + str(self.triton_meta.get("device", 0)), + ) + + self.size_hints = size_hints + self.coordesc_tuner = CoordescTuner( + is_mm=False, name=self.fn.__name__, size_hints=size_hints + ) + + # pre-create the profiler context manager to reduce latency + self.record_function_ctx = torch._C._profiler._RecordFunctionFast( + self.inductor_meta.get("kernel_name", "triton kernel") + ) + + def precompile(self, warm_cache_only_with_cc=None): + with self.lock: + if self.launchers: + return + self.launchers = [] + compiled_binaries = [] + if not self.configs: + raise RuntimeError("No triton configs are available") + + for c in self.configs: + try: + compiled_binary, launcher = self._precompile_config( + c, warm_cache_only_with_cc + ) + except OutOfResources: + # Skip the config if we run out of resource + continue + self.launchers.append(launcher) + compiled_binaries.append(compiled_binary) + + if len(self.launchers) == 0: + raise RuntimeError( + "No valid triton configs. 
Report a fatal compilation error" + ) + + seen_configs = set(self.configs) + + device_prop = self.gpu_device.Worker.get_device_properties( + self.triton_meta["device"] + ) + if ( + config.dynamic_scale_rblock + and self.heuristic_type == HeuristicType.REDUCTION + and self.size_hints is not None + # Disable for AMDGPU as Triton is not ready to return n_regs for a compiled_binary. + and torch.version.hip is None + and device_prop.major >= 8 + ): + for triton_config, compiled_binary in zip( + self.configs, compiled_binaries + ): + assert len(self.size_hints) == 2 + xblock = triton_config.kwargs.get("XBLOCK", 1) + rblock = triton_config.kwargs["RBLOCK"] + total_block = (self.size_hints[0] + xblock - 1) // xblock + nreg = getattr(compiled_binary, "n_regs", None) + if nreg is None: + continue + + # make sure rblock is not too small + if rblock <= 64: + continue + + # each SM of A100 has 65536 32-bit registers. To maximize + # the theoretical occupancy, we need run 2048 threads on each + # SM. So each thread should use no more than 65536 / 2048 + # = 32 registers. In cases where occupancy matters, and each + # thread uses too many registers, reduce RBLOCK to reduce + # the register usage. + # For kernel https://gist.github.com/shunting314/e4cccc031fe30d378b9b23c08c238cbd + # from PLBartForCausalLM, latency improve from + # 7.795ms to 4.883ms. + # + if ( + nreg + <= device_prop.regs_per_multiprocessor + // device_prop.max_threads_per_multi_processor + ): + continue + + nreg_per_warp = nreg * 32 + nreg_per_block = nreg_per_warp * triton_config.num_warps + + # Previously we set max_blocks_per_sm to 'max_threads_per_multi_processo / (32 * num_warps)' + # The formula below is a tighter upper bound since we have the assumption that + # nreg > device_prop.regs_per_multiprocessor // device_prop.max_threads_per_multi_processor + # due to the if condition above and: + # regs_per_multiprocessor / nreg_per_block + # = regs_per_multiprocessor / (nreg * 32 * num_warps) + # < regs_per_multiprocessor / ((regs_per_multiprocessor / max_threads_per_multi_processor) * 32 * num_warps) + # = max_threads_per_multi_processor / (32 * num_warps) + # Using a tigher upper bound can reveal more optimization opportunities. + max_blocks_per_sm = max( + device_prop.regs_per_multiprocessor // nreg_per_block, 1 + ) + + if ( + total_block + <= max_blocks_per_sm * device_prop.multi_processor_count + ): + # no need to improve occupancy + continue + new_config = copy.deepcopy(triton_config) + new_config.kwargs["RBLOCK"] = rblock // 2 + if new_config in seen_configs: + continue + seen_configs.add(new_config) + self.launchers.append( + self._precompile_config(new_config, warm_cache_only_with_cc)[1] + ) + self.configs = None + + def _precompile_config(self, cfg: Config, warm_cache_only_with_cc: Optional[int]): + """Ahead of time compile a given autotuner config.""" + compile_meta = copy.deepcopy(self.triton_meta) + for k, v in cfg.kwargs.items(): + compile_meta["constants"][self.fn.arg_names.index(k)] = v + compile_meta["num_warps"] = cfg.num_warps + compile_meta["num_stages"] = cfg.num_stages + compile_meta["debug"] = ( + config.assert_indirect_indexing and torch.version.hip is None + ) + + # Setting device_type="hip" required on ROCm to pass down to triton + compile_meta["device_type"] = ( + self.device_type if torch.version.hip is None else "hip" + ) + + if warm_cache_only_with_cc: + cc = warm_cache_only_with_cc + else: + # Use device_type 'cuda' for both cuda and hip devices to retrieve + # the compute capability. 
+ device_type = self.device_type if torch.version.hip is None else "cuda" + device_id = compile_meta["device"] + device = torch.device(device_type, device_id) + cc = self.gpu_device.get_compute_capability(device) + + compile_meta["cc"] = cc + + if ASTSource: + compile_args = ( + ASTSource( + self.fn, + compile_meta["signature"], + compile_meta["constants"], + compile_meta["configs"][0], + ), + ) + + target = (compile_meta["device_type"], cc) + options = { + "num_warps": compile_meta["num_warps"], + "num_stages": compile_meta["num_stages"], + "debug": compile_meta["debug"], + } + compile_kwargs = { + "target": target, + "options": options, + } + else: + compile_args = (self.fn,) + compile_kwargs = compile_meta + + if warm_cache_only_with_cc: + return ( + triton.compile(*compile_args, **compile_kwargs), + None, + ) + + # load binary to the correct device + with self.gpu_device.device(compile_meta["device"]): # type: ignore[attr-defined] + # need to initialize context + self.gpu_device.synchronize(self.gpu_device.current_device()) + + try: + binary = triton.compile(*compile_args, **compile_kwargs) + except Exception: + log.exception( + "Triton compilation failed: %s\n%s\nmetadata: %s", + self.inductor_meta.get("kernel_name", "triton_"), + self.fn.src, + compile_meta, + ) + raise + binary._init_handles() + + call_args = [ + arg + for i, arg in enumerate(self.fn.arg_names) + if i not in self.fn.constexprs + ] + def_args = [name for name in self.fn.arg_names if name not in cfg.kwargs] + + scope = { + "grid_meta": cfg.kwargs, + "bin": binary, + "launch_enter_hook": binary.launch_enter_hook, + "launch_exit_hook": binary.launch_exit_hook, + "metadata": binary.metadata, + "torch": torch, + "set_device": self.gpu_device.set_device, + "current_device": self.gpu_device.current_device, + } + + scope["runner"] = get_first_attr(binary, "run", "c_wrapper") + scope["function"] = get_first_attr(binary, "function", "cu_function") + scope["cta_args"] = ( + (binary.num_ctas, *get_first_attr(binary, "cluster_dims", "clusterDims")) + if hasattr(binary, "num_ctas") + else ( + (binary.metadata.num_ctas, *binary.metadata.cluster_dims) + if hasattr(binary, "metadata") + else () + ) + ) + scope["num_warps"] = ( + binary.num_warps + if hasattr(binary, "num_warps") + else binary.metadata.num_warps + ) + binary_shared = ( + binary.shared if hasattr(binary, "shared") else binary.metadata.shared + ) + scope["shared"] = binary_shared + + exec( + f""" + def launcher({', '.join(def_args)}, grid, stream): + if callable(grid): + grid_0, grid_1, grid_2 = grid(grid_meta) + else: + grid_0, grid_1, grid_2 = grid + + runner(grid_0, grid_1, grid_2, num_warps, + *cta_args, shared, + stream, function, + launch_enter_hook, + launch_exit_hook, + metadata, + {', '.join(call_args)}) + return bin + """.lstrip(), + scope, + ) + + launcher = scope["launcher"] + launcher.config = cfg + launcher.n_regs = getattr(binary, "n_regs", None) + launcher.n_spills = getattr(binary, "n_spills", None) + launcher.shared = binary_shared + launcher.store_cubin = config.triton.store_cubin + # store this global variable to avoid the high overhead of reading it when calling run + if launcher.store_cubin: + launcher.fn = self.fn + launcher.bin = binary + + return binary, launcher + + def bench(self, launcher, *args, grid, **kwargs): + """Measure the performance of a given launcher""" + # we don't skip configs wiht spilled registers when auto-tuning custom + # (user-written) Triton kernels, as (i) we don't have any knowledge or + # control over the kernel code; 
(ii) there is empirical evidence that + # for some (complicated) custom Triton kernels, a register-spilling + # config may yield the best latency. + if not self.custom_kernel and launcher.n_spills > config.triton.spill_threshold: + log.debug( + "Skip config %s because of register spilling: %d", + launcher.config, + launcher.n_spills, + ) + return float("inf") + + stream = self.gpu_device.get_raw_stream( # type: ignore[call-arg] + self.gpu_device.current_device() + ) + + def kernel_call(): + if launcher.config.pre_hook is not None: + launcher.config.pre_hook( + {**dict(zip(self.arg_names, args)), **launcher.config.kwargs} + ) + + cloned_args, cloned_kwargs = self.clone_args(*args, **kwargs) + launcher( + *cloned_args, + **cloned_kwargs, + grid=grid, + stream=stream, + ) + + return do_bench(kernel_call, rep=40, fast_flush=True) + + def clone_args(self, *args, **kwargs) -> Tuple[List[Any], Dict[str, Any]]: + from .compile_fx import clone_preserve_strides + + # clone inplace buffers to avoid autotune contaminating them if + # the kernel does in-place stores. avoid cloning other buffers because + # it leads to increase memory use + cloned_args = [] + for i, arg in enumerate(args): + if self.fn.arg_names[i] in self.mutated_arg_names: + assert isinstance(arg, torch.Tensor) + cloned_args.append(clone_preserve_strides(arg)) + else: + cloned_args.append(arg) + + cloned_kwargs: Dict[str, Any] = {} + for name, arg in kwargs.items(): + if name in self.mutated_arg_names: + assert isinstance(arg, torch.Tensor) + cloned_kwargs[name] = clone_preserve_strides(arg) + else: + cloned_kwargs[name] = arg + + return cloned_args, cloned_kwargs + + @dynamo_timed + def benchmark_all_configs(self, *args, **kwargs): + timings = { + launcher: self.bench(launcher, *args, **kwargs) + for launcher in self.launchers + } + + for k, v in timings.items(): + self.coordesc_tuner.cache_benchmark_result(k.config, v) + + if log.isEnabledFor(logging.DEBUG): + log.debug("Benchmark all input configs for %s, get:", self.fn.__name__) + for k, v in timings.items(): + log.debug( + "%s: %f, nreg %d, nspill %d, #shared-mem %s", + k.config, + v, + k.n_regs, + k.n_spills, + k.shared, + ) + + return timings + + def autotune_to_one_config(self, *args, **kwargs): + """Do the actual autotuning""" + timings = self.benchmark_all_configs(*args, **kwargs) + self.launchers = [builtins.min(timings, key=timings.get)] + if self.save_cache_hook: + self.save_cache_hook(self.launchers[0].config) + + def save_cuda_kernel(self, grid, stream, launcher): + if callable(grid): + grid_x, grid_y, grid_z = grid(launcher.config.kwargs) + else: + grid_x, grid_y, grid_z = grid + + key = self.inductor_meta.get("kernel_name", None) # unique kernel name + assert key is not None, "kernel_name can not be None" + params = { + "mangled_name": launcher.bin.metadata.name + if hasattr(launcher.bin.metadata, "name") + else launcher.bin.metadata["name"], + "grid_x": grid_x, + "grid_y": grid_y, + "grid_z": grid_z, + "x_block": launcher.config.kwargs.get("XBLOCK", 1), + "y_block": launcher.config.kwargs.get("YBLOCK", None), + "z_block": launcher.config.kwargs.get("ZBLOCK", None), + "num_warps": launcher.bin.num_warps + if hasattr(launcher.bin, "num_warps") + else launcher.bin.metadata.num_warps, + "shared_mem": launcher.bin.shared + if hasattr(launcher.bin, "shared") + else launcher.bin.metadata.shared, + "stream": stream, + # User defined triton kernels will have arbitrary kwarg names + "meta": launcher.config.kwargs, + } + + if torch.version.hip is None: + 
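+            # CUDA path: the compiled cubin bytes are available directly in the binary's asm dict.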
CudaKernelParamCache.set(key, params, launcher.bin.asm["cubin"]) + else: + # There is some divergence between CUDA and ROCm here. + # On ROCm's triton we only have the the path to the binary, not the binary itself. + # For ROCm we will copy the binary to the new location instead of writing to file + import pathlib + + launcher.bin.asm["hsaco"] = pathlib.Path( + launcher.bin.asm["hsaco_path"] + ).read_bytes() + CudaKernelParamCache.set(key, params, launcher.bin.asm["hsaco"]) + + self.cuda_kernel_saved = True + + def coordinate_descent_tuning(self, launcher, *args, **kwargs): + """ + Coordinate descent tuning can be run with or without max-autotune. + + The only difference between these two is the starting config for coordinate_descent tuning. + E.g., assuming regular autotune only get one config C1; while max-autotune get 4 configs C1, C2, C3, C4 + and max-autotune figure out C3 is the best. + + Then if coordinate descnt tuning is run with max-autotune disabled, it will start from C1; + while if coordinate descent tuning is run with max-autotune enabled, it will start from C3. + """ + if ( + self.heuristic_type == HeuristicType.TEMPLATE + or self.heuristic_type == HeuristicType.USER_AUTOTUNE + ): + # skip triton template + return launcher + + cloned_args, _ = self.clone_args(*args) + config2launcher = {launcher.config: launcher} + + def benchmark_one_config(config): + with self.lock: + _, launcher = self._precompile_config(config, None) + config2launcher[config] = launcher + + out = self.bench(launcher, *cloned_args, **kwargs) + log.debug( + "COORDESC: %s: %f, nreg %d, nspill %d, #shared-mem %d", + launcher.config, + out, + launcher.n_regs, + launcher.n_spills, + launcher.shared, + ) + return out + + assert not ( + self.heuristic_type == HeuristicType.PERSISTENT_REDUCTION + and "RBLOCK" in launcher.config.kwargs + ), "Coordinate descent tuner relies on the assumption that persistent reduction's triton config does not have RBLOCK" + best_config = self.coordesc_tuner.autotune( + benchmark_one_config, launcher.config, None + ) + best_config.found_by_coordesc = True + + if self.save_cache_hook: + self.save_cache_hook(best_config, found_by_coordesc=True) + return config2launcher.get(best_config) + + def run(self, *args, grid, stream, **kwargs): + if len(self.launchers) != 1: + if len(self.launchers) == 0: + self.precompile() + if len(self.launchers) > 1: + self.autotune_to_one_config(*args, grid=grid, **kwargs) + + if ( + not getattr(self.launchers[0].config, "found_by_coordesc", False) + and config.coordinate_descent_tuning + ): + self.launchers = [ + self.coordinate_descent_tuning( + self.launchers[0], *args, grid=grid, **kwargs + ) + ] + + (launcher,) = self.launchers + if launcher.store_cubin: + self.save_cuda_kernel(grid, stream, launcher) + + if launcher.config.pre_hook is not None: + launcher.config.pre_hook( + {**dict(zip(self.arg_names, args)), **launcher.config.kwargs, **kwargs} + ) + + # guard the record_function_ctx and only call it if profiling is currently + # in progress, to reduce latency when profiler is not turned on. Note that + # the "if" statement (instead of, say, a contextlib.nullcontext) is intentional; + # it is faster than entering and exiting a context manager, even if the context + # manager is a nullcontext. 
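+        # record_function_ctx was pre-created in __init__, so only the enter/exit cost is paid here.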
+ if autograd_profiler._is_profiler_enabled: + with self.record_function_ctx: + return launcher( + *args, + **kwargs, + grid=grid, + stream=stream, + ) + else: + return launcher( + *args, + **kwargs, + grid=grid, + stream=stream, + ) + + +def _find_names(obj): + import gc + import inspect + + frame = inspect.currentframe() + while frame is not None: + frame.f_locals + frame = frame.f_back + obj_names = [] + for referrer in gc.get_referrers(obj): + if isinstance(referrer, dict): + for k, v in referrer.items(): + if v is obj: + obj_names.append(k) + return obj_names + + +collected_calls: List[Any] = [] + + +def start_graph(): + collected_calls.clear() + + +def end_graph(): + if len(collected_calls) == 0: + return + overall_time = sum(call[0] for call in collected_calls) + overall_gb = sum(call[1] for call in collected_calls) + cur_file = inspect.stack()[1].filename + summary_str = ( + f"SUMMARY ({cur_file})\n" + f"{overall_time:.2f}ms \t {overall_gb:.2f} GB\t {overall_gb/(overall_time/1e3):.2f}GB/s" + ) + print(summary_str) + print() + output_file = config.profile_bandwidth_output + if output_file is not None: + # sort perf numbers in descending order, i.e. placing the + # most runtime-heavy kernels at the top of the list + sorted_calls = sorted(collected_calls, key=lambda c: float(c[0]), reverse=True) + try: + with open(output_file, "a") as file: + log.debug("Save profile bandwidth results to %s", output_file) + file.write("====================\n") + file.write(f"TRITON KERNELS BANDWIDTH INFO ({cur_file})\n") + for ms, num_gb, gb_per_s, kernel_name in sorted_calls: + # also display the runtime percentage for each kernel + percentage = f"{ms/overall_time*100:.2f}%" + suffix = f" \t {percentage} \t {kernel_name}" + bw_info_str = create_bandwidth_info_str( + ms, + num_gb, + gb_per_s, + suffix=suffix, + color=False, + ) + file.write(bw_info_str + "\n") + file.write(f"{summary_str}\n\n") + except Exception as e: + log.warning( + "failed to write profile bandwidth result into %s: %s", + output_file, + e, + ) + + +class DebugAutotuner(CachingAutotuner): + def __init__(self, *args, regex_filter="", **kwargs): + self.regex_filter = regex_filter + super().__init__(*args, **kwargs) + self.cached = None + + def run(self, *args, grid, stream): + possible_names = _find_names(self) + kernel_name = f"{max(possible_names, key=len)}" + if not re.match(self.regex_filter, kernel_name): + return + super().run(*args, grid=grid, stream=stream) + (launcher,) = self.launchers + + if self.cached is None: + ms = self.bench(launcher, *args, grid=grid) + num_in_out_ptrs = len( + [ + arg_name + for arg_name in self.fn.arg_names + if arg_name.startswith("in_out_ptr") + ] + ) + num_gb = self.inductor_meta.get("kernel_num_gb", None) + if num_gb is None: + num_gb = get_num_bytes(*args, num_in_out_args=num_in_out_ptrs) / 1e9 + gb_per_s = num_gb / (ms / 1e3) + self.cached = (ms, num_gb, gb_per_s, kernel_name) + else: + ms, num_gb, gb_per_s, kernel_name = self.cached + collected_calls.append((ms, num_gb, gb_per_s, kernel_name)) + print( + create_bandwidth_info_str(ms, num_gb, gb_per_s, suffix=f" \t {kernel_name}") + ) + + +def hash_configs(configs: List[Config]): + """ + Hash used to check for changes in configurations + """ + hasher = hashlib.sha256() + for cfg in configs: + hasher.update( + f"{sorted(cfg.kwargs.items())} {cfg.num_warps} {cfg.num_stages}\n".encode() + ) + return hasher.hexdigest() + + +def load_cached_autotuning( + best_config, + configs_hash: str, + configs: List[Config], +): + if best_config is None: + 
return None + if best_config.pop("configs_hash", None) != configs_hash: + return None + + if config.coordinate_descent_tuning and best_config.pop("found_by_coordesc", False): + num_warps = best_config.pop("num_warps") + num_stages = best_config.pop("num_stages") + triton_config = Config(best_config, num_warps=num_warps, num_stages=num_stages) + triton_config.found_by_coordesc = True + return triton_config + + matching_configs = [ + cfg + for cfg in configs + if all(val == best_config.get(key) for key, val in cfg.kwargs.items()) + and cfg.num_warps == best_config.get("num_warps") + and cfg.num_stages == best_config.get("num_stages") + ] + if len(matching_configs) != 1: + return None + + return matching_configs[0] + + +def cached_autotune( + size_hints: Optional[List[int]], + configs: List[Config], + triton_meta, + heuristic_type, + filename=None, + inductor_meta=None, + custom_kernel=False, +): + """ + A copy of triton.autotune that calls our subclass. Our subclass + has additional debugging, error handling, and on-disk caching. + """ + configs = unique_configs(configs) + assert len(configs) == 1 or filename + save_cache_hook: Optional[Callable[[Any, Any], Any]] + inductor_meta = {} if inductor_meta is None else inductor_meta + + # on disk caching logic and/or remote caching + if filename is not None and (len(configs) > 1 or config.coordinate_descent_tuning): + configs_hash = hash_configs(configs) + + cache_filename = None + remote_cache = None + remote_cache_key = None + if config.use_autotune_local_cache: + cache_filename = os.path.splitext(filename)[0] + ".best_config" + if config.use_autotune_remote_cache or ( + config.is_fbcode() + and torch._utils_internal.justknobs_check( + "pytorch/autotune_remote_cache:enable" + ) + ): + backend_hash = inductor_meta.get("backend_hash", None) + if backend_hash is not None: + key = backend_hash + configs_hash + "autotune-best-config" + key = hashlib.sha256(key.encode("utf-8")).hexdigest() + + try: + if config.is_fbcode(): + remote_cache = ( + triton.runtime.fb_memcache.FbMemcacheRemoteCacheBackend( + key, is_autotune=True + ) + ) + else: + remote_cache = triton.runtime.cache.RedisRemoteCacheBackend(key) + except Exception: + remote_cache = None + log.warning("Unable to create a remote cache", exc_info=True) + # we already sha256 hash the source contents + remote_cache_key = os.path.basename(filename) + else: + log.debug( + "backend_hash is not passed on the inductor_meta, unable to use autotune remote cache" + ) + + best_config = None + if cache_filename is not None and os.path.exists(cache_filename): + with open(cache_filename) as fd: + best_config = json.loads(fd.read()) + elif remote_cache is not None and remote_cache_key is not None: + cache_outs = remote_cache.get([remote_cache_key]) + cache_out = cache_outs.get(remote_cache_key, None) + best_config = json.loads(cache_out) if cache_out else None + + best_config = load_cached_autotuning(best_config, configs_hash, configs) + if best_config: + configs = [best_config] + + def save_cache_hook(cfg, found_by_coordesc=False): + data = json.dumps( + { + **cfg.kwargs, + "num_warps": cfg.num_warps, + "num_stages": cfg.num_stages, + "configs_hash": configs_hash, + "found_by_coordesc": found_by_coordesc, + } + ) + if cache_filename is not None: + with open(cache_filename, "w") as fd: + fd.write(data) + if remote_cache is not None and remote_cache_key is not None: + remote_cache.put(remote_cache_key, data) + + if log.isEnabledFor(logging.DEBUG): + type_str = "coordesc" if found_by_coordesc else "heuristic" + 
log.debug("Save %s tuning result to %s", type_str, cache_filename) + + else: + save_cache_hook = None + + mutated_arg_names = inductor_meta.pop("mutated_arg_names", ()) + + def decorator(fn): + # Remove XBLOCK from config if it's not a function argument. + # This way, coordinate descent tuning will not try to tune it. + # + # Context: When TritonKernel.no_x_dim is True, we hardcode XBLOCK to 1. + import inspect + + if "XBLOCK" not in inspect.signature(fn.fn).parameters: + for tconfig in configs: + if "XBLOCK" in tconfig.kwargs: + assert tconfig.kwargs["XBLOCK"] == 1 + tconfig.kwargs.pop("XBLOCK") + + if config.profile_bandwidth: + return DebugAutotuner( + fn, + triton_meta=triton_meta, + inductor_meta=inductor_meta, + regex_filter=config.profile_bandwidth_regex, + configs=configs, + save_cache_hook=save_cache_hook, + mutated_arg_names=mutated_arg_names, + heuristic_type=heuristic_type, + size_hints=size_hints, + custom_kernel=custom_kernel, + ) + return CachingAutotuner( + fn, + triton_meta=triton_meta, + inductor_meta=inductor_meta, + configs=configs, + save_cache_hook=save_cache_hook, + mutated_arg_names=mutated_arg_names, + heuristic_type=heuristic_type, + size_hints=size_hints, + custom_kernel=custom_kernel, + ) + + return decorator + + +def unique_configs(configs: List[Config]): + """Remove duplicate configurations""" + seen = set() + pruned_configs = [] + + for cfg in configs: + key = triton_config_to_hashable(cfg) + if key not in seen: + seen.add(key) + pruned_configs.append(cfg) + return pruned_configs + + +def check_config(cfg, *, xnumel=None, ynumel=None, znumel=None): + for numel, label in zip((xnumel, ynumel, znumel), "XYZ"): + if numel is None: + continue + block = cfg[f"{label}BLOCK"] + if numel == 1: + assert block == 1, ( + f"TritonKernel.indexing assumes numel == 1 => BLOCK == 1" + f" but {label.lower()}numel=={numel} and {label}BLOCK={block} (cfg={cfg})." + ) + max_block = config.triton.max_block[label] + max_block_str = f'config.triton.max_block["{label}"]' + assert max_block % block == 0, ( + f"TritonKernel.indexing assumes {label}BLOCK divides {max_block_str}" + f" but {label}BLOCK={block} and {max_block_str}={max_block} (cfg={cfg})." + ) + + +def triton_config( + size_hints, + x, + y=None, + z=None, + num_stages=1, + num_elements_per_warp=256, + min_elem_per_thread=0, +) -> Config: + """ + Construct a pointwise triton config with some adjustment heuristics + based on size_hints. Size_hints is a tuple of numels in each tile + dimension and will be rounded up to the nearest power of 2. + + num_elements_per_warp is a suggestion for controlling how many warps + the triton config should contain. e.g.: if x=16, y=8, z=4 then + num_elements = 16*8*4 = 512. Then if we set num_elements_per_warp=128, + we'll launch 512 (elem) / 128 (elem/warp) = 4 warps. Note that it's + just a suggestion, and sometimes other adjustment heuristics will + override the num_elements_per_warp. + + min_elem_per_thread controls the minimum number of elements + processed by each thread. It's always enforced. 
+ """ + # Ideally we want to read this from some device config + + # for a 2d size_hints [a, b], a should be mapped to YBLOCK rather than XBLOCK + size_hints = list(reversed(size_hints)) + + maxGridSize = [2147483647, 65535, 65535] + + target = conditional_product(x, y, z) + if conditional_product(*size_hints) < target: + target //= 8 + + # shrink sizes to size hints + x = min(x, size_hints[0]) + if y: + y = min(y, size_hints[1]) + if z: + z = min(z, size_hints[2]) + + # if we are below original block size, scale up where we can; + # or if the calculated grid size is larger than the limit, we bump up the corresponding dimension + while x < min(size_hints[0], config.triton.max_block["X"]) and ( + x * maxGridSize[0] < size_hints[0] or conditional_product(x, y, z) < target + ): + x *= 2 + while ( + y + and y < min(size_hints[1], config.triton.max_block["Y"]) + and ( + y * maxGridSize[1] < size_hints[1] or conditional_product(x, y, z) < target + ) + ): + y *= 2 + while ( + z + and z < min(size_hints[2], config.triton.max_block["Z"]) + and ( + z * maxGridSize[2] < size_hints[2] or conditional_product(x, y, z) < target + ) + ): + z *= 2 + + num_warps = next_power_of_2( + min(max(conditional_product(x, y, z) // num_elements_per_warp, 1), 8) + ) + # we are going to arrive at 2 warps only if bs was too small due to + # numel being too small. However to workaround some ptx bugs we still + # want at least 4 warps if there's enough elements per thread + # given that this is a rare situation, don't expect this to affect perf + # in general + # see https://github.com/pytorch/pytorch/pull/97950 + num_warps = max(num_warps, 4) if conditional_product(x, y, z) >= 128 else num_warps + xnumel = size_hints[0] + ynumel = size_hints[1] if y else None + znumel = size_hints[2] if z else None + + # Increase x to satisfy min_elem_per_thread requirements. + block_size = max( + conditional_product(x, y, z), + min_elem_per_thread * _NUM_THREADS_PER_WARP * num_warps, + ) + x *= math.ceil(block_size / conditional_product(x, y, z)) + + cfg = {"XBLOCK": x} + if y: + cfg["YBLOCK"] = y + if z: + cfg["ZBLOCK"] = z + check_config(cfg, xnumel=xnumel, ynumel=ynumel, znumel=znumel) + return Config(cfg, num_warps=num_warps, num_stages=num_stages) + + +def triton_config_reduction(size_hints, x, r, num_stages=1, num_warps=None) -> Config: + """ + Construct a reduction triton config with some adjustment heuristics + based on size_hints. Size_hints is a tuple of numels in each tile + dimension and will be rounded up to the nearest power of 2. + """ + + target = conditional_product(x, r) + if conditional_product(*size_hints) < target: + target //= 8 + + # shrink sizes to size hints + x = min(x, size_hints[0]) + r = min(r, size_hints[1]) + + # if we are below original block size, scale up where we can + while x < size_hints[0] and conditional_product(x, r) < target: + x *= 2 + while r < size_hints[1] and conditional_product(x, r) < target: + r *= 2 + + cfg = {"XBLOCK": x, "RBLOCK": r} + if num_warps is None: + num_warps = conditional_product(x, r) // 128 + num_warps = next_power_of_2(min(max(num_warps, 2), 8)) + check_config(cfg, xnumel=size_hints[0]) + assert ( + r <= config.triton.max_block["R"] + ), f"increase config.triton.MAX_BLOCK['r'] to {r}" + return Config(cfg, num_warps=num_warps, num_stages=num_stages) + + +def triton_config_tiled_reduction(size_hints, x, y, r, num_stages=1): + """ + Construct a tile reduction triton config with some adjustment + heuristics based on size_hints. 
Size_hints is a tuple of numels in + each tile dimension and will be rounded up to the nearest power of 2. + """ + + target = conditional_product(x, y, r) + if conditional_product(*size_hints) < target: + target //= 8 + + # shrink sizes to size hints + x = min(x, size_hints[0]) + y = min(y, size_hints[1]) + r = min(r, size_hints[2]) + + # if we are below original block size, scale up where we can + while x < size_hints[0] and conditional_product(x, y, r) < target: + x *= 2 + while r < size_hints[2] and conditional_product(x, y, r) < target: + r *= 2 + while y < size_hints[1] and conditional_product(x, y, r) < target: + y *= 2 + + cfg = {"XBLOCK": x, "YBLOCK": y, "RBLOCK": r} + num_warps = next_power_of_2(min(max(conditional_product(x, y, r) // 256, 1), 8)) + check_config(cfg, xnumel=size_hints[0], ynumel=size_hints[1]) + assert ( + r <= config.triton.max_block["R"] + ), f"increase config.triton.MAX_BLOCK['r'] to {r}" + return Config(cfg, num_warps=num_warps, num_stages=num_stages) + + +def pointwise( + size_hints, + triton_meta, + tile_hint=None, + filename=None, + min_elem_per_thread=0, + inductor_meta=None, +): + """ + Construct @triton.heuristics() based on size_hints. + """ + inductor_meta = {} if inductor_meta is None else inductor_meta + assert not inductor_meta.get("no_x_dim") + + numel = functools.reduce(operator.mul, size_hints) + bs = max(256, min(numel // 128, 1024)) + + hinted_configs = autotune_hints_to_configs( + inductor_meta.get("autotune_hints", set()), size_hints, bs + ) + + triton_config_with_settings = functools.partial( + triton_config, min_elem_per_thread=min_elem_per_thread + ) + + if len(size_hints) == 1: + if disable_pointwise_autotuning() and not ( + config.max_autotune or config.max_autotune_pointwise + ): + return cached_autotune( + size_hints, + [triton_config_with_settings(size_hints, bs)], + triton_meta=triton_meta, + inductor_meta=inductor_meta, + heuristic_type=HeuristicType.POINTWISE, + filename=filename, + ) + else: + return cached_autotune( + size_hints, + [ + triton_config_with_settings( + size_hints, bs, num_elements_per_warp=256 + ), + triton_config_with_settings( + size_hints, bs // 2, num_elements_per_warp=64 + ), + *hinted_configs, + ], + triton_meta=triton_meta, + inductor_meta=inductor_meta, + heuristic_type=HeuristicType.POINTWISE, + filename=filename, + ) + if len(size_hints) == 2: + if (disable_pointwise_autotuning() or tile_hint == TileHint.SQUARE) and not ( + config.max_autotune or config.max_autotune_pointwise + ): + return cached_autotune( + size_hints, + [triton_config_with_settings(size_hints, 32, 32)], + triton_meta=triton_meta, + inductor_meta=inductor_meta, + heuristic_type=HeuristicType.POINTWISE, + filename=filename, + ) + return cached_autotune( + size_hints, + [ + triton_config_with_settings(size_hints, 32, 32), + triton_config_with_settings(size_hints, 64, 64), # ~8% better for fp16 + triton_config_with_settings(size_hints, 256, 16), + triton_config_with_settings(size_hints, 16, 256), + triton_config_with_settings(size_hints, bs, 1), + triton_config_with_settings(size_hints, 1, bs), + *hinted_configs, + ], + triton_meta=triton_meta, + inductor_meta=inductor_meta, + filename=filename, + heuristic_type=HeuristicType.POINTWISE, + ) + if len(size_hints) == 3: + if disable_pointwise_autotuning(): + return cached_autotune( + size_hints, + [triton_config_with_settings(size_hints, 16, 16, 16)], + triton_meta=triton_meta, + inductor_meta=inductor_meta, + heuristic_type=HeuristicType.POINTWISE, + filename=filename, + ) + return 
cached_autotune( + size_hints, + [ + triton_config_with_settings(size_hints, 16, 16, 16), + triton_config_with_settings(size_hints, 64, 8, 8), + triton_config_with_settings(size_hints, 8, 64, 8), + triton_config_with_settings(size_hints, 8, 8, 64), + triton_config_with_settings(size_hints, bs, 1, 1), + triton_config_with_settings(size_hints, 1, bs, 1), + triton_config_with_settings(size_hints, 1, 1, bs), + *hinted_configs, + ], + triton_meta=triton_meta, + inductor_meta=inductor_meta, + filename=filename, + heuristic_type=HeuristicType.POINTWISE, + ) + raise NotImplementedError(f"size_hints: {size_hints}") + + +def _reduction_configs( + *, size_hints: List[int], inductor_meta: Dict[str, Any] +) -> List[Config]: + reduction_hint = inductor_meta.get("reduction_hint", None) + assert len(size_hints) == 2 + rnumel = size_hints[-1] + + contiguous_config = triton_config_reduction( + size_hints, 1, (rnumel if 256 <= rnumel < 2048 else 2048) + ) + outer_config = triton_config_reduction(size_hints, 64, 8) + tiny_config = triton_config_reduction( + size_hints, 2 * (256 // rnumel) if rnumel <= 256 else 1, min(rnumel, 2048) + ) + if config.max_autotune or config.max_autotune_pointwise: + pass # skip all these cases + elif reduction_hint == ReductionHint.INNER: + return [contiguous_config] + elif reduction_hint == ReductionHint.OUTER: + return [outer_config] + elif reduction_hint == ReductionHint.OUTER_TINY: + return [tiny_config] + if disable_pointwise_autotuning(): + return [triton_config_reduction(size_hints, 32, 128)] + return [ + contiguous_config, + outer_config, + tiny_config, + triton_config_reduction(size_hints, 64, 64), + triton_config_reduction(size_hints, 8, 512), + # halve the XBLOCK/RBLOCK compared to outer_config + # TODO: this may only be beneficial when each iteration of the reduction + # is quite heavy. E.g. 
https://gist.github.com/shunting314/189a8ef69f90db9d614a823385147a72 + triton_config_reduction(size_hints, 64, 4, num_warps=8), + ] + + +def reduction( + size_hints, + reduction_hint=False, + triton_meta=None, + filename=None, + inductor_meta=None, +): + """args to @triton.heuristics()""" + inductor_meta = {} if inductor_meta is None else inductor_meta + inductor_meta["reduction_hint"] = reduction_hint + if inductor_meta.get("no_x_dim"): + size_hints = [1, *size_hints[1:]] + + assert triton_meta is not None + rnumel = size_hints[-1] + if len(size_hints) != 2: + raise NotImplementedError(f"size_hints: {size_hints}") + + configs = _reduction_configs(size_hints=size_hints, inductor_meta=inductor_meta) + return cached_autotune( + size_hints, + configs=configs, + triton_meta=triton_meta, + inductor_meta=inductor_meta, + heuristic_type=HeuristicType.REDUCTION, + filename=filename, + ) + + +def persistent_reduction( + size_hints, + reduction_hint=False, + triton_meta=None, + filename=None, + inductor_meta=None, +): + inductor_meta = {} if inductor_meta is None else inductor_meta + inductor_meta["reduction_hint"] = reduction_hint + if inductor_meta.get("no_x_dim"): + size_hints = [1, *size_hints[1:]] + + xnumel, rnumel = size_hints + + configs = [ + triton_config_reduction(size_hints, xblock, rnumel) + for xblock in (1, 8, 32, 128) + if xblock == 1 or (rnumel * xblock <= 4096 and xblock <= xnumel) + ] + + # TODO(jansel): we should be able to improve these heuristics + if reduction_hint == ReductionHint.INNER and rnumel >= 256: + configs = configs[:1] + elif reduction_hint == ReductionHint.OUTER: + configs = configs[-1:] + elif reduction_hint == ReductionHint.OUTER_TINY: + configs = [ + triton_config_reduction( + size_hints, 2 * (256 // rnumel) if rnumel <= 256 else 1, rnumel + ) + ] + for c in configs: + # we don't need RBLOCK for persistent reduction + c.kwargs.pop("RBLOCK") + + if disable_pointwise_autotuning(): + configs = configs[:1] + + return cached_autotune( + size_hints, + configs, + triton_meta=triton_meta, + inductor_meta=inductor_meta, + filename=filename, + heuristic_type=HeuristicType.PERSISTENT_REDUCTION, + ) + + +def split_scan( + size_hints, + reduction_hint=False, + triton_meta=None, + filename=None, + inductor_meta=None, +): + """Heuristic for TritonSplitScanKernel""" + inductor_meta = {} if inductor_meta is None else inductor_meta + inductor_meta["reduction_hint"] = reduction_hint + if inductor_meta.get("no_x_dim"): + size_hints = [1, *size_hints[1:]] + + assert triton_meta is not None + rnumel = size_hints[-1] + if len(size_hints) != 2: + raise NotImplementedError(f"size_hints: {size_hints}") + + configs = _reduction_configs(size_hints=size_hints, inductor_meta=inductor_meta) + + # Fixup configs to enforce the minimum RBLOCK size + min_rblock = config.triton.min_split_scan_rblock + for cfg in configs: + if cfg.kwargs["RBLOCK"] < min_rblock: + cfg.kwargs["RBLOCK"] = min_rblock + + return cached_autotune( + size_hints, + configs=configs, + triton_meta=triton_meta, + inductor_meta=inductor_meta, + heuristic_type=HeuristicType.SPLIT_SCAN, + filename=filename, + ) + + +def template(num_stages, num_warps, triton_meta, filename=None, inductor_meta=None): + """ + Compile a triton template + """ + return cached_autotune( + None, + [triton.Config({}, num_stages=num_stages, num_warps=num_warps)], + triton_meta=triton_meta, + inductor_meta=inductor_meta, + heuristic_type=HeuristicType.TEMPLATE, + filename=filename, + ) + + +def user_autotune( + configs, triton_meta, filename=None, 
inductor_meta=None, custom_kernel=False +): + """ + Compile a user defined triton kernel + """ + defaults = inspect.signature(triton.Config).parameters + default_num_stages = defaults["num_stages"].default + default_num_warps = defaults["num_warps"].default + + if len(configs) == 0: + configs = [ + triton.Config( + {}, num_stages=default_num_stages, num_warps=default_num_warps + ) + ] + else: + configs = [ + triton.Config( + c.get("kwargs", {}), + num_stages=c.get("num_stages", default_num_stages), + num_warps=c.get("num_warps", default_num_warps), + ) + for c in configs + ] + + return cached_autotune( + None, + configs, + triton_meta=triton_meta, + heuristic_type=HeuristicType.USER_AUTOTUNE, + filename=filename, + inductor_meta=inductor_meta, + custom_kernel=custom_kernel, + ) + + +def foreach(triton_meta, num_warps, filename=None, inductor_meta=None): + """ + Compile a triton foreach kernel + """ + return cached_autotune( + None, + [triton.Config({}, num_stages=1, num_warps=num_warps)], + triton_meta=triton_meta, + inductor_meta=inductor_meta, + heuristic_type=HeuristicType.TEMPLATE, + filename=filename, + ) + + +def grid(*numels): + """Helper function to compute triton grids""" + if len(numels) == 1: + xnumel, ynumel, znumel = numels[0], None, None + elif len(numels) == 2: + xnumel, ynumel, znumel = numels[1], numels[0], None + elif len(numels) == 3: + xnumel, ynumel, znumel = numels[2], numels[1], numels[0] + else: + raise AssertionError(f"invalid size for numels {len(numels)}") + + def get_grid_dim(numel, block): + if numel is None: + return 1 + if block is None: + return numel + return ceildiv(numel, block) + + max_grid_dims = config.triton.max_tiles + + def grid_fn(meta): + x_grid = get_grid_dim(xnumel, meta.get("XBLOCK", 1)) + y_grid = get_grid_dim(ynumel, meta.get("YBLOCK", None)) + + MAX_Y_GRID = get_max_y_grid() + if znumel is None and max_grid_dims <= 2: + div = ceildiv(y_grid, MAX_Y_GRID) + y_grid = y_grid // div + z_grid = div + else: + z_grid = get_grid_dim(znumel, meta.get("ZBLOCK", None)) + torch._check( + y_grid <= MAX_Y_GRID, + lambda: f"Generated y grid beyond 2^16 ({y_grid}) not supported with z dimension present. 
File issue", + ) + + return ( + x_grid, + y_grid, + z_grid, + ) + + return grid_fn + + +def split_scan_grid(xnumel, rnumel): + def grid_fn(meta): + assert meta.get("XBLOCK", 1) == 1 + return (ceildiv(rnumel, meta.get("RBLOCK", 1)), xnumel, 1) + + return grid_fn diff --git a/MLPY/Lib/site-packages/torch/_inductor/utils.py b/MLPY/Lib/site-packages/torch/_inductor/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..693d80727bc5dd956a2f280a54ba1df082a439d3 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/utils.py @@ -0,0 +1,1428 @@ +from __future__ import annotations + +import collections +import contextlib +import dataclasses +import enum +import functools +import getpass +import inspect +import io +import itertools +import logging +import math +import operator +import os +import platform +import re +import shutil +import sys +import tempfile +import textwrap +import time +import unittest +from dataclasses import fields +from datetime import datetime +from io import StringIO +from typing import ( + Any, + Callable, + Dict, + Generic, + Iterable, + List, + NamedTuple, + Optional, + Protocol, + Set, + TypeVar, + Union, + ValuesView, +) +from unittest import mock + +import sympy +from typing_extensions import Concatenate, ParamSpec + +import torch +from torch._dynamo.device_interface import get_interface_for_device +from torch.autograd import DeviceType +from torch.autograd.profiler_util import EventList +from torch.utils._sympy.functions import CeilDiv, CleanDiv, FloorDiv, ModularIndexing +from . import config + +log = logging.getLogger(__name__) + +_T = TypeVar("_T") +VarRanges = Dict[sympy.Expr, sympy.Expr] + + +def do_bench_using_profiling(fn: Callable[[], Any], warmup=25, rep=100) -> float: + """ + Returns benchmark results by examining torch profiler events. + This could be more accurate as it doesn't count CPU side overhead. + However, this also requires manually excluding irrelevant event, e.g. + vectorized_elementwise_kernel which is used to fill L2 cache, + various CUDA events, etc, so could also be fragile. + """ + + fn() + torch.cuda.synchronize() + cache = torch.empty(int(256e6 // 4), dtype=torch.int, device="cuda") + + # Estimate the runtime of the function + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + start_event.record() + for _ in range(5): + cache.zero_() + fn() + end_event.record() + torch.cuda.synchronize() + estimate_ms = start_event.elapsed_time(end_event) / 5 + + # compute number of warmup and repeat + n_warmup = max(1, int(warmup / estimate_ms)) + n_repeat = max(1, int(rep / estimate_ms)) + + # Warm-up + for _ in range(n_warmup): + fn() + + with torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CUDA, + ] + ) as p: + # Benchmark + for i in range(n_repeat): + # we clear the L2 cache before each run + cache.zero_() + # record time of `fn` + fn() + # Record clocks + torch.cuda.synchronize() + + log.debug("raw events") + log.debug(p.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1)) + + filtered_events = EventList( + [ + event + for event in p.events() + if event.device_type == DeviceType.CUDA and event.name != "Context Sync" + ] + ) + if len(filtered_events) % n_repeat != 0: + raise RuntimeError( + "Failed to divide all profiling events into #repeat groups. 
" + "#CUDA events: %d, #repeats: %s", + len(filtered_events), + n_repeat, + ) + num_event_per_group = len(filtered_events) / n_repeat + actual_events = EventList( + [ + event + for i, event in enumerate(filtered_events) + if i % num_event_per_group != 0 + ] + ) + actual_events._build_tree() + actual_events = actual_events.key_averages() + + log.debug("profiling time breakdown") + log.debug(actual_events.table(row_limit=-1)) + + res = sum(event.cuda_time_total for event in actual_events) / 1000.0 / n_repeat + log.debug("profiling results: %s ms", res) + return res + + +def do_bench(*args, **kwargs): + @functools.lru_cache(None) + def load_triton(): + try: + # NB: Lazily load triton, as importing triton is slow + # see https://github.com/openai/triton/issues/1599 + from triton.testing import do_bench as triton_do_bench + except ImportError as exc: + raise NotImplementedError("requires Triton") from exc + + # triton PR https://github.com/openai/triton/pull/1513 change the + # quantile fields name from 'percentiles' to 'quantiles' + # and change the default value from (0.5, 0.2, 0.8) to None. + # This may break inductor since a caller expects a tuple may get a item. + # + # Add a wrapper to maintain the same behavior for inductor. + # Maybe we should have own implementation of this function? + return triton_do_bench, ( + "quantiles" + if inspect.signature(triton_do_bench).parameters.get("quantiles") + is not None + else "percentiles" + ) + + triton_do_bench, quantile_field_name = load_triton() + + if quantile_field_name not in kwargs: + kwargs[quantile_field_name] = (0.5, 0.2, 0.8) + return triton_do_bench(*args, **kwargs)[0] + + +@functools.lru_cache(None) +def has_torchvision_roi_align() -> bool: + try: + from torchvision.ops import roi_align # noqa: F401 + + return roi_align is not None and hasattr( + getattr(torch.ops, "torchvision", None), "roi_align" + ) + except ImportError: + return False + + +def conditional_product(*args): + return functools.reduce(operator.mul, [x for x in args if x]) + + +def decode_device(device: Union[Optional[torch.device], str]) -> torch.device: + if device is None: + return torch.tensor(0.0).device # default device + if isinstance(device, str): + device = torch.device(device) + if device.type != "cpu" and device.index is None: + device_interface = get_interface_for_device(device.type) + return torch.device(device.type, index=device_interface.Worker.current_device()) + return device + + +def sympy_product(it): + return functools.reduce(operator.mul, it, sympy.Integer(1)) + + +def sympy_dot(seq1, seq2): + assert len(seq1) == len(seq2) + return sympy.expand(sum(a * b for a, b in zip(seq1, seq2))) + + +def unique(it: Iterable[_T]) -> ValuesView[_T]: + return {id(x): x for x in it}.values() + + +def ceildiv( + numer: Union[int, sympy.Expr], denom: Union[int, sympy.Expr] +) -> Union[int, sympy.Expr]: + if isinstance(numer, sympy.Expr) or isinstance(denom, sympy.Expr): + return CeilDiv(numer, denom) + # TODO: There is a bug in a call to this function, to repro: + # python benchmarks/dynamo/huggingface.py --inductor -d cuda --accuracy + # --amp --only YituTechConvBert --dynamic-shapes + assert isinstance(numer, int) and isinstance( + denom, int + ), f"{numer}: {type(numer)}, {denom}: {type(denom)}" + return -(numer // -denom) + + +def next_power_of_2(n: int) -> int: + """Return the smallest power of 2 greater than or equal to n""" + n -= 1 + n |= n >> 1 + n |= n >> 2 + n |= n >> 4 + n |= n >> 8 + n |= n >> 16 + n |= n >> 32 + n += 1 + return n + + +def _type_of(key): 
+ # Use the function here to get rid of dependencies on the Triton during the codegen. + # Refer to Triton implementation here: + # https://github.com/openai/triton/blob/98b5945d2aef679e00ebca8e07c35c3658ec76de/python/triton/runtime/jit.py#L238 + # `None` is nullptr. Implicitly convert to *i8. + if key is None: + return "*i8" + dtype_str = str(key).split(".")[-1] + tys = { + "bool": "i1", + "float8e4nv": "fp8e4nv", + "float8e5": "fp8e5", + "float8e4b15": "fp8e4b15", + "float8e4b15x4": "fp8e4b15x4", + "float8_e4m3fn": "fp8e4nv", + "float8_e5m2": "fp8e5", + "float16": "fp16", + "bfloat16": "bf16", + "float32": "fp32", + "float64": "fp64", + "int8": "i8", + "int16": "i16", + "int32": "i32", + "int64": "i64", + "uint8": "u8", + "uint16": "u16", + "uint32": "u32", + "uint64": "u64", + } + # reinterpret can create triton type + for v in list(tys.values()): + tys[v] = v + return key if isinstance(key, str) else f"*{tys[dtype_str]}" + + +def convert_shape_to_inductor( + lst: Iterable[Union[int, torch.SymInt]] +) -> List[sympy.Expr]: + """ + Gets the shape and stride of a tensor. For non-symbolic tensors, this is + trivial. But for symbolic tensors, we need to map from SymIntNode into + sympy.Expr. + """ + return [ + i.node.expr if isinstance(i, torch.SymInt) else sympy.Integer(i) for i in lst + ] + + +def convert_shape_to_symint( + lst: Iterable[Union[int, sympy.Expr]] +) -> List[Union[int, torch.SymInt]]: + """ + Takes a list of shapes from Inductor and converts them into symints (or just + ints if all shapes are static). + """ + from .virtualized import V + + return [ + i + if isinstance(i, int) + else int(i) + if isinstance(i, sympy.Integer) + else V.graph.sizevars.shape_env.create_symintnode(i, hint=None) + for i in lst + ] + + +def is_view(op: torch._ops.OpOverload): + """ + Does this op overload have aliasing + """ + assert isinstance(op, torch._ops.OpOverload) + return any(a.alias_info is not None for a in op._schema.arguments) + + +def is_pointwise_use(use): + if not use.op == "call_function": + return False + + if not ( + isinstance(use.target, torch._ops.OpOverload) or use.target is operator.getitem + ): + return False + + if use.target is operator.getitem or is_view(use.target): + return all(is_pointwise_use(u) for u in use.users) + + return torch.Tag.pointwise in use.target.tags + + +def gen_gm_and_inputs(target, args, kwargs): + g = torch.fx.Graph() + g_args = [] + a_args = [] + for n, arg in enumerate(args): + if isinstance(arg, torch.Tensor): + g_args.append(g.placeholder(f"arg{n}")) + a_args.append(arg) + else: + g_args.append(arg) + assert all(not isinstance(x, torch.Tensor) for x in kwargs.values()) + node = g.call_function(target, tuple(g_args), kwargs) + if ( + len(target._schema.returns) == 1 + and str(target._schema.returns[0].type) == "Tensor" + ): + node = (node,) + g.output(node) + + gm = torch.fx.GraphModule({}, g) + return gm, a_args + + +def synchronize(device: str = "cuda"): + if device == "cpu": + return + device_interface = get_interface_for_device(device) + if device_interface.is_available(): + device_interface.synchronize() + + +def timed( + model: Callable[..., Any], example_inputs, times: int = 1, device: str = "cuda" +) -> float: + synchronize(device) + torch.manual_seed(1337) + t0 = time.perf_counter() + for _ in range(times): + result = model(*example_inputs) + synchronize(device) + t1 = time.perf_counter() + # GC the result after timing + assert result is not None # type: ignore[possibly-undefined] + return t1 - t0 + + +def print_performance( + fn, args=(), 
times=10, repeat=10, baseline=1.0, device: str = "cuda" +): + timings = torch.tensor([timed(fn, args, times, device) for _ in range(repeat)]) + took = torch.median(timings) / times + print(f"{took/baseline:.6f}") + return took + + +def precompute_method(obj: Any, method: str): + """Replace obj.method() with a new method that returns a precomputed constant.""" + result = getattr(obj, method)() + setattr(obj, method, lambda: result) + + +def precompute_methods(obj: Any, methods: List[str]): + """Replace methods with new methods that returns a precomputed constants.""" + for method in methods: + precompute_method(obj, method) + + +def cmp(a, b) -> int: + return int(a > b) - int(a < b) + + +def pad_listlike(x, size): + if len(x) == 1: + return type(x)([x[0]]) * size + else: + return x + + +# Used to ensure that iterating over a set is deterministic +def tuple_sorted(x): + if len(x) == 0: + return [] + + def sort_func(elem): + if isinstance(elem, str): + return elem + else: + # We expect `elem` to be `scheduler.BaseSchedulerNode` type here, + # but we are not able to do isinstance assert because of circular dependency + return elem.get_name() + + return sorted(x, key=sort_func) + + +P = ParamSpec("P") +RV = TypeVar("RV", covariant=True) + + +class CachedMethod(Generic[P, RV], Protocol): + @staticmethod + def clear_cache(self) -> None: + ... + + def __call__(self, *args: P.args, **kwargs: P.kwargs) -> RV: + ... + + +# See https://github.com/python/mypy/issues/13222#issuecomment-1193073470 to understand the type signature +def cache_on_self(fn: Callable[Concatenate[Any, P], RV]) -> CachedMethod[P, RV]: + key = f"__{fn.__name__}_cache" + + @functools.wraps(fn) + def wrapper(self): + if not hasattr(self, key): + setattr(self, key, fn(self)) + return getattr(self, key) + + def clear_cache(self): + if hasattr(self, key): + delattr(self, key) + + wrapper.clear_cache = clear_cache # type: ignore[attr-defined] + return wrapper # type: ignore[return-value] + + +def aggregate_origins(node_schedule): + from . import ir + + if isinstance(node_schedule, list): + return functools.reduce( + operator.or_, + [ + node.node.origins + for node in node_schedule + if hasattr(node, "node") and node.node + ], + set(), + ) + elif isinstance(node_schedule, ir.ExternKernel): + return node_schedule.origins + else: + return set() + + +def get_fused_kernel_name(node_schedule, descriptive_names): + all_origins = aggregate_origins(node_schedule) + if descriptive_names == "original_aten": + # Bases the kernel name off of the top-level aten operator (i.e. pre-decompositions) + sources = [ + origin.meta["original_aten"]._overloadpacket.__name__ + for origin in all_origins + if origin.op == "call_function" + and "original_aten" in origin.meta + and origin.meta["original_aten"] is not None + ] + sources = sorted(set(sources)) + elif descriptive_names == "torch": + # Bases the kernel name off of the top-level "torch" operator (i.e. 
post-dynamo graph) + sources = [] + for origin in all_origins: + if origin.op == "call_function" and "source_fn_stack" in origin.meta: + source_fn = origin.meta["source_fn_stack"][-1] + if isinstance(source_fn[1], str): + sources.append(source_fn[1]) + else: + sources.append(source_fn[1].__name__) + sources = sorted(set(sources)) + elif descriptive_names == "inductor_node": + sources = [ + origin.name for origin in all_origins if origin.op == "call_function" + ] + else: + raise NotImplementedError + sources = sources + return "_".join(["fused"] + sources) + + +def get_kernel_metadata(node_schedule, wrapper): + all_origins = aggregate_origins(node_schedule) + inductor_nodes = [origin for origin in all_origins if origin.op == "call_function"] + + from_node_dict = collections.defaultdict(list) + original_aten_dict = collections.defaultdict(list) + for node in inductor_nodes: + if "original_aten" in node.meta and node.meta["original_aten"] is not None: + key = str(node.meta["original_aten"]._overloadpacket) + original_aten_dict[key].append(node.name) + if "from_node" in node.meta: + key = node.meta["from_node"][0][0] + from_node_dict[key].append(node.name) + metadata = ( + f"{wrapper.comment} Source Nodes: [{', '.join(sorted(from_node_dict.keys()))}], " + f"Original ATen: [{', '.join(sorted(original_aten_dict.keys()))}]" + ) + # trace back to original node here + detailed_metadata = [] + for original_node, nodes in sorted(from_node_dict.items()): + detailed_metadata.append( + f"{wrapper.comment} {original_node} => {', '.join(sorted(nodes))}" + ) + return metadata, "\n".join(detailed_metadata) + + +def dominated_nodes( + initial_queue: Iterable[torch.fx.Node], skip_filter=None +) -> Set[torch.fx.Node]: + """Returns the set of nodes whose values depend on those within initial_queue""" + initial_queue = list(initial_queue) + dominated_set = set(initial_queue) + + while initial_queue: + node = initial_queue.pop() + for user in node.users: + if skip_filter and skip_filter(user): + continue + if user not in dominated_set: + dominated_set.add(user) + initial_queue.append(user) + + return dominated_set + + +def gather_origins(args, kwargs): + import itertools + + from . import ir + + def is_unrealized_node(n): + if isinstance(n, ir.TensorBox): + return is_unrealized_node(n.data) + if isinstance(n, ir.StorageBox): + return is_unrealized_node(n.data) + return isinstance(n, ir.IRNode) and isinstance(n, ir.Pointwise) + + kwarg_origins = [val.origins for val in kwargs.values() if is_unrealized_node(val)] + arg_origins = [arg.origins for arg in args if is_unrealized_node(arg)] + return set(itertools.chain(*arg_origins, *kwarg_origins)) + + +def sympy_str(expr: sympy.Expr) -> str: + """ + Normal sympy str is very slow, this is a lot faster. The result are + somewhat worse, as it doesn't do as much simplification. So don't + use this for final codegen. + """ + if isinstance(expr, sympy.Symbol): + return expr.name + if isinstance(expr, sympy.Add): + return " + ".join(map(sympy_str, expr.args)) + if isinstance(expr, sympy.Mul): + return " * ".join(map(sympy_str, expr.args)) + + if isinstance(expr, (ModularIndexing, CleanDiv, FloorDiv)): + return f"{expr.func.__name__}({', '.join(map(sympy_str, expr.args))})" + return str(expr) + + +def sympy_index_symbol(name: str) -> sympy.Symbol: + """ + Used to generate an integer-nonnegative symbol. + """ + # This should never be used for creating shape/stride symbols, as those + # should all be allocated before Inductor. 
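+    # Rough examples, inferred from this function rather than upstream docs:
+    #   sympy_index_symbol("x0") -> Symbol("x0", integer=True, nonnegative=True)
+    #   sympy_index_symbol("s0") would trip the assert below, because "s"-prefixed
+    #   names are reserved for shape/stride symbols allocated before Inductor.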
+ assert name[0] != "s" + # NOTE: shape symbols are positive (> 0), but index variables are only + # non-negative (>= 0). + return sympy.Symbol(name, integer=True, nonnegative=True) + + +def sympy_subs(expr: sympy.Expr, replacements: Dict[sympy.Expr, Any]) -> sympy.Expr: + """ + When the passed replacement symbol v is a string, it is converted to a symbol with name v that + have the same replaced expression integer and nonnegative properties. + """ + + def to_symbol(replaced, replacement): + assert isinstance(replaced, sympy.Expr) + if isinstance(replacement, str): + return sympy.Symbol( + replacement, + integer=replaced.is_integer, # type: ignore[attr-defined] + nonnegative=replaced.is_nonnegative, # type: ignore[attr-defined] + ) + else: + return replacement + + # xreplace is faster than subs, but is way more picky + return sympy.sympify(expr).xreplace( + {k: to_symbol(k, v) for k, v in replacements.items()} + ) + + +def free_symbol_startswith(index: sympy.Expr, prefix: str): + return any(v.name.startswith(prefix) for v in index.free_symbols) # type: ignore[attr-defined] + + +def free_symbol_has(index: sympy.Expr, pattern: str): + return any(pattern in v.name for v in index.free_symbols) # type: ignore[attr-defined] + + +def is_symbolic(a: Any) -> bool: + return isinstance(a, torch.SymInt) or ( + isinstance(a, torch.Tensor) + and any(is_symbolic(x) for x in itertools.chain(a.size(), a.stride())) + ) + + +def any_is_symbolic(*args: Any) -> bool: + return any(is_symbolic(a) for a in args) + + +def has_incompatible_cudagraph_ops(gm): + from torch.fx.experimental.symbolic_shapes import free_unbacked_symbols + + forbidden_set = { + "aten._fused_moving_avg_obs_fq_helper.default", + "aten._fused_moving_avg_obs_fq_helper_functional.default", + "aten.multinomial.default", + "fbgemm.dense_to_jagged.default", + "fbgemm.jagged_to_padded_dense.default", + "run_and_save_rng_state", + "run_with_rng_state", + "aten._local_scalar_dense", + # Technically, it's not necessary to ban this, because an + # assert_scalar with constant arguments can be validly run + # with CUDA graphs, but the operator is also pointless with + # constant arguments, so might as well ban + "aten._assert_scalar", + } + if torch.are_deterministic_algorithms_enabled(): + forbidden_set.update( + { + "aten._unsafe_index_put.default", + "aten.index_put.default", + "aten.index_put_.default", + "aten.scatter.src", + "aten.scatter.reduce", + "aten.scatter.value_reduce", + "aten.scatter_add_", + "aten.scatter_add.default", + "aten.scatter_reduce.two", + "aten.scatter_reduce_.two", + "aten.scatter_reduce.two_out", + } + ) + for node in gm.graph.nodes: + if str(node.target) in forbidden_set: + return True + if (val := node.meta.get("val")) is not None and free_unbacked_symbols(val): + return True + return False + + +def output_node(gm: torch.fx.GraphModule): + """Get the output node from an FX graph""" + last_node = next(iter(reversed(gm.graph.nodes))) + assert last_node.op == "output" + return last_node + + +# Attempt to import AttrsDescriptor from Triton +try: + from triton.compiler.compiler import AttrsDescriptor + + attrs_descriptor_available = True + # Determine if 'ids_of_folded_args' is a valid field for AttrsDescriptor + attr_desc_fields = {f.name for f in fields(AttrsDescriptor)} + ids_of_folded_args_available = "ids_of_folded_args" in attr_desc_fields + divisible_by_8_available = "divisible_by_8" in attr_desc_fields +except ImportError: + attrs_descriptor_available = False + +# Define `instance_descriptor` function with clear 
conditional handling +if attrs_descriptor_available: + + def instance_descriptor( + divisible_by_16=None, + equal_to_1=None, + ids_of_folded_args=None, + divisible_by_8=None, + ): + # Prepare the arguments for AttrsDescriptor + kwargs = { + "divisible_by_16": divisible_by_16, + "equal_to_1": equal_to_1, + } + + # Conditionally add 'ids_of_folded_args' if it's available in AttrsDescriptor + if ids_of_folded_args_available: + kwargs["ids_of_folded_args"] = ids_of_folded_args + if divisible_by_8_available: + kwargs["divisible_by_8"] = divisible_by_8 + + # Instantiate AttrsDescriptor with the prepared arguments + return AttrsDescriptor(**kwargs) + +else: + # Define a namedtuple as a fallback when AttrsDescriptor is not available + instance_descriptor = collections.namedtuple( # type: ignore[no-redef] + "instance_descriptor", + ["divisible_by_16", "equal_to_1", "ids_of_folded_args", "divisible_by_8"], + defaults=[tuple(), tuple(), tuple(), tuple()], + ) + + +@functools.lru_cache(None) +def cache_dir() -> str: + cache_dir = os.environ.get("TORCHINDUCTOR_CACHE_DIR") + if cache_dir is None: + sanitized_username = re.sub(r'[\\/:*?"<>|]', "_", getpass.getuser()) + cache_dir = os.path.join( + tempfile.gettempdir(), + "torchinductor_" + sanitized_username, + ) + os.makedirs(cache_dir, exist_ok=True) + return cache_dir + + +@contextlib.contextmanager +def fresh_inductor_cache(cache_entries=None): + """ + Contextmanager that provides a clean tmp cachedir for inductor. + + Optionally, pass a dict as 'cache_entries' to get a list of filenames and sizes + generated with this cache instance. + """ + with tempfile.TemporaryDirectory() as inductor_cache_dir: + with mock.patch.dict( + os.environ, {"TORCHINDUCTOR_CACHE_DIR": inductor_cache_dir} + ): + triton_cache_dir = os.path.join(inductor_cache_dir, "triton") + with mock.patch.dict(os.environ, {"TRITON_CACHE_DIR": triton_cache_dir}): + yield + if isinstance(cache_entries, dict): + assert len(cache_entries) == 0, "expected empty cache_entries dict" + if os.path.exists(triton_cache_dir): + files = os.listdir(triton_cache_dir) + cache_entries.update( + { + f: os.path.getsize(os.path.join(triton_cache_dir, f)) + for f in files + if ".lock" not in f + } + ) + + +def argsort(seq) -> List[int]: + # preserve original order for equal strides + getter = seq.__getitem__ + a_r = range(len(seq)) + return list(reversed(sorted(a_r, key=getter, reverse=True))) # noqa: C413 + + +@functools.lru_cache(8) +def get_dtype_size(dtype): + return torch.empty((), dtype=dtype).element_size() + + +class LineContext(NamedTuple): + context: Any + + +class IndentedBuffer: + tabwidth = 4 + + def __init__(self, initial_indent=0): + self._lines = [] + self._indent = initial_indent + + def getvaluewithlinemap(self) -> tuple[str, list[tuple[int, LineContext]]]: + buf = StringIO() + p = 1 + linemap = [] + for line in self._lines: + if isinstance(line, DeferredLineBase): + line = line() + if line is None: + continue + elif isinstance(line, LineContext): + linemap.append((p, line.context)) + continue + assert isinstance(line, str) + buf.write(line) + buf.write("\n") + p += 1 + line.count("\n") + return buf.getvalue(), linemap + + def getvalue(self) -> str: + v, _ = self.getvaluewithlinemap() + return v + + def getrawvalue(self) -> str: + buf = StringIO() + for line in self._lines: + if isinstance(line, DeferredLineBase): + line = line() + if line is None: + continue + elif isinstance(line, LineContext): + continue + assert isinstance(line, str) + # backslash implies line continuation + if 
line.endswith("\\"): + buf.write(line[:-1]) + else: + buf.write(line) + buf.write("\n") + return buf.getvalue() + + def clear(self): + self._lines.clear() + + def __bool__(self): + return bool(self._lines) + + def prefix(self): + return " " * (self._indent * self.tabwidth) + + def newline(self): + self.writeline("\n") + + def writeline(self, line): + if isinstance(line, LineContext): + self._lines.append(line) + elif isinstance(line, DeferredLineBase): + self._lines.append(line.with_prefix(self.prefix())) + elif line.strip(): + self._lines.append(f"{self.prefix()}{line}") + else: + self._lines.append("") + + def writelines(self, lines): + for line in lines: + self.writeline(line) + + def indent(self, offset=1): + @contextlib.contextmanager + def ctx(): + self._indent += offset + try: + yield + finally: + self._indent -= offset + + return ctx() + + def do_indent(self, offset=1): + self._indent += offset + + def do_unindent(self, offset=1): + self._indent -= offset + + def splice(self, other_code, strip=False): + if isinstance(other_code, IndentedBuffer): + dedent = float("inf") + for line in other_code._lines: + if not isinstance(line, LineContext) and line: + dedent = min(dedent, len(line) - len(line.lstrip())) + if math.isinf(dedent): + dedent = 0 + for line in other_code._lines: + if isinstance(line, LineContext): + self._lines.append(line) + else: + IndentedBuffer.writeline(self, line[int(dedent) :]) + else: + other_code = textwrap.dedent(other_code) + if strip: + other_code = other_code.lstrip() + if not other_code: + return + other_code = other_code.rstrip() + for line in other_code.split("\n"): + self.writeline(line) + + def __repr__(self): + return f"{type(self)}({self.getvalue()})" + + +class DeferredLineBase: + """A line that can be 'unwritten' at a later time""" + + def __init__(self, line): + if not line.strip(): + line = "" + self.line = line + + def __call__(self) -> Optional[str]: + """Returns either self.line or None to indicate the line has been 'unwritten'""" + raise NotImplementedError() + + def _new_line(self, line: str) -> DeferredLineBase: + """Returns a new deferred line with the same condition""" + raise NotImplementedError() + + def with_prefix(self, prefix): + return self._new_line(f"{prefix}{self.line}") + + def lstrip(self): + return self._new_line(self.line.lstrip()) + + def __getitem__(self, index): + return self._new_line(self.line[index]) + + def __bool__(self): + return bool(self.line) + + def __len__(self): + return len(self.line) + + +@functools.lru_cache(None) +def is_big_gpu(index): + sms = torch.cuda.get_device_properties(index).multi_processor_count + if sms < 80: # V100 + log.warning("not enough SMs to use max_autotune_gemm mode") + return False + return True + + +def use_max_autotune() -> bool: + return ( + config.max_autotune or config.max_autotune_gemm or config.search_autotune_cache + ) + + +def _use_template_for_cuda(layout, allowed_layout_dtypes: List[torch.dtype]) -> bool: + return ( + use_max_autotune() + and layout.device.type == "cuda" + and layout.dtype in allowed_layout_dtypes + and is_big_gpu(layout.device.index or 0) + ) + + +def _use_autotune_backend(backend: str) -> bool: + return backend.upper() in [ + x.strip() for x in config.max_autotune_gemm_backends.upper().split(",") + ] + + +def use_triton_template(layout, *, enable_int32=False): + layout_dtypes = [torch.float16, torch.bfloat16, torch.float32] + if enable_int32: + layout_dtypes = [torch.float16, torch.bfloat16, torch.float32, torch.int32] + return _use_template_for_cuda(layout, 
layout_dtypes) and _use_autotune_backend( + "TRITON" + ) + + +def use_cutlass_template(layout): + from .codegen.cuda.cutlass_utils import try_import_cutlass + + # Do not use cutlass template on ROCm + if torch.version.hip: + return False + + layout_dtypes = [torch.float16, torch.bfloat16, torch.float32] + res = _use_template_for_cuda(layout, layout_dtypes) and _use_autotune_backend( + "CUTLASS" + ) + + if res: + if not try_import_cutlass(): + log.warning( + "Failed to import CUTLASS lib. Please check whether " + "_inductor.config.cuda.cutlass_dir is set correctly. " + "Skipping CUTLASS backend for now." + ) + return False + return res + + +def use_aten_gemm_kernels(): + return not use_max_autotune() or _use_autotune_backend("ATEN") + + +class DebugDirManager: + counter = itertools.count(0) + prev_debug_name: str + + def __init__(self): + self.id = next(DebugDirManager.counter) + + def __enter__(self): + self.prev_debug_name = torch._dynamo.config.debug_dir_root + self.new_name = f"{self.prev_debug_name}_tmp_{self.id}" + torch._dynamo.config.debug_dir_root = self.new_name + + def __exit__(self, *args): + shutil.rmtree(self.new_name) + torch._dynamo.config.debug_dir_root = self.prev_debug_name + + +def run_and_get_code(fn, *args, **kwargs): + from .graph import GraphLowering + + compile_to_module = GraphLowering.compile_to_module + source_codes = [] + + def patched_compile_to_module(self): + mod = compile_to_module(self) + with open(mod.__file__) as f: + source_codes.append(f.read()) + return mod + + # If FX code caching is enabled, a hit prevents getting the code. + with config.patch({"fx_graph_cache": False}): + with mock.patch.object( + GraphLowering, "compile_to_module", patched_compile_to_module + ): + torch._dynamo.reset() + result = fn(*args, **kwargs) + return result, source_codes + + +def run_and_get_triton_code(fn, *args, **kwargs): + _, source_codes = run_and_get_code(fn, *args, **kwargs) + # Can have two outputs if backwards was eagerly compiled + assert ( + 1 <= len(source_codes) <= 2 + ), f"expected one or two code outputs got {len(source_codes)}" + return source_codes[0] + + +@contextlib.contextmanager +def override_lowering(aten_op, override_fn): + """ + Override the lowering of aten_op with override_fn. + The first argument of override_fn is the original lowering fn. + """ + from torch._inductor import lowering + + orig_fn = lowering.lowerings[aten_op] + try: + lowering.lowerings[aten_op] = functools.partial(override_fn, orig_fn) + yield + finally: + lowering.lowerings[aten_op] = orig_fn + + +def add_scheduler_init_hook(pre_fn, post_fn=None): + """ + Add hook functions to be called at the beginning and end of Scheduler.__init__. + Used for unit tests. + """ + from torch._inductor.scheduler import Scheduler + + orig_fn = Scheduler.__init__ + + def wrapper(scheduler, nodes): + pre_fn(scheduler, nodes) + out = orig_fn(scheduler, nodes) + if post_fn: + post_fn(scheduler, nodes) + return out + + return unittest.mock.patch.object(Scheduler, "__init__", wrapper) + + +def developer_warning(msg): + """ + Warnings that will be actionable for PyTorch developers, but not + end users. Allows us to easily disable them in stable releases but + keep them on for nightly builds. + """ + if config.developer_warnings: + log.warning(msg) + else: + log.info(msg) + + +def get_num_bytes(*args: torch.Tensor, num_in_out_args: int = 0) -> int: + """ + Return the total number of bytes the arguments of tensor type takes. 
+ + For in/out args, tensor sizes are counted twice: once for reading and + once for writing. + + The first num_in_out_args arguments are in out tensors. + """ + return sum( + arg.numel() * arg.element_size() * (1 + int(i < num_in_out_args)) + for i, arg in enumerate(args) + if isinstance(arg, torch.Tensor) + ) + + +def create_bandwidth_info_str(ms, num_gb, gb_per_s, prefix="", suffix="", color=True): + info_str = f"{prefix}{ms:.3f}ms \t{num_gb:.3f} GB \t {gb_per_s:7.2f}GB/s{suffix}" + slow = ms > 0.012 and gb_per_s < 650 + return red_text(info_str) if color and slow else info_str + + +def get_benchmark_name(): + """ + An experimental API used only when config.benchmark_kernel is true. + + The benchmark name is only available at codegen time. So we can not + directly call it in benchmark_all_kernels which is run after codegen. + + The function assumes the argument after --only is the benchmark name. + It works for torchbench.py/hugginface.py/timm_models.py. But for ad-hoc + scripts, this function may return None. + + There are 2 flavors of --only argument we need handle: + 1. --only model_name + 2. --only=model_name + """ + try: + idx = sys.argv.index("--only") + if ( + idx + 1 < len(sys.argv) + and len(sys.argv[idx + 1]) > 0 + and sys.argv[idx + 1][0] != "-" + ): + return sys.argv[idx + 1] + except ValueError: + pass + + for arg in sys.argv: + if arg.startswith("--only="): + return arg[len("--only=") :] + + +def is_ones(items): + return all(x == 1 for x in items) + + +def is_zeros(items): + return all(x == 0 for x in items) + + +def is_cpu_device(inputs): + return all( + item.device == torch.device("cpu") + for item in inputs + if isinstance(item, torch.Tensor) + ) + + +def get_sympy_Expr_dtype(val: sympy.Expr) -> torch.dtype: + assert isinstance( + val, sympy.Expr + ), "only support sympy.Expr as input to get_sympy_Expr_dtype" + if val.is_integer: # type: ignore[attr-defined] + return torch.int64 + else: + return torch.float64 + + +@contextlib.contextmanager +def maybe_profile(should_profile, *args, **kwargs): + if should_profile: + with torch.profiler.profile(*args, **kwargs) as p: + yield p + else: + yield + + +def triton_config_to_hashable(cfg): + """ + Convert triton config to a tuple that can uniquely identify it. We can use + the return value as a dictionary key. 
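+
+    A rough usage sketch (``cfg`` is assumed to be a ``triton.Config``-like
+    object exposing ``kwargs``, ``num_warps`` and ``num_stages``)::
+
+        key = triton_config_to_hashable(cfg)
+        best_time_per_config = {key: measured_ms}  # usable as a dict key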
+ """ + items = sorted(cfg.kwargs.items()) + items.append(("num_warps", cfg.num_warps)) + items.append(("num_stages", cfg.num_stages)) + return tuple(items) + + +def parallel_num_threads(): + threads = config.cpp.threads + if threads < 1: + threads = torch.get_num_threads() + return threads + + +HAS_COLORAMA = True +try: + import colorama +except ImportError: + HAS_COLORAMA = False + + +def _color_text(msg, color): + if not HAS_COLORAMA: + return msg + + return getattr(colorama.Fore, color.upper()) + msg + colorama.Fore.RESET + + +def green_text(msg): + return _color_text(msg, "green") + + +def yellow_text(msg): + return _color_text(msg, "yellow") + + +def red_text(msg): + return _color_text(msg, "red") + + +def blue_text(msg): + return _color_text(msg, "blue") + + +@functools.lru_cache(None) +def get_device_tflops(dtype): + from triton.testing import get_max_simd_tflops, get_max_tensorcore_tflops + + assert dtype in (torch.float16, torch.bfloat16, torch.float32) + + if inspect.signature(get_max_simd_tflops).parameters.get("clock_rate"): + # Triton API change in https://github.com/openai/triton/pull/2293 + from torch._utils_internal import max_clock_rate + + sm_clock = max_clock_rate() + if dtype in (torch.float16, torch.bfloat16): + return get_max_tensorcore_tflops(dtype, sm_clock) + + if torch.backends.cuda.matmul.allow_tf32: + return get_max_tensorcore_tflops(torch.float32, sm_clock) + else: + return get_max_simd_tflops(torch.float32, sm_clock) + else: + if dtype in (torch.float16, torch.bfloat16): + return get_max_tensorcore_tflops(dtype) + + if torch.backends.cuda.matmul.allow_tf32: + return get_max_tensorcore_tflops(torch.float32) + else: + return get_max_simd_tflops(torch.float32) + + +@functools.lru_cache(None) +def get_gpu_dram_gbps(): + from triton.testing import get_dram_gbps + + return get_dram_gbps() + + +def is_welford_reduction(reduction_type): + return reduction_type.startswith("welford") + + +def reduction_num_outputs(reduction_type): + return 3 if is_welford_reduction(reduction_type) else 1 + + +def get_max_y_grid(): + return 65535 + + +def is_linux() -> bool: + return platform.system() == "Linux" + + +def has_free_symbols(itr: Iterable[Any]): + return any(isinstance(x, sympy.Expr) and not x.is_number for x in itr) + + +def is_dynamic(*args): + from . import ir + + for t in args: + if isinstance(t, ir.TensorBox): + if has_free_symbols(t.data.get_size()) or ( + hasattr(t.data, "get_stride") and has_free_symbols(t.data.get_stride()) + ): + return True + elif isinstance(t, (ir.StorageBox, ir.BaseView, ir.ComputedBuffer)): + assert hasattr(t, "get_size") and hasattr(t, "get_stride") + if has_free_symbols(t.get_size()) or has_free_symbols(t.get_stride()): + return True + elif not isinstance(t, ir.IRNode): + continue + else: + raise TypeError(f"unexpected type for is_dynamic {type(t)}") + + return False + + +# Placeholder strings used in triton codegen. +class Placeholder(enum.Enum): + # The placeholder for the actual name of a triton kernel. + # e.g. for "def triton_" it would be "triton_" + KERNEL_NAME = "KERNEL_NAME" + + # The descriptive name of the triton kernel; when unique_kernel_names = False, this + # placeholder will be replaced with a string with more information. 
+ DESCRIPTIVE_NAME = "DESCRIPTIVE_NAME" + + +def pass_execution_and_save(func, gm, msg): + from .pattern_matcher import stable_topological_sort + + with tempfile.NamedTemporaryFile( + mode="w", + encoding="utf-8", + delete=False, + ) as f: + before_io = io.StringIO() + after_io = io.StringIO() + print(f"Before:\n{gm.graph}", file=f) + print(gm.graph, file=before_io) + start_time = datetime.now() + func(gm.graph) + time_elapsed = datetime.now() - start_time + # recompile graph + stable_topological_sort(gm.graph) + gm.graph.lint() + gm.recompile() + + print(f"After:\n{gm.graph}", file=f) + print(gm.graph, file=after_io) + t = before_io.getvalue() == after_io.getvalue() + log.info( + "%s, save before/after graph to %s, graph before/after are the same = %s, time elapsed = %s", + msg, + f.name, + t, + time_elapsed, + ) + + +def is_collective(node): + from . import ir + + return isinstance(node, ir.CollectiveKernel) or type(node) == ir._CollectiveKernel + + +def is_wait(node): + from . import ir + + return isinstance(node, ir.Wait) or type(node) == ir._WaitKernel + + +def num_fw_fixed_arguments(dynamo_gm_num_inputs: int, aot_fw_gm_num_inputs: int): + "Computes the number of inputs to the aot fw graph which have fixed addresses (params and buffers)" + num_rng_seed_offset_inputs = ( + 2 if torch._functorch.config.functionalize_rng_ops else 0 + ) + return aot_fw_gm_num_inputs - dynamo_gm_num_inputs - num_rng_seed_offset_inputs + + +def count_tangents(fx_g: torch.fx.GraphModule): + """ + Infers which inputs are static for a backwards graph + """ + + def is_saved_tensor(x): + return ( + "tangents" not in x.name + and "bwd_seed" not in x.name + and "bwd_base_offset" not in x.name + ) + + arg_count = 0 + static_arg_idxs = [] + for n in fx_g.graph.nodes: + if n.op == "placeholder": + if is_saved_tensor(n): + static_arg_idxs.append(arg_count) + arg_count += 1 + + assert static_arg_idxs == list(range(len(static_arg_idxs))) + return len(static_arg_idxs) + + +@dataclasses.dataclass +class BoxedBool: + value: bool + + def __bool__(self): + return self.value + + @staticmethod + def disable(obj): + if isinstance(obj, BoxedBool): + obj.value = False + return obj + return False + + +@contextlib.contextmanager +def collect_defined_kernels(kernel_list): + from .codegen.wrapper import WrapperCodeGen + + orig_define_kernel = WrapperCodeGen.define_kernel + + def new_define_kernel(wrapper, name, kernel_code, metadata, *args, **kwargs): + nonlocal kernel_list + kernel_list.append(kernel_code) + return orig_define_kernel(wrapper, name, kernel_code, metadata, *args, **kwargs) + + with unittest.mock.patch.object(WrapperCodeGen, "define_kernel", new_define_kernel): + yield diff --git a/MLPY/Lib/site-packages/torch/_inductor/virtualized.py b/MLPY/Lib/site-packages/torch/_inductor/virtualized.py new file mode 100644 index 0000000000000000000000000000000000000000..fdd641b40eea6e1b935ddd7a87d4e4208ab023f8 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/virtualized.py @@ -0,0 +1,351 @@ +""" +This file provides a number of "global" variables/handlers that are actually +thread local and dynamically scoped, with Inductor patching them to various +implementations depending on the situation. + +These handlers are interacted with in a fairly stylized way. Typically, +we will import V from this module:: + + from .virtualized import V + +Various handlers are accessible as attributes on this module; for example, +you might access ``V.graph.sizevars.size_hint`` to resolve a size hint associated with +a number. 
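+
+For instance, a rough sketch (it assumes a graph handler has already been
+installed and that ``expr`` is some sympy expression)::
+
+    from torch._inductor.virtualized import V
+
+    hint = V.graph.sizevars.size_hint(expr)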
+ +There are a few distinct usage patterns for virtualized global variables: + +1. Implicit argument passing. Examples: ``V.current_node``, ``V.aot_compilation``. + Use ``V.set_current_node`` to change what the current node is while we're + executing some region of code, so code inside that region can query ``V.current_node`` + to find out what it is. This is often more convenient than manually threading + the current node as an argument through all call stacks. + +2. Per-compilation global state. Examples: ``V.fake_mode``, ``V.graph``. For a + given ``compile_fx`` invocation, these typically don't change, but they are + associated with some internal state so they cannot just be global functions. + We install these objects at the beginning of compilation and then you can + conveniently access them without having to pass them around. + +3. Alternate define-by-run interpretations. Examples: ``V.ops``, ``V.kernel``. + A commonly used IR in Inductor is define-by-run: instead of maintaining + explicit syntax data structures, we instead represent loop bodies as + callable functions, which internally invoke operations defined on + ``V.ops``. To perform semantic analysis, print or code generate these + operations, we dynamically patch ``V.ops`` with an alternate handler with + the intended semantics and then run the callable function. For example, to + extract out a traditional (FX) graph representation of the define-by-run + IR, simply install a handler that records each ``ops`` call to a graph. + + TODO: Define a parent class / protocol that defines all of the operations + V.ops is expected to support. + +It is typically an error to access a virtualized global without having installed +an appropriate handler (you will get a NullHandler), although in some cases we +provide a default implementation. + +One last thing: although most virtualized globals are accessed via ``V``, ``ops`` is +ubiquitous enough to have its own top level variable, so you will typically see +``ops.constant(...)`` rather than ``V.ops.constant(...)``. In fact, these are not +equivalent; the former interface supports arithmetic overloads like ``x + y`` +instead of forcing ``ops.add(x, y)``, so it should be preferred. + +Some operators are seemingly unused, but they are implicitly used by ops_wrapper. +In particular, we typically have an operator for every basic pointwise PyTorch operation +supported. +""" + +from __future__ import annotations + +from contextlib import AbstractContextManager, contextmanager +from threading import local +from typing import Any, Callable, Generic, List, Type, TYPE_CHECKING, TypeVar, Union + +from .ops_handler import ( # noqa: F401 + KernelFormatterHandler, + MockHandler, + OpsHandler, + ReductionType, + StoreMode, + WrapperHandler, +) + +if TYPE_CHECKING: + import torch + from torch._inductor.debug import DebugContext + from torch._inductor.graph import GraphLowering + from torch._inductor.ir import InterpreterShim + from torch._subclasses import FakeTensorMode + +threadlocal = local() + +T = TypeVar("T") + + +class NullHandler: + """ + Sentinel indicating that a global variable is unset ala None. Typically, + attempting to access the global variable before it's set is an error, but with + NullHandler it won't fail until you try to access an attribute on it. + """ + + pass + + +class Virtualized(Generic[T]): + """ + Implements a global variable that redirects via thread local variable + (NB: construct this class to create the global variable; this is not + a singleton class!) 
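+
+    A rough usage sketch (``MyHandler`` is illustrative, not an upstream class)::
+
+        _my_var: Virtualized[MyHandler] = Virtualized("my_var", NullHandler)
+
+        with _my_var._set_handler(MyHandler()):
+            ...  # code in this block sees MyHandler() via _my_var._get_handler()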
+ + This allows us to swap in different op implementations in codegen. + + NB: Despite the fact that we typically call these "handlers" (e.g., NullHandler is + the default value of the variable), we sometimes use these variables to + store other things, like booleans. + """ + + def __init__(self, vname: str, default: Union[Callable[[], T], Type[NullHandler]]): + self._key: str = f"__torchinductor_{vname}" + self._default = default + + def _set_handler(self, value: T) -> AbstractContextManager[None]: + prior = self._get_handler() + setattr(threadlocal, self._key, value) + + @contextmanager + def ctx(): + try: + yield + finally: + self._set_handler(prior) + + return ctx() + + def _get_handler(self) -> T: + try: + return getattr(threadlocal, self._key) + except AttributeError: + # TODO: To be honest, I feel we probably should just error in this + # case, instead of making a null handler that will probably error + # when you getattr on it + return self._default() # type: ignore[return-value] + + def __getattr__(self, name: str) -> Any: + return getattr(self._get_handler(), name) + + +class NullKernelHandler(NullHandler): + """ + We need access `V.kernel.removed_buffers` in DeferredLine class when there + is no kernel in the context. This happens when codegening the wrapper. + Initialize `removed_buffers` and `inplaced_to_remove` explicitly so we don't + need call 'getattr' with default value which is error prone to typo in + attribute name. + """ + + def __init__(self): + super().__init__() + self.removed_buffers = set() + self.inplaced_to_remove = set() + self.index_dtype = "tl.int64" + + +_ops: Virtualized[OpsHandler[Any]] = Virtualized("ops", MockHandler) +_graph: Virtualized[GraphLowering] = Virtualized("graph", NullHandler) +_real_inputs: Virtualized[List[torch.Tensor]] = Virtualized("real_inputs", NullHandler) +_fake_mode: Virtualized[FakeTensorMode] = Virtualized("fake_mode", NullHandler) +_kernel: Virtualized[NullKernelHandler] = Virtualized( + "kernel", NullKernelHandler +) # TODO: improve type +_debug: Virtualized[DebugContext] = Virtualized("debug", NullHandler) +_interpreter: Virtualized[InterpreterShim] = Virtualized("interpreter", NullHandler) +_aot_compilation: Virtualized[bool] = Virtualized("aot_compilation", NullHandler) +_current_node: Virtualized[torch.fx.Node] = Virtualized("current_node", NullHandler) + + +class OpsValue: + """The return type of most ops calls. + + This exists so we can overload magic methods, and write mathematical + expressions much more fluently. 
So instead of + + ops.add(ops.mul(ops.mul(ops.sub(ops.mul(_Ap2, x), _Ap3), x), x), _1) + + we can write + + (_Ap2 * x - _Ap3) * x * x + _1 + + """ + + value: Any + + def __init__(self, value): + self.value = value + + def __str__(self): + return str(self.value) + + def __repr__(self): + return f"OpsValue({self.value!r})" + + def __add__(self, other): + return ops.add(self, other) + + def __mul__(self, other): + return ops.mul(self, other) + + def __sub__(self, other): + return ops.sub(self, other) + + def __neg__(self): + return ops.neg(self) + + def __truediv__(self, other): + return ops.truediv(self, other) + + def __floordiv__(self, other): + return ops.floordiv(self, other) + + def __mod__(self, other): + return ops.mod(self, other) + + def __pow__(self, other): + return ops.pow(self, other) + + def __lt__(self, other): + return ops.lt(self, other) + + def __le__(self, other): + return ops.le(self, other) + + def __eq__(self, other): + return ops.eq(self, other) + + def __ne__(self, other): + return ops.ne(self, other) + + def __gt__(self, other): + return ops.gt(self, other) + + def __ge__(self, other): + return ops.ge(self, other) + + def __and__(self, other): + return ops.bitwise_and(self, other) + + def __or__(self, other): + return ops.bitwise_or(self, other) + + def __xor__(self, other): + return ops.bitwise_xor(self, other) + + def __invert__(self): + return ops.bitwise_not(self) + + def __rshfit__(self, n): + return ops.bitwise_right_shift(self, n) + + def __lshift__(self, n): + return ops.bitwise_left_shift(self, n) + + +class OpsWrapper: + """This wraps any returned IR values into an `OpsValue` instance, so that we + can overload the magic methods for writing mathematical expressions fluently. + """ + + def __getattr__(self, name): + def inner(*args, **kwargs): + new_args = [OpsWrapper._unwrap(a) for a in args] + new_kwargs = {k: OpsWrapper._unwrap(v) for k, v in kwargs.items()} + return OpsWrapper._wrap(getattr(_ops, name)(*new_args, **new_kwargs)) + + return inner + + @staticmethod + def _unwrap(x): + if isinstance(x, (list, tuple)): + return tuple(OpsWrapper._unwrap(v) for v in x) + if isinstance(x, OpsValue): + return x.value + return x + + @staticmethod + def _wrap(x): + if isinstance(x, (list, tuple)): + return tuple(OpsValue(v) for v in x) + return OpsValue(x) + + @staticmethod + def indirect_indexing(index, size, check=True): + # Returns a sympy value, not IR value + index = OpsWrapper._unwrap(index) + return _ops.indirect_indexing(index, size, check) + + +ops = OpsWrapper() + + +class _V: + MockHandler = MockHandler + KernelFormatterHandler = KernelFormatterHandler + WrapperHandler = WrapperHandler + + set_ops_handler: Callable[[Any], Any] = _ops._set_handler + get_ops_handler: Callable[[], Any] = _ops._get_handler + set_graph_handler: Callable[[GraphLowering], Any] = _graph._set_handler + set_real_inputs: Callable[[Any], Any] = _real_inputs._set_handler + get_real_inputs: Callable[[], Any] = _real_inputs._get_handler + set_fake_mode: Callable[[Any], Any] = _fake_mode._set_handler + get_fake_mode: Callable[[], Any] = _fake_mode._get_handler + set_kernel_handler: Callable[[Any], Any] = _kernel._set_handler + set_debug_handler: Callable[[Any], Any] = _debug._set_handler + set_interpreter_handler: Callable[[Any], Any] = _interpreter._set_handler + set_aot_compilation: Callable[[bool], Any] = _aot_compilation._set_handler + get_aot_compilation: Callable[[], Any] = _aot_compilation._get_handler + set_current_node: Callable[[Any], Any] = _current_node._set_handler + 
get_current_node: Callable[[], Any] = _current_node._get_handler + + @property + def ops(self) -> OpsHandler[Any]: + """The operator handler specific to the current codegen task""" + return _ops._get_handler() + + @property + def graph(self) -> GraphLowering: + """The graph currently being generated""" + return _graph._get_handler() + + @property + def real_inputs(self): + """non-fake example inputs""" + return _real_inputs._get_handler() + + @property + def fake_mode(self): + """The graph currently being generated""" + return _fake_mode._get_handler() + + @property + def kernel(self): + """The kernel currently being generated""" + return _kernel._get_handler() + + @property + def debug(self): + return _debug._get_handler() + + @property + def interpreter(self): + return _interpreter._get_handler() + + @property + def aot_compilation(self): + return _aot_compilation._get_handler() + + @property + def current_node(self): + return _current_node._get_handler() + + +V = _V() diff --git a/MLPY/Lib/site-packages/torch/_inductor/wrapper_benchmark.py b/MLPY/Lib/site-packages/torch/_inductor/wrapper_benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..730bdeaf2b927a225b4867615a7c2c1efc8f1ecd --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_inductor/wrapper_benchmark.py @@ -0,0 +1,299 @@ +import dataclasses +import tempfile +from collections import defaultdict + +import torch +from torch.autograd import DeviceType +from .utils import create_bandwidth_info_str, do_bench, get_num_bytes + +_kernel_category_choices = [ + "foreach", + "persistent_reduction", + "pointwise", + "reduction", + "split_scan", + "template", +] + + +def get_kernel_category_by_source_code(src_code): + """ + Similar to get_kernel_category but use the source code. Call this API + if we have not compile the src_code to module yet. + """ + choices = [ + ch for ch in _kernel_category_choices if f"@triton_heuristics.{ch}" in src_code + ] + if len(choices) == 1: + return choices[0] + else: + return "unknown" + + +def get_kernel_category(kernel_mod): + """ + Given the module defining a triton kernel, return the category of the kernel. + Category can be one of: + - pointwise + - reduction + - persistent_reduction + + Currently we simply decide the category depending on what decorator is imported + by the kernel. + """ + choices = [ch for ch in _kernel_category_choices if ch in kernel_mod.__dict__] + if len(choices) == 1: + return choices[0] + else: + return "unknown" + + +def get_triton_kernel(mod): + from torch._inductor.triton_heuristics import CachingAutotuner + + cand_list = [ + v + for k, v in mod.__dict__.items() + if k.startswith("triton_") and isinstance(v, CachingAutotuner) + ] + assert len(cand_list) == 1 + return cand_list[0] + + +def benchmark_all_kernels(benchmark_name, benchmark_all_configs): + """ + An experimental API used only when config.benchmark_kernel is true. + + Run the kernel benchmarks for all the kernels cached in PyCodeCache. + Used in the compiled modules. + + Put this method here rather than codegen it for convenience since its implementation + does not change based on different graph modules being compiled. 
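+
+    Hypothetical call site (the benchmark name here is made up)::
+
+        benchmark_all_kernels("my_benchmark", benchmark_all_configs=False)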
+ """ + from torch._inductor.codecache import PyCodeCache + + nfound = 0 + for kernel_key, kernel_mod in PyCodeCache.cache.items(): + if not hasattr(kernel_mod, "get_args") or not hasattr(kernel_mod, "call"): + continue + + triton_kernel = get_triton_kernel(kernel_mod) + kernel_category = get_kernel_category(kernel_mod) + args = kernel_mod.get_args() + num_in_out_ptrs = len( + [ + arg_name + for arg_name in triton_kernel.fn.arg_names + if arg_name.startswith("in_out_ptr") + ] + ) + num_gb = triton_kernel.inductor_meta.get("kernel_num_gb", None) + if num_gb is None: + num_gb = get_num_bytes(*args, num_in_out_args=num_in_out_ptrs) / 1e9 + + def get_info_str(ms, n_regs, n_spills, shared, prefix=""): + if not any(x is None for x in [n_regs, n_spills, shared]): + kernel_detail_str = ( + f" {n_regs:3} regs {n_spills:3} spills {shared:8} shared mem" + ) + else: + kernel_detail_str = "" + + gb_per_s = num_gb / (ms / 1e3) + return create_bandwidth_info_str( + ms, num_gb, gb_per_s, prefix=prefix, suffix=kernel_detail_str + ) + + kernel_desc = ( + f"{benchmark_name:20} {kernel_category[:3].upper()} {kernel_key[:10]}" + ) + if benchmark_all_configs: + assert hasattr(kernel_mod, "benchmark_all_configs") + bench_result = kernel_mod.benchmark_all_configs(args) + print(kernel_desc) + for launcher, ms in bench_result.items(): + print( + f" {get_info_str(ms, launcher.n_regs, launcher.n_spills, launcher.shared)} @ {launcher.config}" + ) + else: + ms = do_bench(lambda: kernel_mod.call(args), rep=40, fast_flush=True) + assert ( + len(triton_kernel.launchers) == 1 + ), "Autotuner should have selected the best config" + launcher = triton_kernel.launchers[0] + print( + get_info_str( + ms, + launcher.n_regs, + launcher.n_spills, + launcher.shared, + prefix=f"{kernel_desc} ", + ) + ) + + nfound += 1 + if nfound == 0: + print( + "No kernel with benchmark functionality found. Make sure you run inductor with config.benchmark_kernel being True" + ) + + +@dataclasses.dataclass +class ProfileEvent: + category: str + key: str + self_cuda_time_ms: float + # the benchmark is run multiple times and we average the count across all the + # runs. It should be an integer but define a float just in case. + count: float + + +def parse_profile_event_list(benchmark_name, event_list, wall_time_ms, nruns): + def get_self_cuda_time(ev): + """ + ev.self_cuda_time_total is in microsecond. Convert to millisecond. 
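+
+        For example, with nruns=10 and ev.self_cuda_time_total == 15000 (us),
+        this returns 1.5 (ms per run).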
+ """ + return ev.self_cuda_time_total / 1000 / nruns + + all_events = defaultdict(list) + + def add_event(ev, category): + profile_ev = ProfileEvent( + category=category, + key=ev.key, + self_cuda_time_ms=get_self_cuda_time(ev), + count=ev.count / nruns, # average across all runs + ) + all_events[category].append(profile_ev) + + for ev in event_list: + assert not ev.is_legacy, "Don't support the legacy profiler" + if ev.device_type == DeviceType.CPU: + # ignore the event on CPU side + continue + + category = "unknown" + if ev.key.startswith("triton_"): + if ev.key.startswith("triton_poi"): + category = "triton_pointwise" + elif ev.key.startswith("triton_red"): + category = "triton_reduction" + elif ev.key.startswith("triton_per"): + category = "triton_persistent_reduction" + else: + category = "triton_unknown" + + add_event(ev, category) + + def report_category(category, profile_events): + from tabulate import tabulate + + profile_events.sort(key=lambda ev: ev.self_cuda_time_ms, reverse=True) + + rows = [] + total_time = 0.0 + print(f"\n == {category} category kernels == ") + for ev in profile_events: + total_time += ev.self_cuda_time_ms + percent = f"{ev.self_cuda_time_ms / wall_time_ms * 100:.2f}%" + rows.append([ev.key[:120], ev.self_cuda_time_ms, ev.count, percent]) + rows.append( + ["Total", total_time, "", f"{total_time / wall_time_ms * 100:.2f}%"] + ) + print( + tabulate( + rows, headers=["Kernel", "Self CUDA TIME (ms)", "Count", "Percent"] + ) + ) + return total_time + + def report(): + category_list = [ + "triton_pointwise", + "triton_reduction", + "triton_persistent_reduction", + "triton_unknown", + "unknown", + ] + assert set(all_events.keys()).issubset( + set(category_list) + ), f"{list(all_events.keys())}" + + per_category_wall_time = {} + total_cuda_ms = 0.0 + for category in category_list: + if category in all_events: + _time = report_category(category, all_events[category]) + per_category_wall_time[category] = _time + total_cuda_ms += _time + + gpu_busy_percent = f"{total_cuda_ms / wall_time_ms * 100:.2f}%" + print(f"\nPercent of time when GPU is busy: {gpu_busy_percent}") + print(f"Total wall time {wall_time_ms:.3f} ms") + + # output such a line so we can gather such line from all compiled modules from all + # benchmarks and tabulate it! + # Columns: benchmark_name, pointwise_percent, reduction_percent, persistent_reduction_percent, + # unknown_category_percent, GPU_busy_percent, wall_time_ms + tabulate_line = f"Output for tabulate: {benchmark_name}" + for category in category_list: + percent = ( + f"{per_category_wall_time.get(category, 0.0) / wall_time_ms * 100:.2f}%" + ) + tabulate_line += f", {percent}" + tabulate_line += f", {gpu_busy_percent}, {wall_time_ms:.3f}ms" + + print(tabulate_line) + + report() + + +def compiled_module_main(benchmark_name, benchmark_compiled_module_fn): + """ + This is the function called in __main__ block of a compiled module. 
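+
+    A compiled module typically ends with something along these lines (a
+    sketch only; the exact generated code may differ)::
+
+        if __name__ == "__main__":
+            from torch._inductor.wrapper_benchmark import compiled_module_main
+            compiled_module_main("my_benchmark", benchmark_compiled_module)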
+ """ + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + "--benchmark-kernels", + "-k", + action="store_true", + help="Whether to benchmark each individual kernels", + ) + parser.add_argument( + "--benchmark-all-configs", + "-c", + action="store_true", + help="Whether to benchmark each individual config for a kernel", + ) + parser.add_argument( + "--profile", + "-p", + action="store_true", + help="Whether to profile the compiled module", + ) + args = parser.parse_args() + + if args.benchmark_kernels: + benchmark_all_kernels(benchmark_name, args.benchmark_all_configs) + else: + times = 10 + repeat = 10 + wall_time_ms = benchmark_compiled_module_fn(times=times, repeat=repeat) * 1000 + + if not args.profile: + return + + with torch.profiler.profile(record_shapes=True) as p: + benchmark_compiled_module_fn(times=times, repeat=repeat) + + path = f"{tempfile.gettempdir()}/compiled_module_profile.json" + p.export_chrome_trace(path) + print(f"Profiling result for a compiled module of benchmark {benchmark_name}:") + print(f"Chrome trace for the profile is written to {path}") + event_list = p.key_averages(group_by_input_shape=True) + print(event_list.table(sort_by="self_cuda_time_total", row_limit=10)) + parse_profile_event_list( + benchmark_name, event_list, wall_time_ms, times * repeat + ) diff --git a/MLPY/Lib/site-packages/torch/_jit_internal.py b/MLPY/Lib/site-packages/torch/_jit_internal.py new file mode 100644 index 0000000000000000000000000000000000000000..1de2be2ad72b3d235bfe160198a54f2020670d8c --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_jit_internal.py @@ -0,0 +1,1510 @@ +""" +The weak_script annotation needs to be here instead of inside torch/jit/ so it +can be used in other places in torch/ (namely torch.nn) without running into +circular dependency problems +""" + +import ast +import builtins +import collections +import contextlib +import enum +import inspect +import io +import pickle +import sys +import threading +import types +import typing +import warnings +import weakref +from textwrap import dedent +from typing import ( # noqa: F401 + Any, + Callable, + Dict, + Final, + ForwardRef, + Generic, + get_args, # new in 3.8 + get_origin, # new in 3.8 + List, + Optional, + Tuple, + Type, + TypeVar, + Union, +) + +import torch + +# This is needed. `torch._jit_internal` is imported before `torch.distributed.__init__`. +# Explicitly ask to import `torch.distributed.__init__` first. +# Otherwise, "AttributeError: module 'torch' has no attribute 'distributed'" is raised. +import torch.distributed.rpc +import torch.package._mangling as package_mangling +from torch._awaits import _Await +from torch._C import _Await as CAwait, Future as CFuture +from torch._sources import fake_range, get_source_lines_and_file, parse_def +from torch.futures import Future + +IS_PY39_PLUS: Final[bool] = sys.version_info >= (3, 9) +IS_PY310_PLUS: Final[bool] = sys.version_info >= (3, 10) + +BuiltinUnionType: Union[Type, Tuple[Type, ...]] +if sys.version_info >= (3, 10): + # NOTE: IS_PY310_PLUS doesn't work with mypy. + # cf. https://mypy.readthedocs.io/en/stable/common_issues.html#python-version-and-system-platform-checks + BuiltinUnionType = types.UnionType +else: + BuiltinUnionType = () # trick: this makes isinstance short circuit. 
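+    # With an empty tuple, isinstance(x, BuiltinUnionType) is simply always
+    # False on Python < 3.10, so callers can avoid a separate version check.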
+ +LockType: Type +try: + import _thread + + LockType = _thread.LockType +except ImportError: + import _dummy_thread # type: ignore[import-not-found] + + LockType = _dummy_thread.LockType + +# Wrapper functions that can call either of 2 functions depending on a boolean +# argument +boolean_dispatched: "weakref.WeakKeyDictionary[Callable, Dict[str, Callable]]" = ( + weakref.WeakKeyDictionary() +) # noqa: T484 + + +FAKE_FILENAME_PREFIX = "__torch_jit_dataclass" + + +class SourceLoader: + def __init__(self): + self.content = {} + + def cache(self, fn, source): + self.content[fn] = source + + def get_source(self, fn): + return self.content.get(fn) + + +loader = SourceLoader() + + +def createResolutionCallbackFromEnv(lookup_base): + """ + Creates a resolution callback that will look up qualified names in an + environment, starting with `lookup_base` for the base of any qualified + names, then proceeding down the lookup chain with the resolved object. + + You should not use this directly, it should only be used from the other + createResolutionCallbackFrom* functions. + """ + + def lookupInModule(qualified_name, module): + if "." in qualified_name: + base, remaining_pieces = qualified_name.split(".", maxsplit=1) + module_value = getattr(module, base) + return lookupInModule(remaining_pieces, module_value) + else: + return getattr(module, qualified_name) + + def parseNestedExpr(expr, module) -> Tuple[Any, int]: + i = 0 + while i < len(expr) and expr[i] not in (",", "[", "]"): + i += 1 + + # Special case logic for the empty Tuple as a subscript (used + # in the type annotation `Tuple[()]`) + if expr[:i] == "()": + return (), i + + base = lookupInModule(expr[:i].strip(), module) + assert base is not None, f"Unresolvable type {expr[:i]}" + if i == len(expr) or expr[i] != "[": + return base, i + + assert expr[i] == "[" + parts = [] + while expr[i] != "]": + part_len = 0 + i += 1 + part, part_len = parseNestedExpr(expr[i:], module) + parts.append(part) + i += part_len + if len(parts) > 1: + return base[tuple(parts)], i + 1 + else: + return base[parts[0]], i + 1 + + def parseExpr(expr, module): + try: + value, len_parsed = parseNestedExpr(expr, module) + assert len_parsed == len( + expr + ), "whole expression was not parsed, falling back to c++ parser" + return value + except Exception: + """ + The python resolver fails in several cases in known unit tests, and is intended + to fall back gracefully to the c++ resolver in general. For example, python 2 style + annotations which are frequent in our unit tests often fail with types e.g. int not + resolvable from the calling frame. + """ + return None + + return lambda expr: parseExpr(expr, lookup_base) + + +def createResolutionCallbackFromFrame(frames_up: int = 0): + """ + Creates a function which, given a string variable name, + returns the value of the variable in the scope of the caller of + the function which called createResolutionCallbackFromFrame (by default). + + This is used to enable access in-scope Python variables inside + TorchScript fragments. + + frames_up is number of additional frames to go up on the stack. + The default value is 0, which correspond to the frame of the caller + of createResolutionCallbackFromFrame. Also for example, if frames_up is set + to 1, then the frame of the caller's caller of createResolutionCallbackFromFrame + will be taken. 
+ + For example, the following program prints 2:: + + def bar(): + cb = createResolutionCallbackFromFrame(1) + print(cb("foo")) + + def baz(): + foo = 2 + bar() + + baz() + """ + frame = inspect.currentframe() + i = 0 + while i < frames_up + 1: + assert frame is not None + frame = frame.f_back + i += 1 + + assert frame is not None + f_locals = frame.f_locals + f_globals = frame.f_globals + + class env: + def __getattr__(self, key): + if key in f_locals: + return f_locals[key] + elif key in f_globals: + return f_globals[key] + elif key in dir(builtins): + return getattr(builtins, key) + + return createResolutionCallbackFromEnv(env()) + + +def get_closure(fn): + """ + Get a dictionary of closed over variables from a function + """ + captures = {} + captures.update(fn.__globals__) + + for index, captured_name in enumerate(fn.__code__.co_freevars): + captures[captured_name] = fn.__closure__[index].cell_contents + + return captures + + +# [local resolution in python] +# Depending on where a variable is defined, and where it is used, we may +# or may not be able to recover its value when recursively compiling a +# script function. Remember in the general case, a module or function is +# first defined and then later scripted. This means we do not have a +# chance to capture the active frames when the function is defined. Hence any +# name resolution has to happen later on the created closure. The way +# python captures type annotations restricts what we can recover. The +# follow example illustrates the different cases: +# +# class MyGlobalClass: +# ... +# def my_local_scope(): +# @torch.jit.script +# class MyClass: +# ... +# @torch.jit.script +# class MyClassUsedAsVar: +# ... +# def eg(x: MyClass, y: MyGlobalClass): +# a_local_capture : Foo +# return MyClassUsedAsVar(x) +# +# MyGlobalClass is defined in the __globals__ dictionary of function +# 'eg', so it is always recoverable. my_local_scope introduces a new local +# variable scope in the function. Classes defined here are only visible as +# local variables. For the case of MyClassUsedAsVar, it is captured +# because it is used as a variable inside the body of the function, and we +# can resolve it using the captures returned from `get_closure`. However, +# the type annotations are not captured by the closure. In Python +# 3.0--3.9, the _value_ of MyClass and MyGlobalClass will be available as +# annotations on `eg``, but starting in Python 4.0, they will represented as +# strings and no longer present. Furthermore, since the body of `eg` does +# not reference those names, they do not appear in the list of closed over +# variables. In Python 2.x, type annotations are in comments, leading to a +# similar situation where their definitions are not available. We anticipate +# that most users will not run into this issue because their modules and +# functions will be defined at a global scope like MyGlobalClass. In cases +# where they are not, it is possible to work around issues by declaring the +# values global in the function. +# In Python 3.9 declaring class as global will make it invisible to +# `inspect.getsource`, see https://bugs.python.org/issue42666 . +# This could be worked around by manualy adding it to `global()` dictionary. 
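# --- Illustrative sketch (not from the vendored file) ------------------------
# get_closure() above merges fn.__globals__ with the cell contents of fn's
# free variables; that is what makes an object that is used as a *value*
# (MyClassUsedAsVar in the note above) recoverable at script time. The helper
# names below are hypothetical:

def _make_scale_example(factor):
    def scale(x):
        return x * factor
    return scale

_scale_example = _make_scale_example(3)
_captures_example = get_closure(_scale_example)
assert _captures_example["factor"] == 3             # closed-over cell content
assert "_make_scale_example" in _captures_example   # pulled in via __globals__
# ------------------------------------------------------------------------------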
+ + +def createResolutionCallbackFromClosure(fn): + """ + Create a resolutionCallback by introspecting the function instead of + looking up the stack for the enclosing scope + """ + closure = get_closure(fn) + + class closure_lookup: + # This is a class since `closure` is a dict and it's easier in + # `env_helper` if everything just works with `getattr` calls + def __getattr__(self, key): + if key in closure: + return closure[key] + elif hasattr(typing, key): + return getattr(typing, key) + elif hasattr(builtins, key): + return getattr(builtins, key) + return None + + return createResolutionCallbackFromEnv(closure_lookup()) + + +def can_compile_class(cls) -> bool: + # If any of the functions on a type don't have a code object, this type can't + # be compiled and is probably a builtin / bound from C + if is_ignored_fn(cls): + return False + + # Ignore the following list of built-in classes. + ignored_builtin_classes = (torch.nn.Module, tuple, list, Exception) + if issubclass(cls, ignored_builtin_classes): + return False + + names = cls.__dict__ + fns = [ + getattr(cls, name) + for name in names + if inspect.isroutine(getattr(cls, name, None)) + ] + has_code = [hasattr(fn, "__code__") for fn in fns] + return all(has_code) + + +def get_callable_argument_names(fn) -> List[str]: + """ + Gets names of all POSITIONAL_OR_KEYWORD arguments for callable `fn`. + Returns an empty list when other types of arguments are present. + + This is used by `torch.jit.trace` to assign meaningful argument names to + traced functions and modules. + + Args: + fn: A callable. + Returns: + Argument names: List[str] + """ + # inspect.signature may fail, give up in that case. + try: + callable_signature = inspect.signature(fn) + except Exception: + return [] + + argument_names = [] + for name, param in callable_signature.parameters.items(): + # All four other types of arguments do not map to individual values + # with a keyword as name. + if not param.kind == param.POSITIONAL_OR_KEYWORD: + continue + + argument_names.append(name) + + return argument_names + + +def get_annotation_str(annotation): + """ + Convert an AST node containing a type annotation to the string present in the source + that represents the same annotation. + """ + if isinstance(annotation, ast.Name): + return annotation.id + elif isinstance(annotation, ast.Attribute): + return ".".join([get_annotation_str(annotation.value), annotation.attr]) + elif isinstance(annotation, ast.Subscript): + # In Python3.9+ subscript indicies are not wrapped in ast.Index + subscript_slice = annotation.slice if IS_PY39_PLUS else annotation.slice.value # type: ignore[attr-defined] + return f"{get_annotation_str(annotation.value)}[{get_annotation_str(subscript_slice)}]" + elif isinstance(annotation, ast.Tuple): + return ",".join([get_annotation_str(elt) for elt in annotation.elts]) + elif isinstance(annotation, (ast.Constant, ast.NameConstant)): + return f"{annotation.value}" + + # If an AST node is not handled here, it's probably handled in ScriptTypeParser. + return None + + +def get_type_hint_captures(fn): + """ + Get a dictionary containing type resolution mappings necessary to resolve types + for the literal annotations on 'fn'. These are not considered to be closed-over by fn + and must be obtained separately (e.g. using this function). + + Args: + fn: A callable. + Returns: + A Dict[str, Any] containing a mapping from the literal annotations used on + fn to the Python objects they refer to. + """ + # First, try to get the source of the function. 
We'll need to parse it to find the actual string names + # that were used to annotate the types, since inspect.signature() will only return the class object that + # the annotation refers to, not the string name. If we can't get the source, simply return an empty dict. + # This may happen in cases where the function is synthesized dynamically at runtime. + src = loader.get_source(fn) + if src is None: + src = inspect.getsource(fn) + + # Gather a dictionary of parameter name -> type, skipping any parameters whose annotated + # types are strings. These are only understood by TorchScript in the context of a type annotation + # that refers to a class in its own definition, but trying to include a mapping for this in the result + # function would cause infinite recursion because the class is currently being compiled. + # In addition, there is logic in ScriptTypeParser to handle this. + signature = inspect.signature(fn) + name_to_type = { + name: parameter.annotation + for name, parameter in signature.parameters.items() + if parameter.annotation is not inspect.Parameter.empty + and not isinstance(parameter.annotation, str) + } + + # Then, get the literal type annotations from the function declaration + # by source inspection. This accounts for the case in which aliases are used + # to annotate the arguments (e.g device_t = torch.device, and then d: device_t). + # frontend.py cannot be used here because it includes _jit_internal, so use ast instead. + a = ast.parse(dedent(src)) + if len(a.body) != 1 or not isinstance(a.body[0], ast.FunctionDef): + raise RuntimeError(f"Expected {fn} to be a function") + f = a.body[0] + + # Prepare a dictionary of source annotation -> type, which will be the final result of this function, + # by using the parsed AST (f) to reconstruct source annotations as strings for each parameter and mapping + # them to the type object corresponding to the annotation via name_to_type using the parameter name. + annotation_to_type = {} + + for arg in f.args.args: + # Get the source type annotation string for this argument if possible. + arg_annotation_str = ( + get_annotation_str(arg.annotation) if arg.annotation else None + ) + + # If the argument has no annotation or get_annotation_str cannot convert it to a string, + # arg_annotation_str will be None. Skip this arg; ScriptTypeParser will probably handle + # this in the latter case. + if arg_annotation_str is None: + continue + + # Insert {arg_annotation_str: type} into annotation_to_type if possible. One reason arg_name may not + # be present in name_to_type is that the annotation itself is a string and not a type object + # (common for self-refential annotations in classes). Once again, let ScriptTypeParser handle this. + arg_name = arg.arg + if arg_name in name_to_type: + annotation_to_type[arg_annotation_str] = name_to_type[arg_name] + + # If there is a valid return annotation, include it in annotation_to_type. As with argument annotations, + # the literal annotation has to be convertible to a string by get_annotation_str, and the actual type + # of the annotation cannot be a string. 
+ literal_return_annotation = get_annotation_str(f.returns) + valid_literal_annotation = literal_return_annotation is not None + return_annotation = signature.return_annotation + valid_return_annotation_type = ( + return_annotation is not inspect.Parameter.empty + and not isinstance(return_annotation, str) + ) + if valid_literal_annotation and valid_return_annotation_type: + annotation_to_type[literal_return_annotation] = return_annotation + + return annotation_to_type + + +def createResolutionCallbackForClassMethods(cls): + """ + This looks at all the methods defined in a class and pulls their closed-over + variables into a dictionary and uses that to resolve variables. + """ + # cls is a type here, so `ismethod` is false since the methods on the type + # aren't bound to anything, so Python treats them as regular functions + fns = [ + getattr(cls, name) + for name in cls.__dict__ + if inspect.isroutine(getattr(cls, name)) + ] + # Skip built-ins, as they do not have global scope nor type hints + # Needed to support `enum.Enum` derived classes in Python-3.11 + # That adds `_new_member_` property which is an alias to `__new__` + fns = [fn for fn in fns if not inspect.isbuiltin(fn) and hasattr(fn, "__globals__")] + captures = {} + + for fn in fns: + captures.update(get_closure(fn)) + captures.update(get_type_hint_captures(fn)) + + def lookup_in_class(key): + if key in captures: + return captures[key] + else: + return getattr(builtins, key, None) + + return lookup_in_class + + +def boolean_dispatch( + arg_name, arg_index, default, if_true, if_false, module_name, func_name +): + """ + Dispatches to either of 2 script functions based on a boolean argument. + In TorchScript, the boolean argument must be constant so that the correct + function to use can be determined at compile time. + """ + + def fn(*args, **kwargs): + dispatch_flag = default + if arg_name in kwargs: + dispatch_flag = kwargs[arg_name] + elif arg_index < len(args): + dispatch_flag = args[arg_index] + + if dispatch_flag: + return if_true(*args, **kwargs) + else: + return if_false(*args, **kwargs) + + if if_true.__doc__ is None and if_false.__doc__ is not None: + doc = if_false.__doc__ + if_true.__doc__ = doc + elif if_false.__doc__ is None and if_true.__doc__ is not None: + doc = if_true.__doc__ + if_false.__doc__ = doc + elif if_false.__doc__ is None and if_true.__doc__ is None: + # neither function has a docstring + doc = None + else: + raise RuntimeError("only one function can have a docstring") + fn.__doc__ = doc + + if module_name is not None: + fn.__module__ = module_name + if func_name is not None: + fn.__name__ = func_name + + boolean_dispatched[fn] = { + "if_true": if_true, + "if_false": if_false, + "index": arg_index, + "default": default, + "arg_name": arg_name, + } + return fn + + +class FunctionModifiers: + """ + Used to denote the behavior of a function in TorchScript. See export() and + ignore() for details. 
+ """ + + UNUSED = "unused (ignored and replaced with raising of an exception)" + IGNORE = "ignore (leave as a call to Python, cannot be torch.jit.save'd)" + EXPORT = "export (compile this function even if nothing calls it)" + DEFAULT = "default (compile if called from a exported function / forward)" + COPY_TO_SCRIPT_WRAPPER = ( + "if this method is not scripted, copy the python method onto the scripted model" + ) + _DROP = "_drop (function is fully ignored, declaration can be unscriptable)" + + +def export(fn): + """ + This decorator indicates that a method on an ``nn.Module`` is used as an entry point into a + :class:`ScriptModule` and should be compiled. + + ``forward`` implicitly is assumed to be an entry point, so it does not need this decorator. + Functions and methods called from ``forward`` are compiled as they are seen + by the compiler, so they do not need this decorator either. + + Example (using ``@torch.jit.export`` on a method): + + .. testcode:: + + import torch + import torch.nn as nn + + class MyModule(nn.Module): + def implicitly_compiled_method(self, x): + return x + 99 + + # `forward` is implicitly decorated with `@torch.jit.export`, + # so adding it here would have no effect + def forward(self, x): + return x + 10 + + @torch.jit.export + def another_forward(self, x): + # When the compiler sees this call, it will compile + # `implicitly_compiled_method` + return self.implicitly_compiled_method(x) + + def unused_method(self, x): + return x - 20 + + # `m` will contain compiled methods: + # `forward` + # `another_forward` + # `implicitly_compiled_method` + # `unused_method` will not be compiled since it was not called from + # any compiled methods and wasn't decorated with `@torch.jit.export` + m = torch.jit.script(MyModule()) + """ + fn._torchscript_modifier = FunctionModifiers.EXPORT + return fn + + +def unused(fn): + """ + This decorator indicates to the compiler that a function or method should + be ignored and replaced with the raising of an exception. This allows you + to leave code in your model that is not yet TorchScript compatible and still + export your model. + + Example (using ``@torch.jit.unused`` on a method):: + + import torch + import torch.nn as nn + + class MyModule(nn.Module): + def __init__(self, use_memory_efficient): + super().__init__() + self.use_memory_efficient = use_memory_efficient + + @torch.jit.unused + def memory_efficient(self, x): + import pdb + pdb.set_trace() + return x + 10 + + def forward(self, x): + # Use not-yet-scriptable memory efficient mode + if self.use_memory_efficient: + return self.memory_efficient(x) + else: + return x + 10 + + m = torch.jit.script(MyModule(use_memory_efficient=False)) + m.save("m.pt") + + m = torch.jit.script(MyModule(use_memory_efficient=True)) + # exception raised + m(torch.rand(100)) + """ + if isinstance(fn, property): + prop = fn + setattr( # noqa: B010 + prop.fget, "_torchscript_modifier", FunctionModifiers.UNUSED + ) + + if prop.fset: + setattr( # noqa: B010 + prop.fset, "_torchscript_modifier", FunctionModifiers.UNUSED + ) + + return prop + + fn._torchscript_modifier = FunctionModifiers.UNUSED + return fn + + +# No op context manager from python side +class _IgnoreContextManager(contextlib.AbstractContextManager): + def __init__(self, **kwargs): + pass + + def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None: + pass + + +def ignore(drop=False, **kwargs): + """ + This decorator indicates to the compiler that a function or method should + be ignored and left as a Python function. 
This allows you to leave code in + your model that is not yet TorchScript compatible. If called from TorchScript, + ignored functions will dispatch the call to the Python interpreter. Models with ignored + functions cannot be exported; use :func:`@torch.jit.unused ` instead. + + Example (using ``@torch.jit.ignore`` on a method):: + + import torch + import torch.nn as nn + + class MyModule(nn.Module): + @torch.jit.ignore + def debugger(self, x): + import pdb + pdb.set_trace() + + def forward(self, x): + x += 10 + # The compiler would normally try to compile `debugger`, + # but since it is `@ignore`d, it will be left as a call + # to Python + self.debugger(x) + return x + + m = torch.jit.script(MyModule()) + + # Error! The call `debugger` cannot be saved since it calls into Python + m.save("m.pt") + + Example (using ``@torch.jit.ignore(drop=True)`` on a method): + + .. testcode:: + + import torch + import torch.nn as nn + + class MyModule(nn.Module): + @torch.jit.ignore(drop=True) + def training_method(self, x): + import pdb + pdb.set_trace() + + def forward(self, x): + if self.training: + self.training_method(x) + return x + + m = torch.jit.script(MyModule()) + + # This is OK since `training_method` is not saved, the call is replaced + # with a `raise`. + m.save("m.pt") + + .. testcleanup:: + + import os + os.remove('m.pt') + """ + + if callable(drop): + # used without any args, so drop is actually a function + # @torch.jit.ignore + # def fn(...): + fn = drop + fn._torchscript_modifier = FunctionModifiers.IGNORE + return fn + + if not isinstance(drop, bool): + raise RuntimeError( + "Argument to @torch.jit.ignore must be a bool or " + f"a function but got {drop}" + ) + + # for backwards compat + drop_on_export = kwargs.pop("drop_on_export", None) + if drop_on_export: + warnings.warn( + "ignore(drop_on_export=True) has been deprecated. TorchScript will now drop the function " + "call on compilation. Use torch.jit.unused now. {}", + category=FutureWarning, + ) + + drop = drop_on_export + elif drop: + warnings.warn( + "ignore(True) has been deprecated. TorchScript will now drop the function " + "call on compilation. Use torch.jit.unused now. {}", + category=FutureWarning, + ) + + def decorator(fn): + if drop: + fn._torchscript_modifier = FunctionModifiers.UNUSED + else: + fn._torchscript_modifier = FunctionModifiers.IGNORE + return fn + + return decorator + + +def _drop(fn): + fn._torchscript_modifier = FunctionModifiers._DROP + return fn + + +def _copy_to_script_wrapper(fn): + fn._torchscript_modifier = FunctionModifiers.COPY_TO_SCRIPT_WRAPPER + return fn + + +def module_has_exports(mod): + for name in dir(mod): + if hasattr(mod, name): + item = getattr(mod, name) + if callable(item): + if get_torchscript_modifier(item) is FunctionModifiers.EXPORT: + return True + return False + + +# WARNING: should_drop is currently being used by our JIT code coverage plug-in to mark JIT'd code as covered. If you +# rename this function, please update references in tools/coverage_plugins_package/src/coverage_plugins/jit_plugin.py to +# allow JIT'd code to still be covered. 
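# Illustrative note (not from upstream): export/unused/ignore/_drop above only
# stamp `_torchscript_modifier` onto the function object; the helpers that
# follow (should_drop, is_ignored_fn, get_torchscript_modifier) read that
# stamp back. For a hypothetical function:
#
#     @unused
#     def _not_yet_scriptable(x):
#         return x
#
#     get_torchscript_modifier(_not_yet_scriptable)  # FunctionModifiers.UNUSED
#     should_drop(_not_yet_scriptable)               # True
#     is_ignored_fn(_not_yet_scriptable)             # True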
+def should_drop(fn) -> bool: + attr = get_torchscript_modifier(fn) + if attr is None: + return False + return attr is FunctionModifiers.UNUSED or attr is FunctionModifiers._DROP + + +def is_ignored_fn(fn) -> bool: + mod = get_torchscript_modifier(fn) + return ( + mod is FunctionModifiers.UNUSED + or mod is FunctionModifiers.IGNORE + or mod is FunctionModifiers._DROP + ) + + +def _is_drop_fn(fn) -> bool: + mod = get_torchscript_modifier(fn) + return mod is FunctionModifiers._DROP + + +def is_static_fn(cls, fn) -> bool: + return isinstance(inspect.getattr_static(cls, fn, default=None), staticmethod) + + +def get_static_fn(cls, fn): + return inspect.getattr_static(cls, fn).__func__ + + +def get_torchscript_modifier(fn): + if not callable(fn): + return None + if hasattr(fn, "__func__"): + fn = fn.__func__ + return getattr(fn, "_torchscript_modifier", FunctionModifiers.DEFAULT) + + +def copy_torchscript_modifier(orig, new) -> None: + attr = get_torchscript_modifier(orig) + if attr is None: + return + new._torchscript_modifier = attr + + +# overloading registration +# overloads get registered in this file, and compiled in torch/jit/__init__.py +# so that they can be imported in nn/functional.py without an import cycle + +# qualified_name => list[overload_functions] +_overloaded_fns: Dict[str, List[Callable]] = {} # noqa: T484 + + +_OVERLOAD_EXAMPLE = """ +Example usage of overload function: +@torch.jit._overload +def my_function(x: type0) -> type0: # decl 1 + pass + +@torch.jit._overload +def my_function(x: type1) -> type1: # decl 2 + pass + +def my_function(x): # implementation + if isinstance(x, type0): + return x + elif isinstance(x, type1): + return x +""" + + +def get_overload_no_implementation_error_message(kind, obj): + sourcelines, file_lineno, filename = get_source_lines_and_file(obj) + return ( + f'Implementation for the {kind} "{_qualified_name(obj)}" is missing. Please make ' + f"sure a definition is provided and defined after all overload declarations.\n" + f'File "{filename}", line {file_lineno}:\n' + + "".join(sourcelines) + + "\n" + + _OVERLOAD_EXAMPLE + ) + + +def _check_overload_body(func): + try: + parsed_def = parse_def(func) + except OSError as e: + # Parsing the function definition can raise an OSError if source is unavailable. + # Since this is just an initial check, just raise a warning if this is the case. + warnings.warn( + f"Unable to retrieve source for @torch.jit._overload function: {func}." 
+ ) + return + + body = parsed_def.ast.body[0].body + + def is_pass(x): + return isinstance(x, ast.Pass) + + def is_ellipsis(x): + return isinstance(x, ast.Expr) and isinstance(x.value, ast.Ellipsis) + + if len(body) != 1 or not (is_pass(body[0]) or is_ellipsis(body[0])): + msg = ( + "Only `pass` statement or `...` can be the body of overload declaration:\n" + ) + msg += "\n".join(parsed_def.source.split("\n")[:3]) + msg += " <- Expecting `pass` or `...` here!\n" + _OVERLOAD_EXAMPLE + raise RuntimeError(msg) + + +def _overload(func): + _check_overload_body(func) + qual_name = _qualified_name(func) + global _overloaded_fns + fn_overload_list = _overloaded_fns.get(qual_name) + if fn_overload_list is None: + fn_overload_list = [] + _overloaded_fns[qual_name] = fn_overload_list + fn_overload_list.append(func) + return func + + +def _get_fn_overloads(qual_name): + return _overloaded_fns.get(qual_name) + + +def _clear_fn_overloads(qual_name) -> None: + del _overloaded_fns[qual_name] + + +def get_class_name_lineno(method) -> Tuple[str, int]: + current_frame = inspect.currentframe() + + # one for the get_class_name call, one for _overload_method call + for i in range(2): + assert ( + current_frame is not None + ) # assert current frame is not an Optional[FrameType] + current_frame = current_frame.f_back + + assert current_frame is not None # same here + class_name = current_frame.f_code.co_name + line_no = current_frame.f_code.co_firstlineno + return class_name, line_no + + +# At the point the decorator is applied to class methods the method +# has no reference to its owning class. _qualified_name would not include +# the class it is defined in, so any methods with the same name in the same file +# would have the same _qualified_name, even if they were defined in different +# classes. This problem only exists in python 2. 
+# We get around this problem by looking at the stack frame and identifying +# the class name, and throwing an error whenever overloads are used +# when modules of the same name are in the same file + +# qualified_name => class name => list[overload_functions] +_overloaded_methods: Dict[str, Dict[str, List[Callable]]] = {} # noqa: T484 + + +# (qualified_name, class name) => class_fileno +_overloaded_method_class_fileno: Dict[Tuple[str, str], int] = {} + + +def _overload_method(func): + _check_overload_body(func) + qual_name = _qualified_name(func) + global _overloaded_methods + class_name_map = _overloaded_methods.get(qual_name, None) + if class_name_map is None: + class_name_map = {} + _overloaded_methods[qual_name] = class_name_map + + class_name, line_no = get_class_name_lineno(func) + method_overloads = class_name_map.get(class_name, None) + if method_overloads is None: + method_overloads = [] + class_name_map[class_name] = method_overloads + _overloaded_method_class_fileno[(qual_name, class_name)] = line_no + else: + existing_lineno = _overloaded_method_class_fileno[(qual_name, class_name)] + if existing_lineno != line_no: + raise RuntimeError( + "Cannot currently overload the same method name in two different" + " classes with the same name in the same module" + ) + + method_overloads.append(func) + return func + + +def _get_overloaded_methods(method, mod_class): + # TODO: __name__ not set for submodules in recursive script + if not hasattr(method, "__name__"): + return None + qual_name = _qualified_name(method) + class_name_map = _overloaded_methods.get(qual_name, None) + if class_name_map is None: + return None + overloads = class_name_map.get(mod_class.__name__, None) + if overloads is None: + return None + + method_line_no = get_source_lines_and_file(method)[1] + mod_class_fileno = get_source_lines_and_file(mod_class)[1] + mod_end_fileno = mod_class_fileno + len(get_source_lines_and_file(mod_class)[0]) + if not (method_line_no >= mod_class_fileno and method_line_no <= mod_end_fileno): + raise Exception( + "Overloads are not useable when a module is redeclared within the same file: " + + str(method) + ) + return overloads + + +def is_tuple(ann) -> bool: + if ann is Tuple: + raise_error_container_parameter_missing("Tuple") + + # For some reason Python 3.7 violates the Type[A, B].__origin__ == Type rule + if not hasattr(ann, "__module__"): + return False + + ann_origin = get_origin(ann) + if IS_PY39_PLUS and ann.__module__ == "builtins" and ann_origin is tuple: + return True + return ann.__module__ == "typing" and (ann_origin is Tuple or ann_origin is tuple) + + +def is_list(ann) -> bool: + if ann is List: + raise_error_container_parameter_missing("List") + + if not hasattr(ann, "__module__"): + return False + + ann_origin = get_origin(ann) + if IS_PY39_PLUS and ann.__module__ == "builtins" and ann_origin is list: + return True + return ann.__module__ == "typing" and (ann_origin is List or ann_origin is list) + + +def is_dict(ann) -> bool: + if ann is Dict: + raise_error_container_parameter_missing("Dict") + + if not hasattr(ann, "__module__"): + return False + + ann_origin = get_origin(ann) + if IS_PY39_PLUS and ann.__module__ == "builtins" and ann_origin is dict: + return True + return ann.__module__ == "typing" and (ann_origin is Dict or ann_origin is dict) + + +def is_union(ann): + if ann is Union: + raise_error_container_parameter_missing("Union") + + return isinstance(ann, BuiltinUnionType) or ( + hasattr(ann, "__module__") + and ann.__module__ == "typing" + and 
(get_origin(ann) is Union) + ) + + +def is_optional(ann): + if ann is Optional: + raise_error_container_parameter_missing("Optional") + + def is_optional_as_optional(ann): + return ( + hasattr(ann, "__module__") + and ann.__module__ == "typing" + and (get_origin(ann) is Optional) + ) + + def is_union_as_optional(ann): + ann_args = get_args(ann) + return len(ann_args) == 2 and (None in ann_args or type(None) in ann_args) + + return is_optional_as_optional(ann) or (is_union(ann) and is_union_as_optional(ann)) + + +def is_future(ann) -> bool: + if ann is Future: + raise RuntimeError( + "Attempted to use Future without a " + "contained type. Please add a contained type, e.g. " + "Future[int]" + ) + return get_origin(ann) is Future + + +def is_await(ann) -> bool: + if ann is _Await: + return True + return get_origin(ann) is _Await + + +if torch.distributed.rpc.is_available(): + from torch._C._distributed_rpc import PyRRef + from torch.distributed.rpc import RRef + + def is_rref(ann) -> bool: + if ann is RRef: + raise RuntimeError( + "Attempted to use RRef without a " + "contained type. Please add a contained type, e.g. " + "RRef[int]" + ) + return get_origin(ann) is RRef + + def is_rref_instance(obj) -> bool: + return isinstance(obj, PyRRef) + +else: + + def is_rref_instance(obj) -> bool: + # If the RPC module doesn't exist then RRefs don't exist either. + return False + + +def is_final(ann) -> bool: + return ( + hasattr(ann, "__module__") + and ann.__module__ in {"typing", "typing_extensions"} + and (get_origin(ann) is Final or isinstance(ann, type(Final))) + ) + + +# allows BroadcastingList instance to be subscriptable +class BroadcastingListCls: + def __getitem__(self, types): + return + + +# mypy doesn't support parameters on types, so we have to explicitly type each +# list size +BroadcastingList1 = BroadcastingListCls() +for i in range(2, 7): + globals()[f"BroadcastingList{i}"] = BroadcastingList1 + + +def is_scripting() -> bool: + r""" + Function that returns True when in compilation and False otherwise. This + is useful especially with the @unused decorator to leave code in your + model that is not yet TorchScript compatible. + .. testcode:: + + import torch + + @torch.jit.unused + def unsupported_linear_op(x): + return x + + def linear(x): + if torch.jit.is_scripting(): + return torch.linear(x) + else: + return unsupported_linear_op(x) + """ + return False + + +# Retrieves a fully-qualified name (module hierarchy + classname) for a given obj. +def _qualified_name(obj, mangle_name=True) -> str: + # This special case allows us to override the qualified name on a type. + # It's currently used in conjunction with tracing, where we create a + # fake module to filter only supported attributes. However, since this + # new type is defined as a local class, we need a mechanism to override + # its qualname so it appears correctly in the TorchScript system. This, + # we set '_jit_override_qualname' with the original traced module's + # qualified name, which is picked up here + if hasattr(obj, "_jit_override_qualname"): + return obj._jit_override_qualname + # short-circuit in cases where the object already has a known qualified name + if isinstance(obj, torch._C.ScriptFunction): + return obj.qualified_name + + if getattr(obj, "__name__", None): + name = obj.__name__ + # Enum classes do not have `__name__` attr, instead they have `name`. 
+ elif isinstance(obj, enum.Enum): + name = obj.name + else: + raise RuntimeError("Could not get name of python class object") + + if name == "": + name = "_lambda" # make name a valid identifier + + module_name = obj.__module__ + + # If the module is actually a torchbind module, then we should short circuit + if module_name == "torch._classes": + return obj.qualified_name + + # The Python docs are very clear that `__module__` can be None, but I can't + # figure out when it actually would be. + if module_name is None: + raise RuntimeError( + f"Could not get qualified name for class '{name}': " + "__module__ can't be None." + ) + + # if getattr(sys.modules[module_name], name) is not obj: + # raise RuntimeError(f"Could not get qualified name for class '{name}': " + # f"the attr {name} on module {module_name} is not the class") + + # torch.package and TorchScript have separate mangling schemes to avoid + # name collisions from multiple packages. To avoid them interfering with + # each other, normalize the package manging here. + if package_mangling.is_mangled(module_name): + module_name = module_name.replace("<", "_") + module_name = module_name.replace(">", "_") + + # The PythonExceptionValue C++ class in torch/csrc/jit/python/python_sugared_value.h + # does not need mangle the python class name. + if mangle_name: + # __main__ is a builtin module, so rewrite it to "__torch__". + if module_name == "__main__": + module_name = "__torch__" + else: + # Everything else gets a "__torch__" prefix to avoid name collisions + # with the names of user values. + module_name = "__torch__." + module_name + + if "." in name: + raise RuntimeError( + f"Could not get qualified name for class '{name}': " + f"'{name}' is not a valid identifier" + ) + + return module_name + "." + name + + +def _try_get_dispatched_fn(fn): + if not callable(fn): + return None + return boolean_dispatched.get(fn) + + +def _get_named_tuple_properties( + obj, loc: Optional[torch._C._jit_tree_views.SourceRange] = None, rcb=None +): + if loc is None: + loc = fake_range() + + assert issubclass(obj, tuple) and hasattr(obj, "_fields") + if hasattr(obj, "_field_defaults"): + defaults = [ + obj._field_defaults[field] + for field in obj._fields + if field in obj._field_defaults + ] + else: + defaults = [] + # In 3.10 recommended way to get annotations is to call `inspect.get_annotations` function + # Also, annotations from base class are not inherited so they need to be queried explicitly + if sys.version_info[:2] < (3, 10): + obj_annotations = getattr(obj, "__annotations__", {}) + else: + obj_annotations = inspect.get_annotations(obj) + if len(obj_annotations) == 0 and hasattr(obj, "__base__"): + obj_annotations = inspect.get_annotations(obj.__base__) + + annotations = [] + for field in obj._fields: + if field in obj_annotations: + field_type = obj_annotations[field] + # [Note: ForwardRef annotations in NamedTuple attributes] + # NamedTuple types are slightly different from normal types. + # + # Normally, annotations are evaluted like this (during jit.script): + # 1. Load strings of python code into c++ and parse. + # 2. Get annotations as strings + # 3. Use the PythonResolver's resolution callback (rcb) to convert + # the string into a python object + # 4. We call into annotations.py:ann_to_type to convert python obj + # from step 3 into a type that torchscript understands. + # + # NamedTuples are more complicated, because it has sub-types. 
+ # Normally, once we have the NamedTuple type object from #3, + # we can just look at the annotation literal values and use + # ann_to_type directly on them. + # + # But sometimes, users will annotate with string literals, e.g. + # x: 'int' + # This also happens with PEP563 (from __forward__ import annotations) + # + # These annotations appear in the annotation dict as ForwardRef('int'). + # + # Then, we need to convert the string into a python object. This + # requires having local context for custom objects or imported types. + # rcb() is what gives us this. So, we plumb rcb through the stack so + # it can be used in this context for the if block below. + # + # FAQ: + # - Why do we need this special handling for NamedTuple but string + # annotations work fine for normal types? Normally, we parse the + # string directly and then call rcb() directly from C++. + # - Why not use ForwardRef._evaluate? For that, we need globals() + # and locals() for the local context where the NamedTuple was defined. + # rcb is what lets us look up into these. So, basically rcb does the + # hard work for us. + if isinstance(field_type, ForwardRef) and rcb is not None: + rcb_type = rcb(field_type.__forward_arg__) + # rcb returns None if it can't find anything. + if rcb_type is None: + raise ValueError( + f"Unknown type annotation: '{field_type}' in NamedTuple {obj.__name__}." + f" Likely due to partial support for ForwardRef parameters in NamedTuples, see #95858." + f" Issue occurred at {loc.highlight()}" + ) + field_type = rcb_type + the_type = torch.jit.annotations.ann_to_type(field_type, loc, rcb) + annotations.append(the_type) + else: + annotations.append(torch._C.TensorType.getInferred()) + return type(obj).__name__, obj._fields, annotations, defaults + + +def _create_named_tuple( + t, unqual_name: str, field_names: List[str], defaults: Tuple[Any, ...] +): + TupleType = collections.namedtuple(unqual_name, field_names, defaults=defaults) # type: ignore[call-arg, no-redef, misc] + return TupleType(*t) + + +@contextlib.contextmanager +def _disable_emit_hooks(): + hooks = torch._C._jit_get_emit_hooks() + torch._C._jit_set_emit_hooks(None, None) + try: + yield + finally: + torch._C._jit_set_emit_hooks(hooks[0], hooks[1]) + + +def _disable_emit_hooks_decorator(_DecoratorContextManager) -> None: # noqa: F811 + def __enter__(self) -> None: + self.hooks = torch._C._jit_get_emit_hooks() + torch._C._jit_set_emit_hooks(None, None) + + def __exit__(self, *args) -> None: + torch._C._jit_set_emit_hooks(self.hooks[0], self.hooks[1]) + + +def _is_exception(obj) -> bool: + if not inspect.isclass(obj): + return False + return issubclass(obj, Exception) + + +def raise_error_container_parameter_missing(target_type) -> None: + if target_type == "Dict": + raise RuntimeError( + "Attempted to use Dict without " + "contained types. Please add contained type, e.g. " + "Dict[int, int]" + ) + raise RuntimeError( + f"Attempted to use {target_type} without a " + "contained type. Please add a contained type, e.g. 
" + f"{target_type}[int]" + ) + + +def check_args_exist(target_type) -> None: + if target_type is List or target_type is list: + raise_error_container_parameter_missing("List") + elif target_type is Tuple or target_type is tuple: + raise_error_container_parameter_missing("Tuple") + elif target_type is Dict or target_type is dict: + raise_error_container_parameter_missing("Dict") + elif target_type is None or target_type is Optional: + raise_error_container_parameter_missing("Optional") + + +def check_empty_containers(obj) -> None: + if obj == [] or obj == {} or obj == (): + warnings.warn( + "The inner type of a container is lost when " + "calling torch.jit.isinstance in eager mode. For " + "example, List[int] would become list and " + "therefore falsely return True for List[float] or" + " List[str]." + ) + + +# supports List/Dict/Tuple and Optional types +# TODO support future +def container_checker(obj, target_type) -> bool: + origin_type = get_origin(target_type) + check_args_exist(target_type) + if origin_type is None: + return False + elif origin_type is list or origin_type is List: + check_empty_containers(obj) + if not isinstance(obj, list): + return False + arg_type = get_args(target_type)[0] + arg_origin = get_origin(arg_type) + for el in obj: + # check if nested container, ex: List[List[str]] + if arg_origin: # processes nested container, ex: List[List[str]] + if not container_checker(el, arg_type): + return False + elif not isinstance(el, arg_type): + return False + return True + elif origin_type is Dict or origin_type is dict: + check_empty_containers(obj) + if not isinstance(obj, dict): + return False + key_type = get_args(target_type)[0] + val_type = get_args(target_type)[1] + for key, val in obj.items(): + # check if keys are of right type + if not isinstance(key, key_type): + return False + val_origin = get_origin(val_type) + if val_origin: + if not container_checker(val, val_type): + return False + elif not isinstance(val, val_type): + return False + return True + elif origin_type is Tuple or origin_type is tuple: + check_empty_containers(obj) + if not isinstance(obj, tuple): + return False + arg_types = get_args(target_type) + if len(obj) != len(arg_types): + return False + for el, el_type in zip(obj, arg_types): + el_origin = get_origin(el_type) + if el_origin: + if not container_checker(el, el_type): + return False + elif not isinstance(el, el_type): + return False + return True + elif origin_type is Union or issubclass( + origin_type, BuiltinUnionType + ): # also handles Optional + if obj is None: # check before recursion because None is always fine + return True + inner_types = get_args(target_type) + for t in inner_types: + t_origin = get_origin(t) + if t_origin: + return container_checker(obj, t) + elif isinstance(obj, t): + return True + return False + + +def _isinstance(obj, target_type) -> bool: + if isinstance(target_type, collections.abc.Container): + if not isinstance(target_type, tuple): + raise RuntimeError( + "The second argument to " + "`torch.jit.isinstance` must be a type " + "or a tuple of types" + ) + for t_type in target_type: + if _isinstance(obj, t_type): + return True + return False + + origin_type = get_origin(target_type) + if origin_type: + return container_checker(obj, target_type) + + # Check to handle non-typed optional origin returns as none instead + # of as optional in 3.7-3.8 + check_args_exist(target_type) + + # handle non-containers + return isinstance(obj, target_type) + + +class _TensorExtractor(pickle.Pickler): + def __init__(self, 
*args, tensors: List[torch.Tensor], **kwargs): + super().__init__(*args, **kwargs) + self.tensors = tensors + + def persistent_id(self, obj): + if isinstance(obj, torch.Tensor): + self.tensors.append(obj) + return "" + # Since we just want to extract tensors, we don't mind if an object is + # unpicklable if it doesn't contain tensors, as we can just ignore/skip + # it. To play it safe, we only do so for common objects that we're sure + # don't contain tensors. Feel free to add new types here. Note also that + # even if a type isn't listed here this won't block users, since thet + # can just add a __getstate__ or __reduce__ method to their class. + if isinstance(obj, LockType): + return "" + # Futures and RRefs don't technically contain a value, they just offer + # the means to access a value. + if isinstance(obj, CFuture) or is_rref_instance(obj): + return "" + if isinstance(obj, CAwait): + return "" + if isinstance(obj, torch.cuda.Event): + return "" + if isinstance(obj, threading.Thread): + return "" + return None + + +def _extract_tensors(obj): + r""" + This function is exclusively called from C++. + See ``torch/csrc/jit/python/python_ivalue.h``. + + It extracts the tensors contained in the given object, through pickling. + """ + tensors: List[torch.Tensor] = [] + extractor = _TensorExtractor(io.BytesIO(), protocol=-1, tensors=tensors) + extractor.dump(obj) + return tensors + + +# In Python-3.11+ typed enums (i.e. IntEnum for example) retain number of base class methods in subclass +# that were previously dropped. To preserve the behavior, explicitly drop them there + +if sys.version_info > (3, 10): + _drop(enum.Enum.__new__) + _drop(enum.Enum.__format__) + _drop(enum.Enum.__repr__) + _drop(enum.Enum.__str__) diff --git a/MLPY/Lib/site-packages/torch/_lazy/__init__.py b/MLPY/Lib/site-packages/torch/_lazy/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ee827fe4809f28fed7b065d2b96e752342367288 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_lazy/__init__.py @@ -0,0 +1,55 @@ +import threading + +import torch._C._lazy +from torch.utils._pytree import tree_flatten, tree_unflatten + +from .closure import add_step_closure, run_step_closures + + +def mark_step(device: str = "", wait=False): + """Triggers a mark step, which amounts to + - collecting a group of 'live' lazy tensors to index into the compilation cache + (lowering/compiling their IR graphs if not cached) + - kicking off execution of the compiled function + - (optionally, wait=True) waiting for cpu-side execution to complete (does not sync the accelerator) + """ + # TODO(whc) expand this to include backend hooks and align with XLA backend needs + torch._C._lazy._mark_step(device, [], wait=wait) + + run_step_closures() + + +def wait_device_ops(devices=None): + """Waits for all the async operations on the given devices to complete. + Args: + devices (string..., optional): The devices whose async ops need to be waited + for. If empty, all the local devices will be waited for. + """ + if devices is None: + devices = [] + torch._C._lazy._wait_device_ops(devices=devices) + + +def sync_multi(tensors, devices): + """ + Sync the list of lazy tensors so there IR get lowered for the activate backend + and the compiled computation graph get cached. 
+ """ + torch._C._lazy._sync_multi(tensors, devices) + + +def get_tensor_id(tensor): + """Return a unique id of the lazy tensor maintained by LTC""" + return torch._C._lazy._get_tensor_id(tensor) + + +def to_cpu(tensors, devices=None): + devices = devices or ["lazy"] + + flattened, spec = tree_flatten(tensors) + sync_multi(flattened, devices) + return tree_unflatten([t.to("cpu") for t in flattened], spec) + + +def save(tensors, *args, **kwargs): + torch.save(to_cpu(tensors), *args, **kwargs) diff --git a/MLPY/Lib/site-packages/torch/_lazy/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_lazy/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..00ec282163633e78fefe472ce9f34e5f0d6ac677 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_lazy/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_lazy/__pycache__/closure.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_lazy/__pycache__/closure.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d4c480f891805f4a51d555b076142b0261ef8e8f Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_lazy/__pycache__/closure.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_lazy/__pycache__/computation.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_lazy/__pycache__/computation.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4423b7f56de2eed1b9e352a0c4729715af5d596d Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_lazy/__pycache__/computation.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_lazy/__pycache__/config.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_lazy/__pycache__/config.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..47d5f6ce4d6f774ac1fd1b743f9c37386bb565af Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_lazy/__pycache__/config.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_lazy/__pycache__/debug.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_lazy/__pycache__/debug.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..de58b0bf9fee6d04e155abc7691cbb8ddfa450e7 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_lazy/__pycache__/debug.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_lazy/__pycache__/device_context.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_lazy/__pycache__/device_context.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..82bb35b4c30aa95185176038f6bad51adeff3aee Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_lazy/__pycache__/device_context.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_lazy/__pycache__/extract_compiled_graph.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_lazy/__pycache__/extract_compiled_graph.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..99c0033eea7df0940b20f046836e02935a74cde7 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_lazy/__pycache__/extract_compiled_graph.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_lazy/__pycache__/ir_cache.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_lazy/__pycache__/ir_cache.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3f6ccc21144e85d001c1228c05ffe1e2b6bc6354 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_lazy/__pycache__/ir_cache.cpython-39.pyc differ diff --git 
a/MLPY/Lib/site-packages/torch/_lazy/__pycache__/metrics.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_lazy/__pycache__/metrics.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..103f8461985813f5371e3c638ed4ca149094db58 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_lazy/__pycache__/metrics.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_lazy/__pycache__/tensor_factory_functions.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_lazy/__pycache__/tensor_factory_functions.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f3809b1575ad2339ab71ef139a6bd8e24c9ff7f7 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_lazy/__pycache__/tensor_factory_functions.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_lazy/__pycache__/ts_backend.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_lazy/__pycache__/ts_backend.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..81d0c66bdf88f129f1e6bbc443fb98d8fe2c8112 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_lazy/__pycache__/ts_backend.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_lazy/closure.py b/MLPY/Lib/site-packages/torch/_lazy/closure.py new file mode 100644 index 0000000000000000000000000000000000000000..d3c00f2814692b61eae9f62d5c53085ec7663aa4 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_lazy/closure.py @@ -0,0 +1,134 @@ +import os +import threading +from queue import Empty as EmptyQueue, Queue + +from torch._lazy.device_context import get_device_context + + +class ClosureHandler: + def __init__(self): + pass + + def run(self, closure): + """Run closure function + + Args: + closure: callable function to run + """ + closure() + + def __call__(self, closures): + for closure in closures: + self.run(closure) + + +class AsyncClosureHandler(ClosureHandler): + """Handler for Asynchronous Step Closures + Args: + max_queue_size: The maximum length of the closure queue after which + the training loop will block until closures are evaluated. + By default, a reasonable limit of a maximum of 100 on the queue. + This value can be set using the `XLA_MAX_ASYNC_QUEUE` environment + variable. 
+ """ + + def __init__(self, max_queue_size=100): + super().__init__() + self._closure_queue: Queue = Queue( + int(os.environ.get("LTC_MAX_ASYNC_QUEUE", max_queue_size)) + ) + self._closure_exception: Queue = Queue() + self._closure_lock = threading.Lock() + self._closure_event_loop_finished = threading.Event() + self._closure_event_loop = None + + def start_event_loop(self): + """Start closure event loop if not started""" + if self._closure_event_loop is None: + + def event_loop(): + # Run loop until closure event is set and closure queue is empty + while True: + try: + closure = self._closure_queue.get(block=True, timeout=3) + closure() + self._closure_queue.task_done() + except EmptyQueue: + with self._closure_lock: + if self._closure_queue.empty(): + self._closure_event_loop_finished.set() + return + except Exception as e: + self._closure_exception.put(e) + return + + self._closure_event_loop = threading.Thread(target=event_loop) + self._closure_event_loop.start() + + def run(self, closure): + with self._closure_lock: + self._closure_queue.put(closure, block=True) + if ( + self._closure_event_loop is None + or not self._closure_event_loop.is_alive() + ): + try: + e = self._closure_exception.get(block=False) + raise RuntimeError( + "Cannot run asynchronous closure due to previously raised exception" + ) from e + except EmptyQueue: + self._closure_event_loop = None + self.start_event_loop() + + +def add_step_closure(closure, args=(), run_async=False): + """Adds a closure to the list of the ones to be run at the end of the step. + Many times during model training there is the need to print/report (print to + console, post to tensorboard, etc...) information which require the content of + intermediary tensors to be inspected. + Inspecting different tensors content in different points of the model code + requires many executions and typically causes performance issues. + Adding a step closure will ensure that it will be run after the barrier, when + all the live tensors will be already materialized to device data. + Live tensors which will include the ones captured by the closure arguments. + So using `add_step_closure()` will ensure a single execution will be + performed, even when multiple closures are queued, requiring multiple tensors + to be inspected. + Step closures will be run sequentially in the order they have been queued. + Note that even though using this API the execution will be optimized, it is + advised to throttle the printing/reporting events once every N steps. + Args: + closure (callable): The function to be called. + args (tuple): The arguments to be passed to the closure. + run_async: If True, run the closure asynchronously. 
+ """ + devctx = get_device_context() + closures_type = "async_step_closures" if run_async else "step_closures" + step_closures = getattr(devctx, closures_type, None) + if step_closures is None: + step_closures = [] + setattr(devctx, closures_type, step_closures) + step_closures.append(lambda a=args: closure(*a)) + + +def run_step_closures(): + devctx = get_device_context() + async_step_closures = getattr(devctx, "async_step_closures", None) + if async_step_closures is not None: + devctx.async_step_closures = [] + async_closure_handler = getattr(devctx, "async_closure_handler", None) + if async_closure_handler is None: + async_closure_handler = AsyncClosureHandler() + devctx.async_closure_handler = async_closure_handler + async_closure_handler(async_step_closures) + + step_closures = getattr(devctx, "step_closures", None) + if step_closures is not None: + devctx.step_closures = [] + closure_handler = getattr(devctx, "closure_handler", None) + if closure_handler is None: + closure_handler = ClosureHandler() + devctx.closure_handler = closure_handler + closure_handler(step_closures) + return devctx diff --git a/MLPY/Lib/site-packages/torch/_lazy/computation.py b/MLPY/Lib/site-packages/torch/_lazy/computation.py new file mode 100644 index 0000000000000000000000000000000000000000..747e009ab85d5e8ac1048b9a4cd0a7e7a34111f7 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_lazy/computation.py @@ -0,0 +1,26 @@ +import torch._C._lazy +import torch._C._lazy_ts_backend + + +def get_tensors_ts_device_data_node(tensors): + """Return tensor ids and eager tensors for DeviceData nodes in the + IR for the passed in lazy tensors. + + TODO: This API is currently ts backend specific. We are working on + generalizing it to all backends including XLA. + """ + return torch._C._lazy_ts_backend._get_tensors_ts_device_data_node(tensors) + + +def get_graph_hash(tensors): + """Return the graph hash for the passed in lazy tensors""" + return torch._C._lazy._get_graph_hash(tensors) + + +def run_cached_graph(hash_str, graph_inputs): + """Running the cached computation graph with the given inputs + + TODO: This API is currently ts backend specific. We are working on + generalizing it to all backends including XLA. + """ + return torch._C._lazy_ts_backend._run_cached_graph(hash_str, graph_inputs) diff --git a/MLPY/Lib/site-packages/torch/_lazy/config.py b/MLPY/Lib/site-packages/torch/_lazy/config.py new file mode 100644 index 0000000000000000000000000000000000000000..c204f1cd4aea9ab63039f69557b4aeec58ee0a8d --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_lazy/config.py @@ -0,0 +1,16 @@ +import torch._C._lazy + + +def get_force_fallback(): + """Get the config used to force LTC fallback""" + return torch._C._lazy._get_force_fallback() + + +def set_force_fallback(configval): + """Set the config used to force LTC fallback""" + torch._C._lazy._set_force_fallback(configval) + + +def set_reuse_ir(val: bool): + """Set the config to reuse IR nodes for faster tracing""" + torch._C._lazy._set_reuse_ir(val) diff --git a/MLPY/Lib/site-packages/torch/_lazy/debug.py b/MLPY/Lib/site-packages/torch/_lazy/debug.py new file mode 100644 index 0000000000000000000000000000000000000000..201674767b8c692352a87b5cd66748270c9d5210 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_lazy/debug.py @@ -0,0 +1,21 @@ +import torch._C._lazy + + +def render_ir_graph(tensors): + """Return a text dump of the LTC IR graph in dot format for the tensors. 
+ The text can be processed by tools like dot to be rendered in pdf,png etc.""" + return torch._C._lazy._get_tensors_dot(tensors) + + +def dump_ir(tensors, ir_format): + """Return a dump of the tensors in the specified format. + Valid format are + - text: for LTC IR + - backend: for the activate backend IR + """ + if ir_format == "text": + return torch._C._lazy._get_tensors_text(tensors) + elif ir_format == "backend": + return torch._C._lazy._get_tensors_backend(tensors) + else: + raise RuntimeError(f"Unrecognized IR format: {ir_format}") diff --git a/MLPY/Lib/site-packages/torch/_lazy/device_context.py b/MLPY/Lib/site-packages/torch/_lazy/device_context.py new file mode 100644 index 0000000000000000000000000000000000000000..1332f4e9d7dddefa00369702131977d77a3933db --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_lazy/device_context.py @@ -0,0 +1,25 @@ +import threading +from typing import Any, Dict + +import torch._C._lazy + + +class DeviceContext: + _CONTEXTS: Dict[str, Any] = dict() + _CONTEXTS_LOCK = threading.Lock() + + def __init__(self, device): + self.device = device + + +def get_device_context(device=None): + if device is None: + device = torch._C._lazy._get_default_device_type() + else: + device = str(device) + with DeviceContext._CONTEXTS_LOCK: + devctx = DeviceContext._CONTEXTS.get(device, None) + if devctx is None: + devctx = DeviceContext(device) + DeviceContext._CONTEXTS[device] = devctx + return devctx diff --git a/MLPY/Lib/site-packages/torch/_lazy/extract_compiled_graph.py b/MLPY/Lib/site-packages/torch/_lazy/extract_compiled_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..ecac11f9cb13aca2caf43e4bf25b75ce7a81d25e --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_lazy/extract_compiled_graph.py @@ -0,0 +1,223 @@ +import copy +import dataclasses +import itertools +import os +from typing import Any, Callable, Dict, List + +import torch +import torch._lazy as lazy +import torch._lazy.metrics as metrics +from torch import fx +from torch._lazy import computation, debug as lazy_debug +from torch._lazy.tensor_factory_functions import tensor_factory_functions + +debug = os.environ.get("debug_extract_compiled_graph") is not None + + +@dataclasses.dataclass +class GraphInputMatcher: + """ + The GraphInputMatcher class setup the graph inputs for future calls after lazy tracing. + Specifically, those graph inputs corresponding to method parameters should be replaced with the + arguments for the current call. + + tensor_id_to_arg_idx maps the tensor id to the parameter index. + graph_input_tensor_ids, graph_input_ivalues list the tensor_id and ivalue for each of the + TS/XLA graph inputs. + """ + + tensor_id_to_arg_idx: Dict[int, int] + graph_input_tensor_ids: List[int] + # there are 2 categories of graph_input_tensors. + # Category 1: those whose id are not found in tensor_id_to_arg_idx. These are + # most likely const tensors and we can get its content from graph_input_tensors + # Category 2: those whose id are found in tensor_id_to_arg_idx. 
We should get + # the tensor from method arguments + graph_input_ivalues: List[Any] + + # get the real graph input tensors + def __call__(self, args): + real_input = [] + for tensor_id, traced_ivalue in zip( + self.graph_input_tensor_ids, self.graph_input_ivalues + ): + arg_idx = self.tensor_id_to_arg_idx.get(tensor_id, None) + if arg_idx is None: + inp = traced_ivalue + else: + inp = args[arg_idx] + real_input.append(inp) + return real_input + + +class ReturnValueHandler: + r""" + When ltc_sync_multi is called on multi tensors, the compiled graph + will contain output only for unique tensors - if a tensor appears multiple + times in the input to _ltc_sync_multi, only the first occurance matters. + + However from python level, we still expect multi tensors returned with duplciation + even if the TS graph dedup the output. e.g. for method: + + def forward(self, a): + return a, a + + the TS graph captured by LTC will return a single tensor, but Python method expects 2. + + This class dedup the lazy tensors first to get the index that will be used + to duplicate the eager tensors later. + """ + + def __init__(self, lazy_out_list): + self.index: List[List[int]] = [] + self.total_count = len(lazy_out_list) + + tensor_id_to_idx: Dict[int, int] = {} + for dup_idx, lazy_tensor in enumerate(lazy_out_list): + uniq_idx = tensor_id_to_idx.get(id(lazy_tensor), None) + if uniq_idx is not None: + self.index[uniq_idx].append(dup_idx) + else: + uniq_idx = len(self.index) + self.index.append([dup_idx]) + tensor_id_to_idx[id(lazy_tensor)] = uniq_idx + + def duplicate_eager_tensors(self, eager_tensor_list): + duplicated_list = [None] * self.total_count + assert len(eager_tensor_list) == len(self.index) + + for uniq_idx, eager_tensor in enumerate(eager_tensor_list): + for dup_idx in self.index[uniq_idx]: + duplicated_list[dup_idx] = eager_tensor + return duplicated_list + + +def force_lazy_device(model: fx.GraphModule): + """ + Factory methods in a Fx graph may create tensors for a specific eager devices. + If we take no actions, those eager tensors will be mixed with lazy tensors and + cause crash. This method overwrite those eager device to lazy device. + """ + + def tolazydevice(dev): + if isinstance(dev, torch.device): + return torch.device("lazy", index=dev.index) + return dev + + def hasDeviceArg(args, kwargs): + return any( + isinstance(arg, torch.device) + for arg in itertools.chain(args, kwargs.values()) + ) + + for nd in model.graph.nodes: + nd.args = tuple(tolazydevice(arg) for arg in nd.args) + nd.kwargs = {k: tolazydevice(v) for k, v in nd.kwargs.items()} + + # For torchbench like yolov3, hf_Bart, dynamo generates Fx graph that return + # eager tensors on the default device + # (check https://gist.github.com/shunting314/eabdf6c769c59bc384469717b8f9bb7f for yolove, + # and https://gist.github.com/shunting314/8d5e2d9348a3258959d3954186c48814 for hf_Bart). + # To force those tensors on the lazy device, we can not simply override + # the device argument since there is no explicit device argument. + # What we are doing here is, for the list of covered tensor factory methods + # we add a lazy device argument explicity. + # + # TODO: This solution is no ideal since we may miss some factory methods. In future + # when we support lazy mode, this method can be replaced by that. + if nd.target in tensor_factory_functions and not hasDeviceArg( + nd.args, nd.kwargs + ): + kwargs = dict(nd.kwargs) # nd.kwargs is immutable. make a mutable copy. 
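# Example (sketch): ReturnValueHandler (defined above) re-duplicating a
# deduplicated output list, as in the `return a, a` case from its docstring.
# A plain object stands in for the lazy tensor and a string for the eager one.
from torch._lazy.extract_compiled_graph import ReturnValueHandler

lazy_a = object()                                   # hypothetical lazy tensor
handler = ReturnValueHandler([lazy_a, lazy_a])      # duplicated lazy outputs
eager_outputs = ["eager_a"]                         # compiled graph returns it once
assert handler.duplicate_eager_tensors(eager_outputs) == ["eager_a", "eager_a"]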
+ kwargs["device"] = torch.device("lazy") + nd.kwargs = kwargs + + model.recompile() + + +def get_fallback_ops(): + fallback_ops = [] + for opname in metrics.counter_names(): + if "aten::" not in opname: + continue + val = int(metrics.counter_value(opname)) + if val > 0: + fallback_ops.append(f"{opname}={val}") + + return fallback_ops + + +def extract_compiled_graph(model: fx.GraphModule, example_inputs) -> Callable: + """ + Optimize an eager model with LTC and returns a wrapper to execute the + compiled graph directly without retracing. It depends on other mechanisms + like TorchDynamo guards to guarantee the returned wrapper is only called + when it's safe. + """ + lazy_args = [arg.to(device="lazy") for arg in example_inputs] + args_tensor_ids = [lazy.get_tensor_id(lazy_arg) for lazy_arg in lazy_args] + tensor_id_to_arg_idx = {tensor_id: i for i, tensor_id in enumerate(args_tensor_ids)} + lazy_model = copy.deepcopy(model).to(device=torch.device("lazy")) + force_lazy_device(lazy_model) + + # This line executes lazy tracing and enable us extracting compiled graph later + metrics.reset() + lazy_out = lazy_model(*lazy_args) + fallback_ops = get_fallback_ops() + metrics.reset() + + if len(fallback_ops) > 0: + raise RuntimeError( + f"Fail to extact the compiled graph because of fallback: {','.join(fallback_ops)}" + ) + + if not isinstance(lazy_out, (tuple, list)): + lazy_out = (lazy_out,) + + args_and_out = tuple(lazy_args) + tuple(lazy_out) + return_value_handler = ReturnValueHandler(args_and_out) + if debug: + print("Fx code:\n", model.code) + print("LTC IR:", lazy_debug.dump_ir(args_and_out, "text")) + + # TODO: this part is TS backend specific for now and will be generalized to + # support XLA + ( + graph_input_tensor_ids, + graph_input_ivalues, + ) = computation.get_tensors_ts_device_data_node(args_and_out) + assert len(graph_input_tensor_ids) == len(graph_input_ivalues) + graph_input_matcher = GraphInputMatcher( + tensor_id_to_arg_idx, graph_input_tensor_ids, graph_input_ivalues + ) + + graph_hash = computation.get_graph_hash(args_and_out) + + if debug: + print("graph_hash", graph_hash) + print(f"args_tensor_ids {args_tensor_ids}") + print("tensor ids from device data:", graph_input_tensor_ids) + + # sync the list of output tensors so the computation graph for these + # tensors will be cached. Those computation graphs can be retrieved + # by graph hash later. + lazy.sync_multi(args_and_out, []) + + def optimized_mod(*args): + if len(args_and_out) == 0: + return () + graph_input = graph_input_matcher(args) + res = return_value_handler.duplicate_eager_tensors( + computation.run_cached_graph(graph_hash, graph_input) + ) + + assert len(res) == len(args_and_out) + for i, arg in enumerate(args): + # only copy those tensors that get inplace updated + if arg is not res[i]: + arg.copy_(res[i]) + + # skip the args + return res[len(args) :] + + return optimized_mod diff --git a/MLPY/Lib/site-packages/torch/_lazy/ir_cache.py b/MLPY/Lib/site-packages/torch/_lazy/ir_cache.py new file mode 100644 index 0000000000000000000000000000000000000000..63cf09d13b2345210dfb06c33ac77d0dea5d6296 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_lazy/ir_cache.py @@ -0,0 +1,13 @@ +import torch._C._lazy + + +def dump(dot_file_name: str): + """Dump TrieCache in the dot format""" + return torch._C._lazy._dump_ir_cache(dot_file_name) + + +def reset(): + """Clear TrieCache. This is needed in testing to avoid + node reusing between different tests. 
+ """ + return torch._C._lazy._clear_ir_cache() diff --git a/MLPY/Lib/site-packages/torch/_lazy/metrics.py b/MLPY/Lib/site-packages/torch/_lazy/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..662f77fb65d21e5297c4df23b923cb2efa47a655 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_lazy/metrics.py @@ -0,0 +1,21 @@ +import torch._C._lazy + + +def reset(): + """Resets all metric counters.""" + torch._C._lazy._reset_metrics() + + +def counter_names(): + """Retrieves all the currently active counter names.""" + return torch._C._lazy._counter_names() + + +def counter_value(name: str): + """Return the value of the counter with the speficied name""" + return torch._C._lazy._counter_value(name) + + +def metrics_report(): + """Return the combined (lazy core and backend) metric report""" + return torch._C._lazy._metrics_report() diff --git a/MLPY/Lib/site-packages/torch/_lazy/tensor_factory_functions.py b/MLPY/Lib/site-packages/torch/_lazy/tensor_factory_functions.py new file mode 100644 index 0000000000000000000000000000000000000000..01ffe07b101eb3a8713f8d5d0c6ececf452b696e --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_lazy/tensor_factory_functions.py @@ -0,0 +1,48 @@ +import torch + +""" +tensor_factory_functions defines the list of torch functions that create tensors. +The list is grabbed by searching thru native_functions.yaml by the following +regular expression: + + cat native_functions.yaml | grep 'func:' | grep -v "Tensor.*->" | grep "[-]>.*Tensor" + +It's possible that new tensor factory functions are added making this list stale. +Use at your own risk or regenerate the list. +""" +tensor_factory_functions = ( + torch._cudnn_init_dropout_state, + torch.arange, + torch.bartlett_window, + torch.blackman_window, + torch._empty_affine_quantized, + torch.empty_strided, + torch.eye, + torch.full, + torch.from_file, + torch.hann_window, + torch.hamming_window, + torch.kaiser_window, + torch.linspace, + torch.logspace, + torch.ones, + torch.scalar_tensor, + torch.rand, + torch.randint, + torch.randn, + torch.randperm, + torch.range, + torch._efficientzerotensor, + torch.zeros, + torch.tril_indices, + torch.triu_indices, + # Note: the following functions match the regular expression search above but + # they are not available in the torch module. Comment out. 
+ # torch._sparse_coo_tensor_with_dims, + # torch.fft_fftfreq, + # torch.fft_rfftfreq, +) + ( + # torch.tensor is special since it's not in native_functions.yaml + # add it separately + torch.tensor, +) diff --git a/MLPY/Lib/site-packages/torch/_lazy/ts_backend.py b/MLPY/Lib/site-packages/torch/_lazy/ts_backend.py new file mode 100644 index 0000000000000000000000000000000000000000..9b6ea374121c16284382c258eb5e90f51094061d --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_lazy/ts_backend.py @@ -0,0 +1,6 @@ +import torch._C._lazy_ts_backend + + +def init(): + """Initializes the lazy Torchscript backend""" + torch._C._lazy_ts_backend._init() diff --git a/MLPY/Lib/site-packages/torch/_library/__init__.py b/MLPY/Lib/site-packages/torch/_library/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..526349fcaa97c2b8b850a2b33672c8a2eb98b894 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_library/__init__.py @@ -0,0 +1,3 @@ +import torch._library.abstract_impl +import torch._library.simple_registry +import torch._library.utils diff --git a/MLPY/Lib/site-packages/torch/_library/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_library/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1c0e67703f3c51291bdd255a67f0e5ea84fba83a Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_library/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_library/__pycache__/abstract_impl.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_library/__pycache__/abstract_impl.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9131026ad7cf59304fd634e10143f665d20588dc Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_library/__pycache__/abstract_impl.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_library/__pycache__/simple_registry.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_library/__pycache__/simple_registry.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d86083153a2de1f15460cc1cea9cf9c92465ca50 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_library/__pycache__/simple_registry.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_library/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_library/__pycache__/utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6e1eab717dcf37cddef1290cf9aa4fb43703435e Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_library/__pycache__/utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_library/abstract_impl.py b/MLPY/Lib/site-packages/torch/_library/abstract_impl.py new file mode 100644 index 0000000000000000000000000000000000000000..6ef7cbf9fcafc92085b226dbecaddb4b52a990f0 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_library/abstract_impl.py @@ -0,0 +1,206 @@ +import contextlib +import functools +import warnings +from typing import Callable, Optional + +import torch +from torch._library.utils import Kernel, RegistrationHandle + + +class AbstractImplHolder: + """A holder where one can register an abstract impl to.""" + + def __init__(self, qualname: str): + self.qualname: str = qualname + self.kernel: Optional[Kernel] = None + self.lib: Optional[torch.library.Library] = None + + def register(self, func: Callable, source: str) -> RegistrationHandle: + """Register an abstract impl. 
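# Example (sketch): the user-facing flow that ends up in this holder. The
# operator "mylib::add_one" and its kernels are made up; the pattern mirrors
# the impl_abstract example further below. register() also installs the
# abstract impl as the op's Meta kernel, so meta/fake tensors work.
import torch

lib = torch.library.Library("mylib", "FRAGMENT")
lib.define("mylib::add_one(Tensor x) -> Tensor")

@torch.library.impl_abstract("mylib::add_one")
def add_one_abstract(x):
    # shape/dtype propagation only; no data access in an abstract impl
    return torch.empty_like(x)

@torch.library.impl(lib, "add_one", "CPU")
def add_one_cpu(x):
    return x + 1

out = torch.ops.mylib.add_one(torch.empty(3, device="meta"))
assert out.shape == (3,) and out.device.type == "meta"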
+ + Returns a RegistrationHandle that one can use to de-register this + abstract impl. + """ + if self.kernel is not None: + raise RuntimeError( + f"impl_abstract(...): the operator {self.qualname} " + f"already has an abstract impl registered at " + f"{self.kernel.source}." + ) + if torch._C._dispatch_has_kernel_for_dispatch_key(self.qualname, "Meta"): + raise RuntimeError( + f"impl_abstract(...): the operator {self.qualname} " + f"already has an DispatchKey::Meta implementation via a " + f"pre-existing torch.library or TORCH_LIBRARY registration. " + f"Please either remove that registration or don't call " + f"impl_abstract." + ) + + if torch._C._dispatch_has_kernel_for_dispatch_key( + self.qualname, "CompositeImplicitAutograd" + ): + raise RuntimeError( + f"impl_abstract(...): the operator {self.qualname} " + f"already has an implementation for this device type via a " + f"pre-existing registration to " + f"DispatchKey::CompositeImplicitAutograd." + f"CompositeImplicitAutograd operators do not need an abstract " + f"impl; " + f"instead, the operator will decompose into its constituents " + f"and those " + f"can have abstract impls defined on them." + ) + + # Store the kernel in this holder + self.kernel = Kernel(func, source) + + # Also register the abstract impl to Meta key + if self.lib is None: + ns = self.qualname.split("::")[0] + self.lib = torch.library.Library(ns, "FRAGMENT") + meta_kernel = construct_meta_kernel(self.qualname, self) + self.lib.impl(self.qualname, meta_kernel, "Meta") + + def deregister_abstract_impl(): + if self.lib: + self.lib._destroy() + self.lib = None + self.kernel = None + + return RegistrationHandle(deregister_abstract_impl) + + +def construct_meta_kernel( + qualname: str, abstract_impl_holder: AbstractImplHolder +) -> Callable: + assert abstract_impl_holder.kernel is not None + + @functools.wraps(abstract_impl_holder.kernel.func) + def meta_kernel(*args, **kwargs): + assert abstract_impl_holder.kernel is not None + source = abstract_impl_holder.kernel.source + + def error_on_ctx(): + raise RuntimeError( + f"Attempted to call get_ctx() for the meta implementation " + f"for {qualname} (implemented at {source})" + f"You have presumably called get_ctx() because the operator " + f"has a data-dependent output shape; if so, there is no " + f"such meta implementation and this error is the correct " + f"behavior." + ) + + with set_ctx_getter(error_on_ctx): + return abstract_impl_holder.kernel(*args, **kwargs) + + return meta_kernel + + +def get_none(): + return None + + +global_ctx_getter: Callable = get_none + + +@contextlib.contextmanager +def set_ctx_getter(ctx_getter): + global global_ctx_getter + prev = global_ctx_getter + try: + global_ctx_getter = ctx_getter + yield + finally: + global_ctx_getter = prev + + +class AbstractImplCtx: + """ + Context object for writing abstract implementations for custom operators. + """ + + def __init__(self, _shape_env, _op): + self._shape_env = _shape_env + self._op = _op + + def create_unbacked_symint(self, *, min=2, max=None) -> torch.SymInt: + warnings.warn( + "create_unbacked_symint is deprecated, please use new_dynamic_size instead" + ) + return self.new_dynamic_size(min=min, max=max) + + def new_dynamic_size(self, *, min=0, max=None) -> torch.SymInt: + """Constructs a new symint (symbolic int) representing a data-dependent value. 
+ + This is useful for writing the abstract implementation (which is necessary + for torch.compile) for a CustomOp where an output Tensor has a size + that depends on the data of the input Tensors. + + Args: + min (int): A statically known inclusive lower bound for this symint. Default: 0 + max (Optional[int]): A statically known inclusive upper bound for this + symint. Default: None + + .. warning: + + It is important that the ``min`` and ``max`` (if not None) values are set + correctly, otherwise, there will be undefined behavior under + torch.compile. The default value of ``min`` is 2 due to torch.compile + specializing on 0/1 sizes. + + You must also verify that your implementation on concrete Tensors + (e.g. CPU/CUDA) only returns Tensors where the size that corresponds + to the symint also has respects these constraint. + The easiest way to do this is to add an assertion in the CPU/CUDA/etc + implementation that the size follows these bounds. + + Example:: + + >>> # An operator with data-dependent output shape + >>> lib = torch.library.Library("mymodule", "FRAGMENT") + >>> lib.define("mymodule::custom_nonzero(Tensor x) -> Tensor") + >>> + >>> @torch.library.impl_abstract("mymodule::custom_nonzero") + >>> def custom_nonzero_abstract(x): + >>> # Number of nonzero-elements is data-dependent. + >>> # Since we cannot peek at the data in an abstract impl, + >>> # we use the ctx object to construct a new symint that + >>> # represents the data-dependent size. + >>> ctx = torch.library.get_ctx() + >>> nnz = ctx.new_dynamic_size() + >>> shape = [nnz, x.dim()] + >>> result = x.new_empty(shape, dtype=torch.int64) + >>> return result + >>> + >>> @torch.library.impl(lib, "custom_nonzero", "CPU") + >>> def custom_nonzero_cpu(x): + >>> x_np = x.numpy() + >>> res = np.stack(np.nonzero(x_np), axis=1) + >>> return torch.tensor(res, device=x.device) + + """ + if ( + self._shape_env is None + or not self._shape_env.allow_dynamic_output_shape_ops + ): + raise torch._subclasses.fake_tensor.DynamicOutputShapeException(self._op) + + if isinstance(min, torch.SymInt) or isinstance(max, torch.SymInt): + raise ValueError( + f"ctx.new_dynamic_size(min={min}, max={max}): expected " + f"min and max to be statically known ints but got SymInt. " + f"This is not supported." + ) + + if min < 0: + raise ValueError( + f"ctx.new_dynamic_size(min={min}, ...): expected min to be " + f"greater than or equal to 0: this API can only create " + f"non-negative sizes." + ) + + result = self._shape_env.create_unbacked_symint() + torch.fx.experimental.symbolic_shapes._constrain_range_for_size( + result, min=min, max=max + ) + return result diff --git a/MLPY/Lib/site-packages/torch/_library/simple_registry.py b/MLPY/Lib/site-packages/torch/_library/simple_registry.py new file mode 100644 index 0000000000000000000000000000000000000000..6653eed7a2cacb8f3cfba1f7445c8ca65e176f9b --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_library/simple_registry.py @@ -0,0 +1,43 @@ +from .abstract_impl import AbstractImplHolder + +__all__ = ["SimpleLibraryRegistry", "SimpleOperatorEntry", "singleton"] + + +class SimpleLibraryRegistry: + """Registry for the "simple" torch.library APIs + + The "simple" torch.library APIs are a higher-level API on top of the + raw PyTorch DispatchKey registration APIs that includes: + - abstract impl + + Registrations for these APIs do not go into the PyTorch dispatcher's + table because they may not directly involve a DispatchKey. 
For example, + the abstract impl is a Python function that gets invoked by FakeTensor. + Instead, we manage them here. + + SimpleLibraryRegistry is a mapping from a fully qualified operator name + (including the overload) to SimpleOperatorEntry. + """ + + def __init__(self): + self._data = {} + + def find(self, qualname: str) -> "SimpleOperatorEntry": + if qualname not in self._data: + self._data[qualname] = SimpleOperatorEntry(qualname) + return self._data[qualname] + + +singleton: SimpleLibraryRegistry = SimpleLibraryRegistry() + + +class SimpleOperatorEntry: + """This is 1:1 to an operator overload. + + The fields of SimpleOperatorEntry are Holders where kernels can be + registered to. + """ + + def __init__(self, qualname: str): + self.qualname: str = qualname + self.abstract_impl: AbstractImplHolder = AbstractImplHolder(qualname) diff --git a/MLPY/Lib/site-packages/torch/_library/utils.py b/MLPY/Lib/site-packages/torch/_library/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ef7fdd52b67b73c48e575f1d06b26052530f931f --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_library/utils.py @@ -0,0 +1,158 @@ +import dataclasses +import inspect +import sys +from typing import Any, Callable, Tuple + +import torch + + +@dataclasses.dataclass +class Kernel: + """Models a (function, source location)""" + + func: Callable + source: str + + def __call__(self, *args, **kwargs): + return self.func(*args, **kwargs) + + +class RegistrationHandle: + """Does something when someone calls .destroy() on it""" + + def __init__(self, on_destroy: Callable): + self._on_destroy = on_destroy + + def destroy(self) -> None: + self._on_destroy() + + +def get_source(stacklevel: int) -> str: + """Get a string that represents the caller. + + Example: "/path/to/foo.py:42" + + Use stacklevel=1 to get the caller's source + Use stacklevel=2 to get the caller's caller's source + etc. + """ + frame = inspect.getframeinfo(sys._getframe(stacklevel)) + source = f"{frame.filename}:{frame.lineno}" + return source + + +def parse_namespace(qualname: str) -> Tuple[str, str]: + splits = qualname.split("::") + if len(splits) != 2: + raise ValueError( + f"Expected `qualname` to be of the form " + f'"namespace::name", but got {qualname}. ' + f"The qualname passed to the torch.library APIs must consist " + f"of a namespace and a name, e.g. aten::sin" + ) + return splits[0], splits[1] + + +def lookup_op(qualname: str) -> torch._ops.OpOverloadPacket: + namespace, name = parse_namespace(qualname) + if "." in name: + name, overload = name.split(".") + else: + overload = "default" + ns = getattr(torch.ops, namespace) + packet = getattr(ns, name) + return getattr(packet, overload) + + +def is_builtin(op: torch._ops.OpOverload) -> bool: + assert isinstance(op, torch._ops.OpOverload) + return op.namespace in {"aten", "prim", "prims"} + + +def is_functional_schema(schema: Any) -> bool: + """Check if the schema is functional. 
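# Example (sketch): the lookup helpers above resolving a fully qualified
# operator name.
import torch
from torch._library.utils import is_builtin, lookup_op, parse_namespace

assert parse_namespace("aten::sin") == ("aten", "sin")
op = lookup_op("aten::sin")   # no overload given, so ".default" is used
assert is_builtin(op)         # aten/prim/prims count as builtin namespaces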
+ + An operator is functional if: + - it does not mutate any of its inputs + - it does not return a view on any of its inputs + - it has at least one return + """ + + # Lazy import because not all PyTorch builds have torchgen + from torchgen.model import FunctionSchema, SchemaKind + + assert isinstance(schema, (str, FunctionSchema)) + if isinstance(schema, str): + schema = FunctionSchema.parse(schema) + + if schema.kind() != SchemaKind.functional: + return False + rets = schema.returns + is_non_mutating_view = len(rets) > 0 and any( + r.annotation is not None and not r.annotation.is_write for r in rets + ) + if is_non_mutating_view: + return False + if not schema.returns: + return False + return True + + +def mutates_and_returns_first_arg(op: torch._ops.OpOverload): + """Check if an op is an inplace aten op, i.e. it mutates and returns the first arg. + + TODO: torchgen/model.py's FunctionSchema.parse is the source of truth for this, + but not all PyTorch builds have torchgen (due to the yaml dependency being weird). + Figure this out. + + Example: add_(Tensor(a!) x, Tensor y) -> Tensor(a) + """ + if op.namespace != "aten": + return False + schema = op._schema + if not len(schema.returns) == 1: + return False + if schema.returns[0].alias_info is None: + return False + alias_set = schema.returns[0].alias_info.after_set + if len(alias_set) != 1: + return False + loc = next(iter(alias_set)) + if len(schema.arguments) < 1: + return False + first_arg = schema.arguments[0] + if first_arg.alias_info is None: + return False + if not first_arg.alias_info.is_write: + return False + alias_set = first_arg.alias_info.after_set + if len(alias_set) != 1: + return False + if loc != next(iter(alias_set)): + return False + for arg in schema.arguments[1:]: + if arg.alias_info is not None: + return False + return True + + +def zip_schema(schema, args, kwargs): + """zips schema.arguments and (args, kwargs) together. + + Assumes that (args, kwargs) were the inputs to some torch._ops.OpOverload: + that is, kwargs must be keyword-only arguments and default values may be omitted. + """ + assert len(schema.arguments) >= len(args) + len(kwargs) + for i in range(len(schema.arguments)): + info = schema.arguments[i] + if info.kwarg_only: + if info.name in kwargs: + yield info, kwargs[info.name] + continue + if i >= len(args): + # args that are equal to their default values are not populated + # if they are followed by args that are equal to their defaults. + # Skip these. + continue + yield info, args[i] + return diff --git a/MLPY/Lib/site-packages/torch/_linalg_utils.py b/MLPY/Lib/site-packages/torch/_linalg_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6de7fd4f72e0bd3e756cb63cfa25283b50a693c4 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_linalg_utils.py @@ -0,0 +1,164 @@ +"""Various linear algebra utility methods for internal use. + +""" + +from typing import Optional, Tuple + +import torch +from torch import Tensor + + +def is_sparse(A): + """Check if tensor A is a sparse tensor""" + if isinstance(A, torch.Tensor): + return A.layout == torch.sparse_coo + + error_str = "expected Tensor" + if not torch.jit.is_scripting(): + error_str += f" but got {type(A)}" + raise TypeError(error_str) + + +def get_floating_dtype(A): + """Return the floating point dtype of tensor A. + + Integer types map to float32. 
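# Example (sketch): the schema predicates above on a functional and an
# in-place schema. is_functional_schema needs torchgen, which ships with
# full PyTorch builds.
import torch
from torch._library.utils import is_functional_schema, mutates_and_returns_first_arg

assert is_functional_schema("sin(Tensor self) -> Tensor")
assert not is_functional_schema("sin_(Tensor(a!) self) -> Tensor(a!)")
assert mutates_and_returns_first_arg(torch.ops.aten.add_.Tensor)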
+ """ + dtype = A.dtype + if dtype in (torch.float16, torch.float32, torch.float64): + return dtype + return torch.float32 + + +def matmul(A: Optional[Tensor], B: Tensor) -> Tensor: + """Multiply two matrices. + + If A is None, return B. A can be sparse or dense. B is always + dense. + """ + if A is None: + return B + if is_sparse(A): + return torch.sparse.mm(A, B) + return torch.matmul(A, B) + + +def conjugate(A): + """Return conjugate of tensor A. + + .. note:: If A's dtype is not complex, A is returned. + """ + if A.is_complex(): + return A.conj() + return A + + +def transpose(A): + """Return transpose of a matrix or batches of matrices.""" + ndim = len(A.shape) + return A.transpose(ndim - 1, ndim - 2) + + +def transjugate(A): + """Return transpose conjugate of a matrix or batches of matrices.""" + return conjugate(transpose(A)) + + +def bform(X: Tensor, A: Optional[Tensor], Y: Tensor) -> Tensor: + """Return bilinear form of matrices: :math:`X^T A Y`.""" + return matmul(transpose(X), matmul(A, Y)) + + +def qform(A: Optional[Tensor], S: Tensor): + """Return quadratic form :math:`S^T A S`.""" + return bform(S, A, S) + + +def basis(A): + """Return orthogonal basis of A columns.""" + return torch.linalg.qr(A).Q + + +def symeig(A: Tensor, largest: Optional[bool] = False) -> Tuple[Tensor, Tensor]: + """Return eigenpairs of A with specified ordering.""" + if largest is None: + largest = False + E, Z = torch.linalg.eigh(A, UPLO="U") + # assuming that E is ordered + if largest: + E = torch.flip(E, dims=(-1,)) + Z = torch.flip(Z, dims=(-1,)) + return E, Z + + +# These functions were deprecated and removed +# This nice error message can be removed in version 1.13+ +def matrix_rank(input, tol=None, symmetric=False, *, out=None) -> Tensor: + raise RuntimeError( + "This function was deprecated since version 1.9 and is now removed.\n" + "Please use the `torch.linalg.matrix_rank` function instead. " + "The parameter 'symmetric' was renamed in `torch.linalg.matrix_rank()` to 'hermitian'." + ) + + +def solve(input: Tensor, A: Tensor, *, out=None) -> Tuple[Tensor, Tensor]: + raise RuntimeError( + "This function was deprecated since version 1.9 and is now removed. " + "`torch.solve` is deprecated in favor of `torch.linalg.solve`. " + "`torch.linalg.solve` has its arguments reversed and does not return the LU factorization.\n\n" + "To get the LU factorization see `torch.lu`, which can be used with `torch.lu_solve` or `torch.lu_unpack`.\n" + "X = torch.solve(B, A).solution " + "should be replaced with:\n" + "X = torch.linalg.solve(A, B)" + ) + + +def lstsq(input: Tensor, A: Tensor, *, out=None) -> Tuple[Tensor, Tensor]: + raise RuntimeError( + "This function was deprecated since version 1.9 and is now removed. " + "`torch.lstsq` is deprecated in favor of `torch.linalg.lstsq`.\n" + "`torch.linalg.lstsq` has reversed arguments and does not return the QR decomposition in " + "the returned tuple (although it returns other information about the problem).\n\n" + "To get the QR decomposition consider using `torch.linalg.qr`.\n\n" + "The returned solution in `torch.lstsq` stored the residuals of the solution in the " + "last m - n columns of the returned value whenever m > n. 
In torch.linalg.lstsq, " + "the residuals are in the field 'residuals' of the returned named tuple.\n\n" + "The unpacking of the solution, as in\n" + "X, _ = torch.lstsq(B, A).solution[:A.size(1)]\n" + "should be replaced with:\n" + "X = torch.linalg.lstsq(A, B).solution" + ) + + +def _symeig( + input, eigenvectors=False, upper=True, *, out=None +) -> Tuple[Tensor, Tensor]: + raise RuntimeError( + "This function was deprecated since version 1.9 and is now removed. " + "The default behavior has changed from using the upper triangular portion of the matrix by default " + "to using the lower triangular portion.\n\n" + "L, _ = torch.symeig(A, upper=upper) " + "should be replaced with:\n" + "L = torch.linalg.eigvalsh(A, UPLO='U' if upper else 'L')\n\n" + "and\n\n" + "L, V = torch.symeig(A, eigenvectors=True) " + "should be replaced with:\n" + "L, V = torch.linalg.eigh(A, UPLO='U' if upper else 'L')" + ) + + +def eig( + self: Tensor, eigenvectors: bool = False, *, e=None, v=None +) -> Tuple[Tensor, Tensor]: + raise RuntimeError( + "This function was deprecated since version 1.9 and is now removed. " + "`torch.linalg.eig` returns complex tensors of dtype `cfloat` or `cdouble` rather than real tensors " + "mimicking complex tensors.\n\n" + "L, _ = torch.eig(A) " + "should be replaced with:\n" + "L_complex = torch.linalg.eigvals(A)\n\n" + "and\n\n" + "L, V = torch.eig(A, eigenvectors=True) " + "should be replaced with:\n" + "L_complex, V_complex = torch.linalg.eig(A)" + ) diff --git a/MLPY/Lib/site-packages/torch/_lobpcg.py b/MLPY/Lib/site-packages/torch/_lobpcg.py new file mode 100644 index 0000000000000000000000000000000000000000..d686337f5e059a51247f9f93cbda9a3f8a9382a5 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_lobpcg.py @@ -0,0 +1,1167 @@ +"""Locally Optimal Block Preconditioned Conjugate Gradient methods. +""" +# Author: Pearu Peterson +# Created: February 2020 + +from typing import Dict, Optional, Tuple + +import torch +from torch import Tensor +from . import _linalg_utils as _utils +from .overrides import handle_torch_function, has_torch_function + + +__all__ = ["lobpcg"] + + +def _symeig_backward_complete_eigenspace(D_grad, U_grad, A, D, U): + # compute F, such that F_ij = (d_j - d_i)^{-1} for i != j, F_ii = 0 + F = D.unsqueeze(-2) - D.unsqueeze(-1) + F.diagonal(dim1=-2, dim2=-1).fill_(float("inf")) + F.pow_(-1) + + # A.grad = U (D.grad + (U^T U.grad * F)) U^T + Ut = U.mT.contiguous() + res = torch.matmul( + U, torch.matmul(torch.diag_embed(D_grad) + torch.matmul(Ut, U_grad) * F, Ut) + ) + + return res + + +def _polynomial_coefficients_given_roots(roots): + """ + Given the `roots` of a polynomial, find the polynomial's coefficients. + + If roots = (r_1, ..., r_n), then the method returns + coefficients (a_0, a_1, ..., a_n (== 1)) so that + p(x) = (x - r_1) * ... * (x - r_n) + = x^n + a_{n-1} * x^{n-1} + ... a_1 * x_1 + a_0 + + Note: for better performance requires writing a low-level kernel + """ + poly_order = roots.shape[-1] + poly_coeffs_shape = list(roots.shape) + # we assume p(x) = x^n + a_{n-1} * x^{n-1} + ... 
+ a_1 * x + a_0, + # so poly_coeffs = {a_0, ..., a_n, a_{n+1}(== 1)}, + # but we insert one extra coefficient to enable better vectorization below + poly_coeffs_shape[-1] += 2 + poly_coeffs = roots.new_zeros(poly_coeffs_shape) + poly_coeffs[..., 0] = 1 + poly_coeffs[..., -1] = 1 + + # perform the Horner's rule + for i in range(1, poly_order + 1): + # note that it is computationally hard to compute backward for this method, + # because then given the coefficients it would require finding the roots and/or + # calculating the sensitivity based on the Vieta's theorem. + # So the code below tries to circumvent the explicit root finding by series + # of operations on memory copies imitating the Horner's method. + # The memory copies are required to construct nodes in the computational graph + # by exploting the explicit (not in-place, separate node for each step) + # recursion of the Horner's method. + # Needs more memory, O(... * k^2), but with only O(... * k^2) complexity. + poly_coeffs_new = poly_coeffs.clone() if roots.requires_grad else poly_coeffs + out = poly_coeffs_new.narrow(-1, poly_order - i, i + 1) + out -= roots.narrow(-1, i - 1, 1) * poly_coeffs.narrow( + -1, poly_order - i + 1, i + 1 + ) + poly_coeffs = poly_coeffs_new + + return poly_coeffs.narrow(-1, 1, poly_order + 1) + + +def _polynomial_value(poly, x, zero_power, transition): + """ + A generic method for computing poly(x) using the Horner's rule. + + Args: + poly (Tensor): the (possibly batched) 1D Tensor representing + polynomial coefficients such that + poly[..., i] = (a_{i_0}, ..., a{i_n} (==1)), and + poly(x) = poly[..., 0] * zero_power + ... + poly[..., n] * x^n + + x (Tensor): the value (possible batched) to evalate the polynomial `poly` at. + + zero_power (Tensor): the representation of `x^0`. It is application-specific. + + transition (Callable): the function that accepts some intermediate result `int_val`, + the `x` and a specific polynomial coefficient + `poly[..., k]` for some iteration `k`. + It basically performs one iteration of the Horner's rule + defined as `x * int_val + poly[..., k] * zero_power`. + Note that `zero_power` is not a parameter, + because the step `+ poly[..., k] * zero_power` depends on `x`, + whether it is a vector, a matrix, or something else, so this + functionality is delegated to the user. + """ + + res = zero_power.clone() + for k in range(poly.size(-1) - 2, -1, -1): + res = transition(res, x, poly[..., k]) + return res + + +def _matrix_polynomial_value(poly, x, zero_power=None): + """ + Evaluates `poly(x)` for the (batched) matrix input `x`. + Check out `_polynomial_value` function for more details. + """ + + # matrix-aware Horner's rule iteration + def transition(curr_poly_val, x, poly_coeff): + res = x.matmul(curr_poly_val) + res.diagonal(dim1=-2, dim2=-1).add_(poly_coeff.unsqueeze(-1)) + return res + + if zero_power is None: + zero_power = torch.eye( + x.size(-1), x.size(-1), dtype=x.dtype, device=x.device + ).view(*([1] * len(list(x.shape[:-2]))), x.size(-1), x.size(-1)) + + return _polynomial_value(poly, x, zero_power, transition) + + +def _vector_polynomial_value(poly, x, zero_power=None): + """ + Evaluates `poly(x)` for the (batched) vector input `x`. + Check out `_polynomial_value` function for more details. 
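# Example (sketch): sanity-checking the private helpers above. For roots
# (1, 2), (x - 1)(x - 2) = x^2 - 3x + 2, i.e. coefficients (2, -3, 1) in
# ascending order, and Horner evaluation at a matrix A gives A^2 - 3A + 2I.
import torch
from torch._lobpcg import (
    _matrix_polynomial_value,
    _polynomial_coefficients_given_roots,
)

roots = torch.tensor([1.0, 2.0])
coeffs = _polynomial_coefficients_given_roots(roots)
assert torch.allclose(coeffs, torch.tensor([2.0, -3.0, 1.0]))

A = torch.tensor([[0.0, 1.0], [1.0, 0.0]])
pA = _matrix_polynomial_value(coeffs, A)
assert torch.allclose(pA, A @ A - 3 * A + 2 * torch.eye(2))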
+ """ + + # vector-aware Horner's rule iteration + def transition(curr_poly_val, x, poly_coeff): + res = torch.addcmul(poly_coeff.unsqueeze(-1), x, curr_poly_val) + return res + + if zero_power is None: + zero_power = x.new_ones(1).expand(x.shape) + + return _polynomial_value(poly, x, zero_power, transition) + + +def _symeig_backward_partial_eigenspace(D_grad, U_grad, A, D, U, largest): + # compute a projection operator onto an orthogonal subspace spanned by the + # columns of U defined as (I - UU^T) + Ut = U.mT.contiguous() + proj_U_ortho = -U.matmul(Ut) + proj_U_ortho.diagonal(dim1=-2, dim2=-1).add_(1) + + # compute U_ortho, a basis for the orthogonal complement to the span(U), + # by projecting a random [..., m, m - k] matrix onto the subspace spanned + # by the columns of U. + # + # fix generator for determinism + gen = torch.Generator(A.device) + + # orthogonal complement to the span(U) + U_ortho = proj_U_ortho.matmul( + torch.randn( + (*A.shape[:-1], A.size(-1) - D.size(-1)), + dtype=A.dtype, + device=A.device, + generator=gen, + ) + ) + U_ortho_t = U_ortho.mT.contiguous() + + # compute the coefficients of the characteristic polynomial of the tensor D. + # Note that D is diagonal, so the diagonal elements are exactly the roots + # of the characteristic polynomial. + chr_poly_D = _polynomial_coefficients_given_roots(D) + + # the code belows finds the explicit solution to the Sylvester equation + # U_ortho^T A U_ortho dX - dX D = -U_ortho^T A U + # and incorporates it into the whole gradient stored in the `res` variable. + # + # Equivalent to the following naive implementation: + # res = A.new_zeros(A.shape) + # p_res = A.new_zeros(*A.shape[:-1], D.size(-1)) + # for k in range(1, chr_poly_D.size(-1)): + # p_res.zero_() + # for i in range(0, k): + # p_res += (A.matrix_power(k - 1 - i) @ U_grad) * D.pow(i).unsqueeze(-2) + # res -= chr_poly_D[k] * (U_ortho @ poly_D_at_A.inverse() @ U_ortho_t @ p_res @ U.t()) + # + # Note that dX is a differential, so the gradient contribution comes from the backward sensitivity + # Tr(f(U_grad, D_grad, A, U, D)^T dX) = Tr(g(U_grad, A, U, D)^T dA) for some functions f and g, + # and we need to compute g(U_grad, A, U, D) + # + # The naive implementation is based on the paper + # Hu, Qingxi, and Daizhan Cheng. + # "The polynomial solution to the Sylvester matrix equation." + # Applied mathematics letters 19.9 (2006): 859-864. + # + # We can modify the computation of `p_res` from above in a more efficient way + # p_res = U_grad * (chr_poly_D[1] * D.pow(0) + ... + chr_poly_D[k] * D.pow(k)).unsqueeze(-2) + # + A U_grad * (chr_poly_D[2] * D.pow(0) + ... + chr_poly_D[k] * D.pow(k - 1)).unsqueeze(-2) + # + ... 
+ # + A.matrix_power(k - 1) U_grad * chr_poly_D[k] + # Note that this saves us from redundant matrix products with A (elimination of matrix_power) + U_grad_projected = U_grad + series_acc = U_grad_projected.new_zeros(U_grad_projected.shape) + for k in range(1, chr_poly_D.size(-1)): + poly_D = _vector_polynomial_value(chr_poly_D[..., k:], D) + series_acc += U_grad_projected * poly_D.unsqueeze(-2) + U_grad_projected = A.matmul(U_grad_projected) + + # compute chr_poly_D(A) which essentially is: + # + # chr_poly_D_at_A = A.new_zeros(A.shape) + # for k in range(chr_poly_D.size(-1)): + # chr_poly_D_at_A += chr_poly_D[k] * A.matrix_power(k) + # + # Note, however, for better performance we use the Horner's rule + chr_poly_D_at_A = _matrix_polynomial_value(chr_poly_D, A) + + # compute the action of `chr_poly_D_at_A` restricted to U_ortho_t + chr_poly_D_at_A_to_U_ortho = torch.matmul( + U_ortho_t, torch.matmul(chr_poly_D_at_A, U_ortho) + ) + # we need to invert 'chr_poly_D_at_A_to_U_ortho`, for that we compute its + # Cholesky decomposition and then use `torch.cholesky_solve` for better stability. + # Cholesky decomposition requires the input to be positive-definite. + # Note that `chr_poly_D_at_A_to_U_ortho` is positive-definite if + # 1. `largest` == False, or + # 2. `largest` == True and `k` is even + # under the assumption that `A` has distinct eigenvalues. + # + # check if `chr_poly_D_at_A_to_U_ortho` is positive-definite or negative-definite + chr_poly_D_at_A_to_U_ortho_sign = -1 if (largest and (k % 2 == 1)) else +1 + chr_poly_D_at_A_to_U_ortho_L = torch.linalg.cholesky( + chr_poly_D_at_A_to_U_ortho_sign * chr_poly_D_at_A_to_U_ortho + ) + + # compute the gradient part in span(U) + res = _symeig_backward_complete_eigenspace(D_grad, U_grad, A, D, U) + + # incorporate the Sylvester equation solution into the full gradient + # it resides in span(U_ortho) + res -= U_ortho.matmul( + chr_poly_D_at_A_to_U_ortho_sign + * torch.cholesky_solve( + U_ortho_t.matmul(series_acc), chr_poly_D_at_A_to_U_ortho_L + ) + ).matmul(Ut) + + return res + + +def _symeig_backward(D_grad, U_grad, A, D, U, largest): + # if `U` is square, then the columns of `U` is a complete eigenspace + if U.size(-1) == U.size(-2): + return _symeig_backward_complete_eigenspace(D_grad, U_grad, A, D, U) + else: + return _symeig_backward_partial_eigenspace(D_grad, U_grad, A, D, U, largest) + + +class LOBPCGAutogradFunction(torch.autograd.Function): + @staticmethod + def forward( # type: ignore[override] + ctx, + A: Tensor, + k: Optional[int] = None, + B: Optional[Tensor] = None, + X: Optional[Tensor] = None, + n: Optional[int] = None, + iK: Optional[Tensor] = None, + niter: Optional[int] = None, + tol: Optional[float] = None, + largest: Optional[bool] = None, + method: Optional[str] = None, + tracker: None = None, + ortho_iparams: Optional[Dict[str, int]] = None, + ortho_fparams: Optional[Dict[str, float]] = None, + ortho_bparams: Optional[Dict[str, bool]] = None, + ) -> Tuple[Tensor, Tensor]: + # makes sure that input is contiguous for efficiency. + # Note: autograd does not support dense gradients for sparse input yet. 
+ A = A.contiguous() if (not A.is_sparse) else A + if B is not None: + B = B.contiguous() if (not B.is_sparse) else B + + D, U = _lobpcg( + A, + k, + B, + X, + n, + iK, + niter, + tol, + largest, + method, + tracker, + ortho_iparams, + ortho_fparams, + ortho_bparams, + ) + + ctx.save_for_backward(A, B, D, U) + ctx.largest = largest + + return D, U + + @staticmethod + def backward(ctx, D_grad, U_grad): + A_grad = B_grad = None + grads = [None] * 14 + + A, B, D, U = ctx.saved_tensors + largest = ctx.largest + + # lobpcg.backward has some limitations. Checks for unsupported input + if A.is_sparse or (B is not None and B.is_sparse and ctx.needs_input_grad[2]): + raise ValueError( + "lobpcg.backward does not support sparse input yet." + "Note that lobpcg.forward does though." + ) + if ( + A.dtype in (torch.complex64, torch.complex128) + or B is not None + and B.dtype in (torch.complex64, torch.complex128) + ): + raise ValueError( + "lobpcg.backward does not support complex input yet." + "Note that lobpcg.forward does though." + ) + if B is not None: + raise ValueError( + "lobpcg.backward does not support backward with B != I yet." + ) + + if largest is None: + largest = True + + # symeig backward + if B is None: + A_grad = _symeig_backward(D_grad, U_grad, A, D, U, largest) + + # A has index 0 + grads[0] = A_grad + # B has index 2 + grads[2] = B_grad + return tuple(grads) + + +def lobpcg( + A: Tensor, + k: Optional[int] = None, + B: Optional[Tensor] = None, + X: Optional[Tensor] = None, + n: Optional[int] = None, + iK: Optional[Tensor] = None, + niter: Optional[int] = None, + tol: Optional[float] = None, + largest: Optional[bool] = None, + method: Optional[str] = None, + tracker: None = None, + ortho_iparams: Optional[Dict[str, int]] = None, + ortho_fparams: Optional[Dict[str, float]] = None, + ortho_bparams: Optional[Dict[str, bool]] = None, +) -> Tuple[Tensor, Tensor]: + """Find the k largest (or smallest) eigenvalues and the corresponding + eigenvectors of a symmetric positive definite generalized + eigenvalue problem using matrix-free LOBPCG methods. + + This function is a front-end to the following LOBPCG algorithms + selectable via `method` argument: + + `method="basic"` - the LOBPCG method introduced by Andrew + Knyazev, see [Knyazev2001]. A less robust method, may fail when + Cholesky is applied to singular input. + + `method="ortho"` - the LOBPCG method with orthogonal basis + selection [StathopoulosEtal2002]. A robust method. + + Supported inputs are dense, sparse, and batches of dense matrices. + + .. note:: In general, the basic method spends least time per + iteration. However, the robust methods converge much faster and + are more stable. So, the usage of the basic method is generally + not recommended but there exist cases where the usage of the + basic method may be preferred. + + .. warning:: The backward method does not support sparse and complex inputs. + It works only when `B` is not provided (i.e. `B == None`). + We are actively working on extensions, and the details of + the algorithms are going to be published promptly. + + .. warning:: While it is assumed that `A` is symmetric, `A.grad` is not. + To make sure that `A.grad` is symmetric, so that `A - t * A.grad` is symmetric + in first-order optimization routines, prior to running `lobpcg` + we do the following symmetrization map: `A -> (A + A.t()) / 2`. + The map is performed only when the `A` requires gradients. 
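# Example (sketch): calling the public front-end on a small symmetric
# positive-definite matrix. The assert tolerance is deliberately loose;
# LOBPCG is iterative and the accuracy depends on the stopping criterion
# described below.
import torch

torch.manual_seed(0)
Q = torch.randn(9, 9, dtype=torch.float64)
A = Q @ Q.mT + 9 * torch.eye(9, dtype=torch.float64)   # SPD, m = 9
E, X = torch.lobpcg(A, k=2, largest=True)                # m >= 3 * k holds
ref = torch.flip(torch.linalg.eigvalsh(A)[-2:], dims=(0,))
assert torch.allclose(E, ref, rtol=1e-3, atol=1e-3)
assert X.shape == (9, 2)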
+ + Args: + + A (Tensor): the input tensor of size :math:`(*, m, m)` + + B (Tensor, optional): the input tensor of size :math:`(*, m, + m)`. When not specified, `B` is interpreted as + identity matrix. + + X (tensor, optional): the input tensor of size :math:`(*, m, n)` + where `k <= n <= m`. When specified, it is used as + initial approximation of eigenvectors. X must be a + dense tensor. + + iK (tensor, optional): the input tensor of size :math:`(*, m, + m)`. When specified, it will be used as preconditioner. + + k (integer, optional): the number of requested + eigenpairs. Default is the number of :math:`X` + columns (when specified) or `1`. + + n (integer, optional): if :math:`X` is not specified then `n` + specifies the size of the generated random + approximation of eigenvectors. Default value for `n` + is `k`. If :math:`X` is specified, the value of `n` + (when specified) must be the number of :math:`X` + columns. + + tol (float, optional): residual tolerance for stopping + criterion. Default is `feps ** 0.5` where `feps` is + smallest non-zero floating-point number of the given + input tensor `A` data type. + + largest (bool, optional): when True, solve the eigenproblem for + the largest eigenvalues. Otherwise, solve the + eigenproblem for smallest eigenvalues. Default is + `True`. + + method (str, optional): select LOBPCG method. See the + description of the function above. Default is + "ortho". + + niter (int, optional): maximum number of iterations. When + reached, the iteration process is hard-stopped and + the current approximation of eigenpairs is returned. + For infinite iteration but until convergence criteria + is met, use `-1`. + + tracker (callable, optional) : a function for tracing the + iteration process. When specified, it is called at + each iteration step with LOBPCG instance as an + argument. The LOBPCG instance holds the full state of + the iteration process in the following attributes: + + `iparams`, `fparams`, `bparams` - dictionaries of + integer, float, and boolean valued input + parameters, respectively + + `ivars`, `fvars`, `bvars`, `tvars` - dictionaries + of integer, float, boolean, and Tensor valued + iteration variables, respectively. + + `A`, `B`, `iK` - input Tensor arguments. + + `E`, `X`, `S`, `R` - iteration Tensor variables. + + For instance: + + `ivars["istep"]` - the current iteration step + `X` - the current approximation of eigenvectors + `E` - the current approximation of eigenvalues + `R` - the current residual + `ivars["converged_count"]` - the current number of converged eigenpairs + `tvars["rerr"]` - the current state of convergence criteria + + Note that when `tracker` stores Tensor objects from + the LOBPCG instance, it must make copies of these. + + If `tracker` sets `bvars["force_stop"] = True`, the + iteration process will be hard-stopped. + + ortho_iparams, ortho_fparams, ortho_bparams (dict, optional): + various parameters to LOBPCG algorithm when using + `method="ortho"`. + + Returns: + + E (Tensor): tensor of eigenvalues of size :math:`(*, k)` + + X (Tensor): tensor of eigenvectors of size :math:`(*, m, k)` + + References: + + [Knyazev2001] Andrew V. Knyazev. (2001) Toward the Optimal + Preconditioned Eigensolver: Locally Optimal Block Preconditioned + Conjugate Gradient Method. SIAM J. Sci. Comput., 23(2), + 517-541. (25 pages) + https://epubs.siam.org/doi/abs/10.1137/S1064827500366124 + + [StathopoulosEtal2002] Andreas Stathopoulos and Kesheng + Wu. 
(2002) A Block Orthogonalization Procedure with Constant + Synchronization Requirements. SIAM J. Sci. Comput., 23(6), + 2165-2182. (18 pages) + https://epubs.siam.org/doi/10.1137/S1064827500370883 + + [DuerschEtal2018] Jed A. Duersch, Meiyue Shao, Chao Yang, Ming + Gu. (2018) A Robust and Efficient Implementation of LOBPCG. + SIAM J. Sci. Comput., 40(5), C655-C676. (22 pages) + https://epubs.siam.org/doi/abs/10.1137/17M1129830 + + """ + + if not torch.jit.is_scripting(): + tensor_ops = (A, B, X, iK) + if not set(map(type, tensor_ops)).issubset( + (torch.Tensor, type(None)) + ) and has_torch_function(tensor_ops): + return handle_torch_function( + lobpcg, + tensor_ops, + A, + k=k, + B=B, + X=X, + n=n, + iK=iK, + niter=niter, + tol=tol, + largest=largest, + method=method, + tracker=tracker, + ortho_iparams=ortho_iparams, + ortho_fparams=ortho_fparams, + ortho_bparams=ortho_bparams, + ) + + if not torch._jit_internal.is_scripting(): + if A.requires_grad or (B is not None and B.requires_grad): + # While it is expected that `A` is symmetric, + # the `A_grad` might be not. Therefore we perform the trick below, + # so that `A_grad` becomes symmetric. + # The symmetrization is important for first-order optimization methods, + # so that (A - alpha * A_grad) is still a symmetric matrix. + # Same holds for `B`. + A_sym = (A + A.mT) / 2 + B_sym = (B + B.mT) / 2 if (B is not None) else None + + return LOBPCGAutogradFunction.apply( + A_sym, + k, + B_sym, + X, + n, + iK, + niter, + tol, + largest, + method, + tracker, + ortho_iparams, + ortho_fparams, + ortho_bparams, + ) + else: + if A.requires_grad or (B is not None and B.requires_grad): + raise RuntimeError( + "Script and require grads is not supported atm." + "If you just want to do the forward, use .detach()" + "on A and B before calling into lobpcg" + ) + + return _lobpcg( + A, + k, + B, + X, + n, + iK, + niter, + tol, + largest, + method, + tracker, + ortho_iparams, + ortho_fparams, + ortho_bparams, + ) + + +def _lobpcg( + A: Tensor, + k: Optional[int] = None, + B: Optional[Tensor] = None, + X: Optional[Tensor] = None, + n: Optional[int] = None, + iK: Optional[Tensor] = None, + niter: Optional[int] = None, + tol: Optional[float] = None, + largest: Optional[bool] = None, + method: Optional[str] = None, + tracker: None = None, + ortho_iparams: Optional[Dict[str, int]] = None, + ortho_fparams: Optional[Dict[str, float]] = None, + ortho_bparams: Optional[Dict[str, bool]] = None, +) -> Tuple[Tensor, Tensor]: + # A must be square: + assert A.shape[-2] == A.shape[-1], A.shape + if B is not None: + # A and B must have the same shapes: + assert A.shape == B.shape, (A.shape, B.shape) + + dtype = _utils.get_floating_dtype(A) + device = A.device + if tol is None: + feps = {torch.float32: 1.2e-07, torch.float64: 2.23e-16}[dtype] + tol = feps**0.5 + + m = A.shape[-1] + k = (1 if X is None else X.shape[-1]) if k is None else k + n = (k if n is None else n) if X is None else X.shape[-1] + + if m < 3 * n: + raise ValueError( + f"LPBPCG algorithm is not applicable when the number of A rows (={m})" + f" is smaller than 3 x the number of requested eigenpairs (={n})" + ) + + method = "ortho" if method is None else method + + iparams = { + "m": m, + "n": n, + "k": k, + "niter": 1000 if niter is None else niter, + } + + fparams = { + "tol": tol, + } + + bparams = {"largest": True if largest is None else largest} + + if method == "ortho": + if ortho_iparams is not None: + iparams.update(ortho_iparams) + if ortho_fparams is not None: + fparams.update(ortho_fparams) + if 
ortho_bparams is not None: + bparams.update(ortho_bparams) + iparams["ortho_i_max"] = iparams.get("ortho_i_max", 3) + iparams["ortho_j_max"] = iparams.get("ortho_j_max", 3) + fparams["ortho_tol"] = fparams.get("ortho_tol", tol) + fparams["ortho_tol_drop"] = fparams.get("ortho_tol_drop", tol) + fparams["ortho_tol_replace"] = fparams.get("ortho_tol_replace", tol) + bparams["ortho_use_drop"] = bparams.get("ortho_use_drop", False) + + if not torch.jit.is_scripting(): + LOBPCG.call_tracker = LOBPCG_call_tracker # type: ignore[method-assign] + + if len(A.shape) > 2: + N = int(torch.prod(torch.tensor(A.shape[:-2]))) + bA = A.reshape((N,) + A.shape[-2:]) + bB = B.reshape((N,) + A.shape[-2:]) if B is not None else None + bX = X.reshape((N,) + X.shape[-2:]) if X is not None else None + bE = torch.empty((N, k), dtype=dtype, device=device) + bXret = torch.empty((N, m, k), dtype=dtype, device=device) + + for i in range(N): + A_ = bA[i] + B_ = bB[i] if bB is not None else None + X_ = ( + torch.randn((m, n), dtype=dtype, device=device) if bX is None else bX[i] + ) + assert len(X_.shape) == 2 and X_.shape == (m, n), (X_.shape, (m, n)) + iparams["batch_index"] = i + worker = LOBPCG(A_, B_, X_, iK, iparams, fparams, bparams, method, tracker) + worker.run() + bE[i] = worker.E[:k] + bXret[i] = worker.X[:, :k] + + if not torch.jit.is_scripting(): + LOBPCG.call_tracker = LOBPCG_call_tracker_orig # type: ignore[method-assign] + + return bE.reshape(A.shape[:-2] + (k,)), bXret.reshape(A.shape[:-2] + (m, k)) + + X = torch.randn((m, n), dtype=dtype, device=device) if X is None else X + assert len(X.shape) == 2 and X.shape == (m, n), (X.shape, (m, n)) + + worker = LOBPCG(A, B, X, iK, iparams, fparams, bparams, method, tracker) + + worker.run() + + if not torch.jit.is_scripting(): + LOBPCG.call_tracker = LOBPCG_call_tracker_orig # type: ignore[method-assign] + + return worker.E[:k], worker.X[:, :k] + + +class LOBPCG: + """Worker class of LOBPCG methods.""" + + def __init__( + self, + A: Optional[Tensor], + B: Optional[Tensor], + X: Tensor, + iK: Optional[Tensor], + iparams: Dict[str, int], + fparams: Dict[str, float], + bparams: Dict[str, bool], + method: str, + tracker: None, + ) -> None: + # constant parameters + self.A = A + self.B = B + self.iK = iK + self.iparams = iparams + self.fparams = fparams + self.bparams = bparams + self.method = method + self.tracker = tracker + m = iparams["m"] + n = iparams["n"] + + # variable parameters + self.X = X + self.E = torch.zeros((n,), dtype=X.dtype, device=X.device) + self.R = torch.zeros((m, n), dtype=X.dtype, device=X.device) + self.S = torch.zeros((m, 3 * n), dtype=X.dtype, device=X.device) + self.tvars: Dict[str, Tensor] = {} + self.ivars: Dict[str, int] = {"istep": 0} + self.fvars: Dict[str, float] = {"_": 0.0} + self.bvars: Dict[str, bool] = {"_": False} + + def __str__(self): + lines = ["LOPBCG:"] + lines += [f" iparams={self.iparams}"] + lines += [f" fparams={self.fparams}"] + lines += [f" bparams={self.bparams}"] + lines += [f" ivars={self.ivars}"] + lines += [f" fvars={self.fvars}"] + lines += [f" bvars={self.bvars}"] + lines += [f" tvars={self.tvars}"] + lines += [f" A={self.A}"] + lines += [f" B={self.B}"] + lines += [f" iK={self.iK}"] + lines += [f" X={self.X}"] + lines += [f" E={self.E}"] + r = "" + for line in lines: + r += line + "\n" + return r + + def update(self): + """Set and update iteration variables.""" + if self.ivars["istep"] == 0: + X_norm = float(torch.norm(self.X)) + iX_norm = X_norm**-1 + A_norm = float(torch.norm(_utils.matmul(self.A, self.X))) 
* iX_norm + B_norm = float(torch.norm(_utils.matmul(self.B, self.X))) * iX_norm + self.fvars["X_norm"] = X_norm + self.fvars["A_norm"] = A_norm + self.fvars["B_norm"] = B_norm + self.ivars["iterations_left"] = self.iparams["niter"] + self.ivars["converged_count"] = 0 + self.ivars["converged_end"] = 0 + + if self.method == "ortho": + self._update_ortho() + else: + self._update_basic() + + self.ivars["iterations_left"] = self.ivars["iterations_left"] - 1 + self.ivars["istep"] = self.ivars["istep"] + 1 + + def update_residual(self): + """Update residual R from A, B, X, E.""" + mm = _utils.matmul + self.R = mm(self.A, self.X) - mm(self.B, self.X) * self.E + + def update_converged_count(self): + """Determine the number of converged eigenpairs using backward stable + convergence criterion, see discussion in Sec 4.3 of [DuerschEtal2018]. + + Users may redefine this method for custom convergence criteria. + """ + # (...) -> int + prev_count = self.ivars["converged_count"] + tol = self.fparams["tol"] + A_norm = self.fvars["A_norm"] + B_norm = self.fvars["B_norm"] + E, X, R = self.E, self.X, self.R + rerr = ( + torch.norm(R, 2, (0,)) + * (torch.norm(X, 2, (0,)) * (A_norm + E[: X.shape[-1]] * B_norm)) ** -1 + ) + converged = rerr < tol + count = 0 + for b in converged: + if not b: + # ignore convergence of following pairs to ensure + # strict ordering of eigenpairs + break + count += 1 + assert ( + count >= prev_count + ), f"the number of converged eigenpairs (was {prev_count}, got {count}) cannot decrease" + self.ivars["converged_count"] = count + self.tvars["rerr"] = rerr + return count + + def stop_iteration(self): + """Return True to stop iterations. + + Note that tracker (if defined) can force-stop iterations by + setting ``worker.bvars['force_stop'] = True``. + """ + return ( + self.bvars.get("force_stop", False) + or self.ivars["iterations_left"] == 0 + or self.ivars["converged_count"] >= self.iparams["k"] + ) + + def run(self): + """Run LOBPCG iterations. + + Use this method as a template for implementing LOBPCG + iteration scheme with custom tracker that is compatible with + TorchScript. + """ + self.update() + + if not torch.jit.is_scripting() and self.tracker is not None: + self.call_tracker() + + while not self.stop_iteration(): + self.update() + + if not torch.jit.is_scripting() and self.tracker is not None: + self.call_tracker() + + @torch.jit.unused + def call_tracker(self): + """Interface for tracking iteration process in Python mode. + + Tracking the iteration process is disabled in TorchScript + mode. In fact, one should specify tracker=None when JIT + compiling functions using lobpcg. + """ + # do nothing when in TorchScript mode + pass + + # Internal methods + + def _update_basic(self): + """ + Update or initialize iteration variables when `method == "basic"`. 
+ """ + mm = torch.matmul + ns = self.ivars["converged_end"] + nc = self.ivars["converged_count"] + n = self.iparams["n"] + largest = self.bparams["largest"] + + if self.ivars["istep"] == 0: + Ri = self._get_rayleigh_ritz_transform(self.X) + M = _utils.qform(_utils.qform(self.A, self.X), Ri) + E, Z = _utils.symeig(M, largest) + self.X[:] = mm(self.X, mm(Ri, Z)) + self.E[:] = E + np = 0 + self.update_residual() + nc = self.update_converged_count() + self.S[..., :n] = self.X + + W = _utils.matmul(self.iK, self.R) + self.ivars["converged_end"] = ns = n + np + W.shape[-1] + self.S[:, n + np : ns] = W + else: + S_ = self.S[:, nc:ns] + Ri = self._get_rayleigh_ritz_transform(S_) + M = _utils.qform(_utils.qform(self.A, S_), Ri) + E_, Z = _utils.symeig(M, largest) + self.X[:, nc:] = mm(S_, mm(Ri, Z[:, : n - nc])) + self.E[nc:] = E_[: n - nc] + P = mm(S_, mm(Ri, Z[:, n : 2 * n - nc])) + np = P.shape[-1] + + self.update_residual() + nc = self.update_converged_count() + self.S[..., :n] = self.X + self.S[:, n : n + np] = P + W = _utils.matmul(self.iK, self.R[:, nc:]) + + self.ivars["converged_end"] = ns = n + np + W.shape[-1] + self.S[:, n + np : ns] = W + + def _update_ortho(self): + """ + Update or initialize iteration variables when `method == "ortho"`. + """ + mm = torch.matmul + ns = self.ivars["converged_end"] + nc = self.ivars["converged_count"] + n = self.iparams["n"] + largest = self.bparams["largest"] + + if self.ivars["istep"] == 0: + Ri = self._get_rayleigh_ritz_transform(self.X) + M = _utils.qform(_utils.qform(self.A, self.X), Ri) + E, Z = _utils.symeig(M, largest) + self.X = mm(self.X, mm(Ri, Z)) + self.update_residual() + np = 0 + nc = self.update_converged_count() + self.S[:, :n] = self.X + W = self._get_ortho(self.R, self.X) + ns = self.ivars["converged_end"] = n + np + W.shape[-1] + self.S[:, n + np : ns] = W + + else: + S_ = self.S[:, nc:ns] + # Rayleigh-Ritz procedure + E_, Z = _utils.symeig(_utils.qform(self.A, S_), largest) + + # Update E, X, P + self.X[:, nc:] = mm(S_, Z[:, : n - nc]) + self.E[nc:] = E_[: n - nc] + P = mm( + S_, + mm( + Z[:, n - nc :], + _utils.basis(_utils.transpose(Z[: n - nc, n - nc :])), + ), + ) + np = P.shape[-1] + + # check convergence + self.update_residual() + nc = self.update_converged_count() + + # update S + self.S[:, :n] = self.X + self.S[:, n : n + np] = P + W = self._get_ortho(self.R[:, nc:], self.S[:, : n + np]) + ns = self.ivars["converged_end"] = n + np + W.shape[-1] + self.S[:, n + np : ns] = W + + def _get_rayleigh_ritz_transform(self, S): + """Return a transformation matrix that is used in Rayleigh-Ritz + procedure for reducing a general eigenvalue problem :math:`(S^TAS) + C = (S^TBS) C E` to a standard eigenvalue problem :math: `(Ri^T + S^TAS Ri) Z = Z E` where `C = Ri Z`. + + .. 
note:: In the original Rayleight-Ritz procedure in + [DuerschEtal2018], the problem is formulated as follows:: + + SAS = S^T A S + SBS = S^T B S + D = () ** -1/2 + R^T R = Cholesky(D SBS D) + Ri = D R^-1 + solve symeig problem Ri^T SAS Ri Z = Theta Z + C = Ri Z + + To reduce the number of matrix products (denoted by empty + space between matrices), here we introduce element-wise + products (denoted by symbol `*`) so that the Rayleight-Ritz + procedure becomes:: + + SAS = S^T A S + SBS = S^T B S + d = () ** -1/2 # this is 1-d column vector + dd = d d^T # this is 2-d matrix + R^T R = Cholesky(dd * SBS) + Ri = R^-1 * d # broadcasting + solve symeig problem Ri^T SAS Ri Z = Theta Z + C = Ri Z + + where `dd` is 2-d matrix that replaces matrix products `D M + D` with one element-wise product `M * dd`; and `d` replaces + matrix product `D M` with element-wise product `M * + d`. Also, creating the diagonal matrix `D` is avoided. + + Args: + S (Tensor): the matrix basis for the search subspace, size is + :math:`(m, n)`. + + Returns: + Ri (tensor): upper-triangular transformation matrix of size + :math:`(n, n)`. + + """ + B = self.B + mm = torch.matmul + SBS = _utils.qform(B, S) + d_row = SBS.diagonal(0, -2, -1) ** -0.5 + d_col = d_row.reshape(d_row.shape[0], 1) + # TODO use torch.linalg.cholesky_solve once it is implemented + R = torch.linalg.cholesky((SBS * d_row) * d_col, upper=True) + return torch.linalg.solve_triangular( + R, d_row.diag_embed(), upper=True, left=False + ) + + def _get_svqb( + self, U: Tensor, drop: bool, tau: float # Tensor # bool # float + ) -> Tensor: + """Return B-orthonormal U. + + .. note:: When `drop` is `False` then `svqb` is based on the + Algorithm 4 from [DuerschPhD2015] that is a slight + modification of the corresponding algorithm + introduced in [StathopolousWu2002]. + + Args: + + U (Tensor) : initial approximation, size is (m, n) + drop (bool) : when True, drop columns that + contribution to the `span([U])` is small. + tau (float) : positive tolerance + + Returns: + + U (Tensor) : B-orthonormal columns (:math:`U^T B U = I`), size + is (m, n1), where `n1 = n` if `drop` is `False, + otherwise `n1 <= n`. + + """ + if torch.numel(U) == 0: + return U + UBU = _utils.qform(self.B, U) + d = UBU.diagonal(0, -2, -1) + + # Detect and drop exact zero columns from U. While the test + # `abs(d) == 0` is unlikely to be True for random data, it is + # possible to construct input data to lobpcg where it will be + # True leading to a failure (notice the `d ** -0.5` operation + # in the original algorithm). To prevent the failure, we drop + # the exact zero columns here and then continue with the + # original algorithm below. + nz = torch.where(abs(d) != 0.0) + assert len(nz) == 1, nz + if len(nz[0]) < len(d): + U = U[:, nz[0]] + if torch.numel(U) == 0: + return U + UBU = _utils.qform(self.B, U) + d = UBU.diagonal(0, -2, -1) + nz = torch.where(abs(d) != 0.0) + assert len(nz[0]) == len(d) + + # The original algorithm 4 from [DuerschPhD2015]. + d_col = (d**-0.5).reshape(d.shape[0], 1) + DUBUD = (UBU * d_col) * _utils.transpose(d_col) + E, Z = _utils.symeig(DUBUD) + t = tau * abs(E).max() + if drop: + keep = torch.where(E > t) + assert len(keep) == 1, keep + E = E[keep[0]] + Z = Z[:, keep[0]] + d_col = d_col[keep[0]] + else: + E[(torch.where(E < t))[0]] = t + + return torch.matmul(U * _utils.transpose(d_col), Z * E**-0.5) + + def _get_ortho(self, U, V): + """Return B-orthonormal U with columns are B-orthogonal to V. + + .. 
note:: When `bparams["ortho_use_drop"] == False` then + `_get_ortho` is based on the Algorithm 3 from + [DuerschPhD2015] that is a slight modification of + the corresponding algorithm introduced in + [StathopolousWu2002]. Otherwise, the method + implements Algorithm 6 from [DuerschPhD2015] + + .. note:: If all U columns are B-collinear to V then the + returned tensor U will be empty. + + Args: + + U (Tensor) : initial approximation, size is (m, n) + V (Tensor) : B-orthogonal external basis, size is (m, k) + + Returns: + + U (Tensor) : B-orthonormal columns (:math:`U^T B U = I`) + such that :math:`V^T B U=0`, size is (m, n1), + where `n1 = n` if `drop` is `False, otherwise + `n1 <= n`. + """ + mm = torch.matmul + mm_B = _utils.matmul + m = self.iparams["m"] + tau_ortho = self.fparams["ortho_tol"] + tau_drop = self.fparams["ortho_tol_drop"] + tau_replace = self.fparams["ortho_tol_replace"] + i_max = self.iparams["ortho_i_max"] + j_max = self.iparams["ortho_j_max"] + # when use_drop==True, enable dropping U columns that have + # small contribution to the `span([U, V])`. + use_drop = self.bparams["ortho_use_drop"] + + # clean up variables from the previous call + for vkey in list(self.fvars.keys()): + if vkey.startswith("ortho_") and vkey.endswith("_rerr"): + self.fvars.pop(vkey) + self.ivars.pop("ortho_i", 0) + self.ivars.pop("ortho_j", 0) + + BV_norm = torch.norm(mm_B(self.B, V)) + BU = mm_B(self.B, U) + VBU = mm(_utils.transpose(V), BU) + i = j = 0 + stats = "" + for i in range(i_max): + U = U - mm(V, VBU) + drop = False + tau_svqb = tau_drop + for j in range(j_max): + if use_drop: + U = self._get_svqb(U, drop, tau_svqb) + drop = True + tau_svqb = tau_replace + else: + U = self._get_svqb(U, False, tau_replace) + if torch.numel(U) == 0: + # all initial U columns are B-collinear to V + self.ivars["ortho_i"] = i + self.ivars["ortho_j"] = j + return U + BU = mm_B(self.B, U) + UBU = mm(_utils.transpose(U), BU) + U_norm = torch.norm(U) + BU_norm = torch.norm(BU) + R = UBU - torch.eye(UBU.shape[-1], device=UBU.device, dtype=UBU.dtype) + R_norm = torch.norm(R) + # https://github.com/pytorch/pytorch/issues/33810 workaround: + rerr = float(R_norm) * float(BU_norm * U_norm) ** -1 + vkey = f"ortho_UBUmI_rerr[{i}, {j}]" + self.fvars[vkey] = rerr + if rerr < tau_ortho: + break + VBU = mm(_utils.transpose(V), BU) + VBU_norm = torch.norm(VBU) + U_norm = torch.norm(U) + rerr = float(VBU_norm) * float(BV_norm * U_norm) ** -1 + vkey = f"ortho_VBU_rerr[{i}]" + self.fvars[vkey] = rerr + if rerr < tau_ortho: + break + if m < U.shape[-1] + V.shape[-1]: + # TorchScript needs the class var to be assigned to a local to + # do optional type refinement + B = self.B + assert B is not None + raise ValueError( + "Overdetermined shape of U:" + f" #B-cols(={B.shape[-1]}) >= #U-cols(={U.shape[-1]}) + #V-cols(={V.shape[-1]}) must hold" + ) + self.ivars["ortho_i"] = i + self.ivars["ortho_j"] = j + return U + + +# Calling tracker is separated from LOBPCG definitions because +# TorchScript does not support user-defined callback arguments: +LOBPCG_call_tracker_orig = LOBPCG.call_tracker + + +def LOBPCG_call_tracker(self): + self.tracker(self) diff --git a/MLPY/Lib/site-packages/torch/_logging/__init__.py b/MLPY/Lib/site-packages/torch/_logging/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b2a388324056a62d7b70e75c7284c0ae3f79c06a --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_logging/__init__.py @@ -0,0 +1,16 @@ +# Top level logging module for torch logging +# Design doc: 
https://docs.google.com/document/d/1ZRfTWKa8eaPq1AxaiHrq4ASTPouzzlPiuquSBEJYwS8/edit# +# Simple setup for onboarding (see above doc for more detail): +# 1. register any top-level log qualified name for your module in torch._logging._registrations (see there for examples) +# 2. register any artifacts ( below) in torch._logging._registrations +# a. call getArtifactLogger(__name__, ) at your logging site instead of the standard logger to log your artifact +import torch._logging._registrations +from ._internal import ( + _init_logs, + DEFAULT_LOGGING, + getArtifactLogger, + LazyString, + set_logs, + trace_structured, + warning_once, +) diff --git a/MLPY/Lib/site-packages/torch/_logging/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_logging/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4825548a3f3211f4faa64d0c2a0b473488f7ad23 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_logging/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_logging/__pycache__/_internal.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_logging/__pycache__/_internal.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7ad80d979c054f1c8f5968320d6078162b28ec02 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_logging/__pycache__/_internal.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_logging/__pycache__/_registrations.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_logging/__pycache__/_registrations.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7480f6096e877f0df667072af729ce7e195d6660 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_logging/__pycache__/_registrations.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_logging/__pycache__/structured.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_logging/__pycache__/structured.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..42e1e97fc6e2fff82e9dca353fcf68ed8ac33a99 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_logging/__pycache__/structured.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_logging/_internal.py b/MLPY/Lib/site-packages/torch/_logging/_internal.py new file mode 100644 index 0000000000000000000000000000000000000000..4fad1c394cf704f0147c7e43927cec5b8c4e29b1 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_logging/_internal.py @@ -0,0 +1,1085 @@ +import functools +import hashlib +import itertools +import json +import logging +import os +import os.path +import re +import tempfile +from dataclasses import dataclass, field +from importlib import __import__ +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union +from weakref import WeakSet + +log = logging.getLogger(__name__) + +# This is a synthetic logger which doesn't correspond to an actual logger, +# but handles all of our "tracing" logging, which is structured and doesn't go +# to stderr but always goes to a dedicated log file. We don't put these +# loggers in the classic module hierarchy, because we don't want a suppression +# of logs to also cause a trace to get suppressed (traces typically are not +# collected, unless we are in prod, in which case they always are collected.) +# +# TODO: Maybe we should allow for some sub-hierarchy so you can control which +# traces you want to collect, for performance reasons. 
+# +# See https://docs.google.com/document/d/1CX_hJ0PNy9f3R1y8TJrfkSeLkvGjjjLU84BSXgS2AZ8/edit +trace_log = logging.getLogger("torch.__trace") + +DEFAULT_LOG_LEVEL = logging.WARNING +LOG_ENV_VAR = "TORCH_LOGS" +LOG_OUT_ENV_VAR = "TORCH_LOGS_OUT" +LOG_FORMAT_ENV_VAR = "TORCH_LOGS_FORMAT" +TRACE_ENV_VAR = "TORCH_TRACE" + + +@dataclass +class LogRegistry: + # shorthand name to log qualified name + # Note: this only contains loggers registered + # from register_log + # e.g. "dynamo" -> "torch._dynamo" + log_alias_to_log_qnames: Dict[str, List[str]] = field(default_factory=dict) + + # artifact logger qualified names, + # this is populated lazily, as calls to getArtifactLogger + # currently formatted as .__ + # e.g. "torch._dynamo.convert_frame.__guards" + artifact_log_qnames: Set[str] = field(default_factory=set) + + # child logs of registered logs if specified via open + # registration by the user (ie placing "torch._dynamo.output_graph" in the env var) + # these need to be tracked so their levels can be reset properly + # e.g. "torch._dynamo.output_graph" + child_log_qnames: Set[str] = field(default_factory=set) + + # artifact names, populated by register_artifact + # e.g. "guards" + artifact_names: Set[str] = field(default_factory=set) + + # Artifacts that should be visible by default in the error message + visible_artifacts: Set[str] = field(default_factory=set) + + # A short description of each artifact + artifact_descriptions: Dict[str, str] = field(default_factory=dict) + + # artifacts which are not displayed unless explicitly named in the + # settings. Ex. output_code is NOT displayed even if the inductor + # log level is set to DEBUG. It must be explicitly named in the settings + off_by_default_artifact_names: Set[str] = field(default_factory=set) + + # logging format string for artifacts + artifact_log_formatters: Dict[str, logging.Formatter] = field(default_factory=dict) + + def is_artifact(self, name): + return name in self.artifact_names + + def is_log(self, alias): + return alias in self.log_alias_to_log_qnames + + # register a log with an alias + def register_log(self, alias, log_qnames: Union[str, List[str]]): + if isinstance(log_qnames, str): + log_qnames = [log_qnames] + self.log_alias_to_log_qnames[alias] = log_qnames + + # register an artifact name + def register_artifact_name( + self, name, description, visible, off_by_default, log_format + ): + self.artifact_names.add(name) + if visible: + self.visible_artifacts.add(name) + self.artifact_descriptions[name] = description + + # if off by default, don't enable it + # when log_name's log_level is set to DEBUG + if off_by_default: + self.off_by_default_artifact_names.add(name) + + if log_format is not None: + self.artifact_log_formatters[name] = logging.Formatter(log_format) + + # register the qualified name of an artifact log + # this is needed to know which logs need to be reset + # whenever the log_state is changed + def register_artifact_log(self, artifact_log_qname): + self.artifact_log_qnames.add(artifact_log_qname) + + def register_child_log(self, log_qname): + self.child_log_qnames.add(log_qname) + + # flattens all the qnames together (TODO: consider memoizing?) 
+ def get_log_qnames(self) -> Set[str]: + return { + qname + for qnames in self.log_alias_to_log_qnames.values() + for qname in qnames + } + + def get_artifact_log_qnames(self): + return set(self.artifact_log_qnames) + + def get_child_log_qnames(self): + return set(self.child_log_qnames) + + def is_off_by_default(self, artifact_qname): + return artifact_qname in self.off_by_default_artifact_names + + +@dataclass +class LogState: + # qualified log names -> currently set log level + log_qname_to_level: Dict[str, str] = field(default_factory=dict) + + # the set of currently enabled artifacts + artifact_names: Set[str] = field(default_factory=set) + + def enable_artifact(self, artifact_name): + self.artifact_names.add(artifact_name) + + def is_artifact_enabled(self, name): + return name in self.artifact_names + + def enable_log(self, log_qnames, log_level): + if isinstance(log_qnames, str): + log_qnames = [log_qnames] + for log_qname in log_qnames: + self.log_qname_to_level[log_qname] = log_level + + def get_log_level_pairs(self): + """Returns all qualified module names for which the user requested + explicit logging settings. + + .. warning: + + This function used to return all loggers, regardless of whether + or not the user specified them or not; it now only returns logs + which were explicitly mentioned by the user (and torch, which + always is implicitly requested when we initialize our logging + subsystem.) + """ + return self.log_qname_to_level.items() + + def clear(self): + self.log_qname_to_level.clear() + self.artifact_names.clear() + + +log_registry = LogRegistry() +log_state = LogState() + +# sample usage: torch._logging.set_logs(**torch._logging.DEFAULT_LOGGING) +DEFAULT_LOGGING = { + "dynamo": logging.DEBUG, + "aot": logging.DEBUG, + "inductor": logging.DEBUG, + "ddp_graphs": True, + "graph_breaks": True, + "guards": True, + "recompiles": True, + "dynamic": logging.INFO, +} + + +def set_logs( + *, + all: Optional[int] = None, + dynamo: Optional[int] = None, + aot: Optional[int] = None, + autograd: Optional[int] = None, + dynamic: Optional[int] = None, + inductor: Optional[int] = None, + distributed: Optional[int] = None, + dist_c10d: Optional[int] = None, + dist_ddp: Optional[int] = None, + dist_fsdp: Optional[int] = None, + onnx: Optional[int] = None, + bytecode: bool = False, + aot_graphs: bool = False, + aot_joint_graph: bool = False, + ddp_graphs: bool = False, + graph: bool = False, + graph_code: bool = False, + graph_breaks: bool = False, + graph_sizes: bool = False, + guards: bool = False, + recompiles: bool = False, + recompiles_verbose: bool = False, + trace_source: bool = False, + trace_call: bool = False, + output_code: bool = False, + schedule: bool = False, + perf_hints: bool = False, + post_grad_graphs: bool = False, + onnx_diagnostics: bool = False, + fusion: bool = False, + overlap: bool = False, + export: Optional[int] = None, + modules: Optional[Dict[str, Union[int, bool]]] = None, + cudagraphs: bool = False, + sym_node: bool = False, +): + """ + Sets the log level for individual components and toggles individual log + artifact types. + + .. warning:: This feature is a prototype and may have compatibility + breaking changes in the future. + + .. note:: The ``TORCH_LOGS`` environment variable has complete precedence + over this function, so if it was set, this function does nothing. + + A component is a set of related features in PyTorch. All of the log + messages emitted from a given component have their own log levels. 
If the + log level of a particular message has priority greater than or equal to its + component's log level setting, it is emitted. Otherwise, it is suppressed. + This allows you to, for instance, silence large groups of log messages that + are not relevant to you and increase verbosity of logs for components that + are relevant. The expected log level values, ordered from highest to lowest + priority, are: + + * ``logging.CRITICAL`` + * ``logging.ERROR`` + * ``logging.WARNING`` + * ``logging.INFO`` + * ``logging.DEBUG`` + * ``logging.NOTSET`` + + See documentation for the Python ``logging`` module for more information on + log levels: ``_ + + An artifact is a particular type of log message. Each artifact is assigned + to a parent component. A component can emit many different kinds of + artifacts. In general, an artifact is emitted if either its corresponding + setting in the argument list below is turned on or if its parent component + is set to a log level less than or equal to the log level of the artifact. + + Keyword args: + all (:class:`Optional[int]`): + The default log level for all components. Default: ``logging.WARN`` + + dynamo (:class:`Optional[int]`): + The log level for the TorchDynamo component. Default: ``logging.WARN`` + + aot (:class:`Optional[int]`): + The log level for the AOTAutograd component. Default: ``logging.WARN`` + + autograd (:class:`Optional[int]`): + The log level for autograd. Default: ``logging.WARN`` + + inductor (:class:`Optional[int]`): + The log level for the TorchInductor component. Default: ``logging.WARN`` + + dynamic (:class:`Optional[int]`): + The log level for dynamic shapes. Default: ``logging.WARN`` + + distributed (:class:`Optional[int]`): + Whether to log c10d communication operations and other debug info from PyTorch Distributed components. + Default: ``logging.WARN`` + + dist_c10d (:class:`Optional[int]`): + Whether to log c10d communication operations related debug info in PyTorch Distributed components. + Default: ``logging.WARN`` + + dist_ddp (:class:`Optional[int]`): + Whether to log debug info related to ``DistributedDataParallel``(DDP) from PyTorch Distributed components. + Default: ``logging.WARN`` + + dist_fsdp (:class:`Optional[int]`): + Whether to log debug info related to ``FullyShardedDataParallel``(FSDP) in PyTorch Distributed components. + Default: ``logging.WARN`` + + onnx (:class:`Optional[int]`): + The log level for the ONNX exporter component. Default: ``logging.WARN`` + + bytecode (:class:`bool`): + Whether to emit the original and generated bytecode from TorchDynamo. + Default: ``False`` + + aot_graphs (:class:`bool`): + Whether to emit the graphs generated by AOTAutograd. Default: ``False`` + + aot_joint_graph (:class:`bool`): + Whether to emit the joint forward-backward graph generated by AOTAutograd. Default: ``False`` + + inductor (:class:`Optional[int]`): + Whether to log information from inductor cudagraphs. Default: ``logging.WARN`` + + ddp_graphs (:class:`bool`): + Whether to emit graphs generated by DDPOptimizer. Default: ``False`` + + graph (:class:`bool`): + Whether to emit the graph captured by TorchDynamo in tabular format. + Default: ``False`` + + graph_code (:class:`bool`): + Whether to emit the python source of the graph captured by TorchDynamo. + Default: ``False`` + + graph_breaks (:class:`bool`): + Whether to emit the graph breaks encountered by TorchDynamo. + Default: ``False`` + + graph_sizes (:class:`bool`): + Whether to emit tensor sizes of the graph captured by TorchDynamo. 
+ Default: ``False`` + + guards (:class:`bool`): + Whether to emit the guards generated by TorchDynamo for each compiled + function. Default: ``False`` + + recompiles (:class:`bool`): + Whether to emit a guard failure reason and message every time + TorchDynamo recompiles a function. Default: ``False`` + + recompiles_verbose (:class:`bool`): + Whether to emit all guard failure reasons when TorchDynamo recompiles + a function, even those that are not actually run. Default: ``False`` + + trace_source (:class:`bool`): + Whether to emit when TorchDynamo begins tracing a new line. Default: ``False`` + + trace_call (:class:`bool`): + Whether to emit detailed line location when TorchDynamo creates an FX node + corresponding to function call. Python 3.11+ only. Default: ``False`` + + output_code (:class:`bool`): + Whether to emit the TorchInductor output code. Default: ``False`` + + schedule (:class:`bool`): + Whether to emit the TorchInductor schedule. Default: ``False`` + + perf_hints (:class:`bool`): + Whether to emit the TorchInductor perf hints. Default: ``False`` + + post_grad_graphs (:class:`bool`): + Whether to emit the graphs generated by after post grad passes. Default: ``False`` + + onnx_diagnostics (:class:`bool`): + Whether to emit the ONNX exporter diagnostics in logging. Default: ``False`` + + fusion (:class:`bool`): + Whether to emit detailed Inductor fusion decisions. Default: ``False`` + + overlap (:class:`bool`): + Whether to emit detailed Inductor compute/comm overlap decisions. Default: ``False`` + + sym_node (:class:`bool`): + Whether to emit debug info for various SymNode opterations. Default: ``False`` + + export (:class:`Optional[int]`): + The log level for export. Default: ``logging.WARN`` + + modules (dict): + This argument provides an alternate way to specify the above log + component and artifact settings, in the format of a keyword args + dictionary given as a single argument. There are two cases + where this is useful (1) if a new log component or artifact has + been registered but a keyword argument for it has not been added + to this function and (2) if the log level for an unregistered module + needs to be set. This can be done by providing the fully-qualified module + name as the key, with the log level as the value. Default: ``None`` + + + Example:: + + >>> # xdoctest: +SKIP + >>> import logging + + # The following changes the "dynamo" component to emit DEBUG-level + # logs, and to emit "graph_code" artifacts. 
+ + >>> torch._logging.set_logs(dynamo=logging.DEBUG, graph_code=True) + + # The following enables the logs for a different module + + >>> torch._logging.set_logs(modules={"unregistered.module.name": logging.DEBUG}) + """ + # ignore if env var is set + if LOG_ENV_VAR in os.environ: + log.warning( + "Using TORCH_LOGS environment variable for log settings, ignoring call to set_logs" + ) + return + + log_state.clear() + + modules = modules or {} + + def _set_logs(**kwargs): + for alias, val in itertools.chain(kwargs.items(), modules.items()): # type: ignore[union-attr] + if val is None: + continue + + if log_registry.is_artifact(alias): + if not isinstance(val, bool): + raise ValueError( + f"Expected bool to enable artifact {alias}, received {val}" + ) + + if val: + log_state.enable_artifact(alias) + elif log_registry.is_log(alias) or alias in log_registry.child_log_qnames: + if val not in logging._levelToName: + raise ValueError( + f"Unrecognized log level for log {alias}: {val}, valid level values " + f"are: {','.join([str(k) for k in logging._levelToName.keys()])}" + ) + + log_state.enable_log( + log_registry.log_alias_to_log_qnames.get(alias, alias), val + ) + else: + raise ValueError( + f"Unrecognized log or artifact name passed to set_logs: {alias}" + ) + + _init_logs() + + _set_logs( + torch=all, + dynamo=dynamo, + aot=aot, + autograd=autograd, + inductor=inductor, + dynamic=dynamic, + bytecode=bytecode, + aot_graphs=aot_graphs, + aot_joint_graph=aot_joint_graph, + ddp_graphs=ddp_graphs, + distributed=distributed, + dist_c10d=dist_c10d, + dist_ddp=dist_ddp, + dist_fsdp=dist_fsdp, + graph=graph, + graph_code=graph_code, + graph_breaks=graph_breaks, + graph_sizes=graph_sizes, + guards=guards, + recompiles=recompiles, + recompiles_verbose=recompiles_verbose, + trace_source=trace_source, + trace_call=trace_call, + output_code=output_code, + schedule=schedule, + perf_hints=perf_hints, + post_grad_graphs=post_grad_graphs, + onnx=onnx, + onnx_diagnostics=onnx_diagnostics, + fusion=fusion, + overlap=overlap, + sym_node=sym_node, + export=export, + cudagraphs=cudagraphs, + ) + + +def get_loggers(): + """ + Returns: a list of all registered loggers + """ + return [logging.getLogger(qname) for qname in log_registry.get_log_qnames()] + + +def register_log(setting_name, log_name): + """ + Enables a log to be controlled by the env var and user API with the setting_name + Args: + setting_name: the shorthand name used in the env var and user API + log_name: the log name that the setting_name is associated with + """ + log_registry.register_log(setting_name, log_name) + + +def register_artifact( + setting_name, description, visible=False, off_by_default=False, log_format=None +): + """ + Enables an artifact to be controlled by the env var and user API with name + Args: + setting_name: the shorthand name used in the env var and user API + description: A description of what this outputs + visible: Whether it gets suggested to users by default + off_by_default: whether this artifact should be logged when the ancestor loggers + are enabled at level DEBUG + """ + log_registry.register_artifact_name( + setting_name, description, visible, off_by_default, log_format + ) + + +def getArtifactLogger(module_qname, artifact_name): + if artifact_name not in log_registry.artifact_names: + raise ValueError( + f"Artifact name: {repr(artifact_name)} not registered," + f"please call register_artifact({repr(artifact_name)}) in torch._logging.registrations." 
+ ) + qname = module_qname + f".__{artifact_name}" + log = logging.getLogger(qname) + log.artifact_name = artifact_name # type: ignore[attr-defined] + log_registry.register_artifact_log(qname) + configure_artifact_log(log) + return log + + +INCR_VERBOSITY_CHAR = "+" +DECR_VERBOSITY_CHAR = "-" +VERBOSITY_REGEX = ( + "(" + + "|".join([re.escape(INCR_VERBOSITY_CHAR), re.escape(DECR_VERBOSITY_CHAR)]) + + "?)" +) + + +def configure_artifact_log(log): + # If the artifact is off by default, then it should only be logged when explicitly + # enabled; set propagate to False so that this artifact is not propagated + # to its ancestor logger + if log_registry.is_off_by_default(log.artifact_name): + log.propagate = False + + # enable artifact logging when explicitly enabled + if log_state.is_artifact_enabled(log.artifact_name): + log.setLevel(logging.DEBUG) + log.propagate = True + + +# match a comma separated list of loggable names (whitespace allowed after commas) +def _gen_settings_regex(): + return re.compile(r"((\+|-)?[\w\.]+,\s*)*(\+|-)?[\w\.]+?") + + +def _validate_settings(settings): + return re.fullmatch(_gen_settings_regex(), settings) is not None + + +def help_message(verbose=False): + def pad_to(s, length=30): + assert len(s) <= length + return s + " " * (length - len(s)) + + if verbose: + printed_artifacts = log_registry.artifact_names + else: + printed_artifacts = log_registry.visible_artifacts + + if verbose: + heading = "All registered names" + else: + heading = "Visible registered names (use TORCH_LOGS='+help' for full list)" + lines = ( + ["all"] + + sorted(log_registry.log_alias_to_log_qnames.keys()) + + sorted( + [ + f"{pad_to(name)}\t{log_registry.artifact_descriptions[name]}" + for name in printed_artifacts + ] + ) + ) + setting_info = " " + "\n ".join(lines) + examples = """ +Examples: + TORCH_LOGS="+dynamo,aot" will set the log level of TorchDynamo to + logging.DEBUG and AOT to logging.INFO + + TORCH_LOGS="-dynamo,+inductor" will set the log level of TorchDynamo to + logging.ERROR and TorchInductor to logging.DEBUG + + TORCH_LOGS="aot_graphs" will enable the aot_graphs artifact + + TORCH_LOGS="+dynamo,schedule" will enable set the log level of TorchDynamo + to logging.DEBUG and enable the schedule artifact + + TORCH_LOGS="+some.random.module,schedule" will set the log level of + some.random.module to logging.DEBUG and enable the schedule artifact + + TORCH_LOGS_FORMAT="%(levelname)s: %(message)s" or any provided format + string will set the output format + Valid keys are "levelname", "message", "pathname", "levelno", "lineno", + "filename" and "name". + + TORCH_LOGS_OUT=/tmp/output.txt will output the logs to /tmp/output.txt as + well. This is useful when the output is long. +""" # flake8: noqa: B950 + msg = f""" +TORCH_LOGS Info +{examples} + +{heading} +{setting_info} +""" + return msg + + +def _invalid_settings_err_msg(settings, verbose=False): + valid_settings = ", ".join( + ["all"] + + list(log_registry.log_alias_to_log_qnames.keys()) + + list(log_registry.artifact_names) + ) + msg = f""" +Invalid log settings: {settings}, must be a comma separated list of fully +qualified module names, registered log names or registered artifact names. 
+For more info on various settings, try TORCH_LOGS="help" +Valid settings: +{valid_settings} +""" + return msg + + +@functools.lru_cache +def _parse_log_settings(settings): + if settings == "": + return dict() + + if settings == "help": + raise ValueError(help_message(verbose=False)) + elif settings == "+help": + raise ValueError(help_message(verbose=True)) + if not _validate_settings(settings): + raise ValueError(_invalid_settings_err_msg(settings)) + + settings = re.sub(r"\s+", "", settings) + log_names = settings.split(",") + + def get_name_level_pair(name): + clean_name = name.replace(INCR_VERBOSITY_CHAR, "") + clean_name = clean_name.replace(DECR_VERBOSITY_CHAR, "") + + if name[0] == INCR_VERBOSITY_CHAR: + level = logging.DEBUG + elif name[0] == DECR_VERBOSITY_CHAR: + level = logging.ERROR + else: + level = logging.INFO + + return clean_name, level + + log_state = LogState() + + for name in log_names: + name, level = get_name_level_pair(name) + + if name == "all": + name = "torch" + + if log_registry.is_log(name): + assert level is not None + log_qnames = log_registry.log_alias_to_log_qnames[name] + log_state.enable_log(log_qnames, level) + elif log_registry.is_artifact(name): + log_state.enable_artifact(name) + elif _is_valid_module(name): + if not _has_registered_parent(name): + log_registry.register_log(name, name) + else: + log_registry.register_child_log(name) + log_state.enable_log(name, level) + else: + raise ValueError(_invalid_settings_err_msg(settings)) + + return log_state + + +def _is_valid_module(qname): + try: + __import__(qname) + return True + except ImportError: + return False + + +def _update_log_state_from_env(): + global log_state + log_setting = os.environ.get(LOG_ENV_VAR, None) + if log_setting is not None: + log_state = _parse_log_settings(log_setting) + + +def _has_registered_parent(log_qname): + cur_log = logging.getLogger(log_qname) + + registered_log_qnames = log_registry.get_log_qnames() + + while cur_log.parent: + if cur_log.name in registered_log_qnames: + return True + cur_log = cur_log.parent + + return False + + +# apply custom formats to artifacts when necessary +class TorchLogsFormatter(logging.Formatter): + def __init__(self, *, trace: bool = False): + super().__init__() + self._is_trace = trace + + def format(self, record): + artifact_name = getattr(logging.getLogger(record.name), "artifact_name", None) + if artifact_name is not None: + artifact_formatter = log_registry.artifact_log_formatters.get( + artifact_name, None + ) + if artifact_formatter is not None: + return artifact_formatter.format(record) + + record.message = record.getMessage() + record.asctime = self.formatTime(record, "%m%d %H:%M:%S") + + # exception handling - copied from logging.Formatter.format + s = record.message + if record.exc_info: + # Cache the traceback text to avoid converting it multiple times + # (it's constant anyway) + if not record.exc_text: + record.exc_text = self.formatException(record.exc_info) + if record.exc_text: + if s[-1:] != "\n": + s = s + "\n" + s = s + record.exc_text + if record.stack_info: + if s[-1:] != "\n": + s = s + "\n" + s = s + self.formatStack(record.stack_info) + + record.rankprefix = "" + if not self._is_trace and dist.is_available() and dist.is_initialized(): + record.rankprefix = f"[rank{dist.get_rank()}]:" + + record.traceid = "" + if ( + not self._is_trace + and (trace_id := torch._guards.CompileContext.current_trace_id()) + is not None + ): + record.traceid = f" [{trace_id}]" + + glog_level_to_abbr = { + "DEBUG": "V", # V is for VERBOSE 
in glog + "INFO": "I", + "WARNING": "W", + "ERROR": "E", + "CRITICAL": "C", + } + + shortlevel = glog_level_to_abbr.get(record.levelname, record.levelname) + + record.artifactprefix = "" + if artifact_name is not None: + record.artifactprefix = f" [__{artifact_name}]" + + prefix = ( + f"{record.rankprefix}{shortlevel}{record.asctime}.{int(record.msecs*1000):06d} {record.thread} " + f"{os.path.relpath(record.pathname, os.path.dirname(os.path.dirname(torch.__file__)))}:" + f"{record.lineno}]{record.traceid}{record.artifactprefix}" + ) + if self._is_trace: + assert s == "" + r = f"{prefix} {json.dumps(record.metadata)}" + if record.payload is not None: + r += "".join(f"\n\t{l}" for l in record.payload.split("\n")) + return r + else: + lines = s.split("\n") + return "\n".join(f"{prefix} {l}" for l in lines) + + +def _default_formatter(): + fmt = os.environ.get(LOG_FORMAT_ENV_VAR, None) + if fmt is None: + return TorchLogsFormatter() + else: + if fmt in ("short", "basic"): + fmt = logging.BASIC_FORMAT + return logging.Formatter(fmt) + + +DEFAULT_FORMATTER = _default_formatter() + + +def _setup_handlers(create_handler_fn, log): + debug_handler = _track_handler(create_handler_fn()) + debug_handler.setFormatter(DEFAULT_FORMATTER) + debug_handler.setLevel(logging.DEBUG) + log.addHandler(debug_handler) + + +handlers = WeakSet() # type: ignore[var-annotated] + + +# mark handlers that we've created +# so we don't modify user handlers +def _track_handler(handler): + handlers.add(handler) + return handler + + +def _is_torch_handler(handler): + return handler in handlers + + +# clears all torch handlers on specified loggers +def _clear_handlers(log): + to_remove = [handler for handler in log.handlers if _is_torch_handler(handler)] + for handler in to_remove: + log.removeHandler(handler) + + +def _reset_logs(): + # reset all registered logs + for log_qname in log_registry.get_log_qnames(): + log = logging.getLogger(log_qname) + log.setLevel(logging.WARNING) + log.propagate = False + _clear_handlers(log) + + # reset all artifact and child logs + for artifact_log_qname in itertools.chain( + log_registry.get_artifact_log_qnames(), log_registry.get_child_log_qnames() + ): + log = logging.getLogger(artifact_log_qname) + log.setLevel(logging.NOTSET) + log.propagate = True + + trace_log.propagate = False + _clear_handlers(trace_log) + + +def _get_log_state(): + return log_state + + +def _set_log_state(state): + global log_state + log_state = state + + +def _init_logs(log_file_name=None): + _reset_logs() + _update_log_state_from_env() + + out = os.environ.get(LOG_OUT_ENV_VAR, None) + if out is not None: + log_file_name = out + + # First, reset all known (registered) loggers to NOTSET, so that they + # respect their parent log level + for log_qname in log_registry.get_log_qnames(): + # But not the top level torch level: this defaults to WARNING so + # that our log messages don't leak to the lower levels + if log_qname == "torch": + continue + log = logging.getLogger(log_qname) + log.setLevel(logging.NOTSET) + + # Now, for all loggers which the user requested to have non-standard + # logging behavior, modify their log levels + for log_qname, level in log_state.get_log_level_pairs(): + log = logging.getLogger(log_qname) + log.setLevel(level) + + # Finally, setup handlers for all registered loggers + for log_qname in log_registry.get_log_qnames(): + log = logging.getLogger(log_qname) + _setup_handlers( + logging.StreamHandler, + log, + ) + + if log_file_name is not None: + _setup_handlers( + lambda: 
logging.FileHandler(log_file_name), + log, + ) + + # configure artifact loggers, note: this must happen last + # since the levels of ancestor loggers are taken into account + for artifact_log_qname in log_registry.get_artifact_log_qnames(): + log = logging.getLogger(artifact_log_qname) + configure_artifact_log(log) + + # Setup handler for the special trace_log, with different default + # configuration + trace_dir_name = os.environ.get(TRACE_ENV_VAR, None) + # This handler may remove itself if trace_dir_name is None and we are not + # actually in an FB environment. This allows us to defer actually + # initializing it until we actually need to log anything. This is + # important because JK initializes a C++ singleton, which will pork our + # process if we subsequently fork. + handler = LazyTraceHandler(trace_dir_name) + # This log is ALWAYS at debug level. We will additionally test if there + # are any handlers before deciding to actually call logging on this. Do + # not manually call + trace_log.setLevel(logging.DEBUG) + trace_log_handler = _track_handler(handler) + trace_log_handler.setFormatter(TorchLogsFormatter(trace=True)) + trace_log.addHandler(trace_log_handler) + + +class LazyTraceHandler(logging.StreamHandler): + """Like FileHandler, but the file is allocated lazily only upon the first log message""" + + def __init__(self, root_dir: Optional[str]): + # This is implemented in the same way that delay is implemented on + # FileHandler + self.root_dir = root_dir + logging.Handler.__init__(self) + self.stream = None + self._builtin_open = open + + # cloned from FileHandler in cpython + def close(self): + self.acquire() + try: + try: + if self.stream: + try: + self.flush() + finally: + stream = self.stream + self.stream = None + if hasattr(stream, "close"): + stream.close() + finally: + # Issue #19523: call unconditionally to + # prevent a handler leak when delay is set + # Also see Issue #42378: we also rely on + # self._closed being set to True there + logging.StreamHandler.close(self) + finally: + self.release() + + def emit(self, record): + if self.stream is None: + ok = False + if self.root_dir is None: + TRACE_LOG_DIR = "/logs" + open_func = self._builtin_open + + import torch.version as torch_version + + if hasattr(torch_version, "git_version"): + log.info("LazyTraceHandler: disabled because not fbcode") + elif not torch._utils_internal.justknobs_check("pytorch/trace:enable"): + log.info( + "LazyTraceHandler: disabled because justknobs_check('pytorch/trace:enable') returned False" + ) + elif not os.path.exists(TRACE_LOG_DIR): + log.info( + "LazyTraceHandler: disabled because %s does not exist", + TRACE_LOG_DIR, + ) + elif not os.access(TRACE_LOG_DIR, os.W_OK): + log.info( + "LazyTraceHandler: disabled because %s is not writeable", + TRACE_LOG_DIR, + ) + else: + self.root_dir = TRACE_LOG_DIR + + if self.root_dir is not None: + os.makedirs(self.root_dir, exist_ok=True) + ranksuffix = "" + if dist.is_available() and dist.is_initialized(): + ranksuffix = f"rank_{dist.get_rank()}_" + self.stream = tempfile.NamedTemporaryFile( + mode="w+", + suffix=".log", + prefix=f"dedicated_log_torch_trace_{ranksuffix}", + dir=self.root_dir, + delete=False, + ) + log.info("LazyTraceHandler: logging to %s", self.stream.name) + else: + # We go poof, remove and no-op + trace_log.removeHandler(self) + return + if self.stream: + super().emit(record) + + +@functools.lru_cache(None) +def warning_once(logger_obj, *args, **kwargs): + """ + This function is similar to `logger.warning()`, but will emit the 
warning with the same message only once + Note: The cache is for the function arguments, so 2 different callers using the same arguments will hit the cache. + The assumption here is that all warning messages are unique across the code. If they aren't then need to switch to + another type of cache that includes the caller frame information in the hashing function. + """ + logger_obj.warning(*args, **kwargs) + + +class LazyString: + def __init__(self, func, *args, **kwargs): + self.func = func + self.args = args + self.kwargs = kwargs + + def __str__(self): + return self.func(*self.args, **self.kwargs) + + +def trace_structured( + name: str, + # NB: metadata expected to be dict so adding more info is forward compatible + # Tuple[str, int] is a special case for string interning + metadata_fn: Callable[[], Union[Dict[str, Any], Tuple[str, int]]] = dict, + *, + payload_fn: Callable[[], Optional[Union[str, object]]] = lambda: None, + suppress_context: bool = False, +): + """ + metadata is an arbitrary JSON compatible struct, but it's expected to not be + too long (e.g., less than 1MB) + + payload is an arbitrary string, which can be arbitrarily long (but expected to have + newlines so no lines are too long) + """ + assert "name" not in ["rank", "frame_id", "frame_compile_id", "attempt"] + assert callable( + metadata_fn + ), f"metadata_fn should be callable, but got {type(metadata_fn)}" + assert callable( + payload_fn + ), f"payload_fn should be callable, but got {type(payload_fn)}" + # trace_log never propagates and is ALWAYS DEBUG, so also check that there + # are handlers instead of checking the log level + if trace_log.handlers: + record: Dict[str, object] = {} + record[name] = metadata_fn() + if not suppress_context: + # TODO: Actually, the rank probably should just be emitted once at + # the top, and not repeatedly spammed in all the logs, since it + # never changes and we assume no interleaving + if dist.is_available() and dist.is_initialized(): + record["rank"] = dist.get_rank() + if ( + trace_id := torch._guards.CompileContext.current_trace_id() + ) is not None: + record["frame_id"] = trace_id.compile_id.frame_id + record["frame_compile_id"] = trace_id.compile_id.frame_compile_id + record["attempt"] = trace_id.attempt + payload = payload_fn() + if payload is not None: + if not isinstance(payload, str): + if isinstance(payload, list): + # special case to look better + payload = "[\n" + ",\n".join(json.dumps(i) for i in payload) + "\n]" + else: + # force newlines so we are unlikely to overflow line limit + payload = json.dumps(payload, indent=0) + h = hashlib.md5() + h.update(payload.encode("utf-8")) + record["has_payload"] = h.hexdigest() + trace_log.debug( + "", extra={"metadata": record, "payload": payload}, stacklevel=2 + ) + + +import torch._guards +import torch._utils_internal +import torch.distributed as dist diff --git a/MLPY/Lib/site-packages/torch/_logging/_registrations.py b/MLPY/Lib/site-packages/torch/_logging/_registrations.py new file mode 100644 index 0000000000000000000000000000000000000000..ad33a92eca3deaec4dbade3723e5f24ab805d048 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_logging/_registrations.py @@ -0,0 +1,134 @@ +# flake8: noqa: B950 +from ._internal import register_artifact, register_log + +DYNAMIC = ["torch.fx.experimental.symbolic_shapes", "torch.fx.experimental.sym_node"] +DISTRIBUTED = [ + "torch.distributed", + "torch._dynamo.backends.distributed", + "torch.nn.parallel.distributed", +] + +register_log("dynamo", ["torch._dynamo", *DYNAMIC]) 
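+# Example of how these registrations are consumed (a minimal, hypothetical
+# usage sketch; only the alias and artifact names registered in this file are real):
+#
+#     TORCH_LOGS="+dynamo,aot"   # env var form: "+" selects DEBUG, no prefix selects INFO
+#
+#     import logging, torch
+#     torch._logging.set_logs(dynamo=logging.DEBUG, graph_code=True)
+#
+# Each register_log() call maps a short alias to one or more logger qualified
+# names; register_artifact() declares a named artifact that can be toggled
+# independently of its parent component's log level.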
+register_log("aot", ["torch._functorch.aot_autograd", "torch._functorch._aot_autograd"]) +register_log("autograd", "torch.autograd") +register_log("inductor", ["torch._inductor", "torch._inductor.cudagraph_trees"]) + +register_artifact( + "cudagraphs", + "Logs information from wrapping inductor generated code with cudagraphs.", +) + +register_log("dynamic", DYNAMIC) +register_log("torch", "torch") +register_log("distributed", DISTRIBUTED) +register_log( + "dist_c10d", ["torch.distributed.distributed_c10d", "torch.distributed.rendezvous"] +) +register_log( + "dist_ddp", ["torch.nn.parallel.distributed", "torch._dynamo.backends.distributed"] +) +register_log("dist_fsdp", ["torch.distributed.fsdp"]) +register_log("onnx", "torch.onnx") +register_log("export", ["torch._dynamo", "torch.export", *DYNAMIC]) + +register_artifact( + "guards", + "This prints the guards for every compiled Dynamo frame. It does not tell you where the guards come from.", + visible=True, +) +register_artifact("verbose_guards", "", off_by_default=True) +register_artifact( + "bytecode", + "Prints the original and modified bytecode from Dynamo. Mostly useful if you're debugging our bytecode generation in Dynamo.", + off_by_default=True, +) +register_artifact( + "graph", + "Prints the dynamo traced graph (prior to AOTDispatch) in a table. If you prefer python code use `graph_code` instead. ", +) +register_artifact("graph_code", "Like `graph`, but gives you the Python code instead.") +register_artifact( + "graph_sizes", "Prints the sizes of all FX nodes in the dynamo graph." +) +register_artifact( + "trace_source", + "As we execute bytecode, prints the file name / line number we are processing and the actual source code. Useful with `bytecode`", +) +register_artifact( + "trace_call", + "Like trace_source, but it will give you the per-expression blow-by-blow if your Python is recent enough.", +) +register_artifact( + "aot_graphs", + "Prints the FX forward and backward graph generated by AOTDispatch, after partitioning. Useful to understand what's being given to Inductor", + visible=True, +) +register_artifact( + "aot_joint_graph", + "Print FX joint graph from AOTAutograd, prior to partitioning. Useful for debugging partitioning", +) +register_artifact( + "post_grad_graphs", + "Prints the FX graph generated by post grad passes. Useful to understand what's being given to Inductor after post grad passes", +) +register_artifact( + "compiled_autograd", + "Prints various logs in compiled_autograd, including but not limited to the graphs. Useful for debugging compiled_autograd.", + visible=True, +) +register_artifact( + "ddp_graphs", + "Only relevant for compiling DDP. DDP splits into multiple graphs to trigger comms early. This will print each individual graph here.", +) +register_artifact( + "recompiles", + "Prints the reason why we recompiled a graph. Very, very useful.", + visible=True, +) +register_artifact( + "recompiles_verbose", + "Prints all guard checks that fail during a recompilation. " + "At runtime, Dynamo will stop at the first failed check for each failing guard. " + "So not all logged failing checks are actually ran by Dynamo.", + visible=True, + off_by_default=True, +) +register_artifact( + "graph_breaks", + "Prints whenever Dynamo decides that it needs to graph break (i.e. create a new graph). 
Useful for debugging why torch.compile has poor performance", + visible=True, +) +register_artifact( + "not_implemented", + "Prints log messages whenever we return NotImplemented in a multi-dispatch, letting you trace through each object we attempted to dispatch to", +) +register_artifact( + "output_code", + "Prints the code that Inductor generates (either Triton or C++)", + off_by_default=True, + visible=True, +) +register_artifact( + "schedule", + "Inductor scheduler information. Useful if working on Inductor fusion algo", + off_by_default=True, +) +register_artifact("perf_hints", "", off_by_default=True) +register_artifact("onnx_diagnostics", "", off_by_default=True) +register_artifact( + "fusion", + "Detailed Inductor fusion decisions. More detailed than 'schedule'", + off_by_default=True, +) +register_artifact( + "overlap", + "Detailed Inductor compute/comm overlap decisions", + off_by_default=True, +) +register_artifact( + "sym_node", + "Logs extra info for various SymNode operations", + off_by_default=True, +) + +register_artifact("custom_format_test_artifact", "Testing only", log_format="") diff --git a/MLPY/Lib/site-packages/torch/_logging/structured.py b/MLPY/Lib/site-packages/torch/_logging/structured.py new file mode 100644 index 0000000000000000000000000000000000000000..26f9600a93a32482317124155a091cb98f495ed6 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_logging/structured.py @@ -0,0 +1,37 @@ +""" +Utilities for converting data types into structured JSON for dumping. +""" + +import traceback +from typing import Dict, Sequence + +import torch._logging._internal + + +INTERN_TABLE: Dict[str, int] = {} + + +def intern_string(s: str) -> int: + r = INTERN_TABLE.get(s, None) + if r is None: + r = len(INTERN_TABLE) + INTERN_TABLE[s] = r + torch._logging._internal.trace_structured( + "str", lambda: (s, r), suppress_context=True + ) + return r + + +def from_traceback(tb: Sequence[traceback.FrameSummary]) -> object: + r = [] + for frame in tb: + # dict naming convention here coincides with + # python/combined_traceback.cpp + r.append( + { + "line": frame.lineno, + "name": frame.name, + "filename": intern_string(frame.filename), + } + ) + return r diff --git a/MLPY/Lib/site-packages/torch/_lowrank.py b/MLPY/Lib/site-packages/torch/_lowrank.py new file mode 100644 index 0000000000000000000000000000000000000000..6e458d4198dccfcf88b0da570f437cdee4e3aa47 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_lowrank.py @@ -0,0 +1,298 @@ +"""Implement various linear algebra algorithms for low rank matrices. +""" + +__all__ = ["svd_lowrank", "pca_lowrank"] + +from typing import Optional, Tuple + +import torch +from torch import Tensor +from . import _linalg_utils as _utils +from .overrides import handle_torch_function, has_torch_function + + +def get_approximate_basis( + A: Tensor, q: int, niter: Optional[int] = 2, M: Optional[Tensor] = None +) -> Tensor: + """Return tensor :math:`Q` with :math:`q` orthonormal columns such + that :math:`Q Q^H A` approximates :math:`A`. If :math:`M` is + specified, then :math:`Q` is such that :math:`Q Q^H (A - M)` + approximates :math:`A - M`. + + .. note:: The implementation is based on the Algorithm 4.4 from + Halko et al, 2009. + + .. note:: For an adequate approximation of a k-rank matrix + :math:`A`, where k is not known in advance but could be + estimated, the number of :math:`Q` columns, q, can be + choosen according to the following criteria: in general, + :math:`k <= q <= min(2*k, m, n)`. For large low-rank + matrices, take :math:`q = k + 5..10`. 
If k is + relatively small compared to :math:`min(m, n)`, choosing + :math:`q = k + 0..2` may be sufficient. + + .. note:: To obtain repeatable results, reset the seed for the + pseudorandom number generator + + Args:: + A (Tensor): the input tensor of size :math:`(*, m, n)` + + q (int): the dimension of subspace spanned by :math:`Q` + columns. + + niter (int, optional): the number of subspace iterations to + conduct; ``niter`` must be a + nonnegative integer. In most cases, the + default value 2 is more than enough. + + M (Tensor, optional): the input tensor's mean of size + :math:`(*, 1, n)`. + + References:: + - Nathan Halko, Per-Gunnar Martinsson, and Joel Tropp, Finding + structure with randomness: probabilistic algorithms for + constructing approximate matrix decompositions, + arXiv:0909.4061 [math.NA; math.PR], 2009 (available at + `arXiv `_). + """ + + niter = 2 if niter is None else niter + m, n = A.shape[-2:] + dtype = _utils.get_floating_dtype(A) + matmul = _utils.matmul + + R = torch.randn(n, q, dtype=dtype, device=A.device) + + # The following code could be made faster using torch.geqrf + torch.ormqr + # but geqrf is not differentiable + A_H = _utils.transjugate(A) + if M is None: + Q = torch.linalg.qr(matmul(A, R)).Q + for i in range(niter): + Q = torch.linalg.qr(matmul(A_H, Q)).Q + Q = torch.linalg.qr(matmul(A, Q)).Q + else: + M_H = _utils.transjugate(M) + Q = torch.linalg.qr(matmul(A, R) - matmul(M, R)).Q + for i in range(niter): + Q = torch.linalg.qr(matmul(A_H, Q) - matmul(M_H, Q)).Q + Q = torch.linalg.qr(matmul(A, Q) - matmul(M, Q)).Q + + return Q + + +def svd_lowrank( + A: Tensor, + q: Optional[int] = 6, + niter: Optional[int] = 2, + M: Optional[Tensor] = None, +) -> Tuple[Tensor, Tensor, Tensor]: + r"""Return the singular value decomposition ``(U, S, V)`` of a matrix, + batches of matrices, or a sparse matrix :math:`A` such that + :math:`A \approx U diag(S) V^T`. In case :math:`M` is given, then + SVD is computed for the matrix :math:`A - M`. + + .. note:: The implementation is based on the Algorithm 5.1 from + Halko et al, 2009. + + .. note:: To obtain repeatable results, reset the seed for the + pseudorandom number generator + + .. note:: The input is assumed to be a low-rank matrix. + + .. note:: In general, use the full-rank SVD implementation + :func:`torch.linalg.svd` for dense matrices due to its 10-fold + higher performance characteristics. The low-rank SVD + will be useful for huge sparse matrices that + :func:`torch.linalg.svd` cannot handle. + + Args:: + A (Tensor): the input tensor of size :math:`(*, m, n)` + + q (int, optional): a slightly overestimated rank of A. + + niter (int, optional): the number of subspace iterations to + conduct; niter must be a nonnegative + integer, and defaults to 2 + + M (Tensor, optional): the input tensor's mean of size + :math:`(*, 1, n)`. + + References:: + - Nathan Halko, Per-Gunnar Martinsson, and Joel Tropp, Finding + structure with randomness: probabilistic algorithms for + constructing approximate matrix decompositions, + arXiv:0909.4061 [math.NA; math.PR], 2009 (available at + `arXiv `_). 
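+
+    Example (a minimal usage sketch; the random matrix, the rank ``q=8`` and the
+    reconstruction line are illustrative and not part of the reference)::
+
+        >>> # xdoctest: +SKIP
+        >>> A = torch.randn(100, 40)                 # stand-in for a (nearly) low-rank input
+        >>> U, S, V = torch.svd_lowrank(A, q=8, niter=2)
+        >>> A_approx = U @ torch.diag(S) @ V.mT      # rank-q approximation, A ~ U diag(S) V^T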
+ + """ + if not torch.jit.is_scripting(): + tensor_ops = (A, M) + if not set(map(type, tensor_ops)).issubset( + (torch.Tensor, type(None)) + ) and has_torch_function(tensor_ops): + return handle_torch_function( + svd_lowrank, tensor_ops, A, q=q, niter=niter, M=M + ) + return _svd_lowrank(A, q=q, niter=niter, M=M) + + +def _svd_lowrank( + A: Tensor, + q: Optional[int] = 6, + niter: Optional[int] = 2, + M: Optional[Tensor] = None, +) -> Tuple[Tensor, Tensor, Tensor]: + q = 6 if q is None else q + m, n = A.shape[-2:] + matmul = _utils.matmul + if M is None: + M_t = None + else: + M_t = _utils.transpose(M) + A_t = _utils.transpose(A) + + # Algorithm 5.1 in Halko et al 2009, slightly modified to reduce + # the number conjugate and transpose operations + if m < n or n > q: + # computing the SVD approximation of a transpose in + # order to keep B shape minimal (the m < n case) or the V + # shape small (the n > q case) + Q = get_approximate_basis(A_t, q, niter=niter, M=M_t) + Q_c = _utils.conjugate(Q) + if M is None: + B_t = matmul(A, Q_c) + else: + B_t = matmul(A, Q_c) - matmul(M, Q_c) + assert B_t.shape[-2] == m, (B_t.shape, m) + assert B_t.shape[-1] == q, (B_t.shape, q) + assert B_t.shape[-1] <= B_t.shape[-2], B_t.shape + U, S, Vh = torch.linalg.svd(B_t, full_matrices=False) + V = Vh.mH + V = Q.matmul(V) + else: + Q = get_approximate_basis(A, q, niter=niter, M=M) + Q_c = _utils.conjugate(Q) + if M is None: + B = matmul(A_t, Q_c) + else: + B = matmul(A_t, Q_c) - matmul(M_t, Q_c) + B_t = _utils.transpose(B) + assert B_t.shape[-2] == q, (B_t.shape, q) + assert B_t.shape[-1] == n, (B_t.shape, n) + assert B_t.shape[-1] <= B_t.shape[-2], B_t.shape + U, S, Vh = torch.linalg.svd(B_t, full_matrices=False) + V = Vh.mH + U = Q.matmul(U) + + return U, S, V + + +def pca_lowrank( + A: Tensor, q: Optional[int] = None, center: bool = True, niter: int = 2 +) -> Tuple[Tensor, Tensor, Tensor]: + r"""Performs linear Principal Component Analysis (PCA) on a low-rank + matrix, batches of such matrices, or sparse matrix. + + This function returns a namedtuple ``(U, S, V)`` which is the + nearly optimal approximation of a singular value decomposition of + a centered matrix :math:`A` such that :math:`A = U diag(S) V^T`. + + .. note:: The relation of ``(U, S, V)`` to PCA is as follows: + + - :math:`A` is a data matrix with ``m`` samples and + ``n`` features + + - the :math:`V` columns represent the principal directions + + - :math:`S ** 2 / (m - 1)` contains the eigenvalues of + :math:`A^T A / (m - 1)` which is the covariance of + ``A`` when ``center=True`` is provided. + + - ``matmul(A, V[:, :k])`` projects data to the first k + principal components + + .. note:: Different from the standard SVD, the size of returned + matrices depend on the specified rank and q + values as follows: + + - :math:`U` is m x q matrix + + - :math:`S` is q-vector + + - :math:`V` is n x q matrix + + .. note:: To obtain repeatable results, reset the seed for the + pseudorandom number generator + + Args: + + A (Tensor): the input tensor of size :math:`(*, m, n)` + + q (int, optional): a slightly overestimated rank of + :math:`A`. By default, ``q = min(6, m, + n)``. + + center (bool, optional): if True, center the input tensor, + otherwise, assume that the input is + centered. + + niter (int, optional): the number of subspace iterations to + conduct; niter must be a nonnegative + integer, and defaults to 2. 
+ + References:: + + - Nathan Halko, Per-Gunnar Martinsson, and Joel Tropp, Finding + structure with randomness: probabilistic algorithms for + constructing approximate matrix decompositions, + arXiv:0909.4061 [math.NA; math.PR], 2009 (available at + `arXiv `_). + + """ + + if not torch.jit.is_scripting(): + if type(A) is not torch.Tensor and has_torch_function((A,)): + return handle_torch_function( + pca_lowrank, (A,), A, q=q, center=center, niter=niter + ) + + (m, n) = A.shape[-2:] + + if q is None: + q = min(6, m, n) + elif not (q >= 0 and q <= min(m, n)): + raise ValueError( + f"q(={q}) must be non-negative integer and not greater than min(m, n)={min(m, n)}" + ) + if not (niter >= 0): + raise ValueError(f"niter(={niter}) must be non-negative integer") + + dtype = _utils.get_floating_dtype(A) + + if not center: + return _svd_lowrank(A, q, niter=niter, M=None) + + if _utils.is_sparse(A): + if len(A.shape) != 2: + raise ValueError("pca_lowrank input is expected to be 2-dimensional tensor") + c = torch.sparse.sum(A, dim=(-2,)) / m + # reshape c + column_indices = c.indices()[0] + indices = torch.zeros( + 2, + len(column_indices), + dtype=column_indices.dtype, + device=column_indices.device, + ) + indices[0] = column_indices + C_t = torch.sparse_coo_tensor( + indices, c.values(), (n, 1), dtype=dtype, device=A.device + ) + + ones_m1_t = torch.ones(A.shape[:-2] + (1, m), dtype=dtype, device=A.device) + M = _utils.transpose(torch.sparse.mm(C_t, ones_m1_t)) + return _svd_lowrank(A, q, niter=niter, M=M) + else: + C = A.mean(dim=(-2,), keepdim=True) + return _svd_lowrank(A - C, q, niter=niter, M=None) diff --git a/MLPY/Lib/site-packages/torch/_meta_registrations.py b/MLPY/Lib/site-packages/torch/_meta_registrations.py new file mode 100644 index 0000000000000000000000000000000000000000..f7776aae7803b1da33cf59f8f7a5609c3ac2a9a0 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_meta_registrations.py @@ -0,0 +1,6253 @@ +import math +from enum import Enum +from functools import partial +from typing import List, Optional, Sequence, Tuple, Union + +import torch +import torch._prims_common as utils +from torch import SymBool, SymFloat, Tensor +from torch._decomp import ( + _add_op_to_registry, + _convert_out_params, + global_decomposition_table, + meta_table, +) +from torch._ops import OpOverload +from torch._prims import _prim_elementwise_meta, ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND +from torch._prims_common import ( + corresponding_complex_dtype, + corresponding_real_dtype, + elementwise_dtypes, + ELEMENTWISE_TYPE_PROMOTION_KIND, + IntLike, + make_contiguous_strides_for, + TensorLike, +) + +from torch._prims_common.wrappers import ( + _maybe_convert_to_dtype, + _maybe_resize_out, + _resize_output_check, + _safe_copy_out, + out_wrapper, +) +from torch._refs import _broadcast_shapes, _maybe_broadcast +from torch.utils import _pytree as pytree + + +aten = torch.ops.aten + +_meta_lib_dont_use_me_use_register_meta = torch.library.Library("aten", "IMPL", "Meta") + + +def register_meta(op): + def wrapper(fn): + fn = _convert_out_params(fn) + + def register(op): + _add_op_to_registry(meta_table, op, fn) + + pytree.tree_map_(register, op) + return fn + + return wrapper + + +def elementwise_meta( + *args, + type_promotion: ELEMENTWISE_TYPE_PROMOTION_KIND, +): + # Perform type promotion, as this is expected from prim_metafunction + _, result_dtype = utils.elementwise_dtypes( + *args, + type_promotion_kind=type_promotion, + ) + args = [_maybe_convert_to_dtype(x, result_dtype) for x in args] + + # Broadcast + args = 
_maybe_broadcast(*args) + + # Perform prim checks + return _prim_elementwise_meta( + *args, type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT + ) + + +def toRealValueType(dtype): + from_complex = { + torch.complex32: torch.half, + torch.cfloat: torch.float, + torch.cdouble: torch.double, + } + return from_complex.get(dtype, dtype) + + +def check_inplace_broadcast(self_shape, *args_shape): + broadcasted_shape = tuple(_broadcast_shapes(self_shape, *args_shape)) + torch._check( + broadcasted_shape == self_shape, + lambda: f"output with shape {self_shape} doesn't match the broadcast shape {broadcasted_shape}", + ) + + +@register_meta([aten.linspace, aten.logspace]) +@out_wrapper() +def meta_linspace_logspace( + start, + end, + steps, + base=None, + dtype=None, + device=None, + layout=torch.strided, + pin_memory=False, + requires_grad=False, +): + if isinstance(start, torch.Tensor): + torch._check( + start.dim() == 0, + lambda: "linspace only supports 0-dimensional start and end tensors", + ) + if isinstance(end, torch.Tensor): + torch._check( + end.dim() == 0, + lambda: "linspace only supports 0-dimensional start and end tensors", + ) + + if any(isinstance(arg, complex) for arg in (start, end, steps)): + default_complex_dtype = utils.corresponding_complex_dtype( + torch.get_default_dtype() + ) + if dtype is None: + dtype = default_complex_dtype + else: + torch._check( + utils.is_complex_dtype(dtype), + lambda: f"linspace(): inferred dtype {default_complex_dtype} can't be safely cast to passed dtype {dtype}", + ) + else: + dtype = dtype or torch.get_default_dtype() + assert isinstance(dtype, torch.dtype) + + # steps does not participate in the computation of the dtype + torch._check_type( + isinstance(steps, IntLike), + lambda: f"received an invalid combination of arguments - got \ +({type(start).__name__}, {type(end).__name__}, {type(steps).__name__})", + ) + assert isinstance(steps, IntLike) # for mypy + torch._check(steps >= 0, lambda: "number of steps must be non-negative") + + return torch.empty( + (steps,), # type: ignore[arg-type] + dtype=dtype, + layout=layout, + device="meta", + pin_memory=pin_memory, + requires_grad=requires_grad, + ) + + +@register_meta([aten.take.default, aten.take.out]) +@out_wrapper() +def meta_take(self, index): + # Type and device checks + torch._check( + index.dtype == torch.long, + lambda: f"take(): Expected a long tensor for index, but got {index.dtype}", + ) + # Index checks + torch._check_index( + not (self.numel() == 0 and index.numel() != 0), + lambda: "take(): tried to take from an empty tensor", + ) + return self.new_empty(index.shape) + + +@register_meta([aten.linalg_cross.default, aten.linalg_cross.out]) +@out_wrapper() +def linalg_cross(self, other, *, dim=-1): + x_d = self.ndim + y_d = other.ndim + torch._check( + x_d == y_d, + lambda: "linalg.cross: inputs must have the same number of dimensions.", + ) + torch._check( + self.size(dim) == 3 and other.size(dim) == 3, + lambda: ( + f"linalg.cross: inputs dimension {dim} must have length 3. 
" + f"Got {self.size(dim)} and {other.size(dim)}" + ), + ) + out_shape = _broadcast_shapes(self.shape, other.shape) + return self.new_empty(out_shape) + + +@register_meta(aten.linalg_matrix_exp) +@out_wrapper() +def linalg_matrix_exp(self): + squareCheckInputs(self, "linalg.matrix_exp") + checkFloatingOrComplex(self, "linalg.matrix_exp") + return torch.empty_like(self, memory_format=torch.contiguous_format) + + +@register_meta( + [aten.cummax.default, aten.cummax.out, aten.cummin.default, aten.cummin.out] +) +@out_wrapper("values", "indices") +def cummaxmin(self, dim): + values = torch.empty(self.shape, device=self.device, dtype=self.dtype) + indices = torch.empty(self.shape, device=self.device, dtype=torch.int64) + if self.numel() != 0 and self.ndim != 0: + # Checks that dim is within bounds + maybe_wrap_dim(dim, self.ndim) + return values, indices + + +@register_meta([aten.logcumsumexp.default, aten.logcumsumexp.out]) +@out_wrapper() +def logcumsumexp(self, dim): + # Checks that dim is within bounds + maybe_wrap_dim(dim, self.ndim) + return torch.empty_like(self).contiguous() + + +# Stride-related code from _exec_fft in aten/src/ATen/native/cuda/SpectralOps.cpp +def _exec_fft(out, self, out_sizes, dim, forward): + ndim = self.ndim + signal_ndim = len(dim) + batch_dims = ndim - signal_ndim + + # Permute dimensions so batch dimensions come first, and in stride order + dim_permute = list(range(ndim)) + + is_transformed_dim = [False for _ in range(ndim)] + for d in dim: + is_transformed_dim[d] = True + + # std::partition + left, right = [], [] + for d in dim_permute: + if not is_transformed_dim[d]: + left.append(d) + else: + right.append(d) + dim_permute = left + right + batch_end = len(left) + + self_strides = self.stride() + tmp = dim_permute[:batch_end] + tmp.sort(key=lambda x: self_strides[x], reverse=True) + dim_permute = tmp + dim_permute[batch_end:] + input = self.permute(dim_permute) + + # Collapse batch dimensions into a single dimension + batched_sizes = [-1] + list(input.shape[batch_dims:]) + input = input.reshape(batched_sizes) + + batch_size = input.size(0) + batched_sizes[0] = batch_size + batched_out_sizes = batched_sizes + for i in range(len(dim)): + batched_out_sizes[i + 1] = out_sizes[dim[i]] + out = out.reshape(batched_out_sizes) + + # Reshaping to original batch shape and inverting the dimension permutation + out_strides = [0 for _ in range(ndim)] + batch_numel = 1 + i = batch_dims - 1 + while i >= 0: + out_strides[dim_permute[i]] = batch_numel * out.stride(0) + batch_numel *= out_sizes[dim_permute[i]] + i -= 1 + for i in range(batch_dims, ndim): + out_strides[dim_permute[i]] = out.stride(1 + (i - batch_dims)) + return out.as_strided(out_sizes, out_strides, out.storage_offset()) + + +# See _fft_c2c_cufft in aten/src/ATen/native/cuda/SpectralOps.cpp +# and _fft_c2c_mkl in aten/src/ATen/native/mkl/SpectralOps.cpp +@register_meta([aten._fft_c2c.default, aten._fft_c2c.out]) +@out_wrapper() +def meta_fft_c2c(self, dim, normalization, forward): + assert self.dtype.is_complex + + out_sizes = self.shape + output = self.new_empty(out_sizes) + + if not dim: + return output + + sorted_dims = dim[:] + self_strides = self.stride() + sorted_dims.sort(key=lambda x: self_strides[x], reverse=True) + output = _exec_fft(output, self, out_sizes, sorted_dims, forward) + + return output + + +@register_meta([aten._fft_r2c.default, aten._fft_r2c.out]) +@out_wrapper() +def meta_fft_r2c(self, dim, normalization, onesided): + assert self.dtype.is_floating_point + output_sizes = list(self.size()) + 
+ if onesided: + last_dim = dim[-1] + last_dim_halfsize = (output_sizes[last_dim] // 2) + 1 + output_sizes[last_dim] = last_dim_halfsize + + return self.new_empty( + output_sizes, dtype=utils.corresponding_complex_dtype(self.dtype) + ) + + +@register_meta(aten.randperm.generator_out) +def meta_randperm(n, *, generator=None, out): + return _maybe_resize_out(out, torch.Size([n])) + + +@register_meta(aten.randperm.default) +def meta_randperm_default( + n, *, dtype=torch.long, layout=None, device=None, pin_memory=None +): + return torch.empty( + n, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory + ) + + +@register_meta(aten.randint.default) +def meta_randint( + high, size, *, dtype=torch.long, layout=None, device=None, pin_memory=None +): + return torch.empty( + size, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory + ) + + +@register_meta(aten.randint.low) +def meta_randint_low( + low, + high, + size, + *, + dtype=torch.long, + layout=None, + device=None, + pin_memory=None, +): + return torch.empty( + size, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory + ) + + +@register_meta(aten.rand.default) +def meta_rand_default(size, *, dtype=None, layout=None, device=None, pin_memory=None): + return torch.empty( + size, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory + ) + + +@register_meta([aten._fft_c2r.default, aten._fft_c2r.out]) +@out_wrapper() +def meta_fft_c2r(self, dim, normalization, lastdim): + assert self.dtype.is_complex + output_sizes = list(self.size()) + output_sizes[dim[-1]] = lastdim + return self.new_empty(output_sizes, dtype=toRealValueType(self.dtype)) + + +@register_meta(aten.copy_.default) +def meta_copy_(self, src, non_blocking=False): + # This code simulates the original decomp from inductor, + # which runs most of the meta checks that we care about. + # In theory, we should make this more robust by carefully + # auditing our C++ copy_() kernel and copying the checks here. 
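meta_copy_ continues below with two checks: the written-to tensor must not overlap itself in memory, and src must broadcast to self's shape. An illustrative eager-mode sketch of both behaviours (not part of the patch; exact error wording may vary by version):

import torch

# A broadcast copy that is fine: (3,) expands to (2, 3)
dst = torch.empty(2, 3)
dst.copy_(torch.ones(3))

# Writing into an expanded (self-overlapping) view is rejected, mirroring the
# internal-overlap check performed below
overlapping = torch.zeros(3).expand(2, 3)
try:
    overlapping.copy_(torch.ones(2, 3))
except RuntimeError as err:
    print(err)  # "... more than one element of the written-to tensor refers to a single memory location"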
+ + if torch._debug_has_internal_overlap(self) == 1: # 1 == MemOverlap::Yes + raise RuntimeError( + "more than one element of the written-to tensor refers to a single memory location" + ) + + if isinstance(src, Tensor): + intermediate = src.to(self, non_blocking) + if self.size() != intermediate.size(): + aten.expand_copy.default(intermediate, self.size()) + return self + + +def inferUnsqueezeGeometry(tensor, dim): + result_sizes = list(tensor.size()) + result_strides = list(tensor.stride()) + new_stride = 1 if dim >= tensor.dim() else result_sizes[dim] * result_strides[dim] + result_sizes.insert(dim, 1) + result_strides.insert(dim, new_stride) + return result_sizes, result_strides + + +@register_meta(aten.unsqueeze_.default) +def meta_unsqueeze_(self, dim): + dim = maybe_wrap_dim(dim, self.dim() + 1) + g_sizes, g_strides = inferUnsqueezeGeometry(self, dim) + self.as_strided_(g_sizes, g_strides) + return self + + +@register_meta(aten._sparse_semi_structured_linear) +def meta_sparse_structured_linear( + input: Tensor, + weight: Tensor, + _meta: Tensor, + bias: Optional[Tensor] = None, + _activation_opt: Optional[str] = None, + out_dtype: Optional[torch.dtype] = None, +): + output_sizes = list(input.shape) + if bias is not None: + assert weight.size(0) == bias.size(0), "output size mismatch" + assert weight.size(1) == input.size(-1) / 2 + output_sizes[-1] = weight.size(0) + + # see: https://github.com/pytorch/pytorch/pull/114477#issuecomment-1830121375 + # We assume that we have already squashed the inputs into a 2-D tensor + # Then, as the output is transposed, we need to propagate the transposed + # stride information to the output tensor + assert len(input.shape) == 2, "we can only handle the squashed input case" + transposed_strides = (1, input.size(0)) + + if out_dtype is not None: + assert ( + input.dtype == torch.int8 and out_dtype == torch.int32 + ), "out_dtype is only supported for i8i8->i32 linear operator" + output = input.new_empty( + output_sizes, + dtype=input.dtype if out_dtype is None else out_dtype, + ).as_strided(output_sizes, transposed_strides) + + return output + + +@register_meta(aten._cslt_sparse_mm) +def meta__cslt_sparse_mm( + compressed_A: torch.Tensor, + dense_B: torch.Tensor, + bias: Optional[Tensor] = None, + alpha: Optional[Tensor] = None, + out_dtype: Optional[torch.dtype] = None, + transpose_result: bool = False, +): + assert dense_B.dtype in { + torch.float32, + torch.float16, + torch.bfloat16, + torch.int8, + }, "_cslt_sparse_mm only supports fp16, bf16, and int8" + assert compressed_A.dtype == dense_B.dtype, "inputs must have the same dtype" + assert len(dense_B.shape) == 2, "_cslt_sparse_mm only supports 2d inputs" + + is_int8_input_type = compressed_A.dtype == torch.int8 + compression_factor = 10 if is_int8_input_type else 9 + k = dense_B.size(0) + n = dense_B.size(1) + m = (compressed_A.numel() * 16) // (compression_factor * k) + if bias is not None: + assert m == bias.size(0) + + if out_dtype is not None: + assert is_int8_input_type and out_dtype in { + torch.float16, + torch.bfloat16, + torch.int32, + }, "out_dtype is only supported for i8i8->fp16, bf16, or i32 matmul" + output_shape = (n, m) if transpose_result else (m, n) + result = dense_B.new_empty(output_shape, dtype=out_dtype) + return result + + +@register_meta(aten.index_reduce.default) +def meta_index_reduce( + self: Tensor, + dim: int, + index: Tensor, + source: torch.Tensor, + reduce: str, + *, + include_self: bool = True, +) -> Tensor: + return torch.empty_like(self, 
memory_format=torch.contiguous_format) + + +@register_meta(aten.index_reduce_.default) +def meta_index_reduce_( + self: Tensor, + dim: int, + index: Tensor, + source: torch.Tensor, + reduce: str, + *, + include_self: bool = True, +) -> Tensor: + return self + + +# Implementations below are taken from https://github.com/albanD/subclass_zoo/blob/main/python_meta_tensor.py +@out_wrapper() +@register_meta(aten.index_select.default) +def meta_index_select(self, dim, index): + result_size = list(self.size()) + if self.dim() > 0: + result_size[dim] = index.numel() + return self.new_empty(result_size) + + +@register_meta(aten.segment_reduce.default) +def meta_segment_reduce( + data: Tensor, + reduce: str, + *, + lengths: Optional[Tensor] = None, + indices: Optional[Tensor] = None, + offsets: Optional[Tensor] = None, + axis: int = 0, + unsafe: bool = False, + initial=None, +) -> Tensor: + if indices is not None: + raise NotImplementedError( + "segment_reduce(): indices based reduction is not supported yet." + ) + + def segment_reduce_lengths_tensor(lengths_shape): + return torch.empty( + lengths_shape + data.shape[axis + 1 :], + dtype=data.dtype, + device="meta", + memory_format=torch.contiguous_format, + ) + + if lengths is not None: + return segment_reduce_lengths_tensor(lengths.shape) + # FIXME should probably check that lengths and offset aren't both set, but + # the ATen implementation neglects this too + if offsets is not None: + # lengths == torch.diff(offsets) + lengths_shape = offsets.shape[:-1] + (offsets.shape[-1] - 1,) + return segment_reduce_lengths_tensor(lengths_shape) + raise RuntimeError("segment_reduce(): Either lengths or offsets must be defined.") + + +@register_meta([aten.max.default, aten.max.unary_out]) +@out_wrapper() +def meta_max(self): + return self.new_empty(()) + + +@register_meta(aten.max.dim) +def meta_max_dim(self, dim, keepdim=False): + dim = utils.reduction_dims(self.shape, (dim,)) + output_shape = _compute_reduction_shape(self, dim, keepdim) + return ( + self.new_empty(output_shape), + self.new_empty(output_shape, dtype=torch.long), + ) + + +@register_meta([aten.min.default, aten.min.unary_out]) +@out_wrapper() +def meta_min(self): + return self.new_empty(()) + + +@register_meta(aten.min.dim) +def meta_min_dim(self, dim, keepdim=False): + dim = utils.reduction_dims(self.shape, (dim,)) + output_shape = _compute_reduction_shape(self, dim, keepdim) + return ( + self.new_empty(output_shape), + self.new_empty(output_shape, dtype=torch.long), + ) + + +@register_meta(aten.angle.default) +def meta_angle(self): + if self.is_complex(): + result_dtype = corresponding_real_dtype(self.dtype) + else: + _, result_dtype = elementwise_dtypes( + self, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + ) + return torch.empty_like(self, dtype=result_dtype) + + +@register_meta(aten.angle.out) +def meta_angle_out(self, out): + torch._resize_output_(out, self.size(), self.device) + return out.copy_(torch.angle(self)) + + +@register_meta(aten._assert_async.default) +def assert_async(val): + return + + +@register_meta(aten._assert_async.msg) +def assert_async_meta(val, assert_msg): + return + + +@register_meta(aten._print.default) +def print_meta(s): + return + + +@register_meta(aten._make_dep_token.default) +def make_dep_token( + *, + dtype=None, + layout=None, + device=None, + pin_memory=None, + memory_format=None, +): + return torch.empty([], device="meta") + + +@register_meta(aten.sym_constrain_range.default) +def sym_constrain_range(size, min=None, max=None): + # 
Avoid importing sympy at a module level + from torch.fx.experimental.symbolic_shapes import constrain_range + + if isinstance(size, (SymFloat, SymBool)): + raise ValueError("Constraining SymFloat or Symbool is nyi") + constrain_range(size, min=min, max=max) + + +@register_meta(aten._functional_sym_constrain_range.default) +def functional_sym_constrain_range(size, min=None, max=None, dep_token=None): + aten.sym_constrain_range(size, min=min, max=max) + return dep_token + + +@register_meta(aten.sym_constrain_range_for_size.default) +def sym_constrain_range_for_size(size, min=None, max=None): + # Avoid importing sympy at a module level + from torch.fx.experimental.symbolic_shapes import _constrain_range_for_size + + if isinstance(size, (SymFloat, SymBool)): + raise ValueError("Constraining SymFloat or Symbool is nyi") + _constrain_range_for_size(size, min=min, max=max) + + +@register_meta(aten._functional_sym_constrain_range_for_size.default) +def functional_sym_constrain_range_for_size(size, min, max, dep_token): + aten.sym_constrain_range_for_size(size, min=min, max=max) + return dep_token + + +@register_meta(aten._functional_assert_async.msg) +def functional_assert_async_meta(val, assert_msg, dep_token): + return dep_token + + +# From aten/src/ATen/native/LinearAlgebraUtils.h +def squareCheckInputs(self: Tensor, f_name: str): + assert ( + self.dim() >= 2 + ), f"{f_name}: The input tensor must have at least 2 dimensions." + assert self.size(-1) == self.size( + -2 + ), f"{f_name}: A must be batches of square matrices, but they are {self.size(-2)} by {self.size(-1)} matrices" + + +# Validates input shapes and devices +# for linear solve methods (solve, cholesky_solve, lu_solve, triangular_solve) +# From aten/src/ATen/native/LinearAlgebraUtils.h +def linearSolveCheckInputs( + self: Tensor, + A: Tensor, + name: str, +): + torch._check( + self.device == A.device, + lambda: ( + f"Expected b and A to be on the same device, but found b on " + f"{self.device} and A on {A.device} instead." + ), + ) + + torch._check( + self.dtype == A.dtype, + lambda: ( + f"Expected b and A to have the same dtype, but found b of type " + f"{self.dtype} and A of type {A.dtype} instead." + ), + ) + + torch._check( + A.size(-1) == A.size(-2), + lambda: ( + f"A must be batches of square matrices, " + f"but they are {A.size(-2)} by {A.size(-1)} matrices" + ), + ) + + torch._check( + A.size(-1) == self.size(-2), + lambda: ( + f"Incompatible matrix sizes for {name}: each A " + f"matrix is {A.size(-1)} by {A.size(-1)}" + f" but each b matrix is {self.size(-2)} by {self.size(-1)}" + ), + ) + + +# From aten/src/ATen/native/LinearAlgebraUtils.h +def checkFloatingOrComplex( + t: Tensor, f_name: str, allow_low_precision_dtypes: bool = True +): + dtype = t.dtype + torch._check( + t.is_floating_point() or t.is_complex(), + lambda: f"{f_name}: Expected a floating point or complex tensor as input. Got {dtype}", + ) + if not allow_low_precision_dtypes: + torch._check( + dtype in (torch.float, torch.double, torch.cfloat, torch.cdouble), + lambda: f"{f_name}: Low precision dtypes not supported. 
Got {dtype}", + ) + + +# From aten/src/ATen/native/LinearAlgebraUtils.h +def checkIsMatrix(A: Tensor, f_name: str, arg_name: str = "A"): + torch._check( + A.dim() >= 2, + lambda: f"{f_name}: The input tensor {arg_name} must have at least 2 dimensions.", + ) + + +def checkInputsSolver( + A: Tensor, + B: Tensor, + left: bool, + f_name: str, +): + squareCheckInputs(A, f_name) + checkIsMatrix(B, f_name) + torch._check( + A.size(-2) == B.size(-2) if left else A.size(-1) == B.size(-1), + lambda: ( + f"{f_name}: Incompatible shapes of A and B for the equation " + f"{'AX = B' if left else 'XA = B'}" + f" ({A.size(-2)}x{A.size(-1)} and {B.size(-2)}x{B.size(-1)})" + ), + ) + + +def checkSameDevice( + fn_name: str, result: Tensor, input: Tensor, result_name: str = "result" +): + torch._check( + result.device == input.device, + lambda: ( + f"{fn_name}: Expected {result_name} and input tensors to be on the same device, but got " + f"{result_name} on {result.device} and input on {input.device}" + ), + ) + + +def checkUplo(UPLO: str): + UPLO_uppercase = UPLO.upper() + torch._check( + len(UPLO) == 1 and (UPLO_uppercase == "U" or UPLO_uppercase == "L"), + lambda: f"Expected UPLO argument to be 'L' or 'U', but got {UPLO}", + ) + + +@register_meta([aten._linalg_eigh.default, aten._linalg_eigh.eigenvalues]) +@out_wrapper("eigenvalues", "eigenvectors") +def meta__linalg_eigh( + A: Tensor, + UPLO: str = "L", + compute_v: bool = True, +): + squareCheckInputs(A, "linalg.eigh") + checkUplo(UPLO) + + shape = list(A.shape) + if compute_v: + vecs = A.new_empty(shape) + vecs.as_strided_(shape, make_contiguous_strides_for(shape, row_major=False)) + else: + vecs = A.new_empty([0]) + + shape.pop() + vals = A.new_empty(shape, dtype=toRealValueType(A.dtype)) + + return vals, vecs + + +@register_meta([aten._linalg_eigvals.default, aten.linalg_eigvals.out]) +@out_wrapper() +def meta__linalg_eigvals(input: Tensor) -> Tensor: + squareCheckInputs(input, "linalg.eigvals") + complex_dtype = ( + input.dtype + if utils.is_complex_dtype(input.dtype) + else utils.corresponding_complex_dtype(input.dtype) + ) + return input.new_empty(input.shape[:-1], dtype=complex_dtype) + + +@register_meta([aten.linalg_eig]) +@out_wrapper("eigenvalues", "eigenvectors") +def meta_linalg_eig(input: Tensor): + squareCheckInputs(input, "linalg.eig") + complex_dtype = ( + input.dtype + if utils.is_complex_dtype(input.dtype) + else utils.corresponding_complex_dtype(input.dtype) + ) + values = input.new_empty(input.shape[:-1], dtype=complex_dtype) + vectors = input.new_empty(input.shape, dtype=complex_dtype) + return values, vectors + + +def cloneBatchedColumnMajor(src: Tensor) -> Tensor: + return src.mT.clone(memory_format=torch.contiguous_format).transpose(-2, -1) + + +@register_meta(aten._cholesky_solve_helper) +@out_wrapper() +def _cholesky_solve_helper(self: Tensor, A: Tensor, upper: bool) -> Tensor: + return cloneBatchedColumnMajor(self) + + +@register_meta(aten.cholesky_solve) +@out_wrapper() +def cholesky_solve(self: Tensor, A: Tensor, upper: bool = False) -> Tensor: + torch._check( + self.ndim >= 2, + lambda: f"b should have at least 2 dimensions, but has {self.ndim} dimensions instead", + ) + torch._check( + A.ndim >= 2, + lambda: f"u should have at least 2 dimensions, but has {A.ndim} dimensions instead", + ) + self_broadcasted, A_broadcasted = _linalg_broadcast_batch_dims_name( + self, A, "cholesky_solve" + ) + return _cholesky_solve_helper(self_broadcasted, A_broadcasted, upper) + + +@register_meta(aten.cholesky) +@out_wrapper() +def 
cholesky(self: Tensor, upper: bool = False) -> Tensor: + if self.numel() == 0: + return torch.empty_like(self, memory_format=torch.legacy_contiguous_format) + squareCheckInputs(self, "cholesky") + return cloneBatchedColumnMajor(self) + + +@register_meta(aten.cholesky_inverse) +@out_wrapper() +def cholesky_inverse(self: Tensor, upper: bool = False) -> Tensor: + squareCheckInputs(self, "cholesky_inverse") + return cloneBatchedColumnMajor(self) + + +# From aten/src/ATen/native/BatchLinearAlgebra.cpp +@register_meta(aten.linalg_cholesky_ex.default) +def linalg_cholesky_ex(A: Tensor, upper: bool = False, check_errors: bool = False): + squareCheckInputs(A, "linalg.cholesky") + checkFloatingOrComplex(A, "linalg.cholesky") + + A_shape = A.shape + ndim = len(A_shape) + + # L + L_strides = make_contiguous_strides_for(A_shape, False) + L = A.new_empty(A_shape) + L.as_strided_(A_shape, L_strides) + + # infos + infos = A.new_empty(A_shape[0 : ndim - 2], dtype=torch.int32) + return L, infos + + +@register_meta( + [aten.linalg_householder_product.default, aten.linalg_householder_product.out] +) +@out_wrapper() +def linalg_householder_product(input: Tensor, tau: Tensor) -> Tensor: + torch._check( + input.ndim >= 2, + lambda: "torch.linalg.householder_product: input must have at least 2 dimensions.", + ) + torch._check( + input.size(-2) >= input.size(-1), + lambda: "torch.linalg.householder_product: input.shape[-2] must be greater than or equal to input.shape[-1]", + ) + torch._check( + input.size(-1) >= tau.size(-1), + lambda: "torch.linalg.householder_product: input.shape[-1] must be greater than or equal to tau.shape[-1]", + ) + + torch._check( + input.ndim - tau.ndim == 1, + lambda: ( + f"torch.linalg.householder_product: Expected tau to have one dimension less than input, " + f"but got tau.ndim equal to {tau.ndim} and input.ndim is equal to {input.ndim}" + ), + ) + if input.ndim > 2: + expected_batch_tau_shape = input.shape[:-2] + actual_batch_tau_shape = tau.shape[:-1] + torch._check( + actual_batch_tau_shape == expected_batch_tau_shape, + lambda: ( + f"torch.linalg.householder_product: Expected batch dimensions of tau to be " + f"equal to input.shape[:-2], but got {actual_batch_tau_shape}" + ), + ) + + torch._check( + tau.dtype == input.dtype, + lambda: ( + f"torch.linalg.householder_product: tau dtype {tau.dtype}" + f" does not match input dtype {input.dtype}" + ), + ) + checkSameDevice("torch.linalg.householder_product", tau, input, "tau") + + return torch.empty_strided( + size=input.shape, + stride=make_contiguous_strides_for(input.shape, row_major=False), + dtype=input.dtype, + device=input.device, + ) + + +# From aten/src/ATen/native/BatchLinearAlgebra.cpp +@register_meta(aten.linalg_inv_ex.default) +def linalg_inv_ex_meta(A: Tensor, check_errors: bool = False): + squareCheckInputs(A, "linalg.inv_ex") + checkFloatingOrComplex(A, "linalg.inv_ex", allow_low_precision_dtypes=False) + + L = A.new_empty(A.shape) + L.as_strided_(A.shape, make_contiguous_strides_for(A.shape, row_major=False)) + + infos = A.new_empty(A.shape[:-2], dtype=torch.int32) + return L, infos + + +@register_meta([aten.linalg_ldl_factor_ex.default, aten.linalg_ldl_factor_ex.out]) +@out_wrapper("LD", "pivots", "info") +def linalg_ldl_factor_ex_meta( + self: Tensor, + *, + hermitian: bool = False, + check_errors: bool = False, +) -> Tuple[Tensor, Tensor, Tensor]: + squareCheckInputs(self, "torch.linalg.ldl_factor_ex") + checkFloatingOrComplex(self, "torch.linalg.ldl_factor_ex") + LD = torch.empty_strided( + size=self.shape, + 
stride=make_contiguous_strides_for(self.shape, row_major=False), + dtype=self.dtype, + device=self.device, + ) + pivots = self.new_empty(self.shape[:-1], dtype=torch.int) + info = self.new_empty(self.shape[:-2], dtype=torch.int) + return LD, pivots, info + + +@register_meta([aten.linalg_ldl_solve.default, aten.linalg_ldl_solve.out]) +@out_wrapper() +def linalg_ldl_solve_meta( + LD: Tensor, pivots: Tensor, B: Tensor, *, hermitian: bool = False +) -> Tensor: + squareCheckInputs(LD, "torch.linalg.ldl_solve") + checkFloatingOrComplex(LD, "torch.linalg.ldl_solve") + linearSolveCheckInputs(B, LD, "torch.linalg.ldl_solve") + torch._check( + B.ndim >= 2, + lambda: ( + f"torch.linalg.ldl_solve: Expected B to have at least 2 dimensions, " + f"but it has {B.ndim} dimensions instead" + ), + ) + expected_pivots_shape = LD.shape[:-1] + torch._check( + expected_pivots_shape == pivots.shape, + lambda: ( + f"torch.linalg.ldl_solve: Expected LD.shape[:-1] and pivots.shape to be the same, " + f"but got pivots with shape {pivots.shape} instead" + ), + ) + torch._check( + utils.is_integer_dtype(pivots.dtype), + lambda: f"torch.linalg.ldl_solve: Expected pivots to be integers. Got {pivots.dtype}", + ) + torch._check( + LD.dtype == B.dtype, + lambda: f"torch.linalg.ldl_solve: LD dtype {LD.dtype} does not match b dtype {B.dtype}", + ) + B_broadcast_size, _ = _linalg_broadcast_batch_dims(B, LD) + return torch.empty_strided( + size=B_broadcast_size, + stride=make_contiguous_strides_for(B_broadcast_size, row_major=False), + dtype=B.dtype, + device=B.device, + ) + + +@register_meta([aten.linalg_lu.default, aten.linalg_lu.out]) +@out_wrapper("P", "L", "U") +def linalg_lu_meta(A: Tensor, *, pivot: bool = True) -> Tuple[Tensor, Tensor, Tensor]: + torch._check( + A.ndim >= 2, + lambda: f"linalg.lu: Expected tensor with 2 or more dimensions. Got size: {A.shape} instead", + ) + + sizes = list(A.shape) + m = sizes[-2] + n = sizes[-1] + k = min(m, n) + + sizes[-1] = m + if pivot: + P = A.new_empty(sizes) + else: + P = A.new_empty([0]) + + sizes[-1] = k + L = A.new_empty(sizes) + + sizes[-2] = k + sizes[-1] = n + U = A.new_empty(sizes) + return P, L, U + + +@register_meta([aten.linalg_lu_factor_ex.default, aten.linalg_lu_factor_ex.out]) +@out_wrapper("LU", "pivots", "info") +def linalg_lu_factor_ex_meta( + A: Tensor, *, pivot: bool = True, check_errors: bool = False +) -> Tuple[Tensor, Tensor, Tensor]: + torch._check( + A.ndim >= 2, + lambda: f"torch.lu_factor: Expected tensor with 2 or more dimensions. 
Got size: {A.shape} instead", + ) + + sizes = list(A.shape) + m = sizes[-2] + n = sizes[-1] + + LU = torch.empty_strided( + size=sizes, + stride=make_contiguous_strides_for(sizes, row_major=False), + dtype=A.dtype, + device=A.device, + ) + + # Sets sizes to the size of pivots + sizes.pop() + sizes[-1] = min(m, n) + pivots = A.new_empty(sizes, dtype=torch.int) + + # Sets sizes to the size of info + sizes.pop() + info = A.new_empty(sizes, dtype=torch.int) + + return LU, pivots, info + + +@register_meta([aten.linalg_lu_solve.default, aten.linalg_lu_solve.out]) +@out_wrapper() +def linalg_lu_solve_meta( + LU: Tensor, + pivots: Tensor, + B: Tensor, + *, + left: bool = True, + adjoint: bool = False, +) -> Tensor: + # dtype + checkFloatingOrComplex(LU, "torch.linalg.lu_solve") + torch._check( + LU.dtype == B.dtype, + lambda: ( + f"linalg.lu_solve: Expected LU and B to have the same dtype, " + f"but found LU of type {LU.dtype} and B of type {B.dtype} instead" + ), + ) + torch._check( + pivots.dtype == torch.int, + lambda: "linalg.lu_solve: pivots should be a Tensor of scalar type torch.int32", + ) + + # matrix shapes + squareCheckInputs(LU, "torch.linalg.lu_solve") + checkInputsSolver(LU, B, left, "linalg.lu_solve") + torch._check( + LU.size(-1) == pivots.size(-1), + lambda: "linalg.lu_solve: Number of pivots per batch should be same as the dimension of the matrix", + ) + + # batches + torch._check( + LU.shape[:-1] == pivots.shape, + lambda: ( + f"linalg.lu_solve: Expected LU.shape[:-1] and pivots.shape to be the same, " + f"but got pivots with shape {pivots.shape} instead" + ), + ) + + B_broadcast_size, _ = _linalg_broadcast_batch_dims(B, LU) + + result = torch.empty_strided( + size=B_broadcast_size, + stride=make_contiguous_strides_for(B_broadcast_size, row_major=not left), + dtype=B.dtype, + device=B.device, + ) + + if result.numel() != 0 and not left: + if result.is_complex(): + result = result.conj() + + return result + + +@register_meta(aten.lu_unpack) +@out_wrapper("P", "L", "U") +def lu_unpack_meta( + LU: Tensor, + pivots: Tensor, + unpack_data: bool = True, + unpack_pivots: bool = True, +) -> Tuple[Tensor, Tensor, Tensor]: + torch._check( + LU.ndim >= 2, + lambda: f"torch.lu_unpack: Expected tensor with 2 or more dimensions. 
Got size: {LU.shape} instead", + ) + if unpack_pivots: + torch._check( + pivots.dtype == torch.int32, + lambda: ( + "torch.lu_unpack: LU_pivots is expected to be a contiguous tensor of torch.int32 dtype.\n" + "Note: this function is intended to be used with the output produced by torch.linalg.lu_factor" + ), + ) + sizes = list(LU.shape) + m = sizes[-2] + n = sizes[-1] + k = min(m, n) + sizes[-1] = m + if unpack_pivots: + P = LU.new_empty(sizes) + else: + P = LU.new_empty([0]) + if unpack_data: + sizes[-1] = k + L = LU.new_empty(sizes) + sizes[-2] = k + sizes[-1] = n + U = LU.new_empty(sizes) + else: + L = LU.new_empty([0]) + U = LU.new_empty([0]) + return P, L, U + + +# parse the "mode" param in linalg_qr: return a tuple of bools (compute_q, reduced) +def _parse_qr_mode(mode: str) -> Tuple[bool, bool]: + if mode == "reduced": + compute_q = True + reduced = True + elif mode == "complete": + compute_q = True + reduced = False + elif mode == "r": + compute_q = False + reduced = True # this is actually irrelevant in this mode + else: + torch._check( + False, + lambda: ( + f"qr received unrecognized mode '{mode}' " + f"but expected one of 'reduced' (default), 'r', or 'complete'" + ), + ) + return compute_q, reduced # type: ignore[possibly-undefined] + + +@register_meta([aten.linalg_qr.default, aten.linalg_qr.out]) +@out_wrapper("Q", "R") +def linalg_qr_meta( + A: Tensor, + mode: str = "reduced", +) -> Tuple[Tensor, Tensor]: + checkIsMatrix(A, "linalg.qr") + checkFloatingOrComplex(A, "linalg.qr") + + compute_q, reduced_mode = _parse_qr_mode(mode) + + m = A.shape[-2] + n = A.shape[-1] + k = min(m, n) + + if compute_q: + Q_shape = list(A.shape) + Q_shape[-1] = k if reduced_mode else m + Q = A.new_empty(Q_shape) + Q.as_strided_(Q_shape, make_contiguous_strides_for(Q_shape, row_major=False)) + else: + Q = A.new_empty([0]) + + # For readability + R_shape = list(A.shape) + R_shape[-2] = k if reduced_mode or not compute_q else m + R = A.new_empty(R_shape) + R.as_strided_(R_shape, make_contiguous_strides_for(R_shape, row_major=False)) + return Q, R + + +@register_meta([aten._linalg_slogdet.default, aten._linalg_slogdet.sign]) +@out_wrapper("sign", "logabsdet", "LU", "pivots") +def _linalg_slogdet(A: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + squareCheckInputs(A, "linalg.slogdet") + checkFloatingOrComplex(A, "linalg.slogdet", False) + shape = A.shape + sign = A.new_empty(shape[:-2]) + logabsdet = A.new_empty(shape[:-2], dtype=toRealValueType(A.dtype)) + LU = torch.empty_strided( + size=shape, + stride=make_contiguous_strides_for(shape, False), + dtype=A.dtype, + device=A.device, + ) + pivots = A.new_empty(shape[:-1], dtype=torch.int32) + return sign, logabsdet, LU, pivots + + +# From aten/src/ATen/native/BatchLinearAlgebra.cpp +# NOTE: matching defaults in aten/src/ATen/native/native_functions.yaml +@register_meta(aten._linalg_svd.default) +def _linalg_svd_meta( + A: Tensor, + full_matrices: bool = False, + compute_uv: bool = True, + driver: Optional[str] = None, +): + checkIsMatrix(A, "linalg.svd") + checkFloatingOrComplex(A, "linalg.svd") + + batch_dims = list(A.shape[:-2]) + m = A.shape[-2] + n = A.shape[-1] + k = min(m, n) + + if compute_uv: + U_shape = batch_dims + [m, m if full_matrices else k] + U = A.new_empty(U_shape) + U.as_strided_(U_shape, make_contiguous_strides_for(U_shape, row_major=False)) + + V_shape = batch_dims + [n if full_matrices else k, n] + V = A.new_empty(V_shape) + # NB: This checks for CUDA since there is no way to check for cuSolver. 
+ # Also, this might not work correctly on CPU when fake_device is not + # available as device_hint just defaults to CUDA in that case. See + # _linalg_svd meta in core. + is_cuda = device_hint(A) == "cuda" + V.as_strided_(V_shape, make_contiguous_strides_for(V_shape, row_major=is_cuda)) + else: + # doesn't matter + U = A.new_empty([0]) + V = A.new_empty([0]) + + # S is always real, even when A is complex. + S = A.new_empty(batch_dims + [k], dtype=toRealValueType(A.dtype)) + return U, S, V + + +def _linalg_broadcast_batch_dims( + arg1: Tensor, arg2: Tensor +) -> Tuple[List[int], List[int]]: + # broadcast the batch dimensions of arg1 and arg2. + arg1_batch_sizes = arg1.shape[:-2] + arg2_batch_sizes = arg2.shape[:-2] + expand_batch_portion = _broadcast_shapes(arg1_batch_sizes, arg2_batch_sizes) + + arg1_expand_size = list(expand_batch_portion) + arg1_expand_size += [arg1.size(-2), arg1.size(-1)] + + arg2_expand_size = list(expand_batch_portion) + arg2_expand_size += [arg2.size(-2), arg2.size(-1)] + return arg1_expand_size, arg2_expand_size + + +def _linalg_broadcast_batch_dims_name( + arg1: Tensor, arg2: Tensor, name: Optional[str] +) -> Tuple[Tensor, Tensor]: + # If there's no name we assume we don't want to check the errors + if name: + linearSolveCheckInputs(arg1, arg2, name) + + arg1_expand_size, arg2_expand_size = _linalg_broadcast_batch_dims(arg1, arg2) + + arg1_broadcasted = ( + arg1 if arg1_expand_size == arg1.shape else arg1.expand(arg1_expand_size) + ) + arg2_broadcasted = ( + arg2 if arg2_expand_size == arg2.shape else arg2.expand(arg2_expand_size) + ) + return arg1_broadcasted, arg2_broadcasted + + +def linalg_solve_is_vector_rhs(input: Tensor, other: Tensor) -> bool: + expected_batched_rhs_shape = input.shape[:-1] + vector_case = other.ndim == 1 or ( + input.ndim - 1 == other.ndim and other.shape == expected_batched_rhs_shape + ) + return vector_case + + +@register_meta(aten._linalg_solve_ex) +def _linalg_solve_ex( + A: Tensor, + B: Tensor, + *, + left: bool = True, + check_errors: bool = False, + result: Optional[Tensor] = None, + LU: Optional[Tensor] = None, + pivots: Optional[Tensor] = None, + info: Optional[Tensor] = None, +) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + checkFloatingOrComplex(A, "linalg.solve") + torch._check( + A.dtype == B.dtype, + lambda: ( + f"linalg.solve: Expected A and B to have the same dtype, but found A of type " + f"{A.dtype} and B of type {B.dtype} instead" + ), + ) + vector_case = linalg_solve_is_vector_rhs(A, B) + B_ = B.unsqueeze(-1) if vector_case else B + checkInputsSolver(A, B_, left, "linalg.solve") + B_broad_shape, _ = _linalg_broadcast_batch_dims(B_, A) + torch._check( + left or not vector_case, + lambda: ( + "linalg.solve: Vector broadcasting of the left hand side is not supported for left=False. 
" + "In this case linalg.solve is equivalent to B / A.squeeze(-1)" + ), + ) + result_shape = B_broad_shape[:-1] if vector_case else B_broad_shape + result_ = torch.empty_strided( + size=result_shape, + stride=make_contiguous_strides_for(result_shape, not left), + dtype=B.dtype, + device=B.device, + ) + shape = A.shape + ndim = A.ndim + LU_ = torch.empty_strided( + size=shape, + stride=make_contiguous_strides_for(shape, False), + dtype=A.dtype, + device=A.device, + ) + pivots_ = A.new_empty(shape[:-1], dtype=torch.int32) + info_ = A.new_empty(shape[:-2], dtype=torch.int32) + out = (result, LU, pivots, info) + res = (result_, LU_, pivots_, info_) + if all(x is not None for x in out): + for r, o in zip(res, out): + # resize and copy operations are done in-place + _maybe_resize_out(o, r.shape) # type: ignore[arg-type] + # strides are not copied in out_wrapper + o.as_strided_(r.shape, r.stride()) # type: ignore[union-attr] + _safe_copy_out(copy_from=r, copy_to=o, exact_dtype=False) # type: ignore[arg-type] + return res + + +@register_meta([aten.linalg_solve_triangular.default, aten.linalg_solve_triangular.out]) +def linalg_solve_triangular_meta( + A: Tensor, + B: Tensor, + *, + upper: bool, + left: bool = True, + unitriangular: bool = False, + out: Optional[Tensor] = None, +) -> Tensor: + if out is None: + out = A.new_empty([0]) + assert isinstance(out, TensorLike) + checkInputsSolver(A, B, left, "linalg.solve_triangular") + B_, A_ = _linalg_broadcast_batch_dims_name(B, A, None) + avoid_copy_A = A_.transpose(-2, -1).is_contiguous() and A_.is_conj() + if avoid_copy_A: + out = _maybe_resize_out(out, B_.shape) + else: + # reimplementation of resize_output with result F-contig + if _resize_output_check(out, B_.shape): + out.resize_(B_.transpose(-2, -1).shape) + out.transpose_(-2, -1) + return out # type: ignore[return-value] + + +@register_meta(aten.triangular_solve) +@out_wrapper("solution", "cloned_coefficient") +def triangular_solve_meta( + self: Tensor, + A: Tensor, + upper: bool = True, + transpose: bool = False, + unitriangular: bool = False, +) -> Tuple[Tensor, Tensor]: + torch._check( + self.ndim >= 2, + lambda: ( + f"torch.triangular_solve: Expected b to have at least 2 dimensions, " + f"but it has {self.ndim} dimensions instead" + ), + ) + torch._check( + A.ndim >= 2, + lambda: ( + f"torch.triangular_solve: Expected A to have at least 2 dimensions, " + f"but it has {A.ndim} dimensions instead" + ), + ) + + linearSolveCheckInputs(self, A, "triangular_solve") + + if A.layout == torch.strided: + self_broadcast_size, A_broadcast_size = _linalg_broadcast_batch_dims(self, A) + solution = torch.empty_strided( + size=self_broadcast_size, + stride=make_contiguous_strides_for(self_broadcast_size, row_major=False), + dtype=self.dtype, + device=self.device, + ) + cloned_coefficient = torch.empty_strided( + size=A_broadcast_size, + stride=make_contiguous_strides_for(A_broadcast_size, row_major=False), + dtype=A.dtype, + device=A.device, + ) + elif A.layout == torch.sparse_csr or A.layout == torch.sparse_bsr: + solution = torch.empty_like(self) + cloned_coefficient = self.new_empty([0]) + else: + torch._check(False, lambda: "triangular_solve: Got an unexpected layout.") + return solution, cloned_coefficient # type: ignore[possibly-undefined] + + +# From aten/src/ATen/native/LinearAlgebra.cpp +@register_meta(aten._linalg_det.default) +def _linalg_det_meta(A): + squareCheckInputs(A, "linalg.det") + checkFloatingOrComplex(A, "linalg.det") + + det = A.new_empty(A.shape[:-2]) + + LU = A.new_empty(A.shape) + 
LU.as_strided_(A.shape, make_contiguous_strides_for(A.shape, row_major=False)) + + pivots = A.new_empty(A.shape[:-1], dtype=torch.int32) + return det, LU, pivots + + +@register_meta(aten.ormqr) +@out_wrapper() +def ormqr( + input: Tensor, + tau: Tensor, + other: Tensor, + left: bool = True, + transpose: bool = False, +) -> Tensor: + torch._check( + input.ndim >= 2, lambda: "torch.ormqr: input must have at least 2 dimensions." + ) + torch._check( + other.ndim >= 2, lambda: "torch.ormqr: other must have at least 2 dimensions." + ) + + left_size_condition = -2 if left else -1 + torch._check( + other.shape[left_size_condition] >= tau.shape[-1], + lambda: f"torch.ormqr: other.shape[{left_size_condition}] must be greater than or equal to tau.shape[-1]", + ) + torch._check( + other.shape[left_size_condition] == input.shape[-2], + lambda: f"torch.ormqr: other.shape[{left_size_condition}] must be equal to input.shape[-2]", + ) + + torch._check( + tau.shape[-1] <= input.shape[-1], + lambda: "torch.ormqr: tau.shape[-1] must be less than or equal to input.shape[-1]", + ) + + torch._check( + input.ndim - tau.ndim == 1, + lambda: ( + f"torch.ormqr: Expected tau to have one dimension less than input, " + f"but got tau.ndim equal to {tau.ndim} and input.ndim is equal to {input.ndim}" + ), + ) + torch._check( + input.ndim == other.ndim, + lambda: ( + f"torch.ormqr: Expected other to have the same number of dimensions as input, " + f"but got other.ndim equal to {other.ndim} and input.ndim is equal to {input.ndim}" + ), + ) + + if input.ndim > 2: + expected_batch_shape = input.shape[:-2] + actual_batch_tau_shape = tau.shape[:-1] + torch._check( + actual_batch_tau_shape == expected_batch_shape, + lambda: ( + f"torch.ormqr: Expected batch dimensions of tau to be " + f"equal to input.shape[:-2], but got {actual_batch_tau_shape}" + ), + ) + + actual_batch_other_shape = other.shape[:-2] + torch._check( + actual_batch_other_shape == expected_batch_shape, + lambda: ( + f"torch.ormqr: Expected batch dimensions of other to be " + f"equal to input.shape[:-2], but got {actual_batch_other_shape}" + ), + ) + + torch._check( + tau.dtype == input.dtype, + lambda: ( + f"torch.ormqr: Expected input and tau to have the same dtype, " + f"but input has dtype {input.dtype} and tau has dtype {tau.dtype}" + ), + ) + torch._check( + other.dtype == input.dtype, + lambda: ( + f"torch.ormqr: Expected input and other to have the same dtype, " + f"but input has dtype {input.dtype} and other has dtype {other.dtype}" + ), + ) + + checkSameDevice("torch.ormqr", tau, input, "tau") + checkSameDevice("torch.ormqr", other, input, "other") + + return torch.empty_strided( + size=other.shape, + stride=make_contiguous_strides_for(other.shape, row_major=False), + dtype=other.dtype, + device=other.device, + ) + + +def _padding_check_valid_input(input, padding, *, dim): + torch._check( + len(padding) == 2 * dim, + lambda: f"padding size is expected to be {2 * dim}, but got: {len(padding)}", + ) + + input_dim = input.ndim + + is_batch_mode = input_dim == (dim + 2) + + valid_batch_mode = is_batch_mode + valid_non_batch_mode = not is_batch_mode + + if is_batch_mode: + # allow batch size of 0-dim. + for d in range(1, input_dim): + valid_batch_mode = valid_batch_mode and input.size(d) != 0 + else: + for d in range(0, input_dim): + valid_non_batch_mode = valid_non_batch_mode and input.size(d) != 0 + + # allow empty batch size but not other dimensions. 
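The padding helpers that follow compute each output size as the input size plus the two padding amounts, with reflection padding additionally required to be strictly smaller than the padded dimension. An illustrative eager-mode check (not part of the patch):

import torch
import torch.nn.functional as F

x = torch.randn(1, 2, 5)                  # (batch, planes, width)
y = F.pad(x, (2, 3), mode="reflect")      # routed to reflection_pad1d
print(y.shape)                            # torch.Size([1, 2, 10]) = 5 + 2 + 3

# Reflection padding must be smaller than the padded dimension
try:
    F.pad(x, (5, 0), mode="reflect")
except RuntimeError as err:
    print(err)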
+ torch._check( + valid_batch_mode or valid_non_batch_mode, + lambda: ( + f"Expected {dim + 1}D or {dim + 2}D (batch mode) tensor with possibly 0 batch size " + f"and other non-zero dimensions for input, but got: {input.shape}" + ), + ) + + +def _pad1d_common(input, padding, *, is_reflection): + dim_plane = 0 + dim_w = 1 + nbatch = 1 + + if input.ndim == 3: + nbatch = input.size(0) + dim_w += 1 + dim_plane += 1 + + _padding_check_valid_input(input, padding, dim=1) + + pad_l, pad_r = padding + + nplane = input.size(dim_plane) + input_w = input.size(dim_w) + output_w = input_w + pad_l + pad_r + + if is_reflection: + torch._check( + pad_l < input_w and pad_r < input_w, + lambda: ( + f"Argument #4: Padding size should be less than the corresponding input dimension, " + f"but got: padding ({pad_l}, {pad_r}) at dimension {dim_w} of input {input.shape}" + ), + ) + + torch._check( + output_w >= 1, + lambda: f"input (W: {input_w}) is too small. Calculated output W: {output_w}", + ) + + if input.ndim == 2: + return input.new_empty((nplane, output_w)) + else: + return input.new_empty((nbatch, nplane, output_w)) + + +@register_meta(aten.reflection_pad1d) +@out_wrapper() +def meta_reflection_pad1d(input, padding): + return _pad1d_common(input, padding, is_reflection=True) + + +@register_meta(aten.replication_pad1d) +@out_wrapper() +def meta_replication_pad1d(input, padding): + return _pad1d_common(input, padding, is_reflection=False) + + +def _pad1d_backward_common(grad_output, input, padding, *, is_reflection): + dim_w = 1 + if not is_reflection: + torch._check(len(padding) == 2, lambda: "padding size is expected to be 2") + + if input.ndim == 3: + dim_w += 1 + + pad_l, pad_r = padding + + input_w = input.size(dim_w) + output_w = input_w + pad_l + pad_r + + if is_reflection: + torch._check( + pad_l < input_w and pad_r < input_w, + lambda: ( + f"Argument #4: Padding size should be less than the corresponding input dimension, " + f"but got: padding ({pad_l}, {pad_r}) at dimension {dim_w} of input {input.shape}" + ), + ) + + torch._check( + output_w == grad_output.size(dim_w), + lambda: f"grad_output width unexpected. 
Expected: {output_w}, Got: {grad_output.size(dim_w)}", + ) + + return input.new_empty(input.shape) + + +@register_meta(aten.reflection_pad1d_backward) +@out_wrapper("grad_input") +def meta_reflection_pad1d_backward(grad_output, input, padding): + return _pad1d_backward_common(grad_output, input, padding, is_reflection=True) + + +@register_meta(aten.replication_pad1d_backward) +@out_wrapper("grad_input") +def meta_replication_pad1d_backward(grad_output, input, padding): + return _pad1d_backward_common(grad_output, input, padding, is_reflection=False) + + +def _pad2d_common(input, padding, *, is_reflection): + dim_w = 2 + dim_h = 1 + dim_slices = 0 + nbatch = 1 + + _padding_check_valid_input(input, padding, dim=2) + + ndim = input.ndim + if ndim == 4: + nbatch = input.size(0) + dim_w += 1 + dim_h += 1 + dim_slices += 1 + + pad_l, pad_r, pad_t, pad_b = padding + + nplane = input.size(dim_slices) + input_h = input.size(dim_h) + input_w = input.size(dim_w) + output_h = input_h + pad_t + pad_b + output_w = input_w + pad_l + pad_r + + if is_reflection: + torch._check( + pad_l < input_w and pad_r < input_w, + lambda: ( + f"Argument #4: Padding size should be less than the corresponding input dimension, " + f"but got: padding ({pad_l}, {pad_r}) at dimension {dim_w} of input {input.shape}" + ), + ) + torch._check( + pad_t < input_h and pad_b < input_h, + lambda: ( + f"Argument #6: Padding size should be less than the corresponding input dimension, " + f"but got: padding ({pad_t}, {pad_b}) at dimension {dim_h} of input {input.shape}" + ), + ) + + torch._check( + output_w >= 1 or output_h >= 1, + lambda: ( + f"input (H: {input_h} W: {input_w}) is too small. " + f"Calculated output H: {output_h} W: {output_w}" + ), + ) + + if input.ndim == 3: + return input.new_empty((nplane, output_h, output_w)) + else: + return input.new_empty((nbatch, nplane, output_h, output_w)) + + +@register_meta(aten.reflection_pad2d) +@out_wrapper() +def meta_reflection_pad2d(input, padding): + return _pad2d_common(input, padding, is_reflection=True) + + +@register_meta(aten.replication_pad2d) +@out_wrapper() +def meta_replication_pad2d(input, padding): + return _pad2d_common(input, padding, is_reflection=False) + + +@register_meta( + [ + aten.reflection_pad2d_backward.default, + aten.reflection_pad2d_backward.grad_input, + aten.replication_pad2d_backward.default, + aten.replication_pad2d_backward.grad_input, + ] +) +@out_wrapper("grad_input") +def meta_pad2d_backward(grad_output, self, padding): + dim_w = 2 + dim_h = 1 + dim_plane = 0 + nbatch = 1 + + self_shape = self.shape + if self.dim() == 4: + nbatch = self_shape[0] + dim_w += 1 + dim_h += 1 + dim_plane += 1 + + pad_l, pad_r, pad_t, pad_b = padding + + nplane = self_shape[dim_plane] + input_h = self_shape[dim_h] + input_w = self_shape[dim_w] + output_h = input_h + pad_t + pad_b + output_w = input_w + pad_l + pad_r + + torch._check( + output_w == grad_output.size(dim_w), + lambda: f"grad_output width unexpected. Expected: {output_w}, Got: {grad_output.size(dim_w)}", + ) + torch._check( + output_h == grad_output.size(dim_h), + lambda: f"grad_output height unexpected. 
Expected: {output_h}, Got: {grad_output.size(dim_h)}", + ) + return self.new_empty(self.shape) + + +def _pad3d_common(input, padding, *, is_reflection): + dim_w = 3 + dim_h = 2 + dim_d = 1 + dim_plane = 0 + + _padding_check_valid_input(input, padding, dim=3) + + batch_mode = input.ndim == 5 + if batch_mode: + nbatch = input.size(0) + dim_w += 1 + dim_h += 1 + dim_d += 1 + dim_plane += 1 + + pad_l, pad_r, pad_t, pad_b, pad_f, pad_bk = padding + + nplane = input.size(dim_plane) + input_d = input.size(dim_d) + input_h = input.size(dim_h) + input_w = input.size(dim_w) + output_d = input_d + pad_f + pad_bk + output_h = input_h + pad_t + pad_b + output_w = input_w + pad_l + pad_r + + if is_reflection: + torch._check( + pad_l < input_w and pad_r < input_w, + lambda: ( + f"Argument #4: Padding size should be less than the corresponding input dimension, " + f"but got: padding ({pad_l}, {pad_r}) at dimension {dim_w} of input {input.shape}" + ), + ) + torch._check( + pad_t < input_h and pad_b < input_h, + lambda: ( + f"Argument #6: Padding size should be less than the corresponding input dimension, " + f"but got: padding ({pad_t}, {pad_b}) at dimension {dim_h} of input {input.shape}" + ), + ) + torch._check( + pad_f < input_d and pad_bk < input_d, + lambda: ( + f"Argument #8: Padding size should be less than the corresponding input dimension, " + f"but got: padding ({pad_f}, {pad_bk}) at dimension {dim_d} of input {input.shape}" + ), + ) + + torch._check( + output_w >= 1 or output_h >= 1 or output_d >= 1, + lambda: ( + f"input (D: {input_d} H: {input_h} W: {input_w}) is too small. " + f"Calculated output D: {output_d} H: {output_h} W: {output_w}" + ), + ) + + if batch_mode: + return input.new_empty((nbatch, nplane, output_d, output_h, output_w)) # type: ignore[possibly-undefined] + else: + return input.new_empty((nplane, output_d, output_h, output_w)) + + +@register_meta(aten.reflection_pad3d) +@out_wrapper() +def meta_reflection_pad3d(input, padding): + return _pad3d_common(input, padding, is_reflection=True) + + +@register_meta(aten.replication_pad3d) +@out_wrapper() +def meta_replication_pad3d(input, padding): + return _pad3d_common(input, padding, is_reflection=False) + + +@register_meta( + [ + aten.reflection_pad3d_backward.default, + aten.reflection_pad3d_backward.grad_input, + aten.replication_pad3d_backward.default, + aten.replication_pad3d_backward.grad_input, + ] +) +@out_wrapper("grad_input") +def meta_pad3d_backward(grad_output, input, padding): + torch._check(len(padding) == 6, lambda: "padding size is expected to be 6") + assert input.ndim > 3 + assert grad_output.ndim == input.ndim + + dim_w = 3 + dim_h = 2 + dim_d = 1 + + if input.ndim == 5: + dim_w += 1 + dim_h += 1 + dim_d += 1 + + pad_l, pad_r, pad_t, pad_b, pad_f, pad_bk = padding + + input_d = input.size(dim_d) + input_h = input.size(dim_h) + input_w = input.size(dim_w) + output_d = input_d + pad_f + pad_bk + output_h = input_h + pad_t + pad_b + output_w = input_w + pad_l + pad_r + + torch._check( + output_w == grad_output.size(dim_w), + lambda: f"grad_output width unexpected. Expected: {output_w}, Got: {grad_output.size(dim_w)}", + ) + torch._check( + output_h == grad_output.size(dim_h), + lambda: f"grad_output height unexpected. Expected: {output_h}, Got: {grad_output.size(dim_h)}", + ) + torch._check( + output_d == grad_output.size(dim_d), + lambda: f"grad_output depth unexpected. 
Expected: {output_d}, Got: {grad_output.size(dim_d)}", + ) + + return input.new_empty(input.shape) + + +@register_meta(aten._pdist_forward) +@out_wrapper() +def meta__pdist_forward(self: Tensor, p: float = 2) -> Tensor: + torch._check( + self.is_contiguous(), lambda: "_pdist_forward requires contiguous input" + ) + n = self.size(0) + if n <= 1: + return self.new_empty([0]).to(memory_format=torch.legacy_contiguous_format) # type: ignore[call-overload] + else: + return self.new_empty((n * (n - 1) // 2,)).to( + memory_format=torch.legacy_contiguous_format + ) # type: ignore[call-overload] + + +@register_meta(aten._pdist_backward) +@out_wrapper() +def meta__pdist_backward(grad: Tensor, self: Tensor, p: float, pdist: Tensor) -> Tensor: + torch._check( + self.is_contiguous(), lambda: "_pdist_backward requires self to be contiguous" + ) + torch._check( + pdist.is_contiguous(), lambda: "_pdist_backward requires pdist to be contiguous" + ) + return torch.empty_like(self, memory_format=torch.legacy_contiguous_format) + + +@register_meta([aten.baddbmm.default, aten.baddbmm.out]) +@out_wrapper() +def meta_baddbmm(self, batch1, batch2, *, beta=1, alpha=1): + dim1 = batch1.size(0) + dim2 = batch1.size(1) + dim3 = batch2.size(2) + self = self.expand((dim1, dim2, dim3)) + torch._check(batch1.dim() == 3, lambda: "batch1 must be a 3D tensor") + torch._check(batch2.dim() == 3, lambda: "batch2 must be a 3D tensor") + torch._check( + self.dtype == batch1.dtype == batch2.dtype, + lambda: f"Input dtypes must be the same, got: input: {self.dtype}, batch1: {batch1.dtype}, batch2: {batch2.dtype}", + ) + batch1_sizes = batch1.shape + batch2_sizes = batch2.shape + bs = batch1_sizes[0] + contraction_size = batch1_sizes[2] + torch._check( + batch2_sizes[0] == bs and batch2_sizes[1] == contraction_size, + lambda: ( + f"Expected size for first two dimensions of batch2 tensor to be: " + f"[{bs}, {contraction_size}] but got: [{batch2_sizes[0]}, {batch2_sizes[1]}]." 
+ ), + ) + return self.new_empty(self.size()) + + +@register_meta([aten.bernoulli.default, aten.bernoulli.out]) +@out_wrapper() +def meta_bernoulli(self, *, generator=None): + # https://github.com/pytorch/pytorch/issues/88612 + return torch.empty_like(self).contiguous() + + +@register_meta(aten.bernoulli_.float) +def meta_bernoulli_(self, p=0.5, generator=None): + return self + + +@register_meta(aten.bernoulli.p) +def meta_bernoulli_p(self, p=0.5, generator=None): + # https://github.com/pytorch/pytorch/issues/88612 + return torch.empty_like(self).contiguous() + + +@register_meta(aten._fused_moving_avg_obs_fq_helper.default) +def meta__fused_moving_avg_obs_fq_helper( + self, + observer_on, + fake_quant_on, + running_min, + running_max, + scale, + zero_point, + averaging_const, + quant_min, + quant_max, + ch_axis, + per_row_fake_quant=False, + symmetric_quant=False, +): + torch._check( + ch_axis < self.dim(), + lambda: "Error in fused_moving_avg_obs_fake_quant_cpu: ch_axis must be < self.dim()", + ) + mask = torch.empty_like(self, dtype=torch.bool) + return (torch.empty_like(self), mask) + + +@register_meta(aten.mm) +@out_wrapper() +def meta_mm(a, b): + torch._check(a.dim() == 2, lambda: "a must be 2D") + torch._check(b.dim() == 2, lambda: "b must be 2D") + N, M1 = a.shape + M2, P = b.shape + torch._check( + M1 == M2, + lambda: f"a and b must have same reduction dim, but got [{N}, {M1}] X [{M2}, {P}].", + ) + return a.new_empty(N, P) + + +def _compute_reduction_shape(self, dims, keepdim): + if keepdim: + return tuple(self.shape[i] if i not in dims else 1 for i in range(self.ndim)) + + return utils.compute_reduction_output_shape(self.shape, dims) + + +# FakeTensors (meta tensors with a device) will report device as meta +# when running meta kernels. Here, access the "fake device" of FakeTensor if it +# exists so meta kernels which have diverge per device will be more +# accurate when run with FakeTensors +def device_hint(tensor) -> "str": + if isinstance(tensor, torch._subclasses.FakeTensor): + return tensor.fake_device.type + else: + return "cuda" # default to cuda + + +def calc_conv_nd_return_shape( + input_tensor: torch.Tensor, + weight: torch.Tensor, + stride: Union[List[int], int], + padding: Union[List[int], int], + dilation: Union[List[int], int], + is_transposed: bool, + groups: int, + output_padding: Optional[Union[List[int], int]] = None, +): + def _formula(ln: int, p: int, d: int, k: int, s: int) -> int: + """ + Formula to apply to calculate the length of some dimension of the output + + See: https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html + + Args: + ln: length of the dimension + p: padding in that dim + d: dilation in that dim + k: kernel size in that dim + s: stride in that dim + Returns: + The output length + """ + return (ln + 2 * p - d * (k - 1) - 1) // s + 1 + + def _formula_transposed(ln: int, p: int, d: int, k: int, s: int, op: int) -> int: + """ + Formula to apply to calculate the length of some dimension of the output + if transposed convolution is used. 
+ See: https://pytorch.org/docs/stable/generated/torch.nn.ConvTranspose2d.html + + Args: + ln: length of the dimension + p: padding in that dim + d: dilation in that dim + k: kernel size in that dim + s: stride in that dim + op: output padding in that dim + + Returns: + The output length + """ + return (ln - 1) * s - 2 * p + d * (k - 1) + op + 1 + + kernel_size = weight.shape[2:] + dims = input_tensor.shape[2:] + if is_transposed: + out_channels = groups * weight.shape[1] + else: + out_channels = weight.shape[0] + if weight.shape[1] * groups != input_tensor.shape[1]: + raise RuntimeError("Invalid channel dimensions") + + ret_shape = [input_tensor.shape[0], out_channels] + if isinstance(stride, IntLike): + stride = [stride] * len(dims) + elif len(stride) == 1: + stride = [stride[0]] * len(dims) + + if isinstance(padding, IntLike): + padding = [padding] * len(dims) + elif len(padding) == 1: + padding = [padding[0]] * len(dims) + + if isinstance(dilation, IntLike): + dilation = [dilation] * len(dims) + elif len(dilation) == 1: + dilation = [dilation[0]] * len(dims) + + output_padding_list: Optional[List[int]] = None + if output_padding: + if isinstance(output_padding, IntLike): + output_padding_list = [output_padding] * len(dims) + elif len(output_padding) == 1: + output_padding_list = [output_padding[0]] * len(dims) + else: + output_padding_list = output_padding + + for i in range(len(dims)): + # If output_padding is present, we are dealing with a transposed convolution + if output_padding_list: + ret_shape.append( + _formula_transposed( + dims[i], + padding[i], + dilation[i], + kernel_size[i], + stride[i], + output_padding_list[i], + ) + ) + else: + ret_shape.append( + _formula(dims[i], padding[i], dilation[i], kernel_size[i], stride[i]) + ) + + return ret_shape + + +def is_channels_last(ten): + return torch._prims_common.suggest_memory_format(ten) == torch.channels_last + + +@register_meta(aten.convolution.default) +def meta_conv( + input_tensor: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: List[int], + padding: List[int], + dilation: List[int], + is_transposed: bool, + output_padding: List[int], + groups: int, +): + def pick_memory_format(): + if device_hint(input_tensor) == "cuda": + if is_channels_last(input_tensor) or is_channels_last(weight): + return torch.channels_last + else: + if is_channels_last(input_tensor): + return torch.channels_last + if input_tensor.is_contiguous(memory_format=torch.contiguous_format): + return torch.contiguous_format + elif input_tensor.is_contiguous(memory_format=torch.preserve_format): + return torch.preserve_format + + shape_out = calc_conv_nd_return_shape( + input_tensor, + weight, + stride, + padding, + dilation, + is_transposed, + groups, + output_padding if is_transposed else None, + ) + + input_channels_dim = 1 + output_channels_dim = 1 + if input_tensor.size(input_channels_dim) == 0: + shape_out[output_channels_dim] = 0 + + out = input_tensor.new_empty(shape_out) + out = out.to(memory_format=pick_memory_format()) # type: ignore[call-overload] + return out + + +if torch._C._has_mkldnn: + _meta_lib_dont_use_me_use_register_meta_for_mkldnn = torch.library.Library( + "mkldnn", "IMPL", "Meta" + ) + + @register_meta(torch.ops.mkldnn._convolution_pointwise.default) + def meta_mkldnn_convolution_default( + input_tensor, + weight, + bias, + padding, + stride, + dilation, + groups, + attr, + scalars, + algorithm, + ): + shape_out = calc_conv_nd_return_shape( + input_tensor, weight, stride, padding, dilation, False, groups, [] + ) + 
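+        # Illustrative check of the non-transposed branch of _formula above (example numbers,
+        # not from the source): a spatial extent of 32 with padding 1, dilation 1, kernel 3 and
+        # stride 2 gives (32 + 2*1 - 1*(3 - 1) - 1) // 2 + 1 = 16 for that entry of shape_out.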
out = input_tensor.new_empty(shape_out) + out_memory_format = torch.channels_last + out = out.to(memory_format=out_memory_format) # type: ignore[call-overload] + return out + + @register_meta(torch.ops.mkldnn._linear_pointwise.default) + def meta_linear_pointwise_default( + input_tensor, weight, bias, attr, scalars, algorithm + ): + return input_tensor.new_empty((*input_tensor.shape[:-1], weight.shape[0])) + + if torch._C.has_mkl: + _meta_lib_dont_use_me_use_register_meta_for_mkl = torch.library.Library( + "mkl", "IMPL", "Meta" + ) + + @register_meta(torch.ops.mkl._mkl_linear) + def meta_mkl_linear( + input_tensor, + packed_weight, + orig_weight, + bias, + batch_size, + ): + return input_tensor.new_empty( + (*input_tensor.shape[:-1], orig_weight.shape[0]) + ) + + _meta_lib_dont_use_me_use_register_meta_for_onednn = torch.library.Library( + "onednn", "IMPL", "Meta" + ) + + @register_meta(torch.ops.onednn.qconv2d_pointwise.default) + def meta_qconv2d_pointwise( + x, + x_scale, + x_zp, + w, # prepacked_weight + w_scale, + w_zp, + bias, + stride, + padding, + dilation, + groups, + output_scale, + output_zero_point, + output_dtype, + attr, + scalars, + algorithm, + ): + shape_out = calc_conv_nd_return_shape( + x, + w, + stride, + padding, + dilation, + False, + groups, + None, + ) + assert output_dtype in [torch.float32, torch.bfloat16] + out = x.new_empty(shape_out, dtype=output_dtype) + out = out.to(memory_format=torch.channels_last) + return out + + @register_meta(torch.ops.onednn.qlinear_pointwise.default) + @register_meta(torch.ops.onednn.qlinear_pointwise.tensor) + def meta_qlinear_pointwise( + x, + x_scale, + x_zp, + w, + w_scale, + w_zp, + bias, + output_scale, + output_zero_point, + output_dtype, + post_op_name, + post_op_args, + post_op_algorithm, + ): + output_shape = list(x.shape) + # The weight has been transposed during the qlinear weight prepack process. + output_shape[-1] = w.shape[1] + assert output_dtype in [torch.float32, torch.bfloat16] + out = x.new_empty(output_shape, dtype=output_dtype) + return out + + _meta_lib_dont_use_me_use_register_meta_for_quantized = torch.library.Library( + "quantized", "IMPL", "Meta" + ) + + @register_meta(torch.ops.quantized.max_pool2d) + def meta_quantized_max_pool2d( + input, + kernel_size, + stride=(), + padding=(0,), + dilation=(1,), + ceil_mode=False, + ): + ( + nInputPlane, + outputHeight, + outputWidth, + ) = max_pool2d_checks_and_compute_shape( + input, kernel_size, stride, padding, dilation, ceil_mode + ) + nbatch = input.size(-4) if input.dim() == 4 else 1 + memory_format = torch.channels_last + if input.dim() == 3: + size = [nInputPlane, outputHeight, outputWidth] + else: + size = [nbatch, nInputPlane, outputHeight, outputWidth] + return torch.empty( + size, + dtype=input.dtype, + device=input.device, + memory_format=memory_format, + ) + + +# from check_dim_size() in aten/src/ATen/TensorUtils.cpp. 
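+# Illustrative example with hypothetical values: for t of shape (2, 3, 4), check_dim_size(t, 3, 1, 3)
+# passes (rank 3, size 3 at dim 1), while check_dim_size(t, 3, 1, 5) trips the torch._check below.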
+def check_dim_size(tensor, dim, dim_size, size): + torch._check( + tensor.dim() == dim and tensor.shape[dim_size] == size, + lambda: f"Expected a tensor of dimension {dim} and tensor.size[{dim_size}] == {size}, " + + f"but got : dimension {tensor.dim()} and tensor.size[{dim_size}] = {tensor.shape[dim_size]}", + ) + + +@register_meta(aten.avg_pool2d.default) +def meta_avg_pool2d( + input, + kernel_size, + stride=(), + padding=(0,), + ceil_mode=False, + count_include_pad=True, + divisor_override=None, +): + def unpack(name, val): + torch._check( + len(val) in [1, 2], + lambda: f"avg_pool2d: {name} must either be a single int, or a tuple of two ints", + ) + H = val[0] + W = H if len(val) == 1 else val[1] + return H, W + + kH, kW = unpack("kernel_size", kernel_size) + torch._check( + len(stride) in [0, 1, 2], + lambda: "avg_pool2d: stride must either be omitted, a single int, or a tuple of two ints", + ) + if len(stride) == 0: + dH, dW = kH, kW + elif len(stride) == 1: + dH, dW = stride[0], stride[0] + else: + dH, dW = unpack("stride", stride) + + padH, padW = unpack("padding", padding) + + torch._check( + divisor_override is None or divisor_override != 0, + lambda: "divisor must be not zero", + ) + + nbatch = input.size(-4) if input.dim() == 4 else 1 + nInputPlane = input.size(-3) + inputHeight = input.size(-2) + inputWidth = input.size(-1) + + outputHeight = pooling_output_shape(inputHeight, kH, padH, dH, 1, ceil_mode) + outputWidth = pooling_output_shape(inputWidth, kW, padW, dW, 1, ceil_mode) + + memory_format = utils.suggest_memory_format(input) + pool2d_shape_check( + input, + kH, + kW, + dH, + dW, + padH, + padW, + 1, + 1, + nInputPlane, + inputHeight, + inputWidth, + outputHeight, + outputWidth, + memory_format, + ) + + if input.dim() == 3: + size = [nInputPlane, outputHeight, outputWidth] + else: + size = [nbatch, nInputPlane, outputHeight, outputWidth] + return torch.empty( + size, + dtype=input.dtype, + device=input.device, + memory_format=memory_format, + ) + + +# from avg_pool2d_backward_shape_check() in aten/src/ATen/native/Pool.h. +def avg_pool2d_backward_shape_check( + input, + gradOutput, + nbatch, + kH, + kW, + dH, + dW, + padH, + padW, + nInputPlane, + inputHeight, + inputWidth, + outputHeight, + outputWidth, + mem_format, +): + pool2d_shape_check( + input, + kH, + kW, + dH, + dW, + padH, + padW, + 1, + 1, + nInputPlane, + inputHeight, + inputWidth, + outputHeight, + outputWidth, + mem_format, + ) + + ndim = input.dim() + nOutputPlane = nInputPlane + + check_dim_size(gradOutput, ndim, ndim - 3, nOutputPlane) + check_dim_size(gradOutput, ndim, ndim - 2, outputHeight) + check_dim_size(gradOutput, ndim, ndim - 1, outputWidth) + + +# Don't override the C++ registration. +@register_meta(aten.avg_pool2d_backward.default) +def meta_avg_pool2d_backward( + gradOutput_, + input, + kernel_size, + stride, + padding, + ceil_mode, + count_include_pad, + divisor_override, +): + # From aten/src/ATen/native/AveragePool2d.cpp structured kernel meta func. 
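+    # The unpacking below appears to mirror the eager defaults: kernel_size=[3] expands to
+    # kH = kW = 3, an empty stride falls back to the kernel size (dH, dW = kH, kW), and
+    # padding=[1] expands to padH = padW = 1.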
+ torch._check( + len(kernel_size) == 1 or len(kernel_size) == 2, + lambda: "avg_pool2d: kernel_size must either be a single int, or a tuple of two ints", + ) + kH = kernel_size[0] + kW = kH if len(kernel_size) == 1 else kernel_size[1] + torch._check( + len(stride) == 0 or len(stride) == 1 or len(stride) == 2, + lambda: "avg_pool2d: stride must either be omitted, a single int, or a tuple of two ints", + ) + dH = kH if len(stride) == 0 else stride[0] + dW = kW if len(stride) == 0 else dH if len(stride) == 1 else stride[1] + torch._check( + len(padding) == 1 or len(padding) == 2, + lambda: "avg_pool2d: padding must either be a single int, or a tuple of two ints", + ) + padH = padding[0] + padW = padH if len(padding) == 1 else padding[1] + + torch._check( + divisor_override is None or divisor_override != 0, + lambda: "divisor must be not zero", + ) + + input_size = input.shape + nbatch = input_size[-4] if input.dim() == 4 else 1 + nInputPlane = input_size[-3] + inputHeight = input_size[-2] + inputWidth = input_size[-1] + + outputHeight = pooling_output_shape(inputHeight, kH, padH, dH, 1, ceil_mode) + outputWidth = pooling_output_shape(inputWidth, kW, padW, dW, 1, ceil_mode) + + mem_format = utils.suggest_memory_format(input) + + avg_pool2d_backward_shape_check( + input, + gradOutput_, + nbatch, + kH, + kW, + dH, + dW, + padH, + padW, + nInputPlane, + inputHeight, + inputWidth, + outputHeight, + outputWidth, + mem_format, + ) + + return torch.empty( + input_size, + dtype=input.dtype, + device=input.device, + memory_format=mem_format, + ) + + +@register_meta(aten.avg_pool3d) +@out_wrapper() +def meta_avg_pool3d( + input, + kernel_size, + stride=(), + padding=(0,), + ceil_mode=False, + count_include_pad=True, + divisor_override=None, +): + torch._check( + len(kernel_size) in (1, 3), + lambda: "avg_pool3d: kernel_size must be a single int, or a tuple of three ints", + ) + kT = kernel_size[0] + kH = kT if len(kernel_size) == 1 else kernel_size[1] + kW = kT if len(kernel_size) == 1 else kernel_size[2] + + torch._check( + not stride or len(stride) in (1, 3), + lambda: "avg_pool3d: stride must be omitted, a single int, or a tuple of three ints", + ) + dT = kT if not stride else stride[0] + dH = kH if not stride else (dT if len(stride) == 1 else stride[1]) + dW = kW if not stride else (dT if len(stride) == 1 else stride[2]) + + torch._check( + len(padding) in (1, 3), + lambda: "avg_pool3d: padding must be a single int, or a tuple of three ints", + ) + padT = padding[0] + padH = padT if len(padding) == 1 else padding[1] + padW = padT if len(padding) == 1 else padding[2] + + torch._check( + input.ndim in (4, 5), + lambda: "non-empty 4D or 5D (batch mode) tensor expected for input", + ) + + torch._check( + not divisor_override or divisor_override != 0, + lambda: "divisor must be not zero", + ) + + nbatch = input.size(0) + nslices = input.size(-4) + itime = input.size(-3) + iheight = input.size(-2) + iwidth = input.size(-1) + + otime = pooling_output_shape(itime, kT, padT, dT, 1, ceil_mode) + oheight = pooling_output_shape(iheight, kH, padH, dH, 1, ceil_mode) + owidth = pooling_output_shape(iwidth, kW, padW, dW, 1, ceil_mode) + + pool3d_shape_check( + input, + nslices, + kT, + kH, + kW, + dT, + dH, + dW, + padT, + padH, + padW, + 1, + 1, + 1, + itime, + iheight, + iwidth, + otime, + oheight, + owidth, + "avg_pool3d()", + check_input_size=True, + ) + + if input.ndim == 4: + return input.new_empty((nslices, otime, oheight, owidth)) + else: + return input.new_empty((nbatch, nslices, otime, oheight, owidth)) 
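+
+# Quick sanity check of pooling_output_shape as used by the avg_pool3d metas above and below
+# (illustrative numbers): itime=7, kT=3, padT=1, dT=2, dilation 1, ceil_mode=False gives
+# (7 + 2*1 - 1*(3 - 1) - 1) // 2 + 1 = 4 output frames, the standard floor-mode pooling result.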
+ + +@register_meta(aten.avg_pool3d_backward) +@out_wrapper("grad_input") +def meta_avg_pool3d_backward( + grad_output, + input, + kernel_size, + stride, + padding, + ceil_mode, + count_include_pad, + divisor_override, +): + torch._check( + len(kernel_size) in (1, 3), + lambda: "avg_pool3d: kernel_size must be a single int, or a tuple of three ints", + ) + kT = kernel_size[0] + kH = kT if len(kernel_size) == 1 else kernel_size[1] + kW = kT if len(kernel_size) == 1 else kernel_size[2] + + torch._check( + not stride or len(stride) in (1, 3), + lambda: "avg_pool3d: stride must be omitted, a single int, or a tuple of three ints", + ) + dT = kT if not stride else stride[0] + dH = kH if not stride else (dT if len(stride) == 1 else stride[1]) + dW = kW if not stride else (dT if len(stride) == 1 else stride[2]) + + torch._check( + len(padding) in (1, 3), + lambda: "avg_pool3d: padding must be a single int, or a tuple of three ints", + ) + padT = padding[0] + padH = padT if len(padding) == 1 else padding[1] + padW = padT if len(padding) == 1 else padding[2] + + torch._check( + input.ndim in (4, 5), + lambda: "non-empty 4D or 5D (batch mode) tensor expected for input", + ) + + torch._check( + not divisor_override or divisor_override != 0, + lambda: "divisor must be not zero", + ) + + nslices = input.size(-4) + itime = input.size(-3) + iheight = input.size(-2) + iwidth = input.size(-1) + + otime_for_shape_check = pooling_output_shape(itime, kT, padT, dT, 1, ceil_mode) + oheight_for_shape_check = pooling_output_shape(iheight, kH, padH, dH, 1, ceil_mode) + owidth_for_shape_check = pooling_output_shape(iwidth, kW, padW, dW, 1, ceil_mode) + + avg_pool3d_backward_shape_check( + input, + grad_output, + nslices, + kT, + kH, + kW, + dT, + dH, + dW, + padT, + padH, + padW, + itime, + iheight, + iwidth, + otime_for_shape_check, + oheight_for_shape_check, + owidth_for_shape_check, + "avg_pool3d_backward()", + ) + + return input.new_empty(input.shape) + + +@register_meta(aten._adaptive_avg_pool2d.default) +def meta_adaptive_avg_pool2d(self, output_size): + torch._check( + self.ndim == 3 or self.ndim == 4, + lambda: f"Expected 3D or 4D tensor, but got {self.shape}", + ) + output_shape = self.shape[:-2] + tuple(output_size) + memory_format = utils.suggest_memory_format(self) + # need to set memory_format to preserve the memory format of the input + # channel last input should have channel last output + return torch.empty( + output_shape, + dtype=self.dtype, + device=self.device, + memory_format=memory_format, + ) + + +@register_meta(aten._adaptive_avg_pool3d.default) +def meta_adaptive_avg_pool3d(self, output_size): + torch._check( + self.ndim == 4 or self.ndim == 5, + lambda: f"Expected 4D or 5D tensor, but got {self.shape}", + ) + return self.new_empty(self.shape[:-3] + tuple(output_size)) + + +@register_meta(aten._adaptive_avg_pool2d_backward.default) +def meta__adaptive_avg_pool2d_backward(grad_out, self): + ndim = grad_out.ndim + for i in range(1, ndim): + torch._check( + grad_out.size(i) > 0, + lambda: f"adaptive_avg_pool2d_backward(): Expected grad_output to have non-zero \ + size for non-batch dimensions, {grad_out.shape} with dimension {i} being empty", + ) + torch._check( + ndim == 3 or ndim == 4, + lambda: f"adaptive_avg_pool2d_backward(): Expected 3D or 4D tensor, but got {self.shape}", + ) + torch._check( + self.dtype == grad_out.dtype, + lambda: f"expected dtype {self.dtype} for `grad_output` but got dtype {grad_out.dtype}", + ) + memory_format = torch.contiguous_format + if is_channels_last(self): + 
memory_format = torch.channels_last + return self.new_empty(self.shape).to(memory_format=memory_format) + + +@register_meta(aten._adaptive_avg_pool3d_backward) +@out_wrapper("grad_input") +def meta__adaptive_avg_pool3d_backward(grad_output, self): + _adaptive_pool_empty_output_check(grad_output, "adaptive_avg_pool3d_backward") + return torch.empty_like(self, memory_format=torch.legacy_contiguous_format) + + +def _adaptive_pool_empty_output_check(grad_output: Tensor, arg_name: str): + ndim = grad_output.ndim + for i in range(1, ndim): + torch._check( + grad_output.size(i) > 0, + lambda: ( + f"{arg_name}(): Expected grad_output to have non-zero size for non-batch dimensions, " + f"but grad_output has sizes {grad_output.shape} with dimension {i} being empty" + ), + ) + + +@register_meta(aten.adaptive_max_pool2d) +@out_wrapper("out", "indices") +def meta_adaptive_max_pool2d(input, output_size): + ndim = input.ndim + torch._check( + ndim in (3, 4), + lambda: f"adaptive_max_pool2d(): Expected 3D or 4D tensor, but got: {input.shape}", + ) + for i in range(1, ndim): + torch._check( + input.size(i) > 0, + lambda: ( + f"adaptive_max_pool2d(): Expected input to have non-zero size for non-batch dimensions, " + f"but input has sizes {input.shape} with dimension {i} being empty" + ), + ) + + torch._check( + len(output_size) == 2, + lambda: "adaptive_max_pool2d(): internal error: output_size.size() must be 2", + ) + + dimH = 1 + sizeB = 1 + sizeD = 0 + + if input.ndim == 4: + sizeB = input.size(0) + dimH += 1 + + sizeD = input.size(dimH - 1) + osizeH, osizeW = output_size + + if input.ndim == 3: + out_shape = (sizeD, osizeH, osizeW) + out = input.new_empty(out_shape) + indices = input.new_empty(out_shape, dtype=torch.int64) + return out, indices + else: + out_shape = (sizeB, sizeD, osizeH, osizeW) # type: ignore[assignment] + memory_format = utils.suggest_memory_format(input) + out = input.new_empty(out_shape).to(memory_format=memory_format) + indices = input.new_empty(out_shape, dtype=torch.int64).to( + memory_format=memory_format + ) + return out, indices + + +@register_meta(aten.adaptive_max_pool2d_backward) +@out_wrapper("grad_input") +def meta_adaptive_max_pool2d_backward(grad_output, input, indices): + ndim = grad_output.ndim + torch._check( + ndim in (3, 4), + lambda: f"adaptive_max_pooling2d_backward(): Expected 3D or 4D grad_output, but got: {grad_output.shape}", + ) + + _adaptive_pool_empty_output_check(grad_output, "adaptive_max_pool2d_backward") + + torch._check( + input.dtype == grad_output.dtype, + lambda: f"expected dtype {input.dtype} for `grad_output` but got dtype {grad_output.dtype}", + ) + + memory_format = utils.suggest_memory_format(input) + return input.new_empty(input.shape).to(memory_format=memory_format) + + +@register_meta(aten.adaptive_max_pool3d) +@out_wrapper("out", "indices") +def meta_adaptive_max_pool3d(input, output_size): + ndim = input.ndim + torch._check( + ndim in (4, 5), + lambda: f"adaptive_max_pool3d(): Expected 4D or 5D tensor, but got: {input.shape}", + ) + for i in range(1, ndim): + torch._check( + input.size(i) > 0, + lambda: ( + f"adaptive_max_pool3d(): Expected input to have non-zero size for non-batch dimensions, " + f"but input has sizes {input.shape} with dimension {i} being empty" + ), + ) + + torch._check( + len(output_size) == 3, + lambda: "adaptive_max_pool3d(): internal error: output_size.size() must be 3", + ) + + dimD = 0 + sizeB = 1 + sizeD = 0 + + if ndim == 5: + sizeB = input.size(0) + dimD += 1 + + sizeD = input.size(dimD) + osizeT, osizeH, 
osizeW = output_size + + if ndim == 4: + out_shape = (sizeD, osizeT, osizeH, osizeW) + else: + out_shape = (sizeB, sizeD, osizeT, osizeH, osizeW) # type: ignore[assignment] + + out = input.new_empty(out_shape) + indices = input.new_empty(out_shape, dtype=torch.int64) + + return out, indices + + +@register_meta(aten.adaptive_max_pool3d_backward) +@out_wrapper("grad_input") +def meta_adaptive_max_pool3d_backward(grad_output, input, indices): + _adaptive_pool_empty_output_check(grad_output, "adaptive_max_pool3d_backward") + return input.new_empty(input.shape) + + +@register_meta(aten.repeat_interleave.Tensor) +def meta_repeat_interleave_Tensor(repeats, output_size=None): + if output_size is None: + raise RuntimeError("cannot repeat_interleave a meta tensor without output_size") + return repeats.new_empty(output_size) + + +@register_meta([aten.complex.default, aten.complex.out]) +@out_wrapper() +def meta_complex(real, imag): + assert real.dtype.is_floating_point + assert imag.dtype.is_floating_point + out_shape = _broadcast_shapes(real.shape, imag.shape) + return real.new_empty(out_shape, dtype=corresponding_complex_dtype(real.dtype)) + + +@register_meta([aten.nonzero_static.default, aten.nonzero_static.out]) +@out_wrapper() +def nonzero_static(self, *, size: int, fill_value: int = -1): + return self.new_empty((size, self.dim()), dtype=torch.long) + + +@register_meta([aten.index.Tensor, aten._unsafe_index.Tensor]) +def meta_index_Tensor(self, indices): + torch._check(bool(indices), lambda: "at least one index must be provided") + # aten::index is the internal advanced indexing implementation + # checkIndexTensorTypes and expandTensors + result: List[Optional[Tensor]] = [] + for i, index in enumerate(indices): + if index is not None: + torch._check( + index.dtype in [torch.long, torch.int, torch.int8, torch.bool], + lambda: "tensors used as indices must be long, int, byte or bool tensors", + ) + if index.dtype in [torch.int8, torch.bool]: + nonzero = index.nonzero() + k = len(result) + torch._check_index( + k + index.ndim <= self.ndim, + lambda: f"too many indices for tensor of dimension {self.ndim}", + ) + for j in range(index.ndim): + torch._check_index( + index.shape[j] == self.shape[k + j], + lambda: f"The shape of the mask {index.shape} at index {i} " + f"does not match the shape of the indexed tensor {self.shape} at index {k + j}", + ) + result.append(nonzero.select(1, j)) + else: + result.append(index) + else: + result.append(index) + indices = result + torch._check( + len(indices) <= self.ndim, + lambda: f"too many indices for tensor of dimension {self.ndim} (got {len(indices)})", + ) + # expand_outplace + import torch._refs as refs # avoid import cycle in mypy + + indices = list(refs._maybe_broadcast(*indices)) + # add missing null tensors + while len(indices) < self.ndim: + indices.append(None) + + # hasContiguousSubspace + # true if all non-null tensors are adjacent + # See: + # https://numpy.org/doc/stable/user/basics.indexing.html#combining-advanced-and-basic-indexing + # https://stackoverflow.com/questions/53841497/why-does-numpy-mixed-basic-advanced-indexing-depend-on-slice-adjacency + state = 0 + has_contiguous_subspace = False + for index in indices: + if state == 0: + if index is not None: + state = 1 + elif state == 1: + if index is None: + state = 2 + else: + if index is not None: + break + else: + has_contiguous_subspace = True + + # transposeToFront + # This is the logic that causes the newly inserted dimensions to show up + # at the beginning of the tensor, if they're not 
contiguous + if not has_contiguous_subspace: + dims = [] + transposed_indices = [] + for i, index in enumerate(indices): + if index is not None: + dims.append(i) + transposed_indices.append(index) + for i, index in enumerate(indices): + if index is None: + dims.append(i) + transposed_indices.append(index) + self = self.permute(dims) + indices = transposed_indices + + # AdvancedIndex::AdvancedIndex + # Now we can assume the indices have contiguous subspace + # This is simplified from AdvancedIndex which goes to more effort + # to put the input and indices in a form so that TensorIterator can + # take them. If we write a ref for this, probably that logic should + # get implemented + before_shape: List[int] = [] + after_shape: List[int] = [] + replacement_shape: List[int] = [] + for dim, index in enumerate(indices): + if index is None: + if replacement_shape: + after_shape.append(self.shape[dim]) + else: + before_shape.append(self.shape[dim]) + else: + replacement_shape = list(index.shape) + return self.new_empty(before_shape + replacement_shape + after_shape) + + +@register_meta([aten.convolution_backward.default]) +def meta_convolution_backward( + grad_output_, + input_, + weight_, + bias_sizes_opt, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + output_mask, +): + # High level logic taken from slow_conv3d_backward_cpu which should + # be representative of all convolution_backward impls + backend_grad_input = None + backend_grad_weight = None + backend_grad_bias = None + + if output_mask[0]: + backend_grad_input = grad_output_.new_empty(input_.size()) + if output_mask[1]: + backend_grad_weight = grad_output_.new_empty(weight_.size()) + if output_mask[2]: + backend_grad_bias = grad_output_.new_empty(bias_sizes_opt) + + return (backend_grad_input, backend_grad_weight, backend_grad_bias) + + +@register_meta([aten.addbmm.default, aten.addbmm.out]) +@out_wrapper() +def meta_addbmm(self, batch1, batch2, *, beta=1, alpha=1): + dim1 = batch1.size(1) + dim2 = batch2.size(2) + self = self.expand((dim1, dim2)) + torch._check(batch1.dim() == 3, lambda: "batch1 must be a 3D tensor") + torch._check(batch2.dim() == 3, lambda: "batch2 must be a 3D tensor") + torch._check( + batch1.size(0) == batch2.size(0), + lambda: f"batch1 and batch2 must have same number of batches, got {batch1.size(0)} and {batch2.size(0)}", + ) + torch._check( + batch1.size(2) == batch2.size(1), + lambda: ( + f"Incompatible matrix sizes for bmm ({batch1.size(1)}x{batch1.size(2)} " + f"and {batch2.size(1)}x{batch2.size(2)})" + ), + ) + torch._check( + self.size(0) == dim1 and self.size(1) == dim2, + lambda: "self tensor does not match matmul output shape", + ) + return self.new_empty(self.size()) + + +def register_meta_foreach(ops): + def wrapper(fn): + def register(op): + op_name = str(op).split(".")[1] + scalar_op = getattr(aten, op_name.replace("_foreach_", "")) + + _add_op_to_registry( + meta_table, + op, + partial( + fn, + _scalar_op=scalar_op, + ), + ) + + pytree.tree_map_(register, ops) + return fn + + return wrapper + + +@register_meta_foreach( + [ + aten._foreach_abs, + aten._foreach_acos, + aten._foreach_asin, + aten._foreach_atan, + aten._foreach_ceil, + aten._foreach_cos, + aten._foreach_cosh, + aten._foreach_erf, + aten._foreach_erfc, + aten._foreach_exp, + aten._foreach_expm1, + aten._foreach_frac, + aten._foreach_floor, + aten._foreach_lgamma, + aten._foreach_log, + aten._foreach_log10, + aten._foreach_log1p, + aten._foreach_log2, + aten._foreach_neg, + aten._foreach_norm, + 
aten._foreach_reciprocal, + aten._foreach_round, + aten._foreach_sigmoid, + aten._foreach_sign, + aten._foreach_sin, + aten._foreach_sinh, + aten._foreach_sqrt, + aten._foreach_tan, + aten._foreach_tanh, + aten._foreach_trunc, + aten._foreach_zero, + aten._foreach_add, + aten._foreach_sub, + aten._foreach_mul, + aten._foreach_div, + aten._foreach_clamp_min, + aten._foreach_clamp_max, + aten._foreach_lerp, + ], +) +def _meta_foreach_out_of_place(*args, _scalar_op=None, **kwargs): + torch._check( + isinstance(args[0], list), + lambda: (f"The first argument must be List[Tensor], but got {type(args[0])}."), + ) + + nelem = len(args[0]) + torch._check( + nelem > 0, + lambda: ("Tensor list must have at least one tensor."), + ) + + nlists = 1 + for iarg, arg in enumerate(args[1:]): + if isinstance(arg, list): + nlists += 1 + torch._check( + len(arg) == nelem, + lambda: ( + f"self and argument-{iarg+2} must match in length, " + f"but got {nelem} and {len(arg)}." + ), + ) + elif isinstance(arg, Tensor): + torch._check( + arg.dim() == 0 and arg.numel() == 1, + lambda: ( + "scalar tensor expected to be 0 dim but it has " + f"{arg.dim()} dimensions and {arg.numel()} elements." + ), + ) + else: + break + + result = [] + for elem in range(nelem): + each_args = [args[i][elem] for i in range(nlists)] + result.append(_scalar_op(*each_args, *args[nlists:], **kwargs)) + + return result + + +@register_meta_foreach( + [ + aten._foreach_abs_, + aten._foreach_acos_, + aten._foreach_asin_, + aten._foreach_atan_, + aten._foreach_ceil_, + aten._foreach_cos_, + aten._foreach_cosh_, + aten._foreach_erf_, + aten._foreach_erfc_, + aten._foreach_exp_, + aten._foreach_expm1_, + aten._foreach_frac_, + aten._foreach_floor_, + aten._foreach_lgamma_, + aten._foreach_log_, + aten._foreach_log10_, + aten._foreach_log1p_, + aten._foreach_log2_, + aten._foreach_neg_, + aten._foreach_reciprocal_, + aten._foreach_round_, + aten._foreach_sigmoid_, + aten._foreach_sign_, + aten._foreach_sin_, + aten._foreach_sinh_, + aten._foreach_sqrt_, + aten._foreach_tan_, + aten._foreach_tanh_, + aten._foreach_trunc_, + aten._foreach_zero_, + aten._foreach_add_, + aten._foreach_sub_, + aten._foreach_mul_, + aten._foreach_div_, + aten._foreach_clamp_min_, + aten._foreach_clamp_max_, + aten._foreach_lerp_, + aten._foreach_copy_, + ] +) +def _meta_foreach_inplace(*args, _scalar_op=None, **kwargs): + _meta_foreach_out_of_place(*args, _scalar_op=_scalar_op, **kwargs) + return + + +@register_meta([aten._foreach_pow.ScalarAndTensor]) +def meta__foreach_pow_scalar_and_tensor(self, exponent): + # Only foreach_pow has a ScalarAndTensor method and needs special + # handling because it does not work with _meta_foreach_out_of_place. + torch._check( + isinstance(exponent, List), + lambda: f"exponent must be a tensor list but got {type(exponent)}", + ) + return [torch.empty_like(e) for e in exponent] + + +def _check_foreach_binop_tensor_lists(self, other): + torch._check( + isinstance(self, List) and isinstance(other, List), + lambda: ( + "The first two arguments of must be List[Tensor], " + f"but got {type(self)} and {type(other)}." + ), + ) + torch._check( + len(self) > 0 and len(self) == len(other), + lambda: ( + "self and other must be non-empty and match in length, " + f"but got {len(self)} and {len(other)}." + ), + ) + + +@register_meta( + [ + aten._foreach_maximum, + aten._foreach_minimum, + ] +) +def meta__foreach_binop_scalar(*args): + # aten.maximum(Tensor, Scalar) does not exist. 
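+    # clamp_min is presumably only a stand-in elementwise op here: the meta result is never
+    # computed, so any binary op with the same broadcasting and type-promotion rules produces
+    # the right output shapes/dtypes for both _foreach_maximum and _foreach_minimum.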
+ return _meta_foreach_out_of_place(*args, _scalar_op=aten.clamp_min) + + +@register_meta( + [ + aten._foreach_maximum_, + aten._foreach_minimum_, + ] +) +def meta__foreach_binop__scalar(*args): + # aten.maximum(Tensor, Scalar) does not exist + _meta_foreach_inplace(*args, _scalar_op=aten.clamp_min_) + return + + +@register_meta( + [ + aten._foreach_addcdiv.Scalar, + aten._foreach_addcmul.Scalar, + ] +) +def meta__foreach_addcop_scalar(self, tensor1, tensor2, scalar=1): + # forach_addcdiv and addcdiv have different signatures and + # cannot use _meta_foreach_out_of_place. + torch._check( + all(isinstance(l, List) for l in [self, tensor1, tensor2]), + lambda: ( + "All arguments must be List[Tensor], " + f"but got {type(self)}, {type(tensor1)}, and {type(tensor2)}" + ), + ) + torch._check(len(self) > 0, lambda: "input tensor list must not be empty.") + torch._check( + len(self) == len(tensor1) and len(self) == len(tensor2), + lambda: "All input tensor lists must have the same length", + ) + + return [torch.empty_like(s) for s in self] + + +@register_meta([aten._foreach_addcdiv_.Tensor, aten._foreach_addcmul_.Tensor]) +def meta__foreach_addcop_tensor(self, tensor1, tensor2, scalars): + torch._check( + all(isinstance(l, List) for l in [self, tensor1, tensor2]) + and isinstance(scalars, torch.Tensor), + lambda: ( + "_foreach_addc*_ op expects arguments of type: List[Tensor], List[Tensor], List[Tensor], tensor, " + f"but got: {type(self)}, {type(tensor1)}, {type(tensor2)}, and {type(scalars)}" + ), + ) + torch._check(len(self) > 0, lambda: "input tensor list must not be empty.") + torch._check( + len(self) == len(tensor1) and len(self) == len(tensor2), + lambda: "All input tensor lists must have the same length", + ) + + +@register_meta( + [ + aten._foreach_addcdiv_.Scalar, + aten._foreach_addcmul_.Scalar, + ] +) +def meta__foreach_addcop__scalar(self, tensor1, tensor2, scalar=1): + torch._check( + all(isinstance(l, List) for l in [self, tensor1, tensor2]), + lambda: ( + "All arguments of _foreach_addc*_ must be List[Tensor], " + f"but got {type(self)}, {type(tensor1)}, and {type(tensor2)}" + ), + ) + torch._check(len(self) > 0, lambda: "input tensor list must not be empty.") + torch._check( + len(self) == len(tensor1) and len(self) == len(tensor2), + lambda: "All input tensor lists must have the same length", + ) + + +@register_meta([aten._fused_adam_.default]) +def meta__fused_adam_( + self, + grads, + exp_avgs, + exp_avg_sqs, + max_exp_avg_sqs, + state_steps, + *, + lr, + beta1, + beta2, + weight_decay, + eps, + amsgrad, + maximize, + grad_scale=None, + found_inf=None, +): + for l in [self, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps]: + torch._check( + isinstance(l, List), + lambda: f"exponent must be a tensor list but got {type(l)}", + ) + + +@register_meta([aten._fused_adam.default]) +def meta__fused_adam( + self, + grads, + exp_avgs, + exp_avg_sqs, + max_exp_avg_sqs, + state_steps, + *, + lr, + beta1, + beta2, + weight_decay, + eps, + amsgrad, + maximize, + grad_scale=None, + found_inf=None, +): + for l in [self, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps]: + torch._check( + isinstance(l, List), + lambda: f"exponent must be a tensor list but got {type(l)}", + ) + + def empty_like_list(tensor_list): + return [torch.empty_like(t) for t in tensor_list] + + return ( + empty_like_list(self), + empty_like_list(grads), + empty_like_list(exp_avgs), + empty_like_list(exp_avg_sqs), + empty_like_list(max_exp_avg_sqs), + ) + + +@register_meta([aten._int_mm]) 
+@out_wrapper() +def meta__int_mm(a, b): + torch._check(a.dim() == 2, lambda: "a must be a 2D tensor") + torch._check(b.dim() == 2, lambda: "b must be a 2D tensor") + torch._check( + a.dtype is torch.int8, + lambda: f"expected self to be int8, got {a.dtype}", + ) + torch._check( + b.dtype is torch.int8, + lambda: f"expected mat2 to be int8, got {b.dtype}", + ) + torch._check( + a.size(1) == b.size(0), + lambda: ( + f"Incompatible matrix sizes for _int_mm ({a.size(0)}x{a.size(1)} " + f"and {b.size(0)}x{b.size(1)})" + ), + ) + return a.new_empty((a.size(0), b.size(1)), dtype=torch.int32) + + +@register_meta([aten._convert_weight_to_int4pack]) +def meta__convert_weight_to_int4pack(w, inner_k_tiles): + torch._check(w.dim() == 2, lambda: "w must be a 2D tensor") + torch._check( + w.dtype is torch.int32, + lambda: f"expected w to be int32, got {w.dtype}", + ) + n = w.size(0) + k = w.size(1) + return w.new_empty( + ( + n // 8, + k // (inner_k_tiles * 16), + 32, + inner_k_tiles // 2, + ), + dtype=torch.int32, + ) + + +@register_meta([aten._weight_int4pack_mm]) +def meta__weight_int4pack_mm(x, w, q_group_size, q_scale_and_zeros): + torch._check(x.dim() == 2, lambda: "x must be a 2D tensor") + torch._check(w.dim() == 4, lambda: "w must be a 4D tensor") + torch._check( + x.dtype is torch.bfloat16, + lambda: f"expected x to be bf16, got {x.dtype}", + ) + torch._check( + w.dtype is torch.int32, + lambda: f"expected w to be int32, got {w.dtype}", + ) + return x.new_empty(x.size(0), w.size(0) * 8, dtype=x.dtype) + + +@register_meta([aten._weight_int8pack_mm]) +def meta__weight_int8pack_mm(x, w, q_scales): + torch._check(x.dim() == 2, lambda: "x must be a 2D tensor") + torch._check( + x.dtype is torch.bfloat16, + lambda: f"expected x to be bf16, got {x.dtype}", + ) + torch._check(w.dim() == 2, lambda: "w must be a 2D tensor") + torch._check( + w.dtype is torch.int8, + lambda: f"expected w to be int8, got {w.dtype}", + ) + return x.new_empty(x.size(0), w.size(0), dtype=x.dtype) + + +@register_meta(aten._cdist_forward.default) +def meta_cdist_forward(x1, x2, p, compute_mode): + torch._check( + x1.dim() >= 2, + lambda: f"cdist only supports at least 2D tensors, X1 got: {x1.dim()}D", + ) + torch._check( + x2.dim() >= 2, + lambda: f"cdist only supports at least 2D tensors, X2 got: {x2.dim()}D", + ) + torch._check( + x1.size(-1) == x2.size(-1), + lambda: f"X1 and X2 must have the same number of columns. 
X1: {x1.size(-1)} X2: {x2.size(-1)}", + ) + torch._check( + utils.is_float_dtype(x1.dtype), + lambda: "cdist only supports floating-point dtypes, X1 got: {x1.dtype}", + ) + torch._check( + utils.is_float_dtype(x2.dtype), + lambda: "cdist only supports floating-point dtypes, X2 got: {x2.dtype}", + ) + torch._check(p >= 0, lambda: "cdist only supports non-negative p values") + torch._check( + compute_mode in (None, 1, 2), + lambda: f"possible modes: None, 1, 2, but was: {compute_mode}", + ) + r1 = x1.size(-2) + r2 = x2.size(-2) + batch_tensor1 = x1.shape[:-2] + batch_tensor2 = x2.shape[:-2] + output_shape = list(torch.broadcast_shapes(batch_tensor1, batch_tensor2)) + output_shape.extend([r1, r2]) + return x1.new_empty(output_shape) + + +@register_meta(aten._cdist_backward) +@out_wrapper() +def meta_cdist_backward(grad, x1, x2, p, cdist): + c1 = x1.shape[-1] + r1 = x1.shape[-2] + r2 = x2.shape[-2] + batch_tensor1 = x1.shape[:-2] + batch_tensor2 = x2.shape[:-2] + expand_batch_portion = list(torch.broadcast_shapes(batch_tensor1, batch_tensor2)) + tensor1_expand_size = expand_batch_portion.copy() + tensor1_expand_size.extend([r1, c1]) + batch_product = math.prod(expand_batch_portion) + if r1 == 0 or r2 == 0 or c1 == 0 or batch_product == 0: + return torch.zeros_like(x1) + if tensor1_expand_size != list(x1.shape): + x1 = x1.expand(tensor1_expand_size) + return torch.empty_like(x1, memory_format=torch.contiguous_format) + + +# NB: This meta function accepts non-meta arguments! When this behavior +# was originally introduced this was accidental, but it is now load bearing +# as people are using this so that they can conveniently test code involving +# embeddings (feeding CPU tensor inputs with meta device EmbeddingBag module) +@register_meta(aten._embedding_bag.default) +def meta_embedding_bag( + weight, + indices, + offsets, + scale_grad_by_freq=False, + mode=0, + sparse=False, + per_sample_weights=None, + include_last_offset=False, + padding_idx=-1, +): + torch._check( + indices.dtype in (torch.long, torch.int), + lambda: f"expected indices to be long or int, got {indices.dtype}", + ) + torch._check( + offsets.dtype in (torch.long, torch.int), + lambda: f"expected offsets to be long or int, got {offsets.dtype}", + ) + torch._check( + utils.is_float_dtype(weight.dtype), + lambda: f"expected weight to be floating point type, got {weight.dtype}", + ) + + num_bags = offsets.size(0) + if include_last_offset: + torch._check( + num_bags >= 1, + lambda: "include_last_offset: numBags should be at least 1", + ) + num_bags -= 1 + + output = weight.new_empty(num_bags, weight.size(1)) + MODE_SUM, MODE_MEAN, MODE_MAX = range(3) + + if per_sample_weights is not None: + torch._check( + mode == MODE_SUM, + lambda: "embedding_bag: per_sample_weights only supported with mode='sum'", + ) + torch._check( + per_sample_weights.dtype == weight.dtype, + lambda: f"expected weight ({weight.dtype}) and per_sample_weights ({per_sample_weights.dtype}) to have same dtype", + ) + torch._check( + per_sample_weights.ndim == 1, + lambda: f"expected per_sample_weights to be 1D tensor, got {per_sample_weights.ndim}D", + ) + torch._check( + per_sample_weights.numel() == indices.numel(), + lambda: ( + f"expected per_sample_weights.numel() ({per_sample_weights.numel()} " + f"to be the same as indices.numel() ({indices.numel()})" + ), + ) + + def is_fast_path_index_select_scale(src, scale, output, padding_idx): + return ( + is_fast_path_index_select(src, output, padding_idx) and scale.stride(0) == 1 + ) + + def 
is_fast_path_index_select(src, output, padding_idx): + return ( + (src.dtype == torch.float or src.dtype == torch.half) + and src.stride(1) == 1 + and output.stride(1) == 1 + and padding_idx < 0 + ) + + def is_fast_path(src, scale, output, padding_idx): + if scale is not None: + return is_fast_path_index_select_scale(src, scale, output, padding_idx) + else: + return is_fast_path_index_select(src, output, padding_idx) + + if device_hint(offsets) != "cpu": + offset2bag = indices.new_empty(indices.size(0)) + bag_size = indices.new_empty(offsets.size()) + if mode == MODE_MAX: + max_indices = indices.new_empty(num_bags, weight.size(1)) + else: + max_indices = indices.new_empty(0) + else: + fast_path_sum = is_fast_path(weight, per_sample_weights, output, padding_idx) + if mode in (MODE_MEAN, MODE_MAX) or not fast_path_sum: + offset2bag = offsets.new_empty(indices.size(0)) + else: + offset2bag = offsets.new_empty(0) + bag_size = offsets.new_empty(num_bags) + # This part of the logic comes from make_max_indices_out in EmbeddingBag.cpp + numBags = offsets.shape[0] + if mode == MODE_MAX: + if include_last_offset: + torch._check( + numBags >= 1, + lambda: "include_last_offset: numBags should be at least 1", + ) + numBags -= 1 + max_indices = offsets.new_empty(numBags, weight.shape[1]) + else: + max_indices = offsets.new_empty(bag_size.size()) + return output, offset2bag, bag_size, max_indices + + +@register_meta(aten._embedding_bag_forward_only.default) +def meta_embedding_bag_forward_only(weight, indices, offsets, *args): + output, offset2bag, bag_size, max_indices = meta_embedding_bag( + weight, indices, offsets, *args + ) + if device_hint(offsets) == "cpu": + bag_size = offsets.new_empty(offsets.size()) + return output, offset2bag, bag_size, max_indices + + +def _get_reduction_dtype(input, dtype, promote_int_to_long=True): + # if specified, dtype takes precedence + if dtype: + return dtype + + if input.dtype.is_floating_point or input.dtype.is_complex: + return input.dtype + elif promote_int_to_long: + return torch.long + + return input.dtype + + +@register_meta([aten.nansum.default, aten.nansum.out]) +@out_wrapper() +def meta_nansum(input, dims=None, keepdim=False, *, dtype=None): + output_dtype = _get_reduction_dtype(input, dtype, promote_int_to_long=True) + dims = utils.reduction_dims(input.shape, dims) + output_shape = _compute_reduction_shape(input, dims, keepdim) + return input.new_empty(output_shape, dtype=output_dtype) + + +@register_meta([aten.median.default, aten.nanmedian.default]) +def meta_median(input): + output_shape = utils.compute_reduction_output_shape( + input.shape, tuple(range(input.dim())) + ) + return input.new_empty(output_shape) + + +@register_meta( + [ + aten.median.dim, + aten.median.dim_values, + aten.nanmedian.dim, + aten.nanmedian.dim_values, + aten.mode.default, + aten.mode.values, + ] +) +@out_wrapper("values", "indices") +def meta_median_mode_dim(input, dim=-1, keepdim=False): + if device_hint(input) == "cuda": + utils.alert_not_deterministic("median CUDA with indices output") + dim = utils.reduction_dims(input.shape, (dim,)) + output_shape = _compute_reduction_shape(input, dim, keepdim) + return ( + input.new_empty(output_shape), + input.new_empty(output_shape, dtype=torch.long), + ) + + +@register_meta(aten.logical_not_.default) +def meta_logical_not_(self): + return self + + +@register_meta(aten.repeat.default) +def meta_repeat(self, repeats): + torch._check( + len(repeats) >= self.dim(), + lambda: "Number of dimensions of repeat dims can not be smaller than 
number of dimensions of tensor", + ) + # Add new leading dimensions to the tensor if the + # number of target dimensions is larger than the + # number of source dimensions. + num_new_dimensions = len(repeats) - self.dim() + padded_size = (1,) * num_new_dimensions + tuple(self.shape) + target_size = [padded_size[i] * repeats[i] for i in range(len(repeats))] + return self.new_empty(target_size) + + +@register_meta(aten.zero_.default) +def meta_zero_(self): + return self + + +@register_meta( + [ + aten.mul_.Scalar, + aten.div_.Scalar, + aten.mul_.Tensor, + aten.div_.Tensor, + aten.logical_and_.default, + aten.logical_or_.default, + aten.logical_xor_.default, + ], +) +def meta_binop_inplace(self, other): + if isinstance(other, torch.Tensor): + check_inplace_broadcast(self.shape, other.shape) + return self + + +@register_meta( + [ + aten.add_.Scalar, + aten.sub_.Scalar, + aten.add_.Tensor, + aten.sub_.Tensor, + ], +) +def meta_binop_inplace_alpha(self, other, alpha=1): + if isinstance(other, torch.Tensor): + check_inplace_broadcast(self.shape, other.shape) + return self + + +@register_meta([aten.round.default, aten.round.decimals]) +def meta_round(self, **kwargs): + return elementwise_meta( + self, type_promotion=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT + ) + + +def shift_dtype_check(fn_name, self, val): + torch._check( + utils.is_integer_dtype(self.dtype), + lambda: f"{fn_name}: Expected input tensor to have an integral dtype. Got {self.dtype}", + ) + if isinstance(val, torch.Tensor): + torch._check( + utils.is_integer_dtype(val.dtype), + lambda: f"{fn_name}: Expected shift value to have an integral dtype. Got {val.dtype}", + ) + else: + torch._check( + isinstance(val, IntLike), + lambda: f"{fn_name}: Expected shift value to be an int. Got {val}", + ) + + +@register_meta([aten.__rshift__.Tensor, aten.__rshift__.Scalar]) +def meta_rshifts(self, other): + shift_dtype_check("rshift", self, other) + return elementwise_meta( + self, other, type_promotion=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT + ) + + +@register_meta([aten.__lshift__.Tensor, aten.__lshift__.Scalar]) +def meta_lshifts(self, other): + shift_dtype_check("lshift", self, other) + return elementwise_meta( + self, other, type_promotion=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT + ) + + +@register_meta(aten.zero.default) +def meta_zero(self): + return self.new_empty(self.shape) + + +@register_meta([aten.fill_.Tensor, aten.fill_.Scalar]) +def meta_fill_(self, val): + return self + + +@register_meta([aten.fill.Tensor, aten.fill.Scalar]) +def meta_fill(self, val): + return torch.empty_like(self) + + +@register_meta(aten.relu_.default) +def meta_relu_(self): + return self + + +@register_meta([aten.index_put.default, aten._unsafe_index_put.default]) +def meta_index_put(self, indices, values, accumulate=False): + return torch.empty_like(self) + + +@register_meta(aten.masked_fill_.Scalar) +def meta_masked_fill_(self, mask, value): + check_inplace_broadcast(self.shape, mask.shape) + return self + + +@register_meta(aten.masked_scatter_) +def meta_masked_scatter_(self, mask, source): + torch._check( + mask.dtype in (torch.bool, torch.uint8), lambda: "Mask must be bool or uint8" + ) + torch._check( + self.dtype == source.dtype, + lambda: "masked_scatter: expected self and source to have same " + "dtypes but got {self.dtype} and {source.dtype}", + ) + return self + + +@register_meta(aten.masked_scatter) +@out_wrapper() +def meta_masked_scatter(self, mask, source): + self, mask = _maybe_broadcast(self, mask) + output = torch.empty_like(self, 
memory_format=torch.contiguous_format) + return meta_masked_scatter_(output, mask, source) + + +@register_meta(aten.masked_scatter_backward) +def meta_masked_scatter_backward(self, mask, sizes): + return self.new_empty(sizes) + + +@register_meta(aten.index_put_.default) +def meta_index_put_(self, indices, values, accumulate=False): + return self + + +@register_meta(aten.alias.default) +def meta_alias(self): + return self.view(self.shape) + + +def common_meta_baddbmm_bmm(batch1, batch2, is_bmm, self_baddbmm=None): + torch._check(batch1.dim() == 3, lambda: "batch1 must be a 3D tensor") + torch._check(batch2.dim() == 3, lambda: "batch2 must be a 3D tensor") + + batch1_sizes = batch1.size() + batch2_sizes = batch2.size() + + bs = batch1_sizes[0] + contraction_size = batch1_sizes[2] + res_rows = batch1_sizes[1] + res_cols = batch2_sizes[2] + output_size = (bs, res_rows, res_cols) + + torch._check( + batch2_sizes[0] == bs and batch2_sizes[1] == contraction_size, + lambda: f"Expected size for first two dimensions of batch2 tensor to be: [{bs}" + f", {contraction_size}] but got: [{batch2_sizes[0]}, {batch2_sizes[1]}].", + ) + + # TODO: handle out + + output = batch2.new_empty(output_size) + + if not is_bmm and self_baddbmm is not None: + torch._check(self_baddbmm.dim() == 3, lambda: "self must be a 3D tensor") + torch._check( + self_baddbmm.size() == output_size, + lambda: f"Expected an input tensor shape with shape {output_size} but got shape: {self_baddbmm.size()}", + ) + + return output + + +@register_meta(aten.bmm.default) +def meta_bmm(self, mat2): + return common_meta_baddbmm_bmm(self, mat2, True) + + +def div_rtn(x, y): + q = x // y + r = x % y + # WARNING: explicit bool conversion here is necessary; + # would be fixed by SymBool + if r != 0 and (bool(r < 0) != bool(y < 0)): + q -= 1 + return q + + +def pooling_output_shape_pad_lr( + inputSize, kernelSize, pad_l, pad_r, stride, dilation, ceil_mode +): + outputSize = ( + div_rtn( + inputSize + + pad_l + + pad_r + - dilation * (kernelSize - 1) + - 1 + + (stride - 1 if ceil_mode else 0), + stride, + ) + + 1 + ) + if ceil_mode: + if (outputSize - 1) * stride >= inputSize + pad_l: + outputSize -= 1 + return outputSize + + +def pooling_output_shape(inputSize, kernelSize, pad, stride, dilation, ceil_mode): + torch._check(stride != 0, lambda: "stride should not be zero") + torch._check(pad >= 0, lambda: f"pad must be non-negative, but got pad: {pad}") + torch._check( + pad <= ((kernelSize - 1) * dilation + 1) // 2, + lambda: ( + f"pad should be at most half of effective kernel size, but got pad={pad}, " + f"kernel_size={kernelSize} and dilation={dilation}" + ), + ) + return pooling_output_shape_pad_lr( + inputSize, kernelSize, pad, pad, stride, dilation, ceil_mode + ) + + +def pool2d_shape_check( + input, + kH, + kW, + dH, + dW, + padH, + padW, + dilationH, + dilationW, + nInputPlane, + inputHeight, + inputWidth, + outputHeight, + outputWidth, + memory_format, +): + ndim = input.dim() + nOutputPlane = nInputPlane + + torch._check( + kW > 0 and kH > 0, + lambda: "kernel size should be greater than zero, but got kH: {kH}, kW: {kW}", + ) + torch._check( + dW > 0 and dH > 0, + lambda: "stride should be greater than zero, but got dH: {dH}, dW: {dW}", + ) + torch._check( + dilationH > 0 and dilationW > 0, + lambda: "dilation should be greater than zero, but got dilationH: {dilationH}, dilationW: {dilationW}", + ) + + valid_dims = input.size(1) != 0 and input.size(2) != 0 + + if memory_format == torch.channels_last: + torch._check( + ndim == 4 and 
valid_dims and input.size(3) != 0, + lambda: "Expected 4D (batch mode) tensor expected for input with channels_last layout" + " with optional 0 dim batch size for input, but got: {input.size()}", + ) + else: + torch._check( + (ndim == 3 and input.size(0) != 0 and valid_dims) + or (ndim == 4 and valid_dims and input.size(3) != 0), + lambda: f"Expected 3D or 4D (batch mode) tensor with optional 0 dim batch size for input, but got: {input.size()}", + ) + + torch._check( + kW // 2 >= padW and kH // 2 >= padH, + lambda: "pad should be smaller than or equal to half of kernel size, but got " + f"padW = {padW}, padH = {padH}, kW = {kW}, kH = {kH}", + ) + + torch._check( + outputWidth >= 1 and outputHeight >= 1, + lambda: f"Given input size: ({nInputPlane}x{inputHeight}x{inputWidth}). " + f"Calculated output size: ({nOutputPlane}x{outputHeight}x{outputWidth}). " + "Output size is too small", + ) + + +def pool3d_shape_check( + input: Tensor, + nslices: int, + kT: int, + kH: int, + kW: int, + dT: int, + dH: int, + dW: int, + pT: int, + pH: int, + pW: int, + dilationT: int, + dilationH: int, + dilationW: int, + itime: int, + iheight: int, + iwidth: int, + otime: int, + oheight: int, + owidth: int, + fn_name: str, + check_input_size: bool = False, +): + ndim = input.ndim + + torch._check( + kT > 0 and kW > 0 and kH > 0, + lambda: ( + f"kernel size should be greater than zero, but got " + f"kT: {kT}, kH: {kH}, kW: {kW}" + ), + ) + torch._check( + dT > 0 and dW > 0 and dH > 0, + lambda: ( + f"stride should be greater than zero, but got " + f"dT: {dT}, dH: {dH}, dW: {dW}" + ), + ) + torch._check( + dilationT > 0 and dilationW > 0 and dilationH > 0, + lambda: ( + f"dilation should be greater than zero, but got " + f"dilationT: {dilationT}, dilationH: {dilationH}, dilationW: {dilationW}" + ), + ) + + torch._check( + ndim in (4, 5), + lambda: f"{fn_name}: Expected 4D or 5D tensor for input, but got: {input.shape}", + ) + + for i in range(ndim): + if ndim == 5 and i == 0: + # size of batch-dim can be 0. + continue + torch._check( + input.size(i) > 0, + lambda: ( + f"{fn_name}: Expected input's non-batch dimensions to have positive length," + f" but input has a shape of {input.shape}" + f" and non-batch dimension {input.size(i)} has length zero!" + ), + ) + + if check_input_size: # AveragePool3d + torch._check( + itime >= kT and iheight >= kH and iwidth >= kW, + lambda: ( + f"input image (T: {itime} H: {iheight} W: {iwidth}) smaller than " + f"kernel size (kT: {kT} kH: {kH} kW: {kW})" + ), + ) + + torch._check( + kT / 2 >= pT and kW / 2 >= pW and kH / 2 >= pH, + lambda: ( + f"pad should be smaller than or equal to half of kernel size, but got " + f"kT: {kT} kW: {kW} kH: {kH} padT: {pT} padW: {pW} padH: {pH}" + ), + ) + + torch._check( + otime >= 1 and owidth >= 1 and oheight >= 1, + lambda: ( + f"Given input size: ({nslices}x{itime}x{iheight}x{iwidth}). " + f"Calculated output size: ({nslices}x{otime}x{oheight}x{owidth}). 
" + f"Output size is too small" + ), + ) + + +def max_pool3d_backward_shape_check( + input, + grad_output, + indices, + nslices, + kT, + kH, + kW, + dT, + dH, + dW, + pT, + pH, + pW, + dilationT, + dilationH, + dilationW, + itime, + iheight, + iwidth, + otime, + oheight, + owidth, + fn_name, +): + ndim = input.ndim + + pool3d_shape_check( + input, + nslices, + kT, + kH, + kW, + dT, + dH, + dW, + pT, + pH, + pW, + dilationT, + dilationH, + dilationW, + itime, + iheight, + iwidth, + otime, + oheight, + owidth, + fn_name, + ) + + check_dim_size(grad_output, ndim, ndim - 4, nslices) + check_dim_size(grad_output, ndim, ndim - 3, otime) + check_dim_size(grad_output, ndim, ndim - 2, oheight) + check_dim_size(grad_output, ndim, ndim - 1, owidth) + + check_dim_size(indices, ndim, ndim - 4, nslices) + check_dim_size(indices, ndim, ndim - 3, otime) + check_dim_size(indices, ndim, ndim - 2, oheight) + check_dim_size(indices, ndim, ndim - 1, owidth) + + +def avg_pool3d_backward_shape_check( + input: Tensor, + grad_output: Tensor, + nslices: int, + kT: int, + kH: int, + kW: int, + dT: int, + dH: int, + dW: int, + pT: int, + pH: int, + pW: int, + itime: int, + iheight: int, + iwidth: int, + otime: int, + oheight: int, + owidth: int, + fn_name: str, +): + ndim = input.ndim + + pool3d_shape_check( + input, + nslices, + kT, + kH, + kW, + dT, + dH, + dW, + pT, + pH, + pW, + 1, + 1, + 1, + itime, + iheight, + iwidth, + otime, + oheight, + owidth, + fn_name, + True, + ) + + check_dim_size(grad_output, ndim, ndim - 4, nslices) + check_dim_size(grad_output, ndim, ndim - 3, otime) + check_dim_size(grad_output, ndim, ndim - 2, oheight) + check_dim_size(grad_output, ndim, ndim - 1, owidth) + + +def max_pool2d_checks_and_compute_shape( + input, kernel_size, stride, padding, dilation, ceil_mode +): + # Reference: aten/src/ATen/native/DilatedMaxPool2d.cpp + def unpack(name, val): + torch._check( + len(val) in [1, 2], + lambda: f"max_pool2d: {name} must either be a single int, or a tuple of two ints", + ) + H = val[0] + W = H if len(val) == 1 else val[1] + return H, W + + kH, kW = unpack("kernel_size", kernel_size) + + torch._check( + len(stride) in [0, 1, 2], + lambda: "max_pool2d: stride must either be omitted, a single int, or a tuple of two ints", + ) + if len(stride) == 0: + dH, dW = kH, kW + else: + dH, dW = unpack("stride", stride) + + padH, padW = unpack("padding", padding) + dilationH, dilationW = unpack("dilation", dilation) + nInputPlane = input.size(-3) + inputHeight = input.size(-2) + inputWidth = input.size(-1) + + memory_format = utils.suggest_memory_format(input) + if memory_format == torch.channels_last: + torch._check( + input.dim() == 4, + lambda: "non-empty 4D (batch mode) tensor expected for input with channels_last layout", + ) + elif memory_format == torch.contiguous_format: + torch._check( + input.dim() in [3, 4], + lambda: "non-empty 3D or 4D (batch mode) tensor expected for input", + ) + else: + torch._check( + False, + lambda: "Unsupport memory format. 
Supports only ChannelsLast, Contiguous", + ) + + outputHeight = pooling_output_shape(inputHeight, kH, padH, dH, dilationH, ceil_mode) + outputWidth = pooling_output_shape(inputWidth, kW, padW, dW, dilationW, ceil_mode) + + pool2d_shape_check( + input, + kH, + kW, + dH, + dW, + padH, + padW, + dilationH, + dilationW, + nInputPlane, + inputHeight, + inputWidth, + outputHeight, + outputWidth, + memory_format, + ) + + return nInputPlane, outputHeight, outputWidth + + +@register_meta(aten.max_pool2d_with_indices_backward.default) +def meta_max_pool2d_with_indices_backward( + grad_output, + self, + kernel_size, + stride, + padding, + dilation, + ceil_mode, + indices, +): + ( + nInputPlane, + outputHeight, + outputWidth, + ) = max_pool2d_checks_and_compute_shape( + self, kernel_size, stride, padding, dilation, ceil_mode + ) + + torch._check( + self.dtype == grad_output.dtype, + lambda: f"Expected dtype {self.dtype} for `gradOutput` but got dtype {grad_output.dtype}", + ) + + nOutputPlane = nInputPlane + ndim = self.ndim + + def _check_dim_size(t): + check_dim_size(t, ndim, ndim - 3, nOutputPlane) + check_dim_size(t, ndim, ndim - 2, outputHeight) + check_dim_size(t, ndim, ndim - 1, outputWidth) + + _check_dim_size(grad_output) + _check_dim_size(indices) + + memory_format = utils.suggest_memory_format(self) + return torch.empty( + self.shape, + dtype=self.dtype, + device=self.device, + memory_format=memory_format, + ) + + +@register_meta(aten.max_pool2d_with_indices.default) +def meta_max_pool2d_with_indices( + input, kernel_size, stride=(), padding=(0,), dilation=(1,), ceil_mode=False +): + ( + nInputPlane, + outputHeight, + outputWidth, + ) = max_pool2d_checks_and_compute_shape( + input, kernel_size, stride, padding, dilation, ceil_mode + ) + + nbatch = input.size(-4) if input.dim() == 4 else 1 + memory_format = utils.suggest_memory_format(input) + if input.dim() == 3: + size = [nInputPlane, outputHeight, outputWidth] + else: + size = [nbatch, nInputPlane, outputHeight, outputWidth] + return ( + torch.empty( + size, + dtype=input.dtype, + device=input.device, + memory_format=memory_format, + ), + torch.empty( + size, + dtype=torch.int64, + device=input.device, + memory_format=memory_format, + ), + ) + + +@register_meta(aten.fractional_max_pool2d.default) +def meta_fractional_max_pool2d(self_, kernel_size, output_size, random_samples): + torch._check( + self_.ndim in (3, 4), + lambda: f"fractional_max_pool2d: Expected 3D or 4D tensor, but got: {self_.ndim}", + ) + ndim = self_.ndim + + for d in range(ndim - 3, ndim): + torch._check( + self_.size(d) > 0, + f"fractional_max_pool2d: Expected input to have non-zero " + f" size for non-batch dimenions, but got {self_.size()} with dimension {d} empty", + ) + + # the check and message are out of sync, but this matches the structured meta + torch._check( + len(kernel_size) == 2, + lambda: "fractional_max_pool2d: kernel_size must" + "either be a single int or tuple of Ints", + ) + torch._check( + len(output_size) == 2, + lambda: "fractional_max_pool2d: output_size must " + "either be a single int or tuple of Ints", + ) + + input_channels = self_.size(-3) + input_height = self_.size(-2) + input_width = self_.size(-1) + if ndim == 4: + input_batch = self_.size(0) + else: + input_batch = 1 + + torch._check( + self_.dtype == random_samples.dtype, + lambda: "Expect _random_samples to have the same dtype as input", + ) + torch._check( + random_samples.ndim == 3, + lambda: f"Expect _random samples to have 3 dimensions got, {random_samples.ndim}", + ) + + n = 
random_samples.size(0) + c = random_samples.size(1) + d = random_samples.size(2) + torch._check( + n >= input_batch, + "Expect _random_samples.size(0) no less then input batch size.", + ) + torch._check( + c == input_channels, + lambda: "Expect _random_samples.size(1) equals to input channel size.", + ) + torch._check(d == 2, lambda: f"Expect _random_samples.size(2) equals to 2 got {d}.") + + torch._check( + output_size[0] + kernel_size[0] - 1 <= input_height, + lambda: f"fractional_max_pool2d: kernel height {kernel_size[0]} is too large relative to input height {input_height}", + ) + torch._check( + output_size[1] + kernel_size[1] - 1 <= input_width, + lambda: f"fractional_max_pool2d: kernel width {kernel_size[1]} is too large relative to input width {input_width}", + ) + + if self_.dim() == 4: + size = [input_batch, input_channels, output_size[0], output_size[1]] + else: + size = [input_channels, output_size[0], output_size[1]] + + return ( + torch.empty( + size, + dtype=self_.dtype, + device=self_.device, + ), + torch.empty( + size, + dtype=torch.int64, + device=self_.device, + ), + ) + + +@register_meta(aten.max_unpool2d) +@out_wrapper() +def meta_max_unpool2d(self_, indices, output_size): + utils.alert_not_deterministic("max_unpooling2d_forward_out") + + torch._check( + indices.dtype == torch.int64, + lambda: f"elements in indices should be type int64 but got: {indices.dtype}", + ) + torch._check( + len(output_size) == 2, + lambda: ( + f"There should be exactly two elements (height, width) in output_size, " + f"but got {len(output_size)} elements." + ), + ) + + oheight, owidth = output_size + + torch._check( + self_.ndim in (3, 4), + lambda: ( + f"Input to max_unpooling2d should be a 3d or 4d Tensor, " + f"but got a tensor with {self_.ndim} dimensions." + ), + ) + torch._check( + self_.shape == indices.shape, + lambda: ( + f"Expected shape of indices to be same as that of the input tensor ({self_.shape}) " + f"but got indices tensor with shape: {indices.shape}" + ), + ) + + for i in range(1, self_.ndim): + torch._check( + self_.size(i) > 0, + lambda: ( + f"max_unpooling2d(): " + f"Expected input to have non-zero size for non-batch dimensions, " + f"but got {self_.shape} with dimension {i} being empty." + ), + ) + + self = self_.contiguous() + + if self_.ndim == 3: + nchannels = self.size(0) + result = self.new_empty((nchannels, oheight, owidth)) + else: + nbatch = self.size(0) + nchannels = self.size(1) + result = self.new_empty((nbatch, nchannels, oheight, owidth)) + + return result + + +def _max_unpooling3d_shape_check(input, indices, output_size, stride, padding, fn_name): + torch._check( + indices.dtype == torch.int64, lambda: "elements in indices should be type int64" + ) + torch._check( + input.ndim in (4, 5), + lambda: f"Input to max_unpooling3d should be a 4d or 5d Tensor, but got a tensor with {input.ndim} dimensions.", + ) + torch._check( + len(output_size) == 3, + lambda: ( + f"There should be exactly three elements (depth, height, width) in output_size, " + f"but got {len(output_size)} elements." 
+ ), + ) + torch._check( + len(stride) == 3, + lambda: f"There should be exactly three elements (depth, height, width) in stride, but got: {len(stride)} elements.", + ) + torch._check( + len(padding) == 3, + lambda: f"There should be exactly three elements (depth, height, width) in padding, but got: {len(padding)} elements.", + ) + torch._check( + input.shape == indices.shape, + lambda: ( + f"Expected shape of indices to be same as that of the input tensor ({input.shape}) " + f"but got indices tensor with shape: {indices.shape}" + ), + ) + + for i in range(1, input.ndim): + torch._check( + input.size(i) > 0, + lambda: ( + f"{fn_name}: " + f"Expected input to have non-zero size for non-batch dimensions, " + f"but got {input.shape} with dimension {i} being empty." + ), + ) + + torch._check( + stride[0] > 0 and stride[1] > 0 and stride[2] > 0, + lambda: f"strides should be greater than zero, but got stride: {stride}", + ) + + +@register_meta(aten.max_unpool3d) +@out_wrapper() +def meta_max_unpool3d(self_, indices, output_size, stride, padding): + utils.alert_not_deterministic("max_unpooling3d_forward_out") + + _max_unpooling3d_shape_check( + self_, indices, output_size, stride, padding, "max_unpooling3d()" + ) + + self = self_.contiguous() + + odepth, oheight, owidth = output_size + + if self_.ndim == 4: + nchannels = self.size(0) + result = self.new_empty((nchannels, odepth, oheight, owidth)) + else: + nbatch = self.size(0) + nchannels = self.size(1) + result = self.new_empty((nbatch, nchannels, odepth, oheight, owidth)) + + return result + + +@register_meta(aten.max_pool3d_with_indices) +@out_wrapper("out", "indices") +def meta_max_pool3d_with_indices( + input, + kernel_size, + stride=(), + padding=(0,), + dilation=(1,), + ceil_mode=False, +): + torch._check( + len(kernel_size) in (1, 3), + lambda: "max_pool3d: kernel_size must either be a single int, or a tuple of three ints", + ) + kT = kernel_size[0] + kH = kT if len(kernel_size) == 1 else kernel_size[1] + kW = kT if len(kernel_size) == 1 else kernel_size[2] + + torch._check( + not stride or len(stride) in (1, 3), + lambda: "max_pool3d: stride must either be omitted, a single int, or a tuple of three ints", + ) + dT = kT if not stride else stride[0] + dH = kH if not stride else (dT if len(stride) == 1 else stride[1]) + dW = kW if not stride else (dT if len(stride) == 1 else stride[2]) + + torch._check( + len(padding) in (1, 3), + lambda: "max_pool3d: padding must either be a single int, or a tuple of three ints", + ) + pT = padding[0] + pH = pT if len(padding) == 1 else padding[1] + pW = pT if len(padding) == 1 else padding[2] + + torch._check( + len(dilation) in (1, 3), + lambda: "max_pool3d: dilation must be either a single int, or a tuple of three ints", + ) + dilationT = dilation[0] + dilationH = dilationT if len(dilation) == 1 else dilation[1] + dilationW = dilationT if len(dilation) == 1 else dilation[2] + + torch._check( + input.ndim in (4, 5), + lambda: "non-empty 4D or 5D (batch mode) tensor expected for input", + ) + + nbatch = input.size(-5) if input.ndim == 5 else 1 + nslices = input.size(-4) + itime = input.size(-3) + iheight = input.size(-2) + iwidth = input.size(-1) + + otime = pooling_output_shape(itime, kT, pT, dT, dilationT, ceil_mode) + oheight = pooling_output_shape(iheight, kH, pH, dH, dilationH, ceil_mode) + owidth = pooling_output_shape(iwidth, kW, pW, dW, dilationW, ceil_mode) + + pool3d_shape_check( + input, + nslices, + kT, + kH, + kW, + dT, + dH, + dW, + pT, + pH, + pW, + dilationT, + dilationH, + dilationW, + 
itime, + iheight, + iwidth, + otime, + oheight, + owidth, + "max_pool3d_with_indices()", + ) + + channels_last = ( + input.ndim == 5 and utils.suggest_memory_format(input) == torch.channels_last_3d + ) + if input.ndim == 4: + input_channels_last_check = input.unsqueeze(0) + channels_last = ( + not input_channels_last_check.is_contiguous() + ) and input_channels_last_check.is_contiguous( + memory_format=torch.channels_last_3d + ) + out_shape = (nslices, otime, oheight, owidth) + else: + out_shape = (nbatch, nslices, otime, oheight, owidth) # type: ignore[assignment] + + out = input.new_empty(out_shape) + indices = input.new_empty(out_shape, dtype=torch.int64) + + if channels_last: + out = out.to(memory_format=torch.channels_last_3d) + indices = indices.to(memory_format=torch.channels_last_3d) + + return out, indices + + +@register_meta(aten.max_pool3d_with_indices_backward) +@out_wrapper("grad_input") +def meta_max_pool3d_with_indices_backward( + grad_output, + input, + kernel_size, + stride, + padding, + dilation, + ceil_mode, + indices, +): + torch._check( + len(kernel_size) in (1, 3), + lambda: "max_pool3d: kernel_size must either be a single int, or a tuple of three ints", + ) + kT = kernel_size[0] + kH = kT if len(kernel_size) == 1 else kernel_size[1] + kW = kT if len(kernel_size) == 1 else kernel_size[2] + + torch._check( + not stride or len(stride) in (1, 3), + lambda: "max_pool3d: stride must either be omitted, a single int, or a tuple of three ints", + ) + dT = kT if not stride else stride[0] + dH = kH if not stride else (dT if len(stride) == 1 else stride[1]) + dW = kW if not stride else (dT if len(stride) == 1 else stride[2]) + + torch._check( + len(padding) in (1, 3), + lambda: "max_pool3d: padding must either be a single int, or a tuple of three ints", + ) + pT = padding[0] + pH = pT if len(padding) == 1 else padding[1] + pW = pT if len(padding) == 1 else padding[2] + + torch._check( + len(dilation) in (1, 3), + lambda: "max_pool3d: dilation must be either a single int, or a tuple of three ints", + ) + dilationT = dilation[0] + dilationH = dilationT if len(dilation) == 1 else dilation[1] + dilationW = dilationT if len(dilation) == 1 else dilation[2] + + torch._check( + input.ndim in (4, 5), + lambda: "non-empty 4D or 5D (batch mode) tensor expected for input", + ) + + nslices = input.size(-4) + itime = input.size(-3) + iheight = input.size(-2) + iwidth = input.size(-1) + + otime = grad_output.size(-3) + oheight = grad_output.size(-2) + owidth = grad_output.size(-1) + + max_pool3d_backward_shape_check( + input, + grad_output, + indices, + nslices, + kT, + kH, + kW, + dT, + dH, + dW, + pT, + pH, + pW, + dilationT, + dilationH, + dilationW, + itime, + iheight, + iwidth, + otime, + oheight, + owidth, + "max_pool3d_with_indices_backward()", + ) + + channels_last = ( + input.ndim == 5 and utils.suggest_memory_format(input) == torch.channels_last_3d + ) + if input.ndim == 4: + input_channels_last_check = input.unsqueeze(0) + channels_last = ( + not input_channels_last_check.is_contiguous() + ) and input_channels_last_check.is_contiguous( + memory_format=torch.channels_last_3d + ) + + grad_input = input.new_empty(input.shape) + + if channels_last: + grad_input = grad_input.to(memory_format=torch.channels_last_3d) + + return grad_input + + +def check_grid_sampler_common(input: Tensor, grid: Tensor): + torch._check( + input.device == grid.device, + lambda: ( + f"grid_sampler(): expected input and grid to be on same device, but input " + f"is on {input.device} and grid is on 
{grid.device}" + ), + ) + torch._check( + input.layout == torch.strided and grid.layout == torch.strided, + lambda: ( + f"grid_sampler(): expected input and grid to have torch.strided layout, but " + f"input has {input.layout} and grid has {grid.layout}" + ), + ) + torch._check( + input.shape[0] == grid.shape[0], + lambda: ( + f"grid_sampler(): expected grid and input to have same batch size, but got " + f"input with sizes {input.shape} and grid with sizes {grid.shape}" + ), + ) + torch._check( + grid.shape[-1] == input.ndim - 2, + lambda: ( + f"grid_sampler(): expected grid to have size {input.ndim - 2} in last " + f"dimension, but got grid with sizes {grid.shape}" + ), + ) + + for i in range(2, input.ndim): + torch._check( + input.shape[i] > 0, + lambda: ( + f"grid_sampler(): expected input to have non-empty spatial dimensions, " + f"but input has sizes {input.shape} with dimension {i} being empty" + ), + ) + + +class GridSamplerInterpolation(Enum): + BILINEAR = 0 + NEAREST = 1 + BICUBIC = 2 + + +def check_grid_sampler_3d(input: Tensor, grid: Tensor, interpolation_mode: int): + torch._check( + input.ndim == 5 and input.ndim == grid.ndim, + lambda: ( + f"grid_sampler(): expected 5D input and grid with same number of " + f"dimensions, but got input with sizes {input.shape}" + f" and grid with sizes {grid.shape}" + ), + ) + torch._check( + not ( + input.ndim == 5 + and interpolation_mode == GridSamplerInterpolation.BICUBIC.value + ), + lambda: "grid_sampler(): bicubic interpolation only supports 4D input", + ) + + +@register_meta(aten.grid_sampler_2d_backward.default) +def grid_sampler_2d_backward_meta( + grad_output, + input, + grid, + interpolation_mode, + padding_mode, + align_corners, + output_mask, +): + input_requires_grad = output_mask[0] + if input_requires_grad: + grad_input = torch.zeros_like(input, memory_format=torch.contiguous_format) + else: + grad_input = None + grad_grid = torch.empty_like(grid, memory_format=torch.contiguous_format) + return (grad_input, grad_grid) + + +@register_meta(aten.grid_sampler_3d) +@out_wrapper() +def grid_sampler_3d( + input, + grid, + interpolation_mode, + padding_mode, + align_corners, +): + check_grid_sampler_common(input, grid) + check_grid_sampler_3d(input, grid, interpolation_mode) + N = input.shape[0] + C = input.shape[1] + out_D = grid.shape[1] + out_H = grid.shape[2] + out_W = grid.shape[3] + return input.new_empty((N, C, out_D, out_H, out_W)) + + +@register_meta(aten.grid_sampler_3d_backward) +@out_wrapper("grad_input", "grad_grid") +def grid_sampler_3d_backward( + grad_output, + input, + grid, + interpolation_mode, + padding_mode, + align_corners, + output_mask, +): + check_grid_sampler_common(input, grid) + check_grid_sampler_3d(input, grid, interpolation_mode) + input_requires_grad = output_mask[0] + if input_requires_grad: + grad_input = torch.zeros_like( + input, memory_format=torch.legacy_contiguous_format + ) + else: + grad_input = None + grad_grid = torch.empty_like(grid, memory_format=torch.legacy_contiguous_format) + return grad_input, grad_grid + + +@register_meta([aten.full.default]) +def full(size, fill_value, *args, **kwargs): + dtype = kwargs.get("dtype", None) + if not dtype: + dtype = utils.get_dtype(fill_value) + kwargs["dtype"] = dtype + return torch.empty(size, *args, **kwargs) + + +# zeros_like is special cased to work for sparse +@register_meta(aten.zeros_like.default) +def zeros_like( + self, + dtype=None, + layout=None, + device=None, + pin_memory=None, + memory_format=None, +): + if layout == torch.sparse_coo: + 
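+        # Sparse path: build an empty sparse COO tensor, resize it to self's shape
+        # (reusing self's sparse/dense dims when self is itself sparse), and mark it coalesced.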
torch._check( + memory_format is None, + lambda: "memory format option is only supported by strided tensors", + ) + + res = torch.empty( + 0, + dtype=self.dtype if dtype is None else dtype, + layout=layout, + device=self.device if device is None else device, + pin_memory=pin_memory, + ) + + if self.is_sparse: + res.sparse_resize_and_clear_( + self.size(), self.sparse_dim(), self.dense_dim() + ) + else: + res.sparse_resize_and_clear_(self.size(), self.dim(), 0) + + res._coalesced_(True) + return res + res = aten.empty_like.default( + self, + dtype=dtype, + layout=layout, + device=device, + pin_memory=pin_memory, + memory_format=memory_format, + ) + # device can be not "meta" + res.fill_(0) + return res + + +@register_meta(aten.select.int) +def meta_select(self, dim, index): + ndim = self.dim() + torch._check_index( + ndim != 0, + lambda: "select() cannot be applied to a 0-dim tensor.", + ) + + dim = dim if dim >= 0 else dim + ndim + size = self.size(dim) + + torch._check_index( + not (-index > size or index >= size), + lambda: f"select(): index {index} out of range for tensor of size " + f"{self.size()} at dimension {dim}", + ) + + index = index if index >= 0 else index + size + + new_size = list(self.size()) + new_stride = list(self.stride()) + + new_storage_offset = self.storage_offset() + index * new_stride[dim] + del new_size[dim] + del new_stride[dim] + + return self.as_strided(new_size, new_stride, new_storage_offset) + + +@register_meta(aten.select_scatter.default) +def meta_select_scatter(self, src, dim, index): + return utils.clone_preserve_strides(self) + + +@register_meta(aten.slice_scatter.default) +def meta_slice_scatter(self, src, dim=0, start=None, end=None, step=1): + return utils.clone_preserve_strides(self) + + +# TODO: Deduplicate this with canonicalize_dim +def maybe_wrap_dim(dim: int, dim_post_expr: int, wrap_scalar: bool = True): + if dim_post_expr <= 0: + assert wrap_scalar + dim_post_expr = 1 + min = -dim_post_expr + max = dim_post_expr - 1 + assert not (dim < min or dim > max), f"dim {dim} out of bounds ({min}, {max})" + if dim < 0: + dim += dim_post_expr + return dim + + +def ensure_nonempty_size(t, dim): + return 1 if t.dim() == 0 else t.shape[dim] + + +# From aten/src/ATen/native/ScatterGatherChecks.h +def gather_shape_check(self, dim, index): + self_dims = max(self.dim(), 1) + index_dims = max(index.dim(), 1) + torch._check( + self_dims == index_dims, + lambda: "Index tensor must have the same number of dimensions as input tensor", + ) + for i in range(self_dims): + if i != dim: + torch._check( + ensure_nonempty_size(index, i) <= ensure_nonempty_size(self, i), + lambda: f"Size does not match at dimension {i} expected index {index.shape}" + + f" to be smaller than self {self.shape} apart from dimension {dim}", + ) + + +@register_meta(aten.gather.default) +def meta_gather(self, dim, index, sparse_grad=False): + wrapped_dim = maybe_wrap_dim(dim, self.dim()) + is_index_empty = index.numel() == 0 + if not is_index_empty: + torch._check( + index.dtype == torch.long, + lambda: f"gather(): Expected dtype int64 for index, but got {index.dtype}", + ) + gather_shape_check(self, wrapped_dim, index) + return self.new_empty(index.shape) + + +# From aten/src/ATen/native/TensorAdvancedIndexing.cpp +def get_operator_enum(reduce_, use_new_options=False): + if use_new_options: + if reduce_ == "sum": + return "REDUCE_ADD" + elif reduce_ == "prod": + return "REDUCE_MULTIPLY" + elif reduce_ == "mean": + return "REDUCE_MEAN" + elif reduce_ == "amax": + return "REDUCE_MAXIMUM" + elif 
reduce_ == "amin": + return "REDUCE_MINIMUM" + torch._check( + False, + lambda: "reduce argument must be either sum, prod, mean, amax or amin.", + ) + return + else: + if reduce_ == "add": + return "REDUCE_ADD" + elif reduce_ == "multiply": + return "REDUCE_MULTIPLY" + torch._check(False, lambda: "reduce argument must be either add or multiply.") + return + + +# From aten/src/ATen/native/ScatterGatherChecks.h +def scatter_gather_dtype_check(method_name, self, index, src_opt=None): + if index.numel() != 0: + torch._check( + index.dtype == torch.long, + lambda: f"{method_name}(): Expected dtype int64 for index", + ) + + if src_opt is not None: + torch._check( + self.dtype == src_opt.dtype, + lambda: f"{method_name}(): Expected self.dtype to be equal to src.dtype", + ) + + +def ensure_nonempty_dim(dim): + return max(dim, 1) + + +# From aten/src/ATen/native/ScatterGatherChecks.h +def scatter_shape_check(self, dim, index, src_opt=None): + if index.numel() == 0: + return + torch._check( + ensure_nonempty_dim(self.dim()) == ensure_nonempty_dim(index.dim()), + lambda: "Index tensor must have the same number of dimensions as self tensor", + ) + + is_wrong_shape = False + self_dims = ensure_nonempty_dim(self.dim()) + + # Check: index.size(d) <= self.size(d) for all d != dim + for d in range(self_dims): + index_d_size = ensure_nonempty_size(index, d) + if d == dim: + continue + if index_d_size > ensure_nonempty_size(self, d): + is_wrong_shape = True + break + + # Check: index.size(d) <= src.size(d) for all d if src is Tensor + if not is_wrong_shape and src_opt is not None: + for d in range(self_dims): + index_d_size = ensure_nonempty_size(index, d) + if index_d_size > ensure_nonempty_size(src_opt, d): + is_wrong_shape = True + break + + if src_opt is not None: + torch._check( + ensure_nonempty_dim(self.dim()) == ensure_nonempty_dim(index.dim()), + lambda: "Index tensor must have the same number of dimensions as self tensor", + ) + torch._check( + not is_wrong_shape, + lambda: f"Expected index {index.shape} to be smaller than self {self.shape}" + + f" apart from dimension {dim} and to be smaller than src {src_opt.shape}", + ) + else: + torch._check( + not is_wrong_shape, + lambda: f"Expected index {index.shape} to be smaller than self {self.shape}" + + f" apart from dimension {dim}", + ) + + +# From aten/src/ATen/native/TensorAdvancedIndexing.cpp +def scatter_meta_impl(self, dim, index, src=None, reduce_=None, use_new_options=False): + wrapped_dim = maybe_wrap_dim(dim, self.dim()) + scatter_gather_dtype_check("scatter", self, index, src) + scatter_shape_check(self, wrapped_dim, index, src) + if reduce_ is not None: + # Check if we have a valid reduce operator. 
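+        # get_operator_enum errors out (via torch._check) when reduce_ is not one of the supported reduction names.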
+ get_operator_enum(reduce_, use_new_options) + + +@register_meta(aten.scatter_add.default) +def meta_scatter_add(self, dim, index, src): + scatter_meta_impl(self, dim, index, src, "add") + return self.new_empty(self.shape) + + +@register_meta(aten.scatter_add_) +def meta_scatter_add_(self, dim, index, src): + scatter_meta_impl(self, dim, index, src, "add") + return self + + +@register_meta( + [ + aten.scatter.src, + aten.scatter.value, + aten.scatter.reduce, + aten.scatter.value_reduce, + ] +) +@out_wrapper() +def meta_scatter(self, dim, index, src_or_value, reduce=None): + src = src_or_value if isinstance(src_or_value, torch.Tensor) else None + scatter_meta_impl(self, dim, index, src, reduce) + return self.new_empty(self.shape) + + +@register_meta( + [ + aten.scatter_.src, + aten.scatter_.value, + aten.scatter_.reduce, + aten.scatter_.value_reduce, + ] +) +def meta_scatter_(self, dim, index, src_or_value, reduce=None): + src = src_or_value if isinstance(src_or_value, torch.Tensor) else None + scatter_meta_impl(self, dim, index, src, reduce) + return self + + +@register_meta( + [ + aten._scaled_dot_product_flash_attention_backward, + ] +) +def meta__scaled_dot_product_flash_backward( + grad_out: Tensor, + query: Tensor, + key: Tensor, + value: Tensor, + out: Tensor, + logsumexp: Tensor, + cum_seq_q: Tensor, + cum_seq_k: Tensor, + max_q: int, + max_k: int, + dropout_p: float, + is_causal: bool, + philox_seed: Tensor, + philox_offset: Tensor, + scale: Optional[float] = None, +): + grad_q = torch.empty_like(query.transpose(1, 2)).transpose(1, 2) + grad_k = torch.empty_like(key.transpose(1, 2)).transpose(1, 2) + grad_v = torch.empty_like(value.transpose(1, 2)).transpose(1, 2) + return grad_q, grad_k, grad_v + + +@register_meta( + [ + aten._scaled_dot_product_flash_attention_for_cpu, + ] +) +def meta__scaled_dot_product_flash_attention_for_cpu( + query: Tensor, + key: Tensor, + value: Tensor, + dropout_p: float = 0.0, + is_causal: bool = False, + attn_mask: Optional[Tensor] = None, + scale: Optional[float] = None, +): + batch_size = query.size(0) + num_heads = query.size(1) + max_seqlen_batch_q = query.size(2) + head_dim = query.size(3) + + attention = torch.empty( + (batch_size, max_seqlen_batch_q, num_heads, head_dim), + dtype=query.dtype, + device=query.device, + ).transpose(1, 2) + logsumexp = torch.empty( + ( + batch_size, + max_seqlen_batch_q, + num_heads, + ), + dtype=torch.float, + device=query.device, + ).transpose(1, 2) + return ( + attention, + logsumexp, + ) + + +@register_meta( + [ + aten._scaled_dot_product_flash_attention_for_cpu_backward, + ] +) +def meta__scaled_dot_product_flash_attention_for_cpu_backward( + grad_out: Tensor, + query: Tensor, + key: Tensor, + value: Tensor, + out: Tensor, + logsumexp: Tensor, + dropout_p: float, + is_causal: bool, + attn_mask: Optional[Tensor] = None, + scale: Optional[float] = None, +): + # cpus's grad layout is different from cuda's, + # i.e. 
(batch_size, seq_len,num_heads, head_dim) + batch_size = query.size(0) + num_heads = query.size(1) + head_dim = query.size(3) + len_q = query.size(2) + len_k = key.size(2) + + grad_q = torch.empty_permuted( + (batch_size, num_heads, len_q, head_dim), + (0, 2, 1, 3), + dtype=query.dtype, + device=query.device, + ) + grad_k = torch.empty_permuted( + (batch_size, num_heads, len_k, head_dim), + (0, 2, 1, 3), + dtype=key.dtype, + device=key.device, + ) + grad_v = torch.empty_permuted( + (batch_size, num_heads, len_k, head_dim), + (0, 2, 1, 3), + dtype=value.dtype, + device=value.device, + ) + + return grad_q, grad_k, grad_v + + +@register_meta( + [ + aten._scaled_dot_product_efficient_attention_backward, + ] +) +def meta__scaled_dot_product_efficient_backward( + grad_out: Tensor, + query: Tensor, + key: Tensor, + value: Tensor, + attn_bias: Optional[Tensor], + out: Tensor, + logsumexp: Tensor, + philox_seed: Tensor, + philox_offset: Tensor, + dropout_p: float, + grad_input_mask: List[bool], + is_causal: bool = False, + scale: Optional[float] = None, +): + batch_size = query.size(0) + num_heads = query.size(1) + max_q = query.size(2) + head_dim = query.size(3) + head_dim_v = value.size(3) + + max_k = key.size(2) + + grad_q = torch.empty_permuted( + (batch_size, num_heads, max_q, head_dim), + (0, 2, 1, 3), + dtype=query.dtype, + device=query.device, + ) + grad_k = torch.empty_permuted( + (batch_size, num_heads, max_k, head_dim), + (0, 2, 1, 3), + dtype=key.dtype, + device=key.device, + ) + grad_v = torch.empty_permuted( + (batch_size, num_heads, max_k, head_dim_v), + (0, 2, 1, 3), + dtype=value.dtype, + device=value.device, + ) + grad_bias = None + if attn_bias is not None and grad_input_mask[3]: + lastDim = attn_bias.size(-1) + lastDimAligned = lastDim if lastDim % 16 == 0 else lastDim + 16 - lastDim % 16 + new_sizes = list(attn_bias.size()) + new_sizes[-1] = lastDimAligned + grad_bias = torch.empty( + new_sizes, dtype=attn_bias.dtype, device=attn_bias.device + ) + grad_bias = grad_bias[..., :lastDim] + + return grad_q, grad_k, grad_v, grad_bias + + +@register_meta( + [ + aten._flash_attention_backward, + ] +) +def meta__flash_attention_backward( + grad_out: Tensor, + query: Tensor, + key: Tensor, + value: Tensor, + out: Tensor, + logsumexp: Tensor, + cum_seq_q: Tensor, + cum_seq_k: Tensor, + max_q: int, + max_k: int, + dropout_p: float, + is_causal: bool, + philox_seed: Tensor, + philox_offset: Tensor, + scale: Optional[float] = None, +): + grad_query = torch.empty_like(query) + grad_key = torch.empty_like(key) + grad_value = torch.empty_like(value) + + return grad_query, grad_key, grad_value + + +@register_meta( + [ + aten._efficient_attention_backward, + ] +) +def meta__efficient_attention_backward( + grad_out: Tensor, + query: Tensor, + key: Tensor, + value: Tensor, + bias: Optional[Tensor], + cu_seqlens_q: Optional[Tensor], + cu_seqlens_k: Optional[Tensor], + max_seqlen_q: int, + max_seqlen_k: int, + logsumexp: Tensor, + dropout_p: float, + philox_seed: Tensor, + philox_offset: Tensor, + custom_mask_type: int, + bias_requires_grad: bool, + scale: Optional[float] = None, + num_splits_key: Optional[int] = None, +): + grad_query = torch.empty_like(query) + grad_key = torch.empty_like(key) + grad_value = torch.empty_like(value) + + if bias is not None: + lastDim = bias.size(-1) + lastDimAligned = lastDim if lastDim % 16 == 0 else lastDim + 16 - lastDim % 16 + new_sizes = list(bias.size()) + new_sizes[-1] = lastDimAligned + grad_bias = torch.empty(new_sizes, dtype=bias.dtype, device=bias.device) 
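+        # grad_bias was allocated with its last dimension rounded up to a multiple of 16;
+        # slice back down to bias's original last-dim width.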
+ grad_bias = grad_bias[..., :lastDim] + else: + grad_bias = torch.empty((), device=query.device) + + return grad_query, grad_key, grad_value, grad_bias + + +@register_meta([aten._scaled_mm.default]) +def meta_scaled_mm( + self: torch.Tensor, + mat2: torch.Tensor, + bias: Optional[torch.Tensor] = None, + out_dtype: Optional[torch.dtype] = None, + scale_a: Optional[torch.Tensor] = None, + scale_b: Optional[torch.Tensor] = None, + scale_result: Optional[torch.Tensor] = None, + use_fast_accum: bool = False, +): + def is_row_major(stride): + return stride[0] > stride[1] and stride[1] == 1 + + def is_col_major(shape, stride): + return stride[0] == 1 and stride[1] == shape[0] + + def is_fp8_type(dtype): + return dtype in ( + torch.float8_e4m3fn, + torch.float8_e5m2, + torch.float8_e4m3fnuz, + torch.float8_e5m2fnuz, + ) + + torch._check( + self.dim() == 2 and mat2.dim() == 2, + lambda: f"Inputs must be 2D but got self.dim()={self.dim()} and mat2.dim()={mat2.dim()}", + ) + torch._check( + is_row_major(self.stride()), + lambda: "self must be row_major", + ) + torch._check( + is_col_major(mat2.shape, mat2.stride()), + lambda: "mat2 must be col_major", + ) + torch._check( + self.size(1) % 16 == 0, + lambda: f"Expected self.size(0) to be divisible by 16, but got self.size(1)={self.size(1)}", + ) + torch._check( + mat2.size(0) % 16 == 0 and mat2.size(1) % 16 == 0, + lambda: f"Expected both dimensions of mat2 to be divisble by 16 but got {mat2.shape}", + ) + torch._check( + is_fp8_type(self.dtype) and is_fp8_type(mat2.dtype), + lambda: f"Expected both inputs to be fp8 types but got self.dtype={self.dtype} and mat2.dtype={mat2.dtype}", + ) + _out_dtype = out_dtype if out_dtype is not None else self.dtype + return torch.empty( + self.size(0), mat2.size(1), dtype=_out_dtype, device=self.device + ), torch.empty((), dtype=torch.float32, device=self.device) + + +@register_meta([aten.scatter_reduce.two, aten.scatter_reduce.two_out]) +@out_wrapper() +def meta_scatter_reduce_two(self, dim, index, src, reduce, include_self=True): + scatter_meta_impl(self, dim, index, src, reduce, use_new_options=True) + return self.new_empty(self.shape) + + +@register_meta(aten.scatter_reduce_.two) +def meta_scatter_reduce__two(self, dim, index, src, reduce, include_self=True): + scatter_meta_impl(self, dim, index, src, reduce, use_new_options=True) + return self + + +@register_meta([aten.multinomial.default, aten.multinomial.out]) +@out_wrapper() +def meta_multinomial(input, num_samples, replacement=False, *, generator=None): + torch._check( + 0 < input.dim() <= 2, + lambda: f"The probabilty distributions dimensions must be 1 or 2, but got {input.dim()}", + ) + if input.dim() == 1: + return torch.empty(num_samples, dtype=torch.long, device=input.device) + return torch.empty( + input.size(0), num_samples, dtype=torch.long, device=input.device + ) + + +def multiply_integers(vs): + r = 1 + for v in vs: + r *= v + return r + + +def upsample_common_check(input_size, output_size, num_spatial_dims): + torch._check( + len(output_size) == num_spatial_dims, + lambda: f"It is expected output_size equals to {num_spatial_dims}, but got size {len(output_size)}", + ) + expected_input_dims = num_spatial_dims + 2 # N, C, ... 
+ torch._check( + len(input_size) == expected_input_dims, + lambda: f"It is expected input_size equals to {expected_input_dims}, but got size {len(input_size)}", + ) + + torch._check( + all(s > 0 for s in input_size[2:]) and all(s > 0 for s in output_size), + lambda: f"Input and output sizes should be greater than 0, but got " + f"input size {input_size} and output size {output_size}", + ) + + nbatch, channels = input_size[:2] + return (nbatch, channels, *output_size) + + +@register_meta( + [aten.upsample_nearest1d.default, aten._upsample_nearest_exact1d.default] +) +def upsample_nearest1d(input, output_size, scales=None): + torch._check( + input.numel() != 0 or multiply_integers(input.size()[1:]), + lambda: f"Non-empty 3D data tensor expected but got a tensor with sizes {input.size()}", + ) + full_output_size = upsample_common_check( + input.size(), output_size, num_spatial_dims=1 + ) + return input.new_empty(full_output_size).to( + memory_format=utils.suggest_memory_format(input) + ) + + +@register_meta( + [aten.upsample_nearest2d.default, aten._upsample_nearest_exact2d.default] +) +def upsample_nearest2d(input, output_size, scales_h=None, scales_w=None): + torch._check( + input.numel() != 0 or multiply_integers(input.size()[1:]), + lambda: f"Non-empty 4D data tensor expected but got a tensor with sizes {input.size()}", + ) + full_output_size = upsample_common_check( + input.size(), output_size, num_spatial_dims=2 + ) + output = input.new_empty(full_output_size) + + # convert output to correct memory format, if necessary + memory_format = utils.suggest_memory_format(input) + + # following "heuristic: only use channels_last path when it's faster than the contiguous path" + _, n_channels, _, _ = input.shape + if input.device.type == "cuda" and n_channels < 4: + memory_format = torch.contiguous_format + + output = output.contiguous(memory_format=memory_format) + + return output + + +@register_meta( + [ + aten.upsample_nearest2d_backward.default, + aten._upsample_nearest_exact2d_backward.default, + ] +) +def upsample_nearest2d_backward( + grad_output: Tensor, + output_size: Sequence[Union[int, torch.SymInt]], + input_size: Sequence[Union[int, torch.SymInt]], + scales_h: Optional[float] = None, + scales_w: Optional[float] = None, +): + full_output_size = upsample_common_check( + input_size, output_size, num_spatial_dims=2 + ) + torch._check( + grad_output.ndim == 4, + lambda: f"Expected grad_output to be a tensor of dimension 4 but got: dimension {grad_output.ndim}", + ) + for i in range(4): + torch._check( + grad_output.size(i) == full_output_size[i], + lambda: ( + f"Expected grad_output to have the same shape as output;" + f" output.size({i}) = {full_output_size[i]}" + f" but got grad_output.size({i}) = {grad_output.size(i)}" + ), + ) + + return grad_output.new_empty(input_size).to( + memory_format=utils.suggest_memory_format(grad_output) + ) # type: ignore[call-overload] + + +@register_meta( + [aten.upsample_nearest3d.default, aten._upsample_nearest_exact3d.default] +) +def upsample_nearest3d(input, output_size, scales_d=None, scales_h=None, scales_w=None): + torch._check( + input.numel() != 0 or multiply_integers(input.size()[1:]), + lambda: f"Non-empty 5D data tensor expected but got a tensor with sizes {input.size()}", + ) + full_output_size = upsample_common_check( + input.size(), output_size, num_spatial_dims=3 + ) + return input.new_empty(full_output_size).to( + memory_format=utils.suggest_memory_format(input) + ) + + +@register_meta( + [ + aten.sort.default, + aten.sort.stable, + 
aten.sort.values, + aten.sort.values_stable, + ] +) +def meta_sort(self, stable=None, dim=-1, descending=False, values=None, indices=None): + v, i = torch.empty_like(self), torch.empty_like(self, dtype=torch.int64) + if values is not None and indices is not None: + assert isinstance(values, TensorLike) + assert isinstance(indices, TensorLike) + # Makes sure values and indices have the same strides. For cases where + # these have different shapes, like (5, 10, 5) and (0) in msort. + out_shape = v.shape + out_stride = v.stride() + values = _maybe_resize_out(values, out_shape) + indices = _maybe_resize_out(indices, out_shape) + values.as_strided_(out_shape, out_stride) + indices.as_strided_(out_shape, out_stride) + _safe_copy_out(copy_from=v, copy_to=values) # type: ignore[arg-type] + _safe_copy_out(copy_from=i, copy_to=indices) # type: ignore[arg-type] + return values, indices + return v, i + + +@register_meta(aten.argsort.stable) +def meta_argsort(self, *, stable, dim=-1, descending=False): + return meta_sort(self, stable=stable, dim=dim, descending=descending)[1] + + +def rnn_cell_checkSizes( + input_gates, hidden_gates, input_bias, hidden_bias, factor, prev_hidden +): + torch._check(input_gates.ndim == 2, lambda: f"{input_gates.ndim} != 2") + torch._check( + input_gates.shape == hidden_gates.shape, + lambda: f"{input_gates.shape} != {hidden_gates.shape}", + ) + gates_size = input_gates.size(1) + if input_bias is not None: + torch._check(input_bias.ndim == 1, lambda: f"{input_bias.ndim} != 1") + torch._check( + input_bias.numel() == gates_size, + lambda: f"{input_bias.numel()} != {gates_size}", + ) + torch._check( + input_bias.shape == hidden_bias.shape, + lambda: f"{input_bias.shape} != {hidden_bias.shape}", + ) + torch._check(prev_hidden.ndim == 2, lambda: f"{prev_hidden.ndim} != 2") + expected_prev_hidden_numel = input_gates.size(0) * gates_size // factor + torch._check( + prev_hidden.numel() == expected_prev_hidden_numel, + lambda: f"{prev_hidden.numel()} != {input_gates.size(0)} * {gates_size} // {factor} (aka {expected_prev_hidden_numel})", + ) + torch._check( + all( + x.device == input_gates.device + for x in [hidden_gates, input_bias, hidden_bias, prev_hidden] + ), + lambda: "expected all inputs to be same device", + ) + + +@register_meta(aten._thnn_fused_lstm_cell.default) +def _thnn_fused_lstm_cell_meta( + input_gates, hidden_gates, cx, input_bias=None, hidden_bias=None +): + rnn_cell_checkSizes(input_gates, hidden_gates, input_bias, hidden_bias, 4, cx) + workspace = torch.empty_like(input_gates, memory_format=torch.contiguous_format) + hy = torch.empty_like(cx, memory_format=torch.contiguous_format) + cy = torch.empty_like(cx, memory_format=torch.contiguous_format) + return (hy, cy, workspace) + + +@register_meta(aten._cudnn_rnn.default) +def _cudnn_rnn( + input, + weight, + weight_stride0, + weight_buf, + hx, + cx, + mode, + hidden_size, + proj_size, + num_layers, + batch_first, + dropout, + train, + bidirectional, + batch_sizes, + dropout_state, +): + is_input_packed = len(batch_sizes) != 0 + if is_input_packed: + seq_length = len(batch_sizes) + mini_batch = batch_sizes[0] + batch_sizes_sum = input.shape[0] + else: + seq_length = input.shape[1] if batch_first else input.shape[0] + mini_batch = input.shape[0] if batch_first else input.shape[1] + batch_sizes_sum = -1 + + num_directions = 2 if bidirectional else 1 + out_size = proj_size if proj_size != 0 else hidden_size + if is_input_packed: + out_shape = [batch_sizes_sum, out_size * num_directions] + else: + out_shape = ( + 
[mini_batch, seq_length, out_size * num_directions] + if batch_first + else [seq_length, mini_batch, out_size * num_directions] + ) + output = input.new_empty(out_shape) + + cell_shape = [num_layers * num_directions, mini_batch, hidden_size] + if cx is None: + cy = torch.empty(0, device=input.device) + else: + cy = cx.new_empty(cell_shape) + + hy = hx.new_empty([num_layers * num_directions, mini_batch, out_size]) + + # TODO: Query cudnnGetRNNTrainingReserveSize (expose to python) + reserve_shape = 0 if train else 0 + reserve = input.new_empty(reserve_shape, dtype=torch.uint8) + + return output, hy, cy, reserve, weight_buf + + +@register_meta(aten.mkldnn_rnn_layer.default) +def mkldnn_rnn_layer( + input, + w0, + w1, + w2, + w3, + hx_, + cx_, + reverse, + batch_sizes, + mode, + hidden_size, + num_layers, + has_biases, + bidirectional, + batch_first, + train, +): + seq_length = input.shape[1] if batch_first else input.shape[0] + mini_batch = input.shape[0] if batch_first else input.shape[1] + output_chanels = hidden_size + out_shape = ( + [mini_batch, seq_length, output_chanels] + if batch_first + else [seq_length, mini_batch, output_chanels] + ) + output = input.new_empty(out_shape) + if hx_ is None: + hy = torch.empty(0, device=input.device) + else: + hy = hx_.new_empty(hx_.shape) + if cx_ is None: + cy = torch.empty(0, device=input.device) + else: + cy = cx_.new_empty(cx_.shape) + workspace = torch.empty(0, device=input.device, dtype=torch.uint8) + return output, hy, cy, workspace + + +def zero_numel_check_dims(self, dim, fn_name): + if self.ndim == 0: + torch._check_index( + dim == 0 or dim == -1, + lambda: f"{fn_name}: Expected reduction dim -1 or 0 for scalar but got {dim}", + ) + else: + torch._check_index( + self.size(dim) != 0, + lambda: f"{fn_name}: Expected reduction dim {dim} to have non-zero size.", + ) + + +# From aten/src/ATen/native/ReduceOps.cpp +def check_argmax_argmin(name, self, dim): + if dim is not None: + dim = maybe_wrap_dim(dim, self.dim()) + zero_numel_check_dims(self, dim, name) + else: + torch._check( + self.numel() != 0, + lambda: f"{name}: Expected reduction dim to be specified for input.numel() == 0.", + ) + + +@register_meta([aten.argmax.default, aten.argmin.default]) +def argmax_argmin_meta(self, dim=None, keepdim=False): + check_argmax_argmin("argmax", self, dim) + dims = utils.reduction_dims(self.shape, (dim,) if dim is not None else None) + shape = _compute_reduction_shape(self, dims, keepdim) + return self.new_empty(shape, dtype=torch.int64) + + +@register_meta(aten.scalar_tensor.default) +def scalar_tensor(s, dtype=None, layout=None, device=None, pin_memory=None): + return torch.empty( + (), dtype=dtype, layout=layout, device=device, pin_memory=pin_memory + ) + + +@register_meta(aten.topk.default) +def topk_meta(self, k, dim=-1, largest=True, sorted=True): + # From aten/src/ATen/native/Sorting.cpp + dim = maybe_wrap_dim(dim, self.dim(), wrap_scalar=True) + torch._check( + k >= 0 and k <= (self.size(dim) if self.dim() > 0 else 1), + lambda: "selected index k out of range", + ) + sliceSize = 1 if self.dim() == 0 else self.size(dim) + torch._check(k >= 0 and k <= sliceSize, lambda: "k not in range for dimension") + + topKSize = list(self.shape) + if len(topKSize) > 0: + topKSize[dim] = k + return self.new_empty(topKSize), self.new_empty(topKSize, dtype=torch.int64) + + +legacy_contiguous_memory_format = torch.contiguous_format + + +# From aten/src/ATen/native/cuda/RNN.cu +def checkLSTMBackwardSizes(grad_hy, grad_cy, cx, cy, workspace): + defined_grad = 
grad_hy if grad_hy is not None else grad_cy + torch._check(defined_grad.dim() == 2, lambda: "") + exp_size = defined_grad.size() + if grad_hy is not None: + torch._check(grad_hy.size() == exp_size, lambda: "") + if grad_cy is not None: + torch._check(grad_cy.size() == exp_size, lambda: "") + torch._check(cx.size() == exp_size, lambda: "") + torch._check(cy.size() == exp_size, lambda: "") + torch._check(workspace.dim() == 2, lambda: "") + torch._check(workspace.numel() == exp_size[0] * exp_size[1] * 4, lambda: "") + + +# From aten/src/ATen/native/cuda/RNN.cu +@register_meta(aten._thnn_fused_lstm_cell_backward_impl.default) +def _thnn_fused_lstm_cell_backward_impl(grad_hy, grad_cy, cx, cy, workspace, has_bias): + if grad_hy is None and grad_cy is None: + return None, None, None + checkLSTMBackwardSizes(grad_hy, grad_cy, cx, cy, workspace) + grad_gates = torch.empty_like( + workspace, memory_format=legacy_contiguous_memory_format + ) + grad_cx = torch.empty_like(cx, memory_format=legacy_contiguous_memory_format) + grad_bias = grad_gates.sum(0, keepdim=False) if has_bias else None + return grad_gates, grad_cx, grad_bias + + +# From aten/src/ATen/native/mps/operations/Linear.mm +@register_meta(aten.linear_backward.default) +def linear_backward(input_, grad_output_, weight_, output_mask): + grad_input = None + grad_weight = None + grad_bias = None + if output_mask[0]: + grad_input = grad_output_.new_empty(input_.size()) + if output_mask[1] or output_mask[2]: + grad_weight = grad_output_.new_empty((grad_output_.size(-1), input_.size(-1))) + grad_bias = grad_output_.new_empty(grad_output_.size(-1)) + return (grad_input, grad_weight, grad_bias) + + +@register_meta(aten.pixel_shuffle.default) +def meta_pixel_shuffle(self, upscale_factor): + assert ( + len(self.shape) > 2 and self.shape[-3] % (upscale_factor * upscale_factor) == 0 + ), f"Invalid input shape for pixel_shuffle: {self.shape} with upscale_factor = {upscale_factor}" + + def is_channels_last(ten): + return torch._prims_common.suggest_memory_format(ten) == torch.channels_last + + def pick_memory_format(): + if is_channels_last(self): + if device_hint(self) == "cuda": + return torch.contiguous_format + else: + return torch.channels_last + elif self.is_contiguous(memory_format=torch.contiguous_format): + return torch.contiguous_format + elif self.is_contiguous(memory_format=torch.preserve_format): + return torch.preserve_format + + C = self.shape[-3] // (upscale_factor * upscale_factor) + Hr = self.shape[-2] * upscale_factor + Wr = self.shape[-1] * upscale_factor + out_shape = (*self.shape[:-3], C, Hr, Wr) + + out = self.new_empty(out_shape) + out = out.to(memory_format=pick_memory_format()) # type: ignore[call-overload] + return out + + +@register_meta(aten.mkldnn_rnn_layer_backward.default) +def mkldnn_rnn_layer_backward( + input, + weight0, + weight1, + weight2, + weight3, + hx_, + cx_tmp, + output, + hy_, + cy_, + grad_output_r_opt, + grad_hy_r_opt, + grad_cy_r_opt, + reverse, + mode, + hidden_size, + num_layers, + has_biases, + train, + bidirectional, + batch_sizes, + batch_first, + workspace, +): + diff_x = input.new_empty(input.shape) + diff_hx = hx_.new_empty(hx_.shape) + diff_cx = cx_tmp.new_empty(cx_tmp.shape) + diff_w1 = weight0.new_empty(weight0.shape) + diff_w2 = weight1.new_empty(weight1.shape) + diff_b = weight2.new_empty(weight2.shape) + return diff_x, diff_w1, diff_w2, diff_b, diff_b, diff_hx, diff_cx + + +@register_meta([aten.bucketize.Tensor, aten.bucketize.Tensor_out]) +@out_wrapper() +def meta_bucketize(self, boundaries, 
*, out_int32=False, right=False): + return torch.empty_like( + self, dtype=torch.int32 if out_int32 else torch.int64 + ).contiguous() + + +@register_meta( + [aten._upsample_bilinear2d_aa.default, aten._upsample_bicubic2d_aa.default] +) +def meta_upsample_bimode2d_aa( + input, output_size, align_corners, scales_h=None, scales_w=None +): + full_output_size = upsample_common_check( + input.size(), output_size, num_spatial_dims=2 + ) + torch._check( + input.numel() != 0 or all(size > 0 for size in input.size()[1:]), + lambda: f"Non-empty 4D data tensor expected but got a tensor with sizes {input.size()}", + ) + return input.new_empty(full_output_size).to( + memory_format=utils.suggest_memory_format(input) + ) + + +# From aten/src/ATen/native/cuda/AmpKernels.cu +@register_meta(aten._amp_foreach_non_finite_check_and_unscale_.default) +def _amp_foreach_non_finite_check_and_unscale_(self, found_inf, inv_scale): + torch._check( + found_inf.numel() == 1, lambda: "found_inf must be a 1-element tensor." + ) + torch._check( + inv_scale.numel() == 1, lambda: "inv_scale must be a 1-element tensor." + ) + torch._check( + found_inf.dtype.is_floating_point, + lambda: "found_inf must be a float tensor.", + ) + torch._check( + inv_scale.dtype.is_floating_point, + lambda: "inv_scale must be a float tensor.", + ) + + +# From aten/src/ATen/native/UnaryOps.cpp +@register_meta([aten.nan_to_num.default, aten.nan_to_num.out]) +@out_wrapper() +def nan_to_num(self, nan=None, posinf=None, neginf=None): + result_size = list(self.size()) + return self.new_empty(result_size) + + +@register_meta(torch.ops.aten.transpose_) +def transpose_(self, dim0, dim1): + assert self.layout not in { + torch.sparse_csr, + torch.sparse_csc, + torch.sparse_bsr, + torch.sparse_bsc, + }, f"torch.transpose_: in-place transposition is not supported for {self.layout} layout" + + ndims = self.ndim + + dim0 = maybe_wrap_dim(dim0, ndims) + dim1 = maybe_wrap_dim(dim1, ndims) + + if dim0 == dim1: + return self + + size = list(self.size()) + stride = list(self.stride()) + + stride[dim0], stride[dim1] = stride[dim1], stride[dim0] + size[dim0], size[dim1] = size[dim1], size[dim0] + + self.as_strided_(size, stride) + return self + + +@register_meta(torch.ops.aten.t_) +def t_(self): + ndims = self.ndim + + if self.is_sparse: + sparse_dim = self.sparse_dim() + dense_dim = self.dense_dim() + assert ( + sparse_dim <= 2 and dense_dim == 0 + ), f"t_ expects a tensor with <= 2 sparse and 0 dense dimensions, but got {sparse_dim} sparse and {dense_dim} dense dimensions" # noqa: B950 + else: + assert ( + self.dim() <= 2 + ), f"t_ expects a tensor with <= 2 dimensions, but self is {ndims}D" + + return transpose_(self, 0, 0 if ndims < 2 else 1) + + +@register_meta(aten.searchsorted) +@out_wrapper() +def meta_searchsorted( + sorted_sequence, self, *, out_int32=False, right=False, side=None, sorter=None +): + dtype = torch.int32 if out_int32 else torch.int64 + if isinstance(self, torch.Tensor): + return torch.empty_like(self, dtype=dtype).contiguous() + else: # Scalar + return torch.empty((), dtype=dtype, device=sorted_sequence.device) + + +def _check_for_unsupported_isin_dtype(dtype): + torch._check( + dtype not in [torch.bool, torch.bfloat16, torch.complex128, torch.complex64], + lambda: f"Unsupported input type encountered for isin(): {dtype}", + ) + + +@register_meta(aten.isin) +@out_wrapper() +def meta_isin(elements, test_elements, *, assume_unique=False, invert=False): + torch._check( + isinstance(elements, Tensor) or isinstance(test_elements, Tensor), + 
lambda: "At least one of elements and test_elements must be a Tensor.", + ) + if not isinstance(elements, Tensor): + elements = torch.tensor(elements, device=test_elements.device) + + if not isinstance(test_elements, Tensor): + test_elements = torch.tensor(test_elements, device=elements.device) + + _check_for_unsupported_isin_dtype(elements.dtype) + _check_for_unsupported_isin_dtype(test_elements.dtype) + return torch.empty_like(elements, dtype=torch.bool) + + +@register_meta(aten.polygamma) +@out_wrapper() +def meta_polygamma(n: int, self: Tensor) -> Tensor: + torch._check(n >= 0, lambda: "polygamma(n, x) does not support negative n.") + _, result_dtype = elementwise_dtypes( + self, + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + ) + return torch.empty_like(self, dtype=result_dtype) + + +def _create_unary_float_meta_func(func): + @register_meta(func) + @out_wrapper() + def _f(x): + return elementwise_meta( + x, type_promotion=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT + ) + + return _f + + +def _create_binary_float_meta_func(func): + @register_meta(func) + @out_wrapper() + def _f(x, y): + return elementwise_meta( + x, y, type_promotion=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT + ) + + return _f + + +_create_unary_float_meta_func(aten.special_airy_ai) +_create_unary_float_meta_func(aten.special_bessel_y0) +_create_unary_float_meta_func(aten.special_bessel_y1) +_create_unary_float_meta_func(aten.special_modified_bessel_i0) +_create_unary_float_meta_func(aten.special_modified_bessel_i1) +_create_unary_float_meta_func(aten.special_modified_bessel_k0) +_create_unary_float_meta_func(aten.special_modified_bessel_k1) +_create_unary_float_meta_func(aten.special_scaled_modified_bessel_k0) +_create_unary_float_meta_func(aten.special_scaled_modified_bessel_k1) + + +_create_binary_float_meta_func(aten.special_chebyshev_polynomial_t) +_create_binary_float_meta_func(aten.special_chebyshev_polynomial_u) +_create_binary_float_meta_func(aten.special_chebyshev_polynomial_v) +_create_binary_float_meta_func(aten.special_chebyshev_polynomial_w) +_create_binary_float_meta_func(aten.special_shifted_chebyshev_polynomial_t) +_create_binary_float_meta_func(aten.special_shifted_chebyshev_polynomial_u) +_create_binary_float_meta_func(aten.special_shifted_chebyshev_polynomial_v) +_create_binary_float_meta_func(aten.special_shifted_chebyshev_polynomial_w) +_create_binary_float_meta_func(aten.special_hermite_polynomial_h) +_create_binary_float_meta_func(aten.special_hermite_polynomial_he) +_create_binary_float_meta_func(aten.special_laguerre_polynomial_l) +_create_binary_float_meta_func(aten.special_legendre_polynomial_p) + + +# We must also trigger meta registrations from PrimTorch ref +# decompositions +import torch._refs +import torch._refs.nn.functional +import torch._refs.special + + +def activate_meta(): + activate_meta_table = {} + + # For a given op, we pick the most specific decomp function from + # global_decomp_table in the precedence order of meta > post_autograd > pre_autograd + for type in ["meta", "post_autograd", "pre_autograd"]: + registry = global_decomposition_table[type] + + for opo in registry: + if opo not in activate_meta_table: + activate_meta_table[opo] = registry[opo] + + for op_overload, fn in activate_meta_table.items(): + # Don't register meta for HigherOrderOp's decomp. + # We can reconsider this in the future, but in general, + # the way you do a meta for a HigherOrderOp is different from + # OpOverload. 
+ if isinstance(op_overload, torch._ops.HigherOrderOperator): + continue + assert isinstance(op_overload, OpOverload) + + op_overload.py_impl(torch._C.DispatchKey.Meta)(fn) + + if torch._C._dispatch_has_kernel_for_dispatch_key( + op_overload.name(), "CompositeImplicitAutograd" + ): + # Internally, we shouldn't be registering meta kernels for any operators that + # have CompositeImplicitAutograd kernels. + # Instead, we should be letting those decompositions run, and writing meta kernels + # only for the base operators. + if op_overload in global_decomposition_table["meta"]: + raise RuntimeError( + f"{op_overload} is a CompositeImplicitAutograd op, we shouldn't " + "register meta function for it. Instead, we should let the decomposition run and write " + "meta kernels for the base operators." + ) + pass + elif op_overload.is_view: + # Attempting to register a python meta kernel for a view operator. + # We shouldn't do this, because the output will report as not having aliased storages. + # All view ops have meta kernels in C++ today, so we should use those instead. + pass + elif op_overload.name() in { + "aten::empty_strided", # causing infinite recursion, test_meta.py + "aten::clone", # causing infinite recursion + "aten::_to_copy", # causing infinite recursion, test_serialization.py -k test_tensor_subclass_getstate_overwrite # noqa: B950 + "aten::copy_", # Exception not raised, test_torch.py -k test_storage_meta_errors_cpu_int64 # noqa: B950 + "aten::constant_pad_nd", # requires_grad mismatch, test_ops.py -k test_fake_crossref_backward_amp_istft_cuda_float32 # noqa: B950 + "aten::rot90", # requires_grad mismatch! test_ops.py -k test_fake_crossref_backward_amp_rot90_cuda_float32 # noqa: B950 + "aten::as_strided_scatter", # requires_grad mismatch, test_ops.py -k test_fake_crossref_backward_no_amp_as_strided_scatter_cuda_float32 # noqa: B950 + }: + pass + else: + if "mkldnn::" in op_overload.name(): + _meta_lib_dont_use_me_use_register_meta_for_mkldnn.impl(op_overload, fn) + elif "mkl::" in op_overload.name(): + _meta_lib_dont_use_me_use_register_meta_for_mkl.impl(op_overload, fn) + elif "onednn::" in op_overload.name(): + _meta_lib_dont_use_me_use_register_meta_for_onednn.impl(op_overload, fn) + elif "quantized::" in op_overload.name(): + _meta_lib_dont_use_me_use_register_meta_for_quantized.impl( + op_overload, fn + ) + else: + _meta_lib_dont_use_me_use_register_meta.impl(op_overload, fn) + + +activate_meta() diff --git a/MLPY/Lib/site-packages/torch/_namedtensor_internals.py b/MLPY/Lib/site-packages/torch/_namedtensor_internals.py new file mode 100644 index 0000000000000000000000000000000000000000..c073e7c12fc8795aa5fa4c375eb91c383a9cd666 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_namedtensor_internals.py @@ -0,0 +1,157 @@ +from collections import OrderedDict + +""" +This file contains helper functions that implement experimental functionality +for named tensors in python. All of these are experimental, unstable, and +subject to change or deletion. +""" + + +def check_serializing_named_tensor(tensor): + if tensor.has_names(): + raise RuntimeError( + "NYI: Named tensors don't support serialization. Please drop " + "names via `tensor = tensor.rename(None)` before serialization." 
+ ) + + +def build_dim_map(tensor): + """Returns a map of { dim: dim_name } where dim is a name if the dim is named + and the dim index otherwise.""" + return OrderedDict( + [(idx if name is None else name, name) for idx, name in enumerate(tensor.names)] + ) + + +def unzip_namedshape(namedshape): + if isinstance(namedshape, OrderedDict): + namedshape = namedshape.items() + if not hasattr(namedshape, "__iter__") and not isinstance(namedshape, tuple): + raise RuntimeError( + f"Expected namedshape to be OrderedDict or iterable of tuples, got: {type(namedshape)}" + ) + if len(namedshape) == 0: + raise RuntimeError("Expected namedshape to non-empty.") + return zip(*namedshape) + + +def namer_api_name(inplace): + if inplace: + return "rename_" + else: + return "rename" + + +def is_ellipsis(item): + return item == Ellipsis or item == "..." + + +def single_ellipsis_index(names, fn_name): + ellipsis_indices = [i for i, name in enumerate(names) if is_ellipsis(name)] + if len(ellipsis_indices) >= 2: + raise RuntimeError( + f"{fn_name}: More than one Ellipsis ('...') found in names (" + f"{names}). This function supports up to one Ellipsis." + ) + if len(ellipsis_indices) == 1: + return ellipsis_indices[0] + return None + + +def expand_single_ellipsis(numel_pre_glob, numel_post_glob, names): + return names[numel_pre_glob : len(names) - numel_post_glob] + + +def replace_ellipsis_by_position(ellipsis_idx, names, tensor_names): + globbed_names = expand_single_ellipsis( + ellipsis_idx, len(names) - ellipsis_idx - 1, tensor_names + ) + return names[:ellipsis_idx] + globbed_names + names[ellipsis_idx + 1 :] + + +def resolve_ellipsis(names, tensor_names, fn_name): + """ + Expands ... inside `names` to be equal to a list of names from `tensor_names`. + """ + ellipsis_idx = single_ellipsis_index(names, fn_name) + if ellipsis_idx is None: + return names + return replace_ellipsis_by_position(ellipsis_idx, names, tensor_names) + + +def update_names_with_list(tensor, names, inplace): + # Special case for tensor.rename(None) + if len(names) == 1 and names[0] is None: + return tensor._update_names(None, inplace) + + return tensor._update_names( + resolve_ellipsis(names, tensor.names, namer_api_name(inplace)), inplace + ) + + +def update_names_with_mapping(tensor, rename_map, inplace): + dim_map = build_dim_map(tensor) + for old_dim in rename_map.keys(): + new_dim = rename_map[old_dim] + if old_dim in dim_map.keys(): + dim_map[old_dim] = new_dim + else: + raise RuntimeError( + f"{namer_api_name(inplace)}: Tried to rename dim '{old_dim}' to dim " + f"{new_dim} in Tensor[{tensor.names}] but dim '{old_dim}' does not exist" + ) + return tensor._update_names(tuple(dim_map.values()), inplace) + + +def update_names(tensor, names, rename_map, inplace): + """There are two usages: + + tensor.rename(*names) returns a view on tensor with named dims `names`. + `names` must be of length `tensor.dim()`; otherwise, if '...' is in `names`, + then it is expanded greedily to be equal to the corresponding names from + `tensor.names`. + + For example, + ``` + >>> # xdoctest: +SKIP + >>> x = torch.empty(2, 3, 5, 7, names=('N', 'C', 'H', 'W')) + >>> x.rename('...', 'height', 'width').names + ('N', 'C', 'height', 'width') + + >>> # xdoctest: +SKIP + >>> x.rename('batch', '...', 'width').names + ('batch', 'C', 'H', 'width') + + ``` + + tensor.rename(**rename_map) returns a view on tensor that has rename dims + as specified in the mapping `rename_map`. 
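+    Every key of `rename_map` must be an existing dimension name in
+    `tensor.names`; attempting to rename a dimension that does not exist
+    raises a RuntimeError.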
+ + For example, + ``` + >>> # xdoctest: +SKIP + >>> x = torch.empty(2, 3, 5, 7, names=('N', 'C', 'H', 'W')) + >>> x.rename(W='width', H='height').names + ('N', 'C', 'height', 'width') + + ``` + + Finally, tensor.rename has an in-place version called tensor.rename_. + """ + has_names = len(names) > 0 + has_rename_pairs = bool(rename_map) + if has_names and has_rename_pairs: + raise RuntimeError( + f"{namer_api_name(inplace)}: This function takes either positional " + f"args or keyword args, but not both. Use tensor.{namer_api_name(inplace)}(*names) " + f"to name dims and tensor.{namer_api_name(inplace)}(**rename_map) to rename " + "dims." + ) + + # Special case for tensor.rename(*[]), which is valid for a 0 dim tensor. + if not has_names and not has_rename_pairs: + return update_names_with_list(tensor, names, inplace) + + if has_names: + return update_names_with_list(tensor, names, inplace) + return update_names_with_mapping(tensor, rename_map, inplace) diff --git a/MLPY/Lib/site-packages/torch/_numpy/__init__.py b/MLPY/Lib/site-packages/torch/_numpy/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5fd4b4e7f7481fc5023cbfe62f10a7881e5257fa --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_numpy/__init__.py @@ -0,0 +1,30 @@ +# mypy: ignore-errors + +from . import fft, linalg, random +from ._dtypes import * # noqa: F403 +from ._funcs import * # noqa: F403 +from ._getlimits import finfo, iinfo +from ._ndarray import ( + array, + asarray, + ascontiguousarray, + can_cast, + from_dlpack, + ndarray, + newaxis, + result_type, +) +from ._ufuncs import * # noqa: F403 +from ._util import AxisError, UFuncTypeError + +# from . import testing + +alltrue = all +sometrue = any + +inf = float("inf") +nan = float("nan") +from math import pi, e # isort: skip + +False_ = False +True_ = True diff --git a/MLPY/Lib/site-packages/torch/_numpy/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f0267877707d8803f23d1817ce9f4eccc6064a36 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_binary_ufuncs_impl.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_binary_ufuncs_impl.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..084c7755d6277bb907a698597f3e3178f2e43c96 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_binary_ufuncs_impl.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_casting_dicts.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_casting_dicts.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2f322d82433813988ad140c1052ce97772c01b21 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_casting_dicts.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_dtypes.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_dtypes.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7dbc2c16828f670246508d56b57db107afad6930 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_dtypes.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_dtypes_impl.cpython-39.pyc 
b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_dtypes_impl.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ab17e2c8921d38e11a130ffae9c1f60fac334e77 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_dtypes_impl.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_funcs.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_funcs.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..afe5c2d6fbc24545f1d15db5c720854d594d2a38 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_funcs.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_funcs_impl.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_funcs_impl.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b423dadbb266156725ef5062425e7dfdd38ac561 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_funcs_impl.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_getlimits.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_getlimits.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9dbd2d6f0643875b6ae053d70144e98da41a4ef6 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_getlimits.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_ndarray.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_ndarray.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..efb9a2073b550fee22ca704e1e3535f21c4e58ba Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_ndarray.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_normalizations.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_normalizations.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a11705d885176995c08a6e4d9fb08531747d6f0d Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_normalizations.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_reductions_impl.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_reductions_impl.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e5a8ac5c96b085427e414ce26dba2dffab409376 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_reductions_impl.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_ufuncs.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_ufuncs.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..33f85bef0d85ba93a1306972ac71da3a7b62703a Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_ufuncs.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_unary_ufuncs_impl.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_unary_ufuncs_impl.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..518ce42ce37d5e38009f8c5fd2b143e5de206d4e Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_unary_ufuncs_impl.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_util.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_util.cpython-39.pyc new 
file mode 100644 index 0000000000000000000000000000000000000000..869e7f810ad8d103128fc928d9b89e73a10a9152 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/_util.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_numpy/__pycache__/fft.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/fft.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..320fabb3fdf81a33a6adfbd0a1fc9ab9e41e9ec6 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/fft.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_numpy/__pycache__/linalg.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/linalg.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4a6bfe97ec013d44375263d02357b5c6c85f184a Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/linalg.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_numpy/__pycache__/random.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/random.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c5f63a73bac41ac2049c1a591221eb822c9940af Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_numpy/__pycache__/random.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_numpy/_binary_ufuncs_impl.py b/MLPY/Lib/site-packages/torch/_numpy/_binary_ufuncs_impl.py new file mode 100644 index 0000000000000000000000000000000000000000..1636dfe34b3cbff5fb5a0be400935a87706d4dab --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_numpy/_binary_ufuncs_impl.py @@ -0,0 +1,86 @@ +# mypy: ignore-errors + +"""Export torch work functions for binary ufuncs, rename/tweak to match numpy. +This listing is further exported to public symbols in the `torch._numpy/_ufuncs.py` module. +""" + +import torch + +from torch import ( # noqa: F401 + add, # noqa: F401 + arctan2, # noqa: F401 + bitwise_and, # noqa: F401 + bitwise_left_shift as left_shift, # noqa: F401 + bitwise_or, # noqa: F401 + bitwise_right_shift as right_shift, # noqa: F401 + bitwise_xor, # noqa: F401 + copysign, # noqa: F401 + divide, # noqa: F401 + eq as equal, # noqa: F401 + float_power, # noqa: F401 + floor_divide, # noqa: F401 + fmax, # noqa: F401 + fmin, # noqa: F401 + fmod, # noqa: F401 + gcd, # noqa: F401 + greater, # noqa: F401 + greater_equal, # noqa: F401 + heaviside, # noqa: F401 + hypot, # noqa: F401 + lcm, # noqa: F401 + ldexp, # noqa: F401 + less, # noqa: F401 + less_equal, # noqa: F401 + logaddexp, # noqa: F401 + logaddexp2, # noqa: F401 + logical_and, # noqa: F401 + logical_or, # noqa: F401 + logical_xor, # noqa: F401 + maximum, # noqa: F401 + minimum, # noqa: F401 + multiply, # noqa: F401 + nextafter, # noqa: F401 + not_equal, # noqa: F401 + pow as power, # noqa: F401 + remainder, # noqa: F401 + remainder as mod, # noqa: F401 + subtract, # noqa: F401 + true_divide, # noqa: F401 +) + +from . import _dtypes_impl, _util + + +# work around torch limitations w.r.t. 
numpy +def matmul(x, y): + # work around: + # - RuntimeError: expected scalar type Int but found Double + # - RuntimeError: "addmm_impl_cpu_" not implemented for 'Bool' + # - RuntimeError: "addmm_impl_cpu_" not implemented for 'Half' + dtype = _dtypes_impl.result_type_impl(x, y) + is_bool = dtype == torch.bool + is_half = (x.dtype == torch.float16 or y.dtype == torch.float16) and ( + x.is_cpu or y.is_cpu + ) + + work_dtype = dtype + if is_bool: + work_dtype = torch.uint8 + if is_half: + work_dtype = torch.float32 + + x = _util.cast_if_needed(x, work_dtype) + y = _util.cast_if_needed(y, work_dtype) + + result = torch.matmul(x, y) + + if work_dtype != dtype: + result = result.to(dtype) + + return result + + +# a stub implementation of divmod, should be improved after +# https://github.com/pytorch/pytorch/issues/90820 is fixed in pytorch +def divmod(x, y): + return x // y, x % y diff --git a/MLPY/Lib/site-packages/torch/_numpy/_casting_dicts.py b/MLPY/Lib/site-packages/torch/_numpy/_casting_dicts.py new file mode 100644 index 0000000000000000000000000000000000000000..eb4176512321f696b02e9d77336e1cb3c769c2b4 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_numpy/_casting_dicts.py @@ -0,0 +1,881 @@ +# mypy: ignore-errors + +import torch + +# These two dicts are autogenerated with autogen/gen_dtypes.py, +# using numpy version 1.23.5. + +_can_cast_dict = { + "no": { + torch.float16: { + torch.float16: True, + torch.float32: False, + torch.float64: False, + torch.complex64: False, + torch.complex128: False, + torch.uint8: False, + torch.int8: False, + torch.int16: False, + torch.int32: False, + torch.int64: False, + torch.bool: False, + }, + torch.float32: { + torch.float16: False, + torch.float32: True, + torch.float64: False, + torch.complex64: False, + torch.complex128: False, + torch.uint8: False, + torch.int8: False, + torch.int16: False, + torch.int32: False, + torch.int64: False, + torch.bool: False, + }, + torch.float64: { + torch.float16: False, + torch.float32: False, + torch.float64: True, + torch.complex64: False, + torch.complex128: False, + torch.uint8: False, + torch.int8: False, + torch.int16: False, + torch.int32: False, + torch.int64: False, + torch.bool: False, + }, + torch.complex64: { + torch.float16: False, + torch.float32: False, + torch.float64: False, + torch.complex64: True, + torch.complex128: False, + torch.uint8: False, + torch.int8: False, + torch.int16: False, + torch.int32: False, + torch.int64: False, + torch.bool: False, + }, + torch.complex128: { + torch.float16: False, + torch.float32: False, + torch.float64: False, + torch.complex64: False, + torch.complex128: True, + torch.uint8: False, + torch.int8: False, + torch.int16: False, + torch.int32: False, + torch.int64: False, + torch.bool: False, + }, + torch.uint8: { + torch.float16: False, + torch.float32: False, + torch.float64: False, + torch.complex64: False, + torch.complex128: False, + torch.uint8: True, + torch.int8: False, + torch.int16: False, + torch.int32: False, + torch.int64: False, + torch.bool: False, + }, + torch.int8: { + torch.float16: False, + torch.float32: False, + torch.float64: False, + torch.complex64: False, + torch.complex128: False, + torch.uint8: False, + torch.int8: True, + torch.int16: False, + torch.int32: False, + torch.int64: False, + torch.bool: False, + }, + torch.int16: { + torch.float16: False, + torch.float32: False, + torch.float64: False, + torch.complex64: False, + torch.complex128: False, + torch.uint8: False, + torch.int8: False, + torch.int16: True, + torch.int32: 
False, + torch.int64: False, + torch.bool: False, + }, + torch.int32: { + torch.float16: False, + torch.float32: False, + torch.float64: False, + torch.complex64: False, + torch.complex128: False, + torch.uint8: False, + torch.int8: False, + torch.int16: False, + torch.int32: True, + torch.int64: False, + torch.bool: False, + }, + torch.int64: { + torch.float16: False, + torch.float32: False, + torch.float64: False, + torch.complex64: False, + torch.complex128: False, + torch.uint8: False, + torch.int8: False, + torch.int16: False, + torch.int32: False, + torch.int64: True, + torch.bool: False, + }, + torch.bool: { + torch.float16: False, + torch.float32: False, + torch.float64: False, + torch.complex64: False, + torch.complex128: False, + torch.uint8: False, + torch.int8: False, + torch.int16: False, + torch.int32: False, + torch.int64: False, + torch.bool: True, + }, + }, + "equiv": { + torch.float16: { + torch.float16: True, + torch.float32: False, + torch.float64: False, + torch.complex64: False, + torch.complex128: False, + torch.uint8: False, + torch.int8: False, + torch.int16: False, + torch.int32: False, + torch.int64: False, + torch.bool: False, + }, + torch.float32: { + torch.float16: False, + torch.float32: True, + torch.float64: False, + torch.complex64: False, + torch.complex128: False, + torch.uint8: False, + torch.int8: False, + torch.int16: False, + torch.int32: False, + torch.int64: False, + torch.bool: False, + }, + torch.float64: { + torch.float16: False, + torch.float32: False, + torch.float64: True, + torch.complex64: False, + torch.complex128: False, + torch.uint8: False, + torch.int8: False, + torch.int16: False, + torch.int32: False, + torch.int64: False, + torch.bool: False, + }, + torch.complex64: { + torch.float16: False, + torch.float32: False, + torch.float64: False, + torch.complex64: True, + torch.complex128: False, + torch.uint8: False, + torch.int8: False, + torch.int16: False, + torch.int32: False, + torch.int64: False, + torch.bool: False, + }, + torch.complex128: { + torch.float16: False, + torch.float32: False, + torch.float64: False, + torch.complex64: False, + torch.complex128: True, + torch.uint8: False, + torch.int8: False, + torch.int16: False, + torch.int32: False, + torch.int64: False, + torch.bool: False, + }, + torch.uint8: { + torch.float16: False, + torch.float32: False, + torch.float64: False, + torch.complex64: False, + torch.complex128: False, + torch.uint8: True, + torch.int8: False, + torch.int16: False, + torch.int32: False, + torch.int64: False, + torch.bool: False, + }, + torch.int8: { + torch.float16: False, + torch.float32: False, + torch.float64: False, + torch.complex64: False, + torch.complex128: False, + torch.uint8: False, + torch.int8: True, + torch.int16: False, + torch.int32: False, + torch.int64: False, + torch.bool: False, + }, + torch.int16: { + torch.float16: False, + torch.float32: False, + torch.float64: False, + torch.complex64: False, + torch.complex128: False, + torch.uint8: False, + torch.int8: False, + torch.int16: True, + torch.int32: False, + torch.int64: False, + torch.bool: False, + }, + torch.int32: { + torch.float16: False, + torch.float32: False, + torch.float64: False, + torch.complex64: False, + torch.complex128: False, + torch.uint8: False, + torch.int8: False, + torch.int16: False, + torch.int32: True, + torch.int64: False, + torch.bool: False, + }, + torch.int64: { + torch.float16: False, + torch.float32: False, + torch.float64: False, + torch.complex64: False, + torch.complex128: False, + torch.uint8: 
False, + torch.int8: False, + torch.int16: False, + torch.int32: False, + torch.int64: True, + torch.bool: False, + }, + torch.bool: { + torch.float16: False, + torch.float32: False, + torch.float64: False, + torch.complex64: False, + torch.complex128: False, + torch.uint8: False, + torch.int8: False, + torch.int16: False, + torch.int32: False, + torch.int64: False, + torch.bool: True, + }, + }, + "safe": { + torch.float16: { + torch.float16: True, + torch.float32: True, + torch.float64: True, + torch.complex64: True, + torch.complex128: True, + torch.uint8: False, + torch.int8: False, + torch.int16: False, + torch.int32: False, + torch.int64: False, + torch.bool: False, + }, + torch.float32: { + torch.float16: False, + torch.float32: True, + torch.float64: True, + torch.complex64: True, + torch.complex128: True, + torch.uint8: False, + torch.int8: False, + torch.int16: False, + torch.int32: False, + torch.int64: False, + torch.bool: False, + }, + torch.float64: { + torch.float16: False, + torch.float32: False, + torch.float64: True, + torch.complex64: False, + torch.complex128: True, + torch.uint8: False, + torch.int8: False, + torch.int16: False, + torch.int32: False, + torch.int64: False, + torch.bool: False, + }, + torch.complex64: { + torch.float16: False, + torch.float32: False, + torch.float64: False, + torch.complex64: True, + torch.complex128: True, + torch.uint8: False, + torch.int8: False, + torch.int16: False, + torch.int32: False, + torch.int64: False, + torch.bool: False, + }, + torch.complex128: { + torch.float16: False, + torch.float32: False, + torch.float64: False, + torch.complex64: False, + torch.complex128: True, + torch.uint8: False, + torch.int8: False, + torch.int16: False, + torch.int32: False, + torch.int64: False, + torch.bool: False, + }, + torch.uint8: { + torch.float16: True, + torch.float32: True, + torch.float64: True, + torch.complex64: True, + torch.complex128: True, + torch.uint8: True, + torch.int8: False, + torch.int16: True, + torch.int32: True, + torch.int64: True, + torch.bool: False, + }, + torch.int8: { + torch.float16: True, + torch.float32: True, + torch.float64: True, + torch.complex64: True, + torch.complex128: True, + torch.uint8: False, + torch.int8: True, + torch.int16: True, + torch.int32: True, + torch.int64: True, + torch.bool: False, + }, + torch.int16: { + torch.float16: False, + torch.float32: True, + torch.float64: True, + torch.complex64: True, + torch.complex128: True, + torch.uint8: False, + torch.int8: False, + torch.int16: True, + torch.int32: True, + torch.int64: True, + torch.bool: False, + }, + torch.int32: { + torch.float16: False, + torch.float32: False, + torch.float64: True, + torch.complex64: False, + torch.complex128: True, + torch.uint8: False, + torch.int8: False, + torch.int16: False, + torch.int32: True, + torch.int64: True, + torch.bool: False, + }, + torch.int64: { + torch.float16: False, + torch.float32: False, + torch.float64: True, + torch.complex64: False, + torch.complex128: True, + torch.uint8: False, + torch.int8: False, + torch.int16: False, + torch.int32: False, + torch.int64: True, + torch.bool: False, + }, + torch.bool: { + torch.float16: True, + torch.float32: True, + torch.float64: True, + torch.complex64: True, + torch.complex128: True, + torch.uint8: True, + torch.int8: True, + torch.int16: True, + torch.int32: True, + torch.int64: True, + torch.bool: True, + }, + }, + "same_kind": { + torch.float16: { + torch.float16: True, + torch.float32: True, + torch.float64: True, + torch.complex64: True, + 
torch.complex128: True, + torch.uint8: False, + torch.int8: False, + torch.int16: False, + torch.int32: False, + torch.int64: False, + torch.bool: False, + }, + torch.float32: { + torch.float16: True, + torch.float32: True, + torch.float64: True, + torch.complex64: True, + torch.complex128: True, + torch.uint8: False, + torch.int8: False, + torch.int16: False, + torch.int32: False, + torch.int64: False, + torch.bool: False, + }, + torch.float64: { + torch.float16: True, + torch.float32: True, + torch.float64: True, + torch.complex64: True, + torch.complex128: True, + torch.uint8: False, + torch.int8: False, + torch.int16: False, + torch.int32: False, + torch.int64: False, + torch.bool: False, + }, + torch.complex64: { + torch.float16: False, + torch.float32: False, + torch.float64: False, + torch.complex64: True, + torch.complex128: True, + torch.uint8: False, + torch.int8: False, + torch.int16: False, + torch.int32: False, + torch.int64: False, + torch.bool: False, + }, + torch.complex128: { + torch.float16: False, + torch.float32: False, + torch.float64: False, + torch.complex64: True, + torch.complex128: True, + torch.uint8: False, + torch.int8: False, + torch.int16: False, + torch.int32: False, + torch.int64: False, + torch.bool: False, + }, + torch.uint8: { + torch.float16: True, + torch.float32: True, + torch.float64: True, + torch.complex64: True, + torch.complex128: True, + torch.uint8: True, + torch.int8: True, + torch.int16: True, + torch.int32: True, + torch.int64: True, + torch.bool: False, + }, + torch.int8: { + torch.float16: True, + torch.float32: True, + torch.float64: True, + torch.complex64: True, + torch.complex128: True, + torch.uint8: False, + torch.int8: True, + torch.int16: True, + torch.int32: True, + torch.int64: True, + torch.bool: False, + }, + torch.int16: { + torch.float16: True, + torch.float32: True, + torch.float64: True, + torch.complex64: True, + torch.complex128: True, + torch.uint8: False, + torch.int8: True, + torch.int16: True, + torch.int32: True, + torch.int64: True, + torch.bool: False, + }, + torch.int32: { + torch.float16: True, + torch.float32: True, + torch.float64: True, + torch.complex64: True, + torch.complex128: True, + torch.uint8: False, + torch.int8: True, + torch.int16: True, + torch.int32: True, + torch.int64: True, + torch.bool: False, + }, + torch.int64: { + torch.float16: True, + torch.float32: True, + torch.float64: True, + torch.complex64: True, + torch.complex128: True, + torch.uint8: False, + torch.int8: True, + torch.int16: True, + torch.int32: True, + torch.int64: True, + torch.bool: False, + }, + torch.bool: { + torch.float16: True, + torch.float32: True, + torch.float64: True, + torch.complex64: True, + torch.complex128: True, + torch.uint8: True, + torch.int8: True, + torch.int16: True, + torch.int32: True, + torch.int64: True, + torch.bool: True, + }, + }, + "unsafe": { + torch.float16: { + torch.float16: True, + torch.float32: True, + torch.float64: True, + torch.complex64: True, + torch.complex128: True, + torch.uint8: True, + torch.int8: True, + torch.int16: True, + torch.int32: True, + torch.int64: True, + torch.bool: True, + }, + torch.float32: { + torch.float16: True, + torch.float32: True, + torch.float64: True, + torch.complex64: True, + torch.complex128: True, + torch.uint8: True, + torch.int8: True, + torch.int16: True, + torch.int32: True, + torch.int64: True, + torch.bool: True, + }, + torch.float64: { + torch.float16: True, + torch.float32: True, + torch.float64: True, + torch.complex64: True, + 
torch.complex128: True, + torch.uint8: True, + torch.int8: True, + torch.int16: True, + torch.int32: True, + torch.int64: True, + torch.bool: True, + }, + torch.complex64: { + torch.float16: True, + torch.float32: True, + torch.float64: True, + torch.complex64: True, + torch.complex128: True, + torch.uint8: True, + torch.int8: True, + torch.int16: True, + torch.int32: True, + torch.int64: True, + torch.bool: True, + }, + torch.complex128: { + torch.float16: True, + torch.float32: True, + torch.float64: True, + torch.complex64: True, + torch.complex128: True, + torch.uint8: True, + torch.int8: True, + torch.int16: True, + torch.int32: True, + torch.int64: True, + torch.bool: True, + }, + torch.uint8: { + torch.float16: True, + torch.float32: True, + torch.float64: True, + torch.complex64: True, + torch.complex128: True, + torch.uint8: True, + torch.int8: True, + torch.int16: True, + torch.int32: True, + torch.int64: True, + torch.bool: True, + }, + torch.int8: { + torch.float16: True, + torch.float32: True, + torch.float64: True, + torch.complex64: True, + torch.complex128: True, + torch.uint8: True, + torch.int8: True, + torch.int16: True, + torch.int32: True, + torch.int64: True, + torch.bool: True, + }, + torch.int16: { + torch.float16: True, + torch.float32: True, + torch.float64: True, + torch.complex64: True, + torch.complex128: True, + torch.uint8: True, + torch.int8: True, + torch.int16: True, + torch.int32: True, + torch.int64: True, + torch.bool: True, + }, + torch.int32: { + torch.float16: True, + torch.float32: True, + torch.float64: True, + torch.complex64: True, + torch.complex128: True, + torch.uint8: True, + torch.int8: True, + torch.int16: True, + torch.int32: True, + torch.int64: True, + torch.bool: True, + }, + torch.int64: { + torch.float16: True, + torch.float32: True, + torch.float64: True, + torch.complex64: True, + torch.complex128: True, + torch.uint8: True, + torch.int8: True, + torch.int16: True, + torch.int32: True, + torch.int64: True, + torch.bool: True, + }, + torch.bool: { + torch.float16: True, + torch.float32: True, + torch.float64: True, + torch.complex64: True, + torch.complex128: True, + torch.uint8: True, + torch.int8: True, + torch.int16: True, + torch.int32: True, + torch.int64: True, + torch.bool: True, + }, + }, +} + + +_result_type_dict = { + torch.float16: { + torch.float16: torch.float16, + torch.float32: torch.float32, + torch.float64: torch.float64, + torch.complex64: torch.complex64, + torch.complex128: torch.complex128, + torch.uint8: torch.float16, + torch.int8: torch.float16, + torch.int16: torch.float32, + torch.int32: torch.float64, + torch.int64: torch.float64, + torch.bool: torch.float16, + }, + torch.float32: { + torch.float16: torch.float32, + torch.float32: torch.float32, + torch.float64: torch.float64, + torch.complex64: torch.complex64, + torch.complex128: torch.complex128, + torch.uint8: torch.float32, + torch.int8: torch.float32, + torch.int16: torch.float32, + torch.int32: torch.float64, + torch.int64: torch.float64, + torch.bool: torch.float32, + }, + torch.float64: { + torch.float16: torch.float64, + torch.float32: torch.float64, + torch.float64: torch.float64, + torch.complex64: torch.complex128, + torch.complex128: torch.complex128, + torch.uint8: torch.float64, + torch.int8: torch.float64, + torch.int16: torch.float64, + torch.int32: torch.float64, + torch.int64: torch.float64, + torch.bool: torch.float64, + }, + torch.complex64: { + torch.float16: torch.complex64, + torch.float32: torch.complex64, + torch.float64: 
torch.complex128, + torch.complex64: torch.complex64, + torch.complex128: torch.complex128, + torch.uint8: torch.complex64, + torch.int8: torch.complex64, + torch.int16: torch.complex64, + torch.int32: torch.complex128, + torch.int64: torch.complex128, + torch.bool: torch.complex64, + }, + torch.complex128: { + torch.float16: torch.complex128, + torch.float32: torch.complex128, + torch.float64: torch.complex128, + torch.complex64: torch.complex128, + torch.complex128: torch.complex128, + torch.uint8: torch.complex128, + torch.int8: torch.complex128, + torch.int16: torch.complex128, + torch.int32: torch.complex128, + torch.int64: torch.complex128, + torch.bool: torch.complex128, + }, + torch.uint8: { + torch.float16: torch.float16, + torch.float32: torch.float32, + torch.float64: torch.float64, + torch.complex64: torch.complex64, + torch.complex128: torch.complex128, + torch.uint8: torch.uint8, + torch.int8: torch.int16, + torch.int16: torch.int16, + torch.int32: torch.int32, + torch.int64: torch.int64, + torch.bool: torch.uint8, + }, + torch.int8: { + torch.float16: torch.float16, + torch.float32: torch.float32, + torch.float64: torch.float64, + torch.complex64: torch.complex64, + torch.complex128: torch.complex128, + torch.uint8: torch.int16, + torch.int8: torch.int8, + torch.int16: torch.int16, + torch.int32: torch.int32, + torch.int64: torch.int64, + torch.bool: torch.int8, + }, + torch.int16: { + torch.float16: torch.float32, + torch.float32: torch.float32, + torch.float64: torch.float64, + torch.complex64: torch.complex64, + torch.complex128: torch.complex128, + torch.uint8: torch.int16, + torch.int8: torch.int16, + torch.int16: torch.int16, + torch.int32: torch.int32, + torch.int64: torch.int64, + torch.bool: torch.int16, + }, + torch.int32: { + torch.float16: torch.float64, + torch.float32: torch.float64, + torch.float64: torch.float64, + torch.complex64: torch.complex128, + torch.complex128: torch.complex128, + torch.uint8: torch.int32, + torch.int8: torch.int32, + torch.int16: torch.int32, + torch.int32: torch.int32, + torch.int64: torch.int64, + torch.bool: torch.int32, + }, + torch.int64: { + torch.float16: torch.float64, + torch.float32: torch.float64, + torch.float64: torch.float64, + torch.complex64: torch.complex128, + torch.complex128: torch.complex128, + torch.uint8: torch.int64, + torch.int8: torch.int64, + torch.int16: torch.int64, + torch.int32: torch.int64, + torch.int64: torch.int64, + torch.bool: torch.int64, + }, + torch.bool: { + torch.float16: torch.float16, + torch.float32: torch.float32, + torch.float64: torch.float64, + torch.complex64: torch.complex64, + torch.complex128: torch.complex128, + torch.uint8: torch.uint8, + torch.int8: torch.int8, + torch.int16: torch.int16, + torch.int32: torch.int32, + torch.int64: torch.int64, + torch.bool: torch.bool, + }, +} diff --git a/MLPY/Lib/site-packages/torch/_numpy/_dtypes.py b/MLPY/Lib/site-packages/torch/_numpy/_dtypes.py new file mode 100644 index 0000000000000000000000000000000000000000..4195f7a32d010ff91b5aa6ae80be589673f06ae4 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_numpy/_dtypes.py @@ -0,0 +1,434 @@ +# mypy: ignore-errors + +""" Define analogs of numpy dtypes supported by pytorch. +Define the scalar types and supported dtypes and numpy <--> torch dtype mappings. +""" +import builtins + +import torch + +from . 
import _dtypes_impl + + +# ### Scalar types ### + + +class generic: + name = "generic" + + def __new__(cls, value): + # NumPy scalars are modelled as 0-D arrays + # so a call to np.float32(4) produces a 0-D array. + + from ._ndarray import asarray, ndarray + + if isinstance(value, str) and value in ["inf", "nan"]: + value = {"inf": torch.inf, "nan": torch.nan}[value] + + if isinstance(value, ndarray): + return value.astype(cls) + else: + return asarray(value, dtype=cls) + + +################## +# abstract types # +################## + + +class number(generic): + name = "number" + + +class integer(number): + name = "integer" + + +class inexact(number): + name = "inexact" + + +class signedinteger(integer): + name = "signedinteger" + + +class unsignedinteger(integer): + name = "unsignedinteger" + + +class floating(inexact): + name = "floating" + + +class complexfloating(inexact): + name = "complexfloating" + + +_abstract_dtypes = [ + "generic", + "number", + "integer", + "signedinteger", + "unsignedinteger", + "inexact", + "floating", + "complexfloating", +] + +# ##### concrete types + +# signed integers + + +class int8(signedinteger): + name = "int8" + typecode = "b" + torch_dtype = torch.int8 + + +class int16(signedinteger): + name = "int16" + typecode = "h" + torch_dtype = torch.int16 + + +class int32(signedinteger): + name = "int32" + typecode = "i" + torch_dtype = torch.int32 + + +class int64(signedinteger): + name = "int64" + typecode = "l" + torch_dtype = torch.int64 + + +# unsigned integers + + +class uint8(unsignedinteger): + name = "uint8" + typecode = "B" + torch_dtype = torch.uint8 + + +# floating point + + +class float16(floating): + name = "float16" + typecode = "e" + torch_dtype = torch.float16 + + +class float32(floating): + name = "float32" + typecode = "f" + torch_dtype = torch.float32 + + +class float64(floating): + name = "float64" + typecode = "d" + torch_dtype = torch.float64 + + +class complex64(complexfloating): + name = "complex64" + typecode = "F" + torch_dtype = torch.complex64 + + +class complex128(complexfloating): + name = "complex128" + typecode = "D" + torch_dtype = torch.complex128 + + +class bool_(generic): + name = "bool_" + typecode = "?" + torch_dtype = torch.bool + + +# name aliases +_name_aliases = { + "intp": int64, + "int_": int64, + "intc": int32, + "byte": int8, + "short": int16, + "longlong": int64, # XXX: is this correct? + "ubyte": uint8, + "half": float16, + "single": float32, + "double": float64, + "float_": float64, + "csingle": complex64, + "singlecomplex": complex64, + "cdouble": complex128, + "cfloat": complex128, + "complex_": complex128, +} +# We register float_ = float32 and so on +for name, obj in _name_aliases.items(): + vars()[name] = obj + + +# Replicate this NumPy-defined way of grouping scalar types, +# cf tests/core/test_scalar_methods.py +sctypes = { + "int": [int8, int16, int32, int64], + "uint": [uint8], + "float": [float16, float32, float64], + "complex": [complex64, complex128], + "others": [bool_], +} + + +# Support mappings/functions + +_names = {st.name: st for cat in sctypes for st in sctypes[cat]} +_typecodes = {st.typecode: st for cat in sctypes for st in sctypes[cat]} +_torch_dtypes = {st.torch_dtype: st for cat in sctypes for st in sctypes[cat]} + + +_aliases = { + "u1": uint8, + "i1": int8, + "i2": int16, + "i4": int32, + "i8": int64, + "b": int8, # XXX: srsly? 
+ "f2": float16, + "f4": float32, + "f8": float64, + "c8": complex64, + "c16": complex128, + # numpy-specific trailing underscore + "bool_": bool_, +} + + +_python_types = { + int: int64, + float: float64, + complex: complex128, + builtins.bool: bool_, + # also allow stringified names of python types + int.__name__: int64, + float.__name__: float64, + complex.__name__: complex128, + builtins.bool.__name__: bool_, +} + + +def sctype_from_string(s): + """Normalize a string value: a type 'name' or a typecode or a width alias.""" + if s in _names: + return _names[s] + if s in _name_aliases.keys(): + return _name_aliases[s] + if s in _typecodes: + return _typecodes[s] + if s in _aliases: + return _aliases[s] + if s in _python_types: + return _python_types[s] + raise TypeError(f"data type {s!r} not understood") + + +def sctype_from_torch_dtype(torch_dtype): + return _torch_dtypes[torch_dtype] + + +# ### DTypes. ### + + +def dtype(arg): + if arg is None: + arg = _dtypes_impl.default_dtypes().float_dtype + return DType(arg) + + +class DType: + def __init__(self, arg): + # a pytorch object? + if isinstance(arg, torch.dtype): + sctype = _torch_dtypes[arg] + elif isinstance(arg, torch.Tensor): + sctype = _torch_dtypes[arg.dtype] + # a scalar type? + elif issubclass_(arg, generic): + sctype = arg + # a dtype already? + elif isinstance(arg, DType): + sctype = arg._scalar_type + # a has a right attribute? + elif hasattr(arg, "dtype"): + sctype = arg.dtype._scalar_type + else: + sctype = sctype_from_string(arg) + self._scalar_type = sctype + + @property + def name(self): + return self._scalar_type.name + + @property + def type(self): + return self._scalar_type + + @property + def kind(self): + # https://numpy.org/doc/stable/reference/generated/numpy.dtype.kind.html + return _torch_dtypes[self.torch_dtype].name[0] + + @property + def typecode(self): + return self._scalar_type.typecode + + def __eq__(self, other): + if isinstance(other, DType): + return self._scalar_type == other._scalar_type + try: + other_instance = DType(other) + except TypeError: + return False + return self._scalar_type == other_instance._scalar_type + + @property + def torch_dtype(self): + return self._scalar_type.torch_dtype + + def __hash__(self): + return hash(self._scalar_type.name) + + def __repr__(self): + return f'dtype("{self.name}")' + + __str__ = __repr__ + + @property + def itemsize(self): + elem = self.type(1) + return elem.tensor.element_size() + + def __getstate__(self): + return self._scalar_type + + def __setstate__(self, value): + self._scalar_type = value + + +typecodes = { + "All": "efdFDBbhil?", + "AllFloat": "efdFD", + "AllInteger": "Bbhil", + "Integer": "bhil", + "UnsignedInteger": "B", + "Float": "efd", + "Complex": "FD", +} + + +# ### Defaults and dtype discovery + + +def set_default_dtype(fp_dtype="numpy", int_dtype="numpy"): + """Set the (global) defaults for fp, complex, and int dtypes. + + The complex dtype is inferred from the float (fp) dtype. It has + a width at least twice the width of the float dtype, + i.e., it's complex128 for float64 and complex64 for float32. + + Parameters + ---------- + fp_dtype + Allowed values are "numpy", "pytorch" or dtype_like things which + can be converted into a DType instance. + Default is "numpy" (i.e. float64). + int_dtype + Allowed values are "numpy", "pytorch" or dtype_like things which + can be converted into a DType instance. + Default is "numpy" (i.e. int64). 
+
+    Returns
+    -------
+    The old default dtype state: a namedtuple with attributes ``float_dtype``,
+    ``complex_dtype`` and ``int_dtype``. These attributes store *pytorch*
+    dtypes.
+
+    Notes
+    -----
+    This function has a side effect: it sets the global state with the provided dtypes.
+
+    The complex dtype has a bit width of at least twice the width of the float
+    dtype, i.e. it's complex128 for float64 and complex64 for float32.
+
+    """
+    if fp_dtype not in ["numpy", "pytorch"]:
+        fp_dtype = dtype(fp_dtype).torch_dtype
+    if int_dtype not in ["numpy", "pytorch"]:
+        int_dtype = dtype(int_dtype).torch_dtype
+
+    if fp_dtype == "numpy":
+        float_dtype = torch.float64
+    elif fp_dtype == "pytorch":
+        float_dtype = torch.float32
+    else:
+        float_dtype = fp_dtype
+
+    complex_dtype = {
+        torch.float64: torch.complex128,
+        torch.float32: torch.complex64,
+        torch.float16: torch.complex64,
+    }[float_dtype]
+
+    if int_dtype in ["numpy", "pytorch"]:
+        int_dtype = torch.int64
+    else:
+        int_dtype = int_dtype
+
+    new_defaults = _dtypes_impl.DefaultDTypes(
+        float_dtype=float_dtype, complex_dtype=complex_dtype, int_dtype=int_dtype
+    )
+
+    # set the new global state and return the old state
+    old_defaults = _dtypes_impl.default_dtypes
+    _dtypes_impl._default_dtypes = new_defaults
+    return old_defaults
+
+
+def issubclass_(arg, klass):
+    try:
+        return issubclass(arg, klass)
+    except TypeError:
+        return False
+
+
+def issubdtype(arg1, arg2):
+    # cf https://github.com/numpy/numpy/blob/v1.24.0/numpy/core/numerictypes.py#L356-L420
+
+    # We also accept strings even if NumPy doesn't as dtypes are serialized as their
+    # string representation in dynamo's graph
+    def str_to_abstract(t):
+        if isinstance(t, str) and t in _abstract_dtypes:
+            return globals()[t]
+        return t
+
+    arg1 = str_to_abstract(arg1)
+    arg2 = str_to_abstract(arg2)
+
+    if not issubclass_(arg1, generic):
+        arg1 = dtype(arg1).type
+    if not issubclass_(arg2, generic):
+        arg2 = dtype(arg2).type
+    return issubclass(arg1, arg2)
+
+
+__all__ = ["dtype", "DType", "typecodes", "issubdtype", "set_default_dtype", "sctypes"]
+__all__ += list(_names.keys())  # noqa: PLE0605
+__all__ += list(_name_aliases.keys())  # noqa: PLE0605
+__all__ += _abstract_dtypes  # noqa: PLE0605
diff --git a/MLPY/Lib/site-packages/torch/_numpy/_dtypes_impl.py b/MLPY/Lib/site-packages/torch/_numpy/_dtypes_impl.py
new file mode 100644
index 0000000000000000000000000000000000000000..834e585c89e719f93460075fbbc2aaeca4b487a0
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/_numpy/_dtypes_impl.py
@@ -0,0 +1,216 @@
+# mypy: ignore-errors
+
+"""Dtypes/scalar type implementations with torch dtypes.
+
+Here `dtype` is always a torch.dtype, this module knows nothing about
+scalar types, wrapper dtypes or anything like that. PyTorch only.
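+
+A rough usage sketch of the promotion helper defined below (behavior follows
+the autogenerated tables in `_casting_dicts.py`):
+
+    >>> # xdoctest: +SKIP
+    >>> result_type_impl(torch.ones(2, dtype=torch.int8), torch.ones(2, dtype=torch.uint8))
+    torch.int16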
+""" +from collections import namedtuple + +import torch + +# defaults : mimic NumPy, allow user control +DefaultDTypes = namedtuple( + "DefaultDTypes", ["float_dtype", "complex_dtype", "int_dtype"] +) + +# a global state +# We set it the first time we call default_dtypes() to avoid importing +# torch._dynamo.config and create a circular reference +_default_dtypes = None + + +def default_dtypes(): + global _default_dtypes + if _default_dtypes is None: + import torch._dynamo.config as config + + _default_dtypes = DefaultDTypes( + float_dtype=getattr(torch, config.numpy_default_float), + complex_dtype=getattr(torch, config.numpy_default_complex), + int_dtype=getattr(torch, config.numpy_default_int), + ) + assert isinstance(_default_dtypes.float_dtype, torch.dtype) + assert isinstance(_default_dtypes.complex_dtype, torch.dtype) + assert isinstance(_default_dtypes.int_dtype, torch.dtype) + return _default_dtypes + + +def get_default_dtype_for(dtype): + """Default scalar type given sctype category.""" + if dtype == torch.bool: + return dtype + if dtype.is_complex: + return default_dtypes().complex_dtype + if dtype.is_floating_point: + return default_dtypes().float_dtype + # else, it must be (some) integer + return default_dtypes().int_dtype + + +from . import _casting_dicts as _cd + + +def can_cast_impl(from_torch_dtype, to_torch_dtype, casting): + return _cd._can_cast_dict[casting][from_torch_dtype][to_torch_dtype] + + +def result_type_impl(*tensors): + # NB: torch dtypes here + dtyp = tensors[0].dtype + if len(tensors) == 1: + return dtyp + + for curr in tensors[1:]: + dtyp = _cd._result_type_dict[dtyp][curr.dtype] + + return dtyp + + +def python_type_for_torch(dtyp): + """Get a python scalar type a torch dtype""" + if dtyp.is_floating_point: + typ = float + elif dtyp.is_complex: + typ = complex + elif dtyp == torch.bool: + typ = bool + else: + typ = int + return typ + + +# ### NEP 50 helpers ### + +_SCALAR_TYPES = (int, bool, float, complex) + +_SCALAR_AND_SYMBOLIC_TYPES = ( + *_SCALAR_TYPES, + torch.SymInt, + torch.SymFloat, + torch.SymBool, +) + +_NEP50_FUNCS_TENSOR_ONLY = ( + "minimum", + "maximum", + "logaddexp", + "logaddexp2", + "lcm", + "gcd", + "hypot", + "heaviside", + "fmod", + "fmin", + "fmax", + "copysign", + "arctan2", +) + + +def is_scalar(x): + return isinstance(x, _SCALAR_TYPES) + + +def is_scalar_or_symbolic(x): + return isinstance(x, _SCALAR_AND_SYMBOLIC_TYPES) + + +def _dtype_for_scalar(py_type): + return { + bool: torch.bool, + torch.SymBool: torch.bool, + int: torch.int64, + torch.SymInt: torch.int64, + float: torch.float64, + torch.SymFloat: torch.float64, + complex: torch.complex128, + }[py_type] + + +def _dtype_for_scalar_or_tensor(x): + return x.dtype if isinstance(x, torch.Tensor) else _dtype_for_scalar(type(x)) + + +def is_float_or_fp_tensor(x): + return _dtype_for_scalar_or_tensor(x).is_floating_point + + +def is_complex_or_complex_tensor(x): + return _dtype_for_scalar_or_tensor(x).is_complex + + +def _category(dtype): + return { + torch.bool: 0, + torch.SymBool: 0, + # int + torch.uint8: 1, + torch.int8: 1, + torch.int16: 1, + torch.int32: 1, + torch.int64: 1, + torch.SymInt: 1, + # float + torch.float16: 2, + torch.float32: 2, + torch.float64: 2, + torch.SymFloat: 2, + # complex + torch.complex64: 3, + torch.complex128: 3, + }[dtype] + + +def nep50_to_tensors(x1, x2, handle_weaks, function_name): + """If either of inputs is a python scalar, type-promote with NEP 50.""" + + def to_tensor(scalar, dtype=None): + if dtype is None: + dtype = 
_dtype_for_scalar(type(scalar)) + dtype = get_default_dtype_for(dtype) + return torch.as_tensor(scalar, dtype=dtype) + + x1_is_weak = not isinstance(x1, torch.Tensor) + x2_is_weak = not isinstance(x2, torch.Tensor) + if not handle_weaks or (x1_is_weak and x2_is_weak): + x1 = to_tensor(x1) if x1_is_weak else x1 + x2 = to_tensor(x2) if x2_is_weak else x2 + return x1, x2 + + # scalar tensor: NEP 50 + assert x1_is_weak != x2_is_weak + + weak, not_weak = (x1, x2) if x1_is_weak else (x2, x1) + + # find the dtype for the weak's type + weak_dtype = _dtype_for_scalar(type(weak)) + + cat_weak = _category(weak_dtype) + cat_not_weak = _category(not_weak.dtype) + + dt = not_weak.dtype if cat_weak <= cat_not_weak else None + + # special-case complex + float32 + if weak_dtype.is_complex and not_weak.dtype == torch.float32: + dt = torch.complex64 + + # detect overflows: in PyTorch, uint8(-1) wraps around to 255, + # while NEP50 mandates an exception. + # + # Note that we only check if each element of the binop overflows, + # not the result. Consider, e.g. `uint8(100) + 200`. Operands are OK + # in uint8, but the result overflows and wrap around 255. + # Numpy emits a RuntimeWarning, PyTorch does not, and we do not either. + if cat_weak == 1 and cat_not_weak == 1: + # integers + iinfo = torch.iinfo(not_weak.dtype) + if not (iinfo.min <= weak <= iinfo.max): + raise OverflowError( + f"Python integer {weak} out of bounds for {not_weak.dtype}" + ) + if weak_dtype != dt or function_name in _NEP50_FUNCS_TENSOR_ONLY: + # finally, can make `weak` into a 0D tensor, if both parameters are required to be tensor. + weak = to_tensor(weak, dt) + + return (weak, not_weak) if x1_is_weak else (not_weak, weak) diff --git a/MLPY/Lib/site-packages/torch/_numpy/_funcs.py b/MLPY/Lib/site-packages/torch/_numpy/_funcs.py new file mode 100644 index 0000000000000000000000000000000000000000..a7d46fef08b33c8c4fd826a8bc22e4242893645d --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_numpy/_funcs.py @@ -0,0 +1,75 @@ +# mypy: ignore-errors + +import inspect +import itertools + +from . import _funcs_impl, _reductions_impl +from ._normalizations import normalizer + +# _funcs_impl.py contains functions which mimic NumPy's eponymous equivalents, +# and consume/return PyTorch tensors/dtypes. +# They are also type annotated. +# Pull these functions from _funcs_impl and decorate them with @normalizer, which +# - Converts any input `np.ndarray`, `torch._numpy.ndarray`, list of lists, Python scalars, etc into a `torch.Tensor`. 
+# - Maps NumPy dtypes to PyTorch dtypes +# - If the input to the `axis` kwarg is an ndarray, it maps it into a tuple +# - Implements the semantics for the `out=` arg +# - Wraps back the outputs into `torch._numpy.ndarrays` + + +def _public_functions(mod): + def is_public_function(f): + return inspect.isfunction(f) and not f.__name__.startswith("_") + + return inspect.getmembers(mod, is_public_function) + + +# We fill in __all__ in the loop below +__all__ = [] + +# decorate implementer functions with argument normalizers and export to the top namespace +for name, func in itertools.chain( + _public_functions(_funcs_impl), _public_functions(_reductions_impl) +): + if name in ["percentile", "quantile", "median"]: + decorated = normalizer(func, promote_scalar_result=True) + elif name == "einsum": + # normalized manually + decorated = func + else: + decorated = normalizer(func) + + decorated.__qualname__ = name + decorated.__name__ = name + vars()[name] = decorated + __all__.append(name) + + +""" +Vendored objects from numpy.lib.index_tricks +""" + + +class IndexExpression: + """ + Written by Konrad Hinsen + last revision: 1999-7-23 + + Cosmetic changes by T. Oliphant 2001 + """ + + def __init__(self, maketuple): + self.maketuple = maketuple + + def __getitem__(self, item): + if self.maketuple and not isinstance(item, tuple): + return (item,) + else: + return item + + +index_exp = IndexExpression(maketuple=True) +s_ = IndexExpression(maketuple=False) + + +__all__ += ["index_exp", "s_"] diff --git a/MLPY/Lib/site-packages/torch/_numpy/_funcs_impl.py b/MLPY/Lib/site-packages/torch/_numpy/_funcs_impl.py new file mode 100644 index 0000000000000000000000000000000000000000..d470076ee51163c7c1d26ce1bfc3d6139f9f6fa0 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_numpy/_funcs_impl.py @@ -0,0 +1,2053 @@ +# mypy: ignore-errors + +"""A thin pytorch / numpy compat layer. + +Things imported from here have numpy-compatible signatures but operate on +pytorch tensors. +""" +# Contents of this module ends up in the main namespace via _funcs.py +# where type annotations are used in conjunction with the @normalizer decorator. +from __future__ import annotations + +import builtins +import itertools +import operator +from typing import Optional, Sequence + +import torch + +from . 
import _dtypes_impl, _util +from ._normalizations import ( + ArrayLike, + ArrayLikeOrScalar, + CastingModes, + DTypeLike, + NDArray, + NotImplementedType, + OutArray, +) + + +def copy( + a: ArrayLike, order: NotImplementedType = "K", subok: NotImplementedType = False +): + return a.clone() + + +def copyto( + dst: NDArray, + src: ArrayLike, + casting: Optional[CastingModes] = "same_kind", + where: NotImplementedType = None, +): + (src,) = _util.typecast_tensors((src,), dst.dtype, casting=casting) + dst.copy_(src) + + +def atleast_1d(*arys: ArrayLike): + res = torch.atleast_1d(*arys) + if isinstance(res, tuple): + return list(res) + else: + return res + + +def atleast_2d(*arys: ArrayLike): + res = torch.atleast_2d(*arys) + if isinstance(res, tuple): + return list(res) + else: + return res + + +def atleast_3d(*arys: ArrayLike): + res = torch.atleast_3d(*arys) + if isinstance(res, tuple): + return list(res) + else: + return res + + +def _concat_check(tup, dtype, out): + if tup == (): + raise ValueError("need at least one array to concatenate") + + """Check inputs in concatenate et al.""" + if out is not None and dtype is not None: + # mimic numpy + raise TypeError( + "concatenate() only takes `out` or `dtype` as an " + "argument, but both were provided." + ) + + +def _concat_cast_helper(tensors, out=None, dtype=None, casting="same_kind"): + """Figure out dtypes, cast if necessary.""" + + if out is not None or dtype is not None: + # figure out the type of the inputs and outputs + out_dtype = out.dtype.torch_dtype if dtype is None else dtype + else: + out_dtype = _dtypes_impl.result_type_impl(*tensors) + + # cast input arrays if necessary; do not broadcast them agains `out` + tensors = _util.typecast_tensors(tensors, out_dtype, casting) + + return tensors + + +def _concatenate( + tensors, axis=0, out=None, dtype=None, casting: Optional[CastingModes] = "same_kind" +): + # pure torch implementation, used below and in cov/corrcoef below + tensors, axis = _util.axis_none_flatten(*tensors, axis=axis) + tensors = _concat_cast_helper(tensors, out, dtype, casting) + return torch.cat(tensors, axis) + + +def concatenate( + ar_tuple: Sequence[ArrayLike], + axis=0, + out: Optional[OutArray] = None, + dtype: Optional[DTypeLike] = None, + casting: Optional[CastingModes] = "same_kind", +): + _concat_check(ar_tuple, dtype, out=out) + result = _concatenate(ar_tuple, axis=axis, out=out, dtype=dtype, casting=casting) + return result + + +def vstack( + tup: Sequence[ArrayLike], + *, + dtype: Optional[DTypeLike] = None, + casting: Optional[CastingModes] = "same_kind", +): + _concat_check(tup, dtype, out=None) + tensors = _concat_cast_helper(tup, dtype=dtype, casting=casting) + return torch.vstack(tensors) + + +row_stack = vstack + + +def hstack( + tup: Sequence[ArrayLike], + *, + dtype: Optional[DTypeLike] = None, + casting: Optional[CastingModes] = "same_kind", +): + _concat_check(tup, dtype, out=None) + tensors = _concat_cast_helper(tup, dtype=dtype, casting=casting) + return torch.hstack(tensors) + + +def dstack( + tup: Sequence[ArrayLike], + *, + dtype: Optional[DTypeLike] = None, + casting: Optional[CastingModes] = "same_kind", +): + # XXX: in numpy 1.24 dstack does not have dtype and casting keywords + # but {h,v}stack do. Hence add them here for consistency. 
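+    # A rough sketch of the extra keywords (assuming `import torch._numpy as np`):
+    #   >>> # xdoctest: +SKIP
+    #   >>> np.dstack((np.ones(3, dtype="int16"), np.zeros(3)), dtype="float32").dtype
+    #   dtype("float32")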
+ _concat_check(tup, dtype, out=None) + tensors = _concat_cast_helper(tup, dtype=dtype, casting=casting) + return torch.dstack(tensors) + + +def column_stack( + tup: Sequence[ArrayLike], + *, + dtype: Optional[DTypeLike] = None, + casting: Optional[CastingModes] = "same_kind", +): + # XXX: in numpy 1.24 column_stack does not have dtype and casting keywords + # but row_stack does. (because row_stack is an alias for vstack, really). + # Hence add these keywords here for consistency. + _concat_check(tup, dtype, out=None) + tensors = _concat_cast_helper(tup, dtype=dtype, casting=casting) + return torch.column_stack(tensors) + + +def stack( + arrays: Sequence[ArrayLike], + axis=0, + out: Optional[OutArray] = None, + *, + dtype: Optional[DTypeLike] = None, + casting: Optional[CastingModes] = "same_kind", +): + _concat_check(arrays, dtype, out=out) + + tensors = _concat_cast_helper(arrays, dtype=dtype, casting=casting) + result_ndim = tensors[0].ndim + 1 + axis = _util.normalize_axis_index(axis, result_ndim) + return torch.stack(tensors, axis=axis) + + +def append(arr: ArrayLike, values: ArrayLike, axis=None): + if axis is None: + if arr.ndim != 1: + arr = arr.flatten() + values = values.flatten() + axis = arr.ndim - 1 + return _concatenate((arr, values), axis=axis) + + +# ### split ### + + +def _split_helper(tensor, indices_or_sections, axis, strict=False): + if isinstance(indices_or_sections, int): + return _split_helper_int(tensor, indices_or_sections, axis, strict) + elif isinstance(indices_or_sections, (list, tuple)): + # NB: drop split=..., it only applies to split_helper_int + return _split_helper_list(tensor, list(indices_or_sections), axis) + else: + raise TypeError("split_helper: ", type(indices_or_sections)) + + +def _split_helper_int(tensor, indices_or_sections, axis, strict=False): + if not isinstance(indices_or_sections, int): + raise NotImplementedError("split: indices_or_sections") + + axis = _util.normalize_axis_index(axis, tensor.ndim) + + # numpy: l%n chunks of size (l//n + 1), the rest are sized l//n + l, n = tensor.shape[axis], indices_or_sections + + if n <= 0: + raise ValueError() + + if l % n == 0: + num, sz = n, l // n + lst = [sz] * num + else: + if strict: + raise ValueError("array split does not result in an equal division") + + num, sz = l % n, l // n + 1 + lst = [sz] * num + + lst += [sz - 1] * (n - num) + + return torch.split(tensor, lst, axis) + + +def _split_helper_list(tensor, indices_or_sections, axis): + if not isinstance(indices_or_sections, list): + raise NotImplementedError("split: indices_or_sections: list") + # numpy expects indices, while torch expects lengths of sections + # also, numpy appends zero-size arrays for indices above the shape[axis] + lst = [x for x in indices_or_sections if x <= tensor.shape[axis]] + num_extra = len(indices_or_sections) - len(lst) + + lst.append(tensor.shape[axis]) + lst = [ + lst[0], + ] + [a - b for a, b in zip(lst[1:], lst[:-1])] + lst += [0] * num_extra + + return torch.split(tensor, lst, axis) + + +def array_split(ary: ArrayLike, indices_or_sections, axis=0): + return _split_helper(ary, indices_or_sections, axis) + + +def split(ary: ArrayLike, indices_or_sections, axis=0): + return _split_helper(ary, indices_or_sections, axis, strict=True) + + +def hsplit(ary: ArrayLike, indices_or_sections): + if ary.ndim == 0: + raise ValueError("hsplit only works on arrays of 1 or more dimensions") + axis = 1 if ary.ndim > 1 else 0 + return _split_helper(ary, indices_or_sections, axis, strict=True) + + +def vsplit(ary: ArrayLike, 
indices_or_sections): + if ary.ndim < 2: + raise ValueError("vsplit only works on arrays of 2 or more dimensions") + return _split_helper(ary, indices_or_sections, 0, strict=True) + + +def dsplit(ary: ArrayLike, indices_or_sections): + if ary.ndim < 3: + raise ValueError("dsplit only works on arrays of 3 or more dimensions") + return _split_helper(ary, indices_or_sections, 2, strict=True) + + +def kron(a: ArrayLike, b: ArrayLike): + return torch.kron(a, b) + + +def vander(x: ArrayLike, N=None, increasing=False): + return torch.vander(x, N, increasing) + + +# ### linspace, geomspace, logspace and arange ### + + +def linspace( + start: ArrayLike, + stop: ArrayLike, + num=50, + endpoint=True, + retstep=False, + dtype: Optional[DTypeLike] = None, + axis=0, +): + if axis != 0 or retstep or not endpoint: + raise NotImplementedError + if dtype is None: + dtype = _dtypes_impl.default_dtypes().float_dtype + # XXX: raises TypeError if start or stop are not scalars + return torch.linspace(start, stop, num, dtype=dtype) + + +def geomspace( + start: ArrayLike, + stop: ArrayLike, + num=50, + endpoint=True, + dtype: Optional[DTypeLike] = None, + axis=0, +): + if axis != 0 or not endpoint: + raise NotImplementedError + base = torch.pow(stop / start, 1.0 / (num - 1)) + logbase = torch.log(base) + return torch.logspace( + torch.log(start) / logbase, + torch.log(stop) / logbase, + num, + base=base, + ) + + +def logspace( + start, + stop, + num=50, + endpoint=True, + base=10.0, + dtype: Optional[DTypeLike] = None, + axis=0, +): + if axis != 0 or not endpoint: + raise NotImplementedError + return torch.logspace(start, stop, num, base=base, dtype=dtype) + + +def arange( + start: Optional[ArrayLikeOrScalar] = None, + stop: Optional[ArrayLikeOrScalar] = None, + step: Optional[ArrayLikeOrScalar] = 1, + dtype: Optional[DTypeLike] = None, + *, + like: NotImplementedType = None, +): + if step == 0: + raise ZeroDivisionError + if stop is None and start is None: + raise TypeError + if stop is None: + # XXX: this breaks if start is passed as a kwarg: + # arange(start=4) should raise (no stop) but doesn't + start, stop = 0, start + if start is None: + start = 0 + + # the dtype of the result + if dtype is None: + dtype = ( + _dtypes_impl.default_dtypes().float_dtype + if any(_dtypes_impl.is_float_or_fp_tensor(x) for x in (start, stop, step)) + else _dtypes_impl.default_dtypes().int_dtype + ) + work_dtype = torch.float64 if dtype.is_complex else dtype + + # RuntimeError: "lt_cpu" not implemented for 'ComplexFloat'. Fall back to eager. + if any(_dtypes_impl.is_complex_or_complex_tensor(x) for x in (start, stop, step)): + raise NotImplementedError + + if (step > 0 and start > stop) or (step < 0 and start < stop): + # empty range + return torch.empty(0, dtype=dtype) + + result = torch.arange(start, stop, step, dtype=work_dtype) + result = _util.cast_if_needed(result, dtype) + return result + + +# ### zeros/ones/empty/full ### + + +def empty( + shape, + dtype: Optional[DTypeLike] = None, + order: NotImplementedType = "C", + *, + like: NotImplementedType = None, +): + if dtype is None: + dtype = _dtypes_impl.default_dtypes().float_dtype + return torch.empty(shape, dtype=dtype) + + +# NB: *_like functions deliberately deviate from numpy: it has subok=True +# as the default; we set subok=False and raise on anything else. 
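+#
+# Illustrative consequence (assuming the normalizer wrapping applied in _funcs.py):
+#   empty_like(x)              -> works; subclassing is simply ignored
+#   empty_like(x, subok=True)  -> expected to raise NotImplementedError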
+ + +def empty_like( + prototype: ArrayLike, + dtype: Optional[DTypeLike] = None, + order: NotImplementedType = "K", + subok: NotImplementedType = False, + shape=None, +): + result = torch.empty_like(prototype, dtype=dtype) + if shape is not None: + result = result.reshape(shape) + return result + + +def full( + shape, + fill_value: ArrayLike, + dtype: Optional[DTypeLike] = None, + order: NotImplementedType = "C", + *, + like: NotImplementedType = None, +): + if isinstance(shape, int): + shape = (shape,) + if dtype is None: + dtype = fill_value.dtype + if not isinstance(shape, (tuple, list)): + shape = (shape,) + return torch.full(shape, fill_value, dtype=dtype) + + +def full_like( + a: ArrayLike, + fill_value, + dtype: Optional[DTypeLike] = None, + order: NotImplementedType = "K", + subok: NotImplementedType = False, + shape=None, +): + # XXX: fill_value broadcasts + result = torch.full_like(a, fill_value, dtype=dtype) + if shape is not None: + result = result.reshape(shape) + return result + + +def ones( + shape, + dtype: Optional[DTypeLike] = None, + order: NotImplementedType = "C", + *, + like: NotImplementedType = None, +): + if dtype is None: + dtype = _dtypes_impl.default_dtypes().float_dtype + return torch.ones(shape, dtype=dtype) + + +def ones_like( + a: ArrayLike, + dtype: Optional[DTypeLike] = None, + order: NotImplementedType = "K", + subok: NotImplementedType = False, + shape=None, +): + result = torch.ones_like(a, dtype=dtype) + if shape is not None: + result = result.reshape(shape) + return result + + +def zeros( + shape, + dtype: Optional[DTypeLike] = None, + order: NotImplementedType = "C", + *, + like: NotImplementedType = None, +): + if dtype is None: + dtype = _dtypes_impl.default_dtypes().float_dtype + return torch.zeros(shape, dtype=dtype) + + +def zeros_like( + a: ArrayLike, + dtype: Optional[DTypeLike] = None, + order: NotImplementedType = "K", + subok: NotImplementedType = False, + shape=None, +): + result = torch.zeros_like(a, dtype=dtype) + if shape is not None: + result = result.reshape(shape) + return result + + +# ### cov & corrcoef ### + + +def _xy_helper_corrcoef(x_tensor, y_tensor=None, rowvar=True): + """Prepare inputs for cov and corrcoef.""" + + # https://github.com/numpy/numpy/blob/v1.24.0/numpy/lib/function_base.py#L2636 + if y_tensor is not None: + # make sure x and y are at least 2D + ndim_extra = 2 - x_tensor.ndim + if ndim_extra > 0: + x_tensor = x_tensor.view((1,) * ndim_extra + x_tensor.shape) + if not rowvar and x_tensor.shape[0] != 1: + x_tensor = x_tensor.mT + x_tensor = x_tensor.clone() + + ndim_extra = 2 - y_tensor.ndim + if ndim_extra > 0: + y_tensor = y_tensor.view((1,) * ndim_extra + y_tensor.shape) + if not rowvar and y_tensor.shape[0] != 1: + y_tensor = y_tensor.mT + y_tensor = y_tensor.clone() + + x_tensor = _concatenate((x_tensor, y_tensor), axis=0) + + return x_tensor + + +def corrcoef( + x: ArrayLike, + y: Optional[ArrayLike] = None, + rowvar=True, + bias=None, + ddof=None, + *, + dtype: Optional[DTypeLike] = None, +): + if bias is not None or ddof is not None: + # deprecated in NumPy + raise NotImplementedError + xy_tensor = _xy_helper_corrcoef(x, y, rowvar) + + is_half = (xy_tensor.dtype == torch.float16) and xy_tensor.is_cpu + if is_half: + # work around torch's "addmm_impl_cpu_" not implemented for 'Half'" + dtype = torch.float32 + + xy_tensor = _util.cast_if_needed(xy_tensor, dtype) + result = torch.corrcoef(xy_tensor) + + if is_half: + result = result.to(torch.float16) + + return result + + +def cov( + m: ArrayLike, + y: 
Optional[ArrayLike] = None, + rowvar=True, + bias=False, + ddof=None, + fweights: Optional[ArrayLike] = None, + aweights: Optional[ArrayLike] = None, + *, + dtype: Optional[DTypeLike] = None, +): + m = _xy_helper_corrcoef(m, y, rowvar) + + if ddof is None: + ddof = 1 if bias == 0 else 0 + + is_half = (m.dtype == torch.float16) and m.is_cpu + if is_half: + # work around torch's "addmm_impl_cpu_" not implemented for 'Half'" + dtype = torch.float32 + + m = _util.cast_if_needed(m, dtype) + result = torch.cov(m, correction=ddof, aweights=aweights, fweights=fweights) + + if is_half: + result = result.to(torch.float16) + + return result + + +def _conv_corr_impl(a, v, mode): + dt = _dtypes_impl.result_type_impl(a, v) + a = _util.cast_if_needed(a, dt) + v = _util.cast_if_needed(v, dt) + + padding = v.shape[0] - 1 if mode == "full" else mode + + if padding == "same" and v.shape[0] % 2 == 0: + # UserWarning: Using padding='same' with even kernel lengths and odd + # dilation may require a zero-padded copy of the input be created + # (Triggered internally at pytorch/aten/src/ATen/native/Convolution.cpp:1010.) + raise NotImplementedError("mode='same' and even-length weights") + + # NumPy only accepts 1D arrays; PyTorch requires 2D inputs and 3D weights + aa = a[None, :] + vv = v[None, None, :] + + result = torch.nn.functional.conv1d(aa, vv, padding=padding) + + # torch returns a 2D result, numpy returns a 1D array + return result[0, :] + + +def convolve(a: ArrayLike, v: ArrayLike, mode="full"): + # NumPy: if v is longer than a, the arrays are swapped before computation + if a.shape[0] < v.shape[0]: + a, v = v, a + + # flip the weights since numpy does and torch does not + v = torch.flip(v, (0,)) + + return _conv_corr_impl(a, v, mode) + + +def correlate(a: ArrayLike, v: ArrayLike, mode="valid"): + v = torch.conj_physical(v) + return _conv_corr_impl(a, v, mode) + + +# ### logic & element selection ### + + +def bincount(x: ArrayLike, /, weights: Optional[ArrayLike] = None, minlength=0): + if x.numel() == 0: + # edge case allowed by numpy + x = x.new_empty(0, dtype=int) + + int_dtype = _dtypes_impl.default_dtypes().int_dtype + (x,) = _util.typecast_tensors((x,), int_dtype, casting="safe") + + return torch.bincount(x, weights, minlength) + + +def where( + condition: ArrayLike, + x: Optional[ArrayLikeOrScalar] = None, + y: Optional[ArrayLikeOrScalar] = None, + /, +): + if (x is None) != (y is None): + raise ValueError("either both or neither of x and y should be given") + + if condition.dtype != torch.bool: + condition = condition.to(torch.bool) + + if x is None and y is None: + result = torch.where(condition) + else: + result = torch.where(condition, x, y) + return result + + +# ###### module-level queries of object properties + + +def ndim(a: ArrayLike): + return a.ndim + + +def shape(a: ArrayLike): + return tuple(a.shape) + + +def size(a: ArrayLike, axis=None): + if axis is None: + return a.numel() + else: + return a.shape[axis] + + +# ###### shape manipulations and indexing + + +def expand_dims(a: ArrayLike, axis): + shape = _util.expand_shape(a.shape, axis) + return a.view(shape) # never copies + + +def flip(m: ArrayLike, axis=None): + # XXX: semantic difference: np.flip returns a view, torch.flip copies + if axis is None: + axis = tuple(range(m.ndim)) + else: + axis = _util.normalize_axis_tuple(axis, m.ndim) + return torch.flip(m, axis) + + +def flipud(m: ArrayLike): + return torch.flipud(m) + + +def fliplr(m: ArrayLike): + return torch.fliplr(m) + + +def rot90(m: ArrayLike, k=1, axes=(0, 1)): + axes = 
_util.normalize_axis_tuple(axes, m.ndim) + return torch.rot90(m, k, axes) + + +# ### broadcasting and indices ### + + +def broadcast_to(array: ArrayLike, shape, subok: NotImplementedType = False): + return torch.broadcast_to(array, size=shape) + + +# This is a function from tuples to tuples, so we just reuse it +from torch import broadcast_shapes + + +def broadcast_arrays(*args: ArrayLike, subok: NotImplementedType = False): + return torch.broadcast_tensors(*args) + + +def meshgrid(*xi: ArrayLike, copy=True, sparse=False, indexing="xy"): + ndim = len(xi) + + if indexing not in ["xy", "ij"]: + raise ValueError("Valid values for `indexing` are 'xy' and 'ij'.") + + s0 = (1,) * ndim + output = [x.reshape(s0[:i] + (-1,) + s0[i + 1 :]) for i, x in enumerate(xi)] + + if indexing == "xy" and ndim > 1: + # switch first and second axis + output[0] = output[0].reshape((1, -1) + s0[2:]) + output[1] = output[1].reshape((-1, 1) + s0[2:]) + + if not sparse: + # Return the full N-D matrix (not only the 1-D vector) + output = torch.broadcast_tensors(*output) + + if copy: + output = [x.clone() for x in output] + + return list(output) # match numpy, return a list + + +def indices(dimensions, dtype: Optional[DTypeLike] = int, sparse=False): + # https://github.com/numpy/numpy/blob/v1.24.0/numpy/core/numeric.py#L1691-L1791 + dimensions = tuple(dimensions) + N = len(dimensions) + shape = (1,) * N + if sparse: + res = tuple() + else: + res = torch.empty((N,) + dimensions, dtype=dtype) + for i, dim in enumerate(dimensions): + idx = torch.arange(dim, dtype=dtype).reshape( + shape[:i] + (dim,) + shape[i + 1 :] + ) + if sparse: + res = res + (idx,) + else: + res[i] = idx + return res + + +# ### tri*-something ### + + +def tril(m: ArrayLike, k=0): + return torch.tril(m, k) + + +def triu(m: ArrayLike, k=0): + return torch.triu(m, k) + + +def tril_indices(n, k=0, m=None): + if m is None: + m = n + return torch.tril_indices(n, m, offset=k) + + +def triu_indices(n, k=0, m=None): + if m is None: + m = n + return torch.triu_indices(n, m, offset=k) + + +def tril_indices_from(arr: ArrayLike, k=0): + if arr.ndim != 2: + raise ValueError("input array must be 2-d") + # Return a tensor rather than a tuple to avoid a graphbreak + return torch.tril_indices(arr.shape[0], arr.shape[1], offset=k) + + +def triu_indices_from(arr: ArrayLike, k=0): + if arr.ndim != 2: + raise ValueError("input array must be 2-d") + # Return a tensor rather than a tuple to avoid a graphbreak + return torch.triu_indices(arr.shape[0], arr.shape[1], offset=k) + + +def tri( + N, + M=None, + k=0, + dtype: Optional[DTypeLike] = None, + *, + like: NotImplementedType = None, +): + if M is None: + M = N + tensor = torch.ones((N, M), dtype=dtype) + return torch.tril(tensor, diagonal=k) + + +# ### equality, equivalence, allclose ### + + +def isclose(a: ArrayLike, b: ArrayLike, rtol=1.0e-5, atol=1.0e-8, equal_nan=False): + dtype = _dtypes_impl.result_type_impl(a, b) + a = _util.cast_if_needed(a, dtype) + b = _util.cast_if_needed(b, dtype) + return torch.isclose(a, b, rtol=rtol, atol=atol, equal_nan=equal_nan) + + +def allclose(a: ArrayLike, b: ArrayLike, rtol=1e-05, atol=1e-08, equal_nan=False): + dtype = _dtypes_impl.result_type_impl(a, b) + a = _util.cast_if_needed(a, dtype) + b = _util.cast_if_needed(b, dtype) + return torch.allclose(a, b, rtol=rtol, atol=atol, equal_nan=equal_nan) + + +def _tensor_equal(a1, a2, equal_nan=False): + # Implementation of array_equal/array_equiv. 
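+    # Shapes must match exactly here; array_equiv (below) broadcasts its inputs
+    # before delegating to this helper, while array_equal does not.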
+ if a1.shape != a2.shape: + return False + cond = a1 == a2 + if equal_nan: + cond = cond | (torch.isnan(a1) & torch.isnan(a2)) + return cond.all().item() + + +def array_equal(a1: ArrayLike, a2: ArrayLike, equal_nan=False): + return _tensor_equal(a1, a2, equal_nan=equal_nan) + + +def array_equiv(a1: ArrayLike, a2: ArrayLike): + # *almost* the same as array_equal: _equiv tries to broadcast, _equal does not + try: + a1_t, a2_t = torch.broadcast_tensors(a1, a2) + except RuntimeError: + # failed to broadcast => not equivalent + return False + return _tensor_equal(a1_t, a2_t) + + +def nan_to_num( + x: ArrayLike, copy: NotImplementedType = True, nan=0.0, posinf=None, neginf=None +): + # work around RuntimeError: "nan_to_num" not implemented for 'ComplexDouble' + if x.is_complex(): + re = torch.nan_to_num(x.real, nan=nan, posinf=posinf, neginf=neginf) + im = torch.nan_to_num(x.imag, nan=nan, posinf=posinf, neginf=neginf) + return re + 1j * im + else: + return torch.nan_to_num(x, nan=nan, posinf=posinf, neginf=neginf) + + +# ### put/take_along_axis ### + + +def take( + a: ArrayLike, + indices: ArrayLike, + axis=None, + out: Optional[OutArray] = None, + mode: NotImplementedType = "raise", +): + (a,), axis = _util.axis_none_flatten(a, axis=axis) + axis = _util.normalize_axis_index(axis, a.ndim) + idx = (slice(None),) * axis + (indices, ...) + result = a[idx] + return result + + +def take_along_axis(arr: ArrayLike, indices: ArrayLike, axis): + (arr,), axis = _util.axis_none_flatten(arr, axis=axis) + axis = _util.normalize_axis_index(axis, arr.ndim) + return torch.take_along_dim(arr, indices, axis) + + +def put( + a: NDArray, + indices: ArrayLike, + values: ArrayLike, + mode: NotImplementedType = "raise", +): + v = values.type(a.dtype) + # If indices is larger than v, expand v to at least the size of indices. Any + # unnecessary trailing elements are then trimmed. + if indices.numel() > v.numel(): + ratio = (indices.numel() + v.numel() - 1) // v.numel() + v = v.unsqueeze(0).expand((ratio,) + v.shape) + # Trim unnecessary elements, regardless if v was expanded or not. Note + # np.put() trims v to match indices by default too. 
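+    # Worked example (hypothetical sizes): with len(indices) == 3 and len(v) == 2,
+    # ratio == 2, so v is expanded to shape (2, 2), then flattened and trimmed to
+    # the first 3 elements -- i.e. values repeat cyclically, as with np.put.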
+ if indices.numel() < v.numel(): + v = v.flatten() + v = v[: indices.numel()] + a.put_(indices, v) + return None + + +def put_along_axis(arr: ArrayLike, indices: ArrayLike, values: ArrayLike, axis): + (arr,), axis = _util.axis_none_flatten(arr, axis=axis) + axis = _util.normalize_axis_index(axis, arr.ndim) + + indices, values = torch.broadcast_tensors(indices, values) + values = _util.cast_if_needed(values, arr.dtype) + result = torch.scatter(arr, axis, indices, values) + arr.copy_(result.reshape(arr.shape)) + return None + + +def choose( + a: ArrayLike, + choices: Sequence[ArrayLike], + out: Optional[OutArray] = None, + mode: NotImplementedType = "raise", +): + # First, broadcast elements of `choices` + choices = torch.stack(torch.broadcast_tensors(*choices)) + + # Use an analog of `gather(choices, 0, a)` which broadcasts `choices` vs `a`: + # (taken from https://github.com/pytorch/pytorch/issues/9407#issuecomment-1427907939) + idx_list = [ + torch.arange(dim).view((1,) * i + (dim,) + (1,) * (choices.ndim - i - 1)) + for i, dim in enumerate(choices.shape) + ] + + idx_list[0] = a + return choices[idx_list].squeeze(0) + + +# ### unique et al ### + + +def unique( + ar: ArrayLike, + return_index: NotImplementedType = False, + return_inverse=False, + return_counts=False, + axis=None, + *, + equal_nan: NotImplementedType = True, +): + (ar,), axis = _util.axis_none_flatten(ar, axis=axis) + axis = _util.normalize_axis_index(axis, ar.ndim) + + result = torch.unique( + ar, return_inverse=return_inverse, return_counts=return_counts, dim=axis + ) + + return result + + +def nonzero(a: ArrayLike): + return torch.nonzero(a, as_tuple=True) + + +def argwhere(a: ArrayLike): + return torch.argwhere(a) + + +def flatnonzero(a: ArrayLike): + return torch.flatten(a).nonzero(as_tuple=True)[0] + + +def clip( + a: ArrayLike, + min: Optional[ArrayLike] = None, + max: Optional[ArrayLike] = None, + out: Optional[OutArray] = None, +): + return torch.clamp(a, min, max) + + +def repeat(a: ArrayLike, repeats: ArrayLikeOrScalar, axis=None): + return torch.repeat_interleave(a, repeats, axis) + + +def tile(A: ArrayLike, reps): + if isinstance(reps, int): + reps = (reps,) + return torch.tile(A, reps) + + +def resize(a: ArrayLike, new_shape=None): + # implementation vendored from + # https://github.com/numpy/numpy/blob/v1.24.0/numpy/core/fromnumeric.py#L1420-L1497 + if new_shape is None: + return a + + if isinstance(new_shape, int): + new_shape = (new_shape,) + + a = a.flatten() + + new_size = 1 + for dim_length in new_shape: + new_size *= dim_length + if dim_length < 0: + raise ValueError("all elements of `new_shape` must be non-negative") + + if a.numel() == 0 or new_size == 0: + # First case must zero fill. The second would have repeats == 0. 
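+        # e.g. resize(empty_array, (2, 3)) gives a (2, 3) block of zeros, matching
+        # the np.resize source this is vendored from.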
+ return torch.zeros(new_shape, dtype=a.dtype) + + repeats = -(-new_size // a.numel()) # ceil division + a = concatenate((a,) * repeats)[:new_size] + + return reshape(a, new_shape) + + +# ### diag et al ### + + +def diagonal(a: ArrayLike, offset=0, axis1=0, axis2=1): + axis1 = _util.normalize_axis_index(axis1, a.ndim) + axis2 = _util.normalize_axis_index(axis2, a.ndim) + return torch.diagonal(a, offset, axis1, axis2) + + +def trace( + a: ArrayLike, + offset=0, + axis1=0, + axis2=1, + dtype: Optional[DTypeLike] = None, + out: Optional[OutArray] = None, +): + result = torch.diagonal(a, offset, dim1=axis1, dim2=axis2).sum(-1, dtype=dtype) + return result + + +def eye( + N, + M=None, + k=0, + dtype: Optional[DTypeLike] = None, + order: NotImplementedType = "C", + *, + like: NotImplementedType = None, +): + if dtype is None: + dtype = _dtypes_impl.default_dtypes().float_dtype + if M is None: + M = N + z = torch.zeros(N, M, dtype=dtype) + z.diagonal(k).fill_(1) + return z + + +def identity(n, dtype: Optional[DTypeLike] = None, *, like: NotImplementedType = None): + return torch.eye(n, dtype=dtype) + + +def diag(v: ArrayLike, k=0): + return torch.diag(v, k) + + +def diagflat(v: ArrayLike, k=0): + return torch.diagflat(v, k) + + +def diag_indices(n, ndim=2): + idx = torch.arange(n) + return (idx,) * ndim + + +def diag_indices_from(arr: ArrayLike): + if not arr.ndim >= 2: + raise ValueError("input array must be at least 2-d") + # For more than d=2, the strided formula is only valid for arrays with + # all dimensions equal, so we check first. + s = arr.shape + if s[1:] != s[:-1]: + raise ValueError("All dimensions of input must be of equal length") + return diag_indices(s[0], arr.ndim) + + +def fill_diagonal(a: ArrayLike, val: ArrayLike, wrap=False): + if a.ndim < 2: + raise ValueError("array must be at least 2-d") + if val.numel() == 0 and not wrap: + a.fill_diagonal_(val) + return a + + if val.ndim == 0: + val = val.unsqueeze(0) + + # torch.Tensor.fill_diagonal_ only accepts scalars + # If the size of val is too large, then val is trimmed + if a.ndim == 2: + tall = a.shape[0] > a.shape[1] + # wrap does nothing for wide matrices... + if not wrap or not tall: + # Never wraps + diag = a.diagonal() + diag.copy_(val[: diag.numel()]) + else: + # wraps and tall... leaving one empty line between diagonals?! + max_, min_ = a.shape + idx = torch.arange(max_ - max_ // (min_ + 1)) + mod = idx % min_ + div = idx // min_ + a[(div * (min_ + 1) + mod, mod)] = val[: idx.numel()] + else: + idx = diag_indices_from(a) + # a.shape = (n, n, ..., n) + a[idx] = val[: a.shape[0]] + + return a + + +def vdot(a: ArrayLike, b: ArrayLike, /): + # 1. torch only accepts 1D arrays, numpy flattens + # 2. torch requires matching dtype, while numpy casts (?) 
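+    # e.g. np.vdot of two (2, 2) operands reduces to a length-4 dot product, so
+    # both tensors are flattened below before calling torch.vdot.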
+ t_a, t_b = torch.atleast_1d(a, b) + if t_a.ndim > 1: + t_a = t_a.flatten() + if t_b.ndim > 1: + t_b = t_b.flatten() + + dtype = _dtypes_impl.result_type_impl(t_a, t_b) + is_half = dtype == torch.float16 and (t_a.is_cpu or t_b.is_cpu) + is_bool = dtype == torch.bool + + # work around torch's "dot" not implemented for 'Half', 'Bool' + if is_half: + dtype = torch.float32 + elif is_bool: + dtype = torch.uint8 + + t_a = _util.cast_if_needed(t_a, dtype) + t_b = _util.cast_if_needed(t_b, dtype) + + result = torch.vdot(t_a, t_b) + + if is_half: + result = result.to(torch.float16) + elif is_bool: + result = result.to(torch.bool) + + return result + + +def tensordot(a: ArrayLike, b: ArrayLike, axes=2): + if isinstance(axes, (list, tuple)): + axes = [[ax] if isinstance(ax, int) else ax for ax in axes] + + target_dtype = _dtypes_impl.result_type_impl(a, b) + a = _util.cast_if_needed(a, target_dtype) + b = _util.cast_if_needed(b, target_dtype) + + return torch.tensordot(a, b, dims=axes) + + +def dot(a: ArrayLike, b: ArrayLike, out: Optional[OutArray] = None): + dtype = _dtypes_impl.result_type_impl(a, b) + is_bool = dtype == torch.bool + if is_bool: + dtype = torch.uint8 + + a = _util.cast_if_needed(a, dtype) + b = _util.cast_if_needed(b, dtype) + + if a.ndim == 0 or b.ndim == 0: + result = a * b + else: + result = torch.matmul(a, b) + + if is_bool: + result = result.to(torch.bool) + + return result + + +def inner(a: ArrayLike, b: ArrayLike, /): + dtype = _dtypes_impl.result_type_impl(a, b) + is_half = dtype == torch.float16 and (a.is_cpu or b.is_cpu) + is_bool = dtype == torch.bool + + if is_half: + # work around torch's "addmm_impl_cpu_" not implemented for 'Half'" + dtype = torch.float32 + elif is_bool: + dtype = torch.uint8 + + a = _util.cast_if_needed(a, dtype) + b = _util.cast_if_needed(b, dtype) + + result = torch.inner(a, b) + + if is_half: + result = result.to(torch.float16) + elif is_bool: + result = result.to(torch.bool) + return result + + +def outer(a: ArrayLike, b: ArrayLike, out: Optional[OutArray] = None): + return torch.outer(a, b) + + +def cross(a: ArrayLike, b: ArrayLike, axisa=-1, axisb=-1, axisc=-1, axis=None): + # implementation vendored from + # https://github.com/numpy/numpy/blob/v1.24.0/numpy/core/numeric.py#L1486-L1685 + if axis is not None: + axisa, axisb, axisc = (axis,) * 3 + + # Check axisa and axisb are within bounds + axisa = _util.normalize_axis_index(axisa, a.ndim) + axisb = _util.normalize_axis_index(axisb, b.ndim) + + # Move working axis to the end of the shape + a = torch.moveaxis(a, axisa, -1) + b = torch.moveaxis(b, axisb, -1) + msg = "incompatible dimensions for cross product\n(dimension must be 2 or 3)" + if a.shape[-1] not in (2, 3) or b.shape[-1] not in (2, 3): + raise ValueError(msg) + + # Create the output array + shape = broadcast_shapes(a[..., 0].shape, b[..., 0].shape) + if a.shape[-1] == 3 or b.shape[-1] == 3: + shape += (3,) + # Check axisc is within bounds + axisc = _util.normalize_axis_index(axisc, len(shape)) + dtype = _dtypes_impl.result_type_impl(a, b) + cp = torch.empty(shape, dtype=dtype) + + # recast arrays as dtype + a = _util.cast_if_needed(a, dtype) + b = _util.cast_if_needed(b, dtype) + + # create local aliases for readability + a0 = a[..., 0] + a1 = a[..., 1] + if a.shape[-1] == 3: + a2 = a[..., 2] + b0 = b[..., 0] + b1 = b[..., 1] + if b.shape[-1] == 3: + b2 = b[..., 2] + if cp.ndim != 0 and cp.shape[-1] == 3: + cp0 = cp[..., 0] + cp1 = cp[..., 1] + cp2 = cp[..., 2] + + if a.shape[-1] == 2: + if b.shape[-1] == 2: + # a0 * b1 - a1 * b0 + 
cp[...] = a0 * b1 - a1 * b0 + return cp + else: + assert b.shape[-1] == 3 + # cp0 = a1 * b2 - 0 (a2 = 0) + # cp1 = 0 - a0 * b2 (a2 = 0) + # cp2 = a0 * b1 - a1 * b0 + cp0[...] = a1 * b2 + cp1[...] = -a0 * b2 + cp2[...] = a0 * b1 - a1 * b0 + else: + assert a.shape[-1] == 3 + if b.shape[-1] == 3: + cp0[...] = a1 * b2 - a2 * b1 + cp1[...] = a2 * b0 - a0 * b2 + cp2[...] = a0 * b1 - a1 * b0 + else: + assert b.shape[-1] == 2 + cp0[...] = -a2 * b1 + cp1[...] = a2 * b0 + cp2[...] = a0 * b1 - a1 * b0 + + return torch.moveaxis(cp, -1, axisc) + + +def einsum(*operands, out=None, dtype=None, order="K", casting="safe", optimize=False): + # Have to manually normalize *operands and **kwargs, following the NumPy signature + # We have a local import to avoid poluting the global space, as it will be then + # exported in funcs.py + from ._ndarray import ndarray + from ._normalizations import ( + maybe_copy_to, + normalize_array_like, + normalize_casting, + normalize_dtype, + wrap_tensors, + ) + + dtype = normalize_dtype(dtype) + casting = normalize_casting(casting) + if out is not None and not isinstance(out, ndarray): + raise TypeError("'out' must be an array") + if order != "K": + raise NotImplementedError("'order' parameter is not supported.") + + # parse arrays and normalize them + sublist_format = not isinstance(operands[0], str) + if sublist_format: + # op, str, op, str ... [sublistout] format: normalize every other argument + + # - if sublistout is not given, the length of operands is even, and we pick + # odd-numbered elements, which are arrays. + # - if sublistout is given, the length of operands is odd, we peel off + # the last one, and pick odd-numbered elements, which are arrays. + # Without [:-1], we would have picked sublistout, too. + array_operands = operands[:-1][::2] + else: + # ("ij->", arrays) format + subscripts, array_operands = operands[0], operands[1:] + + tensors = [normalize_array_like(op) for op in array_operands] + target_dtype = _dtypes_impl.result_type_impl(*tensors) if dtype is None else dtype + + # work around 'bmm' not implemented for 'Half' etc + is_half = target_dtype == torch.float16 and all(t.is_cpu for t in tensors) + if is_half: + target_dtype = torch.float32 + + is_short_int = target_dtype in [torch.uint8, torch.int8, torch.int16, torch.int32] + if is_short_int: + target_dtype = torch.int64 + + tensors = _util.typecast_tensors(tensors, target_dtype, casting) + + from torch.backends import opt_einsum + + try: + # set the global state to handle the optimize=... 
argument, restore on exit + if opt_einsum.is_available(): + old_strategy = torch.backends.opt_einsum.strategy + old_enabled = torch.backends.opt_einsum.enabled + + # torch.einsum calls opt_einsum.contract_path, which runs into + # https://github.com/dgasmith/opt_einsum/issues/219 + # for strategy={True, False} + if optimize is True: + optimize = "auto" + elif optimize is False: + torch.backends.opt_einsum.enabled = False + + torch.backends.opt_einsum.strategy = optimize + + if sublist_format: + # recombine operands + sublists = operands[1::2] + has_sublistout = len(operands) % 2 == 1 + if has_sublistout: + sublistout = operands[-1] + operands = list(itertools.chain.from_iterable(zip(tensors, sublists))) + if has_sublistout: + operands.append(sublistout) + + result = torch.einsum(*operands) + else: + result = torch.einsum(subscripts, *tensors) + + finally: + if opt_einsum.is_available(): + torch.backends.opt_einsum.strategy = old_strategy + torch.backends.opt_einsum.enabled = old_enabled + + result = maybe_copy_to(out, result) + return wrap_tensors(result) + + +# ### sort and partition ### + + +def _sort_helper(tensor, axis, kind, order): + if tensor.dtype.is_complex: + raise NotImplementedError(f"sorting {tensor.dtype} is not supported") + (tensor,), axis = _util.axis_none_flatten(tensor, axis=axis) + axis = _util.normalize_axis_index(axis, tensor.ndim) + + stable = kind == "stable" + + return tensor, axis, stable + + +def sort(a: ArrayLike, axis=-1, kind=None, order: NotImplementedType = None): + # `order` keyword arg is only relevant for structured dtypes; so not supported here. + a, axis, stable = _sort_helper(a, axis, kind, order) + result = torch.sort(a, dim=axis, stable=stable) + return result.values + + +def argsort(a: ArrayLike, axis=-1, kind=None, order: NotImplementedType = None): + a, axis, stable = _sort_helper(a, axis, kind, order) + return torch.argsort(a, dim=axis, stable=stable) + + +def searchsorted( + a: ArrayLike, v: ArrayLike, side="left", sorter: Optional[ArrayLike] = None +): + if a.dtype.is_complex: + raise NotImplementedError(f"searchsorted with dtype={a.dtype}") + + return torch.searchsorted(a, v, side=side, sorter=sorter) + + +# ### swap/move/roll axis ### + + +def moveaxis(a: ArrayLike, source, destination): + source = _util.normalize_axis_tuple(source, a.ndim, "source") + destination = _util.normalize_axis_tuple(destination, a.ndim, "destination") + return torch.moveaxis(a, source, destination) + + +def swapaxes(a: ArrayLike, axis1, axis2): + axis1 = _util.normalize_axis_index(axis1, a.ndim) + axis2 = _util.normalize_axis_index(axis2, a.ndim) + return torch.swapaxes(a, axis1, axis2) + + +def rollaxis(a: ArrayLike, axis, start=0): + # Straight vendor from: + # https://github.com/numpy/numpy/blob/v1.24.0/numpy/core/numeric.py#L1259 + # + # Also note this function in NumPy is mostly retained for backwards compat + # (https://stackoverflow.com/questions/29891583/reason-why-numpy-rollaxis-is-so-confusing) + # so let's not touch it unless hard pressed. + n = a.ndim + axis = _util.normalize_axis_index(axis, n) + if start < 0: + start += n + msg = "'%s' arg requires %d <= %s < %d, but %d was passed in" + if not (0 <= start < n + 1): + raise _util.AxisError(msg % ("start", -n, "start", n + 1, start)) + if axis < start: + # it's been removed + start -= 1 + if axis == start: + # numpy returns a view, here we try returning the tensor itself + # return tensor[...] 
+ return a + axes = list(range(0, n)) + axes.remove(axis) + axes.insert(start, axis) + return a.view(axes) + + +def roll(a: ArrayLike, shift, axis=None): + if axis is not None: + axis = _util.normalize_axis_tuple(axis, a.ndim, allow_duplicate=True) + if not isinstance(shift, tuple): + shift = (shift,) * len(axis) + return torch.roll(a, shift, axis) + + +# ### shape manipulations ### + + +def squeeze(a: ArrayLike, axis=None): + if axis == (): + result = a + elif axis is None: + result = a.squeeze() + else: + if isinstance(axis, tuple): + result = a + for ax in axis: + result = a.squeeze(ax) + else: + result = a.squeeze(axis) + return result + + +def reshape(a: ArrayLike, newshape, order: NotImplementedType = "C"): + # if sh = (1, 2, 3), numpy allows both .reshape(sh) and .reshape(*sh) + newshape = newshape[0] if len(newshape) == 1 else newshape + return a.reshape(newshape) + + +# NB: cannot use torch.reshape(a, newshape) above, because of +# (Pdb) torch.reshape(torch.as_tensor([1]), 1) +# *** TypeError: reshape(): argument 'shape' (position 2) must be tuple of SymInts, not int + + +def transpose(a: ArrayLike, axes=None): + # numpy allows both .transpose(sh) and .transpose(*sh) + # also older code uses axes being a list + if axes in [(), None, (None,)]: + axes = tuple(reversed(range(a.ndim))) + elif len(axes) == 1: + axes = axes[0] + return a.permute(axes) + + +def ravel(a: ArrayLike, order: NotImplementedType = "C"): + return torch.flatten(a) + + +def diff( + a: ArrayLike, + n=1, + axis=-1, + prepend: Optional[ArrayLike] = None, + append: Optional[ArrayLike] = None, +): + axis = _util.normalize_axis_index(axis, a.ndim) + + if n < 0: + raise ValueError(f"order must be non-negative but got {n}") + + if n == 0: + # match numpy and return the input immediately + return a + + if prepend is not None: + shape = list(a.shape) + shape[axis] = prepend.shape[axis] if prepend.ndim > 0 else 1 + prepend = torch.broadcast_to(prepend, shape) + + if append is not None: + shape = list(a.shape) + shape[axis] = append.shape[axis] if append.ndim > 0 else 1 + append = torch.broadcast_to(append, shape) + + return torch.diff(a, n, axis=axis, prepend=prepend, append=append) + + +# ### math functions ### + + +def angle(z: ArrayLike, deg=False): + result = torch.angle(z) + if deg: + result = result * (180 / torch.pi) + return result + + +def sinc(x: ArrayLike): + return torch.sinc(x) + + +# NB: have to normalize *varargs manually +def gradient(f: ArrayLike, *varargs, axis=None, edge_order=1): + N = f.ndim # number of dimensions + + varargs = _util.ndarrays_to_tensors(varargs) + + if axis is None: + axes = tuple(range(N)) + else: + axes = _util.normalize_axis_tuple(axis, N) + + len_axes = len(axes) + n = len(varargs) + if n == 0: + # no spacing argument - use 1 in all axes + dx = [1.0] * len_axes + elif n == 1 and (_dtypes_impl.is_scalar(varargs[0]) or varargs[0].ndim == 0): + # single scalar or 0D tensor for all axes (np.ndim(varargs[0]) == 0) + dx = varargs * len_axes + elif n == len_axes: + # scalar or 1d array for each axis + dx = list(varargs) + for i, distances in enumerate(dx): + distances = torch.as_tensor(distances) + if distances.ndim == 0: + continue + elif distances.ndim != 1: + raise ValueError("distances must be either scalars or 1d") + if len(distances) != f.shape[axes[i]]: + raise ValueError( + "when 1d, distances must match " + "the length of the corresponding dimension" + ) + if not (distances.dtype.is_floating_point or distances.dtype.is_complex): + distances = distances.double() + + diffx = 
torch.diff(distances) + # if distances are constant reduce to the scalar case + # since it brings a consistent speedup + if (diffx == diffx[0]).all(): + diffx = diffx[0] + dx[i] = diffx + else: + raise TypeError("invalid number of arguments") + + if edge_order > 2: + raise ValueError("'edge_order' greater than 2 not supported") + + # use central differences on interior and one-sided differences on the + # endpoints. This preserves second order-accuracy over the full domain. + + outvals = [] + + # create slice objects --- initially all are [:, :, ..., :] + slice1 = [slice(None)] * N + slice2 = [slice(None)] * N + slice3 = [slice(None)] * N + slice4 = [slice(None)] * N + + otype = f.dtype + if _dtypes_impl.python_type_for_torch(otype) in (int, bool): + # Convert to floating point. + # First check if f is a numpy integer type; if so, convert f to float64 + # to avoid modular arithmetic when computing the changes in f. + f = f.double() + otype = torch.float64 + + for axis, ax_dx in zip(axes, dx): + if f.shape[axis] < edge_order + 1: + raise ValueError( + "Shape of array too small to calculate a numerical gradient, " + "at least (edge_order + 1) elements are required." + ) + # result allocation + out = torch.empty_like(f, dtype=otype) + + # spacing for the current axis (NB: np.ndim(ax_dx) == 0) + uniform_spacing = _dtypes_impl.is_scalar(ax_dx) or ax_dx.ndim == 0 + + # Numerical differentiation: 2nd order interior + slice1[axis] = slice(1, -1) + slice2[axis] = slice(None, -2) + slice3[axis] = slice(1, -1) + slice4[axis] = slice(2, None) + + if uniform_spacing: + out[tuple(slice1)] = (f[tuple(slice4)] - f[tuple(slice2)]) / (2.0 * ax_dx) + else: + dx1 = ax_dx[0:-1] + dx2 = ax_dx[1:] + a = -(dx2) / (dx1 * (dx1 + dx2)) + b = (dx2 - dx1) / (dx1 * dx2) + c = dx1 / (dx2 * (dx1 + dx2)) + # fix the shape for broadcasting + shape = [1] * N + shape[axis] = -1 + a = a.reshape(shape) + b = b.reshape(shape) + c = c.reshape(shape) + # 1D equivalent -- out[1:-1] = a * f[:-2] + b * f[1:-1] + c * f[2:] + out[tuple(slice1)] = ( + a * f[tuple(slice2)] + b * f[tuple(slice3)] + c * f[tuple(slice4)] + ) + + # Numerical differentiation: 1st order edges + if edge_order == 1: + slice1[axis] = 0 + slice2[axis] = 1 + slice3[axis] = 0 + dx_0 = ax_dx if uniform_spacing else ax_dx[0] + # 1D equivalent -- out[0] = (f[1] - f[0]) / (x[1] - x[0]) + out[tuple(slice1)] = (f[tuple(slice2)] - f[tuple(slice3)]) / dx_0 + + slice1[axis] = -1 + slice2[axis] = -1 + slice3[axis] = -2 + dx_n = ax_dx if uniform_spacing else ax_dx[-1] + # 1D equivalent -- out[-1] = (f[-1] - f[-2]) / (x[-1] - x[-2]) + out[tuple(slice1)] = (f[tuple(slice2)] - f[tuple(slice3)]) / dx_n + + # Numerical differentiation: 2nd order edges + else: + slice1[axis] = 0 + slice2[axis] = 0 + slice3[axis] = 1 + slice4[axis] = 2 + if uniform_spacing: + a = -1.5 / ax_dx + b = 2.0 / ax_dx + c = -0.5 / ax_dx + else: + dx1 = ax_dx[0] + dx2 = ax_dx[1] + a = -(2.0 * dx1 + dx2) / (dx1 * (dx1 + dx2)) + b = (dx1 + dx2) / (dx1 * dx2) + c = -dx1 / (dx2 * (dx1 + dx2)) + # 1D equivalent -- out[0] = a * f[0] + b * f[1] + c * f[2] + out[tuple(slice1)] = ( + a * f[tuple(slice2)] + b * f[tuple(slice3)] + c * f[tuple(slice4)] + ) + + slice1[axis] = -1 + slice2[axis] = -3 + slice3[axis] = -2 + slice4[axis] = -1 + if uniform_spacing: + a = 0.5 / ax_dx + b = -2.0 / ax_dx + c = 1.5 / ax_dx + else: + dx1 = ax_dx[-2] + dx2 = ax_dx[-1] + a = (dx2) / (dx1 * (dx1 + dx2)) + b = -(dx2 + dx1) / (dx1 * dx2) + c = (2.0 * dx2 + dx1) / (dx2 * (dx1 + dx2)) + # 1D equivalent -- out[-1] = a * f[-3] + b * f[-2] + 
c * f[-1] + out[tuple(slice1)] = ( + a * f[tuple(slice2)] + b * f[tuple(slice3)] + c * f[tuple(slice4)] + ) + + outvals.append(out) + + # reset the slice object in this dimension to ":" + slice1[axis] = slice(None) + slice2[axis] = slice(None) + slice3[axis] = slice(None) + slice4[axis] = slice(None) + + if len_axes == 1: + return outvals[0] + else: + return outvals + + +# ### Type/shape etc queries ### + + +def round(a: ArrayLike, decimals=0, out: Optional[OutArray] = None): + if a.is_floating_point(): + result = torch.round(a, decimals=decimals) + elif a.is_complex(): + # RuntimeError: "round_cpu" not implemented for 'ComplexFloat' + result = torch.complex( + torch.round(a.real, decimals=decimals), + torch.round(a.imag, decimals=decimals), + ) + else: + # RuntimeError: "round_cpu" not implemented for 'int' + result = a + return result + + +around = round +round_ = round + + +def real_if_close(a: ArrayLike, tol=100): + if not torch.is_complex(a): + return a + if tol > 1: + # Undocumented in numpy: if tol < 1, it's an absolute tolerance! + # Otherwise, tol > 1 is relative tolerance, in units of the dtype epsilon + # https://github.com/numpy/numpy/blob/v1.24.0/numpy/lib/type_check.py#L577 + tol = tol * torch.finfo(a.dtype).eps + + mask = torch.abs(a.imag) < tol + return a.real if mask.all() else a + + +def real(a: ArrayLike): + return torch.real(a) + + +def imag(a: ArrayLike): + if a.is_complex(): + return a.imag + return torch.zeros_like(a) + + +def iscomplex(x: ArrayLike): + if torch.is_complex(x): + return x.imag != 0 + return torch.zeros_like(x, dtype=torch.bool) + + +def isreal(x: ArrayLike): + if torch.is_complex(x): + return x.imag == 0 + return torch.ones_like(x, dtype=torch.bool) + + +def iscomplexobj(x: ArrayLike): + return torch.is_complex(x) + + +def isrealobj(x: ArrayLike): + return not torch.is_complex(x) + + +def isneginf(x: ArrayLike, out: Optional[OutArray] = None): + return torch.isneginf(x) + + +def isposinf(x: ArrayLike, out: Optional[OutArray] = None): + return torch.isposinf(x) + + +def i0(x: ArrayLike): + return torch.special.i0(x) + + +def isscalar(a): + # We need to use normalize_array_like, but we don't want to export it in funcs.py + from ._normalizations import normalize_array_like + + try: + t = normalize_array_like(a) + return t.numel() == 1 + except Exception: + return False + + +# ### Filter windows ### + + +def hamming(M): + dtype = _dtypes_impl.default_dtypes().float_dtype + return torch.hamming_window(M, periodic=False, dtype=dtype) + + +def hanning(M): + dtype = _dtypes_impl.default_dtypes().float_dtype + return torch.hann_window(M, periodic=False, dtype=dtype) + + +def kaiser(M, beta): + dtype = _dtypes_impl.default_dtypes().float_dtype + return torch.kaiser_window(M, beta=beta, periodic=False, dtype=dtype) + + +def blackman(M): + dtype = _dtypes_impl.default_dtypes().float_dtype + return torch.blackman_window(M, periodic=False, dtype=dtype) + + +def bartlett(M): + dtype = _dtypes_impl.default_dtypes().float_dtype + return torch.bartlett_window(M, periodic=False, dtype=dtype) + + +# ### Dtype routines ### + +# vendored from https://github.com/numpy/numpy/blob/v1.24.0/numpy/lib/type_check.py#L666 + + +array_type = [ + [torch.float16, torch.float32, torch.float64], + [None, torch.complex64, torch.complex128], +] +array_precision = { + torch.float16: 0, + torch.float32: 1, + torch.float64: 2, + torch.complex64: 1, + torch.complex128: 2, +} + + +def common_type(*tensors: ArrayLike): + is_complex = False + precision = 0 + for a in tensors: + t = a.dtype + if 
iscomplexobj(a): + is_complex = True + if not (t.is_floating_point or t.is_complex): + p = 2 # array_precision[_nx.double] + else: + p = array_precision.get(t, None) + if p is None: + raise TypeError("can't get common type for non-numeric array") + precision = builtins.max(precision, p) + if is_complex: + return array_type[1][precision] + else: + return array_type[0][precision] + + +# ### histograms ### + + +def histogram( + a: ArrayLike, + bins: ArrayLike = 10, + range=None, + normed=None, + weights: Optional[ArrayLike] = None, + density=None, +): + if normed is not None: + raise ValueError("normed argument is deprecated, use density= instead") + + if weights is not None and weights.dtype.is_complex: + raise NotImplementedError("complex weights histogram.") + + is_a_int = not (a.dtype.is_floating_point or a.dtype.is_complex) + is_w_int = weights is None or not weights.dtype.is_floating_point + if is_a_int: + a = a.double() + + if weights is not None: + weights = _util.cast_if_needed(weights, a.dtype) + + if isinstance(bins, torch.Tensor): + if bins.ndim == 0: + # bins was a single int + bins = operator.index(bins) + else: + bins = _util.cast_if_needed(bins, a.dtype) + + if range is None: + h, b = torch.histogram(a, bins, weight=weights, density=bool(density)) + else: + h, b = torch.histogram( + a, bins, range=range, weight=weights, density=bool(density) + ) + + if not density and is_w_int: + h = h.long() + if is_a_int: + b = b.long() + + return h, b + + +def histogram2d( + x, + y, + bins=10, + range: Optional[ArrayLike] = None, + normed=None, + weights: Optional[ArrayLike] = None, + density=None, +): + # vendored from https://github.com/numpy/numpy/blob/v1.24.0/numpy/lib/twodim_base.py#L655-L821 + if len(x) != len(y): + raise ValueError("x and y must have the same length.") + + try: + N = len(bins) + except TypeError: + N = 1 + + if N != 1 and N != 2: + bins = [bins, bins] + + h, e = histogramdd((x, y), bins, range, normed, weights, density) + + return h, e[0], e[1] + + +def histogramdd( + sample, + bins=10, + range: Optional[ArrayLike] = None, + normed=None, + weights: Optional[ArrayLike] = None, + density=None, +): + # have to normalize manually because `sample` interpretation differs + # for a list of lists and a 2D array + if normed is not None: + raise ValueError("normed argument is deprecated, use density= instead") + + from ._normalizations import normalize_array_like, normalize_seq_array_like + + if isinstance(sample, (list, tuple)): + sample = normalize_array_like(sample).T + else: + sample = normalize_array_like(sample) + + sample = torch.atleast_2d(sample) + + if not (sample.dtype.is_floating_point or sample.dtype.is_complex): + sample = sample.double() + + # bins is either an int, or a sequence of ints or a sequence of arrays + bins_is_array = not ( + isinstance(bins, int) or builtins.all(isinstance(b, int) for b in bins) + ) + if bins_is_array: + bins = normalize_seq_array_like(bins) + bins_dtypes = [b.dtype for b in bins] + bins = [_util.cast_if_needed(b, sample.dtype) for b in bins] + + if range is not None: + range = range.flatten().tolist() + + if weights is not None: + # range=... 
is required : interleave min and max values per dimension + mm = sample.aminmax(dim=0) + range = torch.cat(mm).reshape(2, -1).T.flatten() + range = tuple(range.tolist()) + weights = _util.cast_if_needed(weights, sample.dtype) + w_kwd = {"weight": weights} + else: + w_kwd = {} + + h, b = torch.histogramdd(sample, bins, range, density=bool(density), **w_kwd) + + if bins_is_array: + b = [_util.cast_if_needed(bb, dtyp) for bb, dtyp in zip(b, bins_dtypes)] + + return h, b + + +# ### odds and ends + + +def min_scalar_type(a: ArrayLike, /): + # https://github.com/numpy/numpy/blob/maintenance/1.24.x/numpy/core/src/multiarray/convert_datatype.c#L1288 + + from ._dtypes import DType + + if a.numel() > 1: + # numpy docs: "For non-scalar array a, returns the vector’s dtype unmodified." + return DType(a.dtype) + + if a.dtype == torch.bool: + dtype = torch.bool + + elif a.dtype.is_complex: + fi = torch.finfo(torch.float32) + fits_in_single = a.dtype == torch.complex64 or ( + fi.min <= a.real <= fi.max and fi.min <= a.imag <= fi.max + ) + dtype = torch.complex64 if fits_in_single else torch.complex128 + + elif a.dtype.is_floating_point: + for dt in [torch.float16, torch.float32, torch.float64]: + fi = torch.finfo(dt) + if fi.min <= a <= fi.max: + dtype = dt + break + else: + # must be integer + for dt in [torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64]: + # Prefer unsigned int where possible, as numpy does. + ii = torch.iinfo(dt) + if ii.min <= a <= ii.max: + dtype = dt + break + + return DType(dtype) + + +def pad(array: ArrayLike, pad_width: ArrayLike, mode="constant", **kwargs): + if mode != "constant": + raise NotImplementedError + value = kwargs.get("constant_values", 0) + # `value` must be a python scalar for torch.nn.functional.pad + typ = _dtypes_impl.python_type_for_torch(array.dtype) + value = typ(value) + + pad_width = torch.broadcast_to(pad_width, (array.ndim, 2)) + pad_width = torch.flip(pad_width, (0,)).flatten() + + return torch.nn.functional.pad(array, tuple(pad_width), value=value) diff --git a/MLPY/Lib/site-packages/torch/_numpy/_getlimits.py b/MLPY/Lib/site-packages/torch/_numpy/_getlimits.py new file mode 100644 index 0000000000000000000000000000000000000000..75036ce6ab4b0b417be7ea0a308ec19018304fd4 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_numpy/_getlimits.py @@ -0,0 +1,15 @@ +# mypy: ignore-errors + +import torch + +from . import _dtypes + + +def finfo(dtyp): + torch_dtype = _dtypes.dtype(dtyp).torch_dtype + return torch.finfo(torch_dtype) + + +def iinfo(dtyp): + torch_dtype = _dtypes.dtype(dtyp).torch_dtype + return torch.iinfo(torch_dtype) diff --git a/MLPY/Lib/site-packages/torch/_numpy/_ndarray.py b/MLPY/Lib/site-packages/torch/_numpy/_ndarray.py new file mode 100644 index 0000000000000000000000000000000000000000..0e81a9cec8578a1366f1e0c93e5f0aec04e69dad --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_numpy/_ndarray.py @@ -0,0 +1,591 @@ +# mypy: ignore-errors + +from __future__ import annotations + +import builtins +import math +import operator +from typing import Sequence + +import torch + +from . 
import _dtypes, _dtypes_impl, _funcs, _ufuncs, _util +from ._normalizations import ( + ArrayLike, + normalize_array_like, + normalizer, + NotImplementedType, +) + +newaxis = None + +FLAGS = [ + "C_CONTIGUOUS", + "F_CONTIGUOUS", + "OWNDATA", + "WRITEABLE", + "ALIGNED", + "WRITEBACKIFCOPY", + "FNC", + "FORC", + "BEHAVED", + "CARRAY", + "FARRAY", +] + +SHORTHAND_TO_FLAGS = { + "C": "C_CONTIGUOUS", + "F": "F_CONTIGUOUS", + "O": "OWNDATA", + "W": "WRITEABLE", + "A": "ALIGNED", + "X": "WRITEBACKIFCOPY", + "B": "BEHAVED", + "CA": "CARRAY", + "FA": "FARRAY", +} + + +class Flags: + def __init__(self, flag_to_value: dict): + assert all(k in FLAGS for k in flag_to_value.keys()) # sanity check + self._flag_to_value = flag_to_value + + def __getattr__(self, attr: str): + if attr.islower() and attr.upper() in FLAGS: + return self[attr.upper()] + else: + raise AttributeError(f"No flag attribute '{attr}'") + + def __getitem__(self, key): + if key in SHORTHAND_TO_FLAGS.keys(): + key = SHORTHAND_TO_FLAGS[key] + if key in FLAGS: + try: + return self._flag_to_value[key] + except KeyError as e: + raise NotImplementedError(f"{key=}") from e + else: + raise KeyError(f"No flag key '{key}'") + + def __setattr__(self, attr, value): + if attr.islower() and attr.upper() in FLAGS: + self[attr.upper()] = value + else: + super().__setattr__(attr, value) + + def __setitem__(self, key, value): + if key in FLAGS or key in SHORTHAND_TO_FLAGS.keys(): + raise NotImplementedError("Modifying flags is not implemented") + else: + raise KeyError(f"No flag key '{key}'") + + +def create_method(fn, name=None): + name = name or fn.__name__ + + def f(*args, **kwargs): + return fn(*args, **kwargs) + + f.__name__ = name + f.__qualname__ = f"ndarray.{name}" + return f + + +# Map ndarray.name_method -> np.name_func +# If name_func == None, it means that name_method == name_func +methods = { + "clip": None, + "nonzero": None, + "repeat": None, + "round": None, + "squeeze": None, + "swapaxes": None, + "ravel": None, + # linalg + "diagonal": None, + "dot": None, + "trace": None, + # sorting + "argsort": None, + "searchsorted": None, + # reductions + "argmax": None, + "argmin": None, + "any": None, + "all": None, + "max": None, + "min": None, + "ptp": None, + "sum": None, + "prod": None, + "mean": None, + "var": None, + "std": None, + # scans + "cumsum": None, + "cumprod": None, + # advanced indexing + "take": None, + "choose": None, +} + +dunder = { + "abs": "absolute", + "invert": None, + "pos": "positive", + "neg": "negative", + "gt": "greater", + "lt": "less", + "ge": "greater_equal", + "le": "less_equal", +} + +# dunder methods with right-looking and in-place variants +ri_dunder = { + "add": None, + "sub": "subtract", + "mul": "multiply", + "truediv": "divide", + "floordiv": "floor_divide", + "pow": "power", + "mod": "remainder", + "and": "bitwise_and", + "or": "bitwise_or", + "xor": "bitwise_xor", + "lshift": "left_shift", + "rshift": "right_shift", + "matmul": None, +} + + +def _upcast_int_indices(index): + if isinstance(index, torch.Tensor): + if index.dtype in (torch.int8, torch.int16, torch.int32, torch.uint8): + return index.to(torch.int64) + elif isinstance(index, tuple): + return tuple(_upcast_int_indices(i) for i in index) + return index + + +# Used to indicate that a parameter is unspecified (as opposed to explicitly +# `None`) +class _Unspecified: + pass + + +_Unspecified.unspecified = _Unspecified() + +############################################################### +# ndarray class # 
+############################################################### + + +class ndarray: + def __init__(self, t=None): + if t is None: + self.tensor = torch.Tensor() + elif isinstance(t, torch.Tensor): + self.tensor = t + else: + raise ValueError( + "ndarray constructor is not recommended; prefer" + "either array(...) or zeros/empty(...)" + ) + + # Register NumPy functions as methods + for method, name in methods.items(): + fn = getattr(_funcs, name or method) + vars()[method] = create_method(fn, method) + + # Regular methods but coming from ufuncs + conj = create_method(_ufuncs.conjugate, "conj") + conjugate = create_method(_ufuncs.conjugate) + + for method, name in dunder.items(): + fn = getattr(_ufuncs, name or method) + method = f"__{method}__" + vars()[method] = create_method(fn, method) + + for method, name in ri_dunder.items(): + fn = getattr(_ufuncs, name or method) + plain = f"__{method}__" + vars()[plain] = create_method(fn, plain) + rvar = f"__r{method}__" + vars()[rvar] = create_method(lambda self, other, fn=fn: fn(other, self), rvar) + ivar = f"__i{method}__" + vars()[ivar] = create_method( + lambda self, other, fn=fn: fn(self, other, out=self), ivar + ) + + # There's no __idivmod__ + __divmod__ = create_method(_ufuncs.divmod, "__divmod__") + __rdivmod__ = create_method( + lambda self, other: _ufuncs.divmod(other, self), "__rdivmod__" + ) + + # prevent loop variables leaking into the ndarray class namespace + del ivar, rvar, name, plain, fn, method + + @property + def shape(self): + return tuple(self.tensor.shape) + + @property + def size(self): + return self.tensor.numel() + + @property + def ndim(self): + return self.tensor.ndim + + @property + def dtype(self): + return _dtypes.dtype(self.tensor.dtype) + + @property + def strides(self): + elsize = self.tensor.element_size() + return tuple(stride * elsize for stride in self.tensor.stride()) + + @property + def itemsize(self): + return self.tensor.element_size() + + @property + def flags(self): + # Note contiguous in torch is assumed C-style + return Flags( + { + "C_CONTIGUOUS": self.tensor.is_contiguous(), + "F_CONTIGUOUS": self.T.tensor.is_contiguous(), + "OWNDATA": self.tensor._base is None, + "WRITEABLE": True, # pytorch does not have readonly tensors + } + ) + + @property + def data(self): + return self.tensor.data_ptr() + + @property + def nbytes(self): + return self.tensor.storage().nbytes() + + @property + def T(self): + return self.transpose() + + @property + def real(self): + return _funcs.real(self) + + @real.setter + def real(self, value): + self.tensor.real = asarray(value).tensor + + @property + def imag(self): + return _funcs.imag(self) + + @imag.setter + def imag(self, value): + self.tensor.imag = asarray(value).tensor + + # ctors + def astype(self, dtype, order="K", casting="unsafe", subok=True, copy=True): + if order != "K": + raise NotImplementedError(f"astype(..., order={order} is not implemented.") + if casting != "unsafe": + raise NotImplementedError( + f"astype(..., casting={casting} is not implemented." 
+ ) + if not subok: + raise NotImplementedError(f"astype(..., subok={subok} is not implemented.") + if not copy: + raise NotImplementedError(f"astype(..., copy={copy} is not implemented.") + torch_dtype = _dtypes.dtype(dtype).torch_dtype + t = self.tensor.to(torch_dtype) + return ndarray(t) + + @normalizer + def copy(self: ArrayLike, order: NotImplementedType = "C"): + return self.clone() + + @normalizer + def flatten(self: ArrayLike, order: NotImplementedType = "C"): + return torch.flatten(self) + + def resize(self, *new_shape, refcheck=False): + # NB: differs from np.resize: fills with zeros instead of making repeated copies of input. + if refcheck: + raise NotImplementedError( + f"resize(..., refcheck={refcheck} is not implemented." + ) + if new_shape in [(), (None,)]: + return + + # support both x.resize((2, 2)) and x.resize(2, 2) + if len(new_shape) == 1: + new_shape = new_shape[0] + if isinstance(new_shape, int): + new_shape = (new_shape,) + + if builtins.any(x < 0 for x in new_shape): + raise ValueError("all elements of `new_shape` must be non-negative") + + new_numel, old_numel = math.prod(new_shape), self.tensor.numel() + + self.tensor.resize_(new_shape) + + if new_numel >= old_numel: + # zero-fill new elements + assert self.tensor.is_contiguous() + b = self.tensor.flatten() # does not copy + b[old_numel:].zero_() + + def view(self, dtype=_Unspecified.unspecified, type=_Unspecified.unspecified): + if dtype is _Unspecified.unspecified: + dtype = self.dtype + if type is not _Unspecified.unspecified: + raise NotImplementedError(f"view(..., type={type} is not implemented.") + torch_dtype = _dtypes.dtype(dtype).torch_dtype + tview = self.tensor.view(torch_dtype) + return ndarray(tview) + + @normalizer + def fill(self, value: ArrayLike): + # Both Pytorch and NumPy accept 0D arrays/tensors and scalars, and + # error out on D > 0 arrays + self.tensor.fill_(value) + + def tolist(self): + return self.tensor.tolist() + + def __iter__(self): + return (ndarray(x) for x in self.tensor.__iter__()) + + def __str__(self): + return ( + str(self.tensor) + .replace("tensor", "torch.ndarray") + .replace("dtype=torch.", "dtype=") + ) + + __repr__ = create_method(__str__) + + def __eq__(self, other): + try: + return _ufuncs.equal(self, other) + except (RuntimeError, TypeError): + # Failed to convert other to array: definitely not equal. 
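+            # Returning an all-False array of self.shape also makes __ne__ (defined
+            # below as ~(self == other)) evaluate to all-True for such operands.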
+ falsy = torch.full(self.shape, fill_value=False, dtype=bool) + return asarray(falsy) + + def __ne__(self, other): + return ~(self == other) + + def __index__(self): + try: + return operator.index(self.tensor.item()) + except Exception as exc: + raise TypeError( + "only integer scalar arrays can be converted to a scalar index" + ) from exc + + def __bool__(self): + return bool(self.tensor) + + def __int__(self): + return int(self.tensor) + + def __float__(self): + return float(self.tensor) + + def __complex__(self): + return complex(self.tensor) + + def is_integer(self): + try: + v = self.tensor.item() + result = int(v) == v + except Exception: + result = False + return result + + def __len__(self): + return self.tensor.shape[0] + + def __contains__(self, x): + return self.tensor.__contains__(x) + + def transpose(self, *axes): + # np.transpose(arr, axis=None) but arr.transpose(*axes) + return _funcs.transpose(self, axes) + + def reshape(self, *shape, order="C"): + # arr.reshape(shape) and arr.reshape(*shape) + return _funcs.reshape(self, shape, order=order) + + def sort(self, axis=-1, kind=None, order=None): + # ndarray.sort works in-place + _funcs.copyto(self, _funcs.sort(self, axis, kind, order)) + + def item(self, *args): + # Mimic NumPy's implementation with three special cases (no arguments, + # a flat index and a multi-index): + # https://github.com/numpy/numpy/blob/main/numpy/core/src/multiarray/methods.c#L702 + if args == (): + return self.tensor.item() + elif len(args) == 1: + # int argument + return self.ravel()[args[0]] + else: + return self.__getitem__(args) + + def __getitem__(self, index): + tensor = self.tensor + + def neg_step(i, s): + if not (isinstance(s, slice) and s.step is not None and s.step < 0): + return s + + nonlocal tensor + tensor = torch.flip(tensor, (i,)) + + # Account for the fact that a slice includes the start but not the end + assert isinstance(s.start, int) or s.start is None + assert isinstance(s.stop, int) or s.stop is None + start = s.stop + 1 if s.stop else None + stop = s.start + 1 if s.start else None + + return slice(start, stop, -s.step) + + if isinstance(index, Sequence): + index = type(index)(neg_step(i, s) for i, s in enumerate(index)) + else: + index = neg_step(0, index) + index = _util.ndarrays_to_tensors(index) + index = _upcast_int_indices(index) + return ndarray(tensor.__getitem__(index)) + + def __setitem__(self, index, value): + index = _util.ndarrays_to_tensors(index) + index = _upcast_int_indices(index) + + if not _dtypes_impl.is_scalar(value): + value = normalize_array_like(value) + value = _util.cast_if_needed(value, self.tensor.dtype) + + return self.tensor.__setitem__(index, value) + + take = _funcs.take + put = _funcs.put + + def __dlpack__(self, *, stream=None): + return self.tensor.__dlpack__(stream=stream) + + def __dlpack_device__(self): + return self.tensor.__dlpack_device__() + + +def _tolist(obj): + """Recursively convert tensors into lists.""" + a1 = [] + for elem in obj: + if isinstance(elem, (list, tuple)): + elem = _tolist(elem) + if isinstance(elem, ndarray): + a1.append(elem.tensor.tolist()) + else: + a1.append(elem) + return a1 + + +# This is the ideally the only place which talks to ndarray directly. +# The rest goes through asarray (preferred) or array. 
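[Editor's note] For orientation, a minimal usage sketch of the wrapper layer added above. It is illustrative only: it assumes this module is importable as torch._numpy._ndarray (the path implied by the `from ._ndarray import ...` statements later in this diff) and that _dtypes.dtype accepts string names such as "float32"; everything else follows directly from the definitions shown here.

    import torch
    # module path assumed from the `from ._ndarray import ...` references in this diff
    from torch._numpy._ndarray import array, asarray, ndarray, result_type

    a = asarray([[1, 2], [3, 4]])              # wraps a torch.Tensor; no copy when input is already an ndarray
    assert isinstance(a, ndarray)
    assert isinstance(a.tensor, torch.Tensor)  # the underlying Tensor is exposed as .tensor
    print(a.shape, a.ndim)                     # (2, 2) 2 -- .shape is returned as a plain tuple

    b = array(a, dtype="float32")              # array(...) copies by default; "float32" assumed accepted by _dtypes.dtype
    print(b.dtype)                             # a _dtypes.dtype wrapper around torch.float32

    print(result_type(a, b))                   # promotes across inputs via _dtypes_impl.result_type_impl

As the comment above notes, asarray is the preferred no-copy entry point; array(...) is the copying constructor, and constructing ndarray directly is discouraged.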
+ + +def array(obj, dtype=None, *, copy=True, order="K", subok=False, ndmin=0, like=None): + if subok is not False: + raise NotImplementedError("'subok' parameter is not supported.") + if like is not None: + raise NotImplementedError("'like' parameter is not supported.") + if order != "K": + raise NotImplementedError() + + # a happy path + if ( + isinstance(obj, ndarray) + and copy is False + and dtype is None + and ndmin <= obj.ndim + ): + return obj + + if isinstance(obj, (list, tuple)): + # FIXME and they have the same dtype, device, etc + if obj and all(isinstance(x, torch.Tensor) for x in obj): + # list of arrays: *under torch.Dynamo* these are FakeTensors + obj = torch.stack(obj) + else: + # XXX: remove tolist + # lists of ndarrays: [1, [2, 3], ndarray(4)] convert to lists of lists + obj = _tolist(obj) + + # is obj an ndarray already? + if isinstance(obj, ndarray): + obj = obj.tensor + + # is a specific dtype requested? + torch_dtype = None + if dtype is not None: + torch_dtype = _dtypes.dtype(dtype).torch_dtype + + tensor = _util._coerce_to_tensor(obj, torch_dtype, copy, ndmin) + return ndarray(tensor) + + +def asarray(a, dtype=None, order="K", *, like=None): + return array(a, dtype=dtype, order=order, like=like, copy=False, ndmin=0) + + +def ascontiguousarray(a, dtype=None, *, like=None): + arr = asarray(a, dtype=dtype, like=like) + if not arr.tensor.is_contiguous(): + arr.tensor = arr.tensor.contiguous() + return arr + + +def from_dlpack(x, /): + t = torch.from_dlpack(x) + return ndarray(t) + + +def _extract_dtype(entry): + try: + dty = _dtypes.dtype(entry) + except Exception: + dty = asarray(entry).dtype + return dty + + +def can_cast(from_, to, casting="safe"): + from_ = _extract_dtype(from_) + to_ = _extract_dtype(to) + + return _dtypes_impl.can_cast_impl(from_.torch_dtype, to_.torch_dtype, casting) + + +def result_type(*arrays_and_dtypes): + tensors = [] + for entry in arrays_and_dtypes: + try: + t = asarray(entry).tensor + except (RuntimeError, ValueError, TypeError): + dty = _dtypes.dtype(entry) + t = torch.empty(1, dtype=dty.torch_dtype) + tensors.append(t) + + torch_dtype = _dtypes_impl.result_type_impl(*tensors) + return _dtypes.dtype(torch_dtype) diff --git a/MLPY/Lib/site-packages/torch/_numpy/_normalizations.py b/MLPY/Lib/site-packages/torch/_numpy/_normalizations.py new file mode 100644 index 0000000000000000000000000000000000000000..f2167e25554d782b30c7e445ae52560fb92ed397 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_numpy/_normalizations.py @@ -0,0 +1,258 @@ +# mypy: ignore-errors + +""" "Normalize" arguments: convert array_likes to tensors, dtypes to torch dtypes and so on. +""" +from __future__ import annotations + +import functools +import inspect +import operator +import typing + +import torch + +from . import _dtypes, _dtypes_impl, _util + +ArrayLike = typing.TypeVar("ArrayLike") +Scalar = typing.Union[int, float, complex, bool] +ArrayLikeOrScalar = typing.Union[ArrayLike, Scalar] + +DTypeLike = typing.TypeVar("DTypeLike") +AxisLike = typing.TypeVar("AxisLike") +NDArray = typing.TypeVar("NDArray") +CastingModes = typing.TypeVar("CastingModes") +KeepDims = typing.TypeVar("KeepDims") + +# OutArray is to annotate the out= array argument. +# +# This one is special is several respects: +# First, It needs to be an NDArray, and we need to preserve the `result is out` +# semantics. Therefore, we cannot just extract the Tensor from the out array. +# So we never pass the out array to implementer functions and handle it in the +# `normalizer` below. 
+# Second, the out= argument can be either keyword or positional argument, and +# as a positional arg, it can be anywhere in the signature. +# To handle all this, we define a special `OutArray` annotation and dispatch on it. +# +OutArray = typing.TypeVar("OutArray") + +try: + from typing import NotImplementedType +except ImportError: + NotImplementedType = typing.TypeVar("NotImplementedType") + + +def normalize_array_like(x, parm=None): + from ._ndarray import asarray + + return asarray(x).tensor + + +def normalize_array_like_or_scalar(x, parm=None): + if _dtypes_impl.is_scalar_or_symbolic(x): + return x + return normalize_array_like(x, parm) + + +def normalize_optional_array_like_or_scalar(x, parm=None): + if x is None: + return None + return normalize_array_like_or_scalar(x, parm) + + +def normalize_optional_array_like(x, parm=None): + # This explicit normalizer is needed because otherwise normalize_array_like + # does not run for a parameter annotated as Optional[ArrayLike] + return None if x is None else normalize_array_like(x, parm) + + +def normalize_seq_array_like(x, parm=None): + return tuple(normalize_array_like(value) for value in x) + + +def normalize_dtype(dtype, parm=None): + # cf _decorators.dtype_to_torch + torch_dtype = None + if dtype is not None: + dtype = _dtypes.dtype(dtype) + torch_dtype = dtype.torch_dtype + return torch_dtype + + +def normalize_not_implemented(arg, parm): + if arg != parm.default: + raise NotImplementedError(f"'{parm.name}' parameter is not supported.") + + +def normalize_axis_like(arg, parm=None): + from ._ndarray import ndarray + + if isinstance(arg, ndarray): + arg = operator.index(arg) + return arg + + +def normalize_ndarray(arg, parm=None): + # check the arg is an ndarray, extract its tensor attribute + if arg is None: + return arg + + from ._ndarray import ndarray + + if not isinstance(arg, ndarray): + raise TypeError(f"'{parm.name}' must be an array") + return arg.tensor + + +def normalize_outarray(arg, parm=None): + # almost normalize_ndarray, only return the array, not its tensor + if arg is None: + return arg + from ._ndarray import ndarray + + # Dynamo can pass torch tensors as out arguments, + # wrap it in an ndarray before processing + if isinstance(arg, torch.Tensor): + arg = ndarray(arg) + + if not isinstance(arg, ndarray): + raise TypeError(f"'{parm.name}' must be an array") + return arg + + +def normalize_casting(arg, parm=None): + if arg not in ["no", "equiv", "safe", "same_kind", "unsafe"]: + raise ValueError( + f"casting must be one of 'no', 'equiv', 'safe', 'same_kind', or 'unsafe' (got '{arg}')" + ) + return arg + + +normalizers = { + "ArrayLike": normalize_array_like, + "ArrayLikeOrScalar": normalize_array_like_or_scalar, + "Optional[ArrayLike]": normalize_optional_array_like, + "Sequence[ArrayLike]": normalize_seq_array_like, + "Optional[ArrayLikeOrScalar]": normalize_optional_array_like_or_scalar, + "Optional[NDArray]": normalize_ndarray, + "Optional[OutArray]": normalize_outarray, + "NDArray": normalize_ndarray, + "Optional[DTypeLike]": normalize_dtype, + "AxisLike": normalize_axis_like, + "NotImplementedType": normalize_not_implemented, + "Optional[CastingModes]": normalize_casting, +} + + +def maybe_normalize(arg, parm): + """Normalize arg if a normalizer is registered.""" + normalizer = normalizers.get(parm.annotation, None) + return normalizer(arg, parm) if normalizer else arg + + +# ### Return value helpers ### + + +def maybe_copy_to(out, result, promote_scalar_result=False): + # NB: here out is either an ndarray or 
None + if out is None: + return result + elif isinstance(result, torch.Tensor): + if result.shape != out.shape: + can_fit = result.numel() == 1 and out.ndim == 0 + if promote_scalar_result and can_fit: + result = result.squeeze() + else: + raise ValueError( + f"Bad size of the out array: out.shape = {out.shape}" + f" while result.shape = {result.shape}." + ) + out.tensor.copy_(result) + return out + elif isinstance(result, (tuple, list)): + return type(result)( + maybe_copy_to(o, r, promote_scalar_result) for o, r in zip(out, result) + ) + else: + raise AssertionError() # We should never hit this path + + +def wrap_tensors(result): + from ._ndarray import ndarray + + if isinstance(result, torch.Tensor): + return ndarray(result) + elif isinstance(result, (tuple, list)): + result = type(result)(wrap_tensors(x) for x in result) + return result + + +def array_or_scalar(values, py_type=float, return_scalar=False): + if return_scalar: + return py_type(values.item()) + else: + from ._ndarray import ndarray + + return ndarray(values) + + +# ### The main decorator to normalize arguments / postprocess the output ### + + +def normalizer(_func=None, *, promote_scalar_result=False): + def normalizer_inner(func): + @functools.wraps(func) + def wrapped(*args, **kwds): + sig = inspect.signature(func) + params = sig.parameters + first_param = next(iter(params.values())) + + # NumPy's API does not have positional args before variadic positional args + if first_param.kind == inspect.Parameter.VAR_POSITIONAL: + args = [maybe_normalize(arg, first_param) for arg in args] + else: + # NB: extra unknown arguments: pass through, will raise in func(*args) below + args = ( + tuple( + maybe_normalize(arg, parm) + for arg, parm in zip(args, params.values()) + ) + + args[len(params.values()) :] + ) + + kwds = { + name: maybe_normalize(arg, params[name]) if name in params else arg + for name, arg in kwds.items() + } + + result = func(*args, **kwds) + + # keepdims + bound_args = None + if "keepdims" in params and params["keepdims"].annotation == "KeepDims": + # keepdims can be in any position so we need sig.bind + bound_args = sig.bind(*args, **kwds).arguments + if bound_args.get("keepdims", False): + # In this case the first arg is the initial tensor and + # the second arg is (optionally) the axis + tensor = args[0] + axis = bound_args.get("axis") + result = _util.apply_keepdims(result, axis, tensor.ndim) + + # out + if "out" in params: + # out can be in any position so we need sig.bind + if bound_args is None: + bound_args = sig.bind(*args, **kwds).arguments + out = bound_args.get("out") + result = maybe_copy_to(out, result, promote_scalar_result) + result = wrap_tensors(result) + + return result + + return wrapped + + if _func is None: + return normalizer_inner + else: + return normalizer_inner(_func) diff --git a/MLPY/Lib/site-packages/torch/_numpy/_reductions_impl.py b/MLPY/Lib/site-packages/torch/_numpy/_reductions_impl.py new file mode 100644 index 0000000000000000000000000000000000000000..5b0bba1bc12eb6de98259867132eb2d51fb9f941 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_numpy/_reductions_impl.py @@ -0,0 +1,456 @@ +# mypy: ignore-errors + +""" Implementation of reduction operations, to be wrapped into arrays, dtypes etc +in the 'public' layer. + +Anything here only deals with torch objects, e.g. "dtype" is a torch.dtype instance etc +""" +from __future__ import annotations + +import functools +from typing import Optional + +import torch + +from . 
import _dtypes_impl, _util +from ._normalizations import ( + ArrayLike, + AxisLike, + DTypeLike, + KeepDims, + NotImplementedType, + OutArray, +) + + +def _deco_axis_expand(func): + """ + Generically handle axis arguments in reductions. + axis is *always* the 2nd arg in the function so no need to have a look at its signature + """ + + @functools.wraps(func) + def wrapped(a, axis=None, *args, **kwds): + if axis is not None: + axis = _util.normalize_axis_tuple(axis, a.ndim) + + if axis == (): + # So we insert a length-one axis and run the reduction along it. + # We cannot return a.clone() as this would sidestep the checks inside the function + newshape = _util.expand_shape(a.shape, axis=0) + a = a.reshape(newshape) + axis = (0,) + + return func(a, axis, *args, **kwds) + + return wrapped + + +def _atleast_float(dtype, other_dtype): + """Return a dtype that is real or complex floating-point. + + For inputs that are boolean or integer dtypes, this returns the default + float dtype; inputs that are complex get converted to the default complex + dtype; real floating-point dtypes (`float*`) get passed through unchanged + """ + if dtype is None: + dtype = other_dtype + if not (dtype.is_floating_point or dtype.is_complex): + return _dtypes_impl.default_dtypes().float_dtype + return dtype + + +@_deco_axis_expand +def count_nonzero(a: ArrayLike, axis: AxisLike = None, *, keepdims: KeepDims = False): + return a.count_nonzero(axis) + + +@_deco_axis_expand +def argmax( + a: ArrayLike, + axis: AxisLike = None, + out: Optional[OutArray] = None, + *, + keepdims: KeepDims = False, +): + if a.is_complex(): + raise NotImplementedError(f"argmax with dtype={a.dtype}.") + + axis = _util.allow_only_single_axis(axis) + + if a.dtype == torch.bool: + # RuntimeError: "argmax_cpu" not implemented for 'Bool' + a = a.to(torch.uint8) + + return torch.argmax(a, axis) + + +@_deco_axis_expand +def argmin( + a: ArrayLike, + axis: AxisLike = None, + out: Optional[OutArray] = None, + *, + keepdims: KeepDims = False, +): + if a.is_complex(): + raise NotImplementedError(f"argmin with dtype={a.dtype}.") + + axis = _util.allow_only_single_axis(axis) + + if a.dtype == torch.bool: + # RuntimeError: "argmin_cpu" not implemented for 'Bool' + a = a.to(torch.uint8) + + return torch.argmin(a, axis) + + +@_deco_axis_expand +def any( + a: ArrayLike, + axis: AxisLike = None, + out: Optional[OutArray] = None, + keepdims: KeepDims = False, + *, + where: NotImplementedType = None, +): + axis = _util.allow_only_single_axis(axis) + axis_kw = {} if axis is None else {"dim": axis} + return torch.any(a, **axis_kw) + + +@_deco_axis_expand +def all( + a: ArrayLike, + axis: AxisLike = None, + out: Optional[OutArray] = None, + keepdims: KeepDims = False, + *, + where: NotImplementedType = None, +): + axis = _util.allow_only_single_axis(axis) + axis_kw = {} if axis is None else {"dim": axis} + return torch.all(a, **axis_kw) + + +@_deco_axis_expand +def amax( + a: ArrayLike, + axis: AxisLike = None, + out: Optional[OutArray] = None, + keepdims: KeepDims = False, + initial: NotImplementedType = None, + where: NotImplementedType = None, +): + if a.is_complex(): + raise NotImplementedError(f"amax with dtype={a.dtype}") + + return a.amax(axis) + + +max = amax + + +@_deco_axis_expand +def amin( + a: ArrayLike, + axis: AxisLike = None, + out: Optional[OutArray] = None, + keepdims: KeepDims = False, + initial: NotImplementedType = None, + where: NotImplementedType = None, +): + if a.is_complex(): + raise NotImplementedError(f"amin with dtype={a.dtype}") + + 
return a.amin(axis) + + +min = amin + + +@_deco_axis_expand +def ptp( + a: ArrayLike, + axis: AxisLike = None, + out: Optional[OutArray] = None, + keepdims: KeepDims = False, +): + return a.amax(axis) - a.amin(axis) + + +@_deco_axis_expand +def sum( + a: ArrayLike, + axis: AxisLike = None, + dtype: Optional[DTypeLike] = None, + out: Optional[OutArray] = None, + keepdims: KeepDims = False, + initial: NotImplementedType = None, + where: NotImplementedType = None, +): + assert dtype is None or isinstance(dtype, torch.dtype) + + if dtype == torch.bool: + dtype = _dtypes_impl.default_dtypes().int_dtype + + axis_kw = {} if axis is None else {"dim": axis} + return a.sum(dtype=dtype, **axis_kw) + + +@_deco_axis_expand +def prod( + a: ArrayLike, + axis: AxisLike = None, + dtype: Optional[DTypeLike] = None, + out: Optional[OutArray] = None, + keepdims: KeepDims = False, + initial: NotImplementedType = None, + where: NotImplementedType = None, +): + axis = _util.allow_only_single_axis(axis) + + if dtype == torch.bool: + dtype = _dtypes_impl.default_dtypes().int_dtype + + axis_kw = {} if axis is None else {"dim": axis} + return a.prod(dtype=dtype, **axis_kw) + + +product = prod + + +@_deco_axis_expand +def mean( + a: ArrayLike, + axis: AxisLike = None, + dtype: Optional[DTypeLike] = None, + out: Optional[OutArray] = None, + keepdims: KeepDims = False, + *, + where: NotImplementedType = None, +): + dtype = _atleast_float(dtype, a.dtype) + + axis_kw = {} if axis is None else {"dim": axis} + result = a.mean(dtype=dtype, **axis_kw) + + return result + + +@_deco_axis_expand +def std( + a: ArrayLike, + axis: AxisLike = None, + dtype: Optional[DTypeLike] = None, + out: Optional[OutArray] = None, + ddof=0, + keepdims: KeepDims = False, + *, + where: NotImplementedType = None, +): + in_dtype = dtype + dtype = _atleast_float(dtype, a.dtype) + tensor = _util.cast_if_needed(a, dtype) + result = tensor.std(dim=axis, correction=ddof) + return _util.cast_if_needed(result, in_dtype) + + +@_deco_axis_expand +def var( + a: ArrayLike, + axis: AxisLike = None, + dtype: Optional[DTypeLike] = None, + out: Optional[OutArray] = None, + ddof=0, + keepdims: KeepDims = False, + *, + where: NotImplementedType = None, +): + in_dtype = dtype + dtype = _atleast_float(dtype, a.dtype) + tensor = _util.cast_if_needed(a, dtype) + result = tensor.var(dim=axis, correction=ddof) + return _util.cast_if_needed(result, in_dtype) + + +# cumsum / cumprod are almost reductions: +# 1. no keepdims +# 2. 
axis=None flattens + + +def cumsum( + a: ArrayLike, + axis: AxisLike = None, + dtype: Optional[DTypeLike] = None, + out: Optional[OutArray] = None, +): + if dtype == torch.bool: + dtype = _dtypes_impl.default_dtypes().int_dtype + if dtype is None: + dtype = a.dtype + + (a,), axis = _util.axis_none_flatten(a, axis=axis) + axis = _util.normalize_axis_index(axis, a.ndim) + + return a.cumsum(axis=axis, dtype=dtype) + + +def cumprod( + a: ArrayLike, + axis: AxisLike = None, + dtype: Optional[DTypeLike] = None, + out: Optional[OutArray] = None, +): + if dtype == torch.bool: + dtype = _dtypes_impl.default_dtypes().int_dtype + if dtype is None: + dtype = a.dtype + + (a,), axis = _util.axis_none_flatten(a, axis=axis) + axis = _util.normalize_axis_index(axis, a.ndim) + + return a.cumprod(axis=axis, dtype=dtype) + + +cumproduct = cumprod + + +def average( + a: ArrayLike, + axis=None, + weights: ArrayLike = None, + returned=False, + *, + keepdims=False, +): + if weights is None: + result = mean(a, axis=axis) + wsum = torch.as_tensor(a.numel() / result.numel(), dtype=result.dtype) + else: + if not a.dtype.is_floating_point: + a = a.double() + + # axis & weights + if a.shape != weights.shape: + if axis is None: + raise TypeError( + "Axis must be specified when shapes of a and weights differ." + ) + if weights.ndim != 1: + raise TypeError( + "1D weights expected when shapes of a and weights differ." + ) + if weights.shape[0] != a.shape[axis]: + raise ValueError( + "Length of weights not compatible with specified axis." + ) + + # setup weight to broadcast along axis + weights = torch.broadcast_to(weights, (a.ndim - 1) * (1,) + weights.shape) + weights = weights.swapaxes(-1, axis) + + # do the work + result_dtype = _dtypes_impl.result_type_impl(a, weights) + numerator = sum(a * weights, axis, dtype=result_dtype) + wsum = sum(weights, axis, dtype=result_dtype) + result = numerator / wsum + + # We process keepdims manually because the decorator does not deal with variadic returns + if keepdims: + result = _util.apply_keepdims(result, axis, a.ndim) + + if returned: + if wsum.shape != result.shape: + wsum = torch.broadcast_to(wsum, result.shape).clone() + return result, wsum + else: + return result + + +# Not using deco_axis_expand as it assumes that axis is the second arg +def quantile( + a: ArrayLike, + q: ArrayLike, + axis: AxisLike = None, + out: Optional[OutArray] = None, + overwrite_input=False, + method="linear", + keepdims: KeepDims = False, + *, + interpolation: NotImplementedType = None, +): + if overwrite_input: + # raise NotImplementedError("overwrite_input in quantile not implemented.") + # NumPy documents that `overwrite_input` MAY modify inputs: + # https://numpy.org/doc/stable/reference/generated/numpy.percentile.html#numpy-percentile + # Here we choose to work out-of-place because why not. + pass + + if not a.dtype.is_floating_point: + dtype = _dtypes_impl.default_dtypes().float_dtype + a = a.to(dtype) + + # edge case: torch.quantile only supports float32 and float64 + if a.dtype == torch.float16: + a = a.to(torch.float32) + + if axis is None: + a = a.flatten() + q = q.flatten() + axis = (0,) + else: + axis = _util.normalize_axis_tuple(axis, a.ndim) + + # FIXME(Mario) Doesn't np.quantile accept a tuple? + # torch.quantile does accept a number. If we don't want to implement the tuple behaviour + # (it's deffo low prio) change `normalize_axis_tuple` into a normalize_axis index above. 
+ axis = _util.allow_only_single_axis(axis) + + q = _util.cast_if_needed(q, a.dtype) + + return torch.quantile(a, q, axis=axis, interpolation=method) + + +def percentile( + a: ArrayLike, + q: ArrayLike, + axis: AxisLike = None, + out: Optional[OutArray] = None, + overwrite_input=False, + method="linear", + keepdims: KeepDims = False, + *, + interpolation: NotImplementedType = None, +): + # np.percentile(float_tensor, 30) : q.dtype is int64 => q / 100.0 is float32 + if _dtypes_impl.python_type_for_torch(q.dtype) == int: + q = q.to(_dtypes_impl.default_dtypes().float_dtype) + qq = q / 100.0 + + return quantile( + a, + qq, + axis=axis, + overwrite_input=overwrite_input, + method=method, + keepdims=keepdims, + interpolation=interpolation, + ) + + +def median( + a: ArrayLike, + axis=None, + out: Optional[OutArray] = None, + overwrite_input=False, + keepdims: KeepDims = False, +): + return quantile( + a, + torch.as_tensor(0.5), + axis=axis, + overwrite_input=overwrite_input, + out=out, + keepdims=keepdims, + ) diff --git a/MLPY/Lib/site-packages/torch/_numpy/_ufuncs.py b/MLPY/Lib/site-packages/torch/_numpy/_ufuncs.py new file mode 100644 index 0000000000000000000000000000000000000000..139aa89ebc5016dad0c522efaa2970c308f2df59 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_numpy/_ufuncs.py @@ -0,0 +1,334 @@ +# mypy: ignore-errors + +from __future__ import annotations + +from typing import Optional + +import torch + +from . import _binary_ufuncs_impl, _dtypes_impl, _unary_ufuncs_impl, _util +from ._normalizations import ( + ArrayLike, + ArrayLikeOrScalar, + CastingModes, + DTypeLike, + normalizer, + NotImplementedType, + OutArray, +) + + +def _ufunc_postprocess(result, out, casting): + if out is not None: + result = _util.typecast_tensor(result, out.dtype.torch_dtype, casting) + result = torch.broadcast_to(result, out.shape) + return result + + +# ############# Binary ufuncs ###################### + +_binary = [ + name + for name in dir(_binary_ufuncs_impl) + if not name.startswith("_") and name not in ["torch", "matmul", "divmod", "ldexp"] +] + + +NEP50_FUNCS = ( + "add", + "subtract", + "multiply", + "floor_divide", + "true_divide", + "divide", + "remainder", + "bitwise_and", + "bitwise_or", + "bitwise_xor", + "bitwise_left_shift", + "bitwise_right_shift", + "hypot", + "arctan2", + "logaddexp", + "logaddexp2", + "heaviside", + "copysign", + "fmax", + "minimum", + "fmin", + "maximum", + "fmod", + "gcd", + "lcm", + "pow", +) + + +def deco_binary_ufunc(torch_func): + """Common infra for binary ufuncs. + + Normalize arguments, sort out type casting, broadcasting and delegate to + the pytorch functions for the actual work. 
+ """ + + @normalizer + def wrapped( + x1: ArrayLikeOrScalar, + x2: ArrayLikeOrScalar, + /, + out: Optional[OutArray] = None, + *, + where: NotImplementedType = True, + casting: Optional[CastingModes] = "same_kind", + order: NotImplementedType = "K", + dtype: Optional[DTypeLike] = None, + subok: NotImplementedType = False, + signature: NotImplementedType = None, + extobj: NotImplementedType = None, + ): + if dtype is not None: + + def cast(x, dtype): + if isinstance(x, torch.Tensor): + return _util.typecast_tensor(x, dtype, casting) + else: + return torch.as_tensor(x, dtype=dtype) + + x1 = cast(x1, dtype) + x2 = cast(x2, dtype) + elif isinstance(x1, torch.Tensor) and isinstance(x2, torch.Tensor): + dtype = _dtypes_impl.result_type_impl(x1, x2) + x1, x2 = _util.typecast_tensors((x1, x2), dtype, casting) + else: + x1, x2 = _dtypes_impl.nep50_to_tensors( + x1, x2, torch_func.__name__ in NEP50_FUNCS, torch_func.__name__ + ) + + result = torch_func(x1, x2) + + return _ufunc_postprocess(result, out, casting) + + wrapped.__qualname__ = torch_func.__name__ + wrapped.__name__ = torch_func.__name__ + + return wrapped + + +# matmul's signature is _slightly_ different from other ufuncs: +# - no where=... +# - additional axis=..., axes=... +# - no NEP50 scalars in or out +@normalizer +def matmul( + x1: ArrayLike, + x2: ArrayLike, + /, + out: Optional[OutArray] = None, + *, + casting: Optional[CastingModes] = "same_kind", + order: NotImplementedType = "K", + dtype: Optional[DTypeLike] = None, + subok: NotImplementedType = False, + signature: NotImplementedType = None, + extobj: NotImplementedType = None, + axes: NotImplementedType = None, + axis: NotImplementedType = None, +): + if dtype is None: + dtype = _dtypes_impl.result_type_impl(x1, x2) + x1, x2 = _util.typecast_tensors((x1, x2), dtype, casting) + + result = _binary_ufuncs_impl.matmul(x1, x2) + + result = _ufunc_postprocess(result, out, casting) + return result + + +# ldexp casting is special : the dtype of the result == dtype of the 1st arg +@normalizer +def ldexp( + x1: ArrayLikeOrScalar, + x2: ArrayLikeOrScalar, + /, + out: Optional[OutArray] = None, + *, + where: NotImplementedType = True, + casting: Optional[CastingModes] = "same_kind", + order: NotImplementedType = "K", + dtype: Optional[DTypeLike] = None, + subok: NotImplementedType = False, + signature: NotImplementedType = None, + extobj: NotImplementedType = None, +): + if dtype is not None: + if isinstance(x1, torch.Tensor): + x1 = _util.typecast_tensor(x1, dtype, casting) + else: + x1 = torch.as_tensor(x1, dtype=dtype) + else: + if not isinstance(x1, torch.Tensor): + x1 = torch.as_tensor(x1) + x1 = _util.cast_int_to_float(x1) + + x2 = torch.as_tensor(x2) + # the second arg must be integer + if _dtypes_impl._category(x2.dtype) != 1: + raise ValueError("ldexp 2nd arg must be integer") + + result = _binary_ufuncs_impl.ldexp(x1, x2) + + if x1.dtype == torch.float16: + # torch.ldexp(f16, int) -> f32, undo it + result = result.to(torch.float16) + + return _ufunc_postprocess(result, out, casting) + + +# nin=2, nout=2 +@normalizer +def divmod( + x1: ArrayLike, + x2: ArrayLike, + out1: Optional[OutArray] = None, + out2: Optional[OutArray] = None, + /, + out: tuple[Optional[OutArray], Optional[OutArray]] = (None, None), + *, + where: NotImplementedType = True, + casting: Optional[CastingModes] = "same_kind", + order: NotImplementedType = "K", + dtype: Optional[DTypeLike] = None, + subok: NotImplementedType = False, + signature: NotImplementedType = None, + extobj: NotImplementedType = None, +): 
+ # make sure we either have no out arrays at all, or there is either + # out1, out2, or out=tuple, but not both + num_outs = sum(x is not None for x in [out1, out2]) + if num_outs == 1: + raise ValueError("both out1 and out2 need to be provided") + elif num_outs == 2: + o1, o2 = out + if o1 is not None or o2 is not None: + raise TypeError( + "cannot specify 'out' as both a positional and keyword argument" + ) + else: + out1, out2 = out + + if dtype is None: + dtype = _dtypes_impl.result_type_impl(x1, x2) + x1, x2 = _util.typecast_tensors((x1, x2), dtype, casting) + + quot, rem = _binary_ufuncs_impl.divmod(x1, x2) + + quot = _ufunc_postprocess(quot, out1, casting) + rem = _ufunc_postprocess(rem, out2, casting) + return quot, rem + + +# +# Attach ufuncs to this module, for a further export to the public namespace in __init__.py +# +for name in _binary: + ufunc = getattr(_binary_ufuncs_impl, name) + vars()[name] = deco_binary_ufunc(ufunc) + + +def modf(x, /, *args, **kwds): + quot, rem = divmod(x, 1, *args, **kwds) + return rem, quot + + +_binary = _binary + ["divmod", "modf", "matmul", "ldexp"] + + +# ############# Unary ufuncs ###################### + + +_unary = [ + name + for name in dir(_unary_ufuncs_impl) + if not name.startswith("_") and name != "torch" +] + + +# these are ufunc(int) -> float +_fp_unary = [ + "arccos", + "arccosh", + "arcsin", + "arcsinh", + "arctan", + "arctanh", + "cbrt", + "cos", + "cosh", + "deg2rad", + "degrees", + "exp", + "exp2", + "expm1", + "log", + "log10", + "log1p", + "log2", + "rad2deg", + "radians", + "reciprocal", + "sin", + "sinh", + "sqrt", + "square", + "tan", + "tanh", + "trunc", +] + + +def deco_unary_ufunc(torch_func): + """Common infra for unary ufuncs. + + Normalize arguments, sort out type casting, broadcasting and delegate to + the pytorch functions for the actual work. + """ + + @normalizer + def wrapped( + x: ArrayLike, + /, + out: Optional[OutArray] = None, + *, + where=True, + casting: Optional[CastingModes] = "same_kind", + order="K", + dtype: Optional[DTypeLike] = None, + subok: NotImplementedType = False, + signature=None, + extobj=None, + ): + if dtype is not None: + x = _util.typecast_tensor(x, dtype, casting) + + if torch_func.__name__ in _fp_unary: + x = _util.cast_int_to_float(x) + + result = torch_func(x) + result = _ufunc_postprocess(result, out, casting) + return result + + wrapped.__qualname__ = torch_func.__name__ + wrapped.__name__ = torch_func.__name__ + + return wrapped + + +# +# Attach ufuncs to this module, for a further export to the public namespace in __init__.py +# +for name in _unary: + ufunc = getattr(_unary_ufuncs_impl, name) + vars()[name] = deco_unary_ufunc(ufunc) + + +__all__ = _binary + _unary # noqa: PLE0605 diff --git a/MLPY/Lib/site-packages/torch/_numpy/_unary_ufuncs_impl.py b/MLPY/Lib/site-packages/torch/_numpy/_unary_ufuncs_impl.py new file mode 100644 index 0000000000000000000000000000000000000000..8678f87816a36cae55bec7d525bf514f5149c3f5 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_numpy/_unary_ufuncs_impl.py @@ -0,0 +1,73 @@ +# mypy: ignore-errors + +"""Export torch work functions for unary ufuncs, rename/tweak to match numpy. +This listing is further exported to public symbols in the `_numpy/_ufuncs.py` module. 
+""" + +import torch + +from torch import ( # noqa: F401 + absolute as fabs, # noqa: F401 + arccos, # noqa: F401 + arccosh, # noqa: F401 + arcsin, # noqa: F401 + arcsinh, # noqa: F401 + arctan, # noqa: F401 + arctanh, # noqa: F401 + bitwise_not, # noqa: F401 + bitwise_not as invert, # noqa: F401 + ceil, # noqa: F401 + conj_physical as conjugate, # noqa: F401 + cos, # noqa: F401 + cosh, # noqa: F401 + deg2rad, # noqa: F401 + deg2rad as radians, # noqa: F401 + exp, # noqa: F401 + exp2, # noqa: F401 + expm1, # noqa: F401 + floor, # noqa: F401 + isfinite, # noqa: F401 + isinf, # noqa: F401 + isnan, # noqa: F401 + log, # noqa: F401 + log10, # noqa: F401 + log1p, # noqa: F401 + log2, # noqa: F401 + logical_not, # noqa: F401 + negative, # noqa: F401 + rad2deg, # noqa: F401 + rad2deg as degrees, # noqa: F401 + reciprocal, # noqa: F401 + round as fix, # noqa: F401 + round as rint, # noqa: F401 + sign, # noqa: F401 + signbit, # noqa: F401 + sin, # noqa: F401 + sinh, # noqa: F401 + sqrt, # noqa: F401 + square, # noqa: F401 + tan, # noqa: F401 + tanh, # noqa: F401 + trunc, # noqa: F401 +) + + +# special cases: torch does not export these names +def cbrt(x): + return torch.pow(x, 1 / 3) + + +def positive(x): + return +x + + +def absolute(x): + # work around torch.absolute not impl for bools + if x.dtype == torch.bool: + return x + return torch.absolute(x) + + +# TODO set __name__ and __qualname__ +abs = absolute +conj = conjugate diff --git a/MLPY/Lib/site-packages/torch/_numpy/_util.py b/MLPY/Lib/site-packages/torch/_numpy/_util.py new file mode 100644 index 0000000000000000000000000000000000000000..c147bd30550972625a0b8fb424568096654c047c --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_numpy/_util.py @@ -0,0 +1,261 @@ +# mypy: ignore-errors + +"""Assorted utilities, which do not need anything other then torch and stdlib. +""" + +import operator + +import torch + +from . import _dtypes_impl + + +# https://github.com/numpy/numpy/blob/v1.23.0/numpy/distutils/misc_util.py#L497-L504 +def is_sequence(seq): + if isinstance(seq, str): + return False + try: + len(seq) + except Exception: + return False + return True + + +class AxisError(ValueError, IndexError): + pass + + +class UFuncTypeError(TypeError, RuntimeError): + pass + + +def cast_if_needed(tensor, dtype): + # NB: no casting if dtype=None + if dtype is not None and tensor.dtype != dtype: + tensor = tensor.to(dtype) + return tensor + + +def cast_int_to_float(x): + # cast integers and bools to the default float dtype + if _dtypes_impl._category(x.dtype) < 2: + x = x.to(_dtypes_impl.default_dtypes().float_dtype) + return x + + +# a replica of the version in ./numpy/numpy/core/src/multiarray/common.h +def normalize_axis_index(ax, ndim, argname=None): + if not (-ndim <= ax < ndim): + raise AxisError(f"axis {ax} is out of bounds for array of dimension {ndim}") + if ax < 0: + ax += ndim + return ax + + +# from https://github.com/numpy/numpy/blob/main/numpy/core/numeric.py#L1378 +def normalize_axis_tuple(axis, ndim, argname=None, allow_duplicate=False): + """ + Normalizes an axis argument into a tuple of non-negative integer axes. + + This handles shorthands such as ``1`` and converts them to ``(1,)``, + as well as performing the handling of negative indices covered by + `normalize_axis_index`. + + By default, this forbids axes from being specified multiple times. + Used internally by multi-axis-checking logic. + + Parameters + ---------- + axis : int, iterable of int + The un-normalized index or indices of the axis. 
+ ndim : int + The number of dimensions of the array that `axis` should be normalized + against. + argname : str, optional + A prefix to put before the error message, typically the name of the + argument. + allow_duplicate : bool, optional + If False, the default, disallow an axis from being specified twice. + + Returns + ------- + normalized_axes : tuple of int + The normalized axis index, such that `0 <= normalized_axis < ndim` + """ + # Optimization to speed-up the most common cases. + if type(axis) not in (tuple, list): + try: + axis = [operator.index(axis)] + except TypeError: + pass + # Going via an iterator directly is slower than via list comprehension. + axis = tuple([normalize_axis_index(ax, ndim, argname) for ax in axis]) + if not allow_duplicate and len(set(axis)) != len(axis): + if argname: + raise ValueError(f"repeated axis in `{argname}` argument") + else: + raise ValueError("repeated axis") + return axis + + +def allow_only_single_axis(axis): + if axis is None: + return axis + if len(axis) != 1: + raise NotImplementedError("does not handle tuple axis") + return axis[0] + + +def expand_shape(arr_shape, axis): + # taken from numpy 1.23.x, expand_dims function + if type(axis) not in (list, tuple): + axis = (axis,) + out_ndim = len(axis) + len(arr_shape) + axis = normalize_axis_tuple(axis, out_ndim) + shape_it = iter(arr_shape) + shape = [1 if ax in axis else next(shape_it) for ax in range(out_ndim)] + return shape + + +def apply_keepdims(tensor, axis, ndim): + if axis is None: + # tensor was a scalar + shape = (1,) * ndim + tensor = tensor.expand(shape).contiguous() + else: + shape = expand_shape(tensor.shape, axis) + tensor = tensor.reshape(shape) + return tensor + + +def axis_none_flatten(*tensors, axis=None): + """Flatten the arrays if axis is None.""" + if axis is None: + tensors = tuple(ar.flatten() for ar in tensors) + return tensors, 0 + else: + return tensors, axis + + +def typecast_tensor(t, target_dtype, casting): + """Dtype-cast tensor to target_dtype. + + Parameters + ---------- + t : torch.Tensor + The tensor to cast + target_dtype : torch dtype object + The array dtype to cast all tensors to + casting : str + The casting mode, see `np.can_cast` + + Returns + ------- + `torch.Tensor` of the `target_dtype` dtype + + Raises + ------ + ValueError + if the argument cannot be cast according to the `casting` rule + + """ + can_cast = _dtypes_impl.can_cast_impl + + if not can_cast(t.dtype, target_dtype, casting=casting): + raise TypeError( + f"Cannot cast array data from {t.dtype} to" + f" {target_dtype} according to the rule '{casting}'" + ) + return cast_if_needed(t, target_dtype) + + +def typecast_tensors(tensors, target_dtype, casting): + return tuple(typecast_tensor(t, target_dtype, casting) for t in tensors) + + +def _try_convert_to_tensor(obj): + try: + tensor = torch.as_tensor(obj) + except Exception as e: + mesg = f"failed to convert {obj} to ndarray. \nInternal error is: {str(e)}." + raise NotImplementedError(mesg) # noqa: TRY200 + return tensor + + +def _coerce_to_tensor(obj, dtype=None, copy=False, ndmin=0): + """The core logic of the array(...) function. + + Parameters + ---------- + obj : tensor_like + The thing to coerce + dtype : torch.dtype object or None + Coerce to this torch dtype + copy : bool + Copy or not + ndmin : int + The results as least this many dimensions + is_weak : bool + Whether obj is a weakly typed python scalar. + + Returns + ------- + tensor : torch.Tensor + a tensor object with requested dtype, ndim and copy semantics. 
+ + Notes + ----- + This is almost a "tensor_like" coersion function. Does not handle wrapper + ndarrays (those should be handled in the ndarray-aware layer prior to + invoking this function). + """ + if isinstance(obj, torch.Tensor): + tensor = obj + else: + # tensor.dtype is the pytorch default, typically float32. If obj's elements + # are not exactly representable in float32, we've lost precision: + # >>> torch.as_tensor(1e12).item() - 1e12 + # -4096.0 + default_dtype = torch.get_default_dtype() + torch.set_default_dtype(_dtypes_impl.get_default_dtype_for(torch.float32)) + try: + tensor = _try_convert_to_tensor(obj) + finally: + torch.set_default_dtype(default_dtype) + + # type cast if requested + tensor = cast_if_needed(tensor, dtype) + + # adjust ndim if needed + ndim_extra = ndmin - tensor.ndim + if ndim_extra > 0: + tensor = tensor.view((1,) * ndim_extra + tensor.shape) + + # copy if requested + if copy: + tensor = tensor.clone() + + return tensor + + +def ndarrays_to_tensors(*inputs): + """Convert all ndarrays from `inputs` to tensors. (other things are intact)""" + from ._ndarray import ndarray + + if len(inputs) == 0: + return ValueError() + elif len(inputs) == 1: + input_ = inputs[0] + if isinstance(input_, ndarray): + return input_.tensor + elif isinstance(input_, tuple): + result = [] + for sub_input in input_: + sub_result = ndarrays_to_tensors(sub_input) + result.append(sub_result) + return tuple(result) + else: + return input_ + else: + assert isinstance(inputs, tuple) # sanity check + return ndarrays_to_tensors(inputs) diff --git a/MLPY/Lib/site-packages/torch/_numpy/fft.py b/MLPY/Lib/site-packages/torch/_numpy/fft.py new file mode 100644 index 0000000000000000000000000000000000000000..ac26d8bc787c90023cd6b0e7a4b9abcb336dee92 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_numpy/fft.py @@ -0,0 +1,130 @@ +# mypy: ignore-errors + +from __future__ import annotations + +import functools + +import torch + +from . 
import _dtypes_impl, _util +from ._normalizations import ArrayLike, normalizer + + +def upcast(func): + """NumPy fft casts inputs to 64 bit and *returns 64-bit results*.""" + + @functools.wraps(func) + def wrapped(tensor, *args, **kwds): + target_dtype = ( + _dtypes_impl.default_dtypes().complex_dtype + if tensor.is_complex() + else _dtypes_impl.default_dtypes().float_dtype + ) + tensor = _util.cast_if_needed(tensor, target_dtype) + return func(tensor, *args, **kwds) + + return wrapped + + +@normalizer +@upcast +def fft(a: ArrayLike, n=None, axis=-1, norm=None): + return torch.fft.fft(a, n, dim=axis, norm=norm) + + +@normalizer +@upcast +def ifft(a: ArrayLike, n=None, axis=-1, norm=None): + return torch.fft.ifft(a, n, dim=axis, norm=norm) + + +@normalizer +@upcast +def rfft(a: ArrayLike, n=None, axis=-1, norm=None): + return torch.fft.rfft(a, n, dim=axis, norm=norm) + + +@normalizer +@upcast +def irfft(a: ArrayLike, n=None, axis=-1, norm=None): + return torch.fft.irfft(a, n, dim=axis, norm=norm) + + +@normalizer +@upcast +def fftn(a: ArrayLike, s=None, axes=None, norm=None): + return torch.fft.fftn(a, s, dim=axes, norm=norm) + + +@normalizer +@upcast +def ifftn(a: ArrayLike, s=None, axes=None, norm=None): + return torch.fft.ifftn(a, s, dim=axes, norm=norm) + + +@normalizer +@upcast +def rfftn(a: ArrayLike, s=None, axes=None, norm=None): + return torch.fft.rfftn(a, s, dim=axes, norm=norm) + + +@normalizer +@upcast +def irfftn(a: ArrayLike, s=None, axes=None, norm=None): + return torch.fft.irfftn(a, s, dim=axes, norm=norm) + + +@normalizer +@upcast +def fft2(a: ArrayLike, s=None, axes=(-2, -1), norm=None): + return torch.fft.fft2(a, s, dim=axes, norm=norm) + + +@normalizer +@upcast +def ifft2(a: ArrayLike, s=None, axes=(-2, -1), norm=None): + return torch.fft.ifft2(a, s, dim=axes, norm=norm) + + +@normalizer +@upcast +def rfft2(a: ArrayLike, s=None, axes=(-2, -1), norm=None): + return torch.fft.rfft2(a, s, dim=axes, norm=norm) + + +@normalizer +@upcast +def irfft2(a: ArrayLike, s=None, axes=(-2, -1), norm=None): + return torch.fft.irfft2(a, s, dim=axes, norm=norm) + + +@normalizer +@upcast +def hfft(a: ArrayLike, n=None, axis=-1, norm=None): + return torch.fft.hfft(a, n, dim=axis, norm=norm) + + +@normalizer +@upcast +def ihfft(a: ArrayLike, n=None, axis=-1, norm=None): + return torch.fft.ihfft(a, n, dim=axis, norm=norm) + + +@normalizer +def fftfreq(n, d=1.0): + return torch.fft.fftfreq(n, d) + + +@normalizer +def rfftfreq(n, d=1.0): + return torch.fft.rfftfreq(n, d) + + +@normalizer +def fftshift(x: ArrayLike, axes=None): + return torch.fft.fftshift(x, axes) + + +@normalizer +def ifftshift(x: ArrayLike, axes=None): + return torch.fft.ifftshift(x, axes) diff --git a/MLPY/Lib/site-packages/torch/_numpy/linalg.py b/MLPY/Lib/site-packages/torch/_numpy/linalg.py new file mode 100644 index 0000000000000000000000000000000000000000..a17808c5b454f122d817ede08377f18413686f2c --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_numpy/linalg.py @@ -0,0 +1,239 @@ +# mypy: ignore-errors + +from __future__ import annotations + +import functools +import math +from typing import Sequence + +import torch + +from . 
import _dtypes_impl, _util +from ._normalizations import ArrayLike, KeepDims, normalizer + + +class LinAlgError(Exception): + pass + + +def _atleast_float_1(a): + if not (a.dtype.is_floating_point or a.dtype.is_complex): + a = a.to(_dtypes_impl.default_dtypes().float_dtype) + return a + + +def _atleast_float_2(a, b): + dtyp = _dtypes_impl.result_type_impl(a, b) + if not (dtyp.is_floating_point or dtyp.is_complex): + dtyp = _dtypes_impl.default_dtypes().float_dtype + + a = _util.cast_if_needed(a, dtyp) + b = _util.cast_if_needed(b, dtyp) + return a, b + + +def linalg_errors(func): + @functools.wraps(func) + def wrapped(*args, **kwds): + try: + return func(*args, **kwds) + except torch._C._LinAlgError as e: + raise LinAlgError(*e.args) # noqa: TRY200 + + return wrapped + + +# ### Matrix and vector products ### + + +@normalizer +@linalg_errors +def matrix_power(a: ArrayLike, n): + a = _atleast_float_1(a) + return torch.linalg.matrix_power(a, n) + + +@normalizer +@linalg_errors +def multi_dot(inputs: Sequence[ArrayLike], *, out=None): + return torch.linalg.multi_dot(inputs) + + +# ### Solving equations and inverting matrices ### + + +@normalizer +@linalg_errors +def solve(a: ArrayLike, b: ArrayLike): + a, b = _atleast_float_2(a, b) + return torch.linalg.solve(a, b) + + +@normalizer +@linalg_errors +def lstsq(a: ArrayLike, b: ArrayLike, rcond=None): + a, b = _atleast_float_2(a, b) + # NumPy is using gelsd: https://github.com/numpy/numpy/blob/v1.24.0/numpy/linalg/umath_linalg.cpp#L3991 + # on CUDA, only `gels` is available though, so use it instead + driver = "gels" if a.is_cuda or b.is_cuda else "gelsd" + return torch.linalg.lstsq(a, b, rcond=rcond, driver=driver) + + +@normalizer +@linalg_errors +def inv(a: ArrayLike): + a = _atleast_float_1(a) + result = torch.linalg.inv(a) + return result + + +@normalizer +@linalg_errors +def pinv(a: ArrayLike, rcond=1e-15, hermitian=False): + a = _atleast_float_1(a) + return torch.linalg.pinv(a, rtol=rcond, hermitian=hermitian) + + +@normalizer +@linalg_errors +def tensorsolve(a: ArrayLike, b: ArrayLike, axes=None): + a, b = _atleast_float_2(a, b) + return torch.linalg.tensorsolve(a, b, dims=axes) + + +@normalizer +@linalg_errors +def tensorinv(a: ArrayLike, ind=2): + a = _atleast_float_1(a) + return torch.linalg.tensorinv(a, ind=ind) + + +# ### Norms and other numbers ### + + +@normalizer +@linalg_errors +def det(a: ArrayLike): + a = _atleast_float_1(a) + return torch.linalg.det(a) + + +@normalizer +@linalg_errors +def slogdet(a: ArrayLike): + a = _atleast_float_1(a) + return torch.linalg.slogdet(a) + + +@normalizer +@linalg_errors +def cond(x: ArrayLike, p=None): + x = _atleast_float_1(x) + + # check if empty + # cf: https://github.com/numpy/numpy/blob/v1.24.0/numpy/linalg/linalg.py#L1744 + if x.numel() == 0 and math.prod(x.shape[-2:]) == 0: + raise LinAlgError("cond is not defined on empty arrays") + + result = torch.linalg.cond(x, p=p) + + # Convert nans to infs (numpy does it in a data-dependent way, depending on + # whether the input array has nans or not) + # XXX: NumPy does this: https://github.com/numpy/numpy/blob/v1.24.0/numpy/linalg/linalg.py#L1744 + return torch.where(torch.isnan(result), float("inf"), result) + + +@normalizer +@linalg_errors +def matrix_rank(a: ArrayLike, tol=None, hermitian=False): + a = _atleast_float_1(a) + + if a.ndim < 2: + return int((a != 0).any()) + + if tol is None: + # follow https://github.com/numpy/numpy/blob/v1.24.0/numpy/linalg/linalg.py#L1885 + atol = 0 + rtol = max(a.shape[-2:]) * torch.finfo(a.dtype).eps + 
else: + atol, rtol = tol, 0 + return torch.linalg.matrix_rank(a, atol=atol, rtol=rtol, hermitian=hermitian) + + +@normalizer +@linalg_errors +def norm(x: ArrayLike, ord=None, axis=None, keepdims: KeepDims = False): + x = _atleast_float_1(x) + return torch.linalg.norm(x, ord=ord, dim=axis) + + +# ### Decompositions ### + + +@normalizer +@linalg_errors +def cholesky(a: ArrayLike): + a = _atleast_float_1(a) + return torch.linalg.cholesky(a) + + +@normalizer +@linalg_errors +def qr(a: ArrayLike, mode="reduced"): + a = _atleast_float_1(a) + result = torch.linalg.qr(a, mode=mode) + if mode == "r": + # match NumPy + result = result.R + return result + + +@normalizer +@linalg_errors +def svd(a: ArrayLike, full_matrices=True, compute_uv=True, hermitian=False): + a = _atleast_float_1(a) + if not compute_uv: + return torch.linalg.svdvals(a) + + # NB: ignore the hermitian= argument (no pytorch equivalent) + result = torch.linalg.svd(a, full_matrices=full_matrices) + return result + + +# ### Eigenvalues and eigenvectors ### + + +@normalizer +@linalg_errors +def eig(a: ArrayLike): + a = _atleast_float_1(a) + w, vt = torch.linalg.eig(a) + + if not a.is_complex() and w.is_complex() and (w.imag == 0).all(): + w = w.real + vt = vt.real + return w, vt + + +@normalizer +@linalg_errors +def eigh(a: ArrayLike, UPLO="L"): + a = _atleast_float_1(a) + return torch.linalg.eigh(a, UPLO=UPLO) + + +@normalizer +@linalg_errors +def eigvals(a: ArrayLike): + a = _atleast_float_1(a) + result = torch.linalg.eigvals(a) + if not a.is_complex() and result.is_complex() and (result.imag == 0).all(): + result = result.real + return result + + +@normalizer +@linalg_errors +def eigvalsh(a: ArrayLike, UPLO="L"): + a = _atleast_float_1(a) + return torch.linalg.eigvalsh(a, UPLO=UPLO) diff --git a/MLPY/Lib/site-packages/torch/_numpy/random.py b/MLPY/Lib/site-packages/torch/_numpy/random.py new file mode 100644 index 0000000000000000000000000000000000000000..57155b7bf9f081366dac3cfe706bc5b0c7231a2d --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_numpy/random.py @@ -0,0 +1,191 @@ +# mypy: ignore-errors + +"""Wrapper to mimic (parts of) np.random API surface. + +NumPy has strict guarantees on reproducibility etc; here we don't give any. + +Q: default dtype is float64 in numpy + +""" +from __future__ import annotations + +import functools +from math import sqrt +from typing import Optional + +import torch + +from . 
import _dtypes_impl, _util +from ._normalizations import array_or_scalar, ArrayLike, normalizer + + +__all__ = [ + "seed", + "random_sample", + "sample", + "random", + "rand", + "randn", + "normal", + "choice", + "randint", + "shuffle", + "uniform", +] + + +def use_numpy_random(): + # local import to avoid ref cycles + import torch._dynamo.config as config + + return config.use_numpy_random_stream + + +def deco_stream(func): + @functools.wraps(func) + def inner(*args, **kwds): + if not use_numpy_random(): + return func(*args, **kwds) + else: + import numpy + + from ._ndarray import ndarray + + f = getattr(numpy.random, func.__name__) + + # numpy funcs accept numpy ndarrays, unwrap + args = tuple( + arg.tensor.numpy() if isinstance(arg, ndarray) else arg for arg in args + ) + kwds = { + key: val.tensor.numpy() if isinstance(val, ndarray) else val + for key, val in kwds.items() + } + + value = f(*args, **kwds) + + # `value` can be either numpy.ndarray or python scalar (or None) + if isinstance(value, numpy.ndarray): + value = ndarray(torch.as_tensor(value)) + + return value + + return inner + + +@deco_stream +def seed(seed=None): + if seed is not None: + torch.random.manual_seed(seed) + + +@deco_stream +def random_sample(size=None): + if size is None: + size = () + dtype = _dtypes_impl.default_dtypes().float_dtype + values = torch.empty(size, dtype=dtype).uniform_() + return array_or_scalar(values, return_scalar=size == ()) + + +def rand(*size): + if size == (): + size = None + return random_sample(size) + + +sample = random_sample +random = random_sample + + +@deco_stream +def uniform(low=0.0, high=1.0, size=None): + if size is None: + size = () + dtype = _dtypes_impl.default_dtypes().float_dtype + values = torch.empty(size, dtype=dtype).uniform_(low, high) + return array_or_scalar(values, return_scalar=size == ()) + + +@deco_stream +def randn(*size): + dtype = _dtypes_impl.default_dtypes().float_dtype + values = torch.randn(size, dtype=dtype) + return array_or_scalar(values, return_scalar=size == ()) + + +@deco_stream +def normal(loc=0.0, scale=1.0, size=None): + if size is None: + size = () + dtype = _dtypes_impl.default_dtypes().float_dtype + values = torch.empty(size, dtype=dtype).normal_(loc, scale) + return array_or_scalar(values, return_scalar=size == ()) + + +@deco_stream +def shuffle(x): + # no @normalizer because we do not cast e.g. 
lists to tensors + from ._ndarray import ndarray + + if isinstance(x, torch.Tensor): + tensor = x + elif isinstance(x, ndarray): + tensor = x.tensor + else: + raise NotImplementedError("We do not random.shuffle lists in-place") + + perm = torch.randperm(tensor.shape[0]) + xp = tensor[perm] + tensor.copy_(xp) + + +@deco_stream +def randint(low, high=None, size=None): + if size is None: + size = () + if not isinstance(size, (tuple, list)): + size = (size,) + if high is None: + low, high = 0, low + values = torch.randint(low, high, size=size) + return array_or_scalar(values, int, return_scalar=size == ()) + + +@deco_stream +@normalizer +def choice(a: ArrayLike, size=None, replace=True, p: Optional[ArrayLike] = None): + # https://stackoverflow.com/questions/59461811/random-choice-with-pytorch + if a.numel() == 1: + a = torch.arange(a) + + # TODO: check a.dtype is integer -- cf np.random.choice(3.4) which raises + + # number of draws + if size is None: + num_el = 1 + elif _util.is_sequence(size): + num_el = 1 + for el in size: + num_el *= el + else: + num_el = size + + # prepare the probabilities + if p is None: + p = torch.ones_like(a) / a.shape[0] + + # cf https://github.com/numpy/numpy/blob/main/numpy/random/mtrand.pyx#L973 + atol = sqrt(torch.finfo(p.dtype).eps) + if abs(p.sum() - 1.0) > atol: + raise ValueError("probabilities do not sum to 1.") + + # actually sample + indices = torch.multinomial(p, num_el, replacement=replace) + + if _util.is_sequence(size): + indices = indices.reshape(size) + + samples = a[indices] + + return samples diff --git a/MLPY/Lib/site-packages/torch/_numpy/testing/__init__.py b/MLPY/Lib/site-packages/torch/_numpy/testing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..02051d8eedc61437cf8bc0d2a85fd4f46ce1b692 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_numpy/testing/__init__.py @@ -0,0 +1,19 @@ +# mypy: ignore-errors + +from .utils import ( + _gen_alignment_data, + assert_, + assert_allclose, + assert_almost_equal, + assert_array_almost_equal, + assert_array_equal, + assert_array_less, + assert_equal, + assert_raises_regex, + assert_warns, + HAS_REFCOUNT, + IS_WASM, + suppress_warnings, +) + +# from .testing import assert_allclose # FIXME diff --git a/MLPY/Lib/site-packages/torch/_numpy/testing/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_numpy/testing/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..272d490cf4914e39770e08ddc323ead636216d79 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_numpy/testing/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_numpy/testing/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_numpy/testing/__pycache__/utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f7a137063da7030b007650f70a6eb3f82e199427 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_numpy/testing/__pycache__/utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_numpy/testing/utils.py b/MLPY/Lib/site-packages/torch/_numpy/testing/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1e1b1b683f460865fd9086b4190cb6a96d44298f --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_numpy/testing/utils.py @@ -0,0 +1,2390 @@ +# mypy: ignore-errors + +""" +Utility function to facilitate testing. 
+ +""" +import contextlib +import gc +import operator +import os +import platform +import pprint +import re +import shutil +import sys +import warnings +from functools import wraps +from io import StringIO +from tempfile import mkdtemp, mkstemp +from warnings import WarningMessage + +import torch._numpy as np +from torch._numpy import arange, asarray as asanyarray, empty, float32, intp, ndarray + +__all__ = [ + "assert_equal", + "assert_almost_equal", + "assert_approx_equal", + "assert_array_equal", + "assert_array_less", + "assert_string_equal", + "assert_", + "assert_array_almost_equal", + "build_err_msg", + "decorate_methods", + "print_assert_equal", + "verbose", + "assert_", + "assert_array_almost_equal_nulp", + "assert_raises_regex", + "assert_array_max_ulp", + "assert_warns", + "assert_no_warnings", + "assert_allclose", + "IgnoreException", + "clear_and_catch_warnings", + "temppath", + "tempdir", + "IS_PYPY", + "HAS_REFCOUNT", + "IS_WASM", + "suppress_warnings", + "assert_array_compare", + "assert_no_gc_cycles", + "break_cycles", + "IS_PYSTON", +] + + +verbose = 0 + +IS_WASM = platform.machine() in ["wasm32", "wasm64"] +IS_PYPY = sys.implementation.name == "pypy" +IS_PYSTON = hasattr(sys, "pyston_version_info") +HAS_REFCOUNT = getattr(sys, "getrefcount", None) is not None and not IS_PYSTON + + +def assert_(val, msg=""): + """ + Assert that works in release mode. + Accepts callable msg to allow deferring evaluation until failure. + + The Python built-in ``assert`` does not work when executing code in + optimized mode (the ``-O`` flag) - no byte-code is generated for it. + + For documentation on usage, refer to the Python documentation. + + """ + __tracebackhide__ = True # Hide traceback for py.test + if not val: + try: + smsg = msg() + except TypeError: + smsg = msg + raise AssertionError(smsg) + + +def gisnan(x): + return np.isnan(x) + + +def gisfinite(x): + return np.isfinite(x) + + +def gisinf(x): + return np.isinf(x) + + +def build_err_msg( + arrays, + err_msg, + header="Items are not equal:", + verbose=True, + names=("ACTUAL", "DESIRED"), + precision=8, +): + msg = ["\n" + header] + if err_msg: + if err_msg.find("\n") == -1 and len(err_msg) < 79 - len(header): + msg = [msg[0] + " " + err_msg] + else: + msg.append(err_msg) + if verbose: + for i, a in enumerate(arrays): + if isinstance(a, ndarray): + # precision argument is only needed if the objects are ndarrays + # r_func = partial(array_repr, precision=precision) + r_func = ndarray.__repr__ + else: + r_func = repr + + try: + r = r_func(a) + except Exception as exc: + r = f"[repr failed for <{type(a).__name__}>: {exc}]" + if r.count("\n") > 3: + r = "\n".join(r.splitlines()[:3]) + r += "..." + msg.append(f" {names[i]}: {r}") + return "\n".join(msg) + + +def assert_equal(actual, desired, err_msg="", verbose=True): + """ + Raises an AssertionError if two objects are not equal. + + Given two objects (scalars, lists, tuples, dictionaries or numpy arrays), + check that all elements of these objects are equal. An exception is raised + at the first conflicting values. + + When one of `actual` and `desired` is a scalar and the other is array_like, + the function checks that each element of the array_like object is equal to + the scalar. + + This function handles NaN comparisons as if NaN was a "normal" number. + That is, AssertionError is not raised if both objects have NaNs in the same + positions. This is in contrast to the IEEE standard on NaNs, which says + that NaN compared to anything must return False. 
+ + Parameters + ---------- + actual : array_like + The object to check. + desired : array_like + The expected object. + err_msg : str, optional + The error message to be printed in case of failure. + verbose : bool, optional + If True, the conflicting values are appended to the error message. + + Raises + ------ + AssertionError + If actual and desired are not equal. + + Examples + -------- + >>> np.testing.assert_equal([4,5], [4,6]) + Traceback (most recent call last): + ... + AssertionError: + Items are not equal: + item=1 + ACTUAL: 5 + DESIRED: 6 + + The following comparison does not raise an exception. There are NaNs + in the inputs, but they are in the same positions. + + >>> np.testing.assert_equal(np.array([1.0, 2.0, np.nan]), [1, 2, np.nan]) + + """ + __tracebackhide__ = True # Hide traceback for py.test + + num_nones = sum([actual is None, desired is None]) + if num_nones == 1: + raise AssertionError(f"Not equal: {actual} != {desired}") + elif num_nones == 2: + return True + # else, carry on + + if isinstance(actual, np.DType) or isinstance(desired, np.DType): + result = actual == desired + if not result: + raise AssertionError(f"Not equal: {actual} != {desired}") + else: + return True + + if isinstance(desired, str) and isinstance(actual, str): + assert actual == desired + return + + if isinstance(desired, dict): + if not isinstance(actual, dict): + raise AssertionError(repr(type(actual))) + assert_equal(len(actual), len(desired), err_msg, verbose) + for k in desired.keys(): + if k not in actual: + raise AssertionError(repr(k)) + assert_equal(actual[k], desired[k], f"key={k!r}\n{err_msg}", verbose) + return + if isinstance(desired, (list, tuple)) and isinstance(actual, (list, tuple)): + assert_equal(len(actual), len(desired), err_msg, verbose) + for k in range(len(desired)): + assert_equal(actual[k], desired[k], f"item={k!r}\n{err_msg}", verbose) + return + + from torch._numpy import imag, iscomplexobj, isscalar, ndarray, real, signbit + + if isinstance(actual, ndarray) or isinstance(desired, ndarray): + return assert_array_equal(actual, desired, err_msg, verbose) + msg = build_err_msg([actual, desired], err_msg, verbose=verbose) + + # Handle complex numbers: separate into real/imag to handle + # nan/inf/negative zero correctly + # XXX: catch ValueError for subclasses of ndarray where iscomplex fail + try: + usecomplex = iscomplexobj(actual) or iscomplexobj(desired) + except (ValueError, TypeError): + usecomplex = False + + if usecomplex: + if iscomplexobj(actual): + actualr = real(actual) + actuali = imag(actual) + else: + actualr = actual + actuali = 0 + if iscomplexobj(desired): + desiredr = real(desired) + desiredi = imag(desired) + else: + desiredr = desired + desiredi = 0 + try: + assert_equal(actualr, desiredr) + assert_equal(actuali, desiredi) + except AssertionError: + raise AssertionError(msg) # noqa: TRY200 + + # isscalar test to check cases such as [np.nan] != np.nan + if isscalar(desired) != isscalar(actual): + raise AssertionError(msg) + + # Inf/nan/negative zero handling + try: + isdesnan = gisnan(desired) + isactnan = gisnan(actual) + if isdesnan and isactnan: + return # both nan, so equal + + # handle signed zero specially for floats + array_actual = np.asarray(actual) + array_desired = np.asarray(desired) + + if desired == 0 and actual == 0: + if not signbit(desired) == signbit(actual): + raise AssertionError(msg) + + except (TypeError, ValueError, NotImplementedError): + pass + + try: + # Explicitly use __eq__ for comparison, gh-2552 + if not (desired == 
actual): + raise AssertionError(msg) + + except (DeprecationWarning, FutureWarning) as e: + # this handles the case when the two types are not even comparable + if "elementwise == comparison" in e.args[0]: + raise AssertionError(msg) # noqa: TRY200 + else: + raise + + +def print_assert_equal(test_string, actual, desired): + """ + Test if two objects are equal, and print an error message if test fails. + + The test is performed with ``actual == desired``. + + Parameters + ---------- + test_string : str + The message supplied to AssertionError. + actual : object + The object to test for equality against `desired`. + desired : object + The expected result. + + Examples + -------- + >>> np.testing.print_assert_equal('Test XYZ of func xyz', [0, 1], [0, 1]) # doctest: +SKIP + >>> np.testing.print_assert_equal('Test XYZ of func xyz', [0, 1], [0, 2]) # doctest: +SKIP + Traceback (most recent call last): + ... + AssertionError: Test XYZ of func xyz failed + ACTUAL: + [0, 1] + DESIRED: + [0, 2] + + """ + __tracebackhide__ = True # Hide traceback for py.test + import pprint + + if not (actual == desired): + msg = StringIO() + msg.write(test_string) + msg.write(" failed\nACTUAL: \n") + pprint.pprint(actual, msg) + msg.write("DESIRED: \n") + pprint.pprint(desired, msg) + raise AssertionError(msg.getvalue()) + + +def assert_almost_equal(actual, desired, decimal=7, err_msg="", verbose=True): + """ + Raises an AssertionError if two items are not equal up to desired + precision. + + .. note:: It is recommended to use one of `assert_allclose`, + `assert_array_almost_equal_nulp` or `assert_array_max_ulp` + instead of this function for more consistent floating point + comparisons. + + The test verifies that the elements of `actual` and `desired` satisfy. + + ``abs(desired-actual) < float64(1.5 * 10**(-decimal))`` + + That is a looser test than originally documented, but agrees with what the + actual implementation in `assert_array_almost_equal` did up to rounding + vagaries. An exception is raised at conflicting values. For ndarrays this + delegates to assert_array_almost_equal + + Parameters + ---------- + actual : array_like + The object to check. + desired : array_like + The expected object. + decimal : int, optional + Desired precision, default is 7. + err_msg : str, optional + The error message to be printed in case of failure. + verbose : bool, optional + If True, the conflicting values are appended to the error message. + + Raises + ------ + AssertionError + If actual and desired are not equal up to specified precision. + + See Also + -------- + assert_allclose: Compare two array_like objects for equality with desired + relative and/or absolute precision. + assert_array_almost_equal_nulp, assert_array_max_ulp, assert_equal + + Examples + -------- + >>> from torch._numpy.testing import assert_almost_equal + >>> assert_almost_equal(2.3333333333333, 2.33333334) + >>> assert_almost_equal(2.3333333333333, 2.33333334, decimal=10) + Traceback (most recent call last): + ... + AssertionError: + Arrays are not almost equal to 10 decimals + ACTUAL: 2.3333333333333 + DESIRED: 2.33333334 + + >>> assert_almost_equal(np.array([1.0,2.3333333333333]), + ... np.array([1.0,2.33333334]), decimal=9) + Traceback (most recent call last): + ... 
+ AssertionError: + Arrays are not almost equal to 9 decimals + + Mismatched elements: 1 / 2 (50%) + Max absolute difference: 6.666699636781459e-09 + Max relative difference: 2.8571569790287484e-09 + x: torch.ndarray([1.0000, 2.3333], dtype=float64) + y: torch.ndarray([1.0000, 2.3333], dtype=float64) + + """ + __tracebackhide__ = True # Hide traceback for py.test + from torch._numpy import imag, iscomplexobj, ndarray, real + + # Handle complex numbers: separate into real/imag to handle + # nan/inf/negative zero correctly + # XXX: catch ValueError for subclasses of ndarray where iscomplex fail + try: + usecomplex = iscomplexobj(actual) or iscomplexobj(desired) + except ValueError: + usecomplex = False + + def _build_err_msg(): + header = "Arrays are not almost equal to %d decimals" % decimal + return build_err_msg([actual, desired], err_msg, verbose=verbose, header=header) + + if usecomplex: + if iscomplexobj(actual): + actualr = real(actual) + actuali = imag(actual) + else: + actualr = actual + actuali = 0 + if iscomplexobj(desired): + desiredr = real(desired) + desiredi = imag(desired) + else: + desiredr = desired + desiredi = 0 + try: + assert_almost_equal(actualr, desiredr, decimal=decimal) + assert_almost_equal(actuali, desiredi, decimal=decimal) + except AssertionError: + raise AssertionError(_build_err_msg()) # noqa: TRY200 + + if isinstance(actual, (ndarray, tuple, list)) or isinstance( + desired, (ndarray, tuple, list) + ): + return assert_array_almost_equal(actual, desired, decimal, err_msg) + try: + # If one of desired/actual is not finite, handle it specially here: + # check that both are nan if any is a nan, and test for equality + # otherwise + if not (gisfinite(desired) and gisfinite(actual)): + if gisnan(desired) or gisnan(actual): + if not (gisnan(desired) and gisnan(actual)): + raise AssertionError(_build_err_msg()) + else: + if not desired == actual: + raise AssertionError(_build_err_msg()) + return + except (NotImplementedError, TypeError): + pass + if abs(desired - actual) >= np.float64(1.5 * 10.0 ** (-decimal)): + raise AssertionError(_build_err_msg()) + + +def assert_approx_equal(actual, desired, significant=7, err_msg="", verbose=True): + """ + Raises an AssertionError if two items are not equal up to significant + digits. + + .. note:: It is recommended to use one of `assert_allclose`, + `assert_array_almost_equal_nulp` or `assert_array_max_ulp` + instead of this function for more consistent floating point + comparisons. + + Given two numbers, check that they are approximately equal. + Approximately equal is defined as the number of significant digits + that agree. + + Parameters + ---------- + actual : scalar + The object to check. + desired : scalar + The expected object. + significant : int, optional + Desired precision, default is 7. + err_msg : str, optional + The error message to be printed in case of failure. + verbose : bool, optional + If True, the conflicting values are appended to the error message. + + Raises + ------ + AssertionError + If actual and desired are not equal up to specified precision. + + See Also + -------- + assert_allclose: Compare two array_like objects for equality with desired + relative and/or absolute precision. + assert_array_almost_equal_nulp, assert_array_max_ulp, assert_equal + + Examples + -------- + >>> np.testing.assert_approx_equal(0.12345677777777e-20, 0.1234567e-20) # doctest: +SKIP + >>> np.testing.assert_approx_equal(0.12345670e-20, 0.12345671e-20, # doctest: +SKIP + ... 
significant=8) + >>> np.testing.assert_approx_equal(0.12345670e-20, 0.12345672e-20, # doctest: +SKIP + ... significant=8) + Traceback (most recent call last): + ... + AssertionError: + Items are not equal to 8 significant digits: + ACTUAL: 1.234567e-21 + DESIRED: 1.2345672e-21 + + the evaluated condition that raises the exception is + + >>> abs(0.12345670e-20/1e-21 - 0.12345672e-20/1e-21) >= 10**-(8-1) + True + + """ + __tracebackhide__ = True # Hide traceback for py.test + import numpy as np + + (actual, desired) = map(float, (actual, desired)) + if desired == actual: + return + # Normalized the numbers to be in range (-10.0,10.0) + # scale = float(pow(10,math.floor(math.log10(0.5*(abs(desired)+abs(actual)))))) + scale = 0.5 * (np.abs(desired) + np.abs(actual)) + scale = np.power(10, np.floor(np.log10(scale))) + try: + sc_desired = desired / scale + except ZeroDivisionError: + sc_desired = 0.0 + try: + sc_actual = actual / scale + except ZeroDivisionError: + sc_actual = 0.0 + msg = build_err_msg( + [actual, desired], + err_msg, + header="Items are not equal to %d significant digits:" % significant, + verbose=verbose, + ) + try: + # If one of desired/actual is not finite, handle it specially here: + # check that both are nan if any is a nan, and test for equality + # otherwise + if not (gisfinite(desired) and gisfinite(actual)): + if gisnan(desired) or gisnan(actual): + if not (gisnan(desired) and gisnan(actual)): + raise AssertionError(msg) + else: + if not desired == actual: + raise AssertionError(msg) + return + except (TypeError, NotImplementedError): + pass + if np.abs(sc_desired - sc_actual) >= np.power(10.0, -(significant - 1)): + raise AssertionError(msg) + + +def assert_array_compare( + comparison, + x, + y, + err_msg="", + verbose=True, + header="", + precision=6, + equal_nan=True, + equal_inf=True, + *, + strict=False, +): + __tracebackhide__ = True # Hide traceback for py.test + from torch._numpy import all, array, asarray, bool_, inf, isnan, max + + x = asarray(x) + y = asarray(y) + + def array2string(a): + return str(a) + + # original array for output formatting + ox, oy = x, y + + def func_assert_same_pos(x, y, func=isnan, hasval="nan"): + """Handling nan/inf. + + Combine results of running func on x and y, checking that they are True + at the same locations. + + """ + __tracebackhide__ = True # Hide traceback for py.test + x_id = func(x) + y_id = func(y) + # We include work-arounds here to handle three types of slightly + # pathological ndarray subclasses: + # (1) all() on `masked` array scalars can return masked arrays, so we + # use != True + # (2) __eq__ on some ndarray subclasses returns Python booleans + # instead of element-wise comparisons, so we cast to bool_() and + # use isinstance(..., bool) checks + # (3) subclasses with bare-bones __array_function__ implementations may + # not implement np.all(), so favor using the .all() method + # We are not committed to supporting such subclasses, but it's nice to + # support them if possible. + if (x_id == y_id).all().item() is not True: + msg = build_err_msg( + [x, y], + err_msg + "\nx and y %s location mismatch:" % (hasval), + verbose=verbose, + header=header, + names=("x", "y"), + precision=precision, + ) + raise AssertionError(msg) + # If there is a scalar, then here we know the array has the same + # flag as it everywhere, so we should return the scalar flag. 
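+        # The caller uses the returned value both as a boolean mask
+        # (``x[~flagged]`` when it has ndim > 0) and as a plain truth value
+        # (``elif flagged:``), so 0-d results are collapsed to a scalar bool_.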
+ if isinstance(x_id, bool) or x_id.ndim == 0: + return bool_(x_id) + elif isinstance(y_id, bool) or y_id.ndim == 0: + return bool_(y_id) + else: + return y_id + + try: + if strict: + cond = x.shape == y.shape and x.dtype == y.dtype + else: + cond = (x.shape == () or y.shape == ()) or x.shape == y.shape + if not cond: + if x.shape != y.shape: + reason = f"\n(shapes {x.shape}, {y.shape} mismatch)" + else: + reason = f"\n(dtypes {x.dtype}, {y.dtype} mismatch)" + msg = build_err_msg( + [x, y], + err_msg + reason, + verbose=verbose, + header=header, + names=("x", "y"), + precision=precision, + ) + raise AssertionError(msg) + + flagged = bool_(False) + + if equal_nan: + flagged = func_assert_same_pos(x, y, func=isnan, hasval="nan") + + if equal_inf: + flagged |= func_assert_same_pos( + x, y, func=lambda xy: xy == +inf, hasval="+inf" + ) + flagged |= func_assert_same_pos( + x, y, func=lambda xy: xy == -inf, hasval="-inf" + ) + + if flagged.ndim > 0: + x, y = x[~flagged], y[~flagged] + # Only do the comparison if actual values are left + if x.size == 0: + return + elif flagged: + # no sense doing comparison if everything is flagged. + return + + val = comparison(x, y) + + if isinstance(val, bool): + cond = val + reduced = array([val]) + else: + reduced = val.ravel() + cond = reduced.all() + + # The below comparison is a hack to ensure that fully masked + # results, for which val.ravel().all() returns np.ma.masked, + # do not trigger a failure (np.ma.masked != True evaluates as + # np.ma.masked, which is falsy). + if not cond: + n_mismatch = reduced.size - int(reduced.sum(dtype=intp)) + n_elements = flagged.size if flagged.ndim != 0 else reduced.size + percent_mismatch = 100 * n_mismatch / n_elements + remarks = [ + f"Mismatched elements: {n_mismatch} / {n_elements} ({percent_mismatch:.3g}%)" + ] + + # with errstate(all='ignore'): + # ignore errors for non-numeric types + with contextlib.suppress(TypeError, RuntimeError): + error = abs(x - y) + if np.issubdtype(x.dtype, np.unsignedinteger): + error2 = abs(y - x) + np.minimum(error, error2, out=error) + max_abs_error = max(error) + remarks.append( + "Max absolute difference: " + array2string(max_abs_error.item()) + ) + + # note: this definition of relative error matches that one + # used by assert_allclose (found in np.isclose) + # Filter values where the divisor would be zero + nonzero = bool_(y != 0) + if all(~nonzero): + max_rel_error = array(inf) + else: + max_rel_error = max(error[nonzero] / abs(y[nonzero])) + remarks.append( + "Max relative difference: " + array2string(max_rel_error.item()) + ) + + err_msg += "\n" + "\n".join(remarks) + msg = build_err_msg( + [ox, oy], + err_msg, + verbose=verbose, + header=header, + names=("x", "y"), + precision=precision, + ) + raise AssertionError(msg) + except ValueError: + import traceback + + efmt = traceback.format_exc() + header = f"error during assertion:\n\n{efmt}\n\n{header}" + + msg = build_err_msg( + [x, y], + err_msg, + verbose=verbose, + header=header, + names=("x", "y"), + precision=precision, + ) + raise ValueError(msg) # noqa: TRY200 + + +def assert_array_equal(x, y, err_msg="", verbose=True, *, strict=False): + """ + Raises an AssertionError if two array_like objects are not equal. + + Given two array_like objects, check that the shape is equal and all + elements of these objects are equal (but see the Notes for the special + handling of a scalar). An exception is raised at shape mismatch or + conflicting values. 
In contrast to the standard usage in numpy, NaNs + are compared like numbers, no assertion is raised if both objects have + NaNs in the same positions. + + The usual caution for verifying equality with floating point numbers is + advised. + + Parameters + ---------- + x : array_like + The actual object to check. + y : array_like + The desired, expected object. + err_msg : str, optional + The error message to be printed in case of failure. + verbose : bool, optional + If True, the conflicting values are appended to the error message. + strict : bool, optional + If True, raise an AssertionError when either the shape or the data + type of the array_like objects does not match. The special + handling for scalars mentioned in the Notes section is disabled. + + Raises + ------ + AssertionError + If actual and desired objects are not equal. + + See Also + -------- + assert_allclose: Compare two array_like objects for equality with desired + relative and/or absolute precision. + assert_array_almost_equal_nulp, assert_array_max_ulp, assert_equal + + Notes + ----- + When one of `x` and `y` is a scalar and the other is array_like, the + function checks that each element of the array_like object is equal to + the scalar. This behaviour can be disabled with the `strict` parameter. + + Examples + -------- + The first assert does not raise an exception: + + >>> np.testing.assert_array_equal([1.0,2.33333,np.nan], + ... [np.exp(0),2.33333, np.nan]) + + Use `assert_allclose` or one of the nulp (number of floating point values) + functions for these cases instead: + + >>> np.testing.assert_allclose([1.0,np.pi,np.nan], + ... [1, np.sqrt(np.pi)**2, np.nan], + ... rtol=1e-10, atol=0) + + As mentioned in the Notes section, `assert_array_equal` has special + handling for scalars. Here the test checks that each value in `x` is 3: + + >>> x = np.full((2, 5), fill_value=3) + >>> np.testing.assert_array_equal(x, 3) + + Use `strict` to raise an AssertionError when comparing a scalar with an + array: + + >>> np.testing.assert_array_equal(x, 3, strict=True) + Traceback (most recent call last): + ... + AssertionError: + Arrays are not equal + + (shapes (2, 5), () mismatch) + x: torch.ndarray([[3, 3, 3, 3, 3], + [3, 3, 3, 3, 3]]) + y: torch.ndarray(3) + + The `strict` parameter also ensures that the array data types match: + + >>> x = np.array([2, 2, 2]) + >>> y = np.array([2., 2., 2.], dtype=np.float32) + >>> np.testing.assert_array_equal(x, y, strict=True) + Traceback (most recent call last): + ... + AssertionError: + Arrays are not equal + + (dtypes dtype("int64"), dtype("float32") mismatch) + x: torch.ndarray([2, 2, 2]) + y: torch.ndarray([2., 2., 2.]) + """ + __tracebackhide__ = True # Hide traceback for py.test + assert_array_compare( + operator.__eq__, + x, + y, + err_msg=err_msg, + verbose=verbose, + header="Arrays are not equal", + strict=strict, + ) + + +def assert_array_almost_equal(x, y, decimal=6, err_msg="", verbose=True): + """ + Raises an AssertionError if two objects are not equal up to desired + precision. + + .. note:: It is recommended to use one of `assert_allclose`, + `assert_array_almost_equal_nulp` or `assert_array_max_ulp` + instead of this function for more consistent floating point + comparisons. + + The test verifies identical shapes and that the elements of ``actual`` and + ``desired`` satisfy. + + ``abs(desired-actual) < 1.5 * 10**(-decimal)`` + + That is a looser test than originally documented, but agrees with what the + actual implementation did up to rounding vagaries. 
An exception is raised + at shape mismatch or conflicting values. In contrast to the standard usage + in numpy, NaNs are compared like numbers, no assertion is raised if both + objects have NaNs in the same positions. + + Parameters + ---------- + x : array_like + The actual object to check. + y : array_like + The desired, expected object. + decimal : int, optional + Desired precision, default is 6. + err_msg : str, optional + The error message to be printed in case of failure. + verbose : bool, optional + If True, the conflicting values are appended to the error message. + + Raises + ------ + AssertionError + If actual and desired are not equal up to specified precision. + + See Also + -------- + assert_allclose: Compare two array_like objects for equality with desired + relative and/or absolute precision. + assert_array_almost_equal_nulp, assert_array_max_ulp, assert_equal + + Examples + -------- + the first assert does not raise an exception + + >>> np.testing.assert_array_almost_equal([1.0,2.333,np.nan], + ... [1.0,2.333,np.nan]) + + >>> np.testing.assert_array_almost_equal([1.0,2.33333,np.nan], + ... [1.0,2.33339,np.nan], decimal=5) + Traceback (most recent call last): + ... + AssertionError: + Arrays are not almost equal to 5 decimals + + Mismatched elements: 1 / 3 (33.3%) + Max absolute difference: 5.999999999994898e-05 + Max relative difference: 2.5713661239633743e-05 + x: torch.ndarray([1.0000, 2.3333, nan], dtype=float64) + y: torch.ndarray([1.0000, 2.3334, nan], dtype=float64) + + >>> np.testing.assert_array_almost_equal([1.0,2.33333,np.nan], + ... [1.0,2.33333, 5], decimal=5) + Traceback (most recent call last): + ... + AssertionError: + Arrays are not almost equal to 5 decimals + + x and y nan location mismatch: + x: torch.ndarray([1.0000, 2.3333, nan], dtype=float64) + y: torch.ndarray([1.0000, 2.3333, 5.0000], dtype=float64) + + """ + __tracebackhide__ = True # Hide traceback for py.test + from torch._numpy import any as npany, float_, issubdtype, number, result_type + + def compare(x, y): + try: + if npany(gisinf(x)) or npany(gisinf(y)): + xinfid = gisinf(x) + yinfid = gisinf(y) + if not (xinfid == yinfid).all(): + return False + # if one item, x and y is +- inf + if x.size == y.size == 1: + return x == y + x = x[~xinfid] + y = y[~yinfid] + except (TypeError, NotImplementedError): + pass + + # make sure y is an inexact type to avoid abs(MIN_INT); will cause + # casting of x later. + dtype = result_type(y, 1.0) + y = asanyarray(y, dtype) + z = abs(x - y) + + if not issubdtype(z.dtype, number): + z = z.astype(float_) # handle object arrays + + return z < 1.5 * 10.0 ** (-decimal) + + assert_array_compare( + compare, + x, + y, + err_msg=err_msg, + verbose=verbose, + header=("Arrays are not almost equal to %d decimals" % decimal), + precision=decimal, + ) + + +def assert_array_less(x, y, err_msg="", verbose=True): + """ + Raises an AssertionError if two array_like objects are not ordered by less + than. + + Given two array_like objects, check that the shape is equal and all + elements of the first object are strictly smaller than those of the + second object. An exception is raised at shape mismatch or incorrectly + ordered values. Shape mismatch does not raise if an object has zero + dimension. In contrast to the standard usage in numpy, NaNs are + compared, no assertion is raised if both objects have NaNs in the same + positions. + + + + Parameters + ---------- + x : array_like + The smaller object to check. + y : array_like + The larger object to compare. 
+ err_msg : string + The error message to be printed in case of failure. + verbose : bool + If True, the conflicting values are appended to the error message. + + Raises + ------ + AssertionError + If actual and desired objects are not equal. + + See Also + -------- + assert_array_equal: tests objects for equality + assert_array_almost_equal: test objects for equality up to precision + + + + Examples + -------- + >>> np.testing.assert_array_less([1.0, 1.0, np.nan], [1.1, 2.0, np.nan]) + >>> np.testing.assert_array_less([1.0, 1.0, np.nan], [1, 2.0, np.nan]) + Traceback (most recent call last): + ... + AssertionError: + Arrays are not less-ordered + + Mismatched elements: 1 / 3 (33.3%) + Max absolute difference: 1.0 + Max relative difference: 0.5 + x: torch.ndarray([1., 1., nan], dtype=float64) + y: torch.ndarray([1., 2., nan], dtype=float64) + + >>> np.testing.assert_array_less([1.0, 4.0], 3) + Traceback (most recent call last): + ... + AssertionError: + Arrays are not less-ordered + + Mismatched elements: 1 / 2 (50%) + Max absolute difference: 2.0 + Max relative difference: 0.6666666666666666 + x: torch.ndarray([1., 4.], dtype=float64) + y: torch.ndarray(3) + + >>> np.testing.assert_array_less([1.0, 2.0, 3.0], [4]) + Traceback (most recent call last): + ... + AssertionError: + Arrays are not less-ordered + + (shapes (3,), (1,) mismatch) + x: torch.ndarray([1., 2., 3.], dtype=float64) + y: torch.ndarray([4]) + + """ + __tracebackhide__ = True # Hide traceback for py.test + assert_array_compare( + operator.__lt__, + x, + y, + err_msg=err_msg, + verbose=verbose, + header="Arrays are not less-ordered", + equal_inf=False, + ) + + +def assert_string_equal(actual, desired): + """ + Test if two strings are equal. + + If the given strings are equal, `assert_string_equal` does nothing. + If they are not equal, an AssertionError is raised, and the diff + between the strings is shown. + + Parameters + ---------- + actual : str + The string to test for equality against the expected string. + desired : str + The expected string. + + Examples + -------- + >>> np.testing.assert_string_equal('abc', 'abc') # doctest: +SKIP + >>> np.testing.assert_string_equal('abc', 'abcd') # doctest: +SKIP + Traceback (most recent call last): + File "", line 1, in + ... + AssertionError: Differences in strings: + - abc+ abcd? + + + """ + # delay import of difflib to reduce startup time + __tracebackhide__ = True # Hide traceback for py.test + import difflib + + if not isinstance(actual, str): + raise AssertionError(repr(type(actual))) + if not isinstance(desired, str): + raise AssertionError(repr(type(desired))) + if desired == actual: + return + + diff = list( + difflib.Differ().compare(actual.splitlines(True), desired.splitlines(True)) + ) + diff_list = [] + while diff: + d1 = diff.pop(0) + if d1.startswith(" "): + continue + if d1.startswith("- "): + l = [d1] + d2 = diff.pop(0) + if d2.startswith("? "): + l.append(d2) + d2 = diff.pop(0) + if not d2.startswith("+ "): + raise AssertionError(repr(d2)) + l.append(d2) + if diff: + d3 = diff.pop(0) + if d3.startswith("? 
"): + l.append(d3) + else: + diff.insert(0, d3) + if d2[2:] == d1[2:]: + continue + diff_list.extend(l) + continue + raise AssertionError(repr(d1)) + if not diff_list: + return + msg = f"Differences in strings:\n{''.join(diff_list).rstrip()}" + if actual != desired: + raise AssertionError(msg) + + +import unittest + + +class _Dummy(unittest.TestCase): + def nop(self): + pass + + +_d = _Dummy("nop") + + +def assert_raises_regex(exception_class, expected_regexp, *args, **kwargs): + """ + assert_raises_regex(exception_class, expected_regexp, callable, *args, + **kwargs) + assert_raises_regex(exception_class, expected_regexp) + + Fail unless an exception of class exception_class and with message that + matches expected_regexp is thrown by callable when invoked with arguments + args and keyword arguments kwargs. + + Alternatively, can be used as a context manager like `assert_raises`. + + Notes + ----- + .. versionadded:: 1.9.0 + + """ + __tracebackhide__ = True # Hide traceback for py.test + return _d.assertRaisesRegex(exception_class, expected_regexp, *args, **kwargs) + + +def decorate_methods(cls, decorator, testmatch=None): + """ + Apply a decorator to all methods in a class matching a regular expression. + + The given decorator is applied to all public methods of `cls` that are + matched by the regular expression `testmatch` + (``testmatch.search(methodname)``). Methods that are private, i.e. start + with an underscore, are ignored. + + Parameters + ---------- + cls : class + Class whose methods to decorate. + decorator : function + Decorator to apply to methods + testmatch : compiled regexp or str, optional + The regular expression. Default value is None, in which case the + nose default (``re.compile(r'(?:^|[\\b_\\.%s-])[Tt]est' % os.sep)``) + is used. + If `testmatch` is a string, it is compiled to a regular expression + first. + + """ + if testmatch is None: + testmatch = re.compile(r"(?:^|[\\b_\\.%s-])[Tt]est" % os.sep) + else: + testmatch = re.compile(testmatch) + cls_attr = cls.__dict__ + + # delayed import to reduce startup time + from inspect import isfunction + + methods = [_m for _m in cls_attr.values() if isfunction(_m)] + for function in methods: + try: + if hasattr(function, "compat_func_name"): + funcname = function.compat_func_name + else: + funcname = function.__name__ + except AttributeError: + # not a function + continue + if testmatch.search(funcname) and not funcname.startswith("_"): + setattr(cls, funcname, decorator(function)) + return + + +def _assert_valid_refcount(op): + """ + Check that ufuncs don't mishandle refcount of object `1`. + Used in a few regression tests. + """ + if not HAS_REFCOUNT: + return True + + import gc + + import numpy as np + + b = np.arange(100 * 100).reshape(100, 100) + c = b + i = 1 + + gc.disable() + try: + rc = sys.getrefcount(i) + for j in range(15): + d = op(b, c) + assert_(sys.getrefcount(i) >= rc) + finally: + gc.enable() + del d # for pyflakes + + +def assert_allclose( + actual, + desired, + rtol=1e-7, + atol=0, + equal_nan=True, + err_msg="", + verbose=True, + check_dtype=False, +): + """ + Raises an AssertionError if two objects are not equal up to desired + tolerance. + + Given two array_like objects, check that their shapes and all elements + are equal (but see the Notes for the special handling of a scalar). An + exception is raised if the shapes mismatch or any values conflict. In + contrast to the standard usage in numpy, NaNs are compared like numbers, + no assertion is raised if both objects have NaNs in the same positions. 
+ + The test is equivalent to ``allclose(actual, desired, rtol, atol)`` (note + that ``allclose`` has different default values). It compares the difference + between `actual` and `desired` to ``atol + rtol * abs(desired)``. + + .. versionadded:: 1.5.0 + + Parameters + ---------- + actual : array_like + Array obtained. + desired : array_like + Array desired. + rtol : float, optional + Relative tolerance. + atol : float, optional + Absolute tolerance. + equal_nan : bool, optional. + If True, NaNs will compare equal. + err_msg : str, optional + The error message to be printed in case of failure. + verbose : bool, optional + If True, the conflicting values are appended to the error message. + + Raises + ------ + AssertionError + If actual and desired are not equal up to specified precision. + + See Also + -------- + assert_array_almost_equal_nulp, assert_array_max_ulp + + Notes + ----- + When one of `actual` and `desired` is a scalar and the other is + array_like, the function checks that each element of the array_like + object is equal to the scalar. + + Examples + -------- + >>> x = [1e-5, 1e-3, 1e-1] + >>> y = np.arccos(np.cos(x)) + >>> np.testing.assert_allclose(x, y, rtol=1e-5, atol=0) + + """ + __tracebackhide__ = True # Hide traceback for py.test + + def compare(x, y): + return np.isclose(x, y, rtol=rtol, atol=atol, equal_nan=equal_nan) + + actual, desired = asanyarray(actual), asanyarray(desired) + header = f"Not equal to tolerance rtol={rtol:g}, atol={atol:g}" + + if check_dtype: + assert actual.dtype == desired.dtype + + assert_array_compare( + compare, + actual, + desired, + err_msg=str(err_msg), + verbose=verbose, + header=header, + equal_nan=equal_nan, + ) + + +def assert_array_almost_equal_nulp(x, y, nulp=1): + """ + Compare two arrays relatively to their spacing. + + This is a relatively robust method to compare two arrays whose amplitude + is variable. + + Parameters + ---------- + x, y : array_like + Input arrays. + nulp : int, optional + The maximum number of unit in the last place for tolerance (see Notes). + Default is 1. + + Returns + ------- + None + + Raises + ------ + AssertionError + If the spacing between `x` and `y` for one or more elements is larger + than `nulp`. + + See Also + -------- + assert_array_max_ulp : Check that all items of arrays differ in at most + N Units in the Last Place. + spacing : Return the distance between x and the nearest adjacent number. + + Notes + ----- + An assertion is raised if the following condition is not met:: + + abs(x - y) <= nulp * spacing(maximum(abs(x), abs(y))) + + Examples + -------- + >>> x = np.array([1., 1e-10, 1e-20]) + >>> eps = np.finfo(x.dtype).eps + >>> np.testing.assert_array_almost_equal_nulp(x, x*eps/2 + x) # doctest: +SKIP + + >>> np.testing.assert_array_almost_equal_nulp(x, x*eps + x) # doctest: +SKIP + Traceback (most recent call last): + ... + AssertionError: X and Y are not equal to 1 ULP (max is 2) + + """ + __tracebackhide__ = True # Hide traceback for py.test + import numpy as np + + ax = np.abs(x) + ay = np.abs(y) + ref = nulp * np.spacing(np.where(ax > ay, ax, ay)) + if not np.all(np.abs(x - y) <= ref): + if np.iscomplexobj(x) or np.iscomplexobj(y): + msg = "X and Y are not equal to %d ULP" % nulp + else: + max_nulp = np.max(nulp_diff(x, y)) + msg = "X and Y are not equal to %d ULP (max is %g)" % (nulp, max_nulp) + raise AssertionError(msg) + + +def assert_array_max_ulp(a, b, maxulp=1, dtype=None): + """ + Check that all items of arrays differ in at most N Units in the Last Place. 
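+
+    Here one ULP (unit in the last place) is the spacing between a floating
+    point value and the next representable value of the same dtype.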
+ + Parameters + ---------- + a, b : array_like + Input arrays to be compared. + maxulp : int, optional + The maximum number of units in the last place that elements of `a` and + `b` can differ. Default is 1. + dtype : dtype, optional + Data-type to convert `a` and `b` to if given. Default is None. + + Returns + ------- + ret : ndarray + Array containing number of representable floating point numbers between + items in `a` and `b`. + + Raises + ------ + AssertionError + If one or more elements differ by more than `maxulp`. + + Notes + ----- + For computing the ULP difference, this API does not differentiate between + various representations of NAN (ULP difference between 0x7fc00000 and 0xffc00000 + is zero). + + See Also + -------- + assert_array_almost_equal_nulp : Compare two arrays relatively to their + spacing. + + Examples + -------- + >>> a = np.linspace(0., 1., 100) + >>> res = np.testing.assert_array_max_ulp(a, np.arcsin(np.sin(a))) # doctest: +SKIP + + """ + __tracebackhide__ = True # Hide traceback for py.test + import numpy as np + + ret = nulp_diff(a, b, dtype) + if not np.all(ret <= maxulp): + raise AssertionError( + f"Arrays are not almost equal up to {maxulp:g} " + f"ULP (max difference is {np.max(ret):g} ULP)" + ) + return ret + + +def nulp_diff(x, y, dtype=None): + """For each item in x and y, return the number of representable floating + points between them. + + Parameters + ---------- + x : array_like + first input array + y : array_like + second input array + dtype : dtype, optional + Data-type to convert `x` and `y` to if given. Default is None. + + Returns + ------- + nulp : array_like + number of representable floating point numbers between each item in x + and y. + + Notes + ----- + For computing the ULP difference, this API does not differentiate between + various representations of NAN (ULP difference between 0x7fc00000 and 0xffc00000 + is zero). 
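+
+    The count is obtained by reinterpreting the binary representation of each
+    float as a signed-magnitude integer (see `integer_repr`) and taking the
+    absolute difference of those integers.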
+ + Examples + -------- + # By definition, epsilon is the smallest number such as 1 + eps != 1, so + # there should be exactly one ULP between 1 and 1 + eps + >>> nulp_diff(1, 1 + np.finfo(x.dtype).eps) # doctest: +SKIP + 1.0 + """ + import numpy as np + + if dtype: + x = np.asarray(x, dtype=dtype) + y = np.asarray(y, dtype=dtype) + else: + x = np.asarray(x) + y = np.asarray(y) + + t = np.common_type(x, y) + if np.iscomplexobj(x) or np.iscomplexobj(y): + raise NotImplementedError("_nulp not implemented for complex array") + + x = np.array([x], dtype=t) + y = np.array([y], dtype=t) + + x[np.isnan(x)] = np.nan + y[np.isnan(y)] = np.nan + + if not x.shape == y.shape: + raise ValueError(f"x and y do not have the same shape: {x.shape} - {y.shape}") + + def _diff(rx, ry, vdt): + diff = np.asarray(rx - ry, dtype=vdt) + return np.abs(diff) + + rx = integer_repr(x) + ry = integer_repr(y) + return _diff(rx, ry, t) + + +def _integer_repr(x, vdt, comp): + # Reinterpret binary representation of the float as sign-magnitude: + # take into account two-complement representation + # See also + # https://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/ + rx = x.view(vdt) + if not (rx.size == 1): + rx[rx < 0] = comp - rx[rx < 0] + else: + if rx < 0: + rx = comp - rx + + return rx + + +def integer_repr(x): + """Return the signed-magnitude interpretation of the binary representation + of x.""" + import numpy as np + + if x.dtype == np.float16: + return _integer_repr(x, np.int16, np.int16(-(2**15))) + elif x.dtype == np.float32: + return _integer_repr(x, np.int32, np.int32(-(2**31))) + elif x.dtype == np.float64: + return _integer_repr(x, np.int64, np.int64(-(2**63))) + else: + raise ValueError(f"Unsupported dtype {x.dtype}") + + +@contextlib.contextmanager +def _assert_warns_context(warning_class, name=None): + __tracebackhide__ = True # Hide traceback for py.test + with suppress_warnings() as sup: + l = sup.record(warning_class) + yield + if not len(l) > 0: + name_str = f" when calling {name}" if name is not None else "" + raise AssertionError("No warning raised" + name_str) + + +def assert_warns(warning_class, *args, **kwargs): + """ + Fail unless the given callable throws the specified warning. + + A warning of class warning_class should be thrown by the callable when + invoked with arguments args and keyword arguments kwargs. + If a different type of warning is thrown, it will not be caught. + + If called with all arguments other than the warning class omitted, may be + used as a context manager: + + with assert_warns(SomeWarning): + do_something() + + The ability to be used as a context manager is new in NumPy v1.11.0. + + .. versionadded:: 1.4.0 + + Parameters + ---------- + warning_class : class + The class defining the warning that `func` is expected to throw. + func : callable, optional + Callable to test + *args : Arguments + Arguments for `func`. + **kwargs : Kwargs + Keyword arguments for `func`. + + Returns + ------- + The value returned by `func`. + + Examples + -------- + >>> import warnings + >>> def deprecated_func(num): + ... warnings.warn("Please upgrade", DeprecationWarning) + ... return num*num + >>> with np.testing.assert_warns(DeprecationWarning): + ... 
assert deprecated_func(4) == 16 + >>> # or passing a func + >>> ret = np.testing.assert_warns(DeprecationWarning, deprecated_func, 4) + >>> assert ret == 16 + """ + if not args: + return _assert_warns_context(warning_class) + + func = args[0] + args = args[1:] + with _assert_warns_context(warning_class, name=func.__name__): + return func(*args, **kwargs) + + +@contextlib.contextmanager +def _assert_no_warnings_context(name=None): + __tracebackhide__ = True # Hide traceback for py.test + with warnings.catch_warnings(record=True) as l: + warnings.simplefilter("always") + yield + if len(l) > 0: + name_str = f" when calling {name}" if name is not None else "" + raise AssertionError(f"Got warnings{name_str}: {l}") + + +def assert_no_warnings(*args, **kwargs): + """ + Fail if the given callable produces any warnings. + + If called with all arguments omitted, may be used as a context manager: + + with assert_no_warnings(): + do_something() + + The ability to be used as a context manager is new in NumPy v1.11.0. + + .. versionadded:: 1.7.0 + + Parameters + ---------- + func : callable + The callable to test. + \\*args : Arguments + Arguments passed to `func`. + \\*\\*kwargs : Kwargs + Keyword arguments passed to `func`. + + Returns + ------- + The value returned by `func`. + + """ + if not args: + return _assert_no_warnings_context() + + func = args[0] + args = args[1:] + with _assert_no_warnings_context(name=func.__name__): + return func(*args, **kwargs) + + +def _gen_alignment_data(dtype=float32, type="binary", max_size=24): + """ + generator producing data with different alignment and offsets + to test simd vectorization + + Parameters + ---------- + dtype : dtype + data type to produce + type : string + 'unary': create data for unary operations, creates one input + and output array + 'binary': create data for unary operations, creates two input + and output array + max_size : integer + maximum size of data to produce + + Returns + ------- + if type is 'unary' yields one output, one input array and a message + containing information on the data + if type is 'binary' yields one output array, two input array and a message + containing information on the data + + """ + ufmt = "unary offset=(%d, %d), size=%d, dtype=%r, %s" + bfmt = "binary offset=(%d, %d, %d), size=%d, dtype=%r, %s" + for o in range(3): + for s in range(o + 2, max(o + 3, max_size)): + if type == "unary": + + def inp(): + return arange(s, dtype=dtype)[o:] + + out = empty((s,), dtype=dtype)[o:] + yield out, inp(), ufmt % (o, o, s, dtype, "out of place") + d = inp() + yield d, d, ufmt % (o, o, s, dtype, "in place") + yield out[1:], inp()[:-1], ufmt % ( + o + 1, + o, + s - 1, + dtype, + "out of place", + ) + yield out[:-1], inp()[1:], ufmt % ( + o, + o + 1, + s - 1, + dtype, + "out of place", + ) + yield inp()[:-1], inp()[1:], ufmt % (o, o + 1, s - 1, dtype, "aliased") + yield inp()[1:], inp()[:-1], ufmt % (o + 1, o, s - 1, dtype, "aliased") + if type == "binary": + + def inp1(): + return arange(s, dtype=dtype)[o:] + + inp2 = inp1 + out = empty((s,), dtype=dtype)[o:] + yield out, inp1(), inp2(), bfmt % (o, o, o, s, dtype, "out of place") + d = inp1() + yield d, d, inp2(), bfmt % (o, o, o, s, dtype, "in place1") + d = inp2() + yield d, inp1(), d, bfmt % (o, o, o, s, dtype, "in place2") + yield out[1:], inp1()[:-1], inp2()[:-1], bfmt % ( + o + 1, + o, + o, + s - 1, + dtype, + "out of place", + ) + yield out[:-1], inp1()[1:], inp2()[:-1], bfmt % ( + o, + o + 1, + o, + s - 1, + dtype, + "out of place", + ) + yield out[:-1], inp1()[:-1], 
inp2()[1:], bfmt % ( + o, + o, + o + 1, + s - 1, + dtype, + "out of place", + ) + yield inp1()[1:], inp1()[:-1], inp2()[:-1], bfmt % ( + o + 1, + o, + o, + s - 1, + dtype, + "aliased", + ) + yield inp1()[:-1], inp1()[1:], inp2()[:-1], bfmt % ( + o, + o + 1, + o, + s - 1, + dtype, + "aliased", + ) + yield inp1()[:-1], inp1()[:-1], inp2()[1:], bfmt % ( + o, + o, + o + 1, + s - 1, + dtype, + "aliased", + ) + + +class IgnoreException(Exception): + "Ignoring this exception due to disabled feature" + + +@contextlib.contextmanager +def tempdir(*args, **kwargs): + """Context manager to provide a temporary test folder. + + All arguments are passed as this to the underlying tempfile.mkdtemp + function. + + """ + tmpdir = mkdtemp(*args, **kwargs) + try: + yield tmpdir + finally: + shutil.rmtree(tmpdir) + + +@contextlib.contextmanager +def temppath(*args, **kwargs): + """Context manager for temporary files. + + Context manager that returns the path to a closed temporary file. Its + parameters are the same as for tempfile.mkstemp and are passed directly + to that function. The underlying file is removed when the context is + exited, so it should be closed at that time. + + Windows does not allow a temporary file to be opened if it is already + open, so the underlying file must be closed after opening before it + can be opened again. + + """ + fd, path = mkstemp(*args, **kwargs) + os.close(fd) + try: + yield path + finally: + os.remove(path) + + +class clear_and_catch_warnings(warnings.catch_warnings): + """Context manager that resets warning registry for catching warnings + + Warnings can be slippery, because, whenever a warning is triggered, Python + adds a ``__warningregistry__`` member to the *calling* module. This makes + it impossible to retrigger the warning in this module, whatever you put in + the warnings filters. This context manager accepts a sequence of `modules` + as a keyword argument to its constructor and: + + * stores and removes any ``__warningregistry__`` entries in given `modules` + on entry; + * resets ``__warningregistry__`` to its previous state on exit. + + This makes it possible to trigger any warning afresh inside the context + manager without disturbing the state of warnings outside. + + For compatibility with Python 3.0, please consider all arguments to be + keyword-only. + + Parameters + ---------- + record : bool, optional + Specifies whether warnings should be captured by a custom + implementation of ``warnings.showwarning()`` and be appended to a list + returned by the context manager. Otherwise None is returned by the + context manager. The objects appended to the list are arguments whose + attributes mirror the arguments to ``showwarning()``. + modules : sequence, optional + Sequence of modules for which to reset warnings registry on entry and + restore on exit. To work correctly, all 'ignore' filters should + filter by one of these modules. + + Examples + -------- + >>> import warnings + >>> with np.testing.clear_and_catch_warnings( # doctest: +SKIP + ... modules=[np.core.fromnumeric]): + ... warnings.simplefilter('always') + ... warnings.filterwarnings('ignore', module='np.core.fromnumeric') + ... # do something that raises a warning but ignore those in + ... 
# np.core.fromnumeric + """ + + class_modules = () + + def __init__(self, record=False, modules=()): + self.modules = set(modules).union(self.class_modules) + self._warnreg_copies = {} + super().__init__(record=record) + + def __enter__(self): + for mod in self.modules: + if hasattr(mod, "__warningregistry__"): + mod_reg = mod.__warningregistry__ + self._warnreg_copies[mod] = mod_reg.copy() + mod_reg.clear() + return super().__enter__() + + def __exit__(self, *exc_info): + super().__exit__(*exc_info) + for mod in self.modules: + if hasattr(mod, "__warningregistry__"): + mod.__warningregistry__.clear() + if mod in self._warnreg_copies: + mod.__warningregistry__.update(self._warnreg_copies[mod]) + + +class suppress_warnings: + """ + Context manager and decorator doing much the same as + ``warnings.catch_warnings``. + + However, it also provides a filter mechanism to work around + https://bugs.python.org/issue4180. + + This bug causes Python before 3.4 to not reliably show warnings again + after they have been ignored once (even within catch_warnings). It + means that no "ignore" filter can be used easily, since following + tests might need to see the warning. Additionally it allows easier + specificity for testing warnings and can be nested. + + Parameters + ---------- + forwarding_rule : str, optional + One of "always", "once", "module", or "location". Analogous to + the usual warnings module filter mode, it is useful to reduce + noise mostly on the outmost level. Unsuppressed and unrecorded + warnings will be forwarded based on this rule. Defaults to "always". + "location" is equivalent to the warnings "default", match by exact + location the warning warning originated from. + + Notes + ----- + Filters added inside the context manager will be discarded again + when leaving it. Upon entering all filters defined outside a + context will be applied automatically. + + When a recording filter is added, matching warnings are stored in the + ``log`` attribute as well as in the list returned by ``record``. + + If filters are added and the ``module`` keyword is given, the + warning registry of this module will additionally be cleared when + applying it, entering the context, or exiting it. This could cause + warnings to appear a second time after leaving the context if they + were configured to be printed once (default) and were already + printed before the context was entered. + + Nesting this context manager will work as expected when the + forwarding rule is "always" (default). Unfiltered and unrecorded + warnings will be passed out and be matched by the outer level. + On the outmost level they will be printed (or caught by another + warnings context). The forwarding rule argument can modify this + behaviour. + + Like ``catch_warnings`` this context manager is not threadsafe. + + Examples + -------- + + With a context manager:: + + with np.testing.suppress_warnings() as sup: + sup.filter(DeprecationWarning, "Some text") + sup.filter(module=np.ma.core) + log = sup.record(FutureWarning, "Does this occur?") + command_giving_warnings() + # The FutureWarning was given once, the filtered warnings were + # ignored. 
All other warnings abide outside settings (may be + # printed/error) + assert_(len(log) == 1) + assert_(len(sup.log) == 1) # also stored in log attribute + + Or as a decorator:: + + sup = np.testing.suppress_warnings() + sup.filter(module=np.ma.core) # module must match exactly + @sup + def some_function(): + # do something which causes a warning in np.ma.core + pass + """ + + def __init__(self, forwarding_rule="always"): + self._entered = False + + # Suppressions are either instance or defined inside one with block: + self._suppressions = [] + + if forwarding_rule not in {"always", "module", "once", "location"}: + raise ValueError("unsupported forwarding rule.") + self._forwarding_rule = forwarding_rule + + def _clear_registries(self): + if hasattr(warnings, "_filters_mutated"): + # clearing the registry should not be necessary on new pythons, + # instead the filters should be mutated. + warnings._filters_mutated() + return + # Simply clear the registry, this should normally be harmless, + # note that on new pythons it would be invalidated anyway. + for module in self._tmp_modules: + if hasattr(module, "__warningregistry__"): + module.__warningregistry__.clear() + + def _filter(self, category=Warning, message="", module=None, record=False): + if record: + record = [] # The log where to store warnings + else: + record = None + if self._entered: + if module is None: + warnings.filterwarnings("always", category=category, message=message) + else: + module_regex = module.__name__.replace(".", r"\.") + "$" + warnings.filterwarnings( + "always", category=category, message=message, module=module_regex + ) + self._tmp_modules.add(module) + self._clear_registries() + + self._tmp_suppressions.append( + (category, message, re.compile(message, re.I), module, record) + ) + else: + self._suppressions.append( + (category, message, re.compile(message, re.I), module, record) + ) + + return record + + def filter(self, category=Warning, message="", module=None): + """ + Add a new suppressing filter or apply it if the state is entered. + + Parameters + ---------- + category : class, optional + Warning class to filter + message : string, optional + Regular expression matching the warning message. + module : module, optional + Module to filter for. Note that the module (and its file) + must match exactly and cannot be a submodule. This may make + it unreliable for external modules. + + Notes + ----- + When added within a context, filters are only added inside + the context and will be forgotten when the context is exited. + """ + self._filter(category=category, message=message, module=module, record=False) + + def record(self, category=Warning, message="", module=None): + """ + Append a new recording filter or apply it if the state is entered. + + All warnings matching will be appended to the ``log`` attribute. + + Parameters + ---------- + category : class, optional + Warning class to filter + message : string, optional + Regular expression matching the warning message. + module : module, optional + Module to filter for. Note that the module (and its file) + must match exactly and cannot be a submodule. This may make + it unreliable for external modules. + + Returns + ------- + log : list + A list which will be filled with all matched warnings. + + Notes + ----- + When added within a context, filters are only added inside + the context and will be forgotten when the context is exited. 
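+
+        A minimal usage sketch (the warning-producing code is only indicated
+        by ``...`` and is assumed to emit one matching warning)::
+
+            with suppress_warnings() as sup:
+                log = sup.record(UserWarning, "some text")
+                ...  # code expected to emit the matching UserWarning once
+            assert len(log) == 1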
+ """ + return self._filter( + category=category, message=message, module=module, record=True + ) + + def __enter__(self): + if self._entered: + raise RuntimeError("cannot enter suppress_warnings twice.") + + self._orig_show = warnings.showwarning + self._filters = warnings.filters + warnings.filters = self._filters[:] + + self._entered = True + self._tmp_suppressions = [] + self._tmp_modules = set() + self._forwarded = set() + + self.log = [] # reset global log (no need to keep same list) + + for cat, mess, _, mod, log in self._suppressions: + if log is not None: + del log[:] # clear the log + if mod is None: + warnings.filterwarnings("always", category=cat, message=mess) + else: + module_regex = mod.__name__.replace(".", r"\.") + "$" + warnings.filterwarnings( + "always", category=cat, message=mess, module=module_regex + ) + self._tmp_modules.add(mod) + warnings.showwarning = self._showwarning + self._clear_registries() + + return self + + def __exit__(self, *exc_info): + warnings.showwarning = self._orig_show + warnings.filters = self._filters + self._clear_registries() + self._entered = False + del self._orig_show + del self._filters + + def _showwarning( + self, message, category, filename, lineno, *args, use_warnmsg=None, **kwargs + ): + for cat, _, pattern, mod, rec in (self._suppressions + self._tmp_suppressions)[ + ::-1 + ]: + if issubclass(category, cat) and pattern.match(message.args[0]) is not None: + if mod is None: + # Message and category match, either recorded or ignored + if rec is not None: + msg = WarningMessage( + message, category, filename, lineno, **kwargs + ) + self.log.append(msg) + rec.append(msg) + return + # Use startswith, because warnings strips the c or o from + # .pyc/.pyo files. + elif mod.__file__.startswith(filename): + # The message and module (filename) match + if rec is not None: + msg = WarningMessage( + message, category, filename, lineno, **kwargs + ) + self.log.append(msg) + rec.append(msg) + return + + # There is no filter in place, so pass to the outside handler + # unless we should only pass it once + if self._forwarding_rule == "always": + if use_warnmsg is None: + self._orig_show(message, category, filename, lineno, *args, **kwargs) + else: + self._orig_showmsg(use_warnmsg) + return + + if self._forwarding_rule == "once": + signature = (message.args, category) + elif self._forwarding_rule == "module": + signature = (message.args, category, filename) + elif self._forwarding_rule == "location": + signature = (message.args, category, filename, lineno) + + if signature in self._forwarded: + return + self._forwarded.add(signature) + if use_warnmsg is None: + self._orig_show(message, category, filename, lineno, *args, **kwargs) + else: + self._orig_showmsg(use_warnmsg) + + def __call__(self, func): + """ + Function decorator to apply certain suppressions to a whole + function. + """ + + @wraps(func) + def new_func(*args, **kwargs): + with self: + return func(*args, **kwargs) + + return new_func + + +@contextlib.contextmanager +def _assert_no_gc_cycles_context(name=None): + __tracebackhide__ = True # Hide traceback for py.test + + # not meaningful to test if there is no refcounting + if not HAS_REFCOUNT: + yield + return + + assert_(gc.isenabled()) + gc.disable() + gc_debug = gc.get_debug() + try: + for i in range(100): + if gc.collect() == 0: + break + else: + raise RuntimeError( + "Unable to fully collect garbage - perhaps a __del__ method " + "is creating more reference cycles?" 
+ ) + + gc.set_debug(gc.DEBUG_SAVEALL) + yield + # gc.collect returns the number of unreachable objects in cycles that + # were found -- we are checking that no cycles were created in the context + n_objects_in_cycles = gc.collect() + objects_in_cycles = gc.garbage[:] + finally: + del gc.garbage[:] + gc.set_debug(gc_debug) + gc.enable() + + if n_objects_in_cycles: + name_str = f" when calling {name}" if name is not None else "" + raise AssertionError( + "Reference cycles were found{}: {} objects were collected, " + "of which {} are shown below:{}".format( + name_str, + n_objects_in_cycles, + len(objects_in_cycles), + "".join( + "\n {} object with id={}:\n {}".format( + type(o).__name__, + id(o), + pprint.pformat(o).replace("\n", "\n "), + ) + for o in objects_in_cycles + ), + ) + ) + + +def assert_no_gc_cycles(*args, **kwargs): + """ + Fail if the given callable produces any reference cycles. + + If called with all arguments omitted, may be used as a context manager: + + with assert_no_gc_cycles(): + do_something() + + .. versionadded:: 1.15.0 + + Parameters + ---------- + func : callable + The callable to test. + \\*args : Arguments + Arguments passed to `func`. + \\*\\*kwargs : Kwargs + Keyword arguments passed to `func`. + + Returns + ------- + Nothing. The result is deliberately discarded to ensure that all cycles + are found. + + """ + if not args: + return _assert_no_gc_cycles_context() + + func = args[0] + args = args[1:] + with _assert_no_gc_cycles_context(name=func.__name__): + func(*args, **kwargs) + + +def break_cycles(): + """ + Break reference cycles by calling gc.collect + Objects can call other objects' methods (for instance, another object's + __del__) inside their own __del__. On PyPy, the interpreter only runs + between calls to gc.collect, so multiple calls are needed to completely + release all cycles. + """ + + gc.collect() + if IS_PYPY: + # a few more, just to make sure all the finalizers are called + gc.collect() + gc.collect() + gc.collect() + gc.collect() + + +def requires_memory(free_bytes): + """Decorator to skip a test if not enough memory is available""" + import pytest + + def decorator(func): + @wraps(func) + def wrapper(*a, **kw): + msg = check_free_memory(free_bytes) + if msg is not None: + pytest.skip(msg) + + try: + return func(*a, **kw) + except MemoryError: + # Probably ran out of memory regardless: don't regard as failure + pytest.xfail("MemoryError raised") + + return wrapper + + return decorator + + +def check_free_memory(free_bytes): + """ + Check whether `free_bytes` amount of memory is currently free. + Returns: None if enough memory available, otherwise error message + """ + env_var = "NPY_AVAILABLE_MEM" + env_value = os.environ.get(env_var) + if env_value is not None: + try: + mem_free = _parse_size(env_value) + except ValueError as exc: + raise ValueError( # noqa: TRY200 + f"Invalid environment variable {env_var}: {exc}" + ) + + msg = ( + f"{free_bytes/1e9} GB memory required, but environment variable " + f"NPY_AVAILABLE_MEM={env_value} set" + ) + else: + mem_free = _get_mem_available() + + if mem_free is None: + msg = ( + "Could not determine available memory; set NPY_AVAILABLE_MEM " + "environment variable (e.g. NPY_AVAILABLE_MEM=16GB) to run " + "the test." + ) + mem_free = -1 + else: + msg = ( + f"{free_bytes/1e9} GB memory required, but {mem_free/1e9} GB available" + ) + + return msg if mem_free < free_bytes else None + + +def _parse_size(size_str): + """Convert memory size strings ('12 GB' etc.) 
to float""" + suffixes = { + "": 1, + "b": 1, + "k": 1000, + "m": 1000**2, + "g": 1000**3, + "t": 1000**4, + "kb": 1000, + "mb": 1000**2, + "gb": 1000**3, + "tb": 1000**4, + "kib": 1024, + "mib": 1024**2, + "gib": 1024**3, + "tib": 1024**4, + } + + size_re = re.compile( + r"^\s*(\d+|\d+\.\d+)\s*({})\s*$".format("|".join(suffixes.keys())), re.I + ) + + m = size_re.match(size_str.lower()) + if not m or m.group(2) not in suffixes: + raise ValueError(f"value {size_str!r} not a valid size") + return int(float(m.group(1)) * suffixes[m.group(2)]) + + +def _get_mem_available(): + """Return available memory in bytes, or None if unknown.""" + try: + import psutil + + return psutil.virtual_memory().available + except (ImportError, AttributeError): + pass + + if sys.platform.startswith("linux"): + info = {} + with open("/proc/meminfo") as f: + for line in f: + p = line.split() + info[p[0].strip(":").lower()] = int(p[1]) * 1024 + + if "memavailable" in info: + # Linux >= 3.14 + return info["memavailable"] + else: + return info["memfree"] + info["cached"] + + return None + + +def _no_tracing(func): + """ + Decorator to temporarily turn off tracing for the duration of a test. + Needed in tests that check refcounting, otherwise the tracing itself + influences the refcounts + """ + if not hasattr(sys, "gettrace"): + return func + else: + + @wraps(func) + def wrapper(*args, **kwargs): + original_trace = sys.gettrace() + try: + sys.settrace(None) + return func(*args, **kwargs) + finally: + sys.settrace(original_trace) + + return wrapper + + +def _get_glibc_version(): + try: + ver = os.confstr("CS_GNU_LIBC_VERSION").rsplit(" ")[1] + except Exception as inst: + ver = "0.0" + + return ver + + +_glibcver = _get_glibc_version() + + +def _glibc_older_than(x): + return _glibcver != "0.0" and _glibcver < x diff --git a/MLPY/Lib/site-packages/torch/_ops.py b/MLPY/Lib/site-packages/torch/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..95774269e3ccb86ba4decffa94e960189f9f2aff --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_ops.py @@ -0,0 +1,1037 @@ +import contextlib +import ctypes +import importlib +import inspect +import sys +import types +from typing import Any, Callable, Dict, Set, Type, Union + +import torch._C +import torch.utils._pytree as pytree +from torch import _utils_internal +from torch._functorch.pyfunctorch import dispatch_functorch +from torch.utils._python_dispatch import TorchDispatchMode + +# Query `hasattr` only once. + +_SET_GLOBAL_FLAGS = hasattr(sys, "getdlopenflags") and hasattr(sys, "setdlopenflags") + + +@contextlib.contextmanager +def dl_open_guard(): + """ + Context manager to set the RTLD_GLOBAL dynamic linker flag while we open a + shared library to load custom operators. + """ + if not _SET_GLOBAL_FLAGS: + yield + return + old_flags = sys.getdlopenflags() + sys.setdlopenflags(old_flags | ctypes.RTLD_GLOBAL) + try: + yield + finally: + sys.setdlopenflags(old_flags) + + +class OperatorBase: + """ + Base class for OpOverload (which represents C++ ATen operators) and HigherOrderOperator + (which represents Python-only operators that are unrepresentable in TorchScript). + """ + + def __init__(self): + # The dispatch cache precomputes a mapping of dispatch key that the + # dispatcher wants to dispatch to, to an actual implementation of the + # dispatch key. Confusingly, the actual implementation could *also* be a + # dispatch key, but in this case, this refers to the C++ kernel that + # was registered to some dispatch key. 
Aliases are permitted in the + # latter but not the former; for example, you might lookup the + # entry for AutogradCPU, and this maps you to the Autograd key for + # the generic autograd kernel that works for all devices. Since this + # is the Python dispatcher, you can also put an arbitrary Python + # callable to call instead. This handler gets precisely the + # args/kwargs that the operator was __call__'ed with. + # NB: This name is hard-coded in torch/csrc/autograd/python_variable.cpp + # for use with OpOverload; cache lookup is done entirely from C++ + # for speed. + # TODO: The cache is NOT currently used by HigherOrderOperator, but it should! + self._dispatch_cache: Dict[ + torch._C.DispatchKey, Union[torch._C.DispatchKey, Callable[..., Any]] + ] = {} + + # This table allows you to override the behavior of a particular + # dispatch key to call a custom Python function, rather than the + # ordinary C++ configured behavior. This is the raison d'etre of + # Python dispatcher: to let you program the dispatcher from Python + # in case you need something unusual, and don't want to clobber + # the existing registrations using the Python operator registration + # API. + self.py_kernels: Dict[torch._C.DispatchKey, Callable[..., Any]] = {} + + # This table allows you to override the behavior of a particular + # operator for a particular TorchDispatchMode. In practice, + # we are using this mostly for ProxyTensorMode. Modes can be + # thought of as an open world extension of dispatch keys, so it + # makes sense that you should be able to register them, the same + # way you can register dispatch keys. + self.python_key_mode_table: Dict[ + Type[TorchDispatchMode], Callable[..., Any] + ] = {} + + # This table allows you to override the behavior of functorch + # transformations. NB: this currently only does something for + # HigherOrderOperator + self.functorch_table = {} + + def __call__(self, *args, **kwargs): + raise NotImplementedError() + + def has_kernel_for_dispatch_key(self, k): + return k in self.py_kernels + + def has_kernel_for_any_dispatch_key(self, ks): + for k in self.py_kernels: + if not torch._C._dispatch_is_alias_key(k) and ks.has(k): + return True + return False + + def py_impl(self, k): + def inner(fn): + if inspect.isclass(k) and issubclass(k, TorchDispatchMode): + assert k not in self.python_key_mode_table + # TODO(voz): Should we replace setting torch._C.DispatchKey.Python entirely with setting mode keys? + self.python_key_mode_table[k] = fn + self._dispatch_cache.clear() + return fn + + if isinstance(k, torch._C._functorch.TransformType): + assert k not in self.functorch_table + self.functorch_table[k] = fn + return fn + + assert isinstance(k, torch._C.DispatchKey) + assert ( + k != torch._C.DispatchKey.Python + ), "Please register a mode for the torch._C.DispatchKey.Python key instead." 
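The decorator returned by py_impl is how Python kernels get attached to an operator. A minimal sketch, assuming an importable torch build and a made-up HigherOrderOperator name (the class is defined further down in this file); the registration simply lands in py_kernels and clears the dispatch cache:

import torch
from torch._ops import HigherOrderOperator

my_hop = HigherOrderOperator("my_hop_example")  # hypothetical operator name

@my_hop.py_impl(torch._C.DispatchKey.CompositeExplicitAutograd)
def my_hop_dense(fn, x):
    # Dense-tensor kernel for the hypothetical op: just call the wrapped function.
    return fn(x)

assert torch._C.DispatchKey.CompositeExplicitAutograd in my_hop.py_kernels

A complete operator would also register kernels (or explicit fallthroughs) for the other keys dispatch can land on, such as Autograd and the tracing modes.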
+ + if k in self.py_kernels: + raise RuntimeError( + f"Trying to override a python impl for {k} on operator {self.name()}" + ) + self.py_kernels[k] = fn + self._dispatch_cache.clear() + return fn + + return inner + + # Registers an implementation to all **3** variants of functionalization that we have: + # - DispatchKey.Functionalize + # - functorch.TransformType.Functionalize + # - FunctionalTensorMode + # Example: + # @py_functionalize_impl + # def functionalize_rule(ctx, inner_f, *args): + # args_unwrapped = ctx.unwrap_tensors(args) + # with ctx.redispatch_to_next(): + # out = ctx.functionalize(inner_f)(*args_unwrapped) + # return ctx.wrap_tensors(out) + def py_functionalize_impl(self, fn): + from torch._subclasses.functional_tensor import ( + CppFunctionalizeAPI as _CppFunctionalizeAPI, + FunctorchFunctionalizeAPI as _FunctorchFunctionalizeAPI, + PythonFunctionalizeAPI as _PythonFunctionalizeAPI, + ) + + # Construct our three flavors of functionalization, + # each of which have slightly different wrap/unwrap/redispatch policies + def functionalize_dk_fn(*args, **kwargs): + return fn(_CppFunctionalizeAPI(), *args, **kwargs) + + def functionalize_dispatch_mode_fn(mode, *args, **kwargs): + return fn(_PythonFunctionalizeAPI(mode), *args, **kwargs) + + def functionalize_functorch_fn(interpreter, *args, **kwargs): + return fn(_FunctorchFunctionalizeAPI(interpreter), *args, **kwargs) + + self.py_impl(torch._C.DispatchKey.Functionalize)(functionalize_dk_fn) + self.py_impl(torch._subclasses.functional_tensor.FunctionalTensorMode)( + functionalize_dispatch_mode_fn + ) + self.py_impl(torch._C._functorch.TransformType.Functionalize)( + functionalize_functorch_fn + ) + + return fn + + def name(self): + raise NotImplementedError() + + +is_included_in_alias = torch._C._dispatch_is_included_in_alias + +DispatchKey = torch._C.DispatchKey + + +# Equivalent to computeDispatchTableEntryWithDebug +def resolve_key(op: OperatorBase, k: DispatchKey): # type: ignore[valid-type] + # 1. (Direct) operator registration + if op.has_kernel_for_dispatch_key(k): + return k + # 2.1 Use CompositeExplicitAutogradNonFunctional kernel if available + cand = DispatchKey.CompositeExplicitAutogradNonFunctional + if ( + k == DispatchKey.Undefined or is_included_in_alias(k, cand) + ) and op.has_kernel_for_dispatch_key(cand): + return cand + # 2.2 Use CompositeExplicitAutograd kernel if available + cand = DispatchKey.CompositeExplicitAutograd + if ( + k == DispatchKey.Undefined or is_included_in_alias(k, cand) + ) and op.has_kernel_for_dispatch_key(cand): + return cand + has_backend_kernel = op.has_kernel_for_any_dispatch_key( + torch._C._dispatch_get_backend_keyset_from_autograd(k) + ) or op.has_kernel_for_dispatch_key(DispatchKey.CompositeExplicitAutograd) + # 2.3. Use CompositeImplicitAutograd kernel if available + cand = DispatchKey.CompositeImplicitAutogradNestedTensor + if ( + (k != DispatchKey.Undefined and is_included_in_alias(k, cand)) + and op.has_kernel_for_dispatch_key(cand) + and not has_backend_kernel + ): + return cand + cand = DispatchKey.CompositeImplicitAutograd + if ( + k == DispatchKey.Undefined or is_included_in_alias(k, cand) + ) and op.has_kernel_for_dispatch_key(cand): + if k == DispatchKey.AutogradOther and op.has_kernel_for_any_dispatch_key( + torch._C._dispatch_autogradother_backends + ): + raise RuntimeError("ambiguous autogradother kernel") + elif not has_backend_kernel: + return cand + # 2.4. 
For autograd backend keys, use kernel from DispatchKey::Autograd if available + cand = DispatchKey.Autograd + if is_included_in_alias(k, cand) and op.has_kernel_for_dispatch_key(cand): + return cand + # 2.5 Use kernel from DispatchKey::FuncTorchBatchedDecomposition if available + cand = DispatchKey.FuncTorchBatchedDecomposition + if is_included_in_alias(k, cand) and op.has_kernel_for_dispatch_key(cand): + return cand + # Backend fallback + if torch._C._dispatch_has_backend_fallback(k): + # The dispatch key itself will implicitly route to backend fallback. + # This is probably not great for the pure Python implementation. + return k + raise NotImplementedError(f"could not find kernel for {op} at dispatch key {k}") + + +_higher_order_ops: Dict[str, "HigherOrderOperator"] = {} + +_HIGHER_ORDER_OP_DEFAULT_FALLTHROUGH_DISPATCH_KEYS = [ + DispatchKey.PythonDispatcher, # type: ignore[attr-defined] + DispatchKey.PythonTLSSnapshot, # type: ignore[attr-defined] + DispatchKey.ADInplaceOrView, + DispatchKey.BackendSelect, + DispatchKey.AutocastCPU, # type: ignore[attr-defined] + DispatchKey.AutocastCUDA, # type: ignore[attr-defined] +] + + +class HigherOrderOperator(OperatorBase): + # The HigherOrderOperator will appear as torch.ops.higher_order.{name} + # + # If you're creating a new HigherOrderOperator, please do not change the + # default. Adding operators to the global torch.ops namespace is a bad + # practice due to name collisions. + def __init__(self, name): + super().__init__() + self._name = name + + # Make _OPNamespace not scream, this whole name based association needs a good hard look + self.__name__ = name + _higher_order_ops[name] = self + self._ns = "higher_order" + + # For a normal HigherOrderOperator instance, we will change its __module__ from torch._ops to + # torch._ops.higher_order. + # For an instance of subclass of HigherOrderOperator (e.g. customized higher order op), + # the __module__ attribute will be kept unchanged. + if self.__class__ is HigherOrderOperator: + self_name_space = "." + self.namespace if self.namespace else "" + self.__module__ = self.__module__ + self_name_space + self.non_fallthrough_keys = torch._C._dispatch_keyset_full() + + for dispatch_key in _HIGHER_ORDER_OP_DEFAULT_FALLTHROUGH_DISPATCH_KEYS: + self.fallthrough(dispatch_key) + + # [NOTE] We have to register pre-dispatch key implementation + # because sometimes HOP use aot-dispatch tracing to detect certaion + # mutations. This is problematic when we are functionalizing HOP + # during pre-dispatch because when the inner tracer starts, it will see + # that PreDispatch key is still active. In that case, we just redispatch + # it to next key. This is only safe to do when PreDispatch key stack has no + # active modes. + # TODO (tmanlaibaatar) Make it generic fallback mechanism + def _(*args, **kwargs): + if _len_torch_dispatch_stack_pre_dispatch() == 0: + with torch._C._ExcludeDispatchKeyGuard( + torch._C.DispatchKeySet(DispatchKey.PreDispatch) + ): + return self(*args, **kwargs) + raise AssertionError( + """ + Can't directly invoke HOP implementation at PreDispatch key + if there are active modes on PreDispatch mode stack. 
+ """ + ) + + self.py_impl(torch._C.DispatchKey.PreDispatch)(_) + + def py_impl(self, k): + if isinstance(k, torch._C.DispatchKey) and not self.non_fallthrough_keys.has(k): + self.non_fallthrough_keys = self.non_fallthrough_keys.add(k) + return super().py_impl(k) + + @property + def namespace(self): + return self._ns + + def fallthrough(self, dispatch_key): + self.non_fallthrough_keys = self.non_fallthrough_keys.remove(dispatch_key) + + def dispatch(self, dispatch_key, *args, **kwargs): + from torch.utils._python_dispatch import _get_current_dispatch_mode + + if dispatch_key in self._dispatch_cache: + kernel = self._dispatch_cache[dispatch_key] + assert not isinstance(kernel, torch._C.DispatchKey) + return kernel(*args, **kwargs) + + if dispatch_key == torch._C.DispatchKey.FuncTorchDynamicLayerFrontMode: + return dispatch_functorch(self, args, kwargs) + + if dispatch_key == torch._C.DispatchKey.Python: + # The place to handle ProxyTorchDispatchMode, FakeTensorMode, etc + from torch.utils._python_dispatch import _pop_mode_temporarily + + curr_mode = _get_current_dispatch_mode() + assert ( + curr_mode is not None + ), "Illegal invocation of dispatch on torch._C.DispatchKey.Python without a mode." + assert ( + type(curr_mode) in self.python_key_mode_table + ), f"Current active mode {curr_mode} not registered" + handler = self.python_key_mode_table[type(curr_mode)] + with _pop_mode_temporarily() as mode: + return handler(mode, *args, **kwargs) + + functionality_key = torch._C._to_functionality_key(dispatch_key) # type: ignore[attr-defined] + if functionality_key == torch._C.DispatchKey.PreDispatch: + from torch.utils._python_dispatch import _pop_mode_temporarily + + # The check for Python in the exclude set is so we properly respect `with no_dispatch()` + # calls inside of a mode. + if ( + _len_torch_dispatch_stack_pre_dispatch() > 0 + ) and not torch._C._dispatch_tls_is_dispatch_key_excluded( + DispatchKey.Python + ): + curr_mode = _get_current_dispatch_mode_pre_dispatch() + assert ( + curr_mode is not None + ), "Illegal invocation of dispatch on torch._C.DispatchKey.PreDispatch without a mode." + assert ( + type(curr_mode) in self.python_key_mode_table + ), f"Current active mode {curr_mode} not registered" + handler = self.python_key_mode_table[type(curr_mode)] + with _pop_mode_temporarily(functionality_key) as mode: + return handler(mode, *args, **kwargs) + + final_key = resolve_key(self, dispatch_key) + + # This can current fail due to backend fallbacks. You just have to + # register them by hand for HigherOrderOperator. + if final_key not in self.py_kernels: + raise NotImplementedError( + f"could not find kernel for HigherOrderOperator {self._name} " + f"at dispatch key {final_key} (resolved from {dispatch_key})" + ) + self._dispatch_cache[dispatch_key] = self.py_kernels[final_key] + kernel = self.py_kernels[final_key] + # It's illegal to register DispatchKey to py_kernels, since there's no + # C++ kernel to call into + assert not isinstance(kernel, torch._C.DispatchKey) + return kernel(*args, **kwargs) + + def __call__(self, *args, **kwargs): + # Dynamo already traces the body of HigherOrderOp beforehand when it + # so no need to trace into it. 
+ import torch._dynamo + from torch._dynamo import disable + + @disable + def wrapper(): + flat_args = _to_flat_tuple(args, kwargs) + if torch.overrides.has_torch_function(flat_args): + return torch.overrides.handle_torch_function( + self, flat_args, *args, **kwargs + ) + + dispatch_key_set = _compute_keyset(args, kwargs, self.non_fallthrough_keys) + return self.dispatch( + dispatch_key_set.highestPriorityTypeId(), *args, **kwargs + ) + + return wrapper() + + def __str__(self): + return f"{self.name()}" + + def name(self): + return self._name + + +def _to_flat_tuple(args, kwargs): + return pytree.arg_tree_leaves(*args, **kwargs) + + +def _compute_keyset(args, kwargs, non_fallthrough_keys): + tensors = _get_tensors(args, kwargs) + return key_extractor(tensors, non_fallthrough_keys) + + +def _get_tensors(args, kwargs): + flat_all = _to_flat_tuple(args, kwargs) + tensor_args = [t for t in flat_all if isinstance(t, torch.Tensor)] + return tuple(tensor_args) + + +# Note - this should maintain identical impl to the C++ dispatcher key extraction logic +# at ATen/core/dispatch/DispatchKeyExtractor.h +def key_extractor(tensors, key_mask): + key_set = torch._C._dispatch_tls_local_include_set() + for tensor in tensors: + key_set = key_set | torch._C._dispatch_keys(tensor) + key_set = key_set - torch._C._dispatch_tls_local_exclude_set() + key_set = key_set & key_mask + return key_set + + +# Mode stack for PreDispatchKey +# it should always have two keys with +# priority given to FunctionalTensorMode and +# then ProxyTorchDispatchMode. It means that +# slot 0 belongs to ProxyTorchDispatchMode and +# slot 1 belongs to FunctionalTensorMode. +class _ModeStackStateForPreDispatch: + def __init__(self): + self.__infra_modes = [None, None] + + def set(self, index, mode): + assert index < len(self.__infra_modes) + self.__infra_modes[index] = mode + + def get(self, index): + assert index < len(self.__infra_modes) + return self.__infra_modes[index] + + def count(self): + return len([i for i in self.__infra_modes if i is not None]) + + +_mode_stack_state_for_pre_dispatch = _ModeStackStateForPreDispatch() + + +def unset_mode_pre_dispatch(mode_key): + current_mode_stack_pre_dispatch = mode_stack_state_for_pre_dispatch() + assert mode_key in ( + torch._C._TorchDispatchModeKey.PROXY, + torch._C._TorchDispatchModeKey.FUNCTIONAL, + ) + if mode_key == torch._C._TorchDispatchModeKey.PROXY: + current_mode = current_mode_stack_pre_dispatch.get(0) + mode_stack_state_for_pre_dispatch().set(0, None) + return current_mode + else: + current_mode = current_mode_stack_pre_dispatch.get(1) + mode_stack_state_for_pre_dispatch().set(1, None) + return current_mode + + +def _set_mode_pre_dispatch(mode): + from torch._subclasses.functional_tensor import FunctionalTensorMode + from torch.fx.experimental.proxy_tensor import ProxyTorchDispatchMode + + assert isinstance(mode, (FunctionalTensorMode, ProxyTorchDispatchMode)) + if isinstance(mode, FunctionalTensorMode): + current_mode = mode_stack_state_for_pre_dispatch().get(1) + assert current_mode is None + mode_stack_state_for_pre_dispatch().set(1, mode) + return + + current_mode = mode_stack_state_for_pre_dispatch().get(0) + assert current_mode is None + mode_stack_state_for_pre_dispatch().set(0, mode) + + +def _pop_mode_from_pre_dispatch(): + mode_stack = mode_stack_state_for_pre_dispatch() + if mode_stack.get(1) is not None: + res = mode_stack.get(1) + mode_stack.set(1, None) + return res + + if mode_stack.get(0) is not None: + res = mode_stack.get(0) + mode_stack.set(0, None) + return 
res + + raise AssertionError("Trying to pop empty mode stack") + + +def _len_torch_dispatch_stack_pre_dispatch(): + return mode_stack_state_for_pre_dispatch().count() + + +def _get_dispatch_mode_pre_dispatch(mode_key): + assert mode_key in ( + torch._C._TorchDispatchModeKey.PROXY, + torch._C._TorchDispatchModeKey.FUNCTIONAL, + ) + if mode_key == torch._C._TorchDispatchModeKey.PROXY: + return mode_stack_state_for_pre_dispatch().get(0) + return mode_stack_state_for_pre_dispatch().get(1) + + +def _get_current_dispatch_mode_pre_dispatch(): + stack_len = mode_stack_state_for_pre_dispatch().count() + if stack_len == 2: + return mode_stack_state_for_pre_dispatch().get(1) + if stack_len == 1: + return ( + mode_stack_state_for_pre_dispatch().get(1) + if mode_stack_state_for_pre_dispatch().get(1) is not None + else mode_stack_state_for_pre_dispatch().get(0) + ) + return None + + +def mode_stack_state_for_pre_dispatch(): + global _mode_stack_state_for_pre_dispatch + return _mode_stack_state_for_pre_dispatch + + +cached_ops: Set["OpOverload"] = set() + + +def add_cached_op(op_overload): + global cached_ops + cached_ops.add(op_overload) + + +def reset_cached_ops(): + global cached_ops + cached_ops.clear() + + +def get_cached_ops(): + global cached_ops + return cached_ops + + +# Each OpOverload object contains pointer to a a specific operator overload, a pointer to the parent `OpOverloadPacket` object. +# You can obtain an OpOverload object through attribute query on OpOverloadPacket. +class OpOverload(OperatorBase): + def __init__(self, overloadpacket, op, op_dk, schema, tags): + super().__init__() + self._op = op + self._op_dk = op_dk + self._schema = schema + self._overloadpacket = overloadpacket + self._tags = tags + self._overloadname = ( + "default" if schema.overload_name == "" else schema.overload_name + ) + self._name = self._schema.name + if schema.overload_name: + self._name += "." + schema.overload_name + self.__name__ = f"{self._schema.name.split('::')[1]}.{self._overloadname}" + self.__module__ = overloadpacket.__module__ + op.__module__ = overloadpacket.__module__ + self.__qualname__ = self._name + self.__annotations__ = {} + + # If the OpOverload was constructed from a Library.def in Python. + self._defined_in_python = self.__qualname__ in torch.library._defs + + # Logic replicated from aten/src/ATen/native/MathBitsFallback.h + is_write = None + for a in self._schema.arguments: + if a.alias_info is None: + continue + if is_write is None: + is_write = a.alias_info.is_write + else: + # We will conservatively call mixed mutable/non-mutable + # aliased inputs as NOT a view + is_write = a.alias_info.is_write or is_write + self.is_view = is_write is not None and not is_write + + # it's a no-op since OpOverload object is immutable and must be unique for a given op overload. + def __deepcopy__(self, memo=None): + return self + + def __repr__(self): + return "".format( + *self._schema.name.split("::"), self._overloadname + ) + + def __call__(self_, *args, **kwargs): # noqa: B902 + # use `self_` to avoid naming collide with aten ops arguments that + # are named "self". This way, all the aten ops can be called by kwargs. 
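As the comment notes, calling an OpOverload lands directly in the bound C++ op via the return just below. Seen from user code, with a stock aten op:

import torch

overload = torch.ops.aten.add.Tensor              # an OpOverload
print(overload.name())                            # aten::add.Tensor
print(overload(torch.ones(2), torch.ones(2)))     # goes straight into the bound C++ op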
+ return self_._op(*args, **kwargs) + + def __hash__(self): + return hash(self._op) + + # `my_namespace.my_op_name.overload_name` + def __str__(self): + return "{}.{}.{}".format(*self._schema.name.split("::"), self._overloadname) + + def has_kernel_for_dispatch_key(self, k): + return super().has_kernel_for_dispatch_key( + k + ) or torch._C._dispatch_has_kernel_for_dispatch_key(self.name(), k) + + def has_kernel_for_any_dispatch_key(self, ks): + return torch._C._dispatch_has_kernel_for_any_dispatch_key( + self.name(), ks + ) or super().has_kernel_for_any_dispatch_key(ks) + + @property + def namespace(self): + return self._schema.name.split("::")[0] + + def _handle(self): + return torch._C._dispatch_find_schema_or_throw( + self._schema.name, self._schema.overload_name + ) + + def decompose(self, *args, **kwargs): + dk = torch._C.DispatchKey.CompositeImplicitAutograd + if dk in self.py_kernels: + # NB: This branch is not too necessary anymore, because we can + # apply Python CompositeImplicitAutograd *before* tracing + # using Python dispatcher (also taking advantage of the autograd + # formula). But it's included for completeness + return self.py_kernels[dk](*args, **kwargs) + elif torch._C._dispatch_has_kernel_for_dispatch_key(self.name(), dk): + return self._op_dk(dk, *args, **kwargs) + else: + return NotImplemented + + # Remove a dispatch key from the dispatch cache. This will force it to get + # recomputed the next time. Does nothing + # WARNING: if you register a dispatch key to py_kernels of an OpOverload, + # calling _del_dispatch on that key is NOT sufficient to apply your change, + # because a single registration may affect MULTIPLE dispatch keys (e.g., + # registering Autograd affects AutogradCPU). del_dispatch is to be used + # only if you are specifically modifying how get_dispatch handles a + # particular input 'key'. + def _uncache_dispatch(self, key): + self._dispatch_cache.pop(key, None) + + # This implements the pre-computation logic for the Python dispatcher. + def _get_dispatch(self, key): + # This is only called upon a cache miss + assert key not in self._dispatch_cache, f"{self} {key}" + + if key == torch._C.DispatchKey.Python: + if not self.python_key_mode_table: + self._dispatch_cache[key] = key + add_cached_op(self) + return key + + def handler(*args, **kwargs): + from torch.utils._python_dispatch import _get_current_dispatch_mode + + # TODO: We also need to handle tensor subclasses here + # TODO(voz): We should walk all the nodes here / turn it into a list, topmode is ok for now. + curr_mode = type(_get_current_dispatch_mode()) + assert ( + curr_mode is not None + ), "Illegal invocation of dispatch on torch._C.DispatchKey.Python without a mode." + if curr_mode not in self.python_key_mode_table: + # TODO: This path is slow, should generally encourage this + # case to not happen + return self._op_dk(key, *args, **kwargs) + # TODO(voz): The idea behind this is that we do not yet support dispatch by key + mode, only key. + return self.python_key_mode_table[curr_mode](*args, **kwargs) + + self._dispatch_cache[key] = handler + add_cached_op(self) + return handler + + functionality_key = torch._C._to_functionality_key(key) # type: ignore[attr-defined] + if functionality_key == torch._C.DispatchKey.PreDispatch: + curr_stack_len = _len_torch_dispatch_stack_pre_dispatch() + # The check for Python in the exclude set is so we properly respect `with no_dispatch()` + # calls inside of a mode. 
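The decompose() method defined earlier in this class can be exercised directly. A sketch, assuming aten::matmul still carries a CompositeImplicitAutograd kernel; overloads without one return NotImplemented:

import torch

a, b = torch.randn(2, 3), torch.randn(3, 4)
out = torch.ops.aten.matmul.default.decompose(a, b)
print(out is NotImplemented)  # expected False while matmul remains a composite op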
+ if ( + curr_stack_len > 0 + and not torch._C._dispatch_tls_is_dispatch_key_excluded( + DispatchKey.Python + ) + ): + + def handler(*args, **kwargs): + @contextlib.contextmanager + def _temporarily_pop_modes_from_pre_dispatch(): + top_mode = _pop_mode_from_pre_dispatch() + try: + yield top_mode + finally: + _set_mode_pre_dispatch(top_mode) + + with _temporarily_pop_modes_from_pre_dispatch() as curr_mode: + assert isinstance(curr_mode, TorchDispatchMode) + overload_types = [] + args_flattened, _ = torch.utils._pytree.tree_flatten( + (args, kwargs.values()) + ) + for a in args_flattened: + # TODO: need to double check the semantics of the "types" argument to torch_dispatch. + # It's generated in PyInterpreter.cpp, but seems to be generated in two places, + # where in one case we only include tensors with the python key, and in another + # we include **all** tensors. + if isinstance(a, torch.Tensor) and torch._C._dispatch_keys( + a + ).has(torch._C.DispatchKey.Python): + overload_types.append(type(a)) + # TODO: check that I got these args correct (in C++, we pass in "0000"??) + + return curr_mode.__torch_dispatch__( + self, overload_types, args, kwargs + ) + + # Note [Not Caching Per-Dispatch-Key Mode Handlers] + # Note that we're not caching this handler. There isn't really a point, since the slow bit + # is the handler itself (in python). + # Also, not caching means that we don't have to reset the cache when any existing + # modes go out of scope (which in of itself takes time to loop through all operators). + return handler + + final_key = resolve_key(self, key) + + # See Note [Not Caching Per-Dispatch-Key Mode Handlers] + cache_result = key != torch._C.DispatchKey.PreDispatch + + # TODO: We could potentially have lots of debugging wrappers against + # dispatch keys; design some general registration mechanism instead of + # having if statement for each of them + if key == torch._C.DispatchKey.Functionalize: + import torch._dispatch.python as pydispatch + + if pydispatch.CROSSREF_FUNCTIONALIZE: + handler = pydispatch.make_crossref_functionalize(self, final_key) + if cache_result: + self._dispatch_cache[key] = handler + add_cached_op(self) + return handler + + # print(self, key, final_key) + r = self.py_kernels.get(final_key, final_key) + if cache_result: + self._dispatch_cache[key] = r + add_cached_op(self) + return r + + def name(self): + return self._name + + @property + def overloadpacket(self): + return self._overloadpacket + + @property + def op(self): + return self._op + + @property + def tags(self): + return self._tags + + # TODO: add more methods to expose information about input and output arguments + + +# OpOverloadPacket class contains pointer to a base unresolved operator that doesn't correspond to a specific operator +# You can obtain an OpOverload object through attribute query. +class OpOverloadPacket: + def __init__(self, qualified_op_name, op_name, op, overload_names): + # These attributes are accessible on the object through the properties + # defined below but are immutable + self._qualified_op_name = qualified_op_name + self.__name__ = op_name + self._op = op + self._overload_names = overload_names + self._dir = [] + + # it's a no-op since OpOverloadPacket object is immutable and must be unique for a given op. 
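The _get_dispatch machinery above is only consulted on a Python-dispatcher cache miss; it is typically exercised through torch._dispatch.python.enable_python_dispatcher. A minimal sketch:

import torch
from torch._dispatch.python import enable_python_dispatcher

with enable_python_dispatcher():
    # Ordinary calls still work; cache misses now flow through _get_dispatch above.
    torch.ops.aten.add.Tensor(torch.ones(2), torch.ones(2))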
+ def __deepcopy__(self, memo=None): + return self + + def __repr__(self): + return "".format( + *self._qualified_op_name.split("::") + ) + + def __hash__(self): + return hash(self._op) + + def __str__(self): + return "{}.{}".format(*self._qualified_op_name.split("::")) + + @property + def op(self): + return self._op + + def __getattr__(self, key): + # It is not a valid op_name when __file__ is passed in + if key == "__file__": + return "torch.ops" + + # ensure that query for dunder attributes that does not exist on + # opoverloadpacket but instead exists on the self._op object does not unnecessarily call + # `_get_operation_overload` (which is an expensive operation). + # This is done to prevent any potential slowdown. This list can be extended + # if there exists other attributes like `__name__` that only exist on self._op and not on the + # opoverloadpacket. + # This is ok since we are guaranteed that an overload name for an aten op can't start with '__' + try: + if key.startswith("__"): + return getattr(self._op, key) + except AttributeError: + # for consistency because it seems weird to + # throw an attribute error with a message containing + # an object name different from the one the attribute + # query was performed on. + raise AttributeError( + f"'{str(self)}' can't have an overload name beginning with '__' and the " + f"underlying op {str(self._op)} has no attribute {key} either." + ) from None + + try: + # This is ok since we are guaranteed that an overload name for an aten op can't be 'default' + use_key = "" if key == "default" else key + # TODO: disallow access to overloads registered by JIT + op_, op_dk_, tags = torch._C._get_operation_overload( + self._qualified_op_name, use_key + ) + schema = torch._C._get_schema(self._qualified_op_name, use_key) + overload = OpOverload(self, op_, op_dk_, schema, tags) + # cache the overload object + setattr(self, key, overload) + self._dir.append(key) + return overload + except RuntimeError: + raise AttributeError( + f"The underlying op of '{str(self)}' has no overload name '{key}'" + ) from None + + def __iter__(self): + return iter(self._dir) + + def __call__(self_, *args, **kwargs): # noqa: B902 + # use `self_` to avoid naming collide with aten ops arguments that + # named "self". This way, all the aten ops can be called by kwargs. + + # overloading __call__ to ensure torch.ops.foo.bar() + # is still callable from JIT + # We save the function ptr as the `op` attribute on + # OpOverloadPacket to access it here. + return self_._op(*args, **(kwargs or {})) + + # TODO: use this to make a __dir__ + def overloads(self): + return [n if n else "default" for n in self._overload_names] + + +# Resolution of torch.fn is different from torch.ops.aten.fn +# torch.fn uses the Python argparser, matches with the +# appropriate schema, and calls into the unboxed version of the method +# torch.ops.aten.fn resolution is done via the mechanism defined in JIT. +# JIT creates a stack of all the overloads and then tries to match the +# correct one at runtime and always calls into the boxed version of the method +# Autograd codegen creates VariableType, TracerType, +# inplace or view type and python bindings. +# Aten codegen generates tensor methods for the tensor class. + +# _OpNamespace is a subclass of ModuleType because the torch script +# allows attribute lookups on modules only. 
Since we want torch.ops.foo.bar() +# to work from script, we need to ensure ops and foo are modules + + +class _OpNamespace(types.ModuleType): + """ + An op namespace to dynamically bind Operators into Python. + + Say a user has created a custom Operator called "my_namespace::my_op". To + call this op, the user will write torch.ops.my_namespace.my_op(...). + At startup, this operation will not yet be bound into Python. Instead, the + following sequence of magic tricks will occur: + 1. `torch.ops.my_namespace` will invoke the `__getattr__` magic method + on the `torch.ops` object, which will create a new `_OpNamespace` + object called `my_namespace` and set it as an attribute on the `ops` + object. + 2. `torch.ops.my_namespace.my_op` will then invoke `__getattr__` on + the `my_namespace` object, which will retrieve the operation via + `torch.get_operation`, a function bound from C++, and then in a similar + fashion bind this new object onto the `my_namespace` object. + 3. `torch.ops.my_namespace.my_op(...)` then calls this new operation + and subsequent accesses will incur no further lookup (the namespace and + operation will already exist). + """ + + def __init__(self, name): + super().__init__("torch.ops." + name) + self.name = name + self._dir = [] + + def __iter__(self): + return iter(self._dir) + + def __getattr__(self, op_name): + # It is not a valid op_name when __file__ is passed in + if op_name == "__file__": + return "torch.ops" + elif op_name in ["__origin__", "__self__"]: + raise AttributeError( + f"Invalid attribute '{op_name}' for '_OpNamespace' '{self.name}'" + ) + + # Get the op `my_namespace::my_op` if available. This will also check + # for overloads and raise an exception if there are more than one. + namespace_name = self.name + qualified_op_name = f"{namespace_name}::{op_name}" + try: + op, overload_names = torch._C._jit_get_operation(qualified_op_name) + if op is None: + raise AttributeError( + f"'_OpNamespace' '{self.name}' object has no attribute '{op_name}'" + ) + except RuntimeError as e: + # Turn this into AttributeError so getattr(obj, key, default) + # works (this is called by TorchScript with __origin__) + raise AttributeError( + f"'_OpNamespace' '{self.name}' object has no attribute '{op_name}'" + ) from e + + # let the script frontend know that op is identical to the builtin op + # with qualified_op_name + torch.jit._builtins._register_builtin(op, qualified_op_name) + op.__module__ = self.__module__ + "." + namespace_name + opoverloadpacket = OpOverloadPacket( + qualified_op_name, op_name, op, overload_names + ) + opoverloadpacket.__module__ = self.__module__ + "." + namespace_name + # cache the opoverloadpacket to ensure that each op corresponds to + # a unique OpOverloadPacket object + setattr(self, op_name, opoverloadpacket) + self._dir.append(op_name) + return opoverloadpacket + + +class _PyOpNamespace(_OpNamespace): + def __init__(self, name, ops): + super().__init__(name) + self._ops = ops + + def __getattr__(self, name): + # Following _OpNamespace.__getattr__, we cache the op on the _PyOpNamespace object. 
+ op = self._ops.get(name, None) + if op is None: + raise AttributeError( + f"'_PyOpNamespace' '{self.name}' object has no attribute '{name}'" + ) + setattr(self, name, op) + return op + + +class _Ops(types.ModuleType): + __file__ = "_ops.py" + + def __init__(self): + super().__init__("torch.ops") + self.loaded_libraries = set() + self._higher_order_op_namespace = _PyOpNamespace( + "torch.ops.higher_order", _higher_order_ops + ) + self._dir = [] + + def __getattr__(self, name): + # Check if the name is a HigherOrderOperator + if name == "higher_order": + return self._higher_order_op_namespace + + # Here we are creating `torch.ops.my_namespace` + namespace = _OpNamespace(name) + setattr(self, name, namespace) + self._dir.append(name) + return namespace + + def __iter__(self): + return iter(self._dir) + + def import_module(self, module): + """ + Imports a Python module that has torch.library registrations. + + Generally, to extend PyTorch with custom operators, a user will + create a Python module whose import triggers registration of + the custom operators via a torch.ops.load_library call or a call + to one or more torch.library.* APIs. + + It is unexpected for Python modules to have side effects, so some + linters and formatters will complain. Use this API to import Python + modules that contain these torch.library side effects. + + Args: + module (str): The name of the Python module to import + + """ + importlib.import_module(module) + + def load_library(self, path): + """ + Loads a shared library from the given path into the current process. + + The library being loaded may run global initialization code to register + custom operators with the PyTorch JIT runtime. This allows dynamically + loading custom operators. For this, you should compile your operator + and the static registration code into a shared library object, and then + call ``torch.ops.load_library('path/to/libcustom.so')`` to load the + shared object. + + After the library is loaded, it is added to the + ``torch.ops.loaded_libraries`` attribute, a set that may be inspected + for the paths of all libraries loaded using this function. + + Args: + path (str): A path to a shared library to load. + """ + if torch._running_with_deploy(): + return + + path = _utils_internal.resolve_library_path(path) + with dl_open_guard(): + # Import the shared library into the process, thus running its + # static (global) initialization code in order to register custom + # operators with the JIT. 
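This dlopen step is what dl_open_guard near the top of this file exists for: RTLD_GLOBAL is set only around the load so the library's static registration code can resolve torch symbols. A sketch with a hypothetical library path:

import torch

# The path is hypothetical; the library is expected to contain TORCH_LIBRARY
# registration code that runs when the shared object is loaded.
torch.ops.load_library("build/libcustom_ops.so")
print(torch.ops.loaded_libraries)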
+ ctypes.CDLL(path) + self.loaded_libraries.add(path) + + +# The ops "namespace" +ops = _Ops() diff --git a/MLPY/Lib/site-packages/torch/_prims/__init__.py b/MLPY/Lib/site-packages/torch/_prims/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..28af648500b507672f07811a38faef549844d794 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_prims/__init__.py @@ -0,0 +1,3031 @@ +import contextlib +import itertools +import operator +import weakref +from enum import Enum +from functools import partial, reduce +from typing import Any, Callable, List, Optional, Sequence, Tuple, Type, Union + +import torch + +import torch._prims_common as utils +import torch.library +from torch import sym_float, Tensor, TypedStorage +from torch._C import _get_default_device +from torch._prims.debug_prims import register_debug_prims +from torch._prims.rng_prims import register_rng_prims +from torch._prims_common import ( + Dim, + DimsSequenceType, + DimsType, + IntLike, + Number, + NumberType, + RETURN_TYPE, + ShapeType, + StrideType, + TensorLike, + TensorLikeType, + type_to_dtype, +) +from torch._prims_common.wrappers import backwards_not_supported +from torch._subclasses.fake_tensor import FakeTensor, FakeTensorMode +from torch.overrides import handle_torch_function, has_torch_function +from torch.utils._pytree import tree_flatten, tree_map, tree_unflatten + +prim = torch.library.Library("prims", "DEF") +prim_impl = torch.library.Library("prims", "IMPL", "CompositeExplicitAutograd") +prim_backend_select_impl = torch.library.Library("prims", "IMPL", "BackendSelect") +prim_autograd_impl = torch.library.Library("prims", "IMPL", "Autograd") +prim_meta_impl = torch.library.Library("prims", "IMPL", "Meta") + +# Experimental module containing prototype "primitive" operations. + +__all__ = [ + # + # Common datastructures and helpers + # + "RETURN_TYPE", + # + # Elementwise unary prims + # + "abs", + "acos", + "acosh", + "asin", + "asinh", + "atan", + "atanh", + "cos", + "cosh", + "bessel_i0", + "bessel_i0e", + "bessel_i1", + "bessel_i1e", + "bessel_j0", + "bessel_j1", + "bitwise_not", + "cbrt", + "ceil", + "conj_physical", + "digamma", + "erf", + "erf_inv", + "erfc", + "erfcx", + "exp", + "expm1", + "exp2", + "fill", + "floor", + "imag", + "isfinite", + "lgamma", + "log", + "log1p", + "log2", + "log10", + "ndtri", + "neg", + "real", + "reciprocal", + "round", + "sign", + "signbit", + "sin", + "sinh", + "spherical_bessel_j0", + "sqrt", + "tan", + "tanh", + "trunc", + # + # Elementwise binary prims + # + "add", + "atan2", + "bitwise_and", + "bitwise_or", + "bitwise_xor", + # 'complex', # needs custom meta + "div", + "eq", + "fmax", + "fmin", + "fmod", + "frexp", + "gcd", + "ge", + "gt", + "hypot", + "igamma", + "igammac", + "le", + "lt", + "maximum", + "minimum", + "mul", + "ne", + "nextafter", + "pow", + "remainder", + "rsqrt", + "shift_left", + "shift_right_arithmetic", + "shift_right_logical", # not implemented + "sub", + "zeta", + # + # View prims + # + "as_strided", + "broadcast_in_dim", + "collapse_view", + "conj", + "expand_dims", + "slice", + "slice_in_dim", # implemented using slice -- make this a ref? 
+ "split_dim", + "squeeze", + "transpose", + "view_of", + "view_element_type", + # + # Functionalized view mutations + # + "as_strided_scatter", + # + # Shape prims + # + "collapse", + "cat", + "reshape", + "rev", + # + # Conditional prims + # + "where", + # + # Data conversion and movement prims + # + "clone", + "convert_element_type", + "device_put", + "item", + "maximum_value", + "minimum_value", + "copy_strided", + # + # Inplace prims + # + "copy_to", + "resize", + # "_set", # Commented out, see note below + # + # Reduction prims + # + "amax", + "amin", + "prod", + "sum", + "xor_sum", + "var", + # + # Tensor Creation Prims + # + "empty_strided", + "empty_permuted", + "scalar_tensor", + "iota", + # + # Linear algebra (linalg) Prims + # + "svd", + # + # Randomness Prims + # + "normal", + "_uniform_helper", + # + # FFT prims + # + "fft_r2c", + "fft_c2c", + "fft_c2r", +] + + +def TensorMeta( + tensorlike: Optional[Union[NumberType, torch.Tensor]] = None, + *, + shape: Optional[ShapeType] = None, + strides: Optional[StrideType] = None, + dtype: Optional[torch.dtype] = None, + device: Optional[Union[torch.device, str]] = None, +): + if isinstance(tensorlike, Number): + assert not shape and (shape is None or isinstance(shape, Sequence)) + assert not strides and (strides is None or isinstance(strides, Sequence)) + inferred_shape: Tuple[int, ...] = () + inferred_strides: Tuple[int, ...] = () + inferred_dtype = type_to_dtype(type(tensorlike)) + inferred_device = torch.device("cpu") + # TODO: This looks wrong, a number that is wrapped into a tensor + # needs to behave differently than a scalar tensor for type + # promotion purposes + elif tensorlike is not None: + assert isinstance(tensorlike, torch.Tensor) + inferred_shape = tuple(tensorlike.shape) + inferred_strides = tuple(tensorlike.stride()) + inferred_dtype = tensorlike.dtype + inferred_device = tensorlike.device + else: + # If no tensorlike "example" is given then all metadata + # must be provided explicitly + assert shape is not None + assert strides is not None + assert dtype is not None + assert device is not None + + shape = inferred_shape if shape is None else tuple(shape) # type: ignore[possibly-undefined] + strides = inferred_strides if strides is None else tuple(strides) # type: ignore[possibly-undefined] + dtype = inferred_dtype if dtype is None else dtype # type: ignore[possibly-undefined] + device = inferred_device if device is None else device # type: ignore[possibly-undefined] + + if isinstance(device, str): + device = torch.device(device) + + return torch.empty_strided(shape, strides, dtype=dtype, device=device) + + +def _make_prim( + *, + schema: str, + return_type: Union[RETURN_TYPE, Tuple[RETURN_TYPE, ...]], + meta: Callable, + impl_aten: Callable, + doc: str, + tags: Optional[Sequence[torch.Tag]] = None, +): + """ + Creates a primitive operation. + + """ + + prim.define(schema, tags=torch.Tag.pt2_compliant_tag) + + def _prim_impl(*args, **kwargs): + # always run the meta function because aten implementation will + # typically accept more inputs (e.g., it will do promotion and + # broadcasting) which we want to reject + meta(*args, **kwargs) + return impl_aten(*args, **kwargs) + + # Right now prims don't support autograd (we can and should add an + # argument that provides an implementation for backward here.) 
Because we + # don't have derivative formulas, we must setup a custom autograd function + # that raises an error if backwards is invoked + def _autograd_impl(*args, **kwargs): + return backwards_not_supported(_prim)(*args, **kwargs) + + def _backend_select_impl(*args, **kwargs): + if kwargs.get("device") and kwargs["device"].type == "meta": + return meta(*args, **kwargs) + if any(isinstance(x, torch.device) and x.type == "meta" for x in args): + return meta(*args, **kwargs) + else: + return _prim_impl(*args, **kwargs) + + name = schema.split("(")[0] + prim_impl.impl(name, _prim_impl) + prim_autograd_impl.impl(name, _autograd_impl) + prim_meta_impl.impl(name, meta) + + _prim_packet = getattr(torch._ops.ops.prims, name) + _prim = _prim_packet.default + if tags: + _prim._tags = tags + + from torch._subclasses.fake_tensor import contains_tensor_types + + if not any(contains_tensor_types(a.type) for a in _prim._schema.arguments) or str( + _prim + ) in [ + # See https://github.com/pytorch/pytorch/issues/103532 + "prims.device_put.default" + ]: + prim_backend_select_impl.impl(name, _backend_select_impl) + + for p in (_prim_packet, _prim): + p.__doc__ = doc + p.return_type = return_type # type: ignore[attr-defined] + + p.schema = schema + p.prim_impl = _prim_impl + p.prim_meta_impl = meta + p.impl_aten = impl_aten + + return _prim + + +class ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND(Enum): + DEFAULT = (0,) + INT_TO_FLOAT = (2,) + ALWAYS_BOOL = (3,) + COMPLEX_TO_FLOAT = (4,) + + +# TODO: implement dtype validation here, too, or on the corresponding refs +def _prim_elementwise_meta( + *args, + type_promotion: ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND, + args_with_fixed_dtypes: Optional[Tuple[TensorLikeType, ...]] = None, +) -> FakeTensor: + """ + Meta function for elementwise operations that produce outputs in the same dtype + as their inputs. + + Stride logic is currently incorrect. + """ + + assert len(args) > 0 + + utils.check_same_dtype(*args) + + args_ = list(args) + if args_with_fixed_dtypes is not None: + args_ = list(args_with_fixed_dtypes) + args_ + + utils.check_same_device(*args_, allow_cpu_scalar_tensors=True) + utils.check_same_shape(*args_, allow_cpu_scalar_tensors=True) + + l2p_perm = utils.compute_elementwise_output_logical_to_physical_perm(*args_) + shape = utils.extract_shape(*args_, allow_cpu_scalar_tensors=True) + + # Acquires the dtype + dtype = None + scalar_type = None + for arg in args: + if isinstance(arg, TensorLike): + if not utils.is_cpu_scalar_tensor(arg): + dtype = arg.dtype + break + else: + dtype = arg.dtype + elif isinstance(arg, Number): + scalar_type = type(arg) + + if dtype is None and scalar_type is not None: + dtype = utils.type_to_dtype(scalar_type) + + # Acquires the device (if it exists) or number + device = None + number = None + for arg in args_: + if isinstance(arg, TensorLike): + if utils.is_cpu_scalar_tensor(arg): + if device is None: + device = arg.device + # keep going, in case there is a cuda tensor later + else: + device = arg.device + break + + elif isinstance(arg, Number): + if number is None: + number = arg + + # NOTE: type promotion behavior here is mostly hidden from tests because + # references will typically handle the type promotion properly even if this doesn't + # (but getting it wrong will cause too many casts to be inserted in traces!) 
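As the comments in _make_prim and here spell out, the meta function always runs before the aten implementation precisely so that broadcasting and implicit type promotion are rejected rather than silently accepted. A small sketch of that strictness, assuming a standard build where the prims below are registered:

import torch

a = torch.randn(3)
b = torch.randn(3)
print(torch.ops.prims.add(a, b))               # ok: same shape, dtype and device

try:
    torch.ops.prims.add(torch.randn(3, 1), b)  # aten add would broadcast; the prim refuses
except Exception as e:                         # exact error type comes from the shape check
    print("rejected:", e)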
+ if device is not None: + assert dtype is not None + if type_promotion == ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT: + dtype = dtype + elif type_promotion == ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.ALWAYS_BOOL: + dtype = torch.bool + elif type_promotion == ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.INT_TO_FLOAT: + if utils.is_integer_dtype(dtype) or utils.is_boolean_dtype(dtype): + dtype = torch.get_default_dtype() + elif type_promotion == ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.COMPLEX_TO_FLOAT: + if utils.is_complex_dtype(dtype): + dtype = utils.corresponding_real_dtype(dtype) + else: + dtype = dtype + + assert shape is not None + return torch.empty_permuted(shape, l2p_perm, device=device, dtype=dtype) # type: ignore[return-value] + + # Number case + # TODO: fix number type promotion (bool, complex->float) + + # For now for symint/float, just implementing the common / simple cases of (int,float,symint,symfloat) + seen_float = False + if isinstance(number, (torch.SymInt, torch.SymFloat)): + for a in args: + assert isinstance(a, (int, float, torch.SymInt, torch.SymFloat)), "NYI" + seen_float = seen_float or isinstance(a, (float, torch.SymFloat)) + if seen_float: + number = sym_float(number) + + return TensorMeta(number) # type: ignore[arg-type] + + +def _complex_only_elementwise_meta(*args, **kwargs): + torch._check( + utils.is_complex_dtype(args[0].dtype), lambda: "Only complex dtype is supported" + ) + return _prim_elementwise_meta(*args, **kwargs) + + +def _make_elementwise_unary_prim( + name: str, *, type_promotion: ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND, **kwargs +): + """ + Creates an elementwise unary prim. + """ + + return _make_prim( + schema=f"{name}(Tensor self) -> Tensor", + meta=partial(_prim_elementwise_meta, type_promotion=type_promotion), + return_type=RETURN_TYPE.NEW, + **kwargs, + ) + + +def _make_elementwise_binary_prim( + name: str, *, type_promotion: ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND, **kwargs +): + """ + Creates an elementwise binary prim. 
+ """ + + return _make_prim( + schema=f"{name}(Tensor self, Tensor other) -> Tensor", + meta=partial(_prim_elementwise_meta, type_promotion=type_promotion), + return_type=RETURN_TYPE.NEW, + **kwargs, + ) + + +def _not_impl(*args, **kwargs): + raise NotImplementedError + + +# +# Elementwise unary operations +# + + +abs = _make_elementwise_unary_prim( + "abs", + impl_aten=torch.abs, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.COMPLEX_TO_FLOAT, +) + +acos = _make_elementwise_unary_prim( + "acos", + impl_aten=torch.acos, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +acosh = _make_elementwise_unary_prim( + "acosh", + impl_aten=torch.acosh, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +asin = _make_elementwise_unary_prim( + "asin", + impl_aten=torch.asin, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +asinh = _make_elementwise_unary_prim( + "asinh", + impl_aten=torch.asinh, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +atan = _make_elementwise_unary_prim( + "atan", + impl_aten=torch.atan, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +atanh = _make_elementwise_unary_prim( + "atanh", + impl_aten=torch.atanh, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +cos = _make_elementwise_unary_prim( + "cos", + impl_aten=torch.cos, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +cosh = _make_elementwise_unary_prim( + "cosh", + impl_aten=torch.cosh, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +bessel_j0 = _make_elementwise_unary_prim( + "bessel_j0", + impl_aten=torch.special.bessel_j0, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +bessel_j1 = _make_elementwise_unary_prim( + "bessel_j1", + impl_aten=torch.special.bessel_j1, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +bessel_i0 = _make_elementwise_unary_prim( + "bessel_i0", + impl_aten=torch.i0, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +bessel_i0e = _make_elementwise_unary_prim( + "bessel_i0e", + impl_aten=torch.special.i0e, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +bessel_i1 = _make_elementwise_unary_prim( + "bessel_i1", + impl_aten=torch.special.i1, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +bessel_i1e = _make_elementwise_unary_prim( + "bessel_i1e", + impl_aten=torch.special.i1e, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +bitwise_not = _make_elementwise_unary_prim( + "bitwise_not", + impl_aten=torch.bitwise_not, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + + +def _cbrt_aten(a: torch.Tensor) -> Tensor: + torch._check( + not a.is_complex(), + lambda: "cbrt: Complex inputs not supported. Consider calling torch.pow(a, 1.0/3.0)", + ) + # Returns the real cubic root of the number. + # Note that if a < 0, pow(a, (1. / 3.)) returns th complex number + # exp(1/3 * log(a)) = exp(1/3 * (log(abs(a)) + pi*i)) = cbrt(abs(a)) * e^{pi/3*i} + # which is a complex number. 
+ # For more info see the section Note in + # https://en.cppreference.com/w/cpp/numeric/math/cbrt + return torch.copysign(torch.pow(a.abs(), 1 / 3), a) + + +cbrt = _make_elementwise_unary_prim( + "cbrt", + impl_aten=_cbrt_aten, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +ceil = _make_elementwise_unary_prim( + "ceil", + impl_aten=torch.ceil, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + + +def _conj_physical_meta(input: TensorLikeType) -> TensorLikeType: + if not input.dtype.is_complex: + raise RuntimeError("prims.conj_physical is only defined for complex dtypes") + + strides = utils.compute_elementwise_output_strides(input) + return TensorMeta(input, strides=strides) + + +conj_physical = _make_prim( + schema="conj_physical(Tensor self) -> Tensor", + meta=_conj_physical_meta, + impl_aten=torch._conj_physical, + doc="Returns the physical conjugation of a complex tensor", + return_type=RETURN_TYPE.NEW, +) + + +def _clone_meta( + input: TensorLikeType, *, memory_format: torch.memory_format = torch.preserve_format +) -> TensorLikeType: + if memory_format != torch.preserve_format: + return torch.empty( + input.shape, + dtype=input.dtype, + layout=input.layout, + device=input.device, + memory_format=memory_format, + ) + + # memory_format == torch.preserve_format + strides = utils.compute_elementwise_output_strides(input) + return torch.empty_strided( + input.shape, + strides, + dtype=input.dtype, + layout=input.layout, + device=input.device, + ) + + +clone = _make_prim( + schema="clone(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor", + meta=_clone_meta, + impl_aten=torch.clone, + doc="Returns the copy of a tensor", + return_type=RETURN_TYPE.NEW, +) + +digamma = _make_elementwise_unary_prim( + "digamma", + impl_aten=torch.digamma, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +erf = _make_elementwise_unary_prim( + "erf", + impl_aten=torch.erf, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +erf_inv = _make_elementwise_unary_prim( + "erf_inv", + impl_aten=torch.special.erfinv, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +erfc = _make_elementwise_unary_prim( + "erfc", + impl_aten=torch.special.erfc, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +erfcx = _make_elementwise_unary_prim( + "erfcx", + impl_aten=torch.special.erfcx, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +exp = _make_elementwise_unary_prim( + "exp", + impl_aten=torch.exp, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +expm1 = _make_elementwise_unary_prim( + "expm1", + impl_aten=torch.special.expm1, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +exp2 = _make_elementwise_unary_prim( + "exp2", + impl_aten=torch.special.exp2, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + + +def _fill_meta(a: TensorLikeType, value: NumberType) -> TensorLikeType: + return _prim_elementwise_meta( + a, type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT + ) + + +# NOTE: fill uses _make_prim directly because it has a value parameter +fill = _make_prim( + schema="fill(Tensor self, Scalar value) -> Tensor", + return_type=RETURN_TYPE.NEW, + meta=_fill_meta, + impl_aten=torch.fill, + doc="", +) + +floor = _make_elementwise_unary_prim( + "floor", + impl_aten=torch.floor, + doc="", + 
type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +imag = _make_prim( + schema="imag(Tensor self) -> Tensor", + meta=partial( + _complex_only_elementwise_meta, + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.COMPLEX_TO_FLOAT, + ), + return_type=RETURN_TYPE.VIEW, + impl_aten=torch.imag, + doc="", +) + +isfinite = _make_elementwise_unary_prim( + "isfinite", + impl_aten=torch.isfinite, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.ALWAYS_BOOL, +) + +lgamma = _make_elementwise_unary_prim( + "lgamma", + impl_aten=torch.lgamma, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +log = _make_elementwise_unary_prim( + "log", + impl_aten=torch.log, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +log1p = _make_elementwise_unary_prim( + "log1p", + impl_aten=torch.log1p, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +log2 = _make_elementwise_unary_prim( + "log2", + impl_aten=torch.log2, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +log10 = _make_elementwise_unary_prim( + "log10", + impl_aten=torch.log10, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +real = _make_prim( + schema="real(Tensor self) -> Tensor", + meta=partial( + _complex_only_elementwise_meta, + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.COMPLEX_TO_FLOAT, + ), + return_type=RETURN_TYPE.VIEW, + impl_aten=torch.real, + doc="", +) + +reciprocal = _make_elementwise_unary_prim( + "reciprocal", + impl_aten=torch.reciprocal, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +ndtri = _make_elementwise_unary_prim( + "ndtri", + impl_aten=torch.special.ndtri, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +neg = _make_elementwise_unary_prim( + "neg", + impl_aten=torch.neg, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +round = _make_elementwise_unary_prim( + "round", + impl_aten=torch.round, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +rsqrt = _make_elementwise_unary_prim( + "rsqrt", + impl_aten=torch.rsqrt, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +sign = _make_elementwise_unary_prim( + "sign", + impl_aten=torch.sign, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +signbit = _make_elementwise_unary_prim( + "signbit", + impl_aten=torch.signbit, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +sin = _make_elementwise_unary_prim( + "sin", + impl_aten=torch.sin, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +sinh = _make_elementwise_unary_prim( + "sinh", + impl_aten=torch.sinh, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +spherical_bessel_j0 = _make_elementwise_unary_prim( + "spherical_bessel_j0", + impl_aten=torch.special.spherical_bessel_j0, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +sqrt = _make_elementwise_unary_prim( + "sqrt", + impl_aten=torch.sqrt, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +tan = _make_elementwise_unary_prim( + "tan", + impl_aten=torch.tan, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +tanh = _make_elementwise_unary_prim( + "tanh", + impl_aten=torch.tanh, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +trunc = 
_make_elementwise_unary_prim( + "trunc", + impl_aten=torch.trunc, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +# +# Elementwise binary operations +# + +add = _make_elementwise_binary_prim( + name="add", + impl_aten=torch.add, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +atan2 = _make_elementwise_binary_prim( + name="atan2", + impl_aten=torch.atan2, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +bitwise_and = _make_elementwise_binary_prim( + "bitwise_and", + impl_aten=torch.bitwise_and, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +bitwise_or = _make_elementwise_binary_prim( + "bitwise_or", + impl_aten=torch.bitwise_or, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +bitwise_xor = _make_elementwise_binary_prim( + "bitwise_xor", + impl_aten=torch.bitwise_xor, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +# TODO: complex needs a special meta to account for its float -> complex behavior +# complex = _make_elementwise_binary_prim( +# impl_aten=torch.complex, +# doc="", +# ) + + +# div prim performs truncation division on integer inputs +# and true division for floating and complex inputs +def _div_aten(a, b): + is_integral = isinstance(a, (bool, int, torch.SymInt)) or ( + isinstance(a, torch.Tensor) and utils.is_integer_dtype(a.dtype) + ) + + if is_integral: + return torch.div(a, b, rounding_mode="trunc") + else: + return torch.true_divide(a, b) + + +div = _make_elementwise_binary_prim( + "div", + impl_aten=_div_aten, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +eq = _make_elementwise_binary_prim( + "eq", + impl_aten=torch.eq, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.ALWAYS_BOOL, +) + +fmax = _make_elementwise_binary_prim( + "fmax", + impl_aten=torch.fmax, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +fmin = _make_elementwise_binary_prim( + "fmin", + impl_aten=torch.fmin, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +fmod = _make_elementwise_binary_prim( + "fmod", + impl_aten=torch.fmod, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + + +gcd = _make_elementwise_binary_prim( + "gcd", + impl_aten=torch.gcd, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + + +ge = _make_elementwise_binary_prim( + "ge", + impl_aten=torch.ge, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.ALWAYS_BOOL, +) + +gt = _make_elementwise_binary_prim( + "gt", + impl_aten=torch.gt, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.ALWAYS_BOOL, +) + +hypot = _make_elementwise_binary_prim( + "hypot", + impl_aten=torch.hypot, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +igamma = _make_elementwise_binary_prim( + "igamma", + impl_aten=torch.special.gammainc, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +igammac = _make_elementwise_binary_prim( + "igammac", + impl_aten=torch.special.gammaincc, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +le = _make_elementwise_binary_prim( + "le", + impl_aten=torch.le, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.ALWAYS_BOOL, +) + +lt = _make_elementwise_binary_prim( + "lt", + impl_aten=torch.lt, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.ALWAYS_BOOL, +) + + 
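+# Illustrative note (not upstream code): the comparison prims above use the
+# ALWAYS_BOOL promotion kind, meaning the result dtype is torch.bool regardless
+# of the input dtypes, while DEFAULT-kind prims keep the promoted input dtype.
+# For example, assuming the usual eager semantics of the underlying ATen ops:
+#   >>> torch.lt(torch.tensor([1, 2]), torch.tensor([2, 2])).dtype   # torch.bool
+#   >>> torch.add(torch.tensor([1, 2]), torch.tensor([2, 2])).dtype  # torch.int64
+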
+# Note: the following impls are because torch.maximum and torch.minimum do not support scalar inputs +def _maximum_aten( + a: Union[TensorLikeType, NumberType], b: Union[TensorLikeType, NumberType] +) -> TensorLikeType: + if isinstance(a, TensorLike) and isinstance(b, Number): + b = scalar_tensor(b, dtype=a.dtype, device=a.device) + elif isinstance(b, TensorLike) and isinstance(a, Number): + a = scalar_tensor(a, dtype=b.dtype, device=b.device) + + return torch.maximum(a, b) # type: ignore[arg-type] + + +maximum = _make_elementwise_binary_prim( + "maximum", + impl_aten=_maximum_aten, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + + +def _minimum_aten( + a: Union[TensorLikeType, NumberType], b: Union[TensorLikeType, NumberType] +) -> TensorLikeType: + if isinstance(a, TensorLike) and isinstance(b, Number): + b = scalar_tensor(b, dtype=a.dtype, device=a.device) + elif isinstance(b, TensorLike) and isinstance(a, Number): + a = scalar_tensor(a, dtype=b.dtype, device=b.device) + + return torch.minimum(a, b) # type: ignore[arg-type] + + +minimum = _make_elementwise_binary_prim( + "minimum", + impl_aten=_minimum_aten, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +mul = _make_elementwise_binary_prim( + "mul", + impl_aten=torch.mul, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +ne = _make_elementwise_binary_prim( + "ne", + impl_aten=torch.ne, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.ALWAYS_BOOL, +) + +nextafter = _make_elementwise_binary_prim( + "nextafter", + impl_aten=torch.nextafter, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +pow = _make_elementwise_binary_prim( + "pow", + impl_aten=torch.pow, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +remainder = _make_elementwise_binary_prim( + "remainder", + impl_aten=torch.remainder, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + + +shift_left = _make_elementwise_binary_prim( + "shift_left", + impl_aten=torch.bitwise_left_shift, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +shift_right_arithmetic = _make_elementwise_binary_prim( + "shift_right_arithmetic", + impl_aten=torch.bitwise_right_shift, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +shift_right_logical = _not_impl + +sub = _make_elementwise_binary_prim( + "sub", + impl_aten=torch.sub, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + +zeta = _make_elementwise_binary_prim( + "zeta", + impl_aten=torch.special.zeta, + doc="", + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, +) + + +# +# View operations +def _as_strided_meta( + a: TensorLikeType, size: ShapeType, stride: StrideType, storage_offset: int +) -> TensorLikeType: + assert len(size) == len(stride) + assert storage_offset >= 0 + utils.validate_strides(stride) + utils.validate_shape(size) + + if reduce(operator.mul, size) == 0: + # NOTE: This special case is to avoid having to acquire the storage below + # as_strided to shapes with no elements are trivially valid, so it's OK + pass + elif isinstance(a, torch.Tensor): + utils.check_in_bounds_for_storage( + a._typed_storage(), size, stride, storage_offset + ) + + return torch.as_strided(a, size, stride, storage_offset) + + +def _as_strided_aten( + a: Tensor, size: ShapeType, stride: StrideType, storage_offset: int +) -> Tensor: + return torch.as_strided(a, size, stride, 
storage_offset) + + +_as_strided_doc = """ + Creates a view of the tensor with the given shape (size), strides (stride) and + storage offset (storage_offset). +""" + +as_strided = _make_prim( + schema="as_strided(Tensor(a!) a, SymInt[] size, SymInt[] stride, SymInt storage_offset) -> Tensor(a!)", + meta=_as_strided_meta, + impl_aten=_as_strided_aten, + return_type=RETURN_TYPE.VIEW, + doc=_as_strided_doc, +) + + +def _broadcast_in_dim_meta( + a: TensorLikeType, shape: ShapeType, broadcast_dimensions: Sequence[int] +): + from torch.fx.experimental.symbolic_shapes import guard_size_oblivious + + # Type checks + assert isinstance(a, TensorLike) + assert isinstance(shape, Sequence) + assert isinstance(broadcast_dimensions, Sequence) + + # every dimension must be accounted for + assert a.ndim == len(broadcast_dimensions) + + # broadcast shape must have weakly more dimensions + assert len(shape) >= a.ndim + + # broadcast_dimensions must be an ascending sequence + # (no relative reordering of dims) of integers and + # each dimension must be within the new shape + def _greater_than_reduce(acc, x): + assert isinstance(x, Dim) + assert x > acc + assert x < len(shape) + + return x + + reduce(_greater_than_reduce, broadcast_dimensions, -1) + + # shape must be broadcastable to + for idx, new_idx in enumerate(broadcast_dimensions): + if not guard_size_oblivious(a.shape[idx] == 1): + torch._check( + a.shape[idx] == shape[new_idx], + lambda: f"{a.shape[idx]} must be broadcastable to {shape[new_idx]}", + ) + + new_strides = [] + original_idx = 0 + for idx in range(len(shape)): + if idx in broadcast_dimensions: + # Assigns a stride of zero to dimensions + # which were actually broadcast + if guard_size_oblivious(a.shape[original_idx] != shape[idx]): + new_strides.append(0) + else: + new_strides.append(a.stride()[original_idx]) + original_idx = original_idx + 1 + else: + if guard_size_oblivious(shape[idx] != 1): + new_strides.append(0) + elif original_idx == a.ndim: + new_strides.append(1) + else: + new_strides.append(a.stride()[original_idx] * a.size()[original_idx]) + + return a.as_strided(shape, new_strides, a.storage_offset()) + + +def _broadcast_in_dim_aten(a, shape, broadcast_dimensions): + s = list(shape) + for broadcast_dimension in broadcast_dimensions: + s[broadcast_dimension] = -1 + + v = a + for idx, x in enumerate(s): + if x != -1: + v = v.unsqueeze(idx) + + return v.expand(shape) + + +_broadcast_in_dim_doc = """ + Creates a view of a with the specified shape. + + Allows adding dimensions of any length and broadcasting + dimensions of length one in a to any length. + + The location of the broadcast dimensions must be specified + using the broadcast_dimensions argument. Changing the + relative order of dimensions is not supported. 
+ """ + +broadcast_in_dim = _make_prim( + schema="broadcast_in_dim(Tensor(a) a, SymInt[] shape, int[] broadcast_dimensions) -> Tensor(a)", + meta=_broadcast_in_dim_meta, + impl_aten=_broadcast_in_dim_aten, + return_type=RETURN_TYPE.VIEW, + doc=_broadcast_in_dim_doc, +) + + +def _validate_collapse_args(a: Tensor, start: int, end: int) -> None: + # Special-case for zero dimensional tensors + ndim = max(1, a.dim()) + utils.validate_idx(ndim, start) + utils.validate_idx(ndim, end) + + # Verifies end is strictly greater than start + # (Collapse requires a non-empty interval) + torch._check_value( + end >= start, + lambda: f"Attempting to collapse but end, {end}, is less than start, {start}!", + ) + + +def _collapsed_shape(shape: ShapeType, start: int, end: int) -> Tuple[int, ...]: + """ + Returns the shape of a with dims in [start, end) merged into a single dimension. + """ + # Special-case for zero dimensional tensors + shape = (1,) if len(shape) == 0 else tuple(shape) + + dim_length = 1 + for s in shape[start : end + 1]: + dim_length = dim_length * s + + return shape[0:start] + (dim_length,) + shape[end + 1 :] + + +def _collapse_view_helper( + a: TensorLikeType, start: int, end: int +) -> Tuple[Optional[ShapeType], Optional[StrideType]]: + assert isinstance(a, TensorLike) + + from torch.fx.experimental.symbolic_shapes import guard_size_oblivious + + _validate_collapse_args(a, start, end) + + # Special-case for zero dimensional tensors + if a.ndim == 0: + shape = (1,) + strides = (1,) + else: + shape = a.shape # type: ignore[assignment] + strides = a.stride() # type: ignore[assignment] + + if a.ndim == 0 or (end == start): + return shape, strides + + length = shape[end] + stride = strides[end] + for idx in range(end - 1, start - 1, -1): + if guard_size_oblivious(shape[idx] == 0) or guard_size_oblivious( + shape[idx + 1] == 0 + ): + length = 0 + stride = 0 + break + + if guard_size_oblivious(shape[idx] == 1): + continue + + length = length * shape[idx] + stride = min(stride, strides[idx]) + + if ( + guard_size_oblivious(a.numel() > 0) + and guard_size_oblivious(shape[idx + 1] != 1) + and not guard_size_oblivious( + strides[idx] == strides[idx + 1] * shape[idx + 1] + ) + ): + return None, None + + new_shape = shape[:start] + (length,) + shape[end + 1 :] + new_strides = strides[:start] + (stride,) + strides[end + 1 :] + + # NOTE: when the input has no elements it's restrided as if it were contiguous + if guard_size_oblivious(a.numel() == 0): + new_strides = utils.make_contiguous_strides_for(new_shape) + + return new_shape, new_strides + + +def _collapse_view_meta(a: TensorLikeType, start: int, end: int) -> TensorLikeType: + new_shape, new_strides = _collapse_view_helper(a, start, end) + + if new_shape is None: + msg = "Attempting to view a collapsed tensor, but no such view exists!" + raise ValueError(msg) + + assert new_strides is not None + return a.as_strided(new_shape, new_strides, a.storage_offset()) + + +def _collapse_view_aten(a: Tensor, start: int, end: int) -> Tensor: + new_shape = _collapsed_shape(a.shape, start, end) + return a.view(new_shape) + + +_collapse_view_doc = """ + Creates a view of a with the dimensions between + start (inclusive) and end (exclusive) merged into a + single dimension. + + If it's not possible to take such a view then an error + is thrown. See collapse instead. + + The dimensions can be merged if and only if + they are all "nested" with each other. 
That is, they all + have the property that + + stride[i] = stride[i+1] * shape[i+1] + + for all i in [start, end - 1). + """ + +collapse_view = _make_prim( + schema="collapse_view(Tensor(a) a, int start, int end) -> Tensor(a)", + meta=_collapse_view_meta, + impl_aten=_collapse_view_aten, + return_type=RETURN_TYPE.VIEW, + doc=_collapse_view_doc, +) + + +def _conj_meta(a: TensorLikeType) -> TensorLikeType: + if not a.dtype.is_complex: + raise RuntimeError("Expected complex dtype in prims.conj") + out = a.as_strided(a.shape, a.stride(), a.storage_offset()) + torch._C._set_conj(out, not a.is_conj()) + return out + + +_conj_doc = """ +Returns a conjugated view of the original tensor +""" + +conj = _make_prim( + schema="conj(Tensor(a) a) -> Tensor(a)", + meta=_conj_meta, + impl_aten=torch.conj, + return_type=RETURN_TYPE.VIEW, + doc=_conj_doc, +) + + +def expand_dims( + a: TensorLikeType, dimensions: DimsSequenceType, ndim=None +) -> TensorLikeType: + """ + Creates a view of a with a.ndim + len(dimensions) dimensions, with new + dimensions of length one at the dimensions specified by dimensions. + """ + if ndim is not None: + # TODO: this is only here to support the unsqueeze ref + dims = sorted(utils.canonicalize_dims(ndim, dimensions)) # type: ignore[arg-type] + else: + dims = sorted(utils.canonicalize_dims(a.ndim, dimensions)) # type: ignore[arg-type] + if len(set(dims)) != len(dims): + msg = f"Received duplicate dimensions to expand in {str(dimensions)}" + raise ValueError(msg) + + new_shape = list(a.shape) + for idx in dims: + new_shape.insert(idx, 1) + + broadcast_dimensions = [ + idx for idx in range(len(new_shape)) if idx not in dimensions + ] + return broadcast_in_dim(a, new_shape, broadcast_dimensions) + + +# Note: saves the Python slice object because we're about to clobber its name with the slice prim +pyslice: Type[slice] = slice # type: ignore[has-type] + + +def _slice_meta( + a: TensorLikeType, + start_indices: DimsSequenceType, + limit_indices: DimsSequenceType, + strides: Optional[StrideType] = None, +) -> TensorLikeType: + _strides = strides if strides is not None else [1] * len(start_indices) + + if a.ndim != len(start_indices): + msg = f"Attempting to slice tensor of rank {a.ndim} with start_indices of length {len(start_indices)}!" + raise ValueError(msg) + + if a.ndim != len(limit_indices): + msg = f"Attempting to slice tensor of rank {a.ndim} with limit_indices of length {len(limit_indices)}!" + raise ValueError(msg) + + if a.ndim != len(_strides): + msg = f"Attempting to slice tensor of rank {a.ndim} with strides of length {len(limit_indices)}!" + raise ValueError(msg) + + for x, y in zip(start_indices, a.shape): + if x < 0: + msg = f"Attempting to slice a tensor with a negative start index of {x}!" + raise ValueError(msg) + if x > y: + msg = ( + f"Attempting to slice a tensor but a start index in {start_indices} is greater than" + f" the length of its corresponding dimension in shape {a.shape}" + ) + raise ValueError(msg) + + for x, y, z in zip(limit_indices, a.shape, start_indices): + if x < 0: + msg = f"Attempting to slice a tensor with a negative stop index of {x}!" 
+ raise ValueError(msg) + if x > y: + msg = ( + f"Attempting to slice a tensor but a stop index in {limit_indices} is greater than the length of " + f" its corresponding dimension in shape {a.shape}" + ) + raise ValueError(msg) + if x < z: + msg = ( + f"Attempting to slice a tensor but a start index in {x} is greater than " + f" its corresponding stop index {z}" + ) + + for x in _strides: + if x <= 0: + msg = f"Attempting to slice a tensor with a non-positive step of {x}!" + raise ValueError(msg) + + new_shape = [] + for x, y, z in zip(start_indices, limit_indices, _strides): + new_shape.append(1 + (y - x - 1) // z) + + new_strides = [] + for x, y in zip(a.stride(), _strides): + new_strides.append(x * y) + + return a.as_strided(new_shape, new_strides, a.storage_offset()) + + +def _slice_aten( + a: Tensor, + start_indices: DimsSequenceType, + limit_indices: DimsSequenceType, + strides: Optional[StrideType] = None, +) -> Tensor: + _strides = strides if strides is not None else [1] * len(start_indices) + + slices = [] + for start, stop, step in zip(start_indices, limit_indices, _strides): + slices.append(pyslice(start, stop, step)) + + return operator.getitem(a, slices) # type: ignore[call-overload] + + +_slice_doc = """ + Creates a view of a "bounding box" within the tensor. + + The bounding box is specified independently in each of the tensor's dimensions. + start_indices and limit_indices describe the box's boundaries for their corresponding + dimensions. If strides is specified then they specify the step size between elements + in their corresponding dimension. + + This operation is analogous to slicing in NumPy, but does not permit slices where + the stop indices are less than the start indices. + """ + +slice = _make_prim( + schema="slice(Tensor(a) a, SymInt[] start_indices, SymInt[] limit_indices, SymInt[]? strides=None) -> Tensor(a)", + meta=_slice_meta, + impl_aten=_slice_aten, + return_type=RETURN_TYPE.VIEW, + doc=_slice_doc, +) + + +def _slice_in_dim_meta( + a: TensorLikeType, + start_index: int, + limit_index: int, + stride: int = 1, + axis: int = 0, +) -> TensorLikeType: + if axis < 0: + msg = f"slice_in_dim: received a negative axis {axis}" + raise ValueError(msg) + if axis >= a.ndim: + msg = f"slice_in_dim: axis {axis} is greater or equal to the rank {a.ndim} of the tensor" + raise ValueError(msg) + + if start_index < 0: + msg = f"slice_in_dim: received a negative start_index {start_index}" + raise ValueError(msg) + + if start_index > a.shape[axis]: + msg = f"slice_in_dim: start_index is greater than the length {start_index} of dimension {axis}" + raise ValueError(msg) + + if limit_index > a.shape[axis]: + msg = f"slice_in_dim: limit_index is greater than the length {limit_index} of dimension {axis}" + raise ValueError(msg) + + if limit_index < start_index: + msg = f"slice_in_dim: received a limit_index {limit_index} less than the start_index {start_index}" + raise ValueError(msg) + + if stride < 0: + msg = f"slice_in_dim: received a non-positive stride of {stride}!" 
+ raise ValueError(msg) + + start_indices = [0] * a.ndim + limit_indices = list(a.shape) + strides = [1] * a.ndim + + start_indices[axis] = start_index + limit_indices[axis] = limit_index + strides[axis] = stride + + return _slice_meta(a, start_indices, limit_indices, strides) + + +def _slice_in_dim_aten( + a: Tensor, + start_index: int, + limit_index: int, + stride: int = 1, + axis: int = 0, +) -> Tensor: + start_indices = [0] * a.ndim + limit_indices = list(a.shape) + strides = [1] * a.ndim + + start_indices[axis] = start_index + limit_indices[axis] = limit_index + strides[axis] = stride + + return slice(a, start_indices, limit_indices, strides) + + +_slice_in_dim_doc = """ + Convenience wrapper for slicing just one dimension using slice. + """ + +# TODO: make stride SymInt +slice_in_dim = _make_prim( + schema="slice_in_dim(Tensor(a) a, SymInt start_index, SymInt limit_index, int stride=1, int axis=0) -> Tensor(a)", + meta=_slice_in_dim_meta, + impl_aten=_slice_in_dim_aten, + return_type=RETURN_TYPE.VIEW, + doc=_slice_in_dim_doc, +) + + +def _split_dim_meta(a: TensorLikeType, dim: int, outer_length: int) -> TensorLikeType: + assert isinstance(a, TensorLike) + utils.validate_idx(a.ndim, dim) + utils.validate_dim_length(outer_length) + + # Verifies the dim can be split with the specified lhs_length + inner_length = a.shape[dim] // outer_length + + if (a.shape[dim] % outer_length) != 0: + msg = "Attempting to split dimension of length {}, but outer length of {} divides it with a remainder!".format( + a.shape[dim], outer_length + ) + raise ValueError(msg) + + new_shape: List[int] = [] + new_strides: List[int] = [] + for idx in range(a.ndim): + if idx == dim: + new_shape.extend((outer_length, inner_length)) + new_strides.extend((a.stride()[idx] * inner_length, a.stride()[idx])) + else: + new_shape.append(a.shape[idx]) + new_strides.append(a.stride()[idx]) + + return a.as_strided(new_shape, new_strides, a.storage_offset()) + + +def _split_dim_aten(a: Tensor, dim: int, outer_length: int) -> Tensor: + inner_length = a.shape[dim] // outer_length + new_shape = a.shape[0:dim] + (outer_length, inner_length) + a.shape[dim + 1 :] + + return a.view(new_shape) + + +_split_dim_doc = """ + Creates a view of a with the given dimension (of length l) split + into two dimensions, with the outer of the two having + length outer_length and the inner of the two having computed + length inner_length such outer_length * inner_length = l. + """ + +# TODO: consider renaming split_dim_view +split_dim = _make_prim( + schema="split_dim(Tensor(a) a, int dim, SymInt outer_length) -> Tensor(a)", + meta=_split_dim_meta, + impl_aten=_split_dim_aten, + return_type=RETURN_TYPE.VIEW, + doc=_split_dim_doc, +) + + +# Note: allows dimensions to be specified redundantly +def _squeeze_meta(a: TensorLikeType, dimensions: Sequence) -> TensorLikeType: + assert isinstance(a, TensorLike) + + for idx in dimensions: + utils.validate_idx(a.ndim, idx) + assert a.shape[idx] == 1 + + new_shape = [] + new_strides = [] + for idx in range(len(a.shape)): + if idx in dimensions: + continue + + new_shape.append(a.shape[idx]) + new_strides.append(a.stride()[idx]) + + return a.as_strided(new_shape, new_strides, a.storage_offset()) + + +_squeeze_doc = """ + Creates a view of the tensor with the specified dimensions removed. + + The removed dimensions must each have length one. 
+ """ + +squeeze = _make_prim( + schema="squeeze(Tensor(a) a, int[] dimensions) -> Tensor(a)", + meta=_squeeze_meta, + impl_aten=torch.squeeze, + return_type=RETURN_TYPE.VIEW, + doc=_squeeze_doc, +) + + +def _transpose_meta(a: TensorLikeType, permutation: DimsSequenceType) -> TensorLikeType: + if a.ndim != len(permutation): + msg = "Attempting to permute a tensor of rank {}, but received a permutation of length {}!".format( + a.ndim, len(permutation) + ) + raise ValueError(msg) + + if not utils.is_valid_permutation(a.ndim, permutation): + msg = f"Received an invalid permutation, {permutation}!" + raise ValueError(msg) + + new_shape = [0] * a.ndim + new_strides = [0] * a.ndim + for idx, dim in enumerate(permutation): + new_shape[idx] = a.shape[dim] + new_strides[idx] = a.stride()[dim] + + return a.as_strided(tuple(new_shape), tuple(new_strides), a.storage_offset()) + + +def _transpose_aten(a: Tensor, permutation: DimsSequenceType) -> Tensor: + return torch.permute(a, permutation) + + +_transpose_doc = """ + Creates a view of the tensor with its dimensions permuted. + + The length of the permutation must be the rank of the tensor, + and each element of the permutation specifies the new order + for the corresponding dimension. + """ + +transpose = _make_prim( + schema="transpose(Tensor(a) a, int[] permutation) -> Tensor(a)", + meta=_transpose_meta, + impl_aten=_transpose_aten, + return_type=RETURN_TYPE.VIEW, + doc=_transpose_doc, +) + + +def _view_of_meta(a: TensorLikeType) -> TensorLikeType: + return a.as_strided(a.shape, a.stride(), a.storage_offset()) + + +def _view_of_aten(a: Tensor) -> Tensor: + return a.view(a.shape) + + +_view_of_doc = """ + Creates a view of the tensor. + """ + +view_of = _make_prim( + schema="view_of(Tensor(a) a) -> Tensor", + meta=_view_of_meta, + impl_aten=_view_of_aten, + return_type=RETURN_TYPE.VIEW, + doc=_view_of_doc, +) + + +def _view_element_type_meta(a: TensorLikeType, dtype: torch.dtype) -> TensorLikeType: + return a.view(dtype) + + +def _view_element_type_aten(a: Tensor, dtype: torch.dtype) -> Tensor: + return a.view(dtype) + + +_view_element_type_doc = """ + Creates a view of the tensor with a different dtype. + """ + +view_element_type = _make_prim( + schema="view_of_dtype(Tensor(a) a, ScalarType dtype) -> Tensor", + meta=_view_element_type_meta, + impl_aten=_view_element_type_aten, + return_type=RETURN_TYPE.VIEW, + doc=_view_element_type_doc, +) + +# +# Functionalized view mutations +# + + +def _as_strided_scatter_meta( + input: TensorLikeType, + src: TensorLikeType, + size: ShapeType, + stride: StrideType, + storage_offset: int, +) -> TensorLikeType: + utils.validate_shape(size) + utils.validate_strides(stride) + + required_size = utils.compute_required_storage_length(size, stride, storage_offset) + torch._check( + input.numel() >= required_size, + lambda: ( + f"as_strided_scatter: sizes {size}, strides {stride}, storage offset {storage_offset} " + f" and itemsize {input.element_size()} requiring a storage size of " + f"{required_size * input.element_size()} are out of bounds " + f"for storage of size {input.numel() * input.element_size()}" + ), + ) + torch._check( + utils.is_same_shape(src.shape, size), + lambda: f"expected src to have a size equal to the slice of self. src size = {src.shape}, slice size = {size}", + ) + + return utils.clone_preserve_strides(input) + + +_as_strided_scatter_doc = """ + Creates a new tensor equivalent to ``out = input.clone()`` after mutation by + ``out.as_strided(size, stride, storage_offset).copy_(src)``. 
+""" + +as_strided_scatter = _make_prim( + schema="as_strided_scatter(Tensor self, Tensor src, SymInt[] size, SymInt[] stride, SymInt storage_offset) -> Tensor", + meta=_as_strided_scatter_meta, + impl_aten=torch.as_strided_scatter, + return_type=RETURN_TYPE.NEW, + doc=_as_strided_scatter_doc, +) + + +# +# Shape operations +# + + +def _collapse_meta(a: Tensor, start: int, end: int) -> Tensor: + # Special-case for zero dimensional tensors + _validate_collapse_args(a, start, end) + new_shape = _collapsed_shape(a.shape, start, end) + return a.new_empty(new_shape) + + +def _collapse_aten(a: Tensor, start: int, end: int) -> Tensor: + new_shape = _collapsed_shape(a.shape, start, end) + out = a.new_empty(new_shape) + with torch.no_grad(): + out.view_as(a).copy_(a) + return out + + +_collapse_doc = """ +Collapse a span of neighboring dimensions into one. + +See collapse_view for the corresponding view operation. +""" +collapse = _make_prim( + schema="collapse(Tensor a, int start, int end) -> Tensor", + meta=_collapse_meta, + impl_aten=_collapse_aten, + return_type=RETURN_TYPE.NEW, + doc=_collapse_doc, +) + + +# TODO: review stride logic +# NB: unlike torch.cat, this is more strict about empty tensors and dim is +# never negative +def _cat_meta(tensors: Sequence[TensorLikeType], dim: int) -> TensorLikeType: + # Verifies same shape (except in the concat dimension) + assert dim >= 0 + shape = tensors[0].shape + concat_length = 0 + for tensor_idx, tensor in enumerate(tensors): + assert len(shape) == len(tensor.shape) + for idx, (common_length, length) in enumerate(zip(shape, tensor.shape)): + if idx == dim: + concat_length = concat_length + length + else: + torch._check( + length == common_length, + lambda: f"Sizes of tensors must match except in dimension {dim}. " + f"Expected {common_length} but got {length} for tensor number " + f"{tensor_idx} in the list", + ) + + new_shape = list(tensors[0].shape).copy() + new_shape[dim] = concat_length + return TensorMeta( + tensors[0], + shape=new_shape, + strides=utils.make_contiguous_strides_for(new_shape), + ) + + +def _cat_aten(tensors: Union[Tuple[Tensor, ...], List[Tensor]], dim: int) -> Tensor: + return torch.cat(tensors, dim) + + +_cat_doc = """ + Concatenates tensors along the specified dimension. + + The tensors' shapes must have the same rank and same length for other dimensions. + """ + +cat = _make_prim( + schema="cat(Tensor[] tensors, int dim) -> Tensor", + meta=_cat_meta, + impl_aten=_cat_aten, + return_type=RETURN_TYPE.NEW, + doc=_cat_doc, +) + + +def _reshape_meta(a: TensorLikeType, shape: ShapeType): + assert isinstance(a, TensorLike) + utils.validate_shape(shape) + + # Validates the tensor and the requested shape have the + # same number of elements + numel = reduce(operator.mul, shape) + if numel != a.numel(): + msg = f"Attempting to reshape a tensor with {a.numel()} elements to a shape with {numel} elements!" + raise ValueError(msg) + + return TensorMeta(a, shape=shape, strides=utils.make_contiguous_strides_for(shape)) + + +def _reshape_aten(a: Tensor, shape: ShapeType) -> Tensor: + return a.reshape(shape).contiguous().clone() + + +_reshape_doc = """ + Creates a contiguous tensor with the specified shape + containing a copy of the data in a. 
+ """ +reshape = _make_prim( + schema="reshape(Tensor a, SymInt[] shape) -> Tensor", + meta=_reshape_meta, + impl_aten=_reshape_aten, + return_type=RETURN_TYPE.NEW, + doc=_reshape_doc, +) + + +def _rev_meta(a: TensorLikeType, dims: DimsSequenceType) -> TensorLikeType: + utils.validate_dimension_indices(a.ndim, dims) + return torch.empty_like(a, memory_format=torch.preserve_format) + + +_rev_doc = """ + Reverses the order of elements along the given dimensions. + """ + +rev = _make_prim( + schema="rev(Tensor a, int[] dims) -> Tensor", + meta=_rev_meta, + impl_aten=torch.flip, + return_type=RETURN_TYPE.NEW, + doc=_rev_doc, +) + +# +# Conditional prims +# + + +def _where_meta( + pred: TensorLikeType, a: TensorLikeType, b: TensorLikeType +) -> TensorLikeType: + return _prim_elementwise_meta( + a, + b, + type_promotion=ELEMENTWISE_PRIM_TYPE_PROMOTION_KIND.DEFAULT, + args_with_fixed_dtypes=(pred,), + ) + + +_where_doc = """ + Selects elements from a and b according to pred. + + Where pred is true the result contains the element from a, and + where pred is false the result contains the element from b. + """ + +where = _make_prim( + schema="where(Tensor pred, Tensor a, Tensor b) -> Tensor", + meta=_where_meta, + impl_aten=torch.where, + return_type=RETURN_TYPE.NEW, + doc=_where_doc, +) + + +# +# Type conversions +# +def _convert_element_type_meta(a: TensorLikeType, dtype: torch.dtype) -> TensorLikeType: + # Type checks + assert isinstance(a, TensorLike) + assert isinstance(dtype, torch.dtype) + + # dtype conversion preserves dense strides + if torch._prims_common.is_non_overlapping_and_dense(a): + strides = a.stride() + else: + strides = utils.compute_elementwise_output_strides(a) + + return TensorMeta(a, strides=strides, dtype=dtype) + + +def _convert_element_type_aten(a: Tensor, dtype: torch.dtype) -> Tensor: + # Propagates requires grad when possible + if not utils.is_grad_dtype(dtype): + requires_grad = False + else: + # TODO: update meta objects so this can be acquired directly + try: + requires_grad = a.requires_grad + except Exception as e: + requires_grad = False + + result = torch.empty_like( + a, device=a.device, dtype=dtype, requires_grad=requires_grad + ) + with torch.no_grad(): + return copy_to(result, a) + + +_convert_element_type_doc = """ + Creates a copy of a tensor with the given dtype. + """ + +convert_element_type = _make_prim( + schema="convert_element_type(Tensor a, ScalarType dtype) -> Tensor", + meta=_convert_element_type_meta, + impl_aten=_convert_element_type_aten, + return_type=RETURN_TYPE.NEW, + doc=_convert_element_type_doc, + tags=(torch.Tag.pointwise,), +) + + +def _device_put_meta( + a: TensorLikeType, device: Union[str, torch.device] +) -> TensorLikeType: + assert isinstance(a, TensorLike) + assert isinstance(device, (str, torch.device)) + + return TensorMeta(a, device=utils.canonicalize_device(device)) + + +def _device_put_aten(a: Tensor, device: Union[str, torch.device]) -> Tensor: + return a.to(device) + + +_device_put_doc = """ + Creates a copy of a tensor on the given device. 
+ """ + +device_put = _make_prim( + schema="device_put(Tensor a, Device device) -> Tensor", + meta=_device_put_meta, + impl_aten=_device_put_aten, + return_type=RETURN_TYPE.NEW, + doc=_device_put_doc, +) + + +# NOTE: need to model meta scalars +# See https://github.com/pytorch/pytorch/issues/78070 +def _item_meta(a: TensorLikeType) -> FakeTensor: + number_type = utils.dtype_to_type(a.dtype) + return TensorMeta(number_type(-1)) + + +_item_doc = """ + Converts a tensor with one element to a Python number. +""" + +# TODO: create a new return type for scalars? +# FIXME: currently returns integers for boolean tensors +# https://github.com/pytorch/pytorch/issues/78071 +item = _make_prim( + schema="item(Tensor a) -> Scalar", + meta=_item_meta, + impl_aten=torch.Tensor.item, + return_type=RETURN_TYPE.NEW, + doc=_item_doc, +) + + +# NOTE: need to model meta scalars +# See https://github.com/pytorch/pytorch/issues/78070 +def _maximum_value_meta(dtype: torch.dtype) -> FakeTensor: + number_type = utils.dtype_to_type(dtype) + return TensorMeta(number_type(-1)) + + +def _maximum_value_aten(dtype: torch.dtype): + if dtype == torch.bool: + return True + elif dtype.is_complex or dtype.is_floating_point: + return torch.finfo(dtype).max + else: + return torch.iinfo(dtype).max + + +_maximum_value_doc = """ + Return the maximum finite value for a dtype. +""" + +# TODO: create a new return type for scalars? +# FIXME: currently returns integers for boolean tensors +# https://github.com/pytorch/pytorch/issues/78071 +maximum_value = _make_prim( + schema="maximum_value(ScalarType dtype) -> Scalar", + meta=_maximum_value_meta, + impl_aten=_maximum_value_aten, + return_type=RETURN_TYPE.NEW, + doc=_maximum_value_doc, +) + + +# NOTE: need to model meta scalars +# See https://github.com/pytorch/pytorch/issues/78070 +def _minimum_value_meta(dtype: torch.dtype) -> FakeTensor: + number_type = utils.dtype_to_type(dtype) + return TensorMeta(number_type(-1)) + + +def _minimum_value_aten(dtype: torch.dtype): + if dtype == torch.bool: + return False + elif dtype.is_complex or dtype.is_floating_point: + return torch.finfo(dtype).min + else: + return torch.iinfo(dtype).min + + +_minimum_value_doc = """ + Return the minimum finite value for a dtype. +""" + +# TODO: create a new return type for scalars? +# FIXME: currently returns integers for boolean tensors +# https://github.com/pytorch/pytorch/issues/78071 +minimum_value = _make_prim( + schema="minimum_value(ScalarType dtype) -> Scalar", + meta=_minimum_value_meta, + impl_aten=_minimum_value_aten, + return_type=RETURN_TYPE.NEW, + doc=_minimum_value_doc, +) + +# +# Inplace operators +# + + +def _copy_to_meta(a: TensorLikeType, b: TensorLikeType): + assert isinstance(a, TensorLike) + assert isinstance(b, TensorLike) + + # Validates the cast is safe + # TODO: move this as an option on the reference + # a_typ = utils.dtype_to_type(a.dtype) + # b_typ = utils.dtype_to_type(b.dtype) + # if a_typ is not utils.get_higher_type(a_typ, b_typ): + # raise RuntimeError(str(b.dtype), " can't be cast safely to ", str(a.dtype), "!") + + # Validates the tensors have the same number of elements + if a.numel() != b.numel(): + msg = f"Attempting to copy {b.numel()} elements to a tensor with {a.numel()} elements!" + raise RuntimeError(msg) + + return a + + +def _copy_to_aten(a: Tensor, b: Tensor) -> Tensor: + return a.copy_(b) + + +_copy_to_doc = """ + Copies the data in b to a and returns the modified a. 
+ """ + +# TODO: Remove safe casting and implement on reference instead +copy_to = _make_prim( + schema="copy_to(Tensor(a!) a, Tensor b) -> Tensor(a!)", + meta=_copy_to_meta, + impl_aten=_copy_to_aten, + return_type=RETURN_TYPE.INPLACE, + doc=_copy_to_doc, +) + + +def _copy_strided_meta(a: TensorLikeType, stride: ShapeType): + assert isinstance(a, TensorLike) + return torch.empty_strided( + a.shape, + stride, + dtype=a.dtype, + layout=a.layout, + device=a.device, + requires_grad=a.requires_grad, + ) + + +def _copy_strided_aten(a: Tensor, stride: ShapeType) -> Tensor: + out = torch.empty_strided( + a.size(), + stride=stride, + dtype=a.dtype, + layout=a.layout, + device=a.device, + requires_grad=a.requires_grad, + ) + out.copy_(a) + return out + + +_copy_strided_doc = """ + Copies the data in a to a new tensor, the new tensor has same shape with a size, but has different stride. + """ + + +copy_strided = _make_prim( + schema="copy_strided(Tensor a, SymInt[] stride) -> Tensor", + meta=_copy_strided_meta, + impl_aten=_copy_strided_aten, + return_type=RETURN_TYPE.NEW, + doc=_copy_strided_doc, +) + + +def _resize_meta(a: TensorLikeType, shape: ShapeType): + return a.resize_(shape) + + +def _resize_aten(a: Tensor, shape: ShapeType) -> Tensor: + return a.resize_(shape) + + +_resize_doc = """ + Gives a tensor with no elements a new shape, returning the modified tensor. + + The tensor's strides are contiguous and its values are unitialized. + """ + +# TODO: review support arbitrary resizes +resize = _make_prim( + schema="resize(Tensor(a!) a, SymInt[] shape) -> Tensor(a!)", + meta=_resize_meta, + impl_aten=_resize_aten, + return_type=RETURN_TYPE.INPLACE, + doc=_resize_doc, +) + + +def _reduction_meta(inp, dims, *, output_dtype=None): + """ + Meta function for single output reduction operations + Stride logic is incorrect + """ + assert isinstance(inp, TensorLike) + if output_dtype is None: + output_dtype = inp.dtype + output_shape = utils.compute_reduction_output_shape(inp.shape, dims) + return TensorMeta( + shape=output_shape, + strides=utils.make_contiguous_strides_for(output_shape), + dtype=output_dtype, + device=inp.device, + ) + + +def _var_reduction_meta(inp, dims, *, correction): + if utils.is_complex_dtype(inp.dtype): + output_dtype = utils.corresponding_real_dtype(inp.dtype) + else: + output_dtype = inp.dtype + return _reduction_meta(inp, dims, output_dtype=output_dtype) + + +_sum_doc = """ + Computes the sum of elements in the input tensor over the list of dimensions + specified in the dim argument + """ +_xor_sum_doc = """ + Computes the xor sum of elements in the input tensor over the list of dimensions + specified in the dim argument + """ +_prod_doc = """ + Computes the product of elements in the input tensor over the list of dimensions + specified in the dim argument + """ +_amax_doc = """ + Computes the maximum value of elements in the input tensor over the list of dimensions + specified in the dim argument + """ +_amin_doc = """ + Computes the minimum value of elements in the input tensor over the list of dimensions + specified in the dim argument + """ +_var_doc = """ + Computes the biased variance of x over the list of dimensions specified in the dim argument + """ + + +def _make_reduction_prim(name: str, impl_aten, doc): + """Creates a reduction prim.""" + return _make_prim( + schema=f"{name}(Tensor inp, int[]? dims, *, ScalarType? 
output_dtype=None) -> Tensor", + meta=_reduction_meta, + impl_aten=impl_aten, + return_type=RETURN_TYPE.NEW, + doc=doc, + ) + + +def _make_var_reduction_prim(name: str, impl_aten, doc): + """Creates a reduction prim.""" + return _make_prim( + schema=f"{name}(Tensor inp, int[]? dims, *, float correction, ScalarType? output_dtype=None) -> Tensor", + meta=_var_reduction_meta, + impl_aten=impl_aten, + return_type=RETURN_TYPE.NEW, + doc=doc, + ) + + +sum = _make_reduction_prim( + name="sum", + impl_aten=torch.sum, + doc=_sum_doc, +) + + +def _xor_sum_aten( + inp: TensorLikeType, + dims: Optional[DimsSequenceType], + *, + dtype: Optional[torch.dtype] = None, +) -> Tensor: + raise NotImplementedError("xor_sum only implemented with inductor") + + +xor_sum = _make_reduction_prim( + name="xor_sum", + impl_aten=_xor_sum_aten, + doc=_xor_sum_doc, +) + + +def _prod_aten( + inp: TensorLikeType, + dims: Optional[DimsSequenceType], + *, + dtype: Optional[torch.dtype] = None, +) -> Tensor: + if dims is not None: + for d in sorted(dims, reverse=True): + assert d >= 0 + inp = torch.prod(inp, d, dtype=dtype) + return inp + else: + return torch.prod(inp, dims, dtype=dtype) + + +prod = _make_reduction_prim( + name="prod", + impl_aten=_prod_aten, + doc=_prod_doc, +) + +var = _make_var_reduction_prim( + name="var", + impl_aten=torch.var, + doc=_var_doc, +) + +amax = _make_reduction_prim( + name="amax", + impl_aten=torch.amax, + doc=_amax_doc, +) + +amin = _make_reduction_prim( + name="amin", + impl_aten=torch.amin, + doc=_amin_doc, +) + + +_iota_doc = """ + Constructs a 1-D tensor t where ``t[i] == start + i * step``. +""" + + +# TODO: layout, pin_memory, memory_format +# TODO: model requires_grad on TensorMeta +def _iota_meta( + length: int, + *, + start: int, + step: int, + dtype: torch.dtype, + device: torch.device, + requires_grad: bool, +) -> TensorLikeType: + torch._check( + utils.is_integer_dtype(dtype), + lambda: "prims.iota only supports integer dtypes", + ) + torch._check(step != 0, lambda: "step must be nonzero") + return torch.empty( + length, + dtype=dtype, + device=device, + requires_grad=requires_grad, + ) + + +def _iota_aten( + length: int, + *, + start: int, + step: int, + dtype: torch.dtype, + device: torch.device, + requires_grad: bool, +) -> TensorLikeType: + end = start + length * step + return torch.arange( + start, end, step, dtype=dtype, device=device, requires_grad=requires_grad + ) + + +iota = _make_prim( + schema="iota(SymInt length, *, SymInt start, SymInt step, ScalarType dtype, Device device, bool requires_grad) -> Tensor", # noqa: B950 + return_type=RETURN_TYPE.NEW, + meta=_iota_meta, + impl_aten=_iota_aten, + doc=_iota_doc, +) + + +# TODO: layout, pin_memory, memory_format +# TODO: model requires_grad on TensorMeta +def _empty_meta( + shape: ShapeType, *, dtype: torch.dtype, device: torch.device, requires_grad: bool +) -> TensorLikeType: + strides = utils.make_contiguous_strides_for(shape) + return TensorMeta(shape=shape, strides=strides, dtype=dtype, device=device) + + +def _empty_aten( + shape: ShapeType, *, dtype: torch.dtype, device: torch.device, requires_grad: bool +) -> Tensor: + return torch.empty(shape, dtype=dtype, device=device, requires_grad=requires_grad) + + +_empty_doc = """ + Creates a tensor with uninitialized values and the specified shape, dtype, and device. 
+""" + +empty = _make_prim( + schema="empty(SymInt[] shape, *, ScalarType dtype, Device device, bool requires_grad) -> Tensor", + meta=_empty_meta, + impl_aten=_empty_aten, + return_type=RETURN_TYPE.NEW, + doc=_empty_doc, +) + + +def _empty_strided_meta( + shape: ShapeType, + strides: StrideType, + *, + dtype: torch.dtype, + device: torch.device, + requires_grad: bool, +) -> TensorLikeType: + return TensorMeta(shape=shape, strides=strides, dtype=dtype, device=device) + + +_empty_strided_doc = """ + Creates a tensor with uninitialized values. +""" + +# TODO: add layout, pin_memory +empty_strided = _make_prim( + schema="empty_strided(SymInt[] shape, SymInt[] strides, *, ScalarType dtype, Device device, bool requires_grad) -> Tensor", + return_type=RETURN_TYPE.NEW, + meta=_empty_strided_meta, + impl_aten=torch.empty_strided, + doc=_empty_strided_doc, +) + + +def _empty_permuted_meta( + shape: ShapeType, + physical_layout: DimsSequenceType, + *, + dtype: torch.dtype, + device: torch.device, + requires_grad: bool, +) -> TensorLikeType: + p_strides = utils.make_contiguous_strides_for([shape[l] for l in physical_layout]) + dim = len(shape) + torch._check( + len(physical_layout) == dim, + lambda: ( + "Number of dimensions in the tensor input does not match the " + f"length of the physical layout; i.e. len(size) = {dim} " + f"is not equal to len(physical_layout) = {len(physical_layout)}" + ), + ) + strides = [0] * len(shape) + seen_dims = set() + for p, l in enumerate(physical_layout): + torch._check( + 0 <= l < dim, + lambda: ( + f"Dimension out of range (expected to be between 0 and {dim - 1}, but got " + f"{l} at index {p}). NB: negative dims " + "not currently supported; file an issue if you want it." + ), + ) + torch._check(l not in seen_dims, lambda: "Duplicate dim not allowed") + strides[l] = p_strides[p] + seen_dims.add(l) + return TensorMeta( + shape=shape, + strides=strides, + dtype=dtype, + device=device, + ) + + +_empty_permuted_doc = """ + Creates a tensor with uninitialized values according to some physical layout, + that is guaranteed to be non-overlapping and dense. +""" + +# TODO: add layout, pin_memory +empty_permuted = _make_prim( + schema="empty_permuted(SymInt[] shape, int[] physical_layout, *, ScalarType dtype, Device device, bool requires_grad) -> Tensor", # noqa: B950 + return_type=RETURN_TYPE.NEW, + meta=_empty_permuted_meta, + impl_aten=torch.empty_permuted, + doc=_empty_permuted_doc, +) + + +def _full_meta( + shape: ShapeType, + fill_value: NumberType, + *, + dtype: torch.dtype, + device: torch.device, + requires_grad: bool, +) -> TensorLikeType: + strides = utils.make_contiguous_strides_for(shape) + return TensorMeta(shape=shape, strides=strides, dtype=dtype, device=device) + + +def _full_aten( + shape: ShapeType, + fill_value: NumberType, + *, + dtype: torch.dtype, + device: torch.device, + requires_grad: bool, +) -> Tensor: + # Note that Mypy thinks torch.full can't accept a complex fill_value + return torch.full( + shape, fill_value, dtype=dtype, device=device, requires_grad=requires_grad # type: ignore[arg-type] + ) + + +_full_doc = """ + Creates a tensor filled with the given fill value, and with the specified shape, dtype, and device. 
+""" + +# TODO: add layout +full = _make_prim( + schema="full(SymInt[] shape, Scalar fill_value, *, ScalarType dtype, Device device, bool requires_grad) -> Tensor", + meta=_full_meta, + impl_aten=_full_aten, + return_type=RETURN_TYPE.NEW, + doc=_full_doc, +) + + +def _full_like_meta( + a: TensorLikeType, + fill_value: NumberType, + *, + dtype: torch.dtype, + device: torch.device, + requires_grad: bool, +) -> TensorLikeType: + strides = utils.compute_elementwise_output_strides(a) + if a.numel() == 0: + strides = a.stride() + + return TensorMeta(a, strides=strides, dtype=dtype, device=device) + + +def _full_like_aten( + a: Tensor, + fill_value: NumberType, + *, + dtype: torch.dtype, + device: torch.device, + requires_grad: bool, +) -> Tensor: + # Note that Mypy thinks torch.full can't accept a complex fill_value + return torch.full_like( + a, fill_value, dtype=dtype, device=device, requires_grad=requires_grad # type: ignore[arg-type] + ) + + +_full_like_doc = """ + Creates a tensor filled with the given fill value, and the same shape, dtype, and device as the + given tensor by default. The dtype and device settings can be overridden + by specifying them explicitly. +""" + +full_like = _make_prim( + schema="full_like(Tensor a, Scalar fill_value, *, ScalarType dtype, Device device, bool requires_grad) -> Tensor", + meta=_full_like_meta, + impl_aten=_full_like_aten, + return_type=RETURN_TYPE.NEW, + doc=_full_like_doc, +) + + +def _scalar_tensor_meta( + scalar: NumberType, + *, + dtype: torch.dtype, + device: torch.device, +) -> TensorLikeType: + shape: ShapeType = [] + strides = utils.make_contiguous_strides_for(shape) + return TensorMeta(scalar, shape=shape, strides=strides, dtype=dtype, device=device) + + +def _scalar_tensor_aten( + scalar: NumberType, + *, + dtype: torch.dtype, + device: torch.device, +) -> Tensor: + if isinstance(scalar, complex) and ( + dtype is None or not utils.is_complex_dtype(dtype) + ): + raise TypeError("Complex scalar requires complex tensor dtype.") + # Note that Mypy thinks torch.scalar can't accept a complex scalar + return torch.scalar_tensor(scalar, dtype=dtype, device=device) # type: ignore[arg-type] + + +_scalar_tensor_doc = """ + Wraps a Number into a Tensor with the specified dtype and device. +""" + +# TODO: add layout and pin_memory support +scalar_tensor = _make_prim( + schema="scalar_tensor(Scalar s, *, ScalarType? dtype=None, Device? 
device=None) -> Tensor", + meta=_scalar_tensor_meta, + impl_aten=_scalar_tensor_aten, + return_type=RETURN_TYPE.NEW, + doc=_scalar_tensor_doc, +) + + +# +# Linear algebra (linalg) prims +# + + +def _svd_meta( + A: TensorLikeType, *, full_matrices: bool +) -> Tuple[TensorLikeType, TensorLikeType, TensorLikeType]: + utils.check_is_matrix(A, "linalg.svd") + utils.check_fp_or_complex(A.dtype, "linalg.svd", allow_low_precision_dtypes=False) + + A_shape = A.shape + batch = A_shape[:-2] + m, n = A_shape[-2:] + k = min(m, n) + + shape_U = batch + (m, m if full_matrices else k) + strides_U = utils.make_contiguous_strides_for(shape_U, row_major=False) + U = TensorMeta(shape=shape_U, strides=strides_U, dtype=A.dtype, device=A.device) + + shape_S = batch + (k,) + strides_S = utils.make_contiguous_strides_for(shape_S) + S = TensorMeta( + shape=shape_S, + strides=strides_S, + dtype=utils.corresponding_real_dtype(A.dtype) if A.is_complex() else A.dtype, + device=A.device, + ) + + shape_Vh = batch + (n if full_matrices else k, n) + # The CPU backend returns V, but the cuSolver backend returns V^H + # TODO The MAGMA backend returns V, so this is wrong if used with the MAGMA backend + is_cuda = A.device.type == "cuda" + strides_Vh = utils.make_contiguous_strides_for(shape_Vh, row_major=is_cuda) + Vh = TensorMeta(shape=shape_Vh, strides=strides_Vh, dtype=A.dtype, device=A.device) + # Also makes sure this is CUDA or HIP: + # https://pytorch.org/docs/stable/notes/hip.html#checking-for-hip + if A.numel() != 0 and Vh.is_complex() and torch.cuda.is_available(): + Vh = Vh.conj() + return U, S, Vh + + +def _svd_aten( + A: TensorLikeType, *, full_matrices: bool +) -> Tuple[Tensor, Tensor, Tensor]: + return torch.linalg.svd(A, full_matrices=full_matrices) + + +_svd_doc = """ + Returns the SVD of a matrix or batch of matrices. + + The `full_matrices` flag controls whether the full or reduced SVD decomposition is returned. +""" + +svd = _make_prim( + schema="svd(Tensor A, *, bool full_matrices) -> (Tensor U, Tensor S, Tensor Vh)", + meta=_svd_meta, + impl_aten=_svd_aten, + return_type=(RETURN_TYPE.NEW, RETURN_TYPE.NEW, RETURN_TYPE.NEW), + doc=_svd_doc, +) + + +# +# Randomness Prims +# + + +def _normal_meta( + shape: ShapeType, + *, + mean: Union[float, complex], + std: float, + dtype: torch.dtype, + device: torch.device, + requires_grad: bool, + generator: Optional[torch.Generator] = None, +) -> TensorLikeType: + torch._check( + std >= 0.0, + lambda: f"expected non-negative standard deviation, but got std={std}", + ) + + torch._check( + utils.is_float_dtype(dtype) or utils.is_complex_dtype(dtype), + lambda: f"expected a floating-point or complex dtype, but got dtype={dtype}", + ) + + strides = utils.make_contiguous_strides_for(shape) + return TensorMeta(shape=shape, strides=strides, dtype=dtype, device=device) + + +def _normal_aten( + shape: ShapeType, + *, + mean: Union[float, complex], + std: float, + dtype: torch.dtype, + device: torch.device, + requires_grad: bool, + generator: Optional[torch.Generator] = None, +) -> Tensor: + a = torch.empty(shape, dtype=dtype, device=device, requires_grad=requires_grad) + with torch.no_grad(): + # NOTE: normal_ is incorrectly annotated to expect mean to be a float + a.normal_(mean, std, generator=generator) # type: ignore[arg-type] + return a + + +_normal_doc = """ + Constructs a tensor filled with values drawn from a normal distribution with the specified mean + and standard deviation. + + Only supports floating-point types. 
+""" + +normal = _make_prim( + schema=( + "normal(SymInt[] shape, *, Scalar mean, Scalar std, ScalarType dtype, Device device, bool requires_grad, Generator? generator=None) -> Tensor" # noqa: B950 + ), + return_type=RETURN_TYPE.NEW, + meta=_normal_meta, + impl_aten=_normal_aten, + doc=_normal_doc, +) + + +def _uniform_meta( + shape: ShapeType, + *, + low: float, + high: float, + dtype: torch.dtype, + device: torch.device, + generator: Optional[torch.Generator] = None, +) -> TensorLikeType: + strides = utils.make_contiguous_strides_for(shape) + return TensorMeta(shape=shape, strides=strides, dtype=dtype, device=device) + + +def _uniform_aten( + shape: ShapeType, + *, + low: float, + high: float, + dtype: torch.dtype, + device: torch.device, + generator: Optional[torch.Generator] = None, +) -> Tensor: + a = torch.empty(shape, dtype=dtype, device=device) + a.uniform_(low, high, generator=generator) + return a + + +_uniform_doc = """ + Constructs a tensor filled with values drawn uniformly from low to high. +""" + +# TODO: we should more seriously review randomness modeling and prims +_uniform_helper = _make_prim( + schema=( + "uniform(SymInt[] shape, *, Scalar low, Scalar high, ScalarType dtype, Device device, Generator? generator=None) -> Tensor" + ), + return_type=RETURN_TYPE.NEW, + meta=_uniform_meta, + impl_aten=_uniform_aten, + doc=_uniform_doc, +) + +# +# FFT prims +# + + +def _fft_r2c_meta( + input: TensorLike, + *, + dim: DimsSequenceType, + onesided: bool, +) -> TensorLikeType: + dim = utils.canonicalize_dims(input.ndim, dim) + utils.validate_no_repeating_dims(dim) + + shape = list(input.shape) + if onesided: + last_dim = dim[-1] + shape[last_dim] = shape[last_dim] // 2 + 1 + + dtype = utils.corresponding_complex_dtype(input.dtype) + strides = utils.make_contiguous_strides_for(shape) + return TensorMeta(shape=shape, strides=strides, dtype=dtype, device=input.device) + + +def _fft_r2c_aten( + input: TensorLike, + *, + dim: DimsSequenceType, + onesided: bool, +) -> TensorLikeType: + normalization = 0 # No normalization + return torch._fft_r2c(input, dim, normalization, onesided) + + +_fft_r2c_doc = """ + Performs a real to complex Fast Fourier Transform +""" + + +fft_r2c = _make_prim( + schema="fft_r2c(Tensor self, *, int[] dim, bool onesided) -> Tensor", + meta=_fft_r2c_meta, + impl_aten=_fft_r2c_aten, + return_type=RETURN_TYPE.NEW, + doc=_fft_r2c_doc, +) + + +def _fft_c2c_meta( + input: TensorLike, + *, + dim: DimsSequenceType, + forward: bool, +) -> TensorLikeType: + dim = utils.canonicalize_dims(input.ndim, dim) + utils.validate_no_repeating_dims(dim) + + shape = input.shape + strides = utils.make_contiguous_strides_for(shape) + return TensorMeta( + shape=shape, strides=strides, dtype=input.dtype, device=input.device + ) + + +def _fft_c2c_aten( + input: TensorLike, + *, + dim: DimsSequenceType, + forward: bool, +) -> TensorLikeType: + normalization = 0 # No normalization + return torch._fft_c2c(input, dim, normalization, forward) + + +_fft_c2c_doc = """ + Performs either a Fast Fourier Transform, or its inverse +""" + + +fft_c2c = _make_prim( + schema="fft_c2c(Tensor self, *, int[] dim, bool forward) -> Tensor", + meta=_fft_c2c_meta, + impl_aten=_fft_c2c_aten, + return_type=RETURN_TYPE.NEW, + doc=_fft_c2c_doc, +) + + +def _fft_c2r_meta( + input: TensorLike, + *, + dim: DimsSequenceType, + last_dim_size: int, +) -> TensorLikeType: + dim = utils.canonicalize_dims(input.ndim, dim) + utils.validate_no_repeating_dims(dim) + + shape = list(input.shape) + shape[dim[-1]] = last_dim_size + 
dtype = utils.corresponding_real_dtype(input.dtype) + strides = utils.make_contiguous_strides_for(shape) + return TensorMeta(shape=shape, strides=strides, dtype=dtype, device=input.device) + + +def _fft_c2r_aten( + input: TensorLike, + *, + dim: DimsSequenceType, + last_dim_size: int, +) -> TensorLikeType: + normalization = 0 # No normalization + return torch._fft_c2r(input, dim, normalization, last_dim_size) + + +_fft_c2r_doc = """ + Performs a complex to real Inverse Fast Fourier Transform +""" + + +fft_c2r = _make_prim( + schema="fft_c2r(Tensor self, *, int[] dim, SymInt last_dim_size) -> Tensor", + meta=_fft_c2r_meta, + impl_aten=_fft_c2r_aten, + return_type=RETURN_TYPE.NEW, + doc=_fft_c2r_doc, +) + + +def _frexp_meta(self: TensorLikeType) -> Tuple[TensorLikeType, TensorLikeType]: + torch._check( + self.dtype.is_floating_point, + lambda: "torch.frexp() only supports floating-point dtypes", + ) + return torch.empty_like(self), torch.empty_like(self, dtype=torch.int32) + + +frexp = _make_prim( + schema="frexp(Tensor self) -> (Tensor mantissa, Tensor exponent)", + meta=_frexp_meta, + return_type=(RETURN_TYPE.NEW, RETURN_TYPE.NEW), + impl_aten=torch.frexp, + doc="", +) + +register_rng_prims() +register_debug_prims() diff --git a/MLPY/Lib/site-packages/torch/_prims/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_prims/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..218174476de6f83f84715c688b87f49650e96e96 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_prims/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_prims/__pycache__/context.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_prims/__pycache__/context.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a9a7c59eb66a5b2d42282a69a0eebe343390ef2f Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_prims/__pycache__/context.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_prims/__pycache__/debug_prims.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_prims/__pycache__/debug_prims.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..48173561973e21da3eb29fe285b82f4e2304bacb Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_prims/__pycache__/debug_prims.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_prims/__pycache__/executor.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_prims/__pycache__/executor.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..061186fd921544dd67f5a5c0f8a2199da265c8b7 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_prims/__pycache__/executor.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_prims/__pycache__/rng_prims.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_prims/__pycache__/rng_prims.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..334089afc3d976529926efe748cc60c3d99d5a5e Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_prims/__pycache__/rng_prims.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_prims/context.py b/MLPY/Lib/site-packages/torch/_prims/context.py new file mode 100644 index 0000000000000000000000000000000000000000..8cf0104178568444168792cea4ffcfbd6c516357 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_prims/context.py @@ -0,0 +1,144 @@ +import functools +from contextlib import nullcontext +from typing import Any, Callable, Dict, Optional, Sequence + 
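+# Note: the torch._refs* imports below are needed so that torch_to_refs_map()
+# can walk each reference module's __all__ when it builds the torch -> _refs
+# mapping consumed by TorchRefsMode.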
+import torch + +import torch._decomp +import torch._prims + +import torch._refs +import torch._refs.nn +import torch._refs.nn.functional +import torch._refs.special +import torch.overrides + +from torch._prims_common import torch_function_passthrough + + +@functools.lru_cache(None) +def torch_to_refs_map(): + """ + Mapping of torch API functions to torch._refs functions. + E.g. torch_to_refs_map()[torch.add] == torch._refs.add + """ + modules = [ + (torch, torch._refs), + (torch.nn, torch._refs.nn), + (torch.nn.functional, torch._refs.nn.functional), + (torch.special, torch._refs.special), + (torch.fft, torch._refs.fft), + (torch.linalg, torch._refs.linalg), + ] + r: Dict[Any, Any] = { + torch.Tensor.__invert__: torch._refs.bitwise_not, + torch.Tensor.__xor__: torch._refs.bitwise_xor, + torch.Tensor.__and__: torch._refs.bitwise_and, + torch.Tensor.__or__: torch._refs.bitwise_or, + torch.Tensor.__eq__: torch._refs.eq, + torch.Tensor.__rsub__: torch._refs.rsub, + torch.Tensor.__rtruediv__: torch._refs.rtruediv, + torch.Tensor.__floordiv__: torch._refs.floor_divide, + torch.Tensor.__rfloordiv__: torch._refs.rfloordiv, + torch.Tensor.__pow__: torch._refs.pow, + torch.Tensor.__rpow__: torch._refs.rpow, + torch.Tensor.new_empty: torch._refs.new_empty, + torch.Tensor.new_full: torch._refs.new_full, + torch.Tensor.new_zeros: torch._refs.new_zeros, + torch.Tensor.new_ones: torch._refs.new_ones, + torch.Tensor.fill_: torch._refs.fill_, + torch.Tensor.zero_: torch._refs.zero_, + torch.Tensor.to: torch._refs.to, + torch.Tensor.sum_to_size: torch._refs.sum_to_size, + # TODO: Should these methods be mapped some other way? + torch.Tensor.copy_: torch._prims.copy_to, + torch.Tensor.resize: torch._prims.resize, + } + for mod_torch, mod_refs in modules: + for s in mod_refs.__all__: # type: ignore[attr-defined] + r[mod_torch.__dict__.get(s)] = mod_refs.__dict__.get(s) + + # Support remapping torch.Tensor.foo to _refs.foo + for s in dir(torch.Tensor): + if s in torch._refs.__all__: + r[getattr(torch.Tensor, s)] = torch._refs.__dict__.get(s) + + # Support conversions + for s in torch._refs._conversions.__all__: + tensor_attr = getattr(torch.Tensor, s, None) or getattr(torch, s) + r[tensor_attr] = torch._refs._conversions.__dict__.get(s) + + return r + + +@functools.lru_cache(None) +def all_prims(): + """ + Set of all prim functions, e.g., torch._prims.add in all_prims() + """ + return {torch._prims.__dict__.get(s) for s in torch._prims.__all__} + + +class TorchRefsMode(torch.overrides.TorchFunctionMode): + """ + Switches the interpretation of torch.* functions and Tensor methods to + use PrimTorch refs in torch._refs. (Direct calls to _refs are unaffected.) + + >>> # xdoctest: +SKIP + >>> with TorchRefsMode(): + ... torch.add(x, y) # calls torch._refs.add(x, y) + + By default, this context manager will fall back on the torch.* if the + ref does not exist; set strict=True to error if this occurs. + If the ref exists we still would like to fall back on the torch.* sometimes, + this behavior can be customized by passing a function to should_fallback_fn. 
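+
+    The fallback predicate receives (mode, orig_func, func, args, kwargs) and
+    returns True to run the original torch.* callable instead of the ref. An
+    illustrative sketch (keep_add_eager is a hypothetical predicate):
+
+    >>> # xdoctest: +SKIP
+    >>> def keep_add_eager(mode, orig_func, func, args, kwargs):
+    ...     return orig_func is torch.add
+    >>> with TorchRefsMode(should_fallback_fn=keep_add_eager):
+    ...     torch.mul(x, y)  # calls torch._refs.mul; torch.add would stay eager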
+ """ + + def __init__( + self, + strict=False, + should_fallback_fn=lambda *_: False, + prims_mode_cls=nullcontext, + ): + self.strict = strict + self.should_fallback_fn = should_fallback_fn + self.prims_mode_cls = prims_mode_cls + + def __torch_function__( + self, + orig_func: Callable, + types: Sequence, + args: Sequence[Any] = (), + kwargs: Optional[Dict] = None, + ): + if kwargs is None: + kwargs = {} + # For primitive operations, run them as is without interception + # Unless we are in prims_mode, in which case we want to use nvprims + if orig_func in torch_function_passthrough or orig_func in all_prims(): + with self.prims_mode_cls(): + return orig_func(*args, **kwargs) + mapping = torch_to_refs_map() + func = mapping.get(orig_func, None) + + # For torch.ops.aten.*, use registered decompositions from torch._decomp + # torch._decomp.decomposition_table provides a mapping from + # torch.ops.aten.* to torch._refs or torch._decomp.decompositions + # implementations. + # There're other ways to implement this functionality, + # see https://github.com/pytorch/pytorch/pull/82657#discussion_r939776417 + if func is None and isinstance(orig_func, torch._ops.OpOverload): + func = torch._decomp.decomposition_table.get(orig_func, None) + + if func is not None: + # If the ref exists query whether we should use it or not + if self.should_fallback_fn(self, orig_func, func, args, kwargs): + return orig_func(*args, **kwargs) + # torch calls inside func should be interpreted as refs calls + with self: + return func(*args, **kwargs) + if self.strict: + raise RuntimeError( + f"no _refs support for {torch.overrides.resolve_name(orig_func)}" + ) + return orig_func(*args, **kwargs) diff --git a/MLPY/Lib/site-packages/torch/_prims/debug_prims.py b/MLPY/Lib/site-packages/torch/_prims/debug_prims.py new file mode 100644 index 0000000000000000000000000000000000000000..ccd450f08fadb3c4aeb615230fd1dbadf3921e6c --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_prims/debug_prims.py @@ -0,0 +1,59 @@ +import contextlib +from typing import Optional, Sequence + +import torch +from torch._custom_op.impl import custom_op +from torch.utils._content_store import ContentStoreReader + +LOAD_TENSOR_READER: Optional[ContentStoreReader] = None + + +@contextlib.contextmanager +def load_tensor_reader(loc): + global LOAD_TENSOR_READER + assert LOAD_TENSOR_READER is None + # load_tensor is an "op", and we will play merry hell on + # Inductor's memory planning if we return a tensor that + # aliases another tensor that we previously returned from + # an operator. So unlike standard ContentStoreReader use, + # we disable the cache so that you always get fresh storages + # (no aliasing for you!) + LOAD_TENSOR_READER = ContentStoreReader(loc, cache=False) + try: + yield + finally: + LOAD_TENSOR_READER = None + + +def register_debug_prims(): + @custom_op("debugprims::load_tensor") + def load_tensor( # type: ignore[empty-body] + name: str, + size: Sequence[int], + stride: Sequence[int], + *, + dtype: torch.dtype, + device: torch.device, + ) -> torch.Tensor: + ... 
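+    # The decorated stub above only declares the debugprims::load_tensor schema;
+    # the factory registered below provides the behavior: synthesize a random
+    # tensor when no reader is installed, otherwise read the named tensor back
+    # from the active ContentStoreReader.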
+ + @load_tensor.impl_factory() + def load_tensor_factory(name, size, stride, dtype, device): + if LOAD_TENSOR_READER is None: + from torch._dynamo.testing import rand_strided + + return rand_strided(size, stride, dtype, device) + else: + from torch._dynamo.utils import clone_input + + # device argument here takes care of coercion + r = LOAD_TENSOR_READER.read_tensor(name, device=device) + assert list(r.size()) == size, f"{r.size()} != {size}" + assert list(r.stride()) == stride, f"{r.stride()} != {stride}" + assert r.device == device, f"{r.device} != {device}" + + # Unlike the other properties, we will do coercions for dtype + # mismatch + if r.dtype != dtype: + r = clone_input(r, dtype=dtype) + return r diff --git a/MLPY/Lib/site-packages/torch/_prims/executor.py b/MLPY/Lib/site-packages/torch/_prims/executor.py new file mode 100644 index 0000000000000000000000000000000000000000..415738ae14bbfd910fda03a9b68981845eca9c1f --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_prims/executor.py @@ -0,0 +1,60 @@ +from typing import Callable, Optional + +from torch._prims.context import TorchRefsMode + +from torch.fx import GraphModule +from torch.fx.experimental.proxy_tensor import make_fx, wrapper_and_args_for_make_fx + + +def execute( + gm: GraphModule, + *args, + executor: str = "aten", + executor_parameters: Optional[dict] = None, +): + """ + Prototype ATen executor. + + Just executes the context's graph. + """ + + if executor == "aten": + return gm.forward(*args) + + msg = f"Received unexpected value for 'executor': {executor}. Allowed values are: aten." + raise ValueError(msg) + + +def make_traced(fn: Callable): + """ + Returns a function that, when called, will + trace its torch operations to prims and then + execute those prims on the requested trace executor + (possibly lowering them to that trace executor first). + + Only supports the torch operations defined in _torch_to_reference_map + in context.py and operations with positional args. All args must + be tensors. + In the near future all these restrictions will be lifted. 
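+    Note that execute() above only accepts executor="aten", so that is the
+    only value this function currently supports as well.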
+ + Example usage: + + def foo(a, b): + return torch.add(a, b) + + traced_foo = make_traced(foo) + + a = torch.randn((1, 2, 3, 4, 5), device='cuda') + b = torch.randn((1, 2, 3, 4, 5), device='cuda') + result = traced_foo(a, b, executor='aten') + """ + + def _traced(*args, executor="aten", **kwargs): + # TODO: caching + wrapped, all_args = wrapper_and_args_for_make_fx(fn, args, kwargs) + + with TorchRefsMode(): + gm = make_fx(wrapped)(all_args) + return execute(gm, all_args, executor=executor) + + return _traced diff --git a/MLPY/Lib/site-packages/torch/_prims/rng_prims.py b/MLPY/Lib/site-packages/torch/_prims/rng_prims.py new file mode 100644 index 0000000000000000000000000000000000000000..4d7e6e2d3e6e8a90d206816df6fc42cfd6d9e30f --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_prims/rng_prims.py @@ -0,0 +1,268 @@ +from typing import Optional, Tuple + +import torch +import torch.utils._pytree as pytree +from torch import _prims +from torch._C import DispatchKey +from torch._higher_order_ops.utils import autograd_not_implemented +from torch._ops import HigherOrderOperator + +from torch._prims_common import CUDARngStateHelper, make_contiguous_strides_for +from torch._prims_common.wrappers import backwards_not_supported +from torch._subclasses.fake_tensor import FakeTensorMode +from torch.fx.experimental.proxy_tensor import ( + disable_proxy_modes_tracing, + ProxyTorchDispatchMode, + track_tensor_tree, +) +from torch.types import _device, _dtype + + +rngprim_namespace = "rngprims" +rngprim = torch.library.Library(rngprim_namespace, "DEF") +rngprim_impl = torch.library.Library( + rngprim_namespace, "IMPL", "CompositeExplicitAutograd" +) +rngprim_autograd_impl = torch.library.Library(rngprim_namespace, "IMPL", "Autograd") +rngprim_meta_impl = torch.library.Library(rngprim_namespace, "IMPL", "Meta") + + +def throw_on_non_cuda(device): + raise RuntimeError( + f"You are trying to functionalize a {device.type} RNG operator but {device.type} does not " + f"use Philox/counter-based RNG. Therefore, functionalizing a {device.type} RNG operator is " + "not supported. We are discussing the possibility of a Philox-based RNG implementation for CPU." + ) + + +def register_rng_prim(name, schema, impl_aten, impl_meta, doc, tags=None): + rngprim.define(schema) + rngprim_impl.impl(name, impl_aten) + rngprim_meta_impl.impl(name, impl_meta) + + prim_packet = getattr(torch._ops.ops.rngprims, name) + prim = prim_packet.default + if tags: + prim._tags = tags + + rngprim_autograd_impl.impl(name, backwards_not_supported(prim)) + + for p in (prim_packet, prim): + p.__doc__ = doc + p.return_type = torch._prims_common.RETURN_TYPE.NEW # type: ignore[attr-defined] + + p.schema = schema + p.impl_aten = impl_aten + p.prim_meta_impl = impl_meta + + +# Philox rand offsets could be shared in future with other philox ops, so +# keeping these functions in global scope. +def philox_rand_offset_meta( + shape: torch.Size, +): + return _prims.TensorLike(torch.tensor(0, dtype=torch.int64)) + + +def philox_rand_offset( + shape: torch.Size, +): + # For impl, look at the function calc_execution_policy in the file + # aten/src/ATen/native/cuda/DistributionTemplates.h. 
The impl was copied at + # commit hash 72aa0667bd16707d50eb8fa337092a1f5d11dfb6 + numel_scalar = 1 + for dim_size in shape: + numel_scalar *= dim_size + numel = torch.scalar_tensor(numel_scalar, dtype=torch.int64) + + block_size = 256 + unroll = 4 + curand4_engine_calls = 4 + device_property = torch.cuda.get_device_properties(torch.cuda.current_device()) + blocks_per_sm = device_property.max_threads_per_multi_processor // block_size + grid_size = (numel + block_size - 1) // block_size + grid_size = min(grid_size, device_property.multi_processor_count * blocks_per_sm) + offset = ( + (numel - 1) // (block_size * grid_size * unroll) + 1 + ) * curand4_engine_calls + return offset + + +def register_philox_rand(): + name = "philox_rand" + schema = "philox_rand(SymInt[] size, Tensor seed, Tensor offset, int[]? stride, Device? device=None, ScalarType? dtype=None) -> (Tensor, Tensor)" # noqa: B950 + + def _philox_rand_meta( + shape: torch.Size, + seed: torch.Tensor, + offset: torch.Tensor, + stride: Optional[Tuple[int, ...]], + device: _device, + dtype: _dtype, + ): + # stride arg will be useful for distributed usecase. Currently, its unused. + assert stride is None + stride = make_contiguous_strides_for(shape) + random_values = _prims.TensorMeta( + shape=shape, strides=stride, dtype=dtype, device=device + ) + offset = philox_rand_offset_meta(shape) + return (random_values, offset) + + def _philox_rand( + shape: torch.Size, + seed: torch.Tensor, + offset: torch.Tensor, + stride: Optional[Tuple[int, ...]], + device: _device, + dtype: _dtype, + ): + # stride arg will be useful for distributed usecase. Currently, its unused. + assert stride is None + if device.type == "cpu": + devices = [] + else: + devices = [device] + + if device.type != "cuda": + raise throw_on_non_cuda(device) + + with torch.random.fork_rng(devices): + CUDARngStateHelper.set_torch_state_tensor(seed, offset) + random_values = torch.rand(shape, device=device, dtype=dtype) + + return random_values, philox_rand_offset(shape) + + register_rng_prim( + name=name, + schema=schema, + impl_aten=_philox_rand, + impl_meta=_philox_rand_meta, + doc="Philox based stateless rand operator", + tags=(torch.Tag.nondeterministic_seeded,), + ) + + +def get_device(args, kwargs): + if kwargs.get("device"): + device = kwargs.get("device") + if isinstance(device, str): + device = torch.device(device) + return device.type + + devices = {arg.device.type for arg in args if isinstance(arg, torch.Tensor)} + if any(dev == "cuda" for dev in devices): + return "cuda" + elif any(dev == "cpu" for dev in devices): + return "cpu" + return None + + +def register_run_and_save_rng_state_op(): + run_and_save_rng_state = HigherOrderOperator("run_and_save_rng_state") + + run_and_save_rng_state.py_impl(DispatchKey.Autograd)( + autograd_not_implemented(run_and_save_rng_state, deferred_error=True) + ) + + @run_and_save_rng_state.py_impl(DispatchKey.CUDA) + def impl_cuda(op, *args, **kwargs): + return torch.cuda.get_rng_state(), op(*args, **kwargs) + + @run_and_save_rng_state.py_impl(DispatchKey.CPU) + def impl_cpu(op, *args, **kwargs): + return torch.get_rng_state(), op(*args, **kwargs) + + @run_and_save_rng_state.py_impl(DispatchKey.BackendSelect) + def impl_backend_select(op, *args, **kwargs): + impl_map = {"cuda": impl_cuda, "cpu": impl_cpu} + device = get_device(args, kwargs) + assert device in impl_map, f"Backend not supported for {device}" + impl = impl_map[device] + return impl(op, *args, **kwargs) + + @run_and_save_rng_state.py_impl(FakeTensorMode) + def 
impl_fake_tensor_mode(mode, op, *args, **kwargs): + # Check device to call the right impl + with mode: + return impl_backend_select(op, *args, **kwargs) + + @run_and_save_rng_state.py_impl(ProxyTorchDispatchMode) + def impl_proxy_dispatch_mode(mode, op, *args, **kwargs): + if mode.enable_tracing: + out = impl_backend_select(op, *args, **kwargs) + proxy_args = pytree.tree_map(mode.tracer.unwrap_proxy, (op, *args)) + proxy_kwargs = pytree.tree_map(mode.tracer.unwrap_proxy, kwargs) + out_proxy = mode.tracer.create_proxy( + "call_function", run_and_save_rng_state, proxy_args, proxy_kwargs + ) + return track_tensor_tree(out, out_proxy, constant=None, tracer=mode.tracer) + else: + return run_and_save_rng_state(op, *args, **kwargs) + + return run_and_save_rng_state + + +def register_run_with_rng_state_op(): + run_with_rng_state = HigherOrderOperator("run_with_rng_state") + + run_with_rng_state.py_impl(DispatchKey.Autograd)( + autograd_not_implemented(run_with_rng_state, deferred_error=True) + ) + + @run_with_rng_state.py_impl(DispatchKey.CUDA) + def impl_cuda(rng_state, op, *args, **kwargs): + current_state = torch.cuda.get_rng_state() + torch.cuda.set_rng_state(rng_state.cpu()) + out = op(*args, **kwargs) + torch.cuda.set_rng_state(current_state) + return out + + @run_with_rng_state.py_impl(DispatchKey.CPU) + def impl_cpu(rng_state, op, *args, **kwargs): + current_state = torch.get_rng_state() + torch.set_rng_state(rng_state) + out = op(*args, **kwargs) + torch.set_rng_state(current_state) + return out + + @run_with_rng_state.py_impl(ProxyTorchDispatchMode) + def impl_proxy_dispatch_mode(mode, rng_state, op, *args, **kwargs): + if mode.enable_tracing: + with disable_proxy_modes_tracing(): + out = run_with_rng_state(rng_state, op, *args, **kwargs) + proxy_args = pytree.tree_map( + mode.tracer.unwrap_proxy, (rng_state, op, *args) + ) + proxy_kwargs = pytree.tree_map(mode.tracer.unwrap_proxy, kwargs) + out_proxy = mode.tracer.create_proxy( + "call_function", run_with_rng_state, proxy_args, proxy_kwargs + ) + return track_tensor_tree(out, out_proxy, constant=None, tracer=mode.tracer) + else: + return run_with_rng_state(rng_state, op, *args, **kwargs) + + @run_with_rng_state.py_impl(DispatchKey.BackendSelect) + def impl_backend_select(rng_state, op, *args, **kwargs): + impl_map = {"cuda": impl_cuda, "cpu": impl_cpu} + device = get_device(args, kwargs) + assert device in impl_map, f"Backend not supported for {device}" + impl = impl_map[device] + return impl(rng_state, op, *args, **kwargs) + + @run_with_rng_state.py_impl(FakeTensorMode) + def impl_fake_tensor_mode(mode, rng_state, op, *args, **kwargs): + # Skip setting the set_rng_state as it does not work well with fake tensors. + # And it does not matter for the fake tensor mode. 
+ with mode: + return op(*args, **kwargs) + + return run_with_rng_state + + +run_and_save_rng_state = register_run_and_save_rng_state_op() +run_with_rng_state = register_run_with_rng_state_op() + + +def register_rng_prims(): + register_philox_rand() diff --git a/MLPY/Lib/site-packages/torch/_prims_common/__init__.py b/MLPY/Lib/site-packages/torch/_prims_common/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..494c94ffe2d852bb723dd22dd1b1c5e2fbdd5a22 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_prims_common/__init__.py @@ -0,0 +1,1985 @@ +from __future__ import annotations + +import operator +import warnings +import weakref + +from contextlib import nullcontext +from enum import Enum +from functools import cmp_to_key, reduce +from typing import ( + Any, + Callable, + cast, + List, + NamedTuple, + Optional, + overload, + Sequence, + Tuple, + Type, + TYPE_CHECKING, + Union, +) + +from typing_extensions import TypeAlias + + +if TYPE_CHECKING: + # Import the following modules during type checking to enable code intelligence features, + # such as auto-completion in tools like pylance, even when these modules are not explicitly + # imported in user code. + + import sympy + +import torch +from torch import sym_float, sym_int, sym_max + + +ShapeType: TypeAlias = Union[torch.Size, List[int], Tuple[int, ...]] +StrideType: TypeAlias = Union[List[int], Tuple[int, ...]] +DimsType: TypeAlias = Union[int, List[int], Tuple[int, ...]] +DimsSequenceType: TypeAlias = Union[List[int], Tuple[int, ...]] +# TODO: Type[torch.SymInt], Type[torch.SymFloat] +NumberTypeType: TypeAlias = Union[Type[bool], Type[int], Type[float], Type[complex]] +# TODO: This needs a lot more type annotations +# NumberType = Union[bool, int, float, complex, torch.SymInt, torch.SymFloat] +NumberType: TypeAlias = Union[bool, int, float, complex] +RealNumberType: TypeAlias = Union[bool, int, float] + +Number = (bool, int, float, complex, torch.SymInt, torch.SymFloat) +# I don't call it Integral because numbers.Integral includes bool, but IntLike +# does not +Dim = int +IntLike = (int, torch.SymInt) +FloatLike = (float, torch.SymFloat) +IntWithoutSymInt = int +FloatWithoutSymFloat = float +DeviceLikeType: TypeAlias = Union[str, torch.device, int] +Tensor = torch.Tensor + + +torch_function_passthrough = { + torch.device, + torch.sym_not, + torch.sym_float, + torch.sym_int, + torch.sym_max, + torch.sym_min, + torch._sym_sqrt, # type: ignore[attr-defined] + torch.sym_ite, + torch.Tensor.dim, + torch.Tensor.ndim.__get__, # type: ignore[attr-defined] + torch.Tensor.numel, + torch.Tensor.size, + torch.Tensor.storage_offset, + torch.Tensor.stride, + torch.Tensor.dtype.__get__, # type: ignore[attr-defined] + torch.Tensor.is_sparse.__get__, # type: ignore[attr-defined] + torch.Tensor.shape.__get__, # type: ignore[attr-defined] + torch.Tensor.device.__get__, # type: ignore[attr-defined] + torch.Tensor.requires_grad.__get__, # type: ignore[attr-defined] + torch.Tensor.layout.__get__, # type: ignore[attr-defined] + torch.Tensor.is_contiguous, + # For TorchRefsMode only + torch.Tensor.__format__, + torch.Tensor.__repr__, + torch.Tensor.requires_grad.__get__, # type: ignore[attr-defined] +} + + +TensorLikeType = torch.Tensor +TensorLike = torch.Tensor +TensorSequenceType: TypeAlias = Union[List[TensorLikeType], Tuple[TensorLikeType, ...]] +TensorOrNumberLikeType: TypeAlias = Union[TensorLikeType, NumberType] + +CustomOutParamAnnotation = "__custom_out_param__" + + +def same_shape(a: ShapeType, b: ShapeType, *, 
allow_rhs_unbacked=False) -> bool: + from torch.fx.experimental.symbolic_shapes import guard_size_oblivious + + if len(a) != len(b): + return False + + for x, y in zip(a, b): + if allow_rhs_unbacked: + # TODO: We should check that the symbols are consistent + # with each other + if isinstance(y, torch.SymInt): + continue + # NB: Naively, you would not expect to have to do an oblivious guard + # here because there is seemingly no broadcasting here, but in fact we + # use this in some situations to determine if we need to do an expand + # on the tensor because they don't line up, so you can definitely end + # up trying to prove u0 != 1 in this situation. See + # python test/test_proxy_tensor.py -k test_cumsum_unbacked + if guard_size_oblivious(x != y): + return False + + return True + + +def _maybe_get_pytype(t): + if t is torch.SymFloat: + return float + elif t is torch.SymInt: + return int + elif t is torch.SymBool: + return bool + else: + return t + + +# TODO: look at using torch.testing.assert_close instead with an option +# to just compare metadata +def compare_tensor_meta( + a: TensorLikeType, + b: TensorLikeType, + check_strides=False, + *, + allow_rhs_unbacked=False, + check_conj=True, +): + """ + Checks that two tensor likes have the same shape, + dtype and device. + + In the future this will validate additional metadata, like + strides. + """ + assert isinstance(a, TensorLike) + assert isinstance(b, TensorLike) + + if not same_shape(a.shape, b.shape, allow_rhs_unbacked=allow_rhs_unbacked): + msg = f"Shapes {a.shape} and {b.shape} are not equal!" + raise AssertionError(msg) + + if a.dtype != b.dtype: + msg = f"Dtypes {a.dtype} and {b.dtype} are not equal!" + raise AssertionError(msg) + + if a.device != b.device: + # Handles special cuda:0 vs cuda case + # TODO: we should review why this happens and see about fixing it + if (str(a.device) == "cuda:0" or str(a.device) == "cuda") and ( + str(b.device) == "cuda:0" or str(b.device) == "cuda" + ): + pass + else: + msg = f"Devices {a.device} and {b.device} are not equal!" + raise AssertionError(msg) + + # Stride checking is currently disabled, see https://github.com/pytorch/pytorch/issues/78050 + if check_strides: + same_strides, idx = check_significant_strides(a, b) + if not same_strides: + msg = f"Stride mismatch! Strides are {a.stride()} and {b.stride()} (mismatched at {idx})!" + raise RuntimeError(msg) + + if a.storage_offset() != b.storage_offset(): + msg = f"Storage offset mismatch! Storage offsets are {a.storage_offset()} and {b.storage_offset()}!" + raise RuntimeError(msg) + + if check_conj: + if a.is_conj() != b.is_conj(): + raise RuntimeError( + f"Conj mismatch! is_conj is set to {a.is_conj()} and {b.is_conj()}" + ) + + if a.is_neg() != b.is_neg(): + raise RuntimeError( + f"Neg mismatch! 
is_neg is set to {a.is_neg()} and {b.is_neg()}" + ) + + +def _check_strides_helper( + a: TensorLikeType, b: TensorLikeType, *, only_cuda=True, significant_only=True +) -> Tuple[bool, Optional[int]]: + # NOTE: only on CUDA because CPU elementwise strides are incorrect in PyTorch + # See https://github.com/pytorch/pytorch/issues/77553 + # Only compares strides that are "meaningful" -- strides for dimensions with length > 1 + # and for tensors with more than one element + if ( + not only_cuda or a.device.type == "cuda" or b.device.type == "cuda" + ) and a.numel() > 0: + for idx in range(a.ndim): + check = not significant_only or a.shape[idx] > 1 + if a.stride()[idx] != b.stride()[idx] and check: + return False, idx + + return True, None + + +def check_significant_strides( + a: TensorLikeType, b: TensorLikeType, *, only_cuda=True +) -> Tuple[bool, Optional[int]]: + return _check_strides_helper(a, b, only_cuda=only_cuda, significant_only=True) + + +def check_all_strides( + a: TensorLikeType, b: TensorLikeType, *, only_cuda=True +) -> Tuple[bool, Optional[int]]: + return _check_strides_helper(a, b, only_cuda=only_cuda, significant_only=False) + + +# This function is equivalent to compute_contiguous() from TensorImpl.cpp +def is_contiguous(a: TensorLikeType) -> bool: + """ + Tests whether a tensor is contiguous or not. + + Tensors are contiguous when they have no elements, + one element, or when they have "nested" strides. + """ + from torch.fx.experimental.symbolic_shapes import guard_size_oblivious + + if guard_size_oblivious(a.numel() < 2): + return True + + expected_stride = 1 + for x, y in reversed(tuple(zip(a.shape, a.stride()))): + # Skips checking strides when a dimension has length 1 + if guard_size_oblivious(x == 1): + continue + + if y != expected_stride: + return False + expected_stride = expected_stride * x + + return True + + +# This function is equivalent to compute_channels_last_contiguous_2d() in TensorImpl.cpp +def is_channels_last_contiguous_2d(a: Tensor) -> bool: + # NHWC or not channels last 2D contiguous + if a.ndim != 4: + return False + + expected_stride = 1 + for idx in (1, 3, 2, 0): + length = a.shape[idx] + if length == 1: + continue + + stride = a.stride()[idx] + if stride != expected_stride: + return False + + expected_stride *= length + + return True + + +def is_channels_last_contiguous_3d(a: Tensor) -> bool: + # NDHWC or not channels last 3D contiguous + if a.ndim != 5: + return False + + expected_stride = 1 + for idx in (1, 4, 3, 2, 0): + length = a.shape[idx] + if length == 1: + continue + + stride = a.stride()[idx] + if stride != expected_stride: + return False + + expected_stride *= length + + return True + + +_memory_formats = { + torch.contiguous_format, + torch.preserve_format, + torch.channels_last, + torch.channels_last_3d, +} + + +def validate_memory_format(memory_format: torch.memory_format): + torch._check( + memory_format in _memory_formats, + lambda: f"Received unknown memory format {memory_format}!", + ) + + +def is_contiguous_for_memory_format( # type: ignore[return] + a: Tensor, *, memory_format: torch.memory_format +) -> bool: + validate_memory_format(memory_format) + + if memory_format == torch.contiguous_format: + return is_contiguous(a) + if memory_format == torch.channels_last: + return is_channels_last_contiguous_2d(a) + if memory_format == torch.channels_last_3d: + return is_channels_last_contiguous_3d(a) + + torch._check( + False, + lambda: f"is_contiguous received unsupported memory format {memory_format}", + ) + + +# NOTE: that tensors 
with no elements and channels last is ??? +def is_channels_last_contiguous(a: Tensor) -> bool: + """ + True when a tensor is channels-last contiguous. + + This requires that: + + - the tensor is conceptually either 4 (NHWC) or 5 (NDHWC) dimensions + - if we name the tensor's dimensions NCHW or NCDHW, then the strides are such that the + stride of the 'C' dimension (Cs) is 1 and the strides corresponding to + each dimension (Xs) can be ordered Cs <= Ws <= Hs <= (Ds) <= Ns and are + "nested" -- so Ws = Cs * Cl, where Cl is the length of the 'C' dimension, + for example. + """ + return is_channels_last_contiguous_2d(a) or is_channels_last_contiguous_3d(a) + + +def is_non_overlapping_and_dense(a: Tensor) -> bool: + """ + True when a tensor is non-overlapping and dense. + + A tensor is non-overlapping and dense when there exists a permutation of + its dimensions that is contiguous. + """ + + from torch.fx.experimental.symbolic_shapes import guard_size_oblivious + + if a.is_sparse: + return False + + # Short-circuits if the tensor is already contiguous or channels-last contiguous + if is_contiguous(a) or is_channels_last_contiguous(a): + return True + + # The following is equivalent to compute_non_overlapping_and_dense in TensorImpl.cpp + + # Short-circuits for tensors of rank one, which are + # non-overlapping and "dense" if their stride is one + if a.ndim == 1: + return a.stride()[0] == 1 + + # Checks that there exists a permutation of the strides s.t. the tensor would be contiguous + # Sorts (length, stride) pairs by stride + # + # This sort is done in a size-oblivious way, which helps if we do a + # comparison like 2048*u0 > u0; we just want this to return True + # (and not worry about what if u0 is zero). + class K(NamedTuple): + size: int + stride: int + + def __lt__(self, other): + return guard_size_oblivious(self.stride < other.stride) + + def __gt__(self, other): + return guard_size_oblivious(self.stride > other.stride) + + def __le__(self, other): + return guard_size_oblivious(self.stride <= other.stride) + + def __ge__(self, other): + return guard_size_oblivious(self.stride >= other.stride) + + def __eq__(self, other): + return guard_size_oblivious(self.stride == other.stride) + + lengths_and_strides = sorted(map(K, a.shape, a.stride())) + + expected_stride = 1 + for length, stride in lengths_and_strides: + if guard_size_oblivious(length == 1): + continue + + if stride != expected_stride: + return False + + expected_stride *= length + + return True + + +# NOTE: Based on the implementation in TensorIterator.cpp, but note that +# the note [Computing output strides] is incorrect, because it +# says that strides will be preserved even if they are not +# "non overlapping and dense", but this is incorrect. The +# output of elementwise operations are always given +# non overlapping and dense strides. +# This is also INCORRECT because it does not model TensorIterator's +# short-circuit, which can cause different strides. +def compute_elementwise_output_logical_to_physical_perm( + *tensors, _skip_checks=False +) -> List[int]: + from torch.fx.experimental.symbolic_shapes import guard_size_oblivious + + if not _skip_checks and len(tensors) == 0: + msg = "Can't compute elementwise output strides for zero tensors!" 
+ raise ValueError(msg) + + if not _skip_checks: + check_same_shape(*tensors, allow_cpu_scalar_tensors=True) + + # Filters the tensors to actual tensors + if not _skip_checks: + tensors = tuple( + a + for a in tensors + if isinstance(a, TensorLike) and not is_cpu_scalar_tensor(a) + ) + + # Short-circuits for CPU scalar case + if len(tensors) == 0: + return [] + + # Short-circuits for shapes with zero or one dimensions + # TODO: are these necessary? + ndim = tensors[0].ndim + if ndim == 0: + return [] + if ndim == 1: + return [0] + + # Short-circuits if contiguous, following the fake fast path. + # This reduces the number of guards we end up making + # TODO: do channels last too + is_contiguous = True + for t in tensors: + is_contiguous = is_contiguous and t.is_contiguous( + memory_format=torch.contiguous_format + ) + + if is_contiguous: + return list(range(ndim)) + + shape = tensors[0].shape + + def should_swap(idx_a, idx_b): + for tensor in tensors: + stride_a = tensor.stride()[idx_a] + stride_b = tensor.stride()[idx_b] + + if guard_size_oblivious(stride_a == 0) or guard_size_oblivious( + stride_b == 0 + ): + continue + + if guard_size_oblivious(stride_a < stride_b): + return -1 + + if guard_size_oblivious(stride_a > stride_b): + return 1 + + # stride_a == stride_b + if guard_size_oblivious(shape[idx_a] > shape[idx_b]): + return 1 + + # Note: this case is hit if all strides are zero, + # or all strides are equal and all dimensions have the same length + return 0 + + # The "sort" order for the permutation is back-to-front, but + # the natural order for permutations is front-to-back. Do the + # sorting back-to-front and then reverse it on output. + # + # also, note this returns the logical to physical shape permutation + perm = list(reversed(range(ndim))) + + # insertion sort with support for ambiguous comparisons + for i in range(1, ndim): + dim1 = i + for dim0 in reversed(range(i)): + comparison = should_swap(perm[dim0], perm[dim1]) + if comparison > 0: + perm[dim0], perm[dim1] = perm[dim1], perm[dim0] + dim1 = dim0 + elif comparison < 0: + break + + return list(reversed(perm)) + + +def compute_elementwise_output_strides(*tensors) -> Tuple[int, ...]: + """ + Computes the output strides for elementwise operations. + """ + if len(tensors) == 0: + msg = "Can't compute elementwise output strides for zero tensors!" 
+ raise ValueError(msg) + + check_same_shape(*tensors, allow_cpu_scalar_tensors=True) + + # Filters the tensors to actual tensors + tensors = tuple( + a for a in tensors if isinstance(a, TensorLike) and not is_cpu_scalar_tensor(a) + ) + + # Short-circuits for CPU scalar case + if len(tensors) == 0: + return () + + ndim = tensors[0].ndim + shape = tensors[0].shape + + if ndim == 0: + return () + if ndim == 1: + return (1,) + + logical_to_physical_perm = compute_elementwise_output_logical_to_physical_perm( + *tensors, _skip_checks=True + ) + permuted_shape = apply_perm(shape, logical_to_physical_perm) # to physical + + new_strides = make_contiguous_strides_for(permuted_shape) + permuted_strides = apply_perm( + new_strides, invert_perm(logical_to_physical_perm) + ) # to logical + + return tuple(permuted_strides) + + +# Identity permutation is [0, 1, 2] +def apply_perm(inp, perm): + ndim = len(inp) + permuted_inp = [-1] * ndim + for idx, x in enumerate(perm): + permuted_inp[idx] = inp[x] + return permuted_inp + + +def invert_perm(perm): + ndim = len(perm) + new_perm = [-1] * ndim + for idx, x in enumerate(perm): + new_perm[x] = idx + return new_perm + + +# +# Common helper functions +# + + +def validate_dim_length(length: int): + """ + Validates that an object represents a valid + dimension length. + """ + + if isinstance(length, (int, torch.SymInt)): + torch._check_is_size(length) + else: + # sometimes called with sympy expression by inductor + assert length >= 0 + + +def validate_shape(shape: ShapeType): + """ + Validates that a sequence represents a valid shape. + """ + + assert isinstance(shape, Sequence), type(shape) + for l in shape: + validate_dim_length(l) + + +def validate_strides(strides: StrideType): + """ + Verifies the object specifies valid strides. + """ + + assert isinstance(strides, Sequence) + for stride in strides: + assert stride >= 0 + + +def validate_idx(rank: int, idx: int): + """ + Validates that idx is a valid index for the given shape. + Assumes the index is already canonicalized. + """ + + assert isinstance(idx, Dim) + assert isinstance(rank, Dim) + + assert idx >= 0 and idx < rank or idx == 0 + + +def validate_dimension_indices(rank: int, indices: DimsSequenceType): + for idx in indices: + validate_idx(rank, idx) + + +def validate_exclusive_idx(rank: int, ex_idx: int): + """ + Validates that ex_idx is a valid exclusive index + for the given shape. + """ + + assert isinstance(ex_idx, Dim) + assert isinstance(rank, Dim) + assert ex_idx > 0 and ex_idx <= rank + + +# "Wraps" a dim (up to one time) for the given rank, allowing dims to be +# specified using negative indices. If `wrap_scalar` is true then scalar +# tensors of rank 0 will allow dimensions in the range [-1, 0]. Otherwise, +# idx should be in the range [-rank, rank-1]. 
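+# For illustration (informal): with rank=4, canonicalize_dim maps -1 -> 3 and
+# 2 -> 2; with rank=0 and wrap_scalar=True, both -1 and 0 map to 0; and
+# canonicalize_dim(4, 4) raises IndexError.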
+def canonicalize_dim(rank: int, idx: int, wrap_scalar: bool = True) -> int: + if rank < 0: + msg = f"Rank cannot be negative but got {rank}" + raise IndexError(msg) + + if rank == 0: + if not wrap_scalar: + msg = f"Dimension specified as {idx} but tensor has no dimensions" + raise IndexError(msg) + rank = 1 + + if idx >= 0 and idx < rank: + return idx + + if idx < 0: + _idx = idx + rank + else: + _idx = idx + + if _idx < 0 or _idx >= rank: + # Same error message as in aten/src/ATen/WrapDimUtils.h:49 + msg = f"Dimension out of range (expected to be in range of [{-rank}, {rank - 1}], but got {idx})" + raise IndexError(msg) + + return _idx + + +# Takes a dimension or sequence of dimensions and "wraps" them, +# mapping negative offsets to positive ones +@overload +def canonicalize_dims( + rank: int, indices: Sequence[int], wrap_scalar: bool = True +) -> Tuple[int, ...]: + pass + + +@overload +def canonicalize_dims(rank: int, indices: int, wrap_scalar: bool = True) -> int: + pass + + +def canonicalize_dims(rank, indices, wrap_scalar=True): + if isinstance(indices, Dim): + return canonicalize_dim(rank, indices, wrap_scalar) + + return tuple(canonicalize_dim(rank, x, wrap_scalar) for x in indices) + + +def is_valid_permutation(rank: int, perm: DimsSequenceType) -> bool: + """ + Validates that perm is a permutation of length rank. + """ + + if not isinstance(perm, Sequence): + return False + + if not (tuple(sorted(perm)) == tuple(range(0, rank))): + return False + + return True + + +def is_same_shape(a: Sequence, b: Sequence) -> bool: + """ + Compares two shapes a and b, returning True if they are the same + (their ranks and corresponding lengths match) and False otherwise. + """ + + return tuple(a) == tuple(b) + + +def is_cpu_scalar_tensor(a: Any) -> bool: + return isinstance(a, TensorLike) and a.ndim == 0 and a.device.type == "cpu" + + +def check_same_device(*args, allow_cpu_scalar_tensors): + """ + Checks that all Tensors in args have the same device. + + Raises a RuntimeError when: + - args contains an object whose type is not Tensor or Number + - two Tensor objects in args have different devices, unless one is a CPU scalar tensor and allow_cpu_scalar_tensors is True + """ + # Short-circuits if all (one or fewer) arguments are trivially on the same device + if len(args) <= 1: + return + + # Note: cannot initialize device to the first arg's device (it may not have one) + device = None + for arg in args: + if isinstance(arg, Number): + continue + elif isinstance(arg, TensorLike): + if allow_cpu_scalar_tensors and is_cpu_scalar_tensor(arg): + continue + + if device is None: + device = arg.device + + if device != arg.device: + msg = ( + "Tensor on device " + + str(arg.device) + + " is not on the expected device " + + str(device) + + "!" + ) + raise RuntimeError(msg) + else: + msg = ( + "Unexpected type when checking for same device, " + str(type(arg)) + "!" + ) + raise RuntimeError(msg) + + +def canonicalize_device(device: DeviceLikeType) -> torch.device: + if isinstance(device, torch.device): + return device + + assert isinstance(device, str) + return torch.device(device) + + +# Asserts if any of the following are true: +# - a non-scalar or non-Tensor is given +# - the shape of any tensors is distinct +def check_same_shape(*args, allow_cpu_scalar_tensors: bool): + """ + Checks that all Tensors in args have the same shape. 
+ + Raises a RuntimeError when: + - args contains an object whose type is not Tensor or Number + - two Tensor objects in args have different devices + """ + shape = None + + for arg in args: + if isinstance(arg, Number): + continue + elif isinstance(arg, TensorLike): + if allow_cpu_scalar_tensors and is_cpu_scalar_tensor(arg): + continue + + if shape is None: + shape = arg.shape + + if not is_same_shape(shape, arg.shape): + msg = f"Shape {arg.shape} is not the expected shape {shape}!" + raise RuntimeError(msg) + else: + msg = ( + "Unexpected type when checking for same shape, " + str(type(arg)) + "!" + ) + raise RuntimeError(msg) + + +# Acquires a common shape, if it exists, from one or more tensor arguments, +# filtering number arguments +def extract_shape(*args, allow_cpu_scalar_tensors: bool) -> Optional[ShapeType]: + shape = None + scalar_shape = None + + for arg in args: + if isinstance(arg, Number): + continue + elif isinstance(arg, TensorLike): + if allow_cpu_scalar_tensors and is_cpu_scalar_tensor(arg): + scalar_shape = arg.shape + continue + + if shape is None: + shape = arg.shape + + if not is_same_shape(shape, arg.shape): + return None + else: + return None + + return shape if shape is not None else scalar_shape + + +# Extracts dimensions that might be passed either as a list/tuple or as varargs. +# A typical case is Tensor.permute . +def extract_dims_from_varargs( + dims: Union[DimsSequenceType, Tuple[DimsSequenceType, ...]] +) -> DimsSequenceType: + if dims and isinstance(dims[0], Sequence): + assert len(dims) == 1 + dims = cast(Tuple[DimsSequenceType], dims) + return dims[0] + else: + return cast(DimsSequenceType, dims) + + +def extract_shape_from_varargs( + shape: Union[ShapeType, Tuple[ShapeType]], + validate=True, +) -> Tuple[int, ...]: + """ + Returns a shape from varargs. + + In PyTorch, operations that accept shapes often accept them as varargs, like + foo(*shape). However a user can pass the shape as a sequence of integers, + like this: + + foo(1, 2, 3) + + or as a sequence of integers + + foo((1, 2, 3)) + + In the first case shape will be a tuple of integers, and in the second case it's a tuple + containing a tuple of integers. This validates those inputs and canonicalizes them + to a tuple of integers. + """ + + # Handles tuple unwrapping + if len(shape) == 1 and isinstance(shape[0], Sequence): + shape = shape[0] + + if validate: + validate_shape(shape) # type: ignore[arg-type] + return shape # type: ignore[return-value] + + +def infer_size_shapes(a: ShapeType, b: ShapeType) -> Tuple[int, ...]: + ndim = max(len(a), len(b)) + expandedSizes = [0] * ndim + + for i in range(ndim - 1, -1, -1): + offset = ndim - 1 - i + dimA = len(a) - 1 - offset + dimB = len(b) - 1 - offset + sizeA = a[dimA] if dimA >= 0 else 1 + sizeB = b[dimB] if dimB >= 0 else 1 + + torch._check( + (sizeA == sizeB) or (sizeA == 1) or (sizeB == 1), + lambda: ( + f"The size of tensor a ({sizeA}) must match the size of " + f"tensor b ({sizeB}) at non-jagged dimension {i}" + ), + ) + + # 1s map to the other size (even 0) + expandedSizes[i] = sizeB if sizeA == 1 else sizeA + + return tuple(expandedSizes) + + +def infer_size(shape: ShapeType, numel: int) -> Tuple[int, ...]: + """ + Infers the size of a dim with size -1, if it exists. + Also checks that new shape is compatible with the number of elements. 
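+    For example (informally): infer_size((2, -1, 4), 24) returns (2, 3, 4),
+    while infer_size((5, -1), 12) raises because 12 is not divisible by 5.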
+ """ + dim = None + newsize = 1 + for i, d in enumerate(shape): + if d == -1: + torch._check(dim is None, lambda: "only one dimension can be inferred") + dim = i + elif d >= 0: + newsize *= d + else: + torch._check(False, lambda: f"invalid shape dimension {d}") + if dim is None: + torch._check( + numel == newsize, + lambda: f"shape '{list(shape)}' is invalid for input of size {numel}", + ) + else: + from torch.fx.experimental.symbolic_shapes import definitely_true + + torch._check( + newsize != 0, + lambda: ( + f"cannot reshape tensor of 0 elements into shape {list(shape)} because the " + f"unspecified dimension size -1 can be any value and is ambiguous" + if definitely_true(numel == 0) + else f"shape '{list(shape)}' is invalid for input of size {numel}" + ), + ) + torch._check( + numel % newsize == 0, + lambda: f"shape '{list(shape)}' is invalid for input of size {numel}", + ) + # Convert to list to produce a compatible error message with core + # PyTorch, which prints sequences in square brackets. + shape = list(shape) + shape[dim] = numel // newsize + # NB: This is pretty important when you have unbacked SymInts. + # Suppose you have (i0, 12) resizing into (2, -1, 12). The old + # range for i0 is typically [2, inf], which means if you divide + # by two the new range should be [1, inf]. But this is bad news + # if you have an unbacked SymInt: we need to reapply the unsound + # assumption that the size is >= 2. + torch._check_is_size(shape[dim]) + return tuple(shape) + + +_integer_dtypes = ( + torch.uint8, + torch.uint16, + torch.uint32, + torch.uint64, + torch.int8, + torch.int16, + torch.int32, + torch.int64, +) +_low_precision_dtypes = (torch.float16, torch.bfloat16, torch.complex32) +_complex_dtypes = (torch.complex32, torch.complex64, torch.complex128) + + +def is_boolean_dtype(dtype: torch.dtype) -> bool: + assert isinstance(dtype, torch.dtype) + return dtype is torch.bool + + +def is_integer_dtype(dtype: torch.dtype) -> bool: + assert isinstance(dtype, torch.dtype) + return dtype in _integer_dtypes + + +def is_low_precision_dtype(dtype: torch.dtype) -> bool: + assert isinstance(dtype, torch.dtype) + return dtype in _low_precision_dtypes + + +def is_float_dtype(dtype: torch.dtype) -> bool: + assert isinstance(dtype, torch.dtype) + return dtype.is_floating_point + + +def is_complex_dtype(dtype: torch.dtype) -> bool: + assert isinstance(dtype, torch.dtype) + return dtype in _complex_dtypes + + +def is_grad_dtype(dtype: torch.dtype) -> bool: + """ + Checks if the dtype can require a gradient. + """ + return dtype.is_floating_point or is_complex_dtype(dtype) + + +_complex_to_real_dtype_map = { + torch.complex128: torch.float64, + torch.complex64: torch.float32, + torch.complex32: torch.float16, +} + +_real_to_complex_dtype_map = { + torch.float16: torch.complex32, + torch.bfloat16: torch.complex64, + torch.float32: torch.complex64, + torch.float64: torch.complex128, +} + + +def corresponding_real_dtype(dtype: torch.dtype) -> torch.dtype: + return _complex_to_real_dtype_map[dtype] + + +def corresponding_complex_dtype(dtype: torch.dtype) -> torch.dtype: + return _real_to_complex_dtype_map[dtype] + + +def dtype_to_type(dtype: torch.dtype) -> type: + """ + Computes the corresponding Python type (AKA "type kind") for the + given dtype. 
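+    For example, dtype_to_type(torch.float32) is float, dtype_to_type(torch.int64)
+    is int, and dtype_to_type(torch.complex64) is complex.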
+ """ + assert isinstance(dtype, torch.dtype) + + if dtype is torch.bool: + return bool + if dtype in _integer_dtypes: + return int + if dtype.is_floating_point: + return float + if dtype in _complex_dtypes: + return complex + + raise ValueError("Invalid dtype!") + + +def dtype_to_type_ctor(dtype: torch.dtype) -> Callable[[NumberType], NumberType]: + """ + Computes the corresponding Python type constructor for the + given dtype. + """ + assert isinstance(dtype, torch.dtype) + + if dtype is torch.bool: + return lambda x: bool(x) + if dtype in _integer_dtypes: + return sym_int + if dtype.is_floating_point: + return sym_float + if dtype in _complex_dtypes: + # TODO: type error here is real, replace with sym_complex + return lambda x: complex(x) # type: ignore[arg-type] + + raise ValueError("Invalid dtype!") + + +def type_to_dtype(typ: type) -> torch.dtype: + """ + Computes the corresponding dtype for a Number type. + """ + + assert isinstance(typ, type) + + if typ is bool: + return torch.bool + if typ in [int, torch.SymInt]: + return torch.long + if typ in [float, torch.SymFloat]: + return torch.get_default_dtype() + # TODO: sym_complex_float? + if typ is complex: + return corresponding_complex_dtype(torch.get_default_dtype()) + + raise ValueError("Invalid type!") + + +def get_dtype(x: Union[torch.Tensor, NumberType]): + if isinstance(x, torch.Tensor): + return x.dtype + else: + return type_to_dtype(type(x)) + + +_ordered_types = (bool, int, float, complex) + + +def check_fp_or_complex( + dtype: torch.dtype, fn_name: str, allow_low_precision_dtypes: bool = True +): + """ + Checks whether the input is floating point or complex. + If allow_low_precision_dtypes is True, it allows having float16, bfloat16, and complex32 + """ + torch._check( + is_float_dtype(dtype) or is_complex_dtype(dtype), + lambda: f"{fn_name}: Expected a floating point or complex tensor as input. Got {dtype}", + ) + torch._check( + allow_low_precision_dtypes or not is_low_precision_dtype(dtype), + lambda: f"{fn_name}: Half precision dtypes not supported. Got {dtype}", + ) + + +def check_is_matrix(A: TensorLikeType, f_name: str, arg_name: str = "A"): + torch._check( + len(A.shape) >= 2, + lambda: f"{f_name}: The input tensor {arg_name} must have at least 2 dimensions.", + ) + + +def get_higher_type(a: type, b: type) -> type: + """ + Returns the higher of the two given Number types. + + The types are ordered bool -> int -> float -> complex. + """ + a, b = _maybe_get_pytype(a), _maybe_get_pytype(b) + # Type checking + if a not in _ordered_types or b not in _ordered_types: + raise RuntimeError(f"Expected builtin numeric types, found {a}, {b}") + + if a is b: + return a + + for typ in _ordered_types: + if a is typ: + return b + if b is typ: + return a + + raise ValueError("Unknown Python scalar type!") + + +# Returns the higher of two torch datatypes a and b or, if the two +# are not ordered relative to each other, the next +# higher datatype +def get_higher_dtype( + a: Optional[Union[torch.dtype, TensorLikeType, NumberType]], + b: Optional[Union[torch.dtype, TensorLikeType, NumberType]], +) -> Optional[torch.dtype]: + """ + Computes the "lowest" datatype that is weakly + "higher" than both a and b. 
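+    For example, get_higher_dtype(torch.int64, torch.float16) is torch.float16,
+    while get_higher_dtype(torch.uint8, torch.int8) is torch.int16 because the
+    two are not ordered relative to each other.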
+ """ + + # Type checking + assert a is None or isinstance(a, (torch.dtype, TensorLike, Number)) + assert b is None or isinstance(b, (torch.dtype, TensorLike, Number)) + + def _extract_dtype( + x: Optional[Union[torch.dtype, TensorLikeType, NumberType]] + ) -> Optional[torch.dtype]: + if x is None: + return None + if isinstance(x, torch.dtype): + return x + if isinstance(x, TensorLike): + return x.dtype + if isinstance(x, Number): + return type_to_dtype(type(x)) + + raise RuntimeError("Unexpected type given to _extract_dtype!") + + a, b = _extract_dtype(a), _extract_dtype(b) + + if a is b: + return a + + if a is None: + return b + + if b is None: + return a + + ordered_datatypes = ( + (torch.bool,), + (torch.uint8, torch.int8), + (torch.int16,), + (torch.int32,), + (torch.int64,), + (torch.float16, torch.bfloat16), + (torch.float32,), + (torch.float64,), + (torch.complex32,), + (torch.complex64,), + (torch.complex128,), + ) + + for idx, dtypes in enumerate(ordered_datatypes): + if a in dtypes and b in dtypes: + return ordered_datatypes[idx + 1][0] + if a in dtypes: + return b + if b in dtypes: + return a + + raise RuntimeError("Unexpected termination!") + + +def check_pin_memory(pin_memory: bool): + torch._check_not_implemented( + not pin_memory, lambda: "PrimTorch does not support pinned memory" + ) + + +def check_layout(layout: torch.layout): + torch._check_not_implemented( + layout == torch.strided, lambda: f"PrimTorch doesn't support layout={layout}" + ) + + +# TODO: maybe unify with can_cast_to? +def is_weakly_lesser_type(a: type, b: type) -> bool: + """ + Compares two types, a and b, returning True if a is weakly "less" than b. + + The comparison is determined by the following type ordering: bool, int, float, complex. + """ + + a, b = _maybe_get_pytype(a), _maybe_get_pytype(b) + + if a not in _ordered_types or b not in _ordered_types: + raise RuntimeError(f"Expected builtin numeric types, found {a}, {b}") + + for typ in _ordered_types: + if a == typ: + return True + if b == typ: + return False + + raise RuntimeError("Unexpected termination!") + + +def can_safe_cast_to(*, cast_to: torch.dtype, cast_from: torch.dtype) -> bool: + for fn in (is_complex_dtype, is_float_dtype, is_integer_dtype, is_boolean_dtype): + if fn(cast_to): + return True + if fn(cast_from): + return False + + raise ValueError(f"Received unknown dtypes {cast_to}, {cast_from}!") + + +def check_same_dtype(*args): + """ + Checks that all Tensors in args have the same device and that all Numbers have the + same corresponding Python type. + + Raises a RuntimeError when: + - args contains an object whose type is not Tensor or Number + - two Tensors objects in args have different dtypes + - two Number objects in args have different types + - there are Tensors and Numbers in args, and one of those Tensors corresponding + Python types is different from the type of one of those Numbers + """ + full_dtype = None + scalar_type = None + + for arg in args: + if isinstance(arg, Number): + # Scalar type checking is disabled (and may be removed in the future) + continue + # if scalar_type is None: + # scalar_type = type(arg) + + # if scalar_type is not type(arg): + # msg = ( + # "Scalar of type " + # + str(type(arg)) + # + " is not the expected type of " + # + str(scalar_type) + # + "!" 
+ # ) + # raise RuntimeError(msg) + elif isinstance(arg, TensorLike): + if full_dtype is None: + full_dtype = arg.dtype + if scalar_type is None: + scalar_type = dtype_to_type(arg.dtype) + + if full_dtype is not arg.dtype: + msg = ( + "Tensor with dtype " + + str(arg.dtype) + + " is not the expected dtype of " + + str(full_dtype) + + "!" + ) + raise RuntimeError(msg) + + arg_type = dtype_to_type(arg.dtype) + if arg_type is not scalar_type: + msg = ( + "Tensor with corresponding Python type " + + str(arg_type) + + " is not the expected type of " + + str(scalar_type) + + "!" + ) + raise RuntimeError(msg) + else: + msg = ( + "Unexpected type when checking for same dtype, " + str(type(arg)) + "!" + ) + raise RuntimeError(msg) + + +# Maps datatypes to their computation types for elementwise operations +_computation_dtype_map = { + torch.bfloat16: torch.float32, + torch.float16: torch.float32, + torch.complex32: torch.complex64, +} + + +def get_computation_dtype(dtype: torch.dtype) -> torch.dtype: + return _computation_dtype_map.get(dtype, dtype) + + +_cpu_acc_type_map = { + torch.bfloat16: torch.float64, + torch.float16: torch.float64, + torch.float32: torch.float64, + torch.complex32: torch.complex128, + torch.complex64: torch.complex128, +} + + +def get_acc_type(dtype: torch.dtype, device: torch.device) -> torch.dtype: + # Equivalent to at::toAccumulateType, prefer computation_dtype where possible + if device.type == "cpu": + return _cpu_acc_type_map.get(dtype, dtype) + else: + return get_computation_dtype(dtype) + + +class ELEMENTWISE_TYPE_PROMOTION_KIND(Enum): + DEFAULT = (0,) + NO_OPMATH = (1,) + INT_TO_FLOAT = (2,) + ALWAYS_BOOL = (3,) + COMPLEX_TO_FLOAT = (4,) + BOOL_TO_LONG = (5,) + + +class REDUCTION_OUTPUT_TYPE_KIND(Enum): + SAME = (0,) + COMPLEX_TO_FLOAT = (1,) # for complex types outputs corresponding real type + KEEP_PROMOTED_TYPE = (2,) # keep output in opmath type, needed for mean + ALWAYS_BOOL = (3,) + + +# Describes the return type of the primitive: +# +# - NEW, a new tensor is created +# - VIEW, a view of an input tensor is returned +# - INPLACE, one or more input tensors is modified +# +# these descriptors are mututally exclusive and exhaustive. +class RETURN_TYPE(Enum): + NEW = (0,) + VIEW = (1,) + INPLACE = (2,) + + +# TODO: when NumberType contains the sym types, can simplify this +def number_type(x: Union[NumberType, torch.SymInt, torch.SymFloat]) -> Type: + if isinstance(x, torch.SymInt): + return int + elif isinstance(x, torch.SymFloat): + return float + else: + return type(x) + + +def expr_type(x: sympy.Expr) -> Type: + if x.is_integer: # type: ignore[attr-defined] + return int + else: + # NB: Not strictly correct, but we don't support SymPy complex or bool. + return float + + +# TODO: document type promotion kinds +def elementwise_dtypes( + *_args, + type_promotion_kind: ELEMENTWISE_TYPE_PROMOTION_KIND, +) -> Tuple[torch.dtype, torch.dtype]: + """ + Computes the computation and result dtypes for elementwise type promotion + on the given arguments and with the given elementwise type promotion kind. + + Note that not all inputs to an elementwise operation necessarily participate in type promotion. + For example, the "alpha" parameter of torch.add does not participate in type promotion, + although it may be cast to the Python type corresponding to the computation dtype that + the type promotion algorithm determines. 
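+
+    As a concrete illustration (assuming the default dtype is torch.float32):
+    for an int32 tensor and an int64 tensor, each with one or more dimensions,
+    DEFAULT promotion yields computation and result dtypes (torch.int64,
+    torch.int64), while INT_TO_FLOAT yields (torch.float32, torch.float32).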
+ + Default elementwise type promotion, which all other type promotion kinds tweak (see below), + first decides which of four ordered types to use: + + bool -> integer -> floating point -> complex + + The selected type is the "lowest" type in the above list such that all number arguments + have a weakly "lower" type and all tensor arguments have a weakly lower corresponding + type for their dtype. + + Once the type is determined, the particular result dtype is found. The dtypes are + partially ordered as follows: + + bool -> uint8, int8 -> int16 -> int32 -> int64 -> + float16, bfloat16 -> float32 -> float64 -> complex32 -> complex64 -> complex128 + + The result dtype is selected by: + - if no tensor's dtype has the same corresponding type as the one selected, + then the result dtype is the (default) dtype corresponding to the selected type + (for example, 1.5 + an integer tensor has a result dtype of the default floating point dtype) + - if the result type is complex then the dtype is: + - the default complex dtype if there are no floating point or complex tensors + - if there are floating point or complex tensors with one or more dimensions, then + the complex dtype corresponding to the highest corresponding complex dtype among those tensors + (for example, double + cfloat -> cdouble) + - if there are only floating point or complex tensors with zero dimensions, then + the complex dtype corresponding to the highest corresponding complex dtype among those tensors + - if the first two cases do not apply, the result dtype is the highest dtype among + all tensors with one or more dimensions of the output type, and if there are no such + tensors then it's the highest dtype among all tensors with zero dimensions of the output type + (for example, long + half -> half, even if the half tensor has zero dimensions) + + The "corresponding complex dtypes" are: + float16 -> complex32 + bfloat16 -> complex64 + float32 -> complex64 + float64 -> complex128 + complex32 -> complex32 + complex64 -> complex64 + complex128 -> complex128 + + The DEFAULT type promotion kind computes per above, and then uses the result dtype to pick a computation + dtype by mapping low precision floating point and complex dtypes as follows: + + float16 -> float32 + bfloat16 -> float32 + complex32 -> complex64 + + This is referred to as "op math", and the NO_OPMATH type promotion kind disables this mapping, making the + computation dtype the same as the result dtype when it's selected. NO_OPMATH is appropriate for kernels + which perform no mathematical operations on their tensors (see below for examples). + + The INT_TO_FLOAT type promotion kind maps boolean and integer result dtypes to the default floating point dtype, + and computation dtypes to the appropriate op math dtype. + + The COMPLEX_TO_FLOAT type promotion kind maps complex result dtypes to the corresponding float dtype, following this + mapping: + + complex32 -> float16 + complex64 -> float32 + complex128 -> float64 + + Note that COMPLEX_TO_FLOAT derives the computation dtype as the DEFAULT setting does. + + The BOOL_TO_LONG type promotion kind maps boolean computation and result dtypes to long. + + The ALWAYS_BOOL type promotion kind always sets the result dtype to bool. 
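+
+    As an illustration (assuming the default floating point dtype is torch.float32),
+    combining an integer tensor with a Python float under two of the kinds gives:
+
+        elementwise_dtypes(torch.ones(2, dtype=torch.long), 2.0,
+                           type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT)
+        # -> (torch.float32, torch.float32)
+
+        elementwise_dtypes(torch.ones(2, dtype=torch.long), 2.0,
+                           type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL)
+        # -> (torch.float32, torch.bool)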
+ + Example operators for each type promotion option: + DEFAULT : add + NO_OPMATH : where, nextafter, cat + INT_TO_FLOAT : sin + COMPLEX_TO_FLOAT : abs + BOOL_TO_LONG : pow + ALWAYS_BOOL : eq + + """ + + args = tuple(x for x in _args if x is not None) + + highest_type: type = bool + + # Import sympy locally, as importing it eagerly at a module level is too slow + # See https://dev-discuss.pytorch.org/t/delving-into-what-happens-when-you-import-torch/1589 + import sympy + + for x in args: + if not isinstance(x, (Number, TensorLike, sympy.Expr)): + msg = f"Unexpected type {str(type(x))} when computing elementwise type promotion!" + raise ValueError(msg) + + if isinstance(x, Number): + highest_type = get_higher_type(highest_type, number_type(x)) + elif isinstance(x, sympy.Expr): + highest_type = get_higher_type(highest_type, expr_type(x)) + else: + # x is a TensorLike + highest_type = get_higher_type(highest_type, dtype_to_type(x.dtype)) + + result_dtype = None + + def _find_highest_dtype_filtered( + args, filter, *, float_as_complex=False + ) -> Optional[torch.dtype]: + zero_dim_tensor_dtype = None + one_plus_dim_tensor_dtype = None + for x in args: + if isinstance(x, TensorLike) and filter(x.dtype): + _dtype = x.dtype + if float_as_complex and is_float_dtype(_dtype): + _dtype = corresponding_complex_dtype(_dtype) + if x.ndim == 0: + zero_dim_tensor_dtype = get_higher_dtype( + zero_dim_tensor_dtype, _dtype + ) + else: + # x.ndim > 0 + one_plus_dim_tensor_dtype = get_higher_dtype( + one_plus_dim_tensor_dtype, _dtype + ) + + # Prefers dtype of tensors with one or more dimensions + if one_plus_dim_tensor_dtype is not None: + return one_plus_dim_tensor_dtype + + return zero_dim_tensor_dtype + + if highest_type is float: + result_dtype = _find_highest_dtype_filtered(args, is_float_dtype) + result_dtype = ( + torch.get_default_dtype() if result_dtype is None else result_dtype + ) + elif highest_type is complex: + result_dtype = _find_highest_dtype_filtered( + args, + lambda x: is_float_dtype(x) or is_complex_dtype(x), + float_as_complex=True, + ) + if result_dtype is None: + result_dtype = corresponding_complex_dtype(torch.get_default_dtype()) + elif highest_type is int: + result_dtype = _find_highest_dtype_filtered(args, is_integer_dtype) + result_dtype = torch.long if result_dtype is None else result_dtype + else: + # highest_type is bool + result_dtype = torch.bool + + if type_promotion_kind is ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT: + return get_computation_dtype(result_dtype), result_dtype + elif type_promotion_kind is ELEMENTWISE_TYPE_PROMOTION_KIND.NO_OPMATH: + return result_dtype, result_dtype + elif type_promotion_kind is ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT: + if is_integer_dtype(result_dtype) or is_boolean_dtype(result_dtype): + result_dtype = torch.get_default_dtype() + return get_computation_dtype(result_dtype), result_dtype + elif type_promotion_kind is ELEMENTWISE_TYPE_PROMOTION_KIND.COMPLEX_TO_FLOAT: + # NOTE: computation can still occur in a complex dtype + computation_dtype = get_computation_dtype(result_dtype) + if is_complex_dtype(result_dtype): + result_dtype = corresponding_real_dtype(result_dtype) + return computation_dtype, result_dtype + elif type_promotion_kind is ELEMENTWISE_TYPE_PROMOTION_KIND.BOOL_TO_LONG: + if is_boolean_dtype(result_dtype): + return torch.long, torch.long + return get_computation_dtype(result_dtype), result_dtype + elif type_promotion_kind is ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL: + return get_computation_dtype(result_dtype), 
torch.bool + else: + raise ValueError(f"Unknown type promotion kind {str(type_promotion_kind)}") + + +def reduction_dtypes( + arg, + output_dtype_kind: REDUCTION_OUTPUT_TYPE_KIND, + dtype: Optional[torch.dtype] = None, +) -> Tuple[torch.dtype, Optional[torch.dtype]]: + # even though some reductions, like amin or amax, don't strictly require type promotion, + # all the math ops (including comparisons) are still defined only for a computation type, + # so promotion will still happen. We are doing it explicitly here + inp_dtype = dtype if dtype is not None else arg.dtype + computation_dtype = get_computation_dtype(inp_dtype) + if ( + output_dtype_kind == REDUCTION_OUTPUT_TYPE_KIND.SAME + or output_dtype_kind == REDUCTION_OUTPUT_TYPE_KIND.COMPLEX_TO_FLOAT + ): + result_dtype = dtype if dtype else arg.dtype + if ( + output_dtype_kind == REDUCTION_OUTPUT_TYPE_KIND.COMPLEX_TO_FLOAT + and is_complex_dtype(result_dtype) + ): + result_dtype = corresponding_real_dtype(result_dtype) + elif output_dtype_kind == REDUCTION_OUTPUT_TYPE_KIND.KEEP_PROMOTED_TYPE: + result_dtype = None + else: # ALWAYS_BOOL + result_dtype = torch.bool + return computation_dtype, result_dtype + + +# This function's logic is borrowed from the following functions defined in C++: +# batched_matrix_contiguous_strides and contiguous_strides +def make_contiguous_strides_for( + shape: ShapeType, row_major: bool = True +) -> Tuple[int, ...]: + """ + Returns the strides of a contiguous tensor if row_major + If row_major=True, it returns the strides of a contiguous batch of Fortran-contiguous matrices + This is often used when calling external libraries like BLAS/LAPACK/cuSolver... + """ + # contiguous_strides from c10/util/strides.h + validate_shape(shape) + if not shape: + return () + + from torch.fx.experimental.symbolic_shapes import is_nested_int + + multiplier = 1 + strides = [] + for l in reversed(shape): + strides.append(multiplier) + multiplier *= l if is_nested_int(l) else sym_max(l, 1) + + result = tuple(reversed(strides)) + + # batched_matrix_contiguous_strides from aten/src/ATen/native/LinearAlgebraUtils.h + if row_major: + return result + else: + if len(shape) < 2: + return result + return result[:-2] + (1, max(shape[-2], 1)) + + +def make_channels_last_1d_strides_for(shape: ShapeType) -> Tuple[int, ...]: + torch._check( + len(shape) == 3, + lambda: "Only tensors of rank 3 can use the channels_last_1d memory format", + ) + + multiplier = 1 + strides = [0] * 3 + for idx in (1, -1, 0): + # NOTE: intentionally divergence from make_contiguous_strides_for + # This is consistent with eager + strides[idx] = multiplier + multiplier *= shape[idx] + + return tuple(strides) + + +def make_channels_last_2d_strides_for(shape: ShapeType) -> Tuple[int, ...]: + # TODO: maybe inform the user of channels_last_3d if rank of the tensor is 5? 
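+    # Illustrative example: for an NCHW shape of (2, 3, 4, 5) the loop below
+    # yields strides (60, 1, 15, 3), i.e. the channels dimension varies fastest,
+    # matching torch.channels_last.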
+ torch._check( + len(shape) == 4, + lambda: "Only tensors of rank 4 can use the channels_last memory format", + ) + + multiplier = 1 + strides = [0] * 4 + for idx in (1, -1, -2, 0): + # NOTE: intentionally divergence from make_contiguous_strides_for + # This is consistent with eager + strides[idx] = multiplier + multiplier *= shape[idx] + + return tuple(strides) + + +def make_channels_last_3d_strides_for(shape: ShapeType) -> Tuple[int, ...]: + torch._check( + len(shape) == 5, + lambda: "Only tensors of rank 5 can use the channels_last_3d memory format", + ) + + multiplier = 1 + strides = [0] * 5 + for idx in (1, -1, -2, -3, 0): + # NOTE: intentionally divergence from make_contiguous_strides_for + # This is consistent with eager + strides[idx] = multiplier + multiplier *= shape[idx] + + return tuple(strides) + + +def make_channels_last_strides_for(shape: ShapeType) -> Tuple[int, ...]: + ndim = len(shape) if isinstance(shape, Sequence) else 1 + if ndim == 3: + return make_channels_last_1d_strides_for(shape) + elif ndim == 4: + return make_channels_last_2d_strides_for(shape) + elif ndim == 5: + return make_channels_last_3d_strides_for(shape) + else: + raise RuntimeError( + f"no channels last format strides exist in {ndim} dimensions" + ) + + +def compute_reduction_output_shape( + shape: ShapeType, dimensions: Sequence +) -> Tuple[int, ...]: + for idx in dimensions: + validate_idx(len(shape), idx) + + new_shape = [] + for idx in range(len(shape)): + if idx in dimensions: + continue + + new_shape.append(shape[idx]) + + return tuple(new_shape) + + +def validate_no_repeating_dims(dims: Sequence): + if len(dims) != len(set(dims)): + raise RuntimeError("duplicate value in the list of dims") + + +def reduction_dims(shape: ShapeType, dims: Optional[Sequence]) -> Tuple[int, ...]: + if dims is None: + return tuple(range(len(shape))) + dims = tuple(canonicalize_dim(len(shape), idx) for idx in dims) + validate_no_repeating_dims(dims) + return dims + + +def set_correction( + unbiased: Optional[bool] = None, + correction: Optional[NumberType] = None, +) -> float: + if correction is not None and unbiased is not None: + raise RuntimeError("cannot specify both correction and unbiased arguments") + elif correction is None and unbiased is None: + correction = 1.0 + elif correction is None and unbiased is not None: + correction = 0.0 if unbiased is False else 1.0 + # NB: we don't actually support symint here, but it's harmless to accept + if not isinstance(correction, (IntLike, FloatLike)): + raise ValueError("correction argument should be integer or float") + if correction < 0: + raise ValueError("correction argument should be non-negative") + return sym_float(correction) + + +def compute_required_storage_length( + shape: ShapeType, strides: StrideType, storage_offset: int +) -> int: + """Computes the minimum storage size to hold the given tensor geometry. 
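+
+    For a tensor with at least one element this is
+
+        1 + storage_offset + sum((shape[i] - 1) * strides[i] for i in range(len(shape)))
+
+    and it is 0 when the shape contains a zero-length dimension.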
+ + Example + ======= + + This is the size of a newly allocated tensor's storage, in units of elements + + >>> t = torch.empty((10, 20)) + >>> compute_required_storage_length(t.shape, t.stride(), t.storage_offset()) + 200 + + >>> # xdoctest: +SKIP(failing) + >>> t2 = torch.empty_strided((1, 2, 3), (5, 7, 11)) + >>> size = compute_required_storage_length(t2.shape, t2.stride(), t2.storage_offset()) + >>> size == t.storage().size() + True + + A valid tensor may have a larger storage size, but never smaller + + >>> slice = torch.empty(100)[20:40] + >>> slice.storage().size() + 100 + + >>> compute_required_storage_length(slice.shape, slice.stride(), slice.storage_offset()) + 40 + + """ + from torch.fx.experimental.symbolic_shapes import guard_size_oblivious + + # Short-circuits if the shape has no elements + if guard_size_oblivious(reduce(operator.mul, shape, 1) == 0): + return 0 + + max_offset = sum((x - 1) * y for x, y in zip(shape, strides)) + # +1 to account for the first element which offsets are taken from + return 1 + storage_offset + max_offset + + +def check_in_bounds_for_storage( + a: torch.TypedStorage, shape: ShapeType, strides: StrideType, storage_offset: int +): + """ + Determines if the given shape, strides, and offset are valid for the given storage. + """ + + required_length = compute_required_storage_length(shape, strides, storage_offset) + if a.size() < required_length: + msg = ( + "Can't view a storage of size {} with an offset of {}, shape of {}, and strides of {}, " + "which requires a storage of size {}".format( + a.size(), storage_offset, str(shape), str(strides), required_length + ) + ) + raise ValueError(msg) + + +# NOTE: This function should ideally be removed, but some Meta internal models +# packaged with `torch.package` are using it, so it will have to be removed +# at some point in the future when those models no longer use this function. +def check( + b: bool, s: Callable[[], str], exc_type: Type[Exception] = RuntimeError +) -> None: + """ + Helper function for raising an error_type (default: RuntimeError) if a boolean condition fails. + Error message is a callable producing a string (to avoid wasting time + string formatting in non-error case, and also to make it easier for torchdynamo + to trace.) + + .. note:: This function is planned for removal in the future. Please use + `torch._check*` functions instead. + """ + warnings.warn( + DeprecationWarning( + "'torch._prims_common.check' will be removed in the future. 
Please use " + "'torch._check*' functions instead" + ) + ) + torch._check_with(exc_type, b, s) + + +# This combines is_channels_last_strides_2d and is_channels_last_strides_3d in +# c10/core/MemoryFormat.h into one function +def are_strides_like_channels_last( + shape: Sequence[int], strides: Sequence[int] +) -> bool: + ndim = len(shape) + + if ndim == 4: + # Check for channels_last_2d + dim_order = [1, 3, 2, 0] + elif ndim == 5: + # Check for channels_last_3d + dim_order = [1, 4, 3, 2, 0] + else: + return False + + if strides[1] == 0: + return False + + min = 0 + for d in dim_order: + if shape[d] == 0: + return False + if strides[d] < min: + return False + if d == 0 and min == strides[1]: + return False + min = strides[d] + if strides[d] > 1: + min *= shape[d] + return True + + +def suggest_memory_format(x: TensorLikeType) -> torch.memory_format: + if x.layout != torch.strided: + return torch.contiguous_format + + if are_strides_like_channels_last(x.shape, x.stride()): + return torch.channels_last if x.ndim == 4 else torch.channels_last_3d + + return torch.contiguous_format + + +def prod(xs: Sequence[NumberType]) -> NumberType: + """Product of elements in input sequence. Returns 1 for empty sequence""" + return reduce(operator.mul, xs, 1) + + +def is_expandable_to(shape: ShapeType, desired: ShapeType) -> bool: + """Checks if a shape can be expanded to another shape. + This is equivalent to checking if the two shapes are broadcastable. + """ + # This is a Python implementation of + # aten/src/ATen/ExpandUtils.h:is_expandable_to + if len(shape) > len(desired): + return False + for i in range(len(shape)): + if shape[-i - 1] != desired[-i - 1] and shape[-i - 1] != 1: + return False + return True + + +def mask_tensor(mask: TensorLikeType, t: TensorLikeType): + """ + Similar to torch.where(mask, t, 0) but if t is boolean, + result is also boolean and not promoted to int. + """ + # torch.where(mask, t, False) is equivalent + # but feels hacky and might break in the future + if t.dtype is torch.bool: + return mask.logical_and(t) + else: + return torch.where(mask, t, 0) + + +def get_aten_op(fn: Callable, name: str): + """ + Given the __module__ of reference and its name, it returns + (our best guess of) the ATen name of the associated operation + + Note: In ATen, the __name__ of a function within a module often + starts by the module name. E.g. linalg_eigh, or special_zeta + """ + module = fn.__module__ + prefix = "torch._refs" + assert module.startswith(prefix) + module = module[len(prefix) :] + # We want to go from .special / .nn.functional + # to special and special_ / nn_functional_ + if module: + module = module[1:] + module = module.replace(".", "_") + module = module + "_" + return getattr(torch._ops.ops.aten, f"{module}{name}") + + +def dtype_or_default(dtype: Optional[torch.dtype]) -> torch.dtype: + return dtype if dtype is not None else torch.get_default_dtype() + + +def device_or_default(device: Optional[DeviceLikeType]) -> DeviceLikeType: + return device if device is not None else torch.device("cpu") + + +def layout_or_default(layout: Optional[torch.layout]) -> torch.layout: + return layout if layout is not None else torch.strided + + +def clone_preserve_strides(x): + needed_size = compute_required_storage_length( + x.size(), x.stride(), x.storage_offset() + ) + # Our eager implementations for *_scatter ops are all primitives w.r.t autograd, + # so these as_strided() calls are not seen by autograd. + # We need to mimic this behavior in our ref/prim implementations. 
+ # TODO: a better way to handle this would be with a new op, "_unsafe_as_strided" + # We should revisit this when we add a compositional as_strided op, + # and also as part of https://github.com/pytorch/pytorch/issues/90507 + try: + old = torch._C._dispatch_tls_is_dispatch_key_excluded( + torch._C.DispatchKey.ADInplaceOrView + ) + torch._C._dispatch_tls_set_dispatch_key_excluded( + torch._C.DispatchKey.ADInplaceOrView, True + ) + buffer = torch.as_strided(x, (needed_size,), (1,), 0).clone() + return torch.as_strided(buffer, x.size(), x.stride(), x.storage_offset()) + finally: + torch._C._dispatch_tls_set_dispatch_key_excluded( + torch._C.DispatchKey.ADInplaceOrView, old + ) + + +def alert_not_deterministic(caller: str): + if torch.are_deterministic_algorithms_enabled(): + if torch.is_deterministic_algorithms_warn_only_enabled(): + warnings.warn( + f"{caller} does not have a deterministic implementation, but you set " + f"'torch.use_deterministic_algorithms(True, warn_only=True)'. " + f"You can file an issue at https://github.com/pytorch/pytorch/issues " + f"to help us prioritize adding deterministic support for this operation." + ) + else: + torch._check( + False, + lambda: ( + f"{caller} does not have a deterministic implementation, but you set " + f"'torch.use_deterministic_algorithms(True)'. You can turn off " + f"determinism just for this operation, or you can use the " + f"'warn_only=True' option, if that's acceptable for your application. " + f"You can also file an issue at https://github.com/pytorch/pytorch/issues " + f"to help us prioritize adding deterministic support for this operation." + ), + ) + + +class CUDARngStateHelper: + @staticmethod + def get_torch_state_as_tuple(fake_mode=nullcontext()): + if not torch.cuda.is_available(): + raise RuntimeError("CUDA not available") + + with fake_mode: + seed = torch.tensor(torch.cuda.initial_seed()) + offset = torch.tensor(torch.cuda._get_rng_state_offset()) + return seed, offset + + @staticmethod + def set_torch_state_tensor(seed, offset): + # Rng state is [64-bit seed, 64-bit offset] + seed_portion = seed.reshape([1]).view(torch.uint8) + offset_portion = offset.reshape([1]).view(torch.uint8) + new_state = torch.cat([seed_portion, offset_portion]) + torch.cuda.set_rng_state(new_state) + + @staticmethod + def set_new_offset(relative_offset): + torch.cuda._set_rng_state_offset(relative_offset.item()) diff --git a/MLPY/Lib/site-packages/torch/_prims_common/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_prims_common/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8f307d8e07c12a89f9867b402fd9205f7cb1205d Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_prims_common/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_prims_common/__pycache__/wrappers.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_prims_common/__pycache__/wrappers.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..986cf751a0b536ff60c669a06a358fd8437da660 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_prims_common/__pycache__/wrappers.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_prims_common/wrappers.py b/MLPY/Lib/site-packages/torch/_prims_common/wrappers.py new file mode 100644 index 0000000000000000000000000000000000000000..bafeb88d67391f9ac9ffcbd9f89e34014cddba31 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_prims_common/wrappers.py @@ -0,0 +1,401 @@ +import inspect +import warnings 
+from functools import wraps +from itertools import chain + +from typing import Callable, NamedTuple, Optional, overload, Sequence, Tuple + +import torch +import torch._prims_common as utils +from torch._prims_common import ( + CustomOutParamAnnotation, + ELEMENTWISE_TYPE_PROMOTION_KIND, + Number, + NumberType, + ShapeType, + TensorLike, + TensorLikeType, +) +from torch.utils import _pytree as pytree +from torch.utils._pytree import tree_flatten, tree_unflatten + + +@overload +def _maybe_convert_to_dtype(a: TensorLikeType, dtype: torch.dtype) -> TensorLikeType: + pass + + +@overload +def _maybe_convert_to_dtype(a: NumberType, dtype: torch.dtype) -> NumberType: + pass + + +@overload +def _maybe_convert_to_dtype(a: Sequence, dtype: torch.dtype) -> Sequence: + pass + + +@overload +def _maybe_convert_to_dtype(a: None, dtype: torch.dtype) -> None: + pass + + +# TODO: implement ref.cast with an option to enforce safe casting +def _maybe_convert_to_dtype(a, dtype): + if isinstance(a, TensorLike): + if a.dtype != dtype: + return a.to(dtype) + return a + if isinstance(a, Number): + return utils.dtype_to_type_ctor(dtype)(a) # type: ignore[arg-type] + if isinstance(a, Sequence): + return tuple(_maybe_convert_to_dtype(x, dtype) for x in a) + # Passthrough None because some functions wrapped with type promotion + # wrapper might have optional args + if a is None: + return None + + raise ValueError(f"Received type {type(a)} that is neither a tensor or a number!") + + +def _maybe_convert_to_type(a: NumberType, typ: type) -> NumberType: + if not isinstance(a, Number): + msg = f"Found unknown type {type(a)} when trying to convert scalars!" + raise ValueError(msg) + if not utils.is_weakly_lesser_type(type(a), typ): + msg = f"Scalar {a} of type {type(a)} cannot be safely cast to type {typ}!" + raise ValueError(msg) + + return typ(a) + + +def _annotation_has_type(*, typ, annotation): + if hasattr(annotation, "__args__"): + for a in annotation.__args__: + if _annotation_has_type(typ=typ, annotation=a): + return True + return False + + return typ is annotation + + +class elementwise_type_promotion_wrapper: + """ + Adds elementwise type promotion to a Python reference implementation. + + Takes two kwargs, type_promoting_args and type_promotion_kind. + + type_promoting_args must be a string Sequence specifiying the argument names of all + arguments that participate in type promotion (and should be type promoted). If the + arg specifies a Sequence-type then every element of the Sequence will participate in + type promotion. + + type_promotion_kind must be one of the kinds specified by ELEMENTWISE_TYPE_PROMOTION_KIND. + See its documentation for details. + + The return_dtype will be coerced to the wrapped function's dtype arg if it is available and + not None. + + Other type promotion behavior, like validating the Python type of scalar arguments, must + be handled separately. 
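+
+    A minimal usage sketch (illustrative only; my_add is a hypothetical reference):
+
+        @elementwise_type_promotion_wrapper(
+            type_promoting_args=("a", "b"),
+            type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT,
+        )
+        def my_add(a, b):
+            # a and b arrive converted to the computation dtype; the returned
+            # tensor is converted to the promoted result dtype by the wrapper.
+            return a + b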
+ """ + + def __init__( + self, + *, + type_promotion_kind: ELEMENTWISE_TYPE_PROMOTION_KIND, + type_promoting_args: Optional[Sequence[str]] = None, + ): + self.type_promoting_arg_names = type_promoting_args + self.type_promotion_kind = type_promotion_kind + + def __call__(self, fn: Callable) -> Callable: + sig = inspect.signature(fn) + + @wraps(fn) + def _fn(*args, **kwargs): + bound = sig.bind(*args, **kwargs) + type_promoting_args = tuple( + bound.arguments[x] + for x in self.type_promoting_arg_names # type: ignore[union-attr] + if x in bound.arguments.keys() + ) + + flattened_type_promoting_args = pytree.arg_tree_leaves(*type_promoting_args) + compute_dtype, result_dtype = utils.elementwise_dtypes( + *flattened_type_promoting_args, + type_promotion_kind=self.type_promotion_kind, + ) + + promoted_args = { + x: _maybe_convert_to_dtype(bound.arguments[x], compute_dtype) + for x in self.type_promoting_arg_names # type: ignore[union-attr] + if x in bound.arguments.keys() + } + bound.arguments.update(promoted_args) + + result = fn(**bound.arguments) + + # Override the return_dtype if a dtype arg is present and not None + if "dtype" in bound.arguments: + maybe_dtype = bound.arguments["dtype"] + if maybe_dtype: # dtype cannot be None + result_dtype = maybe_dtype + + if isinstance(result, TensorLike): + return _maybe_convert_to_dtype(result, result_dtype) + if isinstance(result, Sequence): + return tuple(_maybe_convert_to_dtype(x, result_dtype) for x in result) + raise AssertionError(f"Unhandled result type: {type(result)}") + + _fn.__signature__ = sig # type: ignore[attr-defined] + return _fn + + +# Returns True if resize is necessary +def _resize_output_check(out: TensorLikeType, shape: ShapeType): + # If the shapes are correct there's nothing to do + if utils.same_shape(out.shape, shape): + return False + if out.numel() != 0: + msg = ( + f"An output with one or more elements was resized since it had shape {str(out.shape)} " + "which does not match the required output shape {str(shape)}. " + "This behavior is deprecated, and in a future PyTorch release outputs will not " + "be resized unless they have zero elements. " + "You can explicitly reuse an out tensor t by resizing it, inplace, to zero elements with t.resize_(0)." 
+ ) + warnings.warn(msg) + return True + + +# TODO: handle tuples of tensors +def _maybe_resize_out(out: TensorLikeType, shape: ShapeType): + if _resize_output_check(out, shape): + return out.resize_(shape) + else: + return out + + +def _safe_copy_out( + *, copy_from: TensorLikeType, copy_to: TensorLikeType, exact_dtype: bool = False +): + # Checks same device + if copy_from.device != copy_to.device: + msg = "Attempting to copy from device {} to device {}, but cross-device copies are not allowed!".format( + copy_from.device, copy_to.device + ) + raise RuntimeError(msg) + + # Checks safe cast + if exact_dtype: + torch._check( + copy_from.dtype == copy_to.dtype, + lambda: f"Expected out tensor to have dtype {copy_from.dtype} " + f"but got {copy_to.dtype} instead", + ) + else: + torch._check( + utils.can_safe_cast_to(cast_from=copy_from.dtype, cast_to=copy_to.dtype), + lambda: f"Attempting to cast from {copy_from.dtype} to out tensor with dtype {copy_to.dtype}, " + "but this can't be cast because it is not safe!", + ) + + return copy_to.copy_(copy_from) + + +def out_wrapper(*out_names: str, exact_dtype: bool = False, pass_is_out: bool = False): + # The wrapped function needs to convert the output parameters to ensure + # compatibility between the Python API (which always uses "out" as the + # parameter name and may be a tuple) and the Aten API (which may have + # multiple output parameters and use different parameter names such as + # "grad_input", "indices" or "values".) + + default_out_names = ("out",) + if len(out_names) == 0: + # Use default in out name + out_names = default_out_names + + is_tensor = len(out_names) == 1 + + def _out_wrapper(fn: Callable) -> Callable: + """ + Adds the out parameter to a Python reference. + """ + out_type = ( + TensorLikeType + if is_tensor + else Tuple[tuple(TensorLikeType for _ in range(len(out_names)))] + ) + return_type = ( + TensorLikeType + if is_tensor + else NamedTuple( + f"return_types_{fn.__name__}", [(o, TensorLikeType) for o in out_names] + ) + ) + + sig = inspect.signature(fn) + factory_kwargs = ("device", "dtype") + is_factory_fn = all(p in sig.parameters for p in factory_kwargs) + + @wraps(fn) + def _fn(*args, out=None, **kwargs): + if is_factory_fn and out is not None: + for k in factory_kwargs: + out_attr = getattr(out, k) + if k not in kwargs: + kwargs[k] = out_attr + if pass_is_out: + result = fn(*args, is_out=(out is not None), **kwargs) + else: + result = fn(*args, **kwargs) + assert ( + isinstance(result, TensorLike) + and is_tensor + or isinstance(result, Tuple) # type: ignore[arg-type] + and len(result) == len(out_names) + ) + if out is not None: + # Naively you might expect this assert to be true, but + # it's not: + # + # assert type(out) == type(result) + # + # The reason is that functions under this wrapper can + # get registered to the Meta dispatch key, and that + # means they can be executed in a context where tensor + # subclasses are disabled (with no_dispatch), which is a + # handy way for an is-a tensor subclass (e.g., + # FakeTensor) to have the normal meta backend create a + # meta tensor, to be wrapped once it gets returned. + # In this situation, you will get a FakeTensor as + # the output tensor, but not the result--which will + # be a normal meta tensor, but this is perfectly + # harmless. 
+ if is_tensor: + assert isinstance(out, TensorLike) + # These two operations are done in-place + _maybe_resize_out(out, result.shape) + _safe_copy_out(copy_from=result, copy_to=out, exact_dtype=exact_dtype) # type: ignore[arg-type] + else: + assert isinstance(out, Tuple) # type: ignore[arg-type] + torch._check_type( + len(out) == len(result), + lambda: f"expected tuple of {len(result)} elements but got {len(out)}", + ) + for r, o in zip(result, out): + # These two operations are done in-place + _maybe_resize_out(o, r.shape) + _safe_copy_out(copy_from=r, copy_to=o, exact_dtype=exact_dtype) # type: ignore[arg-type] + else: + out = result + # mypy does not see through the definition of out_type given that it's in a different scope + return out if is_tensor else return_type(*out) # type: ignore[operator] + + out_param = inspect.Parameter( + "out", + kind=inspect.Parameter.KEYWORD_ONLY, + default=None, + annotation=out_type, + ) + # Mark that the function now returns a tuple + assert isinstance(sig.return_annotation, str) or sig.return_annotation in ( + sig.empty, + out_type, + ) + params = chain(sig.parameters.values(), (out_param,)) + _fn.__signature__ = inspect.Signature( # type: ignore[attr-defined] + parameters=params, return_annotation=return_type # type: ignore[arg-type] + ) + + _fn.__annotations__ = fn.__annotations__ + _fn.__annotations__["out"] = out_type + _fn.__annotations__["return"] = return_type + + # In the special case of having a single tensor out parameter with a + # name other than out, add a special annotation to name the parameter + if is_tensor and out_names != default_out_names: + _fn.__annotations__[CustomOutParamAnnotation] = out_names[0] + + # Add an indicator attribute that can be used in special cases + # where having a function wrapped by `out_wrapper` is not desirable e.g. + # jit + _fn._torch_decompositions_out_wrapper = f"This function is wrapped by {out_wrapper.__module__}.out_wrapper" # type: ignore[attr-defined] + + return _fn + + return _out_wrapper + + +def _maybe_remove_out_wrapper(fn: Callable): + return inspect.unwrap( + fn, + stop=lambda f: not hasattr(f, "_torch_decompositions_out_wrapper"), + ) + + +def backwards_not_supported(prim): + def redispatch_prim(args, kwargs): + with torch._C._AutoDispatchBelowAutograd(): + old = torch._C._dispatch_tls_is_dispatch_key_excluded( + torch._C.DispatchKey.ADInplaceOrView + ) + return prim(*args, **kwargs) + + class BackwardsNotSupported(torch.autograd.Function): + @staticmethod + def forward(ctx, args_spec, *flat_args): + args, kwargs = tree_unflatten(flat_args, args_spec) # type: ignore[arg-type] + return redispatch_prim(args, kwargs) + + @staticmethod + def backward(ctx, *args): + raise RuntimeError("backwards not supported on prim") + + @wraps(prim) + def _autograd_impl(*args, **kwargs): + flat_args, args_spec = tree_flatten((args, kwargs)) + if torch.is_grad_enabled() and any( + a.requires_grad for a in flat_args if isinstance(a, torch.Tensor) + ): + # TODO: There is a subtle bug here: prims like copy_to + # return their input argument after mutating it; and custom + # autograd function will incorrectly turn the result into + # a view which will fail test_python_ref_executor tests. + # At the moment, we sidestep this by observing that the + # unit tests don't ever try to run the executor with + # autograd, so we don't exercise the buggy case, but if + # you ever want to feed autograd through this, be aware + # of it! 
We need a way of properly implementing autograd + # for mutating operations in Python to do this. + return BackwardsNotSupported.apply(args_spec, *flat_args) + else: + return redispatch_prim(args, kwargs) + + return _autograd_impl + + +# TODO: when tracing this will add torch tensors and not TensorMeta objects +# to the trace -- we should fix this by adding a tracing context and NumberMeta classes +# TODO: this wrapper is currently untested +def elementwise_unary_scalar_wrapper(fn: Callable) -> Callable: + """ + Allows unary operators that accept tensors to work with Python numbers. + """ + sig = inspect.signature(fn) + + @wraps(fn) + def _fn(*args, **kwargs): + if len(args) > 0 and isinstance(args[0], Number): + dtype = utils.type_to_dtype(type(args[0])) + args_ = list(args) + args_[0] = torch.tensor(args[0], dtype=dtype) + result = fn(*args_, **kwargs) + assert isinstance(result, torch.Tensor) + return result.item() + + return fn(*args, **kwargs) + + _fn.__signature__ = sig # type: ignore[attr-defined] + return _fn diff --git a/MLPY/Lib/site-packages/torch/_python_dispatcher.py b/MLPY/Lib/site-packages/torch/_python_dispatcher.py new file mode 100644 index 0000000000000000000000000000000000000000..fb40dae036840fb44662d7ee13173f1c671be15e --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_python_dispatcher.py @@ -0,0 +1,181 @@ +import re + +import torch._C as C + + +""" +PythonDispatcher class is a thin python-binding to C++ dispatcher and it +is designed to show how dispatcher precompute works. In particular, +it shows for a certain op `foo`, what the computed dispatch table looks +like after user register their kernels to certains dispatch keys. + +In the real C++ dispatcher we support many dispatch keys for different +functionalities. For simplicity PythonDispatcher only supports dispatch +keys for a single example of each use case. These use cases are listed below: + +- CPU/AutogradCPU: represents in-tree backends which we usually have dedicated inference & + autograd kernel in pytorch core library. + E.g. CPU, CUDA +- FPGA/AutogradOther: represents in-tree backends which we usually have backend specific + inference kernels, but they share the same autograd kernel specified in AutogradOther. + E.g. FPGA, SparseCsrCPU +- XLA/AutogradXLA: represents out-of-tree backends which we don't have either inference or autograd + kernel defined in pytorch core library. Backend owner is responsible for registering both + inference & autograd kernels in their extensions(e.g. torch-xla) for the operators they support. + E.g. XLA, XPU, MPS +- CompositeExplicitAutograd: alias key mapped to inference kernels of all backends like CPU, CUDA, XLA etc. + Kernels registered to this key MUST work for inference for all backends. +- Autograd: alias key mapped to autograd of all backends like AutogradCPU, AutogradXLA, AutogradOther. + Kernels registered to this key MUST work for autograd for all backends. +- CompositeImplicitAutograd: alias key CompositeImplicitAutograd = CompositeExplicitAutograd + Autograd + Kernels registered to this key MUST work for both inference + autograd for all backends. + +Note we only allow registrations to alias keys inside pytorch core library. E.g +you shouldn't register a CompositeImplicitAutograd or CompositeExplicitAutograd +kernel from torch-xla extension, instead you should upstream the kernel into +pytorch/pytorch repo so that it's available for all backends and continuously +tested even without the extension. 
+ +Usage: + dispatcher = PythonDispatcher() + dispatcher.register(["CPU", "XLA", "CompositeImplicitAutograd"]) + print(dispatcher.dispatchTable()) # This tells you exactly which kernel is used for certain backend. + # For more debugging information + # print(dispatcher.keys()) + # print(dispatcher.registrations()) + # print(dispatcher.rawRegistrations()) + # print(dispatcher.rawDispatchTable()) +PythonDispatcher calls C++ dispatcher under the hood for to precompute dispatch table. +This file only provides the simplified API for developers, relevant test code is located in +test/test_dispatch.py +""" + + +class PythonDispatcher: + namespace = "__test__" + name = "foo" + # fmt: off + runtime_keys = [ + "CPU", "AutogradCPU", + "FPGA", "AutogradOther", + "XLA", "AutogradXLA", + "Lazy", "AutogradLazy", + ] + # fmt: on + alias_keys = [ + "CompositeExplicitAutograd", + "Autograd", + "CompositeImplicitAutograd", + ] + supported_keys = runtime_keys + alias_keys + + def __init__(self): + C._dispatch_check_invariants(self.name) # type: ignore[attr-defined] + self.ref = C._dispatch_library("FRAGMENT", self.namespace, "") + self.ref.def_("foo(Tensor x) -> Tensor") + + """ + Returns a list of dispatch keys supported by PythonDispatcher. + You can register kernels to these keys. + """ + + def keys(self): + return self.supported_keys + + """ + Register kernels to the target dispatchKeys. + dispatchKeys(list[str]): a list of dispatch keys that you want to register + your own kernel. Note that you don't need to write the kernel yourself in + this PythonDispatcher.E.g. for CPU key, a kernel(e.g fn_CPU for CPU) is + automatically generated and registered. + """ + + def register(self, dispatchKeys): + # Overriden is not supported and triggers a warning in C++ dispatcher. + if len(set(dispatchKeys)) != len(dispatchKeys): + raise RuntimeError( + f"Overriden is not allowed but found duplicates in {dispatchKeys}." + ) + # We currently forbid this in codegen instead of C++ dispatcher. + if ( + "CompositeImplicitAutograd" in dispatchKeys + and "CompositeExplicitAutograd" in dispatchKeys + ): + raise RuntimeError( + "Registration to both CompositeImplicitAutograd and CompositeExplicitAutograd is not allowed." + ) + for key in dispatchKeys: + if key not in self.supported_keys: + raise RuntimeError( + f"{key} is not supported, please select a dispatch key in {self.supported_keys}." + ) + self.ref.impl_t_t("foo", dispatch=key, debug="fn_" + key) + + """ + Helper function to format (key, kernel). + """ + + def _format_line(self, key, kernel): + return f"{key:<15} {kernel}\n" + + """ + Helper function to print a table header. + """ + + def _format_header(self, header): + s = f""" +{header} +""" + s += self._format_line("key", "kernel") + s += "---------------------------\n" + return s + + """ + Returns raw output of all registration info for debugging only. + Use registrations() for a simplified version. + """ + + def rawRegistrations(self): + return C._dispatch_dump(f"{self.namespace}::{self.name}") # type: ignore[attr-defined] + + """ + Returns raw output of computed dispatch table for debugging only. + Use dispatchTable() for a simplified version. + """ + + def rawDispatchTable(self): + return C._dispatch_dump_table(f"{self.namespace}::{self.name}") # type: ignore[attr-defined] + + """ + Returns a table(str) including all the registrations from users. + Note this includes registrations to both runtime keys and alias keys. 
+ """ + + def registrations(self): + output = self._format_header("Registered Kernels") + state = self.rawRegistrations() + state_entries = state.split("\n") + for line in state_entries: + first = line.split(":")[0] + if any(first.startswith(k) for k in self.supported_keys): + kernel = line.split("::")[0].split(" ")[1] + output += self._format_line(first, kernel) + return output + + """ + Returns the computed dispatch table(str). Note this only include + runtime keys, registrations to alias keys have been decoded to their + mapped runtime keys. + """ + + def dispatchTable(self): + output = self._format_header("Computed Dispatch Table") + table = self.rawDispatchTable() + table_entries = table.split("\n") + regex = re.compile(r"registered at .*FallbackKernel\.cpp.*(\[)") + for line in table_entries: + k = line.split(":")[0] + if k in self.runtime_keys: + entry = regex.sub("[", line) + output += self._format_line(k, entry.split(": ")[1]) + return output diff --git a/MLPY/Lib/site-packages/torch/_refs/__init__.py b/MLPY/Lib/site-packages/torch/_refs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5a96d1f82d8156c14cb8ceb254d6057127b1f053 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_refs/__init__.py @@ -0,0 +1,6443 @@ +import builtins +import collections +import inspect +import itertools +import math +import operator +import warnings + +from collections.abc import Iterable +from enum import Enum +from functools import partial, reduce, singledispatch, wraps +from typing import Any, Callable, Dict, List, Optional, overload, Sequence, Tuple, Union + +import torch + +import torch._prims as prims +import torch._prims_common as utils +from torch import sym_float, sym_int +from torch._prims_common import ( + DeviceLikeType, + Dim, + DimsSequenceType, + DimsType, + dtype_to_type, + ELEMENTWISE_TYPE_PROMOTION_KIND, + FloatLike, + FloatWithoutSymFloat, + IntLike, + is_weakly_lesser_type, + Number, + NumberType, + RealNumberType, + REDUCTION_OUTPUT_TYPE_KIND, + ShapeType, + StrideType, + TensorLike, + TensorLikeType, + TensorOrNumberLikeType, + TensorSequenceType, +) +from torch._prims_common.wrappers import ( + _maybe_convert_to_dtype, + _maybe_resize_out, + _safe_copy_out, + elementwise_type_promotion_wrapper, + elementwise_unary_scalar_wrapper, + out_wrapper, +) + +# Experimental module containing prototype Python references for existing +# PyTorch operations. 
+ +__all__ = [ + # + # Elementwise Unary References + # + "abs", + "acos", + "acosh", + "asinh", + "asin", + "atan", + "atanh", + "bitwise_not", + # "cbrt", # No corresponding torch operation + "ceil", + "conj_physical", + "cos", + "cosh", + "count_nonzero", + "deg2rad", + "digamma", + "erf", + "erfinv", + "erfc", + "exp", + "expm1", + "exponential", + "exp2", + "fill", + "fill_", + "floor", + "frac", + "geometric", + "index_add", + "index_copy", + "index_copy_", + "index_select", + "index_fill", + "index_fill_", + "isfinite", + "isinf", + "isposinf", + "isneginf", + "isnan", + "isreal", + "i0", + "lerp", + "lgamma", + "log", + "log1p", + "log2", + "log10", + "log_normal", + "log_softmax", + "mvlgamma", + "norm", + "normal", + "nan_to_num", + "neg", + "positive", + "rad2deg", + "reciprocal", + "round", # TODO: model kwargs + "sigmoid", + "sgn", + "sign", + "signbit", + "sin", + "sinc", + "sinh", + "softmax", + "sqrt", + "square", + "tan", + "tanh", + "trace", + "trunc", + # + # Elementwise Binary References + # + "add", + "atan2", + "bitwise_and", + "bitwise_left_shift", + "bitwise_or", + "bitwise_right_shift", + "bitwise_xor", + "clamp_min", + "clamp_max", + "copysign", + "div", + "eq", + "float_power", + "floor_divide", + "fmax", + "fmin", + "fmod", + "gcd", + "ge", + "gt", + "heaviside", + "hypot", + "igamma", + "igammac", + "imag", + "isclose", + "lcm", + # 'ldexp', + "le", + "logaddexp", + "logaddexp2", + "logical_and", + "logical_not", + "logical_or", + "logical_xor", + "logsumexp", + "lt", + # 'max', # implement with reductions + "maximum", + # 'min', # implement with reductions + "minimum", + "mul", + "ne", + "nextafter", + # 'polar', # abs, cos, sin + "pow", + "real", + "rpow", + "remainder", + "rsub", + "rtruediv", + "rfloordiv", + "sub", + "true_divide", + "trunc_divide", + "xlogy", + # + # Elementwise Ternary References + # + "addcdiv", + "addcmul", + "clamp", + # + # Conditional references + # + "masked_fill", + "masked_fill_", + "where", + # + # Data conversion and movement references + # + "clone", + "copy_to", # TODO: add OpInfo (or implement .to) + "item", + "to", + # + # Reduction ops + # + "all", + "amax", + "amin", + "any", + "cumsum", + "cumprod", + "mean", + "dot", + "vdot", + "std", + "std_mean", + "sum", + "sum_to_size", + "prod", + "var", + "var_mean", + # + # Linear algebra ops + # + "addr", + # + # View & Shape Ops + # + "alias", + "atleast_1d", + "atleast_2d", + "atleast_3d", + "as_strided", + "as_strided_scatter", + "block_diag", + "broadcast_shapes", + "broadcast_tensors", + "broadcast_to", + "cat", + "chunk", + "column_stack", + "conj", + "constant_pad_nd", + "contiguous", + "diag_embed", + "diag", + "diagonal", + "diagonal_copy", + "diagonal_scatter", + "dsplit", + "dstack", + "expand", + "expand_as", + "flatten", + "flip", + "fliplr", + "flipud", + "hsplit", + "hstack", + "meshgrid", + "movedim", + "narrow", + "narrow_copy", + "native_group_norm", + "native_layer_norm", + "permute", + "ravel", + "repeat", + "reshape", + "reshape_as", + "roll", + "rot90", + "rsqrt", + "stack", + "swap_axes", # alias for transpose + "squeeze", + "t", + "T", + "take_along_dim", + "tensor_split", + "transpose", + "unfold", + "unfold_copy", + "unsqueeze", + "view", + "view_as", + "vsplit", + "vstack", + "view_as_complex", + "unflatten", + "unbind", + "triu", + "tril", + "triu_indices", + "tril_indices", + # + # Tensor Creation + # + "arange", + "cauchy", + "empty", + "empty_like", + "empty_permuted", + "empty_strided", + "eye", + "full", + "full_like", + "linspace", + 
"logspace", + "new_empty", + "new_empty_strided", + "new_full", + "new_ones", + "new_zeros", + "ones", + "ones_like", + "randn", + "scalar_tensor", + "zero", + "zeros", + "zeros_like", + # + # Test-related functions + # + "allclose", + "equal", + # + # Statistical operations + # + "bucketize", + # + # Misc + # + "is_complex", + "renorm", + "stft", + "istft", +] + +Tensor = torch.Tensor +DispatchKey = torch._C.DispatchKey # type: ignore[attr-defined] +aten = torch._ops.ops.aten + +# Note that the docstrings for the public methods from this file are in +# torch/_torch_docs.py + + +def is_noncontiguous_supported(device): + if device is not None and device.type == "hpu": + return False + return True + + +def handle_noncontiguous_outputs(input_tlist, output): + device = None + from torch._subclasses.fake_tensor import FakeTensor + + for t in input_tlist: + if isinstance(t, FakeTensor): + device = t.fake_device + break + + if not is_noncontiguous_supported(device): + output = output.contiguous() + + return output + + +def _broadcast_shapes(*_shapes): + from torch.fx.experimental.symbolic_shapes import guard_size_oblivious + + shapes = tuple( + (x,) if isinstance(x, IntLike) else x + for x in filter(lambda x: x is not None, _shapes) + ) + + # Short-circuits on no input + if len(shapes) == 0: + return None + + # Type checking + # TODO: make common validations available as utils + for shape in shapes: + assert isinstance(shape, Sequence) + + # Computes common shape + common_shape = [ + 1, + ] * reduce(max, (len(shape) for shape in shapes)) + for arg_idx, shape in enumerate(shapes): + for idx in range(-1, -1 - len(shape), -1): + if guard_size_oblivious(common_shape[idx] == 1): + if shape[idx] < 0: + raise ValueError( + "Attempting to broadcast a dimension with negative length!" + ) + common_shape[idx] = shape[idx] + elif guard_size_oblivious(shape[idx] != 1): + if common_shape[idx] != shape[idx]: + raise RuntimeError( + f"Attempting to broadcast a dimension of length {shape[idx]} at {idx}! " + f"Mismatching argument at index {arg_idx} had {shape}; but expected shape " + f"should be broadcastable to {common_shape}" + ) + + return common_shape + + +def _maybe_broadcast(*args, preserve_cpu_scalar_tensors=True): + # Computes common shape + common_shape = _broadcast_shapes( + *(t.shape if isinstance(t, TensorLike) else None for t in args) + ) + + def __maybe_broadcast(x, shape): + if x is None: + return None + elif isinstance(x, Number): + return x + elif isinstance(x, TensorLike): + if preserve_cpu_scalar_tensors and utils.is_cpu_scalar_tensor(x): + return x + + if not utils.same_shape(x.shape, common_shape): + return x.expand(common_shape) + + return x + else: + raise RuntimeError( + "Unexpected type when broadcasting: " + str(type(x)) + "!" 
+ ) + + return tuple(__maybe_broadcast(x, common_shape) for x in args) + + +# Utilities should come BEFORE this import +from torch._decomp import register_decomposition + +# +# Elementwise unary references +# + +infer_aten_op = object() + + +# TODO: add type promotion support +def _make_elementwise_unary_reference( + type_promotion_kind, + *, + aten_op=infer_aten_op, + extra_meta=None, +) -> Callable: + def inner(prim: Callable): + nonlocal aten_op + + @wraps(prim) + @out_wrapper() + @elementwise_unary_scalar_wrapper + @elementwise_type_promotion_wrapper( + type_promoting_args=("a",), + type_promotion_kind=type_promotion_kind, + ) + def _ref(a: TensorLikeType) -> TensorLikeType: + if extra_meta is not None: + extra_meta(a) + + output = prim(a) + return handle_noncontiguous_outputs([a], output) + + if aten_op is infer_aten_op: + aten_op = utils.get_aten_op(prim, prim.__name__) + if aten_op is not None: + register_decomposition(aten_op)(_ref) + + return _ref + + return inner + + +def _make_alias(fn, name): + """ + This function defines an alias of another function and sets its __name__ argument. + It also sets its __module__ argument to the module of the caller. + Note that when naïvely doing `alias = fn`, we have that `alias.__name__ == "fn"`, and + `alias.__module__ == fn.__module__`. + """ + + def _fn(*args, **kwargs): + return fn(*args, **kwargs) + + _fn.__name__ = name + _fn.__module__ = inspect.currentframe().f_back.f_globals["__name__"] # type: ignore[union-attr] + return _fn + + +def _make_inplace(fn): + """ + Given a function with out variant (i.e. using `out_wrapper()), it returns its in-place variant + See https://github.com/pytorch/pytorch/wiki/Developer-FAQ#how-do-in-place-operations-work-in-pytorch + """ + + # nb. We use the name of the first argument used in the unary references + @wraps(fn) + def _fn(a, *args, **kwargs): + return fn(a, *args, out=a, **kwargs) + + inplace_name = f"{fn.__name__}_" + _fn.__name__ = inplace_name + _fn = register_decomposition(getattr(aten, inplace_name))(_fn) + + # We access the __all__ attribute of the module where fn is defined + # There may be a cleaner way of doing this... 
+ from inspect import getmodule + + _all = getmodule(fn).__all__ # type: ignore[union-attr] + if inplace_name not in _all: + _all.append(inplace_name) + return _fn + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.COMPLEX_TO_FLOAT) +def abs(a): + return prims.abs(a) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) +def acos(a): + return prims.acos(a) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) +def acosh(a): + return prims.acosh(a) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) +def asin(a): + return prims.asin(a) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) +def asinh(a): + return prims.asinh(a) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) +def atan(a): + return prims.atan(a) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) +def atanh(a): + return prims.atanh(a) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT) +def bitwise_not(a): + return prims.bitwise_not(a) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT) +def ceil(a): + return prims.ceil(a) + + +@register_decomposition(aten.is_complex) +def is_complex(input: TensorLikeType): + return utils.is_complex_dtype(input.dtype) + + +@register_decomposition(aten.conj_physical) +@out_wrapper() +def conj_physical(input: TensorLikeType): + if not utils.is_complex_dtype(input.dtype): + return input + return prims.conj_physical(input) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) +def cos(a): + return prims.cos(a) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) +def cosh(a): + return prims.cosh(a) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) +def digamma(a): + return prims.digamma(a) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) +def erf(a): + return prims.erf(a) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) +def erfinv(a): + return prims.erf_inv(a) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) +def erfc(a): + return prims.erfc(a) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) +def exp(a): + return prims.exp(a) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) +def expm1(a): + return prims.expm1(a) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) +def exp2(a): + return prims.exp2(a) + + +# Fill has its own implementation because it has a value parameter +# CompositeImplicitAutograd - don't register decomp +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("a,"), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.NO_OPMATH, +) +def fill(a: TensorLikeType, value: NumberType) -> TensorLikeType: + assert isinstance(a, TensorLike) + assert isinstance(value, Number) + + python_type = utils.dtype_to_type(a.dtype) + if not utils.is_weakly_lesser_type(type(value), python_type): + msg = f"value argument of type {type(value)} cannot be safely cast to type {python_type}!" 
+ raise ValueError(msg) + + return prims.fill(a, value) + + +def fill_(a: TensorLikeType, value: NumberType) -> TensorLikeType: + r = prims.fill(a, value) + prims.copy_to(a, r) + return a + + +@register_decomposition(aten.zero) +@out_wrapper() +def zero(input: TensorLikeType) -> TensorLikeType: + return torch.zeros_like(input) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT) +def floor(a): + return prims.floor(a) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT) +def frac(x: TensorLikeType) -> TensorLikeType: + trunc_x = torch.mul(torch.floor(torch.abs(x)), torch.sign(x)) + return torch.sub(x, trunc_x) + + +# imag does not use _make_elementwise_unary_reference because it does not support out +def imag(a: TensorLikeType) -> TensorLikeType: + assert isinstance(a, TensorLike) + torch._check( + utils.is_complex_dtype(a.dtype), lambda: "imag only supports complex tensors." + ) + return prims.imag(a) + + +@_make_elementwise_unary_reference( + ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL, + aten_op=None, # CompositeImplicitAutograd +) +def isfinite(a: TensorLikeType) -> TensorLikeType: + if utils.is_float_dtype(a.dtype) or utils.is_complex_dtype(a.dtype): + return prims.isfinite(a) + + return ones_like(a, dtype=torch.bool) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL) +def isinf(a: TensorLikeType) -> TensorLikeType: + if utils.is_complex_dtype(a.dtype): + return torch.logical_or(isinf(torch.real(a)), isinf(torch.imag(a))) + if utils.is_float_dtype(a.dtype): + return torch.abs(a) == float("inf") + return torch.zeros_like(a, dtype=torch.bool) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL) +def isposinf(a: TensorLikeType) -> TensorLikeType: + torch._check( + not utils.is_complex_dtype(a.dtype), + lambda: f"Complex dtype is not supported for isposinf, got dtype {a.dtype}", + ) + if utils.is_float_dtype(a.dtype): + return a == float("inf") + return torch.zeros_like(a, dtype=torch.bool) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL) +def isneginf(a: TensorLikeType) -> TensorLikeType: + torch._check( + not utils.is_complex_dtype(a.dtype), + lambda: f"Complex dtype is not supported for isneginf, got dtype {a.dtype}", + ) + if utils.is_float_dtype(a.dtype): + return a == float("-inf") + return torch.zeros_like(a, dtype=torch.bool) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL) +def isnan(a: TensorLikeType) -> TensorLikeType: + return prims.ne(a, a) + + +# alias +mvlgamma = _make_alias(torch.special.multigammaln, "mvlgamma") # type: ignore[has-type] + + +@_make_elementwise_unary_reference( + ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL, + aten_op=None, # CompositeImplicitAutograd +) +def isreal(a: TensorLikeType) -> TensorLikeType: + if utils.is_complex_dtype(a.dtype): + return torch.imag(a) == 0 + return torch.ones_like(a, dtype=torch.bool) + + +# TODO: if this is special maybe it should be defined there and imported here? 
+@_make_elementwise_unary_reference( + ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, aten_op=aten.i0 +) +def i0(a): + return prims.bessel_i0(a) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) +def lgamma(a): + return prims.lgamma(a) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) +def log(a): + return prims.log(a) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) +def log1p(a): + return prims.log1p(a) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) +def log2(a): + return prims.log2(a) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) +def log10(a): + return prims.log10(a) + + +# CompositeImplicitAutograd - don't register decomp +@out_wrapper() +def log_softmax( + a: TensorLikeType, + dim: int, + dtype: Optional[torch.dtype] = None, +) -> TensorLikeType: + result_dtype = dtype or a.dtype + computation_dtype = utils.get_computation_dtype(result_dtype) + a_ = _maybe_convert_to_dtype(a, computation_dtype) + return _maybe_convert_to_dtype(a_ - logsumexp(a_, dim, keepdim=True), result_dtype) # type: ignore[return-value] + + +@register_decomposition(aten.logsumexp) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("self",), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) +def logsumexp( + self: TensorLikeType, dim: DimsType, keepdim: bool = False +) -> TensorLikeType: + if not isinstance(dim, Iterable): + dim = (dim,) + if self.numel() == 0: + return torch.sum(torch.exp(self), dim, keepdim).log() + maxes = torch.amax(self, dim, keepdim=True) + maxes = torch.masked_fill(maxes, maxes.abs() == float("inf"), 0) + maxes_squeezed = maxes if keepdim else torch.squeeze(maxes, dim) + result = torch.sum(torch.exp(self - maxes), dim, keepdim) + return result.log().add(maxes_squeezed) + + +@register_decomposition(aten.nan_to_num) +@out_wrapper() +def nan_to_num( + a: TensorLikeType, + nan: Optional[NumberType] = 0.0, + posinf: Optional[NumberType] = None, + neginf: Optional[NumberType] = None, +) -> TensorLikeType: + assert isinstance(a, TensorLike) + + if utils.is_boolean_dtype(a.dtype) or utils.is_integer_dtype(a.dtype): + return a.clone() + + if nan is None: + nan = 0.0 + + if posinf is None: + posinf = torch.finfo(a.dtype).max + + if neginf is None: + neginf = torch.finfo(a.dtype).min + + result = torch.where(torch.isnan(a), nan, a) # type: ignore[call-overload] + result = torch.where(torch.isneginf(a), neginf, result) # type: ignore[call-overload] + result = torch.where(torch.isposinf(a), posinf, result) # type: ignore[call-overload] + return result + + +def _neg_meta(a: TensorLikeType): + torch._check( + a.dtype is not torch.bool, + lambda: ( + "Negation, the `-` operator, on a bool tensor is not supported. " + "If you are trying to invert a mask, use the `~` or `logical_not()` " + "operator instead." + ), + ) + + +@_make_elementwise_unary_reference( + ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, extra_meta=_neg_meta +) +def neg(a): + return prims.neg(a) + + +# positive does not use _make_elementwise_unary_reference because it does not support out +# CompositeImplicitAutograd - don't register decomp +def positive(a: TensorLikeType) -> TensorLikeType: + assert isinstance(a, TensorLike) + if a.dtype is torch.bool: + msg = "positive does not support bool tensors." 
+ raise RuntimeError(msg) + return a + + +# real does not use _make_elementwise_unary_reference because it does not support out +def real(a: TensorLikeType) -> TensorLikeType: + assert isinstance(a, TensorLike) + if utils.is_complex_dtype(a.dtype): + return prims.real(a) + return a + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) +def reciprocal(a): + return prims.reciprocal(a) + + +@register_decomposition(aten.round) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("a",), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def round(a: TensorLikeType, *, decimals: int = 0) -> TensorLikeType: + if decimals == 0: + return prims.round(a) + else: + ten_pow = 10**decimals + ten_neg_pow = 10 ** (-decimals) + return prims.mul(prims.round(prims.mul(a, ten_pow)), ten_neg_pow) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) +def rsqrt(a): + return prims.rsqrt(a) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) +def sigmoid(a: TensorLikeType) -> TensorLikeType: + return true_divide(1, add(1, exp(neg(a)))) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT) +def sgn(a): + if utils.is_complex_dtype(a.dtype): + a_abs = a.abs() + return torch.where(a_abs == 0, 0, a / a_abs) + else: + return a.sign() + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT) +def sign(a): + return prims.sign(a) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL) +def signbit(a): + return prims.signbit(a) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) +def sin(a): + return prims.sin(a) + + +# Autograd note: This will give the right first derivative at zero (by chance), +# but not the right second derivative +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) +def sinc(a): + a = math.pi * a + return torch.where(a == 0, 1, torch.sin(a) / a) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) +def sinh(a): + return prims.sinh(a) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) +def sqrt(a): + return prims.sqrt(a) + + +@_make_elementwise_unary_reference( + ELEMENTWISE_TYPE_PROMOTION_KIND.BOOL_TO_LONG, + aten_op=None, # CompositeImplicitAutograd, +) +def square(a: TensorLikeType) -> TensorLikeType: + return mul(a, a) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) +def tan(a): + return prims.tan(a) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) +def tanh(a): + return prims.tanh(a) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT) +def trunc(a): + return prims.trunc(a) + + +# TODO: register this as a real ref/decomposition once TorchInductor supports complex! 
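+# Illustrative sketch for two of the refs above (doctest-style comment only; the torch._refs
+# module path is assumed, and the printed values are the expected eager results):
+#
+#   >>> torch._refs.round(torch.tensor([1.2345]), decimals=2)  # round(a * 10**2) * 10**-2
+#   tensor([1.2300])
+#   >>> torch._refs.sgn(torch.tensor([3 + 4j]))                # a / abs(a) for complex input
+#   tensor([0.6000+0.8000j])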
+def view_as_complex(self: TensorLikeType) -> TensorLikeType: + input_dtype = self.dtype + torch._check( + utils.is_float_dtype(input_dtype), + lambda: f"view_as_complex is only supported for floating point" + f"tensors, but got a tensor of scalar type: {input_dtype}", + ) + sizes = self.size() + torch._check( + len(sizes) != 0, + lambda: "Input tensor must have one or more dimensions", + ) + torch._check( + sizes[-1] == 2, + lambda: "Tensor must have a last dimension of size 2", + ) + + old_strides = self.stride() + torch._check( + old_strides[-1] == 1, + lambda: "Tensor must have a last dimension with stride 1", + ) + dims = old_strides[:-1] + torch._check( + py_all(stride % 2 == 0 for stride in dims), + lambda: "Tensor must have a stride divisible by 2 for all but last dimension", + ) + torch._check( + self.storage_offset() % 2 == 0, + lambda: "Tensor must have a storage_offset divisible by 2", + ) + return prims.view_element_type( + self, utils.corresponding_complex_dtype(input_dtype) + ).squeeze(-1) + + +def _make_elementwise_binary_reference( + type_promotion_kind, + aten_op=infer_aten_op, + name=None, + has_out=True, + supports_lhs_python_scalar=True, + supports_rhs_python_scalar=True, + supports_two_python_scalars=False, + should_register_decomposition=True, +) -> Callable: + def inner(prim: Callable): + nonlocal aten_op, name + if name is None: + name = prim.__name__ + + @wraps(prim) + @elementwise_type_promotion_wrapper( + type_promoting_args=("a", "b"), + type_promotion_kind=type_promotion_kind, + ) + def _ref( + a: Union[Tensor, NumberType], + b: Union[Tensor, NumberType], + ) -> Tensor: + torch._check_value( + supports_lhs_python_scalar or not isinstance(a, Number), + lambda: f"{name}: Received a lhs Python scalar to an elementwise binary " + "operation that does not accept lhs scalars!", + ) + torch._check_value( + supports_rhs_python_scalar or not isinstance(b, Number), + lambda: f"{name}: Received a rhs Python scalar to an elementwise binary " + "operation that does not accept rhs scalars!", + ) + torch._check_value( + supports_two_python_scalars + or not (isinstance(a, Number) and isinstance(b, Number)), + lambda: f"{name}: Receive two Number inputs to an elementwise binary operation!", + ) + a, b = _maybe_broadcast(a, b) + output = prim(a, b) + return handle_noncontiguous_outputs([a, b], output) + + if has_out: + _ref = out_wrapper()(_ref) + + _ref.__name__ = name + if aten_op is infer_aten_op: + aten_op = utils.get_aten_op(prim, name) + if aten_op is not None and should_register_decomposition: + register_decomposition(aten_op)(_ref) + + return _ref + + return inner + + +# Add has its own implementation because it has an alpha argument +@register_decomposition(aten.add) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("a", "b"), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def add( + a: Union[TensorLikeType, NumberType], + b: Union[TensorLikeType, NumberType], + *, + alpha: Optional[NumberType] = None, +): + """ + Reference implementation of torch.add + """ + + a, b = _maybe_broadcast(a, b) + + if alpha is not None: + dtype = a.dtype if isinstance(a, TensorLike) else b.dtype # type: ignore[union-attr] + python_type = utils.dtype_to_type(dtype) + if python_type != bool and not utils.is_weakly_lesser_type( + type(alpha), python_type + ): + msg = f"alpha argument of type {type(alpha)} cannot be safely cast to type {python_type}!" 
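+            # Illustrative, hypothetical call: add(torch.ones(2, dtype=torch.int64), 1, alpha=0.5)
+            # reaches this branch, since a float alpha cannot be safely cast to the integer result type.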
+ raise ValueError(msg) + if isinstance(b, TensorLike): + b = prims.mul(b, alpha) + else: + b = b * alpha + + output = prims.add(a, b) + return handle_noncontiguous_outputs([a, b], output) + + +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + supports_lhs_python_scalar=False, + supports_rhs_python_scalar=False, +) +def atan2(a, b): + return prims.atan2(a, b) + + +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def bitwise_and(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType: + return prims.bitwise_and(a, b) + + +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def bitwise_left_shift(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType: + return prims.shift_left(a, b) + + +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def bitwise_or(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType: + return prims.bitwise_or(a, b) + + +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def bitwise_right_shift(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType: + return prims.shift_right_arithmetic(a, b) + + +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def bitwise_xor(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType: + return prims.bitwise_xor(a, b) + + +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + supports_lhs_python_scalar=False, +) +def copysign( + a: Union[TensorLikeType, NumberType], b: Union[TensorLikeType, NumberType] +): + if isinstance(b, Number) and isinstance(a, Tensor): + b = scalar_tensor(b, dtype=a.dtype, device=a.device) + elif isinstance(a, Tensor) and isinstance(b, Tensor) and a.device != b.device: + msg = "Expected divisor (b) to be on the same device ({}) as dividend (a), but it is found on {}!".format( + a.device, b.device + ) + raise RuntimeError(msg) + return where(signbit(b), neg(abs(a)), abs(a)) + + +# complex = _make_elementwise_binary_reference(prims.complex, type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT) + + +@register_decomposition(aten.div) +@out_wrapper() +def div( + a: Union[TensorLikeType, NumberType], + b: Union[TensorLikeType, NumberType], + *, + rounding_mode: Optional[str] = None, +): + """ + Reference implementation of torch.div + """ + if rounding_mode is None: + return true_divide(a, b) + elif rounding_mode == "trunc": + return trunc_divide(a, b) + elif rounding_mode == "floor": + return floor_divide(a, b) + else: + msg = f"div expected rounding_mode to be one of None, 'trunc', or 'floor' but found {rounding_mode}." 
+ raise ValueError(msg) + + +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL, + supports_lhs_python_scalar=False, +) +def eq(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType: + return prims.eq(a, b) + + +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.BOOL_TO_LONG, +) +def pow( + a: Union[TensorLikeType, NumberType], + b: Union[TensorLikeType, NumberType], +) -> TensorLikeType: + assert isinstance(a, TensorLikeType) or isinstance(b, TensorLikeType) + + if isinstance(b, Number): + if b == 1.0: + return a.clone() # type: ignore[return-value,union-attr] + elif b == 2.0: + return a * a # type: ignore[return-value] + elif b == 0.5: + return torch.sqrt(a) # type: ignore[arg-type] + elif isinstance(a, Number): + if a == 1.0: + return torch.fill(b, True) + if a == 2.0 and ( + utils.is_float_dtype(b.dtype) or utils.is_complex_dtype(b.dtype) + ): + return torch.exp2(b) + + return prims.pow(a, b) + + +# Float power has its own implementation because it has unique type promotion. +# CompositeImplicitAutograd - don't register decomp +@out_wrapper() +def float_power( + a: Union[TensorLikeType, NumberType], + b: Union[TensorLikeType, NumberType], +) -> Tensor: + if isinstance(a, Number) and isinstance(b, Number): + raise ValueError( + "Receive two Number inputs to an elementwise binary operation!" + ) + + # Handles type promotion + dtype = utils.get_higher_dtype(a, b) + assert dtype is not None + if utils.is_complex_dtype(dtype): + dtype = torch.complex128 + else: + dtype = torch.float64 + + # Float power has the following contiguous cast behavior to be + # consistent with its C++ impl + a = _maybe_convert_to_dtype(a, dtype) + b = _maybe_convert_to_dtype(b, dtype) + + a, b = _maybe_broadcast(a, b) + return pow(a, b) + + +# >>> a = torch.tensor(-0.2500, dtype=torch.float64) +# tensor(-0.250000000000000, dtype=torch.float64) +# +# >>> b = torch.tensor(-0.0010, dtype=torch.float64) +# tensor(-0.001000000000000, dtype=torch.float64) +# +# Note: In this case, casting float to double will expand the float mantissa with zeros, +# while creating a double generates a distinct mantissa. +# >>> torch.tensor(-0.001).to(dtype=torch.float64) +# tensor(-0.001000000047497, dtype=torch.float64) +# +# Floor Division +# The difference is caused because torch.remainder(a, b) = -0.001. +# +# >>> torch.floor(torch.true_divide(a, b)) +# tensor(250., dtype=torch.float64) +# +# >>> torch.div(a, b, rounding_mode='floor') +# tensor(249., dtype=torch.float64) +# +# Definition: a // b = (a - remainder(a, b)) / b +# >>> torch.true_divide(torch.sub(a, torch.remainder(a, b)), b) +# tensor(249., dtype=torch.float64) +# +# For reference, see CPython's implementation: +# https://github.com/python/cpython/blob/ace008c531dd685a30c1dd68f9b5ba35f20171cf/Objects/floatobject.c#L636 + + +@_make_elementwise_binary_reference( + type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, + supports_two_python_scalars=True, + should_register_decomposition=False, +) +def floor_divide( + a: Union[TensorLikeType, NumberType], b: Union[TensorLikeType, NumberType] +): + # Wrap scalars because some references only accept tensor arguments. 
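+    # Illustrative: floor_divide(5, 2) wraps both Python scalars as 0-d tensors, while a mixed
+    # call such as floor_divide(torch.tensor([5.0]), 2) wraps only the scalar, reusing the
+    # tensor operand's dtype and device.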
+ if isinstance(a, Number) and isinstance(b, Number): + a = scalar_tensor(a) + b = scalar_tensor(b) + elif isinstance(b, Number) and isinstance(a, Tensor): + b = scalar_tensor(b, dtype=a.dtype, device=a.device) + elif isinstance(a, Number) and isinstance(b, Tensor): + a = scalar_tensor(a, dtype=b.dtype, device=b.device) + elif isinstance(a, Tensor) and isinstance(b, Tensor) and a.device != b.device: + if a.device == torch.device("cpu"): + msg = "Expected divisor (b) to be on the same device ({}) as dividend (a), but it is found on {}!".format( + a.device, b.device + ) + raise RuntimeError(msg) + else: + b = prims.device_put(b, device=a.device) + + assert isinstance(a, Tensor) and isinstance(b, Tensor) + dtype = a.dtype + if utils.is_float_dtype(dtype): + return _floor_divide_float(a, b) + elif utils.is_integer_dtype(dtype): + return _floor_divide_integer(a, b) + else: + torch._check(False, lambda: f"{dtype} not supported for floor_divide") + + +def _floor_divide_integer(a: Tensor, b: Tensor) -> Tensor: + a, b = _maybe_broadcast(a, b) + + if not a.dtype.is_signed: + return prims.div(a, b) + + # Convert truncation to flooring: + offset = (torch.signbit(a) != torch.signbit(b)).logical_and(torch.fmod(a, b) != 0) + return prims.div(a, b) - _maybe_convert_to_dtype(offset, a.dtype) + + +def _floor_divide_float(a: Tensor, b: Tensor) -> Tensor: + mod = fmod(a, b) + div = true_divide(sub(a, mod), b) + + # Ensure that the remainder has the same sign as denominator + different_signed_inputs = bitwise_xor(lt(a, 0), lt(b, 0)) + non_zero_remainder = ne(mod, 0) + mask = bitwise_and(non_zero_remainder, different_signed_inputs) + div = where(mask, sub(div, 1), div) + + # Map quotient to nearest integer value + floor_div = floor(div) + mask = gt(sub(div, floor_div), 0.5) + floor_div = where(mask, add(floor_div, 1), floor_div) + + basic_div = true_divide(a, b) + zero_tensor = scalar_tensor(0, dtype=basic_div.dtype, device=basic_div.device) + + # If quotient is zero, copy signbit from true_divide quotient + floor_div = where(ne(div, 0), floor_div, copysign(zero_tensor, basic_div)) + + # If denominator is zero, then follow true_divide behavior + return where(ne(b, 0), floor_div, basic_div) + + +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, + supports_lhs_python_scalar=False, + supports_rhs_python_scalar=False, +) +def fmax(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType: + return prims.fmax(a, b) + + +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, + supports_lhs_python_scalar=False, + supports_rhs_python_scalar=False, +) +def fmin(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType: + return prims.fmin(a, b) + + +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, + supports_lhs_python_scalar=False, + supports_rhs_python_scalar=True, +) +def fmod(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType: + return prims.fmod(a, b) + + +@register_decomposition(aten.frexp) +@out_wrapper("mantissa", "exponent") +def frexp(self: TensorLikeType) -> Tuple[TensorLikeType, TensorLikeType]: + return torch.return_types.frexp(prims.frexp(self)) + + +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, + supports_lhs_python_scalar=False, + supports_rhs_python_scalar=False, +) +def gcd(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType: + return prims.gcd(a, b) + + +@_make_elementwise_binary_reference( + 
type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL, + supports_lhs_python_scalar=False, +) +def ge(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType: + return prims.ge(a, b) + + +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL, + supports_lhs_python_scalar=False, +) +def gt(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType: + return prims.gt(a, b) + + +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, + supports_lhs_python_scalar=False, + supports_rhs_python_scalar=False, +) +def heaviside(input: TensorLikeType, values: TensorLikeType) -> TensorLikeType: + input_eq_zero = torch.eq(input, 0) + input_lt_zero = torch.logical_or(torch.lt(input, 0), torch.isnan(input)) + zeros_and_ones = torch.where(input_lt_zero, 0, 1) + output = torch.where(input_eq_zero, values, zeros_and_ones) + return output + + +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, + supports_lhs_python_scalar=False, + supports_rhs_python_scalar=False, +) +def hypot(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType: + return prims.hypot(a, b) + + +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + supports_lhs_python_scalar=False, + supports_rhs_python_scalar=False, +) +def igamma(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType: + return prims.igamma(a, b) + + +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + supports_lhs_python_scalar=False, + supports_rhs_python_scalar=False, +) +def igammac(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType: + return prims.igammac(a, b) + + +def _check_close_args( + name: str, + a: TensorLikeType, + b: TensorLikeType, + rtol: float, + atol: float, +) -> None: + torch._check_value( + a.dtype == b.dtype, + lambda: f"{name}: Attempting to compare tensors of different dtypes {a.dtype} and {b.dtype}!", + ) + torch._check( + rtol >= 0, + lambda: f"{name}: rtol must be greater than or equal to zero, but got {rtol}!", + ) + torch._check( + atol >= 0, + lambda: f"{name}: atol must be greater than or equal to zero, but got {atol}!", + ) + + +# CompositeImplicitAutograd - don't register decomp +def isclose( + a: TensorLikeType, + b: TensorLikeType, + rtol: float = 1e-05, + atol: float = 1e-08, + equal_nan: bool = False, +) -> TensorLikeType: + _check_close_args(name="torch.isclose", a=a, b=b, rtol=rtol, atol=atol) + + close = eq(a, b) + if equal_nan and (utils.is_float_dtype(a.dtype) or utils.is_complex_dtype(a.dtype)): + close = logical_or(close, logical_and(isnan(a), isnan(b))) + + # Note: In case of zero tolerances the closeness inequality degenerates to an equality check. + # In this case, the short-circuit prevents false positives as detailed in the paragraph below. + if atol == 0 and rtol == 0: + return close + + # Note [closeness error computation] + # atol and rtol are provided as doubles, so the computation + # rtol * other will produce a float or complex tensor. + # When the difference (self - other) is compared to it then the + # tensor representing the difference will also be cast to float or complex. + # However, since (self - other) in uint8 is very likely to produce a + # negative value, this moves the cast forward so the difference is + # always computed in a float or complex type. 
+ # If the values of the integer tensors cannot be exactly represented + # by the default scalar type then this may cause an incorrect result. + if not utils.is_float_dtype(a.dtype) and not utils.is_complex_dtype(a.dtype): + a = prims.convert_element_type(a, torch.get_default_dtype()) + b = prims.convert_element_type(b, torch.get_default_dtype()) + + allowed_error = add(atol, abs(mul(b, rtol))) + actual_error = abs(sub(a, b)) + + # Computes finite closeness + result = logical_or( + close, logical_and(isfinite(actual_error), le(actual_error, allowed_error)) + ) + + return result + + +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, + supports_lhs_python_scalar=False, + supports_rhs_python_scalar=False, +) +def lcm(a: TensorLikeType, b: TensorLikeType): + dtype = a.dtype + # promoting to int32 to maintain 100% consistency with C++ and to + # prevent overflow in case of int8 and int16 + promote_to_int = dtype in (torch.int8, torch.int16) + if promote_to_int: + a = prims.convert_element_type(a, torch.int32) + b = prims.convert_element_type(b, torch.int32) + + g = torch.gcd(a, b) + # Avoid division by zero in case gcd(0, 0) == 0 + g = torch.where(g == 0, 1, g) + res = torch.abs(prims.div(a, g) * b) + return res if not promote_to_int else prims.convert_element_type(res, dtype) + + +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL, + supports_lhs_python_scalar=False, +) +def le(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType: + return prims.le(a, b) + + +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, + supports_lhs_python_scalar=False, + supports_rhs_python_scalar=False, +) +def logaddexp(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType: + # Nb. this implementation does not distribute the gradients evenly when a == b + mask = torch.real(a) >= torch.real(b) + max_ = torch.where(mask, a, b) + min_ = torch.where(mask, b, a) + inf_mask = torch.logical_and( + torch.logical_not(torch.isfinite(torch.real(a))), torch.real(a) == torch.real(b) + ) + if utils.is_complex_dtype(a.dtype) or utils.is_complex_dtype(b.dtype): + # are you wondering what this bunch of codes are for? edge cases! + neg_min_mask = torch.real(min_) < 0 + inf_vals = torch.where( + neg_min_mask, min_, torch.log(torch.exp(min_) + torch.exp(max_)) + ) + non_nan_vals = torch.where( + inf_mask, inf_vals, max_ + torch.log1p(torch.exp(min_ - max_)) + ) + # the type for full_like does not include tensor yet + nan_mask = torch.isnan(min_) + return torch.where(nan_mask, complex(float("nan"), float("nan")), non_nan_vals) # type: ignore[call-overload] + else: + return torch.where(inf_mask, a, max_ + torch.log1p(torch.exp(min_ - max_))) + + +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, + supports_lhs_python_scalar=False, + supports_rhs_python_scalar=False, +) +def logaddexp2(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType: + torch._check( + not (utils.is_complex_dtype(a.dtype) or utils.is_complex_dtype(b.dtype)), + lambda: "logaddexp2 doesn't support complex dtypes", + ) + # Nb. 
this implementation does not distribute the gradients evenly when a == b + mask = a >= b + max_ = torch.where(mask, a, b) + min_ = torch.where(mask, b, a) + inf_mask = torch.logical_and(torch.isinf(a), a == b) + inv_log_2 = 1.0 / math.log(2) + result = max_ + torch.log1p(torch.exp2(min_ - max_)) * inv_log_2 + return torch.where(inf_mask, a, result) + + +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL, +) +def logical_and(a: TensorLikeType, b: TensorLikeType): + if not utils.is_boolean_dtype(a.dtype): + a = a != 0 + if not utils.is_boolean_dtype(b.dtype): + b = b != 0 + return a & b + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL) +def logical_not(a: TensorLikeType): + if not utils.is_boolean_dtype(a.dtype): + return a == 0 + return ~a + + +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL, +) +def logical_or(a: TensorLikeType, b: TensorLikeType): + if not utils.is_boolean_dtype(a.dtype): + a = a != 0 + if not utils.is_boolean_dtype(b.dtype): + b = b != 0 + return bitwise_or(a, b) + + +# TODO: skip unnecessary conversion of long to float +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL, +) +def logical_xor(a: TensorLikeType, b: TensorLikeType): + if not utils.is_boolean_dtype(a.dtype): + a = a != 0 + if not utils.is_boolean_dtype(b.dtype): + b = b != 0 + return a ^ b + + +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL, + supports_lhs_python_scalar=False, +) +def lt(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType: + return prims.lt(a, b) + + +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def maximum(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType: + return prims.maximum(a, b) + + +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def minimum(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType: + return prims.minimum(a, b) + + +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, + supports_two_python_scalars=True, +) +def mul(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType: + return prims.mul(a, b) + + +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.ALWAYS_BOOL, + supports_lhs_python_scalar=False, +) +def ne(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType: + return prims.ne(a, b) + + +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.NO_OPMATH, + supports_lhs_python_scalar=False, + supports_rhs_python_scalar=False, +) +def nextafter(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType: + return prims.nextafter(a, b) + + +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def remainder(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType: + return prims.remainder(a, b) + + +# reverse sub +@register_decomposition(aten.rsub) +@out_wrapper() +def rsub( + a: Union[TensorLikeType, NumberType], + b: Union[TensorLikeType, NumberType], + alpha: NumberType = 1, +): + if isinstance(a, Number): + msg = "Received a Number for the first argument, but expected a Tensor" + raise ValueError(msg) + + return torch.sub(b, a, alpha=alpha) + + +# TODO: consider refactoring this with add impl +# sub has its own implementation 
because it has an alpha argument +@register_decomposition(aten.sub) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("a", "b"), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def sub( + a: Union[TensorLikeType, NumberType], + b: Union[TensorLikeType, NumberType], + *, + alpha: NumberType = 1, +): + """ + Reference implementation of torch.sub + """ + + a, b = _maybe_broadcast(a, b) + + if alpha != 1: + dtype = a.dtype if isinstance(a, TensorLike) else b.dtype # type: ignore[union-attr] + python_type = utils.dtype_to_type(dtype) + if not utils.is_weakly_lesser_type(type(alpha), python_type): + msg = f"alpha argument of type {type(alpha)} cannot be safely cast to type {python_type}!" + raise ValueError(msg) + if isinstance(b, torch.Tensor): + b = prims.mul(b, alpha) + else: + # Carefully not to use prims.mul if b is a scalar / symint. + # prims.mul always returns a tensor, + # which will mess with type promotion. + b = b * alpha + + output = prims.sub(a, b) + return handle_noncontiguous_outputs([a, b], output) + + +@_make_elementwise_binary_reference( + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, + name="true_divide", + aten_op=None, # CompositeImplicitAutograd + supports_two_python_scalars=True, +) +def true_divide(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType: + return prims.div(a, b) + + +@register_decomposition(aten.xlogy) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("a", "b"), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) +def xlogy(a: Union[TensorLikeType, NumberType], b: Union[TensorLikeType, NumberType]): + torch._check( + isinstance(a, TensorLike) or isinstance(b, TensorLike), + lambda: 'Expected either argument a or b to be a Tensor"', + ) + + # Operations like eq and log do not handle scalar values, so we convert them to scalar_tensors. 
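+    # Illustrative: with this convention xlogy(0, b) evaluates to 0 even when b == 0 (rather than
+    # 0 * log(0) == nan), while a NaN in b still propagates because the final where() checks isnan(b).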
+ if isinstance(b, TensorLike) and isinstance(a, Number): + a = scalar_tensor(a, dtype=b.dtype, device=b.device) + elif isinstance(a, TensorLike) and isinstance(b, Number): + b = scalar_tensor(b, dtype=a.dtype, device=a.device) + + # mypy: expected "Tensor" + assert isinstance(a, TensorLike) + assert isinstance(b, TensorLike) + rhs = torch.where(torch.eq(a, 0), 0, torch.mul(a, torch.log(b))) + return torch.where(torch.isnan(b), float("nan"), rhs) + + +@_make_elementwise_binary_reference( + type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, + aten_op=None, # CompositeImplicitAutograd + supports_two_python_scalars=True, +) +def trunc_divide( + a: Union[TensorLikeType, NumberType], b: Union[TensorLikeType, NumberType] +): + dtype = utils.get_dtype(a) + if utils.is_integer_dtype(dtype): + return prims.div(a, b) + + return trunc(prims.div(a, b)) + + +# +# Elementwise Ternary References +# + + +@register_decomposition(aten.addcdiv) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("self", "tensor1", "tensor2"), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) +def addcdiv( + self: TensorLikeType, + tensor1: TensorLikeType, + tensor2: TensorLikeType, + *, + value: NumberType = 1, +) -> TensorLikeType: + """ + Reference implementation of torch.addcdiv + """ + if value is not None: + dtype = self.dtype # no scalars allowed, see add + python_type = utils.dtype_to_type(dtype) + torch._check_value( + utils.is_weakly_lesser_type(type(value), python_type), + lambda: f"value argument of type {type(value)} cannot be safely cast to type {python_type}!", + ) + + return self + value * tensor1 / tensor2 + + +@register_decomposition(aten.addcmul) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("self", "tensor1", "tensor2"), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def addcmul( + self: TensorLikeType, + tensor1: TensorLikeType, + tensor2: TensorLikeType, + *, + value: NumberType = 1, +) -> TensorLikeType: + """ + Reference implementation of torch.addcmul + """ + if value is not None: + dtype = self.dtype # no scalars allowed, see add + python_type = utils.dtype_to_type(dtype) + torch._check_value( + utils.is_weakly_lesser_type(type(value), python_type), + lambda: f"value argument of type {type(value)} cannot be safely cast to type {python_type}!", + ) + + return self + value * tensor1 * tensor2 + + +@register_decomposition(aten.clamp) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("a", "min", "max"), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def clamp( + a: TensorLikeType, + min: Optional[TensorOrNumberLikeType] = None, + max: Optional[TensorOrNumberLikeType] = None, +) -> TensorLikeType: + # NOTE: grad behavior with implementation `where` is not consistent on `nan` + if min is None and max is None: + msg = "clamp called but both min and max are none!" + raise ValueError(msg) + if min is not None: + a_isnan = torch.isnan(a) + condition = torch.bitwise_or(torch.ge(a, min), a_isnan) # type: ignore[arg-type] + # we should also propagate `nan` coming from boundaries. However, that's + # not necessary since `ge` would already `False` when either operands has + # a `nan`. 
So this line below is redundant + # `condition = bitwise_and(condition, bitwise_not(isnan(min)))` + a = torch.where(condition, a, min) # type: ignore[arg-type] + if max is not None: + a_isnan = torch.isnan(a) + # same as above, no need to adjust `nan` from `max` + condition = torch.bitwise_or(torch.le(a, max), a_isnan) # type: ignore[arg-type] + a = torch.where(condition, a, max) # type: ignore[arg-type] + + return a + + +@register_decomposition(aten.clamp_min) +@out_wrapper() +def clamp_min( + self: TensorLikeType, + min: Optional[TensorOrNumberLikeType] = None, +) -> TensorLikeType: + return torch.clamp(self, min=min) # type: ignore[arg-type] + + +@register_decomposition(aten.clamp_max) +@out_wrapper() +def clamp_max( + self: TensorLikeType, + max: Optional[TensorOrNumberLikeType] = None, +) -> TensorLikeType: + return torch.clamp(self, max=max) # type: ignore[arg-type] + + +# +# Conditional references +# + + +# https://pytorch.org/docs/stable/generated/torch.where.html +# TODO: implement alternate where +@register_decomposition(aten.where) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("a", "b"), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.NO_OPMATH, +) +def where( + pred: Tensor, + a: Optional[TensorOrNumberLikeType] = None, + b: Optional[TensorOrNumberLikeType] = None, +): + """ """ + + if a is None or b is None: + raise NotImplementedError + + utils.check_same_device(pred, a, b, allow_cpu_scalar_tensors=True) + torch._check( + pred.dtype is torch.bool, + lambda: f"expected predicate to be bool, got {pred.dtype}", + ) + + pred, a, b = _maybe_broadcast(pred, a, b) + return prims.where(pred, a, b) + + +# +# Data Movement References +# +@register_decomposition(aten.clone) +@out_wrapper() +def clone( + a: TensorLikeType, *, memory_format: torch.memory_format = torch.preserve_format +) -> TensorLikeType: + result = prims.clone(a, memory_format=memory_format) + return result + + +def copy_to(a: Tensor, b: Tensor, *, allow_cross_device=True): + if not allow_cross_device and a.device != b.device: + msg = "Attempting to copy from device {} to device {}, but cross-device copies are not allowed!".format( + b.device, a.device + ) + raise RuntimeError(msg) + + return prims.copy_to(a, b) + + +@register_decomposition(aten.item) +def item(a: TensorLikeType) -> NumberType: + if a.numel() != 1: + msg = f"Can't convert a tensor with {a.numel()} elements to a number!" + raise ValueError(msg) + + # NOTE: explicit conversion is necessary for bool! + # See https://github.com/pytorch/pytorch/issues/78071 + number_type = utils.dtype_to_type(a.dtype) + return number_type(prims.item(a)) + + +# fast path when `to` returns an alias to input. 
This mimics the same function in aten +def _to_will_alias( + a: TensorLikeType, + device: Optional[DeviceLikeType] = None, + dtype: Optional[torch.dtype] = None, + copy: Optional[bool] = None, + layout: Optional[torch.layout] = None, + memory_format: Optional[torch.memory_format] = None, + pin_memory: Optional[bool] = False, + non_blocking: bool = False, # not using non_blocking +) -> bool: + return ( + not copy + and (device is None or a.device == device) + and (dtype is None or a.dtype == dtype) + and (layout is None or a.layout == layout) + # is_pinned issue #84925 + # and (pin_memory is None or pin_memory == a.is_pinned()) + and ( + memory_format is None + or memory_format == torch.preserve_format + or utils.is_contiguous_for_memory_format(a, memory_format=memory_format) + ) + ) + + +@singledispatch +def _to_dispatch(*args, **kwargs): + raise NotImplementedError + + +@_to_dispatch.register +def _to_device( + device: torch.device, + dtype: torch.dtype, + non_blocking: bool = False, + copy: bool = False, + memory_format: Optional[torch.memory_format] = None, +) -> Dict[str, Any]: + kwargs = { + "device": device, + "dtype": dtype, + "non_blocking": non_blocking, + "copy": copy, + "memory_format": memory_format, + } + return kwargs + + +@_to_dispatch.register +def _to_device_str( + device: str, + dtype: torch.dtype, + non_blocking: bool = False, + copy: bool = False, + memory_format: Optional[torch.memory_format] = None, +) -> Dict[str, Any]: + kwargs = { + "device": torch.device(device), + "dtype": dtype, + "non_blocking": non_blocking, + "copy": copy, + "memory_format": memory_format, + } + return kwargs + + +@_to_dispatch.register +def _to_dtype( + dtype: torch.dtype, + non_blocking: bool = False, + copy: bool = False, + memory_format: Optional[torch.memory_format] = None, +) -> Dict[str, Any]: + kwargs = { + "dtype": dtype, + "non_blocking": non_blocking, + "copy": copy, + "memory_format": memory_format, + } + return kwargs + + +@_to_dispatch.register +def _to_other( + other: Tensor, + non_blocking: bool = False, + copy: bool = False, + memory_format: Optional[torch.memory_format] = None, +) -> Dict[str, Any]: + device = other.device + dtype = other.dtype + layout = other.layout + # is_pinned issue #84925 + # pin_memory = other.is_pinned() + kwargs = { + "device": device, + "dtype": dtype, + "layout": layout, + "non_blocking": non_blocking, + "copy": copy, + "memory_format": memory_format, + } + return kwargs + + +# remove to_kwargs that is already present in `a` +def _canonicalize_to_arguments(a: Tensor, to_kwargs: dict): + options_to_check = ["dtype", "device", "layout", "memory_format"] + # "device" option could be passed a str instead torch.device + if "device" in to_kwargs and isinstance(to_kwargs["device"], str): + to_kwargs["device"] = torch.device(to_kwargs["device"]) + + for kw in options_to_check: + if kw in to_kwargs: + if ( + (kw == "memory_format" and to_kwargs[kw] is torch.preserve_format) + or ( + kw == "device" + and to_kwargs[kw].type == a.device.type + and ( + not to_kwargs[kw].index or to_kwargs[kw].index == a.device.index + ) + ) + or ( + getattr(a, kw, None) == to_kwargs[kw] + ) # this also handles {"memory_format": None} + ): + to_kwargs.pop(kw) + + +def to(a: TensorLikeType, *args, **kwargs) -> TensorLikeType: + # handled dispatch via positional arguments + if len(args) != 0: + kwargs = _to_dispatch(*args, **kwargs) + + # TODO: is_pinned is not currently supported in refs or fake_tensor + # https://github.com/pytorch/pytorch/issues/84925 + assert "pin_memory" not 
in kwargs + _canonicalize_to_arguments(a, kwargs) + + if _to_will_alias(a, **kwargs): + return a + + copy = kwargs.pop("copy") if "copy" in kwargs else False + non_blocking = kwargs.pop("non_blocking") if "non_blocking" in kwargs else False + + # short-circuit to `prims.convert_element_type` when `to` is just a dtype change + if ( + (copy or (kwargs.get("dtype", a.dtype) != a.dtype)) + and (not non_blocking) + and ("memory_format" not in kwargs) + and ("device" not in kwargs) + and ("layout" not in kwargs) + # is_pinned issue #84925 + # and ("pin_memory" not in kwargs) + ): + return prims.convert_element_type(a, kwargs.get("dtype", a.dtype)) + + result = torch.empty_like(a, **kwargs) + # TODO: non_blocking should be handled by `copy_to` + copy_to(result, a) + return result + + +# +# Reduction references +# + + +def _reduction( + a: TensorLikeType, + prim: Callable, + *, + has_identity: bool = True, + accepts_dim_tuple: bool = True, # to handle min/argmin that accept single dim only + dims: Optional[DimsType] = None, + keepdims: bool = False, + dtype: Optional[torch.dtype] = None, # should be specified for ops that support it + out: Optional[Tensor] = None, + output_dtype_kind: REDUCTION_OUTPUT_TYPE_KIND, +) -> TensorLikeType: # it is usually SAME, but I want + # ref writers to actually think about what to put here + assert isinstance(a, TensorLike) + if a.ndim > 64: + raise RuntimeError( + f"Received a tensor with {a.ndim} dimensions, but only tensors with up to 64 dims are supported!" + ) + + if out is not None: + assert isinstance(out, TensorLike) + if dtype is not None: + # TODO - this is true for eager mode currently, but it's wrong behavior for complex norms + if dtype != out.dtype: + raise RuntimeError( + "dtype argument and out dtype must match in reduction" + ) + if not accepts_dim_tuple: + assert dims is None or isinstance(dims, Dim) + if isinstance(dims, Dim): + dims = (dims,) # type: ignore[assignment] + dims = utils.reduction_dims(a.shape, dims) + if not has_identity: + valid_shape = a.ndim == 0 or py_all(a.shape[i] for i in dims) + if not valid_shape: + raise RuntimeError( + "reducing over zero-size dimension for reduction operation without identity" + ) + computation_dtype, result_dtype = utils.reduction_dtypes( + a, output_dtype_kind, dtype + ) + a = _maybe_convert_to_dtype(a, computation_dtype) # type: ignore[method-assign] + result = prim(a, dims) + if keepdims: + output_shape = [a.shape[i] if i not in dims else 1 for i in range(a.ndim)] + broadcast_dims = [i for i in range(a.ndim) if i not in dims] + result = prims.broadcast_in_dim(result, output_shape, broadcast_dims) + + if out is not None: + assert result_dtype is not None + if dtype is not None and result_dtype != out.dtype: + raise RuntimeError( + "Expected the dtype of reduction result and out to match" + ) + out = _maybe_resize_out(out, result.shape) + return _safe_copy_out(copy_from=result, copy_to=out) # type: ignore[arg-type] + + if result.dtype != result_dtype and result_dtype is not None: + result = prims.convert_element_type(result, result_dtype) + + return result + + +def _make_copy_from_view(fn): + """ + Given a view function (e.g. torch.diagonal) generates its copy variant (e.g. 
torch.diagonal_copy) + """ + name = fn.__name__ + fn = out_wrapper()(fn) + + def _fn(*args, out=None, **kwargs): + result = fn(*args, out=out, **kwargs) + if out is None: + return result.clone(memory_format=torch.contiguous_format) + return result + + copy_name = f"{name}_copy" + _fn.__name__ = copy_name + _fn = register_decomposition(getattr(aten, copy_name))(_fn) + return _fn + + +# Saves Python all +py_all = all + + +@register_decomposition(aten.all) +@out_wrapper() +def all( + a: TensorLikeType, + dim: Optional[DimsType] = None, + keepdim: bool = False, +) -> TensorLikeType: + result = torch.logical_not(torch.any(torch.logical_not(a), dim, keepdim=keepdim)) + + if a.dtype == torch.uint8: + result = result.to(dtype=torch.uint8) + + return result + + +# Saves Python any +py_any = any + + +@register_decomposition(aten.any) +@out_wrapper() +def any( + a: TensorLikeType, + dim: Optional[DimsType] = None, + keepdim: bool = False, +) -> TensorLikeType: + a_ = _maybe_convert_to_dtype(a, torch.bool) + if isinstance(dim, (list, tuple)) and len(dim) == 0: + result = a_.clone() + else: + result = a_.sum(dim=dim, keepdim=keepdim).ne(False) + + # Preserves uint8 -- probably a legacy mask thing + if a.dtype is torch.uint8: + return prims.convert_element_type(result, torch.uint8) + + return result + + +@register_decomposition([aten.sum.dim_IntList, aten.sum.IntList_out]) +def sum( + a: TensorLikeType, + dim: Union[Optional[int], Optional[List[int]]] = None, + keepdim: bool = False, + *, + dtype: Optional[torch.dtype] = None, + out: Optional[Tensor] = None, +) -> TensorLikeType: + if dtype is None: + if out is not None: + dtype = out.dtype + elif utils.is_boolean_dtype(a.dtype) or utils.is_integer_dtype(a.dtype): + dtype = torch.int64 + else: + dtype = a.dtype + # reduces over all dimensions if dim=() is passed + if dim == () or dim == []: + dim = None + return _reduction( + a, + prims.sum, + dims=dim, + keepdims=keepdim, + dtype=dtype, + out=out, + output_dtype_kind=REDUCTION_OUTPUT_TYPE_KIND.SAME, + ) + + +def sum_to_size( + a: Tensor, + *shape, +) -> Tensor: + shape = utils.extract_shape_from_varargs(shape, validate=False) + torch._check( + utils.is_expandable_to(shape, a.shape), + lambda: f'sum_to_size: size "{shape}" is not expandable to size "{a.shape}"', + ) + # In ATen scalar tensors are sent through sum and the result is returned as + # type promoted + if utils.is_same_shape(shape, a.shape) and len(shape) > 0: + return prims.view_of(a) + leading_dims = a.ndim - len(shape) + reduce_dims = tuple(range(leading_dims)) + tuple( + i + for i in range(leading_dims, len(shape)) + if shape[i - leading_dims] == 1 and a.shape[i] != 1 + ) + return torch.sum(a, dim=reduce_dims, keepdim=True, dtype=None) + + +@register_decomposition(aten.prod) +def prod( + a: TensorLikeType, + dim: Union[Optional[int], Optional[List[int]]] = None, + keepdim: bool = False, + *, + dtype=None, + out: Optional[Tensor] = None, +) -> TensorLikeType: + if dtype is None: + if out is not None: + dtype = out.dtype + elif utils.is_boolean_dtype(a.dtype) or utils.is_integer_dtype(a.dtype): + dtype = torch.int64 + else: + dtype = a.dtype + # reduces over all dimensions if dim=() is passed + if dim == () or dim == []: + dim = None + return _reduction( + a, + prims.prod, + dims=dim, + keepdims=keepdim, + dtype=dtype, + out=out, + output_dtype_kind=REDUCTION_OUTPUT_TYPE_KIND.SAME, + ) + + +@register_decomposition(aten.amin) +def amin( + a: TensorLikeType, + dim: Optional[DimsType] = None, + keepdim: bool = False, + *, + out: 
Optional[Tensor] = None, +) -> TensorLikeType: + # reduces over all dimensions if dim=() is passed + if dim == () or dim == []: + dim = None + + return _reduction( + a, + prims.amin, + dims=dim, + keepdims=keepdim, + dtype=None, + out=out, + has_identity=False, + output_dtype_kind=REDUCTION_OUTPUT_TYPE_KIND.SAME, + ) + + +@register_decomposition(aten.amax) +def amax( + a: TensorLikeType, + dim: Optional[DimsType] = None, + keepdim: bool = False, + *, + out: Optional[Tensor] = None, +) -> TensorLikeType: + # reduces over all dimensions if dim=() is passed + if dim == () or dim == []: + dim = None + + return _reduction( + a, + prims.amax, + dims=dim, + keepdims=keepdim, + dtype=None, + out=out, + has_identity=False, + output_dtype_kind=REDUCTION_OUTPUT_TYPE_KIND.SAME, + ) + + +def _dim_var_dispatch(dim=None, unbiased=None): + # There's the following overload of torch.var: + # var(Tensor self, bool unbiased=True) -> (Tensor, Tensor) + # We need to explicitly convert bool dims to unbiased arg + if unbiased is None and isinstance(dim, bool): + unbiased = dim + dim = None + return dim, unbiased + + +@register_decomposition(aten.var) +@out_wrapper() +def var( + a: TensorLikeType, + dim: Optional[DimsType] = None, + unbiased: Optional[bool] = None, + keepdim: bool = False, + *, + correction: Optional[NumberType] = None, +) -> TensorLikeType: + dim, unbiased = _dim_var_dispatch(dim, unbiased) + correction = utils.set_correction(unbiased, correction) + # reduces over all dimensions if dim=() is passed + if dim == () or dim == []: + dim = None + + result = _reduction( + a, + partial(prims.var, correction=correction), + dims=dim, + keepdims=keepdim, + dtype=None, + out=None, + has_identity=True, + output_dtype_kind=REDUCTION_OUTPUT_TYPE_KIND.COMPLEX_TO_FLOAT, + ) + return result + + +@register_decomposition(aten.std) +@out_wrapper() +def std( + a: TensorLikeType, + dim: Union[Optional[int], Optional[List[int]]] = None, + unbiased: Optional[bool] = None, + keepdim: bool = False, + *, + correction: Optional[NumberType] = None, +) -> TensorLikeType: + dim, unbiased = _dim_var_dispatch(dim, unbiased) + correction = utils.set_correction(unbiased, correction) + + opmath_dtype, dtype = utils.reduction_dtypes( + a, REDUCTION_OUTPUT_TYPE_KIND.COMPLEX_TO_FLOAT + ) + a = _maybe_convert_to_dtype(a, opmath_dtype) + a_var = torch.var(a, dim, correction=correction, keepdim=keepdim) + a_std = torch.sqrt(a_var) + assert dtype is not None + return _maybe_convert_to_dtype(a_std, dtype) + + +@register_decomposition(aten.mean) +def mean( + a: TensorLikeType, + dim: Optional[DimsType] = None, + keepdim: bool = False, + *, + dtype=None, + out=None, +) -> TensorLikeType: + # reduces over all dimensions if dim=() is passed + if dim == () or dim == []: + dim = None + orig_dtype = dtype + if dtype is None: + dtype = a.dtype + # can't use out wrapper because of this argument + torch._check( + out is None or out.dtype == dtype, + lambda: f"Expected out tensor to have dtype {dtype}, but got {out.dtype} instead", + ) + result = _reduction( + a, + prims.sum, + dims=dim, + keepdims=keepdim, + dtype=dtype, + out=None, + output_dtype_kind=REDUCTION_OUTPUT_TYPE_KIND.KEEP_PROMOTED_TYPE, + ) + torch._check( + utils.is_float_dtype(dtype) or utils.is_complex_dtype(dtype), + lambda: ( + f"mean(): could not infer output dtype. " + f"{'Input' if orig_dtype is None else 'Optional'} dtype must be either " + f"a floating point or complex dtype. 
Got: {dtype}" + ), + ) + if isinstance(dim, Dim): + dim = (dim,) # type: ignore[assignment] + dims = utils.reduction_dims(a.shape, dim) # type: ignore[arg-type] + nelem = 1 if a.ndim == 0 else reduce(operator.mul, (a.shape[i] for i in dims), 1) + result = true_divide(result, nelem) + result_dtype = a.dtype if dtype is None else dtype + result = _maybe_convert_to_dtype(result, result_dtype) # type: ignore[method-assign] + if out is not None: + assert isinstance(out, TensorLike) + out = _maybe_resize_out(out, result.shape) + return _safe_copy_out(copy_from=result, copy_to=out) # type: ignore[arg-type] + return result + + +@register_decomposition(aten.std_mean) +@out_wrapper("out0", "out1") +def std_mean( + a: TensorLikeType, + dim: Optional[DimsType] = None, + *, + unbiased: Optional[bool] = None, + keepdim: bool = False, + correction: Optional[NumberType] = None, +): + dim, unbiased = _dim_var_dispatch(dim, unbiased) + correction = utils.set_correction(unbiased, correction) + opmath_dtype, dtype = utils.reduction_dtypes( + a, REDUCTION_OUTPUT_TYPE_KIND.COMPLEX_TO_FLOAT + ) + original_dtype = a.dtype + a = _maybe_convert_to_dtype(a, opmath_dtype) + a_var, a_mean = torch.var_mean(a, dim, correction=correction, keepdim=keepdim) + a_std = torch.sqrt(a_var) + assert dtype is not None + return ( + _maybe_convert_to_dtype(a_std, dtype), + _maybe_convert_to_dtype(a_mean, original_dtype), + ) + + +@register_decomposition(aten.var_mean) +@out_wrapper("out0", "out1") +def var_mean( + a: TensorLikeType, + dim: Optional[DimsType] = None, + unbiased: Optional[bool] = None, + keepdim: bool = False, + *, + correction: Optional[NumberType] = None, +): + dim, unbiased = _dim_var_dispatch(dim, unbiased) + v = var(a, dim, unbiased, keepdim, correction=correction) + m = mean(a, dim, keepdim) + return v, m + + +@register_decomposition(aten.addr) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("self", "vec1", "vec2"), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def addr( + self: TensorLikeType, + vec1: TensorLikeType, + vec2: TensorLikeType, + *, + beta: NumberType = 1, + alpha: NumberType = 1, +) -> TensorLikeType: + torch._check( + vec1.ndim == 1, + lambda: f"addr: Expected 1-D argument vec1, but got {vec1.ndim}-D", + ) + torch._check( + vec2.ndim == 1, + lambda: f"addr: Expected 1-D argument vec2, but got {vec2.ndim}-D", + ) + self = self.expand(vec1.shape[0], vec2.shape[0]) + if utils.is_boolean_dtype(self.dtype): + # Integers are accepted for booleans + torch._check( + is_weakly_lesser_type(type(beta), int), + lambda: f"expected bool/int beta but got {type(beta)}", + ) + torch._check( + is_weakly_lesser_type(type(alpha), int), + lambda: f"expected bool/int alpha but got {type(beta)}", + ) + if not beta: + return torch.outer(vec1, vec2) if alpha else torch.full_like(self, False) + else: + return torch.logical_or( + self, + torch.outer(vec1, vec2) if alpha else torch.full_like(self, False), + ) + else: + torch._check( + is_weakly_lesser_type(type(beta), dtype_to_type(self.dtype)), + lambda: f"cannot safely convert {type(beta)} to {self.dtype}", + ) + torch._check( + is_weakly_lesser_type(type(alpha), dtype_to_type(self.dtype)), + lambda: f"cannot safely convert {type(alpha)} to {self.dtype}", + ) + if beta == 0: + # This means NaNs from self are dropped if beta is zero + return alpha * torch.outer(vec1, vec2) + else: + return beta * self + alpha * torch.outer(vec1, vec2) + + +# CompositeImplicitAutograd - don't register decomp +def atleast_1d( + arg: 
Union[TensorLikeType, Sequence[TensorLikeType]], *args: TensorLikeType +) -> Union[TensorLikeType, Tuple[TensorLikeType, ...]]: + """Reference implementation of :func:`torch.atleast_1d`.""" + if not args and isinstance(arg, collections.abc.Sequence): + args_ = arg + else: + assert not isinstance(arg, collections.abc.Sequence) + args_ = (arg,) + args + res = tuple(a if a.ndim >= 1 else unsqueeze(a, 0) for a in args_) + return res if len(res) > 1 else res[0] + + +# Helper function with assert to avoid MyPy error +# of incompatible type passed to unsqueeze +def _unsqueeze_atleast( + at_least_fn: Callable, dim: int, arg: TensorLikeType +) -> TensorLikeType: + arg_ = at_least_fn(arg) + assert isinstance(arg_, TensorLike) + return unsqueeze(arg_, dim) + + +# CompositeImplicitAutograd - don't register decomp +def atleast_2d( + arg: Union[TensorLikeType, Sequence[TensorLikeType]], *args: TensorLikeType +) -> Union[TensorLikeType, Tuple[TensorLikeType, ...]]: + """Reference implementation of :func:`torch.atleast_2d`.""" + if not args and isinstance(arg, collections.abc.Sequence): + args_ = arg + else: + assert not isinstance(arg, collections.abc.Sequence) + args_ = (arg,) + args + unsqueeze_atleast_1d = partial(_unsqueeze_atleast, atleast_1d, 0) + res = tuple(a if a.ndim >= 2 else unsqueeze_atleast_1d(a) for a in args_) + return res if len(res) > 1 else res[0] + + +# CompositeImplicitAutograd - don't register decomp +def atleast_3d( + arg: Union[TensorLikeType, Sequence[TensorLikeType]], *args: TensorLikeType +) -> Union[TensorLikeType, Tuple[TensorLikeType, ...]]: + """Reference implementation of :func:`torch.atleast_3d`.""" + if not args and isinstance(arg, collections.abc.Sequence): + args_ = arg + else: + assert not isinstance(arg, collections.abc.Sequence) + args_ = (arg,) + args + unsqueeze_atleast_2d = partial(_unsqueeze_atleast, atleast_2d, -1) + res = tuple(a if a.ndim >= 3 else unsqueeze_atleast_2d(a) for a in args_) + return res if len(res) > 1 else res[0] + + +def as_strided( + a: TensorLikeType, + size: ShapeType, + stride: StrideType, + storage_offset: Optional[int] = None, +) -> TensorLikeType: + storage_offset_int = ( + storage_offset if storage_offset is not None else a.storage_offset() + ) + return prims.as_strided(a, size, stride, storage_offset_int) + + +@register_decomposition(aten.as_strided_scatter) +@out_wrapper() +def as_strided_scatter( + input: TensorLikeType, + src: TensorLikeType, + size: ShapeType, + stride: StrideType, + storage_offset: Optional[int] = None, +) -> TensorLikeType: + storage_offset_int = 0 if storage_offset is None else storage_offset + return prims.as_strided_scatter(input, src, size, stride, storage_offset_int) + + +def broadcast_shapes(*shapes) -> ShapeType: + return torch.Size(_broadcast_shapes(*shapes)) + + +@aten.broadcast_tensors.default.py_impl(DispatchKey.CompositeImplicitAutograd) +@aten.broadcast_tensors.default.py_impl(DispatchKey.Meta) +def broadcast_tensors(*tensors) -> List[TensorLikeType]: + if len(tensors) == 1 and not isinstance(tensors[0], Tensor): + tensors = tensors[0] + return list(_maybe_broadcast(*tensors, preserve_cpu_scalar_tensors=False)) + + +# CompositeImplicitAutograd - don't register decomp +def broadcast_to(a: TensorLikeType, size: ShapeType) -> TensorLikeType: + start = len(size) - len(a.shape) + dims = tuple(range(start, len(a.shape) + start)) + return prims.broadcast_in_dim(a, size, dims) + + +@register_decomposition(aten.cat) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("tensors",), + 
type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.NO_OPMATH, +) +def cat(tensors: TensorSequenceType, dim: int = 0) -> TensorLikeType: + def cat_compute_output_memory_format(inputs): + format = None + for t in inputs: + f = utils.suggest_memory_format(t) + if f == torch.contiguous_format: + return f + if format is not None and format != f: + return torch.contiguous_format + format = f + assert format is not None + return format + + if len(tensors) == 0: + msg = "cat expects at least one tensor, but received zero!" + raise ValueError(msg) + + for tensor in tensors: + assert isinstance(tensor, TensorLike) + + utils.check_same_device(*tensors, allow_cpu_scalar_tensors=False) + + from torch.fx.experimental.symbolic_shapes import guard_size_oblivious + + # This is a bit tricky. Naively, you would expect to just pick one + # arbitrary tensor and check that all tensors match this tensor. However, + # there is legacy behavior which says that if you have a 1-D empty tensor + # (0,), this is permissible. So you can't assume that all the tensors + # have same dimensionality, and you can't assume that the first tensor is + # the correct stencil. + # + # We'll implement this in a few passes. First, we will try to infer the + # ndim of the cat output. If this ndim != 1, then we know that all ndim = + # 1 inputs must be empty, or are errors. If this ndim == 1, then life + # is easy (the legacy special case coincides with regular handling). + # + # NB: The regular implementation of cat just filters out empty inputs, + # but we do it slightly different here for better handling for unbacked + # SymInts + + example = None + for i, t in enumerate(tensors): + if example is None: + if t.ndim != 1: + example = t + else: + if t.ndim != 1: + torch._check( + t.ndim == example.ndim, + lambda: "Number of dimensions of tensors must match. " + f"Expected {example.ndim}-D tensors, but got {t.ndim}-D for " + f"tensor number {i} in the list", + ) + + if example is None: + # example is None if everything is 1-D. If so, just arbitrarily pick + # the first one + example = tensors[0] + + shape = example.shape + filtered = [] + for tensor_idx, tensor in enumerate(tensors): + if len(shape) != len(tensor.shape): + assert tensor.ndim == 1 # we've already checked this above + # Don't suggest the legacy behavior in the error message + torch._check( + tensor.shape[0] == 0, + lambda: f"Number of dimensions of tensors must match. 
" + f"Expected {example.ndim}-D tensors, but got 1-D for " + f"tensor number {tensor_idx} in the list", + ) + else: + # Remove inputs that are 1-D, zero size + if tensor.ndim == 1 and guard_size_oblivious(tensor.shape[0] == 0): + continue + # Don't bother checking size match, prims.cat will handle it + filtered.append(tensor) + + memory_format = cat_compute_output_memory_format(tensors) + + if len(filtered) == 0: + t = tensors[0] + + # TODO: fix this to work with meta tensors + try: + requires_grad = any(x.requires_grad for x in tensors) + except Exception: + requires_grad = False + + return empty( + (0,), + dtype=t.dtype, + device=t.device, + requires_grad=requires_grad, + memory_format=memory_format, + ) + + dim = utils.canonicalize_dim(filtered[0].ndim, dim) + utils.validate_idx(filtered[0].ndim, dim) + + return prims.cat(filtered, dim).clone(memory_format=memory_format) + + +# CompositeImplicitAutograd - don't register decomp +@out_wrapper() +def column_stack(tensors: TensorSequenceType) -> TensorLikeType: + aligned_tensors = tuple( + x if x.ndim > 1 else x.reshape((x.numel(), 1)) for x in tensors + ) + return cat(aligned_tensors, 1) + + +def conj(input: TensorLikeType) -> TensorLikeType: + if not utils.is_complex_dtype(input.dtype): + return input + if input.is_sparse: + return torch.conj_physical(input) + return prims.conj(input) + + +# This replicates at::constant_pad_nd, defined in ATen/native/PadNd.cpp +@register_decomposition(aten.constant_pad_nd) +@out_wrapper() +def constant_pad_nd( + input: TensorLikeType, pad: List[int], value: NumberType = 0 +) -> TensorLikeType: + torch._check( + len(pad) % 2 == 0, + lambda: f"Length of pad must be even but instead it equals {len(pad)}", + ) + + input_sizes = input.shape + l_inp = len(input_sizes) + + l_pad = len(pad) // 2 + l_diff = l_inp - l_pad + + torch._check( + l_inp >= l_pad, + lambda: "Length of pad should be no more than twice the number of " + f"dimensions of the input. Pad length is {len(pad)} while the input has " + f"{l_inp} dimensions.", + ) + + c_input = input + for i in range(l_diff, l_inp): + pad_idx = 2 * (l_inp - i - 1) + if pad[pad_idx] < 0: + c_input = c_input.narrow(i, -pad[pad_idx], c_input.shape[i] + pad[pad_idx]) + + if pad[pad_idx + 1] < 0: + c_input = c_input.narrow(i, 0, c_input.shape[i] + pad[pad_idx + 1]) + + # if none of the pads are positive we can just return the result + if builtins.all(p <= 0 for p in pad): + return c_input.clone() + + new_shape = list(input_sizes[:l_diff]) + + for i in range(l_pad): + pad_idx = len(pad) - ((i + 1) * 2) + new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1] + torch._check( + new_dim > 0, + lambda: f"The input size {input_sizes[l_diff + i]}, plus negative padding " + f"{pad[pad_idx]} and {pad[pad_idx + 1]} resulted in a negative output size, " + f"which is invalid. 
Check dimension {l_diff + i} of your input.", + ) + new_shape.append(new_dim) + + memory_format = utils.suggest_memory_format(input) + output = torch.empty( + new_shape, + dtype=input.dtype, + device=input.device, + requires_grad=input.requires_grad, + memory_format=memory_format, + ) + + if value == 0 and input.dtype == torch.bool: + value = False + # torch.fill isn't typed to allow complex values + output = torch.fill(output, value) # type: ignore[arg-type] + + c_output = output + for i in range(l_diff, l_inp): + pad_idx = 2 * (l_inp - i - 1) + if pad[pad_idx] > 0: + c_output = c_output.narrow( + i, pad[pad_idx], c_output.shape[i] - pad[pad_idx] + ) + if pad[pad_idx + 1] > 0: + c_output = c_output.narrow(i, 0, c_output.shape[i] - pad[pad_idx + 1]) + + prims.copy_to(c_output, c_input) + return output + + +def contiguous( + a: Tensor, *, memory_format: torch.memory_format = torch.contiguous_format +) -> Tensor: + torch._check( + memory_format != torch.preserve_format, + lambda: "preserve memory format is unsupported by the contiguous operator", + ) + + if utils.is_contiguous_for_memory_format(a, memory_format=memory_format): + return a + + return torch.clone(a, memory_format=memory_format) + + +@out_wrapper() +def dstack(tensors: TensorSequenceType) -> TensorLikeType: + torch._check(len(tensors) > 0, lambda: "dstack expects a non-empty TensorList") + aligned_tensors = atleast_3d(*tensors) + return cat(aligned_tensors, 2) + + +@register_decomposition(aten.expand) +def expand(a: Tensor, *shape) -> Tensor: + from torch.fx.experimental.symbolic_shapes import guard_size_oblivious + + # NOTE: cannot use utils.extract_shape_from_varargs here + # because that also validates the shape, but the shape + # given to expand may be "invalid" + if len(shape) == 1 and isinstance(shape[0], Sequence): + shape = tuple(shape[0]) + + torch._check( + len(shape) >= len(a.shape), + lambda: "expand: the requested shape has too few dimensions!", + ) + + offset = len(shape) - len(a.shape) + shape_ = list(shape) + for idx, x in enumerate(a.shape): + offset_idx = idx + offset + requested_length = shape[offset_idx] + torch._check( + guard_size_oblivious(requested_length == x) + or guard_size_oblivious(x == 1) + or requested_length == -1, + lambda: f"expand: attempting to expand a dimension of length {x}!", + ) + + shape_[offset_idx] = requested_length if requested_length != -1 else x + + # At this point shape must be valid + utils.validate_shape(shape_) + + return prims.broadcast_in_dim( + a, shape_, tuple(range(offset, len(a.shape) + offset)) + ) + + +# CompositeImplicitAutograd - don't register decomp +def expand_as(a: Tensor, b: Tensor) -> Tensor: + return a.expand(b.shape) + + +def chunk(a: TensorLikeType, chunks: int, dim: int = 0) -> Tuple[TensorLikeType, ...]: + if chunks <= 0: + msg = f"Expected at least one chunk, but got {chunks}!" 
+ raise ValueError(msg) + + dim = utils.canonicalize_dim(a.ndim, dim) + length = a.shape[dim] + chunk_size = math.ceil(length / chunks) + full_chunks = math.floor(length / chunk_size) + tail_chunk_size = length % chunk_size + + result = [] + for i in range(full_chunks): + result.append(narrow(a, dim, i * chunk_size, chunk_size)) + + if tail_chunk_size != 0: + result.append(narrow(a, dim, full_chunks * chunk_size, tail_chunk_size)) + + return tuple(result) + + +# Note: flatten, unlike other shape operators, returns the input tensor on a no-op (unless +# a 0D tensor is flattened, in which case it's returned in 1D) +# CompositeImplicitAutograd - don't register decomp +def flatten(a: TensorLikeType, start_dim: int = 0, end_dim: int = -1) -> TensorLikeType: + start_dim = utils.canonicalize_dim(a.ndim, start_dim) + end_dim = utils.canonicalize_dim(a.ndim, end_dim) + + # Short-circuits on no-op + if start_dim == end_dim and a.ndim != 0: + return a + + # Tries to take a view + # TODO: we could look at directing collapse_view to skip its meta function here (unsafe_collapse_view) + new_shape, new_strides = prims._collapse_view_helper(a, start_dim, end_dim) + if new_shape is not None: + return prims.collapse_view(a, start_dim, end_dim) + + # Makes a copy if it can't make a view + return prims.collapse(a, start_dim, end_dim) + + +@register_decomposition(aten.flip) +@out_wrapper() +def flip(a: TensorLikeType, dims: DimsSequenceType) -> TensorLikeType: + if not isinstance(dims, tuple) and not isinstance(dims, list): + raise ValueError("dims has to be a sequence of ints") + dims = utils.canonicalize_dims(a.ndim, dims) # type: ignore[assignment] + utils.validate_no_repeating_dims(dims) + return prims.rev(a, dims) + + +# CompositeImplicitAutograd - don't register decomp +def fliplr(a: TensorLikeType) -> TensorLikeType: + if a.ndim < 2: + raise RuntimeError("Input must be >= 2-d.") + + return flip(a, (1,)) + + +# CompositeImplicitAutograd - don't register decomp +def flipud(a: TensorLikeType) -> TensorLikeType: + if a.ndim < 1: + raise RuntimeError("Input must be >= 1-d.") + + return flip(a, (0,)) + + +# CompositeImplicitAutograd - don't register decomp +def narrow( + a: TensorLikeType, dim: int, start: Union[int, TensorLikeType], length: int +) -> TensorLikeType: + # Supports Tensor overload that was added for XLA: + # https://github.com/pytorch/pytorch/issues/31558 + if isinstance(start, TensorLike): + torch._check( + start.dim() == 0 and utils.is_integer_dtype(start.dtype), + lambda: "start must be an 0-dim integral Tensor.", + ) + start = start.item() # type: ignore[assignment] + torch._check(a.dim() > 0, lambda: "narrow() cannot be applied to a 0-dim tensor.") + torch._check(length >= 0, lambda: "narrow(): length must be non-negative.") + dim = utils.canonicalize_dim(a.ndim, dim) + dim_length = a.size(dim) + torch._check_with( + IndexError, + -dim_length <= start and start <= dim_length, # type: ignore[arg-type] + lambda: f"start out of range (expected to be in range of [{-dim_length}, {dim_length}], but got {start})", + ) + if start < 0: + start = start + dim_length + torch._check( + start <= dim_length - length, # type: ignore[arg-type] + lambda: f"start ({start}) + length ({length}) exceeds dimension size ({dim_length}).", + ) + return prims.slice_in_dim(a, start, start + length, axis=dim) + + +# TODO: This must return a sparse tensor if the input is sparse, but refs have +# no sparse support. See narrow_copy_sparse in core. 
+narrow_copy = _make_copy_from_view(narrow) + + +def _normalize( + a: Tensor, norm_dims: DimsType, eps: float +) -> Tuple[Tensor, Tensor, Tensor]: + """Computes mean and 1/std of a tensor along norm_dims. + + Used as a helper function for normalization layers. + + Args: + a (Tensor): input tensor + norm_dims (DimsType): dimensions to normalize over + eps (float): epsilon for numerical stability + + Returns: + out (Tensor): normalized tensor. + mean (Tensor): mean of the tensor along norm_dims. + rstd (Tensor): 1/std of the tensor along norm_dims. + """ + norm_dims = utils.canonicalize_dims(a.ndim, norm_dims) + computation_dtype = utils.get_computation_dtype(a.dtype) + a_acc = _maybe_convert_to_dtype(a, computation_dtype) + assert isinstance(a_acc, TensorLike) # to avoid mypy error for var_mean + biased_var, mean = torch.var_mean( + a_acc, dim=norm_dims, unbiased=False, keepdim=True + ) + rstd = torch.rsqrt(biased_var + eps) + out = (a - mean) * rstd + return out, mean, rstd + + +# add all specified dimensions +def _unsqueeze_multiple(x: TensorLikeType, dimensions: List[int]) -> TensorLikeType: + for dim in sorted(dimensions): + x = torch.unsqueeze(x, dim) + return x + + +@register_decomposition(aten.native_group_norm.default) +def native_group_norm( + input: Tensor, + weight: Optional[Tensor], + bias: Optional[Tensor], + batch_size: int, + num_channels: int, + flattened_inner_size: int, + num_groups: int, + eps: float, +) -> Tuple[Tensor, Tensor, Tensor]: + torch._check( + input.ndim >= 2, + lambda: f"Expected at least 2 dimensions for input tensor but received {input.ndim}", + ) + torch._check( + num_channels % num_groups == 0, + lambda: "Expected number of channels in input to be divisible by num_groups, " + + f"but got input of shape {input.shape} and num_groups = {num_groups}", + ) + + # num_channels / num_groups and flattened inner dimension are the reduction axes + reduction_dims = [2, 3] + input_reshaped = torch.reshape( + input, + [batch_size, num_groups, num_channels // num_groups, flattened_inner_size], + ) + out, mean, rstd = _normalize(input_reshaped, reduction_dims, eps) + out = out.view(input.shape) + + broadcast_dims = [0] + list(range(2, input.ndim)) + unsqueeze_bias = None + if bias is not None: + unsqueeze_bias = _unsqueeze_multiple(bias, broadcast_dims) + unsqueeze_weight = None + if weight is not None: + unsqueeze_weight = _unsqueeze_multiple(weight, broadcast_dims) + + if unsqueeze_weight is not None: + out = out * unsqueeze_weight + if unsqueeze_bias is not None: + out = out + unsqueeze_bias + + out = _maybe_convert_to_dtype(out, input.dtype) # type: ignore[assignment] + mean = _maybe_convert_to_dtype(mean, input.dtype) # type: ignore[assignment] + rstd = _maybe_convert_to_dtype(rstd, input.dtype) # type: ignore[assignment] + + # remove broadcast dimensions from mean and rstd + mean = torch.squeeze(mean, reduction_dims) + rstd = torch.squeeze(rstd, reduction_dims) + return (out, mean, rstd) + + +@register_decomposition(aten.native_layer_norm) +@out_wrapper("out0", "out1", "out2") +def native_layer_norm( + input: Tensor, + normalized_shape: ShapeType, + weight: Optional[Tensor], + bias: Optional[Tensor], + eps: float, +) -> Tuple[Tensor, Tensor, Tensor]: + normalized_ndim = len(normalized_shape) + torch._check( + normalized_ndim >= 1, + lambda: "Expected normalized_shape to be at least 1-dimensional, i.e., " + + "containing at least one element, but got normalized_shape = " + + str(normalized_shape), + ) + # torch.Size([1, 2, 3]) == [1, 2, 3] evaluates to False + # 
while torch.Size([1, 2, 3]) == (1, 2, 3) is True + # therefore we use tuple(normalized_shape) + torch._check( + weight is None or weight.shape == tuple(normalized_shape), + lambda: "Expected weight to be of same shape as normalized_shape, but got " + + "weight of shape " + + str(weight.shape) # type: ignore[union-attr] + + " and normalized_shape = " + + str(normalized_shape), + ) + torch._check( + bias is None or bias.shape == tuple(normalized_shape), + lambda: "Expected bias to be of same shape as normalized_shape, but got " + + "bias of shape " + + str(bias.shape) # type: ignore[union-attr] + + " and normalized_shape = " + + str(normalized_shape), + ) + torch._check( + input.ndim >= normalized_ndim + and input.shape[(input.ndim - normalized_ndim) :] == tuple(normalized_shape), + lambda: "Given normalized_shape=" + + str(normalized_shape) + + ", expected input with shape " + + str(normalized_shape) + + ", but got input of size " + + str(input.shape), + ) + + input = input.contiguous() + if weight is not None: + weight = weight.contiguous() + if bias is not None: + bias = bias.contiguous() + + axis = input.ndim - normalized_ndim + reduction_dims = list(range(axis, input.ndim)) + out, mean, rstd = _normalize(input, reduction_dims, eps) + + if weight is None and bias is not None: + out = out + bias + elif weight is not None and bias is None: + out = out * weight + elif weight is not None and bias is not None: + out = out * weight + bias + + out = _maybe_convert_to_dtype(out, input.dtype) # type: ignore[assignment] + if input.device.type == "cpu": + mean = _maybe_convert_to_dtype(mean, input.dtype) # type: ignore[assignment] + rstd = _maybe_convert_to_dtype(rstd, input.dtype) # type: ignore[assignment] + return (out, mean, rstd) + + +# TODO: Adding this as a meta function causes functorch tests to fail when compiled with debug mode. 
+# test/test_eager_transforms.py::TestFunctionalizeCPU::test_functionalize_fx_transpose_simple_cpu +@register_decomposition(aten.permute) +def permute(a: TensorLikeType, *dims) -> TensorLikeType: + _permutation = utils.canonicalize_dims( + a.ndim, utils.extract_dims_from_varargs(dims) + ) + return prims.transpose(a, _permutation) + + +@register_decomposition(aten.renorm) +@out_wrapper() +def renorm( + input: TensorLikeType, p: RealNumberType, dim: int, maxnorm: RealNumberType +) -> TensorLikeType: + torch._check(not isinstance(p, complex), lambda: "renorm: p must be real-valued") + torch._check(p > 0, lambda: "renorm: non-positive norm not supported") + torch._check( + not isinstance(maxnorm, complex), lambda: "renorm: maxnorm must be real-valued" + ) + torch._check( + maxnorm >= 0, lambda: f"renorm: expected maxnorm to be >= 0 but got {maxnorm}" + ) + ndim = input.ndim + torch._check( + ndim > 1, + lambda: f"renorm: input needs at least 2 dimensions, got {ndim} dimensions", + ) + + dim = utils.canonicalize_dim(ndim, dim) + reduce_dims = list(range(ndim)) + del reduce_dims[dim] + + # For half and bfloat16, calculate norm in float precision then cast + # normalization factor to half + acc_type = utils.get_computation_dtype(input.dtype) + if acc_type != input.dtype: + norm = torch.linalg.vector_norm( + input, p, reduce_dims, keepdim=True, dtype=acc_type + ) + else: + norm = torch.linalg.vector_norm(input, p, reduce_dims, keepdim=True) + + eps = 1e-7 + norm_factor = torch.where(norm > maxnorm, maxnorm / (norm + eps), 1.0) + if acc_type != input.dtype: + norm_factor = prims.convert_element_type(norm_factor, input.dtype) + return (input * norm_factor).contiguous() + + +# CompositeImplicitAutograd - don't register decomp +@aten.stft.center.py_impl(DispatchKey.CompositeImplicitAutograd) +def stft( + input: Tensor, + n_fft: int, + hop_length: Optional[int] = None, + win_length: Optional[int] = None, + window: Optional[Tensor] = None, + center: bool = True, + pad_mode: str = "reflect", + normalized: bool = False, + onesided: Optional[bool] = None, + return_complex: Optional[bool] = None, +) -> Tensor: + torch._check( + window is None or window.device == input.device, + lambda: ( + f"stft input and window must be on the same device but got self on {input.device}" + + f" and window on {window.device}" # type: ignore[union-attr] + ), + ) + + hop_length_ = hop_length if hop_length is not None else n_fft // 4 + win_length_ = win_length if win_length is not None else n_fft + + if return_complex is None: + return_complex_ = input.is_complex() or ( + window is not None and utils.is_complex_dtype(window.dtype) + ) + torch._check( + return_complex_, + ( + "stft requires the return_complex parameter be given for real inputs, " + + "and will further require that return_complex=True in a future PyTorch release." 
+ ), + ) + else: + return_complex_ = return_complex + + torch._check( + utils.is_float_dtype(input.dtype) or utils.is_complex_dtype(input.dtype), + lambda: "stft expected a tensor of floating point or complex values", + ) + torch._check(1 <= input.ndim <= 2, lambda: "stft expected a 1D or 2D tensor") + + original_ndim = input.ndim + if original_ndim == 1: + input = input.unsqueeze(0) + + if center: + extra_dims = 3 - input.ndim + pad_amount = n_fft // 2 + extended_shape = [*itertools.repeat(1, extra_dims), *input.shape] + input = aten.pad(input.view(extended_shape), [pad_amount, pad_amount], pad_mode) + input = input.view(input.size()[extra_dims:]) + + batch = input.size(0) + length = input.size(1) + torch._check( + 0 < n_fft <= length, + lambda: f"stft expected 0 < n_fft <= {length}, but got n_fft={n_fft}", + ) + torch._check( + hop_length_ > 0, + lambda: f"stft expected hop_length > 0 but got hop_length={hop_length_}", + ) + torch._check( + 0 < win_length_ <= n_fft, + lambda: f"stft expected 0 < win_length <= n_fft but got win_length={win_length_}", + ) + torch._check( + window is None or window.shape == (win_length_,), + lambda: ( + f"expected a 1D window tensor of size equal to win_length={win_length_}, " + + f"but got window with size {window.shape}" # type: ignore[union-attr] + ), + ) + + if win_length_ < n_fft: + if window is None: + window = torch.ones(win_length_, dtype=input.dtype, device=input.device) + left = (n_fft - win_length_) // 2 + window = aten.constant_pad_nd(window, [left, n_fft - win_length_ - left]) + + input = input.unfold(dimension=-1, size=n_fft, step=hop_length_) + if window is not None: + input = input * window + + complex_fft = utils.is_complex_dtype(input.dtype) + onesided = onesided if onesided is not None else not complex_fft + norm = "ortho" if normalized else None + if onesided: + torch._check( + not complex_fft, + lambda: "Cannot have onesided output if window or input is complex", + ) + out = torch.fft.rfft(input, dim=-1, norm=norm) + else: + out = torch.fft.fft(input, dim=-1, norm=norm) + + out.transpose_(1, 2) + + if original_ndim == 1: + out = out.squeeze_(0) + + return out if return_complex_ else torch.view_as_real(out) + + +# CompositeImplicitAutograd - don't register decomp +@aten.istft.default.py_impl(DispatchKey.CompositeImplicitAutograd) +def istft( + input: Tensor, + n_fft: int, + hop_length: Optional[int] = None, + win_length: Optional[int] = None, + window: Optional[Tensor] = None, + center: bool = True, + normalized: bool = False, + onesided: Optional[bool] = None, + length: Optional[int] = None, + return_complex=False, +) -> Tensor: + torch._check( + window is None or window.device == input.device, + lambda: ( + f"istft input and window must be on the same device but got self on {input.device}" + + f" and window on {window.device}" # type: ignore[union-attr] + ), + ) + + hop_length_ = hop_length if hop_length is not None else n_fft // 4 + win_length_ = win_length if win_length is not None else n_fft + + torch._check( + utils.is_complex_dtype(input.dtype), + lambda: ( + "istft input and window must be on the same device but got self on " + + f"{input.device} and window on {window.device}" # type: ignore[union-attr] + ), + ) + n_frames = input.size(-1) + fft_size = input.size(-2) + + expected_output_signal_len = n_fft + hop_length_ * (n_frames - 1) + torch._check(input.numel() > 0, lambda: "istft input tensor cannot be empty") + torch._check( + 2 <= input.ndim <= 3, + lambda: f"istft expected a tensor with 2 or 3 dimensions, but got 
{input.ndim}", + ) + onesided_ = onesided if onesided is not None else fft_size != n_fft + + if onesided_: + torch._check( + n_fft // 2 + 1 == fft_size, + lambda: ( + "istft expected the frequency dimension (3rd to the last) of the input tensor " + + "to match n_fft / 2 + 1 when onesided=True, but got {fft_size}" + ), + ) + else: + torch._check( + n_fft == fft_size, + lambda: ( + "istft expected the frequency dimension (3rd to the last) of the input tensor " + + "to match n_fft when onesided=False, but got {fft_size}", + ), + ) + + torch._check( + 0 < hop_length_ <= win_length_, + lambda: "istft expected 0 < hop_length <= win_length", + ) + torch._check( + 0 < win_length_ <= n_fft, lambda: "istft expected 0 < win_length <= n_fft" + ) + torch._check( + window is None or window.shape == (win_length_,), + lambda: "Invalid window shape. window has to be 1D and length of `win_length`", + ) + + if window is None: + real_dtype = utils.corresponding_real_dtype(input.dtype) + window_ = torch.ones(win_length_, dtype=real_dtype, device=input.device) + else: + window_ = window + + if win_length_ != n_fft: + left = (n_fft - win_length_) // 2 + window_ = aten.constant_pad_nd(window_, (left, n_fft - win_length_ - left), 0) + + original_ndim = input.ndim + if input.ndim == 2: + input = input.unsqueeze(0) + + input = input.transpose(1, 2) + norm = "ortho" if normalized else None + if return_complex: + torch._check( + not onesided_, + lambda: "cannot have onesided output if window or input is complex", + ) + input = torch.fft.ifft(input, dim=-1, norm=norm) + else: + torch._check( + window is None or not utils.is_complex_dtype(window.dtype), + lambda: "Complex windows are incompatible with return_complex=False", + ) + if not onesided_: + input = input.narrow(dim=-1, start=0, length=n_fft // 2 + 1) + input = torch.fft.irfft(input, dim=-1, norm=norm) + + assert input.size(2) == n_fft + + y_tmp = input * window_.view([1, 1, n_fft]) + y = aten.unfold_backward( + y_tmp, + input_sizes=(y_tmp.size(0), expected_output_signal_len), + dim=1, + size=n_fft, + step=hop_length_, + ) + window_envelop = aten.unfold_backward( + window_.pow(2).expand((1, n_frames, n_fft)), + input_sizes=(y_tmp.size(0), expected_output_signal_len), + dim=1, + size=n_fft, + step=hop_length_, + ) + + assert expected_output_signal_len == y.size(1) + assert expected_output_signal_len == window_envelop.size(1) + + start = n_fft // 2 if center else 0 + if length is not None: + end = start + length + elif center: + end = expected_output_signal_len - n_fft // 2 + else: + end = expected_output_signal_len + + length = max(0, end - start) + y = y.narrow(dim=1, start=start, length=length) + window_envelop = window_envelop.narrow(dim=1, start=start, length=length) + + window_envelop_lowest = window_envelop.abs().min().lt(1e-11) + torch._check( + not window_envelop_lowest.item(), + lambda: "window overlap add min less than 1e-11", + ) + + y = y / window_envelop + if original_ndim == 2: + y = y.squeeze(0) + + if end > expected_output_signal_len: + warnings.warn( + "The length of signal is shorter than the length parameter. Result is being " + + "padded with zeros in the tail. 
Please check your center and hop_length settings" + ) + y = aten.constant_pad_nd(y, (0, end - expected_output_signal_len), 0) + return y + + +# Get the new shape and stride after applying unfold to an input tensor +def _get_unfold_shape_stride( + a_shape: ShapeType, a_stride: StrideType, dimension: int, size: int, step: int +): + a_ndim = len(a_shape) + dim = utils.canonicalize_dim(a_ndim, dimension, wrap_scalar=True) + max_size = 1 if a_ndim == 0 else a_shape[dim] + last_stride = 1 if a_ndim == 0 else a_stride[dim] + + torch._check( + size <= max_size, + lambda: f"Maximum size for tensor at dimension {dim} is {max_size} but size is {size}", + ) + + torch._check( + step > 0, + lambda: f"Step is {step} but must be > 0", + ) + + shape = list(a_shape) + strides = list(a_stride) + shape.append(size) + strides.append(last_stride) + if dim < a_ndim: + shape[dim] = (shape[dim] - size) // step + 1 + strides[dim] *= step + return shape, strides + + +@register_decomposition(aten.repeat) +@out_wrapper() +def repeat(a: Tensor, *repeat_shape) -> Tensor: + repeat_shape = utils.extract_shape_from_varargs(repeat_shape, validate=False) + torch._check( + len(repeat_shape) >= len(a.shape), + lambda: "repeat: Number of dimensions of repeat dims can not be smaller than number of dimensions of tensor", + ) + + if len(repeat_shape) == 0: + return torch.clone(a) + + num_new_dimensions = len(repeat_shape) - a.ndim + padded_shape = [1] * num_new_dimensions + for dim_size in a.shape: + padded_shape.append(dim_size) + + target_shape = tuple( + padded_size * repeat_size + for padded_size, repeat_size in zip(padded_shape, repeat_shape) + ) + + # return an empty tensor if one of the repeat_shape dimensions is zero + if 0 in repeat_shape: + return torch.empty( + target_shape, + dtype=a.dtype, + device=a.device, + requires_grad=a.requires_grad, + memory_format=utils.suggest_memory_format(a), + ) + + urtensor_shape = target_shape + urtensor_stride = utils.make_contiguous_strides_for(target_shape) + for dim, dim_size in enumerate(padded_shape): + # repeat each dimension by using unfold_copy operation + urtensor_shape, urtensor_stride = _get_unfold_shape_stride( + urtensor_shape, urtensor_stride, dim, dim_size, max(dim_size, 1) + ) + + # derive permute order by sorting urtensor strides + enumerated_stride = list(enumerate(urtensor_stride)) + enumerated_stride.sort(key=lambda item: item[1], reverse=True) + permute_order, sorted_stride = zip(*enumerated_stride) + + # add new and expand dimensions according to urtensor + repeat_xtensor = a.expand(urtensor_shape) + + # clone tensor to concretize expanded dimensions + cloned_result = torch.clone(repeat_xtensor) + + # transpose axis so strides are in sorted order + permuted_result = cloned_result.permute(permute_order) + + # reshape to get contiguous tensor with correct target shape + return permuted_result.reshape(target_shape) + + +def _reshape_view_helper(a: TensorLikeType, *shape, allow_copy: bool) -> TensorLikeType: + from torch.fx.experimental.symbolic_shapes import guard_size_oblivious, sym_eq + + # Creates a valid shape + shape = utils.extract_shape_from_varargs(shape, validate=False) + # Reshape may be given a shape with a -1 length + # This indicates that the dimension's length should be inferred + shape = utils.infer_size(shape, a.numel()) + + # Short-circuits if shape is the same + if guard_size_oblivious(sym_eq(tuple(a.shape), tuple(shape))): + return prims.view_of(a) + + # Special-cases tensors with no elements + if guard_size_oblivious(a.numel() == 0): + return 
as_strided(a, shape, utils.make_contiguous_strides_for(shape)) + + # Special-cases reshaping zero dim tensors + if a.ndim == 0: + _a = a + for length in shape: + assert length == 1 + _a = unsqueeze(_a, -1) + return _a + + # Special-cases reshaping to zero dim tensors + if len(shape) == 0: + _a = a + for length in a.shape: + assert length == 1 + _a = squeeze(_a, -1) + return _a + + # Handles general case: a 1+D tensor reshaped into a distinct 1+D shape + + # NOTE [Reshape Algorithm] + # This algorithm works by attempting to greedily construct the desired dimensions in + # the output shape, left to right. It does this by, conceptually, accumulating + # dimensions of the original tensor, also left to right, until the dimension + # can be constructed using prims.split_dim. + # The algorithm also has special handling for tail squeezes/unsqueezes, like + # if a reshape from (5, 5) to (5, 5, 1) or vice versa. + # + # This algorithm does not flatten the original tensor and then split dims as appropriate + # because that would create copies more often than this algorithm. flatten is the only + # operation below which can create a view or a copy, and while it prefers creating + # views it may sometimes create a copy if the tensor's strides do not permit a view. + # As a result, this algorithm tries to minimize flattening. + # + # Note that a better version of this algorithm may exist. Regions which could be + # flattened without creating a copy can be identified in advance, and that might + # allow fewer flatten calls or faster short-circuiting to make a copy. + idx = 0 + a_ = a + for length in shape: + # Handles tail unsqueezes + if idx >= a_.ndim: + assert length == 1 + last_dim = a_.ndim - 1 + # NOTE: using split_dim instead of unsqueeze may seem silly here, + # but it's necessary to get the strides correct + a_ = prims.split_dim(a_, last_dim, a_.shape[last_dim]) + idx = idx + 1 + continue + + # Skips dimensions that are already the correct length + if guard_size_oblivious(length == a_.shape[idx]): + idx = idx + 1 + continue + + # Gathers enough original dimensions such that this new dimension can be created + # Note that this accumulation will terminate because we've verified a and the shape + # specify the same number of elements above + accum = a_.shape[idx] + end = idx + while guard_size_oblivious(accum % length != 0): + end = end + 1 + accum = accum * a_.shape[end] + if end != idx: + # NOTE: in this case multiple dimensions must be flatten to create the desired dimension + # This flattening is why reshape sometimes creates a copy -- because flattening + # may return a view of a copy + + # Checks if collapse can be a view and short-circuits to copying reshape if it can't + new_shape, new_strides = prims._collapse_view_helper(a_, idx, end) + if new_shape is None: + if allow_copy: + return prims.reshape(a, shape) + + msg = "Cannot view a tensor with shape {} and strides {} as a tensor with shape {}!".format( + a.shape, a.stride(), shape + ) + raise ValueError(msg) + + a_ = flatten(a_, idx, end) + + # Splits the (possibly flattened) dimension to create the desired dim length + if guard_size_oblivious(accum != length): + a_ = prims.split_dim(a_, idx, length) + + idx = idx + 1 + + # Squeezes tail + while idx < a_.ndim: + assert a_.shape[idx] == 1 + a_ = squeeze(a_, idx) + + return a_ + + +# CompositeImplicitAutograd - don't register decomp +# NOTE: shape is a vararg because Tensor.reshape can be called with as +# Tensor.reshape(a, b, c) or Tensor.reshape((a, b, c)) Function call +# torch.reshape 
doesn't support unpacked shapes +def reshape(a: TensorLikeType, *shape: ShapeType) -> TensorLikeType: + return _reshape_view_helper(a, *shape, allow_copy=True) + + +# CompositeImplicitAutograd - don't register decomp +def reshape_as(self: TensorLikeType, other: TensorLikeType) -> TensorLikeType: + return self.reshape(other.size()) + + +@register_decomposition(aten.roll) +@out_wrapper() +def roll( + a: TensorLikeType, shifts: DimsType, dims: DimsType = tuple() +) -> TensorLikeType: + """Reference implementation of :func:`torch.roll`.""" + dims = utils.canonicalize_dims(a.ndim, dims) + # ATen specifies int[1] type for shifts and dims which expands integers to tuples of length 1 + if not isinstance(shifts, Iterable): + shifts = (shifts,) + if not isinstance(dims, Iterable): + dims = (dims,) + + # Avoid modulo by zero + if a.numel() == 0: + # Keeping this as ref for now as FakeTensor runs into some issues with complex tensors + return a.clone() + + if a.dim() == 0 and len(dims) > 0: + raise IndexError( + f"Dimension specified as {dims[0]} but tensor has no dimensions" + ) + + len_shifts = len(shifts) + len_dims = len(dims) + if len_shifts != 1 or len_dims != 1: + if len_shifts == 0: + raise RuntimeError("`shifts` required") + # Takes care of the case when dims is not specified (default) + # By default, the tensor is flattened before shifting, after which the original shape is restored + if len_dims == 0 and len_shifts == 1: + return torch.roll(torch.flatten(a), shifts, 0).view(a.shape) + if len_shifts != len_dims: + raise RuntimeError( + f"shifts and dimensions must align. shifts: {len_shifts}, dims: {len_dims}" + ) + assert len_dims > 1 + tail_shifts = shifts[1:] + tail_dims = dims[1:] + first_dim_rolled = torch.roll(a, (shifts[0],), dims[0]) + return torch.roll(first_dim_rolled, tail_shifts, tail_dims) + + # This path is taken when only one dimension is rolled + # For example to get `first_dim_rolled` above + dim = dims[0] + size = a.shape[dim] + start = (size - shifts[0]) % size + idx = torch.arange(size, device=a.device) + return a.index_select(dim, torch.fmod(start + idx, size)) + + +@register_decomposition(aten.rot90) +@out_wrapper() +def rot90( + a: TensorLikeType, k: int = 1, dims: DimsSequenceType = (0, 1) +) -> TensorLikeType: + """Reference implementation of :func:`torch.rot90`.""" + if len(dims) != 2: + raise RuntimeError( + f"expected total rotation dims == 2, but got dims = {len(dims)}" + ) + if a.ndim < 2: + raise RuntimeError(f"expected total dims >= 2, but got total dims = {a.ndim}") + + # Do this after the initial checks to be compatible with the behavior in + # core. 
+ dims = utils.canonicalize_dims(a.ndim, dims) + + if dims[0] == dims[1]: + raise RuntimeError( + f"expected rotation dims to be different, but got dim0 = {dims[0]} and dim1 = {dims[1]}" + ) + k = k % 4 # Rotation direction is from the second towards the first axis for k < 0 + if k == 1: + return torch.transpose(torch.flip(a, (dims[1],)), dims[0], dims[1]) + elif k == 2: + return torch.flip(a, dims) + elif k == 3: + return torch.transpose(torch.flip(a, (dims[0],)), dims[0], dims[1]) + else: + return clone(a, memory_format=torch.contiguous_format) + + +def _check_stack_inputs(tensors: TensorSequenceType) -> None: + entry_shape = tensors[0].shape + for i in range(1, len(tensors)): + assert tensors[i].shape == entry_shape, ( + f"stack expects each tensor to be equal size, but got {entry_shape} at entry 0" + f"and {tensors[i].shape} at entry {i}" + ) + + +@register_decomposition(aten.stack) +@out_wrapper() +def stack(tensors: TensorSequenceType, dim: int = 0) -> TensorLikeType: + assert len(tensors) > 0, "stack expects a non-empty TensorList" + wrapped_dim = utils.canonicalize_dim(tensors[0].ndim + 1, dim) + # Refs need sparse support to check other condition + if wrapped_dim < tensors[0].ndim: # and not tensors[0].is_sparse: + _check_stack_inputs(tensors) + result_sizes = list(tensors[0].shape) + result_sizes.insert(wrapped_dim, len(tensors)) + out = torch.cat(tensors, wrapped_dim) + return out.view(result_sizes) + + # If dim == tensors[0].ndim, view cannot efficiently handle it + return torch.cat([t.unsqueeze(wrapped_dim) for t in tensors], dim) + + +# CompositeImplicitAutograd - don't register decomp +@out_wrapper() +def softmax( + a: TensorLikeType, + dim: int, + dtype: Optional[torch.dtype] = None, +) -> TensorLikeType: + result_dtype = dtype or a.dtype + computation_dtype = utils.get_computation_dtype(result_dtype) + a_ = _maybe_convert_to_dtype(a, computation_dtype) + if a.numel() == 0: + a_exp = exp(a_) + else: + a_max = amax(a_, dim, keepdim=True) + a_exp = exp(a_ - a_max) + return _maybe_convert_to_dtype( + true_divide(a_exp, sum(a_exp, dim, keepdim=True)), result_dtype + ) # type: ignore[return-value] + + +# CompositeImplicitAutograd - don't register decomp +@out_wrapper() +def hstack(tensors: TensorSequenceType) -> TensorLikeType: + torch._check(len(tensors) > 0, lambda: "hstack expects a non-empty TensorList") + aligned_tensors = atleast_1d(*tensors) + if aligned_tensors[0].ndim == 1: + return cat(aligned_tensors, 0) + return cat(aligned_tensors, 1) + + +# CompositeImplicitAutograd - don't register decomp +@out_wrapper() +def vstack(tensors: TensorSequenceType) -> TensorLikeType: + torch._check(len(tensors) > 0, lambda: "vstack expects a non-empty TensorList") + aligned_tensors = atleast_2d(*tensors) + return cat(aligned_tensors, 0) + + +# CompositeImplicitAutograd - don't register decomp +def unflatten(a: TensorLikeType, dim: int, sizes: ShapeType) -> TensorLikeType: + dim = utils.canonicalize_dim(a.ndim, dim) + torch._check(len(sizes) != 0, lambda: "unflatten: sizes must be non-empty") + return a.view(tuple(a.shape[:dim]) + tuple(sizes) + tuple(a.shape[dim + 1 :])) + + +@register_decomposition(aten.unbind) +def unbind(t: TensorLikeType, dim: int = 0) -> TensorSequenceType: + dim = utils.canonicalize_dim(t.ndim, dim) + torch._check_index( + len(t.shape) > 0, + lambda: "Dimension specified as 0 but tensor has no dimensions", + ) + if t.shape[dim] == 0: + return tuple() + else: + return tuple( + torch.squeeze(s, dim) for s in torch.tensor_split(t, t.shape[dim], dim) + ) + + 
+@out_wrapper() +def index_copy(x: TensorLike, dim: int, index: TensorLike, tensor: TensorLike): + return x.clone(memory_format=torch.contiguous_format).index_copy_( + dim, index, tensor + ) + + +def index_copy_(x: TensorLike, dim: int, index: TensorLike, tensor: TensorLike): + dim = utils.canonicalize_dims(x.ndim, dim) + torch._check( + index.ndim <= 1, + lambda: f"Index should have dimension 1 or 0 (got {index.ndim})", + ) + # Treat scalars as elements of \R^1 + y = x.unsqueeze(0) if x.ndim == 0 else x + idx = (slice(None),) * dim + (index,) + y[idx] = tensor + return x + + +@register_decomposition(aten.index_fill) +@out_wrapper() +def index_fill( + x: TensorLike, dim: int, index: TensorLike, value: Union[NumberType, TensorLike] +): + return _index_fill(x, dim, index, value, inplace=False) + + +@register_decomposition(aten.index_fill_) +def index_fill_( + x: TensorLike, dim: int, index: TensorLike, value: Union[NumberType, TensorLike] +): + return _index_fill(x, dim, index, value, inplace=True) + + +def _index_fill( + x: TensorLike, + dim: int, + index: TensorLike, + value: Union[NumberType, TensorLike], + *, + inplace: bool, +): + torch._check( + index.ndim <= 1, + lambda: f"Index should have dimension 1 or 0 (got {index.ndim})", + ) + if isinstance(value, TensorLike): + torch._check( + value.ndim == 0, + lambda: "Only supports 0-dimensional value tensor. " # type: ignore[union-attr] + f"Got a tensor with {value.ndim} dimensions.", + ) # type: ignore[arg-type] + else: + value = torch.scalar_tensor( + value, dtype=x.dtype, layout=x.layout, device=x.device # type: ignore[arg-type] + ) + + # index_copy has some unnecessary preconditions when x is a scalar. We do this to work through them + zero_dim = x.ndim == 0 + y = x.unsqueeze(0) if zero_dim else x + # index_copy does not broadcast on value so we have to do it manually + shape = list(y.shape) + shape[dim] = index.numel() + value = value.expand(shape) + index_copy = Tensor.index_copy_ if inplace else torch.index_copy + out = index_copy(y, dim, index, value) # type: ignore[operator] + if inplace: + return x + else: + if zero_dim: + # The clone is necessary so that it returns a fresh tensor rather than a view + out = out.squeeze(0).clone() + # index_fill preserves the strides. 
index_copy always returns contiguous tensors + if out.stride() != x.stride(): + new_out = torch.empty_like(x) + new_out.copy_(out) + out = new_out + return out + + +@out_wrapper() +def index_add( + x: TensorLike, + dim: int, + index: TensorLike, + tensor: TensorLike, + *, + alpha: NumberType = 1, +): + # index_add always returns a new contiguous tensor + return x.clone(memory_format=torch.contiguous_format).index_add_( + dim, index, tensor, alpha=alpha # type: ignore[arg-type] + ) + + +@register_decomposition(aten.index_select) +@out_wrapper() +def index_select(x: TensorLike, dim: int, index: TensorLike): + dim = utils.canonicalize_dims(x.ndim, dim) + torch._check( + index.ndim <= 1, + lambda: f"Index should have dimension 1 or 0 (got {index.ndim})", + ) + if index.ndim == 0: + index = index.unsqueeze(0) + if x.ndim == 0: + # Treat scalars as elements of \R^1 + # We cannot use x[idx] here as it accesses item() (??), hence this awkward construction + return torch.empty_like(x).index_copy(0, index, x.expand_as(index)) + + idx = (slice(None),) * dim + (index,) + return x[idx] + + +@register_decomposition(aten.squeeze.dims) +def squeeze(a: TensorLikeType, dim: Optional[DimsType] = None) -> TensorLikeType: + from torch.fx.experimental.symbolic_shapes import guard_size_oblivious + + if dim is None: + dims = tuple(idx for idx, size in enumerate(a.shape) if size == 1) + return prims.squeeze(a, dims) if dims else prims.view_of(a) + + ndim = a.ndim + dim = utils.canonicalize_dims(ndim, dim) + dims = (dim,) if isinstance(dim, Dim) else dim + # Short-circuits if the tensor has no dimensions + if ndim == 0: + assert len(dims) == 0 or dims == (0,) + return prims.view_of(a) + + # Note: squeeze does not modify tensors when the given dim is not a dimension of length 1 + dims = tuple(d for d in dims if guard_size_oblivious(a.shape[d] == 1)) + if len(dims) == 0: + return prims.view_of(a) + if len(dims) == 1: + return prims.squeeze(a, dims) + dims_list = list(dims) + dims_list = sorted(dims_list, reverse=True) + for i in dims_list: + a = squeeze(a, i) + return a + + +# Note: does not work with TensorMetas because of data-dependent control-flow +# CompositeImplicitAutograd - don't register decomp +def tensor_split( + a: TensorLikeType, + indices_or_sections: Union[Tensor, DimsType], + dim: int = 0, +) -> Tuple[TensorLikeType, ...]: + _dim = utils.canonicalize_dim(a.ndim, dim) + if a.ndim == 0: + msg = "tensor_split: received a rank zero tensor, but expected a tensor of rank one or greater!" 
+ raise ValueError(msg) + + # If indices_or_sections is a tensor, it must be a CPU Long tensor + if isinstance(indices_or_sections, TensorLike): + if not indices_or_sections.device.type == "cpu": + msg = "tensor_split: if indices_or_sections is a tensor it must be on the CPU, but received one on {}".format( + indices_or_sections.device + ) + raise ValueError(msg) + if indices_or_sections.dtype != torch.long: + msg = "tensor_split: if indices_or_sections is a tensor it must have long dtype, " + f" but received one with dtype {indices_or_sections.dtype}" + raise ValueError(msg) + + # Case 0 -- indices_or_sections is an integer or a scalar tensor n and a is split along dim into n parts of equal-ish length + if isinstance(indices_or_sections, IntLike) or ( + isinstance(indices_or_sections, TensorLike) and indices_or_sections.ndim == 0 + ): + sections: int = ( + indices_or_sections # type: ignore[assignment] + if isinstance(indices_or_sections, Number) + else indices_or_sections.item() + ) + + if sections <= 0: + msg = f"tensor_split: number of sections must be greater than 0, but was {sections}" + raise ValueError(msg) + + splits = [] + dim_size = a.shape[_dim] + min_split_size = math.floor(dim_size / sections) + num_splits_one_extra = dim_size % sections + start_idx = 0 + for split_idx in range(sections): + split_size = ( + min_split_size + 1 + if (split_idx < num_splits_one_extra) + else min_split_size + ) + s = prims.slice_in_dim(a, start_idx, start_idx + split_size, axis=_dim) + splits.append(s) + start_idx = start_idx + split_size + + return tuple(splits) + # Case 1 -- indices_or_sections is a sequence of integers or a 1D tensor describing the splits + else: + indices = indices_or_sections + if isinstance(indices_or_sections, TensorLike): + if indices_or_sections.ndim != 1: + msg = "tensor_split: non-scalar indices_or_sections tensors must have only one dimension, " + f"but received a tensor with {indices_or_sections.ndim} dimensions" + raise ValueError(msg) + + indices = indices_or_sections.tolist() + + splits = [] + start_idx = 0 + for x in indices: + splits.append(prims.slice_in_dim(a, start_idx, x, axis=_dim)) + start_idx = x + splits.append(prims.slice_in_dim(a, start_idx, a.shape[_dim], axis=_dim)) + return tuple(splits) + + +# CompositeImplicitAutograd - don't register decomp +def hsplit( + a: TensorLikeType, indices_or_sections: DimsType +) -> Tuple[TensorLikeType, ...]: + torch._check( + a.ndim >= 1, + lambda: ( + "torch.hsplit requires a tensor with at least 1 dimension, but got a tensor with " + + str(a.ndim) + + " dimensions!" + ), + ) + dim = 0 if a.ndim == 1 else 1 + if isinstance(indices_or_sections, IntLike): + split_size = indices_or_sections + torch._check( + (split_size != 0 and a.shape[dim] % split_size == 0), + lambda: ( + "torch.hsplit attempted to split along dimension " + + str(dim) + + ", but the size of the dimension " + + str(a.shape[dim]) + + " is not divisible by the split_size " + + str(split_size) + + "!" + ), + ) + return tensor_split(a, split_size, dim) + + torch._check_type( + isinstance(indices_or_sections, (list, tuple)), + lambda: ( + "hsplit(): received an invalid combination of arguments. 
" + "Expected indices_or_sections to be of type int, list of ints or tuple of ints " + f"but got type {type(indices_or_sections)}" + ), + ) + + split_sizes = indices_or_sections + return tensor_split(a, split_sizes, dim) + + +# CompositeImplicitAutograd - don't register decomp +def vsplit( + a: TensorLikeType, indices_or_sections: DimsType +) -> Tuple[TensorLikeType, ...]: + torch._check( + a.ndim >= 2, + lambda: ( + "torch.vsplit requires a tensor with at least 2 dimension, but got a tensor with " + + str(a.ndim) + + " dimensions!" + ), + ) + if isinstance(indices_or_sections, IntLike): + split_size = indices_or_sections + torch._check( + (split_size != 0 and a.shape[0] % split_size == 0), + lambda: ( + f"torch.vsplit attempted to split along dimension 0" + f", but the size of the dimension " + f"{a.shape[0]}" + f" is not divisible by the split_size " + f"{split_size}" + f"!" + ), + ) + return tensor_split(a, split_size, 0) + + torch._check_type( + isinstance(indices_or_sections, (list, tuple)), + lambda: ( + "vsplit(): received an invalid combination of arguments. " + "Expected indices_or_sections to be of type int, list of ints or tuple of ints " + f"but got type {type(indices_or_sections)}" + ), + ) + + split_sizes = indices_or_sections + return tensor_split(a, split_sizes, 0) + + +@register_decomposition(aten.diag.out) +@out_wrapper() +def diag( + self: TensorLikeType, + offset: int = 0, +) -> TensorLikeType: + ndim = self.dim() + torch._check( + ndim in (1, 2), lambda: f"diag(): Supports 1D or 2D tensors. Got {ndim}D" + ) + if ndim == 1: + return torch.diag_embed(self, offset) + else: + return torch.diagonal_copy(self, offset) + + +@register_decomposition(aten.diagonal_scatter) +@out_wrapper() +def diagonal_scatter( + input: TensorLikeType, + src: TensorLikeType, + offset: int = 0, + dim1: int = 0, + dim2: int = 1, +) -> TensorLikeType: + out = utils.clone_preserve_strides(input) + diag = out.diagonal(offset, dim1, dim2) + torch._check( + diag.shape == src.shape, + lambda: "expected src to have a size equal to the diagonal of the input." 
+ f"Got {src.shape} for a diagonal of shape {diag.shape}", + ) + copy_to(diag, src) + return out + + +@register_decomposition(aten.diagonal) +def diagonal( + self: TensorLikeType, + offset: int = 0, + dim1: int = 0, + dim2: int = 1, +) -> TensorLikeType: + """ + Reference implementation of torch.diagonal + """ + num_dims = self.dim() + dim1 = utils.canonicalize_dim(idx=dim1, rank=num_dims) + dim2 = utils.canonicalize_dim(idx=dim2, rank=num_dims) + + torch._check( + dim1 != dim2, lambda: f"diagonal dimensions cannot be identical {dim1}, {dim2}" + ) + + storage_offset = self.storage_offset() + + if offset >= 0: + diag_size = max(min(self.size()[dim1], self.size()[dim2] - offset), 0) + else: + diag_size = max(min(self.size()[dim1] + offset, self.size()[dim2]), 0) + + if diag_size > 0: + if offset >= 0: + storage_offset += offset * self.stride()[dim2] + else: + storage_offset -= offset * self.stride()[dim1] + + sizes = [s for i, s in enumerate(self.size()) if i not in (dim1, dim2)] + sizes.append(diag_size) + + strides = [s for i, s in enumerate(self.stride()) if i not in (dim1, dim2)] + strides.append(self.stride()[dim1] + self.stride()[dim2]) + + result = self.as_strided(size=sizes, stride=strides, storage_offset=storage_offset) + + return result + + +diagonal_copy = _make_copy_from_view(diagonal) + + +@register_decomposition(aten.diag_embed) +@out_wrapper() +def diag_embed( + t: TensorLikeType, + offset: int = 0, + dim1: int = -2, + dim2: int = -1, +) -> TensorLikeType: + """ + Reference implementation of torch.diag_embed + """ + # convert from negative dims + rank = t.ndim + 1 + dim1 = utils.canonicalize_dim(rank=rank, idx=dim1) + dim2 = utils.canonicalize_dim(rank=rank, idx=dim2) + + # as per the docs, exchanging dims is equivalent to changing the sign of + # offset + if dim1 > dim2: + dim1, dim2 = dim2, dim1 + offset = -offset + + torch._check( + dim1 != dim2, lambda: f"diagonal dimensions cannot be identical {dim1}, {dim2}" + ) + + # as per the docs, the size of last dim is placed at dim1 and dim2 + last_dim = t.size(-1) + + if offset != 0: + # add padding to match the new size + t_shape = list(t.shape) + t_shape[-1] = builtins.abs(offset) + z = torch.zeros(t_shape, dtype=t.dtype, device=t.device, requires_grad=False) + pair = (z, t) if offset > 0 else (t, z) + t = torch.cat(pair, dim=-1) + # make sure the diagonal always has the same size + last_dim += builtins.abs(offset) + + # preserve original data, but place 1 at dim1 and move last dim to dim2 + t = t.unsqueeze(dim1).movedim(-1, dim2) + + # generate ranges shifting indices based on offset + a_range = torch.arange(last_dim, device=t.device, dtype=torch.int64) + b_range = torch.arange( + offset, last_dim + offset, device=t.device, dtype=torch.int64 + ) + + # broadcast + cond = a_range == b_range.unsqueeze(-1) + cond_shape = [last_dim if i in (dim1, dim2) else 1 for i in range(len(t.shape))] + cond = cond.reshape(cond_shape) + + # aten.diag_embed always returns a new contiguous tensor + # contiguous() is needed to correctly model the output stride + return utils.mask_tensor(cond, t).contiguous() + + +@register_decomposition(aten.block_diag) +@out_wrapper() +def _block_diag_iterable(tensors: List[TensorLikeType]) -> TensorLikeType: + """ + Reference implementation of torch.block_diag + """ + tensors_2d = [ + tensor.view(1, -1) if tensor.dim() <= 1 else tensor for tensor in tensors + ] + + ncols = builtins.sum(tensor.shape[1] for tensor in tensors_2d) + device = tensors_2d[0].device + + result = [] + + col_start = 0 + for i, tensor in 
enumerate(tensors_2d): + torch._check( + tensor.dim() == 2, + lambda: "Input tensors must have 2 or fewer dimensions. " + f"Input {i} has {tensor.dim()} dimensions", + ) + torch._check( + tensor.device == device, + lambda: "Input tensors must all be on the same device. " + f"Input 0 is on device {device} and input {i} is on device {tensor.device}.", + ) + row, col = tensor.shape + left = torch.zeros((row, col_start), device=device, dtype=tensor.dtype) + right = torch.zeros( + (row, ncols - col_start - col), device=device, dtype=tensor.dtype + ) + result += [torch.cat((left, tensor, right), dim=1)] + col_start += col + + return torch.cat(result, dim=0) + + +def block_diag(*tensors: List[TensorLikeType]) -> TensorLikeType: + """ + This is used as an input to PythonRefInfo. `torch.block_diag` + expects arguments splatted, but `aten.block_diag` expects only + one argument that is a list of Tensors. + """ + return _block_diag_iterable(tensors) + + +# CompositeImplicitAutograd - don't register decomp +def dsplit(a: TensorLikeType, sections: DimsType) -> TensorSequenceType: + if a.ndim < 3: + raise RuntimeError( + f"torch.dsplit requires a tensor with at least 3 dimension, but got a tensor with {a.ndim} dimensions!" + ) + if isinstance(sections, IntLike) and (sections == 0 or a.shape[2] % sections != 0): + raise RuntimeError( + "torch.dsplit attempted to split along dimension 2, " + + f"but the size of the dimension {a.shape[2]} is not divisible by the split_size {sections}!" + ) + return tensor_split(a, sections, 2) + + +@register_decomposition(aten.t.default) +def t(a: TensorLikeType): + # TODO: Add sparse support + # if a.is_sparse: + # sparse_dim = a.sparse_dim() + # dense_dim = a.dense_dim() + # if not (sparse_dim <= 2 and dense_dim == 0): + # raise RuntimeError( + # f"t() expects a tensor with <= 2 sparse and 0 dense dimensions, but got {sparse_dim} sparse and" + # f"{dense_dim} dense dimensions" + # ) + if a.ndim > 2: + raise RuntimeError( + f"t() expects a tensor with <= 2 dimensions, but self is {a.ndim}D" + ) + return torch.transpose(a, 0, 0 if a.ndim < 2 else 1) + + +# CompositeImplicitAutograd - don't register decomp +def T(a: TensorLikeType) -> TensorLikeType: + # n != 2 && n != 0 is deprecated in regular PyTorch. + torch._check( + a.ndim in (0, 2), + lambda: ( + "The use of `x.T` on tensors of dimension other than 0 or 2 " + "to reverse their shape is not supported." 
+ ), + ) + return a.t() + + +@register_decomposition(aten.alias) +def alias(a: TensorLikeType) -> TensorLikeType: + return prims.view_of(a) + + +@register_decomposition(aten.transpose) +def transpose(a: TensorLikeType, dim0: int, dim1: int) -> TensorLikeType: + _dim0, _dim1 = utils.canonicalize_dims(a.ndim, (dim0, dim1)) # type: ignore[misc] + + if a.ndim <= 1 or dim0 == dim1: + return aten.alias.default(a) + + _permutation = list(range(0, a.ndim)) + _permutation[_dim0] = _dim1 + _permutation[_dim1] = _dim0 + return torch.permute(a, _permutation) + + +# Aliases for transpose +swap_axes = transpose + + +@register_decomposition(aten.unfold) +def unfold( + self: TensorLikeType, dimension: int, size: int, step: int +) -> TensorLikeType: + shape, strides = _get_unfold_shape_stride( + self.shape, self.stride(), dimension, size, step + ) + return self.as_strided(shape, strides) + + +@register_decomposition(aten.unfold_copy) +@out_wrapper() +def unfold_copy(self: TensorLikeType, dimension: int, size: int, step: int): + return self.unfold(dimension, size, step).clone( + memory_format=torch.contiguous_format + ) + + +def _cumsumprod_common( + func, + init, + a: TensorLikeType, + dim: int, + *, + dtype: Optional[torch.dtype] = None, + out: Optional[Tensor] = None, +) -> TensorLikeType: + # We implement all the kwargs of a reduction. ATen just handles dtype + # nb. This decomposition may not be as efficient as a backend-specific implementation + ndim = a.ndim + dim = utils.canonicalize_dim(ndim, dim) + if ndim == 0: + return func(a.unsqueeze(0), dim=0, dtype=dtype, out=out) + a = a.unsqueeze(dim + 1) + rg = torch.arange(a.shape[dim], device=a.device) + mask = rg.unsqueeze(1) <= rg + for _ in range(ndim - dim - 1): + mask = mask.unsqueeze(-1) + masked_a = torch.where(mask, a, init) + return func(masked_a, dim=dim, dtype=dtype, out=out) + + +@register_decomposition(aten.cumsum) +def cumsum( + a: TensorLikeType, + dim: int, + *, + dtype: Optional[torch.dtype] = None, + out: Optional[Tensor] = None, +) -> TensorLikeType: + return _cumsumprod_common(func=sum, init=0, a=a, dim=dim, dtype=dtype, out=out) + + +@register_decomposition(aten.cumprod) +def cumprod( + a: TensorLikeType, + dim: int, + *, + dtype: Optional[torch.dtype] = None, + out: Optional[Tensor] = None, +) -> TensorLikeType: + return _cumsumprod_common(func=prod, init=1, a=a, dim=dim, dtype=dtype, out=out) + + +# Note: although squeeze is documented as having the out= kwarg it doesn't +@register_decomposition(aten.unsqueeze) +def unsqueeze(a: TensorLikeType, dim: int) -> TensorLikeType: + # Note that unsqueeze canonicalizes with rank + 1 because it allows + # a new innermost dimension to be specified + ndim = a.ndim + 1 + dim = utils.canonicalize_dim(ndim, dim) + return prims.expand_dims(a, (dim,), ndim=ndim) + + +# NOTE: shape is a vararg because Tensor.reshape can be called with as +# Tensor.view(a, b, c) or Tensor.view((a, b, c)) Function call torch.view +# doesn't support unpacked shapes +# TODO: Turn this into a decomposition (currently fails on reshape meta tests) +@register_decomposition(aten.view.default) +def view(a: TensorLikeType, *shape: ShapeType) -> TensorLikeType: + return _reshape_view_helper(a, *shape, allow_copy=False) + + +# CompositeImplicitAutograd - don't register decomp +def view_as(self: TensorLikeType, other: TensorLikeType) -> TensorLikeType: + return self.view(other.size()) + + +# CompositeImplicitAutograd - don't register decomp +def ravel(a: TensorLikeType) -> TensorLikeType: + return reshape(a, (-1,)) + + +# 
CompositeImplicitAutograd - don't register decomp +# missing ref impl. for aten.gather +@out_wrapper() +def take_along_dim( + a: torch.Tensor, indices: torch.Tensor, dim: Optional[int] = None +) -> torch.Tensor: + torch._check( + a.ndim == indices.ndim, + lambda: ( + "torch.take_along_dim(): input and indices should have the same " + f"number of dimensions, but got {a.ndim} dimensions for input, and " + f"{indices.ndim} dimensions for indices" + ), + ) + + torch._check( + utils.is_integer_dtype(indices.dtype), + lambda: ( + "torch.take_along_dim(): dtype of indices should be int but got " + f"{indices.dtype} instead" + ), + ) + + if dim is None: + return torch.gather(a.view(-1), 0, indices.view(-1)) + else: + self_sizes = list(a.shape) + self_sizes[dim] = indices.size(dim) + broadcast_shape = utils.infer_size_shapes(self_sizes, indices.size()) + indices_broadcast = broadcast_to(indices, broadcast_shape) + + indices_sizes = list(indices.shape) + indices_sizes[dim] = a.size(dim) + broadcast_shape = utils.infer_size_shapes(indices_sizes, a.size()) + self_broadcast = broadcast_to(a, broadcast_shape) + + return torch.gather(self_broadcast, dim, indices_broadcast) + + +@out_wrapper() +def empty( + *shape, + dtype: Optional[torch.dtype] = None, + layout: torch.layout = torch.strided, + device: Optional[DeviceLikeType] = None, + requires_grad: bool = False, + pin_memory: bool = False, + memory_format: torch.memory_format = torch.contiguous_format, +) -> TensorLikeType: + torch._check( + memory_format != torch.preserve_format, + lambda: "torch.empty: the Preserve memory format is not supported", + ) + + shape = utils.extract_shape_from_varargs(shape) + + if memory_format == torch.contiguous_format: + strides = utils.make_contiguous_strides_for(shape) + elif memory_format == torch.channels_last_3d: + strides = utils.make_channels_last_3d_strides_for(shape) + else: # memory_format == torch.channels_last + torch._check( + memory_format == torch.channels_last, + lambda: f"torch.empty: received an unknown memory format {memory_format}!", + ) + strides = utils.make_channels_last_2d_strides_for(shape) + + return torch.empty_strided( + shape, + strides, + dtype=dtype, + layout=layout, + device=device, + pin_memory=pin_memory, + requires_grad=requires_grad, + ) + + +@out_wrapper() +def empty_permuted( + shape, + physical_layout, + dtype: Optional[torch.dtype] = None, + layout: torch.layout = torch.strided, + device: Optional[DeviceLikeType] = None, + requires_grad: bool = False, + pin_memory: bool = False, +) -> TensorLikeType: + return prims.empty_permuted( + shape, + physical_layout, + dtype=dtype, + device=device, + requires_grad=requires_grad, + ) + + +@register_decomposition(aten.new_empty) +@out_wrapper() +def new_empty( + a: TensorLikeType, + size: ShapeType, + *, + dtype: Optional[torch.dtype] = None, + layout: Optional[torch.layout] = None, + device: Optional[DeviceLikeType] = None, + pin_memory: bool = False, +) -> TensorLikeType: + dtype = a.dtype if dtype is None else dtype + layout = a.layout if layout is None else layout + device = a.device if device is None else device + + return torch.empty( + size, + dtype=dtype, + device=device, + pin_memory=pin_memory, + layout=layout, + ) + + +@register_decomposition(aten.new_empty_strided) +@out_wrapper() +def new_empty_strided( + a: TensorLikeType, + size: ShapeType, + stride: StrideType, + *, + dtype: Optional[torch.dtype] = None, + layout: Optional[torch.layout] = None, + device: Optional[DeviceLikeType] = None, + pin_memory: bool = False, +) -> 
TensorLikeType: + """ + Reference implementation of torch.Tensor.new_empty_strided + """ + + dtype = a.dtype if dtype is None else dtype + layout = a.layout if layout is None else layout + device = a.device if device is None else device + + return torch.empty_strided( + size, + stride, + dtype=dtype, + device=device, + pin_memory=pin_memory, + layout=layout, + ) + + +@register_decomposition(aten.zeros.default) +@out_wrapper() +def zeros( + *size, + dtype: Optional[torch.dtype] = None, + layout: torch.layout = torch.strided, + device: Optional[DeviceLikeType] = None, + pin_memory: bool = False, + requires_grad: bool = False, +) -> TensorLikeType: + size = utils.extract_shape_from_varargs(size) + + if dtype is None: + dtype = torch.get_default_dtype() + + return torch.full( + size, + False if dtype == torch.bool else 0, + dtype=dtype, + layout=layout, + device=device, + pin_memory=pin_memory, + requires_grad=requires_grad, + ) + + +@register_decomposition(aten.new_zeros) +@out_wrapper() +def new_zeros( + a: TensorLikeType, + size: ShapeType, + *, + dtype: Optional[torch.dtype] = None, + layout: Optional[torch.layout] = None, + device: Optional[DeviceLikeType] = None, + pin_memory: bool = False, + requires_grad: bool = False, +) -> TensorLikeType: + dtype = a.dtype if dtype is None else dtype + layout = a.layout if layout is None else layout + device = a.device if device is None else device + + return torch.full( + size, + False if (dtype or a.dtype) == torch.bool else 0, + dtype=dtype, + layout=layout, + device=device, + pin_memory=pin_memory, + requires_grad=requires_grad, + ) + + +@register_decomposition(aten.ones.default) +@out_wrapper() +def ones( + *size, + dtype: Optional[torch.dtype] = None, + layout: torch.layout = torch.strided, + device: Optional[DeviceLikeType] = None, + pin_memory: bool = False, + requires_grad: bool = False, +) -> TensorLikeType: + size = utils.extract_shape_from_varargs(size) + + if dtype is None: + dtype = torch.get_default_dtype() + + return torch.full( + size, + True if dtype == torch.bool else 1, + dtype=dtype, + layout=layout, + device=device, + pin_memory=pin_memory, + requires_grad=requires_grad, + ) + + +@register_decomposition(aten.new_ones) +@out_wrapper() +def new_ones( + a: TensorLikeType, + size: ShapeType, + *, + dtype: Optional[torch.dtype] = None, + layout: Optional[torch.layout] = None, + device: Optional[DeviceLikeType] = None, + pin_memory: bool = False, + requires_grad: bool = False, +) -> TensorLikeType: + dtype = a.dtype if dtype is None else dtype + layout = a.layout if layout is None else layout + device = a.device if device is None else device + + return torch.full( + size, + True if (dtype or a.dtype) == torch.bool else 1, + dtype=dtype, + layout=layout, + device=device, + pin_memory=pin_memory, + requires_grad=requires_grad, + ) + + +@register_decomposition(aten.new_full) +@out_wrapper() +def new_full( + a: TensorLikeType, + size: ShapeType, + fill_value: NumberType, + *, + dtype: Optional[torch.dtype] = None, + layout: Optional[torch.layout] = None, + device: Optional[DeviceLikeType] = None, + pin_memory: bool = False, +) -> TensorLikeType: + dtype = a.dtype if dtype is None else dtype + layout = a.layout if layout is None else layout + device = a.device if device is None else device + + return torch.full( + size, + fill_value, + dtype=dtype, + layout=layout, + device=device, + pin_memory=pin_memory, + ) + + +@register_decomposition(aten.empty_like) +@out_wrapper() +def empty_like( + a: TensorLikeType, + *, + dtype: 
Optional[torch.dtype] = None, + device: Optional[DeviceLikeType] = None, + layout: Optional[torch.layout] = None, + pin_memory: bool = False, + requires_grad: bool = False, + memory_format: torch.memory_format = torch.preserve_format, +) -> TensorLikeType: + dtype = a.dtype if dtype is None else dtype + layout = a.layout if layout is None else layout + device = a.device if device is None else device + + if memory_format != torch.preserve_format: + return torch.empty( + a.shape, + dtype=dtype, + layout=layout, + device=device, + requires_grad=requires_grad, + pin_memory=pin_memory, + memory_format=memory_format, + ) + + # memory_format == torch.preserve_format + logical_to_physical_perm = ( + utils.compute_elementwise_output_logical_to_physical_perm(a) + ) + # identity perm is [2, 1, 0] + return torch.empty_permuted( + a.shape, + logical_to_physical_perm, + dtype=dtype, + layout=layout, + device=device, + pin_memory=pin_memory, + requires_grad=requires_grad, + ) + + +@register_decomposition([aten.arange.start_step, aten.arange.start_out]) +@out_wrapper() +def arange( + start: NumberType = 0, + end: Optional[NumberType] = None, + step: NumberType = 1, + *, + dtype: Optional[torch.dtype] = None, + layout: torch.layout = torch.strided, + device: Optional[DeviceLikeType] = None, + pin_memory: bool = False, + requires_grad: bool = False, +) -> TensorLikeType: + utils.check_layout(layout) + utils.check_pin_memory(pin_memory) + device = torch.device(utils.device_or_default(device)) + + assert not isinstance(start, complex) + assert not isinstance(end, complex) + assert not isinstance(step, complex) + + # Case: torch.arange(5) + if end is None: + end = start + start = 0 + torch._check(step != 0, lambda: "step must be nonzero") + if step > 0: + torch._check( + end >= start, + lambda: "upper bound and lower bound inconsistent with step sign", + ) + elif step < 0: + torch._check( + end <= start, + lambda: "upper bound and lower bound inconsistent with step sign", + ) + + def is_finite(x): + return not isinstance(x, FloatWithoutSymFloat) or math.isfinite(x) + + torch._check( + is_finite(start) and is_finite(end), + lambda: f"unsupported range: {start} -> {end}", + ) + torch._check( + is_finite(step), + lambda: f"step must be finite but got {step}", + ) + + if dtype is None: + args = (start, end, step) + integer_args = builtins.all(isinstance(arg, IntLike) for arg in args) + dtype = torch.int64 if integer_args else torch.get_default_dtype() + + is_integer = utils.is_integer_dtype(dtype) + if is_integer: + xstart = sym_int(start) + xend = sym_int(end) + xstep = sym_int(step) + + # For int64 we truncate arguments to int before calculating length, but + # other integral dtypes we don't. Weird... but needed to match ATen shapes. + if dtype == torch.int64: + # Uses floordiv to avoid ceil in inductor. 
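+        # The floordiv expression below reproduces ceil((end - start) / step) with pure
+        # integer arithmetic; sgn is the sign of xstep, so the same formula also handles
+        # negative steps. Illustrative example: arange(0, 10, 3) gives
+        # length = (10 - 0 + 3 - 1) // 3 = 4, i.e. the values [0, 3, 6, 9].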
+ sgn = bool(xstep > 0) - bool(xstep < 0) # type: ignore[possibly-undefined] + length = (xend - xstart + xstep - sgn) // xstep # type: ignore[possibly-undefined] + else: + length = math.ceil((end - start) / step) + + if is_integer: + return prims.iota( + length, + start=xstart, # type: ignore[possibly-undefined] + step=xstep, # type: ignore[possibly-undefined] + dtype=dtype, + device=device, + requires_grad=requires_grad, + ) + + computation_dtype = utils.get_acc_type(dtype, device) + index = prims.iota( + length, + start=0, + step=1, + dtype=torch.int64, + device=device, + requires_grad=False, + ) + index = _maybe_convert_to_dtype(index, computation_dtype) + result = start + step * index + result = _maybe_convert_to_dtype(result, dtype) + + if requires_grad: + result.requires_grad_(True) + return result + + +@register_decomposition(aten.lerp) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("start", "end", "weight"), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def lerp(start: Tensor, end: Tensor, weight: Union[Tensor, NumberType]): + inputs = [start, end] + if isinstance(weight, Number): + weight = start.new_full((), weight) # type: ignore[arg-type] + else: + inputs.append(weight) + assert isinstance(weight, Tensor) # mypy + # We implement it this way for numerical stability. We assume (in the stability optimisation) + # that 0 <= weight <= 1. We take the abs to deal with complex numbers + # We want to perform operations near zero, which is where floating points are most precise + # thus, we perform the following optimisation: + # If weight.abs() >= 0.5: + # return (1 - weight) * (start - end) + end + mask = weight.abs() >= 0.5 + coeff = torch.where(mask, weight - 1, weight) + base = torch.where(mask, end, start) + output = coeff * (end - start) + base + # make sure the decomposition output's stride is same as non-decomposition path. 
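+    # (compute_elementwise_output_strides below infers the strides the eager elementwise
+    # kernel would produce for these inputs, and copy_strided rematerializes the result
+    # with those strides when they differ.)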
+ stride = utils.compute_elementwise_output_strides(*_maybe_broadcast(*inputs)) + if output.stride() != stride: + output = prims.copy_strided(output, stride) + + return handle_noncontiguous_outputs(inputs, output) + + +@register_decomposition(aten.linspace) +@out_wrapper() +def linspace( + start: Union[NumberType, TensorLikeType], + end: Union[NumberType, TensorLikeType], + steps: NumberType, + *, + dtype: Optional[torch.dtype] = None, + device: Optional[DeviceLikeType] = None, + layout: torch.layout = torch.strided, + pin_memory: bool = False, + requires_grad: bool = False, +) -> TensorLikeType: + if isinstance(start, TensorLikeType): + torch._check( + start.dim() == 0, + lambda: "linspace only supports 0-dimensional start and end tensors", + ) + start = _maybe_convert_to_dtype(start, torch.float64) + if isinstance(end, TensorLikeType): + torch._check( + end.dim() == 0, + lambda: "linspace only supports 0-dimensional start and end tensors", + ) + end = _maybe_convert_to_dtype(end, torch.float64) + + if py_any(isinstance(arg, complex) for arg in (start, end, steps)): + default_complex_dtype = utils.corresponding_complex_dtype( + torch.get_default_dtype() + ) + if dtype is None: + dtype = default_complex_dtype + else: + torch._check( + utils.is_complex_dtype(dtype), + lambda: f"linspace(): inferred dtype {default_complex_dtype} can't be safely cast to passed dtype {dtype}", + ) + else: + dtype = dtype or torch.get_default_dtype() + assert isinstance(dtype, torch.dtype) + + # steps does not participate in the computation of the dtype + torch._check_type( + isinstance(steps, IntLike), + lambda: f"received an invalid combination of arguments - got \ +({type(start).__name__}, {type(end).__name__}, {type(steps).__name__})", + ) + assert isinstance(steps, IntLike) # for mypy + torch._check(steps >= 0, lambda: "number of steps must be non-negative") + + factory_kwargs = { + "layout": layout, + "device": device, + "pin_memory": pin_memory, + "requires_grad": requires_grad, + } + if steps == 0: + return torch.full((0,), 0, dtype=dtype, **factory_kwargs) # type: ignore[arg-type] + if steps == 1: + if isinstance(start, TensorLikeType): + return torch.empty((steps,), dtype=dtype, **factory_kwargs).copy_(start) # type: ignore[arg-type] + else: + return torch.full((steps,), start, dtype=dtype, **factory_kwargs) # type: ignore[arg-type] + + # Perform in arange in int because some backends like ATen or Triton do not support all the dtypes + rg = torch.arange(0, steps, **factory_kwargs) # type: ignore[arg-type] + + # Small types need to be computed in higher precision as this is, at heart, an associative scan + dtype_red = ( + torch.int64 + if (utils.is_boolean_dtype(dtype) or utils.is_integer_dtype(dtype)) + else dtype + ) + computation_dtype, _ = utils.reduction_dtypes( + rg, REDUCTION_OUTPUT_TYPE_KIND.SAME, dtype_red + ) + cast_rg = partial(_maybe_convert_to_dtype, dtype=computation_dtype) + + # We implement torch.lerp without performing rg / (steps - 1) explicitly + # With this we get out[0] == start, out[-1] == end + step = (end - start) / (steps - 1) + out = torch.where( + rg < steps / 2, + start + step * cast_rg(rg), # type: ignore[arg-type,operator] + end - step * cast_rg((steps - 1) - rg), # type: ignore[arg-type,operator] + ) + return _maybe_convert_to_dtype(out, dtype) # type: ignore[return-value] + + +@register_decomposition(aten.logspace) +@out_wrapper() +def logspace( + start: Union[NumberType, TensorLikeType], + end: Union[NumberType, TensorLikeType], + steps: NumberType, + base: NumberType = 
10, + *, + dtype: Optional[torch.dtype] = None, + device: Optional[DeviceLikeType] = None, + layout: torch.layout = torch.strided, + pin_memory: bool = False, + requires_grad: bool = False, +) -> TensorLikeType: + if dtype is None: + dtype = torch.get_default_dtype() + + # NB: NumPy doesn't have this cast + if prims.utils.is_integer_dtype(dtype): + if isinstance(start, FloatLike): + start = sym_int(start) + elif isinstance(start, TensorLikeType): + torch._check( + start.dim() == 0, + lambda: "logspace only supports 0-dimensional start and end tensors", + ) + start = _maybe_convert_to_dtype(start, dtype) + if isinstance(end, FloatLike): + end = sym_int(end) + elif isinstance(end, TensorLikeType): + torch._check( + end.dim() == 0, + lambda: "logspace only supports 0-dimensional start and end tensors", + ) + end = _maybe_convert_to_dtype(end, dtype) + + if py_any(isinstance(arg, complex) for arg in (start, end, steps)): + default_complex_dtype = utils.corresponding_complex_dtype( + torch.get_default_dtype() + ) + dtype = default_complex_dtype + _dtype = None # torch.linspace will update the correct dtype + else: + _dtype = torch.float64 + + assert not isinstance(base, complex) # for mypy + if base < 0: + raise NotImplementedError + ret = torch.linspace( # type: ignore[misc] + start, # type: ignore[arg-type] + end, # type: ignore[arg-type] + steps, # type: ignore[arg-type] + dtype=_dtype, + layout=layout, + device=device, + pin_memory=pin_memory, + requires_grad=requires_grad, + ) + return _maybe_convert_to_dtype(torch.pow(base, ret), dtype) # type: ignore[arg-type,return-value] + + +@overload +def meshgrid(tensors: Sequence[TensorLikeType], indexing: str): + pass + + +@overload +def meshgrid(*tensors: TensorLikeType, indexing: str): + pass + + +@register_decomposition(aten.meshgrid) +def meshgrid( + *tensors: Union[TensorLikeType, List[TensorLikeType], Tuple[TensorLikeType]], + indexing: str, +) -> List[TensorLikeType]: + # This ref simultaneously handles two overloads (see stubs above) + # The `indexing` argument is currently optional for torch.meshgrid, but we + # plan to make the argument required: https://github.com/pytorch/pytorch/issues/50276 + if isinstance(tensors[0], (list, tuple)): + assert len(tensors) == 1 + tensors = tuple(tensors[0]) + + torch._check( + py_all(isinstance(a, TensorLike) for a in tensors), + lambda: "meshgrid expects its inputs to be tensors", + ) + + torch._check(len(tensors) > 0, lambda: "meshgrid expects a non-empty TensorList") + + for i in range(len(tensors) - 1): + torch._check( + tensors[i].dtype == tensors[i + 1].dtype, # type: ignore[union-attr] + lambda: "meshgrid expects all tensors to have the same dtype", + ) + torch._check( + tensors[i].device == tensors[i + 1].device, # type: ignore[union-attr] + lambda: "meshgrid expects all tensors to have the same device", + ) + + swap_first_and_second_tensors = False + if indexing == "xy": + swap_first_and_second_tensors = len(tensors) >= 2 + if swap_first_and_second_tensors: + tensors = (tensors[1], tensors[0], *tensors[2:]) + else: + torch._check( + indexing == "ij", + lambda: ( + 'torch.meshgrid: indexing must be one of "xy" or "ij", ' + f"but received: {indexing}" + ), + ) + + result_shape: List[int] = [] + for t in tensors: + assert isinstance(t, TensorLike) # mypy + torch._check( + t.ndim == 0 or t.ndim == 1, + lambda: f"torch.meshgrid: Expected 0D or 1D tensor in the tensor list but got: {t}", + ) + result_shape.append(t.numel()) + + grids: List[TensorLikeType] = [] + for i, t in enumerate(tensors): + 
assert isinstance(t, TensorLike) # mypy + if t.ndim == 0: + t = t.view((1,)) + grids.append(prims.broadcast_in_dim(t, result_shape, (i,))) + + if swap_first_and_second_tensors: + # Swap outputs if we originally swapped at the beginning + grids[0], grids[1] = grids[1], grids[0] + + return grids + + +# CompositeImplicitAutograd - don't register decomp +def movedim( + input: TensorLikeType, + source: Union[int, DimsSequenceType], + destination: Union[int, DimsSequenceType], +) -> TensorLikeType: + """ + Reference implementation of torch.movedim + """ + if type(source) is int: + source = (source,) + if type(destination) is int: + destination = (destination,) + + # Converts to list to produce a compatible error message with core PyTorch, + # which prints sequences in square brackets. + torch._check( + len(source) == len(destination), # type: ignore[arg-type] + lambda: ( + "movedim: Invalid source or destination dims: source " # type: ignore[arg-type] + f"({list(source)} dims) should contain the same number " # type: ignore[arg-type] + f"of dims as destination ({list(destination)} dims)" # type: ignore[arg-type] + ), + ) + + rank = input.ndim + ss = tuple(utils.canonicalize_dims(rank=rank, indices=source)) # type: ignore[arg-type] + ds = tuple(utils.canonicalize_dims(rank=rank, indices=destination)) # type: ignore[arg-type] + + sss = set(ss) + dss = set(ds) + + # See above on why this converts to list in error messages. + torch._check( + len(ss) == len(sss), + lambda: f"movedim: repeated dim in `source` ({list(source)})", # type: ignore[arg-type] + ) + torch._check( + len(ds) == len(dss), + lambda: f"movedim: repeated dim in `destination` ({list(destination)})", # type: ignore[arg-type] + ) + + m = dict(zip(ds, ss)) + dims = [] + si = 0 # source index + for di in range(rank): + # check if the destination index is in the mapping + s = m.get(di) + if s is not None: + # insert source index if found + dims.append(s) + else: + # insert source index sequentially, skipping indices from the mapping + while si in sss: + si += 1 + dims.append(si) + si += 1 + + result = torch.permute(input, tuple(dims)) + + return result + + +# NOTE: for convenience, shape can be a tuple of ints or a tuple containing a tuple of ints +@register_decomposition(aten.empty_strided) +@out_wrapper() +def empty_strided( + shape: Union[ShapeType, Tuple[ShapeType]], + strides: StrideType, + *, + dtype: Optional[torch.dtype] = None, + device: Optional[DeviceLikeType] = None, + layout: torch.layout = torch.strided, + requires_grad: bool = False, + pin_memory: bool = False, +) -> TensorLikeType: + # Layout == strided, pin_memory is False + utils.check_layout(layout) + utils.check_pin_memory(pin_memory) + + shape = utils.extract_shape_from_varargs(shape) + dtype = torch.get_default_dtype() if dtype is None else dtype + device = torch.device("cpu") if device is None else device + + return prims.empty_strided( + shape, + strides, + dtype=dtype, + device=device, + requires_grad=requires_grad, + ) + + +@register_decomposition(aten.eye) +@out_wrapper() +def eye( + n: int, + m: Optional[int] = None, + *, + dtype: Optional[torch.dtype] = None, + layout: torch.layout = torch.strided, + device: Optional[DeviceLikeType] = None, + pin_memory: bool = False, + requires_grad: bool = False, # TODO: unused +) -> TensorLikeType: + """ + Reference implementation of torch.eye + """ + if m is None: + m = n + + torch._check(n >= 0, lambda: f"n must be greater or equal to 0, got {n}") + torch._check(m >= 0, lambda: f"m must be greater or equal to 0, got 
{m}") + + range_n = torch.arange(n, dtype=torch.int64, device=device, requires_grad=False) + range_m = torch.arange(m, dtype=torch.int64, device=device, requires_grad=False) + + cond = range_n.unsqueeze(-1) == range_m + if dtype is torch.bool: + return cond + else: + one = torch.ones( + (1,), + dtype=dtype, + layout=layout, + device=device, + pin_memory=pin_memory, + requires_grad=False, + ) + return torch.where(cond, one, 0) + # TODO: Use requires_grad. All refs taking the requires_grad kwarg must + # return a leaf tensor. + # result.requires_grad_(requires_grad) + + +@register_decomposition([aten.full.default, aten.full.out]) +@out_wrapper() +def full( + shape: ShapeType, + fill_value: NumberType, + *, + dtype: Optional[torch.dtype] = None, + layout: torch.layout = torch.strided, + device: Optional[DeviceLikeType] = None, + pin_memory: bool = False, + requires_grad: bool = False, +) -> TensorLikeType: + utils.check_layout(layout) + utils.check_pin_memory(pin_memory) + + dtype = dtype if dtype is not None else utils.type_to_dtype(type(fill_value)) + device = device if device is not None else torch.device("cpu") + + e = empty( + shape, + dtype=dtype, + layout=layout, + device=device, + pin_memory=pin_memory, + requires_grad=requires_grad, + ) + return torch.fill(e, fill_value) # type: ignore[arg-type] + + +def full_like( + a: TensorLikeType, + fill_value: NumberType, + *, + dtype: Optional[torch.dtype] = None, + layout: Optional[torch.layout] = None, + device: Optional[DeviceLikeType] = None, + pin_memory: bool = False, + requires_grad: bool = False, + memory_format: torch.memory_format = torch.preserve_format, +) -> TensorLikeType: + e = torch.empty_like( + a, + dtype=dtype, + layout=layout, + device=device, + pin_memory=pin_memory, + requires_grad=requires_grad, + memory_format=memory_format, + ) + return fill(e, fill_value) + + +@register_decomposition(aten.zeros_like) +@out_wrapper() +def zeros_like( + a: TensorLikeType, + *, + dtype: Optional[torch.dtype] = None, + layout: Optional[torch.layout] = None, + device: Optional[DeviceLikeType] = None, + pin_memory: bool = False, + requires_grad: bool = False, + memory_format: torch.memory_format = torch.preserve_format, +) -> TensorLikeType: + return torch.full_like( + a, + False if (dtype or a.dtype) == torch.bool else 0, + dtype=dtype, + layout=layout, + device=device, + pin_memory=pin_memory, + requires_grad=requires_grad, + memory_format=memory_format, + ) + + +@register_decomposition(aten.ones_like) +@out_wrapper() +def ones_like( + a: TensorLikeType, + *, + dtype: Optional[torch.dtype] = None, + layout: Optional[torch.layout] = None, + device: Optional[DeviceLikeType] = None, + pin_memory: bool = False, + requires_grad: bool = False, + memory_format: torch.memory_format = torch.preserve_format, +) -> TensorLikeType: + return torch.full_like( + a, + True if (dtype or a.dtype) == torch.bool else 1, + dtype=dtype, + layout=layout, + device=device, + pin_memory=pin_memory, + requires_grad=requires_grad, + memory_format=memory_format, + ) + + +@register_decomposition(aten.randn.default) +@out_wrapper() +def randn( + *shape, + dtype: Optional[torch.dtype] = None, + device: Optional[DeviceLikeType] = None, + layout: Optional[torch.layout] = None, + requires_grad: bool = False, + pin_memory: bool = False, +) -> TensorLikeType: + utils.check_pin_memory(pin_memory) + + shape_ = utils.extract_shape_from_varargs(shape) + + dtype = utils.dtype_or_default(dtype) + device = utils.device_or_default(device) + + return prims.normal( + shape_, + 
mean=0.0, + std=1.0, + dtype=dtype, + device=device, + requires_grad=requires_grad, + ) + + +def scalar_tensor( + a: NumberType, + *, + dtype: Optional[torch.dtype] = None, + layout: torch.layout = torch.strided, + device: Optional[DeviceLikeType] = None, + pin_memory: bool = False, +) -> TensorLikeType: + utils.check_layout(layout) + utils.check_pin_memory(pin_memory) + dtype = dtype if dtype is not None else utils.type_to_dtype(type(a)) + device = device if device is not None else torch.device("cpu") + return prims.scalar_tensor(a, dtype=dtype, device=device) + + +# +# Randomness References +# + + +def _uniform_helper( + shape: ShapeType, + low: Union[bool, int, float] = 0.0, + high: Union[bool, int, float] = 1.0, + *, + dtype: torch.dtype, + device: DeviceLikeType, +) -> TensorLikeType: + utils.validate_shape(shape) + + assert isinstance(low, Number) + assert isinstance(high, Number) + low = sym_float(low) + high = sym_float(high) + + assert isinstance(dtype, torch.dtype) + device = utils.canonicalize_device(device) + + return prims._uniform_helper(shape, low=low, high=high, dtype=dtype, device=device) + + +@register_decomposition(aten.masked_fill) +@out_wrapper() +def masked_fill(a: TensorLikeType, mask: TensorLikeType, value: TensorOrNumberLikeType): + python_type = utils.dtype_to_type(a.dtype) + if isinstance(value, Number): + value_type = type(value) + else: + # NOTE: Could not use value = item(value) as it resulted in + # RuntimeError: Cannot cast FakeTensor(cpu) to number + value_ndim = value.ndim + torch._check( + value_ndim == 0, + lambda: f"only supports a 0-dimensional value tensor, but got tensor with {value_ndim} dimension", + ) + # `masked_fill` allows cpu scalar to be moved to cuda and xpu but not otherwise. + is_cpu_scalar = a.device.type in ["cuda", "xpu"] and value.device.type == "cpu" + torch._check( + is_cpu_scalar or value.device == a.device, + lambda: "Expected `value` to be on same device as `a`", + ) + value_type = utils.dtype_to_type(value.dtype) + + if value_type is complex: + # only downcasting from complex to lower type is not allowed. + # We allow casting `value` to lower type for other case + # Eg. float -> int. 
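+        # (Illustrative: filling a float tensor with the int 3 is allowed and the value is
+        # cast to float, while a complex fill value such as 3+0j is rejected by the check below.)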
+ # Ref: https://github.com/pytorch/pytorch/issues/79195 + torch._check( + utils.is_weakly_lesser_type(value_type, python_type), + lambda: f"could not convert to type {python_type} without overflow", + ) + + # Since `where` allows type-promotion, + # cast value to correct type before passing to `where` + value = _maybe_convert_to_dtype(value, a.dtype) + r = torch.where(mask, value, a) # type: ignore[arg-type] + + # aten.mask_fill always return a new contiguous tensor + # contiguous() is needed to correctly model the output stride + return r.contiguous() + + +@register_decomposition(aten.masked_fill_) +def masked_fill_( + a: TensorLikeType, mask: TensorLikeType, value: TensorOrNumberLikeType +) -> TensorLikeType: + b = torch.masked_fill(a, mask, value) # type: ignore[arg-type] + a.copy_(b) + return a + + +# CompositeImplicitAutograd - don't register decomp +def allclose( + a: TensorLikeType, + b: TensorLikeType, + rtol: float = 1e-05, + atol: float = 1e-08, + equal_nan: bool = False, +) -> bool: + """ + Reference implementation of torch.allclose + """ + _check_close_args(name="torch.allclose", a=a, b=b, rtol=rtol, atol=atol) + + return bool( + torch.all(torch.isclose(a, b, rtol=rtol, atol=atol, equal_nan=equal_nan)).item() + ) + + +def equal(a: TensorLikeType, b: TensorLikeType) -> bool: + utils.check_same_device(a, b, allow_cpu_scalar_tensors=False) + utils.check_same_dtype(a, b) + + # Shape check + if a.ndim != b.ndim: + return False + + for x, y in zip(a.shape, b.shape): + if x != y: + return False + + # Short-circuits if there are no elements to validate + if a.numel() == 0: + return True + + return item(all(eq(a, b))) # type: ignore[return-value] + + +@register_decomposition(aten.norm) +@out_wrapper(exact_dtype=True) +def norm( + input: TensorLikeType, + p: Optional[Union[float, str]] = "fro", + dim: Optional[DimsType] = None, + keepdim: bool = False, + *, + dtype: Optional[torch.dtype] = None, +) -> TensorLikeType: + # In these cases we compute the "Frobenius norm" + if ( + p == "fro" and (dim is None or isinstance(dim, Dim) or len(dim) <= 2) + ) or p is None: + p = 2 + if isinstance(dim, Dim): + dim = [dim] + if isinstance(p, str): + # Here we either call the nuclear norm, or we call matrix_norm with some arguments + # that will throw an error + if dim is None: + dim = tuple(range(input.ndim)) + return torch.linalg.matrix_norm(input, p, dim, keepdim, dtype=dtype) + else: + return torch.linalg.vector_norm(input, p, dim, keepdim, dtype=dtype) + + +@register_decomposition(aten.trace) +@out_wrapper() +def trace(self: TensorLikeType) -> TensorLikeType: + torch._check( + self.ndim == 2, lambda: "expected a matrix, but got tensor with dim {self.ndim}" + ) + return torch.sum(torch.diag(self, 0)) + + +def _make_r_binary_op(base_op): + def rop( + a: Union[TensorLikeType, NumberType], + b: Union[TensorLikeType, NumberType], + ) -> TensorLikeType: + return base_op(b, a) + + return rop + + +rtruediv = _make_r_binary_op(true_divide) +rfloordiv = _make_r_binary_op(floor_divide) +rpow = _make_r_binary_op(pow) + + +@register_decomposition(aten.triu) +@out_wrapper() +def triu(a: TensorLikeType, diagonal: int = 0) -> TensorLikeType: + torch._check( + a.ndim >= 2, lambda: "triu: input tensor must have at least 2 dimensions" + ) + h, w = a.shape[-2:] + mask = ( + torch.arange(w, device=a.device).unsqueeze(-2) + - torch.arange(h, device=a.device).unsqueeze(-1) + ) >= diagonal + + # aten.triu always returns a new contiguous tensor + # contiguous() is needed to correctly model the output stride + return 
utils.mask_tensor(mask, a).contiguous() + + +@register_decomposition(aten.tril) +@out_wrapper() +def tril(a: TensorLikeType, diagonal: int = 0) -> TensorLikeType: + torch._check( + a.ndim >= 2, lambda: "tril: input tensor must have at least 2 dimensions" + ) + h, w = a.shape[-2:] + mask = ( + torch.arange(w, device=a.device).unsqueeze(-2) + - torch.arange(h, device=a.device).unsqueeze(-1) + ) <= diagonal + + # aten.tril always returns a new contiguous tensor + # contiguous() is needed to correctly model the output stride + return utils.mask_tensor(mask, a).contiguous() + + +# This is based on get_tril_size in aten/src/ATen/native/TensorFactories.h +# The components of the matrix that belong to the lower triangle with offset +# form a pentagon that can be broken down into a top trapezoid and a bottom +# rectangle. For the implementation of tril_indices, we need the sizes of +# both of these, as well as the length of the top side of the trapezoid. +def _get_tril_sizes(row: int, col: int, offset: int) -> Tuple[int, int, int]: + if row == 0 or col == 0: + return 0, 0, 0 + + m_first_row = min(col, 1 + offset) if offset > 0 else int(row + offset > 0) + m_last_row = max(0, min(col, row + offset)) + n_row_all = max(0, min(row, row + offset)) + n_row_trapezoid = m_last_row - m_first_row + 1 + + # Number of elements in top trapezoid + trapezoid_size = (m_first_row + m_last_row) * n_row_trapezoid // 2 + # Number of elements in bottom rectangle + diff_row = n_row_all - n_row_trapezoid + rectangle_size = max(0, diff_row * col) + + return trapezoid_size, rectangle_size, m_first_row + + +def _trilu_checks( + name: str, + row: int, + col: int, + dtype: torch.dtype, + layout: torch.layout, + pin_memory: bool, +): + torch._check(row >= 0, lambda: f"row must be non-negative, got {row}") + torch._check(col >= 0, lambda: f"col must be non-negative, got {col}") + torch._check( + dtype in (torch.int32, torch.int64), + lambda: f"\"{name}\" not implemented for '{dtype}'", + ) + + +# This is based on tril_indices_cuda in aten/src/ATen/native/cuda/TensorFactories.cu +@register_decomposition(aten.tril_indices) +@out_wrapper() +def tril_indices( + row: int, + col: int, + offset: int = 0, + *, + dtype: torch.dtype = torch.long, + layout: torch.layout = torch.strided, + device: DeviceLikeType = "cpu", + pin_memory: bool = False, +) -> TensorLikeType: + _trilu_checks("tril_indices", row, col, dtype, layout, pin_memory) + + trapezoid_size, rectangle_size, m_first_row = _get_tril_sizes(row, col, offset) + row_offset = max(0, -offset) + + arange_kw = partial( + torch.arange, layout=layout, device=device, pin_memory=pin_memory + ) + + # first we do the indices for top trapezoid + xs1 = arange_kw(0, trapezoid_size, dtype=torch.float64) + b = m_first_row - 0.5 + row_inds1 = torch.floor(-b + torch.sqrt(b * b + 2 * xs1)) + col_inds1 = torch.floor(xs1 - (2 * m_first_row - 1 + row_inds1) * row_inds1 * 0.5) + row_inds1 = _maybe_convert_to_dtype(row_inds1 + row_offset, dtype) + col_inds1 = _maybe_convert_to_dtype(col_inds1, dtype) + + # then bottom rectangle + xs2 = arange_kw(0, rectangle_size, dtype=dtype) + row_inds2 = xs2 // col + (col - m_first_row + 1 + row_offset) + col_inds2 = xs2 % col + + return torch.stack( + (torch.cat((row_inds1, row_inds2)), torch.cat((col_inds1, col_inds2))) + ) + + +# Similar to _get_tril_sizes above, but here there is a top trapezoid and +# a bottom rectangle instead. 
Note that you can't reduce this to +# _get_tril_sizes(col, row, -offset) because that would correspond to +# decomposing into a left trapezoid and right rectangle. +def _get_triu_sizes(row: int, col: int, offset: int) -> Tuple[int, int, int]: + if row == 0 or col == 0: + return 0, 0, 0 + + m_first_row = max(0, col - offset) if offset > 0 else col + + # Number of elements in top rectangle + rectangle_size = max(0, min(row, -offset) * col) + + # Number of elements in bottom trapezoid + trapezoid_size_tril, rectangle_size_tril, _ = _get_tril_sizes(row, col, offset - 1) + triu_size = row * col - (trapezoid_size_tril + rectangle_size_tril) + trapezoid_size = triu_size - rectangle_size + + return trapezoid_size, rectangle_size, m_first_row + + +@register_decomposition(aten.triu_indices) +@out_wrapper() +def triu_indices( + row: int, + col: int, + offset: int = 0, + *, + dtype: torch.dtype = torch.long, + layout: torch.layout = torch.strided, + device: DeviceLikeType = "cpu", + pin_memory: bool = False, +) -> TensorLikeType: + _trilu_checks("triu_indices", row, col, dtype, layout, pin_memory) + + trapezoid_size, rectangle_size, m_first_row = _get_triu_sizes(row, col, offset) + col_offset = max(0, offset) + + arange_kw = partial( + torch.arange, layout=layout, device=device, pin_memory=pin_memory + ) + + # indices for top rectangle + xs2 = arange_kw(0, rectangle_size, dtype=dtype) + row_inds2 = xs2 // col + col_inds2 = xs2 % col + + # bottom trapezoid + xs1 = arange_kw(0, trapezoid_size, dtype=torch.float64) + b = -0.5 - m_first_row + row_inds1 = torch.floor(-b - torch.sqrt(b * b - 2 * xs1)) + col_inds1 = torch.floor(xs1 - ((2 * m_first_row - 1 - row_inds1) * row_inds1) * 0.5) + row_inds1 = _maybe_convert_to_dtype(row_inds1, dtype) + col_inds1 = _maybe_convert_to_dtype(col_inds1, dtype) + + if col: + row_inds1 = row_inds1 + (rectangle_size // col) + col_inds1 = col_inds1 + col_offset + + return torch.stack( + (torch.cat((row_inds2, row_inds1)), torch.cat((col_inds2, col_inds1))) + ) + + +@register_decomposition(aten.bucketize) +@out_wrapper(exact_dtype=True) +def bucketize( + a: TensorLikeType, + boundaries: TensorLikeType, + *, + out_int32: bool = False, + right: bool = False, +): + torch._check( + boundaries.dim() == 1, + lambda: f"boundaries tensor must be 1 dimension but got dim({boundaries.dim()})", + ) + + out_dtype = torch.int32 if out_int32 else torch.int64 + n_boundaries = boundaries.shape[-1] + if n_boundaries == 0: + return torch.zeros_like(a) + # We are trying to find the bucket (defined by pairs of consecutive elements of `boundaries`) + # each element of `a` belongs to. 
We use binary search to achieve logarithimic complexity, + # but each step of the search is done "in parallel" over all elements of `a` + # can't use int32 as indexes, so we have to do all computations with int64 and convert at the end + start = torch.zeros(a.shape, device=a.device, dtype=torch.int64) + end = start + n_boundaries + # Max depth of the binary search + # Since we can't break out of the loop at different points for different elements of a, + # we just do the max amount of iterations that binary search requires and add condition + # tensor (cond_update below) to stop updating once the search terminates + + # For first iteration through loop we can skip some checks, we have separate implementation + mid = start + (end - start) // 2 + mid_val = boundaries[mid] + if right: + cond_mid = mid_val > a + else: + cond_mid = mid_val >= a + start = torch.where(cond_mid, start, mid + 1) + + if n_boundaries > 1: + cond_update = torch.ones_like(a, dtype=torch.bool) + niters = int(math.log2(n_boundaries)) + for _ in range(niters): + end = torch.where(cond_mid & cond_update, mid, end) + cond_update = start < end + # start might end up pointing to 1 past the end, we guard against that + mid = torch.where(cond_update, start + (end - start) // 2, 0) + mid_val = boundaries[mid] + # If right is true, the buckets are closed on the *left* + # (i.e., we are doing the equivalent of std::upper_bound in C++) + # Otherwise they are closed on the right (std::lower_bound) + if right: + cond_mid = mid_val > a + else: + cond_mid = mid_val >= a + start = torch.where((~cond_mid) & cond_update, mid + 1, start) + + return start.to(dtype=out_dtype) + + +@register_decomposition(aten.cauchy) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("self",), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def cauchy(self, median=0, sigma=1, generator=None): + assert generator is None + torch._check( + not utils.is_complex_dtype(self.dtype) + and not utils.is_integer_dtype(self.dtype) + and not utils.is_boolean_dtype(self.dtype), + lambda: f"Cauchy distribution is a continuous probability distribution. \ + dtype must be a floating point but you specified {self.dtype}", + ) + torch._check( + sigma > 0.0, + lambda: f"cauchy_ expects sigma > 0.0, but found sigma={sigma}", + ) + return median + sigma * torch.tan(math.pi * (torch.rand_like(self) - 0.5)) + + +@register_decomposition(aten.exponential) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("self",), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def exponential(self, rate=1, generator=None): + assert generator is None + torch._check( + not utils.is_complex_dtype(self.dtype) + and not utils.is_integer_dtype(self.dtype) + and not utils.is_boolean_dtype(self.dtype), + lambda: f"Exponential distribution is a continuous probability distribution. 
\ + dtype must be a floating point but you specified {self.dtype}", + ) + torch._check( + rate > 0.0, + lambda: f"exponential_ expects lambda > 0.0, but found lambda={rate}", + ) + return -1 / rate * torch.log1p(-torch.rand_like(self)) + + +@register_decomposition(aten.geometric) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("self",), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def geometric(self, p, generator=None): + assert generator is None + # TODO: fix inductor rand_like for integer, bool dtypes + torch._check( + not utils.is_complex_dtype(self.dtype) + and not utils.is_boolean_dtype(self.dtype), + lambda: f"geometric not implemented for {self.dtype}", + ) + torch._check( + 0 < p and p < 1, + lambda: f"geometric_ expects p to be in (0, 1), but got p={p}", + ) + return torch.floor(torch.log1p(-torch.rand_like(self)) / math.log1p(-p)) + 1 + + +@register_decomposition(aten.log_normal) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("self",), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def log_normal(self, mean=1, std=2, generator=None): + assert generator is None + torch._check( + not utils.is_complex_dtype(self.dtype) + and not utils.is_integer_dtype(self.dtype) + and not utils.is_boolean_dtype(self.dtype), + lambda: f"log_normal not implemented for {self.dtype}", + ) + torch._check( + 0 < std, + lambda: f"log_normal_ expects std > 0.0, but found std={std}", + ) + return torch.exp(std * torch.randn_like(self) + mean) + + +# TODO: add support for functionalization aten.normal_functional +# NOTE: the device and dtype will be ignored when shape is None +@register_decomposition(aten.normal) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=( + "mean", + "std", + ), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def normal( + mean=0, + std=1, + size=None, + *, + generator=None, + dtype=None, + layout=None, + device=None, + pin_memory=None, +): + assert layout is None or layout == torch.strided + + if not isinstance(std, TensorLike): + torch._check( + std >= 0, lambda: f"normal expects std >= 0.0, but found std {std}" + ) + + if size is None: + tensors = tuple(t for t in (mean, std) if isinstance(t, TensorLike)) + torch._check( + len(tensors) > 0, + lambda: "normal expects that either mean or std is a tensor, or size is defined", + ) + torch._check( + layout is None and pin_memory is None, + lambda: "Cannot pass layout, or pin_memory without size", + ) + + size = _broadcast_shapes(*(t.shape for t in tensors)) + dtype = tensors[0].dtype + device = tensors[0].device + else: + torch._check( + not isinstance(mean, TensorLike) and not isinstance(std, TensorLike), + lambda: "normal expects mean and std to be scalars when size is defined", + ) + dtype = torch.get_default_dtype() if dtype is None else dtype + device = torch.device("cpu") if device is None else device + + normal_samples = prims.normal( + size, + mean=0.0, + std=1.0, + dtype=dtype, + device=device, + requires_grad=False, + generator=generator, + ) + return std * normal_samples + mean + + +@register_decomposition(aten.normal_) +def normal_(self, mean=0, std=1, *, generator=None): + return normal(mean, std, self.shape, out=self, generator=generator) + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) +def rad2deg(self: TensorLikeType): + torch._check( + not utils.is_complex_dtype(self.dtype), + lambda: "rad2deg is not supported for complex tensors.", + ) 
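+    # 180 / pi: multiplying radians by this constant converts them to degrees,
+    # e.g. rad2deg(pi / 2) == 90.0.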
+ M_180_PI = 57.295779513082320876798154814105170332405472466564 + return self * M_180_PI + + +@_make_elementwise_unary_reference(ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT) +def deg2rad(self: TensorLikeType): + torch._check( + not utils.is_complex_dtype(self.dtype), + lambda: "deg2rad is not supported for complex tensors.", + ) + M_PI_180 = 0.017453292519943295769236907684886127134428718885417 + return self * M_PI_180 + + +@register_decomposition(aten.count_nonzero) +@out_wrapper() +def count_nonzero(self, dim: Optional[DimsType] = None): + return (self != 0).sum(dim) + + +def _dot_check(self, other): + torch._check( + self.dim() == 1 and other.dim() == 1, + lambda: f"1D tensors expected, but got {self.dim()}D and {other.dim()}D tensors", + ) + + def numel_error(): + return ( + f"inconsistent tensor size, expected tensor [{self.numel()}] and src [{other.numel()}] to have the" + f"same number of elements, but got {self.numel()} and {other.numel()} elements respectively" + ) + + torch._check(self.numel() == other.numel(), numel_error) + + +@register_decomposition(aten.dot) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("self", "other"), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def dot(self, other): + if self.is_complex(): + if self.is_conj(): + if other.is_conj(): + return torch.dot(self.conj(), other.conj()).conj() + else: + return torch.vdot(self.conj(), other) + elif other.is_conj(): + return torch.vdot(other.conj(), self) + + _dot_check(self, other) + return (self * other).sum() + + +@register_decomposition(aten.vdot) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("self", "other"), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def vdot(self, other): + if not self.is_complex(): + return torch.dot(self, other) + + if self.is_conj(): + if other.is_conj(): + return torch.vdot(other.conj(), self.conj()) + else: + return torch.dot(self.conj(), other) + elif other.is_conj(): + return torch.dot(self, other.conj()).conj() + + _dot_check(self, other) + # The decomposition fails if you do self.conj()... 
not sure why + return (self.conj_physical() * other).sum() + + +# inplace +abs_ = _make_inplace(abs) +acos_ = _make_inplace(acos) +acosh_ = _make_inplace(acosh) +add_ = _make_inplace(add) +addcmul_ = _make_inplace(addcmul) +addcdiv_ = _make_inplace(addcdiv) +asin_ = _make_inplace(asin) +asinh_ = _make_inplace(asinh) +atan_ = _make_inplace(atan) +atanh_ = _make_inplace(atanh) +atan2_ = _make_inplace(atan2) +bitwise_and_ = _make_inplace(bitwise_and) +bitwise_left_shift_ = _make_inplace(bitwise_left_shift) +bitwise_not_ = _make_inplace(bitwise_not) +bitwise_or_ = _make_inplace(bitwise_or) +bitwise_right_shift_ = _make_inplace(bitwise_right_shift) +bitwise_xor_ = _make_inplace(bitwise_xor) +ceil_ = _make_inplace(ceil) +clamp_ = _make_inplace(clamp) +clamp_min_ = _make_inplace(clamp_min) +clamp_max_ = _make_inplace(clamp_max) +conj_physical_ = _make_inplace(conj_physical) +copysign_ = _make_inplace(copysign) +cos_ = _make_inplace(cos) +cosh_ = _make_inplace(cosh) +cumsum_ = _make_inplace(cumsum) +cumprod_ = _make_inplace(cumprod) +deg2rad_ = _make_inplace(deg2rad) +digamma_ = _make_inplace(digamma) +div_ = _make_inplace(div) +eq_ = _make_inplace(eq) +erf_ = _make_inplace(erf) +erfc_ = _make_inplace(erfc) +erfinv_ = _make_inplace(erfinv) +exp_ = _make_inplace(exp) +exp2_ = _make_inplace(exp2) +expm1_ = _make_inplace(expm1) +float_power_ = _make_inplace(float_power) +floor_ = _make_inplace(floor) +floor_divide_ = _make_inplace(floor_divide) +fmod_ = _make_inplace(fmod) +frac_ = _make_inplace(frac) +gcd_ = _make_inplace(gcd) +ge_ = _make_inplace(ge) +gt_ = _make_inplace(gt) +heaviside_ = _make_inplace(heaviside) +hypot_ = _make_inplace(hypot) +igamma_ = _make_inplace(igamma) +igammac_ = _make_inplace(igammac) +i0_ = _make_inplace(i0) +lcm_ = _make_inplace(lcm) +le_ = _make_inplace(le) +lerp_ = _make_inplace(lerp) +lgamma_ = _make_inplace(lgamma) +log10_ = _make_inplace(log10) +log1p_ = _make_inplace(log1p) +log2_ = _make_inplace(log2) +log_ = _make_inplace(log) +logical_and_ = _make_inplace(logical_and) +logical_not_ = _make_inplace(logical_not) +logical_or_ = _make_inplace(logical_or) +logical_xor_ = _make_inplace(logical_xor) +lt_ = _make_inplace(lt) +mul_ = _make_inplace(mul) +mvlgamma_ = _make_inplace(mvlgamma) +nan_to_num_ = _make_inplace(nan_to_num) +ne_ = _make_inplace(ne) +neg_ = _make_inplace(neg) +nextafter_ = _make_inplace(nextafter) +pow_ = _make_inplace(pow) +rad2deg_ = _make_inplace(rad2deg) +reciprocal_ = _make_inplace(reciprocal) +remainder_ = _make_inplace(remainder) +rsqrt_ = _make_inplace(rsqrt) +sgn_ = _make_inplace(sgn) +sigmoid_ = _make_inplace(sigmoid) +sign_ = _make_inplace(sign) +sin_ = _make_inplace(sin) +sinc_ = _make_inplace(sinc) +sinh_ = _make_inplace(sinh) +sqrt_ = _make_inplace(sqrt) +square_ = _make_inplace(square) +sub_ = _make_inplace(sub) +tan_ = _make_inplace(tan) +tanh_ = _make_inplace(tanh) +tril_ = _make_inplace(tril) +triu_ = _make_inplace(triu) +true_divide_ = _make_inplace(true_divide) +trunc_ = _make_inplace(trunc) +xlogy_ = _make_inplace(xlogy) +cauchy_ = _make_inplace(cauchy) +exponential_ = _make_inplace(exponential) +geometric_ = _make_inplace(geometric) +log_normal_ = _make_inplace(log_normal) +zero_ = _make_inplace(zero) + + +# xref: isStorage in torch/csrc/DynamicTypes.cpp +def _isStorage(obj): + return isinstance(obj, (torch.TypedStorage, torch.UntypedStorage)) + + +# xref: compute_sizes in torch/csrc/utils/tensor_new.cpp +def _compute_sizes(seq, scalar_type): + MAX_DIMS = 128 + is_storage = _isStorage(seq) + sizes = [] + # TODO: this is 
inaccurate, we actually test PySequence_Check + while isinstance(seq, (list, tuple)): + length = len(seq) + if is_storage: + length //= scalar_type.itemsize + sizes.append(length) + if len(sizes) > MAX_DIMS: + raise ValueError(f"too many dimensions '{type(seq).__name__}'") + if length == 0: + break + try: + handle = seq[0] + except Exception: + raise ValueError( # noqa: TRY200 + f"could not determine the shape of object type '{type(seq).__name__}'" + ) + seq = handle + + return sizes + + +# xref: infer_scalar_type in torch/csrc/utils/tensor_new.cpp +def _infer_scalar_type(obj): + if isinstance(obj, FloatLike): + return torch.get_default_dtype() + if isinstance(obj, IntLike) and not isinstance(obj, bool): # careful! + return torch.int64 + if isinstance(obj, bool): + return torch.bool + if isinstance(obj, complex): + default_dtype = torch.get_default_dtype() + if default_dtype is torch.float: + return torch.cfloat + elif default_dtype is torch.double: + return torch.cdouble + else: + raise RuntimeError("invalid default scalar type for complex") + if isinstance(obj, torch.Tensor): + return obj.dtype + if isinstance(obj, str): + raise TypeError(f"new(): invalid data type '{type(obj).__name__}'") + # TODO: this is inaccurate, we actually test PySequence_Check + if isinstance(obj, (list, tuple)): + scalarType = None + length = len(obj) + # match NumPy semantics, except use default tensor type instead of + # double. + if length == 0: + return torch.get_default_dtype() + for i in range(length): + cur_item = obj[i] + # TODO: test this + """ + if cur_item is obj: + raise TypeError("new(): self-referential lists are incompatible") + """ + item_scalarType = _infer_scalar_type(cur_item) # recurse! + if scalarType is not None: + scalarType = torch.promote_types(scalarType, item_scalarType) + else: + scalarType = item_scalarType + if scalarType is torch.cdouble: + # this won't change (unless we hit undefined, but that will + # fail later) + return scalarType + return scalarType + raise RuntimeError(f"Could not infer dtype of {type(obj).__name__}") + + +# Analogous to recursive_store +# xref: recursive_store in torch/csrc/utils/tensor_new.cpp +def _recursive_build(scalarType: torch.dtype, obj: TensorOrNumberLikeType): + if isinstance(obj, Tensor) and obj.ndim <= 1: + obj = obj.item() + # fall through into next case + if isinstance(obj, Number): + return torch.scalar_tensor(obj, dtype=scalarType) + + seq = obj + return torch.stack([_recursive_build(scalarType, item) for item in seq]) + + +# xref: internal_new_from_data in torch/csrc/utils/tensor_new.cpp +def _internal_new_from_data( + options, + scalar_type, + device_opt, + data, + copy_variables, + copy_numpy, + type_inference, + pin_memory=False, +): + if isinstance(data, torch.Tensor): + torch._check( + not pin_memory, lambda: "Can't pin tensor constructed from a variable" + ) + var = data + if copy_variables: + var = var.detach() + inferred_scalar_type = var.dtype if type_inference else scalar_type + device = device_opt if device_opt is not None else var.device + return var.to( + device=device, + dtype=inferred_scalar_type, + non_blocking=False, + copy=copy_variables, + ) + + # TODO + if hasattr(data, "__cuda_array_interface__"): + return NotImplemented + + # TODO: test for numpy input with PyArray_Check + + device = device_opt if device_opt is not None else options["device"] + inferred_scalar_type = _infer_scalar_type(data) if type_inference else scalar_type + + # NB: Don't need to avoid tracing, as we aren't going to do any manual + # pointer filling 
tricks + if _isStorage(data): + return NotImplemented + else: + if torch.device(device).type == "meta": + return NotImplemented + + # In the C implementation, we would directly start poking the memory + # of a freshly allocated CPU tensor. Here, we're going to do an + # alternate, heinously slow implementation: turn each individual + # scalar into a tensor, and then repeatedly cat them together + tensor = _recursive_build(inferred_scalar_type, data) + + tensor = tensor.to(device, inferred_scalar_type, non_blocking=False, copy=False) + + # NB: lift_fresh is not needed, because we built the tensor from scalars + # guaranteeing a fresh tensor in this case + return tensor + + +# xref: tensor_ctor in torch/csrc/utils/tensor_new.cpp +def tensor(data, *, dtype=None, device=None, pin_memory=False, requires_grad=False): + # TODO (or not): support names kwarg + if isinstance(data, torch.Tensor): + warnings.warn( + "To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() " + "or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor)" + ) + type_inference = dtype is None + new_tensor = _internal_new_from_data( + # device="cpu" because that's what you get with torch.tensor(2) no + # device by default + {"device": "cpu"}, # TODO: use torch.get_default_tensor_type + dtype if dtype is not None else torch.get_default_dtype(), + device, + data, + copy_variables=True, + copy_numpy=True, + type_inference=type_inference, + pin_memory=pin_memory, + ) + new_tensor.detach_() + new_tensor.requires_grad_(requires_grad) + return new_tensor + + +# Views +# We can't model these as above, as the pattern of doing `op(a, out=a)` does not work for a view function +# given that it does not reshape the input (it just copies the result into it) + +# squeeze_ = _make_inplace(squeeze) +# t_ = _make_inplace(t) +# transpose_ = _make_inplace(transpose) +# unsqueeze_ = _make_inplace(unsqueeze) + + +import torch._refs._conversions +import torch._refs.fft +import torch._refs.linalg +import torch._refs.nn.functional +import torch._refs.special diff --git a/MLPY/Lib/site-packages/torch/_refs/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_refs/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0df7830c61db3cf2f46d2eb8e5f742c8e243913d Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_refs/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_refs/__pycache__/_conversions.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_refs/__pycache__/_conversions.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f3c25efae588e4d76c5341b9db360d74fd57fac Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_refs/__pycache__/_conversions.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_refs/__pycache__/fft.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_refs/__pycache__/fft.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b9e8d69b19c2775cdb761fac83b8f2feb82c1137 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_refs/__pycache__/fft.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_refs/_conversions.py b/MLPY/Lib/site-packages/torch/_refs/_conversions.py new file mode 100644 index 0000000000000000000000000000000000000000..ce345330e5676ef107060ed93dc50bc9ddd5ebcb --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_refs/_conversions.py @@ -0,0 +1,118 @@ +import torch 
+import torch._prims_common as utils + +# Utilities should come BEFORE this import +from torch._decomp import register_decomposition + +from torch._prims_common import TensorLikeType +from torch._prims_common.wrappers import out_wrapper +from torch._refs import _broadcast_shapes + +# Data conversion references. +# +# Note: this module breaks the usual _refs to torch naming scheme where +# _refs.foo.bar is a ref for torch.foo.bar. The following definitions are not +# part of _refs/__init__.py to avoid name clashes with Python builtin types +# (like int). + +__all__ = [ + # dtypes + "bfloat16", + "bool", + "byte", + "cdouble", + "cfloat", + "chalf", + "char", + "double", + "float", + "half", + "int", + "long", + "short", + # misc + "complex", + "polar", +] + + +def _make_conversion_method(name: str, dtype: torch.dtype): + def fn( + self: TensorLikeType, memory_format: torch.memory_format = torch.preserve_format + ) -> TensorLikeType: + return self.to(dtype, memory_format=memory_format) # type: ignore[call-overload] + + fn.__name__ = name + return fn + + +bfloat16 = _make_conversion_method("bfloat16", torch.bfloat16) + +bool = _make_conversion_method("bool", torch.bool) + +byte = _make_conversion_method("byte", torch.uint8) + +cdouble = _make_conversion_method("cdouble", torch.cdouble) + +cfloat = _make_conversion_method("cfloat", torch.cfloat) + +chalf = _make_conversion_method("chalf", torch.complex32) + +char = _make_conversion_method("char", torch.int8) + +double = _make_conversion_method("double", torch.double) + +float = _make_conversion_method("float", torch.float) + +half = _make_conversion_method("half", torch.half) + +int = _make_conversion_method("int", torch.int) + +long = _make_conversion_method("long", torch.long) + +short = _make_conversion_method("short", torch.short) + + +@register_decomposition(torch._ops.ops.aten.complex) +# Note: complex has type promotion tests disabled due to different semantics. +# exact_dtype is for compat with complex_check_dtype from core. +@out_wrapper(exact_dtype=True) +def complex(real: TensorLikeType, imag: TensorLikeType) -> TensorLikeType: + allowed_dtypes = (torch.float32, torch.float64, torch.float16) + torch._check( + real.dtype in allowed_dtypes and imag.dtype in allowed_dtypes, + lambda: ( + f"Expected both inputs to be Half, Float or Double tensors but got " + f"{real.dtype} and {imag.dtype}" + ), + ) + torch._check( + real.dtype == imag.dtype, + lambda: ( + f"Expected object of scalar type {real.dtype} but got " + f"scalar type {imag.dtype} for second argument" + ), + ) + result_dtype = utils.corresponding_complex_dtype(real.dtype) # type: ignore[arg-type] + common_shape = _broadcast_shapes(real.shape, imag.shape) + result = real.new_empty( + common_shape, + dtype=result_dtype, + layout=real.layout, + device=real.device, + # pin_memory=real.is_pinned(), # NYI + ) + result.real = real + result.imag = imag + return result + + +@register_decomposition(torch._ops.ops.aten.polar) +# Note: polar has type promotion tests disabled due to different semantics. +# exact_dtype is for compat with complex_check_dtype from core. 
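+# In effect polar(abs, angle) builds abs * exp(i*angle): real = abs*cos(angle), imag = abs*sin(angle).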
+@out_wrapper(exact_dtype=True) +def polar(abs: TensorLikeType, angle: TensorLikeType) -> TensorLikeType: + result = torch.complex(abs, angle) + result.real = abs * torch.cos(angle) + result.imag = abs * torch.sin(angle) + return result diff --git a/MLPY/Lib/site-packages/torch/_refs/fft.py b/MLPY/Lib/site-packages/torch/_refs/fft.py new file mode 100644 index 0000000000000000000000000000000000000000..47500148f8428462ab9b46f4d767047e12276e46 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_refs/fft.py @@ -0,0 +1,590 @@ +import math + +from typing import Iterable, List, Literal, NamedTuple, Optional, Sequence, Tuple, Union + +import torch +import torch._prims as prims +import torch._prims_common as utils +from torch._decomp import register_decomposition +from torch._prims_common import DimsType, ShapeType, TensorLikeType +from torch._prims_common.wrappers import _maybe_convert_to_dtype, out_wrapper + +__all__ = [ + # Transforms + "fft", + "fft2", + "fftn", + "hfft", + "hfft2", + "hfftn", + "rfft", + "rfft2", + "rfftn", + "ifft", + "ifft2", + "ifftn", + "ihfft", + "ihfft2", + "ihfftn", + "irfft", + "irfft2", + "irfftn", + # Helpers + "fftshift", + "ifftshift", +] + +NormType = Union[None, Literal["forward", "backward", "ortho"]] +_NORM_VALUES = {None, "forward", "backward", "ortho"} +aten = torch._ops.ops.aten + + +def _apply_norm( + x: TensorLikeType, norm: NormType, signal_numel: int, forward: bool +) -> TensorLikeType: + """Apply normalization to the un-normalized FFT result""" + torch._check(norm in _NORM_VALUES, lambda: f"Invalid normalization mode: {norm}") + + if norm == "ortho": + return x * (1 / math.sqrt(signal_numel)) + + normalize = (not forward and (norm is None or norm == "backward")) or ( + forward and norm == "forward" + ) + return x * (1 / signal_numel) if normalize else x + + +def _promote_type_fft( + dtype: torch.dtype, require_complex: bool, device: torch.device +) -> torch.dtype: + """Helper to promote a dtype to one supported by the FFT primitives""" + if dtype.is_complex: + return dtype + + # Promote integral to default float type + if not dtype.is_floating_point: + dtype = torch.get_default_dtype() + + allowed_types = [torch.float32, torch.float64] + maybe_support_half = device.type in ["cuda", "meta"] + + if maybe_support_half: + allowed_types.append(torch.float16) + torch._check(dtype in allowed_types, lambda: f"Unsupported dtype {dtype}") + + if require_complex: + dtype = utils.corresponding_complex_dtype(dtype) + + return dtype + + +def _maybe_promote_tensor_fft( + t: TensorLikeType, require_complex: bool = False +) -> TensorLikeType: + """Helper to promote a tensor to a dtype supported by the FFT primitives""" + cur_type = t.dtype + new_type = _promote_type_fft(cur_type, require_complex, t.device) + return _maybe_convert_to_dtype(t, new_type) # type: ignore[return-value] + + +def _resize_fft_input( + x: TensorLikeType, dims: Tuple[int, ...], sizes: Tuple[int, ...] +) -> TensorLikeType: + """ + Fixes the shape of x such that x.size(dims[i]) == sizes[i], + either by zero-padding, or by slicing x starting from 0. 
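+ For example, with dims=(0,) and sizes=(8,) a length-5 signal gains three
+ trailing zeros, while sizes=(3,) keeps only x[:3] along that dimension.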
+ """ + assert len(dims) == len(sizes) + must_copy = False + x_sizes = x.shape + pad_amount = [0] * len(x_sizes) * 2 + for i in range(len(dims)): + if sizes[i] == -1: + continue + + if x_sizes[dims[i]] < sizes[i]: + must_copy = True + pad_idx = len(pad_amount) - 2 * dims[i] - 1 + pad_amount[pad_idx] = sizes[i] - x_sizes[dims[i]] + + if x_sizes[dims[i]] > sizes[i]: + x = x.narrow(dims[i], 0, sizes[i]) + + return torch.constant_pad_nd(x, pad_amount) if must_copy else x + + +def _fft_c2r( + func_name: str, + input: TensorLikeType, + n: Optional[int], + dim: int, + norm: NormType, + forward: bool, +) -> TensorLikeType: + """Common code for performing any complex to real FFT (irfft or hfft)""" + input = _maybe_promote_tensor_fft(input, require_complex=True) + dims = (utils.canonicalize_dim(input.ndim, dim, wrap_scalar=False),) + last_dim_size = n if n is not None else 2 * (input.shape[dim] - 1) + torch._check( + last_dim_size >= 1, + lambda: f"Invalid number of data points ({last_dim_size}) specified", + ) + + if n is not None: + input = _resize_fft_input(input, dims=dims, sizes=(last_dim_size // 2 + 1,)) + + if forward: + input = torch.conj(input) + + output = prims.fft_c2r(input, dim=dims, last_dim_size=last_dim_size) + return _apply_norm(output, norm=norm, signal_numel=last_dim_size, forward=forward) + + +def _fft_r2c( + func_name: str, + input: TensorLikeType, + n: Optional[int], + dim: int, + norm: NormType, + forward: bool, + onesided: bool, +) -> TensorLikeType: + """Common code for performing any real to complex FFT (rfft or ihfft)""" + torch._check( + not input.dtype.is_complex, + lambda: f"{func_name} expects a floating point input tensor, but got {input.dtype}", + ) + input = _maybe_promote_tensor_fft(input) + dims = (utils.canonicalize_dim(input.ndim, dim, wrap_scalar=False),) + dim_size = n if n is not None else input.shape[dim] + torch._check( + dim_size >= 1, lambda: f"Invalid number of data points ({dim_size}) specified" + ) + + if n is not None: + input = _resize_fft_input(input, dims, (n,)) + + ret = prims.fft_r2c(input, dim=dims, onesided=onesided) + ret = _apply_norm(ret, norm, dim_size, forward) + return ret if forward else torch.conj(ret) + + +def _fft_c2c( + func_name: str, + input: TensorLikeType, + n: Optional[int], + dim: int, + norm: NormType, + forward: bool, +) -> TensorLikeType: + """Common code for performing any complex to complex FFT (fft or ifft)""" + torch._check( + input.dtype.is_complex, + lambda: f"{func_name} expects a complex input tensor, but got {input.dtype}", + ) + dims = (utils.canonicalize_dim(input.ndim, dim, wrap_scalar=False),) + dim_size = n if n is not None else input.shape[dim] + torch._check( + dim_size >= 1, lambda: f"Invalid number of data points ({dim_size}) specified" + ) + + if n is not None: + input = _resize_fft_input(input, dims, (n,)) + + ret = prims.fft_c2c(input, dim=dims, forward=forward) + return _apply_norm(ret, norm, dim_size, forward) + + +@register_decomposition(aten.fft_fft) +@out_wrapper() +def fft( + input: TensorLikeType, + n: Optional[int] = None, + dim: int = -1, + norm: NormType = None, +) -> TensorLikeType: + if input.dtype.is_complex: + return _fft_c2c("fft", input, n, dim, norm, forward=True) + else: + return _fft_r2c("fft", input, n, dim, norm, forward=True, onesided=False) + + +@register_decomposition(aten.fft_ifft) +@out_wrapper() +def ifft( + input: TensorLikeType, + n: Optional[int] = None, + dim: int = -1, + norm: NormType = None, +) -> TensorLikeType: + if input.dtype.is_complex: + return _fft_c2c("ifft", 
input, n, dim, norm, forward=False) + else: + return _fft_r2c("ifft", input, n, dim, norm, forward=False, onesided=False) + + +@register_decomposition(aten.fft_rfft) +@out_wrapper() +def rfft( + input: TensorLikeType, + n: Optional[int] = None, + dim: int = -1, + norm: NormType = None, +) -> TensorLikeType: + return _fft_r2c("rfft", input, n, dim, norm, forward=True, onesided=True) + + +@register_decomposition(aten.fft_irfft) +@out_wrapper() +def irfft( + input: TensorLikeType, + n: Optional[int] = None, + dim: int = -1, + norm: NormType = None, +) -> TensorLikeType: + return _fft_c2r("irfft", input, n, dim, norm, forward=False) + + +@register_decomposition(aten.fft_hfft) +@out_wrapper() +def hfft( + input: TensorLikeType, + n: Optional[int] = None, + dim: int = -1, + norm: NormType = None, +) -> TensorLikeType: + return _fft_c2r("hfft", input, n, dim, norm, forward=True) + + +@register_decomposition(aten.fft_ihfft) +@out_wrapper() +def ihfft( + input: TensorLikeType, + n: Optional[int] = None, + dim: int = -1, + norm: NormType = None, +) -> TensorLikeType: + return _fft_r2c("ihfft", input, n, dim, norm, forward=False, onesided=True) + + +class _ShapeAndDims(NamedTuple): + shape: Tuple[int, ...] + dims: Tuple[int, ...] + + +def _canonicalize_fft_shape_and_dim_args( + input: TensorLikeType, shape: Optional[ShapeType], dim: Optional[DimsType] +) -> _ShapeAndDims: + """Convert the shape and dim arguments into a canonical form where neither are optional""" + input_dim = input.ndim + input_sizes = input.shape + + if dim is not None: + if not isinstance(dim, Sequence): + dim = (dim,) + ret_dims = utils.canonicalize_dims(input_dim, dim, wrap_scalar=False) + + # Check dims are unique + torch._check( + len(set(ret_dims)) == len(ret_dims), lambda: "FFT dims must be unique" + ) + + if shape is not None: + if not isinstance(shape, Sequence): + shape = (shape,) + + # Has shape, might have dim + torch._check( + dim is None or len(dim) == len(shape), + lambda: "When given, dim and shape arguments must have the same length", + ) + transform_ndim = len(shape) + + torch._check( + transform_ndim <= input_dim, + lambda: f"Got shape with {transform_ndim} values but input tensor " + f"only has {input_dim} dimensions.", + ) + + # If shape is given, dims defaults to the last len(shape) dimensions + if dim is None: + ret_dims = tuple(range(input_dim - transform_ndim, input_dim)) + + # Translate any -1 values in shape to the default length + ret_shape = tuple( + s if s != -1 else input_sizes[d] for (s, d) in zip(shape, ret_dims) # type: ignore[possibly-undefined] + ) + elif dim is None: + # No shape, no dim + ret_dims = tuple(range(input_dim)) + ret_shape = tuple(input_sizes) + else: + # No shape, has dim + ret_shape = tuple(input_sizes[d] for d in ret_dims) # type: ignore[possibly-undefined] + + for n in ret_shape: + torch._check(n > 0, lambda: f"Invalid number of data points ({n}) specified") + + return _ShapeAndDims(shape=ret_shape, dims=ret_dims) # type: ignore[possibly-undefined] + + +def _prod(xs: Iterable[int]) -> int: + """Compute product of a list""" + prod = 1 + for x in xs: + prod *= x + return prod + + +def _fftn_c2c( + function_name: str, + input: TensorLikeType, + shape: Tuple[int, ...], + dim: Tuple[int, ...], + norm: NormType, + forward: bool, +) -> TensorLikeType: + """Common code for n-dimensional complex to complex FFTs (fftn or ifftn)""" + torch._check( + input.dtype.is_complex, + lambda: f"{function_name} expects a complex input tensor, " + f"but got {input.dtype}", + ) + x = 
_resize_fft_input(input, dim, shape) + output = prims.fft_c2c(x, dim=dim, forward=forward) + return _apply_norm(output, norm=norm, signal_numel=_prod(shape), forward=forward) + + +@register_decomposition(aten.fft_fftn) +@out_wrapper() +def fftn( + input: TensorLikeType, + s: Optional[ShapeType] = None, + dim: Optional[DimsType] = None, + norm: NormType = None, +) -> TensorLikeType: + (shape, dim) = _canonicalize_fft_shape_and_dim_args(input, s, dim) + x = _maybe_promote_tensor_fft(input, require_complex=True) + return _fftn_c2c("fftn", x, shape, dim, norm, forward=True) + + +@register_decomposition(aten.fft_ifftn) +@out_wrapper() +def ifftn( + input: TensorLikeType, + s: Optional[ShapeType] = None, + dim: Optional[DimsType] = None, + norm: NormType = None, +) -> TensorLikeType: + (shape, dim) = _canonicalize_fft_shape_and_dim_args(input, s, dim) + x = _maybe_promote_tensor_fft(input, require_complex=True) + return _fftn_c2c("ifftn", x, shape, dim, norm, forward=False) + + +@register_decomposition(aten.fft_rfftn) +@out_wrapper() +def rfftn( + input: TensorLikeType, + s: Optional[ShapeType] = None, + dim: Optional[DimsType] = None, + norm: NormType = None, +) -> TensorLikeType: + torch._check( + not input.dtype.is_complex, + lambda: f"rfftn expects a real-valued input tensor, but got {input.dtype}", + ) + shape, dim = _canonicalize_fft_shape_and_dim_args(input, s, dim) + input = _maybe_promote_tensor_fft(input, require_complex=False) + input = _resize_fft_input(input, dim, shape) + out = prims.fft_r2c(input, dim=dim, onesided=True) + return _apply_norm(out, norm=norm, signal_numel=_prod(shape), forward=True) + + +@register_decomposition(aten.fft_ihfftn) +@out_wrapper() +def ihfftn( + input: TensorLikeType, + s: Optional[ShapeType] = None, + dim: Optional[DimsType] = None, + norm: NormType = None, +) -> TensorLikeType: + torch._check( + not input.dtype.is_complex, + lambda: f"ihfftn expects a real-valued input tensor, but got {input.dtype}", + ) + shape, dim = _canonicalize_fft_shape_and_dim_args(input, s, dim) + torch._check(len(shape) > 0, lambda: "ihfftn must transform at least one axis") + input = _maybe_promote_tensor_fft(input, require_complex=False) + input = _resize_fft_input(input, dim, shape) + + tmp = prims.fft_r2c(input, dim=dim[-1:], onesided=True) + + if len(dim) == 1: + tmp = _apply_norm(tmp, norm=norm, signal_numel=shape[0], forward=False) + return prims.conj(tmp) + + tmp = prims.conj_physical(tmp) + tmp = prims.fft_c2c(tmp, dim=dim[:-1], forward=False) + return _apply_norm(tmp, norm=norm, signal_numel=_prod(shape), forward=False) + + +class _CanonicalizeC2rReturn(NamedTuple): + shape: Tuple[int, ...] + dim: Tuple[int, ...] 
+ last_dim_size: int + + +def _canonicalize_fft_c2r_shape_and_dim_args( + fname: str, + input: TensorLikeType, + s: Optional[ShapeType], + dim: Optional[DimsType], +) -> _CanonicalizeC2rReturn: + """Canonicalize shape and dim arguments for n-dimensional c2r transforms, + as well as calculating the last_dim_size which is shape[dim[-1]] for the output""" + (shape, dim) = _canonicalize_fft_shape_and_dim_args(input, s, dim) + torch._check(len(shape) > 0, lambda: f"{fname} must transform at least one axis") + + if s is None or s[-1] == -1: + last_dim_size = 2 * (input.shape[dim[-1]] - 1) + else: + last_dim_size = shape[-1] + + torch._check( + last_dim_size >= 1, + lambda: f"Invalid number of data points ({last_dim_size}) specified", + ) + + shape_list = list(shape) + shape_list[-1] = last_dim_size // 2 + 1 + return _CanonicalizeC2rReturn( + shape=tuple(shape_list), dim=dim, last_dim_size=last_dim_size + ) + + +@register_decomposition(aten.fft_irfftn) +@out_wrapper() +def irfftn( + input: TensorLikeType, + s: Optional[ShapeType] = None, + dim: Optional[DimsType] = None, + norm: NormType = None, +) -> TensorLikeType: + shape, dim, last_dim_size = _canonicalize_fft_c2r_shape_and_dim_args( + "irfftn", input, s, dim + ) + input = _maybe_promote_tensor_fft(input, require_complex=True) + input = _resize_fft_input(input, dim, shape) + out = prims.fft_c2r(input, dim=dim, last_dim_size=last_dim_size) + return _apply_norm(out, norm, _prod(out.shape[d] for d in dim), forward=False) + + +@register_decomposition(aten.fft_hfftn) +@out_wrapper() +def hfftn( + input: TensorLikeType, + s: Optional[ShapeType] = None, + dim: Optional[DimsType] = None, + norm: NormType = None, +) -> TensorLikeType: + shape, dim, last_dim_size = _canonicalize_fft_c2r_shape_and_dim_args( + "hfftn", input, s, dim + ) + input = _maybe_promote_tensor_fft(input, require_complex=True) + input = _resize_fft_input(input, dim, shape) + + tmp = prims.fft_c2c(input, dim=dim[:-1], forward=True) if len(dim) > 1 else input + tmp = _apply_norm(tmp, norm, _prod(shape[:-1]), forward=True) + tmp = prims.conj_physical(tmp) + out = prims.fft_c2r(tmp, dim=dim[-1:], last_dim_size=last_dim_size) + return _apply_norm(out, norm, last_dim_size, forward=True) + + +@register_decomposition(aten.fft_fft2) +@out_wrapper() +def fft2( + input: TensorLikeType, + s: Optional[ShapeType] = None, + dim: Optional[DimsType] = (-2, -1), + norm: NormType = None, +) -> TensorLikeType: + return torch.fft.fftn(input, s=s, dim=dim, norm=norm) + + +@register_decomposition(aten.fft_ifft2) +@out_wrapper() +def ifft2( + input: TensorLikeType, + s: Optional[ShapeType] = None, + dim: Optional[DimsType] = (-2, -1), + norm: NormType = None, +) -> TensorLikeType: + return torch.fft.ifftn(input, s=s, dim=dim, norm=norm) + + +@register_decomposition(aten.fft_rfft2) +@out_wrapper() +def rfft2( + input: TensorLikeType, + s: Optional[ShapeType] = None, + dim: Optional[DimsType] = (-2, -1), + norm: NormType = None, +) -> TensorLikeType: + return torch.fft.rfftn(input, s=s, dim=dim, norm=norm) + + +@register_decomposition(aten.fft_irfft2) +@out_wrapper() +def irfft2( + input: TensorLikeType, + s: Optional[ShapeType] = None, + dim: Optional[DimsType] = (-2, -1), + norm: NormType = None, +) -> TensorLikeType: + return torch.fft.irfftn(input, s=s, dim=dim, norm=norm) + + +@register_decomposition(aten.fft_hfft2) +@out_wrapper() +def hfft2( + input: TensorLikeType, + s: Optional[ShapeType] = None, + dim: Optional[DimsType] = (-2, -1), + norm: NormType = None, +) -> TensorLikeType: + return 
torch.fft.hfftn(input, s=s, dim=dim, norm=norm) + + +@register_decomposition(aten.fft_ihfft2) +@out_wrapper() +def ihfft2( + input: TensorLikeType, + s: Optional[ShapeType] = None, + dim: Optional[DimsType] = (-2, -1), + norm: NormType = None, +) -> TensorLikeType: + return torch.fft.ihfftn(input, s=s, dim=dim, norm=norm) + + +def _default_alldims(dim: Optional[DimsType], x: TensorLikeType) -> List[int]: + """Convert Optional[DimsType] to a simple list, defaulting to all dimensions""" + if dim is None: + return list(range(x.ndim)) + elif not isinstance(dim, Sequence): + return [dim] + else: + return list(dim) + + +@register_decomposition(aten.fft_fftshift) +def fftshift(input: TensorLikeType, dim: Optional[DimsType] = None) -> TensorLikeType: + dims = _default_alldims(dim, input) + shift = [input.shape[d] // 2 for d in dims] + return torch.roll(input, shift, dims) + + +@register_decomposition(aten.fft_ifftshift) +def ifftshift(input: TensorLikeType, dim: Optional[DimsType] = None) -> TensorLikeType: + dims = _default_alldims(dim, input) + shift = [(input.shape[d] + 1) // 2 for d in dims] + return torch.roll(input, shift, dims) diff --git a/MLPY/Lib/site-packages/torch/_refs/linalg/__init__.py b/MLPY/Lib/site-packages/torch/_refs/linalg/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..492f43e840909c6236f98c3e138022ff8317d9be --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_refs/linalg/__init__.py @@ -0,0 +1,308 @@ +from functools import partial + +from typing import List, Optional, Tuple, Union + +import torch + +import torch._prims as prims + +import torch._prims_common as utils +import torch._refs as refs +import torch._refs.linalg as linalg +from torch import Tensor +from torch._prims_common import ( + check_fp_or_complex, + check_is_matrix, + Dim, + DimsType, + ELEMENTWISE_TYPE_PROMOTION_KIND, + IntLike, + NumberType, + TensorLikeType, +) +from torch._prims_common.wrappers import ( + _maybe_convert_to_dtype, + elementwise_type_promotion_wrapper, + out_wrapper, +) + + +__all__ = [ + "diagonal", + "matrix_norm", + "norm", + "svd", + "svdvals", + "vector_norm", + "vecdot", + "cross", +] + + +def _check_norm_dtype(dtype: Optional[torch.dtype], x_dtype: torch.dtype, fn_name: str): + """ + Checks related to the dtype kwarg in `linalg.*norm` functions + """ + if dtype is not None: + torch._check( + utils.is_float_dtype(dtype) or utils.is_complex_dtype(dtype), + lambda: f"{fn_name}: dtype should be floating point or complex. Got {dtype}", + ) + torch._check( + utils.is_complex_dtype(dtype) == utils.is_complex_dtype(x_dtype), + lambda: "{fn_name}: dtype should be {d} for {d} inputs. 
Got {dtype}".format( + fn_name=fn_name, + d="complex" if utils.is_complex_dtype(x_dtype) else "real", + dtype=dtype, + ), + ) + torch._check( + utils.get_higher_dtype(dtype, x_dtype) == dtype, + lambda: f"{fn_name}: the dtype of the input ({x_dtype}) should be convertible " + "without narrowing to the specified dtype ({dtype})", + ) + + +# Utilities should come BEFORE this import +from torch._decomp import register_decomposition +from torch._decomp.decompositions import pw_cast_for_opmath + + +@register_decomposition(torch._ops.ops.aten.linalg_cross) +@out_wrapper() +@pw_cast_for_opmath +def cross(a: Tensor, b: Tensor, dim: int = -1): + torch._check( + a.ndim == b.ndim, + lambda: "linalg.cross: inputs must have the same number of dimensions.", + ) + torch._check( + a.size(dim) == 3 and b.size(dim) == 3, + lambda: f"linalg.cross: inputs dim {dim} must have length 3, got {a.size(dim)} and {b.size(dim)}", + ) + a, b = torch.broadcast_tensors(a, b) + dim = utils.canonicalize_dim(a.ndim, dim) + idx = torch.arange(3, device=a.device) + return a.index_select(dim, (idx + 1) % 3) * b.index_select( + dim, (idx + 2) % 3 + ) - a.index_select(dim, (idx + 2) % 3) * b.index_select(dim, (idx + 1) % 3) + + +def diagonal( + input: TensorLikeType, + *, + offset: int = 0, + dim1: int = -2, + dim2: int = -1, +) -> TensorLikeType: + return torch.diagonal(input, offset=offset, dim1=dim1, dim2=dim2) + + +@register_decomposition(torch._ops.ops.aten.linalg_vector_norm) +@out_wrapper(exact_dtype=True) +def vector_norm( + x: TensorLikeType, + ord: Union[float, int] = 2, + dim: Optional[DimsType] = None, + keepdim: bool = False, + *, + dtype: Optional[torch.dtype] = None, +) -> Tensor: + # Checks + check_fp_or_complex(x.dtype, "linalg.vector_norm") + + if isinstance(dim, Dim): + dim = [dim] # type: ignore[assignment] + + if x.numel() == 0 and (ord < 0.0 or ord == float("inf")): + torch._check( + dim is not None and len(dim) != 0, + lambda: f"linalg.vector_norm cannot compute the {ord} norm on an empty tensor " + "because the operation does not have an identity", + ) + shape = x.shape + assert dim is not None # mypy does not seem to be able to see through check? 
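+ # Each reduced dimension must itself be non-empty: the requested norm has no identity element.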
+ for d in dim: + torch._check( + shape[d] != 0, + lambda: f"linalg.vector_norm cannot compute the {ord} norm on the " + f"dimension {d} because this dimension is empty and the " + "operation does not have an identity", + ) + _check_norm_dtype(dtype, x.dtype, "linalg.vector_norm") + + computation_dtype, result_dtype = utils.reduction_dtypes( + x, utils.REDUCTION_OUTPUT_TYPE_KIND.COMPLEX_TO_FLOAT, dtype + ) + + to_result_dtype = partial(_maybe_convert_to_dtype, dtype=result_dtype) + + # Implementation + if ord == 0.0: + return torch.sum(torch.ne(x, 0.0), dim=dim, keepdim=keepdim, dtype=result_dtype) + elif ord == float("inf"): + return to_result_dtype(torch.amax(torch.abs(x), dim=dim, keepdim=keepdim)) # type: ignore[return-value,arg-type] + elif ord == float("-inf"): + return to_result_dtype(torch.amin(torch.abs(x), dim=dim, keepdim=keepdim)) # type: ignore[return-value,arg-type] + else: + # From here on the computation dtype is important as the reduction is non-trivial + x = _maybe_convert_to_dtype(x, computation_dtype) # type: ignore[assignment] + reduce_sum = partial(torch.sum, dim=dim, keepdim=keepdim) + + is_ord_even = ord % 2 == 0 if isinstance(ord, IntLike) else ord % 2.0 == 0.0 + if not (is_ord_even and utils.is_float_dtype(x.dtype)): + x = torch.abs(x) + return to_result_dtype(torch.pow(reduce_sum(torch.pow(x, ord)), 1.0 / ord)) # type: ignore[return-value] + + +def _backshift_permutation(dim0, dim1, ndim): + # Auxiliary function for matrix_norm + # Computes the permutation that moves the two given dimensions to the back + ret = [i for i in range(ndim) if i != dim0 and i != dim1] + ret.extend((dim0, dim1)) + return ret + + +def _inverse_permutation(perm): + # Given a permutation, returns its inverse. It's equivalent to argsort on an array + return [i for i, j in sorted(enumerate(perm), key=lambda i_j: i_j[1])] + + +# CompositeImplicitAutograd +@out_wrapper(exact_dtype=True) +def matrix_norm( + A: TensorLikeType, + ord: Union[float, str] = "fro", + dim: DimsType = (-2, -1), + keepdim: bool = False, + *, + dtype: Optional[torch.dtype] = None, +) -> TensorLikeType: + # shape + check_is_matrix(A, "linalg.matrix_norm") + # dim + dim = utils.canonicalize_dims(A.ndim, dim) + if isinstance(dim, Dim): + dim = (dim,) # type: ignore[assignment] + torch._check( + len(dim) == 2, lambda: "linalg.matrix_norm: dim must be a 2-tuple. Got {dim}" + ) + torch._check( + dim[0] != dim[1], + lambda: "linalg.matrix_norm: dims must be different. 
Got ({dim[0]}, {dim[1]})", + ) + # dtype arg + _check_norm_dtype(dtype, A.dtype, "linalg.matrix_norm") + + if isinstance(ord, str): + # ord + torch._check( + ord in ("fro", "nuc"), + lambda: "linalg.matrix_norm: Order {ord} not supported.", + ) + # dtype + check_fp_or_complex( + A.dtype, "linalg.matrix_norm", allow_low_precision_dtypes=ord != "nuc" + ) + + if ord == "fro": + return vector_norm(A, 2, dim, keepdim, dtype=dtype) + else: # ord == "nuc" + if dtype is not None: + A = _maybe_convert_to_dtype(A, dtype) # type: ignore[assignment] + perm = _backshift_permutation(dim[0], dim[1], A.ndim) + result = torch.sum(svdvals(prims.transpose(A, perm)), -1, keepdim) + if keepdim: + inv_perm = _inverse_permutation(perm) + result = prims.transpose(torch.unsqueeze(result, -1), inv_perm) + return result + else: + # ord + abs_ord = abs(ord) + torch._check( + abs_ord in (2, 1, float("inf")), + lambda: "linalg.matrix_norm: Order {ord} not supported.", + ) + # dtype + check_fp_or_complex( + A.dtype, "linalg.matrix_norm", allow_low_precision_dtypes=ord != 2 + ) + + max_min = partial(torch.amax if ord > 0.0 else torch.amin, keepdim=keepdim) + + if abs_ord == 2.0: + if dtype is not None: + A = _maybe_convert_to_dtype(A, dtype) # type: ignore[assignment] + perm = _backshift_permutation(dim[0], dim[1], A.ndim) + result = max_min(svdvals(prims.transpose(A, perm)), dim=-1) + if keepdim: + inv_perm = _inverse_permutation(perm) + result = prims.transpose(torch.unsqueeze(result, -1), inv_perm) + return result + else: # 1, -1, inf, -inf + dim0, dim1 = dim + if abs_ord == float("inf"): + dim0, dim1 = dim1, dim0 + if not keepdim and (dim0 < dim1): + dim1 -= 1 + return max_min( + vector_norm(A, 1.0, dim=dim0, keepdim=keepdim, dtype=dtype), dim1 + ) + + +# CompositeImplicitAutograd +@out_wrapper(exact_dtype=True) +def norm( + A: TensorLikeType, + ord: Optional[Union[float, str]] = None, + dim: Optional[DimsType] = None, + keepdim: bool = False, + *, + dtype: Optional[torch.dtype] = None, +) -> TensorLikeType: + if dim is not None: + if isinstance(dim, Dim): + dim = (dim,) # type: ignore[assignment] + torch._check( + len(dim) in (1, 2), + lambda: "linalg.norm: If dim is specified, it must be of length 1 or 2. Got {dim}", + ) + elif ord is not None: + torch._check( + A.ndim in (1, 2), + lambda: "linalg.norm: If dim is not specified but ord is, the input must be 1D or 2D. 
Got {A.ndim}D", + ) + + if ord is not None and ( + (dim is not None and len(dim) == 2) or (dim is None and A.ndim == 2) + ): + if dim is None: + dim = (0, 1) + return matrix_norm(A, ord, dim, keepdim, dtype=dtype) + else: + if ord is None: + ord = 2.0 + return vector_norm(A, ord, dim, keepdim, dtype=dtype) + + +# CompositeImplicitAutograd +@out_wrapper("U", "S", "Vh", exact_dtype=True) +def svd(A: TensorLikeType, full_matrices: bool = True) -> Tuple[Tensor, Tensor, Tensor]: + return prims.svd(A, full_matrices=full_matrices) + + +# CompositeImplicitAutograd +@out_wrapper(exact_dtype=True) +def svdvals(A: TensorLikeType) -> Tensor: + return svd(A, full_matrices=False)[1] + + +# CompositeImplicitAutograd +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("x", "y"), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def vecdot(x: Tensor, y: Tensor, dim: int = -1) -> Tensor: + check_fp_or_complex(x.dtype, "linalg.vecdot") + return (x.conj() * y).sum(dim=dim) diff --git a/MLPY/Lib/site-packages/torch/_refs/linalg/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_refs/linalg/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4340d23e2be53fba4f6dd5dd4613cb4281b8f57c Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_refs/linalg/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_refs/nn/__init__.py b/MLPY/Lib/site-packages/torch/_refs/nn/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..970be144221489803f5ff4fcbe500037775ca79e --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_refs/nn/__init__.py @@ -0,0 +1,3 @@ +from typing import List + +__all__: List[str] = [] diff --git a/MLPY/Lib/site-packages/torch/_refs/nn/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_refs/nn/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ef53da8d86a863fa1bff3e5a1b262b24400454be Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_refs/nn/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_refs/nn/functional/__init__.py b/MLPY/Lib/site-packages/torch/_refs/nn/functional/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2d0a32d1c7c0da310a3d5e965ebacfed6be163b9 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_refs/nn/functional/__init__.py @@ -0,0 +1,1230 @@ +import math +from functools import wraps +from typing import Callable, Optional, Union + +import torch +import torch._prims as prims +import torch._prims_common as utils +import torch._refs as refs +from torch._decomp import register_decomposition +from torch._prims_common import ( + ELEMENTWISE_TYPE_PROMOTION_KIND, + NumberType, + ShapeType, + TensorLike, + TensorLikeType, +) +from torch._prims_common.wrappers import ( + elementwise_type_promotion_wrapper, + elementwise_unary_scalar_wrapper, + out_wrapper, +) +from torch._refs import _make_inplace + +__all__ = [ + "alpha_dropout", + "celu", + "celu_", + "dropout", + "elu", + "elu_", + "gelu", + "glu", + "group_norm", + "hardshrink", + "hardtanh", + "hinge_embedding_loss", + "huber_loss", + "l1_loss", + "layer_norm", + "leaky_relu", + "log_softmax", + "margin_ranking_loss", + "mish", + "mish_", + "mse_loss", + "nll_loss", + "pairwise_distance", + "pdist", + "poisson_nll_loss", + "prelu", + "relu", + "relu6", + "selu", + "selu_", + "smooth_l1_loss", + "softmax", + "softmin", + "softplus", + 
"softshrink", + "tanhshrink", + "threshold", + "threshold_", + "triplet_margin_loss", +] + +Tensor = torch.Tensor +aten = torch._ops.ops.aten +DispatchKey = torch._C.DispatchKey # type: ignore[attr-defined] + + +def _dropout_helper( + self: TensorLikeType, + val: float, +) -> TensorLikeType: + """ + Helper function for all dropout-type operators. During training, + some of the elements of the input tensor are randomly masked. + + Returns the masked tensor of the boolean values. + + """ + + return ( + refs._uniform_helper( + self.shape, low=0.0, high=1.0, dtype=torch.float32, device=self.device + ) + < val + ) + + +@register_decomposition(aten.alpha_dropout) +def alpha_dropout( + self: TensorLikeType, p: float = 0.5, training: bool = False, inplace: bool = False +) -> TensorLikeType: + if inplace: + raise NotImplementedError + + if not training: + return self + + torch._check( + p <= 1 and p >= 0, + lambda: f"dropout probability has to be between 0 and 1, but got, {p}", + ) + + if p == 1: + return torch.zeros_like(self) + + if p == 0: + return self + + dropout_mask = _dropout_helper(self, 1 - p) + + # From paper: Self-Normalizing Neural Networks (https://arxiv.org/pdf/1706.02515.pdf) + # alpha = - SELU.alpha * SELU.scale, here + # SELU.alpha = 1.6732632423543772848170429916717 and + # SELU.scale = 1.0507009873554804934193349852946 + alpha = -1.7580993408473766 + + a = 1.0 / math.sqrt((alpha * alpha * p + 1) * (1 - p)) + b = torch.logical_not(dropout_mask) + b = b * (alpha * a) + alpha * a * p + dropout_mask = a * dropout_mask + + return self * dropout_mask + b + + +def _inplace_wrapper(fn): + """ + Given a nn.functional non-linearity, implements its `inplace: bool` argument + """ + + # nb. We use the name of the first argument used in the unary references + @wraps(fn) + def _fn(a, *args, inplace=False, **kwargs): + if inplace: + torch._check( + "out" not in kwargs, + lambda: "Cannot set inplace=True and pass out= at the same time", + ) + return fn(a, *args, inplace=False, out=a, **kwargs) + else: + return fn(a, *args, inplace=False, **kwargs) + + return _fn + + +# celu is implemented specially because it has an alpha argument +# celu is very similar to elu +@register_decomposition(aten.celu) +@_inplace_wrapper +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("a",), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def celu( + a: TensorLikeType, alpha: Optional[NumberType] = None, inplace: bool = False +) -> TensorLikeType: + """ + Reference implementation of torch.nn.functional.celu + """ + + if inplace: + raise NotImplementedError + + rhs: TensorLikeType + if alpha is not None: + python_type = utils.dtype_to_type(a.dtype) + if not utils.is_weakly_lesser_type(type(alpha), python_type): + msg = f"alpha argument of type {type(alpha)} cannot be safely cast to type {python_type}!" 
+ raise ValueError(msg) + rhs = alpha * torch.expm1(torch.true_divide(a, alpha)) # type: ignore[arg-type] + else: + rhs = torch.expm1(a) + + return torch.where(a > 0, a, rhs) + + +@_inplace_wrapper +@out_wrapper() +def dropout( + a: TensorLikeType, p: float = 0.5, training: bool = True, inplace: bool = False +) -> TensorLikeType: + if inplace: + raise NotImplementedError + + if not training: + return a + + torch._check( + p <= 1 and p >= 0, + lambda: f"dropout probability has to be between 0 and 1, but got, {p}", + ) + + if p == 1: + return torch.zeros_like(a) + + if p == 0: + return a + + scale = 1 / (1 - p) + dropout_mask = _dropout_helper(a, 1 - p) + + return a * dropout_mask * scale + + +@register_decomposition(aten.elu) +@_inplace_wrapper +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("a",), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def elu( + a: TensorLikeType, + alpha: NumberType = 1.0, + scale: NumberType = 1.0, + input_scale: NumberType = 1.0, + inplace: bool = False, +) -> TensorLikeType: + """ + Reference implementation of torch.nn.functional.elu + """ + if inplace: + raise NotImplementedError + + # nb. This should be factored out into a can_cast aux function + python_type = utils.dtype_to_type(a.dtype) + torch._check( + utils.is_weakly_lesser_type(type(input_scale), python_type), + lambda: f"input_scale argument of type {type(input_scale)} cannot be safely cast to type {python_type}!", + ) + torch._check( + utils.is_weakly_lesser_type(type(scale), python_type), + lambda: f"scale argument of type {type(scale)} cannot be safely cast to type {python_type}!", + ) + torch._check( + utils.is_weakly_lesser_type(type(alpha), python_type), + lambda: f"alpha argument of type {type(alpha)} cannot be safely cast to type {python_type}!", + ) + + return torch.where(a > 0, scale * a, (alpha * scale) * torch.expm1(a * input_scale)) + + +@register_decomposition(aten.relu) +@_inplace_wrapper +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("a",), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def relu(a: TensorLikeType, inplace: bool = False) -> TensorLikeType: + """ + Reference implementation of torch.nn.functional.relu + """ + + if inplace: + raise NotImplementedError + + return torch.where(torch.le(a, 0), 0, a) + + +def group_norm( + input: Tensor, + num_groups: int, + weight: Optional[Tensor] = None, + bias: Optional[Tensor] = None, + eps: float = 1e-5, +) -> Tensor: + """ + Reference implementation of :func:`torch.nn.functional.group_norm`. 
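+ Expects input of shape (N, C, *) with C divisible by num_groups; the trailing
+ dimensions are flattened before dispatching to torch.native_group_norm.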
+ """ + torch._check( + input.ndim >= 2, + lambda: f"Expected at least 2 dimensions for input tensor but received {input.ndim}", + ) + + batch_size = input.shape[0] + num_channels = input.shape[1] + torch._check( + num_channels % num_groups == 0, + lambda: "Expected number of channels in input to be divisible by num_groups, " + + f"but got input of shape {input.shape} and num_groups = {num_groups}", + ) + + # input shape is (N, C, *), so we flatten all inner dimensions except (N, C) + flattened_inner_size = 1 + for dim_length in input.shape[2:]: + flattened_inner_size *= dim_length + + return torch.native_group_norm( + input, + weight, + bias, + batch_size, + num_channels, + flattened_inner_size, + num_groups, + eps, + )[0] + + +def layer_norm( + input: Tensor, + normalized_shape: ShapeType, + weight: Optional[Tensor] = None, + bias: Optional[Tensor] = None, + eps: float = 1e-5, +) -> Tensor: + """ + Reference implementation of :func:`torch.nn.functional.layer_norm`. + """ + return torch.native_layer_norm(input, normalized_shape, weight, bias, eps)[0] + + +@register_decomposition(aten.leaky_relu) +@_inplace_wrapper +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("a",), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def leaky_relu( + a: TensorLikeType, negative_slope: float = 0.01, inplace: bool = False +) -> TensorLikeType: + """ + Reference implementation of torch.nn.functional.leaky_relu + """ + + if inplace: + raise NotImplementedError + + python_type = utils.dtype_to_type(a.dtype) + if not utils.is_weakly_lesser_type(type(negative_slope), python_type): + msg = f"negative_slope argument of type {type(negative_slope)} cannot be safely cast to type {python_type}!" + raise ValueError(msg) + return torch.where(torch.gt(a, 0), a, torch.mul(a, negative_slope)) + + +@register_decomposition(aten.mish) +@_inplace_wrapper +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("a",), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def mish(a: TensorLikeType, inplace: bool = False) -> TensorLikeType: + """ + Reference implementation of torch.nn.functional.mish + """ + + if inplace: + raise NotImplementedError + return a * torch.tanh(torch.nn.functional.softplus(a)) + + +@register_decomposition(aten.selu) +@_inplace_wrapper +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("a",), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def selu(a: TensorLikeType, inplace: bool = False) -> TensorLikeType: + """ + Reference implementation of torch.nn.functional.selu + """ + if inplace: + raise NotImplementedError + + alpha = 1.6732632423543772848170429916717 + scale = 1.0507009873554804934193349852946 + + rhs = alpha * torch.expm1(a) + + return scale * torch.where(a > 0, a, rhs) + + +# Forwarding alias: the functional variant doesn't support the out kwarg +# CompositeImplicitAutograd - don't register decomp +def softmax( + a: TensorLikeType, + dim: Optional[int] = None, + _stacklevel: int = 3, # for compat when using TorchRefsMode(strict=True) + dtype: Optional[torch.dtype] = None, +) -> TensorLikeType: + # The error is for compat with regular PyTorch, which has this behavior + # deprecated. For PrimTorch, it's fine to drop support for deprecated + # behavior because it requires explicit opt in. This error is to inform + # users how to update their calls. 
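+ # e.g. softmax(x) without an explicit dim raises here; pass an explicit dim such as dim=-1 instead.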
+ torch._check(dim is not None, lambda: "implicit dim not supported, use dim=X") + return torch.softmax(a=a, dim=dim, dtype=dtype) # type: ignore[call-overload] + + +# CompositeImplicitAutograd - don't register decomp +def softmin( + a: TensorLikeType, + dim: Optional[int] = None, + _stacklevel: int = 3, # for compat when using TorchRefsMode(strict=True) + dtype: Optional[torch.dtype] = None, +) -> TensorLikeType: + # The error is for compat with regular PyTorch, which has this behavior + # deprecated. For PrimTorch, it's fine to drop support for deprecated + # behavior because it requires explicit opt in. This error is to inform + # users how to update their calls. + torch._check(dim is not None, lambda: "implicit dim not supported, use dim=X") + return torch.softmax(a=-a, dim=dim, dtype=dtype) # type: ignore[call-overload] + + +# softplus is implemented specially because it has beta and threshold arguments +@register_decomposition(aten.softplus) +@_inplace_wrapper +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("a",), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def softplus( + a: TensorLikeType, + beta: Optional[NumberType] = None, + threshold: NumberType = 20, + inplace: bool = False, +) -> TensorLikeType: + """ + Reference implementation of torch.nn.functional.softplus + """ + + if inplace: + raise NotImplementedError + + rhs: TensorLikeType + if beta is not None: + python_type = utils.dtype_to_type(a.dtype) + if not utils.is_weakly_lesser_type(type(beta), python_type): + msg = f"beta argument of type {type(beta)} cannot be safely cast to type {python_type}!" + raise ValueError(msg) + scaled_input = a * beta + rhs = torch.true_divide(torch.log1p(torch.exp(scaled_input)), beta) # type: ignore[arg-type] + + else: + scaled_input = a + rhs = torch.log1p(torch.exp(scaled_input)) + + return torch.where(scaled_input > threshold, a, rhs) + + +@aten.hardshrink.default.py_impl(DispatchKey.Autograd) +@register_decomposition(aten.hardshrink) +@out_wrapper() +def hardshrink(a: TensorLikeType, lambd: float = 0.5): + # Formula for reference, + # hardshrink(x) = x if x > lambd + # = x if x < -lambd + # = 0 otherwise + return torch.where(torch.abs(a) <= lambd, 0, a) + + +@aten.softshrink.default.py_impl(DispatchKey.Autograd) +@register_decomposition(aten.softshrink) +@out_wrapper() +def softshrink(a: TensorLikeType, lambd: float = 0.5): + # Formula for reference, + # softshrink(x) = x - lambd if x > lambd + # = x + lambd if x < -lambd + # = 0 otherwise + torch._check( + lambd >= 0, + lambda: f"lambda must be greater or equal to 0, but found to be {lambd}", + ) + # We implement this in one torch.where to generate better code in the backward + # see https://github.com/pytorch/pytorch/pull/107052#discussion_r1293748211 + return torch.where(torch.abs(a) > lambd, a - torch.sign(a) * lambd, 0) + + +# Losses +def _reduction_int_to_str(reduction: int) -> str: + from torch._decomp.decompositions import Reduction + + if reduction == Reduction.NONE.value: + return "none" + elif reduction == Reduction.MEAN.value: + return "mean" + elif reduction == Reduction.SUM.value: + return "sum" + else: + raise ValueError(f"{reduction} is not a valid value for reduction") + + +def _apply_loss_reduction(loss: TensorLikeType, reduction: str) -> TensorLikeType: + if reduction == "sum": + return torch.sum(loss) + elif reduction == "mean": + return torch.mean(loss) + else: # reduction == "none" + return loss + + +def _check_reduction_value(reduction: str): + if reduction not in 
("mean", "sum", "none"): + raise ValueError(f"{reduction} is not a valid value for reduction") + + +# This helper function maps depreciated arguments, "size_average" and "reduce" +# to their corresponding "reduction" string argument +def _get_string_reduction_arg( + *, size_average: Optional[bool], reduce: Optional[bool] +) -> str: + if size_average is None: + size_average = True + if reduce is None: + reduce = True + if size_average and reduce: + ret = "mean" + elif reduce: + ret = "sum" + else: + ret = "none" + return ret + + +# CompositeImplicitAutograd - don't register decomp +@elementwise_type_promotion_wrapper( + type_promoting_args=("input", "target"), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.COMPLEX_TO_FLOAT, +) +def l1_loss( + input: TensorLikeType, + target: TensorLikeType, + size_average: Optional[bool] = None, + reduce: Optional[bool] = None, + reduction: str = "mean", +) -> TensorLikeType: + """ + Reference implementation of torch.nn.functional.l1_loss + """ + if size_average is not None or reduce is not None: + # TODO: Raise exception instead of converting value. This is only for + # primTorch since it can drop support for deprecated arguments. + # msg = "size_average and reduce args are deprecated, please use reduction argument." + reduction = _get_string_reduction_arg(size_average=size_average, reduce=reduce) + _check_reduction_value(reduction) + loss = torch.abs(input - target) + return _apply_loss_reduction(loss, reduction) + + +@elementwise_type_promotion_wrapper( + type_promoting_args=("input", "target"), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.COMPLEX_TO_FLOAT, +) +def smooth_l1_loss( + input: TensorLikeType, + target: TensorLikeType, + size_average: Optional[bool] = None, + reduce: Optional[bool] = None, + reduction: str = "mean", + beta: float = 1.0, +) -> TensorLikeType: + """ + Reference implementation of torch.nn.functional.smooth_l1_loss + """ + if size_average is not None or reduce is not None: + # TODO: Raise exception instead of converting value. This is only for + # primTorch since it can drop support for deprecated arguments. + # msg = "size_average and reduce args are deprecated, please use reduction argument." + reduction = _get_string_reduction_arg(size_average=size_average, reduce=reduce) + _check_reduction_value(reduction) + + if beta == 0.0: + return torch.nn.functional.l1_loss( + input, target, size_average=size_average, reduce=reduce, reduction=reduction + ) + else: + loss = torch.abs(input - target) + loss = torch.where(loss < beta, 0.5 * loss**2 / beta, loss - 0.5 * beta) + return _apply_loss_reduction(loss, reduction) + + +# Forwarding alias: the functional variant doesn't support the out kwarg +# CompositeImplicitAutograd - don't register decomp +def log_softmax( + a: TensorLikeType, + dim: Optional[int] = None, + _stacklevel: int = 3, # for compat when using TorchRefsMode(strict=True) + dtype: Optional[torch.dtype] = None, +) -> TensorLikeType: + # The error is for compat with regular PyTorch, which has this behavior + # deprecated. For PrimTorch, it's fine to drop support for deprecated + # behavior because it requires explicit opt in. This error is to inform + # users how to update their calls. 
+ torch._check(dim is not None, lambda: "implicit dim not supported, use dim=X") + return torch.log_softmax(a=a, dim=dim, dtype=dtype) # type: ignore[call-overload] + + +@register_decomposition(aten.margin_ranking_loss) +def margin_ranking_loss( + input1: TensorLikeType, + input2: TensorLikeType, + target: TensorLikeType, + margin: float = 0.0, + reduction: str = "mean", +) -> TensorLikeType: + # loss_without_reduction = max(0, −target * (input1 − input2) + margin) + if input1.ndim != input2.ndim or input1.ndim != target.ndim: + raise RuntimeError( + "margin_ranking_loss : All input tensors should have same dimension but got sizes: " + f"input1: {input1.shape}, input2: {input2.shape}, target: {target.shape} " + ) + _check_reduction_value(reduction) + loss = torch.clamp_min(-target * (input1 - input2) + margin, 0) + return _apply_loss_reduction(loss, reduction) + + +@elementwise_type_promotion_wrapper( + type_promoting_args=("input", "target"), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.COMPLEX_TO_FLOAT, +) +def mse_loss( + input: TensorLikeType, + target: TensorLikeType, + size_average: Optional[bool] = None, + reduce: Optional[bool] = None, + reduction: str = "mean", +) -> TensorLikeType: + if size_average is not None or reduce is not None: + # TODO: Raise exception instead of converting value. This is only for + # primTorch since it can drop support for deprecated arguments. + # msg = "size_average and reduce args are deprecated, please use reduction argument." + reduction = _get_string_reduction_arg(size_average=size_average, reduce=reduce) + _check_reduction_value(reduction) + loss = torch.pow(input - target, 2) + return _apply_loss_reduction(loss, reduction) + + +@register_decomposition(aten.hinge_embedding_loss) +def hinge_embedding_loss( + input: TensorLikeType, + target: TensorLikeType, + margin: float = 1.0, + reduction: str = "mean", +) -> TensorLikeType: + # loss_without_reduction = input if y == 1 + # = max(0, margin - input) if y == -1 + _check_reduction_value(reduction) + margin_clamp = torch.clamp_min(margin - input, 0) + output_margin = torch.where(target != 1, margin_clamp, 0) + output_self = torch.where(target != -1, input, 0) + loss = output_margin + output_self + return _apply_loss_reduction(loss, reduction) + + +def _nll_loss_nd( + input: TensorLikeType, + target: TensorLikeType, + weight: Optional[TensorLikeType], + reduction: str, + ignore_index: int, +) -> TensorLikeType: + torch._check( + input.ndim > 0 and input.ndim <= 3, + lambda: f"Expected input dimension to be either [1, 2, 3] but received {input.ndim}.", + ) + + torch._check( + (input.ndim == 1) or (input.shape[0] == target.shape[0]), + lambda: f"Expected input batch size {input.shape[0]} to match target batch size {target.shape[0]}.", + ) + + _check_reduction_value(reduction) + + flat_target = torch.flatten(target) + ignore_classes_mask = torch.eq(flat_target, ignore_index) + + # TODO: Enable data-dependent checks with debug mode + # TODO: This check does not work with FakeTensor inputs; See Issue #85834 + # Explicit cast for class_check to bool; See Issue #78071 + """ + from torch._subclasses.fake_tensor import FakeTensor + num_classes = input.shape[1] if input.ndim > 1 else input.shape[0] + valid_classes_mask = torch.logical_and( + (flat_target >= 0), (flat_target < num_classes) + ) + class_check = torch.all(torch.logical_or(ignore_classes_mask, valid_classes_mask)) + torch._check( + isinstance(target, FakeTensor) or bool(class_check.item()), + lambda: "A target class is out-of-bounds and not 
the ignore index.", + ) + """ + + ignore_class_weight = torch.scalar_tensor(0, dtype=input.dtype, device=input.device) + class_weight = ( + torch.scalar_tensor(1, dtype=input.dtype, device=input.device) + if weight is None + else weight[flat_target] + ) + current_weight = torch.where( + ignore_classes_mask, + ignore_class_weight, + class_weight, + ) + + if input.ndim == 1: + # implicit batch size = 1 + # input (1 batch size, C classes) + loss = -input[target] * current_weight + elif input.ndim == 2: + # input (N batch size, C classes) + batch_size = input.shape[0] + loss = -input[torch.arange(batch_size), target] * current_weight + else: + # 3D case (N batch size, C classe, K dimensions) + # input (N batch size, C classes, K) + batch_size = input.shape[0] + extent = input.shape[2] + numel = batch_size * extent + indices = torch.arange(numel) + bdx = indices // extent + kdx = indices % extent + loss = -input[bdx, flat_target, kdx] * current_weight + loss = torch.reshape(loss, target.shape) + + if reduction == "none": + return loss + elif reduction == "sum": + return torch.sum(loss) + else: + # calculate weighted mean of the loss function + return torch.sum(loss) / torch.sum(current_weight) + + +@register_decomposition(aten.nll_loss) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("input",), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def nll_loss( + input: TensorLikeType, + target: TensorLikeType, + weight: Optional[TensorLikeType] = None, + size_average: Optional[bool] = None, + ignore_index: int = -100, + reduce: Optional[bool] = None, + reduction: str = "mean", +) -> TensorLikeType: + """ + Reference implementation of torch.nn.functional.nll_loss + """ + torch._check( + input.ndim > 0, + lambda: f"Expected input tensor to have 1 or more dimensions (got {input.ndim})", + ) + + # TODO: raise exception instead of converting value + # msg = "size_average and reduce args are deprecated, please use reduction argument." + # Convert these options for consistency with the eager mode + if size_average is not None or reduce is not None: + reduction = _get_string_reduction_arg(size_average=size_average, reduce=reduce) + + # The expected behavior when the target and input have zero elements: + # reduction = 'none' --- tensor([]) + # reduction = 'sum' --- tensor(0.) + # reduction = 'mean' --- tensor(nan) + # Mean reduction on empty tensors produces NaN. See the discussion in + # https://github.com/pytorch/pytorch/pull/64572#issuecomment-926504162 + if input.numel() == 0 and target.numel() == 0: + if reduction == "none": + return torch.zeros_like(target) + elif reduction == "sum": + return torch.empty_like(target) + else: + return torch.full_like(target, float("nan")) + + # The _nll_loss_nd helper function handles the most common cases. + # ndim == 1 (Single Example) + # => Batch Size: 1, Input: (C), Target: () + # ndim == 2 (k = 1) + # => Batch Size: N, Input: (N, C), Target: (N) + # ndim == 3 (k > 1) + # => Batch Size: N, Input: (N, C, K), Target: (N, K) + if input.ndim <= 3: + return _nll_loss_nd(input, target, weight, reduction, ignore_index) + + # For ndim > 3, we reshape the input and target to 3-D case. 
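+ # e.g. a segmentation-style input (N, C, H, W) with target (N, H, W) is
+ # flattened to (N, C, H*W) and (N, H*W) before taking the 3-D path below.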
+ # Input (N batch-size, C classes, k-dimensions) + # Target (N batch-size, k-dimensions) + torch._check( + input.ndim > 0 and target.ndim > 0 and target.shape[1:] == input.shape[2:], + lambda: ( + "Expected input and target to both have ndim > 0 and " + "target.shape[1:] == input.shape[2:], but got " + f"target.shape {target.shape} and input.shape {input.shape}" + ), + ) + + batch_size = input.shape[0] + num_classes = input.shape[1] + out_size = [batch_size] + list(target.shape[1:]) + + input = torch.reshape(input, [batch_size, num_classes, -1]) + target = torch.reshape(target, [batch_size, -1]) + if reduction != "none": + return _nll_loss_nd(input, target, weight, reduction, ignore_index) + else: + result = _nll_loss_nd(input, target, weight, reduction, ignore_index) + # reshape flattened inner-dim to original k-dimensions + return torch.reshape(result, out_size) + + +# TODO: This ref supports int reduction and out kwarg to be compatible with ATen: +# https://github.com/pytorch/pytorch/issues/83931 +# TODO: Could be rewritten to support complex: +# https://github.com/pytorch/pytorch/pull/85041 +@register_decomposition(aten.huber_loss) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("input", "target"), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def huber_loss( + input: TensorLikeType, + target: TensorLikeType, + reduction: Union[str, int] = "mean", + delta: float = 1.0, +) -> TensorLikeType: + """ + Reference implementation of torch.nn.functional.huber_loss + """ + if type(reduction) is int: + reduction = _reduction_int_to_str(reduction) + _check_reduction_value(reduction) # type: ignore[arg-type] + torch._check( + delta > 0, + lambda: "huber_loss does not support non-positive values for delta.", + ) + z = (input - target).abs() + loss = torch.where(z < delta, 0.5 * z * z, delta * (z - 0.5 * delta)) + return _apply_loss_reduction(loss, reduction) # type: ignore[arg-type] + + +# tanhshrink does not use _make_elementwise_unary_reference because it does not support out +@elementwise_unary_scalar_wrapper +@elementwise_type_promotion_wrapper( + type_promoting_args=("a",), + type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) +def tanhshrink(a: TensorLikeType) -> TensorLikeType: + """ + Reference implementation of torch.nn.functional.tanhshrink + """ + if not isinstance(a, TensorLike): + raise RuntimeError( + "Expected a tensor input for an elementwise unary operation!" 
+ ) + return a - torch.tanh(a) + + +@register_decomposition(aten.threshold) +@_inplace_wrapper +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("a",), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def threshold( + a: TensorLikeType, + threshold: NumberType, + value: Union[bool, int, float], + inplace: bool = False, +) -> TensorLikeType: + """ + Reference implementation of torch.nn.functional.threshold + """ + + if inplace: + raise NotImplementedError + + return torch.where(a <= threshold, value, a) + + +# CompositeImplicitAutograd - don't register decomp +# No elementwise type promotion - core op doesn't explicitly type promote +def triplet_margin_loss( + anchor: TensorLikeType, + positive: TensorLikeType, + negative: TensorLikeType, + margin: float = 1.0, + p: float = 2, + eps: float = 1e-6, + swap: bool = False, + size_average: Optional[bool] = None, + reduce: Optional[bool] = None, + reduction: str = "mean", +) -> TensorLikeType: + if size_average is not None or reduce is not None: + # TODO: Raise exception instead of converting value. This is only for + # primTorch since it can drop support for deprecated arguments. + # msg = "size_average and reduce args are deprecated, please use reduction argument." + reduction = _get_string_reduction_arg(size_average=size_average, reduce=reduce) + + # torch.nn.functional.triplet_margin_with_distance_loss has no ref defined + # since it's a pure Python implementation. Use this helper instead. + return _triplet_margin_with_distance_loss( + anchor=anchor, + positive=positive, + negative=negative, + distance_function=lambda x, y: torch.pairwise_distance(x, y, p, eps), + margin=margin, + swap=swap, + reduction=reduction, + ) + + +# Pure Python impl - don't register decomp and don't add a ref. Defined as a +# helper here since triplet_margin_loss can be nicely implemented with it. +def _triplet_margin_with_distance_loss( + anchor: TensorLikeType, + positive: TensorLikeType, + negative: TensorLikeType, + *, + distance_function: Optional[ + Callable[[TensorLikeType, TensorLikeType], TensorLikeType] + ] = None, + margin: float = 1.0, + swap: bool = False, + reduction: str = "mean", +) -> TensorLikeType: + _check_reduction_value(reduction) + + a_dim = anchor.ndim + p_dim = positive.ndim + n_dim = negative.ndim + torch._check( + a_dim == p_dim and p_dim == n_dim, + lambda: ( + f"The anchor, positive, and negative tensors are expected to have " + f"the same number of dimensions, but got: anchor {a_dim}D, " + f"positive {p_dim}D, and negative {n_dim}D inputs" + ), + ) + + if distance_function is None: + distance_function = torch.pairwise_distance + + dist_pos = distance_function(anchor, positive) + dist_neg = distance_function(anchor, negative) + # The distance swap is described in the paper "Learning shallow + # convolutional feature descriptors with triplet losses" by V. Balntas, E. + # Riba et al. If True, and if the positive example is closer to the + # negative example than the anchor is, swaps the positive example and the + # anchor in the loss computation. 
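Illustrative sketch of the distance swap described above, using the public torch.pairwise_distance; the points are made up so that the negative sits closer to the positive than to the anchor.

import torch

anchor = torch.tensor([[0.0, 0.0]])
positive = torch.tensor([[1.0, 0.0]])
negative = torch.tensor([[1.1, 0.0]])  # closer to the positive than to the anchor

d_ap = torch.pairwise_distance(anchor, positive)    # ~1.0
d_an = torch.pairwise_distance(anchor, negative)    # ~1.1
d_pn = torch.pairwise_distance(positive, negative)  # ~0.1

# with swap=True the smaller of d_an and d_pn is used, which tightens the loss
d_neg = torch.minimum(d_an, d_pn)
loss = torch.clamp_min(1.0 + d_ap - d_neg, 0)       # margin = 1.0
assert d_neg < d_an and loss > 0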
+ if swap: + dist_swap = distance_function(positive, negative) + dist_neg = torch.minimum(dist_neg, dist_swap) + loss = torch.clamp_min(margin + dist_pos - dist_neg, 0) + return _apply_loss_reduction(loss, reduction) + + +@register_decomposition(aten.hardtanh) +@_inplace_wrapper +@out_wrapper() +@elementwise_unary_scalar_wrapper +@elementwise_type_promotion_wrapper( + type_promoting_args=("a"), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def hardtanh( + a: TensorLikeType, + min_val: NumberType = -1, + max_val: NumberType = 1, + inplace: bool = False, +) -> TensorLikeType: + """ + Reference implementation of torch.nn.functional.hardtanh + """ + if inplace: + raise NotImplementedError + if utils.is_boolean_dtype(a.dtype): + raise RuntimeError("Bool inputs not supported for hardtanh") + + # preserve legacy behavior of boundaries not causing type promotion + if utils.is_integer_dtype(a.dtype): + min_val = int(min_val) # type: ignore[arg-type] + max_val = int(max_val) # type: ignore[arg-type] + if not (a.dtype != torch.uint8 or (min_val >= 0 and max_val >= 0)): + raise RuntimeError( + "Cannot do hardtanh on an unsigned type with negative limits" + ) + return torch.clamp(a, min_val, max_val) # type: ignore[arg-type] + + +@register_decomposition(aten.gelu) +@out_wrapper() +@elementwise_unary_scalar_wrapper +@elementwise_type_promotion_wrapper( + type_promoting_args=("a",), + type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def gelu(a: TensorLikeType, approximate: str = "none") -> TensorLikeType: + """ + Reference implementation of torch.nn.functional.gelu + """ + if not isinstance(a, TensorLike): + raise RuntimeError( + "Expected a tensor input for an elementwise unary operation!" + ) + M_SQRT2 = 1.41421356237309504880 + M_SQRT1_2 = 0.70710678118654752440 + M_2_SQRTPI = 1.12837916709551257390 + if approximate == "tanh": + kBeta = M_SQRT2 * M_2_SQRTPI * 0.5 + kKappa = 0.044715 + a_cube = a * a * a + inner = kBeta * (a + kKappa * a_cube) + return 0.5 * a * (1 + torch.tanh(inner)) + elif approximate == "none": + kAlpha = M_SQRT1_2 + return a * 0.5 * (1 + torch.erf(a * kAlpha)) + else: + raise RuntimeError("approximate argument must be either none or tanh.") + + +# CompositeImplicitAutograd - don't register decomp +@elementwise_type_promotion_wrapper( + type_promoting_args=("input", "target"), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) +def poisson_nll_loss( + input: TensorLikeType, + target: TensorLikeType, + log_input: bool = True, + full: bool = False, + size_average: Optional[bool] = None, + eps: float = 1e-8, + reduce: Optional[bool] = None, + reduction: str = "mean", +) -> TensorLikeType: + """ + Reference implementation of torch.nn.functional.poisson_nll_loss + """ + if size_average is not None or reduce is not None: + # TODO: Raise exception instead of converting value. This is only for + # primTorch since it can drop support for deprecated arguments. + # msg = "size_average and reduce args are deprecated, please use reduction argument." 
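A quick numerical comparison of the two gelu branches defined a little further above (the exact erf form and the tanh approximation); purely illustrative, with an arbitrary input range.

import torch
import torch.nn.functional as F

x = torch.linspace(-3, 3, steps=7)
exact = F.gelu(x)                       # erf-based, approximate="none"
approx = F.gelu(x, approximate="tanh")  # tanh approximation
# on this range the approximation tracks the exact form to well under 1e-2
assert torch.allclose(exact, approx, atol=5e-3)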
+ reduction = _get_string_reduction_arg(size_average=size_average, reduce=reduce) + _check_reduction_value(reduction) + if log_input: + loss = torch.exp(input) - target * input + else: + loss = input - target * torch.log(input + eps) + + if full: + stirling_term = ( + target * torch.log(target) - target + 0.5 * torch.log(2 * torch.pi * target) + ) + # avoid inplace add + loss = loss + stirling_term.masked_fill(target <= 1, 0) + return _apply_loss_reduction(loss, reduction) + + +@register_decomposition(aten.prelu) +@elementwise_type_promotion_wrapper( + type_promoting_args=("a", "weight"), + type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def prelu(a: TensorLikeType, weight: TensorLikeType) -> TensorLikeType: + """ + Reference implementation of torch.nn.functional.prelu + """ + torch._check( + isinstance(a, TensorLike), + lambda: f"prelu: Expected `a` to be tensor, but got: {type(a)}", + ) + torch._check( + isinstance(weight, TensorLike), + lambda: f"prelu: Expected `weight` to be tensor, but got: {type(weight)}", + ) + + if weight.numel() != 1: + torch._check(a.ndim > 0, lambda: "Not allow zero-dim input tensor.") + channel_size = a.shape[1] if a.ndim >= 2 else 1 + torch._check( + weight.numel() == channel_size, + lambda: f"Mismatch of parameter numbers and input channel size. Found parameter numbers =" + f" {weight.numel()} and channel size = {channel_size}.", + ) + + torch._check( + weight.ndim == 0 or weight.ndim == 1, + lambda: f"prelu: Expected `weight` to be a scalar or 1D tensor, but got: " + f"ndim = {weight.ndim}", + ) + if a.ndim == 0: + weight = weight[0] if weight.ndim == 1 else weight + else: + weight = prims.broadcast_in_dim( + weight, a.shape, tuple() if weight.ndim == 0 else (0 if a.ndim == 1 else 1,) + ) + + return torch.where(a > 0, a, a * weight) + + +@register_decomposition(aten.relu6) +@_inplace_wrapper +@out_wrapper() +def relu6(a: TensorLikeType, inplace: bool = False) -> TensorLikeType: + """ + Reference implementation of torch.nn.functional.relu6 + """ + if inplace: + raise NotImplementedError + + # See https://github.com/pytorch/pytorch/pull/81142#discussion_r918220126 + # It may be better to use clamp here, but we use hardtanh to replicate + # the behavior of the existing implementation + return torch.nn.functional.hardtanh(a, 0, 6) + + +@register_decomposition(aten.glu) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("a",), + type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def glu(a: TensorLikeType, dim: int = -1) -> TensorLikeType: + dim = utils.canonicalize_dims(a.ndim, dim) + torch._check( + a.shape[dim] % 2 == 0, + lambda: f"Halving dimension must be even, but dimension {dim} is size {a.shape[dim]}", + ) + b, c = torch.tensor_split(a, 2, dim) + + return b * torch.sigmoid(c) + + +@register_decomposition(aten.pairwise_distance) +@out_wrapper() +def pairwise_distance( + x1: TensorLikeType, + x2: TensorLikeType, + p: NumberType = 2.0, + eps: NumberType = 1e-6, + keepdim=False, +) -> TensorLikeType: + return torch.linalg.vector_norm(x1 - x2 + eps, ord=p, dim=-1, keepdim=keepdim) + + +@register_decomposition(aten.pdist) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("a",), + type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT, +) +def pdist(a: TensorLikeType, p: float = 2) -> TensorLikeType: + torch._check(a.ndim == 2, lambda: f"pdist only supports 2D tensors, got: {a.ndim}D") + torch._check(p >= 0, lambda: "pdist only supports non-negative p 
values") + # For p == 2 we can use an efficient implementation, but other values of p + # require creating a much bigger tensor for an intermediate step + if p == 2: + aTa = torch.mm(a, a.T) + aTa_diag = torch.diag(aTa) + t = torch.sqrt(torch.clamp(aTa_diag + aTa_diag.unsqueeze(-1) - 2 * aTa, min=0)) + else: + t = torch.linalg.vector_norm(a.unsqueeze(1) - a, ord=p, dim=2) + i = torch.triu_indices(t.shape[0], t.shape[1], offset=1, device=a.device) + return t.flatten().index_select(0, i[0] * t.shape[0] + i[1]) + + +@register_decomposition(aten.pixel_shuffle) +@out_wrapper() +def pixel_shuffle(self: Tensor, upscale_factor: int): + torch._check( + self.dim() >= 3, + lambda: f"pixel_shuffle expects input to have at least 3 dimensions, but got input with {self.dim} dimension(s)", + ) + batch = self.shape[:-3] + C_out = self.shape[-3] // upscale_factor**2 + HW_out = (self.shape[-2] * upscale_factor, self.shape[-1] * upscale_factor) + n = len(batch) + B_dims = range(n) + C_dim, r1_dim, r2_dim, H_dim, W_dim = range(n, n + 5) + return ( + self.view( + *batch, + C_out, + upscale_factor, + upscale_factor, + self.shape[-2], + self.shape[-1], + ) + .permute(*B_dims, C_dim, H_dim, r1_dim, W_dim, r2_dim) + .reshape(*batch, C_out, *HW_out) + .clone(memory_format=utils.suggest_memory_format(self)) + ) + + +@register_decomposition(aten.pixel_unshuffle) +@out_wrapper() +def pixel_unshuffle(self: Tensor, downscale_factor: int): + torch._check( + self.dim() >= 3, + lambda: f"pixel_unshuffle expects input to have at least 3 dimensions, but got input with {self.dim} dimension(s)", + ) + batch = self.shape[:-3] + C_out = self.shape[-3] * downscale_factor**2 + HW_out = (self.shape[-2] // downscale_factor, self.shape[-1] // downscale_factor) + n = len(batch) + B_dims = range(n) + C_dim, H_dim, r1_dim, W_dim, r2_dim = range(n, n + 5) + return ( + self.view( + *batch, + self.shape[-3], + HW_out[0], + downscale_factor, + HW_out[1], + downscale_factor, + ) + .permute(*B_dims, C_dim, r1_dim, r2_dim, H_dim, W_dim) + .reshape(*batch, C_out, *HW_out) + .clone(memory_format=utils.suggest_memory_format(self)) + ) + + +# Needed as aten.{celu_,elu_...} exist (even if they don't have the in-place kwarg) +celu_ = _make_inplace(celu) +elu_ = _make_inplace(elu) +mish_ = _make_inplace(mish) +selu_ = _make_inplace(selu) +threshold_ = _make_inplace(threshold) diff --git a/MLPY/Lib/site-packages/torch/_refs/nn/functional/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_refs/nn/functional/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df4f31ffa83a3615eccde9a75c3d2158a155b9b1 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_refs/nn/functional/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_refs/special/__init__.py b/MLPY/Lib/site-packages/torch/_refs/special/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7d57e327f049b8555b90011f7f588927f925832c --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_refs/special/__init__.py @@ -0,0 +1,236 @@ +import math +from typing import Optional, Union + +import torch +import torch._prims as prims +import torch._prims_common as utils +import torch._refs as refs + +from torch import Tensor +from torch._decomp import register_decomposition +from torch._prims_common import ( + ELEMENTWISE_TYPE_PROMOTION_KIND, + Number, + NumberType, + TensorLike, + TensorLikeType, +) +from torch._prims_common.wrappers import elementwise_type_promotion_wrapper, 
out_wrapper +from torch._refs import ( + _make_alias, + _make_elementwise_binary_reference, + _make_elementwise_unary_reference, +) + + +__all__ = [ + "bessel_j0", + "bessel_j1", + "entr", + "erfcx", + "expit", + "i0e", + "i1", + "i1e", + "log_ndtr", + "logit", + "log_softmax", + "multigammaln", + "ndtr", + "ndtri", + "softmax", + "spherical_bessel_j0", + "xlog1py", + "zeta", +] +aten = torch._ops.ops.aten + + +@_make_elementwise_unary_reference( + ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) +def bessel_j0(a: TensorLikeType) -> TensorLikeType: + return prims.bessel_j0(a) + + +@_make_elementwise_unary_reference( + ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) +def bessel_j1(a: TensorLikeType) -> TensorLikeType: + return prims.bessel_j1(a) + + +@register_decomposition(aten.special_entr) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("a",), + type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) +def entr(a: TensorLikeType) -> TensorLikeType: + return torch.where( + torch.isnan(a), + a, + torch.where(a > 0, -a * torch.log(a), torch.where(a == 0, 0, -torch.inf)), + ) + + +@register_decomposition(aten.special_erfcx) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("a",), + type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) +def erfcx(a: TensorLikeType) -> TensorLikeType: + return prims.erfcx(a) + + +# alias for sigmoid +expit = _make_alias(torch.sigmoid, "expit") + + +@_make_elementwise_unary_reference( + ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) +def i0e(a: TensorLikeType) -> TensorLikeType: + return prims.bessel_i0e(a) + + +@_make_elementwise_unary_reference( + ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) +def i1(a: TensorLikeType) -> TensorLikeType: + return prims.bessel_i1(a) + + +@_make_elementwise_unary_reference( + ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) +def i1e(a: TensorLikeType) -> TensorLikeType: + return prims.bessel_i1e(a) + + +@register_decomposition(aten.special_log_ndtr) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("a",), + type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) +def log_ndtr(a: TensorLikeType) -> TensorLikeType: + # Note: M_SQRT1_2 is the value of 1 / √2 + M_SQRT1_2 = 0.707106781186547524400844362104849039 + t = a * M_SQRT1_2 + return torch.where( + a < 1.0, + torch.log(torch.special.erfcx(-t) / 2) - t * t, + torch.log1p(-torch.erfc(t) / 2), + ) + + +@register_decomposition(aten.logit) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("self",), + type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) +def logit(self: TensorLikeType, eps: Optional[float] = None) -> TensorLikeType: + if eps is None: + eps = -1.0 + lo = eps + hi = 1 - eps + self = torch.clamp(self, lo, hi) + return torch.log(torch.true_divide(self, torch.sub(1, self))) + + +@register_decomposition(aten.special_xlog1py) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("a", "b"), + type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) +def xlog1py(a: Union[TensorLikeType, NumberType], b: Union[TensorLikeType, NumberType]): + torch._check( + isinstance(a, TensorLike) or isinstance(b, TensorLike), + lambda: 'Expected either argument a or b to be a Tensor"', + ) + + # Operations like eq and log do not handle scalar values, so we convert them to scalar_tensors. 
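Illustrative values for the convention implemented below (made-up inputs): a == 0 forces the result to 0 even where log1p(b) would blow up, while NaN in b always propagates.

import torch

a = torch.tensor([0.0, 0.0, 2.0, 1.0])
b = torch.tensor([-1.0, 5.0, 3.0, float("nan")])
out = torch.special.xlog1py(a, b)
# -> [0., 0., 2*log(4), nan]
assert out[0] == 0 and out[1] == 0
assert torch.allclose(out[2], 2 * torch.log(torch.tensor(4.0)))
assert torch.isnan(out[3])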
+ if isinstance(a, TensorLike) and isinstance(b, Number): + b = refs.scalar_tensor(b, dtype=a.dtype, device=a.device) + elif isinstance(b, TensorLike) and isinstance(a, Number): + a = refs.scalar_tensor(a, dtype=b.dtype, device=b.device) + + # mypy: expected "Tensor" + assert isinstance(a, TensorLike) + assert isinstance(b, TensorLike) + rhs = torch.where(torch.eq(a, 0), 0, torch.mul(a, torch.log1p(b))) + return torch.where(torch.isnan(b), float("nan"), rhs) + + +@register_decomposition(aten.mvlgamma) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("a",), + type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) +def multigammaln(a: TensorLikeType, p: int) -> TensorLikeType: + c = 0.25 * p * (p - 1) * math.log(math.pi) + b = 0.5 * torch.arange(start=(1 - p), end=1, step=1, dtype=a.dtype, device=a.device) + return torch.sum(torch.lgamma(a.unsqueeze(-1) + b), dim=-1) + c + + +@register_decomposition(aten.special_ndtr) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("a",), + type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) +def ndtr(a: TensorLikeType) -> TensorLikeType: + # Note: M_SQRT1_2 is the value of 1 / √2 + M_SQRT1_2 = 0.707106781186547524400844362104849039 + a_sqrt_2 = a * M_SQRT1_2 + return (1 + torch.erf(a_sqrt_2)) * 0.5 + + +@register_decomposition(aten.special_ndtri) +@out_wrapper() +@elementwise_type_promotion_wrapper( + type_promoting_args=("a",), + type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) +def ndtri(a: TensorLikeType) -> TensorLikeType: + return prims.ndtri(a) + + +# Forwarding alias: the special variant doesn't support the out kwarg +# CompositeImplicitAutograd - don't register decomp +def log_softmax( + a: TensorLikeType, + dim: int, + dtype: Optional[torch.dtype] = None, +) -> TensorLikeType: + return torch.log_softmax(a=a, dim=dim, dtype=dtype) # type: ignore[call-overload] + + +# Forwarding alias: the special variant doesn't support the out kwarg +# CompositeImplicitAutograd - don't register decomp +def softmax( + a: TensorLikeType, + dim: int, + dtype: Optional[torch.dtype] = None, +) -> TensorLikeType: + return torch.softmax(a=a, dim=dim, dtype=dtype) # type: ignore[call-overload] + + +@_make_elementwise_unary_reference( + ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) +def spherical_bessel_j0(a: TensorLikeType) -> TensorLikeType: + return prims.spherical_bessel_j0(a) + + +# TODO: add docstring +@_make_elementwise_binary_reference( + type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, +) +def zeta(a: TensorLikeType, b: TensorLikeType) -> TensorLikeType: + return prims.zeta(a, b) diff --git a/MLPY/Lib/site-packages/torch/_refs/special/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_refs/special/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1b7bd6bb560e0a1ed99fe5127728b63fb63f951d Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_refs/special/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_sources.py b/MLPY/Lib/site-packages/torch/_sources.py new file mode 100644 index 0000000000000000000000000000000000000000..c5342e5ce12f0e13889fcb944e1df913ea5ccc39 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_sources.py @@ -0,0 +1,137 @@ +import ast +import functools +import inspect +from textwrap import dedent +from typing import Any, List, NamedTuple, Optional, Tuple + +from torch._C import 
ErrorReport +from torch._C._jit_tree_views import SourceRangeFactory + + +def get_source_lines_and_file( + obj: Any, + error_msg: Optional[str] = None, +) -> Tuple[List[str], int, Optional[str]]: + """ + Wrapper around inspect.getsourcelines and inspect.getsourcefile. + + Returns: (sourcelines, file_lino, filename) + """ + filename = None # in case getsourcefile throws + try: + filename = inspect.getsourcefile(obj) + sourcelines, file_lineno = inspect.getsourcelines(obj) + except OSError as e: + msg = ( + f"Can't get source for {obj}. TorchScript requires source access in " + "order to carry out compilation, make sure original .py files are " + "available." + ) + if error_msg: + msg += "\n" + error_msg + raise OSError(msg) from e + + return sourcelines, file_lineno, filename + + +def normalize_source_lines(sourcelines: List[str]) -> List[str]: + """ + This helper function accepts a list of source lines. It finds the + indentation level of the function definition (`def`), then it indents + all lines in the function body to a point at or greater than that + level. This allows for comments and continued string literals that + are at a lower indentation than the rest of the code. + Args: + sourcelines: function source code, separated into lines by + the '\n' character + Returns: + A list of source lines that have been correctly aligned + """ + + def remove_prefix(text, prefix): + return text[text.startswith(prefix) and len(prefix) :] + + # Find the line and line number containing the function definition + idx = None + for i, l in enumerate(sourcelines): + if l.lstrip().startswith("def"): + idx = i + break + + # This will happen when the function is a lambda- we won't find "def" anywhere in the source + # lines in that case. Currently trying to JIT compile a lambda will throw an error up in + # `parse_def()`, but we might want to handle this case in the future. + if idx is None: + return sourcelines + + # Get a string representing the amount of leading whitespace + fn_def = sourcelines[idx] + whitespace = fn_def.split("def")[0] + + # Add this leading whitespace to all lines before and after the `def` + aligned_prefix = [ + whitespace + remove_prefix(s, whitespace) for s in sourcelines[:idx] + ] + aligned_suffix = [ + whitespace + remove_prefix(s, whitespace) for s in sourcelines[idx + 1 :] + ] + + # Put it together again + aligned_prefix.append(fn_def) + return aligned_prefix + aligned_suffix + + +# Thin wrapper around SourceRangeFactory to store extra metadata +# about the function-to-be-compiled. 
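A small usage sketch for the helpers above; torch._sources is a private module, so this is illustrative only and the input lines are made up.

from torch._sources import normalize_source_lines

lines = [
    "    def f(x):\n",
    "# stray comment at column 0\n",
    "        return x\n",
]
aligned = normalize_source_lines(lines)
# the comment is re-indented to match the leading whitespace of the `def` line
assert aligned[1] == "    # stray comment at column 0\n"
assert aligned[0] == lines[0] and aligned[2] == lines[2]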
+class SourceContext(SourceRangeFactory): + def __init__( + self, + source, + filename, + file_lineno, + leading_whitespace_len, + uses_true_division=True, + funcname=None, + ): + super().__init__(source, filename, file_lineno, leading_whitespace_len) + self.uses_true_division = uses_true_division + self.filename = filename + self.funcname = funcname + + +@functools.lru_cache(maxsize=None) +def make_source_context(*args): + return SourceContext(*args) + + +def fake_range(): + return SourceContext("", None, 0, 0).make_raw_range(0, 1) + + +class ParsedDef(NamedTuple): + ast: ast.Module + ctx: SourceContext + source: str + filename: Optional[str] + file_lineno: int + + +def parse_def(fn): + sourcelines, file_lineno, filename = get_source_lines_and_file( + fn, ErrorReport.call_stack() + ) + sourcelines = normalize_source_lines(sourcelines) + source = "".join(sourcelines) + dedent_src = dedent(source) + py_ast = ast.parse(dedent_src) + if len(py_ast.body) != 1 or not isinstance(py_ast.body[0], ast.FunctionDef): + raise RuntimeError( + f"Expected a single top-level function: {filename}:{file_lineno}" + ) + leading_whitespace_len = len(source.split("\n", 1)[0]) - len( + dedent_src.split("\n", 1)[0] + ) + ctx = make_source_context( + source, filename, file_lineno, leading_whitespace_len, True, fn.__name__ + ) + return ParsedDef(py_ast, ctx, source, filename, file_lineno) diff --git a/MLPY/Lib/site-packages/torch/_storage_docs.py b/MLPY/Lib/site-packages/torch/_storage_docs.py new file mode 100644 index 0000000000000000000000000000000000000000..a32bb6a1222355fd550a3e23cc8e0ef376eeb4c6 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_storage_docs.py @@ -0,0 +1,43 @@ +"""Adds docstrings to Storage functions""" + +import torch._C +from torch._C import _add_docstr as add_docstr + + +storage_classes = [ + "StorageBase", +] + + +def add_docstr_all(method, docstr): + for cls_name in storage_classes: + cls = getattr(torch._C, cls_name) + try: + add_docstr(getattr(cls, method), docstr) + except AttributeError: + pass + + +add_docstr_all( + "from_file", + """ +from_file(filename, shared=False, size=0) -> Storage + +Creates a CPU storage backed by a memory-mapped file. + +If ``shared`` is ``True``, then memory is shared between all processes. +All changes are written to the file. If ``shared`` is ``False``, then the changes on +the storage do not affect the file. + +``size`` is the number of elements in the storage. If ``shared`` is ``False``, +then the file must contain at least ``size * sizeof(Type)`` bytes +(``Type`` is the type of storage, in the case of an ``UnTypedStorage`` the file must contain at +least ``size`` bytes). If ``shared`` is ``True`` the file will be created if needed. 
+ +Args: + filename (str): file name to map + shared (bool): whether to share memory (whether ``MAP_SHARED`` or ``MAP_PRIVATE`` is passed to the + underlying `mmap(2) call `_) + size (int): number of elements in the storage +""", +) diff --git a/MLPY/Lib/site-packages/torch/_streambase.py b/MLPY/Lib/site-packages/torch/_streambase.py new file mode 100644 index 0000000000000000000000000000000000000000..db9fc14b892a5e102a3dba12d3bf54465d123aaa --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_streambase.py @@ -0,0 +1,45 @@ +from abc import ABC, abstractmethod + + +class _StreamBase(ABC): + r"""Base stream class abstraction for multi backends Stream to herit from""" + + @abstractmethod + def wait_event(self, event): + raise NotImplementedError() + + @abstractmethod + def wait_stream(self, stream): + raise NotImplementedError() + + @abstractmethod + def record_event(self, event=None): + raise NotImplementedError() + + @abstractmethod + def query(self): + raise NotImplementedError() + + @abstractmethod + def synchronize(self): + raise NotImplementedError() + + @abstractmethod + def __eq__(self, stream): + raise NotImplementedError() + + +class _EventBase(ABC): + r"""Base Event class abstraction for multi backends Event to herit from""" + + @abstractmethod + def wait(self, stream=None): + raise NotImplementedError() + + @abstractmethod + def query(self): + raise NotImplementedError() + + @abstractmethod + def synchronize(self): + raise NotImplementedError() diff --git a/MLPY/Lib/site-packages/torch/_subclasses/__init__.py b/MLPY/Lib/site-packages/torch/_subclasses/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3b2eff305df6928c21d603dc08a4f22fb45a8859 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_subclasses/__init__.py @@ -0,0 +1,18 @@ +import torch + +from torch._subclasses.fake_tensor import ( + DynamicOutputShapeException, + FakeTensor, + FakeTensorMode, + UnsupportedFakeTensorException, +) + +from torch._subclasses.fake_utils import CrossRefFakeMode + +__all__ = [ + "FakeTensor", + "FakeTensorMode", + "UnsupportedFakeTensorException", + "DynamicOutputShapeException", + "CrossRefFakeMode", +] diff --git a/MLPY/Lib/site-packages/torch/_subclasses/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_subclasses/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..038616017e4fbb3e5b5996e16e34199bfca315b5 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_subclasses/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_subclasses/__pycache__/fake_impls.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_subclasses/__pycache__/fake_impls.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8e32be5069551ec5881f86c5b5886337da5770e8 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_subclasses/__pycache__/fake_impls.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_subclasses/__pycache__/fake_tensor.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_subclasses/__pycache__/fake_tensor.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a4bcc1734f75c445c9c867a2c66f53e4df6285f0 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_subclasses/__pycache__/fake_tensor.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_subclasses/__pycache__/fake_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_subclasses/__pycache__/fake_utils.cpython-39.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..3ea6742717cebd081ecd75bb1ec5d46b8cddf134 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_subclasses/__pycache__/fake_utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_subclasses/__pycache__/functional_tensor.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_subclasses/__pycache__/functional_tensor.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..25aec2437065e447e4e26ed09d8a260fba19ddca Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_subclasses/__pycache__/functional_tensor.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_subclasses/__pycache__/meta_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_subclasses/__pycache__/meta_utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3db173b2508722dbd0889f02ab91b0f91bec4bfc Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_subclasses/__pycache__/meta_utils.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_subclasses/__pycache__/schema_check_mode.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_subclasses/__pycache__/schema_check_mode.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..62fd0e70bdb90698332b0252e9e0220366ec8bef Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_subclasses/__pycache__/schema_check_mode.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_subclasses/fake_impls.py b/MLPY/Lib/site-packages/torch/_subclasses/fake_impls.py new file mode 100644 index 0000000000000000000000000000000000000000..c7421b485abdbfe8b9ca15e5b7cf16b754b4ce92 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_subclasses/fake_impls.py @@ -0,0 +1,1061 @@ +# mypy: ignore-errors + +import functools +import itertools +import math +import sys +from typing import Callable, Union + +import torch +import torch._custom_op +import torch._logging + +from torch._ops import OpOverload +from torch._prims_common import ( + elementwise_dtypes, + ELEMENTWISE_TYPE_PROMOTION_KIND, + is_boolean_dtype, + is_float_dtype, + is_integer_dtype, +) + +from torch._subclasses.fake_tensor import ( + DataDependentOutputException, + DynamicOutputShapeException, + FakeTensor, + in_kernel_invocation_manager, + run_fallback_kernel, + UnsupportedOperatorException, +) +from torch.fx.operator_schemas import normalize_function + +from torch.utils._stats import count_label + +pytree = torch.utils._pytree + +__all__ = [ + "op_implementations_checks", + "get_fast_op_impls", + "stride_incorrect_op", + "has_meta", +] + +op_implementations_dict = {} +op_implementations_checks = [] + + +aten = torch._ops.ops.aten + + +def ordered_set(*items): + return dict.fromkeys(items, True) + + +# This function indicates if the backend device +# supports non-contiguous tensors +def is_noncontiguous_supported(device): + if device.type == "hpu": + return False + return True + + +_like_tensor_constructors = ordered_set( + aten.empty_like.default, + aten.empty_like.out, + aten.full_like.default, + aten.full_like.out, + aten.ones_like.default, + aten.ones_like.out, + aten.rand_like.default, + aten.rand_like.out, + aten.randn_like.default, + aten.randn_like.out, + aten.randint_like.default, + aten.randint_like.out, + aten.randint_like.low_dtype, + aten.randint_like.low_dtype_out, + aten.zeros_like.default, + aten.zeros_like.out, + aten.new_empty.default, + aten.new_empty.out, + aten.new_empty_strided.default, + aten.new_empty_strided.out, + aten.new_full.default, + 
aten.new_full.out, + aten.new_zeros.default, + aten.new_zeros.out, + aten.new_ones.default, + aten.new_ones.out, +) + + +_device_not_kwarg_ops = ordered_set( + aten._resize_output_.default, + aten._nested_tensor_from_tensor_list.default, + aten._nested_tensor_from_tensor_list.out, + aten.pin_memory.default, + aten.is_pinned.default, + aten.to.device, + aten.to.prim_Device, + aten._pin_memory.default, + aten._pin_memory.out, + aten._resize_output.default, + aten._resize_output.out, +) + +# this op is never actually used +_non_kwarg_device_constructors = (aten._list_to_tensor,) + + +def contains_tensor_types(type): + tensor_type = torch._C.TensorType.get() + return type.isSubtypeOf(tensor_type) or any( + contains_tensor_types(e) for e in type.containedTypes() + ) + + +@functools.lru_cache(None) +def _is_tensor_constructor(func: OpOverload): + assert isinstance(func, OpOverload) + schema = func._schema + if any(contains_tensor_types(arg.type) for arg in schema.arguments): + return False + # TODO: no real reason to restrict multiple outputs + return ( + len(schema.returns) == 1 and schema.returns[0].type is torch._C.TensorType.get() + ) + + +def register_op_impl(run_impl_check: Union[Callable[[OpOverload], bool], OpOverload]): + def impl_decorator(op_impl): + if isinstance(run_impl_check, OpOverload): + assert ( + run_impl_check not in op_implementations_dict + ), f"duplicate registration: {run_impl_check}" + op_implementations_dict[run_impl_check] = op_impl + elif isinstance(run_impl_check, (list, tuple)): + for op in run_impl_check: + register_op_impl(op)(op_impl) + else: + assert callable(run_impl_check) + op_implementations_checks.append((run_impl_check, op_impl)) + + return op_impl + + return impl_decorator + + +@register_op_impl(op_implementations_dict.__contains__) +def dispatch_to_op_implementations_dict(fake_mode, func, *args, **kwargs): + return op_implementations_dict[func](fake_mode, func, *args, **kwargs) + + +@register_op_impl(_is_tensor_constructor) +@register_op_impl([*_like_tensor_constructors]) +def constructors(fake_mode, func, *args, **kwargs): + assert func not in _non_kwarg_device_constructors + _, new_kwargs = normalize_function( + func, args=args, kwargs=kwargs, normalize_to_only_use_kwargs=True + ) + if "names" in kwargs: + raise UnsupportedOperatorException( + "torch.compile doesn't support named tensors" + ) + + if func in _like_tensor_constructors: + default_device = new_kwargs["input"].device + # TODO: file issue + args = (new_kwargs.pop("input"),) + else: + # cpu is default device if none is specified + default_device = torch.device("cpu") + args = () + out_device = new_kwargs.pop("device", None) + out_device = out_device if out_device is not None else default_device + new_kwargs["device"] = torch.device("meta") + # _like constructors have fake tensor inputs (maybe this causes the non-like + # to fail? 
hmmm) + with in_kernel_invocation_manager(fake_mode): + r = func(*args, **new_kwargs) + return FakeTensor(fake_mode, r, out_device) + + +@register_op_impl(aten.to.prim_Device) +@register_op_impl(aten.to.device) +def non_kwarg_to(fake_mode, func, *args, **kwargs): + _, new_kwargs = normalize_function( + func, args, kwargs, normalize_to_only_use_kwargs=True + ) + input_device = new_kwargs["device"] + out_device = input_device if input_device else new_kwargs["input"].device + new_kwargs["device"] = torch.device("meta") + inp = new_kwargs.pop("input") + with in_kernel_invocation_manager(fake_mode): + r = func(inp, **new_kwargs) + # TODO: I think this does the wrong thing if r is inp + return fake_mode.fake_tensor_converter.from_meta_and_device( + fake_mode, r, out_device + ) + + +def stride_incorrect_op(op): + if op.namespace not in ("aten", "prims"): + return False + if op is aten._fft_c2c.default: + return False + + op_name = op.name() + if "fft" in op_name: + return True + return False + + +# These operators have meta implementations with incorrect strides +@register_op_impl(stride_incorrect_op) +def wordaround_stride_incorrect_op(fake_mode, func, *args, **kwargs): + # This is a workaround for meta implmentations with incorrect strides + + def is_symbolic(x): + if isinstance(x, FakeTensor): + return x._has_symbolic_sizes_strides + if isinstance(x, (torch.SymInt, torch.SymFloat, torch.SymBool)): + return True + return False + + # For static shapes, we can fall back to eager for the real strides + if fake_mode.allow_fallback_kernels: + require_dynamic = any( + is_symbolic(x) for x in itertools.chain(args, kwargs.values()) + ) + if not require_dynamic: + flat_args, args_spec = pytree.tree_flatten((args, kwargs)) + return run_fallback_kernel(fake_mode, func, flat_args, args_spec, None) + + raise UnsupportedOperatorException(func) + + +# Dont default to default device handling, +# since the device of `the_template` is ignored +@register_op_impl(aten.resize_as_.default) +def resize_as_(fake_mode, func, *args, **kwargs): + with in_kernel_invocation_manager(fake_mode): + return func(*args, **kwargs) + + +@register_op_impl(aten._sparse_coo_tensor_with_dims_and_tensors.default) +def _sparse_coo_tensor_with_dims_and_tensors(fake_mode, func, *args, **kwargs): + # TODO: remove me + return constructors(fake_mode, func, *args, **kwargs) + + +# index.Tensor data-dependent in only some conditions +@register_op_impl( + lambda func: torch.Tag.dynamic_output_shape in func.tags + and func + not in [aten.index.Tensor, aten.nonzero.default, aten.repeat_interleave.Tensor] +) +def dyn_shape(fake_mode, func, *args, **kwargs): + raise DynamicOutputShapeException(func) + + +@register_op_impl(aten.repeat_interleave.Tensor) +def repeat_interleave_tensor(fake_mode, func, repeats, output_size=None): + if output_size is None: + if ( + fake_mode.shape_env is None + or not fake_mode.shape_env.allow_dynamic_output_shape_ops + ): + raise DynamicOutputShapeException(func) + + output_size = fake_mode.shape_env.create_unbacked_symint() + + # Avoid importing sympy at a module level + from torch.fx.experimental.symbolic_shapes import _constrain_range_for_size + + _constrain_range_for_size(output_size) + # TODO: consider a memo + return repeats.new_empty(output_size) + + +@register_op_impl(torch.ops.aten._local_scalar_dense.default) +def local_scalar_dense(fake_mode, func, arg): + if fake_mode.shape_env is None or not fake_mode.shape_env.allow_scalar_outputs: + # Without symints/symfloats, cannot handle this + raise 
DataDependentOutputException(func) + if is_float_dtype(arg.dtype): + return fake_mode.shape_env.create_unbacked_symfloat() + elif is_integer_dtype(arg.dtype): + return fake_mode.shape_env.create_unbacked_symint() + elif is_boolean_dtype(arg.dtype): + return fake_mode.shape_env.create_unbacked_symbool() + else: + raise NotImplementedError(f"local_scalar_dense/item NYI for {arg.dtype}") + + +@register_op_impl(torch.ops.aten.nonzero.default) +def nonzero(fake_mode, func, arg): + if ( + fake_mode.shape_env is None + or not fake_mode.shape_env.allow_dynamic_output_shape_ops + ): + # Without symints/symfloats, cannot handle this + raise DynamicOutputShapeException(func) + + if arg.nonzero_memo is None: + nnz = fake_mode.shape_env.create_unbacked_symint() + + # This is unsound, but it works well in practice + # See https://docs.google.com/document/d/1lFRYAJo5nrfxRhwIzGnfi2pbLpU6T4ytSRSuLJ5qebI/edit# + # TODO: Add a config knob to turn off this unsound behavior + # + # NB: If numel < 2, the bounds here might be COMPLETELY + # disjoint with what can actually occur. But this is fine: + # remember, the hypothesis is that if your later code works + # with N >= 2, it will work with N = 1 and N = 0. + maxval = sys.maxsize - 1 + + # Avoid importing sympy at a module level + from torch.fx.experimental.symbolic_shapes import ( + _constrain_range_for_size, + has_free_symbols, + ) + + if not has_free_symbols(arg.numel()): + # Don't upgrade the range if numel is less than two, since we then + # have an empty range which makes things go explodey. We also + # don't allow for 2 because that would specialize the unbacked + # SymInt to 2, which is also likely to be buggy. + if arg.numel() > 2: + maxval = int(arg.numel()) + + _constrain_range_for_size(nnz, max=maxval) + + arg._nonzero_memo = nnz + arg._nonzero_memo_vc = arg._version + + return arg.new_empty((arg.nonzero_memo, arg.dim()), dtype=torch.int64) + + +@register_op_impl(torch.ops.aten.masked_select.default) +def masked_select(fake_mode, func, self, mask): + if ( + fake_mode.shape_env is None + or not fake_mode.shape_env.allow_dynamic_output_shape_ops + ): + # Without symints/symfloats, cannot handle this + raise DynamicOutputShapeException(func) + + nnz = fake_mode.shape_env.create_unbacked_symint() + + # see nonzero for commentary + maxval = sys.maxsize - 1 + + # Avoid importing sympy at a module level + from torch.fx.experimental.symbolic_shapes import ( + _constrain_range_for_size, + has_free_symbols, + ) + + if not has_free_symbols(self.numel()): + if self.numel() > 2: + maxval = int(self.numel()) + + _constrain_range_for_size(nnz, max=maxval) + + return self.new_empty((nnz,)) + + +# NB: this must be ordered after local_scalar_dense +@register_op_impl(lambda func: torch.Tag.data_dependent_output in func.tags) +def data_dep(fake_mode, func, *args, **kwargs): + raise DataDependentOutputException(func) + + +# Bool Indices get Expanded as Masks +# See: IndexingUtils.h:expandTensors +def check_no_bool_index_tensors(func, self, indices): + for index in indices: + if index is not None and index.dtype in (torch.bool, torch.uint8): + raise DynamicOutputShapeException(func) + + +def run_and_return_new_tensor_of_input_device(fake_mode, func, args, kwargs): + _, new_kwargs = normalize_function( + func, args=args, kwargs=kwargs, normalize_to_only_use_kwargs=True + ) + + out_device = new_kwargs["input"].device + with in_kernel_invocation_manager(fake_mode): + out = func(*args, **kwargs) + if not is_noncontiguous_supported(out_device): + out = 
out.new_empty(out.shape) + + if out is new_kwargs["input"]: + return out # copy_ + return FakeTensor(fake_mode, out, out_device) + + +_is_builtin_namespaces = ordered_set("aten", "prims", "prim") + + +def is_builtin(op): + return op.namespace in _is_builtin_namespaces + + +def has_meta(func): + return torch._C._dispatch_has_computed_kernel_for_dispatch_key(func.name(), "Meta") + + +@register_op_impl( + lambda func: is_builtin(func) and "foreach" in func.name() and has_meta(func) +) +def foreach_run_and_map_input_device(fake_mode, func, *args, **kwargs): + tensor_lists = [] + for arg in itertools.chain(args, kwargs.values()): + if ( + isinstance(arg, (list, tuple)) + and len(arg) + and isinstance(arg[0], torch.Tensor) + ): + tensor_lists.append(arg) + + try: + with in_kernel_invocation_manager(fake_mode): + out_meta = func(*args, **kwargs) + except NotImplementedError as not_implemented_error: + return NotImplemented + + if not out_meta: + return out_meta + + assert tensor_lists + out_fake = [] + + for i, meta_t in enumerate(out_meta): + device, _ = FakeTensor._find_common_device(func, [tl[i] for tl in tensor_lists]) + out_fake.append( + fake_mode.fake_tensor_converter.from_meta_and_device( + fake_mode, meta_t, device + ) + ) + + return out_fake + + +# Dont default to default device handling, +# Since op can take in non-zero sized cpu +# index tensors with cuda self +@register_op_impl(aten.index.Tensor) +def index_tensor(fake_mode, func, *args, **kwargs): + from torch._meta_registrations import meta_index_Tensor + + _, new_kwargs = normalize_function( + func, args=args, kwargs=kwargs, normalize_to_only_use_kwargs=True + ) + + out_device = new_kwargs["input"].device + # ensure nonzero call goes to fake tensor + with fake_mode: + out = meta_index_Tensor(*args, **kwargs) + return out.to(out_device) + + +# Can take mixed meta/non-meta arguments; the meta registration +# will roughly do the right thing even when given real devices +@register_op_impl(aten._embedding_bag.default) +def embedding_bag(fake_mode, func, *args, **kwargs): + from torch._meta_registrations import meta_embedding_bag + + with fake_mode: + return meta_embedding_bag(*args, **kwargs) + + +# takes in multiple-devices, dont default to default device handling +@register_op_impl(aten._unsafe_index_put.default) +@register_op_impl(aten.copy.default) +@register_op_impl(aten.copy_.default) +@register_op_impl(aten.slice_scatter.default) +def multi_device_op_default(fake_mode, func, *args, **kwargs): + return run_and_return_new_tensor_of_input_device(fake_mode, func, args, kwargs) + + +# same with multi_device_op_default, but return the input +@register_op_impl(aten.copy.out) +@register_op_impl(aten.slice_scatter.out) +def multi_device_op_out(fake_mode, func, *args, **kwargs): + with in_kernel_invocation_manager(fake_mode): + out = func(*args, **kwargs) + + _, new_kwargs = normalize_function( + func, args=args, kwargs=kwargs, normalize_to_only_use_kwargs=True + ) + + return new_kwargs["input"] + + +@register_op_impl(aten.index_put.default) +@register_op_impl(aten.index_put_.default) +def index_put_impl(fake_mode, func, *args, **kwargs): + _, new_kwargs = normalize_function( + func, args=args, kwargs=kwargs, normalize_to_only_use_kwargs=True + ) + + values = new_kwargs["values"] + self_device = new_kwargs["input"].fake_device + torch._check( + self_device == values.fake_device or (values.ndim == 0 and values.numel() == 1), + lambda: f"Mismatching {func} device between self ({self_device}) and values ({values.device})", + ) + + out = 
run_and_return_new_tensor_of_input_device(fake_mode, func, args, kwargs) + if func is aten.index_put_.default: + return new_kwargs["input"] + else: + return out + + +@register_op_impl(aten._nested_tensor_from_tensor_list.default) +@register_op_impl(aten._nested_tensor_from_tensor_list.out) +def nested_tensors_unsupported(fake_mode, func, *args, **kwargs): + raise UnsupportedOperatorException( + "torch.compile does not support strided NestedTensor" + ) + + +@register_op_impl( + [ + x + for x in _device_not_kwarg_ops + if x + not in ( + # these are already registered elsewhere + aten.to.device, + aten.to.prim_Device, + aten._nested_tensor_from_tensor_list.default, + aten._nested_tensor_from_tensor_list.out, + ) + ] +) +def nyi(fake_mode, func, *args, **kwargs): + assert func not in _device_not_kwarg_ops, f"NYI: {func}" + + +@register_op_impl([aten.convolution.default, aten.convolution_backward.default]) +def conv(fake_mode, func, *args, **kwargs): + _, kwargs = normalize_function( + func, args=args, kwargs=kwargs, normalize_to_only_use_kwargs=True + ) + device = kwargs["input"].fake_device + # need to re-enable mode so the tensors report fake device + with fake_mode: + # if the input is unsqueezed is done in Convolution.cpp we get segfault + k = kwargs["weight"].ndim + batch = kwargs["input"].shape[0] + + # Avoid importing sympy at a module level + from torch.fx.experimental.symbolic_shapes import has_hint + + if not has_hint(batch): + # TODO: We can make this a little more faithful with best effort + # channels last detection (but only if it's statically obvious!) + mem_fmt = None + elif k == 3 and not kwargs["input"].is_mkldnn and not kwargs["input"].is_xpu: + mem_fmt = None + else: + if func is aten.convolution.default: + conv_backend = torch._C._select_conv_backend(**kwargs) + else: + conv_backend = torch._C._select_conv_backend( + kwargs["input"], + kwargs["weight"], + bias=None, + stride=kwargs["stride"], + padding=kwargs["padding"], + dilation=kwargs["dilation"], + transposed=kwargs["transposed"], + output_padding=kwargs["output_padding"], + groups=kwargs["groups"], + bias_sizes=kwargs["bias_sizes"], + ) + mem_fmt = torch._C._conv_determine_backend_memory_format( + kwargs["input"], kwargs["weight"], conv_backend + ) + + def convert(t, mem_fmt): + if t is None: + return t + if mem_fmt is not None: + t = t.to(memory_format=mem_fmt) + return FakeTensor(fake_mode, t, device) + + with in_kernel_invocation_manager(fake_mode): + out = func(**kwargs) + + if func is aten.convolution.default: + return convert(out, mem_fmt) + else: + return ( + convert(out[0], mem_fmt), + convert(out[1], mem_fmt), + convert(out[2], None), + ) + + +@register_op_impl(aten._scaled_dot_product_flash_attention.default) +def meta__scaled_dot_product_flash(fake_mode, func, *args, **kwargs): + _, kwargs = normalize_function( + func, args=args, kwargs=kwargs, normalize_to_only_use_kwargs=True + ) + + query = kwargs["query"] + key = kwargs["key"] + return_debug_mask = kwargs["return_debug_mask"] + # unused: value, dropout_p, is_causal, scale + + def convert_tensor(t, device): + return FakeTensor(fake_mode, t, device) + + batch_size = query.size(0) + num_heads = query.size(1) + max_seqlen_batch_q = query.size(2) + head_dim = query.size(3) + max_seqlen_batch_k = key.size(2) + + query_t = query.transpose(1, 2) + # empty_like already returns a fake tensor so we don't need to convert it + attention = torch.empty_like(query_t).transpose(1, 2) + logsumexp = convert_tensor( + torch.empty( + (batch_size, num_heads, 
max_seqlen_batch_q), + dtype=torch.float, + device="meta", + ), + device=query.device, + ) + + if return_debug_mask: + blocksize_c = 128 if head_dim > 64 else 256 + max_seqlen_k = math.ceil(max_seqlen_batch_q / blocksize_c) + if max_seqlen_batch_k <= 128: + max_seqlen_k = 128 + elif max_seqlen_batch_k <= 256: + max_seqlen_k = 256 + debug_mask = convert_tensor( + torch.empty( + (batch_size, num_heads, max_seqlen_batch_q, max_seqlen_k), + dtype=query.dtype, + device="meta", + ), + device=query.device, + ) + else: + debug_mask = convert_tensor( + torch.empty(0, dtype=query.dtype, device="meta"), + query.device, + ) + + # Note [Seed and Offset]: device for seed and offset below depends on whether we are + # capturing or not, but at the time of tracing we don't know if we + # are going to use cudagraphs or not, so we return meta tensors here + # it's possible we'll need to have some special handling in inductor for sdpa + + return ( + attention, + logsumexp, + None, + None, + max_seqlen_batch_q, + max_seqlen_batch_k, + convert_tensor(torch.empty((), dtype=torch.long, device="meta"), query.device), + convert_tensor(torch.empty((), dtype=torch.long, device="meta"), query.device), + debug_mask, + ) + + +@register_op_impl(aten._scaled_dot_product_efficient_attention.default) +def meta__scaled_dot_product_efficient(fake_mode, func, *args, **kwargs): + _, kwargs = normalize_function( + func, args=args, kwargs=kwargs, normalize_to_only_use_kwargs=True + ) + + query = kwargs["query"] + key = kwargs["key"] + value = kwargs["value"] + compute_log_sumexp = kwargs["compute_log_sumexp"] + # unused: attn_bias, dropout_p, is_causal, scale + + def convert_tensor(t, device): + return FakeTensor(fake_mode, t, device) + + query = query.transpose(1, 2) + key = key.transpose(1, 2) + value = value.transpose(1, 2) + + B = query.size(0) + M = query.size(1) + N = key.size(1) + num_heads = query.size(-2) + K = query.size(-1) + Kv = value.size(-1) + + res = convert_tensor( + torch.empty(B, M, num_heads, Kv, dtype=query.dtype, device="meta"), + query.device, + ) + + logsumexp_dim = math.ceil(M / 32) * 32 if compute_log_sumexp else 0 + logsum_exp = convert_tensor( + torch.empty( + (B, num_heads, logsumexp_dim), + dtype=torch.float, + device="meta", + ), + query.device, + ) + + res = res.transpose(1, 2) + + # See Note [Seed and Offset]: + seed = convert_tensor( + torch.empty((), dtype=torch.long, device="meta"), query.device + ) + offset = convert_tensor( + torch.empty((), dtype=torch.long, device="meta"), query.device + ) + + return res, logsum_exp, seed, offset + + +@register_op_impl(aten._flash_attention_forward.default) +def meta__flash_attention_forward(fake_mode, func, *args, **kwargs): + _, kwargs = normalize_function( + func, args=args, kwargs=kwargs, normalize_to_only_use_kwargs=True + ) + + query = kwargs["query"] + key = kwargs["key"] + cum_seq_q = kwargs["cum_seq_q"] + cum_seq_k = kwargs["cum_seq_k"] + max_q = kwargs["max_q"] + max_k = kwargs["max_k"] + return_debug_mask = kwargs["return_debug_mask"] + # unused: value, dropout_p, is_causal, scale + + def convert_tensor(t, device): + return FakeTensor(fake_mode, t, device) + + # NB: there are two underlying paths: + # 1. normal dense path; expect 4D inputs of shape (batch_size, seqlen, num_heads, head_dim) + # 2. varseqlen path; expect 3D inputs of shape (total, num_heads, head_dim) where total + # includes all batch item sequences. 
cum_seq_q / cum_seq_k contain offsets into total + batch_size = query.size(0) if cum_seq_q is None else cum_seq_q.numel() - 1 + max_seqlen_batch_q = query.size(1) if cum_seq_q is None else max_q + max_seqlen_batch_k = key.size(1) if cum_seq_k is None else max_k + num_heads = query.size(-2) + head_dim = query.size(-1) + + # Cuda Path + # note: empty_like already returns a fake tensor, we don't need to wrap it + attention = torch.empty_like(query) + logsumexp = convert_tensor( + torch.empty( + (batch_size, num_heads, max_seqlen_batch_q), + dtype=torch.float, + device="meta", + ), + device=query.device, + ) + + if return_debug_mask: + blocksize_c = 128 if head_dim > 64 else 256 + max_seqlen_k = math.ceil(max_seqlen_batch_q / blocksize_c) + if max_seqlen_batch_k <= 128: + max_seqlen_k = 128 + elif max_seqlen_batch_k <= 256: + max_seqlen_k = 256 + debug_mask = convert_tensor( + torch.empty( + (batch_size, num_heads, max_seqlen_batch_q, max_seqlen_k), + dtype=query.dtype, + device="meta", + ), + query.device, + ) + else: + debug_mask = convert_tensor( + torch.empty(0, dtype=query.dtype, device="meta"), + query.device, + ) + + # See Note [Seed and Offset]: + return ( + attention, + logsumexp, + convert_tensor(torch.empty((), dtype=torch.long, device="meta"), query.device), + convert_tensor(torch.empty((), dtype=torch.long, device="meta"), query.device), + debug_mask, + ) + + +@register_op_impl(aten._efficient_attention_forward.default) +def meta__efficient_attention_forward(fake_mode, func, *args, **kwargs): + _, kwargs = normalize_function( + func, args=args, kwargs=kwargs, normalize_to_only_use_kwargs=True + ) + + query = kwargs["query"] + key = kwargs["key"] + value = kwargs["value"] + cu_seqlens_q = kwargs["cu_seqlens_q"] + max_seqlen_q = kwargs["max_seqlen_q"] + max_seqlen_k = kwargs["max_seqlen_k"] + compute_log_sumexp = kwargs["compute_log_sumexp"] + # unused: bias, cu_seqlens_k, dropout_p, custom_mask_type, scale, causal_diagonal, seqlen_k + + def convert_tensor(t, device): + return FakeTensor(fake_mode, t, device) + + B = query.size(0) + M = query.size(1) + N = key.size(1) + num_heads = query.size(-2) + K = query.size(-1) + Kv = value.size(-1) + + res = convert_tensor( + torch.empty(B, M, num_heads, Kv, dtype=query.dtype, device="meta"), + query.device, + ) + + logsumexp_batch_dim = cu_seqlens_q.size(0) - 1 if (cu_seqlens_q is not None) else B + actual_max_seqlen_q = M + if cu_seqlens_q is not None: + assert max_seqlen_q is not None + actual_max_seqlen_q = max_seqlen_q + actual_max_seqlen_k = max_seqlen_k if max_seqlen_k is not None else N + logsumexp_dim = ( + math.ceil(actual_max_seqlen_q / 32) * 32 if compute_log_sumexp else 0 + ) + logsum_exp = convert_tensor( + torch.empty( + (logsumexp_batch_dim, num_heads, logsumexp_dim), + dtype=torch.float, + device="meta", + ), + query.device, + ) + + # See Note [Seed and Offset]: + seed = convert_tensor( + torch.empty((), dtype=torch.long, device="meta"), query.device + ) + offset = convert_tensor( + torch.empty((), dtype=torch.long, device="meta"), query.device + ) + + return res, logsum_exp, seed, offset, actual_max_seqlen_q, actual_max_seqlen_k + + +FAST_OP_IMPLEMENTATIONS = {} + + +# Unlike register_op_impl, these don't do the slow iteration for +# run_impl_check, and these run BEFORE decompositions +def register_fast_op_impl(func: OpOverload): + def impl_decorator(op_impl): + FAST_OP_IMPLEMENTATIONS[func] = op_impl + return op_impl + + return impl_decorator + + +# infer_size_impl in ExpandUtils +def infer_size(a, b): + from 
torch.fx.experimental.symbolic_shapes import guard_size_oblivious + + dimsA = len(a) + dimsB = len(b) + ndim = max(dimsA, dimsB) + expandedSizes = [0] * ndim + for i in range(ndim - 1, -1, -1): + offset = ndim - 1 - i + dimA = dimsA - 1 - offset + dimB = dimsB - 1 - offset + sizeA = a[dimA] if dimA >= 0 else 1 + sizeB = b[dimB] if dimB >= 0 else 1 + + # NB: It is very important to test for broadcasting, before testing + # sizeA == sizeB. This is because the broadcasting tests are likely + # to be statically known (in particular, if sizeA/sizeB is unbacked + # but size-like, we will unsoundly assume they never equal 1), but + # the sizeA == sizeB test may not be statically known. However, once + # we have established that no broadcasting is happening, the + # sizeA == sizeB is now expect_true and we can defer it as a runtime + # assert (this works because Python will return the terminal + # expression of an or statement as-is, without bool()'ing it; if this + # were not the case, we'd need to write this using torch.sym_or() or + # something like that). + torch._check( + guard_size_oblivious(sizeA == 1) + or guard_size_oblivious(sizeB == 1) + or sizeA == sizeB, + lambda: f"The size of tensor a ({sizeA}) " + f"must match the size of tensor b ({sizeB}) " + f"at non-singleton dimension {i})", + ) + expandedSizes[i] = sizeB if guard_size_oblivious(sizeA == 1) else sizeA + return tuple(expandedSizes) + + +def make_fast_binary_impl(slow_ref): + def fast_binary_impl(mode, *args, **kwargs): + def slow(msg): + count_label(f"slow {msg}") + with mode: + return slow_ref(*args, **kwargs) + + count_label("attempt fast") + + # Fast path (based off of TensorIterator fast path). + # Unfortunately, there is no way to easily deduplicate + # this with either the TensorIterator C++ implementation + # (which we don't want to SymIntify, and also the algorithm + # here is slightly different from TensorIterator to allow + # for broadcasting), nor the PrimTorch implementation + # (which does not actually implement a fast path.) 
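The infer_size helper above implements ordinary broadcasting of two shapes; the public torch.broadcast_shapes follows the same rule. A quick illustrative check with made-up shapes:

import torch

# trailing dimensions are aligned; a size of 1 broadcasts against any size
assert torch.broadcast_shapes((2, 1, 4), (3, 1)) == torch.Size([2, 3, 4])
assert torch.broadcast_shapes((5,), ()) == torch.Size([5])

# mismatched non-singleton sizes are rejected, as in the torch._check above
try:
    torch.broadcast_shapes((2, 3), (4,))
    raise AssertionError("expected a broadcasting error")
except RuntimeError:
    pass  # sizes 3 and 4 cannot be broadcast together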
+ + operands = args + + # compute_shape + has_scalars = False + has_tensors = False + final_shape = None + for op in operands: + shape = op.shape if isinstance(op, torch.Tensor) else () + if len(shape) == 0: + has_scalars = True + else: + has_tensors = True + if final_shape is None: + final_shape = shape + # TODO: Minor optimization: track if the shapes + # were equal so you can skip the equality check + # below if unnecessary + final_shape = infer_size(final_shape, shape) + assert final_shape is not None + + # Do some extra safety checks to see if the output + # stride is obvious + for op in operands: + if ( + isinstance(op, torch.Tensor) + and len(op.shape) == len(final_shape) + and op.shape == final_shape + ): + break + else: + return slow("both tensors nontrivially broadcast") + + # compute_types + cpu = torch.device("cpu") + common_device = cpu + common_dtype = None + output_dtype = None + has_different_input_dtypes = False + for op in operands: + if not isinstance(op, torch.Tensor): + # Use elementwise_dtypes for the tricky case + has_different_input_dtypes = True + continue + if common_device == cpu and not op.device.type == "cpu": + common_device = op.device + # Slightly simplified here as target_dtype cannot vary + if common_dtype is None: + common_dtype = op.dtype + elif common_dtype != op.dtype: + has_different_input_dtypes = True + + if has_different_input_dtypes: + # compute promotion + # TODO: we don't need the compute type + _, common_dtype = elementwise_dtypes( + *operands, type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT + ) + + # check all tensors on same device + # cpu scalars are assumed allow + current_cpu_scalars_on_non_cpu = 0 + max_cpu_scalars_on_non_cpu = 1 # hard coded atm + for op in operands: + if not isinstance(op, torch.Tensor): + continue + if common_device != cpu and op.dim() == 0 and op.device == cpu: + if current_cpu_scalars_on_non_cpu >= max_cpu_scalars_on_non_cpu: + return slow("error") + current_cpu_scalars_on_non_cpu += 1 + elif op.device != common_device: + return slow("error") + + # compute_fast_setup_type + is_contiguous = True + is_channels_last = True + # TODO: is_non-overlapping_and_dense (not bound from Python + # no inplace, no out, everything defined + + if is_noncontiguous_supported(common_device): + for op in operands: + if not isinstance(op, torch.Tensor): + continue + is_contiguous = is_contiguous and op.is_contiguous( + memory_format=torch.contiguous_format + ) + is_channels_last = is_channels_last and op.is_contiguous( + memory_format=torch.channels_last + ) + if is_contiguous: + # do contiguous + count_label("fast is_contiguous") + return FakeTensor( + mode, + torch.empty( + final_shape, + dtype=common_dtype, + device="meta", + memory_format=torch.contiguous_format, + ), + device=common_device, + ) + if is_channels_last: + count_label("fast channels_last") + # do channels last + return FakeTensor( + mode, + torch.empty( + final_shape, + dtype=common_dtype, + device="meta", + memory_format=torch.channels_last, + ), + device=common_device, + ) + + return slow("no contiguity match") + + return fast_binary_impl + + +@functools.lru_cache(None) +def get_fast_op_impls(): + import torch._refs + + register_fast_op_impl(torch.ops.aten.add.Tensor)( + make_fast_binary_impl(torch._refs.add) + ) + register_fast_op_impl(torch.ops.aten.sub.Tensor)( + make_fast_binary_impl(torch._refs.sub) + ) + register_fast_op_impl(torch.ops.aten.mul.Tensor)(make_fast_binary_impl(torch._refs.mul)) # type: ignore[has-type] + 
register_fast_op_impl(torch.ops.aten.div.Tensor)( + make_fast_binary_impl(torch._refs.div) + ) + return FAST_OP_IMPLEMENTATIONS diff --git a/MLPY/Lib/site-packages/torch/_subclasses/fake_tensor.py b/MLPY/Lib/site-packages/torch/_subclasses/fake_tensor.py new file mode 100644 index 0000000000000000000000000000000000000000..e6b9cacbdebf4912c8d0d2bba9575f0933666139 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_subclasses/fake_tensor.py @@ -0,0 +1,1819 @@ +# mypy: ignore-errors + +import contextlib +import functools +import logging +import os +import traceback +import weakref +from collections import defaultdict +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Type, TYPE_CHECKING, TypeVar +from weakref import ReferenceType + +import torch +import torch._custom_op +import torch._logging +from torch._C._functorch import is_functorch_wrapped_tensor + +from torch._guards import Source +from torch._ops import OpOverload +from torch._prims_common import suggest_memory_format +from torch._subclasses.meta_utils import ( + assert_eq, + assert_metadata_eq, + is_sparse_any, + is_sparse_compressed, + MetaConverter, +) +from torch._utils import render_call +from torch.fx.operator_schemas import normalize_function +from torch.multiprocessing.reductions import StorageWeakRef +from torch.overrides import TorchFunctionMode +from torch.utils._mode_utils import no_dispatch +from torch.utils._python_dispatch import ( + is_traceable_wrapper_subclass, + TorchDispatchMode, +) + +from torch.utils._pytree import PyTree, tree_map +from torch.utils._stats import count +from torch.utils.weak import WeakIdRef + +if TYPE_CHECKING: + from torch.fx.experimental.symbolic_shapes import ShapeEnv + +DimList = List + +log = logging.getLogger(__name__) + +# TODO: Hack to unblock https://github.com/pytorch/pytorch/pull/108186 +# Proper fix tracked by https://github.com/pytorch/pytorch/issues/120105 +try: + not_implemented_log = torch._logging.getArtifactLogger(__name__, "not_implemented") +except ValueError as e: + if "'not_implemented' not registered" in str(e): + import logging as not_implemented_log + else: + raise e + +pytree = torch.utils._pytree +T = TypeVar("T") +TensorWeakRef = Any + +aten = torch._ops.ops.aten + +CONSTANT_NUMEL_LIMIT = 1 + +RECURSION_COUNT = 0 + + +# Small helper that increments recursion count, and +# resets it when the object goes out of scope. Useful +# if you don't want to increase indentation which is +# what a context manager would do. 
+class IncrementRecursionCount: + def __init__(self): + global RECURSION_COUNT + RECURSION_COUNT += 1 + + def __del__(self): + global RECURSION_COUNT + RECURSION_COUNT -= 1 + + +@dataclass +class UnsupportedFakeTensorException(RuntimeError): + reason: str + + +@dataclass +class DynamicOutputShapeException(RuntimeError): + func: OpOverload + + +@dataclass +class DataDependentOutputException(RuntimeError): + func: OpOverload + + +@dataclass +class UnsupportedOperatorException(RuntimeError): + func: OpOverload + + +def ordered_set(*items): + return dict.fromkeys(items, True) + + +@contextlib.contextmanager +def unset_fake_temporarily(): + old = torch._C._unset_dispatch_mode(torch._C._TorchDispatchModeKey.FAKE) + try: + yield old + finally: + if old is not None: + torch._C._set_dispatch_mode(old) + + +def is_fake(x): + if isinstance(x, FakeTensor): + return True + if is_traceable_wrapper_subclass(x): + attrs, _ = type(x).__tensor_flatten__(x) + flattened_tensors = [getattr(x, attr) for attr in attrs] + # need to recurse because we could have nested subclasses + all_fake = all(is_fake(x) for x in flattened_tensors) + any_fake = any(is_fake(x) for x in flattened_tensors) + assert all_fake == any_fake, "got mixed fake and real tensors!" + return all_fake + elif isinstance(x, torch.Tensor) and torch._is_functional_tensor(x): + reapply_views = torch._C._functionalization_reapply_views_tls() + unwrapped = torch._C._functorch._unwrap_functional_tensor(x, reapply_views) + return is_fake(unwrapped) + elif isinstance(x, torch.Tensor) and is_functorch_wrapped_tensor(x): + unwrapped = torch._C._functorch.get_unwrapped(x) + return is_fake(unwrapped) + return False + + +def maybe_get_fake_mode(t): + if isinstance(t, FakeTensor): + return t.fake_mode + if is_traceable_wrapper_subclass(t): + inner_tensor_names, _ = t.__tensor_flatten__() + modes = [ + maybe_get_fake_mode(getattr(t, t_name)) for t_name in inner_tensor_names + ] + m = modes[0] + assert all(m is x for x in modes) + return m + elif isinstance(t, torch.Tensor) and torch._is_functional_tensor(t): + reapply_views = torch._C._functionalization_reapply_views_tls() + unwrapped = torch._C._functorch._unwrap_functional_tensor(t, reapply_views) + return maybe_get_fake_mode(unwrapped) + elif isinstance(t, torch.Tensor) and is_functorch_wrapped_tensor(t): + unwrapped = torch._C._functorch.get_unwrapped(t) + return maybe_get_fake_mode(unwrapped) + return None + + +@functools.lru_cache(None) +def get_schema_info(func): + return torch._C._SchemaInfo(func._schema) # type: ignore[attr-defined] + + +# many of the decompositions registered to torch/_prims do not at the moment model +# aliasing or strides, so as an incremental step, just enable the decompositions in +# torch/_decomp/decompositions.py. 
+# decomps are used for aot autograd tracing so we would like to unify on their +# implementation and add additional testing to them +@functools.lru_cache(None) +def torch_decomp_decompositions(func): + from torch._decomp import decomposition_table + + decompositions = torch._decomp.decompositions + # Note that the function in the decomposition table might be + # different from the one in the module because of the difference + # in out handling in aten API and torch public API + return decomposition_table[func].__module__.startswith( + "torch._decomp" + ) and decomposition_table[func].__name__ in dir(decompositions) + + +def tree_flatten_only(ty: Type[T], tree: PyTree): + flat_vals = pytree.tree_leaves(tree) + return [elem for elem in flat_vals if isinstance(elem, ty)] + + +# Similar to `MetaConverter`, this is a class for converting +# multiple tensors into fake tensors which share the same view/storage +# structure. Like `MetaConverter`, it uses `WeakIdRef` to +# hold a weak reference for all memoized tensors. +class FakeTensorConverter: + @property + def tensor_memo(self): + return self.meta_converter.tensor_memo + + meta_converter: MetaConverter + constant_storage_mapping: Dict[StorageWeakRef, List[ReferenceType]] + + def __init__(self): + self.meta_converter = MetaConverter() + + # map from to storage to corresponding constant tensors + self.constant_storage_mapping = {} + + def add_constant_storage_mapping(self, fake_tensor): + # when you have a constant, aliased tensor: + # const_tensor.add_(torch.rand([1])) + # all aliases of it must become no longer const + assert isinstance(fake_tensor, FakeTensor) and fake_tensor.constant is not None + weak_st = StorageWeakRef(fake_tensor.constant._typed_storage()) + + # we need a map from a weak storage to all of its corresponding + # constant tensors. 
python doesn't have the weak value equivalent + # of defaultdict(list), so we are using a WeakValueDictionary as one + if weak_st not in self.constant_storage_mapping: + self.constant_storage_mapping[weak_st] = [] + self.constant_storage_mapping[weak_st].append(weakref.ref(fake_tensor)) + + def invalidate_constant_aliases(self, tensor): + assert not isinstance(tensor, FakeTensor) + + weak_st = StorageWeakRef(tensor._typed_storage()) + if weak_st not in self.constant_storage_mapping: + return + + for weak_tensor_ref in self.constant_storage_mapping[weak_st]: + ten = weak_tensor_ref() + if ten is not None: + ten._fix_weakref() + ten.constant = None + + del self.constant_storage_mapping[weak_st] + + def _get_memo(self, t): + if WeakIdRef(t) in self.tensor_memo: + out = self.tensor_memo[WeakIdRef(t)] + out._fix_weakref() + return out + return None + + def set_tensor_memo(self, t, v): + th = WeakIdRef(t) + + # hold a weak ref to self, otherwise it will be kept alive + # by the del_ten closure + self_weak_ref = weakref.ref(self) + + def del_ten(): + self_ref = self_weak_ref() + if self_ref is None: + return + # on shutdown, th may not be in memo + self_ref.tensor_memo.pop(th, None) + + weakref.finalize(t, del_ten) + self.tensor_memo[th] = v + + def from_real_tensor( + self, + fake_mode, + t, + make_constant=False, + shape_env=None, + *, + source=None, + symbolic_context=None, + memoized_only=False, + ): + # see note [Tensor Fakification and Symbol Caching] + if not symbolic_context and not source and shape_env: + if tracing_context := torch._guards.TracingContext.try_get(): + if t in tracing_context.tensor_to_context: + symbolic_context = tracing_context.tensor_to_context[t] + source = symbolic_context.tensor_source + + maybe_memo = self._get_memo(t) + if maybe_memo is not None: + return maybe_memo + if memoized_only: + return None + existing_device = t.device + # not yet supported in metatensors + if t.is_quantized: + raise UnsupportedFakeTensorException("quantized nyi in meta tensors") + if type(t) is torch.nn.Parameter: + assert not make_constant + + def mk_fake_tensor(make_meta_t): + # NB: don't use in_kernel_invocation_manager. to + # ensure FakeTensor can internally do constant computation + # as necessary. Invocation manager is "more correct" as + # it works for more operators in make_meta_t, but + # invariant is that make_meta_t only calls factories + # for which it is not strictly necessary to use the + # invocation manager (I think!) + with no_dispatch(): + return FakeTensor( + fake_mode, + make_meta_t(), + existing_device, + constant=t if make_constant else None, + ) + + out = self.meta_converter( + t, + shape_env=shape_env, + callback=mk_fake_tensor, + source=source, + symbolic_context=symbolic_context, + ) + if out is NotImplemented: + raise UnsupportedFakeTensorException("meta converter nyi") + if make_constant: + self.add_constant_storage_mapping(out) + # NB: meta_converter set the memo + return out + + # If you specify the device, it MUST be a meta tensor. + def from_meta_and_device(self, fake_mode, t, device): + assert ( + t.device.type == "meta" + ), f"tensor's device must be `meta`, got {t.device.type} instead" + maybe_memo = self._get_memo(t) + if maybe_memo is not None: + return maybe_memo + out = FakeTensor(fake_mode, t, device) + self.set_tensor_memo(t, out) + return out + + # You can have a real tensor that you need to convert into a fake tensor. + # If you have a meta tensor already, call from_meta_and_device. 
+ # + # You're allowed to pass a meta tensor to be turned into a fake + # tensor; although an odd thing to do, this can occur if you're doing + # cross ref testing and the inner test is already operating on meta tensors. + def __call__( + self, + fake_mode, + t, + *, + make_constant=False, + shape_env=None, + source=None, + symbolic_context=None, + memoized_only=False, + ): + return self.from_real_tensor( + fake_mode, + t, + make_constant, + shape_env=shape_env, + source=source, + symbolic_context=symbolic_context, + memoized_only=memoized_only, + ) + + +@functools.lru_cache(None) +def init_cuda_context(): + # Backward will error with cuda Fake Tensors if no cuda tensors have been initialized first + if torch.cuda.is_available(): + torch.empty(1, device="cuda") if torch.version.hip is None else torch.zeros( + 1, device="cuda" + ) + + +@contextlib.contextmanager +def in_kernel_invocation_manager(fake_mode): + # See: note [Fake Tensor Dispatch Keys] + prev_in_kernel = fake_mode.in_kernel_invocation + meta_in_tls = torch._C._meta_in_tls_dispatch_include() + assert meta_in_tls == prev_in_kernel, f"{meta_in_tls}, {prev_in_kernel}" + + guard = torch._C._DisableTorchDispatch() # type: ignore[attr-defined] + fake_mode.in_kernel_invocation = True + torch._C._set_meta_in_tls_dispatch_include(True) + try: + yield + finally: + fake_mode.in_kernel_invocation = prev_in_kernel + torch._C._set_meta_in_tls_dispatch_include(prev_in_kernel) + del guard + + +# Return if the function allows Python numbers to bind to Tensors +def should_allow_numbers_as_tensors(func: OpOverload): + return torch._C._should_allow_numbers_as_tensors( + func.name().split("::")[-1].split(".")[0] + ) + + +class FakeTensorConfig: + debug = os.environ.get("TORCH_FAKE_TENSOR_DEBUG", "0") == "1" + + +class FakeTensor(torch.Tensor): + """ + Meta tensors give you the ability to run PyTorch code without having to + actually do computation through tensors allocated on a `meta` device. + Because the device is `meta`, meta tensors do not model device propagation. + FakeTensor extends MetaTensors to also carry an additional `fake_device` + which tracks devices that would have been used. + """ + + fake_device: torch.device + fake_mode: "FakeTensorMode" + constant: Optional[torch.Tensor] + + # This memorizes the unbacked SymInt representing the number of nonzero + # elements in this tensor. This is helpful if you do something like + # x[mask] and y[mask]; mask.nonzero() gets repeatedly called and should + # give a consistent unbacked SymInt. It needs to be invalidated in the + # same way constant is. + # TODO: Generalize this as needed, e.g., into a trie of memos + _nonzero_memo: Optional[torch.SymInt] + _nonzero_memo_vc: Optional[int] + + # Indicates to our torch_dispatch dispatching infra that + # this is an "infra" mode with lower dispatching precedence. + _mode_key = torch._C._TorchDispatchModeKey.FAKE + + @property + def nonzero_memo(self): + if self._nonzero_memo is None: + return None + # Version counter based tracking isn't 100% sound but it's close + # enough + if self._nonzero_memo_vc != self._version: + self._nonzero_memo = None + return None + return self._nonzero_memo + + @property + def device(self): + if self.fake_mode.in_kernel_invocation: + return torch.device("meta") + else: + return self.fake_device + + # Note: [Fake Tensor Dispatch Keys] + # In order to model the behavior of device-specific autocast + # and autograd logic, we update the dispatch keys of FakeTensors + # to reflect their fake device. 
This includes the BackendComponent + # (DispatchKey::Meta -> DispatchKey::CUDA), and also the BackendComponent + # related Autocast and Autograd keys. __torch__dispatch__ sits below + # Autocast and Autograd, and is only invoked when we are at the + # kernel for the BackendComponent. Then, we add Meta to the + # thread-local dispatch include set to hit the meta kernel + # instead of the kernel of the BackendComponent for the fake device. + # The `device_for_backend_keys` does that below + # NOTE: this probably will not do the right thing for backends + # that have dispatch keys which are higher than the "meta" key: + # https://github.com/pytorch/pytorch/blob/main/c10/core/DispatchKey.h#L189 + + # We don't support named tensors; graph break + @property + def names(self): + raise UnsupportedFakeTensorException( + "torch.compile doesn't support named tensors" + ) + + @staticmethod + def __new__(cls, fake_mode, elem, device, constant=None): + self = torch.Tensor._make_subclass( + cls, + elem, + elem.requires_grad, + dispatch_device=True, + device_for_backend_keys=device, + ) + + assert elem.device.type == "meta", elem.device.type + device = device if isinstance(device, torch.device) else torch.device(device) + # NB: it is fine, if a little confusing, for device to be meta + # (we are faking a meta tensor in that case). However, it often + # indicates some sort of confusion (e.g., you accidentally passed + # in a meta tensor when you should have passed in the real tensor). + # So by default we disallow meta, and if you are working in a situation + # where it is helpful (e.g., crossref testing) you can turn it back + # on + if not fake_mode.allow_meta: + assert device.type != "meta" + # normalize device. + if device.type == "cuda": + init_cuda_context() + + if ( + device.type + in ["cuda", "hpu", "xpu", torch._C._get_privateuse1_backend_name()] + and device.index is None + ): + device = torch.device( + f"{device.type}:{getattr(torch, device.type).current_device()}" + ) + self.fake_device = device # type: ignore[attr-defined] + self.fake_mode = fake_mode # type: ignore[attr-defined] + self.constant = constant # type: ignore[attr-defined] + self._nonzero_memo = None # type: ignore[attr-defined] + self._nonzero_memo_vc = None # type: ignore[attr-defined] + + if FakeTensorConfig.debug: + import traceback + + self._debug_trace = traceback.extract_stack() # type: ignore[attr-defined] + return self + + # In some circumstances, a conventional torch.Tensor constructor + # will get rewritten to call into FakeTensor. We must provide an + # __init__ method that can accept the Python interpreters initialization + # in such a situation; we must also be able to handle direct fake + # tensor construction via FakeTensor(). + # + # In particular, the __init__ call will look funny in the following case: + # + # with FakeTensorMode(): + # x = torch.Tensor([1, 2, 3]) + # + # this desugars into: + # + # with FakeTensorMode(): + # x = torch.Tensor.__new__([1, 2, 3]) + # # NB: x is a fake tensor, because of the mode! + # x.__init__([1, 2, 3]) # not the normal fake tensor args! 
+ # + def __init__(self, *args, **kwargs): + super().__init__() + + @staticmethod + def from_tensor(t, fake_mode): + return fake_mode.from_tensor(t) + + @classmethod + @count + def __torch_dispatch__(cls, func, types, args=(), kwargs=None): + # need to handle here to avoid infinite recursion + # see [in_kernel_invocation] + if func == torch.ops.prim.device.default: + assert len(args) == 1 and isinstance(args[0], FakeTensor) + if args[0].fake_mode.in_kernel_invocation: + return torch.device("meta") + else: + return args[0].fake_device + + # Because fake mode can return NotImplemented (if it sees a subclass + # it doesn't know how to deal with), this test here is important + # because the next dispatch after a fake mode will attempt to use + # subclasses of tensors to dispatch, and any FakeTensor arguments + # will be considered eligible. + unrecognized_types = [ + t for t in types if not issubclass(t, FakeTensor) and t is not torch.Tensor + ] + if unrecognized_types: + not_implemented_log.debug( + "FakeTensor unrecognized subclass(es): %s", unrecognized_types + ) + return NotImplemented + + fake_mode = None + for arg in pytree.arg_tree_leaves(*args, **kwargs): + if isinstance(arg, FakeTensor): + fake_mode = arg.fake_mode + break + + assert fake_mode is not None + + # If the fake mode is already active, don't try to reapply it! + # NotImplemented is the right thing to return here, because the + # typical situation this can occur is if ProxyTensorMode returned a + # NotImplemented because of a not implemented subclass; we may have + # unluckily attempted to hit FakeTensor's dispatch first, + # NotImplemented lets us keep chaining until we find the actual + # subclass + maybe_cur_fake_mode = torch._C._get_dispatch_mode( + torch._C._TorchDispatchModeKey.FAKE + ) + if maybe_cur_fake_mode: + not_implemented_log.debug( + "FakeTensor mode already active: %s in %s", + fake_mode, + maybe_cur_fake_mode, + ) + return NotImplemented + + with fake_mode: # type: ignore[attr-defined] + return func(*args, **kwargs) + + @staticmethod + def _find_common_device(func, flat_args) -> Tuple[torch.device, bool]: + # Returns: (common_device, has_scalar_only_inputs) + + # cpu - zero-dim tensors can be called in cuda kernels, + # so overwrite the common_device if it the only existing + # device comes from a cpu zero-dim tensor + common_device = None + has_scalar_only_inputs = False + is_cpu_zero_dim = None + + def cpu_zero_dim(t): + return t.device.type == "cpu" and t.dim() == 0 + + def merge_devices(t): + nonlocal common_device + nonlocal is_cpu_zero_dim + if not isinstance(t, FakeTensor): + return + + if common_device is None: + common_device = t.device + is_cpu_zero_dim = cpu_zero_dim(t) + return + + t_is_cpu_zero_dim = cpu_zero_dim(t) + if t.device == common_device: + if is_cpu_zero_dim: + is_cpu_zero_dim = t_is_cpu_zero_dim + return + + # mismatching devices ! + # if current tensor is cpu 0 dim, defer to existing device + if t_is_cpu_zero_dim: + return + + # current device is from cpu 0 dim tensor, overwrite + if is_cpu_zero_dim: + common_device = t.device + is_cpu_zero_dim = t_is_cpu_zero_dim + return + + # mismatching devices of non-zero dim tensors, throw + # This might be valid behavior and need to be explicitly modeled, e.g. 
reshape_as + raise RuntimeError( + f"Unhandled FakeTensor Device Propagation for {func}, found two different devices {common_device}, {t.device}" + ) + + for arg in flat_args: + merge_devices(arg) + + # some functions that allow Python numbers to bind to Tensors + # if we have failed to find a device, and we're running one of these operators, + # we must have scalar only inputs + if should_allow_numbers_as_tensors(func) and common_device is None: + # ops with scalar only inputs always have result on cpu + has_scalar_only_inputs = True + common_device = torch.device("cpu") + + assert common_device is not None, f"Could not find common device for {func}" + + return common_device, has_scalar_only_inputs + + # We must handle tolist in a special way for FakeTensors here in the case + # where tolist is called from torch dispatch for tensor subclasses. + # Ordinarily, if a program calls .tolist compiling still works because there is + # special handling in dynamo, but for tensor subclasses if .tolist is called + # inside torch dispatch, the .tolist call may be directly on a FakeTensor. + # This would result in an error since wrapper subclasses don't have storage. + # To avoid this, we handle the FakeTensor case by (1) specializing on the size + # of the tensor to create the output Python list, and (2) creating unbacked + # symints for each element of the list. + def tolist(self): + assert self.dim() == 1, "NYI for higher dims" + shape_env = self.fake_mode.shape_env + out = [] + # Specialize on the length of the list + for _ in range(self.shape[0]): + s = shape_env.create_unbacked_symint() + # max value? + torch._constrain_as_size(s, min=2) + out.append(s) + return out + + +@dataclass(frozen=True) +class TensorMetadata: + """ + The Tensor metadata relevant to hashing FakeTensors when caching. + """ + + dtype: torch.dtype + shape: torch.Size + stride: Tuple[Any, ...] + device: torch.device + layout: torch.layout + memory_format: Optional[torch.memory_format] + storage_offset: int + requires_grad: bool + is_quantized: bool + is_conj: bool + is_neg: bool + is_inference: bool + is_sparse: bool # read: is sparse COO + is_coalesced: Optional[bool] + dense_dim: Optional[int] + sparse_dim: Optional[int] + + +def extract_tensor_metadata(t: torch.Tensor) -> "TensorMetadata": + """ + Extract the TensorMetadata of a tensor. + """ + memory_format = suggest_memory_format(t) + if is_sparse_any(t) or not t.is_contiguous(memory_format=memory_format): + memory_format = None + + return TensorMetadata( + dtype=t.dtype, + shape=t.shape, + stride=t.stride() if t.layout == torch.strided else (), + device=t.device, + layout=t.layout, + memory_format=memory_format, + storage_offset=t.storage_offset(), + requires_grad=t.requires_grad, + is_quantized=t.is_quantized, + is_conj=t.is_conj(), + is_neg=t.is_neg(), + is_inference=t.is_inference(), + is_sparse=t.is_sparse, + is_coalesced=t.is_coalesced() if t.is_sparse else None, + dense_dim=t.dense_dim() if t.is_sparse else None, + sparse_dim=t.sparse_dim() if t.is_sparse else None, + ) + + +@dataclass(frozen=True) +class _ShapeEnvSettings: + """ + Encapsulates all shape env settings that could potentially affect + FakeTensor dispatch. Used when creating dispatch cache keys. 
+ """ + + allow_scalar_outputs: bool + allow_dynamic_output_shape_ops: bool + assume_static_by_default: bool + specialize_zero_one: bool + duck_shape: bool + + def __init__(self, env: "ShapeEnv"): + # Initialize this way because the class is frozen (to enable hashing): + object.__setattr__(self, "allow_scalar_outputs", env.allow_scalar_outputs) + object.__setattr__( + self, "allow_dynamic_output_shape_ops", env.allow_dynamic_output_shape_ops + ) + object.__setattr__( + self, "assume_static_by_default", env.assume_static_by_default + ) + object.__setattr__(self, "specialize_zero_one", env.specialize_zero_one) + object.__setattr__(self, "duck_shape", env.duck_shape) + + +class _DispatchCacheKey(list): + """ + Key for the FakeTensor dispatch cache. Inspired by (copied from) + _HashedSeq from the functools.lru_cache implementation. + """ + + __slots__ = "hashvalue" # noqa: PLC0205 + + def __init__(self, tup, hash=hash): + self[:] = tup + self.hashvalue = hash(tup) + + def __hash__(self): + return self.hashvalue + + +@dataclass(frozen=True) +class _DispatchCacheEntry: + """ + Entry type for the FakeTensor dispatch cache. Accounts for two possibilities: + 1) The op is inplace, and a hit means we need to alias the argument at a given + index. 2) We need to synthesize a new FakeTensor given tensor metadata. For view + ops, we further capture the index of the arg to alias. + """ + + inplace_idx: Optional[int] = None + metadata: Optional[TensorMetadata] = None + view_idx: Optional[int] = None + + +@dataclass(frozen=True) +class _BypassDispatchCache(Exception): + """ + Signals cases that should skip FakeTensor caching. + """ + + reason: str + + +@dataclass(frozen=True) +class DispatchCacheInfo: + """ + Information about the state of the FakeTensor dispatch cache. + """ + + hits: int + misses: int + bypasses: Dict[str, int] + size: int + + +# We keep one instantiation of `fake_tensor_converter` active +# for the duration of `with FakeTensorMode()`. +# This allows accurate storage aliasing across invocation of +# different operators. While this will keep all freshly allocated +# tensors alive during `FakeTensorMode`, there will no be no +# new allocations of Tensors which have non-meta storage so +# memory should not significantly increase. 
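+#
+# A minimal usage sketch (illustrative only):
+#
+#     with FakeTensorMode() as mode:
+#         x = torch.empty(2, 3)      # a FakeTensor backed by a meta tensor
+#         y = x + 1                  # dispatches to meta kernels; no real compute
+#     assert isinstance(y, FakeTensor) and tuple(y.shape) == (2, 3)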
+ + +class FakeTensorMode(TorchDispatchMode): + cache: Dict[_DispatchCacheKey, _DispatchCacheEntry] = {} + cache_hits: int = 0 + cache_misses: int = 0 + cache_bypasses = defaultdict(int) + + def __init__( + self, + *, + allow_fallback_kernels=True, + allow_non_fake_inputs=False, + shape_env=None, + static_shapes=None, + ): + log.debug("create_mode 0x%x", id(self)) + self.allow_fallback_kernels = allow_fallback_kernels + self.fake_tensor_converter = FakeTensorConverter() + if static_shapes is not None: + self.static_shapes = static_shapes + else: + self.static_shapes = shape_env is None + + import torch._dynamo.config + import torch._functorch.config + + self.allow_meta = torch._functorch.config.fake_tensor_allow_meta + self.cache_enabled = torch._dynamo.config.fake_tensor_cache_enabled + self.cache_crosscheck_enabled = ( + torch._dynamo.config.fake_tensor_cache_crosscheck_enabled + ) + + # A flag that controls, whether we want to invoke ops on mix of + # real weights/global variables and fake inputs + self.allow_non_fake_inputs = allow_non_fake_inputs + + # [in_kernel_invocation] + # when FakeTensor is invoked in user code, .device should return + # the fake_device of the tensor so that code such as as `if x.is_cuda` + # or torch.zeros([10, 10], device=x.device) continues to execute as if + # the FakeTensor were real. However, within kernel execution, we return + # the `Meta` device because all computation within the kernels should + # behave as if the Tensors are on meta devices. Kernels should allocate + # new tensors on meta devices, and checks like `is_meta` should return true. + # within python refs, we always return the real device by defining + # the device property + self.in_kernel_invocation = False + + # True if we enter'ed and actually enabled fake tensor mode, + # false if it was a no-op. Not thread safe but neither is + # in_kernel_invocation + # If another fake mode was already active when we enter, we also stash it here. + # That way when we exit, we know to re-enable the previous fake mode. + self.enter_stack: List[Tuple[bool, Optional[FakeTensorMode]]] = [] + + self.shape_env = shape_env + + self.stack = "".join(traceback.format_stack()) + + # Indicates to our torch_dispatch dispatching infra that + # this is an "infra" mode with lower dispatching precedence. + self._mode_key = torch._C._TorchDispatchModeKey.FAKE + + # Typically, there is only one fake tensor mode and you test for it by + # doing an isinstance test. However, in some situations, there might be + # TWO fake tensor modes. The canonical example of this is exporting + # a fake model: there is an outer fake mode created by the user, and + # an inner fake mode created by Dynamo. The two phase process is required + # because the outer fake mode typically won't have a ShapeEnv, even if + # the user is interested in exporting with dynamic shapes (so the inner + # fake mode will actually have a ShapeEnv and swap in symbolic sizes.) + # + # In this case, it's insufficient to test only one FakeTensor: you need + # to distinguish between our fake tensor and other fake tensors. That's + # what this function does. + def is_our_fake(self, t): + return isinstance(t, FakeTensor) and t.fake_mode is self + + @count + def __torch_dispatch__(self, func, types, args=(), kwargs=None): + # FakeTensorMode should not be set when we're inside of it. 
+ assert ( + torch._C._get_dispatch_mode(torch._C._TorchDispatchModeKey.FAKE) is None + ), func + try: + return self.dispatch(func, types, args, kwargs) + except TypeError: + log.exception("fake tensor raised TypeError") + raise + + # No-op if FakeTensorMode is already in use + def __enter__(self): + maybe_prev_fake_mode = torch._C._unset_dispatch_mode(self._mode_key) + if self is not maybe_prev_fake_mode: + self.enter_stack.append((True, maybe_prev_fake_mode)) + return super().__enter__() + else: + # no-op (still need to re-set the fake mode though since we unset it) + torch._C._set_dispatch_mode(self) + self.enter_stack.append((False, None)) + return self + + def __exit__(self, a, b, c): + live, maybe_prev_fake_mode = self.enter_stack.pop() + if live: + out = super().__exit__(a, b, c) + # Re-enable the previous fake mode, if there was one. + if maybe_prev_fake_mode is not None: + torch._C._set_dispatch_mode(maybe_prev_fake_mode) + + @classmethod + def cache_info(cls) -> DispatchCacheInfo: + """ + Query the state of the dispatch cache. + """ + return DispatchCacheInfo( + FakeTensorMode.cache_hits, + FakeTensorMode.cache_misses, + dict(FakeTensorMode.cache_bypasses), + len(FakeTensorMode.cache), + ) + + @classmethod + def cache_clear(cls): + """ + Clear the dispatch cache. + """ + cls.cache_hits = 0 + cls.cache_misses = 0 + cls.cache_bypasses.clear() + cls.cache.clear() + + def _cached_dispatch_impl( + self, + func: OpOverload, + types: Tuple[Any, ...], + args: Tuple[Any, ...], + kwargs: Dict[str, Any], + ): + """ + Lookup a cache entry for the given arguments. If none exists, dispatch + and cache the result (if the result is eligible for caching). + """ + output = unassigned = object() + try: + key = self._cache_key(func, args, kwargs) + entry = FakeTensorMode.cache.get(key, None) + if entry is not None: + output = self._output_from_cache_entry(entry, func, args) + FakeTensorMode.cache_hits += 1 + if self.cache_crosscheck_enabled: + # For debugging / testing: Validate that the output synthesized + # from the cache matches the output created by normal dispatch. + self._crosscheck_cache_output(output, func, types, args, kwargs) + else: + output = self._dispatch_impl(func, types, args, kwargs) + entry = self._make_cache_entry(key, func, args, kwargs, output) + FakeTensorMode.cache[key] = entry + FakeTensorMode.cache_misses += 1 + except _BypassDispatchCache as e: + FakeTensorMode.cache_bypasses[e.reason] += 1 + + if output is unassigned: + output = self._dispatch_impl(func, types, args, kwargs) + + return output + + def _cache_key( + self, + func: OpOverload, + args: Tuple[Any, ...], + kwargs: Dict[str, Any], + ) -> _DispatchCacheKey: + """ + Create a cache key given the dispatch args. Raises _BypassDispatchCache + for any situation that precludes caching. + """ + # Avoid caching for any ops that would require a more sophisticated + # caching implementation, e.g., data dependent ops or ops that modify + # the inputs. 
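+        # (For instance, ops tagged data_dependent_output such as
+        # aten._local_scalar_dense, or dynamic_output_shape ops such as
+        # aten.nonzero, cannot be reconstructed from cached metadata alone.)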
+ if torch.Tag.data_dependent_output in func.tags: + raise _BypassDispatchCache("data dependent output") + + if torch.Tag.dynamic_output_shape in func.tags: + raise _BypassDispatchCache("dynamic output shape") + + if torch.Tag.inplace_view in func.tags: + raise _BypassDispatchCache("inplace view") + + if func == aten._unsafe_view.default: + raise _BypassDispatchCache("unsafe view") + + if func in self.lift_fns: + raise _BypassDispatchCache("lift") + + if not torch._library.utils.is_builtin(func): + raise _BypassDispatchCache("non-builtin") + + # In order to handle storage aliasing, we need to establish the alias + # for any view op on a cache hit. But CompositeImplicitAutograd ops may + # or may not alias the input, so just punt on caching these. + if func.is_view and torch._C._dispatch_has_kernel_for_dispatch_key( + func.name(), torch._C.DispatchKey.CompositeImplicitAutograd + ): + raise _BypassDispatchCache("CompositeImplicitAutograd") + + key_values = ( + func, + # Translate any FakeTensor args to metadata. + self._prep_args_for_hash(args) if args else (), + self._prep_args_for_hash(kwargs) if kwargs else (), + # Capture the default_dtype mode since that can affect the output tensor, + # e.g., when operating on constant float values. + torch.get_default_dtype(), + # Capture the current device to support, e.g., cache tensor creation, + # where there isn't necessarily a tensor to take the device from. + torch._C._get_default_device(), + # We want to create tensors from cached metadata only when the inference + # mode is the same. + torch.is_inference_mode_enabled(), + # Shape env settings could affect behavior. One example seen in the wild: + # Disasllowing dynamic shapes can introduce a DynamicOutputShapeException + # where it wasn't seen on a previous instance of the same op. + _ShapeEnvSettings(self.shape_env) if self.shape_env else None, + ) + return _DispatchCacheKey(key_values) + + def _prep_args_for_hash(self, args: Any) -> Any: + """ + Translate the provided args into a form suitable for caching at FakeTensor + dispatch, i.e., convert unhashable types like lists & dicts into tuples and + convert FakeTensors into metadata. Raises _BypassDispatchCache to signal + unsupported cases that should bypass caching. + """ + if isinstance(args, dict): + args = list(args.keys()) + list(args.values()) + + result = [] + for arg in args: + if isinstance(arg, FakeTensor): + if not self.is_our_fake(arg): + raise _BypassDispatchCache("not our fake") + if arg._has_symbolic_sizes_strides: + raise _BypassDispatchCache("symbolic shape") + if arg.constant is not None: + raise _BypassDispatchCache("constant attribute") + if arg.is_sparse: + raise _BypassDispatchCache("sparse tensor") + if is_sparse_compressed(arg): + raise _BypassDispatchCache("sparse compressed tensor") + result.append(extract_tensor_metadata(arg)) + elif isinstance(arg, torch.Tensor): + raise _BypassDispatchCache("non-fake tensor") + elif isinstance(arg, (torch.SymBool, torch.SymInt, torch.SymFloat)): + raise _BypassDispatchCache("symbolic shape") + elif isinstance(arg, (list, tuple, dict)): + result.extend(self._prep_args_for_hash(arg)) + else: + # It's important to capture the type of the arg since, e.g., 1 and 1.0 + # hash to the same value, but can produce different dtypes for the + # output tensor. 
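+                # (e.g., adding the Python int 1 to an integer tensor keeps an
+                # integral dtype, while adding the float 1.0 promotes the
+                # result to a floating-point dtype)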
+ result.append((type(arg), arg)) + + return tuple(result) + + def _make_cache_entry( + self, + key: _DispatchCacheKey, + func: OpOverload, + args: Tuple[Any, ...], + kwargs: Dict[str, Any], + output: FakeTensor, + ) -> _DispatchCacheEntry: + """ + Make a cache entry object for the given 'output' Tensor. Raises + _BypassDispatchCache if the output tensor has characteristics that + prevent caching it. + """ + # Some ops return tuples of Tensors, but it's rare, so avoid + # the complexity of caching other types. + if not isinstance(output, FakeTensor): + raise _BypassDispatchCache("non-FakeTensor output") + + # Avoid caching FakeTensors with constants attached since those + # can be invalidated. + if output.constant is not None: + raise _BypassDispatchCache("constant attribute") + + # TODO: support caching sparse outputs? + if output.is_sparse: + raise _BypassDispatchCache("sparse output") + + if is_sparse_compressed(output): + raise _BypassDispatchCache("sparse compressed output") + + # Can an in-place op really reference a kwarg? If so, then we need + # to extend the implementation to handle it. + for kval in kwargs.values(): + if id(kval) == id(output): + raise _BypassDispatchCache("kwarg aliases output") + + # If this is an in-place op, the entry records which input arg is aliased. + for idx in range(len(args)): + if id(args[idx]) == id(output): + return _DispatchCacheEntry( + inplace_idx=idx, metadata=None, view_idx=None + ) + + # Otherwise, create an entry that records the output tensor's metadata. + view_idx = None + if func.is_view: + idxs = [i for i, t in enumerate(args) if isinstance(t, torch.Tensor)] + assert len(idxs) == 1 + view_idx = idxs[0] + + metadata = extract_tensor_metadata(output) + entry = _DispatchCacheEntry( + inplace_idx=None, metadata=metadata, view_idx=view_idx + ) + + # N.B.: Some checks for bypassing the cache would be performed on the + # output tensor synthesized from the cached metadata. As an optimization, + # we can synthesize a tensor here and do the checks on that instance. + # This approach keeps the (more frequent) cache-hit path as lightweight + # as possible. + synth_output = self._output_from_cache_entry(entry, func, args) + + # Make sure the dispatch_key_set from the synthesized output tensor will + # be the same. + synth_key_set = torch._C._dispatch_key_set(synth_output) + key_set = torch._C._dispatch_key_set(output) + if synth_key_set != key_set: + raise _BypassDispatchCache("dispatch_key_set mismatch") + + return entry + + def _output_from_cache_entry( + self, entry: _DispatchCacheEntry, func: OpOverload, args: Tuple[Any, ...] + ) -> FakeTensor: + """ + Create a new FakeTensor from the cache entry. + """ + if entry.inplace_idx is not None: + # This is an in-place op; return the aliased arg. + return args[entry.inplace_idx] + + # Synthesize a new FakeTensor with the cached metadata. + metadata = entry.metadata + assert not metadata.is_sparse + + empty = torch.empty_strided( + metadata.shape, + metadata.stride, + dtype=metadata.dtype, + layout=metadata.layout, + device="meta", + requires_grad=metadata.requires_grad, + ) + + if metadata.is_conj: + torch._C._set_conj(empty, True) + if metadata.is_neg: + torch._C._set_neg(empty, True) + + if func.is_view: + # For view ops, the storage should be the same as the tensor input. 
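+            # (i.e., a cache-hit view output must alias the same untyped
+            # storage as its input tensor, mirroring the aliasing that the
+            # real view op would create)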
+ storage = args[entry.view_idx].untyped_storage() + with in_kernel_invocation_manager(self): + empty.set_( + storage, metadata.storage_offset, metadata.shape, metadata.stride + ) + elif metadata.storage_offset != 0: + storage = empty.untyped_storage() + with in_kernel_invocation_manager(self): + empty.set_( + storage, metadata.storage_offset, metadata.shape, metadata.stride + ) + + return FakeTensor(self, empty, metadata.device) + + def _crosscheck_cache_output( + self, + output: FakeTensor, + func: OpOverload, + types: Tuple[Any, ...], + args: Tuple[Any, ...], + kwargs: Dict[str, Any], + ): + """ + Helper to validate that the output synthesized from the cache matches + the output created by normal dispatch. + """ + try: + true_output = self._dispatch_impl(func, types, args, kwargs) + except Exception as e: + raise RuntimeError( + f"FakeTensor cache crosscheck failure: func={func}, " + f"args={args}, kwargs={kwargs}: Dispatch raised={e}" + ) from e + try: + assert_metadata_eq(assert_eq, true_output, output) + except Exception as e: + raise RuntimeError( + f"FakeTensor cache crosscheck failure: func={func}, " + f"args={args}, kwargs={kwargs}" + ) from e + + def dispatch(self, func, types, args=(), kwargs=None): + kwargs = kwargs or {} + with no_dispatch(): + log.debug("%s %s %s", func, args, kwargs) + + if func in _DISPATCH_META_HANDLERS: + return _DISPATCH_META_HANDLERS[func](args) + + if log.getEffectiveLevel() <= logging.DEBUG: + log.debug( + "%sFakeTensorMode.__torch_dispatch__: %s", " " * RECURSION_COUNT, func + ) + # NOTE: incr is intentionally unused for a RAII pattern + incr = IncrementRecursionCount() + + # Some attribute queries that can be serviced directly + # See Note [is_coalesced is dispatched] + if func in _DISPATCH_HANDLE_DIRECTLY: + # NB: no_dispatch is ok here too, this func is very simple + with in_kernel_invocation_manager(self): + return func(*args, **kwargs) + + if self.cache_enabled: + return self._cached_dispatch_impl(func, types, args, kwargs) + else: + return self._dispatch_impl(func, types, args, kwargs) + + def _dispatch_impl(self, func, types, args, kwargs): + flat_args, args_spec = pytree.tree_flatten((args, kwargs)) + + flat_arg_fake_tensors = [ + t for t in flat_args if isinstance(t, FakeTensor) and self.is_our_fake(t) + ] + has_symbolic_sizes = any( + i._has_symbolic_sizes_strides for i in flat_arg_fake_tensors + ) or any(isinstance(a, torch.SymInt) for a in flat_args) + + converter = self.fake_tensor_converter + + def maybe_to_constant(t): + if isinstance(t, FakeTensor) and self.is_our_fake(t): + return t.constant + else: + return t + + # To constant propagate through these functions: + # 1, If this is a lift due to a torch.tensor call, + # the input tensor is guaranteed to be a + # constant, so we keep a copy of the original argument along so + # we can query it if we're asked to item() it at some later point. + # (Note that you can always call a lift fn manually, so we do + # have to check if there are any fake tensors!) 
+ # 2, Some functions that allow Python numbers to bind to Tensors, e.g, torch.div + if (func in self.lift_fns and not flat_arg_fake_tensors) or ( + should_allow_numbers_as_tensors(func) + and not has_symbolic_sizes + and not flat_arg_fake_tensors + ): + assert all( + t.constant is not None for t in flat_arg_fake_tensors + ), f"{func} should not have fake inputs without constants" + const_flat_args = [maybe_to_constant(a) for a in flat_args] + const_args, const_kwargs = pytree.tree_unflatten(const_flat_args, args_spec) + out = func(*const_args, **const_kwargs) + if type(out) is torch.Tensor and self.may_turn_const(out): + # NB: not in_kernel_invocation_manager because we're doing real + # compute here + # NB: no_dispatch() here is VERY DANGEROUS (like, segfault + # dangerous) if this is actually a wrapper subclass tensor, + # therefore the exact type test above + with no_dispatch(): + out = out.clone() + return converter(self, out, make_constant=True) + + # See [subclass inputs] below + # NB: If you're seeing a mysterious infinite loop involving fake + # tensor, it might be related to this line. Though I'm not sure + # how you'll know to read this comment, as this line won't show up + # in the stack trace. + unrecognized_types = self.check_for_subclass(flat_args) + if unrecognized_types: + not_implemented_log.debug( + "FakeTensorMode unrecognized subclass(es): %s", unrecognized_types + ) + return NotImplemented + + # if we are in the dispatch mode, we will enter this function even if the inputs + # are not FakeTensors. For now, throw if any non-Fake Tensor inputs + # and just support constructors. + + # this is generated from torch.tensor(), which does not use the + # dispatcher, to allow wrapper subclasses to wrap the new tensor + if func in self.lift_fns: + assert len(kwargs) == 0 and len(args) == 1, f"{args} {kwargs}" + + if type(args[0]) is torch.Tensor: + return converter(self, args[0]) + + # Recompute flat_arg_fake_tensors here again in case some of the inputs + # were real tensors and fakified in validate_and_convert_non_fake_tensors + (flat_args, flat_arg_fake_tensors) = self.validate_and_convert_non_fake_tensors( + func, converter, flat_args, args_spec + ) + del args, kwargs # Invalidated + + # The current constant handling only support tracing systems + # (aot autograd, torchdynamo) where each operation is run consecutively. + # Because each operation is run in order, we can trace out and support + # sequences like: x = torch.tensor(0.); y = x.add_(1) + # Whenver a constant is written to but with inputs that cannot be evaluated + # statically, such as random_(), we invalidate all constants that alias the input + # We will rely on functionalization for use of fake tensors constants as persistent + # objects on an FX Graph. 
+ + # We dispatch size/stride/numel on the FakeTensor not its constant, so bail on inplace_view + all_constant = all(e.constant is not None for e in flat_arg_fake_tensors) + if ( + torch.Tag.nondeterministic_seeded not in func.tags + and torch.Tag.inplace_view not in func.tags + and all_constant + and len(flat_arg_fake_tensors) != 0 + and not has_symbolic_sizes + ): + const_flat_args = [maybe_to_constant(a) for a in flat_args] + const_args, const_kwargs = pytree.tree_unflatten(const_flat_args, args_spec) + + # NB: not in_kernel_invocation_manager(self) as we want to do REAL + # compute + with no_dispatch(): + out = func(*const_args, **const_kwargs) + + flat_out = pytree.tree_leaves(out) + flat_out_tensors = [t for t in flat_out if isinstance(t, torch.Tensor)] + all_constant = all(self.may_turn_const(t) for t in flat_out_tensors) + + if all_constant: + return pytree.tree_map_only( + torch.Tensor, + lambda t: converter(self, t, make_constant=True), + out, + ) + + # we weren't able to turn outputs to constants, + # so invalidate all constants that might be aliases of the outputs + for ten in flat_out_tensors: + converter.invalidate_constant_aliases(ten) + + # we are falling through to running non constant tensors, any input constant that + # is written to must be invalidated + args, kwargs = pytree.tree_unflatten(flat_args, args_spec) + self.invalidate_written_to_constants(func, flat_arg_fake_tensors, args, kwargs) + + # Try for fastpath + if has_symbolic_sizes: + fast_impl = get_fast_op_impls().get(func) + if fast_impl is not None: + return fast_impl(self, *args, **kwargs) + + # If there's a Python meta, prefer that over the decomposition + from torch._decomp import meta_table as meta_table + + if func not in meta_table and not self.cpp_meta_supports_symint(func): + from torch._decomp import decomposition_table + + # Prefer Python decompositions over C++ ones + if func in decomposition_table and ( + has_symbolic_sizes + or ( + # TODO: Remove these exclusions, so that we can remove + # this leg entirely + torch_decomp_decompositions(func) + and all(not e.is_sparse for e in flat_arg_fake_tensors) + ) + ): + with self: + return decomposition_table[func](*args, **kwargs) + + with self: + # Decomposes CompositeImplicitAutograd ops + r = func.decompose(*args, **kwargs) + if r is not NotImplemented: + return r + + # prims already wrap FakeTensor inputs to FakeTensor outputs + # and do device logic, we dont need do anything but run them + # and ensure that Meta kernels are dispatched to (see) + # Fake Tensor Dispatch Keys + # TODO - we should be use the prim aten impl + # TODO - fix prims complex ops + if ( + "prims::" in func._schema.name + and hasattr(func, "prim_meta_impl") + and not stride_incorrect_op(func) + ): + with self: + return func.prim_meta_impl(*args, **kwargs) + + # Users can register FakeTensor rules for custom operators + # Call them if they exist. 
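+        # (These are the abstract/meta kernels registered from Python for
+        # custom ops, e.g. via torch.library.impl_abstract in this version.)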
+ maybe_abstract_impl = torch._library.simple_registry.singleton.find( + func.name() + ).abstract_impl.kernel + if maybe_abstract_impl: + ctx = torch._library.abstract_impl.AbstractImplCtx(self.shape_env, func) + with torch._library.abstract_impl.set_ctx_getter(lambda: ctx), self: + result = maybe_abstract_impl(*args, **kwargs) + return result + + # special handling for funcs registered through `register_op_impl`, + # e.g., manipulating args on constructor calls to construct meta tensors + # and then afterwards wrapping them to a FakeTensor + for run_impl_check, op_impl in op_implementations_checks: + if run_impl_check(func): + op_impl_out = op_impl(self, func, *args, **kwargs) + if op_impl_out != NotImplemented: + return op_impl_out + + def maybe_run_unsafe_fallback(error=None): + # We infer the meta of a custom ops that return None to just + # return None. custom ops are not allowed to mutate metadata + # of their inputs, so this is safe. + if can_generate_trivial_abstract_impl(func): + return None + # no meta kernel registered, fallback to kernel for the device + if has_symbolic_sizes or not self.can_run_unsafe_fallback(func): + raise UnsupportedOperatorException(func) + if error is None: + error = UnsupportedOperatorException(func) + return run_fallback_kernel(self, func, flat_args, args_spec, error) + + # Optimization: If there is no Meta kernel, it takes a surprisingly long + # amount of time to catch the NotImplementedError, so we check it here. + if not has_meta(func): + return maybe_run_unsafe_fallback() + + # run kernel registered to meta for func, which include + # python meta registrations, prims, decomps, and c++ meta fns (structured kernels) + # It's possible that the kernel will return NotImplementedError + try: + with in_kernel_invocation_manager(self): + r = func(*args, **kwargs) + except NotImplementedError as not_implemented_error: + return maybe_run_unsafe_fallback(not_implemented_error) + + return self.wrap_meta_outputs_with_default_device_logic( + r, func, flat_args, device=kwargs.get("device") + ) + + # WARNING: DO NOT add any additional namespaces/operators here if they refer to operators + # outside of the pytorch/pytorch library! Any pre-existing things here + # are either in the pytorch/pytorch library or have been grandfathered in. + # The fallback does not always work and MAY CRASH and emit unreadable error messages + # so it should not be allowed by default. + _can_run_unsafe_fallback_allowed_namespaces = ordered_set( + "debugprims", + "prims", + "aten", + "xla", + "vision", + "torchtext", + "torchaudio", + "quantized", + ) + + def can_run_unsafe_fallback(self, func: OpOverload): + if not self.allow_fallback_kernels: + return False + # It's OK to try the fallback for built-in ops (e.g. aten, prims) + # because we control and test these but the fallback leads to unexpected behavior + # in user-defined custom ops + return ( + func.namespace in self._can_run_unsafe_fallback_allowed_namespaces + or func.name() == "fbgemm::gmm" + ) + + # [subclass inputs] + # Suppose we enable fake tensor mode. This means that fake tensor + # mode will run first. But what if we do an operation that + # involves a tensor subclass that will desugar into normal tensor + # operations? Without returning NotImplemented, fake tensor mode will run first, + # decide that a conversion was made (since there was a non fake + # tensor argument), and report an error that converting non + # fake tensor is not supported. 
What we actually wanted to happen + # was to give the subclass a chance to figure out what it wants to + # before erroring out. Returning NotImplemented here allows this. + def check_for_subclass(self, flat_args): + def check(x): + return ( + isinstance(x, torch.Tensor) + and not isinstance(x, FakeTensor) + and type(x) is not torch.Tensor + and type(x) is not torch.nn.Parameter + ) + + return [type(x) for x in flat_args if check(x)] + + def validate_and_convert_non_fake_tensors( + self, func, converter, flat_args, args_spec + ): + """ + Checks if the list of tensors are fake tensors. + If not, try to convert them to fake tensors. + Returns the original args, kwargs, and a flattened list of (args, kwargs) that are fake tensors. + """ + flat_arg_fake_tensors = [] + + def validate(x): + if not isinstance(x, torch.Tensor): + return x + + nonlocal flat_arg_fake_tensors + if not self.is_our_fake(x): + if torch.Tag.inplace_view in func.tags: + args, kwargs = pytree.tree_unflatten(flat_args, args_spec) + raise Exception( + f"Can't call metadata mutating ops on non-Fake Tensor inputs. Found in {render_call(func, args, kwargs)}" + ) + if not self.allow_non_fake_inputs: + if isinstance(x, FakeTensor) and x.fake_mode is not self: + raise AssertionError("Mixing fake modes NYI") + args, kwargs = pytree.tree_unflatten(flat_args, args_spec) + raise Exception( + f"Please convert all Tensors to FakeTensors first or instantiate FakeTensorMode " + f"with 'allow_non_fake_inputs'. Found in {render_call(func, args, kwargs)}" + ) + + x = converter(self, x) + + flat_arg_fake_tensors.append(x) + return x + + validated_args = [validate(a) for a in flat_args] + return validated_args, flat_arg_fake_tensors + + def wrap_meta_outputs_with_default_device_logic(self, r, func, flat_args, device): + converter = self.fake_tensor_converter + + # Lazily initialized, in case there are no tensor returns + common_device = None + has_scalar_only_inputs = False + + def wrap(e): + nonlocal common_device + nonlocal has_scalar_only_inputs + + if isinstance(e, torch.Tensor) and common_device is None: + ( + common_device, + has_scalar_only_inputs, + ) = FakeTensor._find_common_device(func, flat_args) + + if self.is_our_fake(e): + torch._check( + e.device == common_device, + lambda: f"FakeTensor is wrapped to wrong device, found {e.device}, expected {common_device}", + ) + + if ( + isinstance(e, torch.Tensor) + and not self.is_our_fake(e) + and converter is not None + ): + if has_scalar_only_inputs: + # Under FakeTensorMode, op accepts scalar only inputs, such as aten.add/sub/mul/div, + # returns a real scalar tensor on CPU. See TensorMeta() in _prims/__init__.py for details. + # We thus directly convert real tensor to fake tensor. 
+ return converter(self, e) + else: + return converter.from_meta_and_device( + self, e, device or common_device + ) + else: + return e + + return tree_map(wrap, r) + + _cpp_meta_supports_symint = ordered_set( + aten.empty.memory_format, + aten.empty_strided.default, + aten.as_strided_scatter.default, + aten.as_strided.default, + aten.as_strided_.default, + aten.zeros.default, + aten.detach.default, + aten.view_as_real.default, + aten.view_as_complex.default, + aten.set_.source_Storage_storage_offset, + aten._sparse_coo_tensor_with_dims_and_tensors.default, + ) + + def cpp_meta_supports_symint(self, func): + if torch.Tag.view_copy in func.tags: + return True + return func in self._cpp_meta_supports_symint + + lift_fns = ordered_set(aten.lift_fresh.default, aten.lift_fresh_copy.default) + + def may_turn_const(self, t): + return ( + t.numel() <= CONSTANT_NUMEL_LIMIT + and not t.is_sparse + and not self.is_our_fake(t) + and not t.device.type == "meta" + ) + + def invalidate_written_to_constants( + self, func, flat_arg_fake_tensors, args, kwargs + ): + any_constant = any(e.constant is not None for e in flat_arg_fake_tensors) + schema_info = get_schema_info(func) + if any_constant and schema_info.is_mutable(): + _, new_kwargs = normalize_function( + func, args=args, kwargs=kwargs, normalize_to_only_use_kwargs=True + ) + for k, v in new_kwargs.items(): + k = k if (k != "input" or schema_info.has_argument(k)) else "self" + if ( + self.is_our_fake(v) + and schema_info.is_mutable(k) + and v.constant is not None + ): + self.fake_tensor_converter.invalidate_constant_aliases(v.constant) + + def from_tensor( + self, + tensor, + *, + static_shapes=None, + source: Optional[Source] = None, + symbolic_context=None, + # Setting this flag will force FakeTensorMode to return `None` if attempting to convert a tensor we have not + # seen before. 
+ memoized_only=False,
+ ):
+ shape_env = self.shape_env
+ if static_shapes is None:
+ static_shapes = self.static_shapes
+ if static_shapes:
+ assert (
+ symbolic_context is None
+ ), "cannot set both static_shapes and symbolic_context"
+ shape_env = None
+ # see note [Tensor Fakification and Symbol Caching]
+ if not symbolic_context and not source and not static_shapes:
+ if tracing_context := torch._guards.TracingContext.try_get():
+ if tensor in tracing_context.tensor_to_context:
+ symbolic_context = tracing_context.tensor_to_context[tensor]
+ source = symbolic_context.tensor_source
+ return self.fake_tensor_converter(
+ self,
+ tensor,
+ shape_env=shape_env,
+ source=source,
+ symbolic_context=symbolic_context,
+ memoized_only=memoized_only,
+ )
+
+
+# NB: returns fake tensors
+def run_fallback_kernel(
+ fake_mode, func, flat_args, args_spec, orig_not_implemented_exception
+):
+ # these should all be supported, just to be safe
+ # avoid fallback for operators which inplace modify metadata
+ # because the input fake tensors would be unmodified
+ if torch.Tag.inplace_view in func.tags:
+ raise orig_not_implemented_exception
+
+ inp_impls = {}
+
+ # Don't use in_kernel_invocation_manager(fake_mode) as we want to do
+ # REAL compute (not with meta device)
+ with no_dispatch():
+
+ def to_real_tensor(e):
+ if fake_mode.is_our_fake(e):
+ out = torch.zeros_like(e, device=e.fake_device)
+ if e.is_sparse:
+ out._coalesced_(e.is_coalesced())
+ inp_impls[id(out)] = e
+ return out
+ return e
+
+ flat_args = [to_real_tensor(a) for a in flat_args]
+ args, kwargs = pytree.tree_unflatten(flat_args, args_spec)
+
+ r = func(*args, **kwargs)
+
+ tensor_impls = set()
+ storages = set()
+
+ for e in flat_args:
+ if isinstance(e, torch.Tensor):
+ if not e.is_sparse:
+ storages.add(e._typed_storage()._cdata)
+
+ # TODO: also check metadata change on inputs
+ # proper aliasing/metadata relationship between outputs and inputs will
+ # not be set up, because of conversion to device, unless we can reuse an
+ # input impl
+
+ def map_out(e):
+ if id(e) not in inp_impls and (
+ isinstance(e, torch.Tensor)
+ and not e.is_sparse
+ and e._typed_storage()._cdata in storages
+ ):
+ raise orig_not_implemented_exception
+
+ if isinstance(e, torch.Tensor):
+ if id(e) in inp_impls:
+ return inp_impls[id(e)]
+ else:
+ return fake_mode.fake_tensor_converter(fake_mode, e)
+ else:
+ return e
+
+ return pytree.tree_map(map_out, r)
+
+
+def can_generate_trivial_abstract_impl(op: torch._ops.OpOverload) -> bool:
+ assert isinstance(op, torch._ops.OpOverload)
+ if torch._library.utils.is_builtin(op):
+ # We control the built-ins. These may (in rare cases)
+ # do input metadata mutation (which we have banned on custom ops)
+ return False
+ schema = op._schema
+ # It's suspicious if the op is not mutable but returns nothing, so we return False out of an abundance of caution
+ if not schema.is_mutable:
+ return False
+ if len(schema.returns) > 0:
+ return False
+ # If the op returns nothing, then it has a trivial abstract impl.
+ return True + + +# Just for use to allow copying a module to fake tensors, +# does not apply elsewhere +class FakeCopyMode(TorchFunctionMode): + def __init__(self, fake_mode): + self.fake_mode = fake_mode + + def __torch_function__(self, func, types, args=(), kwargs=None): + kwargs = kwargs if kwargs else {} + + # clone will get called in Parameter deepcopy + if func == torch._C.TensorBase.clone: + return func( + self.fake_mode.from_tensor(args[0], static_shapes=True), **kwargs + ) + elif func == torch.Tensor.__deepcopy__: + assert len(args) == 2 and len(kwargs) == 0 + tensor, memo = args + + if id(tensor) in memo: + return memo[id(tensor)] + + out = self.fake_mode.from_tensor(tensor, static_shapes=True) + memo[id(tensor)] = out + return out + else: + with torch._C.DisableTorchFunctionSubclass(): + return func(*args, **kwargs) + + +def _device_handler(args): + # NB: Don't use is_our_fake, just serve the fake information + # as is. Notice we don't use 'self'; we use args[0].fake_mode + # because they may not be the same. It would also be possible + # to return NotImplemented here, in which case the FakeTensor + # handler on args[0] would handle it, but we're being nice and + # short-circuiting quickly. + assert len(args) == 1 and isinstance(args[0], FakeTensor) + if args[0].fake_mode.in_kernel_invocation: + return torch.device("meta") + else: + return args[0].fake_device + + +_DISPATCH_META_HANDLERS = { + torch.ops.prim.device.default: _device_handler, + torch.ops.aten.size.default: lambda args: tuple(int(s) for s in args[0].size()), + torch.ops.aten.stride.default: lambda args: tuple(int(s) for s in args[0].stride()), + torch.ops.aten.storage_offset.default: lambda args: int(args[0].storage_offset()), +} + +_DISPATCH_HANDLE_DIRECTLY = ordered_set( + torch.ops.aten.is_coalesced.default, + torch.ops.aten.dense_dim.default, + torch.ops.aten.sparse_dim.default, +) + +from torch._subclasses.fake_impls import ( # noqa: F401 + _device_not_kwarg_ops, # noqa: F401 + _is_tensor_constructor, # noqa: F401 + _like_tensor_constructors, # noqa: F401 + contains_tensor_types, # noqa: F401 + get_fast_op_impls, + has_meta, + op_implementations_checks, + stride_incorrect_op, +) diff --git a/MLPY/Lib/site-packages/torch/_subclasses/fake_utils.py b/MLPY/Lib/site-packages/torch/_subclasses/fake_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1c7fc4e85946b49ee558ac1898dfc8b41e5d5849 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_subclasses/fake_utils.py @@ -0,0 +1,190 @@ +# mypy: ignore-errors + +import functools +import warnings +from typing import Callable, Union + +import torch +import torch.utils._pytree as pytree +from torch._ops import OpOverload +from torch._subclasses.fake_tensor import ( + FakeTensorMode, + tree_flatten_only, + UnsupportedFakeTensorException, +) +from torch.utils._python_dispatch import TorchDispatchMode + + +aten = torch._ops.ops.aten + + +def outputs_alias_inputs(outputs, inputs): + input_storages = { + inp._typed_storage()._cdata + for inp in tree_flatten_only(torch.Tensor, inputs) + if torch._C._has_storage(inp) + } + return any( + torch._C._has_storage(out) and out._typed_storage()._cdata in input_storages + for out in tree_flatten_only(torch.Tensor, outputs) + ) + + +def outputs_are_inputs(outputs, inputs): + input_ids = {id(inp) for inp in tree_flatten_only(torch.Tensor, inputs)} + return any(id(out) in input_ids for out in tree_flatten_only(torch.Tensor, outputs)) + + +def output_alias_each_other(outputs): + storages = set() + for out in 
tree_flatten_only(torch.Tensor, outputs): + if not torch._C._has_storage(out): + continue + stor = out._typed_storage()._cdata + if stor in storages: + return True + storages.add(stor) + return False + + +def is_sdpa_error(func, idx, e): + if ( + ( + func is aten._scaled_dot_product_flash_attention.default + or func is aten._flash_attention_forward.default + ) + and idx in (6, 7) + and "Devices" in repr(e) + ): + return True + if ( + ( + func is aten._scaled_dot_product_efficient_attention.default + or func is aten._efficient_attention_forward.default + ) + and idx in (2, 3) + and "Devices" in repr(e) + ): + return True + return False + + +class CrossRefFakeMode(TorchDispatchMode): + def __init__( + self, + ignore_op_fn: Union[Callable[[OpOverload], bool], None] = None, + *, + check_strides=True, + check_aliasing=True, + ): + self.ignore_op_fn = ( + ignore_op_fn if ignore_op_fn is not None else lambda fn: False + ) + self.check_strides = check_strides + self.check_aliasing = check_aliasing + + def __torch_dispatch__(self, func, types, args=(), kwargs=None): + kwargs = kwargs or {} + + fake_r = None + + # empty_like excluded for now due to sparse complex + # aten._to_dense.default this one is getting called with csc + if ( + func + not in ( + aten.lift_fresh.default, + aten.lift_fresh_copy.default, + aten.set_.source_Storage_storage_offset, + ) + and not self.ignore_op_fn(func) + and torch.Tag.dynamic_output_shape not in func.tags + and torch.Tag.inplace_view not in func.tags + and torch.Tag.data_dependent_output not in func.tags + ): + # Do not import symbolic_shapes at the top of the module as it imports sympy and that's slow + from torch.fx.experimental.symbolic_shapes import ShapeEnv + + try: + # TODO: enable_python_dispatcher() here + with FakeTensorMode(shape_env=ShapeEnv()) as fake_mode: + fake_args, fake_kwargs = pytree.tree_map_only( + torch.Tensor, + functools.partial(fake_mode.from_tensor, static_shapes=True), + (args, kwargs), + ) + with warnings.catch_warnings(): + fake_r = func(*fake_args, **fake_kwargs) + except UnsupportedFakeTensorException: + pass + + context = ( + f"When comparing the output of {func} on FakeTensor and concrete Tensors, " + f"found" + ) + r = func(*args, **kwargs) + if fake_r is not None: + r_flat = pytree.tree_leaves(r) + f_flat = pytree.tree_leaves(fake_r) + assert len(f_flat) == len( + r_flat + ), f"{context} mismatch in number of returns {len(f_flat)} != {len(r_flat)}" + + if self.check_aliasing: + r_aliasing = outputs_alias_inputs(r, (args, kwargs)) + f_aliasing = outputs_alias_inputs(fake_r, (fake_args, fake_kwargs)) + assert ( + r_aliasing == f_aliasing + ), f"{context} mismatch in outputs_alias_inputs check {f_aliasing} != {r_aliasing}" + + r_identity_eq = outputs_are_inputs(r, (args, kwargs)) + f_identity_eq = outputs_are_inputs(fake_r, (fake_args, fake_kwargs)) + assert ( + r_identity_eq == f_identity_eq + ), f"{context} mismatch in outputs_are_inputs check {f_identity_eq} != {r_identity_eq}" + + r_output_alias_each_other = output_alias_each_other(r) + f_output_alias_each_other = output_alias_each_other(fake_r) + assert r_output_alias_each_other == f_output_alias_each_other, ( + f"{context} mismatch in outputs_alias_each_other check " + f"{f_output_alias_each_other} != {r_output_alias_each_other}" + ) + + for idx, (r_out, fake_out) in enumerate( + zip(pytree.tree_leaves(r), pytree.tree_leaves(fake_r)) + ): + r_is_ten = isinstance(r_out, torch.Tensor) + assert r_is_ten == isinstance( + fake_out, torch.Tensor + ), f"{context} mismatched number of 
tensor outputs" + if r_is_ten: + assert r_out.requires_grad == fake_out.requires_grad, ( + f"{context} mismatched requires_grad-ness of outputs. " + f"This usually means that you have added autograd support " + f"for your operator at a dispatch key other than Autograd, " + f"which will lead to problems" + ) + if torch._C._has_storage(r_out): + r_offset = r_out.storage_offset() + f_offset = fake_out.storage_offset() + assert ( + r_offset == f_offset + ), f"{context} mismatched storage offset" + + try: + torch._prims.utils.compare_tensor_meta( + r_out, + fake_out, + check_strides=self.check_strides, + allow_rhs_unbacked=True, + ) + except Exception as e: + if is_sdpa_error(func, idx, e): + continue + error_message = ( + f"{context} mismatched tensor metadata: {e}" + if len(r_flat) == 1 + else f"{context} mismatched tensor metadata for output[{idx}]: {e}" + ) + raise RuntimeError(error_message) from e + return r diff --git a/MLPY/Lib/site-packages/torch/_subclasses/functional_tensor.py b/MLPY/Lib/site-packages/torch/_subclasses/functional_tensor.py new file mode 100644 index 0000000000000000000000000000000000000000..67b5215a6a2d4195e4cbf0f7e209565237e42acd --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_subclasses/functional_tensor.py @@ -0,0 +1,653 @@ +import contextlib +from abc import ABC, abstractmethod +from typing import Any, Callable, ContextManager, Dict, Optional, Tuple + +import torch +import torch.utils._pytree as pytree +from torch._C import _functionalization_reapply_views_tls as _reapply_views +from torch._ops import _get_dispatch_mode_pre_dispatch +from torch.utils._python_dispatch import ( + _detect_functional_mode, + _disable_infra_mode, + return_and_correct_aliasing, + TorchDispatchMode, +) + +not_implemented_log = torch._logging.getArtifactLogger(__name__, "not_implemented") + + +class FunctionalTensor(torch.Tensor): + """ + Functional tensors represent tensors that will remove mutations + from a program. If you perform a mutable operation on a functional tensor, + it will re-dispatch to the functional variant of that operation. + + Historically, functionalization is implemented in C++ in the dispatcher. + This class is a lightweight python shim around the C++ functionalization logic. + + FunctionalTensor is required to be used with a corresponding + FunctionalTensormode active, because it relies + on using the mode for dispatch (which can properly handle factory functions). + """ + + elem: torch.Tensor + # Indicates to our torch_dispatch dispatching infra that + # this is an "infra" mode with lower dispatching precedence. + _mode_key = torch._C._TorchDispatchModeKey.FUNCTIONAL + + # Note: The reason we add these extra keys to our FunctionalTensor subclass + # is to mirror the behavior of C++ functionalization (we can choose to change this + # later, as long as it doesn't break anything). + # FunctionalTensorWrapper copies **all** dispatch keys from the inner tensor + # to the wrapper, excluding functorch and python dispatch keys. + # Here I'm trying to re-use the keyset the functorch wrapper subclasses copy, + # except that they don't include ZeroTensor so I'm manually adding it in. + _extra_dispatch_keys = torch._C._additional_keys_to_prop_for_wrapper_tensors.add( + torch._C.DispatchKey.ZeroTensor + ) + + # These are all aten ops that correspond to metadata queries. + # We want FunctionalTensor to be able to handle them directly. 
+ metadata_fns = [ + torch.ops.aten.is_contiguous.default, # type: ignore[has-type] + torch.ops.aten.is_contiguous.memory_format, # type: ignore[has-type] + torch.ops.aten.is_strides_like_format.default, # type: ignore[has-type] + torch.ops.aten.is_non_overlapping_and_dense.default, # type: ignore[has-type] + torch.ops.aten.size.default, # type: ignore[has-type] + torch.ops.aten.sym_size.default, # type: ignore[has-type] + torch.ops.aten.stride.default, # type: ignore[has-type] + torch.ops.aten.sym_stride.default, # type: ignore[has-type] + torch.ops.aten.storage_offset.default, # type: ignore[has-type] + torch.ops.aten.sym_storage_offset.default, # type: ignore[has-type] + torch.ops.aten.numel.default, # type: ignore[has-type] + torch.ops.aten.sym_numel.default, # type: ignore[has-type] + torch.ops.aten.dim.default, # type: ignore[has-type] + torch.ops.prim.device.default, # type: ignore[has-type] + ] + + # These are ops that claim to be functional, but actually are maybe-mutating/maybe-aliasing + # TODO (tmanlaibaatar) make it a tag + maybe_aliasing_or_mutating_ops = [ + torch.ops.aten.dropout.default, # type: ignore[has-type] + torch.ops.aten.batch_norm.default, # type: ignore[has-type] + torch.ops.aten.native_batch_norm.default, # type: ignore[has-type] + torch.ops.aten._batch_norm_impl_index.default, # type: ignore[has-type] + torch.ops.aten.cudnn_batch_norm.default, # type: ignore[has-type] + torch.ops.aten.miopen_batch_norm.default, # type: ignore[has-type] + ] + + def __new__(cls, elem): + assert torch._is_functional_tensor(elem) + + # In general, we'd like our functional tensor subclass to only be in charge of functionalization, + # and defer to the inner subclass for all other functionality. + # Example: If our inner tensor is a ZeroTensor, we would want to defer running the ZeroTensor fallback + # until after we redispatch to our inner ZeroTensor. + # However, there are a few keys that we need to mirror between the inner and outer tensors. + # Conjugate + # Negative + # Why? These keys are used to test metadata queries, like `.is_conj()` and `.is_neg()`. + # We **need** calls to is_conj() to return the same thing on the outer and inner tensors, + # Because user code / framework code that branches like so needs to do the same thing + # when it sees the outer FunctionalTensor: + # if (x.is_conj()) { + # return at::view_as_real(x.resolve_conj()); + # } else { + # return at::view_as_real(x); + # } + extra_dispatch_keys = ( + FunctionalTensor._extra_dispatch_keys & torch._C._dispatch_keys(elem) + ) + + out = torch.Tensor._make_wrapper_subclass( # type: ignore[arg-type, attr-defined] + # TODO: right now, _make_wrapper_subclass's dynamic shape interaction is not great. + # Calling the overload that has kwargs causes us to go down the first overload path, + # which will **always** specialize sizes. + # We should probably eventually fix this so that the first overload can just handle dynamic shapes. 
+ cls,
+ elem.shape, # sizes
+ elem.stride(), # strides
+ elem.storage_offset(), # storage_offset
+ None, # memory_format
+ elem.dtype, # dtype
+ elem.layout, # layout
+ elem.device, # device
+ False, # pin_memory
+ elem.requires_grad, # requires_grad
+ "sizes", # dispatch_sizes_strides_policy
+ False, # dispatch_device
+ False, # dispatch_layout
+ extra_dispatch_keys, # _extra_dispatch_keys
+ )
+ out.elem = elem
+ return out
+
+ def __torch_dispatch__(self, func, types, args=(), kwargs=None):
+ unrecognized_types = [
+ t
+ for t in types
+ if t not in [torch.Tensor, torch._subclasses.FakeTensor, FunctionalTensor]
+ ]
+ if unrecognized_types:
+ not_implemented_log.debug(
+ "FunctionalTensor unrecognized subclass(es): %s", unrecognized_types
+ )
+ return NotImplemented
+
+ if kwargs is None:
+ kwargs = {}
+
+ # FunctionalTensor needs to plumb all metadata requests to the inner tensor.
+ # In theory we don't have to do this - but if we want to service metadata requests here,
+ # we need to carefully make sure all metadata is accurate (including metadata mutations)
+ if func in FunctionalTensor.metadata_fns:
+ # All metadata accesses should be plumbed to the inner tensor, that way we don't have to worry
+ # about the problem of keeping metadata in sync between the wrapper and inner tensor.
+ # This also alleviates us from having to manually handle metadata mutations on the wrapper.
+ assert len(kwargs) == 0
+ if func in [
+ torch.ops.aten.is_strides_like_format.default,
+ torch.ops.aten.is_contiguous.memory_format,
+ ]:
+ assert len(args) == 2 and isinstance(args[0], FunctionalTensor)
+ return func(args[0].elem, args[1])
+ assert len(args) == 1 and isinstance(args[0], FunctionalTensor)
+
+ return func(args[0].elem)
+ # Originally I tried to implement my subclass without giving it a torch_dispatch, but I gave up:
+ # - _make_wrapper_subclass requires a __torch_dispatch__
+ # - If we want to use _make_subclass(), we have a problem: the subclass will share a TensorImpl with the inner tensor,
+ # which is of type FunctionalTensorWrapper! We explicitly do not want our wrapper to be a FunctionalTensorWrapper.
+ # - If we use the default tensor.__new__(), we have another problem: it returns inner_tensor.alias(),
+ # which causes every subclass created above autograd to have autograd view metadata
+ # (in addition to also being a FunctionalTensorWrapper).
+ raise RuntimeError(
+ "Attempting to use FunctionalTensor on its own. Instead, please use it with a corresponding FunctionalTensorMode()"
+ )
+
+ def __repr__(self):
+ return f"FunctionalTensor({repr(self.elem)})"
+
+ @staticmethod
+ def to_functional(x):
+ # We will do the wrapping for the user.
+ assert not torch._is_functional_tensor(x)
+ # The only autograd metadata we care about on the FunctionalTensor is:
+ # - requires_grad (so autograd runs)
+ # - is_leaf (so that mutations on graph inputs that are not leaves are allowed by the autograd engine)
+ # this is handled by FunctionalTensor.to_functional
+ x_functional = torch._to_functional_tensor(x)
+ # Technically the FunctionalTensorMode here is unnecessary,
+ # but it avoids spurious NotImplemented logs during `ProxyTorchDispatchMode` tracing.
+ # _mirror_autograd_meta_to queries tensor sizes, + # and otherwise the sym_size() call will go to the proxy mode before hitting + # FunctionalTensor.__torch_dispatch__ + + functional_mode = _detect_functional_mode() + assert functional_mode is not None + + with functional_mode: + torch._mirror_autograd_meta_to(x, x_functional) # type: ignore[attr-defined] + out = FunctionalTensor(x_functional) + torch._mirror_autograd_meta_to(x_functional, out) # type: ignore[attr-defined] + return out + + def from_functional(self): + torch._sync(self) + return torch._from_functional_tensor(self.elem) + + def replace_(self, output) -> None: + torch._functionalize_replace(self.elem, output) + + def commit_update(self) -> None: + torch._functionalize_commit_update(self.elem) + + def sync(self) -> None: + torch._functionalize_sync(self.elem) + + def mark_mutation_hidden_from_autograd(self) -> None: + torch._functionalize_mark_mutation_hidden_from_autograd(self.elem) + + def tolist(self) -> Any: + if self.elem.dim() == 0: + return self.elem.item() + elif self.elem.dim() == 1: + return [elem.item() for elem in self.elem] + else: + return [elem.tolist() for elem in self.elem] + + +class FunctionalTensorMode(TorchDispatchMode): + def __init__(self, pre_dispatch=False, export=False, _allow_token_discovery=False): + self.export = export + self.is_on_stack = False + self.enter_stack = [] + # Indicates to our torch_dispatch dispatching infra that + # this is an "infra" mode with lower dispatching precedence. + self._mode_key = torch._C._TorchDispatchModeKey.FUNCTIONAL + self.pre_dispatch = pre_dispatch + # This will be turned off later for pre-dispatch functionalization + self._dispatch_key = torch._C.DispatchKey.PreDispatch if pre_dispatch else None # type: ignore[attr-defined] + # Map of effect type (ex. _EffectType.ORDERED) to a token. The tokens help keep + # track of the ordering between side effectful operations. + self._tokens: Dict[Any, torch.Tensor] = {} + + # Functionalization runs twice in AOTAutograd, once in + # `run_functionalized_fw_and_collect_metadata` to collect metadata to + # see which tensors need to be functionalized and discover how many + # tokens we need, and another time in `make_fx` which does the actual + # tracing to replace ops with their functional variants and handling + # side-effectful ops. In the second stage there should be no token + # discovery. This flag distinguishes between the two stages. 
+ self._allow_token_discovery = _allow_token_discovery + + # No-op if FunctionalTensorMode is already in use + def __enter__(self): + def _get_prev_mode(): + if self._dispatch_key == torch._C.DispatchKey.PreDispatch: + return _get_dispatch_mode_pre_dispatch( + torch._C._TorchDispatchModeKey.FUNCTIONAL + ) + return torch._C._get_dispatch_mode( + torch._C._TorchDispatchModeKey.FUNCTIONAL + ) + + if _get_prev_mode() is None: + self.enter_stack.append(True) + return super().__enter__() + else: + self.enter_stack.append(False) + return self + + def __exit__(self, a, b, c): + is_on_stack = self.enter_stack.pop() + if is_on_stack: + super().__exit__(a, b, c) + + def __torch_dispatch__(self, func, types, args=(), kwargs=None): + if kwargs is None: + kwargs = {} + + unrecognized_types = [ + t + for t in types + if not issubclass(t, torch._subclasses.FakeTensor) + and t not in [torch.Tensor, FunctionalTensor] + ] + if unrecognized_types: + not_implemented_log.debug( + "FunctionalTensor unrecognized subclass(es): %s", unrecognized_types + ) + return NotImplemented + + def _can_decompose(func): + # See https://github.com/pytorch/pytorch/pull/115258#issuecomment-1900755832 + # We never decompose dropout in export + if self.export and func == torch.ops.aten.dropout.default: + return False + # TODO (tmanlaibaatar) + # Eventually, we don't want to decompose any aten op at all + # but there is a safety and coverage gap that we need to close + # before that. + # + # (1) the "safety" is what we are risking with this PR + # (we are blindly taking every op that advertises as + # functional and sending it to the functional fallback. + # We risk silent correctness if we have an op that lies about its schema, + # that we didn't manually hardcode above) Therefore we always decompose them + # (2) the "not every composite inplace op has a functional variant" is a coverage gap, + # but not really a safety risk, since we'll loudly error when we try to generate + # functionalization kernels for these new (composite) inplace/view ops. But until we + # establish such gap more concretely, we still decompose them + if self._dispatch_key is not None: + # it is unsafe to not decompose ops that claim to be functional but actually aren't + if func in FunctionalTensor.maybe_aliasing_or_mutating_ops: + return True + # only decompose view or inplace mutating ops + alias_info = len( + [i for i in func._schema.arguments if i.alias_info is not None] + ) + return alias_info != 0 or func._schema.is_mutable + return True + + if ( + func not in FunctionalTensor.metadata_fns + and _can_decompose(func) + # Not all funcs from __torch_dispatch__ are actual dispatcher ops, + # e.g. prim.device + and torch._C._dispatch_has_kernel(func.name()) + ): + with self: + r = func.decompose(*args, **kwargs) + if r is not NotImplemented: + return r + + def assert_is_functional(x): + assert torch._is_functional_tensor(x) + + def wrap(x): + # Only wrap our outputs in subclasses if the inner functionalization call + # also wrapped outputs into FunctionalTensorWrappers. + # When can this happen? e.g. 
`torch.div(2, 2)` + assert not isinstance(x, FunctionalTensor) + if isinstance(x, torch.Tensor) and torch._is_functional_tensor(x): + return FunctionalTensor(x) + return x + + def unwrap(x): + return x.elem + + from torch._higher_order_ops.auto_functionalize import ( + can_auto_functionalize, + do_auto_functionalize, + ) + + if can_auto_functionalize( + func + ) and not torch._C._dispatch_has_kernel_for_dispatch_key( + func.name(), torch._C.DispatchKey.Functionalize + ): + if self.pre_dispatch: + raise NotImplementedError( + "Auto functionalization is not supported on pre-dispatch tracing" + ) + return do_auto_functionalize(func, args, kwargs) + + from torch._higher_order_ops.effects import handle_effects, has_effects + + if has_effects(func, args, kwargs): + assert not torch._C._dispatch_has_kernel_for_dispatch_key( + func.name(), torch._C.DispatchKey.Functionalize + ) + return handle_effects( + self._allow_token_discovery, self._tokens, func, args, kwargs + ) + + args_unwrapped, kwargs_unwrapped = pytree.tree_map_only( + FunctionalTensor, unwrap, (args, kwargs) + ) + + # Expectation: functionalization should not **already** be enabled above our mode. + # Why would that be bad? when we return a FunctionalTensor here, we don't want functionalization + # to run above this mode and further wrap that output in **another** C++ FunctionalTensorWrapper. + is_included = torch._C._dispatch_tls_is_dispatch_key_included( + torch._C.DispatchKey.Functionalize + ) + is_excluded = torch._C._dispatch_tls_is_dispatch_key_excluded( + torch._C.DispatchKey.Functionalize + ) + assert is_excluded or not is_included + include_to_set = ( + torch._C._dispatch_tls_local_include_set() + | torch._C.DispatchKeySet(torch._C.DispatchKey.Functionalize) + ) + exclude_to_set = ( + torch._C._dispatch_tls_local_exclude_set().remove( + torch._C.DispatchKey.Functionalize + ) + - FunctionalTensor._extra_dispatch_keys + ) + + # All we want to do here is re-use the existing C++ functionalization logic. + # This requires swizzling our TLS dispatch keys so that the Functionalize key is active. + with torch._C._ForceDispatchKeyGuard(include_to_set, exclude_to_set): + try: + # By default for python functionalization (for AOTAutograd), we reapply views. + old_apply_views = torch._functionalize_enable_reapply_views(True) # type: ignore[attr-defined] + + # Sometimes these functions cannot be directly dispatched to functionalize key + # because args are sometimes not functional tensors for some reason? + if func in FunctionalTensor.metadata_fns: + outs_unwrapped = func(*args_unwrapped, **kwargs_unwrapped) + outs_wrapped = pytree.tree_map_only( + torch.Tensor, wrap, outs_unwrapped + ) + else: + # When we dispatch to the C++ functionalization kernel, we might need to jump back to the + # PreDispatch mode stack afterwards, to handle any other PreDispatch modes underneath + # FunctionalTensorMode. 
If we call func() directly, we would need to exclude PreDispatch + # from the TLS in order to avoid infinite looping, but this would prevent us from coming + # back to PreDispatch later + outs_unwrapped = func._op_dk( + torch._C.DispatchKey.Functionalize, + *args_unwrapped, + **kwargs_unwrapped, + ) + # We don't allow any mutation on result of dropout + if self.export and func == torch.ops.aten.dropout.default: + torch._freeze_functional_tensor(outs_unwrapped) # type: ignore[attr-defined] + outs_wrapped = pytree.tree_map_only( + torch.Tensor, wrap, outs_unwrapped + ) + finally: + torch._disable_functionalization() + torch._functionalize_enable_reapply_views(old_apply_views) # type: ignore[attr-defined] + + is_included = torch._C._dispatch_tls_is_dispatch_key_included( + torch._C.DispatchKey.Functionalize + ) + is_excluded = torch._C._dispatch_tls_is_dispatch_key_excluded( + torch._C.DispatchKey.Functionalize + ) + assert is_excluded or not is_included + + if ( + # If no outputs are our functional subclass, then don't try to fix up aliasing + not any( + isinstance(x, FunctionalTensor) + for x in pytree.tree_leaves(outs_wrapped) + ) + # Since lift_fresh lifts its argument into a functional tensor, we can skip the + # aliasing correction step. Otherwise, we would be setting the storage of a + # lifted tensor to that of an unlifted tensor. + # Ref: https://github.com/pytorch/pytorch/issues/111506 + or func == torch.ops.aten.lift_fresh.default + ): + return outs_wrapped + # Wrapper tensor subclasses do not have correct aliasing info! Use this util to manually correct the output aliasing. + # inplace ops like `aten.add_()` are expected to return inputs **directly**, instead of creating fresh tensor objects. + # Use this util to figure out the right thing to return. + # If none of our inputs were wrapped, then we have no FunctionalTensor outputs that we need to fix up storages for. + return return_and_correct_aliasing(func, args, kwargs, outs_wrapped) + + +@contextlib.contextmanager +def disable_functional_mode(): + return _disable_infra_mode(torch._C._TorchDispatchModeKey.FUNCTIONAL) + + +# This is similar to torch.func.functionalize, but: +# - It uses FunctionalTensorMode, and FunctionalTensor (a python subclass). +# One important advantage to using this mode is that it will let us +# run functionalization underneath __torch_dispatch__, +# which we need in AOTAutograd. +# - Doing so means that it does not automatically compose with other +# functorch transforms, since these transforms always run above __torch_dispatch__. +# That's why this util lives here, and not in functorch. 
+def dispatch_functionalize(func, mode: FunctionalTensorMode = FunctionalTensorMode()): + # TODO: pull these from aot autograd + def to_fun(t): + if isinstance(t, torch.Tensor): + return FunctionalTensor.to_functional(t) + return t + + def from_fun(t): + if not isinstance(t, FunctionalTensor): + # quick sanity assert + if isinstance(t, torch.Tensor): + assert not torch._is_functional_tensor(t) + return t + torch._sync(t) + return torch._from_functional_tensor(t.elem) + + def inner(*args, **kwargs): + disable_above = torch._C._ExcludeDispatchKeyGuard( + torch._C.DispatchKeySet(torch._C.DispatchKey.Functionalize) + ) + with disable_above, mode: + func_args = pytree.tree_map_only(torch.Tensor, to_fun, args) + func_kwargs = pytree.tree_map_only(torch.Tensor, to_fun, kwargs) + func_outputs = func(*func_args, **func_kwargs) + outputs = pytree.tree_map_only(FunctionalTensor, from_fun, func_outputs) + + return outputs + + return inner + + +class BaseFunctionalizeAPI(ABC): + @abstractmethod + def wrap_tensors(self, args: Tuple[Any]) -> Tuple[Any]: + pass + + @abstractmethod + def unwrap_tensors(self, args: Tuple[Any]) -> Tuple[Any]: + pass + + @abstractmethod + def functionalize(self, inner_f: Callable) -> Callable: + pass + + @abstractmethod + def redispatch_to_next(self) -> ContextManager: + pass + + @abstractmethod + def replace(self, input_tensor, output_tensor) -> None: + pass + + @abstractmethod + def commit_update(self, tensor) -> None: + pass + + @abstractmethod + def sync(self, tensor) -> None: + pass + + @abstractmethod + def mark_mutation_hidden_from_autograd(self, tensor) -> None: + pass + + +class PythonFunctionalizeAPI(BaseFunctionalizeAPI): + def __init__( + self, mode: Optional[FunctionalTensorMode] = None, pre_dispatch: bool = False + ) -> None: + super().__init__() + self.mode = mode if mode else FunctionalTensorMode() + self.pre_dispatch = pre_dispatch + + def wrap_tensors(self, args: Tuple[Any]) -> Tuple[Any]: + with self.mode: + return torch.utils._pytree.tree_map_only( + torch.Tensor, FunctionalTensor.to_functional, args + ) + + def unwrap_tensors(self, args: Tuple[Any]) -> Tuple[Any]: + return torch.utils._pytree.tree_map_only( + FunctionalTensor, FunctionalTensor.from_functional, args + ) + + def functionalize(self, inner_f: Callable) -> Callable: + return dispatch_functionalize(inner_f, self.mode) + + def redispatch_to_next(self) -> ContextManager: + # [NOTE] We don't do anything here because at the time + # we exercise this path, we would have already popped the + # FunctionalTensorMode from mode stack. Since FunctionalTensorMode + # is now stateful, it is better to explicitly pass in correct mode + # directly instead of globally setting it. 
+ return contextlib.nullcontext() + + def replace(self, input_tensor, output_tensor) -> None: + assert isinstance(input_tensor, FunctionalTensor) + assert not isinstance(output_tensor, FunctionalTensor) + input_tensor.replace_(output_tensor) + + def commit_update(self, tensor) -> None: + assert isinstance(tensor, FunctionalTensor) + tensor.commit_update() + + def sync(self, tensor) -> None: + assert isinstance(tensor, FunctionalTensor) + tensor.sync() + + def mark_mutation_hidden_from_autograd(self, tensor) -> None: + assert isinstance(tensor, FunctionalTensor) + tensor.mark_mutation_hidden_from_autograd() + + +class CppFunctionalizeAPI(BaseFunctionalizeAPI): + def wrap_tensors(self, args: Tuple[Any]) -> Tuple[Any]: + from torch._functorch.eager_transforms import _wrap_all_tensors_to_functional + + return _wrap_all_tensors_to_functional(args, level=0) + + def unwrap_tensors(self, args: Tuple[Any]) -> Tuple[Any]: + from torch._functorch.eager_transforms import ( + _unwrap_all_tensors_from_functional, + ) + + return _unwrap_all_tensors_from_functional(args, reapply_views=_reapply_views()) + + def functionalize(self, inner_f: Callable) -> Callable: + return torch.func.functionalize(inner_f) + + def redispatch_to_next(self) -> ContextManager: + return torch._C._ExcludeDispatchKeyGuard( + torch._C.DispatchKeySet(torch._C.DispatchKey.Functionalize) + ) + + def replace(self, input_tensor, output_tensor) -> None: + torch._functionalize_replace(input_tensor, output_tensor) + + def commit_update(self, tensor) -> None: + torch._functionalize_commit_update(tensor) + + def sync(self, tensor) -> None: + torch._functionalize_sync(tensor) + + def mark_mutation_hidden_from_autograd(self, tensor) -> None: + torch._functionalize_mark_mutation_hidden_from_autograd(tensor) + + +class FunctorchFunctionalizeAPI(BaseFunctionalizeAPI): + def __init__(self, interpreter): + self.interpreter = interpreter + + def wrap_tensors(self, args: Tuple[Any]) -> Tuple[Any]: + from torch._functorch.eager_transforms import _wrap_all_tensors_to_functional + + return _wrap_all_tensors_to_functional(args, level=self.interpreter.level()) + + def unwrap_tensors(self, args: Tuple[Any]) -> Tuple[Any]: + from torch._functorch.eager_transforms import ( + _unwrap_all_tensors_from_functional, + ) + + return _unwrap_all_tensors_from_functional( + args, reapply_views=self.interpreter.functionalize_add_back_views() + ) + + def functionalize(self, inner_f: Callable) -> Callable: + return torch.func.functionalize( + inner_f, + remove="mutations_and_views" + if self.interpreter.functionalize_add_back_views() + else "mutations", + ) + + def redispatch_to_next(self) -> ContextManager: + return self.interpreter.lower() + + def replace(self, input_tensor, output_tensor) -> None: + torch._functionalize_replace(input_tensor, output_tensor) + + def commit_update(self, tensor) -> None: + torch._functionalize_commit_update(tensor) + + def sync(self, tensor) -> None: + torch._functionalize_sync(tensor) + + def mark_mutation_hidden_from_autograd(self, tensor) -> None: + torch._functionalize_mark_mutation_hidden_from_autograd(tensor) diff --git a/MLPY/Lib/site-packages/torch/_subclasses/meta_utils.py b/MLPY/Lib/site-packages/torch/_subclasses/meta_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..984b46d78c651fc059f3361ba8b53b25695efe93 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_subclasses/meta_utils.py @@ -0,0 +1,987 @@ +import contextlib +import warnings +import weakref +from typing import ContextManager, Dict, List, 
Optional, Tuple, TYPE_CHECKING + +import torch +from torch._C._functorch import ( + _add_batch_dim, + _unwrap_functional_tensor, + _wrap_functional_tensor, + current_level, + get_unwrapped, + is_batchedtensor, + is_functorch_wrapped_tensor, + is_gradtrackingtensor, + maybe_get_bdim, + maybe_get_level, + peek_interpreter_stack, + TransformType, +) +from torch._guards import Source + +from torch.multiprocessing.reductions import StorageWeakRef +from torch.utils._python_dispatch import ( + is_traceable_wrapper_subclass, + transform_subclass, +) +from torch.utils.weak import WeakIdRef + +if TYPE_CHECKING: + # Import the following modules during type checking to enable code intelligence features, + # Do not import unconditionally, as they import sympy and importing sympy is very slow + from torch.fx.experimental.symbolic_shapes import SymbolicContext + +DimList = List + + +def safe_is_leaf(t): + try: + return t.is_leaf + except RuntimeError: + # inference mode can trigger this + return False + + +def safe_grad(t): + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "The .grad attribute of a Tensor") + return t.grad + + +def assert_eq(a, b): + assert a == b, f"{a} != {b}" + + +def assert_metadata_eq(assert_eq, m1, m2, *, skip_symbolic=False): + def go(m1, m2): + assert_eq(m1.dtype, m2.dtype) + if not skip_symbolic: + assert_eq(m1.shape, m2.shape) + assert_eq(m1.requires_grad, m2.requires_grad) + assert_eq(m1.is_leaf, m2.is_leaf) + assert_eq(m1.grad_fn is None, m2.grad_fn is None) + assert_eq(m1.is_sparse, m2.is_sparse) + assert_eq(m1.is_inference(), m2.is_inference()) + assert_eq(m1.is_conj(), m2.is_conj()) + assert_eq(m1.is_neg(), m2.is_neg()) + assert_eq(safe_grad(m1) is not None, safe_grad(m2) is not None) + if safe_grad(m1) is not None: + go(safe_grad(m1), safe_grad(m2)) + if m1.is_sparse: + assert_eq(m1.dense_dim(), m2.dense_dim()) + assert_eq(m1.sparse_dim(), m2.sparse_dim()) + assert_eq(m1.is_coalesced(), m2.is_coalesced()) + else: + if not skip_symbolic: + assert_eq(m1.stride(), m2.stride()) + assert_eq(m1.storage_offset(), m2.storage_offset()) + assert_eq(m1._is_view(), m2._is_view()) + if m1._is_view(): + go(m1._base, m2._base) + # TODO: test if is resizable (no direct query for this atm) + # TODO: audit AutogradMeta to see if it matches + # TODO: test forward AD + + return go(m1, m2) + + +def is_sparse_coo(t): + return isinstance(t, torch.Tensor) and t.layout is torch.sparse_coo + + +def is_sparse_compressed(t): + return isinstance(t, torch.Tensor) and t.layout in { + torch.sparse_csr, + torch.sparse_csc, + torch.sparse_bsr, + torch.sparse_bsc, + } + + +def is_sparse_any(t): + return is_sparse_coo(t) or is_sparse_compressed(t) + + +# This is a class for converting multiple tensors into meta tensors which +# share the same view/storage structure. The operation model is you allocate +# one of these, and then call it repeatedly on all the tensors you want to +# convert. It's important to use the same object for tensors you want to +# share storage because this is how we correlate shared storages to the same +# meta storages. This class will hold weak references to cached tenosrs +# and tensor storages. 
+class MetaConverter:
+ def __init__(self):
+ self.storage_memo = {}
+ self.tensor_memo: weakref.WeakValueDictionary = weakref.WeakValueDictionary()
+ self.maybe_storages_to_delete = []
+ self.check_expired_frequency = 128
+ self.check_expired_count = 0
+ self.hit = 0
+ self.miss = 0
+ self.del_hook = None
+ self.arg_cnt = 0
+
+ def successful(self):
+ return self.hit > 0 and self.miss == 0
+
+ def check_for_expired_weak_storages(self):
+ new_li = []
+ stor_to_delete = []
+ for obj in self.maybe_storages_to_delete:
+ if not obj.expired():
+ new_li.append(obj)
+ else:
+ stor_to_delete.append(obj)
+ for obj in stor_to_delete:
+ self.storage_memo.pop(obj, None)
+ self.maybe_storages_to_delete = new_li
+
+ # if for some reason we have acquired many storages which have not expired
+ # even though a tensor with their storage has expired (aliasing or otherwise)
+ # check for expired storages less often so as to bound the amount of work we
+ # do checking for expired storages
+ self.check_expired_frequency = max(
+ self.check_expired_frequency, len(self.maybe_storages_to_delete)
+ )
+
+ def get_tensor_memo(self, t):
+ return self.tensor_memo.get(WeakIdRef(t), None)
+
+ def set_tensor_memo(self, t, v):
+ # hold a weak ref to self, otherwise it will be kept alive
+ # by the del_ten closure
+ self_weak_ref = weakref.ref(self)
+ if is_sparse_any(t) or t.is_mkldnn or is_functorch_wrapped_tensor(t):
+ weak_st = None
+ else:
+ weak_st = StorageWeakRef(t._typed_storage())
+ tensor_ref_key = WeakIdRef(t)
+
+ def del_ten():
+ # tensor outlives the converter
+ self_ref = self_weak_ref()
+ if self_ref is None:
+ return
+ # on shutdown, tensor_ref_key may not be in memo
+ self_ref.tensor_memo.pop(tensor_ref_key, None)
+ if weak_st and weak_st.expired():
+ self_ref.storage_memo.pop(weak_st, None)
+ elif weak_st is not None:
+ # [expired-storages]
+ # NB: even though the tensor has died,
+ # the deallocation of its storage can take longer,
+ # even when the storage has no other uses/views.
+ # In this case, the StorageWeakRef object will be kept alive
+ # longer than it needs to be, however the storage itself
+ # will be deallocated. We retain the possibly dead storages
+ # and periodically check if any of them are expired and
+ # can be freed.
+ self_ref.maybe_storages_to_delete.append(weak_st)
+
+ weakref.finalize(t, del_ten)
+ self.tensor_memo[tensor_ref_key] = v
+
+ # NB: doesn't actually return a storage, because meta storage is
+ # not supported
+ def meta_storage(self, s, callback):
+ # NB: TypedStorage is freshly allocated and cannot be used as hash
+ # key index.
+
+ # Use a Weak Ref to s in order to not leak memory
+ swr = StorageWeakRef(s)
+ if swr not in self.storage_memo:
+ self.storage_memo[swr] = callback(
+ lambda: torch.empty(s.size(), dtype=torch.uint8, device="meta")
+ ).untyped_storage()
+ return self.storage_memo[swr]
+
+ # This function assumes that it's possible to do the conversion
+ # NB: name here is used in a conventional way by Dynamo; it corresponds
+ # precisely to the Source.name() of the tensor we're fakeifying and
+ # corresponds to a valid Python expression. When we construct sub-names
+ # as part of this process, we will maintain this invariant! (Even though
+ # other users of this may not need this property to be upheld.)
+ def meta_tensor( + self, + t, + shape_env=None, + callback=lambda t: t(), + source: Optional[Source] = None, + symbolic_context: Optional["SymbolicContext"] = None, + ): + if source is None: + from torch._dynamo.source import ConstantSource + + # TODO: make a dedicated UnknownSource for this? + source = ConstantSource( + f"__meta_utils_unknown_tensor{len(self.tensor_memo)}" + ) + + # This indicates you set no_dispatch() before calling into this + # function. This is an error: we may be creating fake tensors and + # will perform operations on them which need fake tensor mode to + # be active. You will segfault if you are in a no_dispatch() block. + assert not torch._C._dispatch_tls_local_exclude_set().has( + torch._C.DispatchKey.Python + ) + arg_cnt = self.arg_cnt + self.arg_cnt += 1 + + # When we make as_strided calls, we end up generating a guard + # that the new as_strided tensor is in bounds for the old storage + # for the base (since as_strided calls can "bust" out of their + # bounding box.) This guard is unnecessary: if a user is able + # to provide us a tensor with the view base setup this way, we + # don't need to produce a guard, because the fact that they + # were able to produce the view base means its in bounds. + # + # Now, ordinarily, this guard would be harmless. However, the + # generated guard refers to variables bound on the base variable. + # At the moment, Dynamo doesn't actually guard on x._base, because + # according to Voz this results in a lot of spurious invalidations, + # and also if the user doesn't directly make use of _base, its + # pointless anyway (because programs should be parametric over + # whether or not the input tensor is a view or not--unless you're + # mutating the input, but that's a whole 'nother ballgame). So + # for expediency, we suppress these guards so we don't have to + # deal with this (yet, anyway.) + # + # NB: An old version of this code suppressed guards for ALL operations + # happening during meta conversion, not just as_strided calls. + # This is too aggressive: we do duck sizing and 0/1 simplification + # as we allocate variables, and we do need to register guards for + # these cases. + maybe_suppress = contextlib.nullcontext + if shape_env is not None: + maybe_suppress = shape_env.suppress_guards + + def sym_sizes_strides_storage_offset( + t, src, symbolic_context=symbolic_context + ) -> Tuple[Tuple[int, ...], Tuple[int, ...], int]: + if shape_env is not None: + fake_mode = torch._subclasses.fake_tensor.maybe_get_fake_mode(t) + if fake_mode is not None and fake_mode.shape_env is shape_env: + # Don't reallocate the sizes; the shape envs are the same, + # so reuse the old sizes/strides/etc + return (t.size(), t.stride(), t.storage_offset()) + else: + return shape_env.create_symbolic_sizes_strides_storage_offset( + t, + src, + symbolic_context=symbolic_context, + ) + else: + assert symbolic_context is None + return (t.size(), t.stride(), t.storage_offset()) + + def empty_create(inner_t, inner_src, symbolic_context=symbolic_context): + ( + inner_sizes, + inner_strides, + inner_storage_offset, + ) = sym_sizes_strides_storage_offset(inner_t, inner_src, symbolic_context) + return torch.empty_strided( + inner_sizes, + inner_strides, + dtype=inner_t.dtype, + device="meta", + ) + + # Creates a subclass instance with empty inner tensors according to the specified + # symbolic context. 
+ def empty_create_subclass( + t, + outer_size, + outer_stride, + symbolic_context=symbolic_context, + callback=callback, + source=source, + ): + from torch._dynamo.source import AttrSource + from torch.fx.experimental.symbolic_shapes import SubclassSymbolicContext + + assert symbolic_context is None or isinstance( + symbolic_context, SubclassSymbolicContext + ) + + # Note: transform_subclass will use __tensor_unflatten__ to generate + # a fresh subclass wrapper with outer sizes / strides according to the + # outer symbolic context (passed in to this function). Inner size / stride + # / storage offset symbols are allocated according to the appropriate inner + # symbolic contexts, after which the checks in transform_subclass() will + # relate them to the outer metadata as possible. + return transform_subclass( + t, + lambda attr, inner_t: callback( + lambda: empty_create( + inner_t, + AttrSource(source, attr), + symbolic_context=( + None + if symbolic_context is None + else symbolic_context.inner_contexts[attr] + ), + ) + ), + outer_size=outer_size, + outer_stride=outer_stride, + ) + + # Returns an all-dynamic symbolic context used for metafying the given tensor with + # fully dynamic dims. This is useful when fake-ifying intermediate tensors in + # closed-over ViewFunc state, as we don't have symbolic contexts for them, but we + # don't want to over-specialize during view replay. + def all_dynamic_symbolic_context(t, source, shape_env, callback): + from torch._dynamo.source import AttrSource + from torch.fx.experimental.symbolic_shapes import ( + DimDynamic, + StatelessSymbolicContext, + SubclassSymbolicContext, + SymbolicContext, + ) + + view_base_context: Optional[SymbolicContext] = None + if t._is_view(): + view_base_context = all_dynamic_symbolic_context( + t._base, AttrSource(source, "_base"), shape_env, callback + ) + + t_symbolic_context: SymbolicContext + t_dynamic_sizes = [DimDynamic.DYNAMIC] * t.dim() + if is_traceable_wrapper_subclass(t): + inner_contexts: Dict[str, SymbolicContext] = {} + attrs, _ = t.__tensor_flatten__() + for attr in attrs: + assert isinstance(attr, str) + inner = getattr(t, attr) + inner_contexts[attr] = all_dynamic_symbolic_context( + inner, AttrSource(source, attr), shape_env, callback + ) + t_symbolic_context = SubclassSymbolicContext( + dynamic_sizes=t_dynamic_sizes, + constraint_sizes=[None] * t.dim(), + inner_contexts=inner_contexts, + tensor_source=source, + view_base_context=view_base_context, + ) + else: + t_symbolic_context = StatelessSymbolicContext( + dynamic_sizes=t_dynamic_sizes, + constraint_sizes=[None] * t.dim(), + view_base_context=view_base_context, + ) + + return t_symbolic_context + + # Returns a fake-ified version of an input view tensor t, given an already fake-ified + # base. At a high level, we want two things: + # 1. fake_t should have the same view relationship to the given fake base as the + # input t has to its _base. + # 2. fake_t should have symbolic sizes / strides / storage offset according to the + # appropriate symbolic context (i.e. from the automatic dynamic algorithm). + # + # We currently take different strategies across view types: + # * For dense -> dense views, accomplish both (1) and (2) simultaneously via an + # as_strided() call on the fake-ified base, passing symbolic metadata. + # * For views involving subclasses, perform view replay using view funcs to + # achieve (1). It's necessary for (2) to swap out any closed-over state in + # the view funcs with symbolicized SymInts and fake-ified tensors. 
Doing this + # avoids specialization (and thus over-eager simplification of symbols) that + # could occur during view replay on the fake-ified base. + # + # Examples: + # * t.unsqueeze(-1) with dense t is a dense -> dense view. It can be modeled + # with an as_strided() call on the fake base passing symbolic metadata. + # * sub.select(dim=0, index=3) is a subclass -> subclass view. The index arg + # is made symbolic to avoid invalid specialization and view replay is then + # done to reconstruct the view. + # * _nested_from_jagged(values, offsets) is a dense -> subclass view + # that returns a subclass instance from a dense values tensor. The offsets + # tensor is closed over in the view func, as it can be considered view metadata. + # First, the offsets tensor is fake-ified according to the inner symbolic + # context and with the correct relationship to the outer size / stride metadata. + # Then view replay is done, swapping in the fake offsets so the view replay output + # is fully fake with no invalid specialization. + def view_from_base(base, t, source=source, shape_env=shape_env): + # fake-ify t's metadata according to the outer symbolic context + (sizes, strides, storage_offset) = sym_sizes_strides_storage_offset( + t, source + ) + if not is_traceable_wrapper_subclass( + t + ) and not is_traceable_wrapper_subclass(base): + # Dense -> Dense view case uses as_strided() to construct view relationship. + # TODO: Change this logic to use view replay for consistency? + # It's likely there is no view func available. + return base.as_strided(sizes, strides, storage_offset) + + from torch._dynamo.source import EphemeralSource + from torch.fx.experimental.symbolic_shapes import sym_eq + + def symint_visitor_fn(s): + if shape_env is None: + return s + + # NB: The symbol here is expected to be simplified out because we a priori + # allocate inner and outer symbols according to the appropriate symbolic + # contexts and prefer those over this symbol during symbol simplification + # (via usage of EphemeralSource below). This -shouldn't- happen, but if + # this symbol somehow leaks out beyond the view tensor's shape metadata, our + # assumption of it being simplified out will fail and it may be guarded on, + # which will hard error. + sym_source = EphemeralSource("symint_visitor_fn") + symbol = shape_env.create_symbol(s, sym_source) + return shape_env.create_symintnode(symbol, hint=s, source=sym_source) + + real_to_fake_mapping = {} + if is_traceable_wrapper_subclass(t): + # Fake-ify t naively here; this is only done so we can get fake-ified inner + # tensors with the correct relationships to the outer sizes / strides for use + # in view replay. It's done beforehand here because it's not easy to do when + # visiting tensors one-by-one during view replay. + # + # Example: + # Consider a Dense -> NJT view. NJT has (values, offsets) components and we + # want a view of values with the offsets closed over. As the offsets component + # is needed to describe the output view, it's important that it's fakeified + # correctly. + fake_t = empty_create_subclass( + t, outer_size=sizes, outer_stride=strides + ) + attrs, _ = fake_t.__tensor_flatten__() + for attr in attrs: + real_to_fake_mapping[getattr(t, attr)] = getattr(fake_t, attr) + + def tensor_visitor_fn( + visited_t, shape_env=shape_env, callback=callback, source=source + ): + # It's possible to close over an undefined tensor (e.g. NJT's lengths). 
+ if visited_t is None: + return None + + # Fake inner tensors of view subclasses will come from the mapping built above. + fake_visited_t = real_to_fake_mapping.get(visited_t, None) + if fake_visited_t is not None: + return fake_visited_t + + # For other closed-over tensor state, fake-ify it as all dynamic with an + # ephemeral source. This avoids invalid specialization during view replay. + # If we find that in practice the usage of ephemeral sources isn't enough + # to guarantee that we don't have guards on these symbols, we may need to + # explicitly suppress guards (as is done for _base in the dense -> dense + # view case). + temp_source = EphemeralSource("tensor_visitor_fn") + return self.meta_tensor( + visited_t, + shape_env, + callback, + source=temp_source, + symbolic_context=all_dynamic_symbolic_context( + visited_t, temp_source, shape_env, callback + ), + ) + + # Replay the view, swapping out any non-symbolic SymInts or real tensors + # for symbolic SymInts or fake tensors. + fake_t = t._view_func_unsafe(base, symint_visitor_fn, tensor_visitor_fn) + + # Ensure the output has symbolic shapes according to the outer symbolic context. + # These checks should simplify out any symbols created for closed-over view func + # SymInts. + torch._check(sym_eq(fake_t.size(), sizes)) + torch._check(sym_eq(fake_t.stride(), strides)) + torch._check(sym_eq(fake_t.storage_offset(), storage_offset)) + return fake_t + + # see expired-storages + self.check_expired_count += 1 + if self.check_expired_count >= self.check_expired_frequency: + self.check_for_expired_weak_storages() + self.check_expired_count = 0 + + if self.get_tensor_memo(t) is None: + with torch.inference_mode(t.is_inference()): + if t.is_sparse: + is_leaf = safe_is_leaf(t) + + # The lambda function below is similar to + # `t.to(device='meta')` except the latter + # preserves nnz value + r = callback( + lambda: torch.ops.aten._sparse_coo_tensor_with_dims( + t.sparse_dim(), + t.dense_dim(), + t.shape, + dtype=t.dtype, + layout=torch.sparse_coo, + device="meta", + ) + ) + assert safe_is_leaf(r), "the callback you passed in doesn't detach" + # Note [is_coalesced is dispatched] + # Strangely enough, is_coalesced() is a dispatched operator, + # which means that it will get caught by fake tensor mode. + # Ordinarily this would error, but there's some logic in + # fake tensor ensure this doesn't happen. 
+ r._coalesced_(t.is_coalesced()) + if t.requires_grad: + r.requires_grad = True + if t.requires_grad and not is_leaf: + with torch.enable_grad(): + r = r.clone() + r._coalesced_(t.is_coalesced()) + elif is_sparse_compressed(t): + is_leaf = safe_is_leaf(t) + + def mk_meta(): + nnz = 0 + batch_dim = t.ndim - t.sparse_dim() - t.dense_dim() + batch_size = t.shape[:batch_dim] + if t.layout in {torch.sparse_csr, torch.sparse_bsr}: + index_dtype = t.crow_indices().dtype + compressed_indices = torch.empty( + t.crow_indices().shape, device="meta", dtype=index_dtype + ) + plain_indices = torch.empty( + (*t.col_indices().shape[:-1], nnz), + device="meta", + dtype=index_dtype, + ) + else: + index_dtype = t.ccol_indices().dtype + compressed_indices = torch.empty( + t.ccol_indices().shape, device="meta", dtype=index_dtype + ) + plain_indices = torch.empty( + (*t.row_indices().shape[:-1], nnz), + device="meta", + dtype=index_dtype, + ) + values_shape = t.values().shape + values = torch.empty( + ( + *values_shape[:batch_dim], + nnz, + *values_shape[batch_dim + 1 :], + ), + dtype=t.dtype, + device="meta", + ) + return torch.ops.aten.sparse_compressed_tensor( + compressed_indices, + plain_indices, + values, + t.shape, + layout=t.layout, + dtype=t.dtype, + device="meta", + ) + + # `mk_meta()` is similar to `t.to(device='meta'))` + # except `to('meta')` preserves nnz value while + # `mk_meta` result has nnz == 0. + r = callback(mk_meta) + + assert safe_is_leaf(r), "the callback you passed in doesn't detach" + if t.requires_grad: + r.requires_grad = True + if t.requires_grad and not is_leaf: + with torch.enable_grad(): + r = r.clone() + elif t.is_nested and not is_traceable_wrapper_subclass(t): + # TODO: Handle this better in Dynamo? + # There are checks there now, but this can still be triggered by a dense + # tensor graph input that is a view of a strided NT. 
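+ # Example (informal): a dense view whose ._base is a strided-layout nested
+ # tensor (e.g. one built with torch.nested.nested_tensor using the default
+ # layout) reaches this branch when that base gets converted.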
+ from torch._dynamo.exc import unimplemented + + unimplemented( + "strided nested tensors are not supported by meta conversion" + ) + elif t.is_mkldnn: + is_leaf = safe_is_leaf(t) + sizes, strides, _storage_offset = sym_sizes_strides_storage_offset( + t, source + ) + r = callback( + lambda: torch.empty_strided( + sizes, strides, dtype=t.dtype, device="meta" + ) + ) + assert safe_is_leaf(r), "the callback you passed in doesn't detach" + if t.requires_grad: + r.requires_grad = True + if t.requires_grad and not is_leaf: + with torch.enable_grad(): + r = r.clone() + elif is_functorch_wrapped_tensor(t): + if t._is_view(): + from torch._dynamo.exc import unimplemented + + unimplemented( + "view functorch tensors are not supported by meta conversion" + ) + + # Wraps a functorch tensor class (BatchedTensor, GradTrackingTensor) + # in a FakeTensor + def _to_fake_tensor(t): + if is_batchedtensor(t): + ft = _to_fake_tensor(get_unwrapped(t)) + lvl = maybe_get_level(t) + bdim = maybe_get_bdim(t) + r = _add_batch_dim(ft, bdim, lvl) + elif is_gradtrackingtensor(t): + disable_functorch = torch._C._DisableFuncTorch + with disable_functorch(): + ft = _to_fake_tensor(get_unwrapped(t)) + lvl = torch._C._functorch.maybe_get_level(t) + r = torch._C._functorch._wrap_for_grad(ft, lvl) + + is_leaf = safe_is_leaf(t) + if t.requires_grad and safe_is_leaf(r): + r.requires_grad = True + elif t.requires_grad and not is_leaf: + with torch.enable_grad(): + r = r.clone() + else: + sizes = t.size() + strides = t.stride() + r = callback( + lambda: torch.empty_strided( + sizes, + strides, + dtype=t.dtype, + device="meta", + ) + ) + return r + + r = _to_fake_tensor(t) + + elif t._is_view(): + # Construct views in two steps: recursively meta-fy their + # base, and then create view(s) off that. NB: doing it + # directly from storage is WRONG because this won't cause + # version counters to get shared. + assert t._is_view() + + base_symbolic_context = None + if shape_env and symbolic_context is not None: + from torch.fx.experimental.symbolic_shapes import ( + StatelessSymbolicContext, + ) + + assert isinstance(symbolic_context, StatelessSymbolicContext) + # NB: This should generally be set when the input is a view, + # but the exception right now is for fake-ifying grads, which is + # a work in progress. + if symbolic_context.view_base_context is not None: + base_symbolic_context = symbolic_context.view_base_context + + base = self.meta_tensor( + t._base, + shape_env, + callback, + source=torch._dynamo.source.AttrSource(source, "_base"), + symbolic_context=base_symbolic_context, + ) + + def is_c_of_r(complex_dtype, real_dtype): + return ( + utils.is_complex_dtype(complex_dtype) + and utils.corresponding_real_dtype(complex_dtype) + == real_dtype + ) + + # In some situations, MetaConverter may be called in a + # context where autograd is disabled. For the _is_view + # assert to pass, we have to setup the autograd view + # metadata anyway. Do this by reenabling the + # ADInplaceOrView key. This is kind of a hack. + old_exclude = torch._C._dispatch_tls_is_dispatch_key_excluded( + torch._C.DispatchKey.ADInplaceOrView + ) + torch._C._dispatch_tls_set_dispatch_key_excluded( + torch._C.DispatchKey.ADInplaceOrView, False + ) + try: + if base.dtype == t.dtype: + pass + elif is_c_of_r(base.dtype, t.dtype): + base = torch.view_as_real(base) + elif is_c_of_r(t.dtype, base.dtype): + base = torch.view_as_complex(base) + else: + # This is not guaranteed to succeed. 
If it fails, it + # means there is another dtype-converting view function + # that hasn't been handled here + base = base.view(t.dtype) + + # This is very tricky. Naively, you might expect this + # to hold: + # + # if t.requires_grad and not safe_is_leaf(t) + # assert t._base.requires_grad + # + # But it's not true! As you can see in the following + # program: + # + # x = torch.zeros(4) + # y = x.view(1, 4) + # y.requires_grad = True + # z = y.view(1, 1, 4) + # assert z._base is x + # + # So we may have to do *two* views out of the base to + # recreate this situation. + if safe_is_leaf(t): + # Leaf views that track view metadata are created by + # creating a view inside a no_grad block + with torch.no_grad(), maybe_suppress(): + r = view_from_base(base, t) + # As it's a leaf, we can directly assign requires_grad + r.requires_grad = t.requires_grad + else: + if t._base.requires_grad == t.requires_grad: + # Easy case, just run the view op + with torch.enable_grad(), maybe_suppress(): + r = view_from_base(base, t) + + # NB: We don't actaully faithfully replicate + # autograd connectivity, but that doesn't matter + # today. See following for more info: + # https://gist.github.com/soulitzer/e03f015b314c3f5fcf80888c69390913 + else: + # Obscure case. Create a leaf view and give it the + # correct requires_grad, then do the final view. + # NB: Can't have a non-leaf without requiring grad! + assert t.requires_grad + with torch.no_grad(): + mid = base.view(base.shape) + mid.requires_grad = t.requires_grad + with torch.enable_grad(), maybe_suppress(): + r = view_from_base(mid, t) + # The CreationMeta influences whether or not inplace + # mutation is an error or not. So we need to make + # sure we properly propagate this as well. + torch._C._autograd._set_creation_meta( + r, torch._C._autograd._get_creation_meta(t) + ) + finally: + torch._C._dispatch_tls_set_dispatch_key_excluded( + torch._C.DispatchKey.ADInplaceOrView, old_exclude + ) + + else: + is_leaf = safe_is_leaf(t) + + ( + sizes, + strides, + storage_offset, + ) = sym_sizes_strides_storage_offset(t, source, symbolic_context) + + # If we have a subclass that desugars into dense tensors, + # perform our callback on each inner tensor. + if is_traceable_wrapper_subclass(t): + r = empty_create_subclass( + t, outer_size=sizes, outer_stride=strides + ) + else: + r = callback( + lambda: torch.empty_strided( + sizes, + strides, + dtype=t.dtype, + device="meta", + ) + ) + + assert safe_is_leaf(r), "the callback you passed in doesn't detach" + if t.requires_grad: + r.requires_grad = t.requires_grad + if not is_leaf: + # Fake up some autograd history. 
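+ # NB: cloning under enable_grad() below gives r a grad_fn, so the result is
+ # a non-leaf like t; the particular history does not matter, only that one
+ # exists.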
+ with torch.enable_grad(): + # preserve_format is the default, but we want to + # emphasize how important it is to preserve + # format here + r = r.clone(memory_format=torch.preserve_format) + + # Graph-Break for wrapped tensors + if not ( + is_batchedtensor(t) or is_gradtrackingtensor(t) + ) and torch._C._functorch.is_functorch_wrapped_tensor(t): + return NotImplemented + + s = t.untyped_storage() + swr = StorageWeakRef(s) + if swr not in self.storage_memo and ( + r.is_nested + or ( + r.stride() == strides + and r.storage_offset() == storage_offset + ) + ): + # You're normal and happy, install the fresh storage into the memo + self.storage_memo[swr] = r.untyped_storage() + else: + # You're in crazy town; somehow you gave us a tensor + # that wasn't a view, but had nonzero storage offset, + # nontrivial strides (such that clone() couldn't + # preserve them), or already aliases with another + # tensor's storage. The most typical way to end + # up here is with set_. So use set_ to bludgeon this + # in. + r_s = self.meta_storage(s, callback=callback) + # NB: In principle, this should always work, but there + # is some subtle difference in the autograd metadata + # that means we will backprop the set_ call, even if + # r is declared as an input to grad. + # See https://github.com/pytorch/pytorch/issues/87956 + # for the reproducer. + # NB: The in_kernel_invocation_manager here is necessary + # for fake tensor. If we run the set_ call with fake + # tensor on, r will improperly report that it is NOT a + # meta tensor but a cpu tensor, and then the set_ call + # will fail due to device mismatch. no_dispatch() is + # not enough, because the fake tensor will still claim + # to be a CPU tensor and you'll end up in the CPU + # kernel. Arguably this is a hack; a cleaner way to + # solve this is to have a FakeStorage concept which + # would report it's CPU device--no problem now! But + # this is difficult to do because we don't have storage + # subclasses. Relevant test is + # DynamicShapesFunctionTests::test_add_dynamic_shapes in + # test/dynamo/test_dynamic_shapes.py + maybe_fake_mgr: ContextManager[None] = contextlib.nullcontext() + from torch._subclasses.fake_tensor import ( + in_kernel_invocation_manager, + maybe_get_fake_mode, + ) + + mb_fake_mode = maybe_get_fake_mode(r) + if mb_fake_mode is not None: + maybe_fake_mgr = in_kernel_invocation_manager(mb_fake_mode) + with maybe_fake_mgr, torch.no_grad(): + r.set_(r_s, storage_offset, sizes, strides) + + if safe_grad(t) is not None: + from torch._dynamo.source import AttrSource + + # TODO: Use a valid grad-specific symbolic context instead of recycling + # the one from t. This isn't correct if e.g. t._is_view() != t.grad._is_view(). + r.grad = self.meta_tensor( + safe_grad(t), + shape_env, + callback, + source=AttrSource(source, "grad"), + symbolic_context=symbolic_context, + ) + torch._C._set_conj(r, t.is_conj()) + torch._C._set_neg(r, t.is_neg()) + # This can be skipped if necessary for performance reasons + assert_metadata_eq(assert_eq, t, r, skip_symbolic=True) + self.set_tensor_memo(t, r) + + return self.get_tensor_memo(t) + + def __call__( + self, + t, + shape_env=None, + *, + callback=lambda t: t(), + source=None, + symbolic_context=None, + ): + # TODO: zero tensors? 
We appear to have eliminated them by + # excluding complex for now + + if isinstance(t, torch.Tensor) or is_traceable_wrapper_subclass(t): + if t.device.type != "xla" and any( + [ + t.is_quantized, + t._is_view() and t._base is not None and t._base.is_sparse, + torch._is_functional_tensor(t), + t.device.type in ("lazy"), + # We need a way to test if a tensor is batched but there + # is no official APi to do it + # torch._C._is_batched(t), + ] + ): + # TODO: sparse should support meta + # NB technically to('meta') does work but our logging + # instrumentation will see the meta conversions and the + # tests all break so we just exclude this. In any case + # the to conversion isn't really right anyhow. + + if torch._is_functional_tensor(t) and t.device.type != "lazy": + if t._is_view(): + raise RuntimeError( + "Cannot safely fakify a view because this process drops the view information right now." + ) + + st = peek_interpreter_stack() + assert ( + st is None or st.key() == TransformType.Functionalize + ), "Expect st to be either None or have Functionalize transform key." + if st is None: + # the case of AOTAutograd + torch._sync(t) + unwrap_t = torch._from_functional_tensor(t) + with torch._dispatch.python.suspend_functionalization(): + fake_t = self.meta_tensor( + unwrap_t, + shape_env=shape_env, + callback=callback, + source=source, + symbolic_context=symbolic_context, + ) + out = torch._to_functional_tensor(fake_t) + torch._mirror_autograd_meta_to(fake_t, out) + return out + else: + # torch.func.functionalize + reapply_views = torch._C._functionalization_reapply_views_tls() + unwrap_t = _unwrap_functional_tensor(t, reapply_views) + pop_st_ctx = ( + torch._functorch.pyfunctorch.temporarily_pop_interpreter_stack() + ) + with pop_st_ctx: + fake_t = self.meta_tensor( + unwrap_t, + shape_env=shape_env, + callback=callback, + source=source, + symbolic_context=symbolic_context, + ) + return _wrap_functional_tensor(fake_t, current_level()) + self.miss += 1 + return NotImplemented + else: + self.hit += 1 + + disable_functorch = torch._C._DisableFuncTorch + with disable_functorch(): + r = self.meta_tensor( + t, + shape_env=shape_env, + callback=callback, + source=source, + symbolic_context=symbolic_context, + ) + if type(t) is torch.nn.Parameter: + # NB: Cannot directly use Parameter constructor + # because that would force a detach, not desirable + r._is_param = True + return r + elif torch.overrides.is_tensor_like(t): + self.miss += 1 + return NotImplemented + else: + # non-Tensor types don't count as hit or miss + return t + + +import torch._prims_common as utils diff --git a/MLPY/Lib/site-packages/torch/_subclasses/schema_check_mode.py b/MLPY/Lib/site-packages/torch/_subclasses/schema_check_mode.py new file mode 100644 index 0000000000000000000000000000000000000000..95b2ca093e6acf75ba5fb7ad1d136ccbda992666 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_subclasses/schema_check_mode.py @@ -0,0 +1,198 @@ +# mypy: ignore-errors + +from collections import namedtuple +from copy import deepcopy +from itertools import combinations + +import torch +from torch.fx.operator_schemas import normalize_function +from torch.testing._internal.jit_utils import clone_inputs +from torch.utils import _pytree as pytree +from torch.utils._python_dispatch import TorchDispatchMode +from torch.utils._pytree import tree_map + +# Named Tuples used within SchemaCheckMode +Mutation = namedtuple("Mutation", ["op_name", "arg_name"]) +Aliasing = namedtuple("Aliasing", ["op_name", "arg_name", "output_number"]) + +# Simplified 
naming for C++ classes +SchemaArgument = torch._C._SchemaArgument +SchemaArgType = torch._C._SchemaArgType +SchemaInfo = torch._C._SchemaInfo + +# This TorchDispatchMode Subclass is used to verify op schemas +# This TorchDispatchMode Scubclass currently: +# - Records the called ops +# - Checks for mutations on all inputs +# - Checks for aliasing on all inputs + + +class SchemaCheckMode(TorchDispatchMode): + def __init__(self): + # Information recorded for testing purposes. For example: + # - incorrect schemas + # - overly conservative schemas + self.ops = [] + self.mutated = [] + self.aliasing = [] + + def reset_cache(self): + self.ops.clear() + self.mutated.clear() + self.aliasing.clear() + + def display_ops(self): + print(*self.ops, sep=",") + + def __torch_dispatch__(self, func, types, args=(), kwargs=None): + def bitwise_equal(lhs, rhs): + if lhs.is_quantized: + # TODO: This is only OK if can't have NaN quantized; idk if + # this is actually true + return torch.equal(lhs, rhs) + else: + return torch.allclose(lhs, rhs, equal_nan=True) + + def has_mutated(before, after, md): + are_tensors = type(before) == torch.Tensor and type(after) == torch.Tensor + if ( + are_tensors + and before.layout != torch.sparse_csr + and after.layout != torch.sparse_csr + ): + return not ( + before.size() == after.size() + and bitwise_equal(before, after) + and md[0] == after.stride() + and md[1] == after._typed_storage()._cdata + ) + return False + + def has_aliased(lhs, rhs): + try: + return torch._C._overlaps(lhs, rhs) + except Exception as exception: + if str(exception).startswith("Cannot inspect value of type "): + return False + else: + raise exception + + def standardize_name(name): + return name if name != "self" else "input" + + def unwrap(e): + if isinstance(e, torch.Tensor) and not type(e) == torch.Tensor: + try: + return e.elem + except AttributeError as t: + return e + return e + + def parse_metadata(e): + if isinstance(e, torch.Tensor): + if not type(e) == torch.Tensor: + try: + current = e.elem + return ( + deepcopy(current.stride()), + current._typed_storage()._cdata, + ) + except AttributeError as t: + return None + # Sparse CSR tensors do not have strides or storage + elif e.layout != torch.sparse_csr: + return (deepcopy(e.stride()), e._typed_storage()._cdata) + return None + + self.ops.append(func._schema.name) + + # Clone and process arguments and outputs + pre_arguments = normalize_function( + func, args, kwargs, normalize_to_only_use_kwargs=True + ).kwargs + + c_p_args = dict(zip(pre_arguments.keys(), clone_inputs(pre_arguments.values()))) + cloned_arguments = { + name: tree_map(unwrap, c_p_args.get(name)) for name in c_p_args + } + cloned_metadata = { + name: [ + parse_metadata(a) for a in pytree.tree_leaves(pre_arguments.get(name)) + ] + for name in pre_arguments + } + + out = func(*args, **kwargs) + arguments = { + name: tree_map(unwrap, pre_arguments.get(name)) for name in pre_arguments + } + tuple_out = out if isinstance(out, tuple) else (out,) + tuple_out = tree_map(unwrap, tuple_out) + + schema_info = SchemaInfo(func._schema) + schema_info.add_argument_values(pre_arguments) + + # Process arguments with outputs + for i in range(len(func._schema.arguments)): + arg = func._schema.arguments[i] + name = standardize_name(arg.name) + if arguments.get(name) is not None: + before = cloned_arguments.get(name) + md = cloned_metadata.get(name) + after = arguments.get(name) + for j in range(len(tuple_out)): + # aten::_unsafe_view is intended to have incorrect aliasing notation (hence unsafe) + 
unsafe_ops = ("aten::_unsafe_view", "aten::unsafe_split") + if ( + has_aliased(tuple_out[j], after) + and func._schema.name not in unsafe_ops + ): + if not schema_info.may_contain_alias( + SchemaArgument(SchemaArgType.output, j), + SchemaArgument(SchemaArgType.input, i), + ): + raise RuntimeError( + f"Argument {name} is not defined to alias output but was aliasing" + ) + else: + self.aliasing.append( + Aliasing(func._schema.name, name, f"output_{j}") + ) + if after is tuple_out[j] and isinstance(after, torch.Tensor): + # Only mutable ops e.g. (add_, add.out) are allowed to directly return inputs. + if not schema_info.is_mutable( + SchemaArgument(SchemaArgType.input, i) + ) and func not in [ + torch.ops.aten.lift.default, + torch.ops.aten.lift_fresh.default, + ]: + raise RuntimeError( + f"""\ +Dispatcher operators below autograd are not allowed to directly return inputs. +However, we found that `outputs[{str(j)}] is {name}""" + ) + if any( + has_mutated(a, b, c) + for a, b, c in zip( + pytree.tree_leaves(before), pytree.tree_leaves(after), md + ) + ): + if not schema_info.is_mutable( + SchemaArgument(SchemaArgType.input, i) + ): + raise RuntimeError( + f"Argument {name} is not defined as mutable but was mutated" + ) + else: + self.mutated.append(Mutation(func._schema.name, name)) + + # Aliasing between outputs + for i, j in combinations(range(len(func._schema.returns)), 2): + if has_aliased(tuple_out[i], tuple_out[j]): + if not schema_info.may_contain_alias( + SchemaArgument(SchemaArgType.output, i), + SchemaArgument(SchemaArgType.output, j), + ): + raise RuntimeError(f"Outputs {i} and {j} alias unexpectedly") + + return out diff --git a/MLPY/Lib/site-packages/torch/_tensor.py b/MLPY/Lib/site-packages/torch/_tensor.py new file mode 100644 index 0000000000000000000000000000000000000000..85a466bd155bf9a40475f31af7ae32c3a45b3393 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_tensor.py @@ -0,0 +1,1543 @@ +import copyreg +import enum +import functools +import warnings +from collections import OrderedDict +from copy import deepcopy +from numbers import Number +from typing import Any, Dict, Optional, Tuple, Union + +import torch +import torch._C as _C +import torch.utils.hooks as hooks +from torch._namedtensor_internals import ( + check_serializing_named_tensor, + is_ellipsis, + resolve_ellipsis, + single_ellipsis_index, + unzip_namedshape, + update_names, +) +from torch.overrides import ( + get_default_nowrap_functions, + handle_torch_function, + has_torch_function, + has_torch_function_unary, + has_torch_function_variadic, +) +from torch.utils.dlpack import DLDeviceType + + +def _handle_torch_function_and_wrap_type_error_to_not_implemented(f): + assigned = functools.WRAPPER_ASSIGNMENTS + + @functools.wraps(f, assigned=assigned) + def wrapped(*args, **kwargs): + try: + # See https://github.com/pytorch/pytorch/issues/75462 + if has_torch_function(args): + return handle_torch_function(wrapped, args, *args, **kwargs) + return f(*args, **kwargs) + except TypeError: + return NotImplemented + + return wrapped + + +# Should not be used, this is kept only for BC of loading old serialized Tensor subclasses +def _rebuild_from_type(func, type, args, dict): + if type is Tensor: + return func(*args) + + ret = func(*args).as_subclass(type) + ret.__dict__ = dict + return ret + + +def _rebuild_from_type_v2(func, new_type, args, state): + ret = func(*args) + if type(ret) is not new_type: + ret = ret.as_subclass(new_type) + # Tensor does define __setstate__ even though it doesn't define + # __getstate__. 
So only use __setstate__ if it is NOT the one defined + # on Tensor + if ( + getattr(ret.__class__, "__setstate__", Tensor.__setstate__) + is not Tensor.__setstate__ + ): + ret.__setstate__(state) + else: + ret = torch._utils._set_obj_state(ret, state) + return ret + + +# NB: If you subclass Tensor, and want to share the subclassed class +# across processes, you must also update torch/multiprocessing/reductions.py +# to define a ForkingPickler serialization mode for the class. +# +# NB: If you add a new method to Tensor, you must update +# torch/_C/__init__.pyi.in to add a type annotation for your method; +# otherwise, it will not show up in autocomplete. +class Tensor(torch._C.TensorBase): + def __deepcopy__(self, memo): + if has_torch_function_unary(self): + return handle_torch_function(Tensor.__deepcopy__, (self,), self, memo) + if not self.is_leaf: + raise RuntimeError( + "Only Tensors created explicitly by the user " + "(graph leaves) support the deepcopy protocol at the moment. " + "If you were attempting to deepcopy a module, this may be because " + "of a torch.nn.utils.weight_norm usage, " + "see https://github.com/pytorch/pytorch/pull/103001" + ) + if id(self) in memo: + return memo[id(self)] + with torch.no_grad(): + # TODO: skipping storage copy is wrong for meta, as meta + # does accurate alias tracking; however, the code below + # doesn't work because of + # https://github.com/pytorch/pytorch/issues/47442 + # Update the test in test_serialization if you remove 'meta' from here + if ( + self.is_sparse + or self.device.type + in ["lazy", "xla", "mtia", "mps", "ort", "meta", "ipu"] + or ( + not torch._C._has_storage(self) + and self.device.type == torch._C._get_privateuse1_backend_name() + ) + or (type(self) is not Tensor and self.data_ptr() == 0) + ): + new_tensor = self.clone() + if type(new_tensor) is not type(self): + raise RuntimeError( + "The default implementation of __deepcopy__() for wrapper subclasses " + "only works for subclass types that implement clone() and for which " + "cloning returns another instance of the same subclass. You should either " + "properly implement clone() for your subclass or override __deepcopy__() " + "if it is intended behavior for clone() to return an instance of a " + "different type." 
+ ) + else: + new_storage = self._typed_storage()._deepcopy(memo) + if self.is_quantized: + # quantizer_params can be different type based on torch attribute + quantizer_params: Union[ + Tuple[torch.qscheme, float, int], + Tuple[torch.qscheme, Tensor, Tensor, int], + ] + if self.qscheme() == torch.per_tensor_affine: + quantizer_params = ( + self.qscheme(), + self.q_scale(), + self.q_zero_point(), + ) + elif self.qscheme() in ( + torch.per_channel_affine, + torch.per_channel_affine_float_qparams, + ): + quantizer_params = ( + self.qscheme(), + self.q_per_channel_scales(), + self.q_per_channel_zero_points(), + self.q_per_channel_axis(), + ) + else: + raise RuntimeError( + f"Unsupported qscheme {self.qscheme()} in deepcopy" + ) + # TODO: Once we decide to break serialization FC, no longer + # need to wrap with TypedStorage + new_tensor = torch._utils._rebuild_qtensor( + torch.storage.TypedStorage( + wrap_storage=new_storage._untyped_storage, + dtype=self.dtype, + _internal=True, + ), + self.storage_offset(), + self.size(), + self.stride(), + quantizer_params, + self.requires_grad, + self._backward_hooks, + ) + if type(new_tensor) is not type(self): + raise RuntimeError( + "The default implementation of __deepcopy__() for quantized tensors " + "expects the tensor returned by torch._utils._rebuild_qtensor() to " + "match the type of the instance being copied. If you encounter this, " + "please open an issue on PyTorch's GitHub." + ) + else: + new_tensor = self.new_empty([]) + if type(new_tensor) is not type(self): + raise RuntimeError( + "The default implementation of __deepcopy__() for non-wrapper subclasses " + "only works for subclass types that implement new_empty() and for which " + "that function returns another instance of the same subclass. You should " + "either properly implement new_empty() for your subclass or override " + "__deepcopy__() if it is intended behavior for new_empty() to return " + "an instance of a different type." + ) + new_tensor.set_( + new_storage, self.storage_offset(), self.size(), self.stride() + ) + if self.is_conj(): + new_tensor = new_tensor.conj_physical() + if self.is_neg(): + new_tensor = new_tensor.neg() + if self.requires_grad: + new_tensor.requires_grad_() + if self.grad is not None: + new_tensor.grad = self.grad.__deepcopy__(memo) + + if type(self) is not Tensor: + if type(new_tensor) is not type(self): + raise RuntimeError( + "Type of deepcopy result does not match the type of the source tensor. " + "If you encounter this, please open an issue on PyTorch's GitHub." + ) + + # Plain Tensors don't have slots + slots_to_save = copyreg._slotnames(self.__class__) # type: ignore[attr-defined] + for slot in slots_to_save: + if hasattr(self, slot): + setattr(new_tensor, slot, deepcopy(getattr(self, slot), memo)) + + new_tensor.__dict__ = deepcopy(self.__dict__, memo) + + memo[id(self)] = new_tensor + return new_tensor + + def __reduce_ex__(self, proto): + state = torch._utils._get_obj_state(self) + if type(self) is Tensor and not state: + # Fast path for regular tensor without Python state. + return self._reduce_ex_internal(proto) + if has_torch_function_unary(self): + return handle_torch_function(Tensor.__reduce_ex__, (self,), self, proto) + func, args = self._reduce_ex_internal(proto) + return (_rebuild_from_type_v2, (func, type(self), args, state)) + + def storage(self): + r""" + storage() -> torch.TypedStorage + + Returns the underlying :class:`TypedStorage`. + + .. warning:: + + :class:`TypedStorage` is deprecated. 
It will be removed in the future, and + :class:`UntypedStorage` will be the only storage class. To access the + :class:`UntypedStorage` directly, use :attr:`Tensor.untyped_storage()`. + """ + if has_torch_function_unary(self): + return handle_torch_function(Tensor.storage, (self,), self) + + torch.storage._warn_typed_storage_removal(stacklevel=2) + return self._typed_storage() + + # For internal use only, to avoid raising deprecation warning + def _typed_storage(self): + untyped_storage = self.untyped_storage() + return torch.TypedStorage( + wrap_storage=untyped_storage, dtype=self.dtype, _internal=True + ) + + def _reduce_ex_internal(self, proto): + check_serializing_named_tensor(self) + # See Note [Don't serialize hooks] + torch.utils.hooks.warn_if_has_hooks(self) + backward_hooks: Dict[Any, Any] = OrderedDict() + # Note: Numpy array is chosen to be the rebuild component for XLA, MTIA, ORT Tensors. + # We considered a few options: + # 1. CPU tensor can't be used here. + # Otherwise in torch.load CPU storage is reconstructed with randomly + # initialized data, moved onto backend device, and then storage is updated + # to the serialized content. This works perfectly for CPU/CUDA but not these backends; + # their tensors are disconnected with storage so they don't get the update. + # 2. Python list is not a good fit due to performance reason. + # `tolist()` converts every single element in the tensor into python objects + # and serialize them one by one. + if self.device.type in ["xla", "mtia", "ort"] or ( + not torch._C._has_storage(self) + and self.device.type == torch._C._get_privateuse1_backend_name() + ): + # Convert BFloat16 tesors to Float32 before conversion to numpy, as numpy doesn't + # support BFloat16. The rebuild tensor from numpy takes in the original self.dtype, + # this would reconstruct the BFloat16 tensor from numpy. + numpy_tensor = ( + self.cpu().numpy() + if self.dtype != torch.bfloat16 + else self.cpu().to(torch.float32).numpy() + ) + return ( + torch._utils._rebuild_device_tensor_from_numpy, + (numpy_tensor, self.dtype, str(self.device), self.requires_grad), + ) + if self.device.type == "meta": + # NB: This implementation BREAKS storage sharing. Current + # hypothesis is that no one cares for meta tensors. 
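+ # Example (informal): two meta tensors that aliased the same storage before
+ # serialization come back as independent tensors, since only dtype, shape,
+ # stride and requires_grad are captured below.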
+ arg_meta = ( + self.dtype, + tuple(self.size()), + self.stride(), + self.requires_grad, + ) + return (torch._utils._rebuild_meta_tensor_no_storage, arg_meta) + if self.is_quantized: + # quantizer_params can be different type based on torch attribute + quantizer_params: Union[ + Tuple[torch.qscheme, float, int], Tuple[Any, Tensor, Tensor, int] + ] + if self.qscheme() == torch.per_tensor_affine: + quantizer_params = ( + torch.per_tensor_affine, + self.q_scale(), + self.q_zero_point(), + ) + elif self.qscheme() in ( + torch.per_channel_affine, + torch.per_channel_affine_float_qparams, + ): + # convert scales and zero points to tuple to avoid recursive calls + # when/if we get multi-axis quantized tensors in the future, the shape + # is recoverable from the main tensor shape + quantizer_params = ( + torch.per_channel_affine, + self.q_per_channel_scales(), + self.q_per_channel_zero_points(), + self.q_per_channel_axis(), + ) + else: + raise RuntimeError( + f"Serialization is not supported for tensors of type {self.qscheme()}" + ) + # TODO: Once we decide to break serialization FC, no longer + # need to wrap with TypedStorage + args_qtensor = ( + torch.storage.TypedStorage( + wrap_storage=self._typed_storage()._untyped_storage, + dtype=self.dtype, + _internal=True, + ), + self.storage_offset(), + tuple(self.size()), + self.stride(), + quantizer_params, + self.requires_grad, + backward_hooks, + ) + return (torch._utils._rebuild_qtensor, args_qtensor) + elif self.is_sparse: + if self.layout == torch.sparse_coo: + args_sparse = ( + self.layout, + (self._indices(), self._values(), self.size(), self.is_coalesced()), + ) + else: + raise NotImplementedError( + f"sparse tensor __reduce_ex__ for layout `{self.layout}`" + ) + return (torch._utils._rebuild_sparse_tensor, args_sparse) + elif self.layout in { + torch.sparse_csr, + torch.sparse_csc, + torch.sparse_bsr, + torch.sparse_bsc, + }: + if self.layout in {torch.sparse_csr, torch.sparse_bsr}: + compressed_indices, plain_indices = ( + self.crow_indices(), + self.col_indices(), + ) + else: + compressed_indices, plain_indices = ( + self.ccol_indices(), + self.row_indices(), + ) + args_sparse_compressed = ( + self.layout, + ( + compressed_indices, + plain_indices, + self.values(), + self.size(), + ), + ) + return (torch._utils._rebuild_sparse_tensor, args_sparse_compressed) + elif self.is_nested: + args_nested = ( + # NB: values() currently returns the storage as a buffer in an unsafe way. + # Ideally, we'd use a private API for this instead. TODO: Switch to this if + # we ever get around to adding it. 
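+ # NB: the rebuild args below are (buffer from values(), nested sizes,
+ # nested strides, storage offsets); _rebuild_nested_tensor reassembles the
+ # nested tensor from these.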
+ self.values(), + self._nested_tensor_size(), + self._nested_tensor_strides(), + self._nested_tensor_storage_offsets(), + ) + return (torch._utils._rebuild_nested_tensor, args_nested) + elif ( + self.data_ptr() == 0 + and type(self) is not torch.Tensor + and type(self).__torch_dispatch__ is not torch.Tensor.__torch_dispatch__ + ): + arg_wrapper_subclass = ( + type(self), + self.dtype, + tuple(self.size()), + self.stride(), + self.storage_offset(), + self.layout, + self.device, + self.requires_grad, + ) + return (torch._utils._rebuild_wrapper_subclass, arg_wrapper_subclass) + else: + v3_dtypes = [ + torch.float8_e5m2, + torch.float8_e4m3fn, + torch.float8_e5m2fnuz, + torch.float8_e4m3fnuz, + torch.bits8, + torch.bits16, + torch.bits1x8, + torch.bits2x4, + torch.bits4x2, + torch.complex32, + ] + if self.dtype in v3_dtypes: + rebuild_func = torch._utils._rebuild_tensor_v3 + storage = self.untyped_storage() + else: + # TODO: Once we decide to break serialization FC, no longer + # need to wrap with TypedStorage + rebuild_func = torch._utils._rebuild_tensor_v2 # type: ignore[assignment] + storage = torch.storage.TypedStorage( + wrap_storage=self._typed_storage()._untyped_storage, + dtype=self.dtype, + _internal=True, + ) # type: ignore[assignment] + args = ( + storage, + self.storage_offset(), + tuple(self.size()), + self.stride(), + self.requires_grad, + backward_hooks, + ) # previously was self._backward_hooks + + if isinstance(storage, torch.storage.UntypedStorage): + args = args + (self.dtype,) # type: ignore[assignment] + + metadata = torch._utils.get_tensor_metadata(self) + if metadata: + args = args + (metadata,) # type: ignore[assignment] + + return (rebuild_func, args) + + def __setstate__(self, state): + if has_torch_function_unary(self): + return handle_torch_function(Tensor.__setstate__, (self,), self, state) + # Warning: this method is NOT called when you torch.load() a tensor; + # that is managed by _rebuild_tensor_v2 + if not self.is_leaf: + raise RuntimeError("__setstate__ can be only called on leaf Tensors") + if len(state) == 4: + # legacy serialization of Tensor + self.set_(*state) + return + elif len(state) == 5: + # legacy serialization of Variable + self.data = state[0] + state = (state[3], state[4], state[2]) + # The setting of _backward_hooks is expected to be a no-op. + # See Note [Don't serialize hooks] + self.requires_grad, _, self._backward_hooks = state + + def __repr__(self, *, tensor_contents=None): + if has_torch_function_unary(self): + return handle_torch_function( + Tensor.__repr__, (self,), self, tensor_contents=tensor_contents + ) + # All strings are unicode in Python 3. + return torch._tensor_str._str(self, tensor_contents=tensor_contents) + + def backward( + self, gradient=None, retain_graph=None, create_graph=False, inputs=None + ): + r"""Computes the gradient of current tensor wrt graph leaves. + + The graph is differentiated using the chain rule. If the tensor is + non-scalar (i.e. its data has more than one element) and requires + gradient, the function additionally requires specifying ``gradient``. + It should be a tensor of matching type and location, that contains + the gradient of the differentiated function w.r.t. ``self``. + + This function accumulates gradients in the leaves - you might need to zero + ``.grad`` attributes or set them to ``None`` before calling it. + See :ref:`Default gradient layouts` + for details on the memory layout of accumulated gradients. + + .. 
note:: + + If you run any forward ops, create ``gradient``, and/or call ``backward`` + in a user-specified CUDA stream context, see + :ref:`Stream semantics of backward passes`. + + .. note:: + + When ``inputs`` are provided and a given input is not a leaf, + the current implementation will call its grad_fn (though it is not strictly needed to get this gradients). + It is an implementation detail on which the user should not rely. + See https://github.com/pytorch/pytorch/pull/60521#issuecomment-867061780 for more details. + + Args: + gradient (Tensor or None): Gradient w.r.t. the + tensor. If it is a tensor, it will be automatically converted + to a Tensor that does not require grad unless ``create_graph`` is True. + None values can be specified for scalar Tensors or ones that + don't require grad. If a None value would be acceptable then + this argument is optional. + retain_graph (bool, optional): If ``False``, the graph used to compute + the grads will be freed. Note that in nearly all cases setting + this option to True is not needed and often can be worked around + in a much more efficient way. Defaults to the value of + ``create_graph``. + create_graph (bool, optional): If ``True``, graph of the derivative will + be constructed, allowing to compute higher order derivative + products. Defaults to ``False``. + inputs (sequence of Tensor): Inputs w.r.t. which the gradient will be + accumulated into ``.grad``. All other Tensors will be ignored. If not + provided, the gradient is accumulated into all the leaf Tensors that were + used to compute the attr::tensors. + """ + if has_torch_function_unary(self): + return handle_torch_function( + Tensor.backward, + (self,), + self, + gradient=gradient, + retain_graph=retain_graph, + create_graph=create_graph, + inputs=inputs, + ) + torch.autograd.backward( + self, gradient, retain_graph, create_graph, inputs=inputs + ) + + def register_hook(self, hook): + r"""Registers a backward hook. + + The hook will be called every time a gradient with respect to the + Tensor is computed. The hook should have the following signature:: + + hook(grad) -> Tensor or None + + + The hook should not modify its argument, but it can optionally return + a new gradient which will be used in place of :attr:`grad`. + + This function returns a handle with a method ``handle.remove()`` + that removes the hook from the module. + + .. note:: + See :ref:`backward-hooks-execution` for more information on how when this hook + is executed, and how its execution is ordered relative to other hooks. + + Example:: + + >>> v = torch.tensor([0., 0., 0.], requires_grad=True) + >>> h = v.register_hook(lambda grad: grad * 2) # double the gradient + >>> v.backward(torch.tensor([1., 2., 3.])) + >>> v.grad + + 2 + 4 + 6 + [torch.FloatTensor of size (3,)] + + >>> h.remove() # removes the hook + """ + if has_torch_function_unary(self): + return handle_torch_function(Tensor.register_hook, (self,), self, hook) + if not self.requires_grad: + raise RuntimeError( + "cannot register a hook on a tensor that doesn't require gradient" + ) + if self._backward_hooks is None: + self._backward_hooks = OrderedDict() + if self.grad_fn is not None: + self.grad_fn._register_hook_dict(self) + handle = hooks.RemovableHandle(self._backward_hooks) + self._backward_hooks[handle.id] = hook + return handle + + def register_post_accumulate_grad_hook(self, hook): + r"""Registers a backward hook that runs after grad accumulation. 
+ + The hook will be called after all gradients for a tensor have been accumulated, + meaning that the .grad field has been updated on that tensor. The post + accumulate grad hook is ONLY applicable for leaf tensors (tensors without a + .grad_fn field). Registering this hook on a non-leaf tensor will error! + + The hook should have the following signature:: + + hook(param: Tensor) -> None + + Note that, unlike other autograd hooks, this hook operates on the tensor + that requires grad and not the grad itself. The hook can in-place modify + and access its Tensor argument, including its .grad field. + + This function returns a handle with a method ``handle.remove()`` + that removes the hook from the module. + + .. note:: + See :ref:`backward-hooks-execution` for more information on how when this hook + is executed, and how its execution is ordered relative to other hooks. Since + this hook runs during the backward pass, it will run in no_grad mode (unless + create_graph is True). You can use torch.enable_grad() to re-enable autograd + within the hook if you need it. + + Example:: + + >>> v = torch.tensor([0., 0., 0.], requires_grad=True) + >>> lr = 0.01 + >>> # simulate a simple SGD update + >>> h = v.register_post_accumulate_grad_hook(lambda p: p.add_(p.grad, alpha=-lr)) + >>> v.backward(torch.tensor([1., 2., 3.])) + >>> v + tensor([-0.0100, -0.0200, -0.0300], requires_grad=True) + + >>> h.remove() # removes the hook + """ + if has_torch_function_unary(self): + return handle_torch_function( + Tensor.register_post_accumulate_grad_hook, (self,), self, hook + ) + if not self.requires_grad: + raise RuntimeError( + "cannot register a hook on a tensor that doesn't require gradient" + ) + if self.grad_fn is not None: + raise RuntimeError( + "post accumulate grad hooks cannot be registered on non-leaf tensors" + ) + if self._post_accumulate_grad_hooks is None: + self._post_accumulate_grad_hooks: Dict[Any, Any] = OrderedDict() + handle = hooks.RemovableHandle(self._post_accumulate_grad_hooks) + self._post_accumulate_grad_hooks[handle.id] = hook + return handle + + def reinforce(self, reward): + def trim(str): + return "\n".join([line.strip() for line in str.split("\n")]) + + raise RuntimeError( + trim( + r"""reinforce() was removed. + Use torch.distributions instead. + See https://pytorch.org/docs/master/distributions.html + + Instead of: + + probs = policy_network(state) + action = probs.multinomial() + next_state, reward = env.step(action) + action.reinforce(reward) + action.backward() + + Use: + + probs = policy_network(state) + # NOTE: categorical is equivalent to what used to be called multinomial + m = torch.distributions.Categorical(probs) + action = m.sample() + next_state, reward = env.step(action) + loss = -m.log_prob(action) * reward + loss.backward() + """ + ) + ) + + detach = _C._add_docstr( + _C.TensorBase.detach, + r""" + Returns a new Tensor, detached from the current graph. + + The result will never require gradient. + + This method also affects forward mode AD gradients and the result will never + have forward mode AD gradients. + + .. note:: + + Returned Tensor shares the same storage with the original one. + In-place modifications on either of them will be seen, and may trigger + errors in correctness checks. + """, + ) + + detach_ = _C._add_docstr( + _C.TensorBase.detach_, + r""" + Detaches the Tensor from the graph that created it, making it a leaf. + Views cannot be detached in-place. 
+ + This method also affects forward mode AD gradients and the result will never + have forward mode AD gradients. + """, + ) + + def is_shared(self): + r"""Checks if tensor is in shared memory. + + This is always ``True`` for CUDA tensors. + """ + if has_torch_function_unary(self): + return handle_torch_function(Tensor.is_shared, (self,), self) + return self._typed_storage()._is_shared() + + def share_memory_(self): + r"""Moves the underlying storage to shared memory. + + This is a no-op if the underlying storage is already in shared memory + and for CUDA tensors. Tensors in shared memory cannot be resized. + + See :meth:`torch.UntypedStorage.share_memory_` for more details. + """ + if has_torch_function_unary(self): + return handle_torch_function(Tensor.share_memory_, (self,), self) + self._typed_storage()._share_memory_() + return self + + def module_load(self, other, assign=False): + r"""Defines how to transform ``other`` when loading it into ``self`` in :meth:`~nn.Module.load_state_dict`. + + Used when :func:`~torch.__future__.get_swap_module_params_on_conversion` is ``True``. + + It is expected that ``self`` is a parameter or buffer in an ``nn.Module`` and ``other`` is the + value in the state dictionary with the corresponding key, this method defines + how ``other`` is remapped before being swapped with ``self`` via + :func:`~torch.utils.swap_tensors`` in ``module.load_state_dict()``. + + .. note:: + This method should always return a new object that is not ``self`` or ``other``. + For example, the default implementation returns ``self.copy_(other).detach()`` + if ``assign`` is ``False`` or ``other.detach()`` if ``assign`` is ``True``. + + Args: + other (Tensor): value in state dict with key corresponding to ``self`` + assign (bool): the assign argument passed to :meth:`nn.Module.load_state_dict` + + """ + if has_torch_function_variadic(self, other): + return handle_torch_function( + Tensor.module_load, (self, other), self, other, assign=assign + ) + + if assign: + return other.detach() + else: + return self.copy_(other).detach() + + def __reversed__(self): + r"""Reverses the tensor along dimension 0.""" + if has_torch_function_unary(self): + return handle_torch_function(Tensor.__reversed__, (self,), self) + if self.dim() == 0: + return self + else: + return self.flip(0) + + def norm( + self, + p: Optional[Union[float, str]] = "fro", + dim=None, + keepdim=False, + dtype=None, + ): + r"""See :func:`torch.norm`""" + if has_torch_function_unary(self): + return handle_torch_function( + Tensor.norm, (self,), self, p=p, dim=dim, keepdim=keepdim, dtype=dtype + ) + return torch.norm(self, p, dim, keepdim, dtype=dtype) + + def solve(self, other): + from ._linalg_utils import solve + + return solve(self, other) + + def lstsq(self, other): + from ._linalg_utils import lstsq + + return lstsq(self, other) + + def eig(self, eigenvectors=False): + from ._linalg_utils import eig + + return eig(self, eigenvectors=eigenvectors) + + def symeig(self, eigenvectors=False): + from ._linalg_utils import _symeig + + return _symeig(self, eigenvectors=eigenvectors) + + def lu(self, pivot=True, get_infos=False): + r"""See :func:`torch.lu`""" + # If get_infos is True, then we don't need to check for errors and vice versa + if has_torch_function_unary(self): + return handle_torch_function( + Tensor.lu, (self,), self, pivot=pivot, get_infos=get_infos + ) + + LU, pivots, infos = torch._lu_with_info( + self, pivot=pivot, check_errors=(not get_infos) + ) + if get_infos: + return LU, pivots, infos + else: + return 
LU, pivots + + def stft( + self, + n_fft: int, + hop_length: Optional[int] = None, + win_length: Optional[int] = None, + window: "Optional[Tensor]" = None, + center: bool = True, + pad_mode: str = "reflect", + normalized: bool = False, + onesided: Optional[bool] = None, + return_complex: Optional[bool] = None, + ): + r"""See :func:`torch.stft` + + .. warning:: + This function changed signature at version 0.4.1. Calling with + the previous signature may cause error or return incorrect result. + """ + if has_torch_function_unary(self): + return handle_torch_function( + Tensor.stft, + (self,), + self, + n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + center=center, + pad_mode=pad_mode, + normalized=normalized, + onesided=onesided, + return_complex=return_complex, + ) + return torch.stft( + self, + n_fft, + hop_length, + win_length, + window, + center, + pad_mode, + normalized, + onesided, + return_complex=return_complex, + ) + + def istft( + self, + n_fft: int, + hop_length: Optional[int] = None, + win_length: Optional[int] = None, + window: "Optional[Tensor]" = None, + center: bool = True, + normalized: bool = False, + onesided: Optional[bool] = None, + length: Optional[int] = None, + return_complex: bool = False, + ): + r"""See :func:`torch.istft`""" + if has_torch_function_unary(self): + return handle_torch_function( + Tensor.istft, + (self,), + self, + n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + center=center, + normalized=normalized, + onesided=onesided, + length=length, + return_complex=return_complex, + ) + return torch.istft( + self, + n_fft, + hop_length, + win_length, + window, + center, + normalized, + onesided, + length, + return_complex=return_complex, + ) + + def resize(self, *sizes): + if has_torch_function_unary(self): + return handle_torch_function(Tensor.resize, (self,), self, *sizes) + warnings.warn("non-inplace resize is deprecated") + from torch.autograd._functions import Resize + + return Resize.apply(self, sizes) + + def resize_as(self, tensor): + if has_torch_function_variadic(self, tensor): + return handle_torch_function(Tensor.resize_as, (self, tensor), self, tensor) + warnings.warn("non-inplace resize_as is deprecated") + from torch.autograd._functions import Resize + + return Resize.apply(self, tensor.size()) + + def split(self, split_size, dim=0): + r"""See :func:`torch.split`""" + if has_torch_function_unary(self): + return handle_torch_function( + Tensor.split, (self,), self, split_size, dim=dim + ) + if isinstance(split_size, Tensor): + try: + split_size = int(split_size) + except ValueError: + pass + + if isinstance(split_size, (int, torch.SymInt)): + return torch._VF.split(self, split_size, dim) # type: ignore[attr-defined] + else: + return torch._VF.split_with_sizes(self, split_size, dim) + + def unique(self, sorted=True, return_inverse=False, return_counts=False, dim=None): + r"""Returns the unique elements of the input tensor. + + See :func:`torch.unique` + """ + if has_torch_function_unary(self): + return handle_torch_function( + Tensor.unique, + (self,), + self, + sorted=sorted, + return_inverse=return_inverse, + return_counts=return_counts, + dim=dim, + ) + return torch.unique( + self, + sorted=sorted, + return_inverse=return_inverse, + return_counts=return_counts, + dim=dim, + ) + + def unique_consecutive(self, return_inverse=False, return_counts=False, dim=None): + r"""Eliminates all but the first element from every consecutive group of equivalent elements. 
+ + See :func:`torch.unique_consecutive` + """ + if has_torch_function_unary(self): + return handle_torch_function( + Tensor.unique_consecutive, + (self,), + self, + return_inverse=return_inverse, + return_counts=return_counts, + dim=dim, + ) + return torch.unique_consecutive( + self, return_inverse=return_inverse, return_counts=return_counts, dim=dim + ) + + @_handle_torch_function_and_wrap_type_error_to_not_implemented + def __rsub__(self, other): + return _C._VariableFunctions.rsub(self, other) + + @_handle_torch_function_and_wrap_type_error_to_not_implemented + def __rdiv__(self, other): + return self.reciprocal() * other + + __rtruediv__ = __rdiv__ + __itruediv__ = _C.TensorBase.__idiv__ + + __pow__ = _handle_torch_function_and_wrap_type_error_to_not_implemented( + _C.TensorBase.pow + ) + __ipow__ = _handle_torch_function_and_wrap_type_error_to_not_implemented( + _C.TensorBase.pow_ + ) + + @_handle_torch_function_and_wrap_type_error_to_not_implemented + def __rmod__(self, other): + return torch.remainder(other, self) + + def __format__(self, format_spec): + if has_torch_function_unary(self): + return handle_torch_function(Tensor.__format__, (self,), self, format_spec) + if self.dim() == 0 and not self.is_meta and type(self) is Tensor: + return self.item().__format__(format_spec) + return object.__format__(self, format_spec) + + @_handle_torch_function_and_wrap_type_error_to_not_implemented + def __rpow__(self, other): + return torch.pow(other, self) + + @_handle_torch_function_and_wrap_type_error_to_not_implemented + def __floordiv__(self, other): + return torch.floor_divide(self, other) + + @_handle_torch_function_and_wrap_type_error_to_not_implemented + def __rfloordiv__(self, other): + return torch.floor_divide(other, self) + + @_handle_torch_function_and_wrap_type_error_to_not_implemented + def __rlshift__(self, other): + return torch.bitwise_left_shift(other, self) + + @_handle_torch_function_and_wrap_type_error_to_not_implemented + def __rrshift__(self, other): + return torch.bitwise_right_shift(other, self) + + @_handle_torch_function_and_wrap_type_error_to_not_implemented + def __rmatmul__(self, other): + return torch.matmul(other, self) + + __pos__ = _C.TensorBase.positive + __neg__ = _C.TensorBase.neg + __abs__ = _C.TensorBase.abs + + def __len__(self): + if has_torch_function_unary(self): + return handle_torch_function(Tensor.__len__, (self,), self) + if self.dim() == 0: + raise TypeError("len() of a 0-d tensor") + if torch._C._get_tracing_state(): + warnings.warn( + "Using len to get tensor shape might cause the trace to be incorrect. " + "Recommended usage would be tensor.shape[0]. " + "Passing a tensor of different shape might lead to errors or silently give " + "incorrect results.", + category=torch.jit.TracerWarning, + stacklevel=2, + ) + return self.shape[0] + + def __iter__(self): + # NB: we use 'imap' and not 'map' here, so that in Python 2 we get a + # generator and don't eagerly perform all the indexes. This could + # save us work, and also helps keep trace ordering deterministic + # (e.g., if you zip(*hiddens), the eager map will force all the + # indexes of hiddens[0] before hiddens[1], while the generator + # map will interleave them.) + # NB: We have intentionally skipped __torch_function__ dispatch here. + # See gh-54457 + if self.dim() == 0: + raise TypeError("iteration over a 0-d tensor") + if torch._C._get_tracing_state(): + warnings.warn( + "Iterating over a tensor might cause the trace to be incorrect. 
" + "Passing a tensor of different shape won't change the number of " + "iterations executed (and might lead to errors or silently give " + "incorrect results).", + category=torch.jit.TracerWarning, + stacklevel=2, + ) + return iter(self.unbind(0)) + + def __hash__(self): + # Do NOT handle __torch_function__ here as user's default + # implementation that handle most functions will most likely do it wrong. + # It can be easily overridden by defining this method on the user + # subclass if needed. + return id(self) + + def __dir__(self): + if has_torch_function_unary(self): + return handle_torch_function(Tensor.__dir__, (self,), self) + tensor_methods = dir(self.__class__) + tensor_methods.remove("volatile") # deprecated + attrs = list(self.__dict__.keys()) + keys = tensor_methods + attrs + + # property only available dense, cuda tensors + if (not self.is_cuda) or self.is_sparse: + keys.remove("__cuda_array_interface__") + + return sorted(keys) + + # Numpy array interface, to support `numpy.asarray(tensor) -> ndarray` + __array_priority__ = 1000 # prefer Tensor ops over numpy ones + + def __array__(self, dtype=None): + if has_torch_function_unary(self): + return handle_torch_function(Tensor.__array__, (self,), self, dtype=dtype) + if dtype is None: + return self.numpy() + else: + return self.numpy().astype(dtype, copy=False) + + # Wrap Numpy array again in a suitable tensor when done, to support e.g. + # `numpy.sin(tensor) -> tensor` or `numpy.greater(tensor, 0) -> ByteTensor` + def __array_wrap__(self, array): + if has_torch_function_unary(self): + return handle_torch_function( + Tensor.__array_wrap__, (self,), self, array=array + ) + if array.dtype == bool: + # Workaround, torch has no built-in bool tensor + array = array.astype("uint8") + return torch.from_numpy(array) + + def __contains__(self, element): + r"""Check if `element` is present in tensor + + Args: + element (Tensor or scalar): element to be checked + for presence in current tensor" + """ + if has_torch_function_unary(self): + return handle_torch_function(Tensor.__contains__, (self,), self, element) + if isinstance( + element, (torch.Tensor, Number, torch.SymInt, torch.SymFloat, torch.SymBool) + ): + # type hint doesn't understand the __contains__ result array + return (element == self).any().item() # type: ignore[union-attr] + + raise RuntimeError( + f"Tensor.__contains__ only supports Tensor or scalar, but you passed in a {type(element)}." + ) + + @property + def __cuda_array_interface__(self): + """Array view description for cuda tensors. + + See: + https://numba.pydata.org/numba-doc/latest/cuda/cuda_array_interface.html + """ + if has_torch_function_unary(self): + # TODO mypy doesn't support @property, see: https://github.com/python/mypy/issues/6185 + return handle_torch_function(Tensor.__cuda_array_interface__.__get__, (self,), self) # type: ignore[attr-defined] + + # raise AttributeError for unsupported tensors, so that + # hasattr(cpu_tensor, "__cuda_array_interface__") is False. + if not self.is_cuda: + raise AttributeError( + "Can't get __cuda_array_interface__ on non-CUDA tensor type: %s " + "If CUDA data is required use tensor.cuda() to copy tensor to device memory." + % self.type() + ) + + if self.is_sparse: + raise AttributeError( + "Can't get __cuda_array_interface__ on sparse type: %s " + "Use Tensor.to_dense() to convert to a dense tensor first." + % self.type() + ) + + # RuntimeError, matching tensor.__array__() behavior. 
+        if self.requires_grad:
+            raise RuntimeError(
+                "Can't get __cuda_array_interface__ on Variable that requires grad. "
+                "If gradients aren't required, use var.detach() to get Variable that doesn't require grad."
+            )
+
+        # CUDA devices are little-endian and tensors are stored in native byte
+        # order. 1-byte entries are endian-agnostic.
+        typestr = {
+            torch.complex64: "<c8",
+            torch.complex128: "<c16",
+            torch.bfloat16: "<f2",
+            torch.float16: "<f2",
+            torch.float32: "<f4",
+            torch.float64: "<f8",
+            torch.uint8: "|u1",
+            torch.int8: "|i1",
+            torch.int16: "<i2",
+            torch.int32: "<i4",
+            torch.int64: "<i8",
+            torch.bool: "|b1",
+        }[self.dtype]
+
+        itemsize = self.element_size()
+
+        shape = tuple(self.shape)
+        if self.is_contiguous():
+            # __cuda_array_interface__ v2 requires the strides to be omitted
+            # (either not set or set to None) for C-contiguous arrays.
+            strides = None
+        else:
+            strides = tuple(s * itemsize for s in self.stride())
+        data_ptr = self.data_ptr() if self.numel() > 0 else 0
+        data = (data_ptr, False)  # read-only is false
+
+        return dict(typestr=typestr, shape=shape, strides=strides, data=data, version=2)
+
+    def storage_type(self):
+        r"""storage_type() -> type
+
+        Returns the type of the underlying storage.
+
+        """
+        if has_torch_function_unary(self):
+            return handle_torch_function(Tensor.storage_type, (self,), self)
+
+        torch.storage._warn_typed_storage_removal()
+
+        return self._typed_storage()._get_legacy_storage_class()
+
+    def refine_names(self, *names):
+        r"""Refines the dimension names of :attr:`self` according to :attr:`names`.
+
+        Refining is a special case of renaming that "lifts" unnamed dimensions.
+        A ``None`` dim can be refined to have any name; a named dim can only be
+        refined to have the same name.
+
+        Because named tensors can coexist with unnamed tensors, refining names
+        gives a nice way to write named-tensor-aware code that works with both
+        named and unnamed tensors.
+
+        :attr:`names` may contain up to one Ellipsis (``...``).
+        The Ellipsis is expanded greedily; it is expanded in-place to fill
+        :attr:`names` to the same length as ``self.dim()`` using names from the
+        corresponding indices of ``self.names``.
+
+        Python 2 does not support Ellipsis but one may use a string literal
+        instead (``'...'``).
+
+        Args:
+            names (iterable of str): The desired names of the output tensor. May
+                contain up to one Ellipsis.
+
+        Examples::
+
+            >>> imgs = torch.randn(32, 3, 128, 128)
+            >>> named_imgs = imgs.refine_names('N', 'C', 'H', 'W')
+            >>> named_imgs.names
+            ('N', 'C', 'H', 'W')
+
+            >>> tensor = torch.randn(2, 3, 5, 7, 11)
+            >>> tensor = tensor.refine_names('A', ..., 'B', 'C')
+            >>> tensor.names
+            ('A', None, None, 'B', 'C')
+
+        .. warning::
+            The named tensor API is experimental and subject to change.
+
+        """
+        if has_torch_function_unary(self):
+            return handle_torch_function(Tensor.refine_names, (self,), self, *names)
+        names = resolve_ellipsis(names, self.names, "refine_names")
+        return super().refine_names(names)
+
+    def align_to(self, *names):
+        r"""Permutes the dimensions of the :attr:`self` tensor to match the order
+        specified in :attr:`names`, adding size-one dims for any new names.
+
+        All of the dims of :attr:`self` must be named in order to use this method.
+        The resulting tensor is a view on the original tensor.
+
+        All dimension names of :attr:`self` must be present in :attr:`names`.
+        :attr:`names` may contain additional names that are not in ``self.names``;
+        the output tensor has a size-one dimension for each of those new names.
+
+        :attr:`names` may contain up to one Ellipsis (``...``).
+        The Ellipsis is expanded to be equal to all dimension names of :attr:`self`
+        that are not mentioned in :attr:`names`, in the order that they appear
+        in :attr:`self`.
+
+        Python 2 does not support Ellipsis but one may use a string literal
+        instead (``'...'``).
+
+        Args:
+            names (iterable of str): The desired dimension ordering of the
+                output tensor. May contain up to one Ellipsis that is expanded
+                to all unmentioned dim names of :attr:`self`.
+ + Examples:: + + >>> tensor = torch.randn(2, 2, 2, 2, 2, 2) + >>> named_tensor = tensor.refine_names('A', 'B', 'C', 'D', 'E', 'F') + + # Move the F and E dims to the front while keeping the rest in order + >>> named_tensor.align_to('F', 'E', ...) + + .. warning:: + The named tensor API is experimental and subject to change. + + """ + if has_torch_function_unary(self): + return handle_torch_function(Tensor.align_to, (self,), self, *names) + ellipsis_idx = single_ellipsis_index(names, "align_to") + if ellipsis_idx is None: + return super().align_to(names) + return super().align_to( + [name for name in names if not is_ellipsis(name)], ellipsis_idx + ) + + def unflatten(self, dim, sizes): + r""" + unflatten(dim, sizes) -> Tensor + + See :func:`torch.unflatten`. + + """ + if has_torch_function_unary(self): + return handle_torch_function(Tensor.unflatten, (self,), self, dim, sizes) + + if not sizes: + raise RuntimeError("unflatten: sizes must be non-empty") + + names = None + if isinstance(sizes, OrderedDict) or ( + isinstance(sizes, (tuple, list)) and isinstance(sizes[0], (tuple, list)) + ): + names, sizes = unzip_namedshape(sizes) + return super().unflatten(dim, sizes, names) + else: + return super().unflatten(dim, sizes) + + def rename_(self, *names, **rename_map): + """In-place version of :meth:`~Tensor.rename`.""" + + if has_torch_function_unary(self): + return handle_torch_function( + Tensor.rename_, (self,), self, *names, **rename_map + ) + + # Note [rename_ / rename API] + # The Python API for these is different from the C++ API. In Python: + # 1) tensor.rename(*names) takes a vararglist of names + # 2) tensor.rename(**rename_map) takes a map of names to rename. + # C++ is static, making it difficult to implement similar behavior. + return update_names(self, names, rename_map, inplace=True) + + def rename(self, *names, **rename_map): + """Renames dimension names of :attr:`self`. + + There are two main usages: + + ``self.rename(**rename_map)`` returns a view on tensor that has dims + renamed as specified in the mapping :attr:`rename_map`. + + ``self.rename(*names)`` returns a view on tensor, renaming all + dimensions positionally using :attr:`names`. + Use ``self.rename(None)`` to drop names on a tensor. + + One cannot specify both positional args :attr:`names` and keyword args + :attr:`rename_map`. + + Examples:: + + >>> imgs = torch.rand(2, 3, 5, 7, names=('N', 'C', 'H', 'W')) + >>> renamed_imgs = imgs.rename(N='batch', C='channels') + >>> renamed_imgs.names + ('batch', 'channels', 'H', 'W') + + >>> renamed_imgs = imgs.rename(None) + >>> renamed_imgs.names + (None, None, None, None) + + >>> renamed_imgs = imgs.rename('batch', 'channel', 'height', 'width') + >>> renamed_imgs.names + ('batch', 'channel', 'height', 'width') + + .. warning:: + The named tensor API is experimental and subject to change. + + """ + if has_torch_function_unary(self): + return handle_torch_function( + Tensor.rename, (self,), self, *names, **rename_map + ) + + # See Note [rename_ / rename API] + return update_names(self, names, rename_map, inplace=False) + + def to_sparse_coo(self): + """Convert a tensor to :ref:`coordinate format `. + + Examples:: + + >>> dense = torch.randn(5, 5) + >>> sparse = dense.to_sparse_coo() + >>> sparse._nnz() + 25 + + """ + return self.to_sparse() + + def dim_order(self): + """ + + dim_order() -> tuple + + Returns a tuple of int describing the dim order or physical layout of :attr:`self`. 
+ + Args: + None + + Dim order represents how dimensions are laid out in memory, + starting from the outermost to the innermost dimension. + + Example:: + >>> torch.empty((2, 3, 5, 7)).dim_order() + (0, 1, 2, 3) + >>> torch.empty((2, 3, 5, 7), memory_format=torch.channels_last).dim_order() + (0, 2, 3, 1) + + .. warning:: + The dim_order tensor API is experimental and subject to change. + + """ + if has_torch_function_unary(self): + return handle_torch_function(Tensor.dim_order, (self,), self) + + import torch._prims_common as utils + + return tuple(utils.compute_elementwise_output_logical_to_physical_perm(self)) + + def _update_names(self, names, inplace): + if has_torch_function_unary(self): + return handle_torch_function( + Tensor._update_names, (self,), self, names, inplace + ) + + # See Note [rename_ / rename API] + if inplace: + return super().rename_(names) + else: + return super().rename(names) + + @classmethod + def __torch_function__(cls, func, types, args=(), kwargs=None): + """ + This __torch_function__ implementation wraps subclasses such that + methods called on subclasses return a subclass instance instead of + a ``torch.Tensor`` instance. + + One corollary to this is that you need coverage for torch.Tensor + methods if implementing __torch_function__ for subclasses. + + We recommend always calling ``super().__torch_function__`` as the base + case when doing the above. + + While not mandatory, we recommend making `__torch_function__` a classmethod. + """ + if kwargs is None: + kwargs = {} + + if not all(issubclass(cls, t) for t in types): + return NotImplemented + + with _C.DisableTorchFunctionSubclass(): + ret = func(*args, **kwargs) + if func in get_default_nowrap_functions(): + return ret + else: + return _convert(ret, cls) + + __torch_dispatch__ = _C._disabled_torch_dispatch_impl + + def __dlpack__(self, stream=None): + """ + Creates a DLpack `capsule https://data-apis.org/array-api/latest/design_topics/data_interchange.html#data-interchange`_ + of the current tensor to be exported to other libraries. + + This function will be called from the `from_dlpack` method + of the library that will consume the capsule. `from_dlpack` passes the current + stream to this method as part of the specification. + + Args: + stream (integer or None): An optional Python integer representing a + pointer to a CUDA stream. The current stream is synchronized with + this stream before the capsule is created, and since the capsule + shares its storage with the tensor this make it safe to access from + both streams. If None or -1 is passed then no synchronization is performed. + If 1 (on CUDA) or 0 (on ROCM) then the default stream is used for + synchronization. + """ + if has_torch_function_unary(self): + return handle_torch_function(Tensor.__dlpack__, (self,), self, stream) + + # DLPack capsules can't capture all of PyTorch's semantics, + # so we prohibit exporting tensors that would lose their properties like + # requires_grad and having the conjugate bit set. + if self.requires_grad: + raise RuntimeError( + "Can't export tensors that require gradient, use tensor.detach()" + ) + if self.is_conj(): + raise RuntimeError("Can't export tensors with the conjugate bit set") + if self.layout != torch.strided: + raise RuntimeError( + "Can't export tensors with layout other than torch.strided" + ) + + if stream is not None and type(stream) is not int: + # Stream pointers in CUDA/ROCm are uniquely numbered and can + # be retrieved from their integer value. 
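+            # For context, a minimal round trip that exercises this method might
+            # look like the following sketch (illustrative; assumes a CUDA device):
+            #
+            #     x = torch.arange(3, device="cuda")
+            #     raw = torch.cuda.current_stream().cuda_stream  # stream as an int
+            #     capsule = x.__dlpack__(stream=raw)
+            #     y = torch.utils.dlpack.from_dlpack(capsule)    # shares x's memory
+            #
+            # Anything other than an int (or None) for ``stream`` is rejected here: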
+ raise TypeError("stream must be ``int`` or ``none``") + elif stream is not None and stream != -1: + if self.device.type == "cuda": + # NB: This logic handles the special case values for default + # streams and must be kept in sync with from_dlpack in + # torch/utils/dlpack.py + if stream == 1 and torch.version.hip is None: + stream = torch.cuda.default_stream() + elif stream == 0 and torch.version.hip is not None: + stream = torch.cuda.default_stream() + else: + stream = torch.cuda.ExternalStream(stream) + # Only synchronize on different streams + sync_stream = torch.cuda.current_stream() + if stream != sync_stream: + event = torch.cuda.Event() + event.record(sync_stream) + stream.wait_event(event) + return torch.to_dlpack(self) + + def __dlpack_device__(self) -> Tuple[enum.IntEnum, int]: + if has_torch_function_unary(self): + return handle_torch_function(Tensor.__dlpack_device__, (self,), self) + device = self.device + idx = device.index if device.index is not None else 0 + torch_device_type = device.type + if torch_device_type == "cuda" and torch.version.hip is not None: + device_type = DLDeviceType.kDLROCM + elif torch_device_type == "cpu" and self.is_pinned(): + device_type = DLDeviceType.kDLCPUPinned + elif torch_device_type == "cuda": + device_type = DLDeviceType.kDLGPU + elif torch_device_type == "cpu": + device_type = DLDeviceType.kDLCPU + elif self.device.type == "xpu": + device_type = DLDeviceType.kDLOneAPI + else: + raise ValueError(f"Unknown device type {torch_device_type} for Dlpack") + return (device_type, idx) + + __module__ = "torch" + + +def _convert(ret, cls): + if cls is Tensor: + return ret + + if isinstance(ret, Tensor) and not isinstance(ret, cls): + ret = ret.as_subclass(cls) + + if isinstance(ret, (tuple, list)): + # Also handles things like namedtuples + ret = type(ret)(_convert(r, cls) for r in ret) + + return ret diff --git a/MLPY/Lib/site-packages/torch/_tensor_docs.py b/MLPY/Lib/site-packages/torch/_tensor_docs.py new file mode 100644 index 0000000000000000000000000000000000000000..c0fa09eaf864be4301db09f40bbe854d702e4555 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_tensor_docs.py @@ -0,0 +1,6976 @@ +"""Adds docstrings to Tensor functions""" + +import torch._C +from torch._C import _add_docstr as add_docstr +from torch._torch_docs import parse_kwargs, reproducibility_notes + + +def add_docstr_all(method, docstr): + add_docstr(getattr(torch._C.TensorBase, method), docstr) + + +common_args = parse_kwargs( + """ + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. +""" +) + +new_common_args = parse_kwargs( + """ + size (int...): a list, tuple, or :class:`torch.Size` of integers defining the + shape of the output tensor. + dtype (:class:`torch.dtype`, optional): the desired type of returned tensor. + Default: if None, same :class:`torch.dtype` as this tensor. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if None, same :class:`torch.device` as this tensor. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. 
+""" +) + +add_docstr_all( + "new_tensor", + """ +new_tensor(data, *, dtype=None, device=None, requires_grad=False, layout=torch.strided, \ +pin_memory=False) -> Tensor +""" + + r""" + +Returns a new Tensor with :attr:`data` as the tensor data. +By default, the returned Tensor has the same :class:`torch.dtype` and +:class:`torch.device` as this tensor. + +.. warning:: + + :func:`new_tensor` always copies :attr:`data`. If you have a Tensor + ``data`` and want to avoid a copy, use :func:`torch.Tensor.requires_grad_` + or :func:`torch.Tensor.detach`. + If you have a numpy array and want to avoid a copy, use + :func:`torch.from_numpy`. + +.. warning:: + + When data is a tensor `x`, :func:`new_tensor()` reads out 'the data' from whatever it is passed, + and constructs a leaf variable. Therefore ``tensor.new_tensor(x)`` is equivalent to ``x.clone().detach()`` + and ``tensor.new_tensor(x, requires_grad=True)`` is equivalent to ``x.clone().detach().requires_grad_(True)``. + The equivalents using ``clone()`` and ``detach()`` are recommended. + +Args: + data (array_like): The returned Tensor copies :attr:`data`. + +Keyword args: + {dtype} + {device} + {requires_grad} + {layout} + {pin_memory} + +Example:: + + >>> tensor = torch.ones((2,), dtype=torch.int8) + >>> data = [[0, 1], [2, 3]] + >>> tensor.new_tensor(data) + tensor([[ 0, 1], + [ 2, 3]], dtype=torch.int8) + +""".format( + **new_common_args + ), +) + +add_docstr_all( + "new_full", + """ +new_full(size, fill_value, *, dtype=None, device=None, requires_grad=False, layout=torch.strided, \ +pin_memory=False) -> Tensor +""" + + r""" + +Returns a Tensor of size :attr:`size` filled with :attr:`fill_value`. +By default, the returned Tensor has the same :class:`torch.dtype` and +:class:`torch.device` as this tensor. + +Args: + fill_value (scalar): the number to fill the output tensor with. + +Keyword args: + {dtype} + {device} + {requires_grad} + {layout} + {pin_memory} + +Example:: + + >>> tensor = torch.ones((2,), dtype=torch.float64) + >>> tensor.new_full((3, 4), 3.141592) + tensor([[ 3.1416, 3.1416, 3.1416, 3.1416], + [ 3.1416, 3.1416, 3.1416, 3.1416], + [ 3.1416, 3.1416, 3.1416, 3.1416]], dtype=torch.float64) + +""".format( + **new_common_args + ), +) + +add_docstr_all( + "new_empty", + """ +new_empty(size, *, dtype=None, device=None, requires_grad=False, layout=torch.strided, \ +pin_memory=False) -> Tensor +""" + + r""" + +Returns a Tensor of size :attr:`size` filled with uninitialized data. +By default, the returned Tensor has the same :class:`torch.dtype` and +:class:`torch.device` as this tensor. + +Args: + size (int...): a list, tuple, or :class:`torch.Size` of integers defining the + shape of the output tensor. + +Keyword args: + {dtype} + {device} + {requires_grad} + {layout} + {pin_memory} + +Example:: + + >>> tensor = torch.ones(()) + >>> tensor.new_empty((2, 3)) + tensor([[ 5.8182e-18, 4.5765e-41, -1.0545e+30], + [ 3.0949e-41, 4.4842e-44, 0.0000e+00]]) + +""".format( + **new_common_args + ), +) + +add_docstr_all( + "new_empty_strided", + """ +new_empty_strided(size, stride, dtype=None, device=None, requires_grad=False, layout=torch.strided, \ +pin_memory=False) -> Tensor +""" + + r""" + +Returns a Tensor of size :attr:`size` and strides :attr:`stride` filled with +uninitialized data. By default, the returned Tensor has the same +:class:`torch.dtype` and :class:`torch.device` as this tensor. + +Args: + size (int...): a list, tuple, or :class:`torch.Size` of integers defining the + shape of the output tensor. 
+ +Keyword args: + {dtype} + {device} + {requires_grad} + {layout} + {pin_memory} + +Example:: + + >>> tensor = torch.ones(()) + >>> tensor.new_empty_strided((2, 3), (3, 1)) + tensor([[ 5.8182e-18, 4.5765e-41, -1.0545e+30], + [ 3.0949e-41, 4.4842e-44, 0.0000e+00]]) + +""".format( + **new_common_args + ), +) + +add_docstr_all( + "new_ones", + """ +new_ones(size, *, dtype=None, device=None, requires_grad=False, layout=torch.strided, \ +pin_memory=False) -> Tensor +""" + + r""" + +Returns a Tensor of size :attr:`size` filled with ``1``. +By default, the returned Tensor has the same :class:`torch.dtype` and +:class:`torch.device` as this tensor. + +Args: + size (int...): a list, tuple, or :class:`torch.Size` of integers defining the + shape of the output tensor. + +Keyword args: + {dtype} + {device} + {requires_grad} + {layout} + {pin_memory} + +Example:: + + >>> tensor = torch.tensor((), dtype=torch.int32) + >>> tensor.new_ones((2, 3)) + tensor([[ 1, 1, 1], + [ 1, 1, 1]], dtype=torch.int32) + +""".format( + **new_common_args + ), +) + +add_docstr_all( + "new_zeros", + """ +new_zeros(size, *, dtype=None, device=None, requires_grad=False, layout=torch.strided, \ +pin_memory=False) -> Tensor +""" + + r""" + +Returns a Tensor of size :attr:`size` filled with ``0``. +By default, the returned Tensor has the same :class:`torch.dtype` and +:class:`torch.device` as this tensor. + +Args: + size (int...): a list, tuple, or :class:`torch.Size` of integers defining the + shape of the output tensor. + +Keyword args: + {dtype} + {device} + {requires_grad} + {layout} + {pin_memory} + +Example:: + + >>> tensor = torch.tensor((), dtype=torch.float64) + >>> tensor.new_zeros((2, 3)) + tensor([[ 0., 0., 0.], + [ 0., 0., 0.]], dtype=torch.float64) + +""".format( + **new_common_args + ), +) + +add_docstr_all( + "abs", + r""" +abs() -> Tensor + +See :func:`torch.abs` +""", +) + +add_docstr_all( + "abs_", + r""" +abs_() -> Tensor + +In-place version of :meth:`~Tensor.abs` +""", +) + +add_docstr_all( + "absolute", + r""" +absolute() -> Tensor + +Alias for :func:`abs` +""", +) + +add_docstr_all( + "absolute_", + r""" +absolute_() -> Tensor + +In-place version of :meth:`~Tensor.absolute` +Alias for :func:`abs_` +""", +) + +add_docstr_all( + "acos", + r""" +acos() -> Tensor + +See :func:`torch.acos` +""", +) + +add_docstr_all( + "acos_", + r""" +acos_() -> Tensor + +In-place version of :meth:`~Tensor.acos` +""", +) + +add_docstr_all( + "arccos", + r""" +arccos() -> Tensor + +See :func:`torch.arccos` +""", +) + +add_docstr_all( + "arccos_", + r""" +arccos_() -> Tensor + +In-place version of :meth:`~Tensor.arccos` +""", +) + +add_docstr_all( + "acosh", + r""" +acosh() -> Tensor + +See :func:`torch.acosh` +""", +) + +add_docstr_all( + "acosh_", + r""" +acosh_() -> Tensor + +In-place version of :meth:`~Tensor.acosh` +""", +) + +add_docstr_all( + "arccosh", + r""" +acosh() -> Tensor + +See :func:`torch.arccosh` +""", +) + +add_docstr_all( + "arccosh_", + r""" +acosh_() -> Tensor + +In-place version of :meth:`~Tensor.arccosh` +""", +) + +add_docstr_all( + "add", + r""" +add(other, *, alpha=1) -> Tensor + +Add a scalar or tensor to :attr:`self` tensor. If both :attr:`alpha` +and :attr:`other` are specified, each element of :attr:`other` is scaled by +:attr:`alpha` before being used. 
+ +When :attr:`other` is a tensor, the shape of :attr:`other` must be +:ref:`broadcastable ` with the shape of the underlying +tensor + +See :func:`torch.add` +""", +) + +add_docstr_all( + "add_", + r""" +add_(other, *, alpha=1) -> Tensor + +In-place version of :meth:`~Tensor.add` +""", +) + +add_docstr_all( + "addbmm", + r""" +addbmm(batch1, batch2, *, beta=1, alpha=1) -> Tensor + +See :func:`torch.addbmm` +""", +) + +add_docstr_all( + "addbmm_", + r""" +addbmm_(batch1, batch2, *, beta=1, alpha=1) -> Tensor + +In-place version of :meth:`~Tensor.addbmm` +""", +) + +add_docstr_all( + "addcdiv", + r""" +addcdiv(tensor1, tensor2, *, value=1) -> Tensor + +See :func:`torch.addcdiv` +""", +) + +add_docstr_all( + "addcdiv_", + r""" +addcdiv_(tensor1, tensor2, *, value=1) -> Tensor + +In-place version of :meth:`~Tensor.addcdiv` +""", +) + +add_docstr_all( + "addcmul", + r""" +addcmul(tensor1, tensor2, *, value=1) -> Tensor + +See :func:`torch.addcmul` +""", +) + +add_docstr_all( + "addcmul_", + r""" +addcmul_(tensor1, tensor2, *, value=1) -> Tensor + +In-place version of :meth:`~Tensor.addcmul` +""", +) + +add_docstr_all( + "addmm", + r""" +addmm(mat1, mat2, *, beta=1, alpha=1) -> Tensor + +See :func:`torch.addmm` +""", +) + +add_docstr_all( + "addmm_", + r""" +addmm_(mat1, mat2, *, beta=1, alpha=1) -> Tensor + +In-place version of :meth:`~Tensor.addmm` +""", +) + +add_docstr_all( + "addmv", + r""" +addmv(mat, vec, *, beta=1, alpha=1) -> Tensor + +See :func:`torch.addmv` +""", +) + +add_docstr_all( + "addmv_", + r""" +addmv_(mat, vec, *, beta=1, alpha=1) -> Tensor + +In-place version of :meth:`~Tensor.addmv` +""", +) + +add_docstr_all( + "sspaddmm", + r""" +sspaddmm(mat1, mat2, *, beta=1, alpha=1) -> Tensor + +See :func:`torch.sspaddmm` +""", +) + +add_docstr_all( + "smm", + r""" +smm(mat) -> Tensor + +See :func:`torch.smm` +""", +) + +add_docstr_all( + "addr", + r""" +addr(vec1, vec2, *, beta=1, alpha=1) -> Tensor + +See :func:`torch.addr` +""", +) + +add_docstr_all( + "addr_", + r""" +addr_(vec1, vec2, *, beta=1, alpha=1) -> Tensor + +In-place version of :meth:`~Tensor.addr` +""", +) + +add_docstr_all( + "align_as", + r""" +align_as(other) -> Tensor + +Permutes the dimensions of the :attr:`self` tensor to match the dimension order +in the :attr:`other` tensor, adding size-one dims for any new names. + +This operation is useful for explicit broadcasting by names (see examples). + +All of the dims of :attr:`self` must be named in order to use this method. +The resulting tensor is a view on the original tensor. + +All dimension names of :attr:`self` must be present in ``other.names``. +:attr:`other` may contain named dimensions that are not in ``self.names``; +the output tensor has a size-one dimension for each of those new names. + +To align a tensor to a specific order, use :meth:`~Tensor.align_to`. 
+ +Examples:: + + # Example 1: Applying a mask + >>> mask = torch.randint(2, [127, 128], dtype=torch.bool).refine_names('W', 'H') + >>> imgs = torch.randn(32, 128, 127, 3, names=('N', 'H', 'W', 'C')) + >>> imgs.masked_fill_(mask.align_as(imgs), 0) + + + # Example 2: Applying a per-channel-scale + >>> def scale_channels(input, scale): + >>> scale = scale.refine_names('C') + >>> return input * scale.align_as(input) + + >>> num_channels = 3 + >>> scale = torch.randn(num_channels, names=('C',)) + >>> imgs = torch.rand(32, 128, 128, num_channels, names=('N', 'H', 'W', 'C')) + >>> more_imgs = torch.rand(32, num_channels, 128, 128, names=('N', 'C', 'H', 'W')) + >>> videos = torch.randn(3, num_channels, 128, 128, 128, names=('N', 'C', 'H', 'W', 'D')) + + # scale_channels is agnostic to the dimension order of the input + >>> scale_channels(imgs, scale) + >>> scale_channels(more_imgs, scale) + >>> scale_channels(videos, scale) + +.. warning:: + The named tensor API is experimental and subject to change. + +""", +) + +add_docstr_all( + "all", + r""" +all(dim=None, keepdim=False) -> Tensor + +See :func:`torch.all` +""", +) + +add_docstr_all( + "allclose", + r""" +allclose(other, rtol=1e-05, atol=1e-08, equal_nan=False) -> Tensor + +See :func:`torch.allclose` +""", +) + +add_docstr_all( + "angle", + r""" +angle() -> Tensor + +See :func:`torch.angle` +""", +) + +add_docstr_all( + "any", + r""" +any(dim=None, keepdim=False) -> Tensor + +See :func:`torch.any` +""", +) + +add_docstr_all( + "apply_", + r""" +apply_(callable) -> Tensor + +Applies the function :attr:`callable` to each element in the tensor, replacing +each element with the value returned by :attr:`callable`. + +.. note:: + + This function only works with CPU tensors and should not be used in code + sections that require high performance. 
+""", +) + +add_docstr_all( + "asin", + r""" +asin() -> Tensor + +See :func:`torch.asin` +""", +) + +add_docstr_all( + "asin_", + r""" +asin_() -> Tensor + +In-place version of :meth:`~Tensor.asin` +""", +) + +add_docstr_all( + "arcsin", + r""" +arcsin() -> Tensor + +See :func:`torch.arcsin` +""", +) + +add_docstr_all( + "arcsin_", + r""" +arcsin_() -> Tensor + +In-place version of :meth:`~Tensor.arcsin` +""", +) + +add_docstr_all( + "asinh", + r""" +asinh() -> Tensor + +See :func:`torch.asinh` +""", +) + +add_docstr_all( + "asinh_", + r""" +asinh_() -> Tensor + +In-place version of :meth:`~Tensor.asinh` +""", +) + +add_docstr_all( + "arcsinh", + r""" +arcsinh() -> Tensor + +See :func:`torch.arcsinh` +""", +) + +add_docstr_all( + "arcsinh_", + r""" +arcsinh_() -> Tensor + +In-place version of :meth:`~Tensor.arcsinh` +""", +) + +add_docstr_all( + "as_strided", + r""" +as_strided(size, stride, storage_offset=None) -> Tensor + +See :func:`torch.as_strided` +""", +) + +add_docstr_all( + "as_strided_", + r""" +as_strided_(size, stride, storage_offset=None) -> Tensor + +In-place version of :meth:`~Tensor.as_strided` +""", +) + +add_docstr_all( + "atan", + r""" +atan() -> Tensor + +See :func:`torch.atan` +""", +) + +add_docstr_all( + "atan_", + r""" +atan_() -> Tensor + +In-place version of :meth:`~Tensor.atan` +""", +) + +add_docstr_all( + "arctan", + r""" +arctan() -> Tensor + +See :func:`torch.arctan` +""", +) + +add_docstr_all( + "arctan_", + r""" +arctan_() -> Tensor + +In-place version of :meth:`~Tensor.arctan` +""", +) + +add_docstr_all( + "atan2", + r""" +atan2(other) -> Tensor + +See :func:`torch.atan2` +""", +) + +add_docstr_all( + "atan2_", + r""" +atan2_(other) -> Tensor + +In-place version of :meth:`~Tensor.atan2` +""", +) + +add_docstr_all( + "arctan2", + r""" +arctan2(other) -> Tensor + +See :func:`torch.arctan2` +""", +) + +add_docstr_all( + "arctan2_", + r""" +atan2_(other) -> Tensor + +In-place version of :meth:`~Tensor.arctan2` +""", +) + +add_docstr_all( + "atanh", + r""" +atanh() -> Tensor + +See :func:`torch.atanh` +""", +) + +add_docstr_all( + "atanh_", + r""" +atanh_(other) -> Tensor + +In-place version of :meth:`~Tensor.atanh` +""", +) + +add_docstr_all( + "arctanh", + r""" +arctanh() -> Tensor + +See :func:`torch.arctanh` +""", +) + +add_docstr_all( + "arctanh_", + r""" +arctanh_(other) -> Tensor + +In-place version of :meth:`~Tensor.arctanh` +""", +) + +add_docstr_all( + "baddbmm", + r""" +baddbmm(batch1, batch2, *, beta=1, alpha=1) -> Tensor + +See :func:`torch.baddbmm` +""", +) + +add_docstr_all( + "baddbmm_", + r""" +baddbmm_(batch1, batch2, *, beta=1, alpha=1) -> Tensor + +In-place version of :meth:`~Tensor.baddbmm` +""", +) + +add_docstr_all( + "bernoulli", + r""" +bernoulli(*, generator=None) -> Tensor + +Returns a result tensor where each :math:`\texttt{result[i]}` is independently +sampled from :math:`\text{Bernoulli}(\texttt{self[i]})`. :attr:`self` must have +floating point ``dtype``, and the result will have the same ``dtype``. + +See :func:`torch.bernoulli` +""", +) + +add_docstr_all( + "bernoulli_", + r""" +bernoulli_(p=0.5, *, generator=None) -> Tensor + +Fills each location of :attr:`self` with an independent sample from +:math:`\text{Bernoulli}(\texttt{p})`. :attr:`self` can have integral +``dtype``. + +:attr:`p` should either be a scalar or tensor containing probabilities to be +used for drawing the binary random number. 
+ +If it is a tensor, the :math:`\text{i}^{th}` element of :attr:`self` tensor +will be set to a value sampled from +:math:`\text{Bernoulli}(\texttt{p\_tensor[i]})`. In this case `p` must have +floating point ``dtype``. + +See also :meth:`~Tensor.bernoulli` and :func:`torch.bernoulli` +""", +) + +add_docstr_all( + "bincount", + r""" +bincount(weights=None, minlength=0) -> Tensor + +See :func:`torch.bincount` +""", +) + +add_docstr_all( + "bitwise_not", + r""" +bitwise_not() -> Tensor + +See :func:`torch.bitwise_not` +""", +) + +add_docstr_all( + "bitwise_not_", + r""" +bitwise_not_() -> Tensor + +In-place version of :meth:`~Tensor.bitwise_not` +""", +) + +add_docstr_all( + "bitwise_and", + r""" +bitwise_and() -> Tensor + +See :func:`torch.bitwise_and` +""", +) + +add_docstr_all( + "bitwise_and_", + r""" +bitwise_and_() -> Tensor + +In-place version of :meth:`~Tensor.bitwise_and` +""", +) + +add_docstr_all( + "bitwise_or", + r""" +bitwise_or() -> Tensor + +See :func:`torch.bitwise_or` +""", +) + +add_docstr_all( + "bitwise_or_", + r""" +bitwise_or_() -> Tensor + +In-place version of :meth:`~Tensor.bitwise_or` +""", +) + +add_docstr_all( + "bitwise_xor", + r""" +bitwise_xor() -> Tensor + +See :func:`torch.bitwise_xor` +""", +) + +add_docstr_all( + "bitwise_xor_", + r""" +bitwise_xor_() -> Tensor + +In-place version of :meth:`~Tensor.bitwise_xor` +""", +) + +add_docstr_all( + "bitwise_left_shift", + r""" +bitwise_left_shift(other) -> Tensor + +See :func:`torch.bitwise_left_shift` +""", +) + +add_docstr_all( + "bitwise_left_shift_", + r""" +bitwise_left_shift_(other) -> Tensor + +In-place version of :meth:`~Tensor.bitwise_left_shift` +""", +) + +add_docstr_all( + "bitwise_right_shift", + r""" +bitwise_right_shift(other) -> Tensor + +See :func:`torch.bitwise_right_shift` +""", +) + +add_docstr_all( + "bitwise_right_shift_", + r""" +bitwise_right_shift_(other) -> Tensor + +In-place version of :meth:`~Tensor.bitwise_right_shift` +""", +) + +add_docstr_all( + "broadcast_to", + r""" +broadcast_to(shape) -> Tensor + +See :func:`torch.broadcast_to`. +""", +) + +add_docstr_all( + "logical_and", + r""" +logical_and() -> Tensor + +See :func:`torch.logical_and` +""", +) + +add_docstr_all( + "logical_and_", + r""" +logical_and_() -> Tensor + +In-place version of :meth:`~Tensor.logical_and` +""", +) + +add_docstr_all( + "logical_not", + r""" +logical_not() -> Tensor + +See :func:`torch.logical_not` +""", +) + +add_docstr_all( + "logical_not_", + r""" +logical_not_() -> Tensor + +In-place version of :meth:`~Tensor.logical_not` +""", +) + +add_docstr_all( + "logical_or", + r""" +logical_or() -> Tensor + +See :func:`torch.logical_or` +""", +) + +add_docstr_all( + "logical_or_", + r""" +logical_or_() -> Tensor + +In-place version of :meth:`~Tensor.logical_or` +""", +) + +add_docstr_all( + "logical_xor", + r""" +logical_xor() -> Tensor + +See :func:`torch.logical_xor` +""", +) + +add_docstr_all( + "logical_xor_", + r""" +logical_xor_() -> Tensor + +In-place version of :meth:`~Tensor.logical_xor` +""", +) + +add_docstr_all( + "bmm", + r""" +bmm(batch2) -> Tensor + +See :func:`torch.bmm` +""", +) + +add_docstr_all( + "cauchy_", + r""" +cauchy_(median=0, sigma=1, *, generator=None) -> Tensor + +Fills the tensor with numbers drawn from the Cauchy distribution: + +.. math:: + + f(x) = \dfrac{1}{\pi} \dfrac{\sigma}{(x - \text{median})^2 + \sigma^2} + +.. note:: + Sigma (:math:`\sigma`) is used to denote the scale parameter in Cauchy distribution. 
+""", +) + +add_docstr_all( + "ceil", + r""" +ceil() -> Tensor + +See :func:`torch.ceil` +""", +) + +add_docstr_all( + "ceil_", + r""" +ceil_() -> Tensor + +In-place version of :meth:`~Tensor.ceil` +""", +) + +add_docstr_all( + "cholesky", + r""" +cholesky(upper=False) -> Tensor + +See :func:`torch.cholesky` +""", +) + +add_docstr_all( + "cholesky_solve", + r""" +cholesky_solve(input2, upper=False) -> Tensor + +See :func:`torch.cholesky_solve` +""", +) + +add_docstr_all( + "cholesky_inverse", + r""" +cholesky_inverse(upper=False) -> Tensor + +See :func:`torch.cholesky_inverse` +""", +) + +add_docstr_all( + "clamp", + r""" +clamp(min=None, max=None) -> Tensor + +See :func:`torch.clamp` +""", +) + +add_docstr_all( + "clamp_", + r""" +clamp_(min=None, max=None) -> Tensor + +In-place version of :meth:`~Tensor.clamp` +""", +) + +add_docstr_all( + "clip", + r""" +clip(min=None, max=None) -> Tensor + +Alias for :meth:`~Tensor.clamp`. +""", +) + +add_docstr_all( + "clip_", + r""" +clip_(min=None, max=None) -> Tensor + +Alias for :meth:`~Tensor.clamp_`. +""", +) + +add_docstr_all( + "clone", + r""" +clone(*, memory_format=torch.preserve_format) -> Tensor + +See :func:`torch.clone` +""".format( + **common_args + ), +) + +add_docstr_all( + "coalesce", + r""" +coalesce() -> Tensor + +Returns a coalesced copy of :attr:`self` if :attr:`self` is an +:ref:`uncoalesced tensor `. + +Returns :attr:`self` if :attr:`self` is a coalesced tensor. + +.. warning:: + Throws an error if :attr:`self` is not a sparse COO tensor. +""", +) + +add_docstr_all( + "contiguous", + r""" +contiguous(memory_format=torch.contiguous_format) -> Tensor + +Returns a contiguous in memory tensor containing the same data as :attr:`self` tensor. If +:attr:`self` tensor is already in the specified memory format, this function returns the +:attr:`self` tensor. + +Args: + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.contiguous_format``. +""", +) + +add_docstr_all( + "copy_", + r""" +copy_(src, non_blocking=False) -> Tensor + +Copies the elements from :attr:`src` into :attr:`self` tensor and returns +:attr:`self`. + +The :attr:`src` tensor must be :ref:`broadcastable ` +with the :attr:`self` tensor. It may be of a different data type or reside on a +different device. + +Args: + src (Tensor): the source tensor to copy from + non_blocking (bool): if ``True`` and this copy is between CPU and GPU, + the copy may occur asynchronously with respect to the host. For other + cases, this argument has no effect. 
+""", +) + +add_docstr_all( + "conj", + r""" +conj() -> Tensor + +See :func:`torch.conj` +""", +) + +add_docstr_all( + "conj_physical", + r""" +conj_physical() -> Tensor + +See :func:`torch.conj_physical` +""", +) + +add_docstr_all( + "conj_physical_", + r""" +conj_physical_() -> Tensor + +In-place version of :meth:`~Tensor.conj_physical` +""", +) + +add_docstr_all( + "resolve_conj", + r""" +resolve_conj() -> Tensor + +See :func:`torch.resolve_conj` +""", +) + +add_docstr_all( + "resolve_neg", + r""" +resolve_neg() -> Tensor + +See :func:`torch.resolve_neg` +""", +) + +add_docstr_all( + "copysign", + r""" +copysign(other) -> Tensor + +See :func:`torch.copysign` +""", +) + +add_docstr_all( + "copysign_", + r""" +copysign_(other) -> Tensor + +In-place version of :meth:`~Tensor.copysign` +""", +) + +add_docstr_all( + "cos", + r""" +cos() -> Tensor + +See :func:`torch.cos` +""", +) + +add_docstr_all( + "cos_", + r""" +cos_() -> Tensor + +In-place version of :meth:`~Tensor.cos` +""", +) + +add_docstr_all( + "cosh", + r""" +cosh() -> Tensor + +See :func:`torch.cosh` +""", +) + +add_docstr_all( + "cosh_", + r""" +cosh_() -> Tensor + +In-place version of :meth:`~Tensor.cosh` +""", +) + +add_docstr_all( + "cpu", + r""" +cpu(memory_format=torch.preserve_format) -> Tensor + +Returns a copy of this object in CPU memory. + +If this object is already in CPU memory and on the correct device, +then no copy is performed and the original object is returned. + +Args: + {memory_format} + +""".format( + **common_args + ), +) + +add_docstr_all( + "count_nonzero", + r""" +count_nonzero(dim=None) -> Tensor + +See :func:`torch.count_nonzero` +""", +) + +add_docstr_all( + "cov", + r""" +cov(*, correction=1, fweights=None, aweights=None) -> Tensor + +See :func:`torch.cov` +""", +) + +add_docstr_all( + "corrcoef", + r""" +corrcoef() -> Tensor + +See :func:`torch.corrcoef` +""", +) + +add_docstr_all( + "cross", + r""" +cross(other, dim=None) -> Tensor + +See :func:`torch.cross` +""", +) + +add_docstr_all( + "cuda", + r""" +cuda(device=None, non_blocking=False, memory_format=torch.preserve_format) -> Tensor + +Returns a copy of this object in CUDA memory. + +If this object is already in CUDA memory and on the correct device, +then no copy is performed and the original object is returned. + +Args: + device (:class:`torch.device`): The destination GPU device. + Defaults to the current CUDA device. + non_blocking (bool): If ``True`` and the source is in pinned memory, + the copy will be asynchronous with respect to the host. + Otherwise, the argument has no effect. Default: ``False``. + {memory_format} +""".format( + **common_args + ), +) + +add_docstr_all( + "ipu", + r""" +ipu(device=None, non_blocking=False, memory_format=torch.preserve_format) -> Tensor + +Returns a copy of this object in IPU memory. + +If this object is already in IPU memory and on the correct device, +then no copy is performed and the original object is returned. + +Args: + device (:class:`torch.device`): The destination IPU device. + Defaults to the current IPU device. + non_blocking (bool): If ``True`` and the source is in pinned memory, + the copy will be asynchronous with respect to the host. + Otherwise, the argument has no effect. Default: ``False``. + {memory_format} +""".format( + **common_args + ), +) + +add_docstr_all( + "xpu", + r""" +xpu(device=None, non_blocking=False, memory_format=torch.preserve_format) -> Tensor + +Returns a copy of this object in XPU memory. 
+ +If this object is already in XPU memory and on the correct device, +then no copy is performed and the original object is returned. + +Args: + device (:class:`torch.device`): The destination XPU device. + Defaults to the current XPU device. + non_blocking (bool): If ``True`` and the source is in pinned memory, + the copy will be asynchronous with respect to the host. + Otherwise, the argument has no effect. Default: ``False``. + {memory_format} +""".format( + **common_args + ), +) + +add_docstr_all( + "logcumsumexp", + r""" +logcumsumexp(dim) -> Tensor + +See :func:`torch.logcumsumexp` +""", +) + +add_docstr_all( + "cummax", + r""" +cummax(dim) -> (Tensor, Tensor) + +See :func:`torch.cummax` +""", +) + +add_docstr_all( + "cummin", + r""" +cummin(dim) -> (Tensor, Tensor) + +See :func:`torch.cummin` +""", +) + +add_docstr_all( + "cumprod", + r""" +cumprod(dim, dtype=None) -> Tensor + +See :func:`torch.cumprod` +""", +) + +add_docstr_all( + "cumprod_", + r""" +cumprod_(dim, dtype=None) -> Tensor + +In-place version of :meth:`~Tensor.cumprod` +""", +) + +add_docstr_all( + "cumsum", + r""" +cumsum(dim, dtype=None) -> Tensor + +See :func:`torch.cumsum` +""", +) + +add_docstr_all( + "cumsum_", + r""" +cumsum_(dim, dtype=None) -> Tensor + +In-place version of :meth:`~Tensor.cumsum` +""", +) + +add_docstr_all( + "data_ptr", + r""" +data_ptr() -> int + +Returns the address of the first element of :attr:`self` tensor. +""", +) + +add_docstr_all( + "dequantize", + r""" +dequantize() -> Tensor + +Given a quantized Tensor, dequantize it and return the dequantized float Tensor. +""", +) + +add_docstr_all( + "dense_dim", + r""" +dense_dim() -> int + +Return the number of dense dimensions in a :ref:`sparse tensor ` :attr:`self`. + +.. note:: + Returns ``len(self.shape)`` if :attr:`self` is not a sparse tensor. + +See also :meth:`Tensor.sparse_dim` and :ref:`hybrid tensors `. +""", +) + +add_docstr_all( + "diag", + r""" +diag(diagonal=0) -> Tensor + +See :func:`torch.diag` +""", +) + +add_docstr_all( + "diag_embed", + r""" +diag_embed(offset=0, dim1=-2, dim2=-1) -> Tensor + +See :func:`torch.diag_embed` +""", +) + +add_docstr_all( + "diagflat", + r""" +diagflat(offset=0) -> Tensor + +See :func:`torch.diagflat` +""", +) + +add_docstr_all( + "diagonal", + r""" +diagonal(offset=0, dim1=0, dim2=1) -> Tensor + +See :func:`torch.diagonal` +""", +) + +add_docstr_all( + "diagonal_scatter", + r""" +diagonal_scatter(src, offset=0, dim1=0, dim2=1) -> Tensor + +See :func:`torch.diagonal_scatter` +""", +) + +add_docstr_all( + "as_strided_scatter", + r""" +as_strided_scatter(src, size, stride, storage_offset=None) -> Tensor + +See :func:`torch.as_strided_scatter` +""", +) + +add_docstr_all( + "fill_diagonal_", + r""" +fill_diagonal_(fill_value, wrap=False) -> Tensor + +Fill the main diagonal of a tensor that has at least 2-dimensions. +When dims>2, all dimensions of input must be of equal length. +This function modifies the input tensor in-place, and returns the input tensor. + +Arguments: + fill_value (Scalar): the fill value + wrap (bool): the diagonal 'wrapped' after N columns for tall matrices. 
+ +Example:: + + >>> a = torch.zeros(3, 3) + >>> a.fill_diagonal_(5) + tensor([[5., 0., 0.], + [0., 5., 0.], + [0., 0., 5.]]) + >>> b = torch.zeros(7, 3) + >>> b.fill_diagonal_(5) + tensor([[5., 0., 0.], + [0., 5., 0.], + [0., 0., 5.], + [0., 0., 0.], + [0., 0., 0.], + [0., 0., 0.], + [0., 0., 0.]]) + >>> c = torch.zeros(7, 3) + >>> c.fill_diagonal_(5, wrap=True) + tensor([[5., 0., 0.], + [0., 5., 0.], + [0., 0., 5.], + [0., 0., 0.], + [5., 0., 0.], + [0., 5., 0.], + [0., 0., 5.]]) + +""", +) + +add_docstr_all( + "floor_divide", + r""" +floor_divide(value) -> Tensor + +See :func:`torch.floor_divide` +""", +) + +add_docstr_all( + "floor_divide_", + r""" +floor_divide_(value) -> Tensor + +In-place version of :meth:`~Tensor.floor_divide` +""", +) + +add_docstr_all( + "diff", + r""" +diff(n=1, dim=-1, prepend=None, append=None) -> Tensor + +See :func:`torch.diff` +""", +) + +add_docstr_all( + "digamma", + r""" +digamma() -> Tensor + +See :func:`torch.digamma` +""", +) + +add_docstr_all( + "digamma_", + r""" +digamma_() -> Tensor + +In-place version of :meth:`~Tensor.digamma` +""", +) + +add_docstr_all( + "dim", + r""" +dim() -> int + +Returns the number of dimensions of :attr:`self` tensor. +""", +) + +add_docstr_all( + "dist", + r""" +dist(other, p=2) -> Tensor + +See :func:`torch.dist` +""", +) + +add_docstr_all( + "div", + r""" +div(value, *, rounding_mode=None) -> Tensor + +See :func:`torch.div` +""", +) + +add_docstr_all( + "div_", + r""" +div_(value, *, rounding_mode=None) -> Tensor + +In-place version of :meth:`~Tensor.div` +""", +) + +add_docstr_all( + "divide", + r""" +divide(value, *, rounding_mode=None) -> Tensor + +See :func:`torch.divide` +""", +) + +add_docstr_all( + "divide_", + r""" +divide_(value, *, rounding_mode=None) -> Tensor + +In-place version of :meth:`~Tensor.divide` +""", +) + +add_docstr_all( + "dot", + r""" +dot(other) -> Tensor + +See :func:`torch.dot` +""", +) + +add_docstr_all( + "element_size", + r""" +element_size() -> int + +Returns the size in bytes of an individual element. 
+ +Example:: + + >>> torch.tensor([]).element_size() + 4 + >>> torch.tensor([], dtype=torch.uint8).element_size() + 1 + +""", +) + +add_docstr_all( + "eq", + r""" +eq(other) -> Tensor + +See :func:`torch.eq` +""", +) + +add_docstr_all( + "eq_", + r""" +eq_(other) -> Tensor + +In-place version of :meth:`~Tensor.eq` +""", +) + +add_docstr_all( + "equal", + r""" +equal(other) -> bool + +See :func:`torch.equal` +""", +) + +add_docstr_all( + "erf", + r""" +erf() -> Tensor + +See :func:`torch.erf` +""", +) + +add_docstr_all( + "erf_", + r""" +erf_() -> Tensor + +In-place version of :meth:`~Tensor.erf` +""", +) + +add_docstr_all( + "erfc", + r""" +erfc() -> Tensor + +See :func:`torch.erfc` +""", +) + +add_docstr_all( + "erfc_", + r""" +erfc_() -> Tensor + +In-place version of :meth:`~Tensor.erfc` +""", +) + +add_docstr_all( + "erfinv", + r""" +erfinv() -> Tensor + +See :func:`torch.erfinv` +""", +) + +add_docstr_all( + "erfinv_", + r""" +erfinv_() -> Tensor + +In-place version of :meth:`~Tensor.erfinv` +""", +) + +add_docstr_all( + "exp", + r""" +exp() -> Tensor + +See :func:`torch.exp` +""", +) + +add_docstr_all( + "exp_", + r""" +exp_() -> Tensor + +In-place version of :meth:`~Tensor.exp` +""", +) + +add_docstr_all( + "exp2", + r""" +exp2() -> Tensor + +See :func:`torch.exp2` +""", +) + +add_docstr_all( + "exp2_", + r""" +exp2_() -> Tensor + +In-place version of :meth:`~Tensor.exp2` +""", +) + +add_docstr_all( + "expm1", + r""" +expm1() -> Tensor + +See :func:`torch.expm1` +""", +) + +add_docstr_all( + "expm1_", + r""" +expm1_() -> Tensor + +In-place version of :meth:`~Tensor.expm1` +""", +) + +add_docstr_all( + "exponential_", + r""" +exponential_(lambd=1, *, generator=None) -> Tensor + +Fills :attr:`self` tensor with elements drawn from the PDF (probability density function): + +.. math:: + + f(x) = \lambda e^{-\lambda x}, x > 0 + +.. note:: + In probability theory, exponential distribution is supported on interval [0, :math:`\inf`) (i.e., :math:`x >= 0`) + implying that zero can be sampled from the exponential distribution. + However, :func:`torch.Tensor.exponential_` does not sample zero, + which means that its actual support is the interval (0, :math:`\inf`). + + Note that :func:`torch.distributions.exponential.Exponential` is supported on the interval [0, :math:`\inf`) and can sample zero. +""", +) + +add_docstr_all( + "fill_", + r""" +fill_(value) -> Tensor + +Fills :attr:`self` tensor with the specified value. 
+""", +) + +add_docstr_all( + "floor", + r""" +floor() -> Tensor + +See :func:`torch.floor` +""", +) + +add_docstr_all( + "flip", + r""" +flip(dims) -> Tensor + +See :func:`torch.flip` +""", +) + +add_docstr_all( + "fliplr", + r""" +fliplr() -> Tensor + +See :func:`torch.fliplr` +""", +) + +add_docstr_all( + "flipud", + r""" +flipud() -> Tensor + +See :func:`torch.flipud` +""", +) + +add_docstr_all( + "roll", + r""" +roll(shifts, dims) -> Tensor + +See :func:`torch.roll` +""", +) + +add_docstr_all( + "floor_", + r""" +floor_() -> Tensor + +In-place version of :meth:`~Tensor.floor` +""", +) + +add_docstr_all( + "fmod", + r""" +fmod(divisor) -> Tensor + +See :func:`torch.fmod` +""", +) + +add_docstr_all( + "fmod_", + r""" +fmod_(divisor) -> Tensor + +In-place version of :meth:`~Tensor.fmod` +""", +) + +add_docstr_all( + "frac", + r""" +frac() -> Tensor + +See :func:`torch.frac` +""", +) + +add_docstr_all( + "frac_", + r""" +frac_() -> Tensor + +In-place version of :meth:`~Tensor.frac` +""", +) + +add_docstr_all( + "frexp", + r""" +frexp(input) -> (Tensor mantissa, Tensor exponent) + +See :func:`torch.frexp` +""", +) + +add_docstr_all( + "flatten", + r""" +flatten(start_dim=0, end_dim=-1) -> Tensor + +See :func:`torch.flatten` +""", +) + +add_docstr_all( + "gather", + r""" +gather(dim, index) -> Tensor + +See :func:`torch.gather` +""", +) + +add_docstr_all( + "gcd", + r""" +gcd(other) -> Tensor + +See :func:`torch.gcd` +""", +) + +add_docstr_all( + "gcd_", + r""" +gcd_(other) -> Tensor + +In-place version of :meth:`~Tensor.gcd` +""", +) + +add_docstr_all( + "ge", + r""" +ge(other) -> Tensor + +See :func:`torch.ge`. +""", +) + +add_docstr_all( + "ge_", + r""" +ge_(other) -> Tensor + +In-place version of :meth:`~Tensor.ge`. +""", +) + +add_docstr_all( + "greater_equal", + r""" +greater_equal(other) -> Tensor + +See :func:`torch.greater_equal`. +""", +) + +add_docstr_all( + "greater_equal_", + r""" +greater_equal_(other) -> Tensor + +In-place version of :meth:`~Tensor.greater_equal`. +""", +) + +add_docstr_all( + "geometric_", + r""" +geometric_(p, *, generator=None) -> Tensor + +Fills :attr:`self` tensor with elements drawn from the geometric distribution: + +.. math:: + + P(X=k) = (1 - p)^{k - 1} p, k = 1, 2, ... + +.. note:: + :func:`torch.Tensor.geometric_` `k`-th trial is the first success hence draws samples in :math:`\{1, 2, \ldots\}`, whereas + :func:`torch.distributions.geometric.Geometric` :math:`(k+1)`-th trial is the first success + hence draws samples in :math:`\{0, 1, \ldots\}`. +""", +) + +add_docstr_all( + "geqrf", + r""" +geqrf() -> (Tensor, Tensor) + +See :func:`torch.geqrf` +""", +) + +add_docstr_all( + "ger", + r""" +ger(vec2) -> Tensor + +See :func:`torch.ger` +""", +) + +add_docstr_all( + "inner", + r""" +inner(other) -> Tensor + +See :func:`torch.inner`. +""", +) + +add_docstr_all( + "outer", + r""" +outer(vec2) -> Tensor + +See :func:`torch.outer`. 
+""", +) + +add_docstr_all( + "hypot", + r""" +hypot(other) -> Tensor + +See :func:`torch.hypot` +""", +) + +add_docstr_all( + "hypot_", + r""" +hypot_(other) -> Tensor + +In-place version of :meth:`~Tensor.hypot` +""", +) + +add_docstr_all( + "i0", + r""" +i0() -> Tensor + +See :func:`torch.i0` +""", +) + +add_docstr_all( + "i0_", + r""" +i0_() -> Tensor + +In-place version of :meth:`~Tensor.i0` +""", +) + +add_docstr_all( + "igamma", + r""" +igamma(other) -> Tensor + +See :func:`torch.igamma` +""", +) + +add_docstr_all( + "igamma_", + r""" +igamma_(other) -> Tensor + +In-place version of :meth:`~Tensor.igamma` +""", +) + +add_docstr_all( + "igammac", + r""" +igammac(other) -> Tensor +See :func:`torch.igammac` +""", +) + +add_docstr_all( + "igammac_", + r""" +igammac_(other) -> Tensor +In-place version of :meth:`~Tensor.igammac` +""", +) + +add_docstr_all( + "indices", + r""" +indices() -> Tensor + +Return the indices tensor of a :ref:`sparse COO tensor `. + +.. warning:: + Throws an error if :attr:`self` is not a sparse COO tensor. + +See also :meth:`Tensor.values`. + +.. note:: + This method can only be called on a coalesced sparse tensor. See + :meth:`Tensor.coalesce` for details. +""", +) + +add_docstr_all( + "get_device", + r""" +get_device() -> Device ordinal (Integer) + +For CUDA tensors, this function returns the device ordinal of the GPU on which the tensor resides. +For CPU tensors, this function returns `-1`. + +Example:: + + >>> x = torch.randn(3, 4, 5, device='cuda:0') + >>> x.get_device() + 0 + >>> x.cpu().get_device() + -1 +""", +) + +add_docstr_all( + "values", + r""" +values() -> Tensor + +Return the values tensor of a :ref:`sparse COO tensor `. + +.. warning:: + Throws an error if :attr:`self` is not a sparse COO tensor. + +See also :meth:`Tensor.indices`. + +.. note:: + This method can only be called on a coalesced sparse tensor. See + :meth:`Tensor.coalesce` for details. +""", +) + +add_docstr_all( + "gt", + r""" +gt(other) -> Tensor + +See :func:`torch.gt`. +""", +) + +add_docstr_all( + "gt_", + r""" +gt_(other) -> Tensor + +In-place version of :meth:`~Tensor.gt`. +""", +) + +add_docstr_all( + "greater", + r""" +greater(other) -> Tensor + +See :func:`torch.greater`. +""", +) + +add_docstr_all( + "greater_", + r""" +greater_(other) -> Tensor + +In-place version of :meth:`~Tensor.greater`. +""", +) + +add_docstr_all( + "has_names", + r""" +Is ``True`` if any of this tensor's dimensions are named. Otherwise, is ``False``. +""", +) + +add_docstr_all( + "hardshrink", + r""" +hardshrink(lambd=0.5) -> Tensor + +See :func:`torch.nn.functional.hardshrink` +""", +) + +add_docstr_all( + "heaviside", + r""" +heaviside(values) -> Tensor + +See :func:`torch.heaviside` +""", +) + +add_docstr_all( + "heaviside_", + r""" +heaviside_(values) -> Tensor + +In-place version of :meth:`~Tensor.heaviside` +""", +) + +add_docstr_all( + "histc", + r""" +histc(bins=100, min=0, max=0) -> Tensor + +See :func:`torch.histc` +""", +) + +add_docstr_all( + "histogram", + r""" +histogram(input, bins, *, range=None, weight=None, density=False) -> (Tensor, Tensor) + +See :func:`torch.histogram` +""", +) + +add_docstr_all( + "index_add_", + r""" +index_add_(dim, index, source, *, alpha=1) -> Tensor + +Accumulate the elements of :attr:`alpha` times ``source`` into the :attr:`self` +tensor by adding to the indices in the order given in :attr:`index`. For example, +if ``dim == 0``, ``index[i] == j``, and ``alpha=-1``, then the ``i``\ th row of +``source`` is subtracted from the ``j``\ th row of :attr:`self`. 
+ +The :attr:`dim`\ th dimension of ``source`` must have the same size as the +length of :attr:`index` (which must be a vector), and all other dimensions must +match :attr:`self`, or an error will be raised. + +For a 3-D tensor the output is given as:: + + self[index[i], :, :] += alpha * src[i, :, :] # if dim == 0 + self[:, index[i], :] += alpha * src[:, i, :] # if dim == 1 + self[:, :, index[i]] += alpha * src[:, :, i] # if dim == 2 + +Note: + {forward_reproducibility_note} + +Args: + dim (int): dimension along which to index + index (Tensor): indices of ``source`` to select from, + should have dtype either `torch.int64` or `torch.int32` + source (Tensor): the tensor containing values to add + +Keyword args: + alpha (Number): the scalar multiplier for ``source`` + +Example:: + + >>> x = torch.ones(5, 3) + >>> t = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float) + >>> index = torch.tensor([0, 4, 2]) + >>> x.index_add_(0, index, t) + tensor([[ 2., 3., 4.], + [ 1., 1., 1.], + [ 8., 9., 10.], + [ 1., 1., 1.], + [ 5., 6., 7.]]) + >>> x.index_add_(0, index, t, alpha=-1) + tensor([[ 1., 1., 1.], + [ 1., 1., 1.], + [ 1., 1., 1.], + [ 1., 1., 1.], + [ 1., 1., 1.]]) +""".format( + **reproducibility_notes + ), +) + +add_docstr_all( + "index_copy_", + r""" +index_copy_(dim, index, tensor) -> Tensor + +Copies the elements of :attr:`tensor` into the :attr:`self` tensor by selecting +the indices in the order given in :attr:`index`. For example, if ``dim == 0`` +and ``index[i] == j``, then the ``i``\ th row of :attr:`tensor` is copied to the +``j``\ th row of :attr:`self`. + +The :attr:`dim`\ th dimension of :attr:`tensor` must have the same size as the +length of :attr:`index` (which must be a vector), and all other dimensions must +match :attr:`self`, or an error will be raised. + +.. note:: + If :attr:`index` contains duplicate entries, multiple elements from + :attr:`tensor` will be copied to the same index of :attr:`self`. The result + is nondeterministic since it depends on which copy occurs last. + +Args: + dim (int): dimension along which to index + index (LongTensor): indices of :attr:`tensor` to select from + tensor (Tensor): the tensor containing values to copy + +Example:: + + >>> x = torch.zeros(5, 3) + >>> t = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float) + >>> index = torch.tensor([0, 4, 2]) + >>> x.index_copy_(0, index, t) + tensor([[ 1., 2., 3.], + [ 0., 0., 0.], + [ 7., 8., 9.], + [ 0., 0., 0.], + [ 4., 5., 6.]]) +""", +) + +add_docstr_all( + "index_fill_", + r""" +index_fill_(dim, index, value) -> Tensor + +Fills the elements of the :attr:`self` tensor with value :attr:`value` by +selecting the indices in the order given in :attr:`index`. + +Args: + dim (int): dimension along which to index + index (LongTensor): indices of :attr:`self` tensor to fill in + value (float): the value to fill with + +Example:: + >>> x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float) + >>> index = torch.tensor([0, 2]) + >>> x.index_fill_(1, index, -1) + tensor([[-1., 2., -1.], + [-1., 5., -1.], + [-1., 8., -1.]]) +""", +) + +add_docstr_all( + "index_put_", + r""" +index_put_(indices, values, accumulate=False) -> Tensor + +Puts values from the tensor :attr:`values` into the tensor :attr:`self` using +the indices specified in :attr:`indices` (which is a tuple of Tensors). The +expression ``tensor.index_put_(indices, values)`` is equivalent to +``tensor[indices] = values``. Returns :attr:`self`. 
+ +If :attr:`accumulate` is ``True``, the elements in :attr:`values` are added to +:attr:`self`. If accumulate is ``False``, the behavior is undefined if indices +contain duplicate elements. + +Args: + indices (tuple of LongTensor): tensors used to index into `self`. + values (Tensor): tensor of same dtype as `self`. + accumulate (bool): whether to accumulate into self +""", +) + +add_docstr_all( + "index_put", + r""" +index_put(indices, values, accumulate=False) -> Tensor + +Out-place version of :meth:`~Tensor.index_put_`. +""", +) + +add_docstr_all( + "index_reduce_", + r""" +index_reduce_(dim, index, source, reduce, *, include_self=True) -> Tensor + +Accumulate the elements of ``source`` into the :attr:`self` +tensor by accumulating to the indices in the order given in :attr:`index` +using the reduction given by the ``reduce`` argument. For example, if ``dim == 0``, +``index[i] == j``, ``reduce == prod`` and ``include_self == True`` then the ``i``\ th +row of ``source`` is multiplied by the ``j``\ th row of :attr:`self`. If +:obj:`include_self="True"`, the values in the :attr:`self` tensor are included +in the reduction, otherwise, rows in the :attr:`self` tensor that are accumulated +to are treated as if they were filled with the reduction identites. + +The :attr:`dim`\ th dimension of ``source`` must have the same size as the +length of :attr:`index` (which must be a vector), and all other dimensions must +match :attr:`self`, or an error will be raised. + +For a 3-D tensor with :obj:`reduce="prod"` and :obj:`include_self=True` the +output is given as:: + + self[index[i], :, :] *= src[i, :, :] # if dim == 0 + self[:, index[i], :] *= src[:, i, :] # if dim == 1 + self[:, :, index[i]] *= src[:, :, i] # if dim == 2 + +Note: + {forward_reproducibility_note} + +.. note:: + + This function only supports floating point tensors. + +.. warning:: + + This function is in beta and may change in the near future. + +Args: + dim (int): dimension along which to index + index (Tensor): indices of ``source`` to select from, + should have dtype either `torch.int64` or `torch.int32` + source (FloatTensor): the tensor containing values to accumulate + reduce (str): the reduction operation to apply + (:obj:`"prod"`, :obj:`"mean"`, :obj:`"amax"`, :obj:`"amin"`) + +Keyword args: + include_self (bool): whether the elements from the ``self`` tensor are + included in the reduction + +Example:: + + >>> x = torch.empty(5, 3).fill_(2) + >>> t = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]], dtype=torch.float) + >>> index = torch.tensor([0, 4, 2, 0]) + >>> x.index_reduce_(0, index, t, 'prod') + tensor([[20., 44., 72.], + [ 2., 2., 2.], + [14., 16., 18.], + [ 2., 2., 2.], + [ 8., 10., 12.]]) + >>> x = torch.empty(5, 3).fill_(2) + >>> x.index_reduce_(0, index, t, 'prod', include_self=False) + tensor([[10., 22., 36.], + [ 2., 2., 2.], + [ 7., 8., 9.], + [ 2., 2., 2.], + [ 4., 5., 6.]]) +""".format( + **reproducibility_notes + ), +) + +add_docstr_all( + "index_select", + r""" +index_select(dim, index) -> Tensor + +See :func:`torch.index_select` +""", +) + +add_docstr_all( + "sparse_mask", + r""" +sparse_mask(mask) -> Tensor + +Returns a new :ref:`sparse tensor ` with values from a +strided tensor :attr:`self` filtered by the indices of the sparse +tensor :attr:`mask`. The values of :attr:`mask` sparse tensor are +ignored. :attr:`self` and :attr:`mask` tensors must have the same +shape. + +.. note:: + + The returned sparse tensor might contain duplicate values if :attr:`mask` + is not coalesced. 
It is therefore advisable to pass ``mask.coalesce()`` + if such behavior is not desired. + +.. note:: + + The returned sparse tensor has the same indices as the sparse tensor + :attr:`mask`, even when the corresponding values in :attr:`self` are + zeros. + +Args: + mask (Tensor): a sparse tensor whose indices are used as a filter + +Example:: + + >>> nse = 5 + >>> dims = (5, 5, 2, 2) + >>> I = torch.cat([torch.randint(0, dims[0], size=(nse,)), + ... torch.randint(0, dims[1], size=(nse,))], 0).reshape(2, nse) + >>> V = torch.randn(nse, dims[2], dims[3]) + >>> S = torch.sparse_coo_tensor(I, V, dims).coalesce() + >>> D = torch.randn(dims) + >>> D.sparse_mask(S) + tensor(indices=tensor([[0, 0, 0, 2], + [0, 1, 4, 3]]), + values=tensor([[[ 1.6550, 0.2397], + [-0.1611, -0.0779]], + + [[ 0.2326, -1.0558], + [ 1.4711, 1.9678]], + + [[-0.5138, -0.0411], + [ 1.9417, 0.5158]], + + [[ 0.0793, 0.0036], + [-0.2569, -0.1055]]]), + size=(5, 5, 2, 2), nnz=4, layout=torch.sparse_coo) +""", +) + +add_docstr_all( + "inverse", + r""" +inverse() -> Tensor + +See :func:`torch.inverse` +""", +) + +add_docstr_all( + "isnan", + r""" +isnan() -> Tensor + +See :func:`torch.isnan` +""", +) + +add_docstr_all( + "isinf", + r""" +isinf() -> Tensor + +See :func:`torch.isinf` +""", +) + +add_docstr_all( + "isposinf", + r""" +isposinf() -> Tensor + +See :func:`torch.isposinf` +""", +) + +add_docstr_all( + "isneginf", + r""" +isneginf() -> Tensor + +See :func:`torch.isneginf` +""", +) + +add_docstr_all( + "isfinite", + r""" +isfinite() -> Tensor + +See :func:`torch.isfinite` +""", +) + +add_docstr_all( + "isclose", + r""" +isclose(other, rtol=1e-05, atol=1e-08, equal_nan=False) -> Tensor + +See :func:`torch.isclose` +""", +) + +add_docstr_all( + "isreal", + r""" +isreal() -> Tensor + +See :func:`torch.isreal` +""", +) + +add_docstr_all( + "is_coalesced", + r""" +is_coalesced() -> bool + +Returns ``True`` if :attr:`self` is a :ref:`sparse COO tensor +` that is coalesced, ``False`` otherwise. + +.. warning:: + Throws an error if :attr:`self` is not a sparse COO tensor. + +See :meth:`coalesce` and :ref:`uncoalesced tensors `. +""", +) + +add_docstr_all( + "is_contiguous", + r""" +is_contiguous(memory_format=torch.contiguous_format) -> bool + +Returns True if :attr:`self` tensor is contiguous in memory in the order specified +by memory format. + +Args: + memory_format (:class:`torch.memory_format`, optional): Specifies memory allocation + order. Default: ``torch.contiguous_format``. +""", +) + +add_docstr_all( + "is_pinned", + r""" +Returns true if this tensor resides in pinned memory. +""", +) + +add_docstr_all( + "is_floating_point", + r""" +is_floating_point() -> bool + +Returns True if the data type of :attr:`self` is a floating point data type. +""", +) + +add_docstr_all( + "is_complex", + r""" +is_complex() -> bool + +Returns True if the data type of :attr:`self` is a complex data type. +""", +) + +add_docstr_all( + "is_inference", + r""" +is_inference() -> bool + +See :func:`torch.is_inference` +""", +) + +add_docstr_all( + "is_conj", + r""" +is_conj() -> bool + +Returns True if the conjugate bit of :attr:`self` is set to true. +""", +) + +add_docstr_all( + "is_neg", + r""" +is_neg() -> bool + +Returns True if the negative bit of :attr:`self` is set to true. +""", +) + +add_docstr_all( + "is_signed", + r""" +is_signed() -> bool + +Returns True if the data type of :attr:`self` is a signed data type. 
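+
+For example (the dtypes below are chosen purely for illustration)::
+
+    >>> torch.tensor([1, 2, 3]).is_signed()
+    True
+    >>> torch.tensor([1, 2, 3], dtype=torch.uint8).is_signed()
+    False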
+""", +) + +add_docstr_all( + "is_set_to", + r""" +is_set_to(tensor) -> bool + +Returns True if both tensors are pointing to the exact same memory (same +storage, offset, size and stride). +""", +) + +add_docstr_all( + "item", + r""" +item() -> number + +Returns the value of this tensor as a standard Python number. This only works +for tensors with one element. For other cases, see :meth:`~Tensor.tolist`. + +This operation is not differentiable. + +Example:: + + >>> x = torch.tensor([1.0]) + >>> x.item() + 1.0 + +""", +) + +add_docstr_all( + "kron", + r""" +kron(other) -> Tensor + +See :func:`torch.kron` +""", +) + +add_docstr_all( + "kthvalue", + r""" +kthvalue(k, dim=None, keepdim=False) -> (Tensor, LongTensor) + +See :func:`torch.kthvalue` +""", +) + +add_docstr_all( + "ldexp", + r""" +ldexp(other) -> Tensor + +See :func:`torch.ldexp` +""", +) + +add_docstr_all( + "ldexp_", + r""" +ldexp_(other) -> Tensor + +In-place version of :meth:`~Tensor.ldexp` +""", +) + +add_docstr_all( + "lcm", + r""" +lcm(other) -> Tensor + +See :func:`torch.lcm` +""", +) + +add_docstr_all( + "lcm_", + r""" +lcm_(other) -> Tensor + +In-place version of :meth:`~Tensor.lcm` +""", +) + +add_docstr_all( + "le", + r""" +le(other) -> Tensor + +See :func:`torch.le`. +""", +) + +add_docstr_all( + "le_", + r""" +le_(other) -> Tensor + +In-place version of :meth:`~Tensor.le`. +""", +) + +add_docstr_all( + "less_equal", + r""" +less_equal(other) -> Tensor + +See :func:`torch.less_equal`. +""", +) + +add_docstr_all( + "less_equal_", + r""" +less_equal_(other) -> Tensor + +In-place version of :meth:`~Tensor.less_equal`. +""", +) + +add_docstr_all( + "lerp", + r""" +lerp(end, weight) -> Tensor + +See :func:`torch.lerp` +""", +) + +add_docstr_all( + "lerp_", + r""" +lerp_(end, weight) -> Tensor + +In-place version of :meth:`~Tensor.lerp` +""", +) + +add_docstr_all( + "lgamma", + r""" +lgamma() -> Tensor + +See :func:`torch.lgamma` +""", +) + +add_docstr_all( + "lgamma_", + r""" +lgamma_() -> Tensor + +In-place version of :meth:`~Tensor.lgamma` +""", +) + +add_docstr_all( + "log", + r""" +log() -> Tensor + +See :func:`torch.log` +""", +) + +add_docstr_all( + "log_", + r""" +log_() -> Tensor + +In-place version of :meth:`~Tensor.log` +""", +) + +add_docstr_all( + "log10", + r""" +log10() -> Tensor + +See :func:`torch.log10` +""", +) + +add_docstr_all( + "log10_", + r""" +log10_() -> Tensor + +In-place version of :meth:`~Tensor.log10` +""", +) + +add_docstr_all( + "log1p", + r""" +log1p() -> Tensor + +See :func:`torch.log1p` +""", +) + +add_docstr_all( + "log1p_", + r""" +log1p_() -> Tensor + +In-place version of :meth:`~Tensor.log1p` +""", +) + +add_docstr_all( + "log2", + r""" +log2() -> Tensor + +See :func:`torch.log2` +""", +) + +add_docstr_all( + "log2_", + r""" +log2_() -> Tensor + +In-place version of :meth:`~Tensor.log2` +""", +) + +add_docstr_all( + "logaddexp", + r""" +logaddexp(other) -> Tensor + +See :func:`torch.logaddexp` +""", +) + +add_docstr_all( + "logaddexp2", + r""" +logaddexp2(other) -> Tensor + +See :func:`torch.logaddexp2` +""", +) + +add_docstr_all( + "log_normal_", + r""" +log_normal_(mean=1, std=2, *, generator=None) + +Fills :attr:`self` tensor with numbers samples from the log-normal distribution +parameterized by the given mean :math:`\mu` and standard deviation +:math:`\sigma`. Note that :attr:`mean` and :attr:`std` are the mean and +standard deviation of the underlying normal distribution, and not of the +returned distribution: + +.. 
math:: + + f(x) = \dfrac{1}{x \sigma \sqrt{2\pi}}\ e^{-\frac{(\ln x - \mu)^2}{2\sigma^2}} +""", +) + +add_docstr_all( + "logsumexp", + r""" +logsumexp(dim, keepdim=False) -> Tensor + +See :func:`torch.logsumexp` +""", +) + +add_docstr_all( + "lt", + r""" +lt(other) -> Tensor + +See :func:`torch.lt`. +""", +) + +add_docstr_all( + "lt_", + r""" +lt_(other) -> Tensor + +In-place version of :meth:`~Tensor.lt`. +""", +) + +add_docstr_all( + "less", + r""" +lt(other) -> Tensor + +See :func:`torch.less`. +""", +) + +add_docstr_all( + "less_", + r""" +less_(other) -> Tensor + +In-place version of :meth:`~Tensor.less`. +""", +) + +add_docstr_all( + "lu_solve", + r""" +lu_solve(LU_data, LU_pivots) -> Tensor + +See :func:`torch.lu_solve` +""", +) + +add_docstr_all( + "map_", + r""" +map_(tensor, callable) + +Applies :attr:`callable` for each element in :attr:`self` tensor and the given +:attr:`tensor` and stores the results in :attr:`self` tensor. :attr:`self` tensor and +the given :attr:`tensor` must be :ref:`broadcastable `. + +The :attr:`callable` should have the signature:: + + def callable(a, b) -> number +""", +) + +add_docstr_all( + "masked_scatter_", + r""" +masked_scatter_(mask, source) + +Copies elements from :attr:`source` into :attr:`self` tensor at positions where +the :attr:`mask` is True. Elements from :attr:`source` are copied into :attr:`self` +starting at position 0 of :attr:`source` and continuing in order one-by-one for each +occurrence of :attr:`mask` being True. +The shape of :attr:`mask` must be :ref:`broadcastable ` +with the shape of the underlying tensor. The :attr:`source` should have at least +as many elements as the number of ones in :attr:`mask`. + +Args: + mask (BoolTensor): the boolean mask + source (Tensor): the tensor to copy from + +.. note:: + + The :attr:`mask` operates on the :attr:`self` tensor, not on the given + :attr:`source` tensor. + +Example: + + >>> self = torch.tensor([[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]) + >>> mask = torch.tensor([[0, 0, 0, 1, 1], [1, 1, 0, 1, 1]]) + >>> source = torch.tensor([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]) + >>> self.masked_scatter_(mask, source) + tensor([[0, 0, 0, 0, 1], + [2, 3, 0, 4, 5]]) + +""", +) + +add_docstr_all( + "masked_fill_", + r""" +masked_fill_(mask, value) + +Fills elements of :attr:`self` tensor with :attr:`value` where :attr:`mask` is +True. The shape of :attr:`mask` must be +:ref:`broadcastable ` with the shape of the underlying +tensor. + +Args: + mask (BoolTensor): the boolean mask + value (float): the value to fill in with +""", +) + +add_docstr_all( + "masked_select", + r""" +masked_select(mask) -> Tensor + +See :func:`torch.masked_select` +""", +) + +add_docstr_all( + "matrix_power", + r""" +matrix_power(n) -> Tensor + +.. note:: :meth:`~Tensor.matrix_power` is deprecated, use :func:`torch.linalg.matrix_power` instead. 
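+
+For example, the recommended replacement can be invoked as follows (the
+matrix below is chosen purely for illustration)::
+
+    >>> a = 2 * torch.eye(2)
+    >>> torch.linalg.matrix_power(a, 3)
+    tensor([[8., 0.],
+            [0., 8.]])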
+ +Alias for :func:`torch.linalg.matrix_power` +""", +) + +add_docstr_all( + "matrix_exp", + r""" +matrix_exp() -> Tensor + +See :func:`torch.matrix_exp` +""", +) + +add_docstr_all( + "max", + r""" +max(dim=None, keepdim=False) -> Tensor or (Tensor, Tensor) + +See :func:`torch.max` +""", +) + +add_docstr_all( + "amax", + r""" +amax(dim=None, keepdim=False) -> Tensor + +See :func:`torch.amax` +""", +) + +add_docstr_all( + "maximum", + r""" +maximum(other) -> Tensor + +See :func:`torch.maximum` +""", +) + +add_docstr_all( + "fmax", + r""" +fmax(other) -> Tensor + +See :func:`torch.fmax` +""", +) + +add_docstr_all( + "argmax", + r""" +argmax(dim=None, keepdim=False) -> LongTensor + +See :func:`torch.argmax` +""", +) + +add_docstr_all( + "argwhere", + r""" +argwhere() -> Tensor + +See :func:`torch.argwhere` +""", +) + +add_docstr_all( + "mean", + r""" +mean(dim=None, keepdim=False, *, dtype=None) -> Tensor + +See :func:`torch.mean` +""", +) + +add_docstr_all( + "nanmean", + r""" +nanmean(dim=None, keepdim=False, *, dtype=None) -> Tensor + +See :func:`torch.nanmean` +""", +) + +add_docstr_all( + "median", + r""" +median(dim=None, keepdim=False) -> (Tensor, LongTensor) + +See :func:`torch.median` +""", +) + +add_docstr_all( + "nanmedian", + r""" +nanmedian(dim=None, keepdim=False) -> (Tensor, LongTensor) + +See :func:`torch.nanmedian` +""", +) + +add_docstr_all( + "min", + r""" +min(dim=None, keepdim=False) -> Tensor or (Tensor, Tensor) + +See :func:`torch.min` +""", +) + +add_docstr_all( + "amin", + r""" +amin(dim=None, keepdim=False) -> Tensor + +See :func:`torch.amin` +""", +) + +add_docstr_all( + "minimum", + r""" +minimum(other) -> Tensor + +See :func:`torch.minimum` +""", +) + +add_docstr_all( + "aminmax", + r""" +aminmax(*, dim=None, keepdim=False) -> (Tensor min, Tensor max) + +See :func:`torch.aminmax` +""", +) + +add_docstr_all( + "fmin", + r""" +fmin(other) -> Tensor + +See :func:`torch.fmin` +""", +) + +add_docstr_all( + "argmin", + r""" +argmin(dim=None, keepdim=False) -> LongTensor + +See :func:`torch.argmin` +""", +) + +add_docstr_all( + "mm", + r""" +mm(mat2) -> Tensor + +See :func:`torch.mm` +""", +) + +add_docstr_all( + "mode", + r""" +mode(dim=None, keepdim=False) -> (Tensor, LongTensor) + +See :func:`torch.mode` +""", +) + +add_docstr_all( + "movedim", + r""" +movedim(source, destination) -> Tensor + +See :func:`torch.movedim` +""", +) + +add_docstr_all( + "moveaxis", + r""" +moveaxis(source, destination) -> Tensor + +See :func:`torch.moveaxis` +""", +) + +add_docstr_all( + "mul", + r""" +mul(value) -> Tensor + +See :func:`torch.mul`. +""", +) + +add_docstr_all( + "mul_", + r""" +mul_(value) -> Tensor + +In-place version of :meth:`~Tensor.mul`. +""", +) + +add_docstr_all( + "multiply", + r""" +multiply(value) -> Tensor + +See :func:`torch.multiply`. +""", +) + +add_docstr_all( + "multiply_", + r""" +multiply_(value) -> Tensor + +In-place version of :meth:`~Tensor.multiply`. +""", +) + +add_docstr_all( + "multinomial", + r""" +multinomial(num_samples, replacement=False, *, generator=None) -> Tensor + +See :func:`torch.multinomial` +""", +) + +add_docstr_all( + "mv", + r""" +mv(vec) -> Tensor + +See :func:`torch.mv` +""", +) + +add_docstr_all( + "mvlgamma", + r""" +mvlgamma(p) -> Tensor + +See :func:`torch.mvlgamma` +""", +) + +add_docstr_all( + "mvlgamma_", + r""" +mvlgamma_(p) -> Tensor + +In-place version of :meth:`~Tensor.mvlgamma` +""", +) + +add_docstr_all( + "narrow", + r""" +narrow(dimension, start, length) -> Tensor + +See :func:`torch.narrow`. 
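+
+A short illustration (values chosen only for this sketch)::
+
+    >>> x = torch.arange(1, 10).reshape(3, 3)
+    >>> x.narrow(0, 0, 2)
+    tensor([[1, 2, 3],
+            [4, 5, 6]])
+    >>> x.narrow(1, 1, 2)
+    tensor([[2, 3],
+            [5, 6],
+            [8, 9]])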
+""", +) + +add_docstr_all( + "narrow_copy", + r""" +narrow_copy(dimension, start, length) -> Tensor + +See :func:`torch.narrow_copy`. +""", +) + +add_docstr_all( + "ndimension", + r""" +ndimension() -> int + +Alias for :meth:`~Tensor.dim()` +""", +) + +add_docstr_all( + "nan_to_num", + r""" +nan_to_num(nan=0.0, posinf=None, neginf=None) -> Tensor + +See :func:`torch.nan_to_num`. +""", +) + +add_docstr_all( + "nan_to_num_", + r""" +nan_to_num_(nan=0.0, posinf=None, neginf=None) -> Tensor + +In-place version of :meth:`~Tensor.nan_to_num`. +""", +) + +add_docstr_all( + "ne", + r""" +ne(other) -> Tensor + +See :func:`torch.ne`. +""", +) + +add_docstr_all( + "ne_", + r""" +ne_(other) -> Tensor + +In-place version of :meth:`~Tensor.ne`. +""", +) + +add_docstr_all( + "not_equal", + r""" +not_equal(other) -> Tensor + +See :func:`torch.not_equal`. +""", +) + +add_docstr_all( + "not_equal_", + r""" +not_equal_(other) -> Tensor + +In-place version of :meth:`~Tensor.not_equal`. +""", +) + +add_docstr_all( + "neg", + r""" +neg() -> Tensor + +See :func:`torch.neg` +""", +) + +add_docstr_all( + "negative", + r""" +negative() -> Tensor + +See :func:`torch.negative` +""", +) + +add_docstr_all( + "neg_", + r""" +neg_() -> Tensor + +In-place version of :meth:`~Tensor.neg` +""", +) + +add_docstr_all( + "negative_", + r""" +negative_() -> Tensor + +In-place version of :meth:`~Tensor.negative` +""", +) + +add_docstr_all( + "nelement", + r""" +nelement() -> int + +Alias for :meth:`~Tensor.numel` +""", +) + +add_docstr_all( + "nextafter", + r""" +nextafter(other) -> Tensor +See :func:`torch.nextafter` +""", +) + +add_docstr_all( + "nextafter_", + r""" +nextafter_(other) -> Tensor +In-place version of :meth:`~Tensor.nextafter` +""", +) + +add_docstr_all( + "nonzero", + r""" +nonzero() -> LongTensor + +See :func:`torch.nonzero` +""", +) + +add_docstr_all( + "nonzero_static", + r""" +nonzero_static(input, *, size, fill_value=-1) -> Tensor + +Returns a 2-D tensor where each row is the index for a non-zero value. +The returned Tensor has the same `torch.dtype` as `torch.nonzero()`. + +Args: + input (Tensor): the input tensor to count non-zero elements. + +Keyword args: + size (int): the size of non-zero elements expected to be included in the out + tensor. Pad the out tensor with `fill_value` if the `size` is larger + than total number of non-zero elements, truncate out tensor if `size` + is smaller. The size must be a non-negative integer. + fill_value (int): the value to fill the output tensor with when `size` is larger + than the total number of non-zero elements. Default is `-1` to represent + invalid index. 
+ +Example: + + # Example 1: Padding + >>> input_tensor = torch.tensor([[1, 0], [3, 2]]) + >>> static_size = 4 + >>> t = torch.nonzero_static(input_tensor, size = static_size) + tensor([[ 0, 0], + [ 1, 0], + [ 1, 1], + [ -1, -1]], dtype=torch.int64) + + # Example 2: Truncating + >>> input_tensor = torch.tensor([[1, 0], [3, 2]]) + >>> static_size = 2 + >>> t = torch.nonzero_static(input_tensor, size = static_size) + tensor([[ 0, 0], + [ 1, 0]], dtype=torch.int64) + + # Example 3: 0 size + >>> input_tensor = torch.tensor([10]) + >>> static_size = 0 + >>> t = torch.nonzero_static(input_tensor, size = static_size) + tensor([], size=(0, 1), dtype=torch.int64) + + # Example 4: 0 rank input + >>> input_tensor = torch.tensor(10) + >>> static_size = 2 + >>> t = torch.nonzero_static(input_tensor, size = static_size) + tensor([], size=(2, 0), dtype=torch.int64) +""", +) + +add_docstr_all( + "norm", + r""" +norm(p=2, dim=None, keepdim=False) -> Tensor + +See :func:`torch.norm` +""", +) + +add_docstr_all( + "normal_", + r""" +normal_(mean=0, std=1, *, generator=None) -> Tensor + +Fills :attr:`self` tensor with elements samples from the normal distribution +parameterized by :attr:`mean` and :attr:`std`. +""", +) + +add_docstr_all( + "numel", + r""" +numel() -> int + +See :func:`torch.numel` +""", +) + +add_docstr_all( + "numpy", + r""" +numpy(*, force=False) -> numpy.ndarray + +Returns the tensor as a NumPy :class:`ndarray`. + +If :attr:`force` is ``False`` (the default), the conversion +is performed only if the tensor is on the CPU, does not require grad, +does not have its conjugate bit set, and is a dtype and layout that +NumPy supports. The returned ndarray and the tensor will share their +storage, so changes to the tensor will be reflected in the ndarray +and vice versa. + +If :attr:`force` is ``True`` this is equivalent to +calling ``t.detach().cpu().resolve_conj().resolve_neg().numpy()``. +If the tensor isn't on the CPU or the conjugate or negative bit is set, +the tensor won't share its storage with the returned ndarray. +Setting :attr:`force` to ``True`` can be a useful shorthand. + +Args: + force (bool): if ``True``, the ndarray may be a copy of the tensor + instead of always sharing memory, defaults to ``False``. 
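+
+Example (a minimal sketch of the default storage-sharing behavior; the
+values are illustrative only)::
+
+    >>> t = torch.zeros(3)
+    >>> a = t.numpy()
+    >>> a
+    array([0., 0., 0.], dtype=float32)
+    >>> t[0] = 1.
+    >>> a
+    array([1., 0., 0.], dtype=float32)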
+""", +) + +add_docstr_all( + "orgqr", + r""" +orgqr(input2) -> Tensor + +See :func:`torch.orgqr` +""", +) + +add_docstr_all( + "ormqr", + r""" +ormqr(input2, input3, left=True, transpose=False) -> Tensor + +See :func:`torch.ormqr` +""", +) + +add_docstr_all( + "permute", + r""" +permute(*dims) -> Tensor + +See :func:`torch.permute` +""", +) + +add_docstr_all( + "polygamma", + r""" +polygamma(n) -> Tensor + +See :func:`torch.polygamma` +""", +) + +add_docstr_all( + "polygamma_", + r""" +polygamma_(n) -> Tensor + +In-place version of :meth:`~Tensor.polygamma` +""", +) + +add_docstr_all( + "positive", + r""" +positive() -> Tensor + +See :func:`torch.positive` +""", +) + +add_docstr_all( + "pow", + r""" +pow(exponent) -> Tensor + +See :func:`torch.pow` +""", +) + +add_docstr_all( + "pow_", + r""" +pow_(exponent) -> Tensor + +In-place version of :meth:`~Tensor.pow` +""", +) + +add_docstr_all( + "float_power", + r""" +float_power(exponent) -> Tensor + +See :func:`torch.float_power` +""", +) + +add_docstr_all( + "float_power_", + r""" +float_power_(exponent) -> Tensor + +In-place version of :meth:`~Tensor.float_power` +""", +) + +add_docstr_all( + "prod", + r""" +prod(dim=None, keepdim=False, dtype=None) -> Tensor + +See :func:`torch.prod` +""", +) + +add_docstr_all( + "put_", + r""" +put_(index, source, accumulate=False) -> Tensor + +Copies the elements from :attr:`source` into the positions specified by +:attr:`index`. For the purpose of indexing, the :attr:`self` tensor is treated as if +it were a 1-D tensor. + +:attr:`index` and :attr:`source` need to have the same number of elements, but not necessarily +the same shape. + +If :attr:`accumulate` is ``True``, the elements in :attr:`source` are added to +:attr:`self`. If accumulate is ``False``, the behavior is undefined if :attr:`index` +contain duplicate elements. + +Args: + index (LongTensor): the indices into self + source (Tensor): the tensor containing values to copy from + accumulate (bool): whether to accumulate into self + +Example:: + + >>> src = torch.tensor([[4, 3, 5], + ... [6, 7, 8]]) + >>> src.put_(torch.tensor([1, 3]), torch.tensor([9, 10])) + tensor([[ 4, 9, 5], + [ 10, 7, 8]]) +""", +) + +add_docstr_all( + "put", + r""" +put(input, index, source, accumulate=False) -> Tensor + +Out-of-place version of :meth:`torch.Tensor.put_`. +`input` corresponds to `self` in :meth:`torch.Tensor.put_`. +""", +) + +add_docstr_all( + "qr", + r""" +qr(some=True) -> (Tensor, Tensor) + +See :func:`torch.qr` +""", +) + +add_docstr_all( + "qscheme", + r""" +qscheme() -> torch.qscheme + +Returns the quantization scheme of a given QTensor. +""", +) + +add_docstr_all( + "quantile", + r""" +quantile(q, dim=None, keepdim=False, *, interpolation='linear') -> Tensor + +See :func:`torch.quantile` +""", +) + +add_docstr_all( + "nanquantile", + r""" +nanquantile(q, dim=None, keepdim=False, *, interpolation='linear') -> Tensor + +See :func:`torch.nanquantile` +""", +) + +add_docstr_all( + "q_scale", + r""" +q_scale() -> float + +Given a Tensor quantized by linear(affine) quantization, +returns the scale of the underlying quantizer(). +""", +) + +add_docstr_all( + "q_zero_point", + r""" +q_zero_point() -> int + +Given a Tensor quantized by linear(affine) quantization, +returns the zero_point of the underlying quantizer(). +""", +) + +add_docstr_all( + "q_per_channel_scales", + r""" +q_per_channel_scales() -> Tensor + +Given a Tensor quantized by linear (affine) per-channel quantization, +returns a Tensor of scales of the underlying quantizer. 
It has the number of +elements that matches the corresponding dimensions (from q_per_channel_axis) of +the tensor. +""", +) + +add_docstr_all( + "q_per_channel_zero_points", + r""" +q_per_channel_zero_points() -> Tensor + +Given a Tensor quantized by linear (affine) per-channel quantization, +returns a tensor of zero_points of the underlying quantizer. It has the number of +elements that matches the corresponding dimensions (from q_per_channel_axis) of +the tensor. +""", +) + +add_docstr_all( + "q_per_channel_axis", + r""" +q_per_channel_axis() -> int + +Given a Tensor quantized by linear (affine) per-channel quantization, +returns the index of dimension on which per-channel quantization is applied. +""", +) + +add_docstr_all( + "random_", + r""" +random_(from=0, to=None, *, generator=None) -> Tensor + +Fills :attr:`self` tensor with numbers sampled from the discrete uniform +distribution over ``[from, to - 1]``. If not specified, the values are usually +only bounded by :attr:`self` tensor's data type. However, for floating point +types, if unspecified, range will be ``[0, 2^mantissa]`` to ensure that every +value is representable. For example, `torch.tensor(1, dtype=torch.double).random_()` +will be uniform in ``[0, 2^53]``. +""", +) + +add_docstr_all( + "rad2deg", + r""" +rad2deg() -> Tensor + +See :func:`torch.rad2deg` +""", +) + +add_docstr_all( + "rad2deg_", + r""" +rad2deg_() -> Tensor + +In-place version of :meth:`~Tensor.rad2deg` +""", +) + +add_docstr_all( + "deg2rad", + r""" +deg2rad() -> Tensor + +See :func:`torch.deg2rad` +""", +) + +add_docstr_all( + "deg2rad_", + r""" +deg2rad_() -> Tensor + +In-place version of :meth:`~Tensor.deg2rad` +""", +) + +add_docstr_all( + "ravel", + r""" +ravel() -> Tensor + +see :func:`torch.ravel` +""", +) + +add_docstr_all( + "reciprocal", + r""" +reciprocal() -> Tensor + +See :func:`torch.reciprocal` +""", +) + +add_docstr_all( + "reciprocal_", + r""" +reciprocal_() -> Tensor + +In-place version of :meth:`~Tensor.reciprocal` +""", +) + +add_docstr_all( + "record_stream", + r""" +record_stream(stream) + +Marks the tensor as having been used by this stream. When the tensor +is deallocated, ensure the tensor memory is not reused for another tensor +until all work queued on :attr:`stream` at the time of deallocation is +complete. + +.. note:: + + The caching allocator is aware of only the stream where a tensor was + allocated. Due to the awareness, it already correctly manages the life + cycle of tensors on only one stream. But if a tensor is used on a stream + different from the stream of origin, the allocator might reuse the memory + unexpectedly. Calling this method lets the allocator know which streams + have used the tensor. + +.. warning:: + + This method is most suitable for use cases where you are providing a + function that created a tensor on a side stream, and want users to be able + to make use of the tensor without having to think carefully about stream + safety when making use of them. These safety guarantees come at some + performance and predictability cost (analogous to the tradeoff between GC + and manual memory management), so if you are in a situation where + you manage the full lifetime of your tensors, you may consider instead + manually managing CUDA events so that calling this method is not necessary. 
+ In particular, when you call this method, on later allocations the + allocator will poll the recorded stream to see if all operations have + completed yet; you can potentially race with side stream computation and + non-deterministically reuse or fail to reuse memory for an allocation. + + You can safely use tensors allocated on side streams without + :meth:`~Tensor.record_stream`; you must manually ensure that + any non-creation stream uses of a tensor are synced back to the creation + stream before you deallocate the tensor. As the CUDA caching allocator + guarantees that the memory will only be reused with the same creation stream, + this is sufficient to ensure that writes to future reallocations of the + memory will be delayed until non-creation stream uses are done. + (Counterintuitively, you may observe that on the CPU side we have already + reallocated the tensor, even though CUDA kernels on the old tensor are + still in progress. This is fine, because CUDA operations on the new + tensor will appropriately wait for the old operations to complete, as they + are all on the same stream.) + + Concretely, this looks like this:: + + with torch.cuda.stream(s0): + x = torch.zeros(N) + + s1.wait_stream(s0) + with torch.cuda.stream(s1): + y = some_comm_op(x) + + ... some compute on s0 ... + + # synchronize creation stream s0 to side stream s1 + # before deallocating x + s0.wait_stream(s1) + del x + + Note that some discretion is required when deciding when to perform + ``s0.wait_stream(s1)``. In particular, if we were to wait immediately + after ``some_comm_op``, there wouldn't be any point in having the side + stream; it would be equivalent to have run ``some_comm_op`` on ``s0``. + Instead, the synchronization must be placed at some appropriate, later + point in time where you expect the side stream ``s1`` to have finished + work. This location is typically identified via profiling, e.g., using + Chrome traces produced + :meth:`torch.autograd.profiler.profile.export_chrome_trace`. If you + place the wait too early, work on s0 will block until ``s1`` has finished, + preventing further overlapping of communication and computation. If you + place the wait too late, you will use more memory than is strictly + necessary (as you are keeping ``x`` live for longer.) For a concrete + example of how this guidance can be applied in practice, see this post: + `FSDP and CUDACachingAllocator + `_. +""", +) + +add_docstr_all( + "remainder", + r""" +remainder(divisor) -> Tensor + +See :func:`torch.remainder` +""", +) + +add_docstr_all( + "remainder_", + r""" +remainder_(divisor) -> Tensor + +In-place version of :meth:`~Tensor.remainder` +""", +) + +add_docstr_all( + "renorm", + r""" +renorm(p, dim, maxnorm) -> Tensor + +See :func:`torch.renorm` +""", +) + +add_docstr_all( + "renorm_", + r""" +renorm_(p, dim, maxnorm) -> Tensor + +In-place version of :meth:`~Tensor.renorm` +""", +) + +add_docstr_all( + "repeat", + r""" +repeat(*sizes) -> Tensor + +Repeats this tensor along the specified dimensions. + +Unlike :meth:`~Tensor.expand`, this function copies the tensor's data. + +.. warning:: + + :meth:`~Tensor.repeat` behaves differently from + `numpy.repeat `_, + but is more similar to + `numpy.tile `_. + For the operator similar to `numpy.repeat`, see :func:`torch.repeat_interleave`. 
+ +Args: + sizes (torch.Size or int...): The number of times to repeat this tensor along each + dimension + +Example:: + + >>> x = torch.tensor([1, 2, 3]) + >>> x.repeat(4, 2) + tensor([[ 1, 2, 3, 1, 2, 3], + [ 1, 2, 3, 1, 2, 3], + [ 1, 2, 3, 1, 2, 3], + [ 1, 2, 3, 1, 2, 3]]) + >>> x.repeat(4, 2, 1).size() + torch.Size([4, 2, 3]) +""", +) + +add_docstr_all( + "repeat_interleave", + r""" +repeat_interleave(repeats, dim=None, *, output_size=None) -> Tensor + +See :func:`torch.repeat_interleave`. +""", +) + +add_docstr_all( + "requires_grad_", + r""" +requires_grad_(requires_grad=True) -> Tensor + +Change if autograd should record operations on this tensor: sets this tensor's +:attr:`requires_grad` attribute in-place. Returns this tensor. + +:func:`requires_grad_`'s main use case is to tell autograd to begin recording +operations on a Tensor ``tensor``. If ``tensor`` has ``requires_grad=False`` +(because it was obtained through a DataLoader, or required preprocessing or +initialization), ``tensor.requires_grad_()`` makes it so that autograd will +begin to record operations on ``tensor``. + +Args: + requires_grad (bool): If autograd should record operations on this tensor. + Default: ``True``. + +Example:: + + >>> # Let's say we want to preprocess some saved weights and use + >>> # the result as new weights. + >>> saved_weights = [0.1, 0.2, 0.3, 0.25] + >>> loaded_weights = torch.tensor(saved_weights) + >>> weights = preprocess(loaded_weights) # some function + >>> weights + tensor([-0.5503, 0.4926, -2.1158, -0.8303]) + + >>> # Now, start to record operations done to weights + >>> weights.requires_grad_() + >>> out = weights.pow(2).sum() + >>> out.backward() + >>> weights.grad + tensor([-1.1007, 0.9853, -4.2316, -1.6606]) + +""", +) + +add_docstr_all( + "reshape", + r""" +reshape(*shape) -> Tensor + +Returns a tensor with the same data and number of elements as :attr:`self` +but with the specified shape. This method returns a view if :attr:`shape` is +compatible with the current shape. See :meth:`torch.Tensor.view` on when it is +possible to return a view. + +See :func:`torch.reshape` + +Args: + shape (tuple of ints or int...): the desired shape + +""", +) + +add_docstr_all( + "reshape_as", + r""" +reshape_as(other) -> Tensor + +Returns this tensor as the same shape as :attr:`other`. +``self.reshape_as(other)`` is equivalent to ``self.reshape(other.sizes())``. +This method returns a view if ``other.sizes()`` is compatible with the current +shape. See :meth:`torch.Tensor.view` on when it is possible to return a view. + +Please see :meth:`reshape` for more information about ``reshape``. + +Args: + other (:class:`torch.Tensor`): The result tensor has the same shape + as :attr:`other`. +""", +) + +add_docstr_all( + "resize_", + r""" +resize_(*sizes, memory_format=torch.contiguous_format) -> Tensor + +Resizes :attr:`self` tensor to the specified size. If the number of elements is +larger than the current storage size, then the underlying storage is resized +to fit the new number of elements. If the number of elements is smaller, the +underlying storage is not changed. Existing elements are preserved but any new +memory is uninitialized. + +.. warning:: + + This is a low-level method. The storage is reinterpreted as C-contiguous, + ignoring the current strides (unless the target size equals the current + size, in which case the tensor is left unchanged). 
For most purposes, you + will instead want to use :meth:`~Tensor.view()`, which checks for + contiguity, or :meth:`~Tensor.reshape()`, which copies data if needed. To + change the size in-place with custom strides, see :meth:`~Tensor.set_()`. + +.. note:: + + If :func:`torch.use_deterministic_algorithms()` and + :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to + ``True``, new elements are initialized to prevent nondeterministic behavior + from using the result as an input to an operation. Floating point and + complex values are set to NaN, and integer values are set to the maximum + value. + +Args: + sizes (torch.Size or int...): the desired size + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + Tensor. Default: ``torch.contiguous_format``. Note that memory format of + :attr:`self` is going to be unaffected if ``self.size()`` matches ``sizes``. + +Example:: + + >>> x = torch.tensor([[1, 2], [3, 4], [5, 6]]) + >>> x.resize_(2, 2) + tensor([[ 1, 2], + [ 3, 4]]) +""", +) + +add_docstr_all( + "resize_as_", + r""" +resize_as_(tensor, memory_format=torch.contiguous_format) -> Tensor + +Resizes the :attr:`self` tensor to be the same size as the specified +:attr:`tensor`. This is equivalent to ``self.resize_(tensor.size())``. + +Args: + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + Tensor. Default: ``torch.contiguous_format``. Note that memory format of + :attr:`self` is going to be unaffected if ``self.size()`` matches ``tensor.size()``. + +""", +) + +add_docstr_all( + "rot90", + r""" +rot90(k, dims) -> Tensor + +See :func:`torch.rot90` +""", +) + +add_docstr_all( + "round", + r""" +round(decimals=0) -> Tensor + +See :func:`torch.round` +""", +) + +add_docstr_all( + "round_", + r""" +round_(decimals=0) -> Tensor + +In-place version of :meth:`~Tensor.round` +""", +) + +add_docstr_all( + "rsqrt", + r""" +rsqrt() -> Tensor + +See :func:`torch.rsqrt` +""", +) + +add_docstr_all( + "rsqrt_", + r""" +rsqrt_() -> Tensor + +In-place version of :meth:`~Tensor.rsqrt` +""", +) + +add_docstr_all( + "scatter_", + r""" +scatter_(dim, index, src, *, reduce=None) -> Tensor + +Writes all values from the tensor :attr:`src` into :attr:`self` at the indices +specified in the :attr:`index` tensor. For each value in :attr:`src`, its output +index is specified by its index in :attr:`src` for ``dimension != dim`` and by +the corresponding value in :attr:`index` for ``dimension = dim``. + +For a 3-D tensor, :attr:`self` is updated as:: + + self[index[i][j][k]][j][k] = src[i][j][k] # if dim == 0 + self[i][index[i][j][k]][k] = src[i][j][k] # if dim == 1 + self[i][j][index[i][j][k]] = src[i][j][k] # if dim == 2 + +This is the reverse operation of the manner described in :meth:`~Tensor.gather`. + +:attr:`self`, :attr:`index` and :attr:`src` (if it is a Tensor) should all have +the same number of dimensions. It is also required that +``index.size(d) <= src.size(d)`` for all dimensions ``d``, and that +``index.size(d) <= self.size(d)`` for all dimensions ``d != dim``. +Note that ``index`` and ``src`` do not broadcast. + +Moreover, as for :meth:`~Tensor.gather`, the values of :attr:`index` must be +between ``0`` and ``self.size(dim) - 1`` inclusive. + +.. warning:: + + When indices are not unique, the behavior is non-deterministic (one of the + values from ``src`` will be picked arbitrarily) and the gradient will be + incorrect (it will be propagated to all locations in the source that + correspond to the same index)! + +.. 
note:: + + The backward pass is implemented only for ``src.shape == index.shape``. + +Additionally accepts an optional :attr:`reduce` argument that allows +specification of an optional reduction operation, which is applied to all +values in the tensor :attr:`src` into :attr:`self` at the indices +specified in the :attr:`index`. For each value in :attr:`src`, the reduction +operation is applied to an index in :attr:`self` which is specified by +its index in :attr:`src` for ``dimension != dim`` and by the corresponding +value in :attr:`index` for ``dimension = dim``. + +Given a 3-D tensor and reduction using the multiplication operation, :attr:`self` +is updated as:: + + self[index[i][j][k]][j][k] *= src[i][j][k] # if dim == 0 + self[i][index[i][j][k]][k] *= src[i][j][k] # if dim == 1 + self[i][j][index[i][j][k]] *= src[i][j][k] # if dim == 2 + +Reducing with the addition operation is the same as using +:meth:`~torch.Tensor.scatter_add_`. + +.. warning:: + The reduce argument with Tensor ``src`` is deprecated and will be removed in + a future PyTorch release. Please use :meth:`~torch.Tensor.scatter_reduce_` + instead for more reduction options. + +Args: + dim (int): the axis along which to index + index (LongTensor): the indices of elements to scatter, can be either empty + or of the same dimensionality as ``src``. When empty, the operation + returns ``self`` unchanged. + src (Tensor): the source element(s) to scatter. + +Keyword args: + reduce (str, optional): reduction operation to apply, can be either + ``'add'`` or ``'multiply'``. + +Example:: + + >>> src = torch.arange(1, 11).reshape((2, 5)) + >>> src + tensor([[ 1, 2, 3, 4, 5], + [ 6, 7, 8, 9, 10]]) + >>> index = torch.tensor([[0, 1, 2, 0]]) + >>> torch.zeros(3, 5, dtype=src.dtype).scatter_(0, index, src) + tensor([[1, 0, 0, 4, 0], + [0, 2, 0, 0, 0], + [0, 0, 3, 0, 0]]) + >>> index = torch.tensor([[0, 1, 2], [0, 1, 4]]) + >>> torch.zeros(3, 5, dtype=src.dtype).scatter_(1, index, src) + tensor([[1, 2, 3, 0, 0], + [6, 7, 0, 0, 8], + [0, 0, 0, 0, 0]]) + + >>> torch.full((2, 4), 2.).scatter_(1, torch.tensor([[2], [3]]), + ... 1.23, reduce='multiply') + tensor([[2.0000, 2.0000, 2.4600, 2.0000], + [2.0000, 2.0000, 2.0000, 2.4600]]) + >>> torch.full((2, 4), 2.).scatter_(1, torch.tensor([[2], [3]]), + ... 1.23, reduce='add') + tensor([[2.0000, 2.0000, 3.2300, 2.0000], + [2.0000, 2.0000, 2.0000, 3.2300]]) + +.. function:: scatter_(dim, index, value, *, reduce=None) -> Tensor: + :noindex: + +Writes the value from :attr:`value` into :attr:`self` at the indices +specified in the :attr:`index` tensor. This operation is equivalent to the previous version, +with the :attr:`src` tensor filled entirely with :attr:`value`. + +Args: + dim (int): the axis along which to index + index (LongTensor): the indices of elements to scatter, can be either empty + or of the same dimensionality as ``src``. When empty, the operation + returns ``self`` unchanged. + value (Scalar): the value to scatter. + +Keyword args: + reduce (str, optional): reduction operation to apply, can be either + ``'add'`` or ``'multiply'``. 
+ +Example:: + + >>> index = torch.tensor([[0, 1]]) + >>> value = 2 + >>> torch.zeros(3, 5).scatter_(0, index, value) + tensor([[2., 0., 0., 0., 0.], + [0., 2., 0., 0., 0.], + [0., 0., 0., 0., 0.]]) +""", +) + +add_docstr_all( + "scatter_add_", + r""" +scatter_add_(dim, index, src) -> Tensor + +Adds all values from the tensor :attr:`src` into :attr:`self` at the indices +specified in the :attr:`index` tensor in a similar fashion as +:meth:`~torch.Tensor.scatter_`. For each value in :attr:`src`, it is added to +an index in :attr:`self` which is specified by its index in :attr:`src` +for ``dimension != dim`` and by the corresponding value in :attr:`index` for +``dimension = dim``. + +For a 3-D tensor, :attr:`self` is updated as:: + + self[index[i][j][k]][j][k] += src[i][j][k] # if dim == 0 + self[i][index[i][j][k]][k] += src[i][j][k] # if dim == 1 + self[i][j][index[i][j][k]] += src[i][j][k] # if dim == 2 + +:attr:`self`, :attr:`index` and :attr:`src` should have same number of +dimensions. It is also required that ``index.size(d) <= src.size(d)`` for all +dimensions ``d``, and that ``index.size(d) <= self.size(d)`` for all dimensions +``d != dim``. Note that ``index`` and ``src`` do not broadcast. + +Note: + {forward_reproducibility_note} + +.. note:: + + The backward pass is implemented only for ``src.shape == index.shape``. + +Args: + dim (int): the axis along which to index + index (LongTensor): the indices of elements to scatter and add, can be + either empty or of the same dimensionality as ``src``. When empty, the + operation returns ``self`` unchanged. + src (Tensor): the source elements to scatter and add + +Example:: + + >>> src = torch.ones((2, 5)) + >>> index = torch.tensor([[0, 1, 2, 0, 0]]) + >>> torch.zeros(3, 5, dtype=src.dtype).scatter_add_(0, index, src) + tensor([[1., 0., 0., 1., 1.], + [0., 1., 0., 0., 0.], + [0., 0., 1., 0., 0.]]) + >>> index = torch.tensor([[0, 1, 2, 0, 0], [0, 1, 2, 2, 2]]) + >>> torch.zeros(3, 5, dtype=src.dtype).scatter_add_(0, index, src) + tensor([[2., 0., 0., 1., 1.], + [0., 2., 0., 0., 0.], + [0., 0., 2., 1., 1.]]) + +""".format( + **reproducibility_notes + ), +) + +add_docstr_all( + "scatter_reduce_", + r""" +scatter_reduce_(dim, index, src, reduce, *, include_self=True) -> Tensor + +Reduces all values from the :attr:`src` tensor to the indices specified in +the :attr:`index` tensor in the :attr:`self` tensor using the applied reduction +defined via the :attr:`reduce` argument (:obj:`"sum"`, :obj:`"prod"`, :obj:`"mean"`, +:obj:`"amax"`, :obj:`"amin"`). For each value in :attr:`src`, it is reduced to an +index in :attr:`self` which is specified by its index in :attr:`src` for +``dimension != dim`` and by the corresponding value in :attr:`index` for +``dimension = dim``. If :obj:`include_self="True"`, the values in the :attr:`self` +tensor are included in the reduction. + +:attr:`self`, :attr:`index` and :attr:`src` should all have +the same number of dimensions. It is also required that +``index.size(d) <= src.size(d)`` for all dimensions ``d``, and that +``index.size(d) <= self.size(d)`` for all dimensions ``d != dim``. +Note that ``index`` and ``src`` do not broadcast. + +For a 3-D tensor with :obj:`reduce="sum"` and :obj:`include_self=True` the +output is given as:: + + self[index[i][j][k]][j][k] += src[i][j][k] # if dim == 0 + self[i][index[i][j][k]][k] += src[i][j][k] # if dim == 1 + self[i][j][index[i][j][k]] += src[i][j][k] # if dim == 2 + +Note: + {forward_reproducibility_note} + +.. 
note:: + + The backward pass is implemented only for ``src.shape == index.shape``. + +.. warning:: + + This function is in beta and may change in the near future. + +Args: + dim (int): the axis along which to index + index (LongTensor): the indices of elements to scatter and reduce. + src (Tensor): the source elements to scatter and reduce + reduce (str): the reduction operation to apply for non-unique indices + (:obj:`"sum"`, :obj:`"prod"`, :obj:`"mean"`, :obj:`"amax"`, :obj:`"amin"`) + include_self (bool): whether elements from the :attr:`self` tensor are + included in the reduction + +Example:: + + >>> src = torch.tensor([1., 2., 3., 4., 5., 6.]) + >>> index = torch.tensor([0, 1, 0, 1, 2, 1]) + >>> input = torch.tensor([1., 2., 3., 4.]) + >>> input.scatter_reduce(0, index, src, reduce="sum") + tensor([5., 14., 8., 4.]) + >>> input.scatter_reduce(0, index, src, reduce="sum", include_self=False) + tensor([4., 12., 5., 4.]) + >>> input2 = torch.tensor([5., 4., 3., 2.]) + >>> input2.scatter_reduce(0, index, src, reduce="amax") + tensor([5., 6., 5., 2.]) + >>> input2.scatter_reduce(0, index, src, reduce="amax", include_self=False) + tensor([3., 6., 5., 2.]) + + +""".format( + **reproducibility_notes + ), +) + +add_docstr_all( + "select", + r""" +select(dim, index) -> Tensor + +See :func:`torch.select` +""", +) + +add_docstr_all( + "select_scatter", + r""" +select_scatter(src, dim, index) -> Tensor + +See :func:`torch.select_scatter` +""", +) + +add_docstr_all( + "slice_scatter", + r""" +slice_scatter(src, dim=0, start=None, end=None, step=1) -> Tensor + +See :func:`torch.slice_scatter` +""", +) + +add_docstr_all( + "set_", + r""" +set_(source=None, storage_offset=0, size=None, stride=None) -> Tensor + +Sets the underlying storage, size, and strides. If :attr:`source` is a tensor, +:attr:`self` tensor will share the same storage and have the same size and +strides as :attr:`source`. Changes to elements in one tensor will be reflected +in the other. + +If :attr:`source` is a :class:`~torch.Storage`, the method sets the underlying +storage, offset, size, and stride. + +Args: + source (Tensor or Storage): the tensor or storage to use + storage_offset (int, optional): the offset in the storage + size (torch.Size, optional): the desired size. Defaults to the size of the source. + stride (tuple, optional): the desired stride. Defaults to C-contiguous strides. 
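+
+Example (a brief sketch of the tensor-source overload; values are
+illustrative only)::
+
+    >>> src = torch.arange(6.).reshape(2, 3)
+    >>> t = torch.tensor([])
+    >>> t.set_(src)
+    tensor([[0., 1., 2.],
+            [3., 4., 5.]])
+    >>> t[0, 0] = 100.
+    >>> src[0, 0]
+    tensor(100.)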
+""", +) + +add_docstr_all( + "sigmoid", + r""" +sigmoid() -> Tensor + +See :func:`torch.sigmoid` +""", +) + +add_docstr_all( + "sigmoid_", + r""" +sigmoid_() -> Tensor + +In-place version of :meth:`~Tensor.sigmoid` +""", +) + +add_docstr_all( + "logit", + r""" +logit() -> Tensor + +See :func:`torch.logit` +""", +) + +add_docstr_all( + "logit_", + r""" +logit_() -> Tensor + +In-place version of :meth:`~Tensor.logit` +""", +) + +add_docstr_all( + "sign", + r""" +sign() -> Tensor + +See :func:`torch.sign` +""", +) + +add_docstr_all( + "sign_", + r""" +sign_() -> Tensor + +In-place version of :meth:`~Tensor.sign` +""", +) + +add_docstr_all( + "signbit", + r""" +signbit() -> Tensor + +See :func:`torch.signbit` +""", +) + +add_docstr_all( + "sgn", + r""" +sgn() -> Tensor + +See :func:`torch.sgn` +""", +) + +add_docstr_all( + "sgn_", + r""" +sgn_() -> Tensor + +In-place version of :meth:`~Tensor.sgn` +""", +) + +add_docstr_all( + "sin", + r""" +sin() -> Tensor + +See :func:`torch.sin` +""", +) + +add_docstr_all( + "sin_", + r""" +sin_() -> Tensor + +In-place version of :meth:`~Tensor.sin` +""", +) + +add_docstr_all( + "sinc", + r""" +sinc() -> Tensor + +See :func:`torch.sinc` +""", +) + +add_docstr_all( + "sinc_", + r""" +sinc_() -> Tensor + +In-place version of :meth:`~Tensor.sinc` +""", +) + +add_docstr_all( + "sinh", + r""" +sinh() -> Tensor + +See :func:`torch.sinh` +""", +) + +add_docstr_all( + "sinh_", + r""" +sinh_() -> Tensor + +In-place version of :meth:`~Tensor.sinh` +""", +) + +add_docstr_all( + "size", + r""" +size(dim=None) -> torch.Size or int + +Returns the size of the :attr:`self` tensor. If ``dim`` is not specified, +the returned value is a :class:`torch.Size`, a subclass of :class:`tuple`. +If ``dim`` is specified, returns an int holding the size of that dimension. + +Args: + dim (int, optional): The dimension for which to retrieve the size. + +Example:: + + >>> t = torch.empty(3, 4, 5) + >>> t.size() + torch.Size([3, 4, 5]) + >>> t.size(dim=1) + 4 + +""", +) + +add_docstr_all( + "shape", + r""" +shape() -> torch.Size + +Returns the size of the :attr:`self` tensor. Alias for :attr:`size`. + +See also :meth:`Tensor.size`. + +Example:: + + >>> t = torch.empty(3, 4, 5) + >>> t.size() + torch.Size([3, 4, 5]) + >>> t.shape + torch.Size([3, 4, 5]) + +""", +) + +add_docstr_all( + "sort", + r""" +sort(dim=-1, descending=False) -> (Tensor, LongTensor) + +See :func:`torch.sort` +""", +) + +add_docstr_all( + "msort", + r""" +msort() -> Tensor + +See :func:`torch.msort` +""", +) + +add_docstr_all( + "argsort", + r""" +argsort(dim=-1, descending=False) -> LongTensor + +See :func:`torch.argsort` +""", +) + +add_docstr_all( + "sparse_dim", + r""" +sparse_dim() -> int + +Return the number of sparse dimensions in a :ref:`sparse tensor ` :attr:`self`. + +.. note:: + Returns ``0`` if :attr:`self` is not a sparse tensor. + +See also :meth:`Tensor.dense_dim` and :ref:`hybrid tensors `. +""", +) + +add_docstr_all( + "sparse_resize_", + r""" +sparse_resize_(size, sparse_dim, dense_dim) -> Tensor + +Resizes :attr:`self` :ref:`sparse tensor ` to the desired +size and the number of sparse and dense dimensions. + +.. note:: + If the number of specified elements in :attr:`self` is zero, then + :attr:`size`, :attr:`sparse_dim`, and :attr:`dense_dim` can be any + size and positive integers such that ``len(size) == sparse_dim + + dense_dim``. 
+ + If :attr:`self` specifies one or more elements, however, then each + dimension in :attr:`size` must not be smaller than the corresponding + dimension of :attr:`self`, :attr:`sparse_dim` must equal the number + of sparse dimensions in :attr:`self`, and :attr:`dense_dim` must + equal the number of dense dimensions in :attr:`self`. + +.. warning:: + Throws an error if :attr:`self` is not a sparse tensor. + +Args: + size (torch.Size): the desired size. If :attr:`self` is non-empty + sparse tensor, the desired size cannot be smaller than the + original size. + sparse_dim (int): the number of sparse dimensions + dense_dim (int): the number of dense dimensions +""", +) + +add_docstr_all( + "sparse_resize_and_clear_", + r""" +sparse_resize_and_clear_(size, sparse_dim, dense_dim) -> Tensor + +Removes all specified elements from a :ref:`sparse tensor +` :attr:`self` and resizes :attr:`self` to the desired +size and the number of sparse and dense dimensions. + +.. warning: + Throws an error if :attr:`self` is not a sparse tensor. + +Args: + size (torch.Size): the desired size. + sparse_dim (int): the number of sparse dimensions + dense_dim (int): the number of dense dimensions +""", +) + +add_docstr_all( + "sqrt", + r""" +sqrt() -> Tensor + +See :func:`torch.sqrt` +""", +) + +add_docstr_all( + "sqrt_", + r""" +sqrt_() -> Tensor + +In-place version of :meth:`~Tensor.sqrt` +""", +) + +add_docstr_all( + "square", + r""" +square() -> Tensor + +See :func:`torch.square` +""", +) + +add_docstr_all( + "square_", + r""" +square_() -> Tensor + +In-place version of :meth:`~Tensor.square` +""", +) + +add_docstr_all( + "squeeze", + r""" +squeeze(dim=None) -> Tensor + +See :func:`torch.squeeze` +""", +) + +add_docstr_all( + "squeeze_", + r""" +squeeze_(dim=None) -> Tensor + +In-place version of :meth:`~Tensor.squeeze` +""", +) + +add_docstr_all( + "std", + r""" +std(dim=None, *, correction=1, keepdim=False) -> Tensor + +See :func:`torch.std` +""", +) + +add_docstr_all( + "storage_offset", + r""" +storage_offset() -> int + +Returns :attr:`self` tensor's offset in the underlying storage in terms of +number of storage elements (not bytes). + +Example:: + + >>> x = torch.tensor([1, 2, 3, 4, 5]) + >>> x.storage_offset() + 0 + >>> x[3:].storage_offset() + 3 + +""", +) + +add_docstr_all( + "untyped_storage", + r""" +untyped_storage() -> torch.UntypedStorage + +Returns the underlying :class:`UntypedStorage`. +""", +) + +add_docstr_all( + "stride", + r""" +stride(dim) -> tuple or int + +Returns the stride of :attr:`self` tensor. + +Stride is the jump necessary to go from one element to the next one in the +specified dimension :attr:`dim`. A tuple of all strides is returned when no +argument is passed in. Otherwise, an integer value is returned as the stride in +the particular dimension :attr:`dim`. + +Args: + dim (int, optional): the desired dimension in which stride is required + +Example:: + + >>> x = torch.tensor([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]) + >>> x.stride() + (5, 1) + >>> x.stride(0) + 5 + >>> x.stride(-1) + 1 + +""", +) + +add_docstr_all( + "sub", + r""" +sub(other, *, alpha=1) -> Tensor + +See :func:`torch.sub`. +""", +) + +add_docstr_all( + "sub_", + r""" +sub_(other, *, alpha=1) -> Tensor + +In-place version of :meth:`~Tensor.sub` +""", +) + +add_docstr_all( + "subtract", + r""" +subtract(other, *, alpha=1) -> Tensor + +See :func:`torch.subtract`. +""", +) + +add_docstr_all( + "subtract_", + r""" +subtract_(other, *, alpha=1) -> Tensor + +In-place version of :meth:`~Tensor.subtract`. 
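+
+A brief illustration of the in-place update and the ``alpha`` scaling
+(values are illustrative only)::
+
+    >>> a = torch.tensor([10., 10.])
+    >>> a.subtract_(torch.tensor([1., 2.]), alpha=2)
+    tensor([8., 6.])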
+""", +) + +add_docstr_all( + "sum", + r""" +sum(dim=None, keepdim=False, dtype=None) -> Tensor + +See :func:`torch.sum` +""", +) + +add_docstr_all( + "nansum", + r""" +nansum(dim=None, keepdim=False, dtype=None) -> Tensor + +See :func:`torch.nansum` +""", +) + +add_docstr_all( + "svd", + r""" +svd(some=True, compute_uv=True) -> (Tensor, Tensor, Tensor) + +See :func:`torch.svd` +""", +) + +add_docstr_all( + "swapdims", + r""" +swapdims(dim0, dim1) -> Tensor + +See :func:`torch.swapdims` +""", +) + +add_docstr_all( + "swapdims_", + r""" +swapdims_(dim0, dim1) -> Tensor + +In-place version of :meth:`~Tensor.swapdims` +""", +) + +add_docstr_all( + "swapaxes", + r""" +swapaxes(axis0, axis1) -> Tensor + +See :func:`torch.swapaxes` +""", +) + +add_docstr_all( + "swapaxes_", + r""" +swapaxes_(axis0, axis1) -> Tensor + +In-place version of :meth:`~Tensor.swapaxes` +""", +) + +add_docstr_all( + "t", + r""" +t() -> Tensor + +See :func:`torch.t` +""", +) + +add_docstr_all( + "t_", + r""" +t_() -> Tensor + +In-place version of :meth:`~Tensor.t` +""", +) + +add_docstr_all( + "tile", + r""" +tile(dims) -> Tensor + +See :func:`torch.tile` +""", +) + +add_docstr_all( + "to", + r""" +to(*args, **kwargs) -> Tensor + +Performs Tensor dtype and/or device conversion. A :class:`torch.dtype` and :class:`torch.device` are +inferred from the arguments of ``self.to(*args, **kwargs)``. + +.. note:: + + If the ``self`` Tensor already + has the correct :class:`torch.dtype` and :class:`torch.device`, then ``self`` is returned. + Otherwise, the returned tensor is a copy of ``self`` with the desired + :class:`torch.dtype` and :class:`torch.device`. + +Here are the ways to call ``to``: + +.. method:: to(dtype, non_blocking=False, copy=False, memory_format=torch.preserve_format) -> Tensor + :noindex: + + Returns a Tensor with the specified :attr:`dtype` + + Args: + {memory_format} + +.. method:: to(device=None, dtype=None, non_blocking=False, copy=False, memory_format=torch.preserve_format) -> Tensor + :noindex: + + Returns a Tensor with the specified :attr:`device` and (optional) + :attr:`dtype`. If :attr:`dtype` is ``None`` it is inferred to be ``self.dtype``. + When :attr:`non_blocking`, tries to convert asynchronously with respect to + the host if possible, e.g., converting a CPU Tensor with pinned memory to a + CUDA Tensor. + When :attr:`copy` is set, a new Tensor is created even when the Tensor + already matches the desired conversion. + + Args: + {memory_format} + +.. method:: to(other, non_blocking=False, copy=False) -> Tensor + :noindex: + + Returns a Tensor with same :class:`torch.dtype` and :class:`torch.device` as + the Tensor :attr:`other`. When :attr:`non_blocking`, tries to convert + asynchronously with respect to the host if possible, e.g., converting a CPU + Tensor with pinned memory to a CUDA Tensor. + When :attr:`copy` is set, a new Tensor is created even when the Tensor + already matches the desired conversion. 
+ +Example:: + + >>> tensor = torch.randn(2, 2) # Initially dtype=float32, device=cpu + >>> tensor.to(torch.float64) + tensor([[-0.5044, 0.0005], + [ 0.3310, -0.0584]], dtype=torch.float64) + + >>> cuda0 = torch.device('cuda:0') + >>> tensor.to(cuda0) + tensor([[-0.5044, 0.0005], + [ 0.3310, -0.0584]], device='cuda:0') + + >>> tensor.to(cuda0, dtype=torch.float64) + tensor([[-0.5044, 0.0005], + [ 0.3310, -0.0584]], dtype=torch.float64, device='cuda:0') + + >>> other = torch.randn((), dtype=torch.float64, device=cuda0) + >>> tensor.to(other, non_blocking=True) + tensor([[-0.5044, 0.0005], + [ 0.3310, -0.0584]], dtype=torch.float64, device='cuda:0') +""".format( + **common_args + ), +) + +add_docstr_all( + "byte", + r""" +byte(memory_format=torch.preserve_format) -> Tensor + +``self.byte()`` is equivalent to ``self.to(torch.uint8)``. See :func:`to`. + +Args: + {memory_format} +""".format( + **common_args + ), +) + +add_docstr_all( + "bool", + r""" +bool(memory_format=torch.preserve_format) -> Tensor + +``self.bool()`` is equivalent to ``self.to(torch.bool)``. See :func:`to`. + +Args: + {memory_format} +""".format( + **common_args + ), +) + +add_docstr_all( + "char", + r""" +char(memory_format=torch.preserve_format) -> Tensor + +``self.char()`` is equivalent to ``self.to(torch.int8)``. See :func:`to`. + +Args: + {memory_format} +""".format( + **common_args + ), +) + +add_docstr_all( + "bfloat16", + r""" +bfloat16(memory_format=torch.preserve_format) -> Tensor +``self.bfloat16()`` is equivalent to ``self.to(torch.bfloat16)``. See :func:`to`. + +Args: + {memory_format} +""".format( + **common_args + ), +) + +add_docstr_all( + "double", + r""" +double(memory_format=torch.preserve_format) -> Tensor + +``self.double()`` is equivalent to ``self.to(torch.float64)``. See :func:`to`. + +Args: + {memory_format} +""".format( + **common_args + ), +) + +add_docstr_all( + "float", + r""" +float(memory_format=torch.preserve_format) -> Tensor + +``self.float()`` is equivalent to ``self.to(torch.float32)``. See :func:`to`. + +Args: + {memory_format} +""".format( + **common_args + ), +) + +add_docstr_all( + "cdouble", + r""" +cdouble(memory_format=torch.preserve_format) -> Tensor + +``self.cdouble()`` is equivalent to ``self.to(torch.complex128)``. See :func:`to`. + +Args: + {memory_format} +""".format( + **common_args + ), +) + +add_docstr_all( + "cfloat", + r""" +cfloat(memory_format=torch.preserve_format) -> Tensor + +``self.cfloat()`` is equivalent to ``self.to(torch.complex64)``. See :func:`to`. + +Args: + {memory_format} +""".format( + **common_args + ), +) + +add_docstr_all( + "chalf", + r""" +chalf(memory_format=torch.preserve_format) -> Tensor + +``self.chalf()`` is equivalent to ``self.to(torch.complex32)``. See :func:`to`. + +Args: + {memory_format} + """.format( + **common_args + ), +) + +add_docstr_all( + "half", + r""" +half(memory_format=torch.preserve_format) -> Tensor + +``self.half()`` is equivalent to ``self.to(torch.float16)``. See :func:`to`. + +Args: + {memory_format} +""".format( + **common_args + ), +) + +add_docstr_all( + "int", + r""" +int(memory_format=torch.preserve_format) -> Tensor + +``self.int()`` is equivalent to ``self.to(torch.int32)``. See :func:`to`. + +Args: + {memory_format} +""".format( + **common_args + ), +) + +add_docstr_all( + "int_repr", + r""" +int_repr() -> Tensor + +Given a quantized Tensor, +``self.int_repr()`` returns a CPU Tensor with uint8_t as data type that stores the +underlying uint8_t values of the given Tensor. 
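+
+Example (a minimal sketch; the scale and zero_point below are chosen only
+for illustration)::
+
+    >>> x = torch.quantize_per_tensor(
+    ...     torch.tensor([1., 2., 3.]), scale=1.0, zero_point=0, dtype=torch.quint8)
+    >>> x.int_repr()
+    tensor([1, 2, 3], dtype=torch.uint8)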
+""", +) + + +add_docstr_all( + "long", + r""" +long(memory_format=torch.preserve_format) -> Tensor + +``self.long()`` is equivalent to ``self.to(torch.int64)``. See :func:`to`. + +Args: + {memory_format} +""".format( + **common_args + ), +) + +add_docstr_all( + "short", + r""" +short(memory_format=torch.preserve_format) -> Tensor + +``self.short()`` is equivalent to ``self.to(torch.int16)``. See :func:`to`. + +Args: + {memory_format} +""".format( + **common_args + ), +) + +add_docstr_all( + "take", + r""" +take(indices) -> Tensor + +See :func:`torch.take` +""", +) + +add_docstr_all( + "take_along_dim", + r""" +take_along_dim(indices, dim) -> Tensor + +See :func:`torch.take_along_dim` +""", +) + +add_docstr_all( + "tan", + r""" +tan() -> Tensor + +See :func:`torch.tan` +""", +) + +add_docstr_all( + "tan_", + r""" +tan_() -> Tensor + +In-place version of :meth:`~Tensor.tan` +""", +) + +add_docstr_all( + "tanh", + r""" +tanh() -> Tensor + +See :func:`torch.tanh` +""", +) + +add_docstr_all( + "softmax", + r""" +softmax(dim) -> Tensor + +Alias for :func:`torch.nn.functional.softmax`. +""", +) + +add_docstr_all( + "tanh_", + r""" +tanh_() -> Tensor + +In-place version of :meth:`~Tensor.tanh` +""", +) + +add_docstr_all( + "tolist", + r""" +tolist() -> list or number + +Returns the tensor as a (nested) list. For scalars, a standard +Python number is returned, just like with :meth:`~Tensor.item`. +Tensors are automatically moved to the CPU first if necessary. + +This operation is not differentiable. + +Examples:: + + >>> a = torch.randn(2, 2) + >>> a.tolist() + [[0.012766935862600803, 0.5415473580360413], + [-0.08909505605697632, 0.7729271650314331]] + >>> a[0,0].tolist() + 0.012766935862600803 +""", +) + +add_docstr_all( + "topk", + r""" +topk(k, dim=None, largest=True, sorted=True) -> (Tensor, LongTensor) + +See :func:`torch.topk` +""", +) + +add_docstr_all( + "to_dense", + r""" +to_dense(dtype=None, *, masked_grad=True) -> Tensor + +Creates a strided copy of :attr:`self` if :attr:`self` is not a strided tensor, otherwise returns :attr:`self`. + +Keyword args: + {dtype} + masked_grad (bool, optional): If set to ``True`` (default) and + :attr:`self` has a sparse layout then the backward of + :meth:`to_dense` returns ``grad.sparse_mask(self)``. + +Example:: + + >>> s = torch.sparse_coo_tensor( + ... torch.tensor([[1, 1], + ... [0, 2]]), + ... torch.tensor([9, 10]), + ... size=(3, 3)) + >>> s.to_dense() + tensor([[ 0, 0, 0], + [ 9, 0, 10], + [ 0, 0, 0]]) +""", +) + +add_docstr_all( + "to_sparse", + r""" +to_sparse(sparseDims) -> Tensor + +Returns a sparse copy of the tensor. PyTorch supports sparse tensors in +:ref:`coordinate format `. + +Args: + sparseDims (int, optional): the number of sparse dimensions to include in the new sparse tensor + +Example:: + + >>> d = torch.tensor([[0, 0, 0], [9, 0, 10], [0, 0, 0]]) + >>> d + tensor([[ 0, 0, 0], + [ 9, 0, 10], + [ 0, 0, 0]]) + >>> d.to_sparse() + tensor(indices=tensor([[1, 1], + [0, 2]]), + values=tensor([ 9, 10]), + size=(3, 3), nnz=2, layout=torch.sparse_coo) + >>> d.to_sparse(1) + tensor(indices=tensor([[1]]), + values=tensor([[ 9, 0, 10]]), + size=(3, 3), nnz=1, layout=torch.sparse_coo) + +.. method:: to_sparse(*, layout=None, blocksize=None, dense_dim=None) -> Tensor + :noindex: + +Returns a sparse tensor with the specified layout and blocksize. 
If +the :attr:`self` is strided, the number of dense dimensions could be +specified, and a hybrid sparse tensor will be created, with +`dense_dim` dense dimensions and `self.dim() - 2 - dense_dim` batch +dimension. + +.. note:: If the :attr:`self` layout and blocksize parameters match + with the specified layout and blocksize, return + :attr:`self`. Otherwise, return a sparse tensor copy of + :attr:`self`. + +Args: + + layout (:class:`torch.layout`, optional): The desired sparse + layout. One of ``torch.sparse_coo``, ``torch.sparse_csr``, + ``torch.sparse_csc``, ``torch.sparse_bsr``, or + ``torch.sparse_bsc``. Default: if ``None``, + ``torch.sparse_coo``. + + blocksize (list, tuple, :class:`torch.Size`, optional): Block size + of the resulting BSR or BSC tensor. For other layouts, + specifying the block size that is not ``None`` will result in a + RuntimeError exception. A block size must be a tuple of length + two such that its items evenly divide the two sparse dimensions. + + dense_dim (int, optional): Number of dense dimensions of the + resulting CSR, CSC, BSR or BSC tensor. This argument should be + used only if :attr:`self` is a strided tensor, and must be a + value between 0 and dimension of :attr:`self` tensor minus two. + +Example:: + + >>> x = torch.tensor([[1, 0], [0, 0], [2, 3]]) + >>> x.to_sparse(layout=torch.sparse_coo) + tensor(indices=tensor([[0, 2, 2], + [0, 0, 1]]), + values=tensor([1, 2, 3]), + size=(3, 2), nnz=3, layout=torch.sparse_coo) + >>> x.to_sparse(layout=torch.sparse_bsr, blocksize=(1, 2)) + tensor(crow_indices=tensor([0, 1, 1, 2]), + col_indices=tensor([0, 0]), + values=tensor([[[1, 0]], + [[2, 3]]]), size=(3, 2), nnz=2, layout=torch.sparse_bsr) + >>> x.to_sparse(layout=torch.sparse_bsr, blocksize=(2, 1)) + RuntimeError: Tensor size(-2) 3 needs to be divisible by blocksize[0] 2 + >>> x.to_sparse(layout=torch.sparse_csr, blocksize=(3, 1)) + RuntimeError: to_sparse for Strided to SparseCsr conversion does not use specified blocksize + + >>> x = torch.tensor([[[1], [0]], [[0], [0]], [[2], [3]]]) + >>> x.to_sparse(layout=torch.sparse_csr, dense_dim=1) + tensor(crow_indices=tensor([0, 1, 1, 3]), + col_indices=tensor([0, 0, 1]), + values=tensor([[1], + [2], + [3]]), size=(3, 2, 1), nnz=3, layout=torch.sparse_csr) + +""", +) + +add_docstr_all( + "to_sparse_csr", + r""" +to_sparse_csr(dense_dim=None) -> Tensor + +Convert a tensor to compressed row storage format (CSR). Except for +strided tensors, only works with 2D tensors. If the :attr:`self` is +strided, then the number of dense dimensions could be specified, and a +hybrid CSR tensor will be created, with `dense_dim` dense dimensions +and `self.dim() - 2 - dense_dim` batch dimension. + +Args: + + dense_dim (int, optional): Number of dense dimensions of the + resulting CSR tensor. This argument should be used only if + :attr:`self` is a strided tensor, and must be a value between 0 + and dimension of :attr:`self` tensor minus two. + +Example:: + + >>> dense = torch.randn(5, 5) + >>> sparse = dense.to_sparse_csr() + >>> sparse._nnz() + 25 + + >>> dense = torch.zeros(3, 3, 1, 1) + >>> dense[0, 0] = dense[1, 2] = dense[2, 1] = 1 + >>> dense.to_sparse_csr(dense_dim=2) + tensor(crow_indices=tensor([0, 1, 2, 3]), + col_indices=tensor([0, 2, 1]), + values=tensor([[[1.]], + + [[1.]], + + [[1.]]]), size=(3, 3, 1, 1), nnz=3, + layout=torch.sparse_csr) + +""", +) + +add_docstr_all( + "to_sparse_csc", + r""" +to_sparse_csc() -> Tensor + +Convert a tensor to compressed column storage (CSC) format. 
Except +for strided tensors, only works with 2D tensors. If the :attr:`self` +is strided, then the number of dense dimensions could be specified, +and a hybrid CSC tensor will be created, with `dense_dim` dense +dimensions and `self.dim() - 2 - dense_dim` batch dimension. + +Args: + + dense_dim (int, optional): Number of dense dimensions of the + resulting CSC tensor. This argument should be used only if + :attr:`self` is a strided tensor, and must be a value between 0 + and dimension of :attr:`self` tensor minus two. + +Example:: + + >>> dense = torch.randn(5, 5) + >>> sparse = dense.to_sparse_csc() + >>> sparse._nnz() + 25 + + >>> dense = torch.zeros(3, 3, 1, 1) + >>> dense[0, 0] = dense[1, 2] = dense[2, 1] = 1 + >>> dense.to_sparse_csc(dense_dim=2) + tensor(ccol_indices=tensor([0, 1, 2, 3]), + row_indices=tensor([0, 2, 1]), + values=tensor([[[1.]], + + [[1.]], + + [[1.]]]), size=(3, 3, 1, 1), nnz=3, + layout=torch.sparse_csc) + +""", +) + +add_docstr_all( + "to_sparse_bsr", + r""" +to_sparse_bsr(blocksize, dense_dim) -> Tensor + +Convert a tensor to a block sparse row (BSR) storage format of given +blocksize. If the :attr:`self` is strided, then the number of dense +dimensions could be specified, and a hybrid BSR tensor will be +created, with `dense_dim` dense dimensions and `self.dim() - 2 - +dense_dim` batch dimension. + +Args: + + blocksize (list, tuple, :class:`torch.Size`, optional): Block size + of the resulting BSR tensor. A block size must be a tuple of + length two such that its items evenly divide the two sparse + dimensions. + + dense_dim (int, optional): Number of dense dimensions of the + resulting BSR tensor. This argument should be used only if + :attr:`self` is a strided tensor, and must be a value between 0 + and dimension of :attr:`self` tensor minus two. + +Example:: + + >>> dense = torch.randn(10, 10) + >>> sparse = dense.to_sparse_csr() + >>> sparse_bsr = sparse.to_sparse_bsr((5, 5)) + >>> sparse_bsr.col_indices() + tensor([0, 1, 0, 1]) + + >>> dense = torch.zeros(4, 3, 1) + >>> dense[0:2, 0] = dense[0:2, 2] = dense[2:4, 1] = 1 + >>> dense.to_sparse_bsr((2, 1), 1) + tensor(crow_indices=tensor([0, 2, 3]), + col_indices=tensor([0, 2, 1]), + values=tensor([[[[1.]], + + [[1.]]], + + + [[[1.]], + + [[1.]]], + + + [[[1.]], + + [[1.]]]]), size=(4, 3, 1), nnz=3, + layout=torch.sparse_bsr) + +""", +) + +add_docstr_all( + "to_sparse_bsc", + r""" +to_sparse_bsc(blocksize, dense_dim) -> Tensor + +Convert a tensor to a block sparse column (BSC) storage format of +given blocksize. If the :attr:`self` is strided, then the number of +dense dimensions could be specified, and a hybrid BSC tensor will be +created, with `dense_dim` dense dimensions and `self.dim() - 2 - +dense_dim` batch dimension. + +Args: + + blocksize (list, tuple, :class:`torch.Size`, optional): Block size + of the resulting BSC tensor. A block size must be a tuple of + length two such that its items evenly divide the two sparse + dimensions. + + dense_dim (int, optional): Number of dense dimensions of the + resulting BSC tensor. This argument should be used only if + :attr:`self` is a strided tensor, and must be a value between 0 + and dimension of :attr:`self` tensor minus two. 
+ +Example:: + + >>> dense = torch.randn(10, 10) + >>> sparse = dense.to_sparse_csr() + >>> sparse_bsc = sparse.to_sparse_bsc((5, 5)) + >>> sparse_bsc.row_indices() + tensor([0, 1, 0, 1]) + + >>> dense = torch.zeros(4, 3, 1) + >>> dense[0:2, 0] = dense[0:2, 2] = dense[2:4, 1] = 1 + >>> dense.to_sparse_bsc((2, 1), 1) + tensor(ccol_indices=tensor([0, 1, 2, 3]), + row_indices=tensor([0, 1, 0]), + values=tensor([[[[1.]], + + [[1.]]], + + + [[[1.]], + + [[1.]]], + + + [[[1.]], + + [[1.]]]]), size=(4, 3, 1), nnz=3, + layout=torch.sparse_bsc) + +""", +) + +add_docstr_all( + "to_mkldnn", + r""" +to_mkldnn() -> Tensor +Returns a copy of the tensor in ``torch.mkldnn`` layout. + +""", +) + +add_docstr_all( + "trace", + r""" +trace() -> Tensor + +See :func:`torch.trace` +""", +) + +add_docstr_all( + "transpose", + r""" +transpose(dim0, dim1) -> Tensor + +See :func:`torch.transpose` +""", +) + +add_docstr_all( + "transpose_", + r""" +transpose_(dim0, dim1) -> Tensor + +In-place version of :meth:`~Tensor.transpose` +""", +) + +add_docstr_all( + "triangular_solve", + r""" +triangular_solve(A, upper=True, transpose=False, unitriangular=False) -> (Tensor, Tensor) + +See :func:`torch.triangular_solve` +""", +) + +add_docstr_all( + "tril", + r""" +tril(diagonal=0) -> Tensor + +See :func:`torch.tril` +""", +) + +add_docstr_all( + "tril_", + r""" +tril_(diagonal=0) -> Tensor + +In-place version of :meth:`~Tensor.tril` +""", +) + +add_docstr_all( + "triu", + r""" +triu(diagonal=0) -> Tensor + +See :func:`torch.triu` +""", +) + +add_docstr_all( + "triu_", + r""" +triu_(diagonal=0) -> Tensor + +In-place version of :meth:`~Tensor.triu` +""", +) + +add_docstr_all( + "true_divide", + r""" +true_divide(value) -> Tensor + +See :func:`torch.true_divide` +""", +) + +add_docstr_all( + "true_divide_", + r""" +true_divide_(value) -> Tensor + +In-place version of :meth:`~Tensor.true_divide_` +""", +) + +add_docstr_all( + "trunc", + r""" +trunc() -> Tensor + +See :func:`torch.trunc` +""", +) + +add_docstr_all( + "fix", + r""" +fix() -> Tensor + +See :func:`torch.fix`. +""", +) + +add_docstr_all( + "trunc_", + r""" +trunc_() -> Tensor + +In-place version of :meth:`~Tensor.trunc` +""", +) + +add_docstr_all( + "fix_", + r""" +fix_() -> Tensor + +In-place version of :meth:`~Tensor.fix` +""", +) + +add_docstr_all( + "type", + r""" +type(dtype=None, non_blocking=False, **kwargs) -> str or Tensor +Returns the type if `dtype` is not provided, else casts this object to +the specified type. + +If this is already of the correct type, no copy is performed and the +original object is returned. + +Args: + dtype (dtype or string): The desired type + non_blocking (bool): If ``True``, and the source is in pinned memory + and destination is on the GPU or vice versa, the copy is performed + asynchronously with respect to the host. Otherwise, the argument + has no effect. + **kwargs: For compatibility, may contain the key ``async`` in place of + the ``non_blocking`` argument. The ``async`` arg is deprecated. +""", +) + +add_docstr_all( + "type_as", + r""" +type_as(tensor) -> Tensor + +Returns this tensor cast to the type of the given tensor. + +This is a no-op if the tensor is already of the correct type. 
This is +equivalent to ``self.type(tensor.type())`` + +Args: + tensor (Tensor): the tensor which has the desired type +""", +) + +add_docstr_all( + "unfold", + r""" +unfold(dimension, size, step) -> Tensor + +Returns a view of the original tensor which contains all slices of size :attr:`size` from +:attr:`self` tensor in the dimension :attr:`dimension`. + +Step between two slices is given by :attr:`step`. + +If `sizedim` is the size of dimension :attr:`dimension` for :attr:`self`, the size of +dimension :attr:`dimension` in the returned tensor will be +`(sizedim - size) / step + 1`. + +An additional dimension of size :attr:`size` is appended in the returned tensor. + +Args: + dimension (int): dimension in which unfolding happens + size (int): the size of each slice that is unfolded + step (int): the step between each slice + +Example:: + + >>> x = torch.arange(1., 8) + >>> x + tensor([ 1., 2., 3., 4., 5., 6., 7.]) + >>> x.unfold(0, 2, 1) + tensor([[ 1., 2.], + [ 2., 3.], + [ 3., 4.], + [ 4., 5.], + [ 5., 6.], + [ 6., 7.]]) + >>> x.unfold(0, 2, 2) + tensor([[ 1., 2.], + [ 3., 4.], + [ 5., 6.]]) +""", +) + +add_docstr_all( + "uniform_", + r""" +uniform_(from=0, to=1, *, generator=None) -> Tensor + +Fills :attr:`self` tensor with numbers sampled from the continuous uniform +distribution: + +.. math:: + f(x) = \dfrac{1}{\text{to} - \text{from}} +""", +) + +add_docstr_all( + "unsqueeze", + r""" +unsqueeze(dim) -> Tensor + +See :func:`torch.unsqueeze` +""", +) + +add_docstr_all( + "unsqueeze_", + r""" +unsqueeze_(dim) -> Tensor + +In-place version of :meth:`~Tensor.unsqueeze` +""", +) + +add_docstr_all( + "var", + r""" +var(dim=None, *, correction=1, keepdim=False) -> Tensor + +See :func:`torch.var` +""", +) + +add_docstr_all( + "vdot", + r""" +vdot(other) -> Tensor + +See :func:`torch.vdot` +""", +) + +add_docstr_all( + "view", + r""" +view(*shape) -> Tensor + +Returns a new tensor with the same data as the :attr:`self` tensor but of a +different :attr:`shape`. + +The returned tensor shares the same data and must have the same number +of elements, but may have a different size. For a tensor to be viewed, the new +view size must be compatible with its original size and stride, i.e., each new +view dimension must either be a subspace of an original dimension, or only span +across original dimensions :math:`d, d+1, \dots, d+k` that satisfy the following +contiguity-like condition that :math:`\forall i = d, \dots, d+k-1`, + +.. math:: + + \text{stride}[i] = \text{stride}[i+1] \times \text{size}[i+1] + +Otherwise, it will not be possible to view :attr:`self` tensor as :attr:`shape` +without copying it (e.g., via :meth:`contiguous`). When it is unclear whether a +:meth:`view` can be performed, it is advisable to use :meth:`reshape`, which +returns a view if the shapes are compatible, and copies (equivalent to calling +:meth:`contiguous`) otherwise. + +Args: + shape (torch.Size or int...): the desired size + +Example:: + + >>> x = torch.randn(4, 4) + >>> x.size() + torch.Size([4, 4]) + >>> y = x.view(16) + >>> y.size() + torch.Size([16]) + >>> z = x.view(-1, 8) # the size -1 is inferred from other dimensions + >>> z.size() + torch.Size([2, 8]) + + >>> a = torch.randn(1, 2, 3, 4) + >>> a.size() + torch.Size([1, 2, 3, 4]) + >>> b = a.transpose(1, 2) # Swaps 2nd and 3rd dimension + >>> b.size() + torch.Size([1, 3, 2, 4]) + >>> c = a.view(1, 3, 2, 4) # Does not change tensor layout in memory + >>> c.size() + torch.Size([1, 3, 2, 4]) + >>> torch.equal(b, c) + False + + +.. 
method:: view(dtype) -> Tensor + :noindex: + +Returns a new tensor with the same data as the :attr:`self` tensor but of a +different :attr:`dtype`. + +If the element size of :attr:`dtype` is different than that of ``self.dtype``, +then the size of the last dimension of the output will be scaled +proportionally. For instance, if :attr:`dtype` element size is twice that of +``self.dtype``, then each pair of elements in the last dimension of +:attr:`self` will be combined, and the size of the last dimension of the output +will be half that of :attr:`self`. If :attr:`dtype` element size is half that +of ``self.dtype``, then each element in the last dimension of :attr:`self` will +be split in two, and the size of the last dimension of the output will be +double that of :attr:`self`. For this to be possible, the following conditions +must be true: + + * ``self.dim()`` must be greater than 0. + * ``self.stride(-1)`` must be 1. + +Additionally, if the element size of :attr:`dtype` is greater than that of +``self.dtype``, the following conditions must be true as well: + + * ``self.size(-1)`` must be divisible by the ratio between the element + sizes of the dtypes. + * ``self.storage_offset()`` must be divisible by the ratio between the + element sizes of the dtypes. + * The strides of all dimensions, except the last dimension, must be + divisible by the ratio between the element sizes of the dtypes. + +If any of the above conditions are not met, an error is thrown. + +.. warning:: + + This overload is not supported by TorchScript, and using it in a Torchscript + program will cause undefined behavior. + + +Args: + dtype (:class:`torch.dtype`): the desired dtype + +Example:: + + >>> x = torch.randn(4, 4) + >>> x + tensor([[ 0.9482, -0.0310, 1.4999, -0.5316], + [-0.1520, 0.7472, 0.5617, -0.8649], + [-2.4724, -0.0334, -0.2976, -0.8499], + [-0.2109, 1.9913, -0.9607, -0.6123]]) + >>> x.dtype + torch.float32 + + >>> y = x.view(torch.int32) + >>> y + tensor([[ 1064483442, -1124191867, 1069546515, -1089989247], + [-1105482831, 1061112040, 1057999968, -1084397505], + [-1071760287, -1123489973, -1097310419, -1084649136], + [-1101533110, 1073668768, -1082790149, -1088634448]], + dtype=torch.int32) + >>> y[0, 0] = 1000000000 + >>> x + tensor([[ 0.0047, -0.0310, 1.4999, -0.5316], + [-0.1520, 0.7472, 0.5617, -0.8649], + [-2.4724, -0.0334, -0.2976, -0.8499], + [-0.2109, 1.9913, -0.9607, -0.6123]]) + + >>> x.view(torch.cfloat) + tensor([[ 0.0047-0.0310j, 1.4999-0.5316j], + [-0.1520+0.7472j, 0.5617-0.8649j], + [-2.4724-0.0334j, -0.2976-0.8499j], + [-0.2109+1.9913j, -0.9607-0.6123j]]) + >>> x.view(torch.cfloat).size() + torch.Size([4, 2]) + + >>> x.view(torch.uint8) + tensor([[ 0, 202, 154, 59, 182, 243, 253, 188, 185, 252, 191, 63, 240, 22, + 8, 191], + [227, 165, 27, 190, 128, 72, 63, 63, 146, 203, 15, 63, 22, 106, + 93, 191], + [205, 59, 30, 192, 112, 206, 8, 189, 7, 95, 152, 190, 12, 147, + 89, 191], + [ 43, 246, 87, 190, 235, 226, 254, 63, 111, 240, 117, 191, 177, 191, + 28, 191]], dtype=torch.uint8) + >>> x.view(torch.uint8).size() + torch.Size([4, 16]) +""", +) + +add_docstr_all( + "view_as", + r""" +view_as(other) -> Tensor + +View this tensor as the same size as :attr:`other`. +``self.view_as(other)`` is equivalent to ``self.view(other.size())``. + +Please see :meth:`~Tensor.view` for more information about ``view``. + +Args: + other (:class:`torch.Tensor`): The result tensor has the same size + as :attr:`other`. 
+
+""",
+)
+
+add_docstr_all(
+    "expand",
+    r"""
+expand(*sizes) -> Tensor
+
+Returns a new view of the :attr:`self` tensor with singleton dimensions expanded
+to a larger size.
+
+Passing -1 as the size for a dimension means not changing the size of
+that dimension.
+
+A tensor can also be expanded to a larger number of dimensions, and the
+new ones will be appended at the front. For the new dimensions, the
+size cannot be set to -1.
+
+Expanding a tensor does not allocate new memory, but only creates a
+new view on the existing tensor where a dimension of size one is
+expanded to a larger size by setting the ``stride`` to 0. Any dimension
+of size 1 can be expanded to an arbitrary value without allocating new
+memory.
+
+Args:
+    *sizes (torch.Size or int...): the desired expanded size
+
+.. warning::
+
+    More than one element of an expanded tensor may refer to a single
+    memory location. As a result, in-place operations (especially ones that
+    are vectorized) may result in incorrect behavior. If you need to write
+    to the tensors, please clone them first.
+
+Example::
+
+    >>> x = torch.tensor([[1], [2], [3]])
+    >>> x.size()
+    torch.Size([3, 1])
+    >>> x.expand(3, 4)
+    tensor([[ 1,  1,  1,  1],
+            [ 2,  2,  2,  2],
+            [ 3,  3,  3,  3]])
+    >>> x.expand(-1, 4)   # -1 means not changing the size of that dimension
+    tensor([[ 1,  1,  1,  1],
+            [ 2,  2,  2,  2],
+            [ 3,  3,  3,  3]])
+""",
+)
+
+add_docstr_all(
+    "expand_as",
+    r"""
+expand_as(other) -> Tensor
+
+Expand this tensor to the same size as :attr:`other`.
+``self.expand_as(other)`` is equivalent to ``self.expand(other.size())``.
+
+Please see :meth:`~Tensor.expand` for more information about ``expand``.
+
+Args:
+    other (:class:`torch.Tensor`): The result tensor has the same size
+        as :attr:`other`.
+""",
+)
+
+add_docstr_all(
+    "sum_to_size",
+    r"""
+sum_to_size(*size) -> Tensor
+
+Sum ``this`` tensor to :attr:`size`.
+:attr:`size` must be broadcastable to ``this`` tensor size.
+
+Args:
+    size (int...): a sequence of integers defining the shape of the output tensor.
+""",
+)
+
+
+add_docstr_all(
+    "zero_",
+    r"""
+zero_() -> Tensor
+
+Fills :attr:`self` tensor with zeros.
+""", +) + +add_docstr_all( + "matmul", + r""" +matmul(tensor2) -> Tensor + +See :func:`torch.matmul` +""", +) + +add_docstr_all( + "chunk", + r""" +chunk(chunks, dim=0) -> List of Tensors + +See :func:`torch.chunk` +""", +) + +add_docstr_all( + "unsafe_chunk", + r""" +unsafe_chunk(chunks, dim=0) -> List of Tensors + +See :func:`torch.unsafe_chunk` +""", +) + +add_docstr_all( + "unsafe_split", + r""" +unsafe_split(split_size, dim=0) -> List of Tensors + +See :func:`torch.unsafe_split` +""", +) + +add_docstr_all( + "tensor_split", + r""" +tensor_split(indices_or_sections, dim=0) -> List of Tensors + +See :func:`torch.tensor_split` +""", +) + +add_docstr_all( + "hsplit", + r""" +hsplit(split_size_or_sections) -> List of Tensors + +See :func:`torch.hsplit` +""", +) + +add_docstr_all( + "vsplit", + r""" +vsplit(split_size_or_sections) -> List of Tensors + +See :func:`torch.vsplit` +""", +) + +add_docstr_all( + "dsplit", + r""" +dsplit(split_size_or_sections) -> List of Tensors + +See :func:`torch.dsplit` +""", +) + +add_docstr_all( + "stft", + r""" +stft(frame_length, hop, fft_size=None, return_onesided=True, window=None, pad_end=0) -> Tensor + +See :func:`torch.stft` +""", +) + +add_docstr_all( + "istft", + r""" +istft(n_fft, hop_length=None, win_length=None, window=None, + center=True, normalized=False, onesided=True, length=None) -> Tensor + +See :func:`torch.istft` +""", +) + +add_docstr_all( + "det", + r""" +det() -> Tensor + +See :func:`torch.det` +""", +) + +add_docstr_all( + "where", + r""" +where(condition, y) -> Tensor + +``self.where(condition, y)`` is equivalent to ``torch.where(condition, self, y)``. +See :func:`torch.where` +""", +) + +add_docstr_all( + "logdet", + r""" +logdet() -> Tensor + +See :func:`torch.logdet` +""", +) + +add_docstr_all( + "slogdet", + r""" +slogdet() -> (Tensor, Tensor) + +See :func:`torch.slogdet` +""", +) + +add_docstr_all( + "unbind", + r""" +unbind(dim=0) -> seq + +See :func:`torch.unbind` +""", +) + +add_docstr_all( + "pin_memory", + r""" +pin_memory() -> Tensor + +Copies the tensor to pinned memory, if it's not already pinned. +""", +) + +add_docstr_all( + "pinverse", + r""" +pinverse() -> Tensor + +See :func:`torch.pinverse` +""", +) + +add_docstr_all( + "index_add", + r""" +index_add(dim, index, source, *, alpha=1) -> Tensor + +Out-of-place version of :meth:`torch.Tensor.index_add_`. +""", +) + +add_docstr_all( + "index_copy", + r""" +index_copy(dim, index, tensor2) -> Tensor + +Out-of-place version of :meth:`torch.Tensor.index_copy_`. +""", +) + +add_docstr_all( + "index_fill", + r""" +index_fill(dim, index, value) -> Tensor + +Out-of-place version of :meth:`torch.Tensor.index_fill_`. +""", +) + +add_docstr_all( + "scatter", + r""" +scatter(dim, index, src) -> Tensor + +Out-of-place version of :meth:`torch.Tensor.scatter_` +""", +) + +add_docstr_all( + "scatter_add", + r""" +scatter_add(dim, index, src) -> Tensor + +Out-of-place version of :meth:`torch.Tensor.scatter_add_` +""", +) + +add_docstr_all( + "scatter_reduce", + r""" +scatter_reduce(dim, index, src, reduce, *, include_self=True) -> Tensor + +Out-of-place version of :meth:`torch.Tensor.scatter_reduce_` +""", +) + +add_docstr_all( + "masked_scatter", + r""" +masked_scatter(mask, tensor) -> Tensor + +Out-of-place version of :meth:`torch.Tensor.masked_scatter_` + +.. note:: + + The inputs :attr:`self` and :attr:`mask` + :ref:`broadcast `. 
+
+Example:
+
+    >>> self = torch.tensor([0, 0, 0, 0, 0])
+    >>> mask = torch.tensor([[0, 0, 0, 1, 1], [1, 1, 0, 1, 1]])
+    >>> source = torch.tensor([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]])
+    >>> self.masked_scatter(mask, source)
+    tensor([[0, 0, 0, 0, 1],
+            [2, 3, 0, 4, 5]])
+
+""",
+)
+
+add_docstr_all(
+    "xlogy",
+    r"""
+xlogy(other) -> Tensor
+
+See :func:`torch.xlogy`
+""",
+)
+
+add_docstr_all(
+    "xlogy_",
+    r"""
+xlogy_(other) -> Tensor
+
+In-place version of :meth:`~Tensor.xlogy`
+""",
+)
+
+add_docstr_all(
+    "masked_fill",
+    r"""
+masked_fill(mask, value) -> Tensor
+
+Out-of-place version of :meth:`torch.Tensor.masked_fill_`
+""",
+)
+
+add_docstr_all(
+    "grad",
+    r"""
+This attribute is ``None`` by default and becomes a Tensor the first time a call to
+:func:`backward` computes gradients for ``self``.
+The attribute will then contain the gradients computed and future calls to
+:func:`backward` will accumulate (add) gradients into it.
+""",
+)
+
+add_docstr_all(
+    "retain_grad",
+    r"""
+retain_grad() -> None
+
+Enables this Tensor to have its :attr:`grad` populated during
+:func:`backward`. This is a no-op for leaf tensors.
+""",
+)
+
+add_docstr_all(
+    "retains_grad",
+    r"""
+Is ``True`` if this Tensor is non-leaf and its :attr:`grad` is enabled to be
+populated during :func:`backward`, ``False`` otherwise.
+""",
+)
+
+add_docstr_all(
+    "requires_grad",
+    r"""
+Is ``True`` if gradients need to be computed for this Tensor, ``False`` otherwise.
+
+.. note::
+
+    The fact that gradients need to be computed for a Tensor does not mean that the :attr:`grad`
+    attribute will be populated; see :attr:`is_leaf` for more details.
+
+""",
+)
+
+add_docstr_all(
+    "is_leaf",
+    r"""
+All Tensors that have :attr:`requires_grad` which is ``False`` will be leaf Tensors by convention.
+
+For Tensors that have :attr:`requires_grad` which is ``True``, they will be leaf Tensors if they were
+created by the user. This means that they are not the result of an operation and so
+:attr:`grad_fn` is None.
+
+Only leaf Tensors will have their :attr:`grad` populated during a call to :func:`backward`.
+To get :attr:`grad` populated for non-leaf Tensors, you can use :func:`retain_grad`.
+
+Example::
+
+    >>> a = torch.rand(10, requires_grad=True)
+    >>> a.is_leaf
+    True
+    >>> b = torch.rand(10, requires_grad=True).cuda()
+    >>> b.is_leaf
+    False
+    # b was created by the operation that cast a cpu Tensor into a cuda Tensor
+    >>> c = torch.rand(10, requires_grad=True) + 2
+    >>> c.is_leaf
+    False
+    # c was created by the addition operation
+    >>> d = torch.rand(10).cuda()
+    >>> d.is_leaf
+    True
+    # d does not require gradients and so has no operation creating it (that is tracked by the autograd engine)
+    >>> e = torch.rand(10).cuda().requires_grad_()
+    >>> e.is_leaf
+    True
+    # e requires gradients and has no operations creating it
+    >>> f = torch.rand(10, requires_grad=True, device="cuda")
+    >>> f.is_leaf
+    True
+    # f requires grad, has no operation creating it
+
+
+""",
+)
+
+add_docstr_all(
+    "names",
+    r"""
+Stores names for each of this tensor's dimensions.
+
+``names[idx]`` corresponds to the name of tensor dimension ``idx``.
+Names are either a string if the dimension is named or ``None`` if the
+dimension is unnamed.
+
+Dimension names may contain characters or underscore. Furthermore, a dimension
+name must be a valid Python variable name (i.e., does not start with underscore).
+
+Tensors may not have two named dimensions with the same name.
+
+.. warning::
+    The named tensor API is experimental and subject to change.
+
+""",
+)
+
+add_docstr_all(
+    "is_cuda",
+    r"""
+Is ``True`` if the Tensor is stored on the GPU, ``False`` otherwise.
+""",
+)
+
+add_docstr_all(
+    "is_cpu",
+    r"""
+Is ``True`` if the Tensor is stored on the CPU, ``False`` otherwise.
+""",
+)
+
+add_docstr_all(
+    "is_xla",
+    r"""
+Is ``True`` if the Tensor is stored on an XLA device, ``False`` otherwise.
+""",
+)
+
+add_docstr_all(
+    "is_ipu",
+    r"""
+Is ``True`` if the Tensor is stored on the IPU, ``False`` otherwise.
+""",
+)
+
+add_docstr_all(
+    "is_xpu",
+    r"""
+Is ``True`` if the Tensor is stored on the XPU, ``False`` otherwise.
+""",
+)
+
+add_docstr_all(
+    "is_quantized",
+    r"""
+Is ``True`` if the Tensor is quantized, ``False`` otherwise.
+""",
+)
+
+add_docstr_all(
+    "is_meta",
+    r"""
+Is ``True`` if the Tensor is a meta tensor, ``False`` otherwise. Meta tensors
+are like normal tensors, but they carry no data.
+""",
+)
+
+add_docstr_all(
+    "is_mps",
+    r"""
+Is ``True`` if the Tensor is stored on the MPS device, ``False`` otherwise.
+""",
+)
+
+add_docstr_all(
+    "is_sparse",
+    r"""
+Is ``True`` if the Tensor uses sparse COO storage layout, ``False`` otherwise.
+""",
+)
+
+add_docstr_all(
+    "is_sparse_csr",
+    r"""
+Is ``True`` if the Tensor uses sparse CSR storage layout, ``False`` otherwise.
+""",
+)
+
+add_docstr_all(
+    "device",
+    r"""
+Is the :class:`torch.device` where this Tensor is.
+""",
+)
+
+add_docstr_all(
+    "ndim",
+    r"""
+Alias for :meth:`~Tensor.dim()`
+""",
+)
+
+add_docstr_all(
+    "itemsize",
+    r"""
+Alias for :meth:`~Tensor.element_size()`
+""",
+)
+
+add_docstr_all(
+    "nbytes",
+    r"""
+Returns the number of bytes consumed by the "view" of elements of the Tensor
+if the Tensor does not use sparse storage layout.
+Defined to be :meth:`~Tensor.numel()` * :meth:`~Tensor.element_size()`
+""",
+)
+
+add_docstr_all(
+    "T",
+    r"""
+Returns a view of this tensor with its dimensions reversed.
+
+If ``n`` is the number of dimensions in ``x``,
+``x.T`` is equivalent to ``x.permute(n-1, n-2, ..., 0)``.
+
+.. warning::
+    The use of :func:`Tensor.T` on tensors of dimension other than 2 to reverse their shape
+    is deprecated and it will throw an error in a future release. Consider :attr:`~.Tensor.mT`
+    to transpose batches of matrices or `x.permute(*torch.arange(x.ndim - 1, -1, -1))` to reverse
+    the dimensions of a tensor.
+""",
+)
+
+add_docstr_all(
+    "H",
+    r"""
+Returns a view of a matrix (2-D tensor) conjugated and transposed.
+
+``x.H`` is equivalent to ``x.transpose(0, 1).conj()`` for complex matrices and
+``x.transpose(0, 1)`` for real matrices.
+
+.. seealso::
+
+        :attr:`~.Tensor.mH`: An attribute that also works on batches of matrices.
+""",
+)
+
+add_docstr_all(
+    "mT",
+    r"""
+Returns a view of this tensor with the last two dimensions transposed.
+
+``x.mT`` is equivalent to ``x.transpose(-2, -1)``.
+""",
+)
+
+add_docstr_all(
+    "mH",
+    r"""
+Accessing this property is equivalent to calling :func:`adjoint`.
+""",
+)
+
+add_docstr_all(
+    "adjoint",
+    r"""
+adjoint() -> Tensor
+
+Alias for :func:`adjoint`
+""",
+)
+
+add_docstr_all(
+    "real",
+    r"""
+Returns a new tensor containing real values of the :attr:`self` tensor for a complex-valued input tensor.
+The returned tensor and :attr:`self` share the same underlying storage.
+
+Returns :attr:`self` if :attr:`self` is a real-valued tensor.
+ +Example:: + >>> x=torch.randn(4, dtype=torch.cfloat) + >>> x + tensor([(0.3100+0.3553j), (-0.5445-0.7896j), (-1.6492-0.0633j), (-0.0638-0.8119j)]) + >>> x.real + tensor([ 0.3100, -0.5445, -1.6492, -0.0638]) + +""", +) + +add_docstr_all( + "imag", + r""" +Returns a new tensor containing imaginary values of the :attr:`self` tensor. +The returned tensor and :attr:`self` share the same underlying storage. + +.. warning:: + :func:`imag` is only supported for tensors with complex dtypes. + +Example:: + >>> x=torch.randn(4, dtype=torch.cfloat) + >>> x + tensor([(0.3100+0.3553j), (-0.5445-0.7896j), (-1.6492-0.0633j), (-0.0638-0.8119j)]) + >>> x.imag + tensor([ 0.3553, -0.7896, -0.0633, -0.8119]) + +""", +) + +add_docstr_all( + "as_subclass", + r""" +as_subclass(cls) -> Tensor + +Makes a ``cls`` instance with the same data pointer as ``self``. Changes +in the output mirror changes in ``self``, and the output stays attached +to the autograd graph. ``cls`` must be a subclass of ``Tensor``. +""", +) + +add_docstr_all( + "crow_indices", + r""" +crow_indices() -> IntTensor + +Returns the tensor containing the compressed row indices of the :attr:`self` +tensor when :attr:`self` is a sparse CSR tensor of layout ``sparse_csr``. +The ``crow_indices`` tensor is strictly of shape (:attr:`self`.size(0) + 1) +and of type ``int32`` or ``int64``. When using MKL routines such as sparse +matrix multiplication, it is necessary to use ``int32`` indexing in order +to avoid downcasting and potentially losing information. + +Example:: + >>> csr = torch.eye(5,5).to_sparse_csr() + >>> csr.crow_indices() + tensor([0, 1, 2, 3, 4, 5], dtype=torch.int32) + +""", +) + +add_docstr_all( + "col_indices", + r""" +col_indices() -> IntTensor + +Returns the tensor containing the column indices of the :attr:`self` +tensor when :attr:`self` is a sparse CSR tensor of layout ``sparse_csr``. +The ``col_indices`` tensor is strictly of shape (:attr:`self`.nnz()) +and of type ``int32`` or ``int64``. When using MKL routines such as sparse +matrix multiplication, it is necessary to use ``int32`` indexing in order +to avoid downcasting and potentially losing information. + +Example:: + >>> csr = torch.eye(5,5).to_sparse_csr() + >>> csr.col_indices() + tensor([0, 1, 2, 3, 4], dtype=torch.int32) + +""", +) + +add_docstr_all( + "to_padded_tensor", + r""" +to_padded_tensor(padding, output_size=None) -> Tensor +See :func:`to_padded_tensor` +""", +) diff --git a/MLPY/Lib/site-packages/torch/_tensor_str.py b/MLPY/Lib/site-packages/torch/_tensor_str.py new file mode 100644 index 0000000000000000000000000000000000000000..ffed793f56286b58d9a0c1711706738ea5a0d96c --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_tensor_str.py @@ -0,0 +1,697 @@ +import contextlib +import dataclasses +import math +import textwrap +from typing import Any, Dict, Optional + +import torch +from torch import inf + + +@dataclasses.dataclass +class __PrinterOptions: + precision: int = 4 + threshold: float = 1000 + edgeitems: int = 3 + linewidth: int = 80 + sci_mode: Optional[bool] = None + + +PRINT_OPTS = __PrinterOptions() + + +# We could use **kwargs, but this will give better docs +def set_printoptions( + precision=None, + threshold=None, + edgeitems=None, + linewidth=None, + profile=None, + sci_mode=None, +): + r"""Set options for printing. Items shamelessly taken from NumPy + + Args: + precision: Number of digits of precision for floating point output + (default = 4). 
+ threshold: Total number of array elements which trigger summarization + rather than full `repr` (default = 1000). + edgeitems: Number of array items in summary at beginning and end of + each dimension (default = 3). + linewidth: The number of characters per line for the purpose of + inserting line breaks (default = 80). Thresholded matrices will + ignore this parameter. + profile: Sane defaults for pretty printing. Can override with any of + the above options. (any one of `default`, `short`, `full`) + sci_mode: Enable (True) or disable (False) scientific notation. If + None (default) is specified, the value is defined by + `torch._tensor_str._Formatter`. This value is automatically chosen + by the framework. + + Example:: + + >>> # Limit the precision of elements + >>> torch.set_printoptions(precision=2) + >>> torch.tensor([1.12345]) + tensor([1.12]) + >>> # Limit the number of elements shown + >>> torch.set_printoptions(threshold=5) + >>> torch.arange(10) + tensor([0, 1, 2, ..., 7, 8, 9]) + >>> # Restore defaults + >>> torch.set_printoptions(profile='default') + >>> torch.tensor([1.12345]) + tensor([1.1235]) + >>> torch.arange(10) + tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + + """ + if profile is not None: + if profile == "default": + PRINT_OPTS.precision = 4 + PRINT_OPTS.threshold = 1000 + PRINT_OPTS.edgeitems = 3 + PRINT_OPTS.linewidth = 80 + elif profile == "short": + PRINT_OPTS.precision = 2 + PRINT_OPTS.threshold = 1000 + PRINT_OPTS.edgeitems = 2 + PRINT_OPTS.linewidth = 80 + elif profile == "full": + PRINT_OPTS.precision = 4 + PRINT_OPTS.threshold = inf + PRINT_OPTS.edgeitems = 3 + PRINT_OPTS.linewidth = 80 + + if precision is not None: + PRINT_OPTS.precision = precision + if threshold is not None: + PRINT_OPTS.threshold = threshold + if edgeitems is not None: + PRINT_OPTS.edgeitems = edgeitems + if linewidth is not None: + PRINT_OPTS.linewidth = linewidth + PRINT_OPTS.sci_mode = sci_mode + + +def get_printoptions() -> Dict[str, Any]: + r"""Gets the current options for printing, as a dictionary that + can be passed as ``**kwargs`` to set_printoptions(). + """ + return dataclasses.asdict(PRINT_OPTS) + + +@contextlib.contextmanager +def printoptions(**kwargs): + r"""Context manager that temporarily changes the print options. Accepted + arguments are same as :func:`set_printoptions`.""" + old_kwargs = get_printoptions() + set_printoptions(**kwargs) + try: + yield + finally: + set_printoptions(**old_kwargs) + + +def tensor_totype(t): + dtype = torch.float if t.is_mps else torch.double + return t.to(dtype=dtype) + + +class _Formatter: + def __init__(self, tensor): + self.floating_dtype = tensor.dtype.is_floating_point + self.int_mode = True + self.sci_mode = False + self.max_width = 1 + + with torch.no_grad(): + tensor_view = tensor.reshape(-1) + + if not self.floating_dtype: + for value in tensor_view: + value_str = f"{value}" + self.max_width = max(self.max_width, len(value_str)) + + else: + nonzero_finite_vals = torch.masked_select( + tensor_view, torch.isfinite(tensor_view) & tensor_view.ne(0) + ) + + if nonzero_finite_vals.numel() == 0: + # no valid number, do nothing + return + + # Convert to double for easy calculation. HalfTensor overflows with 1e8, and there's no div() on CPU. 
+ nonzero_finite_abs = tensor_totype(nonzero_finite_vals.abs()) + nonzero_finite_min = tensor_totype(nonzero_finite_abs.min()) + nonzero_finite_max = tensor_totype(nonzero_finite_abs.max()) + + for value in nonzero_finite_vals: + if value != torch.ceil(value): + self.int_mode = False + break + + if self.int_mode: + # in int_mode for floats, all numbers are integers, and we append a decimal to nonfinites + # to indicate that the tensor is of floating type. add 1 to the len to account for this. + if ( + nonzero_finite_max / nonzero_finite_min > 1000.0 + or nonzero_finite_max > 1.0e8 + ): + self.sci_mode = True + for value in nonzero_finite_vals: + value_str = f"{{:.{PRINT_OPTS.precision}e}}".format(value) + self.max_width = max(self.max_width, len(value_str)) + else: + for value in nonzero_finite_vals: + value_str = f"{value:.0f}" + self.max_width = max(self.max_width, len(value_str) + 1) + else: + # Check if scientific representation should be used. + if ( + nonzero_finite_max / nonzero_finite_min > 1000.0 + or nonzero_finite_max > 1.0e8 + or nonzero_finite_min < 1.0e-4 + ): + self.sci_mode = True + for value in nonzero_finite_vals: + value_str = f"{{:.{PRINT_OPTS.precision}e}}".format(value) + self.max_width = max(self.max_width, len(value_str)) + else: + for value in nonzero_finite_vals: + value_str = f"{{:.{PRINT_OPTS.precision}f}}".format(value) + self.max_width = max(self.max_width, len(value_str)) + + if PRINT_OPTS.sci_mode is not None: + self.sci_mode = PRINT_OPTS.sci_mode + + def width(self): + return self.max_width + + def format(self, value): + if self.floating_dtype: + if self.sci_mode: + ret = f"{{:{self.max_width}.{PRINT_OPTS.precision}e}}".format(value) + elif self.int_mode: + ret = f"{value:.0f}" + if not (math.isinf(value) or math.isnan(value)): + ret += "." 
+ else: + ret = f"{{:.{PRINT_OPTS.precision}f}}".format(value) + else: + ret = f"{value}" + return (self.max_width - len(ret)) * " " + ret + + +def _scalar_str(self, formatter1, formatter2=None): + if formatter2 is not None: + real_str = _scalar_str(self.real, formatter1) + imag_str = (_scalar_str(self.imag, formatter2) + "j").lstrip() + # handles negative numbers, +0.0, -0.0 + if imag_str[0] == "+" or imag_str[0] == "-": + return real_str + imag_str + else: + return real_str + "+" + imag_str + else: + return formatter1.format(self.item()) + + +def _vector_str(self, indent, summarize, formatter1, formatter2=None): + # length includes spaces and comma between elements + element_length = formatter1.width() + 2 + if formatter2 is not None: + # width for imag_formatter + an extra j for complex + element_length += formatter2.width() + 1 + + elements_per_line = max( + 1, int(math.floor((PRINT_OPTS.linewidth - indent) / (element_length))) + ) + + def _val_formatter(val, formatter1=formatter1, formatter2=formatter2): + if formatter2 is not None: + real_str = formatter1.format(val.real) + imag_str = (formatter2.format(val.imag) + "j").lstrip() + # handles negative numbers, +0.0, -0.0 + if imag_str[0] == "+" or imag_str[0] == "-": + return real_str + imag_str + else: + return real_str + "+" + imag_str + else: + return formatter1.format(val) + + if summarize and not PRINT_OPTS.edgeitems: + # Deal with edge case that negative zero is zero + data = ["..."] + elif summarize and self.size(0) > 2 * PRINT_OPTS.edgeitems: + data = ( + [_val_formatter(val) for val in self[: PRINT_OPTS.edgeitems].tolist()] + + [" ..."] + + [_val_formatter(val) for val in self[-PRINT_OPTS.edgeitems :].tolist()] + ) + else: + data = [_val_formatter(val) for val in self.tolist()] + + data_lines = [ + data[i : i + elements_per_line] for i in range(0, len(data), elements_per_line) + ] + lines = [", ".join(line) for line in data_lines] + return "[" + ("," + "\n" + " " * (indent + 1)).join(lines) + "]" + + +# formatter2 is only used for printing complex tensors. +# For complex tensors, formatter1 and formatter2 are the formatters for tensor.real +# and tensor.imag respesectively +def _tensor_str_with_formatter(self, indent, summarize, formatter1, formatter2=None): + dim = self.dim() + + if dim == 0: + return _scalar_str(self, formatter1, formatter2) + + if dim == 1: + return _vector_str(self, indent, summarize, formatter1, formatter2) + + if summarize and self.size(0) > 2 * PRINT_OPTS.edgeitems: + slices = ( + [ + _tensor_str_with_formatter( + self[i], indent + 1, summarize, formatter1, formatter2 + ) + for i in range(0, PRINT_OPTS.edgeitems) + ] + + ["..."] + + [ + _tensor_str_with_formatter( + self[i], indent + 1, summarize, formatter1, formatter2 + ) + for i in range(len(self) - PRINT_OPTS.edgeitems, len(self)) + ] + ) + else: + slices = [ + _tensor_str_with_formatter( + self[i], indent + 1, summarize, formatter1, formatter2 + ) + for i in range(0, self.size(0)) + ] + + tensor_str = ("," + "\n" * (dim - 1) + " " * (indent + 1)).join(slices) + return "[" + tensor_str + "]" + + +def _tensor_str(self, indent): + if self.numel() == 0: + return "[]" + + if self.has_names(): + # There are two main codepaths (possibly more) that tensor printing goes through: + # - tensor data can fit comfortably on screen + # - tensor data needs to be summarized + # Some of the codepaths don't fully support named tensors, so we send in + # an unnamed tensor to the formatting code as a workaround. 
+ self = self.rename(None) + + summarize = self.numel() > PRINT_OPTS.threshold + + if self._is_zerotensor(): + self = self.clone() + + # handle the negative bit + if self.is_neg(): + self = self.resolve_neg() + + if self.dtype in [ + torch.float16, + torch.bfloat16, + torch.float8_e5m2, + torch.float8_e5m2fnuz, + torch.float8_e4m3fn, + torch.float8_e4m3fnuz, + ]: + self = self.float() + + if self.dtype is torch.complex32: + self = self.cfloat() + + if self.dtype.is_complex: + # handle the conjugate bit + self = self.resolve_conj() + real_formatter = _Formatter( + get_summarized_data(self.real) if summarize else self.real + ) + imag_formatter = _Formatter( + get_summarized_data(self.imag) if summarize else self.imag + ) + return _tensor_str_with_formatter( + self, indent, summarize, real_formatter, imag_formatter + ) + else: + formatter = _Formatter(get_summarized_data(self) if summarize else self) + return _tensor_str_with_formatter(self, indent, summarize, formatter) + + +def _add_suffixes(tensor_str, suffixes, indent, force_newline): + tensor_strs = [tensor_str] + last_line_len = len(tensor_str) - tensor_str.rfind("\n") + 1 + for suffix in suffixes: + suffix_len = len(suffix) + if force_newline or last_line_len + suffix_len + 2 > PRINT_OPTS.linewidth: + tensor_strs.append(",\n" + " " * indent + suffix) + last_line_len = indent + suffix_len + force_newline = False + else: + tensor_strs.append(", " + suffix) + last_line_len += suffix_len + 2 + tensor_strs.append(")") + return "".join(tensor_strs) + + +def get_summarized_data(self): + dim = self.dim() + if dim == 0: + return self + if dim == 1: + if self.size(0) > 2 * PRINT_OPTS.edgeitems: + return torch.cat( + (self[: PRINT_OPTS.edgeitems], self[-PRINT_OPTS.edgeitems :]) + ) + else: + return self + if not PRINT_OPTS.edgeitems: + return self.new_empty([0] * self.dim()) + elif self.size(0) > 2 * PRINT_OPTS.edgeitems: + start = [self[i] for i in range(0, PRINT_OPTS.edgeitems)] + end = [self[i] for i in range(len(self) - PRINT_OPTS.edgeitems, len(self))] + return torch.stack([get_summarized_data(x) for x in (start + end)]) + else: + return torch.stack([get_summarized_data(x) for x in self]) + + +def _str_intern(inp, *, tensor_contents=None): + if torch._C._functorch.is_functorch_wrapped_tensor(inp): + return _functorch_wrapper_str_intern(inp, tensor_contents=tensor_contents) + is_plain_tensor = type(inp) is torch.Tensor or type(inp) is torch.nn.Parameter + if inp.is_nested: + prefix = "nested_tensor(" + elif is_plain_tensor: + prefix = "tensor(" + else: + prefix = f"{type(inp).__name__}(" + indent = len(prefix) + suffixes = [] + custom_contents_provided = tensor_contents is not None + if custom_contents_provided: + tensor_str = tensor_contents + + # This is used to extract the primal value and thus disable the forward AD + # within this function. + # TODO(albanD) This needs to be updated when more than one level is supported + self, tangent = torch.autograd.forward_ad.unpack_dual(inp) + + # Note [Print tensor device]: + # A general logic here is we only print device when it doesn't match + # the device specified in default tensor type. + # Currently torch.set_default_tensor_type() only supports CPU/CUDA, thus + # torch._C._get_default_device() only returns either cpu or cuda. + # In other cases, we don't have a way to set them as default yet, + # and we should always print out device for them. 
+ if ( + self.device.type != torch._C._get_default_device() + or ( + self.device.type == "cuda" + and torch.cuda.current_device() != self.device.index + ) + or (self.device.type == "mps") + ): + suffixes.append("device='" + str(self.device) + "'") + + # Tensor printing performs tensor operations like slice, indexing, etc to make it in a + # representable format. These operations on ipu/xla/lazy/mtia tensor results in compilations. Hence, + # to avoid compilations, copying the tensor to cpu before printing. + if self.device.type in ["xla", "lazy", "ipu", "mtia"]: + self = self.to("cpu") + + # TODO: add an API to map real -> complex dtypes + _default_complex_dtype = ( + torch.cdouble if torch.get_default_dtype() == torch.double else torch.cfloat + ) + has_default_dtype = self.dtype in ( + torch.get_default_dtype(), + _default_complex_dtype, + torch.int64, + torch.bool, + ) + if self.is_sparse: + suffixes.append("size=" + str(tuple(self.shape))) + from torch._subclasses.fake_tensor import FakeTensor + + is_meta = self.is_meta or isinstance(self, FakeTensor) + if not is_meta: + suffixes.append("nnz=" + str(self._nnz())) + if not has_default_dtype: + suffixes.append("dtype=" + str(self.dtype)) + if not custom_contents_provided: + indices_prefix = "indices=tensor(" + indices = self._indices().detach() + if is_meta: + indices_str = "..." + else: + indices_str = _tensor_str(indices, indent + len(indices_prefix)) + if indices.numel() == 0 or is_meta: + indices_str += ", size=" + str(tuple(indices.shape)) + values_prefix = "values=tensor(" + values = self._values().detach() + if is_meta: + values_str = "..." + else: + values_str = _tensor_str(values, indent + len(values_prefix)) + if values.numel() == 0 or is_meta: + values_str += ", size=" + str(tuple(values.shape)) + tensor_str = ( + indices_prefix + + indices_str + + "),\n" + + " " * indent + + values_prefix + + values_str + + ")" + ) + elif self.layout in { + torch.sparse_csr, + torch.sparse_csc, + torch.sparse_bsr, + torch.sparse_bsc, + }: + from torch._subclasses.fake_tensor import FakeTensor + + suffixes.append("size=" + str(tuple(self.shape))) + is_meta = self.is_meta or isinstance(self, FakeTensor) + if not is_meta: + suffixes.append("nnz=" + str(self._nnz())) + if not has_default_dtype: + suffixes.append("dtype=" + str(self.dtype)) + if not custom_contents_provided: + compressed_indices_method, plain_indices_method = { + torch.sparse_csr: (torch.Tensor.crow_indices, torch.Tensor.col_indices), + torch.sparse_csc: (torch.Tensor.ccol_indices, torch.Tensor.row_indices), + torch.sparse_bsr: (torch.Tensor.crow_indices, torch.Tensor.col_indices), + torch.sparse_bsc: (torch.Tensor.ccol_indices, torch.Tensor.row_indices), + }[self.layout] + if self.layout in {torch.sparse_csr, torch.sparse_bsr}: + cdimname, pdimname = "row", "column" + else: + cdimname, pdimname = "column", "row" + compressed_indices_prefix = f"c{cdimname[:3]}_indices=tensor(" + compressed_indices = compressed_indices_method(self).detach() + if is_meta: + compressed_indices_str = "..." + else: + compressed_indices_str = _tensor_str( + compressed_indices, indent + len(compressed_indices_prefix) + ) + if compressed_indices.numel() == 0 or is_meta: + compressed_indices_str += ", size=" + str( + tuple(compressed_indices.shape) + ) + plain_indices_prefix = f"{pdimname[:3]}_indices=tensor(" + plain_indices = plain_indices_method(self).detach() + if is_meta: + plain_indices_str = "..." 
+ else: + plain_indices_str = _tensor_str( + plain_indices, indent + len(plain_indices_prefix) + ) + if plain_indices.numel() == 0 or is_meta: + plain_indices_str += ", size=" + str(tuple(plain_indices.shape)) + values_prefix = "values=tensor(" + values = self.values().detach() + if is_meta: + values_str = "..." + else: + values_str = _tensor_str(values, indent + len(values_prefix)) + if values.numel() == 0 or is_meta: + values_str += ", size=" + str(tuple(values.shape)) + tensor_str = ( + compressed_indices_prefix + + compressed_indices_str + + "),\n" + + " " * indent + + plain_indices_prefix + + plain_indices_str + + "),\n" + + " " * indent + + values_prefix + + values_str + + ")" + ) + elif self.is_quantized: + suffixes.append("size=" + str(tuple(self.shape))) + if not has_default_dtype: + suffixes.append("dtype=" + str(self.dtype)) + suffixes.append("quantization_scheme=" + str(self.qscheme())) + if ( + self.qscheme() == torch.per_tensor_affine + or self.qscheme() == torch.per_tensor_symmetric + ): + suffixes.append("scale=" + str(self.q_scale())) + suffixes.append("zero_point=" + str(self.q_zero_point())) + elif ( + self.qscheme() == torch.per_channel_affine + or self.qscheme() == torch.per_channel_symmetric + or self.qscheme() == torch.per_channel_affine_float_qparams + ): + suffixes.append("scale=" + str(self.q_per_channel_scales())) + suffixes.append("zero_point=" + str(self.q_per_channel_zero_points())) + suffixes.append("axis=" + str(self.q_per_channel_axis())) + if not custom_contents_provided: + tensor_str = _tensor_str(self.dequantize(), indent) + elif self.is_nested: + if not custom_contents_provided: + + def indented_str(s, indent): + return "\n".join(f" {line}" for line in s.split("\n")) + + strs = ",\n".join( + indented_str(str(t), indent + 1) + for t in torch.ops.aten.unbind.int(self, 0) + ) + tensor_str = f"[\n{strs}\n]" + elif torch._is_functional_tensor(self): + prefix = "_to_functional_tensor(" + tensor_str = repr(torch._from_functional_tensor(self)) + else: + # Circular import problem, so we import it here + from torch._subclasses.fake_tensor import FakeTensor + + if self.is_meta or isinstance(self, FakeTensor): + suffixes.append("size=" + str(tuple(self.shape))) + if self.dtype != torch.get_default_dtype(): + suffixes.append("dtype=" + str(self.dtype)) + # TODO: This implies that ellipses is valid syntax for allocating + # a meta tensor or FakeTensor, which it could be, but it isn't right now + if not custom_contents_provided: + tensor_str = "..." + else: + if self.numel() == 0 and not self.is_sparse: + # Explicitly print the shape if it is not (0,), to match NumPy behavior + if self.dim() != 1: + suffixes.append("size=" + str(tuple(self.shape))) + + # In an empty tensor, there are no elements to infer if the dtype + # should be int64, so it must be shown explicitly. + if self.dtype != torch.get_default_dtype(): + suffixes.append("dtype=" + str(self.dtype)) + if not custom_contents_provided: + tensor_str = "[]" + else: + if not PRINT_OPTS.edgeitems: + suffixes.append("size=" + str(tuple(self.shape))) + + if not has_default_dtype: + suffixes.append("dtype=" + str(self.dtype)) + + if not custom_contents_provided: + if self.layout != torch.strided: + tensor_str = _tensor_str(self.to_dense(), indent) + else: + tensor_str = _tensor_str(self, indent) + + if self.layout != torch.strided: + suffixes.append("layout=" + str(self.layout)) + + # Use inp here to get the original grad_fn and not the one generated by the forward grad + # unpacking. 
+ grad_fn_name = None + try: + grad_fn = inp.grad_fn + except RuntimeError: + # Accessing the grad_fn calls rebasing logic which would cause an error + # if that tensor is a view created in no-grad mode modified in-place in + # no-grad mode. See: https://github.com/pytorch/pytorch/issues/99968 + grad_fn_name = "Invalid" + + if grad_fn_name is None and grad_fn is not None: # type: ignore[possibly-undefined] + grad_fn_name = type(grad_fn).__name__ + if grad_fn_name == "CppFunction": + grad_fn_name = grad_fn.name().rsplit("::", 1)[-1] + + if grad_fn_name is not None: + suffixes.append(f"grad_fn=<{grad_fn_name}>") + elif inp.requires_grad: + suffixes.append("requires_grad=True") + + if self.has_names(): + suffixes.append(f"names={self.names}") + + if tangent is not None: + suffixes.append(f"tangent={tangent}") + + string_repr = _add_suffixes( + prefix + tensor_str, suffixes, indent, force_newline=self.is_sparse # type: ignore[possibly-undefined] + ) + + # Check if this instance is flagged as a parameter and change the repr accordingly. + # Unfortunately, this function has to be aware of this detail. + # NB: This is currently skipped for plain tensor parameters to maintain BC. In the future, + # this should be done for those as well to produce a valid repr. + if isinstance(self, torch.nn.Parameter) and not is_plain_tensor: + string_repr = f"Parameter({string_repr})" + + return string_repr + + +def _functorch_wrapper_str_intern(tensor, *, tensor_contents=None): + level = torch._C._functorch.maybe_get_level(tensor) + assert level != -1 + + if torch._C._functorch.is_functionaltensor(tensor): + # Since we're unwrapping the FunctionalTensorWrapper, we need to make sure + # that it's up to date first + torch._sync(tensor) + + value = torch._C._functorch.get_unwrapped(tensor) + value_repr = repr(value) + + indented_value_repr = textwrap.indent(value_repr, " " * 4) + if torch._C._functorch.is_batchedtensor(tensor): + bdim = torch._C._functorch.maybe_get_bdim(tensor) + assert bdim != -1 + return ( + f"BatchedTensor(lvl={level}, bdim={bdim}, value=\n" + f"{indented_value_repr}\n" + f")" + ) + if torch._C._functorch.is_gradtrackingtensor(tensor): + return ( + f"GradTrackingTensor(lvl={level}, value=\n" f"{indented_value_repr}\n" f")" + ) + if torch._C._functorch.is_functionaltensor(tensor): + return f"FunctionalTensor(lvl={level}, value=\\\n{value_repr})" + + raise ValueError("We don't know how to print this, please file us an issue") + + +def _str(self, *, tensor_contents=None): + with torch.no_grad(), torch.utils._python_dispatch._disable_current_modes(): + guard = torch._C._DisableFuncTorch() + return _str_intern(self, tensor_contents=tensor_contents) diff --git a/MLPY/Lib/site-packages/torch/_torch_docs.py b/MLPY/Lib/site-packages/torch/_torch_docs.py new file mode 100644 index 0000000000000000000000000000000000000000..a95c21974800ca3f889a6850d97e3e71d2856edb --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_torch_docs.py @@ -0,0 +1,14192 @@ +"""Adds docstrings to functions defined in the torch._C module.""" + +import re + +import torch._C +from torch._C import _add_docstr as add_docstr + + +def parse_kwargs(desc): + r"""Map a description of args to a dictionary of {argname: description}. 
+ + Input: + (' weight (Tensor): a weight tensor\n' + + ' Some optional description') + Output: { + 'weight': \ + 'weight (Tensor): a weight tensor\n Some optional description' + } + """ + # Split on exactly 4 spaces after a newline + regx = re.compile(r"\n\s{4}(?!\s)") + kwargs = [section.strip() for section in regx.split(desc)] + kwargs = [section for section in kwargs if len(section) > 0] + return {desc.split(" ")[0]: desc for desc in kwargs} + + +def merge_dicts(*dicts): + """Merge dictionaries into a single dictionary.""" + return {x: d[x] for d in dicts for x in d} + + +common_args = parse_kwargs( + """ + input (Tensor): the input tensor. + generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling + out (Tensor, optional): the output tensor. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned tensor. Default: ``torch.preserve_format``. +""" +) + +reduceops_common_args = merge_dicts( + common_args, + parse_kwargs( + """ + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + If specified, the input tensor is casted to :attr:`dtype` before the operation + is performed. This is useful for preventing data type overflows. Default: None. + keepdim (bool): whether the output tensor has :attr:`dim` retained or not. +""" + ), +) + +multi_dim_common = merge_dicts( + reduceops_common_args, + parse_kwargs( + """ + dim (int or tuple of ints): the dimension or dimensions to reduce. +""" + ), + { + "keepdim_details": """ +If :attr:`keepdim` is ``True``, the output tensor is of the same size +as :attr:`input` except in the dimension(s) :attr:`dim` where it is of size 1. +Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in the +output tensor having 1 (or ``len(dim)``) fewer dimension(s). +""" + }, + { + "opt_dim": """ + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + If ``None``, all dimensions are reduced. +""" + }, +) + +single_dim_common = merge_dicts( + reduceops_common_args, + parse_kwargs( + """ + dim (int): the dimension to reduce. +""" + ), + { + "keepdim_details": """If :attr:`keepdim` is ``True``, the output tensor is of the same size +as :attr:`input` except in the dimension :attr:`dim` where it is of size 1. +Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in +the output tensor having 1 fewer dimension than :attr:`input`.""" + }, +) + +factory_common_args = merge_dicts( + common_args, + parse_kwargs( + """ + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`). + layout (:class:`torch.layout`, optional): the desired layout of returned Tensor. + Default: ``torch.strided``. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.contiguous_format``. 
+ check_invariants (bool, optional): If sparse tensor invariants are checked. + Default: as returned by :func:`torch.sparse.check_sparse_tensor_invariants.is_enabled`, + initially False. +""" + ), + { + "sparse_factory_device_note": """\ +.. note:: + + If the ``device`` argument is not specified the device of the given + :attr:`values` and indices tensor(s) must match. If, however, the + argument is specified the input Tensors will be converted to the + given device and in turn determine the device of the constructed + sparse tensor.""" + }, +) + +factory_like_common_args = parse_kwargs( + """ + input (Tensor): the size of :attr:`input` will determine size of the output tensor. + layout (:class:`torch.layout`, optional): the desired layout of returned tensor. + Default: if ``None``, defaults to the layout of :attr:`input`. + dtype (:class:`torch.dtype`, optional): the desired data type of returned Tensor. + Default: if ``None``, defaults to the dtype of :attr:`input`. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, defaults to the device of :attr:`input`. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. + memory_format (:class:`torch.memory_format`, optional): the desired memory format of + returned Tensor. Default: ``torch.preserve_format``. +""" +) + +factory_data_common_args = parse_kwargs( + """ + data (array_like): Initial data for the tensor. Can be a list, tuple, + NumPy ``ndarray``, scalar, and other types. + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, infers data type from :attr:`data`. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if ``None``, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + requires_grad (bool, optional): If autograd should record operations on the + returned tensor. Default: ``False``. + pin_memory (bool, optional): If set, returned tensor would be allocated in + the pinned memory. Works only for CPU tensors. Default: ``False``. +""" +) + +tf32_notes = { + "tf32_note": """This operator supports :ref:`TensorFloat32`.""" +} + +rocm_fp16_notes = { + "rocm_fp16_note": """On certain ROCm devices, when using float16 inputs this module will use \ +:ref:`different precision` for backward.""" +} + +reproducibility_notes = { + "forward_reproducibility_note": """This operation may behave nondeterministically when given tensors on \ +a CUDA device. See :doc:`/notes/randomness` for more information.""", + "backward_reproducibility_note": """This operation may produce nondeterministic gradients when given tensors on \ +a CUDA device. See :doc:`/notes/randomness` for more information.""", + "cudnn_reproducibility_note": """In some circumstances when given tensors on a CUDA device \ +and using CuDNN, this operator may select a nondeterministic algorithm to increase performance. If this is \ +undesirable, you can try to make the operation deterministic (potentially at \ +a performance cost) by setting ``torch.backends.cudnn.deterministic = True``. 
\ +See :doc:`/notes/randomness` for more information.""", +} + +sparse_support_notes = { + "sparse_beta_warning": """ +.. warning:: + Sparse support is a beta feature and some layout(s)/dtype/device combinations may not be supported, + or may not have autograd support. If you notice missing functionality please + open a feature request.""", +} + +add_docstr( + torch.abs, + r""" +abs(input, *, out=None) -> Tensor + +Computes the absolute value of each element in :attr:`input`. + +.. math:: + \text{out}_{i} = |\text{input}_{i}| +""" + + r""" +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> torch.abs(torch.tensor([-1, -2, 3])) + tensor([ 1, 2, 3]) +""".format( + **common_args + ), +) + +add_docstr( + torch.absolute, + r""" +absolute(input, *, out=None) -> Tensor + +Alias for :func:`torch.abs` +""", +) + +add_docstr( + torch.acos, + r""" +acos(input, *, out=None) -> Tensor + +Computes the inverse cosine of each element in :attr:`input`. + +.. math:: + \text{out}_{i} = \cos^{-1}(\text{input}_{i}) +""" + + r""" +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.3348, -0.5889, 0.2005, -0.1584]) + >>> torch.acos(a) + tensor([ 1.2294, 2.2004, 1.3690, 1.7298]) +""".format( + **common_args + ), +) + +add_docstr( + torch.arccos, + r""" +arccos(input, *, out=None) -> Tensor + +Alias for :func:`torch.acos`. +""", +) + +add_docstr( + torch.acosh, + r""" +acosh(input, *, out=None) -> Tensor + +Returns a new tensor with the inverse hyperbolic cosine of the elements of :attr:`input`. + +.. math:: + \text{out}_{i} = \cosh^{-1}(\text{input}_{i}) + +Note: + The domain of the inverse hyperbolic cosine is `[1, inf)` and values outside this range + will be mapped to ``NaN``, except for `+ INF` for which the output is mapped to `+ INF`. +""" + + r""" +Args: + {input} + +Keyword arguments: + {out} + +Example:: + + >>> a = torch.randn(4).uniform_(1, 2) + >>> a + tensor([ 1.3192, 1.9915, 1.9674, 1.7151 ]) + >>> torch.acosh(a) + tensor([ 0.7791, 1.3120, 1.2979, 1.1341 ]) +""".format( + **common_args + ), +) + +add_docstr( + torch.arccosh, + r""" +arccosh(input, *, out=None) -> Tensor + +Alias for :func:`torch.acosh`. +""", +) + +add_docstr( + torch.index_add, + r""" +index_add(input, dim, index, source, *, alpha=1, out=None) -> Tensor + +See :meth:`~Tensor.index_add_` for function description. +""", +) + +add_docstr( + torch.index_copy, + r""" +index_copy(input, dim, index, source, *, out=None) -> Tensor + +See :meth:`~Tensor.index_add_` for function description. +""", +) + +add_docstr( + torch.index_reduce, + r""" +index_reduce(input, dim, index, source, reduce, *, include_self=True, out=None) -> Tensor + +See :meth:`~Tensor.index_reduce_` for function description. +""", +) + +add_docstr( + torch.add, + r""" +add(input, other, *, alpha=1, out=None) -> Tensor + +Adds :attr:`other`, scaled by :attr:`alpha`, to :attr:`input`. + +.. math:: + \text{{out}}_i = \text{{input}}_i + \text{{alpha}} \times \text{{other}}_i +""" + + r""" + +Supports :ref:`broadcasting to a common shape `, +:ref:`type promotion `, and integer, float, and complex inputs. + +Args: + {input} + other (Tensor or Number): the tensor or number to add to :attr:`input`. + +Keyword arguments: + alpha (Number): the multiplier for :attr:`other`. 
+ {out} + +Examples:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.0202, 1.0985, 1.3506, -0.6056]) + >>> torch.add(a, 20) + tensor([ 20.0202, 21.0985, 21.3506, 19.3944]) + + >>> b = torch.randn(4) + >>> b + tensor([-0.9732, -0.3497, 0.6245, 0.4022]) + >>> c = torch.randn(4, 1) + >>> c + tensor([[ 0.3743], + [-1.7724], + [-0.5811], + [-0.8017]]) + >>> torch.add(b, c, alpha=10) + tensor([[ 2.7695, 3.3930, 4.3672, 4.1450], + [-18.6971, -18.0736, -17.0994, -17.3216], + [ -6.7845, -6.1610, -5.1868, -5.4090], + [ -8.9902, -8.3667, -7.3925, -7.6147]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.addbmm, + r""" +addbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor + +Performs a batch matrix-matrix product of matrices stored +in :attr:`batch1` and :attr:`batch2`, +with a reduced add step (all matrix multiplications get accumulated +along the first dimension). +:attr:`input` is added to the final result. + +:attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the +same number of matrices. + +If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a +:math:`(b \times m \times p)` tensor, :attr:`input` must be +:ref:`broadcastable ` with a :math:`(n \times p)` tensor +and :attr:`out` will be a :math:`(n \times p)` tensor. + +.. math:: + out = \beta\ \text{input} + \alpha\ (\sum_{i=0}^{b-1} \text{batch1}_i \mathbin{@} \text{batch2}_i) + +If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in +it will not be propagated. +""" + + r""" +For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and :attr:`alpha` +must be real numbers, otherwise they should be integers. + +{tf32_note} + +{rocm_fp16_note} + +Args: + batch1 (Tensor): the first batch of matrices to be multiplied + batch2 (Tensor): the second batch of matrices to be multiplied + +Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + input (Tensor): matrix to be added + alpha (Number, optional): multiplier for `batch1 @ batch2` (:math:`\alpha`) + {out} + +Example:: + + >>> M = torch.randn(3, 5) + >>> batch1 = torch.randn(10, 3, 4) + >>> batch2 = torch.randn(10, 4, 5) + >>> torch.addbmm(M, batch1, batch2) + tensor([[ 6.6311, 0.0503, 6.9768, -12.0362, -2.1653], + [ -4.8185, -1.4255, -6.6760, 8.9453, 2.5743], + [ -3.8202, 4.3691, 1.0943, -1.1109, 5.4730]]) +""".format( + **common_args, **tf32_notes, **rocm_fp16_notes + ), +) + +add_docstr( + torch.addcdiv, + r""" +addcdiv(input, tensor1, tensor2, *, value=1, out=None) -> Tensor + +Performs the element-wise division of :attr:`tensor1` by :attr:`tensor2`, +multiplies the result by the scalar :attr:`value` and adds it to :attr:`input`. + +.. warning:: + Integer division with addcdiv is no longer supported, and in a future + release addcdiv will perform a true division of tensor1 and tensor2. + The historic addcdiv behavior can be implemented as + (input + value * torch.trunc(tensor1 / tensor2)).to(input.dtype) + for integer inputs and as (input + value * tensor1 / tensor2) for float inputs. + The future addcdiv behavior is just the latter implementation: + (input + value * tensor1 / tensor2), for all dtypes. + +.. math:: + \text{out}_i = \text{input}_i + \text{value} \times \frac{\text{tensor1}_i}{\text{tensor2}_i} +""" + + r""" + +The shapes of :attr:`input`, :attr:`tensor1`, and :attr:`tensor2` must be +:ref:`broadcastable `. + +For inputs of type `FloatTensor` or `DoubleTensor`, :attr:`value` must be +a real number, otherwise an integer. 
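+
+For example, with float inputs the result is simply ``input + value * tensor1 / tensor2``
+computed elementwise; an illustrative, hand-checkable case::
+
+    >>> torch.addcdiv(torch.tensor([1.0]), torch.tensor([6.0]), torch.tensor([3.0]), value=0.5)
+    tensor([2.])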
+ +Args: + input (Tensor): the tensor to be added + tensor1 (Tensor): the numerator tensor + tensor2 (Tensor): the denominator tensor + +Keyword args: + value (Number, optional): multiplier for :math:`\text{{tensor1}} / \text{{tensor2}}` + {out} + +Example:: + + >>> t = torch.randn(1, 3) + >>> t1 = torch.randn(3, 1) + >>> t2 = torch.randn(1, 3) + >>> torch.addcdiv(t, t1, t2, value=0.1) + tensor([[-0.2312, -3.6496, 0.1312], + [-1.0428, 3.4292, -0.1030], + [-0.5369, -0.9829, 0.0430]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.addcmul, + r""" +addcmul(input, tensor1, tensor2, *, value=1, out=None) -> Tensor + +Performs the element-wise multiplication of :attr:`tensor1` +by :attr:`tensor2`, multiplies the result by the scalar :attr:`value` +and adds it to :attr:`input`. + +.. math:: + \text{out}_i = \text{input}_i + \text{value} \times \text{tensor1}_i \times \text{tensor2}_i +""" + + r""" +The shapes of :attr:`tensor`, :attr:`tensor1`, and :attr:`tensor2` must be +:ref:`broadcastable `. + +For inputs of type `FloatTensor` or `DoubleTensor`, :attr:`value` must be +a real number, otherwise an integer. + +Args: + input (Tensor): the tensor to be added + tensor1 (Tensor): the tensor to be multiplied + tensor2 (Tensor): the tensor to be multiplied + +Keyword args: + value (Number, optional): multiplier for :math:`tensor1 .* tensor2` + {out} + +Example:: + + >>> t = torch.randn(1, 3) + >>> t1 = torch.randn(3, 1) + >>> t2 = torch.randn(1, 3) + >>> torch.addcmul(t, t1, t2, value=0.1) + tensor([[-0.8635, -0.6391, 1.6174], + [-0.7617, -0.5879, 1.7388], + [-0.8353, -0.6249, 1.6511]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.addmm, + r""" +addmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor + +Performs a matrix multiplication of the matrices :attr:`mat1` and :attr:`mat2`. +The matrix :attr:`input` is added to the final result. + +If :attr:`mat1` is a :math:`(n \times m)` tensor, :attr:`mat2` is a +:math:`(m \times p)` tensor, then :attr:`input` must be +:ref:`broadcastable ` with a :math:`(n \times p)` tensor +and :attr:`out` will be a :math:`(n \times p)` tensor. + +:attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between +:attr:`mat1` and :attr:`mat2` and the added matrix :attr:`input` respectively. + +.. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{mat1}_i \mathbin{@} \text{mat2}_i) + +If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in +it will not be propagated. +""" + + r""" +For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and +:attr:`alpha` must be real numbers, otherwise they should be integers. + +This operation has support for arguments with :ref:`sparse layouts`. If +:attr:`input` is sparse the result will have the same layout and if :attr:`out` +is provided it must have the same layout as :attr:`input`. 
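+
+For strided (dense) inputs the call reduces to ``beta * input + alpha * (mat1 @ mat2)``;
+an illustrative, hand-checkable case::
+
+    >>> torch.addmm(torch.ones(2, 2), torch.eye(2), torch.eye(2), alpha=2)
+    tensor([[3., 1.],
+            [1., 3.]])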
+ +{sparse_beta_warning} + +{tf32_note} + +{rocm_fp16_note} + +Args: + input (Tensor): matrix to be added + mat1 (Tensor): the first matrix to be matrix multiplied + mat2 (Tensor): the second matrix to be matrix multiplied + +Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`) + {out} + +Example:: + + >>> M = torch.randn(2, 3) + >>> mat1 = torch.randn(2, 3) + >>> mat2 = torch.randn(3, 3) + >>> torch.addmm(M, mat1, mat2) + tensor([[-4.8716, 1.4671, -1.3746], + [ 0.7573, -3.9555, -2.8681]]) +""".format( + **common_args, **tf32_notes, **rocm_fp16_notes, **sparse_support_notes + ), +) + +add_docstr( + torch.adjoint, + r""" +adjoint(Tensor) -> Tensor +Returns a view of the tensor conjugated and with the last two dimensions transposed. + +``x.adjoint()`` is equivalent to ``x.transpose(-2, -1).conj()`` for complex tensors and +to ``x.transpose(-2, -1)`` for real tensors. + +Example:: + >>> x = torch.arange(4, dtype=torch.float) + >>> A = torch.complex(x, x).reshape(2, 2) + >>> A + tensor([[0.+0.j, 1.+1.j], + [2.+2.j, 3.+3.j]]) + >>> A.adjoint() + tensor([[0.-0.j, 2.-2.j], + [1.-1.j, 3.-3.j]]) + >>> (A.adjoint() == A.mH).all() + tensor(True) +""", +) + +add_docstr( + torch.sspaddmm, + r""" +sspaddmm(input, mat1, mat2, *, beta=1, alpha=1, out=None) -> Tensor + +Matrix multiplies a sparse tensor :attr:`mat1` with a dense tensor +:attr:`mat2`, then adds the sparse tensor :attr:`input` to the result. + +Note: This function is equivalent to :func:`torch.addmm`, except +:attr:`input` and :attr:`mat1` are sparse. + +Args: + input (Tensor): a sparse matrix to be added + mat1 (Tensor): a sparse matrix to be matrix multiplied + mat2 (Tensor): a dense matrix to be matrix multiplied + +Keyword args: + beta (Number, optional): multiplier for :attr:`mat` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat1 @ mat2` (:math:`\alpha`) + {out} +""".format( + **common_args + ), +) + +add_docstr( + torch.smm, + r""" +smm(input, mat) -> Tensor + +Performs a matrix multiplication of the sparse matrix :attr:`input` +with the dense matrix :attr:`mat`. + +Args: + input (Tensor): a sparse matrix to be matrix multiplied + mat (Tensor): a dense matrix to be matrix multiplied +""", +) + +add_docstr( + torch.addmv, + r""" +addmv(input, mat, vec, *, beta=1, alpha=1, out=None) -> Tensor + +Performs a matrix-vector product of the matrix :attr:`mat` and +the vector :attr:`vec`. +The vector :attr:`input` is added to the final result. + +If :attr:`mat` is a :math:`(n \times m)` tensor, :attr:`vec` is a 1-D tensor of +size `m`, then :attr:`input` must be +:ref:`broadcastable ` with a 1-D tensor of size `n` and +:attr:`out` will be 1-D tensor of size `n`. + +:attr:`alpha` and :attr:`beta` are scaling factors on matrix-vector product between +:attr:`mat` and :attr:`vec` and the added tensor :attr:`input` respectively. + +.. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{mat} \mathbin{@} \text{vec}) + +If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in +it will not be propagated. +""" + + r""" +For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and +:attr:`alpha` must be real numbers, otherwise they should be integers. 
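+
+As an illustrative, hand-checkable case with the defaults ``beta=1, alpha=1``,
+the call below computes ``input + mat @ vec``::
+
+    >>> torch.addmv(torch.zeros(2), torch.eye(2), torch.tensor([1., 2.]))
+    tensor([1., 2.])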
+ +Args: + input (Tensor): vector to be added + mat (Tensor): matrix to be matrix multiplied + vec (Tensor): vector to be matrix multiplied + +Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`mat @ vec` (:math:`\alpha`) + {out} + +Example:: + + >>> M = torch.randn(2) + >>> mat = torch.randn(2, 3) + >>> vec = torch.randn(3) + >>> torch.addmv(M, mat, vec) + tensor([-0.3768, -5.5565]) +""".format( + **common_args + ), +) + +add_docstr( + torch.addr, + r""" +addr(input, vec1, vec2, *, beta=1, alpha=1, out=None) -> Tensor + +Performs the outer-product of vectors :attr:`vec1` and :attr:`vec2` +and adds it to the matrix :attr:`input`. + +Optional values :attr:`beta` and :attr:`alpha` are scaling factors on the +outer product between :attr:`vec1` and :attr:`vec2` and the added matrix +:attr:`input` respectively. + +.. math:: + \text{out} = \beta\ \text{input} + \alpha\ (\text{vec1} \otimes \text{vec2}) + +If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in +it will not be propagated. +""" + + r""" +If :attr:`vec1` is a vector of size `n` and :attr:`vec2` is a vector +of size `m`, then :attr:`input` must be +:ref:`broadcastable ` with a matrix of size +:math:`(n \times m)` and :attr:`out` will be a matrix of size +:math:`(n \times m)`. + +Args: + input (Tensor): matrix to be added + vec1 (Tensor): the first vector of the outer product + vec2 (Tensor): the second vector of the outer product + +Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`\text{{vec1}} \otimes \text{{vec2}}` (:math:`\alpha`) + {out} + +Example:: + + >>> vec1 = torch.arange(1., 4.) + >>> vec2 = torch.arange(1., 3.) + >>> M = torch.zeros(3, 2) + >>> torch.addr(M, vec1, vec2) + tensor([[ 1., 2.], + [ 2., 4.], + [ 3., 6.]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.allclose, + r""" +allclose(input, other, rtol=1e-05, atol=1e-08, equal_nan=False) -> bool + +This function checks if :attr:`input` and :attr:`other` satisfy the condition: + +.. math:: + \lvert \text{input} - \text{other} \rvert \leq \texttt{atol} + \texttt{rtol} \times \lvert \text{other} \rvert +""" + + r""" +elementwise, for all elements of :attr:`input` and :attr:`other`. The behaviour of this function is analogous to +`numpy.allclose `_ + +Args: + input (Tensor): first tensor to compare + other (Tensor): second tensor to compare + atol (float, optional): absolute tolerance. Default: 1e-08 + rtol (float, optional): relative tolerance. Default: 1e-05 + equal_nan (bool, optional): if ``True``, then two ``NaN`` s will be considered equal. Default: ``False`` + +Example:: + + >>> torch.allclose(torch.tensor([10000., 1e-07]), torch.tensor([10000.1, 1e-08])) + False + >>> torch.allclose(torch.tensor([10000., 1e-08]), torch.tensor([10000.1, 1e-09])) + True + >>> torch.allclose(torch.tensor([1.0, float('nan')]), torch.tensor([1.0, float('nan')])) + False + >>> torch.allclose(torch.tensor([1.0, float('nan')]), torch.tensor([1.0, float('nan')]), equal_nan=True) + True +""", +) + +add_docstr( + torch.all, + r""" +all(input) -> Tensor + +Tests if all elements in :attr:`input` evaluate to `True`. + +.. note:: This function matches the behaviour of NumPy in returning + output of dtype `bool` for all supported dtypes except `uint8`. + For `uint8` the dtype of output is `uint8` itself. 
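+
+For numeric inputs, any nonzero element evaluates to `True` and zeros evaluate to
+`False` (illustrative)::
+
+    >>> torch.all(torch.tensor([1.0, 0.5]))
+    tensor(True)
+    >>> torch.all(torch.tensor([1.0, 0.0]))
+    tensor(False)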
+ +Example:: + + >>> a = torch.rand(1, 2).bool() + >>> a + tensor([[False, True]], dtype=torch.bool) + >>> torch.all(a) + tensor(False, dtype=torch.bool) + >>> a = torch.arange(0, 3) + >>> a + tensor([0, 1, 2]) + >>> torch.all(a) + tensor(False) + +.. function:: all(input, dim, keepdim=False, *, out=None) -> Tensor + :noindex: + +For each row of :attr:`input` in the given dimension :attr:`dim`, +returns `True` if all elements in the row evaluate to `True` and `False` otherwise. + +{keepdim_details} + +Args: + {input} + {dim} + {keepdim} + +Keyword args: + {out} + +Example:: + + >>> a = torch.rand(4, 2).bool() + >>> a + tensor([[True, True], + [True, False], + [True, True], + [True, True]], dtype=torch.bool) + >>> torch.all(a, dim=1) + tensor([ True, False, True, True], dtype=torch.bool) + >>> torch.all(a, dim=0) + tensor([ True, False], dtype=torch.bool) +""".format( + **multi_dim_common + ), +) + +add_docstr( + torch.any, + r""" +any(input) -> Tensor + +Tests if any element in :attr:`input` evaluates to `True`. + +.. note:: This function matches the behaviour of NumPy in returning + output of dtype `bool` for all supported dtypes except `uint8`. + For `uint8` the dtype of output is `uint8` itself. + +Example:: + + >>> a = torch.rand(1, 2).bool() + >>> a + tensor([[False, True]], dtype=torch.bool) + >>> torch.any(a) + tensor(True, dtype=torch.bool) + >>> a = torch.arange(0, 3) + >>> a + tensor([0, 1, 2]) + >>> torch.any(a) + tensor(True) + +.. function:: any(input, dim, keepdim=False, *, out=None) -> Tensor + :noindex: + +For each row of :attr:`input` in the given dimension :attr:`dim`, +returns `True` if any element in the row evaluate to `True` and `False` otherwise. + +{keepdim_details} + +Args: + {input} + {dim} + {keepdim} + +Keyword args: + {out} + +Example:: + + >>> a = torch.randn(4, 2) < 0 + >>> a + tensor([[ True, True], + [False, True], + [ True, True], + [False, False]]) + >>> torch.any(a, 1) + tensor([ True, True, True, False]) + >>> torch.any(a, 0) + tensor([True, True]) +""".format( + **multi_dim_common + ), +) + +add_docstr( + torch.angle, + r""" +angle(input, *, out=None) -> Tensor + +Computes the element-wise angle (in radians) of the given :attr:`input` tensor. + +.. math:: + \text{out}_{i} = angle(\text{input}_{i}) +""" + + r""" +Args: + {input} + +Keyword args: + {out} + +.. note:: Starting in PyTorch 1.8, angle returns pi for negative real numbers, + zero for non-negative real numbers, and propagates NaNs. Previously + the function would return zero for all real numbers and not propagate + floating-point NaNs. + +Example:: + + >>> torch.angle(torch.tensor([-1 + 1j, -2 + 2j, 3 - 3j]))*180/3.14159 + tensor([ 135., 135, -45]) +""".format( + **common_args + ), +) + +add_docstr( + torch.as_strided, + r""" +as_strided(input, size, stride, storage_offset=None) -> Tensor + +Create a view of an existing `torch.Tensor` :attr:`input` with specified +:attr:`size`, :attr:`stride` and :attr:`storage_offset`. + +.. warning:: + Prefer using other view functions, like :meth:`torch.Tensor.expand`, + to setting a view's strides manually with `as_strided`, as this + function's behavior depends on the implementation of a tensor's storage. + The constructed view of the storage must only refer to elements within + the storage or a runtime error will be thrown, and if the view is + "overlapped" (with multiple indices referring to the same element in + memory) its behavior is undefined. 
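+
+As an illustrative sketch of how :attr:`size` and :attr:`stride` index the underlying
+storage (element ``[i][j]`` reads offset ``storage_offset + i*stride[0] + j*stride[1]``)::
+
+    >>> base = torch.arange(6.)
+    >>> torch.as_strided(base, (2, 2), (3, 1))
+    tensor([[0., 1.],
+            [3., 4.]])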
+ +Args: + {input} + size (tuple or ints): the shape of the output tensor + stride (tuple or ints): the stride of the output tensor + storage_offset (int, optional): the offset in the underlying storage of the output tensor. + If ``None``, the storage_offset of the output tensor will match the input tensor. + +Example:: + + >>> x = torch.randn(3, 3) + >>> x + tensor([[ 0.9039, 0.6291, 1.0795], + [ 0.1586, 2.1939, -0.4900], + [-0.1909, -0.7503, 1.9355]]) + >>> t = torch.as_strided(x, (2, 2), (1, 2)) + >>> t + tensor([[0.9039, 1.0795], + [0.6291, 0.1586]]) + >>> t = torch.as_strided(x, (2, 2), (1, 2), 1) + tensor([[0.6291, 0.1586], + [1.0795, 2.1939]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.as_tensor, + r""" +as_tensor(data, dtype=None, device=None) -> Tensor + +Converts :attr:`data` into a tensor, sharing data and preserving autograd +history if possible. + +If :attr:`data` is already a tensor with the requested dtype and device +then :attr:`data` itself is returned, but if :attr:`data` is a +tensor with a different dtype or device then it's copied as if using +`data.to(dtype=dtype, device=device)`. + +If :attr:`data` is a NumPy array (an ndarray) with the same dtype and device then a +tensor is constructed using :func:`torch.from_numpy`. + +.. seealso:: + + :func:`torch.tensor` never shares its data and creates a new "leaf tensor" (see :doc:`/notes/autograd`). + + +Args: + {data} + {dtype} + device (:class:`torch.device`, optional): the device of the constructed tensor. If None and data is a tensor + then the device of data is used. If None and data is not a tensor then + the result tensor is constructed on the current device. + + +Example:: + + >>> a = numpy.array([1, 2, 3]) + >>> t = torch.as_tensor(a) + >>> t + tensor([ 1, 2, 3]) + >>> t[0] = -1 + >>> a + array([-1, 2, 3]) + + >>> a = numpy.array([1, 2, 3]) + >>> t = torch.as_tensor(a, device=torch.device('cuda')) + >>> t + tensor([ 1, 2, 3]) + >>> t[0] = -1 + >>> a + array([1, 2, 3]) +""".format( + **factory_data_common_args + ), +) + +add_docstr( + torch.asin, + r""" +asin(input, *, out=None) -> Tensor + +Returns a new tensor with the arcsine of the elements of :attr:`input`. + +.. math:: + \text{out}_{i} = \sin^{-1}(\text{input}_{i}) +""" + + r""" +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-0.5962, 1.4985, -0.4396, 1.4525]) + >>> torch.asin(a) + tensor([-0.6387, nan, -0.4552, nan]) +""".format( + **common_args + ), +) + +add_docstr( + torch.arcsin, + r""" +arcsin(input, *, out=None) -> Tensor + +Alias for :func:`torch.asin`. +""", +) + +add_docstr( + torch.asinh, + r""" +asinh(input, *, out=None) -> Tensor + +Returns a new tensor with the inverse hyperbolic sine of the elements of :attr:`input`. + +.. math:: + \text{out}_{i} = \sinh^{-1}(\text{input}_{i}) +""" + + r""" +Args: + {input} + +Keyword arguments: + {out} + +Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.1606, -1.4267, -1.0899, -1.0250 ]) + >>> torch.asinh(a) + tensor([ 0.1599, -1.1534, -0.9435, -0.8990 ]) +""".format( + **common_args + ), +) + +add_docstr( + torch.arcsinh, + r""" +arcsinh(input, *, out=None) -> Tensor + +Alias for :func:`torch.asinh`. +""", +) + +add_docstr( + torch.atan, + r""" +atan(input, *, out=None) -> Tensor + +Returns a new tensor with the arctangent of the elements of :attr:`input`. + +.. 
math:: + \text{out}_{i} = \tan^{-1}(\text{input}_{i}) +""" + + r""" +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.2341, 0.2539, -0.6256, -0.6448]) + >>> torch.atan(a) + tensor([ 0.2299, 0.2487, -0.5591, -0.5727]) +""".format( + **common_args + ), +) + +add_docstr( + torch.arctan, + r""" +arctan(input, *, out=None) -> Tensor + +Alias for :func:`torch.atan`. +""", +) + +add_docstr( + torch.atan2, + r""" +atan2(input, other, *, out=None) -> Tensor + +Element-wise arctangent of :math:`\text{{input}}_{{i}} / \text{{other}}_{{i}}` +with consideration of the quadrant. Returns a new tensor with the signed angles +in radians between vector :math:`(\text{{other}}_{{i}}, \text{{input}}_{{i}})` +and vector :math:`(1, 0)`. (Note that :math:`\text{{other}}_{{i}}`, the second +parameter, is the x-coordinate, while :math:`\text{{input}}_{{i}}`, the first +parameter, is the y-coordinate.) + +The shapes of ``input`` and ``other`` must be +:ref:`broadcastable `. + +Args: + input (Tensor): the first input tensor + other (Tensor): the second input tensor + +Keyword args: + {out} + +Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.9041, 0.0196, -0.3108, -2.4423]) + >>> torch.atan2(a, torch.randn(4)) + tensor([ 0.9833, 0.0811, -1.9743, -1.4151]) +""".format( + **common_args + ), +) + +add_docstr( + torch.arctan2, + r""" +arctan2(input, other, *, out=None) -> Tensor +Alias for :func:`torch.atan2`. +""", +) + +add_docstr( + torch.atanh, + r""" +atanh(input, *, out=None) -> Tensor + +Returns a new tensor with the inverse hyperbolic tangent of the elements of :attr:`input`. + +Note: + The domain of the inverse hyperbolic tangent is `(-1, 1)` and values outside this range + will be mapped to ``NaN``, except for the values `1` and `-1` for which the output is + mapped to `+/-INF` respectively. + +.. math:: + \text{out}_{i} = \tanh^{-1}(\text{input}_{i}) +""" + + r""" +Args: + {input} + +Keyword arguments: + {out} + +Example:: + + >>> a = torch.randn(4).uniform_(-1, 1) + >>> a + tensor([ -0.9385, 0.2968, -0.8591, -0.1871 ]) + >>> torch.atanh(a) + tensor([ -1.7253, 0.3060, -1.2899, -0.1893 ]) +""".format( + **common_args + ), +) + +add_docstr( + torch.arctanh, + r""" +arctanh(input, *, out=None) -> Tensor + +Alias for :func:`torch.atanh`. +""", +) + +add_docstr( + torch.asarray, + r""" +asarray(obj, *, dtype=None, device=None, copy=None, requires_grad=False) -> Tensor + +Converts :attr:`obj` to a tensor. + +:attr:`obj` can be one of: + +1. a tensor +2. a NumPy array or a NumPy scalar +3. a DLPack capsule +4. an object that implements Python's buffer protocol +5. a scalar +6. a sequence of scalars + +When :attr:`obj` is a tensor, NumPy array, or DLPack capsule the returned tensor will, +by default, not require a gradient, have the same datatype as :attr:`obj`, be on the +same device, and share memory with it. These properties can be controlled with the +:attr:`dtype`, :attr:`device`, :attr:`copy`, and :attr:`requires_grad` keyword arguments. +If the returned tensor is of a different datatype, on a different device, or a copy is +requested then it will not share its memory with :attr:`obj`. If :attr:`requires_grad` +is ``True`` then the returned tensor will require a gradient, and if :attr:`obj` is +also a tensor with an autograd history then the returned tensor will have the same history. 
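+
+For instance, requesting a different datatype forces a copy (illustrative)::
+
+    >>> a = torch.tensor([1, 2, 3])
+    >>> b = torch.asarray(a, dtype=torch.float32)
+    >>> b
+    tensor([1., 2., 3.])
+    >>> a.data_ptr() == b.data_ptr()
+    False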
+ +When :attr:`obj` is not a tensor, NumPy array, or DLPack capsule but implements Python's +buffer protocol then the buffer is interpreted as an array of bytes grouped according to +the size of the datatype passed to the :attr:`dtype` keyword argument. (If no datatype is +passed then the default floating point datatype is used, instead.) The returned tensor +will have the specified datatype (or default floating point datatype if none is specified) +and, by default, be on the CPU device and share memory with the buffer. + +When :attr:`obj` is a NumPy scalar, the returned tensor will be a 0-dimensional tensor on +the CPU and that doesn't share its memory (i.e. ``copy=True``). By default datatype will +be the PyTorch datatype corresponding to the NumPy's scalar's datatype. + +When :attr:`obj` is none of the above but a scalar, or a sequence of scalars then the +returned tensor will, by default, infer its datatype from the scalar values, be on the +current default device, and not share its memory. + +.. seealso:: + + :func:`torch.tensor` creates a tensor that always copies the data from the input object. + :func:`torch.from_numpy` creates a tensor that always shares memory from NumPy arrays. + :func:`torch.frombuffer` creates a tensor that always shares memory from objects that + implement the buffer protocol. + :func:`torch.from_dlpack` creates a tensor that always shares memory from + DLPack capsules. + +Args: + obj (object): a tensor, NumPy array, DLPack Capsule, object that implements Python's + buffer protocol, scalar, or sequence of scalars. + +Keyword args: + dtype (:class:`torch.dtype`, optional): the datatype of the returned tensor. + Default: ``None``, which causes the datatype of the returned tensor to be + inferred from :attr:`obj`. + copy (bool, optional): controls whether the returned tensor shares memory with :attr:`obj`. + Default: ``None``, which causes the returned tensor to share memory with :attr:`obj` + whenever possible. If ``True`` then the returned tensor does not share its memory. + If ``False`` then the returned tensor shares its memory with :attr:`obj` and an + error is thrown if it cannot. + device (:class:`torch.device`, optional): the device of the returned tensor. + Default: ``None``, which causes the device of :attr:`obj` to be used. Or, if + :attr:`obj` is a Python sequence, the current default device will be used. + requires_grad (bool, optional): whether the returned tensor requires grad. + Default: ``False``, which causes the returned tensor not to require a gradient. + If ``True``, then the returned tensor will require a gradient, and if :attr:`obj` + is also a tensor with an autograd history then the returned tensor will have + the same history. 
+ +Example:: + + >>> a = torch.tensor([1, 2, 3]) + >>> # Shares memory with tensor 'a' + >>> b = torch.asarray(a) + >>> a.data_ptr() == b.data_ptr() + True + >>> # Forces memory copy + >>> c = torch.asarray(a, copy=True) + >>> a.data_ptr() == c.data_ptr() + False + + >>> a = torch.tensor([1., 2., 3.], requires_grad=True) + >>> b = a + 2 + >>> b + tensor([3., 4., 5.], grad_fn=) + >>> # Shares memory with tensor 'b', with no grad + >>> c = torch.asarray(b) + >>> c + tensor([3., 4., 5.]) + >>> # Shares memory with tensor 'b', retaining autograd history + >>> d = torch.asarray(b, requires_grad=True) + >>> d + tensor([3., 4., 5.], grad_fn=) + + >>> array = numpy.array([1, 2, 3]) + >>> # Shares memory with array 'array' + >>> t1 = torch.asarray(array) + >>> array.__array_interface__['data'][0] == t1.data_ptr() + True + >>> # Copies memory due to dtype mismatch + >>> t2 = torch.asarray(array, dtype=torch.float32) + >>> array.__array_interface__['data'][0] == t2.data_ptr() + False + + >>> scalar = numpy.float64(0.5) + >>> torch.asarray(scalar) + tensor(0.5000, dtype=torch.float64) +""", +) + +add_docstr( + torch.baddbmm, + r""" +baddbmm(input, batch1, batch2, *, beta=1, alpha=1, out=None) -> Tensor + +Performs a batch matrix-matrix product of matrices in :attr:`batch1` +and :attr:`batch2`. +:attr:`input` is added to the final result. + +:attr:`batch1` and :attr:`batch2` must be 3-D tensors each containing the same +number of matrices. + +If :attr:`batch1` is a :math:`(b \times n \times m)` tensor, :attr:`batch2` is a +:math:`(b \times m \times p)` tensor, then :attr:`input` must be +:ref:`broadcastable ` with a +:math:`(b \times n \times p)` tensor and :attr:`out` will be a +:math:`(b \times n \times p)` tensor. Both :attr:`alpha` and :attr:`beta` mean the +same as the scaling factors used in :meth:`torch.addbmm`. + +.. math:: + \text{out}_i = \beta\ \text{input}_i + \alpha\ (\text{batch1}_i \mathbin{@} \text{batch2}_i) + +If :attr:`beta` is 0, then :attr:`input` will be ignored, and `nan` and `inf` in +it will not be propagated. +""" + + r""" +For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and +:attr:`alpha` must be real numbers, otherwise they should be integers. + +{tf32_note} + +{rocm_fp16_note} + +Args: + input (Tensor): the tensor to be added + batch1 (Tensor): the first batch of matrices to be multiplied + batch2 (Tensor): the second batch of matrices to be multiplied + +Keyword args: + beta (Number, optional): multiplier for :attr:`input` (:math:`\beta`) + alpha (Number, optional): multiplier for :math:`\text{{batch1}} \mathbin{{@}} \text{{batch2}}` (:math:`\alpha`) + {out} + +Example:: + + >>> M = torch.randn(10, 3, 5) + >>> batch1 = torch.randn(10, 3, 4) + >>> batch2 = torch.randn(10, 4, 5) + >>> torch.baddbmm(M, batch1, batch2).size() + torch.Size([10, 3, 5]) +""".format( + **common_args, **tf32_notes, **rocm_fp16_notes + ), +) + +add_docstr( + torch.bernoulli, + r""" +bernoulli(input, *, generator=None, out=None) -> Tensor + +Draws binary random numbers (0 or 1) from a Bernoulli distribution. + +The :attr:`input` tensor should be a tensor containing probabilities +to be used for drawing the binary random number. +Hence, all values in :attr:`input` have to be in the range: +:math:`0 \leq \text{input}_i \leq 1`. + +The :math:`\text{i}^{th}` element of the output tensor will draw a +value :math:`1` according to the :math:`\text{i}^{th}` probability value given +in :attr:`input`. + +.. 
math:: + \text{out}_{i} \sim \mathrm{Bernoulli}(p = \text{input}_{i}) +""" + + r""" +The returned :attr:`out` tensor only has values 0 or 1 and is of the same +shape as :attr:`input`. + +:attr:`out` can have integral ``dtype``, but :attr:`input` must have floating +point ``dtype``. + +Args: + input (Tensor): the input tensor of probability values for the Bernoulli distribution + +Keyword args: + {generator} + {out} + +Example:: + + >>> a = torch.empty(3, 3).uniform_(0, 1) # generate a uniform random matrix with range [0, 1] + >>> a + tensor([[ 0.1737, 0.0950, 0.3609], + [ 0.7148, 0.0289, 0.2676], + [ 0.9456, 0.8937, 0.7202]]) + >>> torch.bernoulli(a) + tensor([[ 1., 0., 0.], + [ 0., 0., 0.], + [ 1., 1., 1.]]) + + >>> a = torch.ones(3, 3) # probability of drawing "1" is 1 + >>> torch.bernoulli(a) + tensor([[ 1., 1., 1.], + [ 1., 1., 1.], + [ 1., 1., 1.]]) + >>> a = torch.zeros(3, 3) # probability of drawing "1" is 0 + >>> torch.bernoulli(a) + tensor([[ 0., 0., 0.], + [ 0., 0., 0.], + [ 0., 0., 0.]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.bincount, + r""" +bincount(input, weights=None, minlength=0) -> Tensor + +Count the frequency of each value in an array of non-negative ints. + +The number of bins (size 1) is one larger than the largest value in +:attr:`input` unless :attr:`input` is empty, in which case the result is a +tensor of size 0. If :attr:`minlength` is specified, the number of bins is at least +:attr:`minlength` and if :attr:`input` is empty, then the result is tensor of size +:attr:`minlength` filled with zeros. If ``n`` is the value at position ``i``, +``out[n] += weights[i]`` if :attr:`weights` is specified else +``out[n] += 1``. + +Note: + {backward_reproducibility_note} + +Arguments: + input (Tensor): 1-d int tensor + weights (Tensor): optional, weight for each value in the input tensor. + Should be of same size as input tensor. + minlength (int): optional, minimum number of bins. Should be non-negative. + +Returns: + output (Tensor): a tensor of shape ``Size([max(input) + 1])`` if + :attr:`input` is non-empty, else ``Size(0)`` + +Example:: + + >>> input = torch.randint(0, 8, (5,), dtype=torch.int64) + >>> weights = torch.linspace(0, 1, steps=5) + >>> input, weights + (tensor([4, 3, 6, 3, 4]), + tensor([ 0.0000, 0.2500, 0.5000, 0.7500, 1.0000]) + + >>> torch.bincount(input) + tensor([0, 0, 0, 2, 2, 0, 1]) + + >>> input.bincount(weights) + tensor([0.0000, 0.0000, 0.0000, 1.0000, 1.0000, 0.0000, 0.5000]) +""".format( + **reproducibility_notes + ), +) + +add_docstr( + torch.bitwise_not, + r""" +bitwise_not(input, *, out=None) -> Tensor + +Computes the bitwise NOT of the given input tensor. The input tensor must be of +integral or Boolean types. For bool tensors, it computes the logical NOT. + +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> torch.bitwise_not(torch.tensor([-1, -2, 3], dtype=torch.int8)) + tensor([ 0, 1, -4], dtype=torch.int8) +""".format( + **common_args + ), +) + +add_docstr( + torch.bmm, + r""" +bmm(input, mat2, *, out=None) -> Tensor + +Performs a batch matrix-matrix product of matrices stored in :attr:`input` +and :attr:`mat2`. + +:attr:`input` and :attr:`mat2` must be 3-D tensors each containing +the same number of matrices. + +If :attr:`input` is a :math:`(b \times n \times m)` tensor, :attr:`mat2` is a +:math:`(b \times m \times p)` tensor, :attr:`out` will be a +:math:`(b \times n \times p)` tensor. + +.. 
math:: + \text{out}_i = \text{input}_i \mathbin{@} \text{mat2}_i +""" + + r""" +{tf32_note} + +{rocm_fp16_note} + +.. note:: This function does not :ref:`broadcast `. + For broadcasting matrix products, see :func:`torch.matmul`. + +Args: + input (Tensor): the first batch of matrices to be multiplied + mat2 (Tensor): the second batch of matrices to be multiplied + +Keyword Args: + {out} + +Example:: + + >>> input = torch.randn(10, 3, 4) + >>> mat2 = torch.randn(10, 4, 5) + >>> res = torch.bmm(input, mat2) + >>> res.size() + torch.Size([10, 3, 5]) +""".format( + **common_args, **tf32_notes, **rocm_fp16_notes + ), +) + +add_docstr( + torch.bitwise_and, + r""" +bitwise_and(input, other, *, out=None) -> Tensor + +Computes the bitwise AND of :attr:`input` and :attr:`other`. The input tensor must be of +integral or Boolean types. For bool tensors, it computes the logical AND. + +Args: + input: the first input tensor + other: the second input tensor + +Keyword args: + {out} + +Example:: + + >>> torch.bitwise_and(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([1, 0, 3], dtype=torch.int8) + >>> torch.bitwise_and(torch.tensor([True, True, False]), torch.tensor([False, True, False])) + tensor([ False, True, False]) +""".format( + **common_args + ), +) + +add_docstr( + torch.bitwise_or, + r""" +bitwise_or(input, other, *, out=None) -> Tensor + +Computes the bitwise OR of :attr:`input` and :attr:`other`. The input tensor must be of +integral or Boolean types. For bool tensors, it computes the logical OR. + +Args: + input: the first input tensor + other: the second input tensor + +Keyword args: + {out} + +Example:: + + >>> torch.bitwise_or(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-1, -2, 3], dtype=torch.int8) + >>> torch.bitwise_or(torch.tensor([True, True, False]), torch.tensor([False, True, False])) + tensor([ True, True, False]) +""".format( + **common_args + ), +) + +add_docstr( + torch.bitwise_xor, + r""" +bitwise_xor(input, other, *, out=None) -> Tensor + +Computes the bitwise XOR of :attr:`input` and :attr:`other`. The input tensor must be of +integral or Boolean types. For bool tensors, it computes the logical XOR. + +Args: + input: the first input tensor + other: the second input tensor + +Keyword args: + {out} + +Example:: + + >>> torch.bitwise_xor(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-2, -2, 0], dtype=torch.int8) + >>> torch.bitwise_xor(torch.tensor([True, True, False]), torch.tensor([False, True, False])) + tensor([ True, False, False]) +""".format( + **common_args + ), +) + +add_docstr( + torch.bitwise_left_shift, + r""" +bitwise_left_shift(input, other, *, out=None) -> Tensor + +Computes the left arithmetic shift of :attr:`input` by :attr:`other` bits. +The input tensor must be of integral type. This operator supports +:ref:`broadcasting to a common shape ` and +:ref:`type promotion `. + +The operation applied is: + +.. 
math:: + \text{{out}}_i = \text{{input}}_i << \text{{other}}_i + +Args: + input (Tensor or Scalar): the first input tensor + other (Tensor or Scalar): the second input tensor + +Keyword args: + {out} + +Example:: + + >>> torch.bitwise_left_shift(torch.tensor([-1, -2, 3], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-2, -2, 24], dtype=torch.int8) +""".format( + **common_args + ), +) + +add_docstr( + torch.bitwise_right_shift, + r""" +bitwise_right_shift(input, other, *, out=None) -> Tensor + +Computes the right arithmetic shift of :attr:`input` by :attr:`other` bits. +The input tensor must be of integral type. This operator supports +:ref:`broadcasting to a common shape ` and +:ref:`type promotion `. +In any case, if the value of the right operand is negative or is greater +or equal to the number of bits in the promoted left operand, the behavior is undefined. + +The operation applied is: + +.. math:: + \text{{out}}_i = \text{{input}}_i >> \text{{other}}_i + +Args: + input (Tensor or Scalar): the first input tensor + other (Tensor or Scalar): the second input tensor + +Keyword args: + {out} + +Example:: + + >>> torch.bitwise_right_shift(torch.tensor([-2, -7, 31], dtype=torch.int8), torch.tensor([1, 0, 3], dtype=torch.int8)) + tensor([-1, -7, 3], dtype=torch.int8) +""".format( + **common_args + ), +) + +add_docstr( + torch.broadcast_to, + r""" +broadcast_to(input, shape) -> Tensor + +Broadcasts :attr:`input` to the shape :attr:`\shape`. +Equivalent to calling ``input.expand(shape)``. See :meth:`~Tensor.expand` for details. + +Args: + {input} + shape (list, tuple, or :class:`torch.Size`): the new shape. + +Example:: + + >>> x = torch.tensor([1, 2, 3]) + >>> torch.broadcast_to(x, (3, 3)) + tensor([[1, 2, 3], + [1, 2, 3], + [1, 2, 3]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.stack, + r""" +stack(tensors, dim=0, *, out=None) -> Tensor + +Concatenates a sequence of tensors along a new dimension. + +All tensors need to be of the same size. + +.. seealso:: + + :func:`torch.cat` concatenates the given sequence along an existing dimension. + +Arguments: + tensors (sequence of Tensors): sequence of tensors to concatenate + dim (int, optional): dimension to insert. Has to be between 0 and the number + of dimensions of concatenated tensors (inclusive). Default: 0 + +Keyword args: + {out} + +Example:: + + >>> x = torch.randn(2, 3) + >>> x + tensor([[ 0.3367, 0.1288, 0.2345], + [ 0.2303, -1.1229, -0.1863]]) + >>> x = torch.stack((x, x)) # same as torch.stack((x, x), dim=0) + >>> x + tensor([[[ 0.3367, 0.1288, 0.2345], + [ 0.2303, -1.1229, -0.1863]], + + [[ 0.3367, 0.1288, 0.2345], + [ 0.2303, -1.1229, -0.1863]]]) + >>> x.size() + torch.Size([2, 2, 3]) + >>> x = torch.stack((x, x), dim=1) + tensor([[[ 0.3367, 0.1288, 0.2345], + [ 0.3367, 0.1288, 0.2345]], + + [[ 0.2303, -1.1229, -0.1863], + [ 0.2303, -1.1229, -0.1863]]]) + >>> x = torch.stack((x, x), dim=2) + tensor([[[ 0.3367, 0.3367], + [ 0.1288, 0.1288], + [ 0.2345, 0.2345]], + + [[ 0.2303, 0.2303], + [-1.1229, -1.1229], + [-0.1863, -0.1863]]]) + >>> x = torch.stack((x, x), dim=-1) + tensor([[[ 0.3367, 0.3367], + [ 0.1288, 0.1288], + [ 0.2345, 0.2345]], + + [[ 0.2303, 0.2303], + [-1.1229, -1.1229], + [-0.1863, -0.1863]]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.hstack, + r""" +hstack(tensors, *, out=None) -> Tensor + +Stack tensors in sequence horizontally (column wise). 
+ +This is equivalent to concatenation along the first axis for 1-D tensors, and along the second axis for all other tensors. + +Args: + tensors (sequence of Tensors): sequence of tensors to concatenate + +Keyword args: + {out} + +Example:: + + >>> a = torch.tensor([1, 2, 3]) + >>> b = torch.tensor([4, 5, 6]) + >>> torch.hstack((a,b)) + tensor([1, 2, 3, 4, 5, 6]) + >>> a = torch.tensor([[1],[2],[3]]) + >>> b = torch.tensor([[4],[5],[6]]) + >>> torch.hstack((a,b)) + tensor([[1, 4], + [2, 5], + [3, 6]]) + +""".format( + **common_args + ), +) + +add_docstr( + torch.vstack, + r""" +vstack(tensors, *, out=None) -> Tensor + +Stack tensors in sequence vertically (row wise). + +This is equivalent to concatenation along the first axis after all 1-D tensors have been reshaped by :func:`torch.atleast_2d`. + +Args: + tensors (sequence of Tensors): sequence of tensors to concatenate + +Keyword args: + {out} + +Example:: + + >>> a = torch.tensor([1, 2, 3]) + >>> b = torch.tensor([4, 5, 6]) + >>> torch.vstack((a,b)) + tensor([[1, 2, 3], + [4, 5, 6]]) + >>> a = torch.tensor([[1],[2],[3]]) + >>> b = torch.tensor([[4],[5],[6]]) + >>> torch.vstack((a,b)) + tensor([[1], + [2], + [3], + [4], + [5], + [6]]) + + +""".format( + **common_args + ), +) + +add_docstr( + torch.dstack, + r""" +dstack(tensors, *, out=None) -> Tensor + +Stack tensors in sequence depthwise (along third axis). + +This is equivalent to concatenation along the third axis after 1-D and 2-D tensors have been reshaped by :func:`torch.atleast_3d`. + +Args: + tensors (sequence of Tensors): sequence of tensors to concatenate + +Keyword args: + {out} + +Example:: + + >>> a = torch.tensor([1, 2, 3]) + >>> b = torch.tensor([4, 5, 6]) + >>> torch.dstack((a,b)) + tensor([[[1, 4], + [2, 5], + [3, 6]]]) + >>> a = torch.tensor([[1],[2],[3]]) + >>> b = torch.tensor([[4],[5],[6]]) + >>> torch.dstack((a,b)) + tensor([[[1, 4]], + [[2, 5]], + [[3, 6]]]) + + +""".format( + **common_args + ), +) + +add_docstr( + torch.tensor_split, + r""" +tensor_split(input, indices_or_sections, dim=0) -> List of Tensors + +Splits a tensor into multiple sub-tensors, all of which are views of :attr:`input`, +along dimension :attr:`dim` according to the indices or number of sections specified +by :attr:`indices_or_sections`. This function is based on NumPy's +:func:`numpy.array_split`. + +Args: + input (Tensor): the tensor to split + indices_or_sections (Tensor, int or list or tuple of ints): + If :attr:`indices_or_sections` is an integer ``n`` or a zero dimensional long tensor + with value ``n``, :attr:`input` is split into ``n`` sections along dimension :attr:`dim`. + If :attr:`input` is divisible by ``n`` along dimension :attr:`dim`, each + section will be of equal size, :code:`input.size(dim) / n`. If :attr:`input` + is not divisible by ``n``, the sizes of the first :code:`int(input.size(dim) % n)` + sections will have size :code:`int(input.size(dim) / n) + 1`, and the rest will + have size :code:`int(input.size(dim) / n)`. + + If :attr:`indices_or_sections` is a list or tuple of ints, or a one-dimensional long + tensor, then :attr:`input` is split along dimension :attr:`dim` at each of the indices + in the list, tuple or tensor. For instance, :code:`indices_or_sections=[2, 3]` and :code:`dim=0` + would result in the tensors :code:`input[:2]`, :code:`input[2:3]`, and :code:`input[3:]`. + + If :attr:`indices_or_sections` is a tensor, it must be a zero-dimensional or one-dimensional + long tensor on the CPU. + + dim (int, optional): dimension along which to split the tensor. 
Default: ``0`` + +Example:: + + >>> x = torch.arange(8) + >>> torch.tensor_split(x, 3) + (tensor([0, 1, 2]), tensor([3, 4, 5]), tensor([6, 7])) + + >>> x = torch.arange(7) + >>> torch.tensor_split(x, 3) + (tensor([0, 1, 2]), tensor([3, 4]), tensor([5, 6])) + >>> torch.tensor_split(x, (1, 6)) + (tensor([0]), tensor([1, 2, 3, 4, 5]), tensor([6])) + + >>> x = torch.arange(14).reshape(2, 7) + >>> x + tensor([[ 0, 1, 2, 3, 4, 5, 6], + [ 7, 8, 9, 10, 11, 12, 13]]) + >>> torch.tensor_split(x, 3, dim=1) + (tensor([[0, 1, 2], + [7, 8, 9]]), + tensor([[ 3, 4], + [10, 11]]), + tensor([[ 5, 6], + [12, 13]])) + >>> torch.tensor_split(x, (1, 6), dim=1) + (tensor([[0], + [7]]), + tensor([[ 1, 2, 3, 4, 5], + [ 8, 9, 10, 11, 12]]), + tensor([[ 6], + [13]])) +""", +) + +add_docstr( + torch.chunk, + r""" +chunk(input, chunks, dim=0) -> List of Tensors + +Attempts to split a tensor into the specified number of chunks. Each chunk is a view of +the input tensor. + + +.. note:: + + This function may return fewer than the specified number of chunks! + +.. seealso:: + + :func:`torch.tensor_split` a function that always returns exactly the specified number of chunks + +If the tensor size along the given dimension :attr:`dim` is divisible by :attr:`chunks`, +all returned chunks will be the same size. +If the tensor size along the given dimension :attr:`dim` is not divisible by :attr:`chunks`, +all returned chunks will be the same size, except the last one. +If such division is not possible, this function may return fewer +than the specified number of chunks. + +Arguments: + input (Tensor): the tensor to split + chunks (int): number of chunks to return + dim (int): dimension along which to split the tensor + +Example: + >>> torch.arange(11).chunk(6) + (tensor([0, 1]), + tensor([2, 3]), + tensor([4, 5]), + tensor([6, 7]), + tensor([8, 9]), + tensor([10])) + >>> torch.arange(12).chunk(6) + (tensor([0, 1]), + tensor([2, 3]), + tensor([4, 5]), + tensor([6, 7]), + tensor([8, 9]), + tensor([10, 11])) + >>> torch.arange(13).chunk(6) + (tensor([0, 1, 2]), + tensor([3, 4, 5]), + tensor([6, 7, 8]), + tensor([ 9, 10, 11]), + tensor([12])) +""", +) + +add_docstr( + torch.unsafe_chunk, + r""" +unsafe_chunk(input, chunks, dim=0) -> List of Tensors + +Works like :func:`torch.chunk` but without enforcing the autograd restrictions +on inplace modification of the outputs. + +.. warning:: + This function is safe to use as long as only the input, or only the outputs + are modified inplace after calling this function. It is user's + responsibility to ensure that is the case. If both the input and one or more + of the outputs are modified inplace, gradients computed by autograd will be + silently incorrect. +""", +) + +add_docstr( + torch.unsafe_split, + r""" +unsafe_split(tensor, split_size_or_sections, dim=0) -> List of Tensors + +Works like :func:`torch.split` but without enforcing the autograd restrictions +on inplace modification of the outputs. + +.. warning:: + This function is safe to use as long as only the input, or only the outputs + are modified inplace after calling this function. It is user's + responsibility to ensure that is the case. If both the input and one or more + of the outputs are modified inplace, gradients computed by autograd will be + silently incorrect. +""", +) + +add_docstr( + torch.hsplit, + r""" +hsplit(input, indices_or_sections) -> List of Tensors + +Splits :attr:`input`, a tensor with one or more dimensions, into multiple tensors +horizontally according to :attr:`indices_or_sections`. 
Each split is a view of +:attr:`input`. + +If :attr:`input` is one dimensional this is equivalent to calling +torch.tensor_split(input, indices_or_sections, dim=0) (the split dimension is +zero), and if :attr:`input` has two or more dimensions it's equivalent to calling +torch.tensor_split(input, indices_or_sections, dim=1) (the split dimension is 1), +except that if :attr:`indices_or_sections` is an integer it must evenly divide +the split dimension or a runtime error will be thrown. + +This function is based on NumPy's :func:`numpy.hsplit`. + +Args: + input (Tensor): tensor to split. + indices_or_sections (int or list or tuple of ints): See argument in :func:`torch.tensor_split`. + +Example:: + >>> t = torch.arange(16.0).reshape(4,4) + >>> t + tensor([[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.], + [ 8., 9., 10., 11.], + [12., 13., 14., 15.]]) + >>> torch.hsplit(t, 2) + (tensor([[ 0., 1.], + [ 4., 5.], + [ 8., 9.], + [12., 13.]]), + tensor([[ 2., 3.], + [ 6., 7.], + [10., 11.], + [14., 15.]])) + >>> torch.hsplit(t, [3, 6]) + (tensor([[ 0., 1., 2.], + [ 4., 5., 6.], + [ 8., 9., 10.], + [12., 13., 14.]]), + tensor([[ 3.], + [ 7.], + [11.], + [15.]]), + tensor([], size=(4, 0))) + +""", +) + +add_docstr( + torch.vsplit, + r""" +vsplit(input, indices_or_sections) -> List of Tensors + +Splits :attr:`input`, a tensor with two or more dimensions, into multiple tensors +vertically according to :attr:`indices_or_sections`. Each split is a view of +:attr:`input`. + +This is equivalent to calling torch.tensor_split(input, indices_or_sections, dim=0) +(the split dimension is 0), except that if :attr:`indices_or_sections` is an integer +it must evenly divide the split dimension or a runtime error will be thrown. + +This function is based on NumPy's :func:`numpy.vsplit`. + +Args: + input (Tensor): tensor to split. + indices_or_sections (int or list or tuple of ints): See argument in :func:`torch.tensor_split`. + +Example:: + >>> t = torch.arange(16.0).reshape(4,4) + >>> t + tensor([[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.], + [ 8., 9., 10., 11.], + [12., 13., 14., 15.]]) + >>> torch.vsplit(t, 2) + (tensor([[0., 1., 2., 3.], + [4., 5., 6., 7.]]), + tensor([[ 8., 9., 10., 11.], + [12., 13., 14., 15.]])) + >>> torch.vsplit(t, [3, 6]) + (tensor([[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.], + [ 8., 9., 10., 11.]]), + tensor([[12., 13., 14., 15.]]), + tensor([], size=(0, 4))) + +""", +) + +add_docstr( + torch.dsplit, + r""" +dsplit(input, indices_or_sections) -> List of Tensors + +Splits :attr:`input`, a tensor with three or more dimensions, into multiple tensors +depthwise according to :attr:`indices_or_sections`. Each split is a view of +:attr:`input`. + +This is equivalent to calling torch.tensor_split(input, indices_or_sections, dim=2) +(the split dimension is 2), except that if :attr:`indices_or_sections` is an integer +it must evenly divide the split dimension or a runtime error will be thrown. + +This function is based on NumPy's :func:`numpy.dsplit`. + +Args: + input (Tensor): tensor to split. + indices_or_sections (int or list or tuple of ints): See argument in :func:`torch.tensor_split`. 
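
The stated equivalence with :func:`torch.tensor_split` along ``dim=2`` can be
checked directly (an illustrative sketch only, assuming a 3-D input ``t``):

.. code:: python

    t = torch.arange(16.0).reshape(2, 2, 4)
    left = torch.dsplit(t, 2)
    right = torch.tensor_split(t, 2, dim=2)
    assert all(torch.equal(a, b) for a, b in zip(left, right))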
+ +Example:: + >>> t = torch.arange(16.0).reshape(2, 2, 4) + >>> t + tensor([[[ 0., 1., 2., 3.], + [ 4., 5., 6., 7.]], + [[ 8., 9., 10., 11.], + [12., 13., 14., 15.]]]) + >>> torch.dsplit(t, 2) + (tensor([[[ 0., 1.], + [ 4., 5.]], + [[ 8., 9.], + [12., 13.]]]), + tensor([[[ 2., 3.], + [ 6., 7.]], + [[10., 11.], + [14., 15.]]])) + + >>> torch.dsplit(t, [3, 6]) + (tensor([[[ 0., 1., 2.], + [ 4., 5., 6.]], + [[ 8., 9., 10.], + [12., 13., 14.]]]), + tensor([[[ 3.], + [ 7.]], + [[11.], + [15.]]]), + tensor([], size=(2, 2, 0))) + +""", +) + +add_docstr( + torch.can_cast, + r""" +can_cast(from, to) -> bool + +Determines if a type conversion is allowed under PyTorch casting rules +described in the type promotion :ref:`documentation `. + +Args: + from (dtype): The original :class:`torch.dtype`. + to (dtype): The target :class:`torch.dtype`. + +Example:: + + >>> torch.can_cast(torch.double, torch.float) + True + >>> torch.can_cast(torch.float, torch.int) + False +""", +) + +add_docstr( + torch.corrcoef, + r""" +corrcoef(input) -> Tensor + +Estimates the Pearson product-moment correlation coefficient matrix of the variables given by the :attr:`input` matrix, +where rows are the variables and columns are the observations. + +.. note:: + + The correlation coefficient matrix R is computed using the covariance matrix C as given by + :math:`R_{ij} = \frac{ C_{ij} } { \sqrt{ C_{ii} * C_{jj} } }` + +.. note:: + + Due to floating point rounding, the resulting array may not be Hermitian and its diagonal elements may not be 1. + The real and imaginary values are clipped to the interval [-1, 1] in an attempt to improve this situation. + +Args: + input (Tensor): A 2D matrix containing multiple variables and observations, or a + Scalar or 1D vector representing a single variable. + +Returns: + (Tensor) The correlation coefficient matrix of the variables. + +.. seealso:: + + :func:`torch.cov` covariance matrix. + +Example:: + + >>> x = torch.tensor([[0, 1, 2], [2, 1, 0]]) + >>> torch.corrcoef(x) + tensor([[ 1., -1.], + [-1., 1.]]) + >>> x = torch.randn(2, 4) + >>> x + tensor([[-0.2678, -0.0908, -0.3766, 0.2780], + [-0.5812, 0.1535, 0.2387, 0.2350]]) + >>> torch.corrcoef(x) + tensor([[1.0000, 0.3582], + [0.3582, 1.0000]]) + >>> torch.corrcoef(x[0]) + tensor(1.) +""", +) + +add_docstr( + torch.cov, + r""" +cov(input, *, correction=1, fweights=None, aweights=None) -> Tensor + +Estimates the covariance matrix of the variables given by the :attr:`input` matrix, where rows are +the variables and columns are the observations. + +A covariance matrix is a square matrix giving the covariance of each pair of variables. The diagonal contains +the variance of each variable (covariance of a variable with itself). By definition, if :attr:`input` represents +a single variable (Scalar or 1D) then its variance is returned. + +The sample covariance of the variables :math:`x` and :math:`y` is given by: + +.. math:: + \text{cov}(x,y) = \frac{\sum^{N}_{i = 1}(x_{i} - \bar{x})(y_{i} - \bar{y})}{\max(0,~N~-~\delta N)} + +where :math:`\bar{x}` and :math:`\bar{y}` are the simple means of the :math:`x` and :math:`y` respectively, and +:math:`\delta N` is the :attr:`correction`. + +If :attr:`fweights` and/or :attr:`aweights` are provided, the weighted covariance +is calculated, which is given by: + +.. 
math:: + \text{cov}_w(x,y) = \frac{\sum^{N}_{i = 1}w_i(x_{i} - \mu_x^*)(y_{i} - \mu_y^*)} + {\max(0,~\sum^{N}_{i = 1}w_i~-~\frac{\sum^{N}_{i = 1}w_ia_i}{\sum^{N}_{i = 1}w_i}~\delta N)} + +where :math:`w` denotes :attr:`fweights` or :attr:`aweights` (``f`` and ``a`` for brevity) based on whichever is +provided, or :math:`w = f \times a` if both are provided, and +:math:`\mu_x^* = \frac{\sum^{N}_{i = 1}w_ix_{i} }{\sum^{N}_{i = 1}w_i}` is the weighted mean of the variable. If not +provided, ``f`` and/or ``a`` can be seen as a :math:`\mathbb{1}` vector of appropriate size. + +Args: + input (Tensor): A 2D matrix containing multiple variables and observations, or a + Scalar or 1D vector representing a single variable. + +Keyword Args: + correction (int, optional): difference between the sample size and sample degrees of freedom. + Defaults to Bessel's correction, ``correction = 1`` which returns the unbiased estimate, + even if both :attr:`fweights` and :attr:`aweights` are specified. ``correction = 0`` + will return the simple average. Defaults to ``1``. + fweights (tensor, optional): A Scalar or 1D tensor of observation vector frequencies representing the number of + times each observation should be repeated. Its numel must equal the number of columns of :attr:`input`. + Must have integral dtype. Ignored if ``None``. Defaults to ``None``. + aweights (tensor, optional): A Scalar or 1D array of observation vector weights. + These relative weights are typically large for observations considered "important" and smaller for + observations considered less "important". Its numel must equal the number of columns of :attr:`input`. + Must have floating point dtype. Ignored if ``None``. Defaults to ``None``. + +Returns: + (Tensor) The covariance matrix of the variables. + +.. seealso:: + + :func:`torch.corrcoef` normalized covariance matrix. + +Example:: + >>> x = torch.tensor([[0, 2], [1, 1], [2, 0]]).T + >>> x + tensor([[0, 1, 2], + [2, 1, 0]]) + >>> torch.cov(x) + tensor([[ 1., -1.], + [-1., 1.]]) + >>> torch.cov(x, correction=0) + tensor([[ 0.6667, -0.6667], + [-0.6667, 0.6667]]) + >>> fw = torch.randint(1, 10, (3,)) + >>> fw + tensor([1, 6, 9]) + >>> aw = torch.rand(3) + >>> aw + tensor([0.4282, 0.0255, 0.4144]) + >>> torch.cov(x, fweights=fw, aweights=aw) + tensor([[ 0.4169, -0.4169], + [-0.4169, 0.4169]]) +""", +) + +add_docstr( + torch.cat, + r""" +cat(tensors, dim=0, *, out=None) -> Tensor + +Concatenates the given sequence of :attr:`seq` tensors in the given dimension. +All tensors must either have the same shape (except in the concatenating +dimension) or be a 1-D empty tensor with size ``(0,)``. + +:func:`torch.cat` can be seen as an inverse operation for :func:`torch.split` +and :func:`torch.chunk`. + +:func:`torch.cat` can be best understood via examples. + +.. seealso:: + + :func:`torch.stack` concatenates the given sequence along a new dimension. + +Args: + tensors (sequence of Tensors): any python sequence of tensors of the same type. + Non-empty tensors provided must have the same shape, except in the + cat dimension. 
+ dim (int, optional): the dimension over which the tensors are concatenated + +Keyword args: + {out} + +Example:: + + >>> x = torch.randn(2, 3) + >>> x + tensor([[ 0.6580, -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497]]) + >>> torch.cat((x, x, x), 0) + tensor([[ 0.6580, -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497], + [ 0.6580, -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497], + [ 0.6580, -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497]]) + >>> torch.cat((x, x, x), 1) + tensor([[ 0.6580, -1.0969, -0.4614, 0.6580, -1.0969, -0.4614, 0.6580, + -1.0969, -0.4614], + [-0.1034, -0.5790, 0.1497, -0.1034, -0.5790, 0.1497, -0.1034, + -0.5790, 0.1497]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.concat, + r""" +concat(tensors, dim=0, *, out=None) -> Tensor + +Alias of :func:`torch.cat`. +""", +) + +add_docstr( + torch.concatenate, + r""" +concatenate(tensors, axis=0, out=None) -> Tensor + +Alias of :func:`torch.cat`. +""", +) + +add_docstr( + torch.ceil, + r""" +ceil(input, *, out=None) -> Tensor + +Returns a new tensor with the ceil of the elements of :attr:`input`, +the smallest integer greater than or equal to each element. + +For integer inputs, follows the array-api convention of returning a +copy of the input tensor. + +.. math:: + \text{out}_{i} = \left\lceil \text{input}_{i} \right\rceil +""" + + r""" +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-0.6341, -1.4208, -1.0900, 0.5826]) + >>> torch.ceil(a) + tensor([-0., -1., -1., 1.]) +""".format( + **common_args + ), +) + +add_docstr( + torch.real, + r""" +real(input) -> Tensor + +Returns a new tensor containing real values of the :attr:`self` tensor. +The returned tensor and :attr:`self` share the same underlying storage. + +Args: + {input} + +Example:: + + >>> x=torch.randn(4, dtype=torch.cfloat) + >>> x + tensor([(0.3100+0.3553j), (-0.5445-0.7896j), (-1.6492-0.0633j), (-0.0638-0.8119j)]) + >>> x.real + tensor([ 0.3100, -0.5445, -1.6492, -0.0638]) + +""".format( + **common_args + ), +) + +add_docstr( + torch.imag, + r""" +imag(input) -> Tensor + +Returns a new tensor containing imaginary values of the :attr:`self` tensor. +The returned tensor and :attr:`self` share the same underlying storage. + +.. warning:: + :func:`imag` is only supported for tensors with complex dtypes. + +Args: + {input} + +Example:: + + >>> x=torch.randn(4, dtype=torch.cfloat) + >>> x + tensor([(0.3100+0.3553j), (-0.5445-0.7896j), (-1.6492-0.0633j), (-0.0638-0.8119j)]) + >>> x.imag + tensor([ 0.3553, -0.7896, -0.0633, -0.8119]) + +""".format( + **common_args + ), +) + +add_docstr( + torch.view_as_real, + r""" +view_as_real(input) -> Tensor + +Returns a view of :attr:`input` as a real tensor. For an input complex tensor of +:attr:`size` :math:`m1, m2, \dots, mi`, this function returns a new +real tensor of size :math:`m1, m2, \dots, mi, 2`, where the last dimension of size 2 +represents the real and imaginary components of complex numbers. + +.. warning:: + :func:`view_as_real` is only supported for tensors with ``complex dtypes``. + +Args: + {input} + +Example:: + + >>> x=torch.randn(4, dtype=torch.cfloat) + >>> x + tensor([(0.4737-0.3839j), (-0.2098-0.6699j), (0.3470-0.9451j), (-0.5174-1.3136j)]) + >>> torch.view_as_real(x) + tensor([[ 0.4737, -0.3839], + [-0.2098, -0.6699], + [ 0.3470, -0.9451], + [-0.5174, -1.3136]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.view_as_complex, + r""" +view_as_complex(input) -> Tensor + +Returns a view of :attr:`input` as a complex tensor. 
For an input complex +tensor of :attr:`size` :math:`m1, m2, \dots, mi, 2`, this function returns a +new complex tensor of :attr:`size` :math:`m1, m2, \dots, mi` where the last +dimension of the input tensor is expected to represent the real and imaginary +components of complex numbers. + +.. warning:: + :func:`view_as_complex` is only supported for tensors with + :class:`torch.dtype` ``torch.float64`` and ``torch.float32``. The input is + expected to have the last dimension of :attr:`size` 2. In addition, the + tensor must have a `stride` of 1 for its last dimension. The strides of all + other dimensions must be even numbers. + +Args: + {input} + +Example:: + + >>> x=torch.randn(4, 2) + >>> x + tensor([[ 1.6116, -0.5772], + [-1.4606, -0.9120], + [ 0.0786, -1.7497], + [-0.6561, -1.6623]]) + >>> torch.view_as_complex(x) + tensor([(1.6116-0.5772j), (-1.4606-0.9120j), (0.0786-1.7497j), (-0.6561-1.6623j)]) +""".format( + **common_args + ), +) + +add_docstr( + torch.reciprocal, + r""" +reciprocal(input, *, out=None) -> Tensor + +Returns a new tensor with the reciprocal of the elements of :attr:`input` + +.. math:: + \text{out}_{i} = \frac{1}{\text{input}_{i}} + +.. note:: + Unlike NumPy's reciprocal, torch.reciprocal supports integral inputs. Integral + inputs to reciprocal are automatically :ref:`promoted ` to + the default scalar type. +""" + + r""" +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-0.4595, -2.1219, -1.4314, 0.7298]) + >>> torch.reciprocal(a) + tensor([-2.1763, -0.4713, -0.6986, 1.3702]) +""".format( + **common_args + ), +) + +add_docstr( + torch.cholesky, + r""" +cholesky(input, upper=False, *, out=None) -> Tensor + +Computes the Cholesky decomposition of a symmetric positive-definite +matrix :math:`A` or for batches of symmetric positive-definite matrices. + +If :attr:`upper` is ``True``, the returned matrix ``U`` is upper-triangular, and +the decomposition has the form: + +.. math:: + + A = U^TU + +If :attr:`upper` is ``False``, the returned matrix ``L`` is lower-triangular, and +the decomposition has the form: + +.. math:: + + A = LL^T + +If :attr:`upper` is ``True``, and :math:`A` is a batch of symmetric positive-definite +matrices, then the returned tensor will be composed of upper-triangular Cholesky factors +of each of the individual matrices. Similarly, when :attr:`upper` is ``False``, the returned +tensor will be composed of lower-triangular Cholesky factors of each of the individual +matrices. + +.. warning:: + + :func:`torch.cholesky` is deprecated in favor of :func:`torch.linalg.cholesky` + and will be removed in a future PyTorch release. + + ``L = torch.cholesky(A)`` should be replaced with + + .. code:: python + + L = torch.linalg.cholesky(A) + + ``U = torch.cholesky(A, upper=True)`` should be replaced with + + .. code:: python + + U = torch.linalg.cholesky(A).mH + + This transform will produce equivalent results for all valid (symmetric positive definite) inputs. + +Args: + input (Tensor): the input tensor :math:`A` of size :math:`(*, n, n)` where `*` is zero or more + batch dimensions consisting of symmetric positive-definite matrices. + upper (bool, optional): flag that indicates whether to return a + upper or lower triangular matrix. 
Default: ``False`` + +Keyword args: + out (Tensor, optional): the output matrix + +Example:: + + >>> a = torch.randn(3, 3) + >>> a = a @ a.mT + 1e-3 # make symmetric positive-definite + >>> l = torch.cholesky(a) + >>> a + tensor([[ 2.4112, -0.7486, 1.4551], + [-0.7486, 1.3544, 0.1294], + [ 1.4551, 0.1294, 1.6724]]) + >>> l + tensor([[ 1.5528, 0.0000, 0.0000], + [-0.4821, 1.0592, 0.0000], + [ 0.9371, 0.5487, 0.7023]]) + >>> l @ l.mT + tensor([[ 2.4112, -0.7486, 1.4551], + [-0.7486, 1.3544, 0.1294], + [ 1.4551, 0.1294, 1.6724]]) + >>> a = torch.randn(3, 2, 2) # Example for batched input + >>> a = a @ a.mT + 1e-03 # make symmetric positive-definite + >>> l = torch.cholesky(a) + >>> z = l @ l.mT + >>> torch.dist(z, a) + tensor(2.3842e-07) +""", +) + +add_docstr( + torch.cholesky_solve, + r""" +cholesky_solve(B, L, upper=False, *, out=None) -> Tensor + +Computes the solution of a system of linear equations with complex Hermitian +or real symmetric positive-definite lhs given its Cholesky decomposition. + +Let :math:`A` be a complex Hermitian or real symmetric positive-definite matrix, +and :math:`L` its Cholesky decomposition such that: + +.. math:: + + A = LL^{\text{H}} + +where :math:`L^{\text{H}}` is the conjugate transpose when :math:`L` is complex, +and the transpose when :math:`L` is real-valued. + +Returns the solution :math:`X` of the following linear system: + +.. math:: + + AX = B + +Supports inputs of float, double, cfloat and cdouble dtypes. +Also supports batches of matrices, and if :math:`A` or :math:`B` is a batch of matrices +then the output has the same batch dimensions. + +Args: + B (Tensor): right-hand side tensor of shape `(*, n, k)` + where :math:`*` is zero or more batch dimensions + L (Tensor): tensor of shape `(*, n, n)` where `*` is zero or more batch dimensions + consisting of lower or upper triangular Cholesky decompositions of + symmetric or Hermitian positive-definite matrices. + upper (bool, optional): flag that indicates whether :math:`L` is lower triangular + or upper triangular. Default: ``False``. + +Keyword args: + out (Tensor, optional): output tensor. Ignored if `None`. Default: `None`. + +Example:: + + >>> A = torch.randn(3, 3) + >>> A = A @ A.T + torch.eye(3) * 1e-3 # Creates a symmetric positive-definite matrix + >>> L = torch.linalg.cholesky(A) # Extract Cholesky decomposition + >>> B = torch.randn(3, 2) + >>> torch.cholesky_solve(B, L) + tensor([[ -8.1625, 19.6097], + [ -5.8398, 14.2387], + [ -4.3771, 10.4173]]) + >>> A.inverse() @ B + tensor([[ -8.1626, 19.6097], + [ -5.8398, 14.2387], + [ -4.3771, 10.4173]]) + + >>> A = torch.randn(3, 2, 2, dtype=torch.complex64) + >>> A = A @ A.mH + torch.eye(2) * 1e-3 # Batch of Hermitian positive-definite matrices + >>> L = torch.linalg.cholesky(A) + >>> B = torch.randn(2, 1, dtype=torch.complex64) + >>> X = torch.cholesky_solve(B, L) + >>> torch.dist(X, A.inverse() @ B) + tensor(1.6881e-5) +""", +) + +add_docstr( + torch.cholesky_inverse, + r""" +cholesky_inverse(L, upper=False, *, out=None) -> Tensor + +Computes the inverse of a complex Hermitian or real symmetric +positive-definite matrix given its Cholesky decomposition. + +Let :math:`A` be a complex Hermitian or real symmetric positive-definite matrix, +and :math:`L` its Cholesky decomposition such that: + +.. math:: + + A = LL^{\text{H}} + +where :math:`L^{\text{H}}` is the conjugate transpose when :math:`L` is complex, +and the transpose when :math:`L` is real-valued. + +Computes the inverse matrix :math:`A^{-1}`. 
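
For intuition, the same quantity can also be obtained by solving against an
identity right-hand side (a rough equivalence, assuming a single non-batched
lower-triangular factor ``L``; the two results agree only up to floating point
error and this is not the actual implementation):

.. code:: python

    n = L.shape[-1]
    A_inv = torch.cholesky_solve(torch.eye(n, dtype=L.dtype), L)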
+ +Supports input of float, double, cfloat and cdouble dtypes. +Also supports batches of matrices, and if :math:`A` is a batch of matrices +then the output has the same batch dimensions. + +Args: + L (Tensor): tensor of shape `(*, n, n)` where `*` is zero or more batch dimensions + consisting of lower or upper triangular Cholesky decompositions of + symmetric or Hermitian positive-definite matrices. + upper (bool, optional): flag that indicates whether :math:`L` is lower triangular + or upper triangular. Default: ``False`` + +Keyword args: + out (Tensor, optional): output tensor. Ignored if `None`. Default: `None`. + +Example:: + + >>> A = torch.randn(3, 3) + >>> A = A @ A.T + torch.eye(3) * 1e-3 # Creates a symmetric positive-definite matrix + >>> L = torch.linalg.cholesky(A) # Extract Cholesky decomposition + >>> torch.cholesky_inverse(L) + tensor([[ 1.9314, 1.2251, -0.0889], + [ 1.2251, 2.4439, 0.2122], + [-0.0889, 0.2122, 0.1412]]) + >>> A.inverse() + tensor([[ 1.9314, 1.2251, -0.0889], + [ 1.2251, 2.4439, 0.2122], + [-0.0889, 0.2122, 0.1412]]) + + >>> A = torch.randn(3, 2, 2, dtype=torch.complex64) + >>> A = A @ A.mH + torch.eye(2) * 1e-3 # Batch of Hermitian positive-definite matrices + >>> L = torch.linalg.cholesky(A) + >>> torch.dist(torch.inverse(A), torch.cholesky_inverse(L)) + tensor(5.6358e-7) +""", +) + +add_docstr( + torch.clone, + r""" +clone(input, *, memory_format=torch.preserve_format) -> Tensor + +Returns a copy of :attr:`input`. + +.. note:: + + This function is differentiable, so gradients will flow back from the + result of this operation to :attr:`input`. To create a tensor without an + autograd relationship to :attr:`input` see :meth:`~Tensor.detach`. + +Args: + {input} + +Keyword args: + {memory_format} +""".format( + **common_args + ), +) + +add_docstr( + torch.clamp, + r""" +clamp(input, min=None, max=None, *, out=None) -> Tensor + +Clamps all elements in :attr:`input` into the range `[` :attr:`min`, :attr:`max` `]`. +Letting min_value and max_value be :attr:`min` and :attr:`max`, respectively, this returns: + +.. math:: + y_i = \min(\max(x_i, \text{min\_value}_i), \text{max\_value}_i) + +If :attr:`min` is ``None``, there is no lower bound. +Or, if :attr:`max` is ``None`` there is no upper bound. +""" + + r""" + +.. note:: + If :attr:`min` is greater than :attr:`max` :func:`torch.clamp(..., min, max) ` + sets all elements in :attr:`input` to the value of :attr:`max`. + +Args: + {input} + min (Number or Tensor, optional): lower-bound of the range to be clamped to + max (Number or Tensor, optional): upper-bound of the range to be clamped to + +Keyword args: + {out} + +Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-1.7120, 0.1734, -0.0478, -0.0922]) + >>> torch.clamp(a, min=-0.5, max=0.5) + tensor([-0.5000, 0.1734, -0.0478, -0.0922]) + + >>> min = torch.linspace(-1, 1, steps=4) + >>> torch.clamp(a, min=min) + tensor([-1.0000, 0.1734, 0.3333, 1.0000]) + +""".format( + **common_args + ), +) + +add_docstr( + torch.clip, + r""" +clip(input, min=None, max=None, *, out=None) -> Tensor + +Alias for :func:`torch.clamp`. +""", +) + +add_docstr( + torch.column_stack, + r""" +column_stack(tensors, *, out=None) -> Tensor + +Creates a new tensor by horizontally stacking the tensors in :attr:`tensors`. + +Equivalent to ``torch.hstack(tensors)``, except each zero or one dimensional tensor ``t`` +in :attr:`tensors` is first reshaped into a ``(t.numel(), 1)`` column before being stacked horizontally. 
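
The reshaping rule above can be pictured with a rough Python equivalent
(an illustrative sketch only; ``column_stack_sketch`` is a hypothetical helper,
not the actual implementation):

.. code:: python

    def column_stack_sketch(tensors):
        # 0-D and 1-D tensors become (numel, 1) columns; others are kept as-is
        cols = [t.reshape(t.numel(), 1) if t.dim() <= 1 else t for t in tensors]
        return torch.hstack(cols)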
+ +Args: + tensors (sequence of Tensors): sequence of tensors to concatenate + +Keyword args: + {out} + +Example:: + + >>> a = torch.tensor([1, 2, 3]) + >>> b = torch.tensor([4, 5, 6]) + >>> torch.column_stack((a, b)) + tensor([[1, 4], + [2, 5], + [3, 6]]) + >>> a = torch.arange(5) + >>> b = torch.arange(10).reshape(5, 2) + >>> torch.column_stack((a, b, b)) + tensor([[0, 0, 1, 0, 1], + [1, 2, 3, 2, 3], + [2, 4, 5, 4, 5], + [3, 6, 7, 6, 7], + [4, 8, 9, 8, 9]]) + +""".format( + **common_args + ), +) + +add_docstr( + torch.complex, + r""" +complex(real, imag, *, out=None) -> Tensor + +Constructs a complex tensor with its real part equal to :attr:`real` and its +imaginary part equal to :attr:`imag`. + +Args: + real (Tensor): The real part of the complex tensor. Must be half, float or double. + imag (Tensor): The imaginary part of the complex tensor. Must be same dtype + as :attr:`real`. + +Keyword args: + out (Tensor): If the inputs are ``torch.float32``, must be + ``torch.complex64``. If the inputs are ``torch.float64``, must be + ``torch.complex128``. + +Example:: + + >>> real = torch.tensor([1, 2], dtype=torch.float32) + >>> imag = torch.tensor([3, 4], dtype=torch.float32) + >>> z = torch.complex(real, imag) + >>> z + tensor([(1.+3.j), (2.+4.j)]) + >>> z.dtype + torch.complex64 + +""", +) + +add_docstr( + torch.polar, + r""" +polar(abs, angle, *, out=None) -> Tensor + +Constructs a complex tensor whose elements are Cartesian coordinates +corresponding to the polar coordinates with absolute value :attr:`abs` and angle +:attr:`angle`. + +.. math:: + \text{out} = \text{abs} \cdot \cos(\text{angle}) + \text{abs} \cdot \sin(\text{angle}) \cdot j + +.. note:: + `torch.polar` is similar to + `std::polar `_ + and does not compute the polar decomposition + of a complex tensor like Python's `cmath.polar` and SciPy's `linalg.polar` do. + The behavior of this function is undefined if `abs` is negative or NaN, or if `angle` is + infinite. + +""" + + r""" +Args: + abs (Tensor): The absolute value the complex tensor. Must be float or double. + angle (Tensor): The angle of the complex tensor. Must be same dtype as + :attr:`abs`. + +Keyword args: + out (Tensor): If the inputs are ``torch.float32``, must be + ``torch.complex64``. If the inputs are ``torch.float64``, must be + ``torch.complex128``. + +Example:: + + >>> import numpy as np + >>> abs = torch.tensor([1, 2], dtype=torch.float64) + >>> angle = torch.tensor([np.pi / 2, 5 * np.pi / 4], dtype=torch.float64) + >>> z = torch.polar(abs, angle) + >>> z + tensor([(0.0000+1.0000j), (-1.4142-1.4142j)], dtype=torch.complex128) +""", +) + +add_docstr( + torch.conj_physical, + r""" +conj_physical(input, *, out=None) -> Tensor + +Computes the element-wise conjugate of the given :attr:`input` tensor. +If :attr:`input` has a non-complex dtype, this function just returns :attr:`input`. + +.. note:: + This performs the conjugate operation regardless of the fact conjugate bit is set or not. + +.. warning:: In the future, :func:`torch.conj_physical` may return a non-writeable view for an :attr:`input` of + non-complex dtype. It's recommended that programs not modify the tensor returned by :func:`torch.conj_physical` + when :attr:`input` is of non-complex dtype to be compatible with this change. + +.. 
math:: + \text{out}_{i} = conj(\text{input}_{i}) +""" + + r""" +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> torch.conj_physical(torch.tensor([-1 + 1j, -2 + 2j, 3 - 3j])) + tensor([-1 - 1j, -2 - 2j, 3 + 3j]) +""".format( + **common_args + ), +) + +add_docstr( + torch.conj, + r""" +conj(input) -> Tensor + +Returns a view of :attr:`input` with a flipped conjugate bit. If :attr:`input` has a non-complex dtype, +this function just returns :attr:`input`. + +.. note:: + :func:`torch.conj` performs a lazy conjugation, but the actual conjugated tensor can be materialized + at any time using :func:`torch.resolve_conj`. + +.. warning:: In the future, :func:`torch.conj` may return a non-writeable view for an :attr:`input` of + non-complex dtype. It's recommended that programs not modify the tensor returned by :func:`torch.conj_physical` + when :attr:`input` is of non-complex dtype to be compatible with this change. + +Args: + {input} + +Example:: + + >>> x = torch.tensor([-1 + 1j, -2 + 2j, 3 - 3j]) + >>> x.is_conj() + False + >>> y = torch.conj(x) + >>> y.is_conj() + True +""".format( + **common_args + ), +) + +add_docstr( + torch.resolve_conj, + r""" +resolve_conj(input) -> Tensor + +Returns a new tensor with materialized conjugation if :attr:`input`'s conjugate bit is set to `True`, +else returns :attr:`input`. The output tensor will always have its conjugate bit set to `False`. + +Args: + {input} + +Example:: + + >>> x = torch.tensor([-1 + 1j, -2 + 2j, 3 - 3j]) + >>> y = x.conj() + >>> y.is_conj() + True + >>> z = y.resolve_conj() + >>> z + tensor([-1 - 1j, -2 - 2j, 3 + 3j]) + >>> z.is_conj() + False +""".format( + **common_args + ), +) + +add_docstr( + torch.resolve_neg, + r""" +resolve_neg(input) -> Tensor + +Returns a new tensor with materialized negation if :attr:`input`'s negative bit is set to `True`, +else returns :attr:`input`. The output tensor will always have its negative bit set to `False`. + +Args: + {input} + +Example:: + + >>> x = torch.tensor([-1 + 1j, -2 + 2j, 3 - 3j]) + >>> y = x.conj() + >>> z = y.imag + >>> z.is_neg() + True + >>> out = z.resolve_neg() + >>> out + tensor([-1., -2., 3.]) + >>> out.is_neg() + False +""".format( + **common_args + ), +) + +add_docstr( + torch.copysign, + r""" +copysign(input, other, *, out=None) -> Tensor + +Create a new floating-point tensor with the magnitude of :attr:`input` and the sign of :attr:`other`, elementwise. + +.. math:: + \text{out}_{i} = \begin{cases} + -|\text{input}_{i}| & \text{if } \text{other}_{i} \leq -0.0 \\ + |\text{input}_{i}| & \text{if } \text{other}_{i} \geq 0.0 \\ + \end{cases} +""" + + r""" + +Supports :ref:`broadcasting to a common shape `, +and integer and float inputs. + +Args: + input (Tensor): magnitudes. + other (Tensor or Number): contains value(s) whose signbit(s) are + applied to the magnitudes in :attr:`input`. 
+ +Keyword args: + {out} + +Example:: + + >>> a = torch.randn(5) + >>> a + tensor([-1.2557, -0.0026, -0.5387, 0.4740, -0.9244]) + >>> torch.copysign(a, 1) + tensor([1.2557, 0.0026, 0.5387, 0.4740, 0.9244]) + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.7079, 0.2778, -1.0249, 0.5719], + [-0.0059, -0.2600, -0.4475, -1.3948], + [ 0.3667, -0.9567, -2.5757, -0.1751], + [ 0.2046, -0.0742, 0.2998, -0.1054]]) + >>> b = torch.randn(4) + tensor([ 0.2373, 0.3120, 0.3190, -1.1128]) + >>> torch.copysign(a, b) + tensor([[ 0.7079, 0.2778, 1.0249, -0.5719], + [ 0.0059, 0.2600, 0.4475, -1.3948], + [ 0.3667, 0.9567, 2.5757, -0.1751], + [ 0.2046, 0.0742, 0.2998, -0.1054]]) + >>> a = torch.tensor([1.]) + >>> b = torch.tensor([-0.]) + >>> torch.copysign(a, b) + tensor([-1.]) + +.. note:: + copysign handles signed zeros. If the other argument has a negative zero (-0), + the corresponding output value will be negative. + +""".format( + **common_args + ), +) + +add_docstr( + torch.cos, + r""" +cos(input, *, out=None) -> Tensor + +Returns a new tensor with the cosine of the elements of :attr:`input`. + +.. math:: + \text{out}_{i} = \cos(\text{input}_{i}) +""" + + r""" +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 1.4309, 1.2706, -0.8562, 0.9796]) + >>> torch.cos(a) + tensor([ 0.1395, 0.2957, 0.6553, 0.5574]) +""".format( + **common_args + ), +) + +add_docstr( + torch.cosh, + r""" +cosh(input, *, out=None) -> Tensor + +Returns a new tensor with the hyperbolic cosine of the elements of +:attr:`input`. + +.. math:: + \text{out}_{i} = \cosh(\text{input}_{i}) +""" + + r""" +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.1632, 1.1835, -0.6979, -0.7325]) + >>> torch.cosh(a) + tensor([ 1.0133, 1.7860, 1.2536, 1.2805]) + +.. note:: + When :attr:`input` is on the CPU, the implementation of torch.cosh may use + the Sleef library, which rounds very large results to infinity or negative + infinity. See `here `_ for details. +""".format( + **common_args + ), +) + +add_docstr( + torch.cross, + r""" +cross(input, other, dim=None, *, out=None) -> Tensor + + +Returns the cross product of vectors in dimension :attr:`dim` of :attr:`input` +and :attr:`other`. + +Supports input of float, double, cfloat and cdouble dtypes. Also supports batches +of vectors, for which it computes the product along the dimension :attr:`dim`. +In this case, the output has the same batch dimensions as the inputs. + +.. warning:: + If :attr:`dim` is not given, it defaults to the first dimension found + with the size 3. Note that this might be unexpected. + + This behavior is deprecated and will be changed to match that of :func:`torch.linalg.cross` + in a future release. + +.. seealso:: + :func:`torch.linalg.cross` which has dim=-1 as default. + + +Args: + {input} + other (Tensor): the second input tensor + dim (int, optional): the dimension to take the cross-product in. 
+ +Keyword args: + {out} + +Example:: + + >>> a = torch.randn(4, 3) + >>> a + tensor([[-0.3956, 1.1455, 1.6895], + [-0.5849, 1.3672, 0.3599], + [-1.1626, 0.7180, -0.0521], + [-0.1339, 0.9902, -2.0225]]) + >>> b = torch.randn(4, 3) + >>> b + tensor([[-0.0257, -1.4725, -1.2251], + [-1.1479, -0.7005, -1.9757], + [-1.3904, 0.3726, -1.1836], + [-0.9688, -0.7153, 0.2159]]) + >>> torch.cross(a, b, dim=1) + tensor([[ 1.0844, -0.5281, 0.6120], + [-2.4490, -1.5687, 1.9792], + [-0.8304, -1.3037, 0.5650], + [-1.2329, 1.9883, 1.0551]]) + >>> torch.cross(a, b) + tensor([[ 1.0844, -0.5281, 0.6120], + [-2.4490, -1.5687, 1.9792], + [-0.8304, -1.3037, 0.5650], + [-1.2329, 1.9883, 1.0551]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.logcumsumexp, + r""" +logcumsumexp(input, dim, *, out=None) -> Tensor +Returns the logarithm of the cumulative summation of the exponentiation of +elements of :attr:`input` in the dimension :attr:`dim`. + +For summation index :math:`j` given by `dim` and other indices :math:`i`, the result is + + .. math:: + \text{{logcumsumexp}}(x)_{{ij}} = \log \sum\limits_{{j=0}}^{{i}} \exp(x_{{ij}}) + +Args: + {input} + dim (int): the dimension to do the operation over + +Keyword args: + {out} + +Example:: + + >>> a = torch.randn(10) + >>> torch.logcumsumexp(a, dim=0) + tensor([-0.42296738, -0.04462666, 0.86278635, 0.94622083, 1.05277811, + 1.39202815, 1.83525007, 1.84492621, 2.06084887, 2.06844475])) +""".format( + **reduceops_common_args + ), +) + +add_docstr( + torch.cummax, + r""" +cummax(input, dim, *, out=None) -> (Tensor, LongTensor) +Returns a namedtuple ``(values, indices)`` where ``values`` is the cumulative maximum of +elements of :attr:`input` in the dimension :attr:`dim`. And ``indices`` is the index +location of each maximum value found in the dimension :attr:`dim`. + +.. math:: + y_i = max(x_1, x_2, x_3, \dots, x_i) + +Args: + {input} + dim (int): the dimension to do the operation over + +Keyword args: + out (tuple, optional): the result tuple of two output tensors (values, indices) + +Example:: + + >>> a = torch.randn(10) + >>> a + tensor([-0.3449, -1.5447, 0.0685, -1.5104, -1.1706, 0.2259, 1.4696, -1.3284, + 1.9946, -0.8209]) + >>> torch.cummax(a, dim=0) + torch.return_types.cummax( + values=tensor([-0.3449, -0.3449, 0.0685, 0.0685, 0.0685, 0.2259, 1.4696, 1.4696, + 1.9946, 1.9946]), + indices=tensor([0, 0, 2, 2, 2, 5, 6, 6, 8, 8])) +""".format( + **reduceops_common_args + ), +) + +add_docstr( + torch.cummin, + r""" +cummin(input, dim, *, out=None) -> (Tensor, LongTensor) +Returns a namedtuple ``(values, indices)`` where ``values`` is the cumulative minimum of +elements of :attr:`input` in the dimension :attr:`dim`. And ``indices`` is the index +location of each maximum value found in the dimension :attr:`dim`. + +.. 
math:: + y_i = min(x_1, x_2, x_3, \dots, x_i) + +Args: + {input} + dim (int): the dimension to do the operation over + +Keyword args: + out (tuple, optional): the result tuple of two output tensors (values, indices) + +Example:: + + >>> a = torch.randn(10) + >>> a + tensor([-0.2284, -0.6628, 0.0975, 0.2680, -1.3298, -0.4220, -0.3885, 1.1762, + 0.9165, 1.6684]) + >>> torch.cummin(a, dim=0) + torch.return_types.cummin( + values=tensor([-0.2284, -0.6628, -0.6628, -0.6628, -1.3298, -1.3298, -1.3298, -1.3298, + -1.3298, -1.3298]), + indices=tensor([0, 1, 1, 1, 4, 4, 4, 4, 4, 4])) +""".format( + **reduceops_common_args + ), +) + +add_docstr( + torch.cumprod, + r""" +cumprod(input, dim, *, dtype=None, out=None) -> Tensor + +Returns the cumulative product of elements of :attr:`input` in the dimension +:attr:`dim`. + +For example, if :attr:`input` is a vector of size N, the result will also be +a vector of size N, with elements. + +.. math:: + y_i = x_1 \times x_2\times x_3\times \dots \times x_i + +Args: + {input} + dim (int): the dimension to do the operation over + +Keyword args: + {dtype} + {out} + +Example:: + + >>> a = torch.randn(10) + >>> a + tensor([ 0.6001, 0.2069, -0.1919, 0.9792, 0.6727, 1.0062, 0.4126, + -0.2129, -0.4206, 0.1968]) + >>> torch.cumprod(a, dim=0) + tensor([ 0.6001, 0.1241, -0.0238, -0.0233, -0.0157, -0.0158, -0.0065, + 0.0014, -0.0006, -0.0001]) + + >>> a[5] = 0.0 + >>> torch.cumprod(a, dim=0) + tensor([ 0.6001, 0.1241, -0.0238, -0.0233, -0.0157, -0.0000, -0.0000, + 0.0000, -0.0000, -0.0000]) +""".format( + **reduceops_common_args + ), +) + +add_docstr( + torch.cumsum, + r""" +cumsum(input, dim, *, dtype=None, out=None) -> Tensor + +Returns the cumulative sum of elements of :attr:`input` in the dimension +:attr:`dim`. + +For example, if :attr:`input` is a vector of size N, the result will also be +a vector of size N, with elements. + +.. math:: + y_i = x_1 + x_2 + x_3 + \dots + x_i + +Args: + {input} + dim (int): the dimension to do the operation over + +Keyword args: + {dtype} + {out} + +Example:: + + >>> a = torch.randint(1, 20, (10,)) + >>> a + tensor([13, 7, 3, 10, 13, 3, 15, 10, 9, 10]) + >>> torch.cumsum(a, dim=0) + tensor([13, 20, 23, 33, 46, 49, 64, 74, 83, 93]) +""".format( + **reduceops_common_args + ), +) + +add_docstr( + torch.count_nonzero, + r""" +count_nonzero(input, dim=None) -> Tensor + +Counts the number of non-zero values in the tensor :attr:`input` along the given :attr:`dim`. +If no dim is specified then all non-zeros in the tensor are counted. + +Args: + {input} + dim (int or tuple of ints, optional): Dim or tuple of dims along which to count non-zeros. + +Example:: + + >>> x = torch.zeros(3,3) + >>> x[torch.randn(3,3) > 0.5] = 1 + >>> x + tensor([[0., 1., 1.], + [0., 0., 0.], + [0., 0., 1.]]) + >>> torch.count_nonzero(x) + tensor(3) + >>> torch.count_nonzero(x, dim=0) + tensor([0, 1, 2]) +""".format( + **reduceops_common_args + ), +) + +add_docstr( + torch.dequantize, + r""" +dequantize(tensor) -> Tensor + +Returns an fp32 Tensor by dequantizing a quantized Tensor + +Args: + tensor (Tensor): A quantized Tensor + +.. 
function:: dequantize(tensors) -> sequence of Tensors + :noindex: + +Given a list of quantized Tensors, dequantize them and return a list of fp32 Tensors + +Args: + tensors (sequence of Tensors): A list of quantized Tensors +""", +) + +add_docstr( + torch.diag, + r""" +diag(input, diagonal=0, *, out=None) -> Tensor + +- If :attr:`input` is a vector (1-D tensor), then returns a 2-D square tensor + with the elements of :attr:`input` as the diagonal. +- If :attr:`input` is a matrix (2-D tensor), then returns a 1-D tensor with + the diagonal elements of :attr:`input`. + +The argument :attr:`diagonal` controls which diagonal to consider: + +- If :attr:`diagonal` = 0, it is the main diagonal. +- If :attr:`diagonal` > 0, it is above the main diagonal. +- If :attr:`diagonal` < 0, it is below the main diagonal. + +Args: + {input} + diagonal (int, optional): the diagonal to consider + +Keyword args: + {out} + +.. seealso:: + + :func:`torch.diagonal` always returns the diagonal of its input. + + :func:`torch.diagflat` always constructs a tensor with diagonal elements + specified by the input. + +Examples: + +Get the square matrix where the input vector is the diagonal:: + + >>> a = torch.randn(3) + >>> a + tensor([ 0.5950,-0.0872, 2.3298]) + >>> torch.diag(a) + tensor([[ 0.5950, 0.0000, 0.0000], + [ 0.0000,-0.0872, 0.0000], + [ 0.0000, 0.0000, 2.3298]]) + >>> torch.diag(a, 1) + tensor([[ 0.0000, 0.5950, 0.0000, 0.0000], + [ 0.0000, 0.0000,-0.0872, 0.0000], + [ 0.0000, 0.0000, 0.0000, 2.3298], + [ 0.0000, 0.0000, 0.0000, 0.0000]]) + +Get the k-th diagonal of a given matrix:: + + >>> a = torch.randn(3, 3) + >>> a + tensor([[-0.4264, 0.0255,-0.1064], + [ 0.8795,-0.2429, 0.1374], + [ 0.1029,-0.6482,-1.6300]]) + >>> torch.diag(a, 0) + tensor([-0.4264,-0.2429,-1.6300]) + >>> torch.diag(a, 1) + tensor([ 0.0255, 0.1374]) +""".format( + **common_args + ), +) + +add_docstr( + torch.diag_embed, + r""" +diag_embed(input, offset=0, dim1=-2, dim2=-1) -> Tensor + +Creates a tensor whose diagonals of certain 2D planes (specified by +:attr:`dim1` and :attr:`dim2`) are filled by :attr:`input`. +To facilitate creating batched diagonal matrices, the 2D planes formed by +the last two dimensions of the returned tensor are chosen by default. + +The argument :attr:`offset` controls which diagonal to consider: + +- If :attr:`offset` = 0, it is the main diagonal. +- If :attr:`offset` > 0, it is above the main diagonal. +- If :attr:`offset` < 0, it is below the main diagonal. + +The size of the new matrix will be calculated to make the specified diagonal +of the size of the last input dimension. +Note that for :attr:`offset` other than :math:`0`, the order of :attr:`dim1` +and :attr:`dim2` matters. Exchanging them is equivalent to changing the +sign of :attr:`offset`. + +Applying :meth:`torch.diagonal` to the output of this function with +the same arguments yields a matrix identical to input. However, +:meth:`torch.diagonal` has different default dimensions, so those +need to be explicitly specified. + +Args: + {input} Must be at least 1-dimensional. + offset (int, optional): which diagonal to consider. Default: 0 + (main diagonal). + dim1 (int, optional): first dimension with respect to which to + take diagonal. Default: -2. + dim2 (int, optional): second dimension with respect to which to + take diagonal. Default: -1. 
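
The round trip with :meth:`torch.diagonal` described above can be spelled out
explicitly (a small sketch; the explicit ``dim1``/``dim2`` arguments are needed
because the two functions use different defaults):

.. code:: python

    a = torch.randn(2, 3)
    assert torch.equal(torch.diagonal(torch.diag_embed(a), dim1=-2, dim2=-1), a)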
+ +Example:: + + >>> a = torch.randn(2, 3) + >>> torch.diag_embed(a) + tensor([[[ 1.5410, 0.0000, 0.0000], + [ 0.0000, -0.2934, 0.0000], + [ 0.0000, 0.0000, -2.1788]], + + [[ 0.5684, 0.0000, 0.0000], + [ 0.0000, -1.0845, 0.0000], + [ 0.0000, 0.0000, -1.3986]]]) + + >>> torch.diag_embed(a, offset=1, dim1=0, dim2=2) + tensor([[[ 0.0000, 1.5410, 0.0000, 0.0000], + [ 0.0000, 0.5684, 0.0000, 0.0000]], + + [[ 0.0000, 0.0000, -0.2934, 0.0000], + [ 0.0000, 0.0000, -1.0845, 0.0000]], + + [[ 0.0000, 0.0000, 0.0000, -2.1788], + [ 0.0000, 0.0000, 0.0000, -1.3986]], + + [[ 0.0000, 0.0000, 0.0000, 0.0000], + [ 0.0000, 0.0000, 0.0000, 0.0000]]]) +""".format( + **common_args + ), +) + + +add_docstr( + torch.diagflat, + r""" +diagflat(input, offset=0) -> Tensor + +- If :attr:`input` is a vector (1-D tensor), then returns a 2-D square tensor + with the elements of :attr:`input` as the diagonal. +- If :attr:`input` is a tensor with more than one dimension, then returns a + 2-D tensor with diagonal elements equal to a flattened :attr:`input`. + +The argument :attr:`offset` controls which diagonal to consider: + +- If :attr:`offset` = 0, it is the main diagonal. +- If :attr:`offset` > 0, it is above the main diagonal. +- If :attr:`offset` < 0, it is below the main diagonal. + +Args: + {input} + offset (int, optional): the diagonal to consider. Default: 0 (main + diagonal). + +Examples:: + + >>> a = torch.randn(3) + >>> a + tensor([-0.2956, -0.9068, 0.1695]) + >>> torch.diagflat(a) + tensor([[-0.2956, 0.0000, 0.0000], + [ 0.0000, -0.9068, 0.0000], + [ 0.0000, 0.0000, 0.1695]]) + >>> torch.diagflat(a, 1) + tensor([[ 0.0000, -0.2956, 0.0000, 0.0000], + [ 0.0000, 0.0000, -0.9068, 0.0000], + [ 0.0000, 0.0000, 0.0000, 0.1695], + [ 0.0000, 0.0000, 0.0000, 0.0000]]) + + >>> a = torch.randn(2, 2) + >>> a + tensor([[ 0.2094, -0.3018], + [-0.1516, 1.9342]]) + >>> torch.diagflat(a) + tensor([[ 0.2094, 0.0000, 0.0000, 0.0000], + [ 0.0000, -0.3018, 0.0000, 0.0000], + [ 0.0000, 0.0000, -0.1516, 0.0000], + [ 0.0000, 0.0000, 0.0000, 1.9342]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.diagonal, + r""" +diagonal(input, offset=0, dim1=0, dim2=1) -> Tensor + +Returns a partial view of :attr:`input` with the its diagonal elements +with respect to :attr:`dim1` and :attr:`dim2` appended as a dimension +at the end of the shape. + +The argument :attr:`offset` controls which diagonal to consider: + +- If :attr:`offset` = 0, it is the main diagonal. +- If :attr:`offset` > 0, it is above the main diagonal. +- If :attr:`offset` < 0, it is below the main diagonal. + +Applying :meth:`torch.diag_embed` to the output of this function with +the same arguments yields a diagonal matrix with the diagonal entries +of the input. However, :meth:`torch.diag_embed` has different default +dimensions, so those need to be explicitly specified. + +Args: + {input} Must be at least 2-dimensional. + offset (int, optional): which diagonal to consider. Default: 0 + (main diagonal). + dim1 (int, optional): first dimension with respect to which to + take diagonal. Default: 0. + dim2 (int, optional): second dimension with respect to which to + take diagonal. Default: 1. + +.. note:: To take a batch diagonal, pass in dim1=-2, dim2=-1. 
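
For instance, the batch diagonal mentioned in the note above reduces a batch of
matrices to a batch of diagonals (a brief sketch):

.. code:: python

    batch = torch.randn(4, 3, 3)
    diags = torch.diagonal(batch, dim1=-2, dim2=-1)   # shape (4, 3)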
+ +Examples:: + + >>> a = torch.randn(3, 3) + >>> a + tensor([[-1.0854, 1.1431, -0.1752], + [ 0.8536, -0.0905, 0.0360], + [ 0.6927, -0.3735, -0.4945]]) + + + >>> torch.diagonal(a, 0) + tensor([-1.0854, -0.0905, -0.4945]) + + + >>> torch.diagonal(a, 1) + tensor([ 1.1431, 0.0360]) + + + >>> x = torch.randn(2, 5, 4, 2) + >>> torch.diagonal(x, offset=-1, dim1=1, dim2=2) + tensor([[[-1.2631, 0.3755, -1.5977, -1.8172], + [-1.1065, 1.0401, -0.2235, -0.7938]], + + [[-1.7325, -0.3081, 0.6166, 0.2335], + [ 1.0500, 0.7336, -0.3836, -1.1015]]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.diagonal_scatter, + r""" +diagonal_scatter(input, src, offset=0, dim1=0, dim2=1) -> Tensor + +Embeds the values of the :attr:`src` tensor into :attr:`input` along +the diagonal elements of :attr:`input`, with respect to :attr:`dim1` +and :attr:`dim2`. + +This function returns a tensor with fresh storage; it does not +return a view. + +The argument :attr:`offset` controls which diagonal to consider: + +- If :attr:`offset` = 0, it is the main diagonal. +- If :attr:`offset` > 0, it is above the main diagonal. +- If :attr:`offset` < 0, it is below the main diagonal. + +Args: + {input} Must be at least 2-dimensional. + src (Tensor): the tensor to embed into :attr:`input`. + offset (int, optional): which diagonal to consider. Default: 0 + (main diagonal). + dim1 (int, optional): first dimension with respect to which to + take diagonal. Default: 0. + dim2 (int, optional): second dimension with respect to which to + take diagonal. Default: 1. + +.. note:: + + :attr:`src` must be of the proper size in order to be embedded + into :attr:`input`. Specifically, it should have the same shape as + ``torch.diagonal(input, offset, dim1, dim2)`` + +Examples:: + + >>> a = torch.zeros(3, 3) + >>> a + tensor([[0., 0., 0.], + [0., 0., 0.], + [0., 0., 0.]]) + + >>> torch.diagonal_scatter(a, torch.ones(3), 0) + tensor([[1., 0., 0.], + [0., 1., 0.], + [0., 0., 1.]]) + + >>> torch.diagonal_scatter(a, torch.ones(2), 1) + tensor([[0., 1., 0.], + [0., 0., 1.], + [0., 0., 0.]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.as_strided_scatter, + r""" +as_strided_scatter(input, src, size, stride, storage_offset=None) -> Tensor + +Embeds the values of the :attr:`src` tensor into :attr:`input` along +the elements corresponding to the result of calling +input.as_strided(size, stride, storage_offset). + +This function returns a tensor with fresh storage; it does not +return a view. + +Args: + {input} + size (tuple or ints): the shape of the output tensor + stride (tuple or ints): the stride of the output tensor + storage_offset (int, optional): the offset in the underlying storage of the output tensor + +.. note:: + + :attr:`src` must be of the proper size in order to be embedded + into :attr:`input`. Specifically, it should have the same shape as + `torch.as_strided(input, size, stride, storage_offset)` + +Example:: + + >>> a = torch.arange(4).reshape(2, 2) + 1 + >>> a + tensor([[1, 2], + [3, 4]]) + >>> b = torch.zeros(3, 3) + >>> b + tensor([[0., 0., 0.], + [0., 0., 0.], + [0., 0., 0.]]) + >>> torch.as_strided_scatter(b, a, (2, 2), (1, 2)) + tensor([[1., 3., 2.], + [4., 0., 0.], + [0., 0., 0.]]) + +""".format( + **common_args + ), +) + +add_docstr( + torch.diff, + r""" +diff(input, n=1, dim=-1, prepend=None, append=None) -> Tensor + +Computes the n-th forward difference along the given dimension. + +The first-order differences are given by `out[i] = input[i + 1] - input[i]`. 
Higher-order +differences are calculated by using :func:`torch.diff` recursively. + +Args: + input (Tensor): the tensor to compute the differences on + n (int, optional): the number of times to recursively compute the difference + dim (int, optional): the dimension to compute the difference along. + Default is the last dimension. + prepend, append (Tensor, optional): values to prepend or append to + :attr:`input` along :attr:`dim` before computing the difference. + Their dimensions must be equivalent to that of input, and their shapes + must match input's shape except on :attr:`dim`. + +Keyword args: + {out} + +Example:: + + >>> a = torch.tensor([1, 3, 2]) + >>> torch.diff(a) + tensor([ 2, -1]) + >>> b = torch.tensor([4, 5]) + >>> torch.diff(a, append=b) + tensor([ 2, -1, 2, 1]) + >>> c = torch.tensor([[1, 2, 3], [3, 4, 5]]) + >>> torch.diff(c, dim=0) + tensor([[2, 2, 2]]) + >>> torch.diff(c, dim=1) + tensor([[1, 1], + [1, 1]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.digamma, + r""" +digamma(input, *, out=None) -> Tensor + +Alias for :func:`torch.special.digamma`. +""", +) + +add_docstr( + torch.dist, + r""" +dist(input, other, p=2) -> Tensor + +Returns the p-norm of (:attr:`input` - :attr:`other`) + +The shapes of :attr:`input` and :attr:`other` must be +:ref:`broadcastable `. + +Args: + {input} + other (Tensor): the Right-hand-side input tensor + p (float, optional): the norm to be computed + +Example:: + + >>> x = torch.randn(4) + >>> x + tensor([-1.5393, -0.8675, 0.5916, 1.6321]) + >>> y = torch.randn(4) + >>> y + tensor([ 0.0967, -1.0511, 0.6295, 0.8360]) + >>> torch.dist(x, y, 3.5) + tensor(1.6727) + >>> torch.dist(x, y, 3) + tensor(1.6973) + >>> torch.dist(x, y, 0) + tensor(4.) + >>> torch.dist(x, y, 1) + tensor(2.6537) +""".format( + **common_args + ), +) + +add_docstr( + torch.div, + r""" +div(input, other, *, rounding_mode=None, out=None) -> Tensor + +Divides each element of the input ``input`` by the corresponding element of +:attr:`other`. + +.. math:: + \text{{out}}_i = \frac{{\text{{input}}_i}}{{\text{{other}}_i}} + +.. note:: + By default, this performs a "true" division like Python 3. + See the :attr:`rounding_mode` argument for floor division. + +Supports :ref:`broadcasting to a common shape `, +:ref:`type promotion `, and integer, float, and complex inputs. +Always promotes integer types to the default scalar type. + +Args: + input (Tensor): the dividend + other (Tensor or Number): the divisor + +Keyword args: + rounding_mode (str, optional): Type of rounding applied to the result: + + * None - default behavior. Performs no rounding and, if both :attr:`input` and + :attr:`other` are integer types, promotes the inputs to the default scalar type. + Equivalent to true division in Python (the ``/`` operator) and NumPy's ``np.true_divide``. + * ``"trunc"`` - rounds the results of the division towards zero. + Equivalent to C-style integer division. + * ``"floor"`` - rounds the results of the division down. + Equivalent to floor division in Python (the ``//`` operator) and NumPy's ``np.floor_divide``. + + {out} + +Examples:: + + >>> x = torch.tensor([ 0.3810, 1.2774, -0.2972, -0.3719, 0.4637]) + >>> torch.div(x, 0.5) + tensor([ 0.7620, 2.5548, -0.5944, -0.7438, 0.9274]) + + >>> a = torch.tensor([[-0.3711, -1.9353, -0.4605, -0.2917], + ... [ 0.1815, -1.0111, 0.9805, -1.5923], + ... [ 0.1062, 1.4581, 0.7759, -1.2344], + ... 
[-0.1830, -0.0313, 1.1908, -1.4757]]) + >>> b = torch.tensor([ 0.8032, 0.2930, -0.8113, -0.2308]) + >>> torch.div(a, b) + tensor([[-0.4620, -6.6051, 0.5676, 1.2639], + [ 0.2260, -3.4509, -1.2086, 6.8990], + [ 0.1322, 4.9764, -0.9564, 5.3484], + [-0.2278, -0.1068, -1.4678, 6.3938]]) + + >>> torch.div(a, b, rounding_mode='trunc') + tensor([[-0., -6., 0., 1.], + [ 0., -3., -1., 6.], + [ 0., 4., -0., 5.], + [-0., -0., -1., 6.]]) + + >>> torch.div(a, b, rounding_mode='floor') + tensor([[-1., -7., 0., 1.], + [ 0., -4., -2., 6.], + [ 0., 4., -1., 5.], + [-1., -1., -2., 6.]]) + +""".format( + **common_args + ), +) + +add_docstr( + torch.divide, + r""" +divide(input, other, *, rounding_mode=None, out=None) -> Tensor + +Alias for :func:`torch.div`. +""", +) + +add_docstr( + torch.dot, + r""" +dot(input, other, *, out=None) -> Tensor + +Computes the dot product of two 1D tensors. + +.. note:: + + Unlike NumPy's dot, torch.dot intentionally only supports computing the dot product + of two 1D tensors with the same number of elements. + +Args: + input (Tensor): first tensor in the dot product, must be 1D. + other (Tensor): second tensor in the dot product, must be 1D. + +Keyword args: + {out} + +Example:: + + >>> torch.dot(torch.tensor([2, 3]), torch.tensor([2, 1])) + tensor(7) +""".format( + **common_args + ), +) + +add_docstr( + torch.vdot, + r""" +vdot(input, other, *, out=None) -> Tensor + +Computes the dot product of two 1D vectors along a dimension. + +In symbols, this function computes + +.. math:: + + \sum_{i=1}^n \overline{x_i}y_i. + +where :math:`\overline{x_i}` denotes the conjugate for complex +vectors, and it is the identity for real vectors. + +.. note:: + + Unlike NumPy's vdot, torch.vdot intentionally only supports computing the dot product + of two 1D tensors with the same number of elements. + +.. seealso:: + + :func:`torch.linalg.vecdot` computes the dot product of two batches of vectors along a dimension. + +Args: + input (Tensor): first tensor in the dot product, must be 1D. Its conjugate is used if it's complex. + other (Tensor): second tensor in the dot product, must be 1D. + +Keyword args: +""" + + rf""" +.. note:: {common_args["out"]} +""" + + r""" + +Example:: + + >>> torch.vdot(torch.tensor([2, 3]), torch.tensor([2, 1])) + tensor(7) + >>> a = torch.tensor((1 +2j, 3 - 1j)) + >>> b = torch.tensor((2 +1j, 4 - 0j)) + >>> torch.vdot(a, b) + tensor([16.+1.j]) + >>> torch.vdot(b, a) + tensor([16.-1.j]) +""", +) + +add_docstr( + torch.eq, + r""" +eq(input, other, *, out=None) -> Tensor + +Computes element-wise equality + +The second argument can be a number or a tensor whose shape is +:ref:`broadcastable ` with the first argument. + +Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + +Keyword args: + {out} + +Returns: + A boolean tensor that is True where :attr:`input` is equal to :attr:`other` and False elsewhere + +Example:: + + >>> torch.eq(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[ True, False], + [False, True]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.equal, + r""" +equal(input, other) -> bool + +``True`` if two tensors have the same size and elements, ``False`` otherwise. + +Example:: + + >>> torch.equal(torch.tensor([1, 2]), torch.tensor([1, 2])) + True +""", +) + +add_docstr( + torch.erf, + r""" +erf(input, *, out=None) -> Tensor + +Alias for :func:`torch.special.erf`. 
+""", +) + +add_docstr( + torch.erfc, + r""" +erfc(input, *, out=None) -> Tensor + +Alias for :func:`torch.special.erfc`. +""", +) + +add_docstr( + torch.erfinv, + r""" +erfinv(input, *, out=None) -> Tensor + +Alias for :func:`torch.special.erfinv`. +""", +) + +add_docstr( + torch.exp, + r""" +exp(input, *, out=None) -> Tensor + +Returns a new tensor with the exponential of the elements +of the input tensor :attr:`input`. + +.. math:: + y_{i} = e^{x_{i}} +""" + + r""" +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> torch.exp(torch.tensor([0, math.log(2.)])) + tensor([ 1., 2.]) +""".format( + **common_args + ), +) + +add_docstr( + torch.exp2, + r""" +exp2(input, *, out=None) -> Tensor + +Alias for :func:`torch.special.exp2`. +""", +) + +add_docstr( + torch.expm1, + r""" +expm1(input, *, out=None) -> Tensor + +Alias for :func:`torch.special.expm1`. +""", +) + +add_docstr( + torch.eye, + r""" +eye(n, m=None, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + +Returns a 2-D tensor with ones on the diagonal and zeros elsewhere. + +Args: + n (int): the number of rows + m (int, optional): the number of columns with default being :attr:`n` + +Keyword arguments: + {out} + {dtype} + {layout} + {device} + {requires_grad} + +Returns: + Tensor: A 2-D tensor with ones on the diagonal and zeros elsewhere + +Example:: + + >>> torch.eye(3) + tensor([[ 1., 0., 0.], + [ 0., 1., 0.], + [ 0., 0., 1.]]) +""".format( + **factory_common_args + ), +) + +add_docstr( + torch.floor, + r""" +floor(input, *, out=None) -> Tensor + +Returns a new tensor with the floor of the elements of :attr:`input`, +the largest integer less than or equal to each element. + +For integer inputs, follows the array-api convention of returning a +copy of the input tensor. + +.. math:: + \text{out}_{i} = \left\lfloor \text{input}_{i} \right\rfloor +""" + + r""" +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-0.8166, 1.5308, -0.2530, -0.2091]) + >>> torch.floor(a) + tensor([-1., 1., -1., -1.]) +""".format( + **common_args + ), +) + +add_docstr( + torch.floor_divide, + r""" +floor_divide(input, other, *, out=None) -> Tensor + +.. note:: + + Before PyTorch 1.13 :func:`torch.floor_divide` incorrectly performed + truncation division. To restore the previous behavior use + :func:`torch.div` with ``rounding_mode='trunc'``. + +Computes :attr:`input` divided by :attr:`other`, elementwise, and floors +the result. + +.. math:: + \text{{out}}_i = \text{floor} \left( \frac{{\text{{input}}_i}}{{\text{{other}}_i}} \right) + +""" + + r""" + +Supports broadcasting to a common shape, type promotion, and integer and float inputs. + +Args: + input (Tensor or Number): the dividend + other (Tensor or Number): the divisor + +Keyword args: + {out} + +Example:: + + >>> a = torch.tensor([4.0, 3.0]) + >>> b = torch.tensor([2.0, 2.0]) + >>> torch.floor_divide(a, b) + tensor([2.0, 1.0]) + >>> torch.floor_divide(a, 1.4) + tensor([2.0, 2.0]) +""".format( + **common_args + ), +) + +add_docstr( + torch.fmod, + r""" +fmod(input, other, *, out=None) -> Tensor + +Applies C++'s `std::fmod `_ entrywise. +The result has the same sign as the dividend :attr:`input` and its absolute value +is less than that of :attr:`other`. + +This function may be defined in terms of :func:`torch.div` as + +.. 
code:: python + + torch.fmod(a, b) == a - a.div(b, rounding_mode="trunc") * b + +Supports :ref:`broadcasting to a common shape `, +:ref:`type promotion `, and integer and float inputs. + +.. note:: + + When the divisor is zero, returns ``NaN`` for floating point dtypes + on both CPU and GPU; raises ``RuntimeError`` for integer division by + zero on CPU; Integer division by zero on GPU may return any value. + +.. note:: + + Complex inputs are not supported. In some cases, it is not mathematically + possible to satisfy the definition of a modulo operation with complex numbers. + +.. seealso:: + + :func:`torch.remainder` which implements Python's modulus operator. + This one is defined using division rounding down the result. + +Args: + input (Tensor): the dividend + other (Tensor or Scalar): the divisor + +Keyword args: + {out} + +Example:: + + >>> torch.fmod(torch.tensor([-3., -2, -1, 1, 2, 3]), 2) + tensor([-1., -0., -1., 1., 0., 1.]) + >>> torch.fmod(torch.tensor([1, 2, 3, 4, 5]), -1.5) + tensor([1.0000, 0.5000, 0.0000, 1.0000, 0.5000]) + +""".format( + **common_args + ), +) + +add_docstr( + torch.frac, + r""" +frac(input, *, out=None) -> Tensor + +Computes the fractional portion of each element in :attr:`input`. + +.. math:: + \text{out}_{i} = \text{input}_{i} - \left\lfloor |\text{input}_{i}| \right\rfloor * \operatorname{sgn}(\text{input}_{i}) + +Example:: + + >>> torch.frac(torch.tensor([1, 2.5, -3.2])) + tensor([ 0.0000, 0.5000, -0.2000]) +""", +) + +add_docstr( + torch.frexp, + r""" +frexp(input, *, out=None) -> (Tensor mantissa, Tensor exponent) + +Decomposes :attr:`input` into mantissa and exponent tensors +such that :math:`\text{input} = \text{mantissa} \times 2^{\text{exponent}}`. + +The range of mantissa is the open interval (-1, 1). + +Supports float inputs. + +Args: + input (Tensor): the input tensor + + +Keyword args: + out (tuple, optional): the output tensors + +Example:: + + >>> x = torch.arange(9.) + >>> mantissa, exponent = torch.frexp(x) + >>> mantissa + tensor([0.0000, 0.5000, 0.5000, 0.7500, 0.5000, 0.6250, 0.7500, 0.8750, 0.5000]) + >>> exponent + tensor([0, 1, 2, 2, 3, 3, 3, 3, 4], dtype=torch.int32) + >>> torch.ldexp(mantissa, exponent) + tensor([0., 1., 2., 3., 4., 5., 6., 7., 8.]) +""", +) + +add_docstr( + torch.from_numpy, + r""" +from_numpy(ndarray) -> Tensor + +Creates a :class:`Tensor` from a :class:`numpy.ndarray`. + +The returned tensor and :attr:`ndarray` share the same memory. Modifications to +the tensor will be reflected in the :attr:`ndarray` and vice versa. The returned +tensor is not resizable. + +It currently accepts :attr:`ndarray` with dtypes of ``numpy.float64``, +``numpy.float32``, ``numpy.float16``, ``numpy.complex64``, ``numpy.complex128``, +``numpy.int64``, ``numpy.int32``, ``numpy.int16``, ``numpy.int8``, ``numpy.uint8``, +and ``bool``. + +.. warning:: + Writing to a tensor created from a read-only NumPy array is not supported and will result in undefined behavior. + +Example:: + + >>> a = numpy.array([1, 2, 3]) + >>> t = torch.from_numpy(a) + >>> t + tensor([ 1, 2, 3]) + >>> t[0] = -1 + >>> a + array([-1, 2, 3]) +""", +) + +add_docstr( + torch.frombuffer, + r""" +frombuffer(buffer, *, dtype, count=-1, offset=0, requires_grad=False) -> Tensor + +Creates a 1-dimensional :class:`Tensor` from an object that implements +the Python buffer protocol. + +Skips the first :attr:`offset` bytes in the buffer, and interprets the rest of +the raw bytes as a 1-dimensional tensor of type :attr:`dtype` with :attr:`count` +elements. 
+ +Note that either of the following must be true: + +1. :attr:`count` is a positive non-zero number, and the total number of bytes +in the buffer is more than :attr:`offset` plus :attr:`count` times the size +(in bytes) of :attr:`dtype`. + +2. :attr:`count` is negative, and the length (number of bytes) of the buffer +subtracted by the :attr:`offset` is a multiple of the size (in bytes) of +:attr:`dtype`. + +The returned tensor and buffer share the same memory. Modifications to +the tensor will be reflected in the buffer and vice versa. The returned +tensor is not resizable. + +.. note:: + This function increments the reference count for the object that + owns the shared memory. Therefore, such memory will not be deallocated + before the returned tensor goes out of scope. + +.. warning:: + This function's behavior is undefined when passed an object implementing + the buffer protocol whose data is not on the CPU. Doing so is likely to + cause a segmentation fault. + +.. warning:: + This function does not try to infer the :attr:`dtype` (hence, it is not + optional). Passing a different :attr:`dtype` than its source may result + in unexpected behavior. + +Args: + buffer (object): a Python object that exposes the buffer interface. + +Keyword args: + dtype (:class:`torch.dtype`): the desired data type of returned tensor. + count (int, optional): the number of desired elements to be read. + If negative, all the elements (until the end of the buffer) will be + read. Default: -1. + offset (int, optional): the number of bytes to skip at the start of + the buffer. Default: 0. + {requires_grad} + +Example:: + + >>> import array + >>> a = array.array('i', [1, 2, 3]) + >>> t = torch.frombuffer(a, dtype=torch.int32) + >>> t + tensor([ 1, 2, 3]) + >>> t[0] = -1 + >>> a + array([-1, 2, 3]) + + >>> # Interprets the signed char bytes as 32-bit integers. + >>> # Each 4 signed char elements will be interpreted as + >>> # 1 signed 32-bit integer. + >>> import array + >>> a = array.array('b', [-1, 0, 0, 0]) + >>> torch.frombuffer(a, dtype=torch.int32) + tensor([255], dtype=torch.int32) +""".format( + **factory_common_args + ), +) + +add_docstr( + torch.from_file, + r""" +from_file(filename, shared=None, size=0, *, dtype=None, layout=None, device=None, pin_memory=False) + +Creates a CPU tensor with a storage backed by a memory-mapped file. + +If ``shared`` is True, then memory is shared between processes. All changes are written to the file. +If ``shared`` is False, then changes to the tensor do not affect the file. + +``size`` is the number of elements in the Tensor. If ``shared`` is ``False``, then the file must contain +at least ``size * sizeof(dtype)`` bytes. If ``shared`` is ``True`` the file will be created if needed. + +.. note:: + Only CPU tensors can be mapped to files. + +.. note:: + For now, tensors with storages backed by a memory-mapped file cannot be created in pinned memory. 
+ + +Args: + filename (str): file name to map + shared (bool): whether to share memory (whether ``MAP_SHARED`` or ``MAP_PRIVATE`` is passed to the + underlying `mmap(2) call `_) + size (int): number of elements in the tensor + +Keyword args: + {dtype} + {layout} + {device} + {pin_memory} + +Example:: + >>> t = torch.randn(2, 5, dtype=torch.float64) + >>> t.numpy().tofile('storage.pt') + >>> t_mapped = torch.from_file('storage.pt', shared=False, size=10, dtype=torch.float64) + """.format( + **factory_common_args + ), +) + +add_docstr( + torch.flatten, + r""" +flatten(input, start_dim=0, end_dim=-1) -> Tensor + +Flattens :attr:`input` by reshaping it into a one-dimensional tensor. If :attr:`start_dim` or :attr:`end_dim` +are passed, only dimensions starting with :attr:`start_dim` and ending with :attr:`end_dim` are flattened. +The order of elements in :attr:`input` is unchanged. + +Unlike NumPy's flatten, which always copies input's data, this function may return the original object, a view, +or copy. If no dimensions are flattened, then the original object :attr:`input` is returned. Otherwise, if input can +be viewed as the flattened shape, then that view is returned. Finally, only if the input cannot be viewed as the +flattened shape is input's data copied. See :meth:`torch.Tensor.view` for details on when a view will be returned. + +.. note:: + Flattening a zero-dimensional tensor will return a one-dimensional view. + +Args: + {input} + start_dim (int): the first dim to flatten + end_dim (int): the last dim to flatten + +Example:: + + >>> t = torch.tensor([[[1, 2], + ... [3, 4]], + ... [[5, 6], + ... [7, 8]]]) + >>> torch.flatten(t) + tensor([1, 2, 3, 4, 5, 6, 7, 8]) + >>> torch.flatten(t, start_dim=1) + tensor([[1, 2, 3, 4], + [5, 6, 7, 8]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.unflatten, + r""" +unflatten(input, dim, sizes) -> Tensor + +Expands a dimension of the input tensor over multiple dimensions. + +.. seealso:: + + :func:`torch.flatten` the inverse of this function. It coalesces several dimensions into one. + +Args: + {input} + dim (int): Dimension to be unflattened, specified as an index into + ``input.shape``. + sizes (Tuple[int]): New shape of the unflattened dimension. + One of its elements can be `-1` in which case the corresponding output + dimension is inferred. Otherwise, the product of ``sizes`` *must* + equal ``input.shape[dim]``. + +Returns: + A View of input with the specified dimension unflattened. + +Examples:: + >>> torch.unflatten(torch.randn(3, 4, 1), 1, (2, 2)).shape + torch.Size([3, 2, 2, 1]) + >>> torch.unflatten(torch.randn(3, 4, 1), 1, (-1, 2)).shape + torch.Size([3, 2, 2, 1]) + >>> torch.unflatten(torch.randn(5, 12, 3), -2, (2, 2, 3, 1, 1)).shape + torch.Size([5, 2, 2, 3, 1, 1, 3]) +""".format( + **common_args + ), +) + +add_docstr( + torch.gather, + r""" +gather(input, dim, index, *, sparse_grad=False, out=None) -> Tensor + +Gathers values along an axis specified by `dim`. + +For a 3-D tensor the output is specified by:: + + out[i][j][k] = input[index[i][j][k]][j][k] # if dim == 0 + out[i][j][k] = input[i][index[i][j][k]][k] # if dim == 1 + out[i][j][k] = input[i][j][index[i][j][k]] # if dim == 2 + +:attr:`input` and :attr:`index` must have the same number of dimensions. +It is also required that ``index.size(d) <= input.size(d)`` for all +dimensions ``d != dim``. :attr:`out` will have the same shape as :attr:`index`. +Note that ``input`` and ``index`` do not broadcast against each other. 
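+
+For example, in the common 2-D case the rule above reduces to::
+
+    out[i][j] = input[index[i][j]][j]  # if dim == 0
+    out[i][j] = input[i][index[i][j]]  # if dim == 1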
+ +Args: + input (Tensor): the source tensor + dim (int): the axis along which to index + index (LongTensor): the indices of elements to gather + +Keyword arguments: + sparse_grad (bool, optional): If ``True``, gradient w.r.t. :attr:`input` will be a sparse tensor. + out (Tensor, optional): the destination tensor + +Example:: + + >>> t = torch.tensor([[1, 2], [3, 4]]) + >>> torch.gather(t, 1, torch.tensor([[0, 0], [1, 0]])) + tensor([[ 1, 1], + [ 4, 3]]) +""", +) + + +add_docstr( + torch.gcd, + r""" +gcd(input, other, *, out=None) -> Tensor + +Computes the element-wise greatest common divisor (GCD) of :attr:`input` and :attr:`other`. + +Both :attr:`input` and :attr:`other` must have integer types. + +.. note:: + This defines :math:`gcd(0, 0) = 0`. + +Args: + {input} + other (Tensor): the second input tensor + +Keyword arguments: + {out} + +Example:: + + >>> a = torch.tensor([5, 10, 15]) + >>> b = torch.tensor([3, 4, 5]) + >>> torch.gcd(a, b) + tensor([1, 2, 5]) + >>> c = torch.tensor([3]) + >>> torch.gcd(a, c) + tensor([1, 1, 3]) +""".format( + **common_args + ), +) + +add_docstr( + torch.ge, + r""" +ge(input, other, *, out=None) -> Tensor + +Computes :math:`\text{input} \geq \text{other}` element-wise. +""" + + r""" + +The second argument can be a number or a tensor whose shape is +:ref:`broadcastable ` with the first argument. + +Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + +Keyword args: + {out} + +Returns: + A boolean tensor that is True where :attr:`input` is greater than or equal to :attr:`other` and False elsewhere + +Example:: + + >>> torch.ge(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[True, True], [False, True]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.greater_equal, + r""" +greater_equal(input, other, *, out=None) -> Tensor + +Alias for :func:`torch.ge`. +""", +) + +add_docstr( + torch.gradient, + r""" +gradient(input, *, spacing=1, dim=None, edge_order=1) -> List of Tensors + +Estimates the gradient of a function :math:`g : \mathbb{R}^n \rightarrow \mathbb{R}` in +one or more dimensions using the `second-order accurate central differences method +`_ and +either first or second order estimates at the boundaries. + +The gradient of :math:`g` is estimated using samples. By default, when :attr:`spacing` is not +specified, the samples are entirely described by :attr:`input`, and the mapping of input coordinates +to an output is the same as the tensor's mapping of indices to values. For example, for a three-dimensional +:attr:`input` the function described is :math:`g : \mathbb{R}^3 \rightarrow \mathbb{R}`, and +:math:`g(1, 2, 3)\ == input[1, 2, 3]`. + +When :attr:`spacing` is specified, it modifies the relationship between :attr:`input` and input coordinates. +This is detailed in the "Keyword Arguments" section below. + +The gradient is estimated by estimating each partial derivative of :math:`g` independently. This estimation is +accurate if :math:`g` is in :math:`C^3` (it has at least 3 continuous derivatives), and the estimation can be +improved by providing closer samples. Mathematically, the value at each interior point of a partial derivative +is estimated using `Taylor's theorem with remainder `_. +Letting :math:`x` be an interior point with :math:`x-h_l` and :math:`x+h_r` be points neighboring +it to the left and right respectively, :math:`f(x+h_r)` and :math:`f(x-h_l)` can be estimated using: + +.. 
math:: + \begin{aligned} + f(x+h_r) = f(x) + h_r f'(x) + {h_r}^2 \frac{f''(x)}{2} + {h_r}^3 \frac{f'''(\xi_1)}{6}, \xi_1 \in (x, x+h_r) \\ + f(x-h_l) = f(x) - h_l f'(x) + {h_l}^2 \frac{f''(x)}{2} - {h_l}^3 \frac{f'''(\xi_2)}{6}, \xi_2 \in (x, x-h_l) \\ + \end{aligned} + +Using the fact that :math:`f \in C^3` and solving the linear system, we derive: + +.. math:: + f'(x) \approx \frac{ {h_l}^2 f(x+h_r) - {h_r}^2 f(x-h_l) + + ({h_r}^2-{h_l}^2 ) f(x) }{ {h_r} {h_l}^2 + {h_r}^2 {h_l} } + +.. note:: + We estimate the gradient of functions in complex domain + :math:`g : \mathbb{C}^n \rightarrow \mathbb{C}` in the same way. + +The value of each partial derivative at the boundary points is computed differently. See edge_order below. + +Args: + input (``Tensor``): the tensor that represents the values of the function + +Keyword args: + spacing (``scalar``, ``list of scalar``, ``list of Tensor``, optional): :attr:`spacing` can be used to modify + how the :attr:`input` tensor's indices relate to sample coordinates. If :attr:`spacing` is a scalar then + the indices are multiplied by the scalar to produce the coordinates. For example, if :attr:`spacing=2` the + indices (1, 2, 3) become coordinates (2, 4, 6). If :attr:`spacing` is a list of scalars then the corresponding + indices are multiplied. For example, if :attr:`spacing=(2, -1, 3)` the indices (1, 2, 3) become coordinates (2, -2, 9). + Finally, if :attr:`spacing` is a list of one-dimensional tensors then each tensor specifies the coordinates for + the corresponding dimension. For example, if the indices are (1, 2, 3) and the tensors are (t0, t1, t2), then + the coordinates are (t0[1], t1[2], t2[3]) + + dim (``int``, ``list of int``, optional): the dimension or dimensions to approximate the gradient over. By default + the partial gradient in every dimension is computed. Note that when :attr:`dim` is specified the elements of + the :attr:`spacing` argument must correspond with the specified dims." + + edge_order (``int``, optional): 1 or 2, for `first-order + `_ or + `second-order `_ + estimation of the boundary ("edge") values, respectively. + +Examples:: + + >>> # Estimates the gradient of f(x)=x^2 at points [-2, -1, 2, 4] + >>> coordinates = (torch.tensor([-2., -1., 1., 4.]),) + >>> values = torch.tensor([4., 1., 1., 16.], ) + >>> torch.gradient(values, spacing = coordinates) + (tensor([-3., -2., 2., 5.]),) + + >>> # Estimates the gradient of the R^2 -> R function whose samples are + >>> # described by the tensor t. Implicit coordinates are [0, 1] for the outermost + >>> # dimension and [0, 1, 2, 3] for the innermost dimension, and function estimates + >>> # partial derivative for both dimensions. + >>> t = torch.tensor([[1, 2, 4, 8], [10, 20, 40, 80]]) + >>> torch.gradient(t) + (tensor([[ 9., 18., 36., 72.], + [ 9., 18., 36., 72.]]), + tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]])) + + >>> # A scalar value for spacing modifies the relationship between tensor indices + >>> # and input coordinates by multiplying the indices to find the + >>> # coordinates. For example, below the indices of the innermost + >>> # 0, 1, 2, 3 translate to coordinates of [0, 2, 4, 6], and the indices of + >>> # the outermost dimension 0, 1 translate to coordinates of [0, 2]. 
+ >>> torch.gradient(t, spacing = 2.0) # dim = None (implicitly [0, 1]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.5000, 0.7500, 1.5000, 2.0000], + [ 5.0000, 7.5000, 15.0000, 20.0000]])) + >>> # doubling the spacing between samples halves the estimated partial gradients. + + >>> + >>> # Estimates only the partial derivative for dimension 1 + >>> torch.gradient(t, dim = 1) # spacing = None (implicitly 1.) + (tensor([[ 1.0000, 1.5000, 3.0000, 4.0000], + [10.0000, 15.0000, 30.0000, 40.0000]]),) + + >>> # When spacing is a list of scalars, the relationship between the tensor + >>> # indices and input coordinates changes based on dimension. + >>> # For example, below, the indices of the innermost dimension 0, 1, 2, 3 translate + >>> # to coordinates of [0, 3, 6, 9], and the indices of the outermost dimension + >>> # 0, 1 translate to coordinates of [0, 2]. + >>> torch.gradient(t, spacing = [3., 2.]) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + + >>> # The following example is a replication of the previous one with explicit + >>> # coordinates. + >>> coords = (torch.tensor([0, 2]), torch.tensor([0, 3, 6, 9])) + >>> torch.gradient(t, spacing = coords) + (tensor([[ 4.5000, 9.0000, 18.0000, 36.0000], + [ 4.5000, 9.0000, 18.0000, 36.0000]]), + tensor([[ 0.3333, 0.5000, 1.0000, 1.3333], + [ 3.3333, 5.0000, 10.0000, 13.3333]])) + +""", +) + +add_docstr( + torch.geqrf, + r""" +geqrf(input, *, out=None) -> (Tensor, Tensor) + +This is a low-level function for calling LAPACK's geqrf directly. This function +returns a namedtuple (a, tau) as defined in `LAPACK documentation for geqrf`_ . + +Computes a QR decomposition of :attr:`input`. +Both `Q` and `R` matrices are stored in the same output tensor `a`. +The elements of `R` are stored on and above the diagonal. +Elementary reflectors (or Householder vectors) implicitly defining matrix `Q` +are stored below the diagonal. +The results of this function can be used together with :func:`torch.linalg.householder_product` +to obtain the `Q` matrix or +with :func:`torch.ormqr`, which uses an implicit representation of the `Q` matrix, +for an efficient matrix-matrix multiplication. + +See `LAPACK documentation for geqrf`_ for further details. + +.. note:: + See also :func:`torch.linalg.qr`, which computes Q and R matrices, and :func:`torch.linalg.lstsq` + with the ``driver="gels"`` option for a function that can solve matrix equations using a QR decomposition. + +Args: + input (Tensor): the input matrix + +Keyword args: + out (tuple, optional): the output tuple of (Tensor, Tensor). Ignored if `None`. Default: `None`. + +.. _LAPACK documentation for geqrf: + http://www.netlib.org/lapack/explore-html/df/dc5/group__variants_g_ecomputational_ga3766ea903391b5cf9008132f7440ec7b.html + +""", +) + +add_docstr( + torch.inner, + r""" +inner(input, other, *, out=None) -> Tensor + +Computes the dot product for 1D tensors. For higher dimensions, sums the product +of elements from :attr:`input` and :attr:`other` along their last dimension. + +.. note:: + + If either :attr:`input` or :attr:`other` is a scalar, the result is equivalent + to `torch.mul(input, other)`. 
+ + If both :attr:`input` and :attr:`other` are non-scalars, the size of their last + dimension must match and the result is equivalent to `torch.tensordot(input, + other, dims=([-1], [-1]))` + +Args: + input (Tensor): First input tensor + other (Tensor): Second input tensor + +Keyword args: + out (Tensor, optional): Optional output tensor to write result into. The output + shape is `input.shape[:-1] + other.shape[:-1]`. + +Example:: + + # Dot product + >>> torch.inner(torch.tensor([1, 2, 3]), torch.tensor([0, 2, 1])) + tensor(7) + + # Multidimensional input tensors + >>> a = torch.randn(2, 3) + >>> a + tensor([[0.8173, 1.0874, 1.1784], + [0.3279, 0.1234, 2.7894]]) + >>> b = torch.randn(2, 4, 3) + >>> b + tensor([[[-0.4682, -0.7159, 0.1506], + [ 0.4034, -0.3657, 1.0387], + [ 0.9892, -0.6684, 0.1774], + [ 0.9482, 1.3261, 0.3917]], + + [[ 0.4537, 0.7493, 1.1724], + [ 0.2291, 0.5749, -0.2267], + [-0.7920, 0.3607, -0.3701], + [ 1.3666, -0.5850, -1.7242]]]) + >>> torch.inner(a, b) + tensor([[[-0.9837, 1.1560, 0.2907, 2.6785], + [ 2.5671, 0.5452, -0.6912, -1.5509]], + + [[ 0.1782, 2.9843, 0.7366, 1.5672], + [ 3.5115, -0.4864, -1.2476, -4.4337]]]) + + # Scalar input + >>> torch.inner(a, torch.tensor(2)) + tensor([[1.6347, 2.1748, 2.3567], + [0.6558, 0.2469, 5.5787]]) +""", +) + +add_docstr( + torch.outer, + r""" +outer(input, vec2, *, out=None) -> Tensor + +Outer product of :attr:`input` and :attr:`vec2`. +If :attr:`input` is a vector of size :math:`n` and :attr:`vec2` is a vector of +size :math:`m`, then :attr:`out` must be a matrix of size :math:`(n \times m)`. + +.. note:: This function does not :ref:`broadcast `. + +Args: + input (Tensor): 1-D input vector + vec2 (Tensor): 1-D input vector + +Keyword args: + out (Tensor, optional): optional output matrix + +Example:: + + >>> v1 = torch.arange(1., 5.) + >>> v2 = torch.arange(1., 4.) + >>> torch.outer(v1, v2) + tensor([[ 1., 2., 3.], + [ 2., 4., 6.], + [ 3., 6., 9.], + [ 4., 8., 12.]]) +""", +) + +add_docstr( + torch.ger, + r""" +ger(input, vec2, *, out=None) -> Tensor + +Alias of :func:`torch.outer`. + +.. warning:: + This function is deprecated and will be removed in a future PyTorch release. + Use :func:`torch.outer` instead. +""", +) + +add_docstr( + torch.get_default_dtype, + r""" +get_default_dtype() -> torch.dtype + +Get the current default floating point :class:`torch.dtype`. + +Example:: + + >>> torch.get_default_dtype() # initial default for floating point is torch.float32 + torch.float32 + >>> torch.set_default_dtype(torch.float64) + >>> torch.get_default_dtype() # default is now changed to torch.float64 + torch.float64 + +""", +) + +add_docstr( + torch.get_num_threads, + r""" +get_num_threads() -> int + +Returns the number of threads used for parallelizing CPU operations +""", +) + +add_docstr( + torch.get_num_interop_threads, + r""" +get_num_interop_threads() -> int + +Returns the number of threads used for inter-op parallelism on CPU +(e.g. in JIT interpreter) +""", +) + +add_docstr( + torch.gt, + r""" +gt(input, other, *, out=None) -> Tensor + +Computes :math:`\text{input} > \text{other}` element-wise. +""" + + r""" + +The second argument can be a number or a tensor whose shape is +:ref:`broadcastable ` with the first argument. 
+ +Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + +Keyword args: + {out} + +Returns: + A boolean tensor that is True where :attr:`input` is greater than :attr:`other` and False elsewhere + +Example:: + + >>> torch.gt(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[False, True], [False, False]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.greater, + r""" +greater(input, other, *, out=None) -> Tensor + +Alias for :func:`torch.gt`. +""", +) + +add_docstr( + torch.histc, + r""" +histc(input, bins=100, min=0, max=0, *, out=None) -> Tensor + +Computes the histogram of a tensor. + +The elements are sorted into equal width bins between :attr:`min` and +:attr:`max`. If :attr:`min` and :attr:`max` are both zero, the minimum and +maximum values of the data are used. + +Elements lower than min and higher than max and ``NaN`` elements are ignored. + +Args: + {input} + bins (int): number of histogram bins + min (Scalar): lower end of the range (inclusive) + max (Scalar): upper end of the range (inclusive) + +Keyword args: + {out} + +Returns: + Tensor: Histogram represented as a tensor + +Example:: + + >>> torch.histc(torch.tensor([1., 2, 1]), bins=4, min=0, max=3) + tensor([ 0., 2., 1., 0.]) +""".format( + **common_args + ), +) + +add_docstr( + torch.histogram, + r""" +histogram(input, bins, *, range=None, weight=None, density=False, out=None) -> (Tensor, Tensor) + +Computes a histogram of the values in a tensor. + +:attr:`bins` can be an integer or a 1D tensor. + +If :attr:`bins` is an int, it specifies the number of equal-width bins. +By default, the lower and upper range of the bins is determined by the +minimum and maximum elements of the input tensor. The :attr:`range` +argument can be provided to specify a range for the bins. + +If :attr:`bins` is a 1D tensor, it specifies the sequence of bin edges +including the rightmost edge. It should contain at least 2 elements +and its elements should be increasing. + +Args: + {input} + bins: int or 1D Tensor. If int, defines the number of equal-width bins. If tensor, + defines the sequence of bin edges including the rightmost edge. + +Keyword args: + range (tuple of float): Defines the range of the bins. + weight (Tensor): If provided, weight should have the same shape as input. Each value in + input contributes its associated weight towards its bin's result. + density (bool): If False, the result will contain the count (or total weight) in each bin. + If True, the result is the value of the probability density function over the bins, + normalized such that the integral over the range of the bins is 1. + {out} (tuple, optional): The result tuple of two output tensors (hist, bin_edges). + +Returns: + hist (Tensor): 1D Tensor containing the values of the histogram. + bin_edges(Tensor): 1D Tensor containing the edges of the histogram bins. + +Example:: + + >>> torch.histogram(torch.tensor([1., 2, 1]), bins=4, range=(0., 3.), weight=torch.tensor([1., 2., 4.])) + (tensor([ 0., 5., 2., 0.]), tensor([0., 0.75, 1.5, 2.25, 3.])) + >>> torch.histogram(torch.tensor([1., 2, 1]), bins=4, range=(0., 3.), weight=torch.tensor([1., 2., 4.]), density=True) + (tensor([ 0., 0.9524, 0.3810, 0.]), tensor([0., 0.75, 1.5, 2.25, 3.])) +""".format( + **common_args + ), +) + +add_docstr( + torch.histogramdd, + r""" +histogramdd(input, bins, *, range=None, weight=None, density=False, out=None) -> (Tensor, Tensor[]) + +Computes a multi-dimensional histogram of the values in a tensor. 
+ +Interprets the elements of an input tensor whose innermost dimension has size N +as a collection of N-dimensional points. Maps each of the points into a set of +N-dimensional bins and returns the number of points (or total weight) in each bin. + +:attr:`input` must be a tensor with at least 2 dimensions. +If input has shape (M, N), each of its M rows defines a point in N-dimensional space. +If input has three or more dimensions, all but the last dimension are flattened. + +Each dimension is independently associated with its own strictly increasing sequence +of bin edges. Bin edges may be specified explicitly by passing a sequence of 1D +tensors. Alternatively, bin edges may be constructed automatically by passing a +sequence of integers specifying the number of equal-width bins in each dimension. + +For each N-dimensional point in input: + - Each of its coordinates is binned independently among the bin edges + corresponding to its dimension + - Binning results are combined to identify the N-dimensional bin (if any) + into which the point falls + - If the point falls into a bin, the bin's count (or total weight) is incremented + - Points which do not fall into any bin do not contribute to the output + +:attr:`bins` can be a sequence of N 1D tensors, a sequence of N ints, or a single int. + +If :attr:`bins` is a sequence of N 1D tensors, it explicitly specifies the N sequences +of bin edges. Each 1D tensor should contain a strictly increasing sequence with at +least one element. A sequence of K bin edges defines K-1 bins, explicitly specifying +the left and right edges of all bins. Every bin is exclusive of its left edge. Only +the rightmost bin is inclusive of its right edge. + +If :attr:`bins` is a sequence of N ints, it specifies the number of equal-width bins +in each dimension. By default, the leftmost and rightmost bin edges in each dimension +are determined by the minimum and maximum elements of the input tensor in the +corresponding dimension. The :attr:`range` argument can be provided to manually +specify the leftmost and rightmost bin edges in each dimension. + +If :attr:`bins` is an int, it specifies the number of equal-width bins for all dimensions. + +.. note:: + See also :func:`torch.histogram`, which specifically computes 1D histograms. + While :func:`torch.histogramdd` infers the dimensionality of its bins and + binned values from the shape of :attr:`input`, :func:`torch.histogram` + accepts and flattens :attr:`input` of any shape. + +Args: + {input} + bins: Tensor[], int[], or int. + If Tensor[], defines the sequences of bin edges. + If int[], defines the number of equal-width bins in each dimension. + If int, defines the number of equal-width bins for all dimensions. +Keyword args: + range (sequence of float): Defines the leftmost and rightmost bin edges + in each dimension. + weight (Tensor): By default, each value in the input has weight 1. If a weight + tensor is passed, each N-dimensional coordinate in input + contributes its associated weight towards its bin's result. + The weight tensor should have the same shape as the :attr:`input` + tensor excluding its innermost dimension N. + density (bool): If False (default), the result will contain the count (or total weight) + in each bin. If True, each count (weight) is divided by the total count + (total weight), then divided by the volume of its associated bin. +Returns: + hist (Tensor): N-dimensional Tensor containing the values of the histogram. + bin_edges(Tensor[]): sequence of N 1D Tensors containing the bin edges. 
+ +Example:: + >>> torch.histogramdd(torch.tensor([[0., 1.], [1., 0.], [2., 0.], [2., 2.]]), bins=[3, 3], + ... weight=torch.tensor([1., 2., 4., 8.])) + torch.return_types.histogramdd( + hist=tensor([[0., 1., 0.], + [2., 0., 0.], + [4., 0., 8.]]), + bin_edges=(tensor([0.0000, 0.6667, 1.3333, 2.0000]), + tensor([0.0000, 0.6667, 1.3333, 2.0000]))) + + >>> torch.histogramdd(torch.tensor([[0., 0.], [1., 1.], [2., 2.]]), bins=[2, 2], + ... range=[0., 1., 0., 1.], density=True) + torch.return_types.histogramdd( + hist=tensor([[2., 0.], + [0., 2.]]), + bin_edges=(tensor([0.0000, 0.5000, 1.0000]), + tensor([0.0000, 0.5000, 1.0000]))) + +""".format( + **common_args + ), +) +# TODO: Fix via https://github.com/pytorch/pytorch/issues/75798 +torch.histogramdd.__module__ = "torch" + +add_docstr( + torch.hypot, + r""" +hypot(input, other, *, out=None) -> Tensor + +Given the legs of a right triangle, return its hypotenuse. + +.. math:: + \text{out}_{i} = \sqrt{\text{input}_{i}^{2} + \text{other}_{i}^{2}} + +The shapes of ``input`` and ``other`` must be +:ref:`broadcastable `. +""" + + r""" +Args: + input (Tensor): the first input tensor + other (Tensor): the second input tensor + +Keyword args: + {out} + +Example:: + + >>> a = torch.hypot(torch.tensor([4.0]), torch.tensor([3.0, 4.0, 5.0])) + tensor([5.0000, 5.6569, 6.4031]) + +""".format( + **common_args + ), +) + +add_docstr( + torch.i0, + r""" +i0(input, *, out=None) -> Tensor + +Alias for :func:`torch.special.i0`. +""", +) + +add_docstr( + torch.igamma, + r""" +igamma(input, other, *, out=None) -> Tensor + +Alias for :func:`torch.special.gammainc`. +""", +) + +add_docstr( + torch.igammac, + r""" +igammac(input, other, *, out=None) -> Tensor + +Alias for :func:`torch.special.gammaincc`. +""", +) + +add_docstr( + torch.index_select, + r""" +index_select(input, dim, index, *, out=None) -> Tensor + +Returns a new tensor which indexes the :attr:`input` tensor along dimension +:attr:`dim` using the entries in :attr:`index` which is a `LongTensor`. + +The returned tensor has the same number of dimensions as the original tensor +(:attr:`input`). The :attr:`dim`\ th dimension has the same size as the length +of :attr:`index`; other dimensions have the same size as in the original tensor. + +.. note:: The returned tensor does **not** use the same storage as the original + tensor. If :attr:`out` has a different shape than expected, we + silently change it to the correct shape, reallocating the underlying + storage if necessary. + +Args: + {input} + dim (int): the dimension in which we index + index (IntTensor or LongTensor): the 1-D tensor containing the indices to index + +Keyword args: + {out} + +Example:: + + >>> x = torch.randn(3, 4) + >>> x + tensor([[ 0.1427, 0.0231, -0.5414, -1.0009], + [-0.4664, 0.2647, -0.1228, -1.1068], + [-1.1734, -0.6571, 0.7230, -0.6004]]) + >>> indices = torch.tensor([0, 2]) + >>> torch.index_select(x, 0, indices) + tensor([[ 0.1427, 0.0231, -0.5414, -1.0009], + [-1.1734, -0.6571, 0.7230, -0.6004]]) + >>> torch.index_select(x, 1, indices) + tensor([[ 0.1427, -0.5414], + [-0.4664, -0.1228], + [-1.1734, 0.7230]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.inverse, + r""" +inverse(input, *, out=None) -> Tensor + +Alias for :func:`torch.linalg.inv` +""", +) + +add_docstr( + torch.isin, + r""" +isin(elements, test_elements, *, assume_unique=False, invert=False) -> Tensor + +Tests if each element of :attr:`elements` is in :attr:`test_elements`. 
Returns +a boolean tensor of the same shape as :attr:`elements` that is True for elements +in :attr:`test_elements` and False otherwise. + +.. note:: + One of :attr:`elements` or :attr:`test_elements` can be a scalar, but not both. + +Args: + elements (Tensor or Scalar): Input elements + test_elements (Tensor or Scalar): Values against which to test for each input element + assume_unique (bool, optional): If True, assumes both :attr:`elements` and + :attr:`test_elements` contain unique elements, which can speed up the + calculation. Default: False + invert (bool, optional): If True, inverts the boolean return tensor, resulting in True + values for elements *not* in :attr:`test_elements`. Default: False + +Returns: + A boolean tensor of the same shape as :attr:`elements` that is True for elements in + :attr:`test_elements` and False otherwise + +Example: + >>> torch.isin(torch.tensor([[1, 2], [3, 4]]), torch.tensor([2, 3])) + tensor([[False, True], + [ True, False]]) +""", +) + +add_docstr( + torch.isinf, + r""" +isinf(input) -> Tensor + +Tests if each element of :attr:`input` is infinite +(positive or negative infinity) or not. + +.. note:: + Complex values are infinite when their real or imaginary part is + infinite. + +Args: + {input} + +Returns: + A boolean tensor that is True where :attr:`input` is infinite and False elsewhere + +Example:: + + >>> torch.isinf(torch.tensor([1, float('inf'), 2, float('-inf'), float('nan')])) + tensor([False, True, False, True, False]) +""".format( + **common_args + ), +) + +add_docstr( + torch.isposinf, + r""" +isposinf(input, *, out=None) -> Tensor +Tests if each element of :attr:`input` is positive infinity or not. + +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> a = torch.tensor([-float('inf'), float('inf'), 1.2]) + >>> torch.isposinf(a) + tensor([False, True, False]) +""".format( + **common_args + ), +) + +add_docstr( + torch.isneginf, + r""" +isneginf(input, *, out=None) -> Tensor +Tests if each element of :attr:`input` is negative infinity or not. + +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> a = torch.tensor([-float('inf'), float('inf'), 1.2]) + >>> torch.isneginf(a) + tensor([ True, False, False]) +""".format( + **common_args + ), +) + +add_docstr( + torch.isclose, + r""" +isclose(input, other, rtol=1e-05, atol=1e-08, equal_nan=False) -> Tensor + +Returns a new tensor with boolean elements representing if each element of +:attr:`input` is "close" to the corresponding element of :attr:`other`. +Closeness is defined as: + +.. math:: + \lvert \text{input} - \text{other} \rvert \leq \texttt{atol} + \texttt{rtol} \times \lvert \text{other} \rvert +""" + + r""" + +where :attr:`input` and :attr:`other` are finite. Where :attr:`input` +and/or :attr:`other` are nonfinite they are close if and only if +they are equal, with NaNs being considered equal to each other when +:attr:`equal_nan` is True. + +Args: + input (Tensor): first tensor to compare + other (Tensor): second tensor to compare + atol (float, optional): absolute tolerance. Default: 1e-08 + rtol (float, optional): relative tolerance. Default: 1e-05 + equal_nan (bool, optional): if ``True``, then two ``NaN`` s will be considered equal. 
Default: ``False`` + +Examples:: + + >>> torch.isclose(torch.tensor((1., 2, 3)), torch.tensor((1 + 1e-10, 3, 4))) + tensor([ True, False, False]) + >>> torch.isclose(torch.tensor((float('inf'), 4)), torch.tensor((float('inf'), 6)), rtol=.5) + tensor([True, True]) +""", +) + +add_docstr( + torch.isfinite, + r""" +isfinite(input) -> Tensor + +Returns a new tensor with boolean elements representing if each element is `finite` or not. + +Real values are finite when they are not NaN, negative infinity, or infinity. +Complex values are finite when both their real and imaginary parts are finite. + +Args: + {input} + +Returns: + A boolean tensor that is True where :attr:`input` is finite and False elsewhere + +Example:: + + >>> torch.isfinite(torch.tensor([1, float('inf'), 2, float('-inf'), float('nan')])) + tensor([True, False, True, False, False]) +""".format( + **common_args + ), +) + +add_docstr( + torch.isnan, + r""" +isnan(input) -> Tensor + +Returns a new tensor with boolean elements representing if each element of :attr:`input` +is NaN or not. Complex values are considered NaN when either their real +and/or imaginary part is NaN. + +Arguments: + {input} + +Returns: + A boolean tensor that is True where :attr:`input` is NaN and False elsewhere + +Example:: + + >>> torch.isnan(torch.tensor([1, float('nan'), 2])) + tensor([False, True, False]) +""".format( + **common_args + ), +) + +add_docstr( + torch.isreal, + r""" +isreal(input) -> Tensor + +Returns a new tensor with boolean elements representing if each element of :attr:`input` is real-valued or not. +All real-valued types are considered real. Complex values are considered real when their imaginary part is 0. + +Arguments: + {input} + +Returns: + A boolean tensor that is True where :attr:`input` is real and False elsewhere + +Example:: + + >>> torch.isreal(torch.tensor([1, 1+1j, 2+0j])) + tensor([True, False, True]) +""".format( + **common_args + ), +) + +add_docstr( + torch.is_floating_point, + r""" +is_floating_point(input) -> (bool) + +Returns True if the data type of :attr:`input` is a floating point data type i.e., +one of ``torch.float64``, ``torch.float32``, ``torch.float16``, and ``torch.bfloat16``. + +Args: + {input} +""".format( + **common_args + ), +) + +add_docstr( + torch.is_complex, + r""" +is_complex(input) -> (bool) + +Returns True if the data type of :attr:`input` is a complex data type i.e., +one of ``torch.complex64``, and ``torch.complex128``. + +Args: + {input} +""".format( + **common_args + ), +) + +add_docstr( + torch.is_grad_enabled, + r""" +is_grad_enabled() -> (bool) + +Returns True if grad mode is currently enabled. +""".format( + **common_args + ), +) + +add_docstr( + torch.is_inference_mode_enabled, + r""" +is_inference_mode_enabled() -> (bool) + +Returns True if inference mode is currently enabled. +""".format( + **common_args + ), +) + +add_docstr( + torch.is_inference, + r""" +is_inference(input) -> (bool) + +Returns True if :attr:`input` is an inference tensor. + +A non-view tensor is an inference tensor if and only if it was +allocated during inference mode. A view tensor is an inference +tensor if and only if the tensor it is a view of is an inference tensor. + +For details on inference mode please see +`Inference Mode `_. + +Args: + {input} +""".format( + **common_args + ), +) + +add_docstr( + torch.is_conj, + r""" +is_conj(input) -> (bool) + +Returns True if the :attr:`input` is a conjugated tensor, i.e. its conjugate bit is set to `True`. 
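+
+.. note::
+    A quick illustrative check (assuming a complex input, for which :func:`torch.conj` returns a lazily conjugated view)::
+
+        >>> x = torch.tensor([1 + 1j])
+        >>> torch.is_conj(torch.conj(x))
+        True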
+ +Args: + {input} +""".format( + **common_args + ), +) + +add_docstr( + torch.is_nonzero, + r""" +is_nonzero(input) -> (bool) + +Returns True if the :attr:`input` is a single element tensor which is not equal to zero +after type conversions. +i.e. not equal to ``torch.tensor([0.])`` or ``torch.tensor([0])`` or +``torch.tensor([False])``. +Throws a ``RuntimeError`` if ``torch.numel() != 1`` (even in case +of sparse tensors). + +Args: + {input} + +Examples:: + + >>> torch.is_nonzero(torch.tensor([0.])) + False + >>> torch.is_nonzero(torch.tensor([1.5])) + True + >>> torch.is_nonzero(torch.tensor([False])) + False + >>> torch.is_nonzero(torch.tensor([3])) + True + >>> torch.is_nonzero(torch.tensor([1, 3, 5])) + Traceback (most recent call last): + ... + RuntimeError: bool value of Tensor with more than one value is ambiguous + >>> torch.is_nonzero(torch.tensor([])) + Traceback (most recent call last): + ... + RuntimeError: bool value of Tensor with no values is ambiguous +""".format( + **common_args + ), +) + +add_docstr( + torch.kron, + r""" +kron(input, other, *, out=None) -> Tensor + +Computes the Kronecker product, denoted by :math:`\otimes`, of :attr:`input` and :attr:`other`. + +If :attr:`input` is a :math:`(a_0 \times a_1 \times \dots \times a_n)` tensor and :attr:`other` is a +:math:`(b_0 \times b_1 \times \dots \times b_n)` tensor, the result will be a +:math:`(a_0*b_0 \times a_1*b_1 \times \dots \times a_n*b_n)` tensor with the following entries: + +.. math:: + (\text{input} \otimes \text{other})_{k_0, k_1, \dots, k_n} = + \text{input}_{i_0, i_1, \dots, i_n} * \text{other}_{j_0, j_1, \dots, j_n}, + +where :math:`k_t = i_t * b_t + j_t` for :math:`0 \leq t \leq n`. +If one tensor has fewer dimensions than the other it is unsqueezed until it has the same number of dimensions. + +Supports real-valued and complex-valued inputs. + +.. note:: + This function generalizes the typical definition of the Kronecker product for two matrices to two tensors, + as described above. When :attr:`input` is a :math:`(m \times n)` matrix and :attr:`other` is a + :math:`(p \times q)` matrix, the result will be a :math:`(p*m \times q*n)` block matrix: + + .. math:: + \mathbf{A} \otimes \mathbf{B}=\begin{bmatrix} + a_{11} \mathbf{B} & \cdots & a_{1 n} \mathbf{B} \\ + \vdots & \ddots & \vdots \\ + a_{m 1} \mathbf{B} & \cdots & a_{m n} \mathbf{B} \end{bmatrix} + + where :attr:`input` is :math:`\mathbf{A}` and :attr:`other` is :math:`\mathbf{B}`. + +Arguments: + input (Tensor) + other (Tensor) + +Keyword args: + out (Tensor, optional): The output tensor. Ignored if ``None``. Default: ``None`` + +Examples:: + + >>> mat1 = torch.eye(2) + >>> mat2 = torch.ones(2, 2) + >>> torch.kron(mat1, mat2) + tensor([[1., 1., 0., 0.], + [1., 1., 0., 0.], + [0., 0., 1., 1.], + [0., 0., 1., 1.]]) + + >>> mat1 = torch.eye(2) + >>> mat2 = torch.arange(1, 5).reshape(2, 2) + >>> torch.kron(mat1, mat2) + tensor([[1., 2., 0., 0.], + [3., 4., 0., 0.], + [0., 0., 1., 2.], + [0., 0., 3., 4.]]) +""", +) + +add_docstr( + torch.kthvalue, + r""" +kthvalue(input, k, dim=None, keepdim=False, *, out=None) -> (Tensor, LongTensor) + +Returns a namedtuple ``(values, indices)`` where ``values`` is the :attr:`k` th +smallest element of each row of the :attr:`input` tensor in the given dimension +:attr:`dim`. And ``indices`` is the index location of each element found. + +If :attr:`dim` is not given, the last dimension of the `input` is chosen. 
+ +If :attr:`keepdim` is ``True``, both the :attr:`values` and :attr:`indices` tensors +are the same size as :attr:`input`, except in the dimension :attr:`dim` where +they are of size 1. Otherwise, :attr:`dim` is squeezed +(see :func:`torch.squeeze`), resulting in both the :attr:`values` and +:attr:`indices` tensors having 1 fewer dimension than the :attr:`input` tensor. + +.. note:: + When :attr:`input` is a CUDA tensor and there are multiple valid + :attr:`k` th values, this function may nondeterministically return + :attr:`indices` for any of them. + +Args: + {input} + k (int): k for the k-th smallest element + dim (int, optional): the dimension to find the kth value along + {keepdim} + +Keyword args: + out (tuple, optional): the output tuple of (Tensor, LongTensor) + can be optionally given to be used as output buffers + +Example:: + + >>> x = torch.arange(1., 6.) + >>> x + tensor([ 1., 2., 3., 4., 5.]) + >>> torch.kthvalue(x, 4) + torch.return_types.kthvalue(values=tensor(4.), indices=tensor(3)) + + >>> x=torch.arange(1.,7.).resize_(2,3) + >>> x + tensor([[ 1., 2., 3.], + [ 4., 5., 6.]]) + >>> torch.kthvalue(x, 2, 0, True) + torch.return_types.kthvalue(values=tensor([[4., 5., 6.]]), indices=tensor([[1, 1, 1]])) +""".format( + **single_dim_common + ), +) + +add_docstr( + torch.lcm, + r""" +lcm(input, other, *, out=None) -> Tensor + +Computes the element-wise least common multiple (LCM) of :attr:`input` and :attr:`other`. + +Both :attr:`input` and :attr:`other` must have integer types. + +.. note:: + This defines :math:`lcm(0, 0) = 0` and :math:`lcm(0, a) = 0`. + +Args: + {input} + other (Tensor): the second input tensor + +Keyword arguments: + {out} + +Example:: + + >>> a = torch.tensor([5, 10, 15]) + >>> b = torch.tensor([3, 4, 5]) + >>> torch.lcm(a, b) + tensor([15, 20, 15]) + >>> c = torch.tensor([3]) + >>> torch.lcm(a, c) + tensor([15, 30, 15]) +""".format( + **common_args + ), +) + +add_docstr( + torch.ldexp, + r""" +ldexp(input, other, *, out=None) -> Tensor + +Multiplies :attr:`input` by 2 ** :attr:`other`. + +.. math:: + \text{{out}}_i = \text{{input}}_i * 2^\text{{other}}_i +""" + + r""" + +Typically this function is used to construct floating point numbers by multiplying +mantissas in :attr:`input` with integral powers of two created from the exponents +in :attr:`other`. + +Args: + {input} + other (Tensor): a tensor of exponents, typically integers. + +Keyword args: + {out} + +Example:: + + >>> torch.ldexp(torch.tensor([1.]), torch.tensor([1])) + tensor([2.]) + >>> torch.ldexp(torch.tensor([1.0]), torch.tensor([1, 2, 3, 4])) + tensor([ 2., 4., 8., 16.]) + + +""".format( + **common_args + ), +) + +add_docstr( + torch.le, + r""" +le(input, other, *, out=None) -> Tensor + +Computes :math:`\text{input} \leq \text{other}` element-wise. +""" + + r""" + +The second argument can be a number or a tensor whose shape is +:ref:`broadcastable ` with the first argument. + +Args: + input (Tensor): the tensor to compare + other (Tensor or Scalar): the tensor or value to compare + +Keyword args: + {out} + +Returns: + A boolean tensor that is True where :attr:`input` is less than or equal to + :attr:`other` and False elsewhere + +Example:: + + >>> torch.le(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[True, False], [True, True]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.less_equal, + r""" +less_equal(input, other, *, out=None) -> Tensor + +Alias for :func:`torch.le`. 
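+
+A brief illustrative example (mirrors the :func:`torch.le` example)::
+
+    >>> torch.less_equal(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]]))
+    tensor([[True, False], [True, True]])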
+""", +) + +add_docstr( + torch.lerp, + r""" +lerp(input, end, weight, *, out=None) + +Does a linear interpolation of two tensors :attr:`start` (given by :attr:`input`) and :attr:`end` based +on a scalar or tensor :attr:`weight` and returns the resulting :attr:`out` tensor. + +.. math:: + \text{out}_i = \text{start}_i + \text{weight}_i \times (\text{end}_i - \text{start}_i) +""" + + r""" +The shapes of :attr:`start` and :attr:`end` must be +:ref:`broadcastable `. If :attr:`weight` is a tensor, then +the shapes of :attr:`weight`, :attr:`start`, and :attr:`end` must be :ref:`broadcastable `. + +Args: + input (Tensor): the tensor with the starting points + end (Tensor): the tensor with the ending points + weight (float or tensor): the weight for the interpolation formula + +Keyword args: + {out} + +Example:: + + >>> start = torch.arange(1., 5.) + >>> end = torch.empty(4).fill_(10) + >>> start + tensor([ 1., 2., 3., 4.]) + >>> end + tensor([ 10., 10., 10., 10.]) + >>> torch.lerp(start, end, 0.5) + tensor([ 5.5000, 6.0000, 6.5000, 7.0000]) + >>> torch.lerp(start, end, torch.full_like(start, 0.5)) + tensor([ 5.5000, 6.0000, 6.5000, 7.0000]) +""".format( + **common_args + ), +) + +add_docstr( + torch.lgamma, + r""" +lgamma(input, *, out=None) -> Tensor + +Computes the natural logarithm of the absolute value of the gamma function on :attr:`input`. + +.. math:: + \text{out}_{i} = \ln |\Gamma(\text{input}_{i})| +""" + + """ +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> a = torch.arange(0.5, 2, 0.5) + >>> torch.lgamma(a) + tensor([ 0.5724, 0.0000, -0.1208]) +""".format( + **common_args + ), +) + +add_docstr( + torch.linspace, + r""" +linspace(start, end, steps, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + +Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly +spaced from :attr:`start` to :attr:`end`, inclusive. That is, the value are: + +.. math:: + (\text{start}, + \text{start} + \frac{\text{end} - \text{start}}{\text{steps} - 1}, + \ldots, + \text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{\text{steps} - 1}, + \text{end}) +""" + + """ + +From PyTorch 1.11 linspace requires the steps argument. Use steps=100 to restore the previous behavior. + +Args: + start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional + end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional + steps (int): size of the constructed tensor + +Keyword arguments: + {out} + dtype (torch.dtype, optional): the data type to perform the computation in. + Default: if None, uses the global default dtype (see torch.get_default_dtype()) + when both :attr:`start` and :attr:`end` are real, + and corresponding complex dtype when either is complex. + {layout} + {device} + {requires_grad} + + +Example:: + + >>> torch.linspace(3, 10, steps=5) + tensor([ 3.0000, 4.7500, 6.5000, 8.2500, 10.0000]) + >>> torch.linspace(-10, 10, steps=5) + tensor([-10., -5., 0., 5., 10.]) + >>> torch.linspace(start=-10, end=10, steps=5) + tensor([-10., -5., 0., 5., 10.]) + >>> torch.linspace(start=-10, end=10, steps=1) + tensor([-10.]) +""".format( + **factory_common_args + ), +) + +add_docstr( + torch.log, + r""" +log(input, *, out=None) -> Tensor + +Returns a new tensor with the natural logarithm of the elements +of :attr:`input`. + +.. 
math:: + y_{i} = \log_{e} (x_{i}) +""" + + r""" + +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> a = torch.rand(5) * 5 + >>> a + tensor([4.7767, 4.3234, 1.2156, 0.2411, 4.5739]) + >>> torch.log(a) + tensor([ 1.5637, 1.4640, 0.1952, -1.4226, 1.5204]) +""".format( + **common_args + ), +) + +add_docstr( + torch.log10, + r""" +log10(input, *, out=None) -> Tensor + +Returns a new tensor with the logarithm to the base 10 of the elements +of :attr:`input`. + +.. math:: + y_{i} = \log_{10} (x_{i}) +""" + + r""" + +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> a = torch.rand(5) + >>> a + tensor([ 0.5224, 0.9354, 0.7257, 0.1301, 0.2251]) + + + >>> torch.log10(a) + tensor([-0.2820, -0.0290, -0.1392, -0.8857, -0.6476]) + +""".format( + **common_args + ), +) + +add_docstr( + torch.log1p, + r""" +log1p(input, *, out=None) -> Tensor + +Returns a new tensor with the natural logarithm of (1 + :attr:`input`). + +.. math:: + y_i = \log_{e} (x_i + 1) +""" + + r""" +.. note:: This function is more accurate than :func:`torch.log` for small + values of :attr:`input` + +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> a = torch.randn(5) + >>> a + tensor([-1.0090, -0.9923, 1.0249, -0.5372, 0.2492]) + >>> torch.log1p(a) + tensor([ nan, -4.8653, 0.7055, -0.7705, 0.2225]) +""".format( + **common_args + ), +) + +add_docstr( + torch.log2, + r""" +log2(input, *, out=None) -> Tensor + +Returns a new tensor with the logarithm to the base 2 of the elements +of :attr:`input`. + +.. math:: + y_{i} = \log_{2} (x_{i}) +""" + + r""" + +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> a = torch.rand(5) + >>> a + tensor([ 0.8419, 0.8003, 0.9971, 0.5287, 0.0490]) + + + >>> torch.log2(a) + tensor([-0.2483, -0.3213, -0.0042, -0.9196, -4.3504]) + +""".format( + **common_args + ), +) + +add_docstr( + torch.logaddexp, + r""" +logaddexp(input, other, *, out=None) -> Tensor + +Logarithm of the sum of exponentiations of the inputs. + +Calculates pointwise :math:`\log\left(e^x + e^y\right)`. This function is useful +in statistics where the calculated probabilities of events may be so small as to +exceed the range of normal floating point numbers. In such cases the logarithm +of the calculated probability is stored. This function allows adding +probabilities stored in such a fashion. + +This op should be disambiguated with :func:`torch.logsumexp` which performs a +reduction on a single tensor. + +Args: + {input} + other (Tensor): the second input tensor + +Keyword arguments: + {out} + +Example:: + + >>> torch.logaddexp(torch.tensor([-1.0]), torch.tensor([-1.0, -2, -3])) + tensor([-0.3069, -0.6867, -0.8731]) + >>> torch.logaddexp(torch.tensor([-100.0, -200, -300]), torch.tensor([-1.0, -2, -3])) + tensor([-1., -2., -3.]) + >>> torch.logaddexp(torch.tensor([1.0, 2000, 30000]), torch.tensor([-1.0, -2, -3])) + tensor([1.1269e+00, 2.0000e+03, 3.0000e+04]) +""".format( + **common_args + ), +) + +add_docstr( + torch.logaddexp2, + r""" +logaddexp2(input, other, *, out=None) -> Tensor + +Logarithm of the sum of exponentiations of the inputs in base-2. + +Calculates pointwise :math:`\log_2\left(2^x + 2^y\right)`. See +:func:`torch.logaddexp` for more details. + +Args: + {input} + other (Tensor): the second input tensor + +Keyword arguments: + {out} +""".format( + **common_args + ), +) + +add_docstr( + torch.xlogy, + r""" +xlogy(input, other, *, out=None) -> Tensor + +Alias for :func:`torch.special.xlogy`. 
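+
+A brief illustrative example (computes ``input * log(other)``, returning 0 where ``input`` is 0)::
+
+    >>> x = torch.tensor([0., 1., 2.])
+    >>> y = torch.tensor([0., 2., 3.])
+    >>> torch.xlogy(x, y)
+    tensor([0.0000, 0.6931, 2.1972])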
+""", +) + +add_docstr( + torch.logical_and, + r""" +logical_and(input, other, *, out=None) -> Tensor + +Computes the element-wise logical AND of the given input tensors. Zeros are treated as ``False`` and nonzeros are +treated as ``True``. + +Args: + {input} + other (Tensor): the tensor to compute AND with + +Keyword args: + {out} + +Example:: + + >>> torch.logical_and(torch.tensor([True, False, True]), torch.tensor([True, False, False])) + tensor([ True, False, False]) + >>> a = torch.tensor([0, 1, 10, 0], dtype=torch.int8) + >>> b = torch.tensor([4, 0, 1, 0], dtype=torch.int8) + >>> torch.logical_and(a, b) + tensor([False, False, True, False]) + >>> torch.logical_and(a.double(), b.double()) + tensor([False, False, True, False]) + >>> torch.logical_and(a.double(), b) + tensor([False, False, True, False]) + >>> torch.logical_and(a, b, out=torch.empty(4, dtype=torch.bool)) + tensor([False, False, True, False]) +""".format( + **common_args + ), +) + +add_docstr( + torch.logical_not, + r""" +logical_not(input, *, out=None) -> Tensor + +Computes the element-wise logical NOT of the given input tensor. If not specified, the output tensor will have the bool +dtype. If the input tensor is not a bool tensor, zeros are treated as ``False`` and non-zeros are treated as ``True``. + +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> torch.logical_not(torch.tensor([True, False])) + tensor([False, True]) + >>> torch.logical_not(torch.tensor([0, 1, -10], dtype=torch.int8)) + tensor([ True, False, False]) + >>> torch.logical_not(torch.tensor([0., 1.5, -10.], dtype=torch.double)) + tensor([ True, False, False]) + >>> torch.logical_not(torch.tensor([0., 1., -10.], dtype=torch.double), out=torch.empty(3, dtype=torch.int16)) + tensor([1, 0, 0], dtype=torch.int16) +""".format( + **common_args + ), +) + +add_docstr( + torch.logical_or, + r""" +logical_or(input, other, *, out=None) -> Tensor + +Computes the element-wise logical OR of the given input tensors. Zeros are treated as ``False`` and nonzeros are +treated as ``True``. + +Args: + {input} + other (Tensor): the tensor to compute OR with + +Keyword args: + {out} + +Example:: + + >>> torch.logical_or(torch.tensor([True, False, True]), torch.tensor([True, False, False])) + tensor([ True, False, True]) + >>> a = torch.tensor([0, 1, 10, 0], dtype=torch.int8) + >>> b = torch.tensor([4, 0, 1, 0], dtype=torch.int8) + >>> torch.logical_or(a, b) + tensor([ True, True, True, False]) + >>> torch.logical_or(a.double(), b.double()) + tensor([ True, True, True, False]) + >>> torch.logical_or(a.double(), b) + tensor([ True, True, True, False]) + >>> torch.logical_or(a, b, out=torch.empty(4, dtype=torch.bool)) + tensor([ True, True, True, False]) +""".format( + **common_args + ), +) + +add_docstr( + torch.logical_xor, + r""" +logical_xor(input, other, *, out=None) -> Tensor + +Computes the element-wise logical XOR of the given input tensors. Zeros are treated as ``False`` and nonzeros are +treated as ``True``. 
+ +Args: + {input} + other (Tensor): the tensor to compute XOR with + +Keyword args: + {out} + +Example:: + + >>> torch.logical_xor(torch.tensor([True, False, True]), torch.tensor([True, False, False])) + tensor([False, False, True]) + >>> a = torch.tensor([0, 1, 10, 0], dtype=torch.int8) + >>> b = torch.tensor([4, 0, 1, 0], dtype=torch.int8) + >>> torch.logical_xor(a, b) + tensor([ True, True, False, False]) + >>> torch.logical_xor(a.double(), b.double()) + tensor([ True, True, False, False]) + >>> torch.logical_xor(a.double(), b) + tensor([ True, True, False, False]) + >>> torch.logical_xor(a, b, out=torch.empty(4, dtype=torch.bool)) + tensor([ True, True, False, False]) +""".format( + **common_args + ), +) + +add_docstr( + torch.logspace, + """ +logspace(start, end, steps, base=10.0, *, \ + out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor +""" + + r""" + +Creates a one-dimensional tensor of size :attr:`steps` whose values are evenly +spaced from :math:`{{\text{{base}}}}^{{\text{{start}}}}` to +:math:`{{\text{{base}}}}^{{\text{{end}}}}`, inclusive, on a logarithmic scale +with base :attr:`base`. That is, the values are: + +.. math:: + (\text{base}^{\text{start}}, + \text{base}^{(\text{start} + \frac{\text{end} - \text{start}}{ \text{steps} - 1})}, + \ldots, + \text{base}^{(\text{start} + (\text{steps} - 2) * \frac{\text{end} - \text{start}}{ \text{steps} - 1})}, + \text{base}^{\text{end}}) +""" + + """ + + +From PyTorch 1.11 logspace requires the steps argument. Use steps=100 to restore the previous behavior. + +Args: + start (float or Tensor): the starting value for the set of points. If `Tensor`, it must be 0-dimensional + end (float or Tensor): the ending value for the set of points. If `Tensor`, it must be 0-dimensional + steps (int): size of the constructed tensor + base (float, optional): base of the logarithm function. Default: ``10.0``. + +Keyword arguments: + {out} + dtype (torch.dtype, optional): the data type to perform the computation in. + Default: if None, uses the global default dtype (see torch.get_default_dtype()) + when both :attr:`start` and :attr:`end` are real, + and corresponding complex dtype when either is complex. + {layout} + {device} + {requires_grad} + +Example:: + + >>> torch.logspace(start=-10, end=10, steps=5) + tensor([ 1.0000e-10, 1.0000e-05, 1.0000e+00, 1.0000e+05, 1.0000e+10]) + >>> torch.logspace(start=0.1, end=1.0, steps=5) + tensor([ 1.2589, 2.1135, 3.5481, 5.9566, 10.0000]) + >>> torch.logspace(start=0.1, end=1.0, steps=1) + tensor([1.2589]) + >>> torch.logspace(start=2, end=2, steps=1, base=2) + tensor([4.0]) +""".format( + **factory_common_args + ), +) + +add_docstr( + torch.logsumexp, + r""" +logsumexp(input, dim, keepdim=False, *, out=None) + +Returns the log of summed exponentials of each row of the :attr:`input` +tensor in the given dimension :attr:`dim`. The computation is numerically +stabilized. + +For summation index :math:`j` given by `dim` and other indices :math:`i`, the result is + + .. 
math:: + \text{{logsumexp}}(x)_{{i}} = \log \sum_j \exp(x_{{ij}}) + +{keepdim_details} + +Args: + {input} + {opt_dim} + {keepdim} + +Keyword args: + {out} + +Example:: + + >>> a = torch.randn(3, 3) + >>> torch.logsumexp(a, 1) + tensor([1.4907, 1.0593, 1.5696]) + >>> torch.dist(torch.logsumexp(a, 1), torch.log(torch.sum(torch.exp(a), 1))) + tensor(1.6859e-07) +""".format( + **multi_dim_common + ), +) + +add_docstr( + torch.lt, + r""" +lt(input, other, *, out=None) -> Tensor + +Computes :math:`\text{input} < \text{other}` element-wise. +""" + + r""" + +The second argument can be a number or a tensor whose shape is +:ref:`broadcastable ` with the first argument. + +Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + +Keyword args: + {out} + +Returns: + A boolean tensor that is True where :attr:`input` is less than :attr:`other` and False elsewhere + +Example:: + + >>> torch.lt(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[False, False], [True, False]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.lu_unpack, + r""" +lu_unpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True, *, out=None) -> (Tensor, Tensor, Tensor) + +Unpacks the LU decomposition returned by :func:`~linalg.lu_factor` into the `P, L, U` matrices. + +.. seealso:: + + :func:`~linalg.lu` returns the matrices from the LU decomposition. Its gradient formula is more efficient + than that of doing :func:`~linalg.lu_factor` followed by :func:`~linalg.lu_unpack`. + +Args: + LU_data (Tensor): the packed LU factorization data + LU_pivots (Tensor): the packed LU factorization pivots + unpack_data (bool): flag indicating if the data should be unpacked. + If ``False``, then the returned ``L`` and ``U`` are empty tensors. + Default: ``True`` + unpack_pivots (bool): flag indicating if the pivots should be unpacked into a permutation matrix ``P``. + If ``False``, then the returned ``P`` is an empty tensor. + Default: ``True`` + +Keyword args: + out (tuple, optional): output tuple of three tensors. Ignored if `None`. + +Returns: + A namedtuple ``(P, L, U)`` + +Examples:: + + >>> A = torch.randn(2, 3, 3) + >>> LU, pivots = torch.linalg.lu_factor(A) + >>> P, L, U = torch.lu_unpack(LU, pivots) + >>> # We can recover A from the factorization + >>> A_ = P @ L @ U + >>> torch.allclose(A, A_) + True + + >>> # LU factorization of a rectangular matrix: + >>> A = torch.randn(2, 3, 2) + >>> LU, pivots = torch.linalg.lu_factor(A) + >>> P, L, U = torch.lu_unpack(LU, pivots) + >>> # P, L, U are the same as returned by linalg.lu + >>> P_, L_, U_ = torch.linalg.lu(A) + >>> torch.allclose(P, P_) and torch.allclose(L, L_) and torch.allclose(U, U_) + True + +""".format( + **common_args + ), +) + +add_docstr( + torch.less, + r""" +less(input, other, *, out=None) -> Tensor + +Alias for :func:`torch.lt`. +""", +) + +add_docstr( + torch.lu_solve, + r""" +lu_solve(b, LU_data, LU_pivots, *, out=None) -> Tensor + +Returns the LU solve of the linear system :math:`Ax = b` using the partially pivoted +LU factorization of A from :func:`~linalg.lu_factor`. + +This function supports ``float``, ``double``, ``cfloat`` and ``cdouble`` dtypes for :attr:`input`. + +.. warning:: + + :func:`torch.lu_solve` is deprecated in favor of :func:`torch.linalg.lu_solve`. + :func:`torch.lu_solve` will be removed in a future PyTorch release. + ``X = torch.lu_solve(B, LU, pivots)`` should be replaced with + + .. 
code:: python + + X = linalg.lu_solve(LU, pivots, B) + +Arguments: + b (Tensor): the RHS tensor of size :math:`(*, m, k)`, where :math:`*` + is zero or more batch dimensions. + LU_data (Tensor): the pivoted LU factorization of A from :meth:`~linalg.lu_factor` of size :math:`(*, m, m)`, + where :math:`*` is zero or more batch dimensions. + LU_pivots (IntTensor): the pivots of the LU factorization from :meth:`~linalg.lu_factor` of size :math:`(*, m)`, + where :math:`*` is zero or more batch dimensions. + The batch dimensions of :attr:`LU_pivots` must be equal to the batch dimensions of + :attr:`LU_data`. + +Keyword args: + {out} + +Example:: + + >>> A = torch.randn(2, 3, 3) + >>> b = torch.randn(2, 3, 1) + >>> LU, pivots = torch.linalg.lu_factor(A) + >>> x = torch.lu_solve(b, LU, pivots) + >>> torch.dist(A @ x, b) + tensor(1.00000e-07 * + 2.8312) +""".format( + **common_args + ), +) + +add_docstr( + torch.masked_select, + r""" +masked_select(input, mask, *, out=None) -> Tensor + +Returns a new 1-D tensor which indexes the :attr:`input` tensor according to +the boolean mask :attr:`mask` which is a `BoolTensor`. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor don't need +to match, but they must be :ref:`broadcastable `. + +.. note:: The returned tensor does **not** use the same storage + as the original tensor + +Args: + {input} + mask (BoolTensor): the tensor containing the binary mask to index with + +Keyword args: + {out} + +Example:: + + >>> x = torch.randn(3, 4) + >>> x + tensor([[ 0.3552, -2.3825, -0.8297, 0.3477], + [-1.2035, 1.2252, 0.5002, 0.6248], + [ 0.1307, -2.0608, 0.1244, 2.0139]]) + >>> mask = x.ge(0.5) + >>> mask + tensor([[False, False, False, False], + [False, True, True, True], + [False, False, False, True]]) + >>> torch.masked_select(x, mask) + tensor([ 1.2252, 0.5002, 0.6248, 2.0139]) +""".format( + **common_args + ), +) + +add_docstr( + torch.matrix_power, + r""" +matrix_power(input, n, *, out=None) -> Tensor + +Alias for :func:`torch.linalg.matrix_power` +""", +) + +add_docstr( + torch.matrix_exp, + r""" +matrix_exp(A) -> Tensor + +Alias for :func:`torch.linalg.matrix_exp`. +""", +) + +add_docstr( + torch.max, + r""" +max(input) -> Tensor + +Returns the maximum value of all elements in the ``input`` tensor. + +.. warning:: + This function produces deterministic (sub)gradients unlike ``max(dim=0)`` + +Args: + {input} + +Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.6763, 0.7445, -2.2369]]) + >>> torch.max(a) + tensor(0.7445) + +.. function:: max(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + +Returns a namedtuple ``(values, indices)`` where ``values`` is the maximum +value of each row of the :attr:`input` tensor in the given dimension +:attr:`dim`. And ``indices`` is the index location of each maximum value found +(argmax). + +If ``keepdim`` is ``True``, the output tensors are of the same size +as ``input`` except in the dimension ``dim`` where they are of size 1. +Otherwise, ``dim`` is squeezed (see :func:`torch.squeeze`), resulting +in the output tensors having 1 fewer dimension than ``input``. + +.. note:: If there are multiple maximal values in a reduced row then + the indices of the first maximal value are returned. + +Args: + {input} + {dim} + {keepdim} Default: ``False``. 
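# Sketch (illustrative, not part of the original docstring): the two outputs of
# max(dim) agree with torch.argmax and with gathering values at those indices.
import torch

a = torch.randn(4, 4)
values, indices = torch.max(a, dim=1)
assert torch.equal(indices, torch.argmax(a, dim=1))
assert torch.equal(values, a.gather(1, indices.unsqueeze(1)).squeeze(1))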
+ +Keyword args: + out (tuple, optional): the result tuple of two output tensors (max, max_indices) + +Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-1.2360, -0.2942, -0.1222, 0.8475], + [ 1.1949, -1.1127, -2.2379, -0.6702], + [ 1.5717, -0.9207, 0.1297, -1.8768], + [-0.6172, 1.0036, -0.6060, -0.2432]]) + >>> torch.max(a, 1) + torch.return_types.max(values=tensor([0.8475, 1.1949, 1.5717, 1.0036]), indices=tensor([3, 0, 0, 1])) + +.. function:: max(input, other, *, out=None) -> Tensor + :noindex: + +See :func:`torch.maximum`. + +""".format( + **single_dim_common + ), +) + +add_docstr( + torch.maximum, + r""" +maximum(input, other, *, out=None) -> Tensor + +Computes the element-wise maximum of :attr:`input` and :attr:`other`. + +.. note:: + If one of the elements being compared is a NaN, then that element is returned. + :func:`maximum` is not supported for tensors with complex dtypes. + +Args: + {input} + other (Tensor): the second input tensor + +Keyword args: + {out} + +Example:: + + >>> a = torch.tensor((1, 2, -1)) + >>> b = torch.tensor((3, 0, 4)) + >>> torch.maximum(a, b) + tensor([3, 2, 4]) +""".format( + **common_args + ), +) + +add_docstr( + torch.fmax, + r""" +fmax(input, other, *, out=None) -> Tensor + +Computes the element-wise maximum of :attr:`input` and :attr:`other`. + +This is like :func:`torch.maximum` except it handles NaNs differently: +if exactly one of the two elements being compared is a NaN then the non-NaN element is taken as the maximum. +Only if both elements are NaN is NaN propagated. + +This function is a wrapper around C++'s ``std::fmax`` and is similar to NumPy's ``fmax`` function. + +Supports :ref:`broadcasting to a common shape `, +:ref:`type promotion `, and integer and floating-point inputs. + +Args: + {input} + other (Tensor): the second input tensor + +Keyword args: + {out} + +Example:: + + >>> a = torch.tensor([9.7, float('nan'), 3.1, float('nan')]) + >>> b = torch.tensor([-2.2, 0.5, float('nan'), float('nan')]) + >>> torch.fmax(a, b) + tensor([9.7000, 0.5000, 3.1000, nan]) +""".format( + **common_args + ), +) + +add_docstr( + torch.amax, + r""" +amax(input, dim, keepdim=False, *, out=None) -> Tensor + +Returns the maximum value of each slice of the :attr:`input` tensor in the given +dimension(s) :attr:`dim`. + +.. note:: + The difference between ``max``/``min`` and ``amax``/``amin`` is: + - ``amax``/``amin`` supports reducing on multiple dimensions, + - ``amax``/``amin`` does not return indices, + - ``amax``/``amin`` evenly distributes gradient between equal values, + while ``max(dim)``/``min(dim)`` propagates gradient only to a single + index in the source tensor. + +{keepdim_details} + +Args: + {input} + {dim} + {keepdim} + +Keyword args: + {out} + +Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.8177, 1.4878, -0.2491, 0.9130], + [-0.7158, 1.1775, 2.0992, 0.4817], + [-0.0053, 0.0164, -1.3738, -0.0507], + [ 1.9700, 1.1106, -1.0318, -1.0816]]) + >>> torch.amax(a, 1) + tensor([1.4878, 2.0992, 0.0164, 1.9700]) +""".format( + **multi_dim_common + ), +) + +add_docstr( + torch.argmax, + r""" +argmax(input) -> LongTensor + +Returns the indices of the maximum value of all elements in the :attr:`input` tensor. + +This is the second value returned by :meth:`torch.max`. See its +documentation for the exact semantics of this method. + +.. note:: If there are multiple maximal values then the indices of the first maximal value are returned. 
+ +Args: + {input} + +Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 1.3398, 0.2663, -0.2686, 0.2450], + [-0.7401, -0.8805, -0.3402, -1.1936], + [ 0.4907, -1.3948, -1.0691, -0.3132], + [-1.6092, 0.5419, -0.2993, 0.3195]]) + >>> torch.argmax(a) + tensor(0) + +.. function:: argmax(input, dim, keepdim=False) -> LongTensor + :noindex: + +Returns the indices of the maximum values of a tensor across a dimension. + +This is the second value returned by :meth:`torch.max`. See its +documentation for the exact semantics of this method. + +Args: + {input} + {dim} If ``None``, the argmax of the flattened input is returned. + {keepdim} + +Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 1.3398, 0.2663, -0.2686, 0.2450], + [-0.7401, -0.8805, -0.3402, -1.1936], + [ 0.4907, -1.3948, -1.0691, -0.3132], + [-1.6092, 0.5419, -0.2993, 0.3195]]) + >>> torch.argmax(a, dim=1) + tensor([ 0, 2, 0, 1]) +""".format( + **single_dim_common + ), +) + +add_docstr( + torch.argwhere, + r""" +argwhere(input) -> Tensor + +Returns a tensor containing the indices of all non-zero elements of +:attr:`input`. Each row in the result contains the indices of a non-zero +element in :attr:`input`. The result is sorted lexicographically, with +the last index changing the fastest (C-style). + +If :attr:`input` has :math:`n` dimensions, then the resulting indices tensor +:attr:`out` is of size :math:`(z \times n)`, where :math:`z` is the total number of +non-zero elements in the :attr:`input` tensor. + +.. note:: + This function is similar to NumPy's `argwhere`. + + When :attr:`input` is on CUDA, this function causes host-device synchronization. + +Args: + {input} + +Example:: + + >>> t = torch.tensor([1, 0, 1]) + >>> torch.argwhere(t) + tensor([[0], + [2]]) + >>> t = torch.tensor([[1, 0, 1], [0, 1, 1]]) + >>> torch.argwhere(t) + tensor([[0, 0], + [0, 2], + [1, 1], + [1, 2]]) +""", +) + +add_docstr( + torch.mean, + r""" +mean(input, *, dtype=None) -> Tensor + +Returns the mean value of all elements in the :attr:`input` tensor. Input must be floating point or complex. + +Args: + input (Tensor): + the input tensor, either of floating point or complex dtype + +Keyword args: + {dtype} + +Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.2294, -0.5481, 1.3288]]) + >>> torch.mean(a) + tensor(0.3367) + +.. function:: mean(input, dim, keepdim=False, *, dtype=None, out=None) -> Tensor + :noindex: + +Returns the mean value of each row of the :attr:`input` tensor in the given +dimension :attr:`dim`. If :attr:`dim` is a list of dimensions, +reduce over all of them. + +{keepdim_details} + +Args: + {input} + {dim} + {keepdim} + +Keyword args: + {dtype} + {out} + +.. seealso:: + + :func:`torch.nanmean` computes the mean value of `non-NaN` elements. + +Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-0.3841, 0.6320, 0.4254, -0.7384], + [-0.9644, 1.0131, -0.6549, -1.4279], + [-0.2951, -1.3350, -0.7694, 0.5600], + [ 1.0842, -0.9580, 0.3623, 0.2343]]) + >>> torch.mean(a, 1) + tensor([-0.0163, -0.5085, -0.4599, 0.1807]) + >>> torch.mean(a, 1, True) + tensor([[-0.0163], + [-0.5085], + [-0.4599], + [ 0.1807]]) +""".format( + **multi_dim_common + ), +) + +add_docstr( + torch.nanmean, + r""" +nanmean(input, dim=None, keepdim=False, *, dtype=None, out=None) -> Tensor + +Computes the mean of all `non-NaN` elements along the specified dimensions. + +This function is identical to :func:`torch.mean` when there are no `NaN` values +in the :attr:`input` tensor. 
In the presence of `NaN`, :func:`torch.mean` will +propagate the `NaN` to the output whereas :func:`torch.nanmean` will ignore the +`NaN` values (`torch.nanmean(a)` is equivalent to `torch.mean(a[~a.isnan()])`). + +{keepdim_details} + +Args: + {input} + {opt_dim} + {keepdim} + +Keyword args: + {dtype} + {out} + +.. seealso:: + + :func:`torch.mean` computes the mean value, propagating `NaN`. + +Example:: + + >>> x = torch.tensor([[torch.nan, 1, 2], [1, 2, 3]]) + >>> x.mean() + tensor(nan) + >>> x.nanmean() + tensor(1.8000) + >>> x.mean(dim=0) + tensor([ nan, 1.5000, 2.5000]) + >>> x.nanmean(dim=0) + tensor([1.0000, 1.5000, 2.5000]) + + # If all elements in the reduced dimensions are NaN then the result is NaN + >>> torch.tensor([torch.nan]).nanmean() + tensor(nan) +""".format( + **multi_dim_common + ), +) + +add_docstr( + torch.median, + r""" +median(input) -> Tensor + +Returns the median of the values in :attr:`input`. + +.. note:: + The median is not unique for :attr:`input` tensors with an even number + of elements. In this case the lower of the two medians is returned. To + compute the mean of both medians, use :func:`torch.quantile` with ``q=0.5`` instead. + +.. warning:: + This function produces deterministic (sub)gradients unlike ``median(dim=0)`` + +Args: + {input} + +Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 1.5219, -1.5212, 0.2202]]) + >>> torch.median(a) + tensor(0.2202) + +.. function:: median(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + +Returns a namedtuple ``(values, indices)`` where ``values`` contains the median of each row of :attr:`input` +in the dimension :attr:`dim`, and ``indices`` contains the index of the median values found in the dimension :attr:`dim`. + +By default, :attr:`dim` is the last dimension of the :attr:`input` tensor. + +If :attr:`keepdim` is ``True``, the output tensors are of the same size +as :attr:`input` except in the dimension :attr:`dim` where they are of size 1. +Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in +the outputs tensor having 1 fewer dimension than :attr:`input`. + +.. note:: + The median is not unique for :attr:`input` tensors with an even number + of elements in the dimension :attr:`dim`. In this case the lower of the + two medians is returned. To compute the mean of both medians in + :attr:`input`, use :func:`torch.quantile` with ``q=0.5`` instead. + +.. warning:: + ``indices`` does not necessarily contain the first occurrence of each + median value found, unless it is unique. + The exact implementation details are device-specific. + Do not expect the same result when run on CPU and GPU in general. + For the same reason do not expect the gradients to be deterministic. + +Args: + {input} + {dim} + {keepdim} + +Keyword args: + out ((Tensor, Tensor), optional): The first tensor will be populated with the median values and the second + tensor, which must have dtype long, with their indices in the dimension + :attr:`dim` of :attr:`input`. 
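# Illustrative sketch of the note above (not from the original file): for an even
# number of elements the lower median is returned; quantile(0.5) averages instead.
import torch
t = torch.tensor([1., 2., 3., 4.])
torch.median(t)          # tensor(2.)     -- lower of the two middle values
torch.quantile(t, 0.5)   # tensor(2.5000) -- mean of the two middle values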
+ +Example:: + + >>> a = torch.randn(4, 5) + >>> a + tensor([[ 0.2505, -0.3982, -0.9948, 0.3518, -1.3131], + [ 0.3180, -0.6993, 1.0436, 0.0438, 0.2270], + [-0.2751, 0.7303, 0.2192, 0.3321, 0.2488], + [ 1.0778, -1.9510, 0.7048, 0.4742, -0.7125]]) + >>> torch.median(a, 1) + torch.return_types.median(values=tensor([-0.3982, 0.2270, 0.2488, 0.4742]), indices=tensor([1, 4, 4, 3])) +""".format( + **single_dim_common + ), +) + +add_docstr( + torch.nanmedian, + r""" +nanmedian(input) -> Tensor + +Returns the median of the values in :attr:`input`, ignoring ``NaN`` values. + +This function is identical to :func:`torch.median` when there are no ``NaN`` values in :attr:`input`. +When :attr:`input` has one or more ``NaN`` values, :func:`torch.median` will always return ``NaN``, +while this function will return the median of the non-``NaN`` elements in :attr:`input`. +If all the elements in :attr:`input` are ``NaN`` it will also return ``NaN``. + +Args: + {input} + +Example:: + + >>> a = torch.tensor([1, float('nan'), 3, 2]) + >>> a.median() + tensor(nan) + >>> a.nanmedian() + tensor(2.) + +.. function:: nanmedian(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + +Returns a namedtuple ``(values, indices)`` where ``values`` contains the median of each row of :attr:`input` +in the dimension :attr:`dim`, ignoring ``NaN`` values, and ``indices`` contains the index of the median values +found in the dimension :attr:`dim`. + +This function is identical to :func:`torch.median` when there are no ``NaN`` values in a reduced row. When a reduced row has +one or more ``NaN`` values, :func:`torch.median` will always reduce it to ``NaN``, while this function will reduce it to the +median of the non-``NaN`` elements. If all the elements in a reduced row are ``NaN`` then it will be reduced to ``NaN``, too. + +Args: + {input} + {dim} + {keepdim} + +Keyword args: + out ((Tensor, Tensor), optional): The first tensor will be populated with the median values and the second + tensor, which must have dtype long, with their indices in the dimension + :attr:`dim` of :attr:`input`. + +Example:: + + >>> a = torch.tensor([[2, 3, 1], [float('nan'), 1, float('nan')]]) + >>> a + tensor([[2., 3., 1.], + [nan, 1., nan]]) + >>> a.median(0) + torch.return_types.median(values=tensor([nan, 1., nan]), indices=tensor([1, 1, 1])) + >>> a.nanmedian(0) + torch.return_types.nanmedian(values=tensor([2., 1., 1.]), indices=tensor([0, 1, 0])) +""".format( + **single_dim_common + ), +) + +add_docstr( + torch.quantile, + r""" +quantile(input, q, dim=None, keepdim=False, *, interpolation='linear', out=None) -> Tensor + +Computes the q-th quantiles of each row of the :attr:`input` tensor along the dimension :attr:`dim`. + +To compute the quantile, we map q in [0, 1] to the range of indices [0, n] to find the location +of the quantile in the sorted input. If the quantile lies between two data points ``a < b`` with +indices ``i`` and ``j`` in the sorted order, result is computed according to the given +:attr:`interpolation` method as follows: + +- ``linear``: ``a + (b - a) * fraction``, where ``fraction`` is the fractional part of the computed quantile index. +- ``lower``: ``a``. +- ``higher``: ``b``. +- ``nearest``: ``a`` or ``b``, whichever's index is closer to the computed quantile index (rounding down for .5 fractions). +- ``midpoint``: ``(a + b) / 2``. 
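# Worked sketch of the index mapping and 'linear' interpolation described above
# (not part of the original docstring); assumes a sorted 1-D input so the
# arithmetic is easy to follow.
import torch

x = torch.tensor([0., 1., 2., 3.])
q = 0.6
pos = q * (x.numel() - 1)      # 1.8 -> lies between indices i=1 and j=2
i = int(pos)
fraction = pos - i             # 0.8
linear = x[i] + (x[i + 1] - x[i]) * fraction
assert torch.isclose(linear, torch.quantile(x, q, interpolation='linear'))  # both 1.8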
+ +If :attr:`q` is a 1D tensor, the first dimension of the output represents the quantiles and has size +equal to the size of :attr:`q`, the remaining dimensions are what remains from the reduction. + +.. note:: + By default :attr:`dim` is ``None`` resulting in the :attr:`input` tensor being flattened before computation. + +Args: + {input} + q (float or Tensor): a scalar or 1D tensor of values in the range [0, 1]. + {dim} + {keepdim} + +Keyword arguments: + interpolation (str): interpolation method to use when the desired quantile lies between two data points. + Can be ``linear``, ``lower``, ``higher``, ``midpoint`` and ``nearest``. + Default is ``linear``. + {out} + +Example:: + + >>> a = torch.randn(2, 3) + >>> a + tensor([[ 0.0795, -1.2117, 0.9765], + [ 1.1707, 0.6706, 0.4884]]) + >>> q = torch.tensor([0.25, 0.5, 0.75]) + >>> torch.quantile(a, q, dim=1, keepdim=True) + tensor([[[-0.5661], + [ 0.5795]], + + [[ 0.0795], + [ 0.6706]], + + [[ 0.5280], + [ 0.9206]]]) + >>> torch.quantile(a, q, dim=1, keepdim=True).shape + torch.Size([3, 2, 1]) + >>> a = torch.arange(4.) + >>> a + tensor([0., 1., 2., 3.]) + >>> torch.quantile(a, 0.6, interpolation='linear') + tensor(1.8000) + >>> torch.quantile(a, 0.6, interpolation='lower') + tensor(1.) + >>> torch.quantile(a, 0.6, interpolation='higher') + tensor(2.) + >>> torch.quantile(a, 0.6, interpolation='midpoint') + tensor(1.5000) + >>> torch.quantile(a, 0.6, interpolation='nearest') + tensor(2.) + >>> torch.quantile(a, 0.4, interpolation='nearest') + tensor(1.) +""".format( + **single_dim_common + ), +) + +add_docstr( + torch.nanquantile, + r""" +nanquantile(input, q, dim=None, keepdim=False, *, interpolation='linear', out=None) -> Tensor + +This is a variant of :func:`torch.quantile` that "ignores" ``NaN`` values, +computing the quantiles :attr:`q` as if ``NaN`` values in :attr:`input` did +not exist. If all values in a reduced row are ``NaN`` then the quantiles for +that reduction will be ``NaN``. See the documentation for :func:`torch.quantile`. + +Args: + {input} + q (float or Tensor): a scalar or 1D tensor of quantile values in the range [0, 1] + {dim} + {keepdim} + +Keyword arguments: + interpolation (str): interpolation method to use when the desired quantile lies between two data points. + Can be ``linear``, ``lower``, ``higher``, ``midpoint`` and ``nearest``. + Default is ``linear``. + {out} + +Example:: + + >>> t = torch.tensor([float('nan'), 1, 2]) + >>> t.quantile(0.5) + tensor(nan) + >>> t.nanquantile(0.5) + tensor(1.5000) + >>> t = torch.tensor([[float('nan'), float('nan')], [1, 2]]) + >>> t + tensor([[nan, nan], + [1., 2.]]) + >>> t.nanquantile(0.5, dim=0) + tensor([1., 2.]) + >>> t.nanquantile(0.5, dim=1) + tensor([ nan, 1.5000]) +""".format( + **single_dim_common + ), +) + +add_docstr( + torch.min, + r""" +min(input) -> Tensor + +Returns the minimum value of all elements in the :attr:`input` tensor. + +.. warning:: + This function produces deterministic (sub)gradients unlike ``min(dim=0)`` + +Args: + {input} + +Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.6750, 1.0857, 1.7197]]) + >>> torch.min(a) + tensor(0.6750) + +.. function:: min(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor) + :noindex: + +Returns a namedtuple ``(values, indices)`` where ``values`` is the minimum +value of each row of the :attr:`input` tensor in the given dimension +:attr:`dim`. And ``indices`` is the index location of each minimum value found +(argmin). 
+ +If :attr:`keepdim` is ``True``, the output tensors are of the same size as +:attr:`input` except in the dimension :attr:`dim` where they are of size 1. +Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting in +the output tensors having 1 fewer dimension than :attr:`input`. + +.. note:: If there are multiple minimal values in a reduced row then + the indices of the first minimal value are returned. + +Args: + {input} + {dim} + {keepdim} + +Keyword args: + out (tuple, optional): the tuple of two output tensors (min, min_indices) + +Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[-0.6248, 1.1334, -1.1899, -0.2803], + [-1.4644, -0.2635, -0.3651, 0.6134], + [ 0.2457, 0.0384, 1.0128, 0.7015], + [-0.1153, 2.9849, 2.1458, 0.5788]]) + >>> torch.min(a, 1) + torch.return_types.min(values=tensor([-1.1899, -1.4644, 0.0384, -0.1153]), indices=tensor([2, 0, 1, 0])) + +.. function:: min(input, other, *, out=None) -> Tensor + :noindex: + +See :func:`torch.minimum`. +""".format( + **single_dim_common + ), +) + +add_docstr( + torch.minimum, + r""" +minimum(input, other, *, out=None) -> Tensor + +Computes the element-wise minimum of :attr:`input` and :attr:`other`. + +.. note:: + If one of the elements being compared is a NaN, then that element is returned. + :func:`minimum` is not supported for tensors with complex dtypes. + +Args: + {input} + other (Tensor): the second input tensor + +Keyword args: + {out} + +Example:: + + >>> a = torch.tensor((1, 2, -1)) + >>> b = torch.tensor((3, 0, 4)) + >>> torch.minimum(a, b) + tensor([1, 0, -1]) +""".format( + **common_args + ), +) + +add_docstr( + torch.fmin, + r""" +fmin(input, other, *, out=None) -> Tensor + +Computes the element-wise minimum of :attr:`input` and :attr:`other`. + +This is like :func:`torch.minimum` except it handles NaNs differently: +if exactly one of the two elements being compared is a NaN then the non-NaN element is taken as the minimum. +Only if both elements are NaN is NaN propagated. + +This function is a wrapper around C++'s ``std::fmin`` and is similar to NumPy's ``fmin`` function. + +Supports :ref:`broadcasting to a common shape `, +:ref:`type promotion `, and integer and floating-point inputs. + +Args: + {input} + other (Tensor): the second input tensor + +Keyword args: + {out} + +Example:: + + >>> a = torch.tensor([2.2, float('nan'), 2.1, float('nan')]) + >>> b = torch.tensor([-9.3, 0.1, float('nan'), float('nan')]) + >>> torch.fmin(a, b) + tensor([-9.3000, 0.1000, 2.1000, nan]) +""".format( + **common_args + ), +) + +add_docstr( + torch.amin, + r""" +amin(input, dim, keepdim=False, *, out=None) -> Tensor + +Returns the minimum value of each slice of the :attr:`input` tensor in the given +dimension(s) :attr:`dim`. + +.. note:: + The difference between ``max``/``min`` and ``amax``/``amin`` is: + - ``amax``/``amin`` supports reducing on multiple dimensions, + - ``amax``/``amin`` does not return indices, + - ``amax``/``amin`` evenly distributes gradient between equal values, + while ``max(dim)``/``min(dim)`` propagates gradient only to a single + index in the source tensor. 
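# Sketch of the gradient note above (illustrative, not part of the original
# docstring): with tied minima, amin splits the gradient while min(dim) sends it
# all to the single returned index.
import torch

x = torch.tensor([[2., 1., 1.]], requires_grad=True)
torch.amin(x, dim=1).sum().backward()
x.grad   # roughly tensor([[0.0000, 0.5000, 0.5000]]) -- split across ties

y = torch.tensor([[2., 1., 1.]], requires_grad=True)
torch.min(y, dim=1).values.sum().backward()
y.grad   # e.g. tensor([[0., 1., 0.]]) -- all gradient to the returned index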
+ +{keepdim_details} + +Args: + {input} + {dim} + {keepdim} + +Keyword args: + {out} + +Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.6451, -0.4866, 0.2987, -1.3312], + [-0.5744, 1.2980, 1.8397, -0.2713], + [ 0.9128, 0.9214, -1.7268, -0.2995], + [ 0.9023, 0.4853, 0.9075, -1.6165]]) + >>> torch.amin(a, 1) + tensor([-1.3312, -0.5744, -1.7268, -1.6165]) +""".format( + **multi_dim_common + ), +) + +add_docstr( + torch.aminmax, + r""" +aminmax(input, *, dim=None, keepdim=False, out=None) -> (Tensor min, Tensor max) + +Computes the minimum and maximum values of the :attr:`input` tensor. + +Args: + input (Tensor): + The input tensor + +Keyword Args: + dim (Optional[int]): + The dimension along which to compute the values. If `None`, + computes the values over the entire :attr:`input` tensor. + Default is `None`. + keepdim (bool): + If `True`, the reduced dimensions will be kept in the output + tensor as dimensions with size 1 for broadcasting, otherwise + they will be removed, as if calling (:func:`torch.squeeze`). + Default is `False`. + out (Optional[Tuple[Tensor, Tensor]]): + Optional tensors on which to write the result. Must have the same + shape and dtype as the expected output. + Default is `None`. + +Returns: + A named tuple `(min, max)` containing the minimum and maximum values. + +Raises: + RuntimeError + If any of the dimensions to compute the values over has size 0. + +.. note:: + NaN values are propagated to the output if at least one value is NaN. + +.. seealso:: + :func:`torch.amin` computes just the minimum value + :func:`torch.amax` computes just the maximum value + +Example:: + + >>> torch.aminmax(torch.tensor([1, -3, 5])) + torch.return_types.aminmax( + min=tensor(-3), + max=tensor(5)) + + >>> # aminmax propagates NaNs + >>> torch.aminmax(torch.tensor([1, -3, 5, torch.nan])) + torch.return_types.aminmax( + min=tensor(nan), + max=tensor(nan)) + + >>> t = torch.arange(10).view(2, 5) + >>> t + tensor([[0, 1, 2, 3, 4], + [5, 6, 7, 8, 9]]) + >>> t.aminmax(dim=0, keepdim=True) + torch.return_types.aminmax( + min=tensor([[0, 1, 2, 3, 4]]), + max=tensor([[5, 6, 7, 8, 9]])) +""", +) + +add_docstr( + torch.argmin, + r""" +argmin(input, dim=None, keepdim=False) -> LongTensor + +Returns the indices of the minimum value(s) of the flattened tensor or along a dimension + +This is the second value returned by :meth:`torch.min`. See its +documentation for the exact semantics of this method. + +.. note:: If there are multiple minimal values then the indices of the first minimal value are returned. + +Args: + {input} + {dim} If ``None``, the argmin of the flattened input is returned. + {keepdim} + +Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.1139, 0.2254, -0.1381, 0.3687], + [ 1.0100, -1.1975, -0.0102, -0.4732], + [-0.9240, 0.1207, -0.7506, -1.0213], + [ 1.7809, -1.2960, 0.9384, 0.1438]]) + >>> torch.argmin(a) + tensor(13) + >>> torch.argmin(a, dim=1) + tensor([ 2, 1, 3, 1]) + >>> torch.argmin(a, dim=1, keepdim=True) + tensor([[2], + [1], + [3], + [1]]) +""".format( + **single_dim_common + ), +) + +add_docstr( + torch.mm, + r""" +mm(input, mat2, *, out=None) -> Tensor + +Performs a matrix multiplication of the matrices :attr:`input` and :attr:`mat2`. + +If :attr:`input` is a :math:`(n \times m)` tensor, :attr:`mat2` is a +:math:`(m \times p)` tensor, :attr:`out` will be a :math:`(n \times p)` tensor. + +.. note:: This function does not :ref:`broadcast `. + For broadcasting matrix products, see :func:`torch.matmul`. 
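# Sketch (not part of the original docstring): mm agrees with matmul for plain
# 2-D inputs, but unlike matmul it accepts no batch dimensions.
import torch
a, b = torch.randn(2, 3), torch.randn(3, 4)
assert torch.allclose(torch.mm(a, b), torch.matmul(a, b))
# torch.mm(torch.randn(5, 2, 3), torch.randn(5, 3, 4))  # error; use matmul or bmm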
+ +Supports strided and sparse 2-D tensors as inputs, autograd with +respect to strided inputs. + +This operation has support for arguments with :ref:`sparse layouts`. +If :attr:`out` is provided it's layout will be used. Otherwise, the result +layout will be deduced from that of :attr:`input`. + +{sparse_beta_warning} + +{tf32_note} + +{rocm_fp16_note} + +Args: + input (Tensor): the first matrix to be matrix multiplied + mat2 (Tensor): the second matrix to be matrix multiplied + +Keyword args: + {out} + +Example:: + + >>> mat1 = torch.randn(2, 3) + >>> mat2 = torch.randn(3, 3) + >>> torch.mm(mat1, mat2) + tensor([[ 0.4851, 0.5037, -0.3633], + [-0.0760, -3.6705, 2.4784]]) +""".format( + **common_args, **tf32_notes, **rocm_fp16_notes, **sparse_support_notes + ), +) + +add_docstr( + torch.hspmm, + r""" +hspmm(mat1, mat2, *, out=None) -> Tensor + +Performs a matrix multiplication of a :ref:`sparse COO matrix +` :attr:`mat1` and a strided matrix :attr:`mat2`. The +result is a (1 + 1)-dimensional :ref:`hybrid COO matrix +`. + +Args: + mat1 (Tensor): the first sparse matrix to be matrix multiplied + mat2 (Tensor): the second strided matrix to be matrix multiplied + +Keyword args: + {out} +""".format( + **common_args + ), +) + +add_docstr( + torch.matmul, + r""" +matmul(input, other, *, out=None) -> Tensor + +Matrix product of two tensors. + +The behavior depends on the dimensionality of the tensors as follows: + +- If both tensors are 1-dimensional, the dot product (scalar) is returned. +- If both arguments are 2-dimensional, the matrix-matrix product is returned. +- If the first argument is 1-dimensional and the second argument is 2-dimensional, + a 1 is prepended to its dimension for the purpose of the matrix multiply. + After the matrix multiply, the prepended dimension is removed. +- If the first argument is 2-dimensional and the second argument is 1-dimensional, + the matrix-vector product is returned. +- If both arguments are at least 1-dimensional and at least one argument is + N-dimensional (where N > 2), then a batched matrix multiply is returned. If the first + argument is 1-dimensional, a 1 is prepended to its dimension for the purpose of the + batched matrix multiply and removed after. If the second argument is 1-dimensional, a + 1 is appended to its dimension for the purpose of the batched matrix multiple and removed after. + The non-matrix (i.e. batch) dimensions are :ref:`broadcasted ` (and thus + must be broadcastable). For example, if :attr:`input` is a + :math:`(j \times 1 \times n \times n)` tensor and :attr:`other` is a :math:`(k \times n \times n)` + tensor, :attr:`out` will be a :math:`(j \times k \times n \times n)` tensor. + + Note that the broadcasting logic only looks at the batch dimensions when determining if the inputs + are broadcastable, and not the matrix dimensions. For example, if :attr:`input` is a + :math:`(j \times 1 \times n \times m)` tensor and :attr:`other` is a :math:`(k \times m \times p)` + tensor, these inputs are valid for broadcasting even though the final two dimensions (i.e. the + matrix dimensions) are different. :attr:`out` will be a :math:`(j \times k \times n \times p)` tensor. + +This operation has support for arguments with :ref:`sparse layouts`. In particular the +matrix-matrix (both arguments 2-dimensional) supports sparse arguments with the same restrictions +as :func:`torch.mm` + +{sparse_beta_warning} + +{tf32_note} + +{rocm_fp16_note} + +.. 
note:: + + The 1-dimensional dot product version of this function does not support an :attr:`out` parameter. + +Arguments: + input (Tensor): the first tensor to be multiplied + other (Tensor): the second tensor to be multiplied + +Keyword args: + {out} + +Example:: + + >>> # vector x vector + >>> tensor1 = torch.randn(3) + >>> tensor2 = torch.randn(3) + >>> torch.matmul(tensor1, tensor2).size() + torch.Size([]) + >>> # matrix x vector + >>> tensor1 = torch.randn(3, 4) + >>> tensor2 = torch.randn(4) + >>> torch.matmul(tensor1, tensor2).size() + torch.Size([3]) + >>> # batched matrix x broadcasted vector + >>> tensor1 = torch.randn(10, 3, 4) + >>> tensor2 = torch.randn(4) + >>> torch.matmul(tensor1, tensor2).size() + torch.Size([10, 3]) + >>> # batched matrix x batched matrix + >>> tensor1 = torch.randn(10, 3, 4) + >>> tensor2 = torch.randn(10, 4, 5) + >>> torch.matmul(tensor1, tensor2).size() + torch.Size([10, 3, 5]) + >>> # batched matrix x broadcasted matrix + >>> tensor1 = torch.randn(10, 3, 4) + >>> tensor2 = torch.randn(4, 5) + >>> torch.matmul(tensor1, tensor2).size() + torch.Size([10, 3, 5]) + +""".format( + **common_args, **tf32_notes, **rocm_fp16_notes, **sparse_support_notes + ), +) + +add_docstr( + torch.mode, + r""" +mode(input, dim=-1, keepdim=False, *, out=None) -> (Tensor, LongTensor) + +Returns a namedtuple ``(values, indices)`` where ``values`` is the mode +value of each row of the :attr:`input` tensor in the given dimension +:attr:`dim`, i.e. a value which appears most often +in that row, and ``indices`` is the index location of each mode value found. + +By default, :attr:`dim` is the last dimension of the :attr:`input` tensor. + +If :attr:`keepdim` is ``True``, the output tensors are of the same size as +:attr:`input` except in the dimension :attr:`dim` where they are of size 1. +Otherwise, :attr:`dim` is squeezed (see :func:`torch.squeeze`), resulting +in the output tensors having 1 fewer dimension than :attr:`input`. + +.. note:: This function is not defined for ``torch.cuda.Tensor`` yet. + +Args: + {input} + {dim} + {keepdim} + +Keyword args: + out (tuple, optional): the result tuple of two output tensors (values, indices) + +Example:: + + >>> b = torch.tensor( + [[0, 0, 0, 2, 0, 0, 2], + [0, 3, 0, 0, 2, 0, 1], + [2, 2, 2, 0, 0, 0, 3], + [2, 2, 3, 0, 1, 1, 0], + [1, 1, 0, 0, 2, 0, 2]]) + >>> torch.mode(b, 0) + torch.return_types.mode( + values=tensor([0, 2, 0, 0, 0, 0, 2]), + indices=tensor([1, 3, 4, 4, 2, 4, 4])) +""".format( + **single_dim_common + ), +) + +add_docstr( + torch.mul, + r""" +mul(input, other, *, out=None) -> Tensor + +Multiplies :attr:`input` by :attr:`other`. + + +.. math:: + \text{out}_i = \text{input}_i \times \text{other}_i +""" + + r""" + +Supports :ref:`broadcasting to a common shape `, +:ref:`type promotion `, and integer, float, and complex inputs. + +Args: + {input} + other (Tensor or Number) - the tensor or number to multiply input by. 
+ +Keyword args: + {out} + +Examples:: + + >>> a = torch.randn(3) + >>> a + tensor([ 0.2015, -0.4255, 2.6087]) + >>> torch.mul(a, 100) + tensor([ 20.1494, -42.5491, 260.8663]) + + >>> b = torch.randn(4, 1) + >>> b + tensor([[ 1.1207], + [-0.3137], + [ 0.0700], + [ 0.8378]]) + >>> c = torch.randn(1, 4) + >>> c + tensor([[ 0.5146, 0.1216, -0.5244, 2.2382]]) + >>> torch.mul(b, c) + tensor([[ 0.5767, 0.1363, -0.5877, 2.5083], + [-0.1614, -0.0382, 0.1645, -0.7021], + [ 0.0360, 0.0085, -0.0367, 0.1567], + [ 0.4312, 0.1019, -0.4394, 1.8753]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.multiply, + r""" +multiply(input, other, *, out=None) + +Alias for :func:`torch.mul`. +""", +) + +add_docstr( + torch.multinomial, + r""" +multinomial(input, num_samples, replacement=False, *, generator=None, out=None) -> LongTensor + +Returns a tensor where each row contains :attr:`num_samples` indices sampled +from the multinomial (a stricter definition would be multivariate, +refer to torch.distributions.multinomial.Multinomial for more details) +probability distribution located in the corresponding row +of tensor :attr:`input`. + +.. note:: + The rows of :attr:`input` do not need to sum to one (in which case we use + the values as weights), but must be non-negative, finite and have + a non-zero sum. + +Indices are ordered from left to right according to when each was sampled +(first samples are placed in first column). + +If :attr:`input` is a vector, :attr:`out` is a vector of size :attr:`num_samples`. + +If :attr:`input` is a matrix with `m` rows, :attr:`out` is an matrix of shape +:math:`(m \times \text{{num\_samples}})`. + +If replacement is ``True``, samples are drawn with replacement. + +If not, they are drawn without replacement, which means that when a +sample index is drawn for a row, it cannot be drawn again for that row. + +.. note:: + When drawn without replacement, :attr:`num_samples` must be lower than + number of non-zero elements in :attr:`input` (or the min number of non-zero + elements in each row of :attr:`input` if it is a matrix). + +Args: + input (Tensor): the input tensor containing probabilities + num_samples (int): number of samples to draw + replacement (bool, optional): whether to draw with replacement or not + +Keyword args: + {generator} + {out} + +Example:: + + >>> weights = torch.tensor([0, 10, 3, 0], dtype=torch.float) # create a tensor of weights + >>> torch.multinomial(weights, 2) + tensor([1, 2]) + >>> torch.multinomial(weights, 4) # ERROR! + RuntimeError: invalid argument 2: invalid multinomial distribution (with replacement=False, + not enough non-negative category to sample) at ../aten/src/TH/generic/THTensorRandom.cpp:320 + >>> torch.multinomial(weights, 4, replacement=True) + tensor([ 2, 1, 1, 1]) +""".format( + **common_args + ), +) + +add_docstr( + torch.mv, + r""" +mv(input, vec, *, out=None) -> Tensor + +Performs a matrix-vector product of the matrix :attr:`input` and the vector +:attr:`vec`. + +If :attr:`input` is a :math:`(n \times m)` tensor, :attr:`vec` is a 1-D tensor of +size :math:`m`, :attr:`out` will be 1-D of size :math:`n`. + +.. note:: This function does not :ref:`broadcast `. 
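# Small sketch (not from the original file): for a single matrix and vector,
# mv agrees with matmul; batched inputs need matmul, since mv does not broadcast.
import torch
mat = torch.randn(2, 3)
vec = torch.randn(3)
assert torch.allclose(torch.mv(mat, vec), torch.matmul(mat, vec))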
+ +Args: + input (Tensor): matrix to be multiplied + vec (Tensor): vector to be multiplied + +Keyword args: + {out} + +Example:: + + >>> mat = torch.randn(2, 3) + >>> vec = torch.randn(3) + >>> torch.mv(mat, vec) + tensor([ 1.0404, -0.6361]) +""".format( + **common_args + ), +) + +add_docstr( + torch.mvlgamma, + r""" +mvlgamma(input, p, *, out=None) -> Tensor + +Alias for :func:`torch.special.multigammaln`. +""", +) + +add_docstr( + torch.movedim, + r""" +movedim(input, source, destination) -> Tensor + +Moves the dimension(s) of :attr:`input` at the position(s) in :attr:`source` +to the position(s) in :attr:`destination`. + +Other dimensions of :attr:`input` that are not explicitly moved remain in +their original order and appear at the positions not specified in :attr:`destination`. + +Args: + {input} + source (int or tuple of ints): Original positions of the dims to move. These must be unique. + destination (int or tuple of ints): Destination positions for each of the original dims. These must also be unique. + +Examples:: + + >>> t = torch.randn(3,2,1) + >>> t + tensor([[[-0.3362], + [-0.8437]], + + [[-0.9627], + [ 0.1727]], + + [[ 0.5173], + [-0.1398]]]) + >>> torch.movedim(t, 1, 0).shape + torch.Size([2, 3, 1]) + >>> torch.movedim(t, 1, 0) + tensor([[[-0.3362], + [-0.9627], + [ 0.5173]], + + [[-0.8437], + [ 0.1727], + [-0.1398]]]) + >>> torch.movedim(t, (1, 2), (0, 1)).shape + torch.Size([2, 1, 3]) + >>> torch.movedim(t, (1, 2), (0, 1)) + tensor([[[-0.3362, -0.9627, 0.5173]], + + [[-0.8437, 0.1727, -0.1398]]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.moveaxis, + r""" +moveaxis(input, source, destination) -> Tensor + +Alias for :func:`torch.movedim`. + +This function is equivalent to NumPy's moveaxis function. + +Examples:: + + >>> t = torch.randn(3,2,1) + >>> t + tensor([[[-0.3362], + [-0.8437]], + + [[-0.9627], + [ 0.1727]], + + [[ 0.5173], + [-0.1398]]]) + >>> torch.moveaxis(t, 1, 0).shape + torch.Size([2, 3, 1]) + >>> torch.moveaxis(t, 1, 0) + tensor([[[-0.3362], + [-0.9627], + [ 0.5173]], + + [[-0.8437], + [ 0.1727], + [-0.1398]]]) + >>> torch.moveaxis(t, (1, 2), (0, 1)).shape + torch.Size([2, 1, 3]) + >>> torch.moveaxis(t, (1, 2), (0, 1)) + tensor([[[-0.3362, -0.9627, 0.5173]], + + [[-0.8437, 0.1727, -0.1398]]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.swapdims, + r""" +swapdims(input, dim0, dim1) -> Tensor + +Alias for :func:`torch.transpose`. + +This function is equivalent to NumPy's swapaxes function. + +Examples:: + + >>> x = torch.tensor([[[0,1],[2,3]],[[4,5],[6,7]]]) + >>> x + tensor([[[0, 1], + [2, 3]], + + [[4, 5], + [6, 7]]]) + >>> torch.swapdims(x, 0, 1) + tensor([[[0, 1], + [4, 5]], + + [[2, 3], + [6, 7]]]) + >>> torch.swapdims(x, 0, 2) + tensor([[[0, 4], + [2, 6]], + + [[1, 5], + [3, 7]]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.swapaxes, + r""" +swapaxes(input, axis0, axis1) -> Tensor + +Alias for :func:`torch.transpose`. + +This function is equivalent to NumPy's swapaxes function. + +Examples:: + + >>> x = torch.tensor([[[0,1],[2,3]],[[4,5],[6,7]]]) + >>> x + tensor([[[0, 1], + [2, 3]], + + [[4, 5], + [6, 7]]]) + >>> torch.swapaxes(x, 0, 1) + tensor([[[0, 1], + [4, 5]], + + [[2, 3], + [6, 7]]]) + >>> torch.swapaxes(x, 0, 2) + tensor([[[0, 4], + [2, 6]], + + [[1, 5], + [3, 7]]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.narrow, + r""" +narrow(input, dim, start, length) -> Tensor + +Returns a new tensor that is a narrowed version of :attr:`input` tensor. 
The +dimension :attr:`dim` is input from :attr:`start` to ``start + length``. The +returned tensor and :attr:`input` tensor share the same underlying storage. + +Args: + input (Tensor): the tensor to narrow + dim (int): the dimension along which to narrow + start (int or Tensor): index of the element to start the narrowed dimension + from. Can be negative, which means indexing from the end of `dim`. If + `Tensor`, it must be an 0-dim integral `Tensor` (bools not allowed) + length (int): length of the narrowed dimension, must be weakly positive + +Example:: + + >>> x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + >>> torch.narrow(x, 0, 0, 2) + tensor([[ 1, 2, 3], + [ 4, 5, 6]]) + >>> torch.narrow(x, 1, 1, 2) + tensor([[ 2, 3], + [ 5, 6], + [ 8, 9]]) + >>> torch.narrow(x, -1, torch.tensor(-1), 1) + tensor([[3], + [6], + [9]]) +""", +) + +add_docstr( + torch.narrow_copy, + r""" +narrow_copy(input, dim, start, length, *, out=None) -> Tensor + +Same as :meth:`Tensor.narrow` except this returns a copy rather +than shared storage. This is primarily for sparse tensors, which +do not have a shared-storage narrow method. + +Args: + input (Tensor): the tensor to narrow + dim (int): the dimension along which to narrow + start (int): index of the element to start the narrowed dimension from. Can + be negative, which means indexing from the end of `dim` + length (int): length of the narrowed dimension, must be weakly positive + +Keyword args: + {out} + +Example:: + + >>> x = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + >>> torch.narrow_copy(x, 0, 0, 2) + tensor([[ 1, 2, 3], + [ 4, 5, 6]]) + >>> torch.narrow_copy(x, 1, 1, 2) + tensor([[ 2, 3], + [ 5, 6], + [ 8, 9]]) + >>> s = torch.arange(16).reshape(2, 2, 2, 2).to_sparse(2) + >>> torch.narrow_copy(s, 0, 0, 1) + tensor(indices=tensor([[0, 0], + [0, 1]]), + values=tensor([[[0, 1], + [2, 3]], + + [[4, 5], + [6, 7]]]), + size=(1, 2, 2, 2), nnz=2, layout=torch.sparse_coo) + +.. seealso:: + + :func:`torch.narrow` for a non copy variant + +""".format( + **common_args + ), +) + +add_docstr( + torch.nan_to_num, + r""" +nan_to_num(input, nan=0.0, posinf=None, neginf=None, *, out=None) -> Tensor + +Replaces :literal:`NaN`, positive infinity, and negative infinity values in :attr:`input` +with the values specified by :attr:`nan`, :attr:`posinf`, and :attr:`neginf`, respectively. +By default, :literal:`NaN`\ s are replaced with zero, positive infinity is replaced with the +greatest finite value representable by :attr:`input`'s dtype, and negative infinity +is replaced with the least finite value representable by :attr:`input`'s dtype. + +Args: + {input} + nan (Number, optional): the value to replace :literal:`NaN`\s with. Default is zero. + posinf (Number, optional): if a Number, the value to replace positive infinity values with. + If None, positive infinity values are replaced with the greatest finite value representable by :attr:`input`'s dtype. + Default is None. + neginf (Number, optional): if a Number, the value to replace negative infinity values with. + If None, negative infinity values are replaced with the lowest finite value representable by :attr:`input`'s dtype. + Default is None. 
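# Sketch (not part of the original docstring): the default replacements come
# straight from the dtype's finite range, as described above.
import torch
x = torch.tensor([float('nan'), float('inf'), float('-inf')])
out = torch.nan_to_num(x)
assert out[0] == 0
assert out[1] == torch.finfo(x.dtype).max   # greatest finite float32 value
assert out[2] == torch.finfo(x.dtype).min   # least finite float32 value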
+ +Keyword args: + {out} + +Example:: + + >>> x = torch.tensor([float('nan'), float('inf'), -float('inf'), 3.14]) + >>> torch.nan_to_num(x) + tensor([ 0.0000e+00, 3.4028e+38, -3.4028e+38, 3.1400e+00]) + >>> torch.nan_to_num(x, nan=2.0) + tensor([ 2.0000e+00, 3.4028e+38, -3.4028e+38, 3.1400e+00]) + >>> torch.nan_to_num(x, nan=2.0, posinf=1.0) + tensor([ 2.0000e+00, 1.0000e+00, -3.4028e+38, 3.1400e+00]) + +""".format( + **common_args + ), +) + +add_docstr( + torch.ne, + r""" +ne(input, other, *, out=None) -> Tensor + +Computes :math:`\text{input} \neq \text{other}` element-wise. +""" + + r""" + +The second argument can be a number or a tensor whose shape is +:ref:`broadcastable ` with the first argument. + +Args: + input (Tensor): the tensor to compare + other (Tensor or float): the tensor or value to compare + +Keyword args: + {out} + +Returns: + A boolean tensor that is True where :attr:`input` is not equal to :attr:`other` and False elsewhere + +Example:: + + >>> torch.ne(torch.tensor([[1, 2], [3, 4]]), torch.tensor([[1, 1], [4, 4]])) + tensor([[False, True], [True, False]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.not_equal, + r""" +not_equal(input, other, *, out=None) -> Tensor + +Alias for :func:`torch.ne`. +""", +) + +add_docstr( + torch.neg, + r""" +neg(input, *, out=None) -> Tensor + +Returns a new tensor with the negative of the elements of :attr:`input`. + +.. math:: + \text{out} = -1 \times \text{input} +""" + + r""" +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> a = torch.randn(5) + >>> a + tensor([ 0.0090, -0.2262, -0.0682, -0.2866, 0.3940]) + >>> torch.neg(a) + tensor([-0.0090, 0.2262, 0.0682, 0.2866, -0.3940]) +""".format( + **common_args + ), +) + +add_docstr( + torch.negative, + r""" +negative(input, *, out=None) -> Tensor + +Alias for :func:`torch.neg` +""", +) + +add_docstr( + torch.nextafter, + r""" +nextafter(input, other, *, out=None) -> Tensor + +Return the next floating-point value after :attr:`input` towards :attr:`other`, elementwise. + +The shapes of ``input`` and ``other`` must be +:ref:`broadcastable `. + +Args: + input (Tensor): the first input tensor + other (Tensor): the second input tensor + +Keyword args: + {out} + +Example:: + + >>> eps = torch.finfo(torch.float32).eps + >>> torch.nextafter(torch.tensor([1.0, 2.0]), torch.tensor([2.0, 1.0])) == torch.tensor([eps + 1, 2 - eps]) + tensor([True, True]) + +""".format( + **common_args + ), +) + +add_docstr( + torch.nonzero, + r""" +nonzero(input, *, out=None, as_tuple=False) -> LongTensor or tuple of LongTensors + +.. note:: + :func:`torch.nonzero(..., as_tuple=False) ` (default) returns a + 2-D tensor where each row is the index for a nonzero value. + + :func:`torch.nonzero(..., as_tuple=True) ` returns a tuple of 1-D + index tensors, allowing for advanced indexing, so ``x[x.nonzero(as_tuple=True)]`` + gives all nonzero values of tensor ``x``. Of the returned tuple, each index tensor + contains nonzero indices for a certain dimension. + + See below for more details on the two behaviors. + + When :attr:`input` is on CUDA, :func:`torch.nonzero() ` causes + host-device synchronization. + +**When** :attr:`as_tuple` **is** ``False`` **(default)**: + +Returns a tensor containing the indices of all non-zero elements of +:attr:`input`. Each row in the result contains the indices of a non-zero +element in :attr:`input`. The result is sorted lexicographically, with +the last index changing the fastest (C-style). 
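# Sketch of the as_tuple note above (not part of the original docstring): the
# tuple form can be used directly for advanced indexing to recover the values.
import torch
x = torch.tensor([[0., 1.], [2., 0.]])
idx = torch.nonzero(x, as_tuple=True)   # (tensor([0, 1]), tensor([1, 0]))
x[idx]                                  # tensor([1., 2.])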
+ +If :attr:`input` has :math:`n` dimensions, then the resulting indices tensor +:attr:`out` is of size :math:`(z \times n)`, where :math:`z` is the total number of +non-zero elements in the :attr:`input` tensor. + +**When** :attr:`as_tuple` **is** ``True``: + +Returns a tuple of 1-D tensors, one for each dimension in :attr:`input`, +each containing the indices (in that dimension) of all non-zero elements of +:attr:`input` . + +If :attr:`input` has :math:`n` dimensions, then the resulting tuple contains :math:`n` +tensors of size :math:`z`, where :math:`z` is the total number of +non-zero elements in the :attr:`input` tensor. + +As a special case, when :attr:`input` has zero dimensions and a nonzero scalar +value, it is treated as a one-dimensional tensor with one element. + +Args: + {input} + +Keyword args: + out (LongTensor, optional): the output tensor containing indices + +Returns: + LongTensor or tuple of LongTensor: If :attr:`as_tuple` is ``False``, the output + tensor containing indices. If :attr:`as_tuple` is ``True``, one 1-D tensor for + each dimension, containing the indices of each nonzero element along that + dimension. + +Example:: + + >>> torch.nonzero(torch.tensor([1, 1, 1, 0, 1])) + tensor([[ 0], + [ 1], + [ 2], + [ 4]]) + >>> torch.nonzero(torch.tensor([[0.6, 0.0, 0.0, 0.0], + ... [0.0, 0.4, 0.0, 0.0], + ... [0.0, 0.0, 1.2, 0.0], + ... [0.0, 0.0, 0.0,-0.4]])) + tensor([[ 0, 0], + [ 1, 1], + [ 2, 2], + [ 3, 3]]) + >>> torch.nonzero(torch.tensor([1, 1, 1, 0, 1]), as_tuple=True) + (tensor([0, 1, 2, 4]),) + >>> torch.nonzero(torch.tensor([[0.6, 0.0, 0.0, 0.0], + ... [0.0, 0.4, 0.0, 0.0], + ... [0.0, 0.0, 1.2, 0.0], + ... [0.0, 0.0, 0.0,-0.4]]), as_tuple=True) + (tensor([0, 1, 2, 3]), tensor([0, 1, 2, 3])) + >>> torch.nonzero(torch.tensor(5), as_tuple=True) + (tensor([0]),) +""".format( + **common_args + ), +) + +add_docstr( + torch.normal, + r""" +normal(mean, std, *, generator=None, out=None) -> Tensor + +Returns a tensor of random numbers drawn from separate normal distributions +whose mean and standard deviation are given. + +The :attr:`mean` is a tensor with the mean of +each output element's normal distribution + +The :attr:`std` is a tensor with the standard deviation of +each output element's normal distribution + +The shapes of :attr:`mean` and :attr:`std` don't need to match, but the +total number of elements in each tensor need to be the same. + +.. note:: When the shapes do not match, the shape of :attr:`mean` + is used as the shape for the returned output tensor + +.. note:: When :attr:`std` is a CUDA tensor, this function synchronizes + its device with the CPU. + +Args: + mean (Tensor): the tensor of per-element means + std (Tensor): the tensor of per-element standard deviations + +Keyword args: + {generator} + {out} + +Example:: + + >>> torch.normal(mean=torch.arange(1., 11.), std=torch.arange(1, 0, -0.1)) + tensor([ 1.0425, 3.5672, 2.7969, 4.2925, 4.7229, 6.2134, + 8.0505, 8.1408, 9.0563, 10.0566]) + +.. function:: normal(mean=0.0, std, *, out=None) -> Tensor + :noindex: + +Similar to the function above, but the means are shared among all drawn +elements. + +Args: + mean (float, optional): the mean for all distributions + std (Tensor): the tensor of per-element standard deviations + +Keyword args: + {out} + +Example:: + + >>> torch.normal(mean=0.5, std=torch.arange(1., 6.)) + tensor([-1.2793, -1.0732, -2.0687, 5.1177, -1.2303]) + +.. 
function:: normal(mean, std=1.0, *, out=None) -> Tensor + :noindex: + +Similar to the function above, but the standard deviations are shared among +all drawn elements. + +Args: + mean (Tensor): the tensor of per-element means + std (float, optional): the standard deviation for all distributions + +Keyword args: + out (Tensor, optional): the output tensor + +Example:: + + >>> torch.normal(mean=torch.arange(1., 6.)) + tensor([ 1.1552, 2.6148, 2.6535, 5.8318, 4.2361]) + +.. function:: normal(mean, std, size, *, out=None) -> Tensor + :noindex: + +Similar to the function above, but the means and standard deviations are shared +among all drawn elements. The resulting tensor has size given by :attr:`size`. + +Args: + mean (float): the mean for all distributions + std (float): the standard deviation for all distributions + size (int...): a sequence of integers defining the shape of the output tensor. + +Keyword args: + {out} + +Example:: + + >>> torch.normal(2, 3, size=(1, 4)) + tensor([[-1.3987, -1.9544, 3.6048, 0.7909]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.numel, + r""" +numel(input) -> int + +Returns the total number of elements in the :attr:`input` tensor. + +Args: + {input} + +Example:: + + >>> a = torch.randn(1, 2, 3, 4, 5) + >>> torch.numel(a) + 120 + >>> a = torch.zeros(4,4) + >>> torch.numel(a) + 16 + +""".format( + **common_args + ), +) + +add_docstr( + torch.ones, + r""" +ones(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + +Returns a tensor filled with the scalar value `1`, with the shape defined +by the variable argument :attr:`size`. + +Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + +Keyword arguments: + {out} + {dtype} + {layout} + {device} + {requires_grad} + +Example:: + + >>> torch.ones(2, 3) + tensor([[ 1., 1., 1.], + [ 1., 1., 1.]]) + + >>> torch.ones(5) + tensor([ 1., 1., 1., 1., 1.]) + +""".format( + **factory_common_args + ), +) + +add_docstr( + torch.ones_like, + r""" +ones_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor + +Returns a tensor filled with the scalar value `1`, with the same size as +:attr:`input`. ``torch.ones_like(input)`` is equivalent to +``torch.ones(input.size(), dtype=input.dtype, layout=input.layout, device=input.device)``. + +.. warning:: + As of 0.4, this function does not support an :attr:`out` keyword. As an alternative, + the old ``torch.ones_like(input, out=output)`` is equivalent to + ``torch.ones(input.size(), out=output)``. + +Args: + {input} + +Keyword arguments: + {dtype} + {layout} + {device} + {requires_grad} + {memory_format} + +Example:: + + >>> input = torch.empty(2, 3) + >>> torch.ones_like(input) + tensor([[ 1., 1., 1.], + [ 1., 1., 1.]]) +""".format( + **factory_like_common_args + ), +) + +add_docstr( + torch.orgqr, + r""" +orgqr(input, tau) -> Tensor + +Alias for :func:`torch.linalg.householder_product`. +""", +) + +add_docstr( + torch.ormqr, + r""" +ormqr(input, tau, other, left=True, transpose=False, *, out=None) -> Tensor + +Computes the matrix-matrix multiplication of a product of Householder matrices with a general matrix. + +Multiplies a :math:`m \times n` matrix `C` (given by :attr:`other`) with a matrix `Q`, +where `Q` is represented using Householder reflectors `(input, tau)`. +See `Representation of Orthogonal or Unitary Matrices`_ for further details. 
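# Sketch (not part of the original docstring): combining geqrf with ormqr applies
# the implicit Q (here its transpose) without ever materializing Q -- Q^T A
# recovers the R factor stored in the upper triangle of the geqrf output.
import torch

A = torch.randn(4, 3)
h, tau = torch.geqrf(A)                        # Householder representation of Q; R in triu(h)
QtA = torch.ormqr(h, tau, A, transpose=True)   # left=True (default), op(Q) = Q^T
assert torch.allclose(QtA, h.triu(), atol=1e-5)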
+
+If :attr:`left` is `True` then `op(Q)` times `C` is computed, otherwise the result is `C` times `op(Q)`.
+When :attr:`left` is `True`, the implicit matrix `Q` has size :math:`m \times m`.
+It has size :math:`n \times n` otherwise.
+If :attr:`transpose` is `True` then `op` is the conjugate transpose operation, otherwise it's a no-op.
+
+Supports inputs of float, double, cfloat and cdouble dtypes.
+Also supports batched inputs, and, if the input is batched, the output is batched with the same dimensions.
+
+.. seealso::
+        :func:`torch.geqrf` can be used to form the Householder representation `(input, tau)` of matrix `Q`
+        from the QR decomposition.
+
+.. note::
+    This function supports backward but it is only fast when ``(input, tau)`` do not require gradients
+    and/or ``tau.size(-1)`` is very small.
+
+Args:
+    input (Tensor): tensor of shape `(*, mn, k)` where `*` is zero or more batch dimensions
+                    and `mn` equals `m` or `n` depending on :attr:`left`.
+    tau (Tensor): tensor of shape `(*, min(mn, k))` where `*` is zero or more batch dimensions.
+    other (Tensor): tensor of shape `(*, m, n)` where `*` is zero or more batch dimensions.
+    left (bool): controls the order of multiplication.
+    transpose (bool): controls whether the matrix `Q` is conjugate transposed or not.
+
+Keyword args:
+    out (Tensor, optional): the output Tensor. Ignored if `None`. Default: `None`.
+
+.. _Representation of Orthogonal or Unitary Matrices:
+    https://www.netlib.org/lapack/lug/node128.html
+""",
+)
+
+add_docstr(
+    torch.permute,
+    r"""
+permute(input, dims) -> Tensor
+
+Returns a view of the original tensor :attr:`input` with its dimensions permuted.
+
+Args:
+    {input}
+    dims (tuple of int): The desired ordering of dimensions
+
+Example::
+
+    >>> x = torch.randn(2, 3, 5)
+    >>> x.size()
+    torch.Size([2, 3, 5])
+    >>> torch.permute(x, (2, 0, 1)).size()
+    torch.Size([5, 2, 3])
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr(
+    torch.poisson,
+    r"""
+poisson(input, generator=None) -> Tensor
+
+Returns a tensor of the same size as :attr:`input` with each element
+sampled from a Poisson distribution with rate parameter given by the corresponding
+element in :attr:`input` i.e.,
+
+.. math::
+    \text{{out}}_i \sim \text{{Poisson}}(\text{{input}}_i)
+
+:attr:`input` must be non-negative.
+
+Args:
+    input (Tensor): the input tensor containing the rates of the Poisson distribution
+
+Keyword args:
+    {generator}
+
+Example::
+
+    >>> rates = torch.rand(4, 4) * 5  # rate parameter between 0 and 5
+    >>> torch.poisson(rates)
+    tensor([[9., 1., 3., 5.],
+            [8., 6., 6., 0.],
+            [0., 4., 5., 3.],
+            [2., 1., 4., 2.]])
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr(
+    torch.polygamma,
+    r"""
+polygamma(n, input, *, out=None) -> Tensor
+
+Alias for :func:`torch.special.polygamma`.
+""",
+)
+
+add_docstr(
+    torch.positive,
+    r"""
+positive(input) -> Tensor
+
+Returns :attr:`input`.
+Throws a runtime error if :attr:`input` is a bool tensor.
+"""
+    + r"""
+Args:
+    {input}
+
+Example::
+
+    >>> t = torch.randn(5)
+    >>> t
+    tensor([ 0.0090, -0.2262, -0.0682, -0.2866,  0.3940])
+    >>> torch.positive(t)
+    tensor([ 0.0090, -0.2262, -0.0682, -0.2866,  0.3940])
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr(
+    torch.pow,
+    r"""
+pow(input, exponent, *, out=None) -> Tensor
+
+Takes the power of each element in :attr:`input` with :attr:`exponent` and
+returns a tensor with the result.
+
+:attr:`exponent` can be either a single ``float`` number or a `Tensor`
+with the same number of elements as :attr:`input`.
+ +When :attr:`exponent` is a scalar value, the operation applied is: + +.. math:: + \text{out}_i = x_i ^ \text{exponent} + +When :attr:`exponent` is a tensor, the operation applied is: + +.. math:: + \text{out}_i = x_i ^ {\text{exponent}_i} +""" + + r""" +When :attr:`exponent` is a tensor, the shapes of :attr:`input` +and :attr:`exponent` must be :ref:`broadcastable `. + +Args: + {input} + exponent (float or tensor): the exponent value + +Keyword args: + {out} + +Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.4331, 1.2475, 0.6834, -0.2791]) + >>> torch.pow(a, 2) + tensor([ 0.1875, 1.5561, 0.4670, 0.0779]) + >>> exp = torch.arange(1., 5.) + + >>> a = torch.arange(1., 5.) + >>> a + tensor([ 1., 2., 3., 4.]) + >>> exp + tensor([ 1., 2., 3., 4.]) + >>> torch.pow(a, exp) + tensor([ 1., 4., 27., 256.]) + +.. function:: pow(self, exponent, *, out=None) -> Tensor + :noindex: + +:attr:`self` is a scalar ``float`` value, and :attr:`exponent` is a tensor. +The returned tensor :attr:`out` is of the same shape as :attr:`exponent` + +The operation applied is: + +.. math:: + \text{{out}}_i = \text{{self}} ^ {{\text{{exponent}}_i}} + +Args: + self (float): the scalar base value for the power operation + exponent (Tensor): the exponent tensor + +Keyword args: + {out} + +Example:: + + >>> exp = torch.arange(1., 5.) + >>> base = 2 + >>> torch.pow(base, exp) + tensor([ 2., 4., 8., 16.]) +""".format( + **common_args + ), +) + +add_docstr( + torch.float_power, + r""" +float_power(input, exponent, *, out=None) -> Tensor + +Raises :attr:`input` to the power of :attr:`exponent`, elementwise, in double precision. +If neither input is complex returns a ``torch.float64`` tensor, +and if one or more inputs is complex returns a ``torch.complex128`` tensor. + +.. note:: + This function always computes in double precision, unlike :func:`torch.pow`, + which implements more typical :ref:`type promotion `. + This is useful when the computation needs to be performed in a wider or more precise dtype, + or the results of the computation may contain fractional values not representable in the input dtypes, + like when an integer base is raised to a negative integer exponent. + +Args: + input (Tensor or Number): the base value(s) + exponent (Tensor or Number): the exponent value(s) + +Keyword args: + {out} + +Example:: + + >>> a = torch.randint(10, (4,)) + >>> a + tensor([6, 4, 7, 1]) + >>> torch.float_power(a, 2) + tensor([36., 16., 49., 1.], dtype=torch.float64) + + >>> a = torch.arange(1, 5) + >>> a + tensor([ 1, 2, 3, 4]) + >>> exp = torch.tensor([2, -3, 4, -5]) + >>> exp + tensor([ 2, -3, 4, -5]) + >>> torch.float_power(a, exp) + tensor([1.0000e+00, 1.2500e-01, 8.1000e+01, 9.7656e-04], dtype=torch.float64) +""".format( + **common_args + ), +) + +add_docstr( + torch.prod, + r""" +prod(input, *, dtype=None) -> Tensor + +Returns the product of all elements in the :attr:`input` tensor. + +Args: + {input} + +Keyword args: + {dtype} + +Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[-0.8020, 0.5428, -1.5854]]) + >>> torch.prod(a) + tensor(0.6902) + +.. function:: prod(input, dim, keepdim=False, *, dtype=None) -> Tensor + :noindex: + +Returns the product of each row of the :attr:`input` tensor in the given +dimension :attr:`dim`. 
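+
+For instance, with ``keepdim=True`` the reduced dimension is retained with size 1
+(an illustrative sketch; the printed formatting may differ slightly)::
+
+    >>> a = torch.tensor([[1., 2.], [3., 4.]])
+    >>> torch.prod(a, 1, keepdim=True)
+    tensor([[ 2.],
+            [12.]])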
+ +{keepdim_details} + +Args: + {input} + {dim} + {keepdim} + +Keyword args: + {dtype} + +Example:: + + >>> a = torch.randn(4, 2) + >>> a + tensor([[ 0.5261, -0.3837], + [ 1.1857, -0.2498], + [-1.1646, 0.0705], + [ 1.1131, -1.0629]]) + >>> torch.prod(a, 1) + tensor([-0.2018, -0.2962, -0.0821, -1.1831]) +""".format( + **single_dim_common + ), +) + +add_docstr( + torch.promote_types, + r""" +promote_types(type1, type2) -> dtype + +Returns the :class:`torch.dtype` with the smallest size and scalar kind that is +not smaller nor of lower kind than either `type1` or `type2`. See type promotion +:ref:`documentation ` for more information on the type +promotion logic. + +Args: + type1 (:class:`torch.dtype`) + type2 (:class:`torch.dtype`) + +Example:: + + >>> torch.promote_types(torch.int32, torch.float32) + torch.float32 + >>> torch.promote_types(torch.uint8, torch.long) + torch.long +""", +) + +add_docstr( + torch.qr, + r""" +qr(input, some=True, *, out=None) -> (Tensor, Tensor) + +Computes the QR decomposition of a matrix or a batch of matrices :attr:`input`, +and returns a namedtuple (Q, R) of tensors such that :math:`\text{input} = Q R` +with :math:`Q` being an orthogonal matrix or batch of orthogonal matrices and +:math:`R` being an upper triangular matrix or batch of upper triangular matrices. + +If :attr:`some` is ``True``, then this function returns the thin (reduced) QR factorization. +Otherwise, if :attr:`some` is ``False``, this function returns the complete QR factorization. + +.. warning:: + + :func:`torch.qr` is deprecated in favor of :func:`torch.linalg.qr` + and will be removed in a future PyTorch release. The boolean parameter :attr:`some` has been + replaced with a string parameter :attr:`mode`. + + ``Q, R = torch.qr(A)`` should be replaced with + + .. code:: python + + Q, R = torch.linalg.qr(A) + + ``Q, R = torch.qr(A, some=False)`` should be replaced with + + .. code:: python + + Q, R = torch.linalg.qr(A, mode="complete") + +.. warning:: + If you plan to backpropagate through QR, note that the current backward implementation + is only well-defined when the first :math:`\min(input.size(-1), input.size(-2))` + columns of :attr:`input` are linearly independent. + This behavior will probably change once QR supports pivoting. + +.. note:: This function uses LAPACK for CPU inputs and MAGMA for CUDA inputs, + and may produce different (valid) decompositions on different device types + or different platforms. + +Args: + input (Tensor): the input tensor of size :math:`(*, m, n)` where `*` is zero or more + batch dimensions consisting of matrices of dimension :math:`m \times n`. + some (bool, optional): Set to ``True`` for reduced QR decomposition and ``False`` for + complete QR decomposition. If `k = min(m, n)` then: + + * ``some=True`` : returns `(Q, R)` with dimensions (m, k), (k, n) (default) + + * ``'some=False'``: returns `(Q, R)` with dimensions (m, m), (m, n) + +Keyword args: + out (tuple, optional): tuple of `Q` and `R` tensors. + The dimensions of `Q` and `R` are detailed in the description of :attr:`some` above. 
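+
+For a tall matrix, the two modes differ only in the shapes they return (an
+illustrative sketch of the dimensions listed above)::
+
+    >>> A = torch.randn(5, 3)
+    >>> q, r = torch.qr(A)              # reduced, some=True
+    >>> q.shape, r.shape
+    (torch.Size([5, 3]), torch.Size([3, 3]))
+    >>> q, r = torch.qr(A, some=False)  # complete
+    >>> q.shape, r.shape
+    (torch.Size([5, 5]), torch.Size([5, 3]))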
+ +Example:: + + >>> a = torch.tensor([[12., -51, 4], [6, 167, -68], [-4, 24, -41]]) + >>> q, r = torch.qr(a) + >>> q + tensor([[-0.8571, 0.3943, 0.3314], + [-0.4286, -0.9029, -0.0343], + [ 0.2857, -0.1714, 0.9429]]) + >>> r + tensor([[ -14.0000, -21.0000, 14.0000], + [ 0.0000, -175.0000, 70.0000], + [ 0.0000, 0.0000, -35.0000]]) + >>> torch.mm(q, r).round() + tensor([[ 12., -51., 4.], + [ 6., 167., -68.], + [ -4., 24., -41.]]) + >>> torch.mm(q.t(), q).round() + tensor([[ 1., 0., 0.], + [ 0., 1., -0.], + [ 0., -0., 1.]]) + >>> a = torch.randn(3, 4, 5) + >>> q, r = torch.qr(a, some=False) + >>> torch.allclose(torch.matmul(q, r), a) + True + >>> torch.allclose(torch.matmul(q.mT, q), torch.eye(5)) + True +""", +) + +add_docstr( + torch.rad2deg, + r""" +rad2deg(input, *, out=None) -> Tensor + +Returns a new tensor with each of the elements of :attr:`input` +converted from angles in radians to degrees. + +Args: + {input} + +Keyword arguments: + {out} + +Example:: + + >>> a = torch.tensor([[3.142, -3.142], [6.283, -6.283], [1.570, -1.570]]) + >>> torch.rad2deg(a) + tensor([[ 180.0233, -180.0233], + [ 359.9894, -359.9894], + [ 89.9544, -89.9544]]) + +""".format( + **common_args + ), +) + +add_docstr( + torch.deg2rad, + r""" +deg2rad(input, *, out=None) -> Tensor + +Returns a new tensor with each of the elements of :attr:`input` +converted from angles in degrees to radians. + +Args: + {input} + +Keyword arguments: + {out} + +Example:: + + >>> a = torch.tensor([[180.0, -180.0], [360.0, -360.0], [90.0, -90.0]]) + >>> torch.deg2rad(a) + tensor([[ 3.1416, -3.1416], + [ 6.2832, -6.2832], + [ 1.5708, -1.5708]]) + +""".format( + **common_args + ), +) + +add_docstr( + torch.heaviside, + r""" +heaviside(input, values, *, out=None) -> Tensor + +Computes the Heaviside step function for each element in :attr:`input`. +The Heaviside step function is defined as: + +.. math:: + \text{{heaviside}}(input, values) = \begin{cases} + 0, & \text{if input < 0}\\ + values, & \text{if input == 0}\\ + 1, & \text{if input > 0} + \end{cases} +""" + + r""" + +Args: + {input} + values (Tensor): The values to use where :attr:`input` is zero. + +Keyword arguments: + {out} + +Example:: + + >>> input = torch.tensor([-1.5, 0, 2.0]) + >>> values = torch.tensor([0.5]) + >>> torch.heaviside(input, values) + tensor([0.0000, 0.5000, 1.0000]) + >>> values = torch.tensor([1.2, -2.0, 3.5]) + >>> torch.heaviside(input, values) + tensor([0., -2., 1.]) + +""".format( + **common_args + ), +) + +add_docstr( + torch.rand, + """ +rand(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, \ +requires_grad=False, pin_memory=False) -> Tensor +""" + + r""" +Returns a tensor filled with random numbers from a uniform distribution +on the interval :math:`[0, 1)` + +The shape of the tensor is defined by the variable argument :attr:`size`. + +Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. 
+ +Keyword args: + {generator} + {out} + {dtype} + {layout} + {device} + {requires_grad} + {pin_memory} + +Example:: + + >>> torch.rand(4) + tensor([ 0.5204, 0.2503, 0.3525, 0.5673]) + >>> torch.rand(2, 3) + tensor([[ 0.8237, 0.5781, 0.6879], + [ 0.3816, 0.7249, 0.0998]]) +""".format( + **factory_common_args + ), +) + +add_docstr( + torch.rand_like, + r""" +rand_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor + +Returns a tensor with the same size as :attr:`input` that is filled with +random numbers from a uniform distribution on the interval :math:`[0, 1)`. +``torch.rand_like(input)`` is equivalent to +``torch.rand(input.size(), dtype=input.dtype, layout=input.layout, device=input.device)``. + +Args: + {input} + +Keyword args: + {dtype} + {layout} + {device} + {requires_grad} + {memory_format} + +""".format( + **factory_like_common_args + ), +) + +add_docstr( + torch.randint, + """ +randint(low=0, high, size, \\*, generator=None, out=None, \ +dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + +Returns a tensor filled with random integers generated uniformly +between :attr:`low` (inclusive) and :attr:`high` (exclusive). + +The shape of the tensor is defined by the variable argument :attr:`size`. + +.. note:: + With the global dtype default (``torch.float32``), this function returns + a tensor with dtype ``torch.int64``. + +Args: + low (int, optional): Lowest integer to be drawn from the distribution. Default: 0. + high (int): One above the highest integer to be drawn from the distribution. + size (tuple): a tuple defining the shape of the output tensor. + +Keyword args: + {generator} + {out} + dtype (`torch.dtype`, optional) - the desired data type of returned tensor. Default: if ``None``, + this function returns a tensor with dtype ``torch.int64``. + {layout} + {device} + {requires_grad} + +Example:: + + >>> torch.randint(3, 5, (3,)) + tensor([4, 3, 4]) + + + >>> torch.randint(10, (2, 2)) + tensor([[0, 2], + [5, 5]]) + + + >>> torch.randint(3, 10, (2, 2)) + tensor([[4, 5], + [6, 7]]) + + +""".format( + **factory_common_args + ), +) + +add_docstr( + torch.randint_like, + """ +randint_like(input, low=0, high, \\*, dtype=None, layout=torch.strided, device=None, requires_grad=False, \ +memory_format=torch.preserve_format) -> Tensor + +Returns a tensor with the same shape as Tensor :attr:`input` filled with +random integers generated uniformly between :attr:`low` (inclusive) and +:attr:`high` (exclusive). + +.. note: + With the global dtype default (``torch.float32``), this function returns + a tensor with dtype ``torch.int64``. + +Args: + {input} + low (int, optional): Lowest integer to be drawn from the distribution. Default: 0. + high (int): One above the highest integer to be drawn from the distribution. + +Keyword args: + {dtype} + {layout} + {device} + {requires_grad} + {memory_format} + +""".format( + **factory_like_common_args + ), +) + +add_docstr( + torch.randn, + """ +randn(*size, *, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, \ +pin_memory=False) -> Tensor +""" + + r""" + +Returns a tensor filled with random numbers from a normal distribution +with mean `0` and variance `1` (also called the standard normal +distribution). + +.. math:: + \text{{out}}_{{i}} \sim \mathcal{{N}}(0, 1) + +For complex dtypes, the tensor is i.i.d. sampled from a `complex normal distribution`_ with zero mean and +unit variance as + +.. 
math:: + \text{{out}}_{{i}} \sim \mathcal{{CN}}(0, 1) + +This is equivalent to separately sampling the real :math:`(\operatorname{{Re}})` and imaginary +:math:`(\operatorname{{Im}})` part of :math:`\text{{out}}_i` as + +.. math:: + \operatorname{{Re}}(\text{{out}}_{{i}}) \sim \mathcal{{N}}(0, \frac{{1}}{{2}}),\quad + \operatorname{{Im}}(\text{{out}}_{{i}}) \sim \mathcal{{N}}(0, \frac{{1}}{{2}}) + +The shape of the tensor is defined by the variable argument :attr:`size`. + + +Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + +Keyword args: + {generator} + {out} + {dtype} + {layout} + {device} + {requires_grad} + {pin_memory} + +Example:: + + >>> torch.randn(4) + tensor([-2.1436, 0.9966, 2.3426, -0.6366]) + >>> torch.randn(2, 3) + tensor([[ 1.5954, 2.8929, -1.0923], + [ 1.1719, -0.4709, -0.1996]]) + +.. _complex normal distribution: https://en.wikipedia.org/wiki/Complex_normal_distribution +""".format( + **factory_common_args + ), +) + +add_docstr( + torch.randn_like, + r""" +randn_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor + +Returns a tensor with the same size as :attr:`input` that is filled with +random numbers from a normal distribution with mean 0 and variance 1. Please refer to :func:`torch.randn` for the +sampling process of complex dtypes. ``torch.randn_like(input)`` is equivalent to +``torch.randn(input.size(), dtype=input.dtype, layout=input.layout, device=input.device)``. + +Args: + {input} + +Keyword args: + {dtype} + {layout} + {device} + {requires_grad} + {memory_format} + +""".format( + **factory_like_common_args + ), +) + +add_docstr( + torch.randperm, + """ +randperm(n, *, generator=None, out=None, dtype=torch.int64,layout=torch.strided, \ +device=None, requires_grad=False, pin_memory=False) -> Tensor +""" + + r""" +Returns a random permutation of integers from ``0`` to ``n - 1``. + +Args: + n (int): the upper bound (exclusive) + +Keyword args: + {generator} + {out} + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: ``torch.int64``. + {layout} + {device} + {requires_grad} + {pin_memory} + +Example:: + + >>> torch.randperm(4) + tensor([2, 1, 0, 3]) +""".format( + **factory_common_args + ), +) + +add_docstr( + torch.tensor, + r""" +tensor(data, *, dtype=None, device=None, requires_grad=False, pin_memory=False) -> Tensor + +Constructs a tensor with no autograd history (also known as a "leaf tensor", see :doc:`/notes/autograd`) by copying :attr:`data`. + +.. warning:: + + When working with tensors prefer using :func:`torch.Tensor.clone`, + :func:`torch.Tensor.detach`, and :func:`torch.Tensor.requires_grad_` for + readability. Letting `t` be a tensor, ``torch.tensor(t)`` is equivalent to + ``t.clone().detach()``, and ``torch.tensor(t, requires_grad=True)`` + is equivalent to ``t.clone().detach().requires_grad_(True)``. + +.. seealso:: + + :func:`torch.as_tensor` preserves autograd history and avoids copies where possible. + :func:`torch.from_numpy` creates a tensor that shares storage with a NumPy array. + +Args: + {data} + +Keyword args: + {dtype} + device (:class:`torch.device`, optional): the device of the constructed tensor. If None and data is a tensor + then the device of data is used. If None and data is not a tensor then + the result tensor is constructed on the current device. 
+ {requires_grad} + {pin_memory} + + +Example:: + + >>> torch.tensor([[0.1, 1.2], [2.2, 3.1], [4.9, 5.2]]) + tensor([[ 0.1000, 1.2000], + [ 2.2000, 3.1000], + [ 4.9000, 5.2000]]) + + >>> torch.tensor([0, 1]) # Type inference on data + tensor([ 0, 1]) + + >>> torch.tensor([[0.11111, 0.222222, 0.3333333]], + ... dtype=torch.float64, + ... device=torch.device('cuda:0')) # creates a double tensor on a CUDA device + tensor([[ 0.1111, 0.2222, 0.3333]], dtype=torch.float64, device='cuda:0') + + >>> torch.tensor(3.14159) # Create a zero-dimensional (scalar) tensor + tensor(3.1416) + + >>> torch.tensor([]) # Create an empty tensor (of size (0,)) + tensor([]) +""".format( + **factory_data_common_args + ), +) + +add_docstr( + torch.range, + r""" +range(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + +Returns a 1-D tensor of size :math:`\left\lfloor \frac{\text{end} - \text{start}}{\text{step}} \right\rfloor + 1` +with values from :attr:`start` to :attr:`end` with step :attr:`step`. Step is +the gap between two values in the tensor. + +.. math:: + \text{out}_{i+1} = \text{out}_i + \text{step}. +""" + + r""" +.. warning:: + This function is deprecated and will be removed in a future release because its behavior is inconsistent with + Python's range builtin. Instead, use :func:`torch.arange`, which produces values in [start, end). + +Args: + start (float): the starting value for the set of points. Default: ``0``. + end (float): the ending value for the set of points + step (float): the gap between each pair of adjacent points. Default: ``1``. + +Keyword args: + {out} + {dtype} If `dtype` is not given, infer the data type from the other input + arguments. If any of `start`, `end`, or `stop` are floating-point, the + `dtype` is inferred to be the default dtype, see + :meth:`~torch.get_default_dtype`. Otherwise, the `dtype` is inferred to + be `torch.int64`. + {layout} + {device} + {requires_grad} + +Example:: + + >>> torch.range(1, 4) + tensor([ 1., 2., 3., 4.]) + >>> torch.range(1, 4, 0.5) + tensor([ 1.0000, 1.5000, 2.0000, 2.5000, 3.0000, 3.5000, 4.0000]) +""".format( + **factory_common_args + ), +) + +add_docstr( + torch.arange, + r""" +arange(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + +Returns a 1-D tensor of size :math:`\left\lceil \frac{\text{end} - \text{start}}{\text{step}} \right\rceil` +with values from the interval ``[start, end)`` taken with common difference +:attr:`step` beginning from `start`. + +Note that non-integer :attr:`step` is subject to floating point rounding errors when +comparing against :attr:`end`; to avoid inconsistency, we advise subtracting a small epsilon from :attr:`end` +in such cases. + +.. math:: + \text{out}_{{i+1}} = \text{out}_{i} + \text{step} +""" + + r""" +Args: + start (Number): the starting value for the set of points. Default: ``0``. + end (Number): the ending value for the set of points + step (Number): the gap between each pair of adjacent points. Default: ``1``. + +Keyword args: + {out} + {dtype} If `dtype` is not given, infer the data type from the other input + arguments. If any of `start`, `end`, or `stop` are floating-point, the + `dtype` is inferred to be the default dtype, see + :meth:`~torch.get_default_dtype`. Otherwise, the `dtype` is inferred to + be `torch.int64`. 
+ {layout} + {device} + {requires_grad} + +Example:: + + >>> torch.arange(5) + tensor([ 0, 1, 2, 3, 4]) + >>> torch.arange(1, 4) + tensor([ 1, 2, 3]) + >>> torch.arange(1, 2.5, 0.5) + tensor([ 1.0000, 1.5000, 2.0000]) +""".format( + **factory_common_args + ), +) + +add_docstr( + torch.ravel, + r""" +ravel(input) -> Tensor + +Return a contiguous flattened tensor. A copy is made only if needed. + +Args: + {input} + +Example:: + + >>> t = torch.tensor([[[1, 2], + ... [3, 4]], + ... [[5, 6], + ... [7, 8]]]) + >>> torch.ravel(t) + tensor([1, 2, 3, 4, 5, 6, 7, 8]) +""".format( + **common_args + ), +) + +add_docstr( + torch.remainder, + r""" +remainder(input, other, *, out=None) -> Tensor + +Computes +`Python's modulus operation `_ +entrywise. The result has the same sign as the divisor :attr:`other` and its absolute value +is less than that of :attr:`other`. + +It may also be defined in terms of :func:`torch.div` as + +.. code:: python + + torch.remainder(a, b) == a - a.div(b, rounding_mode="floor") * b + +Supports :ref:`broadcasting to a common shape `, +:ref:`type promotion `, and integer and float inputs. + +.. note:: + Complex inputs are not supported. In some cases, it is not mathematically + possible to satisfy the definition of a modulo operation with complex numbers. + See :func:`torch.fmod` for how division by zero is handled. + +.. seealso:: + + :func:`torch.fmod` which implements C++'s `std::fmod `_. + This one is defined in terms of division rounding towards zero. + +Args: + input (Tensor or Scalar): the dividend + other (Tensor or Scalar): the divisor + +Keyword args: + {out} + +Example:: + + >>> torch.remainder(torch.tensor([-3., -2, -1, 1, 2, 3]), 2) + tensor([ 1., 0., 1., 1., 0., 1.]) + >>> torch.remainder(torch.tensor([1, 2, 3, 4, 5]), -1.5) + tensor([ -0.5000, -1.0000, 0.0000, -0.5000, -1.0000 ]) +""".format( + **common_args + ), +) + +add_docstr( + torch.renorm, + r""" +renorm(input, p, dim, maxnorm, *, out=None) -> Tensor + +Returns a tensor where each sub-tensor of :attr:`input` along dimension +:attr:`dim` is normalized such that the `p`-norm of the sub-tensor is lower +than the value :attr:`maxnorm` + +.. note:: If the norm of a row is lower than `maxnorm`, the row is unchanged + +Args: + {input} + p (float): the power for the norm computation + dim (int): the dimension to slice over to get the sub-tensors + maxnorm (float): the maximum norm to keep each sub-tensor under + +Keyword args: + {out} + +Example:: + + >>> x = torch.ones(3, 3) + >>> x[1].fill_(2) + tensor([ 2., 2., 2.]) + >>> x[2].fill_(3) + tensor([ 3., 3., 3.]) + >>> x + tensor([[ 1., 1., 1.], + [ 2., 2., 2.], + [ 3., 3., 3.]]) + >>> torch.renorm(x, 1, 0, 5) + tensor([[ 1.0000, 1.0000, 1.0000], + [ 1.6667, 1.6667, 1.6667], + [ 1.6667, 1.6667, 1.6667]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.reshape, + r""" +reshape(input, shape) -> Tensor + +Returns a tensor with the same data and number of elements as :attr:`input`, +but with the specified shape. When possible, the returned tensor will be a view +of :attr:`input`. Otherwise, it will be a copy. Contiguous inputs and inputs +with compatible strides can be reshaped without copying, but you should not +depend on the copying vs. viewing behavior. + +See :meth:`torch.Tensor.view` on when it is possible to return a view. + +A single dimension may be -1, in which case it's inferred from the remaining +dimensions and the number of elements in :attr:`input`. 
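+
+For instance, a ``-1`` entry is inferred from the element count (an illustrative
+sketch; the printed formatting may differ slightly)::
+
+    >>> torch.reshape(torch.arange(6.), (2, -1))
+    tensor([[0., 1., 2.],
+            [3., 4., 5.]])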
+
+Args:
+    input (Tensor): the tensor to be reshaped
+    shape (tuple of int): the new shape
+
+Example::
+
+    >>> a = torch.arange(4.)
+    >>> torch.reshape(a, (2, 2))
+    tensor([[ 0.,  1.],
+            [ 2.,  3.]])
+    >>> b = torch.tensor([[0, 1], [2, 3]])
+    >>> torch.reshape(b, (-1,))
+    tensor([ 0,  1,  2,  3])
+""",
+)
+
+
+add_docstr(
+    torch.result_type,
+    r"""
+result_type(tensor1, tensor2) -> dtype
+
+Returns the :class:`torch.dtype` that would result from performing an arithmetic
+operation on the provided input tensors. See type promotion :ref:`documentation `
+for more information on the type promotion logic.
+
+Args:
+    tensor1 (Tensor or Number): an input tensor or number
+    tensor2 (Tensor or Number): an input tensor or number
+
+Example::
+
+    >>> torch.result_type(torch.tensor([1, 2], dtype=torch.int), 1.0)
+    torch.float32
+    >>> torch.result_type(torch.tensor([1, 2], dtype=torch.uint8), torch.tensor(1))
+    torch.uint8
+""",
+)
+
+add_docstr(
+    torch.row_stack,
+    r"""
+row_stack(tensors, *, out=None) -> Tensor
+
+Alias of :func:`torch.vstack`.
+""",
+)
+
+add_docstr(
+    torch.round,
+    r"""
+round(input, *, decimals=0, out=None) -> Tensor
+
+Rounds elements of :attr:`input` to the nearest integer.
+
+For integer inputs, this follows the array API convention of returning a
+copy of the input tensor. The output has the same dtype as the input.
+
+.. note::
+    This function implements the "round half to even" rule to
+    break ties when a number is equidistant from two
+    integers (e.g. `round(2.5)` is 2).
+
+    When the :attr:`decimals` argument is specified, the
+    algorithm used is similar to NumPy's `around`. This
+    algorithm is fast but inexact and it can easily
+    overflow for low precision dtypes.
+    E.g. `round(tensor([10000], dtype=torch.float16), decimals=3)` is `inf`.
+
+.. seealso::
+    :func:`torch.ceil`, which rounds up.
+    :func:`torch.floor`, which rounds down.
+    :func:`torch.trunc`, which rounds towards zero.
+
+Args:
+    {input}
+    decimals (int): Number of decimal places to round to (default: 0).
+        If decimals is negative, it specifies the number of positions
+        to the left of the decimal point.
+
+Keyword args:
+    {out}
+
+Example::
+
+    >>> torch.round(torch.tensor((4.7, -2.3, 9.1, -7.7)))
+    tensor([ 5., -2.,  9., -8.])
+
+    >>> # Values equidistant from two integers are rounded towards the
+    >>> # nearest even value (zero is treated as even)
+    >>> torch.round(torch.tensor([-0.5, 0.5, 1.5, 2.5]))
+    tensor([-0., 0., 2., 2.])
+
+    >>> # A positive decimals argument rounds to that decimal place
+    >>> torch.round(torch.tensor([0.1234567]), decimals=3)
+    tensor([0.1230])
+
+    >>> # A negative decimals argument rounds to the left of the decimal
+    >>> torch.round(torch.tensor([1200.1234567]), decimals=-3)
+    tensor([1000.])
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr(
+    torch.rsqrt,
+    r"""
+rsqrt(input, *, out=None) -> Tensor
+
+Returns a new tensor with the reciprocal of the square-root of each of
+the elements of :attr:`input`.
+
+.. math::
+    \text{out}_{i} = \frac{1}{\sqrt{\text{input}_{i}}}
+"""
+    + r"""
+Args:
+    {input}
+
+Keyword args:
+    {out}
+
+Example::
+
+    >>> a = torch.randn(4)
+    >>> a
+    tensor([-0.0370,  0.2970,  1.5420, -0.9105])
+    >>> torch.rsqrt(a)
+    tensor([    nan,  1.8351,  0.8053,     nan])
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr(
+    torch.scatter,
+    r"""
+scatter(input, dim, index, src) -> Tensor
+
+Out-of-place version of :meth:`torch.Tensor.scatter_`
+""",
+)
+
+add_docstr(
+    torch.scatter_add,
+    r"""
+scatter_add(input, dim, index, src) -> Tensor
+
+Out-of-place version of :meth:`torch.Tensor.scatter_add_`
+""",
+)
+
+add_docstr(
+    torch.scatter_reduce,
+    r"""
+scatter_reduce(input, dim, index, src, reduce, *, include_self=True) -> Tensor
+
+Out-of-place version of :meth:`torch.Tensor.scatter_reduce_`
+""",
+)
+
+add_docstr(
+    torch.select,
+    r"""
+select(input, dim, index) -> Tensor
+
+Slices the :attr:`input` tensor along the selected dimension at the given index.
+This function returns a view of the original tensor with the given dimension removed.
+
+.. note:: If :attr:`input` is a sparse tensor and returning a view of
+          the tensor is not possible, a RuntimeError exception is
+          raised. If this is the case, consider using the
+          :func:`torch.select_copy` function.
+
+Args:
+    {input}
+    dim (int): the dimension to slice
+    index (int): the index to select with
+
+.. note::
+
+    :meth:`select` is equivalent to slicing. For example,
+    ``tensor.select(0, index)`` is equivalent to ``tensor[index]`` and
+    ``tensor.select(2, index)`` is equivalent to ``tensor[:,:,index]``.
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr(
+    torch.select_scatter,
+    r"""
+select_scatter(input, src, dim, index) -> Tensor
+
+Embeds the values of the :attr:`src` tensor into :attr:`input` at the given index.
+This function returns a tensor with fresh storage; it does not create a view.
+
+Args:
+    {input}
+    src (Tensor): The tensor to embed into :attr:`input`
+    dim (int): the dimension to insert the slice into.
+    index (int): the index to select with
+
+.. note::
+
+    :attr:`src` must be of the proper size in order to be embedded
+    into :attr:`input`. Specifically, it should have the same shape as
+    ``torch.select(input, dim, index)``
+
+Example::
+
+    >>> a = torch.zeros(2, 2)
+    >>> b = torch.ones(2)
+    >>> a.select_scatter(b, 0, 0)
+    tensor([[1., 1.],
+            [0., 0.]])
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr(
+    torch.slice_scatter,
+    r"""
+slice_scatter(input, src, dim=0, start=None, end=None, step=1) -> Tensor
+
+Embeds the values of the :attr:`src` tensor into :attr:`input` at the given
+dimension.
+This function returns a tensor with fresh storage; it does not create a view.
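+
+Conceptually (a sketch of the semantics, not the actual implementation), the result is
+the same as cloning :attr:`input` and assigning :attr:`src` into the selected slice::
+
+    >>> a = torch.zeros(4, 4)
+    >>> b = torch.ones(4, 2)
+    >>> out = torch.slice_scatter(a, b, dim=1, start=0, end=4, step=2)
+    >>> ref = a.clone()
+    >>> ref[:, 0:4:2] = b
+    >>> torch.equal(out, ref)
+    True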
+
+Args:
+    {input}
+    src (Tensor): The tensor to embed into :attr:`input`
+    dim (int): the dimension to insert the slice into
+    start (Optional[int]): the start index of where to insert the slice
+    end (Optional[int]): the end index of where to insert the slice
+    step (int): the step between the slice indices that are written to
+
+Example::
+
+    >>> a = torch.zeros(8, 8)
+    >>> b = torch.ones(2, 8)
+    >>> a.slice_scatter(b, start=6)
+    tensor([[0., 0., 0., 0., 0., 0., 0., 0.],
+            [0., 0., 0., 0., 0., 0., 0., 0.],
+            [0., 0., 0., 0., 0., 0., 0., 0.],
+            [0., 0., 0., 0., 0., 0., 0., 0.],
+            [0., 0., 0., 0., 0., 0., 0., 0.],
+            [0., 0., 0., 0., 0., 0., 0., 0.],
+            [1., 1., 1., 1., 1., 1., 1., 1.],
+            [1., 1., 1., 1., 1., 1., 1., 1.]])
+
+    >>> b = torch.ones(8, 2)
+    >>> a.slice_scatter(b, dim=1, start=2, end=6, step=2)
+    tensor([[0., 0., 1., 0., 1., 0., 0., 0.],
+            [0., 0., 1., 0., 1., 0., 0., 0.],
+            [0., 0., 1., 0., 1., 0., 0., 0.],
+            [0., 0., 1., 0., 1., 0., 0., 0.],
+            [0., 0., 1., 0., 1., 0., 0., 0.],
+            [0., 0., 1., 0., 1., 0., 0., 0.],
+            [0., 0., 1., 0., 1., 0., 0., 0.],
+            [0., 0., 1., 0., 1., 0., 0., 0.]])
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr(
+    torch.set_flush_denormal,
+    r"""
+set_flush_denormal(mode) -> bool
+
+Disables denormal floating numbers on CPU.
+
+Returns ``True`` if your system supports flushing denormal numbers and it
+successfully configures flush denormal mode. :meth:`~torch.set_flush_denormal`
+is supported on x86 architectures supporting SSE3 and on the AArch64 architecture.
+
+Args:
+    mode (bool): Controls whether to enable flush denormal mode or not
+
+Example::
+
+    >>> torch.set_flush_denormal(True)
+    True
+    >>> torch.tensor([1e-323], dtype=torch.float64)
+    tensor([ 0.], dtype=torch.float64)
+    >>> torch.set_flush_denormal(False)
+    True
+    >>> torch.tensor([1e-323], dtype=torch.float64)
+    tensor(9.88131e-324 *
+           [ 1.0000], dtype=torch.float64)
+""",
+)
+
+add_docstr(
+    torch.set_num_threads,
+    r"""
+set_num_threads(int)
+
+Sets the number of threads used for intraop parallelism on CPU.
+
+.. warning::
+    To ensure that the correct number of threads is used, set_num_threads
+    must be called before running eager, JIT or autograd code.
+""",
+)
+
+add_docstr(
+    torch.set_num_interop_threads,
+    r"""
+set_num_interop_threads(int)
+
+Sets the number of threads used for interop parallelism
+(e.g. in JIT interpreter) on CPU.
+
+.. warning::
+    Can only be called once and before any inter-op parallel work
+    is started (e.g. JIT execution).
+""",
+)
+
+add_docstr(
+    torch.sigmoid,
+    r"""
+sigmoid(input, *, out=None) -> Tensor
+
+Alias for :func:`torch.special.expit`.
+""",
+)
+
+add_docstr(
+    torch.logit,
+    r"""
+logit(input, eps=None, *, out=None) -> Tensor
+
+Alias for :func:`torch.special.logit`.
+""",
+)
+
+add_docstr(
+    torch.sign,
+    r"""
+sign(input, *, out=None) -> Tensor
+
+Returns a new tensor with the signs of the elements of :attr:`input`.
+
+.. math::
+    \text{out}_{i} = \operatorname{sgn}(\text{input}_{i})
+"""
+    + r"""
+Args:
+    {input}
+
+Keyword args:
+    {out}
+
+Example::
+
+    >>> a = torch.tensor([0.7, -1.2, 0., 2.3])
+    >>> a
+    tensor([ 0.7000, -1.2000,  0.0000,  2.3000])
+    >>> torch.sign(a)
+    tensor([ 1., -1.,  0.,  1.])
+""".format(
+        **common_args
+    ),
+)
+
+add_docstr(
+    torch.signbit,
+    r"""
+signbit(input, *, out=None) -> Tensor
+
+Tests if each element of :attr:`input` has its sign bit set or not.
+ +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> a = torch.tensor([0.7, -1.2, 0., 2.3]) + >>> torch.signbit(a) + tensor([ False, True, False, False]) + >>> a = torch.tensor([-0.0, 0.0]) + >>> torch.signbit(a) + tensor([ True, False]) + +.. note:: + signbit handles signed zeros, so negative zero (-0) returns True. + +""".format( + **common_args + ), +) + +add_docstr( + torch.sgn, + r""" +sgn(input, *, out=None) -> Tensor + +This function is an extension of torch.sign() to complex tensors. +It computes a new tensor whose elements have +the same angles as the corresponding elements of :attr:`input` and +absolute values (i.e. magnitudes) of one for complex tensors and +is equivalent to torch.sign() for non-complex tensors. + +.. math:: + \text{out}_{i} = \begin{cases} + 0 & |\text{{input}}_i| == 0 \\ + \frac{{\text{{input}}_i}}{|{\text{{input}}_i}|} & \text{otherwise} + \end{cases} + +""" + + r""" +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> t = torch.tensor([3+4j, 7-24j, 0, 1+2j]) + >>> t.sgn() + tensor([0.6000+0.8000j, 0.2800-0.9600j, 0.0000+0.0000j, 0.4472+0.8944j]) +""".format( + **common_args + ), +) + +add_docstr( + torch.sin, + r""" +sin(input, *, out=None) -> Tensor + +Returns a new tensor with the sine of the elements of :attr:`input`. + +.. math:: + \text{out}_{i} = \sin(\text{input}_{i}) +""" + + r""" +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-0.5461, 0.1347, -2.7266, -0.2746]) + >>> torch.sin(a) + tensor([-0.5194, 0.1343, -0.4032, -0.2711]) +""".format( + **common_args + ), +) + +add_docstr( + torch.sinc, + r""" +sinc(input, *, out=None) -> Tensor + +Alias for :func:`torch.special.sinc`. +""", +) + +add_docstr( + torch.sinh, + r""" +sinh(input, *, out=None) -> Tensor + +Returns a new tensor with the hyperbolic sine of the elements of +:attr:`input`. + +.. math:: + \text{out}_{i} = \sinh(\text{input}_{i}) +""" + + r""" +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.5380, -0.8632, -0.1265, 0.9399]) + >>> torch.sinh(a) + tensor([ 0.5644, -0.9744, -0.1268, 1.0845]) + +.. note:: + When :attr:`input` is on the CPU, the implementation of torch.sinh may use + the Sleef library, which rounds very large results to infinity or negative + infinity. See `here `_ for details. +""".format( + **common_args + ), +) + +add_docstr( + torch.sort, + r""" +sort(input, dim=-1, descending=False, stable=False, *, out=None) -> (Tensor, LongTensor) + +Sorts the elements of the :attr:`input` tensor along a given dimension +in ascending order by value. + +If :attr:`dim` is not given, the last dimension of the `input` is chosen. + +If :attr:`descending` is ``True`` then the elements are sorted in descending +order by value. + +If :attr:`stable` is ``True`` then the sorting routine becomes stable, preserving +the order of equivalent elements. + +A namedtuple of (values, indices) is returned, where the `values` are the +sorted values and `indices` are the indices of the elements in the original +`input` tensor. + +Args: + {input} + dim (int, optional): the dimension to sort along + descending (bool, optional): controls the sorting order (ascending or descending) + stable (bool, optional): makes the sorting routine stable, which guarantees that the order + of equivalent elements is preserved. 
+ +Keyword args: + out (tuple, optional): the output tuple of (`Tensor`, `LongTensor`) that can + be optionally given to be used as output buffers + +Example:: + + >>> x = torch.randn(3, 4) + >>> sorted, indices = torch.sort(x) + >>> sorted + tensor([[-0.2162, 0.0608, 0.6719, 2.3332], + [-0.5793, 0.0061, 0.6058, 0.9497], + [-0.5071, 0.3343, 0.9553, 1.0960]]) + >>> indices + tensor([[ 1, 0, 2, 3], + [ 3, 1, 0, 2], + [ 0, 3, 1, 2]]) + + >>> sorted, indices = torch.sort(x, 0) + >>> sorted + tensor([[-0.5071, -0.2162, 0.6719, -0.5793], + [ 0.0608, 0.0061, 0.9497, 0.3343], + [ 0.6058, 0.9553, 1.0960, 2.3332]]) + >>> indices + tensor([[ 2, 0, 0, 1], + [ 0, 1, 1, 2], + [ 1, 2, 2, 0]]) + >>> x = torch.tensor([0, 1] * 9) + >>> x.sort() + torch.return_types.sort( + values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), + indices=tensor([ 2, 16, 4, 6, 14, 8, 0, 10, 12, 9, 17, 15, 13, 11, 7, 5, 3, 1])) + >>> x.sort(stable=True) + torch.return_types.sort( + values=tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), + indices=tensor([ 0, 2, 4, 6, 8, 10, 12, 14, 16, 1, 3, 5, 7, 9, 11, 13, 15, 17])) +""".format( + **common_args + ), +) + +add_docstr( + torch.argsort, + r""" +argsort(input, dim=-1, descending=False, stable=False) -> Tensor + +Returns the indices that sort a tensor along a given dimension in ascending +order by value. + +This is the second value returned by :meth:`torch.sort`. See its documentation +for the exact semantics of this method. + +If :attr:`stable` is ``True`` then the sorting routine becomes stable, preserving +the order of equivalent elements. If ``False``, the relative order of values +which compare equal is not guaranteed. ``True`` is slower. + +Args: + {input} + dim (int, optional): the dimension to sort along + descending (bool, optional): controls the sorting order (ascending or descending) + stable (bool, optional): controls the relative order of equivalent elements + +Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.0785, 1.5267, -0.8521, 0.4065], + [ 0.1598, 0.0788, -0.0745, -1.2700], + [ 1.2208, 1.0722, -0.7064, 1.2564], + [ 0.0669, -0.2318, -0.8229, -0.9280]]) + + + >>> torch.argsort(a, dim=1) + tensor([[2, 0, 3, 1], + [3, 2, 1, 0], + [2, 1, 0, 3], + [3, 2, 1, 0]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.msort, + r""" +msort(input, *, out=None) -> Tensor + +Sorts the elements of the :attr:`input` tensor along its first dimension +in ascending order by value. + +.. note:: `torch.msort(t)` is equivalent to `torch.sort(t, dim=0)[0]`. + See also :func:`torch.sort`. + +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> t = torch.randn(3, 4) + >>> t + tensor([[-0.1321, 0.4370, -1.2631, -1.1289], + [-2.0527, -1.1250, 0.2275, 0.3077], + [-0.0881, -0.1259, -0.5495, 1.0284]]) + >>> torch.msort(t) + tensor([[-2.0527, -1.1250, -1.2631, -1.1289], + [-0.1321, -0.1259, -0.5495, 0.3077], + [-0.0881, 0.4370, 0.2275, 1.0284]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.sparse_compressed_tensor, + r"""sparse_compressed_tensor(compressed_indices, plain_indices, values, size=None, """ + r"""*, dtype=None, layout=None, device=None, requires_grad=False, check_invariants=None) -> Tensor + +Constructs a :ref:`sparse tensor in Compressed Sparse format - CSR, +CSC, BSR, or BSC - ` with specified values at +the given :attr:`compressed_indices` and :attr:`plain_indices`. Sparse +matrix multiplication operations in Compressed Sparse format are +typically faster than that for sparse tensors in COO format. 
Make you +have a look at :ref:`the note on the data type of the indices +`. + +{sparse_factory_device_note} + +Args: + compressed_indices (array_like): (B+1)-dimensional array of size + ``(*batchsize, compressed_dim_size + 1)``. The last element of + each batch is the number of non-zero elements or blocks. This + tensor encodes the index in ``values`` and ``plain_indices`` + depending on where the given compressed dimension (row or + column) starts. Each successive number in the tensor + subtracted by the number before it denotes the number of + elements or blocks in a given compressed dimension. + plain_indices (array_like): Plain dimension (column or row) + co-ordinates of each element or block in values. (B+1)-dimensional + tensor with the same length as values. + + values (array_list): Initial values for the tensor. Can be a list, + tuple, NumPy ``ndarray``, scalar, and other types. that + represents a (1+K)-dimensional (for CSR and CSC layouts) or + (1+2+K)-dimensional tensor (for BSR and BSC layouts) where + ``K`` is the number of dense dimensions. + size (list, tuple, :class:`torch.Size`, optional): Size of the + sparse tensor: ``(*batchsize, nrows * blocksize[0], ncols * + blocksize[1], *densesize)`` where ``blocksize[0] == + blocksize[1] == 1`` for CSR and CSC formats. If not provided, + the size will be inferred as the minimum size big enough to + hold all non-zero elements or blocks. + +Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of + returned tensor. Default: if None, infers data type from + :attr:`values`. + layout (:class:`torch.layout`, required): the desired layout of + returned tensor: :attr:`torch.sparse_csr`, + :attr:`torch.sparse_csc`, :attr:`torch.sparse_bsr`, or + :attr:`torch.sparse_bsc`. + device (:class:`torch.device`, optional): the desired device of + returned tensor. Default: if None, uses the current device + for the default tensor type (see + :func:`torch.set_default_device`). :attr:`device` will be + the CPU for CPU tensor types and the current CUDA device for + CUDA tensor types. + {requires_grad} + {check_invariants} + +Example:: + >>> compressed_indices = [0, 2, 4] + >>> plain_indices = [0, 1, 0, 1] + >>> values = [1, 2, 3, 4] + >>> torch.sparse_compressed_tensor(torch.tensor(compressed_indices, dtype=torch.int64), + ... torch.tensor(plain_indices, dtype=torch.int64), + ... torch.tensor(values), dtype=torch.double, layout=torch.sparse_csr) + tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csr) +""".format( + **factory_common_args + ), +) + +add_docstr( + torch.sparse_csr_tensor, + r"""sparse_csr_tensor(crow_indices, col_indices, values, size=None, """ + r"""*, dtype=None, device=None, requires_grad=False, check_invariants=None) -> Tensor + +Constructs a :ref:`sparse tensor in CSR (Compressed Sparse Row) ` with specified +values at the given :attr:`crow_indices` and :attr:`col_indices`. Sparse matrix multiplication operations +in CSR format are typically faster than that for sparse tensors in COO format. Make you have a look +at :ref:`the note on the data type of the indices `. + +{sparse_factory_device_note} + +Args: + crow_indices (array_like): (B+1)-dimensional array of size + ``(*batchsize, nrows + 1)``. The last element of each batch + is the number of non-zeros. This tensor encodes the index in + values and col_indices depending on where the given row + starts. 
Each successive number in the tensor subtracted by the + number before it denotes the number of elements in a given + row. + col_indices (array_like): Column co-ordinates of each element in + values. (B+1)-dimensional tensor with the same length + as values. + values (array_list): Initial values for the tensor. Can be a list, + tuple, NumPy ``ndarray``, scalar, and other types that + represents a (1+K)-dimensional tensor where ``K`` is the number + of dense dimensions. + size (list, tuple, :class:`torch.Size`, optional): Size of the + sparse tensor: ``(*batchsize, nrows, ncols, *densesize)``. If + not provided, the size will be inferred as the minimum size + big enough to hold all non-zero elements. + +Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of + returned tensor. Default: if None, infers data type from + :attr:`values`. + device (:class:`torch.device`, optional): the desired device of + returned tensor. Default: if None, uses the current device + for the default tensor type (see + :func:`torch.set_default_device`). :attr:`device` will be + the CPU for CPU tensor types and the current CUDA device for + CUDA tensor types. + {requires_grad} + {check_invariants} + +Example:: + >>> crow_indices = [0, 2, 4] + >>> col_indices = [0, 1, 0, 1] + >>> values = [1, 2, 3, 4] + >>> torch.sparse_csr_tensor(torch.tensor(crow_indices, dtype=torch.int64), + ... torch.tensor(col_indices, dtype=torch.int64), + ... torch.tensor(values), dtype=torch.double) + tensor(crow_indices=tensor([0, 2, 4]), + col_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csr) +""".format( + **factory_common_args + ), +) + +add_docstr( + torch.sparse_csc_tensor, + r"""sparse_csc_tensor(ccol_indices, row_indices, values, size=None, """ + r"""*, dtype=None, device=None, requires_grad=False, check_invariants=None) -> Tensor + +Constructs a :ref:`sparse tensor in CSC (Compressed Sparse Column) +` with specified values at the given +:attr:`ccol_indices` and :attr:`row_indices`. Sparse matrix +multiplication operations in CSC format are typically faster than that +for sparse tensors in COO format. Make you have a look at :ref:`the +note on the data type of the indices `. + +{sparse_factory_device_note} + +Args: + ccol_indices (array_like): (B+1)-dimensional array of size + ``(*batchsize, ncols + 1)``. The last element of each batch + is the number of non-zeros. This tensor encodes the index in + values and row_indices depending on where the given column + starts. Each successive number in the tensor subtracted by the + number before it denotes the number of elements in a given + column. + row_indices (array_like): Row co-ordinates of each element in + values. (B+1)-dimensional tensor with the same length as + values. + values (array_list): Initial values for the tensor. Can be a list, + tuple, NumPy ``ndarray``, scalar, and other types that + represents a (1+K)-dimensional tensor where ``K`` is the number + of dense dimensions. + size (list, tuple, :class:`torch.Size`, optional): Size of the + sparse tensor: ``(*batchsize, nrows, ncols, *densesize)``. If + not provided, the size will be inferred as the minimum size + big enough to hold all non-zero elements. + +Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of + returned tensor. Default: if None, infers data type from + :attr:`values`. + device (:class:`torch.device`, optional): the desired device of + returned tensor. 
Default: if None, uses the current device + for the default tensor type (see + :func:`torch.set_default_device`). :attr:`device` will be + the CPU for CPU tensor types and the current CUDA device for + CUDA tensor types. + {requires_grad} + {check_invariants} + +Example:: + >>> ccol_indices = [0, 2, 4] + >>> row_indices = [0, 1, 0, 1] + >>> values = [1, 2, 3, 4] + >>> torch.sparse_csc_tensor(torch.tensor(ccol_indices, dtype=torch.int64), + ... torch.tensor(row_indices, dtype=torch.int64), + ... torch.tensor(values), dtype=torch.double) + tensor(ccol_indices=tensor([0, 2, 4]), + row_indices=tensor([0, 1, 0, 1]), + values=tensor([1., 2., 3., 4.]), size=(2, 2), nnz=4, + dtype=torch.float64, layout=torch.sparse_csc) +""".format( + **factory_common_args + ), +) + +add_docstr( + torch.sparse_bsr_tensor, + r"""sparse_bsr_tensor(crow_indices, col_indices, values, size=None, """ + r"""*, dtype=None, device=None, requires_grad=False, check_invariants=None) -> Tensor + +Constructs a :ref:`sparse tensor in BSR (Block Compressed Sparse Row)) +` with specified 2-dimensional blocks at the given +:attr:`crow_indices` and :attr:`col_indices`. Sparse matrix +multiplication operations in BSR format are typically faster than that +for sparse tensors in COO format. Make you have a look at :ref:`the +note on the data type of the indices `. + +{sparse_factory_device_note} + +Args: + crow_indices (array_like): (B+1)-dimensional array of size + ``(*batchsize, nrowblocks + 1)``. The last element of each + batch is the number of non-zeros. This tensor encodes the + block index in values and col_indices depending on where the + given row block starts. Each successive number in the tensor + subtracted by the number before it denotes the number of + blocks in a given row. + col_indices (array_like): Column block co-ordinates of each block + in values. (B+1)-dimensional tensor with the same length as + values. + values (array_list): Initial values for the tensor. Can be a list, + tuple, NumPy ``ndarray``, scalar, and other types that + represents a (1 + 2 + K)-dimensional tensor where ``K`` is the + number of dense dimensions. + size (list, tuple, :class:`torch.Size`, optional): Size of the + sparse tensor: ``(*batchsize, nrows * blocksize[0], ncols * + blocksize[1], *densesize)`` where ``blocksize == + values.shape[1:3]``. If not provided, the size will be + inferred as the minimum size big enough to hold all non-zero + blocks. + +Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of + returned tensor. Default: if None, infers data type from + :attr:`values`. + device (:class:`torch.device`, optional): the desired device of + returned tensor. Default: if None, uses the current device + for the default tensor type (see + :func:`torch.set_default_device`). :attr:`device` will be + the CPU for CPU tensor types and the current CUDA device for + CUDA tensor types. + {requires_grad} + {check_invariants} + +Example:: + >>> crow_indices = [0, 1, 2] + >>> col_indices = [0, 1] + >>> values = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]] + >>> torch.sparse_bsr_tensor(torch.tensor(crow_indices, dtype=torch.int64), + ... torch.tensor(col_indices, dtype=torch.int64), + ... 
torch.tensor(values), dtype=torch.double) + tensor(crow_indices=tensor([0, 1, 2]), + col_indices=tensor([0, 1]), + values=tensor([[[1., 2.], + [3., 4.]], + [[5., 6.], + [7., 8.]]]), size=(2, 2), nnz=2, dtype=torch.float64, + layout=torch.sparse_bsr) +""".format( + **factory_common_args + ), +) + +add_docstr( + torch.sparse_bsc_tensor, + r"""sparse_bsc_tensor(ccol_indices, row_indices, values, size=None, """ + r"""*, dtype=None, device=None, requires_grad=False, check_invariants=None) -> Tensor + +Constructs a :ref:`sparse tensor in BSC (Block Compressed Sparse +Column)) ` with specified 2-dimensional blocks at the +given :attr:`ccol_indices` and :attr:`row_indices`. Sparse matrix +multiplication operations in BSC format are typically faster than that +for sparse tensors in COO format. Make you have a look at :ref:`the +note on the data type of the indices `. + +{sparse_factory_device_note} + +Args: + ccol_indices (array_like): (B+1)-dimensional array of size + ``(*batchsize, ncolblocks + 1)``. The last element of each + batch is the number of non-zeros. This tensor encodes the + index in values and row_indices depending on where the given + column starts. Each successive number in the tensor subtracted + by the number before it denotes the number of elements in a + given column. + row_indices (array_like): Row block co-ordinates of each block in + values. (B+1)-dimensional tensor with the same length + as values. + values (array_list): Initial blocks for the tensor. Can be a list, + tuple, NumPy ``ndarray``, and other types that + represents a (1 + 2 + K)-dimensional tensor where ``K`` is the + number of dense dimensions. + size (list, tuple, :class:`torch.Size`, optional): Size of the + sparse tensor: ``(*batchsize, nrows * blocksize[0], ncols * + blocksize[1], *densesize)`` If not provided, the size will be + inferred as the minimum size big enough to hold all non-zero + blocks. + +Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of + returned tensor. Default: if None, infers data type from + :attr:`values`. + device (:class:`torch.device`, optional): the desired device of + returned tensor. Default: if None, uses the current device + for the default tensor type (see + :func:`torch.set_default_device`). :attr:`device` will be + the CPU for CPU tensor types and the current CUDA device for + CUDA tensor types. + {requires_grad} + {check_invariants} + +Example:: + >>> ccol_indices = [0, 1, 2] + >>> row_indices = [0, 1] + >>> values = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]] + >>> torch.sparse_bsc_tensor(torch.tensor(ccol_indices, dtype=torch.int64), + ... torch.tensor(row_indices, dtype=torch.int64), + ... torch.tensor(values), dtype=torch.double) + tensor(ccol_indices=tensor([0, 1, 2]), + row_indices=tensor([0, 1]), + values=tensor([[[1., 2.], + [3., 4.]], + [[5., 6.], + [7., 8.]]]), size=(2, 2), nnz=2, dtype=torch.float64, + layout=torch.sparse_bsc) +""".format( + **factory_common_args + ), +) + +add_docstr( + torch.sparse_coo_tensor, + r"""sparse_coo_tensor(indices, values, size=None, """ + r"""*, dtype=None, device=None, requires_grad=False, check_invariants=None, is_coalesced=None) -> Tensor + +Constructs a :ref:`sparse tensor in COO(rdinate) format +` with specified values at the given +:attr:`indices`. + +.. note:: + + This function returns an :ref:`uncoalesced tensor + ` when :attr:`is_coalesced` is + unspecified or ``None``. + +{sparse_factory_device_note} + +Args: + indices (array_like): Initial data for the tensor. 
Can be a list, tuple, + NumPy ``ndarray``, scalar, and other types. Will be cast to a :class:`torch.LongTensor` + internally. The indices are the coordinates of the non-zero values in the matrix, and thus + should be two-dimensional where the first dimension is the number of tensor dimensions and + the second dimension is the number of non-zero values. + values (array_like): Initial values for the tensor. Can be a list, tuple, + NumPy ``ndarray``, scalar, and other types. + size (list, tuple, or :class:`torch.Size`, optional): Size of the sparse tensor. If not + provided the size will be inferred as the minimum size big enough to hold all non-zero + elements. + +Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if None, infers data type from :attr:`values`. + device (:class:`torch.device`, optional): the desired device of returned tensor. + Default: if None, uses the current device for the default tensor type + (see :func:`torch.set_default_device`). :attr:`device` will be the CPU + for CPU tensor types and the current CUDA device for CUDA tensor types. + {requires_grad} + {check_invariants} + is_coalesced (bool, optional): When``True``, the caller is + responsible for providing tensor indices that correspond to a + coalesced tensor. If the :attr:`check_invariants` flag is + False, no error will be raised if the prerequisites are not + met and this will lead to silently incorrect results. To force + coalescion please use :meth:`coalesce` on the resulting + Tensor. + Default: None: except for trivial cases (e.g. nnz < 2) the + resulting Tensor has is_coalesced set to ``False```. + +Example:: + + >>> i = torch.tensor([[0, 1, 1], + ... [2, 0, 2]]) + >>> v = torch.tensor([3, 4, 5], dtype=torch.float32) + >>> torch.sparse_coo_tensor(i, v, [2, 4]) + tensor(indices=tensor([[0, 1, 1], + [2, 0, 2]]), + values=tensor([3., 4., 5.]), + size=(2, 4), nnz=3, layout=torch.sparse_coo) + + >>> torch.sparse_coo_tensor(i, v) # Shape inference + tensor(indices=tensor([[0, 1, 1], + [2, 0, 2]]), + values=tensor([3., 4., 5.]), + size=(2, 3), nnz=3, layout=torch.sparse_coo) + + >>> torch.sparse_coo_tensor(i, v, [2, 4], + ... dtype=torch.float64, + ... device=torch.device('cuda:0')) + tensor(indices=tensor([[0, 1, 1], + [2, 0, 2]]), + values=tensor([3., 4., 5.]), + device='cuda:0', size=(2, 4), nnz=3, dtype=torch.float64, + layout=torch.sparse_coo) + + # Create an empty sparse tensor with the following invariants: + # 1. sparse_dim + dense_dim = len(SparseTensor.shape) + # 2. SparseTensor._indices().shape = (sparse_dim, nnz) + # 3. SparseTensor._values().shape = (nnz, SparseTensor.shape[sparse_dim:]) + # + # For instance, to create an empty sparse tensor with nnz = 0, dense_dim = 0 and + # sparse_dim = 1 (hence indices is a 2D tensor of shape = (1, 0)) + >>> S = torch.sparse_coo_tensor(torch.empty([1, 0]), [], [1]) + tensor(indices=tensor([], size=(1, 0)), + values=tensor([], size=(0,)), + size=(1,), nnz=0, layout=torch.sparse_coo) + + # and to create an empty sparse tensor with nnz = 0, dense_dim = 1 and + # sparse_dim = 1 + >>> S = torch.sparse_coo_tensor(torch.empty([1, 0]), torch.empty([0, 2]), [1, 2]) + tensor(indices=tensor([], size=(1, 0)), + values=tensor([], size=(0, 2)), + size=(1, 2), nnz=0, layout=torch.sparse_coo) + +.. 
_torch.sparse: https://pytorch.org/docs/stable/sparse.html +""".format( + **factory_common_args + ), +) + +add_docstr( + torch.sqrt, + r""" +sqrt(input, *, out=None) -> Tensor + +Returns a new tensor with the square-root of the elements of :attr:`input`. + +.. math:: + \text{out}_{i} = \sqrt{\text{input}_{i}} +""" + + r""" +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-2.0755, 1.0226, 0.0831, 0.4806]) + >>> torch.sqrt(a) + tensor([ nan, 1.0112, 0.2883, 0.6933]) +""".format( + **common_args + ), +) + +add_docstr( + torch.square, + r""" +square(input, *, out=None) -> Tensor + +Returns a new tensor with the square of the elements of :attr:`input`. + +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-2.0755, 1.0226, 0.0831, 0.4806]) + >>> torch.square(a) + tensor([ 4.3077, 1.0457, 0.0069, 0.2310]) +""".format( + **common_args + ), +) + +add_docstr( + torch.squeeze, + r""" +squeeze(input, dim=None) -> Tensor + +Returns a tensor with all specified dimensions of :attr:`input` of size `1` removed. + +For example, if `input` is of shape: +:math:`(A \times 1 \times B \times C \times 1 \times D)` then the `input.squeeze()` +will be of shape: :math:`(A \times B \times C \times D)`. + +When :attr:`dim` is given, a squeeze operation is done only in the given +dimension(s). If `input` is of shape: :math:`(A \times 1 \times B)`, +``squeeze(input, 0)`` leaves the tensor unchanged, but ``squeeze(input, 1)`` +will squeeze the tensor to the shape :math:`(A \times B)`. + +.. note:: The returned tensor shares the storage with the input tensor, + so changing the contents of one will change the contents of the other. + +.. warning:: If the tensor has a batch dimension of size 1, then `squeeze(input)` + will also remove the batch dimension, which can lead to unexpected + errors. Consider specifying only the dims you wish to be squeezed. + +Args: + {input} + dim (int or tuple of ints, optional): if given, the input will be squeezed + only in the specified dimensions. + + .. versionchanged:: 2.0 + :attr:`dim` now accepts tuples of dimensions. + +Example:: + + >>> x = torch.zeros(2, 1, 2, 1, 2) + >>> x.size() + torch.Size([2, 1, 2, 1, 2]) + >>> y = torch.squeeze(x) + >>> y.size() + torch.Size([2, 2, 2]) + >>> y = torch.squeeze(x, 0) + >>> y.size() + torch.Size([2, 1, 2, 1, 2]) + >>> y = torch.squeeze(x, 1) + >>> y.size() + torch.Size([2, 2, 1, 2]) + >>> y = torch.squeeze(x, (1, 2, 3)) + torch.Size([2, 2, 2]) +""".format( + **common_args + ), +) + +add_docstr( + torch.std, + r""" +std(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor + +Calculates the standard deviation over the dimensions specified by :attr:`dim`. +:attr:`dim` can be a single dimension, list of dimensions, or ``None`` to +reduce over all dimensions. + +The standard deviation (:math:`\sigma`) is calculated as + +.. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2} + +where :math:`x` is the sample set of elements, :math:`\bar{x}` is the +sample mean, :math:`N` is the number of samples and :math:`\delta N` is +the :attr:`correction`. +""" + + r""" + +{keepdim_details} + +Args: + {input} + {dim} + +Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. 
versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + {keepdim} + {out} + +Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.std(a, dim=1, keepdim=True) + tensor([[1.0311], + [0.7477], + [1.2204], + [0.9087]]) + +.. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + +""".format( + **multi_dim_common + ), +) + +add_docstr( + torch.std_mean, + r""" +std_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor) + +Calculates the standard deviation and mean over the dimensions specified by +:attr:`dim`. :attr:`dim` can be a single dimension, list of dimensions, or +``None`` to reduce over all dimensions. + +The standard deviation (:math:`\sigma`) is calculated as + +.. math:: \sigma = \sqrt{\frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2} + +where :math:`x` is the sample set of elements, :math:`\bar{x}` is the +sample mean, :math:`N` is the number of samples and :math:`\delta N` is +the :attr:`correction`. + +""" + + r""" + +{keepdim_details} + +Args: + {input} + {opt_dim} + +Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + {keepdim} + {out} + +Returns: + A tuple (std, mean) containing the standard deviation and mean. + +Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.std_mean(a, dim=0, keepdim=True) + (tensor([[1.2620, 1.0028, 1.0957, 0.6038]]), + tensor([[ 0.0645, 0.4485, 0.8707, -0.0665]])) + +.. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + +""".format( + **multi_dim_common + ), +) + +add_docstr( + torch.sub, + r""" +sub(input, other, *, alpha=1, out=None) -> Tensor + +Subtracts :attr:`other`, scaled by :attr:`alpha`, from :attr:`input`. + +.. math:: + \text{{out}}_i = \text{{input}}_i - \text{{alpha}} \times \text{{other}}_i +""" + + r""" + +Supports :ref:`broadcasting to a common shape `, +:ref:`type promotion `, and integer, float, and complex inputs. + +Args: + {input} + other (Tensor or Number): the tensor or number to subtract from :attr:`input`. + +Keyword args: + alpha (Number): the multiplier for :attr:`other`. + {out} + +Example:: + + >>> a = torch.tensor((1, 2)) + >>> b = torch.tensor((0, 1)) + >>> torch.sub(a, b, alpha=2) + tensor([1, 0]) +""".format( + **common_args + ), +) + +add_docstr( + torch.subtract, + r""" +subtract(input, other, *, alpha=1, out=None) -> Tensor + +Alias for :func:`torch.sub`. +""", +) + +add_docstr( + torch.sum, + r""" +sum(input, *, dtype=None) -> Tensor + +Returns the sum of all elements in the :attr:`input` tensor. + +Args: + {input} + +Keyword args: + {dtype} + +Example:: + + >>> a = torch.randn(1, 3) + >>> a + tensor([[ 0.1133, -0.9567, 0.2958]]) + >>> torch.sum(a) + tensor(-0.5475) + +.. 
function:: sum(input, dim, keepdim=False, *, dtype=None) -> Tensor + :noindex: + +Returns the sum of each row of the :attr:`input` tensor in the given +dimension :attr:`dim`. If :attr:`dim` is a list of dimensions, +reduce over all of them. + +{keepdim_details} + +Args: + {input} + {opt_dim} + {keepdim} + +Keyword args: + {dtype} + +Example:: + + >>> a = torch.randn(4, 4) + >>> a + tensor([[ 0.0569, -0.2475, 0.0737, -0.3429], + [-0.2993, 0.9138, 0.9337, -1.6864], + [ 0.1132, 0.7892, -0.1003, 0.5688], + [ 0.3637, -0.9906, -0.4752, -1.5197]]) + >>> torch.sum(a, 1) + tensor([-0.4598, -0.1381, 1.3708, -2.6217]) + >>> b = torch.arange(4 * 5 * 6).view(4, 5, 6) + >>> torch.sum(b, (2, 1)) + tensor([ 435., 1335., 2235., 3135.]) +""".format( + **multi_dim_common + ), +) + +add_docstr( + torch.nansum, + r""" +nansum(input, *, dtype=None) -> Tensor + +Returns the sum of all elements, treating Not a Numbers (NaNs) as zero. + +Args: + {input} + +Keyword args: + {dtype} + +Example:: + + >>> a = torch.tensor([1., 2., float('nan'), 4.]) + >>> torch.nansum(a) + tensor(7.) + +.. function:: nansum(input, dim, keepdim=False, *, dtype=None) -> Tensor + :noindex: + +Returns the sum of each row of the :attr:`input` tensor in the given +dimension :attr:`dim`, treating Not a Numbers (NaNs) as zero. +If :attr:`dim` is a list of dimensions, reduce over all of them. + +{keepdim_details} + +Args: + {input} + {opt_dim} + {keepdim} + +Keyword args: + {dtype} + +Example:: + + >>> torch.nansum(torch.tensor([1., float("nan")])) + 1.0 + >>> a = torch.tensor([[1, 2], [3., float("nan")]]) + >>> torch.nansum(a) + tensor(6.) + >>> torch.nansum(a, dim=0) + tensor([4., 2.]) + >>> torch.nansum(a, dim=1) + tensor([3., 3.]) +""".format( + **multi_dim_common + ), +) + +add_docstr( + torch.svd, + r""" +svd(input, some=True, compute_uv=True, *, out=None) -> (Tensor, Tensor, Tensor) + +Computes the singular value decomposition of either a matrix or batch of +matrices :attr:`input`. The singular value decomposition is represented as a +namedtuple `(U, S, V)`, such that :attr:`input` :math:`= U \text{diag}(S) V^{\text{H}}`. +where :math:`V^{\text{H}}` is the transpose of `V` for real inputs, +and the conjugate transpose of `V` for complex inputs. +If :attr:`input` is a batch of matrices, then `U`, `S`, and `V` are also +batched with the same batch dimensions as :attr:`input`. + +If :attr:`some` is `True` (default), the method returns the reduced singular +value decomposition. In this case, if the last two dimensions of :attr:`input` are +`m` and `n`, then the returned `U` and `V` matrices will contain only +`min(n, m)` orthonormal columns. + +If :attr:`compute_uv` is `False`, the returned `U` and `V` will be +zero-filled matrices of shape `(m, m)` and `(n, n)` +respectively, and the same device as :attr:`input`. The argument :attr:`some` +has no effect when :attr:`compute_uv` is `False`. + +Supports :attr:`input` of float, double, cfloat and cdouble data types. +The dtypes of `U` and `V` are the same as :attr:`input`'s. `S` will +always be real-valued, even if :attr:`input` is complex. + +.. warning:: + + :func:`torch.svd` is deprecated in favor of :func:`torch.linalg.svd` + and will be removed in a future PyTorch release. + + ``U, S, V = torch.svd(A, some=some, compute_uv=True)`` (default) should be replaced with + + .. code:: python + + U, S, Vh = torch.linalg.svd(A, full_matrices=not some) + V = Vh.mH + + ``_, S, _ = torch.svd(A, some=some, compute_uv=False)`` should be replaced with + + .. 
code:: python
+
+        S = torch.linalg.svdvals(A)
+
+.. note:: Differences with :func:`torch.linalg.svd`:
+
+             * :attr:`some` is the opposite of
+               :func:`torch.linalg.svd`'s :attr:`full_matrices`. Note that the
+               default value for both is `True`, so the default behavior is
+               effectively the opposite.
+             * :func:`torch.svd` returns `V`, whereas :func:`torch.linalg.svd` returns
+               `Vh`, that is, :math:`V^{\text{H}}`.
+             * If :attr:`compute_uv` is `False`, :func:`torch.svd` returns zero-filled
+               tensors for `U` and `Vh`, whereas :func:`torch.linalg.svd` returns
+               empty tensors.
+
+.. note:: The singular values are returned in descending order. If :attr:`input` is a batch of matrices,
+          then the singular values of each matrix in the batch are returned in descending order.
+
+.. note:: The `S` tensor can only be used to compute gradients if :attr:`compute_uv` is `True`.
+
+.. note:: When :attr:`some` is `False`, the gradients on `U[..., :, min(m, n):]`
+          and `V[..., :, min(m, n):]` will be ignored in the backward pass, as those vectors
+          can be arbitrary bases of the corresponding subspaces.
+
+.. note:: The implementation of :func:`torch.linalg.svd` on CPU uses LAPACK's routine `?gesdd`
+          (a divide-and-conquer algorithm) instead of `?gesvd` for speed. Analogously,
+          on GPU, it uses cuSOLVER's routines `gesvdj` and `gesvdjBatched` on CUDA 10.1.243
+          and later, and MAGMA's routine `gesdd` on earlier versions of CUDA.
+
+.. note:: The returned `U` will not be contiguous. The matrix (or batch of matrices) will
+          be represented as a column-major matrix (i.e. Fortran-contiguous).
+
+.. warning:: The gradients with respect to `U` and `V` will only be finite when the input does not
+             have zero or repeated singular values.
+
+.. warning:: If the distance between any two singular values is close to zero, the gradients with respect to
+             `U` and `V` will be numerically unstable, as they depend on
+             :math:`\frac{1}{\min_{i \neq j} \sigma_i^2 - \sigma_j^2}`. The same happens when the matrix
+             has small singular values, as these gradients also depend on :math:`S^{-1}`.
+
+.. warning:: For complex-valued :attr:`input` the singular value decomposition is not unique,
+             as `U` and `V` may be multiplied by an arbitrary phase factor :math:`e^{i \phi}` on every column.
+             The same happens when :attr:`input` has repeated singular values, where one may multiply
+             the columns of the spanning subspace in `U` and `V` by a rotation matrix
+             and `the resulting vectors will span the same subspace`_.
+             Different platforms, like NumPy, or inputs on different device types,
+             may produce different `U` and `V` tensors.
+
+Args:
+    input (Tensor): the input tensor of size `(*, m, n)` where `*` is zero or more
+                    batch dimensions consisting of `(m, n)` matrices.
+    some (bool, optional): controls whether to compute the reduced or full decomposition, and
+                           consequently, the shape of returned `U` and `V`. Default: `True`.
+    compute_uv (bool, optional): controls whether to compute `U` and `V`. Default: `True`.
+ +Keyword args: + out (tuple, optional): the output tuple of tensors + +Example:: + + >>> a = torch.randn(5, 3) + >>> a + tensor([[ 0.2364, -0.7752, 0.6372], + [ 1.7201, 0.7394, -0.0504], + [-0.3371, -1.0584, 0.5296], + [ 0.3550, -0.4022, 1.5569], + [ 0.2445, -0.0158, 1.1414]]) + >>> u, s, v = torch.svd(a) + >>> u + tensor([[ 0.4027, 0.0287, 0.5434], + [-0.1946, 0.8833, 0.3679], + [ 0.4296, -0.2890, 0.5261], + [ 0.6604, 0.2717, -0.2618], + [ 0.4234, 0.2481, -0.4733]]) + >>> s + tensor([2.3289, 2.0315, 0.7806]) + >>> v + tensor([[-0.0199, 0.8766, 0.4809], + [-0.5080, 0.4054, -0.7600], + [ 0.8611, 0.2594, -0.4373]]) + >>> torch.dist(a, torch.mm(torch.mm(u, torch.diag(s)), v.t())) + tensor(8.6531e-07) + >>> a_big = torch.randn(7, 5, 3) + >>> u, s, v = torch.svd(a_big) + >>> torch.dist(a_big, torch.matmul(torch.matmul(u, torch.diag_embed(s)), v.mT)) + tensor(2.6503e-06) + +.. _the resulting vectors will span the same subspace: + (https://en.wikipedia.org/wiki/Singular_value_decomposition#Singular_values,_singular_vectors,_and_their_relation_to_the_SVD) +""", +) + + +add_docstr( + torch.t, + r""" +t(input) -> Tensor + +Expects :attr:`input` to be <= 2-D tensor and transposes dimensions 0 +and 1. + +0-D and 1-D tensors are returned as is. When input is a 2-D tensor this +is equivalent to ``transpose(input, 0, 1)``. + +Args: + {input} + +Example:: + + >>> x = torch.randn(()) + >>> x + tensor(0.1995) + >>> torch.t(x) + tensor(0.1995) + >>> x = torch.randn(3) + >>> x + tensor([ 2.4320, -0.4608, 0.7702]) + >>> torch.t(x) + tensor([ 2.4320, -0.4608, 0.7702]) + >>> x = torch.randn(2, 3) + >>> x + tensor([[ 0.4875, 0.9158, -0.5872], + [ 0.3938, -0.6929, 0.6932]]) + >>> torch.t(x) + tensor([[ 0.4875, 0.3938], + [ 0.9158, -0.6929], + [-0.5872, 0.6932]]) + +See also :func:`torch.transpose`. +""".format( + **common_args + ), +) + +add_docstr( + torch.flip, + r""" +flip(input, dims) -> Tensor + +Reverse the order of an n-D tensor along given axis in dims. + +.. note:: + `torch.flip` makes a copy of :attr:`input`'s data. This is different from NumPy's `np.flip`, + which returns a view in constant time. Since copying a tensor's data is more work than viewing that data, + `torch.flip` is expected to be slower than `np.flip`. + +Args: + {input} + dims (a list or tuple): axis to flip on + +Example:: + + >>> x = torch.arange(8).view(2, 2, 2) + >>> x + tensor([[[ 0, 1], + [ 2, 3]], + + [[ 4, 5], + [ 6, 7]]]) + >>> torch.flip(x, [0, 1]) + tensor([[[ 6, 7], + [ 4, 5]], + + [[ 2, 3], + [ 0, 1]]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.fliplr, + r""" +fliplr(input) -> Tensor + +Flip tensor in the left/right direction, returning a new tensor. + +Flip the entries in each row in the left/right direction. +Columns are preserved, but appear in a different order than before. + +Note: + Requires the tensor to be at least 2-D. + +.. note:: + `torch.fliplr` makes a copy of :attr:`input`'s data. This is different from NumPy's `np.fliplr`, + which returns a view in constant time. Since copying a tensor's data is more work than viewing that data, + `torch.fliplr` is expected to be slower than `np.fliplr`. + +Args: + input (Tensor): Must be at least 2-dimensional. + +Example:: + + >>> x = torch.arange(4).view(2, 2) + >>> x + tensor([[0, 1], + [2, 3]]) + >>> torch.fliplr(x) + tensor([[1, 0], + [3, 2]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.flipud, + r""" +flipud(input) -> Tensor + +Flip tensor in the up/down direction, returning a new tensor. 
+ +Flip the entries in each column in the up/down direction. +Rows are preserved, but appear in a different order than before. + +Note: + Requires the tensor to be at least 1-D. + +.. note:: + `torch.flipud` makes a copy of :attr:`input`'s data. This is different from NumPy's `np.flipud`, + which returns a view in constant time. Since copying a tensor's data is more work than viewing that data, + `torch.flipud` is expected to be slower than `np.flipud`. + +Args: + input (Tensor): Must be at least 1-dimensional. + +Example:: + + >>> x = torch.arange(4).view(2, 2) + >>> x + tensor([[0, 1], + [2, 3]]) + >>> torch.flipud(x) + tensor([[2, 3], + [0, 1]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.roll, + r""" +roll(input, shifts, dims=None) -> Tensor + +Roll the tensor :attr:`input` along the given dimension(s). Elements that are +shifted beyond the last position are re-introduced at the first position. If +:attr:`dims` is `None`, the tensor will be flattened before rolling and then +restored to the original shape. + +Args: + {input} + shifts (int or tuple of ints): The number of places by which the elements + of the tensor are shifted. If shifts is a tuple, dims must be a tuple of + the same size, and each dimension will be rolled by the corresponding + value + dims (int or tuple of ints): Axis along which to roll + +Example:: + + >>> x = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8]).view(4, 2) + >>> x + tensor([[1, 2], + [3, 4], + [5, 6], + [7, 8]]) + >>> torch.roll(x, 1) + tensor([[8, 1], + [2, 3], + [4, 5], + [6, 7]]) + >>> torch.roll(x, 1, 0) + tensor([[7, 8], + [1, 2], + [3, 4], + [5, 6]]) + >>> torch.roll(x, -1, 0) + tensor([[3, 4], + [5, 6], + [7, 8], + [1, 2]]) + >>> torch.roll(x, shifts=(2, 1), dims=(0, 1)) + tensor([[6, 5], + [8, 7], + [2, 1], + [4, 3]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.rot90, + r""" +rot90(input, k=1, dims=[0,1]) -> Tensor + +Rotate an n-D tensor by 90 degrees in the plane specified by dims axis. +Rotation direction is from the first towards the second axis if k > 0, and from the second towards the first for k < 0. + +Args: + {input} + k (int): number of times to rotate. Default value is 1 + dims (a list or tuple): axis to rotate. Default value is [0, 1] + +Example:: + + >>> x = torch.arange(4).view(2, 2) + >>> x + tensor([[0, 1], + [2, 3]]) + >>> torch.rot90(x, 1, [0, 1]) + tensor([[1, 3], + [0, 2]]) + + >>> x = torch.arange(8).view(2, 2, 2) + >>> x + tensor([[[0, 1], + [2, 3]], + + [[4, 5], + [6, 7]]]) + >>> torch.rot90(x, 1, [1, 2]) + tensor([[[1, 3], + [0, 2]], + + [[5, 7], + [4, 6]]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.take, + r""" +take(input, index) -> Tensor + +Returns a new tensor with the elements of :attr:`input` at the given indices. +The input tensor is treated as if it were viewed as a 1-D tensor. The result +takes the same shape as the indices. + +Args: + {input} + index (LongTensor): the indices into tensor + +Example:: + + >>> src = torch.tensor([[4, 3, 5], + ... [6, 7, 8]]) + >>> torch.take(src, torch.tensor([0, 2, 5])) + tensor([ 4, 5, 8]) +""".format( + **common_args + ), +) + +add_docstr( + torch.take_along_dim, + r""" +take_along_dim(input, indices, dim=None, *, out=None) -> Tensor + +Selects values from :attr:`input` at the 1-dimensional indices from :attr:`indices` along the given :attr:`dim`. + +If :attr:`dim` is None, the input array is treated as if it has been flattened to 1d. 
+ +Functions that return indices along a dimension, like :func:`torch.argmax` and :func:`torch.argsort`, +are designed to work with this function. See the examples below. + +.. note:: + This function is similar to NumPy's `take_along_axis`. + See also :func:`torch.gather`. + +Args: + {input} + indices (tensor): the indices into :attr:`input`. Must have long dtype. + dim (int, optional): dimension to select along. + +Keyword args: + {out} + +Example:: + + >>> t = torch.tensor([[10, 30, 20], [60, 40, 50]]) + >>> max_idx = torch.argmax(t) + >>> torch.take_along_dim(t, max_idx) + tensor([60]) + >>> sorted_idx = torch.argsort(t, dim=1) + >>> torch.take_along_dim(t, sorted_idx, dim=1) + tensor([[10, 20, 30], + [40, 50, 60]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.tan, + r""" +tan(input, *, out=None) -> Tensor + +Returns a new tensor with the tangent of the elements of :attr:`input`. + +.. math:: + \text{out}_{i} = \tan(\text{input}_{i}) +""" + + r""" +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> a = torch.randn(4) + >>> a + tensor([-1.2027, -1.7687, 0.4412, -1.3856]) + >>> torch.tan(a) + tensor([-2.5930, 4.9859, 0.4722, -5.3366]) +""".format( + **common_args + ), +) + +add_docstr( + torch.tanh, + r""" +tanh(input, *, out=None) -> Tensor + +Returns a new tensor with the hyperbolic tangent of the elements +of :attr:`input`. + +.. math:: + \text{out}_{i} = \tanh(\text{input}_{i}) +""" + + r""" +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 0.8986, -0.7279, 1.1745, 0.2611]) + >>> torch.tanh(a) + tensor([ 0.7156, -0.6218, 0.8257, 0.2553]) +""".format( + **common_args + ), +) + +add_docstr( + # torch.softmax doc str. Point this to torch.nn.functional.softmax + torch.softmax, + r""" +softmax(input, dim, *, dtype=None) -> Tensor + +Alias for :func:`torch.nn.functional.softmax`. +""", +) + +add_docstr( + torch.topk, + r""" +topk(input, k, dim=None, largest=True, sorted=True, *, out=None) -> (Tensor, LongTensor) + +Returns the :attr:`k` largest elements of the given :attr:`input` tensor along +a given dimension. + +If :attr:`dim` is not given, the last dimension of the `input` is chosen. + +If :attr:`largest` is ``False`` then the `k` smallest elements are returned. + +A namedtuple of `(values, indices)` is returned with the `values` and +`indices` of the largest `k` elements of each row of the `input` tensor in the +given dimension `dim`. + +The boolean option :attr:`sorted` if ``True``, will make sure that the returned +`k` elements are themselves sorted + +Args: + {input} + k (int): the k in "top-k" + dim (int, optional): the dimension to sort along + largest (bool, optional): controls whether to return largest or + smallest elements + sorted (bool, optional): controls whether to return the elements + in sorted order + +Keyword args: + out (tuple, optional): the output tuple of (Tensor, LongTensor) that can be + optionally given to be used as output buffers + +Example:: + + >>> x = torch.arange(1., 6.) + >>> x + tensor([ 1., 2., 3., 4., 5.]) + >>> torch.topk(x, 3) + torch.return_types.topk(values=tensor([5., 4., 3.]), indices=tensor([4, 3, 2])) +""".format( + **common_args + ), +) + +add_docstr( + torch.trace, + r""" +trace(input) -> Tensor + +Returns the sum of the elements of the diagonal of the input 2-D matrix. + +Example:: + + >>> x = torch.arange(1., 10.).view(3, 3) + >>> x + tensor([[ 1., 2., 3.], + [ 4., 5., 6.], + [ 7., 8., 9.]]) + >>> torch.trace(x) + tensor(15.) 
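+
+    >>> # A quick illustrative check (same x as above): the trace equals
+    >>> # the sum of the entries on the main diagonal.
+    >>> torch.trace(x) == torch.diagonal(x).sum()
+    tensor(True)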
+""", +) + +add_docstr( + torch.transpose, + r""" +transpose(input, dim0, dim1) -> Tensor + +Returns a tensor that is a transposed version of :attr:`input`. +The given dimensions :attr:`dim0` and :attr:`dim1` are swapped. + +If :attr:`input` is a strided tensor then the resulting :attr:`out` +tensor shares its underlying storage with the :attr:`input` tensor, so +changing the content of one would change the content of the other. + +If :attr:`input` is a :ref:`sparse tensor ` then the +resulting :attr:`out` tensor *does not* share the underlying storage +with the :attr:`input` tensor. + +If :attr:`input` is a :ref:`sparse tensor ` with compressed +layout (SparseCSR, SparseBSR, SparseCSC or SparseBSC) the arguments +:attr:`dim0` and :attr:`dim1` must be both batch dimensions, or must +both be sparse dimensions. The batch dimensions of a sparse tensor are the +dimensions preceding the sparse dimensions. + +.. note:: + Transpositions which interchange the sparse dimensions of a `SparseCSR` + or `SparseCSC` layout tensor will result in the layout changing between + the two options. Transposition of the sparse dimensions of a ` SparseBSR` + or `SparseBSC` layout tensor will likewise generate a result with the + opposite layout. + + +Args: + {input} + dim0 (int): the first dimension to be transposed + dim1 (int): the second dimension to be transposed + +Example:: + + >>> x = torch.randn(2, 3) + >>> x + tensor([[ 1.0028, -0.9893, 0.5809], + [-0.1669, 0.7299, 0.4942]]) + >>> torch.transpose(x, 0, 1) + tensor([[ 1.0028, -0.1669], + [-0.9893, 0.7299], + [ 0.5809, 0.4942]]) + +See also :func:`torch.t`. +""".format( + **common_args + ), +) + +add_docstr( + torch.triangular_solve, + r""" +triangular_solve(b, A, upper=True, transpose=False, unitriangular=False, *, out=None) -> (Tensor, Tensor) + +Solves a system of equations with a square upper or lower triangular invertible matrix :math:`A` +and multiple right-hand sides :math:`b`. + +In symbols, it solves :math:`AX = b` and assumes :math:`A` is square upper-triangular +(or lower-triangular if :attr:`upper`\ `= False`) and does not have zeros on the diagonal. + +`torch.triangular_solve(b, A)` can take in 2D inputs `b, A` or inputs that are +batches of 2D matrices. If the inputs are batches, then returns +batched outputs `X` + +If the diagonal of :attr:`A` contains zeros or elements that are very close to zero and +:attr:`unitriangular`\ `= False` (default) or if the input matrix is badly conditioned, +the result may contain `NaN` s. + +Supports input of float, double, cfloat and cdouble data types. + +.. warning:: + + :func:`torch.triangular_solve` is deprecated in favor of :func:`torch.linalg.solve_triangular` + and will be removed in a future PyTorch release. + :func:`torch.linalg.solve_triangular` has its arguments reversed and does not return a + copy of one of the inputs. + + ``X = torch.triangular_solve(B, A).solution`` should be replaced with + + .. code:: python + + X = torch.linalg.solve_triangular(A, B) + +Args: + b (Tensor): multiple right-hand sides of size :math:`(*, m, k)` where + :math:`*` is zero of more batch dimensions + A (Tensor): the input triangular coefficient matrix of size :math:`(*, m, m)` + where :math:`*` is zero or more batch dimensions + upper (bool, optional): whether :math:`A` is upper or lower triangular. Default: ``True``. + transpose (bool, optional): solves `op(A)X = b` where `op(A) = A^T` if this flag is ``True``, + and `op(A) = A` if it is ``False``. Default: ``False``. 
+ unitriangular (bool, optional): whether :math:`A` is unit triangular. + If True, the diagonal elements of :math:`A` are assumed to be + 1 and not referenced from :math:`A`. Default: ``False``. + +Keyword args: + out ((Tensor, Tensor), optional): tuple of two tensors to write + the output to. Ignored if `None`. Default: `None`. + +Returns: + A namedtuple `(solution, cloned_coefficient)` where `cloned_coefficient` + is a clone of :math:`A` and `solution` is the solution :math:`X` to :math:`AX = b` + (or whatever variant of the system of equations, depending on the keyword arguments.) + +Examples:: + + >>> A = torch.randn(2, 2).triu() + >>> A + tensor([[ 1.1527, -1.0753], + [ 0.0000, 0.7986]]) + >>> b = torch.randn(2, 3) + >>> b + tensor([[-0.0210, 2.3513, -1.5492], + [ 1.5429, 0.7403, -1.0243]]) + >>> torch.triangular_solve(b, A) + torch.return_types.triangular_solve( + solution=tensor([[ 1.7841, 2.9046, -2.5405], + [ 1.9320, 0.9270, -1.2826]]), + cloned_coefficient=tensor([[ 1.1527, -1.0753], + [ 0.0000, 0.7986]])) +""", +) + +add_docstr( + torch.tril, + r""" +tril(input, diagonal=0, *, out=None) -> Tensor + +Returns the lower triangular part of the matrix (2-D tensor) or batch of matrices +:attr:`input`, the other elements of the result tensor :attr:`out` are set to 0. + +The lower triangular part of the matrix is defined as the elements on and +below the diagonal. + +The argument :attr:`diagonal` controls which diagonal to consider. If +:attr:`diagonal` = 0, all elements on and below the main diagonal are +retained. A positive value includes just as many diagonals above the main +diagonal, and similarly a negative value excludes just as many diagonals below +the main diagonal. The main diagonal are the set of indices +:math:`\lbrace (i, i) \rbrace` for :math:`i \in [0, \min\{d_{1}, d_{2}\} - 1]` where +:math:`d_{1}, d_{2}` are the dimensions of the matrix. +""" + + r""" +Args: + {input} + diagonal (int, optional): the diagonal to consider + +Keyword args: + {out} + +Example:: + + >>> a = torch.randn(3, 3) + >>> a + tensor([[-1.0813, -0.8619, 0.7105], + [ 0.0935, 0.1380, 2.2112], + [-0.3409, -0.9828, 0.0289]]) + >>> torch.tril(a) + tensor([[-1.0813, 0.0000, 0.0000], + [ 0.0935, 0.1380, 0.0000], + [-0.3409, -0.9828, 0.0289]]) + + >>> b = torch.randn(4, 6) + >>> b + tensor([[ 1.2219, 0.5653, -0.2521, -0.2345, 1.2544, 0.3461], + [ 0.4785, -0.4477, 0.6049, 0.6368, 0.8775, 0.7145], + [ 1.1502, 3.2716, -1.1243, -0.5413, 0.3615, 0.6864], + [-0.0614, -0.7344, -1.3164, -0.7648, -1.4024, 0.0978]]) + >>> torch.tril(b, diagonal=1) + tensor([[ 1.2219, 0.5653, 0.0000, 0.0000, 0.0000, 0.0000], + [ 0.4785, -0.4477, 0.6049, 0.0000, 0.0000, 0.0000], + [ 1.1502, 3.2716, -1.1243, -0.5413, 0.0000, 0.0000], + [-0.0614, -0.7344, -1.3164, -0.7648, -1.4024, 0.0000]]) + >>> torch.tril(b, diagonal=-1) + tensor([[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [ 0.4785, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [ 1.1502, 3.2716, 0.0000, 0.0000, 0.0000, 0.0000], + [-0.0614, -0.7344, -1.3164, 0.0000, 0.0000, 0.0000]]) +""".format( + **common_args + ), +) + +# docstr is split in two parts to avoid format mis-captureing :math: braces '{}' +# as common args. 
+add_docstr( + torch.tril_indices, + r""" +tril_indices(row, col, offset=0, *, dtype=torch.long, device='cpu', layout=torch.strided) -> Tensor + +Returns the indices of the lower triangular part of a :attr:`row`-by- +:attr:`col` matrix in a 2-by-N Tensor, where the first row contains row +coordinates of all indices and the second row contains column coordinates. +Indices are ordered based on rows and then columns. + +The lower triangular part of the matrix is defined as the elements on and +below the diagonal. + +The argument :attr:`offset` controls which diagonal to consider. If +:attr:`offset` = 0, all elements on and below the main diagonal are +retained. A positive value includes just as many diagonals above the main +diagonal, and similarly a negative value excludes just as many diagonals below +the main diagonal. The main diagonal are the set of indices +:math:`\lbrace (i, i) \rbrace` for :math:`i \in [0, \min\{d_{1}, d_{2}\} - 1]` +where :math:`d_{1}, d_{2}` are the dimensions of the matrix. + +.. note:: + When running on CUDA, ``row * col`` must be less than :math:`2^{59}` to + prevent overflow during calculation. +""" + + r""" +Args: + row (``int``): number of rows in the 2-D matrix. + col (``int``): number of columns in the 2-D matrix. + offset (``int``): diagonal offset from the main diagonal. + Default: if not provided, 0. + +Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, ``torch.long``. + {device} + layout (:class:`torch.layout`, optional): currently only support ``torch.strided``. + +Example:: + + >>> a = torch.tril_indices(3, 3) + >>> a + tensor([[0, 1, 1, 2, 2, 2], + [0, 0, 1, 0, 1, 2]]) + + >>> a = torch.tril_indices(4, 3, -1) + >>> a + tensor([[1, 2, 2, 3, 3, 3], + [0, 0, 1, 0, 1, 2]]) + + >>> a = torch.tril_indices(4, 3, 1) + >>> a + tensor([[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3], + [0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2]]) +""".format( + **factory_common_args + ), +) + +add_docstr( + torch.triu, + r""" +triu(input, diagonal=0, *, out=None) -> Tensor + +Returns the upper triangular part of a matrix (2-D tensor) or batch of matrices +:attr:`input`, the other elements of the result tensor :attr:`out` are set to 0. + +The upper triangular part of the matrix is defined as the elements on and +above the diagonal. + +The argument :attr:`diagonal` controls which diagonal to consider. If +:attr:`diagonal` = 0, all elements on and above the main diagonal are +retained. A positive value excludes just as many diagonals above the main +diagonal, and similarly a negative value includes just as many diagonals below +the main diagonal. The main diagonal are the set of indices +:math:`\lbrace (i, i) \rbrace` for :math:`i \in [0, \min\{d_{1}, d_{2}\} - 1]` where +:math:`d_{1}, d_{2}` are the dimensions of the matrix. 
+""" + + r""" +Args: + {input} + diagonal (int, optional): the diagonal to consider + +Keyword args: + {out} + +Example:: + + >>> a = torch.randn(3, 3) + >>> a + tensor([[ 0.2309, 0.5207, 2.0049], + [ 0.2072, -1.0680, 0.6602], + [ 0.3480, -0.5211, -0.4573]]) + >>> torch.triu(a) + tensor([[ 0.2309, 0.5207, 2.0049], + [ 0.0000, -1.0680, 0.6602], + [ 0.0000, 0.0000, -0.4573]]) + >>> torch.triu(a, diagonal=1) + tensor([[ 0.0000, 0.5207, 2.0049], + [ 0.0000, 0.0000, 0.6602], + [ 0.0000, 0.0000, 0.0000]]) + >>> torch.triu(a, diagonal=-1) + tensor([[ 0.2309, 0.5207, 2.0049], + [ 0.2072, -1.0680, 0.6602], + [ 0.0000, -0.5211, -0.4573]]) + + >>> b = torch.randn(4, 6) + >>> b + tensor([[ 0.5876, -0.0794, -1.8373, 0.6654, 0.2604, 1.5235], + [-0.2447, 0.9556, -1.2919, 1.3378, -0.1768, -1.0857], + [ 0.4333, 0.3146, 0.6576, -1.0432, 0.9348, -0.4410], + [-0.9888, 1.0679, -1.3337, -1.6556, 0.4798, 0.2830]]) + >>> torch.triu(b, diagonal=1) + tensor([[ 0.0000, -0.0794, -1.8373, 0.6654, 0.2604, 1.5235], + [ 0.0000, 0.0000, -1.2919, 1.3378, -0.1768, -1.0857], + [ 0.0000, 0.0000, 0.0000, -1.0432, 0.9348, -0.4410], + [ 0.0000, 0.0000, 0.0000, 0.0000, 0.4798, 0.2830]]) + >>> torch.triu(b, diagonal=-1) + tensor([[ 0.5876, -0.0794, -1.8373, 0.6654, 0.2604, 1.5235], + [-0.2447, 0.9556, -1.2919, 1.3378, -0.1768, -1.0857], + [ 0.0000, 0.3146, 0.6576, -1.0432, 0.9348, -0.4410], + [ 0.0000, 0.0000, -1.3337, -1.6556, 0.4798, 0.2830]]) +""".format( + **common_args + ), +) + +# docstr is split in two parts to avoid format mis-capturing :math: braces '{}' +# as common args. +add_docstr( + torch.triu_indices, + r""" +triu_indices(row, col, offset=0, *, dtype=torch.long, device='cpu', layout=torch.strided) -> Tensor + +Returns the indices of the upper triangular part of a :attr:`row` by +:attr:`col` matrix in a 2-by-N Tensor, where the first row contains row +coordinates of all indices and the second row contains column coordinates. +Indices are ordered based on rows and then columns. + +The upper triangular part of the matrix is defined as the elements on and +above the diagonal. + +The argument :attr:`offset` controls which diagonal to consider. If +:attr:`offset` = 0, all elements on and above the main diagonal are +retained. A positive value excludes just as many diagonals above the main +diagonal, and similarly a negative value includes just as many diagonals below +the main diagonal. The main diagonal are the set of indices +:math:`\lbrace (i, i) \rbrace` for :math:`i \in [0, \min\{d_{1}, d_{2}\} - 1]` +where :math:`d_{1}, d_{2}` are the dimensions of the matrix. + +.. note:: + When running on CUDA, ``row * col`` must be less than :math:`2^{59}` to + prevent overflow during calculation. +""" + + r""" +Args: + row (``int``): number of rows in the 2-D matrix. + col (``int``): number of columns in the 2-D matrix. + offset (``int``): diagonal offset from the main diagonal. + Default: if not provided, 0. + +Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. + Default: if ``None``, ``torch.long``. + {device} + layout (:class:`torch.layout`, optional): currently only support ``torch.strided``. 
+ +Example:: + + >>> a = torch.triu_indices(3, 3) + >>> a + tensor([[0, 0, 0, 1, 1, 2], + [0, 1, 2, 1, 2, 2]]) + + >>> a = torch.triu_indices(4, 3, -1) + >>> a + tensor([[0, 0, 0, 1, 1, 1, 2, 2, 3], + [0, 1, 2, 0, 1, 2, 1, 2, 2]]) + + >>> a = torch.triu_indices(4, 3, 1) + >>> a + tensor([[0, 0, 1], + [1, 2, 2]]) +""".format( + **factory_common_args + ), +) + +add_docstr( + torch.true_divide, + r""" +true_divide(dividend, divisor, *, out) -> Tensor + +Alias for :func:`torch.div` with ``rounding_mode=None``. +""", +) + +add_docstr( + torch.trunc, + r""" +trunc(input, *, out=None) -> Tensor + +Returns a new tensor with the truncated integer values of +the elements of :attr:`input`. + +For integer inputs, follows the array-api convention of returning a +copy of the input tensor. + +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> a = torch.randn(4) + >>> a + tensor([ 3.4742, 0.5466, -0.8008, -0.9079]) + >>> torch.trunc(a) + tensor([ 3., 0., -0., -0.]) +""".format( + **common_args + ), +) + +add_docstr( + torch.fake_quantize_per_tensor_affine, + r""" +fake_quantize_per_tensor_affine(input, scale, zero_point, quant_min, quant_max) -> Tensor + +Returns a new tensor with the data in :attr:`input` fake quantized using :attr:`scale`, +:attr:`zero_point`, :attr:`quant_min` and :attr:`quant_max`. + +.. math:: + \text{output} = ( + min( + \text{quant\_max}, + max( + \text{quant\_min}, + \text{std::nearby\_int}(\text{input} / \text{scale}) + \text{zero\_point} + ) + ) - \text{zero\_point} + ) \times \text{scale} + +Args: + input (Tensor): the input value(s), ``torch.float32`` tensor + scale (double scalar or ``float32`` Tensor): quantization scale + zero_point (int64 scalar or ``int32`` Tensor): quantization zero_point + quant_min (int64): lower bound of the quantized domain + quant_max (int64): upper bound of the quantized domain + +Returns: + Tensor: A newly fake_quantized ``torch.float32`` tensor + +Example:: + + >>> x = torch.randn(4) + >>> x + tensor([ 0.0552, 0.9730, 0.3973, -1.0780]) + >>> torch.fake_quantize_per_tensor_affine(x, 0.1, 0, 0, 255) + tensor([0.1000, 1.0000, 0.4000, 0.0000]) + >>> torch.fake_quantize_per_tensor_affine(x, torch.tensor(0.1), torch.tensor(0), 0, 255) + tensor([0.1000, 1.0000, 0.4000, 0.0000]) +""", +) + +add_docstr( + torch.fake_quantize_per_channel_affine, + r""" +fake_quantize_per_channel_affine(input, scale, zero_point, axis, quant_min, quant_max) -> Tensor + +Returns a new tensor with the data in :attr:`input` fake quantized per channel using :attr:`scale`, +:attr:`zero_point`, :attr:`quant_min` and :attr:`quant_max`, across the channel specified by :attr:`axis`. + +.. 
math:: + \text{output} = ( + min( + \text{quant\_max}, + max( + \text{quant\_min}, + \text{std::nearby\_int}(\text{input} / \text{scale}) + \text{zero\_point} + ) + ) - \text{zero\_point} + ) \times \text{scale} + +Args: + input (Tensor): the input value(s), in ``torch.float32`` + scale (Tensor): quantization scale, per channel in ``torch.float32`` + zero_point (Tensor): quantization zero_point, per channel in ``torch.int32`` or ``torch.half`` or ``torch.float32`` + axis (int32): channel axis + quant_min (int64): lower bound of the quantized domain + quant_max (int64): upper bound of the quantized domain + +Returns: + Tensor: A newly fake_quantized per channel ``torch.float32`` tensor + +Example:: + + >>> x = torch.randn(2, 2, 2) + >>> x + tensor([[[-0.2525, -0.0466], + [ 0.3491, -0.2168]], + + [[-0.5906, 1.6258], + [ 0.6444, -0.0542]]]) + >>> scales = (torch.randn(2) + 1) * 0.05 + >>> scales + tensor([0.0475, 0.0486]) + >>> zero_points = torch.zeros(2).to(torch.int32) + >>> zero_points + tensor([0, 0]) + >>> torch.fake_quantize_per_channel_affine(x, scales, zero_points, 1, 0, 255) + tensor([[[0.0000, 0.0000], + [0.3405, 0.0000]], + + [[0.0000, 1.6134], + [0.6323, 0.0000]]]) +""", +) + +add_docstr( + torch.fix, + r""" +fix(input, *, out=None) -> Tensor + +Alias for :func:`torch.trunc` +""", +) + +add_docstr( + torch.unsqueeze, + r""" +unsqueeze(input, dim) -> Tensor + +Returns a new tensor with a dimension of size one inserted at the +specified position. + +The returned tensor shares the same underlying data with this tensor. + +A :attr:`dim` value within the range ``[-input.dim() - 1, input.dim() + 1)`` +can be used. Negative :attr:`dim` will correspond to :meth:`unsqueeze` +applied at :attr:`dim` = ``dim + input.dim() + 1``. + +Args: + {input} + dim (int): the index at which to insert the singleton dimension + +Example:: + + >>> x = torch.tensor([1, 2, 3, 4]) + >>> torch.unsqueeze(x, 0) + tensor([[ 1, 2, 3, 4]]) + >>> torch.unsqueeze(x, 1) + tensor([[ 1], + [ 2], + [ 3], + [ 4]]) +""".format( + **common_args + ), +) + +add_docstr( + torch.var, + r""" +var(input, dim=None, *, correction=1, keepdim=False, out=None) -> Tensor + +Calculates the variance over the dimensions specified by :attr:`dim`. :attr:`dim` +can be a single dimension, list of dimensions, or ``None`` to reduce over all +dimensions. + +The variance (:math:`\sigma^2`) is calculated as + +.. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2 + +where :math:`x` is the sample set of elements, :math:`\bar{x}` is the +sample mean, :math:`N` is the number of samples and :math:`\delta N` is +the :attr:`correction`. +""" + + r""" + +{keepdim_details} + +Args: + {input} + {opt_dim} + +Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + {keepdim} + {out} + +Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.var(a, dim=1, keepdim=True) + tensor([[1.0631], + [0.5590], + [1.4893], + [0.8258]]) + +.. 
_Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + +""".format( + **multi_dim_common + ), +) + +add_docstr( + torch.var_mean, + r""" +var_mean(input, dim=None, *, correction=1, keepdim=False, out=None) -> (Tensor, Tensor) + +Calculates the variance and mean over the dimensions specified by :attr:`dim`. +:attr:`dim` can be a single dimension, list of dimensions, or ``None`` to +reduce over all dimensions. + +The variance (:math:`\sigma^2`) is calculated as + +.. math:: \sigma^2 = \frac{1}{\max(0,~N - \delta N)}\sum_{i=0}^{N-1}(x_i-\bar{x})^2 + +where :math:`x` is the sample set of elements, :math:`\bar{x}` is the +sample mean, :math:`N` is the number of samples and :math:`\delta N` is +the :attr:`correction`. +""" + + r""" + +{keepdim_details} + +Args: + {input} + {opt_dim} + +Keyword args: + correction (int): difference between the sample size and sample degrees of freedom. + Defaults to `Bessel's correction`_, ``correction=1``. + + .. versionchanged:: 2.0 + Previously this argument was called ``unbiased`` and was a boolean + with ``True`` corresponding to ``correction=1`` and ``False`` being + ``correction=0``. + {keepdim} + {out} + +Returns: + A tuple (var, mean) containing the variance and mean. + +Example: + + >>> a = torch.tensor( + ... [[ 0.2035, 1.2959, 1.8101, -0.4644], + ... [ 1.5027, -0.3270, 0.5905, 0.6538], + ... [-1.5745, 1.3330, -0.5596, -0.6548], + ... [ 0.1264, -0.5080, 1.6420, 0.1992]]) + >>> torch.var_mean(a, dim=0, keepdim=True) + (tensor([[1.5926, 1.0056, 1.2005, 0.3646]]), + tensor([[ 0.0645, 0.4485, 0.8707, -0.0665]])) + +.. _Bessel's correction: https://en.wikipedia.org/wiki/Bessel%27s_correction + +""".format( + **multi_dim_common + ), +) + +add_docstr( + torch.zeros, + r""" +zeros(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + +Returns a tensor filled with the scalar value `0`, with the shape defined +by the variable argument :attr:`size`. + +Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + +Keyword args: + {out} + {dtype} + {layout} + {device} + {requires_grad} + +Example:: + + >>> torch.zeros(2, 3) + tensor([[ 0., 0., 0.], + [ 0., 0., 0.]]) + + >>> torch.zeros(5) + tensor([ 0., 0., 0., 0., 0.]) +""".format( + **factory_common_args + ), +) + +add_docstr( + torch.zeros_like, + r""" +zeros_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor + +Returns a tensor filled with the scalar value `0`, with the same size as +:attr:`input`. ``torch.zeros_like(input)`` is equivalent to +``torch.zeros(input.size(), dtype=input.dtype, layout=input.layout, device=input.device)``. + +.. warning:: + As of 0.4, this function does not support an :attr:`out` keyword. As an alternative, + the old ``torch.zeros_like(input, out=output)`` is equivalent to + ``torch.zeros(input.size(), out=output)``. + +Args: + {input} + +Keyword args: + {dtype} + {layout} + {device} + {requires_grad} + {memory_format} + +Example:: + + >>> input = torch.empty(2, 3) + >>> torch.zeros_like(input) + tensor([[ 0., 0., 0.], + [ 0., 0., 0.]]) +""".format( + **factory_like_common_args + ), +) + +add_docstr( + torch.empty, + """ +empty(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False, \ +memory_format=torch.contiguous_format) -> Tensor + +Returns a tensor filled with uninitialized data. 
The shape of the tensor is +defined by the variable argument :attr:`size`. + +.. note:: + If :func:`torch.use_deterministic_algorithms()` and + :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to + ``True``, the output tensor is initialized to prevent any possible + nondeterministic behavior from using the data as an input to an operation. + Floating point and complex tensors are filled with NaN, and integer tensors + are filled with the maximum value. + +Args: + size (int...): a sequence of integers defining the shape of the output tensor. + Can be a variable number of arguments or a collection like a list or tuple. + +Keyword args: + {out} + {dtype} + {layout} + {device} + {requires_grad} + {pin_memory} + {memory_format} + +Example:: + + >>> torch.empty((2,3), dtype=torch.int64) + tensor([[ 9.4064e+13, 2.8000e+01, 9.3493e+13], + [ 7.5751e+18, 7.1428e+18, 7.5955e+18]]) +""".format( + **factory_common_args + ), +) + +add_docstr( + torch.empty_like, + r""" +empty_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor + +Returns an uninitialized tensor with the same size as :attr:`input`. +``torch.empty_like(input)`` is equivalent to +``torch.empty(input.size(), dtype=input.dtype, layout=input.layout, device=input.device)``. + +.. note:: + If :func:`torch.use_deterministic_algorithms()` and + :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to + ``True``, the output tensor is initialized to prevent any possible + nondeterministic behavior from using the data as an input to an operation. + Floating point and complex tensors are filled with NaN, and integer tensors + are filled with the maximum value. + +Args: + {input} + +Keyword args: + {dtype} + {layout} + {device} + {requires_grad} + {memory_format} + +Example:: + + >>> a=torch.empty((2,3), dtype=torch.int32, device = 'cuda') + >>> torch.empty_like(a) + tensor([[0, 0, 0], + [0, 0, 0]], device='cuda:0', dtype=torch.int32) +""".format( + **factory_like_common_args + ), +) + +add_docstr( + torch.empty_strided, + r""" +empty_strided(size, stride, *, dtype=None, layout=None, device=None, requires_grad=False, pin_memory=False) -> Tensor + +Creates a tensor with the specified :attr:`size` and :attr:`stride` and filled with undefined data. + +.. warning:: + If the constructed tensor is "overlapped" (with multiple indices referring to the same element + in memory) its behavior is undefined. + +.. note:: + If :func:`torch.use_deterministic_algorithms()` and + :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to + ``True``, the output tensor is initialized to prevent any possible + nondeterministic behavior from using the data as an input to an operation. + Floating point and complex tensors are filled with NaN, and integer tensors + are filled with the maximum value. 
+ +Args: + size (tuple of int): the shape of the output tensor + stride (tuple of int): the strides of the output tensor + +Keyword args: + {dtype} + {layout} + {device} + {requires_grad} + {pin_memory} + +Example:: + + >>> a = torch.empty_strided((2, 3), (1, 2)) + >>> a + tensor([[8.9683e-44, 4.4842e-44, 5.1239e+07], + [0.0000e+00, 0.0000e+00, 3.0705e-41]]) + >>> a.stride() + (1, 2) + >>> a.size() + torch.Size([2, 3]) +""".format( + **factory_common_args + ), +) + +add_docstr( + torch.empty_permuted, + r""" +empty_permuted(size, physical_layout, *, dtype=None, layout=None, device=None, requires_grad=False, pin_memory=False) -> Tensor + +Creates an uninitialized, non-overlapping and dense tensor with the +specified :attr:`size`, with :attr:`physical_layout` specifying how the +dimensions are physically laid out in memory (each logical dimension is listed +from outermost to innermost). :attr:`physical_layout` is a generalization +of NCHW/NHWC notation: if each dimension is assigned a number according to +what order they occur in size (N=0, C=1, H=2, W=3), then NCHW is ``(0, 1, 2, 3)`` +while NHWC is ``(0, 2, 3, 1)``. Equivalently, the strides of the output +tensor ``t`` are such that ``t.stride(physical_layout[i]) == contiguous_strides[i]`` +(notably, this function is *not* equivalent to ``torch.empty(size).permute(physical_layout)``). + +Unlike :func:`torch.empty_strided`, this is guaranteed to produce a dense +tensor with no overlaps. If possible, prefer using this function over +:func:`torch.empty_strided` or manual use of :func:`torch.as_strided`. + +.. note:: + If :func:`torch.use_deterministic_algorithms()` and + :attr:`torch.utils.deterministic.fill_uninitialized_memory` are both set to + ``True``, the output tensor is initialized to prevent any possible + nondeterministic behavior from using the data as an input to an operation. + Floating point and complex tensors are filled with NaN, and integer tensors + are filled with the maximum value. + +Args: + size (tuple of int): the shape of the output tensor + physical_layout (tuple of int): the ordering of dimensions physically in memory + +Keyword args: + {dtype} + {layout} + {device} + {requires_grad} + {pin_memory} + +Examples: + + >>> torch.empty((2, 3, 5, 7)).stride() + (105, 35, 7, 1) + >>> torch.empty_permuted((2, 3, 5, 7), (0, 1, 2, 3)).stride() + (105, 35, 7, 1) + >>> torch.empty((2, 3, 5, 7), memory_format=torch.channels_last).stride() + (105, 1, 21, 3) + >>> torch.empty_permuted((2, 3, 5, 7), (0, 2, 3, 1)).stride() + (105, 1, 21, 3) + >>> torch.empty_permuted((2, 3, 5, 7), (0, 2, 3, 1)).dim_order() + (0, 2, 3, 1) +""".format( + **factory_common_args + ), +) + +add_docstr( + torch.full, + r""" +full(size, fill_value, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + +Creates a tensor of size :attr:`size` filled with :attr:`fill_value`. The +tensor's dtype is inferred from :attr:`fill_value`. + +Args: + size (int...): a list, tuple, or :class:`torch.Size` of integers defining the + shape of the output tensor. + fill_value (Scalar): the value to fill the output tensor with. 
+ +Keyword args: + {out} + {dtype} + {layout} + {device} + {requires_grad} + +Example:: + + >>> torch.full((2, 3), 3.141592) + tensor([[ 3.1416, 3.1416, 3.1416], + [ 3.1416, 3.1416, 3.1416]]) +""".format( + **factory_common_args + ), +) + +add_docstr( + torch.full_like, + """ +full_like(input, fill_value, \\*, dtype=None, layout=torch.strided, device=None, requires_grad=False, \ +memory_format=torch.preserve_format) -> Tensor + +Returns a tensor with the same size as :attr:`input` filled with :attr:`fill_value`. +``torch.full_like(input, fill_value)`` is equivalent to +``torch.full(input.size(), fill_value, dtype=input.dtype, layout=input.layout, device=input.device)``. + +Args: + {input} + fill_value: the number to fill the output tensor with. + +Keyword args: + {dtype} + {layout} + {device} + {requires_grad} + {memory_format} +""".format( + **factory_like_common_args + ), +) + +add_docstr( + torch.det, + r""" +det(input) -> Tensor + +Alias for :func:`torch.linalg.det` +""", +) + +add_docstr( + torch.where, + r""" +where(condition, input, other, *, out=None) -> Tensor + +Return a tensor of elements selected from either :attr:`input` or :attr:`other`, depending on :attr:`condition`. + +The operation is defined as: + +.. math:: + \text{out}_i = \begin{cases} + \text{input}_i & \text{if } \text{condition}_i \\ + \text{other}_i & \text{otherwise} \\ + \end{cases} +""" + + r""" +.. note:: + The tensors :attr:`condition`, :attr:`input`, :attr:`other` must be :ref:`broadcastable `. + +Arguments: + condition (BoolTensor): When True (nonzero), yield input, otherwise yield other + input (Tensor or Scalar): value (if :attr:`input` is a scalar) or values selected at indices + where :attr:`condition` is ``True`` + other (Tensor or Scalar): value (if :attr:`other` is a scalar) or values selected at indices + where :attr:`condition` is ``False`` + +Keyword args: + {out} + +Returns: + Tensor: A tensor of shape equal to the broadcasted shape of :attr:`condition`, :attr:`input`, :attr:`other` + +Example:: + + >>> x = torch.randn(3, 2) + >>> y = torch.ones(3, 2) + >>> x + tensor([[-0.4620, 0.3139], + [ 0.3898, -0.7197], + [ 0.0478, -0.1657]]) + >>> torch.where(x > 0, 1.0, 0.0) + tensor([[0., 1.], + [1., 0.], + [1., 0.]]) + >>> torch.where(x > 0, x, y) + tensor([[ 1.0000, 0.3139], + [ 0.3898, 1.0000], + [ 0.0478, 1.0000]]) + >>> x = torch.randn(2, 2, dtype=torch.double) + >>> x + tensor([[ 1.0779, 0.0383], + [-0.8785, -1.1089]], dtype=torch.float64) + >>> torch.where(x > 0, x, 0.) + tensor([[1.0779, 0.0383], + [0.0000, 0.0000]], dtype=torch.float64) + +.. function:: where(condition) -> tuple of LongTensor + :noindex: + +``torch.where(condition)`` is identical to +``torch.nonzero(condition, as_tuple=True)``. + +.. note:: + See also :func:`torch.nonzero`. +""".format( + **common_args + ), +) + +add_docstr( + torch.logdet, + r""" +logdet(input) -> Tensor + +Calculates log determinant of a square matrix or batches of square matrices. + +It returns ``-inf`` if the input has a determinant of zero, and ``NaN`` if it has +a negative determinant. + +.. note:: + Backward through :meth:`logdet` internally uses SVD results when :attr:`input` + is not invertible. In this case, double backward through :meth:`logdet` will + be unstable in when :attr:`input` doesn't have distinct singular values. See + :func:`torch.linalg.svd` for details. + +.. seealso:: + + :func:`torch.linalg.slogdet` computes the sign (resp. angle) and natural logarithm of the + absolute value of the determinant of real-valued (resp. 
complex) square matrices. + +Arguments: + input (Tensor): the input tensor of size ``(*, n, n)`` where ``*`` is zero or more + batch dimensions. + +Example:: + + >>> A = torch.randn(3, 3) + >>> torch.det(A) + tensor(0.2611) + >>> torch.logdet(A) + tensor(-1.3430) + >>> A + tensor([[[ 0.9254, -0.6213], + [-0.5787, 1.6843]], + + [[ 0.3242, -0.9665], + [ 0.4539, -0.0887]], + + [[ 1.1336, -0.4025], + [-0.7089, 0.9032]]]) + >>> A.det() + tensor([1.1990, 0.4099, 0.7386]) + >>> A.det().log() + tensor([ 0.1815, -0.8917, -0.3031]) +""", +) + +add_docstr( + torch.slogdet, + r""" +slogdet(input) -> (Tensor, Tensor) + +Alias for :func:`torch.linalg.slogdet` +""", +) + +add_docstr( + torch.pinverse, + r""" +pinverse(input, rcond=1e-15) -> Tensor + +Alias for :func:`torch.linalg.pinv` +""", +) + +add_docstr( + torch.hann_window, + """ +hann_window(window_length, periodic=True, *, dtype=None, \ +layout=torch.strided, device=None, requires_grad=False) -> Tensor +""" + + r""" +Hann window function. + +.. math:: + w[n] = \frac{1}{2}\ \left[1 - \cos \left( \frac{2 \pi n}{N - 1} \right)\right] = + \sin^2 \left( \frac{\pi n}{N - 1} \right), + +where :math:`N` is the full window size. + +The input :attr:`window_length` is a positive integer controlling the +returned window size. :attr:`periodic` flag determines whether the returned +window trims off the last duplicate value from the symmetric window and is +ready to be used as a periodic window with functions like +:meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in +above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have +``torch.hann_window(L, periodic=True)`` equal to +``torch.hann_window(L + 1, periodic=False)[:-1])``. + +.. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. +""" + + r""" +Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. If False, return a symmetric window. + +Keyword args: + {dtype} Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + {device} + {requires_grad} + +Returns: + Tensor: A 1-D tensor of size :math:`(\text{{window\_length}},)` containing the window + +""".format( + **factory_common_args + ), +) + + +add_docstr( + torch.hamming_window, + """ +hamming_window(window_length, periodic=True, alpha=0.54, beta=0.46, *, dtype=None, \ +layout=torch.strided, device=None, requires_grad=False) -> Tensor +""" + + r""" +Hamming window function. + +.. math:: + w[n] = \alpha - \beta\ \cos \left( \frac{2 \pi n}{N - 1} \right), + +where :math:`N` is the full window size. + +The input :attr:`window_length` is a positive integer controlling the +returned window size. :attr:`periodic` flag determines whether the returned +window trims off the last duplicate value from the symmetric window and is +ready to be used as a periodic window with functions like +:meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in +above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have +``torch.hamming_window(L, periodic=True)`` equal to +``torch.hamming_window(L + 1, periodic=False)[:-1])``. + +.. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. + +.. note:: + This is a generalized version of :meth:`torch.hann_window`. 
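+
+For instance, the ``periodic``/symmetric relationship described above can be
+checked numerically (a small illustrative check, using default settings)::
+
+    >>> torch.allclose(torch.hamming_window(10, periodic=True),
+    ...                torch.hamming_window(11, periodic=False)[:-1])
+    True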
+""" + + r""" +Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. If False, return a symmetric window. + alpha (float, optional): The coefficient :math:`\alpha` in the equation above + beta (float, optional): The coefficient :math:`\beta` in the equation above + +Keyword args: + {dtype} Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + {device} + {requires_grad} + +Returns: + Tensor: A 1-D tensor of size :math:`(\text{{window\_length}},)` containing the window. + +""".format( + **factory_common_args + ), +) + + +add_docstr( + torch.bartlett_window, + """ +bartlett_window(window_length, periodic=True, *, dtype=None, \ +layout=torch.strided, device=None, requires_grad=False) -> Tensor +""" + + r""" +Bartlett window function. + +.. math:: + w[n] = 1 - \left| \frac{2n}{N-1} - 1 \right| = \begin{cases} + \frac{2n}{N - 1} & \text{if } 0 \leq n \leq \frac{N - 1}{2} \\ + 2 - \frac{2n}{N - 1} & \text{if } \frac{N - 1}{2} < n < N \\ + \end{cases}, + +where :math:`N` is the full window size. + +The input :attr:`window_length` is a positive integer controlling the +returned window size. :attr:`periodic` flag determines whether the returned +window trims off the last duplicate value from the symmetric window and is +ready to be used as a periodic window with functions like +:meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in +above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have +``torch.bartlett_window(L, periodic=True)`` equal to +``torch.bartlett_window(L + 1, periodic=False)[:-1])``. + +.. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. +""" + + r""" +Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. If False, return a symmetric window. + +Keyword args: + {dtype} Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + {device} + {requires_grad} + +Returns: + Tensor: A 1-D tensor of size :math:`(\text{{window\_length}},)` containing the window + +""".format( + **factory_common_args + ), +) + + +add_docstr( + torch.blackman_window, + """ +blackman_window(window_length, periodic=True, *, dtype=None, \ +layout=torch.strided, device=None, requires_grad=False) -> Tensor +""" + + r""" +Blackman window function. + +.. math:: + w[n] = 0.42 - 0.5 \cos \left( \frac{2 \pi n}{N - 1} \right) + 0.08 \cos \left( \frac{4 \pi n}{N - 1} \right) + +where :math:`N` is the full window size. + +The input :attr:`window_length` is a positive integer controlling the +returned window size. :attr:`periodic` flag determines whether the returned +window trims off the last duplicate value from the symmetric window and is +ready to be used as a periodic window with functions like +:meth:`torch.stft`. Therefore, if :attr:`periodic` is true, the :math:`N` in +above formula is in fact :math:`\text{window\_length} + 1`. Also, we always have +``torch.blackman_window(L, periodic=True)`` equal to +``torch.blackman_window(L + 1, periodic=False)[:-1])``. + +.. note:: + If :attr:`window_length` :math:`=1`, the returned window contains a single value 1. 
+""" + + r""" +Arguments: + window_length (int): the size of returned window + periodic (bool, optional): If True, returns a window to be used as periodic + function. If False, return a symmetric window. + +Keyword args: + {dtype} Only floating point types are supported. + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + {device} + {requires_grad} + +Returns: + Tensor: A 1-D tensor of size :math:`(\text{{window\_length}},)` containing the window + +""".format( + **factory_common_args + ), +) + + +add_docstr( + torch.kaiser_window, + """ +kaiser_window(window_length, periodic=True, beta=12.0, *, dtype=None, \ +layout=torch.strided, device=None, requires_grad=False) -> Tensor +""" + + r""" +Computes the Kaiser window with window length :attr:`window_length` and shape parameter :attr:`beta`. + +Let I_0 be the zeroth order modified Bessel function of the first kind (see :func:`torch.i0`) and +``N = L - 1`` if :attr:`periodic` is False and ``L`` if :attr:`periodic` is True, +where ``L`` is the :attr:`window_length`. This function computes: + +.. math:: + out_i = I_0 \left( \beta \sqrt{1 - \left( {\frac{i - N/2}{N/2}} \right) ^2 } \right) / I_0( \beta ) + +Calling ``torch.kaiser_window(L, B, periodic=True)`` is equivalent to calling +``torch.kaiser_window(L + 1, B, periodic=False)[:-1])``. +The :attr:`periodic` argument is intended as a helpful shorthand +to produce a periodic window as input to functions like :func:`torch.stft`. + +.. note:: + If :attr:`window_length` is one, then the returned window is a single element tensor containing a one. + +""" + + r""" +Args: + window_length (int): length of the window. + periodic (bool, optional): If True, returns a periodic window suitable for use in spectral analysis. + If False, returns a symmetric window suitable for use in filter design. + beta (float, optional): shape parameter for the window. + +Keyword args: + {dtype} + layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only + ``torch.strided`` (dense layout) is supported. + {device} + {requires_grad} + +""".format( + **factory_common_args + ), +) + + +add_docstr( + torch.vander, + """ +vander(x, N=None, increasing=False) -> Tensor +""" + + r""" +Generates a Vandermonde matrix. + +The columns of the output matrix are elementwise powers of the input vector :math:`x^{{(N-1)}}, x^{{(N-2)}}, ..., x^0`. +If increasing is True, the order of the columns is reversed :math:`x^0, x^1, ..., x^{{(N-1)}}`. Such a +matrix with a geometric progression in each row is named for Alexandre-Theophile Vandermonde. + +Arguments: + x (Tensor): 1-D input tensor. + N (int, optional): Number of columns in the output. If N is not specified, + a square array is returned :math:`(N = len(x))`. + increasing (bool, optional): Order of the powers of the columns. If True, + the powers increase from left to right, if False (the default) they are reversed. + +Returns: + Tensor: Vandermonde matrix. If increasing is False, the first column is :math:`x^{{(N-1)}}`, + the second :math:`x^{{(N-2)}}` and so forth. If increasing is True, the columns + are :math:`x^0, x^1, ..., x^{{(N-1)}}`. 
+ +Example:: + + >>> x = torch.tensor([1, 2, 3, 5]) + >>> torch.vander(x) + tensor([[ 1, 1, 1, 1], + [ 8, 4, 2, 1], + [ 27, 9, 3, 1], + [125, 25, 5, 1]]) + >>> torch.vander(x, N=3) + tensor([[ 1, 1, 1], + [ 4, 2, 1], + [ 9, 3, 1], + [25, 5, 1]]) + >>> torch.vander(x, N=3, increasing=True) + tensor([[ 1, 1, 1], + [ 1, 2, 4], + [ 1, 3, 9], + [ 1, 5, 25]]) + +""".format( + **factory_common_args + ), +) + + +add_docstr( + torch.unbind, + r""" +unbind(input, dim=0) -> seq + +Removes a tensor dimension. + +Returns a tuple of all slices along a given dimension, already without it. + +Arguments: + input (Tensor): the tensor to unbind + dim (int): dimension to remove + +Example:: + + >>> torch.unbind(torch.tensor([[1, 2, 3], + >>> [4, 5, 6], + >>> [7, 8, 9]])) + (tensor([1, 2, 3]), tensor([4, 5, 6]), tensor([7, 8, 9])) +""", +) + + +add_docstr( + torch.combinations, + r""" +combinations(input, r=2, with_replacement=False) -> seq + +Compute combinations of length :math:`r` of the given tensor. The behavior is similar to +python's `itertools.combinations` when `with_replacement` is set to `False`, and +`itertools.combinations_with_replacement` when `with_replacement` is set to `True`. + +Arguments: + input (Tensor): 1D vector. + r (int, optional): number of elements to combine + with_replacement (bool, optional): whether to allow duplication in combination + +Returns: + Tensor: A tensor equivalent to converting all the input tensors into lists, do + `itertools.combinations` or `itertools.combinations_with_replacement` on these + lists, and finally convert the resulting list into tensor. + +Example:: + + >>> a = [1, 2, 3] + >>> list(itertools.combinations(a, r=2)) + [(1, 2), (1, 3), (2, 3)] + >>> list(itertools.combinations(a, r=3)) + [(1, 2, 3)] + >>> list(itertools.combinations_with_replacement(a, r=2)) + [(1, 1), (1, 2), (1, 3), (2, 2), (2, 3), (3, 3)] + >>> tensor_a = torch.tensor(a) + >>> torch.combinations(tensor_a) + tensor([[1, 2], + [1, 3], + [2, 3]]) + >>> torch.combinations(tensor_a, r=3) + tensor([[1, 2, 3]]) + >>> torch.combinations(tensor_a, with_replacement=True) + tensor([[1, 1], + [1, 2], + [1, 3], + [2, 2], + [2, 3], + [3, 3]]) + +""", +) + +add_docstr( + torch.trapezoid, + r""" +trapezoid(y, x=None, *, dx=None, dim=-1) -> Tensor + +Computes the `trapezoidal rule `_ along +:attr:`dim`. By default the spacing between elements is assumed to be 1, but +:attr:`dx` can be used to specify a different constant spacing, and :attr:`x` can be +used to specify arbitrary spacing along :attr:`dim`. + + +Assuming :attr:`y` is a one-dimensional tensor with elements :math:`{y_0, y_1, ..., y_n}`, +the default computation is + +.. math:: + \begin{aligned} + \sum_{i = 1}^{n-1} \frac{1}{2} (y_i + y_{i-1}) + \end{aligned} + +When :attr:`dx` is specified the computation becomes + +.. math:: + \begin{aligned} + \sum_{i = 1}^{n-1} \frac{\Delta x}{2} (y_i + y_{i-1}) + \end{aligned} + +effectively multiplying the result by :attr:`dx`. When :attr:`x` is specified, +assuming :attr:`x` is also a one-dimensional tensor with +elements :math:`{x_0, x_1, ..., x_n}`, the computation becomes + +.. math:: + \begin{aligned} + \sum_{i = 1}^{n-1} \frac{(x_i - x_{i-1})}{2} (y_i + y_{i-1}) + \end{aligned} + +When :attr:`x` and :attr:`y` have the same size, the computation is as described above and no broadcasting is needed. +The broadcasting behavior of this function is as follows when their sizes are different. 
For both :attr:`x` +and :attr:`y`, the function computes the difference between consecutive elements along +dimension :attr:`dim`. This effectively creates two tensors, `x_diff` and `y_diff`, that have +the same shape as the original tensors except their lengths along the dimension :attr:`dim` is reduced by 1. +After that, those two tensors are broadcast together to compute final output as part of the trapezoidal rule. +See the examples below for details. + +.. note:: + The trapezoidal rule is a technique for approximating the definite integral of a function + by averaging its left and right Riemann sums. The approximation becomes more accurate as + the resolution of the partition increases. + +Arguments: + y (Tensor): Values to use when computing the trapezoidal rule. + x (Tensor): If specified, defines spacing between values as specified above. + +Keyword arguments: + dx (float): constant spacing between values. If neither :attr:`x` or :attr:`dx` + are specified then this defaults to 1. Effectively multiplies the result by its value. + dim (int): The dimension along which to compute the trapezoidal rule. + The last (inner-most) dimension by default. + +Examples:: + + >>> # Computes the trapezoidal rule in 1D, spacing is implicitly 1 + >>> y = torch.tensor([1, 5, 10]) + >>> torch.trapezoid(y) + tensor(10.5) + + >>> # Computes the same trapezoidal rule directly to verify + >>> (1 + 10 + 10) / 2 + 10.5 + + >>> # Computes the trapezoidal rule in 1D with constant spacing of 2 + >>> # NOTE: the result is the same as before, but multiplied by 2 + >>> torch.trapezoid(y, dx=2) + 21.0 + + >>> # Computes the trapezoidal rule in 1D with arbitrary spacing + >>> x = torch.tensor([1, 3, 6]) + >>> torch.trapezoid(y, x) + 28.5 + + >>> # Computes the same trapezoidal rule directly to verify + >>> ((3 - 1) * (1 + 5) + (6 - 3) * (5 + 10)) / 2 + 28.5 + + >>> # Computes the trapezoidal rule for each row of a 3x3 matrix + >>> y = torch.arange(9).reshape(3, 3) + tensor([[0, 1, 2], + [3, 4, 5], + [6, 7, 8]]) + >>> torch.trapezoid(y) + tensor([ 2., 8., 14.]) + + >>> # Computes the trapezoidal rule for each column of the matrix + >>> torch.trapezoid(y, dim=0) + tensor([ 6., 8., 10.]) + + >>> # Computes the trapezoidal rule for each row of a 3x3 ones matrix + >>> # with the same arbitrary spacing + >>> y = torch.ones(3, 3) + >>> x = torch.tensor([1, 3, 6]) + >>> torch.trapezoid(y, x) + array([5., 5., 5.]) + + >>> # Computes the trapezoidal rule for each row of a 3x3 ones matrix + >>> # with different arbitrary spacing per row + >>> y = torch.ones(3, 3) + >>> x = torch.tensor([[1, 2, 3], [1, 3, 5], [1, 4, 7]]) + >>> torch.trapezoid(y, x) + array([2., 4., 6.]) +""", +) + +add_docstr( + torch.trapz, + r""" +trapz(y, x, *, dim=-1) -> Tensor + +Alias for :func:`torch.trapezoid`. +""", +) + +add_docstr( + torch.cumulative_trapezoid, + r""" +cumulative_trapezoid(y, x=None, *, dx=None, dim=-1) -> Tensor + +Cumulatively computes the `trapezoidal rule `_ +along :attr:`dim`. By default the spacing between elements is assumed to be 1, but +:attr:`dx` can be used to specify a different constant spacing, and :attr:`x` can be +used to specify arbitrary spacing along :attr:`dim`. + +For more details, please read :func:`torch.trapezoid`. The difference between :func:`torch.trapezoid` +and this function is that, :func:`torch.trapezoid` returns a value for each integration, +where as this function returns a cumulative value for every spacing within the integration. 
This +is analogous to how `.sum` returns a value and `.cumsum` returns a cumulative sum. + +Arguments: + y (Tensor): Values to use when computing the trapezoidal rule. + x (Tensor): If specified, defines spacing between values as specified above. + +Keyword arguments: + dx (float): constant spacing between values. If neither :attr:`x` or :attr:`dx` + are specified then this defaults to 1. Effectively multiplies the result by its value. + dim (int): The dimension along which to compute the trapezoidal rule. + The last (inner-most) dimension by default. + +Examples:: + + >>> # Cumulatively computes the trapezoidal rule in 1D, spacing is implicitly 1. + >>> y = torch.tensor([1, 5, 10]) + >>> torch.cumulative_trapezoid(y) + tensor([3., 10.5]) + + >>> # Computes the same trapezoidal rule directly up to each element to verify + >>> (1 + 5) / 2 + 3.0 + >>> (1 + 10 + 10) / 2 + 10.5 + + >>> # Cumulatively computes the trapezoidal rule in 1D with constant spacing of 2 + >>> # NOTE: the result is the same as before, but multiplied by 2 + >>> torch.cumulative_trapezoid(y, dx=2) + tensor([6., 21.]) + + >>> # Cumulatively computes the trapezoidal rule in 1D with arbitrary spacing + >>> x = torch.tensor([1, 3, 6]) + >>> torch.cumulative_trapezoid(y, x) + tensor([6., 28.5]) + + >>> # Computes the same trapezoidal rule directly up to each element to verify + >>> ((3 - 1) * (1 + 5)) / 2 + 6.0 + >>> ((3 - 1) * (1 + 5) + (6 - 3) * (5 + 10)) / 2 + 28.5 + + >>> # Cumulatively computes the trapezoidal rule for each row of a 3x3 matrix + >>> y = torch.arange(9).reshape(3, 3) + tensor([[0, 1, 2], + [3, 4, 5], + [6, 7, 8]]) + >>> torch.cumulative_trapezoid(y) + tensor([[ 0.5, 2.], + [ 3.5, 8.], + [ 6.5, 14.]]) + + >>> # Cumulatively computes the trapezoidal rule for each column of the matrix + >>> torch.cumulative_trapezoid(y, dim=0) + tensor([[ 1.5, 2.5, 3.5], + [ 6.0, 8.0, 10.0]]) + + >>> # Cumulatively computes the trapezoidal rule for each row of a 3x3 ones matrix + >>> # with the same arbitrary spacing + >>> y = torch.ones(3, 3) + >>> x = torch.tensor([1, 3, 6]) + >>> torch.cumulative_trapezoid(y, x) + tensor([[2., 5.], + [2., 5.], + [2., 5.]]) + + >>> # Cumulatively computes the trapezoidal rule for each row of a 3x3 ones matrix + >>> # with different arbitrary spacing per row + >>> y = torch.ones(3, 3) + >>> x = torch.tensor([[1, 2, 3], [1, 3, 5], [1, 4, 7]]) + >>> torch.cumulative_trapezoid(y, x) + tensor([[1., 2.], + [2., 4.], + [3., 6.]]) +""", +) + +add_docstr( + torch.repeat_interleave, + r""" +repeat_interleave(input, repeats, dim=None, *, output_size=None) -> Tensor + +Repeat elements of a tensor. + +.. warning:: + + This is different from :meth:`torch.Tensor.repeat` but similar to ``numpy.repeat``. + +Args: + {input} + repeats (Tensor or int): The number of repetitions for each element. + repeats is broadcasted to fit the shape of the given axis. + dim (int, optional): The dimension along which to repeat values. + By default, use the flattened input array, and return a flat output + array. + +Keyword args: + output_size (int, optional): Total output size for the given axis + ( e.g. sum of repeats). If given, it will avoid stream synchronization + needed to calculate output shape of the tensor. + +Returns: + Tensor: Repeated tensor which has the same shape as input, except along the given axis. 
+ +Example:: + + >>> x = torch.tensor([1, 2, 3]) + >>> x.repeat_interleave(2) + tensor([1, 1, 2, 2, 3, 3]) + >>> y = torch.tensor([[1, 2], [3, 4]]) + >>> torch.repeat_interleave(y, 2) + tensor([1, 1, 2, 2, 3, 3, 4, 4]) + >>> torch.repeat_interleave(y, 3, dim=1) + tensor([[1, 1, 1, 2, 2, 2], + [3, 3, 3, 4, 4, 4]]) + >>> torch.repeat_interleave(y, torch.tensor([1, 2]), dim=0) + tensor([[1, 2], + [3, 4], + [3, 4]]) + >>> torch.repeat_interleave(y, torch.tensor([1, 2]), dim=0, output_size=3) + tensor([[1, 2], + [3, 4], + [3, 4]]) + +If the `repeats` is `tensor([n1, n2, n3, ...])`, then the output will be +`tensor([0, 0, ..., 1, 1, ..., 2, 2, ..., ...])` where `0` appears `n1` times, +`1` appears `n2` times, `2` appears `n3` times, etc. + +.. function:: repeat_interleave(repeats, *) -> Tensor + :noindex: + +Repeats 0 repeats[0] times, 1 repeats[1] times, 2 repeats[2] times, etc. + +Args: + repeats (Tensor): The number of repetitions for each element. + +Returns: + Tensor: Repeated tensor of size `sum(repeats)`. + +Example:: + + >>> torch.repeat_interleave(torch.tensor([1, 2, 3])) + tensor([0, 1, 1, 2, 2, 2]) + +""".format( + **common_args + ), +) + +add_docstr( + torch.tile, + r""" +tile(input, dims) -> Tensor + +Constructs a tensor by repeating the elements of :attr:`input`. +The :attr:`dims` argument specifies the number of repetitions +in each dimension. + +If :attr:`dims` specifies fewer dimensions than :attr:`input` has, then +ones are prepended to :attr:`dims` until all dimensions are specified. +For example, if :attr:`input` has shape (8, 6, 4, 2) and :attr:`dims` +is (2, 2), then :attr:`dims` is treated as (1, 1, 2, 2). + +Analogously, if :attr:`input` has fewer dimensions than :attr:`dims` +specifies, then :attr:`input` is treated as if it were unsqueezed at +dimension zero until it has as many dimensions as :attr:`dims` specifies. +For example, if :attr:`input` has shape (4, 2) and :attr:`dims` +is (3, 3, 2, 2), then :attr:`input` is treated as if it had the +shape (1, 1, 4, 2). + +.. note:: + + This function is similar to NumPy's tile function. + +Args: + input (Tensor): the tensor whose elements to repeat. + dims (tuple): the number of repetitions per dimension. + +Example:: + + >>> x = torch.tensor([1, 2, 3]) + >>> x.tile((2,)) + tensor([1, 2, 3, 1, 2, 3]) + >>> y = torch.tensor([[1, 2], [3, 4]]) + >>> torch.tile(y, (2, 2)) + tensor([[1, 2, 1, 2], + [3, 4, 3, 4], + [1, 2, 1, 2], + [3, 4, 3, 4]]) +""", +) + +add_docstr( + torch.quantize_per_tensor, + r""" +quantize_per_tensor(input, scale, zero_point, dtype) -> Tensor + +Converts a float tensor to a quantized tensor with given scale and zero point. + +Arguments: + input (Tensor): float tensor or list of tensors to quantize + scale (float or Tensor): scale to apply in quantization formula + zero_point (int or Tensor): offset in integer value that maps to float zero + dtype (:class:`torch.dtype`): the desired data type of returned tensor. + Has to be one of the quantized dtypes: ``torch.quint8``, ``torch.qint8``, ``torch.qint32`` + +Returns: + Tensor: A newly quantized tensor or list of quantized tensors. 
+ +Example:: + + >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10, torch.quint8) + tensor([-1., 0., 1., 2.], size=(4,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.1, zero_point=10) + >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10, torch.quint8).int_repr() + tensor([ 0, 10, 20, 30], dtype=torch.uint8) + >>> torch.quantize_per_tensor([torch.tensor([-1.0, 0.0]), torch.tensor([-2.0, 2.0])], + >>> torch.tensor([0.1, 0.2]), torch.tensor([10, 20]), torch.quint8) + (tensor([-1., 0.], size=(2,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.1, zero_point=10), + tensor([-2., 2.], size=(2,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.2, zero_point=20)) + >>> torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), torch.tensor(0.1), torch.tensor(10), torch.quint8) + tensor([-1., 0., 1., 2.], size=(4,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.10, zero_point=10) +""", +) + +add_docstr( + torch.quantize_per_tensor_dynamic, + r""" +quantize_per_tensor_dynamic(input, dtype, reduce_range) -> Tensor + +Converts a float tensor to a quantized tensor with scale and zero_point calculated +dynamically based on the input. + +Arguments: + input (Tensor): float tensor or list of tensors to quantize + dtype (:class:`torch.dtype`): the desired data type of returned tensor. + Has to be one of the quantized dtypes: ``torch.quint8``, ``torch.qint8`` + reduce_range (bool): a flag to indicate whether to reduce the range of quantized + data by 1 bit, it's required to avoid instruction overflow for some hardwares + +Returns: + Tensor: A newly (dynamically) quantized tensor + +Example:: + + >>> t = torch.quantize_per_tensor_dynamic(torch.tensor([-1.0, 0.0, 1.0, 2.0]), torch.quint8, False) + >>> print(t) + tensor([-1., 0., 1., 2.], size=(4,), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.011764705882352941, + zero_point=85) + >>> t.int_repr() + tensor([ 0, 85, 170, 255], dtype=torch.uint8) +""", +) + +add_docstr( + torch.quantize_per_channel, + r""" +quantize_per_channel(input, scales, zero_points, axis, dtype) -> Tensor + +Converts a float tensor to a per-channel quantized tensor with given scales and zero points. + +Arguments: + input (Tensor): float tensor to quantize + scales (Tensor): float 1D tensor of scales to use, size should match ``input.size(axis)`` + zero_points (int): integer 1D tensor of offset to use, size should match ``input.size(axis)`` + axis (int): dimension on which apply per-channel quantization + dtype (:class:`torch.dtype`): the desired data type of returned tensor. 
+ Has to be one of the quantized dtypes: ``torch.quint8``, ``torch.qint8``, ``torch.qint32`` + +Returns: + Tensor: A newly quantized tensor + +Example:: + + >>> x = torch.tensor([[-1.0, 0.0], [1.0, 2.0]]) + >>> torch.quantize_per_channel(x, torch.tensor([0.1, 0.01]), torch.tensor([10, 0]), 0, torch.quint8) + tensor([[-1., 0.], + [ 1., 2.]], size=(2, 2), dtype=torch.quint8, + quantization_scheme=torch.per_channel_affine, + scale=tensor([0.1000, 0.0100], dtype=torch.float64), + zero_point=tensor([10, 0]), axis=0) + >>> torch.quantize_per_channel(x, torch.tensor([0.1, 0.01]), torch.tensor([10, 0]), 0, torch.quint8).int_repr() + tensor([[ 0, 10], + [100, 200]], dtype=torch.uint8) +""", +) + + +add_docstr( + torch.quantized_batch_norm, + r""" +quantized_batch_norm(input, weight=None, bias=None, mean, var, eps, output_scale, output_zero_point) -> Tensor + +Applies batch normalization on a 4D (NCHW) quantized tensor. + +.. math:: + + y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta + +Arguments: + input (Tensor): quantized tensor + weight (Tensor): float tensor that corresponds to the gamma, size C + bias (Tensor): float tensor that corresponds to the beta, size C + mean (Tensor): float mean value in batch normalization, size C + var (Tensor): float tensor for variance, size C + eps (float): a value added to the denominator for numerical stability. + output_scale (float): output quantized tensor scale + output_zero_point (int): output quantized tensor zero_point + +Returns: + Tensor: A quantized tensor with batch normalization applied. + +Example:: + + >>> qx = torch.quantize_per_tensor(torch.rand(2, 2, 2, 2), 1.5, 3, torch.quint8) + >>> torch.quantized_batch_norm(qx, torch.ones(2), torch.zeros(2), torch.rand(2), torch.rand(2), 0.00001, 0.2, 2) + tensor([[[[-0.2000, -0.2000], + [ 1.6000, -0.2000]], + + [[-0.4000, -0.4000], + [-0.4000, 0.6000]]], + + + [[[-0.2000, -0.2000], + [-0.2000, -0.2000]], + + [[ 0.6000, -0.4000], + [ 0.6000, -0.4000]]]], size=(2, 2, 2, 2), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=0.2, zero_point=2) +""", +) + + +add_docstr( + torch.quantized_max_pool1d, + r""" +quantized_max_pool1d(input, kernel_size, stride=[], padding=0, dilation=1, ceil_mode=False) -> Tensor + +Applies a 1D max pooling over an input quantized tensor composed of several input planes. + +Arguments: + input (Tensor): quantized tensor + kernel_size (list of int): the size of the sliding window + stride (``list of int``, optional): the stride of the sliding window + padding (``list of int``, optional): padding to be added on both sides, must be >= 0 and <= kernel_size / 2 + dilation (``list of int``, optional): The stride between elements within a sliding window, must be > 0. Default 1 + ceil_mode (bool, optional): If True, will use ceil instead of floor to compute the output shape. + Defaults to False. + + +Returns: + Tensor: A quantized tensor with max_pool1d applied. + +Example:: + + >>> qx = torch.quantize_per_tensor(torch.rand(2, 2), 1.5, 3, torch.quint8) + >>> torch.quantized_max_pool1d(qx, [2]) + tensor([[0.0000], + [1.5000]], size=(2, 1), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=1.5, zero_point=3) +""", +) + + +add_docstr( + torch.quantized_max_pool2d, + r""" +quantized_max_pool2d(input, kernel_size, stride=[], padding=0, dilation=1, ceil_mode=False) -> Tensor + +Applies a 2D max pooling over an input quantized tensor composed of several input planes. 
+ +Arguments: + input (Tensor): quantized tensor + kernel_size (``list of int``): the size of the sliding window + stride (``list of int``, optional): the stride of the sliding window + padding (``list of int``, optional): padding to be added on both sides, must be >= 0 and <= kernel_size / 2 + dilation (``list of int``, optional): The stride between elements within a sliding window, must be > 0. Default 1 + ceil_mode (bool, optional): If True, will use ceil instead of floor to compute the output shape. + Defaults to False. + + +Returns: + Tensor: A quantized tensor with max_pool2d applied. + +Example:: + + >>> qx = torch.quantize_per_tensor(torch.rand(2, 2, 2, 2), 1.5, 3, torch.quint8) + >>> torch.quantized_max_pool2d(qx, [2,2]) + tensor([[[[1.5000]], + + [[1.5000]]], + + + [[[0.0000]], + + [[0.0000]]]], size=(2, 2, 1, 1), dtype=torch.quint8, + quantization_scheme=torch.per_tensor_affine, scale=1.5, zero_point=3) +""", +) + + +add_docstr( + torch.Generator, + r""" +Generator(device='cpu') -> Generator + +Creates and returns a generator object that manages the state of the algorithm which +produces pseudo random numbers. Used as a keyword argument in many :ref:`inplace-random-sampling` +functions. + +Arguments: + device (:class:`torch.device`, optional): the desired device for the generator. + +Returns: + Generator: An torch.Generator object. + +Example:: + + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA) + >>> g_cpu = torch.Generator() + >>> g_cuda = torch.Generator(device='cuda') +""", +) + + +add_docstr( + torch.Generator.set_state, + r""" +Generator.set_state(new_state) -> void + +Sets the Generator state. + +Arguments: + new_state (torch.ByteTensor): The desired state. + +Example:: + + >>> g_cpu = torch.Generator() + >>> g_cpu_other = torch.Generator() + >>> g_cpu.set_state(g_cpu_other.get_state()) +""", +) + + +add_docstr( + torch.Generator.get_state, + r""" +Generator.get_state() -> Tensor + +Returns the Generator state as a ``torch.ByteTensor``. + +Returns: + Tensor: A ``torch.ByteTensor`` which contains all the necessary bits + to restore a Generator to a specific point in time. + +Example:: + + >>> g_cpu = torch.Generator() + >>> g_cpu.get_state() +""", +) + + +add_docstr( + torch.Generator.manual_seed, + r""" +Generator.manual_seed(seed) -> Generator + +Sets the seed for generating random numbers. Returns a `torch.Generator` object. Any 32-bit integer is a valid seed. + +Arguments: + seed (int): The desired seed. Value must be within the inclusive range + `[-0x8000_0000_0000_0000, 0xffff_ffff_ffff_ffff]`. Otherwise, a RuntimeError + is raised. Negative inputs are remapped to positive values with the formula + `0xffff_ffff_ffff_ffff + seed`. + +Returns: + Generator: An torch.Generator object. + +Example:: + + >>> g_cpu = torch.Generator() + >>> g_cpu.manual_seed(2147483647) +""", +) + + +add_docstr( + torch.Generator.initial_seed, + r""" +Generator.initial_seed() -> int + +Returns the initial seed for generating random numbers. + +Example:: + + >>> g_cpu = torch.Generator() + >>> g_cpu.initial_seed() + 2147483647 +""", +) + + +add_docstr( + torch.Generator.seed, + r""" +Generator.seed() -> int + +Gets a non-deterministic random number from std::random_device or the current +time and uses it to seed a Generator. + +Example:: + + >>> g_cpu = torch.Generator() + >>> g_cpu.seed() + 1516516984916 +""", +) + + +add_docstr( + torch.Generator.device, + r""" +Generator.device -> device + +Gets the current device of the generator. 
+ +Example:: + + >>> g_cpu = torch.Generator() + >>> g_cpu.device + device(type='cpu') +""", +) + +add_docstr( + torch._assert_async, + r""" +_assert_async(tensor) -> void + +Asynchronously assert that the contents of tensor are nonzero. For CPU tensors, +this is equivalent to ``assert tensor`` or ``assert tensor.is_nonzero()``; for +CUDA tensors, we DO NOT synchronize and you may only find out the assertion +failed at a later CUDA kernel launch. Asynchronous assertion can be helpful for +testing invariants in CUDA tensors without giving up performance. This function +is NOT intended to be used for regular error checking, as it will trash your CUDA +context if the assert fails (forcing you to restart your PyTorch process.) + +Args: + tensor (Tensor): a one element tensor to test to see if it is nonzero. Zero + elements (including False for boolean tensors) cause an assertion failure + to be raised. +""", +) + +add_docstr( + torch.searchsorted, + r""" +searchsorted(sorted_sequence, values, *, out_int32=False, right=False, side=None, out=None, sorter=None) -> Tensor + +Find the indices from the *innermost* dimension of :attr:`sorted_sequence` such that, if the +corresponding values in :attr:`values` were inserted before the indices, when sorted, the order +of the corresponding *innermost* dimension within :attr:`sorted_sequence` would be preserved. +Return a new tensor with the same size as :attr:`values`. More formally, +the returned index satisfies the following rules: + +.. list-table:: + :widths: 12 10 78 + :header-rows: 1 + + * - :attr:`sorted_sequence` + - :attr:`right` + - *returned index satisfies* + * - 1-D + - False + - ``sorted_sequence[i-1] < values[m][n]...[l][x] <= sorted_sequence[i]`` + * - 1-D + - True + - ``sorted_sequence[i-1] <= values[m][n]...[l][x] < sorted_sequence[i]`` + * - N-D + - False + - ``sorted_sequence[m][n]...[l][i-1] < values[m][n]...[l][x] <= sorted_sequence[m][n]...[l][i]`` + * - N-D + - True + - ``sorted_sequence[m][n]...[l][i-1] <= values[m][n]...[l][x] < sorted_sequence[m][n]...[l][i]`` + +Args: + sorted_sequence (Tensor): N-D or 1-D tensor, containing monotonically increasing sequence on the *innermost* + dimension unless :attr:`sorter` is provided, in which case the sequence does not + need to be sorted + values (Tensor or Scalar): N-D tensor or a Scalar containing the search value(s). + +Keyword args: + out_int32 (bool, optional): indicate the output data type. torch.int32 if True, torch.int64 otherwise. + Default value is False, i.e. default output data type is torch.int64. + right (bool, optional): if False, return the first suitable location that is found. If True, return the + last such index. If no suitable index found, return 0 for non-numerical value + (eg. nan, inf) or the size of *innermost* dimension within :attr:`sorted_sequence` + (one pass the last index of the *innermost* dimension). In other words, if False, + gets the lower bound index for each value in :attr:`values` on the corresponding + *innermost* dimension of the :attr:`sorted_sequence`. If True, gets the upper + bound index instead. Default value is False. :attr:`side` does the same and is + preferred. It will error if :attr:`side` is set to "left" while this is True. + side (str, optional): the same as :attr:`right` but preferred. "left" corresponds to False for :attr:`right` + and "right" corresponds to True for :attr:`right`. It will error if this is set to + "left" while :attr:`right` is True. Default value is None. 
+ out (Tensor, optional): the output tensor, must be the same size as :attr:`values` if provided. + sorter (LongTensor, optional): if provided, a tensor matching the shape of the unsorted + :attr:`sorted_sequence` containing a sequence of indices that sort it in the + ascending order on the innermost dimension + + +Example:: + + >>> sorted_sequence = torch.tensor([[1, 3, 5, 7, 9], [2, 4, 6, 8, 10]]) + >>> sorted_sequence + tensor([[ 1, 3, 5, 7, 9], + [ 2, 4, 6, 8, 10]]) + >>> values = torch.tensor([[3, 6, 9], [3, 6, 9]]) + >>> values + tensor([[3, 6, 9], + [3, 6, 9]]) + >>> torch.searchsorted(sorted_sequence, values) + tensor([[1, 3, 4], + [1, 2, 4]]) + >>> torch.searchsorted(sorted_sequence, values, side='right') + tensor([[2, 3, 5], + [1, 3, 4]]) + + >>> sorted_sequence_1d = torch.tensor([1, 3, 5, 7, 9]) + >>> sorted_sequence_1d + tensor([1, 3, 5, 7, 9]) + >>> torch.searchsorted(sorted_sequence_1d, values) + tensor([[1, 3, 4], + [1, 3, 4]]) +""", +) + +add_docstr( + torch.bucketize, + r""" +bucketize(input, boundaries, *, out_int32=False, right=False, out=None) -> Tensor + +Returns the indices of the buckets to which each value in the :attr:`input` belongs, where the +boundaries of the buckets are set by :attr:`boundaries`. Return a new tensor with the same size +as :attr:`input`. If :attr:`right` is False (default), then the left boundary is open. Note that +this behavior is opposite the behavior of +`numpy.digitize `_. +More formally, the returned index satisfies the following rules: + +.. list-table:: + :widths: 15 85 + :header-rows: 1 + + * - :attr:`right` + - *returned index satisfies* + * - False + - ``boundaries[i-1] < input[m][n]...[l][x] <= boundaries[i]`` + * - True + - ``boundaries[i-1] <= input[m][n]...[l][x] < boundaries[i]`` + +Args: + input (Tensor or Scalar): N-D tensor or a Scalar containing the search value(s). + boundaries (Tensor): 1-D tensor, must contain a strictly increasing sequence, or the return value is undefined. + +Keyword args: + out_int32 (bool, optional): indicate the output data type. torch.int32 if True, torch.int64 otherwise. + Default value is False, i.e. default output data type is torch.int64. + right (bool, optional): if False, return the first suitable location that is found. If True, return the + last such index. If no suitable index found, return 0 for non-numerical value + (eg. nan, inf) or the size of :attr:`boundaries` (one pass the last index). + In other words, if False, gets the lower bound index for each value in :attr:`input` + from :attr:`boundaries`. If True, gets the upper bound index instead. + Default value is False. + out (Tensor, optional): the output tensor, must be the same size as :attr:`input` if provided. + + +Example:: + + >>> boundaries = torch.tensor([1, 3, 5, 7, 9]) + >>> boundaries + tensor([1, 3, 5, 7, 9]) + >>> v = torch.tensor([[3, 6, 9], [3, 6, 9]]) + >>> v + tensor([[3, 6, 9], + [3, 6, 9]]) + >>> torch.bucketize(v, boundaries) + tensor([[1, 3, 4], + [1, 3, 4]]) + >>> torch.bucketize(v, boundaries, right=True) + tensor([[2, 3, 5], + [2, 3, 5]]) +""", +) + +add_docstr( + torch.view_as_real_copy, + r""" +Performs the same operation as :func:`torch.view_as_real`, but all output tensors +are freshly created instead of aliasing the input. +""", +) + +add_docstr( + torch.view_as_complex_copy, + r""" +Performs the same operation as :func:`torch.view_as_complex`, but all output tensors +are freshly created instead of aliasing the input. 
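+
+Example (a minimal sketch; the values are only illustrative, the point being
+that the result does not share memory with the input)::
+
+    >>> x = torch.zeros(2, 2)
+    >>> y = torch.view_as_complex_copy(x)
+    >>> y += 1
+    >>> x  # left unchanged, unlike with torch.view_as_complex
+    tensor([[0., 0.],
+            [0., 0.]])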
+""", +) + +add_docstr( + torch.as_strided_copy, + r""" +Performs the same operation as :func:`torch.as_strided`, but all output tensors +are freshly created instead of aliasing the input. +""", +) + +add_docstr( + torch.diagonal_copy, + r""" +Performs the same operation as :func:`torch.diagonal`, but all output tensors +are freshly created instead of aliasing the input. +""", +) + +add_docstr( + torch.expand_copy, + r""" +Performs the same operation as :func:`torch.expand`, but all output tensors +are freshly created instead of aliasing the input. +""", +) + +add_docstr( + torch.permute_copy, + r""" +Performs the same operation as :func:`torch.permute`, but all output tensors +are freshly created instead of aliasing the input. +""", +) + +add_docstr( + torch.select_copy, + r""" +Performs the same operation as :func:`torch.select`, but all output tensors +are freshly created instead of aliasing the input. +""", +) + +add_docstr( + torch.detach_copy, + r""" +Performs the same operation as :func:`torch.detach`, but all output tensors +are freshly created instead of aliasing the input. +""", +) + +add_docstr( + torch.slice_copy, + r""" +Performs the same operation as :func:`torch.slice`, but all output tensors +are freshly created instead of aliasing the input. +""", +) + +add_docstr( + torch.split_copy, + r""" +Performs the same operation as :func:`torch.split`, but all output tensors +are freshly created instead of aliasing the input. +""", +) + +add_docstr( + torch.split_with_sizes_copy, + r""" +Performs the same operation as :func:`torch.split_with_sizes`, but all output tensors +are freshly created instead of aliasing the input. +""", +) + +add_docstr( + torch.squeeze_copy, + r""" +Performs the same operation as :func:`torch.squeeze`, but all output tensors +are freshly created instead of aliasing the input. +""", +) + +add_docstr( + torch.t_copy, + r""" +Performs the same operation as :func:`torch.t`, but all output tensors +are freshly created instead of aliasing the input. +""", +) + +add_docstr( + torch.transpose_copy, + r""" +Performs the same operation as :func:`torch.transpose`, but all output tensors +are freshly created instead of aliasing the input. +""", +) + +add_docstr( + torch.unsqueeze_copy, + r""" +Performs the same operation as :func:`torch.unsqueeze`, but all output tensors +are freshly created instead of aliasing the input. +""", +) + +add_docstr( + torch.indices_copy, + r""" +Performs the same operation as :func:`torch.indices`, but all output tensors +are freshly created instead of aliasing the input. +""", +) + +add_docstr( + torch.values_copy, + r""" +Performs the same operation as :func:`torch.values`, but all output tensors +are freshly created instead of aliasing the input. +""", +) + +add_docstr( + torch.crow_indices_copy, + r""" +Performs the same operation as :func:`torch.crow_indices`, but all output tensors +are freshly created instead of aliasing the input. +""", +) + +add_docstr( + torch.col_indices_copy, + r""" +Performs the same operation as :func:`torch.col_indices`, but all output tensors +are freshly created instead of aliasing the input. +""", +) + +add_docstr( + torch.unbind_copy, + r""" +Performs the same operation as :func:`torch.unbind`, but all output tensors +are freshly created instead of aliasing the input. +""", +) + +add_docstr( + torch.view_copy, + r""" +Performs the same operation as :func:`torch.view`, but all output tensors +are freshly created instead of aliasing the input. 
+""", +) + +add_docstr( + torch.unfold_copy, + r""" +Performs the same operation as :func:`torch.unfold`, but all output tensors +are freshly created instead of aliasing the input. +""", +) + +add_docstr( + torch.alias_copy, + r""" +Performs the same operation as :func:`torch.alias`, but all output tensors +are freshly created instead of aliasing the input. +""", +) + +for unary_base_func_name in ( + "exp", + "sqrt", + "abs", + "acos", + "asin", + "atan", + "ceil", + "cos", + "cosh", + "erf", + "erfc", + "expm1", + "floor", + "log", + "log10", + "log1p", + "log2", + "neg", + "tan", + "tanh", + "sin", + "sinh", + "round", + "lgamma", + "frac", + "reciprocal", + "sigmoid", + "trunc", + "zero", +): + unary_foreach_func_name = f"_foreach_{unary_base_func_name}" + if hasattr(torch, unary_foreach_func_name): + add_docstr( + getattr(torch, unary_foreach_func_name), + rf""" +{unary_foreach_func_name}(self: List[Tensor]) -> List[Tensor] + +Apply :func:`torch.{unary_base_func_name}` to each Tensor of the input list. + """, + ) + unary_inplace_foreach_func_name = f"{unary_foreach_func_name}_" + if hasattr(torch, unary_inplace_foreach_func_name): + add_docstr( + getattr(torch, unary_inplace_foreach_func_name), + rf""" +{unary_inplace_foreach_func_name}(self: List[Tensor]) -> None + +Apply :func:`torch.{unary_base_func_name}` to each Tensor of the input list. + """, + ) diff --git a/MLPY/Lib/site-packages/torch/_utils.py b/MLPY/Lib/site-packages/torch/_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ba8f51000ed63dd93dae1fd22d1bd19d507e127b --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_utils.py @@ -0,0 +1,937 @@ +import copyreg +import functools +import sys +import traceback +import warnings +from collections import defaultdict +from typing import Any, DefaultDict, List, Optional + +import torch + + +def _type(self, dtype=None, non_blocking=False, **kwargs): + """Returns the type if `dtype` is not provided, else casts this object to + the specified type. + + If this is already of the correct type, no copy is performed and the + original object is returned. + + Args: + dtype (type or string): The desired type + non_blocking (bool): If ``True``, and the source is in pinned memory + and destination is on the GPU or vice versa, the copy is performed + asynchronously with respect to the host. Otherwise, the argument + has no effect. + **kwargs: For compatibility, may contain the key ``async`` in place of + the ``non_blocking`` argument. The ``async`` arg is deprecated. + """ + non_blocking = _get_async_or_non_blocking("type", non_blocking, kwargs) + if dtype is None: + return self.__module__ + "." + self.__class__.__name__ + + if isinstance(dtype, str): + dtype = _import_dotted_name(dtype) + if dtype == type(self): + return self + if self.is_sparse: + if not dtype.is_sparse: + raise RuntimeError("Cannot cast sparse tensor to dense tensor") + new_module_name = dtype.__module__.replace(".sparse", "") + new_values_type_name = new_module_name + "." 
+ dtype.__name__ + new_values = torch.Tensor._values(self).type(new_values_type_name, non_blocking) + new_indices_type_name = new_module_name + ".LongTensor" + new_indices = torch.Tensor._indices(self).type( + new_indices_type_name, non_blocking + ) + return dtype(new_indices, new_values, self.size()) + if dtype.is_sparse: + raise RuntimeError("Cannot cast dense tensor to sparse tensor") + return dtype(self.size()).copy_(self, non_blocking) + + +def _hpu(self, device=None, non_blocking=False, **kwargs): + """Returns a copy of this object in HPU memory. + + If this object is already in HPU memory and on the correct device, then + no copy is performed and the original object is returned. + + Args: + device (int): The destination HPU id. Defaults to the current device. + non_blocking (bool): If ``True`` and the source is in pinned memory, + the copy will be asynchronous with respect to the host. Otherwise, + the argument has no effect. + **kwargs: For compatibility, may contain the key ``async`` in place of + the ``non_blocking`` argument. + """ + non_blocking = _get_async_or_non_blocking("hpu", non_blocking, kwargs) + hpu = getattr(torch, "hpu", None) + assert hpu is not None, "HPU device module is not loaded" + if self.is_hpu: + if device is None: + device = hpu.current_device() + if self.get_device() == device: + return self + else: + if device is None: + device = -1 + with hpu.device(device): + assert not self.is_sparse, "sparse storage is not supported for HPU tensors" + untyped_storage = torch.UntypedStorage(self.size(), device=torch.device("hpu")) + untyped_storage.copy_(self, non_blocking) + return untyped_storage + + +def _cuda(self, device=None, non_blocking=False, **kwargs): + """Returns a copy of this object in CUDA memory. + + If this object is already in CUDA memory and on the correct device, then + no copy is performed and the original object is returned. + + Args: + device (int): The destination GPU id. Defaults to the current device. + non_blocking (bool): If ``True`` and the source is in pinned memory, + the copy will be asynchronous with respect to the host. Otherwise, + the argument has no effect. + **kwargs: For compatibility, may contain the key ``async`` in place of + the ``non_blocking`` argument. + """ + non_blocking = _get_async_or_non_blocking("cuda", non_blocking, kwargs) + if self.is_cuda: + if device is None: + device = torch.cuda.current_device() + if self.get_device() == device: + return self + else: + if device is None: + device = -1 + with torch.cuda.device(device): + if self.is_sparse: + new_type = getattr(torch.cuda.sparse, self.__class__.__name__) + indices = torch.Tensor._indices(self).cuda(device, non_blocking) + values = torch.Tensor._values(self).cuda(device, non_blocking) + return new_type(indices, values, self.size()) + else: + untyped_storage = torch.UntypedStorage( + self.size(), device=torch.device("cuda") + ) + untyped_storage.copy_(self, non_blocking) + return untyped_storage + + +def _get_async_or_non_blocking(function_name, non_blocking, kwargs): + """Return the non-blocking flag given the function name and kwargs. + + Args: + function_name (str): the name of the function being used. + non_blocking (bool): the default value. + **kwargs (dict): the kwargs passed to the function. 
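+
+    Example (an illustrative sketch; any key other than ``async`` raises a
+    ``TypeError``)::
+
+        >>> _get_async_or_non_blocking("cuda", False, {})
+        False
+        >>> _get_async_or_non_blocking("cuda", False, {"async": True})  # warns, then remaps
+        True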
+ """ + if not kwargs: + return non_blocking + if len(kwargs) != 1 or "async" not in kwargs: + message = "{}() got an unexpected keyword argument '{}'" + argument = list(kwargs.keys()).pop() + raise TypeError(message.format(function_name, argument)) + warnings.warn("'async' is deprecated; use 'non_blocking'") + return kwargs["async"] + + +# Note [Don't serialize hooks] +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Since time immemorial, we have serialized the backward hooks associated with +# variables. This kind of half-worked--Python can pickle global functions +# (but not closures!)--but there were problems. +# +# - It's fragile. If you serialize a backward hook into a saved +# model, and then you rename the function associated with the hook, +# now your saved model is broken and you can't load it anymore. +# +# - It's not actually used. The standard recommendation is to +# serialize the *state_dict* of a model, not the model itself +# (since this is more stable to code changes affecting the model +# serialization), and the state dict saves "data" only, thus +# stripping the backward hooks. In some cases, hooks are +# essential to the well-functioning of a model (e.g., DDP), +# but DDP already manages readding the hooks! +# +# - We didn't serialize them in many cases. Prior to #10220, we +# were dropping backward hooks in ForkingPickler. We "fixed" this +# to be convenient with other serialization sites, but lack of +# serializing backward hooks wasn't actually the root cause of +# the bug. +# +# With these cases in mind, we have decided that a better strategy +# is to just NOT serialize hooks at all. +# +# Since this is a BC-breaking change, we should warn when we previously +# serialized a hook, but no longer do so. This will be done by adding a special +# sentinel property to hooks will be used to suppress this warning. If a hook +# has the property _torch_serialize_ignore, we will not emit a warning if we +# attempt to serialize a Tensor with this hook attached to it. +# +# By the way, when _backward_hooks is skipped, we must give an EMPTY +# OrderedDict(), if you pass a None you'll run afoul #12219. + + +# TODO: Once we decide to break serialization FC, `storage` no longer needs to +# be a TypedStorage +def _rebuild_tensor(storage, storage_offset, size, stride): + # first construct a tensor with the correct dtype/device + t = torch.empty((0,), dtype=storage.dtype, device=storage._untyped_storage.device) + return t.set_(storage._untyped_storage, storage_offset, size, stride) + + +def get_tensor_metadata(tensor): + # Tensor's Metadata for serializing. + # Currently, this only returns a dict[string, bool] specifing whether + # `conj` or `neg` bit is set. + assert isinstance(tensor, torch.Tensor) + return torch._C._get_tensor_metadata(tensor) # type: ignore[attr-defined] + + +def set_tensor_metadata(tensor, metadata): + # See `get_tensor_metadata` above + assert isinstance(metadata, dict) + assert isinstance(tensor, torch.Tensor) + torch._C._set_tensor_metadata(tensor, metadata) # type: ignore[attr-defined] + + +def _rebuild_tensor_v2( + storage, storage_offset, size, stride, requires_grad, backward_hooks, metadata=None +): + tensor = _rebuild_tensor(storage, storage_offset, size, stride) + tensor.requires_grad = requires_grad + if metadata: + set_tensor_metadata(tensor, metadata) + + # NB: This line exists only for backwards compatibility; the + # general expectation is that backward_hooks is an empty + # OrderedDict. 
See Note [Don't serialize hooks] + tensor._backward_hooks = backward_hooks + return tensor + + +def _rebuild_tensor_v3( + storage, + storage_offset, + size, + stride, + requires_grad, + backward_hooks, + dtype, + metadata=None, +): + t = torch.empty( + (0,), + dtype=dtype, + device=storage._untyped_storage.device, + requires_grad=requires_grad, + ) + t.set_(storage._untyped_storage, storage_offset, size, stride) + if metadata: + set_tensor_metadata(t, metadata) + t._backward_hooks = backward_hooks + return t + + +_sparse_tensors_to_validate: List["torch.Tensor"] = [] + + +# In _legacy_load() in serialization.py we unpickle storages after the sparse +# tensors have been already unpickled. Those storages contain data necessary for +# validating sparse tensors: indices and values. That's why sparse tensors are +# first unpickled without any validation, and then this function is called just +# before _legacy_load() returns, so that all the sparse tensors can be validated +# in bulk. +# +# The same procedure must be followed by _load() in serialization.py because due +# to Pickler semantics, we have to use the same (non-validating) function for +# unpickling sparse tensors, regardless of the caller. +def _validate_loaded_sparse_tensors(): + try: + for t in _sparse_tensors_to_validate: + if t.layout is torch.sparse_coo: + torch._validate_sparse_coo_tensor_args( + t._indices(), t._values(), t.size(), t.is_coalesced() + ) + elif t.layout in { + torch.sparse_csr, + torch.sparse_csc, + torch.sparse_bsr, + torch.sparse_bsc, + }: + # TODO: Validation currently involves an expensive traversal + # on CPU, which may include a device transfer. + if t.layout in {torch.sparse_csr, torch.sparse_bsr}: + compressed_indices, plain_indices = ( + t.crow_indices(), + t.col_indices(), + ) + else: + compressed_indices, plain_indices = ( + t.ccol_indices(), + t.row_indices(), + ) + torch._validate_sparse_compressed_tensor_args( + compressed_indices, plain_indices, t.values(), t.size(), t.layout + ) + else: + raise NotImplementedError( + f"_validate_loaded_sparse_tensors for layout `{t.layout}`" + ) + + finally: + _sparse_tensors_to_validate.clear() + + +def _rebuild_sparse_tensor(layout, data): + """ + Rebuilds a sparse tensor from its sparse storage representation. + + Args: + layout (str): The sparse storage layout of the tensor. + data (tuple): The tensor's sparse storage representation. 
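+
+    A minimal sketch for the COO layout, where ``data`` is
+    ``(indices, values, size, is_coalesced)`` (the older 3-tuple form without
+    ``is_coalesced`` is also accepted)::
+
+        >>> indices = torch.tensor([[0, 1], [1, 0]])
+        >>> values = torch.tensor([1.0, 2.0])
+        >>> t = _rebuild_sparse_tensor(torch.sparse_coo, (indices, values, (2, 2), None))
+        >>> t.to_dense()
+        tensor([[0., 1.],
+                [2., 0.]])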
+ """ + if layout == torch.sparse_coo: + if len(data) == 3: + # For BC: + indices, values, size = data + is_coalesced = None + else: + indices, values, size, is_coalesced = data + result = torch.sparse_coo_tensor( + indices, values, size, check_invariants=False, is_coalesced=is_coalesced + ) + _sparse_tensors_to_validate.append(result) + return result + + elif layout in { + torch.sparse_csr, + torch.sparse_csc, + torch.sparse_bsr, + torch.sparse_bsc, + }: + compressed_indices, plain_indices, values, size = data + result = torch.sparse_compressed_tensor( + compressed_indices, + plain_indices, + values, + size, + layout=layout, + check_invariants=False, + ) + _sparse_tensors_to_validate.append(result) + return result + + raise NotImplementedError(f"rebuilding sparse tensor for layout {layout}") + + +def _rebuild_nested_tensor(buffer, sizes, strides, storage_offsets): + return torch._nested_view_from_buffer(buffer, sizes, strides, storage_offsets) + + +def _rebuild_device_tensor_from_numpy(data, dtype, device, requires_grad): + tensor = torch.from_numpy(data).to(dtype=dtype, device=device) + tensor.requires_grad = requires_grad + return tensor + + +# Should not be used, only here to be able to load Tensors serialized with older versions of pytorch +_rebuild_xla_tensor = _rebuild_device_tensor_from_numpy + + +def _rebuild_meta_tensor_no_storage(dtype, size, stride, requires_grad): + return torch.empty_strided( + size, stride, dtype=dtype, device="meta", requires_grad=requires_grad + ) + + +def _rebuild_wrapper_subclass( + cls, dtype, size, stride, storage_offset, layout, device, requires_grad +): + return torch.Tensor._make_wrapper_subclass( # type: ignore[attr-defined] + cls, + size, + strides=stride, + storage_offset=storage_offset, + layout=layout, + device=device, + requires_grad=requires_grad, + ) + + +# TODO: Once we decide to break serialization FC, `storage` no longer needs to +# be a TypedStorage +def _rebuild_qtensor( + storage, + storage_offset, + size, + stride, + quantizer_params, + requires_grad, + backward_hooks, +): + qscheme = quantizer_params[0] + if qscheme == torch.per_tensor_affine: + _, scale, zero_point = quantizer_params + tensor = torch._empty_affine_quantized( + size, + scale=scale, + zero_point=zero_point, + dtype=storage.dtype, + device=storage.device, + ) + elif qscheme in (torch.per_channel_affine, torch.per_channel_affine_float_qparams): + _, scales, zero_points, axis = quantizer_params + if type(scales) is list and type(zero_points) is list: + if qscheme == torch.per_channel_affine: + scales = torch.tensor(scales, dtype=torch.double, device=storage.device) + zero_points = torch.tensor( + zero_points, dtype=torch.long, device=storage.device + ) + else: + scales = torch.tensor(scales, dtype=torch.float, device=storage.device) + zero_points = torch.tensor( + zero_points, dtype=torch.float, device=storage.device + ) + tensor = torch._empty_per_channel_affine_quantized( + size, + scales=scales, + zero_points=zero_points, + axis=axis, + dtype=storage.dtype, + device=storage.device, + ) + else: + raise RuntimeError(f"Can't deserialize quantized tensor with qscheme {qscheme}") + tensor.set_(storage, storage_offset, size, stride) + tensor.requires_grad = requires_grad + # NB: This line exists only for backwards compatibility; the + # general expectation is that backward_hooks is an empty + # OrderedDict. 
See Note [Don't serialize hooks] + tensor._backward_hooks = backward_hooks + return tensor + + +def _rebuild_parameter(data, requires_grad, backward_hooks): + param = torch.nn.Parameter(data, requires_grad) + # NB: This line exists only for backwards compatibility; the + # general expectation is that backward_hooks is an empty + # OrderedDict. See Note [Don't serialize hooks] + param._backward_hooks = backward_hooks + + return param + + +def _rebuild_parameter_with_state(data, requires_grad, backward_hooks, state): + param = torch.nn.Parameter(data, requires_grad) + # NB: This line exists only for backwards compatibility; the + # general expectation is that backward_hooks is an empty + # OrderedDict. See Note [Don't serialize hooks] + param._backward_hooks = backward_hooks + + # Restore state on Parameter like python attr. + param = _set_obj_state(param, state) + return param + + +def _get_obj_state(obj): + # Get the state of the python subclass + # This loosely mimicks the function on the object class but since Tensor do not inherit + # from it, we cannot call that function directly + # https://github.com/python/cpython/blob/c83919bd635f4433f1c6ae8504996a9fe3c215e5/Objects/typeobject.c#L4891 + # Note that starting with Python 3.11, this `__getstate__` is always defined and thus + # the else branch will never be taken. + getstate_fn = getattr(obj, "__getstate__", None) + if getstate_fn: + state = getstate_fn() + else: + slots_to_save = copyreg._slotnames(obj.__class__) # type: ignore[attr-defined] + if slots_to_save: + state = ( + obj.__dict__, + { + name: getattr(obj, name) + for name in slots_to_save + if hasattr(obj, name) + }, + ) + else: + state = obj.__dict__ + + return state + + +def _set_obj_state(obj, state): + if isinstance(state, tuple): + if not len(state) == 2: + raise RuntimeError(f"Invalid serialized state: {state}") + dict_state = state[0] + slots_state = state[1] + else: + dict_state = state + slots_state = None + + # Starting with Python 3.11, the __dict__ attribute is lazily created + # and is serialized as None when not needed. + if dict_state: + for k, v in dict_state.items(): + setattr(obj, k, v) + + if slots_state: + for k, v in slots_state.items(): + setattr(obj, k, v) + return obj + + +def _import_dotted_name(name): + components = name.split(".") + obj = __import__(components[0]) + for component in components[1:]: + obj = getattr(obj, component) + return obj + + +def _flatten_dense_tensors(tensors): + """Flatten dense tensors into a contiguous 1D buffer. Assume tensors are of + same dense type. + + Since inputs are dense, the resulting tensor will be a concatenated 1D + buffer. Element-wise operation on this buffer will be equivalent to + operating individually. + + Args: + tensors (Iterable[Tensor]): dense tensors to flatten. + + Returns: + A contiguous 1D buffer containing input tensors. + """ + return torch._C._nn.flatten_dense_tensors(tensors) + + +def _flatten_sparse_tensors(tensors): + """Flatten sparse tensors into two contiguous 1D buffers, one of indices and + one of values. Assume tensors are of same sparse type. + + Args: + tensors (Iterable[Tensor]): sparse tensors to flatten. + + Returns: + A tuple of two contiguous 1D buffers, one containing input tensors' + indices and the other containing the values. 
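+
+    A small illustrative sketch with a single coalesced COO tensor::
+
+        >>> t = torch.tensor([[0., 1.], [2., 0.]]).to_sparse()
+        >>> flat_indices, flat_values = _flatten_sparse_tensors([t])
+        >>> flat_indices
+        tensor([0, 1, 1, 0])
+        >>> flat_values
+        tensor([1., 2.])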
+ """ + flat_indices = torch._C._nn.flatten_dense_tensors( + [torch.Tensor._indices(t) for t in tensors] + ) + flat_values = torch._C._nn.flatten_dense_tensors( + [torch.Tensor._values(t) for t in tensors] + ) + return flat_indices, flat_values + + +def _unflatten_dense_tensors(flat, tensors): + """View a flat buffer using the sizes of tensors. Assume that tensors are of + same dense type, and that flat is given by _flatten_dense_tensors. + + Args: + flat (Tensor): flattened dense tensors to unflatten. + tensors (Iterable[Tensor]): dense tensors whose sizes will be used to + unflatten flat. + + Returns: + Unflattened dense tensors with sizes same as tensors and values from + flat. + """ + return torch._C._nn.unflatten_dense_tensors(flat, tensors) + + +def _unflatten_sparse_tensors(flat, tensors): + """View flat buffer (containing indices and values) using the sizes of + tensors. Assume that tensors are of same sparse type, and that flat is given + by _flatten_sparse_tensors. + + Args: + flat (tuple(Tensor, Tensor)): flattened indices and values of sparse + tensors to unflatten. + tensors (Iterable[Tensor]): sparse tensors whose sizes will be used to + unflatten flat. + + Returns: + Unflattened sparse tensors with sizes same as tensors and values from + flat. + """ + flat_indices, flat_values = flat + indices = torch._C._nn.unflatten_dense_tensors( + flat_indices, [torch.Tensor._indices(t) for t in tensors] + ) + values = torch._C._nn.unflatten_dense_tensors( + flat_values, [torch.Tensor._values(t) for t in tensors] + ) + outputs = [] + for t, i, v in zip(tensors, indices, values): + outputs.append(t.new(i, v, t.size())) + return tuple(outputs) + + +def _reorder_tensors_as(tensors, ordered_tensors): + """Assume that tensors are of same order as ordered_tensors within their + types, e.g., from _take_tensors. Reorder them to be of same order as + ordered_tensors. + + Args: + tensors (Iterable[Tensor]): tensors to be reordered. They should be of + the same order as ordered_tensors within their own types. + ordered_tensors (Iterable[Tensor]): tensors whose order will be the + reference. + + Returns: + Ordered tuple of tensors with contents from tensors and order of + ordered_tensors. + """ + type_dict = defaultdict(list) + for tensor in tensors: + type_dict[tensor.type()].append(tensor) + type_dict_ = {t: iter(coll) for t, coll in type_dict.items()} + return tuple(next(type_dict_[tensor.type()]) for tensor in ordered_tensors) + + +def _take_tensors(tensors, size_limit): + """Group tensors into chunks. This generator yields a chunk at each time, + each containing tensors of same type up to certain byte limit in total size. + + Args: + tensors (Sequence): A sequence of tensors to be separated into chunks. + size_limit (int): The limit of each chunk in bytes. + + Yields: + Blocks of tensors of same type and within size_limit. The yielded + tensors are only ordered as the original sequence within its types. 
+ """ + buf_dict: DefaultDict[str, List] = defaultdict(lambda: [[], 0]) + for tensor in tensors: + t = tensor.type() + if tensor.is_sparse: + indices = torch.Tensor._indices(tensor) + values = torch.Tensor._values(tensor) + size = ( + indices.numel() * indices.element_size() + + values.numel() * values.element_size() + ) + else: + size = tensor.numel() * tensor.element_size() + buf_and_size = buf_dict[t] + if buf_and_size[1] + size > size_limit and buf_and_size[1] > 0: + yield buf_and_size[0] + buf_and_size = buf_dict[t] = [[], 0] + buf_and_size[0].append(tensor) + buf_and_size[1] += size + for buf, _ in buf_dict.values(): + if len(buf) > 0: + yield buf + + +# annotation decorator to get annotations in a way that is compatible +# with both Python 2 and 3 +def annotate(ret, **kwargs): + def dec(fun): + fun.__annotations__ = dict(kwargs) + fun.__annotations__["return"] = ret + return fun + + return dec + + +def render_call(fn, args, kwargs): + str_fn = torch.overrides.resolve_name(fn) + if str_fn is None: + str_fn = str(fn) + + str_args: List[str] = [] + with torch._tensor_str.printoptions(threshold=0, edgeitems=0): + str_args.extend(repr(a) for a in args) + str_args.extend(f"{k}={repr(v)}" for k, v in kwargs.items()) + r = f"{str_fn}({', '.join(str_args)})" + return r + + +# NOTE [ Python Traceback Reference Cycle Problem ] +# +# When using sys.exc_info(), it is important to **not** store the exc_info[2], +# which is the traceback, because otherwise you will run into the traceback +# reference cycle problem, i.e., the traceback holding reference to the frame, +# and the frame (which holds reference to all the object in its temporary scope) +# holding reference the traceback. + + +class KeyErrorMessage(str): + r"""str subclass that returns itself in repr""" + + def __repr__(self): + return self + + +class ExceptionWrapper: + r"""Wraps an exception plus traceback to communicate across threads""" + + def __init__(self, exc_info=None, where="in background"): + # It is important that we don't store exc_info, see + # NOTE [ Python Traceback Reference Cycle Problem ] + if exc_info is None: + exc_info = sys.exc_info() + self.exc_type = exc_info[0] + self.exc_msg = "".join(traceback.format_exception(*exc_info)) + self.where = where + + def reraise(self): + r"""Reraises the wrapped exception in the current thread""" + # Format a message such as: "Caught ValueError in DataLoader worker + # process 2. Original Traceback:", followed by the traceback. + msg = f"Caught {self.exc_type.__name__} {self.where}.\nOriginal {self.exc_msg}" + if self.exc_type == KeyError: + # KeyError calls repr() on its argument (usually a dict key). This + # makes stack traces unreadable. It will not be changed in Python + # (https://bugs.python.org/issue2651), so we work around it. 
+ msg = KeyErrorMessage(msg) + elif getattr(self.exc_type, "message", None): + # Some exceptions have first argument as non-str but explicitly + # have message field + raise self.exc_type(message=msg) + try: + exception = self.exc_type(msg) + except TypeError: + # If the exception takes multiple arguments, don't try to + # instantiate since we don't know how to + raise RuntimeError(msg) from None + raise exception + + +def _get_available_device_type(): + if torch.cuda.is_available(): + return "cuda" + if hasattr(torch, "xpu") and torch.xpu.is_available(): # type: ignore[attr-defined] + return "xpu" + custom_backend_name = torch._C._get_privateuse1_backend_name() + custom_device_mod = getattr(torch, custom_backend_name, None) + if custom_device_mod and custom_device_mod.is_available(): + return custom_backend_name + # add more available device types here + return None + + +def _get_device_attr(get_member): + device_type = _get_available_device_type() + if device_type and device_type.lower() == "cuda": + return get_member(torch.cuda) + if device_type and device_type.lower() == "xpu": + return get_member(torch.xpu) # type: ignore[attr-defined] + if device_type == torch._C._get_privateuse1_backend_name(): + return get_member(getattr(torch, device_type)) + # add more available device types here + return None + + +def _get_current_device_index(): + # current device index + return _get_device_attr(lambda m: m.current_device()) + + +def _get_all_device_indices(): + # all device index + return _get_device_attr(lambda m: list(range(m.device_count()))) + + +def _get_devices_properties(device_ids): + # all device properties + return [_get_device_attr(lambda m: m.get_device_properties(i)) for i in device_ids] + + +def get_current_device_index() -> int: + r"""Checks if there are CUDA devices available and + returns the device index of the current default CUDA device. + Returns -1 in case there are no CUDA devices available. + Arguments: ``None`` + """ + if torch.cuda.device_count() > 0: + return torch.cuda.current_device() + return -1 + + +def _get_device_index( + device: Any, optional: bool = False, allow_cpu: bool = False +) -> int: + r"""Gets the device index from :attr:`device`, which can be a torch.device + object, a Python integer, or ``None``. + + If :attr:`device` is a torch.device object, returns the device index if it + has index. Note that for a device without a specified index, + i.e., ``torch.device('xxx')``, this will return the current default + device of that type if :attr:`optional` is ``True``. If :attr:`allow_cpu` is ``True``, + CPU devices will be accepted and ``-1`` will be returned in this case. + + If :attr:`device` is a Python integer, it is returned as is. + + If :attr:`device` is ``None``, this will return the current default + device of the supported runtime platform if :attr:`optional` is ``True``. + i.e., the current default CUDA device will be returned if CUDA runtime is supported. + """ + if isinstance(device, str): + device = torch.device(device) + device_idx: Optional[int] = None + if isinstance(device, torch.device): + if not allow_cpu and device.type == "cpu": + raise ValueError(f"Expected a non cpu device, but got: {device}") + device_idx = -1 if device.type == "cpu" else device.index + if isinstance(device, int): + device_idx = device + if device_idx is None: + if optional: + # The eager API _get_current_device_index uses `lambda` functions which are + # not supported in JIT and hence not scriptable. 
The JIT equivalent API to get + # the current device index is `get_current_device_index()` which can + # be scripted. We use is_scripting to check the mode we are in and call the + # appropriate API. + if torch.jit.is_scripting(): + device_idx = get_current_device_index() + else: + device_idx = _get_current_device_index() + else: + raise ValueError( + f"Expected a torch.device with a specified index or an integer, but got:{device}" + ) + return device_idx + + +def _handle_complex(tensor): + """ + Returns a real view of a tensor if complex dtype else just the tensor + need to check if a UninitializedParameter because otherwise checking is_complex is an error for a LazyModule + """ + return ( + torch.view_as_real(tensor) + if not isinstance(tensor, torch.nn.UninitializedParameter) + and tensor.is_complex() + else tensor + ) + + +def _element_size(dtype): + """ + Returns the element size for a dtype, in bytes + """ + if not isinstance(dtype, torch.dtype): + raise RuntimeError(f"expected torch.dtype, but got {type(dtype)}") + + if dtype.is_complex: + return torch.finfo(dtype).bits >> 2 + elif dtype.is_floating_point: + return torch.finfo(dtype).bits >> 3 + elif dtype == torch.bool: + # NOTE: torch.bool is not supported in torch.iinfo() + return 1 + else: + return torch.iinfo(dtype).bits >> 3 + + +class _ClassPropertyDescriptor: + def __init__(self, fget, fset=None): + self.fget = fget + + def __get__(self, instance, owner=None): + if owner is None: + owner = type(instance) + return self.fget.__get__(instance, owner)() + + +def classproperty(func): + if not isinstance(func, (classmethod, staticmethod)): + func = classmethod(func) + return _ClassPropertyDescriptor(func) + + +def is_compiling() -> bool: + """ + Indicates whether we are tracing/compiling with torch.compile() or torch.export(). + + TODO(khabinov): we should deprecate this function and use torch.compiler.is_compiling(). + """ + return torch.compiler.is_compiling() + + +def _functionalize_sync(t): + # This code lives in python instead of C++ since conditioning on a certain python subclass + # is much more of a pain in C++. + from torch._subclasses.functional_tensor import FunctionalTensor + + if isinstance(t, FunctionalTensor): + # If a FunctionalTensorMode is active while syncing, we don't want it to intercept any ops that get called + # when we sync our inner tensor. + # Why? + # (1) If there are input mutations in the graph, then they will be re-applied during + # AOTAutograd when we call _sync() from inside of our functionalization kernels. + # (2) _sync() causes us to regenerate our updated the tensor from the updated base, + # which dispatches to a bunch of view ops + # (3) The input to these view ops is our inner FunctionalTensorWrapper + # (since the sync was called from C++), not the python FunctionalTensor + # (4) if a python FunctionalTensorMode is active, it will complain when it intercepts + # the view op, since it will see an input that is a C++ FunctionalTensorWrapper + # (aka a normal torch.Tensor) instead of a python `FunctionalTensor). 
+ maybe_functional_mode = torch._C._unset_dispatch_mode( + torch._C._TorchDispatchModeKey.FUNCTIONAL + ) + try: + torch._functionalize_sync(t.elem) # type: ignore[attr-defined] + finally: + if maybe_functional_mode is not None: + torch._C._set_dispatch_mode(maybe_functional_mode) + else: + torch._functionalize_sync(t) # type: ignore[attr-defined] + + +@functools.lru_cache(2) +def _get_device_module(device_type: str): + device_module = getattr(torch, device_type, None) + if device_module is None: + raise RuntimeError( + f"Device '{device_type}' does not have a corresponding module registered as 'torch.{device_type}'." + ) + return device_module + + +def _dummy_type(name: str) -> type: + def get_err_fn(is_init: bool): + def err_fn(obj, *args, **kwargs): + if is_init: + class_name = obj.__class__.__name__ + else: + class_name = obj.__name__ + raise RuntimeError(f"Tried to instantiate dummy base class {class_name}") + + return err_fn + + return type( + name, (object,), {"__init__": get_err_fn(True), "__new__": get_err_fn(False)} + ) + + +class _LazySeedTracker: + # Since seeding is memory-less, only track the latest seed. + # Note: `manual_seed_all` followed by `manual_seed` overwrites + # the seed on current device. We track the order of **latest** + # calls between these two API. + def __init__(self): + self.manual_seed_all_cb = None + self.manual_seed_cb = None + self.call_order = [] + + def queue_seed_all(self, cb, traceback): + self.manual_seed_all_cb = (cb, traceback) + # update seed_all to be latest + self.call_order = [self.manual_seed_cb, self.manual_seed_all_cb] + + def queue_seed(self, cb, traceback): + self.manual_seed_cb = (cb, traceback) + # update seed to be latest + self.call_order = [self.manual_seed_all_cb, self.manual_seed_cb] + + def get_calls(self) -> List: + return self.call_order diff --git a/MLPY/Lib/site-packages/torch/_utils_internal.py b/MLPY/Lib/site-packages/torch/_utils_internal.py new file mode 100644 index 0000000000000000000000000000000000000000..200d07bb05661a3d03bad1faae607be939c5055d --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_utils_internal.py @@ -0,0 +1,138 @@ +import functools +import logging +import os +import sys +import tempfile +from typing import Any, Dict + +import torch + +log = logging.getLogger(__name__) + + +# this arbitrary-looking assortment of functionality is provided here +# to have a central place for overrideable behavior. The motivating +# use is the FB build environment, where this source file is replaced +# by an equivalent. + +if torch._running_with_deploy(): + # __file__ is meaningless in the context of frozen torch used in torch deploy. + # setting empty torch_parent should allow below functions to operate without crashing, + # but it's unclear if there is a valid use case for them in the context of deploy. 
+ torch_parent = "" +else: + if os.path.basename(os.path.dirname(__file__)) == "shared": + torch_parent = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) + else: + torch_parent = os.path.dirname(os.path.dirname(__file__)) + + +def get_file_path(*path_components: str) -> str: + return os.path.join(torch_parent, *path_components) + + +def get_file_path_2(*path_components: str) -> str: + return os.path.join(*path_components) + + +def get_writable_path(path: str) -> str: + if os.access(path, os.W_OK): + return path + return tempfile.mkdtemp(suffix=os.path.basename(path)) + + +def prepare_multiprocessing_environment(path: str) -> None: + pass + + +def resolve_library_path(path: str) -> str: + return os.path.realpath(path) + + +def throw_abstract_impl_not_imported_error(opname, module, context): + if module in sys.modules: + raise NotImplementedError( + f"{opname}: We could not find the abstract impl for this operator. " + ) + else: + raise NotImplementedError( + f"{opname}: We could not find the abstract impl for this operator. " + f"The operator specified that you may need to import the '{module}' " + f"Python module to load the abstract impl. {context}" + ) + + +# Meta only, see +# https://www.internalfb.com/intern/wiki/ML_Workflow_Observability/User_Guides/Adding_instrumentation_to_your_code/ +# +# This will cause an event to get logged to Scuba via the signposts API. You +# can view samples on the API at https://fburl.com/scuba/workflow_signpost/zh9wmpqs +# we log to subsystem "torch", and the category and name you provide here. +# Each of the arguments translate into a Scuba column. We're still figuring +# out local conventions in PyTorch, but category should be something like +# "dynamo" or "inductor", and name should be a specific string describing what +# kind of event happened. +# +# Killswitch is at +# https://www.internalfb.com/intern/justknobs/?name=pytorch%2Fsignpost#event +def signpost_event(category: str, name: str, parameters: Dict[str, Any]): + log.info("%s %s: %r", category, name, parameters) + + +def log_compilation_event(metrics): + log.info("%s", metrics) + + +def upload_graph(graph): + pass + + +def set_pytorch_distributed_envs_from_justknobs(): + pass + + +def log_export_usage(**kwargs): + pass + + +def justknobs_check(name: str) -> bool: + """ + This function can be used to killswitch functionality in FB prod, + where you can toggle this value to False in JK without having to + do a code push. In OSS, we always have everything turned on all + the time, because downstream users can simply choose to not update + PyTorch. (If more fine-grained enable/disable is needed, we could + potentially have a map we lookup name in to toggle behavior. But + the point is that it's all tied to source code in OSS, since there's + no live server to query.) + + This is the bare minimum functionality I needed to do some killswitches. + We have a more detailed plan at + https://docs.google.com/document/d/1Ukerh9_42SeGh89J-tGtecpHBPwGlkQ043pddkKb3PU/edit + In particular, in some circumstances it may be necessary to read in + a knob once at process start, and then use it consistently for the + rest of the process. Future functionality will codify these patterns + into a better high level API. + + WARNING: Do NOT call this function at module import time, JK is not + fork safe and you will break anyone who forks the process and then + hits JK again. 
+ """ + return True + + +@functools.lru_cache(None) +def max_clock_rate(): + from triton.testing import nvsmi + + return nvsmi(["clocks.max.sm"])[0] + + +TEST_MASTER_ADDR = "127.0.0.1" +TEST_MASTER_PORT = 29500 +# USE_GLOBAL_DEPS controls whether __init__.py tries to load +# libtorch_global_deps, see Note [Global dependencies] +USE_GLOBAL_DEPS = True +# USE_RTLD_GLOBAL_WITH_LIBTORCH controls whether __init__.py tries to load +# _C.so with RTLD_GLOBAL during the call to dlopen. +USE_RTLD_GLOBAL_WITH_LIBTORCH = False diff --git a/MLPY/Lib/site-packages/torch/_vendor/__init__.py b/MLPY/Lib/site-packages/torch/_vendor/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MLPY/Lib/site-packages/torch/_vendor/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_vendor/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ded06d8d1b17b21b3c63c8196eac5f8f2cb4409f Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_vendor/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_vendor/packaging/__init__.py b/MLPY/Lib/site-packages/torch/_vendor/packaging/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7d2ad1ca0a2bf4d73bb6dc5252c3407dd0f20d14 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_vendor/packaging/__init__.py @@ -0,0 +1,15 @@ +# This file is dual licensed under the terms of the Apache License, Version +# 2.0, and the BSD License. See the LICENSE file in the root of this repository +# for complete details. + +__title__ = "packaging" +__summary__ = "Core utilities for Python packages" +__uri__ = "https://github.com/pypa/packaging" + +__version__ = "23.2" + +__author__ = "Donald Stufft and individual contributors" +__email__ = "donald@stufft.io" + +__license__ = "BSD-2-Clause or Apache-2.0" +__copyright__ = "2014 %s" % __author__ diff --git a/MLPY/Lib/site-packages/torch/_vendor/packaging/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_vendor/packaging/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4cf156ffc6b1ee368f76d67efaca417d5ec52172 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_vendor/packaging/__pycache__/__init__.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_vendor/packaging/__pycache__/_structures.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_vendor/packaging/__pycache__/_structures.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e2dd495706ff043fd13f2fa6af28f972ce0e3309 Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_vendor/packaging/__pycache__/_structures.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_vendor/packaging/__pycache__/version.cpython-39.pyc b/MLPY/Lib/site-packages/torch/_vendor/packaging/__pycache__/version.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6dc3b025be8309c8ad5d34b52bf4c9095fa1fd6c Binary files /dev/null and b/MLPY/Lib/site-packages/torch/_vendor/packaging/__pycache__/version.cpython-39.pyc differ diff --git a/MLPY/Lib/site-packages/torch/_vendor/packaging/_structures.py b/MLPY/Lib/site-packages/torch/_vendor/packaging/_structures.py new file mode 100644 index 0000000000000000000000000000000000000000..dfc91962d80e24f98b76d0da1d765fc78b0a1dcb --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_vendor/packaging/_structures.py @@ -0,0 +1,61 
@@ +# This file is dual licensed under the terms of the Apache License, Version +# 2.0, and the BSD License. See the LICENSE file in the root of this repository +# for complete details. + + +class InfinityType: + def __repr__(self) -> str: + return "Infinity" + + def __hash__(self) -> int: + return hash(repr(self)) + + def __lt__(self, other: object) -> bool: + return False + + def __le__(self, other: object) -> bool: + return False + + def __eq__(self, other: object) -> bool: + return isinstance(other, self.__class__) + + def __gt__(self, other: object) -> bool: + return True + + def __ge__(self, other: object) -> bool: + return True + + def __neg__(self: object) -> "NegativeInfinityType": + return NegativeInfinity + + +Infinity = InfinityType() + + +class NegativeInfinityType: + def __repr__(self) -> str: + return "-Infinity" + + def __hash__(self) -> int: + return hash(repr(self)) + + def __lt__(self, other: object) -> bool: + return True + + def __le__(self, other: object) -> bool: + return True + + def __eq__(self, other: object) -> bool: + return isinstance(other, self.__class__) + + def __gt__(self, other: object) -> bool: + return False + + def __ge__(self, other: object) -> bool: + return False + + def __neg__(self: object) -> InfinityType: + return Infinity + + +NegativeInfinity = NegativeInfinityType() diff --git a/MLPY/Lib/site-packages/torch/_vendor/packaging/version.py b/MLPY/Lib/site-packages/torch/_vendor/packaging/version.py new file mode 100644 index 0000000000000000000000000000000000000000..e1cca483cee045aa1acfa9f5cf27c0331cc532aa --- /dev/null +++ b/MLPY/Lib/site-packages/torch/_vendor/packaging/version.py @@ -0,0 +1,563 @@ +# This file is dual licensed under the terms of the Apache License, Version +# 2.0, and the BSD License. See the LICENSE file in the root of this repository +# for complete details. +""" +.. testsetup:: + + from packaging.version import parse, Version +""" + +import itertools +import re +from typing import Any, Callable, NamedTuple, Optional, SupportsInt, Tuple, Union + +from ._structures import Infinity, InfinityType, NegativeInfinity, NegativeInfinityType + +__all__ = ["VERSION_PATTERN", "parse", "Version", "InvalidVersion"] + +LocalType = Tuple[Union[int, str], ...] + +CmpPrePostDevType = Union[InfinityType, NegativeInfinityType, Tuple[str, int]] +CmpLocalType = Union[ + NegativeInfinityType, + Tuple[Union[Tuple[int, str], Tuple[NegativeInfinityType, Union[int, str]]], ...], +] +CmpKey = Tuple[ + int, + Tuple[int, ...], + CmpPrePostDevType, + CmpPrePostDevType, + CmpPrePostDevType, + CmpLocalType, +] +VersionComparisonMethod = Callable[[CmpKey, CmpKey], bool] + + +class _Version(NamedTuple): + epoch: int + release: Tuple[int, ...] + dev: Optional[Tuple[str, int]] + pre: Optional[Tuple[str, int]] + post: Optional[Tuple[str, int]] + local: Optional[LocalType] + + +def parse(version: str) -> "Version": + """Parse the given version string. + + >>> parse('1.0.dev1') + + + :param version: The version string to parse. + :raises InvalidVersion: When the version string is not a valid version. + """ + return Version(version) + + +class InvalidVersion(ValueError): + """Raised when a version string is not a valid version. + + >>> Version("invalid") + Traceback (most recent call last): + ... + packaging.version.InvalidVersion: Invalid version: 'invalid' + """ + + +class _BaseVersion: + _key: Tuple[Any, ...] 
+ + def __hash__(self) -> int: + return hash(self._key) + + # Please keep the duplicated `isinstance` check + # in the six comparisons hereunder + # unless you find a way to avoid adding overhead function calls. + def __lt__(self, other: "_BaseVersion") -> bool: + if not isinstance(other, _BaseVersion): + return NotImplemented + + return self._key < other._key + + def __le__(self, other: "_BaseVersion") -> bool: + if not isinstance(other, _BaseVersion): + return NotImplemented + + return self._key <= other._key + + def __eq__(self, other: object) -> bool: + if not isinstance(other, _BaseVersion): + return NotImplemented + + return self._key == other._key + + def __ge__(self, other: "_BaseVersion") -> bool: + if not isinstance(other, _BaseVersion): + return NotImplemented + + return self._key >= other._key + + def __gt__(self, other: "_BaseVersion") -> bool: + if not isinstance(other, _BaseVersion): + return NotImplemented + + return self._key > other._key + + def __ne__(self, other: object) -> bool: + if not isinstance(other, _BaseVersion): + return NotImplemented + + return self._key != other._key + + +# Deliberately not anchored to the start and end of the string, to make it +# easier for 3rd party code to reuse +_VERSION_PATTERN = r""" + v? + (?: + (?:(?P[0-9]+)!)? # epoch + (?P[0-9]+(?:\.[0-9]+)*) # release segment + (?P
<pre>                                          # pre-release
+            [-_\.]?
+            (?P<pre_l>alpha|a|beta|b|preview|pre|c|rc)
+            [-_\.]?
+            (?P<pre_n>[0-9]+)?
+        )?
+        (?P<post>                                         # post release
+            (?:-(?P<post_n1>[0-9]+))
+            |
+            (?:
+                [-_\.]?
+                (?P<post_l>post|rev|r)
+                [-_\.]?
+                (?P<post_n2>[0-9]+)?
+            )
+        )?
+        (?P<dev>                                          # dev release
+            [-_\.]?
+            (?P<dev_l>dev)
+            [-_\.]?
+            (?P<dev_n>[0-9]+)?
+        )?
+    )
+    (?:\+(?P<local>[a-z0-9]+(?:[-_\.][a-z0-9]+)*))?       # local version
+"""
+
+VERSION_PATTERN = _VERSION_PATTERN
+"""
+A string containing the regular expression used to match a valid version.
+
+The pattern is not anchored at either end, and is intended for embedding in larger
+expressions (for example, matching a version number as part of a file name). The
+regular expression should be compiled with the ``re.VERBOSE`` and ``re.IGNORECASE``
+flags set.
+
+:meta hide-value:
+"""
+
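+# For example, callers are expected to embed and compile the pattern themselves,
+# mirroring the `_regex` used by the Version class below (a minimal sketch):
+#
+#     _check = re.compile(r"^\s*" + VERSION_PATTERN + r"\s*$", re.VERBOSE | re.IGNORECASE)
+#     assert _check.match("1.0.post1+local.7") is not None
+#     assert _check.match("not a version") is None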
+
+class Version(_BaseVersion):
+    """This class abstracts handling of a project's versions.
+
+    A :class:`Version` instance is comparison aware and can be compared and
+    sorted using the standard Python interfaces.
+
+    >>> v1 = Version("1.0a5")
+    >>> v2 = Version("1.0")
+    >>> v1
+    <Version('1.0a5')>
+    >>> v2
+    <Version('1.0')>
+    >>> v1 < v2
+    True
+    >>> v1 == v2
+    False
+    >>> v1 > v2
+    False
+    >>> v1 >= v2
+    False
+    >>> v1 <= v2
+    True
+    """
+
+    _regex = re.compile(r"^\s*" + VERSION_PATTERN + r"\s*$", re.VERBOSE | re.IGNORECASE)
+    _key: CmpKey
+
+    def __init__(self, version: str) -> None:
+        """Initialize a Version object.
+
+        :param version:
+            The string representation of a version which will be parsed and normalized
+            before use.
+        :raises InvalidVersion:
+            If the ``version`` does not conform to PEP 440 in any way then this
+            exception will be raised.
+        """
+
+        # Validate the version and parse it into pieces
+        match = self._regex.search(version)
+        if not match:
+            raise InvalidVersion(f"Invalid version: '{version}'")
+
+        # Store the parsed out pieces of the version
+        self._version = _Version(
+            epoch=int(match.group("epoch")) if match.group("epoch") else 0,
+            release=tuple(int(i) for i in match.group("release").split(".")),
+            pre=_parse_letter_version(match.group("pre_l"), match.group("pre_n")),
+            post=_parse_letter_version(
+                match.group("post_l"), match.group("post_n1") or match.group("post_n2")
+            ),
+            dev=_parse_letter_version(match.group("dev_l"), match.group("dev_n")),
+            local=_parse_local_version(match.group("local")),
+        )
+
+        # Generate a key which will be used for sorting
+        self._key = _cmpkey(
+            self._version.epoch,
+            self._version.release,
+            self._version.pre,
+            self._version.post,
+            self._version.dev,
+            self._version.local,
+        )
+
+    def __repr__(self) -> str:
+        """A representation of the Version that shows all internal state.
+
+        >>> Version('1.0.0')
+        <Version('1.0.0')>
+        """
+        return f""
+
+    def __str__(self) -> str:
+        """A string representation of the version that can be rounded-tripped.
+
+        >>> str(Version("1.0a5"))
+        '1.0a5'
+        """
+        parts = []
+
+        # Epoch
+        if self.epoch != 0:
+            parts.append(f"{self.epoch}!")
+
+        # Release segment
+        parts.append(".".join(str(x) for x in self.release))
+
+        # Pre-release
+        if self.pre is not None:
+            parts.append("".join(str(x) for x in self.pre))
+
+        # Post-release
+        if self.post is not None:
+            parts.append(f".post{self.post}")
+
+        # Development release
+        if self.dev is not None:
+            parts.append(f".dev{self.dev}")
+
+        # Local version segment
+        if self.local is not None:
+            parts.append(f"+{self.local}")
+
+        return "".join(parts)
+
+    @property
+    def epoch(self) -> int:
+        """The epoch of the version.
+
+        >>> Version("2.0.0").epoch
+        0
+        >>> Version("1!2.0.0").epoch
+        1
+        """
+        return self._version.epoch
+
+    @property
+    def release(self) -> Tuple[int, ...]:
+        """The components of the "release" segment of the version.
+
+        >>> Version("1.2.3").release
+        (1, 2, 3)
+        >>> Version("2.0.0").release
+        (2, 0, 0)
+        >>> Version("1!2.0.0.post0").release
+        (2, 0, 0)
+
+        Includes trailing zeroes but not the epoch or any pre-release / development /
+        post-release suffixes.
+        """
+        return self._version.release
+
+    @property
+    def pre(self) -> Optional[Tuple[str, int]]:
+        """The pre-release segment of the version.
+
+        >>> print(Version("1.2.3").pre)
+        None
+        >>> Version("1.2.3a1").pre
+        ('a', 1)
+        >>> Version("1.2.3b1").pre
+        ('b', 1)
+        >>> Version("1.2.3rc1").pre
+        ('rc', 1)
+        """
+        return self._version.pre
+
+    @property
+    def post(self) -> Optional[int]:
+        """The post-release number of the version.
+
+        >>> print(Version("1.2.3").post)
+        None
+        >>> Version("1.2.3.post1").post
+        1
+        """
+        return self._version.post[1] if self._version.post else None
+
+    @property
+    def dev(self) -> Optional[int]:
+        """The development number of the version.
+
+        >>> print(Version("1.2.3").dev)
+        None
+        >>> Version("1.2.3.dev1").dev
+        1
+        """
+        return self._version.dev[1] if self._version.dev else None
+
+    @property
+    def local(self) -> Optional[str]:
+        """The local version segment of the version.
+
+        >>> print(Version("1.2.3").local)
+        None
+        >>> Version("1.2.3+abc").local
+        'abc'
+        """
+        if self._version.local:
+            return ".".join(str(x) for x in self._version.local)
+        else:
+            return None
+
+    @property
+    def public(self) -> str:
+        """The public portion of the version.
+
+        >>> Version("1.2.3").public
+        '1.2.3'
+        >>> Version("1.2.3+abc").public
+        '1.2.3'
+        >>> Version("1.2.3+abc.dev1").public
+        '1.2.3'
+        """
+        return str(self).split("+", 1)[0]
+
+    @property
+    def base_version(self) -> str:
+        """The "base version" of the version.
+
+        >>> Version("1.2.3").base_version
+        '1.2.3'
+        >>> Version("1.2.3+abc").base_version
+        '1.2.3'
+        >>> Version("1!1.2.3+abc.dev1").base_version
+        '1!1.2.3'
+
+        The "base version" is the public version of the project without any pre or post
+        release markers.
+        """
+        parts = []
+
+        # Epoch
+        if self.epoch != 0:
+            parts.append(f"{self.epoch}!")
+
+        # Release segment
+        parts.append(".".join(str(x) for x in self.release))
+
+        return "".join(parts)
+
+    @property
+    def is_prerelease(self) -> bool:
+        """Whether this version is a pre-release.
+
+        >>> Version("1.2.3").is_prerelease
+        False
+        >>> Version("1.2.3a1").is_prerelease
+        True
+        >>> Version("1.2.3b1").is_prerelease
+        True
+        >>> Version("1.2.3rc1").is_prerelease
+        True
+        >>> Version("1.2.3dev1").is_prerelease
+        True
+        """
+        return self.dev is not None or self.pre is not None
+
+    @property
+    def is_postrelease(self) -> bool:
+        """Whether this version is a post-release.
+
+        >>> Version("1.2.3").is_postrelease
+        False
+        >>> Version("1.2.3.post1").is_postrelease
+        True
+        """
+        return self.post is not None
+
+    @property
+    def is_devrelease(self) -> bool:
+        """Whether this version is a development release.
+
+        >>> Version("1.2.3").is_devrelease
+        False
+        >>> Version("1.2.3.dev1").is_devrelease
+        True
+        """
+        return self.dev is not None
+
+    @property
+    def major(self) -> int:
+        """The first item of :attr:`release` or ``0`` if unavailable.
+
+        >>> Version("1.2.3").major
+        1
+        """
+        return self.release[0] if len(self.release) >= 1 else 0
+
+    @property
+    def minor(self) -> int:
+        """The second item of :attr:`release` or ``0`` if unavailable.
+
+        >>> Version("1.2.3").minor
+        2
+        >>> Version("1").minor
+        0
+        """
+        return self.release[1] if len(self.release) >= 2 else 0
+
+    @property
+    def micro(self) -> int:
+        """The third item of :attr:`release` or ``0`` if unavailable.
+
+        >>> Version("1.2.3").micro
+        3
+        >>> Version("1").micro
+        0
+        """
+        return self.release[2] if len(self.release) >= 3 else 0
+
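+# For example, a minimal usage sketch of the class above (assuming this vendored
+# copy is imported as torch._vendor.packaging.version):
+#
+#     from torch._vendor.packaging.version import Version
+#
+#     releases = [Version("1.0.post1"), Version("1.0"), Version("1.0rc1"), Version("1.0.dev0")]
+#     assert sorted(releases) == [
+#         Version("1.0.dev0"), Version("1.0rc1"), Version("1.0"), Version("1.0.post1")
+#     ]
+#     assert Version("1!0.5") > Version("2.0")  # a higher epoch outranks the release segment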
+
+def _parse_letter_version(
+    letter: Optional[str], number: Union[str, bytes, SupportsInt, None]
+) -> Optional[Tuple[str, int]]:
+
+    if letter:
+        # We consider there to be an implicit 0 in a pre-release if there is
+        # not a numeral associated with it.
+        if number is None:
+            number = 0
+
+        # We normalize any letters to their lower case form
+        letter = letter.lower()
+
+        # We consider some words to be alternate spellings of other words and
+        # in those cases we want to normalize the spellings to our preferred
+        # spelling.
+        if letter == "alpha":
+            letter = "a"
+        elif letter == "beta":
+            letter = "b"
+        elif letter in ["c", "pre", "preview"]:
+            letter = "rc"
+        elif letter in ["rev", "r"]:
+            letter = "post"
+
+        return letter, int(number)
+    if not letter and number:
+        # We assume if we are given a number, but we are not given a letter
+        # then this is using the implicit post release syntax (e.g. 1.0-1)
+        letter = "post"
+
+        return letter, int(number)
+
+    return None
+
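+# For example, the helper above normalizes alternate spellings and implicit numbers
+# (a sketch of the expected behaviour):
+#
+#     assert _parse_letter_version("alpha", None) == ("a", 0)
+#     assert _parse_letter_version("preview", "2") == ("rc", 2)
+#     assert _parse_letter_version(None, "1") == ("post", 1)  # implicit post release, e.g. "1.0-1"
+#     assert _parse_letter_version(None, None) is None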
+
+_local_version_separators = re.compile(r"[\._-]")
+
+
+def _parse_local_version(local: Optional[str]) -> Optional[LocalType]:
+    """
+    Takes a string like abc.1.twelve and turns it into ("abc", 1, "twelve").
+    """
+    if local is not None:
+        return tuple(
+            part.lower() if not part.isdigit() else int(part)
+            for part in _local_version_separators.split(local)
+        )
+    return None
+
+
+def _cmpkey(
+    epoch: int,
+    release: Tuple[int, ...],
+    pre: Optional[Tuple[str, int]],
+    post: Optional[Tuple[str, int]],
+    dev: Optional[Tuple[str, int]],
+    local: Optional[LocalType],
+) -> CmpKey:
+
+    # When we compare a release version, we want to compare it with all of the
+    # trailing zeros removed. So we'll reverse the list, drop all the now-leading
+    # zeros until we come to something non-zero, then re-reverse it back into the
+    # correct order, make it a tuple, and use that for our sorting key.
+    _release = tuple(
+        reversed(list(itertools.dropwhile(lambda x: x == 0, reversed(release))))
+    )
+
+    # We need to "trick" the sorting algorithm to put 1.0.dev0 before 1.0a0.
+    # We'll do this by abusing the pre segment, but we _only_ want to do this
+    # if there is not a pre or a post segment. If we have one of those then
+    # the normal sorting rules will handle this case correctly.
+    if pre is None and post is None and dev is not None:
+        _pre: CmpPrePostDevType = NegativeInfinity
+    # Versions without a pre-release (except as noted above) should sort after
+    # those with one.
+    elif pre is None:
+        _pre = Infinity
+    else:
+        _pre = pre
+
+    # Versions without a post segment should sort before those with one.
+    if post is None:
+        _post: CmpPrePostDevType = NegativeInfinity
+
+    else:
+        _post = post
+
+    # Versions without a development segment should sort after those with one.
+    if dev is None:
+        _dev: CmpPrePostDevType = Infinity
+
+    else:
+        _dev = dev
+
+    if local is None:
+        # Versions without a local segment should sort before those with one.
+        _local: CmpLocalType = NegativeInfinity
+    else:
+        # Versions with a local segment need that segment parsed to implement
+        # the sorting rules in PEP440.
+        # - Alpha numeric segments sort before numeric segments
+        # - Alpha numeric segments sort lexicographically
+        # - Numeric segments sort numerically
+        # - Shorter versions sort before longer versions when the prefixes
+        #   match exactly
+        _local = tuple(
+            (i, "") if isinstance(i, int) else (NegativeInfinity, i) for i in local
+        )
+
+    return epoch, _release, _pre, _post, _dev, _local
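+# For example, the key construction above produces the PEP 440 ordering described
+# in the comments (a sketch exercised through the public Version class):
+#
+#     assert Version("1.0.dev0") < Version("1.0a0")     # dev-only releases sort before pre-releases
+#     assert Version("1.0") < Version("1.0+abc")        # no local segment sorts before any local segment
+#     assert Version("1.0+abc") < Version("1.0+abc.1")  # a shorter local segment sorts first on a prefix match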
diff --git a/MLPY/Lib/site-packages/torch/_vmap_internals.py b/MLPY/Lib/site-packages/torch/_vmap_internals.py
new file mode 100644
index 0000000000000000000000000000000000000000..76a2541bb41d7be8dab651d8b5757e3589ec3fd3
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/_vmap_internals.py
@@ -0,0 +1,237 @@
+import functools
+import warnings
+from typing import Any, Callable, List, Optional, Tuple, Union
+
+import torch
+from torch import Tensor
+from torch.utils._pytree import _broadcast_to_and_flatten, tree_flatten, tree_unflatten
+
+in_dims_t = Union[int, Tuple]
+out_dims_t = Union[int, Tuple[int, ...]]
+
+
+# Checks that all args-to-be-batched have the same batch dim size
+def _validate_and_get_batch_size(
+    flat_in_dims: List[Optional[int]], flat_args: List
+) -> int:
+    batch_sizes = [
+        arg.size(in_dim)
+        for in_dim, arg in zip(flat_in_dims, flat_args)
+        if in_dim is not None
+    ]
+    if batch_sizes and any(size != batch_sizes[0] for size in batch_sizes):
+        raise ValueError(
+            f"vmap: Expected all tensors to have the same size in the mapped "
+            f"dimension, got sizes {batch_sizes} for the mapped dimension"
+        )
+    return batch_sizes[0]
+
+
+def _num_outputs(batched_outputs: Union[Tensor, Tuple[Tensor, ...]]) -> int:
+    if isinstance(batched_outputs, tuple):
+        return len(batched_outputs)
+    return 1
+
+
+# If value is a tuple, check it has length `num_elements`.
+# If value is not a tuple, make a tuple with `value` repeated `num_elements` times
+def _as_tuple(
+    value: Any, num_elements: int, error_message_lambda: Callable[[], str]
+) -> Tuple:
+    if not isinstance(value, tuple):
+        return (value,) * num_elements
+    if len(value) != num_elements:
+        raise ValueError(error_message_lambda())
+    return value
+
+
+# Creates BatchedTensors for every Tensor in arg that should be batched.
+# Returns the (potentially) batched arguments and the batch_size.
+def _create_batched_inputs(
+    in_dims: in_dims_t, args: Tuple, vmap_level: int, func: Callable
+) -> Tuple[Tuple, int]:
+    if not isinstance(in_dims, int) and not isinstance(in_dims, tuple):
+        raise ValueError(
+            f"vmap({_get_name(func)}, in_dims={in_dims}, ...)(): "
+            f"expected `in_dims` to be int or a (potentially nested) tuple "
+            f"matching the structure of inputs, got: {type(in_dims)}."
+        )
+    if len(args) == 0:
+        raise ValueError(
+            f"vmap({_get_name(func)})(): got no inputs. Maybe you forgot to add "
+            f"inputs, or you are trying to vmap over a function with no inputs. "
+            f"The latter is unsupported."
+        )
+
+    flat_args, args_spec = tree_flatten(args)
+    flat_in_dims = _broadcast_to_and_flatten(in_dims, args_spec)
+    if flat_in_dims is None:
+        raise ValueError(
+            f"vmap({_get_name(func)}, in_dims={in_dims}, ...)(): "
+            f"in_dims is not compatible with the structure of `inputs`. "
+            f"in_dims has structure {tree_flatten(in_dims)[1]} but inputs "
+            f"has structure {args_spec}."
+        )
+
+    for arg, in_dim in zip(flat_args, flat_in_dims):
+        if not isinstance(in_dim, int) and in_dim is not None:
+            raise ValueError(
+                f"vmap({_get_name(func)}, in_dims={in_dims}, ...)(): "
+                f"Got in_dim={in_dim} for an input but in_dim must be either "
+                f"an integer dimension or None."
+            )
+        if isinstance(in_dim, int) and not isinstance(arg, Tensor):
+            raise ValueError(
+                f"vmap({_get_name(func)}, in_dims={in_dims}, ...)(): "
+                f"Got in_dim={in_dim} for an input but the input is of type "
+                f"{type(arg)}. We cannot vmap over non-Tensor arguments, "
+                f"please use None as the respective in_dim"
+            )
+        if in_dim is not None and (in_dim < 0 or in_dim >= arg.dim()):
+            raise ValueError(
+                f"vmap({_get_name(func)}, in_dims={in_dims}, ...)(): "
+                f"Got in_dim={in_dim} for some input, but that input is a Tensor "
+                f"of dimensionality {arg.dim()} so expected in_dim to satisfy "
+                f"0 <= in_dim < {arg.dim()}."
+            )
+
+    batch_size = _validate_and_get_batch_size(flat_in_dims, flat_args)
+    # See NOTE [Ignored _remove_batch_dim, _add_batch_dim]
+    batched_inputs = [
+        arg if in_dim is None else torch._add_batch_dim(arg, in_dim, vmap_level)
+        for in_dim, arg in zip(flat_in_dims, flat_args)
+    ]
+    return tree_unflatten(batched_inputs, args_spec), batch_size
+
+
+# Undoes the batching (and any batch dimensions) associated with the `vmap_level`.
+def _unwrap_batched(
+    batched_outputs: Union[Tensor, Tuple[Tensor, ...]],
+    out_dims: out_dims_t,
+    vmap_level: int,
+    batch_size: int,
+    func: Callable,
+    allow_none_pass_through: bool = False,
+) -> Tuple:
+    num_outputs = _num_outputs(batched_outputs)
+    out_dims_as_tuple = _as_tuple(
+        out_dims,
+        num_outputs,
+        lambda: f"vmap({_get_name(func)}, ..., out_dims={out_dims}): `out_dims` must "
+        f"have one dim per output (got {num_outputs} outputs) of {_get_name(func)}.",
+    )
+
+    # NOTE [Ignored _remove_batch_dim, _add_batch_dim]
+    # There is something wrong with our type bindings for functions that begin
+    # with '_', see #40397.
+    if isinstance(batched_outputs, Tensor):
+        out_dim = out_dims_as_tuple[0]
+        return torch._remove_batch_dim(batched_outputs, vmap_level, batch_size, out_dim)  # type: ignore[return-value]
+    if allow_none_pass_through:
+        return tuple(
+            (
+                torch._remove_batch_dim(out, vmap_level, batch_size, out_dim)
+                if out is not None
+                else None
+            )
+            for out, out_dim in zip(batched_outputs, out_dims_as_tuple)
+        )
+    else:
+        return tuple(
+            torch._remove_batch_dim(out, vmap_level, batch_size, out_dim)
+            for out, out_dim in zip(batched_outputs, out_dims_as_tuple)
+        )
+
+
+# Checks that `fn` returned one or more Tensors and nothing else.
+# NB: A python function that return multiple arguments returns a single tuple,
+# so we are effectively checking that `outputs` is a single Tensor or a tuple of
+# Tensors.
+def _validate_outputs(outputs: Any, func: Callable) -> None:
+    if isinstance(outputs, Tensor):
+        return
+    if not isinstance(outputs, tuple):
+        raise ValueError(
+            f"vmap({_get_name(func)}, ...): `{_get_name(func)}` must only return "
+            f"Tensors, got type {type(outputs)} as the return."
+        )
+    for idx, output in enumerate(outputs):
+        if isinstance(output, Tensor):
+            continue
+        raise ValueError(
+            f"vmap({_get_name(func)}, ...): `{_get_name(func)}` must only return "
+            f"Tensors, got type {type(output)} for return {idx}."
+        )
+
+
+def _check_out_dims_is_int_or_int_tuple(out_dims: out_dims_t, func: Callable) -> None:
+    if isinstance(out_dims, int):
+        return
+    if not isinstance(out_dims, tuple) or not all(
+        isinstance(out_dim, int) for out_dim in out_dims
+    ):
+        raise ValueError(
+            f"vmap({_get_name(func)}, ..., out_dims={out_dims}): `out_dims` must be "
+            f"an int or a tuple of int representing where in the outputs the "
+            f"vmapped dimension should appear."
+        )
+
+
+def _get_name(func: Callable):
+    if hasattr(func, "__name__"):
+        return func.__name__
+
+    # Not all callables have __name__, in fact, only static functions/methods do.
+    # A callable created via functools.partial or an nn.Module, to name some
+    # examples, doesn't have a __name__.
+    return repr(func)
+
+
+# vmap(func)(inputs) wraps all Tensor inputs to be batched in BatchedTensors,
+# sends those into func, and then unwraps the output BatchedTensors. Operations
+# on BatchedTensors perform the batched operations that the user is asking for.
+def vmap(func: Callable, in_dims: in_dims_t = 0, out_dims: out_dims_t = 0) -> Callable:
+    """
+    Please use torch.vmap instead of this API.
+    """
+    warnings.warn(
+        "Please use torch.vmap instead of torch._vmap_internals.vmap. ",
+        stacklevel=2,
+    )
+    return _vmap(func, in_dims, out_dims)
+
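+# For example, the public torch.vmap API recommended above maps a per-example
+# function over a leading batch dimension (a minimal sketch):
+#
+#     x = torch.randn(8, 3)
+#     y = torch.randn(8, 3)
+#     batched_dot = torch.vmap(torch.dot)  # maps over dim 0 of every input by default
+#     assert batched_dot(x, y).shape == (8,)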
+
+# A version of vmap but without the initial "experimental prototype" warning
+def _vmap(
+    func: Callable,
+    in_dims: in_dims_t = 0,
+    out_dims: out_dims_t = 0,
+    allow_none_pass_through: bool = False,
+) -> Callable:
+    # The `allow_none_pass_through` argument is a temporary workaround and may be removed.
+    # Currently it enables us to wrap calls into the autograd engine (`autograd.grad`),
+    # which may return None if any of the inputs are unused. See the issue discussing this:
+    # https://github.com/facebookresearch/functorch/issues/159.
+    @functools.wraps(func)
+    def wrapped(*args):
+        _check_out_dims_is_int_or_int_tuple(out_dims, func)
+        vmap_level = torch._C._vmapmode_increment_nesting()
+        try:
+            batched_inputs, batch_size = _create_batched_inputs(
+                in_dims, args, vmap_level, func
+            )
+            batched_outputs = func(*batched_inputs)
+            if not allow_none_pass_through:
+                _validate_outputs(batched_outputs, func)
+            return _unwrap_batched(
+                batched_outputs,
+                out_dims,
+                vmap_level,
+                batch_size,
+                func,
+                allow_none_pass_through=allow_none_pass_through,
+            )
+        finally:
+            torch._C._vmapmode_decrement_nesting()
+
+    return wrapped
diff --git a/MLPY/Lib/site-packages/torch/_weights_only_unpickler.py b/MLPY/Lib/site-packages/torch/_weights_only_unpickler.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf2d467522c237df56e7abd4dd7af0274fbb8ea1
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/_weights_only_unpickler.py
@@ -0,0 +1,306 @@
+# Unpickler restricted to loading only state dicts
+# Restrict constructing types to a list defined in _get_allowed_globals()
+# Restrict BUILD operation to `Tensor`, `Parameter` and `OrderedDict` types only
+# Restrict APPEND/APPENDS to `list`
+# In `GLOBALS` operation do not do class lookup by name, but rather rely on dictionary
+# defined by `_get_allowed_globals()` method, that contains:
+# - torch types (Storage, dtypes, Tensor, `torch.Size`),
+# - `torch._utils._rebuild` functions.
+# - `torch.nn.Parameter`
+# - `collections.OrderedDict`
+
+# Based on https://github.com/python/cpython/blob/main/Lib/pickle.py
+# Expected to be useful for loading PyTorch model weights
+# For example:
+# data = urllib.request.urlopen('https://download.pytorch.org/models/resnet50-0676ba61.pth').read()
+# buf = io.BytesIO(data)
+# weights = torch.load(buf, weights_only = True)
+
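+# A round-trip sketch of the intended use, loading a plain state dict through this
+# restricted unpickler (assumes only the standard io module in addition to torch):
+#
+#     import io
+#     buf = io.BytesIO()
+#     torch.save(torch.nn.Linear(2, 2).state_dict(), buf)
+#     buf.seek(0)
+#     state = torch.load(buf, weights_only=True)
+#     assert set(state.keys()) == {"weight", "bias"}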
+import functools as _functools
+from collections import OrderedDict
+from pickle import (
+    APPEND,
+    APPENDS,
+    BINFLOAT,
+    BINGET,
+    BININT,
+    BININT1,
+    BININT2,
+    BINPERSID,
+    BINPUT,
+    BINUNICODE,
+    BUILD,
+    bytes_types,
+    decode_long,
+    EMPTY_DICT,
+    EMPTY_LIST,
+    EMPTY_SET,
+    EMPTY_TUPLE,
+    GLOBAL,
+    LONG1,
+    LONG_BINGET,
+    LONG_BINPUT,
+    MARK,
+    NEWFALSE,
+    NEWOBJ,
+    NEWTRUE,
+    NONE,
+    PROTO,
+    REDUCE,
+    SETITEM,
+    SETITEMS,
+    SHORT_BINSTRING,
+    STOP,
+    TUPLE,
+    TUPLE1,
+    TUPLE2,
+    TUPLE3,
+    UnpicklingError,
+)
+from struct import unpack
+from sys import maxsize
+from typing import Any, Dict, List
+
+import torch
+
+
+# Unpickling machinery
+@_functools.lru_cache(maxsize=1)
+def _get_allowed_globals():
+    rc: Dict[str, Any] = {
+        "collections.OrderedDict": OrderedDict,
+        "torch.nn.parameter.Parameter": torch.nn.Parameter,
+        "torch.serialization._get_layout": torch.serialization._get_layout,
+        "torch.Size": torch.Size,
+        "torch.Tensor": torch.Tensor,
+    }
+    # dtype
+    for t in [
+        torch.complex32,
+        torch.complex64,
+        torch.complex128,
+        torch.float8_e5m2,
+        torch.float8_e4m3fn,
+        torch.float8_e5m2fnuz,
+        torch.float8_e4m3fnuz,
+        torch.float16,
+        torch.float32,
+        torch.float64,
+        torch.int8,
+        torch.int16,
+        torch.int32,
+        torch.int64,
+    ]:
+        rc[str(t)] = t
+    # Tensor classes
+    for tt in torch._tensor_classes:
+        rc[f"{tt.__module__}.{tt.__name__}"] = tt
+    # Storage classes
+    for ts in torch._storage_classes:
+        if ts not in (torch.storage.TypedStorage, torch.storage.UntypedStorage):
+            # Wrap legacy storage types in a dummy class
+            rc[f"{ts.__module__}.{ts.__name__}"] = torch.serialization.StorageType(
+                ts.__name__
+            )
+        else:
+            rc[f"{ts.__module__}.{ts.__name__}"] = ts
+    # Rebuild functions
+    for f in [
+        torch._utils._rebuild_parameter,
+        torch._utils._rebuild_tensor,
+        torch._utils._rebuild_tensor_v2,
+        torch._utils._rebuild_tensor_v3,
+        torch._utils._rebuild_sparse_tensor,
+        torch._utils._rebuild_meta_tensor_no_storage,
+        torch._utils._rebuild_nested_tensor,
+    ]:
+        rc[f"torch._utils.{f.__name__}"] = f
+
+    # Handles Tensor Subclasses, Tensor's with attributes.
+    # NOTE: It calls into above rebuild functions for regular Tensor types.
+    rc["torch._tensor._rebuild_from_type_v2"] = torch._tensor._rebuild_from_type_v2
+    return rc
+
+
+class Unpickler:
+    def __init__(self, file, *, encoding: str = "bytes"):
+        self.encoding = encoding
+        self.readline = file.readline
+        self.read = file.read
+        self.memo: Dict[int, Any] = {}
+
+    def load(self):
+        """Read a pickled object representation from the open file.
+
+        Return the reconstituted object hierarchy specified in the file.
+        """
+        self.metastack = []
+        self.stack: List[Any] = []
+        self.append = self.stack.append
+        read = self.read
+        readline = self.readline
+        while True:
+            key = read(1)
+            if not key:
+                raise EOFError
+            assert isinstance(key, bytes_types)
+            # Risky operators
+            if key[0] == GLOBAL[0]:
+                module = readline()[:-1].decode("utf-8")
+                name = readline()[:-1].decode("utf-8")
+                full_path = f"{module}.{name}"
+                if full_path in _get_allowed_globals():
+                    self.append(_get_allowed_globals()[full_path])
+                else:
+                    raise RuntimeError(f"Unsupported class {full_path}")
+            elif key[0] == NEWOBJ[0]:
+                args = self.stack.pop()
+                cls = self.stack.pop()
+                if cls is not torch.nn.Parameter:
+                    raise RuntimeError(f"Trying to instantiate unsupported class {cls}")
+                self.append(torch.nn.Parameter(*args))
+            elif key[0] == REDUCE[0]:
+                args = self.stack.pop()
+                func = self.stack[-1]
+                if func not in _get_allowed_globals().values():
+                    raise RuntimeError(
+                        f"Trying to call reduce for unrecognized function {func}"
+                    )
+                self.stack[-1] = func(*args)
+            elif key[0] == BUILD[0]:
+                state = self.stack.pop()
+                inst = self.stack[-1]
+                if type(inst) is torch.Tensor:
+                    # Legacy unpickling
+                    inst.set_(*state)
+                elif type(inst) is torch.nn.Parameter:
+                    inst.__setstate__(state)
+                elif type(inst) is OrderedDict:
+                    inst.__dict__.update(state)
+                else:
+                    raise RuntimeError(
+                        f"Can only build Tensor, parameter or dict objects, but got {type(inst)}"
+                    )
+            # Stack manipulation
+            elif key[0] == APPEND[0]:
+                item = self.stack.pop()
+                list_obj = self.stack[-1]
+                if type(list_obj) is not list:
+                    raise RuntimeError(
+                        f"Can only append to lists, but got {type(list_obj)}"
+                    )
+                list_obj.append(item)
+            elif key[0] == APPENDS[0]:
+                items = self.pop_mark()
+                list_obj = self.stack[-1]
+                if type(list_obj) is not list:
+                    raise RuntimeError(
+                        f"Can only extend lists, but got {type(list_obj)}"
+                    )
+                list_obj.extend(items)
+            elif key[0] == SETITEM[0]:
+                (v, k) = (self.stack.pop(), self.stack.pop())
+                self.stack[-1][k] = v
+            elif key[0] == SETITEMS[0]:
+                items = self.pop_mark()
+                for i in range(0, len(items), 2):
+                    self.stack[-1][items[i]] = items[i + 1]
+            elif key[0] == MARK[0]:
+                self.metastack.append(self.stack)
+                self.stack = []
+                self.append = self.stack.append
+            elif key[0] == TUPLE[0]:
+                items = self.pop_mark()
+                self.append(tuple(items))
+            elif key[0] == TUPLE1[0]:
+                self.stack[-1] = (self.stack[-1],)
+            elif key[0] == TUPLE2[0]:
+                self.stack[-2:] = [(self.stack[-2], self.stack[-1])]
+            elif key[0] == TUPLE3[0]:
+                self.stack[-3:] = [(self.stack[-3], self.stack[-2], self.stack[-1])]
+            # Basic types construction
+            elif key[0] == NONE[0]:
+                self.append(None)
+            elif key[0] == NEWFALSE[0]:
+                self.append(False)
+            elif key[0] == NEWTRUE[0]:
+                self.append(True)
+            elif key[0] == EMPTY_TUPLE[0]:
+                self.append(())
+            elif key[0] == EMPTY_LIST[0]:
+                self.append([])
+            elif key[0] == EMPTY_DICT[0]:
+                self.append({})
+            elif key[0] == EMPTY_SET[0]:
+                self.append(set())
+            elif key[0] == BININT[0]:
+                self.append(unpack("<i", read(4))[0])
+            elif key[0] == BINFLOAT[0]:
+                self.append(unpack(">d", self.read(8))[0])
+            elif key[0] == BINUNICODE[0]:
+                strlen = unpack("<I", read(4))[0]
+                if strlen > maxsize:
+                    raise RuntimeError("String is too long")
+                strval = str(read(strlen), "utf-8", "surrogatepass")
+                self.append(strval)
+            elif key[0] == SHORT_BINSTRING[0]:
+                strlen = read(1)[0]
+                strdata = read(strlen)
+                if self.encoding != "bytes":
+                    strdata = strdata.decode(self.encoding, "strict")
+                self.append(strdata)
+            elif key[0] == BINPERSID[0]:
+                pid = self.stack.pop()
+                # Only allow persistent load of storage
+                if type(pid) is not tuple and type(pid) is not int:
+                    raise RuntimeError(
+                        f"persistent_load id must be tuple or int, but got {type(pid)}"
+                    )
+                if (
+                    type(pid) is tuple
+                    and len(pid) > 0
+                    and torch.serialization._maybe_decode_ascii(pid[0]) != "storage"
+                ):
+                    raise RuntimeError(
+                        f"Only persistent_load of storage is allowed, but got {pid[0]}"
+                    )
+                self.append(self.persistent_load(pid))
+            elif key[0] in [BINGET[0], LONG_BINGET[0]]:
+                idx = (read(1) if key[0] == BINGET[0] else unpack("<I", read(4)))[0]
+                self.append(self.memo[idx])
+
+    When entering an autocast-enabled region, Tensors may be any type.
+    You should not call ``half()`` or ``bfloat16()`` on your model(s) or inputs when using autocasting.
+
+    :class:`autocast` should wrap only the forward pass(es) of your network, including the loss
+    computation(s).  Backward passes under autocast are not recommended.
+    Backward ops run in the same type that autocast used for corresponding forward ops.
+
+    Example for CUDA Devices::
+
+        # Creates model and optimizer in default precision
+        model = Net().cuda()
+        optimizer = optim.SGD(model.parameters(), ...)
+
+        for input, target in data:
+            optimizer.zero_grad()
+
+            # Enables autocasting for the forward pass (model + loss)
+            with torch.autocast(device_type="cuda"):
+                output = model(input)
+                loss = loss_fn(output, target)
+
+            # Exits the context manager before backward()
+            loss.backward()
+            optimizer.step()
+
+    See the :ref:`CUDA Automatic Mixed Precision examples<amp-examples>` for usage (along with gradient scaling)
+    in more complex scenarios (e.g., gradient penalty, multiple models/losses, custom autograd functions).
+
+    :class:`autocast` can also be used as a decorator, e.g., on the ``forward`` method of your model::
+
+        class AutocastModel(nn.Module):
+            ...
+            @torch.autocast(device_type="cuda")
+            def forward(self, input):
+                ...
+
+    Floating-point Tensors produced in an autocast-enabled region may be ``float16``.
+    After returning to an autocast-disabled region, using them with floating-point
+    Tensors of different dtypes may cause type mismatch errors.  If so, cast the Tensor(s)
+    produced in the autocast region back to ``float32`` (or other dtype if desired).
+    If a Tensor from the autocast region is already ``float32``, the cast is a no-op,
+    and incurs no additional overhead.
+
+    CUDA Example::
+
+        # Creates some tensors in default dtype (here assumed to be float32)
+        a_float32 = torch.rand((8, 8), device="cuda")
+        b_float32 = torch.rand((8, 8), device="cuda")
+        c_float32 = torch.rand((8, 8), device="cuda")
+        d_float32 = torch.rand((8, 8), device="cuda")
+
+        with torch.autocast(device_type="cuda"):
+            # torch.mm is on autocast's list of ops that should run in float16.
+            # Inputs are float32, but the op runs in float16 and produces float16 output.
+            # No manual casts are required.
+            e_float16 = torch.mm(a_float32, b_float32)
+            # Also handles mixed input types
+            f_float16 = torch.mm(d_float32, e_float16)
+
+        # After exiting autocast, calls f_float16.float() to use with d_float32
+        g_float32 = torch.mm(d_float32, f_float16.float())
+
+    CPU Training Example::
+
+        # Creates model and optimizer in default precision
+        model = Net()
+        optimizer = optim.SGD(model.parameters(), ...)
+
+        for epoch in epochs:
+            for input, target in data:
+                optimizer.zero_grad()
+
+                # Runs the forward pass with autocasting.
+                with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
+                    output = model(input)
+                    loss = loss_fn(output, target)
+
+                loss.backward()
+                optimizer.step()
+
+
+    CPU Inference Example::
+
+        # Creates model in default precision
+        model = Net().eval()
+
+        with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
+            for input in data:
+                # Runs the forward pass with autocasting.
+                output = model(input)
+
+    CPU Inference Example with Jit Trace::
+
+        class TestModel(nn.Module):
+            def __init__(self, input_size, num_classes):
+                super().__init__()
+                self.fc1 = nn.Linear(input_size, num_classes)
+            def forward(self, x):
+                return self.fc1(x)
+
+        input_size = 2
+        num_classes = 2
+        model = TestModel(input_size, num_classes).eval()
+
+        # For now, we suggest to disable the Jit Autocast Pass,
+        # As the issue: https://github.com/pytorch/pytorch/issues/75956
+        torch._C._jit_set_autocast_mode(False)
+
+        with torch.cpu.amp.autocast(cache_enabled=False):
+            model = torch.jit.trace(model, torch.randn(1, input_size))
+        model = torch.jit.freeze(model)
+        # Models Run
+        for _ in range(3):
+            model(torch.randn(1, input_size))
+
+    Type mismatch errors *in* an autocast-enabled region are a bug; if this is what you observe,
+    please file an issue.
+
+    ``autocast(enabled=False)`` subregions can be nested in autocast-enabled regions.
+    Locally disabling autocast can be useful, for example, if you want to force a subregion
+    to run in a particular ``dtype``.  Disabling autocast gives you explicit control over
+    the execution type.  In the subregion, inputs from the surrounding region
+    should be cast to ``dtype`` before use::
+
+        # Creates some tensors in default dtype (here assumed to be float32)
+        a_float32 = torch.rand((8, 8), device="cuda")
+        b_float32 = torch.rand((8, 8), device="cuda")
+        c_float32 = torch.rand((8, 8), device="cuda")
+        d_float32 = torch.rand((8, 8), device="cuda")
+
+        with torch.autocast(device_type="cuda"):
+            e_float16 = torch.mm(a_float32, b_float32)
+            with torch.autocast(device_type="cuda", enabled=False):
+                # Calls e_float16.float() to ensure float32 execution
+                # (necessary because e_float16 was created in an autocasted region)
+                f_float32 = torch.mm(c_float32, e_float16.float())
+
+            # No manual casts are required when re-entering the autocast-enabled region.
+            # torch.mm again runs in float16 and produces float16 output, regardless of input types.
+            g_float16 = torch.mm(d_float32, f_float32)
+
+    The autocast state is thread-local.  If you want it enabled in a new thread, the context manager or decorator
+    must be invoked in that thread.  This affects :class:`torch.nn.DataParallel` and
+    :class:`torch.nn.parallel.DistributedDataParallel` when used with more than one GPU per process
+    (see :ref:`Working with Multiple GPUs<amp-multigpu>`).
+
+    Args:
+        device_type(str, required):  Device type to use. Possible values are: 'cuda', 'cpu', 'xpu', 'ipu', 'hpu' and 'xla'.
+                                     The type is the same as the `type` attribute of a :class:`torch.device`.
+                                     Thus, you may obtain the device type of a tensor using `Tensor.device.type`.
+        enabled(bool, optional):  Whether autocasting should be enabled in the region.
+            Default: ``True``
+        dtype(torch_dtype, optional):  Data type for ops run under autocast, e.g. torch.float16 or torch.bfloat16.
+        cache_enabled(bool, optional):  Whether the weight cache inside autocast should be enabled.
+            Default: ``True``
+    """
+
+    def __init__(
+        self,
+        device_type: str,
+        dtype: Optional[_dtype] = None,
+        enabled: bool = True,
+        cache_enabled: Optional[bool] = None,
+    ):
+        if torch._jit_internal.is_scripting():
+            self._enabled = enabled
+            self.device = device_type
+            self.fast_dtype = dtype
+            # TODO: support get_autocast_gpu/cpu_dtype
+            assert dtype is not None
+            return
+        self.device = device_type
+        self.custom_backend_name = torch._C._get_privateuse1_backend_name()
+        if self.device == "cuda":
+            self.fast_dtype = torch.get_autocast_gpu_dtype()
+        elif self.device == "cpu":
+            self.fast_dtype = torch.get_autocast_cpu_dtype()
+        elif self.device == "xpu":
+            self.fast_dtype = torch.xpu.get_autocast_xpu_dtype()  # type: ignore[attr-defined]
+        elif self.device == "ipu":
+            self.fast_dtype = torch.get_autocast_ipu_dtype()  # type: ignore[attr-defined]
+        elif self.device == "hpu":
+            self.fast_dtype = torch.hpu.get_autocast_hpu_dtype()  # type: ignore[attr-defined]
+        elif self.device == "xla":
+            self.fast_dtype = torch.get_autocast_xla_dtype()  # type: ignore[attr-defined]
+        elif self.device == self.custom_backend_name:
+            necessary_funcs = [
+                "is_autocast_enabled",
+                "set_autocast_enabled",
+                "get_autocast_dtype",
+                "set_autocast_dtype",
+                "get_amp_supported_dtype",
+            ]
+            message = f"Tried to use AMP with the `{self.custom_backend_name}` backend, but the backend has not "
+            message += "registered a module or  the module miss some necessary funcs. The backend should register "
+            message += "a module by `torch._register_device_module`, and the module must have these funcs: \n"
+            message += "`is_autocast_enabled() -> bool`, `set_autocast_enabled(bool) -> None`, "
+            message += "`get_autocast_dtype() -> torch.dtype`, `set_autocast_dtype(torch.dtype) "
+            message += (
+                "-> None` and `get_amp_supported_dtype() -> List[torch.dtype]`. \n"
+            )
+
+            assert hasattr(torch, self.custom_backend_name), message
+            self.custom_device_mod = getattr(torch, self.custom_backend_name)
+            for func in necessary_funcs:
+                assert hasattr(self.custom_device_mod, func), (
+                    message + f"But the func `{func}` is missing. \n"
+                )
+
+            self.fast_dtype = self.custom_device_mod.get_autocast_dtype()
+        else:
+            raise RuntimeError(
+                f"User specified an unsupported autocast device_type '{self.device}'"
+            )
+        self._cache_enabled = torch.is_autocast_cache_enabled()
+        if (
+            enabled
+            and torch.cuda.amp.common.amp_definitely_not_available()
+            and self.device == "cuda"
+        ):
+            warnings.warn(
+                "User provided device_type of 'cuda', but CUDA is not available. Disabling"
+            )
+            enabled = False
+        if dtype is not None:
+            self.fast_dtype = dtype
+        if cache_enabled is not None:
+            self._cache_enabled = cache_enabled
+
+        if self.device == "cpu":
+            supported_dtype = [torch.bfloat16, torch.float16]
+            if self.fast_dtype not in supported_dtype and enabled:
+                error_message = "In CPU autocast, but the target dtype is not supported. Disabling autocast.\n"
+                error_message += "CPU Autocast only supports dtype of "
+                error_message += (
+                    ", ".join(str(dtype) for dtype in supported_dtype) + " currently."
+                )
+                warnings.warn(error_message)
+                enabled = False
+        elif self.device == "xpu":
+            supported_dtype = [torch.bfloat16, torch.float16]
+            if self.fast_dtype not in supported_dtype:
+                error_message = "In XPU autocast, but the target dtype is not supported. Disabling autocast.\n"
+                error_message += "XPU Autocast only supports dtypes of torch.bfloat16 and torch.float16 currently."
+                warnings.warn(error_message)
+                enabled = False
+        elif self.device == "ipu":
+            supported_dtypes = [torch.bfloat16, torch.float16]
+            if self.fast_dtype not in supported_dtypes:
+                error_message = "In IPU autocast, but the target dtype is not supported. Disabling autocast.\n"
+                error_message += "IPU Autocast only supports dtypes of torch.bfloat16 and torch.float16 currently."
+                warnings.warn(error_message)
+                enabled = False
+        elif self.device == "hpu":
+            supported_dtype = [torch.bfloat16, torch.float16]
+            if self.fast_dtype not in supported_dtype:
+                error_message = "In HPU autocast, but the target dtype is not supported. Disabling autocast.\n"
+                error_message += "HPU Autocast only supports dtypes of torch.bfloat16 and torch.float16 currently."
+                warnings.warn(error_message)
+                enabled = False
+        elif self.device == self.custom_backend_name:
+            supported_dtype = self.custom_device_mod.get_amp_supported_dtype()
+            if self.fast_dtype not in supported_dtype:
+                error_message = f"In {self.custom_backend_name} autocast, but the target dtype is not supported. "
+                error_message += f"Disabling autocast.\n {self.custom_backend_name} Autocast only supports dtypes of "
+                error_message += (
+                    ", ".join(str(dtype) for dtype in supported_dtype) + " currently."
+                )
+                warnings.warn(error_message)
+                enabled = False
+        elif self.device == "cuda":
+            if (
+                enabled
+                and self.fast_dtype == torch.bfloat16
+                and not torch.cuda.is_bf16_supported()
+            ):
+                raise RuntimeError(
+                    "Current CUDA Device does not support bfloat16. Please switch dtype to float16."
+                )
+        elif self.device == "xla":
+            supported_dtype = [torch.float16, torch.bfloat16]
+            if self.fast_dtype not in supported_dtype:
+                error_message = "In XLA autocast, but the target dtype is not supported. Disabling autocast.\n"
+                error_message += (
+                    "XLA Autocast only supports dtype of torch.bfloat16 currently."
+                )
+                warnings.warn(error_message)
+                enabled = False
+        self._enabled = enabled
+
+    def __enter__(self):
+        if torch._jit_internal.is_scripting():
+            assert self.fast_dtype is not None
+            return self
+
+        self.prev_cache_enabled = torch.is_autocast_cache_enabled()
+        if self.device == "cpu":
+            self.prev = torch.is_autocast_cpu_enabled()
+            self.prev_fastdtype = torch.get_autocast_cpu_dtype()
+            torch.set_autocast_cpu_enabled(self._enabled)
+            torch.set_autocast_cpu_dtype(self.fast_dtype)  # type: ignore[arg-type]
+            torch.autocast_increment_nesting()
+        elif self.device == "xpu":
+            self.prev = torch.xpu.is_autocast_xpu_enabled()  # type: ignore[attr-defined]
+            self.prev_fastdtype = torch.xpu.get_autocast_xpu_dtype()  # type: ignore[attr-defined]
+            torch.xpu.set_autocast_xpu_enabled(self._enabled)  # type: ignore[attr-defined]
+            torch.xpu.set_autocast_xpu_dtype(self.fast_dtype)  # type: ignore[attr-defined]
+            torch.autocast_increment_nesting()
+        elif self.device == "ipu":
+            self.prev = torch.is_autocast_ipu_enabled()  # type: ignore[attr-defined]
+            self.prev_fastdtype = torch.get_autocast_ipu_dtype()  # type: ignore[attr-defined]
+            torch.set_autocast_ipu_enabled(self._enabled)  # type: ignore[attr-defined]
+            torch.set_autocast_ipu_dtype(self.fast_dtype)  # type: ignore[attr-defined]
+            torch.autocast_increment_nesting()
+        elif self.device == "hpu":
+            self.prev = torch.hpu.is_autocast_hpu_enabled()  # type: ignore[attr-defined]
+            self.prev_fastdtype = torch.hpu.get_autocast_hpu_dtype()  # type: ignore[attr-defined]
+            torch.hpu.set_autocast_hpu_enabled(self._enabled)  # type: ignore[attr-defined]
+            torch.hpu.set_autocast_hpu_dtype(self.fast_dtype)  # type: ignore[attr-defined]
+            torch.autocast_increment_nesting()
+        elif self.device == "xla":
+            self.prev = torch.is_autocast_xla_enabled()  # type: ignore[attr-defined]
+            self.prev_fastdtype = torch.get_autocast_xla_dtype()  # type: ignore[attr-defined]
+            torch.set_autocast_xla_enabled(self._enabled)  # type: ignore[attr-defined]
+            torch.set_autocast_xla_dtype(self.fast_dtype)  # type: ignore[attr-defined]
+            torch.autocast_increment_nesting()
+        elif self.device == self.custom_backend_name:
+            self.prev = self.custom_device_mod.is_autocast_enabled()
+            self.prev_fastdtype = self.custom_device_mod.get_autocast_dtype()
+            self.custom_device_mod.set_autocast_enabled(self._enabled)
+            self.custom_device_mod.set_autocast_dtype(self.fast_dtype)
+            torch.autocast_increment_nesting()
+        else:
+            self.prev = torch.is_autocast_enabled()
+            self.prev_fastdtype = torch.get_autocast_gpu_dtype()
+            torch.set_autocast_gpu_dtype(self.fast_dtype)  # type: ignore[arg-type]
+            torch.set_autocast_enabled(self._enabled)
+            torch.autocast_increment_nesting()
+        torch.set_autocast_cache_enabled(self._cache_enabled)
+
+    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any):  # type: ignore[override]
+        if torch._jit_internal.is_scripting():
+            return
+
+        # Drop the cache when we exit to a nesting level that's outside any instance of autocast.
+        if self.device == "cpu":
+            if torch.autocast_decrement_nesting() == 0:
+                torch.clear_autocast_cache()
+            torch.set_autocast_cpu_enabled(self.prev)
+            torch.set_autocast_cpu_dtype(self.prev_fastdtype)
+        elif self.device == "xpu":
+            if torch.autocast_decrement_nesting() == 0:
+                torch.clear_autocast_cache()
+            torch.xpu.set_autocast_xpu_enabled(self.prev)  # type: ignore[attr-defined]
+            torch.xpu.set_autocast_xpu_dtype(self.prev_fastdtype)  # type: ignore[attr-defined]
+        elif self.device == "ipu":
+            if torch.autocast_decrement_nesting() == 0:
+                torch.clear_autocast_cache()
+            torch.set_autocast_ipu_enabled(self.prev)  # type: ignore[attr-defined]
+            torch.set_autocast_ipu_dtype(self.prev_fastdtype)  # type: ignore[attr-defined]
+        elif self.device == "hpu":
+            if torch.autocast_decrement_nesting() == 0:
+                torch.clear_autocast_cache()
+            torch.hpu.set_autocast_hpu_enabled(self.prev)  # type: ignore[attr-defined]
+            torch.hpu.set_autocast_hpu_dtype(self.prev_fastdtype)  # type: ignore[attr-defined]
+        elif self.device == "xla":
+            if torch.autocast_decrement_nesting() == 0:
+                torch.clear_autocast_cache()
+            torch.set_autocast_xla_enabled(self.prev)  # type: ignore[attr-defined]
+            torch.set_autocast_xla_dtype(self.prev_fastdtype)  # type: ignore[attr-defined]
+        elif self.device == self.custom_backend_name:
+            if torch.autocast_decrement_nesting() == 0:
+                torch.clear_autocast_cache()
+            self.custom_device_mod.set_autocast_enabled(self.prev)
+            self.custom_device_mod.set_autocast_dtype(self.prev_fastdtype)
+        else:
+            if torch.autocast_decrement_nesting() == 0:
+                torch.clear_autocast_cache()
+            torch.set_autocast_enabled(self.prev)
+            torch.set_autocast_gpu_dtype(self.prev_fastdtype)
+        torch.set_autocast_cache_enabled(self.prev_cache_enabled)
+        return False
+
+    def __call__(self, func):
+        if torch._jit_internal.is_scripting():
+            return func
+        return autocast_decorator(self, func)
+
+
+# These functions aren't meant for public usage.
+# They are what we trace into a graph during pre_dispatch tracing
+# when we encounter an autocast context manager.
+def _enter_autocast(*vals):
+    # For pre-dispatch tracing, if a TorchFunction mode is active, we'll want to trace this into a graph.
+    if torch._C._is_torch_function_mode_enabled():
+        return torch.overrides.handle_torch_function(
+            torch.amp._enter_autocast, [], *vals
+        )
+    mode = torch.amp.autocast(*vals)
+    mode.__enter__()
+    return mode
+
+
+def _exit_autocast(mode):
+    if torch._C._is_torch_function_mode_enabled():
+        return torch.overrides.handle_torch_function(torch.amp._exit_autocast, [], mode)
+    mode.__exit__(None, None, None)
diff --git a/MLPY/Lib/site-packages/torch/amp/grad_scaler.py b/MLPY/Lib/site-packages/torch/amp/grad_scaler.py
new file mode 100644
index 0000000000000000000000000000000000000000..c73fab2274046679ed47c267bf57f52705c2d09c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/amp/grad_scaler.py
@@ -0,0 +1,681 @@
+from __future__ import annotations
+
+import inspect
+import warnings
+from collections import abc, defaultdict
+from enum import Enum
+from typing import Any, cast, Dict, Iterable, List, Optional, overload, Tuple, Union
+
+import torch
+
+
+__all__ = ["OptState", "GradScaler"]
+
+
+class _MultiDeviceReplicator:
+    """Lazily serves copies of a tensor to requested devices.
+
+    Copies are cached per-device.
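+
+    A minimal sketch of the intended (internal) use, shown for illustration::
+
+        rep = _MultiDeviceReplicator(torch.full((), 2.0))
+        a = rep.get(torch.device("cpu"))  # first request for a device makes a copy
+        b = rep.get(torch.device("cpu"))  # later requests reuse the cached copy
+        assert a is b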
+    """
+
+    def __init__(self, master_tensor: torch.Tensor) -> None:
+        self.master = master_tensor
+        self._per_device_tensors: Dict[torch.device, torch.Tensor] = {}
+
+    def get(self, device: torch.device) -> torch.Tensor:
+        retval = self._per_device_tensors.get(device, None)
+        if retval is None:
+            retval = self.master.to(device=device, non_blocking=True, copy=True)
+            self._per_device_tensors[device] = retval
+        return retval
+
+
+# Defines default_factory for GradScaler's _per_optimizer_states defaultdict,
+# as well as associated "enum" values.  We prefer defining these at top level because:
+# - Lambdas can't be pickled, so we don't want to supply a lambda as the factory.
+# - Defining READY, UNSCALED, STEPPED and _refresh_per_optimizer_state within GradScaler
+#   causes a circular reference, which we'd rather avoid.
+class OptState(Enum):
+    READY = 0
+    UNSCALED = 1
+    STEPPED = 2
+
+
+def _refresh_per_optimizer_state() -> Dict[str, Any]:
+    return {"stage": OptState.READY, "found_inf_per_device": {}}
+
+
+class GradScaler:
+    """An instance ``scaler`` of :class:`GradScaler`.
+
+    It helps perform the steps of gradient scaling conveniently.
+
+    * ``scaler.scale(loss)`` multiplies a given loss by ``scaler``'s current scale factor.
+    * ``scaler.step(optimizer)`` safely unscales gradients and calls ``optimizer.step()``.
+    * ``scaler.update()`` updates ``scaler``'s scale factor.
+
+    Example::
+
+        # Creates a GradScaler once at the beginning of training.
+        scaler = GradScaler()
+
+        for epoch in epochs:
+            for input, target in data:
+                optimizer.zero_grad()
+                output = model(input)
+                loss = loss_fn(output, target)
+
+                # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
+                scaler.scale(loss).backward()
+
+                # scaler.step() first unscales gradients of the optimizer's params.
+                # If gradients don't contain infs/NaNs, optimizer.step() is then called,
+                # otherwise, optimizer.step() is skipped.
+                scaler.step(optimizer)
+
+                # Updates the scale for next iteration.
+                scaler.update()
+
+    See the :ref:`Automatic Mixed Precision examples<amp-examples>` for usage
+    (along with autocasting) in more complex cases like gradient clipping, gradient accumulation, gradient penalty,
+    and multiple losses/optimizers.
+
+    ``scaler`` dynamically estimates the scale factor each iteration.  To minimize gradient underflow,
+    a large scale factor should be used.  However, ``float16`` values can "overflow" (become inf or NaN) if
+    the scale factor is too large.  Therefore, the optimal scale factor is the largest factor that can be used
+    without incurring inf or NaN gradient values.
+    ``scaler`` approximates the optimal scale factor over time by checking the gradients for infs and NaNs during every
+    ``scaler.step(optimizer)`` (or optional separate ``scaler.unscale_(optimizer)``, see :meth:`unscale_`).
+
+    * If infs/NaNs are found, ``scaler.step(optimizer)`` skips the underlying ``optimizer.step()`` (so the params
+      themselves remain uncorrupted) and ``update()`` multiplies the scale by ``backoff_factor``.
+
+    * If no infs/NaNs are found, ``scaler.step(optimizer)`` runs the underlying ``optimizer.step()`` as usual.
+      If ``growth_interval`` unskipped iterations occur consecutively, ``update()`` multiplies the scale by
+      ``growth_factor``.
+
+    The scale factor often causes infs/NaNs to appear in gradients for the first few iterations as its
+    value calibrates.  ``scaler.step`` will skip the underlying ``optimizer.step()`` for these
+    iterations.  After that, step skipping should occur rarely (once every few hundred or thousand iterations).
+
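+    For intuition, the effect of :meth:`update` on the scale can be sketched as follows
+    (illustrative only; the actual update is performed on-device by ``torch._amp_update_scale_``)::
+
+        if found_inf:  # an inf/NaN gradient was found this iteration
+            scale *= backoff_factor
+            growth_tracker = 0
+        else:
+            growth_tracker += 1
+            if growth_tracker == growth_interval:
+                scale *= growth_factor
+                growth_tracker = 0
+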
+    Args:
+        device (str, optional, default="cuda"): Device type to use. Possible values are: 'cuda' and 'cpu'.
+            The type is the same as the `type` attribute of a :class:`torch.device`.
+            Thus, you may obtain the device type of a tensor using `Tensor.device.type`.
+        init_scale (float, optional, default=2.**16):  Initial scale factor.
+        growth_factor (float, optional, default=2.0):  Factor by which the scale is multiplied during
+            :meth:`update` if no inf/NaN gradients occur for ``growth_interval`` consecutive iterations.
+        backoff_factor (float, optional, default=0.5):  Factor by which the scale is multiplied during
+            :meth:`update` if inf/NaN gradients occur in an iteration.
+        growth_interval (int, optional, default=2000):  Number of consecutive iterations without inf/NaN gradients
+            that must occur for the scale to be multiplied by ``growth_factor``.
+        enabled (bool, optional):  If ``False``, disables gradient scaling. :meth:`step` simply
+            invokes the underlying ``optimizer.step()``, and other methods become no-ops.
+            Default: ``True``
+    """
+
+    def __init__(
+        self,
+        device: str = "cuda",
+        init_scale: float = 2.0**16,
+        growth_factor: float = 2.0,
+        backoff_factor: float = 0.5,
+        growth_interval: int = 2000,
+        enabled: bool = True,
+    ) -> None:
+        self._device = device
+        self._enabled = enabled
+        if self._device == "cuda":
+            if enabled and torch.cuda.amp.common.amp_definitely_not_available():
+                warnings.warn(
+                    "torch.cuda.amp.GradScaler is enabled, but CUDA is not available.  Disabling."
+                )
+                self._enabled = False
+
+        if self._enabled:
+            assert growth_factor > 1.0, "The growth factor must be > 1.0."
+            assert backoff_factor < 1.0, "The backoff factor must be < 1.0."
+
+            self._init_scale = init_scale
+            # self._scale will be lazily initialized during the first call to scale()
+            self._scale: Optional[torch.Tensor] = None
+            self._growth_factor = growth_factor
+            self._backoff_factor = backoff_factor
+            self._growth_interval = growth_interval
+            self._init_growth_tracker = 0
+            # self._growth_tracker will be lazily initialized during the first call to scale()
+            self._growth_tracker: Optional[torch.Tensor] = None
+            self._per_optimizer_states: Dict[int, Dict[str, Any]] = defaultdict(
+                _refresh_per_optimizer_state
+            )
+
+    def _check_scale_growth_tracker(
+        self, funcname: str
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        fix = "This may indicate your script did not use scaler.scale(loss or outputs) earlier in the iteration."
+        assert self._scale is not None, (
+            f"Attempted {funcname} but _scale is None.  " + fix
+        )
+        assert self._growth_tracker is not None, (
+            f"Attempted {funcname} but _growth_tracker is None.  " + fix
+        )
+        return (self._scale, self._growth_tracker)
+
+    def _lazy_init_scale_growth_tracker(self, dev: torch.device) -> None:
+        assert self._growth_tracker is None, "_growth_tracker initialized before _scale"
+        self._scale = torch.full((), self._init_scale, dtype=torch.float32, device=dev)
+        self._growth_tracker = torch.full(
+            (), self._init_growth_tracker, dtype=torch.int32, device=dev
+        )
+
+    @overload
+    def scale(self, outputs: torch.Tensor) -> torch.Tensor:
+        ...
+
+    @overload
+    def scale(self, outputs: List[torch.Tensor]) -> List[torch.Tensor]:
+        ...
+
+    @overload
+    def scale(self, outputs: Tuple[torch.Tensor, ...]) -> Tuple[torch.Tensor, ...]:
+        ...
+
+    @overload
+    def scale(self, outputs: Iterable[torch.Tensor]) -> Iterable[torch.Tensor]:
+        ...
+
+    def scale(
+        self,
+        outputs: Union[torch.Tensor, Iterable[torch.Tensor]],
+    ) -> Union[torch.Tensor, Iterable[torch.Tensor]]:
+        """
+        Multiplies ('scales') a tensor or list of tensors by the scale factor.
+
+        Returns scaled outputs.  If this instance of :class:`GradScaler` is not enabled, outputs are returned
+        unmodified.
+
+        Args:
+            outputs (Tensor or iterable of Tensors):  Outputs to scale.
+        """
+        if not self._enabled:
+            return outputs
+
+        # Short-circuit for the common case.
+        if isinstance(outputs, torch.Tensor):
+            if self._scale is None:
+                self._lazy_init_scale_growth_tracker(outputs.device)
+            assert self._scale is not None
+            return outputs * self._scale.to(device=outputs.device, non_blocking=True)
+
+        # Invoke the more complex machinery only if we're treating multiple outputs.
+        stash: List[
+            _MultiDeviceReplicator
+        ] = []  # holds a reference that can be overwritten by apply_scale
+
+        def apply_scale(val: Union[torch.Tensor, Iterable[torch.Tensor]]):
+            if isinstance(val, torch.Tensor):
+                if len(stash) == 0:
+                    if self._scale is None:
+                        self._lazy_init_scale_growth_tracker(val.device)
+                    assert self._scale is not None
+                    stash.append(_MultiDeviceReplicator(self._scale))
+                return val * stash[0].get(val.device)
+            if isinstance(val, abc.Iterable):
+                iterable = map(apply_scale, val)
+                if isinstance(val, (list, tuple)):
+                    return type(val)(iterable)
+                return iterable
+            raise ValueError("outputs must be a Tensor or an iterable of Tensors")
+
+        return apply_scale(outputs)
+
+    def _unscale_grads_(
+        self,
+        optimizer: torch.optim.Optimizer,
+        inv_scale: torch.Tensor,
+        found_inf: torch.Tensor,
+        allow_fp16: bool,
+    ) -> Dict[torch.device, torch.Tensor]:
+        per_device_inv_scale = _MultiDeviceReplicator(inv_scale)
+        per_device_found_inf = _MultiDeviceReplicator(found_inf)
+
+        # To set up _amp_foreach_non_finite_check_and_unscale_, split grads by device and dtype.
+        # There could be hundreds of grads, so we'd like to iterate through them just once.
+        # However, we don't know their devices or dtypes in advance.
+
+        # https://stackoverflow.com/questions/5029934/defaultdict-of-defaultdict
+        # Google says mypy struggles with defaultdict type annotations.
+        per_device_and_dtype_grads: Dict[
+            torch.device, Dict[torch.dtype, List[torch.Tensor]]
+        ] = defaultdict(lambda: defaultdict(list))
+        with torch.no_grad():
+            for group in optimizer.param_groups:
+                for param in group["params"]:
+                    assert isinstance(param, torch.Tensor)
+                    if param.grad is None:
+                        continue
+                    if (not allow_fp16) and param.grad.dtype == torch.float16:
+                        raise ValueError("Attempting to unscale FP16 gradients.")
+                    if param.grad.is_sparse:
+                        # is_coalesced() == False means the sparse grad has values with duplicate indices.
+                        # coalesce() deduplicates indices and adds all values that have the same index.
+                        # For scaled fp16 values, there's a good chance coalescing will cause overflow,
+                        # so we should check the coalesced _values().
+                        if param.grad.dtype is torch.float16:
+                            param.grad = param.grad.coalesce()
+                        to_unscale = param.grad._values()
+                    else:
+                        to_unscale = param.grad
+
+                    # TODO: is there a way to split by device and dtype without appending in the inner loop?
+                    per_device_and_dtype_grads[to_unscale.device][
+                        to_unscale.dtype
+                    ].append(to_unscale)
+
+            for device, per_dtype_grads in per_device_and_dtype_grads.items():
+                for grads in per_dtype_grads.values():
+                    torch._amp_foreach_non_finite_check_and_unscale_(
+                        grads,
+                        per_device_found_inf.get(device),
+                        per_device_inv_scale.get(device),
+                    )
+
+        return per_device_found_inf._per_device_tensors
+
+    def unscale_(self, optimizer: torch.optim.Optimizer) -> None:
+        """
+        Divides ("unscales") the optimizer's gradient tensors by the scale factor.
+
+        :meth:`unscale_` is optional, serving cases where you need to
+        :ref:`modify or inspect gradients<working-with-unscaled-gradients>`
+        between the backward pass(es) and :meth:`step`.
+        If :meth:`unscale_` is not called explicitly, gradients will be unscaled automatically during :meth:`step`.
+
+        Simple example, using :meth:`unscale_` to enable clipping of unscaled gradients::
+
+            ...
+            scaler.scale(loss).backward()
+            scaler.unscale_(optimizer)
+            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
+            scaler.step(optimizer)
+            scaler.update()
+
+        Args:
+            optimizer (torch.optim.Optimizer):  Optimizer that owns the gradients to be unscaled.
+
+        .. note::
+            :meth:`unscale_` does not incur a CPU-GPU sync.
+
+        .. warning::
+            :meth:`unscale_` should only be called once per optimizer per :meth:`step` call,
+            and only after all gradients for that optimizer's assigned parameters have been accumulated.
+            Calling :meth:`unscale_` twice for a given optimizer between each :meth:`step` triggers a RuntimeError.
+
+        .. warning::
+            :meth:`unscale_` may unscale sparse gradients out of place, replacing the ``.grad`` attribute.
+        """
+        if not self._enabled:
+            return
+
+        self._check_scale_growth_tracker("unscale_")
+
+        optimizer_state = self._per_optimizer_states[id(optimizer)]
+
+        if optimizer_state["stage"] is OptState.UNSCALED:
+            raise RuntimeError(
+                "unscale_() has already been called on this optimizer since the last update()."
+            )
+        elif optimizer_state["stage"] is OptState.STEPPED:
+            raise RuntimeError("unscale_() is being called after step().")
+
+        # FP32 division can be imprecise for certain compile options, so we carry out the reciprocal in FP64.
+        assert self._scale is not None
+        inv_scale = self._scale.double().reciprocal().float()
+        found_inf = torch.full((), 0.0, dtype=torch.float32, device=self._scale.device)
+
+        optimizer_state["found_inf_per_device"] = self._unscale_grads_(
+            optimizer, inv_scale, found_inf, False
+        )
+        optimizer_state["stage"] = OptState.UNSCALED
+
+    def _maybe_opt_step(
+        self,
+        optimizer: torch.optim.Optimizer,
+        optimizer_state: Dict[str, Any],
+        *args: Any,
+        **kwargs: Any,
+    ) -> Optional[float]:
+        retval: Optional[float] = None
+        if not sum(v.item() for v in optimizer_state["found_inf_per_device"].values()):
+            retval = optimizer.step(*args, **kwargs)
+        return retval
+
+    def step(
+        self, optimizer: torch.optim.Optimizer, *args: Any, **kwargs: Any
+    ) -> Optional[float]:
+        """Invoke ``unscale_(optimizer)`` followed by parameter update, if gradients are not infs/NaN.
+
+        :meth:`step` carries out the following two operations:
+
+        1.  Internally invokes ``unscale_(optimizer)`` (unless :meth:`unscale_` was explicitly called for ``optimizer``
+            earlier in the iteration).  As part of the :meth:`unscale_`, gradients are checked for infs/NaNs.
+        2.  If no inf/NaN gradients are found, invokes ``optimizer.step()`` using the unscaled
+            gradients.  Otherwise, ``optimizer.step()`` is skipped to avoid corrupting the params.
+
+        ``*args`` and ``**kwargs`` are forwarded to ``optimizer.step()``.
+
+        Returns the return value of ``optimizer.step(*args, **kwargs)``.
+
+        Args:
+            optimizer (torch.optim.Optimizer):  Optimizer that applies the gradients.
+            args:  Any arguments.
+            kwargs:  Any keyword arguments.
+
+        .. warning::
+            Closure use is not currently supported.
+        """
+        if not self._enabled:
+            return optimizer.step(*args, **kwargs)
+
+        if "closure" in kwargs:
+            raise RuntimeError(
+                "Closure use is not currently supported if GradScaler is enabled."
+            )
+
+        self._check_scale_growth_tracker("step")
+
+        optimizer_state = self._per_optimizer_states[id(optimizer)]
+
+        if optimizer_state["stage"] is OptState.STEPPED:
+            raise RuntimeError(
+                "step() has already been called since the last update()."
+            )
+
+        retval: Optional[float] = None
+
+        if getattr(optimizer, "_step_supports_amp_scaling", False):
+            # This optimizer has customized scale-handling logic, so we can call optimizer.step() directly.
+            # The contract with custom optimizers is that their step() should accept an additional,
+            # optional grad_scaler kwarg.  We append self to the kwargs so the custom optimizer has full information:
+            # it can query its own state, invoke unscale_ on itself, etc.
+            # The contract above is being deprecated to avoid introducing a `grad_scaler: GradScaler` argument
+            # to `Optimizer.step`. The new behavior will instead add two Tensor attributes, `grad_scale`
+            # and `found_inf`, to the passed optimizer so that the optimizer can use them
+            # to skip the parameter update or unscale gradients before updating parameters in
+            # the fused kernel, e.g. `FusedAdamMathFunctor`.
+            # Under the new behavior, `GradScaler._check_inf_per_device` is called here if the state is
+            # `OptState.READY`, while the method is expected to be called on the user's side, i.e. by their optimizers.
+            kwargs_ = kwargs
+            has_grad_scaler_kwarg = (
+                "grad_scaler" in inspect.signature(optimizer.step).parameters
+            )
+            if has_grad_scaler_kwarg:
+                warnings.warn(
+                    "GradScaler is going to stop passing itself as a keyword argument to the passed "
+                    "optimizer. In the near future GradScaler registers `grad_scale: Tensor` and "
+                    "`found_inf: Tensor` to the passed optimizer and let the optimizer use them directly.",
+                    FutureWarning,
+                )
+                kwargs_.update({"grad_scaler": self})
+            else:
+                if optimizer_state["stage"] is OptState.READY:
+                    self._check_inf_per_device(optimizer)
+                scaler = self._get_scale_async()
+                assert scaler is not None
+                found_inf = cast(
+                    torch.Tensor,
+                    sum(
+                        [
+                            t.to(scaler.device, non_blocking=True)
+                            for t in optimizer_state["found_inf_per_device"].values()
+                        ]
+                    ),
+                )
+                optimizer.grad_scale = (  # type: ignore[attr-defined]
+                    None if optimizer_state["stage"] == OptState.UNSCALED else scaler
+                )
+                optimizer.found_inf = found_inf  # type: ignore[attr-defined]
+            retval = optimizer.step(*args, **kwargs_)
+            optimizer_state["stage"] = OptState.STEPPED
+            if not has_grad_scaler_kwarg:
+                del optimizer.grad_scale  # type: ignore[attr-defined]
+                del optimizer.found_inf  # type: ignore[attr-defined]
+            return retval
+
+        if optimizer_state["stage"] is OptState.READY:
+            self.unscale_(optimizer)
+
+        assert (
+            len(optimizer_state["found_inf_per_device"]) > 0
+        ), "No inf checks were recorded for this optimizer."
+
+        retval = self._maybe_opt_step(optimizer, optimizer_state, *args, **kwargs)
+
+        optimizer_state["stage"] = OptState.STEPPED
+
+        return retval
+
+    def update(self, new_scale: Optional[Union[float, torch.Tensor]] = None) -> None:
+        """Update the scale factor.
+
+        If any optimizer steps were skipped, the scale is multiplied by ``backoff_factor``
+        to reduce it. If ``growth_interval`` unskipped iterations occurred consecutively,
+        the scale is multiplied by ``growth_factor`` to increase it.
+
+        Passing ``new_scale`` sets the new scale value manually. (``new_scale`` is not
+        used directly; it is used to fill GradScaler's internal scale tensor. So if
+        ``new_scale`` was a tensor, later in-place changes to that tensor will not further
+        affect the scale GradScaler uses internally.)
+
+        Args:
+            new_scale (float or :class:`torch.Tensor`, optional, default=None):  New scale factor.
+
+        .. warning::
+            :meth:`update` should only be called at the end of the iteration, after ``scaler.step(optimizer)`` has
+            been invoked for all optimizers used this iteration.
+
+        .. warning::
+            For performance reasons, we do not check the scale factor value to avoid synchronizations,
+            so the scale factor is not guaranteed to be above 1. If the scale falls below 1 and/or
+            you are seeing NaNs in your gradients or loss, something is likely wrong. For example,
+            bf16-pretrained models are often incompatible with AMP/fp16 due to differing dynamic ranges.
+        """
+        if not self._enabled:
+            return
+
+        _scale, _growth_tracker = self._check_scale_growth_tracker("update")
+
+        if new_scale is not None:
+            assert self._scale is not None
+            # Accept a new user-defined scale.
+            if isinstance(new_scale, float):
+                self._scale.fill_(new_scale)
+            else:
+                reason = "new_scale should be a float or a 1-element torch.cuda.FloatTensor or \
+                    torch.FloatTensor with requires_grad=False."
+                assert new_scale.device.type == self._device, reason
+                assert new_scale.numel() == 1, reason
+                assert new_scale.requires_grad is False, reason
+                self._scale.copy_(new_scale)
+        else:
+            # Consume shared inf/nan data collected from optimizers to update the scale.
+            # If all found_inf tensors are on the same device as self._scale, this operation is asynchronous.
+            found_infs = [
+                found_inf.to(device=_scale.device, non_blocking=True)
+                for state in self._per_optimizer_states.values()
+                for found_inf in state["found_inf_per_device"].values()
+            ]
+
+            assert len(found_infs) > 0, "No inf checks were recorded prior to update."
+
+            found_inf_combined = found_infs[0]
+            if len(found_infs) > 1:
+                for i in range(1, len(found_infs)):
+                    found_inf_combined += found_infs[i]
+
+            torch._amp_update_scale_(
+                _scale,
+                _growth_tracker,
+                found_inf_combined,
+                self._growth_factor,
+                self._backoff_factor,
+                self._growth_interval,
+            )
+
+        # To prepare for next iteration, clear the data collected from optimizers this iteration.
+        self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)
+
+    def _get_scale_async(self) -> Optional[torch.Tensor]:
+        return self._scale
+
+    def get_scale(self) -> float:
+        """Return a Python float containing the current scale, or 1.0 if scaling is disabled.
+
+        .. warning::
+            :meth:`get_scale` incurs a CPU-GPU sync.
+        """
+        if self._enabled:
+            return (
+                self._init_scale
+                if (scale := self._get_scale_async()) is None
+                else cast(float, scale.item())
+            )
+        return 1.0
+
+    def get_growth_factor(self) -> float:
+        r"""Return a Python float containing the scale growth factor."""
+        return self._growth_factor
+
+    def set_growth_factor(self, new_factor: float) -> None:
+        r"""Set a new scale growth factor.
+
+        Args:
+            new_factor (float):  Value to use as the new scale growth factor.
+        """
+        self._growth_factor = new_factor
+
+    def get_backoff_factor(self) -> float:
+        r"""Return a Python float containing the scale backoff factor."""
+        return self._backoff_factor
+
+    def set_backoff_factor(self, new_factor: float) -> None:
+        r"""Set a new scale backoff factor.
+
+        Args:
+            new_factor (float):  Value to use as the new scale backoff factor.
+        """
+        self._backoff_factor = new_factor
+
+    def get_growth_interval(self) -> int:
+        r"""Return a Python int containing the growth interval."""
+        return self._growth_interval
+
+    def set_growth_interval(self, new_interval: int) -> None:
+        r"""Set a new growth interval.
+
+        Args:
+            new_interval (int):  Value to use as the new growth interval.
+        """
+        self._growth_interval = new_interval
+
+    def _get_growth_tracker(self) -> int:
+        if self._enabled:
+            return (
+                self._init_growth_tracker
+                if self._growth_tracker is None
+                else cast(int, self._growth_tracker.item())
+            )
+        return 0
+
+    def is_enabled(self) -> bool:
+        r"""Return a bool indicating whether this instance is enabled."""
+        return self._enabled
+
+    def state_dict(self) -> Dict[str, Any]:
+        r"""Return the state of the scaler as a :class:`dict`.
+
+        It contains five entries:
+
+        * ``"scale"`` - a Python float containing the current scale
+        * ``"growth_factor"`` - a Python float containing the current growth factor
+        * ``"backoff_factor"`` - a Python float containing the current backoff factor
+        * ``"growth_interval"`` - a Python int containing the current growth interval
+        * ``"_growth_tracker"`` - a Python int containing the number of recent consecutive unskipped steps.
+
+        If this instance is not enabled, returns an empty dict.
+
+        .. note::
+           If you wish to checkpoint the scaler's state after a particular iteration, :meth:`state_dict`
+           should be called after :meth:`update`.
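+
+        A checkpointing sketch (illustrative; ``model``, ``optimizer`` and ``PATH`` are placeholders)::
+
+            checkpoint = {
+                "model": model.state_dict(),
+                "optimizer": optimizer.state_dict(),
+                "scaler": scaler.state_dict(),
+            }
+            torch.save(checkpoint, PATH)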
+        """
+        if self._enabled:
+            return {
+                "scale": self.get_scale(),
+                "growth_factor": self._growth_factor,
+                "backoff_factor": self._backoff_factor,
+                "growth_interval": self._growth_interval,
+                "_growth_tracker": self._get_growth_tracker(),
+            }
+        return {}
+
+    def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
+        r"""Load the scaler state.
+
+        If this instance is disabled, :meth:`load_state_dict` is a no-op.
+
+        Args:
+           state_dict(dict): scaler state.  Should be an object returned from a call to :meth:`state_dict`.
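+
+        A restoring sketch (illustrative; ``PATH`` is a placeholder for a checkpoint
+        saved with :meth:`state_dict`, as in the sketch there)::
+
+            checkpoint = torch.load(PATH)
+            scaler.load_state_dict(checkpoint["scaler"])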
+        """
+        if not self._enabled:
+            return
+
+        if len(state_dict) == 0:
+            raise RuntimeError(
+                "The source state dict is empty, possibly because it was saved "
+                "from a disabled instance of GradScaler."
+            )
+
+        self._init_scale = cast(float, state_dict["scale"])
+        if self._scale is not None:
+            self._scale.fill_(state_dict["scale"])
+        self._growth_factor = cast(float, state_dict["growth_factor"])
+        self._backoff_factor = cast(float, state_dict["backoff_factor"])
+        self._growth_interval = cast(int, state_dict["growth_interval"])
+        self._init_growth_tracker = cast(int, state_dict["_growth_tracker"])
+        if self._growth_tracker is not None:
+            self._growth_tracker.fill_(state_dict["_growth_tracker"])
+
+    def __getstate__(self) -> Dict[str, Any]:
+        state = self.__dict__.copy()
+        if self._enabled:
+            assert len(self._per_optimizer_states) == 0, (
+                "A GradScaler instance may only be pickled at the beginning "
+                "of an iteration, or at the end after scaler.update()."
+            )
+            # Pickling _scale and _growth_tracker Tensors directly triggers
+            # "warnings.warn("pickle support for Storage will be removed in 1.5..."
+            # so instead, we set the unpickled instance up to reinitialize them lazily.
+            state["_init_scale"] = self.get_scale()
+            state["_init_growth_tracker"] = self._get_growth_tracker()
+            state["_scale"] = None
+            state["_growth_tracker"] = None
+        return state
+
+    def __setstate__(self, state: Dict[str, Any]) -> None:
+        self.__dict__.update(state)
+
+    def _check_inf_per_device(self, optimizer: torch.optim.Optimizer) -> Dict[str, Any]:
+        _scale, _ = self._check_scale_growth_tracker("_check_inf_per_device")
+
+        dummy_inv_scale = torch.full((), 1.0, dtype=torch.float32, device=_scale.device)
+        found_inf = torch.full((), 0.0, dtype=torch.float32, device=_scale.device)
+
+        self._per_optimizer_states[id(optimizer)][
+            "found_inf_per_device"
+        ] = self._unscale_grads_(optimizer, dummy_inv_scale, found_inf, True)
+
+        return self._per_optimizer_states[id(optimizer)]["found_inf_per_device"]
+
+    def _found_inf_per_device(self, optimizer: torch.optim.Optimizer) -> Dict[str, Any]:
+        return self._per_optimizer_states[id(optimizer)]["found_inf_per_device"]
diff --git a/MLPY/Lib/site-packages/torch/ao/__init__.py b/MLPY/Lib/site-packages/torch/ao/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d07d31dfa156371aceefffd993bb34afb7042db
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/__init__.py
@@ -0,0 +1,16 @@
+# torch.ao is a package with a lot of interdependencies.
+# We will use lazy import to avoid cyclic dependencies here.
+
+
+__all__ = [
+    "nn",
+    "ns",
+    "quantization",
+    "pruning",
+]
+
+def __getattr__(name):
+    if name in __all__:
+        import importlib
+        return importlib.import_module("." + name, __name__)
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
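+
+# Note: with a module-level __getattr__ (PEP 562), a subpackage such as
+# torch.ao.quantization is imported lazily on first attribute access (unless it
+# was already imported), instead of eagerly when `import torch.ao` runs.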
diff --git a/MLPY/Lib/site-packages/torch/ao/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..33c6e4c4f69972d81ecc4fff7891838b6f7c15eb
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/__init__.py b/MLPY/Lib/site-packages/torch/ao/nn/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..126fe0e0ce0589f3d069296d642445c596024ab6
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/__init__.py
@@ -0,0 +1,19 @@
+# We are exposing all subpackages to the end-user.
+# Because of possible inter-dependencies, we want to avoid cyclic
+# imports, so we implement a lazy version as per
+# https://peps.python.org/pep-0562/
+
+import importlib
+
+__all__ = [
+    "intrinsic",
+    "qat",
+    "quantizable",
+    "quantized",
+    "sparse",
+]
+
+def __getattr__(name):
+    if name in __all__:
+        return importlib.import_module("." + name, __name__)
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cd12db298c1797de70b121da5f62a0b29bf940ae
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/__init__.py b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..19abcc6c5a918bc1cf45618ee4aaa02631cdb11e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/__init__.py
@@ -0,0 +1,36 @@
+from .modules import *  # noqa: F403
+from .modules.fused import _FusedModule  # noqa: F403
+
+# # Subpackages
+# from . import qat  # noqa: F403
+# from . import quantized  # noqa: F403
+
+__all__ = [
+    'ConvBn1d',
+    'ConvBn2d',
+    'ConvBn3d',
+    'ConvBnReLU1d',
+    'ConvBnReLU2d',
+    'ConvBnReLU3d',
+    'ConvReLU1d',
+    'ConvReLU2d',
+    'ConvReLU3d',
+    'LinearReLU',
+    'BNReLU2d',
+    'BNReLU3d',
+    'LinearBn1d',
+    'LinearLeakyReLU',
+    'LinearTanh',
+    'ConvAdd2d',
+    'ConvAddReLU2d',
+]
+
+# We are exposing all subpackages to the end-user.
+# Because of possible inter-dependencies, we want to avoid
+# cyclic imports, so we implement lazy imports
+# as per https://peps.python.org/pep-0562/
+def __getattr__(name):
+    if name in __all__:
+        import importlib
+        return importlib.import_module("." + name, __name__)
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e073ed05116639d374aa96434d646fbafbcac1b8
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/modules/__init__.py b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3256e90e3e1c94f0bba202bf1741adce7e072f23
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/modules/__init__.py
@@ -0,0 +1,38 @@
+from .fused import _FusedModule  # noqa: F401
+from .fused import ConvBn1d
+from .fused import ConvBn2d
+from .fused import ConvBn3d
+from .fused import ConvBnReLU1d
+from .fused import ConvBnReLU2d
+from .fused import ConvBnReLU3d
+from .fused import ConvReLU1d
+from .fused import ConvReLU2d
+from .fused import ConvReLU3d
+from .fused import LinearReLU
+from .fused import BNReLU2d
+from .fused import BNReLU3d
+from .fused import LinearBn1d
+from .fused import LinearLeakyReLU
+from .fused import LinearTanh
+from .fused import ConvAdd2d
+from .fused import ConvAddReLU2d
+
+__all__ = [
+    'ConvBn1d',
+    'ConvBn2d',
+    'ConvBn3d',
+    'ConvBnReLU1d',
+    'ConvBnReLU2d',
+    'ConvBnReLU3d',
+    'ConvReLU1d',
+    'ConvReLU2d',
+    'ConvReLU3d',
+    'LinearReLU',
+    'BNReLU2d',
+    'BNReLU3d',
+    'LinearBn1d',
+    'LinearLeakyReLU',
+    'LinearTanh',
+    'ConvAdd2d',
+    'ConvAddReLU2d',
+]
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/modules/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/modules/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..907df24381036865500530497fa7ad89509a805b
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/modules/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/modules/__pycache__/fused.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/modules/__pycache__/fused.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e0d282ff392c7f7891f7f9656f937ae5a31ccb68
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/modules/__pycache__/fused.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/modules/fused.py b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/modules/fused.py
new file mode 100644
index 0000000000000000000000000000000000000000..36285ab3d4cc107dd23ea3a49617c5b56e4cc366
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/modules/fused.py
@@ -0,0 +1,160 @@
+import torch
+from torch.nn import Conv1d, Conv2d, Conv3d, ReLU, Linear, BatchNorm1d, BatchNorm2d, BatchNorm3d
+from torch.nn.utils.parametrize import type_before_parametrizations
+
+__all__ = ['ConvReLU1d', 'ConvReLU2d', 'ConvReLU3d', 'LinearReLU', 'ConvBn1d', 'ConvBn2d',
+           'ConvBnReLU1d', 'ConvBnReLU2d', 'ConvBn3d', 'ConvBnReLU3d', 'BNReLU2d', 'BNReLU3d',
+           'LinearBn1d', 'LinearLeakyReLU', 'LinearTanh', 'ConvAdd2d', 'ConvAddReLU2d']
+
+# Used for identifying intrinsic modules used in quantization
+class _FusedModule(torch.nn.Sequential):
+    pass
+
+class ConvReLU1d(_FusedModule):
+    r"""This is a sequential container which calls the Conv1d and ReLU modules.
+    During quantization this will be replaced with the corresponding fused module."""
+    def __init__(self, conv, relu):
+        assert type_before_parametrizations(conv) == Conv1d and type_before_parametrizations(relu) == ReLU, \
+            f'Incorrect types for input modules{type_before_parametrizations(conv)}{type_before_parametrizations(relu)}'
+        super().__init__(conv, relu)
+
+class ConvReLU2d(_FusedModule):
+    r"""This is a sequential container which calls the Conv2d and ReLU modules.
+    During quantization this will be replaced with the corresponding fused module."""
+    def __init__(self, conv, relu):
+        assert type_before_parametrizations(conv) == Conv2d and type_before_parametrizations(relu) == ReLU, \
+            f'Incorrect types for input modules{type_before_parametrizations(conv)}{type_before_parametrizations(relu)}'
+        super().__init__(conv, relu)
+
+class ConvReLU3d(_FusedModule):
+    r"""This is a sequential container which calls the Conv3d and ReLU modules.
+    During quantization this will be replaced with the corresponding fused module."""
+    def __init__(self, conv, relu):
+        assert type_before_parametrizations(conv) == Conv3d and type_before_parametrizations(relu) == ReLU, \
+            f'Incorrect types for input modules{type_before_parametrizations(conv)}{type_before_parametrizations(relu)}'
+        super().__init__(conv, relu)
+
+class LinearReLU(_FusedModule):
+    r"""This is a sequential container which calls the Linear and ReLU modules.
+    During quantization this will be replaced with the corresponding fused module."""
+    def __init__(self, linear, relu):
+        assert type_before_parametrizations(linear) == Linear and type_before_parametrizations(relu) == ReLU, \
+            'Incorrect types for input modules{}{}'.format(
+                type_before_parametrizations(linear), type_before_parametrizations(relu))
+        super().__init__(linear, relu)
+
+class ConvBn1d(_FusedModule):
+    r"""This is a sequential container which calls the Conv 1d and Batch Norm 1d modules.
+    During quantization this will be replaced with the corresponding fused module."""
+    def __init__(self, conv, bn):
+        assert type_before_parametrizations(conv) == Conv1d and type_before_parametrizations(bn) == BatchNorm1d, \
+            f'Incorrect types for input modules{type_before_parametrizations(conv)}{type_before_parametrizations(bn)}'
+        super().__init__(conv, bn)
+
+class ConvBn2d(_FusedModule):
+    r"""This is a sequential container which calls the Conv 2d and Batch Norm 2d modules.
+    During quantization this will be replaced with the corresponding fused module."""
+    def __init__(self, conv, bn):
+        assert type_before_parametrizations(conv) == Conv2d and type_before_parametrizations(bn) == BatchNorm2d, \
+            f'Incorrect types for input modules{type_before_parametrizations(conv)}{type_before_parametrizations(bn)}'
+        super().__init__(conv, bn)
+
+class ConvBnReLU1d(_FusedModule):
+    r"""This is a sequential container which calls the Conv 1d, Batch Norm 1d, and ReLU modules.
+    During quantization this will be replaced with the corresponding fused module."""
+    def __init__(self, conv, bn, relu):
+        assert type_before_parametrizations(conv) == Conv1d and type_before_parametrizations(bn) == BatchNorm1d and \
+            type_before_parametrizations(relu) == ReLU, 'Incorrect types for input modules{}{}{}' \
+            .format(type_before_parametrizations(conv), type_before_parametrizations(bn), type_before_parametrizations(relu))
+        super().__init__(conv, bn, relu)
+
+class ConvBnReLU2d(_FusedModule):
+    r"""This is a sequential container which calls the Conv 2d, Batch Norm 2d, and ReLU modules.
+    During quantization this will be replaced with the corresponding fused module."""
+    def __init__(self, conv, bn, relu):
+        assert type_before_parametrizations(conv) == Conv2d and type_before_parametrizations(bn) == BatchNorm2d and \
+            type_before_parametrizations(relu) == ReLU, 'Incorrect types for input modules{}{}{}' \
+            .format(type_before_parametrizations(conv), type_before_parametrizations(bn), type_before_parametrizations(relu))
+        super().__init__(conv, bn, relu)
+
+class ConvBn3d(_FusedModule):
+    r"""This is a sequential container which calls the Conv 3d and Batch Norm 3d modules.
+    During quantization this will be replaced with the corresponding fused module."""
+    def __init__(self, conv, bn):
+        assert type_before_parametrizations(conv) == Conv3d and type_before_parametrizations(bn) == BatchNorm3d, \
+            f'Incorrect types for input modules{type_before_parametrizations(conv)}{type_before_parametrizations(bn)}'
+        super().__init__(conv, bn)
+
+class ConvBnReLU3d(_FusedModule):
+    r"""This is a sequential container which calls the Conv 3d, Batch Norm 3d, and ReLU modules.
+    During quantization this will be replaced with the corresponding fused module."""
+    def __init__(self, conv, bn, relu):
+        assert type_before_parametrizations(conv) == Conv3d and type_before_parametrizations(bn) == BatchNorm3d and \
+            type_before_parametrizations(relu) == ReLU, 'Incorrect types for input modules{}{}{}' \
+            .format(type_before_parametrizations(conv), type_before_parametrizations(bn), type_before_parametrizations(relu))
+        super().__init__(conv, bn, relu)
+
+
+class BNReLU2d(_FusedModule):
+    r"""This is a sequential container which calls the BatchNorm 2d and ReLU modules.
+    During quantization this will be replaced with the corresponding fused module."""
+    def __init__(self, batch_norm, relu):
+        assert type_before_parametrizations(batch_norm) == BatchNorm2d and type_before_parametrizations(relu) == ReLU, \
+            'Incorrect types for input modules{}{}'.format(
+                type_before_parametrizations(batch_norm), type_before_parametrizations(relu))
+        super().__init__(batch_norm, relu)
+
+class BNReLU3d(_FusedModule):
+    r"""This is a sequential container which calls the BatchNorm 3d and ReLU modules.
+    During quantization this will be replaced with the corresponding fused module."""
+    def __init__(self, batch_norm, relu):
+        assert type_before_parametrizations(batch_norm) == BatchNorm3d and type_before_parametrizations(relu) == ReLU, \
+            'Incorrect types for input modules{}{}'.format(
+                type_before_parametrizations(batch_norm), type_before_parametrizations(relu))
+        super().__init__(batch_norm, relu)
+
+
+class LinearBn1d(_FusedModule):
+    r"""This is a sequential container which calls the Linear and BatchNorm1d modules.
+    During quantization this will be replaced with the corresponding fused module."""
+    def __init__(self, linear, bn):
+        assert type_before_parametrizations(linear) == Linear and type_before_parametrizations(bn) == BatchNorm1d, \
+            f'Incorrect types for input modules{type_before_parametrizations(linear)}{type_before_parametrizations(bn)}'
+        super().__init__(linear, bn)
+
+class LinearLeakyReLU(_FusedModule):
+    r"""This is a sequential container which calls the Linear and LeakyReLU modules.
+    During quantization this will be replaced with the corresponding fused module."""
+    def __init__(self, linear, leaky_relu):
+        assert type(linear) == Linear and type(leaky_relu) == torch.nn.LeakyReLU, \
+            f'Incorrect types for input modules{type(linear)}{type(leaky_relu)}'
+        super().__init__(linear, leaky_relu)
+
+class LinearTanh(_FusedModule):
+    r"""This is a sequential container which calls the Linear and Tanh modules.
+    During quantization this will be replaced with the corresponding fused module."""
+    def __init__(self, linear, tanh):
+        assert type(linear) == Linear and type(tanh) == torch.nn.Tanh, \
+            f'Incorrect types for input modules{type(linear)}{type(tanh)}'
+        super().__init__(linear, tanh)
+
+class ConvAdd2d(_FusedModule):
+    r"""This is a sequential container which calls the Conv2d modules with extra Add.
+    During quantization this will be replaced with the corresponding fused module."""
+    def __init__(self, conv, add):
+        super().__init__(conv)
+        self.add = add
+
+    def forward(self, x1, x2):
+        return self.add(self[0](x1), x2)
+
+class ConvAddReLU2d(_FusedModule):
+    r"""This is a sequential container which calls the Conv2d, add, Relu.
+    During quantization this will be replaced with the corresponding fused module."""
+    def __init__(self, conv, add, relu):
+        super().__init__(conv)
+        self.add = add
+        self.relu = relu
+
+    def forward(self, x1, x2):
+        return self.relu(self.add(self[0](x1), x2))
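+
+# Editor's note: an illustrative sketch (not part of the upstream file). The
+# containers above are thin nn.Sequential wrappers used as fusion markers, so
+# they can be built directly from ordinary float modules and run eagerly. The
+# callable passed as ConvAdd2d's `add` argument is assumed here to be torch.add.
+if __name__ == "__main__":
+    fused = ConvBnReLU2d(Conv2d(3, 8, 3, padding=1), BatchNorm2d(8), ReLU())
+    x = torch.randn(1, 3, 16, 16)
+    assert fused(x).shape == (1, 8, 16, 16)    # behaves like nn.Sequential(conv, bn, relu)
+
+    # ConvAdd2d keeps the second operand out of the Sequential chain and applies
+    # the supplied binary op inside forward().
+    conv_add = ConvAdd2d(Conv2d(3, 8, 3, padding=1), torch.add)
+    assert conv_add(x, fused(x)).shape == (1, 8, 16, 16)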
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/qat/__init__.py b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/qat/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebc6df8afce25c62a5707136bc46cab16c49a83c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/qat/__init__.py
@@ -0,0 +1 @@
+from .modules import *  # noqa: F403
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/qat/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/qat/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fb27d182b411f4afab4c5012c8c31fca410ee8d5
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/qat/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/qat/modules/__init__.py b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/qat/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..673e1d6d401d3dde3f49863751c122d1de786e82
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/qat/modules/__init__.py
@@ -0,0 +1,31 @@
+from .linear_relu import LinearReLU
+from .linear_fused import LinearBn1d
+from .conv_fused import (
+    ConvBn1d,
+    ConvBn2d,
+    ConvBn3d,
+    ConvBnReLU1d,
+    ConvBnReLU2d,
+    ConvBnReLU3d,
+    ConvReLU1d,
+    ConvReLU2d,
+    ConvReLU3d,
+    update_bn_stats,
+    freeze_bn_stats,
+)
+
+__all__ = [
+    "LinearReLU",
+    "LinearBn1d",
+    "ConvReLU1d",
+    "ConvReLU2d",
+    "ConvReLU3d",
+    "ConvBn1d",
+    "ConvBn2d",
+    "ConvBn3d",
+    "ConvBnReLU1d",
+    "ConvBnReLU2d",
+    "ConvBnReLU3d",
+    "update_bn_stats",
+    "freeze_bn_stats",
+]
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..557a13bcacc4c6f335bdd06fca42a1822dc29352
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/conv_fused.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/conv_fused.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f5b21a1166ff83ddcaecfb4513b0309b8ade9700
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/conv_fused.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/linear_fused.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/linear_fused.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..553ef90a6ce481bf0553253cd9f1be9959d3b428
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/linear_fused.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/linear_relu.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/linear_relu.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7339710e6e4e4f84876c5b9e208486c1fa17ba1c
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/qat/modules/__pycache__/linear_relu.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/qat/modules/conv_fused.py b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/qat/modules/conv_fused.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd7632734f137998df1e07372ab29f447dd8e2ea
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/qat/modules/conv_fused.py
@@ -0,0 +1,825 @@
+import math
+import torch
+import torch.nn as nn
+import torch.ao.nn.intrinsic as nni
+import torch.ao.nn.qat as nnqat
+import torch.nn.functional as F
+from torch.nn import init
+from torch.nn.utils import fuse_conv_bn_weights
+from torch.nn.modules.utils import _single, _pair, _triple
+from torch.nn.parameter import Parameter
+from typing import TypeVar
+
+__all__ = ['ConvBn1d', 'ConvBnReLU1d', 'ConvReLU1d', 'ConvBn2d', 'ConvBnReLU2d', 'ConvReLU2d', 'ConvBn3d',
+           'ConvBnReLU3d', 'ConvReLU3d', 'update_bn_stats', 'freeze_bn_stats']
+_BN_CLASS_MAP = {
+    1: nn.BatchNorm1d,
+    2: nn.BatchNorm2d,
+    3: nn.BatchNorm3d,
+}
+
+
+MOD = TypeVar('MOD', bound=nn.modules.conv._ConvNd)
+
+
+class _ConvBnNd(nn.modules.conv._ConvNd, nni._FusedModule):
+
+    _version = 2
+    _FLOAT_MODULE = MOD
+
+    def __init__(self,
+                 # ConvNd args
+                 in_channels, out_channels, kernel_size, stride,
+                 padding, dilation, transposed, output_padding,
+                 groups,
+                 bias,
+                 padding_mode,
+                 # BatchNormNd args
+                 # num_features: out_channels
+                 eps=1e-05, momentum=0.1,
+                 # affine: True
+                 # track_running_stats: True
+                 # Args for this module
+                 freeze_bn=False,
+                 qconfig=None,
+                 dim=2):
+        nn.modules.conv._ConvNd.__init__(self, in_channels, out_channels, kernel_size,
+                                         stride, padding, dilation, transposed,
+                                         output_padding, groups, False, padding_mode)
+        assert qconfig, 'qconfig must be provided for QAT module'
+        self.qconfig = qconfig
+        self.freeze_bn = freeze_bn if self.training else True
+        self.bn = _BN_CLASS_MAP[dim](out_channels, eps, momentum, True, True)
+        self.weight_fake_quant = self.qconfig.weight()
+        if bias:
+            self.bias = Parameter(torch.empty(out_channels))
+        else:
+            self.register_parameter('bias', None)
+        self.reset_bn_parameters()
+
+        # this needs to be called after reset_bn_parameters,
+        # as they modify the same state
+        if self.training:
+            if freeze_bn:
+                self.freeze_bn_stats()
+            else:
+                self.update_bn_stats()
+        else:
+            self.freeze_bn_stats()
+
+        self._enable_slow_path_for_better_numerical_stability = False
+
+    def reset_running_stats(self):
+        self.bn.reset_running_stats()
+
+    def reset_bn_parameters(self):
+        self.bn.reset_running_stats()
+        init.uniform_(self.bn.weight)
+        init.zeros_(self.bn.bias)
+        # note: below is actually for conv, not BN
+        if self.bias is not None:
+            fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
+            bound = 1 / math.sqrt(fan_in)
+            init.uniform_(self.bias, -bound, bound)
+
+    def reset_parameters(self):
+        super().reset_parameters()
+
+    def update_bn_stats(self):
+        self.freeze_bn = False
+        self.bn.training = True
+        return self
+
+    def freeze_bn_stats(self):
+        self.freeze_bn = True
+        self.bn.training = False
+        return self
+
+    def _forward(self, input):
+        if self._enable_slow_path_for_better_numerical_stability:
+            return self._forward_slow(input)
+        return self._forward_approximate(input)
+
+    def _forward_approximate(self, input):
+        """Approximated method to fuse conv and bn. It requires only one forward pass.
+        conv_orig = conv / scale_factor where scale_factor = bn.weight / running_std
+        """
+        assert self.bn.running_var is not None
+        running_std = torch.sqrt(self.bn.running_var + self.bn.eps)
+        scale_factor = self.bn.weight / running_std
+        weight_shape = [1] * len(self.weight.shape)
+        weight_shape[0] = -1
+        bias_shape = [1] * len(self.weight.shape)
+        bias_shape[1] = -1
+        scaled_weight = self.weight_fake_quant(self.weight * scale_factor.reshape(weight_shape))
+        # using zero bias here since the bias for original conv
+        # will be added later
+        if self.bias is not None:
+            zero_bias = torch.zeros_like(self.bias, dtype=input.dtype)
+        else:
+            zero_bias = torch.zeros(self.out_channels, device=scaled_weight.device, dtype=input.dtype)
+        conv = self._conv_forward(input, scaled_weight, zero_bias)
+        conv_orig = conv / scale_factor.reshape(bias_shape)
+        if self.bias is not None:
+            conv_orig = conv_orig + self.bias.reshape(bias_shape)
+        conv = self.bn(conv_orig)
+        return conv
+
+    def _forward_slow(self, input):
+        """
+        A more accurate but slow method to compute conv bn fusion, following https://arxiv.org/pdf/1806.08342.pdf
+        It requires two forward passes but handles the case bn.weight == 0
+
+        Conv: Y = WX + B_c
+        Conv without bias: Y0 = WX = Y - B_c, Y = Y0 + B_c
+
+        Batch statistics:
+          mean_Y = Y.mean()
+                 = Y0.mean() + B_c
+          var_Y = (Y - mean_Y)^2.mean()
+                = (Y0 - Y0.mean())^2.mean()
+        BN (r: bn.weight, beta: bn.bias):
+          Z = r * (Y - mean_Y) / sqrt(var_Y + eps) + beta
+            = r * (Y0 - Y0.mean()) / sqrt(var_Y + eps) + beta
+
+        Fused Conv BN training (std_Y = sqrt(var_Y + eps)):
+          Z = (r * W / std_Y) * X + r * (B_c - mean_Y) / std_Y + beta
+            = (r * W / std_Y) * X - r * Y0.mean() / std_Y + beta
+
+        Fused Conv BN inference (running_std = sqrt(running_var + eps)):
+          Z = (r * W / running_std) * X - r * (running_mean - B_c) / running_std + beta
+
+        QAT with fused conv bn:
+          Z_train = fake_quant(r * W / running_std) * X * (running_std / std_Y) - r * Y0.mean() / std_Y + beta
+                  = conv(X, fake_quant(r * W / running_std)) * (running_std / std_Y) - r * Y0.mean() / std_Y + beta
+          Z_inference = conv(X, fake_quant(r * W / running_std)) - r * (running_mean - B_c) / running_std + beta
+        """
+
+        assert self.bn.running_var is not None
+        assert self.bn.running_mean is not None
+
+        # using zero bias here since the bias for original conv
+        # will be added later
+        zero_bias = torch.zeros(self.out_channels, device=self.weight.device, dtype=input.dtype)
+
+        weight_shape = [1] * len(self.weight.shape)
+        weight_shape[0] = -1
+        bias_shape = [1] * len(self.weight.shape)
+        bias_shape[1] = -1
+
+        if self.bn.training:
+            # needed to compute batch mean/std
+            conv_out = self._conv_forward(input, self.weight, zero_bias)
+            # update bn statistics
+            with torch.no_grad():
+                conv_out_bias = (
+                    conv_out if self.bias is None else conv_out + self.bias.reshape(bias_shape)
+                )
+                self.bn(conv_out_bias)
+
+        # fused conv + bn without bias using bn running statistics
+        running_std = torch.sqrt(self.bn.running_var + self.bn.eps)
+        scale_factor = self.bn.weight / running_std
+        scaled_weight = self.weight_fake_quant(
+            self.weight * scale_factor.reshape(weight_shape)
+        )
+        # fused conv without bias for inference: (r * W / running_std) * X
+        conv_bn = self._conv_forward(input, scaled_weight, zero_bias)
+
+        if self.bn.training:
+            avg_dims = [0] + list(range(2, len(self.weight.shape)))
+            batch_mean = conv_out.mean(avg_dims)  # type: ignore[possibly-undefined]
+            batch_var = torch.square(conv_out - batch_mean.reshape(bias_shape)).mean(
+                avg_dims
+            )
+            batch_std = torch.sqrt(batch_var + self.bn.eps)
+
+            # scale to use batch std in training mode
+            # conv(X, r * W / std_Y) = conv(X, r * W / running_std) * (running_std / std_Y)
+            unscale_factor = running_std / batch_std
+            conv_bn *= unscale_factor.reshape(bias_shape)
+
+            fused_mean = batch_mean
+            fused_std = batch_std
+        else:
+            fused_mean = self.bn.running_mean - (self.bias if self.bias is not None else 0)
+            fused_std = running_std
+
+        # fused bias = beta - r * mean / std
+        fused_bias = self.bn.bias - self.bn.weight * fused_mean / fused_std
+        conv_bn += fused_bias.reshape(bias_shape)
+
+        # HACK to let conv bias participate in loss to avoid DDP error (parameters
+        #   were not used in producing loss)
+        if self.bias is not None:
+            conv_bn += (self.bias - self.bias).reshape(bias_shape)
+
+        return conv_bn
+
+    def extra_repr(self):
+        # TODO(jerryzh): extend
+        return super().extra_repr()
+
+    def forward(self, input):
+        return self._forward(input)
+
+    def train(self, mode=True):
+        """
+        Batchnorm's training behavior is controlled by the self.training flag. Prevent
+        changing it if BN is frozen. This makes sure that calling `model.train()`
+        on a model with a frozen BN will behave properly.
+        """
+        self.training = mode
+        if not self.freeze_bn:
+            for module in self.children():
+                module.train(mode)
+        return self
+
+    # ===== Serialization version history =====
+    #
+    # Version 1/None
+    #   self
+    #   |--- weight : Tensor
+    #   |--- bias : Tensor
+    #   |--- gamma : Tensor
+    #   |--- beta : Tensor
+    #   |--- running_mean : Tensor
+    #   |--- running_var : Tensor
+    #   |--- num_batches_tracked : Tensor
+    #
+    # Version 2
+    #   self
+    #   |--- weight : Tensor
+    #   |--- bias : Tensor
+    #   |--- bn : Module
+    #        |--- weight : Tensor (moved from v1.self.gamma)
+    #        |--- bias : Tensor (moved from v1.self.beta)
+    #        |--- running_mean : Tensor (moved from v1.self.running_mean)
+    #        |--- running_var : Tensor (moved from v1.self.running_var)
+    #        |--- num_batches_tracked : Tensor (moved from v1.self.num_batches_tracked)
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
+        version = local_metadata.get('version', None)
+        if version is None or version == 1:
+            # BN related parameters and buffers were moved into the BN module for v2
+            v2_to_v1_names = {
+                'bn.weight': 'gamma',
+                'bn.bias': 'beta',
+                'bn.running_mean': 'running_mean',
+                'bn.running_var': 'running_var',
+                'bn.num_batches_tracked': 'num_batches_tracked',
+            }
+            for v2_name, v1_name in v2_to_v1_names.items():
+                if prefix + v1_name in state_dict:
+                    state_dict[prefix + v2_name] = state_dict[prefix + v1_name]
+                    state_dict.pop(prefix + v1_name)
+                elif prefix + v2_name in state_dict:
+                    # there was a brief period where forward compatibility
+                    # for this module was broken (between
+                    # https://github.com/pytorch/pytorch/pull/38478
+                    # and https://github.com/pytorch/pytorch/pull/38820)
+                    # and modules emitted the v2 state_dict format while
+                    # specifying that version == 1. This patches the forward
+                    # compatibility issue by allowing the v2 style entries to
+                    # be used.
+                    pass
+                elif strict:
+                    missing_keys.append(prefix + v2_name)
+
+        super()._load_from_state_dict(state_dict, prefix, local_metadata, strict,
+                                      missing_keys, unexpected_keys, error_msgs)
+
+    @classmethod
+    def from_float(cls, mod):
+        r"""Create a qat module from a float module or qparams_dict
+
+            Args: `mod` a float module, either produced by torch.ao.quantization utilities
+            or directly from the user
+        """
+        # The ignore is because _FLOAT_MODULE is a TypeVar here where the bound
+        # has no __name__ (code is fine though)
+        assert type(mod) == cls._FLOAT_MODULE, 'qat.' + cls.__name__ + '.from_float only works for ' + \
+            cls._FLOAT_MODULE.__name__  # type: ignore[attr-defined]
+        assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined'
+        assert mod.qconfig, 'Input float module must have a valid qconfig'
+        qconfig = mod.qconfig
+        conv, bn = mod[0], mod[1]
+        qat_convbn = cls(conv.in_channels, conv.out_channels, conv.kernel_size,
+                         conv.stride, conv.padding, conv.dilation,
+                         conv.groups, conv.bias is not None,
+                         conv.padding_mode,
+                         bn.eps, bn.momentum,
+                         False,
+                         qconfig)
+        qat_convbn.weight = conv.weight
+        qat_convbn.bias = conv.bias
+        qat_convbn.bn.weight = bn.weight
+        qat_convbn.bn.bias = bn.bias
+        qat_convbn.bn.running_mean = bn.running_mean
+        qat_convbn.bn.running_var = bn.running_var
+        # mypy error: Cannot determine type of 'num_batches_tracked'
+        qat_convbn.bn.num_batches_tracked = bn.num_batches_tracked  # type: ignore[has-type]
+        return qat_convbn
+
+    def to_float(self):
+        cls = type(self)
+        conv = cls._FLOAT_CONV_MODULE(  # type: ignore[attr-defined]
+            self.in_channels,
+            self.out_channels,
+            self.kernel_size,
+            self.stride,
+            self.padding,
+            self.dilation,
+            self.groups,
+            self.bias is not None,
+            self.padding_mode)
+        conv.weight = torch.nn.Parameter(self.weight.detach())
+        if self.bias is not None:
+            conv.bias = torch.nn.Parameter(self.bias.detach())
+
+        if cls._FLOAT_BN_MODULE:  # type: ignore[attr-defined]
+            # fuse bn into conv
+            assert self.bn.running_var is not None and self.bn.running_mean is not None
+            conv.weight, conv.bias = fuse_conv_bn_weights(
+                conv.weight,
+                conv.bias,
+                self.bn.running_mean,
+                self.bn.running_var,
+                self.bn.eps,
+                self.bn.weight,
+                self.bn.bias
+            )
+
+        if cls._FLOAT_RELU_MODULE:  # type: ignore[attr-defined]
+            modules = []
+            modules.append(conv)
+            relu = cls._FLOAT_RELU_MODULE()  # type: ignore[attr-defined]
+            modules.append(relu)
+            conv_relu = cls._FUSED_FLOAT_MODULE(*modules)  # type: ignore[attr-defined]
+            conv_relu.train(self.training)
+            return conv_relu
+        else:
+            conv.train(self.training)
+            return conv
+
+class ConvBn1d(_ConvBnNd, nn.Conv1d):
+    r"""
+    A ConvBn1d module is a module fused from Conv1d and BatchNorm1d,
+    attached with FakeQuantize modules for weight,
+    used in quantization aware training.
+
+    We combined the interface of :class:`torch.nn.Conv1d` and
+    :class:`torch.nn.BatchNorm1d`.
+
+    Similar to :class:`torch.nn.Conv1d`, with FakeQuantize modules initialized
+    to default.
+
+    Attributes:
+        freeze_bn:
+        weight_fake_quant: fake quant module for weight
+
+    """
+    _FLOAT_BN_MODULE = nn.BatchNorm1d
+    _FLOAT_RELU_MODULE: None = None
+    _FLOAT_MODULE = nni.ConvBn1d
+    _FLOAT_CONV_MODULE = nn.Conv1d
+
+    def __init__(self,
+                 # Conv1d args
+                 in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, dilation=1, groups=1,
+                 bias=None,
+                 padding_mode='zeros',
+                 # BatchNorm1d args
+                 # num_features: out_channels
+                 eps=1e-05, momentum=0.1,
+                 # affine: True
+                 # track_running_stats: True
+                 # Args for this module
+                 freeze_bn=False,
+                 qconfig=None):
+        kernel_size = _single(kernel_size)
+        stride = _single(stride)
+        padding = _single(padding)
+        dilation = _single(dilation)
+        _ConvBnNd.__init__(self, in_channels, out_channels, kernel_size, stride,
+                           padding, dilation, False, _single(0), groups, bias, padding_mode,
+                           eps, momentum, freeze_bn, qconfig, dim=1)
+
+class ConvBnReLU1d(ConvBn1d):
+    r"""
+    A ConvBnReLU1d module is a module fused from Conv1d, BatchNorm1d and ReLU,
+    attached with FakeQuantize modules for weight,
+    used in quantization aware training.
+
+    We combined the interface of :class:`torch.nn.Conv1d` and
+    :class:`torch.nn.BatchNorm1d` and :class:`torch.nn.ReLU`.
+
+    Similar to `torch.nn.Conv1d`, with FakeQuantize modules initialized to
+    default.
+
+    Attributes:
+        weight_fake_quant: fake quant module for weight
+
+    """
+    # base class defines _FLOAT_MODULE as "ConvBn1d"
+    _FLOAT_MODULE = nni.ConvBnReLU1d  # type: ignore[assignment]
+    _FLOAT_CONV_MODULE = nn.Conv1d
+    _FLOAT_BN_MODULE = nn.BatchNorm1d
+    _FLOAT_RELU_MODULE = nn.ReLU  # type: ignore[assignment]
+    # module class after fusing bn into conv
+    _FUSED_FLOAT_MODULE = nni.ConvReLU1d
+
+    def __init__(self,
+                 # Conv1d args
+                 in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, dilation=1, groups=1,
+                 bias=None,
+                 padding_mode='zeros',
+                 # BatchNorm1d args
+                 # num_features: out_channels
+                 eps=1e-05, momentum=0.1,
+                 # affine: True
+                 # track_running_stats: True
+                 # Args for this module
+                 freeze_bn=False,
+                 qconfig=None):
+        super().__init__(in_channels, out_channels, kernel_size, stride,
+                         padding, dilation, groups, bias,
+                         padding_mode, eps, momentum,
+                         freeze_bn,
+                         qconfig)
+
+    def forward(self, input):
+        return F.relu(ConvBn1d._forward(self, input))
+
+    @classmethod
+    def from_float(cls, mod):
+        return super().from_float(mod)
+
+class ConvReLU1d(nnqat.Conv1d, nni._FusedModule):
+    r"""A ConvReLU1d module is a fused module of Conv1d and ReLU, attached with
+    FakeQuantize modules for weight for
+    quantization aware training.
+
+    We combined the interface of :class:`~torch.nn.Conv1d` and
+    :class:`~torch.nn.BatchNorm1d`.
+
+    Attributes:
+        weight_fake_quant: fake quant module for weight
+
+    """
+    _FLOAT_MODULE = nni.ConvReLU1d  # type: ignore[assignment]
+    _FLOAT_CONV_MODULE = nn.Conv1d
+    _FLOAT_BN_MODULE: None = None
+    _FLOAT_RELU_MODULE = nn.ReLU
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, dilation=1, groups=1,
+                 bias=True, padding_mode='zeros',
+                 qconfig=None):
+        super().__init__(in_channels, out_channels, kernel_size,
+                         stride=stride, padding=padding, dilation=dilation,
+                         groups=groups, bias=bias, padding_mode=padding_mode,
+                         qconfig=qconfig)
+        assert qconfig, 'qconfig must be provided for QAT module'
+        self.qconfig = qconfig
+        self.weight_fake_quant = self.qconfig.weight()
+
+    def forward(self, input):
+        return F.relu(
+            self._conv_forward(input, self.weight_fake_quant(self.weight), self.bias))
+
+    @classmethod
+    def from_float(cls, mod):
+        return super().from_float(mod)
+
+class ConvBn2d(_ConvBnNd, nn.Conv2d):
+    r"""
+    A ConvBn2d module is a module fused from Conv2d and BatchNorm2d,
+    attached with FakeQuantize modules for weight,
+    used in quantization aware training.
+
+    We combined the interface of :class:`torch.nn.Conv2d` and
+    :class:`torch.nn.BatchNorm2d`.
+
+    Similar to :class:`torch.nn.Conv2d`, with FakeQuantize modules initialized
+    to default.
+
+    Attributes:
+        freeze_bn:
+        weight_fake_quant: fake quant module for weight
+
+    """
+    _FLOAT_MODULE = nni.ConvBn2d
+    _FLOAT_CONV_MODULE = nn.Conv2d
+    _FLOAT_BN_MODULE = nn.BatchNorm2d
+    _FLOAT_RELU_MODULE: None = None
+
+    def __init__(self,
+                 # ConvNd args
+                 in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, dilation=1, groups=1,
+                 bias=None,
+                 padding_mode='zeros',
+                 # BatchNorm2d args
+                 # num_features: out_channels
+                 eps=1e-05, momentum=0.1,
+                 # affine: True
+                 # track_running_stats: True
+                 # Args for this module
+                 freeze_bn=False,
+                 qconfig=None):
+        kernel_size = _pair(kernel_size)
+        stride = _pair(stride)
+        padding = _pair(padding)
+        dilation = _pair(dilation)
+        _ConvBnNd.__init__(self, in_channels, out_channels, kernel_size, stride,
+                           padding, dilation, False, _pair(0), groups, bias, padding_mode,
+                           eps, momentum, freeze_bn, qconfig, dim=2)
+
+class ConvBnReLU2d(ConvBn2d):
+    r"""
+    A ConvBnReLU2d module is a module fused from Conv2d, BatchNorm2d and ReLU,
+    attached with FakeQuantize modules for weight,
+    used in quantization aware training.
+
+    We combined the interface of :class:`torch.nn.Conv2d` and
+    :class:`torch.nn.BatchNorm2d` and :class:`torch.nn.ReLU`.
+
+    Similar to `torch.nn.Conv2d`, with FakeQuantize modules initialized to
+    default.
+
+    Attributes:
+        weight_fake_quant: fake quant module for weight
+
+    """
+    # base class defines _FLOAT_MODULE as "ConvBn2d"
+    _FLOAT_MODULE = nni.ConvBnReLU2d  # type: ignore[assignment]
+    _FLOAT_CONV_MODULE = nn.Conv2d
+    _FLOAT_BN_MODULE = nn.BatchNorm2d
+    _FLOAT_RELU_MODULE = nn.ReLU  # type: ignore[assignment]
+    # module class after fusing bn into conv
+    _FUSED_FLOAT_MODULE = nni.ConvReLU2d
+
+    def __init__(self,
+                 # Conv2d args
+                 in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, dilation=1, groups=1,
+                 bias=None,
+                 padding_mode='zeros',
+                 # BatchNorm2d args
+                 # num_features: out_channels
+                 eps=1e-05, momentum=0.1,
+                 # affine: True
+                 # track_running_stats: True
+                 # Args for this module
+                 freeze_bn=False,
+                 qconfig=None):
+        super().__init__(in_channels, out_channels, kernel_size, stride,
+                         padding, dilation, groups, bias,
+                         padding_mode, eps, momentum,
+                         freeze_bn,
+                         qconfig)
+
+    def forward(self, input):
+        return F.relu(ConvBn2d._forward(self, input))
+
+    @classmethod
+    def from_float(cls, mod):
+        return super().from_float(mod)
+
+class ConvReLU2d(nnqat.Conv2d, nni._FusedModule):
+    r"""A ConvReLU2d module is a fused module of Conv2d and ReLU, attached with
+    FakeQuantize modules for weight for
+    quantization aware training.
+
+    We combined the interface of :class:`~torch.nn.Conv2d` and
+    :class:`~torch.nn.BatchNorm2d`.
+
+    Attributes:
+        weight_fake_quant: fake quant module for weight
+
+    """
+    _FLOAT_MODULE = nni.ConvReLU2d  # type: ignore[assignment]
+    _FLOAT_CONV_MODULE = nn.Conv2d
+    _FLOAT_BN_MODULE: None = None
+    _FLOAT_RELU_MODULE = nn.ReLU
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, dilation=1, groups=1,
+                 bias=True, padding_mode='zeros',
+                 qconfig=None):
+        super().__init__(in_channels, out_channels, kernel_size,
+                         stride=stride, padding=padding, dilation=dilation,
+                         groups=groups, bias=bias, padding_mode=padding_mode,
+                         qconfig=qconfig)
+        assert qconfig, 'qconfig must be provided for QAT module'
+        self.qconfig = qconfig
+        self.weight_fake_quant = self.qconfig.weight()
+
+    def forward(self, input):
+        return F.relu(
+            self._conv_forward(input, self.weight_fake_quant(self.weight), self.bias))
+
+    @classmethod
+    def from_float(cls, mod):
+        return super().from_float(mod)
+
+class ConvBn3d(_ConvBnNd, nn.Conv3d):
+    r"""
+    A ConvBn3d module is a module fused from Conv3d and BatchNorm3d,
+    attached with FakeQuantize modules for weight,
+    used in quantization aware training.
+
+    We combined the interface of :class:`torch.nn.Conv3d` and
+    :class:`torch.nn.BatchNorm3d`.
+
+    Similar to :class:`torch.nn.Conv3d`, with FakeQuantize modules initialized
+    to default.
+
+    Attributes:
+        freeze_bn:
+        weight_fake_quant: fake quant module for weight
+
+    """
+    _FLOAT_MODULE = nni.ConvBn3d
+    _FLOAT_CONV_MODULE = nn.Conv3d
+    _FLOAT_BN_MODULE = nn.BatchNorm3d
+    _FLOAT_RELU_MODULE: None = None
+
+    def __init__(
+        self,
+        # ConvNd args
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=None,
+        padding_mode="zeros",
+        # BatchNorm3d args
+        # num_features: out_channels
+        eps=1e-05,
+        momentum=0.1,
+        # affine: True
+        # track_running_stats: True
+        # Args for this module
+        freeze_bn=False,
+        qconfig=None,
+    ):
+        kernel_size = _triple(kernel_size)
+        stride = _triple(stride)
+        padding = _triple(padding)
+        dilation = _triple(dilation)
+        _ConvBnNd.__init__(
+            self,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            False,
+            _triple(0),
+            groups,
+            bias,
+            padding_mode,
+            eps,
+            momentum,
+            freeze_bn,
+            qconfig,
+            dim=3,
+        )
+
+class ConvBnReLU3d(ConvBn3d):
+    r"""
+    A ConvBnReLU3d module is a module fused from Conv3d, BatchNorm3d and ReLU,
+    attached with FakeQuantize modules for weight,
+    used in quantization aware training.
+
+    We combined the interface of :class:`torch.nn.Conv3d` and
+    :class:`torch.nn.BatchNorm3d` and :class:`torch.nn.ReLU`.
+
+    Similar to `torch.nn.Conv3d`, with FakeQuantize modules initialized to
+    default.
+
+    Attributes:
+        weight_fake_quant: fake quant module for weight
+
+    """
+    _FLOAT_MODULE = nni.ConvBnReLU3d  # type: ignore[assignment]
+    _FLOAT_CONV_MODULE = nn.Conv3d
+    _FLOAT_BN_MODULE = nn.BatchNorm3d
+    _FLOAT_RELU_MODULE = nn.ReLU  # type: ignore[assignment]
+    # module class after fusing bn into conv
+    _FUSED_FLOAT_MODULE = nni.ConvReLU3d
+
+    def __init__(
+        self,
+        # Conv3d args
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=None,
+        padding_mode="zeros",
+        # BatchNorm3d args
+        # num_features: out_channels
+        eps=1e-05,
+        momentum=0.1,
+        # affine: True
+        # track_running_stats: True
+        # Args for this module
+        freeze_bn=False,
+        qconfig=None,
+    ):
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            bias,
+            padding_mode,
+            eps,
+            momentum,
+            freeze_bn,
+            qconfig,
+        )
+
+    def forward(self, input):
+        return F.relu(ConvBn3d._forward(self, input))
+
+    @classmethod
+    def from_float(cls, mod):
+        return super().from_float(mod)
+
+class ConvReLU3d(nnqat.Conv3d, nni._FusedModule):
+    r"""A ConvReLU3d module is a fused module of Conv3d and ReLU, attached with
+    FakeQuantize modules for weight for
+    quantization aware training.
+
+    We combined the interface of :class:`~torch.nn.Conv3d` and
+    :class:`~torch.nn.BatchNorm3d`.
+
+    Attributes:
+        weight_fake_quant: fake quant module for weight
+
+    """
+    _FLOAT_MODULE = nni.ConvReLU3d  # type: ignore[assignment]
+    _FLOAT_CONV_MODULE = nn.Conv3d
+    _FLOAT_BN_MODULE: None = None
+    _FLOAT_RELU_MODULE = nn.ReLU
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        dilation=1,
+        groups=1,
+        bias=True,
+        padding_mode="zeros",
+        qconfig=None,
+    ):
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+            padding_mode=padding_mode,
+            qconfig=qconfig,
+        )
+        assert qconfig, "qconfig must be provided for QAT module"
+        self.qconfig = qconfig
+        self.weight_fake_quant = self.qconfig.weight()
+
+    def forward(self, input):
+        return F.relu(
+            self._conv_forward(input, self.weight_fake_quant(self.weight), self.bias)
+        )
+
+    @classmethod
+    def from_float(cls, mod):
+        return super().from_float(mod)
+
+def update_bn_stats(mod):
+    if type(mod) in {ConvBnReLU1d, ConvBnReLU2d, ConvBnReLU3d, ConvBn1d, ConvBn2d, ConvBn3d}:
+        mod.update_bn_stats()
+
+def freeze_bn_stats(mod):
+    if type(mod) in {ConvBnReLU1d, ConvBnReLU2d, ConvBnReLU3d, ConvBn1d, ConvBn2d, ConvBn3d}:
+        mod.freeze_bn_stats()
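+
+# Editor's note: an illustrative sketch (not part of the upstream file) of the
+# intended QAT flow for the classes above. It assumes the usual helper
+# torch.ao.quantization.get_default_qat_qconfig; any QConfig whose weight()
+# returns a fake-quant module would work the same way.
+if __name__ == "__main__":
+    from torch.ao.quantization import get_default_qat_qconfig
+
+    # Float fused module (the _FLOAT_MODULE expected by from_float) ...
+    float_mod = nni.ConvBnReLU2d(nn.Conv2d(3, 8, 3, padding=1), nn.BatchNorm2d(8), nn.ReLU())
+    float_mod.qconfig = get_default_qat_qconfig("fbgemm")
+
+    # ... swapped for its QAT counterpart, which fake-quantizes the bn-scaled weight.
+    qat_mod = ConvBnReLU2d.from_float(float_mod)
+    out = qat_mod(torch.randn(2, 3, 16, 16))
+    assert out.shape == (2, 8, 16, 16)
+
+    # Near the end of training, batch-norm statistics are typically frozen:
+    qat_mod.apply(freeze_bn_stats)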
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/qat/modules/linear_fused.py b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/qat/modules/linear_fused.py
new file mode 100644
index 0000000000000000000000000000000000000000..abcbfdcb2a38ea9d5a2f46c33133cf3ae57ece84
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/qat/modules/linear_fused.py
@@ -0,0 +1,171 @@
+import torch
+import torch.nn as nn
+import torch.ao.nn.intrinsic as nni
+import torch.nn.functional as F
+from torch.nn import init
+from torch.nn.parameter import Parameter
+from torch.nn.utils.fusion import fuse_linear_bn_weights
+
+__all__ = [
+    "LinearBn1d",
+]
+
+class LinearBn1d(nn.modules.linear.Linear, nni._FusedModule):
+    r"""
+    A LinearBn1d module is a module fused from Linear and BatchNorm1d, attached
+    with FakeQuantize modules for weight, used in quantization aware training.
+
+    We combined the interface of :class:`torch.nn.Linear` and
+    :class:`torch.nn.BatchNorm1d`.
+
+    Similar to :class:`torch.nn.Linear`, with FakeQuantize modules initialized
+    to default.
+
+    Attributes:
+        freeze_bn:
+        weight_fake_quant: fake quant module for weight
+
+    """
+    def __init__(self,
+                 # Linear args
+                 in_features, out_features, bias=True,
+                 # BatchNorm1d args
+                 # num_features: out_features
+                 eps=1e-05, momentum=0.1,
+                 # affine: True
+                 # track_running_stats: True
+                 # Args for this module
+                 freeze_bn=False,
+                 qconfig=None):
+        nn.modules.linear.Linear.__init__(self, in_features, out_features, bias)
+        assert qconfig, 'qconfig must be provided for QAT module'
+        self.qconfig = qconfig
+        self.freeze_bn = freeze_bn if self.training else True
+        self.bn = nn.BatchNorm1d(out_features, eps, momentum, True, True)
+        self.weight_fake_quant = self.qconfig.weight()
+        if bias:
+            self.bias = Parameter(torch.empty(out_features))
+        else:
+            self.register_parameter('bias', None)
+        self.reset_bn_parameters()
+
+        # this needs to be called after reset_bn_parameters,
+        # as they modify the same state
+        if self.training:
+            if freeze_bn:
+                self.freeze_bn_stats()
+            else:
+                self.update_bn_stats()
+        else:
+            self.freeze_bn_stats()
+
+    def reset_running_stats(self):
+        self.bn.reset_running_stats()
+
+    def reset_bn_parameters(self):
+        self.bn.reset_running_stats()
+        init.uniform_(self.bn.weight)
+        init.zeros_(self.bn.bias)
+
+    def reset_parameters(self):
+        super().reset_parameters()
+
+    def update_bn_stats(self):
+        self.freeze_bn = False
+        self.bn.training = True
+        return self
+
+    def freeze_bn_stats(self):
+        self.freeze_bn = True
+        self.bn.training = False
+        return self
+
+    def forward(self, input):
+        assert self.bn.running_var is not None
+
+        # Scale the linear weights by BN's running statistics to reduce
+        # weight jitter, see https://arxiv.org/pdf/1806.08342.pdf, page 18
+        # for motivation.
+        #
+        # Instead of
+        #
+        #   x1 = F.linear(x0, fq(w), b)
+        #   x2 = self.bn(x1)
+        #
+        # We have
+        #
+        #   # scale the weight by previous batch's running statistics
+        #   scale_factor = bn.w / bn.running_std_from_prev_batch
+        #   # do the linear transformation without bias
+        #   x1_scaled = F.linear(x0, fq(w * scale_factor), 0)
+        #   # reverse the scaling and add original bias
+        #   x1_orig = x1_scaled / scale_factor + b
+        #   x2 = self.bn(x1_orig)
+
+        running_std = torch.sqrt(self.bn.running_var + self.bn.eps)
+        scale_factor = self.bn.weight / running_std
+        weight_shape = [1] * len(self.weight.shape)
+        weight_shape[0] = -1
+        bias_shape = [1] * len(self.weight.shape)
+        bias_shape[1] = -1
+        scaled_weight = self.weight_fake_quant(self.weight * scale_factor.reshape(weight_shape))
+        if self.bias is not None:
+            zero_bias = torch.zeros_like(self.bias)
+        else:
+            zero_bias = torch.zeros(self.out_features, device=scaled_weight.device)
+        linear_out = F.linear(input, scaled_weight, zero_bias)
+        linear_out_orig = linear_out / scale_factor.reshape(bias_shape)
+        if self.bias is not None:
+            linear_out_orig = linear_out_orig + self.bias.reshape(bias_shape)
+        bn_out = self.bn(linear_out_orig)
+        return bn_out
+
+    def train(self, mode=True):
+        """
+        Batchnorm's training behavior is controlled by the self.training flag. Prevent
+        changing it if BN is frozen. This makes sure that calling `model.train()`
+        on a model with a frozen BN will behave properly.
+        """
+        self.training = mode
+        if not self.freeze_bn:
+            for module in self.children():
+                module.train(mode)
+        return self
+
+    @classmethod
+    def from_float(cls, mod):
+        r"""Create a qat module from a float module or qparams_dict
+
+            Args: `mod` a float module, either produced by torch.ao.quantization
+            utilities or directly from the user
+        """
+        assert type(mod) == nni.LinearBn1d, 'qat.' + cls.__name__ + \
+            '.from_float only works for ' + nni.LinearBn1d.__name__
+        assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined'
+        assert mod.qconfig, 'Input float module must have a valid qconfig'
+        qconfig = mod.qconfig
+        linear, bn = mod[0], mod[1]
+        qat_linearbn = cls(linear.in_features, linear.out_features, linear.bias is not None,
+                           bn.eps, bn.momentum,
+                           False, qconfig)
+        qat_linearbn.weight = linear.weight
+        qat_linearbn.bias = linear.bias
+        qat_linearbn.bn.weight = bn.weight
+        qat_linearbn.bn.bias = bn.bias
+        qat_linearbn.bn.running_mean = bn.running_mean
+        qat_linearbn.bn.running_var = bn.running_var
+        qat_linearbn.bn.num_batches_tracked = bn.num_batches_tracked
+        return qat_linearbn
+
+    def to_float(self):
+        linear = torch.nn.Linear(self.in_features, self.out_features)
+        assert self.bn.running_var is not None and self.bn.running_mean is not None
+        linear.weight, linear.bias = fuse_linear_bn_weights(
+            self.weight,
+            self.bias,
+            self.bn.running_mean,
+            self.bn.running_var,
+            self.bn.eps,
+            self.bn.weight,
+            self.bn.bias)
+        return linear
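+
+# Editor's note: an illustrative sketch (not part of the upstream file) showing
+# the from_float / to_float round trip for LinearBn1d. It assumes the usual
+# helper torch.ao.quantization.get_default_qat_qconfig for the qconfig.
+if __name__ == "__main__":
+    from torch.ao.quantization import get_default_qat_qconfig
+
+    float_mod = nni.LinearBn1d(nn.Linear(16, 32), nn.BatchNorm1d(32))
+    float_mod.qconfig = get_default_qat_qconfig("fbgemm")
+
+    qat_mod = LinearBn1d.from_float(float_mod)     # QAT module defined above
+    out = qat_mod(torch.randn(8, 16))              # weight scaled by bn stats, then fake-quantized
+    assert out.shape == (8, 32)
+
+    fused_linear = qat_mod.to_float()              # bn folded back into a plain nn.Linear
+    assert isinstance(fused_linear, torch.nn.Linear)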
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/qat/modules/linear_relu.py b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/qat/modules/linear_relu.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d28073322abd3d0c0cc61636466d9e50fb80ce7
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/qat/modules/linear_relu.py
@@ -0,0 +1,48 @@
+import torch
+import torch.ao.nn.qat as nnqat
+import torch.ao.nn.intrinsic as nni
+import torch.nn.functional as F
+
+class LinearReLU(nnqat.Linear, nni._FusedModule):
+    r"""
+    A LinearReLU module fused from Linear and ReLU modules, attached with
+    FakeQuantize modules for weight, used in
+    quantization aware training.
+
+    We adopt the same interface as :class:`torch.nn.Linear`.
+
+    Similar to `torch.ao.nn.intrinsic.LinearReLU`, with FakeQuantize modules initialized to
+    default.
+
+    Attributes:
+        weight: fake quant module for weight
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> m = nn.qat.LinearReLU(20, 30)
+        >>> input = torch.randn(128, 20)
+        >>> output = m(input)
+        >>> print(output.size())
+        torch.Size([128, 30])
+    """
+    _FLOAT_MODULE = nni.LinearReLU  # type: ignore[assignment]
+
+    def __init__(self, in_features, out_features, bias=True,
+                 qconfig=None):
+        super().__init__(in_features, out_features, bias, qconfig)
+
+    def forward(self, input):
+        return F.relu(F.linear(input, self.weight_fake_quant(self.weight), self.bias))
+
+    @classmethod
+    def from_float(cls, mod):
+        return super().from_float(mod)
+
+    def to_float(self):
+        linear = torch.nn.Linear(self.in_features, self.out_features, self.bias is not None)
+        linear.weight = torch.nn.Parameter(self.weight.detach())
+        if self.bias is not None:
+            linear.bias = torch.nn.Parameter(self.bias.detach())
+        relu = torch.nn.ReLU()
+        return torch.ao.nn.intrinsic.LinearReLU(linear, relu)
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/__init__.py b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f18b3aa317a68bcef55db3b0b837e83224833b23
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/__init__.py
@@ -0,0 +1,14 @@
+from .modules import *  # noqa: F403
+
+__all__ = [
+    'BNReLU2d',
+    'BNReLU3d',
+    'ConvReLU1d',
+    'ConvReLU2d',
+    'ConvReLU3d',
+    'LinearReLU',
+    'LinearLeakyReLU',
+    'LinearTanh',
+    'ConvAdd2d',
+    'ConvAddReLU2d',
+]
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..db171c4562b5809419294c6c01d78b7e7fa9f9dd
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/__init__.py b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebc6df8afce25c62a5707136bc46cab16c49a83c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/__init__.py
@@ -0,0 +1 @@
+from .modules import *  # noqa: F403
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3549cf40d305aa16f62c6f7f28ef7465e0cc09a1
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/__init__.py b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1832b330e841f595524b3d83b015a4e4795deda
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/__init__.py
@@ -0,0 +1,6 @@
+import torch
+from .linear_relu import LinearReLU
+
+__all__ = [
+    'LinearReLU',
+]
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5f5844282a5fc0e6f0f5298227cd968f73ebeb51
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/__pycache__/linear_relu.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/__pycache__/linear_relu.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..47e726d5a9de42f5a509ce2437cccc74795e6db2
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/__pycache__/linear_relu.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/linear_relu.py b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/linear_relu.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e7c02eec9161d9d1d78f06cfde43e4818e07146
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/dynamic/modules/linear_relu.py
@@ -0,0 +1,55 @@
+import torch
+import torch.ao.nn.quantized.dynamic as nnqd
+import torch.ao.nn.intrinsic as nni
+
+__all__ = [
+    "LinearReLU"
+]
+
+class LinearReLU(nnqd.Linear):
+    r"""
+    A LinearReLU module fused from Linear and ReLU modules that can be used
+    for dynamic quantization.
+    Supports both FP16 and INT8 quantization.
+
+    We adopt the same interface as :class:`torch.ao.nn.quantized.dynamic.Linear`.
+
+    Attributes:
+        Same as torch.ao.nn.quantized.dynamic.Linear
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> m = torch.ao.nn.intrinsic.quantized.dynamic.LinearReLU(20, 30)
+        >>> input = torch.randn(128, 20)
+        >>> output = m(input)
+        >>> print(output.size())
+        torch.Size([128, 30])
+    """
+    _FLOAT_MODULE = nni.LinearReLU  # type: ignore[assignment]
+
+    def __init__(self, in_features, out_features, bias=True, dtype=torch.qint8):
+        super().__init__(in_features, out_features, bias, dtype)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self._packed_params.dtype == torch.qint8:
+            # TODO check if we should set reduce_range = True by default here
+            Y = torch.ops.quantized.linear_relu_dynamic(
+                x, self._packed_params._packed_params, reduce_range=True)
+        elif self._packed_params.dtype == torch.float16:
+            Y = torch.ops.quantized.linear_relu_dynamic_fp16(
+                x, self._packed_params._packed_params)
+        else:
+            raise RuntimeError('Unsupported dtype on dynamic quantized linear relu!')
+        return Y.to(x.dtype)
+
+    def _get_name(self):
+        return 'DynamicQuantizedLinearReLU'
+
+    @classmethod
+    def from_float(cls, mod):
+        return super().from_float(mod)
+
+    @classmethod
+    def from_reference(cls, ref_qlinear_relu):
+        return super().from_reference(ref_qlinear_relu[0])
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/modules/__init__.py b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa94c98c12d6de780f0c4f8688f258d5179ff959
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/modules/__init__.py
@@ -0,0 +1,17 @@
+from .linear_relu import LinearReLU, LinearLeakyReLU, LinearTanh
+from .conv_relu import ConvReLU1d, ConvReLU2d, ConvReLU3d
+from .bn_relu import BNReLU2d, BNReLU3d
+from .conv_add import ConvAdd2d, ConvAddReLU2d
+
+__all__ = [
+    'LinearReLU',
+    'ConvReLU1d',
+    'ConvReLU2d',
+    'ConvReLU3d',
+    'BNReLU2d',
+    'BNReLU3d',
+    'LinearLeakyReLU',
+    'LinearTanh',
+    'ConvAdd2d',
+    'ConvAddReLU2d',
+]
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a6045a6f089769c206c0f384acb761d92505add2
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/bn_relu.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/bn_relu.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c13c0ee110cb71a2f28355e1c873b9f9d91d9a8c
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/bn_relu.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/conv_add.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/conv_add.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b71e9e66c7519528b6bf78b20395d7920d62efc1
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/conv_add.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/conv_relu.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/conv_relu.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0cb2503f047bbcadf63fe4e63b5c4eb895fd0ade
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/conv_relu.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/linear_relu.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/linear_relu.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4bc43786fa64b0a5a32983ed2322580055f48ec7
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/modules/__pycache__/linear_relu.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/modules/bn_relu.py b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/modules/bn_relu.py
new file mode 100644
index 0000000000000000000000000000000000000000..42e6bfe9e52e28c30569bd758958b75965a1affb
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/modules/bn_relu.py
@@ -0,0 +1,82 @@
+
+import torch
+import torch.ao.nn.intrinsic
+import torch.ao.nn.intrinsic.qat
+import torch.ao.nn.quantized as nnq
+
+__all__ = [
+    "BNReLU2d",
+    "BNReLU3d"
+]
+
+class BNReLU2d(nnq.BatchNorm2d):
+    r"""
+    A BNReLU2d module is a fused module of BatchNorm2d and ReLU
+
+    We adopt the same interface as :class:`torch.ao.nn.quantized.BatchNorm2d`.
+
+    Attributes:
+        Same as torch.ao.nn.quantized.BatchNorm2d
+
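+    Example (an illustrative sketch; it assumes a per-tensor quantized
+    ``(N, C, H, W)`` input such as one produced by ``torch.quantize_per_tensor``)::
+
+        >>> # xdoctest: +SKIP
+        >>> m = torch.ao.nn.intrinsic.quantized.BNReLU2d(32)
+        >>> x = torch.quantize_per_tensor(torch.randn(1, 32, 8, 8), 0.1, 0, torch.quint8)
+        >>> y = m(x)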
+    """
+    _FLOAT_MODULE = torch.ao.nn.intrinsic.BNReLU2d
+
+    def __init__(self, num_features, eps=1e-5, momentum=0.1, device=None, dtype=None):
+        super().__init__(num_features, eps=eps, momentum=momentum, device=device, dtype=dtype)
+
+    def forward(self, input):
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 4:
+            raise ValueError("Input shape must be `(N, C, H, W)`!")
+        return torch.ops.quantized.batch_norm2d_relu(
+            input, self.weight, self.bias, self.running_mean,
+            self.running_var, self.eps, self.scale, self.zero_point)
+
+    def _get_name(self):
+        return 'QuantizedBNReLU2d'
+
+    @classmethod
+    def from_float(cls, mod):
+        # TODO: Add qat support for BNReLU2d
+        return super().from_float(mod)
+
+    @classmethod
+    def from_reference(cls, bn_relu, output_scale, output_zero_point):
+        return super().from_reference(bn_relu[0], output_scale, output_zero_point)
+
+class BNReLU3d(nnq.BatchNorm3d):
+    r"""
+    A BNReLU3d module is a fused module of BatchNorm3d and ReLU
+
+    We adopt the same interface as :class:`torch.ao.nn.quantized.BatchNorm3d`.
+
+    Attributes:
+        Same as torch.ao.nn.quantized.BatchNorm3d
+
+    """
+    _FLOAT_MODULE = torch.ao.nn.intrinsic.BNReLU3d
+
+    def __init__(self, num_features, eps=1e-5, momentum=0.1, device=None, dtype=None):
+        super().__init__(num_features, eps=eps, momentum=momentum, device=device, dtype=dtype)
+
+    def forward(self, input):
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 5:
+            raise ValueError("Input shape must be `(N, C, D, H, W)`!")
+        return torch.ops.quantized.batch_norm3d_relu(
+            input, self.weight, self.bias, self.running_mean,
+            self.running_var, self.eps, self.scale, self.zero_point)
+
+    def _get_name(self):
+        return 'QuantizedBNReLU3d'
+
+    @classmethod
+    def from_float(cls, mod):
+        # TODO: Add qat support for BNReLU3d
+        return super().from_float(mod)
+
+    @classmethod
+    def from_reference(cls, bn_relu, output_scale, output_zero_point):
+        return super().from_reference(bn_relu[0], output_scale, output_zero_point)
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/modules/conv_add.py b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/modules/conv_add.py
new file mode 100644
index 0000000000000000000000000000000000000000..632dd1832af380fd74d01dba4768fa2e5c154ca9
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/modules/conv_add.py
@@ -0,0 +1,93 @@
+import torch
+import torch.ao.nn.intrinsic
+import torch.ao.nn.intrinsic.qat
+import torch.nn.functional as F
+import torch.ao.nn.quantized as nnq
+
+_reverse_repeat_padding = nnq.modules.conv._reverse_repeat_padding
+
+class ConvAdd2d(nnq.Conv2d):
+    r"""
+    A ConvAdd2d module is a fused module of Conv2d and Add
+
+    We adopt the same interface as :class:`torch.ao.nn.quantized.Conv2d`.
+
+    Attributes:
+        Same as torch.ao.nn.quantized.Conv2d
+
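+    Example (an illustrative sketch; both tensors are assumed to be per-tensor
+    quantized and, with ``padding=1``, to share the conv output shape)::
+
+        >>> # xdoctest: +SKIP
+        >>> m = torch.ao.nn.intrinsic.quantized.ConvAdd2d(16, 16, 3, padding=1)
+        >>> x = torch.quantize_per_tensor(torch.randn(1, 16, 8, 8), 0.1, 0, torch.quint8)
+        >>> extra = torch.quantize_per_tensor(torch.randn(1, 16, 8, 8), 0.1, 0, torch.quint8)
+        >>> y = m(x, extra)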
+    """
+    _FLOAT_MODULE = torch.ao.nn.intrinsic.ConvAdd2d  # type: ignore[assignment]
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, dilation=1, groups=1, bias=True,
+                 padding_mode='zeros', device=None, dtype=None):
+        super().__init__(
+            in_channels, out_channels, kernel_size, stride=stride,
+            padding=padding, dilation=dilation, groups=groups, bias=bias,
+            padding_mode=padding_mode, device=device, dtype=dtype)
+
+    def forward(self, input, extra_input):
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 4:
+            raise ValueError("Input shape must be `(N, C, H, W)`!")
+        if self.padding_mode != 'zeros':
+            _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding)
+            input = F.pad(input, _reversed_padding_repeated_twice,
+                          mode=self.padding_mode)
+        return torch.ops.quantized.conv2d_add(
+            input, extra_input, self._packed_params, self.scale, self.zero_point)
+
+    def _get_name(self):
+        return 'QuantizedConvAdd2d'
+
+    @classmethod
+    def from_float(cls, mod):
+        return super().from_float(mod)
+
+    @classmethod
+    def from_reference(cls, ref_qconv, output_scale, output_zero_point):
+        return super().from_reference(ref_qconv[0], output_scale, output_zero_point)
+
+class ConvAddReLU2d(nnq.Conv2d):
+    r"""
+    A ConvAddReLU2d module is a fused module of Conv2d, Add and ReLU
+
+    We adopt the same interface as :class:`torch.ao.nn.quantized.Conv2d`.
+
+    Attributes:
+        Same as torch.ao.nn.quantized.Conv2d
+
+    """
+    _FLOAT_MODULE = torch.ao.nn.intrinsic.ConvAddReLU2d  # type: ignore[assignment]
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, dilation=1, groups=1, bias=True,
+                 padding_mode='zeros', device=None, dtype=None):
+        super().__init__(
+            in_channels, out_channels, kernel_size, stride=stride,
+            padding=padding, dilation=dilation, groups=groups, bias=bias,
+            padding_mode=padding_mode, device=device, dtype=dtype)
+
+    def forward(self, input, extra_input):
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 4:
+            raise ValueError("Input shape must be `(N, C, H, W)`!")
+        if self.padding_mode != 'zeros':
+            _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding)
+            input = F.pad(input, _reversed_padding_repeated_twice,
+                          mode=self.padding_mode)
+        return torch.ops.quantized.conv2d_add_relu(
+            input, extra_input, self._packed_params, self.scale, self.zero_point)
+
+    def _get_name(self):
+        return 'QuantizedConvAddReLU2d'
+
+    @classmethod
+    def from_float(cls, mod):
+        return super().from_float(mod)
+
+    @classmethod
+    def from_reference(cls, ref_qconv, output_scale, output_zero_point):
+        return super().from_reference(ref_qconv[0], output_scale, output_zero_point)
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/modules/conv_relu.py b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/modules/conv_relu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c920844733b9cad6d2d52e6bcac09fc581da335b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/modules/conv_relu.py
@@ -0,0 +1,175 @@
+
+import torch
+import torch.ao.nn.intrinsic
+import torch.ao.nn.intrinsic.qat
+import torch.nn.functional as F
+import torch.ao.nn.quantized as nnq
+
+from torch.nn.utils import fuse_conv_bn_weights
+
+__all__ = [
+    "ConvReLU1d",
+    "ConvReLU2d",
+    "ConvReLU3d",
+]
+
+_reverse_repeat_padding = nnq.modules.conv._reverse_repeat_padding
+
+# TODO: factor out the common parts to ConvNd
+class ConvReLU1d(nnq.Conv1d):
+    r"""
+    A ConvReLU1d module is a fused module of Conv1d and ReLU
+
+    We adopt the same interface as :class:`torch.ao.nn.quantized.Conv1d`.
+
+    Attributes:
+        Same as torch.ao.nn.quantized.Conv1d
+
+    """
+    _FLOAT_MODULE = torch.ao.nn.intrinsic.ConvReLU1d  # type: ignore[assignment]
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, dilation=1, groups=1, bias=True,
+                 padding_mode='zeros', device=None, dtype=None):
+        super().__init__(
+            in_channels, out_channels, kernel_size, stride=stride,
+            padding=padding, dilation=dilation, groups=groups, bias=bias,
+            padding_mode=padding_mode, device=device, dtype=dtype)
+
+    def forward(self, input):
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 3:
+            raise ValueError("Input shape must be `(N, C, L)`!")
+        if self.padding_mode != 'zeros':
+            # Padding in Conv1d is stored as (p, p), need to get (p,)
+            _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding[:1])
+            input = F.pad(input, _reversed_padding_repeated_twice,
+                          mode=self.padding_mode)
+        return torch.ops.quantized.conv1d_relu(
+            input, self._packed_params, self.scale, self.zero_point)
+
+    def _get_name(self):
+        return 'QuantizedConvReLU1d'
+
+    @classmethod
+    def from_float(cls, mod):
+        if type(mod) == torch.ao.nn.intrinsic.qat.ConvBnReLU1d:
+            assert mod.bn.running_var is not None and mod.bn.running_mean is not None
+            mod.weight, mod.bias = fuse_conv_bn_weights(
+                mod.weight, mod.bias, mod.bn.running_mean, mod.bn.running_var,
+                mod.bn.eps, mod.bn.weight, mod.bn.bias)
+        return super().from_float(mod)
+
+    @classmethod
+    def from_reference(cls, ref_qconv, output_scale, output_zero_point):
+        assert type(ref_qconv) != torch.ao.nn.intrinsic.ConvBnReLU1d, \
+            "BatchNorm1d should be fused into Conv1d before converting to reference module"
+        return super().from_reference(ref_qconv[0], output_scale, output_zero_point)
+
+class ConvReLU2d(nnq.Conv2d):
+    r"""
+    A ConvReLU2d module is a fused module of Conv2d and ReLU
+
+    We adopt the same interface as :class:`torch.ao.nn.quantized.Conv2d`.
+
+    Attributes:
+        Same as torch.ao.nn.quantized.Conv2d
+
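+    Example (an illustrative sketch; in practice this module is usually created
+    by ``torch.ao.quantization.convert``, direct construction with a quantized
+    input is shown only for clarity)::
+
+        >>> # xdoctest: +SKIP
+        >>> m = torch.ao.nn.intrinsic.quantized.ConvReLU2d(16, 33, 3)
+        >>> x = torch.quantize_per_tensor(torch.randn(1, 16, 32, 32), 0.1, 0, torch.quint8)
+        >>> y = m(x)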
+    """
+    _FLOAT_MODULE = torch.ao.nn.intrinsic.ConvReLU2d  # type: ignore[assignment]
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, dilation=1, groups=1, bias=True,
+                 padding_mode='zeros', device=None, dtype=None):
+        super().__init__(
+            in_channels, out_channels, kernel_size, stride=stride,
+            padding=padding, dilation=dilation, groups=groups, bias=bias,
+            padding_mode=padding_mode, device=device, dtype=dtype)
+
+    def forward(self, input):
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 4:
+            raise ValueError("Input shape must be `(N, C, H, W)`!")
+        if self.padding_mode != 'zeros':
+            _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding)
+            input = F.pad(input, _reversed_padding_repeated_twice,
+                          mode=self.padding_mode)
+        return torch.ops.quantized.conv2d_relu(
+            input, self._packed_params, self.scale, self.zero_point)
+
+    def _get_name(self):
+        return 'QuantizedConvReLU2d'
+
+    @classmethod
+    def from_float(cls, mod):
+        if type(mod) == torch.ao.nn.intrinsic.qat.ConvBnReLU2d:
+            assert mod.bn.running_var is not None and mod.bn.running_mean is not None
+            mod.weight, mod.bias = fuse_conv_bn_weights(
+                mod.weight, mod.bias, mod.bn.running_mean, mod.bn.running_var,
+                mod.bn.eps, mod.bn.weight, mod.bn.bias)
+        return super().from_float(mod)
+
+    @classmethod
+    def from_reference(cls, ref_qconv, output_scale, output_zero_point):
+        assert type(ref_qconv) != torch.ao.nn.intrinsic.ConvBnReLU2d, \
+            "BatchNorm2d should be fused into Conv2d before converting to reference module"
+        return super().from_reference(ref_qconv[0], output_scale, output_zero_point)
+
+
+class ConvReLU3d(nnq.Conv3d):
+    r"""
+    A ConvReLU3d module is a fused module of Conv3d and ReLU
+
+    We adopt the same interface as :class:`torch.ao.nn.quantized.Conv3d`.
+
+    Attributes:
+        Same as torch.ao.nn.quantized.Conv3d
+
+    """
+    _FLOAT_MODULE = torch.ao.nn.intrinsic.ConvReLU3d  # type: ignore[assignment]
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, dilation=1, groups=1, bias=True,
+                 padding_mode='zeros', device=None, dtype=None):
+        assert padding_mode != 'reflect', "Conv3d does not support reflection padding"
+        super().__init__(
+            in_channels, out_channels, kernel_size, stride=stride,
+            padding=padding, dilation=dilation, groups=groups, bias=bias,
+            padding_mode=padding_mode, device=device, dtype=dtype)
+
+    def forward(self, input):
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 5:
+            raise ValueError("Input shape must be `(N, C, D, H, W)`!")
+        if self.padding_mode != 'zeros':
+            _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding)
+            input = F.pad(input, _reversed_padding_repeated_twice,
+                          mode=self.padding_mode)
+        return torch.ops.quantized.conv3d_relu(
+            input, self._packed_params, self.scale, self.zero_point)
+
+    def _get_name(self):
+        return 'QuantizedConvReLU3d'
+
+    @classmethod
+    def from_float(cls, mod):
+        if type(mod) == torch.ao.nn.intrinsic.qat.ConvBnReLU3d:
+            assert mod.bn.running_var is not None and mod.bn.running_mean is not None
+            mod.weight, mod.bias = fuse_conv_bn_weights(
+                mod.weight,
+                mod.bias,
+                mod.bn.running_mean,
+                mod.bn.running_var,
+                mod.bn.eps,
+                mod.bn.weight,
+                mod.bn.bias,
+            )
+        return super().from_float(mod)
+
+    @classmethod
+    def from_reference(cls, ref_qconv, output_scale, output_zero_point):
+        assert type(ref_qconv) != torch.ao.nn.intrinsic.ConvBnReLU3d, \
+            "BatchNorm3d should be fused into Conv3d before converting to reference module"
+        return super().from_reference(ref_qconv[0], output_scale, output_zero_point)
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/modules/linear_relu.py b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/modules/linear_relu.py
new file mode 100644
index 0000000000000000000000000000000000000000..08fb6b51bdca30bd8473d21b96dc884314e71899
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/intrinsic/quantized/modules/linear_relu.py
@@ -0,0 +1,177 @@
+import torch
+import torch.ao.nn.quantized as nnq
+import torch.ao.nn.intrinsic as nni
+from torch.ao.nn.quantized.modules.utils import _quantize_weight
+
+__all__ = [
+    "LinearReLU",
+    "LinearLeakyReLU",
+    "LinearTanh",
+]
+
+class LinearReLU(nnq.Linear):
+    r"""
+    A LinearReLU module fused from Linear and ReLU modules
+
+    We adopt the same interface as :class:`torch.ao.nn.quantized.Linear`.
+
+    Attributes:
+        Same as torch.ao.nn.quantized.Linear
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> m = torch.ao.nn.intrinsic.quantized.LinearReLU(20, 30)
+        >>> input = torch.randn(128, 20)
+        >>> output = m(input)
+        >>> print(output.size())
+        torch.Size([128, 30])
+    """
+    _FLOAT_MODULE = nni.LinearReLU  # type: ignore[assignment]
+
+    def __init__(self, in_features, out_features, bias=True, dtype=torch.qint8):
+        super().__init__(in_features, out_features, bias, dtype)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.ops.quantized.linear_relu(
+            x, self._packed_params._packed_params, self.scale, self.zero_point)
+
+    def _get_name(self):
+        return 'QuantizedLinearReLU'
+
+    @classmethod
+    def from_float(cls, mod):
+        return super().from_float(mod)
+
+    @classmethod
+    def from_reference(cls, ref_linear_relu, output_scale, output_zero_point):
+        return super().from_reference(ref_linear_relu[0], output_scale, output_zero_point)
+
+class LinearLeakyReLU(nnq.Linear):
+    r"""
+    For onednn backend only.
+
+    A LinearLeakyReLU module fused from Linear and LeakyReLU modules.
+
+    We adopt the same interface as :class:`torch.ao.nn.quantized.Linear`.
+
+    Attributes:
+        Same as torch.ao.nn.quantized.Linear
+        + negative_slope
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> m = torch.ao.nn.intrinsic.quantized.LinearLeakyReLU(20, 30, 0.01)
+        >>> input = torch.randn(128, 20)
+        >>> output = m(input)
+        >>> print(output.size())
+        torch.Size([128, 30])
+    """
+    _FLOAT_MODULE = nni.LinearLeakyReLU  # type: ignore[assignment]
+
+    def __init__(self, in_features, out_features, negative_slope, bias=True, dtype=torch.qint8):
+        super().__init__(in_features, out_features, bias, dtype)
+        self.negative_slope = negative_slope
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.ops.quantized.linear_leaky_relu(
+            x, self._packed_params._packed_params, self.scale, self.zero_point, self.negative_slope)
+
+    def _get_name(self):
+        return 'QuantizedLinearLeakyReLU'
+
+    @classmethod
+    def from_float(cls, mod):
+        assert type(mod) == nni.LinearLeakyReLU, 'Input float module should be LinearLeakyReLU'
+        assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined'
+        activation_post_process = mod.activation_post_process
+        leaky_relu = mod[1]
+        mod = mod[0]
+        weight_post_process = mod.qconfig.weight()
+        weight_post_process(mod.weight)
+        dtype = weight_post_process.dtype
+        act_scale, act_zp = activation_post_process.calculate_qparams()  # type: ignore[union-attr,operator]
+        assert dtype == torch.qint8, 'Weight observer must have dtype torch.qint8'
+        qweight = _quantize_weight(mod.weight.float(), weight_post_process)
+        qlinear_leaky_relu = cls(
+            mod.in_features,
+            mod.out_features,
+            leaky_relu.negative_slope,
+            dtype=dtype)
+        qlinear_leaky_relu.set_weight_bias(qweight, mod.bias)
+        qlinear_leaky_relu.scale = float(act_scale)
+        qlinear_leaky_relu.zero_point = int(act_zp)
+        return qlinear_leaky_relu
+
+    @classmethod
+    def from_reference(cls, ref_mod, output_scale, output_zero_point):
+        linear = ref_mod[0]
+        leaky_relu = ref_mod[1]
+        qlinear_leaky_relu = cls(
+            linear.in_features,
+            linear.out_features,
+            leaky_relu.negative_slope)
+        qweight = linear.get_quantized_weight()
+        qlinear_leaky_relu.set_weight_bias(qweight, linear.bias)
+        qlinear_leaky_relu.scale = float(output_scale)
+        qlinear_leaky_relu.zero_point = int(output_zero_point)
+        return qlinear_leaky_relu
+
+class LinearTanh(nnq.Linear):
+    r"""
+    A LinearTanh module fused from Linear and Tanh modules
+
+    We adopt the same interface as :class:`torch.ao.nn.quantized.Linear`.
+
+    Attributes:
+        Same as torch.ao.nn.quantized.Linear
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> m = torch.ao.nn.intrinsic.quantized.LinearTanh(20, 30)
+        >>> input = torch.randn(128, 20)
+        >>> output = m(input)
+        >>> print(output.size())
+        torch.Size([128, 30])
+    """
+    _FLOAT_MODULE = nni.LinearTanh  # type: ignore[assignment]
+
+    def __init__(self, in_features, out_features, bias=True, dtype=torch.qint8):
+        super().__init__(in_features, out_features, bias, dtype)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.ops.quantized.linear_tanh(
+            x, self._packed_params._packed_params, self.scale, self.zero_point)
+
+    def _get_name(self):
+        return 'QuantizedLinearTanh'
+
+    @classmethod
+    def from_float(cls, mod):
+        assert type(mod) == nni.LinearTanh, 'Input float module should be LinearTanh'
+        assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined'
+        activation_post_process = mod.activation_post_process
+        mod = mod[0]
+        weight_post_process = mod.qconfig.weight()
+        weight_post_process(mod.weight)
+        dtype = weight_post_process.dtype
+        act_scale, act_zp = activation_post_process.calculate_qparams()  # type: ignore[union-attr,operator]
+        assert dtype == torch.qint8, 'Weight observer must have dtype torch.qint8'
+        qweight = _quantize_weight(mod.weight.float(), weight_post_process)
+        qlinear_tanh = cls(
+            mod.in_features,
+            mod.out_features,
+            dtype=dtype)
+        qlinear_tanh.set_weight_bias(qweight, mod.bias)
+        qlinear_tanh.scale = float(act_scale)
+        qlinear_tanh.zero_point = int(act_zp)
+        return qlinear_tanh
+
+    @classmethod
+    def from_reference(cls, ref_mod, output_scale, output_zero_point):
+        linear = ref_mod[0]
+        qlinear_tanh = cls(
+            linear.in_features,
+            linear.out_features)
+        qweight = linear.get_quantized_weight()
+        qlinear_tanh.set_weight_bias(qweight, linear.bias)
+        qlinear_tanh.scale = float(output_scale)
+        qlinear_tanh.zero_point = int(output_zero_point)
+        return qlinear_tanh
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/qat/__init__.py b/MLPY/Lib/site-packages/torch/ao/nn/qat/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebc6df8afce25c62a5707136bc46cab16c49a83c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/qat/__init__.py
@@ -0,0 +1 @@
+from .modules import *  # noqa: F403
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/qat/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/qat/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6be35e6be423632a6fab8ae3224c4a7f17e1498e
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/qat/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/qat/dynamic/__init__.py b/MLPY/Lib/site-packages/torch/ao/nn/qat/dynamic/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebc6df8afce25c62a5707136bc46cab16c49a83c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/qat/dynamic/__init__.py
@@ -0,0 +1 @@
+from .modules import *  # noqa: F403
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/qat/dynamic/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/qat/dynamic/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e31c3e0fe6bab106332ccfd4324b8b8918ba260f
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/qat/dynamic/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/qat/dynamic/modules/__init__.py b/MLPY/Lib/site-packages/torch/ao/nn/qat/dynamic/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f656409ea408920a9eb2f4d28ccf0aeb0f9b50ba
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/qat/dynamic/modules/__init__.py
@@ -0,0 +1,3 @@
+from .linear import Linear
+
+__all__ = ["Linear"]
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/qat/dynamic/modules/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/qat/dynamic/modules/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c3291390ebb9e9a4bab69aba78dcca9ddd063065
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/qat/dynamic/modules/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/qat/dynamic/modules/__pycache__/linear.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/qat/dynamic/modules/__pycache__/linear.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c33276cc34add1850b60e773a70747ff6da3dea8
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/qat/dynamic/modules/__pycache__/linear.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/qat/dynamic/modules/linear.py b/MLPY/Lib/site-packages/torch/ao/nn/qat/dynamic/modules/linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..fccb87e291c632481985786c26b06752ad03c8ce
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/qat/dynamic/modules/linear.py
@@ -0,0 +1,25 @@
+import torch
+
+__all__ = ["Linear"]
+
+class Linear(torch.ao.nn.qat.Linear):
+    r"""
+    A linear module attached with FakeQuantize modules for weight,
+    used for dynamic quantization aware training.
+
+    We adopt the same interface as `torch.nn.Linear`, please see
+    https://pytorch.org/docs/stable/nn.html#torch.nn.Linear
+    for documentation.
+
+    Similar to `torch.nn.Linear`, with FakeQuantize modules initialized to
+    default.
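+
+    Example (an illustrative sketch; it assumes ``default_dynamic_qat_qconfig``
+    from ``torch.ao.quantization.qconfig``, whose activation fake-quant is
+    memoryless and therefore passes the check below)::
+
+        >>> # xdoctest: +SKIP
+        >>> from torch.ao.quantization.qconfig import default_dynamic_qat_qconfig
+        >>> m = torch.ao.nn.qat.dynamic.Linear(20, 30, qconfig=default_dynamic_qat_qconfig)
+        >>> output = m(torch.randn(128, 20))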
+    """
+
+    def __init__(self, in_features, out_features, bias=True,
+                 qconfig=None, device=None, dtype=None) -> None:
+        super().__init__(in_features, out_features, bias, qconfig, device, dtype)
+        if not torch.ao.quantization.qconfig._activation_is_memoryless(qconfig):
+            raise ValueError(
+                "Dynamic QAT requires a memoryless observer." +
+                "This means a MovingAverage observer with averaging constant equal to 1"
+            )
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/qat/modules/__init__.py b/MLPY/Lib/site-packages/torch/ao/nn/qat/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ee80f17bb2660e8ce319854bb9273b8e7f4f909
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/qat/modules/__init__.py
@@ -0,0 +1,14 @@
+from .linear import Linear
+from .conv import Conv1d
+from .conv import Conv2d
+from .conv import Conv3d
+from .embedding_ops import EmbeddingBag, Embedding
+
+__all__ = [
+    "Linear",
+    "Conv1d",
+    "Conv2d",
+    "Conv3d",
+    "Embedding",
+    "EmbeddingBag",
+]
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/qat/modules/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/qat/modules/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8bc95d50b6a08acc4900ace8fef0f690caedfb51
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/qat/modules/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/qat/modules/__pycache__/conv.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/qat/modules/__pycache__/conv.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7a9743e17ae5d15868a2de189badd275df0034e5
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/qat/modules/__pycache__/conv.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/qat/modules/__pycache__/embedding_ops.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/qat/modules/__pycache__/embedding_ops.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..af75f4498f422ea8c63e1eef35f172476af803c6
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/qat/modules/__pycache__/embedding_ops.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/qat/modules/__pycache__/linear.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/qat/modules/__pycache__/linear.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1318814a07b6690ed64460369004ccce246d6f0a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/qat/modules/__pycache__/linear.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/qat/modules/conv.py b/MLPY/Lib/site-packages/torch/ao/nn/qat/modules/conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..c201f917a1db4fd007cde0e2c6c039f997299130
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/qat/modules/conv.py
@@ -0,0 +1,270 @@
+import torch
+import torch.nn as nn
+from torch.nn.modules.utils import _single, _pair, _triple
+from torch.ao.nn.intrinsic import _FusedModule
+from typing import Tuple, TypeVar, Union
+from torch.nn.common_types import _size_1_t, _size_2_t, _size_3_t
+
+__all__ = [
+    "Conv1d",
+    "Conv2d",
+    "Conv3d"
+]
+
+MOD = TypeVar('MOD', bound=nn.modules.conv._ConvNd)
+
+class _ConvNd(nn.modules.conv._ConvNd):
+
+    _FLOAT_MODULE = MOD
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: Tuple[int, ...],
+                 stride: Tuple[int, ...],
+                 padding: Tuple[int, ...],
+                 dilation: Tuple[int, ...],
+                 transposed: bool,
+                 output_padding: Tuple[int, ...],
+                 groups: int,
+                 bias: bool,
+                 padding_mode: str,
+                 qconfig=None,
+                 device=None,
+                 dtype=None) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        nn.modules.conv._ConvNd.__init__(self, in_channels, out_channels, kernel_size,
+                                         stride, padding, dilation, transposed,
+                                         output_padding, groups, bias, padding_mode, **factory_kwargs)
+        assert qconfig, 'qconfig must be provided for QAT module'
+        self.qconfig = qconfig
+        self.weight_fake_quant = qconfig.weight(factory_kwargs=factory_kwargs)
+
+    def forward(self, input):
+        return self._conv_forward(input, self.weight_fake_quant(self.weight), self.bias)
+
+    @staticmethod
+    def from_float(cls, mod):
+        r"""Create a qat module from a float module
+
+            Args:
+               `mod`: a float module, either produced by torch.ao.quantization utilities
+               or directly from the user
+        """
+        assert type(mod) == cls._FLOAT_MODULE, (
+            "qat."
+            + cls.__name__
+            + ".from_float only works for "
+            + cls._FLOAT_MODULE.__name__  # type: ignore[attr-defined]
+        )
+        assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined'
+        assert mod.qconfig, 'Input float module must have a valid qconfig'
+        if issubclass(type(mod), _FusedModule):
+            mod = mod[0]  # type: ignore[index]
+        qconfig = mod.qconfig
+        qat_conv = cls(mod.in_channels, mod.out_channels, mod.kernel_size,
+                       stride=mod.stride, padding=mod.padding, dilation=mod.dilation,
+                       groups=mod.groups, bias=mod.bias is not None,
+                       padding_mode=mod.padding_mode, qconfig=qconfig)
+        qat_conv.weight = mod.weight
+        qat_conv.bias = mod.bias
+        return qat_conv
+
+    def to_float(self):
+        """ This works for both single qat conv, and the qat conv - relu modules
+        to convert the qat module to a floating point module
+        """
+        cls = type(self)
+        conv = cls._FLOAT_CONV_MODULE(  # type: ignore[attr-defined, operator]
+            self.in_channels,
+            self.out_channels,
+            self.kernel_size,  # type: ignore[arg-type]
+            self.stride,  # type: ignore[arg-type]
+            self.padding,  # type: ignore[arg-type]
+            self.dilation,  # type: ignore[arg-type]
+            self.groups,
+            self.bias is not None,
+            self.padding_mode)
+        conv.weight = torch.nn.Parameter(self.weight.detach())
+        if self.bias is not None:
+            conv.bias = torch.nn.Parameter(self.bias.detach())
+        # conv relu
+        if issubclass(cls, _FusedModule):
+            modules = [conv]
+            assert hasattr(cls, "_FLOAT_RELU_MODULE")
+            relu = cls._FLOAT_RELU_MODULE()  # type: ignore[attr-defined]
+            modules.append(relu)
+            fused = cls._FLOAT_MODULE(*modules)  # type: ignore[arg-type, attr-defined, operator]
+            fused.train(self.training)
+            return fused
+        else:
+            return conv
+
+class Conv1d(_ConvNd, nn.Conv1d):
+    r"""
+    A Conv1d module attached with FakeQuantize modules for weight,
+    used for quantization aware training.
+
+    We adopt the same interface as :class:`~torch.nn.Conv1d`
+
+    Similar to :class:`~torch.nn.Conv1d`, with FakeQuantize modules initialized to
+    default.
+
+    Attributes:
+        weight_fake_quant: fake quant module for weight
+    """
+    _FLOAT_MODULE = nn.Conv1d
+    _FLOAT_CONV_MODULE = nn.Conv1d
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: _size_1_t,
+                 stride: _size_1_t = 1,
+                 padding: Union[str, _size_1_t] = 0,
+                 dilation: _size_1_t = 1,
+                 groups: int = 1,
+                 bias: bool = True,
+                 padding_mode: str = 'zeros',
+                 qconfig=None,
+                 device=None,
+                 dtype=None) -> None:
+        kernel_size_ = _single(kernel_size)
+        stride_ = _single(stride)
+        padding_ = padding if isinstance(padding, str) else _single(padding)
+        dilation_ = _single(dilation)
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size_,
+            stride=stride_,
+            padding=padding_,
+            dilation=dilation_,
+            transposed=False,
+            output_padding=_single(0),
+            groups=groups,
+            bias=bias,
+            padding_mode=padding_mode,
+            qconfig=qconfig,
+            device=device,
+            dtype=dtype)
+
+    @classmethod
+    def from_float(cls, mod):
+        return super().from_float(cls, mod)
+
+class Conv2d(_ConvNd, nn.Conv2d):
+    r"""
+    A Conv2d module attached with FakeQuantize modules for weight,
+    used for quantization aware training.
+
+    We adopt the same interface as `torch.nn.Conv2d`, please see
+    https://pytorch.org/docs/stable/nn.html?highlight=conv2d#torch.nn.Conv2d
+    for documentation.
+
+    Similar to `torch.nn.Conv2d`, with FakeQuantize modules initialized to
+    default.
+
+    Attributes:
+        weight_fake_quant: fake quant module for weight
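+
+    Example (an illustrative sketch; it uses the default fbgemm QAT qconfig,
+    any qconfig that provides a weight fake-quant would work)::
+
+        >>> # xdoctest: +SKIP
+        >>> qconfig = torch.ao.quantization.get_default_qat_qconfig('fbgemm')
+        >>> m = torch.ao.nn.qat.Conv2d(16, 33, 3, stride=2, qconfig=qconfig)
+        >>> output = m(torch.randn(20, 16, 50, 100))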
+    """
+    _FLOAT_MODULE = nn.Conv2d
+    _FLOAT_CONV_MODULE = nn.Conv2d
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: _size_2_t,
+                 stride: _size_2_t = 1,
+                 padding: Union[str, _size_2_t] = 0,
+                 dilation: _size_2_t = 1,
+                 groups: int = 1,
+                 bias: bool = True,
+                 padding_mode: str = 'zeros',
+                 qconfig=None,
+                 device=None,
+                 dtype=None) -> None:
+        kernel_size_ = _pair(kernel_size)
+        stride_ = _pair(stride)
+        padding_ = padding if isinstance(padding, str) else _pair(padding)
+        dilation_ = _pair(dilation)
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size_,
+            stride=stride_,
+            padding=padding_,
+            dilation=dilation_,
+            transposed=False,
+            output_padding=_pair(0),
+            groups=groups,
+            bias=bias,
+            padding_mode=padding_mode,
+            qconfig=qconfig,
+            device=device,
+            dtype=dtype)
+
+    def forward(self, input):
+        return self._conv_forward(input, self.weight_fake_quant(self.weight), self.bias)
+
+    @classmethod
+    def from_float(cls, mod):
+        return super().from_float(cls, mod)
+
+class Conv3d(_ConvNd, nn.Conv3d):
+    r"""
+    A Conv3d module attached with FakeQuantize modules for weight,
+    used for quantization aware training.
+
+    We adopt the same interface as `torch.nn.Conv3d`, please see
+    https://pytorch.org/docs/stable/nn.html?highlight=conv3d#torch.nn.Conv3d
+    for documentation.
+
+    Similar to `torch.nn.Conv3d`, with FakeQuantize modules initialized to
+    default.
+
+    Attributes:
+        weight_fake_quant: fake quant module for weight
+    """
+    _FLOAT_MODULE = nn.Conv3d
+    _FLOAT_CONV_MODULE = nn.Conv3d
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: _size_3_t,
+                 stride: _size_3_t = 1,
+                 padding: Union[str, _size_3_t] = 0,
+                 dilation: _size_3_t = 1,
+                 groups: int = 1,
+                 bias: bool = True,
+                 padding_mode: str = 'zeros',
+                 qconfig=None,
+                 device=None,
+                 dtype=None) -> None:
+        kernel_size_ = _triple(kernel_size)
+        stride_ = _triple(stride)
+        padding_ = padding if isinstance(padding, str) else _triple(padding)
+        dilation_ = _triple(dilation)
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size_,
+            stride=stride_,
+            padding=padding_,
+            dilation=dilation_,
+            transposed=False,
+            output_padding=_triple(0),
+            groups=groups,
+            bias=bias,
+            padding_mode=padding_mode,
+            qconfig=qconfig,
+            device=device,
+            dtype=dtype)
+
+    def forward(self, input):
+        return self._conv_forward(input, self.weight_fake_quant(self.weight), self.bias)
+
+    @classmethod
+    def from_float(cls, mod):
+        return super().from_float(cls, mod)
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/qat/modules/embedding_ops.py b/MLPY/Lib/site-packages/torch/ao/nn/qat/modules/embedding_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..35172bcd4df39fb8d946ca624aa774e8f3ab6475
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/qat/modules/embedding_ops.py
@@ -0,0 +1,143 @@
+import torch
+from torch import Tensor
+import torch.nn as nn
+import torch.nn.functional as F
+
+__all__ = ['Embedding', 'EmbeddingBag']
+
+class Embedding(nn.Embedding):
+    r"""
+    An embedding module attached with FakeQuantize modules for weight,
+    used for quantization aware training.
+
+    We adopt the same interface as `torch.nn.Embedding`, please see
+    https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html#torch.nn.Embedding
+    for documentation.
+
+    Similar to `torch.nn.Embedding`, with FakeQuantize modules initialized to
+    default.
+
+    Attributes:
+        weight: fake quant module for weight
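+
+    Example (an illustrative sketch; it assumes ``default_embedding_qat_qconfig``
+    from ``torch.ao.quantization.qconfig``, which supplies the required
+    ``per_channel_affine_float_qparams`` weight observer)::
+
+        >>> # xdoctest: +SKIP
+        >>> from torch.ao.quantization.qconfig import default_embedding_qat_qconfig
+        >>> m = torch.ao.nn.qat.Embedding(10, 12, qconfig=default_embedding_qat_qconfig)
+        >>> output = m(torch.randint(0, 10, (5,)))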
+    """
+    _FLOAT_MODULE = nn.Embedding
+
+    def __init__(self, num_embeddings, embedding_dim, padding_idx=None,
+                 max_norm=None, norm_type=2.0, scale_grad_by_freq=False,
+                 sparse=False, _weight=None, device=None, dtype=None, qconfig=None) -> None:
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        super().__init__(num_embeddings, embedding_dim, padding_idx, max_norm,
+                         norm_type, scale_grad_by_freq, sparse, _weight,
+                         **factory_kwargs)
+        assert qconfig, 'qconfig must be provided for QAT module'
+        assert qconfig.weight().qscheme == torch.per_channel_affine_float_qparams, \
+            'Embedding weights requires a qscheme of torch.per_channel_affine_float_qparams Got ' + \
+            str(qconfig.weight().qscheme)
+        self.qconfig = qconfig
+        self.weight_fake_quant = qconfig.weight(factory_kwargs=factory_kwargs)
+
+    def forward(self, input) -> Tensor:
+        return F.embedding(input, self.weight_fake_quant(self.weight), self.padding_idx,
+                           self.max_norm, self.norm_type, self.scale_grad_by_freq,
+                           self.sparse)
+
+    @classmethod
+    def from_float(cls, mod):
+        r"""Create a qat module from a float module
+
+            Args: `mod` a float module, either produced by torch.ao.quantization utilities
+            or directly from the user
+        """
+        assert type(mod) == cls._FLOAT_MODULE, ' qat.' + cls.__name__ + '.from_float only works for ' + \
+            cls._FLOAT_MODULE.__name__
+        assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined'
+        assert mod.qconfig, 'Input float module must have a valid qconfig'
+        weight_qscheme = mod.qconfig.weight().qscheme  # type: ignore[union-attr, operator]
+        assert weight_qscheme == torch.per_channel_affine_float_qparams, \
+            'Embedding weights requires a qscheme of torch.per_channel_affine_float_qparams Got ' + \
+            str(weight_qscheme)
+
+        qconfig = mod.qconfig
+        qat_embedding_bag = cls(mod.num_embeddings, mod.embedding_dim, mod.padding_idx,
+                                mod.max_norm, mod.norm_type, mod.scale_grad_by_freq,
+                                mod.sparse, mod.weight, qconfig=qconfig)
+
+        return qat_embedding_bag
+
+    def to_float(self):
+        embedding_bag = torch.nn.Embedding(self.num_embeddings, self.embedding_dim, self.padding_idx,
+                                           self.max_norm, self.norm_type, self.scale_grad_by_freq,
+                                           self.sparse, None)
+        embedding_bag.weight = torch.nn.Parameter(self.weight.detach())
+        embedding_bag.train(self.training)
+        return embedding_bag
+
+class EmbeddingBag(nn.EmbeddingBag):
+    r"""
+    An embedding bag module attached with FakeQuantize modules for weight,
+    used for quantization aware training.
+
+    We adopt the same interface as `torch.nn.EmbeddingBag`, please see
+    https://pytorch.org/docs/stable/generated/torch.nn.EmbeddingBag.html#torch.nn.EmbeddingBag
+    for documentation.
+
+    Similar to `torch.nn.EmbeddingBag`, with FakeQuantize modules initialized to
+    default.
+
+    Attributes:
+        weight: fake quant module for weight
+    """
+    _FLOAT_MODULE = nn.EmbeddingBag
+
+    def __init__(self, num_embeddings, embedding_dim, max_norm=None,
+                 norm_type=2.0, scale_grad_by_freq=False, mode='mean',
+                 sparse=False, _weight=None, include_last_offset=False,
+                 padding_idx=None, qconfig=None, device=None, dtype=None) -> None:
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        super().__init__(num_embeddings, embedding_dim, max_norm, norm_type,
+                         scale_grad_by_freq, mode, sparse, _weight,
+                         include_last_offset, padding_idx, **factory_kwargs)
+        assert qconfig, 'qconfig must be provided for QAT module'
+        assert qconfig.weight().qscheme == torch.per_channel_affine_float_qparams, \
+            'Embedding Bag weights requires a qscheme of torch.per_channel_affine_float_qparams Got ' + \
+            str(qconfig.weight().qscheme)
+        self.qconfig = qconfig
+        self.weight_fake_quant = qconfig.weight(factory_kwargs=factory_kwargs)
+
+    def forward(self, input, offsets=None, per_sample_weights=None) -> Tensor:
+        return F.embedding_bag(input, self.weight_fake_quant(self.weight), offsets,
+                               self.max_norm, self.norm_type,
+                               self.scale_grad_by_freq, self.mode, self.sparse,
+                               per_sample_weights, self.include_last_offset,
+                               self.padding_idx)
+
+    @classmethod
+    def from_float(cls, mod):
+        r"""Create a qat module from a float module
+
+            Args: `mod` a float module, either produced by torch.ao.quantization utilities
+            or directly from the user
+        """
+        assert type(mod) == cls._FLOAT_MODULE, ' qat.' + cls.__name__ + '.from_float only works for ' + \
+            cls._FLOAT_MODULE.__name__
+        assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined'
+        assert mod.qconfig, 'Input float module must have a valid qconfig'
+        weight_qscheme = mod.qconfig.weight().qscheme  # type: ignore[union-attr, operator]
+        assert weight_qscheme == torch.per_channel_affine_float_qparams, \
+            'Embedding Bag weights requires a qscheme of torch.per_channel_affine_float_qparams Got ' + \
+            str(weight_qscheme)
+
+        qconfig = mod.qconfig
+        qat_embedding_bag = cls(mod.num_embeddings, mod.embedding_dim, mod.max_norm, mod.norm_type,
+                                mod.scale_grad_by_freq, mod.mode, mod.sparse, mod.weight,
+                                mod.include_last_offset, mod.padding_idx, qconfig=qconfig)
+
+        return qat_embedding_bag
+
+    def to_float(self):
+        embedding_bag = torch.nn.EmbeddingBag(self.num_embeddings, self.embedding_dim, self.max_norm,
+                                              self.norm_type, self.scale_grad_by_freq, self.mode, self.sparse,
+                                              None, self.include_last_offset, self.padding_idx)
+        embedding_bag.weight = torch.nn.Parameter(self.weight.detach())
+        embedding_bag.train(self.training)
+        return embedding_bag
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/qat/modules/linear.py b/MLPY/Lib/site-packages/torch/ao/nn/qat/modules/linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..7986f0c9f5237d3eef6f6b4717c7593e290828e4
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/qat/modules/linear.py
@@ -0,0 +1,81 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.ao.nn.intrinsic import LinearReLU
+from torch.nn.utils.parametrize import (
+    is_parametrized,
+    type_before_parametrizations,
+    transfer_parametrizations_and_params,
+)
+
+__all__ = [
+    "Linear"
+]
+
+class Linear(nn.Linear):
+    r"""
+    A linear module attached with FakeQuantize modules for weight,
+    used for quantization aware training.
+
+    We adopt the same interface as `torch.nn.Linear`, please see
+    https://pytorch.org/docs/stable/nn.html#torch.nn.Linear
+    for documentation.
+
+    Similar to `torch.nn.Linear`, with FakeQuantize modules initialized to
+    default.
+
+    Attributes:
+        weight: fake quant module for weight
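+
+    Example (an illustrative sketch; the fbgemm default QAT qconfig is only one
+    possible choice of weight fake-quant)::
+
+        >>> # xdoctest: +SKIP
+        >>> qconfig = torch.ao.quantization.get_default_qat_qconfig('fbgemm')
+        >>> m = torch.ao.nn.qat.Linear(20, 30, qconfig=qconfig)
+        >>> output = m(torch.randn(128, 20))
+        >>> print(output.size())
+        torch.Size([128, 30])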
+    """
+    _FLOAT_MODULE = nn.Linear
+
+    def __init__(self, in_features, out_features, bias=True,
+                 qconfig=None, device=None, dtype=None) -> None:
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        super().__init__(in_features, out_features, bias, **factory_kwargs)
+        assert qconfig, 'qconfig must be provided for QAT module'
+        self.qconfig = qconfig
+        self.weight_fake_quant = qconfig.weight(factory_kwargs=factory_kwargs)
+
+    def forward(self, input):
+        return F.linear(input, self.weight_fake_quant(self.weight), self.bias)
+
+    @classmethod
+    def from_float(cls, mod):
+        r"""Create a qat module from a float module or qparams_dict
+            Args: `mod` a float module, either produced by torch.ao.quantization utilities
+            or directly from user
+        """
+        assert type_before_parametrizations(mod) == cls._FLOAT_MODULE, (
+            " qat."
+            + cls.__name__
+            + ".from_float only works for "
+            + cls._FLOAT_MODULE.__name__
+        )
+        assert hasattr(mod, "qconfig"), "Input float module must have qconfig defined"
+        assert mod.qconfig, "Input float module must have a valid qconfig"
+        if type_before_parametrizations(mod) == LinearReLU:
+            mod = mod[0]
+
+        qconfig = mod.qconfig
+        qat_linear = cls(mod.in_features, mod.out_features, bias=mod.bias is not None, qconfig=qconfig)
+
+        if is_parametrized(mod, "weight"):
+            transfer_parametrizations_and_params(mod, qat_linear, "weight")
+        else:
+            qat_linear.weight = mod.weight
+
+        if is_parametrized(mod, "bias"):
+            transfer_parametrizations_and_params(mod, qat_linear, "bias")
+        else:
+            qat_linear.bias = mod.bias
+
+        return qat_linear
+
+    def to_float(self):
+        linear = torch.nn.Linear(self.in_features, self.out_features, self.bias is not None)
+        linear.weight = torch.nn.Parameter(self.weight.detach())
+        if self.bias is not None:
+            linear.bias = torch.nn.Parameter(self.bias.detach())
+        linear.train(self.training)
+        return linear
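
For context, a minimal sketch of how the QAT Linear above is typically created and converted back. In practice `torch.ao.quantization.prepare_qat` performs this swap; calling `from_float` directly just shows the contract. The fbgemm backend choice is an assumption.

    import torch
    import torch.nn as nn
    from torch.ao.nn.qat import Linear as QATLinear
    from torch.ao.quantization import get_default_qat_qconfig

    # Hypothetical illustration, not part of the file above.
    float_mod = nn.Linear(8, 4)
    float_mod.qconfig = get_default_qat_qconfig("fbgemm")  # assumed backend

    qat_mod = QATLinear.from_float(float_mod)  # weight passes through weight_fake_quant in forward()
    y = qat_mod(torch.randn(2, 8))

    restored = qat_mod.to_float()              # plain nn.Linear with detached weight/bias
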
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantizable/__init__.py b/MLPY/Lib/site-packages/torch/ao/nn/quantizable/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebc6df8afce25c62a5707136bc46cab16c49a83c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/quantizable/__init__.py
@@ -0,0 +1 @@
+from .modules import *  # noqa: F403
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantizable/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/quantizable/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..68685e82dbb1a5a5a23f6c3415982ef3bf9dea66
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/quantizable/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantizable/modules/__init__.py b/MLPY/Lib/site-packages/torch/ao/nn/quantizable/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e55197cb4e3a166e09790606b7503224705b0c2
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/quantizable/modules/__init__.py
@@ -0,0 +1,9 @@
+from .activation import MultiheadAttention
+from .rnn import LSTM
+from .rnn import LSTMCell
+
+__all__ = [
+    'LSTM',
+    'LSTMCell',
+    'MultiheadAttention',
+]
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantizable/modules/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/quantizable/modules/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5f074ffc402f90e2b90838a04be3e108352356ca
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/quantizable/modules/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantizable/modules/__pycache__/activation.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/quantizable/modules/__pycache__/activation.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..42964880df47e392ed82199e77d530624c23ae34
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/quantizable/modules/__pycache__/activation.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantizable/modules/__pycache__/rnn.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/quantizable/modules/__pycache__/rnn.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c90913c7b8d712de624e148c54efa6a83165e592
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/quantizable/modules/__pycache__/rnn.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantizable/modules/activation.py b/MLPY/Lib/site-packages/torch/ao/nn/quantizable/modules/activation.py
new file mode 100644
index 0000000000000000000000000000000000000000..90975fbb3de65e9556cc71533abb1b7c03e7a304
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/quantizable/modules/activation.py
@@ -0,0 +1,465 @@
+import torch
+import torch.jit  # this is needed to avoid a circular import
+from torch import nn
+import torch.nn.functional as nnF
+
+from torch import Tensor
+from typing import Optional, Tuple
+
+import warnings
+
+__all__ = [
+    "MultiheadAttention"
+]
+
+class MultiheadAttention(nn.MultiheadAttention):
+    _FLOAT_MODULE = nn.MultiheadAttention
+
+    r"""Quantizable implementation of the MultiheadAttention.
+
+    Note::
+        Please, refer to :class:`~torch.nn.MultiheadAttention` for more
+        information
+
+    Allows the model to jointly attend to information from different
+    representation subspaces.
+    See reference: Attention Is All You Need
+
+    The original MHA module is not quantizable.
+    This reimplements it by explicitly instantiating the linear layers.
+
+    .. math::
+        \text{MultiHead}(Q, K, V) = \text{Concat}(head_1, \dots, head_h)W^O
+        \text{where } head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)
+
+    Args:
+        embed_dim: total dimension of the model.
+        num_heads: parallel attention heads.
+        dropout: a Dropout layer on attn_output_weights. Default: 0.0.
+        bias: add bias as module parameter. Default: True.
+        add_bias_kv: add bias to the key and value sequences at dim=0.
+        add_zero_attn: add a new batch of zeros to the key and
+                       value sequences at dim=1.
+        kdim: total number of features in key. Default: None.
+        vdim: total number of features in value. Default: None.
+        batch_first: If ``True``, then the input and output tensors are provided
+            as (batch, seq, feature). Default: ``False`` (seq, batch, feature).
+
+    Note that if :attr:`kdim` and :attr:`vdim` are None, they will be set
+    to :attr:`embed_dim` such that query, key, and value have the same
+    number of features.
+
+    Examples::
+
+        >>> import torch.ao.nn.quantizable as nnqa
+        >>> multihead_attn = nnqa.MultiheadAttention(embed_dim, num_heads)
+        >>> attn_output, attn_output_weights = multihead_attn(query, key, value)
+
+    Note::
+        Please, follow the quantization flow to convert the quantizable MHA.
+    """
+    __constants__ = ['batch_first']
+
+    def __init__(self, embed_dim: int, num_heads: int,
+                 dropout: float = 0., bias: bool = True,
+                 add_bias_kv: bool = False, add_zero_attn: bool = False,
+                 kdim: Optional[int] = None, vdim: Optional[int] = None, batch_first: bool = False,
+                 device=None, dtype=None) -> None:
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        super().__init__(embed_dim, num_heads, dropout,
+                         bias, add_bias_kv,
+                         add_zero_attn, kdim, vdim, batch_first,
+                         **factory_kwargs)
+        self.linear_Q = nn.Linear(self.embed_dim, self.embed_dim, bias=bias, **factory_kwargs)
+        self.linear_K = nn.Linear(self.kdim, self.embed_dim, bias=bias, **factory_kwargs)
+        self.linear_V = nn.Linear(self.vdim, self.embed_dim, bias=bias, **factory_kwargs)
+        # for the type: ignore, see https://github.com/pytorch/pytorch/issues/58969
+        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=bias, **factory_kwargs)  # type: ignore[assignment]
+
+        # Functionals
+        self.q_scaling_product = torch.ao.nn.quantized.FloatFunctional()
+        # note: importing torch.ao.nn.quantized at top creates a circular import
+
+        # Quant/Dequant
+        self.quant_attn_output = torch.ao.quantization.QuantStub()
+        self.quant_attn_output_weights = torch.ao.quantization.QuantStub()
+        self.dequant_q = torch.ao.quantization.DeQuantStub()
+        self.dequant_k = torch.ao.quantization.DeQuantStub()
+        self.dequant_v = torch.ao.quantization.DeQuantStub()
+
+    def _get_name(self):
+        return 'QuantizableMultiheadAttention'
+
+    @classmethod
+    def from_float(cls, other):
+        assert type(other) == cls._FLOAT_MODULE
+        assert hasattr(other, 'qconfig'), "The float module must have 'qconfig'"
+        # Note: dropout is carried over from the float module below (it is not reset to 0.0)
+        observed = cls(other.embed_dim, other.num_heads, other.dropout,
+                       (other.in_proj_bias is not None),
+                       (other.bias_k is not None),
+                       other.add_zero_attn, other.kdim, other.vdim,
+                       other.batch_first)
+        observed.bias_k = other.bias_k
+        observed.bias_v = other.bias_v
+        observed.qconfig = other.qconfig
+
+        # Set the linear weights
+        # for the type: ignores, see https://github.com/pytorch/pytorch/issues/58969
+        observed.out_proj.weight = other.out_proj.weight  # type: ignore[has-type]
+        observed.out_proj.bias = other.out_proj.bias  # type: ignore[has-type]
+        if other._qkv_same_embed_dim:
+            # Slice the packed in_proj parameters into separate Q/K/V params
+            bias = other.in_proj_bias
+            _start = 0
+            _end = _start + other.embed_dim
+            weight = other.in_proj_weight[_start:_end, :]
+            if bias is not None:
+                bias = torch.nn.Parameter(bias[_start:_end], bias.requires_grad)
+            observed.linear_Q.weight = torch.nn.Parameter(weight,
+                                                          weight.requires_grad)
+            observed.linear_Q.bias = bias
+
+            bias = other.in_proj_bias
+            _start = _end
+            _end = _start + other.embed_dim
+            weight = other.in_proj_weight[_start:_end, :]
+            if bias is not None:
+                bias = torch.nn.Parameter(bias[_start:_end], bias.requires_grad)
+            observed.linear_K.weight = torch.nn.Parameter(weight,
+                                                          weight.requires_grad)
+            observed.linear_K.bias = bias
+
+            bias = other.in_proj_bias
+            _start = _end
+            weight = other.in_proj_weight[_start:, :]
+            if bias is not None:
+                bias = torch.nn.Parameter(bias[_start:], bias.requires_grad)
+            observed.linear_V.weight = torch.nn.Parameter(weight,
+                                                          weight.requires_grad)
+            observed.linear_V.bias = bias
+        else:
+            observed.linear_Q.weight = nn.Parameter(other.q_proj_weight)
+            observed.linear_K.weight = nn.Parameter(other.k_proj_weight)
+            observed.linear_V.weight = nn.Parameter(other.v_proj_weight)
+            if other.in_proj_bias is None:
+                observed.linear_Q.bias = None  # type: ignore[assignment]
+                observed.linear_K.bias = None  # type: ignore[assignment]
+                observed.linear_V.bias = None  # type: ignore[assignment]
+            else:
+                observed.linear_Q.bias = nn.Parameter(other.in_proj_bias[0:other.embed_dim])
+                observed.linear_K.bias = nn.Parameter(other.in_proj_bias[other.embed_dim:(other.embed_dim * 2)])
+                observed.linear_V.bias = nn.Parameter(other.in_proj_bias[(other.embed_dim * 2):])
+        observed.eval()
+        # Explicit prepare
+        observed = torch.ao.quantization.prepare(observed, inplace=True)
+        return observed
+
+    @torch.jit.unused
+    def dequantize(self):
+        r"""Utility to convert the quantized MHA back to float.
+
+        The motivation for this is that it is not trivial to convert the weights
+        from the format that is used in the quantized version back to the
+        float format.
+        """
+        fp = self._FLOAT_MODULE(self.embed_dim, self.num_heads, self.dropout,
+                                (self.linear_Q._weight_bias()[1] is not None),
+                                (self.bias_k is not None),
+                                self.add_zero_attn, self.kdim, self.vdim, self.batch_first)
+        assert fp._qkv_same_embed_dim == self._qkv_same_embed_dim
+        if self.bias_k is not None:
+            fp.bias_k = nn.Parameter(self.bias_k.dequantize())
+        if self.bias_v is not None:
+            fp.bias_v = nn.Parameter(self.bias_v.dequantize())
+
+        # Set the linear weights
+        # Note: Because the linear layers are quantized, mypy does not know how
+        # to deal with them -- might need to ignore the typing checks.
+        # for the type: ignore[has-type], see https://github.com/pytorch/pytorch/issues/58969
+        w, b = self.out_proj._weight_bias()  # type: ignore[operator, has-type]
+        fp.out_proj.weight = nn.Parameter(w.dequantize())
+        if b is not None:
+            fp.out_proj.bias = nn.Parameter(b)
+
+        wQ, bQ = self.linear_Q._weight_bias()  # type: ignore[operator]
+        wQ = wQ.dequantize()
+        wK, bK = self.linear_K._weight_bias()  # type: ignore[operator]
+        wK = wK.dequantize()
+        wV, bV = self.linear_V._weight_bias()  # type: ignore[operator]
+        wV = wV.dequantize()
+        if fp._qkv_same_embed_dim:
+            # Copy the dequantized Q/K/V weights back into the packed in_proj parameters
+            _start = 0
+            _end = _start + fp.embed_dim
+            fp.in_proj_weight[_start:_end, :] = wQ
+            if fp.in_proj_bias is not None:
+                assert all(bQ == 0)
+                fp.in_proj_bias[_start:_end] = bQ
+
+            _start = _end
+            _end = _start + fp.embed_dim
+            fp.in_proj_weight[_start:_end, :] = wK
+            if fp.in_proj_bias is not None:
+                assert all(bK == 0)
+                fp.in_proj_bias[_start:_end] = bK
+
+            _start = _end
+            fp.in_proj_weight[_start:, :] = wV
+            if fp.in_proj_bias is not None:
+                assert all(bV == 0)
+                fp.in_proj_bias[_start:] = bV
+        else:
+            fp.q_proj_weight = nn.Parameter(wQ)
+            fp.k_proj_weight = nn.Parameter(wK)
+            fp.v_proj_weight = nn.Parameter(wV)
+            if fp.in_proj_bias is None:
+                self.linear_Q.bias = None
+                self.linear_K.bias = None
+                self.linear_V.bias = None
+            else:
+                fp.in_proj_bias[0:fp.embed_dim] = bQ
+                fp.in_proj_bias[fp.embed_dim:(fp.embed_dim * 2)] = bK
+                fp.in_proj_bias[(fp.embed_dim * 2):] = bV
+
+        return fp
+
+
+    @classmethod
+    def from_observed(cls, other):
+        # The whole flow is float -> observed -> quantized
+        # This class does float -> observed only
+        # See nn.quantized.MultiheadAttention
+        raise NotImplementedError("It looks like you are trying to prepare an "
+                                  "MHA module. Please, see "
+                                  "the examples on quantizable MHAs.")
+
+    def forward(self,
+                query: Tensor,
+                key: Tensor,
+                value: Tensor,
+                key_padding_mask: Optional[Tensor] = None,
+                need_weights: bool = True,
+                attn_mask: Optional[Tensor] = None,
+                average_attn_weights: bool = True,
+                is_causal: bool = False) -> Tuple[Tensor, Optional[Tensor]]:
+        r"""
+    Note::
+        Please, refer to :func:`~torch.nn.MultiheadAttention.forward` for more
+        information
+
+    Args:
+        query, key, value: map a query and a set of key-value pairs to an output.
+            See "Attention Is All You Need" for more details.
+        key_padding_mask: if provided, specified padding elements in the key will
+            be ignored by the attention. When given a binary mask and a value is True,
+            the corresponding value on the attention layer will be ignored.
+        need_weights: output attn_output_weights.
+        attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
+            the batches while a 3D mask allows to specify a different mask for the entries of each batch.
+
+    Shape:
+        - Inputs:
+        - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
+          the embedding dimension. :math:`(N, L, E)` if ``batch_first`` is ``True``.
+        - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
+          the embedding dimension. :math:`(N, S, E)` if ``batch_first`` is ``True``.
+        - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
+          the embedding dimension. :math:`(N, S, E)` if ``batch_first`` is ``True``.
+        - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length.
+          If a BoolTensor is provided, the positions with the
+          value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
+        - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
+          3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,
+          S is the source sequence length. attn_mask ensures that position i is allowed to attend to the unmasked
+          positions. If a BoolTensor is provided, positions with ``True``
+          are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
+          is provided, it will be added to the attention weight.
+        - is_causal: If specified, applies a causal mask as attention mask. Mutually exclusive with providing attn_mask.
+          Default: ``False``.
+        - average_attn_weights: If true, indicates that the returned ``attn_weights`` should be averaged across
+          heads. Otherwise, ``attn_weights`` are provided separately per head. Note that this flag only has an
+          effect when ``need_weights=True``. Default: True (i.e. average weights across heads)
+
+        - Outputs:
+        - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
+          E is the embedding dimension. :math:`(N, L, E)` if ``batch_first`` is ``True``.
+        - attn_output_weights: If ``average_attn_weights=True``, returns attention weights averaged
+          across heads of shape :math:`(N, L, S)`, where N is the batch size, L is the target sequence length,
+          S is the source sequence length. If ``average_attn_weights=False``, returns attention weights per
+          head of shape :math:`(N, num_heads, L, S)`.
+        """
+        return self._forward_impl(query, key, value, key_padding_mask,
+                                  need_weights, attn_mask, average_attn_weights,
+                                  is_causal)
+
+    def _forward_impl(self,
+                      query: Tensor,
+                      key: Tensor,
+                      value: Tensor,
+                      key_padding_mask: Optional[Tensor] = None,
+                      need_weights: bool = True,
+                      attn_mask: Optional[Tensor] = None,
+                      average_attn_weights: bool = True,
+                      is_causal: bool = False) -> Tuple[Tensor, Optional[Tensor]]:
+        # This version will not deal with the static key/value pairs.
+        # Keeping it here for future changes.
+        #
+        # TODO: This method has some duplicate lines with the
+        # `torch.nn.functional.multi_head_attention_forward`. Will need to refactor.
+        static_k = None
+        static_v = None
+
+        if attn_mask is not None and is_causal:
+            raise AssertionError("Only allow causal mask or attn_mask")
+
+        if is_causal:
+            raise AssertionError("causal mask not supported by AO MHA module")
+
+        if self.batch_first:
+            query, key, value = (x.transpose(0, 1) for x in (query, key, value))
+
+        tgt_len, bsz, embed_dim_to_check = query.size()
+        assert self.embed_dim == embed_dim_to_check
+        # allow MHA to have different sizes for the feature dimension
+        assert key.size(0) == value.size(0) and key.size(1) == value.size(1)
+
+        head_dim = self.embed_dim // self.num_heads
+        assert head_dim * self.num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
+        scaling = float(head_dim) ** -0.5
+
+        q = self.linear_Q(query)
+        k = self.linear_K(key)
+        v = self.linear_V(value)
+
+        q = self.q_scaling_product.mul_scalar(q, scaling)
+
+        if attn_mask is not None:
+            if attn_mask.dtype == torch.uint8:
+                warnings.warn("Byte tensor for attn_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
+                attn_mask = attn_mask.to(torch.bool)
+            assert attn_mask.is_floating_point() or attn_mask.dtype == torch.bool, \
+                f'Only float and bool types are supported for attn_mask, not {attn_mask.dtype}'
+
+            if attn_mask.dim() == 2:
+                attn_mask = attn_mask.unsqueeze(0)
+                if list(attn_mask.size()) != [1, query.size(0), key.size(0)]:
+                    raise RuntimeError('The size of the 2D attn_mask is not correct.')
+            elif attn_mask.dim() == 3:
+                if list(attn_mask.size()) != [bsz * self.num_heads, query.size(0), key.size(0)]:
+                    raise RuntimeError('The size of the 3D attn_mask is not correct.')
+            else:
+                raise RuntimeError(f"attn_mask's dimension {attn_mask.dim()} is not supported")
+            # attn_mask's dim is 3 now.
+
+        # convert ByteTensor key_padding_mask to bool
+        if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8:
+            warnings.warn("Byte tensor for key_padding_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
+            key_padding_mask = key_padding_mask.to(torch.bool)
+        if self.bias_k is not None and self.bias_v is not None:
+            if static_k is None and static_v is None:
+
+                # Explicitly assert that bias_k and bias_v are not None
+                # in a way that TorchScript can understand.
+                bias_k = self.bias_k
+                assert bias_k is not None
+                bias_v = self.bias_v
+                assert bias_v is not None
+
+                k = torch.cat([k, bias_k.repeat(1, bsz, 1)])
+                v = torch.cat([v, bias_v.repeat(1, bsz, 1)])
+                if attn_mask is not None:
+                    attn_mask = nnF.pad(attn_mask, (0, 1))
+                if key_padding_mask is not None:
+                    key_padding_mask = nnF.pad(key_padding_mask, (0, 1))
+            else:
+                assert static_k is None, "bias cannot be added to static key."
+                assert static_v is None, "bias cannot be added to static value."
+        else:
+            assert self.bias_k is None
+            assert self.bias_v is None
+
+        q = q.contiguous().view(tgt_len, bsz * self.num_heads, head_dim).transpose(0, 1)
+        if k is not None:
+            k = k.contiguous().view(-1, bsz * self.num_heads, head_dim).transpose(0, 1)
+        if v is not None:
+            v = v.contiguous().view(-1, bsz * self.num_heads, head_dim).transpose(0, 1)
+
+        if static_k is not None:
+            assert static_k.size(0) == bsz * self.num_heads
+            assert static_k.size(2) == head_dim
+            k = static_k
+
+        if static_v is not None:
+            assert static_v.size(0) == bsz * self.num_heads
+            assert static_v.size(2) == head_dim
+            v = static_v
+
+        src_len = k.size(1)
+
+        if key_padding_mask is not None:
+            assert key_padding_mask.size(0) == bsz
+            assert key_padding_mask.size(1) == src_len
+
+        if self.add_zero_attn:
+            src_len += 1
+            k_zeros = torch.zeros((k.size(0), 1) + k.size()[2:])
+            if k.is_quantized:
+                k_zeros = torch.quantize_per_tensor(k_zeros, k.q_scale(), k.q_zero_point(), k.dtype)
+            k = torch.cat([k, k_zeros], dim=1)
+            v_zeros = torch.zeros((v.size(0), 1) + v.size()[2:])
+            if v.is_quantized:
+                v_zeros = torch.quantize_per_tensor(v_zeros, v.q_scale(), v.q_zero_point(), v.dtype)
+            v = torch.cat([v, v_zeros], dim=1)
+
+            if attn_mask is not None:
+                attn_mask = nnF.pad(attn_mask, (0, 1))
+            if key_padding_mask is not None:
+                key_padding_mask = nnF.pad(key_padding_mask, (0, 1))
+
+        # Leaving the quantized zone here
+        q = self.dequant_q(q)
+        k = self.dequant_k(k)
+        v = self.dequant_v(v)
+        attn_output_weights = torch.bmm(q, k.transpose(1, 2))
+        assert list(attn_output_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]
+
+        if attn_mask is not None:
+            if attn_mask.dtype == torch.bool:
+                attn_output_weights.masked_fill_(attn_mask, float('-inf'))
+            else:
+                attn_output_weights += attn_mask
+
+        if key_padding_mask is not None:
+            attn_output_weights = attn_output_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_output_weights = attn_output_weights.masked_fill(
+                key_padding_mask.unsqueeze(1).unsqueeze(2),
+                float('-inf'),
+            )
+            attn_output_weights = attn_output_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        attn_output_weights = nnF.softmax(
+            attn_output_weights, dim=-1)
+        attn_output_weights = nnF.dropout(attn_output_weights, p=self.dropout, training=self.training)
+
+        attn_output = torch.bmm(attn_output_weights, v)
+        assert list(attn_output.size()) == [bsz * self.num_heads, tgt_len, head_dim]
+        if self.batch_first:
+            attn_output = attn_output.view(bsz, tgt_len, self.embed_dim)
+        else:
+            attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, self.embed_dim)
+
+        # Reentering the quantized zone
+        attn_output = self.quant_attn_output(attn_output)
+        # for the type: ignore[has-type], see https://github.com/pytorch/pytorch/issues/58969
+        attn_output = self.out_proj(attn_output)  # type: ignore[has-type]
+        attn_output_weights = self.quant_attn_output_weights(attn_output_weights)
+
+        if need_weights:
+            # average attention weights over heads
+            attn_output_weights = attn_output_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            if average_attn_weights:
+                attn_output_weights = attn_output_weights.mean(dim=1)
+            return attn_output, attn_output_weights
+        else:
+            return attn_output, None
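
For context, a rough sketch of the float -> observed step that the class comments above describe for the quantizable MultiheadAttention; converting the observed module to `torch.ao.nn.quantized.MultiheadAttention` is then handled by the regular `convert` step on the enclosing model. The qconfig, backend, and tensor shapes below are assumptions for illustration.

    import torch
    import torch.nn as nn
    import torch.ao.nn.quantizable as nnqa
    from torch.ao.quantization import get_default_qconfig

    # Hypothetical illustration, not part of the file above.
    embed_dim, num_heads = 16, 4
    float_mha = nn.MultiheadAttention(embed_dim, num_heads)
    float_mha.qconfig = get_default_qconfig("fbgemm")  # assumed backend

    observed = nnqa.MultiheadAttention.from_float(float_mha)  # also runs prepare() on the result

    # Calibration pass with representative data: (seq_len, batch, embed_dim)
    q = k = v = torch.randn(5, 2, embed_dim)
    attn_out, attn_weights = observed(q, k, v)
    # torch.ao.quantization.convert on the enclosing model would then swap this
    # module for torch.ao.nn.quantized.MultiheadAttention.
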
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantizable/modules/rnn.py b/MLPY/Lib/site-packages/torch/ao/nn/quantizable/modules/rnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..eec4a532e548abc536a534dee906f06d454cd71a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/quantizable/modules/rnn.py
@@ -0,0 +1,411 @@
+import numbers
+from typing import Optional, Tuple
+import warnings
+
+import torch
+from torch import Tensor
+
+"""
+We will recreate all the RNN modules here, as we require them to be decomposed
+into their building blocks so that they can be observed.
+"""
+
+__all__ = [
+    "LSTMCell",
+    "LSTM"
+]
+
+class LSTMCell(torch.nn.Module):
+    r"""A quantizable long short-term memory (LSTM) cell.
+
+    For the description and the argument types, please, refer to :class:`~torch.nn.LSTMCell`
+
+    Examples::
+
+        >>> import torch.ao.nn.quantizable as nnqa
+        >>> rnn = nnqa.LSTMCell(10, 20)
+        >>> input = torch.randn(6, 10)
+        >>> hx = torch.randn(3, 20)
+        >>> cx = torch.randn(3, 20)
+        >>> output = []
+        >>> for i in range(6):
+        ...     hx, cx = rnn(input[i], (hx, cx))
+        ...     output.append(hx)
+    """
+    _FLOAT_MODULE = torch.nn.LSTMCell
+
+    def __init__(self, input_dim: int, hidden_dim: int, bias: bool = True,
+                 device=None, dtype=None) -> None:
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        super().__init__()
+        self.input_size = input_dim
+        self.hidden_size = hidden_dim
+        self.bias = bias
+
+        self.igates = torch.nn.Linear(input_dim, 4 * hidden_dim, bias=bias, **factory_kwargs)
+        self.hgates = torch.nn.Linear(hidden_dim, 4 * hidden_dim, bias=bias, **factory_kwargs)
+        self.gates = torch.ao.nn.quantized.FloatFunctional()
+
+        self.input_gate = torch.nn.Sigmoid()
+        self.forget_gate = torch.nn.Sigmoid()
+        self.cell_gate = torch.nn.Tanh()
+        self.output_gate = torch.nn.Sigmoid()
+
+        self.fgate_cx = torch.ao.nn.quantized.FloatFunctional()
+        self.igate_cgate = torch.ao.nn.quantized.FloatFunctional()
+        self.fgate_cx_igate_cgate = torch.ao.nn.quantized.FloatFunctional()
+
+        self.ogate_cy = torch.ao.nn.quantized.FloatFunctional()
+
+        self.initial_hidden_state_qparams: Tuple[float, int] = (1.0, 0)
+        self.initial_cell_state_qparams: Tuple[float, int] = (1.0, 0)
+        self.hidden_state_dtype: torch.dtype = torch.quint8
+        self.cell_state_dtype: torch.dtype = torch.quint8
+
+    def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None) -> Tuple[Tensor, Tensor]:
+        if hidden is None or hidden[0] is None or hidden[1] is None:
+            hidden = self.initialize_hidden(x.shape[0], x.is_quantized)
+        hx, cx = hidden
+
+        igates = self.igates(x)
+        hgates = self.hgates(hx)
+        gates = self.gates.add(igates, hgates)
+
+        input_gate, forget_gate, cell_gate, out_gate = gates.chunk(4, 1)
+
+        input_gate = self.input_gate(input_gate)
+        forget_gate = self.forget_gate(forget_gate)
+        cell_gate = self.cell_gate(cell_gate)
+        out_gate = self.output_gate(out_gate)
+
+        fgate_cx = self.fgate_cx.mul(forget_gate, cx)
+        igate_cgate = self.igate_cgate.mul(input_gate, cell_gate)
+        fgate_cx_igate_cgate = self.fgate_cx_igate_cgate.add(fgate_cx, igate_cgate)
+        cy = fgate_cx_igate_cgate
+
+        # TODO: make this tanh a member of the module so its qparams can be configured
+        tanh_cy = torch.tanh(cy)
+        hy = self.ogate_cy.mul(out_gate, tanh_cy)
+        return hy, cy
+
+    def initialize_hidden(self, batch_size: int, is_quantized: bool = False) -> Tuple[Tensor, Tensor]:
+        h, c = torch.zeros((batch_size, self.hidden_size)), torch.zeros((batch_size, self.hidden_size))
+        if is_quantized:
+            (h_scale, h_zp) = self.initial_hidden_state_qparams
+            (c_scale, c_zp) = self.initial_cell_state_qparams
+            h = torch.quantize_per_tensor(h, scale=h_scale, zero_point=h_zp, dtype=self.hidden_state_dtype)
+            c = torch.quantize_per_tensor(c, scale=c_scale, zero_point=c_zp, dtype=self.cell_state_dtype)
+        return h, c
+
+    def _get_name(self):
+        return 'QuantizableLSTMCell'
+
+    @classmethod
+    def from_params(cls, wi, wh, bi=None, bh=None):
+        """Uses the weights and biases to create a new LSTM cell.
+
+        Args:
+            wi, wh: Weights for the input and hidden layers
+            bi, bh: Biases for the input and hidden layers
+        """
+        assert (bi is None) == (bh is None)  # Either both None or both have values
+        input_size = wi.shape[1]
+        hidden_size = wh.shape[1]
+        cell = cls(input_dim=input_size, hidden_dim=hidden_size,
+                   bias=(bi is not None))
+        cell.igates.weight = torch.nn.Parameter(wi)
+        if bi is not None:
+            cell.igates.bias = torch.nn.Parameter(bi)
+        cell.hgates.weight = torch.nn.Parameter(wh)
+        if bh is not None:
+            cell.hgates.bias = torch.nn.Parameter(bh)
+        return cell
+
+    @classmethod
+    def from_float(cls, other):
+        assert type(other) == cls._FLOAT_MODULE
+        assert hasattr(other, 'qconfig'), "The float module must have 'qconfig'"
+        observed = cls.from_params(other.weight_ih, other.weight_hh,
+                                   other.bias_ih, other.bias_hh)
+        observed.qconfig = other.qconfig
+        observed.igates.qconfig = other.qconfig
+        observed.hgates.qconfig = other.qconfig
+        return observed
+
+
+class _LSTMSingleLayer(torch.nn.Module):
+    r"""A single one-directional LSTM layer.
+
+    The difference between a layer and a cell is that the layer can process a
+    sequence, while the cell only expects an instantaneous value.
+    """
+    def __init__(self, input_dim: int, hidden_dim: int, bias: bool = True,
+                 device=None, dtype=None) -> None:
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        super().__init__()
+        self.cell = LSTMCell(input_dim, hidden_dim, bias=bias, **factory_kwargs)
+
+    def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None):
+        result = []
+        seq_len = x.shape[0]
+        for i in range(seq_len):
+            hidden = self.cell(x[i], hidden)
+            result.append(hidden[0])  # type: ignore[index]
+        result_tensor = torch.stack(result, 0)
+        return result_tensor, hidden
+
+    @classmethod
+    def from_params(cls, *args, **kwargs):
+        cell = LSTMCell.from_params(*args, **kwargs)
+        layer = cls(cell.input_size, cell.hidden_size, cell.bias)
+        layer.cell = cell
+        return layer
+
+
+class _LSTMLayer(torch.nn.Module):
+    r"""A single bi-directional LSTM layer."""
+    def __init__(self, input_dim: int, hidden_dim: int, bias: bool = True,
+                 batch_first: bool = False, bidirectional: bool = False,
+                 device=None, dtype=None) -> None:
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        super().__init__()
+        self.batch_first = batch_first
+        self.bidirectional = bidirectional
+        self.layer_fw = _LSTMSingleLayer(input_dim, hidden_dim, bias=bias, **factory_kwargs)
+        if self.bidirectional:
+            self.layer_bw = _LSTMSingleLayer(input_dim, hidden_dim, bias=bias, **factory_kwargs)
+
+    def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None):
+        if self.batch_first:
+            x = x.transpose(0, 1)
+        if hidden is None:
+            hx_fw, cx_fw = (None, None)
+        else:
+            hx_fw, cx_fw = hidden
+        hidden_bw: Optional[Tuple[Tensor, Tensor]] = None
+        if self.bidirectional:
+            if hx_fw is None:
+                hx_bw = None
+            else:
+                hx_bw = hx_fw[1]
+                hx_fw = hx_fw[0]
+            if cx_fw is None:
+                cx_bw = None
+            else:
+                cx_bw = cx_fw[1]
+                cx_fw = cx_fw[0]
+            if hx_bw is not None and cx_bw is not None:
+                hidden_bw = hx_bw, cx_bw
+        if hx_fw is None and cx_fw is None:
+            hidden_fw = None
+        else:
+            hidden_fw = torch.jit._unwrap_optional(hx_fw), torch.jit._unwrap_optional(cx_fw)
+        result_fw, hidden_fw = self.layer_fw(x, hidden_fw)
+
+        if hasattr(self, 'layer_bw') and self.bidirectional:
+            x_reversed = x.flip(0)
+            result_bw, hidden_bw = self.layer_bw(x_reversed, hidden_bw)
+            result_bw = result_bw.flip(0)
+
+            result = torch.cat([result_fw, result_bw], result_fw.dim() - 1)
+            if hidden_fw is None and hidden_bw is None:
+                h = None
+                c = None
+            elif hidden_fw is None:
+                (h, c) = torch.jit._unwrap_optional(hidden_bw)
+            elif hidden_bw is None:
+                (h, c) = torch.jit._unwrap_optional(hidden_fw)
+            else:
+                h = torch.stack([hidden_fw[0], hidden_bw[0]], 0)  # type: ignore[list-item]
+                c = torch.stack([hidden_fw[1], hidden_bw[1]], 0)  # type: ignore[list-item]
+        else:
+            result = result_fw
+            h, c = torch.jit._unwrap_optional(hidden_fw)  # type: ignore[assignment]
+
+        if self.batch_first:
+            result.transpose_(0, 1)
+
+        return result, (h, c)
+
+    @classmethod
+    def from_float(cls, other, layer_idx=0, qconfig=None, **kwargs):
+        r"""
+        There is no FP equivalent of this class. This function is here just to
+        mimic the behavior of the `prepare` step within the `torch.ao.quantization`
+        flow.
+        """
+        assert hasattr(other, 'qconfig') or (qconfig is not None)
+
+        input_size = kwargs.get('input_size', other.input_size)
+        hidden_size = kwargs.get('hidden_size', other.hidden_size)
+        bias = kwargs.get('bias', other.bias)
+        batch_first = kwargs.get('batch_first', other.batch_first)
+        bidirectional = kwargs.get('bidirectional', other.bidirectional)
+
+        layer = cls(input_size, hidden_size, bias, batch_first, bidirectional)
+        layer.qconfig = getattr(other, 'qconfig', qconfig)
+        wi = getattr(other, f'weight_ih_l{layer_idx}')
+        wh = getattr(other, f'weight_hh_l{layer_idx}')
+        bi = getattr(other, f'bias_ih_l{layer_idx}', None)
+        bh = getattr(other, f'bias_hh_l{layer_idx}', None)
+
+        layer.layer_fw = _LSTMSingleLayer.from_params(wi, wh, bi, bh)
+
+        if other.bidirectional:
+            wi = getattr(other, f'weight_ih_l{layer_idx}_reverse')
+            wh = getattr(other, f'weight_hh_l{layer_idx}_reverse')
+            bi = getattr(other, f'bias_ih_l{layer_idx}_reverse', None)
+            bh = getattr(other, f'bias_hh_l{layer_idx}_reverse', None)
+            layer.layer_bw = _LSTMSingleLayer.from_params(wi, wh, bi, bh)
+        return layer
+
+
+class LSTM(torch.nn.Module):
+    r"""A quantizable long short-term memory (LSTM).
+
+    For the description and the argument types, please, refer to :class:`~torch.nn.LSTM`
+
+    Attributes:
+        layers : instances of the `_LSTMLayer`
+
+    .. note::
+        To access the weights and biases, you need to access them per layer.
+        See examples below.
+
+    Examples::
+
+        >>> import torch.ao.nn.quantizable as nnqa
+        >>> rnn = nnqa.LSTM(10, 20, 2)
+        >>> input = torch.randn(5, 3, 10)
+        >>> h0 = torch.randn(2, 3, 20)
+        >>> c0 = torch.randn(2, 3, 20)
+        >>> output, (hn, cn) = rnn(input, (h0, c0))
+        >>> # To get the weights:
+        >>> # xdoctest: +SKIP
+        >>> print(rnn.layers[0].weight_ih)
+        tensor([[...]])
+        >>> print(rnn.layers[0].weight_hh)
+        AssertionError: There is no reverse path in the non-bidirectional layer
+    """
+    _FLOAT_MODULE = torch.nn.LSTM
+
+    def __init__(self, input_size: int, hidden_size: int,
+                 num_layers: int = 1, bias: bool = True,
+                 batch_first: bool = False, dropout: float = 0.,
+                 bidirectional: bool = False,
+                 device=None, dtype=None) -> None:
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        super().__init__()
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+        self.bias = bias
+        self.batch_first = batch_first
+        self.dropout = float(dropout)
+        self.bidirectional = bidirectional
+        self.training = False  # Default to eval mode. If we want to train, we will explicitly set it to training mode.
+        num_directions = 2 if bidirectional else 1
+
+        if not isinstance(dropout, numbers.Number) or not 0 <= dropout <= 1 or \
+                isinstance(dropout, bool):
+            raise ValueError("dropout should be a number in range [0, 1] "
+                             "representing the probability of an element being "
+                             "zeroed")
+        if dropout > 0:
+            warnings.warn("dropout option for quantizable LSTM is ignored. "
+                          "If you are training, please, use nn.LSTM version "
+                          "followed by `prepare` step.")
+            if num_layers == 1:
+                warnings.warn("dropout option adds dropout after all but last "
+                              "recurrent layer, so non-zero dropout expects "
+                              f"num_layers greater than 1, but got dropout={dropout} "
+                              f"and num_layers={num_layers}")
+
+        layers = [_LSTMLayer(self.input_size, self.hidden_size,
+                             self.bias, batch_first=False,
+                             bidirectional=self.bidirectional, **factory_kwargs)]
+        for layer in range(1, num_layers):
+            layers.append(_LSTMLayer(self.hidden_size, self.hidden_size,
+                                     self.bias, batch_first=False,
+                                     bidirectional=self.bidirectional,
+                                     **factory_kwargs))
+        self.layers = torch.nn.ModuleList(layers)
+
+    def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None):
+        if self.batch_first:
+            x = x.transpose(0, 1)
+
+        max_batch_size = x.size(1)
+        num_directions = 2 if self.bidirectional else 1
+        if hidden is None:
+            zeros = torch.zeros(num_directions, max_batch_size,
+                                self.hidden_size, dtype=torch.float,
+                                device=x.device)
+            zeros.squeeze_(0)
+            if x.is_quantized:
+                zeros = torch.quantize_per_tensor(zeros, scale=1.0,
+                                                  zero_point=0, dtype=x.dtype)
+            hxcx = [(zeros, zeros) for _ in range(self.num_layers)]
+        else:
+            hidden_non_opt = torch.jit._unwrap_optional(hidden)
+            if isinstance(hidden_non_opt[0], Tensor):
+                hx = hidden_non_opt[0].reshape(self.num_layers, num_directions,
+                                               max_batch_size,
+                                               self.hidden_size)
+                cx = hidden_non_opt[1].reshape(self.num_layers, num_directions,
+                                               max_batch_size,
+                                               self.hidden_size)
+                hxcx = [(hx[idx].squeeze(0), cx[idx].squeeze(0)) for idx in range(self.num_layers)]
+            else:
+                hxcx = hidden_non_opt
+
+        hx_list = []
+        cx_list = []
+        for idx, layer in enumerate(self.layers):
+            x, (h, c) = layer(x, hxcx[idx])
+            hx_list.append(torch.jit._unwrap_optional(h))
+            cx_list.append(torch.jit._unwrap_optional(c))
+        hx_tensor = torch.stack(hx_list)
+        cx_tensor = torch.stack(cx_list)
+
+        # We are creating another dimension for bidirectional case
+        # need to collapse it
+        hx_tensor = hx_tensor.reshape(-1, hx_tensor.shape[-2], hx_tensor.shape[-1])
+        cx_tensor = cx_tensor.reshape(-1, cx_tensor.shape[-2], cx_tensor.shape[-1])
+
+        if self.batch_first:
+            x = x.transpose(0, 1)
+
+        return x, (hx_tensor, cx_tensor)
+
+    def _get_name(self):
+        return 'QuantizableLSTM'
+
+    @classmethod
+    def from_float(cls, other, qconfig=None):
+        assert isinstance(other, cls._FLOAT_MODULE)
+        assert (hasattr(other, 'qconfig') or qconfig)
+        observed = cls(other.input_size, other.hidden_size, other.num_layers,
+                       other.bias, other.batch_first, other.dropout,
+                       other.bidirectional)
+        observed.qconfig = getattr(other, 'qconfig', qconfig)
+        for idx in range(other.num_layers):
+            observed.layers[idx] = _LSTMLayer.from_float(other, idx, qconfig,
+                                                         batch_first=False)
+
+        # Prepare the model
+        if other.training:
+            observed.train()
+            observed = torch.ao.quantization.prepare_qat(observed, inplace=True)
+        else:
+            observed.eval()
+            observed = torch.ao.quantization.prepare(observed, inplace=True)
+        return observed
+
+    @classmethod
+    def from_observed(cls, other):
+        # The whole flow is float -> observed -> quantized
+        # This class does float -> observed only
+        raise NotImplementedError("It looks like you are trying to convert a "
+                                  "non-quantizable LSTM module. Please, see "
+                                  "the examples on quantizable LSTMs.")
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/__init__.py b/MLPY/Lib/site-packages/torch/ao/nn/quantized/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd57a11b784d7fc16ec305418426c1c23f6b0e39
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/quantized/__init__.py
@@ -0,0 +1,38 @@
+from . import functional
+from .modules import *  # noqa: F403
+from .modules import MaxPool2d
+
+__all__ = [
+    'BatchNorm2d',
+    'BatchNorm3d',
+    'Conv1d',
+    'Conv2d',
+    'Conv3d',
+    'ConvTranspose1d',
+    'ConvTranspose2d',
+    'ConvTranspose3d',
+    'DeQuantize',
+    'ELU',
+    'Embedding',
+    'EmbeddingBag',
+    'GroupNorm',
+    'Hardswish',
+    'InstanceNorm1d',
+    'InstanceNorm2d',
+    'InstanceNorm3d',
+    'LayerNorm',
+    'LeakyReLU',
+    'Linear',
+    'LSTM',
+    'MultiheadAttention',
+    'Quantize',
+    'ReLU6',
+    'Sigmoid',
+    'Softmax',
+    'Dropout',
+    'PReLU',
+    # Wrapper modules
+    'FloatFunctional',
+    'FXFloatFunctional',
+    'QFunctional',
+]
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/quantized/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e37528bdcf1a4ff91be2f8f9c77f0fdaffbbca8a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/quantized/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/__pycache__/functional.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/quantized/__pycache__/functional.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cf33864f6c48e4f2a914f687c382bd313f94565c
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/quantized/__pycache__/functional.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/dynamic/__init__.py b/MLPY/Lib/site-packages/torch/ao/nn/quantized/dynamic/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebc6df8afce25c62a5707136bc46cab16c49a83c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/quantized/dynamic/__init__.py
@@ -0,0 +1 @@
+from .modules import *  # noqa: F403
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/dynamic/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/quantized/dynamic/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b342ef72a0f1335167c79039010f765779dd2849
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/quantized/dynamic/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/dynamic/modules/__init__.py b/MLPY/Lib/site-packages/torch/ao/nn/quantized/dynamic/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0992314f6465b6e79721c91f323e4a98fbfbd18b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/quantized/dynamic/modules/__init__.py
@@ -0,0 +1,19 @@
+
+from .linear import Linear
+from .rnn import LSTM, GRU, LSTMCell, RNNCell, GRUCell
+from .conv import Conv1d, Conv2d, Conv3d, ConvTranspose1d, ConvTranspose2d, ConvTranspose3d
+
+__all__ = [
+    'Linear',
+    'LSTM',
+    'GRU',
+    'LSTMCell',
+    'RNNCell',
+    'GRUCell',
+    'Conv1d',
+    'Conv2d',
+    'Conv3d',
+    'ConvTranspose1d',
+    'ConvTranspose2d',
+    'ConvTranspose3d',
+]
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..97b32fce8c6cf380cf2d8a29428b1682d2b75f5d
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/conv.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/conv.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..19b99ec11b651a7eb4f93f05644b987f29d94d64
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/conv.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/linear.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/linear.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..12063b5dedb9af9083db51c28bc7c43c49c8645b
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/linear.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/rnn.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/rnn.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cf6665b83f1d77e22a8ac86fc201618adbbea501
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/quantized/dynamic/modules/__pycache__/rnn.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/dynamic/modules/conv.py b/MLPY/Lib/site-packages/torch/ao/nn/quantized/dynamic/modules/conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f9b078a18fc86ae4603396338ffbc774383b8b8
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/quantized/dynamic/modules/conv.py
@@ -0,0 +1,399 @@
+r"""Dynamically quantized convolution modules."""
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from torch import Tensor
+from torch._ops import ops
+from torch.nn.common_types import _size_1_t
+from torch.nn.modules.utils import _single, _pair, _triple
+from torch.ao.nn.quantized.modules.conv import _reverse_repeat_padding
+import torch.ao.nn.quantized as nnq
+import warnings
+
+__all__ = ['Conv1d', 'Conv2d', 'Conv3d', 'ConvTranspose1d', 'ConvTranspose2d', 'ConvTranspose3d']
+
+
+class Conv1d(nnq.Conv1d):
+    r"""A dynamically quantized conv module with floating point tensors as inputs and outputs.
+
+    For details on input arguments, parameters, and implementation see
+    :class:`~torch.nn.Conv1d` and :class:`~torch.ao.nn.quantized.Conv1d`.
+
+    Attributes:
+        weight (Tensor):     packed tensor derived from the learnable weight
+                             parameter.
+        scale (Tensor):      scalar for the output scale
+        zero_point (Tensor): scalar for the output zero point
+
+    See :class:`~torch.nn.Conv1d` for other attributes.
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> m = nn.quantized.dynamic.Conv1d(16, 33, 3, stride=2)
+        >>> input = torch.randn(20, 16, 100)
+        >>> output = m(input)
+
+    """
+
+    _FLOAT_MODULE = nn.Conv1d
+    _NNIQAT_CONV_BN_MODULE = None  # type: ignore[assignment]
+    _NNI_CONV_RELU_MODULE = None  # type: ignore[assignment]
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: _size_1_t,
+                 stride: _size_1_t = 1,
+                 padding: _size_1_t = 0,
+                 dilation: _size_1_t = 1,
+                 groups: int = 1,
+                 bias: bool = True,
+                 padding_mode: str = 'zeros',
+                 device=None,
+                 dtype=None,
+                 reduce_range=True):
+        warnings.warn(
+            "The current implementation of the {} module has poor numerical accuracy and its use is not recommended".format(
+                self._get_name()
+            )
+        )
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        kernel_size = _single(kernel_size)
+        stride = _single(stride)
+        padding = padding if isinstance(padding, str) else _single(padding)
+        dilation = _single(dilation)
+
+        super().__init__(
+            in_channels, out_channels, kernel_size, stride, padding, dilation,
+            groups, bias, padding_mode, **factory_kwargs)
+
+    def _get_name(self):
+        return 'DynamicQuantizedConv1d'
+
+    def forward(self, input: Tensor, reduce_range: bool = True) -> Tensor:
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 3:
+            raise ValueError("Input shape must be `(N, C, L)`!")
+        if self.padding_mode != 'zeros':
+            # Padding in Conv1d is stored as (p, p), need to get (p,)
+            _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding[:1])
+            input = F.pad(input, _reversed_padding_repeated_twice,
+                          mode=self.padding_mode)
+        return ops.quantized.conv1d_dynamic(input, self._packed_params, reduce_range)
+
+
+class Conv2d(nnq.Conv2d):
+    r"""A dynamically quantized conv module with floating point tensors as inputs and outputs.
+
+    For details on input arguments, parameters, and implementation see
+    :class:`~torch.nn.Conv2d` and :class:`~torch.ao.nn.quantized.Conv2d`.
+
+    Attributes:
+        weight (Tensor):     packed tensor derived from the learnable weight
+                             parameter.
+        scale (Tensor):      scalar for the output scale
+        zero_point (Tensor): scalar for the output zero point
+
+    See :class:`~torch.nn.Conv2d` for other attributes.
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> # With square kernels and equal stride
+        >>> m = nn.quantized.dynamic.Conv2d(16, 33, 3, stride=2)
+        >>> # non-square kernels and unequal stride and with padding
+        >>> m = nn.quantized.dynamic.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2))
+        >>> # non-square kernels and unequal stride and with padding and dilation
+        >>> m = nn.quantized.dynamic.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1))
+        >>> input = torch.randn(20, 16, 50, 100)
+        >>> output = m(input)
+
+    """
+    _FLOAT_MODULE = nn.Conv2d
+    _NNIQAT_CONV_BN_MODULE = None  # type: ignore[assignment]
+    _NNI_CONV_RELU_MODULE = None  # type: ignore[assignment]
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, dilation=1, groups=1, bias=True,
+                 padding_mode='zeros', device=None, dtype=None):
+        warnings.warn(
+            "The current implementation of the {} module has poor numerical accuracy and its use is not recommended".format(
+                self._get_name()
+            )
+        )
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        kernel_size = _pair(kernel_size)
+        stride = _pair(stride)
+        padding = _pair(padding)
+        dilation = _pair(dilation)
+
+        super().__init__(
+            in_channels, out_channels, kernel_size, stride, padding, dilation,
+            groups, bias, padding_mode, **factory_kwargs)
+
+    def _get_name(self):
+        return 'DynamicQuantizedConv2d'
+
+    def forward(self, input: Tensor, reduce_range: bool = True) -> Tensor:
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 4:
+            raise ValueError("Input shape must be `(N, C, H, W)`!")
+        if self.padding_mode != 'zeros':
+            _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding)
+            input = F.pad(input, _reversed_padding_repeated_twice,
+                          mode=self.padding_mode)
+        return ops.quantized.conv2d_dynamic(
+            input, self._packed_params, reduce_range)
+
+
+class Conv3d(nnq.Conv3d):
+    r"""A dynamically quantized conv module with floating point tensors as inputs and outputs.
+
+    For details on input arguments, parameters, and implementation see
+    :class:`~torch.nn.Conv3d` and :class:`~torch.ao.nn.quantized.dynamic.Conv3d`.
+
+    Attributes:
+        weight (Tensor):     packed tensor derived from the learnable weight
+                             parameter.
+        scale (Tensor):      scalar for the output scale
+        zero_point (Tensor): scalar for the output zero point
+
+    See :class:`~torch.nn.Conv3d` for other attributes.
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> # With square kernels and equal stride
+        >>> m = nn.quantized.dynamic.Conv3d(16, 33, 3, stride=2)
+        >>> # non-square kernels and unequal stride and with padding
+        >>> m = nn.quantized.dynamic.Conv3d(16, 33, (3, 5, 5), stride=(1, 2, 2), padding=(1, 2, 2))
+        >>> # non-square kernels and unequal stride and with padding and dilation
+        >>> m = nn.quantized.dynamic.Conv3d(16, 33, (3, 5, 5), stride=(1, 2, 2), padding=(1, 2, 2), dilation=(1, 2, 2))
+        >>> input = torch.randn(20, 16, 56, 56, 56)
+        >>> output = m(input)
+
+    """
+    _FLOAT_MODULE = nn.Conv3d
+    _NNIQAT_CONV_BN_MODULE = None  # type: ignore[assignment]
+    _NNI_CONV_RELU_MODULE = None  # type: ignore[assignment]
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, dilation=1, groups=1, bias=True,
+                 padding_mode='zeros', device=None, dtype=None):
+        warnings.warn(
+            "The current implementation of the {} module has poor numerical accuracy and its use is not recommended".format(
+                self._get_name()
+            )
+        )
+        assert padding_mode != 'reflect', "Conv3d does not support reflection padding"
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        kernel_size = _triple(kernel_size)
+        stride = _triple(stride)
+        padding = _triple(padding)
+        dilation = _triple(dilation)
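+        # nnq.Conv3d exposes _init() for subclasses; call it directly so that
+        # transposed=False and output_padding=_triple(0) can be passed explicitly
+        # along with the usual convolution parameters.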
+        super()._init(
+            in_channels, out_channels, kernel_size, stride, padding, dilation,
+            False, _triple(0), groups, bias, padding_mode, **factory_kwargs)
+
+    def _get_name(self):
+        return 'DynamicQuantizedConv3d'
+
+    def forward(self, input: Tensor, reduce_range: bool = True) -> Tensor:
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 5:
+            raise ValueError("Input shape must be `(N, C, D, H, W)`!")
+        if self.padding_mode != 'zeros':
+            _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding)
+            input = F.pad(input, _reversed_padding_repeated_twice,
+                          mode=self.padding_mode)
+        return ops.quantized.conv3d_dynamic(
+            input, self._packed_params, reduce_range)
+
+
+class ConvTranspose1d(nnq.ConvTranspose1d):
+    r"""A dynamically quantized transposed convolution module with floating point tensors as inputs and outputs.
+
+    For details on input arguments, parameters, and implementation see
+    :class:`~torch.nn.ConvTranspose1d`.
+
+    For special notes, please see :class:`~torch.ao.nn.quantized.dynamic.Conv1d`.
+
+    Attributes:
+        weight (Tensor):     packed tensor derived from the learnable weight
+                             parameter.
+        scale (Tensor):      scalar for the output scale
+        zero_point (Tensor): scalar for the output zero point
+    See :class:`~torch.nn.ConvTranspose1d` for other attributes.
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> # With square kernels and equal stride
+        >>> m = nndq.ConvTranspose1d(16, 33, 3, stride=2)
+        >>> # non-square kernels and unequal stride and with padding
+        >>> m = nndq.ConvTranspose1d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2))
+        >>> output = m(input)
+        >>> # exact output size can also be specified as an argument
+        >>> downsample = nndq.Conv1d(16, 16, 3, stride=2, padding=1)
+        >>> upsample = nndq.ConvTranspose1d(16, 16, 3, stride=2, padding=1)
+        >>> h = downsample(input)
+        >>> h.size()
+        torch.Size([1, 16, 6])
+        >>> output = upsample(h, output_size=input.size())
+        >>> output.size()
+        torch.Size([1, 16, 12])
+    """
+
+    _FLOAT_MODULE = nn.ConvTranspose1d
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, output_padding=0, groups=1, bias=True,
+                 dilation=1, padding_mode='zeros', device=None, dtype=None):
+        warnings.warn(
+            "The current implementation of the {} module has poor numerical accuracy and its use is not recommended".format(
+                self._get_name()
+            )
+        )
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        super().__init__(
+            in_channels, out_channels, kernel_size, stride, padding, output_padding,
+            groups, bias, dilation, padding_mode, **factory_kwargs)
+
+    def _get_name(self):
+        return 'DynamicQuantizedConvTranspose1d'
+
+    def forward(self, input: Tensor, reduce_range: bool = True) -> Tensor:
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 3:
+            raise ValueError("Input shape must be `(N, C, L)`!")
+        return torch.ops.quantized.conv_transpose1d_dynamic(
+            input, self._packed_params, reduce_range)
+
+
+class ConvTranspose2d(nnq.ConvTranspose2d):
+    r"""A dynamically quantized transposed convolution module with floating point tensors as inputs and outputs.
+
+    For details on input arguments, parameters, and implementation see
+    :class:`~torch.nn.ConvTranspose2d`.
+
+    For special notes, please see :class:`~torch.ao.nn.quantized.dynamic.Conv2d`.
+
+    Attributes:
+        weight (Tensor):     packed tensor derived from the learnable weight
+                             parameter.
+        scale (Tensor):      scalar for the output scale
+        zero_point (Tensor): scalar for the output zero point
+    See :class:`~torch.nn.ConvTranspose2d` for other attributes.
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> # With square kernels and equal stride
+        >>> m = nndq.ConvTranspose2d(16, 33, 3, stride=2)
+        >>> # non-square kernels and unequal stride and with padding
+        >>> m = nndq.ConvTranspose2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2))
+        >>> output = m(input)
+        >>> # exact output size can also be specified as an argument
+        >>> downsample = nndq.Conv2d(16, 16, 3, stride=2, padding=1)
+        >>> upsample = nndq.ConvTranspose2d(16, 16, 3, stride=2, padding=1)
+        >>> h = downsample(input)
+        >>> h.size()
+        torch.Size([1, 16, 6, 6])
+        >>> output = upsample(h, output_size=input.size())
+        >>> output.size()
+        torch.Size([1, 16, 12, 12])
+    """
+
+    _FLOAT_MODULE = nn.ConvTranspose2d
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, output_padding=0, groups=1, bias=True,
+                 dilation=1, padding_mode='zeros', device=None, dtype=None):
+        warnings.warn(
+            "The current implementation of the {} module has poor numerical accuracy and its use is not recommended".format(
+                self._get_name()
+            )
+        )
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        super().__init__(
+            in_channels, out_channels, kernel_size, stride, padding, output_padding,
+            groups, bias, dilation, padding_mode, **factory_kwargs)
+
+    def _get_name(self):
+        return 'DynamicQuantizedConvTranspose2d'
+
+    def forward(self, input: Tensor, reduce_range: bool = True) -> Tensor:
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 4:
+            raise ValueError("Input shape must be `(N, C, H, W)`!")
+        return ops.quantized.conv_transpose2d_dynamic(
+            input, self._packed_params, reduce_range)
+
+
+class ConvTranspose3d(nnq.ConvTranspose3d):
+    r"""A dynamically quantized transposed convolution module with floating point tensors as inputs and outputs.
+
+    For details on input arguments, parameters, and implementation see
+    :class:`~torch.nn.ConvTranspose3d`.
+
+    For special notes, please see :class:`~torch.ao.nn.quantized.dynamic.Conv3d`.
+
+    Attributes:
+        weight (Tensor):     packed tensor derived from the learnable weight
+                             parameter.
+        scale (Tensor):      scalar for the output scale
+        zero_point (Tensor): scalar for the output zero point
+    See :class:`~torch.nn.ConvTranspose3d` for other attributes.
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> # With cubic kernels and equal stride
+        >>> m = nndq.ConvTranspose3d(16, 33, 3, stride=2)
+        >>> # non-cubic kernels and unequal stride and with padding
+        >>> m = nndq.ConvTranspose3d(16, 33, (3, 3, 5), stride=(2, 1, 1), padding=(4, 2, 2))
+        >>> output = m(input)
+        >>> # exact output size can also be specified as an argument
+        >>> downsample = nndq.Conv3d(16, 16, 3, stride=2, padding=1)
+        >>> upsample = nndq.ConvTranspose3d(16, 16, 3, stride=2, padding=1)
+        >>> h = downsample(input)
+        >>> h.size()
+        torch.Size([1, 16, 6, 6, 6])
+        >>> output = upsample(h, output_size=input.size())
+        >>> output.size()
+        torch.Size([1, 16, 12, 12, 12])
+    """
+
+    _FLOAT_MODULE = nn.ConvTranspose3d
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, output_padding=0, groups=1, bias=True,
+                 dilation=1, padding_mode='zeros', device=None, dtype=None):
+        warnings.warn(
+            "The current implementation of the {} module has poor numerical accuracy and its use is not recommended".format(
+                self._get_name()
+            )
+        )
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        super().__init__(
+            in_channels, out_channels, kernel_size, stride, padding, output_padding,
+            groups, bias, dilation, padding_mode, **factory_kwargs)
+
+    def _get_name(self):
+        return 'DynamicQuantizedConvTranspose3d'
+
+    def forward(self, input: Tensor, reduce_range: bool = True) -> Tensor:
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 5:
+            raise ValueError("Input shape must be `(N, C, T, H, W)`!")
+        return ops.quantized.conv_transpose3d_dynamic(
+            input, self._packed_params, reduce_range)
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/dynamic/modules/linear.py b/MLPY/Lib/site-packages/torch/ao/nn/quantized/dynamic/modules/linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..407ecd9abbcf3c8d10057f5c6843de95ffabea4c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/quantized/dynamic/modules/linear.py
@@ -0,0 +1,132 @@
+import torch
+import torch.ao.nn.quantized as nnq
+from torch.ao.nn.quantized.modules.utils import _quantize_weight
+import torch.ao.nn.intrinsic as nni
+
+__all__ = [
+    "Linear",
+]
+
+
+class Linear(nnq.Linear):
+    r"""
+    A dynamic quantized linear module with floating point tensors as inputs and outputs.
+    We adopt the same interface as `torch.nn.Linear`; please see
+    https://pytorch.org/docs/stable/nn.html#torch.nn.Linear for documentation.
+
+    Similar to :class:`torch.nn.Linear`, attributes will be randomly
+    initialized at module creation time and will be overwritten later.
+
+    Attributes:
+        weight (Tensor): the non-learnable quantized weights of the module which are of
+                         shape :math:`(\text{out\_features}, \text{in\_features})`.
+        bias (Tensor): the non-learnable floating point bias of the module of shape
+                       :math:`(\text{out\_features})`. If :attr:`bias` is ``True``,
+                       the values are initialized to zero.
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> m = nn.quantized.dynamic.Linear(20, 30)
+        >>> input = torch.randn(128, 20)
+        >>> output = m(input)
+        >>> print(output.size())
+        torch.Size([128, 30])
+    """
+    # version used in this class is different from the parent class nnq.Linear
+    _version = 4
+
+    def __init__(self, in_features, out_features, bias_=True, dtype=torch.qint8):
+        super().__init__(in_features, out_features, bias_, dtype=dtype)
+        # We don't muck around with buffers or attributes or anything here
+        # to keep the module simple. *Everything* is simply a Python attribute.
+        # Serialization logic is explicitly handled in the serialization and
+        # deserialization methods below.
+        self.version = 4
+
+    def forward(self, x):
+        # Note that we can handle self.bias == None case.
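+        # The dynamic op computes the activation scale/zero_point from x at run
+        # time; for qint8 packed weights, serialization versions >= 4 also enable
+        # reduce_range to avoid overflow on some x86 (fbgemm) kernels.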
+        if self._packed_params.dtype == torch.qint8:
+            if self.version is None or self.version < 4:
+                Y = torch.ops.quantized.linear_dynamic(
+                    x, self._packed_params._packed_params)
+            else:
+                Y = torch.ops.quantized.linear_dynamic(
+                    x, self._packed_params._packed_params, reduce_range=True)
+        elif self._packed_params.dtype == torch.float16:
+            Y = torch.ops.quantized.linear_dynamic_fp16(
+                x, self._packed_params._packed_params)
+        else:
+            raise RuntimeError('Unsupported dtype on dynamic quantized linear!')
+        return Y.to(x.dtype)
+
+    def _get_name(self):
+        return 'DynamicQuantizedLinear'
+
+    def extra_repr(self):
+        extra_repr_str = 'in_features={}, out_features={}, dtype={}'.format(
+            self.in_features, self.out_features, self._packed_params.dtype
+        )
+        if self._packed_params.dtype == torch.qint8:
+            extra_repr_str += f', qscheme={self.weight().qscheme()}'
+        return extra_repr_str
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        version = local_metadata.get('version', None)
+        self.version = version
+        super()._load_from_state_dict(state_dict, prefix, local_metadata, False,
+                                      missing_keys, unexpected_keys, error_msgs)
+
+    @classmethod
+    def from_float(cls, mod):
+        r"""Create a dynamic quantized module from a float module or qparams_dict
+
+        Args:
+            mod (Module): a float module, either produced by torch.ao.quantization
+                          utilities or provided by the user
+        """
+        float_modules = [torch.nn.Linear, torch.nn.modules.linear.NonDynamicallyQuantizableLinear,
+                         torch.ao.nn.intrinsic.modules.fused.LinearReLU, torch.ao.nn.qat.dynamic.Linear]
+
+        assert type(mod) in float_modules, \
+            'nn.quantized.dynamic.Linear.from_float only works for one of ' + \
+            str([float_mod.__name__ for float_mod in float_modules])
+        assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined'
+        if type(mod) == nni.LinearReLU:
+            mod = mod[0]
+        if mod.qconfig is not None and mod.qconfig.weight is not None:
+            weight_observer = mod.qconfig.weight()
+        else:
+            # We have the circular import issues if we import the qconfig in the beginning of this file:
+            # https://github.com/pytorch/pytorch/pull/24231. The current workaround is to postpone the
+            # import until we need it.
+            from torch.ao.quantization.qconfig import default_dynamic_qconfig
+            weight_observer = default_dynamic_qconfig.weight()
+        dtype = weight_observer.dtype
+        assert dtype in [torch.qint8, torch.float16], "The only supported dtypes for " \
+            f"dynamic quantized linear are qint8 and float16 got: {dtype}"
+        weight_observer(mod.weight)
+        if dtype == torch.qint8:
+            qweight = _quantize_weight(mod.weight.float(), weight_observer)
+        elif dtype == torch.float16:
+            qweight = mod.weight.float()
+        else:
+            raise RuntimeError('Unsupported dtype specified for dynamic quantized Linear!')
+        qlinear = cls(mod.in_features, mod.out_features, dtype=dtype)
+        qlinear.set_weight_bias(qweight, mod.bias)
+        return qlinear
+
+    @classmethod
+    def from_reference(cls, ref_qlinear):
+        """ Create a (fbgemm/qnnpack) dynamic quantized module from a reference quantized
+        module
+        Args:
+            ref_qlinear (Module): a reference quantized  module, either produced by
+            torch.ao.quantization functions or provided by the user
+        """
+        qlinear = cls(ref_qlinear.in_features, ref_qlinear.out_features, dtype=ref_qlinear.weight_dtype)
+        qweight = ref_qlinear.get_quantized_weight()
+        bias = ref_qlinear.bias
+        qlinear.set_weight_bias(qweight, bias)
+        return qlinear
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/dynamic/modules/rnn.py b/MLPY/Lib/site-packages/torch/ao/nn/quantized/dynamic/modules/rnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1b305ce61818fefdfd366736d6f12ceb61765ac
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/quantized/dynamic/modules/rnn.py
@@ -0,0 +1,1096 @@
+import numbers
+import warnings
+
+import torch
+import torch.nn as nn
+from torch import Tensor  # noqa: F401
+from torch._jit_internal import Tuple, Optional, List, Union, Dict  # noqa: F401
+from torch.nn.utils.rnn import PackedSequence
+from torch.ao.nn.quantized.modules.utils import _quantize_weight
+
+__all__ = ['pack_weight_bias', 'PackedParameter', 'RNNBase', 'LSTM', 'GRU', 'RNNCellBase', 'RNNCell', 'LSTMCell',
+           'GRUCell', "apply_permutation"]
+
+
+def _apply_permutation(tensor: Tensor, permutation: Tensor, dim: int = 1) -> Tensor:
+    return tensor.index_select(dim, permutation)
+
+
+def apply_permutation(tensor: Tensor, permutation: Tensor, dim: int = 1) -> Tensor:
+    warnings.warn("apply_permutation is deprecated, please use tensor.index_select(dim, permutation) instead")
+    return _apply_permutation(tensor, permutation, dim)
+
+
+def pack_weight_bias(qweight, bias, dtype):
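+    # Prepack a (qint8-quantized or float16) weight together with its float bias
+    # into the opaque, backend-specific format consumed by the dynamic
+    # linear/RNN kernels.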
+
+    if dtype == torch.qint8:
+        # for each layer, for each direction we need to quantize and pack
+        # weights and pack parameters in this order:
+        #
+        #   w_ih, w_hh
+        packed_weight = \
+            torch.ops.quantized.linear_prepack(qweight, bias)
+
+        return packed_weight
+    else:
+        # for each layer, for each direction we need to quantize and pack
+        # weights and pack parameters in this order:
+        #
+        #   packed_ih, packed_hh, b_ih, b_hh
+        packed_weight = torch.ops.quantized.linear_prepack_fp16(
+            qweight, bias)
+
+        return packed_weight
+
+
+class PackedParameter(torch.nn.Module):
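+    # Thin wrapper around an opaque packed parameter (e.g. the torchbind
+    # cell-params object returned by the quantized prepack ops) so that it can
+    # live inside an nn.ModuleList and round-trip through state_dict via the
+    # hooks below.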
+    def __init__(self, param):
+        super().__init__()
+        self.param = param
+
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        super()._save_to_state_dict(destination, prefix, keep_vars)
+        destination[prefix + 'param'] = self.param
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        self.param = state_dict[prefix + 'param']
+        super()._load_from_state_dict(state_dict, prefix, local_metadata, False,
+                                      missing_keys, unexpected_keys, error_msgs)
+
+
+class RNNBase(torch.nn.Module):
+
+    _FLOAT_MODULE = nn.RNNBase
+
+    _version = 2
+
+    def __init__(self, mode, input_size, hidden_size,
+                 num_layers=1, bias=True, batch_first=False,
+                 dropout=0., bidirectional=False, dtype=torch.qint8):
+        super().__init__()
+
+        self.mode = mode
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+        self.bias = bias
+        self.batch_first = batch_first
+        self.dropout = float(dropout)
+        self.bidirectional = bidirectional
+        self.dtype = dtype
+        self.version = 2
+        self.training = False
+        num_directions = 2 if bidirectional else 1
+
+        # "type: ignore" is required since ints and Numbers are not fully comparable
+        # https://github.com/python/mypy/issues/8566
+        if not isinstance(dropout, numbers.Number) \
+                or not 0 <= dropout <= 1 or isinstance(dropout, bool):  # type: ignore[operator]
+            raise ValueError("dropout should be a number in range [0, 1] "
+                             "representing the probability of an element being "
+                             "zeroed")
+        if dropout > 0 and num_layers == 1:  # type: ignore[operator]
+            warnings.warn("dropout option adds dropout after all but last "
+                          "recurrent layer, so non-zero dropout expects "
+                          f"num_layers greater than 1, but got dropout={dropout} and "
+                          f"num_layers={num_layers}")
+
+        if mode == 'LSTM':
+            gate_size = 4 * hidden_size
+        elif mode == 'GRU':
+            gate_size = 3 * hidden_size
+        else:
+            raise ValueError("Unrecognized RNN mode: " + mode)
+
+        _all_weight_values = []
+        for layer in range(num_layers):
+            for direction in range(num_directions):
+                layer_input_size = input_size if layer == 0 else hidden_size * num_directions
+
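+                # The weights and biases created here are random placeholders
+                # with dummy quantization parameters; the real observed and
+                # quantized values are installed later by from_float(),
+                # set_weight_bias(), or a state_dict load.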
+                w_ih = torch.randn(gate_size, layer_input_size).to(torch.float)
+                w_hh = torch.randn(gate_size, hidden_size).to(torch.float)
+                b_ih = torch.randn(gate_size).to(torch.float)
+                b_hh = torch.randn(gate_size).to(torch.float)
+                if dtype == torch.qint8:
+                    w_ih = torch.quantize_per_tensor(w_ih, scale=0.1, zero_point=0, dtype=torch.qint8)
+                    w_hh = torch.quantize_per_tensor(w_hh, scale=0.1, zero_point=0, dtype=torch.qint8)
+                    packed_ih = \
+                        torch.ops.quantized.linear_prepack(w_ih, b_ih)
+                    packed_hh = \
+                        torch.ops.quantized.linear_prepack(w_hh, b_hh)
+                    if self.version is None or self.version < 2:
+                        cell_params = torch.ops.quantized.make_quantized_cell_params_dynamic(
+                            packed_ih, packed_hh, b_ih, b_hh)
+                    else:
+                        cell_params = torch.ops.quantized.make_quantized_cell_params_dynamic(
+                            packed_ih, packed_hh, b_ih, b_hh, True)
+                else:
+                    packed_ih = torch.ops.quantized.linear_prepack_fp16(w_ih, b_ih)
+                    packed_hh = torch.ops.quantized.linear_prepack_fp16(w_hh, b_hh)
+                    cell_params = torch.ops.quantized.make_quantized_cell_params_fp16(
+                        packed_ih, packed_hh)
+
+                _all_weight_values.append(PackedParameter(cell_params))
+        self._all_weight_values = torch.nn.ModuleList(_all_weight_values)
+
+    def _get_name(self):
+        return 'DynamicQuantizedRNN'
+
+    def extra_repr(self):
+        s = '{input_size}, {hidden_size}'
+        if self.num_layers != 1:
+            s += ', num_layers={num_layers}'
+        if self.bias is not True:
+            s += ', bias={bias}'
+        if self.batch_first is not False:
+            s += ', batch_first={batch_first}'
+        if self.dropout != 0:
+            s += ', dropout={dropout}'
+        if self.bidirectional is not False:
+            s += ', bidirectional={bidirectional}'
+        return s.format(**self.__dict__)
+
+    def __repr__(self):
+        # We don't want to show `ModuleList` children, hence custom
+        # `__repr__`. This is the same as nn.Module.__repr__, except the check
+        # for the `PackedParameter` and `nn.ModuleList`.
+        # You should still override `extra_repr` to add more info.
+        extra_lines = []
+        extra_repr = self.extra_repr()
+        # empty string will be split into list ['']
+        if extra_repr:
+            extra_lines = extra_repr.split('\n')
+        child_lines = []
+        for key, module in self._modules.items():
+            if isinstance(module, (PackedParameter, nn.ModuleList)):
+                continue
+            mod_str = repr(module)
+            mod_str = nn.modules.module._addindent(mod_str, 2)
+            child_lines.append('(' + key + '): ' + mod_str)
+        lines = extra_lines + child_lines
+
+        main_str = self._get_name() + '('
+        if lines:
+            # simple one-liner info, which most builtin Modules will use
+            if len(extra_lines) == 1 and not child_lines:
+                main_str += extra_lines[0]
+            else:
+                main_str += '\n  ' + '\n  '.join(lines) + '\n'
+
+        main_str += ')'
+        return main_str
+
+    def check_input(self, input: Tensor, batch_sizes: Optional[Tensor]) -> None:
+        expected_input_dim = 2 if batch_sizes is not None else 3
+        if input.dim() != expected_input_dim:
+            raise RuntimeError(
+                f'input must have {expected_input_dim} dimensions, got {input.dim()}')
+        if self.input_size != input.size(-1):
+            raise RuntimeError(
+                f'input.size(-1) must be equal to input_size. Expected {self.input_size}, got {input.size(-1)}')
+
+    def get_expected_hidden_size(self, input: Tensor, batch_sizes: Optional[Tensor]) -> Tuple[int, int, int]:
+        if batch_sizes is not None:
+            mini_batch = int(batch_sizes[0])
+        else:
+            mini_batch = input.size(0) if self.batch_first else input.size(1)
+        num_directions = 2 if self.bidirectional else 1
+        expected_hidden_size = (self.num_layers * num_directions,
+                                mini_batch, self.hidden_size)
+        return expected_hidden_size
+
+    def check_hidden_size(
+        self, hx: Tensor, expected_hidden_size: Tuple[int, int, int],
+        msg: str = 'Expected hidden size {}, got {}'
+    ) -> None:
+        if hx.size() != expected_hidden_size:
+            raise RuntimeError(msg.format(
+                expected_hidden_size, list(hx.size())))
+
+    def check_forward_args(self, input: Tensor, hidden: Tensor, batch_sizes: Optional[Tensor]) -> None:
+        self.check_input(input, batch_sizes)
+        expected_hidden_size = self.get_expected_hidden_size(input, batch_sizes)
+        self.check_hidden_size(hidden, expected_hidden_size,
+                               msg='Expected hidden size {}, got {}')
+
+    def permute_hidden(self, hx: Tensor, permutation: Optional[Tensor]) -> Tensor:
+        if permutation is None:
+            return hx
+        return _apply_permutation(hx, permutation)
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        version = local_metadata.get('version', None)
+        self.version = version
+        super()._load_from_state_dict(state_dict, prefix, local_metadata, False,
+                                      missing_keys, unexpected_keys, error_msgs)
+
+    def set_weight_bias(self, weight_bias_dict):
+
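+        # Rebuild the packed cell parameters from a flat dict keyed by
+        # "weight_ih_l{layer}{suffix}" / "bias_ih_l{layer}{suffix}" (and their
+        # "hh" counterparts), mirroring the packing logic in __init__. This is
+        # the entry point used by from_reference().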
+        def weight_bias_name(ihhh, layer, suffix):
+            weight_name = f"weight_{ihhh}_l{layer}{suffix}"
+            bias_name = f"bias_{ihhh}_l{layer}{suffix}"
+            return weight_name, bias_name
+
+        num_directions = 2 if self.bidirectional else 1
+        # TODO: dedup with __init__ of RNNBase
+        _all_weight_values = []
+        for layer in range(self.num_layers):
+            for direction in range(num_directions):
+                suffix = "_reverse" if direction == 1 else ""
+                w_ih_name, b_ih_name = weight_bias_name("ih", layer, suffix)
+                w_hh_name, b_hh_name = weight_bias_name("hh", layer, suffix)
+                w_ih = weight_bias_dict[w_ih_name]
+                b_ih = weight_bias_dict[b_ih_name]
+                w_hh = weight_bias_dict[w_hh_name]
+                b_hh = weight_bias_dict[b_hh_name]
+                if w_ih.dtype == torch.qint8:
+                    packed_ih = torch.ops.quantized.linear_prepack(w_ih, b_ih)
+                    packed_hh = torch.ops.quantized.linear_prepack(w_hh, b_hh)
+                    if self.version is None or self.version < 2:
+                        cell_params = torch.ops.quantized.make_quantized_cell_params_dynamic(
+                            packed_ih, packed_hh, b_ih, b_hh)
+                    else:
+                        cell_params = torch.ops.quantized.make_quantized_cell_params_dynamic(
+                            packed_ih, packed_hh, b_ih, b_hh, True)
+                else:
+                    packed_ih = torch.ops.quantized.linear_prepack_fp16(w_ih, b_ih)
+                    packed_hh = torch.ops.quantized.linear_prepack_fp16(w_hh, b_hh)
+                    cell_params = torch.ops.quantized.make_quantized_cell_params_fp16(
+                        packed_ih, packed_hh)
+
+                _all_weight_values.append(PackedParameter(cell_params))
+        self._all_weight_values = torch.nn.ModuleList(_all_weight_values)
+
+    @classmethod
+    def from_float(cls, mod):
+        assert type(mod) in {torch.nn.LSTM,
+                             torch.nn.GRU}, 'nn.quantized.dynamic.RNNBase.from_float only works for nn.LSTM and nn.GRU'
+        assert hasattr(
+            mod,
+            'qconfig'
+        ), 'Input float module must have qconfig defined'
+
+        if mod.qconfig is not None and mod.qconfig.weight is not None:
+            weight_observer_method = mod.qconfig.weight
+        else:
+            # We have the circular import issues if we import the qconfig in the beginning of this file:
+            # https://github.com/pytorch/pytorch/pull/24231. The current workaround is to postpone the
+            # import until we need it.
+            from torch.ao.quantization.qconfig import default_dynamic_qconfig
+            weight_observer_method = default_dynamic_qconfig.weight
+
+        dtype = weight_observer_method().dtype
+        supported_scalar_types = [torch.qint8, torch.float16]
+        if dtype not in supported_scalar_types:
+            raise RuntimeError(f'Unsupported dtype for dynamic RNN quantization: {dtype}')
+        # RNNBase can be either LSTM or GRU
+        qRNNBase: Union[LSTM, GRU]
+        if mod.mode == 'LSTM':
+            qRNNBase = LSTM(mod.input_size, mod.hidden_size, mod.num_layers,
+                            mod.bias, mod.batch_first, mod.dropout, mod.bidirectional, dtype)
+        elif mod.mode == 'GRU':
+            qRNNBase = GRU(mod.input_size, mod.hidden_size, mod.num_layers,
+                           mod.bias, mod.batch_first, mod.dropout, mod.bidirectional, dtype)
+        else:
+            raise NotImplementedError('Only LSTM/GRU is supported for QuantizedRNN for now')
+
+        num_directions = 2 if mod.bidirectional else 1
+
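+        # The packing code below unconditionally reads bias_ih/bias_hh, so float
+        # modules created with bias=False are not supported here.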
+        assert mod.bias
+
+        _all_weight_values = []
+        for layer in range(qRNNBase.num_layers):
+            for direction in range(num_directions):
+                suffix = '_reverse' if direction == 1 else ''
+
+                def retrieve_weight_bias(ihhh):
+                    weight_name = f'weight_{ihhh}_l{layer}{suffix}'
+                    bias_name = f'bias_{ihhh}_l{layer}{suffix}'
+                    weight = getattr(mod, weight_name)
+                    bias = getattr(mod, bias_name)
+                    return weight, bias
+
+                weight_ih, bias_ih = retrieve_weight_bias('ih')
+                weight_hh, bias_hh = retrieve_weight_bias('hh')
+
+                if dtype == torch.qint8:
+                    def quantize_and_pack(w, b):
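+                        # Observe the float weight to collect min/max statistics,
+                        # quantize it with the resulting qparams, then prepack it
+                        # into the backend-specific format.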
+                        weight_observer = weight_observer_method()
+                        weight_observer(w)
+                        qweight = _quantize_weight(w.float(), weight_observer)
+                        packed_weight = \
+                            torch.ops.quantized.linear_prepack(qweight, b)
+                        return packed_weight
+                    packed_ih = quantize_and_pack(weight_ih, bias_ih)
+                    packed_hh = quantize_and_pack(weight_hh, bias_hh)
+                    if qRNNBase.version is None or qRNNBase.version < 2:
+                        cell_params = torch.ops.quantized.make_quantized_cell_params_dynamic(
+                            packed_ih, packed_hh, bias_ih, bias_hh)
+                    else:
+                        cell_params = torch.ops.quantized.make_quantized_cell_params_dynamic(
+                            packed_ih, packed_hh, bias_ih, bias_hh, True)
+
+                elif dtype == torch.float16:
+                    packed_ih = torch.ops.quantized.linear_prepack_fp16(
+                        weight_ih.float(), bias_ih)
+                    packed_hh = torch.ops.quantized.linear_prepack_fp16(
+                        weight_hh.float(), bias_hh)
+
+                    cell_params = torch.ops.quantized.make_quantized_cell_params_fp16(
+                        packed_ih, packed_hh)
+                else:
+                    raise RuntimeError('Unsupported dtype specified for dynamic quantized LSTM!')
+
+                _all_weight_values.append(PackedParameter(cell_params))
+        qRNNBase._all_weight_values = torch.nn.ModuleList(_all_weight_values)
+
+        return qRNNBase
+
+    def _weight_bias(self):
+        # Returns a dict of weights and biases
+        weight_bias_dict: Dict[str, Dict] = {'weight' : {}, 'bias' : {}}
+        count = 0
+        num_directions = 2 if self.bidirectional else 1
+        for layer in range(self.num_layers):
+            for direction in range(num_directions):
+                suffix = '_reverse' if direction == 1 else ''
+                key_name1 = f'weight_ih_l{layer}{suffix}'
+                key_name2 = f'weight_hh_l{layer}{suffix}'
+                # packed weights are part of torchbind class, CellParamsSerializationType
+                # Within the packed weight class, the weight and bias are accessible as Tensors
+                packed_weight_bias = self._all_weight_values[count].param.__getstate__()[0][4]
+                weight_bias_dict['weight'][key_name1] = packed_weight_bias[0].__getstate__()[0][0]
+                weight_bias_dict['weight'][key_name2] = packed_weight_bias[1].__getstate__()[0][0]
+                key_name1 = f'bias_ih_l{layer}{suffix}'
+                key_name2 = f'bias_hh_l{layer}{suffix}'
+                weight_bias_dict['bias'][key_name1] = packed_weight_bias[0].__getstate__()[0][1]
+                weight_bias_dict['bias'][key_name2] = packed_weight_bias[1].__getstate__()[0][1]
+                count = count + 1
+        return weight_bias_dict
+
+    def get_weight(self):
+        return self._weight_bias()['weight']
+
+    def get_bias(self):
+        return self._weight_bias()['bias']
+
+
+class LSTM(RNNBase):
+    r"""
+    A dynamic quantized LSTM module with floating point tensors as inputs and outputs.
+    We adopt the same interface as `torch.nn.LSTM`; please see
+    https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM for documentation.
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> rnn = nn.LSTM(10, 20, 2)
+        >>> input = torch.randn(5, 3, 10)
+        >>> h0 = torch.randn(2, 3, 20)
+        >>> c0 = torch.randn(2, 3, 20)
+        >>> output, (hn, cn) = rnn(input, (h0, c0))
+    """
+    _FLOAT_MODULE = nn.LSTM
+
+    __overloads__ = {'forward': ['forward_packed', 'forward_tensor']}
+
+    def __init__(self, *args, **kwargs):
+        super().__init__('LSTM', *args, **kwargs)
+
+    def _get_name(self):
+        return 'DynamicQuantizedLSTM'
+
+    def forward_impl(
+        self, input: Tensor, hx: Optional[Tuple[Tensor, Tensor]],
+        batch_sizes: Optional[Tensor], max_batch_size: int,
+        sorted_indices: Optional[Tensor]
+    ) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
+        if hx is None:
+            num_directions = 2 if self.bidirectional else 1
+            zeros = torch.zeros(self.num_layers * num_directions,
+                                max_batch_size, self.hidden_size,
+                                dtype=input.dtype, device=input.device)
+            hx = (zeros, zeros)
+        else:
+            # Each batch of the hidden state should match the input sequence that
+            # the user believes they are passing in.
+            hx = self.permute_hidden(hx, sorted_indices)
+
+        self.check_forward_args(input, hx, batch_sizes)
+
+        _all_params = ([m.param for m in self._all_weight_values])
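+        # use_dynamic=True selects the dynamically quantized LSTM kernels, so the
+        # activation quantization parameters are derived from the data on each
+        # call rather than being fixed ahead of time.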
+        if batch_sizes is None:
+            result = torch.quantized_lstm(input, hx, _all_params, self.bias, self.num_layers,
+                                          float(self.dropout), self.training, self.bidirectional,
+                                          self.batch_first, dtype=self.dtype, use_dynamic=True)
+        else:
+            result = torch.quantized_lstm(input, batch_sizes, hx, _all_params, self.bias,
+                                          self.num_layers, float(self.dropout), self.training,
+                                          self.bidirectional, dtype=self.dtype, use_dynamic=True)
+        output = result[0]
+        hidden = result[1:]
+
+        return output, hidden
+
+    @torch.jit.export
+    def forward_tensor(
+        self, input: Tensor, hx: Optional[Tuple[Tensor, Tensor]] = None
+    ) -> Tuple[Tensor, Tuple[Tensor, Tensor]]:
+        batch_sizes = None
+        max_batch_size = input.size(0) if self.batch_first else input.size(1)
+        sorted_indices = None
+        unsorted_indices = None
+
+        output, hidden = self.forward_impl(
+            input, hx, batch_sizes, max_batch_size, sorted_indices)
+
+        return output, self.permute_hidden(hidden, unsorted_indices)
+
+    @torch.jit.export
+    def forward_packed(
+        self, input: PackedSequence, hx: Optional[Tuple[Tensor, Tensor]] = None
+    ) -> Tuple[PackedSequence, Tuple[Tensor, Tensor]]:
+        input_, batch_sizes, sorted_indices, unsorted_indices = input
+        max_batch_size = int(batch_sizes[0])
+
+        output_, hidden = self.forward_impl(
+            input_, hx, batch_sizes, max_batch_size, sorted_indices
+        )
+
+        output = PackedSequence(output_, batch_sizes,
+                                sorted_indices, unsorted_indices)
+        return output, self.permute_hidden(hidden, unsorted_indices)
+
+    # "type: ignore" is required due to issue #43072
+    def permute_hidden(  # type: ignore[override]
+        self, hx: Tuple[Tensor, Tensor], permutation: Optional[Tensor]
+    ) -> Tuple[Tensor, Tensor]:
+        if permutation is None:
+            return hx
+        return _apply_permutation(hx[0], permutation), _apply_permutation(hx[1], permutation)
+
+    # "type: ignore" is required due to issue #43072
+    def check_forward_args(  # type: ignore[override]
+        self, input: Tensor, hidden: Tuple[Tensor, Tensor], batch_sizes: Optional[Tensor]
+    ) -> None:
+        self.check_input(input, batch_sizes)
+        expected_hidden_size = self.get_expected_hidden_size(input, batch_sizes)
+
+        self.check_hidden_size(hidden[0], expected_hidden_size,
+                               'Expected hidden[0] size {}, got {}')
+        self.check_hidden_size(hidden[1], expected_hidden_size,
+                               'Expected hidden[1] size {}, got {}')
+
+    @torch.jit.ignore
+    def forward(self, input, hx=None):
+        if isinstance(input, PackedSequence):
+            return self.forward_packed(input, hx)
+        else:
+            return self.forward_tensor(input, hx)
+
+    @classmethod
+    def from_float(cls, mod):
+        return super().from_float(mod)
+
+    @classmethod
+    def from_reference(cls, ref_mod):
+        assert hasattr(ref_mod, "weight_ih_l0_dtype"), "We are assuming weight_ih_l0 "
+        "exists in LSTM, may need to relax the assumption to support the use case"
+        qmod = cls(
+            ref_mod.input_size,
+            ref_mod.hidden_size,
+            ref_mod.num_layers,
+            ref_mod.bias,
+            ref_mod.batch_first,
+            ref_mod.dropout,
+            ref_mod.bidirectional,
+            # assuming there is layer 0, which should be OK
+            ref_mod.weight_ih_l0_dtype,
+        )
+        qmod.set_weight_bias(ref_mod.get_quantized_weight_bias_dict())
+        return qmod
+
+
+class GRU(RNNBase):
+    r"""Applies a multi-layer gated recurrent unit (GRU) RNN to an input sequence.
+
+
+    For each element in the input sequence, each layer computes the following
+    function:
+
+    .. math::
+        \begin{array}{ll}
+            r_t = \sigma(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\
+            z_t = \sigma(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\
+            n_t = \tanh(W_{in} x_t + b_{in} + r_t \odot (W_{hn} h_{(t-1)}+ b_{hn})) \\
+            h_t = (1 - z_t) \odot n_t + z_t \odot h_{(t-1)}
+        \end{array}
+
+    where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is the input
+    at time `t`, :math:`h_{(t-1)}` is the hidden state of the layer
+    at time `t-1` or the initial hidden state at time `0`, and :math:`r_t`,
+    :math:`z_t`, :math:`n_t` are the reset, update, and new gates, respectively.
+    :math:`\sigma` is the sigmoid function, and :math:`\odot` is the Hadamard product.
+
+    In a multilayer GRU, the input :math:`x^{(l)}_t` of the :math:`l` -th layer
+    (:math:`l >= 2`) is the hidden state :math:`h^{(l-1)}_t` of the previous layer multiplied by
+    dropout :math:`\delta^{(l-1)}_t` where each :math:`\delta^{(l-1)}_t` is a Bernoulli random
+    variable which is :math:`0` with probability :attr:`dropout`.
+
+    Args:
+        input_size: The number of expected features in the input `x`
+        hidden_size: The number of features in the hidden state `h`
+        num_layers: Number of recurrent layers. E.g., setting ``num_layers=2``
+            would mean stacking two GRUs together to form a `stacked GRU`,
+            with the second GRU taking in outputs of the first GRU and
+            computing the final results. Default: 1
+        bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`.
+            Default: ``True``
+        batch_first: If ``True``, then the input and output tensors are provided
+            as (batch, seq, feature). Default: ``False``
+        dropout: If non-zero, introduces a `Dropout` layer on the outputs of each
+            GRU layer except the last layer, with dropout probability equal to
+            :attr:`dropout`. Default: 0
+        bidirectional: If ``True``, becomes a bidirectional GRU. Default: ``False``
+
+    Inputs: input, h_0
+        - **input** of shape `(seq_len, batch, input_size)`: tensor containing the features
+          of the input sequence. The input can also be a packed variable length
+          sequence. See :func:`torch.nn.utils.rnn.pack_padded_sequence`
+          for details.
+        - **h_0** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor
+          containing the initial hidden state for each element in the batch.
+          Defaults to zero if not provided. If the RNN is bidirectional,
+          num_directions should be 2, else it should be 1.
+
+    Outputs: output, h_n
+        - **output** of shape `(seq_len, batch, num_directions * hidden_size)`: tensor
+          containing the output features h_t from the last layer of the GRU,
+          for each `t`. If a :class:`torch.nn.utils.rnn.PackedSequence` has been
+          given as the input, the output will also be a packed sequence.
+          For the unpacked case, the directions can be separated
+          using ``output.view(seq_len, batch, num_directions, hidden_size)``,
+          with forward and backward being direction `0` and `1` respectively.
+
+          Similarly, the directions can be separated in the packed case.
+        - **h_n** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor
+          containing the hidden state for `t = seq_len`
+
+          Like *output*, the layers can be separated using
+          ``h_n.view(num_layers, num_directions, batch, hidden_size)``.
+
+    Shape:
+        - Input1: :math:`(L, N, H_{in})` tensor containing input features where
+          :math:`H_{in}=\text{input\_size}` and `L` represents a sequence length.
+        - Input2: :math:`(S, N, H_{out})` tensor
+          containing the initial hidden state for each element in the batch, where
+          :math:`S=\text{num\_layers} * \text{num\_directions}` and
+          :math:`H_{out}=\text{hidden\_size}`. Defaults to zero if not provided.
+          If the RNN is bidirectional, num_directions should be 2, else it should be 1.
+        - Output1: :math:`(L, N, H_{all})` where :math:`H_{all}=\text{num\_directions} * \text{hidden\_size}`
+        - Output2: :math:`(S, N, H_{out})` tensor containing the next hidden state
+          for each element in the batch
+
+    Attributes:
+        weight_ih_l[k] : the learnable input-hidden weights of the :math:`\text{k}^{th}` layer
+            (W_ir|W_iz|W_in), of shape `(3*hidden_size, input_size)` for `k = 0`.
+            Otherwise, the shape is `(3*hidden_size, num_directions * hidden_size)`
+        weight_hh_l[k] : the learnable hidden-hidden weights of the :math:`\text{k}^{th}` layer
+            (W_hr|W_hz|W_hn), of shape `(3*hidden_size, hidden_size)`
+        bias_ih_l[k] : the learnable input-hidden bias of the :math:`\text{k}^{th}` layer
+            (b_ir|b_iz|b_in), of shape `(3*hidden_size)`
+        bias_hh_l[k] : the learnable hidden-hidden bias of the :math:`\text{k}^{th}` layer
+            (b_hr|b_hz|b_hn), of shape `(3*hidden_size)`
+
+    .. note::
+        All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
+        where :math:`k = \frac{1}{\text{hidden\_size}}`
+
+    .. note::
+        The calculation of new gate :math:`n_t` subtly differs from the original paper and other frameworks.
+        In the original implementation, the Hadamard product :math:`(\odot)` between :math:`r_t` and the
+        previous hidden state :math:`h_{(t-1)}` is done before the multiplication with the weight matrix
+        `W` and addition of bias:
+
+        .. math::
+            \begin{aligned}
+                n_t = \tanh(W_{in} x_t + b_{in} + W_{hn} ( r_t \odot h_{(t-1)} ) + b_{hn})
+            \end{aligned}
+
+        This is in contrast to the PyTorch implementation, which applies the
+        Hadamard product after :math:`W_{hn} h_{(t-1)}`
+
+        .. math::
+            \begin{aligned}
+                n_t = \tanh(W_{in} x_t + b_{in} + r_t \odot (W_{hn} h_{(t-1)}+ b_{hn}))
+            \end{aligned}
+
+        This implementation differs on purpose for efficiency.
+
+    .. include:: ../cudnn_persistent_rnn.rst
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> rnn = nn.GRU(10, 20, 2)
+        >>> input = torch.randn(5, 3, 10)
+        >>> h0 = torch.randn(2, 3, 20)
+        >>> output, hn = rnn(input, h0)
+    """
+    _FLOAT_MODULE = nn.GRU
+
+    __overloads__ = {'forward': ['forward_packed', 'forward_tensor']}
+
+    def __init__(self, *args, **kwargs):
+        super().__init__('GRU', *args, **kwargs)
+
+    def _get_name(self):
+        return 'DynamicQuantizedGRU'
+
+    def check_forward_args(self, input: Tensor, hidden: Tensor, batch_sizes: Optional[Tensor]) -> None:
+        self.check_input(input, batch_sizes)
+        expected_hidden_size = self.get_expected_hidden_size(input, batch_sizes)
+
+        self.check_hidden_size(hidden, expected_hidden_size,
+                               'Expected hidden size {}, got {}')
+
+    def forward_impl(
+        self, input: Tensor, hx: Optional[Tensor],
+        batch_sizes: Optional[Tensor], max_batch_size: int,
+        sorted_indices: Optional[Tensor]
+    ) -> Tuple[Tensor, Tensor]:
+        if hx is None:
+            num_directions = 2 if self.bidirectional else 1
+            zeros = torch.zeros(self.num_layers * num_directions,
+                                max_batch_size, self.hidden_size,
+                                dtype=input.dtype, device=input.device)
+            hx = zeros
+        else:
+            # Each batch of the hidden state should match the input sequence that
+            # the user believes they are passing in.
+            hx = self.permute_hidden(hx, sorted_indices)
+
+        self.check_forward_args(input, hx, batch_sizes)
+
+        _all_params = ([m.param for m in self._all_weight_values])
+        if batch_sizes is None:
+            result = torch.quantized_gru(input,
+                                         hx,
+                                         _all_params,
+                                         self.bias,
+                                         self.num_layers,
+                                         self.dropout,
+                                         self.training,
+                                         self.bidirectional,
+                                         self.batch_first)
+        else:
+            result = torch.quantized_gru(input,
+                                         batch_sizes,
+                                         hx,
+                                         _all_params,
+                                         self.bias,
+                                         self.num_layers,
+                                         self.dropout,
+                                         self.training,
+                                         self.bidirectional)
+        output = result[0]
+        hidden = result[1]
+
+        return output, hidden
+
+
+    @torch.jit.export
+    def forward_tensor(
+        self, input: Tensor, hx: Optional[Tensor] = None
+    ) -> Tuple[Tensor, Tensor]:
+        batch_sizes = None
+        max_batch_size = input.size(0) if self.batch_first else input.size(1)
+        sorted_indices = None
+        unsorted_indices = None
+
+        output, hidden = self.forward_impl(
+            input, hx, batch_sizes, max_batch_size, sorted_indices)
+
+        return output, self.permute_hidden(hidden, unsorted_indices)
+
+    @torch.jit.export
+    def forward_packed(
+        self, input: PackedSequence, hx: Optional[Tensor] = None
+    ) -> Tuple[PackedSequence, Tensor]:
+        input_, batch_sizes, sorted_indices, unsorted_indices = input
+        max_batch_size = int(batch_sizes[0])
+        output_, hidden = self.forward_impl(
+            input_, hx, batch_sizes, max_batch_size, sorted_indices
+        )
+
+        output = PackedSequence(output_, batch_sizes,
+                                sorted_indices, unsorted_indices)
+        return output, self.permute_hidden(hidden, unsorted_indices)
+
+    def permute_hidden(
+        self, hx: Tensor, permutation: Optional[Tensor]
+    ) -> Tensor:
+        if permutation is None:
+            return hx
+        return _apply_permutation(hx, permutation)
+
+    @torch.jit.ignore
+    def forward(self, input, hx=None):
+        if isinstance(input, PackedSequence):
+            return self.forward_packed(input, hx)
+        else:
+            return self.forward_tensor(input, hx)
+
+    @classmethod
+    def from_float(cls, mod):
+        return super().from_float(mod)
+
+    @classmethod
+    def from_reference(cls, ref_mod):
+        assert hasattr(ref_mod, "weight_ih_l0_dtype"), "We are assuming weight_ih_l0 "
+        "exists in LSTM, may need to relax the assumption to support the use case"
+        qmod = cls(
+            ref_mod.input_size,
+            ref_mod.hidden_size,
+            ref_mod.num_layers,
+            ref_mod.bias,
+            ref_mod.batch_first,
+            ref_mod.dropout,
+            ref_mod.bidirectional,
+            # assuming there is layer 0, which should be OK
+            ref_mod.weight_ih_l0_dtype,
+        )
+        qmod.set_weight_bias(ref_mod.get_quantized_weight_bias_dict())
+        return qmod
+
+class RNNCellBase(torch.nn.Module):
+    # _FLOAT_MODULE = nn.CellRNNBase
+    __constants__ = ['input_size', 'hidden_size', 'bias']
+
+    def __init__(self, input_size, hidden_size, bias=True, num_chunks=4, dtype=torch.qint8):
+        super().__init__()
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.bias = bias
+        self.weight_dtype = dtype
+        if bias:
+            self.bias_ih = torch.randn(num_chunks * hidden_size).to(dtype=torch.float)
+            self.bias_hh = torch.randn(num_chunks * hidden_size).to(dtype=torch.float)
+        else:
+            self.register_parameter('bias_ih', None)
+            self.register_parameter('bias_hh', None)
+
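+        # Placeholder weights: created randomly and, for qint8, quantized with
+        # dummy qparams (scale=1, zero_point=0) just so they can be prepacked;
+        # from_float()/from_reference() install properly observed and quantized
+        # weights afterwards.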
+        weight_ih = torch.randn(num_chunks * hidden_size, input_size).to(torch.float)
+        weight_hh = torch.randn(num_chunks * hidden_size, hidden_size).to(torch.float)
+        if dtype == torch.qint8:
+            weight_ih = torch.quantize_per_tensor(weight_ih, scale=1, zero_point=0, dtype=torch.qint8)
+            weight_hh = torch.quantize_per_tensor(weight_hh, scale=1, zero_point=0, dtype=torch.qint8)
+
+        if dtype == torch.qint8:
+            # for each layer, for each direction we need to quantize and pack
+            # weights and pack parameters in this order:
+            #
+            #   w_ih, w_hh
+            packed_weight_ih = \
+                torch.ops.quantized.linear_prepack(weight_ih, self.bias_ih)
+            packed_weight_hh = \
+                torch.ops.quantized.linear_prepack(weight_hh, self.bias_hh)
+        else:
+            # for each layer, for each direction we need to quantize and pack
+            # weights and pack parameters in this order:
+            #
+            #   packed_ih, packed_hh, b_ih, b_hh
+            packed_weight_ih = torch.ops.quantized.linear_prepack_fp16(
+                weight_ih, self.bias_ih)
+            packed_weight_hh = torch.ops.quantized.linear_prepack_fp16(
+                weight_hh, self.bias_hh)
+
+        self._packed_weight_ih = packed_weight_ih
+        self._packed_weight_hh = packed_weight_hh
+
+    def _get_name(self):
+        return 'DynamicQuantizedRNNBase'
+
+    def extra_repr(self):
+        s = '{input_size}, {hidden_size}'
+        if 'bias' in self.__dict__ and self.bias is not True:
+            s += ', bias={bias}'
+        if 'nonlinearity' in self.__dict__ and self.nonlinearity != "tanh":
+            s += ', nonlinearity={nonlinearity}'
+        return s.format(**self.__dict__)
+
+    def check_forward_input(self, input):
+        if input.size(1) != self.input_size:
+            raise RuntimeError(
+                f"input has inconsistent input_size: got {input.size(1)}, expected {self.input_size}")
+
+    def check_forward_hidden(self, input: Tensor, hx: Tensor, hidden_label: str = '') -> None:
+        if input.size(0) != hx.size(0):
+            raise RuntimeError(
+                f"Input batch size {input.size(0)} doesn't match hidden{hidden_label} batch size {hx.size(0)}")
+
+        if hx.size(1) != self.hidden_size:
+            raise RuntimeError(
+                f"hidden{hidden_label} has inconsistent hidden_size: got {hx.size(1)}, expected {self.hidden_size}")
+
+    @classmethod
+    def from_float(cls, mod):
+        assert type(mod) in {torch.nn.LSTMCell,
+                             torch.nn.GRUCell,
+                             torch.nn.RNNCell}, 'nn.quantized.dynamic.RNNCellBase.from_float \
+                                 only works for nn.LSTMCell, nn.GRUCell and nn.RNNCell'
+        assert hasattr(
+            mod, 'qconfig'), 'Input float module must have qconfig defined'
+
+        if mod.qconfig is not None and mod.qconfig.weight is not None:
+            weight_observer_method = mod.qconfig.weight
+        else:
+            # We have the circular import issues if we import the qconfig in the beginning of this file:
+            # https://github.com/pytorch/pytorch/pull/24231. The current workaround is to postpone the
+            # import until we need it.
+            from torch.ao.quantization.qconfig import default_dynamic_qconfig
+            weight_observer_method = default_dynamic_qconfig.weight
+
+        dtype = weight_observer_method().dtype
+        supported_scalar_types = [torch.qint8, torch.float16]
+        if dtype not in supported_scalar_types:
+            raise RuntimeError(f'Unsupported dtype for dynamic RNN quantization: {dtype}')
+
+        qRNNCellBase: Union[LSTMCell, GRUCell, RNNCell]
+
+        if type(mod) == torch.nn.LSTMCell:
+            qRNNCellBase = LSTMCell(mod.input_size, mod.hidden_size, bias=mod.bias, dtype=dtype)
+        elif type(mod) == torch.nn.GRUCell:
+            qRNNCellBase = GRUCell(mod.input_size, mod.hidden_size, bias=mod.bias, dtype=dtype)
+        elif type(mod) == torch.nn.RNNCell:
+            qRNNCellBase = RNNCell(mod.input_size, mod.hidden_size, bias=mod.bias, nonlinearity=mod.nonlinearity, dtype=dtype)
+        else:
+            raise NotImplementedError('Only LSTMCell, GRUCell and RNNCell \
+            are supported for QuantizedRNN for now')
+
+        assert mod.bias
+
+        def _observe_and_quantize_weight(weight):
+            if dtype == torch.qint8:
+                weight_observer = weight_observer_method()
+                weight_observer(weight)
+                qweight = _quantize_weight(weight.float(), weight_observer)
+                return qweight
+            else:
+                return weight.float()
+
+        qRNNCellBase._packed_weight_ih = pack_weight_bias(_observe_and_quantize_weight(mod.weight_ih), mod.bias_ih, dtype)
+        qRNNCellBase._packed_weight_hh = pack_weight_bias(_observe_and_quantize_weight(mod.weight_hh), mod.bias_hh, dtype)
+        return qRNNCellBase
+
+    @classmethod
+    def from_reference(cls, ref_mod):
+        assert hasattr(ref_mod, "weight_ih_dtype"), (
+            "We are assuming weight_ih exists in the reference module; "
+            "the assumption may need to be relaxed to support other use cases")
+        if hasattr(ref_mod, "nonlinearity"):
+            qmod = cls(
+                ref_mod.input_size,
+                ref_mod.hidden_size,
+                ref_mod.bias,
+                ref_mod.nonlinearity,
+                dtype=ref_mod.weight_ih_dtype
+            )
+        else:
+            qmod = cls(
+                ref_mod.input_size,
+                ref_mod.hidden_size,
+                ref_mod.bias,
+                dtype=ref_mod.weight_ih_dtype
+            )
+        weight_bias_dict = {
+            "weight": {
+                "weight_ih": ref_mod.get_quantized_weight_ih(),
+                "weight_hh": ref_mod.get_quantized_weight_hh(),
+            },
+            "bias": {
+                "bias_ih": ref_mod.bias_ih,
+                "bias_hh": ref_mod.bias_hh,
+            }
+        }
+        qmod.set_weight_bias(weight_bias_dict)
+        return qmod
+
+    def _weight_bias(self):
+        # Returns a dict of weights and biases
+        weight_bias_dict: Dict[str, Dict] = {'weight' : {}, 'bias' : {}}
+        w1, b1 = self._packed_weight_ih.__getstate__()[0]
+        w2, b2 = self._packed_weight_hh.__getstate__()[0]
+        # TODO: these can be simplified to one level? e.g. using weight_ih as key
+        # directly
+        weight_bias_dict['weight']['weight_ih'] = w1
+        weight_bias_dict['weight']['weight_hh'] = w2
+        weight_bias_dict['bias']['bias_ih'] = b1
+        weight_bias_dict['bias']['bias_hh'] = b2
+        return weight_bias_dict
+
+    def get_weight(self):
+        return self._weight_bias()['weight']
+
+    def get_bias(self):
+        return self._weight_bias()['bias']
+
+    def set_weight_bias(self, weight_bias_dict):
+        # TODO: these can be simplified to one level? e.g. using weight_ih as key
+        # directly
+        self._packed_weight_ih = pack_weight_bias(
+            weight_bias_dict["weight"]["weight_ih"],
+            weight_bias_dict["bias"]["bias_ih"],
+            self.weight_dtype)
+        self._packed_weight_hh = pack_weight_bias(
+            weight_bias_dict["weight"]["weight_hh"],
+            weight_bias_dict["bias"]["bias_hh"],
+            self.weight_dtype)
+
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        super()._save_to_state_dict(destination, prefix, keep_vars)
+        destination[prefix + '_packed_weight_ih'] = self._packed_weight_ih
+        destination[prefix + '_packed_weight_hh'] = self._packed_weight_hh
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        self._packed_weight_ih = state_dict.pop(prefix + '_packed_weight_ih')
+        self._packed_weight_hh = state_dict.pop(prefix + '_packed_weight_hh')
+        super()._load_from_state_dict(state_dict, prefix, local_metadata, False,
+                                      missing_keys, unexpected_keys, error_msgs)
+
+
+class RNNCell(RNNCellBase):
+    r"""An Elman RNN cell with tanh or ReLU non-linearity.
+    A dynamic quantized RNNCell module with floating point tensor as inputs and outputs.
+    Weights are quantized to 8 bits. We adopt the same interface as `torch.nn.RNNCell`,
+    please see https://pytorch.org/docs/stable/nn.html#torch.nn.RNNCell for documentation.
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> rnn = nn.RNNCell(10, 20)
+        >>> input = torch.randn(6, 3, 10)
+        >>> hx = torch.randn(3, 20)
+        >>> output = []
+        >>> for i in range(6):
+        ...     hx = rnn(input[i], hx)
+        ...     output.append(hx)
+    """
+    __constants__ = ['input_size', 'hidden_size', 'bias', 'nonlinearity']
+
+    def __init__(self, input_size, hidden_size, bias=True, nonlinearity="tanh", dtype=torch.qint8):
+        super().__init__(input_size, hidden_size, bias, num_chunks=1, dtype=dtype)
+        self.nonlinearity = nonlinearity
+
+    def _get_name(self):
+        return 'DynamicQuantizedRNNCell'
+
+    def forward(self, input: Tensor, hx: Optional[Tensor] = None) -> Tensor:
+        self.check_forward_input(input)
+        if hx is None:
+            hx = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device)
+        self.check_forward_hidden(input, hx, '')
+        if self.nonlinearity == "tanh":
+            ret = torch.ops.quantized.quantized_rnn_tanh_cell_dynamic(
+                input, hx,
+                self._packed_weight_ih, self._packed_weight_hh,
+                self.bias_ih, self.bias_hh)
+        elif self.nonlinearity == "relu":
+            ret = torch.ops.quantized.quantized_rnn_relu_cell_dynamic(
+                input, hx,
+                self._packed_weight_ih, self._packed_weight_hh,
+                self.bias_ih, self.bias_hh)
+        else:
+            ret = input  # TODO: remove when jit supports exception flow
+            raise RuntimeError(
+                f"Unknown nonlinearity: {self.nonlinearity}")
+        return ret
+
+    @classmethod
+    def from_float(cls, mod):
+        return super().from_float(mod)
+
+
+class LSTMCell(RNNCellBase):
+    r"""A long short-term memory (LSTM) cell.
+
+    A dynamic quantized LSTMCell module with floating point tensor as inputs and outputs.
+    Weights are quantized to 8 bits. We adopt the same interface as `torch.nn.LSTMCell`,
+    please see https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell for documentation.
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> rnn = nn.LSTMCell(10, 20)
+        >>> input = torch.randn(6, 3, 10)
+        >>> hx = torch.randn(3, 20)
+        >>> cx = torch.randn(3, 20)
+        >>> output = []
+        >>> for i in range(6):
+        ...     hx, cx = rnn(input[i], (hx, cx))
+        ...     output.append(hx)
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, num_chunks=4, **kwargs)  # type: ignore[misc]
+
+    def _get_name(self):
+        return 'DynamicQuantizedLSTMCell'
+
+    def forward(self, input: Tensor, hx: Optional[Tuple[Tensor, Tensor]] = None) -> Tuple[Tensor, Tensor]:
+        self.check_forward_input(input)
+        if hx is None:
+            zeros = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device)
+            hx = (zeros, zeros)
+        self.check_forward_hidden(input, hx[0], '[0]')
+        self.check_forward_hidden(input, hx[1], '[1]')
+        return torch.ops.quantized.quantized_lstm_cell_dynamic(
+            input, hx,
+            self._packed_weight_ih, self._packed_weight_hh,
+            self.bias_ih, self.bias_hh)
+
+    @classmethod
+    def from_float(cls, mod):
+        return super().from_float(mod)
+
+
+class GRUCell(RNNCellBase):
+    r"""A gated recurrent unit (GRU) cell
+
+    A dynamic quantized GRUCell module with floating point tensor as inputs and outputs.
+    Weights are quantized to 8 bits. We adopt the same interface as `torch.nn.GRUCell`,
+    please see https://pytorch.org/docs/stable/nn.html#torch.nn.GRUCell for documentation.
+
+    Examples::
+
+        >>> # xdoctest: +SKIP
+        >>> rnn = nn.GRUCell(10, 20)
+        >>> input = torch.randn(6, 3, 10)
+        >>> hx = torch.randn(3, 20)
+        >>> output = []
+        >>> for i in range(6):
+        ...     hx = rnn(input[i], hx)
+        ...     output.append(hx)
+    """
+
+    def __init__(self, input_size, hidden_size, bias=True, dtype=torch.qint8):
+        super().__init__(input_size, hidden_size, bias, num_chunks=3, dtype=dtype)
+
+    def _get_name(self):
+        return 'DynamicQuantizedGRUCell'
+
+    def forward(self, input: Tensor, hx: Optional[Tensor] = None) -> Tensor:
+        self.check_forward_input(input)
+        if hx is None:
+            hx = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device)
+        self.check_forward_hidden(input, hx, '')
+        return torch.ops.quantized.quantized_gru_cell_dynamic(
+            input, hx,
+            self._packed_weight_ih, self._packed_weight_hh,
+            self.bias_ih, self.bias_hh,
+        )
+
+    @classmethod
+    def from_float(cls, mod):
+        return super().from_float(mod)
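+
+# A minimal usage sketch (illustrative only): the dynamic cell modules above are
+# normally produced by `torch.ao.quantization.quantize_dynamic` rather than
+# constructed directly. Assuming a CPU quantized engine (fbgemm or qnnpack):
+#
+#     >>> float_model = torch.nn.Sequential(torch.nn.LSTMCell(10, 20))
+#     >>> qmodel = torch.ao.quantization.quantize_dynamic(
+#     ...     float_model, {torch.nn.LSTMCell}, dtype=torch.qint8)
+#     >>> hx = cx = torch.zeros(3, 20)
+#     >>> hx, cx = qmodel[0](torch.randn(3, 10), (hx, cx))  # DynamicQuantizedLSTMCell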
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/functional.py b/MLPY/Lib/site-packages/torch/ao/nn/quantized/functional.py
new file mode 100644
index 0000000000000000000000000000000000000000..c91cbb1b603deaa8da3db139273a9cd7ca9f8452
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/quantized/functional.py
@@ -0,0 +1,644 @@
+r""" Functional interface (quantized)."""
+from typing import List, Optional
+import warnings
+
+import torch
+from torch import Tensor
+from torch.nn.modules.utils import _pair, _triple
+from torch.jit.annotations import BroadcastingList2
+
+from .modules.utils import _pair_from_first
+
+# Although some of the functions and docstrings are mirrored from torch.nn,
+# we keep them here so that future changes can be made independently.
+
+__all__ = [
+    "avg_pool2d",
+    "avg_pool3d",
+    "adaptive_avg_pool2d",
+    "adaptive_avg_pool3d",
+    "conv1d",
+    "conv2d",
+    "conv3d",
+    "interpolate",
+    "linear",
+    "max_pool1d",
+    "max_pool2d",
+    "celu",
+    "leaky_relu",
+    "hardtanh",
+    "hardswish",
+    "threshold",
+    "elu",
+    "hardsigmoid",
+    "clamp",
+    "upsample",
+    "upsample_bilinear",
+    "upsample_nearest",
+]
+
+def avg_pool2d(input, kernel_size, stride=None, padding=0, ceil_mode=False,
+               count_include_pad=True, divisor_override=None):
+    r"""
+    Applies 2D average-pooling operation in :math:`kH \times kW` regions by step size
+    :math:`sH \times sW` steps. The number of output features is equal to the number of
+    input planes.
+
+    .. note:: The input quantization parameters propagate to the output.
+
+    See :class:`~torch.ao.nn.quantized.AvgPool2d` for details and output shape.
+
+    Args:
+        input: quantized input tensor :math:`(\text{minibatch} , \text{in\_channels} , iH , iW)`
+        kernel_size: size of the pooling region. Can be a single number or a
+          tuple `(kH, kW)`
+        stride: stride of the pooling operation. Can be a single number or a
+          tuple `(sH, sW)`. Default: :attr:`kernel_size`
+        padding: implicit zero paddings on both sides of the input. Can be a
+          single number or a tuple `(padH, padW)`. Default: 0
+        ceil_mode: when True, will use `ceil` instead of `floor` in the formula
+            to compute the output shape. Default: ``False``
+        count_include_pad: when True, will include the zero-padding in the
+            averaging calculation. Default: ``True``
+        divisor_override: if specified, it will be used as divisor, otherwise
+             size of the pooling region will be used. Default: None
+    """
+    if not input.is_quantized:
+        raise ValueError("Input to 'quantized.avg_pool2d' must be quantized!")
+    return torch.nn.functional.avg_pool2d(input, kernel_size, stride, padding,
+                                          ceil_mode, count_include_pad,
+                                          divisor_override)
+
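+# A brief usage sketch for `avg_pool2d` above (assumption: quantized CPU
+# kernels are available; the output re-uses the input's scale and zero_point):
+#
+#     >>> from torch.ao.nn.quantized import functional as qF
+#     >>> x = torch.randn(1, 3, 8, 8)
+#     >>> qx = torch.quantize_per_tensor(x, scale=0.1, zero_point=0, dtype=torch.quint8)
+#     >>> qy = qF.avg_pool2d(qx, kernel_size=2)
+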
+def avg_pool3d(input, kernel_size, stride=None, padding=0, ceil_mode=False,
+               count_include_pad=True, divisor_override=None):
+    r"""
+    Applies 3D average-pooling operation in :math:`kD \times kH \times kW` regions by step size
+    :math:`sD \times sH \times sW` steps. The number of output features is equal to the number of
+    input planes.
+
+    .. note:: The input quantization parameters propagate to the output.
+
+    Args:
+        input: quantized input tensor :math:`(\text{minibatch} , \text{in\_channels} , iD , iH , iW)`
+        kernel_size: size of the pooling region. Can be a single number or a
+          tuple `(kD, kH, kW)`
+        stride: stride of the pooling operation. Can be a single number or a
+          tuple `(sD, sH, sW)`. Default: :attr:`kernel_size`
+        padding: implicit zero paddings on both sides of the input. Can be a
+          single number or a tuple `(padD, padH, padW)`. Default: 0
+        ceil_mode: when True, will use `ceil` instead of `floor` in the formula
+            to compute the output shape. Default: ``False``
+        count_include_pad: when True, will include the zero-padding in the
+            averaging calculation. Default: ``True``
+        divisor_override: if specified, it will be used as divisor, otherwise
+             size of the pooling region will be used. Default: None
+    """
+    if not input.is_quantized:
+        raise ValueError("Input to 'quantized.avg_pool3d' must be quantized!")
+    return torch.nn.functional.avg_pool3d(input, kernel_size, stride, padding,
+                                          ceil_mode, count_include_pad,
+                                          divisor_override)
+
+def adaptive_avg_pool2d(input: Tensor, output_size: BroadcastingList2[int]) -> Tensor:
+    r"""
+    Applies a 2D adaptive average pooling over a quantized input signal composed
+    of several quantized input planes.
+
+    .. note:: The input quantization parameters propagate to the output.
+
+    See :class:`~torch.ao.nn.quantized.AdaptiveAvgPool2d` for details and output shape.
+
+    Args:
+        output_size: the target output size (single integer or
+                     double-integer tuple)
+    """
+    if not input.is_quantized:
+        raise ValueError("Input to 'quantized.functional.adaptive_avg_pool2d' must be quantized!")
+    return torch.nn.functional.adaptive_avg_pool2d(input, output_size)
+
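+# A brief usage sketch for `adaptive_avg_pool2d` above (assumption: quantized
+# CPU kernels are available; output qparams match the input's):
+#
+#     >>> from torch.ao.nn.quantized import functional as qF
+#     >>> qx = torch.quantize_per_tensor(torch.randn(1, 3, 8, 8), 0.1, 0, torch.quint8)
+#     >>> qy = qF.adaptive_avg_pool2d(qx, output_size=(1, 1))
+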
+def adaptive_avg_pool3d(input: Tensor, output_size: BroadcastingList2[int]) -> Tensor:
+    r"""
+    Applies a 3D adaptive average pooling over a quantized input signal composed
+    of several quantized input planes.
+
+    .. note:: The input quantization parameters propagate to the output.
+
+    See :class:`~torch.ao.nn.quantized.AdaptiveAvgPool3d` for details and output shape.
+
+    Args:
+        output_size: the target output size (single integer or
+                     triple-integer tuple)
+    """
+    if not input.is_quantized:
+        raise ValueError(
+            "Input to 'quantized.functional.adaptive_avg_pool3d' must be quantized!")
+    return torch.nn.functional.adaptive_avg_pool3d(input, output_size)
+
+def conv1d(input, weight, bias,
+           stride=1, padding=0, dilation=1, groups=1,
+           padding_mode='zeros',
+           scale=1.0, zero_point=0,
+           dtype=torch.quint8):
+    r"""
+    Applies a 1D convolution over a quantized 1D input composed of several input
+    planes.
+
+    See :class:`~torch.ao.nn.quantized.Conv1d` for details and output shape.
+
+    Args:
+        input: quantized input tensor of shape :math:`(\text{minibatch} , \text{in\_channels} , iW)`
+        weight: quantized filters of shape :math:`(\text{out\_channels} , \frac{\text{in\_channels}}{\text{groups}} , iW)`
+        bias: **non-quantized** bias tensor of shape :math:`(\text{out\_channels})`. The tensor type must be `torch.float`.
+        stride: the stride of the convolving kernel. Can be a single number or a
+          tuple `(sW,)`. Default: 1
+        padding: implicit paddings on both sides of the input. Can be a
+          single number or a tuple `(padW,)`. Default: 0
+        dilation: the spacing between kernel elements. Can be a single number or
+          a tuple `(dW,)`. Default: 1
+        groups: split input into groups, :math:`\text{in\_channels}` should be divisible by the
+          number of groups. Default: 1
+        padding_mode: the padding mode to use. Only "zeros" is supported for quantized convolution at the moment. Default: "zeros"
+        scale: quantization scale for the output. Default: 1.0
+        zero_point: quantization zero_point for the output. Default: 0
+        dtype: quantization data type to use. Default: ``torch.quint8``
+
+    Examples::
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
+        >>> from torch.ao.nn.quantized import functional as qF
+        >>> filters = torch.randn(33, 16, 3, dtype=torch.float)
+        >>> inputs = torch.randn(20, 16, 50, dtype=torch.float)
+        >>> bias = torch.randn(33, dtype=torch.float)
+        >>>
+        >>> scale, zero_point = 1.0, 0
+        >>> dtype_inputs = torch.quint8
+        >>> dtype_filters = torch.qint8
+        >>>
+        >>> q_filters = torch.quantize_per_tensor(filters, scale, zero_point, dtype_filters)
+        >>> q_inputs = torch.quantize_per_tensor(inputs, scale, zero_point, dtype_inputs)
+        >>> qF.conv1d(q_inputs, q_filters, bias, padding=1, scale=scale, zero_point=zero_point)
+    """  # noqa: E501
+    if padding_mode != 'zeros':
+        raise NotImplementedError("Only zero-padding is supported!")
+    if input.dtype != torch.quint8:
+        raise NotImplementedError("Only torch.quint8 is supported for activation tensor!")
+    if weight.dtype != torch.qint8:
+        raise NotImplementedError("Only torch.qint8 is supported for weight tensor!")
+    if input.ndim != 3:
+        raise ValueError("Input shape must be `(N, C, L)`!")
+    stride = _pair_from_first(stride)
+    padding = _pair_from_first(padding)
+    dilation = _pair_from_first(dilation)
+
+    packed_params = torch.ops.quantized.conv1d_prepack(
+        weight, bias, stride, padding, dilation, groups)
+    return torch.ops.quantized.conv1d(input, packed_params, scale, zero_point)
+
+def conv2d(input, weight, bias,
+           stride=1, padding=0, dilation=1, groups=1,
+           padding_mode='zeros',
+           scale=1.0, zero_point=0,
+           dtype=torch.quint8):
+    r"""
+    Applies a 2D convolution over a quantized 2D input composed of several input
+    planes.
+
+    See :class:`~torch.ao.nn.quantized.Conv2d` for details and output shape.
+
+    Args:
+        input: quantized input tensor of shape :math:`(\text{minibatch} , \text{in\_channels} , iH , iW)`
+        weight: quantized filters of shape :math:`(\text{out\_channels} , \frac{\text{in\_channels}}{\text{groups}} , kH , kW)`
+        bias: **non-quantized** bias tensor of shape :math:`(\text{out\_channels})`. The tensor type must be `torch.float`.
+        stride: the stride of the convolving kernel. Can be a single number or a
+          tuple `(sH, sW)`. Default: 1
+        padding: implicit paddings on both sides of the input. Can be a
+          single number or a tuple `(padH, padW)`. Default: 0
+        dilation: the spacing between kernel elements. Can be a single number or
+          a tuple `(dH, dW)`. Default: 1
+        groups: split input into groups, :math:`\text{in\_channels}` should be divisible by the
+          number of groups. Default: 1
+        padding_mode: the padding mode to use. Only "zeros" is supported for quantized convolution at the moment. Default: "zeros"
+        scale: quantization scale for the output. Default: 1.0
+        zero_point: quantization zero_point for the output. Default: 0
+        dtype: quantization data type to use. Default: ``torch.quint8``
+
+    Examples::
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
+        >>> from torch.ao.nn.quantized import functional as qF
+        >>> filters = torch.randn(8, 4, 3, 3, dtype=torch.float)
+        >>> inputs = torch.randn(1, 4, 5, 5, dtype=torch.float)
+        >>> bias = torch.randn(8, dtype=torch.float)
+        >>>
+        >>> scale, zero_point = 1.0, 0
+        >>> dtype_inputs = torch.quint8
+        >>> dtype_filters = torch.qint8
+        >>>
+        >>> q_filters = torch.quantize_per_tensor(filters, scale, zero_point, dtype_filters)
+        >>> q_inputs = torch.quantize_per_tensor(inputs, scale, zero_point, dtype_inputs)
+        >>> qF.conv2d(q_inputs, q_filters, bias, padding=1, scale=scale, zero_point=zero_point)
+    """  # noqa: E501
+    if padding_mode != 'zeros':
+        raise NotImplementedError("Only zero-padding is supported!")
+    if input.dtype != torch.quint8:
+        raise NotImplementedError("Only torch.quint8 is supported for activation tensor!")
+    if weight.dtype != torch.qint8:
+        raise NotImplementedError("Only torch.qint8 is supported for weight tensor!")
+    if input.ndim != 4:
+        raise ValueError("Input shape must be `(N, C, H, W)`!")
+    stride = _pair(stride)
+    padding = _pair(padding)
+    dilation = _pair(dilation)
+
+    packed_params = torch.ops.quantized.conv2d_prepack(
+        weight, bias, stride, padding, dilation, groups)
+    return torch.ops.quantized.conv2d(input, packed_params, scale, zero_point)
+
+def conv3d(input, weight, bias, stride=1, padding=0, dilation=1, groups=1,
+           padding_mode='zeros', scale=1.0, zero_point=0, dtype=torch.quint8):
+    r"""
+    Applies a 3D convolution over a quantized 3D input composed of several input
+    planes.
+
+    See :class:`~torch.ao.nn.quantized.Conv3d` for details and output shape.
+
+    Args:
+        input: quantized input tensor of shape
+          :math:`(\text{minibatch} , \text{in\_channels} , iD , iH , iW)`
+        weight: quantized filters of shape
+          :math:`(\text{out\_channels} , \frac{\text{in\_channels}}{\text{groups}} , kD , kH , kW)`
+        bias: **non-quantized** bias tensor of shape
+          :math:`(\text{out\_channels})`. The tensor type must be `torch.float`.
+        stride: the stride of the convolving kernel. Can be a single number or a
+          tuple `(sD, sH, sW)`. Default: 1
+        padding: implicit paddings on both sides of the input. Can be a
+          single number or a tuple `(padD, padH, padW)`. Default: 0
+        dilation: the spacing between kernel elements. Can be a single number or
+          a tuple `(dD, dH, dW)`. Default: 1
+        groups: split input into groups, :math:`\text{in\_channels}` should be
+          divisible by the number of groups. Default: 1
+        padding_mode: the padding mode to use. Only "zeros" is supported for
+          quantized convolution at the moment. Default: "zeros"
+        scale: quantization scale for the output. Default: 1.0
+        zero_point: quantization zero_point for the output. Default: 0
+        dtype: quantization data type to use. Default: ``torch.quint8``
+
+    Examples::
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
+        >>> from torch.ao.nn.quantized import functional as qF
+        >>> filters = torch.randn(8, 4, 3, 3, 3, dtype=torch.float)
+        >>> inputs = torch.randn(1, 4, 5, 5, 5, dtype=torch.float)
+        >>> bias = torch.randn(8, dtype=torch.float)
+        >>>
+        >>> scale, zero_point = 1.0, 0
+        >>> dtype_inputs = torch.quint8
+        >>> dtype_filters = torch.qint8
+        >>>
+        >>> q_filters = torch.quantize_per_tensor(filters, scale, zero_point, dtype_filters)
+        >>> q_inputs = torch.quantize_per_tensor(inputs, scale, zero_point, dtype_inputs)
+        >>> qF.conv3d(q_inputs, q_filters, bias, padding=1, scale=scale, zero_point=zero_point)
+    """  # noqa: E501
+    if padding_mode != 'zeros':
+        raise NotImplementedError("Only zero-padding is supported!")
+    if input.dtype != torch.quint8:
+        raise NotImplementedError("Only torch.quint8 is supported for activation tensor!")
+    if weight.dtype != torch.qint8:
+        raise NotImplementedError("Only torch.qint8 is supported for weight tensor!")
+    if input.ndim != 5:
+        raise ValueError("Input shape must be `(N, C, D, H, W)`!")
+    stride = _triple(stride)
+    padding = _triple(padding)
+    dilation = _triple(dilation)
+
+    packed_params = torch.ops.quantized.conv3d_prepack(
+        weight, bias, stride, padding, dilation, groups)
+    return torch.ops.quantized.conv3d(input, packed_params, scale, zero_point)
+
+def interpolate(input, size=None, scale_factor=None, mode='nearest', align_corners=None):
+    r"""Down/up samples the input to either the given :attr:`size` or the given
+    :attr:`scale_factor`
+
+    See :func:`torch.nn.functional.interpolate` for implementation details.
+
+    The input dimensions are interpreted in the form:
+    `mini-batch x channels x [optional depth] x [optional height] x width`.
+
+    .. note:: The input quantization parameters propagate to the output.
+
+    .. note:: Only 2D/3D input is supported for quantized inputs
+
+    .. note:: Only the following modes are supported for the quantized inputs:
+
+        - `bilinear`
+        - `nearest`
+
+    Args:
+        input (Tensor): the input tensor
+        size (int or Tuple[int] or Tuple[int, int] or Tuple[int, int, int]):
+            output spatial size.
+        scale_factor (float or Tuple[float]): multiplier for spatial size. Has to match input size if it is a tuple.
+        mode (str): algorithm used for upsampling:
+            ``'nearest'`` | ``'bilinear'``
+        align_corners (bool, optional): Geometrically, we consider the pixels of the
+            input and output as squares rather than points.
+            If set to ``True``, the input and output tensors are aligned by the
+            center points of their corner pixels, preserving the values at the corner pixels.
+            If set to ``False``, the input and output tensors are aligned by the corner
+            points of their corner pixels, and the interpolation uses edge value padding
+            for out-of-boundary values, making this operation *independent* of input size
+            when :attr:`scale_factor` is kept the same. This only has an effect when :attr:`mode`
+            is ``'bilinear'``.
+            Default: ``False``
+    """
+    if not input.is_quantized:
+        raise ValueError("Input to 'quantized.interpolate' must be quantized!")
+    return torch.nn.functional.interpolate(input, size, scale_factor, mode,
+                                           align_corners)
+
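+# A brief usage sketch for `interpolate` above (assumption: quantized CPU
+# kernels are available; only 'nearest' and 'bilinear' modes are supported):
+#
+#     >>> from torch.ao.nn.quantized import functional as qF
+#     >>> qx = torch.quantize_per_tensor(torch.randn(1, 3, 4, 4), 0.1, 0, torch.quint8)
+#     >>> qy = qF.interpolate(qx, scale_factor=2, mode='nearest')
+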
+def linear(
+    input: Tensor, weight: Tensor, bias: Optional[Tensor] = None,
+    scale: Optional[float] = None, zero_point: Optional[int] = None
+) -> Tensor:
+    r"""
+    Applies a linear transformation to the incoming quantized data:
+    :math:`y = xA^T + b`.
+    See :class:`~torch.ao.nn.quantized.Linear`
+
+    .. note::
+
+      Current implementation packs weights on every call, which has penalty on performance.
+      If you want to avoid the overhead, use :class:`~torch.ao.nn.quantized.Linear`.
+
+    Args:
+      input (Tensor): Quantized input of type `torch.quint8`
+      weight (Tensor): Quantized weight of type `torch.qint8`
+      bias (Tensor): None or fp32 bias of type `torch.float`
+      scale (double): output scale. If None, derived from the input scale
+      zero_point (long): output zero point. If None, derived from the input zero_point
+
+    Shape:
+        - Input: :math:`(N, *, in\_features)` where `*` means any number of
+          additional dimensions
+        - Weight: :math:`(out\_features, in\_features)`
+        - Bias: :math:`(out\_features)`
+        - Output: :math:`(N, *, out\_features)`
+    """
+    if scale is None:
+        scale = input.q_scale()
+    if zero_point is None:
+        zero_point = input.q_zero_point()
+    _packed_params = torch.ops.quantized.linear_prepack(weight, bias)
+    return torch.ops.quantized.linear(input, _packed_params, scale, zero_point)
+
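+# A brief usage sketch for `linear` above (assumption: an fbgemm/qnnpack
+# qengine is active; weight is per-tensor qint8, activation is quint8):
+#
+#     >>> from torch.ao.nn.quantized import functional as qF
+#     >>> qx = torch.quantize_per_tensor(torch.randn(2, 4), 0.1, 0, torch.quint8)
+#     >>> qw = torch.quantize_per_tensor(torch.randn(3, 4), 0.1, 0, torch.qint8)
+#     >>> qy = qF.linear(qx, qw, bias=torch.randn(3), scale=0.2, zero_point=0)
+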
+def max_pool1d(input, kernel_size, stride=None, padding=0, dilation=1,
+               ceil_mode=False, return_indices=False):
+    r"""Applies a 1D max pooling over a quantized input signal composed of
+    several quantized input planes.
+
+    .. note:: The input quantization parameters are propagated to the output.
+
+    See :class:`~torch.ao.nn.quantized.MaxPool1d` for details.
+    """
+    if return_indices:
+        raise NotImplementedError("return_indices is not yet implemented!")
+    if stride is None:
+        stride = torch.jit.annotate(List[int], [])
+    return torch.nn.functional.max_pool1d(input, kernel_size, stride, padding,
+                                          dilation, ceil_mode=ceil_mode, return_indices=return_indices)
+
+def max_pool2d(input, kernel_size, stride=None, padding=0, dilation=1,
+               ceil_mode=False, return_indices=False):
+    r"""Applies a 2D max pooling over a quantized input signal composed of
+    several quantized input planes.
+
+    .. note:: The input quantization parameters are propagated to the output.
+
+    See :class:`~torch.ao.nn.quantized.MaxPool2d` for details.
+    """
+    if return_indices:
+        raise NotImplementedError("return_indices is not yet implemented!")
+    if stride is None:
+        stride = torch.jit.annotate(List[int], [])
+    return torch.nn.functional.max_pool2d(input, kernel_size, stride, padding,
+                                          dilation, ceil_mode=ceil_mode, return_indices=return_indices)
+
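+# A brief usage sketch for `max_pool2d` above (assumption: quantized CPU
+# kernels are available; output keeps the input's quantization parameters):
+#
+#     >>> from torch.ao.nn.quantized import functional as qF
+#     >>> qx = torch.quantize_per_tensor(torch.randn(1, 3, 8, 8), 0.1, 0, torch.quint8)
+#     >>> qy = qF.max_pool2d(qx, kernel_size=2, stride=2)
+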
+def celu(input: Tensor, scale: float, zero_point: int, alpha: float = 1.) -> Tensor:
+    r"""celu(input, scale, zero_point, alpha=1.) -> Tensor
+
+    Applies the quantized CELU function element-wise.
+
+    .. math::
+        \text{CELU}(x) = \max(0,x) + \min(0, \alpha * (\exp(x / \alpha) - 1))
+
+    Args:
+        input: quantized input
+        alpha: the :math:`\alpha` value for the CELU formulation. Default: 1.0
+    """
+    if not input.is_quantized:
+        raise ValueError("Input to 'quantized.celu' must be quantized!")
+    return torch.ops.quantized.celu(input, scale, zero_point, alpha)
+
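+# A brief usage sketch for `celu` above (assumption: quantized CPU kernels are
+# available; `scale`/`zero_point` here set the output quantization parameters):
+#
+#     >>> from torch.ao.nn.quantized import functional as qF
+#     >>> qx = torch.quantize_per_tensor(torch.randn(2, 2), 0.1, 128, torch.quint8)
+#     >>> qy = qF.celu(qx, scale=0.1, zero_point=128, alpha=1.0)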
+
+def leaky_relu(input: Tensor, negative_slope: float = 0.01, inplace: bool = False,
+               scale: Optional[float] = None, zero_point: Optional[int] = None):
+    r"""
+    leaky_relu(input, negative_slope=0.01, inplace=False, scale, zero_point) -> Tensor
+
+    Quantized version of :func:`~torch.nn.functional.leaky_relu`.
+
+    Applies element-wise,
+    :math:`\text{LeakyReLU}(x) = \max(0, x) + \text{negative\_slope} * \min(0, x)`
+
+    Args:
+        input: Quantized input
+        negative_slope: The slope of the negative input
+        inplace: Inplace modification of the input tensor
+        scale, zero_point: Scale and zero point of the output tensor.
+
+    See :class:`~torch.nn.LeakyReLU` for more details.
+    """
+    if scale is not None and zero_point is not None:
+        assert not inplace, "Cannot rescale with `inplace`"
+        output = torch._empty_affine_quantized(
+            input.shape, scale=scale, zero_point=int(zero_point), dtype=input.dtype)
+        torch._C._nn.leaky_relu(input, negative_slope, out=output)
+        return output
+    if inplace:
+        result = torch._C._nn.leaky_relu_(input, negative_slope)
+    else:
+        result = torch._C._nn.leaky_relu(input, negative_slope)
+    return result
+
+def hardtanh(input: Tensor, min_val: float = -1., max_val: float = 1., inplace: bool = False) -> Tensor:
+    r"""This is the quantized version of :func:`~torch.nn.functional.hardtanh`.
+    """
+    if not input.is_quantized:
+        raise ValueError("Input to 'quantized.hardtanh' must be quantized!")
+    if inplace:
+        return torch._C._nn.hardtanh_(input, min_val, max_val)
+    return torch._C._nn.hardtanh(input, min_val, max_val)
+
+def hardswish(input: Tensor, scale: float, zero_point: int) -> Tensor:
+    r"""This is the quantized version of :func:`~torch.nn.functional.hardswish`.
+
+    Args:
+        input: quantized input
+        scale: quantization scale of the output tensor
+        zero_point: quantization zero point of the output tensor
+    """
+    if not input.is_quantized:
+        raise ValueError("Input to 'quantized.hardswish' must be quantized!")
+    return torch._ops.ops.quantized.hardswish(input, scale, zero_point)
+
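+# A brief usage sketch for `hardswish` above (assumption: quantized CPU
+# kernels are available; scale/zero_point describe the output tensor):
+#
+#     >>> from torch.ao.nn.quantized import functional as qF
+#     >>> qx = torch.quantize_per_tensor(torch.randn(2, 2), 0.1, 128, torch.quint8)
+#     >>> qy = qF.hardswish(qx, scale=0.1, zero_point=128)
+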
+def threshold(input: Tensor, threshold: float, value: float) -> Tensor:
+    r"""Applies the quantized version of the threshold function element-wise:
+
+    .. math::
+        x = \begin{cases}
+                x & \text{if~} x > \text{threshold} \\
+                \text{value} & \text{otherwise}
+            \end{cases}
+
+    See :class:`~torch.nn.Threshold` for more details.
+    """
+    if not input.is_quantized:
+        raise ValueError("Input to 'quantized.threshold' must be quantized!")
+    if threshold is None:
+        raise ValueError("Input to 'threshold' must be specified!")
+    if value is None:
+        raise ValueError("Input to 'value' must be specified!")
+    return torch._ops.ops.quantized.threshold(input, threshold, value)
+
+def elu(input: Tensor, scale: float, zero_point: int, alpha: float = 1.) -> Tensor:
+    r"""This is the quantized version of :func:`~torch.nn.functional.elu`.
+
+    Args:
+        input: quantized input
+        scale: quantization scale of the output tensor
+        zero_point: quantization zero point of the output tensor
+        alpha: the alpha constant
+    """
+    if not input.is_quantized:
+        raise ValueError("Input to 'quantized.elu' must be quantized!")
+    return torch.ops.quantized.elu(input, scale, zero_point, alpha)
+
+def hardsigmoid(input: Tensor, inplace: bool = False) -> Tensor:
+    r"""This is the quantized version of :func:`~torch.nn.functional.hardsigmoid`.
+    """
+    if not input.is_quantized:
+        raise ValueError("Input to 'quantized.hardsigmoid' must be quantized!")
+    if inplace:
+        return torch._C._nn.hardsigmoid_(input)  # type: ignore[attr-defined]
+    return torch._C._nn.hardsigmoid(input)
+
+def clamp(input: Tensor, min_: float, max_: float) -> Tensor:
+    r"""float(input, min\_, max\_) -> Tensor
+
+    Applies the clamp function element-wise.
+    See :class:`~torch.ao.nn.quantized.clamp` for more details.
+
+    Args:
+        input: quantized input
+        min_: minimum value for clamping
+        max_: maximum value for clamping
+    """
+    if not input.is_quantized:
+        raise ValueError("Input to 'quantized.clamp' must be quantized!")
+    return torch.clamp(input, min_, max_)
+
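+# A brief usage sketch for `clamp` above (assumption: quantized CPU kernels are
+# available; min_/max_ are floating-point thresholds and the output keeps the
+# input's quantization parameters):
+#
+#     >>> from torch.ao.nn.quantized import functional as qF
+#     >>> qx = torch.quantize_per_tensor(torch.randn(2, 2), 0.1, 128, torch.quint8)
+#     >>> qy = qF.clamp(qx, -0.5, 0.5)
+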
+def upsample(input, size=None, scale_factor=None, mode='nearest', align_corners=None):
+    r"""Upsamples the input to either the given :attr:`size` or the given
+    :attr:`scale_factor`
+
+    .. warning::
+        This function is deprecated in favor of
+        :func:`torch.ao.nn.quantized.functional.interpolate`.
+        This is equivalent to ``nn.quantized.functional.interpolate(...)``.
+
+    See :func:`torch.nn.functional.interpolate` for implementation details.
+
+    The input dimensions are interpreted in the form:
+    `mini-batch x channels x [optional depth] x [optional height] x width`.
+
+    .. note:: The input quantization parameters propagate to the output.
+
+    .. note:: Only 2D input is supported for quantized inputs
+
+    .. note:: Only the following modes are supported for the quantized inputs:
+
+        - `bilinear`
+        - `nearest`
+
+    Args:
+        input (Tensor): quantized input tensor
+        size (int or Tuple[int] or Tuple[int, int] or Tuple[int, int, int]):
+            output spatial size.
+        scale_factor (float or Tuple[float]): multiplier for spatial size. Has to be an integer.
+        mode (str): algorithm used for upsampling:
+            ``'nearest'`` | ``'bilinear'``
+        align_corners (bool, optional): Geometrically, we consider the pixels of the
+            input and output as squares rather than points.
+            If set to ``True``, the input and output tensors are aligned by the
+            center points of their corner pixels, preserving the values at the corner pixels.
+            If set to ``False``, the input and output tensors are aligned by the corner
+            points of their corner pixels, and the interpolation uses edge value padding
+            for out-of-boundary values, making this operation *independent* of input size
+            when :attr:`scale_factor` is kept the same. This only has an effect when :attr:`mode`
+            is ``'bilinear'``.
+            Default: ``False``
+
+    .. warning::
+        With ``align_corners = True``, the linearly interpolating modes
+        (`bilinear`) don't proportionally align the
+        output and input pixels, and thus the output values can depend on the
+        input size. This was the default behavior for these modes up to version
+        0.3.1. Since then, the default behavior is ``align_corners = False``.
+        See :class:`~torch.nn.Upsample` for concrete examples on how this
+        affects the outputs.
+    """
+    warnings.warn("nn.quantized.functional.upsample is deprecated. Use nn.quantized.functional.interpolate instead.")
+    return interpolate(input, size, scale_factor, mode, align_corners)
+
+def upsample_bilinear(input, size=None, scale_factor=None):
+    r"""Upsamples the input, using bilinear upsampling.
+
+    .. warning::
+        This function is deprecated in favor of
+        :func:`torch.ao.nn.quantized.functional.interpolate`.
+        This is equivalent to
+        ``nn.quantized.functional.interpolate(..., mode='bilinear', align_corners=True)``.
+
+    .. note:: The input quantization parameters propagate to the output.
+
+    .. note:: Only 2D inputs are supported
+
+    Args:
+        input (Tensor): quantized input
+        size (int or Tuple[int, int]): output spatial size.
+        scale_factor (int or Tuple[int, int]): multiplier for spatial size
+    """
+    # DeprecationWarning is ignored by default
+    warnings.warn("nn.quantized.functional.upsample_bilinear is deprecated. Use nn.quantized.functional.interpolate instead.")
+    return interpolate(input, size, scale_factor, mode='bilinear', align_corners=True)
+
+def upsample_nearest(input, size=None, scale_factor=None):
+    r"""Upsamples the input, using nearest neighbours' pixel values.
+
+    .. warning::
+        This function is deprecated in favor of
+        :func:`torch.ao.nn.quantized.functional.interpolate`.
+        This is equivalent to ``nn.quantized.functional.interpolate(..., mode='nearest')``.
+
+    .. note:: The input quantization parameters propagate to the output.
+
+    .. note:: Only 2D inputs are supported
+
+    Args:
+        input (Tensor): quantized input
+        size (int or Tuple[int, int] or Tuple[int, int, int]): output spatial
+            size.
+        scale_factor (int): multiplier for spatial size. Has to be an integer.
+    """
+    # DeprecationWarning is ignored by default
+    warnings.warn("nn.quantized.functional.upsample_nearest is deprecated. Use nn.quantized.functional.interpolate instead.")
+    return interpolate(input, size, scale_factor, mode='nearest')
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__init__.py b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..abd65cb5415fe4b5152e05957b92eefd97b49d63
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__init__.py
@@ -0,0 +1,131 @@
+import torch
+
+# The quantized modules use `torch.nn` and `torch.ao.nn.quantizable`
+# packages. However, the `quantizable` package uses "lazy imports"
+# to avoid circular dependency.
+# Hence we need to include it here to make sure it is resolved before
+# they are used in the modules.
+import torch.ao.nn.quantizable
+
+from torch.nn.modules.pooling import MaxPool2d
+
+from .activation import ReLU6, Hardswish, ELU, LeakyReLU, Sigmoid, Softmax, MultiheadAttention, PReLU
+from .dropout import Dropout
+from .batchnorm import BatchNorm2d, BatchNorm3d
+from .normalization import LayerNorm, GroupNorm, InstanceNorm1d, \
+    InstanceNorm2d, InstanceNorm3d
+from .conv import Conv1d, Conv2d, Conv3d
+from .conv import ConvTranspose1d, ConvTranspose2d, ConvTranspose3d
+from .linear import Linear
+from .embedding_ops import Embedding, EmbeddingBag
+from .rnn import LSTM
+
+from .functional_modules import FloatFunctional, FXFloatFunctional, QFunctional
+
+__all__ = [
+    'BatchNorm2d',
+    'BatchNorm3d',
+    'Conv1d',
+    'Conv2d',
+    'Conv3d',
+    'ConvTranspose1d',
+    'ConvTranspose2d',
+    'ConvTranspose3d',
+    'DeQuantize',
+    'ELU',
+    'Embedding',
+    'EmbeddingBag',
+    'GroupNorm',
+    'Hardswish',
+    'InstanceNorm1d',
+    'InstanceNorm2d',
+    'InstanceNorm3d',
+    'LayerNorm',
+    'LeakyReLU',
+    'Linear',
+    'LSTM',
+    'MultiheadAttention',
+    'Quantize',
+    'ReLU6',
+    'Sigmoid',
+    'Softmax',
+    'Dropout',
+    'PReLU',
+    # Wrapper modules
+    'FloatFunctional',
+    'FXFloatFunctional',
+    'QFunctional',
+]
+
+class Quantize(torch.nn.Module):
+    r"""Quantizes an incoming tensor
+
+    Args:
+     `scale`: scale of the output Quantized Tensor
+     `zero_point`: zero_point of output Quantized Tensor
+     `dtype`: data type of output Quantized Tensor
+     `factory_kwargs`: Dictionary of kwargs used for configuring initialization
+         of internal buffers. Currently, `device` and `dtype` are supported.
+         Example: `factory_kwargs={'device': 'cuda', 'dtype': torch.float64}`
+         will initialize internal buffers as type `torch.float64` on the current CUDA device.
+         Note that `dtype` only applies to floating-point buffers.
+
+    Examples::
+        >>> t = torch.tensor([[1., -1.], [1., -1.]])
+        >>> scale, zero_point, dtype = 1.0, 2, torch.qint8
+        >>> qm = Quantize(scale, zero_point, dtype)
+        >>> # xdoctest: +SKIP
+        >>> qt = qm(t)
+        >>> print(qt)
+        tensor([[ 1., -1.],
+                [ 1., -1.]], size=(2, 2), dtype=torch.qint8, scale=1.0, zero_point=2)
+    """
+
+    scale: torch.Tensor
+    zero_point: torch.Tensor
+
+    def __init__(self, scale, zero_point, dtype, factory_kwargs=None):
+        factory_kwargs = torch.nn.factory_kwargs(factory_kwargs)
+        super().__init__()
+        self.register_buffer('scale', torch.tensor([scale], **factory_kwargs))
+        self.register_buffer('zero_point',
+                             torch.tensor([zero_point], dtype=torch.long,
+                                          **{k: v for k, v in factory_kwargs.items() if k != 'dtype'}))
+        self.dtype = dtype
+
+    def forward(self, X):
+        return torch.quantize_per_tensor(X, float(self.scale),
+                                         int(self.zero_point), self.dtype)
+
+    @staticmethod
+    def from_float(mod):
+        assert hasattr(mod, 'activation_post_process')
+        scale, zero_point = mod.activation_post_process.calculate_qparams()
+        return Quantize(scale.float().item(), zero_point.long().item(), mod.activation_post_process.dtype)
+
+    def extra_repr(self):
+        return f'scale={self.scale}, zero_point={self.zero_point}, dtype={self.dtype}'
+
+
+class DeQuantize(torch.nn.Module):
+    r"""Dequantizes an incoming tensor
+
+    Examples::
+        >>> input = torch.tensor([[1., -1.], [1., -1.]])
+        >>> scale, zero_point, dtype = 1.0, 2, torch.qint8
+        >>> qm = Quantize(scale, zero_point, dtype)
+        >>> # xdoctest: +SKIP
+        >>> quantized_input = qm(input)
+        >>> dqm = DeQuantize()
+        >>> dequantized = dqm(quantized_input)
+        >>> print(dequantized)
+        tensor([[ 1., -1.],
+                [ 1., -1.]], dtype=torch.float32)
+    """
+
+    def forward(self, Xq):
+        return Xq.dequantize()
+
+    @staticmethod
+    def from_float(mod):
+        return DeQuantize()
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e8eb16b0882963cd4151a846c098a3e629fc2ba6
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/activation.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/activation.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e14731901c9be9c9309ea18edb29a019dbace8cd
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/activation.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/batchnorm.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/batchnorm.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eaefaf306a5618e8303f6dbfdc409815acda3a4d
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/batchnorm.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/conv.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/conv.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8682f6872ee504869f523fecfb6f2d2907936572
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/conv.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/dropout.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/dropout.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..da9707959dbffccefcd07ab8c0d997d886842e61
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/dropout.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/embedding_ops.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/embedding_ops.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bacc8b57e5e6877b7b7bcc0d860496aeae5f9c26
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/embedding_ops.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/functional_modules.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/functional_modules.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..924b5a5e3811dae0d22a7dad8899c47ec05813a5
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/functional_modules.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/linear.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/linear.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..becb30764e289271fd0a1bd13533d8fecf4c9eec
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/linear.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/normalization.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/normalization.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3cc9d5e8491e49051cf4b0e84bbfbb240af12f68
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/normalization.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/rnn.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/rnn.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dfc14a983b768b3e268c30a00aab04e3fa5ab50a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/rnn.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..27ab7a8041cc6a8c6ac74753db50fa20391280a1
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/__pycache__/utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/activation.py b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/activation.py
new file mode 100644
index 0000000000000000000000000000000000000000..dae043965f25978d786209fa86b454c70d040b28
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/activation.py
@@ -0,0 +1,302 @@
+import torch
+from warnings import warn
+__all__ = [
+    "ReLU6",
+    "Hardswish",
+    "ELU",
+    "LeakyReLU",
+    "Sigmoid",
+    "Softmax",
+    "MultiheadAttention",
+    "PReLU"
+]
+
+class ReLU6(torch.nn.ReLU):
+    r"""Applies the element-wise function:
+
+    :math:`\text{ReLU6}(x) = \min(\max(x_0, x), q(6))`, where :math:`x_0` is the
+    zero_point, and :math:`q(6)` is the quantized representation of number 6.
+
+    Args:
+        inplace: can optionally do the operation in-place. Default: ``False``
+
+    Shape:
+        - Input: :math:`(N, *)` where `*` means, any number of additional
+          dimensions
+        - Output: :math:`(N, *)`, same shape as the input
+
+    .. image:: ../scripts/activation_images/ReLU6.png
+
+    Examples::
+
+        >>> m = nn.quantized.ReLU6()
+        >>> input = torch.randn(2)
+        >>> # xdoctest: +SKIP
+        >>> input = torch.quantize_per_tensor(input, 1.0, 0, dtype=torch.qint32)
+        >>> output = m(input)
+    """
+    def __init__(self, inplace=False):
+        super().__init__(inplace)
+        self.inplace = inplace
+
+    def forward(self, input):
+        return torch.ops.quantized.relu6(input, self.inplace)
+
+    def _get_name(self):
+        return 'QuantizedReLU6'
+
+    @staticmethod
+    def from_float(mod):
+        return ReLU6(mod.inplace)
+
+class Hardswish(torch.nn.Hardswish):
+    r"""This is the quantized version of :class:`~torch.nn.Hardswish`.
+
+    Args:
+        scale: quantization scale of the output tensor
+        zero_point: quantization zero point of the output tensor
+    """
+    def __init__(self, scale, zero_point, device=None, dtype=None):
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        super().__init__()
+        self.register_buffer('scale', torch.tensor(scale, **factory_kwargs))
+        self.register_buffer('zero_point', torch.tensor(zero_point, **factory_kwargs))
+
+    def forward(self, input):
+        return torch.ops.quantized.hardswish(input, self.scale, self.zero_point)
+
+    def _get_name(self):
+        return 'QuantizedHardswish'
+
+    @staticmethod
+    def from_float(mod):
+        scale, zero_point = mod.activation_post_process.calculate_qparams()
+        return Hardswish(float(scale), int(zero_point))
+
+    @classmethod
+    def from_reference(cls, mod, scale, zero_point):
+        return cls(float(scale), int(zero_point))
+
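+# A brief usage sketch for the quantized Hardswish module above (assumption:
+# quantized CPU kernels are available; scale/zero_point describe the output):
+#
+#     >>> m = Hardswish(scale=0.1, zero_point=128)
+#     >>> qx = torch.quantize_per_tensor(torch.randn(2, 2), 0.1, 128, torch.quint8)
+#     >>> qy = m(qx)
+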
+class ELU(torch.nn.ELU):
+    r"""This is the quantized equivalent of :class:`~torch.nn.ELU`.
+
+    Args:
+        scale: quantization scale of the output tensor
+        zero_point: quantization zero point of the output tensor
+        alpha: the alpha constant
+    """
+    def __init__(self, scale, zero_point, alpha=1.):
+        super().__init__(alpha)
+        self.scale = scale
+        self.zero_point = zero_point
+
+    def forward(self, input):
+        return torch.ao.nn.quantized.functional.elu(
+            input, self.scale, self.zero_point, self.alpha)
+
+    def _get_name(self):
+        return 'QuantizedELU'
+
+    @staticmethod
+    def from_float(mod):
+        scale, zero_point = mod.activation_post_process.calculate_qparams()
+        return ELU(float(scale), int(zero_point), mod.alpha)
+
+    @classmethod
+    def from_reference(cls, mod, scale, zero_point):
+        return cls(float(scale), int(zero_point), mod.alpha)
+
+class LeakyReLU(torch.nn.LeakyReLU):
+    r"""This is the quantized equivalent of :class:`~torch.nn.LeakyReLU`.
+
+    Args:
+        scale: quantization scale of the output tensor
+        zero_point: quantization zero point of the output tensor
+        negative_slope: Controls the angle of the negative slope. Default: 1e-2
+    """
+    def __init__(self, scale: float, zero_point: int, negative_slope: float = 1e-2,
+                 inplace: bool = False, device=None, dtype=None) -> None:
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        super().__init__(negative_slope, inplace)
+        self.register_buffer('scale', torch.tensor(scale, **factory_kwargs))
+        self.register_buffer('zero_point', torch.tensor(zero_point, **factory_kwargs))
+
+    def forward(self, input):
+        return torch.ops.quantized.leaky_relu(
+            input, self.negative_slope, self.inplace, self.scale, self.zero_point)
+
+    def _get_name(self):
+        return 'QuantizedLeakyReLU'
+
+    @classmethod
+    def from_float(cls, mod):
+        scale, zero_point = mod.activation_post_process.calculate_qparams()
+        return cls(float(scale), int(zero_point), mod.negative_slope, mod.inplace)
+
+    @classmethod
+    def from_reference(cls, mod, scale, zero_point):
+        return cls(float(scale), int(zero_point), mod.negative_slope, mod.inplace)
+
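+# A brief usage sketch for the quantized LeakyReLU module above (assumption:
+# quantized CPU kernels are available):
+#
+#     >>> m = LeakyReLU(scale=0.1, zero_point=128, negative_slope=0.05)
+#     >>> qx = torch.quantize_per_tensor(torch.randn(2, 2), 0.1, 128, torch.quint8)
+#     >>> qy = m(qx)
+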
+class Sigmoid(torch.nn.Sigmoid):
+    r"""This is the quantized equivalent of :class:`~torch.nn.Sigmoid`.
+
+    Args:
+        scale: quantization scale of the output tensor
+        zero_point: quantization zero point of the output tensor
+    """
+
+    def __init__(self, output_scale: float, output_zero_point: int):
+        super().__init__()
+        self.output_scale = output_scale
+        self.output_zero_point = output_zero_point
+
+    def forward(self, input):
+        return torch.ops.quantized.sigmoid(input, self.output_scale, self.output_zero_point)
+
+    @classmethod
+    def from_float(cls, mod):
+        output_scale, output_zero_point = mod.activation_post_process.calculate_qparams()
+        return cls(float(output_scale), int(output_zero_point))
+
+class Softmax(torch.nn.Softmax):
+    r"""This is the quantized version of :class:`~torch.nn.Softmax`.
+
+    Args:
+        dim: A dimension along which Softmax will be computed (so every slice along dim will sum to 1).
+        scale: quantization scale of the output tensor
+        zero_point: quantization zero point of the output tensor
+    """
+    def __init__(self, dim=None, scale=1.0, zero_point=0):
+        super().__init__()
+        self.dim = dim
+        self.scale = scale
+        self.zero_point = zero_point
+
+    def forward(self, input):
+        dim = self.dim
+        if dim is None:
+            stacklevel = 3
+            # Note: adding the mypy ignore on _get_softmax_dim seems less bad
+            # than making `_get_softmax_dim` an official API.
+            dim = torch.nn.functional._get_softmax_dim(  # type: ignore[attr-defined]
+                "softmax", input.dim(), stacklevel)
+        return torch.ops.quantized.softmax(
+            input, dim, self.scale, self.zero_point)
+
+    def _get_name(self):
+        return 'QuantizedSoftmax'
+
+    @staticmethod
+    def from_float(mod):
+        scale, zero_point = mod.activation_post_process.calculate_qparams()
+        return Softmax(mod.dim, float(scale), int(zero_point))
+
+    @classmethod
+    def from_reference(cls, mod, scale, zero_point):
+        return cls(mod.dim, float(scale), int(zero_point))
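+
+# A minimal usage sketch for QuantizedSoftmax (illustrative only; it assumes the
+# torch.ops.quantized.softmax kernel is available in the active backend). Softmax
+# outputs lie in [0, 1], so scale=1/256 with zero_point=0 is a reasonable quint8
+# output parametrization:
+#
+#     >>> qx = torch.quantize_per_tensor(torch.randn(2, 5), scale=0.1, zero_point=128, dtype=torch.quint8)
+#     >>> m = Softmax(dim=1, scale=1.0 / 256.0, zero_point=0)
+#     >>> qy = m(qx)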
+
+
+class MultiheadAttention(torch.ao.nn.quantizable.MultiheadAttention):
+    _FLOAT_MODULE = torch.ao.nn.quantizable.MultiheadAttention
+
+    def _get_name(self):
+        return "QuantizedMultiheadAttention"
+
+    @classmethod
+    def from_float(cls, other):
+        # The whole flow is float -> observed -> quantized
+        # This class does observed -> quantized only
+        raise NotImplementedError("It looks like you are trying to convert a "
+                                  "non-observed MHA module. Please see "
+                                  "the examples on quantizable MHAs.")
+
+    @classmethod
+    def from_observed(cls, other):
+        converted = torch.ao.quantization.convert(other, mapping=None,
+                                                  inplace=False,
+                                                  remove_qconfig=True,
+                                                  convert_custom_config_dict=None)
+        converted.__class__ = cls
+        # Remove the parameters for the bias_k and bias_v to quantize them
+        # TODO: This is a potential source of accuracy drop.
+        #       quantized cat takes the scale and zp of the first
+        #       element, which might lose the precision in the bias_k
+        #       and the bias_v (which are cat'ed with k/v being first).
+        if converted.bias_k is not None:
+            bias_k = converted._parameters.pop('bias_k')
+            sc, zp = torch._choose_qparams_per_tensor(bias_k,
+                                                      reduce_range=False)
+            bias_k = torch.quantize_per_tensor(bias_k, sc, zp, torch.quint8)
+            setattr(converted, 'bias_k', bias_k)  # noqa: B010
+
+        if converted.bias_v is not None:
+            bias_v = converted._parameters.pop('bias_v')
+            sc, zp = torch._choose_qparams_per_tensor(bias_v,
+                                                      reduce_range=False)
+            bias_v = torch.quantize_per_tensor(bias_v, sc, zp, torch.quint8)
+            setattr(converted, 'bias_v', bias_v)  # noqa: B010
+
+        del converted.in_proj_weight
+        del converted.in_proj_bias
+
+        return converted
+
+class PReLU(torch.nn.Module):
+    r"""This is the quantized equivalent of :class:`~torch.nn.PReLU`.
+
+    Args:
+        output_scale: quantization scale of the output tensor
+        output_zero_point: quantization zero point of the output tensor
+        num_parameters: number of parameters: 1, or the number of channels at input. Default: 1
+    """
+    def __init__(self, output_scale: float, output_zero_point: int,
+                 num_parameters: int = 1) -> None:
+        super().__init__()
+        self.num_parameters = num_parameters
+        self.scale = output_scale
+        self.zero_point = output_zero_point
+        w = torch.randn(num_parameters, dtype=torch.float)
+        qw = torch.quantize_per_tensor(w, scale=1.0, zero_point=0, dtype=torch.quint8)
+        self.set_weight(qw)
+
+    def set_weight(self, w: torch.Tensor) -> None:
+        self.weight = w
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        return torch.ops.quantized.prelu(input, self.weight, self.scale, self.zero_point)
+
+    def _get_name(self):
+        return 'QuantizedPReLU'
+
+    @classmethod
+    def from_float(cls, mod):
+        scale, zero_point = mod.activation_post_process.calculate_qparams()
+        qprelu = cls(float(scale), int(zero_point), mod.num_parameters)
+        float_wt = mod.weight.float()
+        observer = mod.qconfig.weight()
+        observer(float_wt)
+        if observer.dtype != torch.quint8:
+            warn(
+                f"PReLU's weight observer should have dtype quint8 but got {observer.dtype}"
+            )
+        wt_scale, wt_zp = observer.calculate_qparams()
+        qweight = torch.quantize_per_tensor(
+            float_wt, float(wt_scale), int(wt_zp), torch.quint8)
+        qprelu.set_weight(qweight)
+        return qprelu
+
+    @classmethod
+    def from_reference(cls, mod, scale, zero_point):
+        qprelu = cls(float(scale), int(zero_point), mod.num_parameters)
+        float_wt = mod.weight.float()
+        observer = mod.qconfig.weight()
+        observer(float_wt)
+        if observer.dtype != torch.quint8:
+            warn(
+                f"PReLU's weight observer should have dtype quint8 but got {observer.dtype}"
+            )
+        wt_scale, wt_zp = observer.calculate_qparams()
+        qweight = torch.quantize_per_tensor(
+            float_wt, float(wt_scale), int(wt_zp), torch.quint8)
+        qprelu.set_weight(qweight)
+        return qprelu
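+
+# A minimal usage sketch for QuantizedPReLU (illustrative only). A directly
+# constructed module holds a randomly initialized quantized weight, so in a real
+# workflow the module is normally produced via from_float / from_reference:
+#
+#     >>> qx = torch.quantize_per_tensor(torch.randn(2, 3), scale=0.1, zero_point=128, dtype=torch.quint8)
+#     >>> m = PReLU(output_scale=0.1, output_zero_point=128, num_parameters=1)
+#     >>> qy = m(qx)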
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/batchnorm.py b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/batchnorm.py
new file mode 100644
index 0000000000000000000000000000000000000000..4cfc51ccf73c2ab44571604f7759ed008c0be29b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/batchnorm.py
@@ -0,0 +1,106 @@
+import torch
+import torch.ao.nn.intrinsic as nni
+
+__all__ = [
+    "BatchNorm2d",
+    "BatchNorm3d"
+]
+
+class _BatchNorm(torch.nn.modules.batchnorm._BatchNorm):
+    def __init__(self, num_features, eps=1e-5, momentum=0.1, device=None, dtype=None) -> None:
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        super().__init__(num_features, eps, momentum, True, True, **factory_kwargs)
+        self.register_buffer('scale', torch.tensor(1.0, **factory_kwargs))
+        self.register_buffer('zero_point', torch.tensor(0, **factory_kwargs))
+
+    @staticmethod
+    def from_float(cls, mod):
+        activation_post_process = mod.activation_post_process
+        if type(mod) == cls._NNI_BN_RELU_MODULE:
+            mod = mod[0]
+        scale, zero_point = activation_post_process.calculate_qparams()
+        new_mod = cls(mod.num_features, mod.eps)
+        new_mod.weight = mod.weight
+        new_mod.bias = mod.bias
+        new_mod.running_mean = mod.running_mean
+        new_mod.running_var = mod.running_var
+        new_mod.scale = scale
+        new_mod.zero_point = zero_point
+        return new_mod
+
+    @classmethod
+    def from_reference(cls, bn, output_scale, output_zero_point):
+        qbn = cls(
+            bn.num_features,
+            bn.eps,
+            bn.momentum,
+            device=bn.weight.device,
+            dtype=bn.weight.dtype
+        )
+        qbn.weight = bn.weight
+        qbn.bias = bn.bias
+        qbn.running_mean = bn.running_mean
+        qbn.running_var = bn.running_var
+        qbn.scale = output_scale
+        qbn.zero_point = output_zero_point
+        return qbn
+
+class BatchNorm2d(_BatchNorm):
+    r"""This is the quantized version of :class:`~torch.nn.BatchNorm2d`.
+    """
+
+    _NNI_BN_RELU_MODULE = nni.BNReLU2d
+
+    def __init__(self, num_features, eps=1e-5, momentum=0.1, device=None, dtype=None) -> None:
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        super().__init__(num_features, eps, momentum, **factory_kwargs)
+
+    def _get_name(self):
+        return 'QuantizedBatchNorm2d'
+
+    def _check_input_dim(self, input):
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 4:
+            raise ValueError("Input shape must be `(N, C, H, W)`!")
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        # disabling this since this is not symbolically traceable
+        # self._check_input_dim(input)
+        return torch.ops.quantized.batch_norm2d(
+            input, self.weight, self.bias, self.running_mean,
+            self.running_var, self.eps, self.scale, self.zero_point)
+
+    @classmethod
+    def from_float(cls, mod):
+        return _BatchNorm.from_float(cls, mod)
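+
+# A minimal usage sketch for QuantizedBatchNorm2d (illustrative only; in practice
+# the module is produced by from_float / from_reference, which copy the float
+# statistics and the observed output qparams instead of the defaults used here):
+#
+#     >>> qx = torch.quantize_per_tensor(torch.randn(1, 3, 4, 4), scale=0.1, zero_point=128, dtype=torch.quint8)
+#     >>> m = BatchNorm2d(3)
+#     >>> qy = m(qx)  # output uses the default scale=1.0, zero_point=0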
+
+class BatchNorm3d(_BatchNorm):
+    r"""This is the quantized version of :class:`~torch.nn.BatchNorm3d`.
+    """
+
+    _NNI_BN_RELU_MODULE = nni.BNReLU3d
+
+    def __init__(self, num_features, eps=1e-5, momentum=0.1, device=None, dtype=None):
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        super().__init__(num_features, eps, momentum, **factory_kwargs)
+
+    def _get_name(self):
+        return 'QuantizedBatchNorm3d'
+
+    def _check_input_dim(self, input):
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 5:
+            raise ValueError("Input shape must be `(N, C, D, H, W)`!")
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        # disabling this since this is not symbolically traceable
+        # self._check_input_dim(input)
+        return torch.ops.quantized.batch_norm3d(
+            input, self.weight, self.bias, self.running_mean,
+            self.running_var, self.eps, self.scale, self.zero_point)
+
+    @classmethod
+    def from_float(cls, mod):
+        return _BatchNorm.from_float(cls, mod)
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/conv.py b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..0234874aa6b1027a9cf3cfe505fe1fa076c38e68
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/conv.py
@@ -0,0 +1,945 @@
+r"""Quantized convolution modules."""
+
+from typing import Optional, List, TypeVar
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.ao.nn.intrinsic as nni
+import torch.ao.nn.intrinsic.qat as nniqat
+
+from torch._ops import ops
+from torch.nn.common_types import _size_1_t
+from torch.nn.modules.utils import _single, _pair, _triple
+from torch.nn.utils import fuse_conv_bn_weights
+
+from .utils import _quantize_weight, WeightedQuantizedModule
+
+__all__ = ['Conv1d', 'Conv2d', 'Conv3d', 'ConvTranspose1d', 'ConvTranspose2d', 'ConvTranspose3d']
+
+_SUPPORTED_PADDING = {
+    'zeros',
+    'reflect'
+}
+
+
+def _reverse_repeat_padding(padding: List[int]) -> List[int]:
+    _reversed_padding_repeated_twice: List[int] = []
+    N = len(padding)
+    for idx in range(N):
+        for _ in range(2):
+            _reversed_padding_repeated_twice.append(padding[N - idx - 1])
+    return _reversed_padding_repeated_twice
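+
+# For example, _reverse_repeat_padding([1, 2]) returns [2, 2, 1, 1]: the last
+# spatial dimension comes first and each amount is repeated twice, matching the
+# argument layout expected by F.pad.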
+
+
+class _ConvNd(WeightedQuantizedModule):
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, dilation=1, groups=1, bias=True,
+                 padding_mode='zeros', device=None, dtype=None):
+        # All subclasses have this signature - See PR #49702
+        raise NotImplementedError
+
+    def _init(self, in_channels, out_channels, kernel_size, stride,
+              padding, dilation,
+              transposed, output_padding,
+              groups, bias,
+              padding_mode='zeros',
+              device=None,
+              dtype=None) -> None:
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        super().__init__()
+
+        if in_channels % groups != 0:
+            raise ValueError('in_channels must be divisible by groups')
+        if out_channels % groups != 0:
+            raise ValueError('out_channels must be divisible by groups')
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.padding = padding
+        self.dilation = dilation
+        self.transposed = transposed
+        self.output_padding = output_padding
+        self.groups = groups
+        if padding_mode not in _SUPPORTED_PADDING:
+            raise ValueError(f"'padding_mode' {padding_mode} is not supported by quantized convolution")
+        self.padding_mode = padding_mode
+        # Initialize as NCHW. set_weight will internally transpose to NHWC.
+        if self.transposed:
+            weight_shape = [in_channels, out_channels // self.groups]
+        else:
+            weight_shape = [out_channels, in_channels // self.groups]
+        qweight = torch._empty_affine_quantized(
+            weight_shape + list(kernel_size),
+            scale=1, zero_point=0, dtype=torch.qint8,
+            **{k: v for k, v in factory_kwargs.items() if k != 'dtype'})
+        bias_float = (
+            torch.zeros(out_channels, dtype=torch.float,
+                        **{k: v for k, v in factory_kwargs.items() if k != 'dtype'}) if bias else None)
+
+        self.set_weight_bias(qweight, bias_float)
+        self.scale = 1.0
+        self.zero_point = 0
+
+    def set_weight_bias(self, qweight, bias_float):
+        raise NotImplementedError
+
+    def bias(self):
+        raise NotImplementedError
+
+    def _weight_bias(self):
+        raise NotImplementedError
+
+    def extra_repr(self):
+        s = ('{in_channels}, {out_channels}, kernel_size={kernel_size}'
+             ', stride={stride}, scale={scale}, zero_point={zero_point}')
+        if self.padding != (0,) * len(self.padding):
+            s += ', padding={padding}'
+        if self.dilation != (1,) * len(self.dilation):
+            s += ', dilation={dilation}'
+        if self.output_padding != (0,) * len(self.output_padding):
+            s += ', output_padding={output_padding}'
+        if self.groups != 1:
+            s += ', groups={groups}'
+        if self.bias() is None:
+            s += ', bias=False'
+        return s.format(**self.__dict__)
+
+    # ===== Serialization methods =====
+    # The special consideration here is that we have to unpack the weights into
+    # their regular QTensor form for serialization. Packed weights should not
+    # live outside the process in which they were created; rather, they should be
+    # derived from the QTensor weight.
+    #   self
+    #   |--- weight : Tensor
+    #   |--- bias : Tensor
+    #
+    # TODO: maybe change to this when https://github.com/pytorch/pytorch/pull/32958 is landed
+    #   self
+    #   |--- _packed_params : Conv2dPackedParamsBase or Conv3dPackedParamsBase
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        super()._save_to_state_dict(destination, prefix, keep_vars)
+        (w, b) = self._weight_bias()
+        destination[prefix + 'weight'] = w
+        destination[prefix + 'bias'] = b
+        destination[prefix + 'scale'] = torch.tensor(self.scale)
+        destination[prefix + 'zero_point'] = torch.tensor(self.zero_point)
+
+    @torch.jit.export
+    def __getstate__(self):
+        (w, b) = self._weight_bias()
+        return (
+            self.in_channels,
+            self.out_channels,
+            self.kernel_size,
+            self.stride,
+            self.padding,
+            self.dilation,
+            self.transposed,
+            self.output_padding,
+            self.groups,
+            self.padding_mode,
+            w,
+            b,
+            self.scale,
+            self.zero_point,
+            self.training
+        )
+
+    # ===== Deserialization methods =====
+    # Counterpart to the serialization methods, we must pack the serialized
+    # QTensor weight into its packed format for use by the FBGEMM ops.
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        self.set_weight_bias(
+            state_dict[prefix + 'weight'], state_dict[prefix + 'bias'])
+        state_dict.pop(prefix + 'weight')
+        state_dict.pop(prefix + 'bias')
+        self.scale = float(state_dict[prefix + 'scale'])
+        state_dict.pop(prefix + 'scale')
+        self.zero_point = int(state_dict[prefix + 'zero_point'])
+        state_dict.pop(prefix + 'zero_point')
+        super()._load_from_state_dict(
+            state_dict, prefix, local_metadata, False, missing_keys,
+            unexpected_keys, error_msgs)
+
+    @torch.jit.export
+    def __setstate__(self, state):
+        self.in_channels = state[0]
+        self.out_channels = state[1]
+        self.kernel_size = state[2]
+        self.stride = state[3]
+        self.padding = state[4]
+        self.dilation = state[5]
+        self.transposed = state[6]
+        self.output_padding = state[7]
+        self.groups = state[8]
+        self.padding_mode = state[9]
+        self.set_weight_bias(state[10], state[11])
+        self.scale = state[12]
+        self.zero_point = state[13]
+        self.training = state[14]
+
+    def __deepcopy__(self, memo):
+        new_instance = type(self).__new__(type(self))
+        torch.nn.Module.__init__(new_instance)
+        state = self.__getstate__()
+        new_instance.__setstate__(state)
+        return new_instance
+
+    def __copy__(self):
+        return self.__deepcopy__({})
+
+    @classmethod
+    def get_qconv(cls, mod, activation_post_process, weight_post_process=None):
+        r"""Creates a qconv object and returns it.
+        """
+        if weight_post_process is None:
+            weight_post_process = mod.qconfig.weight()
+        weight_post_process(mod.weight)
+        assert weight_post_process.dtype == torch.qint8, \
+            'Weight observer must have a dtype of qint8'
+        qweight = _quantize_weight(mod.weight.float(), weight_post_process)
+        # the __init__ call used is the one from derived classes and not the one from _ConvNd
+        qconv = cls(mod.in_channels, mod.out_channels, mod.kernel_size,
+                    mod.stride, mod.padding, mod.dilation, mod.groups,
+                    mod.bias is not None, mod.padding_mode)
+        qconv.set_weight_bias(qweight, mod.bias)
+        if activation_post_process is None or activation_post_process.dtype == torch.float:
+            return qconv  # dynamic quantization doesn't need scale/zero_point
+        else:
+            act_scale, act_zp = activation_post_process.calculate_qparams()
+            qconv.scale = float(act_scale)
+            qconv.zero_point = int(act_zp)
+            return qconv
+
+    @staticmethod
+    def from_float(cls, mod):
+        if hasattr(mod, "weight_fake_quant"):
+            # assert type(mod) == cls.__QAT_MODULE, " nnq." + cls.__name__ + \
+            # ".from_float only works for " + cls.__QAT_MODULE.__name__
+            if type(mod) == cls._NNIQAT_CONV_BN_MODULE:
+                mod.weight, mod.bias = fuse_conv_bn_weights(
+                    mod.weight, mod.bias, mod.bn.running_mean, mod.bn.running_var,
+                    mod.bn.eps, mod.bn.weight, mod.bn.bias)
+            assert hasattr(mod, "activation_post_process"), \
+                "Input QAT module must have observer attached"
+            weight_post_process = mod.weight_fake_quant
+            activation_post_process = mod.activation_post_process
+        else:
+            assert type(mod) == cls._FLOAT_MODULE, \
+                " nnq." + cls.__name__ + ".from_float only works for " + \
+                cls._FLOAT_MODULE.__name__ + " but got:" + str(type(mod))
+            assert hasattr(mod, "qconfig"), \
+                "Input float module must have qconfig defined."
+            activation_post_process = None if not hasattr(
+                mod, "activation_post_process") else mod.activation_post_process
+            if type(mod) in [cls._NNI_CONV_RELU_MODULE, cls._NNI_CONV_ADD_MODULE, cls._NNI_CONV_ADD_RELU_MODULE]:
+                mod = mod[0]
+            weight_post_process = mod.qconfig.weight()
+        return cls.get_qconv(mod, activation_post_process, weight_post_process)
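+
+    # Typical eager-mode flow that ends up calling from_float (illustrative sketch
+    # only; `float_model` and `calib_batch` are placeholder names and the qconfig
+    # choice depends on the target backend):
+    #
+    #     >>> float_model.qconfig = torch.ao.quantization.get_default_qconfig('fbgemm')
+    #     >>> prepared = torch.ao.quantization.prepare(float_model)
+    #     >>> prepared(calib_batch)  # run observers to collect activation statistics
+    #     >>> quantized = torch.ao.quantization.convert(prepared)  # swaps nn.ConvNd for nnq.ConvNd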
+
+    @classmethod
+    def from_reference(cls, ref_qconv, output_scale, output_zero_point):
+        r"""Create a (fbgemm/qnnpack) quantized module from a reference quantized module
+        Args:
+            ref_qconv (Module): a reference quantized  module, either produced by torch.ao.quantization
+                                utilities or provided by the user
+            output_scale (float): scale for output Tensor
+            output_zero_point (int): zero point for output Tensor
+        """
+        qconv = cls(
+            ref_qconv.in_channels,
+            ref_qconv.out_channels,
+            ref_qconv.kernel_size,  # type: ignore[arg-type]
+            ref_qconv.stride,  # type: ignore[arg-type]
+            ref_qconv.padding,  # type: ignore[arg-type]
+            ref_qconv.dilation,  # type: ignore[arg-type]
+            ref_qconv.groups,
+            ref_qconv.bias is not None,  # type: ignore[arg-type]
+            ref_qconv.padding_mode,
+            device=ref_qconv.weight.device,
+            dtype=ref_qconv.weight.dtype)
+        qweight = ref_qconv.get_quantized_weight()
+        qconv.set_weight_bias(qweight, ref_qconv.bias)
+        qconv.scale = float(output_scale)
+        qconv.zero_point = int(output_zero_point)
+        return qconv
+
+
+class Conv1d(_ConvNd):
+    r"""Applies a 1D convolution over a quantized input signal composed of
+    several quantized input planes.
+
+    For details on input arguments, parameters, and implementation see
+    :class:`~torch.nn.Conv1d`.
+
+    .. note::
+        Only `zeros` and `reflect` are supported for the :attr:`padding_mode` argument.
+
+    .. note::
+        Only `torch.quint8` is supported for the input data type.
+
+
+    Attributes:
+        weight (Tensor):     packed tensor derived from the learnable weight
+                             parameter.
+        scale (Tensor):      scalar for the output scale
+        zero_point (Tensor): scalar for the output zero point
+
+    See :class:`~torch.nn.Conv1d` for other attributes.
+
+    Examples::
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
+        >>> m = nn.quantized.Conv1d(16, 33, 3, stride=2)
+        >>> input = torch.randn(20, 16, 100)
+        >>> # quantize input to quint8
+        >>> # xdoctest: +SKIP
+        >>> q_input = torch.quantize_per_tensor(input, scale=1.0, zero_point=0,
+        ...                                     dtype=torch.quint8)
+        >>> output = m(q_input)
+
+    """
+
+    _FLOAT_MODULE = nn.Conv1d
+    _NNIQAT_CONV_BN_MODULE = nniqat.ConvBn1d
+    _NNI_CONV_RELU_MODULE = nni.ConvReLU1d
+    _NNI_CONV_ADD_MODULE: None = None
+    _NNI_CONV_ADD_RELU_MODULE: None = None
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: _size_1_t,
+                 stride: _size_1_t = 1,
+                 padding: _size_1_t = 0,
+                 dilation: _size_1_t = 1,
+                 groups: int = 1,
+                 bias: bool = True,
+                 padding_mode: str = 'zeros',
+                 device=None,
+                 dtype=None):
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        kernel_size = _single(kernel_size)
+        stride = _single(stride)
+        padding = padding if isinstance(padding, str) else _single(padding)
+        dilation = _single(dilation)
+
+        # Subclasses of _ConvNd need to call _init rather than __init__. See
+        # discussion on PR #49702
+        super()._init(
+            in_channels, out_channels, kernel_size, stride, padding, dilation,
+            False, _single(0), groups, bias, padding_mode, **factory_kwargs)
+
+    def _get_name(self):
+        return 'QuantizedConv1d'
+
+    def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None:
+        if self.padding_mode == 'zeros':
+            self._packed_params = torch.ops.quantized.conv1d_prepack(
+                w, b, self.stride, self.padding, self.dilation, self.groups)
+        else:
+            self._packed_params = torch.ops.quantized.conv1d_prepack(
+                w, b, self.stride, _pair(0), self.dilation,
+                self.groups)
+
+    def _weight_bias(self):
+        w, b = torch.ops.quantized.conv1d_unpack(self._packed_params)
+        return w, b
+
+    def weight(self):
+        return self._weight_bias()[0]
+
+    def bias(self):
+        return self._weight_bias()[1]
+
+    def forward(self, input):
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 3:
+            raise ValueError("Input shape must be `(N, C, L)`!")
+        if self.padding_mode != 'zeros':
+            # Padding in Conv1d is stored as (p, p), need to get (p,)
+            _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding[:1])
+            input = F.pad(input, _reversed_padding_repeated_twice,
+                          mode=self.padding_mode)
+        return ops.quantized.conv1d(input, self._packed_params, self.scale, self.zero_point)
+
+    @classmethod
+    def from_float(cls, mod):
+        r"""Creates a quantized module from a float module or qparams_dict.
+
+        Args:
+            mod (Module): a float module, either produced by torch.ao.quantization
+              utilities or provided by the user
+        """
+        return _ConvNd.from_float(cls, mod)
+
+
+class Conv2d(_ConvNd):
+    r"""Applies a 2D convolution over a quantized input signal composed of
+    several quantized input planes.
+
+    For details on input arguments, parameters, and implementation see
+    :class:`~torch.nn.Conv2d`.
+
+    .. note::
+        Only `zeros` and `reflect` are supported for the :attr:`padding_mode` argument.
+
+    .. note::
+        Only `torch.quint8` is supported for the input data type.
+
+
+    Attributes:
+        weight (Tensor):     packed tensor derived from the learnable weight
+                             parameter.
+        scale (Tensor):      scalar for the output scale
+        zero_point (Tensor): scalar for the output zero point
+
+    See :class:`~torch.nn.Conv2d` for other attributes.
+
+    Examples::
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
+        >>> # With square kernels and equal stride
+        >>> m = nn.quantized.Conv2d(16, 33, 3, stride=2)
+        >>> # non-square kernels and unequal stride and with padding
+        >>> m = nn.quantized.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2))
+        >>> # non-square kernels and unequal stride and with padding and dilation
+        >>> m = nn.quantized.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1))
+        >>> input = torch.randn(20, 16, 50, 100)
+        >>> # quantize input to quint8
+        >>> # xdoctest: +SKIP
+        >>> q_input = torch.quantize_per_tensor(input, scale=1.0, zero_point=0, dtype=torch.quint8)
+        >>> output = m(q_input)
+
+    """
+    _FLOAT_MODULE = nn.Conv2d
+    _NNIQAT_CONV_BN_MODULE = nniqat.ConvBn2d
+    _NNI_CONV_RELU_MODULE = nni.ConvReLU2d
+    _NNI_CONV_ADD_MODULE = nni.ConvAdd2d
+    _NNI_CONV_ADD_RELU_MODULE = nni.ConvAddReLU2d
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, dilation=1, groups=1, bias=True,
+                 padding_mode='zeros', device=None, dtype=None):
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        kernel_size = _pair(kernel_size)
+        stride = _pair(stride)
+        padding = _pair(padding)
+        dilation = _pair(dilation)
+        # Subclasses of _ConvNd need to call _init rather than __init__. See
+        # discussion on PR #49702
+        super()._init(
+            in_channels, out_channels, kernel_size, stride, padding, dilation,
+            False, _pair(0), groups, bias, padding_mode, **factory_kwargs)
+
+    def _get_name(self):
+        return 'QuantizedConv2d'
+
+    def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None:
+        if self.padding_mode == 'zeros':
+            self._packed_params = torch.ops.quantized.conv2d_prepack(
+                w, b, self.stride, self.padding, self.dilation, self.groups)
+        else:
+            self._packed_params = torch.ops.quantized.conv2d_prepack(
+                w, b, self.stride, _pair(0), self.dilation, self.groups)
+
+    def _weight_bias(self):
+        return self._packed_params.unpack()
+
+    def weight(self):
+        return self._weight_bias()[0]
+
+    def bias(self):
+        return self._weight_bias()[1]
+
+    def forward(self, input):
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 4:
+            raise ValueError("Input shape must be `(N, C, H, W)`!")
+        if self.padding_mode != 'zeros':
+            _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding)
+            input = F.pad(input, _reversed_padding_repeated_twice,
+                          mode=self.padding_mode)
+        return ops.quantized.conv2d(
+            input, self._packed_params, self.scale, self.zero_point)
+
+    @classmethod
+    def from_float(cls, mod):
+        r"""Creates a quantized module from a float module or qparams_dict.
+
+        Args:
+            mod (Module): a float module, either produced by torch.ao.quantization
+              utilities or provided by the user
+        """
+        return _ConvNd.from_float(cls, mod)
+
+
+class Conv3d(_ConvNd):
+    r"""Applies a 3D convolution over a quantized input signal composed of
+    several quantized input planes.
+
+    For details on input arguments, parameters, and implementation see
+    :class:`~torch.nn.Conv3d`.
+
+    .. note::
+        Only `zeros` is supported for the :attr:`padding_mode` argument.
+
+    .. note::
+        Only `torch.quint8` is supported for the input data type.
+
+
+    Attributes:
+        weight (Tensor):     packed tensor derived from the learnable weight
+                             parameter.
+        scale (Tensor):      scalar for the output scale
+        zero_point (Tensor): scalar for the output zero point
+
+    See :class:`~torch.nn.Conv3d` for other attributes.
+
+    Examples::
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
+        >>> # With square kernels and equal stride
+        >>> m = nn.quantized.Conv3d(16, 33, 3, stride=2)
+        >>> # non-square kernels and unequal stride and with padding
+        >>> m = nn.quantized.Conv3d(16, 33, (3, 5, 5), stride=(1, 2, 2), padding=(1, 2, 2))
+        >>> # non-square kernels and unequal stride and with padding and dilation
+        >>> m = nn.quantized.Conv3d(16, 33, (3, 5, 5), stride=(1, 2, 2), padding=(1, 2, 2), dilation=(1, 2, 2))
+        >>> input = torch.randn(20, 16, 56, 56, 56)
+        >>> # quantize input to quint8
+        >>> # xdoctest: +SKIP
+        >>> q_input = torch.quantize_per_tensor(input, scale=1.0, zero_point=0, dtype=torch.quint8)
+        >>> output = m(q_input)
+
+    """
+    _FLOAT_MODULE = nn.Conv3d
+    _NNIQAT_CONV_BN_MODULE = nniqat.ConvBn3d
+    _NNI_CONV_RELU_MODULE = nni.ConvReLU3d
+    _NNI_CONV_ADD_MODULE: None = None
+    _NNI_CONV_ADD_RELU_MODULE: None = None
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, dilation=1, groups=1, bias=True,
+                 padding_mode='zeros', device=None, dtype=None):
+        assert padding_mode != 'reflect', "Conv3d does not support reflection padding"
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        kernel_size = _triple(kernel_size)
+        stride = _triple(stride)
+        padding = _triple(padding)
+        dilation = _triple(dilation)
+        # Subclasses of _ConvNd need to call _init rather than __init__. See
+        # discussion on PR #49702
+        super()._init(
+            in_channels, out_channels, kernel_size, stride, padding, dilation,
+            False, _triple(0), groups, bias, padding_mode, **factory_kwargs)
+
+    def _get_name(self):
+        return 'QuantizedConv3d'
+
+    def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None:
+        if self.padding_mode == 'zeros':
+            self._packed_params = torch.ops.quantized.conv3d_prepack(
+                w, b, self.stride, self.padding, self.dilation, self.groups)
+        else:
+            self._packed_params = torch.ops.quantized.conv3d_prepack(
+                w, b, self.stride, _triple(0), self.dilation, self.groups)
+
+    def _weight_bias(self):
+        return self._packed_params.unpack()
+
+    def weight(self):
+        return self._weight_bias()[0]
+
+    def bias(self):
+        return self._weight_bias()[1]
+
+    def forward(self, input):
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 5:
+            raise ValueError("Input shape must be `(N, C, D, H, W)`!")
+        if self.padding_mode != 'zeros':
+            _reversed_padding_repeated_twice = _reverse_repeat_padding(self.padding)
+            input = F.pad(input, _reversed_padding_repeated_twice,
+                          mode=self.padding_mode)
+        return ops.quantized.conv3d(
+            input, self._packed_params, self.scale, self.zero_point)
+
+    @classmethod
+    def from_float(cls, mod):
+        r"""Creates a quantized module from a float module or qparams_dict.
+
+        Args:
+            mod (Module): a float module, either produced by torch.ao.quantization
+              utilities or provided by the user
+        """
+        return _ConvNd.from_float(cls, mod)
+
+# === Transposed Convolutions ===
+MOD = TypeVar('MOD', bound=nn.modules.conv._ConvNd)
+
+
+class _ConvTransposeNd(_ConvNd):
+
+    _FLOAT_MODULE = MOD
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride,
+                 padding, dilation, transposed, output_padding,
+                 groups, bias, padding_mode, device=None, dtype=None):
+        if padding_mode != 'zeros':
+            raise ValueError(f'Only "zeros" padding mode is supported for {self.__class__.__name__}')
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        # Subclasses of _ConvNd need to call _init rather than __init__. See
+        # discussion on PR #49702
+        super()._init(
+            in_channels, out_channels, kernel_size, stride,
+            padding, dilation, transposed, output_padding,
+            groups, bias, padding_mode, **factory_kwargs)
+
+    def _input_padding(self, kernel_size: List[int], dilation: List[int], padding: List[int]) -> List[int]:
+        res = torch.jit.annotate(List[int], [])
+        for kdx in range(len(kernel_size)):
+            pad = (dilation[kdx] * (kernel_size[kdx] - 1) - padding[kdx])
+            res.append(pad)
+        return res
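+
+    # For example, with kernel_size=[3], dilation=[1] and padding=[1] this
+    # returns [1 * (3 - 1) - 1] == [1].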
+
+    @classmethod
+    def from_float(cls, mod):
+        r"""Creates a quantized module from a float module or qparams_dict.
+        Args:
+            mod (Module): a float module, either produced by torch.ao.quantization
+              utilities or provided by the user
+        """
+        # derived classes override cls._FLOAT_MODULE attribute
+        msg = ' nnq.' + cls.__name__ + '.from_float only works for ' + \
+              cls._FLOAT_MODULE.__name__  # type: ignore[attr-defined]
+        assert type(mod) == cls._FLOAT_MODULE, msg
+        assert hasattr(mod, 'qconfig'), \
+            'Input float module must have qconfig defined.'
+        weight_post_process = mod.qconfig.weight()
+        weight_post_process(mod.weight)
+        assert weight_post_process.dtype == torch.qint8, \
+            'Weight observer must have a dtype of qint8'
+        qweight = _quantize_weight(mod.weight.float(), weight_post_process)
+        # the __init__ call used is the one from derived classes and not the one from _ConvTransposeNd
+        qconv = cls(mod.in_channels, mod.out_channels, mod.kernel_size,  # type: ignore[call-arg]
+                    mod.stride, mod.padding, mod.output_padding, mod.groups,
+                    mod.bias is not None, mod.dilation, mod.padding_mode)
+        qconv.set_weight_bias(qweight, mod.bias)
+        if not hasattr(mod, "activation_post_process") or mod.activation_post_process.dtype == torch.float:
+            return qconv  # dynamic quantization doesn't need scale/zero_point
+        else:
+            act_scale, act_zp = mod.activation_post_process.calculate_qparams()
+            qconv.scale = float(act_scale)
+            qconv.zero_point = int(act_zp)
+            return qconv
+
+    @staticmethod
+    def from_reference(cls, ref_qconvt, output_scale, output_zero_point):
+        r"""Create a (fbgemm/qnnpack) quantized module from a reference quantized module
+        Args:
+            ref_qconvt (Module): a reference quantized  module, either produced by torch.ao.quantization
+                                 utilities or provided by the user
+            output_scale (float): scale for output Tensor
+            output_zero_point (int): zero point for output Tensor
+        """
+        qconv = cls(
+            ref_qconvt.in_channels,
+            ref_qconvt.out_channels,
+            ref_qconvt.kernel_size,  # type: ignore[arg-type]
+            ref_qconvt.stride,  # type: ignore[arg-type]
+            ref_qconvt.padding,  # type: ignore[arg-type]
+            ref_qconvt.output_padding,  # type: ignore[arg-type]
+            ref_qconvt.groups,
+            ref_qconvt.bias is not None,  # type: ignore[arg-type]
+            ref_qconvt.dilation,  # type: ignore[arg-type]
+            ref_qconvt.padding_mode,
+            device=ref_qconvt.weight.device,
+            dtype=ref_qconvt.weight.dtype)
+        qweight = ref_qconvt.get_quantized_weight()
+        qconv.set_weight_bias(qweight, ref_qconvt.bias)
+        qconv.scale = float(output_scale)
+        qconv.zero_point = int(output_zero_point)
+        return qconv
+
+
+class ConvTranspose1d(_ConvTransposeNd):
+    r"""Applies a 1D transposed convolution operator over an input image
+    composed of several input planes.
+    For details on input arguments, parameters, and implementation see
+    :class:`~torch.nn.ConvTranspose1d`.
+
+    .. note:: Currently only the QNNPACK engine is implemented.
+        Please set `torch.backends.quantized.engine = 'qnnpack'`.
+
+    For special notes, please see :class:`~torch.ao.nn.quantized.Conv1d`
+
+    Attributes:
+        weight (Tensor):     packed tensor derived from the learnable weight
+                             parameter.
+        scale (Tensor):      scalar for the output scale
+        zero_point (Tensor): scalar for the output zero point
+    See :class:`~torch.nn.ConvTranspose1d` for other attributes.
+
+    Examples::
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
+        >>> torch.backends.quantized.engine = 'qnnpack'
+        >>> from torch.ao.nn import quantized as nnq
+        >>> # With square kernels and equal stride
+        >>> m = nnq.ConvTranspose1d(16, 33, 3, stride=2)
+        >>> # non-square kernels and unequal stride and with padding
+        >>> m = nnq.ConvTranspose1d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2))
+        >>> input = torch.randn(20, 16, 50)
+        >>> q_input = torch.quantize_per_tensor(input, scale=1.0, zero_point=0, dtype=torch.quint8)
+        >>> output = m(q_input)
+        >>> # exact output size can be also specified as an argument
+        >>> input = torch.randn(1, 16, 12)
+        >>> q_input = torch.quantize_per_tensor(input, scale=1.0, zero_point=0, dtype=torch.quint8)
+        >>> downsample = nnq.Conv1d(16, 16, 3, stride=2, padding=1)
+        >>> upsample = nnq.ConvTranspose1d(16, 16, 3, stride=2, padding=1)
+        >>> h = downsample(q_input)
+        >>> h.size()
+        torch.Size([1, 16, 6])
+        >>> # xdoctest: +SKIP("FIXME: output_size is not a parameter")
+        >>> output = upsample(h, output_size=input.size())
+        >>> output.size()
+        torch.Size([1, 16, 12])
+    """
+
+    _FLOAT_MODULE = nn.ConvTranspose1d
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, output_padding=0, groups=1, bias=True,
+                 dilation=1, padding_mode='zeros', device=None, dtype=None):
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        kernel_size = _single(kernel_size)
+        stride = _single(stride)
+        padding = _single(padding)
+        dilation = _single(dilation)
+        output_padding = _single(output_padding)
+
+        super().__init__(
+            in_channels, out_channels, kernel_size, stride, padding, dilation,
+            True, output_padding, groups, bias, padding_mode, **factory_kwargs)
+
+    def _get_name(self):
+        return 'QuantizedConvTranspose1d'
+
+    def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None:
+        self._packed_params = torch.ops.quantized.conv_transpose1d_prepack(
+            w, b, self.stride, self.padding, self.output_padding, self.dilation,
+            self.groups)
+
+    def _weight_bias(self):
+        w, b = torch.ops.quantized.conv_transpose1d_unpack(self._packed_params)
+        return w, b
+
+    def weight(self):
+        (w, _) = self._weight_bias()
+        return w
+
+    def bias(self):
+        (_, b) = self._weight_bias()
+        return b
+
+    def forward(self, input):
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 3:
+            raise ValueError("Input shape must be `(N, C, L)`!")
+        return torch.ops.quantized.conv_transpose1d(
+            input, self._packed_params, self.scale, self.zero_point)
+
+    @classmethod
+    def from_reference(cls, ref_qconvt, output_scale, output_zero_point):
+        return _ConvTransposeNd.from_reference(cls, ref_qconvt, output_scale, output_zero_point)
+
+
+class ConvTranspose2d(_ConvTransposeNd):
+    r"""Applies a 2D transposed convolution operator over an input image
+    composed of several input planes.
+    For details on input arguments, parameters, and implementation see
+    :class:`~torch.nn.ConvTranspose2d`.
+
+    For special notes, please see :class:`~torch.ao.nn.quantized.Conv2d`
+
+    Attributes:
+        weight (Tensor):     packed tensor derived from the learnable weight
+                             parameter.
+        scale (Tensor):      scalar for the output scale
+        zero_point (Tensor): scalar for the output zero point
+    See :class:`~torch.nn.ConvTranspose2d` for other attributes.
+
+    Examples::
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
+        >>> # QNNPACK or FBGEMM as backend
+        >>> torch.backends.quantized.engine = 'qnnpack'
+        >>> # With square kernels and equal stride
+        >>> import torch.ao.nn.quantized as nnq
+        >>> m = nnq.ConvTranspose2d(16, 33, 3, stride=2)
+        >>> # non-square kernels and unequal stride and with padding
+        >>> m = nnq.ConvTranspose2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2))
+        >>> input = torch.randn(20, 16, 50, 100)
+        >>> q_input = torch.quantize_per_tensor(input, scale=1.0, zero_point=0, dtype=torch.quint8)
+        >>> output = m(q_input)
+        >>> # exact output size can be also specified as an argument
+        >>> input = torch.randn(1, 16, 12, 12)
+        >>> q_input = torch.quantize_per_tensor(input, scale=1.0, zero_point=0, dtype=torch.quint8)
+        >>> downsample = nnq.Conv2d(16, 16, 3, stride=2, padding=1)
+        >>> upsample = nnq.ConvTranspose2d(16, 16, 3, stride=2, padding=1)
+        >>> h = downsample(q_input)
+        >>> h.size()
+        torch.Size([1, 16, 6, 6])
+        >>> # xdoctest: +SKIP("FIXME: output_size is not a parameter")
+        >>> output = upsample(h, output_size=input.size())
+        >>> output.size()
+        torch.Size([1, 16, 12, 12])
+    """
+
+    _FLOAT_MODULE = nn.ConvTranspose2d
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, output_padding=0, groups=1, bias=True,
+                 dilation=1, padding_mode='zeros', device=None, dtype=None):
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        kernel_size = _pair(kernel_size)
+        stride = _pair(stride)
+        padding = _pair(padding)
+        dilation = _pair(dilation)
+        output_padding = _pair(output_padding)
+
+        super().__init__(
+            in_channels, out_channels, kernel_size, stride, padding, dilation,
+            True, output_padding, groups, bias, padding_mode, **factory_kwargs)
+
+    def _get_name(self):
+        return 'QuantizedConvTranspose2d'
+
+    def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None:
+        self._packed_params = torch.ops.quantized.conv_transpose2d_prepack(
+            w, b, self.stride, self.padding, self.output_padding, self.dilation,
+            self.groups)
+
+    def _weight_bias(self):
+        w, b = torch.ops.quantized.conv2d_unpack(self._packed_params)
+        return w, b
+
+    def weight(self):
+        (w, _) = self._weight_bias()
+        return w
+
+    def bias(self):
+        (_, b) = self._weight_bias()
+        return b
+
+    def forward(self, input):
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 4:
+            raise ValueError("Input shape must be `(N, C, H, W)`!")
+        return ops.quantized.conv_transpose2d(
+            input, self._packed_params, self.scale, self.zero_point)
+
+    @classmethod
+    def from_reference(cls, ref_qconvt, output_scale, output_zero_point):
+        return _ConvTransposeNd.from_reference(cls, ref_qconvt, output_scale, output_zero_point)
+
+
+class ConvTranspose3d(_ConvTransposeNd):
+    r"""Applies a 3D transposed convolution operator over an input image
+    composed of several input planes.
+    For details on input arguments, parameters, and implementation see
+    :class:`~torch.nn.ConvTranspose3d`.
+
+    .. note:: Currently only the FBGEMM engine is implemented.
+        Please set `torch.backends.quantized.engine = 'fbgemm'`.
+
+    For special notes, please see :class:`~torch.ao.nn.quantized.Conv3d`
+
+    Attributes:
+        weight (Tensor):     packed tensor derived from the learnable weight
+                             parameter.
+        scale (Tensor):      scalar for the output scale
+        zero_point (Tensor): scalar for the output zero point
+    See :class:`~torch.nn.ConvTranspose3d` for other attributes.
+
+    Examples::
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
+        >>> torch.backends.quantized.engine = 'fbgemm'
+        >>> from torch.ao.nn import quantized as nnq
+        >>> # With cubic kernels and equal stride
+        >>> m = nnq.ConvTranspose3d(16, 33, 3, stride=2)
+        >>> # non-cubic kernels and unequal stride and with padding
+        >>> m = nnq.ConvTranspose3d(16, 33, (3, 3, 5), stride=(2, 1, 1), padding=(4, 2, 2))
+        >>> input = torch.randn(20, 16, 50, 100, 100)
+        >>> q_input = torch.quantize_per_tensor(input, scale=1.0, zero_point=0, dtype=torch.quint8)
+        >>> output = m(q_input)
+        >>> # exact output size can be also specified as an argument
+        >>> input = torch.randn(1, 16, 12, 12, 12)
+        >>> q_input = torch.quantize_per_tensor(input, scale=1.0, zero_point=0, dtype=torch.quint8)
+        >>> downsample = nnq.Conv3d(16, 16, 3, stride=2, padding=1)
+        >>> upsample = nnq.ConvTranspose3d(16, 16, 3, stride=2, padding=1)
+        >>> h = downsample(q_input)
+        >>> h.size()
+        torch.Size([1, 16, 6, 6, 6])
+        >>> # xdoctest: +SKIP("FIXME: output_size is not a parameter")
+        >>> output = upsample(h, output_size=input.size())
+        >>> output.size()
+        torch.Size([1, 16, 12, 12, 12])
+    """
+
+    _FLOAT_MODULE = nn.ConvTranspose3d
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, output_padding=0, groups=1, bias=True,
+                 dilation=1, padding_mode='zeros', device=None, dtype=None):
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        kernel_size = _triple(kernel_size)
+        stride = _triple(stride)
+        padding = _triple(padding)
+        dilation = _triple(dilation)
+        output_padding = _triple(output_padding)
+
+        super().__init__(
+            in_channels, out_channels, kernel_size, stride, padding, dilation,
+            True, output_padding, groups, bias, padding_mode, **factory_kwargs)
+
+    def _get_name(self):
+        return 'QuantizedConvTranspose3d'
+
+    def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None:
+        self._packed_params = torch.ops.quantized.conv_transpose3d_prepack(
+            w, b, self.stride, self.padding, self.output_padding, self.dilation,
+            self.groups)
+
+    def _weight_bias(self):
+        w, b = torch.ops.quantized.conv3d_unpack(self._packed_params)
+        return w, b
+
+    def weight(self):
+        (w, _) = self._weight_bias()
+        return w
+
+    def bias(self):
+        (_, b) = self._weight_bias()
+        return b
+
+    def forward(self, input):
+        # Temporarily using len(shape) instead of ndim due to JIT issue
+        # https://github.com/pytorch/pytorch/issues/23890
+        if len(input.shape) != 5:
+            raise ValueError("Input shape must be `(N, C, T, H, W)`!")
+        return ops.quantized.conv_transpose3d(
+            input, self._packed_params, self.scale, self.zero_point)
+
+    @classmethod
+    def from_reference(cls, ref_qconvt, output_scale, output_zero_point):
+        return _ConvTransposeNd.from_reference(cls, ref_qconvt, output_scale, output_zero_point)
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/dropout.py b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/dropout.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4ace7f68a58326ef635ba29b65c172509bcd5c9
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/dropout.py
@@ -0,0 +1,27 @@
+import torch
+
+__all__ = ['Dropout']
+
+class Dropout(torch.nn.Dropout):
+    r"""This is the quantized equivalent of :class:`~torch.nn.Dropout`.
+        It is a placeholder that lets models which applied dropout to fp32
+        tensors also work with quantized tensors in train and eval mode.
+
+    Args:
+        p: probability of an element to be zeroed
+        inplace: can optionally do the operation in-place. Default: ``False``
+    """
+
+    def forward(self, input):
+        return input
+
+    def _get_name(self):
+        return 'QuantizedDropout'
+
+    @classmethod
+    def from_float(cls, mod):
+        return cls(mod.p, mod.inplace)
+
+    @classmethod
+    def from_reference(cls, mod, scale, zero_point):
+        return cls(mod.p, mod.inplace)
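+
+# A minimal usage sketch (illustrative only): the quantized Dropout is a pure
+# passthrough, so it returns its quantized input unchanged in both train and eval:
+#
+#     >>> qx = torch.quantize_per_tensor(torch.randn(2, 3), scale=0.1, zero_point=128, dtype=torch.quint8)
+#     >>> m = Dropout(p=0.5)
+#     >>> assert torch.equal(m(qx).dequantize(), qx.dequantize())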
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/embedding_ops.py b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/embedding_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..377007e64c3907677c5ca2804c69bc0a41c2b256
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/embedding_ops.py
@@ -0,0 +1,295 @@
+import torch
+import torch.nn as nn
+from torch import Tensor  # noqa: F401
+from torch._jit_internal import Optional, List  # noqa: F401
+
+from .utils import _hide_packed_params_repr
+from .utils import _quantize_weight
+
+__all__ = ['EmbeddingPackedParams', 'Embedding', 'EmbeddingBag']
+
+class EmbeddingPackedParams(torch.nn.Module):
+    _version = 1
+
+    def __init__(self, num_embeddings, embedding_dim, dtype=torch.quint8):
+        super().__init__()
+        self.dtype = dtype
+        if self.dtype in [torch.quint8, torch.quint4x2]:
+            scales = torch.ones(num_embeddings, dtype=torch.float)
+            zero_points = torch.zeros(num_embeddings, dtype=torch.float)
+            wq = torch._empty_per_channel_affine_quantized([num_embeddings, embedding_dim], scales=scales,
+                                                           zero_points=zero_points,
+                                                           axis=0, dtype=self.dtype)
+            self.set_weight(wq)
+        else:
+            raise NotImplementedError(f'Unsupported dtype on quantized embedding! Supports quint8 and quint4x2. Got dtype: {dtype}')
+
+    @torch.jit.export
+    def set_weight(self, weight: torch.Tensor) -> None:
+        if self.dtype in [torch.quint8, torch.quint4x2]:
+            self._packed_weight = torch.ops.quantized.embedding_bag_prepack(weight)
+        else:
+            raise NotImplementedError('Unsupported dtype for quantized embedding prepack! Supports quint8 and quint4x2.')
+
+
+    @torch.jit.export
+    def _weight(self):
+        if self.dtype in [torch.quint8, torch.quint4x2]:
+            return torch.ops.quantized.embedding_bag_unpack(self._packed_weight)
+        else:
+            raise NotImplementedError('Unsupported dtype for quantized embedding unpack! Supports quint8 and quint4x2.')
+
+    def forward(self, x):
+        return x
+
+    # Version 1
+    #   self
+    #   |--- _packed_weight : Tensor representing weight of EmbeddingPackedParamsBase
+    #   |--- dtype : torch.dtype
+
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        super()._save_to_state_dict(destination, prefix, keep_vars)
+        destination[prefix + 'dtype'] = self.dtype
+        destination[prefix + '_packed_weight'] = self._weight()
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        self.dtype = state_dict[prefix + 'dtype']
+        state_dict.pop(prefix + 'dtype')
+
+        weight = state_dict[prefix + '_packed_weight']
+        state_dict.pop(prefix + '_packed_weight')
+        self.set_weight(weight)
+
+        super()._load_from_state_dict(state_dict, prefix, local_metadata, False,
+                                      missing_keys, unexpected_keys, error_msgs)
+
+    def __repr__(self):
+        return self._weight().__repr__()
+
+class Embedding(torch.nn.Module):
+    r"""
+    A quantized Embedding module with quantized packed weights as inputs.
+    We adopt the same interface as `torch.nn.Embedding`; please see
+    https://pytorch.org/docs/stable/nn.html#torch.nn.Embedding for documentation.
+
+    Similar to :class:`~torch.nn.Embedding`, attributes will be randomly
+    initialized at module creation time and will be overwritten later
+
+    Attributes:
+        weight (Tensor): the non-learnable quantized weights of the module of
+                         shape :math:`(\text{num\_embeddings}, \text{embedding\_dim})`.
+
+    Examples::
+        >>> m = nn.quantized.Embedding(num_embeddings=10, embedding_dim=12)
+        >>> indices = torch.tensor([9, 6, 5, 7, 8, 8, 9, 2, 8])
+        >>> output = m(indices)
+        >>> print(output.size())
+        torch.Size([9, 12])
+
+    """
+    _version = 1
+
+    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None,
+                 max_norm: Optional[float] = None, norm_type: float = 2., scale_grad_by_freq: bool = False,
+                 sparse: bool = False, _weight: Optional[Tensor] = None, dtype=torch.quint8) -> None:
+        super().__init__()
+        self.num_embeddings = num_embeddings
+        self.embedding_dim = embedding_dim
+        self.dtype = dtype
+
+        if _weight is None:
+            scales = torch.ones(num_embeddings, dtype=torch.float)
+            zero_points = torch.zeros(num_embeddings, dtype=torch.float)
+            qweight = torch._empty_per_channel_affine_quantized([num_embeddings, embedding_dim],
+                                                                scales=scales, zero_points=zero_points,
+                                                                axis=0, dtype=torch.quint8)
+        else:
+            assert list(_weight.shape) == [num_embeddings, embedding_dim], \
+                'Shape of weight does not match num_embeddings and embedding_dim'
+            qweight = _weight
+
+        self._packed_params = EmbeddingPackedParams(num_embeddings, embedding_dim, dtype)
+        self._packed_params.set_weight(qweight)
+
+    def forward(self, indices: Tensor) -> Tensor:
+        if self.dtype == torch.quint4x2:
+            return torch.ops.quantized.embedding_4bit(self._packed_params._packed_weight, indices)
+        else:
+            return torch.ops.quantized.embedding_byte(self._packed_params._packed_weight, indices)
+
+    def _get_name(self):
+        return 'QuantizedEmbedding'
+
+    def __repr__(self):
+        return _hide_packed_params_repr(self, EmbeddingPackedParams)
+
+    def extra_repr(self):
+        extra_repr_str = 'num_embeddings={}, embedding_dim={}, dtype={}, qscheme={}'.format(
+            self.num_embeddings, self.embedding_dim, self._packed_params.dtype, self.weight().qscheme()
+        )
+
+        return extra_repr_str
+
+    def set_weight(self, w: torch.Tensor) -> None:
+        self._packed_params.set_weight(w)
+
+    def weight(self):
+        return self._packed_params._weight()
+
+    @classmethod
+    def from_float(cls, mod):
+        r"""Create a quantized embedding module from a float module
+
+        Args:
+            mod (Module): a float module, either produced by torch.ao.quantization
+                          utilities or provided by the user
+        """
+        if hasattr(mod, 'weight_fake_quant'):
+            assert type(mod) == torch.ao.nn.qat.Embedding, 'nnq.' + cls.__name__ + '.from_float ' + \
+                'with fake quant only works for ' + torch.ao.nn.qat.Embedding.__name__
+            weight_observer = mod.weight_fake_quant
+            activation_post_process = mod.activation_post_process
+        else:
+            assert type(mod) == nn.Embedding, 'nnq.' + cls.__name__ + '.from_float only works for ' + \
+                nn.Embedding.__name__
+            assert hasattr(mod, 'qconfig'), 'Embedding input float module must have qconfig defined'
+            from torch.ao.quantization import float_qparams_weight_only_qconfig
+            if mod.qconfig is not None and mod.qconfig.weight is not None:  # type: ignore[union-attr]
+                weight_observer = mod.qconfig.weight()  # type: ignore[union-attr, operator]
+            else:
+                weight_observer = float_qparams_weight_only_qconfig.weight()
+
+        dtype = weight_observer.dtype
+        is_float_qparams_qconfig = weight_observer.qscheme == torch.per_channel_affine_float_qparams
+        assert is_float_qparams_qconfig, \
+            'Embedding quantization is only supported with float_qparams_weight_only_qconfig.'
+
+        assert dtype == torch.quint8 or dtype == torch.quint4x2, \
+            f'The only supported dtypes for nnq.Embedding are torch.quint8 and torch.quint4x2, got {dtype}'
+
+        # Run the observer to calculate qparams.
+        weight_observer(mod.weight)
+        qweight = _quantize_weight(mod.weight.float(), weight_observer)
+
+        # Create quantized Embedding module and pass in the quantized weight
+        # (dtype is propagated so quint4x2 weights dispatch to the 4-bit op in forward)
+        qembedding = Embedding(mod.num_embeddings, mod.embedding_dim, dtype=dtype)
+        qembedding.set_weight(qweight)
+        return qembedding
+
+    @classmethod
+    def from_reference(cls, ref_embedding):
+        qembedding = cls(
+            ref_embedding.num_embeddings,
+            ref_embedding.embedding_dim,
+            ref_embedding.padding_idx,
+            ref_embedding.max_norm,
+            ref_embedding.norm_type,
+            ref_embedding.scale_grad_by_freq,
+            ref_embedding.sparse,
+            ref_embedding.get_quantized_weight(),
+            ref_embedding.weight_dtype,
+        )
+        return qembedding
+
+class EmbeddingBag(Embedding):
+    r"""
+    A quantized EmbeddingBag module with quantized packed weights as inputs.
+    We adopt the same interface as `torch.nn.EmbeddingBag`; please see
+    https://pytorch.org/docs/stable/nn.html#torch.nn.EmbeddingBag for documentation.
+
+    Similar to :class:`~torch.nn.EmbeddingBag`, attributes will be randomly
+    initialized at module creation time and will be overwritten later.
+
+    Attributes:
+        weight (Tensor): the non-learnable quantized weights of the module of
+                         shape :math:`(\text{num\_embeddings}, \text{embedding\_dim})`.
+
+    Examples::
+        >>> m = nn.quantized.EmbeddingBag(num_embeddings=10, embedding_dim=12, include_last_offset=True, mode='sum')
+        >>> indices = torch.tensor([9, 6, 5, 7, 8, 8, 9, 2, 8, 6, 6, 9, 1, 6, 8, 8, 3, 2, 3, 6, 3, 6, 5, 7, 0, 8, 4, 6, 5, 8, 2, 3])
+        >>> offsets = torch.tensor([0, 19, 20, 28, 28, 32])
+        >>> output = m(indices, offsets)
+        >>> print(output.size())
+        torch.Size([5, 12])
+
+    """
+    _version = 1
+
+    def __init__(self, num_embeddings: int, embedding_dim: int,
+                 max_norm: Optional[float] = None, norm_type: float = 2., scale_grad_by_freq: bool = False,
+                 mode: str = 'sum', sparse: bool = False, _weight: Optional[Tensor] = None,
+                 include_last_offset: bool = False, dtype=torch.quint8) -> None:
+        super().__init__(num_embeddings, embedding_dim, _weight=_weight, dtype=dtype)
+
+        self.mode = mode
+        self.pruned_weights = False
+        self.include_last_offset = include_last_offset
+        self.dtype = dtype
+
+    def forward(self, indices: Tensor, offsets: Optional[Tensor] = None, per_sample_weights: Optional[Tensor] = None,
+                compressed_indices_mapping: Optional[Tensor] = None) -> Tensor:
+        if self.dtype == torch.quint4x2:
+            return torch.ops.quantized.embedding_bag_4bit(self._packed_params._packed_weight, indices, offsets, False, 0,
+                                                          self.pruned_weights, per_sample_weights, compressed_indices_mapping,
+                                                          self.include_last_offset)
+        else:
+            return torch.ops.quantized.embedding_bag_byte(self._packed_params._packed_weight, indices, offsets, False, 0,
+                                                          self.pruned_weights, per_sample_weights, compressed_indices_mapping,
+                                                          self.include_last_offset)
+
+    def _get_name(self):
+        return 'QuantizedEmbeddingBag'
+
+    @classmethod
+    def from_float(cls, mod):
+        r"""Create a quantized embedding_bag module from a float module
+
+        Args:
+            mod (Module): a float module, either produced by torch.ao.quantization
+                          utilities or provided by the user
+        """
+        if hasattr(mod, 'weight_fake_quant'):
+            weight_observer = mod.weight_fake_quant
+        else:
+            assert type(mod) == nn.EmbeddingBag, 'nnq.' + cls.__name__ + '.from_float only works for ' + \
+                nn.EmbeddingBag.__name__
+            assert hasattr(mod, 'qconfig'), 'EmbeddingBag input float module must have qconfig defined'
+            from torch.ao.quantization.qconfig import float_qparams_weight_only_qconfig
+            if mod.qconfig is not None and mod.qconfig.weight is not None:  # type: ignore[union-attr]
+                weight_observer = mod.qconfig.weight()  # type: ignore[union-attr, operator]
+            else:
+                weight_observer = float_qparams_weight_only_qconfig.weight()
+
+        dtype = weight_observer.dtype
+        is_float_qparams_qconfig = weight_observer.qscheme == torch.per_channel_affine_float_qparams
+        assert is_float_qparams_qconfig, \
+            'EmbeddingBag quantization is only supported with float_qparams_weight_only_qconfig.'
+
+        assert dtype == torch.quint8 or dtype == torch.quint4x2, \
+            f'The only supported dtypes for nnq.EmbeddingBag are torch.quint8 and torch.quint4x2, got {dtype}'
+
+        # Run the observer to calculate qparams.
+        weight_observer(mod.weight)
+        qweight = _quantize_weight(mod.weight.float(), weight_observer)
+
+        # Create quantized EmbeddingBag module and pass in the quantized weight
+        qembedding_bag = EmbeddingBag(mod.num_embeddings, mod.embedding_dim, dtype=dtype)
+        qembedding_bag.set_weight(qweight)
+        return qembedding_bag
+
+    @classmethod
+    def from_reference(cls, ref_embedding_bag):
+        qembedding_bag = cls(
+            ref_embedding_bag.num_embeddings,
+            ref_embedding_bag.embedding_dim,
+            ref_embedding_bag.max_norm,
+            ref_embedding_bag.norm_type,
+            ref_embedding_bag.scale_grad_by_freq,
+            ref_embedding_bag.mode,
+            ref_embedding_bag.sparse,
+            ref_embedding_bag.get_quantized_weight(),
+            ref_embedding_bag.include_last_offset,
+            ref_embedding_bag.weight_dtype,
+        )
+        return qembedding_bag
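+
+# A minimal usage sketch for ``EmbeddingBag.from_float`` above, assuming the float
+# module carries ``float_qparams_weight_only_qconfig`` as its qconfig.
+#
+#     >>> import torch
+#     >>> import torch.nn as nn
+#     >>> from torch.ao.quantization import float_qparams_weight_only_qconfig
+#     >>> float_eb = nn.EmbeddingBag(num_embeddings=10, embedding_dim=12, mode='sum')
+#     >>> float_eb.qconfig = float_qparams_weight_only_qconfig
+#     >>> q_eb = EmbeddingBag.from_float(float_eb)   # packs a quint8 weight
+#     >>> out = q_eb(torch.tensor([1, 2, 3]), torch.tensor([0, 2]))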
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/functional_modules.py b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/functional_modules.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7751c2533d5d9201b320ed2c6e4196efa2eafaf
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/functional_modules.py
@@ -0,0 +1,249 @@
+from typing import List
+
+import torch
+from torch import Tensor
+from torch._ops import ops
+
+__all__ = ['FloatFunctional', 'FXFloatFunctional', 'QFunctional']
+
+class FloatFunctional(torch.nn.Module):
+    r"""State collector class for float operations.
+
+    The instance of this class can be used instead of the ``torch.`` prefix for
+    some operations. See example usage below.
+
+    .. note::
+
+        This class does not provide a ``forward`` hook. Instead, you must use
+        one of the underlying functions (e.g. ``add``).
+
+    Examples::
+
+        >>> f_add = FloatFunctional()
+        >>> a = torch.tensor(3.0)
+        >>> b = torch.tensor(4.0)
+        >>> f_add.add(a, b)  # Equivalent to ``torch.add(a, b)``
+
+    Valid operation names:
+        - add
+        - cat
+        - mul
+        - add_relu
+        - add_scalar
+        - mul_scalar
+    """
+    def __init__(self):
+        super().__init__()
+        self.activation_post_process = torch.nn.Identity()
+
+    def forward(self, x):
+        raise RuntimeError("FloatFunctional is not intended to use the " +
+                           "'forward'. Please use the underlying operation")
+
+    r"""Operation equivalent to ``torch.add(Tensor, Tensor)``"""
+    def add(self, x: Tensor, y: Tensor) -> Tensor:
+        r = torch.add(x, y)
+        r = self.activation_post_process(r)
+        return r
+
+    r"""Operation equivalent to ``torch.add(Tensor, float)``"""
+    def add_scalar(self, x: Tensor, y: float) -> Tensor:
+        r = torch.add(x, y)
+        # Note: this operation is not observed because the observation is not
+        # needed for the quantized op.
+        return r
+
+    r"""Operation equivalent to ``torch.mul(Tensor, Tensor)``"""
+    def mul(self, x: Tensor, y: Tensor) -> Tensor:
+        r = torch.mul(x, y)
+        r = self.activation_post_process(r)
+        return r
+
+    r"""Operation equivalent to ``torch.mul(Tensor, float)``"""
+    def mul_scalar(self, x: Tensor, y: float) -> Tensor:
+        r = torch.mul(x, y)
+        # Note: this operation is not observed because the observation is not
+        # needed for the quantized op.
+        return r
+
+    r"""Operation equivalent to ``torch.cat``"""
+    def cat(self, x: List[Tensor], dim: int = 0) -> Tensor:
+        r = torch.cat(x, dim=dim)
+        r = self.activation_post_process(r)
+        return r
+
+    r"""Operation equivalent to ``relu(torch.add(x,y))``"""
+    def add_relu(self, x: Tensor, y: Tensor) -> Tensor:
+        r = torch.add(x, y)
+        r = torch.nn.functional.relu(r)
+        r = self.activation_post_process(r)
+        return r
+
+    r"""Operation equivalent to ``torch.matmul(Tensor, Tensor)``"""
+    def matmul(self, x: Tensor, y: Tensor) -> Tensor:
+        r = torch.matmul(x, y)
+        r = self.activation_post_process(r)
+        return r
+
+class FXFloatFunctional(torch.nn.Module):
+    r""" module to replace FloatFunctional module before FX graph mode quantization,
+    since activation_post_process will be inserted in top level module directly
+
+    Valid operation names:
+        - add
+        - cat
+        - mul
+        - add_relu
+        - add_scalar
+        - mul_scalar
+    """
+    def forward(self, x):
+        raise RuntimeError("FloatFunctional is not intended to use the " +
+                           "'forward'. Please use the underlying operation")
+
+    r"""Operation equivalent to ``torch.add(Tensor, Tensor)``"""
+    def add(self, x: Tensor, y: Tensor) -> Tensor:
+        r = torch.add(x, y)
+        return r
+
+    r"""Operation equivalent to ``torch.add(Tensor, float)``"""
+    def add_scalar(self, x: Tensor, y: float) -> Tensor:
+        r = torch.add(x, y)
+        return r
+
+    r"""Operation equivalent to ``torch.mul(Tensor, Tensor)``"""
+    def mul(self, x: Tensor, y: Tensor) -> Tensor:
+        r = torch.mul(x, y)
+        return r
+
+    r"""Operation equivalent to ``torch.mul(Tensor, float)``"""
+    def mul_scalar(self, x: Tensor, y: float) -> Tensor:
+        r = torch.mul(x, y)
+        return r
+
+    r"""Operation equivalent to ``torch.cat``"""
+    def cat(self, x: List[Tensor], dim: int = 0) -> Tensor:
+        r = torch.cat(x, dim=dim)
+        return r
+
+    r"""Operation equivalent to ``relu(torch.add(x,y))``"""
+    def add_relu(self, x: Tensor, y: Tensor) -> Tensor:
+        r = torch.add(x, y)
+        r = torch.nn.functional.relu(r)
+        return r
+
+    r"""Operation equivalent to ``torch.matmul(Tensor, Tensor)``"""
+    def matmul(self, x: Tensor, y: Tensor) -> Tensor:
+        r = torch.matmul(x, y)
+        return r
+
+class QFunctional(torch.nn.Module):
+    r"""Wrapper class for quantized operations.
+
+    The instance of this class can be used instead of the
+    ``torch.ops.quantized`` prefix. See example usage below.
+
+    .. note::
+
+        This class does not provide a ``forward`` hook. Instead, you must use
+        one of the underlying functions (e.g. ``add``).
+
+    Examples::
+
+        >>> q_add = QFunctional()
+        >>> # xdoctest: +SKIP
+        >>> a = torch.quantize_per_tensor(torch.tensor(3.0), 1.0, 0, torch.qint32)
+        >>> b = torch.quantize_per_tensor(torch.tensor(4.0), 1.0, 0, torch.qint32)
+        >>> q_add.add(a, b)  # Equivalent to ``torch.ops.quantized.add(a, b, 1.0, 0)``
+
+    Valid operation names:
+        - add
+        - cat
+        - mul
+        - add_relu
+        - add_scalar
+        - mul_scalar
+    """
+    def __init__(self):
+        super().__init__()
+        self.scale = 1.0
+        self.zero_point = 0
+        self.activation_post_process = torch.nn.Identity()
+
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        super()._save_to_state_dict(destination, prefix, keep_vars)
+        destination[prefix + 'scale'] = torch.tensor(self.scale)
+        destination[prefix + 'zero_point'] = torch.tensor(self.zero_point)
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+
+        self.scale = float(state_dict.pop(prefix + 'scale'))
+        self.zero_point = int(state_dict.pop(prefix + 'zero_point'))
+        super()._load_from_state_dict(state_dict, prefix, local_metadata, False,
+                                      missing_keys, unexpected_keys, error_msgs)
+
+    def _get_name(self):
+        return 'QFunctional'
+
+    def extra_repr(self):
+        return f'scale={self.scale}, zero_point={self.zero_point}'
+
+    def forward(self, x):
+        raise RuntimeError("Functional is not intended to use the " +
+                           "'forward'. Please use the underlying operation")
+
+    r"""Operation equivalent to ``torch.ops.quantized.add``"""
+    def add(self, x: Tensor, y: Tensor) -> Tensor:
+        r = ops.quantized.add(x, y, scale=self.scale, zero_point=self.zero_point)
+        r = self.activation_post_process(r)
+        return r
+
+    r"""Operation equivalent to ``torch.ops.quantized.add(Tensor, float)``"""
+    def add_scalar(self, x: Tensor, y: float) -> Tensor:
+        r = ops.quantized.add_scalar(x, y)
+        # Note: this operation is not observed because the observation is not
+        # needed for the quantized op.
+        return r
+
+    r"""Operation equivalent to ``torch.ops.quantized.mul(Tensor, Tensor)``"""
+    def mul(self, x: Tensor, y: Tensor) -> Tensor:
+        r = ops.quantized.mul(x, y, scale=self.scale, zero_point=self.zero_point)
+        r = self.activation_post_process(r)
+        return r
+
+    r"""Operation equivalent to ``torch.ops.quantized.mul(Tensor, float)``"""
+    def mul_scalar(self, x: Tensor, y: float) -> Tensor:
+        r = ops.quantized.mul_scalar(x, y)
+        # Note: this operation is not observed because the observation is not
+        # needed for the quantized op.
+        return r
+
+    r"""Operation equivalent to ``torch.ops.quantized.cat``"""
+    def cat(self, x: List[Tensor], dim: int = 0) -> Tensor:
+        r = ops.quantized.cat(x, scale=self.scale, zero_point=self.zero_point, dim=dim)
+        r = self.activation_post_process(r)
+        return r
+
+    r"""Operation equivalent to ``torch.ops.quantized.add_relu``"""
+    def add_relu(self, x: Tensor, y: Tensor) -> Tensor:
+        r = ops.quantized.add_relu(x, y, scale=self.scale, zero_point=self.zero_point)
+        r = self.activation_post_process(r)
+        return r
+
+    r"""Operation equivalent to ``torch.ops.quantized.matmul(Tensor, Tensor)``"""
+    def matmul(self, x: Tensor, y: Tensor) -> Tensor:
+        r = ops.quantized.matmul(x, y, scale=self.scale, zero_point=self.zero_point)
+        # Note: this operation is not observed because the observation is not
+        # needed for the quantized op.
+        return r
+
+    @classmethod
+    def from_float(cls, mod):
+        assert type(mod) == FloatFunctional, \
+            "QFunctional.from_float expects an instance of FloatFunctional"
+        scale, zero_point = mod.activation_post_process.calculate_qparams()  # type: ignore[operator]
+        new_mod = QFunctional()
+        new_mod.scale = float(scale)
+        new_mod.zero_point = int(zero_point)
+        return new_mod
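+
+# A minimal usage sketch for ``QFunctional.from_float`` above, assuming the
+# FloatFunctional has been given a real observer (e.g. by torch.ao.quantization
+# prepare) in place of the default ``nn.Identity`` so ``calculate_qparams()`` exists.
+#
+#     >>> ff = FloatFunctional()
+#     >>> ff.activation_post_process = torch.ao.quantization.MinMaxObserver()
+#     >>> _ = ff.add(torch.randn(4), torch.randn(4))   # calibration pass
+#     >>> qf = QFunctional.from_float(ff)
+#     >>> qf.scale, qf.zero_point                      # populated from the observer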
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/linear.py b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..e84f6da2f68ffde2a5291e16ebb2e12c14cef09b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/linear.py
@@ -0,0 +1,303 @@
+from collections.abc import Iterable
+import torch
+
+import torch.nn as nn
+import torch.ao.nn.intrinsic as nni
+import torch.ao.nn.intrinsic.qat as nniqat
+from torch.nn.utils.fusion import fuse_linear_bn_weights
+from torch.nn.utils.parametrize import type_before_parametrizations
+
+from typing import Optional
+
+from .utils import _quantize_weight, _hide_packed_params_repr, WeightedQuantizedModule
+
+__all__ = ['LinearPackedParams', 'Linear']
+
+
+class LinearPackedParams(torch.nn.Module):
+    _version = 3
+
+    def __init__(self, dtype=torch.qint8):
+        super().__init__()
+        self.dtype = dtype
+        if self.dtype == torch.qint8:
+            wq = torch._empty_affine_quantized([1, 1], scale=1.0, zero_point=0, dtype=torch.qint8)
+        elif self.dtype == torch.float16:
+            wq = torch.zeros([1, 1], dtype=torch.float)
+        self.set_weight_bias(wq, None)  # type: ignore[possibly-undefined]
+
+    @torch.jit.export
+    def set_weight_bias(self, weight: torch.Tensor, bias: Optional[torch.Tensor]) -> None:
+        if self.dtype == torch.qint8:
+            self._packed_params = torch.ops.quantized.linear_prepack(weight, bias)
+        elif self.dtype == torch.float16:
+            self._packed_params = torch.ops.quantized.linear_prepack_fp16(weight, bias)
+        else:
+            raise RuntimeError('Unsupported dtype on dynamic quantized linear!')
+
+
+    @torch.jit.export
+    def _weight_bias(self):
+        if self.dtype == torch.qint8:
+            return torch.ops.quantized.linear_unpack(self._packed_params)
+        elif self.dtype == torch.float16:
+            return torch.ops.quantized.linear_unpack_fp16(self._packed_params)
+        else:
+            raise RuntimeError('Unsupported dtype on dynamic quantized linear!')
+
+    def forward(self, x):
+        return x
+
+    # Version 1
+    #   self
+    #   |--- weight : Tensor
+    #   |--- bias : Tensor
+    #
+    # Version 2
+    #   self
+    #   |--- weight : Tensor
+    #   |--- bias : Tensor
+    #   |--- dtype : torch.dtype
+    #
+    # Version 3
+    #   self
+    #   |--- _packed_params : (Tensor, Tensor) representing (weight, bias)
+    #                         of LinearPackedParams
+    #   |--- dtype : torch.dtype
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        super()._save_to_state_dict(destination, prefix, keep_vars)
+        destination[prefix + 'dtype'] = self.dtype
+        destination[prefix + '_packed_params'] = self._weight_bias()
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        version = local_metadata.get('version', None)
+        if version is None or version < 2:
+            self.dtype = torch.qint8
+        else:
+            self.dtype = state_dict[prefix + 'dtype']
+            state_dict.pop(prefix + 'dtype')
+
+        if version is None or version < 3:
+            self.set_weight_bias(state_dict[prefix + 'weight'], state_dict[prefix + 'bias'])
+            state_dict.pop(prefix + 'weight')
+            state_dict.pop(prefix + 'bias')
+
+        if version == 3:
+            weight, bias = state_dict[prefix + '_packed_params']
+            state_dict.pop(prefix + '_packed_params')
+            self.set_weight_bias(weight, bias)
+
+        super()._load_from_state_dict(state_dict, prefix, local_metadata, False,
+                                      missing_keys, unexpected_keys, error_msgs)
+
+
+    def __repr__(self):
+        return self._weight_bias().__repr__()
+
+
+class Linear(WeightedQuantizedModule):
+    r"""
+    A quantized linear module with quantized tensor as inputs and outputs.
+    We adopt the same interface as `torch.nn.Linear`; please see
+    https://pytorch.org/docs/stable/nn.html#torch.nn.Linear for documentation.
+
+    Similar to :class:`~torch.nn.Linear`, attributes will be randomly
+    initialized at module creation time and will be overwritten later.
+
+    Attributes:
+        weight (Tensor): the non-learnable quantized weights of the module of
+                         shape :math:`(\text{out\_features}, \text{in\_features})`.
+        bias (Tensor): the non-learnable bias of the module of shape :math:`(\text{out\_features})`.
+                If :attr:`bias` is ``True``, the values are initialized to zero.
+        scale: `scale` parameter of output Quantized Tensor, type: double
+        zero_point: `zero_point` parameter for output Quantized Tensor, type: long
+
+    Examples::
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_QENGINE)
+        >>> m = nn.quantized.Linear(20, 30)
+        >>> input = torch.randn(128, 20)
+        >>> # xdoctest: +SKIP
+        >>> input = torch.quantize_per_tensor(input, 1.0, 0, torch.quint8)
+        >>> output = m(input)
+        >>> print(output.size())
+        torch.Size([128, 30])
+    """
+    _version = 3
+    _FLOAT_MODULE = (nn.Linear, nn.modules.linear.NonDynamicallyQuantizableLinear)
+
+    def __init__(self, in_features, out_features, bias_=True,
+                 dtype=torch.qint8):
+        super().__init__()
+        # We don't muck around with buffers or attributes or anything here
+        # to keep the module simple. *Everything* is simply a Python attribute.
+        # Serialization logic is explicitly handled in the serialization and
+        # deserialization methods below.
+        self.in_features = in_features
+        self.out_features = out_features
+        bias = None
+        if bias_:
+            bias = torch.zeros(out_features, dtype=torch.float)
+
+        if dtype == torch.qint8:
+            qweight = torch._empty_affine_quantized(
+                [out_features, in_features], scale=1, zero_point=0, dtype=torch.qint8)
+        elif dtype == torch.float16:
+            qweight = torch.zeros([out_features, in_features], dtype=torch.float)
+        else:
+            raise RuntimeError('Unsupported dtype specified for quantized Linear!')
+
+        self._packed_params = LinearPackedParams(dtype)
+        self._packed_params.set_weight_bias(qweight, bias)
+        self.scale = 1.0
+        self.zero_point = 0
+
+    def _get_name(self):
+        return 'QuantizedLinear'
+
+    def extra_repr(self):
+        return 'in_features={}, out_features={}, scale={}, zero_point={}, qscheme={}'.format(
+            self.in_features, self.out_features, self.scale, self.zero_point, self.weight().qscheme()
+        )
+
+    def __repr__(self):
+        return _hide_packed_params_repr(self, LinearPackedParams)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.ops.quantized.linear(
+            x, self._packed_params._packed_params, self.scale, self.zero_point)
+
+    # ===== Serialization methods =====
+    # The special consideration here is that we have to unpack the weights into their
+    # regular QTensor form for serialization. Packed weights should not live
+    # outside the process in which they were created; rather, they should be derived
+    # from the QTensor weight.
+    #
+    # Version 1
+    #   self
+    #   |--- scale : float
+    #   |--- zero_point : int
+    #   |--- weight : Tensor
+    #   |--- bias : Tensor
+    #
+    # Version 2
+    #   self
+    #   |--- scale : float
+    #   |--- zero_point : int
+    #   |--- _packed_params : Module
+    #        |--- weight : Tensor
+    #        |--- bias : Tensor
+    #
+    # Version 3
+    #   self
+    #   |--- scale : float
+    #   |--- zero_point : int
+    #   |--- _packed_params : Module
+    #        |--- _packed_params : (Tensor, Tensor) representing weight, bias
+    #                              of LinearPackedParams C++ struct
+    #
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        super()._save_to_state_dict(destination, prefix, keep_vars)
+        destination[prefix + 'scale'] = torch.tensor(self.scale)
+        destination[prefix + 'zero_point'] = torch.tensor(self.zero_point)
+
+    # ===== Deserialization methods =====
+    # Counterpart to the serialization methods, we must pack the serialized QTensor
+    # weight into its packed format for use by the FBGEMM ops.
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        self.scale = float(state_dict[prefix + 'scale'])
+        state_dict.pop(prefix + 'scale')
+
+        self.zero_point = int(state_dict[prefix + 'zero_point'])
+        state_dict.pop(prefix + 'zero_point')
+
+        version = local_metadata.get('version', None)
+
+        if version is None or version == 1:
+            # We moved the parameters into a LinearPackedParameters submodule
+            weight = state_dict.pop(prefix + 'weight')
+            bias = state_dict.pop(prefix + 'bias')
+            state_dict.update({prefix + '_packed_params.weight': weight,
+                               prefix + '_packed_params.bias': bias})
+
+        super()._load_from_state_dict(
+            state_dict, prefix, local_metadata, False,
+            missing_keys, unexpected_keys, error_msgs)
+
+    # Function rather than property to make sure that JIT serialization doesn't
+    # register this as an attribute
+    def _weight_bias(self):
+        return self._packed_params._weight_bias()
+
+    def weight(self):
+        return self._weight_bias()[0]
+
+    def bias(self):
+        return self._weight_bias()[1]
+
+    def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None:
+        self._packed_params.set_weight_bias(w, b)
+
+    @classmethod
+    def from_float(cls, mod):
+        r"""Create a quantized module from an observed float module
+
+        Args:
+            mod (Module): a float module, either produced by torch.ao.quantization
+                          utilities or provided by the user
+        """
+        if hasattr(mod, 'weight_fake_quant'):
+            if type_before_parametrizations(mod) == nniqat.LinearBn1d:
+                mod.weight, mod.bias = fuse_linear_bn_weights(
+                    mod.weight, mod.bias, mod.bn.running_mean, mod.bn.running_var,
+                    mod.bn.eps, mod.bn.weight, mod.bn.bias)
+            weight_post_process = mod.weight_fake_quant
+            activation_post_process = mod.activation_post_process
+        else:
+            # This function does not participate in JIT, so it is OK to ignore
+            # the type mismatch in assignment. Also, mypy has an issue with
+            # iterables not being implemented, so we are ignoring those too.
+            if not isinstance(cls._FLOAT_MODULE, Iterable):
+                cls._FLOAT_MODULE = [cls._FLOAT_MODULE]  # type: ignore[assignment]
+            supported_modules = ', '.join([float_mod.__name__ for float_mod in cls._FLOAT_MODULE])  # type: ignore[attr-defined]
+            error_msg = f'nnq.{cls.__name__}.from_float only works for {supported_modules}, but got: {type(mod)}'
+            assert type_before_parametrizations(mod) in cls._FLOAT_MODULE, error_msg  # type: ignore[attr-defined]
+            assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined'
+            activation_post_process = mod.activation_post_process
+            if type_before_parametrizations(mod) == nni.LinearReLU:
+                mod = mod[0]
+            weight_post_process = mod.qconfig.weight()
+        weight_post_process(mod.weight)
+        dtype = weight_post_process.dtype
+        act_scale, act_zp = activation_post_process.calculate_qparams()
+        assert dtype == torch.qint8, 'Weight observer must have dtype torch.qint8'
+        qweight = _quantize_weight(mod.weight.float(), weight_post_process)
+        qlinear = cls(mod.in_features,
+                      mod.out_features,
+                      dtype=dtype)
+        qlinear.set_weight_bias(qweight, mod.bias)
+        qlinear.scale = float(act_scale)
+        qlinear.zero_point = int(act_zp)
+        return qlinear
+
+    @classmethod
+    def from_reference(cls, ref_qlinear, output_scale, output_zero_point):
+        r"""Create a (fbgemm/qnnpack) quantized module from a reference quantized module
+
+        Args:
+            ref_qlinear (Module): a reference quantized linear module, either produced by torch.ao.quantization
+                          utilities or provided by the user
+            output_scale (float): scale for output Tensor
+            output_zero_point (int): zero point for output Tensor
+        """
+        qlinear = cls(
+            ref_qlinear.in_features,
+            ref_qlinear.out_features)
+        qweight = ref_qlinear.get_quantized_weight()
+        qlinear.set_weight_bias(qweight, ref_qlinear.bias)
+
+        qlinear.scale = float(output_scale)
+        qlinear.zero_point = int(output_zero_point)
+        return qlinear
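+
+# A minimal usage sketch for the quantized Linear above, assuming the standard
+# eager-mode flow (observe with ``prepare``, calibrate, then ``convert``, which
+# calls ``Linear.from_float`` under the hood) and an available fbgemm backend.
+#
+#     >>> import torch
+#     >>> from torch.ao.quantization import get_default_qconfig, prepare, convert
+#     >>> model = torch.nn.Sequential(torch.nn.Linear(20, 30))
+#     >>> model.qconfig = get_default_qconfig('fbgemm')
+#     >>> prepared = prepare(model)
+#     >>> _ = prepared(torch.randn(8, 20))          # calibration
+#     >>> quantized = convert(prepared)             # model[0] becomes QuantizedLinear
+#     >>> qx = torch.quantize_per_tensor(torch.randn(8, 20), 0.1, 0, torch.quint8)
+#     >>> y = quantized(qx)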
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/normalization.py b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/normalization.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e8ecb0baa4548a0dd94b05d810cc982ffe900ee
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/normalization.py
@@ -0,0 +1,199 @@
+import torch
+
+__all__ = ['LayerNorm', 'GroupNorm', 'InstanceNorm1d', 'InstanceNorm2d', 'InstanceNorm3d']
+
+class LayerNorm(torch.nn.LayerNorm):
+    r"""This is the quantized version of :class:`~torch.nn.LayerNorm`.
+
+    Additional args:
+        * **scale** - quantization scale of the output, type: double.
+        * **zero_point** - quantization zero point of the output, type: long.
+
+    """
+
+    def __init__(self, normalized_shape, weight, bias, scale, zero_point, eps=1e-5,
+                 elementwise_affine=True, device=None, dtype=None) -> None:
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        super().__init__(normalized_shape, eps=eps, elementwise_affine=elementwise_affine,
+                         **factory_kwargs)
+        self.weight = weight
+        self.bias = bias
+        self.register_buffer('scale', torch.tensor(scale, **factory_kwargs))
+        self.register_buffer('zero_point', torch.tensor(zero_point, **factory_kwargs))
+
+    def forward(self, input):
+        return torch.ops.quantized.layer_norm(
+            input, self.normalized_shape, weight=self.weight, bias=self.bias,
+            eps=self.eps, output_scale=self.scale, output_zero_point=self.zero_point)
+
+    def _get_name(self):
+        return 'QuantizedLayerNorm'
+
+    @classmethod
+    def from_float(cls, mod):
+        scale, zero_point = mod.activation_post_process.calculate_qparams()
+        new_mod = cls(
+            mod.normalized_shape, mod.weight, mod.bias, float(scale),
+            int(zero_point), mod.eps, mod.elementwise_affine)
+        return new_mod
+
+    @classmethod
+    def from_reference(cls, mod, scale, zero_point):
+        return cls(
+            mod.normalized_shape, mod.weight, mod.bias, float(scale),
+            int(zero_point), mod.eps, mod.elementwise_affine)
+
+class GroupNorm(torch.nn.GroupNorm):
+    r"""This is the quantized version of :class:`~torch.nn.GroupNorm`.
+
+    Additional args:
+        * **scale** - quantization scale of the output, type: double.
+        * **zero_point** - quantization zero point of the output, type: long.
+
+    """
+    __constants__ = ['num_groups', 'num_channels', 'eps', 'affine']
+
+    def __init__(self, num_groups, num_channels, weight, bias, scale, zero_point, eps=1e-5,
+                 affine=True, device=None, dtype=None) -> None:
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        super().__init__(num_groups, num_channels, eps, affine, **factory_kwargs)
+        self.weight = weight
+        self.bias = bias
+        self.register_buffer('scale', torch.tensor(scale, **factory_kwargs))
+        self.register_buffer('zero_point', torch.tensor(zero_point, **factory_kwargs))
+
+    def forward(self, input):
+        return torch.ops.quantized.group_norm(
+            input, self.num_groups, self.weight, self.bias, self.eps, self.scale,
+            self.zero_point)
+
+    def _get_name(self):
+        return 'QuantizedGroupNorm'
+
+    @classmethod
+    def from_float(cls, mod):
+        scale, zero_point = mod.activation_post_process.calculate_qparams()
+        new_mod = cls(
+            mod.num_groups, mod.num_channels, mod.weight, mod.bias, float(scale), int(zero_point),
+            mod.eps, mod.affine)
+        return new_mod
+
+class InstanceNorm1d(torch.nn.InstanceNorm1d):
+    r"""This is the quantized version of :class:`~torch.nn.InstanceNorm1d`.
+
+    Additional args:
+        * **scale** - quantization scale of the output, type: double.
+        * **zero_point** - quantization zero point of the output, type: long.
+
+    """
+    def __init__(self, num_features, weight, bias, scale, zero_point,
+                 eps=1e-5, momentum=0.1, affine=False,
+                 track_running_stats=False, device=None, dtype=None) -> None:
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        super().__init__(num_features, eps, momentum, affine, track_running_stats, **factory_kwargs)
+        self.weight = weight
+        self.bias = bias
+        self.register_buffer('scale', torch.tensor(scale, **factory_kwargs))
+        self.register_buffer('zero_point', torch.tensor(zero_point, **factory_kwargs))
+
+    def forward(self, input):
+        return torch.ops.quantized.instance_norm(
+            input, self.weight, self.bias, self.eps, self.scale,
+            self.zero_point)
+
+    def _get_name(self):
+        return 'QuantizedInstanceNorm1d'
+
+    @classmethod
+    def from_float(cls, mod):
+        scale, zero_point = mod.activation_post_process.calculate_qparams()
+        new_mod = cls(
+            mod.num_features, mod.weight, mod.bias, float(scale), int(zero_point),
+            mod.eps, affine=mod.affine)
+        return new_mod
+
+    @classmethod
+    def from_reference(cls, mod, scale, zero_point):
+        return cls(
+            mod.num_features, mod.weight, mod.bias, float(scale), int(zero_point),
+            mod.eps, affine=mod.affine)
+
+class InstanceNorm2d(torch.nn.InstanceNorm2d):
+    r"""This is the quantized version of :class:`~torch.nn.InstanceNorm2d`.
+
+    Additional args:
+        * **scale** - quantization scale of the output, type: double.
+        * **zero_point** - quantization zero point of the output, type: long.
+
+    """
+    def __init__(self, num_features, weight, bias, scale, zero_point,
+                 eps=1e-5, momentum=0.1, affine=False,
+                 track_running_stats=False, device=None, dtype=None) -> None:
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        super().__init__(num_features, eps, momentum, affine, track_running_stats, **factory_kwargs)
+        self.weight = weight
+        self.bias = bias
+        self.register_buffer('scale', torch.tensor(scale, **factory_kwargs))
+        self.register_buffer('zero_point', torch.tensor(zero_point, **factory_kwargs))
+
+    def forward(self, input):
+        return torch.ops.quantized.instance_norm(
+            input, self.weight, self.bias, self.eps, self.scale,
+            self.zero_point)
+
+    def _get_name(self):
+        return 'QuantizedInstanceNorm2d'
+
+    @classmethod
+    def from_float(cls, mod):
+        scale, zero_point = mod.activation_post_process.calculate_qparams()
+        new_mod = cls(
+            mod.num_features, mod.weight, mod.bias, float(scale), int(zero_point),
+            mod.eps, affine=mod.affine)
+        return new_mod
+
+    @classmethod
+    def from_reference(cls, mod, scale, zero_point):
+        return cls(
+            mod.num_features, mod.weight, mod.bias, float(scale), int(zero_point),
+            mod.eps, affine=mod.affine)
+
+class InstanceNorm3d(torch.nn.InstanceNorm3d):
+    r"""This is the quantized version of :class:`~torch.nn.InstanceNorm3d`.
+
+    Additional args:
+        * **scale** - quantization scale of the output, type: double.
+        * **zero_point** - quantization zero point of the output, type: long.
+
+    """
+    def __init__(self, num_features, weight, bias, scale, zero_point,
+                 eps=1e-5, momentum=0.1, affine=False,
+                 track_running_stats=False, device=None, dtype=None) -> None:
+        factory_kwargs = {'device': device, 'dtype': dtype}
+        super().__init__(num_features, eps, momentum, affine, track_running_stats, **factory_kwargs)
+        self.weight = weight
+        self.bias = bias
+        self.register_buffer('scale', torch.tensor(scale, **factory_kwargs))
+        self.register_buffer('zero_point', torch.tensor(zero_point, **factory_kwargs))
+
+    def forward(self, input):
+        return torch.ops.quantized.instance_norm(
+            input, self.weight, self.bias, self.eps, self.scale,
+            self.zero_point)
+
+    def _get_name(self):
+        return 'QuantizedInstanceNorm3d'
+
+    @classmethod
+    def from_float(cls, mod):
+        scale, zero_point = mod.activation_post_process.calculate_qparams()
+        new_mod = cls(
+            mod.num_features, mod.weight, mod.bias, float(scale), int(zero_point),
+            mod.eps, affine=mod.affine)
+        return new_mod
+
+    @classmethod
+    def from_reference(cls, mod, scale, zero_point):
+        return cls(
+            mod.num_features, mod.weight, mod.bias, float(scale), int(zero_point),
+            mod.eps, affine=mod.affine)
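+
+# A minimal usage sketch for the quantized LayerNorm above, assuming direct
+# construction from float affine parameters plus chosen output qparams.
+#
+#     >>> float_ln = torch.nn.LayerNorm(4)
+#     >>> q_ln = LayerNorm(float_ln.normalized_shape, float_ln.weight, float_ln.bias,
+#     ...                  scale=0.1, zero_point=0)
+#     >>> x = torch.quantize_per_tensor(torch.randn(2, 4), 0.05, 0, torch.quint8)
+#     >>> y = q_ln(x)   # quantized output carrying scale=0.1, zero_point=0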
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/rnn.py b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/rnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..fcf10fe4d97ffef46bc38c261cab21f269f98a25
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/rnn.py
@@ -0,0 +1,51 @@
+import torch
+
+__all__ = [
+    "LSTM",
+]
+
+class LSTM(torch.ao.nn.quantizable.LSTM):
+    r"""A quantized long short-term memory (LSTM).
+
+    For the description and the argument types, please, refer to :class:`~torch.nn.LSTM`
+
+    Attributes:
+        layers : instances of the `_LSTMLayer`
+
+    .. note::
+        To access the weights and biases, you need to access them per layer.
+        See examples in :class:`~torch.ao.nn.quantizable.LSTM`
+
+    Examples::
+        >>> # xdoctest: +SKIP
+        >>> custom_module_config = {
+        ...     'float_to_observed_custom_module_class': {
+        ...         nn.LSTM: nn.quantizable.LSTM,
+        ...     },
+        ...     'observed_to_quantized_custom_module_class': {
+        ...         nn.quantizable.LSTM: nn.quantized.LSTM,
+        ...     }
+        ... }
+        >>> tq.prepare(model, prepare_custom_module_class=custom_module_config)
+        >>> tq.convert(model, convert_custom_module_class=custom_module_config)
+    """
+    _FLOAT_MODULE = torch.ao.nn.quantizable.LSTM  # type: ignore[assignment]
+
+    def _get_name(self):
+        return 'QuantizedLSTM'
+
+    @classmethod
+    def from_float(cls, *args, **kwargs):
+        # The whole flow is float -> observed -> quantized
+        # This class does observed -> quantized only
+        raise NotImplementedError("It looks like you are trying to convert a "
+                                  "non-observed LSTM module. Please, see "
+                                  "the examples on quantizable LSTMs.")
+
+    @classmethod
+    def from_observed(cls, other):
+        assert type(other) == cls._FLOAT_MODULE  # type: ignore[has-type]
+        converted = torch.ao.quantization.convert(other, inplace=False,
+                                                  remove_qconfig=True)
+        converted.__class__ = cls
+        return converted
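+
+# A minimal sketch of the observed -> quantized step that ``from_observed`` performs,
+# assuming ``observed`` is an already-calibrated ``torch.ao.nn.quantizable.LSTM``
+# (for example, produced by the ``tq.prepare`` call shown in the docstring above).
+#
+#     >>> quantized_lstm = LSTM.from_observed(observed)
+#     >>> quantized_lstm._get_name()
+#     'QuantizedLSTM'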
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/utils.py b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..a31f792351be933c70d8b77dd7a993ed26663d07
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/quantized/modules/utils.py
@@ -0,0 +1,117 @@
+import abc
+import torch
+import itertools
+import collections
+from torch.nn.modules.module import _addindent
+
+__all__ = [
+    "WeightedQuantizedModule",
+]
+
+class WeightedQuantizedModule(torch.nn.Module, metaclass=abc.ABCMeta):
+    """Wrapper for quantized modules than can be lowered from reference modules."""
+    @classmethod
+    @abc.abstractmethod
+    def from_reference(cls, ref_module, output_scale, output_zero_point):
+        raise NotImplementedError
+
+def _get_weight_observer(observer):
+    # FakeQuantize observer
+    if hasattr(observer, "activation_post_process"):
+        observer = observer.activation_post_process
+    # UniformQuantizationObserverBase observer
+    return observer
+
+def _needs_weight_clamping(observer, dtype):
+    observer = _get_weight_observer(observer)
+    if dtype in [torch.qint8, torch.quint8, torch.qint32]:
+        info = torch.iinfo(dtype)
+        return observer.quant_min > info.min or observer.quant_max < info.max
+    return False
+
+def _clamp_weights(qweight, observer, scale, zp):
+    if not _needs_weight_clamping(observer, qweight.dtype):
+        return qweight
+
+    observer = _get_weight_observer(observer)
+    min_, max_ = observer.quant_min, observer.quant_max
+
+    # Doing this because we can't use torch.ops.quantized.clamp() with a per_channel qscheme yet.
+    qw_int_max = torch.clone(qweight.int_repr()).fill_(max_)
+    qw_int_min = torch.clone(qweight.int_repr()).fill_(min_)
+    qw_int = torch.minimum(torch.maximum(qweight.int_repr(), qw_int_min), qw_int_max)
+
+    if observer.qscheme in [torch.per_tensor_symmetric,
+                            torch.per_tensor_affine]:
+        qweight = torch._make_per_tensor_quantized_tensor(qw_int, scale.item(), zp.item())
+    elif observer.qscheme in [torch.per_channel_symmetric,
+                              torch.per_channel_affine,
+                              torch.per_channel_affine_float_qparams]:
+        qweight = torch._make_per_channel_quantized_tensor(qw_int, scale, zp, axis=observer.ch_axis)
+    else:
+        raise ValueError("Unexpected qscheme " + observer.qscheme)
+    return qweight
+
+def _quantize_weight(float_wt, observer):
+    wt_scale, wt_zp = observer.calculate_qparams()
+    if observer.qscheme in [torch.per_tensor_symmetric, torch.per_tensor_affine]:
+        qweight = torch.quantize_per_tensor(
+            float_wt,
+            float(wt_scale), int(wt_zp), torch.qint8)
+        qweight = _clamp_weights(qweight, observer, wt_scale, wt_zp)
+    elif observer.qscheme in [torch.per_channel_symmetric, torch.per_channel_affine]:
+        wt_axis = observer.ch_axis
+        qweight = torch.quantize_per_channel(
+            float_wt,
+            wt_scale.to(torch.double), wt_zp.to(torch.int64), wt_axis, torch.qint8)
+        qweight = _clamp_weights(qweight, observer, wt_scale, wt_zp)
+    elif observer.qscheme in [torch.per_channel_affine_float_qparams]:
+        qweight = torch.quantize_per_channel(
+            float_wt,
+            wt_scale.to(torch.float), wt_zp.to(torch.float), observer.ch_axis, observer.dtype)
+        qweight = _clamp_weights(qweight, observer, wt_scale, wt_zp)
+    else:
+        raise ValueError("Unexpected qscheme " + observer.qscheme)
+    return qweight
+
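+# A minimal sketch of how ``_quantize_weight`` above is typically driven, assuming a
+# per-channel weight observer from ``torch.ao.quantization.observer``.
+#
+#     >>> from torch.ao.quantization.observer import PerChannelMinMaxObserver
+#     >>> obs = PerChannelMinMaxObserver(dtype=torch.qint8, qscheme=torch.per_channel_symmetric)
+#     >>> w = torch.randn(30, 20)
+#     >>> _ = obs(w)                       # record per-channel min/max
+#     >>> qw = _quantize_weight(w, obs)    # per-channel torch.qint8 tensor
+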
+def _ntuple_from_first(n):
+    """Converts the argument to a tuple of size n
+    with the first element repeated."""
+    def parse(x):
+        while isinstance(x, collections.abc.Sequence):
+            if len(x) == n:
+                break
+            x = x[0]
+        return tuple(itertools.repeat(x, n))
+    return parse
+
+def _hide_packed_params_repr(self, params):
+    # We don't want to show `PackedParams` children, hence the custom
+    # `__repr__`. This is the same as nn.Module.__repr__, except for the check
+    # that skips the `params` module.
+    extra_lines = []
+    extra_repr = self.extra_repr()
+    # empty string will be split into list ['']
+    if extra_repr:
+        extra_lines = extra_repr.split('\n')
+    child_lines = []
+    for key, module in self._modules.items():
+        if isinstance(module, params):
+            continue
+        mod_str = repr(module)
+        mod_str = _addindent(mod_str, 2)
+        child_lines.append('(' + key + '): ' + mod_str)
+    lines = extra_lines + child_lines
+
+    main_str = self._get_name() + '('
+    if lines:
+        # simple one-liner info, which most builtin Modules will use
+        if len(extra_lines) == 1 and not child_lines:
+            main_str += extra_lines[0]
+        else:
+            main_str += '\n  ' + '\n  '.join(lines) + '\n'
+
+    main_str += ')'
+    return main_str
+
+_pair_from_first = _ntuple_from_first(2)
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/__init__.py b/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..db56382dc7634a09159e944b9dbbc234ae1e16b6
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/__init__.py
@@ -0,0 +1,18 @@
+from .modules import *  # noqa: F403
+
+__all__ = [
+    'Linear',
+    'Conv1d',
+    'Conv2d',
+    'Conv3d',
+    'ConvTranspose1d',
+    'ConvTranspose2d',
+    'ConvTranspose3d',
+    'RNNCell',
+    'LSTMCell',
+    'GRUCell',
+    'LSTM',
+    'GRU',
+    'Embedding',
+    'EmbeddingBag',
+]
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0d07bccfe77e6a5286bf3ef719482bed0250ff00
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/__init__.py b/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d2f5206435155cd6074cfbab55ee7a70f6d43fc
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/__init__.py
@@ -0,0 +1,21 @@
+from .linear import Linear
+from .conv import Conv1d, Conv2d, Conv3d, ConvTranspose1d, ConvTranspose2d, ConvTranspose3d
+from .rnn import RNNCell, LSTMCell, GRUCell, LSTM, GRU
+from .sparse import Embedding, EmbeddingBag
+
+__all__ = [
+    'Linear',
+    'Conv1d',
+    'Conv2d',
+    'Conv3d',
+    'ConvTranspose1d',
+    'ConvTranspose2d',
+    'ConvTranspose3d',
+    'RNNCell',
+    'LSTMCell',
+    'GRUCell',
+    'LSTM',
+    'GRU',
+    'Embedding',
+    'EmbeddingBag',
+]
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d1377534f853a6fb8c78733417dd60b4a87ad9ca
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/conv.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/conv.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..84d177f82b9b0f452304f73acce77a7c31516b52
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/conv.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/linear.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/linear.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..698626b0b0603706f7c83c898196aaf8962d02d2
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/linear.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/rnn.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/rnn.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..047155df3cc65dffc936511bc7caf86784a7bd05
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/rnn.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/sparse.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/sparse.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f1beec7cfd6c27fa993417a45ef1fac866108821
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/sparse.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..478ff47be8d56270e06c4bf1994ee60882c8b729
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/__pycache__/utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/conv.py b/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..2bd0dde9d99c6bc6d5bbd0e9bbe8fac9c233fb5f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/conv.py
@@ -0,0 +1,318 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Optional, Dict, Any, List
+from torch.nn.common_types import _size_1_t
+from .utils import ReferenceQuantizedModule
+
+__all__ = ['Conv1d', 'Conv2d', 'Conv3d', 'ConvTranspose1d', 'ConvTranspose2d', 'ConvTranspose3d']
+
+class _ConvNd(torch.nn.modules.conv._ConvNd, ReferenceQuantizedModule):
+    """ A reference version of nn.quantized.Conv2d
+        we will not pack the parameters in this module, since weight packing is an
+        optimization for quantized backends supported in PyTorch (fbgemm/qnnpack),
+        this is useful when user want to use this module in other backends like Glow.
+    """
+    __annotations__ = {"bias": Optional[torch.Tensor]}
+    _IS_REFERENCE = True
+
+    @staticmethod
+    def from_float(cls, float_conv, weight_qparams):
+        qref_conv = cls(
+            float_conv.in_channels,
+            float_conv.out_channels,
+            float_conv.kernel_size,  # type: ignore[arg-type]
+            float_conv.stride,  # type: ignore[arg-type]
+            float_conv.padding,  # type: ignore[arg-type]
+            float_conv.dilation,  # type: ignore[arg-type]
+            float_conv.groups,
+            float_conv.bias is not None,  # type: ignore[arg-type]
+            float_conv.padding_mode,
+            device=float_conv.weight.device,
+            dtype=float_conv.weight.dtype,
+            weight_qparams=weight_qparams)
+        qref_conv.weight = torch.nn.Parameter(float_conv.weight.detach())
+        if float_conv.bias is not None:
+            qref_conv.bias = torch.nn.Parameter(float_conv.bias.detach())
+        return qref_conv
+
+class Conv1d(_ConvNd, nn.Conv1d):
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: _size_1_t,
+                 stride: _size_1_t = 1,
+                 padding: _size_1_t = 0,
+                 dilation: _size_1_t = 1,
+                 groups: int = 1,
+                 bias: bool = True,
+                 padding_mode: str = "zeros",
+                 device=None,
+                 dtype=None,
+                 weight_qparams: Optional[Dict[str, Any]] = None):
+        nn.Conv1d.__init__(
+            self, in_channels, out_channels, kernel_size, stride, padding, dilation,
+            groups, bias, padding_mode, device, dtype)
+        self._init_weight_qparams(weight_qparams, device)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        we have:
+        w(float) -- quant - dequant \
+        x(float) ------------- F.conv1d ---
+
+        In the full model, we will see
+        w(float) -- quant - *dequant \
+        x -- quant --- *dequant --  *F.conv1d --- *quant - dequant
+        and the backend should be able to fuse the ops with `*` into a quantized conv1d
+        """
+        weight_quant_dequant = self.get_weight()
+        result = F.conv1d(
+            x, weight_quant_dequant, self.bias, self.stride,
+            self.padding, self.dilation, self.groups)
+        return result
+
+    def _get_name(self):
+        return "QuantizedConv1d(Reference)"
+
+    @classmethod
+    def from_float(cls, float_conv, weight_qparams):
+        return _ConvNd.from_float(cls, float_conv, weight_qparams)
+
+class Conv2d(_ConvNd, nn.Conv2d):
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, dilation=1, groups=1, bias=True,
+                 padding_mode='zeros',
+                 device=None,
+                 dtype=None,
+                 weight_qparams: Optional[Dict[str, Any]] = None):
+        nn.Conv2d.__init__(
+            self, in_channels, out_channels, kernel_size, stride, padding, dilation,
+            groups, bias, padding_mode, device, dtype)
+        self._init_weight_qparams(weight_qparams, device)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        we have:
+        w(float) -- quant - dequant \
+        x(float) ------------- F.conv2d ---
+
+        In the full model, we will see
+        w(float) -- quant - *dequant \
+        x -- quant --- *dequant --  *F.conv2d --- *quant - dequant
+        and the backend should be able to fuse the ops with `*` into a quantized conv2d
+        """
+        weight_quant_dequant = self.get_weight()
+        result = F.conv2d(
+            x, weight_quant_dequant, self.bias, self.stride,
+            self.padding, self.dilation, self.groups)
+        return result
+
+    def _get_name(self):
+        return "QuantizedConv2d(Reference)"
+
+    @classmethod
+    def from_float(cls, float_conv, weight_qparams):
+        return _ConvNd.from_float(cls, float_conv, weight_qparams)
+
+class Conv3d(_ConvNd, nn.Conv3d):
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, dilation=1, groups=1, bias=True,
+                 padding_mode="zeros",
+                 device=None,
+                 dtype=None,
+                 weight_qparams: Optional[Dict[str, Any]] = None):
+        nn.Conv3d.__init__(
+            self, in_channels, out_channels, kernel_size, stride, padding, dilation,
+            groups, bias, padding_mode, device, dtype)
+        self._init_weight_qparams(weight_qparams, device)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        we have:
+        w(float) -- quant - dequant \
+        x(float) ------------- F.conv3d ---
+
+        In the full model, we will see
+        w(float) -- quant - *dequant \
+        x -- quant --- *dequant --  *F.conv3d --- *quant - dequant
+        and the backend should be able to fuse the ops with `*` into a quantized conv3d
+        """
+        weight_quant_dequant = self.get_weight()
+        result = F.conv3d(
+            x, weight_quant_dequant, self.bias, self.stride,
+            self.padding, self.dilation, self.groups)
+        return result
+
+    def _get_name(self):
+        return "QuantizedConv3d(Reference)"
+
+    @classmethod
+    def from_float(cls, float_conv, weight_qparams):
+        return _ConvNd.from_float(cls, float_conv, weight_qparams)
+
+class _ConvTransposeNd(_ConvNd, torch.nn.modules.conv._ConvTransposeNd):
+    """ A reference version of nn.quantized.ConvTranspose2d
+        we will not pack the parameters in this module, since weight packing is an
+        optimization for quantized backends supported in PyTorch (fbgemm/qnnpack),
+        this is useful when user want to use this module in other backends like Glow.
+    """
+    @staticmethod
+    def from_float(cls, float_conv, weight_qparams):
+        qref_conv = cls(
+            float_conv.in_channels,
+            float_conv.out_channels,
+            float_conv.kernel_size,  # type: ignore[arg-type]
+            float_conv.stride,  # type: ignore[arg-type]
+            float_conv.padding,  # type: ignore[arg-type]
+            float_conv.output_padding,  # type: ignore[arg-type]
+            float_conv.groups,
+            float_conv.bias is not None,  # type: ignore[arg-type]
+            float_conv.dilation,  # type: ignore[arg-type]
+            float_conv.padding_mode,
+            device=float_conv.weight.device,
+            dtype=float_conv.weight.dtype,
+            weight_qparams=weight_qparams)
+        qref_conv.weight = torch.nn.Parameter(float_conv.weight.detach())
+        if float_conv.bias is not None:
+            qref_conv.bias = torch.nn.Parameter(float_conv.bias.detach())
+        return qref_conv
+
+
+class ConvTranspose1d(_ConvTransposeNd, nn.ConvTranspose1d):
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: _size_1_t,
+                 stride: _size_1_t = 1,
+                 padding: _size_1_t = 0,
+                 output_padding: _size_1_t = 0,
+                 groups: int = 1,
+                 bias: bool = True,
+                 dilation: _size_1_t = 1,
+                 padding_mode: str = "zeros",
+                 device=None,
+                 dtype=None,
+                 weight_qparams: Optional[Dict[str, Any]] = None):
+        nn.ConvTranspose1d.__init__(
+            self, in_channels, out_channels, kernel_size, stride, padding, output_padding,
+            groups, bias, dilation, padding_mode, device, dtype)
+        self._init_weight_qparams(weight_qparams, device)
+
+    def forward(self, x: torch.Tensor, output_size: Optional[List[int]] = None) -> torch.Tensor:
+        """
+        we have:
+        w(float) -- quant - dequant \
+        x(float) ------------- F.conv_transpose1d ---
+        In the full model, we will see
+        w(float) -- quant - *dequant \
+        x -- quant --- *dequant --  *F.conv_transpose1d --- *quant - dequant
+        and the backend should be able to fuse the ops with `*` into a quantized conv1d
+        """
+
+        assert isinstance(self.padding, tuple)
+        # One cannot replace List by Tuple or Sequence in "_output_padding" because
+        # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`.
+        output_padding = self._output_padding(
+            x, output_size, self.stride, self.padding, self.kernel_size, self.dilation)  # type: ignore[arg-type]
+
+        weight_quant_dequant = self.get_weight()
+        result = F.conv_transpose1d(
+            x, weight_quant_dequant, self.bias, self.stride,
+            self.padding, output_padding, self.groups, self.dilation)
+        return result
+
+    def _get_name(self):
+        return "QuantizedConvTranspose1d(Reference)"
+
+    @classmethod
+    def from_float(cls, float_conv, weight_qparams):
+        return _ConvTransposeNd.from_float(cls, float_conv, weight_qparams)
+
+class ConvTranspose2d(_ConvTransposeNd, nn.ConvTranspose2d):
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, output_padding=0,
+                 groups=1, bias=True, dilation=1,
+                 padding_mode='zeros',
+                 device=None,
+                 dtype=None,
+                 weight_qparams: Optional[Dict[str, Any]] = None):
+
+        nn.ConvTranspose2d.__init__(
+            self, in_channels, out_channels, kernel_size, stride, padding, output_padding,
+            groups, bias, dilation, padding_mode, device, dtype)
+        self._init_weight_qparams(weight_qparams, device)
+
+    def forward(self, x: torch.Tensor, output_size: Optional[List[int]] = None) -> torch.Tensor:
+        """
+        we have:
+        w(float) -- quant - dequant \
+        x(float) ------------- F.conv_transpose2d ---
+        In the full model, we will see
+        w(float) -- quant - *dequant \
+        x -- quant --- *dequant --  *F.conv_transpose2d --- *quant - dequant
+        and the backend should be able to fuse the ops with `*` into a quantized conv2d
+        """
+        assert isinstance(self.padding, tuple)
+        # One cannot replace List by Tuple or Sequence in "_output_padding" because
+        # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`.
+
+        output_padding = self._output_padding(
+            x, output_size, self.stride, self.padding, self.kernel_size, self.dilation)  # type: ignore[arg-type]
+
+        weight_quant_dequant = self.get_weight()
+        result = F.conv_transpose2d(
+            x, weight_quant_dequant, self.bias, self.stride,
+            self.padding, output_padding, self.groups, self.dilation)
+
+        return result
+
+    def _get_name(self):
+        return "QuantizedConvTranspose2d(Reference)"
+
+    @classmethod
+    def from_float(cls, float_conv, weight_qparams):
+        return _ConvTransposeNd.from_float(cls, float_conv, weight_qparams)
+
+class ConvTranspose3d(_ConvTransposeNd, nn.ConvTranspose3d):
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, output_padding=0,
+                 groups=1, bias=True, dilation=1,
+                 padding_mode="zeros",
+                 device=None,
+                 dtype=None,
+                 weight_qparams: Optional[Dict[str, Any]] = None):
+        nn.ConvTranspose3d.__init__(
+            self, in_channels, out_channels, kernel_size, stride, padding, output_padding,
+            groups, bias, dilation, padding_mode, device, dtype)
+        self._init_weight_qparams(weight_qparams, device)
+
+    def forward(self, x: torch.Tensor, output_size: Optional[List[int]] = None) -> torch.Tensor:
+        """
+        we have:
+        w(float) -- quant - dequant \
+        x(float) ------------- F.conv_transpose3d ---
+        In the full model, we will see
+        w(float) -- quant - *dequant \
+        x -- quant --- *dequant --  *F.conv_transpose3d --- *quant - dequant
+        and the backend should be able to fuse the ops with `*` into a quantized conv3d
+        """
+
+        assert isinstance(self.padding, tuple)
+        # One cannot replace List by Tuple or Sequence in "_output_padding" because
+        # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`.
+        output_padding = self._output_padding(
+            x, output_size, self.stride, self.padding, self.kernel_size, self.dilation)  # type: ignore[arg-type]
+
+        weight_quant_dequant = self.get_weight()
+        result = F.conv_transpose3d(
+            x, weight_quant_dequant, self.bias, self.stride,
+            self.padding, output_padding, self.groups, self.dilation)
+        return result
+
+    def _get_name(self):
+        return "QuantizedConvTranspose3d(Reference)"
+
+    @classmethod
+    def from_float(cls, float_conv, weight_qparams):
+        return _ConvTransposeNd.from_float(cls, float_conv, weight_qparams)
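+
+
+if __name__ == "__main__":
+    # Illustrative usage sketch (not part of the upstream module): build a reference
+    # quantized Conv2d from a float nn.Conv2d. The per-tensor weight qparams below
+    # are arbitrary placeholder values chosen only for demonstration; in the FX
+    # workflow they would normally come from observers.
+    float_conv = torch.nn.Conv2d(3, 8, kernel_size=3)
+    demo_weight_qparams = {
+        "qscheme": torch.per_tensor_affine,
+        "dtype": torch.qint8,
+        "scale": 0.02,
+        "zero_point": 0,
+    }
+    ref_conv = Conv2d.from_float(float_conv, demo_weight_qparams)
+    # The weight is fake-quantized on every forward call; activations stay float.
+    out = ref_conv(torch.randn(1, 3, 32, 32))
+    print(ref_conv._get_name(), tuple(out.shape))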
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/linear.py b/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec5def7ad1ae00c47b68742219504be9cf06eb3a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/linear.py
@@ -0,0 +1,57 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Optional, Dict, Any
+from .utils import ReferenceQuantizedModule
+
+__all__ = ['Linear']
+
+class Linear(nn.Linear, ReferenceQuantizedModule):
+    """ A reference quantized linear module that fits into the FX
+    Graph Mode Quantization workflow
+    activation will be floating point Tensor, we will store floating
+    point weight as well in the module, but in forward we'll quantize
+    and dequantize the weight before running the floating point functional
+    linear operator.
+    """
+    _IS_REFERENCE = True
+
+    def __init__(
+            self,
+            in_features: int,
+            out_features: int,
+            bias_: bool = True,
+            device: Optional[torch.device] = None,
+            dtype: Optional[torch.dtype] = None,
+            weight_qparams: Optional[Dict[str, Any]] = None):
+        super().__init__(in_features, out_features, bias_, device, dtype)
+        self._init_weight_qparams(weight_qparams, device)
+
+    def _get_name(self):
+        return "QuantizedLinear(Reference)"
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        we have:
+        w(float) -- quant - dequant \
+        x(float) ------------- F.linear ---
+
+        In the full model, we will see
+        w(float) -- quant - *dequant \
+        x -- quant --- *dequant --  *F.linear --- *quant - dequant
+        and the backend should be able to fuse the ops with `*` into a quantized linear
+        """
+        weight_quant_dequant = self.get_weight()
+        result = F.linear(x, weight_quant_dequant, self.bias)
+        return result
+
+    @classmethod
+    def from_float(cls, float_linear, weight_qparams):
+        qref_linear = Linear(
+            float_linear.in_features, float_linear.out_features,
+            float_linear.bias is not None, device=float_linear.weight.device,
+            dtype=float_linear.weight.dtype, weight_qparams=weight_qparams)
+        qref_linear.weight = torch.nn.Parameter(float_linear.weight.detach())
+        if float_linear.bias is not None:
+            qref_linear.bias = torch.nn.Parameter(float_linear.bias.detach())
+        return qref_linear
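+
+
+if __name__ == "__main__":
+    # Illustrative usage sketch (not part of the upstream module): convert a float
+    # nn.Linear into the reference quantized Linear. The qparams are arbitrary
+    # placeholder values; in the FX workflow they would normally come from observers.
+    float_linear = nn.Linear(16, 4)
+    demo_weight_qparams = {
+        "qscheme": torch.per_tensor_affine,
+        "dtype": torch.qint8,
+        "scale": 0.05,
+        "zero_point": 0,
+    }
+    ref_linear = Linear.from_float(float_linear, demo_weight_qparams)
+    y = ref_linear(torch.randn(2, 16))
+    print(ref_linear._get_name(), tuple(y.shape))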
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/rnn.py b/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/rnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..98fa588e012178bc6af29529c13c9193a5b75213
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/rnn.py
@@ -0,0 +1,614 @@
+import torch
+import torch.nn as nn
+from torch import Tensor
+from .utils import _quantize_and_dequantize_weight
+from .utils import _quantize_weight
+from typing import Optional, Dict, Any, Tuple
+from torch import _VF
+from torch.nn.utils.rnn import PackedSequence
+
+__all__ = ['RNNCellBase', 'RNNCell', 'LSTMCell', 'GRUCell', 'RNNBase', 'LSTM', 'GRU', 'get_quantized_weight']
+
+def _apply_permutation(tensor: Tensor, permutation: Tensor, dim: int = 1) -> Tensor:
+    return tensor.index_select(dim, permutation)
+
+def _get_weight_and_quantization_params(module, wn):
+    weight = getattr(module, wn)
+    params = [weight]
+    for param_name in [wn + n for n in ["_qscheme", "_dtype", "_scale", "_zero_point", "_axis_int"]]:
+        if hasattr(module, param_name):
+            param = getattr(module, param_name)
+        else:
+            param = None
+        params.append(param)
+    return params
+
+def get_quantized_weight(module, wn):
+    if not hasattr(module, wn):
+        return None
+    params = _get_weight_and_quantization_params(module, wn)
+    weight = _quantize_weight(*params)
+    return weight
+
+def _get_quantize_and_dequantized_weight(module, wn):
+    if not hasattr(module, wn):
+        return None
+    params = _get_weight_and_quantization_params(module, wn)
+    weight = _quantize_and_dequantize_weight(*params)
+    return weight
+
+class RNNCellBase(nn.RNNCellBase):
+    def __init__(self, input_size: int, hidden_size: int, bias: bool, num_chunks: int,
+                 device=None, dtype=None, weight_qparams_dict=None) -> None:
+        super().__init__(input_size, hidden_size, bias, num_chunks, device=device, dtype=dtype)
+        # TODO(jerryzh168): maybe make this arg a required arg
+        if weight_qparams_dict is None:
+            weight_qparams = {
+                "qscheme": torch.per_tensor_affine,
+                "dtype": torch.quint8,
+                "scale": 1.0,
+                "zero_point": 0
+            }
+            weight_qparams_dict = {
+                "weight_ih": weight_qparams,
+                "weight_hh": weight_qparams,
+                "is_decomposed": False,
+            }
+        assert len(weight_qparams_dict) == 3, "Expected length for weight_qparams_dict to be 3 for QuantizedRNNCellBase(Reference)"
+        self._init_weight_qparams_dict(weight_qparams_dict, device)
+
+    def _init_weight_qparams_dict(self, weight_qparams_dict, device):
+        assert weight_qparams_dict is not None
+        self.is_decomposed = weight_qparams_dict["is_decomposed"]
+        for key, weight_qparams in weight_qparams_dict.items():
+            if key == "is_decomposed":
+                continue
+            # TODO: refactor the duplicated code to utils.py
+            weight_qscheme = weight_qparams["qscheme"]
+            weight_dtype = weight_qparams["dtype"]
+            setattr(self, key + "_qscheme", weight_qscheme)
+            setattr(self, key + "_dtype", weight_dtype)
+            assert weight_qscheme in [None, torch.per_tensor_affine, torch.per_channel_affine], \
+                Exception(f"qscheme: {weight_qscheme} is not support in {self._get_name()}")
+            if weight_qscheme is not None:
+                scale = weight_qparams["scale"]
+                scale_tensor = scale.clone().detach() \
+                    if isinstance(scale, torch.Tensor) else \
+                    torch.tensor(scale, dtype=torch.float, device=device)
+                self.register_buffer(key + "_scale", scale_tensor)
+                zp = weight_qparams["zero_point"]
+                zp_tensor = zp.clone().detach() \
+                    if isinstance(zp, torch.Tensor) else \
+                    torch.tensor(zp, dtype=torch.int, device=device)
+                self.register_buffer(key + "_zero_point", zp_tensor)
+                if weight_qscheme == torch.per_channel_affine:
+                    axis = weight_qparams["axis"]
+                    axis_tensor = axis.clone().detach() \
+                        if isinstance(axis, torch.Tensor) else \
+                        torch.tensor(axis, dtype=torch.int, device=device)
+                    self.register_buffer(key + "_axis", axis_tensor)
+                else:
+                    # added for TorchScriptability, not used
+                    self.register_buffer(
+                        key + "_axis", torch.tensor(0, dtype=torch.int, device=device))
+                setattr(self, key + "_axis_int", getattr(self, key + "_axis").item())
+
+    def _get_name(self):
+        return "QuantizedRNNCellBase(Reference)"
+
+    def get_quantized_weight_ih(self):
+        return get_quantized_weight(self, "weight_ih")
+
+    def get_quantized_weight_hh(self):
+        return get_quantized_weight(self, "weight_hh")
+
+    def get_weight_ih(self):
+        return _get_quantize_and_dequantized_weight(self, "weight_ih")
+
+    def get_weight_hh(self):
+        return _get_quantize_and_dequantized_weight(self, "weight_hh")
+
+class RNNCell(RNNCellBase):
+    """
+    We store weight_qparams for all the weights (weight_ih and weight_hh);
+    callers need to pass in a `weight_qparams_dict` that maps from a weight name,
+    e.g. weight_ih, to the weight_qparams for that weight.
+    """
+    def __init__(self, input_size: int, hidden_size: int, bias: bool = True, nonlinearity: str = "tanh",
+                 device=None, dtype=None, weight_qparams_dict: Optional[Dict[str, Any]] = None) -> None:
+        factory_kwargs = {'device': device, 'dtype': dtype, 'weight_qparams_dict': weight_qparams_dict}
+        super().__init__(input_size, hidden_size, bias, num_chunks=1, **factory_kwargs)
+        self.nonlinearity = nonlinearity
+
+    def _get_name(self):
+        return "QuantizedRNNCell(Reference)"
+
+    # TODO: refactor nn.RNNCell to have a _forward that takes weight_ih and weight_hh as input
+    # and remove duplicated code, same for the other two Cell modules
+    def forward(self, input: Tensor, hx: Optional[Tensor] = None) -> Tensor:
+        assert input.dim() in (1, 2), \
+            f"RNNCell: Expected input to be 1-D or 2-D but received {input.dim()}-D tensor"
+        is_batched = input.dim() == 2
+        if not is_batched:
+            input = input.unsqueeze(0)
+
+        if hx is None:
+            hx = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device)
+        else:
+            hx = hx.unsqueeze(0) if not is_batched else hx
+
+        if self.nonlinearity == "tanh":
+            ret = _VF.rnn_tanh_cell(
+                input, hx,
+                self.get_weight_ih(), self.get_weight_hh(),
+                self.bias_ih, self.bias_hh,
+            )
+        elif self.nonlinearity == "relu":
+            ret = _VF.rnn_relu_cell(
+                input, hx,
+                self.get_weight_ih(), self.get_weight_hh(),
+                self.bias_ih, self.bias_hh,
+            )
+        else:
+            ret = input  # TODO: remove when jit supports exception flow
+            raise RuntimeError(
+                f"Unknown nonlinearity: {self.nonlinearity}")
+
+        if not is_batched:
+            ret = ret.squeeze(0)
+
+        return ret
+
+    @classmethod
+    def from_float(cls, mod, weight_qparams_dict):
+        ref_mod = cls(
+            mod.input_size,
+            mod.hidden_size,
+            mod.bias,
+            mod.nonlinearity,
+            mod.weight_ih.device,
+            mod.weight_ih.dtype,
+            weight_qparams_dict)
+        ref_mod.weight_ih = mod.weight_ih
+        ref_mod.weight_hh = mod.weight_hh
+        ref_mod.bias_ih = mod.bias_ih
+        ref_mod.bias_hh = mod.bias_hh
+        return ref_mod
+
+class LSTMCell(RNNCellBase):
+    """
+    We store weight_qparams for all the weights (weight_ih and weight_hh);
+    callers need to pass in a `weight_qparams_dict` that maps from a weight name,
+    e.g. weight_ih, to the weight_qparams for that weight.
+    """
+    def __init__(self, input_size: int, hidden_size: int, bias: bool = True,
+                 device=None, dtype=None, weight_qparams_dict: Optional[Dict[str, Any]] = None) -> None:
+        factory_kwargs = {'device': device, 'dtype': dtype, 'weight_qparams_dict': weight_qparams_dict}
+        super().__init__(input_size, hidden_size, bias, num_chunks=4, **factory_kwargs)
+
+    def _get_name(self):
+        return "QuantizedLSTMCell(Reference)"
+
+    def forward(self, input: Tensor, hx: Optional[Tuple[Tensor, Tensor]] = None) -> Tuple[Tensor, Tensor]:
+        assert input.dim() in (1, 2), \
+            f"LSTMCell: Expected input to be 1-D or 2-D but received {input.dim()}-D tensor"
+        is_batched = input.dim() == 2
+        if not is_batched:
+            input = input.unsqueeze(0)
+
+        if hx is None:
+            zeros = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device)
+            hx = (zeros, zeros)
+        else:
+            hx = (hx[0].unsqueeze(0), hx[1].unsqueeze(0)) if not is_batched else hx
+
+        ret = _VF.lstm_cell(
+            input, hx,
+            self.get_weight_ih(), self.get_weight_hh(),
+            self.bias_ih, self.bias_hh,
+        )
+
+        if not is_batched:
+            ret = (ret[0].squeeze(0), ret[1].squeeze(0))
+        return ret
+
+    @classmethod
+    def from_float(cls, mod, weight_qparams_dict):
+        ref_mod = cls(
+            mod.input_size,
+            mod.hidden_size,
+            mod.bias,
+            mod.weight_ih.device,
+            mod.weight_ih.dtype,
+            weight_qparams_dict)
+        ref_mod.weight_ih = mod.weight_ih
+        ref_mod.weight_hh = mod.weight_hh
+        ref_mod.bias_ih = mod.bias_ih
+        ref_mod.bias_hh = mod.bias_hh
+        return ref_mod
+
+class GRUCell(RNNCellBase):
+    """
+    We store weight_qparams for all the weights (weight_ih and weight_hh);
+    callers need to pass in a `weight_qparams_dict` that maps from a weight name,
+    e.g. weight_ih, to the weight_qparams for that weight.
+    """
+    def __init__(self, input_size: int, hidden_size: int, bias: bool = True,
+                 device=None, dtype=None, weight_qparams_dict: Optional[Dict[str, Any]] = None) -> None:
+        factory_kwargs = {'device': device, 'dtype': dtype, 'weight_qparams_dict': weight_qparams_dict}
+        super().__init__(input_size, hidden_size, bias, num_chunks=3, **factory_kwargs)
+
+    def _get_name(self):
+        return "QuantizedGRUCell(Reference)"
+
+    def forward(self, input: Tensor, hx: Optional[Tensor] = None) -> Tensor:
+        assert input.dim() in (1, 2), \
+            f"GRUCell: Expected input to be 1-D or 2-D but received {input.dim()}-D tensor"
+        is_batched = input.dim() == 2
+        if not is_batched:
+            input = input.unsqueeze(0)
+
+        if hx is None:
+            hx = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device)
+        else:
+            hx = hx.unsqueeze(0) if not is_batched else hx
+
+        ret = _VF.gru_cell(
+            input, hx,
+            self.get_weight_ih(), self.get_weight_hh(),
+            self.bias_ih, self.bias_hh,
+        )
+
+        if not is_batched:
+            ret = ret.squeeze(0)
+
+        return ret
+
+    @classmethod
+    def from_float(cls, mod, weight_qparams_dict):
+        ref_mod = cls(
+            mod.input_size,
+            mod.hidden_size,
+            mod.bias,
+            mod.weight_ih.device,
+            mod.weight_ih.dtype,
+            weight_qparams_dict)
+        ref_mod.weight_ih = mod.weight_ih
+        ref_mod.weight_hh = mod.weight_hh
+        ref_mod.bias_ih = mod.bias_ih
+        ref_mod.bias_hh = mod.bias_hh
+        return ref_mod
+
+class RNNBase(nn.RNNBase):
+    def __init__(self, mode: str, input_size: int, hidden_size: int,
+                 num_layers: int = 1, bias: bool = True, batch_first: bool = False,
+                 dropout: float = 0., bidirectional: bool = False, proj_size: int = 0,
+                 device=None, dtype=None,
+                 weight_qparams_dict: Optional[Dict[str, Any]] = None) -> None:
+        super().__init__(
+            mode, input_size, hidden_size, num_layers, bias, batch_first, dropout,
+            bidirectional, proj_size, device, dtype
+        )
+        # TODO(jerryzh168): maybe make this arg a required arg
+        if weight_qparams_dict is None:
+            weight_qparams = {
+                'qscheme': torch.per_tensor_affine,
+                'dtype': torch.quint8,
+                'scale': 1.0,
+                'zero_point': 0
+            }
+            weight_qparams_dict = {"is_decomposed": False}  # type: ignore[dict-item]
+            for wn in self._flat_weights_names:
+                if wn.startswith("weight"):
+                    weight_qparams_dict[wn] = weight_qparams
+        self._init_weight_qparams_dict(weight_qparams_dict, device)
+
+    def _init_weight_qparams_dict(self, weight_qparams_dict, device):
+        self.is_decomposed = weight_qparams_dict["is_decomposed"]
+        for key, weight_qparams in weight_qparams_dict.items():
+            if key == "is_decomposed":
+                continue
+            weight_qscheme = weight_qparams["qscheme"]
+            weight_dtype = weight_qparams["dtype"]
+            setattr(self, key + "_qscheme", weight_qscheme)
+            setattr(self, key + "_dtype", weight_dtype)
+            assert weight_qscheme in [None, torch.per_tensor_affine, torch.per_channel_affine], \
+                Exception(f"qscheme: {weight_qscheme} is not support in {self._get_name()}")
+            if weight_qscheme is not None:
+                self.register_buffer(
+                    key + "_scale",
+                    torch.tensor(weight_qparams["scale"], dtype=torch.float, device=device))
+                self.register_buffer(
+                    key + "_zero_point",
+                    torch.tensor(weight_qparams["zero_point"], dtype=torch.int, device=device))
+                if weight_qscheme == torch.per_channel_affine:
+                    self.register_buffer(
+                        key + "_axis",
+                        torch.tensor(weight_qparams["axis"], dtype=torch.int, device=device))
+                else:
+                    # added for TorchScriptability, not used
+                    self.register_buffer(
+                        key + "_axis", torch.tensor(0, dtype=torch.int, device=device))
+                setattr(self, key + "_axis_int", getattr(self, key + "_axis").item())
+
+class LSTM(RNNBase):
+    """ Reference Quantized LSTM Module
+    We store weight_qparams for all the weights in _flat_weights; callers need to pass in
+    a `weight_qparams_dict` that maps from a weight name, e.g. weight_ih_l0,
+    to the weight_qparams for that weight.
+    """
+    def __init__(self, *args, **kwargs):
+        super().__init__('LSTM', *args, **kwargs)
+
+    # Same as above, see torch/nn/modules/module.py::_forward_unimplemented
+    def permute_hidden(self,  # type: ignore[override]
+                       hx: Tuple[Tensor, Tensor],
+                       permutation: Optional[Tensor]
+                       ) -> Tuple[Tensor, Tensor]:
+        if permutation is None:
+            return hx
+        return _apply_permutation(hx[0], permutation), _apply_permutation(hx[1], permutation)
+
+    def get_expected_cell_size(self, input: Tensor, batch_sizes: Optional[Tensor]) -> Tuple[int, int, int]:
+        if batch_sizes is not None:
+            mini_batch = int(batch_sizes[0])
+        else:
+            mini_batch = input.size(0) if self.batch_first else input.size(1)
+        num_directions = 2 if self.bidirectional else 1
+        expected_hidden_size = (self.num_layers * num_directions,
+                                mini_batch, self.hidden_size)
+        return expected_hidden_size
+
+    # In the future, we should prevent mypy from applying contravariance rules here.
+    # See torch/nn/modules/module.py::_forward_unimplemented
+    def check_forward_args(self,  # type: ignore[override]
+                           input: Tensor,
+                           hidden: Tuple[Tensor, Tensor],
+                           batch_sizes: Optional[Tensor],
+                           ):
+        self.check_input(input, batch_sizes)
+        self.check_hidden_size(hidden[0], self.get_expected_hidden_size(input, batch_sizes),
+                               'Expected hidden[0] size {}, got {}')
+        self.check_hidden_size(hidden[1], self.get_expected_cell_size(input, batch_sizes),
+                               'Expected hidden[1] size {}, got {}')
+
+    def get_quantized_weight_bias_dict(self):
+        """ dictionary from flat_weight_name to quantized weight or (unquantized) bias
+        e.g.
+        {
+          "weight_ih_l0": quantized_weight,
+          "bias_ih_l0": unquantized_bias,
+          ...
+        }
+        """
+        quantized_weight_bias_dict = {}
+        for wn in self._flat_weights_names:
+            if hasattr(self, wn):
+                if wn.startswith("weight"):
+                    weight_or_bias = get_quantized_weight(self, wn)
+                else:
+                    weight_or_bias = getattr(self, wn)
+            else:
+                weight_or_bias = None
+            quantized_weight_bias_dict[wn] = weight_or_bias
+        return quantized_weight_bias_dict
+
+    def get_flat_weights(self):
+        flat_weights = []
+        for wn in self._flat_weights_names:
+            if hasattr(self, wn):
+                weight = getattr(self, wn)
+                if wn.startswith("weight"):
+                    params = _get_weight_and_quantization_params(self, wn)
+                    weight = _quantize_and_dequantize_weight(*params)
+            else:
+                weight = None
+            flat_weights.append(weight)
+        return flat_weights
+
+    def forward(self, input, hx=None):  # noqa: F811
+        orig_input = input
+        # xxx: isinstance check needs to be in conditional for TorchScript to compile
+        batch_sizes = None
+        if isinstance(orig_input, PackedSequence):
+            input, batch_sizes, sorted_indices, unsorted_indices = input
+            max_batch_size = int(batch_sizes[0])
+        else:
+            batch_sizes = None
+            is_batched = input.dim() == 3
+            batch_dim = 0 if self.batch_first else 1
+            if not is_batched:
+                input = input.unsqueeze(batch_dim)
+            max_batch_size = input.size(0) if self.batch_first else input.size(1)
+            sorted_indices = None
+            unsorted_indices = None
+
+        if hx is None:
+            num_directions = 2 if self.bidirectional else 1
+            real_hidden_size = self.proj_size if self.proj_size > 0 else self.hidden_size
+            h_zeros = torch.zeros(self.num_layers * num_directions,
+                                  max_batch_size, real_hidden_size,
+                                  dtype=input.dtype, device=input.device)
+            c_zeros = torch.zeros(self.num_layers * num_directions,
+                                  max_batch_size, self.hidden_size,
+                                  dtype=input.dtype, device=input.device)
+            hx = (h_zeros, c_zeros)
+        else:
+            if batch_sizes is None:  # If not PackedSequence input.
+                if is_batched:  # type: ignore[possibly-undefined]
+                    if (hx[0].dim() != 3 or hx[1].dim() != 3):
+                        msg = ("For batched 3-D input, hx and cx should "
+                               f"also be 3-D but got ({hx[0].dim()}-D, {hx[1].dim()}-D) tensors")
+                        raise RuntimeError(msg)
+                else:
+                    if hx[0].dim() != 2 or hx[1].dim() != 2:
+                        msg = ("For unbatched 2-D input, hx and cx should "
+                               f"also be 2-D but got ({hx[0].dim()}-D, {hx[1].dim()}-D) tensors")
+                        raise RuntimeError(msg)
+                    hx = (hx[0].unsqueeze(1), hx[1].unsqueeze(1))
+
+            # Each batch of the hidden state should match the input sequence that
+            # the user believes he/she is passing in.
+            hx = self.permute_hidden(hx, sorted_indices)
+
+        self.check_forward_args(input, hx, batch_sizes)
+        if batch_sizes is None:
+            result = _VF.lstm(input, hx, self.get_flat_weights(), self.bias, self.num_layers,
+                              self.dropout, self.training, self.bidirectional, self.batch_first)
+        else:
+            result = _VF.lstm(input, batch_sizes, hx, self.get_flat_weights(), self.bias,
+                              self.num_layers, self.dropout, self.training, self.bidirectional)
+        output = result[0]
+        hidden = result[1:]
+        # xxx: isinstance check needs to be in conditional for TorchScript to compile
+        if isinstance(orig_input, PackedSequence):
+            output_packed = PackedSequence(output, batch_sizes, sorted_indices, unsorted_indices)
+            return output_packed, self.permute_hidden(hidden, unsorted_indices)
+        else:
+            if not is_batched:  # type: ignore[possibly-undefined]
+                output = output.squeeze(batch_dim)  # type: ignore[possibly-undefined]
+                hidden = (hidden[0].squeeze(1), hidden[1].squeeze(1))
+            return output, self.permute_hidden(hidden, unsorted_indices)
+
+    def _get_name(self):
+        return "QuantizedLSTM(Reference)"
+
+    @classmethod
+    def from_float(cls, mod, weight_qparams_dict):
+        ref_mod = cls(
+            mod.input_size,
+            mod.hidden_size,
+            mod.num_layers,
+            mod.bias,
+            mod.batch_first,
+            mod.dropout,
+            mod.bidirectional,
+            weight_qparams_dict=weight_qparams_dict)
+        for wn in mod._flat_weights_names:
+            setattr(ref_mod, wn, getattr(mod, wn))
+        return ref_mod
+
+class GRU(RNNBase):
+    """ Reference Quantized GRU Module
+    We store weight_qparams for all the weights in _flat_weights; callers need to pass in
+    a `weight_qparams_dict` that maps from a weight name, e.g. weight_ih_l0,
+    to the weight_qparams for that weight.
+    """
+    def __init__(self, *args, **kwargs):
+        if 'proj_size' in kwargs:
+            raise ValueError("proj_size argument is only supported for LSTM, not RNN or GRU")
+        super().__init__('GRU', *args, **kwargs)
+
+    def get_quantized_weight_bias_dict(self):
+        """ dictionary from flat_weight_name to quantized weight or (unquantized) bias
+        e.g.
+        {
+          "weight_ih_l0": quantized_weight,
+          "bias_ih_l0": unquantized_bias,
+          ...
+        }
+        """
+        quantized_weight_bias_dict = {}
+        for wn in self._flat_weights_names:
+            if hasattr(self, wn):
+                if wn.startswith("weight"):
+                    weight_or_bias = get_quantized_weight(self, wn)
+                else:
+                    weight_or_bias = getattr(self, wn)
+            else:
+                weight_or_bias = None
+            quantized_weight_bias_dict[wn] = weight_or_bias
+        return quantized_weight_bias_dict
+
+    def get_flat_weights(self):
+        flat_weights = []
+        for wn in self._flat_weights_names:
+            if hasattr(self, wn):
+                weight = getattr(self, wn)
+                if wn.startswith("weight"):
+                    params = _get_weight_and_quantization_params(self, wn)
+                    weight = _quantize_and_dequantize_weight(*params)
+            else:
+                weight = None
+            flat_weights.append(weight)
+        return flat_weights
+
+    def forward(self, input, hx=None):  # noqa: F811
+        # Note: this is copied from the forward of GRU in https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/rnn.py;
+        # the only change is that self._flat_weights is replaced by self.get_flat_weights().
+        # TODO: maybe we can inherit from that class and define get_flat_weights
+        # as a @property? That might interfere with TorchScript; if that requirement is
+        # removed in the future we should be able to do this.
+        orig_input = input
+        # xxx: isinstance check needs to be in conditional for TorchScript to compile
+        if isinstance(orig_input, PackedSequence):
+            input, batch_sizes, sorted_indices, unsorted_indices = input
+            max_batch_size = int(batch_sizes[0])
+        else:
+            batch_sizes = None
+            assert (input.dim() in (2, 3)), f"GRU: Expected input to be 2-D or 3-D but received {input.dim()}-D tensor"
+            is_batched = input.dim() == 3
+            batch_dim = 0 if self.batch_first else 1
+            if not is_batched:
+                input = input.unsqueeze(batch_dim)
+                if hx is not None:
+                    if hx.dim() != 2:
+                        raise RuntimeError(
+                            f"For unbatched 2-D input, hx should also be 2-D but got {hx.dim()}-D tensor")
+                    hx = hx.unsqueeze(1)
+            else:
+                if hx is not None and hx.dim() != 3:
+                    raise RuntimeError(
+                        f"For batched 3-D input, hx should also be 3-D but got {hx.dim()}-D tensor")
+            max_batch_size = input.size(0) if self.batch_first else input.size(1)
+            sorted_indices = None
+            unsorted_indices = None
+
+        if hx is None:
+            num_directions = 2 if self.bidirectional else 1
+            hx = torch.zeros(self.num_layers * num_directions,
+                             max_batch_size, self.hidden_size,
+                             dtype=input.dtype, device=input.device)
+        else:
+            # Each batch of the hidden state should match the input sequence that
+            # the user believes he/she is passing in.
+            hx = self.permute_hidden(hx, sorted_indices)
+
+        self.check_forward_args(input, hx, batch_sizes)
+        if batch_sizes is None:
+            result = _VF.gru(input, hx, self.get_flat_weights(), self.bias, self.num_layers,
+                             self.dropout, self.training, self.bidirectional, self.batch_first)
+        else:
+            result = _VF.gru(input, batch_sizes, hx, self.get_flat_weights(), self.bias,
+                             self.num_layers, self.dropout, self.training, self.bidirectional)
+        output = result[0]
+        hidden = result[1]
+
+        # xxx: isinstance check needs to be in conditional for TorchScript to compile
+        if isinstance(orig_input, PackedSequence):
+            output_packed = PackedSequence(output, batch_sizes, sorted_indices, unsorted_indices)
+            return output_packed, self.permute_hidden(hidden, unsorted_indices)
+        else:
+            if not is_batched:  # type: ignore[possibly-undefined]
+                output = output.squeeze(batch_dim)  # type: ignore[possibly-undefined]
+                hidden = hidden.squeeze(1)
+
+            return output, self.permute_hidden(hidden, unsorted_indices)
+
+    def _get_name(self):
+        return "QuantizedGRU(Reference)"
+
+    @classmethod
+    def from_float(cls, mod, weight_qparams_dict):
+        ref_mod = cls(
+            mod.input_size,
+            mod.hidden_size,
+            mod.num_layers,
+            mod.bias,
+            mod.batch_first,
+            mod.dropout,
+            mod.bidirectional,
+            weight_qparams_dict=weight_qparams_dict)
+        for wn in mod._flat_weights_names:
+            setattr(ref_mod, wn, getattr(mod, wn))
+        return ref_mod
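+
+
+if __name__ == "__main__":
+    # Illustrative usage sketch (not part of the upstream module): wrap a float
+    # nn.LSTM as a reference quantized LSTM. Passing weight_qparams_dict=None lets
+    # RNNBase fill in default per-tensor affine qparams for every flat weight.
+    float_lstm = nn.LSTM(input_size=8, hidden_size=16, num_layers=1, batch_first=True)
+    ref_lstm = LSTM.from_float(float_lstm, weight_qparams_dict=None)
+    seq = torch.randn(2, 5, 8)  # (batch, seq_len, input_size)
+    output, (h_n, c_n) = ref_lstm(seq)
+    print(ref_lstm._get_name(), tuple(output.shape), tuple(h_n.shape))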
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/sparse.py b/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/sparse.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2d6141d9de6321f89ac691404c060c821cd7719
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/sparse.py
@@ -0,0 +1,94 @@
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+from .utils import ReferenceQuantizedModule
+from typing import Optional, Dict, Any
+
+__all__ = ['Embedding', 'EmbeddingBag']
+
+class Embedding(nn.Embedding, ReferenceQuantizedModule):
+    """ A reference quantized Embedding module that fits into the
+    FX Graph Mode Quantization workflow, activation will be floating point Tensor,
+    we will store floating point weight as well in the module, but in forward we'll
+    quantize and dequantize the weight before running the floating point functional
+    embedding operator.
+    """
+    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None,
+                 max_norm: Optional[float] = None, norm_type: float = 2., scale_grad_by_freq: bool = False,
+                 sparse: bool = False, _weight: Optional[Tensor] = None,
+                 device=None, dtype=None,
+                 weight_qparams: Optional[Dict[str, Any]] = None) -> None:
+        super().__init__(num_embeddings, embedding_dim, padding_idx, max_norm,
+                         norm_type, scale_grad_by_freq, sparse, _weight, device, dtype)
+        self._init_weight_qparams(weight_qparams, device)
+
+    def _get_name(self):
+        return "QuantizedEmbedding(Reference)"
+
+    def forward(self, input: Tensor) -> Tensor:
+        weight_quant_dequant = self.get_weight()
+        return F.embedding(
+            input, weight_quant_dequant, self.padding_idx, self.max_norm,
+            self.norm_type, self.scale_grad_by_freq, self.sparse)
+
+    @classmethod
+    def from_float(cls, mod, weight_qparams):
+        return cls(
+            mod.num_embeddings,
+            mod.embedding_dim,
+            mod.padding_idx,
+            mod.max_norm,
+            mod.norm_type,
+            mod.scale_grad_by_freq,
+            mod.sparse,
+            mod.weight,
+            mod.weight.device,
+            mod.weight.dtype,
+            weight_qparams)
+
+class EmbeddingBag(nn.EmbeddingBag, ReferenceQuantizedModule):
+    """ A reference quantized EmbeddingBag module that fits into the
+    FX Graph Mode Quantization workflow, activation will be floating point Tensor,
+    we will store floating point weight as well in the module, but in forward we'll
+    quantize and dequantize the weight before running the floating point functional
+    embedding operator.
+    """
+    def __init__(self, num_embeddings: int, embedding_dim: int,
+                 max_norm: Optional[float] = None, norm_type: float = 2., scale_grad_by_freq: bool = False,
+                 mode: str = 'mean', sparse: bool = False, _weight: Optional[Tensor] = None,
+                 include_last_offset: bool = False, padding_idx: Optional[int] = None,
+                 device=None, dtype=None,
+                 weight_qparams: Optional[Dict[str, Any]] = None) -> None:
+        super().__init__(num_embeddings, embedding_dim, max_norm, norm_type,
+                         scale_grad_by_freq, mode, sparse, _weight, include_last_offset,
+                         padding_idx, device, dtype)
+        self._init_weight_qparams(weight_qparams, device)
+
+    def _get_name(self):
+        return "QuantizedEmbedding(Reference)"
+
+    def forward(self, input: Tensor, offsets: Optional[Tensor] = None, per_sample_weights: Optional[Tensor] = None) -> Tensor:
+        weight_quant_dequant = self.get_weight()
+        return F.embedding_bag(input, weight_quant_dequant, offsets,
+                               self.max_norm, self.norm_type,
+                               self.scale_grad_by_freq, self.mode, self.sparse,
+                               per_sample_weights, self.include_last_offset,
+                               self.padding_idx)
+
+    @classmethod
+    def from_float(cls, mod, weight_qparams):
+        return cls(
+            mod.num_embeddings,
+            mod.embedding_dim,
+            mod.max_norm,
+            mod.norm_type,
+            mod.scale_grad_by_freq,
+            mod.mode,
+            mod.sparse,
+            mod.weight,
+            mod.include_last_offset,
+            mod.padding_idx,
+            mod.weight.device,
+            mod.weight.dtype,
+            weight_qparams
+        )
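+
+
+if __name__ == "__main__":
+    # Illustrative usage sketch (not part of the upstream module): build a reference
+    # quantized Embedding with per-channel (per-row) weight qparams. The scale and
+    # zero_point values below are arbitrary placeholders.
+    import torch
+
+    float_emb = nn.Embedding(10, 4)
+    demo_weight_qparams = {
+        "qscheme": torch.per_channel_affine,
+        "dtype": torch.qint8,
+        "scale": torch.full((10,), 0.1),
+        "zero_point": torch.zeros(10, dtype=torch.int),
+        "axis": 0,
+    }
+    ref_emb = Embedding.from_float(float_emb, demo_weight_qparams)
+    out = ref_emb(torch.tensor([[1, 2, 3]]))
+    print(ref_emb._get_name(), tuple(out.shape))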
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/utils.py b/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..502f77496c4b658ed521e27404af99e4e7cab4b6
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/quantized/reference/modules/utils.py
@@ -0,0 +1,323 @@
+import torch
+import typing
+
+__all__ = [
+    "ReferenceQuantizedModule",
+]
+
+class ReferenceQuantizedModule(torch.nn.Module):
+    def _init_weight_qparams(self, weight_qparams, device):
+        if weight_qparams is None:
+            weight_qparams = {
+                "qscheme": torch.per_tensor_affine,
+                "dtype": torch.quint8,
+                "scale": 1.0,
+                "zero_point": 0
+            }
+        self.weight_qscheme: torch.qscheme = weight_qparams["qscheme"]
+        self.weight_dtype = weight_qparams["dtype"]
+        assert self.weight_qscheme in [
+            None, torch.per_tensor_affine, torch.per_channel_affine,
+            torch.per_channel_affine_float_qparams], \
+            Exception(f"qscheme: {self.weight_qscheme} is not support in reference quantized {self._get_name()}")
+        if self.weight_dtype in [torch.quint8, torch.qint8, torch.quint4x2, torch.qint32]:
+            zero_point_dtype = weight_qparams["zero_point"].dtype if \
+                isinstance(weight_qparams["zero_point"], torch.Tensor) else \
+                torch.int
+            w_scale = weight_qparams["scale"]
+            w_scale_tensor = w_scale.clone().detach() \
+                if isinstance(w_scale, torch.Tensor) \
+                else torch.tensor(w_scale, dtype=torch.float, device=device)
+            self.register_buffer("weight_scale", w_scale_tensor)
+            w_zp = weight_qparams["zero_point"]
+            w_zp_tensor = w_zp.clone().detach() \
+                if isinstance(w_zp, torch.Tensor) \
+                else torch.tensor(w_zp, dtype=zero_point_dtype, device=device)
+            self.register_buffer("weight_zero_point", w_zp_tensor)
+            if self.weight_qscheme in [torch.per_channel_affine, torch.per_channel_affine_float_qparams]:
+                w_axis = weight_qparams["axis"]
+                w_axis_tensor = w_axis.clone().detach() \
+                    if isinstance(w_axis, torch.Tensor) \
+                    else torch.tensor(w_axis, dtype=torch.int, device=device)
+                self.register_buffer("weight_axis", w_axis_tensor)
+            else:
+                # added for TorchScriptability, not used
+                self.register_buffer(
+                    "weight_axis", torch.tensor(0, dtype=torch.int, device=device))
+        else:
+            # added for TorchScriptability, and for torch.float
+            self.register_buffer("weight_scale", torch.tensor(1.0, dtype=torch.float, device=device))
+            self.register_buffer("weight_zero_point", torch.tensor(0, dtype=torch.int, device=device))
+            self.register_buffer(
+                "weight_axis", torch.tensor(0, dtype=torch.int, device=device))
+        self.is_decomposed: bool = weight_qparams.get("is_decomposed", False)
+        # store weight_axis as weight_axis_int due to some constraints of torchdynamo.export
+        # for capturing `.item` operations
+        self.weight_axis_int: int = self.weight_axis.item()  # type: ignore[operator, assignment]
+        self.weight_quant_min: typing.Optional[int] = weight_qparams.get("quant_min", None)
+        self.weight_quant_max: typing.Optional[int] = weight_qparams.get("quant_max", None)
+
+    def get_weight(self):
+        """
+        Fake quantize (quantize and dequantize) the weight with
+        the quantization parameters for the weight. This is used to
+        simulate the numerics of the quantized weight in a quantized
+        model.
+        """
+        # suppress mypy warning
+        assert isinstance(self.weight_scale, torch.Tensor)
+        assert isinstance(self.weight_zero_point, torch.Tensor)
+        if self.is_decomposed:
+            return _quantize_and_dequantize_weight_decomposed(
+                self.weight,  # type: ignore[arg-type]
+                self.weight_qscheme,
+                self.weight_dtype,
+                self.weight_scale,
+                self.weight_zero_point,
+                self.weight_axis_int,
+                self.weight_quant_min,
+                self.weight_quant_max)
+        else:
+            return _quantize_and_dequantize_weight(
+                self.weight,  # type: ignore[arg-type]
+                self.weight_qscheme,
+                self.weight_dtype,
+                self.weight_scale,
+                self.weight_zero_point,
+                self.weight_axis_int)
+
+    def get_quantized_weight(self):
+        # suppress mypy warning
+        assert isinstance(self.weight_scale, torch.Tensor)
+        assert isinstance(self.weight_zero_point, torch.Tensor)
+        # assert isinstance(self.weight_axis, torch.Tensor)
+        if self.is_decomposed:
+            return _quantize_weight_decomposed(
+                self.weight,  # type: ignore[arg-type]
+                self.weight_qscheme,
+                self.weight_dtype,
+                self.weight_scale,
+                self.weight_zero_point,
+                self.weight_axis_int,
+                self.weight_quant_min,
+                self.weight_quant_max)
+        else:
+            return _quantize_weight(
+                self.weight,  # type: ignore[arg-type]
+                self.weight_qscheme,
+                self.weight_dtype,
+                self.weight_scale,
+                self.weight_zero_point,
+                self.weight_axis_int)
+
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        super()._save_to_state_dict(destination, prefix, keep_vars)
+        _save_weight_qparams(
+            destination, prefix, self.weight_qscheme, self.weight_dtype,
+            self.weight_scale, self.weight_zero_point, self.weight_axis)
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        for key in _get_weight_qparam_keys(state_dict, prefix):
+            setattr(self, key, state_dict[prefix + key])
+            state_dict.pop(prefix + key)
+
+        super()._load_from_state_dict(
+            state_dict, prefix, local_metadata, False,
+            missing_keys, unexpected_keys, error_msgs)
+
+def _quantize_weight_decomposed(
+        weight: torch.Tensor,
+        weight_qscheme: torch.qscheme,
+        weight_dtype: torch.dtype,
+        weight_scale: torch.Tensor,
+        weight_zero_point: torch.Tensor,
+        weight_axis: int,
+        weight_quant_min: typing.Optional[int],
+        weight_quant_max: typing.Optional[int],
+) -> torch.Tensor:
+    _DTYPE_TO_QVALUE_BOUNDS = {
+        torch.uint8: (0, 255),
+        torch.int8: (-128, 127),
+        torch.int32: (-(2**31), 2**31 - 1),
+    }
+    # TODO: add a util function for converting qdtype to dtype
+    _QDTYPE_TO_UNDERLYING_INT_REPR_DTYPE = {
+        torch.quint8: torch.uint8,
+        torch.qint8: torch.int8,
+        torch.qint32: torch.int32,
+    }
+    if weight_qscheme == torch.per_tensor_affine:
+        if weight_dtype in [torch.quint8, torch.qint8, torch.qint32]:
+            weight_dtype_ = _QDTYPE_TO_UNDERLYING_INT_REPR_DTYPE[weight_dtype]
+            if weight_quant_min is None or weight_quant_max is None:
+                weight_quant_min, weight_quant_max = _DTYPE_TO_QVALUE_BOUNDS[weight_dtype_]
+            weight = torch.ops.quantized_decomposed.quantize_per_tensor(
+                weight,
+                weight_scale,
+                weight_zero_point,
+                weight_quant_min,
+                weight_quant_max,
+                weight_dtype_
+            )
+            return weight
+    elif weight_qscheme in [torch.per_channel_affine, torch.per_channel_affine_float_qparams]:
+        # TODO: torch.quint4x2 is not supported
+        if weight_dtype in [torch.quint8, torch.qint8, torch.qint32]:
+            weight_dtype_ = _QDTYPE_TO_UNDERLYING_INT_REPR_DTYPE[weight_dtype]
+            if weight_quant_min is None or weight_quant_max is None:
+                weight_quant_min, weight_quant_max = _DTYPE_TO_QVALUE_BOUNDS[weight_dtype_]
+            weight = torch.ops.quantized_decomposed.quantize_per_channel(
+                weight,
+                weight_scale,
+                weight_zero_point,
+                weight_axis,
+                weight_quant_min,
+                weight_quant_max,
+                weight_dtype_)  # type: ignore[arg-type]
+            return weight
+    raise Exception(f"Unsupported dtype and qscheme: {weight_dtype}, {weight_qscheme}")
+
+def _dequantize_weight_decomposed(
+        weight: torch.Tensor,
+        weight_qscheme: torch.qscheme,
+        weight_dtype: torch.dtype,
+        weight_scale: torch.Tensor,
+        weight_zero_point: torch.Tensor,
+        weight_axis: int,
+        weight_quant_min: typing.Optional[int],
+        weight_quant_max: typing.Optional[int],
+) -> torch.Tensor:
+    # TODO: get the quant_min and quant_max from activation_post_process
+    _DTYPE_TO_QVALUE_BOUNDS = {
+        torch.uint8: (0, 255),
+        torch.int8: (-128, 127),
+        torch.int32: (-(2**31), 2**31 - 1),
+    }
+    # TODO: add a util function for converting qdtype to dtype
+    _QDTYPE_TO_UNDERLYING_INT_REPR_DTYPE = {
+        torch.quint8: torch.uint8,
+        torch.qint8: torch.int8,
+        torch.qint32: torch.int32,
+    }
+    weight_dtype_ = _QDTYPE_TO_UNDERLYING_INT_REPR_DTYPE[weight_dtype]
+    if weight_quant_min is None or weight_quant_max is None:
+        weight_quant_min, weight_quant_max = _DTYPE_TO_QVALUE_BOUNDS[weight_dtype_]
+    if weight_qscheme == torch.per_tensor_affine:
+        if weight_dtype in [torch.quint8, torch.qint8, torch.qint32]:
+            weight = torch.ops.quantized_decomposed.dequantize_per_tensor(
+                weight,
+                weight_scale,
+                weight_zero_point,
+                weight_quant_min,
+                weight_quant_max,
+                weight_dtype_
+            )
+            return weight
+    elif weight_qscheme in [torch.per_channel_affine, torch.per_channel_affine_float_qparams]:
+        # TODO: torch.quint4x2 is not supported
+        if weight_dtype in [torch.quint8, torch.qint8, torch.qint32]:
+            weight = torch.ops.quantized_decomposed.dequantize_per_channel(
+                weight,
+                weight_scale,
+                weight_zero_point,
+                weight_axis,
+                weight_quant_min,
+                weight_quant_max,
+                weight_dtype_)  # type: ignore[arg-type]
+            return weight
+    raise Exception(f"Unsupported dtype and qscheme: {weight_dtype}, {weight_qscheme}")
+
+def _quantize_weight(
+        weight: torch.Tensor,
+        weight_qscheme: torch.qscheme,
+        weight_dtype: torch.dtype,
+        weight_scale: torch.Tensor,
+        weight_zero_point: torch.Tensor,
+        weight_axis_int: int
+) -> torch.Tensor:
+    if weight_dtype == torch.float16:
+        weight = weight.to(weight_dtype)
+        return weight
+
+    if weight_qscheme == torch.per_tensor_affine:
+        if weight_dtype in [torch.quint8, torch.qint8, torch.qint32]:
+            weight = torch.quantize_per_tensor(weight, weight_scale, weight_zero_point, weight_dtype)
+            return weight
+    elif weight_qscheme in [torch.per_channel_affine, torch.per_channel_affine_float_qparams]:
+        if weight_dtype in [torch.quint8, torch.qint8, torch.quint4x2, torch.qint32]:
+            weight = torch.quantize_per_channel(
+                weight, weight_scale,
+                weight_zero_point, weight_axis_int, weight_dtype)  # type: ignore[arg-type]
+            return weight
+    raise Exception(f"Unsupported dtype and qscheme: {weight_dtype}, {weight_qscheme}")
+
+def _quantize_and_dequantize_weight_decomposed(
+        weight: torch.Tensor,
+        weight_qscheme: torch.qscheme,
+        weight_dtype: torch.dtype,
+        weight_scale: torch.Tensor,
+        weight_zero_point: torch.Tensor,
+        weight_axis_int: int,
+        weight_quant_min: typing.Optional[int],
+        weight_quant_max: typing.Optional[int],
+) -> torch.Tensor:
+    """ Quantize and then dequantize the weight based on
+    the quantization parameters
+    """
+    if weight_qscheme in [
+            torch.per_tensor_affine,
+            torch.per_channel_affine,
+            torch.per_channel_affine_float_qparams]:
+        weight_quant = _quantize_weight_decomposed(
+            weight, weight_qscheme, weight_dtype, weight_scale, weight_zero_point, weight_axis_int,
+            weight_quant_min, weight_quant_max)
+        weight_dequant = _dequantize_weight_decomposed(
+            weight_quant, weight_qscheme, weight_dtype, weight_scale, weight_zero_point,
+            weight_axis_int, weight_quant_min, weight_quant_max)
+    else:
+        weight_dequant = weight
+    return weight_dequant
+
+def _quantize_and_dequantize_weight(
+        weight: torch.Tensor,
+        weight_qscheme: torch.qscheme,
+        weight_dtype: torch.dtype,
+        weight_scale: torch.Tensor,
+        weight_zero_point: torch.Tensor,
+        weight_axis_int: int
+) -> torch.Tensor:
+    """ Quantize and then dequantize the weight based on
+    the quantization parameters
+    """
+    if weight_qscheme in [
+            torch.per_tensor_affine,
+            torch.per_channel_affine,
+            torch.per_channel_affine_float_qparams]:
+        weight_quant = _quantize_weight(
+            weight, weight_qscheme, weight_dtype, weight_scale, weight_zero_point, weight_axis_int)
+        weight_dequant = weight_quant.dequantize()
+    else:
+        weight_dequant = weight
+    return weight_dequant
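+
+# A minimal usage sketch (illustrative only, not part of the library):
+# fake-quantize a float weight per channel and get back a float tensor of the
+# same shape, e.g.
+#
+#     w = torch.randn(4, 8)
+#     scale = torch.full((4,), 0.1)
+#     zero_point = torch.zeros(4, dtype=torch.int64)
+#     w_dq = _quantize_and_dequantize_weight(
+#         w, torch.per_channel_affine, torch.qint8, scale, zero_point,
+#         weight_axis_int=0)
+#     assert w_dq.shape == w.shape and w_dq.dtype == torch.float32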
+
+def _save_weight_qparams(destination, prefix, weight_qscheme, weight_dtype, weight_scale, weight_zero_point, weight_axis):
+    destination[prefix + "weight_qscheme"] = weight_qscheme
+    destination[prefix + "weight_dtype"] = weight_dtype
+    if weight_qscheme is not None:
+        destination[prefix + "weight_scale"] = weight_scale
+        destination[prefix + "weight_zero_point"] = weight_zero_point
+        if weight_qscheme == torch.per_channel_affine:
+            destination[prefix + "weight_axis"] = weight_axis
+
+def _get_weight_qparam_keys(
+        state_dict: typing.Dict[str, typing.Any],
+        prefix: str):
+    keys = ["weight_qscheme", "weight_dtype"]
+    weight_qscheme = state_dict[prefix + "weight_qscheme"]
+    if weight_qscheme is not None:
+        keys.append("weight_scale")
+        keys.append("weight_zero_point")
+        if weight_qscheme == torch.per_channel_affine:
+            keys.append("weight_axis")
+    return keys
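+
+# Illustrative note (not part of the library): for a per-channel affine qint8
+# weight, _save_weight_qparams stores entries under keys such as
+# "<prefix>weight_qscheme", "<prefix>weight_dtype", "<prefix>weight_scale",
+# "<prefix>weight_zero_point" and "<prefix>weight_axis", and
+# _get_weight_qparam_keys returns the corresponding key names (without the
+# prefix) so a caller can pop them back out of a state dict.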
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/sparse/__init__.py b/MLPY/Lib/site-packages/torch/ao/nn/sparse/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..308f4d5b55c44b6749a5ede08d5ccc40a09b7bca
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/sparse/__init__.py
@@ -0,0 +1 @@
+from . import quantized
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/sparse/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/sparse/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fac7e7d34ad9937514fe9bb19e71be2fc368f5b5
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/sparse/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/sparse/quantized/__init__.py b/MLPY/Lib/site-packages/torch/ao/nn/sparse/quantized/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b030a9c22844690d61bb20af96bc26960968f01
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/sparse/quantized/__init__.py
@@ -0,0 +1,10 @@
+from torch.ao.nn.sparse.quantized import dynamic
+
+from .linear import Linear
+from .linear import LinearPackedParams
+
+__all__ = [
+    "dynamic",
+    "Linear",
+    "LinearPackedParams",
+]
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/sparse/quantized/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/sparse/quantized/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d4ae3e84e4fc8af7986698914eb78a02f79c12e2
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/sparse/quantized/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/sparse/quantized/__pycache__/linear.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/sparse/quantized/__pycache__/linear.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..474eb8e50dd07082f1de8bbc2f7e0eb7506ab294
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/sparse/quantized/__pycache__/linear.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/sparse/quantized/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/sparse/quantized/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fad593ee14add0029a6078f8dc3e1f58920e39c5
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/sparse/quantized/__pycache__/utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/sparse/quantized/dynamic/__init__.py b/MLPY/Lib/site-packages/torch/ao/nn/sparse/quantized/dynamic/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a961737c6c5944b99b5341c55d12fbd95c603e8
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/sparse/quantized/dynamic/__init__.py
@@ -0,0 +1,5 @@
+from .linear import Linear
+
+__all__ = [
+    "Linear",
+]
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/sparse/quantized/dynamic/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/sparse/quantized/dynamic/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4ecb1c9b4de541dd32269a3f2fd226afc829cf92
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/sparse/quantized/dynamic/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/sparse/quantized/dynamic/__pycache__/linear.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/nn/sparse/quantized/dynamic/__pycache__/linear.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f3ed9c1b3d07b7223b7dd0f857cbec2ee7e03372
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/nn/sparse/quantized/dynamic/__pycache__/linear.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/sparse/quantized/dynamic/linear.py b/MLPY/Lib/site-packages/torch/ao/nn/sparse/quantized/dynamic/linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..855c7ad391de4bf69b0246f58d44acefca2d064b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/sparse/quantized/dynamic/linear.py
@@ -0,0 +1,139 @@
+from typing import Optional
+
+import torch
+import torch.ao.nn.intrinsic as nni
+
+from torch.ao.nn.sparse.quantized import linear
+from torch.ao.nn.sparse.quantized.utils import LinearBlockSparsePattern
+from torch.ao.nn.quantized.modules.utils import _quantize_weight, _hide_packed_params_repr
+
+__all__ = ['Linear']
+
+class Linear(torch.nn.Module):
+    r"""
+    A dynamically quantized sparse linear module with float tensor as inputs and outputs.
+    """
+    _version = 1
+    _op_type = "sparse_dynamic"
+    _FLOAT_MODULE = torch.nn.Linear
+
+    def __init__(self, in_features, out_features, row_block_size, col_block_size, bias=True, dtype=torch.qint8):
+        super().__init__()
+
+        if dtype != torch.qint8:
+            raise NotImplementedError("Only QINT8 is supported for Sparse Quantized Linear Dynamic")
+
+        self.in_features = in_features
+        self.out_features = out_features
+
+        if bias:
+            bias = torch.zeros(self.out_features, dtype=torch.float)
+        else:
+            bias = None
+
+        qweight = torch._empty_affine_quantized([out_features, in_features],
+                                                scale=1, zero_point=0, dtype=torch.qint8)
+        self._packed_params = linear.LinearPackedParams(row_block_size=row_block_size,
+                                                        col_block_size=col_block_size,
+                                                        dtype=dtype)
+        self._packed_params.set_weight_bias(qweight, bias, row_block_size, col_block_size)
+
+    def _get_name(self):
+        return 'SparseQuantizedDynamicLinear'
+
+    def extra_repr(self):
+        return f'in_features={self.in_features}, out_features={self.out_features}, qscheme={self.weight().qscheme()}'
+
+    def __repr__(self):
+        return _hide_packed_params_repr(self, linear.LinearPackedParams)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.ops.sparse.qlinear_dynamic(x, self._packed_params._packed_params)
+
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        super()._save_to_state_dict(destination, prefix, keep_vars)
+        destination[prefix + 'op_type'] = self._op_type
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        op_type = state_dict[prefix + 'op_type']
+        assert op_type == self._op_type, \
+            f"Cannot load from op_type [{op_type}], expecting [{self._op_type}]"
+        state_dict.pop(prefix + 'op_type')
+
+        version = local_metadata.get('version', None)
+        assert version <= self._version
+
+        # Is this code valid? In the old quantization flow it seemed to be used
+        # to load older models
+        weight = state_dict.pop(prefix + 'weight')
+        bias = state_dict.pop(prefix + 'bias')
+        state_dict.update({prefix + '_packed_params.weight': weight,
+                           prefix + '_packed_params.bias': bias})
+
+        super()._load_from_state_dict(
+            state_dict, prefix, local_metadata, False,
+            missing_keys, unexpected_keys, error_msgs)
+
+    def _weight_bias(self):
+        return self._packed_params._weight_bias()
+
+    def weight(self):
+        return self._weight_bias()[0]
+
+    def bias(self):
+        return self._weight_bias()[1]
+
+    def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor],
+                        row_block_size: Optional[int], col_block_size: Optional[int]) -> None:
+        assert row_block_size is not None and col_block_size is not None
+        self.out_features = w.shape[0]
+        self.in_features = w.shape[1]
+        self._packed_params.set_weight_bias(w, b, row_block_size, col_block_size)
+
+    @classmethod
+    def from_float(cls, mod):
+        r"""Create a quantized sparse dynamic module from a float module.
+
+        We only care about the convert at this stage, no need for observers just yet.
+        """
+        assert type(mod) == cls._FLOAT_MODULE, ' nnq.' + cls.__name__ + '.from_float only works for ' + \
+            cls._FLOAT_MODULE.__name__
+        # TODO: Need to add options to qconfig to avoid the calibration.
+        # TODO: Add calibration for the sparsity
+        assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined'
+        if type(mod) == nni.LinearReLU:
+            mod = mod[0]
+        if mod.qconfig is not None and mod.qconfig.weight is not None:
+            weight_observer = mod.qconfig.weight()
+        else:
+            # We have the circular import issues if we import the qconfig in the beginning of this file:
+            # https://github.com/pytorch/pytorch/pull/24231. The current workaround is to postpone the
+            # import until we need it.
+            from torch.ao.quantization.qconfig import default_dynamic_qconfig
+            weight_observer = default_dynamic_qconfig.weight()
+
+        # It is important to multiply by the mask BEFORE calling the `weight_observer`
+        # TODO (zaf): Mask might not be part of the qconfig (T83295194)
+        weight = mod.weight
+        if getattr(mod.qconfig, 'mask', False):
+            weight = mod.qconfig.mask * mod.weight
+
+        weight_observer(weight)
+        dtype = weight_observer.dtype
+        assert dtype == torch.qint8, 'Weight observer must have dtype torch.qint8'
+        w_sc, w_zp = weight_observer.calculate_qparams()
+        if isinstance(w_zp, torch.Tensor):
+            assert not torch.any(w_zp.bool()), "All weight zero points must map to 0"
+        else:
+            assert w_zp == 0, 'Weight zero point must map to 0'
+        qweight = _quantize_weight(weight.float(), weight_observer)
+
+        row_block_size, col_block_size = LinearBlockSparsePattern.block_size()
+        qlinear = cls(mod.in_features,
+                      mod.out_features,
+                      row_block_size,
+                      col_block_size,
+                      dtype=dtype)
+        qlinear.set_weight_bias(qweight, mod.bias, row_block_size, col_block_size)
+        return qlinear
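+
+# A minimal conversion sketch (illustrative only, not part of the library;
+# assumes a build where the torch.ops.sparse kernels are registered):
+#
+#     float_linear = torch.nn.Linear(16, 8)
+#     float_linear.qconfig = torch.ao.quantization.default_dynamic_qconfig
+#     with LinearBlockSparsePattern(row_block_size=1, col_block_size=4):
+#         sparse_qlinear = Linear.from_float(float_linear)
+#     y = sparse_qlinear(torch.randn(2, 16))  # float in, float out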
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/sparse/quantized/linear.py b/MLPY/Lib/site-packages/torch/ao/nn/sparse/quantized/linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9723e1760c9e88417928bcf545d5dccbad20e3a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/sparse/quantized/linear.py
@@ -0,0 +1,197 @@
+from typing import Optional
+
+import torch
+from torch.ao.nn.quantized.modules.utils import _quantize_weight, _hide_packed_params_repr
+
+__all__ = ['LinearPackedParams', 'Linear']
+
+# TODO (zaf): Inherit from `quantized.LinearPackedParams` (T83294430)
+class LinearPackedParams(torch.nn.Module):
+    _version = 1
+
+    def __init__(self, row_block_size=1, col_block_size=4, dtype=torch.qint8):
+        super().__init__()
+
+        if dtype != torch.qint8:
+            raise NotImplementedError("Linear prepacking only supports QINT8")
+        self.dtype = dtype
+        wq = torch._empty_affine_quantized([1, 1], scale=1.0, zero_point=0, dtype=torch.qint8)
+        self.set_weight_bias(wq, None, row_block_size, col_block_size)
+
+    def _get_name(self):
+        return "SparseQuantizedLinearPackedParams"
+
+    @torch.jit.export
+    def set_weight_bias(self, weight: torch.Tensor, bias: Optional[torch.Tensor],
+                        row_block_size: Optional[int], col_block_size: Optional[int]) -> None:
+        assert row_block_size is not None and col_block_size is not None
+        self._packed_params = torch.ops.sparse.qlinear_prepack(weight, bias, row_block_size, col_block_size)
+
+    @torch.jit.export
+    def _weight_bias(self):
+        (weight, bias, block_sizes) = torch.ops.sparse.qlinear_unpack(self._packed_params)
+        return (weight, bias, block_sizes[0], block_sizes[1])
+
+    def forward(self, x):
+        return x
+
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        super()._save_to_state_dict(destination, prefix, keep_vars)
+        destination[prefix + 'dtype'] = self.dtype
+        destination[prefix + '_packed_params'] = self._weight_bias()
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        version = local_metadata.get('version', None)
+        assert version <= self._version
+
+        self.dtype = state_dict.pop(prefix + 'dtype')
+        weight, bias, row_block_size, col_block_size = state_dict.pop(prefix + '_packed_params')
+        self.set_weight_bias(weight, bias, row_block_size, col_block_size)
+
+        super()._load_from_state_dict(state_dict, prefix, local_metadata, False,
+                                      missing_keys, unexpected_keys, error_msgs)
+
+    @torch.jit.export
+    def __getstate__(self):
+        return self._packed_params, self.training, self.dtype
+
+    @torch.jit.export
+    def __setstate__(self, state):
+        (self._packed_params, self.training, self.dtype) = state
+
+    def __repr__(self):
+        return self._weight_bias().__repr__()
+
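+# Illustrative sketch (not part of the library; assumes a build where the
+# torch.ops.sparse kernels are registered): pack a quantized weight and read
+# it back.
+#
+#     wq = torch.quantize_per_tensor(torch.randn(8, 16), scale=0.1,
+#                                    zero_point=0, dtype=torch.qint8)
+#     params = LinearPackedParams(row_block_size=1, col_block_size=4)
+#     params.set_weight_bias(wq, None, row_block_size=1, col_block_size=4)
+#     weight, bias, row_bs, col_bs = params._weight_bias()
+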
+# TODO (zaf): Inherit from `quantized.Linear` (T83294430)
+class Linear(torch.nn.Module):
+    r"""
+    A quantized sparse linear module with quantized tensor as inputs and outputs.
+    """
+    _version = 1
+    _FLOAT_MODULE = torch.nn.Linear
+
+    def __init__(self, in_features, out_features, row_block_size, col_block_size, bias=True, dtype=torch.qint8):
+        super().__init__()
+
+        if dtype != torch.qint8:
+            raise NotImplementedError("Only QINT8 is supported for Sparse Quantized Linear")
+
+        self.in_features = in_features
+        self.out_features = out_features
+
+        if bias:
+            bias = torch.zeros(self.out_features, dtype=torch.float)
+        else:
+            bias = None
+
+        qweight = torch._empty_affine_quantized([out_features, in_features],
+                                                scale=1, zero_point=0, dtype=torch.qint8)
+        self._packed_params = LinearPackedParams(row_block_size=row_block_size,
+                                                 col_block_size=col_block_size,
+                                                 dtype=dtype)
+        self._packed_params.set_weight_bias(qweight, bias, row_block_size, col_block_size)
+        self.scale = 1.0
+        self.zero_point = 0
+
+    @classmethod
+    def _get_name(cls):
+        return 'SparseQuantizedLinear'
+
+    def extra_repr(self):
+        return 'in_features={}, out_features={}, scale={}, zero_point={}, qscheme={}'.format(
+            self.in_features, self.out_features, self.scale, self.zero_point, self.weight().qscheme()
+        )
+
+    def __repr__(self):
+        return _hide_packed_params_repr(self, LinearPackedParams)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.ops.sparse.qlinear(x, self._packed_params._packed_params, self.scale, self.zero_point)
+
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        super()._save_to_state_dict(destination, prefix, keep_vars)
+        destination[prefix + 'scale'] = torch.tensor(self.scale)
+        destination[prefix + 'zero_point'] = torch.tensor(self.zero_point)
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        self.scale = float(state_dict[prefix + 'scale'])
+        state_dict.pop(prefix + 'scale')
+
+        self.zero_point = int(state_dict[prefix + 'zero_point'])
+        state_dict.pop(prefix + 'zero_point')
+
+        op_type = int(state_dict[prefix + 'op_type'])
+        state_dict.pop(prefix + 'op_type')
+
+        version = local_metadata.get('version', None)
+        assert version <= self._version
+
+        super()._load_from_state_dict(
+            state_dict, prefix, local_metadata, False,
+            missing_keys, unexpected_keys, error_msgs)
+
+    def _weight_bias(self):
+        return self._packed_params._weight_bias()
+
+    def weight(self):
+        return self._weight_bias()[0]
+
+    def bias(self):
+        return self._weight_bias()[1]
+
+    def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor],
+                        row_block_size: Optional[int], col_block_size: Optional[int]) -> None:
+        assert row_block_size is not None and col_block_size is not None
+        self._packed_params.set_weight_bias(w, b, row_block_size, col_block_size)
+
+    @classmethod
+    def from_float(cls, mod):
+        r"""Create a quantized sparse module from a float module.
+
+        We only care about the convert at this stage, no need for observers just yet.
+
+        TODO(zaf): Need to add the sparse params to the qconfig
+        """
+        assert type(mod) == cls._FLOAT_MODULE, cls._get_name() + \
+            '.from_float only works for ' + cls._FLOAT_MODULE.__name__
+        assert hasattr(mod, 'sparse_params'), \
+            ('Expecting the Linear to have `sparse_params`. Make sure you have provided arguments '
+             'in the `sparsifier.squash_mask(params_to_save=("sparse_block_shape",))` method.')
+        sparse_block_shape = mod.sparse_params.get('sparse_block_shape', None)  # type: ignore[operator, union-attr]
+        assert isinstance(sparse_block_shape, (tuple, list))
+        assert len(sparse_block_shape) == 2
+        # TODO: Need to add options to qconfig to avoid the calibration.
+        # TODO: Add calibration for the sparsity
+        assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined'
+        activation_post_process = mod.activation_post_process
+        weight_post_process = mod.qconfig.weight()  # type: ignore[operator, union-attr]
+
+        # Assumption is that the weight is already sparsified by the
+        # `sparsifier.convert`
+        weight = mod.weight
+
+        weight_post_process(weight)
+        dtype = weight_post_process.dtype
+        act_scale, act_zp = activation_post_process.calculate_qparams()  # type: ignore[operator, union-attr]
+        assert dtype == torch.qint8, 'Weight observer must have dtype torch.qint8'
+        w_sc, w_zp = weight_post_process.calculate_qparams()
+        if isinstance(w_zp, torch.Tensor):
+            assert not torch.any(w_zp.bool()), "All weight zero points must map to 0"
+        else:
+            assert w_zp == 0, 'Weight zero point must map to 0'
+        qweight = _quantize_weight(weight.float(), weight_post_process)
+
+        row_block_size = mod.sparse_params['sparse_block_shape'][0]  # type: ignore[index]
+        col_block_size = mod.sparse_params['sparse_block_shape'][1]  # type: ignore[index]
+        qlinear = cls(mod.in_features,
+                      mod.out_features,
+                      row_block_size,
+                      col_block_size,
+                      dtype=dtype)
+        qlinear.set_weight_bias(qweight, mod.bias,
+                                row_block_size, col_block_size)  # type: ignore[arg-type]
+        qlinear.scale = float(act_scale)
+        qlinear.zero_point = int(act_zp)
+        return qlinear
diff --git a/MLPY/Lib/site-packages/torch/ao/nn/sparse/quantized/utils.py b/MLPY/Lib/site-packages/torch/ao/nn/sparse/quantized/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e7338d26eac828b43b9198b5cadec1b3f5e386b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/nn/sparse/quantized/utils.py
@@ -0,0 +1,42 @@
+import threading
+
+__all__ = [
+    "LinearBlockSparsePattern"
+]
+
+def _is_valid_linear_block_sparse_pattern(row_block_size, col_block_size):
+    return (row_block_size == 1 and col_block_size == 4) or \
+           (row_block_size == 8 and col_block_size == 1)
+
+# This is a stop-gap measure, as the current flow does not allow a
+# module-specific block sparse pattern.
+# In fact, there is no way to convey the sparse pattern via the module config
+# of the quantization flow, so the global context is used to convey the
+# sparsity pattern.
+# Once the flow supports it, this should be removed.
+class LinearBlockSparsePattern:
+    rlock = threading.RLock()
+    row_block_size = 1
+    col_block_size = 4
+    prev_row_block_size = 1
+    prev_col_block_size = 4
+
+    def __init__(self, row_block_size=1, col_block_size=4):
+        assert _is_valid_linear_block_sparse_pattern(row_block_size, col_block_size)
+        LinearBlockSparsePattern.rlock.acquire()
+        LinearBlockSparsePattern.prev_row_block_size = LinearBlockSparsePattern.row_block_size
+        LinearBlockSparsePattern.prev_col_block_size = LinearBlockSparsePattern.col_block_size
+        LinearBlockSparsePattern.row_block_size = row_block_size
+        LinearBlockSparsePattern.col_block_size = col_block_size
+
+    def __enter__(self):
+        pass
+
+    def __exit__(self, exc_type, exc_value, backtrace):
+        LinearBlockSparsePattern.row_block_size = LinearBlockSparsePattern.prev_row_block_size
+        LinearBlockSparsePattern.col_block_size = LinearBlockSparsePattern.prev_col_block_size
+        LinearBlockSparsePattern.rlock.release()
+
+    @staticmethod
+    def block_size():
+        return LinearBlockSparsePattern.row_block_size, LinearBlockSparsePattern.col_block_size
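+
+# Illustrative usage (not part of the library): temporarily override the
+# global block sparse pattern while converting a module, then restore the
+# previous one on exit.
+#
+#     with LinearBlockSparsePattern(row_block_size=8, col_block_size=1):
+#         row_bs, col_bs = LinearBlockSparsePattern.block_size()  # (8, 1)
+#     # after the `with` block, the previous pattern (default 1x4) is restored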
diff --git a/MLPY/Lib/site-packages/torch/ao/ns/__init__.py b/MLPY/Lib/site-packages/torch/ao/ns/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/ao/ns/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/ns/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a3a4c7c21ff04220eb10136a55f79a7967672f09
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/ns/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/ns/__pycache__/_numeric_suite.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/ns/__pycache__/_numeric_suite.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..97595300ceaaa0a90efe7d1be6283cee121d74c9
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/ns/__pycache__/_numeric_suite.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/ns/__pycache__/_numeric_suite_fx.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/ns/__pycache__/_numeric_suite_fx.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ca71c88be3d1ef7cbc2c769ba7043f91a1155499
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/ns/__pycache__/_numeric_suite_fx.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/ns/_numeric_suite.py b/MLPY/Lib/site-packages/torch/ao/ns/_numeric_suite.py
new file mode 100644
index 0000000000000000000000000000000000000000..582708217a89b8c8b964bdef0d48382e5d7ef257
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/ns/_numeric_suite.py
@@ -0,0 +1,526 @@
+import torch
+import torch.nn as nn
+import torch.ao.nn.quantized as nnq
+import torch.ao.nn.quantized.dynamic as nnqd
+from torch.ao.quantization import prepare
+from typing import Dict, List, Optional, Any, Union, Callable, Set
+
+from torch.ao.quantization.quantization_mappings import (
+    get_default_compare_output_module_list,
+)
+
+NON_LEAF_MODULE_TO_ADD_OBSERVER_ALLOW_LIST = {
+    nnqd.Linear,
+    nnq.Linear,
+    nnqd.LSTM,
+    nn.LSTM,
+}
+
+
+def _find_match(
+    str_list: Union[Dict[str, Any], List[str]], key_str: str,
+    postfix: str,
+) -> Optional[str]:
+    split_str = key_str.split(".")
+    if split_str[-1] == postfix:
+        match_string = "".join(key_str.split(".")[0:-1])
+        for s2 in str_list:
+            pattern1 = "".join(s2.split(".")[0:-1])
+            pattern2 = "".join(s2.split(".")[0:-2])
+            if match_string == pattern1:
+                return s2
+            if match_string == pattern2:
+                return s2
+
+        # For matching "fc.weight" and "fc._packed_params._packed_params"
+        if postfix == "_packed_params":
+            match_string = "".join(key_str.split(".")[0:-2])
+            if len(match_string) == 0:
+                return None
+            for s2 in str_list:
+                pattern1 = "".join(s2.split(".")[0:-1])
+                pattern2 = "".join(s2.split(".")[0:-2])
+                if match_string == pattern1:
+                    return s2
+                if match_string == pattern2:
+                    return s2
+        return None
+    else:
+        return None
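+
+# Illustrative note (not part of the library): matching is done on the module
+# prefix rather than the full key, e.g. a quantized key "fc.weight" with
+# postfix "weight" matches the first float key whose stripped prefix is "fc",
+# and "fc._packed_params._packed_params" with postfix "_packed_params" is
+# likewise matched back to a float key under the "fc" module.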
+
+
+def compare_weights(
+    float_dict: Dict[str, Any], quantized_dict: Dict[str, Any]
+) -> Dict[str, Dict[str, torch.Tensor]]:
+    r"""Compare the weights of the float module with its corresponding quantized
+    module. Return a dict with key corresponding to module names and each entry being
+    a dictionary with two keys 'float' and 'quantized', containing the float and
+    quantized weights. This dict can be used to compare and compute the quantization
+    error of the weights of float and quantized models.
+
+    Example usage::
+
+        wt_compare_dict = compare_weights(
+            float_model.state_dict(), qmodel.state_dict())
+        for key in wt_compare_dict:
+            print(
+                key,
+                compute_error(
+                    wt_compare_dict[key]['float'],
+                    wt_compare_dict[key]['quantized'].dequantize()
+                )
+            )
+
+    Args:
+        float_dict: state dict of the float model
+        quantized_dict: state dict of the quantized model
+
+    Return:
+        weight_dict: dict with key corresponding to module names and each entry being
+        a dictionary with two keys 'float' and 'quantized', containing the float and
+        quantized weights
+    """
+    torch._C._log_api_usage_once("quantization_api._numeric_suite.compare_weights")
+    weight_dict: Dict[str, Dict] = {}
+    for key in quantized_dict:
+        match_key = _find_match(float_dict, key, "weight")
+        if match_key is not None:
+            weight_dict[key] = {}
+            weight_dict[key]["float"] = float_dict[match_key]
+            weight_dict[key]["quantized"] = quantized_dict[key]
+            continue
+
+        # For matching "fc.weight" and "fc._packed_params._packed_params"
+        match_key = _find_match(float_dict, key, "_packed_params")
+        if match_key is not None:
+            weight_dict[key] = {}
+            weight_dict[key]["float"] = float_dict[match_key]
+            weight_dict[key]["quantized"] = quantized_dict[key][0]
+
+        # For LSTM
+        split_str = key.split(".")
+        if split_str[-1] == "param" and split_str[-3] == "_all_weight_values":
+            layer = split_str[-2]
+            module_name = ".".join(split_str[:-3])
+            float_weight_ih_key = module_name + ".weight_ih_l" + layer
+            float_weight_hh_key = module_name + ".weight_hh_l" + layer
+            if float_weight_ih_key in float_dict and float_weight_hh_key in float_dict:
+                weight_dict[key] = {}
+                weight_dict[key]["float"] = float_dict[float_weight_ih_key]
+                weight_dict[key]["quantized"] = (
+                    quantized_dict[key].__getstate__()[0][4][0].__getstate__()[0][0]
+                )
+                weight_dict[key]["float"] = float_dict[float_weight_hh_key]
+                weight_dict[key]["quantized"] = (
+                    quantized_dict[key].__getstate__()[0][4][1].__getstate__()[0][0]
+                )
+
+    return weight_dict
+
+
+def _get_logger_dict_helper(
+    mod: nn.Module, target_dict: Dict[str, Any],
+    prefix: str = "",
+) -> None:
+    r"""This is the helper function for get_logger_dict
+
+    Args:
+        mod: module we want to save all logger stats
+        prefix: prefix for the current module
+        target_dict: the dictionary used to save all logger stats
+    """
+
+    def get_prefix(prefix):
+        return prefix if prefix == "" else prefix + "."
+
+    for name, child in mod.named_children():
+        if isinstance(child, Logger):
+            target_dict[get_prefix(prefix) + "stats"] = child.stats
+            break
+
+    for name, child in mod.named_children():
+        module_prefix = get_prefix(prefix) + name if prefix else name
+        _get_logger_dict_helper(child, target_dict, module_prefix)
+
+
+def get_logger_dict(mod: nn.Module, prefix: str = "") -> Dict[str, Dict]:
+    r"""Traverse the modules and save all logger stats into target dict.
+    This is mainly used for quantization accuracy debug.
+
+    Type of loggers supported:
+        ShadowLogger: used to log the outputs of the quantized module and its matching float shadow module,
+        OutputLogger: used to log the outputs of the modules
+
+    Args:
+        mod: module we want to save all logger stats
+        prefix: prefix for the current module
+
+    Return:
+        target_dict: the dictionary used to save all logger stats
+
+    """
+    torch._C._log_api_usage_once("quantization_api._numeric_suite.get_logger_dict")
+
+    target_dict: Dict[str, Dict] = {}
+    _get_logger_dict_helper(mod, target_dict, prefix)
+    return target_dict
+
+
+class Logger(nn.Module):
+    r"""Base class for stats logging
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.stats = {}
+        # We only insert observer if the op is quantized with static quantization,
+        # which is identified by activation_observer.dtype == quint8.  This is needed
+        # when attaching Logger as observer for FX mode
+        self.dtype = torch.quint8
+
+    def forward(self, x):
+        """
+        """  # blank docblock to make autodoc happy
+        pass
+
+
+class ShadowLogger(Logger):
+    r"""Class used in Shadow module to record the outputs of the original and
+    shadow modules.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.stats["float"] = []
+        self.stats["quantized"] = []
+
+    def forward(self, x, y):
+        """
+        """  # blank docblock to make autodoc happy
+        if len(x) > 1:
+            x = x[0]
+        if len(y) > 1:
+            y = y[0]
+        self.stats["quantized"].append(x.detach())
+        self.stats["float"].append(y.detach())
+
+
+class OutputLogger(Logger):
+    r"""Class used to log the outputs of the module
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.stats["tensor_val"] = []
+
+
+    def forward(self, x):
+        """
+        """  # blank docblock to make autodoc happy
+        self.stats["tensor_val"].append(x)
+        return x
+
+
+def _convert_tuple_to_list(t: Any) -> Any:
+    return [_convert_tuple_to_list(x) for x in t] if type(t) is tuple else t
+
+
+def _dequantize_tensor_list(t: Any) -> Any:
+    return (
+        [_dequantize_tensor_list(x) for x in t]
+        if type(t) is list
+        else t.dequantize()
+        if t.is_quantized
+        else t
+    )
+
+
+class Shadow(nn.Module):
+    r"""Shadow module attaches the float module to its matching quantized module
+    as the shadow. Then it uses Logger module to process the outputs of both
+    modules.
+
+    Args:
+        q_module: module quantized from float_module that we want to shadow
+        float_module: float module used to shadow q_module
+        logger_cls: type of logger used to process the outputs of q_module and
+            float_module. ShadowLogger or custom loggers can be used.
+    """
+
+    def __init__(self, q_module, float_module, logger_cls):
+        super().__init__()
+        self.orig_module = q_module
+        self.shadow_module = float_module
+        self.dequant = nnq.DeQuantize()
+        self.logger = logger_cls()
+
+    def forward(self, *x) -> torch.Tensor:
+        """
+        """  # blank docblock to make autodoc happy
+        xl = _convert_tuple_to_list(x)
+        output = self.orig_module(*xl)
+        xl_float = _dequantize_tensor_list(xl)
+        shadow_output = self.shadow_module(*xl_float)
+        self.logger(output, shadow_output)
+        return output
+
+    def add(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+        """
+        """  # blank docblock to make autodoc happy
+        output = self.orig_module.add(x, y)
+        x = x.dequantize()
+        y = y.dequantize()
+        shadow_output = self.shadow_module.add(x, y)
+        self.logger(output, shadow_output)
+        return output
+
+    def add_scalar(self, x: torch.Tensor, y: float) -> torch.Tensor:
+        """
+        """  # blank docblock to make autodoc happy
+        output = self.orig_module.add_scalar(x, y)
+        x = x.dequantize()
+        shadow_output = self.shadow_module.add_scalar(x, y)
+        self.logger(output, shadow_output)
+        return output
+
+    def mul(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+        """
+        """  # blank docblock to make autodoc happy
+        output = self.orig_module.mul(x, y)
+        x = x.dequantize()
+        y = y.dequantize()
+        shadow_output = self.shadow_module.mul(x, y)
+        self.logger(output, shadow_output)
+        return output
+
+    def mul_scalar(self, x: torch.Tensor, y: float) -> torch.Tensor:
+        """
+        """  # blank docblock to make autodoc happy
+        output = self.orig_module.mul_scalar(x, y)
+        x = x.dequantize()
+        shadow_output = self.shadow_module.mul_scalar(x, y)
+        self.logger(output, shadow_output)
+        return output
+
+    def cat(self, x: List[torch.Tensor], dim: int = 0) -> torch.Tensor:
+        """
+        """  # blank docblock to make autodoc happy
+        output = self.orig_module.cat(x, dim)
+        x = [y.dequantize() for y in x]
+        shadow_output = self.shadow_module.cat(x, dim)
+        self.logger(output, shadow_output)
+        return output
+
+    def add_relu(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+        """
+        """  # blank docblock to make autodoc happy
+        output = self.orig_module.add_relu(x, y)
+        x = x.dequantize()
+        y = y.dequantize()
+        shadow_output = self.shadow_module.add_relu(x, y)
+        self.logger(output, shadow_output)
+        return output
+
+
+def prepare_model_with_stubs(
+    float_module: nn.Module, q_module: nn.Module,
+    module_swap_list: Set[type], logger_cls: Callable,
+) -> None:
+    r"""Prepare the model by attaching the float module to its matching quantized
+    module as the shadow if the float module type is in module_swap_list.
+
+    Example usage::
+
+        prepare_model_with_stubs(float_model, q_model, module_swap_list, Logger)
+        q_model(data)
+        ob_dict = get_logger_dict(q_model)
+
+    Args:
+        float_module: float module used to generate the q_module
+        q_module: module quantized from float_module
+        module_swap_list: list of float module types to attach the shadow
+        logger_cls: type of logger to be used in shadow module to process the outputs of
+            quantized module and its float shadow module
+    """
+    torch._C._log_api_usage_once("quantization_api._numeric_suite.prepare_model_with_stubs")
+
+    float_module_children = {}
+    for name, mod in float_module.named_children():
+        float_module_children[name] = mod
+
+    reassign = {}
+    for name, mod in q_module.named_children():
+
+        if name not in float_module_children:
+            continue
+
+        float_mod = float_module_children[name]
+
+        if type(float_mod) not in module_swap_list:
+            prepare_model_with_stubs(float_mod, mod, module_swap_list, logger_cls)
+
+        # Insert shadow module only if the module is not of the same type as
+        # the floating point module
+        if type(float_mod) in module_swap_list and not _is_identical_module_type(mod, float_mod):
+            reassign[name] = Shadow(mod, float_mod, logger_cls)
+
+    for key, value in reassign.items():
+        q_module._modules[key] = value
+
+def _is_identical_module_type(mod1, mod2):
+    # Compare whether two modules contain the same sequence of submodule types
+    mod1_module_types = [type(mod) for mod in mod1.modules()]
+    mod2_module_types = [type(mod) for mod in mod2.modules()]
+    return mod1_module_types == mod2_module_types
+
+
+
+def compare_model_stub(
+    float_model: nn.Module, q_model: nn.Module, module_swap_list: Set[type],
+    *data, logger_cls=ShadowLogger
+) -> Dict[str, Dict]:
+    r"""Compare quantized module in a model with its floating point counterpart,
+    feeding both of them the same input. Return a dict with key corresponding to
+    module names and each entry being a dictionary with two keys 'float' and
+    'quantized', containing the output tensors of quantized and its matching
+    float shadow module. This dict can be used to compare and compute the module
+    level quantization error.
+
+    This function first calls prepare_model_with_stubs() to swap the quantized
+    modules that we want to compare with Shadow modules. A Shadow module takes a
+    quantized module, its corresponding float module and a logger as input, and
+    creates an internal forward path so that the float module shadows the
+    quantized module while sharing the same input. The logger is customizable;
+    the default logger is ShadowLogger, which saves the outputs of the quantized
+    module and the float module so they can be used to compute the module-level
+    quantization error.
+
+    Example usage::
+
+        module_swap_list = [torchvision.models.quantization.resnet.QuantizableBasicBlock]
+        ob_dict = compare_model_stub(float_model,qmodel,module_swap_list, data)
+        for key in ob_dict:
+            print(key, compute_error(ob_dict[key]['float'], ob_dict[key]['quantized'].dequantize()))
+
+    Args:
+        float_model: float model used to generate the q_model
+        q_model: model quantized from float_model
+        module_swap_list: list of float module types at which shadow modules will
+            be attached.
+        data: input data used to run the prepared q_model
+        logger_cls: type of logger to be used in shadow module to process the outputs of
+            quantized module and its float shadow module
+    """
+    torch._C._log_api_usage_once("quantization_api._numeric_suite.compare_model_stub")
+    prepare_model_with_stubs(float_model, q_model, module_swap_list, logger_cls)
+    q_model(*data)
+    ob_dict = get_logger_dict(q_model)
+    return ob_dict
+
+
+def get_matching_activations(
+    float_module: nn.Module, q_module: nn.Module,
+) -> Dict[str, Dict[str, torch.Tensor]]:
+    r"""Find the matching activation between float and quantized modules.
+
+    Args:
+        float_module: float module used to generate the q_module
+        q_module: module quantized from float_module
+
+    Return:
+        act_dict: dict with key corresponding to quantized module names and each
+        entry being a dictionary with two keys 'float' and 'quantized', containing
+        the matching float and quantized activations
+    """
+    torch._C._log_api_usage_once("quantization_api._numeric_suite.get_matching_activations")
+    float_dict = get_logger_dict(float_module)
+    quantized_dict = get_logger_dict(q_module)
+    act_dict: Dict[str, Dict] = {}
+    for key in quantized_dict:
+        if len(quantized_dict[key]["tensor_val"]) == 0:
+            continue
+        match_key = _find_match(sorted(float_dict, reverse=True), key, "stats")
+        if match_key is not None:
+            act_dict[key] = {}
+            act_dict[key]["float"] = float_dict[match_key]["tensor_val"]
+            act_dict[key]["quantized"] = quantized_dict[key]["tensor_val"]
+    return act_dict
+
+
+def prepare_model_outputs(
+    float_module: nn.Module,
+    q_module: nn.Module,
+    logger_cls=OutputLogger,
+    allow_list=None
+) -> None:
+    r"""Prepare the model by attaching the logger to both float module
+    and quantized module if they are in the allow_list.
+
+    Args:
+        float_module: float module used to generate the q_module
+        q_module: module quantized from float_module
+        logger_cls: type of logger to be attached to float_module and q_module
+        allow_list: list of module types to attach logger
+    """
+    torch._C._log_api_usage_once("quantization_api._numeric_suite.prepare_model_outputs")
+    if allow_list is None:
+        allow_list = get_default_compare_output_module_list()
+
+    qconfig_debug = torch.ao.quantization.QConfig(activation=logger_cls, weight=None)
+    float_module.qconfig = qconfig_debug  # type: ignore[assignment]
+    prepare(float_module, inplace=True, allow_list=allow_list, prepare_custom_config_dict={})
+    q_module.qconfig = qconfig_debug  # type: ignore[assignment]
+    prepare(
+        q_module,
+        inplace=True,
+        allow_list=allow_list,
+        observer_non_leaf_module_list=NON_LEAF_MODULE_TO_ADD_OBSERVER_ALLOW_LIST,
+        prepare_custom_config_dict={}
+    )
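+
+# Illustrative flow (not part of the library): this is essentially what
+# compare_model_outputs() below does under the hood.
+#
+#     prepare_model_outputs(float_model, q_model)
+#     float_model(*data)
+#     q_model(*data)
+#     act_compare_dict = get_matching_activations(float_model, q_model)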
+
+
+def compare_model_outputs(
+    float_model: nn.Module,
+    q_model: nn.Module,
+    *data,
+    logger_cls=OutputLogger,
+    allow_list=None
+) -> Dict[str, Dict[str, torch.Tensor]]:
+    r"""Compare output activations between float and quantized models at
+    corresponding locations for the same input. Return a dict with key corresponding
+    to quantized module names and each entry being a dictionary with two keys
+    'float' and 'quantized', containing the activations of quantized model and
+    float model at matching locations. This dict can be used to compare and
+    compute the propagation quantization error.
+
+    Example usage::
+
+        act_compare_dict = compare_model_outputs(float_model, qmodel, data)
+        for key in act_compare_dict:
+            print(
+                key,
+                compute_error(
+                    act_compare_dict[key]['float'],
+                    act_compare_dict[key]['quantized'].dequantize()
+                )
+            )
+
+    Args:
+        float_model: float model used to generate the q_model
+        q_model: model quantized from float_model
+        data: input data used to run the prepared float_model and q_model
+        logger_cls: type of logger to be attached to float_module and q_module
+        allow_list: list of module types to attach logger
+
+    Return:
+        act_compare_dict: dict with key corresponding to quantized module names
+        and each entry being a dictionary with two keys 'float' and 'quantized',
+        containing the matching float and quantized activations
+    """
+    torch._C._log_api_usage_once("quantization_api._numeric_suite.compare_model_outputs")
+    if allow_list is None:
+        allow_list = get_default_compare_output_module_list()
+    prepare_model_outputs(float_model, q_model, logger_cls, allow_list)
+    float_model(*data)
+    q_model(*data)
+    act_compare_dict = get_matching_activations(float_model, q_model)
+    return act_compare_dict
diff --git a/MLPY/Lib/site-packages/torch/ao/ns/_numeric_suite_fx.py b/MLPY/Lib/site-packages/torch/ao/ns/_numeric_suite_fx.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce1e5e574b66281024cd64bac940fcf8a7fa8bc3
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/ns/_numeric_suite_fx.py
@@ -0,0 +1,1025 @@
+"""
+This module contains tooling to compare weights and activations
+across models. Example usage::
+
+    import copy
+    import torch
+    import torch.ao.quantization.quantize_fx as quantize_fx
+    import torch.ao.ns._numeric_suite_fx as ns
+
+    m = torch.nn.Sequential(torch.nn.Conv2d(1, 1, 1)).eval()
+    mp = quantize_fx.prepare_fx(m, {'': torch.ao.quantization.default_qconfig})
+    # We convert a copy because we need the original prepared model
+    # to be available for comparisons, and `quantize_fx.convert_fx` is inplace.
+    mq = quantize_fx.convert_fx(copy.deepcopy(mp))
+
+    #
+    # Comparing weights
+    #
+
+    # extract weight pairs
+    weight_comparison = ns.extract_weights('a', mp, 'b', mq)
+
+    # add SQNR for each comparison, inplace
+    ns.extend_logger_results_with_comparison(
+        weight_comparison, 'a', 'b', torch.ao.ns.fx.utils.compute_sqnr,
+        'sqnr')
+
+    # weight_comparison contains the weights from `mp` and `mq` stored
+    # in pairs, and can be used for further analysis.
+
+
+    #
+    # Comparing activations, with error propagation
+    #
+
+    # add loggers
+    mp_ns, mq_ns = ns.add_loggers(
+        'a', copy.deepcopy(mp),
+        'b', copy.deepcopy(mq),
+        ns.OutputLogger)
+
+    # send an example datum to capture intermediate activations
+    datum = torch.randn(1, 1, 1, 1)
+    mp_ns(datum)
+    mq_ns(datum)
+
+    # extract intermediate activations
+    act_comparison = ns.extract_logger_info(
+        mp_ns, mq_ns, ns.OutputLogger, 'b')
+
+    # add SQNR for each comparison, inplace
+    ns.extend_logger_results_with_comparison(
+        act_comparison, 'a', 'b', torch.ao.ns.fx.utils.compute_sqnr,
+        'sqnr')
+
+    # act_comparison contains the activations from `mp_ns` and `mq_ns` stored
+    # in pairs, and can be used for further analysis.
+
+    #
+    # Comparing activations, without error propagation
+    #
+
+    # create shadow model
+    mp_shadows_mq = ns.add_shadow_loggers(
+        'a', copy.deepcopy(mp),
+        'b', copy.deepcopy(mq),
+        ns.OutputLogger)
+
+    # send an example datum to capture intermediate activations
+    datum = torch.randn(1, 1, 1, 1)
+    mp_shadows_mq(datum)
+
+    # extract intermediate activations
+    shadow_act_comparison = ns.extract_shadow_logger_info(
+        mp_shadows_mq, ns.OutputLogger, 'b')
+
+    # add SQNR for each comparison, inplace
+    ns.extend_logger_results_with_comparison(
+        shadow_act_comparison, 'a', 'b', torch.ao.ns.fx.utils.compute_sqnr,
+        'sqnr')
+
+    # shadow_act_comparison contains the activations from `mp_ns` and `mq_ns` stored
+    # in pairs, and can be used for further analysis.
+
+"""
+
+import collections
+
+import torch
+import torch.nn as nn
+import torch.ao.quantization.quantize_fx as quantize_fx
+from torch.fx import GraphModule
+from torch.fx.graph import Node
+from torch.ao.ns.fx.mappings import (
+    get_base_name_to_sets_of_related_ops,
+)
+from torch.ao.ns.fx.graph_matcher import (
+    get_matching_subgraph_pairs,
+    get_type_a_related_to_b,
+)
+
+from .fx.weight_utils import (
+    extract_weight_from_node,
+)
+
+from .fx.graph_passes import (
+    add_loggers_to_model,
+    create_a_shadows_b,
+)
+
+from .fx.utils import (
+    rekey_logger_info_on_node_name_of_model,
+    maybe_add_missing_fqns,
+    get_target_type_str,
+)
+
+from .fx.ns_types import (
+    NSSingleResultValuesType,
+    NSResultsType,
+    NSNodeTargetType,
+)
+from torch.ao.quantization.backend_config.utils import get_fusion_pattern_to_root_node_getter
+from torch.ao.quantization.backend_config import BackendConfig
+from torch.ao.quantization.fx.match_utils import _find_matches
+from torch.ao.quantization.fx.graph_module import _get_observed_graph_module_attr
+from torch.ao.quantization.fx.qconfig_mapping_utils import _generate_node_name_to_qconfig
+from torch.ao.quantization.fx.quantize_handler import _get_pattern_to_quantize_handlers
+from torch.ao.quantization.qconfig import QConfigAny
+from torch.ao.quantization import QConfigMapping
+from torch.ao.ns.fx.n_shadows_utils import (
+    OutputProp,
+    _get_dedup_subgraphs,
+    SHADOW_WRAPPER_NODE_NAME_PREFIX,
+    group_results_by_subgraph,
+    create_results_comparison,
+    print_n_shadows_summary,
+    create_n_transformed_and_logged_copies_of_subgraph,
+    create_add_loggers_graph,
+    extract_weight_comparison,
+)
+from torch.ao.ns.fx.qconfig_multi_mapping import QConfigMultiMapping
+
+from typing import Dict, Tuple, Callable, List, Optional, Set, Any, Type
+
+RNNReturnType = Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]
+
+class OutputLogger(nn.Module):
+    """
+    Base class for capturing intermediate values.
+    """
+    stats: List[torch.Tensor]
+    stats_rnn: List[RNNReturnType]
+
+    # Mark as impure so that calls to it will not be removed during DCE.
+    _is_impure = True
+
+    def __init__(
+        self,
+        ref_node_name: str,
+        prev_node_name: str,
+        model_name: str,
+        ref_name: str,
+        prev_node_target_type: str,
+        ref_node_target_type: str,
+        results_type: str,
+        index_within_arg: int,
+        index_of_arg: int,
+        fqn: Optional[str],
+        qconfig_str: Optional[str] = '',
+    ):
+        super().__init__()
+        self.stats: List[torch.Tensor] = []
+        self.stats_rnn: List[RNNReturnType] = []
+
+        # name of the node which was responsible for adding this logger
+        # Note:
+        # - if we are logging node outputs, this is the same as prev_node_name
+        # - if we are logging node inputs, this is the name of the node
+        #   whose input this logger is logging.
+        #
+        # example, where logger1 is logging input of op1 and logger2 is logging
+        #    the output of op1:
+        #
+        #  x1 -> logger1 -> op1 -> logger2 -> x2
+        #
+        # in this example,
+        #   - logger1's prev_node_name is x1 and ref_node_name is op1
+        #   - logger2's prev_node_name is op1 and ref_node_name is op1
+        self.ref_node_name = ref_node_name
+        # name of the node whose output this Logger is capturing
+        self.prev_node_name = prev_node_name
+
+        # name of the model from which the node originated from
+        self.model_name = model_name
+        # reference name, used to match loggers from separate models
+        # to each other
+        self.ref_name = ref_name
+        # type of the target of the node whose output this logger is logging
+        self.prev_node_target_type = prev_node_target_type
+        # type of the target of the node which was responsible for adding this
+        # logger
+        self.ref_node_target_type = ref_node_target_type
+        # what kind of values are inside of stats
+        self.results_type = results_type
+        # index of this node within the arg of the input/output node
+        # for example, in cat([x1, x2, x3], dim=0), x2 would have index_within_arg == 1
+        self.index_within_arg = index_within_arg
+        # index of this node within the args of the input/output node
+        # for example, in add(x1, x2), x2 would have index_of_arg == 1
+        self.index_of_arg = index_of_arg
+        # fully qualified name
+        self.fqn = fqn
+        # if loggers are added before prepare_fx, we may not want to
+        # collect results during calibration, only results after convert_fx,
+        # so we add a flag to control whether this logger collects data
+        self.enabled = True
+        # string representation of qconfig
+        self.qconfig_str = qconfig_str
+        # this can be turned off to reduce memory usage during calibration
+        self.save_activations = True
+
+    # Note: cannot annotate the type of x because TorchScript does not support
+    #   the Union type.
+    def forward(self, x):
+        """
+        """  # blank docblock to make autodoc happy
+        # TODO(future PR): consider designing this better, as the difference
+        # between these two flags is subtle and not obvious.
+        if not self.enabled:
+            return x
+        if not self.save_activations:
+            return x
+        # TODO(future PR): consider refactoring this to better reuse the parent
+        # class
+        if isinstance(x, torch.Tensor):
+            self.stats.append(x.detach())
+        elif isinstance(x, tuple) and len(x) == 2 and len(x[1]) == 2:
+            new_res = (x[0].detach(), (x[1][0].detach(), x[1][1].detach()))
+            self.stats_rnn.append(new_res)
+        return x
+
+    def __repr__(self):
+        clean_dict = {
+            k: v
+            for k, v in self.__dict__.items()
+            # skip nn.Module keys
+            if (k != 'training') and not k.startswith('_')
+        }
+        return f"OutputLogger({clean_dict})"
+
+
+class OutputComparisonLogger(OutputLogger):
+    """
+    Same as OutputLogger, but also requires the original activation
+    in order to calculate the comparison at calibration time
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # TODO(future PR): make the comparison function configurable
+        self.comparison_fn = torch.ao.ns.fx.utils.compute_sqnr
+        self.comparison_fn_name = 'sqnr'
+        # precalculated comparisons of logger output versus reference
+        self.comparisons = []
+
+    def forward(self, x, x_ref):
+        """
+        """  # blank docblock to make autodoc happy
+        if not self.enabled:
+            return x
+        assert isinstance(x, torch.Tensor), 'non-tensor inputs not yet supported'
+        if self.save_activations:
+            # save the activation, for debugging
+            self.stats.append(x.detach())
+        # save the comparison
+        self.comparisons.append(self.comparison_fn(x, x_ref))
+        return x
+
+    def __repr__(self):
+        clean_dict = {
+            k: v
+            for k, v in self.__dict__.items()
+            # skip nn.Module keys
+            if (k != 'training') and not k.startswith('_')
+        }
+        return f"OutputComparisonLogger({clean_dict})"
+
+
+class NSTracer(quantize_fx.QuantizationTracer):
+    """
+    Just like a regular FX quantization tracer, but treats observers and fake_quantize
+    modules as leaf modules.
+    """
+    def is_leaf_module(self, m: torch.nn.Module, module_qualified_name : str) -> bool:
+        """
+        """  # blank docblock to make autodoc happy
+        if isinstance(m, torch.ao.quantization.ObserverBase):
+            return True
+        elif isinstance(m, torch.ao.quantization.FakeQuantizeBase):
+            return True
+        return super().is_leaf_module(m, module_qualified_name)
+
+
+def _extract_weights_one_model(
+    model_name: str,
+    model: GraphModule,
+    nodes_and_names_to_instrument: List[Tuple[Node, str]],
+    results: NSResultsType,
+    op_to_type_to_weight_extraction_fn: Optional[Dict[str, Dict[Callable, Callable]]] = None,
+) -> None:
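+    # Populates `results` in place. Approximate layout of the nested dict built
+    # below (illustrative sketch; the concrete results_type string comes from
+    # NSSingleResultValuesType.WEIGHT.value):
+    #
+    #   results[ref_name][results_type][model_name] = [extracted_weight]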
+    torch._C._log_api_usage_once("quantization_api._numeric_suite_fx._extract_weights_one_model")
+    for node, ref_name in nodes_and_names_to_instrument:
+        res_type = NSSingleResultValuesType.WEIGHT.value
+        extracted_weight = extract_weight_from_node(
+            node, model, op_to_type_to_weight_extraction_fn)
+        if extracted_weight:
+            if ref_name not in results:
+                results[ref_name] = {res_type: {}}
+            results[ref_name][res_type][model_name] = [extracted_weight]
+
+
+def _extract_weights_impl(
+    model_name_a: str,
+    gm_a: GraphModule,
+    model_name_b: str,
+    gm_b: GraphModule,
+    base_name_to_sets_of_related_ops: Optional[Dict[str, Set[NSNodeTargetType]]] = None,
+    unmatchable_types_map: Optional[Dict[str, Set[NSNodeTargetType]]] = None,
+    op_to_type_to_weight_extraction_fn: Optional[Dict[str, Dict[Callable, Callable]]] = None,
+) -> NSResultsType:
+    torch._C._log_api_usage_once("quantization_api._numeric_suite_fx._extract_weights_impl")
+    matched_subgraph_pairs = get_matching_subgraph_pairs(
+        gm_a, gm_b, base_name_to_sets_of_related_ops,
+        unmatchable_types_map)
+
+    # split the subgraph pairs into one data structure for each model
+    nodes_and_names_to_instrument_a: List[Tuple[Node, str]] = []
+    nodes_and_names_to_instrument_b: List[Tuple[Node, str]] = []
+    for match_name, match in matched_subgraph_pairs.items():
+        subgraph_a, subgraph_b = match
+        nodes_and_names_to_instrument_a.append((subgraph_a.base_op_node, match_name))
+        nodes_and_names_to_instrument_b.append((subgraph_b.base_op_node, match_name))
+
+    # populate the results, one model at a time
+    results: NSResultsType = {}
+    _extract_weights_one_model(
+        model_name_a, gm_a, nodes_and_names_to_instrument_a, results,
+        op_to_type_to_weight_extraction_fn)
+    _extract_weights_one_model(
+        model_name_b, gm_b, nodes_and_names_to_instrument_b, results,
+        op_to_type_to_weight_extraction_fn)
+
+    # fill in missing fqn entries
+    maybe_add_missing_fqns(results)
+
+    # rekey on names of nodes in gm_b
+    results = rekey_logger_info_on_node_name_of_model(results, model_name_b)
+
+    return results
+
+
+def extract_weights(
+    model_name_a: str,
+    model_a: nn.Module,
+    model_name_b: str,
+    model_b: nn.Module,
+    base_name_to_sets_of_related_ops: Optional[Dict[str, Set[NSNodeTargetType]]] = None,
+    unmatchable_types_map: Optional[Dict[str, Set[NSNodeTargetType]]] = None,
+    op_to_type_to_weight_extraction_fn: Optional[Dict[str, Dict[Callable, Callable]]] = None,
+) -> NSResultsType:
+    """
+    Extract weights from model A and model B, and return a comparison.
+
+    Args:
+        model_name_a: string name of model A to use in results
+        model_a: model A
+        model_name_b: string name of model B to use in results
+        model_b: model B
+        base_name_to_sets_of_related_ops: optional override of subgraph base nodes, subject to change
+        unmatchable_types_map: optional override of unmatchable types, subject to change
+        op_to_type_to_weight_extraction_fn: optional override of function which extracts weight
+            from a type, subject to change
+
+    Return:
+        NSResultsType, containing the weight comparisons
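+
+    Example (illustrative sketch, not part of the original docs; ``m`` is a
+    hypothetical float model and ``mq`` its hypothetical quantized copy)::
+
+        results = extract_weights('fp32', m, 'int8', mq)
+        extend_logger_results_with_comparison(
+            results, 'fp32', 'int8', torch.ao.ns.fx.utils.compute_sqnr, 'sqnr')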
+    """
+
+    torch._C._log_api_usage_once("quantization_api._numeric_suite_fx.extract_weights")
+    if base_name_to_sets_of_related_ops is None:
+        base_name_to_sets_of_related_ops = \
+            get_base_name_to_sets_of_related_ops()
+    type_a_related_to_b = \
+        get_type_a_related_to_b(base_name_to_sets_of_related_ops)
+
+    # TODO(future PR): expose these
+    skipped_module_names: List[str] = []
+    skipped_module_classes: List[Callable] = []
+    tracer_a = NSTracer(skipped_module_names, skipped_module_classes)
+    tracer_b = NSTracer(skipped_module_names, skipped_module_classes)
+    gm_a = GraphModule(model_a, tracer_a.trace(model_a))
+    maybe_model_a_node_name_to_scope = _get_observed_graph_module_attr(model_a, 'node_name_to_scope')
+    if maybe_model_a_node_name_to_scope is not None:
+        gm_a._node_name_to_scope = maybe_model_a_node_name_to_scope
+    gm_b = GraphModule(model_b, tracer_b.trace(model_b))
+    maybe_model_b_node_name_to_scope = _get_observed_graph_module_attr(model_b, 'node_name_to_scope')
+    if maybe_model_b_node_name_to_scope is not None:
+        gm_b._node_name_to_scope = maybe_model_b_node_name_to_scope
+    return _extract_weights_impl(
+        model_name_a, gm_a, model_name_b, gm_b, base_name_to_sets_of_related_ops,
+        unmatchable_types_map, op_to_type_to_weight_extraction_fn)
+
+
+def _add_loggers_one_model(
+    model_name: str,
+    model: GraphModule,
+    nodes_and_names_to_instrument_inputs: List[Tuple[Node, str, str]],
+    nodes_and_names_to_instrument_outputs: List[Tuple[Node, str, str]],
+    logger_cls: Callable,
+) -> nn.Module:
+    torch._C._log_api_usage_once("quantization_api._numeric_suite_fx._add_loggers_one_model")
+
+    # TODO(future PR): do not observe nodes we do not care
+    #   about (both fp32, denylist, etc)
+    node_to_instrument_inputs_to_ref_name: Dict[Node, Tuple[str, str]] = {}
+    node_to_instrument_outputs_to_ref_name: Dict[Node, Tuple[str, str]] = {}
+    for node, ref_name, ref_node_type in nodes_and_names_to_instrument_inputs:
+        node_to_instrument_inputs_to_ref_name[node] = (ref_name, ref_node_type)
+    for node, ref_name, ref_node_type in nodes_and_names_to_instrument_outputs:
+        node_to_instrument_outputs_to_ref_name[node] = (ref_name, ref_node_type)
+
+    model = add_loggers_to_model(
+        model, node_to_instrument_inputs_to_ref_name,
+        node_to_instrument_outputs_to_ref_name, logger_cls, model_name)
+    return model
+
+
+def _add_loggers_impl(
+    name_a: str,
+    gm_a: GraphModule,
+    name_b: str,
+    gm_b: GraphModule,
+    logger_cls: Callable,
+    should_log_inputs: bool,
+    base_name_to_sets_of_related_ops: Optional[Dict[str, Set[NSNodeTargetType]]] = None,
+    unmatchable_types_map: Optional[Dict[str, Set[NSNodeTargetType]]] = None,
+) -> Tuple[nn.Module, nn.Module]:
+    torch._C._log_api_usage_once("quantization_api._numeric_suite_fx._add_loggers_impl")
+    matched_subgraph_pairs = get_matching_subgraph_pairs(
+        gm_a, gm_b,
+        base_name_to_sets_of_related_ops, unmatchable_types_map)
+    nodes_and_names_to_instrument_inputs_a = []
+    nodes_and_names_to_instrument_inputs_b = []
+    nodes_and_names_to_instrument_outputs_a = []
+    nodes_and_names_to_instrument_outputs_b = []
+    for match_name, (subgraph_a, subgraph_b) in matched_subgraph_pairs.items():
+        ref_node_type_a = get_target_type_str(subgraph_a.base_op_node, gm_a)
+        ref_node_type_b = get_target_type_str(subgraph_b.base_op_node, gm_b)
+        # Note: for matching inputs we use start_node, such as observing
+        # the input of linear in linear-relu
+        if should_log_inputs:
+            nodes_and_names_to_instrument_inputs_a.append(
+                (subgraph_a.start_node, match_name, ref_node_type_a))
+            nodes_and_names_to_instrument_inputs_b.append(
+                (subgraph_b.start_node, match_name, ref_node_type_b))
+        # Note: for matching activations we always use end_node,
+        # such as observing the output of relu in linear-relu
+        nodes_and_names_to_instrument_outputs_a.append(
+            (subgraph_a.end_node, match_name, ref_node_type_a))
+        nodes_and_names_to_instrument_outputs_b.append(
+            (subgraph_b.end_node, match_name, ref_node_type_b))
+
+    new_model_a = _add_loggers_one_model(
+        name_a, gm_a, nodes_and_names_to_instrument_inputs_a,
+        nodes_and_names_to_instrument_outputs_a, logger_cls)
+    new_model_b = _add_loggers_one_model(
+        name_b, gm_b, nodes_and_names_to_instrument_inputs_b,
+        nodes_and_names_to_instrument_outputs_b, logger_cls)
+    return (new_model_a, new_model_b)
+
+
+def add_loggers(
+    name_a: str,
+    model_a: nn.Module,
+    name_b: str,
+    model_b: nn.Module,
+    logger_cls: Callable,
+    should_log_inputs : bool = False,
+    base_name_to_sets_of_related_ops: Optional[Dict[str, Set[NSNodeTargetType]]] = None,
+    unmatchable_types_map: Optional[Dict[str, Set[NSNodeTargetType]]] = None,
+) -> Tuple[nn.Module, nn.Module]:
+    """
+    Instrument model A and model B with loggers.
+
+    Args:
+        name_a: string name of model A to use in results
+        model_a: model A
+        name_b: string name of model B to use in results
+        model_b: model B
+        logger_cls: class of Logger to use
+        should_log_inputs: whether to log inputs in addition to outputs
+        base_name_to_sets_of_related_ops: optional override of subgraph base nodes, subject to change
+        unmatchable_types_map: optional override of unmatchable types, subject to change
+
+    Return:
+        Returns a tuple of (model_a_with_loggers, model_b_with_loggers).  Modifies both models in place.
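+
+    Example (illustrative sketch, not part of the original docs; ``m``, ``mq``
+    and ``datum`` are hypothetical)::
+
+        m_ns, mq_ns = add_loggers('fp32', m, 'int8', mq, OutputLogger)
+        m_ns(datum)
+        mq_ns(datum)
+        results = extract_logger_info(m_ns, mq_ns, OutputLogger, 'int8')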
+    """
+
+    torch._C._log_api_usage_once("quantization_api._numeric_suite_fx.add_loggers")
+    # TODO(future PR): expose these
+    skipped_module_names: List[str] = []
+    skipped_module_classes: List[Callable] = []
+    tracer_a = NSTracer(skipped_module_names, skipped_module_classes)
+    tracer_b = NSTracer(skipped_module_names, skipped_module_classes)
+    gm_a = GraphModule(model_a, tracer_a.trace(model_a))
+    maybe_model_a_node_name_to_scope = _get_observed_graph_module_attr(model_a, 'node_name_to_scope')
+    if maybe_model_a_node_name_to_scope is not None:
+        gm_a._node_name_to_scope = maybe_model_a_node_name_to_scope
+    gm_b = GraphModule(model_b, tracer_b.trace(model_b))
+    maybe_model_b_node_name_to_scope = _get_observed_graph_module_attr(model_b, 'node_name_to_scope')
+    if maybe_model_b_node_name_to_scope is not None:
+        gm_b._node_name_to_scope = maybe_model_b_node_name_to_scope
+    return _add_loggers_impl(
+        name_a, gm_a, name_b, gm_b, logger_cls,
+        should_log_inputs=should_log_inputs,
+        base_name_to_sets_of_related_ops=base_name_to_sets_of_related_ops,
+        unmatchable_types_map=unmatchable_types_map)
+
+
+def _extract_logger_info_one_model(
+    model: nn.Module,
+    results: NSResultsType,
+    logger_cls: Callable,
+) -> None:
+    torch._C._log_api_usage_once("quantization_api._numeric_suite_fx._extract_logger_info_one_model")
+    for gm_name, mod in model.named_modules():
+        # TODO(future PR): better check when scripted
+        is_logger = (
+            isinstance(mod, logger_cls)  # type: ignore[arg-type]
+            or (
+                isinstance(mod, torch.jit.RecursiveScriptModule)
+                and mod.original_name == 'OutputLogger'
+            )
+        )
+        if is_logger:
+            key = mod.ref_name
+            if key not in results:
+                results[key] = {}
+            assert mod.model_name not in results[key], \
+                f"{mod.model_name} is already present in results"
+            if mod.results_type not in results[key]:
+                results[key][mod.results_type] = {}
+            if mod.model_name not in results[key][mod.results_type]:
+                results[key][mod.results_type][mod.model_name] = []
+            stats_to_use = mod.stats
+            if len(mod.stats_rnn) > 0:
+                stats_to_use = mod.stats_rnn
+            data = {
+                'type': mod.results_type,
+                'values': stats_to_use,
+                'ref_node_name': mod.ref_node_name,
+                'ref_node_target_type': mod.ref_node_target_type,
+                'prev_node_name': mod.prev_node_name,
+                'prev_node_target_type': mod.prev_node_target_type,
+                'index_within_arg': mod.index_within_arg,
+                'index_of_arg': mod.index_of_arg,
+                'fqn': mod.fqn,
+                'qconfig_str': mod.qconfig_str,
+            }
+            if hasattr(mod, 'comparisons'):
+                data['comparisons'] = mod.comparisons
+                data['comparison_fn_name'] = mod.comparison_fn_name
+            else:
+                data['comparisons'] = []
+                data['comparison_fn_name'] = ''
+            results[key][mod.results_type][mod.model_name].append(data)
+            # ensure the list stays sorted
+            results[key][mod.results_type][mod.model_name].sort(
+                key=lambda res:
+                f"{res['index_of_arg']}:{res['index_within_arg']}"
+            )
+
+
+# TODO(future PR): align on naming
+# this is equivalent of just the comparison extraction part of `ns.compare_model_outputs`
+def extract_logger_info(
+    model_a: nn.Module,
+    model_b: nn.Module,
+    logger_cls: Callable,
+    model_name_to_use_for_layer_names: str,
+) -> NSResultsType:
+    """
+    Traverse all loggers in `model_a` and `model_b`, and extract the logged
+    information.
+
+    Args:
+        model_a: model A
+        model_b: model B
+        logger_cls: class of Logger to use
+        model_name_to_use_for_layer_names: string name of model to use for
+          layer names in the output
+
+    Return:
+        NSResultsType, containing the logged comparisons
+    """
+    torch._C._log_api_usage_once("quantization_api._numeric_suite_fx.extract_logger_info")
+    results: NSResultsType = {}
+    for model in (model_a, model_b):
+        _extract_logger_info_one_model(model, results, logger_cls)
+    # fill in missing fqn entries
+    maybe_add_missing_fqns(results)
+    # rekey on the name of model b
+    results = rekey_logger_info_on_node_name_of_model(
+        results, model_name_to_use_for_layer_names)
+    return results
+
+
+def _add_shadow_loggers_impl(
+    name_a: str,
+    gm_a: GraphModule,
+    name_b: str,
+    gm_b: GraphModule,
+    logger_cls: Callable,
+    should_log_inputs: bool,
+    base_name_to_sets_of_related_ops: Optional[Dict[str, Set[NSNodeTargetType]]] = None,
+    node_type_to_io_type_map: Optional[Dict[str, Set[NSNodeTargetType]]] = None,
+    unmatchable_types_map: Optional[Dict[str, Set[NSNodeTargetType]]] = None,
+) -> nn.Module:
+    torch._C._log_api_usage_once("quantization_api._numeric_suite_fx._add_shadow_loggers_impl")
+    matched_subgraph_pairs = get_matching_subgraph_pairs(
+        gm_a, gm_b, base_name_to_sets_of_related_ops,
+        unmatchable_types_map)
+    gm_a_shadows_b = create_a_shadows_b(
+        name_a, gm_a, name_b, gm_b, matched_subgraph_pairs, logger_cls,
+        should_log_inputs=should_log_inputs,
+        node_type_to_io_type_map=node_type_to_io_type_map)
+    return gm_a_shadows_b
+
+
+def add_shadow_loggers(
+    name_a: str,
+    model_a: nn.Module,
+    name_b: str,
+    model_b: nn.Module,
+    logger_cls: Callable,
+    should_log_inputs: bool = False,
+    base_name_to_sets_of_related_ops: Optional[Dict[str, Set[NSNodeTargetType]]] = None,
+    node_type_to_io_type_map: Optional[Dict[str, Set[NSNodeTargetType]]] = None,
+    unmatchable_types_map: Optional[Dict[str, Set[NSNodeTargetType]]] = None,
+) -> nn.Module:
+    """
+    Instrument model A and model B with shadow loggers.
+
+    Args:
+        name_a: string name of model A to use in results
+        model_a: model A
+        name_b: string name of model B to use in results
+        model_b: model B
+        logger_cls: class of Logger to use
+        should_log_inputs: whether to log inputs
+        base_name_to_sets_of_related_ops: optional override of subgraph base nodes, subject to change
+        node_type_to_io_type_map: optional override of node type to IO dtype mapping, subject to change
+        unmatchable_types_map: optional override of unmatchable types, subject to change
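+
+    Example (illustrative sketch, not part of the original docs; ``m``, ``mq``
+    and ``datum`` are hypothetical)::
+
+        shadow_model = add_shadow_loggers(
+            'fp32', m, 'int8', mq, OutputComparisonLogger)
+        shadow_model(datum)
+        results = extract_shadow_logger_info(
+            shadow_model, OutputComparisonLogger, 'int8')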
+    """
+    torch._C._log_api_usage_once("quantization_api._numeric_suite_fx.add_shadow_loggers")
+    # TODO(future PR): expose these
+    skipped_module_names: List[str] = []
+    skipped_module_classes: List[Callable] = []
+    tracer_a = NSTracer(skipped_module_names, skipped_module_classes)
+    tracer_b = NSTracer(skipped_module_names, skipped_module_classes)
+    gm_a = GraphModule(model_a, tracer_a.trace(model_a))
+    maybe_model_a_node_name_to_scope = _get_observed_graph_module_attr(model_a, 'node_name_to_scope')
+    if maybe_model_a_node_name_to_scope is not None:
+        gm_a._node_name_to_scope = maybe_model_a_node_name_to_scope
+    gm_b = GraphModule(model_b, tracer_b.trace(model_b))
+    maybe_model_b_node_name_to_scope = _get_observed_graph_module_attr(model_b, 'node_name_to_scope')
+    if maybe_model_b_node_name_to_scope is not None:
+        gm_b._node_name_to_scope = maybe_model_b_node_name_to_scope
+    return _add_shadow_loggers_impl(
+        name_a, gm_a, name_b, gm_b, logger_cls,
+        should_log_inputs=should_log_inputs,
+        base_name_to_sets_of_related_ops=base_name_to_sets_of_related_ops,
+        node_type_to_io_type_map=node_type_to_io_type_map,
+        unmatchable_types_map=unmatchable_types_map)
+
+
+def extract_shadow_logger_info(
+    model_a_shadows_b: nn.Module,
+    logger_cls: Callable,
+    model_name_to_use_for_layer_names: str,
+) -> NSResultsType:
+    """
+    Traverse all loggers in a shadow model, and extract the logged
+    information.
+
+    Args:
+        model_a_shadows_b: shadow model
+        logger_cls: class of Logger to use
+        model_name_to_use_for_layer_names: string name of model to use for
+          layer names in the output
+
+    Return:
+        NSResultsType, containing the logged comparisons
+    """
+    torch._C._log_api_usage_once("quantization_api._numeric_suite_fx.extract_shadow_logger_info")
+    results: NSResultsType = collections.defaultdict(dict)
+    _extract_logger_info_one_model(model_a_shadows_b, results, logger_cls)
+    # fill in missing fqn entries
+    maybe_add_missing_fqns(results)
+    # rekey on the name of model b
+    results = rekey_logger_info_on_node_name_of_model(
+        results, model_name_to_use_for_layer_names)
+    return dict(results)
+
+
+def extend_logger_results_with_comparison(
+    results: NSResultsType,
+    model_name_1: str,
+    model_name_2: str,
+    comparison_fn: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],
+    comparison_name: str,
+) -> None:
+    """
+    Compares the logged values from `model_name_2` against the corresponding
+    values in `model_name_1`, using `comparison_fn`. Records the result
+    in `model_name_2`'s results under `comparison_name`. Modifies `results` in place.
+
+    Args:
+        results: the result data structure from `extract_logger_info` or
+          `extract_shadow_logger_info`.
+        model_name_1: string name of model 1
+        model_name_2: string name of model 2
+        comparison_fn: function to compare two Tensors
+        comparison_name: string name to use for the comparison in the
+          output results
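+
+    Example (illustrative sketch; assumes ``results`` contains entries for
+    models named ``'fp32'`` and ``'int8'``)::
+
+        extend_logger_results_with_comparison(
+            results, 'fp32', 'int8', torch.ao.ns.fx.utils.compute_sqnr, 'sqnr')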
+    """
+    for results_type_to_results in results.values():
+        for model_name_to_results in results_type_to_results.values():
+            assert model_name_1 in model_name_to_results, \
+                f"{model_name_1} not found in results"
+            assert model_name_2 in model_name_to_results, \
+                f"{model_name_2} not found in results"
+
+            results_1 = model_name_to_results[model_name_1]
+            results_2 = model_name_to_results[model_name_2]
+
+            for result_2 in results_2:
+                index_within_arg_2 = result_2['index_within_arg']
+                index_of_arg_2 = result_2['index_of_arg']
+                # find corresponding result_1
+                result_1 = None
+                for cur_result_1 in results_1:
+                    index_within_arg_1 = cur_result_1['index_within_arg']
+                    index_of_arg_1 = cur_result_1['index_of_arg']
+                    if (
+                        (index_within_arg_1 == index_within_arg_2) and
+                        (index_of_arg_1 == index_of_arg_2)
+                    ):
+                        result_1 = cur_result_1
+                        break
+                assert result_1 is not None
+
+                values_1 = result_1['values']
+                values_2 = result_2['values']
+                result_2[comparison_name] = []
+                for value_1, value_2 in zip(values_1, values_2):
+                    comparison_result = comparison_fn(value_1, value_2)
+                    result_2[comparison_name].append(comparison_result)
+
+def prepare_n_shadows_model(
+    model: torch.nn.Module,
+    example_inputs: Any,
+    qconfig_multi_mapping: QConfigMultiMapping,
+    backend_config: BackendConfig,
+    custom_prepare_fn: Optional[Callable] = None,
+    custom_prepare_kwargs: Optional[Dict[str, Any]] = None,
+    custom_tracer: Any = None,
+) -> GraphModule:
+    """
+    Given a model with a graph with M ops such as
+
+    .. code::
+
+      args_kwargs_m -> op_m -> output_m
+
+    and a set of N qconfigs for each op, creates a new model, with
+    the subgraph of each `op_m` transformed into
+
+    .. code::
+
+           |---------> op_m_n -> log_m_n
+           |                     /
+      args_kwargs_m ---------> op_m -> log_m_0
+
+    Where op_m_n is op_m wrapped in a submodule and transformed with
+    qconfig_n, and its inner graph looks like
+
+    .. code::
+
+      args_m -------- op_m_prepared_with_qconfig_n -> out_m_n
+                  /
+      kwargs_m ---
+
+    This is useful for testing different quantization configurations for
+    multiple layers in a single pass through the model.
+
+    High level TODOs for future PRs:
+    * figure out a better way to name the output structure
+    * return a results data structure instead of printing it out
+    * add examples to docblocks
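+
+    Example (illustrative sketch, not part of the original docs; ``m``,
+    ``example_inputs``, ``qconfig_multi_mapping`` and ``backend_config``
+    are hypothetical)::
+
+        mp = prepare_n_shadows_model(
+            m, example_inputs, qconfig_multi_mapping, backend_config)
+        mp(*example_inputs)  # calibration
+        mq = convert_n_shadows_model(mp)
+        mq(*example_inputs)
+        results = extract_results_n_shadows_model(mq)
+        print_comparisons_n_shadows_model(results)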
+    """
+
+    if custom_tracer is None:
+        tracer = quantize_fx.QuantizationTracer([], [])
+    else:
+        tracer = custom_tracer
+    mt = torch.fx.GraphModule(model, tracer.trace(model))
+    # this is necessary to ensure logger FQNs get populated
+    mt._node_name_to_scope = tracer.node_name_to_scope
+
+    # run example input propagation, we need this to call prepare_fx on
+    # individual subgraphs
+    output_prop = OutputProp(mt)
+    output_prop.propagate(*example_inputs)
+
+    # Find the set of subgraphs in the original graph which we need to
+    # consider.
+    modules = dict(mt.named_modules(remove_duplicate=False))
+    patterns = _get_pattern_to_quantize_handlers(backend_config)
+    root_node_getter_mapping = \
+        get_fusion_pattern_to_root_node_getter(backend_config)
+    standalone_module_names: List[str] = []
+    standalone_module_classes: List[Type] = []
+    custom_module_classes: List[Type] = []
+    matches = _find_matches(
+        mt.graph, modules, patterns, root_node_getter_mapping,
+        standalone_module_names, standalone_module_classes, custom_module_classes)
+    subgraphs_dedup: Dict[str, List[Node]] = \
+        _get_dedup_subgraphs(matches)
+
+    # generate node to qconfig for each subgraph
+    # TODO(future PR): deduplicate repeating entries
+    list_of_node_name_to_qconfig: List[Dict[str, QConfigAny]] = []
+    for qconfig_mapping in qconfig_multi_mapping.qconfig_mappings_list:
+        node_name_to_qconfig = _generate_node_name_to_qconfig(
+            mt, modules, mt.graph, qconfig_mapping, tracer.node_name_to_scope)
+        list_of_node_name_to_qconfig.append(node_name_to_qconfig)
+
+    # For each region in the model, do the following:
+    #   For each qconfig for that region, do the following:
+    #     1. create a copy of the region wrapped in a module
+    #     2. pass original args, original kwargs, and expected output to module
+    #     3. add an output comparison logger and hook it up to compare
+    #        actual output to expected output
+    #     4. run `prepare_fx` on the module
+    for (subgraph_idx, (match_name, nodes_in_this_subgraph)) in \
+            enumerate(subgraphs_dedup.items()):
+        create_n_transformed_and_logged_copies_of_subgraph(
+            mt, subgraph_idx, match_name, nodes_in_this_subgraph,
+            qconfig_multi_mapping.qconfig_mappings_list, list_of_node_name_to_qconfig,
+            custom_prepare_fn, custom_prepare_kwargs  # type: ignore[arg-type]
+        )
+
+    return mt
+
+# TODO(future PR): we should rethink the names of all the PNP APIs
+def _prepare_n_shadows_add_loggers_model(
+    model: torch.nn.Module,
+    example_inputs: Any,
+    qconfig_mapping: QConfigMapping,
+    backend_config: BackendConfig,
+) -> torch.nn.Module:
+    r"""
+    Note: this API is not recommended for wide usage; it is only
+    provided for customers who need to migrate from the `add_loggers`
+    API.
+
+    This creates a model which provides logging for the following
+    problem: if we quantize `model` with `qconfig_mapping` and feed
+    the same input through both models, we want to log the comparisons
+    of the corresponding intermediate layers.
+
+    The problem is solved with a single model.  Specifically, we
+    partition `model` into N subgraphs, create a copy of each relevant
+    subgraph, wrap it in a module, apply the quantization API to that
+    module, and hook up loggers to measure the comparisons.
+
+    Example starting graph:
+
+    .. code::
+
+      x0 -> op0 -> x1 -> op1 -> x2
+
+    Example config: quantize op0 to int8, do nothing to op1.
+    The following graph will be created:
+
+    .. code::
+
+      x0_0 -> op0_0 -> x1_0 -> log -----> op1_0 -> x2_0 -> log
+       \                        \                           \       # noqa: W605
+         ---> op0_1 -> x1_1 ----> clog -> op1_0 -> x2_1 ----> clog
+
+    Where op0_0 is op0, op0_1 is op0 wrapped in a submodule and quantized
+    to int8, op1_0 is op1 (appearing in the graph twice), log is a logger,
+    and clog is a comparison logger.
+    """
+
+    tracer = quantize_fx.QuantizationTracer([], [])
+    mt = torch.fx.GraphModule(model, tracer.trace(model))
+    # this is necessary to ensure logger FQNs get populated
+    mt._node_name_to_scope = tracer.node_name_to_scope
+
+    # run example input propagation, we need this to call prepare_fx on
+    # individual subgraphs
+    output_prop = OutputProp(mt)
+    output_prop.propagate(*example_inputs)
+
+    # Find the set of subgraphs in the original graph which we need to
+    # consider.
+    modules = dict(mt.named_modules(remove_duplicate=False))
+    patterns = _get_pattern_to_quantize_handlers(backend_config)
+    root_node_getter_mapping = \
+        get_fusion_pattern_to_root_node_getter(backend_config)
+    standalone_module_names: List[str] = []
+    standalone_module_classes: List[Type] = []
+    custom_module_classes: List[Type] = []
+    matches = _find_matches(
+        mt.graph, modules, patterns, root_node_getter_mapping,
+        standalone_module_names, standalone_module_classes, custom_module_classes)
+    subgraphs_dedup: Dict[str, List[Node]] = \
+        _get_dedup_subgraphs(matches)
+
+    # generate node to qconfig for each subgraph
+    node_name_to_qconfig = _generate_node_name_to_qconfig(
+        mt, modules, mt.graph, qconfig_mapping, tracer.node_name_to_scope)
+
+    # Now, mutate the graph to be the add_loggers graph with propagation
+    # error.
+    create_add_loggers_graph(
+        mt, subgraphs_dedup, qconfig_mapping, node_name_to_qconfig)
+
+    return mt
+
+# TODO(future PR): we should rethink the names of all the PNP APIs
+def _n_shadows_compare_weights(
+    model: torch.nn.Module,
+    example_inputs: Any,
+    qconfig_mapping: QConfigMapping,
+    backend_config: BackendConfig,
+) -> NSResultsType:
+    """
+    Note: this API is not recommended for wide usage; it is only
+    provided for customers who need to migrate from the `add_loggers`
+    API.
+    """
+    qconfig_multi_mapping = \
+        QConfigMultiMapping.from_list_qconfig_mapping([qconfig_mapping])
+    mp = prepare_n_shadows_model(
+        model, example_inputs, qconfig_multi_mapping, backend_config)
+    # passing inputs through the model is necessary to populate
+    # observers which observe weights with real values
+    mp(*example_inputs)
+    mq = convert_n_shadows_model(mp)
+    weight_comparison = extract_weight_comparison(mq)
+    return weight_comparison
+
+# TODO(future PR): consider aligning API signature with other similar quantization
+# functions (enable_fake_quant, etc)
+def loggers_set_enabled(model: torch.nn.Module, enabled: bool) -> None:
+    """
+    Sets the `enabled` setting on a `model`'s loggers
+    """
+    for name, child in model.named_modules():
+        if isinstance(child, OutputLogger):
+            child.enabled = enabled
+
+# TODO(future PR): consider aligning API signature with other similar quantization
+# functions (enable_fake_quant, etc)
+def loggers_set_save_activations(
+    model: torch.nn.Module,
+    save_activations: bool,
+) -> None:
+    """
+    Sets the `save_activations` setting on a `model`'s loggers
+    """
+    for name, child in model.named_modules():
+        if isinstance(child, OutputLogger):
+            child.save_activations = save_activations
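+
+# Typical calibration pattern using the two toggles above (illustrative sketch;
+# `mp` is a hypothetical model instrumented with OutputLogger modules and
+# `calibration_data` a hypothetical iterable of example inputs):
+#
+#   loggers_set_save_activations(mp, False)  # reduce memory during calibration
+#   for datum in calibration_data:
+#       mp(datum)
+#   loggers_set_enabled(mp, False)  # stop logging once calibration is done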
+
+def convert_n_shadows_model(
+    model: GraphModule,
+    custom_convert_fn: Optional[Callable] = None,
+    custom_convert_kwargs: Optional[Dict[str, Any]] = None
+) -> GraphModule:
+    """
+    Given a model from `prepare_n_shadows_model`, runs `convert_fx`
+    on each shadow submodule.
+    """
+    for node in model.graph.nodes:
+        # TODO(future PR): consider matching in a safer way than
+        # node name string match
+        if node.name.startswith(SHADOW_WRAPPER_NODE_NAME_PREFIX):
+            orig_mod = getattr(model, node.name)
+            if custom_convert_fn is None:
+                converted_mod = torch.ao.quantization.quantize_fx.convert_fx(
+                    orig_mod)
+            else:
+                if custom_convert_kwargs is None:
+                    custom_convert_kwargs = {}
+                converted_mod = custom_convert_fn(orig_mod, **custom_convert_kwargs)
+            setattr(model, node.name, converted_mod)
+
+    return model
+
+def extract_results_n_shadows_model(model: torch.nn.Module) -> NSResultsType:
+    """
+    Extracts logger results from `model`.
+    """
+    results: NSResultsType = {}
+    _extract_logger_info_one_model(model, results, OutputLogger)
+    return results
+
+def print_comparisons_n_shadows_model(results: NSResultsType) -> None:
+    """
+    Prints a summary of extracted `results`.
+    """
+    results_grouped = group_results_by_subgraph(results)
+    results_comparison = create_results_comparison(results_grouped)
+    print_n_shadows_summary(results_comparison)
diff --git a/MLPY/Lib/site-packages/torch/ao/ns/fx/__init__.py b/MLPY/Lib/site-packages/torch/ao/ns/fx/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/ao/ns/fx/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/ns/fx/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2b3f3add42f99ba5508751ca43f6b0f3a91f58be
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/ns/fx/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/ns/fx/__pycache__/graph_matcher.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/ns/fx/__pycache__/graph_matcher.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c3fd6e092e73b9013bde6a53429a8766943f2999
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/ns/fx/__pycache__/graph_matcher.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/ns/fx/__pycache__/graph_passes.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/ns/fx/__pycache__/graph_passes.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..341d4a94cd925dd65042b94283735a5eda45365e
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/ns/fx/__pycache__/graph_passes.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/ns/fx/__pycache__/mappings.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/ns/fx/__pycache__/mappings.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..50bced046c6aa05bb99e87c0cd43dbb48922d097
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/ns/fx/__pycache__/mappings.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/ns/fx/__pycache__/n_shadows_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/ns/fx/__pycache__/n_shadows_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b81696a6b94b2e21319adb56db6152937829e437
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/ns/fx/__pycache__/n_shadows_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/ns/fx/__pycache__/ns_types.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/ns/fx/__pycache__/ns_types.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ae626398dde9004facd2d3cb027d56306f94190f
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/ns/fx/__pycache__/ns_types.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/ns/fx/__pycache__/pattern_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/ns/fx/__pycache__/pattern_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b7ad29d69044dfe584061d8c1501a5305d7942fa
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/ns/fx/__pycache__/pattern_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/ns/fx/__pycache__/qconfig_multi_mapping.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/ns/fx/__pycache__/qconfig_multi_mapping.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..04620e9fd53b062d183406a917d64e9e3adbbf5e
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/ns/fx/__pycache__/qconfig_multi_mapping.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/ns/fx/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/ns/fx/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3608f7f6c6a6c4a40c5e9c42c05a1253ba917b04
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/ns/fx/__pycache__/utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/ns/fx/__pycache__/weight_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/ns/fx/__pycache__/weight_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..207d02d704dc62ee42b2bc7ba31001c11a0a4dc3
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/ns/fx/__pycache__/weight_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/ns/fx/graph_matcher.py b/MLPY/Lib/site-packages/torch/ao/ns/fx/graph_matcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..b28b1607880575fbca4248c22d5e57642d7c16c2
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/ns/fx/graph_matcher.py
@@ -0,0 +1,460 @@
+import collections
+import enum
+
+import torch
+toq = torch.ops.quantized
+
+from torch.fx import GraphModule
+from torch.fx.graph import Graph, Node
+
+from torch.ao.quantization.utils import getattr_from_fqn
+from .ns_types import NSSubgraph, NSNodeTargetType
+from .mappings import (
+    get_base_name_to_sets_of_related_ops,
+    get_unmatchable_types_map,
+)
+from .pattern_utils import (
+    get_type_a_related_to_b,
+    get_reversed_fusions,
+    end_node_matches_reversed_fusion,
+)
+from torch.ao.quantization import (
+    ObserverBase,
+    FakeQuantizeBase,
+)
+
+from typing import Dict, Tuple, List, Optional, Set, Any
+
+def _get_output_nodes(g: Graph) -> List[Node]:
+    return [n for n in g.nodes if n.op == 'output']
+
+class _NSGraphMatchableSubgraphsIterator:
+    """
+    Iterates through the graph of gm, starting with the output nodes
+    and continuing backwards.
+    1. Returns matchable subgraphs, in order. A subgraph is defined by
+       (start_node, end_node).
+    2. Skips over non-matchable subgraphs
+    """
+    def __init__(
+        self,
+        gm: GraphModule,
+        non_matchable_functions: Set[NSNodeTargetType],
+        non_matchable_modules: Set[NSNodeTargetType],
+        non_matchable_methods: Set[NSNodeTargetType],
+    ):
+        self.gm: GraphModule = gm
+        self.non_matchable_functions: Set[NSNodeTargetType] = non_matchable_functions
+        self.non_matchable_modules: Set[NSNodeTargetType] = non_matchable_modules
+        self.non_matchable_methods: Set[NSNodeTargetType] = non_matchable_methods
+        self.seen_nodes: Set[Node] = set()
+        self.stack: List[Node] = []
+        for start_node in _get_output_nodes(self.gm.graph):
+            self.stack.append(start_node)
+
+    def __iter__(self):
+        return self
+
+    def __next__(self) -> NSSubgraph:
+        """
+        Returns the next matchable subgraph.
+        """
+        while len(self.stack) > 0:
+            cur_end_node = self.stack.pop()
+            if cur_end_node in self.seen_nodes:
+                continue
+
+            # for subgraphs which are single nodes, start_node == end_node
+            # for subgraphs with more than one node, start_node != end_node
+            cur_start_node = cur_end_node
+            # Subgraphs like linear-relu have the base node as the start node.
+            # Subgraphs like dequantize-linear-relu-to(torch.float16) have the
+            #   base node as the second node.
+            # The cur_base_op_node var will move to the actual node during
+            #   the fusion matching later in this code block.
+            cur_base_op_node = cur_end_node
+
+            # Check for potential fusions. For now, we are greedy
+            # and always skip all non-base nodes of a fusion.  For example,
+            # if we match linear-relu backwards, we will always skip the
+            # relu node and attempt to match the linear node.  This can
+            # be made configurable later if needed.
+            for _reverse_fusion_ops, base_op_idx in get_reversed_fusions():
+                is_match = end_node_matches_reversed_fusion(
+                    cur_end_node, _reverse_fusion_ops, self.gm, self.seen_nodes)
+                if is_match:
+                    # navigate to the base node
+                    for rev_fusion_idx in range(len(_reverse_fusion_ops) - 1):
+                        self.seen_nodes.add(cur_start_node)
+                        # for now, assume that there are no other nodes
+                        # which need to be added to the stack
+                        cur_start_node = cur_start_node.args[0]  # type: ignore[assignment]
+                        # if the base op index matches the current node, set it
+                        rev_base_op_idx = \
+                            len(_reverse_fusion_ops) - 2 - base_op_idx
+                        if rev_fusion_idx == rev_base_op_idx:
+                            cur_base_op_node = cur_start_node
+                    break
+
+            self.seen_nodes.add(cur_start_node)
+            # add args of previous nodes to stack
+            for arg in cur_start_node.all_input_nodes:
+                self._recursively_add_node_arg_to_stack(arg)
+
+            # skip unmatchable nodes
+            # note: this check is done on the start_node, i.e.
+            # if we are matching linear-relu in reverse, this would do the matchable
+            # check on the linear
+            if not self._is_matchable(cur_base_op_node):
+                continue
+
+            # If an observer or a fake_quant was not matched as a part of
+            # a pattern of multiple nodes, ignore it. One case where this is
+            # relevant is an observer on a graph input, which was added because
+            # it is necessary for the next node.
+            if cur_end_node.op == 'call_module' and cur_start_node is cur_end_node:
+                maybe_obs = getattr_from_fqn(self.gm, cur_end_node.target)  # type: ignore[arg-type]
+                if isinstance(maybe_obs, (ObserverBase, FakeQuantizeBase)):
+                    continue
+
+            return NSSubgraph(
+                start_node=cur_start_node, end_node=cur_end_node,
+                base_op_node=cur_base_op_node)
+
+        raise StopIteration
+
+    def _recursively_add_node_arg_to_stack(self, arg: Any) -> None:
+        """
+        Adds all of the nodes in this arg to the stack, properly navigating
+        through list, dicts and tuples.
+        """
+        if isinstance(arg, Node):
+            self.stack.append(arg)
+        elif isinstance(arg, torch.fx.immutable_collections.immutable_list) or type(arg) is tuple:
+            for inner_arg in arg:
+                self._recursively_add_node_arg_to_stack(inner_arg)
+        elif isinstance(arg, torch.fx.immutable_collections.immutable_dict):
+            for value in arg.values():
+                self._recursively_add_node_arg_to_stack(value)
+
+    def _is_matchable(self, node: Node) -> bool:
+        if node.op == 'call_function':
+            return node.target not in self.non_matchable_functions
+        elif node.op == 'call_module':
+            assert isinstance(node.target, str)
+            target_mod = getattr_from_fqn(self.gm, node.target)
+            return not \
+                any(isinstance(target_mod, t)  # type: ignore[arg-type]
+                    for t in self.non_matchable_modules)
+        elif node.op == 'call_method':
+            return node.target not in self.non_matchable_methods
+        else:
+            return False
+
+class GraphMatchingException(Exception):
+    """
+    Exception raised when two graphs cannot be matched.
+    """
+    pass
+
+class SubgraphTypeRelationship(enum.Enum):
+    # same type, known
+    # example: F.linear and F.linear, or nn.Conv2d and nn.Conv2d
+    EQUAL = enum.auto()
+    # same type, but the type is not known to Numeric Suite
+    # (user defined type, etc).
+    EQUAL_BUT_UKNOWN = enum.auto()
+    # known, same subgraph_relationship set, but not the same type
+    # example: F.linear and toq.linear
+    RELATED_BUT_NOT_EQUAL = enum.auto()
+    # not related
+    NOT_RELATED = enum.auto()
+
+def _get_subgraph_relationship_type(
+    subgraph_a: NSSubgraph,
+    subgraph_b: NSSubgraph,
+    gm_a: GraphModule,
+    gm_b: GraphModule,
+    type_a_related_to_b: Set[Tuple[NSNodeTargetType, NSNodeTargetType]],
+) -> SubgraphTypeRelationship:
+    node_a = subgraph_a.base_op_node
+    node_b = subgraph_b.base_op_node
+
+    # TODO(next): make this code handle matching by what is before the base op
+    if node_a.op != node_b.op:
+        if not (
+            node_a.op in ('call_function', 'call_method') and
+            node_b.op in ('call_function', 'call_method')
+        ):
+            return SubgraphTypeRelationship.NOT_RELATED
+
+    if node_a.op in ('call_function', 'call_method'):
+        key = (node_a.target, node_b.target)
+
+        if key not in type_a_related_to_b:
+            if node_a.target == node_b.target:
+                return SubgraphTypeRelationship.EQUAL_BUT_UKNOWN
+            else:
+                return SubgraphTypeRelationship.NOT_RELATED
+        # after this point, we are dealing with known types
+
+        if node_a.target == node_b.target:
+            node_a_has_prev = subgraph_a.base_op_node == subgraph_a.start_node
+            node_b_has_prev = subgraph_b.base_op_node == subgraph_b.start_node
+            if node_a_has_prev and (not node_b_has_prev):
+                return SubgraphTypeRelationship.RELATED_BUT_NOT_EQUAL
+            elif (not node_a_has_prev) and node_b_has_prev:
+                return SubgraphTypeRelationship.RELATED_BUT_NOT_EQUAL
+            elif (not node_a_has_prev) and (not node_b_has_prev):
+                return SubgraphTypeRelationship.EQUAL
+            else:
+                # TODO(future PR): check for matches start_op_node and base_op_node
+                return SubgraphTypeRelationship.EQUAL
+
+        # at this point the key is known to be related and the targets differ
+        return SubgraphTypeRelationship.RELATED_BUT_NOT_EQUAL
+    elif node_a.op == 'call_module':
+        assert (subgraph_a.base_op_node == subgraph_a.start_node and
+                subgraph_b.base_op_node == subgraph_b.start_node), \
+            "Matching call_module patterns where base_op_node != start_node is not supported yet"
+        # for call_module, we need to look up the modules to do the type check
+        assert isinstance(node_a.target, str)
+        mod_a = getattr_from_fqn(gm_a, node_a.target)
+        assert isinstance(node_b.target, str)
+        mod_b = getattr_from_fqn(gm_b, node_b.target)
+
+        key = (type(mod_a), type(mod_b))
+
+        if key not in type_a_related_to_b:
+            if type(mod_a) == type(mod_b):
+                return SubgraphTypeRelationship.EQUAL_BUT_UKNOWN
+            else:
+                return SubgraphTypeRelationship.NOT_RELATED
+        elif type(mod_a) == type(mod_b):
+            return SubgraphTypeRelationship.EQUAL
+        else:
+            return SubgraphTypeRelationship.RELATED_BUT_NOT_EQUAL
+
+    return SubgraphTypeRelationship.NOT_RELATED
+
+def _get_name_for_subgraph(
+    subgraph_a: NSSubgraph,
+    gm_a: GraphModule,
+    base_name_to_sets_of_related_ops: Dict[str, Set[NSNodeTargetType]],
+    existing_names: Set[str],
+) -> str:
+    """
+    Returns a unique name for a subgraph. This name is based on two things:
+    1. the name of the set containing the underlying type of the base op in the
+       subgraph (i.e. 'torch.nn.functional.linear' if this is related to a linear op)
+    2. the number of previous subgraphs with related underlying type of the base op
+
+    For example, in the graph
+
+    linear0 -> relu0 -> linear1 -> relu1
+
+    The subgraphs are (linear0, relu0) and (linear1, relu1).  If we iterate
+    from the output node backwards, the name given to (linear1, relu1) will be
+    `base_op_torch.nn.functional.linear_0`, and the name given to (linear0, relu0)
+    will be `base_op_torch.nn.functional.linear_1`.
+
+    Why are we not just using the node name? Answer: because of two requirements:
+    A. fusions must be supported
+    B. some Numeric Suite APIs can be called without having all of the models in memory
+
+    For example, let's say we need to match nodes of
+
+    (1) ... -> linear0 -> relu0 -> ...
+
+    And
+
+    (2) ... -> linear_relu0 -> ...
+
+    Without being able to inspect them together. With the current naming scheme, if
+    we iterate through both of these graphs in the same order, and assuming the rest
+    of the graphs match, both of these subgraphs will get the same name without
+    (1) and (2) knowing anything about each other.
+    """
+    target_type = _get_node_target_type(subgraph_a.base_op_node, gm_a)
+    target_base_type = None
+    for base_name, sets_of_related_ops in base_name_to_sets_of_related_ops.items():
+        if target_type in sets_of_related_ops:
+            target_base_type = base_name
+    target_base_name = 'base_op_' + str(target_base_type)
+    counter = 0
+    proposed_name = target_base_name + '_' + str(counter)
+    while proposed_name in existing_names:
+        counter += 1
+        proposed_name = target_base_name + '_' + str(counter)
+    existing_names.add(proposed_name)
+    return proposed_name
+
+def _get_node_target_type(node: Node, gm: GraphModule) -> Optional[NSNodeTargetType]:
+    if node.op in ('call_function', 'call_method'):
+        return node.target
+    elif node.op == 'call_module':
+        assert isinstance(node.target, str)
+        mod = getattr_from_fqn(gm, node.target)
+        return type(mod)
+    return None
+
+def get_matching_subgraph_pairs(
+    gm_a: GraphModule,
+    gm_b: GraphModule,
+    base_name_to_sets_of_related_ops: Optional[Dict[str, Set[NSNodeTargetType]]] = None,
+    unmatchable_types_map: Optional[Dict[str, Set[NSNodeTargetType]]] = None,
+) -> Dict[str, Tuple[NSSubgraph, NSSubgraph]]:
+    """
+    Matches matchable subgraphs of graph_a to graph_b.
+
+    For a node, "matchable" is defined as a node which is not an observer,
+    fake_quant, quant or dequant.
+
+    A subgraph can contain one or more nodes.  A subgraph is matchable if
+    at least one node inside of it is matchable.  Currently, all nodes in
+    a subgraph must be matchable (because we assume no observers will be
+    inserted in the middle of a fusion).
+
+    A subgraph is defined by (start_node, end_node).  We assume that only
+    start_node and end_node are linked with the surrounding graph, all other
+    nodes in a subgraph are self-contained.
+
+    A pair of nodes is "related" if both nodes represent the same mathematical
+    operation across different quantization flavors. For example,
+    `F.linear` and `torch.ops.quantized.linear` are related, and
+    `F.linear` and `torch.nn.Conv2d` are not related.
+
+    For each matchable pair of nodes node_a and node_b, they will match
+    if node_a and node_b are related.
+
+    For graphs A and B, they will match iff:
+    1. the number of matchable subgraphs in A and B is equivalent
+    2. when iterating through the matchable subgraphs of A and B in the same order, each
+       corresponding pair of base nodes is related.
+
+    This enables us to find the corresponding subgraphs between
+    graphs of related models.  For example, if we had two graphs such as:
+
+    graph_a: x0 -> conv_0 (type: nn.Conv2d) -> obs_0 -> x1
+             w  -/
+             b  -/
+
+    graph_b: x0 -> quant_0 -> qconv_0 (type: nnq.Conv2d) -> dequant_0 -> x1
+           packed_params_0 -/
+
+    This function will return the following result:
+    {
+        'conv_0': (  # the name of the node in graph_b
+          (conv_0, conv_0),  # (start_node_a, end_node_a)
+          (qconv_0, qconv_0),  # (start_node_b, end_node_b)
+        ),
+    }
+
+    Or, if we have a fusion pattern,
+
+    graph_a: x0 -> linear_0 -> relu_0 -> obs_0 -> x1
+             w  -/
+             b  -/
+
+    graph_b: x0 -> quant_0 -> linear_relu_0 -> dequant_0 -> x1
+           packed_params_0 -/
+
+    This function will return the following result:
+    {
+        'linear_relu_0': (  # the name of the node in graph_b
+          (linear_0, relu_0),  # (start_node_a, end_node_a)
+          (linear_relu_0, linear_relu_0),  # (start_node_b, end_node_b)
+        ),
+    }
+    """
+    if unmatchable_types_map is None:
+        unmatchable_types_map = get_unmatchable_types_map()
+    non_matchable_functions = unmatchable_types_map['funs_unmatchable']
+    non_matchable_modules = unmatchable_types_map['mods_unmatchable']
+    non_matchable_methods = unmatchable_types_map['meths_unmatchable']
+
+    graph_a_iterator = _NSGraphMatchableSubgraphsIterator(
+        gm_a, non_matchable_functions, non_matchable_modules,
+        non_matchable_methods)
+    graph_b_iterator = _NSGraphMatchableSubgraphsIterator(
+        gm_b, non_matchable_functions, non_matchable_modules,
+        non_matchable_methods)
+    results = collections.OrderedDict()
+    if base_name_to_sets_of_related_ops is None:
+        base_name_to_sets_of_related_ops = get_base_name_to_sets_of_related_ops()
+    type_a_related_to_b = \
+        get_type_a_related_to_b(base_name_to_sets_of_related_ops)
+
+    existing_names_a: Set[str] = set()
+    existing_names_b: Set[str] = set()
+
+    while True:
+        # fetch the next subgraphs from a and b
+        cur_subgraph_a, cur_subgraph_b = None, None
+        try:
+            cur_subgraph_a = next(graph_a_iterator)
+        except StopIteration:
+            pass
+        try:
+            cur_subgraph_b = next(graph_b_iterator)
+        except StopIteration:
+            pass
+
+        # look up types of a and b for useful error messages
+        type_start_a, type_start_b = None, None
+        if cur_subgraph_a is not None:
+            type_start_a = _get_node_target_type(cur_subgraph_a.start_node, gm_a)
+        if cur_subgraph_b is not None:
+            type_start_b = _get_node_target_type(cur_subgraph_b.start_node, gm_b)
+
+        # check for results and determine what to do next
+        if cur_subgraph_a is not None and cur_subgraph_b is not None:
+            # both nodes were fetched, check for subgraph_relationship
+            # note: subgraph_relationship is checked on the start node, i.e.
+            # if a linear-relu pattern is checked, we would check for subgraph_relationship
+            # of the linear
+            subgraph_relationship = _get_subgraph_relationship_type(
+                cur_subgraph_a, cur_subgraph_b,
+                gm_a, gm_b, type_a_related_to_b)
+            if subgraph_relationship == SubgraphTypeRelationship.NOT_RELATED:
+                msg = f"""
+The subgraphs
+({cur_subgraph_a}, {type_start_a}) and
+({cur_subgraph_b}, {type_start_b})
+are not related. Please ensure that the two models you pass in have the same number
+of subgraphs, and each pair of subgraphs is related to each other."""
+                raise GraphMatchingException(msg)
+            elif subgraph_relationship == SubgraphTypeRelationship.EQUAL_BUT_UKNOWN:
+                # skip matching but unknown types
+                continue
+            key_name_a = _get_name_for_subgraph(
+                cur_subgraph_a, gm_a, base_name_to_sets_of_related_ops,
+                existing_names_a)
+            key_name_b = _get_name_for_subgraph(
+                cur_subgraph_b, gm_b, base_name_to_sets_of_related_ops,
+                existing_names_b)
+            assert key_name_a == key_name_b, \
+                f"Subgraph names {key_name_a} and {key_name_b} do not match"
+            results[key_name_a] = (cur_subgraph_a, cur_subgraph_b)
+            continue
+        elif cur_subgraph_a is None and cur_subgraph_b is None:
+            # we reached the end of both graphs
+            break
+        else:
+            # only one node was fetched, no match possible, throw error
+            msg = f"""
+Attempting to match
+({cur_subgraph_a}, {type_start_a}) and
+({cur_subgraph_b}, {type_start_b}),
+one of which is empty. Please ensure that the two models you pass in have the same number
+of subgraphs."""
+            raise GraphMatchingException(msg)
+
+    # The subgraph pairs are originally created by traversing the two graphs
+    # from the outputs to the inputs. Reverse the results to return the
+    # subgraphs in their order of execution.
+    results = collections.OrderedDict(reversed(list(results.items())))
+
+    return results
diff --git a/MLPY/Lib/site-packages/torch/ao/ns/fx/graph_passes.py b/MLPY/Lib/site-packages/torch/ao/ns/fx/graph_passes.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5f610544d1ea6ca56045cf1e0d63a0bba89aa61
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/ns/fx/graph_passes.py
@@ -0,0 +1,950 @@
+import torch
+from torch.fx import GraphModule, map_arg
+from torch.fx.graph import Graph, Node
+from torch.ao.quantization.fx.utils import get_new_attr_name_with_prefix
+
+from .utils import (
+    get_node_first_input_and_output_type,
+    getattr_from_fqn,
+    NodeInputOrOutputType,
+    return_first_non_observer_node,
+    get_number_of_non_param_args,
+    get_target_type_str,
+    get_arg_indices_of_inputs_to_log,
+    get_node_input_qparams,
+    op_type_supports_shadowing,
+    get_normalized_nth_input,
+)
+
+from .ns_types import (
+    NSSingleResultValuesType,
+    NSSubgraph,
+    NSNodeTargetType,
+)
+from torch.ao.ns.fx.mappings import (
+    get_node_type_to_io_type_map,
+)
+from torch.ao.quantization.observer import _is_activation_post_process
+
+from typing import Dict, Tuple, Callable, List, Any, Union, Optional, Set
+
+def _maybe_get_fqn(node: Node, gm: GraphModule) -> Optional[str]:
+    fqn = None
+    if hasattr(gm, '_node_name_to_scope'):
+        # fqn on observers is not present, because they do not
+        # exist when the fqns are created during tracing. If this is
+        # an observer, get the fqn of the node being observed.
+        node_to_use_for_fqn = node
+        if node.op == 'call_module':
+            assert isinstance(node.target, str)
+            module = getattr_from_fqn(gm, node.target)
+            if _is_activation_post_process(module):
+                node_to_use_for_fqn = get_normalized_nth_input(node, gm, 0)
+        fqn = gm._node_name_to_scope[node_to_use_for_fqn.name][0]  # type: ignore[index]
+    return fqn  # type: ignore[return-value]
+
+def _insert_logger_after_node(
+    node: Node,
+    gm: GraphModule,
+    logger_cls: Callable,
+    logger_node_name_suffix: str,
+    ref_node_name: str,
+    model_name: str,
+    ref_name: str,
+    ref_node_target_type: str,
+    results_type: str,
+    index_within_arg: int,
+    index_of_arg: int,
+    fqn: Optional[str],
+) -> Node:
+    """
+    Given a starting graph of
+
+    prev_node -> node -> next_node
+
+    This function creates a new logger_cls obj and adds it
+    after node, resulting in
+
+    prev_node -> node -> logger_obj -> next_node
+    """
+    # create new name
+    logger_node_name = \
+        get_new_attr_name_with_prefix(node.name + logger_node_name_suffix)(gm)
+    target_type = get_target_type_str(node, gm)
+    # create the logger object
+    logger_obj = logger_cls(
+        ref_node_name, node.name, model_name, ref_name, target_type,
+        ref_node_target_type,
+        results_type, index_within_arg, index_of_arg, fqn)
+    # attach the logger object to the parent module
+    setattr(gm, logger_node_name, logger_obj)
+    logger_node = node.graph.create_node(
+        'call_module', logger_node_name, (node,), {})
+    return logger_node
+
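+# A hedged, minimal sketch of a logger class compatible with the contract
+# assumed by `_insert_logger_after_node`: the constructor takes the ten
+# metadata arguments above and `forward` passes activations through
+# unchanged. `_ExampleLogger` is illustrative only and is not part of this
+# module's API; in the public numeric suite flow a class such as
+# `torch.ao.ns._numeric_suite_fx.OutputLogger` is typically passed in.
+class _ExampleLogger(torch.nn.Module):
+    def __init__(self, ref_node_name, prev_node_name, model_name, ref_name,
+                 prev_node_target_type, ref_node_target_type, results_type,
+                 index_within_arg, index_of_arg, fqn):
+        super().__init__()
+        # metadata describing which node this logger observes
+        self.ref_node_name = ref_node_name
+        self.prev_node_name = prev_node_name
+        self.model_name = model_name
+        self.results_type = results_type
+        # collected activations
+        self.stats = []
+
+    def forward(self, x):
+        # record a detached copy and return the input unchanged, so inserting
+        # the logger does not change the semantics of the traced graph
+        if isinstance(x, torch.Tensor):
+            self.stats.append(x.detach())
+        return x
+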
+def add_loggers_to_model(
+    gm: GraphModule,
+    node_to_instrument_inputs_to_ref_node_name: Dict[Node, Tuple[str, str]],
+    node_to_instrument_outputs_to_ref_node_name: Dict[Node, Tuple[str, str]],
+    logger_cls: Callable,
+    model_name: str,
+) -> GraphModule:
+    """
+    Takes the graph of gm and adds loggers to the inputs and/or outputs of
+    each node in the two instrumentation maps. Returns a GraphModule with
+    the new graph.
+    """
+
+    new_graph = Graph()
+    env: Dict[str, Any] = {}
+    modules = dict(gm.named_modules())
+
+    def load_arg(a):
+        return map_arg(a, lambda node: env[node.name])
+
+    for node in gm.graph.nodes:
+        if node.op == 'output':
+            new_graph.output(map_arg(get_normalized_nth_input(node, gm, 0), load_arg))
+            continue
+
+        if (
+            (node in node_to_instrument_inputs_to_ref_node_name) or
+            (node in node_to_instrument_outputs_to_ref_node_name)
+        ):
+            fqn = _maybe_get_fqn(node, gm)
+
+            if node in node_to_instrument_inputs_to_ref_node_name:
+                ref_name, ref_node_type = node_to_instrument_inputs_to_ref_node_name[node]
+                # Ops such as add and mul are special because either
+                # one or two of the first two arguments can be tensors,
+                # and if one argument is a tensor it can be first or
+                # second (x + 1 versus 1 + x).
+                arg_indices_to_log = get_arg_indices_of_inputs_to_log(node)
+                for node_arg_idx in arg_indices_to_log:
+                    node_arg = get_normalized_nth_input(node, gm, node_arg_idx)
+                    if type(node_arg) == Node:
+                        # create a single input logger
+                        prev_node = env[node_arg.name]
+                        env[node_arg.name] = _insert_logger_after_node(
+                            prev_node, gm, logger_cls, '_ns_logger_', node.name,
+                            model_name, ref_name, ref_node_type,
+                            NSSingleResultValuesType.NODE_INPUT.value,
+                            index_within_arg=0, index_of_arg=node_arg_idx,
+                            fqn=fqn)
+                    elif type(node_arg) == torch.fx.immutable_collections.immutable_list:
+                        # create N input loggers, one for each node
+                        for arg_idx, arg in enumerate(node_arg):  # type: ignore[var-annotated, arg-type]
+                            prev_node = env[arg.name]
+                            env[prev_node.name] = _insert_logger_after_node(
+                                prev_node, gm, logger_cls, '_ns_logger_', node.name,
+                                model_name, ref_name, ref_node_type,
+                                NSSingleResultValuesType.NODE_INPUT.value,
+                                index_within_arg=arg_idx, index_of_arg=node_arg_idx,
+                                fqn=fqn)
+                    else:
+                        pass
+
+            # ensure env is populated with base node
+            # Note: runs for both inputs and outputs
+            env[node.name] = new_graph.node_copy(node, load_arg)
+
+            if node in node_to_instrument_outputs_to_ref_node_name:
+                ref_name, ref_node_type = node_to_instrument_outputs_to_ref_node_name[node]
+                # add the logger after the base node
+                env[node.name] = _insert_logger_after_node(
+                    env[node.name], gm, logger_cls, '_ns_logger_', node.name,
+                    model_name, ref_name, ref_node_type,
+                    NSSingleResultValuesType.NODE_OUTPUT.value,
+                    index_within_arg=0, index_of_arg=0, fqn=fqn)
+
+        else:
+            env[node.name] = new_graph.node_copy(node, load_arg)
+
+    new_gm = GraphModule(gm, new_graph)
+    return new_gm
+
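+# A hedged usage sketch for `add_loggers_to_model`, relying on the
+# illustrative `_ExampleLogger` above; the model `_ExampleM`, the reference
+# name 'fc_ref', and the model name 'example_model' are all hypothetical.
+def _example_add_loggers_to_model():
+    class _ExampleM(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.fc = torch.nn.Linear(2, 2)
+
+        def forward(self, x):
+            return torch.relu(self.fc(x))
+
+    gm = torch.fx.symbolic_trace(_ExampleM())
+    # instrument the output of the linear node under the name 'fc_ref'
+    fc_node = next(
+        n for n in gm.graph.nodes
+        if n.op == 'call_module' and n.target == 'fc')
+    instrumented = add_loggers_to_model(
+        gm,
+        node_to_instrument_inputs_to_ref_node_name={},
+        node_to_instrument_outputs_to_ref_node_name={
+            fc_node: ('fc_ref', 'call_module')},
+        logger_cls=_ExampleLogger,
+        model_name='example_model')
+    # running the instrumented model populates the logger's stats
+    instrumented(torch.randn(1, 2))
+    return instrumented
+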
+def _insert_quantize_per_tensor_node(
+    prev_node_c: Node,
+    node_a: Node,
+    gm_b: GraphModule,
+    graph_c: Graph,
+    scale: Union[torch.Tensor, float],
+    zero_point: Union[torch.Tensor, int],
+    dtype_cast_name: str,
+) -> Node:
+    # copy scale
+    scale_node_name = \
+        get_new_attr_name_with_prefix(
+            node_a.name + '_input_scale_')(gm_b)
+    setattr(gm_b, scale_node_name, scale)
+    scale_node = graph_c.create_node(
+        'get_attr', scale_node_name, (), {}, scale_node_name)
+    # copy zero_point
+    zero_point_node_name = \
+        get_new_attr_name_with_prefix(
+            node_a.name + '_input_zero_point_')(gm_b)
+    setattr(gm_b, zero_point_node_name, zero_point)
+    zero_point_node = graph_c.create_node(
+        'get_attr', zero_point_node_name, (), {}, zero_point_node_name)
+    # create the quantize_per_tensor call
+    return graph_c.create_node(
+        'call_function', torch.quantize_per_tensor,
+        (prev_node_c, scale_node, zero_point_node, torch.quint8), {},
+        dtype_cast_name)
+
+def _insert_dtype_cast_after_node(
+    node_a: Node,
+    node_c: Node,
+    prev_node_c: Union[Node, List[Node]],
+    gm_a: GraphModule,
+    gm_b: GraphModule,
+    graph_c: Graph,
+    node_name_prefix: str,
+    logger_cls: Callable,
+    node_type_to_io_type_map: Dict[str, Set[NSNodeTargetType]],
+) -> Union[Node, List[Node]]:
+    """
+    Given a starting graph C (derived from graph B) of
+
+    ... -> prev_node_c -> node_c -> ...
+
+    And a corresponding related node_a, inserts the correct dtype
+    cast node after prev_node_c to cast into the dtype expected
+    by node_a, resulting in:
+
+                          dtype_cast
+                        /
+    ... -> prev_node_c -> node_c -> ...
+
+    For example, if node_c is an int8 op and node_a is an fp32 op, this function
+    will insert a dequant.
+    """
+    dtype_cast_op = None
+    dtype_cast_mod_cls = None
+    dtype_cast_method = None
+    dtype_cast_method_dtype = None
+    dtype_cast_scale = None
+    dtype_cast_zero_point = None
+    node_input_type_a, _node_output_type_a = \
+        get_node_first_input_and_output_type(
+            node_a, gm_a, logger_cls, node_type_to_io_type_map)
+    node_input_type_c, _node_output_type_c = \
+        get_node_first_input_and_output_type(
+            node_c, gm_b, logger_cls, node_type_to_io_type_map)
+
+    if (
+        (node_input_type_a == NodeInputOrOutputType.FP32 and
+         node_input_type_c == NodeInputOrOutputType.INT8) or
+        (node_input_type_a == NodeInputOrOutputType.FP32 and
+         node_input_type_c == NodeInputOrOutputType.FP16) or
+        # TODO(future PR): determine the actual dtype of node_c,
+        # the current code only works because dequantize works with
+        # multiple input dtypes.
+        (node_input_type_a == NodeInputOrOutputType.FP32 and
+         node_input_type_c == NodeInputOrOutputType.FP32_OR_INT8)
+    ):
+        dtype_cast_op = torch.dequantize
+    elif (
+        node_input_type_a == node_input_type_c and
+        node_input_type_a != NodeInputOrOutputType.UNKNOWN
+    ):
+        dtype_cast_mod_cls = torch.nn.Identity
+    elif (
+        node_input_type_a == NodeInputOrOutputType.INT8 and
+        node_input_type_c == NodeInputOrOutputType.FP32
+    ):
+        # int8 shadows fp32, the dtype cast needs to quantize to int8
+        # with the right qparams.
+        node_a_input_qparams = get_node_input_qparams(
+            node_a, gm_a, node_type_to_io_type_map)
+        if node_a_input_qparams is not None:
+            dtype_cast_op = torch.quantize_per_tensor  # type: ignore[assignment]
+            dtype_cast_scale, dtype_cast_zero_point = node_a_input_qparams
+    elif (
+        node_input_type_a == NodeInputOrOutputType.FP16 and
+        node_input_type_c == NodeInputOrOutputType.FP32
+    ):
+        dtype_cast_method = 'to'
+        dtype_cast_method_dtype = torch.float16
+    else:
+        raise AssertionError(
+            f"dtype cast from {node_input_type_c} {node_c.format_node()} to " +
+            f"{node_input_type_a} {node_a.format_node()} needs to be implemented")
+
+    if isinstance(prev_node_c, Node):
+        new_dtype_cast_name = \
+            get_new_attr_name_with_prefix(node_name_prefix)(gm_b)
+        if dtype_cast_op:
+            if dtype_cast_scale is not None and dtype_cast_zero_point is not None:
+                return _insert_quantize_per_tensor_node(
+                    prev_node_c, node_a, gm_b, graph_c, dtype_cast_scale,
+                    dtype_cast_zero_point, new_dtype_cast_name)
+            else:
+                return graph_c.create_node(
+                    'call_function', dtype_cast_op, (prev_node_c,), {},
+                    new_dtype_cast_name)
+        elif dtype_cast_method:
+            return graph_c.create_node(
+                'call_method', dtype_cast_method,
+                (prev_node_c, dtype_cast_method_dtype), {}, new_dtype_cast_name)
+        else:
+            assert dtype_cast_mod_cls
+            dtype_cast_mod = dtype_cast_mod_cls()
+            setattr(gm_b, new_dtype_cast_name, dtype_cast_mod)
+            return graph_c.create_node(
+                'call_module', new_dtype_cast_name, (prev_node_c,), {},
+                new_dtype_cast_name)
+    elif isinstance(prev_node_c, list):
+        results = []
+        for prev_node_c_inner in prev_node_c:
+            new_dtype_cast_name = \
+                get_new_attr_name_with_prefix(node_name_prefix)(gm_b)
+            if dtype_cast_op:
+                # TODO(future PR): add handling for quantize_per_tensor
+                new_dtype_cast_node = graph_c.create_node(
+                    'call_function', dtype_cast_op, (prev_node_c_inner,), {},
+                    new_dtype_cast_name)
+                results.append(new_dtype_cast_node)
+            else:
+                assert dtype_cast_mod_cls
+                dtype_cast_mod = dtype_cast_mod_cls()
+                setattr(gm_b, new_dtype_cast_name, dtype_cast_mod)
+                new_dtype_cast_node = graph_c.create_node(
+                    'call_module', new_dtype_cast_name, (prev_node_c_inner,), {},
+                    new_dtype_cast_name)
+                results.append(new_dtype_cast_node)
+        return results
+    else:
+        raise AssertionError(f"type f{type(prev_node_c)} is not handled")
+
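+# A hedged illustration (not called anywhere): the dispatch above summarized
+# as a lookup from (first input dtype of node_a, first input dtype of node_c)
+# to the cast that would be inserted. Matching known dtypes instead get a
+# pass-through torch.nn.Identity module, and unknown combinations raise.
+def _example_dtype_cast_table():
+    T = NodeInputOrOutputType
+    return {
+        (T.FP32, T.INT8): torch.dequantize,
+        (T.FP32, T.FP16): torch.dequantize,
+        (T.FP32, T.FP32_OR_INT8): torch.dequantize,
+        (T.INT8, T.FP32): torch.quantize_per_tensor,  # uses node_a's input qparams
+        (T.FP16, T.FP32): 'to',                        # i.e. Tensor.to(torch.float16)
+    }
+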
+# TODO(future PR): look into using copy_node API instead
+def _copy_node_from_a_to_c(
+    node_a: Node,
+    gm_a: GraphModule,
+    gm_b: GraphModule,
+    graph_c: Graph,
+) -> Node:
+    """
+    Simple copy of node_a to graph_c.
+    """
+    if node_a.op == 'get_attr':
+        node_a_copy_name = \
+            get_new_attr_name_with_prefix(node_a.name + '_shadow_copy_')(gm_b)
+        node_a_obj = getattr_from_fqn(gm_a, node_a.target)  # type: ignore[arg-type]
+        if torch.is_tensor(node_a_obj):
+            node_a_obj = node_a_obj.detach()
+        setattr(gm_b, node_a_copy_name, node_a_obj)
+        node_a_copy = graph_c.create_node(
+            node_a.op, node_a_copy_name, (), {}, node_a_copy_name)
+        return node_a_copy
+    elif node_a.op == 'call_method':
+        assert node_a.target in ('dequantize', 'to'), \
+            f"target {node_a.target} is not implemented"
+        if node_a.target == 'dequantize':
+            arg_copy = _copy_node_from_a_to_c(
+                get_normalized_nth_input(node_a, gm_a, 0),
+                gm_a, gm_b, graph_c)  # type: ignore[arg-type]
+            node_a_copy_name = \
+                get_new_attr_name_with_prefix(node_a.name + '_shadow_copy_')(gm_b)
+            node_a_copy = graph_c.create_node(
+                node_a.op, node_a.target, (arg_copy,), {}, node_a_copy_name)
+            return node_a_copy
+        else:  # to
+            arg_copy = _copy_node_from_a_to_c(
+                get_normalized_nth_input(node_a, gm_a, 0), gm_a, gm_b, graph_c)  # type: ignore[arg-type]
+            node_a_copy_name = \
+                get_new_attr_name_with_prefix(node_a.name + '_shadow_copy_')(gm_b)
+            node_a_copy = graph_c.create_node(
+                node_a.op, node_a.target,
+                (arg_copy, get_normalized_nth_input(node_a, gm_a, 1)),
+                {}, node_a_copy_name)
+            return node_a_copy
+
+    else:
+        raise AssertionError(
+            f"handling of node {node_a.format_node()} with op {node_a.op} is not implemented")
+
+def _can_insert_copy_of_subgraph_a(
+    subgraph_a: NSSubgraph,
+    gm_a: GraphModule,
+    num_non_param_args_node_a: int,
+) -> bool:
+    """
+    This function returns `False` if the input subgraph cannot be copied by
+    `_insert_copy_of_subgraph_a_after_input_node_c`. This usually means
+    that there is a corner case logic for which copy is not yet implemented.
+    """
+    # populate the list of nodes we need to check
+    nodes = []
+    cur_node = subgraph_a.end_node
+    while cur_node != subgraph_a.start_node:
+        nodes.append(cur_node)
+        cur_node = get_normalized_nth_input(cur_node, gm_a, 0)  # type: ignore[assignment]
+    nodes.append(cur_node)
+    nodes.reverse()
+
+    def _can_insert(node_a_arg, gm_a):
+        if isinstance(node_a_arg, Node):
+            arg_a = return_first_non_observer_node(node_a_arg, gm_a)
+            if arg_a.op == 'call_method':
+                return arg_a.target in ('dequantize', 'to')
+            elif arg_a.op == 'get_attr':
+                return True
+            else:
+                return False
+        elif isinstance(node_a_arg, (list, tuple)):
+            for el in node_a_arg:
+                if not isinstance(el, Node):
+                    return False
+        return True
+
+    # For each node, check if we handle the copy behavior. This follows the
+    # logic in `_insert_copy_of_subgraph_a_after_input_node_c`.
+    for node_a in nodes:
+
+        local_num_non_param_args_node_a = num_non_param_args_node_a \
+            if node_a is nodes[0] else 1
+
+        norm_args_kwargs = node_a.normalized_arguments(
+            gm_a, normalize_to_only_use_kwargs=True)
+        if norm_args_kwargs is not None:
+            norm_args, norm_kwargs = norm_args_kwargs
+        else:
+            norm_args, norm_kwargs = node_a.args, node_a.kwargs
+
+        cur_idx = 0
+
+        while cur_idx < len(norm_args):
+            if cur_idx == 0:
+                pass
+            elif cur_idx == 1 and local_num_non_param_args_node_a == 2:
+                pass
+            else:
+                if not _can_insert(norm_args[cur_idx], gm_a):
+                    return False
+            cur_idx += 1
+
+        for kwarg_val in norm_kwargs.values():
+            # stitch the inputs from base graph
+            if cur_idx == 0:
+                pass
+            elif cur_idx == 1 and local_num_non_param_args_node_a == 2:
+                pass
+            else:
+                if not _can_insert(kwarg_val, gm_a):
+                    return False
+            cur_idx += 1
+
+    return True
+
+def _insert_copy_of_subgraph_a_after_input_node_c(
+    input_node_c: Union[Node, List[Node]],
+    input_node_c_2: Optional[Union[Node, List[Node]]],
+    subgraph_a: NSSubgraph,
+    gm_a: GraphModule,
+    gm_b: GraphModule,
+    node_name_prefix: str,
+) -> Node:
+    """
+    Inserts a copy of the nodes of subgraph_a (from gm_a) into the graph
+    containing input_node_c, in execution order, wiring input_node_c (and
+    optionally input_node_c_2) as the non-param input(s) of the first copied
+    node. Returns the last node inserted.
+    """
+    if isinstance(input_node_c, Node):
+        graph_c = input_node_c.graph
+    else:
+        assert isinstance(input_node_c, list)
+        graph_c = input_node_c[0].graph
+
+    # create a sequential list of the subgraphs' nodes from start to end,
+    # because we need to add the nodes to graph C in non-reverse order
+    nodes_of_a = [subgraph_a.end_node]
+    cur_node = subgraph_a.end_node
+    while cur_node != subgraph_a.start_node:
+        cur_node = get_normalized_nth_input(cur_node, gm_a, 0)  # type: ignore[assignment]
+        nodes_of_a.insert(0, cur_node)
+
+    # go through nodes of a in order, and insert them into the graph of c
+    # sequentially
+    cur_node_a = nodes_of_a[0]
+    cur_node_c = _insert_copy_of_node_a_after_input_node_c(
+        input_node_c,
+        input_node_c_2,
+        cur_node_a,
+        gm_a,
+        gm_b,
+        node_name_prefix)
+    for cur_idx_a in range(1, len(nodes_of_a)):
+        cur_node_a = nodes_of_a[cur_idx_a]
+        prev_node_c = cur_node_c  # previous added node is the input to next node
+        cur_node_c = _insert_copy_of_node_a_after_input_node_c(
+            prev_node_c,
+            # TODO(future PR): enable multiple inputs for nodes which are not at start of subgraph
+            None,
+            cur_node_a,
+            gm_a,
+            gm_b,
+            node_name_prefix)
+    # return the last inserted node
+    return cur_node_c
+
+
+def _insert_copy_of_node_a_after_input_node_c(
+    input_node_c: Union[Node, List[Node]],
+    input_node_c_2: Optional[Union[Node, List[Node]]],
+    node_a: Node,
+    gm_a: GraphModule,
+    gm_b: GraphModule,
+    node_name_prefix: str,
+) -> Node:
+    """
+    Assume that node_a from graph_a has
+      args (input, (input2)?, arg1, ...), and
+      kwargs {kw0: kwarg0, ...}
+
+    Note: input2 is optional. If it is None, we assume that the op
+    has a single non-param input. If it is specified, we assume that the op
+    has two non-param inputs.
+
+    Copies the underlying values of arg1..argn and kwarg0..kwargn into gm_b,
+    and creates the corresponding nodes in graph_c. Note: observers are ignored,
+    so if an arg is an observer we navigate up until we find a non-observer parent.
+
+    If node_a is a call_module, the module that node_a points to in gm_a is
+    attached to gm_b under a new name.
+
+    Creates the copy of node_a in graph_c, with input as the first arg,
+    and all other args and kwargs pointing to the copies of the objects
+    in gm_b created above.
+
+    An example in pictures:
+
+    graph A:
+    ========
+
+    input -------------> node_a
+                         / / /
+    (input_2)?----------/ / /
+                         / /
+    weight -> weight_obs  /
+                         /
+    bias ----------------
+
+    graph C (derived from B):
+    =========================
+
+    input_node_c --> node_a_copy
+                     / / /
+    (input_node_c_2)? / /
+                     / /
+    weight_copy ----/ /
+                     /
+    bias_copy ------/
+    """
+    if isinstance(input_node_c, Node):
+        graph_c = input_node_c.graph
+    else:
+        assert isinstance(input_node_c, list)
+        graph_c = input_node_c[0].graph
+
+    norm_args_kwargs = node_a.normalized_arguments(
+        gm_a, normalize_to_only_use_kwargs=True)
+    if norm_args_kwargs is not None:
+        norm_args, norm_kwargs = norm_args_kwargs
+    else:
+        norm_args, norm_kwargs = node_a.args, node_a.kwargs
+
+    new_args = []
+    new_kwargs = {}
+
+    def _copy_arg(arg):
+        # copy the other inputs from the other graph
+        if isinstance(arg, Node):
+            arg = return_first_non_observer_node(arg, gm_a)
+            arg = _copy_node_from_a_to_c(arg, gm_a, gm_b, graph_c)
+            return arg
+        elif isinstance(arg, (int, float, torch.dtype)):
+            return arg
+        elif isinstance(arg, (list, tuple)):
+            for el in arg:
+                assert not isinstance(el, Node), \
+                    "handling of Node inside list is not implemented"
+            return arg
+        else:
+            raise AssertionError(
+                f"handling for arg of type {type(arg)} is not implemented")
+
+    cur_idx = 0
+
+    while cur_idx < len(norm_args):
+        if cur_idx == 0:
+            new_arg = input_node_c
+        elif cur_idx == 1 and input_node_c_2 is not None:
+            new_arg = input_node_c_2
+        else:
+            new_arg = _copy_arg(norm_args[cur_idx])
+        new_args.append(new_arg)
+        cur_idx += 1
+
+    for kwarg_name, kwarg_val in norm_kwargs.items():
+        # stitch the inputs from base graph
+        if cur_idx == 0:
+            new_kwargs[kwarg_name] = input_node_c
+        elif cur_idx == 1 and input_node_c_2 is not None:
+            new_kwargs[kwarg_name] = input_node_c_2
+        else:
+            new_kwargs[kwarg_name] = _copy_arg(kwarg_val)
+        cur_idx += 1
+
+    new_args = tuple(new_args)  # type: ignore[assignment]
+
+    node_a_shadows_c_name = \
+        get_new_attr_name_with_prefix(node_name_prefix)(gm_b)
+
+    if node_a.op == 'call_module':
+        # if target is a module, we point to the module from gm_b
+        new_mod_copy_name = \
+            get_new_attr_name_with_prefix(node_name_prefix)(gm_b)
+        # fetch the corresponding module from gm_a
+        assert isinstance(node_a.target, str)
+        mod_a = getattr_from_fqn(gm_a, node_a.target)
+        setattr(gm_b, new_mod_copy_name, mod_a)
+        node_a_shadows_c = graph_c.create_node(
+            node_a.op, new_mod_copy_name, new_args,
+            new_kwargs, node_a_shadows_c_name)
+        return node_a_shadows_c
+    else:
+        assert node_a.op in ('call_function', 'call_method')
+        node_a_shadows_c = graph_c.create_node(
+            node_a.op, node_a.target, new_args,
+            new_kwargs, node_a_shadows_c_name)
+        return node_a_shadows_c
+
+def create_a_shadows_b(
+    name_a: str,
+    gm_a: GraphModule,
+    name_b: str,
+    gm_b: GraphModule,
+    matched_subgraph_pairs: Dict[str, Tuple[NSSubgraph, NSSubgraph]],
+    logger_cls: Callable,
+    should_log_inputs: bool,
+    node_type_to_io_type_map: Optional[Dict[str, Set[NSNodeTargetType]]] = None,
+) -> GraphModule:
+    """
+    Creates a new GraphModule consisting of the graph of C, with the meaningful
+    nodes of A shadowing the corresponding nodes of B.  For example,
+
+    Graph A:
+    a0 -> op0_fp32 -> a1 -> op1_fp32 -> a2
+
+    Graph B:
+    b0 -> op0_int8 -> b1 -> op1_int8 -> b2
+
+    matched_node_pairs: {'op0': (op0_fp32, op0_int8), 'op1': (op1_fp32, op1_int8)}
+
+    Graph C (A shadows B):
+
+        / dequant0 -> op0_fp32 -> logger_a_0  / dequant_1 -> op1_fp32 -> logger_a_1
+       /                                     /
+    b0 -------------> op0_int8 -> logger_b_0 --------------> op1_int8 -> logger_b_1
+
+    In a nutshell, this function does the following for each node pair:
+    * copies the necessary attributes and modules from gm_a to gm_b,
+      keeping names unique
+    * adds a dtype cast op (dequant, quant, etc)
+    * adds a copy of node_a in gm_b's graph
+    * adds loggers to the outputs of node_a and node_b
+    """
+
+    if node_type_to_io_type_map is None:
+        node_type_to_io_type_map = get_node_type_to_io_type_map()
+
+    # graph_c is the graph created from copying the nodes of graph_b and inserting
+    # the shadows with the nodes copied from graph_a
+    graph_c = Graph()
+    env_c: Dict[str, Any] = {}
+    modules = dict(gm_b.named_modules())
+
+    def load_arg(a):
+        return map_arg(a, lambda node: env_c[node.name])
+
+    start_node_b_to_matched_subgraph_a_and_name = {}
+    end_node_b_to_matched_subgraph_a_and_name = {}
+    for match_name, match in matched_subgraph_pairs.items():
+        subgraph_a, subgraph_b = match
+        ref_node_type_a = get_target_type_str(subgraph_a.base_op_node, gm_a)
+        ref_node_type_b = get_target_type_str(subgraph_b.base_op_node, gm_b)
+        start_node_b_to_matched_subgraph_a_and_name[subgraph_b.start_node] = \
+            (subgraph_a, match_name, ref_node_type_a, ref_node_type_b)
+        end_node_b_to_matched_subgraph_a_and_name[subgraph_b.end_node] = \
+            (subgraph_a, match_name, ref_node_type_a, ref_node_type_b)
+
+    for node_b in gm_b.graph.nodes:
+        if node_b.op == 'output':
+            graph_c.output(map_arg(node_b.args[0], load_arg))
+            continue
+
+        # calculate the flags to determine what to do with this node
+        node_b_is_start_node = node_b in start_node_b_to_matched_subgraph_a_and_name
+        node_b_is_end_node = node_b in end_node_b_to_matched_subgraph_a_and_name
+
+        if (node_b_is_start_node or node_b_is_end_node):
+
+            if node_b_is_start_node:
+                subgraph_a, ref_name, ref_node_type_a, ref_node_type_b = \
+                    start_node_b_to_matched_subgraph_a_and_name[node_b]
+            else:
+                assert node_b_is_end_node
+                subgraph_a, ref_name, ref_node_type_a, ref_node_type_b = \
+                    end_node_b_to_matched_subgraph_a_and_name[node_b]
+
+            all_op_types_support_shadowing = (
+                op_type_supports_shadowing(subgraph_a.start_node) and
+                op_type_supports_shadowing(node_b)
+            )
+            if not all_op_types_support_shadowing:
+                print(
+                    f'skipping shadow loggers for node_b: {get_target_type_str(node_b, gm_b)}' +
+                    f', start_node_a: {get_target_type_str(subgraph_a.start_node, gm_a)}' +
+                    ', unsupported')
+                env_c[node_b.name] = graph_c.node_copy(node_b, load_arg)
+                continue
+
+            # For both start_node and end_node verify that we know how to do
+            # the dtype cast. If we do not, skip.
+            node_input_type_a, node_output_type_a = \
+                get_node_first_input_and_output_type(
+                    subgraph_a.start_node, gm_a, logger_cls,
+                    node_type_to_io_type_map)
+            node_input_type_b, node_output_type_b = \
+                get_node_first_input_and_output_type(
+                    node_b, gm_b, logger_cls,
+                    node_type_to_io_type_map)
+            node_io_types_known_a_and_b = (
+                node_input_type_a != NodeInputOrOutputType.UNKNOWN and
+                node_output_type_a != NodeInputOrOutputType.UNKNOWN and
+                node_input_type_b != NodeInputOrOutputType.UNKNOWN and
+                node_output_type_b != NodeInputOrOutputType.UNKNOWN
+            )
+            if not node_io_types_known_a_and_b:
+                print(
+                    f'skipping shadow loggers for node_b: {get_target_type_str(node_b, gm_b)}' +
+                    f', start_node_a: {get_target_type_str(subgraph_a.start_node, gm_a)}' +
+                    ', unknown dtype cast')
+                env_c[node_b.name] = graph_c.node_copy(node_b, load_arg)
+                continue
+
+            # If we are shadowing from fp32 to int8, we need to insert
+            # quantize_per_tensor call with qparams from the previous node.
+            # Only do this if we are able to infer these qparams from the graph.
+            if (
+                node_input_type_a == NodeInputOrOutputType.INT8 and
+                node_input_type_b == NodeInputOrOutputType.FP32
+            ):
+                node_a_input_qparams = get_node_input_qparams(
+                    subgraph_a.start_node, gm_a, node_type_to_io_type_map)
+                if not node_a_input_qparams:
+                    print(
+                        f'skipping shadow loggers for node_b: {get_target_type_str(node_b, gm_b)}' +
+                        f', start_node_a: {get_target_type_str(subgraph_a.start_node, gm_a)}' +
+                        ', unknown input qparams')
+                    env_c[node_b.name] = graph_c.node_copy(node_b, load_arg)
+                    continue
+
+            num_non_param_args_node_a = \
+                get_number_of_non_param_args(subgraph_a.start_node, gm_a)
+            if not _can_insert_copy_of_subgraph_a(subgraph_a, gm_a, num_non_param_args_node_a):
+                print(
+                    f'skipping shadow loggers for node_b: {get_target_type_str(node_b, gm_b)}' +
+                    f', start_node_a: {get_target_type_str(subgraph_a.start_node, gm_a)}' +
+                    ', unhandled logic in subgraph copy')
+                env_c[node_b.name] = graph_c.node_copy(node_b, load_arg)
+                continue
+
+            fqn_base_a = _maybe_get_fqn(subgraph_a.base_op_node, gm_a)
+            fqn_base_b = _maybe_get_fqn(subgraph_b.base_op_node, gm_b)  # type: ignore[possibly-undefined]
+
+            if node_b_is_start_node:
+
+                # if necessary, log the input of node_c
+                if should_log_inputs:
+                    prev_node_b = get_normalized_nth_input(node_b, gm_b, 0)
+                    if isinstance(prev_node_b, Node):
+                        prev_node_c = env_c[prev_node_b.name]
+                        env_c[prev_node_c.name] = _insert_logger_after_node(
+                            prev_node_c, gm_b, logger_cls, '_ns_logger_b_inp_',
+                            node_b.name, name_b, ref_name, ref_node_type_b,
+                            NSSingleResultValuesType.NODE_INPUT.value,
+                            index_within_arg=0, index_of_arg=0,
+                            fqn=fqn_base_b)
+                    elif isinstance(prev_node_b, list):
+                        # first, save the prev_node instances, because they
+                        # will be overwritten in the env after the first logger
+                        # is added
+                        prev_node_c_list = [env_c[arg.name] for arg in prev_node_b]
+
+                        for arg_idx, arg in enumerate(prev_node_b):
+                            prev_node_c = prev_node_c_list[arg_idx]
+                            env_c[prev_node_c.name] = _insert_logger_after_node(
+                                prev_node_c, gm_b, logger_cls, '_ns_logger_b_inp_',
+                                node_b.name, name_b, ref_name, ref_node_type_b,
+                                NSSingleResultValuesType.NODE_INPUT.value,
+                                index_within_arg=arg_idx, index_of_arg=0,
+                                fqn=fqn_base_b)
+                    else:
+                        # logging of inputs which are neither Nodes nor lists is not supported yet
+                        raise AssertionError(f"type {type(prev_node_b)} is not handled yet")
+                # subgraph so far:
+                #
+                # (prev_node_c)+ -> (logger_c_input)?
+
+            # Note: this if statement is always True, spelling it out to clarify code
+            # intent.
+            if node_b_is_start_node or node_b_is_end_node:
+                # ensure env_c is populated with base node
+                env_c[node_b.name] = graph_c.node_copy(node_b, load_arg)
+                node_c = env_c[node_b.name]
+
+                # after this point,
+                #
+                # node_a is the original node from graph_a, with parent module gm_a
+                # node_b is the original node from graph_b, with parent module gm_b
+                # node_c is the copy of node_b in graph_c
+                #
+                # subgraph so far:
+                #
+                # (prev_node_c)+ -> (logger_c_input)? -> node_start_c
+
+            if node_b_is_start_node:
+
+                # cast dtype from the dtype of node_c's input to the dtype of
+                # node_a's input (dequant, etc)
+                # prev_node_c = node_c.args[0]
+                prev_node_c = get_normalized_nth_input(node_c, gm_b, 0)  # type: ignore[possibly-undefined]
+                if should_log_inputs:
+                    # skip the input logger when inserting a dtype cast
+                    if isinstance(prev_node_c, Node):
+                        prev_node_c = get_normalized_nth_input(node_c, gm_b, 0)
+                    elif isinstance(prev_node_c, list):
+                        prev_node_c = [get_normalized_nth_input(arg, gm_b, 0) for arg in prev_node_c]
+                dtype_cast_node = _insert_dtype_cast_after_node(
+                    subgraph_a.start_node, node_c, prev_node_c, gm_a, gm_b, graph_c,
+                    node_b.name + '_dtype_cast_', logger_cls,
+                    node_type_to_io_type_map)
+                # note: not inserting to env_c because all nodes which use the dtype
+                #   casts are copied from graph_a
+                #
+                # subgraph so far:
+                #
+                #           (dtype_cast_node)+
+                #                  /
+                # (prev_node_c)+ -> (logger_c_input)? -> node_start_c
+
+                # if input logging is enabled, log the input to the subgraph
+                if should_log_inputs:
+                    # the true ref_node_name is not known yet; it is filled in
+                    # below, after the subgraph copy has been created
+                    ref_node_name = ''
+                    if isinstance(dtype_cast_node, Node):
+                        dtype_cast_node = _insert_logger_after_node(
+                            dtype_cast_node, gm_b, logger_cls, '_ns_logger_a_inp_',
+                            ref_node_name, name_a, ref_name, ref_node_type_a,
+                            NSSingleResultValuesType.NODE_INPUT.value,
+                            index_within_arg=0, index_of_arg=0,
+                            fqn=fqn_base_a)
+                        input_logger: Union[Node, List[Node]] = dtype_cast_node
+                    else:
+                        assert isinstance(dtype_cast_node, list)
+                        new_loggers = []
+                        for dtype_cast_idx, dtype_cast_node_inner in enumerate(dtype_cast_node):
+                            dtype_cast_logger = _insert_logger_after_node(
+                                dtype_cast_node_inner, gm_b, logger_cls, '_ns_logger_a_inp_',
+                                ref_node_name, name_a, ref_name, ref_node_type_a,
+                                NSSingleResultValuesType.NODE_INPUT.value,
+                                index_within_arg=dtype_cast_idx,
+                                index_of_arg=0,
+                                fqn=fqn_base_a)
+                            new_loggers.append(dtype_cast_logger)
+                        dtype_cast_node = new_loggers
+                        input_logger = dtype_cast_node
+                    # subgraph so far:
+                    #
+                    #       (dtype_cast_node)+ -> (logger_a_input)?
+                    #                  /
+                    # prev_node_c -> (logger_c_input)? -> node_start_c
+
+                # hook up the new mod_a copy to be in the graph, receiving the
+                # same inputs as mod_b does, with dtype cast to match a
+                # Some ops, such as LSTMs, have two non-param inputs. If we have
+                # such an op, pass the second input as well. Note: dtype casting
+                # for the second input is not implemented yet; it can be added
+                # later if there is a use case.
+                node_c_second_non_param_arg = None
+                num_non_param_args_node_a = get_number_of_non_param_args(subgraph_a.start_node, gm_a)
+                if num_non_param_args_node_a == 2:
+                    # node_c_second_non_param_arg = node_c.args[1]
+                    node_c_second_non_param_arg = get_normalized_nth_input(node_c, gm_b, 1)
+                node_a_shadows_c = _insert_copy_of_subgraph_a_after_input_node_c(
+                    dtype_cast_node, node_c_second_non_param_arg,
+                    subgraph_a, gm_a, gm_b, node_c.name + '_shadow_copy_')
+                env_c[node_a_shadows_c.name] = node_a_shadows_c
+                # subgraph so far:
+                #
+                #       dtype_cast_node -> (logger_a_input)? -> subgraph_a_copy(args/kwargs not shown)
+                #                  /
+                # (prev_node_c)+ -> (logger_c_input)? -> node_start_c
+
+                if should_log_inputs:
+                    # When we created the input logger, we left the ref_node_name
+                    # as an empty string, because the subgraph copy did not exist
+                    # yet. Now that the subgraph copy exists, we modify this name
+                    # to its true value.
+                    # Note: the alternative to this is to create the input logger
+                    # after creating the subgraph, which is slightly more
+                    # complicated. This is the lesser of two evils.
+                    # input_logger = env_c[dtype_cast_node.name]
+                    # Find the first node in the subgraph
+                    cur_node = node_a_shadows_c
+                    while get_normalized_nth_input(cur_node, gm_b, 0) != input_logger:  # type: ignore[possibly-undefined]
+                        cur_node = get_normalized_nth_input(cur_node, gm_b, 0)  # type: ignore[assignment]
+                    if isinstance(input_logger, Node):
+                        input_logger_mod = getattr(gm_b, input_logger.name)
+                        input_logger_mod.ref_node_name = cur_node.name
+                    else:
+                        assert isinstance(input_logger, list)
+                        for input_logger_inner in input_logger:
+                            input_logger_mod = getattr(gm_b, input_logger_inner.name)
+                            input_logger_mod.ref_node_name = cur_node.name
+
+                # hook up a logger to the mod_a copy
+                env_c[node_a_shadows_c.name] = _insert_logger_after_node(
+                    env_c[node_a_shadows_c.name], gm_b, logger_cls, '_ns_logger_a_',
+                    node_a_shadows_c.name, name_a, ref_name, ref_node_type_a,
+                    NSSingleResultValuesType.NODE_OUTPUT.value,
+                    index_within_arg=0, index_of_arg=0,
+                    fqn=fqn_base_a)
+                # subgraph so far:
+                #
+                #       dtype_cast_node -> (logger_a_input)? -> subgraph_a_copy -> logger_a
+                #                  /
+                # (prev_node_c)+ -> (logger_c_input)? -> node_start_c
+
+            if node_b_is_end_node:
+
+                # hook up a logger to the mod_b copy
+                env_c[node_b.name] = _insert_logger_after_node(
+                    env_c[node_b.name], gm_b, logger_cls, '_ns_logger_b_',
+                    node_b.name, name_b, ref_name, ref_node_type_b,
+                    NSSingleResultValuesType.NODE_OUTPUT.value,
+                    index_within_arg=0, index_of_arg=0,
+                    fqn=fqn_base_b)
+                # subgraph so far:
+                #
+                #       dtype_cast_node -> (logger_a_input)? -> subgraph_a_copy -> logger_a
+                #                  /
+                # (prev_node_c+) -> (logger_c_input)? -> node_start_c -> ... -> node_end_c -> logger_c
+                #
+                # Note: node_start_c may be the same node as node_end_c, or they
+                # may have nodes in between.
+
+        else:
+            env_c[node_b.name] = graph_c.node_copy(node_b, load_arg)
+
+    gm_c = GraphModule(gm_b, graph_c)
+    return gm_c
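+
+# A hedged end-to-end sketch of how `create_a_shadows_b` can be driven; the
+# toy model, names, and calibration here are illustrative, and the public
+# wrapper `torch.ao.ns._numeric_suite_fx.add_shadow_loggers` performs an
+# equivalent flow with additional handling.
+def _example_create_a_shadows_b():
+    import copy
+
+    from torch.ao.ns._numeric_suite_fx import OutputLogger
+    from torch.ao.ns.fx.graph_matcher import get_matching_subgraph_pairs
+    from torch.ao.quantization import get_default_qconfig_mapping
+    from torch.ao.quantization.quantize_fx import convert_fx, prepare_fx
+
+    class _ExampleM(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.fc = torch.nn.Linear(2, 2)
+
+        def forward(self, x):
+            return self.fc(x)
+
+    example_inputs = (torch.randn(1, 2),)
+    m_fp32 = _ExampleM().eval()
+
+    # quantize a copy of the model with FX graph mode quantization
+    m_prepared = prepare_fx(
+        copy.deepcopy(m_fp32), get_default_qconfig_mapping(), example_inputs)
+    m_prepared(*example_inputs)  # calibrate
+    m_int8 = convert_fx(m_prepared)
+
+    # match the fp32 and int8 graphs, then let fp32 (A) shadow int8 (B)
+    gm_fp32 = torch.fx.symbolic_trace(m_fp32)
+    matches = get_matching_subgraph_pairs(gm_fp32, m_int8)
+    shadowed = create_a_shadows_b(
+        'fp32', gm_fp32, 'int8', m_int8, matches, OutputLogger,
+        should_log_inputs=False)
+
+    # running the combined model populates both sets of output loggers
+    shadowed(*example_inputs)
+    return shadowed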
diff --git a/MLPY/Lib/site-packages/torch/ao/ns/fx/mappings.py b/MLPY/Lib/site-packages/torch/ao/ns/fx/mappings.py
new file mode 100644
index 0000000000000000000000000000000000000000..42bf49f74c958da1b456f616f5f5d28c5c714d79
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/ns/fx/mappings.py
@@ -0,0 +1,761 @@
+import operator
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+toq = torch.ops.quantized
+
+import torch.ao.nn.quantized as nnq
+import torch.ao.nn.quantized.dynamic as nnqd
+import torch.ao.nn.intrinsic.quantized as nniq
+import torch.ao.nn.intrinsic.quantized.dynamic as nniqd
+import torch.ao.nn.intrinsic.qat as nniqat
+import torch.ao.nn.intrinsic as nni
+import torch.ao.nn.qat as nnqat
+import torch.ao.nn.qat.dynamic as nnqatd
+from torch.ao.quantization.backend_config import get_native_backend_config
+import torch.ao.quantization.fx._lower_to_native_backend as \
+    _lower_to_native_backend
+import torch.ao.quantization.quantization_mappings as quantization_mappings
+
+from .ns_types import NSNodeTargetType
+
+from typing import Callable, Dict, List, Optional, Set, Tuple
+
+
+def get_base_name_to_sets_of_related_ops() -> Dict[str, Set[NSNodeTargetType]]:
+    # note: this set is modified below by items from backend_config
+    sets_of_related_ops: List[Set[NSNodeTargetType]] = [
+        # conv modules
+        {
+            nn.Conv1d,
+        },
+        {
+            nn.Conv2d,
+        },
+        {
+            nn.Conv3d,
+        },
+        # conv functionals
+        {
+            F.conv1d,
+        },
+        {
+            F.conv2d,
+        },
+        {
+            F.conv3d,
+        },
+        # linear modules
+        {
+            nn.Linear,
+        },
+        # linear functionals
+        {
+            F.linear,
+        },
+        # average pool
+        {
+            nn.AvgPool1d,
+            torch.avg_pool1d,
+        },
+        {
+            nn.AvgPool2d,
+            torch._C._nn.avg_pool2d,
+        },
+        {
+            nn.AvgPool3d,
+            torch._C._nn.avg_pool3d,
+        },
+        # adaptive average pool
+        {
+            nn.AdaptiveAvgPool1d,
+            F.adaptive_avg_pool1d,
+        },
+        {
+            nn.AdaptiveAvgPool2d,
+            F.adaptive_avg_pool2d,
+        },
+        {
+            nn.AdaptiveAvgPool3d,
+            F.adaptive_avg_pool3d,
+        },
+        # LSTM
+        {
+            nn.LSTM,
+        },
+        # add
+        {
+            torch.add,
+            operator.add,  # x + y
+        },
+        # cat
+        {
+            torch.cat,
+        },
+        # mul
+        {
+            torch.mul,
+            operator.mul,
+        },
+        # relu
+        {
+            F.relu,
+            nn.ReLU,
+            'relu',
+            'relu_',
+            torch.relu,
+        },
+        # maxpool
+        {
+            nn.MaxPool1d,
+            F.max_pool1d,
+        },
+        {
+            nn.MaxPool2d,
+            F.max_pool2d,
+        },
+        {
+            nn.MaxPool3d,
+            F.max_pool3d,
+        },
+        # sigmoid
+        {
+            torch.sigmoid,
+            'sigmoid',
+            'sigmoid_',
+            nn.Sigmoid,
+            F.sigmoid,
+        },
+        # BatchNorm
+        {
+            nn.BatchNorm2d,
+        },
+        {
+            nn.BatchNorm3d,
+        },
+        # ConvTranspose
+        {
+            nn.ConvTranspose1d,
+        },
+        {
+            nn.ConvTranspose2d,
+        },
+        {
+            nn.ConvTranspose3d,
+        },
+        # functional transposed conv
+        {
+            F.conv_transpose1d,
+        },
+        {
+            F.conv_transpose2d,
+        },
+        {
+            F.conv_transpose3d,
+        },
+        # ELU
+        {
+            nn.ELU,
+        },
+        # Embedding
+        {
+            nn.Embedding,
+        },
+        # EmbeddingBag
+        {
+            nn.EmbeddingBag,
+        },
+        # GroupNorm
+        {
+            nn.GroupNorm,
+        },
+        # Hardswish
+        {
+            nn.Hardswish,
+        },
+        # InstanceNorm
+        {
+            nn.InstanceNorm1d,
+        },
+        {
+            nn.InstanceNorm2d,
+        },
+        {
+            nn.InstanceNorm3d,
+        },
+        # LayerNorm
+        {
+            nn.LayerNorm,
+        },
+        # LeakyReLU
+        {
+            nn.LeakyReLU,
+        },
+        # ReLU6
+        {
+            nn.ReLU6,
+            F.relu6,
+        },
+        # F.elu
+        {
+            F.elu,
+        },
+        # F.hardswish
+        {
+            F.hardswish,
+        },
+        # F.group_norm
+        {
+            F.group_norm,
+        },
+        # F.instance_norm
+        {
+            F.instance_norm,
+        },
+        # F.layer_norm
+        {
+            F.layer_norm,
+        },
+        # F.leaky_relu
+        {
+            F.leaky_relu,
+        },
+        # F.silu
+        {
+            nn.SiLU,
+            F.silu,
+        },
+        # F.mish
+        {
+            nn.Mish,
+            F.mish,
+        },
+        # F.tanh
+        {
+            nn.Tanh,
+            F.tanh,
+            torch.tanh,
+            'tanh_',
+            'tanh',
+        },
+        # F.hardsigmoid
+        {
+            'hardsigmoid_',
+            'hardsigmoid',
+            F.hardsigmoid,
+            nn.Hardsigmoid,
+        },
+        # F.hardtanh
+        {
+            nn.Hardtanh,
+            F.hardtanh,
+            F.hardtanh_,
+        },
+        # floordiv
+        {
+            operator.floordiv,
+        },
+        # unsqueeze
+        {
+            torch.unsqueeze,
+        },
+        # stack
+        {
+            torch.stack,
+        },
+        # squeeze
+        {
+            torch.squeeze,
+        },
+        # sort
+        {
+            torch.sort,
+        },
+        # repeat_interleave
+        {
+            torch.repeat_interleave,
+        },
+        # min
+        {
+            torch.min,
+        },
+        # mean
+        {
+            torch.mean,
+        },
+        # max
+        {
+            torch.max,
+        },
+        # transpose
+        {
+            torch.transpose,
+        },
+        # flatten
+        {
+            torch.flatten,
+        },
+        # clamp
+        {
+            torch.clamp,
+        },
+        # chunk
+        {
+            torch.chunk,
+        },
+        # interpolate
+        {
+            torch.nn.functional.interpolate,
+        },
+        # dropout
+        {
+            nn.Dropout,
+        },
+        # F.dropout
+        {
+            F.dropout,
+        },
+        # matmul
+        {
+            torch.matmul,
+        },
+        # Softmax
+        {
+            nn.Softmax,
+        },
+        # PReLU
+        {
+            nn.PReLU,
+            nnq.PReLU,
+        },
+        # F.prelu
+        {
+            F.prelu,
+            toq.prelu,
+        },
+        # pixel shuffle
+        {
+            nn.PixelShuffle,
+        },
+        {
+            F.pixel_shuffle,
+        },
+        # pixel unshuffle
+        {
+            nn.PixelUnshuffle,
+        },
+        {
+            F.pixel_unshuffle,
+        },
+        # narrow
+        {
+            torch.narrow,
+        },
+    ]
+
+    # for each floating point op, add versions of the op added by
+    # backend_config
+    backend_config = get_native_backend_config()
+
+    new_connections: List[Tuple[Callable, Callable]] = [
+        # technical debt edge case
+        (nn.Linear, nn.modules.linear.NonDynamicallyQuantizableLinear),
+    ]
+
+    for pattern, config in backend_config._pattern_complex_format_to_config.items():
+
+        # pattern format: (c, (b, a))
+        first_element = pattern
+        # look from the end, because pattern is in reverse order
+        while isinstance(first_element, (list, tuple)):
+            first_element = first_element[-1]
+
+        if config.fused_module is not None:
+            # case 1: pattern fuses a pattern of ops into an op
+            # example: nn.Conv1d, nn.ReLU fused into nni.ConvReLU1d
+            new_connections.append((first_element, config.fused_module))
+
+        if config.qat_module is not None:
+            # case 2: pattern swaps a module into a QAT module
+            # example: nni.ConvReLU1d swapped into nniqat.ConvReLU1d
+            new_connections.append((first_element, config.qat_module))
+
+        if config.reference_quantized_module is not None:
+            # case 3: reference version of floating point module, such as
+            # nn.Conv2d and nnqr.Conv2d
+            new_connections.append((first_element, config.reference_quantized_module))
+
+    #
+    # Add reference module swaps from default lowering path
+    #
+
+    for source_to_target in (
+        _lower_to_native_backend.STATIC_LOWER_MODULE_MAP,
+        _lower_to_native_backend.DYNAMIC_LOWER_MODULE_MAP,
+        _lower_to_native_backend.WEIGHT_ONLY_LOWER_MODULE_MAP,
+        _lower_to_native_backend.SPECIAL_PATTERN_LOWER_MODULE_MAP,
+    ):
+        for source, target in source_to_target.items():  # type: ignore[attr-defined]
+            new_connections.append((source, target))
+
+    for source_to_double_target in (
+        _lower_to_native_backend.STATIC_LOWER_FUSED_MODULE_MAP,
+        _lower_to_native_backend.STATIC_LOWER_FUSED_MODULE_TWO_INPUTS_MAP,
+        _lower_to_native_backend.DYNAMIC_LOWER_FUSED_MODULE_MAP,
+    ):
+        for source, (target1, target2) in source_to_double_target.items():  # type: ignore[attr-defined]
+            new_connections.append((source, target1))
+            new_connections.append((source, target2))
+
+    #
+    # Add function swaps from default lowering path
+    #
+
+    for source, (target1, target2) in \
+            _lower_to_native_backend.STATIC_LOWER_FUNCTIONAL_MAP.items():
+        new_connections.append((source, target1))
+        new_connections.append((source, target2))
+
+    for source_to_target in (
+        _lower_to_native_backend.QBIN_OP_MAPPING,
+        _lower_to_native_backend.QBIN_RELU_OP_MAPPING,
+        quantization_mappings.DEFAULT_FLOAT_TO_QUANTIZED_OPERATOR_MAPPINGS,
+    ):
+        for source, target in source_to_target.items():
+            new_connections.append((source, target))
+
+    #
+    # Add other swaps, ideally in the future this could be removed
+    # after the lowering code stops using these.
+    #
+    for source_to_target in (
+        quantization_mappings.DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS,
+    ):
+        for source, target in source_to_target.items():
+            new_connections.append((source, target))
+
+
+    # add the new connections from backend_config
+    for item1, item2 in new_connections:
+        for set_of_related_ops in sets_of_related_ops:
+            if item1 in set_of_related_ops or item2 in set_of_related_ops:
+                set_of_related_ops.add(item1)
+                set_of_related_ops.add(item2)
+                break
+
+    base_name_to_sets_of_related_ops: Dict[str, Set[NSNodeTargetType]] = {}
+
+    counter = 0
+    for set_of_related_ops in sets_of_related_ops:
+        base_name = str(counter)
+        counter += 1
+        base_name_to_sets_of_related_ops[base_name] = set_of_related_ops
+
+    return base_name_to_sets_of_related_ops
+
+
+def get_base_name_for_op(
+    base_name_to_sets_of_related_ops: Dict[str, Set[NSNodeTargetType]],
+    op: NSNodeTargetType,
+) -> Optional[str]:
+    for base_name, set_of_related_ops in base_name_to_sets_of_related_ops.items():
+        if op in set_of_related_ops:
+            return base_name
+    return None
+
+
+def add_op_to_sets_of_related_ops(
+    base_name_to_sets_of_related_ops: Dict[str, Set[NSNodeTargetType]],
+    op: NSNodeTargetType,
+    related_op: Optional[NSNodeTargetType],
+) -> None:
+    if related_op is not None:
+        for set_of_related_ops in base_name_to_sets_of_related_ops.values():
+            if related_op in set_of_related_ops:
+                set_of_related_ops.add(op)
+                return
+        # if we got here, related_op was not found
+        raise AssertionError(f"{related_op} was not found")
+    else:
+        counter = 0
+        while str(counter) in base_name_to_sets_of_related_ops:
+            counter += 1
+        base_name_to_sets_of_related_ops[str(counter)] = {op}
+
+
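+# A hedged usage sketch: ops that end up in the same set are treated as
+# "related" by the graph matcher. `_ExampleCustomLinear` is a hypothetical
+# user-defined module used only for illustration.
+def _example_related_ops_lookup():
+    base_map = get_base_name_to_sets_of_related_ops()
+
+    # the fp32 and int8 flavors of Linear resolve to the same base name
+    name_fp32 = get_base_name_for_op(base_map, nn.Linear)
+    name_int8 = get_base_name_for_op(base_map, nnq.Linear)
+    assert name_fp32 is not None and name_fp32 == name_int8
+
+    # registering a custom op as related to nn.Linear places it in that set
+    class _ExampleCustomLinear(nn.Module):
+        pass
+
+    add_op_to_sets_of_related_ops(base_map, _ExampleCustomLinear, nn.Linear)
+    assert get_base_name_for_op(base_map, _ExampleCustomLinear) == name_fp32
+    return base_map
+
+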
+# TODO(future PR): clean this up
+def get_node_type_to_io_type_map() -> Dict[str, Set[NSNodeTargetType]]:
+    FUNS_IO_TYPE_FP32: Set[NSNodeTargetType] = {
+        F.linear,
+        F.conv1d,
+        F.conv2d,
+        F.conv3d,
+        torch.cat,
+        F.elu,
+        F.hardswish,
+        F.instance_norm,
+        F.layer_norm,
+        F.leaky_relu,
+        F.dropout,
+        F.silu,
+        F.mish,
+        operator.add,
+        torch.add,
+        operator.mul,
+        torch.mul,
+        torch.sum,
+        F.prelu,
+    }
+
+    FUNS_IO_TYPE_FP16: Set[NSNodeTargetType] = set()
+
+    FUNS_IO_TYPE_INT8: Set[NSNodeTargetType] = {
+        toq.linear,
+        toq.linear_relu,
+        toq.conv1d,
+        toq.conv1d_relu,
+        toq.conv2d,
+        toq.conv2d_relu,
+        toq.conv3d,
+        toq.conv3d_relu,
+        toq.cat,
+        toq.elu,
+        toq.hardswish,
+        toq.instance_norm,
+        toq.layer_norm,
+        toq.leaky_relu,
+        toq.dropout,
+        toq.prelu,
+        # TODO(future PR): implement shadowing for binary ops and
+        # uncomment below
+        # toq.add,
+        # toq.mul,
+    }
+
+    FUNS_IO_TYPE_FP32_OR_INT8: Set[NSNodeTargetType] = {
+        F.relu,
+        F.tanh,
+        torch.tanh,
+        F.sigmoid,
+        torch.sigmoid,
+        F.hardsigmoid,
+        operator.floordiv,
+        torch.adaptive_avg_pool1d,
+        F.adaptive_avg_pool2d,
+        F.adaptive_avg_pool3d,
+        F.dropout,
+        F.hardtanh,
+        F.hardtanh_,
+        F.interpolate,
+        F.max_pool1d,
+        F.max_pool2d,
+        F.max_pool3d,
+        F.relu6,
+        F.pixel_shuffle,
+        F.pixel_unshuffle,
+        torch.avg_pool1d,
+        torch._C._nn.avg_pool2d,
+        torch._C._nn.avg_pool3d,
+        torch.cat,
+        torch.chunk,
+        torch.clamp,
+        torch.flatten,
+        torch.transpose,
+        torch.max,
+        torch.mean,
+        torch.min,
+        torch.narrow,
+        torch.repeat_interleave,
+        torch.sort,
+        torch.squeeze,
+        torch.stack,
+        torch.unsqueeze,
+        operator.add,
+    }
+
+    MODS_IO_TYPE_FP32: Set[NSNodeTargetType] = {
+        nn.Linear,
+        nnqat.Linear,
+        nnqatd.Linear,
+        nnqd.Linear,
+        torch.nn.modules.linear.NonDynamicallyQuantizableLinear,
+        nn.Conv1d,
+        nn.Conv2d,
+        nn.Conv3d,
+        nnqat.Conv1d,
+        nnqat.Conv2d,
+        nnqat.Conv3d,
+        nnqat.Embedding,
+        nnqat.EmbeddingBag,
+        nn.LSTM,
+        # note: nnqd.Linear is an instance of nnq.Linear, so this
+        # check has to happen before the int8 module check
+        nnqd.LSTM,
+        nn.BatchNorm2d,
+        nn.BatchNorm3d,
+        nn.Dropout,
+        nn.ConvTranspose1d,
+        nn.ConvTranspose2d,
+        nn.ConvTranspose3d,
+        nn.ELU,
+        nn.GroupNorm,
+        nn.InstanceNorm1d,
+        nn.InstanceNorm2d,
+        nn.InstanceNorm3d,
+        nn.LayerNorm,
+        nn.Hardswish,
+        nn.LeakyReLU,
+        nn.ReLU6,
+        nn.SiLU,
+        nn.Mish,
+        nn.Softmax,
+        nn.PReLU,
+        nni.BNReLU2d,
+        nni.BNReLU3d,
+        nni.ConvReLU1d,
+        nni.ConvReLU2d,
+        nni.ConvReLU3d,
+        nni.LinearReLU,
+        nni.LinearBn1d,
+        nni.ConvBn1d,
+        nni.ConvBn2d,
+        nni.ConvBn3d,
+        nniqat.ConvBn1d,
+        nniqat.ConvBn2d,
+        nniqat.ConvBn3d,
+        nniqat.ConvBnReLU1d,
+        nniqat.ConvBnReLU2d,
+        nniqat.ConvBnReLU3d,
+        nniqat.ConvReLU1d,
+        nniqat.ConvReLU2d,
+        nniqat.ConvReLU3d,
+        nniqat.LinearReLU,
+        nniqat.LinearBn1d,
+        nniqd.LinearReLU,
+        nni.LinearLeakyReLU,
+        nni.LinearTanh,
+        nni.ConvAdd2d,
+        nni.ConvAddReLU2d,
+    }
+
+    MODS_IO_TYPE_INT8: Set[NSNodeTargetType] = {
+        nnq.Linear,
+        nnq.Conv1d,
+        nnq.Conv2d,
+        nnq.Conv3d,
+        nnq.BatchNorm2d,
+        nnq.BatchNorm3d,
+        nnq.Dropout,
+        nnq.ConvTranspose1d,
+        nnq.ConvTranspose2d,
+        nnq.ELU,
+        nnq.InstanceNorm1d,
+        nnq.InstanceNorm2d,
+        nnq.InstanceNorm3d,
+        nnq.LayerNorm,
+        nnq.Hardswish,
+        nnq.LeakyReLU,
+        nnq.Embedding,
+        nnq.EmbeddingBag,
+        nnq.Softmax,
+        nnq.PReLU,
+        nniq.BNReLU2d,
+        nniq.BNReLU3d,
+        nniq.ConvReLU1d,
+        nniq.ConvReLU2d,
+        nniq.ConvReLU3d,
+        nniq.LinearReLU,
+        nniq.LinearLeakyReLU,
+        nniq.LinearTanh,
+        nniq.ConvAdd2d,
+        nniq.ConvAddReLU2d,
+    }
+
+    MODS_IO_TYPE_FP32_OR_INT8: Set[NSNodeTargetType] = {
+        nn.ReLU,
+        nn.Tanh,
+        nn.Sigmoid,
+        nn.Hardsigmoid,
+        nn.AdaptiveAvgPool1d,
+        nn.AdaptiveAvgPool2d,
+        nn.AdaptiveAvgPool3d,
+        nn.AvgPool1d,
+        nn.AvgPool2d,
+        nn.AvgPool3d,
+        nn.Dropout,
+        nn.Hardtanh,
+        nn.Identity,
+        nn.MaxPool1d,
+        nn.MaxPool2d,
+        nn.MaxPool3d,
+        nn.PixelShuffle,
+        nn.PixelUnshuffle,
+        nn.ReLU6,
+    }
+
+    METHS_IO_TYPE_FP32_OR_INT8: Set[NSNodeTargetType] = {
+        'sigmoid_',
+        'sigmoid',
+        'tanh_',
+        'tanh',
+        'hardsigmoid_',
+        'hardsigmoid',
+        'relu_',
+        'relu',
+    }
+
+    return {
+        'funs_io_type_fp32': FUNS_IO_TYPE_FP32,
+        'funs_io_type_fp16': FUNS_IO_TYPE_FP16,
+        'funs_io_type_int8': FUNS_IO_TYPE_INT8,
+        'funs_io_type_fp32_or_int8': FUNS_IO_TYPE_FP32_OR_INT8,
+        'mods_io_type_fp32': MODS_IO_TYPE_FP32,
+        'mods_io_type_int8': MODS_IO_TYPE_INT8,
+        'mods_io_type_fp32_or_int8': MODS_IO_TYPE_FP32_OR_INT8,
+        'meths_io_type_fp32_or_int8': METHS_IO_TYPE_FP32_OR_INT8,
+    }
+
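+# Illustrative usage sketch of the map above (hedged example only; the key
+# names and ops shown are taken from the sets defined in this function):
+#
+#   node_type_map = get_node_type_to_io_type_map()
+#   F.relu in node_type_map['funs_io_type_fp32_or_int8']  # True
+#   toq.linear in node_type_map['funs_io_type_int8']      # True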
+
+def get_unmatchable_types_map() -> Dict[str, Set[NSNodeTargetType]]:
+
+    FUNS_UNMATCHABLE: Set[NSNodeTargetType] = {
+        torch.quantize_per_tensor,
+        operator.getitem,
+    }
+
+    MODS_UNMATCHABLE: Set[NSNodeTargetType] = {
+        nn.Identity,
+    }
+
+    METHS_UNMATCHABLE: Set[NSNodeTargetType] = {
+        'to',
+        'dequantize',
+        'reshape',
+        'view',
+        'unsqueeze_',
+        'unsqueeze',
+        'transpose',
+        'squeeze_',
+        'squeeze',
+        'size',
+        'shape',
+        'resize_',
+        'repeat_interleave',
+        'repeat',
+        'permute',
+        'numel',
+        'mean',
+        'detach_',
+        'detach',
+        'contiguous',
+        'clamp',
+        'chunk',
+    }
+
+    return {
+        'funs_unmatchable': FUNS_UNMATCHABLE,
+        'mods_unmatchable': MODS_UNMATCHABLE,
+        'meths_unmatchable': METHS_UNMATCHABLE,
+    }
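+
+# Illustrative usage sketch of the map above (hedged example only; key names
+# and entries are taken from the sets defined in this function):
+#
+#   unmatchable = get_unmatchable_types_map()
+#   operator.getitem in unmatchable['funs_unmatchable']  # True
+#   'dequantize' in unmatchable['meths_unmatchable']     # True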
diff --git a/MLPY/Lib/site-packages/torch/ao/ns/fx/n_shadows_utils.py b/MLPY/Lib/site-packages/torch/ao/ns/fx/n_shadows_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..eda789155e55da8ed1aaead191ffd7b51f045b13
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/ns/fx/n_shadows_utils.py
@@ -0,0 +1,1311 @@
+import torch
+import torch.fx
+from torch.fx import (
+    Node,
+    GraphModule,
+    Graph,
+)
+
+from torch.ao.ns.fx.utils import (
+    # TODO(future PR): make this work correctly for methods
+    get_target_type_str,
+    get_normalized_nth_input,
+)
+from torch.ao.ns.fx.ns_types import (
+    NSSingleResultValuesType,
+    NSResultsType,
+)
+from torch.ao.ns.fx.graph_passes import _maybe_get_fqn
+from torch.ao.quantization import QConfigMapping
+from torch.ao.quantization.qconfig import QConfigAny
+from torch.ao.quantization.utils import getattr_from_fqn
+from torch.ao.quantization.fx.match_utils import _MatchResult
+from torch.utils._pytree import tree_map
+
+import collections
+import copy
+from typing import List, Dict, Set, Tuple, Callable, Any, Optional
+import operator
+
+SHADOW_NODE_NAME_PREFIX = 'shadow'
+SHADOW_WRAPPER_NODE_NAME_PREFIX = 'shadow_wrapper'
+
+# TODO(future PR): reuse existing mapping instead of creating a new one
+BINARY_FUNCTIONS = {
+    torch.add,
+    torch.Tensor.add,
+    operator.add,
+    torch.mul,
+    torch.Tensor.mul,
+    operator.mul,
+}
+
+def _get_attr_name(subgraph_idx, subgraph_candidate_idx):
+    return f"{SHADOW_NODE_NAME_PREFIX}_{subgraph_idx}_{subgraph_candidate_idx}"
+
+def _get_attr_wrapper_name(subgraph_idx, subgraph_candidate_idx):
+    return f"{SHADOW_WRAPPER_NODE_NAME_PREFIX}_{subgraph_idx}_{subgraph_candidate_idx}"
+
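+# Illustrative examples of the attribute naming scheme produced by the two
+# helpers above (values derived directly from the f-strings they return):
+#
+#   _get_attr_name(0, 1)          -> 'shadow_0_1'
+#   _get_attr_wrapper_name(0, 1)  -> 'shadow_wrapper_0_1'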
+
+class OutputProp:
+    """
+    Output propagation (modeled from shape propagation).
+
+    Given a GraphModule and example inputs, records the output flowing
+    through each node on `node.traced_result`.
+
+    Code based on the example from
+    https://pytorch.org/docs/stable/fx.html#the-interpreter-pattern
+    """
+    def __init__(self, mod):
+        self.mod = mod
+        self.graph = mod.graph
+        self.modules = dict(self.mod.named_modules())
+
+    def propagate(self, *args):
+        args_iter = iter(args)
+        env : Dict[str, Node] = {}
+
+        def load_arg(a):
+            return torch.fx.graph.map_arg(a, lambda n: env[n.name])
+
+        def fetch_attr(target : str):
+            target_atoms = target.split('.')
+            attr_itr = self.mod
+            for i, atom in enumerate(target_atoms):
+                if not hasattr(attr_itr, atom):
+                    raise RuntimeError(f"Node referenced nonexistent target {'.'.join(target_atoms[:i])}")
+                attr_itr = getattr(attr_itr, atom)
+            return attr_itr
+
+        for node in self.graph.nodes:
+            if node.op == 'placeholder':
+                result = next(args_iter)
+            elif node.op == 'get_attr':
+                result = fetch_attr(node.target)
+            elif node.op == 'call_function':
+                result = node.target(*load_arg(node.args), **load_arg(node.kwargs))
+            elif node.op == 'call_method':
+                self_obj, *args = load_arg(node.args)
+                kwargs = load_arg(node.kwargs)
+                result = getattr(self_obj, node.target)(*args, **kwargs)
+            elif node.op == 'call_module':
+                result = self.modules[node.target](*load_arg(node.args), **load_arg(node.kwargs))
+
+            if isinstance(result, torch.Tensor):  # type: ignore[possibly-undefined]
+                node.traced_result = result
+
+            env[node.name] = result
+
+        return None
+
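+# Illustrative usage sketch of OutputProp (hedged example; `my_model` and the
+# input shape are hypothetical placeholders):
+#
+#   gm = torch.fx.symbolic_trace(my_model)
+#   OutputProp(gm).propagate(torch.randn(1, 3, 224, 224))
+#   for node in gm.graph.nodes:
+#       if hasattr(node, 'traced_result'):
+#           print(node.name, node.traced_result.shape)
+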
+def _get_dedup_subgraphs(
+    matches: Dict[str, _MatchResult]
+) -> Dict[str, List[Node]]:
+    # the original matches variable is unique by node, make it unique by subgraph
+    # instead
+    seen_nodes = set()
+    subgraphs_dedup = {}
+
+    # Dict items are not reversible before Python 3.8, so we build the
+    # reversed list manually to stay compatible with older Python versions
+    # TODO(future PR): try reversed(list(matches.items()))
+    matches_items_reversed: List[Tuple[str, _MatchResult]] = []
+    for name, cur_match in matches.items():
+        matches_items_reversed.insert(0, (name, cur_match))
+
+    # Note: the order is important.  `matches` currently provides the matches
+    # in reverse order.  We would like to process the matches in non-reverse
+    # order, so that we can create an intuitive naming scheme, such as
+    # naming the first op's submodules `shadow_0_0` through `shadow_0_(n-1)`
+    for name, cur_match in matches_items_reversed:  # type: ignore[call-overload]
+        was_seen = False
+        for node_or_tuple in cur_match[1]:
+
+            # cur_match[1] has an unusual type. Its annotation says `List[Node]`,
+            # but in practice it is not. Furthermore, the contents of this field
+            # can differ between the match results of multiple nodes of the same pattern.
+            #
+            # For example, for conv -> bn -> relu, we see
+            # match_results = {
+            #   'conv': (relu, [(bn, conv), relu], ...),
+            #   'bn': (relu, [(bn, conv), relu], ...),
+            #   'relu': (relu, [(bn, conv), relu], ...),
+            # }
+            #
+            # Ideally we should clean up the `find_matches` function to make
+            # this more intuitive. For the purposes of this prototype, we hack
+            # around it.
+
+            if isinstance(node_or_tuple, Node):
+                if node_or_tuple in seen_nodes:
+                    was_seen = True
+                seen_nodes.add(node_or_tuple)
+
+            else:
+                assert isinstance(node_or_tuple, tuple)
+                for node in node_or_tuple:
+                    assert isinstance(node, Node)
+                    if node in seen_nodes:
+                        was_seen = True
+                    seen_nodes.add(node)
+
+        if was_seen:
+            continue
+
+        # Start with the unusual type, convert it to [op_0, ..., op_n]
+        list_of_nodes = []
+
+        if len(cur_match[1]) == 1:
+            list_of_nodes = cur_match[1]
+        else:
+            assert len(cur_match[1]) == 2
+            # either (a, b), ((a, b), c), or (c, (a, b));
+            # we cannot make any assumptions about the order, since it is not
+            # clear how the _find_matches function populates this
+            # TODO(future PR): make this code less confusing, see discussion
+            # in https://github.com/pytorch/pytorch/pull/80521/files#r975918836
+
+            def _order_nodes(node_a, node_b, node_c) -> List[Node]:
+                nodes = [node_a, node_b, node_c]
+                first_node = None
+                mid_node = None
+                last_node = None
+                for n in nodes:
+                    prev_n = n.args[0]
+                    next_n = next(iter(n.users))
+                    if prev_n not in nodes:
+                        first_node = n
+                    elif next_n not in nodes:
+                        last_node = n
+                    else:
+                        mid_node = n
+                assert first_node is not None and mid_node is not None and \
+                    last_node is not None
+                assert mid_node.args[0] is first_node
+                assert last_node.args[0] is mid_node
+                return [last_node, mid_node, first_node]
+
+            if isinstance(cur_match[1][0], Node) and isinstance(cur_match[1][1], Node):
+                # (a, b)
+                list_of_nodes = cur_match[1]
+            elif isinstance(cur_match[1][0], tuple):
+                # ((a, b), c)
+                node_a, node_b = cur_match[1][0]
+                node_c = cur_match[1][1]
+                list_of_nodes = _order_nodes(node_a, node_b, node_c)
+            elif isinstance(cur_match[1][1], tuple):
+                # (a, (b, c))
+                node_a, node_b = cur_match[1][1]
+                node_c = cur_match[1][0]
+                list_of_nodes = _order_nodes(node_a, node_b, node_c)
+
+        # [node_n, ..., node_0], note that the order is reversed
+        # to make it chronological for simple subgraphs
+        list_of_nodes.reverse()
+        subgraphs_dedup[name] = list_of_nodes
+
+    return subgraphs_dedup
+
+def _get_logger_for_subgraph(
+    model: GraphModule,
+    first_node: Node,
+    last_node: Node,
+    subgraph_idx: int,
+    subgraph_candidate_idx: int,
+    qconfig_str: str,
+    logger_cls: Callable,
+    fqn: Optional[str],
+) -> torch.nn.Module:
+    """
+    Given a model and a linear subgraph starting from `first_node` and
+    ending with `last_node`, creates a logger for the end of this
+    subgraph.
+    """
+    if fqn is None:
+        fqn = ''
+    logger_mod_orig = logger_cls(
+        first_node.name,  # ref_node_name
+        last_node.name,  # prev_node_name
+        f'subgraph_{subgraph_idx}_{subgraph_candidate_idx}',  # model_name
+        'model',  # ref_name
+        get_target_type_str(last_node, model),  # prev_node_target_type
+        get_target_type_str(first_node, model),  # ref_node_target_type
+        NSSingleResultValuesType.NODE_OUTPUT.value,  # results_type
+        0,  # index_within_arg
+        0,  # index_of_arg
+        fqn,  # fqn
+        qconfig_str,
+    )
+    # Usually we expect the user to add loggers, then calibrate, then convert,
+    # and then populate loggers.  This is why the loggers start disabled.
+    # TODO(future PR): reconsider the design to make this more intuitive.
+    logger_mod_orig.enabled = False
+    return logger_mod_orig
+
+def create_submodule_from_subgraph(
+    model: torch.nn.Module,
+    first_node: Node,
+    last_node: Node,
+) -> GraphModule:
+    """
+    Input: a model, and a linear subgraph within the model from first_node to
+      last_node.
+
+    Output: a new submodule containing a copy of the subgraph, with the inputs
+      to the first node becoming the inputs to the submodule, and all other
+      nodes in the subgraph being copied.
+
+    Example inputs:
+
+    `model`: a module with graph
+
+      x0 -> op1 -> x1 -> op2 -> x2
+             |
+            arg1
+
+    `first_node`: op1
+    `last_node`: op2
+
+    Example output: a new module with graph
+
+      input1 -> op1_copy -> x1 -> op2_copy -> output1
+                   |
+                  arg1
+    """
+
+    #
+    # create a blank GraphModule with an empty graph
+    #
+
+    class M(torch.nn.Module):
+        def forward(self, x):
+            pass
+
+    m = M()
+    gm = torch.fx.symbolic_trace(m)
+    g = gm.graph
+    for node in reversed(gm.graph.nodes):
+        g.erase_node(node)
+
+    #
+    # modify the graph to have a copy of our subgraph
+    #
+
+    cur_node_orig = first_node
+    cur_args_orig = cur_node_orig.args
+    cur_kwargs_orig = cur_node_orig.kwargs
+
+    cur_name_idx = 0
+
+    iteration_limit = 100
+    cur_iteration = 0
+
+    while True:
+        if cur_node_orig is first_node:
+            # we are at the first node, we need to set up graph inputs
+            # TODO(future): some graphs could have placeholders which are unrelated
+            # to the first node, need to handle this
+            cur_args_copy = []
+            cur_kwargs_copy = {}
+            seen_names: Set[str] = set()
+            old_name_to_new_node: Dict[str, Node] = {}
+
+            def _add_placeholder(
+                g: Graph, node: Node, seen_names, old_name_to_new_node
+            ):
+                # note: for graphs starting with patterns such as `y = x + x`, we
+                # need to ensure we do not add multiple placeholders with the
+                # same name
+                counter = 0
+                while node.name + '_' + str(counter) in seen_names:
+                    counter += 1
+                cur_name = node.name + '_' + str(counter)
+                seen_names.add(cur_name)
+                placeholder = g.placeholder(cur_name)
+                old_name_to_new_node[node.name] = placeholder
+                return placeholder
+
+            for arg in cur_node_orig.args:
+                if isinstance(arg, Node):
+                    p = _add_placeholder(
+                        g, arg, seen_names, old_name_to_new_node)
+                    cur_args_copy.append(p)
+                elif isinstance(arg, (list, tuple)):
+                    new_arg = []
+                    for inner_arg in arg:
+                        if isinstance(inner_arg, Node):
+                            new_arg.append(_add_placeholder(
+                                g, inner_arg, seen_names, old_name_to_new_node))
+                        else:
+                            new_arg.append(inner_arg)
+                    cur_args_copy.append(new_arg)
+                else:
+                    cur_args_copy.append(arg)
+
+            # TODO(future PR): handle non-normalized kwargs
+            for kwarg_name, kwarg in cur_node_orig.kwargs.items():
+                if isinstance(kwarg, Node):
+                    cur_kwargs_copy[kwarg_name] = _add_placeholder(
+                        g, kwarg, seen_names, old_name_to_new_node)
+                elif isinstance(kwarg, (list, tuple)):
+                    new_kwarg = []
+                    for inner_kwarg in kwarg:
+                        p = _add_placeholder(
+                            g, inner_kwarg, seen_names, old_name_to_new_node)
+                        new_kwarg.append(p)
+                    cur_kwargs_copy[kwarg_name] = new_kwarg
+                else:
+                    cur_kwargs_copy[kwarg_name] = kwarg
+
+            cur_args_copy = tuple(cur_args_copy)  # type: ignore[assignment]
+        else:
+            # we are not at first node, first arg is from the previous node,
+            # and all other args are copied
+
+            # the current implementation is simplistic and cannot handle
+            # ops with two or more arguments that need to be passed from
+            # the previous op, so we assert that such ops are not present
+            assert cur_node_orig.target not in BINARY_FUNCTIONS
+
+            # at this point in the code, cur_node_copy is pointing to the copy
+            # of the previous node
+            # TODO(future PR): this is not handling complicated graphs correctly, need to
+            # look at actual relationships instead of assuming sequential graph
+            # TODO(future PR): this is ignoring kwargs, will need to support kwargs
+            # for any fusion pattern which has them for a node that is not the
+            # first node.
+            cur_args_copy = [cur_node_copy]  # type: ignore[has-type, possibly-undefined]  # noqa: F821
+
+            if len(cur_node_orig.args) > 1:
+                for arg in cur_node_orig.args[1:]:
+                    if isinstance(arg, torch.nn.Parameter):
+                        new_arg = arg.clone().detach()  # type: ignore[assignment]
+                        mod_name = f"mod_{cur_name_idx}"
+                        cur_name_idx += 1
+                        setattr(gm, mod_name, new_arg)
+                        # placeholders are created on the Graph, not the GraphModule
+                        new_arg_placeholder = g.placeholder(mod_name)
+                        cur_args_copy.append(new_arg_placeholder)
+                    elif isinstance(arg, (float, int, torch.dtype)):
+                        cur_args_copy.append(arg)
+                    else:
+                        raise AssertionError(f'arg of type {type(arg)} not handled yet')
+            cur_args_copy = tuple(cur_args_copy)  # type: ignore[assignment]
+
+        # copy the node
+        if cur_node_orig.op == 'call_module':
+            orig_mod = getattr_from_fqn(model, cur_node_orig.target)  # type: ignore[arg-type]
+            orig_mod_copy = copy.deepcopy(orig_mod)
+            mod_name = f"mod_{cur_name_idx}"
+            setattr(gm, mod_name, orig_mod_copy)
+            cur_name_idx += 1
+            cur_node_copy = g.call_module(mod_name, cur_args_copy, cur_kwargs_copy)  # type: ignore[possibly-undefined]
+
+        elif cur_node_orig.op == 'call_function':
+            cur_node_copy = g.call_function(
+                cur_node_orig.target, cur_args_copy, cur_kwargs_copy)  # type: ignore[possibly-undefined]
+
+        elif cur_node_orig.op == 'call_method':
+            cur_node_copy = g.call_method(
+                cur_node_orig.target, cur_args_copy, cur_kwargs_copy)  # type: ignore[possibly-undefined]
+
+        else:
+            raise AssertionError(f'{cur_node_orig.op} not supported yet')
+
+        if cur_node_orig is last_node:
+            break
+
+        # go to next node
+        assert len(cur_node_orig.users.keys()) == 1, \
+            f'{cur_node_orig} has more than 1 user, not supported yet'
+        cur_node_orig = next(iter(cur_node_orig.users.keys()))
+        cur_args_orig = cur_node_orig.args
+        cur_kwargs_orig = cur_node_orig.kwargs
+
+        cur_iteration += 1
+        if cur_iteration > iteration_limit:
+            raise AssertionError('iteration limit exceeded')
+
+    # set up outputs
+    g.output(cur_node_copy)
+
+    gm.recompile()
+    return gm
+
+def create_one_transformed_and_logged_copy_of_subgraph(
+    mt: GraphModule,
+    subgraph_idx: int,
+    subgraph_candidate_idx: int,
+    first_node: Node,
+    last_node: Node,
+    fqn: Optional[str],
+    list_of_node_name_to_qconfig: List[Dict[str, QConfigAny]],
+    example_inputs: Any,
+    last_added_shadow_node_list: List[Optional[Node]],
+    custom_prepare_fn: Optional[Callable] = None,
+    custom_prepare_kwargs: Optional[Dict[str, Any]] = None,
+) -> None:
+    """
+    Given a subgraph in `mt` and a subgraph candidate idx, inserts the
+    subgraph candidate copy and instruments it with loggers.
+
+    If subgraph_candidate_idx is 0, this is the baseline fp32 subgraph and we just
+    add a logger to the end.
+
+    If subgraph_candidate_idx is not 0, we create a copy of the subgraph and
+    prepare it with `prepare_fx`.
+    """
+
+    # TODO(future PR): move logger classes to utils to remove circular dependency
+    from torch.ao.ns._numeric_suite_fx import OutputLogger, OutputComparisonLogger
+
+    if subgraph_candidate_idx == 0:
+        # idx = 0 is the floating point (original) version of the subgraph
+        # We keep the subgraph as is, and add a logger at the end
+
+        qconfig_str = ''
+        logger_mod_orig = _get_logger_for_subgraph(
+            mt, first_node, last_node, subgraph_idx, subgraph_candidate_idx,
+            qconfig_str, OutputLogger, fqn)
+
+        attr_name = _get_attr_name(subgraph_idx, subgraph_candidate_idx)
+        assert not hasattr(mt, attr_name)
+        setattr(mt, attr_name, logger_mod_orig)
+        with mt.graph.inserting_after(last_node):
+            new_node = mt.graph.call_module(attr_name, args=(last_node,), kwargs={})
+            last_added_shadow_node_list[0] = new_node
+
+    else:
+        # idx > 0 means we have a candidate qconfig to try, so we need
+        # to make a copy of the subgraph, feed it with the right inputs,
+        # and add a logger at the end
+
+        # get the qconfig
+        # subtract one because the first candidate is the floating point
+        # version of the subgraph
+        node_name_to_qconfig = \
+            list_of_node_name_to_qconfig[subgraph_candidate_idx - 1]
+        qconfig = node_name_to_qconfig[first_node.name]
+
+        # if no quantization is requested, skip
+        # TODO(future PR): deduplicate equivalent qconfigs that come from
+        #   different qconfig mapping objects
+        if qconfig is None:
+            return
+
+        qconfig_mapping = QConfigMapping().set_global(qconfig)
+
+        # create a copy of the submodule, wrapped in a separate module
+        orig_mod_copy_wrapped = create_submodule_from_subgraph(
+            mt, first_node, last_node)
+
+        # add a call to prepare_fx on the wrapper module
+        if custom_prepare_fn is None:
+            orig_mod_copy_wrapped = torch.ao.quantization.quantize_fx.prepare_fx(
+                orig_mod_copy_wrapped, qconfig_mapping, example_inputs=example_inputs)
+        else:
+            if custom_prepare_kwargs is None:
+                custom_prepare_kwargs = {}
+            for kwarg_name in ["example_inputs", "prepare_custom_config", "qconfig_mapping"]:
+                assert kwarg_name not in custom_prepare_kwargs, f"cannot specify {kwarg_name} in custom_prepare_kwargs"
+            prepare_kwargs: Dict[str, Any] = {
+                "example_inputs": example_inputs,
+                "qconfig_mapping": qconfig_mapping
+            }
+            prepare_kwargs.update(custom_prepare_kwargs)
+            orig_mod_copy_wrapped = custom_prepare_fn(
+                orig_mod_copy_wrapped,
+                **prepare_kwargs)
+
+        # attach the wrapper to the model
+        attr_name = _get_attr_wrapper_name(subgraph_idx, subgraph_candidate_idx)
+        assert not hasattr(mt, attr_name)
+        setattr(mt, attr_name, orig_mod_copy_wrapped)
+
+        # add a call to the wrapper module from the parent graph
+        insert_after_node = last_added_shadow_node_list[0]
+        with mt.graph.inserting_after(insert_after_node):
+            # TODO(future PR): handle fusion patterns where non-first nodes
+            # need inputs
+
+            # pass in all node args and kwargs
+
+            new_args = []
+            for arg in first_node.args:
+                if isinstance(arg, Node):
+                    new_args.append(arg)
+                elif isinstance(arg, (list, tuple)) and len(arg) and isinstance(arg[0], Node):
+                    for inner_arg in arg:
+                        if isinstance(inner_arg, Node):
+                            new_args.append(inner_arg)
+
+            new_kwargs = {}
+            for name, old_kwarg in first_node.kwargs.items():
+                if isinstance(old_kwarg, Node):
+                    new_kwargs[name] = old_kwarg
+                elif isinstance(old_kwarg, (list, tuple)) and len(old_kwarg):
+                    # TODO(future PR): clarify why we are adding kwargs to args
+                    new_args.extend(old_kwarg)
+
+            new_args = tuple(new_args)  # type: ignore[assignment]
+
+            new_node = mt.graph.call_module(
+                attr_name, args=new_args, kwargs=new_kwargs)
+
+        # add a logger to parent graph to observe the shadow wrapper
+        logger_mod_orig = _get_logger_for_subgraph(
+            mt, first_node, last_node, subgraph_idx, subgraph_candidate_idx,
+            str(qconfig), OutputComparisonLogger, fqn)
+
+        attr_name = _get_attr_name(subgraph_idx, subgraph_candidate_idx)
+        assert not hasattr(mt, attr_name)
+        setattr(mt, attr_name, logger_mod_orig)
+        with mt.graph.inserting_after(new_node):
+            logger = mt.graph.call_module(attr_name, args=(new_node, last_node), kwargs={})
+            last_added_shadow_node_list[0] = logger
+
+    mt.recompile()
+
+def create_n_transformed_and_logged_copies_of_subgraph(
+    mt: GraphModule,
+    subgraph_idx: int,
+    match_name: str,
+    nodes_in_this_subgraph: List[Any],
+    qconfig_mappings: List[QConfigMapping],
+    list_of_node_name_to_qconfig: List[Dict[str, QConfigAny]],
+    custom_prepare_fn: Optional[Callable] = None,
+    custom_prepare_kwargs: Optional[Dict[str, Any]] = None,
+) -> None:
+    """
+    Given a model `mt` and a subgraph_idx, creates the needed copies
+    of the subgraph for all qconfigs, and instruments them with loggers.
+    """
+    # for now, assume that
+    # 1. the first node has one input
+    # 2. the last node has one output
+
+    # for now, ignore all subgraphs that contain non-nodes (tuples, etc)
+    # TODO(future PR): implement this
+    if any(
+        not isinstance(node, Node)
+        for node in nodes_in_this_subgraph
+    ):
+        return
+
+    first_node = nodes_in_this_subgraph[0]
+    last_node = nodes_in_this_subgraph[-1]
+    # We used output propagation to populate example values on each
+    # node. Use the example values from the previous node as the input
+    # to the current node.
+    prev_node = get_normalized_nth_input(first_node, mt, 0)
+    if isinstance(prev_node, list):
+        example_inputs = [x.traced_result for x in prev_node]
+    elif isinstance(prev_node, tuple):
+        example_inputs = tuple(x.traced_result for x in prev_node)  # type: ignore[assignment]
+    else:
+        # currently some customer models do not have a traced_result in
+        # every node, so we have to guard for this case since we cannot
+        # quantize without an example input
+        # TODO(future PR): add a test case for this once we have an easy
+        # repro, see https://github.com/pytorch/pytorch/pull/80521/files#r975940489
+        # for additional context
+        if hasattr(prev_node, 'traced_result'):
+            example_inputs = (prev_node.traced_result,)  # type: ignore[attr-defined, assignment]
+        else:
+            print(
+                'unable to get example input for node ' +
+                f'{first_node.format_node()}, skipping')
+            return
+
+    # If there are no quantization configs for this subgraph, skip adding
+    # loggers. This reduces memory usage for models where not all layers are
+    # quantized.
+    # TODO(future): consider making this configurable
+    found_at_least_one_qconfig = False
+    for subgraph_candidate_idx in range(len(qconfig_mappings) + 1):
+
+        if subgraph_candidate_idx == 0:
+            # fp32 baseline does not need a qconfig
+            continue
+
+        # a. we have N shadows, so len(qconfig_mappings) is N
+        # b. we will have the fp32 layer + N shadows, so overall number of
+        #    (original_op) + (*shadows) will be N+1
+        # c. since `subgraph_candidate_idx` represents (b), we need
+        #    to subtract 1 to query from (a)
+        node_name_to_qconfig = \
+            list_of_node_name_to_qconfig[subgraph_candidate_idx - 1]
+        qconfig = node_name_to_qconfig[first_node.name]
+        if qconfig is not None:
+            found_at_least_one_qconfig = True
+            break
+    if not found_at_least_one_qconfig:
+        print('unable to find at least one qconfig for node ' +
+              f'{first_node.format_node()}, skipping')
+        return
+
+    fqn = _maybe_get_fqn(first_node, mt)
+
+    # We want the results to contain the subgraphs in natural order,
+    # and the graph to also contain shadow wrappers and shadow loggers
+    # in natural order.
+    # If we just iterate in reverse, the graph will be in natural
+    # order but the eventual results will be in reverse order.
+    # So, we keep track of the last shadow logger we added and
+    # always insert after it.
+    last_added_shadow_node_list: List[Optional[Node]] = [None]
+    for subgraph_candidate_idx in range(len(qconfig_mappings) + 1):
+
+        create_one_transformed_and_logged_copy_of_subgraph(
+            mt, subgraph_idx, subgraph_candidate_idx, first_node,
+            last_node, fqn, list_of_node_name_to_qconfig,
+            example_inputs, last_added_shadow_node_list, custom_prepare_fn,
+            custom_prepare_kwargs)
+
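+# Illustrative sketch of the attributes that the function above attaches to
+# `mt` for subgraph_idx=0 with a single candidate qconfig (names follow
+# _get_attr_name / _get_attr_wrapper_name):
+#
+#   mt.shadow_0_0          # OutputLogger observing the original fp32 subgraph
+#   mt.shadow_wrapper_0_1  # prepared (quantization-ready) copy of the subgraph
+#   mt.shadow_0_1          # OutputComparisonLogger comparing candidate 1 to fp32
+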
+def create_add_loggers_graph(
+    model: GraphModule,
+    subgraphs_dedup: Dict[str, List[Node]],
+    qconfig_mapping: QConfigMapping,
+    node_name_to_qconfig: Dict[str, QConfigAny],
+) -> None:
+    r"""
+    Given a model, a partition of the model graph (currently a set of matched
+    subgraphs) and instructions on how to transform each subgraph
+    (currently, quantizing it according to qconfig_mapping), modifies
+    the model graph to create an alternate path through the original graph,
+    with each of the subgraphs quantized.  This is useful for comparing the
+    propagation error of a transformation such as quantization.
+
+    For example, given layers op0 and op1, there are four cases when handling op1:
+    1. op0 and op1 quantized
+    2. op0 and op1 unquantized
+    3. op0 quantized, op1 unquantized
+    4. op0 unquantized, op1 quantized
+
+    Example input, case 1:
+
+    .. code::
+
+      x0_0 -> op0_0 -> x1_0 -> log -----> op1_0 -> x2_0 -> log
+       \                        \          \                 \       # noqa: W605
+         ---> op0_1 -> x1_1 ----> clog    op1_1 -> x2_1 ----> clog
+
+    Example output, case 1:
+
+    .. code::
+
+      x0_0 -> op0_0 -> x1_0 -> log -----> op1_0 -> x2_0 -> log
+       \                        \                           \        # noqa: W605
+         ---> op0_1 -> x1_1 ----> clog -> op1_1 -> x2_1 ----> clog
+
+    """
+    # TODO(future PR): move logger classes to utils to remove circular dependency
+    from torch.ao.ns._numeric_suite_fx import OutputLogger, OutputComparisonLogger
+
+    def _get_subgraph_containing_node(node, subgraphs_dedup):
+        for subgraph in subgraphs_dedup.values():
+            if node in subgraph:
+                return subgraph
+        return None
+
+    # First, we need to create shadow branches, going from
+    #
+    #   x0 -> op0 -> x1 -> ...
+    #
+    #
+    # to
+    #
+    #   x0 -> op0_0 -> x1_0 -> log -> ...
+    #    \                     \
+    #      -> op0_1 -> x1_1 -> clog
+    #
+    # Later, the outputs of each shadow will be rerouted to calculate
+    # propagation error.
+
+    # Note: we cannot iterate over matched subgraphs because some nodes
+    # may not be matched. So, we iterate over nodes in the graph, and
+    # associate them to matched subgraphs if possible.
+
+    nodes_to_skip = set()
+    # for each subgraph, save a mapping from first node of subgraph
+    # to first and last node of the shadow of this subgraph
+    orig_first_node_to_shadow_in_node = {}
+    orig_first_node_to_shadow_out_node = {}
+    # need to record original list because we will mutate the graph as we go
+    orig_nodes = list(model.graph.nodes)  # type: ignore[union-attr, arg-type]
+    cur_subgraph_idx = 0
+    for n in orig_nodes:
+        if n.op in ('placeholder', 'get_attr', 'output') or n in nodes_to_skip:
+            continue
+
+        maybe_subgraph = _get_subgraph_containing_node(n, subgraphs_dedup)
+        insert_submodule_copy = False
+        if maybe_subgraph is not None:
+            first_node, last_node = maybe_subgraph[0], maybe_subgraph[-1]
+            for node_to_skip in maybe_subgraph:
+                nodes_to_skip.add(node_to_skip)
+            qconfig = node_name_to_qconfig[first_node.name]
+            if qconfig is not None:
+                insert_submodule_copy = True
+        else:
+            first_node, last_node = n, n
+
+        if insert_submodule_copy:
+            match_name = first_node.name
+            create_n_transformed_and_logged_copies_of_subgraph(
+                model, cur_subgraph_idx, match_name, maybe_subgraph,
+                [qconfig_mapping], [node_name_to_qconfig],
+                None, None  # type: ignore[arg-type]
+            )
+            # find the created shadow module and record it so we
+            # can find it easily in step 2
+            expected_shadow_target = f"shadow_wrapper_{cur_subgraph_idx}_1"
+            new_shadow_mod = None
+            for maybe_shadow_mod in model.graph.nodes:
+                if maybe_shadow_mod.op == 'call_module' and \
+                        maybe_shadow_mod.target == expected_shadow_target:
+                    new_shadow_mod = maybe_shadow_mod
+                    break
+            assert new_shadow_mod is not None
+            orig_first_node_to_shadow_in_node[first_node] = new_shadow_mod
+            orig_first_node_to_shadow_out_node[first_node] = new_shadow_mod
+
+        else:
+            # create a copy of the subgraph by only copying FX nodes
+            # but not copying any parameters, to minimize memory usage
+            subgraph_to_use = maybe_subgraph if maybe_subgraph is not None \
+                else [first_node]
+
+            # add a regular logger after last_node
+            qconfig_str = ''
+            subgraph_candidate_idx = 0
+            fqn = _maybe_get_fqn(first_node, model)
+            logger_mod_orig = _get_logger_for_subgraph(
+                model, first_node, last_node, cur_subgraph_idx, subgraph_candidate_idx,
+                qconfig_str, OutputLogger, fqn)
+            attr_name = _get_attr_name(cur_subgraph_idx, subgraph_candidate_idx)
+            assert not hasattr(model, attr_name)
+            setattr(model, attr_name, logger_mod_orig)
+            insertion_point = last_node
+            with model.graph.inserting_after(insertion_point):
+                logger = model.graph.call_module(
+                    attr_name, args=(last_node,), kwargs={})
+                insertion_point = logger
+
+            # create a copy of the subgraph
+            cur_node_orig = first_node
+            cur_node_copy = None
+            first_node_copy = None
+            while cur_node_orig in subgraph_to_use:
+                # TODO(future PR): make this support all possible args/kwargs
+                if cur_node_orig is first_node:
+                    new_args = cur_node_orig.args
+                    new_kwargs = cur_node_orig.kwargs
+                else:
+                    first_arg_for_copy = cur_node_copy
+                    new_args = tuple([first_arg_for_copy, *cur_node_orig.args[1:]])  # noqa: C409
+                    new_kwargs = cur_node_orig.kwargs
+                # make a copy of cur_node_orig
+                with model.graph.inserting_after(insertion_point):
+                    cur_node_copy = model.graph.create_node(
+                        cur_node_orig.op,
+                        cur_node_orig.target,
+                        new_args,
+                        new_kwargs,
+                        # cur_node_orig.name,  # TODO(future PR): set name explicitly
+                    )
+                    if first_node_copy is None:
+                        first_node_copy = cur_node_copy
+                # since now only linear subgraphs are supported, all nodes
+                # except the last one must have only one user
+                if cur_node_orig != last_node:
+                    assert len(cur_node_orig.users.keys()) == 1
+                cur_node_orig = next(iter(cur_node_orig.users.keys()))
+                assert not cur_node_orig.name.startswith(SHADOW_NODE_NAME_PREFIX)
+                insertion_point = cur_node_copy
+
+            # add a comparison logger after last_node's copy
+            subgraph_candidate_idx = 1
+            logger_mod_orig = _get_logger_for_subgraph(
+                model, first_node, last_node, cur_subgraph_idx, subgraph_candidate_idx,
+                qconfig_str, OutputComparisonLogger, fqn)
+            attr_name = _get_attr_name(cur_subgraph_idx, subgraph_candidate_idx)
+            assert not hasattr(model, attr_name)
+            setattr(model, attr_name, logger_mod_orig)
+            with model.graph.inserting_after(insertion_point):
+                logger = model.graph.call_module(
+                    attr_name, args=(cur_node_copy, last_node), kwargs={})
+
+            # save the final node so we can use it in step 2
+            orig_first_node_to_shadow_in_node[first_node] = first_node_copy
+            orig_first_node_to_shadow_out_node[first_node] = cur_node_copy
+
+        cur_subgraph_idx += 1
+
+    model.recompile()
+
+    # Now, we go from
+    #
+    #   x0 -> op0_0 -> x1_0 -> log -> x1 -> op1_0 -> ...
+    #    \                     \       \
+    #      -> op0_1 -> x1_1 -> clog      -> op1_1 -> ...
+    #
+    # to
+    #
+    #   x0 -> op0_0 -> x1_0 -> log --> x1_0 -> op1_0 -> ...
+    #    \                     \
+    #      -> op0_1 -> x1_1 -> clog -> x1_1 -> op1_1 -> ...
+    #
+    # sample values of key internal variables for the example above:
+    #
+    #   orig_first_node_to_shadow_in_node = {op0_0: op0_1, op1_0: op1_1}
+    #   orig_first_node_to_shadow_out_node = {op0_0: op0_1, op1_0: op1_1}
+    #
+    # note: for subgraphs with more than one node, in_node will be different
+    # compared to out_node
+
+
+    nodes_to_skip = set()
+    for n in orig_nodes:
+        if n.op in ('placeholder', 'get_attr', 'output') or n in nodes_to_skip:
+            continue
+
+        maybe_subgraph = _get_subgraph_containing_node(n, subgraphs_dedup)
+        if maybe_subgraph is not None:
+            first_node, last_node = maybe_subgraph[0], maybe_subgraph[-1]
+            for node_to_skip in maybe_subgraph:
+                nodes_to_skip.add(node_to_skip)
+        else:
+            first_node, last_node = n, n
+
+        def maybe_remap_node_to_shadow(node):
+            """
+            If unshadowed `node` has a shadow version, return that. If not,
+            return `node`.
+            """
+            if not isinstance(node, Node):
+                # handle scalars
+                return node
+
+            if node.op in ('placeholder', 'get_attr'):
+                return node
+
+            # Find the shadowed version of this arg from the previous
+            # subgraph. For this, we need to:
+            # 1. navigate to the first node of the previous subgraph
+            # 2. get the output of the shadow wrapper which has (1) as an input
+
+            # For now, assume the arg is in matched subgraphs. In the
+            # future we may have to handle the case where this is not true.
+            prev_subgraph = _get_subgraph_containing_node(
+                node, subgraphs_dedup)
+            if prev_subgraph is None:
+                prev_subgraph = [node]
+            prev_first_node = prev_subgraph[0]
+            prev_shadow_output = \
+                orig_first_node_to_shadow_out_node[prev_first_node]
+            return prev_shadow_output
+
+        cur_shadow_input = \
+            orig_first_node_to_shadow_in_node[first_node]
+        assert cur_shadow_input is not None
+        cur_shadow_input.args = tree_map(
+            maybe_remap_node_to_shadow, cur_shadow_input.args)
+        cur_shadow_input.kwargs = tree_map(
+            maybe_remap_node_to_shadow, cur_shadow_input.kwargs)
+
+        model.recompile()
+
+def _get_weight_info_from_shadow_wrapper(shadow_wrapper: torch.nn.Module):
+    # input: shadow wrapper module
+    # output if shadow wrapper module has a weighted op:
+    #   (quantize_fn, (quantize_fn_args))
+    # output if shadow wrapper module doesn't have a weighted op:
+    #   None
+
+    # For now, assume that the weight is the second input
+    # to the shadow module. If that changes, we can fix it later.
+    placeholders_seen = 0
+    for shadow_n in shadow_wrapper.graph.nodes:  # type: ignore[union-attr]
+        if shadow_n.op != 'placeholder':
+            continue
+
+        placeholders_seen += 1
+        if placeholders_seen != 2:
+            continue
+
+        # the subgraph looks like
+        #
+        #   _input_scale_1 = self._input_scale_1
+        #   _input_zero_point_1 = self._input_zero_point_1
+        #   quantize_per_channel = torch.quantize_per_channel(
+        #       w2_0, _input_scale_1, _input_zero_point_1,
+        #       0, torch.qint8)
+        #
+        #  we have `w2_0`, and are navigating this subgraph
+        #  to get `_input_scale_1` and `_input_zero_point_1`
+
+        assert len(shadow_n.users) == 1
+        quant_node = next(iter(shadow_n.users.keys()))
+        new_args: Any = None
+        if quant_node.target == torch.quantize_per_channel:
+            _weight, scale_node, zp_node, axis, dtype = quant_node.args
+            scale_val = getattr_from_fqn(
+                shadow_wrapper, scale_node.target)
+            zp_val = getattr_from_fqn(
+                shadow_wrapper, zp_node.target)
+            new_args = (scale_val, zp_val, axis, dtype)
+        else:
+            assert quant_node.target == torch.quantize_per_tensor
+            _weight, scale_node, zp_node, dtype = quant_node.args
+            scale_val = getattr_from_fqn(
+                shadow_wrapper, scale_node.target)
+            zp_val = getattr_from_fqn(
+                shadow_wrapper, zp_node.target)
+            new_args = (scale_val, zp_val, dtype)
+        return (quant_node.target, new_args)
+
+    return None
+
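+# Illustrative return value of the helper above for a per-channel quantized
+# weight (hedged example; the concrete tensors are hypothetical):
+#
+#   (torch.quantize_per_channel, (scale_tensor, zero_point_tensor, 0, torch.qint8))
+#
+# or None when the shadow wrapper does not contain a weighted op.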
+
+def extract_weight_comparison(m: GraphModule) -> NSResultsType:
+
+    # example graph:
+    #
+    #   w1 = self.w1
+    #   b1 = self.b1
+    #   linear = torch._C._nn.linear(x, w1, b1)
+    #   shadow_0_0 = self.shadow_0_0(linear)
+    #   shadow_wrapper_0_1 = self.shadow_wrapper_0_1(x, w1, b1)
+    #   shadow_0_1 = self.shadow_0_1(shadow_wrapper_0_1, linear)
+    #
+    # algorithm:
+    # 1. for each call_function node matching our allowlist:
+    # 2.   if corresponding shadow wrapper exists, extract the weight pair
+    #
+    # Note: this is not super robust, but that's ok because this is
+    # just for legacy customers who depend on the previous two-model version
+    # of this API. TBD if we need to make this robust.
+    # Note: modules are not supported, since existing customers only
+    # use functions.
+
+    # TODO(future PR): move this to config
+    weighted_ops = {
+        torch.nn.functional.linear,
+    }
+
+    results: NSResultsType = {
+        'model': {NSSingleResultValuesType.WEIGHT.value: {}}
+    }
+
+    for n in m.graph.nodes:  # type: ignore[union-attr]
+        if not (n.op == 'call_function' and n.target in weighted_ops):
+            continue
+
+        # Check if we have a corresponding shadow wrapper
+        # TODO(future PR, if needed): support kwargs
+        # TODO(future PR, if needed): support multiple shadow users
+        first_arg = n.args[0]
+        shadow_wrapper_node = None
+        for user in first_arg.users:
+            # TODO(before land): fix string match
+            if user.op == 'call_module' and \
+                    user.target.startswith('shadow_wrapper'):
+                shadow_wrapper_node = user
+                break
+
+        if shadow_wrapper_node is None:
+            continue
+
+        shadow_wrapper = getattr_from_fqn(
+            m, shadow_wrapper_node.target)  # type: ignore[arg-type]
+        weight_info = _get_weight_info_from_shadow_wrapper(
+            shadow_wrapper)
+        if weight_info is None:
+            continue
+
+        # get weight
+        w_node = n.args[1]
+        w_obj = getattr_from_fqn(m, w_node.target).detach()
+
+        # get a quantized version of weight
+        quant_fn, quant_fn_args_except_first = weight_info
+        new_args = (w_obj, *quant_fn_args_except_first)
+        w_obj_q = quant_fn(*new_args)
+
+        # add a comparison
+        ref_node_name = n.name
+        prev_node_name = n.name
+        ref_node_type = get_target_type_str(n, m)
+        prev_node_type = ref_node_type
+        fqn = None
+        if hasattr(m, '_node_name_to_scope'):
+            fqn = m._node_name_to_scope[n.name][0]  # type: ignore[index]
+        comparison = torch.ao.ns.fx.utils.compute_sqnr(w_obj, w_obj_q)
+        result_fp32 = {
+            'res_type': NSSingleResultValuesType.WEIGHT.value,
+            'values': [w_obj],
+            'prev_node_name': prev_node_name,
+            'prev_node_target_type': prev_node_type,
+            'ref_node_name': ref_node_name,
+            'ref_node_target_type': ref_node_type,
+            'index_within_arg': 0,
+            'index_of_arg': 0,
+            'fqn': fqn,
+            'qconfig_str': '',
+            'comparisons': [comparison],
+            'comparison_fn_name': 'sqnr',
+        }
+        result_q = {
+            'res_type': NSSingleResultValuesType.WEIGHT.value,
+            'values': [w_obj_q],
+            'prev_node_name': prev_node_name,
+            'prev_node_target_type': prev_node_type,
+            'ref_node_name': ref_node_name,
+            'ref_node_target_type': ref_node_type,
+            'index_within_arg': 0,
+            'index_of_arg': 0,
+            'fqn': fqn,
+            'qconfig_str': '',
+            'comparisons': [comparison],
+            'comparison_fn_name': 'sqnr',
+        }
+
+        # go from subgraph_n_1 to subgraph_n_0
+        _1, _2, node_idx, _3 = shadow_wrapper_node.target.split('_')
+        name_fp32 = f"subgraph_{node_idx}_0"
+        name_q = f"subgraph_{node_idx}_1"
+
+        results['model'][NSSingleResultValuesType.WEIGHT.value][name_fp32] = \
+            [result_fp32]
+        results['model'][NSSingleResultValuesType.WEIGHT.value][name_q] = \
+            [result_q]
+
+    return results
+
+# TODO(future PR): redesign this to make it easier to consume outputs
+def group_results_by_subgraph(results: NSResultsType) -> Any:
+    """
+    Groups the flat logger results by subgraph, to prepare them for comparison.
+
+    Input:
+
+    {
+      'model': {
+        'node_output': {
+          'subgraph_0_0': [
+            'values': [torch.tensor(...), ...], ...
+            'ref_node_name': ...,
+            'ref_node_target_type': ...,
+            'qconfig_str': ...,
+            'comparisons': [], ...
+            'comparison_fn_name': '',
+            'fqn': '...',
+          ],
+          'subgraph_0_1': [
+            'values': [torch.tensor(...), ...], ...
+            'ref_node_name': ...,
+            'ref_node_target_type': ...,
+            'qconfig_str': ...,
+            'comparisons': [torch.tensor(...), ...], ...
+            'comparison_fn_name': '...',
+            'fqn': '...',
+          ],
+          ...
+        },
+      },
+    }
+
+    Output:
+    {
+      'subgraph_0': {
+        '0': {
+          'ref_node_name': '...',
+          'ref_node_target_type': ...,
+          'values': [torch.tensor(...), ...],
+          'qconfig_str': None,
+          'comparisons': [torch.tensor(...), ...], ...
+          'comparison_fn_name': '...',
+          'fqn': '...',
+        },
+        '1': {
+          'ref_node_name': '...',
+          'ref_node_target_type': ...,
+          'values': [torch.tensor(...), ...],
+          'qconfig_str': '...',
+          'comparisons': [torch.tensor(...), ...], ...
+          'comparison_fn_name': '...',
+          'fqn': '...',
+        },
+      },
+    }
+
+    """
+    subgraph_name_to_subgraph_results: Any = collections.defaultdict(dict)
+
+    # node_output or weight
+    key_to_use = next(iter(results['model'].keys()))
+
+    for subgraph_name_with_idx, subgraph_candidate_results in \
+            results['model'][key_to_use].items():
+
+        # convert from `subgraph_m_n` to `subgraph_m` and `n`
+        subgraph_str, subgraph_idx, subgraph_candidate_idx = \
+            subgraph_name_with_idx.split('_')
+        subgraph_name = f'{subgraph_str}_{subgraph_idx}'
+
+        subgraph_results = {
+            'ref_node_name': subgraph_candidate_results[0]['ref_node_name'],
+            'ref_node_target_type': subgraph_candidate_results[0]['ref_node_target_type'],
+            'fqn': subgraph_candidate_results[0]['fqn'],
+            'values': subgraph_candidate_results[0]['values'],
+            'qconfig_str': subgraph_candidate_results[0]['qconfig_str'],
+            'comparisons': subgraph_candidate_results[0]['comparisons'],
+            'comparison_fn_name': subgraph_candidate_results[0]['comparison_fn_name'],
+        }
+
+        subgraph_name_to_subgraph_results[subgraph_name][subgraph_candidate_idx] = \
+            subgraph_results
+
+    return dict(subgraph_name_to_subgraph_results)
+
+# TODO(future PR): redesign this to make it easier to consume outputs
+def create_results_comparison(
+    results_grouped,
+) -> Any:
+    """
+    Input:
+
+    {
+      'subgraph_0': {
+        '0': {
+          'ref_node_name': '...',
+          'ref_node_target_type': ...,
+          'values': [torch.tensor(...), ...],
+          'qconfig_str': '',
+          'comparisons': [],
+          'comparison_fn_name': '',
+          'fqn': '...',
+        },
+        '1': {
+          'ref_node_name': '...',
+          'ref_node_target_type': ...,
+          'values': [torch.tensor(...), ...],
+          'qconfig_str': '...',
+          'comparisons': [torch.tensor(...), ...],
+          'comparison_fn_name': 'sqnr',
+          'fqn': '...',
+        },
+      },
+    }
+
+    Output:
+    {
+      'subgraph_0': {
+        'ref_node_name': '...',
+        'ref_node_target_type': '...',
+        'fqn': '...',
+        'candidates': {
+          '1': {
+            'qconfig_str': ...,
+            'comparison_fn_name': 'sqnr',
+            'cmp_raw': [..., ...],
+            'cmp_mean': ...,
+          },
+          ...,
+        },
+      },
+    }
+    """
+
+    results_comparison = {}
+
+    for subgraph_name, subgraph_results in results_grouped.items():
+
+        candidates = {}
+        for subgraph_inner_name, subgraph_inner_result in subgraph_results.items():
+            # skip comparing baseline to baseline
+            if subgraph_inner_name == '0':
+                continue
+
+            # we expect the comparisons to be precalculated from
+            # calibration, so we just fetch them here
+            cmp_raw = subgraph_inner_result['comparisons']
+            cmp_raw_tensor = torch.stack(cmp_raw)
+
+            candidates[subgraph_inner_name] = {
+                'qconfig_str': subgraph_inner_result['qconfig_str'],
+                'comparison_fn_name': subgraph_inner_result['comparison_fn_name'],
+                'cmp_raw': cmp_raw_tensor,
+                'cmp_mean': torch.mean(cmp_raw_tensor),
+            }
+
+        results_comparison[subgraph_name] = {
+            'ref_node_name': subgraph_results['0']['ref_node_name'],
+            'ref_node_target_type': subgraph_results['0']['ref_node_target_type'],
+            'fqn': subgraph_results['0']['fqn'],
+            'candidates': candidates,
+        }
+
+    return results_comparison
+
+# TODO(future PR): redesign this to make it easier to consume outputs
+def print_n_shadows_summary(
+    results_comparison,
+) -> None:
+    """
+    Input:
+
+    {
+      'subgraph_0': {
+        'ref_node_name': 'linear1',
+        'ref_node_target_type': '...',
+        'fqn': '...',
+        'candidates': {
+          '1': {
+            'qconfig_str': ...,
+            'comparison_fn_name': ...,
+            'cmp_raw': [45.0, 55.0],
+            'cmp_mean': 50.0,
+          },
+          ...,
+        },
+      },
+    }
+
+    Prints:
+
+    node_name | node_type | fqn | 0    | 1    | ...
+    linear1   | ...       | ... | 45.0 | 50.0 | ...
+    """
+
+    try:
+        from tabulate import tabulate
+    except ImportError:
+        print("`print_n_shadows_summary` relies on the library `tabulate`, "
+              "which could not be found on this machine. Run `pip "
+              "install tabulate` to install the library.")
+        return
+
+    results = []
+    for subgraph_data in results_comparison.values():
+        mean_all_candidates = [
+            candidate['cmp_mean']
+            for candidate_name, candidate in subgraph_data['candidates'].items()
+        ]
+
+        data_row = [
+            subgraph_data['ref_node_name'],
+            subgraph_data['ref_node_target_type'],
+            subgraph_data['fqn'],
+            *mean_all_candidates,
+        ]
+        results.append(data_row)
+
+    # each data_row contains three metadata columns (node_name, node_type, fqn)
+    # followed by the per-candidate means, so the number of candidate columns
+    # is len(data_row) - 3
+    max_candidate_idx_len = -1
+    for data_row in results:
+        max_candidate_idx_len = max(max_candidate_idx_len, len(data_row) - 3)
+    candidate_idx_headers = [str(x) for x in range(max_candidate_idx_len)]
+
+    headers = ['node_name', 'node_type', 'fqn', *candidate_idx_headers]
+    print(tabulate(results, headers=headers))
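+
+# Illustrative end-to-end sketch of how the result helpers above are chained
+# together (hedged example; `extracted_results` stands for an NSResultsType
+# dict produced by the loggers):
+#
+#   grouped = group_results_by_subgraph(extracted_results)
+#   comparison = create_results_comparison(grouped)
+#   print_n_shadows_summary(comparison)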
diff --git a/MLPY/Lib/site-packages/torch/ao/ns/fx/ns_types.py b/MLPY/Lib/site-packages/torch/ao/ns/fx/ns_types.py
new file mode 100644
index 0000000000000000000000000000000000000000..564e5d2041ee7c353311907b84bea27d4643d75b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/ns/fx/ns_types.py
@@ -0,0 +1,64 @@
+import enum
+from typing import NamedTuple
+
+from torch.fx.graph import Node
+
+from typing import Dict, Any, List, Union, Callable
+
+class NSSingleResultValuesType(str, enum.Enum):
+    WEIGHT = 'weight'
+    NODE_OUTPUT = 'node_output'
+    NODE_INPUT = 'node_input'
+
+class NSSubgraph(NamedTuple):
+    start_node: Node
+    end_node: Node
+    base_op_node: Node
+
+# TODO(future PR): see if we can use typing_extensions's TypedDict instead
+# to properly type the various keys
+# {
+#   # one of NSSingleResultValuesType
+#   'type': 'weight',
+#   # the values of type specified above
+#   'values': [torch.tensor(...), ...],
+#   # name of the node directly before the logger
+#   'prev_node_name': 'linear1',
+#   # type of the underlying function or module
+#   'prev_node_target_type': torch.nn.functional.linear  # or torch.nn.Linear, etc
+#   # name of the node responsible for adding this logger
+#   # Note: this may differ from prev_node_name if we are logging inputs
+#   'ref_node_name': 'linear1',
+#   # index of this node within the arg of the input/output node
+#   # for example, in cat([x1, x2, x3], dim=0), x2 would have index_within_arg == 1
+#   'index_within_arg': 0,
+#   # index of this node within the args of the input/output node
+#   # for example, in add(x1, x2), x2 would have index_of_arg == 1
+#   'index_of_arg': 0,
+#   # precomputed comparisons of logger values to reference values
+#   'comparisons': [torch.tensor(...), ...]
+#   # name of function used for precomputed comparisons
+#   'comparison_fn_name': 'sqnr',
+#   # string representation of qconfig responsible for creating this logger
+#   'qconfig_str': 'QConfig(...)',
+# }
+NSSingleResultType = Dict[str, Any]
+
+# {
+#   'layer_name_1': {  # subgraph name
+#     'node_output': {  # results type (node_output, node_input, weight)
+#       'model_name_a':  # model name
+#          [NSSingleResultType, ...],  # results, ordered by index_within_arg
+#       'model_name_b':
+#          [NSSingleResultType, ...],
+#     },
+#   },
+# }
+#
+NSResultsType = Dict[str, Dict[str, Dict[str, List[NSSingleResultType]]]]
+
+# Defines the underlying target type of a node, for example:
+# `F.conv1d` for a `call_function` conv node
+# `nn.Conv1d` for a `call_module` node calling the forward of a `nn.Conv1d` module
+# `'sigmoid'` for a `call_method` node calling `x.sigmoid()`
+NSNodeTargetType = Union[Callable, str]
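+
+# Example (illustrative sketch; the layer and model names are hypothetical):
+# an NSResultsType holding one logged output for one model could look like
+#
+#     example_results: NSResultsType = {
+#         'linear1': {
+#             'node_output': {
+#                 'model_a': [{
+#                     'type': 'node_output',
+#                     'values': [torch.tensor(...)],
+#                     'prev_node_name': 'linear1',
+#                     'ref_node_name': 'linear1',
+#                     'index_within_arg': 0,
+#                     'index_of_arg': 0,
+#                 }],
+#             },
+#         },
+#     }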
diff --git a/MLPY/Lib/site-packages/torch/ao/ns/fx/pattern_utils.py b/MLPY/Lib/site-packages/torch/ao/ns/fx/pattern_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..06b0e806d0536c06210821b82001fcad0e627667
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/ns/fx/pattern_utils.py
@@ -0,0 +1,200 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+toq = torch.ops.quantized
+
+from torch.fx import GraphModule
+from torch.fx.graph import Node
+
+from torch.ao.quantization.backend_config import get_native_backend_config
+from torch.ao.quantization.fx.quantize_handler import _get_pattern_to_quantize_handlers
+from torch.ao.quantization.utils import getattr_from_fqn
+from .ns_types import NSNodeTargetType
+from torch.ao.quantization import (
+    ObserverBase,
+    FakeQuantizeBase,
+)
+
+from typing import Dict, Tuple, Set, Callable, Any, Union, List
+
+
+def get_type_a_related_to_b(
+    base_name_to_sets_of_related_ops: Dict[str, Set[NSNodeTargetType]],
+) -> Set[Tuple[NSNodeTargetType, NSNodeTargetType]]:
+    # TODO(future PR): allow customizations
+    # TODO(future PR): reuse existing quantization mappings
+    # TODO(future PR): add the rest of modules and ops here
+    type_a_related_to_b: Set[Tuple[NSNodeTargetType, NSNodeTargetType]] = set()
+
+    for s in base_name_to_sets_of_related_ops.values():
+        s_list = list(s)
+        # add every bidirectional pair
+        for idx_0 in range(0, len(s_list)):
+            for idx_1 in range(idx_0, len(s_list)):
+                type_a_related_to_b.add((s_list[idx_0], s_list[idx_1]))
+                type_a_related_to_b.add((s_list[idx_1], s_list[idx_0]))
+
+    return type_a_related_to_b
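+
+# Example (illustrative sketch with a hypothetical input): for
+#     {'linear': {nn.Linear, F.linear}}
+# the returned set contains every ordered pair within the group, i.e.
+#     {(nn.Linear, nn.Linear), (nn.Linear, F.linear),
+#      (F.linear, nn.Linear), (F.linear, F.linear)}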
+
+
+NSFusionElType = Union[
+    Callable,  # call_function or call_module type, example: F.linear or nn.Conv2d
+    str,  # call_method name, example: "dequantize"
+    Tuple[str, Any],  # call_method name and first argument, example: ("to", torch.float16)
+]
+NSFusionType = Union[
+    Tuple[NSFusionElType, NSFusionElType],
+    Tuple[NSFusionElType, NSFusionElType, NSFusionElType, NSFusionElType],
+]
+
+def get_reversed_fusions() -> List[Tuple[NSFusionType, int]]:
+    """
+    Set of potential fusions, in reverse order.  The order is reversed
+    to match how fusion patterns are defined in quantization code.
+
+    Fusion format:
+    ((fusion_op_0, fusion_op_1), base_op_idx)
+
+    Where base_op_idx is the idx of the op we should use to match other related
+    ops. Note: base_op_idx is specified in non-reverse order, i.e. a base_op_idx
+    of 0 represents the first op in regular (non-reverse) order, 1 represents the
+    second op, etc.
+    """
+    results: List[Tuple[NSFusionType, int]] = []
+
+    # Possible syntaxes:
+    # * single op: torch.nn.Conv2d
+    # * multiple ops: (torch.nn.ReLU, torch.nn.Conv2d)
+    # For fusions, we only care about patterns composed of multiple ops.
+    # TODO(future PR): allow customizations from default patterns.
+    all_quant_patterns = _get_pattern_to_quantize_handlers(get_native_backend_config())
+
+    default_base_op_idx = 0
+    for quant_pattern in all_quant_patterns.keys():
+        # TODO: this is a temporary hack to flatten the patterns from quantization so
+        # that it works with the ns matcher function, maybe we should use `_is_match`
+        # in torch.ao.quantization.fx.match_utils to match the patterns
+        if isinstance(quant_pattern, tuple) and len(quant_pattern) == 2 and \
+           isinstance(quant_pattern[1], tuple) and len(quant_pattern[1]) == 2:
+            # flatten the pattern with form (nn.ReLU, (nn.BatchNorm2d, nn.Conv2d))
+            quant_pattern = (quant_pattern[0], quant_pattern[1][0], quant_pattern[1][1])
+
+        # Only patterns of multiple ops are fusions; ignore
+        # patterns which contain a single op (they get matched
+        # without caring about fusions).
+        if isinstance(quant_pattern, tuple):
+            results.append((quant_pattern, default_base_op_idx))  # type: ignore[arg-type]
+
+        # For each pattern, add additional patterns with observers and
+        # fake quants at the end.
+        # TODO(future PR): if needed, implement matching for a node
+        #   having multiple output observers.
+        for cls in (ObserverBase, FakeQuantizeBase):
+            if isinstance(quant_pattern, tuple):
+                new_pattern = (cls, *quant_pattern)
+            else:
+                new_pattern = (cls, quant_pattern)
+            results.append((new_pattern, default_base_op_idx))  # type: ignore[arg-type]
+
+
+    # After this point, results contains values such as
+    # [..., ((torch.nn.ReLU, torch.nn.Conv2d), 0), ...]
+
+    # Patterns for matching fp16 emulation are not specified in the quantization
+    # fusion mappings.  For now, define them here.
+    fp16_em_base_op_idx = 1
+    patterns_to_add = [
+        # linear-relu fp16 emulation:
+        # fp16_to_fp32 -> linear -> relu -> fp32_to_fp16
+        ((("to", torch.float16), F.relu, F.linear, "dequantize"), fp16_em_base_op_idx,),
+        # Conv-BN fusion (this happens outside of quantization patterns,
+        # which is why it is defined separately here).
+        ((nn.BatchNorm1d, nn.Conv1d), default_base_op_idx),
+        ((nn.BatchNorm2d, nn.Conv2d), default_base_op_idx),
+        ((nn.BatchNorm3d, nn.Conv3d), default_base_op_idx),
+        ((nn.ReLU, nn.BatchNorm1d, nn.Conv1d), default_base_op_idx),
+        ((nn.ReLU, nn.BatchNorm2d, nn.Conv2d), default_base_op_idx),
+        ((nn.ReLU, nn.BatchNorm3d, nn.Conv3d), default_base_op_idx),
+    ]
+    for p in patterns_to_add:
+        results.append(p)  # type: ignore[arg-type]
+        results.append(((ObserverBase, *p[0]), p[1]))  # type: ignore[arg-type]
+        results.append(((FakeQuantizeBase, *p[0]), p[1]))  # type: ignore[arg-type]
+
+    return results
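+
+# Example (illustrative sketch): entries returned above pair a reversed pattern
+# with the index of its base op in regular (non-reversed) order, e.g.
+#
+#     ((nn.ReLU, nn.Conv2d), 0)    # conv2d -> relu, the conv is the base op
+#     ((("to", torch.float16), F.relu, F.linear, "dequantize"), 1)
+#                                  # dequantize -> linear -> relu -> to(fp16),
+#                                  # the linear (index 1) is the base op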
+
+
+def end_node_matches_reversed_fusion(
+    end_node: Node,
+    reversed_fusion: NSFusionType,
+    gm: GraphModule,
+    seen_nodes: Set[Node],
+) -> bool:
+    """
+    Returns true if a pattern ending with `end_node` matches
+    the fusion pattern.
+    """
+    cur_node = end_node
+    for fusion_idx in range(len(reversed_fusion)):
+        # each node can only belong to one matched pattern
+        if cur_node in seen_nodes:
+            return False
+
+        cur_fusion_el = reversed_fusion[fusion_idx]
+
+        if cur_node.op == 'call_function':
+            fusion_el_is_fun = (not isinstance(cur_fusion_el, str)) and \
+                (not isinstance(cur_fusion_el, type))
+            if fusion_el_is_fun:
+                if cur_node.target != cur_fusion_el:
+                    return False
+                if len(cur_node.args) > 0 and isinstance(cur_node.args[0], Node):
+                    cur_node = cur_node.args[0]
+                else:
+                    return False
+            else:
+                return False
+
+        elif cur_node.op == 'call_module':
+            fusion_el_is_mod = isinstance(cur_fusion_el, type)
+            if fusion_el_is_mod:
+                assert isinstance(cur_node.target, str)
+                target_mod = getattr_from_fqn(gm, cur_node.target)
+                if not isinstance(cur_fusion_el, type):
+                    return False
+                if not isinstance(target_mod, cur_fusion_el):
+                    return False
+                if len(cur_node.args) > 0 and isinstance(cur_node.args[0], Node):
+                    cur_node = cur_node.args[0]
+                else:
+                    return False
+            else:
+                return False
+
+        elif cur_node.op == 'call_method':
+            fusion_el_is_meth_with_second_arg = \
+                isinstance(cur_fusion_el, tuple) and len(cur_fusion_el) == 2
+            fusion_el_is_meth_without_args = isinstance(cur_fusion_el, str)
+            if fusion_el_is_meth_without_args or fusion_el_is_meth_with_second_arg:
+                if fusion_el_is_meth_without_args:
+                    if cur_node.target != cur_fusion_el:
+                        return False
+                else:
+                    assert isinstance(cur_fusion_el, tuple)
+                    if cur_node.target != cur_fusion_el[0]:
+                        return False
+                    elif len(cur_node.args) < 2:
+                        return False
+                    elif cur_node.args[1] != cur_fusion_el[1]:
+                        return False
+
+                if len(cur_node.args) > 0 and isinstance(cur_node.args[0], Node):
+                    cur_node = cur_node.args[0]
+                else:
+                    return False
+            else:
+                return False
+        else:
+            return False
+
+    return True
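+
+# Example (illustrative sketch; the traced module and node lookup below are
+# hypothetical):
+#
+#     m = torch.fx.symbolic_trace(nn.Sequential(nn.Conv2d(1, 1, 1), nn.ReLU()))
+#     relu_node = next(n for n in m.graph.nodes
+#                      if n.op == 'call_module' and n.target == '1')
+#     end_node_matches_reversed_fusion(relu_node, (nn.ReLU, nn.Conv2d), m, set())
+#     # -> True, since walking backwards from the ReLU reaches the Conv2d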
diff --git a/MLPY/Lib/site-packages/torch/ao/ns/fx/qconfig_multi_mapping.py b/MLPY/Lib/site-packages/torch/ao/ns/fx/qconfig_multi_mapping.py
new file mode 100644
index 0000000000000000000000000000000000000000..98aafbb14d816ea6bda7d81f68f7862b33e2f699
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/ns/fx/qconfig_multi_mapping.py
@@ -0,0 +1,243 @@
+from __future__ import annotations
+
+import copy
+from typing import Any, Callable, Dict, List, Union
+
+import torch
+from torch.ao.quantization import QConfigMapping
+from torch.ao.quantization.qconfig_mapping import _QCONFIG_STYLE_ORDER
+from torch.ao.quantization.qconfig import QConfigAny
+
+__all__ = ["QConfigMultiMapping"]
+
+_QCONFIG_STYLE_TO_METHOD: Dict[str, str] = {
+    "global_qconfig": "set_global",
+    "object_type_qconfigs": "set_object_type",
+    "module_name_regex_qconfigs": "set_module_name_regex",
+    "module_name_qconfigs": "set_module_name",
+    "module_name_object_type_order_qconfigs": "set_module_name_object_type_order",
+}
+
+def _remove_duplicates_and_none(qconfig_list: List[QConfigAny]) -> None:
+    to_remove = []
+    for index, cur_qconfig in enumerate(qconfig_list):
+        if cur_qconfig is None:
+            to_remove.append(index)
+            break
+        for checked_qconfig in qconfig_list[:index]:
+            if torch.ao.quantization.qconfig_equals(cur_qconfig, checked_qconfig):
+                to_remove.append(index)
+                break
+    for index in to_remove[::-1]:
+        qconfig_list.pop(index)
+
+class QConfigMultiMapping:
+    """
+    This class, used with the prepare_n_shadows_model API, stores a list of :class:`torch.ao.quantization.QConfigMapping`s
+    so that multiple QConfigs can be specified for each QConfig matching style.
+
+    The user can specify QConfigs using the following methods (in increasing match priority):
+
+        ``set_global`` : sets the global (default) QConfigs
+
+        ``set_object_type`` : sets the QConfigs for a given module type, function, or method name
+
+        ``set_module_name_regex`` : sets the QConfigs for modules matching the given regex string
+
+        ``set_module_name`` : sets the QConfigs for modules matching the given module name
+
+        ``set_module_name_object_type_order`` : sets the QConfigs for modules matching a combination
+        of the given module name, object type, and the index at which the module appears
+
+    Note: The set methods are used in the same way as in QConfigMapping, except that a list of QConfigs is
+    passed in rather than a single QConfig.
+
+    Example usage::
+
+        qconfig_mapping = QConfigMultiMapping()
+            .set_global([qconfig1, qconfig2])
+            .set_object_type(torch.nn.Linear, [qconfig2, qconfig3])
+            .set_object_type(torch.nn.ReLU, [qconfig1])
+            .set_module_name_regex("foo.*bar.*conv[0-9]+", [qconfig2])
+            .set_module_name_regex("foo.*", [qconfig1, qconfig2, qconfig3])
+            .set_module_name("module1", [None])
+            .set_module_name("module2", [qconfig2])
+            .set_module_name_object_type_order("foo.bar", torch.nn.functional.linear, 0, [qconfig3])
+
+    """
+
+    def __init__(self):
+        # initialize this with 1 QConfigMapping to avoid corner cases
+        self.qconfig_mappings_list: List[QConfigMapping] = [QConfigMapping()]
+
+    def _handle_list_size_mismatch(
+        self, qconfig_list: List[QConfigAny], style: str
+    ) -> None:
+        # this method handles cases where the size of qconfig_list does not match
+        # the size of qconfig_mappings_list.
+        # Issue: Consider a user inserting global_qconfig A and B first, then inserting
+        # qconfig C as an object_type_qconfig for conv ops. If we internally store
+        # 1 QConfigMapping with A and C and another with just B, then the
+        # second QConfigMapping will match B to conv ops (which is not wanted), since B is global.
+
+        # we avoid this by maintaining the invariant that if any QConfigMapping
+        # has a qconfig style+key with a qconfig in it, all QConfigMappings must
+        # have either a qconfig or None for that same style+key. In the above
+        # example, a None qconfig would prevent the unwanted match in the
+        # second QConfigMapping
+
+        if len(qconfig_list) > len(self.qconfig_mappings_list):
+            # Case: we have more qconfigs (in qconfig_list) than QConfigMappings
+
+            # Add new QConfigMappings (initialized so we maintain the `invariant`)
+
+            new_qconfig_mapping = QConfigMapping()
+            # searches other QConfigMappings for qconfig style+keys
+            # that need to be inserted as `None` into the new QConfigMapping
+            for qconfig_mapping in self.qconfig_mappings_list:
+
+                # global_qconfig has None by default
+                for check_style in _QCONFIG_STYLE_ORDER[1:]:
+                    qconfigs_dict = getattr(qconfig_mapping, check_style)
+                    target_qconfigs_dict = getattr(new_qconfig_mapping, check_style)
+                    for key in qconfigs_dict:
+                        target_qconfigs_dict[key] = None
+                break
+
+            # insert copies of this new QConfigMapping until all entries
+            # in qconfig_list can fit among the QConfigMappings
+            while len(qconfig_list) > len(self.qconfig_mappings_list):
+                self.qconfig_mappings_list.append(copy.deepcopy(new_qconfig_mapping))
+        else:
+            # Case: we have fewer qconfigs in qconfig_list than QConfigMappings
+
+            # pad qconfig_list with `None` until length is same
+            while len(qconfig_list) < len(self.qconfig_mappings_list):
+                qconfig_list.append(None)
+
+    # this function applies the insertion method across each QConfigMapping
+    def _insert_qconfig_list(
+        self,
+        style: str,
+        args: List[Union[str, int, Callable]],
+        qconfig_list: List[QConfigAny],
+    ) -> None:
+
+        # we remove duplicates and None to make the ordering of qconfigs
+        # deterministic upon insertion.
+        _remove_duplicates_and_none(qconfig_list)
+
+        self._handle_list_size_mismatch(qconfig_list, style)
+        method_name = _QCONFIG_STYLE_TO_METHOD[style]
+        for qconfig_mapping, qconfig in zip(self.qconfig_mappings_list, qconfig_list):
+            # uses QConfigMapping set method to insert qconfig
+            set_method = getattr(qconfig_mapping, method_name)
+            set_method(*args, qconfig)
+
+    def set_global(self, global_qconfig_list: List[QConfigAny]) -> QConfigMultiMapping:
+        """
+        Set global QConfigs
+        see :func:`~torch.ao.quantization.QConfigMapping.set_global()` for more info
+        """
+        self._insert_qconfig_list("global_qconfig", [], global_qconfig_list)
+        return self
+
+    def set_object_type(
+        self, object_type: Union[Callable, str], qconfig_list: List[QConfigAny]
+    ) -> QConfigMultiMapping:
+        """
+        Set object type QConfigs
+        see :func:`~torch.ao.quantization.QConfigMapping.set_object_type()` for more info
+        """
+        self._insert_qconfig_list("object_type_qconfigs", [object_type], qconfig_list)
+        return self
+
+    def set_module_name_regex(
+        self, module_name_regex: str, qconfig_list: List[QConfigAny]
+    ) -> QConfigMultiMapping:
+        """
+        Set module_name_regex QConfigs
+        see :func:`~torch.ao.quantization.QConfigMapping.set_module_name_regex()` for more info
+        """
+        self._insert_qconfig_list(
+            "module_name_regex_qconfigs", [module_name_regex], qconfig_list
+        )
+        return self
+
+    def set_module_name(
+        self, module_name: str, qconfig_list: List[QConfigAny]
+    ) -> QConfigMultiMapping:
+        """
+        Set module_name QConfigs
+        see :func:`~torch.ao.quantization.QConfigMapping.set_module_name()` for more info
+        """
+        self._insert_qconfig_list("module_name_qconfigs", [module_name], qconfig_list)
+        return self
+
+    def set_module_name_object_type_order(
+        self,
+        module_name: str,
+        object_type: Callable,
+        index: int,
+        qconfig_list: List[QConfigAny],
+    ) -> QConfigMultiMapping:
+        """
+        Set module_name_object_type_order QConfigs
+        see :func:`~torch.ao.quantization.QConfigMapping.set_module_name_object_type_order()` for more info
+        """
+        self._insert_qconfig_list(
+            "module_name_object_type_order_qconfigs",
+            [module_name, object_type, index],
+            qconfig_list,
+        )
+        return self
+
+    def __repr__(self):
+        return (
+            self.__class__.__name__ +
+            " [" +
+            "".join(f"\n{qconfig_mapping.__repr__()}," for qconfig_mapping in self.qconfig_mappings_list) +
+            "\n]"
+        )
+
+    @classmethod
+    def from_list_qconfig_mapping(
+        cls, qconfig_mapping_list: List[QConfigMapping]
+    ) -> QConfigMultiMapping:
+        """
+        Creates a QConfigMultiMapping from a list of QConfigMappings
+        """
+        new_qconfig_multi_mapping = cls()
+
+        new_qconfig_multi_mapping.qconfig_mappings_list = copy.deepcopy(
+            qconfig_mapping_list
+        )
+
+        # we need to avoid the issue described in _handle_list_size_mismatch,
+        # so we reinsert all the qconfigs using the QConfigMultiMapping
+        # set methods
+
+        # go through all qconfig styles
+        # note: global can be ignored since it is None by default
+        for style in _QCONFIG_STYLE_ORDER[1:]:
+
+            # gather all key+qconfigs for current style
+            # into qconfig_dict_list
+            qconfig_dict_list: Dict[Any, List[QConfigAny]] = {}
+            for qconfig_mapping in qconfig_mapping_list:
+                qconfig_dict = getattr(qconfig_mapping, style)
+                for key, qconfig in qconfig_dict.items():
+                    if key not in qconfig_dict_list:
+                        qconfig_dict_list[key] = []
+                    qconfig_dict_list[key].append(qconfig)
+
+            # reinsert all gathered key+qconfigs
+            set_method_name = _QCONFIG_STYLE_TO_METHOD[style]
+            set_method = getattr(new_qconfig_multi_mapping, set_method_name)
+            for key, qconfig_list in qconfig_dict_list.items():
+                if isinstance(key, tuple):
+                    set_method(*key, qconfig_list)
+                else:
+                    set_method(key, qconfig_list)
+
+        return new_qconfig_multi_mapping
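+
+# Example (illustrative sketch; the qconfig below is only one possible choice):
+#
+#     from torch.ao.quantization import get_default_qconfig
+#     qconfig_a = get_default_qconfig("fbgemm")
+#     multi_mapping = QConfigMultiMapping.from_list_qconfig_mapping([
+#         QConfigMapping().set_global(qconfig_a),
+#         QConfigMapping().set_object_type(torch.nn.Linear, qconfig_a),
+#     ])
+#     # multi_mapping.qconfig_mappings_list now contains one QConfigMapping per
+#     # list entry, padded with None where needed to preserve the invariant
+#     # described in _handle_list_size_mismatch.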
diff --git a/MLPY/Lib/site-packages/torch/ao/ns/fx/utils.py b/MLPY/Lib/site-packages/torch/ao/ns/fx/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..66d64d572e58dcf246fd357b0a1072c29999e5d8
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/ns/fx/utils.py
@@ -0,0 +1,533 @@
+import enum
+import operator
+
+import torch
+import torch.nn as nn
+import torch.ao.nn.intrinsic.quantized as nniq
+import torch.ao.nn.quantized as nnq
+
+toq = torch.ops.quantized
+from typing import Tuple, Callable, Dict, Set, List, Optional, Union
+
+from torch.fx import GraphModule
+from torch.fx.graph import Node
+from torch.ao.quantization import (
+    ObserverBase,
+    FakeQuantizeBase,
+)
+from torch.ao.quantization.utils import getattr_from_fqn
+from torch.ao.quantization.observer import _is_activation_post_process
+
+from .ns_types import NSNodeTargetType, NSResultsType
+
+# TODO(future PR): consider deleting this enum and using the torch types
+# directly.  This might be tricky because it is not a one to one mapping.
+class NodeInputOrOutputType(enum.Enum):
+    FP32 = enum.auto()  # torch.float
+    INT8 = enum.auto()  # torch.qint8 or torch.quint8
+    FP16 = enum.auto()  # torch.float16
+    UNKNOWN = enum.auto()  # we cannot determine input/output dtype
+    # TODO(future PR): while these functions can support multiple dtypes,
+    #   for the purposes of numerical debugging we want to get the actual
+    #   dtype used in the model. We will likely need some kind of dtype
+    #   propagation to estimate this.
+    FP32_OR_INT8 = enum.auto()  # either torch.float or torch.quint8 or torch.qint8
+    # TODO(future PRs): dynamic quant, fake quant, etc
+
+
+def get_node_first_input_and_output_type(
+    node: Node,
+    gm: GraphModule,
+    logger_cls: Callable,
+    node_type_to_io_type_map: Dict[str, Set[NSNodeTargetType]],
+) -> Tuple[NodeInputOrOutputType, NodeInputOrOutputType]:
+
+    # TODO(future PR): clean this up
+    FUNS_IO_TYPE_FP32 = node_type_to_io_type_map["funs_io_type_fp32"]
+    FUNS_IO_TYPE_FP16 = node_type_to_io_type_map["funs_io_type_fp16"]
+    FUNS_IO_TYPE_INT8 = node_type_to_io_type_map["funs_io_type_int8"]
+    FUNS_IO_TYPE_FP32_OR_INT8 = node_type_to_io_type_map["funs_io_type_fp32_or_int8"]
+    MODS_IO_TYPE_FP32 = node_type_to_io_type_map["mods_io_type_fp32"]
+    MODS_IO_TYPE_INT8 = node_type_to_io_type_map["mods_io_type_int8"]
+    MODS_IO_TYPE_FP32_OR_INT8 = node_type_to_io_type_map["mods_io_type_fp32_or_int8"]
+    METHS_IO_TYPE_FP32_OR_INT8 = node_type_to_io_type_map["meths_io_type_fp32_or_int8"]
+
+    if node.op == "call_function":
+        if node.target in FUNS_IO_TYPE_FP32:
+            return (NodeInputOrOutputType.FP32, NodeInputOrOutputType.FP32)
+        if node.target in FUNS_IO_TYPE_FP16:
+            return (NodeInputOrOutputType.FP16, NodeInputOrOutputType.FP16)
+        elif node.target in FUNS_IO_TYPE_INT8:
+            return (NodeInputOrOutputType.INT8, NodeInputOrOutputType.INT8)
+        elif node.target in FUNS_IO_TYPE_FP32_OR_INT8:
+            first_arg = get_normalized_nth_input(node, gm, 0)
+            assert isinstance(first_arg, Node)
+            (
+                _prev_node_input_type,
+                prev_node_output_type,
+            ) = get_node_first_input_and_output_type(
+                first_arg, gm, logger_cls, node_type_to_io_type_map
+            )
+            return (prev_node_output_type, prev_node_output_type)
+        else:
+            return (NodeInputOrOutputType.UNKNOWN, NodeInputOrOutputType.UNKNOWN)
+
+    elif node.op == "call_module":
+        assert node.op == "call_module"
+        assert isinstance(node.target, str)
+        mod = getattr_from_fqn(gm, node.target)
+        is_known_fp32_or_int8_input_module = any(
+            isinstance(mod, target_type) for target_type in MODS_IO_TYPE_FP32_OR_INT8  # type: ignore[arg-type]
+        )
+        if (
+            isinstance(mod, (logger_cls, ObserverBase, FakeQuantizeBase))  # type: ignore[arg-type]
+            or is_known_fp32_or_int8_input_module
+        ):
+            # A logger or observer's input and output type is the output
+            # type of the preceding node.
+            first_arg = get_normalized_nth_input(node, gm, 0)
+            assert isinstance(first_arg, Node)
+            (
+                _prev_node_input_type,
+                prev_node_output_type,
+            ) = get_node_first_input_and_output_type(
+                first_arg, gm, logger_cls, node_type_to_io_type_map
+            )
+            return (prev_node_output_type, prev_node_output_type)
+        is_known_fp32_input_module = any(
+            isinstance(mod, target_type) for target_type in MODS_IO_TYPE_FP32  # type: ignore[arg-type]
+        )
+        is_known_int8_input_module = any(
+            isinstance(mod, target_type) for target_type in MODS_IO_TYPE_INT8  # type: ignore[arg-type]
+        )
+        if is_known_fp32_input_module:
+            return (NodeInputOrOutputType.FP32, NodeInputOrOutputType.FP32)
+        elif is_known_int8_input_module:
+            return (NodeInputOrOutputType.INT8, NodeInputOrOutputType.INT8)
+        else:
+            return (NodeInputOrOutputType.UNKNOWN, NodeInputOrOutputType.UNKNOWN)
+
+    elif node.op == "call_method":
+        if node.target == "dequantize":
+            # Dequantize is a special node because it allows multiple input types.
+            # So, we look up the output type of the previous node and return that
+            # as the input type of this node instance.
+            prev_node = get_normalized_nth_input(node, gm, 0)
+            assert isinstance(prev_node, Node)
+            (
+                _prev_node_input_type,
+                prev_node_output_type,
+            ) = get_node_first_input_and_output_type(
+                prev_node, gm, logger_cls, node_type_to_io_type_map
+            )
+            return (prev_node_output_type, NodeInputOrOutputType.FP32)
+
+        elif node.target == "to":
+            # to is a special node because it allows multiple input types.
+            # So, we look up the output type of the previous node and return that
+            # as the input type of this node instance. We also look up the target
+            # of to and return the correct output type.
+            prev_node = get_normalized_nth_input(node, gm, 0)
+            assert isinstance(prev_node, Node)
+            (
+                _prev_node_input_type,
+                prev_node_output_type,
+            ) = get_node_first_input_and_output_type(
+                prev_node, gm, logger_cls, node_type_to_io_type_map
+            )
+
+            cur_node_dtype_target = get_normalized_nth_input(node, gm, 1)
+            assert (
+                cur_node_dtype_target is torch.float16
+            ), f"{cur_node_dtype_target} handling needs to be added"
+
+            return (prev_node_output_type, NodeInputOrOutputType.FP16)
+
+        elif node.target in METHS_IO_TYPE_FP32_OR_INT8:
+            first_arg = get_normalized_nth_input(node, gm, 0)
+            assert isinstance(first_arg, Node)
+            (
+                _prev_node_input_type,
+                prev_node_output_type,
+            ) = get_node_first_input_and_output_type(
+                first_arg, gm, logger_cls, node_type_to_io_type_map
+            )
+            return (prev_node_output_type, prev_node_output_type)
+
+        return (NodeInputOrOutputType.UNKNOWN, NodeInputOrOutputType.UNKNOWN)
+    else:
+        return (NodeInputOrOutputType.UNKNOWN, NodeInputOrOutputType.UNKNOWN)
+
+
+def get_node_input_qparams(
+    node: Node,
+    gm: GraphModule,
+    node_type_to_io_type_map: Dict[str, Set[NSNodeTargetType]],
+) -> Optional[Tuple[Union[torch.Tensor, float], Union[torch.Tensor, int]]]:
+    """
+    Returns the qparams (scale, zero_point) of the first input to `node`,
+    if they can be inferred from the graph.
+    """
+    prev_node = get_normalized_nth_input(node, gm, 0)
+
+    if not isinstance(prev_node, Node):
+        return None
+
+    MODS_IO_TYPE_FP32_OR_INT8 = node_type_to_io_type_map["mods_io_type_fp32_or_int8"]
+
+    def _get_scale_zp_from_function_args(node, gm, scale_arg_idx, zp_arg_idx):
+        scale_node = get_normalized_nth_input(node, gm, scale_arg_idx)
+        zp_node = get_normalized_nth_input(node, gm, zp_arg_idx)
+        assert isinstance(scale_node, Node) and isinstance(scale_node.target, str)
+        assert isinstance(zp_node, Node) and isinstance(zp_node.target, str)
+        scale_obj = getattr_from_fqn(gm, scale_node.target)
+        zp_obj = getattr_from_fqn(gm, zp_node.target)
+        return (scale_obj, zp_obj)
+
+    if prev_node.op == "call_function":
+
+        # quantize - read the args directly
+        if prev_node.target == torch.quantize_per_tensor:
+            return _get_scale_zp_from_function_args(prev_node, gm, 1, 2)
+        elif prev_node.target in (toq.add, toq.add_relu, toq.mul, toq.mul_relu):
+            return _get_scale_zp_from_function_args(prev_node, gm, 2, 3)
+
+        return None
+        # TODO(future PR): handle more functionals
+        # TODO(future PR): handle functional ops which inherit qparams from input
+
+    elif prev_node.op == "call_module":
+
+        # get type of the module
+        assert isinstance(prev_node.target, str)
+        module_obj = getattr_from_fqn(gm, prev_node.target)
+        if isinstance(
+            module_obj,
+            (
+                nnq.Linear,
+                nnq.Conv1d,
+                nnq.Conv2d,
+                nniq.ConvReLU2d,
+                nnq.Conv3d,
+                nnq.BatchNorm2d,
+                nnq.BatchNorm3d,
+                nnq.ConvTranspose1d,
+                nnq.ConvTranspose2d,
+                nnq.ELU,
+                nnq.GroupNorm,
+                nnq.InstanceNorm1d,
+                nnq.InstanceNorm2d,
+                nnq.InstanceNorm3d,
+                nnq.LayerNorm,
+                nnq.Hardswish,
+                nnq.LeakyReLU,
+                nnq.ReLU6,
+                nniq.BNReLU2d,
+                nniq.BNReLU3d,
+                nniq.ConvReLU1d,
+                nniq.ConvReLU2d,
+                nniq.ConvReLU3d,
+                nniq.LinearReLU,
+            ),
+        ):
+            return (module_obj.scale, module_obj.zero_point)  # type: ignore[return-value]
+
+        is_known_fp32_or_int8_input_module = any(
+            isinstance(module_obj, target_type) for target_type in MODS_IO_TYPE_FP32_OR_INT8  # type: ignore[arg-type]
+        )
+        if is_known_fp32_or_int8_input_module:
+            return get_node_input_qparams(prev_node, gm, node_type_to_io_type_map)
+
+    return None
+
+
+def return_first_non_observer_node(
+    node: Node,
+    gm: GraphModule,
+) -> Node:
+    """
+    If node is not an observer, returns it.  If node is an observer,
+    navigates up the graph and returns the first parent which is not an
+    observer.  For example,
+
+    graph: (node_non_obs), node = node_non_obs : returns node_non_obs
+    graph: (node_non_obs -> obs0), node = obs0 : returns node_non_obs
+    graph: (node_non_obs -> obs0 -> fq0), node = fq0 : returns node_non_obs
+    """
+    if node.op == "call_module":
+        node_obj = getattr_from_fqn(gm, node.target)  # type: ignore[arg-type]
+        if _is_activation_post_process(node_obj):
+            assert len(node.args) == 1
+            assert isinstance(node.args[0], Node)
+            node = node.args[0]
+            # code duplication intended, not worth refactoring
+            assert isinstance(node.target, str)
+            node_obj = getattr_from_fqn(gm, node.target)
+            if _is_activation_post_process(node_obj):
+                assert len(node.args) == 1
+                assert isinstance(node.args[0], Node)
+                node = node.args[0]
+    return node
+
+
+def get_number_of_non_param_args(
+    node: Node,
+    gm: GraphModule,
+) -> int:
+    """
+    Assumes that all non-param args occur first. Returns the number of
+    non-param args expected for a node.  For example, for
+
+      F.linear(x, weight, bias)
+
+    Returns 1, because x is a non-param arg and weight and bias are params.
+    For
+
+      lstm_mod(x, hid)
+
+    Returns 2, because both x and hid are non-param args.
+    """
+    if node.op == "call_module":
+        node_obj = getattr_from_fqn(gm, node.target)  # type: ignore[arg-type]
+        if isinstance(node_obj, nn.LSTM):
+            return 2
+
+    # default is 1
+    return 1
+
+
+def get_arg_indices_of_inputs_to_log(node: Node) -> List[int]:
+    """
+    Returns the indices of args of the node which we should attach
+    loggers to, if input logging is enabled.
+
+    For example,
+    * for (x + y), returns [0, 1]
+    * for (1 + y), returns [1]
+    * for (x + 1), returns [0]
+    * for (linear(x, w, b)) returns [0]
+    * by default, returns [0]
+    """
+    if len(node.args) == 0:
+        return []
+    if node.op == "call_function" and (
+        # TODO(future PR): use relationship map instead of hardcoding
+        node.target in (torch.add, torch.ops.quantized.add, operator.add)
+        or node.target in (torch.mul, torch.ops.quantized.mul, operator.mul)
+    ):
+        result = []
+        for i in range(2):
+            if type(node.args[i]) == Node:
+                result.append(i)
+        return result
+    return [0]
+
+
+def get_target_type_str(node: Node, gm: GraphModule) -> str:
+    """
+    Returns a string representation of the type of the function or module
+    pointed to by this node, or '' for other node types.
+    """
+    target_type = ""
+    if node.op in ("call_function", "call_method"):
+        target_type = torch.typename(node.target)
+    elif node.op == "call_module":
+        assert isinstance(node.target, str)
+        target_mod = getattr_from_fqn(gm, node.target)
+        target_type = torch.typename(target_mod)
+    return target_type
+
+
+def rekey_logger_info_on_node_name_of_model(
+    results: NSResultsType,
+    model_name: str,
+) -> NSResultsType:
+    """
+    Rekeys the layer name of a results dictionary to use node names
+    from `model_name`.
+
+    For example, transforms
+
+        {'base_op_1_0': {'node_output': {'model_a':
+          [{'ref_node_name': 'linear1', ...}]}}}
+
+    into
+
+        {'linear1': {'node_output': {'model_a':
+          [{'ref_node_name': 'linear1', ...}]}}}
+
+    Note: we cannot use these node names directly because they are not
+    guaranteed to be consistent across models. This is why we extract
+    the results first and rekey afterwards.
+    """
+    new_results = {}
+    for old_layer_name, result_type_to_results in results.items():
+        new_layer_name = None
+        for model_name_to_results in result_type_to_results.values():
+            for cur_model_name, list_of_results in model_name_to_results.items():
+                if cur_model_name == model_name:
+                    assert len(list_of_results)
+                    new_layer_name = list_of_results[0]["ref_node_name"]
+                else:
+                    continue
+        if new_layer_name is not None:
+            new_results[new_layer_name] = result_type_to_results
+        else:
+            new_results[old_layer_name] = result_type_to_results
+    return new_results
+
+
+def maybe_add_missing_fqns(results: NSResultsType) -> None:
+    """
+    If `fqn` entries are filled in for one of the models in `results`, copies
+    them over to any models which do not have them filled out.
+
+    A common use case benefitting from this is comparing a model prepared by
+    quantization to a quantized model. In this case, the model prepared by
+    quantization would have `fqn` entries, and the quantized model would not.
+    """
+
+    # Check in the first result to find any model with fqn entries defined.
+    model_name_with_fqns = None
+    for result_type_to_results in results.values():
+        for model_name_to_results in result_type_to_results.values():
+            for model_name, model_results in model_name_to_results.items():
+                if len(model_results) > 0:
+                    if model_results[0]["fqn"] is not None:
+                        model_name_with_fqns = model_name
+                        break
+            break
+        break
+
+    if model_name_with_fqns:
+        for result_type_to_results in results.values():
+            for model_name_to_results in result_type_to_results.values():
+                ref_model_results = model_name_to_results[model_name_with_fqns]
+                for model_name, model_results in model_name_to_results.items():
+                    if model_name == model_name_with_fqns:
+                        continue
+                    for i in range(len(model_results)):
+                        fqn = ref_model_results[i]["fqn"]
+                        model_results[i]["fqn"] = fqn
+
+
+def maybe_dequantize_first_two_tensor_args_and_handle_tuples(f):
+    def inner(*args, **kwargs):
+        a0, a1, *a_other = args
+
+        if (isinstance(a0, tuple) and isinstance(a1, tuple)) or (
+            isinstance(a0, list) and isinstance(a1, list)
+        ):
+            results = []
+            for el0, el1 in zip(a0, a1):
+                new_args = (el0, el1, *a_other)
+                results.append(inner(*new_args, **kwargs))
+            return results
+
+        elif isinstance(a0, torch.Tensor) and isinstance(a1, torch.Tensor):
+            if a0.is_quantized:
+                a0 = a0.dequantize()
+            if a1.is_quantized:
+                a1 = a1.dequantize()
+
+        # for the purposes of this util, only handle floats
+        if a0.dtype != torch.float or a1.dtype != torch.float:
+            return None
+
+        new_args = (a0, a1, *a_other)
+        return f(*new_args, **kwargs)
+
+    return inner
+
+
+@maybe_dequantize_first_two_tensor_args_and_handle_tuples
+def compute_sqnr(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+    """
+    Computes the SQNR between `x` and `y`.
+
+    Args:
+        x: Tensor or tuple of tensors
+        y: Tensor or tuple of tensors
+
+    Return:
+        float or tuple of floats
+    """
+    Ps = torch.norm(x)
+    Pn = torch.norm(x - y)
+    return 20 * torch.log10(Ps / Pn)
+
+
+@maybe_dequantize_first_two_tensor_args_and_handle_tuples
+def compute_normalized_l2_error(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+    """
+    Computes the normalized L2 error between `x` and `y`.
+
+    Args:
+        x: Tensor or tuple of tensors
+        y: Tensor or tuple of tensors
+
+    Return:
+        float or tuple of floats
+    """
+    return torch.sqrt(((x - y) ** 2).sum() / (x ** 2).sum())
+
+
+@maybe_dequantize_first_two_tensor_args_and_handle_tuples
+def compute_cosine_similarity(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+    """
+    Computes the cosine similarity between `x` and `y`.
+
+    Args:
+        x: Tensor or tuple of tensors
+        y: Tensor or tuple of tensors
+
+    Return:
+        float or tuple of floats
+    """
+    # For convolutions, the shape of the quantized weight has one additional
+    # dimension compared to the shape of the fp32 weight. Match the shapes
+    # to enable cosine similarity comparison.
+    x = x.reshape(1, -1)
+    y = y.reshape(1, -1)
+    return torch.nn.functional.cosine_similarity(x, y)
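+
+# Example (illustrative sketch): comparing an fp32 tensor against a noisy copy.
+#
+#     x = torch.randn(4, 4)
+#     y = x + 0.01 * torch.randn(4, 4)
+#     compute_sqnr(x, y)                 # large value -> y is close to x
+#     compute_normalized_l2_error(x, y)  # small value -> y is close to x
+#     compute_cosine_similarity(x, y)    # close to 1.0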
+
+def op_type_supports_shadowing(node: Node) -> bool:
+    if node.op == 'call_function':
+        if node.target in (torch.add, torch.mul, operator.add, operator.mul, torch.cat, torch.stack):
+            # shadowing for ops with multiple tensor inputs is not implemented yet
+            return False
+    return True
+
+def get_normalized_nth_input(node: Node, gm: GraphModule, idx: int) -> Node:
+    """
+    Given a node, gets the n'th input to that node, normalizing
+    args and kwargs to the best of its ability.
+    """
+    try:
+        norm_args_and_kwargs = node.normalized_arguments(
+            gm, normalize_to_only_use_kwargs=True)
+        if norm_args_and_kwargs is not None:
+            norm_args, norm_kwargs = norm_args_and_kwargs
+            assert len(norm_args) + len(norm_kwargs) > idx
+            if idx < len(norm_args):
+                return norm_args[idx]
+            else:
+                # note: in Python 3.7+ dicts are ordered
+                return list(norm_kwargs.values())[idx]
+        else:
+            assert len(node.args) + len(node.kwargs) > idx
+            if idx < len(node.args):
+                return node.args[idx]  # type: ignore[return-value]
+            else:
+                kwargs_idx = idx + len(node.args)
+                return list(node.kwargs.values())[kwargs_idx]  # type: ignore[return-value]
+    except RuntimeError:
+        # this RuntimeError happens when node argument normalization
+        # requires typehints to proceed, such as for torch.add where
+        # either the first, second or both arguments could be tensors
+        assert len(node.args) + len(node.kwargs) > idx
+        if idx < len(node.args):
+            return node.args[idx]  # type: ignore[return-value]
+        else:
+            kwargs_idx = idx + len(node.args)
+            return list(node.kwargs.values())[kwargs_idx]  # type: ignore[return-value]
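+
+# Example (illustrative sketch; the traced module below is hypothetical):
+#
+#     gm = torch.fx.symbolic_trace(torch.nn.Sequential(torch.nn.Linear(2, 2)))
+#     linear_node = next(n for n in gm.graph.nodes if n.op == 'call_module')
+#     get_normalized_nth_input(linear_node, gm, 0)
+#     # -> the placeholder node feeding the linear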
diff --git a/MLPY/Lib/site-packages/torch/ao/ns/fx/weight_utils.py b/MLPY/Lib/site-packages/torch/ao/ns/fx/weight_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..e242d2f58bf6ba328b079f2ed8d026b69150dade
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/ns/fx/weight_utils.py
@@ -0,0 +1,275 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.ao.nn.quantized.dynamic as nnqd
+import torch.ao.nn.quantized as nnq
+import torch.ao.nn.intrinsic.qat as nniqat
+import torch.ao.nn.qat as nnqat
+import torch.ao.nn.intrinsic as nni
+import torch.ao.nn.intrinsic.quantized as nniq
+toq = torch.ops.quantized
+from torch.fx import GraphModule
+from torch.fx.graph import Node
+
+from .utils import (
+    get_target_type_str,
+    getattr_from_fqn,
+    return_first_non_observer_node,
+)
+
+from .ns_types import (
+    NSSingleResultValuesType,
+    NSSingleResultType,
+)
+
+from typing import List, Optional, Dict, Callable
+
+def mod_weight_detach(mod: nn.Module) -> torch.Tensor:
+    return mod.weight.detach()  # type: ignore[operator]
+
+def mod_0_weight_detach(mod: nn.Module) -> torch.Tensor:
+    return mod[0].weight.detach()  # type: ignore[index]
+
+def mod_weight_bias_0(mod: nn.Module) -> torch.Tensor:
+    return mod._weight_bias()[0]  # type: ignore[operator]
+
+def get_lstm_weight(mod: nn.Module) -> List[torch.Tensor]:
+    res = []
+    for idx, param_name in enumerate(mod._flat_weights_names):  # type: ignore[arg-type]
+        if 'weight_ih_l' in param_name or 'weight_hh_l' in param_name:
+            param_value = mod._flat_weights[idx].detach()  # type: ignore[index]
+            res.append(param_value)
+    return res
+
+def get_qlstm_weight(mod: nn.Module) -> List[torch.Tensor]:
+    res = []
+    for weight_value in mod._all_weight_values:  # type: ignore[union-attr]
+        res.append(weight_value.param.__getstate__()[0][4][0].__getstate__()[0][0])
+        res.append(weight_value.param.__getstate__()[0][4][1].__getstate__()[0][0])
+    return res
+
+def get_conv_mod_weight(mod: nn.Module) -> torch.Tensor:
+    if (
+        isinstance(mod, (nn.Conv1d, nn.Conv2d, nn.Conv3d))
+    ):
+        return mod.weight.detach()
+    elif (
+        isinstance(mod, (nni.ConvReLU1d, nni.ConvReLU2d, nni.ConvReLU3d))
+    ):
+        return mod[0].weight.detach()
+    else:
+        return mod._weight_bias()[0]  # type: ignore[operator]
+
+def get_linear_mod_weight(mod: nn.Module) -> torch.Tensor:
+    if isinstance(mod, nn.Linear):
+        return mod.weight.detach()
+    elif isinstance(mod, nni.LinearReLU):
+        return mod[0].weight.detach()
+    else:
+        return mod._weight_bias()[0]  # type: ignore[operator]
+
+def get_lstm_mod_weights(mod: nn.Module) -> List[torch.Tensor]:
+    # TODO(future PR): make more generic, handle everything
+    if isinstance(mod, nn.LSTM):
+        res = []
+        for idx, param_name in enumerate(mod._flat_weights_names):
+            if 'weight_ih_l' in param_name or 'weight_hh_l' in param_name:
+                param_value = mod._flat_weights[idx].detach()
+                res.append(param_value)
+        return res
+    else:
+        assert isinstance(mod, nnqd.LSTM), f"type {type(mod)} not handled yet"
+        res = []
+        for weight_value in mod._all_weight_values:
+            res.append(weight_value.param.__getstate__()[0][4][0].__getstate__()[0][0])
+            res.append(weight_value.param.__getstate__()[0][4][1].__getstate__()[0][0])
+        return res
+
+def get_conv_fun_weight(node: Node, gm: GraphModule) -> torch.Tensor:
+    # traverse backwards from the weight arg, accounting for any observers
+    weight_arg_node = node.args[1]
+    assert isinstance(weight_arg_node, Node)
+    weight_node = return_first_non_observer_node(weight_arg_node, gm)
+    assert isinstance(weight_node, Node)
+    assert weight_node.op == 'get_attr'
+    weight = getattr_from_fqn(gm, weight_node.target)  # type: ignore[arg-type]
+    return weight.detach()
+
+def get_qconv_fun_weight(node: Node, gm: GraphModule) -> torch.Tensor:
+    # qconv state is arg 1
+    qconv_state_node = node.args[1]
+    assert isinstance(qconv_state_node, Node)
+    assert qconv_state_node.op == 'get_attr'
+    qconv_state_obj = getattr_from_fqn(gm, qconv_state_node.target)  # type: ignore[arg-type]
+    return qconv_state_obj.weight()
+
+def get_linear_fun_weight(node: Node, gm: GraphModule) -> torch.Tensor:
+    # traverse backwards from the weight arg, accounting for any observers
+    # supported patterns:
+    # weight -> obs -> linear
+    # weight -> to(torch.float16) -> dequantize -> linear
+    linear_second_arg = node.args[1]
+    assert isinstance(linear_second_arg, Node)
+
+    if linear_second_arg.op == 'call_module':
+        # weight -> obs -> linear
+        weight_arg_node = node.args[1]
+        assert isinstance(weight_arg_node, Node)
+        weight_node = weight_arg_node.args[0]
+        assert isinstance(weight_node, Node)
+        assert weight_node.op == 'get_attr'
+        weight = getattr_from_fqn(gm, weight_node.target)  # type: ignore[arg-type]
+        return weight.detach()
+    elif linear_second_arg.op == 'call_method':
+        # weight -> to(torch.float16) -> dequantize -> linear
+        assert linear_second_arg.op == 'call_method'
+        dequant_node = node.args[1]
+        assert isinstance(dequant_node, Node)
+        to_fp16_node = dequant_node.args[0]
+        assert isinstance(to_fp16_node, Node)
+        # extract the dtype, so we can cast to it before returning
+        target_dtype = to_fp16_node.args[1]
+        weight_node = to_fp16_node.args[0]
+        assert isinstance(weight_node, Node)
+        assert weight_node.op == 'get_attr'
+        weight = getattr_from_fqn(gm, weight_node.target)  # type: ignore[arg-type]
+        # return the weight with fp16 cast
+        return weight.detach().to(target_dtype)
+    else:
+        assert linear_second_arg.op == 'get_attr'
+        weight = getattr_from_fqn(gm, linear_second_arg.target)  # type: ignore[arg-type]
+        return weight.detach()
+
+def get_qlinear_fun_weight(node: Node, gm: GraphModule) -> torch.Tensor:
+    # packed weight is arg 1
+    packed_weight_node = node.args[1]
+    assert isinstance(packed_weight_node, Node)
+    assert packed_weight_node.op == 'get_attr'
+    packed_weight = getattr_from_fqn(gm, packed_weight_node.target)  # type: ignore[arg-type]
+    # TODO(future PR): why does packed_weight.unpack() not work?
+    (weight, _bias), _name = packed_weight.__getstate__()
+    return weight
+
+def get_op_to_type_to_weight_extraction_fn() -> Dict[str, Dict[Callable, Callable]]:
+
+    op_to_type_to_weight_extraction_fn: Dict[str, Dict[Callable, Callable]] = {
+        'call_module': {
+            # Conv1d
+            nn.Conv1d: mod_weight_detach,
+            nni.ConvReLU1d: mod_0_weight_detach,
+            nnq.Conv1d: mod_weight_bias_0,
+            nnqat.Conv1d: mod_weight_detach,
+            nniqat.ConvBn1d: mod_weight_detach,
+            nniqat.ConvBnReLU1d: mod_weight_detach,
+            nniqat.ConvReLU1d: mod_weight_detach,
+            nniq.ConvReLU1d: mod_weight_bias_0,
+            # Conv2d
+            nn.Conv2d: mod_weight_detach,
+            nni.ConvReLU2d: mod_0_weight_detach,
+            nnq.Conv2d: mod_weight_bias_0,
+            nnqat.Conv2d: mod_weight_detach,
+            nniqat.ConvBn2d: mod_weight_detach,
+            nniqat.ConvBnReLU2d: mod_weight_detach,
+            nniqat.ConvReLU2d: mod_weight_detach,
+            nniq.ConvReLU2d: mod_weight_bias_0,
+            # Conv3d
+            nn.Conv3d: mod_weight_detach,
+            nni.ConvReLU3d: mod_0_weight_detach,
+            nnq.Conv3d: mod_weight_bias_0,
+            nnqat.Conv3d: mod_weight_detach,
+            nniqat.ConvBn3d: mod_weight_detach,
+            nniqat.ConvBnReLU3d: mod_weight_detach,
+            nniqat.ConvReLU3d: mod_weight_detach,
+            nniq.ConvReLU3d: mod_weight_bias_0,
+            # Linear
+            nn.Linear: mod_weight_detach,
+            nnq.Linear: mod_weight_bias_0,
+            nni.LinearReLU: mod_0_weight_detach,
+            nniq.LinearReLU: mod_weight_bias_0,
+            nnqat.Linear: mod_weight_detach,
+            nnqd.Linear: mod_weight_bias_0,
+            nniqat.LinearReLU: mod_weight_detach,
+            nniqat.LinearBn1d: mod_weight_detach,
+            nn.modules.linear.NonDynamicallyQuantizableLinear: mod_weight_detach,
+            # LSTM
+            nn.LSTM: get_lstm_weight,
+            nnqd.LSTM: get_qlstm_weight,
+        },
+        'call_function': {
+            # Conv
+            F.conv1d: get_conv_fun_weight,
+            F.conv2d: get_conv_fun_weight,
+            F.conv3d: get_conv_fun_weight,
+            toq.conv1d: get_qconv_fun_weight,
+            toq.conv2d: get_qconv_fun_weight,
+            toq.conv3d: get_qconv_fun_weight,
+            toq.conv1d_relu: get_qconv_fun_weight,
+            toq.conv2d_relu: get_qconv_fun_weight,
+            toq.conv3d_relu: get_qconv_fun_weight,
+            # Linear
+            F.linear: get_linear_fun_weight,
+            toq.linear: get_qlinear_fun_weight,
+            toq.linear_relu: get_qlinear_fun_weight,
+        },
+    }
+
+    return op_to_type_to_weight_extraction_fn
+
+def extract_weight_from_node(
+    node: Node,
+    gm: GraphModule,
+    op_to_type_to_weight_extraction_fn: Optional[Dict[str, Dict[Callable, Callable]]] = None,
+) -> Optional[NSSingleResultType]:
+    res_type = NSSingleResultValuesType.WEIGHT.value
+
+    # Not all graphmodules have _node_name_to_scope, so only fill it
+    # out if it exists.
+    fqn = None
+    if hasattr(gm, '_node_name_to_scope'):
+        fqn = gm._node_name_to_scope[node.name][0]  # type: ignore[index]
+
+    if op_to_type_to_weight_extraction_fn is None:
+        op_to_type_to_weight_extraction_fn = get_op_to_type_to_weight_extraction_fn()
+
+    ref_node_type = get_target_type_str(node, gm)
+    # for extracting weights, these are always the same
+    prev_node_type = ref_node_type
+
+    if node.op == 'call_function':
+        function_mapping = op_to_type_to_weight_extraction_fn['call_function']
+        for target_fn_type, weight_extraction_fn in function_mapping.items():
+            if node.target == target_fn_type:
+                weight = weight_extraction_fn(node, gm)
+                return {
+                    'type': res_type,
+                    'values': [weight],
+                    'prev_node_name': node.name,
+                    'prev_node_target_type': prev_node_type,
+                    'ref_node_name': node.name,
+                    'ref_node_target_type': ref_node_type,
+                    'index_within_arg': 0,
+                    'index_of_arg': 0,
+                    'fqn': fqn,
+                }
+
+    elif node.op == 'call_module':
+        # for call_module, we need to look up the modules to do the type check
+        assert isinstance(node.target, str)
+        mod = getattr_from_fqn(gm, node.target)
+        module_mapping = op_to_type_to_weight_extraction_fn['call_module']
+        for target_mod_type, weight_extraction_fn in module_mapping.items():
+            if type(mod) == target_mod_type:
+                weight = weight_extraction_fn(mod)
+                return {
+                    'type': res_type,
+                    'values': [weight],
+                    'prev_node_name': node.name,
+                    'prev_node_target_type': prev_node_type,
+                    'ref_node_name': node.name,
+                    'ref_node_target_type': ref_node_type,
+                    'index_within_arg': 0,
+                    'index_of_arg': 0,
+                    'fqn': fqn,
+                }
+
+    return None
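+
+# Example (illustrative sketch; the traced model below is hypothetical):
+#
+#     gm = torch.fx.symbolic_trace(nn.Sequential(nn.Linear(2, 2)))
+#     linear_node = next(n for n in gm.graph.nodes if n.op == 'call_module')
+#     result = extract_weight_from_node(linear_node, gm)
+#     # result['values'][0] holds the detached fp32 weight of the Linear module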
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/__init__.py b/MLPY/Lib/site-packages/torch/ao/pruning/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6bcce09809ecf3899bf7bfb1e4a4e8c6c6f5345
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/pruning/__init__.py
@@ -0,0 +1,19 @@
+# Variables
+from ._mappings import get_dynamic_sparse_quantized_mapping
+from ._mappings import get_static_sparse_quantized_mapping
+
+# Sparsifier
+from .sparsifier.base_sparsifier import BaseSparsifier
+from .sparsifier.weight_norm_sparsifier import WeightNormSparsifier
+from .sparsifier.nearly_diagonal_sparsifier import NearlyDiagonalSparsifier
+
+# Scheduler
+from .scheduler.base_scheduler import BaseScheduler
+from .scheduler.lambda_scheduler import LambdaSL
+from .scheduler.cubic_scheduler import CubicSL
+
+# Parametrizations
+from .sparsifier.utils import FakeSparsity
+from .sparsifier.utils import module_to_fqn
+from .sparsifier.utils import fqn_to_module
+from .sparsifier.utils import get_arg_info_from_tensor_fqn
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/pruning/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cd43c6268e9ba7f73b044e4c4d7bc059a96549c9
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/pruning/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/__pycache__/_mappings.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/pruning/__pycache__/_mappings.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7bda8485f6658452e833f74e033099b60cfde06c
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/pruning/__pycache__/_mappings.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/__init__.py b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..66d63690867e579df48cce9bc185da3f340f4126
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/activation_sparsifier/__init__.py b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/activation_sparsifier/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/activation_sparsifier/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/activation_sparsifier/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..377efe1717afd5f56a4c45ec229f9fe4d32d4388
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/activation_sparsifier/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/activation_sparsifier/__pycache__/activation_sparsifier.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/activation_sparsifier/__pycache__/activation_sparsifier.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ae04648c6dd0c65ce571ef9aa0843787f4844919
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/activation_sparsifier/__pycache__/activation_sparsifier.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/activation_sparsifier/activation_sparsifier.py b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/activation_sparsifier/activation_sparsifier.py
new file mode 100644
index 0000000000000000000000000000000000000000..35eb38ab11dd912166722d241d9f026f6a2f7060
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/activation_sparsifier/activation_sparsifier.py
@@ -0,0 +1,418 @@
+from typing import Any, Dict, List, Optional
+import torch
+from collections import defaultdict
+from torch import nn
+import copy
+from ...sparsifier.utils import fqn_to_module, module_to_fqn
+import warnings
+
+__all__ = ['ActivationSparsifier']
+
+
+class ActivationSparsifier:
+    r"""
+    The Activation sparsifier class aims to sparsify/prune activations in a neural
+    network. The idea is to attach the sparsifier to a layer (or layers) so that it
+    zeroes out the activations based on the mask_fn (or sparsification function)
+    provided by the user.
+    The mask_fn is applied once all the inputs are aggregated and reduced i.e.
+    mask = mask_fn(reduce_fn(aggregate_fn(activations)))
+
+    Note::
+        The sparsification mask is computed on the input **before it goes through the attached layer**.
+
+    Args:
+        model (nn.Module):
+            The model whose layers will be sparsified. The layers that need to be
+            sparsified should be added separately using the register_layer() function.
+        aggregate_fn (Optional, Callable):
+            default aggregate_fn that is used if not specified while registering the layer.
+            Specifies how inputs should be aggregated over time.
+            The aggregate_fn should usually take 2 torch tensors and return the aggregated tensor.
+            Example
+                def add_agg_fn(tensor1, tensor2):  return tensor1 + tensor2
+        reduce_fn (Optional, Callable):
+            default reduce_fn that is used if not specified while registering the layer.
+            reduce_fn will be called on the aggregated tensor i.e. the tensor obtained after
+            calling agg_fn() on all inputs.
+            Example
+                def mean_reduce_fn(agg_tensor):    return agg_tensor.mean(dim=0)
+        mask_fn (Optional, Callable):
+            default mask_fn that is used to create the sparsification mask using the tensor obtained after
+            calling the reduce_fn(). This is used by default if a custom one is not passed in the
+            register_layer().
+            Note that the mask_fn() definition should accept the sparse arguments that are passed in the
+            sparse_config argument.
+        features (Optional, list):
+            default selected features to sparsify.
+            If this is non-empty, then the mask_fn will be applied for each feature of the input.
+            For example,
+                mask = [mask_fn(reduce_fn(aggregate_fn(input[feature]))) for feature in features]
+        feature_dim (Optional, int):
+            default dimension of input features. Again, features along this dim will be chosen
+            for sparsification.
+        sparse_config (Dict):
+            Default configuration for the mask_fn. This config will be passed
+            with the mask_fn().
+
+    Example:
+        >>> # xdoctest: +SKIP
+        >>> model = SomeModel()
+        >>> act_sparsifier = ActivationSparsifier(...)  # init activation sparsifier
+        >>> # Initialize aggregate_fn
+        >>> def agg_fn(x, y):
+        >>>     return x + y
+        >>>
+        >>> # Initialize reduce_fn
+        >>> def reduce_fn(x):
+        >>>     return torch.mean(x, dim=0)
+        >>>
+        >>> # Initialize mask_fn
+        >>> def mask_fn(data):
+        >>>     return torch.eye(data.shape).to(data.device)
+        >>>
+        >>>
+        >>> act_sparsifier.register_layer(model.some_layer, aggregate_fn=agg_fn, reduce_fn=reduce_fn, mask_fn=mask_fn)
+        >>>
+        >>> # start training process
+        >>> for _ in [...]:
+        >>>     # epoch starts
+        >>>         # model.forward(), compute_loss() and model.backwards()
+        >>>     # epoch ends
+        >>>     act_sparsifier.step()
+        >>> # end training process
+        >>> sparsifier.squash_mask()
+    """
+    def __init__(self, model: nn.Module, aggregate_fn=None, reduce_fn=None, mask_fn=None,
+                 features=None, feature_dim=None, **sparse_config):
+        self.model = model
+        self.defaults: Dict[str, Any] = defaultdict()
+        self.defaults['sparse_config'] = sparse_config
+
+        # functions
+        self.defaults['aggregate_fn'] = aggregate_fn
+        self.defaults['reduce_fn'] = reduce_fn
+        self.defaults['mask_fn'] = mask_fn
+
+        # default feature and feature_dim
+        self.defaults['features'] = features
+        self.defaults['feature_dim'] = feature_dim
+
+        self.data_groups: Dict[str, Dict] = defaultdict(dict)  # contains all relevant info w.r.t each registered layer
+
+        self.state: Dict[str, Any] = defaultdict(dict)  # layer name -> mask
+
+    @staticmethod
+    def _safe_rail_checks(args):
+        """Makes sure that some of the functions and attributes are not passed incorrectly
+        """
+
+        # if features are not None, then feature_dim must not be None
+        features, feature_dim = args['features'], args['feature_dim']
+        if features is not None:
+            assert feature_dim is not None, "need feature dim to select features"
+
+        # all the *_fns should be callable
+        fn_keys = ['aggregate_fn', 'reduce_fn', 'mask_fn']
+        for key in fn_keys:
+            fn = args[key]
+            assert callable(fn), 'function should be callable'
+
+    def _aggregate_hook(self, name):
+        """Returns hook that computes aggregate of activations passing through.
+        """
+
+        # gather some data
+        feature_dim = self.data_groups[name]['feature_dim']
+        features = self.data_groups[name]['features']
+        agg_fn = self.data_groups[name]['aggregate_fn']
+
+        def hook(module, input) -> None:
+            input_data = input[0]
+
+            data = self.data_groups[name].get('data')  # aggregated data
+            if features is None:
+                # no features associated, data should not be a list
+                if data is None:
+                    data = torch.zeros_like(input_data)
+                    self.state[name]['mask'] = torch.ones_like(input_data)
+                out_data = agg_fn(data, input_data)
+            else:
+                # data should be a list [aggregated over each feature only]
+                if data is None:
+                    out_data = [0 for _ in range(0, len(features))]  # create one in case of the 1st forward
+                    self.state[name]['mask'] = [0 for _ in range(0, len(features))]
+                else:
+                    out_data = data  # a list
+
+                # compute aggregate over each feature
+                for feature_idx in range(len(features)):
+                    # each feature is either a list or scalar, convert it to torch tensor
+                    feature_tensor = torch.Tensor([features[feature_idx]]).long().to(input_data.device)
+                    data_feature = torch.index_select(input_data, feature_dim, feature_tensor)
+                    if data is None:
+                        curr_data = torch.zeros_like(data_feature)
+                        self.state[name]['mask'][feature_idx] = torch.ones_like(data_feature)
+                    else:
+                        curr_data = data[feature_idx]
+                    out_data[feature_idx] = agg_fn(curr_data, data_feature)
+            self.data_groups[name]['data'] = out_data
+        return hook
+
+    def register_layer(self, layer: nn.Module, aggregate_fn=None, reduce_fn=None,
+                       mask_fn=None, features=None, feature_dim=None, **sparse_config):
+        r"""
+        Registers a layer for sparsification. The layer should be part of self.model.
+        Specifically, registers a pre-forward hook to the layer. The hook will apply the aggregate_fn
+        and store the aggregated activations that are input at each step.
+
+        Note::
+            - There is no need to pass in the name of the layer as it is automatically computed as per
+              the fqn convention.
+
+            - All the functions (fn) passed as argument will be called at a dim, feature level.
+        """
+        name = module_to_fqn(self.model, layer)
+        assert name is not None, "layer not found in the model"  # satisfy mypy
+
+        if name in self.data_groups:  # unregister layer if already present
+            warnings.warn("layer already attached to the sparsifier, deregistering the layer and registering with new config")
+            self.unregister_layer(name=name)
+
+        local_args = copy.deepcopy(self.defaults)
+        update_dict = {
+            'aggregate_fn': aggregate_fn,
+            'reduce_fn': reduce_fn,
+            'mask_fn': mask_fn,
+            'features': features,
+            'feature_dim': feature_dim,
+            'layer': layer
+        }
+        local_args.update((arg, val) for arg, val in update_dict.items() if val is not None)
+        local_args['sparse_config'].update(sparse_config)
+
+        self._safe_rail_checks(local_args)
+
+        self.data_groups[name] = local_args
+        agg_hook = layer.register_forward_pre_hook(self._aggregate_hook(name=name))
+
+        self.state[name]['mask'] = None  # mask will be created when model forward is called.
+
+        # attach agg hook
+        self.data_groups[name]['hook'] = agg_hook
+
+        # for serialization purposes, we know whether aggregate_hook is attached
+        # or sparsify_hook()
+        self.data_groups[name]['hook_state'] = "aggregate"  # aggregate hook is attached
+
+    def get_mask(self, name: Optional[str] = None, layer: Optional[nn.Module] = None):
+        """
+        Returns the mask associated with the layer.
+
+        The mask is
+            - a torch tensor if features for that layer is None.
+            - a list of torch tensors, one for each feature, otherwise
+
+        Note::
+            The shape of the mask is unknown until model.forward() is applied.
+            Hence, if get_mask() is called before model.forward(), an
+            error will be raised.
+        """
+        assert name is not None or layer is not None, "Need at least name or layer obj to retrieve mask"
+
+        if name is None:
+            assert layer is not None
+            name = module_to_fqn(self.model, layer)
+            assert name is not None, "layer not found in the specified model"
+
+        if name not in self.state:
+            raise ValueError("Error: layer with the given name not found")
+
+        mask = self.state[name].get('mask', None)
+
+        if mask is None:
+            raise ValueError("Error: shape unknown, call layer() routine at least once to infer mask")
+        return mask
+
+    def unregister_layer(self, name):
+        """Detaches the sparsifier from the layer
+        """
+
+        # detach any hooks attached
+        self.data_groups[name]['hook'].remove()
+
+        # pop from the state dict
+        self.state.pop(name)
+
+        # pop from the data groups
+        self.data_groups.pop(name)
+
+    def step(self):
+        """Internally calls the update_mask() function for each layer
+        """
+        with torch.no_grad():
+            for name, configs in self.data_groups.items():
+                data = configs['data']
+                self.update_mask(name, data, configs)
+
+                self.data_groups[name].pop('data')  # reset the accumulated data
+
+    def update_mask(self, name, data, configs):
+        """
+        Called for each registered layer and does the following:
+            1. apply reduce_fn on the aggregated activations
+            2. use mask_fn to compute the sparsification mask
+
+        Note:
+            the reduce_fn and mask_fn are called for each feature/dim over the data
+        """
+        mask = self.get_mask(name)
+        sparse_config = configs['sparse_config']
+        features = configs['features']
+        reduce_fn = configs['reduce_fn']
+        mask_fn = configs['mask_fn']
+        if features is None:
+            data = reduce_fn(data)
+            mask.data = mask_fn(data, **sparse_config)
+        else:
+            for feature_idx in range(len(features)):
+                data_feature = reduce_fn(data[feature_idx])
+                mask[feature_idx].data = mask_fn(data_feature, **sparse_config)
+
+    def _sparsify_hook(self, name):
+        """Returns hook that applies sparsification mask to input entering the attached layer
+        """
+        mask = self.get_mask(name)
+        features = self.data_groups[name]['features']
+        feature_dim = self.data_groups[name]['feature_dim']
+
+        def hook(module, input):
+            input_data = input[0]
+            if features is None:
+                # apply to all the features
+                return input_data * mask
+            else:
+                # apply per feature, feature_dim
+                for feature_idx in range(0, len(features)):
+                    feature = torch.Tensor([features[feature_idx]]).long().to(input_data.device)
+                    sparsified = torch.index_select(input_data, feature_dim, feature) * mask[feature_idx]
+                    input_data.index_copy_(feature_dim, feature, sparsified)
+                return input_data
+        return hook
+
+    def squash_mask(self, attach_sparsify_hook=True, **kwargs):
+        """
+        Unregisters aggregate hook that was applied earlier and registers sparsification hooks if
+        attach_sparsify_hook = True.
+        """
+        for name, configs in self.data_groups.items():
+            # unhook agg hook
+            configs['hook'].remove()
+            configs.pop('hook')
+            self.data_groups[name]['hook_state'] = "None"
+            if attach_sparsify_hook:
+                configs['hook'] = configs['layer'].register_forward_pre_hook(self._sparsify_hook(name))
+            configs['hook_state'] = "sparsify"  # signals that sparsify hook is now attached
+
+    def _get_serializable_data_groups(self):
+        """Exclude hook and layer from the config keys before serializing
+
+        TODO: Might have to treat functions (reduce_fn, mask_fn etc) in a different manner while serializing.
+              For time-being, functions are treated the same way as other attributes
+        """
+        data_groups: Dict[str, Any] = defaultdict()
+        for name, config in self.data_groups.items():
+            new_config = {key: value for key, value in config.items() if key not in ['hook', 'layer']}
+            data_groups[name] = new_config
+        return data_groups
+
+    def _convert_mask(self, states_dict, sparse_coo=True):
+        r"""Converts the mask to sparse coo or dense depending on the `sparse_coo` argument.
+        If `sparse_coo=True`, the mask is stored as a sparse COO tensor, otherwise as a dense tensor.
+        """
+        states = copy.deepcopy(states_dict)
+        for state in states.values():
+            if state['mask'] is not None:
+                if isinstance(state['mask'], List):
+                    for idx in range(len(state['mask'])):
+                        if sparse_coo:
+                            state['mask'][idx] = state['mask'][idx].to_sparse_coo()
+                        else:
+                            state['mask'][idx] = state['mask'][idx].to_dense()
+                else:
+                    if sparse_coo:
+                        state['mask'] = state['mask'].to_sparse_coo()
+                    else:
+                        state['mask'] = state['mask'].to_dense()
+        return states
+
+    def state_dict(self) -> Dict[str, Any]:
+        r"""Returns the state of the sparsifier as a :class:`dict`.
+
+        It contains:
+        * state - contains name -> mask mapping.
+        * data_groups - a dictionary containing all config information for each
+            layer
+        * defaults - the default config passed at construction time
+        """
+        data_groups = self._get_serializable_data_groups()
+        state = self._convert_mask(self.state)
+        return {
+            'state': state,
+            'data_groups': data_groups,
+            'defaults': self.defaults
+        }
+
+    def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
+        r"""The load_state_dict() restores the state of the sparsifier based on the state_dict
+
+        Args:
+        * state_dict - the dictionary from which the state of the sparsifier is restored
+        """
+        state = state_dict['state']
+        data_groups, defaults = state_dict['data_groups'], state_dict['defaults']
+
+        self.__set_state__({'state': state, 'data_groups': data_groups, 'defaults': defaults})
+
+    def __get_state__(self) -> Dict[str, Any]:
+
+        data_groups = self._get_serializable_data_groups()
+        state = self._convert_mask(self.state)
+        return {
+            'defaults': self.defaults,
+            'state': state,
+            'data_groups': data_groups,
+        }
+
+    def __set_state__(self, state: Dict[str, Any]) -> None:
+        state['state'] = self._convert_mask(state['state'], sparse_coo=False)  # convert mask to dense tensor
+        self.__dict__.update(state)
+
+        # need to attach layer and hook info into the data_groups
+        for name, config in self.data_groups.items():
+            # fetch layer
+            layer = fqn_to_module(self.model, name)
+            assert layer is not None  # satisfy mypy
+
+            # if agg_mode is True, then layer in aggregate mode
+            if "hook_state" in config and config['hook_state'] == "aggregate":
+                hook = layer.register_forward_pre_hook(self._aggregate_hook(name))
+
+            elif "hook_state" in config and config["hook_state"] == "sparsify":
+                hook = layer.register_forward_pre_hook(self._sparsify_hook(name))
+
+            config['layer'] = layer
+            config['hook'] = hook  # type: ignore[possibly-undefined]
+
+    def __repr__(self):
+        format_string = self.__class__.__name__ + ' ('
+        for name, config in self.data_groups.items():
+            format_string += '\n'
+            format_string += '\tData Group\n'
+            format_string += f'\t    name: {name}\n'
+            for key in sorted(config.keys()):
+                if key in ['data', 'hook', 'reduce_fn', 'mask_fn', 'aggregate_fn']:
+                    continue
+                format_string += f'\t    {key}: {config[key]}\n'
+        format_string += ')'
+        return format_string
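Editor's note: the snippet below is not part of the vendored diff. It is a minimal, illustrative sketch of how the `ActivationSparsifier` above is meant to be driven, assuming a toy `nn.Sequential` model and a hypothetical threshold-based `mask_fn` (the `threshold` keyword travels through `sparse_config`).

```python
import torch
from torch import nn
from torch.ao.pruning._experimental.activation_sparsifier.activation_sparsifier import ActivationSparsifier

# Toy model and hook functions -- all illustrative, not prescribed by the class.
model = nn.Sequential(nn.Linear(16, 16), nn.ReLU(), nn.Linear(16, 4))

def agg_fn(x, y):                  # how activations accumulate across forward passes
    return x + y

def reduce_fn(x):                  # reduce the aggregate before the mask is computed
    return x / 10                  # e.g. average over the 10 forward passes below

def mask_fn(data, threshold=0.1):  # `threshold` arrives via sparse_config
    return (data.abs() > threshold).float()

sparsifier = ActivationSparsifier(model, aggregate_fn=agg_fn, reduce_fn=reduce_fn,
                                  mask_fn=mask_fn, threshold=0.1)
sparsifier.register_layer(model[0])        # attaches the aggregate pre-forward hook

for _ in range(10):                        # forward passes populate the aggregate
    model(torch.randn(8, 16))
sparsifier.step()                          # reduce_fn + mask_fn -> mask per layer
sparsifier.squash_mask()                   # swap the aggregate hook for the sparsify hook
print(sparsifier.get_mask(layer=model[0]).shape)
```

Because the mask takes the shape of the aggregated input, batches fed through the sparsify hook afterwards need to match that shape.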
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_scheduler/__init__.py b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_scheduler/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4880dcec5ff6ee8c258fb2bb09ea29a6011fc41b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_scheduler/__init__.py
@@ -0,0 +1,5 @@
+from .base_data_scheduler import BaseDataScheduler
+
+__all__ = [
+    "BaseDataScheduler",
+]
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_scheduler/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_scheduler/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0ac53cf15a104508937cb84c37be159ca809c23f
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_scheduler/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_scheduler/__pycache__/base_data_scheduler.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_scheduler/__pycache__/base_data_scheduler.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..729c8d3db625c29b2179522d251e9efb51eec0eb
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_scheduler/__pycache__/base_data_scheduler.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_scheduler/base_data_scheduler.py b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_scheduler/base_data_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..55c3b65273df4ab7fbd7be7f362881aa9bdbf850
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_scheduler/base_data_scheduler.py
@@ -0,0 +1,180 @@
+from functools import wraps
+import weakref
+import abc
+import warnings
+
+from ..data_sparsifier import BaseDataSparsifier
+
+__all__ = ['BaseDataScheduler']
+
+
+class BaseDataScheduler:
+    r"""
+    The BaseDataScheduler is the abstract scheduler class specifically for the
+    BaseDataSparsifier class. This class controls a specific hyperparameter of
+    the sparsifier class and varies it across the training process (or across time).
+
+    Args:
+        data_sparsifier (instance of BaseDataSparsifier)
+            The implemented data sparsifier class wherein the update_mask is implemented.
+        schedule_param (str)
+            A specific hyperparameter of the passed sparsifier that needs to be scheduled/varied.
+        last_epoch (int, default=-1)
+            This is passed specifically when training needs to be resumed from a particular
+            point.
+        verbose (bool, default=False)
+            Verbosity of the BaseDataScheduler.
+
+    The *get_schedule_param()* function needs to be implemented by the user.
+    """
+    def __init__(self, data_sparsifier, schedule_param: str, last_epoch=-1, verbose=False):
+        # Attach sparsifier
+        if not isinstance(data_sparsifier, BaseDataSparsifier):
+            raise TypeError('{} is not an instance of torch.ao.pruning.BaseDataSparsifier'.format(
+                type(data_sparsifier).__name__))
+        self.data_sparsifier = data_sparsifier
+        self.schedule_param = schedule_param
+
+        # Initialize epoch and base hyper-params
+        self.base_param = {
+            name: config.get(schedule_param, None)
+            for name, config in self.data_sparsifier.data_groups.items()
+        }
+
+        self.last_epoch = last_epoch
+
+        # Following https://github.com/pytorch/pytorch/issues/20124
+        # We would like to ensure that `scheduler.step()` is called after
+        # `sparsifier.step()`
+        def with_counter(method):
+            if getattr(method, '_with_counter', False):
+                # `sparsifier.step()` has already been replaced, return.
+                return method
+
+            # Keep a weak reference to the sparsifier instance to prevent
+            # cyclic references.
+            instance_ref = weakref.ref(method.__self__)
+            # Get the unbound method for the same purpose.
+            func = method.__func__
+            cls = instance_ref().__class__
+            del method
+
+            @wraps(func)
+            def wrapper(*args, **kwargs):
+                instance = instance_ref()
+                instance._step_count += 1  # type: ignore[union-attr]
+                wrapped = func.__get__(instance, cls)
+                return wrapped(*args, **kwargs)
+
+            # Note that the returned function here is no longer a bound method,
+            # so attributes like `__func__` and `__self__` no longer exist.
+            wrapper._with_counter = True  # type: ignore[attr-defined]
+            return wrapper
+
+        self.data_sparsifier.step = with_counter(self.data_sparsifier.step)  # type: ignore[assignment]
+        self.data_sparsifier._step_count = 0  # type: ignore[attr-defined]
+        self._step_count: int = 0
+        self.verbose = verbose
+
+        # Housekeeping
+        self._get_sp_called_within_step: bool = False  # sp -> schedule parameter
+        self.step()
+
+    @abc.abstractmethod
+    def get_schedule_param(self):
+        r"""
+        Abstract method that needs to be implemented by the child class.
+        The expected return type is a dictionary of name to schedule_param value.
+        The returned values will be updated in sparsifier when the scheduler step() function
+        is called.
+
+        Example:
+            >>> def get_schedule_param(self):
+            ...     new_param = {}
+            ...     for name in self.sparsifier.data_groups.keys():
+            ...         new_param[name] = self.sparsifier.data_groups[name][self.schedule_param] * 0.5
+            ...     return new_param
+
+        When the step() function is called, the value in self.sparsifier.data_groups[name][self.schedule_param]
+        would be halved
+        """
+        raise NotImplementedError
+
+    def __repr__(self):
+        format_string = self.__class__.__name__ + ' ('
+        format_string += '\n'
+        format_string += f'Data Sparsifier {self.data_sparsifier}\n'
+        format_string += f'    {self.schedule_param}: {self.base_param}\n'
+        format_string += ')'
+        return format_string
+
+    def state_dict(self):
+        """Returns the state of the scheduler as a :class:`dict`.
+
+        It contains an entry for every variable in self.__dict__ which
+        is not the sparsifier.
+
+        Note:
+            The scheduler class does not track the state of the data_sparsifier.
+            Make sure to store the state of the sparsifier before storing the
+            state of the scheduler
+        """
+        return {key: value for key, value in self.__dict__.items() if key != 'data_sparsifier'}
+
+    def load_state_dict(self, state_dict):
+        """Loads the schedulers state.
+
+        Note:
+            Remember to restore the state of the data_sparsifier before the scheduler.
+
+        Args:
+            state_dict (dict): scheduler state. Should be an object returned
+                from a call to :meth:`state_dict`.
+        """
+        self.__dict__.update(state_dict)
+
+    def get_last_param(self):
+        return self._last_param
+
+    def step(self):
+        # Raise warning if trying to call scheduler step before the sparsifier.
+        # https://github.com/pytorch/pytorch/issues/20124
+        if self._step_count == 1:
+            if not hasattr(self.data_sparsifier.step, "_with_counter"):
+                warnings.warn("Seems like `data_sparsifier.step()` has been overridden after sparsity scheduler "
+                              "initialization. Please, make sure to call `data_sparsifier.step()` before "
+                              "`scheduler.step()`.", UserWarning)
+
+            # Just check if there were two first scheduler.step() calls before sparsifier.step()
+            elif self.data_sparsifier._step_count < 1:  # type: ignore[attr-defined]
+                warnings.warn("Detected call of `scheduler.step()` before `data_sparsifier.step()`. "
+                              "You have to make sure you run the data_sparsifier.step() BEFORE any "
+                              "calls to the scheduler.step().", UserWarning)
+        self._step_count += 1
+
+        class _enable_get_sp_call:
+
+            def __init__(self, o):
+                self.o = o
+
+            def __enter__(self):
+                self.o._get_sp_called_within_step = True
+                return self
+
+            def __exit__(self, type, value, traceback):
+                self.o._get_sp_called_within_step = False
+
+        with _enable_get_sp_call(self):
+            self.last_epoch += 1
+            updated_scheduler_params = self.get_schedule_param()
+
+        for name, param in updated_scheduler_params.items():
+            self.data_sparsifier.data_groups[name][self.schedule_param] = param
+            if self.verbose:
+                print(f"Adjusting {self.schedule_param} for group {name} to {param}")
+
+        self._last_param = {
+            name: config.get(self.schedule_param, None)
+            for name, config in self.data_sparsifier.data_groups.items()
+        }
+        self.data_sparsifier.enable_mask_update = True
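Editor's note: the sketch below is not part of the vendored diff. It shows one way a concrete scheduler might subclass `BaseDataScheduler`, mirroring the halving example in the `get_schedule_param` docstring; the class name `StepSL`, the `gamma` factor, and the assumption that every data group carries a `'sparsity_level'` entry are all illustrative.

```python
from torch.ao.pruning._experimental.data_scheduler.base_data_scheduler import BaseDataScheduler

class StepSL(BaseDataScheduler):
    """Multiplies the scheduled hyperparameter by `gamma` on every scheduler step."""
    def __init__(self, data_sparsifier, schedule_param='sparsity_level', gamma=0.5, **kwargs):
        self.gamma = gamma            # set before super().__init__, which calls step() once
        super().__init__(data_sparsifier, schedule_param, **kwargs)

    def get_schedule_param(self):
        return {
            name: config[self.schedule_param] * self.gamma
            for name, config in self.data_sparsifier.data_groups.items()
        }

# Typical usage inside a training loop (sketch):
#     data_sparsifier.step()   # update the masks first
#     scheduler.step()         # then adjust the scheduled hyperparameter
```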
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/__init__.py b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..27dd919555b8ccec0431198c9d43a62b0af2eb82
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/__init__.py
@@ -0,0 +1,7 @@
+from .base_data_sparsifier import BaseDataSparsifier
+from .data_norm_sparsifier import DataNormSparsifier
+
+__all__ = [
+    "BaseDataSparsifier",
+    "DataNormSparsifier",
+]
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..290d0567b66bbdd6bf30906c288b15bff707ef55
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/__pycache__/base_data_sparsifier.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/__pycache__/base_data_sparsifier.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3b1961ab5f3814d5c195848ea17cd114889ad15e
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/__pycache__/base_data_sparsifier.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/__pycache__/data_norm_sparsifier.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/__pycache__/data_norm_sparsifier.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..287e0414c1493981d84d18fd436207222fe6e5e0
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/__pycache__/data_norm_sparsifier.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/__pycache__/quantization_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/__pycache__/quantization_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..383e415d642d79b116079df0b6745ff9dfdf3510
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/__pycache__/quantization_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/base_data_sparsifier.py b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/base_data_sparsifier.py
new file mode 100644
index 0000000000000000000000000000000000000000..74bdf3abf8089742e7db8c565af5f31f53e98e3c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/base_data_sparsifier.py
@@ -0,0 +1,309 @@
+import abc
+import torch
+from typing import Optional, Tuple, List, Any, Dict
+from ...sparsifier import base_sparsifier
+from collections import defaultdict
+from torch import nn
+import copy
+from ...sparsifier import utils
+from torch.nn.utils import parametrize
+import sys
+import warnings
+
+if not sys.warnoptions:
+    # to suppress repeated warnings when being used in a training loop.
+    warnings.simplefilter("once")
+
+__all__ = ['BaseDataSparsifier']
+
+EMBEDDING_TYPES = {
+    nn.Embedding,
+    nn.EmbeddingBag,
+}
+
+SUPPORTED_TYPES = {
+    torch.Tensor,
+    nn.Parameter,
+    *EMBEDDING_TYPES,
+}
+
+
+class _Container(nn.Module):
+    pass
+
+
+class BaseDataSparsifier(base_sparsifier.BaseSparsifier):
+    r"""
+    Base Data Sparsifier class for all Data sparsifiers.
+    The abstract class accepts raw torch tensors / embedding / embedding bags (refer to SUPPORTED_TYPES above)
+    to prepare for sparsification.
+    In this case, the mask (and parametrizations) are owned by the class and not by the user.
+    Specifically, the container object inside the class maintains the masks and parametrizations of the input data.
+
+    Args:
+        data_list (list of tuples)
+            list of (name, data) tuples to sparsify. Look up SUPPORTED_TYPES above
+            for the supported data types. Internally, a container module handles the data sparsification.
+
+        defaults (dict)
+            default configurations will be attached to the
+            configuration. Only the keys that don't exist in the `config` will
+            be updated.
+    Example::
+        >>> # xdoctest: +SKIP
+        >>> data_list = [('tensor_1', torch.randn(3,3)), ('tensor_2', torch.randn(4,4))]
+        >>> defaults = {'sparsity_level': 0.7}
+        >>> sparsifier = DerivedDataSparsifier(data_list = data_list, **defaults) # Some sparsifier that inherits BaseDataSparsifier
+        >>> new_tensor_to_add = {'name': 'tensor_3', 'data': torch.randn(5,5), 'sparsity_level': 0.3}
+        >>> sparsifier.add_data(**new_tensor_to_add)
+        >>> # tensor_1 and tensor_2 will have sparsity_level of 0.7 but tensor_3 will have sparsity_level=0.3
+    """
+    def __init__(self, data_list: Optional[List[Tuple[str, Any]]] = None, **defaults):
+        super().__init__(defaults=defaults)
+
+        self._container = _Container()
+
+        self.data_groups: Dict[str, Dict] = defaultdict(dict)  # name -> {**config}
+        if data_list is not None:
+            # add data with default config here
+            [self.add_data(name, data, **self.defaults) for name, data in data_list]
+
+    def prepare(self):
+        raise NotImplementedError("this function is undefined for this class")
+
+    def _extract_weight(self, data):
+        # extract the weight parameter instead of underlying data
+        if type(data) in [torch.Tensor, nn.Parameter]:
+            return data
+        elif type(data) in EMBEDDING_TYPES:
+            return data.weight
+
+    def add_data(self, name: str, data, reuse_mask=True, **config):
+        r""" Configures and parametrizes the internal container model with name and data.
+
+        **Note**:
+            1. If the data with name already exists, it replaces the data.
+            2. While replacing, the old mask is reused when `reuse_mask=True`
+            3. If `reuse_mask=True`, then the replacing data needs to have the same shape as that of old data.
+            4. By default, the config of the replaced data is used as config for the replacing data, unless something
+               is specified in the config dictionary.
+        """
+        assert type(data) in SUPPORTED_TYPES, \
+            "specified data type not supported at the moment"
+        local_args = copy.deepcopy(self.defaults)
+        local_args.update(config)
+        weight = self._extract_weight(data)
+
+        # Bookkeeping in the container class
+        mask = local_args.get('mask', torch.ones_like(weight))
+        param_class = local_args.get('parametrization', utils.FakeSparsity)
+
+        if name in self.state:
+            # If the named data already exists - replace
+            warnings.warn("Replacing existing data of the same name. - Did you mean a different name?")
+
+            # reuse old config
+            old_args = self.data_groups[name]
+            local_args = copy.deepcopy(old_args)
+            local_args.update(config)
+
+            if reuse_mask:
+                current_data = self.get_data(name=name)
+                assert weight.shape == current_data.shape, \
+                    "to retain the old mask, the shape of the new data must be the same as the previous one"
+                mask = self.get_mask(name=name)  # reuse mask instead of creating a new one
+
+            self._delete_data(name=name)
+
+        # parameter creates a deepcopy of the weight inside, so create a buffer
+        self._container.register_buffer(name=name, tensor=weight)
+        parametrize.register_parametrization(self._container, name, param_class(mask))
+        self.state[name]['mask'] = mask
+        self.data_groups[name] = local_args
+        return getattr(self._container, name)
+
+    def get_data(self, name: str, return_original: bool = True):
+        r"""Returns weight tensor (or data)
+        Args:
+            - name: name of the data to be returned
+            - return_original: if True, returns the weight tensor without applying the parametrization;
+                otherwise returns the sparsified (parametrized) version
+        """
+        if name not in self.data_groups:
+            raise ValueError("data with specified name does not exist")
+
+        if return_original:
+            if not parametrize.is_parametrized(self._container, name):
+                raise ValueError("mask squashed - original mask value does not exist")
+            data = getattr(self._container.parametrizations, name).original
+            return data
+        else:
+            return getattr(self._container, name)
+
+    def _convert_mask(self, states, sparse_coo=True):
+        r"""Converts the mask to sparse coo or dense tensors depending on the `sparse_coo` argument.
+        """
+        states = copy.deepcopy(states)
+        for state in states.values():
+            if sparse_coo:
+                state['mask'] = state['mask'].to_sparse_coo()
+            else:
+                state['mask'] = state['mask'].to_dense()
+
+        return states
+
+    def state_dict(self):
+        r"""Returns the state of the optimizer as a :class:`dict`.
+
+        It contains:
+        * state - contains name -> mask mapping.
+        * data_groups - a dictionary containing all sparsity configuration groups,
+            keyed by the name of the data
+        * container_state_dict - the state dictionary of the internal
+            container model used for sparsification
+        """
+        state = self._convert_mask(self.state)
+        return {
+            'state': state,
+            'data_groups': self.data_groups,
+            '_container': self._container.state_dict()
+        }
+
+    def _load_container_from_state(self, states, data_groups, container_state_dict):
+        r"""This restores the state of the container specifically based on the data present in state and data_groups
+        If the data was parametrized, then the data would be added to the container and then parametrized,
+        else it would just add the attribute the container.
+        """
+        for name, state in states.items():
+            config_name = data_groups.get(name, None)
+            if config_name is None:
+                raise RuntimeError(f"Error loading {name}")
+
+            # check if the data with such a name was parametrized, if so parametrize
+            # otherwise just set the attribute and continue
+            parametrized_name = f'parametrizations.{name}.original'
+            parametrized = False
+            data = container_state_dict.get(name, None)
+            if name in container_state_dict:
+                # the parametrization was probably removed for this
+                data = container_state_dict.get(name)
+
+            elif parametrized_name in container_state_dict:
+                # so the weight was parametrized
+                data = container_state_dict.get(parametrized_name)
+                parametrized = True
+
+            else:
+                raise RuntimeError(f"Error loading {name}")
+
+            self._container.register_buffer(name=name, tensor=data)
+
+            if parametrized:
+                # register parameter if parametrized
+                mask = state.get('mask', torch.ones_like(data))
+                param_class = data_groups.get('parametrization', utils.FakeSparsity)  # change once public_api for utils is fixed!
+                parametrize.register_parametrization(self._container, name, param_class(mask))
+
+    def load_state_dict(self, state_dict, strict=True):
+        r"""The load_state_dict() restores the state of the sparsifier based on the state_dict
+
+        Args:
+        * state_dict - the dictionary from which the state of the current sparsifier is restored
+        * strict - If True - the sparsifier is reset and is restored exactly to the state in state_dict.
+            If False - the current sparsifier is not reset before loading the state_dict i.e. data added
+            before loading the state_dict is not erased.
+        """
+        states = copy.deepcopy(state_dict['state'])
+        data_groups = copy.deepcopy(state_dict['data_groups'])
+        container_state_dict = copy.deepcopy(state_dict['_container'])
+
+        states = self._convert_mask(states, sparse_coo=False)  # convert sparse coo mask to dense
+        if strict:
+            # if strict load -> then reset container
+            self._container = _Container()
+
+        self._load_container_from_state(states, data_groups, container_state_dict)
+
+        if not strict:
+            states.update(self.state)
+            data_groups.update(self.data_groups)
+
+        self.__setstate__({'state': states, 'data_groups': data_groups})
+
+    def __setstate__(self, state):
+        if '_container' in state:  # If container object is in state then load model
+            container_dict = state.pop('_container')
+            self._container = _Container()
+            state['state'] = self._convert_mask(state['state'], sparse_coo=False)  # convert sparse coo mask to dense
+            self._load_container_from_state(state['state'], state['data_groups'], container_dict)
+
+        self.__dict__.update(state)
+
+    def __getstate__(self):
+        state = self._convert_mask(self.state)
+        return {
+            'defaults': self.defaults,
+            'state': state,
+            'data_groups': self.data_groups,
+            '_container': self._container.state_dict()
+        }
+
+    def __repr__(self):
+        format_string = self.__class__.__name__ + ' ('
+        for name, sparse_args in self.data_groups.items():
+            format_string += '\n'
+            format_string += '\tData Group\n'
+            format_string += f'\t    name: {name}\n'
+            for key in sorted(sparse_args.keys()):
+                if key == 'data':
+                    continue
+                format_string += f'\t    {key}: {sparse_args[key]}\n'
+        format_string += ')'
+        return format_string
+
+    def get_mask(self, name: str):
+        if name not in self.state:
+            raise ValueError("data with specified name does not exist")
+        return self.state[name]['mask']
+
+    def squash_mask(self, *args, leave_parametrized=True, names=None, **kwargs):
+        r"""Squashes the sparse masks into the appropriate tensors. Also, accepts list of strings
+        to squash mask for. If none, squashes mask for all the keys
+        kwargs:
+            * names: list of strings to squash mask for
+            * sparsified: if true - applies the mask before squashing
+                          if false - does not apply the mask before squashing
+        """
+        if names is None:
+            names = list(self.data_groups.keys())
+        for name in names:
+            parametrize.remove_parametrizations(self._container, name, leave_parametrized=leave_parametrized)
+
+    def step(self):
+        if not self.enable_mask_update:
+            return
+        with torch.no_grad():
+            for name, config in self.data_groups.items():
+                # get non-sparsified data
+                data = self.get_data(name)
+                # need name for the mask otherwise can directly pass mask?
+                self.update_mask(name, data, **config)
+
+    @abc.abstractmethod
+    def update_mask(self, name, data, **kwargs):
+        pass
+
+    def _delete_data(self, name):
+        """Detaches some data from the sparsifier.
+
+        Args:
+            name (str)
+                Name of the data to be removed from the sparsifier
+
+        Note:
+            Currently private. Used as a helper function when replacing data of the same name.
+        """
+        self.squash_mask(names=[name], leave_parametrized=False)  # do not apply the mask while deleting
+        delattr(self._container, name)
+        self.state.pop(name)
+        self.data_groups.pop(name)
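Editor's note: the following sketch is not part of the vendored diff. It fills in the `DerivedDataSparsifier` placeholder from the `BaseDataSparsifier` docstring with a hypothetical magnitude-based `update_mask`; the class name and threshold logic are illustrative, and it assumes the parent `BaseSparsifier` constructor sets `enable_mask_update`, as the `step()` method above implies.

```python
import torch
from torch.ao.pruning._experimental.data_sparsifier.base_data_sparsifier import BaseDataSparsifier

class MagnitudeDataSparsifier(BaseDataSparsifier):
    """Illustrative sparsifier: zero out the smallest-magnitude fraction of each tensor."""
    def update_mask(self, name, data, sparsity_level, **kwargs):
        mask = self.get_mask(name)
        if sparsity_level <= 0:
            mask.data = torch.ones_like(mask)
            return
        k = max(int(sparsity_level * data.numel()), 1)         # number of entries to zero out
        threshold = data.abs().flatten().kthvalue(k).values    # k-th smallest |value|
        mask.data = (data.abs() > threshold).to(mask.dtype)

sparsifier = MagnitudeDataSparsifier(
    data_list=[('tensor_1', torch.randn(4, 4))], sparsity_level=0.5)
sparsifier.step()                                              # computes the masks
sparse_view = sparsifier.get_data('tensor_1', return_original=False)
sparsifier.squash_mask()                                       # bake masks into the stored tensors
```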
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/data_norm_sparsifier.py b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/data_norm_sparsifier.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a335041963bf15d2c1d6c117239c6562b94180c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/data_norm_sparsifier.py
@@ -0,0 +1,153 @@
+import torch
+from torch.nn import functional as F
+from functools import reduce
+from typing import Any, List, Optional, Tuple
+
+from .base_data_sparsifier import BaseDataSparsifier
+import operator
+
+__all__ = ['DataNormSparsifier']
+
+
+class DataNormSparsifier(BaseDataSparsifier):
+    r"""L1-Norm Sparsifier
+    This sparsifier computes the *L1-norm* of every sparse block and "zeroes-out" the
+    ones with the lowest norm. The level of sparsity defines how many of the
+    blocks are removed.
+    This sparsifier is controlled by three variables:
+    1. `sparsity_level` defines the number of *sparse blocks* that are zeroed-out
+    2. `sparse_block_shape` defines the shape of the sparse blocks. Note that
+        the sparse blocks originate at the zero-index of the tensor.
+    3. `zeros_per_block` is the number of zeros that we are expecting in each
+        sparse block. By default we assume that all elements within a block are
+        zeroed-out. However, setting this variable sets the target number of
+        zeros per block. The zeros within each block are chosen as the *smallest
+        absolute values*.
+    Args:
+        sparsity_level: The target level of sparsity
+        sparse_block_shape: The shape of a sparse block
+        zeros_per_block: Number of zeros in a sparse block
+    Note::
+        All arguments to the DataNormSparsifier constructor are "default"
+        arguments and could be overridden by the configuration provided in the
+        `add_data` step.
+    """
+    def __init__(self, data_list: Optional[List[Tuple[str, Any]]] = None, sparsity_level: float = 0.5,
+                 sparse_block_shape: Tuple[int, int] = (1, 4),
+                 zeros_per_block: Optional[int] = None, norm: str = 'L1'):
+        if zeros_per_block is None:
+            zeros_per_block = reduce(operator.mul, sparse_block_shape)
+
+        assert norm in ['L1', 'L2'], "only L1 and L2 norm supported at the moment"
+
+        defaults = {'sparsity_level': sparsity_level, 'sparse_block_shape': sparse_block_shape,
+                    'zeros_per_block': zeros_per_block}
+        self.norm = norm
+        super().__init__(data_list=data_list, **defaults)
+
+    def __get_scatter_folded_mask(self, data, dim, indices, output_size, sparse_block_shape):
+        mask = torch.ones_like(data)
+        mask.scatter_(dim=dim, index=indices, value=0)  # zeroing out
+        mask = F.fold(mask, output_size=output_size, kernel_size=sparse_block_shape,
+                      stride=sparse_block_shape)
+        mask = mask.to(torch.int8)
+        return mask
+
+    def __get_block_level_mask(self, data,
+                               sparse_block_shape, zeros_per_block):
+
+        # Assume data is a squeezed tensor
+        height, width = data.shape[-2], data.shape[-1]
+        block_height, block_width = sparse_block_shape
+        values_per_block = block_height * block_width
+
+        # just return zeros if zeroing all elements in block
+        if values_per_block == zeros_per_block:
+            return torch.zeros_like(data, dtype=torch.int8)
+
+        # creating additional height and width to support padding
+        dh = (block_height - height % block_height) % block_height
+        dw = (block_width - width % block_width) % block_width
+
+        # create a new padded tensor like data (to match the block_shape)
+        padded_data = torch.ones(height + dh, width + dw, dtype=data.dtype, device=data.device)
+        padded_data = padded_data * torch.nan  # can also be replaced with 0 to stop the removal of edge data
+        padded_data[0:height, 0:width] = data
+        unfolded_data = F.unfold(padded_data[None, None, :], kernel_size=sparse_block_shape,
+                                 stride=sparse_block_shape)
+
+        _, sorted_idx = torch.sort(unfolded_data, dim=1)
+        sorted_idx = sorted_idx[:, :zeros_per_block, :]  # zero out zeros_per_block number of elements
+
+        mask = self.__get_scatter_folded_mask(data=unfolded_data, dim=1, indices=sorted_idx, output_size=padded_data.shape,
+                                              sparse_block_shape=sparse_block_shape)
+
+        mask = mask.squeeze(0).squeeze(0)[:height, :width].contiguous()  # remove padding and make contiguous
+        return mask
+
+    def __get_data_level_mask(self, data, sparsity_level,
+                              sparse_block_shape):
+
+        height, width = data.shape[-2], data.shape[-1]
+        block_height, block_width = sparse_block_shape
+        dh = (block_height - height % block_height) % block_height
+        dw = (block_width - width % block_width) % block_width
+
+        data_norm = F.avg_pool2d(data[None, None, :], kernel_size=sparse_block_shape,
+                                 stride=sparse_block_shape, ceil_mode=True)
+
+        values_per_block = reduce(operator.mul, sparse_block_shape)
+
+        data_norm = data_norm.flatten()
+        num_blocks = len(data_norm)
+
+        data_norm = data_norm.repeat(1, values_per_block, 1)  # get similar shape after unfold
+        _, sorted_idx = torch.sort(data_norm, dim=2)
+
+        threshold_idx = round(sparsity_level * num_blocks)  # number of blocks to remove
+        sorted_idx = sorted_idx[:, :, :threshold_idx]
+
+        mask = self.__get_scatter_folded_mask(data=data_norm, dim=2, indices=sorted_idx,
+                                              output_size=(height + dh, width + dw),
+                                              sparse_block_shape=sparse_block_shape)
+
+        mask = mask.squeeze(0).squeeze(0)[:height, :width]  # squeeze only the first 2 dimension
+        return mask
+
+    def update_mask(self, name, data, sparsity_level,
+                    sparse_block_shape, zeros_per_block, **kwargs):
+
+        values_per_block = reduce(operator.mul, sparse_block_shape)
+        if zeros_per_block > values_per_block:
+            raise ValueError("Number of zeros per block cannot be more than "
+                             "the total number of elements in that block.")
+        if zeros_per_block < 0:
+            raise ValueError("Number of zeros per block should be positive.")
+
+        if self.norm == 'L1':
+            data_norm = torch.abs(data).squeeze()  # absolute value based (L1)
+        else:
+            data_norm = (data * data).squeeze()  # square every element for L2
+
+        if len(data_norm.shape) > 2:  # only supports 2 dimensional data at the moment
+            raise ValueError("only supports 2-D at the moment")
+
+        elif len(data_norm.shape) == 1:  # in case the data is bias (or 1D)
+            data_norm = data_norm[None, :]
+
+        mask = self.get_mask(name)
+        if sparsity_level <= 0 or zeros_per_block == 0:
+            mask.data = torch.ones_like(mask)
+        elif sparsity_level >= 1.0 and (zeros_per_block == values_per_block):
+            mask.data = torch.zeros_like(mask)
+
+        # Fetch the high level mask that zeros out entire blocks
+        data_lvl_mask = self.__get_data_level_mask(data=data_norm, sparsity_level=sparsity_level,
+                                                   sparse_block_shape=sparse_block_shape)
+
+        # Fetch block level mask that zeros out 'zeros_per_block' number of elements in every block
+        block_lvl_mask = self.__get_block_level_mask(data=data_norm, sparse_block_shape=sparse_block_shape,
+                                                     zeros_per_block=zeros_per_block)
+
+        # zero out the entries inside those blocks whose block is sparsified
+        mask.data = torch.where(data_lvl_mask == 1, data_lvl_mask, block_lvl_mask)
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/__init__.py b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7e51323a27f71aae2fb7348fe2e15816673271ef
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__init__.py b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f73f2ac8fb0d10a1ad32909db656eba883f40c7a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/_data_sparstity_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/_data_sparstity_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5628ef4f83f632a0310a52da2c79349312d60cfb
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/_data_sparstity_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/data_sparsity.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/data_sparsity.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..17f4e10d603bca0c8bd2ce67d1ab7e1aa1751770
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/__pycache__/data_sparsity.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/_data_sparstity_utils.py b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/_data_sparstity_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c0bdcf03cd2589c5019d6490c718f46becceb85
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/_data_sparstity_utils.py
@@ -0,0 +1,39 @@
+import logging
+from torch.ao.pruning._experimental.data_sparsifier.base_data_sparsifier import SUPPORTED_TYPES
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+def _attach_model_to_data_sparsifier(module, data_sparsifier, config=None):
+    """Attaches a data sparsifier to all the layers of the module.
+    Essentially, loop over all the weight parameters in the module and
+    attach it to the data sparsifier.
+    Note::
+        The '.' in the layer names are replaced with '_' (refer to _get_valid_name() below)
+        before attaching to the sparsifier. This is because, the data
+        sparsifier uses a dummy model inside to store the weight parameters.
+    """
+    if config is None:
+        config = {}
+    for name, parameter in module.named_parameters():
+        if type(parameter) in SUPPORTED_TYPES:
+            valid_name = _get_valid_name(name)
+            # will be defaulted to default configs
+            data_sparsifier.add_data(name=valid_name, data=parameter, **config.get(valid_name, {}))
+
+
+def _get_valid_name(name):
+    return name.replace('.', '_')  # . is not allowed as a name
+
+
+def _log_sparsified_level(model, data_sparsifier) -> None:
+    # Show the level of sparsity AFTER step:
+    for name, parameter in model.named_parameters():
+        if type(parameter) not in SUPPORTED_TYPES:
+            continue
+        valid_name = _get_valid_name(name)
+        mask = data_sparsifier.get_mask(name=valid_name)
+        sparsity_level = 1.0 - mask.float().mean()
+        logger.info(
+            "Sparsity in layer %s = % .2%", name, sparsity_level
+        )
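+
+
+if __name__ == "__main__":
+    # Minimal usage sketch (not part of the upstream module): `_MiniSparsifier` below is a
+    # hypothetical stand-in that only records what is handed to it, so the attach/naming
+    # behaviour can be shown without constructing a real data sparsifier.
+    import torch.nn as nn
+
+    class _MiniSparsifier:
+        def __init__(self):
+            self.data = {}
+
+        def add_data(self, name, data, **config):
+            self.data[name] = data
+
+    print(_get_valid_name("encoder.layer1.weight"))  # -> encoder_layer1_weight
+
+    model = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 2))
+    sparsifier = _MiniSparsifier()
+    _attach_model_to_data_sparsifier(model, sparsifier)
+    print(sorted(sparsifier.data))  # parameter names with '.' replaced by '_', e.g. ['0_bias', '0_weight', ...]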
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/data_sparsity.py b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/data_sparsity.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd2ad926a6485ea0fc5907445599605ead33c8d8
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/lightning/callbacks/data_sparsity.py
@@ -0,0 +1,165 @@
+from collections import defaultdict
+from copy import deepcopy
+import torch
+from typing import Any, Optional, Dict
+import pytorch_lightning as pl  # type: ignore[import]
+
+from ._data_sparstity_utils import (
+    _attach_model_to_data_sparsifier,
+    _log_sparsified_level,
+    _get_valid_name
+)
+
+
+class PostTrainingDataSparsity(pl.callbacks.Callback):
+    """Lightning callback that enables post-training sparsity.
+
+    This callback aims to sparsify the model inside lightning module after training.
+    **Note that the model is copied and then sparsified, so the existing model is not modified**
+
+    The sparsified model can be used for comparison and can be accessed using
+        .sparsified
+
+    Args:
+        data_sparsifier_class (some implemented class of BaseDataSparsifier)
+            The data sparsifier object of this class is created once the
+            training completes (inside on_fit_end()).
+            Note: Pass the class itself, not an instance; the object is
+            created only after training completes.
+
+        data_sparsifier_args (Dict)
+            Dictionary of args to be passed to the data sparsifier.
+            Note: data_list arg should be ignored
+
+    Hooks implemented:
+        on_fit_end()
+            1. copies the model and attaches it to the sparsifier
+            2. sparsifier step() is called
+            3. squashes the mask
+    """
+    def __init__(self, data_sparsifier_class, data_sparsifier_args):
+        super().__init__()
+        self.data_sparsifier_class = data_sparsifier_class
+        self.data_sparsifier_args = data_sparsifier_args
+        self.data_sparsifier: Any = None
+        self.sparsified: Optional[torch.nn.Module] = None
+
+    def on_fit_end(self, trainer, pl_module) -> None:
+        self.sparsified = deepcopy(pl_module.model).eval()
+        self.data_sparsifier = self.data_sparsifier_class(**self.data_sparsifier_args)
+
+        _attach_model_to_data_sparsifier(self.sparsified, self.data_sparsifier)
+
+        self.data_sparsifier.step()
+
+        self.data_sparsifier.squash_mask()  # currently squashes params for all masks
+
+        _log_sparsified_level(self.sparsified, self.data_sparsifier)
+
+
+class TrainingAwareDataSparsity(pl.callbacks.Callback):
+    """Lightning callback that enables in-training sparsity.
+
+    This callback aims to sparsify the model inside lightning module during training.
+    **Note that the model is copied and then sparsified, so the existing model is not modified**
+
+    The sparsified model can be used for comparison and can be accessed using
+        .sparsified
+
+    Args:
+        data_sparsifier_class (some implemented class of BaseDataSparsifier)
+            The data sparsifier object of this class is created when the
+            training starts.
+            Note: Objects should not be passed in here as they are created
+            when the training starts.
+
+        data_sparsifier_args (Dict)
+            Dictionary of args to be passed to the data sparsifier.
+            Note: data_list arg should be ignored
+
+        data_scheduler_class (some implemented class of BaseDataScheduler)
+            The data scheduler of this class is created when the training starts
+            Note: Objects should not be passed in here as they are created
+            when the training starts.
+
+        data_scheduler_args(Dict)
+            Dictionary of args to be passed to the data scheduler.
+            **Note: data_sparsifier arg should be ignored, as the callback
+            creates the sparsifier object and passes it to the scheduler**
+
+    Hooks implemented:
+        on_train_start()
+            Data sparsifier and scheduler objects are created.
+            PyTorch model is attached to the sparsifier
+
+        on_train_epoch_start()
+            Loads the state_dict of the data sparsifier
+
+        on_train_epoch_end()
+            1. Copies the model and attaches it to the sparsifier
+            2. sparsifier step() and scheduler step()
+            3. Dump state_dict of the current sparsifier
+
+        on_train_end()
+            squash mask
+    """
+    def __init__(self, data_sparsifier_class, data_sparsifier_args,
+                 data_scheduler_class, data_scheduler_args):
+        super().__init__()
+        # data sparsifier objects
+        self.data_sparsifier_class = data_sparsifier_class
+        self.data_sparsifier_args = data_sparsifier_args
+
+        # scheduler objects
+        self.data_scheduler_class = data_scheduler_class
+        self.data_scheduler_args = data_scheduler_args
+
+        # fields
+        self.data_sparsifier: Any = None
+        self.data_scheduler: Any = None
+        self.sparsified: Optional[torch.nn.Module] = None
+
+        self.data_sparsifier_state_dict: Any = None
+
+    def on_train_start(self, trainer, pl_module) -> None:
+        # create sparsifier
+        self.data_sparsifier = self.data_sparsifier_class(**self.data_sparsifier_args)
+        self.sparsified = deepcopy(pl_module.model)
+
+        _attach_model_to_data_sparsifier(self.sparsified, self.data_sparsifier)  # just to populate the base_sl in the scheduler
+
+        # create scheduler
+        args = deepcopy(self.data_scheduler_args)
+        args['data_sparsifier'] = self.data_sparsifier
+        self.data_scheduler = self.data_scheduler_class(**args)
+
+    def on_train_epoch_start(self, trainer, pl_module):
+        if self.data_sparsifier_state_dict is None:
+            return  # probably first epoch
+
+        # load the existing config for each data
+        self.data_sparsifier.load_state_dict(self.data_sparsifier_state_dict)
+
+    def __create_config_based_on_state(self, pl_module):
+        config: Dict = defaultdict()
+        if self.data_sparsifier_state_dict is None:
+            return config
+        for name, _ in pl_module.model.named_parameters():
+            valid_name = _get_valid_name(name)
+            config[valid_name] = self.data_sparsifier.data_groups[valid_name]
+
+        return config
+
+    def on_train_epoch_end(self, trainer, pl_module):
+        self.sparsified = deepcopy(pl_module.model)
+        config = self.__create_config_based_on_state(pl_module)
+
+        # attach model to the data sparsifier
+        _attach_model_to_data_sparsifier(self.sparsified, self.data_sparsifier, config=config)
+        self.data_sparsifier.step()
+        self.data_scheduler.step()
+
+        self.data_sparsifier_state_dict = self.data_sparsifier.state_dict()
+
+    def on_train_end(self, trainer, pl_module):
+        self.data_sparsifier.squash_mask()
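+
+
+if __name__ == "__main__":
+    # Minimal wiring sketch (not part of the upstream module). `_StubSparsifier` is a
+    # hypothetical placeholder; in practice you would pass an implemented subclass of
+    # BaseDataSparsifier (e.g. DataNormSparsifier) together with its constructor args.
+    class _StubSparsifier:
+        def __init__(self, **kwargs):
+            self.kwargs = kwargs
+
+    callback = PostTrainingDataSparsity(
+        data_sparsifier_class=_StubSparsifier,
+        data_sparsifier_args={"sparsity_level": 0.8},
+    )
+    print(callback.data_sparsifier_class.__name__, callback.data_sparsifier_args)
+
+    # In a real run the callback is handed to the trainer and fires after fit():
+    #   trainer = pl.Trainer(callbacks=[callback])
+    #   trainer.fit(lightning_module)  # the LightningModule must expose the model as `.model`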
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/quantization_utils.py b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/quantization_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca33b242a4deae1394f520653df2160b98cb74ca
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/data_sparsifier/quantization_utils.py
@@ -0,0 +1,130 @@
+import torch
+import torch.nn as nn
+from torch.ao.pruning.sparsifier.utils import module_to_fqn, fqn_to_module
+from typing import Dict, List, Optional
+
+SUPPORTED_MODULES = {
+    nn.Embedding,
+    nn.EmbeddingBag
+}
+
+
+def _fetch_all_embeddings(model):
+    """Fetches Embedding and EmbeddingBag modules from the model
+    """
+    embedding_modules = []
+    stack = [model]
+    while stack:
+        module = stack.pop()
+        for _, child in module.named_children():
+            fqn_name = module_to_fqn(model, child)
+            if type(child) in SUPPORTED_MODULES:
+                embedding_modules.append((fqn_name, child))
+            else:
+                stack.append(child)
+    return embedding_modules
+
+
+def post_training_sparse_quantize(model,
+                                  data_sparsifier_class,
+                                  sparsify_first=True,
+                                  select_embeddings: Optional[List[nn.Module]] = None,
+                                  **sparse_config):
+    """Takes in a model and applies sparsification and quantization to only embeddings & embeddingbags.
+    The quantization step can happen before or after sparsification depending on the `sparsify_first` argument.
+
+    Args:
+        - model (nn.Module)
+            model whose embeddings need to be sparsified
+        - data_sparsifier_class (type of data sparsifier)
+            Type of sparsification that needs to be applied to model
+        - sparsify_first (bool)
+            if true, sparsifies first and then quantizes
+            otherwise, quantizes first and then sparsifies.
+        - select_embeddings (List of Embedding modules)
+            List of embedding modules in the model to be sparsified & quantized.
+            If None, all embedding modules will be sparsified
+        - sparse_config (Dict)
+            config that will be passed to the constructor of data sparsifier object.
+
+    Note:
+        1. When `sparsify_first=False`, quantization occurs first followed by sparsification.
+            - before sparsifying, the embedding layers are dequantized.
+            - scales and zero-points are saved
+            - embedding layers are sparsified and `squash_mask` is applied
+            - embedding weights are requantized using the saved scales and zero-points
+        2. When `sparsify_first=True`, sparsification occurs first followed by quantization.
+            - embeddings are sparsified first
+            - quantization is applied on the sparsified embeddings
+    """
+    data_sparsifier = data_sparsifier_class(**sparse_config)
+
+    # if select_embeddings is None, perform it on all embeddings
+    if select_embeddings is None:
+        embedding_modules = _fetch_all_embeddings(model)
+
+    else:
+        embedding_modules = []
+        assert isinstance(select_embeddings, List), "select_embeddings must be a list of embedding modules"
+        for emb in select_embeddings:
+            assert type(emb) in SUPPORTED_MODULES, "each module in select_embeddings must be an nn.Embedding or nn.EmbeddingBag"
+            fqn_name = module_to_fqn(model, emb)
+            assert fqn_name is not None, "the embedding modules must be part of the input model"
+            embedding_modules.append((fqn_name, emb))
+
+    if sparsify_first:
+        # sparsify
+        for name, emb_module in embedding_modules:
+            valid_name = name.replace('.', '_')
+            data_sparsifier.add_data(name=valid_name, data=emb_module)
+
+        data_sparsifier.step()
+        data_sparsifier.squash_mask()
+
+        # quantize
+        for _, emb_module in embedding_modules:
+            emb_module.qconfig = torch.ao.quantization.float_qparams_weight_only_qconfig
+
+        torch.ao.quantization.prepare(model, inplace=True)
+        torch.ao.quantization.convert(model, inplace=True)
+
+    else:
+        # quantize
+        for _, emb_module in embedding_modules:
+            emb_module.qconfig = torch.ao.quantization.float_qparams_weight_only_qconfig
+
+        torch.ao.quantization.prepare(model, inplace=True)
+        torch.ao.quantization.convert(model, inplace=True)
+
+        # retrieve scale & zero_points
+        quantize_params: Dict[str, Dict] = {'scales': {}, 'zero_points': {},
+                                            'dequant_weights': {}, 'axis': {},
+                                            'dtype': {}}
+
+        for name, _ in embedding_modules:
+            quantized_emb = fqn_to_module(model, name)
+            assert quantized_emb is not None  # satisfy mypy
+
+            quantized_weight = quantized_emb.weight()  # type: ignore[operator]
+            quantize_params['scales'][name] = quantized_weight.q_per_channel_scales()
+            quantize_params['zero_points'][name] = quantized_weight.q_per_channel_zero_points()
+            quantize_params['dequant_weights'][name] = torch.dequantize(quantized_weight)
+            quantize_params['axis'][name] = quantized_weight.q_per_channel_axis()
+            quantize_params['dtype'][name] = quantized_weight.dtype
+
+            # attach data to sparsifier
+            data_sparsifier.add_data(name=name.replace('.', '_'), data=quantize_params['dequant_weights'][name])
+
+        data_sparsifier.step()
+        data_sparsifier.squash_mask()
+
+        for name, _ in embedding_modules:
+            quantized_emb = fqn_to_module(model, name)
+            assert quantized_emb is not None  # satisfy mypy
+            requantized_vector = torch.quantize_per_channel(quantize_params['dequant_weights'][name],
+                                                            scales=quantize_params['scales'][name],
+                                                            zero_points=quantize_params['zero_points'][name],
+                                                            dtype=quantize_params['dtype'][name],
+                                                            axis=quantize_params['axis'][name])
+
+            quantized_emb.set_weight(requantized_vector)  # type: ignore[operator]
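+
+
+if __name__ == "__main__":
+    # Usage sketch (not part of the upstream module), assuming DataNormSparsifier accepts the
+    # sparsity_level / sparse_block_shape / zeros_per_block keyword arguments shown below;
+    # adjust the sparse_config for your own sparsifier class.
+    from torch.ao.pruning._experimental.data_sparsifier.data_norm_sparsifier import DataNormSparsifier
+
+    class EmbeddingModel(nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.emb = nn.EmbeddingBag(100, 16)
+
+        def forward(self, x, offsets):
+            return self.emb(x, offsets)
+
+    model = EmbeddingModel()
+    post_training_sparse_quantize(
+        model,
+        DataNormSparsifier,
+        sparsify_first=True,
+        sparsity_level=0.8,
+        sparse_block_shape=(1, 1),
+        zeros_per_block=1,
+    )
+    print(type(model.emb))  # the EmbeddingBag has been sparsified and quantized in place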
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/FPGM_pruner.py b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/FPGM_pruner.py
new file mode 100644
index 0000000000000000000000000000000000000000..3eb8ce411719996ffb6edce123328aad500b83b9
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/FPGM_pruner.py
@@ -0,0 +1,93 @@
+from typing import Callable, Optional, Union
+
+import torch
+
+from .base_structured_sparsifier import BaseStructuredSparsifier
+
+__all__ = ["FPGMPruner"]
+
+
+class FPGMPruner(BaseStructuredSparsifier):
+    r"""Filter Pruning via Geometric Median (FPGM) Structured Pruner
+    This sparsifier prunes filters (rows) in a tensor according to the distances among filters, following
+    `Filter Pruning via Geometric Median for Deep Convolutional Neural Networks Acceleration `_.
+
+    This sparsifier is controlled by two variables:
+    1. `sparsity_level` defines the proportion of filters (rows) that are zeroed out.
+    2. `dist` defines the distance measurement type. Default: 2 (L2 distance).
+    Available options are: [1, 2, (custom callable distance function)].
+
+    Note::
+        Inputs should be a 4D convolutional tensor of shape (N, C, H, W).
+            - N: output channels size
+            - C: input channels size
+            - H: height of kernel
+            - W: width of kernel
+    """
+
+    def __init__(
+        self, sparsity_level: float = 0.5, dist: Optional[Union[Callable, int]] = None
+    ):
+        defaults = {
+            "sparsity_level": sparsity_level,
+        }
+
+        if dist is None:
+            dist = 2
+
+        if callable(dist):
+            self.dist_fn = dist
+        elif dist == 1:
+            self.dist_fn = lambda x: torch.cdist(x, x, p=1)
+        elif dist == 2:
+            self.dist_fn = lambda x: torch.cdist(x, x, p=2)
+        else:
+            raise NotImplementedError("Distance function is not yet implemented.")
+        super().__init__(defaults=defaults)
+
+    def _compute_distance(self, t):
+        r"""Compute distance across all entries in tensor `t` along all dimension
+        except for the one identified by dim.
+        Args:
+            t (torch.Tensor): tensor representing the parameter to prune
+        Returns:
+            distance (torch.Tensor): distance computed across filters
+        """
+        dim = 0  # prune filter (row)
+
+        size = t.size(dim)
+        slc = [slice(None)] * t.dim()
+
+        # flatten the tensor along the dimension
+        t_flatten = [
+            t[tuple(slc[:dim] + [slice(i, i + 1)] + slc[dim + 1 :])].reshape(-1)
+            for i in range(size)
+        ]
+        t_flatten = torch.stack(t_flatten)
+
+        # distance measurement
+        dist_matrix = self.dist_fn(t_flatten)
+
+        # filters closest to the geometric median have the smallest row sums and are pruned first
+        distance = torch.sum(torch.abs(dist_matrix), 1)
+
+        return distance
+
+    def update_mask(self, module, tensor_name, sparsity_level, **kwargs):
+        tensor_weight = getattr(module, tensor_name)
+        mask = getattr(module.parametrizations, tensor_name)[0].mask
+
+        if sparsity_level <= 0:
+            mask.data = torch.ones_like(mask).bool()
+        elif sparsity_level >= 1.0:
+            mask.data = torch.zeros_like(mask).bool()
+        else:
+            distance = self._compute_distance(tensor_weight)
+
+            tensor_size = tensor_weight.shape[0]  # prune filter (row)
+            nparams_toprune = round(sparsity_level * tensor_size)
+            nparams_toprune = min(
+                max(nparams_toprune, 0), tensor_size
+            )  # clamp to [0, tensor_size]
+            topk = torch.topk(distance, k=nparams_toprune, largest=False)
+            mask[topk.indices] = False
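+
+
+if __name__ == "__main__":
+    # Illustrative sketch (not part of the upstream module): shows only the pairwise-distance
+    # computation used by FPGM. A full pruning run would go through the BaseSparsifier API
+    # (prepare() with a model and config, then step() and prune()).
+    torch.manual_seed(0)
+    pruner = FPGMPruner(sparsity_level=0.5)
+    conv_weight = torch.randn(6, 3, 3, 3)  # (N, C, H, W) filters of a Conv2d layer
+    distance = pruner._compute_distance(conv_weight)
+    print(distance.shape)  # torch.Size([6]); the filters with the smallest sums are pruned first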
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/__init__.py b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9589fa6afeb5458172dfa0fd6217c8f6496da45a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/__init__.py
@@ -0,0 +1,8 @@
+from .base_structured_sparsifier import BaseStructuredSparsifier
+from .parametrization import (
+    FakeStructuredSparsity,
+    BiasHook,
+)
+from .saliency_pruner import SaliencyPruner
+from .lstm_saliency_pruner import LSTMSaliencyPruner
+from .FPGM_pruner import FPGMPruner
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/FPGM_pruner.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/FPGM_pruner.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..704126199851897985a00785ee12518bb625269b
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/FPGM_pruner.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ee71803163359f389c33e3b7cff04eb75f9d3191
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/base_structured_sparsifier.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/base_structured_sparsifier.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..51b7586135a49e4ce66fe107de668db8beed7be7
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/base_structured_sparsifier.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/lstm_saliency_pruner.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/lstm_saliency_pruner.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a4cb3ea0ef6f24d5a339d419de988d2aff2396a9
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/lstm_saliency_pruner.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/match_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/match_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b623940073893ff7be57a010c320d0c5ec8e9ab2
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/match_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/parametrization.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/parametrization.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ac8d7e81e1c7e925e9f93976ca44cdbb8adaf8a9
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/parametrization.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/prune_functions.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/prune_functions.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5e9379bcdb2128cf9799652c73fe3ebe85fa693a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/prune_functions.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/saliency_pruner.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/saliency_pruner.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cf073dcb0aba5cefa98ab190d07b30ff6ef5edb1
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/__pycache__/saliency_pruner.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/base_structured_sparsifier.py b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/base_structured_sparsifier.py
new file mode 100644
index 0000000000000000000000000000000000000000..16c33b02fff1eca775d2513bacb156add4533cf2
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/base_structured_sparsifier.py
@@ -0,0 +1,310 @@
+from itertools import chain
+from operator import getitem
+import torch
+import torch.nn.functional as F
+from torch import nn
+from torch.fx import symbolic_trace
+from torch.nn.utils import parametrize
+from typing import Type, Set, Dict, Callable, Tuple, Optional, Union
+
+from torch.ao.pruning import BaseSparsifier
+from .parametrization import FakeStructuredSparsity, BiasHook, module_contains_param
+from .match_utils import apply_match, MatchAllNode
+from .prune_functions import (
+    prune_linear,
+    prune_linear_linear,
+    prune_linear_activation_linear,
+    prune_conv2d,
+    prune_conv2d_conv2d,
+    prune_conv2d_activation_conv2d,
+    prune_conv2d_activation_pool_conv2d,
+    prune_conv2d_pool_activation_conv2d,
+    prune_conv2d_pool_flatten_linear,
+    prune_lstm_output_linear,
+    prune_lstm_output_layernorm_linear,
+)
+
+
+def _get_supported_structured_pruning_modules():
+    SUPPORTED_STRUCTURED_PRUNING_MODULES = {  # added to config if None given
+        nn.Linear,
+        nn.Conv2d,
+        nn.LSTM,
+    }
+    return SUPPORTED_STRUCTURED_PRUNING_MODULES
+
+
+def _get_supported_activation_functions():
+    SUPPORTED_ACTIVATION_FUNCTIONS = {
+        F.relu,
+        F.rrelu,
+        F.hardtanh,
+        F.relu6,
+        F.sigmoid,
+        F.hardsigmoid,
+        F.tanh,
+        F.silu,
+        F.mish,
+        F.hardswish,
+        F.elu,
+        F.celu,
+        F.selu,
+        F.hardshrink,
+        F.leaky_relu,
+        F.logsigmoid,
+        F.softplus,
+        F.prelu,
+        F.softsign,
+        F.tanhshrink,
+        F.gelu,
+    }
+    return SUPPORTED_ACTIVATION_FUNCTIONS
+
+
+def _get_supported_activation_modules():
+    SUPPORTED_ACTIVATION_MODULES = {
+        nn.ReLU,
+        nn.RReLU,
+        nn.Hardtanh,
+        nn.ReLU6,
+        nn.Sigmoid,
+        nn.Hardsigmoid,
+        nn.Tanh,
+        nn.SiLU,
+        nn.Mish,
+        nn.Hardswish,
+        nn.ELU,
+        nn.CELU,
+        nn.SELU,
+        nn.Hardshrink,
+        nn.LeakyReLU,
+        nn.LogSigmoid,
+        nn.Softplus,
+        nn.PReLU,
+        nn.Softsign,
+        nn.Tanhshrink,
+        nn.GELU,
+    }
+    return SUPPORTED_ACTIVATION_MODULES
+
+
+def _get_default_structured_pruning_patterns() -> Dict[
+    Tuple[Union[Type[nn.Module], Callable, MatchAllNode, str], ...],
+    Callable[..., None],
+]:
+    """
+    Returns the patterns for conv2d / linear conversion for each element in the activation functions/modules defined above.
+    """
+    patterns: Dict[
+        Tuple[Union[Type[nn.Module], Callable, MatchAllNode, str], ...],
+        Callable[..., None],
+    ] = {
+        # linear -> linear
+        (nn.Linear, "output"): prune_linear,
+        (nn.Linear, nn.Linear): prune_linear_linear,
+        # conv2d -> conv2d
+        (nn.Conv2d, "output"): prune_conv2d,
+        (nn.Conv2d, nn.Conv2d): prune_conv2d_conv2d,
+        # TODO LSTM Structured pruning does not support returned state currently.
+        # Should find a way to explicitly match getitem(0) instead of getitem.
+        # This will also require changing the pruning function.
+        # lstm -> getitem(0) -> linear
+        (nn.LSTM, getitem, nn.Linear): prune_lstm_output_linear,
+        # lstm -> getitem(0) -> layernorm -> linear
+        (nn.LSTM, getitem, nn.LayerNorm, nn.Linear): prune_lstm_output_layernorm_linear,
+    }
+
+    for activation in chain(
+        _get_supported_activation_functions(), _get_supported_activation_modules()
+    ):
+        patterns.update(
+            {
+                # linear -> activation -> linear
+                (nn.Linear, activation, nn.Linear): prune_linear_activation_linear,
+                # conv2d -> activation -> conv2d
+                (nn.Conv2d, activation, nn.Conv2d): prune_conv2d_activation_conv2d,
+                # conv2d -> activation -> pool -> conv2d
+                (
+                    nn.Conv2d,
+                    activation,
+                    nn.AvgPool2d,
+                    nn.Conv2d,
+                ): prune_conv2d_activation_pool_conv2d,
+                (
+                    nn.Conv2d,
+                    activation,
+                    F.avg_pool2d,
+                    nn.Conv2d,
+                ): prune_conv2d_activation_pool_conv2d,
+                (
+                    nn.Conv2d,
+                    activation,
+                    nn.MaxPool2d,
+                    nn.Conv2d,
+                ): prune_conv2d_activation_pool_conv2d,
+                (
+                    nn.Conv2d,
+                    activation,
+                    F.max_pool2d,
+                    nn.Conv2d,
+                ): prune_conv2d_activation_pool_conv2d,
+                # conv2d -> pool -> activation -> conv2d
+                (
+                    nn.Conv2d,
+                    nn.AvgPool2d,
+                    activation,
+                    nn.Conv2d,
+                ): prune_conv2d_pool_activation_conv2d,
+                (
+                    nn.Conv2d,
+                    F.avg_pool2d,
+                    activation,
+                    nn.Conv2d,
+                ): prune_conv2d_pool_activation_conv2d,
+                (
+                    nn.Conv2d,
+                    nn.MaxPool2d,
+                    activation,
+                    nn.Conv2d,
+                ): prune_conv2d_pool_activation_conv2d,
+                (
+                    nn.Conv2d,
+                    F.max_pool2d,
+                    activation,
+                    nn.Conv2d,
+                ): prune_conv2d_pool_activation_conv2d,
+                # conv2d -> adaptive pool -> flatten -> linear
+                (
+                    nn.Conv2d,
+                    nn.AdaptiveAvgPool2d,
+                    nn.Flatten,
+                    nn.Linear,
+                ): prune_conv2d_pool_flatten_linear,
+                (
+                    nn.Conv2d,
+                    nn.AdaptiveAvgPool2d,
+                    torch.flatten,
+                    nn.Linear,
+                ): prune_conv2d_pool_flatten_linear,
+                (
+                    nn.Conv2d,
+                    nn.AdaptiveMaxPool2d,
+                    nn.Flatten,
+                    nn.Linear,
+                ): prune_conv2d_pool_flatten_linear,
+                (
+                    nn.Conv2d,
+                    nn.AdaptiveMaxPool2d,
+                    torch.flatten,
+                    nn.Linear,
+                ): prune_conv2d_pool_flatten_linear,
+            }
+        )
+    return patterns
+
+
+class BaseStructuredSparsifier(BaseSparsifier):
+    r"""Base class for structured pruning.
+
+    Abstract methods that need to be implemented:
+        - update_mask: Function to compute a new mask for all keys in the
+            `groups` attribute.
+
+    Args:
+        - defaults [dict]: default configurations will be attached to the
+            configuration. Only the keys that don't exist in the `config` will
+            be updated.
+    """
+
+    def __init__(self, defaults, patterns=None):
+        super().__init__(defaults)
+        if patterns is None:
+            patterns = _get_default_structured_pruning_patterns()
+        self.patterns = patterns
+
+    def make_config_from_model(
+        self,
+        model: nn.Module,
+        SUPPORTED_MODULES: Optional[Set[Type]] = None,
+    ) -> None:
+        if SUPPORTED_MODULES is None:
+            SUPPORTED_MODULES = _get_supported_structured_pruning_modules()
+        super().make_config_from_model(model, SUPPORTED_MODULES=SUPPORTED_MODULES)
+
+    def _prepare(self, *args, **kwargs) -> None:
+        r"""This function will attach the FakeStructuredSparsity parameterizations
+        and BiasHooks at the appropriate points in the model.
+        """
+        for config in self.groups:
+            module = config["module"]
+            tensor_name = config["tensor_name"]
+            parametrization = config.get("parametrization", FakeStructuredSparsity)
+            tensor = getattr(module, tensor_name)
+
+            mask = config.get(
+                "mask",
+                torch.ones(tensor.shape[0], dtype=torch.bool, device=tensor.device),
+            )
+            self.state[config["tensor_fqn"]]["mask"] = mask
+            parametrize.register_parametrization(
+                module, tensor_name, parametrization(mask)
+            )
+
+            # if linear / conv, we add in bias hooks
+            if isinstance(module, (nn.Linear, nn.Conv2d)):
+                prune_bias = config.get("prune_bias", True)
+                if module.bias is not None:
+                    module.register_parameter(
+                        "_bias", nn.Parameter(module.bias.detach())
+                    )
+                    module.bias = None
+                    module.prune_bias = prune_bias
+
+                module.register_forward_hook(
+                    BiasHook(module.parametrizations.weight[0], prune_bias)
+                )
+
+    def prune(self) -> nn.Module:
+        r"""
+        This function will FX symbolically trace the model and then find instances of the patterns
+        defined in self.patterns (by default, the patterns returned by _get_default_structured_pruning_patterns()).
+
+        For each match, it will apply the corresponding conversion function, which will modify the output
+        and input sizes expected by the modules within the pattern.
+        """
+
+        self.traced = symbolic_trace(self.model)
+        modules = dict(self.traced.named_modules())
+
+        # Right now we check for matches simply by iterating across all the patterns
+        # if this is slow we can store patterns in a trie-structure and modify this code for faster lookup
+        for node in self.traced.graph.nodes:
+            for pattern, convert_fn in self.patterns.items():
+                matched = apply_match(modules, pattern, node, [])
+                if matched is None:
+                    continue
+
+                first_module = modules.get(node.target)
+                # check if first module exists and has appropriate parameterization, otherwise skip
+                if (
+                    first_module is not None
+                    and parametrize.is_parametrized(first_module)
+                    and module_contains_param(first_module, FakeStructuredSparsity)
+                ):
+                    convert_block = []
+                    for node in matched:
+                        if node.op == "call_module":
+                            convert_block.append(modules.get(node.target))
+                        elif node.op == "call_function":
+                            convert_block.append(node.target)
+                    convert_fn(*convert_block)
+
+        for module in self.traced.modules():
+            if module_contains_param(module, FakeStructuredSparsity):
+                raise Exception(
+                    f"Error: {module} still contains FakeStructuredSparsity parametrizations!"
+                )
+
+        self.traced.graph.lint()
+        self.traced.recompile()
+        return self.traced
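+
+
+if __name__ == "__main__":
+    # End-to-end sketch (not part of the upstream module) of the prepare() -> step() -> prune()
+    # flow, using the SaliencyPruner shipped in this package. The config format
+    # ([{"tensor_fqn": ...}]) follows the BaseSparsifier API; sizes are illustrative only.
+    from torch.ao.pruning._experimental.pruner import SaliencyPruner
+
+    model = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 4))
+    config = [{"tensor_fqn": "0.weight"}]
+
+    pruner = SaliencyPruner({"sparsity_level": 0.5})
+    pruner.prepare(model, config)    # attaches FakeStructuredSparsity and a BiasHook to `0`
+    pruner.enable_mask_update = True
+    pruner.step()                    # computes a row mask for `0.weight`
+    pruned = pruner.prune()          # matches (Linear, ReLU, Linear) and resizes both linears
+
+    print(pruned.get_submodule("0").out_features)   # reduced from 8 (here 4, with sparsity_level=0.5)
+    print(pruned(torch.randn(2, 8)).shape)          # torch.Size([2, 4]); the pruned network still runs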
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/lstm_saliency_pruner.py b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/lstm_saliency_pruner.py
new file mode 100644
index 0000000000000000000000000000000000000000..bbd069e202dd860701a32dc3e6853f9d2e5fb689
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/lstm_saliency_pruner.py
@@ -0,0 +1,48 @@
+from typing import cast
+
+import torch
+from .base_structured_sparsifier import BaseStructuredSparsifier, FakeStructuredSparsity
+
+class LSTMSaliencyPruner(BaseStructuredSparsifier):
+    """
+    Prune packed LSTM weights based on saliency.
+    For each layer {k} inside a LSTM, we have two packed weight matrices
+    - weight_ih_l{k}
+    - weight_hh_l{k}
+
+    These tensors pack the weights for the 4 linear layers together for efficiency.
+
+    [W_ii | W_if | W_ig | W_io]
+
+    Pruning this tensor directly will lead to weights being misassigned when unpacked.
+    To ensure that each packed linear layer is pruned the same amount:
+        1. We split the packed weight into its 4 constituent linear parts
+        2. We update the mask for each individual piece using its own saliency
+
+    This applies to both weight_ih_l{k} and weight_hh_l{k}.
+    """
+
+    def update_mask(self, module, tensor_name, **kwargs):
+        weights = getattr(module, tensor_name)
+
+        for p in getattr(module.parametrizations, tensor_name):
+            if isinstance(p, FakeStructuredSparsity):
+                mask = cast(torch.Tensor, p.mask)
+
+                # select weights based on magnitude
+                if weights.dim() <= 1:
+                    raise Exception("Structured pruning can only be applied to a 2+dim weight tensor!")
+                # take norm over all but first dim
+                dims = tuple(range(1, weights.dim()))
+                saliency = weights.norm(dim=dims, p=1)
+
+                # handle weights in 4 groups
+                split_size = len(mask) // 4
+                masks = torch.split(mask, split_size)
+                saliencies = torch.split(saliency, split_size)
+
+                for keep_mask, sal in zip(masks, saliencies):
+                    # mask smallest k values to be removed
+                    k = int(len(keep_mask) * kwargs["sparsity_level"])
+                    prune = sal.topk(k, largest=False, sorted=False).indices
+                    keep_mask.data[prune] = False  # modifies underlying p.mask directly
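+
+
+if __name__ == "__main__":
+    # Illustrative sketch (not part of the upstream module) of the mask-splitting trick above:
+    # torch.split returns views, so writing into each chunk updates the shared packed mask,
+    # and every one of the 4 gate blocks ends up pruned by the same amount.
+    hidden_size, sparsity_level = 3, 0.5
+    mask = torch.ones(4 * hidden_size, dtype=torch.bool)
+    saliency = torch.arange(4 * hidden_size, dtype=torch.float)
+
+    for keep_mask, sal in zip(torch.split(mask, hidden_size), torch.split(saliency, hidden_size)):
+        k = int(len(keep_mask) * sparsity_level)
+        prune = sal.topk(k, largest=False).indices
+        keep_mask.data[prune] = False
+
+    print(mask.reshape(4, hidden_size))  # one False per gate block (its lowest-saliency entry)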
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/match_utils.py b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/match_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1f995d96279eaaf34264706fdf91a0555341412
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/match_utils.py
@@ -0,0 +1,59 @@
+"""
+Contains utility functions to check if a pattern is in the graph and return the matching nodes
+"""
+import torch
+from torch import nn
+from torch.ao.quantization.utils import (
+    MatchAllNode,
+)
+from torch.fx import Node
+from torch.nn.utils import parametrize
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+def _match(modules: Dict[str, nn.ModuleDict], node: Node, current: Union[nn.Module, Any]) -> bool:
+    r"""
+    checks to see if a single node of a pattern matches
+    """
+    if isinstance(current, type) and issubclass(current, MatchAllNode):
+        return True
+    if not isinstance(node, Node):
+        return False
+    if isinstance(current, type) and issubclass(current, torch.nn.Module):
+        return (
+            node.op == "call_module"
+            and parametrize.type_before_parametrizations(modules[node.target])
+            == current
+        )
+    elif callable(current):
+        return node.op == "call_function" and node.target is current
+    elif isinstance(current, str):
+        return node.target == current
+    return False
+
+def apply_match(
+    modules: Dict[str, nn.ModuleDict],
+    pattern: Union[Tuple[Any], Any],
+    node: Node,
+    matched_node_pattern: List[Node],
+) -> Optional[List[Node]]:
+    r"""
+    This function will return the matched nodes if the pattern matches the node given
+    If there is no match, it will return None
+    """
+    if isinstance(pattern, tuple):
+        if len(pattern) == 1:
+            if _match(modules, node, pattern[0]):
+                return matched_node_pattern + [node]
+
+        first, *rest = pattern
+        if _match(modules, node, first):
+            if rest is None:
+                return matched_node_pattern + [node]
+
+            for user in node.users:
+                return apply_match(
+                    modules, tuple(rest), user, matched_node_pattern + [node]
+                )
+    elif _match(modules, node, pattern):
+        return [node]
+    return None
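+
+
+if __name__ == "__main__":
+    # Illustrative sketch (not part of the upstream module): match the (Linear, ReLU, Linear)
+    # pattern against a traced toy model and print the node targets of the matched chain.
+    from torch.fx import symbolic_trace
+
+    model = nn.Sequential(nn.Linear(4, 4), nn.ReLU(), nn.Linear(4, 2))
+    traced = symbolic_trace(model)
+    modules = dict(traced.named_modules())
+    pattern = (nn.Linear, nn.ReLU, nn.Linear)
+
+    for node in traced.graph.nodes:
+        matched = apply_match(modules, pattern, node, [])
+        if matched is not None:
+            print([n.target for n in matched])  # e.g. ['0', '1', '2']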
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/parametrization.py b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/parametrization.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e87299cd894c0731518a37bc224376e84e130ed
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/parametrization.py
@@ -0,0 +1,59 @@
+import torch
+from torch import nn
+from torch.nn.utils.parametrize import is_parametrized
+
+
+def module_contains_param(module, parametrization):
+    if is_parametrized(module):
+        # see if any of the module tensors have a parametrization attached that matches the one passed in
+        return any(
+            any(isinstance(param, parametrization) for param in param_list)
+            for key, param_list in module.parametrizations.items()
+        )
+    return False
+
+
+# Structured Pruning Parameterizations
+class FakeStructuredSparsity(nn.Module):
+    r"""
+    Parametrization for Structured Pruning. Like FakeSparsity, this should be attached to
+    the  'weight' or any other parameter that requires a mask.
+
+    Instead of an element-wise bool mask, this parameterization uses a row-wise bool mask.
+    """
+
+    def __init__(self, mask):
+        super().__init__()
+        self.register_buffer("mask", mask)
+
+    def forward(self, x):
+        assert isinstance(self.mask, torch.Tensor)
+        assert self.mask.shape[0] == x.shape[0]
+        shape = [1] * len(x.shape)
+        shape[0] = -1
+        return self.mask.reshape(shape) * x
+
+    def state_dict(self, *args, **kwargs):
+        # avoid double saving masks
+        return {}
+
+
+class BiasHook:
+    def __init__(self, parametrization, prune_bias):
+        self.param = parametrization
+        self.prune_bias = prune_bias
+
+    def __call__(self, module, input, output):
+
+        if getattr(module, "_bias", None) is not None:
+            bias = module._bias.data
+            if self.prune_bias:
+                bias[~self.param.mask] = 0
+
+            # reshape bias to broadcast over output dimensions
+            idx = [1] * len(output.shape)
+            idx[1] = -1
+            bias = bias.reshape(idx)
+
+            output += bias
+        return output
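+
+
+if __name__ == "__main__":
+    # Illustrative sketch (not part of the upstream module): attach FakeStructuredSparsity to a
+    # small Linear and observe that the masked row of the weight is zeroed out.
+    from torch.nn.utils import parametrize
+
+    linear = nn.Linear(4, 3, bias=False)
+    row_mask = torch.tensor([True, False, True])
+    parametrize.register_parametrization(linear, "weight", FakeStructuredSparsity(row_mask))
+
+    print(linear.weight[1])                                        # all zeros: row 1 is masked
+    print(module_contains_param(linear, FakeStructuredSparsity))   # True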
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/prune_functions.py b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/prune_functions.py
new file mode 100644
index 0000000000000000000000000000000000000000..55fb7a973ae0a25c4bb12fd99245c45c740c4aaf
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/prune_functions.py
@@ -0,0 +1,475 @@
+"""
+Collection of conversion functions for linear / conv2d structured pruning
+Also contains utilities for bias propagation
+"""
+from typing import cast, List, Optional, Callable, Tuple
+
+import torch
+from torch import nn, Tensor
+from torch.nn.utils import parametrize
+from torch.nn.utils.parametrize import ParametrizationList
+from .parametrization import FakeStructuredSparsity, BiasHook
+
+# BIAS PROPAGATION
+def _remove_bias_handles(module: nn.Module) -> None:
+    if hasattr(module, "_forward_hooks"):
+        bias_hooks: List[int] = []
+        for key, hook in module._forward_hooks.items():
+            if isinstance(hook, BiasHook):
+                bias_hooks.append(key)
+
+        for key in bias_hooks:
+            del module._forward_hooks[key]
+
+
+def _get_adjusted_next_layer_bias(
+    next_layer: nn.Module, pruned_biases: Tensor, mask: Tensor
+) -> nn.Parameter:
+    r"""Returns new adjusted bias for the second supported module"""
+    if parametrize.is_parametrized(next_layer):
+        # need to access original weight
+        parametrization_dict = cast(nn.ModuleDict, next_layer.parametrizations)
+        weight_parameterizations = cast(
+            ParametrizationList, parametrization_dict.weight
+        )
+        next_weight = weight_parameterizations.original
+    else:
+        next_weight = cast(Tensor, next_layer.weight)
+
+    scaling_weight = next_weight[:, ~mask]
+    if isinstance(next_layer, nn.Conv2d):  # checking for Conv2d
+        # Propagating first layer pruned biases and calculating the new second layer bias
+        # involves more steps since the Conv2d scaling weight has extra dimensions,
+        # so adding bias involves broadcasting, logically:
+        # for each channel k in range(oC):
+        #     scaled_biases = sum(first_bias[pruned_idx] @ next_weight[k, pruned_idx, :, :].T)
+        #     new_next_bias[k] = old_next_bias[k] + scaled_biases
+        scaling_product = torch.matmul(
+            pruned_biases.reshape(1, -1), torch.transpose(scaling_weight, 1, 2)
+        )
+        sum_range = list(range(len(scaling_product.shape)))[
+            1:
+        ]  # all but the first dimension
+        scaled_biases = torch.sum(scaling_product, sum_range)
+    elif isinstance(next_layer, nn.Linear):  # Linear
+        scaled_biases = torch.matmul(
+            pruned_biases, torch.transpose(scaling_weight, 0, 1)
+        )  # recall b2_new = b1 @ w2.T + b2
+    else:
+        raise NotImplementedError(f"Type {type(next_layer)} not supported yet.")
+
+    if (
+        parametrize.is_parametrized(next_layer)
+        and getattr(next_layer, "_bias", None) is not None
+    ):  # next_layer is parametrized & has original bias ._bias
+        adjusted_bias = nn.Parameter(scaled_biases + next_layer._bias)
+    elif (
+        not parametrize.is_parametrized(next_layer) and next_layer.bias is not None
+    ):  # next_layer not parametrized & has .bias
+        adjusted_bias = nn.Parameter(scaled_biases + next_layer.bias)
+    else:  # next_layer has no bias
+        adjusted_bias = nn.Parameter(scaled_biases)
+    return adjusted_bias
+
+
+def _prune_module_bias(module: nn.Module, mask: Tensor) -> None:
+    r"""Applies mask to given modules bias"""
+    # prune bias along with weights, discard pruned indices of bias
+    original_bias = cast(Tensor, getattr(module, "_bias", module.bias))
+    if original_bias is not None:
+        module.bias = nn.Parameter(original_bias[mask])
+
+    #  remove _bias parameter
+    if hasattr(module, "_bias"):
+        delattr(module, "_bias")
+
+
+def _propogate_module_bias(module: nn.Module, mask: Tensor) -> Optional[Tensor]:
+    r"""
+    In the case that we need to propagate biases, this function will return the biases we need
+    """
+    # set current module bias
+    if module.bias is not None:
+        module.bias = nn.Parameter(cast(Tensor, module.bias)[mask])
+    elif getattr(module, "_bias", None) is not None:
+        module.bias = nn.Parameter(cast(Tensor, module._bias)[mask])
+
+    # get pruned biases to propagate to subsequent layer
+    if getattr(module, "_bias", None) is not None:
+        pruned_biases = cast(Tensor, module._bias)[~mask]
+    else:
+        pruned_biases = None
+
+    if hasattr(module, "_bias"):
+        delattr(module, "_bias")
+
+    return pruned_biases
+
+
+# LINEAR
+def _prune_linear_helper(linear: nn.Linear) -> Tensor:
+    # expects linear to be a parameterized linear module
+    parametrization_dict = cast(nn.ModuleDict, linear.parametrizations)
+    weight_parameterizations = cast(ParametrizationList, parametrization_dict.weight)
+    for p in weight_parameterizations:
+        if isinstance(p, FakeStructuredSparsity):
+            mask = cast(Tensor, p.mask)
+
+    with torch.no_grad():
+        parametrize.remove_parametrizations(linear, "weight", leave_parametrized=True)
+        linear.weight = nn.Parameter(linear.weight[mask])  # type: ignore[possibly-undefined]
+    linear.out_features = linear.weight.shape[0]
+    _remove_bias_handles(linear)
+
+    return mask
+
+
+def prune_linear(linear: nn.Linear) -> None:
+    mask = _prune_linear_helper(linear)
+    if getattr(linear, "prune_bias", False):
+        _prune_module_bias(linear, mask)
+
+
+def prune_linear_linear(linear1: nn.Linear, linear2: nn.Linear) -> None:
+    prune_linear_activation_linear(linear1, None, linear2)
+
+
+def prune_linear_activation_linear(
+    linear1: nn.Linear,
+    activation: Optional[Callable[[Tensor], Tensor]],
+    linear2: nn.Linear,
+):
+    mask = _prune_linear_helper(linear1)
+    if getattr(linear1, "prune_bias", False):
+        _prune_module_bias(linear1, mask)
+    else:
+        pruned_biases = _propogate_module_bias(linear1, mask)
+        if pruned_biases is not None:
+            if activation:
+                pruned_biases = activation(pruned_biases)
+            linear2.bias = _get_adjusted_next_layer_bias(linear2, pruned_biases, mask)
+
+    with torch.no_grad():
+        if parametrize.is_parametrized(linear2):
+            parametrization_dict = cast(nn.ModuleDict, linear2.parametrizations)
+            weight_parameterizations = cast(
+                ParametrizationList, parametrization_dict.weight
+            )
+
+            weight_parameterizations.original = nn.Parameter(
+                weight_parameterizations.original[:, mask]
+            )
+            linear2.in_features = weight_parameterizations.original.shape[1]
+        else:
+            linear2.weight = nn.Parameter(linear2.weight[:, mask])
+            linear2.in_features = linear2.weight.shape[1]
+
+
+# CONV2D
+def _prune_conv2d_helper(conv2d: nn.Conv2d) -> Tensor:
+    parametrization_dict = cast(nn.ModuleDict, conv2d.parametrizations)
+    weight_parameterizations = cast(ParametrizationList, parametrization_dict.weight)
+    for p in weight_parameterizations:
+        if isinstance(p, FakeStructuredSparsity):
+            mask = cast(Tensor, p.mask)
+
+    with torch.no_grad():
+        parametrize.remove_parametrizations(conv2d, "weight", leave_parametrized=True)
+        conv2d.weight = nn.Parameter(conv2d.weight[mask])  # type: ignore[possibly-undefined]
+    conv2d.out_channels = conv2d.weight.shape[0]
+
+    _remove_bias_handles(conv2d)
+    return mask
+
+
+def prune_conv2d_padded(conv2d_1: nn.Conv2d) -> None:
+    parametrization_dict = cast(nn.ModuleDict, conv2d_1.parametrizations)
+    weight_parameterizations = cast(ParametrizationList, parametrization_dict.weight)
+    for p in weight_parameterizations:
+        if isinstance(p, FakeStructuredSparsity):
+            mask = cast(Tensor, p.mask)
+
+    with torch.no_grad():
+        parametrize.remove_parametrizations(conv2d_1, "weight", leave_parametrized=True)
+
+    if getattr(conv2d_1, "_bias", None) is not None:
+        if (
+            conv2d_1.bias is not None
+        ):  # conv2d_1 has original bias and bias propagated from previous layer
+            new_bias = torch.zeros(conv2d_1.bias.shape)
+            new_bias[mask] = conv2d_1.bias[mask]  # type: ignore[possibly-undefined]
+            # adjusted bias to keep in conv2d_1
+            new_bias[~mask] = cast(Tensor, conv2d_1._bias)[~mask]
+            # pruned biases that are kept instead of propagated
+            conv2d_1.bias = nn.Parameter(new_bias)
+        else:  # conv2d_1 has only original bias
+            conv2d_1.bias = nn.Parameter(cast(Tensor, conv2d_1._bias))
+    else:
+        # no original bias, only propagated bias
+        if (
+            conv2d_1.bias is not None
+        ):  # conv2d_1 has bias propagated from previous layer
+            conv2d_1.bias.data[~mask] = 0  # type: ignore[possibly-undefined]
+
+    if hasattr(conv2d_1, "_bias"):
+        delattr(conv2d_1, "_bias")
+
+
+def prune_conv2d(conv2d: nn.Conv2d) -> None:
+    mask = _prune_conv2d_helper(conv2d)
+    if getattr(conv2d, "prune_bias", False):
+        _prune_module_bias(conv2d, mask)
+
+
+def prune_conv2d_conv2d(conv2d_1: nn.Conv2d, conv2d_2: nn.Conv2d) -> None:
+    prune_conv2d_activation_conv2d(conv2d_1, None, conv2d_2)
+
+
+def prune_conv2d_activation_conv2d(
+    conv2d_1: nn.Conv2d,
+    activation: Optional[Callable[[Tensor], Tensor]],
+    conv2d_2: nn.Conv2d,
+):
+    r"""
+    Fusion Pattern for conv2d -> some activation module / function -> conv2d layers
+    """
+    parametrization_dict = cast(nn.ModuleDict, conv2d_1.parametrizations)
+    weight_parameterizations = cast(ParametrizationList, parametrization_dict.weight)
+    for p in weight_parameterizations:
+        if isinstance(p, FakeStructuredSparsity):
+            mask = cast(Tensor, p.mask)
+
+    prune_bias = getattr(conv2d_1, "prune_bias", False)
+    if (
+        hasattr(conv2d_2, "padding")
+        and cast(Tuple[int], conv2d_2.padding) > (0, 0)
+        and (conv2d_1.bias is not None or getattr(conv2d_1, "_bias", None) is not None)
+    ):
+        prune_conv2d_padded(conv2d_1)
+    else:
+        mask = _prune_conv2d_helper(conv2d_1)
+        if prune_bias:
+            _prune_module_bias(conv2d_1, mask)
+        else:
+            pruned_biases = _propogate_module_bias(conv2d_1, mask)
+            if pruned_biases is not None:
+                if activation:
+                    pruned_biases = activation(pruned_biases)
+                conv2d_2.bias = _get_adjusted_next_layer_bias(
+                    conv2d_2, pruned_biases, mask
+                )
+
+        if (
+            not (
+                hasattr(conv2d_2, "padding")
+                and cast(Tuple[int], conv2d_2.padding) > (0, 0)
+            )
+            or conv2d_1.bias is None
+        ):
+            with torch.no_grad():
+                if parametrize.is_parametrized(conv2d_2):
+                    parametrization_dict = cast(
+                        nn.ModuleDict, conv2d_2.parametrizations
+                    )
+                    weight_parameterizations = cast(
+                        ParametrizationList, parametrization_dict.weight
+                    )
+                    weight_parameterizations.original = nn.Parameter(
+                        weight_parameterizations.original[:, mask]
+                    )
+                    conv2d_2.in_channels = weight_parameterizations.original.shape[1]
+                else:
+                    conv2d_2.weight = nn.Parameter(conv2d_2.weight[:, mask])
+                    conv2d_2.in_channels = conv2d_2.weight.shape[1]
+
+
+def prune_conv2d_pool_activation_conv2d(
+    c1: nn.Conv2d,
+    pool: nn.Module,
+    activation: Optional[Callable[[Tensor], Tensor]],
+    c2: nn.Conv2d,
+) -> None:
+    prune_conv2d_activation_conv2d(c1, activation, c2)
+
+
+def prune_conv2d_activation_pool_conv2d(
+    c1: nn.Conv2d,
+    activation: Optional[Callable[[Tensor], Tensor]],
+    pool: nn.Module,
+    c2: nn.Conv2d,
+) -> None:
+    prune_conv2d_activation_conv2d(c1, activation, c2)
+
+
+def prune_conv2d_pool_flatten_linear(
+    conv2d: nn.Conv2d,
+    pool: nn.Module,
+    flatten: Optional[Callable[[Tensor], Tensor]],
+    linear: nn.Linear,
+) -> None:
+    mask = _prune_conv2d_helper(conv2d)
+
+    # We map the pruned indices of the Conv2d output to the flattened indices of the Linear following the Flatten layer.
+    # We determine the flattening scale (h * w) and expand the row mask accordingly
+    # (each conv output channel idx maps to the flattened range idx * h * w to (idx + 1) * h * w);
+    # the pruned biases are likewise repeated h * w times before being propagated.
+    if parametrize.is_parametrized(linear):
+        parametrization_dict = cast(nn.ModuleDict, linear.parametrizations)
+        weight_parameterizations = cast(
+            ParametrizationList, parametrization_dict.weight
+        )
+        linear_ic = weight_parameterizations.original.shape[1]
+    else:
+        linear_ic = linear.weight.shape[1]
+
+    conv2d_oc = len(mask)
+    assert (
+        linear_ic % conv2d_oc == 0
+    ), f"Flattening from dimensions {conv2d_oc} to {linear_ic} not supported"
+
+    flatten_scale = linear_ic // conv2d_oc
+    flattened_mask = torch.tensor(
+        [[val] * flatten_scale for val in mask], dtype=torch.bool, device=mask.device
+    ).flatten()
+
+    if getattr(conv2d, "prune_bias", False):
+        _prune_module_bias(conv2d, mask)
+    else:
+        pruned_biases = cast(Tensor, _propogate_module_bias(conv2d, mask))
+        flattened_pruned_biases = torch.tensor(
+            [[bias] * flatten_scale for bias in pruned_biases], device=mask.device
+        ).flatten()
+        linear.bias = _get_adjusted_next_layer_bias(
+            linear, flattened_pruned_biases, flattened_mask
+        )
+
+    with torch.no_grad():
+        if parametrize.is_parametrized(linear):
+            parametrization_dict = cast(nn.ModuleDict, linear.parametrizations)
+            weight_parameterizations = cast(
+                ParametrizationList, parametrization_dict.weight
+            )
+            weight_parameterizations.original = nn.Parameter(
+                weight_parameterizations.original[:, flattened_mask]
+            )
+            linear.in_features = weight_parameterizations.original.shape[1]
+        else:
+            linear.weight = nn.Parameter(linear.weight[:, flattened_mask])
+            linear.in_features = linear.weight.shape[1]
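+
+# Illustrative sketch of the flattened-mask expansion performed in
+# ``prune_conv2d_pool_flatten_linear`` above; the values are made up and only
+# show how each channel-mask entry is repeated h * w times:
+#
+#     >>> import torch
+#     >>> mask = [True, False, True]   # Conv2d output-channel mask
+#     >>> flatten_scale = 2            # h * w after pooling
+#     >>> torch.tensor([[v] * flatten_scale for v in mask]).flatten()
+#     tensor([ True,  True, False, False,  True,  True])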
+
+
+def prune_lstm_output_linear(
+    lstm: nn.LSTM, getitem: Callable, linear: nn.Linear
+) -> None:
+    prune_lstm_output_layernorm_linear(lstm, getitem, None, linear)
+
+
+def prune_lstm_output_layernorm_linear(
+    lstm: nn.LSTM,
+    getitem: Callable,
+    layernorm: Optional[nn.LayerNorm],
+    linear: nn.Linear,
+) -> None:
+    for i in range(lstm.num_layers):
+        if parametrize.is_parametrized(lstm, f"weight_ih_l{i}"):
+            parametrization_dict = cast(nn.ModuleDict, lstm.parametrizations)
+            weight_parameterizations = cast(
+                ParametrizationList, parametrization_dict[f"weight_ih_l{i}"]
+            )
+            mask = weight_parameterizations[0].mask
+
+            with torch.no_grad():
+                parametrize.remove_parametrizations(
+                    lstm, f"weight_ih_l{i}", leave_parametrized=True
+                )
+                setattr(
+                    lstm,
+                    f"weight_ih_l{i}",
+                    nn.Parameter(getattr(lstm, f"weight_ih_l{i}")[mask]),
+                )
+                setattr(
+                    lstm,
+                    f"bias_ih_l{i}",
+                    nn.Parameter(getattr(lstm, f"bias_ih_l{i}")[mask]),
+                )
+
+        if parametrize.is_parametrized(lstm, f"weight_hh_l{i}"):
+            parametrization_dict = cast(nn.ModuleDict, lstm.parametrizations)
+            weight_parameterizations = cast(
+                ParametrizationList, parametrization_dict[f"weight_hh_l{i}"]
+            )
+            mask = weight_parameterizations[0].mask
+
+            with torch.no_grad():
+                parametrize.remove_parametrizations(
+                    lstm, f"weight_hh_l{i}", leave_parametrized=True
+                )
+                # splitting out hidden-hidden masks
+                W_hi, W_hf, W_hg, W_ho = torch.split(
+                    getattr(lstm, f"weight_hh_l{i}"), lstm.hidden_size
+                )
+                M_hi, M_hf, M_hg, M_ho = torch.split(mask, lstm.hidden_size)
+
+                # resize each individual weight separately
+                W_hi = W_hi[M_hi][:, M_hi]
+                W_hf = W_hf[M_hf][:, M_hf]
+                W_hg = W_hg[M_hg][:, M_hg]
+                W_ho = W_ho[M_ho][:, M_ho]
+
+                # concat, use this as new weight
+                new_weight = torch.cat((W_hi, W_hf, W_hg, W_ho))
+                setattr(lstm, f"weight_hh_l{i}", nn.Parameter(new_weight))
+                setattr(
+                    lstm,
+                    f"bias_hh_l{i}",
+                    nn.Parameter(getattr(lstm, f"bias_hh_l{i}")[mask]),
+                )
+
+            # If this is the final layer, then we need to prune linear layer columns
+            if i + 1 == lstm.num_layers:
+                lstm.hidden_size = int(M_hi.sum())
+                with torch.no_grad():
+                    if parametrize.is_parametrized(linear):
+                        parametrization_dict = cast(
+                            nn.ModuleDict, linear.parametrizations
+                        )
+                        weight_parameterizations = cast(
+                            ParametrizationList, parametrization_dict.weight
+                        )
+
+                        weight_parameterizations.original = nn.Parameter(
+                            weight_parameterizations.original[:, M_ho]
+                        )
+                        linear.in_features = weight_parameterizations.original.shape[1]
+                    else:
+                        linear.weight = nn.Parameter(linear.weight[:, M_ho])
+                        linear.in_features = linear.weight.shape[1]
+
+                    # if layernorm module, prune weight and bias
+                    if layernorm is not None:
+                        layernorm.normalized_shape = (linear.in_features,)
+                        layernorm.weight = nn.Parameter(layernorm.weight[M_ho])
+                        layernorm.bias = nn.Parameter(layernorm.bias[M_ho])
+
+            # otherwise need to prune the columns of the input of the next LSTM layer
+            else:
+                with torch.no_grad():
+                    if parametrize.is_parametrized(lstm, f"weight_ih_l{i+1}"):
+                        parametrization_dict = cast(
+                            nn.ModuleDict, lstm.parametrizations
+                        )
+                        weight_parameterizations = cast(
+                            ParametrizationList,
+                            getattr(parametrization_dict, f"weight_ih_l{i+1}"),
+                        )
+
+                        weight_parameterizations.original = nn.Parameter(
+                            weight_parameterizations.original[:, M_ho]
+                        )
+                    else:
+                        next_layer_weight = getattr(lstm, f"weight_ih_l{i+1}")
+                        setattr(
+                            lstm,
+                            f"weight_ih_l{i+1}",
+                            nn.Parameter(next_layer_weight[:, M_ho]),
+                        )
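+
+# Illustrative sketch of the hidden-hidden gate handling in
+# ``prune_lstm_output_layernorm_linear`` above: the LSTM stacks its four gate
+# weights as a (4 * hidden_size, hidden_size) matrix, so each gate block is
+# masked separately and re-concatenated (shapes only, values are random):
+#
+#     >>> import torch
+#     >>> hidden_size = 4
+#     >>> W_hh = torch.randn(4 * hidden_size, hidden_size)
+#     >>> mask = torch.tensor([True, True, False, True] * 4)  # one entry per row
+#     >>> gates = torch.split(W_hh, hidden_size)
+#     >>> gate_masks = torch.split(mask, hidden_size)
+#     >>> new_W_hh = torch.cat([W[m][:, m] for W, m in zip(gates, gate_masks)])
+#     >>> new_W_hh.shape  # one hidden unit pruned from every gate
+#     torch.Size([12, 3])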
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/saliency_pruner.py b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/saliency_pruner.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe196576a9f50157ca088380e573b99988cee574
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/pruning/_experimental/pruner/saliency_pruner.py
@@ -0,0 +1,29 @@
+from .base_structured_sparsifier import BaseStructuredSparsifier
+
+
+class SaliencyPruner(BaseStructuredSparsifier):
+    """
+    Prune rows based on the saliency (L1 norm) of each row.
+
+    This pruner works on N-Dimensional weight tensors.
+    For each row, we will calculate the saliency, which is the sum of the L1 norms of all weights in that row.
+    We expect that the resulting saliency vector has the same shape as our mask.
+    We then pick elements to remove until we reach the target sparsity_level.
+    """
+
+    def update_mask(self, module, tensor_name, **kwargs):
+        # tensor_name will give you the FQN; all other entries in the sparse config are present in kwargs
+        weights = getattr(module, tensor_name)
+        mask = getattr(module.parametrizations, tensor_name)[0].mask
+
+        # use the negated per-row norms so we can use topk (we prune the rows with the smallest saliency)
+        if weights.dim() <= 1:
+            raise Exception("Structured pruning can only be applied to a 2+dim weight tensor!")
+        saliency = -weights.norm(dim=tuple(range(1, weights.dim())), p=1)
+        assert saliency.shape == mask.shape
+
+        num_to_pick = int(len(mask) * kwargs["sparsity_level"])
+        prune = saliency.topk(num_to_pick).indices
+
+        # Set the mask to be false for the rows we want to prune
+        mask.data[prune] = False
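+
+# Illustrative sketch of the row-saliency computation above on a standalone
+# tensor (the values are made up):
+#
+#     >>> import torch
+#     >>> weights = torch.tensor([[1., -1., 1.], [0.1, 0.2, 0.1], [3., 3., 3.]])
+#     >>> saliency = -weights.norm(dim=tuple(range(1, weights.dim())), p=1)
+#     >>> saliency
+#     tensor([-3.0000, -0.4000, -9.0000])
+#     >>> saliency.topk(1).indices  # the row with the smallest L1 norm is pruned
+#     tensor([1])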
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/_mappings.py b/MLPY/Lib/site-packages/torch/ao/pruning/_mappings.py
new file mode 100644
index 0000000000000000000000000000000000000000..aaad70fb7b9e6da48bfbd1ad31653efdc6e24bb9
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/pruning/_mappings.py
@@ -0,0 +1,18 @@
+__all__ = [
+    "get_static_sparse_quantized_mapping",
+    "get_dynamic_sparse_quantized_mapping",
+]
+
+def get_static_sparse_quantized_mapping():
+    import torch.ao.nn.sparse
+    _static_sparse_quantized_mapping = {
+        torch.nn.Linear: torch.ao.nn.sparse.quantized.Linear,
+    }
+    return _static_sparse_quantized_mapping
+
+def get_dynamic_sparse_quantized_mapping():
+    import torch.ao.nn.sparse
+    _dynamic_sparse_quantized_mapping = {
+        torch.nn.Linear: torch.ao.nn.sparse.quantized.dynamic.Linear,
+    }
+    return _dynamic_sparse_quantized_mapping
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/scheduler/__init__.py b/MLPY/Lib/site-packages/torch/ao/pruning/scheduler/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/scheduler/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/pruning/scheduler/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6d73b0664ecf49ca2fa809bc87ab1184cde94686
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/pruning/scheduler/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/scheduler/__pycache__/base_scheduler.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/pruning/scheduler/__pycache__/base_scheduler.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..76a9b97c3b0e336562af0f3995b62f7f08ed9ecb
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/pruning/scheduler/__pycache__/base_scheduler.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/scheduler/__pycache__/cubic_scheduler.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/pruning/scheduler/__pycache__/cubic_scheduler.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2d7b50bbacda61fb3fb3de930b374ee643a8129d
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/pruning/scheduler/__pycache__/cubic_scheduler.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/scheduler/__pycache__/lambda_scheduler.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/pruning/scheduler/__pycache__/lambda_scheduler.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a3460022221cef516fb1a969d22ad759a1b11fce
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/pruning/scheduler/__pycache__/lambda_scheduler.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/scheduler/base_scheduler.py b/MLPY/Lib/site-packages/torch/ao/pruning/scheduler/base_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..aaec38eb3112d756b52f302106377609a4b92317
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/pruning/scheduler/base_scheduler.py
@@ -0,0 +1,159 @@
+
+from torch.ao.pruning import BaseSparsifier
+
+from functools import wraps
+import warnings
+import weakref
+
+__all__ = ["BaseScheduler"]
+
+class BaseScheduler:
+
+    def __init__(self, sparsifier, last_epoch=-1, verbose=False):
+
+        # Attach sparsifier
+        if not isinstance(sparsifier, BaseSparsifier):
+            raise TypeError(f'{type(sparsifier).__name__} is not an instance of torch.ao.pruning.BaseSparsifier')
+        self.sparsifier = sparsifier
+
+        # Initialize epoch and base sparsity levels
+
+        self.base_sl = [group['sparsity_level'] for group in sparsifier.groups]
+        self.last_epoch = last_epoch
+
+        # Following https://github.com/pytorch/pytorch/issues/20124
+        # We would like to ensure that `scheduler.step()` is called after
+        # `sparsifier.step()`
+        def with_counter(method):
+            if getattr(method, '_with_counter', False):
+                # `sparsifier.step()` has already been replaced, return.
+                return method
+
+            # Keep a weak reference to the sparsifier instance to prevent
+            # cyclic references.
+            instance_ref = weakref.ref(method.__self__)
+            # Get the unbound method for the same purpose.
+            func = method.__func__
+            cls = instance_ref().__class__
+            del method
+
+            @wraps(func)
+            def wrapper(*args, **kwargs):
+                instance = instance_ref()
+                instance._step_count += 1  # type: ignore[union-attr]
+                wrapped = func.__get__(instance, cls)
+                return wrapped(*args, **kwargs)
+
+            # Note that the returned function here is no longer a bound method,
+            # so attributes like `__func__` and `__self__` no longer exist.
+            wrapper._with_counter = True  # type: ignore[attr-defined]
+            return wrapper
+
+        self.sparsifier.step = with_counter(self.sparsifier.step)  # type: ignore[assignment]
+        self.sparsifier._step_count = 0  # type: ignore[attr-defined]
+        self._step_count: int = 0
+        self.verbose = verbose
+
+        # Housekeeping
+        self._get_sl_called_within_step: bool = False
+
+        self.step()
+
+    def state_dict(self):
+        """Returns the state of the scheduler as a :class:`dict`.
+
+        It contains an entry for every variable in self.__dict__ which
+        is not the sparsifier.
+        """
+        return {key: value for key, value in self.__dict__.items() if key != 'sparsifier'}
+
+    def load_state_dict(self, state_dict):
+        """Loads the schedulers state.
+
+        Args:
+            state_dict (dict): scheduler state. Should be an object returned
+                from a call to :meth:`state_dict`.
+        """
+        self.__dict__.update(state_dict)
+
+    def get_last_sl(self):
+        """ Return last computed sparsity level by current scheduler.
+        """
+        return self._last_sl
+
+    def get_sl(self):
+        # Compute sparsity level using chainable form of the scheduler
+        # Note: This method is not intended to be called directly, and is only
+        #       used by the ".step" method. Use .get_last_sl() instead.
+        if not self._get_sl_called_within_step:
+            warnings.warn(
+                "To get the last sparsity level computed by the scheduler, "
+                "please use `get_last_sl()`.")
+        raise NotImplementedError
+
+    def print_sl(self, is_verbose, group, sl, epoch=None):
+        """Display the current sparsity level.
+        """
+        if is_verbose:
+            if epoch is None:
+                print(f'Adjusting sparsity level of group {group} to {sl:.4e}.')
+            else:
+                print(f'Epoch {epoch:5d}: adjusting sparsity level of group {group} to {sl:.4e}.')
+
+    def __repr__(self):
+        format_string = self.__class__.__name__ + ' ('
+        format_string += '\n'
+        format_string += f'Sparsifier {self.sparsifier}\n'
+        format_string += f'    base_sl: {self.base_sl}\n'
+        format_string += ')'
+        return format_string
+
+    def step(self, epoch=None):
+        # Raise warning if trying to call scheduler step before the sparsifier.
+        # https://github.com/pytorch/pytorch/issues/20124
+        if self._step_count == 1:
+            if not hasattr(self.sparsifier.step, "_with_counter"):
+                warnings.warn("Seems like `sparsifier.step()` has been overridden after sparsity scheduler "
+                              "initialization. Please, make sure to call `sparsifier.step()` before "
+                              "`scheduler.step()`.", UserWarning)
+
+            # Just check if there were two first scheduler.step() calls before sparsifier.step()
+            elif self.sparsifier._step_count < 1:  # type: ignore[attr-defined]
+                warnings.warn("Detected call of `scheduler.step()` before `sparsifier.step()`. "
+                              "You have to make sure you run the sparsifier.step() BEFORE any "
+                              "calls to the scheduler.step().", UserWarning)
+        self._step_count += 1
+
+        class _enable_get_sl_call:
+
+            def __init__(self, o):
+                self.o = o
+
+            def __enter__(self):
+                self.o._get_sl_called_within_step = True
+                return self
+
+            def __exit__(self, type, value, traceback):
+                self.o._get_sl_called_within_step = False
+
+        with _enable_get_sl_call(self):
+            self.last_epoch += 1
+            values = self.get_sl()
+
+        for i, data in enumerate(zip(self.sparsifier.groups, values)):
+            param_group, sl = data
+            param_group['sparsity_level'] = sl
+            self.print_sl(self.verbose, i, sl, epoch)
+
+        self._last_sl = [group['sparsity_level'] for group in self.sparsifier.groups]
+        self.sparsifier.enable_mask_update = True
+
+    def _make_sure_a_list(self, var):
+        r"""Utility that extends it to the same length as the .groups, ensuring it is a list"""
+        n = len(self.sparsifier.groups)
+        if not isinstance(var, (list, tuple)):
+            return [var] * n
+        else:
+            if len(var) != n:
+                raise ValueError(f"Expected variable of length {n}, but got {len(var)}")
+            return list(var)  # We want the result to be in a list, not tuple
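+
+# Illustrative sketch of the call order that the warnings in ``step()`` enforce;
+# ``MySparsifier``, ``MyScheduler`` and the training helpers below are
+# placeholders, not names defined in this module:
+#
+#     sparsifier = MySparsifier(defaults={"sparsity_level": 0.5})
+#     sparsifier.prepare(model, config)
+#     scheduler = MyScheduler(sparsifier)   # any BaseScheduler subclass
+#     for epoch in range(num_epochs):
+#         train_one_epoch(model)
+#         sparsifier.step()                 # update the masks first ...
+#         scheduler.step()                  # ... then advance the sparsity schedule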
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/scheduler/cubic_scheduler.py b/MLPY/Lib/site-packages/torch/ao/pruning/scheduler/cubic_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..fac7f8212478892e94322873f870bea5722a657e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/pruning/scheduler/cubic_scheduler.py
@@ -0,0 +1,107 @@
+import warnings
+
+from .base_scheduler import BaseScheduler
+
+__all__ = ["CubicSL"]
+
+def _clamp(x, lo, hi):
+    return max(lo, min(hi, x))
+
+
+class CubicSL(BaseScheduler):
+    r"""Sets the sparsity level of each parameter group to the final sl
+    plus a given exponential function.
+
+    .. math::
+
+        s_i = s_f + (s_0 - s_f) \cdot \left( 1 - \frac{t - t_0}{n\Delta t} \right)^3
+
+    where :math:`s_i` is the sparsity at epoch :math:`t`, :math;`s_f` is the final
+    sparsity level, :math:`f(i)` is the function to be applied to the current epoch
+    :math:`t`, initial epoch :math:`t_0`, and final epoch :math:`t_f`.
+    :math:`\Delta t` is used to control how often the update of the sparsity level
+    happens. By default,
+
+    Args:
+        sparsifier (BaseSparsifier): Wrapped sparsifier.
+        init_sl (float, list): Initial level of sparsity
+        init_t (int, list): Initial step, when pruning starts
+        delta_t (int, list): Pruning frequency
+        total_t (int, list): Total number of pruning steps
+        initially_zero (bool, list): If True, sets the level of sparsity to 0
+            before init_t (:math:`t_0`). Otherwise, the sparsity level before
+            init_t (:math:`t_0`) is set to init_sl (:math:`s_0`).
+        last_epoch (int): The index of last epoch. Default: -1.
+        verbose (bool): If ``True``, prints a message to stdout for
+            each update. Default: ``False``.
+    """
+    def __init__(self,
+                 sparsifier,
+                 init_sl=0.0,
+                 init_t=0,
+                 delta_t=10,
+                 total_t=100,
+                 initially_zero=False,
+                 last_epoch=-1,
+                 verbose=False
+                 ):
+        self.sparsifier = sparsifier
+
+        self.init_sl = self._make_sure_a_list(init_sl)
+        self.init_t = self._make_sure_a_list(init_t)
+        self.delta_t = self._make_sure_a_list(delta_t)
+        self.total_t = self._make_sure_a_list(total_t)
+
+        self.initially_zero = self._make_sure_a_list(initially_zero)
+
+        super().__init__(sparsifier, last_epoch, verbose)
+
+    @staticmethod
+    def sparsity_compute_fn(s_0, s_f, t, t_0, dt, n, initially_zero=False):
+        r""""Computes the current level of sparsity.
+
+        Based on https://arxiv.org/pdf/1710.01878.pdf
+
+        Args:
+            s_0: Initial level of sparsity, :math:`s_0`
+            s_f: Target level of sparsity, :math:`s_f`
+            t: Current step, :math:`t`
+            t_0: Initial step, :math:`t_0`
+            dt: Pruning frequency, :math:`\Delta T`
+            n: Pruning steps, :math:`n`
+            initially_zero: Sets the level of sparsity to 0 before t_0.
+                If False, sets to s_0
+
+        Returns:
+            The sparsity level :math:`s_t` at the current step :math:`t`
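+
+        Example (illustrative; outputs are rounded for readability, and the
+        helper can be called directly since it is a ``staticmethod``)::
+
+            >>> round(CubicSL.sparsity_compute_fn(s_0=0.0, s_f=0.9, t=5, t_0=0, dt=1, n=10), 4)
+            0.7875
+            >>> round(CubicSL.sparsity_compute_fn(s_0=0.0, s_f=0.9, t=20, t_0=0, dt=1, n=10), 4)
+            0.9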
+        """
+        if initially_zero and t < t_0:
+            return 0
+        s_t = s_f + (s_0 - s_f) * (1.0 - (t - t_0) / (dt * n)) ** 3
+        s_t = _clamp(s_t, s_0, s_f)
+        return s_t
+
+    def get_sl(self):
+        if not self._get_sl_called_within_step:
+            warnings.warn(
+                "To get the last sparsity level computed by the scheduler, "
+                "please use `get_last_sl()`.")
+        return [
+            self.sparsity_compute_fn(
+                s_0=initial_sparsity,
+                s_f=final_sparsity,
+                t=self.last_epoch,
+                t_0=initial_epoch,
+                dt=delta_epoch,
+                n=interval_epochs,
+                initially_zero=initially_zero
+            ) for initial_sparsity, final_sparsity, initial_epoch, delta_epoch, interval_epochs, initially_zero in
+            zip(
+                self.init_sl,
+                self.base_sl,
+                self.init_t,
+                self.delta_t,
+                self.total_t,
+                self.initially_zero
+            )
+        ]
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/scheduler/lambda_scheduler.py b/MLPY/Lib/site-packages/torch/ao/pruning/scheduler/lambda_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..65bf3998757bfe6f35c1bb57d6281c016473fc0d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/pruning/scheduler/lambda_scheduler.py
@@ -0,0 +1,47 @@
+import warnings
+
+from .base_scheduler import BaseScheduler
+
+__all__ = ["LambdaSL"]
+
+class LambdaSL(BaseScheduler):
+    """Sets the sparsity level of each parameter group to the final sl
+    times a given function. When last_epoch=-1, sets initial sl as zero.
+    Args:
+        sparsifier (BaseSparsifier): Wrapped sparsifier.
+        sl_lambda (function or list): A function which computes a multiplicative
+            factor given an integer parameter epoch, or a list of such
+            functions, one for each group in sparsifier.groups.
+        last_epoch (int): The index of last epoch. Default: -1.
+        verbose (bool): If ``True``, prints a message to stdout for
+            each update. Default: ``False``.
+    Example:
+        >>> # Assuming sparsifier has two groups.
+        >>> lambda1 = lambda epoch: epoch // 30
+        >>> lambda2 = lambda epoch: 0.95 ** epoch
+        >>> # xdoctest: +SKIP
+        >>> scheduler = LambdaSL(sparsifier, sl_lambda=[lambda1, lambda2])
+        >>> for epoch in range(100):
+        >>>     train(...)
+        >>>     validate(...)
+        >>>     scheduler.step()
+    """
+
+    def __init__(self, sparsifier, sl_lambda, last_epoch=-1, verbose=False):
+        self.sparsifier = sparsifier
+
+        if not isinstance(sl_lambda, list) and not isinstance(sl_lambda, tuple):
+            self.sl_lambdas = [sl_lambda] * len(sparsifier.groups)
+        else:
+            if len(sl_lambda) != len(sparsifier.groups):
+                raise ValueError(f"Expected {len(sparsifier.groups)} lr_lambdas, but got {len(sl_lambda)}")
+            self.sl_lambdas = list(sl_lambda)
+        super().__init__(sparsifier, last_epoch, verbose)
+
+    def get_sl(self):
+        if not self._get_sl_called_within_step:
+            warnings.warn(
+                "To get the last sparsity level computed by the scheduler, "
+                "please use `get_last_sl()`.")
+        return [base_sl * lmbda(self.last_epoch)
+                for lmbda, base_sl in zip(self.sl_lambdas, self.base_sl)]
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/sparsifier/__init__.py b/MLPY/Lib/site-packages/torch/ao/pruning/sparsifier/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/sparsifier/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/pruning/sparsifier/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..518459be9ee9c87f384d8686d8c6fb5b168749e1
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/pruning/sparsifier/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/sparsifier/__pycache__/base_sparsifier.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/pruning/sparsifier/__pycache__/base_sparsifier.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fbb8273c33af3b7a6f6ad335e33aa3b8b727e98b
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/pruning/sparsifier/__pycache__/base_sparsifier.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/sparsifier/__pycache__/nearly_diagonal_sparsifier.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/pruning/sparsifier/__pycache__/nearly_diagonal_sparsifier.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..15b94ee81735e01b3b6ee61146204755a009a133
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/pruning/sparsifier/__pycache__/nearly_diagonal_sparsifier.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/sparsifier/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/pruning/sparsifier/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..87557d0f95fde903085ef80887350effbe11950c
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/pruning/sparsifier/__pycache__/utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/sparsifier/__pycache__/weight_norm_sparsifier.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/pruning/sparsifier/__pycache__/weight_norm_sparsifier.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8537ad7d0c9bd51fdb1a2ab9abf50de06c6cd2db
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/pruning/sparsifier/__pycache__/weight_norm_sparsifier.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/sparsifier/base_sparsifier.py b/MLPY/Lib/site-packages/torch/ao/pruning/sparsifier/base_sparsifier.py
new file mode 100644
index 0000000000000000000000000000000000000000..89165aab967982bce9700b047f17421cb023c742
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/pruning/sparsifier/base_sparsifier.py
@@ -0,0 +1,353 @@
+import abc
+import copy
+from collections import defaultdict
+from typing import Any, Dict, Optional, Set, Tuple, List, Type
+
+import torch
+from torch import nn
+from torch.nn.utils import parametrize
+from torch.nn.utils.parametrize import type_before_parametrizations
+
+from .utils import (
+    module_contains_param,
+    swap_module,
+    FakeSparsity,
+    get_arg_info_from_tensor_fqn,
+    module_to_fqn,
+)
+
+__all__ = ["BaseSparsifier"]
+
+SUPPORTED_MODULES = {nn.Linear}
+
+KEYS_NOT_IN_STATE_DICT = ["module", "module_fqn", "tensor_name"]
+
+__all__ = ["BaseSparsifier"]
+
+
+# TODO update desc with new config args
+class BaseSparsifier(abc.ABC):
+    r"""Base class for all sparsifiers.
+
+    Abstract methods that need to be implemented:
+
+    - update_mask: Function to compute a new mask for all keys in the
+        `groups`.
+
+    Args:
+        - model [nn.Module]: model to configure. The model itself is not saved
+            but used for the state_dict saving / loading.
+        - config [list]: configuration elements should be a dict map that includes
+            `tensor_fqn` of tensors to sparsify
+        - defaults [dict]: default configurations will be attached to the
+            configuration. Only the keys that don't exist in the `config` will
+            be updated.
+
+    Example::
+
+        >>> # xdoctest: +SKIP("Can't instantiate abstract class BaseSparsifier with abstract method update_mask")
+        >>> config = [{'tensor_fqn': 'layer1.weight'}, {'tensor_fqn': 'linear2.weight2', 'sparsity_level': 0.5}]
+        >>> defaults = {'sparsity_level': 0.7}
+        >>> # model.layer1.weight will have `sparsity_level` = 0.7 (getting default)
+        >>> sparsifier = BaseSparsifier(defaults)
+        >>> sparsifier.prepare(model, config)
+    """
+
+    def __init__(self, defaults: Optional[Dict[str, Any]] = None):
+        super().__init__()
+        self.defaults: Dict[str, Any] = defaults or {}
+
+        self.state: Dict[str, Dict] = defaultdict(dict)
+        self.groups: List[Dict[str, Any]] = []
+        self.enable_mask_update = True
+
+    def __getstate__(self) -> Dict[str, Any]:
+        return {
+            "defaults": self.defaults,
+            "state": self.state,
+            "groups": self.groups,
+        }
+
+    def __setstate__(self, state: Dict[str, Dict[str, Any]]) -> None:
+        self.__dict__.update(state)
+
+    def __repr__(self):
+        format_string = self.__class__.__name__ + " ("
+        for i, sparse_args in enumerate(self.groups):
+            module = sparse_args["module"]
+            format_string += "\n"
+            format_string += f"\tGroup {i}\n"
+            format_string += f"\t    module: {module}\n"
+            for key in sorted(sparse_args.keys()):
+                if key == "module":
+                    continue
+                format_string += f"\t    {key}: {sparse_args[key]}\n"
+        format_string += ")"
+        return format_string
+
+    def state_dict(self) -> Dict[str, Any]:
+        r"""Returns the state of the optimizer as a :class:`dict`.
+
+        It contains:
+        * state - current state of the sparsification.
+        * groups - a list containing all sparsity configuration groups
+            with the key 'tensor_fqn' specifying the path to the sparsified tensor within a model
+
+        TODO: Need a clean way of loading the state of the "prepared" module
+        """
+
+        groups: List[Dict[str, Any]] = [
+            dict(
+                filter(
+                    lambda key_value: key_value[0] not in KEYS_NOT_IN_STATE_DICT,
+                    mg.items(),
+                )
+            )
+            for mg in self.groups
+        ]
+
+        return {
+            "state": self.state,
+            "groups": groups,
+        }
+
+    def load_state_dict(self, state_dict: Dict[str, Any], strict: bool = True):
+        groups = copy.deepcopy(state_dict["groups"])
+        states = state_dict["state"]
+        for tensor_fqn, s in states.items():
+            arg_info = get_arg_info_from_tensor_fqn(self.model, tensor_fqn)
+            module = arg_info["module"]
+            tensor_name = arg_info["tensor_name"]
+            if strict and module is None:
+                raise RuntimeError(f"Error loading {tensor_fqn} into the model")
+
+            found = False
+            for p in module.parametrizations[tensor_name]:
+                if isinstance(p, FakeSparsity):
+                    found = True
+                    break
+            if not found:
+                p = FakeSparsity(torch.ones(getattr(module, tensor_name).shape))
+                parametrize.register_parametrization(module, tensor_name, p)
+            if s.get("mask", None) is not None:
+                mask = s.pop("mask")
+                p.mask = mask
+
+            for mg in groups:
+                if mg["tensor_fqn"] == tensor_fqn:
+                    mg.update(arg_info)
+        self.__setstate__({"state": states, "groups": groups})
+
+    def make_config_from_model(
+        self,
+        model: nn.Module,
+        SUPPORTED_MODULES: Set[Type] = SUPPORTED_MODULES,
+    ) -> None:
+        self.config = []
+        stack = [model]
+        while stack:
+            module = stack.pop()
+            for name, child in module.named_children():
+                if type(child) in SUPPORTED_MODULES:
+                    module_fqn = module_to_fqn(model, child)
+                    assert isinstance(module_fqn, str)  # for mypy
+                    self.config.append({"tensor_fqn": module_fqn + ".weight"})
+                else:
+                    stack.append(child)
+
+    def prepare(self, model, config):
+        r"""Prepares a model, by adding the parametrizations.
+
+        Note::
+
+            The model is modified inplace. If you need to preserve the original
+            model, use copy.deepcopy.
+        """
+        self.model = model  # TODO: Need to figure out how to load without this.
+        self.config = config
+
+        # If no config -- try getting all the supported layers
+        if self.config is None:
+            self.make_config_from_model(model)
+
+        # TODO: Remove the configuration by reference ('module')
+        for module_config in self.config:
+            assert isinstance(module_config, dict), (
+                "config elements should be dicts not modules i.e.:"
+                "[{`tensor_fqn`: `foo.bar.weight`}, {`tensor_fqn`: ... }, ...]"
+            )
+
+            assert isinstance(self.defaults, Dict)  # for mypy
+            local_args = copy.deepcopy(self.defaults)
+            local_args.update(module_config)
+
+            tensor_fqn = local_args.get("tensor_fqn", None)
+            assert tensor_fqn is not None, (
+                "tensor_fqn is a required argument in the sparsity config which"
+                "replaces previous `module` and [module]`fqn` arguments"
+            )
+
+            # populate all information from tensor_fqn
+            info_from_tensor_fqn = get_arg_info_from_tensor_fqn(model, tensor_fqn)
+
+            # check that whatever was put into local_args agrees with what was obtained
+            # from tensor_fqn
+            for key in info_from_tensor_fqn.keys():
+                if key in local_args:
+                    assert (
+                        info_from_tensor_fqn[key] == local_args[key]
+                        or (
+                            key == "tensor_fqn"
+                            and "." + info_from_tensor_fqn[key] == local_args[key]
+                        )
+                        # info_from_tensor_fqn will chop leading '.' from tensor_fqn so ignore that
+                    ), (
+                        f"Given both `{key}` and `tensor_fqn` in the config, it is expected them to agree!"
+                    )
+            local_args.update(info_from_tensor_fqn)
+            self.groups.append(local_args)
+        self._prepare()
+
+    def _prepare(self, *args, **kwargs):
+        r"""Adds mask parametrization to the layer weight"""
+        for config in self.groups:
+            module = config["module"]
+            tensor_name = config["tensor_name"]
+            parametrization = config.get("parametrization", FakeSparsity)
+            mask = config.get("mask", torch.ones_like(getattr(module, tensor_name)))
+            self.state[config["tensor_fqn"]]["mask"] = mask
+            parametrize.register_parametrization(
+                module, tensor_name, parametrization(mask)
+            )
+
+    def squash_mask(
+        self,
+        params_to_keep: Optional[Tuple[str, ...]] = None,
+        params_to_keep_per_layer: Optional[Dict[str, Tuple[str, ...]]] = None,
+        *args,
+        **kwargs,
+    ):
+        r"""Squashes the sparse masks into the appropriate tensors.
+
+        If either the `params_to_keep` or `params_to_keep_per_layer` is set,
+        the module will have a `sparse_params` dict attached to it.
+
+        Args:
+            params_to_keep: List of keys to save in the module or a dict
+                            representing the modules and keys that will have
+                            sparsity parameters saved
+            params_to_keep_per_layer: Dict to specify the params that should be
+                            saved for specific layers. The keys in the dict
+                            should be the module fqn, while the values should
+                            be a list of strings with the names of the variables
+                            to save in the `sparse_params`
+
+        Examples:
+            >>> # xdoctest: +SKIP("locals are undefined")
+            >>> # Don't save any sparse params
+            >>> sparsifier.squash_mask()
+            >>> hasattr(model.submodule1, 'sparse_params')
+            False
+
+            >>> # Keep sparse params per layer
+            >>> sparsifier.squash_mask(
+            ...     params_to_keep_per_layer={
+            ...         'submodule1.linear1': ('foo', 'bar'),
+            ...         'submodule2.linear42': ('baz',)
+            ...     })
+            >>> print(model.submodule1.linear1.sparse_params)
+            {'foo': 42, 'bar': 24}
+            >>> print(model.submodule2.linear42.sparse_params)
+            {'baz': 0.1}
+
+            >>> # Keep sparse params for all layers
+            >>> sparsifier.squash_mask(params_to_keep=('foo', 'bar'))
+            >>> print(model.submodule1.linear1.sparse_params)
+            {'foo': 42, 'bar': 24}
+            >>> print(model.submodule2.linear42.sparse_params)
+            {'foo': 42, 'bar': 24}
+
+            >>> # Keep some sparse params for all layers, and specific ones for
+            >>> # some other layers
+            >>> sparsifier.squash_mask(
+            ...     params_to_keep=('foo', 'bar'),
+            ...     params_to_keep_per_layer={
+            ...         'submodule2.linear42': ('baz',)
+            ...     })
+            >>> print(model.submodule1.linear1.sparse_params)
+            {'foo': 42, 'bar': 24}
+            >>> print(model.submodule2.linear42.sparse_params)
+            {'foo': 42, 'bar': 24, 'baz': 0.1}
+        """
+        for config in self.groups:
+            module = config["module"]
+            tensor_name = config["tensor_name"]
+            parametrize.remove_parametrizations(
+                module, tensor_name, leave_parametrized=True
+            )
+            sparse_params = {}
+            if params_to_keep is not None:
+                global_params = {k: config[k] for k in params_to_keep}
+                sparse_params.update(global_params)
+            if params_to_keep_per_layer is not None:
+                params = params_to_keep_per_layer.get(config["module_fqn"], None)
+                if params is not None:
+                    per_layer_params = {k: config[k] for k in params}
+                    sparse_params.update(per_layer_params)
+            if sparse_params:
+                # TODO handle multiple tensor being quantized on a single module, where to store sparse_params?
+                module.sparse_params = sparse_params
+
+    def convert(
+        self,
+        module: nn.Module,
+        mapping: Optional[Dict[Type[nn.Module], Type[nn.Module]]] = None,
+        inplace: bool = False,
+        parameterization: Type[nn.Module] = FakeSparsity,
+    ):
+        r"""Converts submodules in input module to a different module according to `mapping`
+    by calling the `from_dense` method on the target module class.
+        Args:
+            module: input module
+            mapping: a dictionary that maps from source module type to target
+                module type, can be overwritten to allow swapping user defined
+                Modules
+            inplace: carry out model transformations in-place, the original module
+                is mutated
+        """
+        if mapping is None:
+            raise NotImplementedError("Need to auto generate mapping ")
+        if not inplace:
+            module = copy.deepcopy(module)
+
+        reassign = {}
+        for name, mod in module.named_children():
+            # leaf node
+            if (
+                module_contains_param(mod, parameterization)
+                and type_before_parametrizations(mod) in mapping
+            ):
+                reassign[name] = swap_module(mod, mapping)
+            else:
+                # recurse
+                reassign[name] = self.convert(
+                    mod,
+                    mapping=mapping,
+                    inplace=True,
+                    parameterization=parameterization,
+                )
+
+        for key, value in reassign.items():
+            module._modules[key] = value
+
+        return module
+
+    def step(self, use_path: bool = True) -> None:
+        if not self.enable_mask_update:
+            return
+        with torch.no_grad():
+            for config in self.groups:
+                self.update_mask(**config)
+
+    @abc.abstractmethod
+    def update_mask(self, module: nn.Module, tensor_name: str, **kwargs):
+        pass
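+
+# Illustrative sketch of a minimal concrete subclass: ``update_mask`` is the
+# only abstract method, and ``ThresholdSparsifier`` together with its
+# ``threshold`` config entry are hypothetical names used only for this example:
+#
+#     >>> import torch
+#     >>> from torch import nn
+#     >>> class ThresholdSparsifier(BaseSparsifier):
+#     ...     def update_mask(self, module, tensor_name, threshold, **kwargs):
+#     ...         mask = getattr(module.parametrizations, tensor_name)[0].mask
+#     ...         mask.data = getattr(module, tensor_name).abs() > threshold
+#     >>> model = nn.Sequential(nn.Linear(4, 4))
+#     >>> sparsifier = ThresholdSparsifier(defaults={"threshold": 0.1})
+#     >>> sparsifier.prepare(model, config=[{"tensor_fqn": "0.weight"}])
+#     >>> sparsifier.step()         # runs update_mask for every configured group
+#     >>> sparsifier.squash_mask()  # folds the masks back into the dense weights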
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/sparsifier/nearly_diagonal_sparsifier.py b/MLPY/Lib/site-packages/torch/ao/pruning/sparsifier/nearly_diagonal_sparsifier.py
new file mode 100644
index 0000000000000000000000000000000000000000..2664b3480cdf14e11d7c25beabedb035aab1306b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/pruning/sparsifier/nearly_diagonal_sparsifier.py
@@ -0,0 +1,55 @@
+import torch
+
+from . import base_sparsifier
+
+
+class NearlyDiagonalSparsifier(base_sparsifier.BaseSparsifier):
+    r"""Nearly Diagonal Sparsifier
+
+    This sparsifier creates a nearly diagonal mask to be applied to the weight matrix.
+    Nearly Diagonal Matrix is a matrix that contains non-zero elements near the diagonal and the rest are zero.
+    Examples of nearly diagonal matrices with degree (or nearliness) 3 and 5, respectively, follow.
+    1 1 0 0       1 1 1 0
+    1 1 1 0       1 1 1 1
+    0 1 1 1       1 1 1 1
+    0 0 1 1       0 1 1 1
+    Note that a nearly diagonal matrix with degree 1 is just a matrix with only the main diagonal populated.
+
+    This sparsifier is controlled by one variable:
+    1. `nearliness` defines the number of non-zero diagonal lines that are closest to the main diagonal.
+        Currently, only odd numbers are supported.
+
+    Note:
+        This can be accelerated (vectorized) once the Spdiagonal feature (PR: #78439) is landed or the banded matrix
+        feature is landed: https://stackoverflow.com/questions/52463972/generating-banded-matrices-using-numpy
+
+    Args:
+        nearliness: The degree of nearliness (default = 1)
+
+    """
+    def __init__(self, nearliness: int = 1):
+        defaults = {'nearliness': nearliness}
+        super().__init__(defaults=defaults)
+
+    def update_mask(self, module, tensor_name, nearliness,
+                    **kwargs):
+        mask = getattr(module.parametrizations, tensor_name)[0].mask
+        mask.data = torch.zeros_like(mask)
+        if nearliness <= 0:
+            return
+
+        tensor = getattr(module, tensor_name)
+        height, width = tensor.shape
+
+        if nearliness % 2 == 0:
+            raise ValueError("nearliness can only be an odd number")
+        dist_to_diagonal = nearliness // 2
+        # check
+        if dist_to_diagonal >= min(height, width):
+            raise ValueError("nearliness cannot be larger than the dimensions of tensor.")
+
+        for row in range(0, height):
+            # Bounds of entries that needs to be set to 1
+            low = max(0, row - dist_to_diagonal)
+            high = min(width, row + dist_to_diagonal + 1)
+            mask[row, low:high].fill_(1)
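+
+# Illustrative sketch of the banded mask built above, reproduced on a plain
+# 4x4 tensor with nearliness=3 (this matches the degree-3 example in the class
+# docstring):
+#
+#     >>> import torch
+#     >>> height = width = 4; dist_to_diagonal = 3 // 2
+#     >>> mask = torch.zeros(height, width)
+#     >>> for row in range(height):
+#     ...     mask[row, max(0, row - dist_to_diagonal):min(width, row + dist_to_diagonal + 1)] = 1
+#     >>> mask
+#     tensor([[1., 1., 0., 0.],
+#             [1., 1., 1., 0.],
+#             [0., 1., 1., 1.],
+#             [0., 0., 1., 1.]])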
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/sparsifier/utils.py b/MLPY/Lib/site-packages/torch/ao/pruning/sparsifier/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3df67b3b53b9548538ce5d298de3a976397c5f6
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/pruning/sparsifier/utils.py
@@ -0,0 +1,136 @@
+from typing import Any, Dict, Optional, Type
+from torch.nn.utils.parametrize import type_before_parametrizations, is_parametrized
+from itertools import chain
+
+from torch import nn
+
+__all__ = [
+    "module_contains_param",
+    "swap_module",
+    "module_to_fqn",
+    "fqn_to_module",
+    "get_arg_info_from_tensor_fqn",
+    "FakeSparsity",
+]
+
+
+def module_contains_param(module: nn.Module, parametrization: Type[nn.Module]) -> bool:
+    if is_parametrized(module):
+        # see if any of the module tensors have a parametrization attached that matches the one passed in
+        return any(
+            any(isinstance(param, parametrization) for param in param_list)
+            for key, param_list in module.parametrizations.items()  # type: ignore[union-attr,operator]
+        )
+    return False
+
+
+def swap_module(
+    mod: nn.Module, mapping: Dict[Type[nn.Module], Type[nn.Module]]
+) -> nn.Module:
+    r"""Swaps the module using from_dense according to the mapping passed in.
+    Args:
+        mod: input module
+        mapping: a dictionary that maps from nn module to sparse nn module
+    Return:
+        The corresponding sparse module of `mod` according to mapping, created using from_dense
+    """
+    if type_before_parametrizations(mod) in mapping:
+        sparse_mod = mapping[type_before_parametrizations(mod)]
+
+        # TODO Fix this typing, as Type[Module] has no attribute "from_dense"
+        new_mod = sparse_mod.from_dense(mod)  # type: ignore[attr-defined]
+
+        # Preserve module's pre forward hooks. They'll be called on quantized input
+        for pre_hook_fn in mod._forward_pre_hooks.values():
+            new_mod.register_forward_pre_hook(pre_hook_fn)
+        # Preserve module's post forward hooks except _observer_forward_hook
+        # After convert they'll work with quantized output
+        for hook_fn in mod._forward_hooks.values():
+            new_mod.register_forward_hook(hook_fn)
+
+        # respect device affinity when swapping modules
+        devices = {p.device for p in chain(mod.parameters(), mod.buffers())}
+        assert len(devices) <= 1, (
+            f"swap_module only works with cpu or single-device CUDA modules, but got devices {devices}"
+        )
+        device = next(iter(devices)) if len(devices) > 0 else None
+        if device:
+            new_mod.to(device)
+
+        return new_mod
+
+    else:
+        return mod
+
+
+def module_to_fqn(
+    model: nn.Module, module: nn.Module, prefix: str = ""
+) -> Optional[str]:
+    """
+    Returns the fqn for a module, or None if the module is not a descendant of the model.
+    """
+    if module is model:
+        return ""
+    for name, child in model.named_children():
+        fqn = module_to_fqn(child, module, ".")
+        if isinstance(fqn, str):
+            return prefix + name + fqn
+    return None
+
+
+def fqn_to_module(model: Optional[nn.Module], path: str) -> Optional[nn.Module]:
+    """
+    Given an fqn, returns the corresponding module or tensor or None if the fqn given by `path`
+    doesn't correspond to anything. Similar to model.get_submodule(path) but works for tensors.
+    """
+    if path != "":
+        for name in path.split("."):
+            model = getattr(model, name, None)
+    return model
+
+
+def get_arg_info_from_tensor_fqn(model: nn.Module, tensor_fqn: str) -> Dict[str, Any]:
+    """
+    Uses tensor_fqn to obtain a dict containing module_fqn, module and tensor_name
+    """
+    # string manip to split tensor_fqn into module_fqn and tensor_name
+    # if tensor_fqn is 'weight' then module_fqn and tensor_name are '' and 'weight'
+    # if tensor_fqn is 'linear.weight' then module_fqn and tensor_name are 'linear' and 'weight'
+    tensor_name = tensor_fqn.split(".")[-1]
+    module_fqn = tensor_fqn[: -len(tensor_name) - ("." in tensor_fqn)]
+
+    module = fqn_to_module(model, module_fqn)
+
+    return {
+        "module_fqn": module_fqn,
+        "module": module,
+        "tensor_name": tensor_name,
+        "tensor_fqn": tensor_fqn,
+    }
+
+
+# Parametrizations
+class FakeSparsity(nn.Module):
+    r"""Parametrization for the weights. Should be attached to the 'weight' or
+    any other parameter that requires a mask applied to it.
+
+    Note::
+
+        Once the mask is passed in, it should not be replaced by a different
+        tensor object (its id should not change). The contents of the mask may
+        change, but the mask reference itself should not.
+    """
+
+    def __init__(self, mask):
+        super().__init__()
+        self.register_buffer("mask", mask)
+
+    def forward(self, x):
+        assert self.mask.shape == x.shape
+        return self.mask * x
+
+    def state_dict(self, *args, **kwargs):
+        # We don't want the parametrization to save the mask.
+        # That way we make sure that the linear module doesn't store the masks
+        # alongside its parametrizations.
+        return {}
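+
+# Illustrative sketch of the fqn helpers above on a tiny model, assuming the
+# helpers defined in this module are in scope (the module name "0" comes from
+# nn.Sequential's default child naming):
+#
+#     >>> from torch import nn
+#     >>> model = nn.Sequential(nn.Linear(4, 4))
+#     >>> module_to_fqn(model, model[0])
+#     '0'
+#     >>> info = get_arg_info_from_tensor_fqn(model, "0.weight")
+#     >>> info["module_fqn"], info["tensor_name"]
+#     ('0', 'weight')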
diff --git a/MLPY/Lib/site-packages/torch/ao/pruning/sparsifier/weight_norm_sparsifier.py b/MLPY/Lib/site-packages/torch/ao/pruning/sparsifier/weight_norm_sparsifier.py
new file mode 100644
index 0000000000000000000000000000000000000000..69f1a47280a1fff3e169fb6f2b2dc69e8062132c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/pruning/sparsifier/weight_norm_sparsifier.py
@@ -0,0 +1,200 @@
+from functools import reduce
+from typing import Callable, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+
+from .base_sparsifier import BaseSparsifier
+import operator
+
+__all__ = ["WeightNormSparsifier"]
+
+def _flat_idx_to_2d(idx, shape):
+    rows = idx // shape[1]
+    cols = idx % shape[1]
+    return rows, cols
+
+class WeightNormSparsifier(BaseSparsifier):
+    r"""Weight-Norm Sparsifier
+
+    This sparsifier computes the norm of every sparse block and "zeroes-out" the
+    ones with the lowest norm. The level of sparsity defines how many of the
+    blocks are removed.
+
+    This sparsifier is controlled by three variables:
+    1. `sparsity_level` defines the number of *sparse blocks* that are zeroed-out
+    2. `sparse_block_shape` defines the shape of the sparse blocks. Note that
+        the sparse blocks originate at the zero-index of the tensor.
+    3. `zeros_per_block` is the number of zeros that we are expecting in each
+        sparse block. By default we assume that all elements within a block are
+        zeroed-out. However, setting this variable sets the target number of
+        zeros per block. The zeros within each block are chosen as the *smallest
+        absolute values*.
+
+    Args:
+
+        sparsity_level: The target level of sparsity
+        sparse_block_shape: The shape of a sparse block (see note below)
+        zeros_per_block: Number of zeros in a sparse block
+        norm: Norm to use. Could be either `int` or a callable.
+            If `int`, only L1 and L2 are implemented.
+
+    Note::
+        The `sparse_block_shape` is a tuple representing (block_ROWS, block_COLS),
+        irrespective of what the rows / cols mean in the data tensor. That means,
+        if you were to sparsify a weight tensor in the nn.Linear, which has a
+        weight shape `(Cout, Cin)`, the `block_ROWS` would refer to the output
+        channels, while the `block_COLS` would refer to the input channels.
+
+    Note::
+        All arguments to the WeightNormSparsifier constructor are "default"
+        arguments and can be overridden by the configuration provided in the
+        `prepare` step.
+    """
+    def __init__(self,
+                 sparsity_level: float = 0.5,
+                 sparse_block_shape: Tuple[int, int] = (1, 4),
+                 zeros_per_block: Optional[int] = None,
+                 norm: Optional[Union[Callable, int]] = None):
+        if zeros_per_block is None:
+            zeros_per_block = reduce(operator.mul, sparse_block_shape)
+        defaults = {
+            "sparsity_level": sparsity_level,
+            "sparse_block_shape": sparse_block_shape,
+            "zeros_per_block": zeros_per_block,
+        }
+        if norm is None:
+            norm = 2
+        if callable(norm):
+            self.norm_fn = norm
+        elif norm == 1:
+            self.norm_fn = lambda T: T.abs()
+        elif norm == 2:
+            self.norm_fn = lambda T: T * T
+        else:
+            raise NotImplementedError(f"L-{norm} is not yet implemented.")
+        super().__init__(defaults=defaults)
+
+    def _scatter_fold_block_mask(self, output_shape, dim, indices, block_shape,
+                                 mask=None, input_shape=None, device=None):
+        r"""Creates patches of size `block_shape` after scattering the indices."""
+        if mask is None:
+            assert input_shape is not None
+            mask = torch.ones(input_shape, device=device)
+        mask.scatter_(dim=dim, index=indices, value=0)
+        mask.data = F.fold(mask, output_size=output_shape, kernel_size=block_shape, stride=block_shape)
+        return mask
+
+    def _make_tensor_mask(self, data, input_shape, sparsity_level, sparse_block_shape, mask=None):
+        r"""Creates a tensor-level mask.
+
+        A tensor-level mask uses the sparse_block_shape as the smallest granularity of
+        sparsification: for a given mask and sparse_block_shape, the smallest "patch" of
+        zeros/ones has the size of one sparse block.
+
+        In this context, `sparsity_level` describes the fraction of sparse patches.
+        """
+        h, w = data.shape[-2:]
+        block_h, block_w = sparse_block_shape
+        dh = (block_h - h % block_h) % block_h
+        dw = (block_w - w % block_w) % block_w
+
+        if mask is None:
+            mask = torch.ones(h + dh, w + dw, device=data.device)
+
+        if sparsity_level >= 1.0:
+            mask.data = torch.zeros_like(mask)
+            return mask
+        elif sparsity_level <= 0.0:
+            mask.data = torch.ones_like(mask)
+            return mask
+
+        values_per_block = reduce(operator.mul, sparse_block_shape)
+        if values_per_block > 1:
+            # Reduce the data
+            data = F.avg_pool2d(
+                data[None, None, :], kernel_size=sparse_block_shape, stride=sparse_block_shape, ceil_mode=True
+            )
+        data = data.flatten()
+        num_blocks = len(data)
+
+        data = data.repeat(1, values_per_block, 1)
+
+        threshold_idx = int(round(sparsity_level * num_blocks))
+        threshold_idx = max(0, min(num_blocks - 1, threshold_idx))  # Sanity check
+        _, sorted_idx = torch.topk(data, k=threshold_idx, dim=2, largest=False)
+
+        # Temp reshape for mask
+        mask_reshape = mask.reshape(data.shape)  # data might be reshaped
+        self._scatter_fold_block_mask(
+            dim=2, output_shape=(h + dh, w + dw),
+            indices=sorted_idx, block_shape=sparse_block_shape, mask=mask_reshape
+        )
+        mask.data = mask_reshape.squeeze().reshape(mask.shape)[:h, :w].contiguous()
+        return mask
+
+    def _make_block_mask(self, data, sparse_block_shape, zeros_per_block, mask=None):
+        r"""Creates a block-level mask.
+
+        A block-level mask sparsifies within each block: for a given mask and
+        sparse_block_shape, the sparsity is computed independently within every patch
+        of size sparse_block_shape.
+
+        In this context the `zeros_per_block` describes the number of zeroed-out elements within a patch.
+        """
+        h, w = data.shape[-2:]
+        block_h, block_w = sparse_block_shape
+        dh = (block_h - h % block_h) % block_h
+        dw = (block_w - w % block_w) % block_w
+        values_per_block = reduce(operator.mul, sparse_block_shape)
+
+        if mask is None:
+            mask = torch.ones((h + dh, w + dw), device=data.device)
+
+        if values_per_block == zeros_per_block:
+            # Everything should be sparsified
+            mask.data = torch.zeros_like(mask)
+            return mask
+
+        # create a new padded tensor like data (to match the block_shape)
+        padded_data = torch.ones(h + dh, w + dw, dtype=data.dtype, device=data.device)
+        padded_data.fill_(torch.nan)
+        padded_data[:h, :w] = data
+        unfolded_data = F.unfold(padded_data[None, None, :], kernel_size=sparse_block_shape, stride=sparse_block_shape)
+
+        # Temp reshape for mask
+        mask_reshape = mask.reshape(unfolded_data.shape)
+        _, sorted_idx = torch.topk(unfolded_data, k=zeros_per_block, dim=1, largest=False)
+
+        self._scatter_fold_block_mask(
+            dim=1, indices=sorted_idx, output_shape=padded_data.shape, block_shape=sparse_block_shape, mask=mask_reshape
+        )
+
+        mask.data = mask_reshape.squeeze().reshape(mask.shape).contiguous()
+        return mask
+
+    def update_mask(self, module, tensor_name, sparsity_level, sparse_block_shape,
+                    zeros_per_block, **kwargs):
+        values_per_block = reduce(operator.mul, sparse_block_shape)
+        if zeros_per_block > values_per_block:
+            raise ValueError(
+                "Number of zeros per block cannot be more than the total number of elements in that block."
+            )
+        if zeros_per_block < 0:
+            raise ValueError("Number of zeros per block must be non-negative.")
+
+        mask = getattr(module.parametrizations, tensor_name)[0].mask
+        if sparsity_level <= 0 or zeros_per_block == 0:
+            mask.data = torch.ones_like(mask)
+        elif sparsity_level >= 1.0 and (zeros_per_block == values_per_block):
+            mask.data = torch.zeros_like(mask)
+        else:
+            ww = self.norm_fn(getattr(module, tensor_name))
+            tensor_mask = self._make_tensor_mask(
+                data=ww, input_shape=ww.shape, sparsity_level=sparsity_level, sparse_block_shape=sparse_block_shape
+            )
+            if values_per_block != zeros_per_block:
+                block_mask = self._make_block_mask(data=ww, sparse_block_shape=sparse_block_shape,
+                                                   zeros_per_block=zeros_per_block)
+                tensor_mask = torch.logical_or(tensor_mask, block_mask)
+            mask.data = tensor_mask
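+
+    # Editorial sketch of how the two masks above combine (assumed example, not
+    # part of the upstream module). With sparse_block_shape=(1, 4) we have
+    # values_per_block = 4:
+    #   * zeros_per_block == 4: blocks are pruned wholesale, so only the
+    #     tensor-level mask (block-granular sparsity_level) is applied.
+    #   * zeros_per_block < 4: a block-level mask additionally zeroes the
+    #     zeros_per_block smallest entries inside every block, and the final mask
+    #     is the elementwise logical OR of the two masks (an entry is kept if
+    #     either mask keeps it).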
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/__init__.py b/MLPY/Lib/site-packages/torch/ao/quantization/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec4d7921b6d8702567e01038b3eea39e847562a6
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/__init__.py
@@ -0,0 +1,189 @@
+# flake8: noqa: F403
+
+from .fake_quantize import *  # noqa: F403
+from .fuse_modules import fuse_modules  # noqa: F403
+from .fuse_modules import fuse_modules_qat  # noqa: F403
+from .fuser_method_mappings import *  # noqa: F403
+from .observer import *  # noqa: F403
+from .qconfig import *  # noqa: F403
+from .qconfig_mapping import *  # noqa: F403
+from .quant_type import *  # noqa: F403
+from .quantization_mappings import *  # type: ignore[no-redef]
+from .quantize import *  # noqa: F403
+from .quantize_jit import *  # noqa: F403
+from .stubs import *  # noqa: F403
+from .pt2e.export_utils import _move_exported_model_to_eval as move_exported_model_to_eval
+from .pt2e.export_utils import _move_exported_model_to_train as move_exported_model_to_train
+from .pt2e.export_utils import _allow_exported_model_train_eval as allow_exported_model_train_eval
+from .pt2e.generate_numeric_debug_handle import generate_numeric_debug_handle  # noqa: F401
+from typing import Union, List, Callable, Tuple, Optional
+from torch import Tensor
+import torch
+
+ObserverOrFakeQuantize = Union[ObserverBase, FakeQuantizeBase]
+ObserverOrFakeQuantize.__module__ = "torch.ao.quantization"
+
+__all__ = [
+    "DeQuantStub",
+    "FakeQuantize",
+    "FakeQuantizeBase",
+    "FixedQParamsFakeQuantize",
+    "FixedQParamsObserver",
+    "FusedMovingAvgObsFakeQuantize",
+    "HistogramObserver",
+    "MatchAllNode",
+    "MinMaxObserver",
+    "MovingAverageMinMaxObserver",
+    "MovingAveragePerChannelMinMaxObserver",
+    "NoopObserver",
+    "ObserverBase",
+    "ObserverOrFakeQuantize",
+    "Pattern",
+    "PerChannelMinMaxObserver",
+    "PlaceholderObserver",
+    "QConfig",
+    "QConfigAny",
+    "QConfigDynamic",
+    "QConfigMapping",
+    "QuantStub",
+    "QuantType",
+    "QuantWrapper",
+    "RecordingObserver",
+    "ReuseInputObserver",
+    "UniformQuantizationObserverBase",
+    "add_quant_dequant",
+    "convert",
+    "convert_dynamic_jit",
+    "convert_jit",
+    "default_affine_fixed_qparams_fake_quant",
+    "default_affine_fixed_qparams_observer",
+    "default_debug_observer",
+    "default_dynamic_fake_quant",
+    "default_dynamic_quant_observer",
+    "default_embedding_fake_quant",
+    "default_embedding_fake_quant_4bit",
+    "default_eval_fn",
+    "default_fake_quant",
+    "default_fixed_qparams_range_0to1_fake_quant",
+    "default_fixed_qparams_range_0to1_observer",
+    "default_fixed_qparams_range_neg1to1_fake_quant",
+    "default_fixed_qparams_range_neg1to1_observer",
+    "default_float_qparams_observer",
+    "default_float_qparams_observer_4bit",
+    "default_fused_act_fake_quant",
+    "default_fused_per_channel_wt_fake_quant",
+    "default_fused_wt_fake_quant",
+    "default_histogram_fake_quant",
+    "default_histogram_observer",
+    "default_observer",
+    "default_per_channel_weight_fake_quant",
+    "default_per_channel_weight_observer",
+    "default_placeholder_observer",
+    "default_reuse_input_observer",
+    "default_symmetric_fixed_qparams_fake_quant",
+    "default_symmetric_fixed_qparams_observer",
+    "default_weight_fake_quant",
+    "default_weight_observer",
+    "disable_fake_quant",
+    "disable_observer",
+    "enable_fake_quant",
+    "enable_observer",
+    "fuse_conv_bn",
+    "fuse_conv_bn_jit",
+    "fuse_conv_bn_relu",
+    "fuse_convtranspose_bn",
+    "fuse_linear_bn",
+    "fuse_modules",
+    "fuse_modules_qat",
+    "fused_per_channel_wt_fake_quant_range_neg_127_to_127",
+    "fused_wt_fake_quant_range_neg_127_to_127",
+    "get_combined_dict",
+    "get_default_compare_output_module_list",
+    "get_default_custom_config_dict",
+    "get_default_dynamic_quant_module_mappings",
+    "get_default_dynamic_sparse_quant_module_mappings",
+    "get_default_float_to_quantized_operator_mappings",
+    "get_default_qat_module_mappings",
+    "get_default_qat_qconfig",
+    "get_default_qat_qconfig_dict",
+    "get_default_qat_qconfig_mapping",
+    "get_default_qconfig",
+    "get_default_qconfig_dict",
+    "get_default_qconfig_mapping",
+    "get_default_qconfig_propagation_list",
+    "get_default_static_quant_module_mappings",
+    "get_default_static_quant_reference_module_mappings",
+    "get_default_static_sparse_quant_module_mappings",
+    "get_dynamic_quant_module_class",
+    "get_embedding_qat_module_mappings",
+    "get_embedding_static_quant_module_mappings",
+    "get_fuser_method",
+    "get_fuser_method_new",
+    "get_observer_state_dict",
+    "get_quantized_operator",
+    "get_static_quant_module_class",
+    "load_observer_state_dict",
+    "move_exported_model_to_eval",
+    "move_exported_model_to_train",
+    "allow_exported_model_train_eval",
+    "no_observer_set",
+    "per_channel_weight_observer_range_neg_127_to_127",
+    "prepare",
+    "prepare_dynamic_jit",
+    "prepare_jit",
+    "prepare_qat",
+    "propagate_qconfig_",
+    "qconfig_equals",
+    "quantize",
+    "quantize_dynamic",
+    "quantize_dynamic_jit",
+    "quantize_jit",
+    "quantize_qat",
+    "script_qconfig",
+    "script_qconfig_dict",
+    "swap_module",
+    "weight_observer_range_neg_127_to_127",
+    "generate_numeric_debug_handle",
+]
+
+def default_eval_fn(model, calib_data):
+    r"""Define the default evaluation function.
+
+    The default evaluation function takes a torch.utils.data.Dataset or a list of
+    input Tensors and runs the model on the dataset.
+    """
+    for data, target in calib_data:
+        model(data)
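+
+# Illustrative usage (editorial sketch; `model` and the calibration batch below
+# are assumptions, not part of the upstream module):
+#
+#     calib_data = [(torch.randn(1, 3, 224, 224), torch.tensor([0]))]
+#     default_eval_fn(model, calib_data)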
+
+class _DerivedObserverOrFakeQuantize(ObserverBase):
+    r"""An observer whose quantization parameters are derived from other observers
+    or fake quantizers via a user-provided `derive_qparams_fn`.
+    """
+
+    def __init__(
+        self,
+        dtype: torch.dtype,
+        obs_or_fqs: List[ObserverOrFakeQuantize],
+        derive_qparams_fn: Callable[[List[ObserverOrFakeQuantize]], Tuple[Tensor, Tensor]],
+        quant_min: Optional[int]=None,
+        quant_max: Optional[int]=None,
+        qscheme: Optional[torch.qscheme]=None,
+        ch_axis: Optional[int] = None
+    ):
+        super().__init__(dtype)
+        self.obs_or_fqs = obs_or_fqs
+        self.derive_qparams_fn = derive_qparams_fn
+        self.quant_min = quant_min
+        self.quant_max = quant_max
+        self.qscheme = qscheme
+        self.ch_axis = ch_axis
+
+        from .utils import is_per_channel
+        if is_per_channel(self.qscheme):
+            assert self.ch_axis is not None, "Must provide a valid ch_axis if qscheme is per channel"
+
+    def forward(self, x: Tensor) -> Tensor:
+        return x
+
+    def calculate_qparams(self):
+        return self.derive_qparams_fn(self.obs_or_fqs)
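+
+    # Editorial sketch (assumed example, not part of the upstream module):
+    # deriving shared qparams from two already-calibrated observers.
+    #
+    #     obs_a, obs_b = MinMaxObserver(), MinMaxObserver()
+    #     def derive(obs_or_fqs):
+    #         qparams = [o.calculate_qparams() for o in obs_or_fqs]
+    #         scales = torch.stack([s for s, _ in qparams])
+    #         return scales.max().reshape(1), qparams[0][1]
+    #     shared = _DerivedObserverOrFakeQuantize(
+    #         dtype=torch.quint8, obs_or_fqs=[obs_a, obs_b], derive_qparams_fn=derive)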
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4b9648e9edd7a70d95d3c5ea7da36384b74694bd
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/_correct_bias.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/_correct_bias.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..860feec201c411ed8e851bd097ca801b476969bc
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/_correct_bias.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/_equalize.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/_equalize.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b1f7f6000842706549ff42ab6afdfa82421e0f17
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/_equalize.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/_learnable_fake_quantize.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/_learnable_fake_quantize.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8c24c19627051ff791d121e630d637429fb698d0
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/_learnable_fake_quantize.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/fake_quantize.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/fake_quantize.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6033f1ec149dadee7bd33c16996e56dbbbb44616
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/fake_quantize.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/fuse_modules.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/fuse_modules.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a413d9072c59484bd9ebd3f797a404563d5c9bf0
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/fuse_modules.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/fuser_method_mappings.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/fuser_method_mappings.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7188bfb694bd5849d22ec108f01b90583886fd5d
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/fuser_method_mappings.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/observer.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/observer.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c892e88019eaa92b1fa7159a3c5fb6eb86bb56da
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/observer.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/qconfig.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/qconfig.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e74006f270c51a920b784c89a62c2b58daf9086c
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/qconfig.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/qconfig_mapping.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/qconfig_mapping.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ec961f6314650716a1e60da4136eb3266ae9afbe
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/qconfig_mapping.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/quant_type.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/quant_type.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..59e0220b163a10ea87d3e5f2b90a23e416397e42
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/quant_type.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/quantization_mappings.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/quantization_mappings.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..be729b742a9ea0abb7ca9225e1bd6efed8465cf9
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/quantization_mappings.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/quantize.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/quantize.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a3d5a3a04274739a3139b9623d23ee5131216977
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/quantize.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/quantize_fx.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/quantize_fx.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5ae86c421b71ef7108056cd51cb412747d87e8e9
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/quantize_fx.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/quantize_jit.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/quantize_jit.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ad41b6793bbc33fed71186ddf89702397cc0a9b3
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/quantize_jit.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/quantize_pt2e.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/quantize_pt2e.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5bcba94bb5d81805476dbe9aabde36bbbd5d2872
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/quantize_pt2e.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/stubs.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/stubs.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..78ec8b908dea789d1b69d45b7664a15d29071e9d
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/stubs.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9c1425eb9deafd13fc5846c4ac67b0e2cec9e8d8
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/__pycache__/utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/_correct_bias.py b/MLPY/Lib/site-packages/torch/ao/quantization/_correct_bias.py
new file mode 100644
index 0000000000000000000000000000000000000000..646cbb7492bbfa7fadbbdd5bbc7a1677a9510546
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/_correct_bias.py
@@ -0,0 +1,144 @@
+import torch
+import torch.nn as nn
+import torch.ao.nn.quantized as nnq
+
+import torch.ao.quantization
+import torch.ao.ns._numeric_suite as ns
+
+__all__ = [
+    "get_module",
+    "parent_child_names",
+    "get_param",
+    "MeanShadowLogger",
+    "bias_correction",
+]
+
+_supported_modules = {nn.Linear, nn.Conv2d}
+_supported_modules_quantized = {nnq.Linear, nnq.Conv2d}
+
+def get_module(model, name):
+    """Given the name of a submodule, return that submodule from the given model."""
+    return dict(model.named_modules())[name]
+
+def parent_child_names(name):
+    """Split full name of submodule into parent submodule's full name and submodule's name."""
+    split_name = name.rsplit('.', 1)
+    if len(split_name) == 1:
+        return '', split_name[0]
+    else:
+        return split_name[0], split_name[1]
+
+def get_param(module, attr):
+    """Get the parameter given a module and attribute.
+
+    Sometimes the weight/bias attribute is the raw tensor, and sometimes it is a
+    function that returns the raw tensor; this helper handles both cases.
+    """
+    param = getattr(module, attr, None)
+    if callable(param):
+        return param()
+    else:
+        return param
+
+class MeanShadowLogger(ns.Logger):
+    """Mean Logger for a Shadow module.
+
+    A logger for a Shadow module whose purpose is to record the rolling mean
+    of the data passed to the floating-point and quantized models.
+    """
+
+    def __init__(self):
+        """Set up initial values for float and quantized stats, count, float sum, and quant sum."""
+        super().__init__()
+        self.stats["float"] = None
+        self.stats["quantized"] = None
+        self.count = 0
+        self.float_sum = None
+        self.quant_sum = None
+
+    def forward(self, x, y):
+        """Compute the average of quantized and floating-point data from modules.
+
+        The inputs x and y are the outputs of the quantized and floating-point modules:
+        x comes from the quantized module and y from the floating-point module.
+        """
+        if x.is_quantized:
+            x = x.dequantize()
+
+        self.count += 1
+        if self.stats["quantized"] is None:
+            self.stats["quantized"] = x
+            self.quant_sum = x
+        else:
+            self.quant_sum += x
+            self.stats["quantized"] = self.quant_sum / self.count
+
+        if self.stats["float"] is None:
+            self.stats["float"] = y
+            self.float_sum = y
+        else:
+            self.float_sum += y
+            self.stats["float"] = self.float_sum / self.count
+
+    def clear(self):
+        self.stats["float"] = None
+        self.stats["quantized"] = None
+        self.count = 0
+        self.float_sum = None
+        self.quant_sum = None
+
+def bias_correction(float_model, quantized_model, img_data, target_modules=_supported_modules_quantized, neval_batches=None):
+    """Perform bias correction on a module.
+
+    Using the numeric suite shadow module, the expected outputs of the floating-point and
+    quantized modules are recorded. Using that data, the bias of supported modules is shifted
+    to compensate for the drift caused by quantization.
+    Paper reference: https://arxiv.org/pdf/1906.04721.pdf (Section 4.2)
+
+    Args:
+        float_model: a trained model that serves as the reference for what bias correction should aim for
+        quantized_model: quantized form of float_model that bias correction is to be applied to
+        img_data: calibration data used to estimate the expected output (and hence the quantization error)
+        target_modules: specifies which submodules in quantized_model need bias correction (can be extended to
+                unquantized submodules)
+        neval_batches: a cap on the number of batches used to estimate the expected output
+    """
+    ns.prepare_model_with_stubs(float_model, quantized_model, _supported_modules, MeanShadowLogger)
+
+    uncorrected_modules = {}
+    for name, submodule in quantized_model.named_modules():
+        if type(submodule) in target_modules:
+            uncorrected_modules[name] = submodule
+
+    for uncorrected_module in uncorrected_modules:
+        quantized_submodule = get_module(quantized_model, uncorrected_module)
+        bias = get_param(quantized_submodule, 'bias')
+        if bias is not None:
+
+            count = 0
+            for data in img_data:
+                quantized_model(data[0])
+                count += 1
+                if count == neval_batches:
+                    break
+            ob_dict = ns.get_logger_dict(quantized_model)
+            parent_name, _ = parent_child_names(uncorrected_module)
+
+            float_data = ob_dict[parent_name + '.stats']['float']
+            quant_data = ob_dict[parent_name + '.stats']['quantized']
+
+            # math for expected_error
+            quantization_error = quant_data - float_data
+            dims = list(range(quantization_error.dim()))
+            # Note: we don't want to take the mean over the output channel dimension
+            dims.remove(1)
+            expected_error = torch.mean(quantization_error, dims)
+
+            updated_bias = bias.data - expected_error
+
+            bias.data = updated_bias
+
+            # Resets the data contained in the loggers
+            for name, submodule in quantized_model.named_modules():
+                if isinstance(submodule, MeanShadowLogger):
+                    submodule.clear()
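+
+# Illustrative usage (editorial sketch; the model and data names are assumptions):
+#
+#     # given a calibrated float model and its statically quantized counterpart,
+#     # where img_data is an iterable of (input, target) batches
+#     bias_correction(float_model, quantized_model, img_data, neval_batches=8)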
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/_equalize.py b/MLPY/Lib/site-packages/torch/ao/quantization/_equalize.py
new file mode 100644
index 0000000000000000000000000000000000000000..a752ecc3406b5b83fa3d993960af18f007e9987a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/_equalize.py
@@ -0,0 +1,182 @@
+import torch
+import copy
+from typing import Dict, Any
+
+__all__ = [
+    "set_module_weight",
+    "set_module_bias",
+    "get_module_weight",
+    "get_module_bias",
+    "max_over_ndim",
+    "min_over_ndim",
+    "channel_range",
+    "cross_layer_equalization",
+    "equalize",
+    "converged",
+]
+
+_supported_types = {torch.nn.Conv2d, torch.nn.Linear}
+_supported_intrinsic_types = {torch.ao.nn.intrinsic.ConvReLU2d, torch.ao.nn.intrinsic.LinearReLU}
+_all_supported_types = _supported_types.union(_supported_intrinsic_types)
+
+def set_module_weight(module, weight) -> None:
+    if type(module) in _supported_types:
+        module.weight = torch.nn.Parameter(weight)
+    else:
+        module[0].weight = torch.nn.Parameter(weight)
+
+def set_module_bias(module, bias) -> None:
+    if type(module) in _supported_types:
+        module.bias = torch.nn.Parameter(bias)
+    else:
+        module[0].bias = torch.nn.Parameter(bias)
+
+def get_module_weight(module):
+    if type(module) in _supported_types:
+        return module.weight
+    else:
+        return module[0].weight
+
+def get_module_bias(module):
+    if type(module) in _supported_types:
+        return module.bias
+    else:
+        return module[0].bias
+
+def max_over_ndim(input, axis_list, keepdim=False):
+    """Apply 'torch.max' over the given axes."""
+    axis_list.sort(reverse=True)
+    for axis in axis_list:
+        input, _ = input.max(axis, keepdim)
+    return input
+
+def min_over_ndim(input, axis_list, keepdim=False):
+    """Apply 'torch.min' over the given axes."""
+    axis_list.sort(reverse=True)
+    for axis in axis_list:
+        input, _ = input.min(axis, keepdim)
+    return input
+
+def channel_range(input, axis=0):
+    """Find the range of weights associated with a specific channel."""
+    size_of_tensor_dim = input.ndim
+    axis_list = list(range(size_of_tensor_dim))
+    axis_list.remove(axis)
+
+    mins = min_over_ndim(input, axis_list)
+    maxs = max_over_ndim(input, axis_list)
+
+    assert mins.size(0) == input.size(axis), "Dimensions of the resulting channel range do not match the size of the requested axis"
+    return maxs - mins
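+
+# Editorial note (assumed example): for a Conv2d weight of shape
+# (out_channels, in_channels, kh, kw) and axis=0, the reductions above collapse
+# dims 1..3, so channel_range returns a 1-D tensor of length out_channels holding
+# max - min of each output channel's weights.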
+
+def cross_layer_equalization(module1, module2, output_axis=0, input_axis=1):
+    """Scale the range of module1's output to equal the range of module2's input.
+
+    Given two adjacent modules, their weights are scaled such that
+    the ranges of the first module's output channels are equal to the
+    ranges of the second module's input channels.
+    """
+    if type(module1) not in _all_supported_types or type(module2) not in _all_supported_types:
+        raise ValueError("module type not supported:", type(module1), " ", type(module2))
+
+    weight1 = get_module_weight(module1)
+    weight2 = get_module_weight(module2)
+
+    if weight1.size(output_axis) != weight2.size(input_axis):
+        raise TypeError("Number of output channels of the first arg does not match the \
+        number of input channels of the second arg")
+
+    bias = get_module_bias(module1)
+
+    weight1_range = channel_range(weight1, output_axis)
+    weight2_range = channel_range(weight2, input_axis)
+
+    # produce the scaling factors to be applied
+    weight2_range += 1e-9
+    scaling_factors = torch.sqrt(weight1_range / weight2_range)
+    inverse_scaling_factors = torch.reciprocal(scaling_factors)
+
+    bias = bias * inverse_scaling_factors
+
+    # reshape the 1D scaling tensors so they broadcast correctly against the
+    # weight tensors along the output axis of module1 and the input axis of module2
+    size1 = [1] * weight1.ndim
+    size1[output_axis] = weight1.size(output_axis)
+    size2 = [1] * weight2.ndim
+    size2[input_axis] = weight2.size(input_axis)
+
+    scaling_factors = torch.reshape(scaling_factors, size2)
+    inverse_scaling_factors = torch.reshape(inverse_scaling_factors, size1)
+
+    weight1 = weight1 * inverse_scaling_factors
+    weight2 = weight2 * scaling_factors
+
+    set_module_weight(module1, weight1)
+    set_module_bias(module1, bias)
+    set_module_weight(module2, weight2)
+
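+# Editorial sketch of the scaling performed above (not part of the upstream
+# module): for each connecting channel i,
+#
+#     s_i = sqrt(range_out1_i / range_in2_i)
+#     W1[i, ...] <- W1[i, ...] / s_i,    b1[i] <- b1[i] / s_i
+#     W2[:, i, ...] <- W2[:, i, ...] * s_i
+#
+# After scaling, both ranges equal sqrt(range_out1_i * range_in2_i), while the
+# composed computation is preserved for ReLU-like (positively homogeneous)
+# activations, as argued in section 4.1 of the referenced paper.
+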
+def equalize(model, paired_modules_list, threshold=1e-4, inplace=True):
+    """Equalize modules until convergence is achieved.
+
+    Given a list of adjacent module pairs within a model, equalization is
+    applied between each pair; this is repeated until convergence is achieved.
+
+    A copy of the changing modules from the previous iteration is kept; if those
+    copies are not meaningfully different from the current modules (as determined
+    by `converged`), the modules have converged enough that further equalization
+    is not necessary.
+
+    The implementation references section 4.1 of https://arxiv.org/pdf/1906.04721.pdf
+
+    Args:
+        model: a model (nn.Module) that equalization is to be applied to
+        paired_modules_list: a list of lists where each sublist is a pair of names of two
+            submodules found in the model; for each pair the two submodules generally
+            have to be adjacent in the model to get expected/reasonable results
+        threshold: a number used by the converged function to determine what degree of
+            similarity between models is necessary for them to be considered equivalent
+        inplace: determines whether the function modifies the model in place or works on a copy
+    """
+    if not inplace:
+        model = copy.deepcopy(model)
+
+    name_to_module : Dict[str, torch.nn.Module] = {}
+    previous_name_to_module: Dict[str, Any] = {}
+    name_set = {name for pair in paired_modules_list for name in pair}
+
+    for name, module in model.named_modules():
+        if name in name_set:
+            name_to_module[name] = module
+            previous_name_to_module[name] = None
+    while not converged(name_to_module, previous_name_to_module, threshold):
+        for pair in paired_modules_list:
+            previous_name_to_module[pair[0]] = copy.deepcopy(name_to_module[pair[0]])
+            previous_name_to_module[pair[1]] = copy.deepcopy(name_to_module[pair[1]])
+
+            cross_layer_equalization(name_to_module[pair[0]], name_to_module[pair[1]])
+
+    return model
+
+def converged(curr_modules, prev_modules, threshold=1e-4):
+    """Test whether modules are converged to a specified threshold.
+
+    Tests whether the summed norm of the differences between each pair of modules
+    is less than the given threshold.
+
+    Takes two dictionaries mapping names to modules; the set of names in each dictionary
+    should be the same. Looping over the set of names, the difference between the weights of
+    the associated modules in the two dictionaries is taken and its norm is accumulated.
+    """
+    if curr_modules.keys() != prev_modules.keys():
+        raise ValueError("The keys to the given mappings must have the same set of names of modules")
+
+    summed_norms = torch.tensor(0.)
+    if None in prev_modules.values():
+        return False
+    for name in curr_modules.keys():
+        curr_weight = get_module_weight(curr_modules[name])
+        prev_weight = get_module_weight(prev_modules[name])
+
+        difference = curr_weight.sub(prev_weight)
+        summed_norms += torch.norm(difference)
+    return bool(summed_norms < threshold)
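+
+# Illustrative usage (editorial sketch; the submodule names are assumptions):
+#
+#     pairs = [["features.0", "features.3"], ["features.3", "classifier.0"]]
+#     equalized = equalize(model, pairs, threshold=1e-4, inplace=False)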
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/_learnable_fake_quantize.py b/MLPY/Lib/site-packages/torch/ao/quantization/_learnable_fake_quantize.py
new file mode 100644
index 0000000000000000000000000000000000000000..df448bd15c15f48c4983c8fc3694560a9034c090
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/_learnable_fake_quantize.py
@@ -0,0 +1,164 @@
+import torch
+from torch.nn.parameter import Parameter
+from typing import List
+
+__all__: List[str] = []
+
+class _LearnableFakeQuantize(torch.ao.quantization.FakeQuantizeBase):
+    r"""Generalized extension of the FakeQuantize module in fake_quantize.py.
+
+    This is an extension of the FakeQuantize module in fake_quantize.py that
+    supports more generalized lower-bit quantization and learning of the scale
+    and zero point parameters through backpropagation. For literature references,
+    please see the class _LearnableFakeQuantizePerTensorOp.
+
+    In addition to the attributes in the original FakeQuantize module, the _LearnableFakeQuantize
+    module also includes the following attributes to support quantization parameter learning.
+
+    * :attr:`channel_len` defines the length of the channel when initializing scale and zero point
+      for the per channel case.
+
+    * :attr:`use_grad_scaling` defines the flag for whether the gradients for scale and zero point are
+      normalized by a constant that is proportional to the square root of the number of
+      elements in the tensor. The related literature justifying the use of this particular constant
+      can be found here: https://openreview.net/pdf?id=rkgO66VKDS.
+
+    * :attr:`fake_quant_enabled` defines the flag for enabling fake quantization on the output.
+
+    * :attr:`static_enabled` defines the flag for using observer's static estimation for
+      scale and zero point.
+
+    * :attr:`learning_enabled` defines the flag for enabling backpropagation for scale and zero point.
+    """
+    def __init__(self, observer, quant_min=0, quant_max=255, scale=1., zero_point=0., channel_len=-1,
+                 use_grad_scaling=False, **observer_kwargs):
+        super().__init__()
+        assert quant_min < quant_max, 'quant_min must be strictly less than quant_max.'
+        self.quant_min = quant_min
+        self.quant_max = quant_max
+        # also pass quant_min and quant_max to observer
+        observer_kwargs["quant_min"] = quant_min
+        observer_kwargs["quant_max"] = quant_max
+        self.use_grad_scaling = use_grad_scaling
+        if channel_len == -1:
+            self.scale = Parameter(torch.tensor([scale]))
+            self.zero_point = Parameter(torch.tensor([zero_point]))
+        else:
+            assert isinstance(channel_len, int) and channel_len > 0, "Channel size must be a positive integer."
+            self.scale = Parameter(torch.tensor([scale] * channel_len))
+            self.zero_point = Parameter(torch.tensor([zero_point] * channel_len))
+
+        self.activation_post_process = observer(**observer_kwargs)
+        assert torch.iinfo(self.activation_post_process.dtype).min <= quant_min, \
+            'quant_min out of bound'
+        assert quant_max <= torch.iinfo(self.activation_post_process.dtype).max, \
+            'quant_max out of bound'
+        self.dtype = self.activation_post_process.dtype
+        self.qscheme = self.activation_post_process.qscheme
+        self.ch_axis = self.activation_post_process.ch_axis \
+            if hasattr(self.activation_post_process, 'ch_axis') else -1
+        self.register_buffer('fake_quant_enabled', torch.tensor([1], dtype=torch.uint8))
+        self.register_buffer('static_enabled', torch.tensor([1], dtype=torch.uint8))
+        self.register_buffer('learning_enabled', torch.tensor([0], dtype=torch.uint8))
+
+        bitrange = torch.tensor(quant_max - quant_min + 1).double()
+        self.bitwidth = int(torch.log2(bitrange).item())
+        self.register_buffer('eps', torch.tensor([torch.finfo(torch.float32).eps]))
+
+    @torch.jit.export
+    def enable_param_learning(self):
+        r"""Enable parameter learning over static observer estimates.
+
+        Enables learning of quantization parameters and
+        disables static observer estimates. Forward path returns fake quantized X.
+        """
+        self.toggle_qparam_learning(enabled=True) \
+            .toggle_fake_quant(enabled=True) \
+            .toggle_observer_update(enabled=False)
+        return self
+
+    @torch.jit.export
+    def enable_static_estimate(self):
+        """Enable static estimates of quantization parameters.
+
+        Enables static observer estimates and disables learning of
+        quantization parameters. Forward path returns fake quantized X.
+        """
+        self.toggle_qparam_learning(enabled=False) \
+            .toggle_fake_quant(enabled=True) \
+            .toggle_observer_update(enabled=True)
+
+    @torch.jit.export
+    def enable_static_observation(self):
+        """Enable accumulation of data without updating quantization parameters.
+
+        Enables the static observer to accumulate data from the input but doesn't
+        update the quantization parameters. Forward path returns the original X.
+        """
+        self.toggle_qparam_learning(enabled=False) \
+            .toggle_fake_quant(enabled=False) \
+            .toggle_observer_update(enabled=True)
+
+    @torch.jit.export
+    def toggle_observer_update(self, enabled=True):
+        self.static_enabled[0] = int(enabled)  # type: ignore[operator]
+        return self
+
+    @torch.jit.export
+    def enable_observer(self, enabled=True):
+        self.toggle_observer_update(enabled)
+
+    @torch.jit.export
+    def toggle_qparam_learning(self, enabled=True):
+        self.learning_enabled[0] = int(enabled)  # type: ignore[operator]
+        self.scale.requires_grad = enabled
+        self.zero_point.requires_grad = enabled
+        return self
+
+    @torch.jit.export
+    def toggle_fake_quant(self, enabled=True):
+        self.fake_quant_enabled[0] = int(enabled)
+        return self
+
+    @torch.jit.export
+    def observe_quant_params(self):
+        print(f'_LearnableFakeQuantize Scale: {self.scale.detach()}')
+        print(f'_LearnableFakeQuantize Zero Point: {self.zero_point.detach()}')
+
+    @torch.jit.export
+    def calculate_qparams(self):
+        self.scale.data.clamp_(min=self.eps.item())  # type: ignore[operator]
+        scale = self.scale.detach()
+        zero_point = self.zero_point.detach().round().clamp(self.quant_min, self.quant_max).long()
+        return scale, zero_point
+
+    def forward(self, X):
+        if self.static_enabled[0] == 1:  # type: ignore[index]
+            self.activation_post_process(X.detach())
+            _scale, _zero_point = self.activation_post_process.calculate_qparams()
+            _scale = _scale.to(self.scale.device)
+            _zero_point = _zero_point.to(self.zero_point.device)
+            self.scale.data.copy_(_scale)
+            self.zero_point.data.copy_(_zero_point)
+        else:
+            self.scale.data.clamp_(min=self.eps.item())  # type: ignore[operator]
+
+        if self.fake_quant_enabled[0] == 1:
+            if self.qscheme in (torch.per_channel_symmetric, torch.per_tensor_symmetric):
+                self.zero_point.data.zero_()
+
+            if self.use_grad_scaling:
+                grad_factor = 1.0 / (X.numel() * self.quant_max) ** 0.5
+            else:
+                grad_factor = 1.0
+            if self.qscheme in (
+                    torch.per_channel_symmetric, torch.per_channel_affine):
+                X = torch._fake_quantize_learnable_per_channel_affine(
+                    X, self.scale, self.zero_point, self.ch_axis,
+                    self.quant_min, self.quant_max, grad_factor)
+            else:
+                X = torch._fake_quantize_learnable_per_tensor_affine(
+                    X, self.scale, self.zero_point,
+                    self.quant_min, self.quant_max, grad_factor)
+
+        return X
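+
+# Illustrative usage (editorial sketch, not part of the upstream module):
+#
+#     lfq = _LearnableFakeQuantize(
+#         observer=torch.ao.quantization.MovingAverageMinMaxObserver,
+#         quant_min=0, quant_max=255)
+#     lfq.enable_param_learning()   # scale / zero_point become trainable Parameters
+#     y = lfq(torch.randn(4, 8))    # fake-quantized output, differentiable w.r.t. qparams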
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__init__.py b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1e4baa9c3222f05ca78d5c7baa45fb16bf71332
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__init__.py
@@ -0,0 +1,23 @@
+from .backend_config import BackendConfig, BackendPatternConfig, DTypeConfig, DTypeWithConstraints, ObservationType
+from .fbgemm import get_fbgemm_backend_config
+from .native import get_native_backend_config, get_native_backend_config_dict
+from .qnnpack import get_qnnpack_backend_config
+from .tensorrt import get_tensorrt_backend_config, get_tensorrt_backend_config_dict
+from .executorch import get_executorch_backend_config
+from .onednn import get_onednn_backend_config
+
+__all__ = [
+    "get_fbgemm_backend_config",
+    "get_native_backend_config",
+    "get_native_backend_config_dict",
+    "get_qnnpack_backend_config",
+    "get_tensorrt_backend_config",
+    "get_tensorrt_backend_config_dict",
+    "get_executorch_backend_config",
+    "BackendConfig",
+    "BackendPatternConfig",
+    "DTypeConfig",
+    "DTypeWithConstraints",
+    "ObservationType",
+    "get_onednn_backend_config",
+]
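+
+# Illustrative usage (editorial sketch; `model`, `qconfig_mapping` and
+# `example_inputs` are assumptions): a BackendConfig is typically passed to the
+# FX graph mode quantization APIs to control which patterns are quantized, e.g.
+#
+#     from torch.ao.quantization.quantize_fx import prepare_fx
+#     prepared = prepare_fx(model, qconfig_mapping, example_inputs,
+#                           backend_config=get_native_backend_config())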
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dace7a7a7f1216fbbb0f9c99560d4b2599d7ee03
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/_common_operator_config_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/_common_operator_config_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9be231216650f232bf2c8fea10f2a2d5208692da
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/_common_operator_config_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/_qnnpack_pt2e.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/_qnnpack_pt2e.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..91bf2fc178c005941244392d7b6ed0b0fc483265
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/_qnnpack_pt2e.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/backend_config.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/backend_config.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d411d098c913907ca8edb1c227ea8c39bef7a543
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/backend_config.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/executorch.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/executorch.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a442d8fefb81859fe14cd97bc6644cb1c64f88cf
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/executorch.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/fbgemm.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/fbgemm.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d8bde6b1749de64b132c53372c110ef8a3fe5eec
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/fbgemm.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/native.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/native.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6c0a064164fe8d8fd12ce73a93b1bbc4a36a3b46
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/native.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/observation_type.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/observation_type.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fa380a61c9a22045aee48e0b94942b0b31aa3b10
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/observation_type.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/onednn.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/onednn.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..20d30980f99738c93948c061c5dfe89f3529f462
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/onednn.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/qnnpack.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/qnnpack.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..af0d3dd2e267b042350f48a5d508f63c4d99cdee
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/qnnpack.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/tensorrt.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/tensorrt.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4695bbe9d038876e1305725bf90a276c528ea204
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/tensorrt.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d5cb014f5f756de0664e115dd35e56fd7de886de
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/x86.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/x86.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1c1058dacfa21a9165773c945172707cf2596684
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/__pycache__/x86.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/_common_operator_config_utils.py b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/_common_operator_config_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..6707c6bb9bb31c06068036e8e6baf7a06618768c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/_common_operator_config_utils.py
@@ -0,0 +1,637 @@
+import copy
+import operator
+import torch
+import torch.nn.functional as F
+import torch.nn as nn
+import torch.ao.nn.intrinsic as nni
+import torch.ao.nn.intrinsic.qat as nniqat
+import torch.ao.nn.qat as nnqat
+import torch.ao.nn.quantized.reference as nnqr
+from collections import namedtuple
+from typing import Callable, Dict, List, Union
+from .backend_config import (
+    BackendPatternConfig,
+    DTypeConfig,
+    DTypeWithConstraints,
+    ObservationType,
+)
+from ..fuser_method_mappings import (
+    _sequential_wrapper2,
+    fuse_conv_bn,
+    fuse_conv_bn_relu,
+    fuse_linear_bn,
+    fuse_convtranspose_bn,
+)
+
+__all__: List[str] = []
+
+# TODO: rename to be more explicit, e.g. qat_conv_relu
+_ConvMetadata = namedtuple(
+    "_ConvMetadata",
+    ["root", "transpose", "bn", "reference", "transpose_reference",
+     "fused_conv_relu", "fused_conv_bn", "fused_conv_bn_relu",
+     "qat", "relu_qat", "bn_qat", "bn_relu_qat",
+     "func", "func_transpose"])
+_Conv1dMetadata = _ConvMetadata(
+    nn.Conv1d, nn.ConvTranspose1d, nn.BatchNorm1d, nnqr.Conv1d, nnqr.ConvTranspose1d,
+    nni.ConvReLU1d, nni.ConvBn1d, nni.ConvBnReLU1d,
+    nnqat.Conv1d, nniqat.ConvReLU1d, nniqat.ConvBn1d, nniqat.ConvBnReLU1d,
+    F.conv1d, F.conv_transpose1d)
+_Conv2dMetadata = _ConvMetadata(
+    nn.Conv2d, nn.ConvTranspose2d, nn.BatchNorm2d, nnqr.Conv2d, nnqr.ConvTranspose2d,
+    nni.ConvReLU2d, nni.ConvBn2d, nni.ConvBnReLU2d,
+    nnqat.Conv2d, nniqat.ConvReLU2d, nniqat.ConvBn2d, nniqat.ConvBnReLU2d,
+    F.conv2d, F.conv_transpose2d)
+_Conv3dMetadata = _ConvMetadata(
+    nn.Conv3d, nn.ConvTranspose3d, nn.BatchNorm3d, nnqr.Conv3d, nnqr.ConvTranspose3d,
+    nni.ConvReLU3d, nni.ConvBn3d, nni.ConvBnReLU3d,
+    nnqat.Conv3d, nniqat.ConvReLU3d, nniqat.ConvBn3d, nniqat.ConvBnReLU3d,
+    F.conv3d, F.conv_transpose3d)
+
+# Add constraints for fixed qparams ops like sigmoid and tanh to ensure values
+# fall within the proper ranges, e.g. [0, 1] for sigmoid, [-1, 1] for tanh
+_FIXED_QPARAM_OP_0TO1_CONSTRAINTS = DTypeWithConstraints(
+    dtype=torch.quint8,
+    quant_min_lower_bound=0,
+    quant_max_upper_bound=255,
+    scale_exact_match=1.0 / 256.0,
+    zero_point_exact_match=0,
+)
+_FIXED_QPARAM_OP_NEG1TO1_CONSTRAINTS = DTypeWithConstraints(
+    dtype=torch.quint8,
+    quant_min_lower_bound=0,
+    quant_max_upper_bound=255,
+    scale_exact_match=2.0 / 256.0,
+    zero_point_exact_match=128,
+)
+_FIXED_QPARAMS_OP_TO_CONSTRAINTS: Dict[Union[Callable, str], DTypeWithConstraints] = {
+    torch.nn.Hardsigmoid: _FIXED_QPARAM_OP_0TO1_CONSTRAINTS,
+    torch.nn.functional.hardsigmoid: _FIXED_QPARAM_OP_0TO1_CONSTRAINTS,
+    "hardsigmoid": _FIXED_QPARAM_OP_0TO1_CONSTRAINTS,
+    "hardsigmoid_": _FIXED_QPARAM_OP_0TO1_CONSTRAINTS,
+    torch.nn.Sigmoid: _FIXED_QPARAM_OP_0TO1_CONSTRAINTS,
+    torch.sigmoid: _FIXED_QPARAM_OP_0TO1_CONSTRAINTS,
+    "sigmoid": _FIXED_QPARAM_OP_0TO1_CONSTRAINTS,
+    "sigmoid_": _FIXED_QPARAM_OP_0TO1_CONSTRAINTS,
+    torch.nn.Softmax: _FIXED_QPARAM_OP_0TO1_CONSTRAINTS,
+    torch.nn.Tanh: _FIXED_QPARAM_OP_NEG1TO1_CONSTRAINTS,
+    torch.tanh: _FIXED_QPARAM_OP_NEG1TO1_CONSTRAINTS,
+    "tanh": _FIXED_QPARAM_OP_NEG1TO1_CONSTRAINTS,
+    "tanh_": _FIXED_QPARAM_OP_NEG1TO1_CONSTRAINTS,
+}
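+
+# Editorial note on the constants above: an op with output range [0, 1] mapped onto
+# quint8 (256 levels) gets scale = (1 - 0) / 256 = 1/256 and zero_point = 0, while a
+# [-1, 1] range gets scale = (1 - (-1)) / 256 = 2/256 and zero_point = 128, so that
+# the real value 0 maps exactly to the quantized value 128.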
+
+def _get_binary_op_configs(dtype_configs: List[DTypeConfig]) -> List[BackendPatternConfig]:
+    binary_op_configs: List[BackendPatternConfig] = []
+    num_tensor_args_to_observation_type_mapping = {
+        # TODO: this is not used right now since we have an extra check in prepare;
+        # will need to change this to NO_OBSERVER later after we have implemented
+        # Tensor dtype inference properly
+        0: ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT,
+        1: ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT,
+        2: ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT,
+    }
+    for op_with_quantized_bop_scalar_variant in [operator.add, torch.add, operator.mul, torch.mul]:
+        bop_patterns = [
+            (op_with_quantized_bop_scalar_variant, nn.ReLU),
+            (op_with_quantized_bop_scalar_variant, F.relu),
+            (op_with_quantized_bop_scalar_variant, torch.relu),
+            op_with_quantized_bop_scalar_variant
+        ]
+        for bop_pattern in bop_patterns:
+            binary_op_configs.append(
+                BackendPatternConfig(bop_pattern)
+                    .set_dtype_configs(dtype_configs)  # noqa: E131
+                    ._set_num_tensor_args_to_observation_type(num_tensor_args_to_observation_type_mapping))
+    # matmul
+    binary_op_configs.append(
+        BackendPatternConfig(torch.matmul)
+        .set_dtype_configs(dtype_configs)  # noqa: E131
+    )
+    return binary_op_configs
+
+def _get_linear_configs(dtype_configs: List[DTypeConfig]) -> List[BackendPatternConfig]:
+    """
+    Return all configs related to linear modules and ops.
+    """
+    observation_type = ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT
+    linear_configs: List[BackendPatternConfig] = []
+
+    # (1) Single linear modules/functions
+    # -------------------------------------
+    # linear module
+    linear_configs.append(
+        BackendPatternConfig(torch.nn.Linear)
+            .set_observation_type(observation_type)  # noqa: E131
+            .set_dtype_configs(dtype_configs)
+            .set_root_module(torch.nn.Linear)
+            .set_reference_quantized_module(nnqr.Linear)
+            .set_qat_module(nnqat.Linear))
+    # linear qat module
+    linear_configs.append(
+        BackendPatternConfig(nnqat.Linear)
+            .set_observation_type(observation_type)  # noqa: E131
+            .set_dtype_configs(dtype_configs)
+            .set_root_module(torch.nn.Linear)
+            .set_reference_quantized_module(nnqr.Linear))
+    # functional linear
+    linear_configs.append(
+        BackendPatternConfig(torch.nn.functional.linear)
+            .set_observation_type(observation_type)  # noqa: E131
+            .set_dtype_configs(dtype_configs)
+            ._set_input_type_to_index({"weight": 1, "bias": 2}))
+
+    # (2) Linear + relu
+    # -------------------
+    # 2.1 linear module + relu fusion config
+    # linear relu, linear module + relu module
+    linear_configs.append(
+        BackendPatternConfig((torch.nn.Linear, torch.nn.ReLU))
+            .set_dtype_configs(dtype_configs)  # noqa: E131
+            .set_fuser_method(_sequential_wrapper2(nni.LinearReLU))
+            .set_fused_module(nni.LinearReLU))
+    # linear relu, linear module + functional relu
+    linear_configs.append(
+        BackendPatternConfig((torch.nn.Linear, torch.nn.functional.relu))
+            .set_dtype_configs(dtype_configs)  # noqa: E131
+            .set_fuser_method(_sequential_wrapper2(nni.LinearReLU))
+            .set_fused_module(nni.LinearReLU))
+
+    # 2.2 linear module + relu, fused module configs
+    # linear relu, fused module
+    linear_configs.append(
+        BackendPatternConfig(nni.LinearReLU)
+            .set_observation_type(observation_type)  # noqa: E131
+            .set_dtype_configs(dtype_configs)
+            .set_root_module(torch.nn.Linear)
+            .set_reference_quantized_module(nnqr.Linear)
+            .set_qat_module(nniqat.LinearReLU))
+    # linear relu, qat fused module
+    linear_configs.append(
+        BackendPatternConfig(nniqat.LinearReLU)
+            .set_observation_type(observation_type)  # noqa: E131
+            .set_dtype_configs(dtype_configs)
+            .set_root_module(torch.nn.Linear)
+            .set_reference_quantized_module(nnqr.Linear))
+    # 2.3 functional linear + relu configs
+    # linear relu, functional linear + relu module
+    linear_configs.append(
+        BackendPatternConfig((F.linear, torch.nn.ReLU))
+            .set_observation_type(observation_type)  # noqa: E131
+            .set_dtype_configs(dtype_configs))
+    # linear relu, functional linear + functional relu
+    linear_configs.append(
+        BackendPatternConfig((F.linear, F.relu))
+            .set_observation_type(observation_type)  # noqa: E131
+            .set_dtype_configs(dtype_configs))
+
+    # (3) Linear + batchnorm
+    # ------------------------
+    # 3.1 linear bn fusion
+    linear_configs.append(
+        BackendPatternConfig((nn.Linear, nn.BatchNorm1d))
+            .set_dtype_configs(dtype_configs)  # noqa: E131
+            .set_fuser_method(fuse_linear_bn)
+            .set_fused_module(nni.LinearBn1d))
+
+    # 3.2 linear bn fused
+    # linear bn, fused module
+    linear_configs.append(
+        BackendPatternConfig(nni.LinearBn1d)
+            .set_observation_type(observation_type)  # noqa: E131
+            .set_dtype_configs(dtype_configs)
+            .set_root_module(torch.nn.Linear)
+            .set_reference_quantized_module(nnqr.Linear)
+            .set_qat_module(nniqat.LinearBn1d))
+    # linear bn, qat fused module
+    linear_configs.append(
+        BackendPatternConfig(nniqat.LinearBn1d)
+            .set_observation_type(observation_type)  # noqa: E131
+            .set_dtype_configs(dtype_configs)
+            .set_root_module(torch.nn.Linear)
+            .set_reference_quantized_module(nnqr.Linear))
+    return linear_configs
+
+def _get_conv_configs(dtype_configs):
+    """
+    Return all configs related to conv modules and ops.
+    """
+    conv_configs = []
+    observation_type = ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT
+    for convs in [_Conv1dMetadata, _Conv2dMetadata, _Conv3dMetadata]:
+
+        # (1) Single conv modules/functions
+        # -----------------------------------
+        # conv module
+        conv_configs.append(
+            BackendPatternConfig(convs.root)
+                .set_observation_type(observation_type)  # noqa: E131
+                .set_dtype_configs(dtype_configs)
+                .set_root_module(convs.root)
+                .set_reference_quantized_module(convs.reference)
+                .set_qat_module(convs.qat))
+        # conv qat module
+        conv_configs.append(
+            BackendPatternConfig(convs.qat)
+                .set_observation_type(observation_type)  # noqa: E131
+                .set_dtype_configs(dtype_configs)
+                .set_root_module(convs.root)
+                .set_reference_quantized_module(convs.reference))
+        # functional conv
+        conv_configs.append(
+            BackendPatternConfig(convs.func)
+                .set_observation_type(observation_type)  # noqa: E131
+                .set_dtype_configs(dtype_configs)
+                ._set_input_type_to_index({"weight": 1, "bias": 2}))
+
+        # (2) Conv + relu
+        # -----------------
+        # 2.1 conv module + relu fusion configs
+        # conv relu fusion, conv module + relu module
+        conv_configs.append(
+            BackendPatternConfig((convs.root, torch.nn.ReLU))
+                .set_dtype_configs(dtype_configs)  # noqa: E131
+                .set_fuser_method(_sequential_wrapper2(convs.fused_conv_relu))
+                .set_fused_module(convs.fused_conv_relu))
+        # conv relu fusion, conv module + functional relu
+        conv_configs.append(
+            BackendPatternConfig((convs.root, F.relu))
+                .set_dtype_configs(dtype_configs)  # noqa: E131
+                .set_fuser_method(_sequential_wrapper2(convs.fused_conv_relu))
+                .set_fused_module(convs.fused_conv_relu))
+        # 2.2 conv module + relu fused module configs
+        # conv relu, fused module
+        conv_configs.append(
+            BackendPatternConfig(convs.fused_conv_relu)
+                .set_observation_type(observation_type)  # noqa: E131
+                .set_dtype_configs(dtype_configs)
+                .set_root_module(convs.root)
+                .set_reference_quantized_module(convs.reference)
+                .set_qat_module(convs.relu_qat))
+        # conv relu, qat fused module
+        conv_configs.append(
+            BackendPatternConfig(convs.relu_qat)
+                .set_observation_type(observation_type)  # noqa: E131
+                .set_dtype_configs(dtype_configs)
+                .set_root_module(convs.root)
+                .set_reference_quantized_module(convs.reference))
+        # 2.3 functional conv + relu configs
+        # conv relu, functional conv + relu module
+        conv_configs.append(
+            BackendPatternConfig((convs.func, torch.nn.ReLU))
+                .set_observation_type(observation_type)  # noqa: E131
+                .set_dtype_configs(dtype_configs))
+        # conv relu, functional conv + functional relu
+        conv_configs.append(
+            BackendPatternConfig((convs.func, F.relu))
+                .set_observation_type(observation_type)  # noqa: E131
+                .set_dtype_configs(dtype_configs))
+
+        # fused conv relu
+        conv_configs.append(
+            BackendPatternConfig(convs.fused_conv_relu)
+                .set_dtype_configs(dtype_configs)  # noqa: E131
+                .set_qat_module(convs.relu_qat))
+
+        conv_configs.append(
+            BackendPatternConfig(convs.relu_qat)
+                .set_dtype_configs(dtype_configs)  # noqa: E131
+                .set_root_module(convs.root)
+                .set_reference_quantized_module(convs.reference))
+
+        # (3) Conv + batchnorm (+ relu)
+        # -------------------------------
+        # 3.1 conv bn fusion configs
+        # conv + bn fusion
+        conv_configs.append(
+            BackendPatternConfig((convs.root, convs.bn))
+                .set_dtype_configs(dtype_configs)  # noqa: E131
+                .set_fuser_method(fuse_conv_bn)
+                .set_fused_module(convs.fused_conv_bn))
+        # conv + bn + relu module fusion
+        conv_configs.append(
+            BackendPatternConfig((convs.root, convs.bn, nn.ReLU))
+                .set_dtype_configs(dtype_configs)  # noqa: E131
+                .set_fuser_method(fuse_conv_bn_relu)
+                .set_fused_module(convs.fused_conv_bn_relu))
+        # conv + bn + relu functional fusion
+        conv_configs.append(
+            BackendPatternConfig((convs.root, convs.bn, F.relu))
+                .set_dtype_configs(dtype_configs)  # noqa: E131
+                .set_root_module(convs.root)
+                .set_fuser_method(fuse_conv_bn_relu)
+                .set_fused_module(convs.fused_conv_bn_relu))
+        # TODO: we can add fusion for torch.relu as well
+
+        # 3.2 conv + bn (+ relu) fused module configs
+        # fused conv bn
+        conv_configs.append(
+            BackendPatternConfig(convs.fused_conv_bn)
+                .set_dtype_configs(dtype_configs)  # noqa: E131
+                .set_qat_module(convs.bn_qat))
+
+        # fused conv bn relu
+        conv_configs.append(
+            BackendPatternConfig(convs.fused_conv_bn_relu)
+                .set_dtype_configs(dtype_configs)  # noqa: E131
+                .set_qat_module(convs.bn_relu_qat))
+
+        # conv bn, qat fused module
+        conv_configs.append(
+            BackendPatternConfig(convs.bn_qat)
+                .set_observation_type(observation_type)  # noqa: E131
+                .set_dtype_configs(dtype_configs)
+                .set_root_module(convs.root)
+                .set_reference_quantized_module(convs.reference))
+        # conv bn relu, qat fused module
+        conv_configs.append(
+            BackendPatternConfig(convs.bn_relu_qat)
+                .set_observation_type(observation_type)  # noqa: E131
+                .set_dtype_configs(dtype_configs)
+                .set_root_module(convs.root)
+                .set_reference_quantized_module(convs.reference))
+
+        # (4) conv transpose and its fusion
+        # 4.1 conv transpose config
+        conv_configs.append(
+            BackendPatternConfig(convs.transpose)
+                .set_dtype_configs(dtype_configs)  # noqa: E131
+                .set_root_module(convs.transpose)
+                .set_reference_quantized_module(convs.transpose_reference))
+
+        # 4.2 conv transpose + bn fusion
+        conv_configs.append(
+            BackendPatternConfig((convs.transpose, convs.bn))
+                .set_dtype_configs(dtype_configs)  # noqa: E131
+                .set_fuser_method(fuse_convtranspose_bn)
+                .set_root_module(convs.transpose)
+                .set_reference_quantized_module(convs.transpose_reference))
+
+        # 4.3 functional conv transpose
+        conv_configs.append(
+            BackendPatternConfig(convs.func_transpose)
+                .set_dtype_configs(dtype_configs)  # noqa: E131
+                ._set_input_type_to_index({"weight": 1, "bias": 2}))
+
+    return conv_configs
+
+def _get_cat_config(dtype_configs: List[DTypeConfig]) -> BackendPatternConfig:
+    return BackendPatternConfig(torch.cat) \
+        .set_observation_type(ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT) \
+        .set_dtype_configs(dtype_configs)
+
+def _get_ln_configs(dtype_configs: List[DTypeConfig]) -> List[BackendPatternConfig]:
+    ln_configs = []
+    ln_configs.append(
+        BackendPatternConfig(torch.nn.LayerNorm)
+        .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT)  # noqa: E131
+        .set_dtype_configs(dtype_configs)
+    )
+    ln_configs.append(
+        BackendPatternConfig(torch.nn.functional.layer_norm)
+        .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT)  # noqa: E131
+        .set_dtype_configs(dtype_configs)
+        ._set_input_type_to_index({"weight": 2, "bias": 3})
+    )
+    return ln_configs
+
+def _get_default_op_configs(dtype_configs: List[DTypeConfig]) -> List[BackendPatternConfig]:
+    configs = []
+    default_ops = [
+        torch.nn.ELU,
+        torch.nn.LeakyReLU,
+        torch.nn.Hardswish,
+        torch.nn.InstanceNorm1d,
+        torch.nn.InstanceNorm2d,
+        torch.nn.InstanceNorm3d,
+        torch.nn.Dropout,
+        torch.nn.PReLU,
+        torch.nn.functional.elu,
+        torch.nn.functional.hardswish,
+        torch.nn.functional.leaky_relu,
+        torch.nn.functional.dropout,
+    ]
+    for op in default_ops:
+        configs.append(
+            BackendPatternConfig(op)
+                .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT)  # noqa: E131
+                .set_dtype_configs(dtype_configs))
+
+    configs.append(
+        BackendPatternConfig(torch.nn.functional.group_norm)
+        .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT)  # noqa: E131
+        .set_dtype_configs(dtype_configs)
+        ._set_input_type_to_index({"weight": 2, "bias": 3})
+    )
+
+    configs.append(
+        BackendPatternConfig(torch.nn.functional.instance_norm)
+        .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT)  # noqa: E131
+        .set_dtype_configs(dtype_configs)
+        ._set_input_type_to_index({"weight": 3, "bias": 4})
+    )
+    return configs
+
+def _add_fixed_qparams_to_dtype_configs(
+    dtype_configs: List[DTypeConfig],
+    constraints: DTypeWithConstraints,
+) -> List[DTypeConfig]:
+    """
+    Return a copy of the list of DTypeConfigs where activations are subject to the specified
+    constraints required for fixed qparams ops.
+
+    If the data type doesn't match the one in the constraints, simply leave the corresponding
+    DTypeConfig unchanged.
+
+    If `scale_min_lower_bound` or `scale_max_upper_bound` is specified in the activations,
+    throw an exception since these settings are incompatible with fixed qparams ops.
+    """
+    new_dtype_configs = []
+    for dtype_config in dtype_configs:
+        dc = copy.deepcopy(dtype_config)
+        for orig_constraints in [dc.input_dtype_with_constraints, dc.output_dtype_with_constraints]:
+            if orig_constraints.dtype != constraints.dtype:
+                continue
+            if orig_constraints.scale_min_lower_bound is not None:
+                raise ValueError(f"scale_min_lower_bound is invalid for fixed qparams ops: {dtype_config}")
+            if orig_constraints.scale_max_upper_bound is not None:
+                raise ValueError(f"scale_max_upper_bound is invalid for fixed qparams ops: {dtype_config}")
+            orig_constraints.quant_min_lower_bound = constraints.quant_min_lower_bound
+            orig_constraints.quant_max_upper_bound = constraints.quant_max_upper_bound
+            orig_constraints.scale_exact_match = constraints.scale_exact_match
+            orig_constraints.zero_point_exact_match = constraints.zero_point_exact_match
+        new_dtype_configs.append(dc)
+    return new_dtype_configs
+
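+# A minimal sketch of the rewrite performed above, assuming the DTypeConfig and
+# DTypeWithConstraints APIs from backend_config.py; the sigmoid-style values
+# below (scale 1/256, zero point 0) are illustrative only.
+def _example_fixed_qparams_rewrite() -> List[DTypeConfig]:
+    base = DTypeConfig(
+        input_dtype=torch.quint8,
+        output_dtype=torch.quint8,
+        weight_dtype=torch.qint8,
+        bias_dtype=torch.float,
+    )
+    sigmoid_like_constraints = DTypeWithConstraints(
+        dtype=torch.quint8,
+        scale_exact_match=1.0 / 256.0,
+        zero_point_exact_match=0,
+    )
+    # Only the quint8 activation entries pick up the exact-match constraints;
+    # the qint8 weight entry is left unchanged.
+    return _add_fixed_qparams_to_dtype_configs([base], sigmoid_like_constraints)
+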
+def _get_fixed_qparams_op_configs(dtype_configs: List[DTypeConfig]) -> List[BackendPatternConfig]:
+    fixed_qparams_op_configs = []
+    for fixed_qparam_op, constraints in _FIXED_QPARAMS_OP_TO_CONSTRAINTS.items():
+        new_dtype_configs = _add_fixed_qparams_to_dtype_configs(dtype_configs, constraints)
+        fixed_qparams_op_configs.append(
+            BackendPatternConfig(fixed_qparam_op)
+                .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT)  # noqa: E131
+                .set_dtype_configs(new_dtype_configs))
+    return fixed_qparams_op_configs
+
+def _get_share_qparams_op_configs(dtype_configs):
+    """ Get the operator config for the operators that works for both float and quantized input
+    if input is quantized, the output Tensor shares the same quantization parameter
+    with input.
+    Example operator: avgpool2d, reshape, transpose, maxpool2d
+    Example observed operator:
+    observer_0 - avgpool2d - observer_0 (same observer instance as input)
+    """
+
+    def _get_share_qparams_op_backend_config(op):
+        return BackendPatternConfig(op) \
+            .set_observation_type(ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT) \
+            .set_dtype_configs(dtype_configs)
+
+    share_qparams_ops = [
+        torch.nn.AdaptiveAvgPool1d,
+        torch.nn.AdaptiveAvgPool2d,
+        torch.nn.AdaptiveAvgPool3d,
+        torch.nn.AvgPool1d,
+        torch.nn.AvgPool2d,
+        torch.nn.AvgPool3d,
+        torch.nn.Hardtanh,
+        torch.nn.Identity,
+        torch.nn.MaxPool1d,
+        torch.nn.MaxPool2d,
+        torch.nn.MaxPool3d,
+        torch.nn.PixelShuffle,
+        torch.nn.PixelUnshuffle,
+        torch.nn.ReLU,
+        torch.nn.ReLU6,
+        torch.adaptive_avg_pool1d,
+        torch.nn.functional.adaptive_avg_pool2d,
+        torch.nn.functional.adaptive_avg_pool3d,
+        torch.nn.functional.hardtanh,
+        torch.nn.functional.hardtanh_,
+        torch.nn.functional.interpolate,
+        torch.nn.functional.max_pool1d,
+        torch.nn.functional.max_pool2d,
+        torch.nn.functional.max_pool3d,
+        torch.nn.functional.pixel_shuffle,
+        torch.nn.functional.pixel_unshuffle,
+        torch.nn.functional.relu,
+        torch.nn.functional.relu6,
+        torch.avg_pool1d,
+        torch._C._nn.avg_pool2d,
+        torch._C._nn.avg_pool3d,
+        torch.clamp,
+        torch.flatten,
+        torch.mean,
+        torch.narrow,
+        torch.repeat_interleave,
+        torch.transpose,
+        torch.squeeze,
+        torch.stack,
+        torch.unsqueeze,
+        operator.floordiv,
+        "contiguous",
+        "clamp",
+        "detach",
+        "detach_",
+        "mean",
+        "permute",
+        "repeat",
+        "repeat_interleave",
+        "reshape",
+        "resize_",
+        "relu",
+        "relu_",
+        "squeeze",
+        "squeeze_",
+        "transpose",
+        "unsqueeze",
+        "unsqueeze_",
+        "view"
+    ]
+    return [_get_share_qparams_op_backend_config(op) for op in share_qparams_ops]
+
+def _get_bn_configs(dtype_configs: List[DTypeConfig]) -> List[BackendPatternConfig]:
+    """ Get configs related to batchnorm. """
+    bn_configs = []
+    bn_to_fused_bn = {
+        torch.nn.BatchNorm2d: nni.BNReLU2d,
+        torch.nn.BatchNorm3d: nni.BNReLU3d,
+    }
+    for bn in bn_to_fused_bn.keys():
+        fused_bn = bn_to_fused_bn[bn]
+        # bn module + relu module fusion config
+        bn_configs.append(
+            BackendPatternConfig((bn, nn.ReLU))
+                .set_dtype_configs(dtype_configs)  # noqa: E131
+                .set_fuser_method(_sequential_wrapper2(fused_bn))
+                .set_fused_module(fused_bn))
+        # bn module + F.relu fusion config
+        bn_configs.append(
+            BackendPatternConfig((bn, F.relu))
+                .set_dtype_configs(dtype_configs)  # noqa: E131
+                .set_fuser_method(_sequential_wrapper2(fused_bn))
+                .set_fused_module(fused_bn))
+        bn_configs.append(
+            BackendPatternConfig(bn)
+                .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT)  # noqa: E131
+                .set_dtype_configs(dtype_configs))
+
+    # fused bn configs
+    for fused_bn in bn_to_fused_bn.values():
+        bn_configs.append(
+            BackendPatternConfig(fused_bn)
+                .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT)  # noqa: E131
+                .set_dtype_configs(dtype_configs))
+    return bn_configs
+
+def _get_rnn_op_configs(dtype_configs: List[DTypeConfig]) -> List[BackendPatternConfig]:
+    rnn_op_configs = []
+    for rnn_op, ref_rnn_op in [
+            (nn.GRUCell, nnqr.GRUCell),
+            (nn.LSTMCell, nnqr.LSTMCell),
+            (nn.RNNCell, nnqr.RNNCell),
+            (nn.LSTM, nnqr.LSTM),
+            (nn.GRU, nnqr.GRU)
+    ]:
+        rnn_op_configs.append(
+            BackendPatternConfig(rnn_op)
+                .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT)  # noqa: E131
+                .set_dtype_configs(dtype_configs)
+                .set_root_module(rnn_op)
+                .set_reference_quantized_module(ref_rnn_op))
+    return rnn_op_configs
+
+def _get_embedding_op_configs(dtype_configs: List[DTypeConfig]) -> List[BackendPatternConfig]:
+    embedding_op_configs = []
+    for embedding_op, qat_embedding_op, ref_embedding_op in [
+            (nn.Embedding, nnqat.Embedding, nnqr.Embedding),
+            (nn.EmbeddingBag, nnqat.EmbeddingBag, nnqr.EmbeddingBag),
+    ]:
+        embedding_op_configs.append(
+            BackendPatternConfig(embedding_op)
+                .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT)  # noqa: E131
+                .set_dtype_configs(dtype_configs)
+                .set_qat_module(qat_embedding_op)
+                .set_root_module(embedding_op)
+                .set_reference_quantized_module(ref_embedding_op))
+
+        # config for qat op
+        embedding_op_configs.append(
+            BackendPatternConfig(qat_embedding_op)
+                .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT)  # noqa: E131
+                .set_dtype_configs(dtype_configs)
+                .set_root_module(embedding_op)
+                .set_reference_quantized_module(ref_embedding_op))
+    return embedding_op_configs
+
+def _get_tensor_info_op_configs(dtype_configs):
+    """
+    These ops work on tensors of different dtypes but return non-tensors
+    containing information about the input tensor.
+    """
+
+    def _get_config(op):
+        return BackendPatternConfig(op) \
+            .set_observation_type(ObservationType.INPUT_OUTPUT_NOT_OBSERVED) \
+            .set_dtype_configs(dtype_configs)
+
+    return [_get_config(op) for op in ("shape", "size")]
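+
+# A minimal sketch of how the private helpers in this file are typically stitched
+# together; the backend name and the single dtype config below are illustrative only.
+def _example_build_backend_config():
+    from torch.ao.quantization.backend_config import BackendConfig
+    dtype_configs = [
+        DTypeConfig(
+            input_dtype=torch.quint8,
+            output_dtype=torch.quint8,
+            weight_dtype=torch.qint8,
+            bias_dtype=torch.float,
+        )
+    ]
+    return (
+        BackendConfig("example_backend")
+        .set_backend_pattern_configs(_get_linear_configs(dtype_configs))
+        .set_backend_pattern_configs(_get_conv_configs(dtype_configs))
+        .set_backend_pattern_configs(_get_bn_configs(dtype_configs))
+        .set_backend_pattern_configs(_get_share_qparams_op_configs(dtype_configs))
+    )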
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/_qnnpack_pt2e.py b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/_qnnpack_pt2e.py
new file mode 100644
index 0000000000000000000000000000000000000000..09bc1a9453c001cfbb337417d5b84c8d52d957da
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/_qnnpack_pt2e.py
@@ -0,0 +1,160 @@
+import operator
+import torch
+from torch.ao.quantization.backend_config import (
+    BackendConfig,
+    DTypeConfig,
+    ObservationType,
+    BackendPatternConfig,
+)
+
+from typing import List
+
+weighted_op_quint8_dtype_config = DTypeConfig(
+    input_dtype=torch.quint8,
+    output_dtype=torch.quint8,
+    weight_dtype=torch.qint8,
+    bias_dtype=torch.float,
+)
+
+def get_linear_configs():
+    linear_configs = []
+    observation_type = ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT
+    dtype_configs = [weighted_op_quint8_dtype_config]
+
+    # TODO: need to fix the way we insert observers for this pattern
+    # should be solved in the new fusion API
+    # reason that this doesn't work: the pattern is a bit complicated and we don't
+    # have a way to specify which input of the pattern we would like to observe
+    # pattern:
+    # bias input weight
+    # \     |    /
+    #  \    |   t
+    #   \   |  /
+    #    addmm
+    # we want to observe "weight" as weight, but there is no way to convey this
+    # information with current pattern language
+    #
+    # right now:
+    # original:
+    #         weight - t \
+    #         input  - addmm
+    # observed (no hack):
+    #      weight - t - observer \
+    #       input - observer - addmm
+    # target:
+    #      weight - observer - t \
+    #        input - observer - addmm
+
+    # def root_node_getter(node_pattern):
+    #     addmm, bias, act, weight = node_pattern
+    #     return addmm
+
+    # linear_configs.append(
+    #     BackendPatternConfig((torch.ops.aten.addmm.default, MatchAllNode, MatchAllNode, torch.ops.aten.t.default))
+    #     .set_observation_type(observation_type)  # noqa: E131
+    #     .set_dtype_configs(dtype_configs)
+    #     ._set_root_node_getter(root_node_getter))
+
+    linear_configs.append(
+        BackendPatternConfig(torch.ops.aten.addmm.default)
+        .set_observation_type(observation_type)  # noqa: E131
+        .set_dtype_configs(dtype_configs)
+        ._set_input_type_to_index({"weight": 2, "bias": 0})
+    )
+    # linear is decomposed to `t - mm` if bias is not present
+    linear_configs.append(
+        BackendPatternConfig(torch.ops.aten.mm.default)
+        .set_observation_type(observation_type)  # noqa: E131
+        .set_dtype_configs(dtype_configs)
+        ._set_input_type_to_index({"weight": 1})
+    )
+    return linear_configs
+
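+# A minimal sketch of the decomposition the index mappings above refer to: with a
+# bias, linear decomposes to addmm(bias, x, weight.t()), so the bias is argument 0
+# and the (transposed) weight is argument 2; without a bias it decomposes to
+# mm(x, weight.t()), where the weight is argument 1. Shapes are illustrative only.
+def _example_linear_decomposition():
+    x = torch.randn(4, 8)
+    weight = torch.randn(16, 8)
+    bias = torch.randn(16)
+    with_bias = torch.ops.aten.addmm.default(bias, x, weight.t())
+    without_bias = torch.ops.aten.mm.default(x, weight.t())
+    assert with_bias.shape == without_bias.shape == (4, 16)
+    return with_bias, without_bias
+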
+def get_conv_configs():
+    conv_configs = []
+    observation_type = ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT
+    dtype_configs = [weighted_op_quint8_dtype_config]
+    conv_configs.append(
+        BackendPatternConfig(torch.ops.aten.convolution.default)
+        .set_observation_type(observation_type)  # noqa: E131
+        .set_dtype_configs(dtype_configs)
+        ._set_input_type_to_index({"weight": 1, "bias": 2})
+    )
+    conv_configs.append(
+        BackendPatternConfig((torch.ops.aten.convolution.default, torch.ops.aten.relu.default))
+        .set_observation_type(observation_type)  # noqa: E131
+        .set_dtype_configs(dtype_configs)
+        ._set_input_type_to_index({"weight": 1, "bias": 2})
+    )
+    # TODO: remove when functionalization is supported in PT2 mode
+    conv_configs.append(
+        BackendPatternConfig((torch.ops.aten.convolution.default, torch.ops.aten.relu_.default))
+        .set_observation_type(observation_type)  # noqa: E131
+        .set_dtype_configs(dtype_configs)
+        ._set_input_type_to_index({"weight": 1, "bias": 2})
+    )
+    return conv_configs
+
+def get_pooling_configs():
+    backend_pattern_configs = []
+    observation_type = ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT
+    dtype_configs = [weighted_op_quint8_dtype_config]
+
+    def root_node_getter(node_pattern):
+        getitem, maxpool, index = node_pattern
+        return maxpool
+
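+    # In the complex pattern format below, the last op in the sequence comes first,
+    # followed by its inputs: the tuple matches getitem(max_pool2d_with_indices(...), 0),
+    # i.e. selecting the values (index 0) from the op's (values, indices) output.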
+    backend_pattern_configs.append(
+        BackendPatternConfig()
+        ._set_pattern_complex_format((operator.getitem, torch.ops.aten.max_pool2d_with_indices.default, 0))
+        .set_observation_type(observation_type)  # noqa: E131
+        .set_dtype_configs(dtype_configs)
+        ._set_root_node_getter(root_node_getter)
+    )
+
+    return backend_pattern_configs
+
+def get_relu_configs():
+    backend_pattern_configs = []
+    observation_type = ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT
+    dtype_configs = [weighted_op_quint8_dtype_config]
+    backend_pattern_configs.append(
+        BackendPatternConfig(torch.ops.aten.relu.default)
+        .set_observation_type(observation_type)  # noqa: E131
+        .set_dtype_configs(dtype_configs))
+    return backend_pattern_configs
+
+def get_binary_op_configs():
+    binary_op_configs: List[BackendPatternConfig] = []
+    dtype_configs = [weighted_op_quint8_dtype_config]
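+    # Keyed by how many of the binary op's arguments are Tensors (the rest being
+    # scalars): with exactly one Tensor argument the output can share the input's
+    # observer, otherwise a separate output observer is used.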
+    num_tensor_args_to_observation_type_mapping = {
+        # TODO: this is not used right now since we have extra check in prepare
+        # will need to change this to NO_OBSERVER later after we implemented
+        # Tensor dtype inference properly
+        0: ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT,
+        1: ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT,
+        2: ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT,
+    }
+    for op_with_quantized_bop_scalar_variant in [torch.ops.aten.add.Tensor, torch.ops.aten.add_.Tensor]:
+        bop_patterns = [
+            (op_with_quantized_bop_scalar_variant, torch.ops.aten.relu.default),
+            op_with_quantized_bop_scalar_variant,
+            # TODO: remove when functionalization is supported in pt2_mode
+            (op_with_quantized_bop_scalar_variant, torch.ops.aten.relu_.default),
+        ]
+        for bop_pattern in bop_patterns:
+            binary_op_configs.append(
+                BackendPatternConfig(bop_pattern)
+                    .set_dtype_configs(dtype_configs)  # noqa: E131
+                    ._set_num_tensor_args_to_observation_type(num_tensor_args_to_observation_type_mapping))
+
+    return binary_op_configs
+
+def get_qnnpack_pt2e_backend_config():
+    return (
+        BackendConfig("qnnpack_pytorch_2.0_export")
+        .set_backend_pattern_configs(get_linear_configs())
+        .set_backend_pattern_configs(get_binary_op_configs())
+        .set_backend_pattern_configs(get_conv_configs())
+        .set_backend_pattern_configs(get_pooling_configs())
+        .set_backend_pattern_configs(get_relu_configs())
+    )
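+
+# A minimal sketch of consuming the config built above; it relies only on the
+# public BackendConfig.configs property.
+def _example_inspect_backend_config():
+    backend_config = get_qnnpack_pt2e_backend_config()
+    # Patterns registered through the complex format show up with pattern=None;
+    # the linear (addmm) pattern from get_linear_configs() is registered directly.
+    assert any(config.pattern is torch.ops.aten.addmm.default
+               for config in backend_config.configs)
+    return backend_config.configs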
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/backend_config.py b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/backend_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..c287f9aca396b29fdc8b71ce7913c9ec1361d67e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/backend_config.py
@@ -0,0 +1,659 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional, Type, Union
+
+import torch
+from torch.ao.quantization.utils import Pattern
+from enum import Enum
+
+
+__all__ = [
+    "BackendConfig",
+    "BackendPatternConfig",
+    "DTypeConfig",
+    "DTypeWithConstraints",
+    "ObservationType",
+]
+
+
+# DTypeConfig dict keys
+INPUT_DTYPE_DICT_KEY = "input_dtype"
+OUTPUT_DTYPE_DICT_KEY = "output_dtype"
+WEIGHT_DTYPE_DICT_KEY = "weight_dtype"
+BIAS_DTYPE_DICT_KEY = "bias_dtype"
+IS_DYNAMIC_DICT_KEY = "is_dynamic"
+
+# BackendConfig dict keys
+NAME_DICT_KEY = "name"
+CONFIGS_DICT_KEY = "configs"
+
+# BackendPatternConfig dict keys
+PATTERN_DICT_KEY = "pattern"
+PATTERN_COMPLEX_FORMAT_DICT_KEY = "pattern_complex_format"
+OBSERVATION_TYPE_DICT_KEY = "observation_type"
+DTYPE_CONFIGS_DICT_KEY = "dtype_configs"
+ROOT_MODULE_DICT_KEY = "root_module"
+QAT_MODULE_DICT_KEY = "qat_module"
+REFERENCE_QUANTIZED_MODULE_DICT_KEY = "reference_quantized_module_for_root"
+FUSED_MODULE_DICT_KEY = "fused_module"
+FUSER_METHOD_DICT_KEY = "fuser_method"
+ROOT_NODE_GETTER_DICT_KEY = "root_node_getter"
+EXTRA_INPUTS_GETTER_DICT_KEY = "extra_inputs_getter"
+NUM_TENSOR_ARGS_TO_OBSERVATION_TYPE_DICT_KEY = "num_tensor_args_to_observation_type"
+INPUT_TYPE_TO_INDEX_DICT_KEY = "input_type_to_index"
+
+
+# TODO: maybe rename this to something that's not related to observer
+# e.g. QParamsType
+class ObservationType(Enum):
+    """ An enum that represents different ways of how an operator/operator pattern
+    should be observed
+    """
+
+    OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT = 0
+    """this means input and output are observed with different observers, based
+    on qconfig.activation
+    example: conv, linear, softmax
+    """
+
+    OUTPUT_SHARE_OBSERVER_WITH_INPUT = 1
+    """this means the output will use the same observer instance as input, based
+    on qconfig.activation
+    example: torch.cat, maxpool
+    """
+
+    INPUT_OUTPUT_NOT_OBSERVED = 2
+    """this means the input and output are never observed
+    example: x.shape, x.size
+    """
+
+
+@dataclass
+class DTypeWithConstraints:
+    """
+    Config for specifying additional constraints for a given dtype, such as quantization
+    value ranges, scale value ranges, and fixed quantization params, to be used in
+    :class:`~torch.ao.quantization.backend_config.DTypeConfig`.
+
+    The constraints currently supported are:
+
+    * `quant_min_lower_bound` and `quant_max_upper_bound`: Lower and upper
+      bounds for the minimum and maximum quantized values respectively. If
+      the QConfig’s `quant_min` and `quant_max` fall outside this range,
+      then the QConfig will be ignored.
+
+    * `scale_min_lower_bound` and `scale_max_upper_bound`: Lower and upper
+      bounds for the minimum and maximum scale values respectively. If the
+      QConfig’s minimum scale value (currently exposed as `eps`) falls below
+      the lower bound, then the QConfig will be ignored. Note that the upper
+      bound is currently not enforced.
+
+    * `scale_exact_match` and `zero_point_exact_match`: Exact match requirements
+      for scale and zero point, to be used for operators with fixed quantization
+      parameters such as sigmoid and tanh. If the observer specified in the QConfig
+      is neither `FixedQParamsObserver` nor `FixedQParamsFakeQuantize`, or if
+      the quantization parameters don't match, then the QConfig will be ignored.
+    """
+    dtype: Optional[torch.dtype] = None
+    quant_min_lower_bound: Union[int, float, None] = None
+    quant_max_upper_bound: Union[int, float, None] = None
+    scale_min_lower_bound: Union[int, float, None] = None
+    scale_max_upper_bound: Union[int, float, None] = None
+    scale_exact_match: Optional[float] = None
+    zero_point_exact_match: Optional[int] = None
+
+
+@dataclass
+class DTypeConfig:
+    """
+    Config object that specifies the supported data types passed as arguments to
+    quantize ops in the reference model spec, for input and output activations,
+    weights, and biases.
+
+    For example, consider the following reference model:
+
+      quant1 - [dequant1 - fp32_linear - quant2] - dequant2
+
+    The pattern in the square brackets refers to the reference pattern of
+    statically quantized linear. Setting the input dtype as `torch.quint8`
+    in the DTypeConfig means we pass in `torch.quint8` as the dtype argument
+    to the first quantize op (quant1). Similarly, setting the output dtype as
+    `torch.quint8` means we pass in `torch.quint8` as the dtype argument to
+    the second quantize op (quant2).
+
+    Note that the dtype here does not refer to the interface dtypes of the
+    op. For example, the "input dtype" here is not the dtype of the input
+    tensor passed to the quantized linear op. Though it can still be the
+    same as the interface dtype, this is not always the case, e.g. the
+    interface dtype is fp32 in dynamic quantization but the "input dtype"
+    specified in the DTypeConfig would still be quint8. The semantics of
+    dtypes here are the same as the semantics of the dtypes specified in
+    the observers.
+
+    These dtypes are matched against the ones specified in the user’s
+    QConfig. If there is a match, and the QConfig satisfies the constraints
+    specified in the DTypeConfig (if any), then we will quantize the given
+    pattern using this DTypeConfig. Otherwise, the QConfig is ignored and
+    the pattern will not be quantized.
+
+    Example usage::
+
+        >>> # xdoctest: +SKIP(failing)
+        >>> dtype_config1 = DTypeConfig(
+        ...     input_dtype=torch.quint8,
+        ...     output_dtype=torch.quint8,
+        ...     weight_dtype=torch.qint8,
+        ...     bias_dtype=torch.float)
+
+        >>> dtype_config2 = DTypeConfig(
+        ...     input_dtype=DTypeWithConstraints(
+        ...         dtype=torch.quint8,
+        ...         quant_min_lower_bound=0,
+        ...         quant_max_upper_bound=255,
+        ...     ),
+        ...     output_dtype=DTypeWithConstraints(
+        ...         dtype=torch.quint8,
+        ...         quant_min_lower_bound=0,
+        ...         quant_max_upper_bound=255,
+        ...     ),
+        ...     weight_dtype=DTypeWithConstraints(
+        ...         dtype=torch.qint8,
+        ...         quant_min_lower_bound=-128,
+        ...         quant_max_upper_bound=127,
+        ...     ),
+        ...     bias_dtype=torch.float)
+
+        >>> dtype_config1.input_dtype
+        torch.quint8
+
+        >>> dtype_config2.input_dtype
+        torch.quint8
+
+        >>> dtype_config2.input_dtype_with_constraints
+        DTypeWithConstraints(dtype=torch.quint8, quant_min_lower_bound=0, quant_max_upper_bound=255, \
+scale_min_lower_bound=None, scale_max_upper_bound=None, scale_exact_match=None, \
+zero_point_exact_match=None)
+    """
+    input_dtype_with_constraints: DTypeWithConstraints
+    output_dtype_with_constraints: DTypeWithConstraints
+    weight_dtype_with_constraints: DTypeWithConstraints
+    bias_dtype: Optional[torch.dtype]
+    is_dynamic: Optional[bool]
+
+    def __init__(
+        self,
+        input_dtype: Union[torch.dtype, DTypeWithConstraints, None] = None,
+        output_dtype: Union[torch.dtype, DTypeWithConstraints, None] = None,
+        weight_dtype: Union[torch.dtype, DTypeWithConstraints, None] = None,
+        bias_dtype: Optional[torch.dtype] = None,
+        is_dynamic: Optional[bool] = None,
+    ):
+        if isinstance(input_dtype, DTypeWithConstraints):
+            self.input_dtype_with_constraints = input_dtype
+        else:
+            self.input_dtype_with_constraints = DTypeWithConstraints(dtype=input_dtype)
+
+        if isinstance(output_dtype, DTypeWithConstraints):
+            self.output_dtype_with_constraints = output_dtype
+        else:
+            self.output_dtype_with_constraints = DTypeWithConstraints(dtype=output_dtype)
+
+        if isinstance(weight_dtype, DTypeWithConstraints):
+            self.weight_dtype_with_constraints = weight_dtype
+        else:
+            self.weight_dtype_with_constraints = DTypeWithConstraints(dtype=weight_dtype)
+
+        self.bias_dtype = bias_dtype
+        self.is_dynamic = is_dynamic
+
+    @property
+    def input_dtype(self) -> Optional[torch.dtype]:
+        return self.input_dtype_with_constraints.dtype
+
+    @property
+    def output_dtype(self) -> Optional[torch.dtype]:
+        return self.output_dtype_with_constraints.dtype
+
+    @property
+    def weight_dtype(self) -> Optional[torch.dtype]:
+        return self.weight_dtype_with_constraints.dtype
+
+    @classmethod
+    def from_dict(cls, dtype_config_dict: Dict[str, Any]) -> DTypeConfig:
+        """
+        Create a ``DTypeConfig`` from a dictionary with the following items (all optional):
+            "input_dtype": torch.dtype or ``DTypeWithConstraints``
+            "output_dtype": torch.dtype or ``DTypeWithConstraints``
+            "weight_dtype": torch.dtype or ``DTypeWithConstraints``
+            "bias_type": torch.dtype
+            "is_dynamic": bool
+        """
+        input_dtype = dtype_config_dict.get(INPUT_DTYPE_DICT_KEY, None)
+        if input_dtype is not None and not isinstance(input_dtype, (torch.dtype, DTypeWithConstraints)):
+            raise ValueError("Expected input_dtype to be a torch.dtype or DTypeWithConstraints")
+        output_dtype = dtype_config_dict.get(OUTPUT_DTYPE_DICT_KEY, None)
+        if output_dtype is not None and not isinstance(output_dtype, (torch.dtype, DTypeWithConstraints)):
+            raise ValueError("Expected output_dtype to be a torch.dtype or DTypeWithConstraints")
+        weight_dtype = dtype_config_dict.get(WEIGHT_DTYPE_DICT_KEY, None)
+        if weight_dtype is not None and not isinstance(weight_dtype, (torch.dtype, DTypeWithConstraints)):
+            raise ValueError("Expected weight_dtype to be a torch.dtype or DTypeWithConstraints")
+        bias_dtype = dtype_config_dict.get(BIAS_DTYPE_DICT_KEY, None)
+        is_dynamic = dtype_config_dict.get(IS_DYNAMIC_DICT_KEY, None)
+        return cls(input_dtype, output_dtype, weight_dtype, bias_dtype, is_dynamic)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Convert this ``DTypeConfig`` to a dictionary with the items described in
+        :func:`~torch.ao.quantization.backend_config.DTypeConfig.from_dict`.
+        """
+        dtype_config_dict: Dict[str, Any] = {}
+        if self.input_dtype is not None:
+            dtype_config_dict[INPUT_DTYPE_DICT_KEY] = self.input_dtype_with_constraints
+        if self.output_dtype is not None:
+            dtype_config_dict[OUTPUT_DTYPE_DICT_KEY] = self.output_dtype_with_constraints
+        if self.weight_dtype is not None:
+            dtype_config_dict[WEIGHT_DTYPE_DICT_KEY] = self.weight_dtype_with_constraints
+        if self.bias_dtype is not None:
+            dtype_config_dict[BIAS_DTYPE_DICT_KEY] = self.bias_dtype
+        if self.is_dynamic is not None:
+            dtype_config_dict[IS_DYNAMIC_DICT_KEY] = self.is_dynamic
+        return dtype_config_dict
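+
+# A minimal sketch of the dict round trip described in from_dict/to_dict above;
+# the dtypes chosen below are illustrative only.
+def _example_dtype_config_round_trip() -> Dict[str, Any]:
+    dtype_config = DTypeConfig.from_dict({
+        "input_dtype": torch.quint8,
+        "output_dtype": torch.quint8,
+        "weight_dtype": torch.qint8,
+        "bias_dtype": torch.float,
+    })
+    # Plain dtypes are normalized into DTypeWithConstraints internally, so the
+    # round-tripped dict holds DTypeWithConstraints for the activations and the
+    # weight, and the plain torch.dtype for the bias.
+    return dtype_config.to_dict()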
+
+
+class BackendConfig:
+    # TODO: refer to NativeBackendConfig once that is implemented
+    """Config that defines the set of patterns that can be quantized on a given backend, and how reference
+    quantized models can be produced from these patterns.
+
+    A pattern in this context refers to a module, a functional, an operator, or a directed acyclic graph
+    of the above. Each pattern supported on the target backend can be individually configured through
+    :class:`~torch.ao.quantization.backend_config.BackendPatternConfig` in terms of:
+
+    (1) The supported input/output activation, weight, and bias data types
+
+    (2) How observers and quant/dequant ops are inserted in order to construct the reference pattern, and
+
+    (3) (Optionally) Fusion, QAT, and reference module mappings.
+
+    The format of the patterns is described in:
+    https://github.com/pytorch/pytorch/blob/master/torch/ao/quantization/backend_config/README.md
+
+    Example usage::
+
+        import torch
+        from torch.ao.quantization.backend_config import (
+            BackendConfig,
+            BackendPatternConfig,
+            DTypeConfig,
+            ObservationType,
+        )
+
+        weighted_int8_dtype_config = DTypeConfig(
+            input_dtype=torch.quint8,
+            output_dtype=torch.quint8,
+            weight_dtype=torch.qint8,
+            bias_dtype=torch.float)
+
+        def fuse_conv2d_relu(is_qat, conv, relu):
+            return torch.ao.nn.intrinsic.ConvReLU2d(conv, relu)
+
+        # For quantizing Linear
+        linear_config = BackendPatternConfig(torch.nn.Linear) \
+            .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT) \
+            .add_dtype_config(weighted_int8_dtype_config) \
+            .set_root_module(torch.nn.Linear) \
+            .set_qat_module(torch.ao.nn.qat.Linear) \
+            .set_reference_quantized_module(torch.ao.nn.quantized.reference.Linear)
+
+        # For fusing Conv2d + ReLU into ConvReLU2d
+        conv_relu_config = BackendPatternConfig((torch.nn.Conv2d, torch.nn.ReLU)) \
+            .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT) \
+            .add_dtype_config(weighted_int8_dtype_config) \
+            .set_fused_module(torch.ao.nn.intrinsic.ConvReLU2d) \
+            .set_fuser_method(fuse_conv2d_relu)
+
+        # For quantizing ConvReLU2d
+        fused_conv_relu_config = BackendPatternConfig(torch.ao.nn.intrinsic.ConvReLU2d) \
+            .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT) \
+            .add_dtype_config(weighted_int8_dtype_config) \
+            .set_root_module(torch.nn.Conv2d) \
+            .set_qat_module(torch.ao.nn.intrinsic.qat.ConvReLU2d) \
+            .set_reference_quantized_module(torch.ao.nn.quantized.reference.Conv2d)
+
+        backend_config = BackendConfig("my_backend") \
+            .set_backend_pattern_config(linear_config) \
+            .set_backend_pattern_config(conv_relu_config) \
+            .set_backend_pattern_config(fused_conv_relu_config)
+
+    """
+    def __init__(self, name: str = ""):
+        self.name = name
+        # Store all BackendPatternConfigs in a map to handle duplicates
+        # Note: the key in this map uses the complex reversed tuple format.
+        # This is intended only for internal use; users who wish to access
+        # the original patterns should go through `self.configs` instead.
+        self._pattern_complex_format_to_config: Dict[Pattern, BackendPatternConfig] = {}
+
+    def __repr__(self):
+        return f"BackendConfig({self.__dict__})"
+
+    def set_name(self, name: str) -> BackendConfig:
+        """
+        Set the name of the target backend.
+        """
+        self.name = name
+        return self
+
+    def set_backend_pattern_config(self, config: BackendPatternConfig) -> BackendConfig:
+        """
+        Set the config for a pattern that can be run on the target backend.
+        This overrides any existing config for the given pattern.
+        """
+        # Avoid circular dependencies
+        pattern_complex_format = torch.ao.quantization.backend_config.utils \
+            ._get_pattern_in_reversed_nested_tuple_format(config)  # type: ignore[attr-defined]
+        self._pattern_complex_format_to_config[pattern_complex_format] = config
+        return self
+
+    def set_backend_pattern_configs(self, configs: List[BackendPatternConfig]) -> BackendConfig:
+        """
+        Set the configs for patterns that can be run on the target backend.
+        This overrides any existing config for a given pattern that was previously registered.
+        """
+        for conf in configs:
+            self.set_backend_pattern_config(conf)
+        return self
+
+    @property
+    def configs(self) -> List[BackendPatternConfig]:
+        """
+        Return a copy of the list of configs set in this `BackendConfig`.
+        """
+        return list(self._pattern_complex_format_to_config.values())
+
+    @classmethod
+    def from_dict(cls, backend_config_dict: Dict[str, Any]) -> BackendConfig:
+        """
+        Create a ``BackendConfig`` from a dictionary with the following items:
+
+            "name": the name of the target backend
+
+            "configs": a list of dictionaries that each represents a `BackendPatternConfig`
+
+        """
+        conf = cls(backend_config_dict.get(NAME_DICT_KEY, ""))
+        for d in backend_config_dict.get(CONFIGS_DICT_KEY, []):
+            if isinstance(d, BackendPatternConfig):
+                conf.set_backend_pattern_config(d)
+            elif isinstance(d, Dict):
+                conf.set_backend_pattern_config(BackendPatternConfig.from_dict(d))
+            else:
+                raise ValueError(f"Expected backend_config_dict['{CONFIGS_DICT_KEY}'] to be a dictionary")
+        return conf
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Convert this ``BackendConfig`` to a dictionary with the items described in
+        :func:`~torch.ao.quantization.backend_config.BackendConfig.from_dict`.
+        """
+        return {
+            NAME_DICT_KEY: self.name,
+            CONFIGS_DICT_KEY: [c.to_dict() for c in self.configs],
+        }
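+
+# A minimal sketch of the override behavior documented in set_backend_pattern_config:
+# registering a second config for the same pattern replaces the first. The dtype
+# values used here are illustrative only.
+def _example_backend_config_override():
+    first = BackendPatternConfig(torch.nn.Linear).set_dtype_configs(
+        [DTypeConfig(input_dtype=torch.quint8, output_dtype=torch.quint8)])
+    second = BackendPatternConfig(torch.nn.Linear).set_dtype_configs(
+        [DTypeConfig(input_dtype=torch.qint8, output_dtype=torch.qint8)])
+    backend_config = (
+        BackendConfig("example")
+        .set_backend_pattern_config(first)
+        .set_backend_pattern_config(second)
+    )
+    # Only the most recently registered config for torch.nn.Linear survives.
+    assert backend_config.configs == [second]
+    return backend_config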
+
+
+class BackendPatternConfig:
+    """
+    Config object that specifies quantization behavior for a given operator pattern.
+    For a detailed example usage, see :class:`~torch.ao.quantization.backend_config.BackendConfig`.
+    """
+    def __init__(self, pattern: Optional[Pattern] = None):
+        self.pattern: Optional[Pattern] = pattern
+        self.observation_type = ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT
+        self.dtype_configs: List[DTypeConfig] = []
+        self.root_module: Optional[Type[torch.nn.Module]] = None
+        self.qat_module: Optional[Type[torch.nn.Module]] = None
+        self.reference_quantized_module: Optional[Type[torch.nn.Module]] = None
+        self.fused_module: Optional[Type[torch.nn.Module]] = None
+        self.fuser_method: Optional[Callable] = None
+
+        # Temporary/internal configs
+        self._root_node_getter: Optional[Callable] = None
+        self._extra_inputs_getter: Optional[Callable] = None
+        self._num_tensor_args_to_observation_type: Dict[int, ObservationType] = {}
+        self._input_type_to_index: Dict[str, int] = {}
+        self._pattern_complex_format: Optional[Pattern] = None
+
+    def __repr__(self):
+        dict_nonempty = {
+            k: v for k, v in self.__dict__.items()
+            if (
+                (not isinstance(v, (list, dict)) and v is not None)
+                or (isinstance(v, (list, dict)) and len(v) > 0)
+            )
+        }
+        return f"BackendPatternConfig({dict_nonempty})"
+
+    def set_pattern(self, pattern: Pattern) -> BackendPatternConfig:
+        """
+        Set the pattern to configure.
+
+        The pattern can be a float module, functional operator, pytorch operator, or a tuple
+        combination of the above. Tuple patterns are treated as sequential patterns, and
+        currently only tuples of 2 or 3 elements are supported.
+        """
+        if self._pattern_complex_format is not None:
+            raise ValueError("Only one of 'pattern' or 'pattern_complex_format' can be set")
+        self.pattern = pattern
+        return self
+
+    def set_observation_type(self, observation_type: ObservationType) -> BackendPatternConfig:
+        """
+        Set how observers should be inserted in the graph for this pattern.
+
+        Observation type here refers to how observers (or quant-dequant ops) will be placed
+        in the graph. This is used to produce the desired reference patterns understood by
+        the backend. Weighted ops such as linear and conv require different observers
+        (or quantization parameters passed to quantize ops in the reference model) for the
+        input and the output.
+
+        There are two observation types:
+
+            `OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT` (default): the output observer instance
+            will be different from the input. This is the most common observation type.
+
+            `OUTPUT_SHARE_OBSERVER_WITH_INPUT`: the output observer instance will be the
+            same as the input. This is useful for operators like `cat`.
+
+        Note: This will be renamed in the near future, since we will soon insert QuantDeQuantStubs
+        with observers (and fake quantizes) attached instead of observers themselves.
+        """
+        self.observation_type = observation_type
+        return self
+
+    def add_dtype_config(self, dtype_config: DTypeConfig) -> BackendPatternConfig:
+        """
+        Add a set of supported data types passed as arguments to quantize ops in the
+        reference model spec.
+        """
+        self.dtype_configs.append(dtype_config)
+        return self
+
+    def set_dtype_configs(self, dtype_configs: List[DTypeConfig]) -> BackendPatternConfig:
+        """
+        Set the supported data types passed as arguments to quantize ops in the
+        reference model spec, overriding all previously registered data types.
+        """
+        self.dtype_configs = dtype_configs
+        return self
+
+    def set_root_module(self, root_module: Type[torch.nn.Module]) -> BackendPatternConfig:
+        """
+        Set the module that represents the root for this pattern.
+
+        When we construct the reference quantized model during the convert phase,
+        the root modules (e.g. torch.nn.Linear for torch.ao.nn.intrinsic.LinearReLU)
+        will be swapped to the corresponding reference quantized modules (e.g.
+        torch.ao.nn.quantized.reference.Linear). This allows custom backends to
+        specify custom reference quantized module implementations to match the
+        numerics of their lowered operators. Since this is a one-to-one mapping,
+        both the root module and the reference quantized module must be specified
+        in the same BackendPatternConfig in order for the conversion to take place.
+        """
+        self.root_module = root_module
+        return self
+
+    def set_qat_module(self, qat_module: Type[torch.nn.Module]) -> BackendPatternConfig:
+        """
+        Set the module that represents the QAT implementation for this pattern.
+        """
+        self.qat_module = qat_module
+        return self
+
+    def set_reference_quantized_module(self, reference_quantized_module: Type[torch.nn.Module]) -> BackendPatternConfig:
+        """
+        Set the module that represents the reference quantized implementation for
+        this pattern's root module.
+
+        For more detail, see :func:`~torch.ao.quantization.backend_config.BackendPatternConfig.set_root_module`.
+        """
+        self.reference_quantized_module = reference_quantized_module
+        return self
+
+    def set_fused_module(self, fused_module: Type[torch.nn.Module]) -> BackendPatternConfig:
+        """
+        Set the module that represents the fused implementation for this pattern.
+        """
+        self.fused_module = fused_module
+        return self
+
+    def set_fuser_method(self, fuser_method: Callable) -> BackendPatternConfig:
+        """
+        Set the function that specifies how to fuse this BackendPatternConfig's pattern.
+
+        The first argument of this function should be `is_qat`, and the rest of the arguments
+        should be the items in the tuple pattern. The return value of this function should be
+        the resulting fused module.
+
+        For example, the fuser method for the pattern `(torch.nn.Linear, torch.nn.ReLU)` can be:
+
+            def fuse_linear_relu(is_qat, linear, relu):
+                return torch.ao.nn.intrinsic.LinearReLU(linear, relu)
+
+        For a more complicated example, see https://gist.github.com/jerryzh168/8bea7180a8ba3c279f2c9b050f2a69a6.
+        """
+        self.fuser_method = fuser_method
+        return self
+
+    def _set_root_node_getter(self, root_node_getter: Callable) -> BackendPatternConfig:
+        self._root_node_getter = root_node_getter
+        return self
+
+    def _set_extra_inputs_getter(self, extra_inputs_getter: Callable) -> BackendPatternConfig:
+        self._extra_inputs_getter = extra_inputs_getter
+        return self
+
+    def _set_num_tensor_args_to_observation_type(
+            self, num_tensor_args_to_observation_type: Dict[int, ObservationType]) -> BackendPatternConfig:
+        self._num_tensor_args_to_observation_type = num_tensor_args_to_observation_type
+        return self
+
+    def _set_input_type_to_index(self, input_type_to_index: Dict[str, int]) -> BackendPatternConfig:
+        self._input_type_to_index = input_type_to_index
+        return self
+
+    def _set_pattern_complex_format(self, pattern: Pattern) -> BackendPatternConfig:
+        """
+        Set the pattern to configure, using the reversed nested tuple format.
+
+        See the BackendConfig README for more detail:
+        https://github.com/pytorch/pytorch/blob/master/torch/ao/quantization/backend_config/README.md#advanced-pattern-specification
+        """
+        if self.pattern is not None:
+            raise ValueError("Only one of 'pattern' or 'pattern_complex_format' can be set")
+        self._pattern_complex_format = pattern
+        return self
+
+    @classmethod
+    def from_dict(cls, backend_pattern_config_dict: Dict[str, Any]) -> BackendPatternConfig:
+        """
+        Create a ``BackendPatternConfig`` from a dictionary with the following items:
+
+            "pattern": the pattern being configured
+            "observation_type": the :class:`~torch.ao.quantization.backend_config.ObservationType` that specifies how
+            observers should be inserted for this pattern
+            "dtype_configs": a list of dictionaries that represents :class:`~torch.ao.quantization.backend_config.DTypeConfig` s
+            "root_module": a :class:`torch.nn.Module` that represents the root for this pattern
+            "qat_module": a :class:`torch.nn.Module` that represents the QAT implementation for this pattern
+            "reference_quantized_module": a :class:`torch.nn.Module` that represents the reference quantized
+            implementation for this pattern's root module.
+            "fused_module": a :class:`torch.nn.Module` that represents the fused implementation for this pattern
+            "fuser_method": a function that specifies how to fuse the pattern for this pattern
+            "pattern_complex_format": the pattern specified in the reversed nested tuple format (deprecated)
+
+        """
+        def _get_dtype_config(obj: Any) -> DTypeConfig:
+            """
+            Convert the given object into a ``DTypeConfig`` if possible, else throw an exception.
+            """
+            if isinstance(obj, DTypeConfig):
+                return obj
+            if isinstance(obj, Dict):
+                return DTypeConfig.from_dict(obj)
+            raise ValueError(
+                f"Expected a list of DTypeConfigs in "
+                f"backend_pattern_config_dict[\"{DTYPE_CONFIGS_DICT_KEY}\"], got '{type(obj)}'"
+            )
+
+        conf = cls()
+        if PATTERN_DICT_KEY in backend_pattern_config_dict:
+            conf.set_pattern(backend_pattern_config_dict[PATTERN_DICT_KEY])
+        if OBSERVATION_TYPE_DICT_KEY in backend_pattern_config_dict:
+            conf.set_observation_type(backend_pattern_config_dict[OBSERVATION_TYPE_DICT_KEY])
+        for d in backend_pattern_config_dict.get(DTYPE_CONFIGS_DICT_KEY, []):
+            conf.add_dtype_config(_get_dtype_config(d))
+        conf.set_root_module(backend_pattern_config_dict.get(ROOT_MODULE_DICT_KEY, None))
+        conf.set_qat_module(backend_pattern_config_dict.get(QAT_MODULE_DICT_KEY, None))
+        conf.set_reference_quantized_module(backend_pattern_config_dict.get(REFERENCE_QUANTIZED_MODULE_DICT_KEY, None))
+        conf.set_fused_module(backend_pattern_config_dict.get(FUSED_MODULE_DICT_KEY, None))
+        conf.set_fuser_method(backend_pattern_config_dict.get(FUSER_METHOD_DICT_KEY, None))
+        conf._set_root_node_getter(backend_pattern_config_dict.get(ROOT_NODE_GETTER_DICT_KEY, None))
+        conf._set_extra_inputs_getter(backend_pattern_config_dict.get(EXTRA_INPUTS_GETTER_DICT_KEY, None))
+        conf._set_num_tensor_args_to_observation_type(
+            backend_pattern_config_dict.get(NUM_TENSOR_ARGS_TO_OBSERVATION_TYPE_DICT_KEY, {}))
+        conf._set_input_type_to_index(backend_pattern_config_dict.get(INPUT_TYPE_TO_INDEX_DICT_KEY, {}))
+        if PATTERN_COMPLEX_FORMAT_DICT_KEY in backend_pattern_config_dict:
+            conf._set_pattern_complex_format(backend_pattern_config_dict[PATTERN_COMPLEX_FORMAT_DICT_KEY])
+        return conf
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Convert this ``BackendPatternConfig`` to a dictionary with the items described in
+        :func:`~torch.ao.quantization.backend_config.BackendPatternConfig.from_dict`.
+        """
+        backend_pattern_config_dict: Dict[str, Any] = {
+            OBSERVATION_TYPE_DICT_KEY: self.observation_type,
+            DTYPE_CONFIGS_DICT_KEY: [c.to_dict() for c in self.dtype_configs],
+        }
+        if self.pattern is not None:
+            backend_pattern_config_dict[PATTERN_DICT_KEY] = self.pattern
+        if self.root_module is not None:
+            backend_pattern_config_dict[ROOT_MODULE_DICT_KEY] = self.root_module
+        if self.qat_module is not None:
+            backend_pattern_config_dict[QAT_MODULE_DICT_KEY] = self.qat_module
+        if self.reference_quantized_module is not None:
+            backend_pattern_config_dict[REFERENCE_QUANTIZED_MODULE_DICT_KEY] = self.reference_quantized_module
+        if self.fused_module is not None:
+            backend_pattern_config_dict[FUSED_MODULE_DICT_KEY] = self.fused_module
+        if self.fuser_method is not None:
+            backend_pattern_config_dict[FUSER_METHOD_DICT_KEY] = self.fuser_method
+        if self._root_node_getter is not None:
+            backend_pattern_config_dict[ROOT_NODE_GETTER_DICT_KEY] = self._root_node_getter
+        if self._extra_inputs_getter is not None:
+            backend_pattern_config_dict[EXTRA_INPUTS_GETTER_DICT_KEY] = self._extra_inputs_getter
+        if len(self._num_tensor_args_to_observation_type) > 0:
+            backend_pattern_config_dict[NUM_TENSOR_ARGS_TO_OBSERVATION_TYPE_DICT_KEY] = self._num_tensor_args_to_observation_type
+        if len(self._input_type_to_index) > 0:
+            backend_pattern_config_dict[INPUT_TYPE_TO_INDEX_DICT_KEY] = self._input_type_to_index
+        if self._pattern_complex_format is not None:
+            backend_pattern_config_dict[PATTERN_COMPLEX_FORMAT_DICT_KEY] = self._pattern_complex_format
+        return backend_pattern_config_dict
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/executorch.py b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/executorch.py
new file mode 100644
index 0000000000000000000000000000000000000000..871b969de4e2b14c209ebe96315cd17cd628045e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/executorch.py
@@ -0,0 +1,494 @@
+# TODO: rename executorch to qnnpack_executorch since executorch is a general runtime
+# not a specific backend
+
+import operator
+from typing import List
+
+import torch
+import torch.ao.nn.qat as nnqat
+import torch.ao.nn.quantized.reference as nnqr
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..fuser_method_mappings import (
+    _sequential_wrapper2,
+    fuse_conv_bn,
+    fuse_conv_bn_relu,
+)
+from ._common_operator_config_utils import _Conv2dMetadata
+from .backend_config import (
+    BackendConfig,
+    BackendPatternConfig,
+    DTypeConfig,
+    DTypeWithConstraints,
+    ObservationType,
+)
+from .qnnpack import (
+    qnnpack_default_op_qint8_symmetric_dtype_config,
+    qnnpack_weighted_op_qint8_symmetric_dtype_config,
+)
+
+
+__all__ = [
+    "get_executorch_backend_config",
+]
+
+
+# ===================
+# |  DTYPE CONFIGS  |
+# ===================
+
+executorch_weighted_op_int8_dtype_config = DTypeConfig(
+    input_dtype=torch.quint8,
+    output_dtype=torch.quint8,
+    weight_dtype=torch.qint8,
+    bias_dtype=torch.float,
+)
+
+executorch_default_op_quint8_dtype_config = DTypeConfig(
+    input_dtype=torch.quint8,
+    output_dtype=torch.quint8,
+)
+
+executorch_default_dynamic_quint8_dtype_config = DTypeConfig(
+    input_dtype=torch.quint8,
+    output_dtype=torch.float,
+    weight_dtype=torch.qint8,
+    bias_dtype=torch.float,
+    is_dynamic=True,
+)
+
+executorch_act_qint8_scale_min_2_neg_12 = DTypeWithConstraints(
+    dtype=torch.qint8,
+    scale_min_lower_bound=2**-12,
+)
+
+executorch_weight_qint8_neg_127_to_127_scale_min_2_neg_12 = DTypeWithConstraints(
+    dtype=torch.qint8,
+    quant_min_lower_bound=-127,
+    quant_max_upper_bound=127,
+    scale_min_lower_bound=2**-12,
+)
+
+executorch_default_dynamic_qint8_dtype_config = DTypeConfig(
+    input_dtype=executorch_act_qint8_scale_min_2_neg_12,
+    output_dtype=torch.float,
+    weight_dtype=executorch_weight_qint8_neg_127_to_127_scale_min_2_neg_12,
+    bias_dtype=torch.float,
+    is_dynamic=True,
+)
+
+executorch_default_dynamic_float16_dtype_config = DTypeConfig(
+    input_dtype=torch.float16,
+    output_dtype=torch.float,
+    weight_dtype=torch.float16,
+    bias_dtype=torch.float,
+    is_dynamic=True,
+)
+
+executorch_weight_only_quint8_dtype_config = DTypeConfig(
+    input_dtype=torch.float,
+    output_dtype=torch.float,
+    weight_dtype=torch.quint8,
+)
+
+
+# =============================
+# |  BACKEND PATTERN CONFIGS  |
+# =============================
+
+
+def _get_linear_configs() -> List[BackendPatternConfig]:
+    """
+    Return all configs related to linear modules and ops.
+    """
+    observation_type = ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT
+    dtype_configs = [
+        qnnpack_weighted_op_qint8_symmetric_dtype_config,
+        executorch_weighted_op_int8_dtype_config,
+        executorch_default_dynamic_quint8_dtype_config,
+        executorch_default_dynamic_qint8_dtype_config,
+        executorch_default_dynamic_float16_dtype_config,
+    ]
+    linear_configs: List[BackendPatternConfig] = []
+    # linear module
+    linear_configs.append(
+        BackendPatternConfig(torch.nn.Linear)
+        .set_observation_type(observation_type)  # noqa: E131
+        .set_dtype_configs(dtype_configs)
+        .set_root_module(torch.nn.Linear)
+        .set_reference_quantized_module(nnqr.Linear)
+        .set_qat_module(nnqat.Linear)
+    )
+    # linear qat module
+    linear_configs.append(
+        BackendPatternConfig(nnqat.Linear)
+        .set_observation_type(observation_type)  # noqa: E131
+        .set_dtype_configs(dtype_configs)
+        .set_root_module(torch.nn.Linear)
+        .set_reference_quantized_module(nnqr.Linear)
+    )
+    # functional linear
+    linear_configs.append(
+        BackendPatternConfig(torch.nn.functional.linear)
+        .set_observation_type(observation_type)  # noqa: E131
+        .set_dtype_configs(dtype_configs)
+        ._set_input_type_to_index({"weight": 1, "bias": 2})
+    )
+    return linear_configs
+
+
+def _get_conv_configs() -> List[BackendPatternConfig]:
+    """
+    Return all configs related to conv modules and ops.
+    """
+    observation_type = ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT
+    dtype_configs = [
+        qnnpack_weighted_op_qint8_symmetric_dtype_config,
+        executorch_weighted_op_int8_dtype_config,
+    ]
+    conv_configs = []
+    for convs in [_Conv2dMetadata]:
+        # (1) Single conv modules/functions
+        # -----------------------------------
+        # conv module
+        conv_configs.append(
+            BackendPatternConfig(convs.root)
+            .set_observation_type(observation_type)  # noqa: E131
+            .set_dtype_configs(dtype_configs)
+            .set_root_module(convs.root)
+            .set_reference_quantized_module(convs.reference)
+            .set_qat_module(convs.qat)
+        )
+        # conv qat module
+        conv_configs.append(
+            BackendPatternConfig(convs.qat)
+            .set_observation_type(observation_type)  # noqa: E131
+            .set_dtype_configs(dtype_configs)
+            .set_root_module(convs.root)
+            .set_reference_quantized_module(convs.reference)
+        )
+        # functional conv
+        conv_configs.append(
+            BackendPatternConfig(convs.func)
+            .set_observation_type(observation_type)  # noqa: E131
+            .set_dtype_configs(dtype_configs)
+            ._set_input_type_to_index({"weight": 1, "bias": 2})
+        )
+
+        # (2) Conv + relu
+        # -----------------------------------
+        # conv module + relu module
+        conv_configs.append(
+            BackendPatternConfig((convs.root, nn.ReLU))
+            .set_dtype_configs(dtype_configs)  # noqa: E131
+            .set_fuser_method(_sequential_wrapper2(convs.fused_conv_relu))
+            .set_fused_module(convs.fused_conv_relu)
+        )
+        # conv module + functional relu
+        conv_configs.append(
+            BackendPatternConfig((convs.root, F.relu))
+            .set_dtype_configs(dtype_configs)  # noqa: E131
+            .set_fuser_method(_sequential_wrapper2(convs.fused_conv_relu))
+            .set_fused_module(convs.fused_conv_relu)
+        )
+        # fused conv relu module
+        conv_configs.append(
+            BackendPatternConfig(convs.fused_conv_relu)
+            .set_observation_type(observation_type)  # noqa: E131
+            .set_dtype_configs(dtype_configs)
+            .set_root_module(convs.root)
+            .set_reference_quantized_module(convs.reference)
+            .set_qat_module(convs.relu_qat)
+        )
+        # conv relu, qat fused module
+        conv_configs.append(
+            BackendPatternConfig(convs.relu_qat)
+            .set_observation_type(observation_type)  # noqa: E131
+            .set_dtype_configs(dtype_configs)
+            .set_root_module(convs.root)
+            .set_reference_quantized_module(convs.reference)
+        )
+        # functional conv + relu module
+        conv_configs.append(
+            BackendPatternConfig((convs.func, nn.ReLU))
+            .set_observation_type(observation_type)  # noqa: E131
+            .set_dtype_configs(dtype_configs)
+        )
+        # functional conv + functional relu
+        conv_configs.append(
+            BackendPatternConfig((convs.func, F.relu))
+            .set_observation_type(observation_type)  # noqa: E131
+            .set_dtype_configs(dtype_configs)
+        )
+        # fused conv relu
+        conv_configs.append(
+            BackendPatternConfig(convs.fused_conv_relu)
+            .set_dtype_configs(dtype_configs)  # noqa: E131
+            .set_qat_module(convs.relu_qat)
+        )
+
+        conv_configs.append(
+            BackendPatternConfig(convs.relu_qat)
+            .set_dtype_configs(dtype_configs)  # noqa: E131
+            .set_root_module(convs.root)
+            .set_reference_quantized_module(convs.reference)
+        )
+
+        # (3) Conv + batchnorm (+ relu)
+        # -------------------------------
+        # conv + batchnorm (+ relu)
+        conv_configs.append(
+            BackendPatternConfig((convs.root, convs.bn))
+            .set_dtype_configs(dtype_configs)  # noqa: E131
+            .set_fuser_method(fuse_conv_bn)
+            .set_fused_module(convs.fused_conv_bn)
+        )
+        # conv + bn + relu module fusion
+        conv_configs.append(
+            BackendPatternConfig((convs.root, convs.bn, nn.ReLU))
+            .set_dtype_configs(dtype_configs)  # noqa: E131
+            .set_fuser_method(fuse_conv_bn_relu)
+            .set_fused_module(convs.fused_conv_bn_relu)
+        )
+        # conv + bn + relu functional fusion
+        conv_configs.append(
+            BackendPatternConfig((convs.root, convs.bn, F.relu))
+            .set_dtype_configs(dtype_configs)  # noqa: E131
+            .set_root_module(convs.root)
+            .set_fuser_method(fuse_conv_bn_relu)
+            .set_fused_module(convs.fused_conv_bn_relu)
+        )
+        # TODO: we can add fusion for torch.relu as well
+        # 3.2 conv + bn (+ relu) fused module configs
+        # fused conv bn
+        conv_configs.append(
+            BackendPatternConfig(convs.fused_conv_bn)
+            .set_dtype_configs(dtype_configs)  # noqa: E131
+            .set_qat_module(convs.bn_qat)
+        )
+
+        # fused conv bn relu
+        conv_configs.append(
+            BackendPatternConfig(convs.fused_conv_bn_relu)
+            .set_dtype_configs(dtype_configs)  # noqa: E131
+            .set_qat_module(convs.bn_relu_qat)
+        )
+
+        # conv bn, qat fused module
+        conv_configs.append(
+            BackendPatternConfig(convs.bn_qat)
+            .set_observation_type(observation_type)  # noqa: E131
+            .set_dtype_configs(dtype_configs)
+            .set_root_module(convs.root)
+            .set_reference_quantized_module(convs.reference)
+        )
+        # conv bn relu, qat fused module
+        conv_configs.append(
+            BackendPatternConfig(convs.bn_relu_qat)
+            .set_observation_type(observation_type)  # noqa: E131
+            .set_dtype_configs(dtype_configs)
+            .set_root_module(convs.root)
+            .set_reference_quantized_module(convs.reference)
+        )
+    return conv_configs
+
+
+def _get_binary_ops_configs() -> List[BackendPatternConfig]:
+    """
+    Return all configs related to binary ops.
+    """
+    dtype_configs = [
+        qnnpack_default_op_qint8_symmetric_dtype_config,
+        executorch_weighted_op_int8_dtype_config,
+    ]
+    num_tensor_args_to_observation_type_mapping = {
+        # TODO: this is not used right now since we have an extra check in prepare;
+        # we will need to change this to NO_OBSERVER later, after we have
+        # implemented Tensor dtype inference properly
+        0: ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT,
+        1: ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT,
+        2: ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT,
+    }
+    binary_op_configs: List[BackendPatternConfig] = []
+    for op in [operator.add, torch.add, operator.sub, torch.sub, operator.mul, torch.mul]:
+        bop_patterns = [
+            (op, torch.nn.ReLU),
+            (op, torch.nn.functional.relu),
+            (op, torch.relu),
+            op
+        ]
+        for bop_pattern in bop_patterns:
+            binary_op_configs.append(
+                BackendPatternConfig(bop_pattern)
+                .set_dtype_configs(dtype_configs)  # noqa: E131
+                ._set_num_tensor_args_to_observation_type(
+                    num_tensor_args_to_observation_type_mapping
+                )
+            )
+    return binary_op_configs
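+
+
+# Illustrative note on the mapping above (an interpretation, not part of the
+# upstream source): the number of Tensor arguments at a binary op call site
+# decides how the output is observed, e.g.
+#
+#     z = x + 1.0   # one Tensor argument  -> output shares the input's observer
+#     z = x + y     # two Tensor arguments -> output gets its own observer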
+
+
+def _get_share_qparams_ops_configs() -> List[BackendPatternConfig]:
+    """
+    Return the operator configs for operators that work on both float and quantized
+    input. If the input is quantized, the output Tensor shares the same quantization
+    parameters with the input.
+
+    Example operators: avgpool2d, reshape, transpose, maxpool2d
+    Example observed operator:
+    observer_0 - avgpool2d - observer_0 (same observer instance as input)
+    """
+    observation_type = ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT
+    dtype_configs = [
+        qnnpack_default_op_qint8_symmetric_dtype_config,
+        executorch_default_op_quint8_dtype_config,
+    ]
+    share_qparams_ops = [
+        torch.nn.Flatten,
+        F.adaptive_avg_pool2d,
+        F.elu,
+        F.hardtanh,
+        F.max_pool2d,
+        F.pad,
+        F.relu,
+        F.relu6,
+        F.leaky_relu,
+        F.leaky_relu_,
+        torch.nn.AdaptiveAvgPool2d,
+        torch.nn.ConstantPad2d,
+        torch.nn.ELU,
+        torch.nn.MaxPool2d,
+        torch.nn.ReLU6,
+        torch.nn.Hardtanh,
+        torch.nn.LeakyReLU,
+        torch.clamp,
+        torch.flatten,
+        torch.mean,
+        torch.permute,
+        torch.permute_copy,
+        torch.squeeze,
+        "clamp",
+        "mean",
+        "permute",
+        "reshape",
+        "relu",
+        "relu_",
+        "squeeze",
+        "squeeze_",
+        "leaky_relu",
+    ]
+    share_qparams_op_configs: List[BackendPatternConfig] = []
+    for op in share_qparams_ops:
+        share_qparams_op_configs.append(
+            BackendPatternConfig(op)
+            .set_observation_type(observation_type)  # noqa: E131
+            .set_dtype_configs(dtype_configs)
+        )
+    return share_qparams_op_configs
+
+
+def _get_bn_configs() -> List[BackendPatternConfig]:
+    """
+    Return all configs related to batchnorm.
+    """
+    observation_type = ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT
+    dtype_configs = [
+        qnnpack_default_op_qint8_symmetric_dtype_config,
+        executorch_default_op_quint8_dtype_config,
+    ]
+    bn_configs = []
+    bn_configs.append(
+        BackendPatternConfig(nn.BatchNorm2d)
+        .set_observation_type(observation_type)  # noqa: E131
+        .set_dtype_configs(dtype_configs)
+    )
+    return bn_configs
+
+
+def _get_cat_configs() -> List[BackendPatternConfig]:
+    dtype_configs = [
+        qnnpack_default_op_qint8_symmetric_dtype_config,
+        executorch_default_op_quint8_dtype_config,
+    ]
+    cat_configs = []
+    cat_configs.append(
+        BackendPatternConfig(torch.cat)
+        .set_observation_type(ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT)
+        .set_dtype_configs(dtype_configs)
+    )
+    cat_configs.append(
+        BackendPatternConfig(torch.concat)
+        .set_observation_type(ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT)
+        .set_dtype_configs(dtype_configs)
+    )
+    cat_configs.append(
+        BackendPatternConfig(torch.concatenate)
+        .set_observation_type(ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT)
+        .set_dtype_configs(dtype_configs)
+    )
+    return cat_configs
+
+
+def _get_embedding_op_configs() -> List[BackendPatternConfig]:
+    dtype_configs = [
+        executorch_weight_only_quint8_dtype_config,
+    ]
+    embedding_op_configs = []
+    for embedding_op, qat_embedding_op, ref_embedding_op in [
+        (nn.Embedding, nnqat.Embedding, nnqr.Embedding),
+        (nn.EmbeddingBag, nnqat.EmbeddingBag, nnqr.EmbeddingBag),
+    ]:
+        embedding_op_configs.append(
+            BackendPatternConfig(embedding_op)
+            .set_observation_type(
+                ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT
+            )  # noqa: E131
+            .set_dtype_configs(dtype_configs)
+            .set_qat_module(qat_embedding_op)
+            .set_root_module(embedding_op)
+            .set_reference_quantized_module(ref_embedding_op)
+        )
+        # config for qat op
+        embedding_op_configs.append(
+            BackendPatternConfig(qat_embedding_op)
+            .set_observation_type(
+                ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT
+            )  # noqa: E131
+            .set_dtype_configs(dtype_configs)
+            .set_root_module(embedding_op)
+            .set_reference_quantized_module(ref_embedding_op)
+        )
+
+        # config for functional embedding
+        embedding_op_configs.append(
+            BackendPatternConfig(torch.nn.functional.embedding)
+            .set_observation_type(
+                ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT
+            )  # noqa: E131
+            .set_dtype_configs(dtype_configs)
+            ._set_input_type_to_index({"weight": 1})
+        )
+    return embedding_op_configs
+
+
+
+# =====================
+# |  BACKEND CONFIGS  |
+# =====================
+
+
+def get_executorch_backend_config() -> BackendConfig:
+    """
+    Return the `BackendConfig` for backends PyTorch lowers to through the Executorch stack.
+    """
+    return (
+        BackendConfig("executorch")
+        .set_backend_pattern_configs(_get_linear_configs())
+        .set_backend_pattern_configs(_get_conv_configs())
+        .set_backend_pattern_configs(_get_binary_ops_configs())
+        .set_backend_pattern_configs(_get_share_qparams_ops_configs())
+        .set_backend_pattern_configs(_get_bn_configs())
+        .set_backend_pattern_configs(_get_cat_configs())
+        .set_backend_pattern_configs(_get_embedding_op_configs())
+    )
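+
+
+# A minimal usage sketch (an addition for illustration, not part of the upstream
+# source), assuming the standard FX graph mode quantization flow; `float_model`
+# and `example_inputs` are hypothetical placeholders:
+#
+#     from torch.ao.quantization import get_default_qconfig_mapping
+#     from torch.ao.quantization.quantize_fx import prepare_fx, convert_fx
+#
+#     backend_config = get_executorch_backend_config()
+#     qconfig_mapping = get_default_qconfig_mapping("qnnpack")
+#     prepared = prepare_fx(float_model, qconfig_mapping, example_inputs,
+#                           backend_config=backend_config)
+#     quantized = convert_fx(prepared, backend_config=backend_config)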
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/fbgemm.py b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/fbgemm.py
new file mode 100644
index 0000000000000000000000000000000000000000..fca2d267ee1597ed2ef360611cdaaf1735313587
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/fbgemm.py
@@ -0,0 +1,116 @@
+import torch
+from ._common_operator_config_utils import (
+    _get_binary_op_configs,
+    _get_bn_configs,
+    _get_cat_config,
+    _get_conv_configs,
+    _get_default_op_configs,
+    _get_embedding_op_configs,
+    _get_fixed_qparams_op_configs,
+    _get_linear_configs,
+    _get_rnn_op_configs,
+    _get_share_qparams_op_configs,
+    _get_tensor_info_op_configs,
+)
+from .backend_config import BackendConfig, DTypeConfig
+
+__all__ = [
+    "get_fbgemm_backend_config",
+]
+
+# ===================
+# |  DTYPE CONFIGS  |
+# ===================
+
+# TODO: For now, these DTypeConfigs are identical to the ones defined in native.py
+# In the future, once we support specifying quant_min/quant_max and scale_min/scale_max,
+# these will diverge. In particular, for FBGEMM, we will restrict the activation quantized
+# values to within [0, 127].
+
+fbgemm_weighted_op_quint8_dtype_config = DTypeConfig(
+    input_dtype=torch.quint8,
+    output_dtype=torch.quint8,
+    weight_dtype=torch.qint8,
+    bias_dtype=torch.float,
+)
+
+fbgemm_default_op_quint8_dtype_config = DTypeConfig(
+    input_dtype=torch.quint8,
+    output_dtype=torch.quint8,
+)
+
+fbgemm_default_op_fp16_dtype_config = DTypeConfig(
+    input_dtype=torch.float16,
+    output_dtype=torch.float16,
+    weight_dtype=torch.float16,
+    bias_dtype=torch.float16,
+)
+
+fbgemm_default_dynamic_int8_dtype_config = DTypeConfig(
+    input_dtype=torch.quint8,
+    output_dtype=torch.float,
+    weight_dtype=torch.qint8,
+    bias_dtype=torch.float,
+    is_dynamic=True,
+)
+
+fbgemm_default_dynamic_float16_dtype_config = DTypeConfig(
+    input_dtype=torch.float16,
+    output_dtype=torch.float,
+    weight_dtype=torch.float16,
+    bias_dtype=torch.float,
+    is_dynamic=True,
+)
+
+fbgemm_weight_only_quint8_dtype_config = DTypeConfig(
+    input_dtype=torch.float,
+    output_dtype=torch.float,
+    weight_dtype=torch.quint8,
+)
+
+fbgemm_weight_only_quint4x2_dtype_config = DTypeConfig(
+    input_dtype=torch.float,
+    output_dtype=torch.float,
+    weight_dtype=torch.quint4x2,
+)
+
+
+# =====================
+# |  BACKEND CONFIGS  |
+# =====================
+
+def get_fbgemm_backend_config() -> BackendConfig:
+    """
+    Return the `BackendConfig` for PyTorch's native FBGEMM backend.
+    """
+    conv_dtype_configs = [fbgemm_weighted_op_quint8_dtype_config]
+    linear_dtype_configs = [
+        fbgemm_weighted_op_quint8_dtype_config,
+        fbgemm_default_dynamic_int8_dtype_config,
+        fbgemm_default_dynamic_float16_dtype_config,
+    ]
+    binary_op_dtype_configs = [fbgemm_default_op_quint8_dtype_config]
+    default_op_dtype_configs = [fbgemm_default_op_quint8_dtype_config]
+    fixed_qparams_op_dtype_configs = [fbgemm_default_op_quint8_dtype_config]
+    share_qparams_op_dtype_configs = [fbgemm_default_op_quint8_dtype_config]
+    tensor_info_op_dtype_configs = [fbgemm_default_op_quint8_dtype_config]
+    rnn_op_dtype_configs = [
+        fbgemm_default_dynamic_int8_dtype_config,
+        fbgemm_default_dynamic_float16_dtype_config,
+    ]
+    embedding_op_dtype_configs = [
+        fbgemm_weight_only_quint8_dtype_config,
+        fbgemm_weight_only_quint4x2_dtype_config,
+    ]
+    return BackendConfig("fbgemm") \
+        .set_backend_pattern_configs(_get_conv_configs(conv_dtype_configs)) \
+        .set_backend_pattern_configs(_get_linear_configs(linear_dtype_configs)) \
+        .set_backend_pattern_configs(_get_binary_op_configs(binary_op_dtype_configs)) \
+        .set_backend_pattern_config(_get_cat_config(default_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_default_op_configs(default_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_fixed_qparams_op_configs(fixed_qparams_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_share_qparams_op_configs(share_qparams_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_tensor_info_op_configs(tensor_info_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_bn_configs(default_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_rnn_op_configs(rnn_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_embedding_op_configs(embedding_op_dtype_configs))
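+
+
+# A small inspection sketch (an illustration, not part of the upstream source),
+# assuming only the public BackendConfig accessors are used; it lists the patterns
+# this backend config registers and how their outputs are observed:
+#
+#     backend_config = get_fbgemm_backend_config()
+#     for pattern_config in backend_config.configs:
+#         print(pattern_config.pattern, pattern_config.observation_type)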
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/native.py b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/native.py
new file mode 100644
index 0000000000000000000000000000000000000000..5425e5173fd7711c00e52a5f56197b9460d2b466
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/native.py
@@ -0,0 +1,204 @@
+import torch
+from ._common_operator_config_utils import (
+    _get_binary_op_configs,
+    _get_bn_configs,
+    _get_cat_config,
+    _get_conv_configs,
+    _get_default_op_configs,
+    _get_embedding_op_configs,
+    _get_fixed_qparams_op_configs,
+    _get_linear_configs,
+    _get_ln_configs,
+    _get_rnn_op_configs,
+    _get_share_qparams_op_configs,
+    _get_tensor_info_op_configs,
+)
+from .backend_config import BackendConfig, DTypeConfig
+
+__all__ = [
+    "get_test_only_legacy_native_backend_config",
+    "default_op_quint8_dtype_config",
+    "default_op_fp16_dtype_config",
+    "default_dynamic_int8_dtype_config",
+    "default_dynamic_float16_dtype_config",
+    "input_output_only_quint8_dtype_config",
+    "weight_only_quint8_dtype_config",
+    "weight_only_quint4x2_dtype_config",
+    "get_native_backend_config",
+    "get_native_backend_config_dict",
+    "get_test_only_legacy_native_backend_config_dict",
+]
+
+# ===================
+# |  DTYPE CONFIGS  |
+# ===================
+
+# weighted op int8 dtype config
+# this is the config for ops that have quantized weights, such as linear and conv
+weighted_op_quint8_dtype_config = DTypeConfig(
+    input_dtype=torch.quint8,
+    output_dtype=torch.quint8,
+    weight_dtype=torch.qint8,
+    bias_dtype=torch.float,
+)
+
+default_op_quint8_dtype_config = DTypeConfig(
+    input_dtype=torch.quint8,
+    output_dtype=torch.quint8,
+)
+
+default_op_fp16_dtype_config = DTypeConfig(
+    input_dtype=torch.float16,
+    output_dtype=torch.float16,
+    weight_dtype=torch.float16,
+    bias_dtype=torch.float16,
+)
+
+default_dynamic_int8_dtype_config = DTypeConfig(
+    input_dtype=torch.quint8,
+    output_dtype=torch.float,
+    weight_dtype=torch.qint8,
+    bias_dtype=torch.float,
+    # currently the dtype check is not yet enabled, so we provide the dtype_configs
+    # here but they are not really used yet;
+    # we will enable the check after everything has moved to backend_config_dict
+    is_dynamic=True,
+)
+
+default_dynamic_float16_dtype_config = DTypeConfig(
+    input_dtype=torch.float16,
+    output_dtype=torch.float,
+    weight_dtype=torch.float16,
+    bias_dtype=torch.float,
+    # currently the dtype check is not yet enabled, so we provide the dtype_configs
+    # here but they are not really used yet;
+    # we will enable the check after everything has moved to backend_config_dict
+    is_dynamic=True,
+)
+
+# Needed for LayerNorm and F.layer_norm, since currently the kernel only supports float weights
+input_output_only_quint8_dtype_config = DTypeConfig(
+    input_dtype=torch.quint8,
+    output_dtype=torch.quint8,
+    weight_dtype=torch.float,
+    bias_dtype=torch.float,
+)
+
+weight_only_quint8_dtype_config = DTypeConfig(
+    input_dtype=torch.float,
+    output_dtype=torch.float,
+    weight_dtype=torch.quint8,
+)
+
+weight_only_quint4x2_dtype_config = DTypeConfig(
+    input_dtype=torch.float,
+    output_dtype=torch.float,
+    weight_dtype=torch.quint4x2,
+)
+
+
+# =====================
+# |  BACKEND CONFIGS  |
+# =====================
+
+def get_test_only_legacy_native_backend_config() -> BackendConfig:
+    """
+    Return the `BackendConfig` for PyTorch Native backend (fbgemm/qnnpack) with various additional fp16 ops.
+    """
+    conv_dtype_configs = [weighted_op_quint8_dtype_config]
+    linear_dtype_configs = [
+        weighted_op_quint8_dtype_config,
+        default_dynamic_int8_dtype_config,
+        default_dynamic_float16_dtype_config,
+        default_op_fp16_dtype_config,
+    ]
+    binary_op_dtype_configs = [
+        default_op_quint8_dtype_config,
+        default_op_fp16_dtype_config,
+    ]
+    default_op_dtype_configs = [default_op_quint8_dtype_config]
+    fixed_qparams_op_dtype_configs = [
+        default_op_quint8_dtype_config,
+        default_op_fp16_dtype_config,
+    ]
+    share_qparams_op_dtype_configs = [
+        default_op_quint8_dtype_config,
+        default_op_fp16_dtype_config
+    ]
+    tensor_info_op_dtype_configs = [
+        default_op_quint8_dtype_config,
+    ]
+    rnn_op_dtype_configs = [
+        default_dynamic_int8_dtype_config,
+        default_dynamic_float16_dtype_config,
+    ]
+    embedding_op_dtype_configs = [
+        weight_only_quint8_dtype_config,
+        weight_only_quint4x2_dtype_config,
+    ]
+    layer_norm_op_dtype_configs = [input_output_only_quint8_dtype_config]
+    return BackendConfig("_native_and_fp16") \
+        .set_backend_pattern_configs(_get_conv_configs(conv_dtype_configs)) \
+        .set_backend_pattern_configs(_get_linear_configs(linear_dtype_configs)) \
+        .set_backend_pattern_configs(_get_binary_op_configs(binary_op_dtype_configs)) \
+        .set_backend_pattern_config(_get_cat_config(default_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_default_op_configs(default_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_fixed_qparams_op_configs(fixed_qparams_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_share_qparams_op_configs(share_qparams_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_tensor_info_op_configs(tensor_info_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_bn_configs(default_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_ln_configs(layer_norm_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_rnn_op_configs(rnn_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_embedding_op_configs(embedding_op_dtype_configs))
+
+def get_native_backend_config() -> BackendConfig:
+    """
+    Return the `BackendConfig` for PyTorch Native backend (fbgemm/qnnpack).
+    """
+    # TODO: express this BackendConfig as a union of the FBGEMM and QNNPACK BackendConfigs
+    conv_dtype_configs = [weighted_op_quint8_dtype_config]
+    linear_dtype_configs = [
+        weighted_op_quint8_dtype_config,
+        default_dynamic_int8_dtype_config,
+        default_dynamic_float16_dtype_config,
+    ]
+    binary_op_dtype_configs = [default_op_quint8_dtype_config]
+    default_op_dtype_configs = [default_op_quint8_dtype_config]
+    fixed_qparams_op_dtype_configs = [default_op_quint8_dtype_config]
+    share_qparams_op_dtype_configs = [default_op_quint8_dtype_config]
+    tensor_info_op_dtype_configs = [default_op_quint8_dtype_config]
+    rnn_op_dtype_configs = [
+        default_dynamic_int8_dtype_config,
+        default_dynamic_float16_dtype_config,
+    ]
+    embedding_op_dtype_configs = [
+        weight_only_quint8_dtype_config,
+        weight_only_quint4x2_dtype_config,
+    ]
+    layer_norm_op_dtype_configs = [input_output_only_quint8_dtype_config]
+    return BackendConfig("native") \
+        .set_backend_pattern_configs(_get_conv_configs(conv_dtype_configs)) \
+        .set_backend_pattern_configs(_get_linear_configs(linear_dtype_configs)) \
+        .set_backend_pattern_configs(_get_binary_op_configs(binary_op_dtype_configs)) \
+        .set_backend_pattern_config(_get_cat_config(default_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_default_op_configs(default_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_fixed_qparams_op_configs(fixed_qparams_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_share_qparams_op_configs(share_qparams_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_tensor_info_op_configs(tensor_info_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_bn_configs(default_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_ln_configs(layer_norm_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_rnn_op_configs(rnn_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_embedding_op_configs(embedding_op_dtype_configs))
+
+def get_native_backend_config_dict():
+    """
+    Return the `BackendConfig` for PyTorch Native backend (fbgemm/qnnpack) in dictionary form.
+    """
+    return get_native_backend_config().to_dict()
+
+def get_test_only_legacy_native_backend_config_dict():
+    """
+    Return the `BackendConfig` for PyTorch Native backend (fbgemm/qnnpack) with various additional
+    fp16 ops in dictionary form.
+    """
+    return get_test_only_legacy_native_backend_config().to_dict()
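+
+
+# A sketch of the dictionary form (an illustration, not part of the upstream
+# source), assuming the layout produced by BackendConfig.to_dict(), i.e. a "name"
+# entry plus a list of per-pattern config dicts:
+#
+#     native_dict = get_native_backend_config_dict()
+#     print(native_dict["name"])          # "native"
+#     print(len(native_dict["configs"]))  # number of registered pattern configs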
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/observation_type.py b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/observation_type.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/onednn.py b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/onednn.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d2f7ff42c3913c4eaee43a919251ad12fc20e0
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/onednn.py
@@ -0,0 +1,542 @@
+import torch
+import torch.nn as nn
+import torch.ao.nn.intrinsic as nni
+import torch.nn.functional as F
+import torch.ao.nn.quantized.reference as nnqr
+from ._common_operator_config_utils import (
+    _get_conv_configs,
+    _get_linear_configs,
+    _get_binary_op_configs,
+    _get_bn_configs,
+    _get_cat_config,
+    _get_default_op_configs,
+    _get_embedding_op_configs,
+    _get_fixed_qparams_op_configs,
+    _get_ln_configs,
+    _get_rnn_op_configs,
+    _get_share_qparams_op_configs,
+)
+from .backend_config import (
+    BackendPatternConfig,
+    BackendConfig,
+    DTypeConfig,
+    ObservationType,
+)
+from ..fuser_method_mappings import (
+    _sequential_wrapper2,
+)
+import operator
+from torch.ao.quantization.utils import MatchAllNode
+import itertools
+
+# ===================
+# |  DTYPE CONFIGS  |
+# ===================
+
+onednn_weighted_op_int8_dtype_config = DTypeConfig(
+    input_dtype=torch.quint8,
+    output_dtype=torch.quint8,
+    weight_dtype=torch.qint8,
+    bias_dtype=torch.float,
+)
+
+onednn_op_quint8_dtype_config = DTypeConfig(
+    input_dtype=torch.quint8,
+    output_dtype=torch.quint8,
+)
+
+onednn_dynamic_int8_dtype_config = DTypeConfig(
+    input_dtype=torch.quint8,
+    output_dtype=torch.float,
+    weight_dtype=torch.qint8,
+    bias_dtype=torch.float,
+    is_dynamic=True,
+)
+
+onednn_weight_only_qint8_dtype_config = DTypeConfig(
+    input_dtype=torch.float,
+    output_dtype=torch.float,
+    weight_dtype=torch.qint8,
+)
+
+onednn_input_output_only_quint8_dtype_config = DTypeConfig(
+    input_dtype=torch.quint8,
+    output_dtype=torch.quint8,
+    weight_dtype=torch.float,
+    bias_dtype=torch.float,
+)
+
+# ===================
+# |  FUSER METHODS  |
+# ===================
+
+def _fuse_linear_bn_leaky_relu(is_qat, linear, bn, leaky_relu):
+    r"""Given the linear, bn and leaky_relu modules, fuses them and returns the fused module
+    Args:
+        is_qat: a flag for whether we are using quantization aware training fusion
+                or post training quantization fusion
+        linear: Module instance of type Linear
+        bn: BatchNorm1d instance that needs to be fused with the linear layer
+        leaky_relu: LeakyReLU instance that needs to be fused with the linear layer
+    Examples::
+        >>> # xdoctest: +SKIP(failing)
+        >>> m1 = nn.Linear(20, 10)
+        >>> b1 = nn.BatchNorm1d(10)
+        >>> lr = nn.LeakyReLU(0.01)
+        >>> m2 = _fuse_linear_bn_leaky_relu(False, m1, b1, lr)
+    """
+    assert linear.training == bn.training and bn.training == leaky_relu.training, \
+        "Linear, BN and LeakyReLU all must be in the same mode (train or eval)."
+
+    if is_qat:
+        raise NotImplementedError(f"Cannot fuse train modules: {(linear, bn, leaky_relu)}")
+    else:
+        map_to_fused_module_eval = {
+            nn.Linear: nni.LinearLeakyReLU,
+        }
+        fused_module = map_to_fused_module_eval.get(type(linear), None)
+        if fused_module is not None:
+            fused_linear = nn.utils.fusion.fuse_linear_bn_eval(linear, bn)
+            fm = fused_module(fused_linear, leaky_relu)
+            return fm
+        else:
+            raise NotImplementedError(f"Cannot fuse eval modules: {(linear, bn, leaky_relu)}")
+
+# ======================
+# |  CONFIGS FOR CONV  |
+# ======================
+observation_type = ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT
+
+conv_dtype_configs = [onednn_weighted_op_int8_dtype_config]
+conv_configs = _get_conv_configs(conv_dtype_configs)
+
+# (1) Conv2d + Add
+
+# conv2d   Y
+#   \   /
+#    add
+
+# include:
+# conv2d conv2d
+#   \   /
+#    add
+
+def _fuse_conv_add_left(is_qat, add, conv, _):
+    return nni.ConvAdd2d(conv, add)
+
+def _conv_add_root_node_getter_left(pattern):
+    _, conv, _ = pattern
+    return conv
+
+def _conv_add_extra_inputs_getter_left(pattern):
+    """ get inputs pattern for extra inputs, inputs for root node
+    are assumed to be copied over from root node to the fused node
+    """
+    _, conv, extra_input = pattern
+    return [extra_input]
+
+# conv2d
+#  \
+#  bn   Y
+#   \   /
+#    add
+
+def _fuse_conv_bn_add_left(is_qat, add, bn_conv, _):
+    bn, conv = bn_conv
+    if is_qat:
+        raise NotImplementedError(f"Cannot fuse train modules: {(conv, bn, add)}")
+    else:
+        fused_conv = nn.utils.fusion.fuse_conv_bn_eval(conv, bn)
+        return nni.ConvAdd2d(fused_conv, add)
+
+def _conv_bn_add_root_node_getter_left(add_pattern):
+    _, bn_conv, _ = add_pattern
+    bn, conv = bn_conv
+    return conv
+
+def _conv_bn_add_extra_inputs_getter_left(add_pattern):
+    """ get inputs pattern for extra inputs, inputs for root node
+    are assumed to be copied over from root node to the fused node
+    """
+    _, bn_conv, extra_input = add_pattern
+    bn, conv = bn_conv
+    return [extra_input]
+
+conv_add_left_options = itertools.product(
+    [True, False],  # with_bn
+    [torch.add, operator.add],  # add_op
+)
+
+for with_bn, add_op in conv_add_left_options:
+    if with_bn:
+        conv_configs.append(
+            BackendPatternConfig()
+                ._set_pattern_complex_format((add_op, (nn.BatchNorm2d, nn.Conv2d), MatchAllNode))  # noqa: E131
+                .set_observation_type(observation_type)
+                .set_dtype_configs(conv_dtype_configs)
+                .set_fuser_method(_fuse_conv_bn_add_left)
+                ._set_root_node_getter(_conv_bn_add_root_node_getter_left)
+                ._set_extra_inputs_getter(_conv_bn_add_extra_inputs_getter_left)
+                .set_fused_module(nni.ConvAdd2d))
+    else:
+        conv_configs.append(
+            BackendPatternConfig()
+                ._set_pattern_complex_format((add_op, nn.Conv2d, MatchAllNode))  # noqa: E131
+                .set_observation_type(observation_type)
+                .set_dtype_configs(conv_dtype_configs)
+                .set_fuser_method(_fuse_conv_add_left)
+                ._set_root_node_getter(_conv_add_root_node_getter_left)
+                ._set_extra_inputs_getter(_conv_add_extra_inputs_getter_left)
+                .set_fused_module(nni.ConvAdd2d))
+
+#  Y   conv2d
+#   \   /
+#    add
+
+def _fuse_conv_add_right(is_qat, add, _, conv):
+    return nni.ConvAdd2d(conv, add)
+
+def _conv_add_root_node_getter_right(pattern):
+    add, _, conv = pattern
+    return conv
+
+def _conv_add_extra_inputs_getter_right(pattern):
+    """ get inputs pattern for extra inputs, inputs for root node
+    are assumed to be copied over from root node to the fused node
+    """
+    _, extra_input, conv = pattern
+    return [extra_input]
+
+#      conv2d
+#        /
+#  Y    bn
+#   \   /
+#    add
+
+def _fuse_conv_bn_add_right(is_qat, add, _, bn_conv):
+    bn, conv = bn_conv
+    if is_qat:
+        raise NotImplementedError(f"Cannot fuse train modules: {(conv, bn, add)}")
+    else:
+        fused_conv = nn.utils.fusion.fuse_conv_bn_eval(conv, bn)
+        return nni.ConvAdd2d(fused_conv, add)
+
+def _conv_bn_add_root_node_getter_right(pattern):
+    add, _, bn_conv = pattern
+    bn, conv = bn_conv
+    return conv
+
+def _conv_bn_add_extra_inputs_getter_right(pattern):
+    """ get inputs pattern for extra inputs, inputs for root node
+    are assumed to be copied over from root node to the fused node
+    """
+    _, extra_input, bn_conv = pattern
+    bn, conv = bn_conv
+    return [extra_input]
+
+conv_add_options = itertools.product(
+    [True, False],  # with_bn
+    [torch.add, operator.add],  # add_op
+)
+
+for with_bn, add_op in conv_add_options:
+    if with_bn:
+        conv_configs.append(
+            BackendPatternConfig()
+                ._set_pattern_complex_format((add_op, MatchAllNode, (nn.BatchNorm2d, nn.Conv2d)))  # noqa: E131
+                .set_observation_type(observation_type)
+                .set_dtype_configs(conv_dtype_configs)
+                .set_fuser_method(_fuse_conv_bn_add_right)
+                ._set_root_node_getter(_conv_bn_add_root_node_getter_right)
+                ._set_extra_inputs_getter(_conv_bn_add_extra_inputs_getter_right)
+                .set_fused_module(nni.ConvAdd2d))
+    else:
+        conv_configs.append(
+            BackendPatternConfig()
+                ._set_pattern_complex_format((add_op, MatchAllNode, nn.Conv2d))  # noqa: E131
+                .set_observation_type(observation_type)
+                .set_dtype_configs(conv_dtype_configs)
+                .set_fuser_method(_fuse_conv_add_right)
+                ._set_root_node_getter(_conv_add_root_node_getter_right)
+                ._set_extra_inputs_getter(_conv_add_extra_inputs_getter_right)
+                .set_fused_module(nni.ConvAdd2d))
+
+conv_configs.append(
+    BackendPatternConfig(nni.ConvAdd2d)
+        .set_observation_type(observation_type)  # noqa: E131
+        .set_dtype_configs(conv_dtype_configs)
+        .set_root_module(nn.Conv2d)
+        .set_reference_quantized_module(nnqr.Conv2d))
+
+# (2) Conv2d + Add + Relu
+
+# conv2d Y
+#   \   /
+#    add
+#     \
+#     relu
+
+def _fuse_conv_add_relu_left(is_qat, relu, add_pattern):
+    add, conv, _ = add_pattern
+    return nni.ConvAddReLU2d(conv, add, relu)
+
+def _conv_add_relu_root_node_getter_left(pattern):
+    relu, add_pattern = pattern
+    _, conv, _ = add_pattern
+    return conv
+
+def _conv_add_relu_extra_inputs_getter_left(pattern):
+    """ get inputs pattern for extra inputs, inputs for root node
+    are assumed to be copied over from root node to the fused node
+    """
+    relu, add_pattern = pattern
+    _, conv, extra_input = add_pattern
+    return [extra_input]
+
+# conv2d
+#  \
+#  bn   Y
+#   \   /
+#    add
+#     \
+#     relu
+
+def _fuse_conv_bn_add_relu_left(is_qat, relu, add_pattern):
+    add, bn_conv, _ = add_pattern
+    bn, conv = bn_conv
+    if is_qat:
+        raise NotImplementedError(f"Cannot fuse train modules: {(conv, bn, add, relu)}")
+    else:
+        fused_conv = nn.utils.fusion.fuse_conv_bn_eval(conv, bn)
+        return nni.ConvAddReLU2d(fused_conv, add, relu)
+
+def _conv_bn_add_relu_root_node_getter_left(pattern):
+    relu, add_pattern = pattern
+    _, bn_conv, _ = add_pattern
+    bn, conv = bn_conv
+    return conv
+
+def _conv_bn_add_relu_extra_inputs_getter_left(pattern):
+    """ get inputs pattern for extra inputs, inputs for root node
+    are assumed to be copied over from root node to the fused node
+    """
+    relu, add_pattern = pattern
+    _, bn_conv, extra_input = add_pattern
+    bn, conv = bn_conv
+    return [extra_input]
+
+conv_add_relu_left_options = itertools.product(
+    [True, False],  # with_bn
+    [torch.add, operator.add],  # add_op
+)
+
+for with_bn, add_op in conv_add_relu_left_options:
+    if with_bn:
+        conv_configs.append(
+            BackendPatternConfig()
+                ._set_pattern_complex_format((nn.ReLU, (add_op, (nn.BatchNorm2d, nn.Conv2d), MatchAllNode)))  # noqa: E131
+                .set_observation_type(observation_type)
+                .set_dtype_configs(conv_dtype_configs)
+                .set_fuser_method(_fuse_conv_bn_add_relu_left)
+                ._set_root_node_getter(_conv_bn_add_relu_root_node_getter_left)
+                ._set_extra_inputs_getter(_conv_bn_add_relu_extra_inputs_getter_left)
+                .set_fused_module(nni.ConvAddReLU2d))
+    else:
+        conv_configs.append(
+            BackendPatternConfig()
+                ._set_pattern_complex_format((nn.ReLU, (add_op, nn.Conv2d, MatchAllNode)))  # noqa: E131
+                .set_observation_type(observation_type)
+                .set_dtype_configs(conv_dtype_configs)
+                .set_fuser_method(_fuse_conv_add_relu_left)
+                ._set_root_node_getter(_conv_add_relu_root_node_getter_left)
+                ._set_extra_inputs_getter(_conv_add_relu_extra_inputs_getter_left)
+                .set_fused_module(nni.ConvAddReLU2d))
+
+#  Y   conv2d
+#   \   /
+#    add
+#     \
+#     relu
+
+def _fuse_conv_add_relu_right(is_qat, relu, add_pattern):
+    add, _, conv = add_pattern
+    return nni.ConvAddReLU2d(conv, add, relu)
+
+def _conv_add_relu_root_node_getter_right(pattern):
+    relu, add_pattern = pattern
+    _, _, conv = add_pattern
+    return conv
+
+def _conv_add_relu_extra_inputs_getter_right(pattern):
+    """ get inputs pattern for extra inputs, inputs for root node
+    are assumed to be copied over from root node to the fused node
+    """
+    relu, add_pattern = pattern
+    _, extra_input, conv = add_pattern
+    return [extra_input]
+
+#      conv2d
+#        /
+#  Y    bn
+#   \   /
+#    add
+#     \
+#     relu
+
+def _fuse_conv_bn_add_relu_right(is_qat, relu, add_pattern):
+    add, _, bn_conv = add_pattern
+    bn, conv = bn_conv
+    if is_qat:
+        raise NotImplementedError(f"Cannot fuse train modules: {(conv, bn, add, relu)}")
+    else:
+        fused_conv = nn.utils.fusion.fuse_conv_bn_eval(conv, bn)
+        return nni.ConvAddReLU2d(fused_conv, add, relu)
+
+def _conv_bn_add_relu_root_node_getter_right(pattern):
+    relu, add_pattern = pattern
+    _, _, bn_conv = add_pattern
+    bn, conv = bn_conv
+    return conv
+
+def _conv_bn_add_relu_extra_inputs_getter_right(pattern):
+    """ get inputs pattern for extra inputs, inputs for root node
+    are assumed to be copied over from root node to the fused node
+    """
+    relu, add_pattern = pattern
+    _, extra_input, bn_conv = add_pattern
+    bn, conv = bn_conv
+    return [extra_input]
+
+conv_add_relu_options = itertools.product(
+    [True, False],  # with_bn
+    [torch.add, operator.add],  # add_op
+)
+
+for with_bn, add_op in conv_add_relu_options:
+    if with_bn:
+        conv_configs.append(
+            BackendPatternConfig()
+                ._set_pattern_complex_format((nn.ReLU, (add_op, MatchAllNode, (nn.BatchNorm2d, nn.Conv2d))))  # noqa: E131
+                .set_observation_type(observation_type)
+                .set_dtype_configs(conv_dtype_configs)
+                .set_fuser_method(_fuse_conv_bn_add_relu_right)
+                ._set_root_node_getter(_conv_bn_add_relu_root_node_getter_right)
+                ._set_extra_inputs_getter(_conv_bn_add_relu_extra_inputs_getter_right)
+                .set_fused_module(nni.ConvAddReLU2d))
+    else:
+        conv_configs.append(
+            BackendPatternConfig()
+                ._set_pattern_complex_format((nn.ReLU, (add_op, MatchAllNode, nn.Conv2d)))  # noqa: E131
+                .set_observation_type(observation_type)
+                .set_dtype_configs(conv_dtype_configs)
+                .set_fuser_method(_fuse_conv_add_relu_right)
+                ._set_root_node_getter(_conv_add_relu_root_node_getter_right)
+                ._set_extra_inputs_getter(_conv_add_relu_extra_inputs_getter_right)
+                .set_fused_module(nni.ConvAddReLU2d))
+
+conv_configs.append(
+    BackendPatternConfig(nni.ConvAddReLU2d)
+        .set_observation_type(observation_type)  # noqa: E131
+        .set_dtype_configs(conv_dtype_configs)
+        .set_root_module(nn.Conv2d)
+        .set_reference_quantized_module(nnqr.Conv2d))
+
+# ========================
+# |  CONFIGS FOR LINEAR  |
+# ========================
+
+linear_dtype_configs = [
+    onednn_weighted_op_int8_dtype_config,
+    onednn_dynamic_int8_dtype_config,
+]
+linear_configs = _get_linear_configs(linear_dtype_configs)
+
+def _add_eltwise_fusion_configs(configs, root_module, root_op, post_module, post_op,
+                                dtype_configs, fuser_method, fused_module, observation_type,
+                                ref_quant_module):
+    # 1 base module + op module fusion config
+    configs.append(
+        BackendPatternConfig((root_module, post_module))
+            .set_dtype_configs(dtype_configs)  # noqa: E131
+            .set_fuser_method(fuser_method)
+            .set_fused_module(fused_module))
+    # base module + functional post op
+    configs.append(
+        BackendPatternConfig((root_module, post_op))
+            .set_dtype_configs(dtype_configs)  # noqa: E131
+            .set_fuser_method(fuser_method)
+            .set_fused_module(fused_module))
+
+    # 2 fused module configs
+    configs.append(
+        BackendPatternConfig(fused_module)
+            .set_observation_type(observation_type)  # noqa: E131
+            .set_dtype_configs(dtype_configs)
+            .set_root_module(root_module)
+            .set_reference_quantized_module(ref_quant_module))
+
+    # 3 functional base op + post op configs
+    configs.append(
+        BackendPatternConfig((root_op, post_module))
+            .set_observation_type(observation_type)  # noqa: E131
+            .set_dtype_configs(dtype_configs))
+    configs.append(
+        BackendPatternConfig((root_op, post_op))
+            .set_observation_type(observation_type)  # noqa: E131
+            .set_dtype_configs(dtype_configs))
+
+# Configs for linear + leaky_relu fusion
+_add_eltwise_fusion_configs(linear_configs, nn.Linear, F.linear,
+                            nn.LeakyReLU, F.leaky_relu, linear_dtype_configs,
+                            _sequential_wrapper2(nni.LinearLeakyReLU),
+                            nni.LinearLeakyReLU, observation_type, nnqr.Linear)
+
+# Configs for linear module + batchnorm + leaky_relu
+linear_configs.append(
+    BackendPatternConfig((nn.Linear, nn.BatchNorm1d, nn.LeakyReLU))
+        .set_dtype_configs(linear_dtype_configs)  # noqa: E131
+        .set_fuser_method(_fuse_linear_bn_leaky_relu)
+        .set_fused_module(nni.LinearLeakyReLU))
+
+# Configs for linear + tanh fusion
+_add_eltwise_fusion_configs(linear_configs, nn.Linear, F.linear,
+                            nn.Tanh, torch.tanh, linear_dtype_configs,
+                            _sequential_wrapper2(nni.LinearTanh),
+                            nni.LinearTanh, observation_type, nnqr.Linear)
+
+# ===========================
+# |  CONFIGS FOR OTHER OPS  |
+# ===========================
+
+binary_op_dtype_configs = [onednn_op_quint8_dtype_config]
+default_op_dtype_configs = [onednn_op_quint8_dtype_config]
+fixed_qparams_op_dtype_configs = [onednn_op_quint8_dtype_config]
+share_qparams_op_dtype_configs = [onednn_op_quint8_dtype_config]
+rnn_op_dtype_configs = [onednn_dynamic_int8_dtype_config]
+embedding_op_dtype_configs = [onednn_weight_only_qint8_dtype_config]
+layer_norm_op_dtype_configs = [onednn_input_output_only_quint8_dtype_config]
+
+# =====================
+# |  BACKEND CONFIGS  |
+# =====================
+
+def get_onednn_backend_config() -> BackendConfig:
+    """
+    Return the `BackendConfig` for PyTorch's native ONEDNN backend.
+    """
+    return BackendConfig("onednn") \
+        .set_backend_pattern_configs(conv_configs) \
+        .set_backend_pattern_configs(linear_configs) \
+        .set_backend_pattern_configs(_get_binary_op_configs(binary_op_dtype_configs)) \
+        .set_backend_pattern_config(_get_cat_config(default_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_default_op_configs(default_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_fixed_qparams_op_configs(fixed_qparams_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_share_qparams_op_configs(share_qparams_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_bn_configs(default_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_ln_configs(layer_norm_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_rnn_op_configs(rnn_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_embedding_op_configs(embedding_op_dtype_configs))
+
+__all__ = [
+    "get_onednn_backend_config",
+]
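+
+
+# A minimal sketch of a model that exercises the Conv2d + add + ReLU patterns
+# registered above (an illustration, not part of the upstream source), assuming
+# the standard FX graph mode quantization flow; the class and input names are
+# hypothetical placeholders:
+#
+#     class ConvAddReLU(torch.nn.Module):
+#         def __init__(self):
+#             super().__init__()
+#             self.conv = torch.nn.Conv2d(3, 3, 3, padding=1)
+#             self.relu = torch.nn.ReLU()
+#
+#         def forward(self, x, y):
+#             return self.relu(self.conv(x) + y)
+#
+#     from torch.ao.quantization import get_default_qconfig_mapping
+#     from torch.ao.quantization.quantize_fx import prepare_fx, convert_fx
+#
+#     m = ConvAddReLU().eval()
+#     example_inputs = (torch.randn(1, 3, 8, 8), torch.randn(1, 3, 8, 8))
+#     backend_config = get_onednn_backend_config()
+#     prepared = prepare_fx(m, get_default_qconfig_mapping("onednn"), example_inputs,
+#                           backend_config=backend_config)
+#     quantized = convert_fx(prepared, backend_config=backend_config)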
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/qnnpack.py b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/qnnpack.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0b6a5fa8cd5c29b4251e6115e27ce6039c7c31c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/qnnpack.py
@@ -0,0 +1,160 @@
+import torch
+from ._common_operator_config_utils import (
+    _get_binary_op_configs,
+    _get_bn_configs,
+    _get_cat_config,
+    _get_conv_configs,
+    _get_default_op_configs,
+    _get_embedding_op_configs,
+    _get_fixed_qparams_op_configs,
+    _get_linear_configs,
+    _get_rnn_op_configs,
+    _get_share_qparams_op_configs,
+)
+from .backend_config import BackendConfig, DTypeConfig, DTypeWithConstraints
+
+__all__ = [
+    "get_qnnpack_backend_config",
+]
+
+# ===================
+# |  DTYPE CONFIGS  |
+# ===================
+
+qnnpack_weighted_op_quint8_dtype_config = DTypeConfig(
+    input_dtype=torch.quint8,
+    output_dtype=torch.quint8,
+    weight_dtype=torch.qint8,
+    bias_dtype=torch.float,
+)
+
+qnnpack_default_op_quint8_dtype_config = DTypeConfig(
+    input_dtype=torch.quint8,
+    output_dtype=torch.quint8,
+)
+
+qnnpack_default_op_fp16_dtype_config = DTypeConfig(
+    input_dtype=torch.float16,
+    output_dtype=torch.float16,
+    weight_dtype=torch.float16,
+    bias_dtype=torch.float16,
+)
+
+qnnpack_default_dynamic_int8_dtype_config = DTypeConfig(
+    input_dtype=torch.quint8,
+    output_dtype=torch.float,
+    weight_dtype=torch.qint8,
+    bias_dtype=torch.float,
+    is_dynamic=True,
+)
+
+qnnpack_default_dynamic_float16_dtype_config = DTypeConfig(
+    input_dtype=torch.float16,
+    output_dtype=torch.float,
+    weight_dtype=torch.float16,
+    bias_dtype=torch.float,
+    is_dynamic=True,
+)
+
+qnnpack_weight_only_quint8_dtype_config = DTypeConfig(
+    input_dtype=torch.float,
+    output_dtype=torch.float,
+    weight_dtype=torch.quint8,
+)
+
+qnnpack_weight_only_quint4x2_dtype_config = DTypeConfig(
+    input_dtype=torch.float,
+    output_dtype=torch.float,
+    weight_dtype=torch.quint4x2,
+)
+
+# xnnpack compatible dtype configs
+
+# We restrict scale values to be at least 2 ** -12 to ensure the
+# requantization scale never falls below the xnnpack lower
+# threshold. Additionally, for qint8 weight, we restrict
+# the quantization values to [-127, +127], excluding -128.
+# For more detail, refer to the description of
+# `default_symmetric_qnnpack_qconfig`.
+
+# TODO: add additional restriction on qscheme to ensure it
+# is either per_tensor_symmetric or per_channel_symmetric
+
+qnnpack_act_qint8_scale_min_2_neg_12 = DTypeWithConstraints(
+    dtype=torch.qint8,
+    scale_min_lower_bound=2 ** -12,
+)
+
+qnnpack_weight_qint8_neg_127_to_127_scale_min_2_neg_12 = DTypeWithConstraints(
+    dtype=torch.qint8,
+    quant_min_lower_bound=-127,
+    quant_max_upper_bound=127,
+    scale_min_lower_bound=2 ** -12,
+)
+
+qnnpack_weighted_op_qint8_symmetric_dtype_config = DTypeConfig(
+    input_dtype=qnnpack_act_qint8_scale_min_2_neg_12,
+    output_dtype=qnnpack_act_qint8_scale_min_2_neg_12,
+    weight_dtype=qnnpack_weight_qint8_neg_127_to_127_scale_min_2_neg_12,
+    bias_dtype=torch.float,
+)
+
+qnnpack_default_op_qint8_symmetric_dtype_config = DTypeConfig(
+    input_dtype=qnnpack_act_qint8_scale_min_2_neg_12,
+    output_dtype=qnnpack_act_qint8_scale_min_2_neg_12,
+)
+
+
+# =====================
+# |  BACKEND CONFIGS  |
+# =====================
+
+def get_qnnpack_backend_config() -> BackendConfig:
+    """
+    Return the `BackendConfig` for PyTorch's native QNNPACK backend.
+    """
+    conv_dtype_configs = [
+        qnnpack_weighted_op_qint8_symmetric_dtype_config,
+        qnnpack_weighted_op_quint8_dtype_config,
+    ]
+    linear_dtype_configs = [
+        qnnpack_weighted_op_qint8_symmetric_dtype_config,
+        qnnpack_weighted_op_quint8_dtype_config,
+        qnnpack_default_dynamic_int8_dtype_config,
+        qnnpack_default_dynamic_float16_dtype_config,
+    ]
+    binary_op_dtype_configs = [
+        qnnpack_default_op_qint8_symmetric_dtype_config,
+        qnnpack_default_op_quint8_dtype_config,
+    ]
+    default_op_dtype_configs = [
+        qnnpack_default_op_qint8_symmetric_dtype_config,
+        qnnpack_default_op_quint8_dtype_config,
+    ]
+    fixed_qparams_op_dtype_configs = [
+        qnnpack_default_op_qint8_symmetric_dtype_config,
+        qnnpack_default_op_quint8_dtype_config,
+    ]
+    share_qparams_op_dtype_configs = [
+        qnnpack_default_op_qint8_symmetric_dtype_config,
+        qnnpack_default_op_quint8_dtype_config,
+    ]
+    rnn_op_dtype_configs = [
+        qnnpack_default_dynamic_int8_dtype_config,
+        qnnpack_default_dynamic_float16_dtype_config,
+    ]
+    embedding_op_dtype_configs = [
+        qnnpack_weight_only_quint8_dtype_config,
+        qnnpack_weight_only_quint4x2_dtype_config,
+    ]
+    return BackendConfig("qnnpack") \
+        .set_backend_pattern_configs(_get_conv_configs(conv_dtype_configs)) \
+        .set_backend_pattern_configs(_get_linear_configs(linear_dtype_configs)) \
+        .set_backend_pattern_configs(_get_binary_op_configs(binary_op_dtype_configs)) \
+        .set_backend_pattern_config(_get_cat_config(default_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_default_op_configs(default_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_fixed_qparams_op_configs(fixed_qparams_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_share_qparams_op_configs(share_qparams_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_bn_configs(default_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_rnn_op_configs(rnn_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_embedding_op_configs(embedding_op_dtype_configs))
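+
+
+# A minimal pairing sketch (an illustration, not part of the upstream source),
+# assuming the symmetric qconfig referenced above is combined with this backend
+# config in the FX flow; `float_model` and `example_inputs` are hypothetical
+# placeholders:
+#
+#     from torch.ao.quantization import QConfigMapping
+#     from torch.ao.quantization.qconfig import default_symmetric_qnnpack_qconfig
+#     from torch.ao.quantization.quantize_fx import prepare_fx
+#
+#     qconfig_mapping = QConfigMapping().set_global(default_symmetric_qnnpack_qconfig)
+#     prepared = prepare_fx(float_model, qconfig_mapping, example_inputs,
+#                           backend_config=get_qnnpack_backend_config())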
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/tensorrt.py b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/tensorrt.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd2c8c169a587ab0889d0eb36ad7e055c83cb72f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/tensorrt.py
@@ -0,0 +1,81 @@
+import torch
+from .backend_config import (
+    BackendConfig,
+    BackendPatternConfig,
+    DTypeConfig,
+    ObservationType
+)
+from ._common_operator_config_utils import (
+    _get_binary_op_configs,
+    _get_linear_configs,
+    _get_conv_configs,
+    _get_share_qparams_op_configs,
+    _get_tensor_info_op_configs,
+)
+
+__all__ = [
+    "get_tensorrt_backend_config",
+    "get_tensorrt_backend_config_dict",
+]
+
+def get_tensorrt_backend_config() -> BackendConfig:
+    """
+    Return the `BackendConfig` for the TensorRT backend.
+    NOTE: The current API will change in the future; it exists only to unblock
+    experimentation with new backends, so please don't rely on it right now.
+    TODO: add a README when it's more stable
+    """
+    # dtype configs
+    weighted_op_qint8_dtype_config = DTypeConfig(
+        input_dtype=torch.qint8,
+        output_dtype=torch.qint8,
+        weight_dtype=torch.qint8,
+        bias_dtype=torch.float,
+    )
+    non_weighted_op_qint8_dtype_config = DTypeConfig(
+        input_dtype=torch.qint8,
+        output_dtype=torch.qint8,
+    )
+
+    addmm_config = BackendPatternConfig(torch.addmm) \
+        .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT) \
+        .add_dtype_config(weighted_op_qint8_dtype_config) \
+        ._set_input_type_to_index({
+            "bias": 0,
+            "input": 1,
+            "weight": 2,
+        })
+    cat_config = BackendPatternConfig(torch.cat) \
+        .set_observation_type(ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT) \
+        .add_dtype_config(non_weighted_op_qint8_dtype_config)
+    conv_dtype_configs = [
+        weighted_op_qint8_dtype_config,
+    ]
+    linear_dtype_configs = [
+        weighted_op_qint8_dtype_config,
+    ]
+    binary_op_dtype_configs = [
+        weighted_op_qint8_dtype_config,
+    ]
+    share_qparams_op_dtype_configs = [
+        non_weighted_op_qint8_dtype_config,
+    ]
+    tensor_info_op_dtype_configs = [
+        non_weighted_op_qint8_dtype_config,
+    ]
+    # some ops might not be supported in fx2trt yet; those will error out
+    # during fx2trt conversion and can be supported after that
+    return BackendConfig("tensorrt") \
+        .set_backend_pattern_configs(_get_conv_configs(conv_dtype_configs)) \
+        .set_backend_pattern_config(addmm_config) \
+        .set_backend_pattern_config(cat_config) \
+        .set_backend_pattern_configs(_get_linear_configs(linear_dtype_configs)) \
+        .set_backend_pattern_configs(_get_binary_op_configs(binary_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_share_qparams_op_configs(share_qparams_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_tensor_info_op_configs(tensor_info_op_dtype_configs))
+
+def get_tensorrt_backend_config_dict():
+    """
+    Return the `BackendConfig` for the TensorRT backend in dictionary form.
+    """
+    return get_tensorrt_backend_config().to_dict()
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/utils.py b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..bee900e062f2fe9eee9e7127c992360d2671e0d6
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/utils.py
@@ -0,0 +1,279 @@
+from typing import Dict, Any, List, Callable, Union, Tuple, Type
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from .backend_config import (
+    BackendConfig,
+    BackendPatternConfig,
+    DTypeConfig,
+)
+from ..utils import Pattern
+from ..fuser_method_mappings import (
+    _reverse2,
+    _reverse3,
+)
+
+__all__ = [
+    "get_pattern_to_dtype_configs",
+    "get_qat_module_classes",
+    "get_fused_module_classes",
+    "get_pattern_to_input_type_to_index",
+    "get_root_module_to_quantized_reference_module",
+    "get_fuser_method_mapping",
+    "get_module_to_qat_module",
+    "get_fusion_pattern_to_root_node_getter",
+    "get_fusion_pattern_to_extra_inputs_getter",
+    "remove_boolean_dispatch_from_name",
+    "pattern_to_human_readable",
+    "entry_to_pretty_str",
+]
+
+def get_pattern_to_dtype_configs(backend_config: BackendConfig) -> Dict[Pattern, List[DTypeConfig]]:
+    pattern_to_dtype_configs: Dict[Pattern, List[DTypeConfig]] = {}
+    for pattern, config in backend_config._pattern_complex_format_to_config.items():
+        pattern_to_dtype_configs[pattern] = config.dtype_configs
+    return pattern_to_dtype_configs
+
+def get_qat_module_classes(backend_config: BackendConfig) -> Tuple[type, ...]:
+    qat_module_classes = []
+    for config in backend_config.configs:
+        if config.qat_module is not None:
+            qat_module_classes.append(config.qat_module)
+    return tuple(set(qat_module_classes))
+
+def get_fused_module_classes(backend_config: BackendConfig) -> Tuple[type, ...]:
+    fused_module_classes = []
+    for config in backend_config.configs:
+        if config.fused_module is not None:
+            fused_module_classes.append(config.fused_module)
+    return tuple(set(fused_module_classes))
+
+def get_pattern_to_input_type_to_index(backend_config: BackendConfig) -> Dict[Pattern, Dict[str, int]]:
+    pattern_to_input_type_to_index: Dict[Pattern, Dict[str, int]] = {}
+    for pattern, config in backend_config._pattern_complex_format_to_config.items():
+        pattern_to_input_type_to_index[pattern] = config._input_type_to_index
+    return pattern_to_input_type_to_index
+
+def get_root_module_to_quantized_reference_module(
+        backend_config: BackendConfig) -> Dict[Type[torch.nn.Module], Type[torch.nn.Module]]:
+    mapping: Dict[Type[torch.nn.Module], Type[torch.nn.Module]] = {}
+    for config in backend_config.configs:
+        if config.root_module is not None and config.reference_quantized_module is not None:
+            mapping[config.root_module] = config.reference_quantized_module
+    return mapping
+
+def get_fuser_method_mapping(backend_config: BackendConfig) -> Dict[Pattern, Union[nn.Sequential, Callable]]:
+    fuser_method_mapping : Dict[Pattern, Union[nn.Sequential, Callable]] = {}
+    for pattern, config in backend_config._pattern_complex_format_to_config.items():
+        if config.fuser_method is not None:
+            # Note: both the fuser method and the pattern are specified in forward order in the
+            # BackendConfig, but the internal pattern matching code uses the reversed nested tuple
+            # format, so we need to convert both to the internal format
+            fuser_method = _get_fuser_method_in_reversed_nested_tuple_format(config)
+            fuser_method_mapping[pattern] = fuser_method
+    return fuser_method_mapping
+
+def get_module_to_qat_module(backend_config: BackendConfig) -> Dict[Pattern, Type[torch.nn.Module]]:
+    module_to_qat_module: Dict[Pattern, Type[torch.nn.Module]] = {}
+    for pattern, config in backend_config._pattern_complex_format_to_config.items():
+        if config.qat_module is not None:
+            module_to_qat_module[pattern] = config.qat_module
+    return module_to_qat_module
+
+def get_fusion_pattern_to_root_node_getter(backend_config: BackendConfig) -> Dict[Pattern, Callable]:
+    """ Get a map from fusion pattern to a function that returns the root node
+    from the fusion pattern, e.g. the most common one is:
+    def get_root_node(node_pattern):
+        while not isinstance(node_pattern[-1], Node):
+            node_pattern = node_pattern[-1]
+        return node_pattern[-1]
+    This can work for all patterns whose root node is the "last node" in the pattern,
+    e.g. (torch.add, MatchAllNode, (torch.ReLU, torch.Conv2d))
+    """
+    root_node_getter_mapping: Dict[Pattern, Callable] = {}
+    for pattern, config in backend_config._pattern_complex_format_to_config.items():
+        if config._root_node_getter is not None:
+            root_node_getter_mapping[pattern] = config._root_node_getter
+    return root_node_getter_mapping
+
+def get_fusion_pattern_to_extra_inputs_getter(backend_config: BackendConfig) -> Dict[Pattern, Callable]:
+    """ Get a map from fusion pattern to a function that returns extra input nodes
+    from the fusion pattern, in the order required by the root node. This is optional,
+    if not specified, we will not copy over any extra inputs for the root node.
+    Example:
+    # Let's say we have the pattern (torch.add, MatchAllNode, (torch.nn.BatchNorm2d, torch.nn.Conv2d))
+    # and root node is torch.nn.Conv2d, and the node in MatchAllNode would be an extra
+    # argument to the fused module, we can unpack the pattern and return the node at
+    # MatchAllNode here
+    # we can implement extra_inputs_getter as follows:
+    def extra_inputs_getter(pattern) -> List[Any]:
+        add, extra_input, conv_pattern = pattern
+        return [extra_input]
+    """
+    extra_inputs_getter_mapping: Dict[Pattern, Callable] = {}
+    for pattern, config in backend_config._pattern_complex_format_to_config.items():
+        if config._extra_inputs_getter is not None:
+            extra_inputs_getter_mapping[pattern] = config._extra_inputs_getter
+    return extra_inputs_getter_mapping
+
+def remove_boolean_dispatch_from_name(p) -> Any:
+    """
+    Some ops have a default string representation such as
+    '<function boolean_dispatch.<locals>.fn at 0x7ff1106bf280>',
+    this function replaces them with the hardcoded function names.
+    """
+    if p is F.fractional_max_pool2d:
+        return "torch.nn.functional.fractional_max_pool2d"
+    elif p is F.fractional_max_pool3d:
+        return "torch.nn.functional.fractional_max_pool3d"
+    elif p is F.max_pool1d:
+        return "torch.nn.functional.max_pool1d"
+    elif p is F.max_pool2d:
+        return "torch.nn.functional.max_pool2d"
+    elif p is F.max_pool3d:
+        return "torch.nn.functional.max_pool3d"
+    elif p is F.adaptive_max_pool1d:
+        return "torch.nn.functional.adaptive_max_pool1d"
+    elif p is F.adaptive_max_pool2d:
+        return "torch.nn.functional.adaptive_max_pool2d"
+    elif p is F.adaptive_max_pool3d:
+        return "torch.nn.functional.adaptive_max_pool3d"
+    assert "boolean_dispatch" not in str(p), \
+        f"{p} does not have a human readable representation in " + \
+        "quantization documentation"
+    return p
+
+def pattern_to_human_readable(p) -> Any:
+    if isinstance(p, tuple):
+        # nested patterns, recurse
+        return tuple(pattern_to_human_readable(inner_p) for inner_p in p)
+    elif isinstance(p, str):
+        # method names are already human readable
+        return p
+    else:
+        p = remove_boolean_dispatch_from_name(p)
+        return p
+
+# TODO(future PR): move backend_config_dict to use dataclass and move this logic to
+# the corresponding __str__ function
+def entry_to_pretty_str(entry) -> str:
+    """
+    Given a backend_config_dict entry, returns a string with the human readable
+    representation of it.
+    """
+    s = "{\n"
+
+    # always output the pattern first
+    if "pattern" in entry:
+        pattern_str = pattern_to_human_readable(entry["pattern"])
+
+        s += f"  'pattern': {pattern_str},\n"
+
+    # custom output for dtype_configs to make it look nice
+    if "dtype_configs" in entry:
+        s += "  'dtype_configs': [\n"
+        for dtype_config in entry["dtype_configs"]:
+            s += "    {\n"
+            for k, v in dtype_config.items():
+                s += f"      '{k}': {v},\n"
+            s += "    },\n"
+        s += "  ],\n"
+
+    # custom output for num_tensor_args_to_observation_type to make it look nice
+    if "num_tensor_args_to_observation_type" in entry:
+        s += "  'num_tensor_args_to_observation_type': {\n"
+        for k, v in entry["num_tensor_args_to_observation_type"].items():
+            s += f"    {k}: {v},\n"
+        s += "  },\n"
+
+    # output all the other fields
+    custom_handled_fields = [
+        "pattern",
+        "dtype_configs",
+        "num_tensor_args_to_observation_type",
+    ]
+    for field_name in entry:
+        if field_name in custom_handled_fields:
+            continue
+        s += f"  '{field_name}': {entry[field_name]},\n"
+
+    s += "}"
+    return s
+
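+# Illustrative usage sketch for `entry_to_pretty_str`: the entry below is a
+# hand-written example, not a real backend configuration.
+#
+#     example_entry = {
+#         "pattern": "relu",
+#         "dtype_configs": [
+#             {"input_dtype": torch.quint8, "output_dtype": torch.quint8},
+#         ],
+#     }
+#     print(entry_to_pretty_str(example_entry))
+#     # prints a brace-delimited summary with the pattern first,
+#     # followed by the dtype configs, one key per line
+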
+def _get_pattern_in_reversed_nested_tuple_format(config: BackendPatternConfig) -> Pattern:
+    """
+    Return the pattern specified in the given config in the reversed nested tuple format
+    used internally in the quantization pattern matching code.
+
+    If the pattern is not a tuple, or the pattern is already specified in the reversed
+    nested tuple format, return the pattern as is. Otherwise:
+
+    For 2-tuples (a, b), return (b, a).
+    For 3-tuples (a, b, c), return (c, (b, a)).
+
+    For example:
+        * Given nn.Linear, return nn.Linear
+        * Given (nn.Linear, nn.ReLU), return (nn.ReLU, nn.Linear)
+        * Given (nn.Conv2d, nn.BatchNorm2d, nn.ReLU), return
+          (nn.ReLU, (nn.BatchNorm2d, nn.Conv2d))
+
+    For context, the reason why this is needed is that the user-facing BackendConfig
+    API accepts the flat 2-or-3-tuple format in forward order. While this simple
+    format handles the vast majority of use cases, it does not handle the more
+    complex ones, and so the internal pattern matching code for quantization uses
+    the following, more general reversed nested tuple format instead:
+
+        operator = module_type | functional | torch op | native op | MatchAllNode
+        Pattern = (operator, Pattern, Pattern, ...) | operator
+
+    In the future, we expect to replace the above complex format with the one used
+    by the subgraph rewriter in torch.fx, so we don't have to maintain our own
+    complex pattern matching code. Then we won't need this helper function anymore.
+    """
+    if config._pattern_complex_format is not None:
+        return config._pattern_complex_format
+    if config.pattern is None:
+        raise ValueError("Either 'pattern' or 'pattern_complex_format' must be specified")
+    if not isinstance(config.pattern, tuple):
+        return config.pattern
+
+    # Pattern is specified in the simple tuple format, need to convert
+    if len(config.pattern) == 2:
+        (a, b) = config.pattern
+        return (b, a)
+    elif len(config.pattern) == 3:
+        (a, b, c) = config.pattern
+        return (c, (b, a))
+    else:
+        raise ValueError("Expected a tuple with 2 or 3 elements, got: ", config.pattern)
+
+def _get_fuser_method_in_reversed_nested_tuple_format(config: BackendPatternConfig) -> Callable:
+    """
+    Return the fuser method specified in the given config in the reversed nested
+    tuple format used internally in the quantization pattern matching code.
+
+    If pattern is specified in the reversed nested tuple format, we assume the
+    fuser method is also specified in this format and simply return it as is.
+    Otherwise, we convert the fuser method as follows:
+
+        * Given f(is_qat, conv, relu), return f'(is_qat, relu, conv)
+        * Given f(is_qat, conv, bn, relu), return f'(is_qat, relu, bn_conv),
+          where bn_conv is a 2-tuple (bn, conv)
+
+    The first argument of a fuser method is always `is_qat` and is not affected
+    in the conversion. We currently only support functions with 3 or 4 arguments.
+    """
+    assert config.fuser_method is not None
+    if config._pattern_complex_format is not None:
+        return config.fuser_method
+    if not isinstance(config.pattern, tuple):
+        raise ValueError("Expected pattern to be a tuple, got: ", config.pattern)
+
+    # Pattern is specified in the simple tuple format, need to convert
+    if len(config.pattern) == 2:
+        return _reverse2(config.fuser_method)
+    elif len(config.pattern) == 3:
+        return _reverse3(config.fuser_method)
+    else:
+        raise ValueError("Expected a tuple with 2 or 3 elements, got: ", config.pattern)
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/x86.py b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/x86.py
new file mode 100644
index 0000000000000000000000000000000000000000..2daaded0499048a80100c184af4d5bd5ee8ea01d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/backend_config/x86.py
@@ -0,0 +1,113 @@
+import torch
+from ._common_operator_config_utils import (
+    _get_binary_op_configs,
+    _get_bn_configs,
+    _get_cat_config,
+    _get_conv_configs,
+    _get_default_op_configs,
+    _get_embedding_op_configs,
+    _get_fixed_qparams_op_configs,
+    _get_linear_configs,
+    _get_rnn_op_configs,
+    _get_share_qparams_op_configs,
+    _get_tensor_info_op_configs,
+)
+from .backend_config import BackendConfig, DTypeConfig
+
+__all__ = [
+    "get_x86_backend_config",
+]
+
+# ===================
+# |  DTYPE CONFIGS  |
+# ===================
+
+# X86 aligns with FBGEMM for now
+
+x86_weighted_op_int8_dtype_config = DTypeConfig(
+    input_dtype=torch.quint8,
+    output_dtype=torch.quint8,
+    weight_dtype=torch.qint8,
+    bias_dtype=torch.float,
+)
+
+x86_default_op_quint8_dtype_config = DTypeConfig(
+    input_dtype=torch.quint8,
+    output_dtype=torch.quint8,
+)
+
+x86_default_op_fp16_dtype_config = DTypeConfig(
+    input_dtype=torch.float16,
+    output_dtype=torch.float16,
+    weight_dtype=torch.float16,
+    bias_dtype=torch.float16,
+)
+
+x86_default_dynamic_int8_dtype_config = DTypeConfig(
+    input_dtype=torch.quint8,
+    output_dtype=torch.float,
+    weight_dtype=torch.qint8,
+    bias_dtype=torch.float,
+    is_dynamic=True,
+)
+
+x86_default_dynamic_float16_dtype_config = DTypeConfig(
+    input_dtype=torch.float16,
+    output_dtype=torch.float,
+    weight_dtype=torch.float16,
+    bias_dtype=torch.float,
+    is_dynamic=True,
+)
+
+x86_weight_only_quint8_dtype_config = DTypeConfig(
+    input_dtype=torch.float,
+    output_dtype=torch.float,
+    weight_dtype=torch.quint8,
+)
+
+x86_weight_only_quint4x2_dtype_config = DTypeConfig(
+    input_dtype=torch.float,
+    output_dtype=torch.float,
+    weight_dtype=torch.quint4x2,
+)
+
+
+# =====================
+# |  BACKEND CONFIGS  |
+# =====================
+
+def get_x86_backend_config() -> BackendConfig:
+    """
+    Return the `BackendConfig` for PyTorch's native x86 backend.
+    """
+    conv_dtype_configs = [x86_weighted_op_int8_dtype_config]
+    linear_dtype_configs = [
+        x86_weighted_op_int8_dtype_config,
+        x86_default_dynamic_int8_dtype_config,
+        x86_default_dynamic_float16_dtype_config,
+    ]
+    binary_op_dtype_configs = [x86_weighted_op_int8_dtype_config]
+    default_op_dtype_configs = [x86_default_op_quint8_dtype_config]
+    fixed_qparams_op_dtype_configs = [x86_weighted_op_int8_dtype_config]
+    share_qparams_op_dtype_configs = [x86_default_op_quint8_dtype_config]
+    tensor_info_op_dtype_configs = [x86_default_op_quint8_dtype_config]
+    rnn_op_dtype_configs = [
+        x86_default_dynamic_int8_dtype_config,
+        x86_default_dynamic_float16_dtype_config,
+    ]
+    embedding_op_dtype_configs = [
+        x86_weight_only_quint8_dtype_config,
+        x86_weight_only_quint4x2_dtype_config,
+    ]
+    return BackendConfig("x86") \
+        .set_backend_pattern_configs(_get_conv_configs(conv_dtype_configs)) \
+        .set_backend_pattern_configs(_get_linear_configs(linear_dtype_configs)) \
+        .set_backend_pattern_configs(_get_binary_op_configs(binary_op_dtype_configs)) \
+        .set_backend_pattern_config(_get_cat_config(default_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_default_op_configs(default_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_fixed_qparams_op_configs(fixed_qparams_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_share_qparams_op_configs(share_qparams_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_tensor_info_op_configs(tensor_info_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_bn_configs(default_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_rnn_op_configs(rnn_op_dtype_configs)) \
+        .set_backend_pattern_configs(_get_embedding_op_configs(embedding_op_dtype_configs))
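+
+# Illustrative usage sketch (assumes an FX-traceable `model` and representative
+# `example_inputs`): the config above is passed to FX graph mode quantization
+# via the `backend_config` argument.
+#
+#     from torch.ao.quantization import get_default_qconfig_mapping
+#     from torch.ao.quantization.quantize_fx import prepare_fx, convert_fx
+#
+#     qconfig_mapping = get_default_qconfig_mapping("x86")
+#     backend_config = get_x86_backend_config()
+#     prepared = prepare_fx(model, qconfig_mapping, example_inputs,
+#                           backend_config=backend_config)
+#     # ... run calibration data through `prepared` ...
+#     quantized = convert_fx(prepared, backend_config=backend_config)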
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fake_quantize.py b/MLPY/Lib/site-packages/torch/ao/quantization/fake_quantize.py
new file mode 100644
index 0000000000000000000000000000000000000000..4307d7fda0470b2b400cebdcdd4176675f7b9249
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/fake_quantize.py
@@ -0,0 +1,546 @@
+"""Implements modules  used to perform fake quantization."""
+
+import torch
+from torch.nn import Module
+from torch.ao.quantization.observer import (
+    MovingAverageMinMaxObserver,
+    HistogramObserver,
+    MovingAveragePerChannelMinMaxObserver,
+    FixedQParamsObserver,
+    default_fixed_qparams_range_0to1_observer,
+    default_fixed_qparams_range_neg1to1_observer,
+    _with_args,
+)
+import re
+from abc import ABC, abstractmethod
+from typing import Any, Tuple
+
+__all__ = [
+    "FakeQuantizeBase",
+    "FakeQuantize",
+    "FixedQParamsFakeQuantize",
+    "FusedMovingAvgObsFakeQuantize",
+    "disable_fake_quant",
+    "disable_observer",
+    "enable_fake_quant",
+    "enable_observer",
+    "default_fake_quant",
+    "default_weight_fake_quant",
+    "default_dynamic_fake_quant",
+    "default_fixed_qparams_range_neg1to1_fake_quant",
+    "default_fixed_qparams_range_0to1_fake_quant",
+    "default_symmetric_fixed_qparams_fake_quant",
+    "default_affine_fixed_qparams_fake_quant",
+    "default_per_channel_weight_fake_quant",
+    "default_embedding_fake_quant",
+    "default_embedding_fake_quant_4bit",
+    "default_histogram_fake_quant",
+    "default_fused_act_fake_quant",
+    "default_fused_wt_fake_quant",
+    "default_fused_per_channel_wt_fake_quant",
+    "fused_wt_fake_quant_range_neg_127_to_127",
+    "fused_per_channel_wt_fake_quant_range_neg_127_to_127",
+]
+
+def _is_per_channel(qscheme: 'torch.qscheme') -> bool:
+    return qscheme in [torch.per_channel_symmetric, torch.per_channel_affine, torch.per_channel_affine_float_qparams]
+
+def _is_per_tensor(qscheme: 'torch.qscheme') -> bool:
+    return qscheme in [torch.per_tensor_symmetric, torch.per_tensor_affine]
+
+def _is_symmetric_quant(qscheme: 'torch.qscheme') -> bool:
+    return qscheme in [torch.per_tensor_symmetric, torch.per_channel_symmetric]
+
+def _is_float_qparams(qscheme: 'torch.qscheme') -> bool:
+    return qscheme in [torch.per_channel_affine_float_qparams, ]
+
+class FakeQuantizeBase(ABC, Module):
+    r"""Base fake quantize module.
+
+    Any fake quantize implementation should derive from this class.
+
+    Concrete fake quantize modules should follow the same API. In forward, they will update
+    the statistics of the observed Tensor and fake quantize the input. They should also provide a
+    `calculate_qparams` function that computes the quantization parameters given
+    the collected statistics.
+
+    """
+
+    fake_quant_enabled: torch.Tensor
+    observer_enabled: torch.Tensor
+
+    def __init__(self):
+        """Set fake_quant_enabled and observer_enabled."""
+        super().__init__()
+        # fake_quant_enabled and observer_enabled are buffers to support their
+        # replication in DDP. Data type is uint8 because NCCL does not support
+        # bool tensors.
+        self.register_buffer('fake_quant_enabled', torch.tensor([1], dtype=torch.uint8))
+        self.register_buffer('observer_enabled', torch.tensor([1], dtype=torch.uint8))
+
+    @abstractmethod
+    def forward(self, x):
+        pass
+
+    @abstractmethod
+    def calculate_qparams(self, **kwargs):
+        pass
+
+    @torch.jit.export
+    def enable_fake_quant(self, enabled: bool = True) -> None:
+        self.fake_quant_enabled[0] = 1 if enabled else 0
+
+    @torch.jit.export
+    def disable_fake_quant(self):
+        self.enable_fake_quant(False)
+
+    @torch.jit.export
+    def enable_observer(self, enabled: bool = True) -> None:
+        self.observer_enabled[0] = 1 if enabled else 0
+
+    @torch.jit.export
+    def disable_observer(self):
+        self.enable_observer(False)
+
+    @classmethod
+    def with_args(cls, **kwargs):
+        fake_quant_constructor = _with_args(cls, **kwargs)
+        # need to assign the correct module to fake_quantize
+        # constructors to satisfy public v private requirements
+        fake_quant_constructor.__module__ = "torch.ao.quantization.fake_quantize"
+        return fake_quant_constructor
+
+class FakeQuantize(FakeQuantizeBase):
+    r"""Simulate the quantize and dequantize operations in training time.
+
+    The output of this module is given by::
+
+        x_out = (
+          clamp(round(x/scale + zero_point), quant_min, quant_max) - zero_point
+        ) * scale
+
+    * :attr:`is_dynamic` indicates whether the fake quantize is a placeholder for dynamic quantization
+      operators (choose_qparams -> q -> dq) or static quantization operators (q -> dq)
+
+    * :attr:`scale` defines the scale factor used for quantization.
+
+    * :attr:`zero_point` specifies the quantized value to which 0 in floating point maps
+
+    * :attr:`fake_quant_enabled` controls the application of fake quantization on tensors, note that
+      statistics can still be updated.
+
+    * :attr:`observer_enabled` controls statistics collection on tensors
+
+    * :attr:`dtype` specifies the quantized dtype that is being emulated with fake-quantization,
+        allowable values are torch.qint8 and torch.quint8.
+
+    Args:
+
+        observer (module): Module for observing statistics on input tensors and calculating scale
+          and zero-point.
+        observer_kwargs (optional): Arguments for the observer module
+
+    Attributes:
+        activation_post_process (Module): User provided module that collects statistics on the input tensor and
+          provides a method to calculate scale and zero-point.
+
+    """
+
+    scale: torch.Tensor
+    zero_point: torch.Tensor
+
+    def __init__(self, observer=MovingAverageMinMaxObserver, quant_min=None, quant_max=None, is_dynamic=False, **observer_kwargs):
+        super().__init__()
+        # Populate quant_min/quant_max to observer_kwargs if valid
+        if quant_min is not None and quant_max is not None:
+            assert quant_min <= quant_max, \
+                'quant_min must be less than or equal to quant_max'
+            dtype = observer_kwargs.get("dtype", torch.quint8)
+            if hasattr(observer, "p"):
+                # In case observer is _PartialWrapper, dtype can be stored in
+                # observer.p.keywords["dtype"]
+                dtype = getattr(getattr(observer, "p", {}), "keywords", {}).get(
+                    "dtype", dtype
+                )
+            assert torch.iinfo(dtype).min <= quant_min, 'quant_min out of bound'
+            assert quant_max <= torch.iinfo(dtype).max, 'quant_max out of bound'
+            observer_kwargs.update({"quant_min": quant_min, "quant_max": quant_max})
+        observer_kwargs["is_dynamic"] = is_dynamic
+        self.activation_post_process = observer(**observer_kwargs)
+        # TODO: keeping self.quant_min/max for BC; remove after a couple releases
+        # Users should use self.activation_post_process.quant_min
+        self.quant_min = self.activation_post_process.quant_min
+        self.quant_max = self.activation_post_process.quant_max
+        self.is_dynamic = self.activation_post_process.is_dynamic
+        if _is_float_qparams(self.activation_post_process.qscheme):
+            zero_point_dtype = torch.float
+        else:
+            zero_point_dtype = torch.int
+        self.register_buffer('scale', torch.tensor([1.0], dtype=torch.float))
+        self.register_buffer('zero_point', torch.tensor([0], dtype=zero_point_dtype))
+        self.dtype = self.activation_post_process.dtype
+        self.qscheme = self.activation_post_process.qscheme
+        self.ch_axis = self.activation_post_process.ch_axis \
+            if hasattr(self.activation_post_process, 'ch_axis') else -1
+        assert _is_per_channel(self.qscheme) or \
+            _is_per_tensor(self.qscheme), \
+            'Only per channel and per tensor quantization are supported in fake quantize,' + \
+            ' got qscheme: ' + str(self.qscheme)
+        self.is_per_channel = _is_per_channel(self.qscheme)
+
+    @torch.jit.export
+    def calculate_qparams(self):
+        return self.activation_post_process.calculate_qparams()
+
+    def forward(self, X):
+        if self.observer_enabled[0] == 1:
+            self.activation_post_process(X.detach())
+            _scale, _zero_point = self.calculate_qparams()
+            _scale, _zero_point = _scale.to(self.scale.device), _zero_point.to(self.zero_point.device)
+            if self.scale.shape != _scale.shape:
+                self.scale.resize_(_scale.shape)
+                self.zero_point.resize_(_zero_point.shape)
+            self.scale.copy_(_scale)
+            self.zero_point.copy_(_zero_point)
+
+        if self.fake_quant_enabled[0] == 1:
+            if self.is_per_channel:
+                X = torch.fake_quantize_per_channel_affine(
+                    X, self.scale, self.zero_point,
+                    self.ch_axis, self.activation_post_process.quant_min, self.activation_post_process.quant_max)
+            else:
+                X = torch.fake_quantize_per_tensor_affine(
+                    X, self.scale, self.zero_point,
+                    self.activation_post_process.quant_min, self.activation_post_process.quant_max)
+        return X
+
+    @torch.jit.export
+    def extra_repr(self):
+        return 'fake_quant_enabled={}, observer_enabled={}, ' \
+               'quant_min={}, quant_max={}, dtype={}, qscheme={}, ch_axis={}, ' \
+               'scale={}, zero_point={}'.format(
+                   self.fake_quant_enabled, self.observer_enabled,
+                   self.activation_post_process.quant_min, self.activation_post_process.quant_max,
+                   self.dtype, self.qscheme, self.ch_axis, self.scale, self.zero_point)
+
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        # We cannot currently register scalar values as buffers, so need to manually
+        # specify serialization here.
+        super()._save_to_state_dict(destination, prefix, keep_vars)
+        destination[prefix + 'scale'] = self.scale
+        destination[prefix + 'zero_point'] = self.zero_point
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        # Removing this function throws an error that the size of the loaded tensor does not match the original size,
+        # i.e. these buffers start out with numel 0 and become numel 1 once they have their first forward pass.
+        local_state = ['scale', 'zero_point']
+        for name in local_state:
+            key = prefix + name
+            if key in state_dict:
+                val = state_dict[key]
+                # Custom handling to allow loading scale and zero_point
+                # of size N into uninitialized buffers of size 0. The
+                # buffers are resized here, and the values are copied in
+                # the default state_dict loading code of the parent.
+                if name == 'scale':
+                    self.scale.resize_(val.shape)
+                else:
+                    assert name == 'zero_point'
+                    self.zero_point.resize_(val.shape)
+                # For torchscript modules we need to update the attributes here since we do not
+                # call the `_load_from_state_dict` function defined in module.py
+                if torch.jit.is_scripting():
+                    if name == 'scale':
+                        self.scale.copy_(val)
+                    else:
+                        assert name == 'zero_point'
+                        self.zero_point.copy_(val)
+            elif strict:
+                missing_keys.append(key)
+        super()._load_from_state_dict(state_dict, prefix, local_metadata, strict,
+                                      missing_keys, unexpected_keys, error_msgs)
+
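+# Illustrative usage sketch (arbitrary shapes/values): observe a tensor and
+# fake-quantize it with the formula documented in the class docstring.
+#
+#     fq = FakeQuantize(observer=MovingAverageMinMaxObserver, quant_min=0, quant_max=255,
+#                       dtype=torch.quint8, qscheme=torch.per_tensor_affine)
+#     x = torch.randn(4, 8)
+#     y = fq(x)                # updates min/max statistics, then fake-quantizes x
+#     scale, zero_point = fq.calculate_qparams()
+#     fq.disable_observer()    # freeze qparams but keep fake-quantizing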
+
+class FixedQParamsFakeQuantize(FakeQuantize):
+    """Simulate quantize and dequantize in training time.
+
+    Simulate quantize and dequantize with fixed quantization
+    parameters in training time. Only per tensor quantization
+    is supported.
+    """
+
+    # TODO: rename observer to observer_ctr
+    def __init__(self, observer):
+        super().__init__(observer=observer)
+        assert type(self.activation_post_process) == FixedQParamsObserver, \
+            f"{self.__class__.__name__}'s observer must be a {FixedQParamsObserver.__name__}"
+        self._observer_ctr = observer
+        self.scale = self.activation_post_process.scale
+        self.zero_point = self.activation_post_process.zero_point
+        assert _is_per_tensor(self.qscheme), 'Only per tensor quantization is supported in' + \
+            ' FixedQParamsFakeQuantize module, got qscheme: ' + str(self.qscheme)
+
+    @torch.jit.export
+    def calculate_qparams(self):
+        return self.scale, self.zero_point
+
+    @torch.jit.export
+    def extra_repr(self):
+        """Define a string representation of the object's attributes."""
+        return 'fake_quant_enabled={}, observer_enabled={}, scale={}, zero_point={}, ' \
+               'dtype={}, quant_min={}, quant_max={}, qscheme={}'.format(
+                   self.fake_quant_enabled, self.observer_enabled,
+                   self.scale, self.zero_point, self.dtype,
+                   self.activation_post_process.quant_min, self.activation_post_process.quant_max, self.qscheme)
+
+
+class FusedMovingAvgObsFakeQuantize(FakeQuantize):
+    r"""Define a fused module to observe the tensor.
+
+    Fused module that is used to observe the input tensor (compute min/max), compute
+    scale/zero_point and fake_quantize the tensor.
+    This module uses a calculation similar to MovingAverageMinMaxObserver for the inputs
+    to compute the min/max values, which are then used to compute the scale/zero_point.
+    The qscheme input in the observer is used to differentiate between symmetric/affine
+    quantization scheme.
+
+    The output of this module is given by
+    x_out = (clamp(round(x/scale + zero_point), quant_min, quant_max)-zero_point)*scale
+
+    This module is similar to :class:`~torch.ao.quantization.FakeQuantize` and accepts the same
+    attributes as the base class.
+
+    """
+
+    def __init__(
+        self,
+        observer: Any = MovingAverageMinMaxObserver,
+        quant_min: int = 0,
+        quant_max: int = 255,
+        **observer_kwargs: Any
+    ) -> None:
+        super().__init__(observer, quant_min, quant_max, **observer_kwargs)
+        assert isinstance(self.activation_post_process, (MovingAverageMinMaxObserver, MovingAveragePerChannelMinMaxObserver)), \
+            "Fused observer+fake_quant module only works with MovingAverageMinMaxObserver"
+        self.register_buffer("fake_quant_enabled", torch.tensor([1], dtype=torch.long))
+        self.register_buffer("observer_enabled", torch.tensor([1], dtype=torch.long))
+        self.is_symmetric_quant = _is_symmetric_quant(self.activation_post_process.qscheme)
+
+    @torch.jit.export
+    def calculate_qparams(self) -> Tuple[torch.Tensor, torch.Tensor]:
+        return self.activation_post_process.calculate_qparams()
+
+    @torch.jit.export
+    def extra_repr(self) -> str:
+        return (
+            "fake_quant_enabled={}, observer_enabled={}, scale={}, zero_point={}, "
+            "dtype={}, quant_min={}, quant_max={}, qscheme={}, reduce_range={}".format(
+                self.fake_quant_enabled,
+                self.observer_enabled,
+                self.scale,
+                self.zero_point,
+                self.dtype,
+                self.activation_post_process.quant_min,
+                self.activation_post_process.quant_max,
+                self.qscheme,
+                self.activation_post_process.reduce_range,
+            )
+        )
+
+    def forward(self, X: torch.Tensor) -> torch.Tensor:
+        return torch.fused_moving_avg_obs_fake_quant(
+            X,
+            self.observer_enabled,
+            self.fake_quant_enabled,
+            self.activation_post_process.min_val,
+            self.activation_post_process.max_val,
+            self.scale,
+            self.zero_point,
+            self.activation_post_process.averaging_constant,
+            self.activation_post_process.quant_min,
+            self.activation_post_process.quant_max,
+            self.ch_axis,
+            self.is_per_channel,
+            self.is_symmetric_quant,
+        )
+
+default_fake_quant = FakeQuantize.with_args(observer=MovingAverageMinMaxObserver, quant_min=0, quant_max=255,
+                                            dtype=torch.quint8, qscheme=torch.per_tensor_affine, reduce_range=True)
+"""
+Default fake_quant for activations.
+"""
+
+default_weight_fake_quant = FakeQuantize.with_args(observer=MovingAverageMinMaxObserver, quant_min=-128, quant_max=127,
+                                                   dtype=torch.qint8, qscheme=torch.per_tensor_symmetric, reduce_range=False)
+"""
+Default fake_quant for weights.
+Observer is memoryless since averaging_constant is 1.
+"""
+
+default_dynamic_fake_quant = FakeQuantize.with_args(
+    observer=MovingAverageMinMaxObserver, quant_min=0, quant_max=255, is_dynamic=True,
+    dtype=torch.quint8, averaging_constant=1)
+"""
+Default dynamic fake_quant for activations.
+"""
+
+default_fixed_qparams_range_neg1to1_fake_quant = (
+    FixedQParamsFakeQuantize.with_args(observer=default_fixed_qparams_range_neg1to1_observer)
+)
+default_fixed_qparams_range_0to1_fake_quant = (
+    FixedQParamsFakeQuantize.with_args(observer=default_fixed_qparams_range_0to1_observer)
+)
+# TODO: the following 2 variables are kept for backwards compatibility; remove after a few releases
+default_symmetric_fixed_qparams_fake_quant = default_fixed_qparams_range_neg1to1_fake_quant
+default_affine_fixed_qparams_fake_quant = default_fixed_qparams_range_0to1_fake_quant
+
+default_per_channel_weight_fake_quant = FakeQuantize.with_args(observer=MovingAveragePerChannelMinMaxObserver,
+                                                               quant_min=-128,
+                                                               quant_max=127,
+                                                               dtype=torch.qint8,
+                                                               qscheme=torch.per_channel_symmetric,
+                                                               reduce_range=False,
+                                                               ch_axis=0)
+"""
+Default fake_quant for per-channel weights.
+Observer is memoryless since averaging_constant is 1.
+"""
+default_embedding_fake_quant = FakeQuantize.with_args(observer=MovingAveragePerChannelMinMaxObserver,
+                                                      qscheme=torch.per_channel_affine_float_qparams,
+                                                      dtype=torch.quint8,
+                                                      quant_min=0,
+                                                      quant_max=255,
+                                                      ch_axis=0,
+                                                      averaging_constant=1)
+"""
+Default fake_quant for embeddings.
+Observer is memoryless since averaging_constant is 1.
+"""
+
+default_embedding_fake_quant_4bit = FakeQuantize.with_args(observer=MovingAveragePerChannelMinMaxObserver,
+                                                           qscheme=torch.per_channel_affine_float_qparams,
+                                                           ch_axis=0,
+                                                           dtype=torch.quint4x2,
+                                                           averaging_constant=1)
+
+default_histogram_fake_quant = FakeQuantize.with_args(observer=HistogramObserver,
+                                                      quant_min=0,
+                                                      quant_max=255,
+                                                      dtype=torch.quint8,
+                                                      qscheme=torch.per_tensor_affine,
+                                                      reduce_range=True)
+"""
+Fake_quant for activations using a histogram.
+"""
+
+
+default_fused_act_fake_quant = FusedMovingAvgObsFakeQuantize.with_args(observer=MovingAverageMinMaxObserver,
+                                                                       quant_min=0,
+                                                                       quant_max=255,
+                                                                       dtype=torch.quint8,)
+
+"""
+Fused version of `default_fake_quant`, with improved performance.
+"""
+
+
+default_fused_wt_fake_quant = FusedMovingAvgObsFakeQuantize.with_args(observer=MovingAverageMinMaxObserver,
+                                                                      quant_min=-128,
+                                                                      quant_max=127,
+                                                                      dtype=torch.qint8,
+                                                                      qscheme=torch.per_tensor_symmetric)
+"""
+Fused version of `default_weight_fake_quant`, with improved performance.
+"""
+
+default_fused_per_channel_wt_fake_quant = FusedMovingAvgObsFakeQuantize.with_args(observer=MovingAveragePerChannelMinMaxObserver,
+                                                                                  quant_min=-128,
+                                                                                  quant_max=127,
+                                                                                  dtype=torch.qint8,
+                                                                                  qscheme=torch.per_channel_symmetric)
+"""
+Fused version of `default_per_channel_weight_fake_quant`, with improved performance.
+"""
+
+fused_wt_fake_quant_range_neg_127_to_127 = FusedMovingAvgObsFakeQuantize.with_args(observer=MovingAverageMinMaxObserver,
+                                                                                   quant_min=-127,
+                                                                                   quant_max=127,
+                                                                                   dtype=torch.qint8,
+                                                                                   qscheme=torch.per_tensor_symmetric,
+                                                                                   eps=2 ** -12)
+"""
+Fused version of `default_weight_fake_quant`, with the 8-bit values restricted to [-127, +127], excluding -128.
+"""
+
+fused_per_channel_wt_fake_quant_range_neg_127_to_127 = \
+    FusedMovingAvgObsFakeQuantize.with_args(observer=MovingAveragePerChannelMinMaxObserver,
+                                            quant_min=-127,
+                                            quant_max=127,
+                                            dtype=torch.qint8,
+                                            qscheme=torch.per_channel_symmetric,
+                                            eps=2 ** -12)
+
+"""
+Fused version of `default_per_channel_weight_fake_quant`, with the 8-bit values restricted to [-127, +127], excluding -128.
+"""
+
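+# Illustrative usage sketch: the `default_*` objects above are partially-applied
+# constructors produced by `with_args`, so calling them builds a fresh module.
+#
+#     act_fq = default_fused_act_fake_quant()   # FusedMovingAvgObsFakeQuantize instance
+#     wt_fq = default_fused_wt_fake_quant()     # FusedMovingAvgObsFakeQuantize instance
+#     y = act_fq(torch.randn(2, 3))             # observe + fake-quantize in a single fused op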
+
+def _is_fake_quant_script_module(mod):
+    """Return true if given mod is an instance of FakeQuantize script module."""
+    if isinstance(mod, torch.jit.RecursiveScriptModule):
+        # qualified name looks like '__torch__.torch.ao.quantization.fake_quantize.___torch_mangle_2.FakeQuantize'
+        suffix = mod._c.qualified_name.split('.', 1)[1]
+        name = re.sub(r'\.___torch_mangle_\d+', '', suffix)
+        return name == 'torch.ao.quantization.fake_quantize.FakeQuantize' or \
+            name == 'torch.ao.quantization.fake_quantize.FusedMovingAvgObsFakeQuantize'
+    return False
+
+def disable_fake_quant(mod):
+    """Disable fake quantization for the module.
+
+    Disable fake quantization for this module, if applicable. Example usage::
+
+      # model is any PyTorch model
+      model.apply(torch.ao.quantization.disable_fake_quant)
+
+    """
+    if isinstance(mod, FakeQuantizeBase) or _is_fake_quant_script_module(mod):
+        mod.disable_fake_quant()
+
+def enable_fake_quant(mod):
+    """Enable fake quantization for the module.
+
+    Enable fake quantization for this module, if applicable. Example usage::
+
+      # model is any PyTorch model
+      model.apply(torch.ao.quantization.enable_fake_quant)
+
+    """
+    if isinstance(mod, FakeQuantizeBase) or _is_fake_quant_script_module(mod):
+        mod.enable_fake_quant()
+
+def disable_observer(mod):
+    """Disable observation for this module.
+
+    Disable observation for this module, if applicable. Example usage::
+
+      # model is any PyTorch model
+      model.apply(torch.ao.quantization.disable_observer)
+
+    """
+    if isinstance(mod, FakeQuantizeBase) or _is_fake_quant_script_module(mod):
+        mod.disable_observer()
+
+def enable_observer(mod):
+    """Enable observation for this module.
+
+    Enable observation for this module, if applicable. Example usage::
+
+      # model is any PyTorch model
+      model.apply(torch.ao.quantization.enable_observer)
+
+    """
+    if isinstance(mod, FakeQuantizeBase) or _is_fake_quant_script_module(mod):
+        mod.enable_observer()
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fuse_modules.py b/MLPY/Lib/site-packages/torch/ao/quantization/fuse_modules.py
new file mode 100644
index 0000000000000000000000000000000000000000..af23dd75b1a55caa0f4ed7fe51cafa739bf30d93
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/fuse_modules.py
@@ -0,0 +1,175 @@
+import copy
+
+import torch.nn as nn
+
+from torch.ao.quantization.fuser_method_mappings import get_fuser_method
+# for backward compatibility
+from torch.ao.quantization.fuser_method_mappings import fuse_conv_bn  # noqa: F401
+from torch.ao.quantization.fuser_method_mappings import fuse_conv_bn_relu  # noqa: F401
+from torch.nn.utils.parametrize import type_before_parametrizations
+
+from typing import List, Optional
+
+__all__ = [
+    "fuse_known_modules",
+    "fuse_modules",
+    "fuse_modules_qat",
+]
+
+# Generalization of getattr
+def _get_module(model, submodule_key):
+    tokens = submodule_key.split('.')
+    cur_mod = model
+    for s in tokens:
+        cur_mod = getattr(cur_mod, s)
+    return cur_mod
+
+# Generalization of setattr
+def _set_module(model, submodule_key, module):
+    tokens = submodule_key.split('.')
+    sub_tokens = tokens[:-1]
+    cur_mod = model
+    for s in sub_tokens:
+        cur_mod = getattr(cur_mod, s)
+
+    setattr(cur_mod, tokens[-1], module)
+
+def fuse_known_modules(mod_list, is_qat, additional_fuser_method_mapping=None):
+    r"""Return a list of known fuse modules.
+
+    Returns a list of modules that fuses the operations specified
+     in the input module list.
+
+    Fuses only the following sequence of modules:
+    conv, bn
+    conv, bn, relu
+    conv, relu
+    linear, bn
+    linear, relu
+    For these sequences, the first element in the output module list performs
+    the fused operation. The rest of the elements are set to nn.Identity()
+    """
+    types = tuple(type_before_parametrizations(m) for m in mod_list)
+    fuser_method = get_fuser_method(types, additional_fuser_method_mapping)
+    if fuser_method is None:
+        raise NotImplementedError(f"Cannot fuse modules: {types}")
+    new_mod : List[Optional[nn.Module]] = [None] * len(mod_list)
+    fused = fuser_method(is_qat, *mod_list)
+    # NOTE: forward hooks not processed in the two following for loops will be lost after the fusion
+    # Move pre forward hooks of the base module to resulting fused module
+    for pre_hook_fn in mod_list[0]._forward_pre_hooks.values():
+        fused.register_forward_pre_hook(pre_hook_fn)
+    mod_list[0]._forward_pre_hooks.clear()
+    # Move post forward hooks of the last module to resulting fused module
+    for hook_fn in mod_list[-1]._forward_hooks.values():
+        fused.register_forward_hook(hook_fn)
+    mod_list[-1]._forward_hooks.clear()
+    new_mod[0] = fused
+
+    for i in range(1, len(mod_list)):
+        identity = nn.Identity()
+        identity.training = mod_list[0].training
+        new_mod[i] = identity
+
+    return new_mod
+
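+# Illustrative usage sketch: fusing an eval-mode Conv2d + BatchNorm2d pair
+# directly with `fuse_known_modules`.
+#
+#     conv = nn.Conv2d(3, 3, 3).eval()
+#     bn = nn.BatchNorm2d(3).eval()
+#     fused_conv, identity = fuse_known_modules([conv, bn], is_qat=False)
+#     # fused_conv has the BN statistics folded into its weights;
+#     # identity is nn.Identity() and replaces the BN in the parent module.
+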
+def _fuse_modules_helper(model, modules_to_fuse, is_qat, fuser_func=fuse_known_modules, fuse_custom_config_dict=None):
+    if fuse_custom_config_dict is None:
+        fuse_custom_config_dict = {}
+    additional_fuser_method_mapping = fuse_custom_config_dict.get("additional_fuser_method_mapping", {})
+    mod_list = []
+    for item in modules_to_fuse:
+        mod_list.append(_get_module(model, item))
+
+    # Fuse list of modules
+    new_mod_list = fuser_func(mod_list, is_qat, additional_fuser_method_mapping)
+
+    # Replace original module list with fused module list
+    for i, item in enumerate(modules_to_fuse):
+        _set_module(model, item, new_mod_list[i])
+
+def _fuse_modules(model, modules_to_fuse, is_qat, inplace=False, fuser_func=fuse_known_modules, fuse_custom_config_dict=None):
+    if not inplace:
+        model = copy.deepcopy(model)
+
+    if all(isinstance(module_element, str) for module_element in modules_to_fuse):
+        # Handle case of modules_to_fuse being a list
+        _fuse_modules_helper(model, modules_to_fuse, is_qat, fuser_func, fuse_custom_config_dict)
+    else:
+        # Handle case of modules_to_fuse being a list of lists
+        for module_list in modules_to_fuse:
+            _fuse_modules_helper(model, module_list, is_qat, fuser_func, fuse_custom_config_dict)
+    return model
+
+def fuse_modules(model, modules_to_fuse, inplace=False, fuser_func=fuse_known_modules, fuse_custom_config_dict=None):
+    r"""Fuse a list of modules into a single module.
+
+    Fuses only the following sequence of modules:
+    conv, bn
+    conv, bn, relu
+    conv, relu
+    linear, relu
+    bn, relu
+    All other sequences are left unchanged.
+    For these sequences, replaces the first item in the list
+    with the fused module, replacing the rest of the modules
+    with identity.
+
+    Args:
+        model: Model containing the modules to be fused
+        modules_to_fuse: list of list of module names to fuse. Can also be a list
+                         of strings if there is only a single list of modules to fuse.
+        inplace: bool specifying if fusion happens in place on the model, by default
+                 a new model is returned
+        fuser_func: Function that takes in a list of modules and outputs a list of fused modules
+                    of the same length. For example,
+                    fuser_func([convModule, BNModule]) returns the list [ConvBNModule, nn.Identity()]
+                    Defaults to torch.ao.quantization.fuse_known_modules
+        `fuse_custom_config_dict`: custom configuration for fusion
+
+    .. code-block:: python
+
+       # Example of fuse_custom_config_dict
+       fuse_custom_config_dict = {
+           # Additional fuser_method mapping
+           "additional_fuser_method_mapping": {
+               (torch.nn.Conv2d, torch.nn.BatchNorm2d): fuse_conv_bn
+           },
+       }
+
+    Returns:
+        model with fused modules. A new copy is created if inplace=False; otherwise,
+        the model is modified in place.
+
+    Examples::
+
+            >>> # xdoctest: +SKIP
+            >>> m = M().eval()
+            >>> # m is a module containing the sub-modules below
+            >>> modules_to_fuse = [ ['conv1', 'bn1', 'relu1'], ['submodule.conv', 'submodule.relu']]
+            >>> fused_m = torch.ao.quantization.fuse_modules(m, modules_to_fuse)
+            >>> output = fused_m(input)
+
+            >>> m = M().eval()
+            >>> # Alternately provide a single list of modules to fuse
+            >>> modules_to_fuse = ['conv1', 'bn1', 'relu1']
+            >>> fused_m = torch.ao.quantization.fuse_modules(m, modules_to_fuse)
+            >>> output = fused_m(input)
+
+    """
+    return _fuse_modules(
+        model,
+        modules_to_fuse,
+        is_qat=False,
+        inplace=inplace,
+        fuser_func=fuser_func,
+        fuse_custom_config_dict=fuse_custom_config_dict)
+
+def fuse_modules_qat(model, modules_to_fuse, inplace=False, fuser_func=fuse_known_modules, fuse_custom_config_dict=None):
+    """QAT version for `fuse_modules`."""
+    return _fuse_modules(
+        model,
+        modules_to_fuse,
+        is_qat=True,
+        inplace=inplace,
+        fuser_func=fuser_func,
+        fuse_custom_config_dict=fuse_custom_config_dict)
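+
+# Illustrative end-to-end sketch: `ToyModel` is a made-up example module.
+#
+#     class ToyModel(nn.Module):
+#         def __init__(self):
+#             super().__init__()
+#             self.conv = nn.Conv2d(3, 8, 3)
+#             self.bn = nn.BatchNorm2d(8)
+#             self.relu = nn.ReLU()
+#
+#         def forward(self, x):
+#             return self.relu(self.bn(self.conv(x)))
+#
+#     m = ToyModel().eval()
+#     fused = fuse_modules(m, ['conv', 'bn', 'relu'])
+#     # fused.conv is a ConvReLU2d wrapping the BN-folded conv;
+#     # fused.bn and fused.relu are replaced with nn.Identity().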
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fuser_method_mappings.py b/MLPY/Lib/site-packages/torch/ao/quantization/fuser_method_mappings.py
new file mode 100644
index 0000000000000000000000000000000000000000..d23f4247b3c30ba96bcaa9c0eec3c9f4e7a9d51c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/fuser_method_mappings.py
@@ -0,0 +1,259 @@
+import torch.nn as nn
+import torch.ao.nn.intrinsic as nni
+
+from typing import Any, Union, Callable, List, Tuple, Dict, Optional, Type
+from torch.ao.quantization.utils import Pattern, get_combined_dict, MatchAllNode
+import itertools
+
+__all__ = [
+    "fuse_conv_bn",
+    "fuse_conv_bn_relu",
+    "fuse_linear_bn",
+    "fuse_convtranspose_bn",
+    "get_fuser_method",
+    "get_fuser_method_new",
+]
+
+def fuse_conv_bn(is_qat, conv, bn):
+    r"""Return the fused the conv and bn modules.
+    Given the conv and bn modules, fuses them and returns the fused module
+
+    Args:
+        is_qat: a flag for whether we are using quantization aware training fusion
+        or post training quantization fusion
+        conv: Module instance of type conv2d/conv3d
+        bn: Spatial BN instance that needs to be fused with the conv
+
+    Examples::
+
+        >>> m1 = nn.Conv2d(10, 20, 3)
+        >>> b1 = nn.BatchNorm2d(20)
+        >>> # xdoctest: +SKIP
+        >>> m2 = fuse_conv_bn(m1, b1)
+    """
+    assert conv.training == bn.training, \
+        "Conv and BN both must be in the same mode (train or eval)."
+
+    fused_module_class_map = {
+        nn.Conv1d: nni.ConvBn1d,
+        nn.Conv2d: nni.ConvBn2d,
+        nn.Conv3d: nni.ConvBn3d,
+    }
+
+    if is_qat:
+        assert bn.num_features == conv.out_channels, 'Output channel of Conv2d must match num_features of BatchNorm2d'
+        assert bn.affine, 'Only support fusing BatchNorm2d with affine set to True'
+        assert bn.track_running_stats, 'Only support fusing BatchNorm2d with track_running_stats set to True'
+        fused_module_class = fused_module_class_map.get((type(conv)), None)
+        if fused_module_class is not None:
+            return fused_module_class(conv, bn)
+        else:
+            raise NotImplementedError(f"Cannot fuse train modules: {(conv, bn)}")
+    else:
+        return nn.utils.fuse_conv_bn_eval(conv, bn)
+
+def fuse_conv_bn_relu(is_qat, conv, bn, relu):
+    r"""Return the fused conv and bv modules.
+
+    Given the conv and bn modules, fuses them and returns the fused module
+
+    Args:
+        is_qat: a flag for whether we are using quantization aware training fusion
+        or post training quantization fusion
+        conv: Module instance of type conv2d/conv3d
+        bn: Spatial BN instance that needs to be fused with the conv
+        relu: ReLU instance that needs to be fused with the conv and bn
+
+    Examples::
+
+        >>> m1 = nn.Conv2d(10, 20, 3)
+        >>> b1 = nn.BatchNorm2d(20)
+        >>> r1 = nn.ReLU(inplace=False)
+        >>> # xdoctest: +SKIP
+        >>> m2 = fuse_conv_bn_relu(m1, b1, r1)
+    """
+    assert conv.training == bn.training == relu.training, \
+        "Conv and BN both must be in the same mode (train or eval)."
+    fused_module : Optional[Type[nn.Sequential]] = None
+    if is_qat:
+        map_to_fused_module_train = {
+            nn.Conv1d: nni.ConvBnReLU1d,
+            nn.Conv2d: nni.ConvBnReLU2d,
+            nn.Conv3d: nni.ConvBnReLU3d,
+        }
+        assert bn.num_features == conv.out_channels, 'Output channel of Conv must match num_features of BatchNorm'
+        assert bn.affine, 'Only support fusing BatchNorm with affine set to True'
+        assert bn.track_running_stats, 'Only support fusing BatchNorm with track_running_stats set to True'
+        fused_module = map_to_fused_module_train.get(type(conv), None)
+        if fused_module is not None:
+            return fused_module(conv, bn, relu)
+        else:
+            raise NotImplementedError(f"Cannot fuse train modules: {(conv, bn, relu)}")
+    else:
+        map_to_fused_module_eval = {
+            nn.Conv1d: nni.ConvReLU1d,
+            nn.Conv2d: nni.ConvReLU2d,
+            nn.Conv3d: nni.ConvReLU3d,
+        }
+        fused_module = map_to_fused_module_eval.get(type(conv), None)
+        if fused_module is not None:
+            fused_conv = nn.utils.fusion.fuse_conv_bn_eval(conv, bn)
+            return fused_module(fused_conv, relu)
+        else:
+            raise NotImplementedError(f"Cannot fuse eval modules: {(conv, bn, relu)}")
+
+def fuse_linear_bn(is_qat, linear, bn):
+    r"""Return the fused linear and bn modules.
+    Given the linear and bn modules, fuses them and returns the fused module
+
+    Args:
+        is_qat: a flag for whether we are using quantization aware training fusion
+        or post training quantization fusion
+        linear: Module instance of type Linear
+        bn: BatchNorm1d instance that needs to be fused with the linear layer
+
+    Examples::
+
+        >>> m1 = nn.Linear(20, 10)
+        >>> b1 = nn.BatchNorm1d(10)
+        >>> # xdoctest: +SKIP
+        >>> m2 = fuse_linear_bn(m1, b1)
+    """
+    assert linear.training == bn.training, \
+        "Linear and BN both must be in the same mode (train or eval)."
+
+    if is_qat:
+        assert bn.num_features == linear.out_features, \
+            "Output features of Linear must match num_features of BatchNorm1d"
+        assert bn.affine, "Only support fusing BatchNorm1d with affine set to True"
+        assert bn.track_running_stats, \
+            "Only support fusing BatchNorm1d with tracking_running_stats set to True"
+        return nni.LinearBn1d(linear, bn)
+    else:
+        return nn.utils.fusion.fuse_linear_bn_eval(linear, bn)
+
+def fuse_convtranspose_bn(is_qat, convt, bn):
+    r"""Return the fused ConvTranspose and bn modules.
+    Given ConvTranspose and bn modules, fuses them and returns the fused module
+
+    Args:
+        convt: Module instance of type ConvTransposeNd
+        bn: BatchNormNd instance that needs to be fused with the ConvTranspose layer.
+            The batch norm N should match the ConvTranspose N
+
+    Examples::
+
+        >>> m1 = nn.ConvTranspose2d(10, 20, 3)
+        >>> b1 = nn.BatchNorm2d(20)
+        >>> # xdoctest: +SKIP
+        >>> m2 = fuse_convtranspose_bn(False, m1.eval(), b1.eval())
+    """
+    assert convt.training == bn.training, \
+        "ConvTranspose and BN both must be in the same mode (train or eval)."
+
+    if is_qat:
+        raise Exception("Fusing ConvTranspose+BatchNorm not yet supported in QAT.")
+    else:
+        return nn.utils.fusion.fuse_conv_bn_eval(convt, bn, transpose=True)
+
+def _sequential_wrapper2(sequential):
+    """Return a fuser method that wraps two modules in the given sequential class.
+
+    Given a sequential class for two modules, return a function that takes
+    is_qat and then two modules as arguments, ignores the is_qat flag,
+    and always returns the sequential that combines the two input modules.
+    """
+    def fuser_method(is_qat, m1, m2):
+        return sequential(m1, m2)
+    return fuser_method
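+
+# Illustrative sketch (not part of the original source): the wrapper above is what
+# backs the (module, ReLU) entries in the default mapping below, e.g.:
+#
+#     >>> # xdoctest: +SKIP
+#     >>> fuse_fn = _sequential_wrapper2(nni.LinearReLU)
+#     >>> fused = fuse_fn(False, nn.Linear(4, 8), nn.ReLU())  # is_qat is ignored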
+
+_DEFAULT_OP_LIST_TO_FUSER_METHOD: Dict[Tuple, Union[nn.Sequential, Callable]] = {
+    (nn.Conv1d, nn.BatchNorm1d): fuse_conv_bn,
+    (nn.Conv1d, nn.BatchNorm1d, nn.ReLU): fuse_conv_bn_relu,
+    (nn.Conv2d, nn.BatchNorm2d): fuse_conv_bn,
+    (nn.Conv2d, nn.BatchNorm2d, nn.ReLU): fuse_conv_bn_relu,
+    (nn.Conv3d, nn.BatchNorm3d): fuse_conv_bn,
+    (nn.Conv3d, nn.BatchNorm3d, nn.ReLU): fuse_conv_bn_relu,
+    (nn.Conv1d, nn.ReLU): _sequential_wrapper2(nni.ConvReLU1d),
+    (nn.Conv2d, nn.ReLU): _sequential_wrapper2(nni.ConvReLU2d),
+    (nn.Conv3d, nn.ReLU): _sequential_wrapper2(nni.ConvReLU3d),
+    (nn.Linear, nn.BatchNorm1d): fuse_linear_bn,
+    (nn.Linear, nn.ReLU): _sequential_wrapper2(nni.LinearReLU),
+    (nn.BatchNorm2d, nn.ReLU): _sequential_wrapper2(nni.BNReLU2d),
+    (nn.BatchNorm3d, nn.ReLU): _sequential_wrapper2(nni.BNReLU3d),
+    (nn.ConvTranspose1d, nn.BatchNorm1d): fuse_convtranspose_bn,
+    (nn.ConvTranspose2d, nn.BatchNorm2d): fuse_convtranspose_bn,
+    (nn.ConvTranspose3d, nn.BatchNorm3d): fuse_convtranspose_bn,
+}
+
+def get_fuser_method(op_list, additional_fuser_method_mapping=None):
+    """Get fuser method for the given list of module types.
+
+    Get fuser method for the given list of module types;
+    raise an AssertionError if no fuser method is found.
+    """
+    if additional_fuser_method_mapping is None:
+        additional_fuser_method_mapping = {}
+    all_mappings = get_combined_dict(_DEFAULT_OP_LIST_TO_FUSER_METHOD,
+                                     additional_fuser_method_mapping)
+    fuser_method = all_mappings.get(op_list, None)
+    assert fuser_method is not None, f"did not find fuser method for: {op_list} "
+    return fuser_method
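+
+# Illustrative sketch (not part of the original source): looking up and applying a
+# fuser method from the default mapping above.
+#
+#     >>> # xdoctest: +SKIP
+#     >>> conv, bn, relu = nn.Conv2d(3, 16, 3).eval(), nn.BatchNorm2d(16).eval(), nn.ReLU().eval()
+#     >>> fuse_fn = get_fuser_method((nn.Conv2d, nn.BatchNorm2d, nn.ReLU))
+#     >>> fused = fuse_fn(False, conv, bn, relu)  # post-training fusion -> nni.ConvReLU2d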
+
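+# The two helpers below adapt a fuser method to patterns whose arguments arrive in
+# reversed (and, for _reverse3, nested) order: _reverse2 swaps the two module
+# arguments, while _reverse3 additionally unpacks a nested pair, so that e.g. a fuser
+# expecting (is_qat, conv, bn, relu) can be driven from arguments given as (relu, (bn, conv)).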
+def _reverse2(f):
+    def reversed(is_qat, x, y):
+        return f(is_qat, y, x)
+    return reversed
+
+def _reverse3(f):
+    def reversed(is_qat, x, w):
+        y, z = w
+        return f(is_qat, z, y, x)
+    return reversed
+
+def _get_valid_patterns(op_pattern):
+    """Return a list of valid patterns generated from the op_pattern.
+
+    Returns a list of valid patterns generated from the op_pattern,
+    since MatchAllNode can match all types of nodes,
+    e.g. pattern (torch.nn.Conv2d, torch.add) should also be able to match keys like
+    (MatchAllNode, torch.add) and (torch.nn.Conv2d, MatchAllNode)
+
+    Example Input:
+    (torch.add, (torch.nn.ReLU, torch.nn.Conv2d))
+
+    Example Output:
+    [(torch.add, (torch.nn.ReLU, torch.nn.Conv2d)),
+     (torch.add, (torch.nn.ReLU, MatchAllNode)),
+     (torch.add, (MatchAllNode, torch.nn.Conv2d)),
+     (torch.add, (MatchAllNode, MatchAllNode)),
+     (MatchAllNode, (torch.nn.ReLU, torch.nn.Conv2d)),
+     (MatchAllNode, (torch.nn.ReLU, MatchAllNode)),
+     (MatchAllNode, (MatchAllNode, torch.nn.Conv2d)),
+     (MatchAllNode, (MatchAllNode, MatchAllNode)),
+    ]
+    """
+    result: List[Any]
+    if isinstance(op_pattern, (tuple, list)):
+        sub_combs = []
+        for sub_pattern in op_pattern:
+            sub_combs.append(_get_valid_patterns(sub_pattern))
+        result = list(itertools.product(*sub_combs))
+    else:
+        result = [op_pattern, MatchAllNode]
+    return result
+
+def get_fuser_method_new(
+        op_pattern: Pattern,
+        fuser_method_mapping: Dict[Pattern, Union[nn.Sequential, Callable]]):
+    """Get fuser method.
+
+    This will be made the default after we deprecate get_fuser_method.
+    We would like to implement this first and have a separate PR for the deprecation.
+    """
+    op_patterns = _get_valid_patterns(op_pattern)
+    fuser_method = None
+    for valid_pattern in op_patterns:
+        fuser_method = fuser_method_mapping.get(valid_pattern, None)
+        if fuser_method is not None:
+            break
+    assert fuser_method is not None, f"did not find fuser method for: {op_pattern}"
+    return fuser_method
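+
+# Illustrative sketch (not part of the original source): the MatchAllNode expansion
+# performed by _get_valid_patterns lets a wildcard mapping entry serve a concrete pattern.
+#
+#     >>> # xdoctest: +SKIP
+#     >>> mapping = {(nn.ReLU, MatchAllNode): _sequential_wrapper2(nni.ConvReLU2d)}
+#     >>> fuse_fn = get_fuser_method_new((nn.ReLU, nn.Conv2d), mapping)
+#     >>> # (nn.ReLU, nn.Conv2d) is not a key itself, but (nn.ReLU, MatchAllNode) matches it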
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/__init__.py b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..030caec3731699dc8972417d6401b6b0b8eb0f2a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__init__.py
@@ -0,0 +1,3 @@
+from .prepare import prepare
+from .convert import convert
+from .fuse import fuse
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ca3ce9346ecb53ed58362d62ab5e4eda14bfd99c
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/_decomposed.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/_decomposed.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..035d37a69ab28ab0f0758805ab5825829ea5dfc4
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/_decomposed.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/_equalize.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/_equalize.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..45f73b2559d66ef2a13b7f5c686a57cb3142b05c
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/_equalize.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/_lower_to_native_backend.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/_lower_to_native_backend.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5f81805e749c9022d190fd081be3018acfe2489f
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/_lower_to_native_backend.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/convert.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/convert.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..be5b23071b4faedc22afb0325fc4bc6045bb1631
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/convert.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/custom_config.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/custom_config.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7ebf54aa9efcaebc3b9dddec40511fe43e5000b7
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/custom_config.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/fuse.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/fuse.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1199effa912f593ecb4307b68dedb7c6c8602e91
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/fuse.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/fuse_handler.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/fuse_handler.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cc75ab0940467987fc903517e06f381d84cfbbfa
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/fuse_handler.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/graph_module.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/graph_module.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d477f3451edef53526de5c87e56672e9faac3f1a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/graph_module.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/lower_to_fbgemm.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/lower_to_fbgemm.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..12732d8ffe74cbc7136cdd1428b15146a3abab70
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/lower_to_fbgemm.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/lower_to_qnnpack.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/lower_to_qnnpack.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..618563a7eb9204faa873e54451c8bdbe5cc4fa1e
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/lower_to_qnnpack.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/lstm_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/lstm_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..300ccd9ce964ef3b128efaa5d6468a693e1dff7e
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/lstm_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/match_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/match_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a515d531f11e343ed64145ba10f31d78805c3390
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/match_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/pattern_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/pattern_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..13133cf1bb150e7297fc4c5364436ffdcad902d9
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/pattern_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/prepare.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/prepare.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3bd11c669927c92cc176a7089e1483321443b8f6
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/prepare.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/qconfig_mapping_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/qconfig_mapping_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5adc177a714cc32f3fe4de3a900717f51a65e107
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/qconfig_mapping_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/quantize_handler.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/quantize_handler.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b2656b4d1843e70e78070b110fdd62655598420c
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/quantize_handler.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/tracer.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/tracer.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b7747022f30d273a096031c4161a9e264435b9ea
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/tracer.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2124e41fc84fdc48e7669459e5e5994ea626da36
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/fx/__pycache__/utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/_decomposed.py b/MLPY/Lib/site-packages/torch/ao/quantization/fx/_decomposed.py
new file mode 100644
index 0000000000000000000000000000000000000000..6159cad7c94b157393947e018ffe24d8075b1d20
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/fx/_decomposed.py
@@ -0,0 +1,925 @@
+import math
+from typing import Optional, Tuple
+
+import torch
+from torch.library import Library, impl
+from torch.ao.quantization.utils import determine_qparams, validate_qmin_qmax
+from torch._refs import _unsqueeze_multiple
+
+
+# Note: decomposed means decomposed quantized tensor, using decomposed so that the
+# name is not too long
+quantized_decomposed_lib = Library("quantized_decomposed", "DEF")
+
+_DTYPE_TO_QVALUE_BOUNDS = {
+    torch.uint8: (0, 255),
+    torch.int8: (-128, 127),
+    torch.int16: (-(2**15), 2**15 - 1),
+    torch.int32: (-(2**31), 2**31 - 1)
+}
+
+# Helper to check the passed in quant min and max are valid for the dtype
+def _quant_min_max_bounds_check(quant_min, quant_max, dtype):
+    if dtype not in _DTYPE_TO_QVALUE_BOUNDS:
+        raise ValueError(f"Unsupported dtype: {dtype}")
+    quant_min_lower_bound, quant_max_upper_bound = _DTYPE_TO_QVALUE_BOUNDS[dtype]
+
+    assert quant_min >= quant_min_lower_bound, \
+        "quant_min out of bound for dtype, " \
+        f"quant_min_lower_bound: {quant_min_lower_bound} quant_min: {quant_min}"
+
+    assert quant_max <= quant_max_upper_bound, \
+        "quant_max out of bound for dtype, " \
+        f"quant_max_upper_bound: {quant_max_upper_bound} quant_max: {quant_max}"
+
+quantized_decomposed_lib.define(
+    "quantize_per_tensor(Tensor input, float scale, int zero_point, "
+    "int quant_min, int quant_max, ScalarType dtype) -> Tensor")
+
+@impl(quantized_decomposed_lib, "quantize_per_tensor", "CompositeExplicitAutograd")
+def quantize_per_tensor(
+        input: torch.Tensor,
+        scale: float,
+        zero_point: int,
+        quant_min: int,
+        quant_max: int,
+        dtype: torch.dtype
+) -> torch.Tensor:
+    """ Affine quantization for the Tensor using the same quantization parameters to map
+    from floating point to quantized values
+
+    Args:
+       input (torch.Tensor): original float32 or bfloat16 Tensor
+       scale (float): quantization parameter for affine quantization
+       zero_point (int): quantization parameter for affine quantization
+       quant_min (int): minimum quantized value for output Tensor
+       quant_max (int): maximum quantized value for output Tensor
+       dtype (torch.dtype): requested dtype (e.g. torch.uint8) for output Tensor
+
+    Returns:
+       Tensor with requested dtype (e.g. torch.uint8), note the quantization parameters
+       are not stored in the Tensor, we are storing them in function arguments instead
+    """
+    if input.dtype == torch.bfloat16:
+        input = input.to(torch.float32)
+
+    assert input.dtype == torch.float32, f"Expecting input to have dtype torch.float32, but got dtype: {input.dtype}"
+    _quant_min_max_bounds_check(quant_min, quant_max, dtype)
+
+    inv_scale = 1.0 / scale
+    return torch.clamp(torch.round(input * inv_scale) + zero_point, quant_min, quant_max).to(dtype)
+
+quantized_decomposed_lib.define(
+    "quantize_per_tensor.tensor(Tensor input, Tensor scale, Tensor zero_point, "
+    "int quant_min, int quant_max, ScalarType dtype) -> Tensor")
+
+@impl(quantized_decomposed_lib, "quantize_per_tensor.tensor", "CompositeExplicitAutograd")
+def quantize_per_tensor_tensor(
+        input: torch.Tensor,
+        scale: torch.Tensor,
+        zero_point: torch.Tensor,
+        quant_min: int,
+        quant_max: int,
+        dtype: torch.dtype
+) -> torch.Tensor:
+    """ Affine quantization for the Tensor using the same quantization parameters to map
+    from floating point to quantized values
+    Same as `quantize_per_tensor` but scale and zero_point are Scalar Tensor instead of
+    scalar values
+    """
+    assert zero_point.numel() == 1, f"Expecting zero_point tensor to be one element, but received : {zero_point.numel()}"
+    assert scale.numel() == 1, f"Expecting scale tensor to be one element, but received : {scale.numel()}"
+    return quantize_per_tensor(input, scale.item(), zero_point.item(), quant_min, quant_max, dtype)
+
+@impl(quantized_decomposed_lib, "quantize_per_tensor.tensor", "Meta")
+def quantize_per_tensor_tensor_meta(input, scale, zero_point, quant_min, quant_max, dtype):
+    if input.dtype == torch.bfloat16:
+        input = input.to(torch.float32)
+    assert zero_point.numel() == 1, f"Expecting zero_point tensor to be one element, but received : {zero_point.numel()}"
+    assert scale.numel() == 1, f"Expecting scale tensor to be one element, but received : {scale.numel()}"
+    assert input.dtype == torch.float32, f"Expecting input to have dtype torch.float32, but got dtype: {input.dtype}"
+    return torch.empty_like(input, dtype=dtype)
+
+# TODO: remove other variants and keep this one
+quantized_decomposed_lib.define(
+    "quantize_per_tensor.tensor2(Tensor input, Tensor scale, Tensor zero_point, "
+    "Tensor quant_min, Tensor quant_max, ScalarType dtype) -> Tensor")
+
+@impl(quantized_decomposed_lib, "quantize_per_tensor.tensor2", "CompositeExplicitAutograd")
+def quantize_per_tensor_tensor2(
+        input: torch.Tensor,
+        scale: torch.Tensor,
+        zero_point: torch.Tensor,
+        quant_min: torch.Tensor,
+        quant_max: torch.Tensor,
+        dtype: torch.dtype
+) -> torch.Tensor:
+    """ Affine quantization for the Tensor using the same quantization parameters to map
+    from floating point to quantized values
+    Same as `quantize_per_tensor` but scale and zero_point are Scalar Tensor instead of
+    scalar values
+    """
+    assert zero_point.numel() == 1, f"Expecting zero_point tensor to be one element, but received : {zero_point.numel()}"
+    assert scale.numel() == 1, f"Expecting scale tensor to be one element, but received : {scale.numel()}"
+    return quantize_per_tensor(input, scale.item(), zero_point.item(), quant_min.item(), quant_max.item(), dtype)
+
+@impl(quantized_decomposed_lib, "quantize_per_tensor.tensor2", "Meta")
+def quantize_per_tensor_tensor2_meta(input, scale, zero_point, quant_min, quant_max, dtype):
+    return quantize_per_tensor_tensor_meta(input, scale, zero_point, quant_min, quant_max, dtype)
+
+# Note: quant_min/quant_max/dtype are not used in the operator, but for now it's kept in
+# the signature as metadata for the input Tensor, this might be useful for pattern
+# matching in the future
+# We will revisit this later if we find there are no use cases for it
+quantized_decomposed_lib.define(
+    "dequantize_per_tensor(Tensor input, float scale, int zero_point, "
+    "int quant_min, int quant_max, ScalarType dtype) -> Tensor")
+
+@impl(quantized_decomposed_lib, "dequantize_per_tensor", "CompositeExplicitAutograd")
+def dequantize_per_tensor(
+        input: torch.Tensor,
+        scale: float,
+        zero_point: int,
+        quant_min: int,
+        quant_max: int,
+        dtype: torch.dtype
+) -> torch.Tensor:
+    """ Affine dequantization for the Tensor using the same quantization parameters to map
+    from quantized values to floating point values
+
+    Args:
+       input (torch.Tensor): Tensor with dtype matching `dtype` argument,
+       e.g. (`torch.uint8`), it is a per tensor quantized Tensor if combined with
+       quantization parameters in the argument of this function (scale/zero_point)
+
+       scale (float): quantization parameter for affine quantization
+
+       zero_point (int): quantization parameter for affine quantization
+
+       quant_min (int): minimum quantized value for input Tensor (not used in computation,
+       reserved for pattern matching)
+
+       quant_max (int): maximum quantized value for input Tensor (not used in computation,
+       reserved for pattern matching)
+
+       dtype (torch.dtype): dtype for input Tensor (not used in computation,
+       reserved for pattern matching)
+
+    Returns:
+       dequantized float32 Tensor
+    """
+    assert input.dtype == dtype, f"Expecting input to have dtype: {dtype}, but got {input.dtype}"
+    if dtype in _DTYPE_TO_QVALUE_BOUNDS:
+        # TODO: investigate why
+        # (input - zero_point).to(torch.float32) * scale
+        # failed the test
+        return (input.to(torch.float32) - zero_point) * scale
+    else:
+        raise ValueError(f"Unsupported dtype in dequantize_per_tensor: {dtype}")
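+
+# Illustrative sketch (not part of the original source): a per-tensor quantize/dequantize
+# round trip using the two ops defined above.  Once this module is imported the same ops
+# are also callable as torch.ops.quantized_decomposed.quantize_per_tensor / dequantize_per_tensor.
+#
+#     >>> # xdoctest: +SKIP
+#     >>> x = torch.randn(2, 3)
+#     >>> xq = quantize_per_tensor(x, 0.1, 0, -128, 127, torch.int8)
+#     >>> xdq = dequantize_per_tensor(xq, 0.1, 0, -128, 127, torch.int8)
+#     >>> # xdq approximates x to within about scale / 2 = 0.05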
+
+
+quantized_decomposed_lib.define(
+    "dequantize_per_tensor.tensor(Tensor input, Tensor scale, Tensor zero_point, "
+    "int quant_min, int quant_max, ScalarType dtype) -> Tensor")
+
+@impl(quantized_decomposed_lib, "dequantize_per_tensor.tensor", "CompositeExplicitAutograd")
+def dequantize_per_tensor_tensor(
+        input: torch.Tensor,
+        scale: torch.Tensor,
+        zero_point: torch.Tensor,
+        quant_min: int,
+        quant_max: int,
+        dtype: torch.dtype
+) -> torch.Tensor:
+    """ Affine dequantization for the Tensor using the same quantization parameters to map
+    from quantized values to floating point values
+    Same as `dequantize_per_tensor` but scale and zero_point are Scalar Tensor instead of
+    scalar values
+    """
+    assert zero_point.numel() == 1, f"Expecting zero_point tensor to be one element, but received : {zero_point.numel()}"
+    assert scale.numel() == 1, f"Expecting scale tensor to be one element, but received : {scale.numel()}"
+    return dequantize_per_tensor(input, scale.item(), zero_point.item(), quant_min, quant_max, dtype)
+
+@impl(quantized_decomposed_lib, "dequantize_per_tensor.tensor", "Meta")
+def dequantize_per_tensor_tensor_meta(input, scale, zero_point, quant_min, quant_max, dtype):
+    assert zero_point.numel() == 1, f"Expecting zero_point tensor to be one element, but received : {zero_point.numel()}"
+    assert scale.numel() == 1, f"Expecting scale tensor to be one element, but received : {scale.numel()}"
+    assert input.dtype == dtype, f"Expecting input to have dtype: {dtype}"
+    if dtype in _DTYPE_TO_QVALUE_BOUNDS:
+        return torch.empty_like(input, dtype=torch.float32)
+    else:
+        raise ValueError(f"Unsupported dtype in dequantize_per_tensor: {dtype}")
+
+# TODO: remove other variants and keep this one
+quantized_decomposed_lib.define(
+    "dequantize_per_tensor.tensor2(Tensor input, Tensor scale, Tensor zero_point, "
+    "Tensor quant_min, Tensor quant_max, ScalarType dtype) -> Tensor")
+
+@impl(quantized_decomposed_lib, "dequantize_per_tensor.tensor2", "CompositeExplicitAutograd")
+def dequantize_per_tensor_tensor2(
+        input: torch.Tensor,
+        scale: torch.Tensor,
+        zero_point: torch.Tensor,
+        quant_min: torch.Tensor,
+        quant_max: torch.Tensor,
+        dtype: torch.dtype
+) -> torch.Tensor:
+    """ Affine dequantization for the Tensor using the same quantization parameters to map
+    from quantized values to floating point values
+    Same as `dequantize_per_tensor` but scale and zero_point are Scalar Tensor instead of
+    scalar values
+    """
+    assert zero_point.numel() == 1, f"Expecting zero_point tensor to be one element, but received : {zero_point.numel()}"
+    assert scale.numel() == 1, f"Expecting scale tensor to be one element, but received : {scale.numel()}"
+    return dequantize_per_tensor(input, scale.item(), zero_point.item(), quant_min.item(), quant_max.item(), dtype)
+
+@impl(quantized_decomposed_lib, "dequantize_per_tensor.tensor2", "Meta")
+def dequantize_per_tensor_tensor2_meta(input, scale, zero_point, quant_min, quant_max, dtype):
+    return dequantize_per_tensor_tensor_meta(input, scale, zero_point, quant_min, quant_max, dtype)
+
+quantized_decomposed_lib.define(
+    "choose_qparams.tensor(Tensor input, int quant_min, int quant_max, "
+    "float eps, ScalarType dtype) -> (Tensor, Tensor)")
+
+@impl(quantized_decomposed_lib, "choose_qparams.tensor", "CompositeExplicitAutograd")
+def choose_qparams_tensor(
+        input: torch.Tensor,
+        qmin: int,
+        qmax: int,
+        eps: float,
+        dtype: torch.dtype
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """ Given an input Tensor, derive the per tensor affine quantization parameter
+    (scale and zero_point) for target quantized Tensor from the Tensor
+
+    Args:
+       input (torch.Tensor): floating point input Tensor
+       quant_min (int): minimum quantized value for target quantized Tensor
+       quant_max (int): maximum quantized value for target quantized Tensor
+       dtype (torch.dtype): dtype for target quantized Tensor
+
+    Returns:
+       scale (float): quantization parameter for the target quantized Tensor
+       zero_point (int): quantization parameter for the target quantized Tensor
+    """
+    assert input.dtype == torch.float32, f"Expecting input to have dtype torch.float32, but got dtype: {input.dtype}"
+    assert dtype in _DTYPE_TO_QVALUE_BOUNDS, \
+        f"Expecting target dtype to be one of {_DTYPE_TO_QVALUE_BOUNDS.keys()}, but got: {dtype}"
+    validate_qmin_qmax(qmin, qmax)
+
+    min_val, max_val = torch.aminmax(input)
+
+    return determine_qparams(
+        min_val, max_val, qmin, qmax, dtype, torch.Tensor([eps]), has_customized_qrange=False)
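+
+# Illustrative sketch (not part of the original source): deriving per-tensor qparams
+# from data and feeding them to quantize_per_tensor defined above.
+#
+#     >>> # xdoctest: +SKIP
+#     >>> x = torch.randn(16, 32)
+#     >>> scale, zero_point = choose_qparams_tensor(x, -128, 127, torch.finfo(torch.float32).eps, torch.int8)
+#     >>> xq = quantize_per_tensor(x, scale.item(), int(zero_point.item()), -128, 127, torch.int8)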
+
+quantized_decomposed_lib.define(
+    "choose_qparams_symmetric.tensor(Tensor input, int quant_min, int quant_max, "
+    "float eps, ScalarType dtype) -> (Tensor, Tensor)")
+
+@impl(quantized_decomposed_lib, "choose_qparams_symmetric.tensor", "CompositeExplicitAutograd")
+def choose_qparams_symmetric_tensor(
+        input: torch.Tensor,
+        qmin: int,
+        qmax: int,
+        eps: float,
+        dtype: torch.dtype
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """ Given an input Tensor, derive the per tensor affine quantization parameter
+    (scale and zero_point) for target quantized Tensor from the Tensor
+
+    Args:
+       input (torch.Tensor): floating point input Tensor
+       quant_min (int): minimum quantized value for target quantized Tensor
+       quant_max (int): maximum quantized value for target quantized Tensor
+       dtype (torch.dtype): dtype for target quantized Tensor
+
+    Returns:
+       scale (float): quantization parameter for the target quantized Tensor
+       zero_point (int): quantization parameter for the target quantized Tensor
+    """
+    assert input.dtype == torch.float32, f"Expecting input to have dtype torch.float32, but got dtype: {input.dtype}"
+    assert dtype in _DTYPE_TO_QVALUE_BOUNDS, \
+        f"Expecting target dtype to be one of {_DTYPE_TO_QVALUE_BOUNDS.keys()}, but got: {dtype}"
+    validate_qmin_qmax(qmin, qmax)
+
+    min_val, max_val = torch.aminmax(input)
+    return determine_qparams(
+        min_val,
+        max_val,
+        qmin,
+        qmax,
+        dtype,
+        torch.Tensor([eps]),
+        has_customized_qrange=False,
+        qscheme=torch.per_tensor_symmetric
+    )
+
+@impl(quantized_decomposed_lib, "choose_qparams.tensor", "Meta")
+def choose_qparams_tensor_meta(
+        input: torch.Tensor,
+        quant_min: int,
+        quant_max: int,
+        eps: float,
+        dtype: torch.dtype
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert input.dtype == torch.float32, f"Expecting input to have dtype torch.float32, but got dtype: {input.dtype}"
+    assert quant_min < quant_max, f"Expecting quant_min to be smaller than quant_max but received min: \
+        {quant_min} max: {quant_max}"
+    return torch.empty(1, dtype=torch.double, device=input.device), torch.empty(1, dtype=torch.int64, device=input.device)
+
+@impl(quantized_decomposed_lib, "choose_qparams_symmetric.tensor", "Meta")
+def choose_qparams_symmetric_tensor_meta(
+        input: torch.Tensor,
+        quant_min: int,
+        quant_max: int,
+        eps: float,
+        dtype: torch.dtype
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    return torch.empty(1, dtype=torch.double, device=input.device), torch.empty(1, dtype=torch.int64, device=input.device)
+
+# Helper function used to implement per-channel quantization against any axis
+def _permute_to_axis_zero(x, axis):
+    new_axis_list = list(range(x.dim()))
+    new_axis_list[axis] = 0
+    new_axis_list[0] = axis
+    y = x.permute(tuple(new_axis_list))
+    return y, new_axis_list
+
+quantized_decomposed_lib.define(
+    "quantize_per_channel(Tensor input, Tensor scales, Tensor zero_points, int axis, "
+    "int quant_min, int quant_max, ScalarType dtype) -> Tensor")
+
+@impl(quantized_decomposed_lib, "quantize_per_channel", "CompositeExplicitAutograd")
+def quantize_per_channel(
+        input: torch.Tensor,
+        scales: torch.Tensor,
+        zero_points: torch.Tensor,
+        axis: int,
+        quant_min: int,
+        quant_max: int,
+        dtype: torch.dtype
+) -> torch.Tensor:
+    """ Affine per channel quantization for the Tensor using the same quantization
+    parameters for each channel/axis to map from floating point to quantized values
+
+    Args:
+       input (torch.Tensor): original float32 or bfloat16 Tensor
+       scales (torch.Tensor): a list of scale quantization parameters for
+       affine quantization, one per channel
+       zero_points (torch.Tensor): a list of zero_point quantization parameters for
+       affine quantization, one per channel
+       axis (int): channel axis that the per channel quantization parameters apply to
+       quant_min (int): minimum quantized value for output Tensor
+       quant_max (int): maximum quantized value for output Tensor
+       dtype (torch.dtype): requested dtype (e.g. torch.uint8) for output Tensor
+
+    Returns:
+       Tensor with requested dtype (e.g. torch.uint8), note the quantization parameters
+       are not stored in the Tensor, we are storing them in function arguments instead
+    """
+    if input.dtype == torch.bfloat16:
+        input = input.to(torch.float32)
+    assert input.dtype == torch.float32, f"Expecting input to have dtype torch.float32, but got dtype: {input.dtype}"
+    assert axis < input.dim(), f"Expecting axis to be < {input.dim()}"
+    _quant_min_max_bounds_check(quant_min, quant_max, dtype)
+    input, permute_axis_list = _permute_to_axis_zero(input, axis)
+    res = torch.zeros_like(input)
+
+    for i in range(input.size(0)):
+        res[i] = torch.clamp(
+            torch.round(input[i] * (1.0 / scales[i])) + zero_points[i],
+            quant_min,
+            quant_max
+        )
+
+    out = res.permute(tuple(permute_axis_list))
+    return out.to(dtype)
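+
+# Illustrative sketch (not part of the original source): per-channel quantization of a
+# weight-like tensor along its output-channel axis (axis=0).
+#
+#     >>> # xdoctest: +SKIP
+#     >>> w = torch.randn(4, 8)                        # 4 output channels
+#     >>> scales = torch.full((4,), 0.05)
+#     >>> zero_points = torch.zeros(4, dtype=torch.int64)
+#     >>> wq = quantize_per_channel(w, scales, zero_points, 0, -128, 127, torch.int8)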
+
+@impl(quantized_decomposed_lib, "quantize_per_channel", "Meta")
+def quantize_per_channel_meta(
+        input: torch.Tensor,
+        scales: torch.Tensor,
+        zero_points: torch.Tensor,
+        axis: int,
+        quant_min: int,
+        quant_max: int,
+        dtype: torch.dtype
+) -> torch.Tensor:
+    if input.dtype == torch.bfloat16:
+        input = input.to(torch.float32)
+    assert input.dtype == torch.float32, f"Expecting input to have dtype torch.float32, but got dtype: {input.dtype}"
+    assert axis < input.dim(), f"Expecting axis to be < {input.dim()}"
+    _quant_min_max_bounds_check(quant_min, quant_max, dtype)
+    return torch.empty_like(input, dtype=dtype)
+
+# Note: quant_min/quant_max/dtype are not used in the operator, but for now it's kept in
+# the signature as metadata for the input Tensor, this might be useful for pattern
+# matching in the future
+# We will revisit this later if we find there are no use cases for it
+quantized_decomposed_lib.define(
+    "dequantize_per_channel(Tensor input, Tensor scales, Tensor zero_points, int axis, "
+    "int quant_min, int quant_max, ScalarType dtype) -> Tensor")
+
+@impl(quantized_decomposed_lib, "dequantize_per_channel", "CompositeExplicitAutograd")
+def dequantize_per_channel(
+        input: torch.Tensor,
+        scales: torch.Tensor,
+        zero_points: torch.Tensor,
+        axis: int,
+        quant_min: int,
+        quant_max: int,
+        dtype: torch.dtype
+) -> torch.Tensor:
+    """ Affine per channel dequantization for the Tensor using the same quantization
+    parameters for each channel/axis to map from quantized values to floating point values
+
+    Args:
+       input (torch.Tensor): Tensor with dtype matching `dtype` argument,
+       e.g. (`torch.uint8`), it is a per channel quantized Tensor if combined with
+       quantization parameter in the argument of this function (scales/zero_points/axis)
+
+       scales (torch.Tensor): a list of scale quantization parameters for
+       affine quantization, one per channel
+
+       zero_points (torch.Tensor): a list of zero_point quantization parameters for
+       affine quantization, one per channel
+
+       axis (int): channel axis that the per channel quantization parameters apply to
+
+       quant_min (int): minimum quantized value for output Tensor (not used in computation,
+       reserved for pattern matching)
+
+       quant_max (int): maximum quantized value for output Tensor (not used in computation,
+       reserved for pattern matching)
+
+       dtype (torch.dtype): requested dtype for output Tensor (not used in computation,
+       reserved for pattern matching)
+
+    Returns:
+       dequantized float32 Tensor
+    """
+    assert input.dtype == dtype, f"Expecting input to have dtype {dtype}, but got dtype: {input.dtype}"
+    assert axis < input.dim(), f"Expecting axis to be < {input.dim()}"
+    _quant_min_max_bounds_check(quant_min, quant_max, dtype)
+    input, permute_axis_list = _permute_to_axis_zero(input, axis)
+    res = torch.zeros_like(input, dtype=torch.float32)
+
+    for i in range(input.size(0)):
+        # TODO: investigate why
+        # (input[i] - zero_points[i]).to(torch.float32) * scales[i]
+        # failed the test
+        res[i] = (input[i].to(torch.float32) - zero_points[i]) * scales[i]
+
+    out = res.permute(tuple(permute_axis_list))
+    return out
+
+@impl(quantized_decomposed_lib, "dequantize_per_channel", "Meta")
+def dequantize_per_channel_meta(
+        input: torch.Tensor,
+        scales: torch.Tensor,
+        zero_points: torch.Tensor,
+        axis: int,
+        quant_min: int,
+        quant_max: int,
+        dtype: torch.dtype
+) -> torch.Tensor:
+    assert input.dtype == dtype, f"Expecting input to have dtype {dtype}, but got dtype: {input.dtype}"
+    assert axis < input.dim(), f"Expecting axis to be < {input.dim()}"
+    _quant_min_max_bounds_check(quant_min, quant_max, dtype)
+    return torch.empty_like(input, dtype=torch.float32)
+
+
+quantized_decomposed_lib.define(
+    "choose_qparams_per_token(Tensor input, ScalarType dtype) -> (Tensor, Tensor)"
+)
+
+
+@impl(
+    quantized_decomposed_lib,
+    "choose_qparams_per_token",
+    "CompositeExplicitAutograd",
+)
+def choose_qparams_per_token(
+    input: torch.Tensor,
+    dtype: torch.dtype,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Choose quantization parameters for per token quantization. This means that for an N-dimensional Tensor
+    (M1, M2, ...Mn, N), we calculate scales/zero_points for each N elements and quantize
+    every N elements with the same quantization parameter. The dimension for scales/zero_points
+    will be (M1 * M2 ... * Mn)
+
+    Args:
+       input (torch.Tensor): original float32/float16 Tensor
+       dtype (torch.dtype): dtype (e.g. torch.uint8) for input Tensor
+
+    Returns:
+        scales and zero_points, both float32 Tensors
+    """
+
+    scales = input.abs().amax(dim=-1, keepdim=True)
+    if scales.dtype == torch.float16:
+        scales = (
+            scales.float()
+        )  # want float scales to avoid overflows for fp16, (bf16 has wide enough range)
+    if dtype == torch.int8:
+        n_bits = 8
+        quant_max = 2 ** (n_bits - 1) - 1
+    else:
+        raise Exception(f"unsupported dtype in choose_qparams_per_token: {dtype}")
+
+    scales = scales.clamp(min=1e-5).div(quant_max)
+    zero_points = torch.zeros_like(scales)
+    return scales, zero_points
+
+
+@impl(
+    quantized_decomposed_lib,
+    "choose_qparams_per_token",
+    "Meta",
+)
+def choose_qparams_per_token_meta(
+    input: torch.Tensor,
+    dtype: torch.dtype,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    size = (1, input.size(-1))
+    return torch.empty(size, dtype=torch.double, device=input.device), torch.empty(
+        size, dtype=torch.int64, device=input.device
+    )
+
+
+# TODO: move this to https://github.com/pytorch/pytorch/blob/main/torch/ao/quantization/fx/_decomposed.py
+quantized_decomposed_lib.define(
+    "choose_qparams_per_token_asymmetric(Tensor input, ScalarType dtype) -> (Tensor, Tensor)"
+)
+
+
+@impl(
+    quantized_decomposed_lib,
+    "choose_qparams_per_token_asymmetric",
+    "CompositeExplicitAutograd",
+)
+def choose_qparams_per_token_asymmetric(
+    input: torch.Tensor,
+    dtype: torch.dtype,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Choose quantization parameters for per token quantization. This means that for an N-dimensional Tensor
+    (M1, M2, ...Mn, N), we calculate scales/zero_points for each N elements and quantize
+    every N elements with the same quantization parameter. The dimension for scales/zero_points
+    will be (M1 * M2 ... * Mn)
+
+    Args:
+       input (torch.Tensor): original float32/float16 Tensor
+       dtype (torch.dtype): dtype (e.g. torch.uint8) for input Tensor
+
+    Returns:
+        scales and zero_points, both float32 Tensors
+    """
+    # Based on https://github.com/google/XNNPACK/blob/df156f0cf3db5a4576cc711123eeb54915f82ffc/src/xnnpack/quantization.h#L18
+    qmin, qmax = -128, 127
+    min_val, max_val = torch.aminmax(input, dim=-1, keepdim=True)
+    min_val_neg = torch.min(min_val, torch.zeros_like(min_val))
+    max_val_pos = torch.max(max_val, torch.zeros_like(max_val))
+    eps = torch.finfo(torch.float32).eps  # use xnnpack eps?
+
+    # scale
+    scale = (max_val_pos - min_val_neg) / float(qmax - qmin)
+    scale = scale.clamp(min=eps)
+
+    # zero point
+    descaled_min = min_val_neg / scale
+    descaled_max = max_val_pos / scale
+    zero_point_from_min_error = qmin + descaled_min
+    zero_point_from_max_error = qmax + descaled_max
+    zero_point = torch.where(
+        zero_point_from_min_error + zero_point_from_max_error > 0,
+        qmin - descaled_min,
+        qmax - descaled_max,
+    )
+    zero_point = torch.clamp(zero_point, qmin, qmax).round()
+
+    return scale.to(torch.float32), zero_point.to(torch.float32)
+
+
+@impl(
+    quantized_decomposed_lib,
+    "choose_qparams_per_token_asymmetric",
+    "Meta",
+)
+def choose_qparams_per_token_asymmetric_meta(
+    input: torch.Tensor,
+    dtype: torch.dtype,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    size = (1, input.size(-1))
+    return torch.empty(size, dtype=torch.double, device=input.device), torch.empty(
+        size, dtype=torch.int64, device=input.device
+    )
+
+
+def _per_token_quant_qparam_dim_check(input, scales, zero_points):
+    num_tokens = math.prod(list(input.size())[:-1])
+    assert (
+        num_tokens == scales.numel()
+    ), f"num_tokens: {num_tokens} scales: {scales.size()}"
+    assert (
+        num_tokens == zero_points.numel()
+    ), f"num_tokens: {num_tokens} zero_points: {zero_points.size()}"
+
+
+quantized_decomposed_lib.define(
+    "quantize_per_token(Tensor input, Tensor scales, Tensor zero_points, "
+    "int quant_min, int quant_max, ScalarType dtype) -> Tensor"
+)
+
+
+@impl(quantized_decomposed_lib, "quantize_per_token", "CompositeExplicitAutograd")
+def quantize_per_token(
+    input: torch.Tensor,
+    scales: torch.Tensor,
+    zero_points: torch.Tensor,
+    quant_min: int,
+    quant_max: int,
+    dtype: torch.dtype,
+):
+    """Per token quantization for the Tensor using the quantization parameters to map
+    from floating point to quantized values. This means that for an N-dimensional Tensor
+    (M1, M2, ...Mn, N), we calculate scales/zero_points for each N elements and quantize
+    every N elements with the same quantization parameter. The dimension for scales/zero_points
+    will be (M1 * M2 ... * Mn)
+
+    Args:
+       input (torch.Tensor): original float32 or bfloat16 Tensor
+       scales (float32 torch.Tensor): quantization parameter for per token affine quantization
+       zero_points (int32 torch.Tensor): quantization parameter for per token affine quantization
+       quant_min (int): minimum quantized value for output Tensor
+       quant_max (int): maximum quantized value for output Tensor
+       dtype (torch.dtype): requested dtype (e.g. torch.uint8) for output Tensor
+
+    Returns:
+       Tensor with requested dtype (e.g. torch.uint8), note the quantization parameters
+       are not stored in the Tensor, we are storing them in function arguments instead
+    """
+    _quant_min_max_bounds_check(quant_min, quant_max, dtype)
+    _per_token_quant_qparam_dim_check(input, scales, zero_points)
+    input = (
+        torch.round(input / scales + zero_points).clamp(quant_min, quant_max).to(dtype)
+    )
+    return input
+
+
+@impl(quantized_decomposed_lib, "quantize_per_token", "Meta")
+def quantize_per_token_meta(
+    input: torch.Tensor,
+    scales: torch.Tensor,
+    zero_points: torch.Tensor,
+    quant_min: int,
+    quant_max: int,
+    dtype: torch.dtype,
+):
+    _quant_min_max_bounds_check(quant_min, quant_max, dtype)
+    return torch.empty_like(input, dtype=dtype)
+
+
+quantized_decomposed_lib.define(
+    "dequantize_per_token(Tensor input, Tensor scales, Tensor zero_points, "
+    "int quant_min, int quant_max, ScalarType dtype, ScalarType output_dtype) -> Tensor"
+)
+
+
+@impl(quantized_decomposed_lib, "dequantize_per_token", "CompositeExplicitAutograd")
+def dequantize_per_token(
+    input: torch.Tensor,
+    scales: torch.Tensor,
+    zero_points: torch.Tensor,
+    quant_min: int,
+    quant_max: int,
+    dtype: torch.dtype,
+    output_dtype: torch.dtype = torch.float32,
+):
+    """Per token dequantization for the Tensor using the quantization parameters to map
+    from quantized values back to floating point values. This means that for an N-dimensional Tensor
+    (M1, M2, ...Mn, N), we calculate scales/zero_points for each N elements and dequantize
+    every N elements with the same quantization parameter. The dimension for scales/zero_points
+    will be (M1 * M2 ... * Mn)
+
+    Args:
+       input (torch.Tensor): quantized Tensor (uint8, int8 etc.)
+       scales (float32 torch.Tensor): quantization parameter for per token affine quantization
+       zero_points (int32 torch.Tensor): quantization parameter for per token affine quantization
+       quant_min (int): minimum quantized value for input Tensor
+       quant_max (int): maximum quantized value for input Tensor
+       dtype (torch.dtype): dtype (e.g. torch.uint8) for input Tensor
+       output_dtype (torch.dtype): dtype (e.g. torch.float32) for output Tensor
+
+    Returns:
+       dequantized Tensor with dtype `output_dtype`
+    """
+    input = input - zero_points
+    input = input.to(output_dtype) * scales
+    return input
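+
+# Illustrative sketch (not part of the original source): a per-token quantize/dequantize
+# round trip driven by the asymmetric qparam chooser defined above.
+#
+#     >>> # xdoctest: +SKIP
+#     >>> x = torch.randn(2, 3, 8)                     # 6 tokens of 8 elements each
+#     >>> scales, zero_points = choose_qparams_per_token_asymmetric(x, torch.int8)
+#     >>> xq = quantize_per_token(x, scales, zero_points, -128, 127, torch.int8)
+#     >>> xdq = dequantize_per_token(xq, scales, zero_points, -128, 127, torch.int8)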
+
+
+@impl(quantized_decomposed_lib, "dequantize_per_token", "Meta")
+def dequantize_per_token_meta(
+    input: torch.Tensor,
+    scales: torch.Tensor,
+    zero_points: torch.Tensor,
+    quant_min: int,
+    quant_max: int,
+    dtype: torch.dtype,
+    output_dtype: torch.dtype = torch.float32,
+):
+    _quant_min_max_bounds_check(quant_min, quant_max, dtype)
+    # TODO: support fp16
+    return torch.empty_like(input, dtype=output_dtype)
+
+
+quantized_decomposed_lib.define(
+    "quantize_per_channel_group(Tensor input, Tensor scales, Tensor zero_points, int quant_min, "
+    "int quant_max, ScalarType dtype, int group_size) -> Tensor"
+)
+
+
+# TODO: dtype is ignored for now
+@impl(
+    quantized_decomposed_lib, "quantize_per_channel_group", "CompositeExplicitAutograd"
+)
+def quantize_per_channel_group(
+    input: torch.Tensor,
+    scales: torch.Tensor,
+    zero_points: torch.Tensor,
+    quant_min: int,
+    quant_max: int,
+    dtype: torch.dtype,
+    group_size=128,
+):
+    assert group_size > 1
+    # needed for GPTQ single column quantize
+    if group_size > input.shape[-1] and scales.shape[-1] == 1:
+        group_size = input.shape[-1]
+
+    assert input.shape[-1] % group_size == 0
+    assert input.dim() == 2
+
+    # TODO: check for dtype, currently we can't express torch.int4 so it's omitted
+    to_quant = input.reshape(-1, group_size)
+    assert torch.isnan(to_quant).sum() == 0
+
+    scales = scales.reshape(-1, 1)
+    zero_points = zero_points.reshape(-1, 1)
+
+    input_int8 = (
+        to_quant.div(scales)
+        .add(zero_points)
+        .round()
+        .clamp_(quant_min, quant_max)
+        .to(dtype)
+        .reshape_as(input)
+    )
+
+    return input_int8
+
+
+@impl(quantized_decomposed_lib, "quantize_per_channel_group", "Meta")
+def quantize_per_channel_group_meta(
+    input: torch.Tensor,
+    scales: torch.Tensor,
+    zero_points: torch.Tensor,
+    quant_min: int,
+    quant_max: int,
+    dtype: torch.dtype,
+    group_size=128,
+):
+    """Groupwise quantization within each channel for a 2-d Tensor using the quantization parameters
+    to map from floating point to quantized values. This means for each row of a 2-d Tensor
+    (M, N), we calculate scales/zero_points for each `group_size` elements
+    and quantize every `group_size` elements with the same quantization parameter.
+    The dimension for scales/zero_points will be (M * ceil(N / group_size),)
+
+    Args:
+       input (torch.Tensor): original float32 or bfloat16 Tensor
+       scales (float32 torch.Tensor): quantization parameter for per channel group affine quantization
+       zero_points (int32 torch.Tensor): quantization parameter for per channel group affine quantization
+       quant_min (int): minimum quantized value for output Tensor
+       quant_max (int): maximum quantized value for output Tensor
+       dtype (torch.dtype): requested dtype (e.g. torch.uint8) for output Tensor
+
+    Returns:
+       Tensor with requested dtype (e.g. torch.uint8), note the quantization parameters
+       are not stored in the Tensor, we are storing them in function arguments instead
+    """
+    assert group_size > 1
+    # needed for GPTQ single column quantize
+    if group_size > input.shape[-1] and scales.shape[-1] == 1:
+        group_size = input.shape[-1]
+
+    assert input.shape[-1] % group_size == 0
+    assert input.dim() == 2
+    return torch.empty_like(input, dtype=dtype)
+
+
+quantized_decomposed_lib.define(
+    "dequantize_per_channel_group(Tensor input, Tensor scales, Tensor? zero_points, int quant_min, "
+    "int quant_max, ScalarType dtype, int group_size, ScalarType output_dtype) -> Tensor"
+)
+
+
+@impl(
+    quantized_decomposed_lib,
+    "dequantize_per_channel_group",
+    "CompositeExplicitAutograd",
+)
+def dequantize_per_channel_group(
+    w_int8: torch.Tensor,
+    scales: torch.Tensor,
+    zero_points: Optional[torch.Tensor],
+    quant_min: int,
+    quant_max: int,
+    dtype: torch.dtype,
+    group_size: int = 128,
+    output_dtype: torch.dtype = torch.float32,
+):
+    """Groupwise dequantization within each channel for a 2-d Tensor using the quantization parameters
+    to map from quantized values back to floating point values. This means for each row of a 2-d Tensor
+    (M, N), we calculate scales/zero_points for each `group_size` elements
+    and dequantize every `group_size` elements with the same quantization parameter.
+    The dimension for scales/zero_points will be (M * ceil(N / group_size),)
+
+    Args:
+       w_int8 (torch.Tensor): quantized Tensor (uint8/int8 etc.)
+       scales (float32 torch.Tensor): quantization parameter for per channel group affine quantization
+       zero_points (int32 torch.Tensor): quantization parameter for per channel group affine quantization
+       quant_min (int): minimum quantized value for input Tensor
+       quant_max (int): maximum quantized value for input Tensor
+       dtype (torch.dtype): dtype (e.g. torch.uint8) for input Tensor
+       output_dtype (torch.dtype): dtype (e.g. torch.float32) for output Tensor
+
+    Returns:
+       dequantized Tensor with dtype `output_dtype`
+    """
+
+    assert group_size > 1
+    # needed for GPTQ single column dequantize
+    if group_size > w_int8.shape[-1] and scales.shape[-1] == 1:
+        group_size = w_int8.shape[-1]
+    assert w_int8.shape[-1] % group_size == 0
+    assert w_int8.dim() == 2
+
+    w_int8_grouped = w_int8.reshape(-1, group_size)
+    scales = scales.reshape(-1, 1)
+    if zero_points is not None:
+        zp = zero_points.reshape(-1, 1)
+    else:
+        zp = torch.zeros([], dtype=torch.int32, device=scales.device)
+    w_dq = w_int8_grouped.sub(zp).mul(scales).reshape_as(w_int8).to(output_dtype)
+    return w_dq
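+
+# Illustrative sketch (not part of the original source): groupwise quantization and
+# dequantization of a 2-d weight with group_size=16; the int4-style range [-8, 7] is
+# stored in an int8 container, as the TODO above notes torch.int4 cannot be expressed here.
+#
+#     >>> # xdoctest: +SKIP
+#     >>> w = torch.randn(4, 32)
+#     >>> scales = torch.full((4, 2), 0.05)            # one scale per 16-element group
+#     >>> zero_points = torch.zeros(4, 2, dtype=torch.int32)
+#     >>> wq = quantize_per_channel_group(w, scales, zero_points, -8, 7, torch.int8, group_size=16)
+#     >>> w_dq = dequantize_per_channel_group(wq, scales, zero_points, -8, 7, torch.int8, group_size=16)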
+
+
+quantized_decomposed_lib.define(
+    "fake_quant_per_channel(Tensor input, Tensor scales, Tensor zero_points, int axis, "
+    "int quant_min, int quant_max) -> Tensor")
+
+class FakeQuantPerChannel(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, input, scales, zero_points, axis, quant_min, quant_max):
+        with torch._C._AutoDispatchBelowAutograd():
+            if input.dtype == torch.bfloat16:
+                input = input.to(torch.float32)
+            if scales.dtype != torch.float32:
+                scales = scales.to(torch.float32)
+            if zero_points.dtype != torch.int32:
+                zero_points = zero_points.to(torch.int32)
+            assert input.dtype == torch.float32, f"Expecting input to have dtype torch.float32, but got dtype: {input.dtype}"
+            assert axis < input.dim(), f"Expecting axis to be < {input.dim()}"
+            broadcast_dims = list(range(0, axis)) + list(range(axis + 1, input.ndim))
+            unsqueeze_scales = _unsqueeze_multiple(scales, broadcast_dims)
+            unsqueeze_zero_points = _unsqueeze_multiple(zero_points, broadcast_dims)
+            temp = torch.round(input * (1.0 / unsqueeze_scales)) + unsqueeze_zero_points
+            out = (torch.clamp(temp, quant_min, quant_max) - unsqueeze_zero_points) * unsqueeze_scales
+            mask = torch.logical_and((temp >= quant_min), (temp <= quant_max))
+
+        ctx.save_for_backward(mask)
+        return out
+
+    @staticmethod
+    def backward(ctx, gy):
+        mask, = ctx.saved_tensors
+        return gy * mask, None, None, None, None, None
+
+@impl(quantized_decomposed_lib, "fake_quant_per_channel", "AutogradCPU")
+def fake_quant_per_channel(
+        input: torch.Tensor,
+        scales: torch.Tensor,
+        zero_points: torch.Tensor,
+        axis: int,
+        quant_min: int,
+        quant_max: int,
+) -> torch.Tensor:
+    return FakeQuantPerChannel.apply(input, scales, zero_points, axis, quant_min, quant_max)
+
+@impl(quantized_decomposed_lib, "fake_quant_per_channel", "Meta")
+def fake_quant_per_channel_meta(
+        input: torch.Tensor,
+        scales: torch.Tensor,
+        zero_points: torch.Tensor,
+        axis: int,
+        quant_min: int,
+        quant_max: int,
+) -> torch.Tensor:
+    return torch.empty_like(input)
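+
+# Illustrative sketch (not part of the original source): fake quantization with a
+# straight-through-style gradient; values falling outside [quant_min, quant_max] get zero
+# gradient via the mask saved by FakeQuantPerChannel.
+#
+#     >>> # xdoctest: +SKIP
+#     >>> x = torch.randn(2, 4, requires_grad=True)
+#     >>> scales = torch.full((4,), 0.1)
+#     >>> zero_points = torch.zeros(4, dtype=torch.int32)
+#     >>> y = fake_quant_per_channel(x, scales, zero_points, 1, -128, 127)
+#     >>> y.sum().backward()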
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/_equalize.py b/MLPY/Lib/site-packages/torch/ao/quantization/fx/_equalize.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd06631f0fb1de448162e4b80a675343750081ee
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/fx/_equalize.py
@@ -0,0 +1,820 @@
+import warnings
+
+from collections import namedtuple
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.ao.nn.intrinsic as nni
+from torch.fx import GraphModule
+from torch.fx.graph import Node
+from torch.ao.quantization.fx.graph_module import _get_observed_graph_module_attr
+
+from ..observer import _with_args, ObserverBase, PerChannelMinMaxObserver
+from ..utils import _parent_name, check_min_max_valid
+
+from .utils import (
+    get_new_attr_name_with_prefix,
+    maybe_get_next_module,
+    node_arg_is_weight,
+)
+
+CUSTOM_MODULE_SUPP_LIST: List[Any] = []
+
+def reshape_scale(scale: torch.Tensor, axis: int, input: torch.Tensor) -> torch.Tensor:
+    """Reshape the scale so that it can be multiplied with the input along the given axis.
+    """
+    new_shape = [1] * input.ndim
+    new_shape[axis] = input.size(axis)
+    return scale.view(new_shape)
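+
+# Illustrative sketch (not part of the original source):
+#
+#     >>> # xdoctest: +SKIP
+#     >>> scale = torch.tensor([2.0, 3.0])
+#     >>> x = torch.randn(4, 2, 8, 8)                  # channel axis is 1
+#     >>> reshape_scale(scale, 1, x).shape             # -> torch.Size([1, 2, 1, 1])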
+
+qsheme_mapping_per_tensor_to_per_channel = {
+    torch.per_tensor_affine: torch.per_channel_affine,
+    torch.per_tensor_symmetric: torch.per_channel_symmetric,
+}
+
+
+class _InputEqualizationObserver(nn.Module):
+    r"""Observer for tracking the running min/max values of input columns, and
+    computing the quantization parameters for the overall min/max input values.
+
+    Args:
+        dtype: Quantized data type
+        qscheme: Quantization scheme
+        quant_min: Minimum quantization value. If unspecified, it will
+            follow the 8-bit setup.
+        quant_max: Maximum quantization value. If unspecified, it will
+            follow the 8-bit setup.
+
+    The running minimum/maximum :math:`x_\text{min/max}` are computed in the
+    same way as :class:`~torch.ao.quantization.observer.PerChannelMinMaxObserver`,
+    with the difference that the running min/max values are stored per column.
+    This observer is intended to be used along with a WeightEqualizationObserver
+    to calculate the equalization scale.
+    """
+
+    def __init__(self, dtype=torch.quint8, qscheme=torch.per_tensor_affine,
+                 quant_min=None, quant_max=None, factory_kwargs=None) -> None:
+        super().__init__()
+
+        if qscheme not in {torch.per_tensor_affine, torch.per_tensor_symmetric}:
+            raise TypeError("Input qscheme must be per-tensor")
+
+        self.dtype = dtype
+        self.qscheme = qscheme
+
+        per_channel_qscheme = qsheme_mapping_per_tensor_to_per_channel[qscheme]
+        self.input_obs = PerChannelMinMaxObserver(ch_axis=1, dtype=dtype,
+                                                  qscheme=per_channel_qscheme,
+                                                  quant_min=quant_min,
+                                                  quant_max=quant_max,
+                                                  factory_kwargs=factory_kwargs)
+
+        self.equalization_scale = torch.tensor(1)
+        self.equalization_shape: List[int] = []
+
+    def forward(self, x_orig):
+        if not (x_orig.ndim >= 2 and x_orig.ndim <= 5):
+            raise ValueError("InputEqualizationObserver only supports Linear and Conv layers")
+
+        # Calculate the shape needed to reshape the equalization scale later (needed for Conv layers)
+        self.equalization_shape = [1] * x_orig.ndim
+        self.equalization_shape[1] = x_orig.size(1)
+
+        return self.input_obs(x_orig)
+
+    def get_input_minmax(self):
+        return (self.input_obs.min_val, self.input_obs.max_val)
+
+    def set_equalization_scale(self, equalization_scale):
+        # Reshape the equalization scale along axis=1 so that it can be
+        # multiplied with the input along axis=1
+        if equalization_scale.nelement() == 1 and equalization_scale == torch.tensor(1):
+            return
+        self.equalization_scale = torch.reshape(equalization_scale, self.equalization_shape)
+
+    def calculate_scaled_minmax(self):
+        r""" Returns the scaled min/max inputs
+        """
+        if self.equalization_scale.nelement() == 1 and self.equalization_scale == torch.tensor(1):
+            warnings.warn(
+                "Must call calculate_equalization_scale before calling calculate_scaled_minmax. " +
+                "Will not scale the next quantization observer."
+            )
+            return None, None
+
+        # Calculate qparams for the scaled min/max inputs
+        # Scale the input by the equalization scale located at the same column
+        # index
+        (min_inputs, max_inputs) = self.get_input_minmax()
+        equalization_scale_reshaped = reshape_scale(self.equalization_scale, 0, min_inputs)
+        min_input_scaled = torch.min(torch.mul(min_inputs, equalization_scale_reshaped))
+        max_input_scaled = torch.max(torch.mul(max_inputs, equalization_scale_reshaped))
+
+        return min_input_scaled, max_input_scaled
+
+    with_args = classmethod(_with_args)
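+
+    # Illustrative calibration sketch (not part of the upstream module; the toy
+    # shapes below are assumptions). The observer tracks per-column min/max
+    # values, which are later combined with the weight ranges to derive the
+    # equalization scale:
+    #
+    #     inp_eq_obs = _InputEqualizationObserver(dtype=torch.quint8)
+    #     inp_eq_obs(torch.randn(16, 8))              # calibrate on a batch of inputs
+    #     lo, hi = inp_eq_obs.get_input_minmax()      # per-column, each of shape (8,)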
+
+
+class _WeightEqualizationObserver(nn.Module):
+    r"""Observer for tracking the running min/max values of weight columns and
+    rows, and computing the quantization parameters for the weight rows.
+
+    Args:
+        dtype: Quantized data type
+        qscheme: Quantization scheme
+        quant_min: Minimum quantization value. If unspecified, it will
+            follow the 8-bit setup.
+        quant_max: Maximum quantization value. If unspecified, it will
+            follow the 8-bit setup.
+
+    This observer is made up of a single PerChannelMinMaxObserver `weight_col_obs` used
+    to record the running minimum and maximum of columns of incoming weight
+    tensors. This observer is intended to be used along with an
+    InputEqualizationObserver to calculate the equalization scale.
+
+    The running minimum/maximum :math:`w_\text{min/max}` are computed in the
+    same way as :class:`~torch.ao.quantization.observer.PerChannelMinMaxObserver`.
+    """
+
+    def __init__(self, dtype=torch.qint8, qscheme=torch.per_tensor_affine, quant_min=None,
+                 quant_max=None, factory_kwargs=None) -> None:
+        super().__init__()
+
+        self.dtype = dtype
+        self.qscheme = qscheme
+        self.ch_axis = 1
+
+        per_channel_qscheme = qscheme
+        if qscheme in {torch.per_tensor_affine, torch.per_tensor_symmetric}:
+            per_channel_qscheme = qscheme_mapping_per_tensor_to_per_channel[qscheme]
+        self.weight_col_obs = PerChannelMinMaxObserver(ch_axis=1, dtype=dtype,
+                                                       qscheme=per_channel_qscheme,
+                                                       quant_min=quant_min,
+                                                       quant_max=quant_max,
+                                                       factory_kwargs=factory_kwargs)
+
+        self.equalization_scale = torch.tensor(1)
+
+    def forward(self, w_orig):
+        if not (w_orig.ndim >= 2 and w_orig.ndim <= 5):
+            raise ValueError("InputEqualizationObserver only supports Linear and Conv layers")
+
+        return self.weight_col_obs(w_orig)
+
+    def get_weight_col_minmax(self):
+        return (self.weight_col_obs.min_val, self.weight_col_obs.max_val)
+
+    def set_equalization_scale(self, equalization_scale):
+        self.equalization_scale = equalization_scale
+
+    with_args = classmethod(_with_args)
+
+
+def calculate_equalization_scale(input_obs: _InputEqualizationObserver,
+                                 weight_obs: _WeightEqualizationObserver) -> torch.Tensor:
+    r""" Calculates the equalization scale and sets the equalization_scale value
+    in the observers.
+
+    Args:
+        input_obs: Observer that tracks the ranges for the input columns
+        weight_obs: Observer that tracks the ranges for the weight columns
+    """
+
+    (min_inputs, max_inputs) = input_obs.get_input_minmax()
+    (min_weights, max_weights) = weight_obs.get_weight_col_minmax()
+
+    if not (check_min_max_valid(min_inputs, max_inputs) and check_min_max_valid(min_weights, max_weights)):
+        warnings.warn(
+            "Must run observer before calling calculate_equalization_scale. " +
+            "Returning default equalization scale torch.tensor(1)."
+        )
+        return torch.tensor(1)
+
+    if not (min_inputs.shape == min_weights.shape):
+        raise ValueError(
+            "Input and Weight must have the same column dimension. " +
+            f"Found {min_inputs.shape} and {min_weights.shape} shapes instead."
+        )
+
+    equalization_scale = torch.sqrt((max_weights - min_weights) / (max_inputs - min_inputs))
+    # Replace all 'inf', 'nan', 0's with 1s to prevent errors
+    equalization_scale[equalization_scale == 0.] = 1
+    equalization_scale = torch.nan_to_num(equalization_scale, nan=1, posinf=1, neginf=1)
+    return equalization_scale
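+
+# Worked example (illustrative only): for a single column with input range
+# [-2, 2] and weight range [-0.5, 0.5], the scale is
+# sqrt((0.5 - (-0.5)) / (2 - (-2))) = sqrt(0.25) = 0.5, so the input column is
+# multiplied by 0.5 while the corresponding weight column is divided by 0.5.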
+
+
+class EqualizationQConfig(namedtuple('EqualizationQConfig', ['input_activation', 'weight'])):
+    """
+    Describes how to quantize a layer or a part of the network specifically for
+    input-weight equalization by providing settings (observer classes) for
+    inputs, outputs, and weights.
+
+    Note that EqualizationQConfig needs to contain observer **classes** (like
+    MinMaxObserver) or callables that return instances on invocation, not the
+    concrete observer instances themselves. The quantization flow will
+    instantiate the observers multiple times, for each of the layers.
+
+    Observer classes usually have reasonable default arguments, but they can be
+    overridden with the `with_args` method (which behaves like functools.partial):
+
+    my_qconfig = EqualizationQConfig(input_activation=_InputEqualizationObserver.with_args(dtype=torch.qint8),
+                                    weight=_WeightEqualizationObserver.with_args(dtype=torch.qint8))
+    """
+    def __new__(cls, input_activation=torch.nn.Identity, weight=torch.nn.Identity):
+        if isinstance(input_activation, nn.Module) or isinstance(weight, nn.Module):
+            raise ValueError("EqualizationQConfig received observer instance, please pass observer class instead. " +
+                             "Use MyObserver.with_args(x=1) to override arguments to constructor if needed")
+        self = super().__new__(cls, input_activation, weight)
+        return self
+
+
+input_equalization_observer = _InputEqualizationObserver.with_args(
+    dtype=torch.quint8, qscheme=torch.per_tensor_symmetric)
+weight_equalization_observer = _WeightEqualizationObserver.with_args(
+    dtype=torch.qint8, qscheme=torch.per_channel_symmetric)
+default_equalization_qconfig = EqualizationQConfig(input_activation=input_equalization_observer,
+                                                   weight=weight_equalization_observer)
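+
+# Example (a minimal sketch; the module names "fc1"/"fc2" are hypothetical):
+# per-module equalization configs use the same dict format that
+# get_equalization_qconfig_dict at the end of this file produces, e.g.
+#
+#     equalization_qconfig_dict = {
+#         "module_name": [("fc1", default_equalization_qconfig),
+#                         ("fc2", default_equalization_qconfig)],
+#     }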
+
+
+def fused_module_supports_equalization(module) -> bool:
+    """ Checks if the fused node supports equalization. """
+    return type(module) in [nni.LinearReLU, nni.ConvReLU1d, nni.ConvReLU2d, nni.ConvReLU3d]
+
+def nn_module_supports_equalization(module) -> bool:
+    """ Checks if the torch.nn node supports equalization. """
+    return type(module) in [nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d]
+
+def custom_module_supports_equalization(module) -> bool:
+    """ Checks if the custom node supports equalization. """
+    return type(module) in CUSTOM_MODULE_SUPP_LIST
+
+
+def node_supports_equalization(node: Node, modules) -> bool:
+    """ Checks if the current node supports equalization
+    Currently we only support nn.Linear/F.linear and nn.Conv/F.conv layers
+    """
+    if node.op == 'call_module':
+        return nn_module_supports_equalization(modules[str(node.target)]) or \
+            fused_module_supports_equalization(modules[str(node.target)]) or \
+            custom_module_supports_equalization(modules[str(node.target)])
+    elif node.op == 'call_function':
+        return node.target in [F.linear, F.conv1d, F.conv2d, F.conv3d]
+    return False
+
+def is_equalization_observer(observer: nn.Module) -> bool:
+    return (isinstance(observer, (_InputEqualizationObserver, _WeightEqualizationObserver)))
+
+
+###############################################################################
+# Functions for equalization during convert                                   #
+###############################################################################
+
+def get_op_node_and_weight_eq_obs(
+    input_eq_obs_node: Node,
+    model: GraphModule,
+    modules: Dict[str, nn.Module]
+) -> Tuple[Optional[Node], Optional[_WeightEqualizationObserver]]:
+    """ Gets the following weight equalization observer. There should always
+    exist a weight equalization observer after an input equalization observer.
+
+    Returns the operation node that follows the input equalization observer node
+    and the weight equalization observer
+    """
+
+    # Find the op node that comes directly after the input equalization observer
+    op_node = None
+    for user in input_eq_obs_node.users.keys():
+        if node_supports_equalization(user, modules):
+            op_node = user
+            break
+
+    assert op_node is not None
+    if op_node.op == 'call_module':
+        # If the op_node is an nn.Linear layer, then it must have a
+        # WeightEqualizationObserver configuration
+        maybe_equalization_node_name_to_config = _get_observed_graph_module_attr(model, "equalization_node_name_to_qconfig")
+        assert maybe_equalization_node_name_to_config is not None
+        equalization_node_name_to_qconfig: Dict[str, Any] = maybe_equalization_node_name_to_config  # type: ignore[assignment]
+        assert equalization_node_name_to_qconfig.get(op_node.name, None) is not None
+        weight_eq_obs = equalization_node_name_to_qconfig.get(op_node.name, None).weight()
+
+        assert isinstance(weight_eq_obs, _WeightEqualizationObserver)
+        return op_node, weight_eq_obs
+
+    elif op_node.op == 'call_function':
+        weight_node = maybe_get_weight_eq_obs_node(op_node, modules)
+        if weight_node is not None:
+            weight_eq_obs = modules[str(weight_node.target)]
+            assert isinstance(weight_eq_obs, _WeightEqualizationObserver)
+            return op_node, weight_eq_obs
+
+    return None, None
+
+def maybe_get_weight_eq_obs_node(op_node: Node, modules: Dict[str, nn.Module]) -> Optional[Node]:
+    """ Gets the weight equalization observer node if it exists.
+    """
+    assert op_node.op == 'call_function'
+    for node_arg in op_node.args:
+        if node_arg_is_weight(op_node, node_arg):
+            assert (isinstance(node_arg, Node) and node_arg.op == 'call_module' and
+                   isinstance(modules[str(node_arg.target)], _WeightEqualizationObserver))
+            return node_arg
+    return None
+
+def maybe_get_next_input_eq_obs(node: Node, modules: Dict[str, nn.Module]) -> Optional[_InputEqualizationObserver]:
+    """ Gets the following input equalization observer if it exists.
+
+    For example, in the case of connecting linear layers:
+        x -> inp_obs1 -> eq_obs1 -> linear1 -> out_obs1 -> eq_obs2 -> linear2 -> out_obs2
+    If the node being passed in is the linear1 node, then we want to return eq_obs2,
+    the following equalization observer for linear2.
+
+    However, if there are no connecting layers:
+        x -> inp_obs1 -> eq_obs1 -> linear1 -> out_obs1 -> add
+    Then we want to return None.
+
+    In the case of an unfused linear-relu layer with a connecting linear layer:
+        linear1 -> relu -> out_obs1 -> eq_obs2 -> linear2 -> out_obs2
+    Since it is unfused, we want to skip over the relu layer and return eq_obs2,
+    the following equalization observer for linear2.
+    """
+
+    assert node_supports_equalization(node, modules)
+
+    # Locate the following nn.ReLU or F.relu node if it exists
+    maybe_relu_node = maybe_get_next_module(node, modules, nn.ReLU)
+    if maybe_relu_node is None:
+        maybe_relu_node = maybe_get_next_module(node, modules, target_functional_type=F.relu)
+
+    # Locate the following output observer if it exists.
+    # We will skip the relu node if it exists.
+    maybe_obs_node = (
+        maybe_get_next_module(node, modules, ObserverBase)
+        if maybe_relu_node is None
+        else maybe_get_next_module(maybe_relu_node, modules, ObserverBase)
+    )
+    if maybe_obs_node is None:
+        return None
+
+    maybe_eq_obs_node = maybe_get_next_module(maybe_obs_node, modules, _InputEqualizationObserver)
+    if maybe_eq_obs_node is None:
+        return None
+
+    maybe_eq_obs = modules[str(maybe_eq_obs_node)]
+    assert isinstance(maybe_eq_obs, _InputEqualizationObserver)
+    return maybe_eq_obs
+
+def maybe_get_next_equalization_scale(node: Node, modules: Dict[str, nn.Module]) -> Optional[torch.Tensor]:
+    """ If the next next node is an InputEqualizationObserver then we want to
+    return its equalization scale, else we return 1
+
+    This is used in the case where there are two connecting linear layers:
+        linear1 -> LinearOutObs -> InputEqObs -> linear2
+    In this case, the node given is linear1 and we want to locate the InputEqObs.
+    """
+    next_inp_eq_obs = maybe_get_next_input_eq_obs(node, modules)
+    if next_inp_eq_obs:
+        if next_inp_eq_obs.equalization_scale.nelement() == 1 and \
+           next_inp_eq_obs.equalization_scale == torch.tensor(1):
+            return None
+        return next_inp_eq_obs.equalization_scale
+    return None
+
+def scale_input_observer(node: Node, modules: Dict[str, nn.Module]) -> None:
+    """ Scales the following input quantization observer's min/max values by
+    updating the values with the scaled min/max values calculated by the input
+    equalization observer
+    """
+    input_eq_obs = modules[str(node.target)]
+    assert isinstance(input_eq_obs, _InputEqualizationObserver)
+
+    input_quant_obs_node = node.args[0]
+    assert isinstance(input_quant_obs_node, Node)
+
+    input_quant_obs = modules[str(input_quant_obs_node.target)]
+    if not isinstance(input_quant_obs, ObserverBase):
+        return
+
+    min_input_scaled, max_input_scaled = input_eq_obs.calculate_scaled_minmax()
+    if min_input_scaled is None and max_input_scaled is None:
+        return
+    input_quant_obs.min_val = min_input_scaled
+    input_quant_obs.max_val = max_input_scaled
+
+def scale_weight_node(
+    node: Node,
+    modules: Dict[str, nn.Module],
+    equalization_scale: torch.Tensor,
+    next_equalization_scale: Optional[torch.Tensor],
+) -> None:
+    """ Scale the weights for input-weight equalization by multiplying the
+    weight by 1/equalization_scale and next_equalization_scale
+
+    Args:
+        node: Current node whose weights we want to scale
+        equalization_scale: Current node's calculated equalization scale
+        next_equalization_scale: Next node's calculated equalization scale if
+           the following node needs to be equalized, None otherwise
+    """
+    if equalization_scale is None:
+        return
+
+    if fused_module_supports_equalization(modules[str(node.target)]):
+        op_module = modules[str(node.target)][0]    # type: ignore[index]
+    else:
+        op_module = modules[str(node.target)]
+    assert nn_module_supports_equalization(op_module) or custom_module_supports_equalization(op_module)
+
+    # Scale the weights for input-weight equalization
+    # If the following layer needs to be equalized then we will multiply its scale
+    weight = op_module.weight
+    assert isinstance(weight, torch.Tensor)
+
+    # Scale the weights by the reciprocal of the equalization scale
+    # Reshape the equalization scale so that we can multiply it to the weight along axis=1
+    equalization_scale_reshaped = reshape_scale(equalization_scale, 1, weight)
+    scaled_weight = torch.mul(weight, torch.reciprocal(equalization_scale_reshaped))
+
+    if next_equalization_scale is None:
+        op_module.weight = nn.Parameter(scaled_weight)
+        return
+
+    # Multiply the weights row wise by the next equalization scale
+    # Reshape the equalization scale so that we can multiply it to the weight along axis=0
+    next_equalization_scale_reshaped = reshape_scale(next_equalization_scale, 0, weight)
+    scaled_weight = torch.mul(scaled_weight, next_equalization_scale_reshaped)
+
+    op_module.weight = nn.Parameter(scaled_weight)
+
+    # Multiply the bias element wise by the next equalization scale
+    bias = op_module.bias
+    if bias is None:
+        return
+    assert isinstance(bias, torch.Tensor)
+
+    # Reshape the equalization scale so that we can multiply it element-wise to the bias
+    next_equalization_scale_reshaped = reshape_scale(next_equalization_scale, 0, bias)
+    scaled_bias = torch.mul(bias, next_equalization_scale_reshaped)
+    op_module.bias = nn.Parameter(scaled_bias)
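+
+# In matrix terms (an illustrative summary of scale_weight_node above): with the
+# current scale s and the next layer's scale s_next, the weight columns are
+# scaled by 1/s and the weight rows by s_next, i.e.
+#     W' = diag(s_next) @ W @ diag(1/s)    and    b' = s_next * b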
+
+def scale_weight_functional(
+    op_node: Node,
+    model: GraphModule,
+    modules: Dict[str, nn.Module],
+    equalization_scale: torch.Tensor,
+    next_equalization_scale: Optional[torch.Tensor],
+) -> None:
+    """ Scales the weight value for functional layers
+    """
+    if equalization_scale is None:
+        return
+
+    # From the given op_node, the path looks like:
+    #   get_attr(weight) -> weight_quant_obs -> weight_eq_obs -> op_node
+    # So we want to trace back from the op_node to get the equalization observer
+    # node, then the quantization observer node, and then finally the weight
+    # node which contains the weight values.
+
+    # Get the equalization observer node
+    weight_eq_obs_node = maybe_get_weight_eq_obs_node(op_node, modules)
+    if weight_eq_obs_node is None:
+        return
+
+    # Get the quantization observer node
+    weight_quant_obs_node = weight_eq_obs_node.args[0]
+    if weight_quant_obs_node is None:
+        return
+    assert (isinstance(weight_quant_obs_node, Node) and
+           isinstance(modules[str(weight_quant_obs_node.target)], ObserverBase))
+
+    # Get the get_attr(weight) node
+    weight_node = weight_quant_obs_node.args[0]
+    if weight_node is None:
+        return
+    assert isinstance(weight_node, Node) and weight_node.op == 'get_attr'
+
+    weight_parent_name, weight_name = _parent_name(weight_node.target)
+    weight = getattr(modules[weight_parent_name], weight_name)
+
+    # Scale the weights for input-weight equalization
+    # If the following layer needs to be equalized then we will multiply its scale
+    # Reshape the equalization scale so that we can multiply it to the weight along axis=1
+    equalization_scale_reshaped = reshape_scale(equalization_scale, 1, weight)
+    scaled_weight = torch.mul(weight, torch.reciprocal(equalization_scale_reshaped))
+
+    if next_equalization_scale is None:
+        setattr(modules[weight_parent_name], weight_name, scaled_weight)
+        return
+
+    # Multiply the weights row wise by the next equalization scale
+    # Reshape the equalization scale so that we can multiply it to the weight along axis=0
+    next_equalization_scale_reshaped = reshape_scale(next_equalization_scale, 0, scaled_weight)
+    scaled_weight = torch.mul(scaled_weight, next_equalization_scale_reshaped)
+
+    setattr(modules[weight_parent_name], weight_name, scaled_weight)
+    assert torch.allclose(model.get_buffer(str(weight_node.target)), scaled_weight)
+
+    # Multiply the bias element wise by the next equalization scale
+    bias_node = None
+    for node in op_node.args:
+        # Find the node containing the weight values
+        if isinstance(node, Node) and node.op == 'get_attr' and 'bias' in node.name:
+            bias_node = node
+            break
+    if bias_node is None:
+        return
+
+    bias_parent_name, bias_name = _parent_name(bias_node.target)
+    bias = getattr(modules[bias_parent_name], bias_name)
+
+    # Reshape the equalization scale so that we can multiply it element-wise to the bias
+    next_equalization_scale_reshaped = reshape_scale(next_equalization_scale, 0, bias)
+    scaled_bias = torch.mul(bias, next_equalization_scale_reshaped)
+    setattr(modules[bias_parent_name], bias_name, scaled_bias)
+
+def clear_weight_quant_obs_node(op_node: Node, modules: Dict[str, nn.Module]) -> None:
+    """ Given the operation node, we want find the corresponding quantization
+    observer and reset its min/max values
+    """
+    weight_eq_obs_node = maybe_get_weight_eq_obs_node(op_node, modules)
+    if weight_eq_obs_node is None:
+        return
+
+    weight_quant_obs_node = weight_eq_obs_node.args[0]
+    if weight_quant_obs_node is None:
+        return
+    assert isinstance(weight_quant_obs_node, Node)
+
+    weight_quant_obs = modules[str(weight_quant_obs_node.target)]
+    assert isinstance(modules[str(weight_quant_obs_node.target)], ObserverBase)
+    weight_quant_obs.reset_min_max_vals()   # type: ignore[operator]
+
+def remove_node(model: GraphModule, node: Node, prev_node: Node):
+    """ Removes the given node from the model by replacing all of its users with
+    the given previous node
+    """
+    # For all of the current node's users, replace the current node with
+    # the input quantization observer node
+    orig_users = list(node.users.keys())
+    for user_node in orig_users:
+        user_node.replace_input_with(node, prev_node)
+
+    # Erase the InputEqualizationObserver node
+    model.graph.erase_node(node)
+
+def update_obs_for_equalization(model: GraphModule, modules: Dict[str, nn.Module]) -> Dict[str, _WeightEqualizationObserver]:
+    """ Update all of the observer's equalization scale. For each
+    InputEqualizationObserver, we will find the location of the next
+    WeightEqualizationObserver, create it, and calculate the equalization scale
+    based on the two observers.
+
+    We will then return a dictionary mapping operation node names to
+    the corresponding WeightEqualizationObservers for that operation.
+    """
+    weight_eq_obs_dict = {}
+    for node in model.graph.nodes:
+        if node.op == 'call_module' and isinstance(modules[node.target], _InputEqualizationObserver):
+            input_eq_obs = modules[node.target]
+            assert isinstance(input_eq_obs, _InputEqualizationObserver)
+            op_node, weight_eq_obs = get_op_node_and_weight_eq_obs(node, model, modules)
+
+            if op_node is None or weight_eq_obs is None:
+                continue
+
+            if op_node.op == 'call_module':
+                # Calibrate the weight equalization observer since it has just
+                # been created
+                if fused_module_supports_equalization(modules[str(op_node.target)]):
+                    module = modules[str(op_node.target)][0]   # type: ignore[index]
+                    assert nn_module_supports_equalization(module)
+                    weight_eq_obs(module.weight)
+                else:
+                    weight_eq_obs(modules[str(op_node.target)].weight)
+
+            # Calculate and set the equalization scale values
+            equalization_scale = calculate_equalization_scale(input_eq_obs, weight_eq_obs)
+            input_eq_obs.set_equalization_scale(equalization_scale)
+            weight_eq_obs.set_equalization_scale(equalization_scale)
+
+            weight_eq_obs_dict[op_node.name] = weight_eq_obs
+
+    return weight_eq_obs_dict
+
+def convert_eq_obs(
+    model: GraphModule,
+    modules: Dict[str, nn.Module],
+    weight_eq_obs_dict: Dict[str, _WeightEqualizationObserver],
+) -> None:
+    """ Converts the equalization operations and updates the other nodes in the
+    following way:
+        - Removes the input equalization observers and inserts a mul operator
+          along with an equalization scale node wherever applicable (we do not
+          want to insert a mul operator between connecting linear layers).
+        - Updates the input quantization observers with the scaled input min/max
+          values.
+        - Scales the weights by the current and next equalization scales.
+        - Removes the weight equalization observer node if it exists.
+
+    Before (after prepare):
+                                    weight values
+                                          |
+                                    WeightQuantObs
+                                          |
+                                      WeightEqObs
+                                          |
+        x -> InpQuantObs -> InpEqObs -> linear -> OutQuantObs
+
+    After this function:
+                                              scaled weight values
+                                                      |
+       equalization scale                       WeightQuantObs
+              |                                       |
+        x -> mul -> InpQuantObs (scaled min/max) -> linear -> OutQuantObs
+
+    After convert:
+       equalization scale                 scaled weight values
+              |                                    |
+        x -> mul -> quantize_per_tensor -> quantized::linear
+
+    Note that although the equalization observer appeared after the quantization
+    observer after prepare_fx, the mul node appears before the quantization node
+    after convert_fx. This is because placing the equalization observer after
+    the quantization observer in prepare_fx allows us to keep the invariant
+    that the graph preceding the current node is not modified when the current
+    node inserts its observers.
+
+    Having the equalization observer before the quantization observer would also
+    cause some inconsistencies between the ordering of the quantization and
+    equalization observers.
+    For example, a single linear layer would look like:
+        x -> InpEqObs1 -> InpQuantObs1 -> linear1 -> OutQuantObs1
+    But between two connected linear layers, it would look like:
+        linear1 -> OutQuantObs1 -> InpEqObs2 -> linear2 -> OutQuantObs2
+    """
+    for node in model.graph.nodes:
+        if node.op == 'call_module' and isinstance(modules[node.target], _InputEqualizationObserver):
+            inp_quant_obs_node = node.args[0]
+            prev_node = inp_quant_obs_node.args[0]
+
+            # If the previous node is a layer that needs to be equalized, then
+            # we will remove the current node because we do not need to add any
+            # equalization nodes between two layers that need to be equalized
+
+            # Before: linear1/relu (prev_node) -> output_quant_obs1 (inp_quant_obs_node) -> input_eq_obs2 (node) -> linear2
+            # After: linear1/relu (prev_node) -> output_quant_obs1 (inp_quant_obs_node) -> linear2
+            if node_supports_equalization(prev_node, modules) or "relu" in prev_node.name:
+                remove_node(model, node, inp_quant_obs_node)
+                continue
+
+            # Update the following input quantization observer's min/max values
+            scale_input_observer(node, modules)
+
+            # Remove the InputEqualization node and add a mul operator before
+            # the quantization observer node that appears before the equalization node
+            # Before: x -> input_quant_obs -> input_eq_obs -> linear
+            # After: x -> mul -> input_quant_obs -> linear
+
+            # Create a node containing the equalization scale
+            with model.graph.inserting_before(inp_quant_obs_node):
+                get_new_eq_scale_name = get_new_attr_name_with_prefix(prev_node.name + '_equalization_scale')
+                name = get_new_eq_scale_name(modules)
+                setattr(model, name, modules[node.target].equalization_scale)
+                eq_scale_node = model.graph.create_node('get_attr', name)
+
+            # Create a node multiplying the input with the equalization scale
+            with model.graph.inserting_after(eq_scale_node):
+                inputs = (prev_node, eq_scale_node)
+                mul_node = model.graph.create_node("call_function", torch.mul, inputs)
+
+            # Set the mul node to be the inp_quant_obs_node's input instead of
+            # the previous node
+            inp_quant_obs_node.replace_input_with(prev_node, mul_node)
+            remove_node(model, node, inp_quant_obs_node)
+
+        elif weight_eq_obs_dict.get(node.name, None) is not None:
+            weight_eq_obs = weight_eq_obs_dict.get(node.name)
+            assert isinstance(weight_eq_obs, _WeightEqualizationObserver)
+            equalization_scale = weight_eq_obs.equalization_scale
+
+            if equalization_scale.nelement() == 1 and equalization_scale == torch.tensor(1):
+                equalization_scale = None  # type: ignore[assignment]
+            maybe_next_equalization_scale = maybe_get_next_equalization_scale(node, modules)
+
+            # Scale the weight nodes
+            if node.op == 'call_module':
+                scale_weight_node(node, modules, equalization_scale, maybe_next_equalization_scale)
+            elif node.op == 'call_function':
+                scale_weight_functional(node, model, modules, equalization_scale, maybe_next_equalization_scale)
+
+                weight_eq_obs_node = maybe_get_weight_eq_obs_node(node, modules)
+                if weight_eq_obs_node is None:
+                    return
+                assert isinstance(modules[str(weight_eq_obs_node.target)], _WeightEqualizationObserver)
+
+                # Clear the quantization observer's min/max values so that they
+                # can get updated later based on the new scale values
+                clear_weight_quant_obs_node(node, modules)
+
+                # Erase the weight equalization observer node
+                prev_node = weight_eq_obs_node.args[0]
+                remove_node(model, weight_eq_obs_node, prev_node)
+            else:
+                raise ValueError("Expected operation node to be 'call_module' or 'call_function" +
+                                 f"Instead got node {node.name} as '{node.op}'.")
+
+def _convert_equalization_ref(model: GraphModule):
+    """ Reference function which applies changes needed for equalization, but
+    does not quantize the nodes
+    """
+    modules = dict(model.named_modules(remove_duplicate=False))
+
+    # Calculate the equalization scale, update the observers with the scaled
+    # inputs, and scale the weight
+    weight_eq_obs_dict = update_obs_for_equalization(model, modules)
+    convert_eq_obs(model, modules, weight_eq_obs_dict)
+
+    return GraphModule(model, model.graph)
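+
+# Rough end-to-end sketch (assumptions: the standard FX flow via
+# torch.ao.quantization.quantize_fx.prepare_fx and its private
+# `_equalization_config` argument, whose exact name and accepted format may
+# differ across releases; `float_model`, `qconfig_mapping`, `example_inputs`
+# and `calibration_batch` are placeholders):
+#
+#     from torch.ao.quantization.quantize_fx import prepare_fx
+#     prepared = prepare_fx(float_model, qconfig_mapping, example_inputs,
+#                           _equalization_config={"": default_equalization_qconfig})
+#     prepared(calibration_batch)                      # calibrate the observers
+#     equalized = _convert_equalization_ref(prepared)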
+
+
+###############################################################################
+# Functions for running the equalized model on the Numeric Suite              #
+###############################################################################
+
+def get_layer_sqnr_dict(model_a: nn.Module, model_b: nn.Module, x: torch.Tensor) -> Dict[str, float]:
+    """ Runs the Numeric Suite on model_a and model_b and returns a dictionary
+    containing the SQNR between layers in model_a and model_b.
+
+    Note: In order to support equalized models, this function has a hacky fix in
+    which we do not match any torch.mul operators. This is because equalized
+    models contain extra mul operators to scale the input by the equalization
+    scale, but this edge case has not been resolved yet within the numeric suite code.
+
+    Args:
+        model_a: A float model
+        model_b: A quantized model
+        x: Inputs to use during calibration
+    """
+    import torch.ao.ns._numeric_suite_fx as ns
+    from torch.ao.ns.fx.mappings import get_unmatchable_types_map
+
+    unmatchable_types_map = get_unmatchable_types_map()
+    unmatchable_types_map["funs_unmatchable"].add(torch.mul)
+
+    model_a_ns, model_b_ns = ns.add_loggers(
+        'fp32', model_a,
+        'int8', model_b,
+        ns.OutputLogger,
+        unmatchable_types_map=unmatchable_types_map
+    )
+
+    model_a_ns(x)
+    model_b_ns(x)
+
+    activation_comparison_dict = ns.extract_logger_info(
+        model_a_ns,
+        model_b_ns,
+        ns.OutputLogger,
+        'int8')
+    ns.extend_logger_results_with_comparison(
+        activation_comparison_dict,
+        'fp32', 'int8',
+        torch.ao.ns.fx.utils.compute_sqnr, 'sqnr'
+    )
+
+    # Construct a dictionary mapping layer names to the SQNR values
+    layer_sqnr_dict = {}
+    for key in activation_comparison_dict:
+        layer = activation_comparison_dict[key]['node_output']['int8'][0]['fqn']
+        sqnr = activation_comparison_dict[key]['node_output']['int8'][0]['sqnr'][0]
+        layer_sqnr_dict[layer] = sqnr
+
+    return layer_sqnr_dict
+
+def get_equalization_qconfig_dict(
+    layer_sqnr_dict: Dict[str, float],
+    num_layers_to_equalize: int
+) -> Any:
+    """ Given the layer to SQNR dictionary, find the layers with the highest
+    quantization errors, and return an equalization_qconfig_dict
+    specifying to only equalize those top layers.
+
+    Args:
+        layer_sqnr_dict: Dictionary mapping layer names to SQNR values (found
+            when comparing an equalized model against a float model)
+        num_layers_to_equalize: Number of layers with the highest quantization
+           errors to equalize
+    """
+
+    # Sort the layer_sqnr_dictionary values and get the layers with the lowest
+    # SQNR values (aka highest quantization errors)
+    layer_sqnr_sorted = sorted(layer_sqnr_dict.items(), key=lambda item: item[1])
+    layers_to_equalize = layer_sqnr_sorted[:num_layers_to_equalize]
+
+    # Constructs an equalization_qconfig_dict that specifies to only equalize
+    # the layers with the highest quantization errors
+    module_to_qconfig_list = [(item[0], default_equalization_qconfig) for item in layers_to_equalize]
+    equalization_qconfig_dict = {"module_name": module_to_qconfig_list}
+    return equalization_qconfig_dict
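+
+# Usage sketch (illustrative; `float_model`, `quantized_model` and `x` are
+# placeholders for models/data prepared elsewhere): equalize only the two
+# layers with the worst SQNR, then re-run quantization with the returned
+# config.
+#
+#     layer_sqnr_dict = get_layer_sqnr_dict(float_model, quantized_model, x)
+#     eq_qconfig_dict = get_equalization_qconfig_dict(layer_sqnr_dict, num_layers_to_equalize=2)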
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/_lower_to_native_backend.py b/MLPY/Lib/site-packages/torch/ao/quantization/fx/_lower_to_native_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0144595cd1c4f392d99991e47a4af7082bcd83b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/fx/_lower_to_native_backend.py
@@ -0,0 +1,1170 @@
+import torch
+from torch.fx import map_arg, Node
+from torch.fx.graph import Graph
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.ao.nn.intrinsic as nni
+import torch.ao.nn.intrinsic.quantized as nniq
+import torch.ao.nn.intrinsic.quantized.dynamic as nniqd
+import torch.ao.nn.quantized as nnq
+import torch.ao.nn.quantized.dynamic as nnqd
+import torch.ao.nn.quantized.reference as nnqr
+from torch.ao.nn.quantized.modules.utils import WeightedQuantizedModule
+from torch.fx import GraphModule
+from .utils import (
+    collect_producer_nodes,
+    get_linear_prepack_op_for_dtype,
+    get_new_attr_name_with_prefix,
+    get_qconv_prepack_op,
+    graph_module_from_producer_nodes,
+)
+from ..utils import _parent_name
+from ..qconfig import QConfigAny
+from ..quantization_mappings import get_quantized_operator
+from .utils import create_node_from_old_node_preserve_meta
+from typing import Dict, Tuple, Type, List, Callable, Any, Union, Set, Optional
+import operator
+
+QOP_TO_ARG_NAMES_TO_SKIP = {
+    torch._ops.ops.quantized.hardswish: ['inplace'],
+    torch._ops.ops.quantized.elu: ['inplace'],
+    torch._ops.ops.quantized.dropout: ['inplace'],
+    torch._ops.ops.quantized.instance_norm:
+    ['running_mean', 'running_var', 'use_input_stats', 'momentum'],
+}
+
+def _is_node_in_list(node, modules, func_list, method_list, module_type_list):
+    is_call_function = node.op == "call_function" and node.target in func_list
+    is_call_method = node.op == "call_method" and node.target in method_list
+    is_call_module = node.op == "call_module" and type(modules[str(node.target)]) in module_type_list
+    return is_call_function, is_call_method, is_call_module
+
+def is_fixed_qparams_node(node, modules):
+    func_list = [
+        torch.nn.functional.hardsigmoid,
+        torch.nn.functional.sigmoid,
+        torch.sigmoid,
+        torch.tanh,
+    ]
+    method_list = [
+        "hardsigmoid",
+        "hardsigmoid_",
+        "sigmoid",
+        "sigmoid_",
+        "tanh",
+        "tanh_",
+    ]
+    module_type_list = [
+        torch.nn.Hardsigmoid,
+        torch.nn.Sigmoid,
+        torch.nn.Tanh,
+        torch.nn.Softmax,
+    ]
+    return _is_node_in_list(node, modules, func_list, method_list, module_type_list)
+
+def is_default_node(node, modules):
+    func_list = [
+        torch.nn.functional.elu,
+        torch.nn.functional.hardswish,
+        torch.nn.functional.instance_norm,
+        torch.nn.functional.layer_norm,
+        torch.nn.functional.leaky_relu,
+        torch.nn.functional.dropout,
+    ]
+    method_list: List[Any] = []
+    module_type_list = [
+        nnqr.ConvTranspose1d,
+        nnqr.ConvTranspose2d,
+        nnqr.ConvTranspose3d,
+        torch.nn.ELU,
+        torch.nn.LeakyReLU,
+        torch.nn.Hardswish,
+        torch.nn.InstanceNorm1d,
+        torch.nn.InstanceNorm2d,
+        torch.nn.InstanceNorm3d,
+        torch.nn.LayerNorm,
+        torch.nn.Dropout,
+        torch.nn.PReLU,
+        torch.nn.BatchNorm2d,
+        torch.nn.BatchNorm3d,
+        torch.ao.nn.intrinsic.BNReLU2d,
+        torch.ao.nn.intrinsic.BNReLU3d,
+    ]
+    return _is_node_in_list(node, modules, func_list, method_list, module_type_list)
+
+def is_copy_node(node, modules):
+    func_list = [
+        torch.adaptive_avg_pool1d,
+        torch.nn.functional.adaptive_avg_pool2d,
+        torch.nn.functional.adaptive_avg_pool3d,
+        torch.nn.functional.hardtanh,
+        torch.nn.functional.hardtanh_,
+        torch.nn.functional.interpolate,
+        torch.nn.functional.max_pool1d,
+        torch.nn.functional.max_pool2d,
+        torch.nn.functional.max_pool3d,
+        torch.nn.functional.relu,
+        torch.nn.functional.relu6,
+        torch.avg_pool1d,
+        torch._C._nn.avg_pool2d,
+        torch._C._nn.avg_pool3d,
+        torch.clamp,
+        torch.flatten,
+        torch.mean,
+        operator.floordiv,
+        # F.channel_shuffle and torch.channel_shuffle are essentially the same thing
+        # so we only need to put one of them here
+        torch.channel_shuffle,
+    ]
+    method_list = [
+        "clamp",
+        "mean",
+        "relu",
+        "relu_",
+    ]
+    module_type_list = [
+        torch.nn.AdaptiveAvgPool1d,
+        torch.nn.AdaptiveAvgPool2d,
+        torch.nn.AdaptiveAvgPool3d,
+        torch.nn.AvgPool1d,
+        torch.nn.AvgPool2d,
+        torch.nn.AvgPool3d,
+        torch.nn.Hardtanh,
+        torch.nn.MaxPool1d,
+        torch.nn.MaxPool2d,
+        torch.nn.MaxPool3d,
+        torch.nn.ReLU,
+        torch.nn.ReLU6,
+        torch.nn.ChannelShuffle,
+    ]
+    return _is_node_in_list(node, modules, func_list, method_list, module_type_list)
+
+def is_general_tensor_shape_node(node, modules):
+    func_list = [
+        torch.narrow,
+        torch.transpose,
+        torch.repeat_interleave,
+        torch.squeeze,
+        torch.stack,
+        torch.unsqueeze,
+        torch.nn.functional.pixel_shuffle,
+        torch.nn.functional.pixel_unshuffle,
+    ]
+    method_list = [
+        "contiguous",
+        "detach",
+        "detach_",
+        "permute",
+        "repeat",
+        "repeat_interleave",
+        "reshape",
+        "resize_",
+        "shape",
+        "size",
+        "squeeze",
+        "squeeze_",
+        "transpose",
+        "unsqueeze",
+        "unsqueeze_",
+        "view",
+    ]
+    module_type_list = [
+        torch.nn.Identity,
+        torch.nn.PixelShuffle,
+        torch.nn.PixelUnshuffle,
+    ]
+    return _is_node_in_list(node, modules, func_list, method_list, module_type_list)
+
+def is_other_node(node, modules):
+    func_list = [
+        torch.cat,
+    ]
+    method_list: List[Any] = []
+    module_type_list: List[Any] = []
+    return _is_node_in_list(node, modules, func_list, method_list, module_type_list)
+
+def is_special_pattern_node(node, modules):
+    res_function, res_method, res_module = False, False, False
+    for checker in [is_fixed_qparams_node, is_default_node, is_copy_node, is_general_tensor_shape_node, is_other_node]:
+        is_call_function, is_call_method, is_call_module = checker(node, modules)
+        res_function = res_function or is_call_function
+        res_method = res_method or is_call_method
+        res_module = res_module or is_call_module
+    return res_function, res_method, res_module
+
+def is_dequantize_node(node):
+    return isinstance(node, Node) and node.op == "call_method" and node.target == "dequantize"
+
+def is_getattr_tensor_metadata_node(node):
+    return node.op == "call_function" and \
+        node.target == getattr and \
+        node.args[1] in ["shape"]
+
+def is_get_tensor_info_node(node):
+    return node.op == "call_method" and \
+        node.target in ["shape", "size"]
+
+def should_skip_lowering(op: torch.fx.node.Node, qconfig_map: Dict[str, QConfigAny]):
+    """
+    Return True if the op is configured with a None qconfig, False otherwise.
+    Note: we may need to generalize this to also check the dtype and only
+    lower when the dtype matches, but right now fbgemm/qnnpack only support
+    a single dtype, so it is OK for now.
+    """
+    return op.name in qconfig_map and qconfig_map[op.name] is None
+
+# Mapping from reference module class to the replacement static quantized module class for lowering
+STATIC_LOWER_MODULE_MAP: Dict[Type[nn.Module], Type[WeightedQuantizedModule]] = {
+    nnqr.Linear: nnq.Linear,
+    nnqr.Conv1d: nnq.Conv1d,
+    nnqr.Conv2d: nnq.Conv2d,
+    nnqr.Conv3d: nnq.Conv3d,
+}
+
+# Mapping from reference module class to the replacement dynamic quantized module class for lowering
+DYNAMIC_LOWER_MODULE_MAP: Dict[Type[nn.Module], Type[nn.Module]] = {
+    nnqr.Linear: nnqd.Linear,
+    nnqr.GRUCell: nnqd.GRUCell,
+    nnqr.LSTMCell: nnqd.LSTMCell,
+    nnqr.RNNCell: nnqd.RNNCell,
+    nnqr.LSTM: nnqd.LSTM,
+    nnqr.GRU: nnqd.GRU,
+}
+
+# Mapping from reference module class to the replacement weight only quantized module class for lowering
+# TODO: correct the namespace for these modules
+WEIGHT_ONLY_LOWER_MODULE_MAP: Dict[Type[nn.Module], Type[nn.Module]] = {
+    nnqr.Embedding: nnq.Embedding,
+    nnqr.EmbeddingBag: nnq.EmbeddingBag,
+}
+
+# TODO: merge with STATIC_LOWER_MODULE_MAP after we merge
+# _lower_static_weighted_ref_module and special_pattern_replacement
+SPECIAL_PATTERN_LOWER_MODULE_MAP = {
+    nn.BatchNorm2d: nnq.BatchNorm2d,
+    nn.BatchNorm3d: nnq.BatchNorm3d,
+    nnqr.ConvTranspose1d: nnq.ConvTranspose1d,
+    nnqr.ConvTranspose2d: nnq.ConvTranspose2d,
+    nnqr.ConvTranspose3d: nnq.ConvTranspose3d,
+    nn.ELU: nnq.ELU,
+    nn.LeakyReLU: nnq.LeakyReLU,
+    nn.Hardswish: nnq.Hardswish,
+    nn.InstanceNorm1d: nnq.InstanceNorm1d,
+    nn.InstanceNorm2d: nnq.InstanceNorm2d,
+    nn.InstanceNorm3d: nnq.InstanceNorm3d,
+    nn.LayerNorm: nnq.LayerNorm,
+    nn.Dropout: nnq.Dropout,
+    nn.Softmax: nnq.Softmax,
+    nn.PReLU: nnq.PReLU,
+    nni.BNReLU2d: nniq.BNReLU2d,
+    nni.BNReLU3d: nniq.BNReLU3d,
+}
+
+# Mapping from fused module class to a 2-tuple of:
+#   1) The inner reference module class
+#   2) The replacement static quantized module class for lowering
+STATIC_LOWER_FUSED_MODULE_MAP: Dict[Type[nn.Module], Tuple[Type[nn.Module], Type[WeightedQuantizedModule]]] = {
+    nni.LinearReLU: (nnqr.Linear, nniq.LinearReLU),
+    # TODO: LinearLeakyReLU is registered as global but it is only fused and
+    # lowered when onednn's backend config is used. We may need to separate
+    # registration and lowering functions for different backends in the future.
+    nni.LinearLeakyReLU: (nnqr.Linear, nniq.LinearLeakyReLU),
+    nni.LinearTanh: (nnqr.Linear, nniq.LinearTanh),
+    nni.ConvReLU1d: (nnqr.Conv1d, nniq.ConvReLU1d),
+    nni.ConvReLU2d: (nnqr.Conv2d, nniq.ConvReLU2d),
+    nni.ConvReLU3d: (nnqr.Conv3d, nniq.ConvReLU3d),
+}
+
+# The difference between STATIC_LOWER_FUSED_MODULE_TWO_INPUTS_MAP and STATIC_LOWER_FUSED_MODULE_MAP:
+# The reference node inside STATIC_LOWER_FUSED_MODULE_TWO_INPUTS_MAP has 2 inputs.
+# Mapping from fused module class to a 2-tuple of:
+#   1) The inner reference module class
+#   2) The replacement static quantized module class for lowering
+STATIC_LOWER_FUSED_MODULE_TWO_INPUTS_MAP: Dict[Type[nn.Module], Tuple[Type[nn.Module], Type[WeightedQuantizedModule]]] = {
+    nni.ConvAdd2d: (nnqr.Conv2d, nniq.ConvAdd2d),
+    nni.ConvAddReLU2d: (nnqr.Conv2d, nniq.ConvAddReLU2d),
+}
+
+# Mapping from fused module class to a 2-tuple of:
+#   1) The inner reference module class
+#   2) The replacement dynamic quantized module class for lowering
+DYNAMIC_LOWER_FUSED_MODULE_MAP: Dict[Type[nn.Module], Tuple[Type[nn.Module], Type[nn.Module]]] = {
+    nni.LinearReLU: (nnqr.Linear, nniqd.LinearReLU),
+}
+
+# Mapping from a functional to lower to a 2-tuple of
+#   1) The quantized version of the op
+#   2) The quantized version of the op fused with relu, if it exists, else None
+STATIC_LOWER_FUNCTIONAL_MAP: Dict[Callable, Tuple[Callable, Optional[Callable]]] = {
+    F.linear: (torch.ops.quantized.linear, torch.ops.quantized.linear_relu),
+    F.conv1d: (torch.ops.quantized.conv1d, torch.ops.quantized.conv1d_relu),
+    F.conv2d: (torch.ops.quantized.conv2d, torch.ops.quantized.conv2d_relu),
+    F.conv3d: (torch.ops.quantized.conv3d, torch.ops.quantized.conv3d_relu),
+    F.conv_transpose1d: (torch.ops.quantized.conv_transpose1d, None),
+    F.conv_transpose2d: (torch.ops.quantized.conv_transpose2d, None),
+    F.conv_transpose3d: (torch.ops.quantized.conv_transpose3d, None),
+}
+
+WEIGHT_PREPACK_OPS: Set[Callable] = {
+    torch._ops.ops.quantized.linear_prepack,
+    torch._ops.ops.quantized.linear_prepack_fp16,
+    torch._ops.ops.quantized.conv1d_prepack,
+    torch._ops.ops.quantized.conv2d_prepack,
+    torch._ops.ops.quantized.conv3d_prepack,
+    torch.ops.quantized.conv_transpose1d_prepack,
+    torch.ops.quantized.conv_transpose2d_prepack,
+    torch.ops.quantized.conv_transpose3d_prepack,
+}
+
+# Mapping from a functional to a dictionary, where the key is a 2-tuple of
+# (input_activation_dtype, weight_dtype) and the value is a 2-tuple of
+#   1) The dynamically quantized version of the op
+#   2) The dynamically quantized version of the op fused with relu, if it exists, else None
+DYNAMIC_LOWER_FUNCTIONAL_MAP: Dict[Callable, Dict[Tuple[torch.dtype, torch.dtype], Tuple[Callable, Optional[Callable]]]] = {
+    F.linear: {
+        (torch.quint8, torch.qint8): (torch.ops.quantized.linear_dynamic,
+                                      torch.ops.quantized.linear_relu_dynamic),
+        (torch.float16, torch.float16): (torch.ops.quantized.linear_dynamic_fp16,
+                                         torch.ops.quantized.linear_relu_dynamic_fp16)
+    },
+    # dynamic conv + relu is not available yet
+    F.conv1d: {
+        (torch.quint8, torch.qint8): (torch.ops.quantized.conv1d_dynamic, None),
+    },
+    F.conv2d: {
+        (torch.quint8, torch.qint8): (torch.ops.quantized.conv2d_dynamic, None),
+    },
+    F.conv3d: {
+        (torch.quint8, torch.qint8): (torch.ops.quantized.conv3d_dynamic, None),
+    },
+}
+
+CONV_FUNCTIONAL_OPS: Set[Callable] = {
+    F.conv1d,
+    F.conv2d,
+    F.conv3d,
+}
+
+CONV_TRANSPOSE_FUNCTIONAL_OPS: Set[Callable] = {
+    F.conv_transpose1d,
+    F.conv_transpose2d,
+    F.conv_transpose3d,
+}
+
+# TODO: add tests for lowering these ops
+QBIN_OP_MAPPING: Dict[Union[Callable, str], Callable] = {
+    operator.add: torch.ops.quantized.add,
+    torch.add: torch.ops.quantized.add,
+    operator.mul: torch.ops.quantized.mul,
+    operator.matmul: torch.ops.quantized.matmul,
+    torch.mul: torch.ops.quantized.mul,
+    torch.matmul: torch.ops.quantized.matmul,
+}
+QBIN_RELU_OP_MAPPING: Dict[Union[Callable, str], Callable] = {
+    operator.add: torch.ops.quantized.add_relu,
+    torch.add: torch.ops.quantized.add_relu,
+    operator.mul: torch.ops.quantized.mul_relu,
+    torch.mul: torch.ops.quantized.mul_relu,
+}
+
+def _save_packed_weight(self, destination, prefix, keep_vars):
+    for attr_name in dir(self):
+        if "_packed_weight" in attr_name and \
+           isinstance(getattr(self, attr_name), torch._C.ScriptObject):  # type: ignore[attr-defined]
+            packed_weight = getattr(self, attr_name)
+            destination[prefix + attr_name] = packed_weight
+
+def _load_packed_weight(self, state_dict, prefix, local_metadata, strict,
+                        missing_keys, unexpected_keys, error_msgs):
+    attrs_to_pop = []
+    for attr_name in state_dict:
+        if attr_name.startswith("_packed_weight") and isinstance(state_dict[attr_name], torch._C.ScriptObject):  # type: ignore[attr-defined] # noqa: B950
+            setattr(self, attr_name, state_dict[attr_name])
+            attrs_to_pop.append(attr_name)
+
+    # pop the packed param attributes
+    for attr_name in attrs_to_pop:
+        state_dict.pop(attr_name)
+
+def fold_weight(
+    quantized_model: GraphModule,
+    node_name_to_scope: Dict[str, Tuple[str, type]]
+) -> GraphModule:
+    """
+    Trace back from the weight node until we hit getattr, reconstruct the
+    graph module with the traced nodes and run the graph module to pack the
+    weight. Then replace the original chain of ops with the packed weight.
+    """
+    packed_weights = {}
+    # map from folded node name to the prepacked weight name
+    folded_nodes = {}
+    # get packed weights
+    for node in quantized_model.graph.nodes:
+        if node.op == 'call_function' and node.target in WEIGHT_PREPACK_OPS:
+            nodes_to_fold = collect_producer_nodes(node)
+            if nodes_to_fold is not None:
+                for node_to_fold in nodes_to_fold:
+                    folded_nodes[node_to_fold.name] = node
+
+                prepacking_module = graph_module_from_producer_nodes(
+                    quantized_model, nodes_to_fold)
+                packed_weight = prepacking_module()
+                packed_weights[node.name] = packed_weight
+
+    # remove folded nodes and replace the prepacking node with getattr
+    folded_graph = Graph()
+    env: Dict[Any, Any] = {}
+
+    def load_arg(a):
+        return map_arg(a, lambda node: env[node.name])
+
+    for node in quantized_model.graph.nodes:
+        prepack_node = folded_nodes.get(node.name, None)
+        if prepack_node is node:
+            packed_weight = packed_weights[node.name]
+            # add a prepacked attribute to root
+            op_node = next(iter(prepack_node.users))
+            module_path, _ = node_name_to_scope[op_node.name]
+            get_new_packed_weight_name = \
+                get_new_attr_name_with_prefix(module_path + '_packed_weight_')
+            packed_weight_name = get_new_packed_weight_name(quantized_model)
+            setattr(quantized_model, packed_weight_name, packed_weight)
+            # replace prepack node with a getattr node
+            env[node.name] = folded_graph.create_node(
+                'get_attr', packed_weight_name, (), {})
+        elif prepack_node is not None:
+            # remove the folded node
+            continue
+        else:
+            # copy other nodes
+            env[node.name] = folded_graph.node_copy(node, load_arg)
+
+    quantized_model = GraphModule(quantized_model, folded_graph)
+    quantized_model._register_state_dict_hook(_save_packed_weight)
+    quantized_model._register_load_state_dict_pre_hook(_load_packed_weight, with_module=True)
+    return quantized_model
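+
+# Illustrative effect of fold_weight (the exact producer chain and attribute
+# name shown here are assumptions): a weight-prepacking chain such as
+#
+#     get_attr(weight) -> quantize_per_tensor -> quantized.linear_prepack -> quantized.linear
+#
+# is folded into a single attribute lookup of the precomputed packed weight:
+#
+#     get_attr(<module_path>_packed_weight_0) -> quantized.linear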
+
+def _get_module(node: Node, modules: Dict[str, nn.Module]) -> Optional[nn.Module]:
+    """
+    Return the `torch.nn.Module` that corresponds to the specified node's target.
+    If no such module exists, return None.
+    """
+    if node.op == "call_module" and str(node.target) in modules:
+        return modules[str(node.target)]
+    else:
+        return None
+
+def _match_static_pattern(
+    node: Node,
+    modules: Dict[str, nn.Module],
+    qconfig_map: Dict[str, QConfigAny],
+    matching_modules_or_ops: List[Callable],
+    dequantize_node_arg_indices: List[int]
+) -> Union[Tuple[Node, Node, Node], Tuple[None, None, None]]:
+    """
+    Match the pattern (dequantize - ref node - quantize) against the node provided.
+
+    If there is a match, return a 3-tuple of:
+      1) q_node: the quantize node,
+      2) relu_node: a relu node wrapping the ref_node, and
+      3) ref_node: a reference module or functional node to replace with its quantized counterpart
+    Otherwise, if there is no match, return a 3-tuple of (None, None, None).
+
+    Parameters:
+      node: The `torch.fx.Node` to match against.
+      modules: A mapping from node names to modules in the model graph, used for module lookup.
+      qconfig_map: A mapping from node names to the qconfigs associated with the nodes.
+          If the corresponding qconfig for the reference node is None, then return no match.
+      matching_modules_or_ops: Either a list of functions or a list of `torch.nn.Module`s.
+          If the reference node is not in this list, then return no match.
+      dequantize_node_arg_indices: A list of indices in the reference node args where dequantize
+          nodes may be present. An empty list means skipping the check for dequantize nodes.
+    """
+    SKIP_LOWERING_VALUE = (None, None, None)
+
+    # Match quantize node
+    if node.op != "call_function" or node.target != torch.quantize_per_tensor:
+        return SKIP_LOWERING_VALUE
+    q_node = node
+    ref_node = q_node.args[0]
+    assert isinstance(ref_node, Node)
+
+    # Handle cases where the node is wrapped in a ReLU
+    if (ref_node.op == "call_function" and ref_node.target in (F.relu, torch.relu)) or\
+            (ref_node.op == "call_module" and type(_get_module(ref_node, modules)) == nn.ReLU):
+        relu_node = ref_node
+        ref_node = relu_node.args[0]
+        assert isinstance(ref_node, Node)
+    else:
+        relu_node = None
+    if should_skip_lowering(ref_node, qconfig_map):
+        return SKIP_LOWERING_VALUE
+
+    # Match reference module or functional
+    if isinstance(matching_modules_or_ops[0], type) and issubclass(matching_modules_or_ops[0], nn.Module):
+        expected_op = "call_module"
+        match_key = type(_get_module(ref_node, modules))
+    else:
+        expected_op = "call_function"
+        match_key = ref_node.target
+    if ref_node.op != expected_op or match_key not in matching_modules_or_ops:
+        return SKIP_LOWERING_VALUE
+
+    # Match dequantize node(s). Both of the following conditions must pass:
+    # (1) All `torch.fx.Node`s at the matching indices must be a dequantize node
+    # (2) There must be at least one dequantize node
+    matched_dequantize = False
+    for i in dequantize_node_arg_indices:
+        assert i < len(ref_node.args), \
+            f"Dequantize index {i} exceeded reference node's arg length {len(ref_node.args)}"
+        arg = ref_node.args[i]
+        if is_dequantize_node(arg):
+            matched_dequantize = True
+        elif isinstance(arg, Node):
+            return SKIP_LOWERING_VALUE
+    if not matched_dequantize:
+        return SKIP_LOWERING_VALUE
+
+    return (q_node, relu_node, ref_node)
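+
+# Example of a matching subgraph (illustrative): for a reference linear module,
+# the matcher returns (q_node, relu_node, ref_node) for a chain like
+#
+#     dequantize -> nnqr.Linear (ref_node) -> relu (optional) -> quantize_per_tensor (q_node)
+#
+# and (None, None, None) when any part of the pattern is missing or the
+# reference node's qconfig is None.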
+
+def _match_static_pattern_with_two_inputs(
+    node: Node,
+    modules: Dict[str, nn.Module],
+    qconfig_map: Dict[str, QConfigAny],
+    matching_modules_or_ops: List[Callable]
+) -> Union[Tuple[Node, Node], Tuple[None, None]]:
+    """
+    Match the pattern below, where the reference node has two dequantize
+    inputs, against the node provided:
+
+        dequantize --+
+                     +-- ref node -- quantize
+        dequantize --+
+
+    If there is a match, return a 2-tuple of:
+      1) q_node: the quantize node,
+      2) ref_node: a reference module or functional node to replace with its quantized counterpart
+    Otherwise, if there is no match, return a 2-tuple of (None, None).
+
+    Parameters:
+      node: The `torch.fx.Node` to match against.
+      modules: A mapping from node names to modules in the model graph, used for module lookup.
+      qconfig_map: A mapping from node names to the qconfigs associated with the nodes.
+          If the corresponding qconfig for the reference node is None, then return no match.
+      matching_modules_or_ops: Either a list of functions or a list of `torch.nn.Module`s.
+          If the reference node is not in this list, then return no match.
+    """
+    SKIP_LOWERING_VALUE = (None, None)
+
+    # Match quantize node
+    if node.op != "call_function" or node.target != torch.quantize_per_tensor:
+        return SKIP_LOWERING_VALUE
+    q_node = node
+    ref_node = q_node.args[0]
+    assert isinstance(ref_node, Node)
+
+    if should_skip_lowering(ref_node, qconfig_map):
+        return SKIP_LOWERING_VALUE
+
+    # Match reference module or functional
+    if isinstance(matching_modules_or_ops[0], type) and issubclass(matching_modules_or_ops[0], nn.Module):
+        expected_op = "call_module"
+        match_key = type(_get_module(ref_node, modules))
+    else:
+        # This pass only supports ops of type "call_module"
+        return SKIP_LOWERING_VALUE
+
+    if ref_node.op != expected_op or match_key not in matching_modules_or_ops:
+        return SKIP_LOWERING_VALUE
+
+    # Check that ref_node has 2 input nodes and that both are dequantize nodes.
+    if len(ref_node.args) != 2:
+        return SKIP_LOWERING_VALUE
+    for i in range(len(ref_node.args)):
+        arg = ref_node.args[i]
+        if not is_dequantize_node(arg):
+            return SKIP_LOWERING_VALUE
+
+    return (q_node, ref_node)
+
+def _lower_static_weighted_ref_module(
+        model: GraphModule,
+        qconfig_map: Dict[str, QConfigAny]):
+    """
+    Traverse the graph and find dequantize - ref module - quantize patterns
+    and replace them with the quantized version of the ref module.
+    """
+    modules = dict(model.named_modules(remove_duplicate=False))
+    nodes = list(model.graph.nodes)
+    for n in model.graph.nodes:
+        # Step 0: Find nodes that match this pattern (dequantize - ref module - quantize)
+        matching_modules = list(STATIC_LOWER_MODULE_MAP.keys()) + list(STATIC_LOWER_FUSED_MODULE_MAP.keys())
+        (q_node, relu_node, ref_node) = _match_static_pattern(
+            n, modules, qconfig_map, matching_modules, dequantize_node_arg_indices=[0])  # type: ignore[arg-type]
+        if q_node is None:
+            continue
+        assert ref_node is not None
+        (_, scale_node, zero_point_node, _) = q_node.args
+        ref_module = _get_module(ref_node, modules)
+        ref_class = type(ref_module)
+        assert isinstance(scale_node, Node)
+        assert isinstance(zero_point_node, Node)
+        assert issubclass(ref_class, nn.Module)
+
+        # Step 1: Change this pattern to use the corresponding quantized module
+        # For fused modules, we also check whether the inner module is a reference module
+        # If so, we replace the entire fused module with the corresponding quantized module
+        if ref_class in STATIC_LOWER_FUSED_MODULE_MAP:
+            inner_ref_class, q_class = STATIC_LOWER_FUSED_MODULE_MAP[ref_class]
+            if type(ref_module[0]) != inner_ref_class:  # type: ignore[index]
+                continue
+        else:
+            q_class = STATIC_LOWER_MODULE_MAP[ref_class]
+        output_scale = getattr(model, scale_node.target)
+        output_zero_point = getattr(model, zero_point_node.target)
+        q_module = q_class.from_reference(ref_module, output_scale, output_zero_point)
+        # replace reference module with quantized module
+        parent_name, module_name = _parent_name(ref_node.target)
+        setattr(modules[parent_name], module_name, q_module)
+
+        # Step 2: Reroute around dq_node, and remove q_node and its args
+        assert len(ref_node.args) == 1
+        dq_node = ref_node.args[0]
+        assert isinstance(dq_node, Node)
+        ref_node.replace_input_with(dq_node, dq_node.args[0])
+        q_node.replace_all_uses_with(ref_node)
+        model.graph.erase_node(q_node)
+        model.graph.erase_node(scale_node)
+        model.graph.erase_node(zero_point_node)
+
+def _lower_static_weighted_ref_module_with_two_inputs(
+        model: GraphModule,
+        qconfig_map: Dict[str, QConfigAny]):
+    """
+    Traverse the graph and find patterns
+    dequantize   dequantize
+       \\         //
+        ref module
+            \\
+          quantize
+    and replace them with the quantized version of the ref module.
+    """
+    modules = dict(model.named_modules(remove_duplicate=False))
+    nodes = list(model.graph.nodes)
+    for n in model.graph.nodes:
+        #                                            (dequantize \
+        # Step 0: Find nodes that match this pattern (dequantize - ref module - quantize)
+        matching_modules = list(STATIC_LOWER_FUSED_MODULE_TWO_INPUTS_MAP.keys())
+        (q_node, ref_node) = _match_static_pattern_with_two_inputs(
+            n, modules, qconfig_map, matching_modules)  # type: ignore[arg-type]
+        if q_node is None:
+            continue
+        assert ref_node is not None
+        (_, scale_node, zero_point_node, _) = q_node.args
+        ref_module = _get_module(ref_node, modules)
+        ref_class = type(ref_module)
+        assert isinstance(scale_node, Node)
+        assert isinstance(zero_point_node, Node)
+        assert issubclass(ref_class, nn.Module)
+
+        # Step 1: Change this pattern to use the corresponding quantized module
+        # For fused modules, we also check whether the inner module is a reference module
+        # If so, we replace the entire fused module with the corresponding quantized module
+        if ref_class in STATIC_LOWER_FUSED_MODULE_TWO_INPUTS_MAP:
+            inner_ref_class, q_class = STATIC_LOWER_FUSED_MODULE_TWO_INPUTS_MAP[ref_class]
+            if type(ref_module[0]) != inner_ref_class:  # type: ignore[index]
+                continue
+        else:
+            continue
+        output_scale = getattr(model, scale_node.target)
+        output_zero_point = getattr(model, zero_point_node.target)
+        q_module = q_class.from_reference(ref_module, output_scale, output_zero_point)
+        # replace reference module with quantized module
+        parent_name, module_name = _parent_name(ref_node.target)
+        setattr(modules[parent_name], module_name, q_module)
+
+        # Step 2: Reroute around dq_node, and remove q_node and its args
+        assert len(ref_node.args) == 2
+        for arg in ref_node.args:
+            if not is_dequantize_node(arg):
+                continue
+            dq_node = arg
+            assert isinstance(dq_node, Node)
+            ref_node.replace_input_with(dq_node, dq_node.args[0])
+
+        q_node.replace_all_uses_with(ref_node)
+        model.graph.erase_node(q_node)
+        model.graph.erase_node(scale_node)
+        model.graph.erase_node(zero_point_node)
+
+def _lower_dynamic_weighted_ref_module(model: GraphModule):
+    """
+    Traverse the graph and find quantize_per_tensor_dynamic - dequantize - ref_module patterns
+    and replace them with the dynamically quantized version of the ref module.
+    """
+    named_modules = dict(model.named_modules(remove_duplicate=False))
+    for n in model.graph.nodes:
+        if n.op != "call_module" or \
+           type(named_modules[str(n.target)]) not in \
+           set(DYNAMIC_LOWER_MODULE_MAP.keys()).union(
+               set(DYNAMIC_LOWER_FUSED_MODULE_MAP.keys())):
+            continue
+        ref_node = n
+        dq_node = ref_node.args[0]
+        if dq_node.op != "call_method" or dq_node.target != "dequantize":
+            continue
+
+        input_dynamic_q_node = dq_node.args[0]
+
+        if input_dynamic_q_node.op != "call_function" or \
+           input_dynamic_q_node.target != torch.quantize_per_tensor_dynamic:
+            continue
+
+        activation_dtype = input_dynamic_q_node.args[1]
+        is_fp16 = activation_dtype == torch.float16
+        is_int8 = activation_dtype in [torch.quint8, torch.qint8]
+        if not is_int8 and not is_fp16:
+            continue
+
+        ref_module = named_modules[str(ref_node.target)]
+        ref_class = type(ref_module)
+        if ref_class in DYNAMIC_LOWER_FUSED_MODULE_MAP:
+            inner_ref_class, q_class = DYNAMIC_LOWER_FUSED_MODULE_MAP[ref_class]
+            if type(ref_module[0]) != inner_ref_class:
+                continue
+        else:
+            q_class = DYNAMIC_LOWER_MODULE_MAP.get(ref_class)  # type: ignore[assignment]
+        # TODO: maybe define a WeightedDynamicallyQuantizedModule
+        q_module = q_class.from_reference(ref_module)  # type: ignore[attr-defined]
+
+        # replace reference module with dynamically quantized module
+        parent_name, module_name = _parent_name(ref_node.target)
+        setattr(named_modules[parent_name], module_name, q_module)
+        ref_node.replace_input_with(dq_node, input_dynamic_q_node.args[0])
+
+def _lower_weight_only_weighted_ref_module(model: GraphModule):
+    """
+    Traverse the graph and find ref_module patterns
+    and replace them with the weight only quantized version of the ref module.
+    """
+    named_modules = dict(model.named_modules(remove_duplicate=False))
+    for n in model.graph.nodes:
+        if n.op != "call_module" or \
+           type(named_modules[str(n.target)]) not in \
+           set(WEIGHT_ONLY_LOWER_MODULE_MAP.keys()):
+            continue
+        ref_node = n
+        ref_module = named_modules[str(ref_node.target)]
+        ref_class = type(ref_module)
+        q_class = WEIGHT_ONLY_LOWER_MODULE_MAP.get(ref_class)
+        # TODO: WeightedQuantizedModule is currently assuming static quant apis
+        # with output_scale, output_zero_point in from_reference, we may want to
+        # relax that, or rename this
+        # TODO: maybe define a WeightedWeightOnlyQuantizedModule
+        q_module = q_class.from_reference(ref_module)  # type: ignore[union-attr]
+
+        # replace reference module with weight-only quantized module
+        parent_name, module_name = _parent_name(ref_node.target)
+        setattr(named_modules[parent_name], module_name, q_module)
+
+def _lower_static_weighted_ref_functional(
+        model: GraphModule,
+        qconfig_map: Dict[str, QConfigAny]):
+    """
+    Traverse the graph and replace functional reference patterns with their quantized versions.
+    """
+    modules = dict(model.named_modules(remove_duplicate=False))
+    nodes = list(model.graph.nodes)
+    for n in model.graph.nodes:
+        # Step 0: Find nodes that match this pattern (dequantize - functional op - quantize)
+        matching_ops = list(STATIC_LOWER_FUNCTIONAL_MAP.keys())
+        (q_node, relu_node, func_node) = _match_static_pattern(
+            n, modules, qconfig_map, matching_ops, dequantize_node_arg_indices=[0, 1])
+        if q_node is None:
+            continue
+        assert func_node is not None
+        (_, output_scale_node, output_zp_node, _) = q_node.args
+        (input_dq_node, weight_dq_node, *remaining_func_args) = func_node.args
+        assert isinstance(output_zp_node, Node)
+        assert isinstance(input_dq_node, Node)
+        assert isinstance(weight_dq_node, Node)
+        quantized_weight = weight_dq_node.args[0]
+        assert isinstance(quantized_weight, Node)
+        if quantized_weight.op != "call_function" or\
+                quantized_weight.target not in (torch.quantize_per_tensor, torch.quantize_per_channel):
+            continue
+
+        # Step 1: Replace quantized weights with packed weights, which will be folded later
+        # Use the right prepack op and prepare the corresponding args
+        # Linear prepack args: (quantized weights[, bias])
+        # Conv prepack args: (quantized weights[, bias, stride, padding, dilation, groups])
+        prepack_args = [quantized_weight] + remaining_func_args
+        if func_node.target == F.linear:
+            weight_dtype = quantized_weight.args[-1]
+            prepack_op = get_linear_prepack_op_for_dtype(weight_dtype)
+        elif func_node.target in CONV_FUNCTIONAL_OPS:
+            prepack_op = get_qconv_prepack_op(func_node.target)  # type: ignore[arg-type]
+            # For conv1d, the stride, padding, and dilation args may be ints,
+            # in which case we need to convert them to tuples
+            if func_node.target == F.conv1d:
+                for i in [2, 3, 4]:
+                    if len(prepack_args) > i and isinstance(prepack_args[i], int):
+                        prepack_args[i] = (prepack_args[i],)
+        elif func_node.target in CONV_TRANSPOSE_FUNCTIONAL_OPS:
+            prepack_op = get_qconv_prepack_op(func_node.target)  # type: ignore[arg-type]
+            # For conv_transpose1d, the stride, padding, and dilation args may be ints,
+            # in which case we need to convert them to tuples
+            if func_node.target == F.conv_transpose1d:
+                # Note prepack_args[5] is groups.
+                for i in [2, 3, 4, 6]:
+                    if len(prepack_args) > i and isinstance(prepack_args[i], int):
+                        prepack_args[i] = (prepack_args[i],)
+            # swap dilation and groups
+            # prepack op has arguments: {w, b, stride, padding, output_padding, dilation, groups}
+            # transposed conv op has arguments: {x, w, b, stride, padding, output_padding, groups, dilation}
+            if (len(prepack_args) > 6):
+                prepack_args[5], prepack_args[6] = prepack_args[6], prepack_args[5]
+        else:
+            raise ValueError(f"Lowering is not supported for op '{func_node.target}'")
+        with model.graph.inserting_before(output_scale_node):
+            # kwargs of the func node are needed for prepack op (i.e., quantized::linear_prepack)
+            # They are not needed for compute op (i.e., quantized::linear)
+            kwargs = func_node.kwargs
+            # F.linear uses 'bias' key for bias while qlinear_prepack uses 'B' for bias
+            if func_node.target == F.linear and 'bias' in kwargs:
+                kwargs = kwargs.copy()
+                kwargs['B'] = kwargs['bias']
+                del kwargs['bias']
+            packed_weight = model.graph.create_node("call_function", prepack_op, tuple(prepack_args), kwargs)
+
+        # Step 2: Replace reference pattern with the corresponding quantized op
+        (q_func, q_relu_func) = STATIC_LOWER_FUNCTIONAL_MAP[func_node.target]  # type: ignore[index]
+        # conv_transpose does not support fusion with relu yet. q_relu_func is None in such cases
+        if q_relu_func is not None:
+            func_node.target = q_relu_func if relu_node is not None else q_func
+        else:
+            func_node.target = q_func
+        func_node.args = (input_dq_node.args[0], packed_weight, output_scale_node, output_zp_node)
+        # kwargs for func_node has been moved to kwargs for prepack op
+        func_node.kwargs = {}
+        q_node.replace_all_uses_with(func_node)
+        # Move func_node after output_zp_node in the graph
+        output_zp_node.append(func_node)
+
+        # Clean up: Remove quantize node, and the relu node if it exists
+        model.graph.erase_node(q_node)
+        if relu_node is not None and q_relu_func is not None:
+            model.graph.erase_node(relu_node)
+
+def _lower_dynamic_weighted_ref_functional(
+        model: GraphModule,
+        qconfig_map: Dict[str, QConfigAny]):
+    """
+    Traverse the graph and replace functional reference patterns with their dynamically
+    quantized versions.
+    Examples:
+    quantize_per_tensor_dynamic - dequantize - functional linear --> linear_dynamic
+    to(torch.float16) - dequantize - functional linear --> linear_dynamic_fp16
+    """
+    modules = dict(model.named_modules(remove_duplicate=False))
+    nodes = list(model.graph.nodes)
+    # we want to search in reversed order so that we can match the larger patterns first
+    # e.g. we want to match linear - relu before linear.
+    for n in reversed(model.graph.nodes):
+
+        # Step 0: Find nodes that match this pattern
+        # (quantize_per_tensor_dynamic - dequantize - dynamically quantized op)
+        # We search for the pattern backwards, starting with the quantize node
+        # Quantize node args: (func, scale, zp, dtype)
+        func_node = n
+        # Handle cases where the functional op is wrapped in a ReLU
+        if func_node.op == "call_function" and func_node.target == F.relu or \
+           func_node.op == "call_module" and \
+           type(modules[str(func_node.target)]) == torch.nn.ReLU:
+            relu_node = func_node
+            func_node = relu_node.args[0]
+        else:
+            relu_node = None
+        if should_skip_lowering(func_node, qconfig_map):
+            continue
+        # Linear args: (dequantized inputs, dequantized weights[, bias])
+        # Conv args: (dequantized inputs, dequantized weights[, bias, stride, padding, dilation, groups])
+        if func_node.op != "call_function" or func_node.target not in DYNAMIC_LOWER_FUNCTIONAL_MAP:
+            continue
+        (input_dq_node, weight_dq_node, *remaining_func_args) = func_node.args
+        if input_dq_node.op != "call_method" or input_dq_node.target != "dequantize" or \
+           weight_dq_node.op != "call_method" or weight_dq_node.target != "dequantize":
+            continue
+
+        input_dynamic_q_node = input_dq_node.args[0]
+
+        if input_dynamic_q_node.op != "call_function" or \
+           input_dynamic_q_node.target != torch.quantize_per_tensor_dynamic:
+            continue
+
+        reduce_range_node = None
+        (pattern_input, activation_dtype, reduce_range_node) = input_dynamic_q_node.args
+        is_fp16 = activation_dtype == torch.float16
+        is_int8 = activation_dtype in [torch.quint8, torch.qint8]
+        if not is_int8 and not is_fp16:
+            continue
+
+        quantized_weight = weight_dq_node.args[0]
+        weight_dtype = quantized_weight.args[-1]
+
+        # Step 1: Try to select reference pattern with the corresponding quantized op
+        dynamic_quant_dtype_key = (activation_dtype, weight_dtype)
+        if dynamic_quant_dtype_key not in DYNAMIC_LOWER_FUNCTIONAL_MAP[func_node.target]:
+            print(f"Didn't find dtype combination {dynamic_quant_dtype_key} during "
+                  f"dynamic quantized op lowering for {func_node.target}")
+            continue
+        (q_func, q_relu_func) = DYNAMIC_LOWER_FUNCTIONAL_MAP[func_node.target][dynamic_quant_dtype_key]
+
+        if q_func is None or q_relu_func is None:
+            print("Didn't find corresponding quantized function or quantized relu function "
+                  f"for {func_node.target}, {dynamic_quant_dtype_key}")
+            continue
+
+        # Step 2: Replace quantized weights with packed weights, which will be folded later
+        # Use the right prepack op and prepare the corresponding args
+        # Linear prepack args: (quantized weights[, bias])
+        # Conv prepack args: (quantized weights[, bias, stride, padding, dilation, groups])
+        prepack_args = [quantized_weight] + remaining_func_args
+        if func_node.target == F.linear:
+            prepack_op = get_linear_prepack_op_for_dtype(weight_dtype)
+        elif func_node.target in CONV_FUNCTIONAL_OPS:
+            prepack_op = get_qconv_prepack_op(func_node.target)
+            # For conv1d, the stride, padding, and dilation args may be ints,
+            # in which case we need to convert them to tuples
+            if func_node.target == F.conv1d:
+                for i in [2, 3, 4]:
+                    if len(prepack_args) > i and isinstance(prepack_args[i], int):
+                        prepack_args[i] = (prepack_args[i],)
+        else:
+            raise ValueError(f"Lowering is not supported for op '{func_node.target}'")
+        with model.graph.inserting_before(func_node):
+            packed_weight = model.graph.create_node("call_function", prepack_op, tuple(prepack_args), {})
+
+        # Step 3: Replace reference pattern with the corresponding quantized op
+        func_node.target = q_relu_func if relu_node is not None else q_func
+        if is_int8:
+            func_node.args = (pattern_input, packed_weight, reduce_range_node)
+        else:
+            func_node.args = (pattern_input, packed_weight)
+
+        if relu_node is not None:
+            relu_node.replace_all_uses_with(func_node)
+
+        # Step 4: Remove the relu node if it exists
+        if relu_node is not None:
+            model.graph.erase_node(relu_node)
+
+def _lower_quantized_binary_op(
+        model: GraphModule,
+        qconfig_map: Dict[str, QConfigAny]):
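+    # Lower patterns like quantize_per_tensor(dequantize(a) + dequantize(b), scale, zp)
+    # (and the mul / matmul / relu variants) to the corresponding quantized binary op.
+    # Sketch (illustrative): for a Tensor-Tensor add the lowered op receives
+    # (a, b, scale, zero_point); if only one input was dequantized, scale/zero_point
+    # are not appended.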
+    binary_ops_to_lower: List[Callable] = [operator.add, torch.add, operator.mul, torch.mul, torch.matmul]
+    modules = dict(model.named_modules(remove_duplicate=False))
+    for n in model.graph.nodes:
+        # Step 0: Find nodes that match this pattern (dequantize - ref module - quantize)
+        (q_node, relu_node, bop_node) = _match_static_pattern(
+            n, modules, qconfig_map, binary_ops_to_lower, dequantize_node_arg_indices=[0, 1])
+        if q_node is None:
+            continue
+        assert bop_node is not None
+        (_, scale_node, zero_point_node, _) = q_node.args
+
+        # Step 1: Remove dequant nodes
+        num_dq_nodes = 0
+        for arg in bop_node.args:
+            if not is_dequantize_node(arg):
+                continue
+            dq_node = arg
+            assert isinstance(dq_node, Node)
+            dn_input = dq_node.args[0]
+            bop_node.replace_input_with(dq_node, dn_input)
+            num_dq_nodes += 1
+        assert num_dq_nodes > 0
+
+        # Step 2: Swap binary op to quantized binary op
+        assert bop_node.target in QBIN_OP_MAPPING
+        binop_to_qbinop = QBIN_OP_MAPPING if relu_node is None else QBIN_RELU_OP_MAPPING
+        qbin_op = binop_to_qbinop[bop_node.target]
+        # prepare the args for quantized binary op
+        # (x, y)
+        qop_node_args = list(bop_node.args)
+        # (x, y, scale, zero_point)
+        # add scale and zero_point arguments for Tensor - Tensor operation
+        if num_dq_nodes == 2:
+            qop_node_args.extend([scale_node, zero_point_node])
+        # insert a call to quantized binary op and remove the original binary op
+        with model.graph.inserting_after(q_node):
+            qop_node = create_node_from_old_node_preserve_meta(
+                model.graph,
+                ("call_function", qbin_op, tuple(qop_node_args), {}),
+                bop_node)
+            q_node.replace_all_uses_with(qop_node)
+
+        # Step 3: Remove quantize node, binary op node, and relu node if any
+        model.graph.erase_node(q_node)
+        if relu_node is not None:
+            model.graph.erase_node(relu_node)
+        model.graph.erase_node(bop_node)
+
+def special_pattern_replacement(model: GraphModule):
+    modules = dict(model.named_modules(remove_duplicate=False))
+    for n in model.graph.nodes:
+        q_node = n
+        is_quantize = q_node.target == torch.quantize_per_tensor
+        is_to_fp16 = q_node.op == "call_method" and q_node.target == "to" and \
+            len(q_node.args) == 2 and q_node.args[1] == torch.float16
+        if not (is_quantize or is_to_fp16):
+            continue
+        ref_node = q_node.args[0]
+        # get output scale/zero_point/dtype from the quantize node
+        # ref_node, scale_node, zero_point_node, dtype = q_node.args
+        # TODO: add safety checks that users for the ref_node and dq_node needs to be one
+        is_call_function, is_call_method, is_call_module = is_fixed_qparams_node(ref_node, modules)
+        if is_to_fp16 and (is_call_function or is_call_method or is_call_module):
+            # TODO: add a warning or error out here? (bc-breaking if error out)
+            # warnings.warn(
+            #     "Only reference patterns are currently supported for {dtype} dtype with {op} op"
+            #     "".format(dtype=dtypes, op=ref_node))
+            continue
+
+        is_call_function, is_call_method, is_call_module = is_default_node(ref_node, modules)
+        if is_to_fp16 and (is_call_function or is_call_method or is_call_module):
+            # TODO: add a warning or error out here? (bc-breaking if error out)
+            continue
+
+        # This check includes all supported ops
+        is_call_function, is_call_method, is_call_module = is_special_pattern_node(ref_node, modules)
+        if not (is_call_module or is_call_function or is_call_method):
+            continue
+        assert len(ref_node.args) > 0 or len(ref_node.kwargs) > 0
+        dq_node_or_nodes = ref_node.args[0] if len(ref_node.args) > 0 else next(iter(ref_node.kwargs.values()))
+        assert isinstance(dq_node_or_nodes, (Node, tuple, list))
+        is_dequantize = False
+        if isinstance(dq_node_or_nodes, Node):
+            is_dequantize = dq_node_or_nodes.op == 'call_method' and \
+                dq_node_or_nodes.target == 'dequantize'
+        elif isinstance(dq_node_or_nodes, (tuple, list)):
+            is_dequantize = all(
+                x.op == 'call_method' and x.target == 'dequantize'
+                for x in dq_node_or_nodes)
+
+        if not is_dequantize:
+            continue
+
+        # TODO: enable when we have patterns that need to swap the modules
+        if is_call_module:
+            ref_module = modules[ref_node.target]
+            if type(ref_module) in SPECIAL_PATTERN_LOWER_MODULE_MAP and is_quantize:
+                qmodule_cls = SPECIAL_PATTERN_LOWER_MODULE_MAP.get(type(ref_module))
+                scale_node = q_node.args[1]
+                zero_point_node = q_node.args[2]
+                output_scale = getattr(model, scale_node.target)
+                output_zero_point = getattr(model, zero_point_node.target)
+
+                qmodule = qmodule_cls.from_reference(ref_module, output_scale, output_zero_point)  # type:ignore[union-attr]
+                # replace reference module with quantized module
+                parent_name, module_name = _parent_name(ref_node.target)
+                setattr(modules[parent_name], module_name, qmodule)
+
+        # reroute around dq node:
+        dq_nodes: List[Node] = []
+        if isinstance(dq_node_or_nodes, Node):
+            dq_nodes = [dq_node_or_nodes]
+        elif isinstance(dq_node_or_nodes, (tuple, list)):
+            dq_nodes = list(dq_node_or_nodes)
+
+        for dq_node in dq_nodes:
+            dn_input = dq_node.args[0]
+            ref_node.replace_input_with(dq_node, dn_input)
+
+        # store q node args
+        qnode_qparams = list(q_node.args)[1:]
+        # replace uses of q node with input and remove q node
+        q_node_input = q_node.args[0]
+        q_node.replace_all_uses_with(q_node_input)
+        model.graph.erase_node(q_node)
+
+        is_call_function, is_call_method, is_call_module = is_default_node(ref_node, modules)
+        if is_call_function:
+            # pass scale/zero_point arguments from quantize_per_tensor to the default node operator
+            # insert an op after the zero_point node so that the scale/zero_point
+            # nodes are available
+            qop = get_quantized_operator(ref_node.target)
+            args = list(ref_node.args)
+            kwargs = dict(ref_node.kwargs)
+            if qop in QOP_TO_ARG_NAMES_TO_SKIP:
+                args_to_skip = QOP_TO_ARG_NAMES_TO_SKIP[qop]
+                for arg in args_to_skip:
+                    if arg in kwargs:
+                        kwargs.pop(arg)
+            kwargs["output_scale"] = qnode_qparams[0]
+            kwargs["output_zero_point"] = qnode_qparams[1]
+            with model.graph.inserting_after(qnode_qparams[1]):
+                qop_node = create_node_from_old_node_preserve_meta(
+                    model.graph,
+                    ("call_function", qop, tuple(args), kwargs),
+                    ref_node)
+                ref_node.replace_all_uses_with(qop_node)
+                model.graph.erase_node(ref_node)
+        else:
+            # remove scale/zero_point node for quantize node
+            for n in qnode_qparams:
+                if isinstance(n, Node):
+                    model.graph.erase_node(n)
+
+    return model
+
+def _lower_getattr_tensor_metadta_op(model: GraphModule):
+    """ Modified the graph of the model inplace, to skip extra dequantize op before
+    the general tensor shape ops when possible
+    """
+    for n in model.graph.nodes:
+        if is_getattr_tensor_metadata_node(n):
+            maybe_dq = n.args[0]
+            if maybe_dq.op != "call_method" or maybe_dq.target != "dequantize":
+                continue
+            # skip the dequantize node
+            args = list(n.args)
+            args[0] = n.args[0].args[0]
+            n.args = tuple(args)
+
+def _lower_get_tensor_info_op(model: GraphModule):
+    """ Modified the graph of the model inplace, to skip extra dequantize op before
+    the general tensor shape ops when possible
+    """
+    for n in model.graph.nodes:
+        if not is_get_tensor_info_node(n):
+            continue
+        maybe_dq = n.args[0]
+        if maybe_dq.op != "call_method" or maybe_dq.target != "dequantize":
+            continue
+        # skip the dequantize node
+        args = list(n.args)
+        args[0] = n.args[0].args[0]
+        n.args = tuple(args)
+
+def _lower_to_native_backend(
+    model: GraphModule,
+    qconfig_map: Dict[str, QConfigAny],
+    node_name_to_scope: Dict[str, Tuple[str, type]]
+) -> GraphModule:
+    """ Lower a quantized reference model (with reference quantized operator patterns)
+    to the native backend in PyTorch (fbgemm/qnnpack). Both backends share the same
+    operator signature, so they can be lowered with the same function.
+    """
+    _lower_static_weighted_ref_module(model, qconfig_map)
+    _lower_static_weighted_ref_module_with_two_inputs(model, qconfig_map)
+    _lower_dynamic_weighted_ref_module(model)
+    _lower_weight_only_weighted_ref_module(model)
+    _lower_static_weighted_ref_functional(model, qconfig_map)
+    _lower_dynamic_weighted_ref_functional(model, qconfig_map)
+    _lower_quantized_binary_op(model, qconfig_map)
+    _lower_getattr_tensor_metadta_op(model)
+    _lower_get_tensor_info_op(model)
+    special_pattern_replacement(model)
+    model.graph.eliminate_dead_code()
+    model = fold_weight(model, node_name_to_scope)
+    model.graph.eliminate_dead_code()
+    model.recompile()
+    model.graph.lint()
+    return model
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/_model_report/__init__.py b/MLPY/Lib/site-packages/torch/ao/quantization/fx/_model_report/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/_model_report/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/fx/_model_report/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b86a063bc1a4fb42906a8a67ebe6d09d4e7dedfe
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/fx/_model_report/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/_model_report/__pycache__/detector.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/fx/_model_report/__pycache__/detector.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..540effe9f8366fd3a70e5817ee3f42b63bdfbfe5
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/fx/_model_report/__pycache__/detector.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/_model_report/__pycache__/model_report.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/fx/_model_report/__pycache__/model_report.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ae2ecd4a82a75a0cf6ba05ad34c646fae282c1dd
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/fx/_model_report/__pycache__/model_report.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/_model_report/__pycache__/model_report_observer.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/fx/_model_report/__pycache__/model_report_observer.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7fb05ad105873718499c36d0e8f9e0f640d95809
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/fx/_model_report/__pycache__/model_report_observer.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/_model_report/__pycache__/model_report_visualizer.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/fx/_model_report/__pycache__/model_report_visualizer.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c5348a5d5dc663a8707e12c7ae4d445f7add09a3
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/fx/_model_report/__pycache__/model_report_visualizer.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/_model_report/detector.py b/MLPY/Lib/site-packages/torch/ao/quantization/fx/_model_report/detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..9dbd3ed9538ae7379ce1e37aee071fadc19db081
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/fx/_model_report/detector.py
@@ -0,0 +1,1539 @@
+from typing import Any, Dict, Set, Tuple, Callable, List
+
+import torch
+import torch.nn as nn
+import torch.ao.nn.qat as nnqat
+from abc import ABC, abstractmethod
+from torch.ao.quantization.fake_quantize import FakeQuantize
+from torch.ao.quantization.fx.graph_module import GraphModule
+from torch.ao.quantization.fx._model_report.model_report_observer import ModelReportObserver
+from torch.ao.quantization.qconfig import (
+    QConfig,
+    default_qconfig,
+    _assert_valid_qconfig,
+)
+from torch.ao.quantization.observer import (
+    ObserverBase,
+    default_dynamic_quant_observer,
+    default_per_channel_weight_observer,
+    default_observer,
+    default_weight_observer,
+)
+from torch.ao.quantization.fx._equalize import (
+    default_equalization_qconfig,
+    EqualizationQConfig,
+)
+from torch.ao.quantization.observer import _is_activation_post_process
+
+# Names for observer insert keys
+DETECTOR_TARGET_NODE_KEY = "target_node"
+DETECTOR_OBS_TO_INSERT_KEY = "observer_to_insert"
+DETECTOR_IS_POST_OBS_KEY = "is_post_observer"
+DETECTOR_OBS_ARGS_KEY = "observer_args"
+
+# Mapping related code
+class DetectorQConfigInfo:
+    r"""
+    This class contains the QConfig information for a single module.
+    The list of variables / values this contains can grow depending on the
+    extensibility of the qconfig mapping feature set but this currently includes:
+    - if activation observer is dynamic
+    - if weight observer is per channel
+
+
+    Args:
+        module_fqn (str): The fully qualified name (fqn) of the module that this
+            qconfig information is relevant to
+    """
+
+    def __init__(self, module_fqn: str):
+        super().__init__()
+        self.module_fqn = module_fqn
+
+        # populate this section with all the variables we might find important
+        # change these defaults if your detector actually uses them
+        self.is_activation_dynamic = False
+        self.is_weight_per_channel = False
+
+        # equalization related options
+        self.is_equalization_recommended = False
+
+    def generate_quantization_qconfig(self, module: torch.nn.Module) -> QConfig:
+        r"""
+        Args:
+            module (torch.nn.Module): The module we are generating
+                the qconfig for
+
+        Returns the generated quantization QConfig, picking the highest-priority
+        recommendation that is a valid configuration for the module
+        """
+        # Apply suggestions to new qconfig
+        module_qconfig = default_qconfig
+
+        # keep track of dynamic and per_channel recommendations
+        recommendations_list = []
+        # append as if a list of combinations
+        recommendations_list.append((self.is_activation_dynamic, self.is_weight_per_channel))
+        recommendations_list.append((self.is_activation_dynamic, False))  # only trying dynamic rec
+        recommendations_list.append((False, self.is_weight_per_channel))  # only trying per_channel rec
+
+        # now we try each of the combinations
+        for rec in recommendations_list:
+            # rec[0] -> dynamic recommended
+            # rec[1] -> per channel recommended
+            activation = default_dynamic_quant_observer if rec[0] else default_observer
+            weight = default_per_channel_weight_observer if rec[1] else default_weight_observer
+            test_config = QConfig(activation, weight)
+            try:
+                _assert_valid_qconfig(test_config, module)
+                module_qconfig = test_config
+                break
+            except AssertionError:
+                # if not a valid configuration, we move on to the next one in priority
+                continue
+
+        # return the QConfig chosen
+        return module_qconfig
+
+    def generate_equalization_qconfig(self) -> EqualizationQConfig:
+        r"""
+        This returns the equalization configuration for a module.
+
+        For now, it just returns the default, but as more equalization options become
+        possible, this method can get more fleshed out with more nuanced granularity.
+
+
+        Returns the generated equalization QConfig (currently always the default equalization config)
+        """
+        # in this case, we just return default equalization config
+        # we know this is valid because only valid modules would even
+        # have this option
+        return default_equalization_qconfig
+
+# Adding base class for detectors
+class DetectorBase(ABC):
+    r""" Base Detector Module
+    Any detector class should derive from this class.
+
+    Concrete detectors should follow the same general API, which includes:
+    - A method to calculate and return observer insertion points
+        - Should return both the fqns and the Observer class to insert
+    - A method to return a report based on the detector
+        - Should return a str-based report and dict info in Tuple[str,Dict] format
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.detector_config_info = None
+
+    @abstractmethod
+    def determine_observer_insert_points(self, model) -> Dict:
+        r"""
+        Args
+            model (nn.Module or subclass): model to find observer insertion points
+
+        Returns a Dict mapping from unique observer fqns (where we want to insert them) to a Dict.
+            This dict maps string keys to detector specific information
+        """
+        pass
+
+    @abstractmethod
+    def get_detector_name(self) -> str:
+        r""" Returns the name of the current detector """
+        pass
+
+
+    @abstractmethod
+    def get_qconfig_info(self, model) -> Dict[str, DetectorQConfigInfo]:
+        r""" Returns the DetectorQConfigInfo for each module_fqn relevant
+        Args
+            model (nn.Module or subclass): model to find observer insertion points
+
+        Returns a Dict mapping from unique observer fqns (where we want to insert them) to:
+            A DetectorQConfigInfo with the information to generate a QConfig for a specific module
+        """
+        pass
+
+    def _get_targeting_node(self, prepared_fx_model: GraphModule, target_fqn: str) -> torch.fx.node.Node:
+        r"""
+        Takes in a GraphModule and the target_fqn and finds the node whose target is this fqn.
+
+        If it's not found, it means it is most likely inside a fused layer.
+            We go one level up in the fqn we are searching for until we find the parent node.
+            If no parent fqn is left to try, then we know that the target doesn't exist.
+
+        The reason for the recursion is that if the model that we are looking for got fused,
+        we will have module fqn as e.g. x.linear.0 but the graph will only have a node for the fused module,
+        which would have fqn as x.linear so they will not match.
+        To handle this, if we don't match, we then take off the last bit of the fqn e.g. x.linear.0 -> x.linear,
+        or more generally foo.bar.baz -> foo.bar and search again, this will allow us to locate the correct module
+        even in cases with fusion
+
+        Args:
+            prepared_fx_model (GraphModule):  The prepared Fx GraphModule
+            target_fqn (str): The fqn of the layer we are trying to target
+
+        Returns the node object we are trying to add observers around
+        """
+        for node in prepared_fx_model.graph.nodes:
+            # if the node's target is our target, return it
+            if node.target == target_fqn:
+                return node
+
+        # getting here means node not found
+        # if no "." we are already at base and failed
+        parent_fqn_sep_index = target_fqn.rfind(".")
+        if parent_fqn_sep_index == -1:
+            raise ValueError("passed in target_fqn not found in graph's targets.")
+        else:
+            # recursively call it with parent fqn
+            return self._get_targeting_node(prepared_fx_model, target_fqn[:parent_fqn_sep_index])
+
+    @abstractmethod
+    def generate_detector_report(self, model) -> Tuple[str, Dict[str, Any]]:
+        r"""
+        Args
+            model (nn.Module or subclass): model to find observer insertion points
+
+        Returns a Tuple of two elements:
+            Str: string report of the suggested improvements
+            Dict: contains useful data collected by the observer pertinent to this report
+        """
+        pass
+
+class PerChannelDetector(DetectorBase):
+    r""" This class is used to detect if any Linear or Conv layers in a model utilize per_channel quantization.
+        Only Linear and Conv layers can use per_channel as of now so only these two are currently checked.
+
+        per_channel quantization can lead to major benefits in terms of accuracy.
+        Therefore, if the backend used by the user supports it, it is recommended to use it.
+
+        Args:
+            backend (str, optional): the backend the user wishes to use in production
+                Default value is current torch.backends.quantized.engine
+    """
+
+    # Keys for return dictionary
+    BACKEND_KEY = "backend"
+    PER_CHAN_SUPPORTED_KEY = "per_channel_quantization_supported"
+    PER_CHAN_USED_KEY = "per_channel_quantization_used"
+
+    # Default map for representing supported per channel quantization modules for different backends
+    DEFAULT_BACKEND_PER_CHANNEL_SUPPORTED_MODULES: Dict[str, Set[Any]] = {
+        "fbgemm": {nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d, nnqat.Linear, nnqat.Conv1d, nnqat.Conv2d, nnqat.Conv3d},
+        "qnnpack": {nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d, nnqat.Linear, nnqat.Conv1d, nnqat.Conv2d, nnqat.Conv3d},
+        "onednn": {nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d, nnqat.Linear, nnqat.Conv1d, nnqat.Conv2d, nnqat.Conv3d},
+        "x86": {nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d, nnqat.Linear, nnqat.Conv1d, nnqat.Conv2d, nnqat.Conv3d},
+    }
+
+    def __init__(self, backend: str = torch.backends.quantized.engine):
+        super().__init__()
+
+        # store the backend information
+        self.backend_chosen = backend
+        self.supported_modules = set()
+        if self.backend_chosen in self.DEFAULT_BACKEND_PER_CHANNEL_SUPPORTED_MODULES:
+            self.supported_modules = self.DEFAULT_BACKEND_PER_CHANNEL_SUPPORTED_MODULES[self.backend_chosen]
+        else:
+            raise ValueError(f"Not configured to work with {self.backend_chosen}. Try a different default backend")
+
+    def get_detector_name(self) -> str:
+        r""" returns the string name of this detector"""
+        return "per_channel_detector"
+
+    def get_qconfig_info(self, model) -> Dict[str, DetectorQConfigInfo]:
+        r""" Returns the DetectorQConfigInfo for each module_fqn relevant
+        Args
+            model (nn.Module or subclass): model to find observer insertion points
+
+        Returns a Dict mapping from unique observer fqns (where we want to insert them) to:
+            A DetectorQConfigInfo with the information to generate a QConfig for a specific module
+        """
+        # run the helper function to populate the dictionary
+        per_channel_info = self._detect_per_channel_helper(model)
+
+        # we actually have a qconfig info object we are populating
+        module_fqn_to_detector_qconfig_info = {}
+
+        for module_fqn in per_channel_info:
+            # create a detector info instance
+            detector_qconfig_info = DetectorQConfigInfo(module_fqn)
+
+            # see if per channel quantization is supported
+            per_chan_supported: bool = per_channel_info[module_fqn][self.PER_CHAN_SUPPORTED_KEY]
+            detector_qconfig_info.is_weight_per_channel = per_chan_supported
+            module_fqn_to_detector_qconfig_info[module_fqn] = detector_qconfig_info
+
+        return module_fqn_to_detector_qconfig_info
+
+    def determine_observer_insert_points(self, model: nn.Module) -> Dict:
+        r"""
+        No observers are inserted for the PerChannelDetector.
+
+        Returns an empty dictionary since no observers are added or needed
+        """
+        return {}
+
+
+    def _detect_per_channel_helper(self, model: nn.Module):
+        r"""
+        Determines if per_channel quantization is supported in modules and submodules.
+
+        Each entry of the returned dictionary maps a module's fully qualified name to
+        information on whether per_channel quantization is supported and currently used.
+
+        Args:
+            model: The current module that is being checked to see if it is per_channel quantizable
+
+        Returns a dictionary mapping fqns to whether per_channel quantization is possible
+        """
+        # create dict we will return
+        per_channel_info: Dict = {}
+
+        # get the fully qualified name and check if in list of modules to include and list of modules to ignore
+        for fqn, module in model.named_modules():
+
+            is_in_include_list = any(isinstance(module, x) for x in self.supported_modules)
+
+            # check if the module per_channel is supported
+            # based on backend
+            per_channel_supported = False
+
+            if is_in_include_list:
+                per_channel_supported = True
+
+                # assert statement for MyPy
+                q_config_file = module.qconfig
+                assert isinstance(q_config_file, QConfig)
+
+                # this object should either be fake quant or observer
+                q_or_s_obj = module.qconfig.weight.p.func()
+                assert isinstance(q_or_s_obj, (FakeQuantize, ObserverBase))
+
+                per_channel_used = False  # will be true if found in qconfig
+
+                if hasattr(q_or_s_obj, "ch_axis"):  # then we know that per_channel quantization is used
+
+                    # all fake quants have a channel axis, so we need to check is_per_channel
+                    if isinstance(q_or_s_obj, FakeQuantize):
+                        if hasattr(q_or_s_obj, "is_per_channel") and q_or_s_obj.is_per_channel:
+                            per_channel_used = True
+                    elif isinstance(q_or_s_obj, ObserverBase):
+                        # should be an observer otherwise
+                        per_channel_used = True
+                    else:
+                        raise ValueError("Should be either observer or fake quant")
+
+                per_channel_info[fqn] = {
+                    self.PER_CHAN_SUPPORTED_KEY: per_channel_supported,
+                    self.PER_CHAN_USED_KEY: per_channel_used,
+                    self.BACKEND_KEY: self.backend_chosen
+                }
+
+        return per_channel_info
+
+    def generate_detector_report(self, model: nn.Module) -> Tuple[str, Dict[str, Any]]:
+        r"""Checks if any Linear or Conv layers in the model utilize per_channel quantization.
+        Only Linear and Conv layers can use per_channel as of now so only these two are currently checked.
+
+        Looks at q_config format and backend to determine if per_channel can be utilized.
+        Uses the DEFAULT_BACKEND_PER_CHANNEL_SUPPORTED_MODULES structure to determine support
+
+        Args:
+            model: The prepared and calibrated model to check for per_channel quantization usage
+
+        Returns a tuple with two elements:
+            String report of potential actions to improve model (if per_channel quantization is available in backend)
+            Dictionary mapping per_channel quantizable elements to:
+                whether per_channel quantization is supported by the backend
+                if it is being utilized in the current model
+        """
+
+        # run the helper function to populate the dictionary
+        per_channel_info = self._detect_per_channel_helper(model)
+
+        # String to let the user know of further optimizations
+        further_optims_str = f"Further Optimizations for backend {self.backend_chosen}: \n"
+
+        optimizations_possible = False
+        for fqn in per_channel_info:
+            fqn_dict = per_channel_info[fqn]
+            if fqn_dict[self.PER_CHAN_SUPPORTED_KEY] and not fqn_dict[self.PER_CHAN_USED_KEY]:
+                optimizations_possible = True
+                further_optims_str += f"Module {fqn} can be configured to use per_channel quantization.\n"
+
+        if optimizations_possible:
+            further_optims_str += (
+                "To use per_channel quantization, make sure the qconfig has a per_channel weight observer."
+            )
+        else:
+            further_optims_str += "No further per_channel optimizations possible."
+
+        # return the string and the dictionary form of same information
+        return (further_optims_str, per_channel_info)
+
+
+class DynamicStaticDetector(DetectorBase):
+    r"""
+    Determines whether dynamic or static quantization is more appropriate for a given module.
+
+    Takes advantage of the ModelReportObserver that records range information.
+    Stationary distributions of data are strictly above the tolerance level for the comparison statistic:
+
+        S = average_batch_activation_range/epoch_activation_range
+
+    Nonstationary distributions are below or at the tolerance level for this metric.
+
+    If the distribution of data right after the module is non-stationary, recommend dynamic quantization
+        Otherwise recommend static quantization
+
+    Args:
+        tolerance (float, optional): The threshold for the S metric: values above it are considered stationary, non-stationary otherwise. Default: 0.5
+    """
+    # names for the pre and post observers that are inserted
+    DEFAULT_PRE_OBSERVER_NAME = "model_report_pre_observer"
+    DEFAULT_POST_OBSERVER_NAME = "model_report_post_observer"
+
+    # naming conventions for stationary vs non-stationary data
+    STATIONARY_STR = "stationary"
+    NON_STATIONARY_STR = "non-stationary"
+
+    # naming for activation
+    INPUT_ACTIVATION_PREFIX = "input_activation_"
+    OUTPUT_ACTIVATION_PREFIX = "output_activation_"
+
+    # naming conventions for the keys of the return module info
+    TOLERANCE_KEY = "dynamic_static_tolerance"
+    DEFAULT_DYNAMIC_REC_KEY = "dynamic_recommended"
+    PRE_OBS_COMP_STAT_KEY = INPUT_ACTIVATION_PREFIX + "dynamic_static_comp_stat"
+    POST_OBS_COMP_STAT_KEY = OUTPUT_ACTIVATION_PREFIX + "dynamic_static_comp_stat"
+    PRE_OBS_DATA_DIST_KEY = INPUT_ACTIVATION_PREFIX + "dynamic_static_data_classification"
+    POST_OBS_DATA_DIST_KEY = OUTPUT_ACTIVATION_PREFIX + "dynamic_static_data_classification"
+    IS_CURRENTLY_SUPPORTED_KEY = "is_dynamic_supported"
+
+    # modules that are supported both dynamic and static for this report function
+    DEFAULT_DYNAMIC_STATIC_CHECK_SUPPORTED = {nn.Linear}
+
+    # modules that will be supported soon for both
+    DEFAULT_DYNAMIC_STATIC_FUTURE_SUPPORTED = {nn.Conv1d, nn.Conv2d, nn.Conv3d}
+
+    def __init__(self, tolerance=0.5):
+        super().__init__()
+
+        # set tolerance level and initialize a set to keep track of useful fqn locations
+        self.tolerance = tolerance
+        self.useful_observer_fqns: Set[str] = set()
+
+    def determine_observer_insert_points(self, prepared_fx_model: GraphModule) -> Dict[str, Dict[str, Any]]:
+        r"""
+        Determines where observers need to be inserted for the Dynamic vs Static detector.
+        For this detector, we want to place observers on either side of linear layers in the model.
+
+        Currently inserts observers for:
+            linear layers
+
+        Args:
+            prepared_fx_model (GraphModule):  The prepared Fx GraphModule
+
+        Returns a Dict mapping from unique observer fqns (where we want to insert them) to a Dict with:
+            key "target_node" -> the node we are trying to observe with this observer (torch.fx.node.Node)
+            key "observer_to_insert" -> the observer we wish to insert (ObserverBase)
+            key "is_post_observer" -> True if this is meant to be a post-observer for target_node, False if pre-observer
+            key "observer_args" -> The arguments that are meant to be passed into the observer
+        """
+
+        # observer for this detector is ModelReportObserver
+        obs_ctr = ModelReportObserver
+
+        # return dict
+        obs_fqn_to_info: Dict[str, Dict[str, Any]] = {}
+
+        for fqn, module in prepared_fx_model.named_modules():
+            # make sure module is supported
+            if self._is_supported(module, insert=True):
+                # if it's a supported type, we want to get node and add observer insert locations
+                targeted_node = self._get_targeting_node(prepared_fx_model, fqn)
+
+                # add entry for pre-observer
+                pre_obs_fqn = fqn + "." + self.DEFAULT_PRE_OBSERVER_NAME
+
+                obs_fqn_to_info[pre_obs_fqn] = {
+                    DETECTOR_TARGET_NODE_KEY: targeted_node,
+                    DETECTOR_OBS_TO_INSERT_KEY: obs_ctr(),
+                    DETECTOR_IS_POST_OBS_KEY: False,
+                    DETECTOR_OBS_ARGS_KEY: targeted_node.args
+                }
+
+                # add entry for post-observer
+                post_obs_fqn = fqn + "." + self.DEFAULT_POST_OBSERVER_NAME
+
+                obs_fqn_to_info[post_obs_fqn] = {
+                    DETECTOR_TARGET_NODE_KEY: targeted_node,
+                    DETECTOR_OBS_TO_INSERT_KEY: obs_ctr(),
+                    DETECTOR_IS_POST_OBS_KEY: True,
+                    DETECTOR_OBS_ARGS_KEY: (targeted_node,)
+                }
+
+        return obs_fqn_to_info
+
+    def get_detector_name(self) -> str:
+        r""" returns the string name of this detector"""
+        return "dynamic_vs_static_detector"
+
+
+    def get_qconfig_info(self, model) -> Dict[str, DetectorQConfigInfo]:
+        r""" Returns the DetectorQConfigInfo for each module_fqn relevant
+        Args
+            model (nn.Module or subclass): model to find observer insertion points
+
+        Returns a Dict mapping from unique observer fqns (where we want to insert them) to:
+            A DetectorQConfigInfo with the information to generate a QConfig for a specific module
+        """
+        # run the helper function to populate the dictionary
+        dynamic_static_info = self._generate_dict_info(model)
+
+        # we actually have a qconfig info object we are populating
+        module_fqn_to_detector_qconfig_info = {}
+
+        for module_fqn in dynamic_static_info:
+            # create a detector info instance
+            detector_qconfig_info = DetectorQConfigInfo(module_fqn)
+
+            # see if per channel quantization is supported
+            dynamic_static_recommended: bool = dynamic_static_info[module_fqn][self.DEFAULT_DYNAMIC_REC_KEY]
+            detector_qconfig_info.is_activation_dynamic = dynamic_static_recommended
+            module_fqn_to_detector_qconfig_info[module_fqn] = detector_qconfig_info
+
+        return module_fqn_to_detector_qconfig_info
+
+    def _is_supported(self, module: nn.Module, insert: bool = False) -> bool:
+        r"""Returns whether the given module is supported for observers
+
+        Args
+            module: The module to check and ensure is supported
+            insert: True if this check is for observer insertion, False if for report generation
+
+        Returns True if the module is supported by observer, False otherwise
+        """
+        # check to see if module is of a supported type
+        is_supported_type = any(isinstance(module, x) for x in self.DEFAULT_DYNAMIC_STATIC_CHECK_SUPPORTED)
+
+        # check if it will be supported
+        future_supported_type = any(isinstance(module, x) for x in self.DEFAULT_DYNAMIC_STATIC_FUTURE_SUPPORTED)
+
+        # supported
+        supported = is_supported_type or future_supported_type
+
+        # this check is for observer insertion
+        if insert:
+            return supported
+        else:
+            # this is for report gen and we also need to check if it contains observers
+            has_obs = hasattr(module, self.DEFAULT_PRE_OBSERVER_NAME) and hasattr(module, self.DEFAULT_POST_OBSERVER_NAME)
+            return supported and has_obs
+
+    def _generate_dict_info(self, model: GraphModule) -> Dict[str, Any]:
+        r"""
+        Helper function for generate_detector_report that does the generation of the dictionary.
+        This process is done as specified in generate_detector_report documentation
+
+        Args:
+            model (GraphModule): The prepared and calibrated GraphModule with inserted ModelReportObservers
+
+        Returns a Dictionary mapping modules with ModelReportObservers around them to:
+                whether dynamic quantization is recommended
+                their S metric of input to module
+                whether input to module is stationary or non-stationary
+                their S metric of output of module
+                whether output of module is stationary or non-stationary
+                the tolerance level used to decide whether the input/output is stationary or non-stationary
+                whether it is currently supported or planned for the future
+        """
+        # store modules dynamic vs static information
+        module_dynamic_static_info = {}
+
+        # This for loop goes through the modules, and extracts all relevant information into module_dynamic_static_info
+        #   This information primarily includes whether the data distributions around a supported module are stationary or not
+        #   Based on this, it is recorded whether dynamic or static quantization is recommended
+
+        # loop through all submodules, including nested ones
+        for fqn, module in model.named_modules():
+            # if the module is supported and has the ModelReportObserver attached to it
+            if self._is_supported(module):
+                # get pre and post observers for the module
+                pre_obs = getattr(module, self.DEFAULT_PRE_OBSERVER_NAME)
+                post_obs = getattr(module, self.DEFAULT_POST_OBSERVER_NAME)
+
+                # get the statistics for each module
+                pre_stat = pre_obs.get_batch_to_epoch_ratio()
+                post_stat = post_obs.get_batch_to_epoch_ratio()
+
+                # record module, pre and post stat, and whether to do dynamic or static based off it
+                # true if post observer data distribution is non-stationary, false if it's stationary
+                dynamic_recommended = post_stat <= self.tolerance
+
+                # specify the classifications for whether data distributions considered stationary or non-stationary
+                pre_obs_dist_classif = self.STATIONARY_STR if pre_stat > self.tolerance else self.NON_STATIONARY_STR
+                post_obs_dist_classif = self.STATIONARY_STR if post_stat > self.tolerance else self.NON_STATIONARY_STR
+
+                # check if current support or future support
+                is_supported_type = any(isinstance(module, x) for x in self.DEFAULT_DYNAMIC_STATIC_CHECK_SUPPORTED)
+
+                # store the set of important information for this module
+                module_info = {
+                    self.TOLERANCE_KEY: self.tolerance,
+                    self.DEFAULT_DYNAMIC_REC_KEY: dynamic_recommended,
+                    self.PRE_OBS_COMP_STAT_KEY: pre_stat,
+                    self.PRE_OBS_DATA_DIST_KEY: pre_obs_dist_classif,
+                    self.POST_OBS_COMP_STAT_KEY: post_stat,
+                    self.POST_OBS_DATA_DIST_KEY: post_obs_dist_classif,
+                    self.IS_CURRENTLY_SUPPORTED_KEY: is_supported_type,
+                }
+
+                module_dynamic_static_info[fqn] = module_info
+
+        return module_dynamic_static_info
+
+    def generate_detector_report(self, model: GraphModule) -> Tuple[str, Dict[str, Any]]:
+        r"""
+        Determines whether dynamic or static quantization is more appropriate for a given module.
+
+        Takes advantage of the ModelReportObserver that records range information.
+        Stationary distributions of data are strictly above the tolerance level for the comparison statistic:
+
+            S = average_batch_activation_range/epoch_activation_range
+
+        Non-stationary distributions are at or below the tolerance level for this metric.
+
+        If the distribution of data right after the module is non-stationary, recommend dynamic quantization
+            Otherwise recommend static quantization
+
+        This will then generate suggestions for dynamic vs static quantization focused around Linear.
+
+        Args:
+            model (GraphModule): The prepared and calibrated GraphModule with inserted ModelReportObservers
+
+        Returns a tuple with two elements:
+            String report of whether dynamic or static quantization is recommended for certain modules
+            Dictionary mapping modules with ModelReportObservers around them to:
+                whether dynamic quantization is recommended
+                their S metric of input to module
+                whether input to module is stationary or non-stationary
+                their S metric of output of module
+                whether output of module is stationary or non-stationary
+                the tolerance level used to decide whether the input/output is stationary or non-stationary
+                whether it is currently supported or planned for the future
+        """
+
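+        # Worked sketch of the decision rule above (hypothetical numbers, for illustration
+        # only): with tolerance = 0.5, a module whose post-observer statistic is
+        # S = average_batch_activation_range / epoch_activation_range = 0.3 satisfies
+        # S <= tolerance, so its output is treated as non-stationary and dynamic
+        # quantization is suggested; S = 0.8 would be treated as stationary and static
+        # quantization suggested instead.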
+        # get the dictionary of the information to format the string report
+        module_dynamic_static_info = self._generate_dict_info(model)
+
+        dynamic_vs_static_string = "Dynamic vs. Static Quantization suggestions: \n"
+
+        modules_added: bool = False  # check to make sure at least 1 module added.
+
+        dynamic_benefit = " You will get more accurate results if you use dynamic quantization"
+        static_benefit = " You can increase model efficiency if you use static quantization"
+        future_support_str = ". This layer is not yet supported for dynamic quantization"
+        # This for loop goes through the information collected in module_dynamic_static_info and:
+        #   Populates the string based report with the information from module_dynamic_static_info
+        #   Compiles the complete report by appending relevant formatted strings
+
+        for module_fqn in module_dynamic_static_info.keys():
+
+            # there is at least 1 module for suggestion
+            modules_added = True
+            module_info = module_dynamic_static_info[module_fqn]
+            suggestion_string_template = "For module {} it is suggested to use {} quantization because {}.\n"
+
+            # decide what string formatting values will be
+            quantization_type = ""
+            quantization_reasoning = "the distribution of data before {} is {} and the distribution after is {}."
+
+            benefit_str = ""
+
+            # strings for if dynamic quantized per tensor is needed
+            recommend_per_tensor = ". We recommend adding a {} before this module if it is static."
+            rec_lay_to_add = "dynamic quantize per tensor layer"
+            dynamic_per_tensor_string = recommend_per_tensor.format(rec_lay_to_add)
+            dynamic_per_tensor_reasoning_string = (
+                " This is because the input to this module has a non-stationary distribution"
+            )
+
+            # start composing explanation
+            if module_info[self.DEFAULT_DYNAMIC_REC_KEY]:
+                quantization_type = "dynamic"
+                # check if currently supported or future supported
+                benefit_str = dynamic_benefit
+                if not module_info[self.IS_CURRENTLY_SUPPORTED_KEY]:
+                    benefit_str += future_support_str
+            else:
+                quantization_type = "static"
+                benefit_str = static_benefit
+
+            # now set the quantization explanation string
+            quantization_reasoning = (
+                quantization_reasoning.format(
+                    module_fqn, module_info[self.PRE_OBS_DATA_DIST_KEY], module_info[self.POST_OBS_DATA_DIST_KEY]
+                )
+                + benefit_str
+            )
+
+            # if we have a non-stationary input -> linear -> stationary, we suggest static
+            # however, we also want to recommend adding a dynamic quantize per tensor layer right before this module
+            if (
+                module_info[self.PRE_OBS_DATA_DIST_KEY] == self.NON_STATIONARY_STR
+                and module_info[self.POST_OBS_DATA_DIST_KEY] == self.STATIONARY_STR
+            ):
+                quantization_reasoning = (
+                    quantization_reasoning + dynamic_per_tensor_string + dynamic_per_tensor_reasoning_string
+                )
+
+            # format the overall suggestion string with the specific inputs
+            module_suggestion_string = suggestion_string_template.format(
+                module_fqn, quantization_type, quantization_reasoning
+            )
+
+            # append to overall suggestion
+            dynamic_vs_static_string += module_suggestion_string
+
+        if not modules_added:
+            dynamic_vs_static_string += "No applicable layers for suggestions. Only linear and conv are valid.\n"
+
+        # return the string as well as the dictionary of information
+        return (dynamic_vs_static_string, module_dynamic_static_info)
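+    # Minimal usage sketch (assumes a prepared and calibrated GraphModule bound to a
+    # placeholder name `prepared_model`; the tolerance value is just an example):
+    #
+    #     detector = DynamicStaticDetector(tolerance=0.5)
+    #     report_str, report_dict = detector.generate_detector_report(prepared_model)
+    #     print(report_str)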
+
+
+class InputWeightEqualizationDetector(DetectorBase):
+    r"""
+    Determines whether input-weight equalization can help improve quantization for certain modules.
+
+    Specifically, this list of modules includes:
+        linear
+        conv
+
+    Determines whether input-weight equalization is recommended based on the comp stat:
+        s_c = sqrt(w_c/W)/sqrt(i_c/I)
+        where:
+            w_c is range of weight for channel c, W is range of weight over all channels
+            i_c is range of input for channel c, I is range of input over all channels
+
+        if threshold <= s_c <= 1 / threshold, input-weight equalization is recommended
+
+    Args:
+        ratio_threshold (float): The threshold for s_c to determine if input-weight equalization is suggested
+            Should be between 0 and 1 (both non-inclusive)
+        ch_axis (int, optional): The channel axis being observed to determine input weight equalization
+            Default: 1
+
+    * :attr:`ratio_threshold`: The threshold for s_c to determine if input-weight equalization is suggested
+        Should be between 0 and 1
+
+    * :attr:`ch_axis`: The channel axis being observed to determine input weight equalization
+
+    * :attr:`SUPPORTED_MODULES`: This specifies the modules that are supported for input-weight equalization
+
+    * :attr:`DEFAULT_PRE_OBSERVER_NAME`: The name of the pre-observer to be inserted for this detector
+    """
+
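+    # Worked example of the comparison statistic (hypothetical values, illustration only):
+    # for a channel with weight range w_c = 0.5, global weight range W = 2.0, input range
+    # i_c = 1.0 and global input range I = 4.0,
+    #     s_c = sqrt(0.5 / 2.0) / sqrt(1.0 / 4.0) = sqrt(0.25) / sqrt(0.25) = 1.0,
+    # which lies inside [ratio_threshold, 1 / ratio_threshold] for any threshold in (0, 1),
+    # so input-weight equalization would be recommended for that channel.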
+    SUPPORTED_MODULES: Set[Callable] = {nn.Linear,
+                                        nn.Conv1d,
+                                        nn.Conv2d,
+                                        nn.Conv3d,
+                                        nnqat.Linear,
+                                        nnqat.Conv1d,
+                                        nnqat.Conv2d,
+                                        nnqat.Conv3d}
+
+    # names for the pre and post observers that are inserted
+    DEFAULT_PRE_OBSERVER_NAME: str = "model_report_pre_observer"
+
+    # weight / activation prefix for each of the below info
+    WEIGHT_PREFIX = "weight_"
+    ACTIVATION_PREFIX = "input_activation_"
+
+    # string names for keys of info dictionaries
+    PER_CHANNEL_MAX_KEY = "per_channel_max"
+    PER_CHANNEL_MIN_KEY = "per_channel_min"
+    GLOBAL_MAX_KEY = "global_max"
+    GLOBAL_MIN_KEY = "global_min"
+
+    # keys for return dict of recommendations
+    RECOMMENDED_KEY = "input_weight_equalization_recommended"
+    COMP_METRIC_KEY = "input_weight_channel_comparison_metrics"
+    THRESHOLD_KEY = "input_weight_threshold"
+    CHANNEL_KEY = "input_weight_channel_axis"
+
+    # default weight and info strings
+    WEIGHT_STR = "weight"
+    INPUT_STR = "input"
+
+    # default for what ratio we recommend input weight
+    DEFAULT_RECOMMEND_INPUT_WEIGHT_CHANNEL_RATIO = 0.4
+
+    def __init__(self, ratio_threshold: float, ch_axis: int = 1):
+        # ensure passed in inputs are valid
+        if ratio_threshold <= 0 or ratio_threshold >= 1:
+            raise ValueError("Make sure threshold is > 0 and < 1")
+
+        # initialize attributes based on args
+        self.ratio_threshold: float = ratio_threshold
+        self.ch_axis: int = ch_axis
+
+    def _is_supported(self, module: nn.Module, insert: bool = False) -> bool:
+        r"""Returns whether the given module is supported for observers
+
+        Args
+            module: The module to check and ensure is supported
+            insert: True if this check is for observer insertion, False if it is for report generation
+
+        Returns True if the module is supported by observer, False otherwise
+        """
+        # check to see if module is of a supported type
+        is_supported_type = any(type(module) is x for x in self.SUPPORTED_MODULES)
+
+        # this check is for observer insertion
+        if insert:
+            return is_supported_type
+        else:
+            # this is for report gen and we also need to check if it contains observers
+            has_obs = hasattr(module, self.DEFAULT_PRE_OBSERVER_NAME)
+            return is_supported_type and has_obs
+
+    def get_qconfig_info(self, model) -> Dict[str, DetectorQConfigInfo]:
+        r""" Returns the DetectorQConfigInfo for each relevant module_fqn
+        Args
+            model (nn.Module or subclass): model to find observer insertion points
+
+        Returns a Dict mapping from unique observer fqns (where we want to insert them) to:
+            A DetectorQConfigInfo with the information to generate a QConfig for a specific module
+        """
+        # run the helper function to populate the dictionary
+        # find the range of inputs
+        input_values: Dict[str, Dict] = self._extract_input_info(model)
+
+        # find the range of weights
+        weight_values: Dict[str, Dict] = self._extract_weight_info(model)
+
+        # calculate per_channel comparison statistic s_c
+        comp_stats: Dict[str, torch.Tensor] = self._generate_comparison_values(input_values, weight_values)
+
+        # generate the return dictionary
+        input_weight_equalization_info: Dict[str, Dict] = self._generate_dict_info(input_values, weight_values, comp_stats)
+
+        # we actually have a qconfig info object we are populating
+        module_fqn_to_detector_qconfig_info = {}
+
+        for module_fqn in input_weight_equalization_info:
+            # create a detector info instance
+            detector_qconfig_info = DetectorQConfigInfo(module_fqn)
+
+            # record whether input-weight equalization is recommended for this module
+            input_weight_recommended: bool = input_weight_equalization_info[module_fqn][self.RECOMMENDED_KEY]
+            detector_qconfig_info.is_equalization_recommended = input_weight_recommended
+            module_fqn_to_detector_qconfig_info[module_fqn] = detector_qconfig_info
+
+        return module_fqn_to_detector_qconfig_info
+
+    def determine_observer_insert_points(self, prepared_fx_model: GraphModule) -> Dict[str, Dict[str, Any]]:
+        r"""Determines where observers need to be inserted for the Input Weight Equalization Detector.
+        For this detector, we want to place observers in front of supported layers.
+
+        Currently inserts observers for:
+            linear layers
+            conv layers
+
+        Args:
+            prepared_fx_model (GraphModule):  The prepared Fx GraphModule
+
+        Returns a Dict mapping from unique observer fqns (where we want to insert them) to a Dict with:
+            key "target_node" -> the node we are trying to observe with this observer (torch.fx.node.Node)
+            key "observer_to_insert" -> the observer we wish to insert (ObserverBase)
+            key "is_post_observer" -> True if this is meant to be a post-observer for target_node, False if pre-observer
+            key "observer_args" -> The arguments that are meant to be passed into the observer
+        """
+
+        # observer for this detector is ModelReportObserver
+        obs_ctr = ModelReportObserver
+
+        # return dict
+        obs_fqn_to_info: Dict[str, Dict[str, Any]] = {}
+
+        for fqn, module in prepared_fx_model.named_modules():
+            # check to see if module is of a supported type
+            if self._is_supported(module, insert=True):
+                # if it's a supported type, we want to get node and add observer insert locations
+                targeted_node = self._get_targeting_node(prepared_fx_model, fqn)
+
+                # add entry for pre-observer
+                pre_obs_fqn = fqn + "." + self.DEFAULT_PRE_OBSERVER_NAME
+
+                obs_fqn_to_info[pre_obs_fqn] = {
+                    DETECTOR_TARGET_NODE_KEY: targeted_node,
+                    DETECTOR_OBS_TO_INSERT_KEY: obs_ctr(ch_axis=self.ch_axis),
+                    DETECTOR_IS_POST_OBS_KEY: False,
+                    DETECTOR_OBS_ARGS_KEY: targeted_node.args,
+                }
+
+        return obs_fqn_to_info
+
+    def get_detector_name(self) -> str:
+        r"""Returns the name of this detector"""
+        return "input_weight_equalization_detector"
+
+    def _extract_input_info(self, model: GraphModule) -> Dict[str, Dict]:
+        r"""
+        Takes in a calibrated GraphModule and then finds the relevant observers.
+        It then extracts the input information for each observer and returns it
+
+        Args
+            model (GraphModule): The prepared and calibrated GraphModule with inserted ModelReportObservers
+
+        Returns a dict mapping relevant module fqns (str) to a dict with keys:
+            "input_activation_per_channel_max" : maps to the per_channel max values
+            "input_activation_per_channel_min" : maps to the per_channel min values
+            "input_activation_global_max" : maps to the global max recorded
+            "input_activation_global_min" : maps to the global min recorded
+        """
+
+        # return dictionary mapping observer fqns to desired info
+        input_info: Dict[str, Dict] = {}
+
+        for fqn, module in model.named_modules():
+            # if module is supported and it has a pre-observer
+            if self._is_supported(module):
+                # get pre observer for the module
+                pre_obs = getattr(module, self.DEFAULT_PRE_OBSERVER_NAME)
+
+                input_info[fqn] = {
+                    self.ACTIVATION_PREFIX + self.PER_CHANNEL_MAX_KEY: pre_obs.max_val,
+                    self.ACTIVATION_PREFIX + self.PER_CHANNEL_MIN_KEY: pre_obs.min_val,
+                    self.ACTIVATION_PREFIX + self.GLOBAL_MAX_KEY: max(pre_obs.max_val),
+                    self.ACTIVATION_PREFIX + self.GLOBAL_MIN_KEY: min(pre_obs.min_val),
+                }
+
+        return input_info
+
+    def _extract_weight_info(self, model: GraphModule) -> Dict[str, Dict]:
+        r"""
+        Takes in a calibrated GraphModule and then finds the relevant observers.
+        It then extracts the weight information for each layer an observer is attached to.
+
+        Args
+            model (GraphModule): The prepared and calibrated GraphModule with inserted ModelReportObservers
+
+        Returns a dict mapping module fqns (str) to a dict with keys:
+            "per_channel_max" : maps to the per_channel max values
+            "per_channel_min" : maps to the per_channel min values
+            "global_max" : maps to the global max recorded
+            "global_min" : maps to the global min recorded
+        """
+        # return dictionary mapping observer fqns to desired info
+        weight_info: Dict[str, Dict] = {}
+
+        for fqn, module in model.named_modules():
+            # if module is supported and it has a pre-observer
+            if self._is_supported(module):
+                # we don't need actual observer, just the module weights
+                # calculate min and max vals
+                device = module.weight.device
+                min_val: torch.Tensor = torch.tensor([float('inf')], device=device)
+                max_val: torch.Tensor = torch.tensor([float('-inf')], device=device)
+                x_copy = module.weight
+                x_dim = x_copy.size()
+
+                new_axis_list = [i for i in range(len(x_dim))]  # noqa: C416
+                new_axis_list[self.ch_axis] = 0
+                new_axis_list[0] = self.ch_axis
+                y = x_copy.permute(new_axis_list)
+
+                # Need to match dtype of min/max because the updates to buffers
+                # are done in place and types need to match for comparisons
+                y = y.to(min_val.dtype)
+                y = torch.flatten(y, start_dim=1)
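+                # Shape sketch (hypothetical sizes): for a Conv2d weight of shape
+                # (out=8, in=4, kH=3, kW=3) with ch_axis = 1, the permute above moves the 4
+                # input channels to dim 0 and the flatten yields y of shape (4, 72), so
+                # torch.aminmax(y, dim=1) below returns 4 per-channel min/max values.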
+                if min_val.numel() == 0 or max_val.numel() == 0:
+                    min_val, max_val = torch.aminmax(y, dim=1)
+                else:
+                    min_val_cur, max_val_cur = torch.aminmax(y, dim=1)
+                    min_val = torch.min(min_val_cur, min_val)
+                    max_val = torch.max(max_val_cur, max_val)
+
+                weight_info[fqn] = {
+                    self.WEIGHT_PREFIX + self.PER_CHANNEL_MAX_KEY: max_val,
+                    self.WEIGHT_PREFIX + self.PER_CHANNEL_MIN_KEY: min_val,
+                    self.WEIGHT_PREFIX + self.GLOBAL_MAX_KEY: max(max_val),
+                    self.WEIGHT_PREFIX + self.GLOBAL_MIN_KEY: min(min_val),
+                }
+
+        return weight_info
+
+    def _calculate_range_ratio(self, info_dict: Dict, info_str: str, module_fqn: str) -> torch.Tensor:
+        r"""
+        Takes in an info dict and calculates the per-channel to global range ratio used in the s_c statistic.
+
+        Args:
+            info_dict (dict): A dictionary of either input or weight range info
+            info_str (str): A str describing whether currently looking at weight or input info
+                Either "weight" or "input"
+            module_fqn (str): The fqn of the module we are looking at
+
+        Returns a tensor of values, where each value is the s_c stat for a different channel
+        """
+        # calculate the ratios of the info
+        # get the prefix str
+        prefix_str = self.ACTIVATION_PREFIX if info_str == self.INPUT_STR else self.WEIGHT_PREFIX
+
+        per_channel_range = info_dict[prefix_str + self.PER_CHANNEL_MAX_KEY] - info_dict[prefix_str + self.PER_CHANNEL_MIN_KEY]
+        global_range = info_dict[prefix_str + self.GLOBAL_MAX_KEY] - info_dict[prefix_str + self.GLOBAL_MIN_KEY]
+
+        if global_range == 0:
+            range_zero_explanation = "We recommend removing this channel as it doesn't provide any useful information."
+            raise ValueError(
+                "The range of the {} data for module {} is 0, which means you have a constant value channel. {}".format(
+                    info_str, module_fqn, range_zero_explanation
+                )
+            )
+
+        ratio = per_channel_range / global_range
+
+        return ratio
+
+    def _generate_comparison_values(self, input_info: Dict, weight_info: Dict) -> Dict[str, torch.Tensor]:
+        r"""
+        Takes in the information on the min and max values of the inputs and weights and:
+            Calculates the comp stat for each channel: s_c = sqrt(w_c/W)/sqrt(i_c/I)
+
+        Args:
+            input_info (dict): A dict mapping each observer to input range information
+            weight_info (dict): A dict mapping each observer to weight range information
+
+        Returns a dict mapping relevant observer fqns (str) to a 1-D tensor.
+            Each value is a different s_c value for a different channel
+        """
+        # create return dictionary for each observer
+        module_fqn_to_channel: Dict[str, torch.Tensor] = {}
+
+        # for each module (both passed in dicts should have same keys)
+        for module_fqn in input_info:
+
+            # raise error if not in weight info
+            if module_fqn not in weight_info:
+                raise KeyError(f"Unable to find weight range stats for module {module_fqn}")
+
+            # calculate the ratios of the weight info and input info
+            weight_ratio = self._calculate_range_ratio(weight_info[module_fqn], self.WEIGHT_STR, module_fqn)
+            input_ratio = self._calculate_range_ratio(input_info[module_fqn], self.INPUT_STR, module_fqn)
+
+            # if mismatched size, because of grouping, we want to replicate weight enough times
+            weight_channels = len(weight_ratio)
+            input_channels = len(input_ratio)
+            if weight_channels != input_channels:
+                # we try to replicate
+                assert input_channels % weight_channels == 0, "input channels should be divisible by weight channels."
+                # get replication factor
+                rep_factor: int = input_channels // weight_channels
+
+                # weight ratio is (n,), input ratio is (k,), we just repeat weight ratio k // n times
+                weight_ratio = weight_ratio.repeat(rep_factor)
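+                # e.g. (hypothetical sizes) 2 weight channels vs. 8 input channels from a
+                # grouped convolution gives rep_factor = 4, so tensor([a, b]) becomes
+                # tensor([a, b, a, b, a, b, a, b]) before the element-wise ratio below.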
+
+            # calculate the s metric per channel
+            s = torch.sqrt(weight_ratio) / torch.sqrt(input_ratio)
+            module_fqn_to_channel[module_fqn] = s
+
+        # return compiled observer ratios
+        return module_fqn_to_channel
+
+    def _generate_dict_info(self, input_info: Dict, weight_info: Dict, comp_stats: Dict) -> Dict[str, Dict]:
+        r"""
+        Helper function for generate_detector_report that does the generation of the dictionary.
+        This process is done as specified in generate_detector_report documentation
+
+        Args:
+            input_info (dict): A dict mapping each module to input range information
+            weight_info (dict): A dict mapping each module to weight range information
+            comp_stats (dict): A dict mapping each module to its corresponding comp stat
+
+        Returns a dictionary mapping each module with relevant ModelReportObservers around them to:
+            whether input weight equalization is recommended
+            their s_c metric compared to the threshold
+            the threshold used to make the recommendation
+            the channel used for recording data
+            the input channel range info
+            the weight channel range info
+        """
+        # store modules input weight equalization info
+        input_weight_equalization_info: Dict[str, Dict] = {}
+
+        # for each module we add separate set of suggestions
+        for module_fqn in input_info:
+
+            # get relevant info for this module
+            mod_input_info: Dict = input_info[module_fqn]
+            mod_weight_info: Dict = weight_info[module_fqn]
+            mod_comp_stat: torch.Tensor = comp_stats[module_fqn]
+
+            # decide if each channel should have input weight equalization or not
+            channel_rec_vals: list = []
+
+            for val in mod_comp_stat:
+                float_rep: float = val.item()
+
+                # decide if recommending input weight equalization
+                recommended: bool = float_rep >= self.ratio_threshold and float_rep <= 1 / self.ratio_threshold
+                channel_rec_vals.append(recommended)
+
+            # build the return dict input
+            # also unpack input and weight dicts into it
+            input_weight_equalization_info[module_fqn] = {
+                self.RECOMMENDED_KEY: channel_rec_vals,
+                self.COMP_METRIC_KEY: mod_comp_stat,
+                self.THRESHOLD_KEY: self.ratio_threshold,
+                self.CHANNEL_KEY: self.ch_axis,
+                **mod_input_info,
+                **mod_weight_info,
+            }
+
+        # return our compiled info for each module
+        return input_weight_equalization_info
+
+    def generate_detector_report(self, model: GraphModule) -> Tuple[str, Dict[str, Any]]:
+        r"""
+        Determines whether input weight equalization is appropriate for a given module.
+
+        Takes advantage of the ModelReport Observer which records per channel information of input range
+        It then uses this in conjunction with the extracted weight information to compute the desired ratio
+        Finally, it gives suggestions based on this information for each module of interest
+
+        Args:
+            model (GraphModule): The prepared and calibrated GraphModule with inserted ModelReportObservers
+
+        Returns a tuple with two elements:
+            String report of whether input weight equalization is recommended for certain modules
+            Dictionary mapping modules of interest to:
+                whether input weight equalization is recommended
+                their s_c metric compared to the threshold
+                the threshold used to make the recommendation
+                the channel used for recording data
+                the input channel range info
+                the weight channel range info
+        """
+
+        # find the range of inputs
+        input_values: Dict[str, Dict] = self._extract_input_info(model)
+
+        # find the range of weights
+        weight_values: Dict[str, Dict] = self._extract_weight_info(model)
+
+        # calculate per_channel comparison statistic s_c
+        comp_stats: Dict[str, torch.Tensor] = self._generate_comparison_values(input_values, weight_values)
+
+        # generate the return dictionary
+        input_weight_equalization_info: Dict[str, Dict] = self._generate_dict_info(input_values, weight_values, comp_stats)
+
+        # now we can generate report based on this information
+        input_weight_string = "Input-Weight Equalization suggestions: \n"
+
+        # some strings to be formatted depending on module we are adding
+        module_suggestion_str = "For Module {} looked at with axis {}: \n"
+        channel_suggestion_str = "\tWe suggest {} input weight equalization because {}\n"
+        use_str = "to use"
+        no_use_str = "to not use"
+        input_weight_benefit_str = "{}/{} channels would benefit and we expect significant reduction in quantization error."
+        input_weight_non_benefit_reasoning = "{}/{} channels benefitting from input-weight equalization being applied."
+        input_weight_non_benefit_str = "we don't expect much improvement from input-weight equalization based on {}"
+
+        # added module check
+        added_module: bool = False
+
+        # compile the suggestion string
+        for module_fqn in input_weight_equalization_info:
+            # we added at least 1 module
+            added_module = True
+            # add the module level description
+            input_weight_string += module_suggestion_str.format(module_fqn, self.ch_axis)
+
+            mod_info: Dict[str, Any] = input_weight_equalization_info[module_fqn]
+
+            # gather info on how many channels would benefit from input-weight equalization
+            recommendation_per_channel: List[bool] = mod_info[self.RECOMMENDED_KEY]
+            num_recs = sum(recommendation_per_channel)
+
+            if num_recs / len(recommendation_per_channel) >= self.DEFAULT_RECOMMEND_INPUT_WEIGHT_CHANNEL_RATIO:
+                input_benefit_formatted = input_weight_benefit_str.format(num_recs, len(recommendation_per_channel))
+                channel_str = channel_suggestion_str.format(use_str, input_benefit_formatted)
+                input_weight_string += channel_str
+            else:
+                non_benefit_reason_formatted = input_weight_non_benefit_reasoning.format(num_recs, len(recommendation_per_channel))
+                non_benefit_str = input_weight_non_benefit_str.format(non_benefit_reason_formatted)
+                channel_str = channel_suggestion_str.format(no_use_str, non_benefit_str)
+                input_weight_string += channel_str
+
+        # if no modules looked at, amend return string
+        if not added_module:
+            input_weight_string += "No applicable layers for suggestions. Only linear and conv are valid.\n"
+
+        # return a tuple with the string explanation and the compiled dict info
+        return (input_weight_string, input_weight_equalization_info)
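+    # Minimal usage sketch (assumes a prepared and calibrated GraphModule bound to a
+    # placeholder name `prepared_model`; the threshold value is just an example):
+    #
+    #     detector = InputWeightEqualizationDetector(ratio_threshold=0.7)
+    #     report_str, report_dict = detector.generate_detector_report(prepared_model)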
+
+
+class OutlierDetector(DetectorBase):
+    r"""
+    Determines whether there are significant outliers in activation data around a certain layer.
+
+    This is ideally used in conjunction with information on stationary vs. non-stationary distribution:
+        If the data is stationary, and there are significant outliers, then we want to flag them
+        We want to do this on a per channel basis for detecting outliers
+
+    Determines whether activation data is flagged as outlier based on if data is stationary and:
+        p_r = avg(100th percentile / "reference_percentile"th percentile)
+        where:
+            p_r is average percentile ratio across all batches in the epoch
+            reference_percentile is given as a fraction between 0 and 1 (exclusive), e.g. 0.975 for the 97.5th percentile
+
+        if p_r is above some threshold, then we consider the activations to have significant outliers
+
+    Args:
+        ratio_threshold (float, optional): The threshold for p_r to determine if there are outliers in activations
+            Should be >= 1
+            Default: 3.5
+        reference_percentile (float, optional): The denominator to find the relative scale of the 100th percentile
+            Should be between 0 and 1
+            Default: 0.975
+        fraction_batches_used_threshold (float, optional): Threshold of fraction of batches per channel to determine outlier
+            If the fraction is below this, we deem the number of samples used to calculate outliers insignificant and alert
+            the user to take a closer look at the channel results, regardless of whether outliers were detected in the channel
+            Should be between 0 and 1
+            Default: 0.95
+        ch_axis (int, optional): The channel axis being observed to determine input weight equalization
+            Default: 1
+
+    * :attr:`ratio_threshold`: The threshold for p_r to determine if there are outliers in activations
+        The p_r value (average ratio of 100th percentile/reference_percentile) is compared to ratio_threshold
+        If it is significantly greater, then we consider it an outlier
+        This threshold was calculated based on the ratio of the percentiles in a normal distribution
+        The calculations behind value choice: https://drive.google.com/file/d/1N2wdtXWI-kOH8S7HH4-PYB_NmqzZil4p/view?usp=sharing
+
+    * :attr:`reference_percentile`: The denominator of the top fraction to find the relative scale of the 100th percentile
+        Should be between 0 and 1
+        The calculations behind value choice: https://drive.google.com/file/d/1N2wdtXWI-kOH8S7HH4-PYB_NmqzZil4p/view?usp=sharing
+
+    * :attr:`fraction_batches_used_threshold`: The fraction of batches to determine outliers for each channel should be above this
+        Some batches may not be used because of 0-based errors, so this is to ensure a good amount of the total batches are used
+        Should be between 0 and 1
+
+    * :attr:`ch_axis`: The channel axis being observed to determine outliers
+
+    * :attr:`DEFAULT_PRE_OBSERVER_NAME`: The name of the pre-observer to be inserted for this detector
+    """
+
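+    # Worked sketch of the ratio test (hypothetical numbers, illustration only): with
+    # reference_percentile = 0.975 and ratio_threshold = 3.5, a channel whose 100th
+    # percentile averages 7.0 while its 97.5th percentile averages 1.0 has
+    #     p_r = 7.0 / 1.0 = 7.0 > 3.5,
+    # so that channel is flagged as containing significant outliers.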
+    # names for the pre observers that are inserted
+    DEFAULT_PRE_OBSERVER_NAME: str = "model_report_pre_observer"
+
+    # pre activation prefix
+    INPUT_ACTIVATION_PREFIX = "input_activation_"
+
+    # names for dict keys
+    OUTLIER_KEY = "outliers_detected"
+    NUM_BATCHES_KEY = "outlier_detection_batches_used"
+    IS_SUFFICIENT_BATCHES_KEY = "outlier_detection_is_sufficient_batches"
+    COMP_METRIC_KEY = "outlier_detection_percentile_ratios"
+    RATIO_THRES_KEY = "outlier_detection_ratio_threshold"
+    REF_PERCENTILE_KEY = "outlier_detection_reference_percentile"
+    CHANNEL_AXIS_KEY = "outlier_detection_channel_axis"
+    MAX_VALS_KEY = INPUT_ACTIVATION_PREFIX + "per_channel_max"
+    CONSTANT_COUNTS_KEY = "constant_batch_counts"
+
+    def __init__(
+        self,
+        ratio_threshold: float = 3.5,
+        reference_percentile: float = 0.975,
+        fraction_batches_used_threshold: float = 0.95,
+        ch_axis: int = 1,
+    ):
+        # initialize the variables of interest
+        self.ratio_threshold = ratio_threshold
+
+        # make sure passed in percentile is valid
+        assert reference_percentile >= 0 and reference_percentile <= 1
+        assert fraction_batches_used_threshold >= 0 and fraction_batches_used_threshold <= 1
+        self.reference_percentile = reference_percentile
+        self.fraction_batches_used_threshold = fraction_batches_used_threshold
+        self.ch_axis = ch_axis
+
+    def get_detector_name(self) -> str:
+        r"""Returns the name of this detector"""
+        return "outlier_detector"
+
+    def _supports_insertion(self, module: nn.Module) -> bool:
+        r"""Returns whether the given module is supported for observers insertion
+
+        Any module that doesn't have children and isn't an observer itself is supported
+
+        Args
+            module: The module to check and ensure is supported
+
+        Returns True if the module is supported by observer, False otherwise
+        """
+        # case for insertion of module
+        # check if the module has any children and isn't observer
+        num_children = len(list(module.children()))
+        return num_children == 0 and not _is_activation_post_process(module)
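+        # e.g. an nn.Linear or nn.ReLU leaf module qualifies for insertion, while a
+        # container such as nn.Sequential (which has children) or an already-inserted
+        # observer does not.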
+
+    def get_qconfig_info(self, model) -> Dict[str, DetectorQConfigInfo]:
+        r""" Returns the DetectorQConfigInfo for each relevant module_fqn
+        Args
+            model (nn.Module or subclass): model to find observer insertion points
+
+        Returns a Dict mapping from unique observer fqns (where we want to insert them) to:
+            A DetectorQConfigInfo with the information to generate a QConfig for a specific module
+        """
+        # currently doesn't do anything for outlier detector
+        return {}
+
+    def _supports_report_gen(self, module: nn.Module) -> bool:
+        r"""Returns whether the given module is supported for report generation
+
+        Any module that has a model report pre-observer is supported
+
+        Args
+            module: The module to check and ensure is supported
+
+        Returns True if the module is supported by observer, False otherwise
+        """
+        return hasattr(module, self.DEFAULT_PRE_OBSERVER_NAME)
+
+    def determine_observer_insert_points(self, prepared_fx_model: GraphModule) -> Dict[str, Dict[str, Any]]:
+        r""" Determines where observers need to be inserted for the Outlier Detector.
+
+        For this detector, we want to place observers in front of supported layers.
+
+        Currently inserts observers for:
+            all layers that do not have children (leaf level layers)
+
+        Args:
+            prepared_fx_model (GraphModule):  The prepared Fx GraphModule
+
+        Returns a Dict mapping from unique observer fqns (where we want to insert them) to a Dict with:
+            key "target_node" -> the node we are trying to observe with this observer (torch.fx.node.Node)
+            key "observer_to_insert" -> the observer we wish to insert (ObserverBase)
+            key "is_post_observer" -> True if this is meant to be a post-observer for target_node, False if pre-observer
+            key "observer_args" -> The arguments that are meant to be passed into the observer
+        """
+        # observer for this detector is ModelReportObserver
+        obs_ctr = ModelReportObserver
+
+        # return dict
+        obs_fqn_to_info: Dict[str, Dict[str, Any]] = {}
+
+        for fqn, module in prepared_fx_model.named_modules():
+            # check to see if module is of a supported type
+            if self._supports_insertion(module):
+                # if it's a supported type, we want to get node and add observer insert locations
+                targeted_node = self._get_targeting_node(prepared_fx_model, fqn)
+
+                # add entry for pre-observer
+                pre_obs_fqn = fqn + "." + self.DEFAULT_PRE_OBSERVER_NAME
+
+                obs_fqn_to_info[pre_obs_fqn] = {
+                    DETECTOR_TARGET_NODE_KEY: targeted_node,
+                    DETECTOR_OBS_TO_INSERT_KEY: obs_ctr(ch_axis=self.ch_axis, comp_percentile=self.reference_percentile),
+                    DETECTOR_IS_POST_OBS_KEY: False,
+                    DETECTOR_OBS_ARGS_KEY: targeted_node.args,
+                }
+
+        return obs_fqn_to_info
+
+    def _calculate_outlier_info(
+        self,
+        percentile_ratios: torch.Tensor,
+        counted_batches: torch.Tensor,
+        total_batches: int,
+    ) -> Dict[str, List[bool]]:
+        r"""
+        Gives info on whether the percentile ratios calculated would be considered outliers
+        Also gives information on whether the collected data is statistically significant to make this claim
+
+        Args:
+            percentile_ratios (torch.Tensor): The average percentile_ratios per channel calculated by the observer
+            counted_batches (torch.Tensor): The number of batches used for average calculation per tensor
+            total_batches (int): The total number of batches that passed through observer in this epoch
+
+        Returns a dictionary mapping:
+            "outliers_detected" : list of bools per channel that are true if it is considered an outlier
+            "is_sufficient_batches": if o_r was >= fraction_batches_used_threshold:
+                where o_r = counted_batches / total_batches
+        """
+        outlier_dict: Dict[str, List[bool]] = {self.OUTLIER_KEY: [], self.IS_SUFFICIENT_BATCHES_KEY: []}
+
+        # get both as flattened lists for easy mapping
+        ratios_list: List = percentile_ratios.tolist()
+        num_batches_list: List = counted_batches.tolist()
+
+        # calculate whether channels were statistically significant
+        significant_size = [
+            batch_size / total_batches >= self.fraction_batches_used_threshold for batch_size in num_batches_list
+        ]
+        outlier_dict[self.IS_SUFFICIENT_BATCHES_KEY] = significant_size
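+        # e.g. (hypothetical counts) with fraction_batches_used_threshold = 0.95 and
+        # total_batches = 100, a channel whose ratio was computed from 97 batches is marked
+        # sufficient (97 / 100 >= 0.95), while one that used only 90 batches is not.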
+
+        # calculate for each channel whether it's an outlier or not based on ratio
+        outlier_detected = [ratio > self.ratio_threshold for ratio in ratios_list]
+        outlier_dict[self.OUTLIER_KEY] = outlier_detected
+
+        # return the dictionary with the two lists
+        return outlier_dict
+
+    def _generate_info_dict(self, model: GraphModule) -> Dict[str, Dict]:
+        r"""
+        Helper function for generate_detector_report that does the generation of the dictionary.
+        This process is done as specified in generate_detector_report documentation
+
+        Args:
+            model (GraphModule): The prepared and calibrated GraphModule with inserted ModelReportObservers
+
+        Returns a dict mapping relevant module fqns to:
+            whether there were outliers found in activation before
+            the number of batches used for each channel
+            whether fraction of applicable batches used is above fraction_batches_used_threshold
+            their p_r metric compared to the threshold
+            the threshold used to make the recommendation
+            the reference_percentile used to make the recommendation
+            the channel axis used to determine individual channels
+            the constant batch counts per channel
+            the per channel max values
+        """
+        # return dictionary mapping observer fqns to desired info
+        info_dict: Dict[str, Dict] = {}
+
+        for fqn, module in model.named_modules():
+            # if module is supported and it has a pre-observer
+            if self._supports_report_gen(module):
+                # get pre observer for the module
+                pre_obs: ModelReportObserver = getattr(module, self.DEFAULT_PRE_OBSERVER_NAME)
+
+                # get the number of batches and calculated ratio thresholds
+                num_batches: torch.Tensor = pre_obs.percentile_batches_tracked
+                average_ratios: torch.Tensor = pre_obs.average_percentile_ratio
+                channel_batch_cnts: torch.Tensor = pre_obs.constant_channels
+                total_batches: int = pre_obs.num_batches_tracked
+
+                # also get the max values
+                max_vals: torch.Tensor = pre_obs.max_val
+
+                # we have to specifically modify how we are recording negative ratio for pre-relu layers
+                for index, ratio_val in enumerate(average_ratios):
+                    # check if we have a negative ratio
+                    # a ratio might be negative if we have a situation where the 100th percentile is
+                    # > 0 while the nth percentile is < 0, in which case this would not be detected
+                    # as an outlier. Since we care more about magnitude, we make it positive.
+                    if ratio_val.item() < 0:
+                        # first make it positive
+                        ratio_val = -ratio_val
+                        average_ratios[index] = ratio_val
+
+                    if ratio_val.item() < 1:
+                        # if it's less than 1 we have to flip it as well
+                        average_ratios[index] = 1 / ratio_val
+
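+                # e.g. the intent of the two checks above (hypothetical value): an average
+                # ratio of -0.25 should end up as 1 / 0.25 = 4.0 before the outlier
+                # comparison, since only its magnitude relative to 1 matters.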
+                outlier_calcs = self._calculate_outlier_info(average_ratios, num_batches, total_batches)
+
+                # calculate whether ratios were outliers
+                info_dict[fqn] = {
+                    self.CHANNEL_AXIS_KEY: self.ch_axis,
+                    self.REF_PERCENTILE_KEY: self.reference_percentile,
+                    self.RATIO_THRES_KEY: self.ratio_threshold,
+                    self.COMP_METRIC_KEY: average_ratios,
+                    self.NUM_BATCHES_KEY: num_batches,
+                    self.OUTLIER_KEY: outlier_calcs[self.OUTLIER_KEY],
+                    self.IS_SUFFICIENT_BATCHES_KEY: outlier_calcs[self.IS_SUFFICIENT_BATCHES_KEY],
+                    self.CONSTANT_COUNTS_KEY: channel_batch_cnts,
+                    self.MAX_VALS_KEY: max_vals
+                }
+
+        return info_dict
+
+    def generate_detector_report(self, model: GraphModule) -> Tuple[str, Dict[str, Any]]:
+        r"""
+        Determines whether there are significant outliers in the activation data around the modules of interest.
+
+        Takes advantage of the ModelReport Observer which records the relevant percentile information
+
+        Args:
+            model (GraphModule): The prepared and calibrated GraphModule with inserted ModelReportObservers
+
+        Returns a tuple with two elements:
+            String report of whether there are outliers in the activations around certain modules
+            Dictionary mapping modules of interest to:
+                whether there were outliers found in activation before
+                the number of batches used for each channel
+                whether fraction of applicable batches used is above fraction_batches_used_threshold
+                their p_r metric compared to the threshold
+                the threshold used to make the recommendation
+                the reference_percentile used to make the recommendation
+                the channel axis used to determine individual channels
+                the constant batch counts per channel
+                the per channel max values
+        """
+        # generate the information dictionary of outlier information
+        info_dict = self._generate_info_dict(model)
+
+        # now we can generate report based on this information
+        outlier_string = "Outlier detection report: \n"
+
+        # added module check
+        added_module: bool = False
+
+        # some strings to be formatted depending on module we are adding
+        module_suggestion_str = "For Module {} looked at with axis {}: \n"
+        channel_suggestion_str = "\tFor channel {}, we found outliers in the preceding activation data with {}.\n"
+        channel_max_value_str = "a max value across all batches of {}"
+        note_string = "Note: outlier detection is only reliable for {}. We recommend {} to ensure the most accurate results."
+        note_distribution = "stationary distributions"
+        note_rec = "running the static vs. dynamic detector to ensure activation data before modules above is stationary"
+
+        # suggestion for constant batch check since that can make it no outliers
+        constant_str = "\tFor channel {}, we found {} constant value batches. {}\n"
+        constant_suggestion = "We recommend taking a look at the dict and data to see how frequent this occurred and why."
+
+        # compile the suggestion string
+        for module_fqn in info_dict:
+            # get module specific info
+            mod_info: Dict[str, Any] = info_dict[module_fqn]
+            # check to see if we already added high level model desc
+            added_model_desc = False
+            # look at each individual channel and add a suggestion
+            for index, outlier_detected in enumerate(mod_info[self.OUTLIER_KEY]):
+                if outlier_detected:
+                    # we found at least 1 outlier
+                    if not added_model_desc:
+                        # add the module level description
+                        outlier_string += module_suggestion_str.format(module_fqn, self.ch_axis)
+                        added_model_desc = True
+
+                    # we mark that we found at least one outlier
+                    added_module = True
+                    max_value_found_str = channel_max_value_str.format(mod_info[self.MAX_VALS_KEY][index])
+                    channel_str = channel_suggestion_str.format(index, max_value_found_str)
+                    outlier_string += channel_str
+
+                # also check if we found constant batch
+                if mod_info[self.CONSTANT_COUNTS_KEY][index] != 0:
+                    # make sure we add a module level highlight.
+                    if not added_model_desc:
+                        # add the module level description
+                        outlier_string += module_suggestion_str.format(module_fqn, self.ch_axis)
+                        added_model_desc = True
+
+                    constant_values_for_channel = mod_info[self.CONSTANT_COUNTS_KEY][index]
+                    formatted_str = constant_str.format(index, constant_values_for_channel, constant_suggestion)
+                    outlier_string += formatted_str
+                    # we also added at least one thing to description
+                    added_module = True
+
+
+        # if found outlier, give suggestion, else give default response
+        if added_module:
+            # compose the note string
+            note_composed = note_string.format(note_distribution, note_rec)
+            outlier_string += note_composed
+        else:
+            outlier_string += "There were no outliers found in the activations.\n"
+
+        return (outlier_string, info_dict)
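+    # Minimal usage sketch (assumes a prepared and calibrated GraphModule bound to a
+    # placeholder name `prepared_model`; the arguments shown are just the defaults):
+    #
+    #     detector = OutlierDetector(ratio_threshold=3.5, reference_percentile=0.975)
+    #     report_str, report_dict = detector.generate_detector_report(prepared_model)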
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/_model_report/model_report.py b/MLPY/Lib/site-packages/torch/ao/quantization/fx/_model_report/model_report.py
new file mode 100644
index 0000000000000000000000000000000000000000..934008931291cd5820b9b2e83c65f31add014f9b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/fx/_model_report/model_report.py
@@ -0,0 +1,606 @@
+from typing import Any, Dict, Set, Tuple, Callable
+from collections import OrderedDict
+import torch
+from torch.ao.quantization.fx._model_report.detector import (
+    DetectorBase,
+    DETECTOR_OBS_ARGS_KEY,
+    DETECTOR_OBS_TO_INSERT_KEY,
+    DETECTOR_IS_POST_OBS_KEY,
+    DETECTOR_TARGET_NODE_KEY,
+    DetectorQConfigInfo
+)
+from torch.ao.quantization.fx._model_report.model_report_visualizer import ModelReportVisualizer
+from torch.ao.quantization.fx.graph_module import GraphModule
+from torch.ao.quantization.observer import ObserverBase
+from torch.ao.quantization.qconfig_mapping import QConfigMapping, QConfig
+from torch.ao.quantization.fx._equalize import EqualizationQConfig
+
+class ModelReport:
+    r"""
+    The ModelReport class aims to provide users an easy way to diagnose issues that they run into
+    with their models. The class works with all traceable GraphModules to help diagnose issues,
+    though the requirements on the type of model more-so depends on the specific report the user
+    is trying to generate. With respect to the reports, the ModelReport class is initialized with
+    a set of Detector classes, each of which generates reports on quantization configuration
+    issues a user might have.
+
+    Currently supports generating reports on:
+    - Suggestions for per-channel vs. per-tensor quantization (nn.Module)
+    - Suggestions for dynamic vs static quantization for linear layers (Graph Modules)
+    - Suggestions for input-weight equalization for linear and conv layers (Graph Modules)
+    - Suggestions for outlier detection for all layers (Graph Modules)
+
+    The ModelReport class has the primary functionality of inserting observers (primarily the ModelReportObserver)
+    where needed for each detector to gather the information it needs, and then after calibration, the ModelReport
+    class compiles the report generated by each Detector class into a single report to return to the user. It also
+    has the capability to remove all the observers it inserted as well.
+
+    * :attr:`_model` The model we wish to generate the report for. Must be a traceable GraphModule
+
+    * :attr:`_desired_report_detectors` The set of Detectors representing desired reports from the ModelReport class
+        Make sure that these are all unique types of detectors [do not have more than 1 of the same class]
+
+    * :attr:`_desired_detector_names` The set of detector names of the _desired_report_detectors.
+        This set is generated by calling the get_detector_name() of each detector
+
+    * :attr:`_detector_name_to_observer_fqns` The mapping from each detector to fqns of observers of interest
+        The purpose of this is to keep track of what observers were inserted for each detector, so that they
+        can be removed at the end if desired
+
+    * :attr:`_prepared_flag` A boolean flag that keeps track of whether we have prepared the model or not
+        This is to ensure we only insert observers once with the ModelReport instance
+
+    * :attr:`_removed_observers` A boolean to track if we have removed observers already
+        The purpose is to ensure we don't attempt to remove observers twice with the same ModelReport
+        instance. This also allows the functionality where we can generate the report multiple times
+        as long as we haven't removed the observers yet.
+
+    Note:
+        This class was initially designed to work with the Fx Graph Mode workflow in mind. However,
+        full functionality is available as long as there is a traceable GraphModule that is being used.
+        One method to get a traceable GraphModule without going through the Fx workflow is to use
+        the QuantizationTracer class.
+
+    General Flow for Fx workflow:
+    1.) Initialize ModelReport object with reports of interest by passing in initialized detector objects and model
+    2.) Prepare your model with prepare_fx
+    3.) Call model_report.prepare_detailed_calibration to add relevant observers
+    4.) Calibrate your model with data
+    5.) Call model_report.generate_model_report on your model to generate the report and optionally remove added observers
+    Optional
+        6.) Call model_report.generate_visualizer to get a ModelReportVisualizer instance
+        7.) To help in parsing report information and debugging, view report info as a:
+            - Table
+            - Histogram
+            - Line plot
+    8.) Call model_report.generate_qconfig_mapping (and optionally generate_equalization_mapping) to generate the qconfigs based on the report suggestions
+
+    Example (with QuantizationTracer):
+        >>> # xdoctest: +SKIP
+        >>> # get the necessary qconfig
+        >>> config = PrepareCustomConfig()
+        >>> skipped_module_names, skipped_module_classes = get_skipped_module_name_and_classes(config, False)
+
+        >>> # initialize our model and get GraphModule
+        >>> model = SomeModel()
+        >>> tracer = QuantizationTracer(skipped_module_names, skipped_module_classes)
+        >>> graph_module = GraphModule(model, tracer.trace(model))
+
+        >>> # get our set of detectors and ModelReport instance
+        >>> detector_set = set([DynamicStaticDetector(tolerance=0.5), InputWeightEqualizationDetector(ratio_threshold=0.7)])
+        >>> tracer_reporter = ModelReport(graph_module, detector_set)
+
+        >>> # now we insert the observers and calibrate the model
+        >>> tracer_model_with_observers = tracer_reporter.prepare_detailed_calibration()
+        >>> for i in range(num_calibration_batches):
+        >>>     example_input = get_calibration_input()
+        >>>     tracer_model_with_observers(example_input)
+
+        >>> # finally we generate the reports and optionally remove the observers we inserted
+        >>> reports = tracer_reporter.generate_model_report(remove_inserted_observers=True)
+
+        >>> # Optional: we can generate the qconfig mapping based on the suggestions
+        >>> qconfig_mapping = tracer_reporter.generate_qconfig_mapping()
+
+        >>> # Optional: we can generate the equalization mapping based on the suggestions
+        >>> equalization_mapping = tracer_reporter.generate_equalization_mapping()
+
+        >>> # Optional: we get a ModelReportVisualizer instance to do any visualizations desired
+        >>> model_report_visualizer = tracer_reporter.generate_visualizer()
+
+    """
+
+    def __init__(self, model: GraphModule, desired_report_detectors: Set[DetectorBase]):
+
+        if len(desired_report_detectors) == 0:
+            raise ValueError("Should include at least 1 desired report")
+
+        # keep track of the model we wish to generate report for
+        self._model: GraphModule = model
+
+        # keep the reports private so they can't be modified
+        self._desired_report_detectors = desired_report_detectors
+        self._desired_detector_names = {detector.get_detector_name() for detector in desired_report_detectors}
+
+        # keep a mapping of desired reports to observers of interest
+        # this is used both to get the readings and to remove the observers: the per-detector sets can be
+        # combined into one large set that is used to traverse the graph and remove the added observers
+        self._detector_name_to_observer_fqns: Dict[str, Set[str]] = {}
+
+        # initialize each report to have empty set of observers of interest
+        for desired_report in self._desired_detector_names:
+            self._detector_name_to_observer_fqns[desired_report] = set()
+
+        # flags to ensure that we can only prepare and remove observers once
+        self._prepared_flag = False
+        self._removed_observers = False
+
+        # store the reports that we generated for visualization purposes
+        # initially empty since no reports generated
+        self._generated_reports: Dict[str, Dict] = {}
+
+    def get_desired_reports_names(self) -> Set[str]:
+        """ Returns a copy of the desired reports for viewing """
+        return self._desired_detector_names.copy()
+
+    def get_observers_of_interest(self) -> Dict[str, Set[str]]:
+        """ Returns a copy of the observers of interest for viewing """
+        return self._detector_name_to_observer_fqns.copy()
+
+    def prepare_detailed_calibration(self) -> GraphModule:
+        r"""
+        Takes in a graph model and inserts the following observers:
+        - ModelReportObserver
+
+        Each observer is inserted based on the desired_reports into the relevant locations
+
+        Right now, each detector in self._desired_detector_names has its observers inserted independently
+            However, if a module already has an Observer of the same type, the insertion will not occur
+            This is because all Observers of the same type collect the same information, so another one would be redundant
+
+        Returns the same GraphModule with the observers inserted
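+
+        Example Use (an illustrative sketch only; assumes a ModelReport instance named
+        model_report that was initialized with a traceable GraphModule and detectors):
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> model_with_observers = model_report.prepare_detailed_calibration()
+            >>> # model_with_observers is the same GraphModule, now containing the
+            >>> # ModelReportObserver submodules chosen by the detectors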
+        """
+
+        # if already prepared once, cannot prepare again
+        if self._prepared_flag:
+            raise ValueError("Already ran preparing detailed callibration. Run the report generation next after callibration.")
+
+        # loop through each detector, find where placements should be, and keep track
+        insert_observers_fqns: Dict[str, Any] = {}
+
+        for detector in self._desired_report_detectors:
+            # determine observer points for each detector
+            obs_fqn_to_info = detector.determine_observer_insert_points(self._model)
+            # map each insert point to the observer to use
+            insert_observers_fqns.update(obs_fqn_to_info)
+            # update the set of observers this report cares about
+            self._detector_name_to_observer_fqns[detector.get_detector_name()] = set(obs_fqn_to_info.keys())
+
+        # now insert all the observers at their desired locations
+        for observer_fqn in insert_observers_fqns:
+            target_node = insert_observers_fqns[observer_fqn][DETECTOR_TARGET_NODE_KEY]
+            insert_obs = insert_observers_fqns[observer_fqn][DETECTOR_OBS_TO_INSERT_KEY]
+            insert_post = insert_observers_fqns[observer_fqn][DETECTOR_IS_POST_OBS_KEY]
+            observer_args = insert_observers_fqns[observer_fqn][DETECTOR_OBS_ARGS_KEY]
+            self._insert_observer_around_module(
+                observer_fqn, target_node, insert_obs, observer_args, insert_post
+            )
+
+        self._prepared_flag = True
+
+        return self._model
+
+    def _insert_observer_around_module(
+        self,
+        obs_fqn: str,
+        target_node: torch.fx.node.Node,
+        obs_to_insert: ObserverBase,
+        observer_args: Tuple,
+        insert_post: bool
+    ):
+        r"""
+        Helper function that inserts the observer into both the graph structure and the module of the model
+
+        Args
+            obs_fqn (str): The fully qualified name of the observer we want to insert
+            target_node (torch.fx.node.Node): The node in model we are inserting observers around
+            obs_to_insert (ObserverBase): The observer we are inserting around target_node
+            observer_args (Tuple): The arguments we want to pass into the observer
+            insert_post (bool): whether this is meant to be a post observer for this node
+        """
+        # if we are inserting post, then our target node is the next node
+        if insert_post:
+            target_node = target_node.next
+
+        with self._model.graph.inserting_before(target_node):
+            self._model.add_submodule(obs_fqn, obs_to_insert)
+            self._model.graph.create_node(op="call_module", target=obs_fqn, args=observer_args)
+
+        # recompile model after inserts are made
+        self._model.recompile()
+
+    def _get_node_from_fqn(self, node_fqn: str) -> torch.fx.node.Node:
+        r"""
+        Takes in a node fqn and returns the node based on the fqn
+
+        Args
+            node_fqn (str): The fully qualified name of the node we want to find in model
+
+        Returns the Node object for the given node_fqn; raises a ValueError if no matching node is found
+        """
+        node_to_return = None
+        for node in self._model.graph.nodes:
+            # if the target matches the fqn, it's the node we are looking for
+            if node.target == node_fqn:
+                node_to_return = node
+                break
+
+        if node_to_return is None:
+            raise ValueError("The node_fqn is was not found within the module.")
+
+        # assert for MyPy
+        assert isinstance(node_to_return, torch.fx.node.Node)
+
+        return node_to_return
+
+    def generate_model_report(
+        self, remove_inserted_observers: bool
+    ) -> Dict[str, Tuple[str, Dict]]:
+        r"""
+        Generates all the requested reports.
+
+        Note:
+            You should have calibrated the model with relevant data before calling this
+
+        The reports generated are determined by the detectors the ModelReport instance was initialized with
+
+        Can optionally remove all the observers inserted by the ModelReport instance
+
+        Args:
+            remove_inserted_observers (bool): True to remove the observers inserted by this ModelReport instance
+
+        Returns a mapping of each desired report name to a tuple with:
+            The textual summary of that report information
+            A dictionary containing relevant statistics or information for that report
+
+        Note:
+            Throws exception if we try to generate a report on a model we already removed observers from
+            Throws exception if we try to generate a report without preparing for calibration
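+
+        Example Use (an illustrative sketch only; assumes a prepared and calibrated
+        ModelReport instance named model_report):
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> reports = model_report.generate_model_report(remove_inserted_observers=True)
+            >>> for report_name, (report_str, report_dict) in reports.items():
+            ...     print(report_name)
+            ...     print(report_str)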
+        """
+        # if we haven't prepped the model for calibration, then we shouldn't generate the report yet
+        if not self._prepared_flag:
+            raise Exception("Cannot generate report without preparing model for calibration")
+
+        # if we already removed the observers, we cannot generate report
+        if self._removed_observers:
+            raise Exception("Cannot generate report on model you already removed observers from")
+
+        # keep track of all the reports of interest and their outputs
+        reports_of_interest = {}
+
+        for detector in self._desired_report_detectors:
+            # generate the individual report for the detector
+            report_output = detector.generate_detector_report(self._model)
+            reports_of_interest[detector.get_detector_name()] = report_output
+
+        # if user wishes to remove inserted observers, go ahead and remove
+        if remove_inserted_observers:
+            self._removed_observers = True
+            # get the set of all Observers inserted by this instance of ModelReport
+            all_observers_of_interest: Set[str] = set()
+            for desired_report in self._detector_name_to_observer_fqns:
+                observers_of_interest = self._detector_name_to_observer_fqns[desired_report]
+                all_observers_of_interest.update(observers_of_interest)
+
+            # go through all_observers_of_interest and remove them from the graph and model
+            for observer_fqn in all_observers_of_interest:
+                # remove the observer from the model
+                self._model.delete_submodule(observer_fqn)
+
+                # remove the observer from the graph structure
+                node_obj = self._get_node_from_fqn(observer_fqn)
+
+                if node_obj:
+                    self._model.graph.erase_node(node_obj)
+                else:
+                    raise ValueError("Node no longer exists in GraphModule structure")
+
+            # remember to recompile the model
+            self._model.recompile()
+
+        # save the generated reports for visualization purposes
+        saved_reports: Dict[str, Dict] = {
+            report_name : report_tuple[1] for report_name, report_tuple in reports_of_interest.items()
+        }
+
+        self._generated_reports = saved_reports
+
+        # return the reports of interest
+        return reports_of_interest
+
+    def _is_same_info_for_same_key(self, info_dict_a: Dict, info_dict_b: Dict) -> bool:
+        r"""
+        Takes in two dictionaries and ensures that any common keys between the two have the same
+        values.
+
+        Args:
+            info_dict_a (Dict): First dictionary we wish to compare
+            info_dict_b (Dict): Second dictionary we wish to compare
+
+        Returns True if all shared keys have the same values, False otherwise
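+
+        Example (an illustrative sketch only):
+            >>> # xdoctest: +SKIP("illustrative only")
+            >>> self._is_same_info_for_same_key({"a": 1, "b": 2}, {"a": 1, "c": 3})
+            True
+            >>> self._is_same_info_for_same_key({"a": 1}, {"a": 2})
+            False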
+        """
+        # get the set of keys for both
+        dict_a_keys: Set = set(info_dict_a.keys())
+        dict_b_keys: Set = set(info_dict_b.keys())
+
+        # get the intersection of the keys and check if the values match for both dicts
+        intersecting_keys: Set = dict_a_keys.intersection(dict_b_keys)
+
+        for key in intersecting_keys:
+            dict_a_val = info_dict_a[key]
+            dict_b_val = info_dict_b[key]
+
+            # if it's a tensor we have to handle separately
+            if type(dict_a_val) == torch.Tensor:
+                # if dict_b_val not tensor, automatically false
+                if type(dict_b_val) != torch.Tensor or sum(dict_a_val != dict_b_val) != 0:
+                    return False
+            else:
+                # for non-tensor vals
+                if dict_a_val != dict_b_val:
+                    return False
+
+        # if no mismatched shared keys were found, return True
+        return True
+
+    def _reformat_reports_for_visualizer(self) -> OrderedDict:
+        r"""
+        Takes the generated reports and reformats them into the format that is desired by the
+        ModelReportVisualizer
+
+        Returns an OrderedDict mapping module_fqns to their features
+        """
+        # we want to reorder and reformat the information so it follows the order in which
+        # modules appear in the model
+
+        # first create new dict with all modules as keys and features under respective module
+        module_fqns_to_features: Dict[str, Dict] = {}
+
+        for report_name in self._generated_reports:
+            # get mod -> feature dict and go through
+            module_info = self._generated_reports[report_name]
+
+            for module_fqn in module_info:
+                # check if already in our accumulation dict
+                if module_fqn in module_fqns_to_features:
+                    # we merge all the features together
+                    new_info: Dict = module_info[module_fqn]
+                    present_info: Dict = module_fqns_to_features[module_fqn]
+
+                    # merge them together into the new unioned dict
+                    # same feature keys -> same info, so it is okay to override
+
+                    # do safety check to make sure shared keys have same info
+                    if self._is_same_info_for_same_key(new_info, present_info):
+                        module_fqns_to_features[module_fqn] = {**new_info, **present_info}
+                    else:
+                        error_str = "You have the same key with different values across detectors. "
+                        error_str += "Someone incorrectly implemented a detector with conflicting keys to existing detectors."
+                        raise ValueError(error_str)
+                else:
+                    # we just set it
+                    module_fqns_to_features[module_fqn] = module_info[module_fqn]
+
+        # our ordered dict so that modules can be ordered in order of how they appear in model
+        features_by_module: OrderedDict[str, Dict] = OrderedDict()
+
+        # we loop through modules in graph in order
+        for fqn, module in self._model.named_modules():
+            # find that fqn in module_fqns_to_features
+            if fqn in module_fqns_to_features:
+                # add it to our ordered dict
+                features_by_module[fqn] = module_fqns_to_features[fqn]
+
+        # return the ordered dict of info we created
+        return features_by_module
+
+    def generate_visualizer(self) -> ModelReportVisualizer:
+        r"""
+        Generates a ModelReportVisualizer instance using the reports generated
+        by the generate_model_report() method.
+
+        Returns the generated ModelReportVisualizer instance initialized
+
+        Note:
+            Throws exception if we attempt to get a visualizer without generating a report first
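+
+        Example Use (an illustrative sketch only; assumes a ModelReport instance named
+        model_report on which generate_model_report() has already been called):
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> visualizer = model_report.generate_visualizer()
+            >>> visualizer.generate_table_visualization()  # print the collected stats as tables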
+        """
+        # check if user has generated reports at least once
+        if len(self._generated_reports) == 0:
+            raise Exception("Unable to generate visualizers without first generating reports")
+
+        # get the ordered dict mapping modules to their full set of collected features / stats
+        module_fqns_to_features: OrderedDict = self._reformat_reports_for_visualizer()
+
+        # create and return ModelReportVisualizer instance
+        visualizer: ModelReportVisualizer = ModelReportVisualizer(module_fqns_to_features)
+
+        return visualizer
+
+    def _generate_qconfig_mapping_helper(
+        self,
+        detector_qconfig_info_combined: Dict[str, DetectorQConfigInfo],
+        generation_function: Callable
+    ) -> QConfigMapping:
+        r"""
+        This helper takes in the compiled detector qconfig info that
+        has been compiled together and merges it into a QConfigMapping
+        """
+        # keep track of the qconfigmapping
+        qconfig_mapping = QConfigMapping()
+
+        # loop through each module / fqn and attempt to create QConfigMapping
+        for fqn, module in self._model.named_modules():
+            # if we have a qconfig info for this module
+            if fqn in detector_qconfig_info_combined:
+                qconfig_info_compiled = detector_qconfig_info_combined[fqn]
+
+                # now generate the qconfig and add it to the mapping
+                generated_qconfig = generation_function(qconfig_info_compiled, module)
+
+                # add to our config
+                qconfig_mapping.set_module_name(fqn, generated_qconfig)
+
+        # return compiled mapping
+        return qconfig_mapping
+
+    def _update_detector_quantization_qconfig_info(self, combined_info: DetectorQConfigInfo, new_info: DetectorQConfigInfo):
+        r"""
+        Takes in the old and new information and updates the combined information.
+
+        Args:
+            combined_info (DetectorQConfigInfo): The DetectorQConfigInfo we are compiling all of the information in
+            new_info (DetectorQConfigInfo): The DetectorQConfigInfo containing the new information we are trying to merge
+                into combined_info
+        """
+        combined_info.is_activation_dynamic = combined_info.is_activation_dynamic or new_info.is_activation_dynamic
+        combined_info.is_weight_per_channel = combined_info.is_weight_per_channel or new_info.is_weight_per_channel
+
+    def _update_detector_equalization_qconfig_info(self, combined_info: DetectorQConfigInfo, new_info: DetectorQConfigInfo):
+        r"""
+        Takes in the old and new information and updates the combined information.
+
+        Args:
+            combined_info (DetectorQConfigInfo): The DetectorQConfigInfo we are compiling all of the information in
+            new_info (DetectorQConfigInfo): The DetectorQConfigInfo containing the new information we are trying to merge
+                into combined_info
+        """
+        is_equalization_recommended = combined_info.is_equalization_recommended or new_info.is_equalization_recommended
+        combined_info.is_equalization_recommended = is_equalization_recommended
+
+    def _generate_module_fqn_to_detector_info_mapping(
+        self,
+        update_qconfig_info_function: Callable
+    ) -> Dict[str, DetectorQConfigInfo]:
+        r"""
+        Generates a mapping from module fqns to DetectorQConfigInfo objects based on
+        the suggestions of the ModelReport API. The generated mapping combines the
+        different types of feedback from the different detectors into one place.
+
+        These configs are based on the suggestions provided by the ModelReport API
+        and can only be generated once the reports have been generated.
+
+        Args:
+            update_qconfig_info_function (Callable): A function that takes in two DetectorQConfigInfo objects
+                and updates the one that is being compiled
+
+        Returns a Dict mapping module_fqns to DetectorQConfigInfo objects
+
+        Note:
+            Throws exception if we try to generate a mapping on a model we already removed observers from
+            Throws exception if we try to generate a mapping without preparing for calibration
+        """
+        # if we haven't prepped the model for calibration, then we shouldn't generate the mapping yet
+        if not self._prepared_flag:
+            raise Exception("Cannot generate mapping without preparing model for calibration")
+
+        # if we already removed the observers, we cannot generate the mapping
+        if self._removed_observers:
+            raise Exception("Cannot generate mapping on model you already removed observers from")
+
+        # keep track of qconfig info for each module across detectors
+        detector_qconfig_info_combined: Dict[str, DetectorQConfigInfo] = {}
+
+        for detector in self._desired_report_detectors:
+            # get the info from the detector
+            detector_info: Dict[str, DetectorQConfigInfo] = detector.get_qconfig_info(self._model)
+
+            # we go through the modules
+            for module_fqn in detector_info:
+                # see if we already have info on it
+                if module_fqn in detector_qconfig_info_combined:
+                    # we combine the current options with what is there
+                    current_options = detector_qconfig_info_combined[module_fqn]
+                    detector_options = detector_info[module_fqn]
+
+                    update_qconfig_info_function(current_options, detector_options)
+                else:
+                    # we just use this for now
+                    detector_qconfig_info_combined[module_fqn] = detector_info[module_fqn]
+
+        return detector_qconfig_info_combined
+
+    def generate_qconfig_mapping(self) -> QConfigMapping:
+        r"""
+        Generates a QConfigMapping based on the suggestions of the
+        ModelReport API. The generated mapping encompasses all the
+        different types of feedback from the different detectors
+        all into one place.
+
+        These configs are based on the suggestions provided by the ModelReport API
+        and can only be generated once the reports have been generated.
+
+        Returns a QConfigMapping for the quantization configuration
+
+        Note:
+            Throws exception if we try to generate a mapping on a model we already removed observers from
+            Throws exception if we try to generate a mapping without preparing for calibration
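+
+        Example Use (an illustrative sketch only; assumes a prepared and calibrated
+        ModelReport instance named model_report):
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> qconfig_mapping = model_report.generate_qconfig_mapping()
+            >>> # the mapping can then be used, e.g., as the qconfig_mapping argument of prepare_fx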
+        """
+        # get the mapping info
+        detector_qconfig_info_combined = self._generate_module_fqn_to_detector_info_mapping(
+            self._update_detector_quantization_qconfig_info
+        )
+
+        # we will do a bit of processing and remove fqns that don't have input weight recommended
+
+        # now we generate the QConfig for each of the options
+        mapping: QConfigMapping = self._generate_qconfig_mapping_helper(
+            detector_qconfig_info_combined,
+            self._quantization_config_generator
+        )
+
+        # return the generated mapping
+        return mapping
+
+    def _quantization_config_generator(self, detector_qconfig_info: DetectorQConfigInfo, module: torch.nn.Module) -> QConfig:
+        r"""
+        Returns the quantization configuration generated by the DetectorQConfigInfo object
+        """
+        return detector_qconfig_info.generate_quantization_qconfig(module)
+
+    def _equalization_config_generator(
+        self,
+        detector_qconfig_info: DetectorQConfigInfo,
+        module: torch.nn.Module
+    ) -> EqualizationQConfig:
+        r"""
+        We ignore the module argument here, and only focus on the detector_qconfig_info
+
+        Returns the equalization configuration generated by the DetectorQConfigInfo object
+        """
+        return detector_qconfig_info.generate_equalization_qconfig()
+
+    def generate_equalization_mapping(self) -> QConfigMapping:
+        r"""
+        Generates a QConfigMapping based on the suggestions of the
+        ModelReport API for equalization. The generated mapping encompasses all the
+        different types of feedback from the input-weight equalization detector.
+
+        These configs are based on the suggestions provided by the ModelReport API
+        and can only be generated once the reports have been generated.
+
+        Returns a QConfigMapping for the equalization configuration
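+
+        Example Use (an illustrative sketch only; assumes a prepared and calibrated
+        ModelReport instance named model_report):
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> equalization_mapping = model_report.generate_equalization_mapping()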
+        """
+        # get the mapping info
+        detector_qconfig_info_combined = self._generate_module_fqn_to_detector_info_mapping(
+            self._update_detector_equalization_qconfig_info
+        )
+
+        # now we generate the QConfig for each of the options
+        mapping: QConfigMapping = self._generate_qconfig_mapping_helper(
+            detector_qconfig_info_combined,
+            self._equalization_config_generator
+        )
+
+        # return the generated mapping
+        return mapping
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/_model_report/model_report_observer.py b/MLPY/Lib/site-packages/torch/ao/quantization/fx/_model_report/model_report_observer.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbea160cc43229e21e713c69009bd8925ef69867
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/fx/_model_report/model_report_observer.py
@@ -0,0 +1,265 @@
+import torch
+from torch.ao.quantization.observer import ObserverBase
+
+
+class ModelReportObserver(ObserverBase):
+    r"""This observer is used to record additional information regarding keeping track
+    of S = average_batch_activation_range/epoch_activation_range.
+
+    The purpose of this information is to prepare a report to present to users on whether
+    Dynamic or Static Quantization is more appropriate for their model given the general
+    distributions of their data.
+
+    Args:
+        ch_axis (int, optional): The channel axis for which the range and outlier stats are computed
+            Default: 1
+        comp_percentile (float, optional): The percentile to compare against 100 percentile to find outliers
+            Should be between 0 and 1 exclusive
+            Default: 0.9
+
+    * :attr:`num_batches_tracked` specifies number of batches passed through the observer
+
+    * :attr:`average_batch_activation_range` defines average across the ranges of each batch passed through
+
+    * :attr:`epoch_activation_min` defines the minimum value passed through the observer
+
+    * :attr:`epoch_activation_max` defines the maximum value passed through the observer
+
+    * :attr:`ch_axis` defines the channel being used to compute per channel min max stats
+
+    * :attr:`min_val` defines the per channel minimum values passed through
+
+    * :attr:`max_val` defines the per channel maximum values passed through
+
+    * :attr:`comp_percentile` defines comparison percentile to find outliers
+
+    * :attr:`average_percentile_ratio` defines the per channel average percentile ratios
+
+    * :attr:`percentile_batches_tracked` defines the number of percentile batches tracked for each channel
+
+    * :attr:`constant_channels` defines, per channel, the number of batches in which that channel was constant
+
+    Note: this tool is meant for FX Graph Mode Quantization
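+
+    Example (an illustrative sketch only; the input shapes here are arbitrary):
+        >>> # xdoctest: +SKIP
+        >>> obs = ModelReportObserver(ch_axis=1)
+        >>> for batch in [torch.randn(4, 3, 8), torch.randn(4, 3, 8)]:
+        ...     _ = obs(batch)
+        >>> ratio = obs.get_batch_to_epoch_ratio()  # S = average batch range / epoch range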
+    """
+
+    epoch_activation_min: torch.Tensor
+    epoch_activation_max: torch.Tensor
+    min_val: torch.Tensor
+    max_val: torch.Tensor
+    comp_percentile: torch.Tensor
+    average_percentile_ratio: torch.Tensor
+    percentile_batches_tracked: torch.Tensor
+    constant_channels: torch.Tensor
+
+    def __init__(self, ch_axis: int = 1, comp_percentile: float = 0.9):
+        super().__init__(torch.qint8)
+        self.num_batches_tracked = 0
+
+        # keep track of the min and max of the range for the average batch and the epoch as a whole
+        self.average_batch_activation_range: torch.Tensor = torch.tensor(float(0))
+        self.register_buffer("epoch_activation_min", torch.tensor(float("inf")))
+        self.register_buffer("epoch_activation_max", torch.tensor(float("-inf")))
+
+        # keep track of per channel min max information using the given channel
+        self.ch_axis: int = ch_axis
+        self.register_buffer("min_val", torch.tensor([]))
+        self.register_buffer("max_val", torch.tensor([]))
+
+        # keep track of percentile ratio information per channel
+        self.register_buffer("comp_percentile", torch.tensor([comp_percentile]))
+        self.register_buffer("average_percentile_ratio", torch.tensor([]))
+        self.register_buffer("percentile_batches_tracked", torch.tensor([]))
+        self.register_buffer("constant_channels", torch.tensor([]))
+
+    def forward(self, x):
+        x_copy = x.detach()  # avoid keeping autograd tape
+        x_copy = x_copy.to(self.epoch_activation_min.dtype)
+
+        x_copy = self._calculate_range_stats(x_copy)
+        x_copy = self._calculate_min_max_stats(x_copy)
+        x_copy = self._calculate_percentile_stats(x_copy)
+
+        # return the passed-in value
+        return x
+
+    def _calculate_range_stats(self, x_copy):
+        r"""Calculates and stores range stats with forward values.
+
+        Args
+            x_copy: A copy of the forward data
+
+        Returns the passed in x_copy
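+
+        Illustrative arithmetic (not from the source): if the running average range is 2.0
+        over 3 tracked batches and the new batch range is 4.0, the updated average is
+        (2.0 * 3 + 4.0) / 4 = 2.5.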
+        """
+        # get the min, max values of the data
+        min_val_cur, max_val_cur = torch.aminmax(x_copy)
+
+        # calculate new epoch range values
+        epoch_min_val = torch.min(self.epoch_activation_min, min_val_cur)
+        epoch_max_val = torch.max(self.epoch_activation_max, max_val_cur)
+
+        self.epoch_activation_min.copy_(epoch_min_val)
+        self.epoch_activation_max.copy_(epoch_max_val)
+
+        # calculate the average batch activation range
+        current_batch_range = max_val_cur - min_val_cur
+        new_range = (
+            self.average_batch_activation_range * self.num_batches_tracked
+            + current_batch_range
+        ) / (self.num_batches_tracked + 1)
+
+        self.average_batch_activation_range = new_range
+        self.num_batches_tracked += 1  # new batch was processed
+
+        return x_copy
+
+    def _calculate_min_max_stats(self, x_copy):
+        r"""Calculates and stores the per_channel min, max stats with forward values.
+        Does calculation based on channel axis: self.ch_axis
+
+        Args
+            x_copy: A copy of the forward data
+
+        Returns the passed in x_copy
+        """
+        # get the current min and max vals
+        min_val = self.min_val
+        max_val = self.max_val
+        x_dim = x_copy.size()
+
+        new_axis_list = [i for i in range(len(x_dim))]  # noqa: C416
+        new_axis_list[self.ch_axis] = 0
+        new_axis_list[0] = self.ch_axis
+        y = x_copy.permute(new_axis_list)
+        # Need to match dtype of min/max because the updates to buffers
+        # are done in place and types need to match for comparisons
+        y = y.to(self.min_val.dtype)
+        y = torch.flatten(y, start_dim=1)
+        if min_val.numel() == 0 or max_val.numel() == 0:
+            min_val, max_val = torch.aminmax(y, dim=1)
+        else:
+            min_val_cur, max_val_cur = torch.aminmax(y, dim=1)
+            min_val = torch.min(min_val_cur, min_val)
+            max_val = torch.max(max_val_cur, max_val)
+
+        self.min_val.resize_(min_val.shape)
+        self.max_val.resize_(max_val.shape)
+        self.min_val.copy_(min_val)
+        self.max_val.copy_(max_val)
+
+        return x_copy
+
+    def _calculate_percentile_stats(self, x_copy):
+        r"""Calculates and stores the per_channel percentile stats with forward values.
+        Does calculation based on channel axis: self.ch_axis
+
+        Args
+            x_copy: A copy of the forward data
+
+        Returns the passed in x_copy
+        """
+        # get the dimension of the copy
+        x_dim = x_copy.size()
+
+        new_axis_list = [i for i in range(len(x_dim))]  # noqa: C416
+        new_axis_list[self.ch_axis] = 0
+        new_axis_list[0] = self.ch_axis
+        y = x_copy.permute(new_axis_list)
+        # Need to match dtype of min/max because the updates to buffers
+        # are done in place and types need to match for comparisons
+        y = y.to(self.min_val.dtype)
+        y = torch.flatten(y, start_dim=1)
+        y = y.to(dtype=self.min_val.dtype, device="cpu")
+
+        # find the percentile values along the axis
+        # we want both the 100th percentile and comp_percentile
+        # we also want to find the 0th quantile to see if we have a constant channel
+        quantiles_list = [0, self.comp_percentile, 1.00]
+        quantiles_to_find = torch.tensor(quantiles_list, dtype=self.min_val.dtype)
+
+        # find the quantiles
+        desired_quantiles = torch.quantile(y, quantiles_to_find, dim=self.ch_axis, interpolation="lower")
+        zero_quantile = desired_quantiles[0]
+        comp_quantile = desired_quantiles[1]
+        hundreth_quartile = desired_quantiles[2]
+
+        # if any of the channels have 0s, we ignore that channel for this calculation
+        any_non_zero_quantile_value: torch.Tensor = (comp_quantile != torch.tensor([0])) | (hundreth_quartile != torch.tensor([0]))
+        any_non_zero_quantile_value = any_non_zero_quantile_value.int()  # transform boolean values to int values
+
+        # we also check if we have a constant channel
+        any_constant_channels: torch.Tensor = (hundreth_quartile - zero_quantile) == torch.tensor([0])
+        any_constant_channels = any_constant_channels.int()  # transform boolean values to int values
+
+        # possibilities to get nan as an answer
+        #   will ignore any of these three cases with 0s and just not deal with them for now
+        # case (1) 0 in numerator: issue if 0 is largest, all negative, and rest are really negative
+        # case (2) 0 in denominator: is possible unless case 3, we just ignore
+        # case (3) 0 in both: not outlier, channel just kinda useless, ignore
+
+        # get the ratio and get rid of nan values
+        quantile_ratios = hundreth_quartile / comp_quantile
+        quantile_ratios = torch.nan_to_num(quantile_ratios)
+        # update averages, remembering to only update if we didn't have zeros
+        ratio_if_not_zero = any_non_zero_quantile_value * quantile_ratios
+
+        # if num_batches and average_ratio are not initialized, we want to initialize them
+        if self.percentile_batches_tracked.shape[0] == 0 or self.average_percentile_ratio.shape[0] == 0:
+            self.percentile_batches_tracked = torch.zeros_like(any_non_zero_quantile_value)
+            self.average_percentile_ratio = torch.zeros_like(ratio_if_not_zero)
+
+        # also initialize the constant channel var if that is not initialized separately
+        if self.constant_channels.shape[0] == 0:
+            self.constant_channels = torch.zeros_like(any_constant_channels)
+
+        # get current num batches and average ratio
+        num_batches = self.percentile_batches_tracked
+        average_ratio = self.average_percentile_ratio
+
+        # calculate new_number of batches, new_ratios, and get rid of nans because of 0 size batches
+        new_number_of_batches: torch.Tensor = num_batches + any_non_zero_quantile_value
+        new_ratios: torch.Tensor = ((average_ratio * num_batches) + ratio_if_not_zero) / new_number_of_batches
+        new_ratios = torch.nan_to_num(new_ratios)
+
+        # update the per-channel count of batches in which the channel was constant
+        new_constant_count: torch.Tensor = self.constant_channels + any_constant_channels
+
+        # update the values locally
+        self.percentile_batches_tracked.copy_(new_number_of_batches)
+        self.average_percentile_ratio.copy_(new_ratios)
+        self.constant_channels.copy_(new_constant_count)
+
+        return x_copy
+
+    @torch.jit.export
+    def get_batch_to_epoch_ratio(self):
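+        r"""Returns S = average_batch_activation_range / epoch_activation_range.
+
+        Raises a ValueError if the epoch range is 0, or if no data has been run through
+        the observer yet (the epoch range is still infinite).
+        """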
+        epoch_activation_range = self.epoch_activation_max - self.epoch_activation_min
+
+        if epoch_activation_range == torch.tensor(float(0)):
+            raise ValueError("Range for Epoch is 0")
+        elif epoch_activation_range == torch.tensor(float("inf")):
+            raise ValueError(
+                "No data has been run through observer or infinity value present"
+            )
+        else:
+            return self.average_batch_activation_range / epoch_activation_range
+
+    @torch.jit.export
+    def reset_batch_and_epoch_values(self):
+        # set all the values back to their original defaults for a new epoch
+        # keep device
+        device = self.max_val.device
+        self.num_batches_tracked = 0
+        self.average_batch_activation_range = torch.tensor(float(0), device=device)
+        self.epoch_activation_min = torch.tensor(float("inf"), device=device)
+        self.epoch_activation_max = torch.tensor(float("-inf"), device=device)
+        self.min_val = torch.tensor([], device=device)
+        self.max_val = torch.tensor([], device=device)
+        self.average_percentile_ratio = torch.tensor([], device=device)
+        self.percentile_batches_tracked = torch.tensor([], device=device)
+        self.constant_channels = torch.tensor([], device=device)
+
+    @torch.jit.export
+    def calculate_qparams(self):
+        raise Exception(
+            "calculate_qparams should not be called for ModelReportObserver"
+        )
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/_model_report/model_report_visualizer.py b/MLPY/Lib/site-packages/torch/ao/quantization/fx/_model_report/model_report_visualizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..e58463e59a979b24097f0e362a388ec441258048
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/fx/_model_report/model_report_visualizer.py
@@ -0,0 +1,666 @@
+import torch
+from typing import Any, Set, Dict, List, Tuple, OrderedDict
+from collections import OrderedDict as OrdDict
+
+# try to import tabulate
+got_tabulate = True
+try:
+    from tabulate import tabulate
+except ImportError:
+    got_tabulate = False
+
+
+# var to see if we could import matplotlib
+got_matplotlib = True
+try:
+    import matplotlib.pyplot as plt
+except ImportError:
+    got_matplotlib = False
+
+class ModelReportVisualizer:
+    r"""
+    The ModelReportVisualizer class aims to provide users a way to visualize some of the statistics
+    that were generated by the ModelReport API. However, at a higher level, the class aims to provide
+    some level of visualization of statistics to PyTorch users in order to make it easier to parse data and
+    diagnose any potential issues with data or a specific model. With respect to the visualizations,
+    the ModelReportVisualizer class currently supports several methods of visualizing data.
+
+    Supported Visualization Methods Include:
+    - Table format
+    - Plot format (line graph)
+    - Histogram format
+
+    For all of the existing visualization methods, there is the option to filter data based on:
+    - A module fqn prefix
+    - Feature [required for the plot and histogram]
+
+    * :attr:`generated_reports` The reports generated by the ModelReport class in the structure below
+        Ensure that features that are the same across different reports have the same name
+        Ensure that objects representing the same features are the same type / dimension (where applicable)
+
+    Note:
+        Currently, the ModelReportVisualizer class supports visualization of data generated by the
+        ModelReport class. However, this structure is extensible and should allow the visualization of
+        other information as long as the information is structured in the following general format:
+
+        Report Structure
+        -- module_fqn [module with attached detectors]
+            |
+            -- feature keys [not every detector extracts same information]
+                                    [same collected info has same keys, unless can be specific to detector]
+
+
+    The goal behind the class is that the generated visualizations can be used in conjunction with the generated
+    report to give people a better understanding of issues and what the fix might be. It also aims to provide
+    a good visualization platform, since the dictionary returned by ModelReport can be hard to parse through as
+    it grows in size.
+
+    General Use Flow Expected
+    1.) Initialize ModelReport object with reports of interest by passing in initialized detector objects
+    2.) Prepare your model with prepare_fx
+    3.) Call model_report.prepare_detailed_calibration on your model to add relevant observers
+    4.) Calibrate your model with data
+    5.) Call model_report.generate_model_report on your model to generate the report and optionally remove the added observers
+    6.) Call model_report.generate_visualizer to get a ModelReportVisualizer instance
+    7.) Use the instance to view different views of the data as desired, applying filters as needed
+    8.) Either see the super detailed information or just the actual printed or shown table / plot / histogram
+
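+    Example (an illustrative sketch only; assumes a ModelReport instance named model_report
+    whose reports have already been generated):
+        >>> # xdoctest: +SKIP("undefined variables")
+        >>> visualizer = model_report.generate_visualizer()
+        >>> visualizer.generate_table_visualization(module_fqn_filter="block1")
+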
+    """
+
+    # keys for table dict
+    TABLE_TENSOR_KEY = "tensor_level_info"
+    TABLE_CHANNEL_KEY = "channel_level_info"
+
+    # Constants for header vals
+    NUM_NON_FEATURE_TENSOR_HEADERS = 2
+    NUM_NON_FEATURE_CHANNEL_HEADERS = 3
+
+    # Constants for row index in header
+    CHANNEL_NUM_INDEX = 2
+
+    def __init__(self, generated_reports: OrderedDict[str, Any]):
+        r"""
+        Initializes the ModelReportVisualizer instance with the necessary reports.
+
+        Args:
+            generated_reports (Dict[str, Any]): The reports generated by the ModelReport class
+                can also be a dictionary generated in another manner, as long as the format is the same
+        """
+        self.generated_reports = generated_reports
+
+    def get_all_unique_module_fqns(self) -> Set[str]:
+        r"""
+        The purpose of this method is to provide a user the set of all module_fqns so that if
+        they wish to use some of the filtering capabilities of the ModelReportVisualizer class,
+        they don't need to manually parse the generated_reports dictionary to get this information.
+
+        Returns all the unique module fqns present in the reports the ModelReportVisualizer
+        instance was initialized with.
+        """
+        # returns the keys of the ordered dict
+        return set(self.generated_reports.keys())
+
+    def get_all_unique_feature_names(self, plottable_features_only: bool = True) -> Set[str]:
+        r"""
+        The purpose of this method is to provide a user the set of all feature names so that if
+        they wish to use the filtering capabilities of the generate_table_view(), or use either of
+        the generate_plot_view() or generate_histogram_view(), they don't need to manually parse
+        the generated_reports dictionary to get this information.
+
+        Args:
+            plottable_features_only (bool): True if the user is only looking for plottable features,
+                False otherwise
+                plottable features are those that are tensor values
+                Default: True (only return those feature names that are plottable)
+
+        Returns all the unique feature names present in the reports the ModelReportVisualizer
+        instance was initialized with (optionally restricted to plottable features).
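+
+        Example Use (an illustrative sketch only; assumes an initialized visualizer named
+        mod_report_visualizer, as in the other examples in this class):
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> plottable = mod_report_visualizer.get_all_unique_feature_names()
+            >>> all_features = mod_report_visualizer.get_all_unique_feature_names(plottable_features_only=False)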
+        """
+        unique_feature_names = set()
+        for module_fqn in self.generated_reports:
+            # get dict of the features
+            feature_dict: Dict[str, Any] = self.generated_reports[module_fqn]
+
+            # loop through features
+            for feature_name in feature_dict:
+                # if we need plottable, ensure type of val is tensor
+                if not plottable_features_only or type(feature_dict[feature_name]) == torch.Tensor:
+                    unique_feature_names.add(feature_name)
+
+        # return our compiled set of unique feature names
+        return unique_feature_names
+
+    def _get_filtered_data(self, feature_filter: str, module_fqn_filter: str) -> OrderedDict[str, Any]:
+        r"""
+        Filters the data and returns it in the same ordered dictionary format so the relevant views can be displayed.
+
+        Args:
+            feature_filter (str): The feature filter, if we want to filter the set of data to only include
+                a certain set of features that include feature_filter
+                If feature = "", then we do not filter based on any features
+            module_fqn_filter (str): The filter on prefix for the module fqn. All modules that have fqn with
+                this prefix will be included
+                If module_fqn_filter = "" we do not filter based on module fqn, and include all modules
+
+        First, the data is filtered based on module_fqn, and then filtered based on feature
+        Returns an OrderedDict (sorted in order of model) mapping:
+            module_fqns -> feature_names -> values
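+
+        Example (an illustrative sketch only):
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> filtered = self._get_filtered_data("per_channel_min", "block1")
+            >>> # only modules whose fqn contains "block1" remain, and within them only
+            >>> # features whose name contains "per_channel_min"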
+        """
+        # create return dict
+        filtered_dict: OrderedDict[str, Any] = OrdDict()
+
+        for module_fqn in self.generated_reports:
+            # first filter based on module
+            if module_fqn_filter == "" or module_fqn_filter in module_fqn:
+                # create entry for module and loop through features
+                filtered_dict[module_fqn] = {}
+                module_reports = self.generated_reports[module_fqn]
+                for feature_name in module_reports:
+                    # check if filtering on features and do so if desired
+                    if feature_filter == "" or feature_filter in feature_name:
+                        filtered_dict[module_fqn][feature_name] = module_reports[feature_name]
+
+        # we have populated the filtered dict, and must return it
+
+        return filtered_dict
+
+    def _generate_tensor_table(
+        self,
+        filtered_data: OrderedDict[str, Dict[str, Any]],
+        tensor_features: List[str]
+    ) -> Tuple[List, List]:
+        r"""
+        Takes in the filtered data and features list and generates the tensor headers and table
+
+        Currently meant to generate the headers and table for the tensor-level information.
+
+        Args:
+            filtered_data (OrderedDict[str, Dict[str, Any]]): An OrderedDict (sorted in order of model) mapping:
+                module_fqns -> feature_names -> values
+            tensor_features (List[str]): A list of the tensor level features
+
+        Returns a tuple with:
+            A list of the headers of the tensor table
+            A list of lists containing the table information row by row
+            Each row of the table corresponds to one module (the headers are returned separately)
+        """
+        # now we compose the tensor information table
+        tensor_table: List[List[Any]] = []
+        tensor_headers: List[str] = []
+
+        # append the table row to the table only if we have features
+        if len(tensor_features) > 0:
+            # now we add all the data
+            for index, module_fqn in enumerate(filtered_data):
+                # we make a new row for the tensor table
+                tensor_table_row = [index, module_fqn]
+                for feature in tensor_features:
+                    # we iterate in same order of added features
+
+                    if feature in filtered_data[module_fqn]:
+                        # add value if applicable to module
+                        feature_val = filtered_data[module_fqn][feature]
+                    else:
+                        # add that it is not applicable
+                        feature_val = "Not Applicable"
+
+                    # if it's a tensor we want to extract val
+                    if isinstance(feature_val, torch.Tensor):
+                        feature_val = feature_val.item()
+
+                    # we add to our list of values
+                    tensor_table_row.append(feature_val)
+
+                tensor_table.append(tensor_table_row)
+
+        # add the row of headers if we actually have something, otherwise leave it empty
+        if len(tensor_table) != 0:
+            tensor_headers = ["idx", "layer_fqn"] + tensor_features
+
+        return (tensor_headers, tensor_table)
+
+    def _generate_channels_table(
+        self,
+        filtered_data: OrderedDict[str, Any],
+        channel_features: List[str],
+        num_channels: int
+    ) -> Tuple[List, List]:
+        r"""
+        Takes in the filtered data and features list and generates the channels headers and table
+
+        Currently meant to generate the headers and table for the channel-level information.
+
+        Args:
+            filtered_data (OrderedDict[str, Any]): An OrderedDict (sorted in order of model) mapping:
+                module_fqns -> feature_names -> values
+            channel_features (List[str]): A list of the channel level features
+            num_channels (int): Number of channels in the channel data
+
+        Returns a tuple with:
+            A list of the headers of the channel table
+            A list of lists containing the table information row by row
+            Each row of the table corresponds to one channel of one module (the headers are returned separately)
+        """
+        # now we compose the table for the channel information table
+        channel_table: List[List[Any]] = []
+        channel_headers: List[str] = []
+
+        # counter to keep track of the number of entries in the table
+        channel_table_entry_counter: int = 0
+
+        if len(channel_features) > 0:
+            # now we add all channel data
+            for module_fqn in filtered_data:
+                # we iterate over all channels
+                for channel in range(num_channels):
+                    # we make a new row for the channel
+                    new_channel_row = [channel_table_entry_counter, module_fqn, channel]
+                    for feature in channel_features:
+                        if feature in filtered_data[module_fqn]:
+                            # add value if applicable to module
+                            feature_val = filtered_data[module_fqn][feature][channel]
+                        else:
+                            # add that it is not applicable
+                            feature_val = "Not Applicable"
+
+                        # if it's a tensor we want to extract val
+                        if type(feature_val) is torch.Tensor:
+                            feature_val = feature_val.item()
+
+                        # add value to channel specific row
+                        new_channel_row.append(feature_val)
+
+                    # add to table and increment row index counter
+                    channel_table.append(new_channel_row)
+                    channel_table_entry_counter += 1
+
+        # add the row of headers if we actually have something, otherwise leave it empty
+        if len(channel_table) != 0:
+            channel_headers = ["idx", "layer_fqn", "channel"] + channel_features
+
+        return (channel_headers, channel_table)
+
+    def generate_filtered_tables(self, feature_filter: str = "", module_fqn_filter: str = "") -> Dict[str, Tuple[List, List]]:
+        r"""
+        Takes in optional filter values and generates two tables with desired information.
+
+        The generated tables are presented in a list-of-lists format
+
+        The reason for the two tables is that they handle different things:
+        1.) the first table handles all tensor level information
+        2.) the second table handles and displays all channel based information
+
+        The reasoning for this is that having all the info in one table can make it ambiguous which collected
+            statistics are global, and which are actually per-channel, so it's better to split it up into two
+            tables. This also makes the information much easier to digest given the plethora of statistics collected
+
+        Tensor table columns:
+            idx  layer_fqn  feature_1   feature_2   feature_3   .... feature_n
+            ----  ---------  ---------   ---------   ---------        ---------
+
+        Per-Channel table columns:
+            idx  layer_fqn  channel  feature_1   feature_2   feature_3   .... feature_n
+            ----  ---------  -------  ---------   ---------   ---------        ---------
+
+        Args:
+            feature_filter (str, optional): Filters the features presented to only those that
+                contain this filter substring
+                Default = "", results in all the features being printed
+            module_fqn_filter (str, optional): Only includes modules that contain this string
+                Default = "", results in all the modules in the reports to be visible in the table
+
+        Returns:
+            (Dict[str, Tuple[List, List]]) A dict containing two keys:
+            "tensor_level_info", "channel_level_info"
+                Each key maps to a tuple with:
+                    A list of the headers of that table
+                    A list of lists containing the table information row by row
+                    (each row corresponds to one module or one channel; the headers are returned separately)
+
+        Example Use:
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> mod_report_visualizer.generate_filtered_tables(
+            ...     feature_filter = "per_channel_min",
+            ...     module_fqn_filter = "block1"
+            ... ) # generates table with per_channel_min info for all modules in block 1 of the model
+        """
+        # first get the filtered data
+        filtered_data: OrderedDict[str, Any] = self._get_filtered_data(feature_filter, module_fqn_filter)
+
+        # now we split into tensor and per-channel data
+        tensor_features: Set[str] = set()
+        channel_features: Set[str] = set()
+
+        # keep track of the number of channels we have
+        num_channels: int = 0
+
+        for module_fqn in filtered_data:
+            for feature_name in filtered_data[module_fqn]:
+                # get the data for that specific feature
+                feature_data = filtered_data[module_fqn][feature_name]
+
+                # check if not zero dim tensor
+                is_tensor: bool = isinstance(feature_data, torch.Tensor)
+                is_not_zero_dim: bool = is_tensor and len(feature_data.shape) != 0
+
+                if is_not_zero_dim or isinstance(feature_data, list):
+                    # a non-zero-dim tensor or a list means the feature is per-channel
+                    channel_features.add(feature_name)
+                    num_channels = len(feature_data)
+                else:
+                    # means is per-tensor
+                    tensor_features.add(feature_name)
+
+        # we make them lists for iteration purposes
+        tensor_features_list: List[str] = sorted(tensor_features)
+        channel_features_list: List[str] = sorted(channel_features)
+
+        # get the tensor info
+        tensor_headers, tensor_table = self._generate_tensor_table(filtered_data, tensor_features_list)
+
+        # get the channel info
+        channel_headers, channel_table = self._generate_channels_table(
+            filtered_data, channel_features_list, num_channels
+        )
+
+        # let's now create the dictionary to return
+        table_dict = {
+            self.TABLE_TENSOR_KEY : (tensor_headers, tensor_table),
+            self.TABLE_CHANNEL_KEY : (channel_headers, channel_table)
+        }
+
+        # return the two tables
+        return table_dict
+
+    def generate_table_visualization(self, feature_filter: str = "", module_fqn_filter: str = ""):
+        r"""
+        Takes in optional filter values and prints out formatted tables of the information.
+
+        The reason for the two tables printed out instead of one large one is that they handle different things:
+        1.) the first table handles all tensor level information
+        2.) the second table handles and displays all channel based information
+
+        The reasoning for this is that having all the info in one table can make it ambiguous which collected
+            statistics are global, and which are actually per-channel, so it's better to split it up into two
+            tables. This also makes the information much easier to digest given the plethora of statistics collected
+
+        Tensor table columns:
+         idx  layer_fqn  feature_1   feature_2   feature_3   .... feature_n
+        ----  ---------  ---------   ---------   ---------        ---------
+
+        Per-Channel table columns:
+
+         idx  layer_fqn  channel  feature_1   feature_2   feature_3   .... feature_n
+        ----  ---------  -------  ---------   ---------   ---------        ---------
+
+        Args:
+            feature_filter (str, optional): Filters the features presented to only those that
+                contain this filter substring
+                Default = "", results in all the features being printed
+            module_fqn_filter (str, optional): Only includes modules that contain this string
+                Default = "", results in all the modules in the reports to be visible in the table
+
+        Example Use:
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> mod_report_visualizer.generate_table_visualization(
+            ...     feature_filter = "per_channel_min",
+            ...     module_fqn_filter = "block1"
+            ... )
+            >>> # prints out neatly formatted table with per_channel_min info
+            >>> # for all modules in block 1 of the model
+        """
+        # see if we got tabulate
+        if not got_tabulate:
+            print("Make sure to install tabulate and try again.")
+            return None
+
+        # get the table dict and the specific tables of interest
+        table_dict = self.generate_filtered_tables(feature_filter, module_fqn_filter)
+        tensor_headers, tensor_table = table_dict[self.TABLE_TENSOR_KEY]
+        channel_headers, channel_table = table_dict[self.TABLE_CHANNEL_KEY]
+
+        # get the table string and print it out
+        # now we have populated the tables for each one
+        # let's create the strings to be returned
+        table_str = ""
+        # the tables will have some header columns that are non-feature
+        # e.g. table index, module name, channel index, etc.
+        # we want to look at the feature header columns, which come after those non-feature headers
+        if len(tensor_headers) > self.NUM_NON_FEATURE_TENSOR_HEADERS:
+            # if we have at least one tensor level feature to be added we add tensor table
+            table_str += "Tensor Level Information \n"
+            table_str += tabulate(tensor_table, headers=tensor_headers)
+        if len(channel_headers) > self.NUM_NON_FEATURE_CHANNEL_HEADERS:
+            # if we have at least one channel level feature to be added we add the channel table
+            table_str += "\n\n Channel Level Information \n"
+            table_str += tabulate(channel_table, headers=channel_headers)
+
+        # if no features at all, let user know
+        if table_str == "":
+            table_str = "No data points to generate table with."
+
+        print(table_str)
+
+    def _get_plottable_data(self, feature_filter: str, module_fqn_filter: str) -> Tuple[List, List[List], bool]:
+        r"""
+        Takes in the feature filters and module filters and outputs the x and y data for plotting
+
+        Args:
+            feature_filter (str): Filters the features presented to only those that
+                contain this filter substring
+            module_fqn_filter (str): Only includes modules that contains this string
+
+        Returns a tuple of three elements
+            The first is a list containing the relevant x-axis data
+            The second is a list containing the corresponding y-axis data
+            The third is a bool indicating whether the data is per-channel
+        """
+        # get the table dict and the specific tables of interest
+        table_dict = self.generate_filtered_tables(feature_filter, module_fqn_filter)
+        tensor_headers, tensor_table = table_dict[self.TABLE_TENSOR_KEY]
+        channel_headers, channel_table = table_dict[self.TABLE_CHANNEL_KEY]
+
+        # make sure it is only 1 feature that is being plotted
+        # get the number of features in each of these
+        tensor_info_features_count = len(tensor_headers) - ModelReportVisualizer.NUM_NON_FEATURE_TENSOR_HEADERS
+        channel_info_features_count = len(channel_headers) - ModelReportVisualizer.NUM_NON_FEATURE_CHANNEL_HEADERS
+
+        # see if valid tensor or channel plot
+        is_valid_per_tensor_plot: bool = tensor_info_features_count == 1
+        is_valid_per_channel_plot: bool = channel_info_features_count == 1
+
+        # the feature column offset and table default to the tensor table and are overridden below for per-channel plots
+        feature_column_offset = ModelReportVisualizer.NUM_NON_FEATURE_TENSOR_HEADERS
+        table = tensor_table
+
+        # if a per_channel plot, we have different offset and table
+        if is_valid_per_channel_plot:
+            feature_column_offset = ModelReportVisualizer.NUM_NON_FEATURE_CHANNEL_HEADERS
+            table = channel_table
+
+        x_data: List = []
+        y_data: List[List] = []
+        # the feature will either be a tensor feature or channel feature
+        if is_valid_per_tensor_plot:
+            for table_row_num, row in enumerate(table):
+                # get x_value to append
+                x_val_to_append = table_row_num
+                # the index of the feature will be 0 + the number of non-feature columns
+                tensor_feature_index = feature_column_offset
+                row_value = row[tensor_feature_index]
+                if not isinstance(row_value, str):
+                    x_data.append(x_val_to_append)
+                    y_data.append(row_value)
+        elif is_valid_per_channel_plot:
+            # gather the x_data and multiple y_data
+            # calculate the number of channels
+            num_channels: int = max(row[self.CHANNEL_NUM_INDEX] for row in table) + 1
+            for channel in range(num_channels):
+                y_data.append([])  # separate data list per channel
+
+            for table_row_num, row in enumerate(table):
+                # get the x value (the module index) and the channel for this row
+                current_channel = row[self.CHANNEL_NUM_INDEX]  # the channel this row belongs to
+                new_module_index: int = table_row_num // num_channels
+                x_val_to_append = new_module_index
+
+                # the index of the feature will be 0 + the number of non-feature columns
+                tensor_feature_index = feature_column_offset
+                row_value = row[tensor_feature_index]
+                if not isinstance(row_value, str):
+                    # only append if this is a new x index
+                    if len(x_data) == 0 or x_data[-1] != x_val_to_append:
+                        x_data.append(x_val_to_append)
+
+                    # append value for that channel
+                    y_data[current_channel].append(row_value)
+        else:
+            # more than one feature was chosen
+            error_str = "Make sure to pick only a single feature with your filter to plot a graph."
+            error_str += " We recommend calling get_all_unique_feature_names() to find unique feature names."
+            error_str += " Pick one of those features to plot."
+            raise ValueError(error_str)
+
+        # return x, y values, and if data is per-channel
+        return (x_data, y_data, is_valid_per_channel_plot)
+
+    def generate_plot_visualization(self, feature_filter: str, module_fqn_filter: str = ""):
+        r"""
+        Takes in a feature and optional module_filter and plots the desired data.
+
+        For per channel features, it averages the value across the channels and plots a point
+        per module. The reason for this is that for models with hundreds of channels, it can
+        be hard to differentiate one channel line from another, and so the point of generating
+        a single average point per module is to give a sense of general trends that encourage
+        further deep dives.
+
+        Note:
+            Only features in the report that have tensor value data are plottable by this class
+            When the tensor information is plotted, it will plot:
+                idx as the x val, feature value as the y_val
+            When the channel information is plotted, it will plot:
+                the first idx of each module as the x val, feature value as the y_val [for each channel]
+                The reason for this is that we want to be able to compare values across the
+                channels for the same layer, which would be hard if the values were staggered by idx.
+                This means each module is represented by only 1 x value
+        Args:
+            feature_filter (str): Filters the features presented to only those that
+                contain this filter substring
+            module_fqn_filter (str, optional): Only includes modules that contain this string
+                Default = "", results in all the modules in the reports being visible in the table
+
+        Example Use:
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> mod_report_visualizer.generate_plot_visualization(
+            ...     feature_filter = "per_channel_min",
+            ...     module_fqn_filter = "block1"
+            ... )
+            >>> # outputs line plot of per_channel_min information for all
+            >>> # modules in block1 of the model; each channel gets its own line,
+            >>> # and it is plotted across the in-order modules on the x-axis
+        """
+        # check if we have matplotlib and let the user know to install it if we don't
+        if not got_matplotlib:
+            print("Make sure to install matplotlib and try again.")
+            return None
+
+        # get the x and y data and if per channel
+        x_data, y_data, data_per_channel = self._get_plottable_data(feature_filter, module_fqn_filter)
+
+        # plot based on whether data is per channel or not
+        ax = plt.subplot()
+        ax.set_ylabel(feature_filter)
+        ax.set_title(feature_filter + " Plot")
+        plt.xticks(x_data)  # only show ticks for actual points
+
+        if data_per_channel:
+            ax.set_xlabel("First idx of module")
+            # set the legend as well
+            # plot a single line that is the average of the channel values
+            num_modules = len(y_data[0])  # every channel list has one value per module
+            num_channels = len(y_data)  # number of channels, needed to compute the average
+
+            # average across channels for each module index
+            avg_vals = [
+                sum(channel_vals[index] for channel_vals in y_data) / num_channels
+                for index in range(num_modules)
+            ]
+
+            # plot the averaged line
+            ax.plot(x_data, avg_vals, label=f"Average Value Across {num_channels} Channels")
+            ax.legend(loc='upper right')
+        else:
+            ax.set_xlabel("idx")
+            ax.plot(x_data, y_data)
+
+        # actually show the plot
+        plt.show()
+
+    def generate_histogram_visualization(self, feature_filter: str, module_fqn_filter: str = "", num_bins: int = 10):
+        r"""
+        Takes in a feature and optional module_filter and plots the histogram of desired data.
+
+        Note:
+            Only features in the report that have tensor value data can be viewed as a histogram
+            If you want to plot a histogram from all the channel values of a specific feature for
+                a specific model, make sure to specify both the model and the feature properly
+                in the filters and you should be able to see a distribution of the channel data
+
+        Args:
+            feature_filter (str): Filters the features presented to only those that
+                contain this filter substring
+            module_fqn_filter (str, optional): Only includes modules that contain this string
+                Default = "", results in all the modules in the reports being visible in the table
+            num_bins (int, optional): The number of bins to create the histogram with
+                Default = 10, the values will be split into 10 equal sized bins
+
+        Example Use:
+            >>> # xdoctest: +SKIP
+            >>> mod_report_visualizer.generate_histogram_visualization(
+            ...     feature_filter = "per_channel_min",
+            ...     module_fqn_filter = "block1"
+            ... )
+            >>> # outputs histogram of per_channel_min information for all modules in block1 of the model
+            >>> # information is gathered across all channels for all modules in block1 for the
+            >>> # per_channel_min and is displayed in a histogram of equally sized bins
+        """
+        # check if we have matplotlib and let the user know to install it if we don't
+        if not got_matplotlib:
+            print("Make sure to install matplotlib and try again.")
+            return None
+
+        # get the x and y data and if per channel
+        x_data, y_data, data_per_channel = self._get_plottable_data(feature_filter, module_fqn_filter)
+
+        # for histogram, we just care about plotting the y data
+        # plot based on whether data is per channel or not
+        ax = plt.subplot()
+        ax.set_xlabel(feature_filter)
+        ax.set_ylabel("Frequency")
+        ax.set_title(feature_filter + " Histogram")
+
+        if data_per_channel:
+            # set the legend as well
+            # combine all the data
+            all_data = []
+            for channel_info in y_data:
+                all_data.extend(channel_info)
+
+            val, bins, _ = plt.hist(
+                all_data,
+                bins=num_bins,
+                stacked=True,
+                rwidth=0.8,
+            )
+            plt.xticks(bins)
+        else:
+            val, bins, _ = plt.hist(
+                y_data,
+                bins=num_bins,
+                stacked=False,
+                rwidth=0.8,
+            )
+            plt.xticks(bins)
+
+        plt.show()
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/convert.py b/MLPY/Lib/site-packages/torch/ao/quantization/fx/convert.py
new file mode 100644
index 0000000000000000000000000000000000000000..028c4a94186939afe64bfc1d904a2459da9fc80c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/fx/convert.py
@@ -0,0 +1,1131 @@
+# mypy: ignore-errors
+
+from typing import Any, Dict, List, Optional, Set, Tuple, Union, Type, Callable
+from torch.ao.quantization.quant_type import QuantType
+import torch
+import copy
+import warnings
+from torch.fx import (
+    GraphModule,
+)
+from torch.fx.graph import (
+    Graph,
+    Node,
+    Argument,
+)
+from ..utils import (
+    activation_is_statically_quantized,
+    weight_is_quantized,
+    get_qparam_dict,
+    _parent_name,
+    get_swapped_custom_module_class,
+)
+from ..qconfig import (
+    QConfigAny,
+    qconfig_equals
+)
+from ..qconfig_mapping import QConfigMapping
+from .qconfig_mapping_utils import (
+    _generate_node_name_to_qconfig,
+    _compare_prepare_convert_qconfig_mappings,
+    _update_qconfig_for_fusion,
+    _is_qconfig_supported_by_dtype_configs,
+    _update_qconfig_for_qat,
+)
+from torch.ao.quantization.backend_config.utils import (
+    get_root_module_to_quantized_reference_module,
+    get_pattern_to_dtype_configs,
+    get_fused_module_classes,
+    get_qat_module_classes,
+)
+from torch.ao.quantization.backend_config import (
+    BackendConfig,
+    get_native_backend_config,
+)
+from torch.ao.quantization.observer import _is_activation_post_process
+from .graph_module import (
+    _is_observed_module,
+    _is_observed_standalone_module,
+)
+from ._equalize import update_obs_for_equalization, convert_eq_obs
+from torch.nn.utils.parametrize import type_before_parametrizations
+from .utils import (
+    _get_module,
+    _is_custom_module_lstm,
+    _is_custom_module_mha,
+    assert_and_get_unique_device,
+    get_custom_module_class_keys,
+    create_getattr_from_value,
+    collect_producer_nodes,
+    graph_module_from_producer_nodes,
+    node_arg_is_weight,
+)
+from torch.ao.quantization.utils import (
+    is_per_channel,
+    to_underlying_dtype,
+)
+from torch.ao.quantization.quantize import (
+    _remove_qconfig,
+)
+from torch.ao.quantization.stubs import DeQuantStub
+from .custom_config import (
+    ConvertCustomConfig,
+    PrepareCustomConfig,
+)
+from .lower_to_fbgemm import lower_to_fbgemm
+# importing the lib so that the quantized_decomposed ops are registered
+from ._decomposed import quantized_decomposed_lib  # noqa: F401
+import operator
+
+__all__ = [
+    "convert",
+    "convert_custom_module",
+    "convert_standalone_module",
+    "convert_weighted_module",
+]
+
+_QSCHEME_TO_CHOOSE_QPARAMS_OP = {
+    torch.per_tensor_affine: torch.ops.quantized_decomposed.choose_qparams.tensor,
+    torch.per_tensor_symmetric: torch.ops.quantized_decomposed.choose_qparams_symmetric.tensor,
+}
+
+def _replace_observer_with_quantize_dequantize_node_decomposed(
+        model: torch.fx.GraphModule,
+        node: Node,
+        modules: Dict[str, torch.nn.Module],
+        node_name_to_scope: Dict[str, Tuple[str, type]],
+        node_name_to_qconfig: Dict[str, QConfigAny]) -> None:
+    """ Replace activation_post_process module call node with quantize and
+    dequantize node working with decomposed Tensor
+
+    Before:
+    ... -> observer_0(x) -> ...
+    After:
+    ... -> torch.ops.quantized_decomposed.quantize_per_tensor(x, ...) ->
+    torch.ops.quantized_decomposed.dequantize_per_tensor() -> ...
+
+    or quantize_per_channel and dequantize_per_channel
+    """
+    graph = model.graph
+    assert modules is not None
+    assert isinstance(node.target, str)
+    module_path, prefix = _get_module_path_and_prefix(node, node_name_to_scope, node_name_to_qconfig)
+    activation_post_process = modules[node.target]
+    if hasattr(activation_post_process, "convert"):
+        activation_post_process.convert(model, node)
+        return
+    # skip replacing observers to quant/dequant nodes if the qconfigs of all
+    # consumers and producers of this observer are None
+    skip_replacement = all(_has_none_qconfig(n, node_name_to_qconfig) for n in
+                           list(node.args) + list(node.users.keys()))
+    if skip_replacement or not _is_conversion_supported(activation_post_process):
+        # didn't find corresponding quantize op and info for the activation_post_process
+        # so we just remove the observer
+        with graph.inserting_before(node):
+            node.replace_all_uses_with(node.args[0])
+            graph.erase_node(node)
+        return
+
+    # otherwise, we can convert the activation_post_process module call to quantize/dequantize node
+
+    # 1. extract the information from activation_post_process module for generating
+    # the quantize and dequantize operator
+    dtype = activation_post_process.dtype  # type: ignore[attr-defined]
+
+    is_dynamic = False
+    if hasattr(activation_post_process, "is_dynamic"):
+        is_dynamic = activation_post_process.is_dynamic  # type: ignore[assignment]
+
+    if dtype in [torch.quint8, torch.qint8, torch.qint32, torch.uint8, torch.int8, torch.int16, torch.int32] and \
+            (not is_dynamic):
+        # TODO: probably should cleanup this condition check, it's hard
+        # to reason about this if and the following elif
+
+        # uint8/int8/int32 static quantization branch
+
+        # 1. extract information for inserting q/dq node from activation_post_process
+        node_type = "call_function"
+        quantize_op : Optional[Callable] = None
+        scale, zero_point = activation_post_process.calculate_qparams()  # type: ignore[attr-defined, operator]
+        if is_per_channel(activation_post_process.qscheme):  # type: ignore[attr-defined]
+            ch_axis = int(activation_post_process.ch_axis)  # type: ignore[attr-defined, arg-type]
+            quantize_op = torch.ops.quantized_decomposed.quantize_per_channel.default
+            dequantize_op = torch.ops.quantized_decomposed.dequantize_per_channel.default
+            quant_min = activation_post_process.quant_min
+            quant_max = activation_post_process.quant_max
+            dtype_ = to_underlying_dtype(dtype)
+            qparams = {
+                "_scale_": scale,
+                "_zero_point_": zero_point,
+                "_axis_": ch_axis,
+                "_quant_min_": quant_min,
+                "_quant_max_": quant_max,
+                "_dtype_": dtype_
+            }
+        else:
+            quantize_op = torch.ops.quantized_decomposed.quantize_per_tensor.default
+            dequantize_op = torch.ops.quantized_decomposed.dequantize_per_tensor.default
+            scale = float(scale)
+            zero_point = int(zero_point)
+            quant_min = activation_post_process.quant_min  # type: ignore[attr-defined]
+            quant_max = activation_post_process.quant_max  # type: ignore[attr-defined]
+            dtype_ = to_underlying_dtype(dtype)
+            qparams = {
+                "_scale_": scale,
+                "_zero_point_": zero_point,
+                "_quant_min_": quant_min,
+                "_quant_max_": quant_max,
+                "_dtype_": dtype_
+            }
+
+        # 2. replace activation_post_process node with quantize and dequantize
+        with graph.inserting_before(node):
+            input_node = node.args[0]
+            quantize_op_inputs = [input_node]
+            for key, value_or_node in qparams.items():
+                # TODO: we can add the information of whether a value needs to
+                # be registered as an attribute in qparams dict itself
+                if key in ['_scale_', '_zero_point_'] and (not isinstance(value_or_node, (float, int))):
+                    # For scale and zero_point values we register them as buffers in the root module.
+                    # Note that when the values are not tensors, as in the case of
+                    # per_tensor quantization, they will be treated as literals.
+                    # However, registering them as a node seems to cause issues with dynamo
+                    # tracing, where it may consider the tensor overload as opposed to the default one.
+                    # The extra check that scale and zero_point are scalars makes
+                    # sure that the default overload can be used.
+                    # TODO: maybe need more complex attr name here
+                    qparam_node = create_getattr_from_value(
+                        model, graph, module_path + prefix + key, value_or_node)
+                    quantize_op_inputs.append(qparam_node)
+                else:
+                    # for qparams that are not scale/zero_point (like axis, dtype) we store them as literals in the graph.
+                    quantize_op_inputs.append(value_or_node)
+
+            quantized_node = graph.create_node(node_type, quantize_op, tuple(quantize_op_inputs), {})
+            # use the same qparams from quantize op
+            dq_inputs = [quantized_node] + quantize_op_inputs[1:]
+            dequantized_node = graph.call_function(
+                dequantize_op,
+                tuple(dq_inputs),
+                {}
+            )
+
+            def remap_fn(x):
+                return dequantized_node if x is node else x
+
+            # remap numeric_debug_handle
+            for user_node in node.users:
+                if "numeric_debug_handle" in user_node.meta:
+                    numeric_debug_handle = user_node.meta["numeric_debug_handle"]
+                    user_node.meta["numeric_debug_handle"] = {remap_fn(k): v for k, v in numeric_debug_handle.items()}
+            node.replace_all_uses_with(dequantized_node)
+            graph.erase_node(node)
+    elif is_dynamic:
+
+        # uint8/int8/fp16 dynamic quantization
+
+        # 1. extract information for inserting q/dq node from activation_post_process
+        node_type = "call_function"
+        quantize_op = torch.ops.quantized_decomposed.quantize_per_tensor.tensor
+        # we only use choose_qparams for is_decomposed now,
+        # but we should probably align the non-decomposed path with this as well,
+        # and that can be done after we remove reduce_range flag
+        # 1. extract qparams from activation_post_process module
+        dtype_ = to_underlying_dtype(dtype)
+        assert dtype_ in [torch.uint8, torch.int8], \
+            "only uint8 and int8 are supported in reference flow for " \
+            "dynamic quantization right now"
+        quant_min = activation_post_process.quant_min  # type: ignore[attr-defined]
+        quant_max = activation_post_process.quant_max  # type: ignore[attr-defined]
+        qscheme = getattr(activation_post_process, "qscheme", torch.per_tensor_affine)  # type: ignore[attr-defined]
+        eps = getattr(activation_post_process, "eps", torch.finfo(torch.float32).eps)  # type: ignore[attr-defined]
+        # note: scale and zero_point are missing for quantize_per_tensor op
+        # we'll need to get this from choose_qparams op, which we'll add after
+        # this step
+        qparams = {
+            "_quant_min_": quant_min,
+            "_quant_max_": quant_max,
+            "_eps_": eps,
+            "_dtype_": dtype_
+        }
+
+        choose_qparams_op = _QSCHEME_TO_CHOOSE_QPARAMS_OP[qscheme]
+        # 2. insert choose_qparams op and update the qparams list
+        with graph.inserting_before(node):
+            input_node = node.args[0]
+            choose_qparams_op_inputs = [node.args[0]]
+            for key, value in qparams.items():
+                # we have quant_min, quant_max and dtype, all should be stored
+                # as literals
+                choose_qparams_op_inputs.append(value)
+            choose_qparams_node = graph.create_node(
+                "call_function",
+                choose_qparams_op,
+                tuple(choose_qparams_op_inputs),
+                {}
+            )
+            # choose_qparams returns (scale, zero_point)
+            scale_node = graph.create_node(
+                "call_function",
+                operator.getitem,
+                (choose_qparams_node, 0),
+                {}
+            )
+            zero_point_node = graph.create_node(
+                "call_function",
+                operator.getitem,
+                (choose_qparams_node, 1),
+                {}
+            )
+            quant_min = qparams["_quant_min_"]
+            quant_max = qparams["_quant_max_"]
+            dtype = qparams["_dtype_"]
+            qparams = {
+                "_scale_": scale_node,
+                "_zero_point_": zero_point_node,
+                "_quant_min_": quant_min,
+                "_quant_max_": quant_max,
+                "_dtype_": dtype
+            }
+
+        # 3. replace activation_post_process node to quantize and dequantize node
+        with graph.inserting_before(node):
+            input_node = node.args[0]
+            quantize_op_inputs = [input_node]
+            for key, value_or_node in qparams.items():
+                # TODO: we can add the information of whether a value needs to
+                # be registered as an attribute in qparams dict itself
+                if key in ['_scale_', '_zero_point_']:
+                    # in this case we have a node in the graph since it's dynamically
+                    # computed from the input, with choose_qparams op
+                    qparam_node = value_or_node
+                    quantize_op_inputs.append(qparam_node)
+                else:
+                    # for qparams that are not scale/zero_point (like axis, dtype) we
+                    # store them as literals in the graph.
+                    quantize_op_inputs.append(value_or_node)
+
+            quantized_node = graph.create_node(node_type, quantize_op, tuple(quantize_op_inputs), {})
+            # use the same qparams from quantize op
+            dq_inputs = [quantized_node] + quantize_op_inputs[1:]
+            # need to use the tensor variant of this op, since scale and zero_point
+            # from choose_qparams are Tensors, instead of float/int; this is to
+            # prevent these nodes from being traced away by downstream systems
+            dequantize_op = torch.ops.quantized_decomposed.dequantize_per_tensor.tensor
+            dequantized_node = graph.call_function(
+                dequantize_op,
+                tuple(dq_inputs),
+                {}
+            )
+
+            def remap_fn(x):
+                return dequantized_node if x is node else x
+
+            # remap numeric_debug_handle
+            for user_node in node.users:
+                if "numeric_debug_handle" in user_node.meta:
+                    numeric_debug_handle = user_node.meta["numeric_debug_handle"]
+                    user_node.meta["numeric_debug_handle"] = {remap_fn(k): v for k, v in numeric_debug_handle.items()}
+            node.replace_all_uses_with(dequantized_node)
+            graph.erase_node(node)
+    elif dtype == torch.float16:
+        raise NotImplementedError("decomposed to float16 op not implemented yet")
+
+    # should not reach since we have checks in the beginning to make sure the
+    # activation_post_process is supported
+
+def _replace_observer_with_quantize_dequantize_node(
+        model: torch.fx.GraphModule,
+        node: Node,
+        modules: Dict[str, torch.nn.Module],
+        node_name_to_scope: Dict[str, Tuple[str, type]],
+        node_name_to_qconfig: Dict[str, QConfigAny]) -> None:
+    """ Replace activation_post_process module call node with quantize and
+    dequantize node
+
+    Before:
+    ... -> observer_0(x) -> ...
+    After:
+    ... -> torch.quantize_per_tensor(x, ...) -> x.dequantize() -> ...
+    """
+    assert modules is not None
+    assert isinstance(node.target, str)
+    graph = model.graph
+    module_path, prefix = _get_module_path_and_prefix(node, node_name_to_scope, node_name_to_qconfig)
+    activation_post_process = modules[node.target]
+    # skip replacing observers to quant/dequant nodes if the qconfigs of all
+    # consumers and producers of this observer are None
+    skip_replacement = all(_has_none_qconfig(n, node_name_to_qconfig) for n in
+                           list(node.args) + list(node.users.keys()))
+    if skip_replacement or not _is_conversion_supported(activation_post_process):
+        # didn't find corresponding quantize op and info for the activation_post_process
+        # so we just remove the observer
+        with graph.inserting_before(node):
+            node.replace_all_uses_with(node.args[0])
+            graph.erase_node(node)
+        return
+
+    # otherwise, we can convert the activation_post_process module call to quantize/dequantize node
+    dtype = activation_post_process.dtype  # type: ignore[attr-defined]
+
+    is_dynamic = False
+    if hasattr(activation_post_process, "is_dynamic"):
+        is_dynamic = activation_post_process.is_dynamic  # type: ignore[attr-defined, assignment]
+
+    if dtype in [torch.quint8, torch.qint8, torch.qint32] and \
+            (not is_dynamic):
+        # TODO: probably should cleanup this condition check, it's hard
+        # to reason about this if and the following elif
+
+        # uint8/int8/int32 static quantization branch
+
+        # 1. extract the information from activation_post_process module for generating
+        # the quantize and dequantize operator
+        node_type = "call_function"
+        quantize_op : Optional[Callable] = None
+        scale, zero_point = activation_post_process.calculate_qparams()  # type: ignore[attr-defined, operator]
+        if is_per_channel(activation_post_process.qscheme):  # type: ignore[attr-defined]
+            ch_axis = int(activation_post_process.ch_axis)  # type: ignore[attr-defined, arg-type]
+            qparams = {"_scale_": scale, "_zero_point_": zero_point, "_axis_": ch_axis, "_dtype_": dtype}
+            quantize_op = torch.quantize_per_channel
+        else:
+            scale = float(scale)
+            zero_point = int(zero_point)
+            qparams = {"_scale_": scale, "_zero_point_": zero_point, "_dtype_": dtype}
+            quantize_op = torch.quantize_per_tensor
+
+        # 2. replace activation_post_process node with quantize and dequantize
+        with graph.inserting_before(node):
+            input_node = node.args[0]
+            quantize_op_inputs = [input_node]
+            for key, value_or_node in qparams.items():
+                # TODO: we can add the information of whether a value needs to
+                # be registered as an attribute in qparams dict itself
+                if key in ['_scale_', '_zero_point_']:
+                    # For scale and zero_point values we register them as buffers in the root module.
+                    # TODO: maybe need more complex attr name here
+                    qparam_node = create_getattr_from_value(
+                        model, graph, module_path + prefix + key, value_or_node)
+                    quantize_op_inputs.append(qparam_node)
+                else:
+                    # for qparams that are not scale/zero_point (like axis, dtype) we store them as literals in the graph.
+                    quantize_op_inputs.append(value_or_node)
+
+            quantized_node = graph.create_node(node_type, quantize_op, tuple(quantize_op_inputs), {})
+            dequantized_node = graph.call_method("dequantize", args=(quantized_node,))
+            node.replace_all_uses_with(dequantized_node)
+            graph.erase_node(node)
+    elif is_dynamic:
+
+        # uint8/int8/fp16 dynamic quantization branch
+
+        node_type = "call_function"
+        quantize_op = torch.quantize_per_tensor_dynamic
+        # TODO: get reduce range from observer
+        # reduce_range = activation_post_process.reduce_range
+        reduce_range = torch.backends.quantized.engine in ("fbgemm", "x86")
+        qparams = {"_dtype_": dtype, "_reduce_range_": reduce_range}
+
+        with graph.inserting_before(node):
+            input_node = node.args[0]
+            quantize_op_inputs = [input_node]
+            for key, value in qparams.items():
+                quantize_op_inputs.append(value)
+
+            quantized_node = graph.create_node(node_type, quantize_op, tuple(quantize_op_inputs), {})
+            dequantized_node = graph.call_method("dequantize", args=(quantized_node,))
+            node.replace_all_uses_with(dequantized_node)
+            graph.erase_node(node)
+    elif dtype == torch.float16:
+        node_type = "call_method"
+        quantize_op = "to"  # type: ignore[assignment]
+        qparams = {"_dtype_": dtype}
+        with graph.inserting_before(node):
+            input_node = node.args[0]
+            quantize_op_inputs = [input_node]
+            for key, value in qparams.items():
+                # TODO: we can add the information of whether a value needs to
+                # be registered as an attribute in qparams dict itself
+                quantize_op_inputs.append(value)
+
+            quantized_node = graph.create_node(node_type, quantize_op, tuple(quantize_op_inputs), {})
+            dequantized_node = graph.call_method("dequantize", args=(quantized_node,))
+            node.replace_all_uses_with(dequantized_node)
+            graph.erase_node(node)
+
+    # should not reach since we have checks in the beginning to make sure the
+    # activation_post_process is supported
+
+# this is a temporary hack for custom module, we may want to implement
+# this properly after the custom module class design is finalized
+# TODO: DeQuantStubs are currently inserted only after custom module LSTM, while observers are inserted
+# after all other custom modules. In the future, we should simply insert QuantStubs before and DeQuantStubs
+# after custom modules in general, and replace these with "quantize" and "dequantize" nodes respectively.
+def _replace_observer_or_dequant_stub_with_dequantize_node(node: Node, graph: Graph) -> None:
+    call_custom_module_node = node.args[0]
+    assert isinstance(call_custom_module_node, Node), \
+        f"Expecting the for call custom module node to be a Node, but got {call_custom_module_node}"
+    node.replace_all_uses_with(call_custom_module_node)
+    graph.erase_node(node)
+    _insert_dequantize_node(call_custom_module_node, graph)
+
+def _is_conversion_supported(activation_post_process: torch.nn.Module) -> bool:
+    dtype = activation_post_process.dtype  # type: ignore[attr-defined]
+
+    is_dynamic = False
+    if hasattr(activation_post_process, "is_dynamic"):
+        is_dynamic = activation_post_process.is_dynamic  # type: ignore[attr-defined, assignment]
+
+    return (
+        (dtype in [
+            torch.quint8,
+            torch.qint8,
+            torch.qint32,
+            torch.uint8,
+            torch.int8,
+            torch.int16,
+            torch.int32
+        ] and (not is_dynamic)) or  # type: ignore[return-value]
+        is_dynamic or
+        dtype == torch.float16
+    )
+
+def _has_none_qconfig(node: Argument, node_name_to_qconfig: Dict[str, QConfigAny]) -> bool:
+    """ Check if a node has a qconfig of None, i.e. user requested to not quantize
+    the node
+    """
+    return isinstance(node, Node) and node.name in node_name_to_qconfig and node_name_to_qconfig[node.name] is None
+
+def _run_weight_observers(observed: GraphModule, backend_config: BackendConfig) -> None:
+    """ Extract the subgraph that produces the weight for dynamic quant
+    or weight only quant node and run the subgraph to observe the weight.
+    Note that the observers of dynamic quant or weight only quant ops are
+    run during the convert step.
+    """
+    for node in observed.graph.nodes:
+        if node.op != "call_function":
+            continue
+        for node_arg in node.args:
+            # node_arg is weight
+            if node_arg and node_arg_is_weight(node, node_arg):
+                weight_observer_nodes = collect_producer_nodes(node_arg)
+                if weight_observer_nodes is None:
+                    continue
+                weight_observer_module = \
+                    graph_module_from_producer_nodes(
+                        observed, weight_observer_nodes)
+                # run the weight observer
+                weight_observer_module()
+
+def _maybe_recursive_remove_dequantize(arg: Any, node: Node, graph: Graph) -> None:
+    """ If the arg is a dequantize Node, or a list/tuple/dict of dequantize Node,
+    we'll recursively remove the dequantize Node
+    """
+    if isinstance(arg, Node) and \
+       arg.op == "call_method" and \
+       arg.target == "dequantize":
+        quantize_node = arg.args[0]
+        # we only replace the specific use since dequantize could be used by other nodes
+        # as well
+        node.replace_input_with(arg, quantize_node)
+    elif isinstance(arg, (list, tuple)):
+        for arg_element in arg:
+            _maybe_recursive_remove_dequantize(arg_element, node, graph)
+    elif isinstance(arg, dict):
+        for arg_element in arg.values():
+            _maybe_recursive_remove_dequantize(arg_element, node, graph)
+    else:
+        warnings.warn(f"Unsupported node type in recursive remove dequantize: {type(arg)}")
+
+def _get_module_path_and_prefix(
+        obs_node: Node,
+        node_name_to_scope: Dict[str, Tuple[str, type]],
+        node_name_to_qconfig: Dict[str, QConfigAny]) -> Tuple[str, str]:
+    """ Given and observer node, get the `Scope` or the fully qualified name for
+    the submodule containing the observed node, also return a prefix of "_input"
+    when the observed node is an input of a F.linear op, and not the output of another
+    quantized op.
+    TODO: this logic is hacky, we should think about how to remove it or make it more
+    general
+    """
+    observed_node = obs_node.args[0]
+    # an observer can be inserted for both input of the next operator or output of the previous
+    # operator (they can be the same)
+    # this flag identifies if the observer is inserted only because the observed node is
+    # the input of the next operator
+    assert isinstance(observed_node, Node), \
+        f"Expecting observed node to be a Node, but got {observed_node}"
+    is_input_observer_only = node_name_to_qconfig[observed_node.name] is None \
+        if observed_node.name in node_name_to_qconfig else None
+    if is_input_observer_only:
+        # if the quantize function is at the input of op, then we find the first user of the observer_node
+        # to get the path. If a linear call_function is in the user list, we return the first instance
+        # of linear node to get the FQN.
+        users = list(obs_node.users)
+        first_linear_use_or_first_use = users[0] if users else None
+        linear_node = None
+        for n in users:
+            if n.op == "call_function" and n.target == torch.nn.functional.linear:
+                linear_node = n
+                break
+        if linear_node:
+            first_linear_use_or_first_use = linear_node
+        prefix = "_input"
+    else:
+        # if the quantize function is at the output of the op, we use the observer input node to get the path
+        first_linear_use_or_first_use = observed_node
+        prefix = ""
+
+    if first_linear_use_or_first_use and first_linear_use_or_first_use.name in node_name_to_scope:
+        module_path, _ = node_name_to_scope[first_linear_use_or_first_use.name]
+    else:
+        # TODO: it's not used, so actually we can skip quantization
+        # but this requires changing return type of quantize_node
+        # we can fix it later if needed
+        module_path = ""
+    return module_path, prefix
+
+def _insert_dequantize_node(
+        node: Node,
+        graph: Graph) -> None:
+    """ Inserts dequantize node for `node` in `graph`
+    """
+    with graph.inserting_after(node):
+        dequantize_node = graph.call_method("dequantize", (node,))
+        for user_node in dict(node.users):
+            if user_node is not dequantize_node:
+                user_node.replace_input_with(node, dequantize_node)
+
+def _maybe_get_observer_for_node(
+        node: Node,
+        modules: Dict[str, torch.nn.Module]
+) -> Optional[torch.nn.Module]:
+    """
+    If the node is observed, return the observer
+    instance. Otherwise, return None.
+    """
+    for maybe_obs_node in node.users.keys():
+        if maybe_obs_node.op == 'call_module':
+            maybe_obs = modules[str(maybe_obs_node.target)]
+            if _is_activation_post_process(maybe_obs):
+                return maybe_obs
+    return None
+
+def convert_standalone_module(
+        node: Node,
+        modules: Dict[str, torch.nn.Module],
+        model: torch.fx.GraphModule,
+        is_reference: bool,
+        backend_config: Optional[BackendConfig]) -> None:
+    """ Converts a observed standalone module to a quantized standalone module by calling
+    the fx convert api, currently using the same `is_reference` flag as parent, but we may
+    changing this behavior in the future (e.g. separating quantization and lowering for
+    standalone module as well)
+
+    Args:
+      - node: The call_module node of the observed standalone module
+      - modules: named_module of original model
+      - model: original model
+      - is_reference: a flag from parent provided by user to decide if we want to
+        produce a reference model or a fbgemm/qnnpack model
+      - backend_config: backend configuration of the target backend of quantization
+    """
+    # TODO: remove is_reference flag
+    if is_reference:
+        convert_fn = torch.ao.quantization.quantize_fx.convert_to_reference_fx
+    else:
+        convert_fn = torch.ao.quantization.quantize_fx.convert_fx  # type: ignore[attr-defined]
+    # We know that observed standalone module is a GraphModule since
+    # it's produced by us
+    observed_standalone_module : GraphModule = modules[str(node.target)]  # type: ignore[assignment]
+    sm_input_quantized_idxs = \
+        observed_standalone_module \
+        .meta["_observed_graph_module_attrs"].standalone_module_input_quantized_idxs
+    # remove the dequantize nodes for inputs
+    args = list(node.args)
+    for idx in range(len(args)):
+        if idx in sm_input_quantized_idxs:
+            arg = args[idx]
+            if arg.op == "call_method" and arg.target == "dequantize":  # type: ignore[union-attr]
+                quantize_node = arg.args[0]  # type: ignore[union-attr]
+                node.replace_input_with(arg, quantize_node)
+                if len(arg.users) == 0:  # type: ignore[union-attr]
+                    model.graph.erase_node(arg)
+    # add dequantize node for output
+    sm_output_quantized_idxs = \
+        observed_standalone_module \
+        .meta["_observed_graph_module_attrs"].standalone_module_output_quantized_idxs
+    if len(sm_output_quantized_idxs) > 0:
+        assert sm_output_quantized_idxs[0] == 0, "Currently only quantized"
+        "output idxs = [0] is supported"
+
+        # if it's non-empty, then it means the output is kept in quantized form
+        # we'll just add a dequantize node after this node
+        _insert_dequantize_node(node, model.graph)
+
+    # TODO: allow convert_custom_config to override backend_config
+    # for standalone module
+    quantized_standalone_module = convert_fn(
+        observed_standalone_module,
+        backend_config=backend_config)
+    parent_name, name = _parent_name(node.target)
+    # update the modules dict
+    setattr(modules[parent_name], name, quantized_standalone_module)
+    modules[str(node.target)] = quantized_standalone_module
+
+def convert_weighted_module(
+        node: Node,
+        modules: Dict[str, torch.nn.Module],
+        observed_node_names: Set[str],
+        node_name_to_qconfig: Dict[str, QConfigAny],
+        backend_config: BackendConfig,
+        is_decomposed: bool = False,
+        is_reference: bool = False,
+) -> None:
+    """ Convert a weighted module to reference quantized module in the model
+    If the QConfig of a QAT module is not set, the module will still be converted to
+    a float module.
+
+    Args:
+      - node: The call_module node of the observed weighted module
+      - modules: named_module of original model
+      - observed_node_names: names of the set of observed fx nodes; we can skip
+        this conversion if the node is not observed
+    """
+    original_module = modules[str(node.target)]
+    qconfig: QConfigAny = original_module.qconfig  # type: ignore[assignment]
+    weight_post_process = None
+    qat_module_classes = get_qat_module_classes(backend_config)
+
+    if isinstance(
+            original_module,
+            qat_module_classes):
+        # When converting a qat module to a float module, we need to attach the
+        # weight fake_quant to the module; the weight fake_quant is assumed to have been run during
+        # QAT, so we don't need to run it again here
+        weight_post_process = original_module.weight_fake_quant
+        original_module = original_module.to_float()  # type: ignore[operator]
+        # change qat module to float module
+        parent_name, name = _parent_name(node.target)
+        setattr(modules[parent_name], name, original_module)
+
+    is_observed = node.name in observed_node_names
+    # If a qconfig is not defined for this node, then skip converting to a reference module
+    if qconfig is None or _has_none_qconfig(node, node_name_to_qconfig) or not is_observed:
+        return
+
+    # skip converting to reference quantized module if the qconfig is not supported
+    pattern_to_dtype_configs = get_pattern_to_dtype_configs(backend_config)
+    dtype_configs = pattern_to_dtype_configs.get(type(original_module), [])
+    if not _is_qconfig_supported_by_dtype_configs(qconfig, dtype_configs):
+        return
+
+    # TODO: rename weight_is_statically_quantized to weight_is_int8_quantized
+    is_weight_quantized = weight_is_quantized(qconfig)
+
+    # the condition for swapping the module to reference quantized module is:
+    # weights need to be quantized
+    if not is_weight_quantized:
+        return
+
+    fused_module = None
+    float_module = original_module
+    # extract the individual float_module and fused module
+    if isinstance(original_module, torch.ao.nn.intrinsic._FusedModule):
+        fused_module = float_module
+        float_module = fused_module[0]  # type: ignore[index]
+
+    # TODO: move this to the reference quantized module
+    # weight_qparams or weight_qparams dict
+    wq_or_wq_dict = {"is_decomposed": is_decomposed}
+    if isinstance(float_module, torch.nn.RNNCellBase):
+        weight_post_process_ih = qconfig.weight()  # type: ignore[union-attr, operator]
+        weight_post_process_hh = qconfig.weight()  # type: ignore[union-attr, operator]
+        weight_post_process_ih(float_module.weight_ih)
+        weight_post_process_hh(float_module.weight_hh)
+        weight_qparams_ih = get_qparam_dict(weight_post_process_ih)
+        weight_qparams_hh = get_qparam_dict(weight_post_process_hh)
+        wq_or_wq_dict.update({
+            "weight_ih": weight_qparams_ih,
+            "weight_hh": weight_qparams_hh,
+        })
+    elif isinstance(float_module, (torch.nn.LSTM, torch.nn.GRU)):
+        # format for wq_or_wq_dict (flattened attributes):
+        # {"weight_ih_l0_scale": ..., "weight_ih_l0_qscheme": ..., ...}
+        for wn in float_module._flat_weights_names:
+            if hasattr(float_module, wn) and wn.startswith("weight"):
+                weight = getattr(float_module, wn)
+                weight_post_process = qconfig.weight()  # type: ignore[union-attr, operator]
+                if weight_post_process.dtype == torch.qint8:  # type: ignore[union-attr]
+                    weight_post_process(weight)  # type: ignore[operator, misc]
+                wq_or_wq_dict[wn] = get_qparam_dict(weight_post_process)
+    else:
+        # weight_post_process is None means the original module is not a QAT module
+        # we need to get weight_post_process from qconfig in this case
+        is_ptq = weight_post_process is None
+        if is_ptq:
+            weight_post_process = qconfig.weight()  # type: ignore[union-attr, operator]
+            device = assert_and_get_unique_device(float_module)
+            if device:
+                weight_post_process.to(device)
+
+        # Call weight observer/fake_quant at least once to ensure the scales and zero points
+        # have the right shapes. Note: there are two cases where we don't have to do this:
+        #
+        # (1) QAT: The model's forward method already calls the weight observer/fake_quant,
+        #     and this typically happens during training, so we don't need to do it here.
+        #
+        # (2) Non-reference (lowered) case: The quantized module's from_float method already
+        #     calls the weight observer/fake_quant, so we don't have to do it here.
+        #
+        # Currently we ignore both cases and call the weight observer/fake_quant here
+        # regardless, which is technically incorrect. For (1), this is mainly to preserve BC
+        # in test code, which may not always train before convert. In the future, we should
+        # break BC for these two cases. See https://github.com/pytorch/pytorch/issues/73941.
+        #
+        # For PT2, however, we don't need to preserve BC here, so we can skip this hack
+        # for QAT. We identify this case as (is_decomposed + is_reference + is_qat).
+        # Note that we still need it for PTQ in the PT2 flow since the model's forward
+        # method doesn't call the weight observer.
+        is_qat = not is_ptq
+        if not (is_decomposed and is_reference and is_qat):
+            weight_post_process(float_module.weight)  # type: ignore[operator]
+
+        wq_or_wq_dict.update(get_qparam_dict(weight_post_process))
+
+    # We use the same reference module for all modes of quantization: static, dynamic, weight_only
+    # root_module_to_quantized_reference_module: module mapping from root (floating point) module class
+    # to quantized reference module class, e.g. nn.Conv2d to nn.quantized._reference.Conv2d
+    root_module_to_quantized_reference_module = get_root_module_to_quantized_reference_module(backend_config)
+    ref_qmodule_cls = root_module_to_quantized_reference_module.get(type_before_parametrizations(float_module), None)
+    assert (
+        ref_qmodule_cls is not None
+    ), f"No reference quantized module class configured for {type_before_parametrizations(float_module)}"
+    ref_qmodule = ref_qmodule_cls.from_float(float_module, wq_or_wq_dict)  # type: ignore[attr-defined]
+    if fused_module is not None:
+        fused_module[0] = ref_qmodule  # type: ignore[operator]
+    else:
+        parent_name, name = _parent_name(node.target)
+        setattr(modules[parent_name], name, ref_qmodule)
+
+def _remove_previous_dequantize_in_custom_module(node: Node, prev_node: Node, graph: Graph) -> None:
+    """
+    Given a custom module `node`, if the previous node is a dequantize, reroute the custom module as follows:
+
+    Before: quantize - dequantize - custom_module
+    After: quantize - custom_module
+                 \\ - dequantize
+    """
+    # expecting the input node for a custom module node to be a Node
+    assert isinstance(prev_node, Node), \
+        f"Expecting the argument for custom module node to be a Node, but got {prev_node}"
+    if prev_node.op == "call_method" and prev_node.target == "dequantize":
+        node.replace_input_with(prev_node, prev_node.args[0])
+        # Remove the dequantize node if it doesn't have other users
+        if len(prev_node.users) == 0:
+            graph.erase_node(prev_node)
+
+def convert_custom_module(
+        node: Node,
+        graph: Graph,
+        modules: Dict[str, torch.nn.Module],
+        custom_module_class_mapping: Dict[QuantType, Dict[Type, Type]],
+        statically_quantized_custom_module_nodes: Set[Node]) -> None:
+    """ Converts an observed custom module to a quantized custom module based on
+    `custom_module_class_mapping`
+    For static quantization, we'll also remove the previous `dequantize` node and
+    attach the output observer node to the module; the observer for the node
+    will be converted to a dequantize node instead of a quantize-dequantize pair
+    later in the graph. In the end we would have a quantized custom module that
+    has the same interface as a default quantized module in the nn.quantized namespace,
+    i.e. quantized input and quantized output.
+
+    Args:
+      - node: The call_module node of the observed custom module
+      - graph: The graph containing the node
+      - modules: named_module of original model
+      - custom_module_class_mapping: mapping from observed custom module class to
+        quantized custom module class, used to swap custom modules
+      - statically_quantized_custom_module_nodes: we'll add the custom module node
+        if we find it is statically quantized; this will be used later when converting
+        observers to quant/dequant node pairs. If the observed node is a statically
+        quantized custom module node, we'll convert the observer to a dequantize node;
+        this is to keep the interface the same as the default quantized module.
+        TODO: maybe we want to redesign this part to align with reference model design
+        as well, but there has been some discussions around the interface, so we can do
+        it later.
+    """
+    observed_custom_module = modules[str(node.target)]
+    maybe_obs = _maybe_get_observer_for_node(node, modules)
+    qconfig = observed_custom_module.qconfig
+    if activation_is_statically_quantized(qconfig):
+        statically_quantized_custom_module_nodes.add(node)
+        if _is_custom_module_lstm(node, modules):
+            # The inputs are tuples in the form (input, (hidden0, hidden1))
+            # Ensure all three input nodes are quantized
+            assert (
+                len(node.args) == 2 and
+                isinstance(node.args[1], tuple) and
+                len(node.args[1]) == 2
+            )
+            (inputs, (hidden0, hidden1)) = node.args  # type: ignore[misc]
+            assert isinstance(inputs, Node)
+            assert isinstance(hidden0, Node)
+            assert isinstance(hidden1, Node)
+            _remove_previous_dequantize_in_custom_module(node, inputs, graph)
+            _remove_previous_dequantize_in_custom_module(node, hidden0, graph)
+            _remove_previous_dequantize_in_custom_module(node, hidden1, graph)
+        elif _is_custom_module_mha(node, modules):
+            # Inputs are in the form (query, key, value)
+            # TODO: This is the first step in enabling the full fx custom module
+            # quantization path for MultiheadAttention, and only covers the inputs
+            # to the module.
+            # Additional handling is yet to be implemented for the outputs, similar
+            # to LSTM custom module
+            assert len(node.args) == 3
+            query, key, value = node.args
+            assert isinstance(query, Node)
+            assert isinstance(key, Node)
+            assert isinstance(value, Node)
+            _remove_previous_dequantize_in_custom_module(node, query, graph)
+            _remove_previous_dequantize_in_custom_module(node, key, graph)
+            _remove_previous_dequantize_in_custom_module(node, value, graph)
+        else:
+            # remove the previous dequant node to ensure the inputs are quantized
+            arg = node.args[0]
+            assert isinstance(arg, Node)
+            _remove_previous_dequantize_in_custom_module(node, arg, graph)
+            # absorb the following observer into the module conversion
+            activation_post_process = _maybe_get_observer_for_node(node, modules)
+            assert activation_post_process is not None
+            observed_custom_module.activation_post_process = activation_post_process
+
+    # swap the observed custom module to quantized custom module
+    quantized_custom_module_class = get_swapped_custom_module_class(
+        observed_custom_module, custom_module_class_mapping, qconfig)
+    quantized_custom_module = \
+        quantized_custom_module_class.from_observed(observed_custom_module)
+    parent_name, name = _parent_name(node.target)
+    setattr(modules[parent_name], name, quantized_custom_module)
+
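+# Illustrative sketch (hypothetical class, not used by this module): the contract
+# convert_custom_module relies on is a `from_observed` classmethod on the quantized
+# custom module class; for static quantization it can read the output qparams from
+# the `activation_post_process` attached above.
+import torch
+
+class _ExampleQuantizedCustomModule(torch.nn.Module):
+    @classmethod
+    def from_observed(cls, observed_module):
+        quantized = cls()
+        if hasattr(observed_module, "activation_post_process"):
+            # the observer attached by convert_custom_module provides the output qparams
+            quantized.scale, quantized.zero_point = \
+                observed_module.activation_post_process.calculate_qparams()
+        return quantized
+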
+def convert(
+        model: GraphModule, is_reference: bool = False,
+        convert_custom_config: Union[ConvertCustomConfig, Dict[str, Any], None] = None,
+        is_standalone_module: bool = False,
+        _remove_qconfig_flag: bool = True,
+        qconfig_mapping: Union[QConfigMapping, Dict[str, Any], None] = None,
+        backend_config: Union[BackendConfig, Dict[str, Any], None] = None,
+        is_decomposed: bool = False) -> GraphModule:
+    """
+    We will convert an observed model (a module with observer calls) to a reference
+    quantized model. The rules are simple:
+    1. for each observer module call in the graph, we convert it to calls to
+       quantize and dequantize functions based on the observer instance
+    2. for weighted operations like linear/conv, we convert them to reference
+       quantized modules. This requires knowing whether the dtype configured for the
+       weight is supported in the backend; that is determined in the prepare step, and the
+       result is stored in observed_node_names, which is used here to decide whether the
+       module needs to be swapped
+
+    Args:
+       * `is_standalone_module`: when this flag is True, it means we are quantizing
+       a submodule that is not inlined in the parent module, and it will be quantized
+       separately as one unit.
+
+       * `is_decomposed`: a boolean flag to indicate whether we want to use the
+        quantize operator for decomposed quantized tensor
+        (torch.ops.quantized_decomposed.quantize_per_tensor) or default/standalone
+        quantized tensor (torch.quantize_per_tensor)
+
+    Returns:
+         a quantized model (or, when `is_standalone_module` is True, a quantized
+         standalone module). Whether inputs/outputs are quantized is specified by
+         prepare_custom_config via input_quantized_idxs and output_quantized_idxs;
+         please see the docs for :func:`~torch.ao.quantization.prepare_fx` for details
+    """
+    if convert_custom_config is None:
+        convert_custom_config = ConvertCustomConfig()
+
+    if isinstance(convert_custom_config, Dict):
+        warnings.warn(
+            "Passing a convert_custom_config_dict to convert is deprecated and will not be supported "
+            "in a future version. Please pass in a ConvertCustomConfig instead.")
+        convert_custom_config = ConvertCustomConfig.from_dict(convert_custom_config)
+
+    if isinstance(qconfig_mapping, Dict):
+        warnings.warn(
+            "Passing a QConfig dictionary to convert is deprecated and will not be supported "
+            "in a future version. Please pass in a QConfigMapping instead.")
+        qconfig_mapping = QConfigMapping.from_dict(qconfig_mapping) if qconfig_mapping else None
+    qconfig_mapping = copy.deepcopy(qconfig_mapping)
+    assert qconfig_mapping is None or isinstance(qconfig_mapping, QConfigMapping)
+
+    if isinstance(backend_config, Dict):
+        warnings.warn(
+            "Passing a backend_config_dict to prepare is deprecated and will not be supported "
+            "in a future version. Please pass in a BackendConfig instead.")
+        backend_config = BackendConfig.from_dict(backend_config)
+
+    if backend_config is None:
+        backend_config = get_native_backend_config()
+
+    assert _is_observed_module(model), \
+        'incoming model must be produced by prepare_fx'
+    observed_graph_module_attrs = model.meta["_observed_graph_module_attrs"]
+    node_name_to_scope: Dict[str, Tuple[str, type]] = observed_graph_module_attrs.node_name_to_scope
+    prepare_custom_config: PrepareCustomConfig = observed_graph_module_attrs.prepare_custom_config
+    observed_node_names: Set[str] = observed_graph_module_attrs.observed_node_names
+    node_name_to_qconfig: Dict[str, QConfigAny] = observed_graph_module_attrs.node_name_to_qconfig  # type: ignore[assignment]
+
+    # mapping from fully qualified module name to module instance
+    # for example,
+    # {
+    #   '': Model(...),
+    #   'linear': Linear(...),
+    #   'linear.weight_fake_quant': PerChannelMinMaxObserver(...),
+    # }
+    # We use remove_duplicate=False here because torch.cat uses
+    # the same activation_post_process module instance but different names
+    modules = dict(model.named_modules(remove_duplicate=False))
+
+    # TODO refactor this code once we update the prepare logic to have additional information on
+    # which graph nodes have been observed and share that with convert to decide which observers to ignore.
+    if qconfig_mapping:
+        prepare_qconfig_mapping: QConfigMapping = observed_graph_module_attrs.qconfig_mapping  # type: ignore[assignment]
+        modules_copy = copy.deepcopy(modules)
+
+        if observed_graph_module_attrs.is_qat:
+            _update_qconfig_for_qat(qconfig_mapping, backend_config)
+        _update_qconfig_for_fusion(model, qconfig_mapping)
+
+        _compare_prepare_convert_qconfig_mappings(prepare_qconfig_mapping, qconfig_mapping)  # type: ignore[arg-type]
+        convert_node_name_to_qconfig = _generate_node_name_to_qconfig(
+            model, modules_copy, model.graph, qconfig_mapping, node_name_to_scope)
+        # check the convert_node_name_to_qconfig generated and ensure that
+        # all the values either match what was set in prepare node_name_to_qconfig
+        # or are set to None in the convert_node_name_to_qconfig.
+        for k, v in node_name_to_qconfig.items():
+            assert k in convert_node_name_to_qconfig, f'Expected key {k} in convert node_name_to_qconfig'
+            if convert_node_name_to_qconfig[k] is not None:
+                assert qconfig_equals(v, convert_node_name_to_qconfig[k]), \
+                    f"Expected k {k} to have the same value in prepare and convert QConfigMappings, " \
+                    f"but {v} was updated to {convert_node_name_to_qconfig[k]}"
+        node_name_to_qconfig = convert_node_name_to_qconfig
+
+    custom_module_classes = get_custom_module_class_keys(convert_custom_config.observed_to_quantized_mapping)
+    custom_module_class_mapping = convert_custom_config.observed_to_quantized_mapping
+
+    if observed_graph_module_attrs.equalization_node_name_to_qconfig is not None:
+        # If we want to do equalization then do the following:
+        # Calculate the equalization scale, update the observers with the scaled
+        # inputs, and scale the weight
+        weight_eq_obs_dict = update_obs_for_equalization(model, modules)
+        convert_eq_obs(model, modules, weight_eq_obs_dict)
+
+    # always run weight observers in the top level forward method
+    # for dynamic quant ops or weight only quant ops
+    _run_weight_observers(model, backend_config)
+
+    graph_inputs: List[str] = []
+    for node in model.graph.nodes:
+        if node.op == 'placeholder':
+            graph_inputs.append(node.name)
+
+    # additional state to override inputs to be quantized, if specified
+    # by the user
+    placeholder_node_seen_cnt = 0
+    input_quantized_idxs: List[int] = prepare_custom_config.input_quantized_indexes
+    output_quantized_idxs: List[int] = prepare_custom_config.output_quantized_indexes
+
+    root_module_to_quantized_reference_module = get_root_module_to_quantized_reference_module(backend_config)
+    # convert tuples so that it can work with isinstance(module, tuple_of_classes)
+    root_module_classes = tuple(root_module_to_quantized_reference_module.keys())
+    qat_module_classes = get_qat_module_classes(backend_config)
+    fused_module_classes = get_fused_module_classes(backend_config)
+    statically_quantized_custom_module_nodes: Set[Node] = set()
+
+    for node in list(model.graph.nodes):
+        if node.op == 'placeholder':
+            cur_placeholder_node_idx = placeholder_node_seen_cnt
+            placeholder_node_seen_cnt += 1
+            if cur_placeholder_node_idx in input_quantized_idxs:
+                # Inputs are assumed to be quantized if the user specified the
+                # input_quantized_idxs override.
+                # we need to dequantize the inputs since all operators took
+                # floating point inputs in reference quantized models
+                _insert_dequantize_node(node, model.graph)
+        elif node.op == "output":
+            # If the argument is empty we don't need to do anything
+            if len(output_quantized_idxs) == 0:
+                continue
+            # Results are kept quantized if the user specified the
+            # output_quantized_idxs override.
+            # Remove the dequantize operator for the node in the end if any
+            return_node = node
+            output = node.args[0]
+            # outputs can be a Node, list, tuple, or dict; other cases are not supported yet
+            if isinstance(output, (list, tuple)):
+                for idx in output_quantized_idxs:
+                    _maybe_recursive_remove_dequantize(output[idx], return_node, model.graph)
+            elif isinstance(output, (Node, dict)):
+                # we treat dict as a single argument currently, but it can be extended
+                # to support {"key": dtype} after we change output_quantized_idxs to
+                # dict
+                if 0 in output_quantized_idxs:
+                    _maybe_recursive_remove_dequantize(output, return_node, model.graph)
+            else:
+                warnings.warn(f"Unsupported node type for output_quantized_idxs: {type(output)}")
+        elif node.op == "call_module":
+            mod = _get_module(node, modules)
+            assert mod is not None
+            if _is_activation_post_process(mod):
+                observed_node = node.args[0]
+                if observed_node in statically_quantized_custom_module_nodes:
+                    _replace_observer_or_dequant_stub_with_dequantize_node(node, model.graph)
+                else:
+                    if is_decomposed:
+                        _replace_observer_with_quantize_dequantize_node_decomposed(
+                            model, node, modules, node_name_to_scope,
+                            node_name_to_qconfig)
+                    else:
+                        _replace_observer_with_quantize_dequantize_node(
+                            model, node, modules, node_name_to_scope,
+                            node_name_to_qconfig)
+            elif isinstance(mod, DeQuantStub):
+                _replace_observer_or_dequant_stub_with_dequantize_node(node, model.graph)
+            elif _is_observed_standalone_module(mod):
+                convert_standalone_module(
+                    node, modules, model, is_reference, backend_config)
+            # below this point `type_before_parametrizations` is used
+            # instead of `type` to handle situations with fx quant + sparsity
+            elif type_before_parametrizations(mod) in set(
+                    root_module_classes).union(qat_module_classes).union(fused_module_classes):
+                # extra check for fused module classes to make sure they are fused module classes
+                # of target modules
+                if type_before_parametrizations(mod) in fused_module_classes and \
+                   type_before_parametrizations(mod[0]) not in root_module_classes:  # type: ignore[index]
+                    continue
+                convert_weighted_module(
+                    node, modules, observed_node_names, node_name_to_qconfig, backend_config,
+                    is_decomposed, is_reference)
+            elif type_before_parametrizations(mod) in custom_module_classes:
+                convert_custom_module(
+                    node, model.graph, modules, custom_module_class_mapping,
+                    statically_quantized_custom_module_nodes)
+
+    # remove deadcode after converting observers to quant/dequant ops
+    model.graph.eliminate_dead_code()
+    model = GraphModule(model, model.graph)
+
+    # TODO: maybe move this to quantize_fx.py
+    if not is_reference:
+        model = lower_to_fbgemm(model, node_name_to_qconfig, node_name_to_scope)
+
+    # TODO: this looks hacky, we want to check why we need this and see if we can
+    # remove this
+    # removes qconfig and activation_post_process modules
+    if _remove_qconfig_flag:
+        _remove_qconfig(model)
+    model.delete_all_unused_submodules()
+    model.meta.pop("_observed_graph_module_attrs", None)
+    return model
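+
+# Illustrative sketch (helper not called anywhere): how the convert() pass above is
+# typically reached through the public FX graph mode quantization entry points.
+# `model` and `example_inputs` are assumed to be supplied by the caller.
+def _example_prepare_and_convert(model, example_inputs: Tuple[Any, ...]) -> GraphModule:
+    from torch.ao.quantization import get_default_qconfig_mapping
+    from torch.ao.quantization.quantize_fx import convert_fx, prepare_fx
+
+    qconfig_mapping = get_default_qconfig_mapping("fbgemm")
+    prepared = prepare_fx(model.eval(), qconfig_mapping, example_inputs)
+    prepared(*example_inputs)  # calibration; real callers use representative data
+    # convert_fx eventually dispatches to convert() defined in this file
+    return convert_fx(prepared)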
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/custom_config.py b/MLPY/Lib/site-packages/torch/ao/quantization/fx/custom_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c7cab65279fb41141c271cd6e349f211eaa9281
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/fx/custom_config.py
@@ -0,0 +1,419 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Type
+
+from torch.ao.quantization import QConfigMapping
+from torch.ao.quantization.backend_config import BackendConfig
+from torch.ao.quantization.quant_type import QuantType, _quant_type_from_str, _get_quant_type_to_str
+
+
+__all__ = [
+    "ConvertCustomConfig",
+    "FuseCustomConfig",
+    "PrepareCustomConfig",
+    "StandaloneModuleConfigEntry",
+]
+
+
+# TODO: replace all usages with these constants
+STANDALONE_MODULE_NAME_DICT_KEY = "standalone_module_name"
+STANDALONE_MODULE_CLASS_DICT_KEY = "standalone_module_class"
+FLOAT_TO_OBSERVED_DICT_KEY = "float_to_observed_custom_module_class"
+OBSERVED_TO_QUANTIZED_DICT_KEY = "observed_to_quantized_custom_module_class"
+NON_TRACEABLE_MODULE_NAME_DICT_KEY = "non_traceable_module_name"
+NON_TRACEABLE_MODULE_CLASS_DICT_KEY = "non_traceable_module_class"
+INPUT_QUANTIZED_INDEXES_DICT_KEY = "input_quantized_idxs"
+OUTPUT_QUANTIZED_INDEXES_DICT_KEY = "output_quantized_idxs"
+PRESERVED_ATTRIBUTES_DICT_KEY = "preserved_attributes"
+
+
+@dataclass
+class StandaloneModuleConfigEntry:
+    # qconfig_mapping for the prepare function called in the submodule,
+    # None means use qconfig from parent qconfig_mapping
+    qconfig_mapping: Optional[QConfigMapping]
+    example_inputs: Tuple[Any, ...]
+    prepare_custom_config: Optional[PrepareCustomConfig]
+    backend_config: Optional[BackendConfig]
+
+
+class PrepareCustomConfig:
+    """
+    Custom configuration for :func:`~torch.ao.quantization.quantize_fx.prepare_fx` and
+    :func:`~torch.ao.quantization.quantize_fx.prepare_qat_fx`.
+
+    Example usage::
+
+        prepare_custom_config = PrepareCustomConfig() \
+            .set_standalone_module_name("module1", qconfig_mapping, example_inputs, \
+                child_prepare_custom_config, backend_config) \
+            .set_standalone_module_class(MyStandaloneModule, qconfig_mapping, example_inputs, \
+                child_prepare_custom_config, backend_config) \
+            .set_float_to_observed_mapping(FloatCustomModule, ObservedCustomModule) \
+            .set_non_traceable_module_names(["module2", "module3"]) \
+            .set_non_traceable_module_classes([NonTraceableModule1, NonTraceableModule2]) \
+            .set_input_quantized_indexes([0]) \
+            .set_output_quantized_indexes([0]) \
+            .set_preserved_attributes(["attr1", "attr2"])
+    """
+    def __init__(self):
+        self.standalone_module_names: Dict[str, StandaloneModuleConfigEntry] = {}
+        self.standalone_module_classes: Dict[Type, StandaloneModuleConfigEntry] = {}
+        self.float_to_observed_mapping: Dict[QuantType, Dict[Type, Type]] = {}
+        self.non_traceable_module_names: List[str] = []
+        self.non_traceable_module_classes: List[Type] = []
+        self.input_quantized_indexes: List[int] = []
+        self.output_quantized_indexes: List[int] = []
+        self.preserved_attributes: List[str] = []
+
+    def __repr__(self):
+        dict_nonempty = {
+            k: v for k, v in self.__dict__.items()
+            if len(v) > 0
+        }
+        return f"PrepareCustomConfig({dict_nonempty})"
+
+    def set_standalone_module_name(
+            self,
+            module_name: str,
+            qconfig_mapping: Optional[QConfigMapping],
+            example_inputs: Tuple[Any, ...],
+            prepare_custom_config: Optional[PrepareCustomConfig],
+            backend_config: Optional[BackendConfig]) -> PrepareCustomConfig:
+        """
+        Set the configuration for running a standalone module identified by ``module_name``.
+
+        If ``qconfig_mapping`` is None, the parent ``qconfig_mapping`` will be used instead.
+        If ``prepare_custom_config`` is None, an empty ``PrepareCustomConfig`` will be used.
+        If ``backend_config`` is None, the parent ``backend_config`` will be used instead.
+        """
+        self.standalone_module_names[module_name] = \
+            StandaloneModuleConfigEntry(qconfig_mapping, example_inputs, prepare_custom_config, backend_config)
+        return self
+
+    def set_standalone_module_class(
+            self,
+            module_class: Type,
+            qconfig_mapping: Optional[QConfigMapping],
+            example_inputs: Tuple[Any, ...],
+            prepare_custom_config: Optional[PrepareCustomConfig],
+            backend_config: Optional[BackendConfig]) -> PrepareCustomConfig:
+        """
+        Set the configuration for running a standalone module identified by ``module_class``.
+
+        If ``qconfig_mapping`` is None, the parent ``qconfig_mapping`` will be used instead.
+        If ``prepare_custom_config`` is None, an empty ``PrepareCustomConfig`` will be used.
+        If ``backend_config`` is None, the parent ``backend_config`` will be used instead.
+        """
+        self.standalone_module_classes[module_class] = \
+            StandaloneModuleConfigEntry(qconfig_mapping, example_inputs, prepare_custom_config, backend_config)
+        return self
+
+    def set_float_to_observed_mapping(
+            self,
+            float_class: Type,
+            observed_class: Type,
+            quant_type: QuantType = QuantType.STATIC) -> PrepareCustomConfig:
+        """
+        Set the mapping from a custom float module class to a custom observed module class.
+
+        The observed module class must have a ``from_float`` class method that converts the float module class
+        to the observed module class. This is currently only supported for static quantization.
+        """
+        if quant_type != QuantType.STATIC:
+            raise ValueError("set_float_to_observed_mapping is currently only supported for static quantization")
+        if quant_type not in self.float_to_observed_mapping:
+            self.float_to_observed_mapping[quant_type] = {}
+        self.float_to_observed_mapping[quant_type][float_class] = observed_class
+        return self
+
+    def set_non_traceable_module_names(self, module_names: List[str]) -> PrepareCustomConfig:
+        """
+        Set the modules that are not symbolically traceable, identified by name.
+        """
+        self.non_traceable_module_names = module_names
+        return self
+
+    def set_non_traceable_module_classes(self, module_classes: List[Type]) -> PrepareCustomConfig:
+        """
+        Set the modules that are not symbolically traceable, identified by class.
+        """
+        self.non_traceable_module_classes = module_classes
+        return self
+
+    def set_input_quantized_indexes(self, indexes: List[int]) -> PrepareCustomConfig:
+        """
+        Set the indexes of the inputs of the graph that should be quantized.
+        Inputs are otherwise assumed to be in fp32 by default.
+        """
+        self.input_quantized_indexes = indexes
+        return self
+
+    def set_output_quantized_indexes(self, indexes: List[int]) -> PrepareCustomConfig:
+        """
+        Set the indexes of the outputs of the graph that should be quantized.
+        Outputs are otherwise assumed to be in fp32 by default.
+        """
+        self.output_quantized_indexes = indexes
+        return self
+
+    def set_preserved_attributes(self, attributes: List[str]) -> PrepareCustomConfig:
+        """
+        Set the names of the attributes that will persist in the graph module even if they are not used in
+        the model's ``forward`` method.
+        """
+        self.preserved_attributes = attributes
+        return self
+
+    # TODO: remove this
+    @classmethod
+    def from_dict(cls, prepare_custom_config_dict: Dict[str, Any]) -> PrepareCustomConfig:
+        """
+        Create a ``PrepareCustomConfig`` from a dictionary with the following items:
+
+            "standalone_module_name": a list of (module_name, qconfig_mapping, example_inputs,
+            child_prepare_custom_config, backend_config) tuples
+
+            "standalone_module_class" a list of (module_class, qconfig_mapping, example_inputs,
+            child_prepare_custom_config, backend_config) tuples
+
+            "float_to_observed_custom_module_class": a nested dictionary mapping from quantization
+            mode to an inner mapping from float module classes to observed module classes, e.g.
+            {"static": {FloatCustomModule: ObservedCustomModule}}
+
+            "non_traceable_module_name": a list of modules names that are not symbolically traceable
+            "non_traceable_module_class": a list of module classes that are not symbolically traceable
+            "input_quantized_idxs": a list of indexes of graph inputs that should be quantized
+            "output_quantized_idxs": a list of indexes of graph outputs that should be quantized
+            "preserved_attributes": a list of attributes that persist even if they are not used in ``forward``
+
+        This function is primarily for backward compatibility and may be removed in the future.
+        """
+        def _get_qconfig_mapping(obj: Any, dict_key: str) -> Optional[QConfigMapping]:
+            """
+            Convert the given object into a QConfigMapping if possible, else throw an exception.
+            """
+            if isinstance(obj, QConfigMapping) or obj is None:
+                return obj
+            if isinstance(obj, Dict):
+                return QConfigMapping.from_dict(obj)
+            raise ValueError(f"Expected QConfigMapping in prepare_custom_config_dict[\"{dict_key}\"], got '{type(obj)}'")
+
+        def _get_prepare_custom_config(obj: Any, dict_key: str) -> Optional[PrepareCustomConfig]:
+            """
+            Convert the given object into a PrepareCustomConfig if possible, else throw an exception.
+            """
+            if isinstance(obj, PrepareCustomConfig) or obj is None:
+                return obj
+            if isinstance(obj, Dict):
+                return PrepareCustomConfig.from_dict(obj)
+            raise ValueError(f"Expected PrepareCustomConfig in prepare_custom_config_dict[\"{dict_key}\"], got '{type(obj)}'")
+
+        def _get_backend_config(obj: Any, dict_key: str) -> Optional[BackendConfig]:
+            """
+            Convert the given object into a BackendConfig if possible, else throw an exception.
+            """
+            if isinstance(obj, BackendConfig) or obj is None:
+                return obj
+            if isinstance(obj, Dict):
+                return BackendConfig.from_dict(obj)
+            raise ValueError(f"Expected BackendConfig in prepare_custom_config_dict[\"{dict_key}\"], got '{type(obj)}'")
+
+        conf = cls()
+        for (module_name, qconfig_dict, example_inputs, _prepare_custom_config_dict, backend_config_dict) in\
+                prepare_custom_config_dict.get(STANDALONE_MODULE_NAME_DICT_KEY, []):
+            qconfig_mapping = _get_qconfig_mapping(qconfig_dict, STANDALONE_MODULE_NAME_DICT_KEY)
+            prepare_custom_config = _get_prepare_custom_config(_prepare_custom_config_dict, STANDALONE_MODULE_NAME_DICT_KEY)
+            backend_config = _get_backend_config(backend_config_dict, STANDALONE_MODULE_NAME_DICT_KEY)
+            conf.set_standalone_module_name(
+                module_name, qconfig_mapping, example_inputs, prepare_custom_config, backend_config)
+        for (module_class, qconfig_dict, example_inputs, _prepare_custom_config_dict, backend_config_dict) in\
+                prepare_custom_config_dict.get(STANDALONE_MODULE_CLASS_DICT_KEY, []):
+            qconfig_mapping = _get_qconfig_mapping(qconfig_dict, STANDALONE_MODULE_CLASS_DICT_KEY)
+            prepare_custom_config = _get_prepare_custom_config(_prepare_custom_config_dict, STANDALONE_MODULE_CLASS_DICT_KEY)
+            backend_config = _get_backend_config(backend_config_dict, STANDALONE_MODULE_CLASS_DICT_KEY)
+            conf.set_standalone_module_class(
+                module_class, qconfig_mapping, example_inputs, prepare_custom_config, backend_config)
+        for quant_type_name, custom_module_mapping in prepare_custom_config_dict.get(FLOAT_TO_OBSERVED_DICT_KEY, {}).items():
+            quant_type = _quant_type_from_str(quant_type_name)
+            for float_class, observed_class in custom_module_mapping.items():
+                conf.set_float_to_observed_mapping(float_class, observed_class, quant_type)
+        conf.set_non_traceable_module_names(prepare_custom_config_dict.get(NON_TRACEABLE_MODULE_NAME_DICT_KEY, []))
+        conf.set_non_traceable_module_classes(prepare_custom_config_dict.get(NON_TRACEABLE_MODULE_CLASS_DICT_KEY, []))
+        conf.set_input_quantized_indexes(prepare_custom_config_dict.get(INPUT_QUANTIZED_INDEXES_DICT_KEY, []))
+        conf.set_output_quantized_indexes(prepare_custom_config_dict.get(OUTPUT_QUANTIZED_INDEXES_DICT_KEY, []))
+        conf.set_preserved_attributes(prepare_custom_config_dict.get(PRESERVED_ATTRIBUTES_DICT_KEY, []))
+        return conf
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Convert this ``PrepareCustomConfig`` to a dictionary with the items described in
+        :func:`~torch.ao.quantization.fx.custom_config.PrepareCustomConfig.from_dict`.
+        """
+        def _make_tuple(key: Any, e: StandaloneModuleConfigEntry):
+            qconfig_dict = e.qconfig_mapping.to_dict() if e.qconfig_mapping else None
+            prepare_custom_config_dict = e.prepare_custom_config.to_dict() if e.prepare_custom_config else None
+            return (key, qconfig_dict, e.example_inputs, prepare_custom_config_dict, e.backend_config)
+
+        d: Dict[str, Any] = {}
+        for module_name, sm_config_entry in self.standalone_module_names.items():
+            if STANDALONE_MODULE_NAME_DICT_KEY not in d:
+                d[STANDALONE_MODULE_NAME_DICT_KEY] = []
+            d[STANDALONE_MODULE_NAME_DICT_KEY].append(_make_tuple(module_name, sm_config_entry))
+        for module_class, sm_config_entry in self.standalone_module_classes.items():
+            if STANDALONE_MODULE_CLASS_DICT_KEY not in d:
+                d[STANDALONE_MODULE_CLASS_DICT_KEY] = []
+            d[STANDALONE_MODULE_CLASS_DICT_KEY].append(_make_tuple(module_class, sm_config_entry))
+        for quant_type, float_to_observed_mapping in self.float_to_observed_mapping.items():
+            if FLOAT_TO_OBSERVED_DICT_KEY not in d:
+                d[FLOAT_TO_OBSERVED_DICT_KEY] = {}
+            d[FLOAT_TO_OBSERVED_DICT_KEY][_get_quant_type_to_str(quant_type)] = float_to_observed_mapping
+        if len(self.non_traceable_module_names) > 0:
+            d[NON_TRACEABLE_MODULE_NAME_DICT_KEY] = self.non_traceable_module_names
+        if len(self.non_traceable_module_classes) > 0:
+            d[NON_TRACEABLE_MODULE_CLASS_DICT_KEY] = self.non_traceable_module_classes
+        if len(self.input_quantized_indexes) > 0:
+            d[INPUT_QUANTIZED_INDEXES_DICT_KEY] = self.input_quantized_indexes
+        if len(self.output_quantized_indexes) > 0:
+            d[OUTPUT_QUANTIZED_INDEXES_DICT_KEY] = self.output_quantized_indexes
+        if len(self.preserved_attributes) > 0:
+            d[PRESERVED_ATTRIBUTES_DICT_KEY] = self.preserved_attributes
+        return d
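+
+
+# Illustrative sketch (hypothetical values): the dictionary form accepted by
+# PrepareCustomConfig.from_dict and produced by to_dict for the simpler keys;
+# round-tripping this dict through from_dict(...).to_dict() returns an equal dict.
+_EXAMPLE_PREPARE_CUSTOM_CONFIG_DICT = {
+    NON_TRACEABLE_MODULE_NAME_DICT_KEY: ["submodule_to_skip"],
+    INPUT_QUANTIZED_INDEXES_DICT_KEY: [0],
+    OUTPUT_QUANTIZED_INDEXES_DICT_KEY: [0],
+    PRESERVED_ATTRIBUTES_DICT_KEY: ["version"],
+}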
+
+
+class ConvertCustomConfig:
+    """
+    Custom configuration for :func:`~torch.ao.quantization.quantize_fx.convert_fx`.
+
+    Example usage::
+
+        convert_custom_config = ConvertCustomConfig() \
+            .set_observed_to_quantized_mapping(ObservedCustomModule, QuantizedCustomModule) \
+            .set_preserved_attributes(["attr1", "attr2"])
+    """
+
+    def __init__(self):
+        self.observed_to_quantized_mapping: Dict[QuantType, Dict[Type, Type]] = {}
+        self.preserved_attributes: List[str] = []
+
+    def __repr__(self):
+        dict_nonempty = {
+            k: v for k, v in self.__dict__.items()
+            if len(v) > 0
+        }
+        return f"ConvertCustomConfig({dict_nonempty})"
+
+    def set_observed_to_quantized_mapping(
+            self,
+            observed_class: Type,
+            quantized_class: Type,
+            quant_type: QuantType = QuantType.STATIC) -> ConvertCustomConfig:
+        """
+        Set the mapping from a custom observed module class to a custom quantized module class.
+
+        The quantized module class must have a ``from_observed`` class method that converts the observed module class
+        to the quantized module class.
+        """
+        if quant_type not in self.observed_to_quantized_mapping:
+            self.observed_to_quantized_mapping[quant_type] = {}
+        self.observed_to_quantized_mapping[quant_type][observed_class] = quantized_class
+        return self
+
+    def set_preserved_attributes(self, attributes: List[str]) -> ConvertCustomConfig:
+        """
+        Set the names of the attributes that will persist in the graph module even if they are not used in
+        the model's ``forward`` method.
+        """
+        self.preserved_attributes = attributes
+        return self
+
+    # TODO: remove this
+    @classmethod
+    def from_dict(cls, convert_custom_config_dict: Dict[str, Any]) -> ConvertCustomConfig:
+        """
+        Create a ``ConvertCustomConfig`` from a dictionary with the following items:
+
+            "observed_to_quantized_custom_module_class": a nested dictionary mapping from quantization
+            mode to an inner mapping from observed module classes to quantized module classes, e.g.::
+
+                {
+                    "static": {ObservedCustomModule: QuantizedCustomModule},
+                    "dynamic": {ObservedCustomModule: QuantizedCustomModule},
+                    "weight_only": {ObservedCustomModule: QuantizedCustomModule}
+                }
+
+            "preserved_attributes": a list of attributes that persist even if they are not used in ``forward``
+
+        This function is primarily for backward compatibility and may be removed in the future.
+        """
+        conf = cls()
+        for quant_type_name, custom_module_mapping in convert_custom_config_dict.get(OBSERVED_TO_QUANTIZED_DICT_KEY, {}).items():
+            quant_type = _quant_type_from_str(quant_type_name)
+            for observed_class, quantized_class in custom_module_mapping.items():
+                conf.set_observed_to_quantized_mapping(observed_class, quantized_class, quant_type)
+        conf.set_preserved_attributes(convert_custom_config_dict.get(PRESERVED_ATTRIBUTES_DICT_KEY, []))
+        return conf
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Convert this ``ConvertCustomConfig`` to a dictionary with the items described in
+        :func:`~torch.ao.quantization.fx.custom_config.ConvertCustomConfig.from_dict`.
+        """
+        d: Dict[str, Any] = {}
+        for quant_type, observed_to_quantized_mapping in self.observed_to_quantized_mapping.items():
+            if OBSERVED_TO_QUANTIZED_DICT_KEY not in d:
+                d[OBSERVED_TO_QUANTIZED_DICT_KEY] = {}
+            d[OBSERVED_TO_QUANTIZED_DICT_KEY][_get_quant_type_to_str(quant_type)] = observed_to_quantized_mapping
+        if len(self.preserved_attributes) > 0:
+            d[PRESERVED_ATTRIBUTES_DICT_KEY] = self.preserved_attributes
+        return d
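+
+
+# Illustrative sketch (helper not called anywhere): wiring a ConvertCustomConfig into
+# the public convert_fx entry point so the custom module swap in convert.py uses the
+# mapping; `observed_class` and `quantized_class` are assumed to be user-defined.
+def _example_convert_with_custom_module(prepared_model, observed_class: Type, quantized_class: Type):
+    from torch.ao.quantization.quantize_fx import convert_fx
+
+    convert_custom_config = ConvertCustomConfig() \
+        .set_observed_to_quantized_mapping(observed_class, quantized_class)
+    return convert_fx(prepared_model, convert_custom_config=convert_custom_config)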
+
+
+class FuseCustomConfig:
+    """
+    Custom configuration for :func:`~torch.ao.quantization.quantize_fx.fuse_fx`.
+
+    Example usage::
+
+        fuse_custom_config = FuseCustomConfig().set_preserved_attributes(["attr1", "attr2"])
+    """
+
+    def __init__(self):
+        self.preserved_attributes: List[str] = []
+
+    def __repr__(self):
+        dict_nonempty = {
+            k: v for k, v in self.__dict__.items()
+            if len(v) > 0
+        }
+        return f"FuseCustomConfig({dict_nonempty})"
+
+    def set_preserved_attributes(self, attributes: List[str]) -> FuseCustomConfig:
+        """
+        Set the names of the attributes that will persist in the graph module even if they are not used in
+        the model's ``forward`` method.
+        """
+        self.preserved_attributes = attributes
+        return self
+
+    # TODO: remove this
+    @classmethod
+    def from_dict(cls, fuse_custom_config_dict: Dict[str, Any]) -> FuseCustomConfig:
+        """
+        Create a ``FuseCustomConfig`` from a dictionary with the following items:
+
+            "preserved_attributes": a list of attributes that persist even if they are not used in ``forward``
+
+        This function is primarily for backward compatibility and may be removed in the future.
+        """
+        conf = cls()
+        conf.set_preserved_attributes(fuse_custom_config_dict.get(PRESERVED_ATTRIBUTES_DICT_KEY, []))
+        return conf
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Convert this ``FuseCustomConfig`` to a dictionary with the items described in
+        :func:`~torch.ao.quantization.fx.custom_config.FuseCustomConfig.from_dict`.
+        """
+        d: Dict[str, Any] = {}
+        if len(self.preserved_attributes) > 0:
+            d[PRESERVED_ATTRIBUTES_DICT_KEY] = self.preserved_attributes
+        return d
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/fuse.py b/MLPY/Lib/site-packages/torch/ao/quantization/fx/fuse.py
new file mode 100644
index 0000000000000000000000000000000000000000..00c17062c1858f28b010cb1ac77d222d06182ba4
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/fx/fuse.py
@@ -0,0 +1,161 @@
+from torch.fx import (
+    GraphModule,
+    Node,
+    map_arg
+)
+from torch.fx.graph import Graph
+from .match_utils import (
+    _is_match,
+    MatchAllNode,
+)
+from .pattern_utils import (
+    _sorted_patterns_dict,
+)
+
+from ..backend_config import (
+    BackendConfig,
+    get_native_backend_config,
+)
+from ..backend_config.utils import (
+    get_fuser_method_mapping,
+    get_fusion_pattern_to_root_node_getter,
+    get_fusion_pattern_to_extra_inputs_getter,
+)
+
+from .custom_config import FuseCustomConfig
+
+from .fuse_handler import (
+    _get_fusion_pattern_to_fuse_handler_cls,
+    FuseHandler,
+)
+
+from typing import Any, Callable, Dict, List, Tuple, Union
+import warnings
+
+from torch.ao.quantization.utils import Pattern, NodePattern
+
+
+__all__ = [
+    "fuse",
+    # TODO: We should make this private in the future
+    # This is currently needed for test_public_bindings for some reason
+    "FuseHandler",
+]
+
+
+def fuse(
+    model: GraphModule,
+    is_qat: bool,
+    fuse_custom_config: Union[FuseCustomConfig, Dict[str, Any], None] = None,
+    backend_config: Union[BackendConfig, Dict[str, Any], None] = None,
+) -> GraphModule:
+    if fuse_custom_config is None:
+        fuse_custom_config = FuseCustomConfig()
+
+    if isinstance(fuse_custom_config, Dict):
+        warnings.warn(
+            "Passing a fuse_custom_config_dict to fuse is deprecated and will not be supported "
+            "in a future version. Please pass in a FuseCustomConfig instead.")
+        fuse_custom_config = FuseCustomConfig.from_dict(fuse_custom_config)
+
+    if isinstance(backend_config, Dict):
+        warnings.warn(
+            "Passing a backend_config_dict to prepare is deprecated and will not be supported "
+            "in a future version. Please pass in a BackendConfig instead.")
+        backend_config = BackendConfig.from_dict(backend_config)
+
+    named_modules = dict(model.named_modules())
+
+    if backend_config is None:
+        backend_config = get_native_backend_config()
+
+    fusion_pattern_to_fuse_handler_cls = _sorted_patterns_dict(_get_fusion_pattern_to_fuse_handler_cls(backend_config))
+    fuser_method_mapping = get_fuser_method_mapping(backend_config)
+    fusion_pattern_to_root_node_getter = get_fusion_pattern_to_root_node_getter(backend_config)
+    fusion_pattern_to_extra_inputs_getter = get_fusion_pattern_to_extra_inputs_getter(backend_config)
+
+    # find fusion
+    fusion_pairs = _find_matches(
+        model, model.graph, fusion_pattern_to_fuse_handler_cls)
+    # TODO: change this to inplace changes to graph, since we no longer construct
+    # new GraphModule anymore
+    fused_graph = Graph()
+    env: Dict[Any, Any] = {}
+
+    def load_arg(a):
+        return map_arg(a, lambda node: env[node.name])
+
+    def default_root_node_getter(node_pattern):
+        while not isinstance(node_pattern[-1], Node):
+            node_pattern = node_pattern[-1]
+        return node_pattern[-1]
+
+    for node in model.graph.nodes:
+        maybe_last_node, pattern, matched_node_pattern, obj, node_to_subpattern = \
+            fusion_pairs.get(node.name, (None, None, None, None, None))
+        # get the corresponding subpattern for the current node
+        if node_to_subpattern is not None:
+            node_subpattern = node_to_subpattern.get(node, None)
+        else:
+            node_subpattern = None
+        if maybe_last_node is node:
+            assert obj is not None
+            root_node_getter = fusion_pattern_to_root_node_getter.get(pattern, default_root_node_getter)
+            root_node = root_node_getter(matched_node_pattern)  # type: ignore[index]
+            extra_inputs_getter = fusion_pattern_to_extra_inputs_getter.get(pattern, None)
+            extra_inputs = []
+            if extra_inputs_getter is not None:
+                extra_inputs = extra_inputs_getter(matched_node_pattern)
+            # TODO: add validation that root_node is a module and has the same type
+            # as the root_module in the configuration
+            env[node.name] = obj.fuse(
+                load_arg, named_modules, fused_graph, root_node, extra_inputs, matched_node_pattern,  # type: ignore[arg-type]
+                fuse_custom_config, fuser_method_mapping, is_qat)
+        elif maybe_last_node is None or node_subpattern is MatchAllNode:
+            env[node.name] = fused_graph.node_copy(node, load_arg)
+        # node matched in patterns and is not root is removed here
+
+    model = GraphModule(model, fused_graph)
+    return model
+
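+# Illustrative sketch (helper not called anywhere): the public entry point that runs
+# the fuse() pass above; `model` is assumed to be a float module used in eval mode.
+def _example_fuse_flow(model):
+    from torch.ao.quantization.quantize_fx import fuse_fx
+
+    # fuse_fx symbolically traces the model and then applies fuse() with the
+    # native backend config
+    return fuse_fx(model.eval())
+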
+def _find_matches(
+        root: GraphModule,
+        graph: Graph,
+        pattern_to_fuse_handler_cls: Dict[Pattern, Callable],
+) -> Dict[str, Tuple[Node, Pattern, NodePattern, FuseHandler, Dict[Node, Any]]]:
+    modules = dict(root.named_modules())
+    # node name -> (root_node, match_value)
+    match_map : Dict[
+        str, Tuple[Node, Pattern, NodePattern, FuseHandler, Dict[Node, Any]]] = {}
+    # a map from node to the matched subpattern
+    node_to_subpattern: Dict[Node, Any] = {}
+
+    # TODO: dedup with quantization matching function in match_utils.py
+    def apply_match(pattern, node, match, matched_node_pattern, node_to_subpattern):
+        if isinstance(pattern, tuple):
+            s, *args = pattern
+            current_node_pattern: List[Node] = []
+            apply_match(s, node, match, current_node_pattern, node_to_subpattern)
+            for subpattern, arg in zip(args, node.args):
+                apply_match(subpattern, arg, match, current_node_pattern, node_to_subpattern)
+            matched_node_pattern.append(tuple(current_node_pattern))
+        else:
+            # the first pattern matches will take precedence
+            if node.name not in match_map:
+                matched_node_pattern.append(node)
+                # MatchAllNode here is actually MatchAllInputNode which should not
+                # be added to match_map
+                if pattern is not MatchAllNode:
+                    node_to_subpattern[node] = pattern
+                    root_node, pattern, handler = match
+                    match_map[node.name] = (root_node, pattern, matched_node_pattern, handler, node_to_subpattern)
+
+    for node in reversed(graph.nodes):
+        if node.name not in match_map:
+            for pattern, fuse_handler_cls in pattern_to_fuse_handler_cls.items():
+                matched_node_pattern: List[Node] = []
+                if _is_match(modules, node, pattern):
+                    apply_match(pattern, node, (node, pattern, fuse_handler_cls(node)), matched_node_pattern, node_to_subpattern)
+                    break
+
+    return match_map
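+
+# Illustrative sketch: the reversed nested tuple pattern format consumed by
+# _find_matches and _is_match; e.g. a Conv2d -> BatchNorm2d -> ReLU fusion is
+# expressed with the last op first.
+import torch
+
+_EXAMPLE_FUSION_PATTERN: Pattern = (torch.nn.ReLU, (torch.nn.BatchNorm2d, torch.nn.Conv2d))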
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/fuse_handler.py b/MLPY/Lib/site-packages/torch/ao/quantization/fx/fuse_handler.py
new file mode 100644
index 0000000000000000000000000000000000000000..8752d4fe2d4486dd82a75e6c5e531e2efa888842
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/fx/fuse_handler.py
@@ -0,0 +1,120 @@
+import torch
+from torch.ao.quantization.backend_config import BackendConfig
+from torch.fx.graph import Node, Graph
+from ..utils import _parent_name, NodePattern, Pattern
+from ..fuser_method_mappings import get_fuser_method_new
+from abc import ABC, abstractmethod
+from typing import Any, Callable, Dict, List, Union
+from .custom_config import FuseCustomConfig
+from .match_utils import MatchAllNode
+from torch.nn.utils.parametrize import type_before_parametrizations
+
+__all__ = [
+    "DefaultFuseHandler",
+    "FuseHandler",
+]
+
+
+# ----------------------------
+# Fusion Pattern Registrations
+# ----------------------------
+
+# Base Pattern Handler
+class FuseHandler(ABC):
+    """ Base handler class for the fusion patterns
+    """
+    @abstractmethod
+    def __init__(self, node: Node):
+        pass
+
+    @abstractmethod
+    def fuse(self,
+             load_arg: Callable,
+             named_modules: Dict[str, torch.nn.Module],
+             fused_graph: Graph,
+             root_node: Node,
+             extra_inputs: List[Any],
+             matched_node_pattern: NodePattern,
+             fuse_custom_config: FuseCustomConfig,
+             fuser_method_mapping: Dict[Pattern, Union[torch.nn.Sequential, Callable]],
+             is_qat: bool) -> Node:
+        pass
+
+class DefaultFuseHandler(FuseHandler):
+    def __init__(
+            self,
+            node: Node):
+        super().__init__(node)
+
+    def fuse(self,
+             load_arg: Callable,
+             named_modules: Dict[str, torch.nn.Module],
+             fused_graph: Graph,
+             root_node: Node,
+             extra_inputs: List[Any],
+             matched_node_pattern: NodePattern,
+             fuse_custom_config: FuseCustomConfig,
+             fuser_method_mapping: Dict[Pattern, Union[torch.nn.Sequential, Callable]],
+             is_qat: bool) -> Node:
+        assert root_node.op == "call_module", "Expecting module node to be a call_module Node"
+        root_module = named_modules[str(root_node.target)]
+
+        def get_modules(pattern):
+            """ Given a node pattern, extract the corresponding modules
+            e.g. input: (relu_node, (bn_node, conv_node))
+                 output: (relu_module, (bn_module, conv_module))
+            """
+            if isinstance(pattern, (tuple, list)):
+                n, *args = pattern
+                modules: List[torch.nn.Module] = []
+                modules.append(get_modules(n))
+                for a in args:
+                    modules.append(get_modules(a))
+                return tuple(modules)
+            else:
+                n = pattern
+                if n.op == "call_module":
+                    return named_modules[n.target]
+                elif n.op == "call_function" and n.target == torch.nn.functional.relu:
+                    relu = torch.nn.ReLU()
+                    relu.training = root_module.training
+                    return relu
+                elif n.op == "call_function" or n.op == "call_method":
+                    return n.target
+                else:
+                    return MatchAllNode
+
+        # since relu can be used multiple times, we'll need to create a relu module for each match
+        matched_modules = get_modules(matched_node_pattern)
+
+        def get_matched_types(m):
+            if isinstance(m, tuple):
+                return tuple(map(get_matched_types, m))
+            if isinstance(m, torch.nn.Module):
+                return type_before_parametrizations(m)
+            return m
+
+        matched_module_types = get_matched_types(matched_modules)
+        module_parent_name, module_name = _parent_name(root_node.target)
+        fuser_method = get_fuser_method_new(matched_module_types, fuser_method_mapping)
+        # TODO: change the signature for fuser_method to take matched module patterns
+        # as input
+        fused_module = fuser_method(is_qat, *matched_modules)
+        setattr(named_modules[module_parent_name], module_name, fused_module)
+        extra_args = []
+        for input in extra_inputs:
+            extra_args.append(load_arg(input))
+        node = fused_graph.node_copy(root_node, load_arg)
+        args = list(node.args)
+        args.extend(extra_args)
+        node.args = tuple(args)
+        return node
+
+def _get_fusion_pattern_to_fuse_handler_cls(
+        backend_config: BackendConfig) -> Dict[Pattern, Callable]:
+    fusion_pattern_to_fuse_handlers: Dict[Pattern, Callable] = {}
+    for pattern, config in backend_config._pattern_complex_format_to_config.items():
+        if config.fuser_method is not None:
+            # TODO: is this logic right?
+            fusion_pattern_to_fuse_handlers[pattern] = DefaultFuseHandler
+    return fusion_pattern_to_fuse_handlers
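+
+# Illustrative sketch (hypothetical fuser method, not registered anywhere): the callable
+# shape looked up via get_fuser_method_new and invoked above as
+# `fuser_method(is_qat, *matched_modules)`. Real fuser methods build intrinsic fused
+# modules such as torch.ao.nn.intrinsic.ConvReLU2d; this sketch only shows the signature.
+def _example_fuser_method(is_qat: bool, *matched_modules: torch.nn.Module) -> torch.nn.Module:
+    del is_qat  # QAT-specific handling is out of scope for this sketch
+    return torch.nn.Sequential(*matched_modules)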
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/graph_module.py b/MLPY/Lib/site-packages/torch/ao/quantization/fx/graph_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..c17701b589a6689eceb27677888ecdeb519ff13b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/fx/graph_module.py
@@ -0,0 +1,119 @@
+import torch
+import copy
+from torch.fx import GraphModule
+from torch.fx.graph import Graph
+from typing import Union, Dict, Any, Set
+
+__all__ = [
+    "FusedGraphModule",
+    "ObservedGraphModule",
+    "ObservedStandaloneGraphModule",
+    "QuantizedGraphModule",
+]
+
+class FusedGraphModule(GraphModule):
+    def __init__(self, root: Union[torch.nn.Module, Dict[str, Any]], graph: Graph, preserved_attr_names: Set[str]):
+        self.preserved_attr_names = preserved_attr_names
+        preserved_attrs = {attr: getattr(root, attr) for attr in self.preserved_attr_names if hasattr(root, attr)}
+        super().__init__(root, graph)
+        for attr in preserved_attrs:
+            setattr(self, attr, preserved_attrs[attr])
+
+    # GraphModule does not copy attributes which are not in the __dict__
+    # of vanilla nn.Module.  So, we override __deepcopy__ in order
+    # to copy the quantization specific attributes correctly.
+    def __deepcopy__(self, memo):
+        fake_mod = torch.nn.Module()
+        fake_mod.__dict__ = copy.deepcopy(self.__dict__)
+        return FusedGraphModule(fake_mod, copy.deepcopy(self.graph), copy.deepcopy(self.preserved_attr_names))
+
+class ObservedGraphModule(GraphModule):
+
+    def __init__(self, root: Union[torch.nn.Module, Dict[str, Any]], graph: Graph, preserved_attr_names: Set[str]):
+        self.preserved_attr_names = {
+            '_activation_post_process_map',
+            '_activation_post_process_indexes',
+            '_patterns',
+            '_node_name_to_qconfig',
+            '_prepare_custom_config',
+            '_equalization_node_name_to_qconfig',
+            '_node_name_to_scope',
+            '_qconfig_mapping',
+            '_is_qat',
+            '_observed_node_names'}.union(preserved_attr_names)
+        preserved_attrs = {attr: getattr(root, attr) for attr in self.preserved_attr_names if hasattr(root, attr)}
+        super().__init__(root, graph)
+        for attr in preserved_attrs:
+            setattr(self, attr, preserved_attrs[attr])
+
+    # GraphModule does not copy attributes which are not in the __dict__
+    # of vanilla nn.Module.  So, we override __deepcopy__ in order
+    # to copy the quantization specific attributes correctly.
+    def __deepcopy__(self, memo):
+        fake_mod = torch.nn.Module()
+        fake_mod.__dict__ = copy.deepcopy(self.__dict__)
+        return ObservedGraphModule(fake_mod, copy.deepcopy(self.graph), copy.deepcopy(self.preserved_attr_names))
+
+def _is_observed_module(module: Any) -> bool:
+    return hasattr(module, "meta") and "_observed_graph_module_attrs" in module.meta
+
+def _get_observed_graph_module_attr(model: Union[torch.nn.Module, GraphModule], attr_name: str) -> Any:
+    if hasattr(model, "meta") and "_observed_graph_module_attrs" in model.meta:  # type: ignore[operator, index]
+        return getattr(model.meta["_observed_graph_module_attrs"], attr_name)  # type: ignore[index]
+    return None
+
+class ObservedStandaloneGraphModule(ObservedGraphModule):
+    def __init__(self, root: Union[torch.nn.Module, Dict[str, Any]], graph: Graph, preserved_attr_names: Set[str]):
+        preserved_attr_names = preserved_attr_names.union({
+            "_standalone_module_input_quantized_idxs",
+            "_standalone_module_output_quantized_idxs"})
+        super().__init__(root, graph, preserved_attr_names)
+
+    def __deepcopy__(self, memo):
+        fake_mod = torch.nn.Module()
+        fake_mod.__dict__ = copy.deepcopy(self.__dict__)
+        return ObservedStandaloneGraphModule(fake_mod, copy.deepcopy(self.graph), copy.deepcopy(self.preserved_attr_names))
+
+def _is_observed_standalone_module(module: Any) -> bool:
+    return _is_observed_module(module) and module.meta["_observed_graph_module_attrs"].is_observed_standalone_module
+
+def _save_packed_weight(self, destination, prefix, keep_vars):
+    for attr_name in dir(self):
+        if "_packed_weight" in attr_name and \
+           isinstance(getattr(self, attr_name), torch._C.ScriptObject):  # type: ignore[attr-defined]
+            packed_weight = getattr(self, attr_name)
+            destination[prefix + attr_name] = packed_weight
+
+class QuantizedGraphModule(GraphModule):
+    """ This class is created to make sure PackedParams
+    (e.g. LinearPackedParams, Conv2dPackedParams) to appear in state_dict
+    so that we can serialize and deserialize quantized graph module with
+    torch.save(m.state_dict()) and m.load_state_dict(state_dict)
+    """
+    def __init__(self, root: Union[torch.nn.Module, Dict[str, Any]], graph: Graph, preserved_attr_names: Set[str]):
+        self.preserved_attr_names = preserved_attr_names
+        preserved_attrs = {attr: getattr(root, attr) for attr in self.preserved_attr_names if hasattr(root, attr)}
+        super().__init__(root, graph)
+        for attr in preserved_attrs:
+            setattr(self, attr, preserved_attrs[attr])
+        self._register_state_dict_hook(_save_packed_weight)
+
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        attrs_to_pop = []
+        for attr_name in state_dict:
+            if attr_name.startswith("_packed_weight") and isinstance(state_dict[attr_name], torch._C.ScriptObject):  # type: ignore[attr-defined] # noqa: B950
+                setattr(self, attr_name, state_dict[attr_name])
+                attrs_to_pop.append(attr_name)
+
+        # pop the packed param attributes
+        for attr_name in attrs_to_pop:
+            state_dict.pop(attr_name)
+
+        super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)
+
+
+    def __deepcopy__(self, memo):
+        fake_mod = torch.nn.Module()
+        fake_mod.__dict__ = copy.deepcopy(self.__dict__)
+        return QuantizedGraphModule(fake_mod, copy.deepcopy(self.graph), copy.deepcopy(self.preserved_attr_names))
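+
+# Illustrative sketch (helper not called anywhere): the serialization round trip enabled
+# by the packed-weight state_dict hooks above; `quantized` is assumed to be a
+# QuantizedGraphModule produced by the lowering pass and `path` a writable file path.
+def _example_save_and_reload(quantized: QuantizedGraphModule, path: str) -> QuantizedGraphModule:
+    torch.save(quantized.state_dict(), path)   # packed params are included by _save_packed_weight
+    state_dict = torch.load(path)
+    quantized.load_state_dict(state_dict)      # packed params are restored in _load_from_state_dict
+    return quantized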
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/lower_to_fbgemm.py b/MLPY/Lib/site-packages/torch/ao/quantization/fx/lower_to_fbgemm.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0d526dfc4f62c76324b712489a2a34278a582ee
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/fx/lower_to_fbgemm.py
@@ -0,0 +1,16 @@
+from ._lower_to_native_backend import _lower_to_native_backend
+from ..qconfig import QConfigAny
+from torch.fx import GraphModule
+from typing import Dict, Tuple
+
+__all__ = ['lower_to_fbgemm']
+
+def lower_to_fbgemm(
+    model: GraphModule,
+    qconfig_map: Dict[str, QConfigAny],
+    node_name_to_scope: Dict[str, Tuple[str, type]]
+) -> GraphModule:
+    """ Lower a quantized reference model (with reference quantized operator patterns)
+    to fbgemm
+    """
+    return _lower_to_native_backend(model, qconfig_map, node_name_to_scope)
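+
+# Illustrative sketch (helper not called anywhere; empty maps are an assumption): lowering
+# a reference quantized GraphModule when no per-node qconfig or scope information needs
+# to be forwarded; real callers pass the maps collected during prepare/convert.
+def _example_lower_reference_model(model: GraphModule) -> GraphModule:
+    return lower_to_fbgemm(model, qconfig_map={}, node_name_to_scope={})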
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/lower_to_qnnpack.py b/MLPY/Lib/site-packages/torch/ao/quantization/fx/lower_to_qnnpack.py
new file mode 100644
index 0000000000000000000000000000000000000000..54d816a214a8fbb45fe4fd425bc5d1fcefc46c78
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/fx/lower_to_qnnpack.py
@@ -0,0 +1,18 @@
+from ._lower_to_native_backend import _lower_to_native_backend
+from ..qconfig import QConfigAny
+from torch.fx import GraphModule
+from typing import Dict, Tuple
+
+__all__ = [
+    "lower_to_qnnpack"
+]
+
+def lower_to_qnnpack(
+    model: GraphModule,
+    qconfig_map: Dict[str, QConfigAny],
+    node_name_to_scope: Dict[str, Tuple[str, type]]
+) -> GraphModule:
+    """ Lower a quantized reference model (with reference quantized operator patterns)
+    to qnnpack
+    """
+    return _lower_to_native_backend(model, qconfig_map, node_name_to_scope)
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/lstm_utils.py b/MLPY/Lib/site-packages/torch/ao/quantization/fx/lstm_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee786f1d42deb4187db0afda2fb6f1e496d8a0b3
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/fx/lstm_utils.py
@@ -0,0 +1,183 @@
+import copy
+import operator
+import torch
+from typing import Any, Callable, Optional, Tuple
+from torch.ao.quantization import (
+    default_weight_observer,
+    default_weight_fake_quant,
+    FakeQuantizeBase,
+    QConfig,
+    QConfigMapping,
+)
+from torch.ao.quantization.backend_config import BackendConfig
+from torch.ao.quantization.observer import _PartialWrapper
+from torch.ao.quantization.quantize_fx import (
+    convert_to_reference_fx,
+    prepare_fx,
+)
+
+# TODO: move all LSTM util functions from fx/utils.py to this file
+def _get_lstm_with_individually_observed_parts(
+    float_lstm: torch.nn.LSTM,
+    example_inputs: Tuple[Any, ...],
+    backend_config: Optional[BackendConfig] = None,
+    linear_output_obs_ctr: Optional[_PartialWrapper] = None,
+    sigmoid_obs_ctr: Optional[_PartialWrapper] = None,
+    tanh_obs_ctr: Optional[_PartialWrapper] = None,
+    cell_state_obs_ctr: Optional[_PartialWrapper] = None,
+    hidden_state_obs_ctr: Optional[_PartialWrapper] = None,
+) -> torch.ao.nn.quantizable.LSTM:
+    """
+    Return an observed `torch.ao.nn.quantizable.LSTM` created from a `torch.nn.LSTM`
+    with specific observers or fake quantizes assigned to the inner ops or submodules.
+
+    In both eager and FX graph mode quantization, `torch.ao.nn.quantizable.LSTM` is
+    used as an observed custom module, which is responsible for inserting its own
+    observers. By default, all inner ops inherit the parent custom module's QConfig.
+    Users who wish to override this behavior may extend `torch.ao.nn.quantizable.LSTM`
+    and use this helper function to customize the observer insertion logic.
+
+    This is meant to be used to convert a float module to an observed module in the
+    custom module flow.
+
+    Args:
+        `float_lstm`: The float LSTM module
+        `example_inputs`: example inputs for the forward function of the LSTM module
+        `backend_config`: BackendConfig to use to observe the LSTM module
+        `linear_output_obs_ctr`: observer or fake quantize for linear outputs Wx + b,
+            where W is the weight matrix, b is the bias, and x is either the inputs
+            or the hidden state from the previous layer (if any)
+        `sigmoid_obs_ctr`: observer or fake quantize for sigmoid activations
+        `tanh_obs_ctr`: observer or fake quantize for tanh activations
+        `cell_state_obs_ctr`: observer or fake quantize for the cell state
+        `hidden_state_obs_ctr`: observer or fake quantize for the hidden state and
+            the output
+
+    Return:
+        A `torch.ao.nn.quantizable.LSTM` with the specified observers or fake quantizes
+        assigned to the inner ops.
+    """
+    def make_qconfig(obs_ctr: _PartialWrapper) -> QConfig:
+        """
+        Make a QConfig with fixed qparams observers or fake quantizes.
+        """
+        if isinstance(obs_ctr(), FakeQuantizeBase):
+            weight = default_weight_fake_quant
+        else:
+            weight = default_weight_observer
+        return QConfig(activation=obs_ctr, weight=weight)
+
+    quantizable_lstm = torch.ao.nn.quantizable.LSTM(
+        float_lstm.input_size, float_lstm.hidden_size, float_lstm.num_layers, float_lstm.bias,
+        float_lstm.batch_first, float_lstm.dropout, float_lstm.bidirectional)
+    quantizable_lstm.qconfig = float_lstm.qconfig
+
+    for idx in range(float_lstm.num_layers):
+        quantizable_lstm.layers[idx] = torch.ao.nn.quantizable.modules.rnn._LSTMLayer.from_float(float_lstm,
+                                                                                                 idx,
+                                                                                                 float_lstm.qconfig,
+                                                                                                 batch_first=False)
+
+    # Build QConfigMapping for the LSTM cell
+    # Note: FloatFunctional qconfigs will be configured separately below
+    cell_qm = QConfigMapping().set_global(float_lstm.qconfig)  # type: ignore[arg-type]
+    if sigmoid_obs_ctr is not None:
+        cell_qm.set_module_name("input_gate", make_qconfig(sigmoid_obs_ctr))
+        cell_qm.set_module_name("forget_gate", make_qconfig(sigmoid_obs_ctr))
+        cell_qm.set_module_name("output_gate", make_qconfig(sigmoid_obs_ctr))
+    if tanh_obs_ctr is not None:
+        cell_qm.set_module_name("cell_gate", make_qconfig(tanh_obs_ctr))
+
+    # Insert observers into each LSTM cell
+    # TODO: maybe make this work for layer_bw as well
+    for layer in quantizable_lstm.layers:
+        cell = layer.layer_fw.cell
+        cell = prepare_fx(cell, cell_qm, example_inputs, backend_config=backend_config)
+        # HACK: Manually replace the activation_post_process following these ops.
+        # This is needed for FloatFunctional ops because there is currently no way
+        # to configure these ops in FX graph mode quantization. This is because
+        # the FloatFunctional modules simply disappear from the graph after tracing.
+        # In the future, we should rewrite quantizable LSTM without FloatFunctionals.
+        op_index_to_activation_post_process_ctr = {
+            (torch.add, 0): linear_output_obs_ctr,  # gates.add
+            (torch.mul, 0): cell_state_obs_ctr,  # fgate_cx.mul
+            (torch.mul, 1): cell_state_obs_ctr,  # igate_cgate.mul
+            (torch.add, 1): cell_state_obs_ctr,  # fgate_cx_igate_cgate.add
+            (torch.mul, 2): hidden_state_obs_ctr,  # ogate_cy.mul
+        }
+        add_count = 0
+        mul_count = 0
+        for node in cell.graph.nodes:
+            op_index: Optional[Tuple[Callable, int]] = None  # e.g. (torch.add, 1)
+            if node.target == torch.add:
+                op_index = (torch.add, add_count)
+                add_count += 1
+            elif node.target == torch.mul:
+                op_index = (torch.mul, mul_count)
+                mul_count += 1
+            else:
+                # Neither torch.add nor torch.mul
+                continue
+            if op_index not in op_index_to_activation_post_process_ctr:
+                continue
+            assert len(node.users) == 1
+            activation_post_process_name = next(iter(node.users.keys())).name
+            activation_post_process_ctr = op_index_to_activation_post_process_ctr[op_index]
+            if activation_post_process_ctr is not None:
+                setattr(cell, activation_post_process_name, activation_post_process_ctr())
+        layer.layer_fw.cell = cell
+    return quantizable_lstm
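+
+# A usage sketch for the helper above (the observer parameters are illustrative assumptions,
+# not prescribed defaults, and `float_lstm.qconfig` is assumed to be set already): callers
+# typically construct fixed-qparams observers for the sigmoid/tanh outputs and pass them in
+# together with example inputs, e.g.
+#
+#     from torch.ao.quantization import FixedQParamsObserver
+#     sigmoid_obs = FixedQParamsObserver.with_args(
+#         scale=1.0 / 256.0, zero_point=0, dtype=torch.quint8, quant_min=0, quant_max=255)
+#     tanh_obs = FixedQParamsObserver.with_args(
+#         scale=2.0 / 256.0, zero_point=128, dtype=torch.quint8, quant_min=0, quant_max=255)
+#     observed_lstm = _get_lstm_with_individually_observed_parts(
+#         float_lstm, example_inputs, sigmoid_obs_ctr=sigmoid_obs, tanh_obs_ctr=tanh_obs)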
+
+def _get_reference_quantized_lstm_module(
+    observed_lstm: torch.ao.nn.quantizable.LSTM,
+    backend_config: Optional[BackendConfig] = None,
+) -> torch.ao.nn.quantized.LSTM:
+    """
+    Return a `torch.ao.nn.quantized.LSTM` created from a `torch.ao.nn.quantizable.LSTM`
+    with observers or fake quantizes inserted through `prepare_fx`, e.g. from
+    `_get_lstm_with_individually_observed_parts`.
+
+    This is meant to be used to convert an observed module to a quantized module in the
+    custom module flow.
+
+    Args:
+        `observed_lstm`: a `torch.ao.nn.quantizable.LSTM` observed through `prepare_fx`
+        `backend_config`: BackendConfig to use to produce the reference quantized model
+
+    Return:
+        A reference `torch.ao.nn.quantized.LSTM` module.
+    """
+    quantized_lstm = torch.ao.nn.quantized.LSTM(
+        observed_lstm.input_size, observed_lstm.hidden_size, observed_lstm.num_layers,
+        observed_lstm.bias, observed_lstm.batch_first, observed_lstm.dropout,
+        observed_lstm.bidirectional)
+
+    for i, layer in enumerate(quantized_lstm.layers):
+        cell = copy.deepcopy(observed_lstm.layers.get_submodule(str(i)).layer_fw.cell)  # type: ignore[union-attr]
+        cell = convert_to_reference_fx(cell, backend_config=backend_config)  # type: ignore[arg-type]
+        assert isinstance(cell, torch.fx.GraphModule)
+        # HACK: Manually remove input quantize nodes and output dequantize nodes,
+        # since custom modules expect quint8 inputs and outputs for now. Note that
+        # this functionality is supposedly handled through PrepareCustomConfig's
+        # `set_input_quantized_indexes` and `set_output_quantized_indexes`, but that
+        # API doesn't currently handle tuple inputs and outputs, so we have to do
+        # this manually for now. In the future we should (1) relax the restriction
+        # on custom module input/output dtypes, and (2) expand support for complex
+        # input/output structures.
+        for node in cell.graph.nodes:
+            if node.target == torch.quantize_per_tensor:
+                arg = node.args[0]
+                # Remove quantize(x), quantize(hidden[0]), and quantize(hidden[1])
+                if arg.target == "x" or (arg.target == operator.getitem and arg.args[0].target == "hidden"):
+                    with cell.graph.inserting_before(node):
+                        node.replace_all_uses_with(arg)
+                        cell.graph.erase_node(node)
+            if node.target == "output":
+                # Remove all dequantize nodes in the output tuple
+                for arg in node.args[0]:
+                    with cell.graph.inserting_before(node):
+                        node.replace_input_with(arg, arg.args[0])
+        cell.graph.eliminate_dead_code()
+        cell.recompile()
+        layer.layer_fw.cell = cell
+    return quantized_lstm
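+
+# Taken together, the two helpers in this file implement the custom module flow for LSTM.
+# A sketch of the end-to-end usage (assuming `observed_lstm` was produced by
+# `_get_lstm_with_individually_observed_parts` and then calibrated):
+#
+#     observed_lstm = _get_lstm_with_individually_observed_parts(float_lstm, example_inputs)
+#     # ... run calibration data through observed_lstm ...
+#     quantized_lstm = _get_reference_quantized_lstm_module(observed_lstm)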
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/match_utils.py b/MLPY/Lib/site-packages/torch/ao/quantization/fx/match_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..b999f0b1d81b45a9748a79645bebcaedbc2fbf90
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/fx/match_utils.py
@@ -0,0 +1,237 @@
+import sys
+import torch
+from torch.fx.graph import (
+    Graph,
+    Node,
+)
+from torch.ao.quantization.utils import Pattern
+from .quantize_handler import (
+    QuantizeHandler,
+)
+from ..qconfig import (
+    QConfigAny,
+)
+from ..utils import (
+    MatchAllNode
+)
+from .graph_module import (
+    _is_observed_standalone_module,
+)
+from torch.nn.utils.parametrize import type_before_parametrizations
+from typing import Any, Dict, List, Callable, Optional, Tuple, Type, Set, Iterable
+
+
+__all__: List[str] = []
+
+# TODO(future PR): the 1st argument is typed as `List[Node]`, but a better type
+# would be a recursive `List[Union[Node, Tuple[Union[Node, ...]]]]`
+_MatchResult = Tuple[Node, List[Node], Optional[Pattern], QuantizeHandler]
+
+_MatchResultWithQConfig = Tuple[Node, List[Node], Optional[Pattern], QuantizeHandler,
+                                QConfigAny]
+
+# Note: The order of patterns is important! The match function will take whatever is matched first, so we
+# need to put fusion patterns before single patterns. For example, add_relu should be registered before relu.
+# Decorators are applied in reverse order of appearance. Also, when we match the nodes in the graph against
+# these patterns, we start from the last node of the graph and traverse backwards.
+def _is_match(modules, node, pattern, max_uses=sys.maxsize):
+    """ Matches a node in fx against a pattern
+    """
+    if isinstance(pattern, tuple):
+        self_match, *arg_matches = pattern
+        if self_match is getattr:
+            assert len(pattern) == 2, 'Expecting getattr pattern to have two elements'
+            arg_matches = []
+    else:
+        self_match = pattern
+        arg_matches = []
+
+    if isinstance(self_match, type) and issubclass(self_match, MatchAllNode):
+        return True
+
+    if node == pattern:
+        return True
+
+    if not isinstance(node, Node) or len(node.users) > max_uses:
+        return False
+
+    if isinstance(self_match, type) and issubclass(self_match, torch.nn.Module):
+        if node.op != 'call_module':
+            return False
+        if not type_before_parametrizations(modules[node.target]) == self_match:
+            return False
+    elif callable(self_match):
+        if node.op != 'call_function' or node.target is not self_match:
+            return False
+        elif node.target is getattr:
+            if node.args[1] != pattern[1]:
+                return False
+    elif isinstance(self_match, str):
+        if node.op != 'call_method' or node.target != self_match:
+            return False
+    elif node.target != self_match:
+        return False
+
+    if not arg_matches:
+        return True
+
+    if len(arg_matches) != len(node.args):
+        return False
+
+    return all(_is_match(modules, node, arg_match, max_uses=1) for node, arg_match in zip(node.args, arg_matches))
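+
+# Worked example (illustrative): for the pattern (torch.nn.ReLU, torch.nn.Conv2d) and a
+# graph `x -> conv (call_module, Conv2d) -> relu (call_module, ReLU)`, calling
+# _is_match(modules, relu_node, (torch.nn.ReLU, torch.nn.Conv2d)) first matches the ReLU
+# node itself and then recursively matches its argument against torch.nn.Conv2d, i.e.
+# matching starts from the last node of the fused sequence and walks backwards.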
+
+def _find_matches(
+        graph: Graph,
+        modules: Dict[str, torch.nn.Module],
+        patterns: Dict[Pattern, QuantizeHandler],
+        root_node_getter_mapping: Dict[Pattern, Callable],
+        standalone_module_names: Optional[List[str]] = None,
+        standalone_module_classes: Optional[List[Type]] = None,
+        custom_module_classes: Optional[List[Any]] = None) -> Dict[str, _MatchResult]:
+    """
+    Matches the nodes in the input graph to quantization patterns, and
+    outputs the information needed to quantize them in future steps.
+
+    Inputs:
+      - graph: an fx.Graph object
+      - modules: a mapping of fully qualified module name to instance,
+          for example, {'foo': ModuleFoo, ...}
+      - patterns: a mapping from a tuple of nodes in reverse order to
+          uninitialized QuantizeHandler subclass.
+
+    Outputs a map of
+      node_name ->
+        (node, matched_values, matched_pattern, QuantizeHandler instance,
+         qconfig)
+
+    For example, {
+      'relu_1': (relu_1, [relu_1], torch.nn.functional.relu,
+                 <QuantizeHandler instance>, QConfig(...)),
+      ...
+    }
+    """
+    if custom_module_classes is None:
+        custom_module_classes = []
+
+    if standalone_module_classes is None:
+        standalone_module_classes = []
+
+    if standalone_module_names is None:
+        standalone_module_names = []
+
+    match_map: Dict[str, _MatchResult] = {}
+    all_matched : Set[str] = set()
+
+    def _recursive_record_node_in_match_map(
+            last_node,
+            match_map,
+            node_pattern,
+            matched_node_pattern,
+            pattern,
+            match_value):
+        if isinstance(node_pattern, Node):
+            match_map[node_pattern.name] = (
+                last_node, matched_node_pattern, pattern, match_value)
+        elif not isinstance(node_pattern, Iterable):
+            return
+        else:
+            for n in node_pattern:
+                _recursive_record_node_in_match_map(last_node, match_map, n, matched_node_pattern, pattern, match_value)
+
+    # TODO: 1. merge with fuse matcher 2. document the code
+    def record_match(
+            pattern,
+            node,
+            last_node,
+            matched_node_pattern,
+            match_map):
+        if isinstance(pattern, tuple):
+            s, *args = pattern
+            is_single_arg = len(args) == 1
+            current_node_pattern: List[Node] = []
+            record_match(
+                s,
+                node,
+                last_node,
+                matched_node_pattern,
+                match_map)
+            if pattern[0] is not getattr:
+                for subpattern, arg in zip(args, node.args):
+                    record_match(
+                        subpattern,
+                        arg,
+                        node,
+                        current_node_pattern,
+                        match_map)
+            if len(current_node_pattern) > 1:
+                # current_node_pattern is the node pattern we get from matching
+                # the subpattern with the arguments of the node;
+                # we use is_single_arg to recover the original structure of the pattern:
+                # if the original pattern has a single argument, we will have
+                # (original_op, (original_arg, ...))
+                # otherwise, we'll have a list of arguments
+                # (original_op, arg0, arg1, arg2, ...)
+                if is_single_arg:
+                    matched_node_pattern.append(tuple(current_node_pattern))
+                else:
+                    matched_node_pattern.extend(list(current_node_pattern))
+            else:
+                matched_node_pattern.append(current_node_pattern[0])
+        else:
+            matched_node_pattern.append(node)
+
+    for node in reversed(graph.nodes):
+        if node.name not in match_map and node.name not in all_matched:
+            for pattern, quantize_handler_cls in patterns.items():
+                root_node_getter = root_node_getter_mapping.get(pattern, None)
+                if _is_match(modules, node, pattern) and node.name not in match_map:
+                    matched_node_pattern: List[Node] = []
+                    record_match(
+                        pattern,
+                        node,
+                        node,
+                        matched_node_pattern,
+                        match_map)
+                    quantize_handler = quantize_handler_cls(  # type: ignore[operator]
+                        matched_node_pattern,
+                        modules,
+                        root_node_getter)
+                    last_node = node
+                    # record the match for all nodes in the pattern
+                    _recursive_record_node_in_match_map(
+                        last_node,
+                        match_map,
+                        # we need to record all nodes in the matched pattern in the match_map
+                        matched_node_pattern,
+                        # this is a part of the value corresponding to the node
+                        matched_node_pattern,
+                        pattern,
+                        quantize_handler)
+                    break
+
+    # add custom module instances to the match result
+    assert modules is not None
+    for node in graph.nodes:
+        if node.op == 'call_module' and \
+           type(modules[node.target]) in custom_module_classes:
+            match_map[node.name] = (
+                node, node, None, QuantizeHandler(node, modules, is_custom_module=True))
+
+    def is_standalone_module(node_target: str, modules: Dict[str, torch.nn.Module]):
+        assert modules is not None
+        return (
+            node_target in standalone_module_names or  # type: ignore[operator]
+            type(modules[node_target]) in standalone_module_classes  # type: ignore[operator]
+        )
+
+    # add standalone modules to the match
+    for node in graph.nodes:
+        if node.op == 'call_module' and \
+           (is_standalone_module(node.target, modules) or
+                _is_observed_standalone_module(modules[node.target])):
+            # add node to matched nodes
+            match_map[node.name] = (
+                node, node, None,
+                QuantizeHandler(node, modules, is_standalone_module=True))
+
+    return match_map
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/pattern_utils.py b/MLPY/Lib/site-packages/torch/ao/quantization/fx/pattern_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..53eeff6a8e4053849a8255b839ebc34f3b842b32
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/fx/pattern_utils.py
@@ -0,0 +1,87 @@
+from collections import OrderedDict
+from typing import Dict, Any
+from torch.ao.quantization.utils import Pattern
+from ..fake_quantize import FixedQParamsFakeQuantize
+from ..observer import ObserverBase
+import copy
+
+__all__ = [
+    "get_default_fusion_patterns",
+    "get_default_quant_patterns",
+    "get_default_output_activation_post_process_map",
+]
+
+# TODO(future PR): fix the typing on QuantizeHandler (currently a circular dependency)
+QuantizeHandler = Any
+
+# pattern for conv bn fusion
+_DEFAULT_FUSION_PATTERNS: Dict[Pattern, QuantizeHandler] = OrderedDict()
+def _register_fusion_pattern(pattern):
+    def insert(fn):
+        _DEFAULT_FUSION_PATTERNS[pattern] = fn
+        return fn
+    return insert
+
+def get_default_fusion_patterns() -> Dict[Pattern, QuantizeHandler]:
+    return copy.copy(_DEFAULT_FUSION_PATTERNS)
+
+_DEFAULT_QUANTIZATION_PATTERNS: Dict[Pattern, QuantizeHandler] = OrderedDict()
+
+# Mapping from pattern to activation_post_process(observer/fake_quant) constructor for output activation
+# e.g. pattern: torch.sigmoid,
+#      output_activation_post_process: default_fixed_qparams_range_0to1_fake_quant
+_DEFAULT_OUTPUT_FAKE_QUANTIZE_MAP: Dict[Pattern, QuantizeHandler] = {}
+_DEFAULT_OUTPUT_OBSERVER_MAP: Dict[Pattern, QuantizeHandler] = {}
+
+# Register pattern for both static quantization and qat
+def _register_quant_pattern(pattern, fixed_qparams_observer=None):
+    def insert(fn):
+        _DEFAULT_QUANTIZATION_PATTERNS[pattern] = fn
+        if fixed_qparams_observer is not None:
+            _DEFAULT_OUTPUT_FAKE_QUANTIZE_MAP[pattern] = FixedQParamsFakeQuantize.with_args(observer=fixed_qparams_observer)
+            _DEFAULT_OUTPUT_OBSERVER_MAP[pattern] = fixed_qparams_observer
+        return fn
+    return insert
+
+# Get patterns for both static quantization and qat
+def get_default_quant_patterns() -> Dict[Pattern, QuantizeHandler]:
+    return copy.copy(_DEFAULT_QUANTIZATION_PATTERNS)
+
+# a map from pattern to output activation post process constructor
+# e.g. torch.sigmoid -> default_affine_fixed_qparam_fake_quant
+def get_default_output_activation_post_process_map(is_training) -> Dict[Pattern, ObserverBase]:
+    if is_training:
+        return copy.copy(_DEFAULT_OUTPUT_FAKE_QUANTIZE_MAP)
+    else:
+        return copy.copy(_DEFAULT_OUTPUT_OBSERVER_MAP)
+
+# Example use of register pattern function:
+# @_register_fusion_pattern(torch.nn.ReLU, (torch.nn.BatchNorm2d, torch.nn.Conv2d)))
+# class ConvOrLinearBNReLUFusion():
+#     def __init__(...):
+#         ...
+#
+
+def _sorted_patterns_dict(patterns_dict: Dict[Pattern, QuantizeHandler]) -> Dict[Pattern, QuantizeHandler]:
+    """
+    Return a sorted version of the patterns dictionary such that longer patterns are matched first,
+    e.g. match (F.relu, F.linear) before F.relu.
+    This works for current use cases, but we may need to have a more clever way to sort
+    things to address more complex patterns
+    """
+
+    def get_len(pattern):
+        """ Calculate the length of the pattern by counting all the entries
+        in the pattern.
+        This makes sure (nn.ReLU, (nn.BatchNorm, nn.Conv2d)) comes before
+        (nn.BatchNorm, nn.Conv2d) so that we can match the former first.
+        """
+        length = 0
+        if isinstance(pattern, tuple):
+            for item in pattern:
+                length += get_len(item)
+        else:
+            length += 1
+        return length
+
+    return OrderedDict(sorted(patterns_dict.items(), key=lambda kv: -get_len(kv[0]) if isinstance(kv[0], tuple) else 1))
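+
+# Worked example (illustrative): get_len counts leaf entries, so
+# (nn.ReLU, (nn.BatchNorm2d, nn.Conv2d)) has length 3 and (nn.BatchNorm2d, nn.Conv2d) has
+# length 2, while a bare nn.ReLU is not a tuple and sorts last with key 1. The fused
+# conv-bn-relu pattern is therefore tried before its sub-patterns during matching.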
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/prepare.py b/MLPY/Lib/site-packages/torch/ao/quantization/fx/prepare.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6f47c21a048b0cca8af05aeb358cc1c55312c8f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/fx/prepare.py
@@ -0,0 +1,1880 @@
+import copy
+import torch
+import warnings
+from torch.fx import (
+    GraphModule,
+)
+from torch.fx.graph import (
+    Graph,
+    Node,
+)
+from torch.fx.node import Argument
+
+from ..quantize import (
+    propagate_qconfig_,
+)
+from ..observer import (
+    _is_activation_post_process,
+    _PartialWrapper,
+)
+from ..qconfig import (
+    _is_reuse_input_qconfig,
+    QConfigAny,
+)
+from ..qconfig_mapping import (
+    QConfigMapping,
+)
+from .qconfig_mapping_utils import (
+    _generate_node_name_to_qconfig,
+    _update_qconfig_for_fusion,
+    _get_flattened_qconfig_dict,
+    _update_qconfig_for_qat,
+)
+
+from .quantize_handler import (
+    _default_root_node_getter,
+    _get_pattern_to_quantize_handlers,
+    QuantizeHandler,
+)
+
+from torch.ao.quantization import (
+    ObserverBase,
+    FixedQParamsObserver,
+    FixedQParamsFakeQuantize,
+    _DerivedObserverOrFakeQuantize,
+)
+
+from torch.ao.quantization.utils import (
+    Pattern,
+    NodePattern,
+)
+
+from ._equalize import (
+    is_equalization_observer,
+    node_supports_equalization,
+)
+
+from .pattern_utils import (
+    _sorted_patterns_dict,
+)
+
+from .match_utils import (
+    _MatchResultWithQConfig,
+    _find_matches,
+)
+
+from .utils import (
+    _insert_dequant_stubs_for_custom_module_lstm_output,
+    _is_custom_module_lstm,
+    _maybe_get_custom_module_lstm_from_node_arg,
+    _qconfig_satisfies_dtype_config_constraints,
+    get_custom_module_class_keys,
+    all_node_args_have_no_tensors,
+    assert_and_get_unique_device,
+    get_non_observable_arg_indexes_and_types,
+    get_new_attr_name_with_prefix,
+    node_arg_is_weight,
+    node_arg_is_bias,
+    NON_QUANTIZABLE_WEIGHT_OPS,
+    ObservedGraphModuleAttrs,
+)
+
+from torch.ao.quantization import (
+    PlaceholderObserver
+)
+from torch.ao.quantization.quantize import (
+    convert
+)
+
+from ..utils import (
+    _parent_name,
+    get_qconfig_dtypes,
+    get_swapped_custom_module_class,
+)
+
+from ..backend_config.utils import (
+    get_pattern_to_dtype_configs,
+    get_module_to_qat_module,
+    get_fusion_pattern_to_root_node_getter,
+)
+from ..backend_config import (
+    BackendConfig,
+    DTypeConfig,
+    get_native_backend_config,
+)
+from .custom_config import (
+    PrepareCustomConfig,
+    StandaloneModuleConfigEntry,
+)
+from torch.ao.quantization.quantizer import (
+    EdgeOrNode,
+    QuantizationSpec,
+    QuantizationSpecBase,
+    FixedQParamsQuantizationSpec,
+    SharedQuantizationSpec,
+    DerivedQuantizationSpec,
+)
+from torch.ao.quantization import ObserverOrFakeQuantize
+
+from torch._subclasses import FakeTensor
+
+from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union
+from dataclasses import asdict
+
+__all__ = [
+    "insert_observers_for_model",
+    "prepare",
+    "propagate_dtypes_for_known_nodes",
+]
+
+
+# list of dtypes to not add observers to
+_DO_NOT_OBS_DTYPE_LIST = [int, float, torch.bool, None]
+_OBS_DTYPE_LIST = [
+    torch.quint8,
+    torch.qint8,
+    torch.qint32,
+    torch.float16,
+    torch.uint8,
+    torch.int8,
+    torch.int16,
+    torch.int32
+]
+
+_DEFAULT_FP32_OBS_OR_FQ_CTR = PlaceholderObserver.with_args(dtype=torch.float)
+
+# note: the following default target dtype info dicts are temporary,
+# should be moved to the new programmable API class soon
+_DEFAULT_FP32_QCONFIG_FOR_TARGET_DTYPE_INFO = {
+    "input_act_obs_or_fq_ctr": torch.ao.quantization.qconfig._default_fp32_placeholder_qconfig.activation,
+    "output_act_obs_or_fq_ctr": torch.ao.quantization.qconfig._default_fp32_placeholder_qconfig.activation
+}
+
+_DEFAULT_QUINT8_QCONFIG_FOR_TARGET_DTYPE_INFO = {
+    "input_act_obs_or_fq_ctr": torch.ao.quantization.qconfig._default_quint8_placeholder_qconfig.activation,
+    "output_act_obs_or_fq_ctr": torch.ao.quantization.qconfig._default_quint8_placeholder_qconfig.activation
+}
+
+
+def _get_observer_kwargs(quant_spec: Union[QuantizationSpec, FixedQParamsQuantizationSpec]):
+    kwargs_dict = asdict(quant_spec)
+    return copy.deepcopy(kwargs_dict)
+
+def _get_qspec_for_arg(
+    arg: Node,
+    input_qspec_map: Dict[Node, QuantizationSpecBase],
+    named_modules: Dict[str, torch.nn.Module]
+) -> Optional[QuantizationSpecBase]:
+    while _is_activation_post_process_node(arg, named_modules):
+        arg = arg.args[0]  # type: ignore[assignment]
+    return input_qspec_map.get(arg, None)
+
+def _create_obs_or_fq_from_qspec(
+    quantization_spec: Optional[QuantizationSpecBase],
+    obs_or_fq_map: Dict[EdgeOrNode, ObserverOrFakeQuantize],
+    is_qat: bool,
+):
+    """ Create observer or fake quantize objects based on quantization spec
+
+    Args:
+       quantization_spec: used to store parameters to create the observer or fake quantizer
+       obs_or_fq_map: a map from edge/output to the corresponding observer/fake_quant
+           instance; it may be reused for different edges/outputs depending on the configuration
+    """
+    if quantization_spec is None:
+        return None
+    if isinstance(quantization_spec, SharedQuantizationSpec):
+        edge_or_node = quantization_spec.edge_or_node
+        assert edge_or_node in obs_or_fq_map, \
+            "please make sure only refer to edge or node that has " \
+            f"observer/fake_quant inserted: '{edge_or_node}' not in\n{obs_or_fq_map.keys()}"
+        return obs_or_fq_map[edge_or_node]
+    elif isinstance(quantization_spec, DerivedQuantizationSpec):
+        # can't use asdict, so not calling get_observer_kwargs here
+        kwargs = {
+            "dtype": quantization_spec.dtype,
+            "derive_qparams_fn": quantization_spec.derive_qparams_fn,
+            "quant_min": quantization_spec.quant_min,
+            "quant_max": quantization_spec.quant_max,
+            "qscheme": quantization_spec.qscheme,
+            "ch_axis": quantization_spec.ch_axis,
+        }
+        edge_or_nodes = quantization_spec.derived_from
+        obs_or_fqs = [obs_or_fq_map[k] for k in edge_or_nodes]
+        kwargs["obs_or_fqs"] = obs_or_fqs
+        return _DerivedObserverOrFakeQuantize.with_args(**kwargs)()
+    elif isinstance(quantization_spec, FixedQParamsQuantizationSpec):
+        kwargs = _get_observer_kwargs(quantization_spec)
+        observer_ctr = FixedQParamsObserver.with_args(**kwargs)
+        if is_qat:
+            return FixedQParamsFakeQuantize.with_args(observer=observer_ctr)
+        else:
+            return observer_ctr()
+
+    assert isinstance(quantization_spec, QuantizationSpec)
+    observer_or_fake_quant_ctr = quantization_spec.observer_or_fake_quant_ctr
+    kwargs = _get_observer_kwargs(quantization_spec)
+    kwargs.pop("observer_or_fake_quant_ctr")
+    # we will remove is_dynamic from QuantizationSpec because
+    # it seems that dynamic range quantization
+    obs_or_fq_class = observer_or_fake_quant_ctr
+    if isinstance(observer_or_fake_quant_ctr, _PartialWrapper):
+        obs_or_fq_class = observer_or_fake_quant_ctr.p.func  # type: ignore[union-attr, assignment]
+    if "PerChannel" not in obs_or_fq_class.__name__:  # type: ignore[operator, union-attr]
+        kwargs.pop("ch_axis")
+    return observer_or_fake_quant_ctr.with_args(**kwargs)()
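+
+# A minimal sketch of how the spec types map to observers (the argument values are
+# illustrative assumptions): a plain QuantizationSpec instantiates its
+# observer_or_fake_quant_ctr with the remaining kwargs, while a SharedQuantizationSpec
+# reuses the observer/fake quant already created for the referenced edge or node, e.g.
+#
+#     act_qspec = QuantizationSpec(
+#         dtype=torch.int8, quant_min=-128, quant_max=127,
+#         qscheme=torch.per_tensor_affine,
+#         observer_or_fake_quant_ctr=torch.ao.quantization.MinMaxObserver)
+#     shared_qspec = SharedQuantizationSpec((input_node, conv_node))  # hypothetical nodes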
+
+def _needs_obs_or_fq(
+        prev_output_dtype: Any,
+        prev_output_is_dynamic: bool,
+        cur_target_dtype: Any,
+        cur_target_is_dynamic: bool,
+        reuse_input_obs_or_fq: bool,
+        is_zeroth_arg: bool = False) -> bool:
+    """
+    Utility function that checks whether we should insert an observer or fake quant node,
+    based on the dtype requested by the user for the nodes.
+    Note: we treat "not specified" as torch.float for now.
+
+    is_zeroth_arg: we only dynamically quantize the first arg of the node right now;
+      this should be removed once we support configuring dynamic quantization
+      for a specific argument (it can also be removed if we deprecate fx graph mode
+      quantization).
+    """
+
+    # need to insert placeholder observer for dynamic quantization so that it can
+    # be converted to choose_qparams -> q -> dq in convert step
+    if cur_target_is_dynamic:
+        assert cur_target_dtype in _OBS_DTYPE_LIST, \
+            f"Expected cur_target_dtype to be one of {_OBS_DTYPE_LIST}, but got: {cur_target_dtype}"
+        assert prev_output_dtype not in _DO_NOT_OBS_DTYPE_LIST
+        return is_zeroth_arg
+    if reuse_input_obs_or_fq:
+        return False
+    # non dynamic quantization
+    if cur_target_dtype in _OBS_DTYPE_LIST:
+        return prev_output_dtype in _OBS_DTYPE_LIST + [torch.float] and cur_target_dtype != prev_output_dtype
+
+    # lots of error checking are skipped here for now
+    return False
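+
+# Worked examples (illustrative, with reuse_input_obs_or_fq=False):
+#   prev=torch.float,  target=torch.quint8, static  -> True  (insert an observer)
+#   prev=torch.quint8, target=torch.quint8, static  -> False (dtypes already match)
+#   target is dynamic                                -> observe only the zeroth arg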
+
+def _is_activation_post_process_node(node: Node, named_modules: Dict[str, torch.nn.Module]) -> bool:
+    return isinstance(node, torch.fx.Node) and node.op == "call_module" and \
+        _is_activation_post_process(named_modules[str(node.target)])
+
+def _get_dtype_and_is_dynamic(obs_or_fq: Optional[ObserverOrFakeQuantize]) -> Tuple[Optional[torch.dtype], bool]:
+    """ Given an observer or fake quant module instance (or None), returns
+    a Tuple of its dtype and is_dynamic flag
+    """
+    # TODO: instead of instantiating the instance, we can use inspect to get the default args
+    if obs_or_fq is None:
+        return None, False
+    else:
+        return obs_or_fq.dtype, getattr(obs_or_fq, "is_dynamic", False)  # type: ignore[return-value]
+
+def _is_input_arg_dtype_supported_by_backend(
+    arg: Argument,
+    node: Node,
+    qconfig: QConfigAny,
+    dtype_config: DTypeConfig,
+    backend_config: BackendConfig,
+) -> bool:
+    """ Check if the configured qconfig for the argument
+    is supported by the backend or not
+    """
+    if isinstance(arg, (list, tuple)):
+        return all(_is_input_arg_dtype_supported_by_backend(
+            a, node, qconfig,
+            dtype_config, backend_config) for a in arg)
+    if not isinstance(arg, Node):
+        return True
+    # TODO: support check for standalone module
+    is_weight = node_arg_is_weight(node, arg)
+    is_bias = node_arg_is_bias(node, arg)
+    is_activation = not is_weight and not is_bias
+    if is_activation:
+        input_act_obs_or_fq_ctr = node.meta["target_dtype_info"].get("input_act_obs_or_fq_ctr")
+        input_act_obs_or_fq = input_act_obs_or_fq_ctr() if input_act_obs_or_fq_ctr else None
+        qconfig_dtype, qconfig_is_dynamic = _get_dtype_and_is_dynamic(input_act_obs_or_fq)
+        # TODO(future PR): remove the cast to bool below after figuring
+        # out why backend_config has is_dynamic set to None in some cases.
+        return (dtype_config.input_dtype is None) or (
+            dtype_config.input_dtype == qconfig_dtype and
+            bool(dtype_config.is_dynamic) == bool(qconfig_is_dynamic) and
+            _qconfig_satisfies_dtype_config_constraints(qconfig, dtype_config.input_dtype_with_constraints)
+        )
+    elif is_weight:
+        # TODO: move dtype check into `_qconfig_satisfies_dtype_config_constraints` as well
+        weight_obs_or_fq_ctr = node.meta["target_dtype_info"].get("weight_obs_or_fq_ctr", None)
+        weight_obs_or_fq = weight_obs_or_fq_ctr() if weight_obs_or_fq_ctr else None
+        qconfig_weight_dtype, _ = _get_dtype_and_is_dynamic(weight_obs_or_fq)
+        backend_config_weight_dtype = dtype_config.weight_dtype
+        dtype_matches = qconfig_weight_dtype == backend_config_weight_dtype
+        qconfig_satisfies_constraints = _qconfig_satisfies_dtype_config_constraints(
+            qconfig, dtype_config.weight_dtype_with_constraints, is_activation=False)
+        return backend_config_weight_dtype is None or (dtype_matches and qconfig_satisfies_constraints)
+    else:  # bias
+        # TODO: move dtype check into `_qconfig_satisfies_dtype_config_constraints` as well
+        bias_obs_or_fq_ctr = node.meta["target_dtype_info"].get("bias_obs_or_fq_ctr", None)
+        bias_obs_or_fq = bias_obs_or_fq_ctr() if bias_obs_or_fq_ctr else None
+        qconfig_bias_dtype, _ = _get_dtype_and_is_dynamic(bias_obs_or_fq)
+        backend_config_bias_dtype = dtype_config.bias_dtype
+        return backend_config_bias_dtype is None or qconfig_bias_dtype == backend_config_bias_dtype
+
+def _is_output_dtype_supported_by_backend(
+    node: Node,
+    qconfig: QConfigAny,
+    dtype_config: DTypeConfig,
+) -> bool:
+    """ Check if the configured qconfig for the output
+    is supported by the backend or not
+    """
+    # TODO: move dtype check into `_qconfig_satisfies_dtype_config_constraints` as well
+    backend_config_output_dtype = dtype_config.output_dtype
+    # TODO: we should check is_dynamic here as well, the code from _is_input_arg_dtype_supported_by_backend
+    # from input activation check can be reused here
+    qconfig_output_dtype = None
+    output_act_obs_or_fq_ctr = node.meta["target_dtype_info"].get("output_act_obs_or_fq_ctr", _DEFAULT_FP32_OBS_OR_FQ_CTR)
+    output_act_obs_or_fq = output_act_obs_or_fq_ctr() if output_act_obs_or_fq_ctr else None
+    qconfig_output_dtype, qconfig_output_is_dynamic = _get_dtype_and_is_dynamic(output_act_obs_or_fq)
+    # TODO: this is a hack because we can only specify one activation_obs_or_fq for
+    # qconfig (qconfig.activation), and we are only supporting dynamically quantized
+    # linear op which has fp32 output dtype, this should be removed if we generalize
+    # the structure of qconfig in the future
+    if qconfig_output_is_dynamic:
+        qconfig_output_dtype = torch.float32
+    dtype_matches = qconfig_output_dtype == backend_config_output_dtype
+    qconfig_satisfies_constraints = _qconfig_satisfies_dtype_config_constraints(
+        qconfig, dtype_config.output_dtype_with_constraints)
+    return backend_config_output_dtype is None or (dtype_matches and qconfig_satisfies_constraints)
+
+def _is_observer_in_same_graph(
+    node: Node,
+    named_modules: Dict[str, torch.nn.Module],
+    obs_or_fq_map: Dict[EdgeOrNode, ObserverOrFakeQuantize],
+    is_qat,
+):
+    """ Check if the observer is in the same graph as the node.
+    When the node output is not fp32 and its input is a 'placeholder',
+    the input is assumed to be quantized, i.e. it is observed
+    in a different place rather than not observed at all.
+    """
+    node_output_dtype = _get_arg_target_dtype_as_output(node, named_modules, obs_or_fq_map, is_qat)
+    if len(node.args) > 0 and isinstance(node.args[0], Node):
+        if node_output_dtype in [torch.quint8, torch.uint8] and node.args[0].op == 'placeholder':
+            return False
+    return True
+
+def _is_pattern_dtype_config_and_qconfig_supported_by_backend(
+    pattern: Optional[Pattern],
+    matched_node_pattern: Optional[List[Node]],
+    qconfig: QConfigAny,
+    backend_config: BackendConfig,
+) -> bool:
+    """ Check if the dtype configuration of a pattern is supported by
+    the backend or not, and whether the qconfig satisfies constraints
+    specified in the corresponding dtype config.
+    """
+    if backend_config is None or pattern is None:
+        return True
+    assert matched_node_pattern is not None and len(matched_node_pattern) >= 1
+    pattern_to_dtype_configs = get_pattern_to_dtype_configs(backend_config)
+    dtype_configs: List[DTypeConfig] = pattern_to_dtype_configs.get(pattern, [])
+    pattern_to_root_node_getter = get_fusion_pattern_to_root_node_getter(backend_config)
+
+    root_node_getter = pattern_to_root_node_getter.get(pattern, _default_root_node_getter)
+    root_node = root_node_getter(matched_node_pattern)
+    input_node = root_node
+    output_node = matched_node_pattern[0]
+    for dtype_config in dtype_configs:
+        # check if arg dtype are supported
+        supported = True
+        for arg in list(input_node.args) + list(input_node.kwargs.values()):
+            supported = supported and _is_input_arg_dtype_supported_by_backend(
+                arg, input_node, qconfig, dtype_config, backend_config)
+        # check if output dtype is supported
+        supported = supported and _is_output_dtype_supported_by_backend(
+            output_node, qconfig, dtype_config)
+        if supported:
+            return True
+    return False
+
+def _get_standalone_module_configs(
+    node: Node,
+    named_modules: Dict[str, torch.nn.Module],
+    prepare_custom_config: PrepareCustomConfig,
+    parent_qconfig: QConfigAny,
+    parent_backend_config: Optional[BackendConfig],
+) -> Tuple[QConfigMapping, Tuple[Any, ...], PrepareCustomConfig, Optional[BackendConfig]]:
+    """
+    Returns the standalone module's QConfigMapping, example inputs,
+    PrepareCustomConfig and BackendConfig for `node`, assuming that the
+    module pointed to by `node` is a standalone module.
+    """
+    module_name = str(node.target)
+    module_type = type(named_modules[module_name])  # type: ignore[index]
+    # name config has precedence over type config
+    config_entry = StandaloneModuleConfigEntry(None, (), None, None)
+    config_entry = prepare_custom_config.standalone_module_classes.get(module_type, config_entry)
+    config_entry = prepare_custom_config.standalone_module_names.get(module_name, config_entry)
+    # fallback to use parent module's qconfig if user didn't specify qconfig dict
+    qconfig_mapping = config_entry.qconfig_mapping or QConfigMapping().set_global(parent_qconfig)
+    example_inputs = config_entry.example_inputs
+    prepare_custom_config = config_entry.prepare_custom_config or PrepareCustomConfig()
+    backend_config = config_entry.backend_config or parent_backend_config
+    return (qconfig_mapping, example_inputs, prepare_custom_config, backend_config)
+
+def _qat_swap_modules(
+        root: torch.nn.Module,
+        module_to_qat_module: Dict[Pattern, Type[torch.nn.Module]]) -> None:
+    convert(root, mapping=module_to_qat_module, inplace=True, remove_qconfig=False)
+
+def _add_matched_node_name_to_set(matched_node_pattern: NodePattern, s: Set[str]):
+    if isinstance(matched_node_pattern, Node):
+        s.add(matched_node_pattern.name)
+    elif isinstance(matched_node_pattern, (list, tuple)):
+        for maybe_node in matched_node_pattern:
+            _add_matched_node_name_to_set(maybe_node, s)
+
+def _insert_obs_or_fq(
+    node: Node,
+    obs_or_fq: ObserverOrFakeQuantize,
+    model: torch.nn.Module,
+    named_modules: Dict[str, torch.nn.Module],
+    graph: Graph,
+) -> Node:
+    """
+    Attaches `obs_or_fq` to `model`, and creates a node which calls
+    `obs_or_fq` on the output of `node`.
+
+    obs_or_fq: an instance of Observer or FakeQuantize module
+    """
+    model_device = assert_and_get_unique_device(model)
+    if model_device:
+        obs_or_fq.to(model_device)
+    # add obs_or_fq module as attribute
+    if is_equalization_observer(obs_or_fq):
+        prefix = node.name + '_equalization_process_'
+    else:
+        prefix = 'activation_post_process_'
+    get_new_obs_or_fq_name = get_new_attr_name_with_prefix(prefix)
+    obs_or_fq_name = get_new_obs_or_fq_name(model)
+    setattr(model, obs_or_fq_name, obs_or_fq)
+    named_modules[obs_or_fq_name] = obs_or_fq
+    with graph.inserting_after(node):
+        new_obs = graph.create_node(
+            'call_module', obs_or_fq_name, (node,), {})
+    return new_obs
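+
+# Effect on the graph (sketch): given `x -> linear1 -> linear2`, calling this helper on the
+# output of `linear1` attaches a new `activation_post_process_0` submodule and creates
+#
+#     x -> linear1 -> activation_post_process_0
+#
+# as an additional user of `linear1`; callers such as
+# _maybe_insert_input_observer_for_arg_or_kwarg then rewire `linear2` to consume
+# `activation_post_process_0` instead of `linear1`.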
+
+def _set_target_dtype_info_for_matched_node_pattern(
+    matched_node_pattern: NodePattern,
+    last_node: Node,
+    qconfig: QConfigAny,
+    qhandler: Optional[QuantizeHandler],
+    backend_config: BackendConfig,
+    named_modules: Dict[str, torch.nn.Module],
+    cache_for_no_tensor_check: Dict[Node, bool],
+    processed_nodes: Set[Node],
+) -> None:
+    """ Sets the target_dtype_info for each node in matched_node_pattern
+    Note: processed_nodes is used to ensure we only process each node once
+    """
+    if isinstance(matched_node_pattern, (list, tuple)):
+        for node_pattern in matched_node_pattern:
+            _set_target_dtype_info_for_matched_node_pattern(
+                node_pattern,
+                last_node,
+                qconfig,
+                qhandler,
+                backend_config,
+                named_modules,
+                cache_for_no_tensor_check,
+                processed_nodes
+            )
+
+    # set target_dtype_info if matched_node_pattern is a Node
+    # other types of matched object, e.g. int, float literals, are ignored
+    elif isinstance(matched_node_pattern, Node):
+        # for pyre
+        assert isinstance(matched_node_pattern, Node)
+        node = matched_node_pattern
+        if node in processed_nodes:
+            return
+        processed_nodes.add(node)
+
+        if qconfig is None:
+            return
+        # TODO: refactor the following code in terms of apply a qconfig to a pattern
+        # e.g. for a pattern with op1 -> op2 -> op3, and qconfig = QConfig(input_act=obs0, output_act=obs1)
+        # we set the input_obs_or_fq_ctr for the arguments of op1 based on qconfig.input_act,
+        # and set output_obs_or_fq_ctr based on qconfig.output_act
+        # this also requires we extend the structure of QConfig to support more fine
+        # grained configurations
+        target_dtype_info: Dict[str, Any] = (
+            _get_target_activation_dtype_for_node(
+                node,
+                qconfig,
+                qhandler,
+                named_modules,
+                backend_config,
+                cache_for_no_tensor_check,
+            )
+        )
+        node.meta["target_dtype_info"] = target_dtype_info
+
+def _get_target_activation_dtype_for_node(
+    node: Node,
+    qconfig: QConfigAny,
+    qhandler: Optional[QuantizeHandler],
+    named_modules: Dict[str, torch.nn.Module],
+    backend_config: BackendConfig,
+    cache_for_no_tensor_check: Dict[Node, bool],
+) -> Dict[str, Any]:
+    """
+    For each op attribute in the op's input activation, output activation,
+    weight, bias - returns the settings of dtype and is_dynamic we expect
+    for the `quantize` call in the reference model representation, or None
+    if there is no `quantize` call needed.
+
+    For example, if we have a node corresponding to `op0` in
+
+      x0 -> op0 -> x1
+
+    And we want a reference quantized representation to be
+
+      x0 -> quant_static -> dequant -> op0 -> quant_dynamic -> dequant -> x1
+
+    Then this function will return
+
+      {
+        "input_act_obs_or_fq_ctr": MinMaxObserver.with_args(dtype=torch.quint8, is_dynamic=False),
+        "output_act_obs_or_fq_ctr": MinMaxObserver.with_args(dtype=torch.quint8, is_dynamic=False),
+      }
+
+    TODO(future PR, if needed): explicitly spell out the non-Tensor
+    dtypes.
+    """
+    args_have_no_tensors = \
+        all_node_args_have_no_tensors(
+            node, named_modules, cache_for_no_tensor_check)
+    if args_have_no_tensors:
+        return {
+            "input_act_obs_or_fq_ctr": None,
+            "output_act_obs_or_fq_ctr": None,
+        }
+    # get qconfig to determine the eventual dtype of this node
+    if qconfig is not None:
+        act_dtype, weight_dtype, input_act_is_dynamic = \
+            get_qconfig_dtypes(qconfig)
+
+        # Currently `QConfig` only has one `activation` field.
+        # For static quantization, it is reused for both input
+        # and output activation. For dynamic quantization, this
+        # field is currently only used for the input activation,
+        # with the output activation being in fp32.
+        # In the future this may change as we add more fields
+        # to the `QConfig` object.
+        output_act_dtype = act_dtype \
+            if (not input_act_is_dynamic) else torch.float
+
+        bias_dtype = torch.float16 \
+            if (
+                act_dtype == torch.float16
+                and weight_dtype == torch.float16
+                and (not input_act_is_dynamic)
+            ) else torch.float
+
+        is_general_tensor_value_op = \
+            (qhandler is not None and qhandler.is_general_tensor_value_op())
+
+        _is_standalone_module = (
+            qhandler is not None and qhandler.is_standalone_module()
+        )
+
+        weight_index = None
+        if isinstance(node, Node) and node.op == "call_function" and \
+           node.target in backend_config._pattern_complex_format_to_config:
+            weight_index = backend_config._pattern_complex_format_to_config[node.target]._input_type_to_index.get("weight")
+
+        bias_index = None
+        if isinstance(node, Node) and node.op == "call_function" and \
+           node.target in backend_config._pattern_complex_format_to_config:
+            bias_index = backend_config._pattern_complex_format_to_config[node.target]._input_type_to_index.get("bias")
+
+        return {
+            "input_act_obs_or_fq_ctr": qconfig.activation,
+            "weight_obs_or_fq_ctr": qconfig.weight,
+            "bias_obs_or_fq_ctr": PlaceholderObserver.with_args(dtype=bias_dtype),
+            "weight_index": weight_index,
+            "bias_index": bias_index,
+            "output_act_obs_or_fq_ctr": qconfig.activation,
+            "reuse_input_obs_or_fq": _is_reuse_input_qconfig(qconfig),
+            "input_output_share_observers": is_general_tensor_value_op,
+            "_is_standalone_module": _is_standalone_module,
+        }
+    return copy.copy(_DEFAULT_FP32_QCONFIG_FOR_TARGET_DTYPE_INFO)
+
+def _get_output_act_obs_or_fq(
+    arg: Node,
+    named_modules: Dict[str, torch.nn.Module],
+    obs_or_fq_map: Dict[EdgeOrNode, ObserverOrFakeQuantize],
+    is_qat: bool,
+) -> ObserverOrFakeQuantize:
+    """ Get the observer or fake quant object for the argument in the
+    original graph, viewed as the output of the previous node,
+    skipping inserted observers
+
+    We are assuming that the observers are inserted correctly, and that the dtype for
+    the argument in the quantized graph will match what is specified by the qconfig
+    """
+    assert isinstance(arg, Node)
+    if "quantization_annotation" in arg.meta:
+        return _create_obs_or_fq_from_qspec(arg.meta["quantization_annotation"].output_qspec, obs_or_fq_map, is_qat)
+
+    # Custom module LSTM output is a tuple that we broke down into the internal nodes in order
+    # to insert DeQuantStubs (see `_insert_dequant_stubs_for_custom_module_lstm_output`).
+    # Since we modified the graph in this case, we must trace back from the args through
+    # the specific nodes we added in order to reach the original LSTM node. Otherwise, we would
+    # not be able to accurately detect whether this node is a consumer of custom module LSTM.
+    custom_module_lstm_node = _maybe_get_custom_module_lstm_from_node_arg(arg, named_modules)
+    output_act_obs_or_fq_ctr = None
+    if custom_module_lstm_node is not None:
+        output_act_obs_or_fq_ctr = custom_module_lstm_node.meta["target_dtype_info"]["output_act_obs_or_fq_ctr"]
+        output_act_obs_or_fq = output_act_obs_or_fq_ctr() if output_act_obs_or_fq_ctr else None
+    elif _is_activation_post_process_node(arg, named_modules):
+        observed_arg = arg.args[0]
+        assert isinstance(observed_arg, Node), "Currently we only support observing Node"
+        if "quantization_annotation" in observed_arg.meta:
+            output_act_obs_or_fq = \
+                _create_obs_or_fq_from_qspec(
+                    observed_arg.meta["quantization_annotation"].output_qspec, obs_or_fq_map, is_qat)
+        else:
+            assert "target_dtype_info" in observed_arg.meta
+            output_act_obs_or_fq_ctr = observed_arg.meta["target_dtype_info"]["output_act_obs_or_fq_ctr"]
+            output_act_obs_or_fq = output_act_obs_or_fq_ctr() if output_act_obs_or_fq_ctr else None
+    else:
+        if "target_dtype_info" in arg.meta:
+            output_act_obs_or_fq_ctr = \
+                arg.meta["target_dtype_info"].get("output_act_obs_or_fq_ctr", _DEFAULT_FP32_OBS_OR_FQ_CTR)
+        else:
+            output_act_obs_or_fq_ctr = _DEFAULT_FP32_OBS_OR_FQ_CTR
+        output_act_obs_or_fq = output_act_obs_or_fq_ctr() if output_act_obs_or_fq_ctr else None
+
+    return output_act_obs_or_fq
+
+def _get_arg_target_dtype_as_output(
+    arg: Node,
+    named_modules: Dict[str, torch.nn.Module],
+    obs_or_fq_map: Dict[EdgeOrNode, ObserverOrFakeQuantize],
+    is_qat: bool,
+) -> Optional[torch.dtype]:
+    arg_as_output_act_obs_or_fq = _get_output_act_obs_or_fq(arg, named_modules, obs_or_fq_map, is_qat)
+    arg_as_output_target_dtype, _ = _get_dtype_and_is_dynamic(arg_as_output_act_obs_or_fq)
+    return arg_as_output_target_dtype
+
+def _get_arg_as_input_act_obs_or_fq(
+    arg: Node,
+    node: Node,
+    named_modules: Dict[str, torch.nn.Module],
+    obs_or_fq_map: Dict[EdgeOrNode, ObserverOrFakeQuantize],
+    is_qat: bool,
+) -> Optional[ObserverOrFakeQuantize]:
+    """ Get the observer or fake quant instance for the Argument `arg`, used as
+    input to Node `node`
+    """
+    assert isinstance(arg, Node)
+    # "input_qspec_map" is the more general design we'll use for pt2e path
+    # it is a map from input argument node to observer or fake quant constructor, for example
+    # for the following graph:
+    # x -> conv -> output
+    #
+    # we may annotate conv node like the following:
+    # conv.meta[...] = QuantizationAnnotation("input_qspec_map": {x: MinMaxObserver.with_args(dtype=torch.qint8)}, ...)
+    #
+    if "quantization_annotation" in node.meta:
+        input_qspec_map = node.meta["quantization_annotation"].input_qspec_map
+        input_arg_qspec = _get_qspec_for_arg(arg, input_qspec_map, named_modules)
+        if input_arg_qspec is None:
+            input_arg_obs_or_fq = _DEFAULT_FP32_OBS_OR_FQ_CTR()
+        else:
+            input_arg_obs_or_fq = _create_obs_or_fq_from_qspec(input_arg_qspec, obs_or_fq_map, is_qat)
+        return input_arg_obs_or_fq
+
+    # we can remove the following path in the future if fx graph mode quantization is
+    # no longer used
+    is_weight = node_arg_is_weight(node, arg)
+    is_bias = node_arg_is_bias(node, arg)
+    is_activation = not is_weight and not is_bias
+    obs_or_fq_ctr = None
+    if is_activation:
+        obs_or_fq_ctr = node.meta["target_dtype_info"].get("input_act_obs_or_fq_ctr", _DEFAULT_FP32_OBS_OR_FQ_CTR)
+    elif is_weight:
+        if node.target not in NON_QUANTIZABLE_WEIGHT_OPS:
+            obs_or_fq_ctr = node.meta["target_dtype_info"].get("weight_obs_or_fq_ctr", _DEFAULT_FP32_OBS_OR_FQ_CTR)
+    else:
+        obs_or_fq_ctr = node.meta["target_dtype_info"].get("bias_obs_or_fq_ctr", _DEFAULT_FP32_OBS_OR_FQ_CTR)
+    return obs_or_fq_ctr() if obs_or_fq_ctr else None
+
+def _maybe_insert_input_observer_for_arg_or_kwarg(
+    node: Union[Node, Any],
+    arg: Argument,
+    qconfig: QConfigAny,
+    model: torch.nn.Module,
+    named_modules: Dict[str, torch.nn.Module],
+    graph: Graph,
+    qhandler: Optional[QuantizeHandler],
+    prepare_custom_config: PrepareCustomConfig,
+    obs_or_fq_map: Dict[EdgeOrNode, ObserverOrFakeQuantize],
+    is_qat: bool,
+    backend_config: Optional[BackendConfig] = None,
+) -> Argument:
+    """
+    Given a `node` and an `arg`, inserts an input observer between
+    `node` and `arg` if necessary.
+    """
+    # for ops such as torch.cat([x0, x1]),
+    # traverse through the list
+    if isinstance(arg, (list, tuple)):
+        new_arg_to_return = []
+        for inner_arg in arg:
+            new_inner_arg = _maybe_insert_input_observer_for_arg_or_kwarg(
+                node, inner_arg, qconfig, model, named_modules,
+                graph,
+                qhandler,
+                prepare_custom_config,
+                obs_or_fq_map,
+                is_qat,
+                backend_config)
+            new_arg_to_return.append(new_inner_arg)
+        return type(arg)(new_arg_to_return)
+
+    if not isinstance(arg, Node):
+        return arg
+    assert isinstance(arg, Node)
+    # default (no observer)
+    new_arg = arg
+
+    is_standalone_module = qhandler is not None and qhandler.is_standalone_module()
+    # TODO: move this to a separate function
+    if not is_standalone_module:
+        # Note: qconfig can be None in this branch since we are getting act/fq from
+        # node.meta now
+        # regular flow for most nodes, except standalone modules
+
+        if "quantization_annotation" in node.meta:
+            reuse_input_obs_or_fq = node.meta["quantization_annotation"]._reuse_input_obs_or_fq
+        else:
+            assert "target_dtype_info" in node.meta
+            # TODO: we are assuming "target_dtype_info" exists here; maybe
+            # a default value also needs to be provided here
+            target_dtype_info = node.meta["target_dtype_info"]
+            # for nodes that don't have `reuse_input_obs_or_fq` configured,
+            # we'll default to False; this makes configuring this field optional for users
+            reuse_input_obs_or_fq = target_dtype_info.get("reuse_input_obs_or_fq", False)
+        arg_as_input_act_obs_or_fq = _get_arg_as_input_act_obs_or_fq(arg, node, named_modules, obs_or_fq_map, is_qat)
+        arg_as_input_target_dtype, arg_as_input_target_is_dynamic = _get_dtype_and_is_dynamic(arg_as_input_act_obs_or_fq)
+
+        arg_as_output_act_obs_or_fq = _get_output_act_obs_or_fq(arg, named_modules, obs_or_fq_map, is_qat)
+        arg_as_output_target_dtype, arg_as_output_target_is_dynamic = _get_dtype_and_is_dynamic(arg_as_output_act_obs_or_fq)
+
+
+        needs_obs_or_fq = _needs_obs_or_fq(
+            arg_as_output_target_dtype,
+            arg_as_output_target_is_dynamic,
+            arg_as_input_target_dtype,
+            arg_as_input_target_is_dynamic,
+            reuse_input_obs_or_fq,
+            is_zeroth_arg=len(node.args) > 0 and arg is node.args[0],
+        )
+
+    else:
+        assert qconfig is not None
+        # custom flow for standalone modules
+        _, _, sm_prepare_custom_config, _ = \
+            _get_standalone_module_configs(
+                node, named_modules, prepare_custom_config, qconfig, backend_config)
+        sm_input_quantized_idxs = sm_prepare_custom_config.input_quantized_indexes
+
+        # for args, this is set to the index of the current arg
+        # for kwargs, this is left at None
+        cur_input_idx = None
+        for arg_idx, arg_to_check in enumerate(node.args):
+            if arg_to_check is arg:
+                cur_input_idx = arg_idx
+                break
+
+        if cur_input_idx is None:
+            needs_obs_or_fq = False
+        else:
+            arg_as_output_target_dtype = _get_arg_target_dtype_as_output(arg, named_modules, obs_or_fq_map, is_qat)
+            arg_as_input_target_dtype = torch.quint8 if cur_input_idx in sm_input_quantized_idxs \
+                else torch.float
+            needs_obs_or_fq = (
+                (arg_as_output_target_dtype != arg_as_input_target_dtype) and
+                (arg_as_input_target_dtype != torch.float)
+            )
+
+        act_post_process_ctr = qconfig.activation
+        arg_as_input_act_obs_or_fq = act_post_process_ctr() if act_post_process_ctr else None
+
+    if needs_obs_or_fq:
+
+        existing_obs_node = None
+
+        # Before creating a new observer, check if an observer
+        # of the correct type already exists for this arg. If it does, reuse it.
+        # This prevents duplicate observer insertions when a node is
+        # used by multiple consumers.
+        # TODO: this peeks at how the value is used downstream; we should remove it.
+        # Removing it means we insert one observer for each use, even if they
+        # have the same dtype; an extra pass could then remove the redundant observers.
+        for maybe_obs_node in arg.users.keys():
+            if maybe_obs_node.op == 'call_module':
+                maybe_obs_mod = named_modules[maybe_obs_node.target]  # type: ignore[index]
+                if (
+                    type(maybe_obs_mod) == type(arg_as_input_act_obs_or_fq) and
+                    maybe_obs_mod.dtype == arg_as_input_target_dtype  # type: ignore[possibly-undefined]
+                ):
+                    arg_as_input_act_obs_or_fq = maybe_obs_mod  # type: ignore[assignment]
+                    existing_obs_node = maybe_obs_node
+                    break
+
+        assert arg_as_input_act_obs_or_fq is not None
+        obs_or_fq_map[(arg, node)] = arg_as_input_act_obs_or_fq
+        if existing_obs_node is None:
+            new_obs_node = _insert_obs_or_fq(
+                arg, arg_as_input_act_obs_or_fq, model, named_modules, graph)
+            # override this arg to be the observed arg
+            new_arg = new_obs_node
+        else:
+            new_arg = existing_obs_node
+
+    return new_arg
+
+
+def _maybe_insert_input_observers_for_node(
+    node: Node,
+    qconfig: QConfigAny,
+    model: torch.nn.Module,
+    named_modules: Dict[str, torch.nn.Module],
+    graph: Graph,
+    qhandler: Optional[QuantizeHandler],
+    prepare_custom_config: PrepareCustomConfig,
+    obs_or_fq_map: Dict[EdgeOrNode, ObserverOrFakeQuantize],
+    is_qat: bool,
+    backend_config: Optional[BackendConfig] = None
+) -> None:
+    """
+    If needed, inserts observers to the input args and kwargs of `node`.
+    Note: modifies `node` inplace.
+
+    For example, if cur_node needs an observer after prev_node, we change from
+
+      prev_node -> cur_node
+
+    To
+
+      prev_node -> obs -> cur_node
+
+    Note: backend_config only needed for standalone_module node
+    """
+    # Look through every input arg.  If that arg's target dtype does not
+    # match the current node's target dtype, insert an observer.
+    new_args = []
+    for arg in node.args:
+        new_arg = _maybe_insert_input_observer_for_arg_or_kwarg(
+            node, arg, qconfig, model, named_modules, graph,
+            qhandler,
+            prepare_custom_config,
+            obs_or_fq_map,
+            is_qat,
+            backend_config)
+        new_args.append(new_arg)
+
+    new_kwargs = {}
+    for k, kwarg in node.kwargs.items():
+        new_kwarg = _maybe_insert_input_observer_for_arg_or_kwarg(
+            node, kwarg, qconfig, model, named_modules, graph,
+            qhandler,
+            prepare_custom_config,
+            obs_or_fq_map,
+            is_qat,
+            backend_config)
+        new_kwargs[k] = new_kwarg
+
+    # assign the new args and kwargs to the node, inplace
+    node.args = tuple(new_args)
+    node.kwargs = new_kwargs
+
+def _maybe_insert_input_equalization_observers_for_node(
+    node: Node,
+    equalization_qconfig: Any,
+    model: torch.nn.Module,
+    named_modules: Dict[str, torch.nn.Module],
+    graph: Graph,
+    is_branch: bool,
+) -> None:
+    """
+    If `node` needs to be equalized, finds the input/weight observer constructors
+    it needs in `equalization_qconfig`, creates the observers, and inserts them into `graph`.
+
+    If `node` does not need an equalization observer, returns None.
+    """
+    if equalization_qconfig is None or not node_supports_equalization(node, named_modules):
+        return
+
+    if is_branch:
+        warnings.warn(
+            f"Cannot equalize {node} because it is part of a branch."
+        )
+        return
+
+    new_args = []
+    for arg in node.args:
+        if not isinstance(arg, Node) or node_arg_is_bias(node, arg):
+            new_args.append(arg)
+            continue
+
+        is_weight = node_arg_is_weight(node, arg)
+
+        act_eq_process_ctr = equalization_qconfig.weight if is_weight else \
+            equalization_qconfig.input_activation
+
+        new_eq_obs_mod = act_eq_process_ctr()
+        new_eq_obs_node = _insert_obs_or_fq(
+            arg, new_eq_obs_mod, model, named_modules, graph)
+
+        new_args.append(new_eq_obs_node)
+
+    # assign the new args and kwargs to the node, inplace
+    node.args = tuple(new_args)
+
+def _maybe_insert_output_observer_for_node(
+    node: Node,
+    model: torch.nn.Module,
+    named_modules: Dict[str, torch.nn.Module],
+    graph: Graph,
+    obs_or_fq_map: Dict[EdgeOrNode, ObserverOrFakeQuantize],
+    is_qat: bool,
+) -> Optional[Node]:
+    """
+    If `node` needs an output observer, creates it, inserts it into `graph`
+    and returns it.
+
+    If `node` does not need an output observer, returns None.
+
+    Note: inserting dynamic quantization ops for output is not supported in fx graph mode
+    quantization code path right now
+    """
+    assert node.op != 'output', 'observer insertion for outputs is handled elsewhere'
+
+    is_standalone_module = False
+    if "quantization_annotation" in node.meta:
+        output_act_obs_or_fq = _create_obs_or_fq_from_qspec(
+            node.meta["quantization_annotation"].output_qspec, obs_or_fq_map, is_qat
+        )
+    else:
+        assert "target_dtype_info" in node.meta
+        is_standalone_module = node.meta["target_dtype_info"].get("_is_standalone_module", False)
+        output_act_obs_or_fq_ctr = node.meta["target_dtype_info"].get("output_act_obs_or_fq_ctr")
+        output_act_obs_or_fq = output_act_obs_or_fq_ctr() if output_act_obs_or_fq_ctr else None
+    target_dtype, target_is_dynamic = _get_dtype_and_is_dynamic(output_act_obs_or_fq)
+    # uncomment after we support reuse_input_obs_or_fq properly by having a separate
+    # implementation for this key instead of reusing the input_output_share_observers
+    # code
+    # reuse_input_obs_or_fq = node.meta["target_dtype_info"].get("reuse_input_obs_or_fq", False)
+    # for now we set this to False since reuse_input_obs_or_fq for
+    # the output of a node is implemented in the same code path as observer sharing;
+    # we should refactor this part to make it clearer in the future,
+    # and then we would be able to read this from the config directly
+    reuse_input_obs_or_fq = False
+
+    # Note: prev_output_dtype = torch.float and prev_output_is_dynamic=False
+    # because the prev_output is the output of an fp32 op, although technically
+    # we should get the dtype of the output from node.meta["val"] in the future
+    # if we deprecate fx graph mode quantization
+    needs_obs_or_fq = _needs_obs_or_fq(torch.float, False, target_dtype, target_is_dynamic, reuse_input_obs_or_fq)
+    # currently the activation in QConfig(activation=...) applies to both input
+    # and output, and when the activation is configured for dynamic quantization,
+    # e.g. PlaceholderObserver(dtype=torch.quint8, is_dynamic=True, ...), it means
+    # the input should be dynamically quantized, but the output should not be quantized
+    #
+    # there is no way to specify different observers/fqs for input and output
+    # activation through QConfig today; this limitation is lifted in the
+    # quantizer/annotation API of the PyTorch 2.0 export quantization code path,
+    # but since this code is reused there, annotating an output to be dynamically
+    # quantized does not work for that path either.
+    # we could change QConfig to support separate input/output activation if we want
+    # to remove the following check, or we can deprecate fx graph mode quantization
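+    # Illustrative example of the dynamic case described above, assuming the
+    # stock observer classes:
+    #   qconfig = QConfig(
+    #       activation=PlaceholderObserver.with_args(dtype=torch.quint8, is_dynamic=True),
+    #       weight=MinMaxObserver.with_args(dtype=torch.qint8),
+    #   )
+    # with such a qconfig the input is dynamically quantized at runtime, so no
+    # static output observer should be inserted for this node.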
+    if target_is_dynamic:
+        needs_obs_or_fq = False
+
+    # we never insert observers at the output of a standalone module; we assume
+    # that, if needed, they are inserted inside the standalone module
+    needs_obs_or_fq = needs_obs_or_fq and \
+        (not is_standalone_module)
+
+    if needs_obs_or_fq:
+        obs_or_fq_map[node] = output_act_obs_or_fq
+        return _insert_obs_or_fq(node, output_act_obs_or_fq, model, named_modules, graph)
+    else:
+        return None
+
+def _maybe_insert_observers_before_graph_output(
+    graph_output_node: Node,
+    model: torch.nn.Module,
+    named_modules: Dict[str, torch.nn.Module],
+    graph: Graph,
+    obs_or_fq_map: Dict[EdgeOrNode, ObserverOrFakeQuantize],
+    is_qat: bool,
+) -> None:
+    """
+    If the output needs to be quantized and there are any nodes
+    in the output which are not already observed, inserts observers
+    for those nodes.
+    """
+
+    def _recursive_maybe_replace_node_with_obs(
+        maybe_node: Argument,
+        model: torch.nn.Module,
+        named_modules: Dict[str, torch.nn.Module],
+        graph: Graph,
+    ) -> Argument:
+        """
+        Navigate an arbitrary data structure of lists, tuples, dicts.
+        For each container type, recurse on all inputs. Once any Node
+        is found, insert an observer if needed and do not recurse further.
+
+        For example, given a structure of
+
+          {'foo1': [[bar1]], 'foo2': {'foo3': [[[bar3]]]}}
+
+        we recurse down to bar1 and bar3, observe them if necessary,
+        and if we inserted an observer then replace the original node
+        with its observer.
+
+        Returns the data structure with all nodes needing observation being
+        replaced by their observers.
+        """
+        if isinstance(maybe_node, Node):
+            # check dtype of this node
+            arg_as_output_target_dtype = _get_arg_target_dtype_as_output(maybe_node, named_modules, obs_or_fq_map, is_qat)
+            observer_mod = None
+            arg_as_input_target_dtype = torch.float
+            if "target_dtype_info" in maybe_node.meta:
+                observer_cls = maybe_node.meta["target_dtype_info"].get("input_act_obs_or_fq_ctr", None)
+                if observer_cls is not None:
+                    observer_mod = observer_cls()
+                    arg_as_input_target_dtype = observer_mod.dtype
+            # TODO: this does not handle dynamic quantization yet
+            need_obs = (
+                arg_as_output_target_dtype != arg_as_input_target_dtype and
+                arg_as_input_target_dtype != torch.float
+            )
+            if need_obs:
+                assert observer_mod is not None
+                # insert observer
+                observer_node = _insert_obs_or_fq(
+                    maybe_node, observer_mod, model, named_modules, graph)
+                return observer_node
+            else:
+                return maybe_node
+        elif isinstance(maybe_node, (list, tuple)):
+            results = []
+            for inner_node in maybe_node:
+                results.append(_recursive_maybe_replace_node_with_obs(
+                    inner_node, model, named_modules, graph))
+            if isinstance(maybe_node, list):
+                return results
+            else:
+                return tuple(results)
+        elif isinstance(maybe_node, dict):
+            results_dict = {}
+            for k, inner_v in maybe_node.items():
+                results_dict[k] = _recursive_maybe_replace_node_with_obs(
+                    inner_v, model, named_modules, graph)
+            return results_dict
+        elif maybe_node is None:
+            return None
+        else:
+            raise Exception("Unhandled type for returned node:", maybe_node)
+
+    new_args = []
+    for old_arg in graph_output_node.args:
+        new_args.append(
+            _recursive_maybe_replace_node_with_obs(
+                old_arg, model, named_modules, graph))
+
+    graph_output_node.args = tuple(new_args)  # type: ignore[assignment]
+
+
+def _maybe_propagate_dtype_for_node(
+    node: Node,
+    target_dtype: Union[torch.dtype, type],
+    node_name_to_match_result_with_qconfig: Dict[str, _MatchResultWithQConfig],
+) -> None:
+    """
+    Assigns `target_dtype` to `node`, setting `is_dynamic` to False. If `node`
+    is a general tensor shape op, also call this function recursively on
+    the first argument, to propagate the dtype to the caller.
+    """
+    node.meta["target_dtype_info"]["input_act_obs_or_fq_ctr"] = None
+    node.meta["target_dtype_info"]["output_act_obs_or_fq_ctr"] = None
+    # if this is a copy node, propagate to first arg
+    root_node, _, pattern, qhandler, qconfig = node_name_to_match_result_with_qconfig.get(
+        node.name, (None, None, None, None, None))
+    # TODO: probably need to remove `is_general_tensor_value_op`
+    if qhandler is not None and qhandler.is_general_tensor_value_op():
+        prev_node = node.args[0]
+        if isinstance(prev_node, Node):
+            _maybe_propagate_dtype_for_node(
+                prev_node, target_dtype, node_name_to_match_result_with_qconfig)
+
+def propagate_dtypes_for_known_nodes(
+    graph: Graph,
+    node_name_to_match_result_with_qconfig: Dict[str, _MatchResultWithQConfig],
+) -> None:
+    """
+    Currently we assume that inputs to the graph are either `torch.float` or
+    `torch.quint8`, which is not always correct. For ops such as
+    `x.masked_fill(mask, value)`, we know that the dtype of `mask` is a
+    `BoolTensor`. Propagate this information throughout the graph.
+
+    Note: not all dtypes in the graph will be correct after this pass, but a
+    higher percentage of them will be correct. Hopefully in the future we can
+    replace this with a better way to reason about dtypes of tensors.
+    """
+    for node in graph.nodes:
+        non_observable_arg_dict = get_non_observable_arg_indexes_and_types(node)
+
+        for arg_type in non_observable_arg_dict:
+            non_observable_indices = non_observable_arg_dict[arg_type](node)
+
+            for index in non_observable_indices:
+                arg = node.args[index]
+
+                # when an argument is a tuple, it does not show up as another node so we need to go through
+                # all elements of the tuple manually
+                if isinstance(arg, (tuple, list)):
+                    arg_list = list(arg)
+                else:
+                    arg_list = [arg]
+
+                for cur_arg in arg_list:
+                    # hard coded arguments show up but aren't `Node` typed and do not need dtype propagated
+                    if isinstance(cur_arg, torch.fx.node.Node):
+                        _maybe_propagate_dtype_for_node(
+                            cur_arg, arg_type, node_name_to_match_result_with_qconfig)
+
+def _maybe_make_input_output_share_observers(
+    node: Node,
+    model: torch.nn.Module,
+    named_modules: Dict[str, torch.nn.Module],
+) -> bool:
+    """
+    Ensures that we share an observer
+    for all input arguments as well as the output argument. In detail, given
+    a graph of
+
+      x0 -> obs0 -> op -> obs2 -> x2
+                  /
+      x1 -> obs1 /
+
+    where node obs0 points to observer instance observer0,
+    obs1 points to observer1 and obs2 points to observer2, we make nodes obs1
+    and obs2 point to observer0.
+    Returns: whether the operation succeeded or not
+    """
+    first_arg = None
+    # find the first arg that is a Node (or a list/tuple of Nodes)
+    for i in range(len(node.args)):
+        if isinstance(node.args[i], (Node, list, tuple)):
+            first_arg = node.args[i]
+            break
+
+    # if there is no such arg, return directly
+    if first_arg is None:
+        return False
+
+    if isinstance(first_arg, (list, tuple)):
+        first_arg_arg = first_arg[0]
+    elif isinstance(first_arg, Node):
+        first_arg_arg = first_arg
+    else:
+        return False
+
+    # if we have a graph such as
+    #   observed_node -> non_observed_node -> cat
+    # we need to navigate up to the first observer
+    iteration_guard = 0
+    while not _is_activation_post_process_node(first_arg_arg, named_modules):
+        if not isinstance(first_arg_arg, Node):
+            return False
+        # did not find an activation_post_process for the op
+        if first_arg_arg.op == "placeholder":
+            return False
+        # trace back the args until we find the first Node
+        trace_back_node = None
+        for i in range(len(first_arg_arg.args)):
+            trace_back_node = first_arg_arg.args[i]
+            if isinstance(trace_back_node, Node):
+                break
+        if trace_back_node is None:
+            return False
+        first_arg_arg = trace_back_node
+
+        iteration_guard += 1
+        if iteration_guard > 10000:
+            raise AssertionError('Unable to find observer of previous node')
+
+    assert isinstance(first_arg_arg, Node)
+    target_to_use = first_arg_arg.target
+    assert isinstance(target_to_use, str)
+    obs_mod_to_use = named_modules[target_to_use]
+
+    if isinstance(first_arg, (list, tuple)):
+        # set all other input observer nodes to use that module
+        for input_idx, input_arg in enumerate(first_arg):
+            if input_idx == 0:
+                continue
+            iteration_guard = 0
+            while not _is_activation_post_process_node(input_arg, named_modules):
+                # failed to trace back since no input arg for the current node
+                if len(input_arg.args) < 1:
+                    return False
+                input_arg = input_arg.args[0]
+                iteration_guard += 1
+                if iteration_guard > 10000:
+                    raise AssertionError('Unable to find observer of previous node')
+
+            parent_name, name = _parent_name(input_arg.target)
+            setattr(named_modules[parent_name], name, obs_mod_to_use)
+
+    # set the output observer node to use that module
+    for output_obs_node in node.users.keys():
+        assert _is_activation_post_process_node(output_obs_node, named_modules)
+        parent_name, name = _parent_name(output_obs_node.target)
+        setattr(named_modules[parent_name], name, obs_mod_to_use)
+
+    # TODO(future PR): delete the orphaned observer modules
+    return True
+
+def _remove_output_observer(
+        node: Node,
+        model: torch.nn.Module,
+        named_modules: Dict[str, torch.nn.Module]):
+    items = list(node.users.items())
+    for output_obs_node, _ in items:
+        assert _is_activation_post_process_node(output_obs_node, named_modules)
+        output_obs_node.replace_all_uses_with(node)
+        model.graph.erase_node(output_obs_node)  # type: ignore[union-attr, operator]
+
+def _swap_custom_module_to_observed(
+        node: Node,
+        qconfig: QConfigAny,
+        named_modules: Dict[str, torch.nn.Module],
+        prepare_custom_config: PrepareCustomConfig):
+    custom_module = named_modules[node.target]  # type: ignore[index]
+    custom_module_class_mapping = prepare_custom_config.float_to_observed_mapping
+    observed_custom_module_class = \
+        get_swapped_custom_module_class(
+            custom_module, custom_module_class_mapping, qconfig)
+    observed_custom_module = \
+        observed_custom_module_class.from_float(custom_module)
+    parent_name, name = _parent_name(node.target)
+    setattr(named_modules[parent_name], name, observed_custom_module)
+
+def insert_observers_for_model(
+    model: GraphModule,
+    node_name_to_match_result_with_qconfig: Dict[str, _MatchResultWithQConfig],
+    node_name_to_qconfig: Dict[str, QConfigAny],
+    prepare_custom_config: PrepareCustomConfig,
+    equalization_config_map: Dict[str, Any],
+    backend_config: BackendConfig,
+    observed_node_names: Set[str],
+    is_qat: bool,
+) -> Optional[Node]:
+    """
+    Inserts observers, using the following high level algorithm:
+
+    For each node in the graph:
+      1. determine the target dtype of this node in the quantized graph, and save
+           it for future steps
+      2. determine the target dtype of all args and kwargs of this node
+      3. if any arg or kwarg's target dtype does not match the current node's
+           dtype, insert an observer
+      4. if the current node needs an output observer, insert it
+
+    For example:
+
+    - starting graph:
+        x0 -> linear -> x1
+
+    - observed graph after processing x0:
+        x0(fp32)
+
+    - observed graph after processing linear:
+        x0(fp32) -> x0_obs0(int8) -> linear(int8) -> linear_obs0(int8)
+
+    - observed graph after processing x1:
+        x0(fp32) -> x0_obs0(int8) -> linear(int8) -> linear_obs0(int8) -> x1
+
+    After a node is processed, the naive observer placement is guaranteed to be
+    complete for that node and all of its predecessors. There can be future
+    passes which optimize the graph by deduplicating observers, etc.
+    """
+
+    # node.meta["target_dtype_info"] stores the target dtype information
+    # that's derived from qconfig for the Node, for example, if we have
+    # a conv2d node that has a qconfig
+    # qconfig = QConfig(activation=..., weight=...)
+    # # information for input and bias node omitted
+    # # for getattr node
+    # # weight = getattr(self, 'weight')
+    # weight.meta["target_dtype_info"] = {
+    #    'output_act_obs_or_fq_ctr': qconfig.weight,
+    # }
+    # # for conv2d node
+    # # conv2d = call_function[target=torch.nn.functional.conv2d](
+    # #            args=(input, weight, bias))
+    # conv2d.meta["target_dtype_info"] = {
+    #   'input_act_obs_or_fq_ctr': qconfig.activation,
+    #   'weight_obs_or_fq_ctr': qconfig.weight,
+    #   'bias_obs_or_fq_ctr': PlaceholderObserver.with_args(dtype=torch.float32),
+    #   'output_act_obs_or_fq_ctr': qconfig.activation,
+    # }
+    #
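+    # As a concrete (illustrative) example, assuming the stock defaults, such a
+    # qconfig could come from
+    #   qconfig = torch.ao.quantization.get_default_qconfig("fbgemm")
+    # whose `activation` and `weight` fields are observer constructors created
+    # with <ObserverClass>.with_args(...).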
+    cache_for_no_tensor_check: Dict[Node, bool] = {}
+
+    # first, populate the dtype map based only on qconfig and qhandler
+    # this assumes:
+    # graph inputs are fp32 by default, and int8 where overridden
+    # other nodes' output dtype is specified by the qconfig
+    named_modules = dict(model.named_modules(remove_duplicate=False))
+
+    input_quantized_idxs: List[int] = prepare_custom_config.input_quantized_indexes
+    output_quantized_idxs: List[int] = prepare_custom_config.output_quantized_indexes
+    processed_nodes: Set[Node] = set()
+    # initialize target_dtype_info
+    for node in model.graph.nodes:
+        node.meta["target_dtype_info"] = copy.copy(_DEFAULT_FP32_QCONFIG_FOR_TARGET_DTYPE_INFO)
+
+    inputs_seen_counter = 0
+    outputs_seen_counter = 0
+    placeholder_node_to_input_index: Dict[Node, int] = {}
+    # TODO: we probably don't need this counter since each graph will only have
+    # one output node?
+    output_node_to_output_index: Dict[Node, int] = {}
+    for node in model.graph.nodes:
+        if node.op == "placeholder":
+            placeholder_node_to_input_index[node] = inputs_seen_counter
+            inputs_seen_counter += 1
+        if node.op == "output":
+            output_node_to_output_index[node] = outputs_seen_counter
+            outputs_seen_counter += 1
+
+    # Step 1, set the observer or fake quantize module constructor for each node in the
+    # matched_node_pattern
+
+    for match_res_with_qconfig in node_name_to_match_result_with_qconfig.values():
+        last_node, matched_node_pattern, pattern, qhandler, qconfig = match_res_with_qconfig
+        assert qhandler is not None
+        _set_target_dtype_info_for_matched_node_pattern(
+            matched_node_pattern,
+            last_node,
+            qconfig,
+            qhandler,
+            backend_config,
+            named_modules,
+            cache_for_no_tensor_check,
+            processed_nodes
+        )
+
+    # Step 2. Special cases for some operators, we might be able to remove them
+    # in the future if we know dtype information of each node better
+
+    # Step 2.1. some settings are not based on patterns, we need to process each node
+    # instead
+    for node in model.graph.nodes:
+        if node.op == "placeholder" and placeholder_node_to_input_index[node] in input_quantized_idxs:
+            # users are not supposed to call calculate_qparams on PlaceholderObserver, and
+            # that is OK here because we only use it to encode the dtype of the input
+            # tensor; we won't actually insert these observers in the graph and won't
+            # actually call calculate_qparams
+            node.meta["target_dtype_info"] = copy.copy(_DEFAULT_QUINT8_QCONFIG_FOR_TARGET_DTYPE_INFO)
+        elif node.op in ("call_module", "call_method", "call_function"):
+            args_have_no_tensors = \
+                all_node_args_have_no_tensors(
+                    node, named_modules, cache_for_no_tensor_check)
+            if args_have_no_tensors:
+                node.meta["target_dtype_info"] = {
+                    "input_act_obs_or_fq_ctr": None,
+                    "output_act_obs_or_fq_ctr": None,
+                }
+        elif node.op == "output" and output_node_to_output_index[node] in output_quantized_idxs:
+            # TODO(future PR): update the output_quantized_idxs API to match
+            # arbitrary data structures. There is always a single output, and
+            # that output can have arbitrary nesting of values. List[int] is
+            # not the right data type for this.
+
+            # TODO(future PR): support more dtypes in model outputs, if necessary
+            node.meta["target_dtype_info"] = copy.copy(_DEFAULT_QUINT8_QCONFIG_FOR_TARGET_DTYPE_INFO)
+
+    # Step 2.2, for nodes with known input dtypes, propagate them throughout the
+    # graph. For example, if there is a call such as
+    #   x1 = x0.masked_fill(mask, 1)
+    # we propagate the type of mask to be torch.bool
+    propagate_dtypes_for_known_nodes(model.graph, node_name_to_match_result_with_qconfig)
+
+    # Step 3, check if the requested target_dtype_info is supported by backend or not
+    # if not, we'll reset the target_dtype_info to use the default (float Tensor)
+
+    # reset the counters and set of processed_nodes
+    processed_nodes: Set[Node] = set()
+    for match_res_with_qconfig in node_name_to_match_result_with_qconfig.values():
+        last_node, matched_node_pattern, pattern, qhandler, qconfig = match_res_with_qconfig
+        is_supported_by_backend = _is_pattern_dtype_config_and_qconfig_supported_by_backend(
+            pattern, matched_node_pattern, qconfig, backend_config)
+        assert qhandler is not None
+
+        # get output_act_dtype so that we don't also reset the special typed nodes
+        # TODO: we might want to handle these more uniformly with the default path
+        # this can be improved if we can use node.meta["val"]
+        output_act_or_fq_ctr = node.meta["target_dtype_info"]["output_act_obs_or_fq_ctr"]
+        output_act_or_fq = output_act_or_fq_ctr() if output_act_or_fq_ctr else None
+        output_act_dtype, _ = _get_dtype_and_is_dynamic(output_act_or_fq)
+        if not is_supported_by_backend and output_act_dtype not in [None, int, float, torch.bool]:
+            # restore target_dtype_info to default if it is not supported by backend
+            _set_target_dtype_info_for_matched_node_pattern(
+                matched_node_pattern,
+                last_node,
+                torch.ao.quantization.qconfig._default_fp32_placeholder_qconfig,
+                None,
+                backend_config,
+                named_modules,
+                cache_for_no_tensor_check,
+                processed_nodes
+            )
+
+    # After this point, the current node and all of its arguments
+    # have a target_dtype_info assigned. Now, we insert observers for inputs
+    # of this node (if needed for this node), and the output of this node
+    # (if needed for this node).
+
+    # Since we are mutating the graph as we go, we iterate over the original
+    # nodes before observer insertion, instead of model.graph.nodes.
+    nodes_before_observation = list(model.graph.nodes)
+
+    # Avoid duplicate custom module swaps for multiple nodes with the same target.
+    custom_module_names_already_swapped: Set[str] = set()
+
+    # TODO: reuse placeholder_node_to_input_index and output_node_to_output_index
+    # reset inputs/outputs counters
+    inputs_seen_counter = 0
+    outputs_seen_counter = 0
+    results_node = None
+    obs_or_fq_map: Dict[EdgeOrNode, ObserverOrFakeQuantize] = {}
+
+    # TODO: change this to insert obs/fq by pattern instead of by node
+    for node in nodes_before_observation:
+
+        if node.op == 'placeholder':
+            # if a graph input is in fp32, it does not need observation
+            # if a graph input is in int8, we assume the observation happens
+            #   outside of the graph, and no additional observation is needed
+            pass
+
+        elif node.op in ('call_module', 'call_method', 'call_function', 'output'):
+            # check for matches
+            last_node, matched_node_pattern, pattern, qhandler, qconfig = (
+                node_name_to_match_result_with_qconfig.get(node.name, (None, None, None, None, None))  # type: ignore[assignment]
+            )
+            equalization_qconfig = equalization_config_map.get(node.name, None)
+
+            this_node_dtype_info = node.meta["target_dtype_info"]
+            if "val" in node.meta:
+                output_is_a_tensor = (
+                    this_node_dtype_info is not None and
+                    isinstance(node.meta["val"], FakeTensor)
+                )
+            else:
+                output_is_a_tensor = this_node_dtype_info is not None
+
+            skip_inserting_observers = (
+                (qconfig is None) or
+                not output_is_a_tensor
+            ) and (
+                not node.op == 'output'
+            )
+
+            # TODO: take a closer look to see if we can remove this check
+            # right now it is here because of `observed_node_names`, we are using
+            # it as an indicator for swapping the modules to reference modules in
+            # convert
+            is_supported_by_backend = _is_pattern_dtype_config_and_qconfig_supported_by_backend(
+                pattern, matched_node_pattern, qconfig, backend_config)
+
+            if not skip_inserting_observers and is_supported_by_backend:
+                named_modules = dict(model.named_modules(remove_duplicate=False))
+                if node.op != 'output':
+                    assert matched_node_pattern is not None
+                    # add matched nodes to the observed node name set
+                    _add_matched_node_name_to_set(matched_node_pattern, observed_node_names)
+
+                    # This is currently only used for equalization.
+                    # Checks if the current node is in a branch in which the
+                    # first two layers are both being quantized.
+                    #
+                    # ex.       conv2
+                    #         /
+                    #      x -> conv1
+                    #
+                    # If this is the case, we will not apply equalization to the
+                    # initial two layers.
+                    is_quantized_branch = False
+                    if (
+                        len(node.args) > 0 and
+                        isinstance(node.args[0], Node) and
+                        len(node.args[0].users) > 1
+                    ):
+                        for user in node.args[0].users:
+                            # Checks if there exists another user being quantized
+                            is_user_quantized = (
+                                node_name_to_qconfig.get(user.name, None) is not None or
+                                (user.op == 'call_module' and isinstance(named_modules[str(user.target)], ObserverBase))
+                            )
+                            if user != node and is_user_quantized:
+                                is_quantized_branch = True
+
+                    pattern_to_root_node_getter = get_fusion_pattern_to_root_node_getter(backend_config)
+                    root_node_getter = pattern_to_root_node_getter.get(pattern, _default_root_node_getter)
+                    root_node = root_node_getter(matched_node_pattern)
+                    is_input_node_of_the_pattern = node is root_node
+                    if is_input_node_of_the_pattern:
+                        # this modifies node inplace
+                        _maybe_insert_input_observers_for_node(
+                            node, qconfig, model, named_modules, model.graph,
+                            qhandler,
+                            prepare_custom_config,
+                            obs_or_fq_map,
+                            is_qat,
+                            backend_config)
+
+                        # insert equalization input observers if needed
+                        _maybe_insert_input_equalization_observers_for_node(
+                            node, equalization_qconfig, model, named_modules, model.graph,
+                            is_quantized_branch)
+
+                    is_last_node_of_pattern = node is last_node
+                    input_output_share_observers = node.meta["target_dtype_info"].get("input_output_share_observers", False)
+                    reuse_input_obs_or_fq = node.meta["target_dtype_info"].get("reuse_input_obs_or_fq", False)
+
+                    if is_last_node_of_pattern:
+                        if _is_custom_module_lstm(node, named_modules, qconfig, qhandler):
+                            # Currently custom module outputs are assumed to be already quantized,
+                            # so we need to insert a DeQuantStub after the output. For custom module
+                            # LSTM specifically, the outputs are also a nested tuple, so we must first
+                            # break down the tuple to insert DeQuantStubs after the internal nodes.
+
+                            # TODO: This currently diverges from how custom modules are handled today,
+                            # where we insert observers after the output instead of DeQuantStubs, and
+                            # replace these observers with "dequantize" nodes during convert. Conceptually,
+                            # these output observers are the same as DeQuantStubs. In the future, we
+                            # should resolve this inconsistency by inserting DeQuantStubs for all custom
+                            # modules, not just for LSTM.
+                            _insert_dequant_stubs_for_custom_module_lstm_output(node, model, named_modules, model.graph)
+                            if node.target not in custom_module_names_already_swapped:
+                                custom_module_names_already_swapped.add(node.target)
+                                _swap_custom_module_to_observed(node, qconfig, named_modules, prepare_custom_config)
+                        else:
+                            # this returns the new observer node if it was needed
+                            maybe_output_obs_node = _maybe_insert_output_observer_for_node(
+                                node, model, named_modules, model.graph, obs_or_fq_map, is_qat)
+
+                            if maybe_output_obs_node is not None:
+                                # Update users of original node to use the output observer
+                                # instead. For example, change
+                                #
+                                #           next_node
+                                #          /
+                                #   cur_node -> obs
+                                #
+                                # to
+                                #
+                                #                 next_node
+                                #                 /
+                                #   cur_node -> obs
+                                #
+                                # We need to save orig users before updating uses because
+                                # the list of users will change as we update uses
+                                orig_users = list(node.users.keys())
+                                for user_node in orig_users:
+                                    if user_node is maybe_output_obs_node:
+                                        continue
+                                    user_node.replace_input_with(node, maybe_output_obs_node)
+
+                                _is_observer_in_same_graph_ = _is_observer_in_same_graph(
+                                    node, named_modules, obs_or_fq_map, is_qat)
+
+                                # for ops whose inputs and outputs share observer/fqs, we modify the graph
+                                # to make all inputs and outputs use the first input's
+                                # observer/fq
+                                if (input_output_share_observers and _is_observer_in_same_graph_) or \
+                                        reuse_input_obs_or_fq:
+                                    if not _maybe_make_input_output_share_observers(node, model, named_modules):
+                                        _remove_output_observer(node, model, named_modules)
+
+                                if qhandler is not None and qhandler.is_custom_module():
+                                    if node.target not in custom_module_names_already_swapped:
+                                        custom_module_names_already_swapped.add(node.target)
+                                        _swap_custom_module_to_observed(node, qconfig, named_modules, prepare_custom_config)
+
+                else:  # output
+                    _maybe_insert_observers_before_graph_output(node, model, named_modules, model.graph, obs_or_fq_map, is_qat)
+
+        #
+        # After this point, the current node has input and output observers
+        # that it needs for itself inserted.
+        #
+
+        # increment the counters, so future inputs and outputs are assigned
+        # correct dtypes
+        if node.op == 'placeholder':
+            inputs_seen_counter += 1
+        elif node.op == 'output':
+            outputs_seen_counter += 1
+            results_node = node
+
+    return results_node
+
+def _run_prepare_fx_on_standalone_modules(
+    model: torch.nn.Module,
+    is_qat: bool,
+    named_modules: Dict[str, torch.nn.Module],
+    node_name_to_match_result_with_qconfig: Any,
+    prepare_custom_config: PrepareCustomConfig,
+    backend_config: BackendConfig,
+) -> None:
+    """
+    Runs prepare_fx on each standalone module. Note: this does
+    not modify the graph, it just replaces the unobserved modules with
+    their observed versions.
+    """
+    for (root_node, _, pattern, qhandler, qconfig) in node_name_to_match_result_with_qconfig.values():
+        if qhandler is None:
+            continue
+        elif not qhandler.is_standalone_module():
+            continue
+
+        sm_qconfig_mapping, sm_example_inputs, sm_prepare_custom_config, \
+            sm_backend_config = _get_standalone_module_configs(
+                root_node, named_modules, prepare_custom_config, qconfig, backend_config)
+
+        standalone_module = named_modules[root_node.target]
+        prepare = \
+            torch.ao.quantization.quantize_fx._prepare_standalone_module_fx  # type: ignore[attr-defined]
+        observed_standalone_module = \
+            prepare(
+                standalone_module,
+                sm_qconfig_mapping,
+                is_qat,
+                example_inputs=sm_example_inputs,
+                prepare_custom_config=sm_prepare_custom_config,
+                backend_config=sm_backend_config)
+        parent_name, name = _parent_name(root_node.target)
+        setattr(named_modules[parent_name], name, observed_standalone_module)
+        named_modules[root_node.target] = observed_standalone_module
+
+def _save_state(
+    observed: GraphModule,
+    node_name_to_qconfig: Dict[str, QConfigAny],
+    node_name_to_scope: Dict[str, Tuple[str, type]],
+    prepare_custom_config: PrepareCustomConfig,
+    equalization_node_name_to_qconfig: Dict[str, Any],
+    qconfig_mapping: QConfigMapping,
+    is_qat: bool,
+    observed_node_names: Set[str],
+) -> None:
+    observed.meta["_observed_graph_module_attrs"] = (
+        ObservedGraphModuleAttrs(
+            node_name_to_qconfig=node_name_to_qconfig,
+            node_name_to_scope=node_name_to_scope,
+            prepare_custom_config=prepare_custom_config,
+            equalization_node_name_to_qconfig=equalization_node_name_to_qconfig,
+            qconfig_mapping=qconfig_mapping,
+            is_qat=is_qat,
+            observed_node_names=observed_node_names,
+        )
+    )
+
+def prepare(
+        model: GraphModule,
+        qconfig_mapping: Union[QConfigMapping, Dict[str, Any]],
+        is_qat: bool,
+        node_name_to_scope: Dict[str, Tuple[str, type]],
+        example_inputs: Tuple[Any, ...],
+        prepare_custom_config: Union[PrepareCustomConfig, Dict[str, Any], None] = None,
+        _equalization_config: Union[QConfigMapping, Dict[str, Any], None] = None,
+        backend_config: Union[BackendConfig, Dict[str, Any], None] = None,
+        is_standalone_module: bool = False) -> GraphModule:
+    """ standalone_module means it a submodule that is not inlined in
+    parent module, and will be quantized separately as one unit.
+
+    How the standalone module is observed is specified by `input_quantized_idxs` and
+    `output_quantized_idxs` in the prepare_custom_config for the standalone module
+    Args:
+        node_name_to_scope: mapping from node name to the scope of the module which contains the node.
+        The scope is a tuple of fully qualified path of the module and the type of the module
+    Returns:
+        model(GraphModule): prepared standalone module
+        attributes related to standalone module
+        in model.meta["_observed_graph_module_attrs"]:
+            is_observed_standalone_module (bool): boolean value that shows whether the
+            current model is an observed standalone module or not
+            standalone_module_input_quantized_idxs(List[Int]): a list of
+                indexes for the graph input that is expected to be quantized,
+                same as input_quantized_idxs configuration provided
+                for the standalone module
+            standalone_module_output_quantized_idxs(List[Int]): a list of
+                indexes for the graph output that is quantized,
+                same as the output_quantized_idxs configuration provided
+                for the standalone module
+    """
+    if prepare_custom_config is None:
+        prepare_custom_config = PrepareCustomConfig()
+    if _equalization_config is None:
+        _equalization_config = QConfigMapping()
+
+    if isinstance(qconfig_mapping, Dict):
+        warnings.warn(
+            "Passing a QConfig dictionary to prepare is deprecated and will not be supported "
+            "in a future version. Please pass in a QConfigMapping instead.")
+        qconfig_mapping = QConfigMapping.from_dict(qconfig_mapping)
+
+    if isinstance(_equalization_config, Dict):
+        warnings.warn(
+            "Passing a QConfig dictionary to prepare for equalization is deprecated and will not "
+            "be supported in a future version. Please pass in a QConfigMapping instead.")
+        _equalization_config = QConfigMapping.from_dict(_equalization_config)
+
+    if isinstance(prepare_custom_config, Dict):
+        warnings.warn(
+            "Passing a prepare_custom_config_dict to prepare is deprecated and will not be supported "
+            "in a future version. Please pass in a PrepareCustomConfig instead.")
+        prepare_custom_config = PrepareCustomConfig.from_dict(prepare_custom_config)
+
+    if isinstance(backend_config, Dict):
+        warnings.warn(
+            "Passing a backend_config_dict to prepare is deprecated and will not be supported "
+            "in a future version. Please pass in a BackendConfig instead.")
+        backend_config = BackendConfig.from_dict(backend_config)
+
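+    # Illustrative non-deprecated equivalents of the dict arguments handled above:
+    #   qconfig_mapping = QConfigMapping().set_global(
+    #       torch.ao.quantization.get_default_qconfig("fbgemm"))
+    #   prepare_custom_config = PrepareCustomConfig()
+    #   backend_config = get_native_backend_config()
+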
+    assert isinstance(qconfig_mapping, QConfigMapping)
+    assert isinstance(_equalization_config, QConfigMapping)
+    qconfig_mapping = copy.deepcopy(qconfig_mapping)
+    _equalization_config = copy.deepcopy(_equalization_config)
+
+    # mapping from a tuple of nodes in reverse order to uninitialized
+    #   QuantizeHandler subclass. For example,
+    # {
+    #   # match a single node
+    #   (<pattern for a single node, e.g. torch.nn.Conv2d>:
+    #     <uninitialized QuantizeHandler subclass>),
+    #   # match multiple nodes in reverse order
+    #   ((<pattern for the last node, e.g. torch.nn.ReLU>, <pattern for the preceding node, e.g. torch.nn.Conv2d>):
+    #     <uninitialized QuantizeHandler subclass>),
+    # }
+
+    pattern_to_quantize_handler: Dict[Pattern, QuantizeHandler] = {}
+    if backend_config is None:
+        backend_config = get_native_backend_config()
+    pattern_to_quantize_handler = _get_pattern_to_quantize_handlers(backend_config)
+    pattern_to_quantize_handler = _sorted_patterns_dict(pattern_to_quantize_handler)
+
+    root_node_getter_mapping = \
+        get_fusion_pattern_to_root_node_getter(backend_config)
+
+    _update_qconfig_for_fusion(model, qconfig_mapping)
+    _update_qconfig_for_fusion(model, _equalization_config)
+    flattened_qconfig_dict = _get_flattened_qconfig_dict(qconfig_mapping)
+    # TODO: support regex as well
+    propagate_qconfig_(model, flattened_qconfig_dict, prepare_custom_config.to_dict())
+
+    if is_qat:
+        module_to_qat_module = get_module_to_qat_module(backend_config)
+        _qat_swap_modules(model, module_to_qat_module)
+        _update_qconfig_for_qat(qconfig_mapping, backend_config)
+
+    # mapping from fully qualified module name to module instance
+    # for example,
+    # {
+    #   '': Model(...),
+    #   'linear': Linear(...),
+    #   'linear.weight_fake_quant': PerChannelMinMaxObserver(...),
+    # }
+    named_modules = dict(model.named_modules(remove_duplicate=False))
+
+    # fill node_name_to_qconfig, a map from node name to qconfig, used in _find_matches
+    equalization_node_name_to_qconfig = _generate_node_name_to_qconfig(
+        model, named_modules, model.graph, _equalization_config, node_name_to_scope)
+    node_name_to_qconfig = _generate_node_name_to_qconfig(model, named_modules, model.graph, qconfig_mapping, node_name_to_scope)
+
+    # match the patterns that will get quantized
+    standalone_module_names = list(prepare_custom_config.standalone_module_names.keys())
+    standalone_module_classes = list(prepare_custom_config.standalone_module_classes.keys())
+
+    custom_module_classes = get_custom_module_class_keys(prepare_custom_config.float_to_observed_mapping)
+    matches_without_qconfig = _find_matches(
+        model.graph, named_modules, pattern_to_quantize_handler, root_node_getter_mapping,
+        standalone_module_names, standalone_module_classes, custom_module_classes)
+
+    # map qconfig instances to matches
+    node_name_to_match_result_with_qconfig = {}
+    for node_name, match_without_qconfig in matches_without_qconfig.items():
+        match_with_qconfig = (*match_without_qconfig, node_name_to_qconfig[node_name])
+        node_name_to_match_result_with_qconfig[node_name] = match_with_qconfig
+
+    _run_prepare_fx_on_standalone_modules(
+        model, is_qat, named_modules, node_name_to_match_result_with_qconfig, prepare_custom_config, backend_config)
+
+    # record names for the set of observed nodes, so that in the convert step
+    # we know whether we need to convert a floating point module to a reference
+    # quantized module or not
+    observed_node_names: Set[str] = set()
+
+    result_node = insert_observers_for_model(
+        model,
+        node_name_to_match_result_with_qconfig,
+        node_name_to_qconfig,
+        prepare_custom_config,
+        equalization_node_name_to_qconfig,
+        backend_config,
+        observed_node_names,
+        is_qat,
+    )
+    model = GraphModule(model, model.graph)
+
+    _save_state(model, node_name_to_qconfig, node_name_to_scope,
+                prepare_custom_config, equalization_node_name_to_qconfig,
+                qconfig_mapping, is_qat, observed_node_names)
+
+    if is_standalone_module:
+        assert result_node is not None
+        assert isinstance(result_node.args[0], Node), \
+            "standalone module only supports returning simple value currently"\
+            "(not tuple, dict etc.)"
+        # these inputs are observed in parent
+        # converting List[int] to Tensor since module attribute is
+        # Union[Tensor, Module]
+        input_quantized_idxs: List[int] = prepare_custom_config.input_quantized_indexes
+        output_quantized_idxs: List[int] = prepare_custom_config.output_quantized_indexes
+        observed_graph_module_attrs = model.meta["_observed_graph_module_attrs"]
+        # inplace modification
+        observed_graph_module_attrs.is_observed_standalone_module = True
+        observed_graph_module_attrs.standalone_module_input_quantized_idxs = \
+            input_quantized_idxs
+        observed_graph_module_attrs.standalone_module_output_quantized_idxs = \
+            output_quantized_idxs
+    return model
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/qconfig_mapping_utils.py b/MLPY/Lib/site-packages/torch/ao/quantization/fx/qconfig_mapping_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc428539d8efb0650101339def74b019520890f6
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/fx/qconfig_mapping_utils.py
@@ -0,0 +1,343 @@
+import torch
+import re
+from collections import defaultdict, OrderedDict
+from typing import Callable, Any, Dict, Tuple, Set, List, Union
+from torch.ao.quantization import QConfig
+from torch.ao.quantization.qconfig import _add_module_to_qconfig_obs_ctr, QConfigAny, qconfig_equals
+from torch.ao.quantization.observer import (
+    _is_activation_post_process,
+)
+from torch.ao.quantization.backend_config import (
+    BackendConfig,
+    DTypeConfig,
+)
+from torch.ao.quantization.backend_config.utils import (
+    get_module_to_qat_module,
+)
+
+from torch.fx import (
+    GraphModule,
+)
+from torch.fx.graph import (
+    Graph,
+)
+from torch.ao.nn.intrinsic import _FusedModule
+
+from ..utils import (
+    _parent_name,
+    get_qconfig_dtypes,
+)
+from ..qconfig_mapping import (
+    _OBJECT_TYPE_DICT_KEY,
+    _MODULE_NAME_DICT_KEY,
+    _MODULE_NAME_REGEX_DICT_KEY,
+    QConfigMapping,
+)
+
+__all__: List[str] = []
+
+
+
+def _maybe_adjust_qconfig_for_module_name_object_type_order(
+    qconfig_mapping: QConfigMapping,
+    cur_module_path: str,
+    cur_object_type: Callable,
+    cur_object_type_idx: int,
+    fallback_qconfig: QConfigAny,
+) -> QConfigAny:
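+    # Illustrative example of the configuration this function matches against,
+    # using the public QConfigMapping API (module path and qconfig are placeholders):
+    #   QConfigMapping().set_module_name_object_type_order(
+    #       "blocks.0", torch.nn.functional.linear, 1, custom_qconfig)
+    # gives the second F.linear call inside `blocks.0` its own qconfig.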
+    for (module_name, object_type, index), qconfig in qconfig_mapping.module_name_object_type_order_qconfigs.items():
+        if (
+            (module_name == cur_module_path) and
+            (object_type == cur_object_type) and
+            (index == cur_object_type_idx)
+        ):
+            return qconfig
+    return fallback_qconfig
+
+
+def _update_qconfig_for_fusion(model: GraphModule, qconfig_mapping: QConfigMapping):
+    """
+    Update the QConfigMapping to account for fused modules such as LinearReLU.
+    This assumes the QConfigMapping's attributes have already been converted to OrderedDicts.
+    """
+    object_type_dict = qconfig_mapping.object_type_qconfigs
+    if len(object_type_dict) == 0:
+        return qconfig_mapping
+
+    modules = dict(model.named_modules())
+
+    for node in model.graph.nodes:
+        if node.op == 'call_module' and node.target in modules:
+            maybe_fused_module = modules[str(node.target)]
+            if not isinstance(maybe_fused_module, _FusedModule):
+                continue
+
+            ops = list(maybe_fused_module._modules.values())
+            fused_qconfig = object_type_dict.get(type(ops[0]), None)
+
+            # Raise an error if the modules in the fused module have
+            # different qconfigs specified in the qconfig_dict
+            # TODO: currently it only works for modules,
+            # need to make this work for torch.nn.functional.relu
+            # TODO: currently it only works for object_type configurations,
+            # ideally it should work for different types of configurations,
+            # maybe we want to redesign this part
+            for op in ops[1:]:
+                if not qconfig_equals(object_type_dict.get(type(op), None), fused_qconfig):
+                    raise LookupError(
+                        "During fusion, we need to specify the same " +
+                        f"qconfigs for all module types in {type(maybe_fused_module)}; " +
+                        f"offending type: {type(op)}")
+
+            if fused_qconfig is not None:
+                object_type_dict[type(maybe_fused_module)] = fused_qconfig
+
+def _generate_node_name_to_qconfig(
+        root: torch.nn.Module,
+        modules: Dict[str, torch.nn.Module],
+        input_graph: Graph,
+        qconfig_mapping: QConfigMapping,
+        node_name_to_scope: Dict[str, Tuple[str, type]]) -> Dict[str, QConfigAny]:
+    global_qconfig = qconfig_mapping.global_qconfig
+    node_name_to_qconfig = {}
+
+    # example:
+    #
+    #   {'foo.bar': {F.linear: 0, F.conv2d: 1, ...}, ...}
+    #
+    # meaning in submodule 'foo.bar', we have seen 0 F.linear and
+    # 1 F.conv2d invocations so far.
+    submodule_to_object_type_to_cur_idx: Dict[str, Dict[Callable, int]] = \
+        defaultdict(lambda: defaultdict(int))
+    for node in input_graph.nodes:
+        qconfig = None
+        if node.op == "get_attr":
+            module_name, _ = _parent_name(node.target)
+            qconfig = _maybe_adjust_qconfig_for_module_type_or_name(
+                qconfig_mapping, type(modules[module_name]), module_name, global_qconfig)
+            qconfig_with_device_check = _add_module_to_qconfig_obs_ctr(qconfig, modules.get(node.target, None))
+        elif node.op == "call_function":
+            # precedence: module_name_qconfig
+            # > function_qconfig > global_qconfig
+            # module_name takes precedence over function qconfig
+            function_qconfig = _get_object_type_qconfig(
+                qconfig_mapping, node.target, global_qconfig)
+            module_path, module_type = node_name_to_scope[node.name]
+            qconfig = _maybe_adjust_qconfig_for_module_type_or_name(
+                qconfig_mapping, module_type, module_path, function_qconfig)
+
+            cur_object_type_idx = \
+                submodule_to_object_type_to_cur_idx[module_path][node.target]
+            submodule_to_object_type_to_cur_idx[module_path][node.target] += 1
+            qconfig = _maybe_adjust_qconfig_for_module_name_object_type_order(
+                qconfig_mapping, module_path, node.target, cur_object_type_idx, qconfig)
+            qconfig_with_device_check = _add_module_to_qconfig_obs_ctr(qconfig, modules.get(node.target, None))
+
+        elif node.op == "call_method":
+            module_path, module_type = node_name_to_scope[node.name]
+            # first use node.target (string) to get the qconfig
+            # this is to support configs like
+            # "object_type": [("reshape", qconfig)]
+            qconfig = _maybe_adjust_qconfig_for_module_type_or_name(
+                qconfig_mapping, node.target, module_path, global_qconfig)
+            # if there is no special config for the method, we'll fall back to the
+            # config for the module that contains the call_method node
+            qconfig = _maybe_adjust_qconfig_for_module_type_or_name(
+                qconfig_mapping, module_type, module_path, qconfig)
+            # currently call_method does not support modifying qconfig
+            # by order, we can add this later if it is needed.
+            qconfig_with_device_check = _add_module_to_qconfig_obs_ctr(qconfig, modules.get(node.target, None))
+
+        elif node.op == 'call_module':
+            # if the node is an observer, just continue - don't add it to the qconfig_map
+            if _is_activation_post_process(modules[node.target]):
+                continue
+            qconfig = _maybe_adjust_qconfig_for_module_type_or_name(
+                qconfig_mapping, type(modules[node.target]), node.target, global_qconfig)
+
+            module_path, module_type = node_name_to_scope[node.name]
+            # Note: for call_module, the module_path is the current module's name.
+            # To meaningfully count invocations, we need to count them in the parent
+            # module.
+            parent_name, _ = _parent_name(module_path)
+            cur_object_type_idx = \
+                submodule_to_object_type_to_cur_idx[parent_name][module_type]
+            submodule_to_object_type_to_cur_idx[parent_name][module_type] += 1
+            qconfig = _maybe_adjust_qconfig_for_module_name_object_type_order(
+                qconfig_mapping, parent_name, module_type, cur_object_type_idx,
+                qconfig)
+            qconfig_with_device_check = _add_module_to_qconfig_obs_ctr(qconfig, modules.get(node.target, None))
+
+            # regex is not supported in eager mode propagate_qconfig_, so we
+            # need to set the qconfig explicitly here in case a regex
+            # is used
+            modules[node.target].qconfig = qconfig_with_device_check
+        else:
+            qconfig_with_device_check = None
+
+        node_name_to_qconfig[node.name] = qconfig_with_device_check
+    return node_name_to_qconfig
+
+
+def _check_is_valid_config_dict(config_dict: Any, allowed_keys: Set[str], dict_name: str) -> None:
+    r""" Checks that the given config_dict only contains keys from `allowed_keys`
+
+    Args:
+      `config_dict`: dictionary whose keys we want to check
+      `allowed_keys`: set of keys that are allowed in `config_dict`
+      `dict_name`: name of the dictionary, used in the error message
+    """
+
+    for k in config_dict.keys():
+        if k not in allowed_keys:
+            raise ValueError(
+                'Expected ' + dict_name + ' to have the following keys: ' +
+                str(allowed_keys) + '. But found \'' + k +
+                '\' instead.')
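+
+# Hedged usage sketch of the check above (the key names below are made up for
+# illustration only): any key outside of `allowed_keys` raises a ValueError
+# naming the offending key.
+# >> _check_is_valid_config_dict({"": None, "bad_key": None}, {""}, "qconfig_dict")
+# ValueError: Expected qconfig_dict to have the following keys: ...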
+
+
+def _compare_prepare_convert_qconfig_mappings(
+        prepare_qconfig_mapping: QConfigMapping,
+        convert_qconfig_mapping: QConfigMapping):
+    r""" Compare the qconfig_mapping passed in convert to the one from prepare and check the values
+
+    Args:
+      `prepare_qconfig_mapping`: configuration for prepare quantization step
+      `convert_qconfig_mapping`: configuration for convert quantization step
+    """
+    assert qconfig_equals(prepare_qconfig_mapping.global_qconfig, convert_qconfig_mapping.global_qconfig), \
+        "Expected global qconfigs to be the same in the prepare and convert quantization configs"
+    prepare_dicts: List[OrderedDict] = [
+        prepare_qconfig_mapping.object_type_qconfigs,
+        prepare_qconfig_mapping.module_name_qconfigs,
+        prepare_qconfig_mapping.module_name_regex_qconfigs,
+    ]
+    convert_dicts: List[OrderedDict] = [
+        convert_qconfig_mapping.object_type_qconfigs,
+        convert_qconfig_mapping.module_name_qconfigs,
+        convert_qconfig_mapping.module_name_regex_qconfigs,
+    ]
+    dict_names = [_OBJECT_TYPE_DICT_KEY, _MODULE_NAME_DICT_KEY, _MODULE_NAME_REGEX_DICT_KEY]
+    for i in range(len(prepare_dicts)):
+        for name in prepare_dicts[i].keys():
+            assert name in convert_dicts[i], f"Missing key {dict_names[i]} {name} in convert QConfigMapping \
+                when it was present in prepare"
+            assert convert_dicts[i][name] is None \
+                or qconfig_equals(prepare_dicts[i][name], convert_dicts[i][name]), \
+                f"Expected convert QConfigMapping to have the same qconfig as prepare for key {dict_names[i]} {name}; \
+                prepare: {prepare_dicts[i][name]}; convert: {convert_dicts[i][name]}"
+
+def _is_qconfig_supported_by_dtype_configs(qconfig: QConfig, dtype_configs: List[DTypeConfig]):
+    for dtype_config in dtype_configs:
+        is_dynamic = dtype_config.is_dynamic
+        if is_dynamic is None:
+            is_dynamic = False
+        input_dtype = dtype_config.input_dtype or torch.float
+        weight_dtype = dtype_config.weight_dtype or torch.float
+        bias_dtype = dtype_config.bias_dtype or torch.float
+        output_dtype = dtype_config.output_dtype or torch.float
+        qconfig_activation_dtype, qconfig_weight_dtype, qconfig_input_act_is_dynamic = \
+            get_qconfig_dtypes(qconfig)
+        qconfig_bias_dtype = torch.float16 \
+            if (
+                qconfig_activation_dtype == torch.float16
+                and qconfig_weight_dtype == torch.float16
+                and not is_dynamic
+            ) else torch.float
+
+        if is_dynamic:
+            is_match = qconfig_input_act_is_dynamic and \
+                input_dtype == qconfig_activation_dtype and \
+                output_dtype == torch.float and \
+                weight_dtype == qconfig_weight_dtype
+        else:
+            is_match = input_dtype == qconfig_activation_dtype and \
+                output_dtype == qconfig_activation_dtype and \
+                weight_dtype == qconfig_weight_dtype and \
+                bias_dtype == qconfig_bias_dtype
+        if is_match:
+            return True
+    return False
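+
+# A minimal sketch of how the check above can be exercised, assuming the default
+# fbgemm qconfig (static quint8 activation / qint8 weight); the DTypeConfig below
+# is an illustrative backend entry, not one taken from this file:
+# >> dtype_config = DTypeConfig(input_dtype=torch.quint8, output_dtype=torch.quint8,
+# ..                            weight_dtype=torch.qint8, bias_dtype=torch.float)
+# >> qconfig = torch.ao.quantization.get_default_qconfig("fbgemm")
+# >> _is_qconfig_supported_by_dtype_configs(qconfig, [dtype_config])  # expected True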
+
+def _get_object_type_qconfig(
+        qconfig_mapping: QConfigMapping,
+        object_type: Union[Callable, str],
+        fallback_qconfig: QConfigAny) -> QConfigAny:
+    return qconfig_mapping.object_type_qconfigs.get(object_type, fallback_qconfig)
+
+
+def _get_module_name_regex_qconfig(qconfig_mapping, module_name, fallback_qconfig):
+    for regex_pattern, qconfig in qconfig_mapping.module_name_regex_qconfigs.items():
+        if re.match(regex_pattern, module_name):
+            # first match wins
+            return qconfig
+    return fallback_qconfig
+
+
+def _get_module_name_qconfig(qconfig_mapping, module_name, fallback_qconfig):
+    if module_name == '':
+        # module name qconfig not found
+        return fallback_qconfig
+    if module_name in qconfig_mapping.module_name_qconfigs:
+        return qconfig_mapping.module_name_qconfigs[module_name]
+    else:
+        parent, _ = _parent_name(module_name)
+        return _get_module_name_qconfig(qconfig_mapping, parent, fallback_qconfig)
+
+
+def _maybe_adjust_qconfig_for_module_type_or_name(qconfig_mapping, module_type, module_name, global_qconfig):
+    # get qconfig for module_name,
+    # fallback to module_name_regex_qconfig, module_type_qconfig,
+    # global_qconfig if necessary
+    module_type_qconfig = _get_object_type_qconfig(
+        qconfig_mapping, module_type, global_qconfig)
+    module_name_regex_qconfig = _get_module_name_regex_qconfig(
+        qconfig_mapping, module_name, module_type_qconfig)
+    module_name_qconfig = _get_module_name_qconfig(
+        qconfig_mapping, module_name, module_name_regex_qconfig)
+    return module_name_qconfig
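+
+# Hedged illustration of the precedence implemented above
+# (module_name > module_name_regex > object_type > global); the module names and
+# qconfig values are hypothetical:
+# >> qconfig_mapping = (QConfigMapping()
+# ..     .set_global(global_qconfig)
+# ..     .set_object_type(torch.nn.Linear, linear_qconfig)
+# ..     .set_module_name("sub.fc", fc_qconfig))
+# >> _maybe_adjust_qconfig_for_module_type_or_name(
+# ..     qconfig_mapping, torch.nn.Linear, "sub.fc", global_qconfig)    # -> fc_qconfig
+# >> _maybe_adjust_qconfig_for_module_type_or_name(
+# ..     qconfig_mapping, torch.nn.Linear, "other.fc", global_qconfig)  # -> linear_qconfig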
+
+
+def _get_flattened_qconfig_dict(qconfig_mapping: QConfigMapping) -> Dict[Union[Callable, str], QConfigAny]:
+    """ flatten the global, object_type and module_name qconfig
+    to the same qconfig_dict so that it can be used by
+    propagate_qconfig_ function.
+    "module_name_regex" is ignored for now since it's not supported
+    in propagate_qconfig_, but it can be fixed later.
+
+    For example:
+    Input: {
+      "": qconfig,
+      "object_type": [
+        (torch.add, qconfig)
+      ],
+      "module_name": [
+        ("conv", qconfig)
+      ]
+    }
+
+    Output: {
+      "": qconfig,
+      torch.add: qconfig,
+      "conv": qconfig
+    }
+    """
+    flattened: Dict[Union[Callable, str], QConfigAny] = {"": qconfig_mapping.global_qconfig}
+    for obj, qconfig in qconfig_mapping.object_type_qconfigs.items():
+        flattened[obj] = qconfig
+    for obj, qconfig in qconfig_mapping.module_name_qconfigs.items():
+        flattened[obj] = qconfig
+    return flattened
+
+
+def _update_qconfig_for_qat(
+        qconfig_mapping: QConfigMapping,
+        backend_config: BackendConfig):
+    """
+    Update the qconfig_mapping to account for module swaps during QAT.
+    During QAT, the nn.Module types are swapped for their corresponding nn.qat.modules types.
+    """
+    module_to_qat_module_class = get_module_to_qat_module(backend_config)
+    object_type_dict = qconfig_mapping.object_type_qconfigs
+    new_object_type_dict = object_type_dict.copy()
+    for k, v in new_object_type_dict.items():
+        if k in module_to_qat_module_class:
+            object_type_dict[module_to_qat_module_class[k]] = v
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/quantize_handler.py b/MLPY/Lib/site-packages/torch/ao/quantization/fx/quantize_handler.py
new file mode 100644
index 0000000000000000000000000000000000000000..7bda8e0210590306933d20680530ec9911c0537b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/fx/quantize_handler.py
@@ -0,0 +1,197 @@
+from abc import ABC
+from typing import Callable, Dict, List, Optional, Type
+
+import torch
+
+from torch.ao.quantization.backend_config import (
+    BackendConfig,
+    DTypeConfig,
+    ObservationType,
+)
+from torch.ao.quantization.utils import NodePattern, Pattern, QuantizerCls
+from torch.fx.graph import Node
+
+from .utils import all_node_args_have_no_tensors
+
+
+__all__ = [
+    "QuantizeHandler",
+    "BinaryOpQuantizeHandler",
+    "CatQuantizeHandler",
+    "ConvReluQuantizeHandler",
+    "LinearReLUQuantizeHandler",
+    "BatchNormQuantizeHandler",
+    "EmbeddingQuantizeHandler",
+    "RNNDynamicQuantizeHandler",
+    "DefaultNodeQuantizeHandler",
+    "FixedQParamsOpQuantizeHandler",
+    "CopyNodeQuantizeHandler",
+    "GeneralTensorShapeOpQuantizeHandler",
+    "CustomModuleQuantizeHandler",
+    "StandaloneModuleQuantizeHandler",
+]
+
+def _default_root_node_getter(node_pattern):
+    if node_pattern is None:
+        return node_pattern
+    while not isinstance(node_pattern, Node):
+        node_pattern = node_pattern[-1]
+    return node_pattern
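+
+# e.g. for a nested node pattern such as (relu_node, (bn_node, conv_node)),
+# the getter above repeatedly takes the last element and returns conv_node
+# as the root node.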
+
+# Base Pattern Handler
+class QuantizeHandler(ABC):  # noqa: B024
+    """ Base handler class for the quantizer patterns
+    """
+    def __init__(
+            self,
+            node_pattern: NodePattern,
+            modules: Dict[str, torch.nn.Module],
+            root_node_getter: Optional[Callable] = None,
+            is_custom_module=False,
+            is_standalone_module=False):
+        """ Records pattern information in __init__, which will be used
+        in convert
+        """
+        self.node_pattern = node_pattern
+        self.modules = modules
+        if root_node_getter is None:
+            root_node_getter = _default_root_node_getter
+        self.root_node = root_node_getter(node_pattern)
+        self.is_custom_module_ = is_custom_module
+        self.is_standalone_module_ = is_standalone_module
+        self.num_tensor_args = 0
+        # determine how many of the first two args are Tensors (versus scalars)
+        # this distinguishes things like "x + y" from "x + 2" or "2 + x"
+        if isinstance(self.root_node, Node):
+            cache_for_no_tensor_check: Dict[Node, bool] = {}
+            for arg_idx in range(len(self.root_node.args)):
+                arg = self.root_node.args[arg_idx]
+                if isinstance(arg, Node) and (
+                        not all_node_args_have_no_tensors(
+                            arg, self.modules, cache_for_no_tensor_check)):
+                    self.num_tensor_args += 1
+
+    def is_general_tensor_value_op(self) -> bool:
+        """
+        Returns True if the operator works for both floating point and
+        quantized input, and either does some computation based on the input
+        Tensor or only re-arranges the Tensor values / queries some metadata
+        about the Tensor.
+
+        For such ops we need to insert an observer/fake_quant for the output of
+        the operator (the same observer instance as the input), since the
+        distribution of values can differ between the input and output Tensors
+        (relevant for HistogramObserver) while they share the same quantization
+        parameters.
+
+        Example operators: avgpool2d, reshape, transpose, maxpool2d
+        Example observed operator:
+        observer_0 - avgpool2d - observer_0 (same observer instance as input)
+        """
+        return False
+
+    def is_custom_module(self):
+        return self.is_custom_module_
+
+    def is_standalone_module(self):
+        return self.is_standalone_module_
+
+def _get_quantize_handler_cls(
+        observation_type: ObservationType,
+        dtype_configs: List[DTypeConfig],
+        num_tensor_args_to_observation_type: Dict[int, ObservationType]) -> Type[QuantizeHandler]:
+    """
+    Return a configurable QuantizeHandler that matches the given specifications from the backend.
+    """
+
+    class ConfigurableQuantizeHandler(QuantizeHandler):
+        def __init__(
+                self,
+                node_pattern: NodePattern,
+                modules: Dict[str, torch.nn.Module],
+                root_node_getter: Optional[Callable] = None):
+            super().__init__(node_pattern, modules, root_node_getter)
+            if num_tensor_args_to_observation_type:
+                assert self.num_tensor_args in num_tensor_args_to_observation_type, \
+                    f"Must provide observation_type config for tensor number {self.num_tensor_args}" \
+                    f" in num_tensor_args_to_observation_type for {node_pattern}"
+                self.observation_type = num_tensor_args_to_observation_type[self.num_tensor_args]
+            else:
+                self.observation_type = observation_type
+            self.dtype_configs = dtype_configs
+
+        def is_general_tensor_value_op(self) -> bool:
+            return self.observation_type == ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT
+
+    return ConfigurableQuantizeHandler
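+
+# Hedged sketch of using the factory above; the observation type, the empty
+# dtype_configs list, and the `node_pattern`/`named_modules` variables are
+# assumptions for illustration only:
+# >> handler_cls = _get_quantize_handler_cls(
+# ..     ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT, [], {})
+# >> handler = handler_cls(node_pattern, named_modules)
+# >> handler.is_general_tensor_value_op()  # False for this observation type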
+
+def _get_pattern_to_quantize_handlers(backend_config: BackendConfig) -> Dict[Pattern, QuantizerCls]:
+    """
+    Note: QuantizeHandler is just a holder for some check methods like
+    `should_insert_observer_for_output`; maybe this can be an enum as well.
+    We can refactor this after the fbgemm/qnnpack path is fully converted to the
+    new path. This is not exposed to backend developers.
+    """
+    pattern_to_quantize_handlers = {}
+    for pattern, config in backend_config._pattern_complex_format_to_config.items():
+        observation_type = config.observation_type
+        dtype_configs = config.dtype_configs
+        num_tensor_args_to_observation_type = config._num_tensor_args_to_observation_type
+        pattern_to_quantize_handlers[pattern] = \
+            _get_quantize_handler_cls(
+                observation_type,
+                dtype_configs,
+                num_tensor_args_to_observation_type)
+    return pattern_to_quantize_handlers
+
+# TODO: remove this class, this is still exposed in torch.ao.quantization
+# but we should be able to break bc
+class BinaryOpQuantizeHandler(QuantizeHandler):
+    pass
+
+class CatQuantizeHandler(QuantizeHandler):
+    pass
+
+# TODO: remove this class
+class ConvReluQuantizeHandler(QuantizeHandler):
+    pass
+
+# TODO: remove this class
+class LinearReLUQuantizeHandler(QuantizeHandler):
+    pass
+
+# TODO: remove this class
+class BatchNormQuantizeHandler(QuantizeHandler):
+    pass
+
+# TODO: remove this class
+class EmbeddingQuantizeHandler(QuantizeHandler):
+    pass
+
+# TODO: remove this class
+class RNNDynamicQuantizeHandler(QuantizeHandler):
+    pass
+
+# TODO: remove this class
+class DefaultNodeQuantizeHandler(QuantizeHandler):
+    """ Common quantized op, first input and first output will be quantized
+    """
+    pass
+
+# TODO: remove this class
+class FixedQParamsOpQuantizeHandler(QuantizeHandler):
+    pass
+
+# TODO: remove
+class CopyNodeQuantizeHandler(QuantizeHandler):
+    pass
+
+# TODO: remove
+class GeneralTensorShapeOpQuantizeHandler(QuantizeHandler):
+    pass
+
+# TODO: not used, can be removed after torch.ao.quantization namespace is deprecated
+class CustomModuleQuantizeHandler(QuantizeHandler):
+    pass
+
+# TODO: not used, can be removed after torch.ao.quantization namespace is deprecated
+class StandaloneModuleQuantizeHandler(QuantizeHandler):
+    pass
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/tracer.py b/MLPY/Lib/site-packages/torch/ao/quantization/fx/tracer.py
new file mode 100644
index 0000000000000000000000000000000000000000..914779749d4b5af6426e0890309908e7141ba050
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/fx/tracer.py
@@ -0,0 +1,45 @@
+import torch
+from torch.fx._symbolic_trace import Tracer
+from torch.fx.proxy import Scope
+from torch.ao.nn.intrinsic import _FusedModule
+from typing import List, Callable
+
+__all__ = [
+    "QuantizationTracer",
+]
+
+class ScopeContextManager(torch.fx.proxy.ScopeContextManager):
+    def __init__(
+        self,
+        scope: Scope,
+        current_module: torch.nn.Module,
+        current_module_path: str
+    ):
+        super().__init__(scope, Scope(current_module_path, type(current_module)))
+
+
+class QuantizationTracer(Tracer):
+    def __init__(
+        self, skipped_module_names: List[str], skipped_module_classes: List[Callable]
+    ):
+        super().__init__()
+        self.skipped_module_names = skipped_module_names
+        self.skipped_module_classes = skipped_module_classes
+        # NB: initialize the module_type of the top level module to None.
+        # We are assuming people won't configure the model with the type of the
+        # top level module here, since people can use "" for a global config.
+        # We can change this if there is a use case that configures
+        # qconfig using the top level module type.
+        self.scope = Scope("", None)
+        self.record_stack_traces = True
+
+    def is_leaf_module(self, m: torch.nn.Module, module_qualified_name: str) -> bool:
+        return (
+            (
+                (m.__module__.startswith("torch.nn") or m.__module__.startswith("torch.ao.nn"))
+                and not isinstance(m, torch.nn.Sequential)
+            )
+            or module_qualified_name in self.skipped_module_names
+            or type(m) in self.skipped_module_classes
+            or isinstance(m, _FusedModule)
+        )
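+
+# e.g. with the leaf check above, a stock torch.nn.Linear submodule is treated as
+# a leaf (not traced into), while a user-defined nn.Module subclass is traced
+# through, unless its name or type was explicitly skipped or it is a fused module.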
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/fx/utils.py b/MLPY/Lib/site-packages/torch/ao/quantization/fx/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..87a8d7b6a933bdd0dc3eb805ec477b59980c5a9a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/fx/utils.py
@@ -0,0 +1,885 @@
+import copy
+import torch
+import torch.nn as nn
+from torch.ao.quantization import (
+    QConfigAny,
+    QuantType,
+)
+from torch.ao.quantization.backend_config import (
+    DTypeWithConstraints,
+)
+from torch.ao.quantization.fake_quantize import (
+    FakeQuantizeBase,
+    FixedQParamsFakeQuantize,
+)
+from torch.ao.quantization.observer import (
+    FixedQParamsObserver,
+    ObserverBase,
+)
+from torch.ao.quantization.qconfig import (
+    float16_static_qconfig,
+    float16_dynamic_qconfig,
+    qconfig_equals,
+)
+from torch.ao.quantization.stubs import DeQuantStub
+from torch.ao.quantization.utils import (
+    activation_is_statically_quantized,
+)
+from torch.ao.quantization.observer import _is_activation_post_process
+from torch.ao.quantization.qconfig_mapping import QConfigMapping
+
+from torch.fx import GraphModule, map_arg
+
+from torch.fx.graph import (
+    Graph,
+    Node,
+)
+from .custom_config import PrepareCustomConfig
+# importing the lib so that the quantized_decomposed ops are registered
+from ._decomposed import quantized_decomposed_lib  # noqa: F401
+
+from typing import Callable, Optional, List, Dict, Any, Set, Tuple, Union, Type
+from dataclasses import dataclass
+from collections import namedtuple
+import operator
+import warnings
+
+# TODO: revisit this list. Many helper methods shouldn't be public
+__all__ = [
+    "all_node_args_except_first",
+    "all_node_args_have_no_tensors",
+    "assert_and_get_unique_device",
+    "collect_producer_nodes",
+    "create_getattr_from_value",
+    "create_node_from_old_node_preserve_meta",
+    "EMPTY_ARG_DICT",
+    "get_custom_module_class_keys",
+    "get_linear_prepack_op_for_dtype",
+    "get_new_attr_name_with_prefix",
+    "get_non_observable_arg_indexes_and_types",
+    "get_qconv_prepack_op",
+    "get_skipped_module_name_and_classes",
+    "graph_module_from_producer_nodes",
+    "maybe_get_next_module",
+    "NodeInfo",
+    "node_arg_is_bias",
+    "node_arg_is_weight",
+    "NON_OBSERVABLE_ARG_DICT",
+    "NON_QUANTIZABLE_WEIGHT_OPS",
+    "return_arg_list",
+    "ObservedGraphModuleAttrs",
+]
+
+NON_QUANTIZABLE_WEIGHT_OPS = {torch.nn.functional.layer_norm, torch.nn.functional.group_norm, torch.nn.functional.instance_norm}
+
+@dataclass
+class ObservedGraphModuleAttrs:
+    node_name_to_qconfig: Dict[str, QConfigAny]
+    node_name_to_scope: Dict[str, Tuple[str, type]]
+    prepare_custom_config: PrepareCustomConfig
+    equalization_node_name_to_qconfig: Dict[str, Any]
+    qconfig_mapping: QConfigMapping
+    is_qat: bool
+    observed_node_names: Set[str]
+    is_observed_standalone_module: bool = False
+    standalone_module_input_quantized_idxs: Optional[List[int]] = None
+    standalone_module_output_quantized_idxs: Optional[List[int]] = None
+
+def node_arg_is_weight(node: Node, arg: Any) -> bool:
+    """Returns if node arg is weight"""
+    weight_index = None
+    if "target_dtype_info" in node.meta:
+        weight_index = node.meta["target_dtype_info"].get("weight_index", None)
+    if weight_index is not None and weight_index < len(node.args) and node.args[weight_index] is arg:
+        return True
+    return node.kwargs.get("weight") is arg
+
+def node_arg_is_bias(node: Node, arg: Any) -> bool:
+    """Returns if node arg is bias"""
+    bias_index = None
+    if "target_dtype_info" in node.meta:
+        bias_index = node.meta["target_dtype_info"].get("bias_index", None)
+    if bias_index is not None and bias_index < len(node.args) and node.args[bias_index] is arg:
+        return True
+    return node.kwargs.get("bias") is arg
+
+def get_custom_module_class_keys(custom_module_mapping: Dict[QuantType, Dict[Type, Type]]) -> List[Any]:
+    r""" Get all the unique custom module keys in the custom config dict
+    e.g.
+    Input:
+    {
+        QuantType.STATIC: {
+            CustomModule1: ObservedCustomModule
+        },
+        QuantType.DYNAMIC: {
+            CustomModule2: DynamicObservedCustomModule
+        },
+        QuantType.WEIGHT_ONLY: {
+            CustomModule3: WeightOnlyObservedCustomModule
+        },
+    }
+
+    Output:
+    # extract the keys across all inner STATIC, DYNAMIC, and WEIGHT_ONLY dicts
+    [CustomModule1, CustomModule2, CustomModule3]
+    """
+    # using set to dedup
+    float_custom_module_classes : Set[Any] = set()
+    for quant_mode in [QuantType.STATIC, QuantType.DYNAMIC, QuantType.WEIGHT_ONLY]:
+        quant_mode_custom_module_config = custom_module_mapping.get(quant_mode, {})
+        quant_mode_custom_module_classes = set(quant_mode_custom_module_config.keys())
+        float_custom_module_classes |= quant_mode_custom_module_classes
+    return list(float_custom_module_classes)
+
+def get_linear_prepack_op_for_dtype(dtype):
+    if dtype == torch.float16:
+        return torch.ops.quantized.linear_prepack_fp16
+    elif dtype == torch.qint8:
+        return torch.ops.quantized.linear_prepack
+    else:
+        raise Exception("can't get linear prepack op for dtype:", dtype)
+
+def get_qconv_prepack_op(conv_op: Callable) -> Callable:
+    prepack_ops = {
+        torch.nn.functional.conv1d: torch.ops.quantized.conv1d_prepack,
+        torch.nn.functional.conv2d: torch.ops.quantized.conv2d_prepack,
+        torch.nn.functional.conv3d: torch.ops.quantized.conv3d_prepack,
+        torch.nn.functional.conv_transpose1d: torch.ops.quantized.conv_transpose1d_prepack,
+        torch.nn.functional.conv_transpose2d: torch.ops.quantized.conv_transpose2d_prepack,
+        torch.nn.functional.conv_transpose3d: torch.ops.quantized.conv_transpose3d_prepack,
+    }
+    prepack_op = prepack_ops.get(conv_op, None)
+    assert prepack_op, f"Didn't find prepack op for {conv_op}"
+    return prepack_op
+
+# Returns a function that can get a new attribute name for module with given
+# prefix, for example,
+# >> get_new_observer_name = get_new_attr_name_with_prefix('_observer')
+# >> new_name = get_new_observer_name(module)
+# new_name will be an unused attribute name on module, e.g. `_observer_1`
+def get_new_attr_name_with_prefix(prefix: str) -> Callable:
+    prefix = prefix.replace(".", "_")
+
+    def get_new_attr_name(module: torch.nn.Module):
+        def get_attr_name(i: int):
+            return prefix + str(i)
+        i = 0
+        attr_name = get_attr_name(i)
+        while hasattr(module, attr_name):
+            i += 1
+            attr_name = get_attr_name(i)
+        return attr_name
+    return get_new_attr_name
+
+def collect_producer_nodes(node: Node) -> Optional[List[Node]]:
+    r''' Starting from a target node, trace back until we hit an input or
+    getattr node. This is used to extract the chain of operators
+    starting from getattr to the target node, for example
+    def forward(self, x):
+      observed = self.observer(self.weight)
+      return F.linear(x, observed)
+    collect_producer_nodes(observed) will either return a list of nodes that
+    produce the observed node or None if we can't extract a self-contained
+    graph without free variables (inputs of the forward function).
+    '''
+    nodes = [node]
+    frontier = [node]
+    while frontier:
+        node = frontier.pop()
+        all_args = list(node.args) + list(node.kwargs.values())
+        for arg in all_args:
+            if not isinstance(arg, Node):
+                continue
+            if arg.op == 'placeholder':
+                # hit input, can't fold in this case
+                return None
+            nodes.append(arg)
+            if not (arg.op == 'call_function' and arg.target == getattr):
+                frontier.append(arg)
+    return nodes
+
+def graph_module_from_producer_nodes(
+        root: GraphModule, producer_nodes: List[Node]) -> GraphModule:
+    r''' Construct a graph module from extracted producer nodes
+    from `collect_producer_nodes` function
+    Args:
+      root: the root module for the original graph
+      producer_nodes: a list of nodes we use to construct the graph
+    Return:
+      A graph module constructed from the producer nodes
+    '''
+    assert len(producer_nodes) > 0, 'list of producer nodes can not be empty'
+    # since we traced back from node to getattr
+    producer_nodes.reverse()
+    graph = Graph()
+    env: Dict[Any, Any] = {}
+
+    def load_arg(a):
+        return map_arg(a, lambda node: env[node])
+    for producer_node in producer_nodes:
+        env[producer_node] = graph.node_copy(producer_node, load_arg)
+    graph.output(load_arg(producer_nodes[-1]))
+    graph_module = GraphModule(root, graph)
+    return graph_module
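+
+# Hedged usage sketch combining the two helpers above; `weight_node` and
+# `root_module` are hypothetical names:
+# >> producer_nodes = collect_producer_nodes(weight_node)
+# >> if producer_nodes is not None:
+# ..     weight_gm = graph_module_from_producer_nodes(root_module, producer_nodes)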
+
+def assert_and_get_unique_device(module: torch.nn.Module) -> Any:
+    """
+    Returns the unique device for a module, or None if no device is found.
+    Throws an error if multiple devices are detected.
+    """
+    devices = {p.device for p in module.parameters()} | \
+        {p.device for p in module.buffers()}
+    # As a temporary workaround for the AIMP HHC publish we added this CPU check.
+    # Remove it later. T163614564
+    if {torch.device("cpu"), torch.device("meta")} == devices:
+        warnings.warn("Both 'meta' and 'cpu' are present in the list of devices. Module can have one device. We select 'cpu'.")
+        devices = {torch.device("cpu")}
+    assert len(devices) <= 1, (
+        "prepare only works with cpu or single-device CUDA modules, "
+        f"but got devices {devices}"
+    )
+    device = next(iter(devices)) if len(devices) > 0 else None
+    return device
+
+def create_getattr_from_value(module: torch.nn.Module, graph: Graph, prefix: str, value: Any) -> Node:
+    """
+    Given a value of any type, creates a getattr node corresponding to the value and
+    registers the value as a buffer to the module.
+    """
+    get_new_attr_name = get_new_attr_name_with_prefix(prefix)
+    attr_name = get_new_attr_name(module)
+    device = assert_and_get_unique_device(module)
+    new_value = value.clone().detach() if isinstance(value, torch.Tensor) \
+        else torch.tensor(value, device=device)
+    module.register_buffer(attr_name, new_value)
+    # Create get_attr with value
+    attr_node = graph.create_node("get_attr", attr_name)
+    return attr_node
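+
+# e.g. create_getattr_from_value(model, model.graph, "_scale_", 0.1) registers a
+# fresh buffer (such as `_scale_0`) on `model` and returns the corresponding
+# get_attr node; `model` is assumed to be a GraphModule here.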
+
+def all_node_args_have_no_tensors(node: Node, modules: Dict[str, torch.nn.Module], cache: Dict[Node, bool]) -> bool:
+    """
+    If we know for sure that all of this node's args have no
+    tensors (are primitives), return True.  If we either
+    find a tensor or are not sure, return False. Note: this
+    function is not exact.
+    """
+    if cache and node in cache:
+        return cache[node]
+
+    result = False  # will be overwritten
+    if not isinstance(node, Node):
+        result = True
+    elif node.op == 'placeholder':
+        result = False
+    elif node.op == 'call_module':
+        assert isinstance(node.target, str)
+        if _is_activation_post_process(modules[node.target]):
+            result = all_node_args_have_no_tensors(node.args[0], modules, cache)  # type: ignore[arg-type]
+        # for any other call_module node, `result` keeps its initial value of False
+    elif node.op == 'call_function' and node.target is operator.getitem:
+        result = all_node_args_have_no_tensors(node.args[0], modules, cache)  # type: ignore[arg-type]
+    elif node.op == 'get_attr':
+        result = False
+    elif node.target is getattr and node.args[1] in ['ndim', 'shape']:
+        # x1 = x0.ndim
+        result = True
+    elif node.op == 'call_method' and node.target == 'size':
+        # x1 = x0.size(0)
+        result = True
+    else:
+        found_one_tensor = False
+        for arg in node.args:
+            if isinstance(arg, list):
+                for list_el in arg:
+                    if isinstance(list_el, Node):
+                        this_list_el_args_have_no_tensors = \
+                            all_node_args_have_no_tensors(list_el, modules, cache)
+                        found_one_tensor = found_one_tensor or \
+                            (not this_list_el_args_have_no_tensors)
+                        # If found_one_tensor is True, there is no point in
+                        # recursing further as the end result will always
+                        # be True.
+                        # TODO(future PR): remove this entire function  and
+                        # change to dtype inference without recursion.
+                        if found_one_tensor:
+                            result = not found_one_tensor
+                            if cache:
+                                cache[node] = result
+                            return result
+            elif isinstance(arg, int):
+                pass
+            else:
+                if isinstance(arg, Node):
+                    this_arg_args_have_no_tensors = all_node_args_have_no_tensors(arg, modules, cache)
+                    found_one_tensor = found_one_tensor or \
+                        (not this_arg_args_have_no_tensors)
+                    # If found_one_tensor is True, there is no point in
+                    # recursing further as the end result will always
+                    # be True.
+                    # TODO(future PR): remove this entire function  and
+                    # change to dtype inference without recursion.
+                    if found_one_tensor:
+                        result = not found_one_tensor
+                        if cache:
+                            cache[node] = result
+                        return result
+                else:
+                    found_one_tensor = True
+            result = not found_one_tensor
+    if cache:
+        cache[node] = result
+    return result
+
+def all_node_args_except_first(node: Node) -> List[int]:
+    """
+    Returns all node arg indices after first
+    """
+    return list(range(1, len(node.args)))
+
+def return_arg_list(arg_indices: List[int]) -> Callable[[Node], List[int]]:
+    """
+    Constructs a function that takes a node as arg and returns the arg_indices
+    that are valid for node.args
+    """
+    def arg_indices_func(node: Node) -> List[int]:
+        return [i for i in arg_indices if i < len(node.args)]
+    return arg_indices_func
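+
+# e.g. return_arg_list([1, 3])(node) keeps only the indices that actually exist
+# on the node, so for a node with two args it returns [1].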
+
+NodeInfo = namedtuple("NodeInfo", "op target")
+
+# this dict identifies which indices of a node are non tensors
+# so that they can be propagated correctly since inserting observers
+# for them would cause errors
+
+NON_OBSERVABLE_ARG_DICT: Dict[NodeInfo, Dict[Union[type, torch.dtype], Callable[[Node], List[int]]]] = {
+    NodeInfo("call_method", "masked_fill") : {
+        torch.bool: return_arg_list([1]),
+        float: return_arg_list([2])
+    },
+    NodeInfo("call_method", "permute") : {
+        int: all_node_args_except_first
+    },
+    NodeInfo("call_method", "repeat") : {
+        int: all_node_args_except_first
+    },
+    NodeInfo("call_method", "reshape") : {
+        int: all_node_args_except_first
+    },
+    NodeInfo("call_method", "size") : {
+        int: return_arg_list([1])
+    },
+    NodeInfo("call_method", "transpose") : {
+        int: all_node_args_except_first
+    },
+    NodeInfo("call_method", torch.transpose) : {
+        int: all_node_args_except_first
+    },
+    NodeInfo("call_method", "unsqueeze") : {
+        int: return_arg_list([1])
+    },
+    NodeInfo("call_method", "unsqueeze_") : {
+        int: return_arg_list([1])
+    },
+    NodeInfo("call_method", torch.unsqueeze) : {
+        int: return_arg_list([1])
+    },
+    NodeInfo("call_method", "view") : {
+        int: all_node_args_except_first
+    },
+}
+
+EMPTY_ARG_DICT: Dict[Union[type, torch.dtype], Callable[[Node], List[int]]] = {}
+
+def get_non_observable_arg_indexes_and_types(node: Node) -> Dict[Union[type, torch.dtype], Callable[[Node], List[int]]]:
+    """
+    Returns a dict with non-float tensor types as keys, where each value is a
+    function (taking the node as an argument) that retrieves the corresponding arg indices
+    """
+    info = NodeInfo(node.op, node.target)
+
+    return NON_OBSERVABLE_ARG_DICT.get(info, EMPTY_ARG_DICT)
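+
+# Hedged example: for a call_method node targeting "masked_fill", the dict above
+# maps torch.bool to arg index [1] and float to arg index [2], so
+# >> get_non_observable_arg_indexes_and_types(node)[torch.bool](node)  # -> [1] (if the arg exists)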
+
+def maybe_get_next_module(
+    node: Node,
+    modules: Dict[str, nn.Module],
+    target_module_type: Optional[Type[nn.Module]] = None,
+    target_functional_type: Any = None,
+) -> Optional[Node]:
+    """ Gets the next module (among the users of `node`) that matches the given
+    target module type or functional type, if it exists
+
+    Args:
+        node: The node whose users we want to look at
+        modules: Mapping from module name to module, used to resolve call_module targets
+        target_module_type: Module type that we want to check
+        target_functional_type: Functional type that we want to check
+    """
+
+    for user in node.users.keys():
+        if user.op == 'call_module' and target_module_type is not None and \
+           isinstance(modules[str(user.target)], target_module_type):
+            return user
+        elif (user.op == 'call_function' and target_functional_type is not None and
+              user.target == target_functional_type):
+            return user
+
+    return None
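+
+# Hedged usage sketch (the `conv_node` and `named_modules` names are hypothetical):
+# >> bn_node = maybe_get_next_module(conv_node, named_modules,
+# ..                                 target_module_type=torch.nn.BatchNorm2d)
+# >> # returns the batchnorm user of `conv_node`, or None if there isn't one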
+
+def create_node_from_old_node_preserve_meta(
+    quantized_graph: Graph,
+    create_node_args: Tuple[Any, ...],
+    old_node: Node,
+) -> Node:
+    """
+    Creates `new_node` and copies the necessary metadata to it from `old_node`.
+    """
+    new_node = quantized_graph.create_node(*create_node_args)
+    new_node.stack_trace = old_node.stack_trace
+    return new_node
+
+def get_skipped_module_name_and_classes(
+        prepare_custom_config: PrepareCustomConfig,
+        is_standalone_module: bool) -> Tuple[List[str], List[Type[Any]]]:
+    skipped_module_names = copy.copy(prepare_custom_config.non_traceable_module_names)
+    skipped_module_classes = copy.copy(prepare_custom_config.non_traceable_module_classes)
+    if not is_standalone_module:
+        # standalone module and custom module config are applied in top level module
+        skipped_module_names += list(prepare_custom_config.standalone_module_names.keys())
+        skipped_module_classes += list(prepare_custom_config.standalone_module_classes.keys())
+        skipped_module_classes += get_custom_module_class_keys(prepare_custom_config.float_to_observed_mapping)
+
+    return skipped_module_names, skipped_module_classes
+
+def _is_custom_module_lstm(
+        node: Node,
+        named_modules: Dict[str, torch.nn.Module],
+        qconfig: QConfigAny = None,
+        # QuantizeHandler, but we cannot include the type here due to circular imports
+        qhandler: Optional[Any] = None,
+) -> bool:
+    """
+    Return whether this refers to the custom module LSTM flow.
+    """
+    mod = _get_module(node, named_modules)
+    if qconfig is not None and qhandler is not None:
+        assert isinstance(qhandler, torch.ao.quantization.fx.quantize_handler.QuantizeHandler)  # type: ignore[attr-defined]
+        return isinstance(mod, torch.nn.LSTM) and \
+            activation_is_statically_quantized(qconfig) and \
+            qhandler.is_custom_module()
+    else:
+        return isinstance(mod, torch.ao.nn.quantizable.LSTM)
+
+def _is_custom_module_mha(
+        node: Node,
+        named_modules: Dict[str, torch.nn.Module],
+        qconfig: QConfigAny = None,
+        # QuantizeHandler, but we cannot include the type here due to circular imports
+        qhandler: Optional[Any] = None,
+) -> bool:
+    """
+    Return whether this refers to the custom module MultiheadAttention flow.
+    """
+    mod = _get_module(node, named_modules)
+    if qconfig is not None and qhandler is not None:
+        assert isinstance(qhandler, torch.ao.quantization.fx.quantize_handler.QuantizeHandler)  # type: ignore[attr-defined]
+        return isinstance(mod, torch.nn.MultiheadAttention) and \
+            activation_is_statically_quantized(qconfig) and \
+            qhandler.is_custom_module()
+    else:
+        return isinstance(mod, torch.ao.nn.quantizable.MultiheadAttention)
+
+def _get_module(node: Node, named_modules: Dict[str, torch.nn.Module]) -> Optional[torch.nn.Module]:
+    """
+    If `node` refers to a call_module node, return the module, else None.
+    """
+    if node.op == "call_module" and str(node.target) in named_modules:
+        return named_modules[str(node.target)]
+    else:
+        return None
+
+def _insert_dequant_stub(
+    node: Node,
+    model: torch.nn.Module,
+    named_modules: Dict[str, torch.nn.Module],
+    graph: Graph,
+) -> Node:
+    """
+    Attach a `DeQuantStub` to the model and create a node that calls this
+    `DeQuantStub` on the output of `node`, similar to how observers are inserted.
+    """
+    prefix = "dequant_stub_"
+    get_new_dequant_stub_name = get_new_attr_name_with_prefix(prefix)
+    dequant_stub_name = get_new_dequant_stub_name(model)
+    dequant_stub = DeQuantStub()
+    setattr(model, dequant_stub_name, dequant_stub)
+    named_modules[dequant_stub_name] = dequant_stub
+    with graph.inserting_after(node):
+        return graph.call_module(dequant_stub_name, (node,))
+
+def _insert_dequant_stubs_for_custom_module_lstm_output(
+    node: Node,
+    model: torch.nn.Module,
+    named_modules: Dict[str, torch.nn.Module],
+    graph: Graph,
+) -> Node:
+    """
+    Insert DeQuantStubs after each internal output node of custom module LSTM.
+
+    Custom module LSTM outputs are nested tuples of the structure (output, (hidden0, hidden1)),
+    Since we cannot dequantize a tuple as a whole, we must first break down the tuple into its
+    components through `getitem`. This function transforms the graph as follows:
+
+      (1) Split the LSTM node into (output, (hidden0, hidden1))
+      (2) Insert a DeQuantStub after each internal node
+      (3) Recombine the DeQuantStubs into the same structure as before
+      (4) Reroute all consumers of the original LSTM node and its sub-nodes
+          (e.g. lstm[0])
+
+    Before:
+                   lstm_output
+                        |
+                        v
+                  original_user(s)
+    After:
+                   lstm_output
+                  /           \\
+                 /  (getitem)  \\
+                /               \\
+               v                 v
+             output            hidden
+               |               /   \\
+         (DeQuantStub)        (getitem)
+               |             /       \\
+               v            v         v
+           output_dq     hidden0    hidden1
+               |            |         |
+               |    (DeQuantStub) (DeQuantStub)
+               |            |         |
+               |            v         v
+               |      hidden0_dq  hidden1_dq
+               |            \\       /
+               |              (tuple)
+               |              \\   /
+               |               v  v
+               |             hidden_dq
+               \\               /
+                \\   (tuple)   /
+                 v            v
+                 lstm_output_dq
+                       |
+                       v
+                original_user(s)
+
+    For step (4), reroute all users of the original LSTM node(s) as follows:
+      lstm_output -> lstm_output_dq
+      lstm_output[0] -> output_dq
+      lstm_output[1] -> hidden_dq
+      lstm_output[1][0] -> hidden0_dq
+      lstm_output[1][1] -> hidden1_dq
+
+    Return the node `lstm_output_dq`.
+    """
+    # (1) Split the LSTM node into (output, (hidden0, hidden1))
+    # (2) Insert a DeQuantStub after each internal node
+    with graph.inserting_after(node):
+        output = graph.call_function(operator.getitem, (node, 0))
+        output_dq = _insert_dequant_stub(output, model, named_modules, graph)
+    with graph.inserting_after(output_dq):
+        hidden = graph.call_function(operator.getitem, (node, 1))
+    with graph.inserting_after(hidden):
+        hidden0 = graph.call_function(operator.getitem, (hidden, 0))
+        hidden0_dq = _insert_dequant_stub(hidden0, model, named_modules, graph)
+    with graph.inserting_after(hidden0_dq):
+        hidden1 = graph.call_function(operator.getitem, (hidden, 1))
+        hidden1_dq = _insert_dequant_stub(hidden1, model, named_modules, graph)
+
+    # (3) Recombine the DeQuantStubs into the same structure as before
+    with graph.inserting_after(hidden1_dq):
+        hidden_dq = graph.call_function(tuple, ([hidden0_dq, hidden1_dq],))
+    with graph.inserting_after(hidden_dq):
+        lstm_output_dq = graph.call_function(tuple, ([output_dq, hidden_dq],))
+
+    # (4) Reroute all consumers of the original LSTM node and its sub-nodes
+    for user in list(node.users.keys()):
+        if user != output and user != hidden:
+            user.replace_input_with(node, lstm_output_dq)
+    # The getitem and tuple nodes we added here may interfere with reference quantized
+    # pattern matching, so we need to redirect the consumers of internal nodes to the
+    # corresponding nodes with DeQuantStubs (e.g. lstm_output_dq[0] -> output_dq) attached,
+    # in order to preserve reference patterns like "dequantize - consumer - quantize".
+    _reroute_tuple_getitem_pattern(graph)
+    return lstm_output_dq
+
+def _maybe_get_custom_module_lstm_from_node_arg(
+    arg: Node,
+    named_modules: Dict[str, torch.nn.Module],
+) -> Optional[Node]:
+    """
+    Given an argument of a node, if the argument refers to the path through which the node
+    is a consumer of custom module LSTM, return the custom module LSTM node, or None otherwise.
+
+    This is used to determine whether a node is a consumer of custom module LSTM, and, if so,
+    skip inserting input observers for this node. This is because custom module LSTM produces
+    quantized outputs, so inserting an input observer for the consumer of custom module LSTM
+    would unnecessarily quantize the outputs again.
+
+      lstm -> consumer
+
+    In practice, however, custom module LSTM outputs a tuple (output, (hidden0, hidden1)) with
+    DeQuantStubs attached to each internal node (see `_insert_dequant_stubs_for_custom_module_lstm_output`).
+    This tuple can be consumed in one of four ways:
+
+      lstm -> getitem -> DeQuantStub -> consumer                       # consume lstm[0]
+      lstm -> getitem -> getitem -> DeQuantStub -> tuple -> consumer   # consume lstm[1]
+      lstm -> getitem -> getitem -> DeQuantStub -> consumer            # consume lstm[1][0] or lstm[1][1]
+      lstm -> getitem -> DeQuantStub -> tuple -> consumer              # consume lstm
+
+    Thus, we must match against the above patterns instead of simply checking the parent node
+    to determine whether this node is a consumer of a custom module LSTM.
+    """
+    def match_dq(a):
+        return isinstance(_get_module(a, named_modules), DeQuantStub)
+
+    def match_lstm(a):
+        return _is_custom_module_lstm(a, named_modules)
+
+    def match_getitem(a):
+        return a.op == "call_function" and a.target == operator.getitem
+
+    def match_tuple(a):
+        return a.op == "call_function" and a.target == tuple
+
+    def _match_pattern(match_pattern: List[Callable]) -> Optional[Node]:
+        """
+        Traverse up the graph and match the args one by one.
+        If there is a match, return the last matched node, or None otherwise.
+        """
+        a = arg
+        for i, match in enumerate(match_pattern):
+            if not match(a):
+                return None
+            # Match next arg, for tuple the arg is a tuple of a list, e.g. ([dq_1, other_node],)
+            if i < len(match_pattern) - 1:
+                if match == match_tuple:
+                    a = a.args[0][0]  # type: ignore[assignment,index]
+                else:
+                    a = a.args[0]  # type: ignore[assignment]
+        return a
+
+    all_match_patterns = [
+        [match_dq, match_getitem, match_lstm],
+        [match_tuple, match_dq, match_getitem, match_getitem, match_lstm],
+        [match_dq, match_getitem, match_getitem, match_lstm],
+        [match_tuple, match_dq, match_getitem, match_lstm],
+    ]
+
+    for p in all_match_patterns:
+        matched_node = _match_pattern(p)
+        if matched_node is not None:
+            return matched_node
+    return None
+
+def _reroute_tuple_getitem_pattern(graph: Graph):
+    """
+    Search for patterns where N consecutive `tuple` call_function nodes are followed by
+    N consecutive `getitem` call_function nodes that are "reverses" of the `tuple` nodes.
+    If we find this pattern, reroute the consumers of the last `getitem` to skip these
+    N `tuple` and `getitem` nodes.
+
+    Before:
+
+        a   b     c
+        |   \\   /
+        \\   tuple
+         \\   /
+          tuple
+            |
+        getitem(1)
+            |
+        getitem(0)
+            |
+            d
+
+    After:
+
+        b
+        |
+        d
+    """
+    def find_patterns(
+            node: Node,
+            index_stack: List[int],
+            current_pattern: List[Node],
+            matched_patterns: List[List[Node]],
+            seen: Set[Tuple[Node, Tuple[int, ...]]]):
+        """
+        Traverse the graph recursively to match for the N-tuple - N-getitem patterns,
+        starting at the given node.
+
+        We use a stack to keep track of the expected `getitem` indices, since these are
+        reversed from the `tuple` indices. In the above example, the stack after
+        (b -> tuple -> tuple) will be [0, 1], which will be popped by getitem(1) first
+        and then by getitem(0).
+
+        TODO: traverse upwards from the output and handle the case when tuple is not a
+        separate node, e.g. graph.call_function(operator.getitem, args=(a, (b, c)))
+        """
+        if len(index_stack) == 0 and len(current_pattern) > 0:
+            matched_patterns.append(copy.copy(current_pattern))
+            current_pattern.clear()
+
+        # Avoid duplicating work
+        state = (node, tuple(index_stack))
+        if state in seen:
+            return
+        seen.add(state)
+
+        # Iterate through users of this node to find tuple/getitem nodes to match
+        for user in node.users:
+            if user.op == "call_function" and user.target == tuple:
+                for i, user_arg in enumerate(user.args[0]):  # type: ignore[arg-type]
+                    if user_arg == node:
+                        index_stack.append(i)
+                        current_pattern.append(user)
+                        find_patterns(user, index_stack, current_pattern, matched_patterns, seen)
+            elif user.op == "call_function" and user.target == operator.getitem:
+                if len(index_stack) > 0:
+                    if user.args[1] == index_stack[-1]:
+                        index_stack.pop()
+                        current_pattern.append(user)
+                        find_patterns(user, index_stack, current_pattern, matched_patterns, seen)
+        return matched_patterns
+
+    # Collect all matched patterns
+    matched_patterns: List[List[Node]] = []
+    seen: Set[Tuple[Node, Tuple[int, ...]]] = set()  # (node, index_stack)
+    for node in graph.nodes:
+        find_patterns(node, [], [], matched_patterns, seen)
+
+    # For each pattern, redirect all consumers of the last getitem node to the correct input
+    # of the first tuple node
+    for pattern in matched_patterns:
+        first_tuple = pattern[0]
+        last_getitem = pattern[-1]
+        assert first_tuple.op == "call_function" and first_tuple.target == tuple
+        assert last_getitem.op == "call_function" and last_getitem.target == operator.getitem
+        last_getitem_index = last_getitem.args[1]
+        new_input = first_tuple.args[0][last_getitem_index]  # type: ignore[index]
+        for user in list(last_getitem.users.keys()):
+            user.replace_input_with(last_getitem, new_input)
+
+def _get_observer_from_activation_post_process(
+    activation_post_process: Union[ObserverBase, FakeQuantizeBase],
+) -> ObserverBase:
+    """
+    If `activation_post_process` is an observer, return the observer.
+    If `activation_post_process` is a fake quantize, return the internal observer.
+    """
+    if isinstance(activation_post_process, ObserverBase):
+        return activation_post_process
+    else:
+        assert isinstance(activation_post_process, FakeQuantizeBase)
+        return activation_post_process.activation_post_process  # type: ignore[return-value]
+
+def _qconfig_satisfies_dtype_config_constraints(
+        qconfig: QConfigAny,
+        dtype_with_constraints: DTypeWithConstraints,
+        is_activation: bool = True) -> bool:
+    """
+    Return whether `qconfig` satisfies the following constraints from the backend,
+    specified through the activation and weight DTypeWithConstraints.
+
+        1. QConfig specified a quantization range that falls within the backend's, if any
+        2. QConfig specified a min scale value that is >= the backend's, if any
+        3. QConfig specified a FixedQParamsObserver or FixedQParamsFakeQuantize that has
+           scale and zero point that match the backend's, if any
+
+    If `is_activation` is True, we check `qconfig.activation`, else we check `qconfig.weight`.
+    If `qconfig` or `dtype_with_constraints.dtype` is None, or the dtypes do not match, return True.
+    """
+    # TODO: log warnings only when the user enabled a debug flag
+    def _activation_post_process_satisfies_dtype_config_constraints(
+            activation_post_process: Union[ObserverBase, FakeQuantizeBase],
+            dtype_with_constraints: DTypeWithConstraints,
+            debug_string: str) -> bool:
+        observer = _get_observer_from_activation_post_process(activation_post_process)
+        app_quant_min = getattr(observer, "quant_min", None)
+        app_quant_max = getattr(observer, "quant_max", None)
+        # TODO: for now, just use the existing eps value as scale_min. In the future, we should
+        # resolve the differences between the two, either by renaming eps or some other way
+        app_scale_min = getattr(observer, "eps", None)
+        backend_quant_min = dtype_with_constraints.quant_min_lower_bound
+        backend_quant_max = dtype_with_constraints.quant_max_upper_bound
+        backend_scale_min = dtype_with_constraints.scale_min_lower_bound
+        backend_scale_exact_match = dtype_with_constraints.scale_exact_match
+        backend_zero_point_exact_match = dtype_with_constraints.zero_point_exact_match
+        # check quantization ranges
+        if backend_quant_min is not None and backend_quant_max is not None:
+            if app_quant_min is None or app_quant_max is None:
+                warnings.warn(f"QConfig {debug_string} must specify 'quant_min' and 'quant_max', ignoring {qconfig}")
+                return False
+            elif app_quant_min < backend_quant_min or app_quant_max > backend_quant_max:
+                warnings.warn(
+                    f"QConfig {debug_string} quantization range must fall within the backend's:\n"
+                    f"QConfig range = ({app_quant_min}, {app_quant_max}), "
+                    f"BackendConfig range = ({backend_quant_min}, {backend_quant_max}), "
+                    f"ignoring {qconfig}"
+                )
+                return False
+        # check scale min
+        if backend_scale_min is not None:
+            if app_scale_min is None:
+                warnings.warn(f"QConfig {debug_string} must specify 'eps', ignoring {qconfig}")
+                return False
+            if app_scale_min < backend_scale_min:
+                warnings.warn(
+                    f"QConfig {debug_string} eps ({app_scale_min}) must be greater than or equal to "
+                    f"the backend's min scale value ({backend_scale_min}), ignoring {qconfig}"
+                )
+                return False
+        # check fixed scale and zero point
+        if backend_scale_exact_match is not None and backend_zero_point_exact_match is not None:
+            # For tests only, accept the following qconfigs for now
+            # TODO: handle fp16 qconfigs properly
+            for accepted_qconfig in [float16_static_qconfig, float16_dynamic_qconfig]:
+                if qconfig_equals(qconfig, accepted_qconfig):
+                    return True
+            suggestion_str = (
+                "Please use torch.ao.quantization.get_default_qconfig_mapping or "
+                "torch.ao.quantization.get_default_qat_qconfig_mapping. Example:\n"
+                "    qconfig_mapping = get_default_qconfig_mapping(\"fbgemm\")\n"
+                "    model = prepare_fx(model, qconfig_mapping, example_inputs)"
+            )
+            if not isinstance(activation_post_process, FixedQParamsObserver) and \
+                    not isinstance(activation_post_process, FixedQParamsFakeQuantize):
+                warnings.warn(
+                    f"QConfig must specify a FixedQParamsObserver or a FixedQParamsFakeQuantize "
+                    f"for fixed qparams ops, ignoring {qconfig}.\n{suggestion_str}"
+                )
+                return False
+            if observer.scale != backend_scale_exact_match or observer.zero_point != backend_zero_point_exact_match:
+                warnings.warn(
+                    f"QConfig fixed scale ({observer.scale}) and zero point ({observer.zero_point}) "
+                    f"do not match the backend's ({backend_scale_exact_match} and {backend_zero_point_exact_match}), "
+                    f"ignoring {qconfig}.\n{suggestion_str}"
+                )
+                return False
+        return True
+
+    if qconfig is None or dtype_with_constraints.dtype is None:
+        return True
+
+    activation_post_process_ctr = qconfig.activation if is_activation else qconfig.weight
+    debug_string = "activation" if is_activation else "weight"
+    satisfies_constraints = True
+    if activation_post_process_ctr is not None:
+        activation_post_process = activation_post_process_ctr()
+        assert _is_activation_post_process(activation_post_process)
+        # If dtypes don't match, don't check the activation_post_process and return True early
+        if activation_post_process.dtype != dtype_with_constraints.dtype:
+            return True
+        satisfies_constraints = _activation_post_process_satisfies_dtype_config_constraints(
+            activation_post_process, dtype_with_constraints, debug_string)
+    return satisfies_constraints
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/observer.py b/MLPY/Lib/site-packages/torch/ao/quantization/observer.py
new file mode 100644
index 0000000000000000000000000000000000000000..45036534daf7d0ce4cfd9270fe242a39c0c315e7
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/observer.py
@@ -0,0 +1,1688 @@
+"""
+This module implements observers which are used to collect statistics about
+the values observed during calibration (PTQ) or training (QAT).
+"""
+
+import re
+import warnings
+from abc import ABCMeta, abstractmethod
+from collections import OrderedDict
+from functools import partial
+from typing import Any, List, Tuple, Optional, Dict
+
+import torch
+import torch.nn as nn
+from torch.ao.quantization.utils import (
+    check_min_max_valid, calculate_qmin_qmax, is_per_tensor, is_per_channel, validate_qmin_qmax)
+
+__all__ = [
+    "default_affine_fixed_qparams_observer",
+    "default_debug_observer",
+    "default_dynamic_quant_observer",
+    "default_fixed_qparams_range_0to1_observer",
+    "default_fixed_qparams_range_neg1to1_observer",
+    "default_float_qparams_observer",
+    "default_float_qparams_observer_4bit",
+    "default_histogram_observer",
+    "default_observer",
+    "default_per_channel_weight_observer",
+    "default_placeholder_observer",
+    "default_reuse_input_observer",
+    "default_symmetric_fixed_qparams_observer",
+    "default_weight_observer",
+    "get_observer_state_dict",
+    "load_observer_state_dict",
+    "per_channel_weight_observer_range_neg_127_to_127",
+    "weight_observer_range_neg_127_to_127",
+    "FixedQParamsObserver",
+    "HistogramObserver",
+    "MinMaxObserver",
+    "MovingAverageMinMaxObserver",
+    "MovingAveragePerChannelMinMaxObserver",
+    "NoopObserver",
+    "ObserverBase",
+    "PerChannelMinMaxObserver",
+    "PlaceholderObserver",
+    "RecordingObserver",
+    "ReuseInputObserver",
+    "UniformQuantizationObserverBase",
+]
+
+
+class _PartialWrapper:
+    def __init__(self, p):
+        self.p = p
+        self.callable_args = {}
+
+    def __call__(self, *args, **keywords):
+        # call each arg in callable_args and add the result to keywords, then run with keywords;
+        # skip if arg_name is already in keywords so it's possible to overwrite
+        for arg_name in self.callable_args:
+            if arg_name not in keywords:
+                keywords = {**keywords, arg_name: self.callable_args[arg_name]()}
+        return self.p(*args, **keywords)
+
+    def __repr__(self):
+        return self.p.__repr__() + self.callable_args.__repr__()
+
+    def with_args(self, **kwargs):
+        return _with_args(self, **kwargs)
+
+    def with_callable_args(self, **kwargs):
+        result = _PartialWrapper(p=self.p)
+        result.callable_args = {**self.callable_args, **kwargs}
+        return result
+
+
+def _with_args(cls_or_self, **kwargs):
+    r"""Wrapper that allows creation of class factories.
+
+    This can be useful when there is a need to create classes with the same
+    constructor arguments, but different instances. Can be used in conjunction with
+    _callable_args
+
+    Example::
+
+        >>> # xdoctest: +SKIP("Undefined vars")
+        >>> Foo.with_args = classmethod(_with_args)
+        >>> foo_builder = Foo.with_args(a=3, b=4).with_args(answer=42)
+        >>> foo_instance1 = foo_builder()
+        >>> foo_instance2 = foo_builder()
+        >>> id(foo_instance1) == id(foo_instance2)
+        False
+    """
+    r = _PartialWrapper(partial(cls_or_self, **kwargs))
+    return r
+
+def _with_callable_args(cls_or_self, **kwargs):
+    r"""Wrapper that allows creation of class factories args that need to be
+    called at construction time.
+
+    This can be useful when there is a need to create classes with the same
+    constructor arguments, but different instances and those arguments should only
+    be calculated at construction time. Can be used in conjunction with _with_args
+
+    Example::
+
+        >>> # xdoctest: +SKIP("Undefined vars")
+        >>> Foo.with_callable_args = classmethod(_with_callable_args)
+        >>> Foo.with_args = classmethod(_with_args)
+        >>> foo_builder = Foo.with_callable_args(cur_time=get_time_func).with_args(name="dan")
+        >>> foo_instance1 = foo_builder()
+        >>> # wait 50
+        >>> foo_instance2 = foo_builder()
+        >>> id(foo_instance1.creation_time) == id(foo_instance2.creation_time)
+        False
+    """
+    r = _PartialWrapper(partial(cls_or_self))
+    return r.with_callable_args(**kwargs)
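+
+
+# A minimal sketch of how these factory helpers are typically used in practice
+# (illustrative comment only; it assumes the observer classes defined later in
+# this module):
+#
+#     observer_ctr = MinMaxObserver.with_args(dtype=torch.qint8,
+#                                             qscheme=torch.per_tensor_symmetric)
+#     obs_a = observer_ctr()   # two independent observer instances built from
+#     obs_b = observer_ctr()   # the same pre-bound constructor arguments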
+
+
+ABC: Any = ABCMeta("ABC", (object,), {})  # compatible with Python 2 *and* 3
+
+
+class ObserverBase(ABC, nn.Module):
+    r"""Base observer Module.
+    Any observer implementation should derive from this class.
+
+    Concrete observers should follow the same API. In forward, they will update
+    the statistics of the observed Tensor. And they should provide a
+    `calculate_qparams` function that computes the quantization parameters given
+    the collected statistics.
+
+    Args:
+        dtype: dtype argument to the `quantize` node needed to implement the
+               reference model spec.
+        is_dynamic: indicator for whether the observer is a placeholder for dynamic quantization
+                    or static quantization
+    """
+
+    def __init__(self, dtype, is_dynamic=False):
+        super().__init__()
+        self.dtype = dtype
+        self.is_dynamic = is_dynamic
+
+    @abstractmethod
+    def forward(self, x):
+        pass
+
+    @abstractmethod
+    def calculate_qparams(self, **kwargs):
+        pass
+
+    with_args = classmethod(_with_args)
+    with_callable_args = classmethod(_with_callable_args)
+
+
+class UniformQuantizationObserverBase(ObserverBase):
+    r"""Common base for all observers using uniform quantization to calculate
+    scale and zero_point.
+
+    Args:
+        dtype: dtype argument to the `quantize` node needed to implement the
+               reference model spec.
+        qscheme: Quantization scheme to be used.
+        reduce_range: Reduces the range of the quantized data type by 1 bit.
+                      This is sometimes required to avoid instruction overflow.
+        quant_min: Minimum quantization value. If unspecified, it will follow the 8-bit setup.
+        quant_max: Maximum quantization value. If unspecified, it will follow the 8-bit setup.
+        eps: Epsilon value for float32. Defaults to `torch.finfo(torch.float32).eps`.
+
+    .. warning::
+
+        :attr:`dtype` can only take ``torch.qint8``, ``torch.quint8``,
+        ``torch.int8`` or ``torch.uint8``.
+
+    .. warning::
+
+        :attr:`qscheme` can only take one of the following options:
+
+        - ``torch.per_tensor_affine``
+        - ``torch.per_tensor_symmetric``
+        - ``torch.per_channel_affine``
+        - ``torch.per_channel_symmetric``
+    """
+
+    # Note: the version is shared by all observer types
+    #
+    # Version 1/None
+    #   self
+    #
+    # Version 2 (base class only, does not include child class buffers)
+    #   self
+    #   |--- eps : Tensor
+    #
+    # Version 3
+    #   for HistogramObserver only, changed the shape of uninitialized
+    #   min_val and max_val buffers from torch.Size([0]) to torch.Size([])
+    #   for PerChannelObservers, changed the name of the buffers from min_vals
+    #   to min_val and from max_vals to max_val.
+    _version = 3
+
+    eps: torch.Tensor
+
+    def __init__(
+        self,
+        dtype=torch.quint8,
+        qscheme=torch.per_tensor_affine,
+        reduce_range=False,
+        quant_min=None,
+        quant_max=None,
+        factory_kwargs=None,
+        eps=torch.finfo(torch.float32).eps,
+        is_dynamic=False,
+        **kwargs,
+    ) -> None:
+        factory_kwargs = torch.nn.factory_kwargs(factory_kwargs)
+        super().__init__(dtype=dtype, is_dynamic=is_dynamic, **kwargs)
+        self.qscheme = qscheme
+        if reduce_range:
+            warnings.warn(
+                "Please use quant_min and quant_max to specify the range for observers. "
+                "reduce_range will be deprecated in a future release of PyTorch."
+            )
+        self.reduce_range = reduce_range
+        self.register_buffer(
+            "eps", torch.tensor([eps], **factory_kwargs)
+        )
+        assert self.qscheme in (
+            torch.per_tensor_affine,
+            torch.per_tensor_symmetric,
+            torch.per_channel_affine,
+            torch.per_channel_symmetric,
+            torch.per_channel_affine_float_qparams,
+        ), (
+            "Default Observer only works for per_tensor_affine, per_tensor_symmetric, "
+            "per_channel_affine, per_channel_symmetric and per_channel_affine_float_qparams "
+            "quantization schemes"
+        )
+
+        _ALLOWED_DTYPES = (
+            torch.qint8,
+            torch.quint8,
+            torch.quint4x2,
+            torch.qint32,
+            torch.int8,
+            torch.uint8,
+            torch.int16,
+            torch.int32,
+        )
+
+        assert self.dtype in _ALLOWED_DTYPES, f"Default Observer only works for {_ALLOWED_DTYPES} data type"
+        self.has_customized_qrange = (quant_min is not None) and (quant_max is not None)
+        if self.has_customized_qrange:
+            validate_qmin_qmax(quant_min, quant_max)
+        self.quant_min, self.quant_max = \
+            calculate_qmin_qmax(quant_min, quant_max, self.has_customized_qrange, self.dtype, self.reduce_range)
+
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+
+        version = local_metadata.get("version", None)
+
+        if version is None or version == 1:
+            # eps was moved to a buffer in version 2
+            eps = torch.tensor([torch.finfo(torch.float32).eps])
+            state_dict[prefix + "eps"] = eps
+
+        super()._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            strict,
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )
+
+    @torch.jit.export
+    def _validate_qmin_qmax(self, quant_min: int, quant_max: int) -> None:
+        r"""Validates that the user-specified quantization range is properly initialized
+        and within the given bound supported by the observer dtype.
+
+        To accommodate lower-bit quantization with respect to the existing torch.qint8 and
+        torch.quint8 datatypes, the user can choose to use dynamic quantization range by passing
+        in a tuple of initial qmin and qmax values. One use case is these customized qmin and qmax
+        values are used to calculate static estimates of the scale and zero point for aggressive lower-bit
+        fake quantization. These estimates are compared against parameters learned through backpropagation.
+        The related literatures for scale and zero point via backpropagation are as follows:
+
+        Learned Step Size Quantization: https://openreview.net/pdf?id=rkgO66VKDS
+        Trained Quantization Thresholds: https://arxiv.org/pdf/1903.08066.pdf
+        """
+        # The user-specified qmin and qmax might be adjusted later based on whether the
+        # quantization range is reduced and the datatype (signed/unsigned) used by the observer.
+        assert (
+            quant_min <= 0 <= quant_max
+        ), "User-specified quantization range must include 0."
+        assert (
+            quant_min < quant_max
+        ), "qmin must be strictly less than qmax for user-specified quantization range."
+
+    @torch.jit.export
+    def _calculate_qparams(
+        self, min_val: torch.Tensor, max_val: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        r"""Calculates the quantization parameters, given min and max
+        value tensors. Works for both per tensor and per channel cases
+
+        Args:
+            min_val: Minimum values per channel
+            max_val: Maximum values per channel
+
+        Returns:
+            scales: Scales tensor of shape (#channels,)
+            zero_points: Zero points tensor of shape (#channels,)
+        """
+        # Functionally equivalent to 'determine_qparams' in utils.py. However, observers must be
+        # torchscriptable, and qscheme apparently cannot be passed as a parameter to torchscript
+        # functions. That makes refactoring the observer to reuse that utility painful, so for now
+        # we duplicate the logic here; once torchscript is fully deprecated this can be refactored.
+        # TODO(jakeszwe, jerryzh168)
+        if not check_min_max_valid(min_val, max_val):
+            return torch.tensor([1.0], device=min_val.device.type), torch.tensor([0], device=min_val.device.type)
+
+        quant_min, quant_max = self.quant_min, self.quant_max
+        min_val_neg = torch.min(min_val, torch.zeros_like(min_val))
+        max_val_pos = torch.max(max_val, torch.zeros_like(max_val))
+
+        device = min_val_neg.device
+        scale = torch.ones(min_val_neg.size(), dtype=torch.float32, device=device)
+        zero_point = torch.zeros(min_val_neg.size(), dtype=torch.int64, device=device)
+
+        if (
+            self.qscheme == torch.per_tensor_symmetric
+            or self.qscheme == torch.per_channel_symmetric
+        ):
+            max_val_pos = torch.max(-min_val_neg, max_val_pos)
+            scale = max_val_pos / (float(quant_max - quant_min) / 2)
+            scale = torch.max(scale, self.eps)
+            if self.dtype in [torch.quint8, torch.uint8]:
+                if self.has_customized_qrange:
+                    # When customized quantization range is used, down-rounded midpoint of the range is chosen.
+                    zero_point = zero_point.new_full(
+                        zero_point.size(), (quant_min + quant_max) // 2
+                    )
+                else:
+                    zero_point = zero_point.new_full(zero_point.size(), 128)
+        elif self.qscheme == torch.per_channel_affine_float_qparams:
+            scale = (max_val - min_val) / float(quant_max - quant_min)
+            scale = torch.where(scale > self.eps, scale, torch.ones_like(scale))
+            # We use the quantize function
+            # xq = Round(Xf * inv_scale + zero_point),
+            # setting zero_point to (-1 * min *inv_scale) we get
+            # Xq = Round((Xf - min) * inv_scale)
+            zero_point = -1 * min_val / scale
+        else:
+            scale = (max_val_pos - min_val_neg) / float(quant_max - quant_min)
+            scale = torch.max(scale, self.eps)
+            zero_point = quant_min - torch.round(min_val_neg / scale).to(torch.int)
+            zero_point = torch.clamp(zero_point, quant_min, quant_max)
+
+        # For scalar values, cast them to Tensors of size 1 to keep the shape
+        # consistent with default values in FakeQuantize.
+        if len(scale.shape) == 0:
+            # TODO: switch to scale.item() after adding JIT support
+            scale = torch.tensor([float(scale)], dtype=scale.dtype, device=device)
+        if len(zero_point.shape) == 0:
+            # TODO: switch to zero_point.item() after adding JIT support
+            zero_point = torch.tensor(
+                [int(zero_point)], dtype=zero_point.dtype, device=device
+            )
+            if self.qscheme == torch.per_channel_affine_float_qparams:
+                zero_point = torch.tensor(
+                    [float(zero_point)], dtype=zero_point.dtype, device=device
+                )
+
+        return scale, zero_point
+
+    @torch.jit.export
+    def reset_min_max_vals(self):
+        raise NotImplementedError("Cannot reset min/max values in the given observer.")
+
+
+# Originally, this class was called `_ObserverBase`.  Keeping the old name around
+# for backwards compatibility.
+# TODO(after v1.13): delete this
+_ObserverBase = UniformQuantizationObserverBase
+
+
+class MinMaxObserver(UniformQuantizationObserverBase):
+    r"""Observer module for computing the quantization parameters based on the
+    running min and max values.
+
+    This observer uses the tensor min/max statistics to compute the quantization
+    parameters. The module records the running minimum and maximum of incoming
+    tensors, and uses this statistic to compute the quantization parameters.
+
+    Args:
+        dtype: dtype argument to the `quantize` node needed to implement the
+               reference model spec.
+        qscheme: Quantization scheme to be used
+        reduce_range: Reduces the range of the quantized data type by 1 bit
+        quant_min: Minimum quantization value. If unspecified, it will follow the 8-bit setup.
+        quant_max: Maximum quantization value. If unspecified, it will follow the 8-bit setup.
+        eps: Epsilon value for float32. Defaults to `torch.finfo(torch.float32).eps`.
+
+    Given running min/max as :math:`x_\text{min}` and :math:`x_\text{max}`,
+    scale :math:`s` and zero point :math:`z` are computed as follows.
+
+    The running minimum/maximum :math:`x_\text{min/max}` is computed as:
+
+    .. math::
+
+        \begin{array}{ll}
+        x_\text{min} &= \begin{cases}
+            \min(X) & \text{if~}x_\text{min} = \text{None} \\
+            \min\left(x_\text{min}, \min(X)\right) & \text{otherwise}
+        \end{cases}\\
+        x_\text{max} &= \begin{cases}
+            \max(X) & \text{if~}x_\text{max} = \text{None} \\
+            \max\left(x_\text{max}, \max(X)\right) & \text{otherwise}
+        \end{cases}\\
+        \end{array}
+
+    where :math:`X` is the observed tensor.
+
+    The scale :math:`s` and zero point :math:`z` are then computed as:
+
+    .. math::
+
+        \begin{aligned}
+            \text{if Symmetric:}&\\
+            &s = 2 \max(|x_\text{min}|, x_\text{max}) /
+                \left( Q_\text{max} - Q_\text{min} \right) \\
+            &z = \begin{cases}
+                0 & \text{if dtype is qint8} \\
+                128 & \text{otherwise}
+            \end{cases}\\
+            \text{Otherwise:}&\\
+                &s = \left( x_\text{max} - x_\text{min}  \right ) /
+                    \left( Q_\text{max} - Q_\text{min} \right ) \\
+                &z = Q_\text{min} - \text{round}(x_\text{min} / s)
+        \end{aligned}
+
+    where :math:`Q_\text{min}` and :math:`Q_\text{max}` are the minimum and
+    maximum of the quantized data type.
+
+    .. warning:: :attr:`dtype` can only take ``torch.qint8`` or ``torch.quint8``.
+
+    .. note:: If the running minimum equals the running maximum, the scale
+              and zero_point are set to 1.0 and 0.
+    """
+    min_val: torch.Tensor
+    max_val: torch.Tensor
+
+    def __init__(
+        self,
+        dtype=torch.quint8,
+        qscheme=torch.per_tensor_affine,
+        reduce_range=False,
+        quant_min=None,
+        quant_max=None,
+        factory_kwargs=None,
+        eps=torch.finfo(torch.float32).eps,
+        is_dynamic=False,
+        **kwargs,
+    ) -> None:
+        if not is_per_tensor(qscheme):
+            raise NotImplementedError(
+                "MinMaxObserver's qscheme only supports torch.per_tensor_symmetric "
+                "and torch.per_tensor_affine."
+            )
+        # TODO: MinMaxObserver by itself doesn't support dynamic quantization, but
+        # if it's inherited by MovingAverageObserver, and averaging_constant is 1, it
+        # supports dynamic quantization, we may need to better error checking here
+
+        # For x86 quantized kernels, we need to ensure that the vpmaddubsw
+        # instruction does not overflow. We allow for a reduce_range argument to
+        # observers that reduces the quantized range to (0,127) or (-64, 63).
+        # For more details see aten/src/ATen/native/quantized/cpu/qconv.cpp
+        # This is not an optimal choice for non x86 backends as it loses a bit
+        # of precision for activations.
+        super().__init__(
+            dtype=dtype,
+            qscheme=qscheme,
+            reduce_range=reduce_range,
+            quant_min=quant_min,
+            quant_max=quant_max,
+            factory_kwargs=factory_kwargs,
+            eps=eps,
+            is_dynamic=is_dynamic,
+            **kwargs,
+        )
+        factory_kwargs = torch.nn.factory_kwargs(factory_kwargs)
+        self.register_buffer("min_val", torch.tensor(float("inf"), **factory_kwargs))
+        self.register_buffer("max_val", torch.tensor(float("-inf"), **factory_kwargs))
+        if (
+            self.qscheme == torch.per_tensor_symmetric
+            and self.reduce_range
+            and self.dtype == torch.quint8
+        ):
+            raise NotImplementedError(
+                "Cannot reduce range for symmetric quantization for quint8"
+            )
+
+    def forward(self, x_orig):
+        r"""Records the running minimum and maximum of ``x``."""
+        if x_orig.numel() == 0:
+            return x_orig
+        x = x_orig.detach()  # avoid keeping autograd tape
+        x = x.to(self.min_val.dtype)
+        min_val_cur, max_val_cur = torch.aminmax(x)
+        min_val = torch.min(min_val_cur, self.min_val)
+        max_val = torch.max(max_val_cur, self.max_val)
+        self.min_val.copy_(min_val)
+        self.max_val.copy_(max_val)
+        return x_orig
+
+    @torch.jit.export
+    def calculate_qparams(self):
+        r"""Calculates the quantization parameters."""
+        return self._calculate_qparams(self.min_val, self.max_val)
+
+    @torch.jit.export
+    def extra_repr(self):
+        return f"min_val={self.min_val}, max_val={self.max_val}"
+
+    @torch.jit.export
+    def reset_min_max_vals(self):
+        """Resets the min/max values."""
+        self.min_val.copy_(torch.tensor(float("inf")))
+        self.max_val.copy_(torch.tensor(float("-inf")))
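+
+
+# A minimal MinMaxObserver calibration sketch (illustrative comment only; the
+# sample values are an assumption used to make the arithmetic concrete):
+#
+#     obs = MinMaxObserver(dtype=torch.quint8, qscheme=torch.per_tensor_affine)
+#     obs(torch.tensor([-1.0, 0.0, 2.0]))          # running min = -1.0, max = 2.0
+#     scale, zero_point = obs.calculate_qparams()
+#     # affine case: scale = (2.0 - (-1.0)) / 255 ~= 0.0118
+#     #              zero_point = 0 - round(-1.0 / scale) = 85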
+
+class MovingAverageMinMaxObserver(MinMaxObserver):
+    r"""Observer module for computing the quantization parameters based on the
+    moving average of the min and max values.
+
+    This observer computes the quantization parameters based on the moving
+    averages of minimums and maximums of the incoming tensors. The module
+    records the average minimum and maximum of incoming tensors, and uses this
+    statistic to compute the quantization parameters.
+
+    Args:
+        averaging_constant: Averaging constant for min/max.
+        dtype: dtype argument to the `quantize` node needed to implement the
+               reference model spec.
+        qscheme: Quantization scheme to be used
+        reduce_range: Reduces the range of the quantized data type by 1 bit
+        quant_min: Minimum quantization value. If unspecified, it will follow the 8-bit setup.
+        quant_max: Maximum quantization value. If unspecified, it will follow the 8-bit setup.
+        eps: Epsilon value for float32. Defaults to `torch.finfo(torch.float32).eps`.
+
+    The moving average min/max is computed as follows
+
+    .. math::
+
+        \begin{array}{ll}
+                x_\text{min} = \begin{cases}
+                    \min(X) & \text{if~}x_\text{min} = \text{None} \\
+                    (1 - c) x_\text{min} + c \min(X) & \text{otherwise}
+                \end{cases}\\
+                x_\text{max} = \begin{cases}
+                    \max(X) & \text{if~}x_\text{max} = \text{None} \\
+                    (1 - c) x_\text{max} + c \max(X) & \text{otherwise}
+                \end{cases}\\
+        \end{array}
+
+    where :math:`x_\text{min/max}` is the running average min/max, :math:`X`
+    is the incoming tensor, and :math:`c` is the ``averaging_constant``.
+
+    The scale and zero point are then computed as in
+    :class:`~torch.ao.quantization.observer.MinMaxObserver`.
+
+    .. note:: Only works with ``torch.per_tensor_affine`` quantization scheme.
+
+    .. note:: If the running minimum equals the running maximum, the scale
+              and zero_point are set to 1.0 and 0.
+    """
+
+    def __init__(
+        self,
+        averaging_constant=0.01,
+        dtype=torch.quint8,
+        qscheme=torch.per_tensor_affine,
+        reduce_range=False,
+        quant_min=None,
+        quant_max=None,
+        eps=torch.finfo(torch.float32).eps,
+        is_dynamic=False,
+        **kwargs
+    ) -> None:
+        if not is_per_tensor(qscheme):
+            raise NotImplementedError(
+                "MovingAverageMinMaxObserver's qscheme only supports "
+                f"torch.per_tensor_symmetric and torch.per_tensor_affine, but got: {qscheme}"
+            )
+        self.averaging_constant = averaging_constant
+        if is_dynamic and self.averaging_constant != 1:
+            raise NotImplementedError(
+                "MovingAverageMinMaxObserver doesn't support dynamic quantization for "
+                f"averaging constant of {self.averaging_constant}"
+            )
+        super().__init__(
+            dtype=dtype,
+            qscheme=qscheme,
+            reduce_range=reduce_range,
+            quant_min=quant_min,
+            quant_max=quant_max,
+            eps=eps,
+            is_dynamic=is_dynamic,
+            **kwargs
+        )
+
+    def forward(self, x_orig):
+        if x_orig.numel() == 0:
+            return x_orig
+        x = x_orig.detach()  # avoid keeping autograd tape
+        x = x.to(self.min_val.dtype)
+        min_val = self.min_val
+        max_val = self.max_val
+        if min_val == float("inf") and max_val == float("-inf"):
+            min_val, max_val = torch.aminmax(x)
+        else:
+            min_val_cur, max_val_cur = torch.aminmax(x)
+            min_val = min_val + self.averaging_constant * (min_val_cur - min_val)
+            max_val = max_val + self.averaging_constant * (max_val_cur - max_val)
+        self.min_val.copy_(min_val)
+        self.max_val.copy_(max_val)
+        return x_orig
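+
+
+# A sketch of the moving-average update above (illustrative comment only; the
+# numbers are assumptions): with averaging_constant c = 0.01, a previous running
+# max of 4.0 and a new batch max of 6.0,
+#
+#     max_val = 4.0 + 0.01 * (6.0 - 4.0) = 4.02
+#
+# so a single outlier batch shifts the recorded range only gradually.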
+
+
+class PerChannelMinMaxObserver(UniformQuantizationObserverBase):
+    r"""Observer module for computing the quantization parameters based on the
+    running per channel min and max values.
+
+    This observer uses the tensor min/max statistics to compute the per channel
+    quantization parameters. The module records the running minimum and maximum
+    of incoming tensors, and uses this statistic to compute the quantization
+    parameters.
+
+    Args:
+        ch_axis: Channel axis
+        dtype: dtype argument to the `quantize` node needed to implement the
+               reference model spec.
+        qscheme: Quantization scheme to be used
+        reduce_range: Reduces the range of the quantized data type by 1 bit
+        quant_min: Minimum quantization value. If unspecified, it will follow the 8-bit setup.
+        quant_max: Maximum quantization value. If unspecified, it will follow the 8-bit setup.
+        eps: Epsilon value for float32. Defaults to `torch.finfo(torch.float32).eps`.
+
+    The quantization parameters are computed the same way as in
+    :class:`~torch.ao.quantization.observer.MinMaxObserver`, with the difference
+    that the running min/max values are stored per channel.
+    Scales and zero points are thus computed per channel as well.
+
+    .. note:: If the running minimum equals the running maximum, the scales
+              and zero_points are set to 1.0 and 0.
+    """
+    min_val: torch.Tensor
+    max_val: torch.Tensor
+
+    def __init__(
+        self,
+        ch_axis=0,
+        dtype=torch.quint8,
+        qscheme=torch.per_channel_affine,
+        reduce_range=False,
+        quant_min=None,
+        quant_max=None,
+        factory_kwargs=None,
+        eps=torch.finfo(torch.float32).eps,
+        is_dynamic=False,
+        **kwargs,
+    ) -> None:
+        if not is_per_channel(qscheme):
+            raise NotImplementedError(
+                "PerChannelMinMaxObserver's qscheme only supports torch.per_channel_symmetric, "
+                "torch.per_channel_affine and torch.per_channel_affine_float_qparams."
+            )
+        if is_dynamic:
+            raise NotImplementedError(
+                "PerChannelMinMaxObserver doesn't support dynamic quantization"
+            )
+        super().__init__(
+            dtype=dtype,
+            qscheme=qscheme,
+            reduce_range=reduce_range,
+            quant_min=quant_min,
+            quant_max=quant_max,
+            factory_kwargs=factory_kwargs,
+            eps=eps,
+            is_dynamic=is_dynamic,
+            **kwargs,
+        )
+        factory_kwargs = torch.nn.factory_kwargs(factory_kwargs)
+        self.ch_axis = ch_axis
+        self.register_buffer("min_val", torch.tensor([], **factory_kwargs))
+        self.register_buffer("max_val", torch.tensor([], **factory_kwargs))
+        if (
+            self.qscheme == torch.per_channel_symmetric
+            and self.reduce_range
+            and self.dtype == torch.quint8
+        ):
+            raise NotImplementedError(
+                "Cannot reduce range for symmetric quantization for quint8"
+            )
+
+    def forward(self, x_orig):
+        return self._forward(x_orig)
+
+    def _forward(self, x_orig):
+        if x_orig.numel() == 0:
+            return x_orig
+        x = x_orig.detach()  # avoid keeping autograd tape
+        min_val = self.min_val
+        max_val = self.max_val
+        x_dim = x.size()
+
+        new_axis_list = [i for i in range(len(x_dim))]  # noqa: C416
+        new_axis_list[self.ch_axis] = 0
+        new_axis_list[0] = self.ch_axis
+        y = x.permute(new_axis_list)
+        # Need to match dtype of min/max because the updates to buffers
+        # are done in place and types need to match for comparisons
+        y = y.to(self.min_val.dtype)
+        y = torch.flatten(y, start_dim=1)
+        if min_val.numel() == 0 or max_val.numel() == 0:
+            min_val, max_val = torch.aminmax(y, dim=1)
+        else:
+            min_val_cur, max_val_cur = torch.aminmax(y, dim=1)
+            min_val = torch.min(min_val_cur, min_val)
+            max_val = torch.max(max_val_cur, max_val)
+        self.min_val.resize_(min_val.shape)
+        self.max_val.resize_(max_val.shape)
+        self.min_val.copy_(min_val)
+        self.max_val.copy_(max_val)
+        return x_orig
+
+    @torch.jit.export
+    def calculate_qparams(self):
+        return self._calculate_qparams(self.min_val, self.max_val)
+
+    def extra_repr(self):
+        return f"min_val={self.min_val}, max_val={self.max_val}"
+
+    def _load_from_state_dict(
+        self,
+        state_dict: Dict[str, Any],
+        prefix: str,
+        local_metadata: Dict[str, torch.Tensor],
+        strict: bool,
+        missing_keys: List[str],
+        unexpected_keys: List[str],
+        error_msgs: List[str],
+    ):
+        version = local_metadata.get("version", None)
+        if version is not None and version < 3:
+            local_state = ["min_vals", "max_vals"]
+            expected_min_name = "min_vals"
+            expected_max_name = "max_vals"
+        else:
+            local_state = ["min_val", "max_val"]
+            expected_min_name = "min_val"
+            expected_max_name = "max_val"
+        for name in local_state:
+            key = prefix + name
+            if key in state_dict:
+                val = state_dict[key]
+                # Custom handling to allow loading min_val or max_val
+                # of size N into uninitialized buffers of size 0. The
+                # buffers are resized here, and the values are copied in
+                # the default state_dict loading code of the parent.
+                if name == expected_min_name:
+                    self.min_val.resize_(val.shape)
+                elif name == expected_max_name:
+                    self.max_val.resize_(val.shape)
+                else:
+                    warnings.warn(f"Observer load_from_state_dict got unexpected name {name}")
+                # For torchscript module we need to update the attributes here since we do not
+                # call the `_load_from_state_dict` function defined in module.py
+                if torch.jit.is_scripting():
+                    if name == expected_min_name:
+                        self.min_val.copy_(val)
+                    elif name == expected_max_name:
+                        self.max_val.copy_(val)
+                    else:
+                        warnings.warn(f"Observer load_from_state_dict got unexpected name {name}")
+            elif strict:
+                missing_keys.append(key)
+
+        if not torch.jit.is_scripting():
+            super()._load_from_state_dict(
+                state_dict,
+                prefix,
+                local_metadata,
+                False,
+                missing_keys,
+                unexpected_keys,
+                error_msgs,
+            )
+
+    def _load_from_state_dict_script(
+        self,
+        state_dict: Dict[str, Any],
+        prefix: str,
+        local_metadata: Dict[str, torch.Tensor],
+        strict: bool,
+        missing_keys: List[str],
+        unexpected_keys: List[str],
+        error_msgs: List[str],
+    ):
+
+        self._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            strict,
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )
+
+    @torch.jit.export
+    def reset_min_max_vals(self):
+        """Resets the min/max values."""
+        # This used to be torch.ones but that does not work because
+        # JIT compiler can optimize it via common subexpression elimination
+        # in which case both min_val and max_val point to the same tensor.
+        self.min_val = torch.rand(0, )
+        self.max_val = torch.rand(0, )
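+
+
+# Per-channel usage sketch (illustrative comment only; the weight shape below is
+# an assumed example): for a linear weight of shape (out_features, in_features)
+# observed along ch_axis=0,
+#
+#     obs = PerChannelMinMaxObserver(ch_axis=0, dtype=torch.qint8,
+#                                    qscheme=torch.per_channel_symmetric)
+#     obs(torch.randn(16, 32))
+#     scales, zero_points = obs.calculate_qparams()   # both have shape (16,)
+#
+# i.e. one (scale, zero_point) pair is produced per output channel.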
+
+
+class MovingAveragePerChannelMinMaxObserver(PerChannelMinMaxObserver):
+    r"""Observer module for computing the quantization parameters based on the
+    running per channel min and max values.
+
+    This observer uses the tensor min/max statistics to compute the per channel
+    quantization parameters. The module records the running minimum and maximum
+    of incoming tensors, and uses this statistic to compute the quantization
+    parameters.
+
+    Args:
+        averaging_constant: Averaging constant for min/max.
+        ch_axis: Channel axis
+        dtype: Quantized data type
+        qscheme: Quantization scheme to be used
+        reduce_range: Reduces the range of the quantized data type by 1 bit
+        quant_min: Minimum quantization value. If unspecified, it will follow the 8-bit setup.
+        quant_max: Maximum quantization value. If unspecified, it will follow the 8-bit setup.
+        eps: Epsilon value for float32. Defaults to `torch.finfo(torch.float32).eps`.
+
+    The quantization parameters are computed the same way as in
+    :class:`~torch.ao.quantization.observer.MovingAverageMinMaxObserver`, with the
+    difference that the running min/max values are stored per channel.
+    Scales and zero points are thus computed per channel as well.
+
+    .. note:: If the running minimum equals the running maximum, the scales
+              and zero_points are set to 1.0 and 0.
+    """
+
+    def __init__(
+        self,
+        averaging_constant=0.01,
+        ch_axis=0,
+        dtype=torch.quint8,
+        qscheme=torch.per_channel_affine,
+        reduce_range=False,
+        quant_min=None,
+        quant_max=None,
+        eps=torch.finfo(torch.float32).eps,
+        is_dynamic=False,
+        **kwargs
+    ) -> None:
+        if not is_per_channel(qscheme):
+            raise NotImplementedError(
+                "MovingAveragePerChannelMinMaxObserver's qscheme only supports torch.per_channel_symmetric, "
+                "torch.per_channel_affine and torch.per_channel_affine_float_qparams."
+            )
+        if is_dynamic:
+            raise NotImplementedError(
+                "MovingAveragePerChannelMinMaxObserver doesn't support dynamic quantization"
+            )
+        super().__init__(
+            ch_axis=ch_axis,
+            dtype=dtype,
+            qscheme=qscheme,
+            reduce_range=reduce_range,
+            quant_min=quant_min,
+            quant_max=quant_max,
+            eps=eps,
+            is_dynamic=is_dynamic,
+            **kwargs
+        )
+        self.averaging_constant = averaging_constant
+
+    def forward(self, x_orig):
+        if x_orig.numel() == 0:
+            return x_orig
+        x = x_orig.detach()  # avoid keeping autograd tape
+        x = x.to(self.min_val.dtype)
+        min_val = self.min_val
+        max_val = self.max_val
+        x_dim = x.size()
+
+        new_axis_list = [i for i in range(len(x_dim))]  # noqa: C416
+        new_axis_list[self.ch_axis] = 0
+        new_axis_list[0] = self.ch_axis
+        y = x.permute(new_axis_list)
+        y = torch.flatten(y, start_dim=1)
+        if min_val.numel() == 0 or max_val.numel() == 0:
+            min_val, max_val = torch.aminmax(y, dim=1)
+        else:
+            min_val_cur, max_val_cur = torch.aminmax(y, dim=1)
+            min_val = min_val + self.averaging_constant * (min_val_cur - min_val)
+            max_val = max_val + self.averaging_constant * (max_val_cur - max_val)
+        self.min_val.resize_(min_val.shape)
+        self.max_val.resize_(max_val.shape)
+        self.min_val.copy_(min_val)
+        self.max_val.copy_(max_val)
+        return x_orig
+
+
+class HistogramObserver(UniformQuantizationObserverBase):
+    r"""
+    The module records the running histogram of tensor values along with
+    min/max values. ``calculate_qparams`` will calculate scale and zero_point.
+
+    Args:
+        bins: Number of bins to use for the histogram
+        upsample_rate: Factor by which the histograms are upsampled; this is
+                       used to interpolate histograms with varying ranges across observations
+        dtype: dtype argument to the `quantize` node needed to implement the
+               reference model spec
+        qscheme: Quantization scheme to be used
+        reduce_range: Reduces the range of the quantized data type by 1 bit
+        eps: Epsilon value for float32. Defaults to `torch.finfo(torch.float32).eps`.
+
+    The scale and zero point are computed as follows:
+
+    1. Create the histogram of the incoming inputs.
+        The histogram is computed continuously, and the ranges per bin change
+        with every new tensor observed.
+    2. Search the distribution in the histogram for optimal min/max values.
+        The search for the min/max values ensures the minimization of the
+        quantization error with respect to the floating point model.
+    3. Compute the scale and zero point the same way as in the
+        :class:`~torch.ao.quantization.MinMaxObserver`
+    """
+    histogram: torch.Tensor
+    min_val: torch.Tensor
+    max_val: torch.Tensor
+
+    def __init__(
+        self,
+        bins: int = 2048,
+        upsample_rate: int = 128,
+        dtype: torch.dtype = torch.quint8,
+        qscheme=torch.per_tensor_affine,
+        reduce_range=False,
+        quant_min=None,
+        quant_max=None,
+        factory_kwargs=None,
+        eps=torch.finfo(torch.float32).eps,
+        is_dynamic=False,
+        **kwargs,
+    ) -> None:
+        if not is_per_tensor(qscheme):
+            raise NotImplementedError(
+                "HistogramObserver's qscheme only supports torch.per_tensor_symmetric "
+                "and torch.per_tensor_affine."
+            )
+        if is_dynamic:
+            raise NotImplementedError(
+                "HistogramObserver doesn't support dynamic quantization"
+            )
+        # bins: The number of bins used for histogram calculation.
+        super().__init__(
+            dtype=dtype,
+            qscheme=qscheme,
+            reduce_range=reduce_range,
+            quant_min=quant_min,
+            quant_max=quant_max,
+            factory_kwargs=factory_kwargs,
+            eps=eps,
+            is_dynamic=is_dynamic,
+            **kwargs
+        )
+        factory_kwargs = torch.nn.factory_kwargs(factory_kwargs)
+        self.bins = bins
+        self.register_buffer("histogram", torch.zeros(self.bins, **factory_kwargs))
+        self.register_buffer("min_val", torch.tensor(float("inf"), **factory_kwargs))
+        self.register_buffer("max_val", torch.tensor(float("-inf"), **factory_kwargs))
+        self.dst_nbins = 2 ** torch.iinfo(self.dtype).bits
+        self.upsample_rate = upsample_rate
+
+    def _get_norm(
+        self, delta_begin: torch.Tensor, delta_end: torch.Tensor, density: torch.Tensor
+    ) -> torch.Tensor:
+        r"""
+        Compute the norm of the values uniformly distributed between
+        delta_begin and delta_end.
+        Currently only L2 norm is supported.
+
+        norm = density * (integral_{begin, end} x^2)
+             = density * (end^3 - begin^3) / 3
+        """
+        norm = (
+            delta_end * delta_end * delta_end - delta_begin * delta_begin * delta_begin
+        ) / 3
+        return density * norm
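+
+    # Quick sanity check of the formula above (illustrative comment only): with
+    # density = 1, delta_begin = 0 and delta_end = 1, _get_norm returns
+    # (1**3 - 0**3) / 3 = 1/3, matching density * integral_{0}^{1} x^2 dx.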
+
+    def _compute_quantization_error(self, next_start_bin: int, next_end_bin: int):
+        r"""
+        Compute the quantization error if we use start_bin to end_bin as the
+        min and max to do the quantization.
+        """
+        bin_width = (self.max_val.item() - self.min_val.item()) / self.bins
+
+        dst_bin_width = bin_width * (next_end_bin - next_start_bin + 1) / self.dst_nbins
+        if dst_bin_width == 0.0:
+            return 0.0
+
+        src_bin = torch.arange(self.bins, device=self.histogram.device)
+        # distances from the beginning of first dst_bin to the beginning and
+        # end of src_bin
+        src_bin_begin = (src_bin - next_start_bin) * bin_width
+        src_bin_end = src_bin_begin + bin_width
+
+        # which dst_bins the beginning and end of src_bin belong to?
+        dst_bin_of_begin = torch.clamp(
+            torch.div(src_bin_begin, dst_bin_width, rounding_mode='floor'), 0, self.dst_nbins - 1
+        )
+        dst_bin_of_begin_center = (dst_bin_of_begin + 0.5) * dst_bin_width
+
+        dst_bin_of_end = torch.clamp(
+            torch.div(src_bin_end, dst_bin_width, rounding_mode='floor'), 0, self.dst_nbins - 1
+        )
+        density = self.histogram / bin_width
+
+        norm = torch.zeros(self.bins, device=self.histogram.device)
+
+        delta_begin = src_bin_begin - dst_bin_of_begin_center
+        delta_end = dst_bin_width / 2
+        norm += self._get_norm(delta_begin,
+                               torch.ones(self.bins, device=self.histogram.device) * delta_end,
+                               density)
+
+        norm += (dst_bin_of_end - dst_bin_of_begin - 1) * self._get_norm(
+            torch.tensor(-dst_bin_width / 2), torch.tensor(dst_bin_width / 2), density
+        )
+
+        dst_bin_of_end_center = dst_bin_of_end * dst_bin_width + dst_bin_width / 2
+
+        delta_begin = -dst_bin_width / 2
+        delta_end = src_bin_end - dst_bin_of_end_center
+        norm += self._get_norm(torch.tensor(delta_begin), delta_end, density)
+
+        return norm.sum().item()
+
+    def _non_linear_param_search(self) -> Tuple[torch.Tensor, torch.Tensor]:
+        r"""Non-linear parameter search.
+
+        An approximation for L2 error minimization for selecting min/max.
+        By selecting new min/max, we filter out outliers in input distribution.
+        This follows the implementation of NormMinimization::NonlinearQuantizationParamsSearch in
+        caffe2/quantization/server/norm_minimization.cc
+        """
+        assert self.histogram.size()[0] == self.bins, "bins mismatch"
+        bin_width = (self.max_val - self.min_val) / self.bins
+
+        # cumulative sum
+        total = torch.sum(self.histogram).item()
+        cSum = torch.cumsum(self.histogram, dim=0)
+
+        stepsize = 1e-5  # granularity
+        alpha = 0.0  # lower bound
+        beta = 1.0  # upper bound
+        start_bin = 0
+        end_bin = self.bins - 1
+        norm_min = float("inf")
+
+        while alpha < beta:
+            # Find the next step
+            next_alpha = alpha + stepsize
+            next_beta = beta - stepsize
+
+            # find the left and right bins between the quantile bounds
+            l = start_bin
+            r = end_bin
+            while l < end_bin and cSum[l] < next_alpha * total:
+                l = l + 1
+            while r > start_bin and cSum[r] > next_beta * total:
+                r = r - 1
+
+            # decide the next move
+            next_start_bin = start_bin
+            next_end_bin = end_bin
+            if (l - start_bin) > (end_bin - r):
+                # move the start bin
+                next_start_bin = l
+                alpha = next_alpha
+            else:
+                # move the end bin
+                next_end_bin = r
+                beta = next_beta
+
+            if next_start_bin == start_bin and next_end_bin == end_bin:
+                continue
+
+            # calculate the quantization error using next_start_bin and next_end_bin
+            norm = self._compute_quantization_error(next_start_bin, next_end_bin)
+
+            if norm > norm_min:
+                break
+            norm_min = norm
+            start_bin = next_start_bin
+            end_bin = next_end_bin
+
+        new_min = self.min_val + bin_width * start_bin
+        new_max = self.min_val + bin_width * (end_bin + 1)
+        return new_min, new_max
+
+    def _adjust_min_max(
+        self, combined_min: torch.Tensor, combined_max: torch.Tensor, upsample_rate: int
+    ) -> Tuple[torch.Tensor, torch.Tensor, int, int]:
+        # We ensure that:
+        # (combined_max - combined_min)/(downsample_rate*Nbins) = (max - min)/(upsample_rate*Nbins)
+        # This allows us to have a common grid of resolution s, where we can align
+        # the input histogram
+        # start_idx maps min_val to the histogram bin index.
+
+        # Computing the histogram bin width directly, i.e.
+        # hist_bin_width = (self.max_val - self.min_val) / (self.bins * upsample_rate),
+        # would be straightforward, but the division can underflow when the numerator is
+        # close to the smallest positive subnormal FP32 number, so we avoid it.
+        downsample_rate = int(
+            torch.ceil(
+                ((combined_max - combined_min) / (self.max_val - self.min_val)) * upsample_rate
+            ).item()
+        )
+        e = downsample_rate / upsample_rate * (self.max_val - self.min_val) - (combined_max - combined_min)
+        start_idx = int(
+            torch.round((self.min_val - combined_min) / (self.max_val - self.min_val) * self.bins * upsample_rate).item()
+        )
+        combined_max = combined_max + e
+        return combined_min, combined_max, downsample_rate, start_idx
+
+    def _combine_histograms(
+        self,
+        orig_hist: torch.Tensor,
+        new_hist: torch.Tensor,
+        upsample_rate: int,
+        downsample_rate: int,
+        start_idx: int,
+        Nbins: int,
+    ) -> torch.Tensor:
+        # First up-sample the histogram of the new data by a factor of upsample_rate.
+        # This creates an approximate probability density that is piecewise constant.
+        upsampled_histogram = new_hist.repeat_interleave(upsample_rate)
+        # Now insert the upsampled histogram into the output
+        # histogram, which is initialized with zeros.
+        # The offset at which the histogram is introduced is determined
+        # by the start index as the output histogram can cover a wider range
+        histogram_with_output_range = torch.zeros(
+            (Nbins * downsample_rate), device=orig_hist.device
+        )
+        histogram_with_output_range[
+            start_idx : Nbins * upsample_rate + start_idx
+        ] = upsampled_histogram
+        # Compute integral histogram, double precision is needed to ensure
+        # that there are no overflows
+        integral_histogram = torch.cumsum(
+            histogram_with_output_range, 0, dtype=torch.double
+        )[downsample_rate - 1 :: downsample_rate]
+        # Finally perform interpolation
+        shifted_integral_histogram = torch.zeros((Nbins), device=orig_hist.device)
+        shifted_integral_histogram[1:Nbins] = integral_histogram[0:-1]
+        interpolated_histogram = (
+            integral_histogram - shifted_integral_histogram
+        ) / upsample_rate
+        orig_hist = orig_hist + interpolated_histogram.to(torch.float)
+        return orig_hist
+
+    def forward(self, x_orig: torch.Tensor) -> torch.Tensor:
+        if x_orig.numel() == 0:
+            return x_orig
+        x = x_orig.detach()
+        x_min, x_max = torch.aminmax(x)
+        # want to ignore torch.inf since we don't actually
+        # want to make our quantization range infinite
+        # and in practice those values will be clamped
+        if x_min == -torch.inf or x_max == torch.inf:
+            warnings.warn("torch.inf detected in input tensor, ignoring input")
+            x = x[x.abs() != torch.inf]
+            if x.numel() == 0:
+                return x_orig
+            x_min, x_max = torch.aminmax(x)
+        min_val = self.min_val
+        max_val = self.max_val
+        same_values = min_val.item() == max_val.item()
+        is_uninitialized = min_val == float("inf") and max_val == float("-inf")
+        if is_uninitialized or same_values:
+            min_val, max_val = x_min, x_max
+            self.min_val.resize_(min_val.shape)
+            self.min_val.copy_(min_val)
+            self.max_val.resize_(max_val.shape)
+            self.max_val.copy_(max_val)
+            assert (
+                min_val.numel() == 1 and max_val.numel() == 1
+            ), "histogram min/max values must be scalar."
+            torch.histc(
+                x, self.bins, min=min_val, max=max_val, out=self.histogram  # type: ignore[arg-type]
+            )
+        else:
+            new_min, new_max = x_min, x_max
+            combined_min = torch.min(new_min, min_val)
+            combined_max = torch.max(new_max, max_val)
+            # combine the existing histogram and new histogram into 1 histogram
+            # We do this by first upsampling the histogram to a dense grid
+            # and then downsampling the histogram efficiently
+            (
+                combined_min,
+                combined_max,
+                downsample_rate,
+                start_idx,
+            ) = self._adjust_min_max(combined_min, combined_max, self.upsample_rate)
+            assert (
+                combined_min.numel() == 1 and combined_max.numel() == 1
+            ), "histogram min/max values must be scalar."
+
+            # TODO: For some reason, this is required for it to pass torchscript test
+            # combined_min and combined_max should already have requires_grad set to False
+            combined_min, combined_max = combined_min.detach(), combined_max.detach()
+
+            combined_histogram = torch.histc(
+                x, self.bins, min=combined_min, max=combined_max  # type: ignore[arg-type]
+            )
+            if combined_min == min_val and combined_max == max_val:
+                combined_histogram += self.histogram
+            else:
+                combined_histogram = self._combine_histograms(
+                    combined_histogram,
+                    self.histogram,
+                    self.upsample_rate,
+                    downsample_rate,
+                    start_idx,
+                    self.bins,
+                )
+
+            self.histogram.detach_().resize_(combined_histogram.shape)
+            self.histogram.copy_(combined_histogram)
+            self.min_val.detach_().resize_(combined_min.shape)
+            self.min_val.copy_(combined_min)
+            self.max_val.detach_().resize_(combined_max.shape)
+            self.max_val.copy_(combined_max)
+        return x_orig
+
+    @torch.jit.export
+    def calculate_qparams(self):
+        is_uninitialized = self.min_val == float("inf") and self.max_val == float(
+            "-inf"
+        )
+        if is_uninitialized:
+            warnings.warn(
+                "Must run observer before calling calculate_qparams. "
+                "Returning default scale and zero point."
+            )
+            return torch.tensor([1.0], device=self.min_val.device.type), torch.tensor([0], device=self.min_val.device.type)
+        assert self.bins == len(self.histogram), (
+            "The number of bins in histogram should be equal to the number of bins "
+            "supplied while making this observer"
+        )
+
+        new_min, new_max = self._non_linear_param_search()
+
+        return self._calculate_qparams(new_min, new_max)
+
+    def _save_to_state_dict(self, destination, prefix, keep_vars):
+        super()._save_to_state_dict(destination, prefix, keep_vars)
+        destination[prefix + "min_val"] = self.min_val
+        destination[prefix + "max_val"] = self.max_val
+
+    def _load_from_state_dict(
+        self,
+        state_dict,
+        prefix,
+        local_metadata,
+        strict,
+        missing_keys,
+        unexpected_keys,
+        error_msgs,
+    ):
+        version = local_metadata.get("version", None)
+
+        if version is None or version < 3:
+            # if min_val and max_val are not initialized, update their shape
+            # to account for the differences between v2 and v3
+            min_val_name, max_val_name = prefix + "min_val", prefix + "max_val"
+            if min_val_name in state_dict:
+                if state_dict[min_val_name].shape == torch.Size([0]):
+                    state_dict[min_val_name] = torch.tensor(float("inf"))
+            if max_val_name in state_dict:
+                if state_dict[max_val_name].shape == torch.Size([0]):
+                    state_dict[max_val_name] = torch.tensor(float("-inf"))
+
+        local_state = ["min_val", "max_val"]
+        for name in local_state:
+            key = prefix + name
+            if key in state_dict:
+                val = state_dict[key]
+                setattr(self, name, val)
+            elif strict:
+                missing_keys.append(key)
+        super()._load_from_state_dict(
+            state_dict,
+            prefix,
+            local_metadata,
+            strict,
+            missing_keys,
+            unexpected_keys,
+            error_msgs,
+        )
+
+    def extra_repr(self):
+        return f"min_val={self.min_val}, max_val={self.max_val}"
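+
+
+# HistogramObserver calibration sketch (illustrative comment only; the number of
+# batches and the tensor shape are arbitrary assumptions):
+#
+#     obs = HistogramObserver(bins=2048, dtype=torch.quint8)
+#     for _ in range(10):
+#         obs(torch.randn(8, 16))                  # accumulates the running histogram
+#     scale, zero_point = obs.calculate_qparams()  # runs the non-linear min/max search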
+
+
+class FixedQParamsObserver(ObserverBase):
+    r"""
+    Observer that simulates quantize and dequantize with fixed
+    quantization parameters in training time. Only per tensor
+    quantization is supported.
+
+    Args:
+        `scale` (float): fixed scale for the observer
+        `zero_point` (int): fixed zero point for the observer
+        `dtype`, `qscheme`, `quant_min`, `quant_max`
+    """
+
+    scale: torch.Tensor
+    zero_point: torch.Tensor
+
+    def __init__(
+        self,
+        scale,
+        zero_point,
+        dtype=torch.quint8,
+        qscheme=torch.per_tensor_affine,
+        quant_min=0,
+        quant_max=255,
+        is_dynamic=False,
+        **kwargs,
+    ):
+        if is_dynamic:
+            raise NotImplementedError(
+                "FixedQParamsObserver doesn't support dynamic quantization"
+            )
+        super().__init__(dtype=dtype, is_dynamic=is_dynamic, **kwargs)
+        self.quant_min = quant_min
+        self.quant_max = quant_max
+        self.register_buffer('scale', torch.tensor([scale], dtype=torch.float))
+        self.register_buffer('zero_point', torch.tensor([zero_point], dtype=torch.int))
+        self.dtype = dtype
+        self.qscheme = qscheme
+
+    def forward(self, X):
+        return X
+
+    @torch.jit.export
+    def calculate_qparams(self):
+        return self.scale, self.zero_point
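+
+
+# FixedQParamsObserver sketch (illustrative comment only; the qparams below are an
+# assumed example for an op whose output is known to lie in [0, 1)):
+#
+#     obs = FixedQParamsObserver(scale=1.0 / 256.0, zero_point=0,
+#                                dtype=torch.quint8, quant_min=0, quant_max=255)
+#     obs(torch.randn(4))                          # forward passes the input through
+#     scale, zero_point = obs.calculate_qparams()  # always the fixed values above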
+
+
+class PlaceholderObserver(ObserverBase):
+    r"""
+    Observer that doesn't do anything and just passes its configuration to the
+    quantized module's ``.from_float()``.
+
+    Can be used for quantization to float16 which doesn't require determining
+    ranges.
+
+    Args:
+        dtype: dtype argument to the `quantize` node needed to implement the
+               reference model spec.
+        quant_min: minimum value in quantized domain (TODO: align behavior with other observers)
+        quant_max: maximum value in quantized domain
+        custom_op_name: (temporary) specify this observer for an operator that doesn't require any observation
+                        (Can be used in Graph Mode Passes for special case ops).
+        compute_dtype (deprecated): if set, marks the future quantize function to use
+                       dynamic quantization instead of static quantization.
+                       This field is deprecated, use `is_dynamic=True` instead.
+        is_dynamic: if True, the `quantize` function in the reference model
+                    representation taking stats from this observer instance will
+                    use dynamic quantization.
+    """
+
+    def __init__(
+        self, dtype=torch.float32, custom_op_name="", compute_dtype=None,
+        quant_min=None, quant_max=None, qscheme=None, eps=None,
+        is_dynamic=False,
+    ) -> None:
+        super().__init__(dtype=dtype, is_dynamic=is_dynamic)
+        if qscheme is None:
+            qscheme = torch.per_tensor_affine
+        if eps is None:
+            eps = torch.finfo(torch.float32).eps
+
+        # dtype of input of the target operator, e.g. for dynamic quantization
+        # ops, the dtype will be float32
+        self.dtype = dtype
+        self.qscheme = qscheme
+        self.quant_min = quant_min
+        self.quant_max = quant_max
+        self.eps = eps
+        self.custom_op = custom_op_name
+        # used for configuration of computation type for dynamic quantization
+        if compute_dtype:
+            is_dynamic = True
+            warnings.warn(
+                "Please use `is_dynamic` instead of `compute_dtype`. \
+                    `compute_dtype` will be deprecated in a future release \
+                    of PyTorch."
+            )
+
+    def forward(self, x):
+        return x
+
+    @torch.jit.export
+    def extra_repr(self):
+        return f"dtype={self.dtype}, is_dynamic={self.is_dynamic}"
+
+    @torch.jit.export
+    def calculate_qparams(self):
+        raise Exception(
+            "calculate_qparams should not be called for PlaceholderObserver"
+        )
+
+
+class RecordingObserver(ObserverBase):
+    r"""
+    The module is mainly for debugging; it records the tensor values observed at runtime.
+
+    Args:
+        dtype: Quantized data type
+    """
+    __annotations__ = {"tensor_val": List[Optional[torch.Tensor]]}
+
+    def __init__(self, dtype=torch.quint8):
+        super().__init__(dtype=dtype, is_dynamic=False)  # type: ignore[call-arg]
+        self.tensor_val = []
+
+    def forward(self, x):
+        self.tensor_val.append(x.clone())
+        return x
+
+    @torch.jit.export
+    def calculate_qparams(self):
+        raise Exception("calculate_qparams should not be called for RecordingObserver")
+
+    @torch.jit.export
+    def get_tensor_value(self):
+        return self.tensor_val
+
+
+class NoopObserver(ObserverBase):
+    r"""
+    Observer that doesn't do anything and just passes its configuration to the
+    quantized module's ``.from_float()``.
+
+    Primarily used for quantization to float16 which doesn't require determining
+    ranges.
+
+    Args:
+        dtype: Quantized data type
+        custom_op_name: (temporary) specify this observer for an operator that doesn't require any observation
+                        (Can be used in Graph Mode Passes for special case ops).
+    """
+
+    def __init__(self, dtype=torch.float16, custom_op_name="") -> None:
+        super().__init__(dtype=dtype, is_dynamic=False)
+        self.dtype = dtype
+        self.custom_op = custom_op_name
+
+    def forward(self, x):
+        return x
+
+    @torch.jit.export
+    def calculate_qparams(self):
+        raise Exception("calculate_qparams should not be called for NoopObserver")
+
+class ReuseInputObserver(ObserverBase):
+    r""" This observer is used when we want to reuse the observer from the operator
+    that produces the input Tensor, typically used for operators like reshape, e.g.
+    ```
+    x0 = ...
+    x1 = x0.reshape()
+    ```
+    if we configure x0 to be observed by some observer, let's say MinMaxObserver,
+    and reshape is configured with ReuseInputObserver, we'll reuse the observer instance
+    of x0 for x1 (the output of reshape). If x0 is not observed, we won't observe x1 either.
+
+    Note: this is only enabled in FX Graph Mode Quantization
+    """
+    def __init__(self):
+        super().__init__(torch.quint8, is_dynamic=False)
+
+    def forward(self, x):
+        return x
+
+    @torch.jit.export
+    def calculate_qparams(self):
+        raise Exception("calculate_qparams should not be called for ReuseInputObserver")
+
+def _is_observer_script_module(mod, obs_type_name):
+    """Returns true if given mod is an instance of Observer script module."""
+    if isinstance(mod, torch.jit.RecursiveScriptModule):
+        # qualified name looks like '__torch__.torch.ao.quantization.observer.___torch_mangle_2.MinMaxObserver'
+        suffix = mod._c.qualified_name.split(".", 1)[1]
+        name = re.sub(r"\.___torch_mangle_\d+", "", suffix)
+        return obs_type_name in name
+    return False
+
+
+def _is_activation_post_process(module):
+    return (
+        isinstance(module, (torch.ao.quantization.ObserverBase,
+                            torch.ao.quantization.FakeQuantizeBase)) or _is_observer_script_module(module, "quantization.observer")
+    )
+
+
+def _is_per_channel_script_obs_instance(module):
+    if isinstance(module, torch.jit.RecursiveScriptModule):
+        return _is_observer_script_module(
+            module, "quantization.observer.PerChannelMinMaxObserver"
+        ) or _is_observer_script_module(
+            module, "quantization.observer.MovingAveragePerChannelMinMaxObserver"
+        )
+    return False
+
+
+def get_observer_state_dict(mod):
+    r"""
+    Returns the state dict corresponding to the observer stats.
+    Traverses the model state_dict and extracts the stats.
+    """
+    od = OrderedDict()
+    if isinstance(mod, torch.jit.RecursiveScriptModule):
+        for k, v in mod.state_dict().items():
+            if "observer" in k:
+                od[k] = v
+    else:
+        # path for GraphModule and nn.Module (eager mode)
+        for k, v in mod.state_dict().items():
+            if "activation_post_process" in k:
+                od[k] = v
+    od._metadata = mod.state_dict()._metadata  # type: ignore[attr-defined]
+    return od
+
+
+def load_observer_state_dict(mod, obs_dict):
+    r"""
+    Given an input model and a state_dict containing model observer stats,
+    load the stats back into the model. The observer state_dict can be saved
+    using torch.ao.quantization.get_observer_state_dict.
+    """
+    missing_keys: List[str] = []
+    unexpected_keys: List[str] = []
+    for name, module in mod.named_modules():
+        prefix = name + "."
+        if _is_activation_post_process(module):
+            if _is_per_channel_script_obs_instance(module):
+                # For per-channel observers we need to call a custom load_from_state_dict to resize the tensor.
+                # However this is not called when the module is scripted and we end up calling the default one in module.py
+                module._load_from_state_dict_script(
+                    obs_dict, prefix, {}, True, missing_keys, unexpected_keys, []
+                )
+            else:
+                module._load_from_state_dict(
+                    obs_dict, prefix, {}, False, missing_keys, unexpected_keys, []
+                )
+    for k in missing_keys:
+        if "observer" in k or "activation_post_process" in k:
+            raise Exception(f"Missing keys for observer {k} in state_dict")
+    for k in unexpected_keys:
+        if "observer" in k or "activation_post_process" in k:
+            raise Exception(f"Unexpected keys for observer {k} in state_dict")
+
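+
+# Hedged editorial sketch (not part of the upstream file): round-tripping observer
+# statistics between two identically prepared models; `calibrated_model` and
+# `fresh_model` are assumed to be models that already contain observers
+# (e.g. returned by an eager-mode prepare call), with the first one calibrated.
+def _example_observer_stats_roundtrip(calibrated_model, fresh_model):
+    obs_dict = get_observer_state_dict(calibrated_model)
+    load_observer_state_dict(fresh_model, obs_dict)
+    return fresh_model
+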
+
+# Restrict activations to be in the range [0, 127]
+default_observer = MinMaxObserver.with_args(quant_min=0, quant_max=127)
+"""
+Default observer for static quantization, usually used for debugging.
+"""
+
+default_placeholder_observer = PlaceholderObserver
+"""
+Default placeholder observer, usually used for quantization to torch.float16.
+"""
+
+default_debug_observer = RecordingObserver
+"""
+Default debug-only observer.
+"""
+
+default_weight_observer = MinMaxObserver.with_args(
+    dtype=torch.qint8, qscheme=torch.per_tensor_symmetric
+)
+"""
+Default weight observer.
+"""
+
+weight_observer_range_neg_127_to_127 = MinMaxObserver.with_args(
+    dtype=torch.qint8, qscheme=torch.per_tensor_symmetric,
+    quant_min=-127, quant_max=127, eps=2 ** -12)
+"""
+Symmetric weight observer with the 8-bit values restricted to [-127, +127], excluding -128.
+"""
+
+default_histogram_observer = HistogramObserver.with_args(quant_min=0, quant_max=127)
+"""
+Default histogram observer, usually used for PTQ.
+"""
+
+default_per_channel_weight_observer = PerChannelMinMaxObserver.with_args(
+    dtype=torch.qint8, qscheme=torch.per_channel_symmetric
+)
+"""
+Default per-channel weight observer, usually used on backends where per-channel
+weight quantization is supported, such as `fbgemm`.
+"""
+
+per_channel_weight_observer_range_neg_127_to_127 = PerChannelMinMaxObserver.with_args(
+    dtype=torch.qint8, qscheme=torch.per_channel_symmetric,
+    quant_min=-127, quant_max=127, eps=2 ** -12)
+"""
+Per-channel, symmetric weight observer with the 8-bit values restricted to [-127, +127], excluding -128.
+"""
+
+default_dynamic_quant_observer = PlaceholderObserver.with_args(
+    dtype=torch.quint8, quant_min=0, quant_max=255, is_dynamic=True,
+)
+"""
+Default observer for dynamic quantization.
+"""
+
+default_float_qparams_observer = PerChannelMinMaxObserver.with_args(
+    dtype=torch.quint8, qscheme=torch.per_channel_affine_float_qparams, ch_axis=0
+)
+"""
+Default observer for a floating point zero-point.
+"""
+
+default_float_qparams_observer_4bit = PerChannelMinMaxObserver.with_args(
+    dtype=torch.quint4x2, qscheme=torch.per_channel_affine_float_qparams, ch_axis=0
+)
+"""
+Default observer for a floating point zero-point and 4 bit activations.
+"""
+
+# TODO(future PR): remove these defaults and enforce activation functions
+# to explicitly specify their output range
+default_fixed_qparams_range_neg1to1_observer = FixedQParamsObserver.with_args(
+    scale=2.0 / 256.0, zero_point=128, dtype=torch.quint8, quant_min=0, quant_max=255)
+default_fixed_qparams_range_0to1_observer = FixedQParamsObserver.with_args(
+    scale=1.0 / 256.0, zero_point=0, dtype=torch.quint8, quant_min=0, quant_max=255)
+# TODO: the following 2 variables are kept for backwards compatibility; remove after a few releases
+default_symmetric_fixed_qparams_observer = default_fixed_qparams_range_neg1to1_observer
+default_affine_fixed_qparams_observer = default_fixed_qparams_range_0to1_observer
+
+"""
+Default observers for fixed qparams operations.
+"""
+
+default_reuse_input_observer = ReuseInputObserver
+"""
+Default observer for operators like reshape that reuse the observer of the input to
+the operator.
+"""
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/__init__.py b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0d4c3a5c76441472b7241c9545d3eeeeb49269f3
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/__pycache__/duplicate_dq_pass.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/__pycache__/duplicate_dq_pass.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..32be676ab67b098d8236792f350199d87e06e3a8
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/__pycache__/duplicate_dq_pass.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/__pycache__/export_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/__pycache__/export_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0e88759f060a10af9dde483824aac3064a07abbc
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/__pycache__/export_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/__pycache__/generate_numeric_debug_handle.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/__pycache__/generate_numeric_debug_handle.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1becfa40540a4d5095cb797d710130bedabc0985
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/__pycache__/generate_numeric_debug_handle.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/__pycache__/graph_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/__pycache__/graph_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d32dc3b1523ddc9824e9b061e6b117d659f30cae
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/__pycache__/graph_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/__pycache__/port_metadata_pass.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/__pycache__/port_metadata_pass.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..48949f110c41e99577b5c2b0c5fbdc24101640e5
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/__pycache__/port_metadata_pass.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/__pycache__/prepare.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/__pycache__/prepare.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..400fbea40d61442e714fc2789fa85abc6a5d8e00
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/__pycache__/prepare.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/__pycache__/qat_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/__pycache__/qat_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b940ca39992969e4e4b1e897a9f095db9fbc207b
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/__pycache__/qat_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..da8152f5c770ee7cf2cd22f610b3495bb73bc73a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/__pycache__/utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/duplicate_dq_pass.py b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/duplicate_dq_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..8882292db139d23d5cb4f752663e8fc6d522d07e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/duplicate_dq_pass.py
@@ -0,0 +1,83 @@
+import logging
+import operator
+
+import torch
+
+from torch.ao.quantization.pt2e.utils import (
+    _filter_sym_size_users,
+    _is_valid_annotation,
+)
+
+from torch.fx.node import map_arg
+from torch.fx.passes.infra.pass_base import PassBase, PassResult
+
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.WARNING)
+
+__all__ = ["DuplicateDQPass"]
+
+_QUANTIZE_OPS = [
+    torch.ops.quantized_decomposed.quantize_per_tensor.default,
+    torch.ops.quantized_decomposed.quantize_per_tensor.tensor,
+    torch.ops.quantized_decomposed.quantize_per_channel.default,
+]
+
+_DEQUANTIZE_OPS = [
+    torch.ops.quantized_decomposed.dequantize_per_tensor.default,
+    torch.ops.quantized_decomposed.dequantize_per_tensor.tensor,
+    torch.ops.quantized_decomposed.dequantize_per_channel.default,
+]
+
+
+def _maybe_duplicate_dq(
+    gm: torch.fx.GraphModule, dq_node: torch.fx.Node, user: torch.fx.Node
+):
+    annotation = user.meta.get("quantization_annotation", None)
+    if not _is_valid_annotation(annotation):
+        return
+    with gm.graph.inserting_after(dq_node):
+        new_node = gm.graph.node_copy(dq_node)
+
+        def maybe_replace_node(n: torch.fx.Node) -> torch.fx.Node:
+            if n == dq_node:
+                return new_node
+            else:
+                return n
+
+        new_args = map_arg(user.args, maybe_replace_node)
+        new_kwargs = map_arg(user.kwargs, maybe_replace_node)
+        user.args = new_args
+        user.kwargs = new_kwargs
+
+
+class DuplicateDQPass(PassBase):
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        for node in graph_module.graph.nodes:
+            if node.op == "call_function" and node.target in _DEQUANTIZE_OPS:
+                dq_users = _filter_sym_size_users(node)
+                if len(dq_users) <= 1:
+                    continue
+                # Do not duplicate dq for dynamic quantization
+                # Pattern: choose_qparam - getitem - q - dq
+                q_node = node.args[0]
+                if q_node.op == "call_function" and q_node.target in _QUANTIZE_OPS:
+                    getitem_node = q_node.args[1]
+                    if (
+                        isinstance(getitem_node, torch.fx.node.Node)
+                        and getitem_node.op == "call_function"
+                        and getitem_node.target == operator.getitem
+                    ):
+                        choose_qparam_node = getitem_node.args[0]
+                        if (
+                            isinstance(choose_qparam_node, torch.fx.node.Node)
+                            and choose_qparam_node.op == "call_function"
+                            and choose_qparam_node.target
+                            == torch.ops.quantized_decomposed.choose_qparams.tensor
+                        ):
+                            continue
+                for user in dq_users:
+                    _maybe_duplicate_dq(graph_module, node, user)
+        graph_module.graph.eliminate_dead_code()
+        graph_module.recompile()
+        return PassResult(graph_module, True)
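+
+
+# Hedged editorial sketch (not part of the upstream file): the pass is normally run
+# by the PT2E convert flow, but it can also be applied directly via its `call`
+# method, which returns a PassResult wrapping the (possibly modified) GraphModule.
+def _example_run_duplicate_dq_pass(quantized_gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
+    result = DuplicateDQPass().call(quantized_gm)
+    return result.graph_module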
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/export_utils.py b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/export_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebe884508bcb9137b9772e6c11f120cd9549cd30
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/export_utils.py
@@ -0,0 +1,211 @@
+import types
+
+import torch
+import torch.nn.functional as F
+
+
+__all__ = [
+    "model_is_exported",
+    "_WrapperModule",
+]
+
+
+class _WrapperModule(torch.nn.Module):
+    """Class to wrap a callable in an :class:`torch.nn.Module`. Use this if you
+    are trying to export a callable.
+    """
+
+    def __init__(self, fn):
+        super().__init__()
+        self.fn = fn
+
+    def forward(self, *args, **kwargs):
+        """Simple forward that just calls the ``fn`` provided to :meth:`WrapperModule.__init__`."""
+        return self.fn(*args, **kwargs)
+
+
+def model_is_exported(m: torch.nn.Module) -> bool:
+    """
+    Return True if the `torch.nn.Module` was exported, False otherwise
+    (e.g. if the model was FX symbolically traced or not traced at all).
+    """
+    return isinstance(m, torch.fx.GraphModule) and any(
+        "val" in n.meta for n in m.graph.nodes
+    )
+
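+
+# Hedged editorial sketch (not part of the upstream file): a plain (non-FX) eager
+# nn.Module is never considered exported, while a GraphModule coming out of the
+# PT2 export flow (whose nodes carry "val" metadata, as checked above) is.
+def _example_model_is_exported(eager_model: torch.nn.Module, exported_gm: torch.fx.GraphModule) -> None:
+    assert not model_is_exported(eager_model)
+    assert model_is_exported(exported_gm)
+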
+
+def _replace_dropout(m: torch.fx.GraphModule, train_to_eval: bool):
+    """
+    Switch dropout patterns in the model between train and eval modes.
+
+    Dropout has different behavior in train vs eval mode. For exported models,
+    however, calling `model.train()` or `model.eval()` does not automatically switch
+    the dropout behavior between the two modes, so here we need to rewrite the aten
+    dropout patterns manually to achieve the same effect.
+
+    See https://github.com/pytorch/pytorch/issues/103681.
+    """
+    # Avoid circular dependencies
+    from .utils import get_aten_graph_module
+
+    # Needed to ensure subgraph matches are self-contained
+    m.graph.eliminate_dead_code()
+    m.recompile()
+
+    for inplace in [False, True]:
+
+        def dropout_train(x):
+            return F.dropout(x, p=0.5, training=True, inplace=inplace)
+
+        def dropout_eval(x):
+            return F.dropout(x, p=0.5, training=False, inplace=inplace)
+
+        example_inputs = (torch.randn(1),)
+        if train_to_eval:
+            match_pattern = get_aten_graph_module(
+                _WrapperModule(dropout_train), example_inputs
+            )
+            replacement_pattern = get_aten_graph_module(
+                _WrapperModule(dropout_eval), example_inputs
+            )
+        else:
+            match_pattern = get_aten_graph_module(
+                _WrapperModule(dropout_eval), example_inputs
+            )
+            replacement_pattern = get_aten_graph_module(
+                _WrapperModule(dropout_train), example_inputs
+            )
+
+        from torch.fx.subgraph_rewriter import replace_pattern_with_filters
+
+        replace_pattern_with_filters(
+            m,
+            match_pattern,
+            replacement_pattern,
+            match_filters=[],
+            ignore_literals=True,
+        )
+        m.recompile()
+
+
+def _replace_batchnorm(m: torch.fx.GraphModule, train_to_eval: bool):
+    """
+    Switch batchnorm patterns in the model between train and eval modes.
+
+    Batchnorm has different behavior in train vs eval mode. For exported models,
+    however, calling `model.train()` or `model.eval()` does not automatically switch
+    the batchnorm behavior between the two modes, so here we need to rewrite the aten
+    batchnorm patterns manually to achieve the same effect.
+    """
+    # TODO(Leslie): This function still fails to support custom momentum and eps value.
+    # Enable this support in future updates.
+
+    # Avoid circular dependencies
+    from .utils import get_aten_graph_module
+
+    # Needed to ensure subgraph matches are self-contained
+    m.graph.eliminate_dead_code()
+    m.recompile()
+
+    def bn_train(
+        x: torch.Tensor,
+        bn_weight: torch.Tensor,
+        bn_bias: torch.Tensor,
+        bn_running_mean: torch.Tensor,
+        bn_running_var: torch.Tensor,
+    ):
+        return F.batch_norm(
+            x, bn_running_mean, bn_running_var, bn_weight, bn_bias, training=True
+        )
+
+    def bn_eval(
+        x: torch.Tensor,
+        bn_weight: torch.Tensor,
+        bn_bias: torch.Tensor,
+        bn_running_mean: torch.Tensor,
+        bn_running_var: torch.Tensor,
+    ):
+        return F.batch_norm(
+            x, bn_running_mean, bn_running_var, bn_weight, bn_bias, training=False
+        )
+
+    example_inputs = (
+        torch.randn(1, 1, 3, 3),  # x
+        torch.randn(1),  # bn_weight
+        torch.randn(1),  # bn_bias
+        torch.randn(1),  # bn_running_mean
+        torch.randn(1),  # bn_running_var
+    )
+    if train_to_eval:
+        match_pattern = get_aten_graph_module(_WrapperModule(bn_train), example_inputs)
+        replacement_pattern = get_aten_graph_module(
+            _WrapperModule(bn_eval), example_inputs
+        )
+    else:
+        match_pattern = get_aten_graph_module(_WrapperModule(bn_eval), example_inputs)
+        replacement_pattern = get_aten_graph_module(
+            _WrapperModule(bn_train), example_inputs
+        )
+
+    from torch.fx.subgraph_rewriter import replace_pattern_with_filters
+
+    replace_pattern_with_filters(
+        m,
+        match_pattern,
+        replacement_pattern,
+        match_filters=[],
+        ignore_literals=True,
+    )
+    m.recompile()
+
+
+# TODO: expose these under this namespace?
+def _move_exported_model_to_eval(model: torch.fx.GraphModule):
+    """
+    Move an exported GraphModule to eval mode.
+
+    This is equivalent to model.eval() but only for certain special ops like dropout, batchnorm.
+    QAT users should call this before performing inference on the model.
+    """
+    _replace_dropout(model, train_to_eval=True)
+    _replace_batchnorm(model, train_to_eval=True)
+    return model
+
+
+def _move_exported_model_to_train(model: torch.fx.GraphModule):
+    """
+    Move an exported GraphModule to train mode.
+
+    This is equivalent to model.train() but only for certain special ops like dropout, batchnorm.
+    QAT users should call this before performing training on the model.
+    """
+    _replace_dropout(model, train_to_eval=False)
+    _replace_batchnorm(model, train_to_eval=False)
+    return model
+
+
+def _allow_exported_model_train_eval(model: torch.fx.GraphModule):
+    """
+    Allow users to call `model.train()` and `model.eval()` on an exported model,
+    but with the effect of changing behavior between the two modes limited to special
+    ops only, which are currently dropout and batchnorm.
+
+    Note: This does not achieve the same effect as what `model.train()` and `model.eval()`
+    do in eager models, but only provides an approximation. In particular, user code
+    branching on the `training` flag will generally not function correctly because the branch
+    is already specialized at export time. Additionally, other ops beyond dropout and batchnorm
+    that have different train/eval behavior will also not be converted properly.
+    """
+
+    def _train(self, mode: bool = True):
+        if mode:
+            _move_exported_model_to_train(self)
+        else:
+            _move_exported_model_to_eval(self)
+
+    def _eval(self):
+        _move_exported_model_to_eval(self)
+
+    model.train = types.MethodType(_train, model)  # type: ignore[method-assign]
+    model.eval = types.MethodType(_eval, model)  # type: ignore[method-assign]
+    return model
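+
+
+# Hedged editorial sketch (not part of the upstream file): after patching, calling
+# train()/eval() on the exported module only toggles the rewritten dropout and
+# batchnorm patterns, as described in the docstring above.
+def _example_allow_train_eval(exported_gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
+    m = _allow_exported_model_train_eval(exported_gm)
+    m.eval()   # switches dropout/batchnorm aten patterns to their eval form
+    m.train()  # switches them back to their train form
+    return m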
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/generate_numeric_debug_handle.py b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/generate_numeric_debug_handle.py
new file mode 100644
index 0000000000000000000000000000000000000000..3dcb555b4756b72f99233f4ebf27dc517282ecc0
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/generate_numeric_debug_handle.py
@@ -0,0 +1,17 @@
+from torch.fx import GraphModule, Node
+
+__all__ = ["generate_numeric_debug_handle"]
+
+
+def generate_numeric_debug_handle(graph_module: GraphModule) -> None:
+    unique_id = 0
+    for node in graph_module.graph.nodes:
+        if node.op == "call_function":
+            node.meta["numeric_debug_handle"] = {}
+            for arg in node.args:
+                if isinstance(arg, Node):
+                    node.meta["numeric_debug_handle"][arg] = unique_id
+                    unique_id += 1
+
+            node.meta["numeric_debug_handle"]["output"] = unique_id
+            unique_id += 1
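+
+
+# Hedged editorial sketch (not part of the upstream file): attach debug handles to
+# a GraphModule (e.g. one produced by torch.fx.symbolic_trace) and collect them.
+def _example_collect_debug_handles(graph_module: GraphModule):
+    generate_numeric_debug_handle(graph_module)
+    return [
+        node.meta["numeric_debug_handle"]
+        for node in graph_module.graph.nodes
+        if node.op == "call_function"
+    ]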
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/graph_utils.py b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/graph_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff0644a3785bfccdb58527d58b90ad0e0b66aa48
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/graph_utils.py
@@ -0,0 +1,109 @@
+import itertools
+from typing import Any, List, OrderedDict, Set, Optional, Callable
+import operator
+from torch.fx import Node
+
+import torch
+
+from torch.fx.passes.utils.source_matcher_utils import (
+    check_subgraphs_connected,
+    get_source_partitions,
+    SourcePartition,
+)
+
+__all__ = [
+    "find_sequential_partitions",
+    "get_equivalent_types",
+    "update_equivalent_types_dict",
+]
+
+_EQUIVALENT_TYPES: List[Set] = [
+    {torch.nn.Conv1d, torch.nn.functional.conv1d},
+    {torch.nn.Conv2d, torch.nn.functional.conv2d},
+    {torch.nn.AdaptiveAvgPool2d, torch.nn.functional.adaptive_avg_pool2d},
+    {torch.nn.ReLU, torch.nn.functional.relu, torch.nn.functional.relu_},
+    {torch.nn.BatchNorm2d, torch.nn.functional.batch_norm},
+    {torch.nn.Hardtanh, torch.nn.functional.hardtanh, torch.nn.functional.hardtanh_},
+    {torch.add, operator.add, operator.iadd, "add", "add_"},
+    {torch.mul, operator.mul, operator.imul, "mul", "mul_"},
+]
+
+
+def _create_equivalent_types_dict():
+    _DICT = {}
+    for values in _EQUIVALENT_TYPES:
+        for v in values:
+            _DICT[v] = list(values)
+    return _DICT
+
+
+_EQUIVALENT_TYPES_DICT = _create_equivalent_types_dict()
+
+def get_equivalent_types() -> List[Set]:
+    return _EQUIVALENT_TYPES
+
+def update_equivalent_types_dict(customized_equivalent_types=None):
+    """Help function for user who wants to customize the _EQUIVALENT_TYPES and _EQUIVALENT_TYPES_DICT.
+    When customized_equivalent_types passes in,
+    re-generate _EQUIVALENT_TYPES and _EQUIVALENT_TYPES_DICT.
+    """
+    if customized_equivalent_types is None:
+        raise ValueError("customized_equivalent_types should not be None")
+    global _EQUIVALENT_TYPES
+    global _EQUIVALENT_TYPES_DICT
+    _EQUIVALENT_TYPES = customized_equivalent_types
+    _EQUIVALENT_TYPES_DICT = _create_equivalent_types_dict()
+
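+# Hedged editorial sketch (not part of the upstream file): extend the equivalence
+# table with a custom entry, here treating the module and functional forms of SiLU
+# as interchangeable when matching partitions.
+def _example_add_silu_equivalence():
+    new_types = get_equivalent_types() + [{torch.nn.SiLU, torch.nn.functional.silu}]
+    update_equivalent_types_dict(new_types)
+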
+def _partitions_sequential(partitions: List[SourcePartition]):
+    prev_partition = None
+    for partition in partitions:
+        if prev_partition is not None and not check_subgraphs_connected(
+            prev_partition, partition
+        ):
+            return False
+        prev_partition = partition
+    return True
+
+
+def _get_matching_types(partition_type):
+    matching_types = [partition_type]
+    if partition_type in _EQUIVALENT_TYPES_DICT:
+        matching_types.extend(_EQUIVALENT_TYPES_DICT[partition_type])
+    return matching_types
+
+
+def _valid_type_sequence(partition_types: List[Any]):
+    partition_types_set = set()  # type: ignore[var-annotated]
+    for partition_type in partition_types:
+        matching_types = _get_matching_types(partition_type)
+        matching_types_set = set(matching_types)
+        if len(partition_types_set & matching_types_set) > 0:
+            return False
+        partition_types_set |= matching_types_set
+    return True
+
+
+def find_sequential_partitions(
+    gm: torch.fx.GraphModule,
+    partition_types: List[Any],
+    include_functional_equivalent=True,
+    filter_fn: Optional[Callable[[Node], bool]] = None,
+):
+    if not _valid_type_sequence(partition_types):
+        raise ValueError(
+            f"Invalid partition types: {partition_types}. Each type in the sequence must be unique"
+        )
+
+    typed_partitions: OrderedDict[Any, List[SourcePartition]] = OrderedDict()
+    for partition_type in partition_types:
+        types_to_match = _get_matching_types(partition_type)
+        partitions = get_source_partitions(gm.graph, types_to_match, filter_fn)
+        typed_partitions[partition_type] = list(itertools.chain.from_iterable(partitions.values()))
+
+    typed_partitions_list = list(typed_partitions.values())
+    fusion_candidates = itertools.product(*typed_partitions_list)
+    fused_partitions = []
+    for candidate in fusion_candidates:
+        if _partitions_sequential(candidate):  # type: ignore[arg-type]
+            fused_partitions.append(candidate)
+    return fused_partitions
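+
+
+# Hedged editorial sketch (not part of the upstream file): locate Conv2d -> ReLU
+# chains in an FX graph; the functional variants (F.conv2d, F.relu) also match
+# because of the equivalence table above.
+def _example_find_conv_relu_partitions(gm: torch.fx.GraphModule):
+    return find_sequential_partitions(gm, [torch.nn.Conv2d, torch.nn.ReLU])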
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/port_metadata_pass.py b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/port_metadata_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..68a836ff60ab1f93961ced1415d357b6e1855d64
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/port_metadata_pass.py
@@ -0,0 +1,198 @@
+import logging
+from typing import Optional
+
+import torch
+from torch._export.error import InternalError
+
+from torch.ao.quantization.pt2e.utils import (
+    _filter_sym_size_users,
+    _find_q_dq_node_for_user,
+    _is_valid_annotation,
+)
+
+from torch.ao.quantization.quantizer import QuantizationSpecBase
+
+from torch.fx.passes.infra.pass_base import PassBase, PassResult
+
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.ERROR)
+
+__all__ = ["PortNodeMetaForQDQ"]
+
+_METADATA_TO_PORT = [
+    "stack_trace",
+    "quantization_tag",
+]
+
+_QUANTIZE_OPS = [
+    torch.ops.quantized_decomposed.quantize_per_tensor.default,
+    torch.ops.quantized_decomposed.quantize_per_tensor.tensor,
+    torch.ops.quantized_decomposed.quantize_per_channel.default,
+]
+
+_DEQUANTIZE_OPS = [
+    torch.ops.quantized_decomposed.dequantize_per_tensor.default,
+    torch.ops.quantized_decomposed.dequantize_per_tensor.tensor,
+    torch.ops.quantized_decomposed.dequantize_per_channel.default,
+]
+
+
+def _add_metadata(to_node: torch.fx.Node, from_node: torch.fx.Node) -> None:
+    from_meta = from_node.meta
+    for meta_name in _METADATA_TO_PORT:
+        if meta_name in from_meta:
+            to_node.meta[meta_name] = from_meta[meta_name]
+
+
+def _has_quant_annotation(node: torch.fx.Node) -> bool:
+    return "quantization_annotation" in node.meta
+
+
+def _find_choose_qparams_node(node: torch.fx.Node) -> Optional[torch.fx.Node]:
+    # BFS to look for choose qparams
+    from collections import deque
+
+    queue = deque(list(node.users.keys()))
+    while len(queue):
+        n = queue.popleft()
+        if n.op == "output":
+            continue
+        if (
+            n.op == "call_function"
+            and n.target == torch.ops.quantized_decomposed.choose_qparams.tensor
+        ):
+            return n
+        for k in n.users.keys():
+            queue.append(k)
+    return None
+
+
+def _port_metadata_for_input_quant_nodes(
+    input_node: torch.fx.Node,
+    node: torch.fx.Node,
+    qspec: Optional[QuantizationSpecBase],
+):
+    if qspec is None:
+        return
+
+    is_dynamic_quant = getattr(qspec, "is_dynamic", None)
+    if is_dynamic_quant is not None and is_dynamic_quant is True:
+        choose_qparams_node = _find_choose_qparams_node(input_node)
+        if choose_qparams_node is None:
+            raise ValueError(f"No chose qparams node found for {node}")
+        choose_qparam_users = _filter_sym_size_users(choose_qparams_node)
+        if len(choose_qparam_users) != 2:
+            raise InternalError(f"Expecting exactly two user for {choose_qparams_node}")
+        scale_node = choose_qparam_users.pop()
+        dynamic_q_node = next(iter(scale_node.users.keys()))
+        dynamic_q_node_users = _filter_sym_size_users(dynamic_q_node)
+        if len(dynamic_q_node_users) > 1:
+            raise InternalError(f"Expecting single user for {dynamic_q_node}")
+        dynamic_dq_node = dynamic_q_node_users.pop()
+        _add_metadata(choose_qparams_node, node)
+        _add_metadata(dynamic_q_node, node)
+        _add_metadata(dynamic_dq_node, node)
+    else:
+        q_node, dq_node = _find_q_dq_node_for_user(input_node, node)
+        if q_node is None or dq_node is None:
+            return
+        # add metadata for all the nodes between q_node and the get_attr node
+        # if q_node can be traced back to a get_attr node
+        q_to_get_attr_nodes = [q_node]
+        q_node_input = q_node.args[0]
+        while isinstance(q_node_input, torch.fx.Node) and q_node_input.op not in [
+            "placeholder",
+            "get_attr",
+        ]:
+            q_to_get_attr_nodes.append(q_node_input)
+            q_node_input = q_node_input.args[0]
+        if isinstance(q_node_input, torch.fx.Node) and q_node_input.op == "get_attr":
+            for n in q_to_get_attr_nodes:
+                _add_metadata(n, q_node_input)
+        _add_metadata(dq_node, node)
+
+
+def _port_metadata_for_output_quant_nodes(
+    node: torch.fx.Node, qspec: Optional[QuantizationSpecBase]
+):
+    if qspec is None:
+        return
+
+    node_users = _filter_sym_size_users(node)
+    if len(node_users) != 1:
+        raise InternalError(f"Expecting {node} to have single user")
+    q_node = node_users.pop()
+    if q_node.op != "call_function" or q_node.target not in _QUANTIZE_OPS:
+        logger.warning(
+            f"Expecting {node} user to be a quantized op but got {q_node}"  # noqa: G004
+        )  # noqa: G004
+        return
+
+    _add_metadata(q_node, node)
+
+
+class PortNodeMetaForQDQ(PassBase):
+    """
+    Port metadata for nodes added by quantization flow.
+    For static quant these are:
+    - quantize_per_tensor.default, dequantize_per_tensor.default
+    - quantize_per_channel.default, dequantize_per_channel.default
+    For dynamic quant these are:
+    - choose_qparams.tensor
+    - quantize_per_tensor.tensor, dequantize_per_tensor.tensor
+    - quantize_per_channel.default, dequantize_per_channel.default
+
+    Rules of porting metadata:
+    - Metadata to be ported:
+      - nn_module_stack
+      - stack_trace
+      - quantization_tag
+    - Metadata to NOT be ported:
+      - Everything else
+    - Rules:
+      - Statically quantized patterns:
+        - Dequantize nodes on the inputs to be quantized inherit metadata of the consumer node.
+        - Quantize nodes on the outputs inherit metadata of the producer node.
+        - Example 1:
+          - Original: [Conv -> AvgPool -> Linear]
+          - Quantized [Q-> DQ -> Conv -> Q -> DQ -> AvgPool -> Q -> DQ -> Linear -> Q -> DQ]
+          - Inner brackets specify which nodes Q/DQ inherit metadata from
+          - [Q-> [DQ -> Conv -> Q] -> [DQ -> AvgPool -> Q] -> [DQ -> Linear -> Q] -> DQ]
+          - Note first Q and last DQ do not inherit metadata from any nodes
+        - Example 2:
+          - Original: [Conv -> AvgPool -> Linear]
+          - AvgPool is not quantized
+          - Quantized [Q-> DQ -> Conv -> Q -> DQ -> AvgPool -> Q -> DQ -> Linear -> Q -> DQ]
+          - Inner brackets specify which nodes Q/DQ inherit metadata from
+          - [Q-> [DQ -> Conv -> Q] -> DQ -> [AvgPool] -> Q -> [DQ -> Linear -> Q] -> DQ]
+          - Note DQ and Q nodes around AvgPool do not inherit metadata from AvgPool because
+            AvgPool was not supposed to be quantized. Metadata porting relies on the quantization_annotation
+            on the nodes (in this case the AvgPool node) to conclude whether the node or pattern was
+            supposed to be quantized, and subsequently to decide whether the preceding Q, if any, should
+            inherit metadata from AvgPool.
+      - Dynamically quantized patterns:
+        - Inputs that are dynamically quantized have choose_qparams, quantize and dequantize nodes
+        - For example, below linear is dynamically quantized while rest statically:
+          - Original: [Conv -> AvgPool -> Linear]
+          - Quantized [Q-> DQ -> Conv -> Q -> DQ -> AvgPool -> Q -> DQ -> choose_params -> Q -> DQ -> Linear]
+          - Quantized [Q-> [DQ -> Conv -> Q] -> [DQ -> AvgPool -> Q] -> DQ -> [choose_params -> Q -> DQ -> Linear]]
+          - Note first Q does not inherit metadata from any nodes
+    NB:
+    - The best place for porting metadata is during observer conversion to q/dq. This is because it precisely
+      knows which quantization spec is converted to q/dq and thus from where the metadata should be ported.
+      However, since the FX and PT2E quant workflows share a common code base, doing it there hurts
+      readability quite a bit. Doing it via a separate pass helps readability of the code. Once we are
+      able to refactor the PT2E quant code, this pass should be integrated into the refactored variant
+      of the "convert" step.
+    """
+
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        for node in graph_module.graph.nodes:
+            annotation = node.meta.get("quantization_annotation", None)
+            if _is_valid_annotation(annotation):
+                input_qspec_map = node.meta["quantization_annotation"].input_qspec_map
+                output_qspec = node.meta["quantization_annotation"].output_qspec
+                for input_node, qspec in input_qspec_map.items():
+                    _port_metadata_for_input_quant_nodes(input_node, node, qspec)
+                _port_metadata_for_output_quant_nodes(node, output_qspec)
+        return PassResult(graph_module, True)
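+
+
+# Hedged editorial sketch (not part of the upstream file): the pass is normally run
+# as part of the PT2E convert flow; applying it directly via `call` returns a
+# PassResult whose graph_module carries the ported metadata.
+def _example_port_qdq_metadata(quantized_gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
+    return PortNodeMetaForQDQ().call(quantized_gm).graph_module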
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/prepare.py b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/prepare.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb45684e3a162789c447641876866da5b214fa0e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/prepare.py
@@ -0,0 +1,489 @@
+import torch
+from torch._subclasses import FakeTensor
+from torch.ao.quantization.fx.prepare import (
+    _insert_obs_or_fq,
+    _save_state,
+    _is_activation_post_process_node,
+    _create_obs_or_fq_from_qspec,
+)
+from torch.fx import (
+    GraphModule,
+    Graph,
+    Node,
+)
+from torch.fx.node import Argument
+
+from torch.ao.quantization import QConfigMapping
+from torch.ao.quantization.qconfig import QConfigAny
+from torch.ao.quantization.fx.custom_config import PrepareCustomConfig
+from typing import Dict, Tuple, Union, Any, Optional
+from torch.ao.quantization.quantizer import (
+    EdgeOrNode,
+    SharedQuantizationSpec,
+    QuantizationSpecBase,
+)
+from torch.ao.quantization import ObserverOrFakeQuantize
+
+# TODO: make pt2e folder private?
+__all__ = [
+    "prepare",
+]
+
+
+def _find_root_edge_or_node(edge_or_node: EdgeOrNode, shared_with_map: Dict[EdgeOrNode, EdgeOrNode]) -> EdgeOrNode:
+    """Find the root node for the sharing tree
+    Args:
+        edge_or_node: edge/node that we want to find the root
+        shared_with_map: each edge/node points to its parent; the root node points to itself
+
+    Returns:
+        root edge/node
+    """
+    parent = shared_with_map[edge_or_node]
+    if parent == edge_or_node:
+        return edge_or_node
+    root = _find_root_edge_or_node(parent, shared_with_map)
+    # path compression
+    shared_with_map[edge_or_node] = root
+    return root
+
+def _union(parent: EdgeOrNode, child: EdgeOrNode, shared_with_map: Dict[EdgeOrNode, EdgeOrNode]) -> None:
+    """Merge the subtree for `child` with `parent`, the order is important here
+    """
+    root_parent = _find_root_edge_or_node(parent, shared_with_map)
+    root_child = _find_root_edge_or_node(child, shared_with_map)
+    # union the two trees by pointing the root of child to root of parent
+    shared_with_map[root_child] = root_parent
+
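+# Hedged editorial sketch (not part of the upstream file): the two helpers above form
+# a plain union-find; strings stand in for real EdgeOrNode values purely for illustration.
+def _example_union_find():
+    shared_with_map = {"a": "a", "b": "b"}
+    _union("a", "b", shared_with_map)           # point b's tree at a's root
+    assert _find_root_edge_or_node("b", shared_with_map) == "a"
+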
+def _update_shared_with(child: EdgeOrNode, qspec: QuantizationSpecBase, shared_with_map: Dict[EdgeOrNode, EdgeOrNode]):
+    """Update the `shared_with_map` based on the qspec, this applies the `SharedQuantizationSpec`
+    configuration and established the relationship between `edge_or_node` with the edge/node that it
+    is pointing to, we'll use this information in the end to get the group id
+    """
+    if isinstance(qspec, SharedQuantizationSpec):
+        parent = qspec.edge_or_node
+        # we point from edge_or_node to the node that it is sharing_with, e.g.
+        # qspec for a = SharedQuantizationSpec(b) means `a` points to `b`
+        _union(parent, child, shared_with_map)
+
+def _unwrap_shared_qspec(
+    qspec: QuantizationSpecBase,
+    edge_or_node_to_qspec: Dict[EdgeOrNode, QuantizationSpecBase],
+    shared_with_map: Dict[EdgeOrNode, EdgeOrNode]
+) -> QuantizationSpecBase:
+    """Unwraps qspec to get the final root qspec (non SharedQuantizationSpec)
+    if qspec is SharedQuantizationSpec
+       (1). tries to find the root edge or node for the node that the qspec points to
+       (2). recursively find the root qspec based on the qspec for the root node
+    """
+    if isinstance(qspec, SharedQuantizationSpec):
+        sharing_with = qspec.edge_or_node
+        root = _find_root_edge_or_node(sharing_with, shared_with_map)
+        qspec = edge_or_node_to_qspec[root]
+        return _unwrap_shared_qspec(qspec, edge_or_node_to_qspec, shared_with_map)
+    return qspec
+
+def _has_same_dtype(qspec_a: QuantizationSpecBase, qspec_b: QuantizationSpecBase):
+    return (
+        hasattr(qspec_a, "dtype") and
+        hasattr(qspec_b, "dtype") and
+        qspec_a.dtype == qspec_b.dtype
+    )
+
+def _has_same_is_dynamic(qspec_a: QuantizationSpecBase, qspec_b: QuantizationSpecBase):
+    return (
+        hasattr(qspec_a, "is_dynamic") and
+        hasattr(qspec_b, "is_dynamic") and
+        qspec_a.is_dynamic == qspec_b.is_dynamic
+    )
+
+def _get_edge_or_node_to_qspec(model: torch.fx.GraphModule) -> Dict[EdgeOrNode, QuantizationSpecBase]:
+    """Get a map from EdgeOrNode to quantization spec based on annotations on the nodes
+    """
+    edge_or_node_to_qspec: Dict[EdgeOrNode, QuantizationSpecBase] = {}
+    for n in model.graph.nodes:
+        if hasattr(n, "meta") and "quantization_annotation" in n.meta:
+            qa = n.meta["quantization_annotation"]
+            for input_to_n, qspec in qa.input_qspec_map.items():
+                input_edge = (input_to_n, n)
+                edge_or_node_to_qspec[input_edge] = qspec
+            if qa.output_qspec is not None:
+                output_node = n
+                qspec = qa.output_qspec
+                edge_or_node_to_qspec[output_node] = qspec
+    return edge_or_node_to_qspec
+
+def _union_input_edge_with(input_edge, input_edge_root_qspec, edge_or_node, edge_or_node_to_qspec, shared_with_map):
+    """Union input edge with another edge or node, used in implicit sharing to point the current input
+    edge to other user edges of the producer node, or the output of producer node since these are
+    referring to the same Tensor
+    """
+    root_qspec = None
+    if edge_or_node in edge_or_node_to_qspec:
+        qspec = edge_or_node_to_qspec[edge_or_node]
+        root_qspec = _unwrap_shared_qspec(qspec, edge_or_node_to_qspec, shared_with_map)
+    # TODO: add assertions for types of root qspecs
+    if (
+        root_qspec is not None and
+        _has_same_dtype(root_qspec, input_edge_root_qspec) and
+        _has_same_is_dynamic(root_qspec, input_edge_root_qspec)
+    ):
+        # the input arg to the node should reuse the existing output observer for arg
+        # since dtype is the same (we may want to extend this to be a more strict check
+        # in the future)
+        # so we point from `input_edge` to `arg` (output of the argument)
+        _union(edge_or_node, input_edge, shared_with_map)
+
+
+def _get_edge_or_node_to_group_id(edge_or_node_to_qspec: Dict[EdgeOrNode, QuantizationSpecBase]) -> Dict[EdgeOrNode, int]:
+    """Map from edge/node to the group ID, generated from quantization annotations,
+    edge/node with the same group ID should use the same observer/fake_quant instance
+
+    This applies the SharedQuantizationSpec configuration and maps each edge/node to a group.
+    There is another implicit sharing built into quantization, when we have the following:
+       * op1 -> op2
+       * output of op1: int8_qspec
+       * (op1 -> op2) input edge: int8_qspec
+    we'll assume sharing between the output of op1 and input of (op1 -> op2) since these are the same Tensor.
+
+    Figuring out the correct group ID for all edge/node is a standard union find problem:
+    https://www.geeksforgeeks.org/introduction-to-disjoint-set-data-structure-or-union-find-algorithm/
+
+    Args:
+        edge_or_node_to_qspec: Dictionary from edge_or_node to the qspec, derived from annotations
+    Returns:
+        edge_or_node_to_group_id: Dictionary from edge_or_node to group_id (int); all edges or nodes that
+        belong to the same group should have the same id
+
+    Example:
+        op2 -> cat1 -> cat2
+           op1 /        /
+                     op3
+        edge_or_node_to_qspec: {
+            op1: int8_qspec,
+            op2: int8_qspec,
+            (op1, cat1): int8_qspec,
+            (op2, cat1): SharedQuantizationSpec((op1, cat1)),
+            cat1: SharedQuantizationSpec((op1, cat1)),
+            (op3, cat2): int8_qspec,
+            (cat1, cat2): SharedQuantizationSpec((op3, cat2)),
+            cat2: SharedQuantizationSpec((op3, cat2)),
+        }
+
+        edge_or_node_to_group_id = _get_edge_or_node_to_group_id(edge_or_node_to_qspec)
+        edge_or_node_to_group_id: {
+            op1: 1,
+            op2: 1,
+            (op1, cat1): 1,
+            (op2, cat1): 1,
+            cat1: 1,
+            (op3, cat2): 1,
+            (cat1, cat2): 1,
+            cat2: 1,
+        }
+        # everything is in the same group because cat1 and the edge (cat1, cat2) are implicitly shared, which
+        # connects the two sharing groups around the cat1 and cat2 ops due to transitive sharing
+    """
+    # means the observer of the key should be shared with the observer of the value; by default it will
+    # be shared with itself
+    shared_with_map: Dict[EdgeOrNode, EdgeOrNode] = {k: k for k in edge_or_node_to_qspec.keys()}
+    for edge_or_node, qspec in edge_or_node_to_qspec.items():
+        if isinstance(edge_or_node, torch.fx.Node):
+            output_node = edge_or_node
+            _update_shared_with(output_node, qspec, shared_with_map)
+        else:
+            input_edge = edge_or_node
+            input_edge_root_qspec = _unwrap_shared_qspec(qspec, edge_or_node_to_qspec, shared_with_map)
+
+            assert isinstance(input_edge, tuple)
+            arg, n = input_edge
+            if n.meta["quantization_annotation"].allow_implicit_sharing:
+                # NOTE: the order is important here, we first share with other users and then share with previous
+                # output because the reverse order could cause circular dependency
+                # e.g node1 -> node2
+                #          \ -> node3
+                # when processing (node1, node2), if we first point (node1, node2) to node1
+                # Step 1. shared_map = {(node1, node2): node1}
+                # Step 2. after that, we point the (node1, node2) to its other user (node1, node3) ,
+                # which means shared_map = {(node1, node2): node1, node1: (node1, node3)}
+                # because we will point the root of (node1, node2) (in this case node1) to the root of (node1, node3)
+                # Step 3. and when we process (node1, node3), it can try to point to node1 as well, then we'll
+                # have a circular dependency
+                # the following order works around this issue, but this does not allow arbitrary configuration
+                # of sharing so it might break in a different case in the future, when it breaks
+                # quantizer writer can check the notes here to debug the issue
+
+                # sharing with other users of the producer node
+                # (arg, user)
+                if not isinstance(arg, Node) or not isinstance(n, Node):
+                    raise Exception(f"Expected input_edge to have type Tuple[Node, Node], but got: {arg, n}")
+                for user in arg.users:
+                    if user is n:
+                        continue
+                    arg_to_user_edge = (arg, user)
+                    _union_input_edge_with(
+                        input_edge,
+                        input_edge_root_qspec,
+                        arg_to_user_edge,
+                        edge_or_node_to_qspec,
+                        shared_with_map
+                    )
+
+                # sharing with output of producer node
+                _union_input_edge_with(input_edge, input_edge_root_qspec, arg, edge_or_node_to_qspec, shared_with_map)
+
+            _update_shared_with(input_edge, qspec, shared_with_map)
+
+    # now that we have the sharing relations between all edges and nodes, we can assign group ids
+    cur_group_id = 0
+    edge_or_node_to_group_id: Dict[EdgeOrNode, int] = {}
+    for edge_or_node in shared_with_map.keys():
+        root = _find_root_edge_or_node(edge_or_node, shared_with_map)
+        if root not in edge_or_node_to_group_id:
+            edge_or_node_to_group_id[root] = cur_group_id
+            cur_group_id += 1
+        edge_or_node_to_group_id[edge_or_node] = edge_or_node_to_group_id[root]
+
+    return edge_or_node_to_group_id
+
+def _get_obs_or_fq_map(
+    edge_or_node_to_group_id: Dict[EdgeOrNode, int],
+    edge_or_node_to_qspec: Dict[EdgeOrNode, QuantizationSpecBase],
+    is_qat: bool
+) -> Dict[EdgeOrNode, ObserverOrFakeQuantize]:
+    """Generates the EdgeOrNode to observer/fake_quant instances
+    Makes sure that EdgeOrNodes with the same group_id use the same observer or fake quant
+    instance
+    """
+    obs_or_fq_map: Dict[EdgeOrNode, ObserverOrFakeQuantize] = {}
+    group_id_to_obs_or_fq: Dict[int, ObserverOrFakeQuantize] = {}
+    for edge_or_node, qspec in edge_or_node_to_qspec.items():
+        group_id = edge_or_node_to_group_id[edge_or_node]
+        if group_id not in group_id_to_obs_or_fq:
+            # TODO: maybe edge_or_node_to_qspec should be edge_or_node_to_root_qspec, this will simplify
+            # the implementation for _create_obs_or_fq_from_qspec
+            group_id_to_obs_or_fq[group_id] = _create_obs_or_fq_from_qspec(qspec, obs_or_fq_map, is_qat)
+        obs_or_fq_map[edge_or_node] = group_id_to_obs_or_fq[group_id]
+    return obs_or_fq_map
+
+def _maybe_insert_input_observer_for_arg_or_kwarg(
+    node: Union[Node, Any],
+    arg: Argument,
+    qconfig: QConfigAny,
+    model: torch.nn.Module,
+    named_modules: Dict[str, torch.nn.Module],
+    obs_or_fq_map: Dict[EdgeOrNode, ObserverOrFakeQuantize],
+    is_qat: bool,
+) -> Argument:
+    """
+    Given a `node` and an `arg`, inserts an input observer between
+    `node` and `arg` if necessary.
+    """
+    # for ops such as torch.cat([x0, x1]),
+    # traverse through the list
+    if isinstance(arg, (list, tuple)):
+        new_arg_to_return = []
+        for inner_arg in arg:
+            new_inner_arg = _maybe_insert_input_observer_for_arg_or_kwarg(
+                node, inner_arg, qconfig, model, named_modules, obs_or_fq_map, is_qat,
+            )
+            new_arg_to_return.append(new_inner_arg)
+        return type(arg)(new_arg_to_return)
+
+    if not isinstance(arg, Node):
+        return arg
+    assert isinstance(arg, Node)
+    # default (no observer)
+    new_arg = arg
+
+    # find the original `arg` node feeding the current node, skipping inserted observer/fake_quant nodes
+    original_arg = arg
+    while _is_activation_post_process_node(original_arg, named_modules):
+        original_arg = original_arg.args[0]  # type: ignore[assignment]
+    assert isinstance(original_arg, Node), f"expect original argument to be a Node, but got: {type(original_arg)}"
+
+    input_edge = (original_arg, node)
+    if input_edge not in obs_or_fq_map:
+        return new_arg
+    # input_edge needs to be observed
+    input_edge_obs_or_fq = obs_or_fq_map[input_edge]
+    if input_edge_obs_or_fq is None:
+        return new_arg
+
+    arg_as_output_obs_or_fq = obs_or_fq_map.get(original_arg, None)
+    # the arg is observed as the output and is using the same instance as the input_edge
+    # we'll reuse the inserted observer/fake_quant
+    if arg_as_output_obs_or_fq is not None and id(arg_as_output_obs_or_fq) == id(input_edge_obs_or_fq):
+        return new_arg
+
+    # otherwise, we'll insert a new observer/fake_quant node
+
+    existing_obs_node = None
+    # skip inserting new observers if the same observer instance was already inserted for another user
+    # Example:
+    # conv1 -> obs1 -> existing_obs -> conv2
+    #             \ -> conv3
+    #
+    # instead of inserting new observers we will have:
+    # conv1 -> obs1 -> existing_obs -> conv2
+    #                            \ -> conv3
+    for maybe_obs_node in arg.users.keys():
+        if not _is_activation_post_process_node(maybe_obs_node, named_modules):
+            continue
+        maybe_obs_mod = named_modules[maybe_obs_node.target]  # type: ignore[index]
+        if id(maybe_obs_mod) == id(input_edge_obs_or_fq):
+            return maybe_obs_node
+
+    new_arg = _insert_obs_or_fq(arg, input_edge_obs_or_fq, model, named_modules, model.graph)
+    return new_arg
+
+def _maybe_insert_input_observers_for_node(
+    node: Node,
+    qconfig: QConfigAny,
+    model: torch.nn.Module,
+    named_modules: Dict[str, torch.nn.Module],
+    obs_or_fq_map: Dict[EdgeOrNode, ObserverOrFakeQuantize],
+    is_qat: bool,
+) -> None:
+    """
+    If needed, inserts observers to the input args and kwargs of `node`.
+    Note: modifies `node` inplace.
+
+    For example, if cur_node needs an observer after prev_node, we change from
+
+      prev_node -> cur_node
+
+    To
+
+      prev_node -> obs -> cur_node
+
+    """
+    # Look through every input arg.  If that arg's target dtype does not
+    # match the current node's target dtype, insert an observer.
+    new_args = []
+    # map from old arg to new arg, used for updating the numeric debug handle map
+    remap = {}
+    for arg in node.args:
+        new_arg = _maybe_insert_input_observer_for_arg_or_kwarg(
+            node, arg, qconfig, model, named_modules, obs_or_fq_map, is_qat,
+        )
+        new_args.append(new_arg)
+        remap[arg] = new_arg
+
+    if "numeric_debug_handle" in node.meta:
+
+        def remap_fn(x):
+            return remap.get(x, x)
+
+        numeric_debug_handle = node.meta["numeric_debug_handle"]
+        node.meta["numeric_debug_handle"] = {remap_fn(k): v for k, v in numeric_debug_handle.items()}
+
+    # Clone has a memory_format kwarg and zeros_like has a pin_memory kwarg
+    # that persist in the exported graph. This is just a workaround for these.
+    assert (
+        node.target == torch.ops.aten.clone.default or
+        node.target == torch.ops.aten.zeros_like.default or
+        len(node.kwargs) == 0
+    ), " expecting kwargs for aten op IR to be empty"
+
+    # assign the new args to the node, inplace
+    node.args = tuple(new_args)
+
+def _maybe_insert_output_observer_for_node(
+    node: Node,
+    model: torch.nn.Module,
+    named_modules: Dict[str, torch.nn.Module],
+    graph: Graph,
+    obs_or_fq_map: Dict[EdgeOrNode, ObserverOrFakeQuantize],
+    is_qat: bool,
+) -> Optional[Node]:
+    if node in obs_or_fq_map:
+        output_act_obs_or_fq = obs_or_fq_map[node]
+        return _insert_obs_or_fq(node, output_act_obs_or_fq, model, named_modules, graph)
+    return None
+
+def _maybe_insert_input_and_output_observers_for_node(
+    node: Node,
+    model: torch.fx.GraphModule,
+    obs_or_fq_map: Dict[EdgeOrNode, ObserverOrFakeQuantize],
+    is_qat: bool,
+):
+    this_node_quantization_annotation = node.meta["quantization_annotation"] if "quantization_annotation" in node.meta else None
+    if this_node_quantization_annotation is None:
+        return
+
+    named_modules = dict(model.named_modules(remove_duplicate=False))
+    _maybe_insert_input_observers_for_node(
+        node,
+        None,  # qconfig
+        model,
+        named_modules,
+        obs_or_fq_map,
+        is_qat,
+    )
+
+    output_is_a_tensor = "val" in node.meta and isinstance(node.meta["val"], FakeTensor)
+    if not output_is_a_tensor:
+        return
+
+    # this returns the new observer node if it was needed
+    maybe_output_obs_node = _maybe_insert_output_observer_for_node(
+        node, model, named_modules, model.graph, obs_or_fq_map, is_qat)
+
+    if maybe_output_obs_node is None:
+        return
+    # Update users of original node to use the output observer
+    # instead. For example, change
+    #
+    #           next_node
+    #          /
+    #   cur_node -> obs
+    #
+    # to
+    #
+    #                 next_node
+    #                 /
+    #   cur_node -> obs
+    #
+    # We need to save orig users before updating uses because
+    # the list of users will change as we update uses
+    orig_users = list(node.users.keys())
+    for user_node in orig_users:
+        if user_node is maybe_output_obs_node:
+            continue
+        user_node.replace_input_with(node, maybe_output_obs_node)
+
+def prepare(
+    model: GraphModule,
+    node_name_to_scope: Dict[str, Tuple[str, type]],
+    is_qat: bool,
+) -> GraphModule:
+    # Since we are mutating the graph as we go, we iterate over the original
+    # nodes before observer insertion, instead of model.graph.nodes.
+    nodes_before_observation = list(model.graph.nodes)
+
+    # At a high level, we construct a map from EdgeOrNode to an observer_or_fake_quant instance.
+    # All edges/nodes that belong to the same group will use the same instance, and when we
+    # insert observers we simply query this map to get the correct observer_or_fake_quant
+    # instance.
+    edge_or_node_to_qspec = _get_edge_or_node_to_qspec(model)
+    edge_or_node_to_group_id = _get_edge_or_node_to_group_id(edge_or_node_to_qspec)
+    obs_or_fq_map = _get_obs_or_fq_map(edge_or_node_to_group_id, edge_or_node_to_qspec, is_qat)
+
+    for node in nodes_before_observation:
+        # TODO: simplify logic for inserting observers
+        _maybe_insert_input_and_output_observers_for_node(node, model, obs_or_fq_map, is_qat)
+
+    model = GraphModule(model, model.graph)
+
+    _save_state(
+        model,
+        {},  # node_name_to_qconfig
+        node_name_to_scope,
+        PrepareCustomConfig(),
+        {},  # equalization_node_name_to_qconfig
+        QConfigMapping(),
+        is_qat,
+        set()  # observed_node_names
+    )
+    return model
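+
+# --- Illustrative usage sketch (editor's note, not part of the upstream module) ---
+# A minimal sketch of how `prepare` might be driven, assuming `exported_model` is a
+# torch.fx.GraphModule that has already been annotated by a quantizer (i.e. nodes carry
+# "quantization_annotation" in their meta). The `node_name_to_scope` built below is only
+# a placeholder; in the pt2e flow it is typically derived from each node's
+# nn_module_stack metadata:
+#
+#     node_name_to_scope = {n.name: ("", type(None)) for n in exported_model.graph.nodes}
+#     observed_model = prepare(exported_model, node_name_to_scope, is_qat=False)
+#     # observers / fake-quantize modules are now inserted according to the annotations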
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/qat_utils.py b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/qat_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0527d506d2bdb750640d822b4bc4f6f4e638a75
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/qat_utils.py
@@ -0,0 +1,788 @@
+import dataclasses
+import itertools
+import operator
+from typing import Any, Callable, Dict, List, Tuple, TYPE_CHECKING
+
+import torch
+from torch.fx import Graph, GraphModule, Node
+from torch.fx.subgraph_rewriter import (
+    replace_pattern_with_filters,
+    ReplacedPatterns,
+)
+import torch.nn.functional as F
+from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib  # noqa: F401
+from torch.ao.quantization.pt2e.export_utils import _WrapperModule
+from torch.ao.quantization.quantizer import (
+    DerivedQuantizationSpec,
+    EdgeOrNode,
+    SharedQuantizationSpec,
+    QuantizationSpecBase,
+)
+from .utils import (
+    _conv1d_bn_example_inputs,
+    _conv2d_bn_example_inputs,
+    _is_conv,
+    _is_bn_node,
+    fold_bn_weights_into_conv_node,
+    get_aten_graph_module,
+)
+
+if TYPE_CHECKING:
+    from torch.fx.passes.utils.matcher_with_name_node_map_utils import InternalMatch
+
+__all__ = []  # type: ignore[var-annotated]
+
+
+# Example inputs for quantized and folded conv-bn1d patterns used in convert
+_quantized_conv1d_bn_example_inputs = (
+    torch.randn(1, 1, 3),  # x
+    torch.randn(1, 1, 1),  # conv_weight
+    torch.randn(1),        # bn_weight
+    torch.randn(1),        # bn_bias
+    torch.randn(1),        # bn_running_mean
+    torch.randn(1),        # bn_running_var
+)
+
+# Example inputs for quantized and folded conv-bn2d patterns used in convert
+_quantized_conv2d_bn_example_inputs = (
+    torch.randn(1, 1, 3, 3),  # x
+    torch.randn(1, 1, 1, 1),  # conv_weight
+    torch.randn(1),           # bn_weight
+    torch.randn(1),           # bn_bias
+    torch.randn(1),           # bn_running_mean
+    torch.randn(1),           # bn_running_var
+)
+
+
+def _get_quantized_conv_bn_example_inputs_kwargs(
+    is_per_channel: bool,
+    has_bias: bool,
+    is_cuda: bool,
+) -> Dict[str, Any]:
+    """
+    Optional example inputs for quantized and folded conv-bn patterns
+    used in convert, expressed as kwargs.
+    """
+    kwargs = {}
+    # Per tensor quantization uses literals to represent scale and zero
+    # point, so there is no need to include them here as kwargs
+    if is_per_channel:
+        kwargs["scale"] = torch.tensor([1], dtype=torch.float)
+        kwargs["zero_point"] = torch.tensor([0], dtype=torch.int)
+    if has_bias:
+        kwargs["conv_bias"] = torch.randn(1)
+    if is_cuda:
+        for k, v in kwargs.items():
+            if isinstance(v, torch.Tensor):
+                kwargs[k] = v.cuda()
+    return kwargs
+
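+# For illustration (editor's note): with is_per_channel=True, has_bias=True and
+# is_cuda=False, the helper above would return roughly
+#     {"scale": tensor([1.]), "zero_point": tensor([0]), "conv_bias": tensor([...])}
+# while with is_per_channel=False it would only contain "conv_bias" (scale/zero_point
+# stay literal in the pattern).
+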
+def _get_conv_bn_pattern(conv_fn: Callable) -> Callable:
+    def _conv_bn_pattern(
+        x: torch.Tensor,
+        conv_weight: torch.Tensor,
+        conv_bias: torch.Tensor,
+        bn_weight: torch.Tensor,
+        bn_bias: torch.Tensor,
+        bn_running_mean: torch.Tensor,
+        bn_running_var: torch.Tensor,
+    ) -> torch.Tensor:
+        x = conv_fn(x, conv_weight, conv_bias)
+        x = F.batch_norm(x, bn_running_mean, bn_running_var, bn_weight, bn_bias, training=True)
+        return x
+    return _WrapperModule(_conv_bn_pattern)
+
+# TODO: merge this with the `no_conv_bias` case
+def _get_qat_conv_bn_pattern(conv_fn: Callable) -> Callable:
+    def _qat_conv_bn_pattern(
+        x: torch.Tensor,
+        conv_weight: torch.Tensor,
+        conv_bias: torch.Tensor,
+        bn_weight: torch.Tensor,
+        bn_bias: torch.Tensor,
+        bn_running_mean: torch.Tensor,
+        bn_running_var: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Approximate method to fuse conv and bn. It requires only one forward pass.
+        conv_orig = conv / scale_factor where scale_factor = bn.weight / running_std.
+        This is based on `nniqat.ConvBn2d._forward_approximate`.
+        """
+        # TODO: allow setting eps
+        bn_eps = 1e-5
+        running_std = torch.sqrt(bn_running_var + bn_eps)
+        scale_factor = bn_weight / running_std
+        weight_shape = [1] * len(conv_weight.shape)
+        weight_shape[0] = -1
+        bias_shape = [1] * len(conv_weight.shape)
+        bias_shape[1] = -1
+        scaled_weight = conv_weight * scale_factor.reshape(weight_shape)
+        zero_bias = torch.zeros_like(conv_bias, dtype=x.dtype)
+        x = conv_fn(x, scaled_weight, zero_bias)
+        x = x / scale_factor.reshape(bias_shape)
+        x = x + conv_bias.reshape(bias_shape)
+        x = F.batch_norm(x, bn_running_mean, bn_running_var, bn_weight, bn_bias, training=True, eps=bn_eps)
+        return x
+    return _WrapperModule(_qat_conv_bn_pattern)
+
+def _get_qat_conv_bn_pattern_no_conv_bias(conv_fn: Callable) -> Callable:
+    def _qat_conv_bn_pattern_no_conv_bias(
+        x: torch.Tensor,
+        conv_weight: torch.Tensor,
+        # Not used, only for matching convenience
+        conv_bias: torch.Tensor,
+        bn_weight: torch.Tensor,
+        bn_bias: torch.Tensor,
+        bn_running_mean: torch.Tensor,
+        bn_running_var: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Same as `_get_qat_conv_bn_pattern`, but handles the case with no conv bias.
+        """
+        # TODO: allow setting eps
+        bn_eps = 1e-5
+        running_std = torch.sqrt(bn_running_var + bn_eps)
+        scale_factor = bn_weight / running_std
+        weight_shape = [1] * len(conv_weight.shape)
+        weight_shape[0] = -1
+        bias_shape = [1] * len(conv_weight.shape)
+        bias_shape[1] = -1
+        scaled_weight = conv_weight * scale_factor.reshape(weight_shape)
+        x = conv_fn(x, scaled_weight, None)
+        x = x / scale_factor.reshape(bias_shape)
+        x = F.batch_norm(x, bn_running_mean, bn_running_var, bn_weight, bn_bias, training=True, eps=bn_eps)
+        return x
+    return _WrapperModule(_qat_conv_bn_pattern_no_conv_bias)
+
+def _append_qdq(x, is_per_channel, kwargs):
+    """
+    Helper function to append q-dq ops after `x`, using dummy values for the qparams
+    and qmin/qmax. We use dummy values here because we match with `ignore_literals=True`
+    and will manually replace these values after subgraph rewriting.
+
+    Return the dq node.
+    """
+    # Dummy args to be passed into q-dq ops
+    per_channel_axis = 0
+    scale = kwargs["scale"] if is_per_channel else 1.0
+    zp = kwargs["zero_point"] if is_per_channel else 0
+    qmin = -127
+    qmax = 127
+    dtype = torch.int8
+
+    qd = torch.ops.quantized_decomposed
+    if is_per_channel:
+        x = qd.quantize_per_channel(x, scale, zp, per_channel_axis, qmin, qmax, dtype)
+        x = qd.dequantize_per_channel(x, scale, zp, per_channel_axis, qmin, qmax, dtype)
+    else:
+        x = qd.quantize_per_tensor(x, scale, zp, qmin, qmax, dtype)
+        x = qd.dequantize_per_tensor(x, scale, zp, qmin, qmax, dtype)
+    return x
+
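+# For illustration (editor's note): in the per-tensor case, `_append_qdq(x, False, {})`
+# traces to roughly
+#     x = quantized_decomposed.quantize_per_tensor(x, 1.0, 0, -127, 127, torch.int8)
+#     x = quantized_decomposed.dequantize_per_tensor(x, 1.0, 0, -127, 127, torch.int8)
+# with the dummy qparams later replaced after subgraph rewriting (see `_copy_over_q_dq_args`).
+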
+def _get_quantized_qat_conv_bn_pattern(
+    is_per_channel: bool,
+    has_bias: bool,
+    bias_is_quantized: bool,
+    conv_fn: Callable,
+    bn_is_training: bool,
+) -> Callable:
+    """
+    Return the quantized version of QAT conv + BN pattern.
+    This is based on `nniqat.ConvBn2d._forward_approximate`,
+    used in QAT convert. We first match this pattern and replace
+    it with the normal [conv - bn] pattern, then fold the BN
+    weights into conv.
+    """
+    # TODO: allow setting eps
+    bn_eps = 1e-5
+
+    def _quantized_qat_conv_bn_pattern(
+        x: torch.Tensor,
+        conv_weight: torch.Tensor,
+        bn_weight: torch.Tensor,
+        bn_bias: torch.Tensor,
+        bn_running_mean: torch.Tensor,
+        bn_running_var: torch.Tensor,
+        **kwargs,
+    ) -> torch.Tensor:
+        running_std = torch.sqrt(bn_running_var + bn_eps)
+        scale_factor = bn_weight / running_std
+        weight_shape = [1] * len(conv_weight.shape)
+        weight_shape[0] = -1
+        bias_shape = [1] * len(conv_weight.shape)
+        bias_shape[1] = -1
+        scaled_weight = conv_weight * scale_factor.reshape(weight_shape)
+        scaled_weight = _append_qdq(scaled_weight, is_per_channel, kwargs)
+        if has_bias:
+            zero_bias = torch.zeros_like(kwargs["conv_bias"], dtype=x.dtype)
+            if bias_is_quantized:
+                zero_bias = _append_qdq(zero_bias, is_per_channel, kwargs)
+            x = conv_fn(x, scaled_weight, zero_bias)
+        else:
+            x = conv_fn(x, scaled_weight, None)
+        x = x / scale_factor.reshape(bias_shape)
+        if has_bias:
+            x = x + kwargs["conv_bias"].reshape(bias_shape)
+        x = F.batch_norm(x, bn_running_mean, bn_running_var, bn_weight, bn_bias, training=bn_is_training, eps=bn_eps)
+        return x
+    return _WrapperModule(_quantized_qat_conv_bn_pattern)
+
+def _get_folded_quantized_qat_conv_bn_pattern(
+    is_per_channel: bool,
+    has_bias: bool,
+    bias_is_quantized: bool,
+    conv_fn: Callable,
+    bn_is_training: bool,
+) -> Callable:
+    """
+    Quantized QAT conv - bn pattern with bn weights being folded into conv.
+    """
+    # TODO: allow setting eps
+    bn_eps = 1e-5
+
+    def _folded_quantized_qat_conv_bn_pattern(
+        x: torch.Tensor,
+        conv_weight: torch.Tensor,
+        bn_weight: torch.Tensor,
+        bn_bias: torch.Tensor,
+        bn_running_mean: torch.Tensor,
+        bn_running_var: torch.Tensor,
+        **kwargs,
+    ) -> torch.Tensor:
+        conv_weight = _append_qdq(conv_weight, is_per_channel, kwargs)
+        if has_bias:
+            bias = kwargs["conv_bias"]
+            if bias_is_quantized:
+                bias = _append_qdq(bias, is_per_channel, kwargs)
+        else:
+            bias = None
+        x = conv_fn(x, conv_weight, bias)
+        x = F.batch_norm(x, bn_running_mean, bn_running_var, bn_weight, bn_bias, training=bn_is_training, eps=bn_eps)
+        return x
+    return _WrapperModule(_folded_quantized_qat_conv_bn_pattern)
+
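+# Editor's note: the actual folding into the conv weights is done later by
+# `fold_bn_weights_into_conv_node` (imported from .utils). The standard per-output-channel
+# conv-bn folding is:
+#     w_folded = w * (bn_weight / sqrt(bn_running_var + eps))
+#     b_folded = (b - bn_running_mean) * (bn_weight / sqrt(bn_running_var + eps)) + bn_bias
+# (with b = 0 when the conv has no bias).
+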
+def _has_conv_bias_filter(
+    match: "InternalMatch",
+    original_graph: Graph,
+    pattern_graph: Graph,
+) -> bool:
+    """
+    Match filter for the subgraph rewriter that returns True if the conv node in
+    the original graph has bias.
+    """
+    for n in match.nodes_map.values():
+        if _is_conv(n):
+            return len(n.args) > 2 and n.args[2] is not None
+    raise ValueError("Could not find conv node in matched conv + bn pattern")
+
+def _no_conv_bias_filter(
+    match: "InternalMatch",
+    original_graph: Graph,
+    pattern_graph: Graph,
+) -> bool:
+    """
+    Match filter for the subgraph rewriter that returns True if the conv node in
+    the original graph does NOT have bias.
+    """
+    return not _has_conv_bias_filter(match, original_graph, pattern_graph)
+
+def _is_quantize(n: Node) -> bool:
+    return n.target in [
+        torch.ops.quantized_decomposed.quantize_per_tensor.default,
+        torch.ops.quantized_decomposed.quantize_per_tensor.tensor,
+        torch.ops.quantized_decomposed.quantize_per_channel.default,
+    ]
+
+def _is_dequantize(n: Node) -> bool:
+    return n.target in [
+        torch.ops.quantized_decomposed.dequantize_per_tensor.default,
+        torch.ops.quantized_decomposed.dequantize_per_tensor.tensor,
+        torch.ops.quantized_decomposed.dequantize_per_channel.default,
+    ]
+
+def _get_conv_bn_pattern_nodes(r: ReplacedPatterns) -> Dict[str, Tuple[Node, Node]]:
+    """
+    Helper function to extract the nodes in the conv-bn fusion pattern after
+    subgraph rewriting, in the form of a map:
+
+        {name: (original_node, replacement_node)}
+
+    The following names must exist in the map:
+
+        "conv", "conv_weight", "conv_input", "bn", "getitem"
+
+    The following names may exist in the map:
+
+        "conv_weight_q", "conv_weight_dq", "conv_bias",
+        "conv_bias_q", "conv_bias_dq"
+    """
+    def _get_nodes(nodes: List[Node]) -> Tuple[Node, Node, Node]:
+        """
+        Return a 3-tuple of (conv_node, bn_node, getitem_node).
+        This asserts that the match contains exactly one of each node.
+        """
+        conv_node, bn_node, getitem_node = None, None, None
+        for n in nodes:
+            if n.op != "call_function":
+                continue
+            if _is_conv(n):
+                assert conv_node is None
+                conv_node = n
+            if _is_bn_node(n):
+                assert bn_node is None
+                bn_node = n
+            if n.target == operator.getitem:
+                assert getitem_node is None
+                getitem_node = n
+        assert conv_node is not None
+        assert bn_node is not None
+        assert getitem_node is not None
+        return (conv_node, bn_node, getitem_node)
+
+    def _get_q_dq_nodes(n: Node) -> Tuple[Node, Node, Node]:
+        """
+        Return a 3-tuple of (orig_node, q_node, dq_node).
+        """
+        assert _is_dequantize(n)
+        q_node = n.args[0]
+        assert isinstance(q_node, Node)
+        assert _is_quantize(q_node)
+        orig_node = q_node.args[0]
+        assert isinstance(orig_node, Node)
+        return (orig_node, q_node, n)
+
+    original_nodes = list(_filter_nodes_map(r.nodes_map).values())
+    o_conv, o_bn, o_getitem = _get_nodes(original_nodes)
+    r_conv, r_bn, r_getitem = _get_nodes(r.replacements)
+
+    # Create the mapping from original node to replacement node
+    mapping = {
+        "conv": (o_conv, r_conv),
+        "bn": (o_bn, r_bn),
+        "getitem": (o_getitem, r_getitem),
+    }
+
+    # Extract conv input and weight
+    # Note: here we extract the original nodes indirectly through the pattern nodes
+    # because the args of the original nodes are no longer available after replacement
+    (p_conv, _, _) = _get_nodes(list(r.nodes_map.keys()))
+    (p_conv_input, p_conv_weight, *_) = p_conv.args
+    (r_conv_input, r_conv_weight, *_) = r_conv.args
+    assert isinstance(p_conv_input, Node)
+    assert isinstance(p_conv_weight, Node)
+    assert isinstance(r_conv_input, Node)
+    assert isinstance(r_conv_weight, Node)
+    o_conv_input = r.nodes_map[p_conv_input]
+    o_conv_weight = r.nodes_map[p_conv_weight]
+
+    # If conv weight is quantized, extract the q - dq nodes
+    if _is_dequantize(p_conv_weight):
+        p_conv_weight, p_conv_weight_q, p_conv_weight_dq = _get_q_dq_nodes(p_conv_weight)
+        r_conv_weight, r_conv_weight_q, r_conv_weight_dq = _get_q_dq_nodes(r_conv_weight)
+        o_conv_weight = r.nodes_map[p_conv_weight]
+        o_conv_weight_q = r.nodes_map[p_conv_weight_q]
+        o_conv_weight_dq = r.nodes_map[p_conv_weight_dq]
+        mapping["conv_weight_q"] = (o_conv_weight_q, r_conv_weight_q)
+        mapping["conv_weight_dq"] = (o_conv_weight_dq, r_conv_weight_dq)
+    mapping["conv_input"] = (o_conv_input, r_conv_input)
+    mapping["conv_weight"] = (o_conv_weight, r_conv_weight)
+
+    # Extract conv bias
+    if len(p_conv.args) > 2 and len(r_conv.args) > 2:
+        p_conv_bias = p_conv.args[2]
+        r_conv_bias = r_conv.args[2]
+        assert isinstance(p_conv_bias, Node)
+        assert isinstance(r_conv_bias, Node)
+        o_conv_bias = r.nodes_map[p_conv_bias]
+
+        # If conv bias is quantized, extract the q - dq nodes
+        if _is_dequantize(p_conv_bias):
+            p_conv_bias, p_conv_bias_q, p_conv_bias_dq = _get_q_dq_nodes(p_conv_bias)
+            r_conv_bias, r_conv_bias_q, r_conv_bias_dq = _get_q_dq_nodes(r_conv_bias)
+            o_conv_bias = r.nodes_map[p_conv_bias]
+            o_conv_bias_q = r.nodes_map[p_conv_bias_q]
+            o_conv_bias_dq = r.nodes_map[p_conv_bias_dq]
+            mapping["conv_bias_q"] = (o_conv_bias_q, r_conv_bias_q)
+            mapping["conv_bias_dq"] = (o_conv_bias_dq, r_conv_bias_dq)
+        mapping["conv_bias"] = (o_conv_bias, r_conv_bias)
+    return mapping
+
+def _filter_nodes_map(nodes_map: Dict[Node, Node]) -> Dict[Node, Node]:
+    """
+    Return a filtered `nodes_map` returned from the subgraph rewriter.
+    The filtered `nodes_map` will contain only nodes that are actually
+    matched in the pattern, excluding None or placeholder nodes.
+    """
+    new_nodes_map: Dict[Node, Node] = {}
+    for pattern_node, graph_node in nodes_map.items():
+        # bias can be None
+        if graph_node is None:
+            continue
+        # skip pattern placeholder nodes
+        if pattern_node.op == "placeholder":
+            continue
+        new_nodes_map[pattern_node] = graph_node
+    return new_nodes_map
+
+# TODO: this is error prone, use the replace_literals_with_placeholders hack instead
+def _copy_over_literal_conv_args(original_node: Node, new_node: Node):
+    """
+    Copy over literal args in conv, such as stride and padding, from the matched node
+    in the original graph to its replacement in the new graph.
+
+    This is needed due to the following limitation in the subgraph rewriter when used
+    with dynamo export: literal (non-tensor) args are not supported in the match and
+    replacement patterns. This is because dynamo export automatically inlines these
+    literal args, making them dead placeholder nodes. In the future, we should check
+    if dynamo export can optionally disable this inlining, or if subgraph rewriter
+    can do the copying for us. See https://github.com/pytorch/pytorch/issues/100419.
+
+    Note: Unlike other tensor args like conv weights and biases, literal args are
+    preserved in the original nodes after replacement, so we can access them here.
+    """
+    assert _is_conv(original_node)
+    assert _is_conv(new_node)
+    # x, weight, bias, [stride, padding, dilation, transposed, output_padding, groups]
+    new_args = list(new_node.args)
+    if len(new_args) < 3:
+        # bias is optional; when it is not present, it is None
+        new_args.append(None)
+    new_node.args = tuple(new_args[:3]) + original_node.args[3:]
+
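+# For illustration (editor's note): if the original conv node had args
+#     (x, w, b, [2, 2], [1, 1], [1, 1], False, [0, 0], 1)
+# and the replacement conv node only has (x', w', b'), the helper above rewrites the
+# replacement's args to
+#     (x', w', b', [2, 2], [1, 1], [1, 1], False, [0, 0], 1)
+# i.e. the literal stride/padding/dilation/... values are copied over from the original node.
+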
+def _update_conv_input_qspec_map_after_replacement(original_node: Node, replacement_node: Node):
+    """
+    Update the `input_qspec_map` in the annotation after subgraph rewriting.
+
+    The original annotation referred to the nodes in the original graph,
+    so the keys in the `input_qspec_map` will need to be updated to reflect
+    the corresponding nodes in the replacement graph.
+    """
+    assert _is_conv(original_node)
+    assert _is_conv(replacement_node)
+    if "quantization_annotation" not in original_node.meta:
+        return
+    original_input_qspec_map = original_node.meta["quantization_annotation"].input_qspec_map
+    input_qspec_map = {}
+    # get the list of configs; it should be ordered as input, weight, bias
+    # note: this is really hacky; we need a better solution, hopefully in
+    # subgraph_rewriter. Issue tracking the problem: https://github.com/pytorch/pytorch/issues/101820
+    all_configs = list(original_input_qspec_map.items())
+    # input activation
+    input_qspec_map[replacement_node.args[0]] = all_configs[0][1]
+    # weight
+    input_qspec_map[replacement_node.args[1]] = all_configs[1][1]
+    # bias
+    if len(replacement_node.args) > 2 and len(all_configs) > 2:
+        input_qspec_map[replacement_node.args[2]] = all_configs[2][1]
+    replacement_node.meta["quantization_annotation"].input_qspec_map = input_qspec_map
+
+def _update_special_qspecs_after_replacement(
+    node: Node,
+    original_to_replacement_node: Dict[Node, Node],
+):
+    """
+    Update the `SharedQuantizationSpec`s and `DerivedQuantizationSpec`s
+    used in `node`'s quantization annotation after subgraph rewriting.
+
+    The original annotation referred to the nodes in the original graph,
+    so the nodes used in these special quantization specs will need to
+    be updated to the corresponding nodes in the replacement graph.
+    """
+    def _get_new_edge_or_node(edge_or_node: EdgeOrNode):
+        if isinstance(edge_or_node, Node):
+            _node = edge_or_node
+            return original_to_replacement_node.get(_node, _node)
+        elif isinstance(edge_or_node, tuple) and len(edge_or_node) == 2 and all(isinstance(x, Node) for x in edge_or_node):
+            src, dest = edge_or_node
+            return (
+                original_to_replacement_node.get(src, src),
+                original_to_replacement_node.get(dest, dest),
+            )
+        else:
+            raise ValueError("unexpected type for edge_or_node: ", type(edge_or_node))
+
+    def _get_new_qspec(qspec: QuantizationSpecBase):
+        if isinstance(qspec, SharedQuantizationSpec):
+            new_edge_or_node = _get_new_edge_or_node(qspec.edge_or_node)
+            return SharedQuantizationSpec(new_edge_or_node)
+        elif isinstance(qspec, DerivedQuantizationSpec):
+            new_derived_from = [_get_new_edge_or_node(x) for x in qspec.derived_from]
+            return dataclasses.replace(qspec, derived_from=new_derived_from)
+        else:
+            return qspec
+
+    if "quantization_annotation" not in node.meta:
+        return
+    annotation = node.meta["quantization_annotation"]
+    for input_node, qspec in annotation.input_qspec_map.items():
+        annotation.input_qspec_map[input_node] = _get_new_qspec(qspec)
+    annotation.output_qspec = _get_new_qspec(annotation.output_qspec)
+
+def _fuse_conv_bn_qat(m: GraphModule) -> GraphModule:
+    has_bn = any(_is_bn_node(n) for n in m.graph.nodes)
+    if not has_bn:
+        return m
+    m = _fuse_conv_bn_qat_helper(m, F.conv1d, _conv1d_bn_example_inputs, is_cuda=False)
+    m = _fuse_conv_bn_qat_helper(m, F.conv2d, _conv2d_bn_example_inputs, is_cuda=False)
+    if torch.cuda.is_available():
+        m = _fuse_conv_bn_qat_helper(m, F.conv1d, _conv1d_bn_example_inputs, is_cuda=True)
+        m = _fuse_conv_bn_qat_helper(m, F.conv2d, _conv2d_bn_example_inputs, is_cuda=True)
+    return m
+
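+# Editor's note: a minimal sketch of the intended round trip (names taken from this file,
+# the surrounding call sites are assumed):
+#     m = _fuse_conv_bn_qat(m)    # before observer insertion: conv + bn -> fused QAT subgraph
+#     ...                         # insert observers / fake-quants, train or calibrate
+#     m = _fold_conv_bn_qat(m)    # during convert: fold bn weights back into conv
+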
+def _fuse_conv_bn_qat_helper(
+    m: GraphModule,
+    conv_fn: Callable,
+    example_inputs: Tuple[Any, ...],
+    is_cuda: bool,
+) -> GraphModule:
+    """
+    Given a graph of decomposed aten ops, replace the (conv + bn) pattern with
+    the fused QAT subgraph equivalent. The input graph should already be annotated.
+    The annotations in the original nodes will be preserved in the corresponding
+    nodes in the new subgraph.
+
+    Note: This also handles the (conv + bn + relu) pattern.
+    """
+    m.graph.eliminate_dead_code()
+    m.recompile()
+    conv_bn_pattern = _get_conv_bn_pattern(conv_fn)
+    match_pattern = get_aten_graph_module(conv_bn_pattern, example_inputs, is_cuda)
+
+    # Step (1): Replace patterns with conv bias
+    #
+    # Here we do replacement separately for cases with and without conv bias, since
+    # the replacement patterns for these two cases are substantially different.
+    # TODO: use the public replace_pattern API once it also returns replacement nodes
+
+    qat_conv_bn_pattern = _get_qat_conv_bn_pattern(conv_fn)
+    replacement_pattern_with_conv_bias = get_aten_graph_module(
+        qat_conv_bn_pattern,
+        example_inputs,
+        is_cuda,
+    )
+    replacements_with_conv_bias = replace_pattern_with_filters(
+        m,
+        match_pattern,
+        replacement_pattern_with_conv_bias,
+        match_filters=[_has_conv_bias_filter],
+        ignore_literals=True,
+    )
+    m.recompile()
+
+    # Step (2): Replace patterns without conv bias
+
+    qat_conv_bn_pattern_no_conv_bias = _get_qat_conv_bn_pattern_no_conv_bias(conv_fn)
+    replacement_pattern_no_conv_bias = get_aten_graph_module(
+        qat_conv_bn_pattern_no_conv_bias,
+        example_inputs,
+        is_cuda,
+    )
+    replacements_no_conv_bias = replace_pattern_with_filters(
+        m,
+        match_pattern,
+        replacement_pattern_no_conv_bias,
+        match_filters=[_no_conv_bias_filter],
+        ignore_literals=True,
+    )
+    m.recompile()
+
+    # Step (3): Post processing
+    #
+    # Due to limited functionality in the subgraph rewriter, here we manually
+    # update the replacement graph as follows:
+    #
+    #   (a) Copy over metadata from original subgraph. This ensures the stack traces
+    #       and annotations are preserved in the new subgraph
+    #
+    #   (b) Copy over literal args for conv from the original subgraph
+    #       TODO: do this for literal args for batchnorm as well
+    #
+    #   (c) Update all references of the old nodes in the original subgraph to refer
+    #       to the corresponding nodes in the new subgraph in the annotations
+    #
+    # In the future, we should try to push as much of this functionality into the
+    # subgraph rewriter as possible, so we don't have to manually copy anything over.
+    # For more detail, see https://github.com/pytorch/pytorch/issues/100419.
+
+    all_original_to_replacement_nodes = {}
+    for r in replacements_with_conv_bias + replacements_no_conv_bias:
+        for original_node, replacement_node in _get_conv_bn_pattern_nodes(r).values():
+            # Step (3a): Copy over metadata for all nodes in [conv - bn - getitem]
+            replacement_node.meta = original_node.meta
+            if _is_conv(original_node):
+                # Step (3b): Copy over conv literal args
+                _copy_over_literal_conv_args(original_node, replacement_node)
+                # Step (3c): Update old references in the conv node's input_qspec_map
+                _update_conv_input_qspec_map_after_replacement(original_node, replacement_node)
+            all_original_to_replacement_nodes[original_node] = replacement_node
+
+    # Step (3c): Update old references in the special qspecs for all nodes in the graph
+    for n in m.graph.nodes:
+        _update_special_qspecs_after_replacement(n, all_original_to_replacement_nodes)
+
+    return m
+
+def _duplicate_dequantize_node(m: GraphModule):
+    """
+    Helper function to duplicate all dequantize nodes in the graph if the
+    node has more than one user. For example:
+
+    Before:
+      quantize -> dequantize -> a
+                          \\--> b
+                          \\--> c
+
+    After:
+      quantize -> dequantize_1 -> a
+            \\--> dequantize_2 -> b
+            \\--> dequantize_3 -> c
+
+    This is useful for subgraph rewriting. E.g. if we wish to match the
+    pattern [dequantize - a] above, subgraph matching would fail because
+    the dequantize node has users outside the matched portion of the graph.
+    Instead, we match [dequantize_1 - a], which is safe.
+    """
+    dq_op = torch.ops.quantized_decomposed.dequantize_per_tensor
+    for n in m.graph.nodes:
+        if n.op != "call_function" or n.target != dq_op or len(n.users) == 1:
+            continue
+        for user in list(n.users):
+            with m.graph.inserting_before(n):
+                new_node = m.graph.create_node("call_function", dq_op, n.args, n.kwargs)
+            user.replace_input_with(n, new_node)
+        m.graph.erase_node(n)
+    m.recompile()
+
+def _remove_extra_dequantize(m: GraphModule):
+    """
+    Remove duplicate dequantize nodes in the graph: for an operator that has
+    multiple dequantize nodes as users, replace them with a single dequantize
+    node that can be shared across all the uses. This should be seen as the
+    "reverse" of `_duplicate_dequantize_node`.
+    """
+    dq_op = torch.ops.quantized_decomposed.dequantize_per_tensor
+    for n in m.graph.nodes:
+        dq_users = [user for user in n.users if user.op == "call_function" and user.target == dq_op]
+        if len(dq_users) > 1:
+            with m.graph.inserting_after(dq_users[0]):
+                new_node = m.graph.create_node("call_function", dq_op, dq_users[0].args, {})
+            for dq_user in dq_users:
+                dq_user.replace_all_uses_with(new_node)
+                m.graph.erase_node(dq_user)
+    m.recompile()
+
+def _copy_over_q_dq_args(original_node: Node, replacement_node: Node):
+    """
+    Given a pair of quantize or dequantize nodes, copy over all literal args
+    from the original node to the replacement node.
+    """
+    # For quantize_per_tensor, scale and zp are literals and need to be copied
+    # For quantize_per_channel, scale and zp are get_attr nodes and should be skipped
+    assert original_node.target == replacement_node.target
+    if original_node.target in (
+        torch.ops.quantized_decomposed.quantize_per_tensor.default,
+        torch.ops.quantized_decomposed.dequantize_per_tensor.default,
+    ):
+        # Args: input, [scale, zp, qmin, qmax, dtype]
+        start_copy_arg_index = 1
+    elif original_node.target in (
+        torch.ops.quantized_decomposed.quantize_per_channel.default,
+        torch.ops.quantized_decomposed.dequantize_per_channel.default,
+    ):
+        # Args: input, scale, zp, [axis, qmin, qmax, dtype]
+        start_copy_arg_index = 3
+    else:
+        raise ValueError("Expected quantize/dequantize nodes, got '%s'" % original_node.target)
+    replacement_node.args = (
+        replacement_node.args[:start_copy_arg_index] + original_node.args[start_copy_arg_index:]
+    )
+
+def _fold_conv_bn_qat(m: GraphModule) -> GraphModule:
+    has_bn = any(_is_bn_node(n) for n in m.graph.nodes)
+    if not has_bn:
+        return m
+    m = _fold_conv_bn_qat_helper(m, F.conv1d, _quantized_conv1d_bn_example_inputs, is_cuda=False)
+    m = _fold_conv_bn_qat_helper(m, F.conv2d, _quantized_conv2d_bn_example_inputs, is_cuda=False)
+    if torch.cuda.is_available():
+        m = _fold_conv_bn_qat_helper(m, F.conv1d, _quantized_conv1d_bn_example_inputs, is_cuda=True)
+        m = _fold_conv_bn_qat_helper(m, F.conv2d, _quantized_conv2d_bn_example_inputs, is_cuda=True)
+    return m
+
+def _fold_conv_bn_qat_helper(
+    m: GraphModule,
+    conv_fn: Callable,
+    example_inputs: Tuple[Any, ...],
+    is_cuda: bool,
+) -> GraphModule:
+    """
+    Replace the quantized (conv + bn) pattern with a conv whose weights have the bn weights folded in.
+    """
+    m.graph.eliminate_dead_code()
+    m.recompile()
+    _duplicate_dequantize_node(m)
+
+    # Step (1): Replace QAT pattern with simple [conv - bn] pattern
+    replacements = []
+    replacement_options = itertools.product(
+        [True, False],  # is_per_channel
+        [True, False],  # has_bias
+        [True, False],  # bias_is_quantized
+        [True, False],  # bn_is_training
+    )
+    for is_per_channel, has_bias, bias_is_quantized, bn_is_training in replacement_options:
+        # For the cases without bias, `bias_is_quantized` is irrelevant, so here we arbitrarily
+        # filter out one of the values for this flag to avoid having duplicate patterns
+        if not has_bias and bias_is_quantized:
+            continue
+        kwargs = _get_quantized_conv_bn_example_inputs_kwargs(is_per_channel, has_bias, is_cuda)
+        match_pattern = _get_quantized_qat_conv_bn_pattern(
+            is_per_channel, has_bias, bias_is_quantized, conv_fn, bn_is_training
+        )
+        match_pattern = get_aten_graph_module(match_pattern, example_inputs, is_cuda, **kwargs)
+        replacement_pattern = _get_folded_quantized_qat_conv_bn_pattern(
+            is_per_channel, has_bias, bias_is_quantized, conv_fn, bn_is_training
+        )
+        replacement_pattern = get_aten_graph_module(replacement_pattern, example_inputs, is_cuda, **kwargs)
+        replacements.extend(
+            replace_pattern_with_filters(
+                m,
+                match_pattern,
+                replacement_pattern,
+                ignore_literals=True,
+            )
+        )
+    m.recompile()
+    _remove_extra_dequantize(m)
+
+    for r in replacements:
+        node_map = _get_conv_bn_pattern_nodes(r)
+
+        # Step (2): Copy over metadata from original subgraph
+        for original_node, replacement_node in node_map.values():
+            replacement_node.meta = original_node.meta
+
+        # Step (3): Copy over args for weight (and optionally bias) q - dq nodes
+        _copy_over_q_dq_args(*node_map["conv_weight_q"])
+        _copy_over_q_dq_args(*node_map["conv_weight_dq"])
+        if "conv_bias_q" in node_map:
+            assert "conv_bias_dq" in node_map
+            _copy_over_q_dq_args(*node_map["conv_bias_q"])
+            _copy_over_q_dq_args(*node_map["conv_bias_dq"])
+
+        # Step (4): Fold BN weights into conv
+        conv_bias = None
+        (_, conv_node) = node_map["conv"]
+        (_, bn_node) = node_map["bn"]
+        (_, conv_weight) = node_map["conv_weight"]
+        if "conv_bias" in node_map:
+            (_, conv_bias) = node_map["conv_bias"]
+        fold_bn_weights_into_conv_node(conv_node, conv_weight, conv_bias, bn_node, m)
+
+        # Copy over literal args for conv
+        for original_node in _filter_nodes_map(r.nodes_map).values():
+            if _is_conv(original_node):
+                _copy_over_literal_conv_args(original_node, conv_node)
+
+    m.graph.eliminate_dead_code()
+    m.recompile()
+    return m
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/representation/__init__.py b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/representation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..510943b9feb5a6b86af0d6ad7048425f8c018a44
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/representation/__init__.py
@@ -0,0 +1,5 @@
+from .rewrite import reference_representation_rewrite
+
+__all__ = [
+    "reference_representation_rewrite",
+]
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/representation/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/representation/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..79200fb8d840a18062a7b1dfc1ef51ab4e7f57da
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/representation/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/representation/__pycache__/rewrite.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/representation/__pycache__/rewrite.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ce754c009110a22961ef7f8b829d2593ac172f9d
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/representation/__pycache__/rewrite.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/representation/rewrite.py b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/representation/rewrite.py
new file mode 100644
index 0000000000000000000000000000000000000000..68357be3577e057ce88af1e9d07541a96eec16e1
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/representation/rewrite.py
@@ -0,0 +1,600 @@
+import torch
+from torch.fx import GraphModule
+from ..export_utils import _WrapperModule
+from ..utils import (
+    get_aten_graph_module,
+    remove_tensor_overload_for_qdq_ops,
+    _replace_literals_with_new_placeholders,
+    _replace_literals_with_existing_placeholders,
+)
+from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib  # noqa: F401
+from torch.fx.subgraph_rewriter import replace_pattern
+from torch._higher_order_ops.out_dtype import out_dtype
+from typing import Optional, Callable, Tuple, Any
+from dataclasses import dataclass
+
+from functools import partial
+
+__all__ = [
+    "reference_representation_rewrite",
+]
+
+
+_QUANTIZED_LINEAR_EXAMPLE_INPUTS = (
+    torch.randint(-128, 127, (2, 5), dtype=torch.int8),
+    torch.randn(1, dtype=torch.float),
+    torch.zeros(1, dtype=torch.int),
+    torch.tensor([-128], dtype=torch.int),
+    torch.tensor([127], dtype=torch.int),
+    torch.randint(-128, 127, (5, 5), dtype=torch.int8),
+    torch.randn(1, dtype=torch.float),
+    torch.zeros(1, dtype=torch.int),
+    torch.tensor([-127], dtype=torch.int),
+    torch.tensor([127], dtype=torch.int),
+    torch.randn(1, dtype=torch.float),
+    torch.randn(1, dtype=torch.float),
+    torch.zeros(1, dtype=torch.int),
+    torch.tensor([-128], dtype=torch.int),
+    torch.tensor([127], dtype=torch.int),
+)
+
+def _qdq_quantized_linear(
+    x_i8, x_scale, x_zero_point, x_quant_min, x_quant_max,
+    weight_i8, weight_scale, weight_zero_point, weight_quant_min, weight_quant_max,
+    bias_fp32,
+    out_scale, out_zero_point, out_quant_min, out_quant_max
+):
+    x_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor(
+        x_i8, x_scale, x_zero_point, x_quant_min, x_quant_max, torch.int8)
+    weight_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor(
+        weight_i8, weight_scale, weight_zero_point, weight_quant_min, weight_quant_max, torch.int8)
+    out_fp32 = torch.ops.aten.linear.default(x_fp32, weight_fp32, bias_fp32)
+    out_i8 = torch.ops.quantized_decomposed.quantize_per_tensor(
+        out_fp32, out_scale, out_zero_point, out_quant_min, out_quant_max, torch.int8)
+    return out_i8
+
+def _reference_quantized_linear(
+    x_i8, x_scale, x_zero_point, x_quant_min, x_quant_max,
+    weight_i8, weight_scale, weight_zero_point, weight_quant_min, weight_quant_max,
+    bias_fp32,
+    out_scale, out_zero_point, out_quant_min, out_quant_max
+):
+    # Without using quant_min/max in clamp, the traced graph will not have quant_min/max args.
+    # This results in failure to match the pattern.
+    # Therefore, we call torch.ops.aten.clamp here
+    x_i8 = torch.ops.aten.clamp(x_i8, x_quant_min, x_quant_max)
+    weight_i8 = torch.ops.aten.clamp(weight_i8, weight_quant_min, weight_quant_max)
+
+    x_i16 = x_i8.to(torch.int16)
+    weight_i16 = weight_i8.to(torch.int16)
+    # always set bias to None so that the same representation works regardless of
+    # whether bias_scale == x_scale * weight_scale or not
+    acc_i32 = out_dtype(
+        torch.ops.aten.linear.default,
+        torch.int32,
+        x_i16 - x_zero_point,
+        weight_i16 - weight_zero_point,
+        None)
+    # TODO: change to mul.Scalar
+    # Note: we are quantizing bias with these scales without signal from user, but it might be OK
+    bias_scale = x_scale * weight_scale
+    bias_i32 = out_dtype(torch.ops.aten.div.Tensor, torch.int32, bias_fp32, bias_scale)
+    acc_i32 = acc_i32 + bias_i32
+    # TODO: change to mul.Scalar when we make x_scale/weight_scale etc. Scalar values
+    acc_i32 = out_dtype(torch.ops.aten.mul.Tensor, torch.int32, acc_i32, x_scale * weight_scale / out_scale) + out_zero_point
+    out_i8 = torch.ops.aten.clamp(acc_i32, out_quant_min, out_quant_max).to(torch.int8)
+    return out_i8
+
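+# Editor's note: the reference integer math above can be summarized (as a sketch) as
+#     acc_i32 = (x_i16 - x_zp) @ (w_i16 - w_zp).T                     # linear, accumulated in int32
+#     acc_i32 = acc_i32 + int32(bias_fp32 / (x_scale * w_scale))      # bias quantized to int32
+#     out_i8  = clamp(int32(acc_i32 * x_scale * w_scale / out_scale) + out_zp,
+#                     out_quant_min, out_quant_max).to(int8)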
+
+_DYNAMIC_QUANTIZED_LINEAR_EXAMPLE_INPUTS = (
+    torch.randn((2, 5), dtype=torch.float),
+    -128,
+    127,
+    torch.finfo(torch.float32).eps,
+    torch.randint(-128, 127, (5, 5), dtype=torch.int8),
+    torch.randn(1, dtype=torch.float),
+    torch.zeros(1, dtype=torch.int),
+    torch.tensor([-127], dtype=torch.int),
+    torch.tensor([127], dtype=torch.int),
+    torch.randn(1, dtype=torch.float),
+)
+
+
+def _qdq_dynamic_quantized_linear(
+    x_fp32, x_quant_min, x_quant_max, x_eps,
+    weight_i8, weight_scale, weight_zero_point, weight_quant_min, weight_quant_max,
+    bias_fp32,
+):
+    x_scale, x_zero_point = torch.ops.quantized_decomposed.choose_qparams(x_fp32, x_quant_min, x_quant_max, x_eps, torch.int8)
+    x_i8 = torch.ops.quantized_decomposed.quantize_per_tensor(
+        x_fp32, x_scale, x_zero_point, x_quant_min, x_quant_max, torch.int8)
+    x_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor(
+        x_i8, x_scale, x_zero_point, x_quant_min, x_quant_max, torch.int8)
+    weight_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor(
+        weight_i8, weight_scale, weight_zero_point, weight_quant_min, weight_quant_max, torch.int8)
+    out_fp32 = torch.ops.aten.linear.default(x_fp32, weight_fp32, bias_fp32)
+    return out_fp32
+
+def _reference_dynamic_quantized_linear(
+    x_fp32, x_quant_min, x_quant_max, x_eps,
+    weight_i8, weight_scale, weight_zero_point, weight_quant_min, weight_quant_max,
+    bias_fp32,
+):
+    x_scale, x_zero_point = torch.ops.quantized_decomposed.choose_qparams(x_fp32, x_quant_min, x_quant_max, x_eps, torch.int8)
+    # decomposed representation for quantize_per_tensor
+    # TODO: use out_dtype(mul, ...) here when the op is ready
+    x_fp32 = x_fp32 / x_scale  # fp32
+    # rounding modes might differ here
+    # pytorch rounds half to even, which is also common for most backends
+    x_fp32 = torch.round(x_fp32)  # fp32
+    x_i32 = x_fp32.to(dtype=torch.int32)  # int32
+    x_i32 = x_i32 + x_zero_point  # int32
+    # clamp works for fp32, int32 and int8 dtypes
+    x_i32 = torch.clamp(x_i32, x_quant_min, x_quant_max)  # int32
+    x_i8 = x_i32.to(dtype=torch.int8)
+
+    weight_i8 = torch.ops.aten.clamp(weight_i8, weight_quant_min, weight_quant_max)
+
+    x_i16 = x_i8.to(torch.int16)
+    weight_i16 = weight_i8.to(torch.int16)
+    # always set bias to None so that the same representation works regardless of
+    # whether bias_scale == x_scale * weight_scale or not
+    acc_i32 = out_dtype(
+        torch.ops.aten.linear.default,
+        torch.int32,
+        x_i16 - x_zero_point,
+        weight_i16 - weight_zero_point,
+        None)
+    bias_scale = x_scale * weight_scale
+    bias_i32 = out_dtype(torch.ops.aten.div.Tensor, torch.int32, bias_fp32, bias_scale)
+    acc_i32 = acc_i32 + bias_i32
+    out_fp32 = acc_i32 * (x_scale * weight_scale)
+    return out_fp32
+
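+# Editor's note: the "dynamic" variant above differs from the static one in that the input
+# qparams are computed at runtime via choose_qparams, and the output stays fp32 (a sketch):
+#     out_fp32 = (x_scale * w_scale) * ((x_i16 - x_zp) @ (w_i16 - w_zp).T
+#                                       + int32(bias_fp32 / (x_scale * w_scale)))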
+
+_QUANTIZED_CONV2d_EXAMPLE_INPUTS = (
+    torch.randint(-128, 127, (1, 3, 3, 3), dtype=torch.int8),
+    torch.randn(1, dtype=torch.float),
+    torch.zeros(1, dtype=torch.int),
+    torch.tensor([-128], dtype=torch.int),
+    torch.tensor([127], dtype=torch.int),
+    torch.randint(-128, 127, (1, 3, 3, 3), dtype=torch.int8),
+    torch.randn(1, dtype=torch.float),
+    torch.zeros(1, dtype=torch.int),
+    torch.tensor([-127], dtype=torch.int),
+    torch.tensor([127], dtype=torch.int),
+    torch.randn(1, dtype=torch.float),
+    torch.randn(1, dtype=torch.float),
+    torch.zeros(1, dtype=torch.int),
+    torch.tensor([-128], dtype=torch.int),
+    torch.tensor([127], dtype=torch.int),
+)
+
+def _qdq_quantized_conv2d(
+    x_i8, x_scale, x_zero_point, x_quant_min, x_quant_max,
+    weight_i8, weight_scale, weight_zero_point, weight_quant_min, weight_quant_max,
+    bias_fp32,
+    out_scale, out_zero_point, out_quant_min, out_quant_max
+):
+    stride = [1, 1]
+    padding = [0, 0]
+    dilation = [1, 1]
+    transposed = False
+    output_padding = [0, 0]
+    groups = 1
+    x_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor(
+        x_i8, x_scale, x_zero_point, x_quant_min, x_quant_max, torch.int8)
+    weight_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor(
+        weight_i8, weight_scale, weight_zero_point, weight_quant_min, weight_quant_max, torch.int8)
+    out_fp32 = torch.ops.aten.convolution.default(
+        x_fp32, weight_fp32, bias_fp32, stride, padding, dilation, transposed, output_padding, groups)
+    out_i8 = torch.ops.quantized_decomposed.quantize_per_tensor(
+        out_fp32, out_scale, out_zero_point, out_quant_min, out_quant_max, torch.int8)
+    return out_i8
+
+def _reference_quantized_conv2d(
+    x_i8, x_scale, x_zero_point, x_quant_min, x_quant_max,
+    weight_i8, weight_scale, weight_zero_point, weight_quant_min, weight_quant_max,
+    bias_fp32,
+    out_scale, out_zero_point, out_quant_min, out_quant_max
+):
+    stride = [1, 1]
+    padding = [0, 0]
+    dilation = [1, 1]
+    transposed = False
+    output_padding = [0, 0]
+    groups = 1
+    # Without using quant_min/max in clamp, the traced graph will not have quant_min/max args.
+    # This results in failure to match the pattern.
+    # Therefore, we call torch.ops.aten.clamp here
+    x_i8 = torch.ops.aten.clamp(x_i8, x_quant_min, x_quant_max)
+    weight_i8 = torch.ops.aten.clamp(weight_i8, weight_quant_min, weight_quant_max)
+
+    x_i16 = x_i8.to(torch.int16)
+    weight_i16 = weight_i8.to(torch.int16)
+    # always set bias to None so that the same representation works regardless of
+    # whether bias_scale == x_scale * weight_scale or not
+    acc_i32 = out_dtype(
+        torch.ops.aten.convolution.default,
+        torch.int32,
+        x_i16 - x_zero_point,
+        weight_i16 - weight_zero_point,
+        None, stride, padding, dilation, transposed, output_padding, groups)
+    # Note: we are quantizing bias with these scales without signal from user, but it might be OK
+    bias_scale = x_scale * weight_scale
+    # bias quantization to int32 uses bias_scale = x_scale * weight_scale due to:
+    # Take the linear calculation for example
+    # Out_(i, j)_fp32 = Sum_(over k)[X_(i, k)_fp32 * W_(j, k)_fp32] + bias_(j)_fp32
+    # Represent X, W fp32 via their dequant transforms
+    # A_fp32 = (A_q - A_zero_point) * A_scale
+    # Out_(i, j)_fp32 = Sum_(over k)[(X_(i, k)_q - X_zp) * X_scale * (W_(j, k)_q - W_zp) * W_scale] + bias_(j)_fp32
+    # Factor out X_scale and W_scale
+    # Out_(i, j)_fp32 = ((X_scale * W_scale) * Sum_(over k)[(X_(i, k)_q - X_zp) * (W_(j, k)_q - W_zp)]) + bias_(j)_fp32
+    # In order to move the addition of bias_(j)_fp32 inside the scaled sum, we must write
+    # Out_(i, j)_fp32 = (X_scale * W_scale) * (Sum_(over k)[(X_(i, k)_q - X_zp) * (W_(j, k)_q - W_zp)] + (1 / (X_scale * W_scale)) * bias_(j)_fp32)  # noqa: B950
+    # Note we had to scale bias_fp32 by 1 / (X_scale * W_scale) = 1 / bias_scale
+    # Thus bias quantization to int32 must use the scale X_scale * W_scale
+
+    bias_i32 = out_dtype(torch.ops.aten.div.Tensor, torch.int32, bias_fp32, bias_scale)
+    # Unsqueeze to match broadcast dims
+    # Unfortunately we cannot do bias_i32.unsqueeze(0) due to literal-matching issues
+    # in graph pattern replacement
+    bias_i32 = bias_i32.unsqueeze(-1)
+    bias_i32 = bias_i32.unsqueeze(-1)
+    acc_i32 = acc_i32 + bias_i32
+    # TODO: change to mul.Scalar when we make x_scale/weight_scale etc. Scalar values
+    acc_i32 = out_dtype(
+        torch.ops.aten.mul.Tensor, torch.int32, acc_i32, x_scale * weight_scale / out_scale) + out_zero_point
+    out_i8 = torch.ops.aten.clamp(acc_i32, out_quant_min, out_quant_max).to(torch.int8)
+    return out_i8
+
+
+_QUANTIZED_ADD_OR_ADD_RELU_EXAMPLE_INPUTS = (
+    torch.randint(-128, 127, (1, 3, 3, 3), dtype=torch.int8),
+    torch.randn(1, dtype=torch.float),
+    torch.zeros(1, dtype=torch.int),
+    torch.randint(-128, 127, (1, 3, 3, 3), dtype=torch.int8),
+    torch.randn(1, dtype=torch.float),
+    torch.zeros(1, dtype=torch.int),
+    torch.randn(1, dtype=torch.float),
+    torch.zeros(1, dtype=torch.int),
+    torch.tensor([-128], dtype=torch.int),
+    torch.tensor([127], dtype=torch.int),
+)
+
+def _qdq_quantized_add_relu(
+    x_i8, x_scale, x_zero_point, y_i8, y_scale, y_zero_point,
+    out_scale, out_zero_point, quant_min, quant_max
+):
+    x_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor(x_i8, x_scale, x_zero_point, quant_min, quant_max, torch.int8)
+    y_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor(y_i8, y_scale, y_zero_point, quant_min, quant_max, torch.int8)
+    out_fp32 = x_fp32 + y_fp32
+    out_fp32 = torch.ops.aten.relu(out_fp32)
+    out_i8 = torch.ops.quantized_decomposed.quantize_per_tensor(
+        out_fp32, out_scale, out_zero_point, quant_min, quant_max, torch.int8
+    )
+    return out_i8
+
+def _reference_quantized_add_relu(
+    x_i8, x_scale, x_zero_point, y_i8, y_scale, y_zero_point,
+    out_scale, out_zero_point, quant_min, quant_max
+):
+    """
+    See comments for `_reference_quantized_add` for more information on
+    how to derive the formula for out_i8 based on x_i8 and y_i8
+    """
+    x_i32 = x_i8.to(torch.int32)
+    y_i32 = y_i8.to(torch.int32)
+    # TODO: change this to mul.Scalar?
+    x_i32 = out_dtype(torch.ops.aten.mul.Tensor, torch.int32, (x_i32 - x_zero_point), (x_scale / out_scale))
+    y_i32 = out_dtype(torch.ops.aten.mul.Tensor, torch.int32, (y_i32 - y_zero_point), (y_scale / out_scale))
+    out_i32 = x_i32 + y_i32 + out_zero_point
+    # out_i32 = torch.ops.aten.clamp(out_i32, out_zero_point)
+    out_i8 = torch.ops.aten.clamp(out_i32, out_zero_point, quant_max).to(torch.int8)
+    return out_i8
+
+def _qdq_quantized_add(x_i8, x_scale, x_zero_point, y_i8, y_scale, y_zero_point, out_scale, out_zero_point, quant_min, quant_max):
+    x_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor(x_i8, x_scale, x_zero_point, quant_min, quant_max, torch.int8)
+    y_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor(y_i8, y_scale, y_zero_point, quant_min, quant_max, torch.int8)
+    out_fp32 = x_fp32 + y_fp32
+    out_i8 = torch.ops.quantized_decomposed.quantize_per_tensor(
+        out_fp32, out_scale, out_zero_point, quant_min, quant_max, torch.int8
+    )
+    return out_i8
+
+def _reference_quantized_add(
+    x_i8, x_scale, x_zero_point, y_i8, y_scale, y_zero_point,
+    out_scale, out_zero_point, quant_min, quant_max
+):
+    """
+    # How to derive the formula for out_i8 based on x_i8 and y_i8
+    # (since quantized add takes x_i8, y_i8 and their quantization parameters, and produces an out_i8)
+
+    # out_i8 is the quantized output, so we can write down the formula for it first:
+    out_i8 = out_fp32 / out_scale + out_zero_point           (1)
+
+    # out_fp32 is computed from x_fp32 + y_fp32, where x_fp32 and y_fp32 are the dequantized x_i8 and y_i8:
+    out_fp32 = x_fp32 + y_fp32           (2)
+    x_fp32 = (x_i8 - x_zero_point) * x_scale         (3)
+    y_fp32 = (y_i8 - y_zero_point) * y_scale         (4)
+
+    # applying the above formulas to the out_i8 equation we get:
+    out_i8 = out_fp32 / out_scale + out_zero_point                                                             # (1)
+           = (x_fp32 + y_fp32) / out_scale + out_zero_point                                                    # substitute (2)
+           = ((x_i8 - x_zero_point) * x_scale + (y_i8 - y_zero_point) * y_scale) / out_scale + out_zero_point  # apply (3) and (4)
+    """
+    x_i32 = x_i8.to(torch.int32)
+    y_i32 = y_i8.to(torch.int32)
+    # TODO: use out_dtype op
+    x_i32 = torch.round((x_scale / out_scale) * (x_i32 - x_zero_point)).to(torch.int32)
+    y_i32 = torch.round((y_scale / out_scale) * (y_i32 - y_zero_point)).to(torch.int32)
+    out_i32 = x_i32 + y_i32 + out_zero_point
+    quant_min = -128
+    quant_max = 127
+    out_i8 = torch.ops.aten.clamp(out_i32, quant_min, quant_max).to(torch.int8)
+    return out_i8
+
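+# Worked example (editor's note): with x_i8=10, x_scale=0.5, x_zp=0, y_i8=20, y_scale=0.25,
+# y_zp=0, out_scale=0.5 and out_zp=0:
+#     x term: round((0.5 / 0.5) * 10) = 10
+#     y term: round((0.25 / 0.5) * 20) = 10
+#     out_i8 = clamp(10 + 10 + 0, -128, 127) = 20
+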
+_QUANTIZED_MAX_POOL2D_EXAMPLE_INPUTS = (
+    torch.randint(-128, 127, (1, 3, 3, 3), dtype=torch.int8),
+    torch.randn(1, dtype=torch.float),
+    torch.zeros(1, dtype=torch.int),
+    torch.tensor([-128], dtype=torch.int),
+    torch.tensor([127], dtype=torch.int),
+    torch.randn(1, dtype=torch.float),
+    torch.zeros(1, dtype=torch.int),
+    torch.tensor([-128], dtype=torch.int),
+    torch.tensor([127], dtype=torch.int),
+)
+
+def _qdq_quantized_max_pool2d(
+        x_i8, x_scale, x_zero_point, x_quant_min, x_quant_max, out_scale, out_zero_point, out_quant_min, out_quant_max):
+    kernel_size = 1
+    stride = 1
+    padding = 0
+    dilation = 1
+    ceil_mode = False
+    x_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor(x_i8, x_scale, x_zero_point, x_quant_min, x_quant_max, torch.int8)
+    out_fp32, _ = torch.ops.aten.max_pool2d_with_indices.default(x_fp32, kernel_size, stride, padding, dilation, ceil_mode)
+    out_i8 = torch.ops.quantized_decomposed.quantize_per_tensor(
+        out_fp32, out_scale, out_zero_point, out_quant_min, out_quant_max, torch.int8)
+    return out_i8
+
+def _reference_quantized_max_pool2d(
+        x_i8, x_scale, x_zero_point, x_quant_min, x_quant_max, out_scale, out_zero_point, out_quant_min, out_quant_max):
+    kernel_size = 1
+    stride = 1
+    padding = 0
+    dilation = 1
+    ceil_mode = False
+    # to preserve x_quant_min, x_quant_max in the graph for pattern matching
+    x_i8 = torch.clamp(x_i8, x_quant_min, x_quant_max)
+    x_i32 = x_i8.to(torch.int32)
+    out_i32, _ = torch.ops.aten.max_pool2d_with_indices.default(
+        x_i32 - x_zero_point,
+        kernel_size,
+        stride,
+        padding,
+        dilation,
+        ceil_mode
+    )
+    out_fp32 = out_i32 * (x_scale / out_scale) + out_zero_point
+    out_fp32 = torch.clamp(out_fp32, out_quant_min, out_quant_max)
+    out_i8 = out_fp32.to(torch.int8)
+    return out_i8
+
+_QUANTIZE_PER_TENSOR_INT8_EXAMPLE_INPUTS = (
+    torch.randn(1, 3, 3, 3, dtype=torch.float),
+    torch.randn(1, dtype=torch.float),
+    torch.zeros(1, dtype=torch.int),
+    torch.tensor([-128], dtype=torch.int),
+    torch.tensor([127], dtype=torch.int),
+)
+
+def _quantize_per_tensor_int8(x_fp32, scale, zero_point, quant_min, quant_max):
+    x = torch.ops.quantized_decomposed.quantize_per_tensor(x_fp32, scale, zero_point, quant_min, quant_max, torch.int8)
+    return x
+
+def _reference_quantize_per_tensor_int8(x_fp32, scale, zero_point, quant_min, quant_max):
+    # TODO: use out_dtype(mul, ...) here when the op is ready
+    x = x_fp32 / scale  # fp32
+    # rounding modes might differ between backends here;
+    # PyTorch rounds half to even, which is also what most backends do
+    x = torch.round(x)  # fp32
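+    # e.g. torch.round(torch.tensor([0.5, 1.5, 2.5])) gives tensor([0., 2., 2.]) (half-to-even),
+    # whereas a backend rounding half away from zero would give 1., 2., 3.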
+    x = x.to(dtype=torch.int32)  # int32
+    x = x + zero_point  # int32
+    # clamp works for fp32, int32 and int8 dtypes
+    x = torch.clamp(x, quant_min, quant_max)  # int32
+    x = x.to(dtype=torch.int8)
+    return x
+
+_DEQUANTIZE_PER_TENSOR_INT8_EXAMPLE_INPUTS = (
+    torch.randint(-128, 127, (1, 3, 3, 3), dtype=torch.int8),
+    torch.randn(1, dtype=torch.float),
+    torch.zeros(1, dtype=torch.int),
+    torch.tensor([-128], dtype=torch.int),
+    torch.tensor([127], dtype=torch.int),
+)
+
+def _dequantize_per_tensor_int8(x_i8, scale, zero_point, quant_min, quant_max):
+    x_fp32 = torch.ops.quantized_decomposed.dequantize_per_tensor(x_i8, scale, zero_point, quant_min, quant_max, torch.int8)
+    return x_fp32
+
+def _reference_dequantize_per_tensor_int8(x_i8, scale, zero_point, quant_min, quant_max):
+    # without using quant_min/quant_max in the clamp, the traced graph will not have the quant_min/quant_max args.
+    # This results in a failure to match the pattern.
+    # Therefore, we call torch.ops.aten.clamp here
+    x_i8 = torch.ops.aten.clamp(x_i8, quant_min, quant_max)
+    # TODO: use out_dtype op
+    # note: x_i8.to(torch.int32) does not work here
+    # TODO: debug the implementation later when the torchdynamo timeout issue is resolved
+    return ((x_i8.to(torch.float32) - zero_point) * scale).to(dtype=torch.float32)
+
+_QUANTIZE_PER_CHANNEL_INT8_EXAMPLE_INPUTS = (
+    torch.randn(1, 3, 3, 3, dtype=torch.float),
+    torch.randn(3, dtype=torch.float),
+    torch.zeros(3, dtype=torch.int),
+    1,
+    -128,
+    127,
+)
+
+def _quantize_per_channel_int8(x_fp32, scales, zero_points, ch_axis, quant_min, quant_max):
+    out_i8 = torch.ops.quantized_decomposed.quantize_per_channel(
+        x_fp32, scales, zero_points, ch_axis, quant_min, quant_max, torch.int8
+    )
+    return out_i8
+
+def _reference_quantize_per_channel_int8(x_fp32, scales, zero_points, ch_axis, quant_min, quant_max):
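+    # transposing ch_axis to the last dimension lets the 1-D per-channel scales/zero_points
+    # broadcast against x_fp32 without any explicit reshaping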
+    x_fp32 = torch.transpose(x_fp32, ch_axis, -1)
+    out_i32 = torch.ops.aten.clamp(torch.round(x_fp32 / scales).to(torch.int32) + zero_points, quant_min, quant_max)
+    out_i32 = torch.transpose(out_i32, ch_axis, -1)
+    return out_i32.to(torch.int8)
+
+_DEQUANTIZE_PER_CHANNEL_INT8_EXAMPLE_INPUTS = (
+    torch.randint(-128, 127, (1, 3, 3, 3), dtype=torch.int8),
+    torch.randn(3, dtype=torch.float),
+    torch.zeros(3, dtype=torch.int),
+    1,
+    -128,
+    127,
+)
+
+def _dequantize_per_channel_int8(x_i8, scales, zero_points, ch_axis, quant_min, quant_max):
+    # the ch_axis/quant_min/quant_max literals below will later be replaced with placeholders
+    out_fp32 = torch.ops.quantized_decomposed.dequantize_per_channel(
+        x_i8, scales, zero_points, ch_axis, quant_min, quant_max, torch.int8
+    )
+    return out_fp32
+
+def _reference_dequantize_per_channel_int8(x_i8, scales, zero_points, ch_axis, quant_min, quant_max):
+    # the ch_axis/quant_min/quant_max literals below will later be replaced with placeholders;
+    # in order to preserve the quant_min/quant_max args for pattern matching (e.g. matching for int4 quantized ops)
+    # we call torch.ops.aten.clamp here
+    x_i8 = torch.ops.aten.clamp(x_i8, quant_min, quant_max)
+    x_i8 = torch.transpose(x_i8, ch_axis, -1)
+    x_i32 = x_i8.to(torch.int32)
+    out_fp32 = (x_i32 - zero_points).to(torch.float) * scales
+    out_fp32 = torch.transpose(out_fp32, ch_axis, -1)
+    return out_fp32
+
+def _replace_ph_qdq_per_channel_replacement(gm: torch.fx.GraphModule):
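+    # the indices follow the placeholder order of the per-channel example inputs above:
+    # ch_axis literal 1 -> placeholder 3, quant_min -128 -> 4, quant_max 127 -> 5
+    # (the scale/zero_point tensors are already placeholders 1 and 2)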
+    return _replace_literals_with_existing_placeholders(
+        gm,
+        exclude_literals=[-1],
+        literal_to_ph_idx={1: 3, -128: 4, 127: 5}
+    )
+
+
+@dataclass
+class _RewriteInfo:
+    """Data needed for rewrite, this includes example inputs, pattern and replacement functions
+    and post transformation functions for the exported pattern and replacement GraphModule
+    """
+
+    # example inputs used for exporting the pattern into GraphModule
+    example_inputs: Tuple[Any, ...]
+    pattern: Callable
+    replacement: Callable
+    # post transformation on the exported pattern and replacement GraphModule
+    pattern_post_trans: Optional[Callable[[GraphModule], GraphModule]] = None
+    replacement_post_trans: Optional[Callable[[GraphModule], GraphModule]] = None
+
+_REWRITE_INFO_LIST = [
+    _RewriteInfo(
+        _DYNAMIC_QUANTIZED_LINEAR_EXAMPLE_INPUTS,
+        _WrapperModule(_qdq_dynamic_quantized_linear),
+        _WrapperModule(_reference_dynamic_quantized_linear),
+        partial(
+            _replace_literals_with_existing_placeholders,
+            literal_to_ph_idx={
+                -128: 1,
+                127: 2,
+                torch.finfo(torch.float32).eps: 3
+            }
+        ),
+        partial(
+            _replace_literals_with_existing_placeholders,
+            literal_to_ph_idx={
+                -128: 1,
+                127: 2,
+                torch.finfo(torch.float32).eps: 3
+            }
+        ),
+    ),
+    _RewriteInfo(
+        _QUANTIZED_LINEAR_EXAMPLE_INPUTS,
+        _WrapperModule(_qdq_quantized_linear),
+        _WrapperModule(_reference_quantized_linear),
+        _replace_literals_with_new_placeholders,
+        _replace_literals_with_new_placeholders,
+    ),
+    _RewriteInfo(
+        _QUANTIZED_CONV2d_EXAMPLE_INPUTS,
+        _WrapperModule(_qdq_quantized_conv2d),
+        _WrapperModule(_reference_quantized_conv2d),
+        partial(_replace_literals_with_new_placeholders, exclude_literals=[-1]),
+        partial(_replace_literals_with_new_placeholders, exclude_literals=[-1]),
+    ),
+    _RewriteInfo(
+        _QUANTIZED_ADD_OR_ADD_RELU_EXAMPLE_INPUTS,
+        _WrapperModule(_qdq_quantized_add_relu),
+        _WrapperModule(_reference_quantized_add_relu),
+    ),
+    _RewriteInfo(
+        _QUANTIZED_ADD_OR_ADD_RELU_EXAMPLE_INPUTS,
+        _WrapperModule(_qdq_quantized_add),
+        _WrapperModule(_reference_quantized_add),
+    ),
+    _RewriteInfo(
+        _QUANTIZED_MAX_POOL2D_EXAMPLE_INPUTS,
+        _WrapperModule(_qdq_quantized_max_pool2d),
+        _WrapperModule(_reference_quantized_max_pool2d),
+        _replace_literals_with_new_placeholders,
+        _replace_literals_with_new_placeholders
+    ),
+    _RewriteInfo(
+        _QUANTIZE_PER_TENSOR_INT8_EXAMPLE_INPUTS,
+        _WrapperModule(_quantize_per_tensor_int8),
+        _WrapperModule(_reference_quantize_per_tensor_int8),
+    ),
+    _RewriteInfo(
+        _DEQUANTIZE_PER_TENSOR_INT8_EXAMPLE_INPUTS,
+        _WrapperModule(_dequantize_per_tensor_int8),
+        _WrapperModule(_reference_dequantize_per_tensor_int8),
+    ),
+    _RewriteInfo(
+        _QUANTIZE_PER_CHANNEL_INT8_EXAMPLE_INPUTS,
+        _WrapperModule(_quantize_per_channel_int8),
+        _WrapperModule(_reference_quantize_per_channel_int8),
+        _replace_ph_qdq_per_channel_replacement,
+        _replace_ph_qdq_per_channel_replacement
+    ),
+    _RewriteInfo(
+        _DEQUANTIZE_PER_CHANNEL_INT8_EXAMPLE_INPUTS,
+        _WrapperModule(_dequantize_per_channel_int8),
+        _WrapperModule(_reference_dequantize_per_channel_int8),
+        _replace_ph_qdq_per_channel_replacement,
+        _replace_ph_qdq_per_channel_replacement
+    ),
+]
+
+def reference_representation_rewrite(model: GraphModule) -> GraphModule:
+    remove_tensor_overload_for_qdq_ops(model)
+    for rewrite_info in _REWRITE_INFO_LIST:
+        example_inputs = rewrite_info.example_inputs
+        pattern = rewrite_info.pattern
+        replacement = rewrite_info.replacement
+        pattern_post_trans = rewrite_info.pattern_post_trans
+        replacement_post_trans = rewrite_info.replacement_post_trans
+        pattern = get_aten_graph_module(pattern, example_inputs)  # type: ignore[arg-type, assignment]
+        remove_tensor_overload_for_qdq_ops(pattern)  # type: ignore[arg-type]
+        replacement = get_aten_graph_module(replacement, example_inputs)  # type: ignore[arg-type, assignment]
+        remove_tensor_overload_for_qdq_ops(replacement)  # type: ignore[arg-type]
+        if pattern_post_trans:
+            pattern = pattern_post_trans(pattern)
+        if replacement_post_trans:
+            replacement = replacement_post_trans(replacement)
+        pattern.recompile()  # type: ignore[attr-defined]
+        replacement.recompile()  # type: ignore[attr-defined]
+        matches = replace_pattern(model, pattern, replacement)
+    return model
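+
+# Example (sketch): assuming `quantized_gm` is a GraphModule produced by the PT2E convert flow,
+# the rewrite mutates it in place and also returns it:
+#
+#   quantized_gm = reference_representation_rewrite(quantized_gm)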
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/utils.py b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7cbb8c8c8f4f44db255a22e143a56b6d0e12766
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/pt2e/utils.py
@@ -0,0 +1,540 @@
+import operator
+import types
+
+import torch
+from torch._export import capture_pre_autograd_graph
+from torch.fx import (
+    GraphModule,
+    Node,
+)
+from torch.nn.utils.fusion import fuse_conv_bn_weights
+from typing import Any, Callable, Dict, Optional, Tuple, List, Union
+from torch.utils._pytree import LeafSpec
+from torch.export.unflatten import _AttrKind, _assign_attr
+
+# Makes sure that quantized_decomposed ops are registered
+from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib  # noqa: F401
+
+from torch.ao.quantization.quantizer import QuantizationAnnotation
+
+
+__all__ = [
+    "fold_bn_weights_into_conv_node",
+    "get_aten_graph_module",
+    "remove_tensor_overload_for_qdq_ops",
+]
+
+_QUANTIZE_OPS = [
+    torch.ops.quantized_decomposed.quantize_per_tensor.default,
+    torch.ops.quantized_decomposed.quantize_per_tensor.tensor,
+    torch.ops.quantized_decomposed.quantize_per_channel.default,
+]
+
+
+_DEQUANTIZE_OPS = [
+    torch.ops.quantized_decomposed.dequantize_per_tensor.default,
+    torch.ops.quantized_decomposed.dequantize_per_tensor.tensor,
+    torch.ops.quantized_decomposed.dequantize_per_channel.default,
+]
+
+# Example inputs for conv-bn1d patterns
+_conv1d_bn_example_inputs = (
+    torch.randn(1, 1, 3),  # x
+    torch.randn(1, 1, 1),  # conv_weight
+    torch.randn(1),        # conv_bias
+    torch.randn(1),        # bn_weight
+    torch.randn(1),        # bn_bias
+    torch.randn(1),        # bn_running_mean
+    torch.randn(1),        # bn_running_var
+)
+
+# Example inputs for conv-bn2d patterns
+_conv2d_bn_example_inputs = (
+    torch.randn(1, 1, 3, 3),  # x
+    torch.randn(1, 1, 1, 1),  # conv_weight
+    torch.randn(1),           # conv_bias
+    torch.randn(1),           # bn_weight
+    torch.randn(1),           # bn_bias
+    torch.randn(1),           # bn_running_mean
+    torch.randn(1),           # bn_running_var
+)
+
+def _is_connected(source: torch.fx.Node, dest: torch.fx.Node) -> bool:
+    """
+    Assuming `dest` is one of the ops inserted by the quantization workflow, this function
+    checks whether `source` and `dest` are connected. The assumption is that only
+    quantization-workflow-inserted ops exist between `source` and `dest`.
+    """
+    quant_workflow_ops = _QUANTIZE_OPS + _DEQUANTIZE_OPS
+    quant_workflow_ops.append(torch.ops.quantized_decomposed.choose_qparams.tensor)
+    while dest.target in quant_workflow_ops:
+        if not isinstance(dest.args[0], torch.fx.Node):
+            raise ValueError(f"expected arg[0] of quant workflow ops to be a node but found {dest.args[0]}")
+        dest = dest.args[0]
+    return (dest == source)
+
+
+def _find_q_dq_node_for_user(
+    producer: torch.fx.Node, user: torch.fx.Node
+) -> Tuple[Any, Any]:
+    """
+    Find the q, dq pair corresponding to [producer -> q -> dq -> user].
+    This works by finding the dq arg of `user` and ensuring it is connected to
+    `producer`.
+    """
+    dq_node = None
+    for n in user.args:
+        if isinstance(n, torch.fx.Node) and n.op == "call_function" and n.target in _DEQUANTIZE_OPS:
+            if _is_connected(producer, n):
+                dq_node = n
+                break
+    if dq_node is None:
+        for n in user.kwargs:
+            if isinstance(n, torch.fx.Node) and n.op == "call_function" and n.target in _DEQUANTIZE_OPS:
+                if _is_connected(producer, n):
+                    dq_node = n
+                    break
+    if dq_node is None:
+        return (None, None)
+
+    q_node = None
+    if dq_node.args[0].op == "call_function" and dq_node.args[0].target in _QUANTIZE_OPS:
+        q_node = dq_node.args[0]
+    return (q_node, dq_node)
+
+
+
+def _is_sym_size_node(node: Node):
+    return (
+        node.op == "call_function"
+        and node.target == torch.ops.aten.sym_size.default
+        or node.target == torch.ops.aten.sym_numel.default
+        or node.target == torch.ops.aten.sym_numel
+        or node.target == torch.ops.aten.sym_size
+    )
+
+
+def _filter_sym_size_users(node: torch.fx.Node) -> List[torch.fx.Node]:
+    node_users = [n for n in node.users if not _is_sym_size_node(n)]
+    return node_users
+
+
+def _is_valid_annotation(annotation: QuantizationAnnotation) -> bool:
+    if annotation is None:
+        return False
+    input_qspec_map = annotation.input_qspec_map
+    output_qspec = annotation.output_qspec
+    if len(input_qspec_map) == 0 and output_qspec is None:
+        return False
+    return True
+
+
+def _get_tensor_constant_from_node(node, m):
+    if node is None:
+        return None
+    assert node.op == "get_attr"
+    target_atoms = node.target.split('.')
+    attr_itr = m
+    for i, atom in enumerate(target_atoms):
+        if not hasattr(attr_itr, atom):
+            raise RuntimeError(f"Node referenced nonexistent target {'.'.join(target_atoms[:i])}")
+        attr_itr = getattr(attr_itr, atom)
+    return attr_itr
+
+def _get_all_arguments(orig_args, orig_kwargs, args_schema):
+    all_args = []
+    for i, schema in enumerate(args_schema):
+        if schema.name in orig_kwargs:
+            all_args.append(orig_kwargs[schema.name])
+        elif not schema.kwarg_only and i < len(orig_args):
+            all_args.append(orig_args[i])
+        else:
+            all_args.append(schema.default_value)
+    return all_args
+
+def _is_supported_batch_norm_for_training(node: Node):
+    """
+    Return True if the given node refers to an aten batch norm op QAT supports.
+    """
+    supported_ops = [
+        torch.ops.aten._native_batch_norm_legit.default,
+        # Note: we won't need this op anymore after batch norm consolidation
+        # For now, we need to continue to support it because it gives better
+        # training numerics than `_native_batch_norm_legit`
+        torch.ops.aten.cudnn_batch_norm.default,
+        torch.ops.aten.miopen_batch_norm.default,
+    ]
+    return node.target in supported_ops
+
+# TODO: rename this to _is_conv_node
+def _is_conv(n: Node):
+    """
+    Return whether the node refers to an aten conv op.
+    """
+    return n.op == "call_function" and n.target in [
+        torch.ops.aten.conv1d.default,
+        torch.ops.aten.conv2d.default,
+    ]
+
+# TODO: rename this to _is_conv_transpose_node
+def _is_conv_transpose(n: Node):
+    """
+    Return whether the node refers to an aten conv_transpose op.
+    """
+    return n.op == "call_function" and n.target in [
+        torch.ops.aten.conv_transpose1d,
+        torch.ops.aten.conv_transpose2d,
+    ]
+
+def _is_bn_node(n: Node):
+    return _is_supported_batch_norm_for_training(n) or n.target == torch.ops.aten._native_batch_norm_legit_no_training.default
+
+def fold_bn_weights_into_conv_node(
+    conv_node: Node,
+    conv_weight_node: Node,
+    conv_bias_node: Optional[Node],
+    bn_node: Node,
+    m: GraphModule
+) -> None:
+    # conv args: input, weight, bias, stride, padding, dilation, ...
+    conv_w = _get_tensor_constant_from_node(conv_weight_node, m)
+    conv_b = _get_tensor_constant_from_node(conv_bias_node, m)
+    transpose = _is_conv_transpose(conv_node)
+
+    # eval bn args: input, weight, bias, running mean, running var, momentum, eps
+    # train bn args: input, weight, bias, running mean, running var, training, momentum, eps
+    bn_args_schema = bn_node.target._schema.arguments  # type: ignore[union-attr]
+    bn_args = _get_all_arguments(bn_node.args, bn_node.kwargs, bn_args_schema)
+    bn_w = _get_tensor_constant_from_node(bn_args[1], m)
+    bn_b = _get_tensor_constant_from_node(bn_args[2], m)
+    bn_rm = _get_tensor_constant_from_node(bn_args[3], m)
+    bn_rv = _get_tensor_constant_from_node(bn_args[4], m)
+    if bn_node.target == torch.ops.aten._native_batch_norm_legit_no_training.default:
+        eps_arg_index = 6
+    elif _is_supported_batch_norm_for_training(bn_node):
+        eps_arg_index = 7
+    else:
+        raise ValueError("BN node target is unexpected ", bn_node.target)
+    bn_eps = bn_args[eps_arg_index]
+
+    fused_weight, fused_bias = fuse_conv_bn_weights(conv_w, conv_b, bn_rm, bn_rv, bn_eps, bn_w, bn_b, transpose=transpose)
+
+    # update the weight and bias for conv
+    conv_args = list(conv_node.args)
+    # filling in the default bias argument
+    if len(conv_args) == 2:
+        conv_args.append(None)
+
+    # fused_weight and fused_bias are nn.Parameters; assign them back onto the module
+    weight_attr_name = conv_weight_node.target
+    assert isinstance(weight_attr_name, str)
+    _assign_attr(fused_weight, m, weight_attr_name, _AttrKind.PARAMETER)
+    if conv_bias_node is not None:
+        bias_attr_name = conv_bias_node.target
+        _assign_attr(fused_bias, m, str(bias_attr_name), _AttrKind.PARAMETER)
+    else:
+        bias_attr_name = weight_attr_name + "_bias"
+        _assign_attr(fused_bias, m, bias_attr_name, _AttrKind.PARAMETER)
+        with m.graph.inserting_before(conv_node):
+            get_bias_node = m.graph.get_attr(bias_attr_name)
+        # NOTE: here we assume the bias of conv is not quantized!
+        conv_args[2] = get_bias_node
+    conv_node.args = tuple(conv_args)
+
+    # native_batch_norm has 3 outputs, we expect getitem calls on the output
+    # and we want to replace the uses of getitem 0 with the output of conv
+    #
+    # Before:
+    # conv -> bn - (first output) -> users1
+    #          \ - (second output) -> users2
+    #          \ - (third output) -> users3
+    # After:
+    # conv -> (first output) -> users1
+    #       bn -
+    #          \ - (second output) -> users2
+    #          \ - (third output) -> users3
+    # if users2 and users3 are empty then bn will be removed through dead code elimination
+
+    for user in bn_node.users:
+        if user.op != "call_function" or user.target != operator.getitem or user.args[1] != 0:
+            continue
+        user.replace_all_uses_with(conv_node)
+
+# fuse conv and bn weights; modifies the graph_module and its graph in place
+def _fuse_conv_bn_(m: GraphModule) -> None:
+    has_bn = any(_is_bn_node(n) for n in m.graph.nodes)
+    if not has_bn:
+        return
+    for n in m.graph.nodes:
+        if n.op != "call_function" or n.target != torch.ops.aten._native_batch_norm_legit_no_training.default:
+            continue
+        bn_node = n
+        n = bn_node.args[0]
+        if not _is_conv(n):
+            continue
+        conv_node = n
+        conv_weight_node = conv_node.args[1]
+        conv_bias_node = conv_node.args[2] if len(conv_node.args) > 2 else None
+        fold_bn_weights_into_conv_node(conv_node, conv_weight_node, conv_bias_node, bn_node, m)
+
+    m.graph.eliminate_dead_code()
+    m.recompile()
+
+def _get_node_name_to_scope(model: GraphModule) -> Dict[str, Tuple[str, type]]:
+    # TODO: move this information to fx node itself
+    node_name_to_scope: Dict[str, Tuple[str, type]] = {}
+    for n in model.graph.nodes:
+        nn_module_stack = n.meta.get("nn_module_stack", None)
+        current_scope = ("", type(None))
+        if nn_module_stack:
+            bt = list(nn_module_stack.values())[-1]
+            current_scope = (bt[0].split(".")[-1], bt[1])
+        node_name_to_scope[n.name] = current_scope
+    return node_name_to_scope
+
+def get_aten_graph_module(
+    pattern: Callable,
+    example_inputs: Tuple[Any, ...],
+    is_cuda: bool = False,
+    **kwargs,
+) -> GraphModule:
+    """
+    Convert the pattern to an FX graph with decomposed aten ops.
+    """
+    if is_cuda:
+        example_inputs = tuple([x.cuda() if isinstance(x, torch.Tensor) else x for x in example_inputs])
+    aten_pattern = capture_pre_autograd_graph(
+        pattern,
+        example_inputs,
+        kwargs,
+    )
+    aten_pattern.graph.eliminate_dead_code()
+    aten_pattern.recompile()
+    return aten_pattern
+
+def remove_tensor_overload_for_qdq_ops(match_pattern: GraphModule) -> None:
+    """ Remove .tensor overload for quantize/dequantize ops so that we can
+    use the match_pattern that we get from torchdynamo export to match the output of convert_pt2e
+    """
+    _MAP = {
+        torch.ops.quantized_decomposed.quantize_per_tensor.default: torch.ops.quantized_decomposed.quantize_per_tensor,
+        torch.ops.quantized_decomposed.dequantize_per_tensor.default: torch.ops.quantized_decomposed.dequantize_per_tensor,
+        torch.ops.quantized_decomposed.quantize_per_tensor.tensor: torch.ops.quantized_decomposed.quantize_per_tensor,
+        torch.ops.quantized_decomposed.dequantize_per_tensor.tensor: torch.ops.quantized_decomposed.dequantize_per_tensor,
+        torch.ops.quantized_decomposed.quantize_per_tensor.tensor2: torch.ops.quantized_decomposed.quantize_per_tensor,
+        torch.ops.quantized_decomposed.dequantize_per_tensor.tensor2: torch.ops.quantized_decomposed.dequantize_per_tensor,
+        torch.ops.quantized_decomposed.quantize_per_channel.default: torch.ops.quantized_decomposed.quantize_per_channel,
+        torch.ops.quantized_decomposed.dequantize_per_channel.default: torch.ops.quantized_decomposed.dequantize_per_channel,
+        torch.ops.aten.clamp.Tensor: torch.ops.aten.clamp,
+    }
+    for n in match_pattern.graph.nodes:
+        if n.op != "call_function":
+            continue
+        if n.target in _MAP:
+            n.target = _MAP[n.target]
+
+def _is_literal(arg):
+    if isinstance(arg, (int, float)):
+        return True
+    if isinstance(arg, (tuple, list)):
+        return all(map(_is_literal, arg))
+    return False
+
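+# Illustration of what _is_literal treats as a literal (example values, not exhaustive):
+#   _is_literal(3)                -> True   # plain int
+#   _is_literal((1, 2.0))         -> True   # nested ints/floats
+#   _is_literal(torch.tensor(1))  -> False  # Tensors are not literals
+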
+def _replace_literals_with_new_placeholders(
+    gm: torch.fx.GraphModule,
+    merge_dup: bool = False,
+    exclude_literals: Optional[List[Any]] = None
+):
+    """Replace the literals in the graph with placeholder nodes that's created on the fly while we
+    traverse the graph, so that the literal arguments in the graph can be matched and replaced
+
+    To use this, the pattern and replacement graph should have the exact same number of literal args
+    and they should be used in the exact same order in the pattern and replacement graph.
+
+    If the literal arguments are not used in the same order in pattern and replacement graph, please
+    use `_replace_literals_with_existing_placeholders` instead
+
+    Args:
+        `gm`: input GraphModule that we'll transform
+        `merge_dup`: boolean flag indicating whether, when the same literal appears multiple times in
+         the graph, the occurrences should correspond to the same placeholder or not
+        `exclude_literals`: a list of literals that will not be replaced with placeholders
+
+    Example:
+
+    # 1. Original Graph
+    def pattern(self, x):
+        return x + 3
+
+    def replacement(self, x):
+        return x - 3
+
+    example_inputs = (torch.randn(1, 3, 3, 3),)
+    pattern_gm = get_aten_graph_module(pattern, example_inputs)
+    replacement_gm = get_aten_graph_module(replacement, example_inputs)
+
+    # 2. Before calling replace literals we'll see the following graph:
+    def pattern(self, x):
+        return x + 3
+
+    def replacement(self, x):
+        return x - 3
+
+    pattern_gm = _replace_literals_with_new_placeholders(pattern_gm)
+    replacement_gm = _replace_literals_with_new_placeholders(replacement_gm)
+
+    # 3. After replacing literals with new placeholder nodes
+
+    def pattern(self, x, new_ph):
+        return x + new_ph
+
+    def replacement(self, x, new_ph):
+        return x - new_ph
+
+    """
+    last_ph = None
+    cnt = 0
+    literal_to_ph: Dict[Union[float, bool, int, torch.dtype], Node] = {}
+    if exclude_literals is None:
+        exclude_literals = []
+
+    in_spec = gm._in_spec
+    args_spec = in_spec.children_specs[0]
+    for node in gm.graph.nodes:
+        if node.op == "placeholder":
+            last_ph = node
+            cnt += 1
+            continue
+        with gm.graph.inserting_after(last_ph):
+            new_args = []
+            for arg in node.args:
+                if _is_literal(arg) and arg not in exclude_literals:
+                    if merge_dup and arg in literal_to_ph:
+                        new_args.append(literal_to_ph[arg])
+                    else:
+                        ph_node = gm.graph.placeholder("arg" + str(cnt))
+                        new_args.append(ph_node)
+                        args_spec.children_specs.append(LeafSpec())
+                        cnt += 1
+                        if merge_dup:
+                            literal_to_ph[arg] = ph_node
+                else:
+                    new_args.append(arg)
+            new_args = tuple(new_args)
+
+        node.args = new_args
+
+    # Update `num_nodes`, `num_leaves`, `num_children`.
+    args_spec.__post_init__()
+    in_spec.__post_init__()
+    return gm
+
+
+def _replace_literals_with_existing_placeholders(
+    gm: torch.fx.GraphModule,
+    exclude_literals: Optional[List[Any]] = None,
+    literal_to_ph_idx: Optional[Dict[Union[float, int, bool, torch.dtype], int]] = None
+):
+    """Replace the literals in the graph with **existing** placeholder nodes, so that the literal arguments
+    in the graph can be matched and replaced
+
+    To use this, all literal args in the graph should be unique and each of them should correspond
+    to exactly one placeholder node
+
+    # 1. Original Graph
+    def pattern(self, x_i8, scale, zero_point, quant_min, quant_max):
+        return torch.dequantize_per_tensor(x_i8, scale, zero_point, quant_min, quant_max)
+
+    def replacement(x_i8, scale, zero_point, quant_min, quant_max):
+        x_i8 = torch.clamp(x_i8, quant_min, quant_max)
+        return ((x_i8.to(torch.float32) - zero_point) * scale).to(dtype=torch.float32)
+
+    example_inputs = (
+        torch.randn(1, 3, 3, 3),
+        1.0,
+        0,
+        -128,
+        127,
+    )
+    pattern_gm = get_aten_graph_module(pattern, example_inputs)
+    replacement_gm = get_aten_graph_module(replacement, example_inputs)
+
+    # 2. Before calling replace literals we'll see the following graph:
+    def pattern(self, x_i8, scale, zero_point, quant_min, quant_max):
+        # scale/zero_point/quant_min/quant_max are burnt in since they are scalar values
+        return torch.dequantize_per_tensor(x_i8, 1.0, 0, -128, 127)
+
+    def replacement(x_i8, scale, zero_point, quant_min, quant_max):
+        # scale/zero_point/quant_min/quant_max are burnt in since they are scalar values
+        x_i8 = torch.clamp(x_i8, -128, 127)
+        return ((x_i8.to(torch.float32) - 0) * 1.0).to(dtype=torch.float32)
+
+    # Note that literal args appear in different order in pattern and replacement graph, so
+    # we can't use _replace_literals_with_new_placeholders
+
+    literal_to_ph_idx = {1.0: 1, 0: 2, -128: 3, 127: 4}
+    pattern_gm = _replace_literals_with_existing_placeholders(pattern_gm, literal_to_ph_idx=literal_to_ph_idx)
+    replacement_gm = _replace_literals_with_existing_placeholders(replacement_gm, literal_to_ph_idx=literal_to_ph_idx)
+
+    # 3. After replacing literals with existing placeholder nodes
+
+    def pattern(self, x_i8, scale, zero_point, quant_min, quant_max):
+        # scale/zero_point/quant_min/quant_max now refer to the existing placeholder nodes
+        return torch.dequantize_per_tensor(x_i8, scale, zero_point, quant_min, quant_max)
+
+    def replacement(x_i8, scale, zero_point, quant_min, quant_max):
+        # scale/zero_point/quant_min/quant_max now refer to the existing placeholder nodes
+        x_i8 = torch.clamp(x_i8, quant_min, quant_max)
+        return ((x_i8.to(torch.float32) - zero_point) * scale).to(dtype=torch.float32)
+    """
+    if exclude_literals is None:
+        exclude_literals = []
+
+    if literal_to_ph_idx is None:
+        literal_to_ph_idx = {}
+
+    phs = [node for node in gm.graph.nodes if node.op == "placeholder"]
+
+    for node in gm.graph.nodes:
+        if node.op != "call_function":
+            continue
+        new_args = []
+        for arg in node.args:
+            if _is_literal(arg) and arg not in exclude_literals and arg in literal_to_ph_idx:
+                ph_idx = literal_to_ph_idx[arg]
+                ph_node = phs[ph_idx]
+                new_args.append(ph_node)
+            else:
+                new_args.append(arg)
+        new_args = tuple(new_args)
+        node.args = new_args
+    return gm
+
+# TODO: Handle this in export itself and don't wrap the model in another GraphModule
+# in prepare and convert
+def _disallow_eval_train(model: GraphModule):
+    """
+    Disallow calling `model.train()` or `model.eval()` on the given GraphModule.
+    This is useful for exported models, where these methods don't actually behave as expected.
+    """
+    error_message = \
+        """
+        Calling train() or eval() is not supported for exported models.
+        Please call `torch.ao.quantization.move_exported_model_to_train(model)` (or eval) instead.
+
+        If you cannot replace the calls to `model.train()` and `model.eval()`, you may override
+        the behavior for these methods by calling `torch.ao.quantization.allow_exported_model_train_eval(model)`,
+        which does the above automatically for you. Note that this has limited effect on switching
+        behavior between train and eval modes, and should be used only for special ops such as dropout
+        and batchnorm.
+        """
+
+    def _train(self, mode: bool = True):
+        raise NotImplementedError(error_message)
+
+    def _eval(self, mode: bool = True):
+        raise NotImplementedError(error_message)
+
+    model.train = types.MethodType(_train, model)  # type: ignore[method-assign]
+    model.eval = types.MethodType(_eval, model)  # type: ignore[method-assign]
+    return model
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/qconfig.py b/MLPY/Lib/site-packages/torch/ao/quantization/qconfig.py
new file mode 100644
index 0000000000000000000000000000000000000000..99cc79f2607e6c3bffe2a044e367e693a039ba3c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/qconfig.py
@@ -0,0 +1,560 @@
+from collections import namedtuple
+from typing import Optional, Any, Union, Type
+
+import torch
+import torch.nn as nn
+from torch.ao.quantization.fake_quantize import (
+    FakeQuantize,
+    FakeQuantizeBase,
+    default_fake_quant,
+    default_dynamic_fake_quant,
+    default_per_channel_weight_fake_quant,
+    default_weight_fake_quant,
+    default_fused_act_fake_quant,
+    default_fused_wt_fake_quant,
+    FusedMovingAvgObsFakeQuantize,
+    default_fused_per_channel_wt_fake_quant,
+    default_embedding_fake_quant,
+    default_embedding_fake_quant_4bit,
+    fused_wt_fake_quant_range_neg_127_to_127,
+    fused_per_channel_wt_fake_quant_range_neg_127_to_127,
+)
+
+from .observer import (
+    _PartialWrapper,
+    MinMaxObserver,
+    HistogramObserver,
+    MovingAverageMinMaxObserver,
+    NoopObserver,
+    PlaceholderObserver,
+    ReuseInputObserver,
+    default_debug_observer,
+    default_dynamic_quant_observer,
+    default_float_qparams_observer,
+    default_float_qparams_observer_4bit,
+    default_observer,
+    default_per_channel_weight_observer,
+    default_placeholder_observer,
+    default_weight_observer,
+    weight_observer_range_neg_127_to_127,
+    per_channel_weight_observer_range_neg_127_to_127,
+    default_reuse_input_observer,
+    ObserverBase,
+)
+import warnings
+import copy
+
+__all__ = [
+    "QConfig",
+    # TODO: deprecated, remove
+    "QConfigDynamic",
+    "default_qconfig",
+    "default_debug_qconfig",
+    "default_per_channel_qconfig",
+    "default_dynamic_qconfig",
+    "float16_dynamic_qconfig",
+    "float16_static_qconfig",
+    "per_channel_dynamic_qconfig",
+    "float_qparams_weight_only_qconfig",
+    "float_qparams_weight_only_qconfig_4bit",
+    "default_quint8_weight_qconfig",
+    "default_qat_qconfig",
+    "default_dynamic_qat_qconfig",
+    "default_weight_only_qconfig",
+    "default_activation_only_qconfig",
+    "default_qat_qconfig_v2",
+    "default_reuse_input_qconfig",
+    "default_symmetric_qnnpack_qconfig",
+    "default_per_channel_symmetric_qnnpack_qconfig",
+    "default_symmetric_qnnpack_qat_qconfig",
+    "default_per_channel_symmetric_qnnpack_qat_qconfig",
+    "default_embedding_qat_qconfig",
+    "default_embedding_qat_qconfig_4bit",
+    "get_default_qconfig",
+    "get_default_qat_qconfig",
+    "get_default_qconfig_dict",
+    "get_default_qat_qconfig_dict",
+    "QConfigAny",
+    "qconfig_equals",
+
+]
+
+class QConfig(namedtuple('QConfig', ['activation', 'weight'])):
+    """
+    Describes how to quantize a layer or a part of the network by providing
+    settings (observer classes) for activations and weights respectively.
+
+
+    Note that QConfig needs to contain observer **classes** (like MinMaxObserver) or a callable that returns
+    instances on invocation, not the concrete observer instances themselves.
+    The quantization preparation function will instantiate observers multiple times, once for each of the layers.
+
+
+    Observer classes usually have reasonable default arguments, but they can be overridden with the `with_args`
+    method (which behaves like functools.partial)::
+
+      my_qconfig = QConfig(
+          activation=MinMaxObserver.with_args(dtype=torch.qint8),
+          weight=default_observer.with_args(dtype=torch.qint8))
+
+    """
+    def __new__(cls, activation, weight):
+        # catch common mistakes
+        if isinstance(activation, nn.Module) or isinstance(weight, nn.Module):
+            raise ValueError("QConfig received observer instance, please pass observer class instead. " +
+                             "Use MyObserver.with_args(x=1) to override arguments to constructor if needed")
+        return super().__new__(cls, activation, weight)
+
+
+class QConfigDynamic(namedtuple('QConfigDynamic', ['activation', 'weight'])):
+    """
+    Describes how to dynamically quantize a layer or a part of the network by providing
+    settings (observer classes) for weights.
+
+    It's like QConfig, but for dynamic quantization.
+
+    Note that QConfigDynamic needs to contain observer **classes** (like MinMaxObserver) or a callable that returns
+    instances on invocation, not the concrete observer instances themselves.
+    The quantization function will instantiate observers multiple times, once for each of the layers.
+
+    Observer classes usually have reasonable default arguments, but they can be overridden with the `with_args`
+    method (which behaves like functools.partial)::
+
+      my_qconfig = QConfigDynamic(weight=default_observer.with_args(dtype=torch.qint8))
+    """
+    def __new__(cls, activation=torch.nn.Identity, weight=torch.nn.Identity):
+        # catch common mistakes
+        if isinstance(weight, nn.Module):
+            raise ValueError("QConfigDynamic received observer instance, please pass observer class instead. " +
+                             "Use MyObserver.with_args(x=1) to override arguments to constructor if needed")
+        warnings.warn("QConfigDynamic is going to be deprecated in PyTorch 1.12, please use QConfig instead")
+        return super().__new__(cls, activation, weight)
+
+
+default_qconfig = QConfig(activation=default_observer,
+                          weight=default_weight_observer)
+"""
+Default qconfig configuration.
+"""
+
+default_debug_qconfig = QConfig(weight=default_weight_observer,
+                                activation=default_debug_observer)
+"""
+Default qconfig configuration for debugging.
+"""
+
+default_per_channel_qconfig = QConfig(activation=default_observer,
+                                      weight=default_per_channel_weight_observer)
+"""
+Default qconfig configuration for per channel weight quantization.
+"""
+
+default_dynamic_qconfig = QConfig(activation=default_dynamic_quant_observer,
+                                  weight=default_weight_observer)
+"""
+Default dynamic qconfig.
+"""
+
+float16_dynamic_qconfig = QConfig(activation=PlaceholderObserver.with_args(dtype=torch.float16, is_dynamic=True),
+                                  weight=PlaceholderObserver.with_args(dtype=torch.float16))
+"""
+Dynamic qconfig with weights quantized to `torch.float16`.
+"""
+
+float16_static_qconfig = QConfig(activation=PlaceholderObserver.with_args(dtype=torch.float16),
+                                 weight=PlaceholderObserver.with_args(dtype=torch.float16))
+"""
+Dynamic qconfig with both activations and weights quantized to `torch.float16`.
+"""
+
+per_channel_dynamic_qconfig = QConfig(activation=default_dynamic_quant_observer,
+                                      weight=default_per_channel_weight_observer)
+"""
+Dynamic qconfig with weights quantized per channel.
+"""
+
+float_qparams_weight_only_qconfig = QConfig(
+    activation=default_placeholder_observer,
+    weight=default_float_qparams_observer)
+"""
+Dynamic qconfig with weights quantized with a floating point zero_point.
+"""
+
+float_qparams_weight_only_qconfig_4bit = QConfig(
+    activation=default_placeholder_observer,
+    weight=default_float_qparams_observer_4bit)
+
+default_qat_qconfig = QConfig(activation=default_fake_quant,
+                              weight=default_weight_fake_quant)
+"""
+Default qconfig for QAT.
+"""
+
+default_dynamic_qat_qconfig = QConfig(activation=default_dynamic_fake_quant,
+                                      weight=default_weight_fake_quant)
+"""
+Default qconfig for dynamic QAT.
+"""
+
+default_weight_only_qconfig = QConfig(activation=torch.nn.Identity,
+                                      weight=default_weight_fake_quant)
+"""
+Default qconfig for quantizing weights only.
+"""
+
+default_activation_only_qconfig = QConfig(activation=default_fake_quant,
+                                          weight=torch.nn.Identity)
+"""
+Default qconfig for quantizing activations only.
+"""
+
+# QAT config that uses fused observer + fake-quant modules for optimized training performance.
+# To modify the activation/weight observers, change the default entries in fake_quantize.py.
+default_qat_qconfig_v2 = QConfig(activation=default_fused_act_fake_quant, weight=default_fused_wt_fake_quant)
+"""
+Fused version of `default_qat_qconfig`, with performance benefits.
+"""
+
+default_reuse_input_qconfig = QConfig(activation=default_reuse_input_observer,
+                                      weight=NoopObserver)
+"""
+Default qconfig for operators that reuse the observers from the input Tensor, e.g. reshape
+"""
+
+def get_default_qconfig(backend='x86', version=0):
+    """
+    Returns the default PTQ qconfig for the specified backend.
+
+    Args:
+      * `backend` (str): a string representing the target backend. Currently supports
+        `x86` (default), `fbgemm`, `qnnpack` and `onednn`.
+
+    Return:
+        qconfig
+    """
+    supported_backends = ["fbgemm", "x86", "qnnpack", "onednn"]
+    if backend not in supported_backends:
+        raise AssertionError(
+            "backend: " + str(backend) +
+            f" not supported. backend must be one of {supported_backends}"
+        )
+
+    if version == 0:
+        if backend == 'fbgemm':
+            qconfig = QConfig(activation=HistogramObserver.with_args(reduce_range=True),
+                              weight=default_per_channel_weight_observer)
+        elif backend == 'qnnpack':
+            # TODO: make this compatible with xnnpack constraints
+            qconfig = QConfig(activation=HistogramObserver.with_args(reduce_range=False),
+                              weight=default_weight_observer)
+        elif backend == 'onednn':
+            if not torch.cpu._is_cpu_support_vnni():
+                warnings.warn(
+                    "Default qconfig of oneDNN backend with reduce_range of false may have accuracy issues "
+                    "on CPU without Vector Neural Network Instruction support.")
+            qconfig = QConfig(activation=HistogramObserver.with_args(reduce_range=False),
+                              weight=default_per_channel_weight_observer)
+        elif backend == 'x86':
+            qconfig = QConfig(activation=HistogramObserver.with_args(reduce_range=True),
+                              weight=default_per_channel_weight_observer)
+        else:
+            # won't reach
+            qconfig = default_qconfig
+    else:
+        raise AssertionError("Version number: " + str(version) +
+                             " in get_default_qconfig is not supported. Version number must be 0")
+
+    return qconfig
+
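+# Minimal eager-mode usage sketch for get_default_qconfig ("MyModel" is a placeholder name,
+# not defined in this file):
+#
+#   model = MyModel().eval()
+#   model.qconfig = get_default_qconfig("x86")
+#   prepared = torch.ao.quantization.prepare(model)
+#   # ... run calibration data through `prepared` ...
+#   quantized = torch.ao.quantization.convert(prepared)
+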
+"""
+Default, symmetric PTQ qconfig for the specified backend. And a per_channel
+variant of the same.
+
+Symmetric here applies to signed weights with zero point = 0, and additional
+value restrictions. The activations are also signed 8-bit integers with this
+qconfig.
+
+    * Once this change is merged [as of 3/17/22], with backend or qengine =
+    'qnnpack', some quantized operators with this symmetric qconfig may use
+    operators from the xnnpack library.
+
+        ** Support for using xnnpack ops with the `qnnpack` backend for an asymmetric
+        qconfig (returned by get_default_qconfig()) is not available yet.
+
+    * This qconfig uses signed activations and weights. Weights have added
+    restrictions, such as the zero point being forced to 0, making the weights
+    symmetric, hence the name. The 8-bit quantized values are
+    restricted to [-127, +127], excluding -128.
+
+    * xnnpack has a requantization scale value restriction, 0x1p-32 <=
+    requantization_scale < 256.0, where `requantization_scale = (input_scale
+    * kernel_scale) / (output_scale)`. Using this eps (with an assumed max value
+    of 256) prevents requantization_scale from going below the xnnpack lower
+    threshold.
+"""
+default_symmetric_qnnpack_qconfig = QConfig(activation=HistogramObserver.with_args(dtype=torch.qint8,
+                                                                                   reduce_range=False,
+                                                                                   eps=2 ** -12),
+                                            weight=weight_observer_range_neg_127_to_127)
+
+default_per_channel_symmetric_qnnpack_qconfig = QConfig(activation=HistogramObserver.with_args(dtype=torch.qint8,
+                                                                                               reduce_range=False,
+                                                                                               eps=2 ** -12),
+                                                        weight=per_channel_weight_observer_range_neg_127_to_127)
+
+default_embedding_qat_qconfig = QConfig(activation=NoopObserver.with_args(dtype=torch.float32),
+                                        weight=default_embedding_fake_quant)
+
+default_embedding_qat_qconfig_4bit = QConfig(activation=NoopObserver.with_args(dtype=torch.float32),
+                                             weight=default_embedding_fake_quant_4bit)
+
+default_quint8_weight_qconfig = QConfig(activation=HistogramObserver, weight=MinMaxObserver)
+
+def get_default_qat_qconfig(backend='x86', version=1):
+    """
+    Returns the default QAT qconfig for the specified backend.
+
+    Args:
+      * `backend` (str): a string representing the target backend. Currently supports
+        `x86` (default), `fbgemm`, `qnnpack` and `onednn`.
+      * `version`: version, for backwards compatibility. Can be `None` or `1`.
+
+    Return:
+        qconfig
+    """
+    supported_backends = ["fbgemm", "x86", "qnnpack", "onednn"]
+    if backend not in supported_backends:
+        raise AssertionError(
+            "backend: " + str(backend) +
+            f" not supported. backend must be one of {supported_backends}"
+        )
+
+    # Histogram observer is too slow for quantization aware training
+    if version == 0:
+        if backend == 'fbgemm':
+            qconfig = QConfig(activation=FakeQuantize.with_args(observer=MovingAverageMinMaxObserver,
+                                                                quant_min=0,
+                                                                quant_max=255,
+                                                                reduce_range=True),
+                              weight=default_per_channel_weight_fake_quant)
+        elif backend == 'qnnpack':
+            qconfig = QConfig(activation=FakeQuantize.with_args(observer=MovingAverageMinMaxObserver,
+                                                                quant_min=0,
+                                                                quant_max=255,
+                                                                reduce_range=False),
+                              weight=default_weight_fake_quant)
+        elif backend == 'onednn':
+            qconfig = QConfig(activation=FakeQuantize.with_args(observer=MovingAverageMinMaxObserver,
+                                                                quant_min=0,
+                                                                quant_max=255),
+                              weight=default_per_channel_weight_fake_quant)
+        elif backend == 'x86':
+            qconfig = QConfig(activation=FakeQuantize.with_args(observer=MovingAverageMinMaxObserver,
+                                                                quant_min=0,
+                                                                quant_max=255,
+                                                                reduce_range=True),
+                              weight=default_per_channel_weight_fake_quant)
+        else:
+            qconfig = default_qat_qconfig
+    # Use the fused observer + fake_quant modules for doing QAT.
+    elif version == 1:
+        if backend == 'fbgemm':
+            qconfig = QConfig(activation=FusedMovingAvgObsFakeQuantize.with_args(observer=MovingAverageMinMaxObserver,
+                                                                                 quant_min=0,
+                                                                                 quant_max=255,
+                                                                                 reduce_range=True),
+                              weight=default_fused_per_channel_wt_fake_quant)
+        elif backend == 'qnnpack':
+            # TODO: make this compatible with xnnpack constraints
+            qconfig = QConfig(activation=FusedMovingAvgObsFakeQuantize.with_args(observer=MovingAverageMinMaxObserver,
+                                                                                 quant_min=0,
+                                                                                 quant_max=255,
+                                                                                 reduce_range=False),
+                              weight=default_fused_wt_fake_quant)
+        elif backend == 'onednn':
+            qconfig = QConfig(activation=FusedMovingAvgObsFakeQuantize.with_args(observer=MovingAverageMinMaxObserver,
+                                                                                 quant_min=0,
+                                                                                 quant_max=255),
+                              weight=default_fused_per_channel_wt_fake_quant)
+        elif backend == 'x86':
+            qconfig = QConfig(activation=FusedMovingAvgObsFakeQuantize.with_args(observer=MovingAverageMinMaxObserver,
+                                                                                 quant_min=0,
+                                                                                 quant_max=255,
+                                                                                 reduce_range=True),
+                              weight=default_fused_per_channel_wt_fake_quant)
+        else:
+            qconfig = default_qat_qconfig_v2
+    else:
+        raise AssertionError("Version number: " + str(version) +
+                             "in get_default_qat_qconfig is not supported. Version number must be 0 or 1")
+
+    return qconfig
+
+"""
+Default symmetric QAT qconfig for qnnpack. And its per channel weight variant.
+"""
+default_symmetric_qnnpack_qat_qconfig = QConfig(
+    activation=FusedMovingAvgObsFakeQuantize.with_args(observer=MovingAverageMinMaxObserver,
+                                                       quant_min=-128,
+                                                       quant_max=127,
+                                                       dtype=torch.qint8,
+                                                       reduce_range=False,
+                                                       eps=2 ** -12),
+    weight=fused_wt_fake_quant_range_neg_127_to_127)
+
+default_per_channel_symmetric_qnnpack_qat_qconfig = QConfig(
+    activation=FusedMovingAvgObsFakeQuantize.with_args(observer=MovingAverageMinMaxObserver,
+                                                       quant_min=-128,
+                                                       quant_max=127,
+                                                       dtype=torch.qint8,
+                                                       reduce_range=False,
+                                                       eps=2 ** -12),
+    weight=fused_per_channel_wt_fake_quant_range_neg_127_to_127)
+
+_default_fp32_placeholder_qconfig = QConfig(
+    activation=PlaceholderObserver.with_args(dtype=torch.float32),
+    weight=PlaceholderObserver.with_args(dtype=torch.float32)
+)
+
+_default_quint8_placeholder_qconfig = QConfig(
+    activation=PlaceholderObserver.with_args(dtype=torch.quint8),
+    # operators using this qconfig don't have weights
+    weight=None,
+)
+
+def get_default_qconfig_dict(backend='x86', version=0):
+    warnings.warn(
+        "torch.ao.quantization.get_default_qconfig_dict is deprecated and will be removed in "
+        "a future version. Please use torch.ao.quantization.get_default_qconfig_mapping instead.")
+    return torch.ao.quantization.get_default_qconfig_mapping(backend, version).to_dict()
+
+def get_default_qat_qconfig_dict(backend='x86', version=1):
+    warnings.warn(
+        "torch.ao.quantization.get_default_qat_qconfig_dict is deprecated and will be removed in "
+        "a future version. Please use torch.ao.quantization.get_default_qat_qconfig_mapping instead.")
+    return torch.ao.quantization.get_default_qat_qconfig_mapping(backend, version).to_dict()
+
+def _assert_valid_qconfig(qconfig: Optional[QConfig],
+                          mod: torch.nn.Module) -> None:
+    """
+    Verifies that this `qconfig` is valid.
+    """
+    if qconfig is None:
+        return
+    is_conv_transpose_mod = (
+        isinstance(mod, (torch.nn.ConvTranspose1d, torch.nn.ConvTranspose2d, torch.nn.ConvTranspose3d)))
+    if is_conv_transpose_mod:
+        if qconfig.weight is None:
+            # for now, we assume that any qconfig for ConvTranspose without a weight is valid
+            return
+        example_observer = qconfig.weight()
+        is_per_channel = (
+            isinstance(example_observer, (torch.ao.quantization.PerChannelMinMaxObserver,
+                                          torch.ao.quantization.MovingAveragePerChannelMinMaxObserver))
+        )
+        assert not is_per_channel, \
+            'Per channel weight observer is not supported yet for ConvTranspose{n}d.'
+
+QConfigAny = Optional[QConfig]
+QConfigAny.__module__ = "torch.ao.quantization.qconfig"
+
+def _add_module_to_qconfig_obs_ctr(
+        qconfig: QConfigAny,
+        module: Optional[nn.Module]) -> Any:
+    r"""This is a helper function for use in quantization prepare that updates a qconfig so that
+    the constructors stored in the qconfig will create observers on the same device that
+    'module' is on. This is intended to be used when the qconfigs are propagated to each
+    module in order to avoid potential device alignment issues.
+
+    Args:
+        qconfig: QConfig with obs constructors stored in activation and weight
+        module: module which the qconfig is related to
+
+    Return:
+        qconfig: configured so that the observer constructors are set to construct observers on the same device as `module`
+    """
+
+    if module is None or qconfig is None or qconfig._fields != ('activation', 'weight'):
+        return qconfig
+
+    def get_factory_kwargs_based_on_module_device():
+        assert isinstance(module, torch.nn.Module)
+        devices = {p.device for p in module.parameters()} | \
+            {p.device for p in module.buffers()}
+        device = next(iter(devices)) if len(devices) > 0 else None
+        return None if device is None else {'device': device}
+
+    def configure_constructor_to_put_obs_on_module_device(original_constructor):
+        try:
+            # check if constructor can accept factory_kwargs
+            check = original_constructor.with_args(factory_kwargs=None)
+            check()
+            return original_constructor.with_callable_args(factory_kwargs=get_factory_kwargs_based_on_module_device)
+        except AttributeError:  # qconfig doesn't have activation or weight
+            return original_constructor
+        except TypeError:  # the class doesn't accept factory_kwargs argument
+            return original_constructor
+
+    activation = configure_constructor_to_put_obs_on_module_device(qconfig.activation)
+    weight = configure_constructor_to_put_obs_on_module_device(qconfig.weight)
+
+    return QConfig(activation, weight)
+
+_ObserverOrFakeQuantizeConstructor = Union[_PartialWrapper, Type[ObserverBase], Type[FakeQuantizeBase]]
+
+def _obs_or_fq_ctr_equals(obs_or_fq1: _ObserverOrFakeQuantizeConstructor, obs_or_fq2: _ObserverOrFakeQuantizeConstructor):
+    if isinstance(obs_or_fq1, _PartialWrapper) and isinstance(obs_or_fq2, _PartialWrapper):
+        return _partial_wrapper_equals(obs_or_fq1, obs_or_fq2)
+    return obs_or_fq1 == obs_or_fq2
+
+def _partial_wrapper_equals(obs_or_fq1: _PartialWrapper, obs_or_fq2: _PartialWrapper):
+    """
+    Return whether the two partial wrappers are equal.
+    """
+    # functools.partial has no __eq__ operator defined so '==' defaults to 'is'
+    obs_or_fq1_keywords = copy.copy(obs_or_fq1.p.keywords)
+    obs_or_fq2_keywords = copy.copy(obs_or_fq2.p.keywords)
+    keywords_equal = True
+    # compare observer constructor with _obs_or_fq_ctr_equals since direct compare would fail
+    if "observer" in obs_or_fq1_keywords and "observer" in obs_or_fq2_keywords:
+        keywords_equal = keywords_equal and _obs_or_fq_ctr_equals(obs_or_fq1_keywords["observer"], obs_or_fq2_keywords["observer"])
+        obs_or_fq1_keywords.pop("observer")
+        obs_or_fq2_keywords.pop("observer")
+    keywords_equal = keywords_equal and obs_or_fq1_keywords == obs_or_fq2_keywords
+    return obs_or_fq1.p.func == obs_or_fq2.p.func and obs_or_fq1.p.args == obs_or_fq2.p.args and keywords_equal
+
+def qconfig_equals(q1: QConfigAny, q2: QConfigAny):
+    """
+    Returns `True` if `q1` equals `q2`, and `False` otherwise.
+    """
+    if q1 is None or q2 is None:
+        return q1 == q2
+    else:
+        assert q1 is not None and q2 is not None
+        try:
+            # Qconfig weight and activation can be either a partial wrapper,
+            # or an observer class. Special handling is required (above) for
+            # comparing partial wrappers.
+            activation_same = _obs_or_fq_ctr_equals(q1.activation, q2.activation)
+            weight_same = _obs_or_fq_ctr_equals(q1.weight, q2.weight)
+            return activation_same and weight_same
+        except AttributeError:
+            return q1 == q2
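+
+# Minimal illustrative sketch: two qconfigs built from the same default
+# constructors compare equal even though the underlying partial wrappers are
+# distinct objects, which is why the structural comparison above is needed.
+def _example_qconfig_equals():
+    q1 = get_default_qconfig("fbgemm")
+    q2 = get_default_qconfig("fbgemm")
+    return qconfig_equals(q1, q2)  # expected to be True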
+
+def _activation_is_memoryless(qconfig: QConfig):
+    """
+    Return whether the observer for activations defined in the given QConfig is memoryless.
+    This means a MovingAverage observer with averaging constant equal to 1.
+    """
+    def _is_memoryless(observer):
+        return hasattr(observer, "averaging_constant") and observer.averaging_constant == 1
+    act = qconfig.activation()
+    if isinstance(act, FakeQuantizeBase) and hasattr(act, "activation_post_process"):
+        return _is_memoryless(act.activation_post_process)
+    else:
+        return _is_memoryless(act)
+
+def _is_reuse_input_qconfig(qconfig: Optional[QConfig]):
+    return qconfig is not None and \
+        isinstance(qconfig.activation(), ReuseInputObserver) and \
+        isinstance(qconfig.weight(), NoopObserver)
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/qconfig_mapping.py b/MLPY/Lib/site-packages/torch/ao/quantization/qconfig_mapping.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6280daefccba3eae33541598398d152165db8e8
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/qconfig_mapping.py
@@ -0,0 +1,350 @@
+from __future__ import annotations
+from collections import OrderedDict
+from typing import Any, Callable, Dict, Tuple, Union, List
+
+import torch
+
+from .fake_quantize import (
+    default_weight_fake_quant,
+    FixedQParamsFakeQuantize,
+)
+from .observer import (
+    _PartialWrapper,
+    default_fixed_qparams_range_0to1_observer,
+    default_fixed_qparams_range_neg1to1_observer,
+    default_placeholder_observer,
+    default_weight_observer,
+)
+from .qconfig import (
+    default_reuse_input_qconfig,
+    default_symmetric_qnnpack_qconfig,
+    default_symmetric_qnnpack_qat_qconfig,
+    get_default_qconfig,
+    get_default_qat_qconfig,
+    QConfig,
+    QConfigAny,
+    default_quint8_weight_qconfig
+)
+
+
+__all__ = [
+    "get_default_qconfig_mapping",
+    "get_default_qat_qconfig_mapping",
+    "QConfigMapping",
+]
+
+
+# TODO: replace all usages with these constants
+_GLOBAL_DICT_KEY = ""
+_OBJECT_TYPE_DICT_KEY = "object_type"
+_MODULE_NAME_REGEX_DICT_KEY = "module_name_regex"
+_MODULE_NAME_DICT_KEY = "module_name"
+_MODULE_NAME_OBJECT_TYPE_ORDER_DICT_KEY = "module_name_object_type_order"
+
+# TODO: derive this map from the BackendConfig
+_FIXED_QPARAMS_OP_TO_OBSERVER: Dict[Union[Callable, str], _PartialWrapper] = {
+    torch.nn.Hardsigmoid: default_fixed_qparams_range_0to1_observer,
+    torch.nn.functional.hardsigmoid: default_fixed_qparams_range_0to1_observer,
+    "hardsigmoid": default_fixed_qparams_range_0to1_observer,
+    "hardsigmoid_": default_fixed_qparams_range_0to1_observer,
+    torch.nn.Sigmoid: default_fixed_qparams_range_0to1_observer,
+    torch.sigmoid: default_fixed_qparams_range_0to1_observer,
+    "sigmoid": default_fixed_qparams_range_0to1_observer,
+    "sigmoid_": default_fixed_qparams_range_0to1_observer,
+    torch.nn.Softmax: default_fixed_qparams_range_0to1_observer,
+    torch.nn.Tanh: default_fixed_qparams_range_neg1to1_observer,
+    torch.tanh: default_fixed_qparams_range_neg1to1_observer,
+    "tanh": default_fixed_qparams_range_neg1to1_observer,
+    "tanh_": default_fixed_qparams_range_neg1to1_observer,
+}
+
+
+def _get_default_qconfig_mapping(is_qat: bool, backend: str, version: int) -> QConfigMapping:
+    """
+    Return the default QConfigMapping for the given quantization type and backend.
+    """
+    if is_qat:
+        qconfig = get_default_qat_qconfig(backend, version)
+    else:
+        qconfig = get_default_qconfig(backend, version)
+    default_weight = default_weight_fake_quant if is_qat else default_weight_observer
+
+    # default_per_channel_weight_observer is not currently compatible with fbgemm backend
+    # so we have to modify the weight observer to default_weight_observer or another
+    # per tensor supported observer.
+    # see https://github.com/pytorch/pytorch/issues/47535
+    if backend in ("fbgemm", "x86"):
+        qconfig_transpose = QConfig(activation=qconfig.activation, weight=default_weight)
+    else:
+        qconfig_transpose = qconfig
+
+    # currently layernorm only supports float weights
+    # we have to add this because otherwise there will be an extra quantize-dequantize pair
+    qconfig_layernorm = QConfig(activation=qconfig.activation, weight=default_placeholder_observer)
+
+    qconfig_mapping = QConfigMapping() \
+        .set_global(qconfig) \
+        .set_object_type("reshape", default_reuse_input_qconfig) \
+        .set_object_type(torch.nn.ConvTranspose1d, qconfig_transpose) \
+        .set_object_type(torch.nn.ConvTranspose2d, qconfig_transpose) \
+        .set_object_type(torch.nn.ConvTranspose3d, qconfig_transpose) \
+        .set_object_type(torch.nn.functional.conv_transpose1d, qconfig_transpose) \
+        .set_object_type(torch.nn.functional.conv_transpose2d, qconfig_transpose) \
+        .set_object_type(torch.nn.functional.conv_transpose3d, qconfig_transpose) \
+        .set_object_type(torch.nn.functional.layer_norm, qconfig_layernorm) \
+        .set_object_type(torch.nn.LayerNorm, qconfig_layernorm) \
+        .set_object_type(torch.nn.PReLU, default_quint8_weight_qconfig) \
+
+    # Use special observers for ops with fixed qparams
+    fixed_qparams_observer_to_qconfig: Dict[Any, QConfigAny] = {}
+    for fixed_qparams_op, observer in _FIXED_QPARAMS_OP_TO_OBSERVER.items():
+        if observer in fixed_qparams_observer_to_qconfig:
+            fixed_qparams_qconfig = fixed_qparams_observer_to_qconfig[observer]
+        else:
+            if is_qat:
+                activation = FixedQParamsFakeQuantize.with_args(observer=observer)
+            else:
+                activation = observer
+            fixed_qparams_qconfig = QConfig(activation=activation, weight=default_weight)
+            fixed_qparams_observer_to_qconfig[observer] = fixed_qparams_qconfig
+        qconfig_mapping.set_object_type(fixed_qparams_op, fixed_qparams_qconfig)
+
+    # TODO Currently it's required that separate ops in a fused op/module have the same qconfig.
+    #      Need to be able to support fusion of ops with different qconfigs
+
+    return qconfig_mapping
+
+def get_default_qconfig_mapping(backend="x86", version=0) -> QConfigMapping:
+    """
+    Return the default QConfigMapping for post training quantization.
+
+    Args:
+      * ``backend`` (str) : the quantization backend for the default qconfig mapping, should be
+         one of ["x86" (default), "fbgemm", "qnnpack", "onednn"]
+      * ``version`` (int) : the version for the default qconfig mapping
+    """
+    # TODO: add assert for backend choices
+    return _get_default_qconfig_mapping(False, backend, version)
+
+def get_default_qat_qconfig_mapping(backend="x86", version=1) -> QConfigMapping:
+    """
+    Return the default QConfigMapping for quantization aware training.
+
+    Args:
+      * ``backend`` (str) : the quantization backend for the default qconfig mapping, should be
+         one of ["x86" (default), "fbgemm", "qnnpack", "onednn"]
+      * ``version`` (int) : the version for the default qconfig mapping
+    """
+    return _get_default_qconfig_mapping(True, backend, version)
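+
+# Illustrative sketch (hypothetical module name): the returned mapping can be
+# customized further before being handed to the FX graph mode prepare APIs.
+def _example_customized_default_mapping():
+    qconfig_mapping = get_default_qconfig_mapping("x86")
+    # Skip quantization for a particular submodule by assigning it a None qconfig.
+    qconfig_mapping.set_module_name("head", None)
+    return qconfig_mapping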
+
+def _get_symmetric_qnnpack_qconfig_mapping() -> QConfigMapping:
+    """
+    Return a QConfigMapping that uses `torch.ao.quantization.default_symmetric_qnnpack_qconfig`
+    as the default QConfig.
+    """
+    default_qconfig = default_symmetric_qnnpack_qconfig
+    return _get_default_qconfig_mapping_with_default_qconfig(False, "qnnpack", default_qconfig)
+
+def _get_symmetric_qnnpack_qat_qconfig_mapping() -> QConfigMapping:
+    """
+    Return a QConfigMapping that uses `torch.ao.quantization.default_symmetric_qnnpack_qat_qconfig`
+    as the default QConfig.
+    """
+    default_qconfig = default_symmetric_qnnpack_qat_qconfig
+    return _get_default_qconfig_mapping_with_default_qconfig(True, "qnnpack", default_qconfig)
+
+def _get_default_qconfig_mapping_with_default_qconfig(
+    is_qat: bool,
+    backend: str,
+    default_qconfig: QConfig,
+) -> QConfigMapping:
+    """
+    Return a QConfigMapping that uses the provided qconfig as the default QConfig.
+    """
+    if is_qat:
+        qconfig_mapping = get_default_qat_qconfig_mapping(backend)
+    else:
+        qconfig_mapping = get_default_qconfig_mapping(backend)
+    qconfig_mapping.set_global(default_qconfig)
+    for pattern in qconfig_mapping.object_type_qconfigs.keys():
+        if pattern not in _FIXED_QPARAMS_OP_TO_OBSERVER:
+            qconfig_mapping.set_object_type(pattern, default_qconfig)
+    return qconfig_mapping
+
+_QCONFIG_STYLE_ORDER: List[str] = [
+    "global_qconfig",
+    "object_type_qconfigs",
+    "module_name_regex_qconfigs",
+    "module_name_qconfigs",
+    "module_name_object_type_order_qconfigs",
+]
+
+class QConfigMapping:
+    """
+    Mapping from model ops to :class:`torch.ao.quantization.QConfig` s.
+
+    The user can specify QConfigs using the following methods (in increasing match priority):
+
+        ``set_global`` : sets the global (default) QConfig
+
+        ``set_object_type`` : sets the QConfig for a given module type, function, or method name
+
+        ``set_module_name_regex`` : sets the QConfig for modules matching the given regex string
+
+        ``set_module_name`` : sets the QConfig for modules matching the given module name
+
+        ``set_module_name_object_type_order`` : sets the QConfig for modules matching a combination
+        of the given module name, object type, and the index at which the module appears
+
+    Example usage::
+
+        qconfig_mapping = QConfigMapping()
+            .set_global(global_qconfig)
+            .set_object_type(torch.nn.Linear, qconfig1)
+            .set_object_type(torch.nn.ReLU, qconfig1)
+            .set_module_name_regex("foo.*bar.*conv[0-9]+", qconfig1)
+            .set_module_name_regex("foo.*", qconfig2)
+            .set_module_name("module1", qconfig1)
+            .set_module_name("module2", qconfig2)
+            .set_module_name_object_type_order("foo.bar", torch.nn.functional.linear, 0, qconfig3)
+
+    """
+
+    def __init__(self):
+        # In increasing match priority:
+        self.global_qconfig: QConfigAny = None
+        self.object_type_qconfigs: OrderedDict[Union[Callable, str], QConfigAny] = OrderedDict()
+        self.module_name_regex_qconfigs: OrderedDict[str, QConfigAny] = OrderedDict()
+        self.module_name_qconfigs: OrderedDict[str, QConfigAny] = OrderedDict()
+        self.module_name_object_type_order_qconfigs: OrderedDict[Tuple[str, Callable, int], QConfigAny] =\
+            OrderedDict()
+
+    def set_global(self, global_qconfig: QConfigAny) -> QConfigMapping:
+        """
+        Set the global (default) QConfig.
+        """
+        self.global_qconfig = global_qconfig
+        return self
+
+    def set_object_type(self, object_type: Union[Callable, str], qconfig: QConfigAny) -> QConfigMapping:
+        """
+        Set the QConfig for a given module type, function, or method name.
+        If the QConfig for an existing object type was already set, the new QConfig will override the old one.
+        """
+        self.object_type_qconfigs[object_type] = qconfig
+        return self
+
+    def set_module_name_regex(self, module_name_regex: str, qconfig: QConfigAny) -> QConfigMapping:
+        """
+        Set the QConfig for modules matching the given regex string.
+
+        Regexes will be matched in the order in which they are registered through this method.
+        Thus, the caller should register more specific patterns first, e.g.::
+
+            qconfig_mapping = QConfigMapping()
+                .set_module_name_regex("foo.*bar.*conv[0-9]+", qconfig1)
+                .set_module_name_regex("foo.*bar.*", qconfig2)
+                .set_module_name_regex("foo.*", qconfig3)
+
+        In this example, "foo.bar.conv0" would match qconfig1, "foo.bar.linear" would match qconfig2,
+        and "foo.baz.relu" would match qconfig3.
+
+        If the QConfig for an existing module name regex was already set, the new QConfig will override the
+        old one while preserving the order in which the regexes were originally registered.
+        """
+        self.module_name_regex_qconfigs[module_name_regex] = qconfig
+        return self
+
+    def set_module_name(self, module_name: str, qconfig: QConfigAny) -> QConfigMapping:
+        """
+        Set the QConfig for modules matching the given module name.
+        If the QConfig for an existing module name was already set, the new QConfig will override the old one.
+        """
+        self.module_name_qconfigs[module_name] = qconfig
+        return self
+
+    def set_module_name_object_type_order(
+            self,
+            module_name: str,
+            object_type: Callable,
+            index: int,
+            qconfig: QConfigAny) -> QConfigMapping:
+        """
+        Set the QConfig for modules matching a combination of the given module name, object type,
+        and the index at which the module appears.
+
+        If the QConfig for an existing (module name, object type, index)  was already set, the new QConfig
+        will override the old one.
+        """
+        self.module_name_object_type_order_qconfigs[(module_name, object_type, index)] = qconfig
+        return self
+
+    def __repr__(self) -> str:
+        output = self.__class__.__name__ + " ("
+        for style_name in _QCONFIG_STYLE_ORDER:
+            output += f"\n {style_name}"
+            qconfigs = getattr(self, style_name)
+            if isinstance(qconfigs, OrderedDict) and len(qconfigs) > 0:
+                for key, qconfig in qconfigs.items():
+                    output += f"\n  {key}: {qconfig}"
+            else:
+                output += f"\n  {qconfigs}"
+        return output + "\n)"
+
+    # TODO: remove this
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Convert this ``QConfigMapping`` to a dictionary with the following keys:
+
+            "" (for global QConfig)
+
+            "object_type"
+
+            "module_name_regex"
+
+            "module_name"
+
+            "module_name_object_type_order"
+
+        The values of this dictionary are lists of tuples.
+        """
+        return {
+            _GLOBAL_DICT_KEY: self.global_qconfig,
+            _OBJECT_TYPE_DICT_KEY: list(self.object_type_qconfigs.items()),
+            _MODULE_NAME_REGEX_DICT_KEY: list(self.module_name_regex_qconfigs.items()),
+            _MODULE_NAME_DICT_KEY: list(self.module_name_qconfigs.items()),
+            _MODULE_NAME_OBJECT_TYPE_ORDER_DICT_KEY: [
+                (*k, v) for k, v in self.module_name_object_type_order_qconfigs.items()
+            ],
+        }
+
+    # TODO: remove this
+    @classmethod
+    def from_dict(cls, qconfig_dict: Dict[str, Any]) -> QConfigMapping:
+        """
+        Create a ``QConfigMapping`` from a dictionary with the following keys (all optional):
+
+            "" (for global QConfig)
+
+            "object_type"
+
+            "module_name_regex"
+
+            "module_name"
+
+            "module_name_object_type_order"
+
+        The values of this dictionary are expected to be lists of tuples.
+        """
+        conf = cls()
+        if _GLOBAL_DICT_KEY in qconfig_dict:
+            conf.set_global(qconfig_dict[_GLOBAL_DICT_KEY])
+        for object_type, qconfig in qconfig_dict.get(_OBJECT_TYPE_DICT_KEY, []):
+            conf.set_object_type(object_type, qconfig)
+        for module_name_regex, qconfig in qconfig_dict.get(_MODULE_NAME_REGEX_DICT_KEY, []):
+            conf.set_module_name_regex(module_name_regex, qconfig)
+        for module_name, qconfig in qconfig_dict.get(_MODULE_NAME_DICT_KEY, []):
+            conf.set_module_name(module_name, qconfig)
+        for module_name, object_type, index, qconfig in qconfig_dict.get(_MODULE_NAME_OBJECT_TYPE_ORDER_DICT_KEY, []):
+            conf.set_module_name_object_type_order(module_name, object_type, index, qconfig)
+        return conf
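+
+# Illustrative round-trip sketch (hypothetical module name and qconfig choice):
+# the dictionary produced by to_dict() can be fed back through from_dict() to
+# rebuild an equivalent mapping.
+def _example_qconfig_mapping_round_trip():
+    original = QConfigMapping() \
+        .set_global(get_default_qconfig("x86")) \
+        .set_module_name("fc", None)
+    restored = QConfigMapping.from_dict(original.to_dict())
+    return restored.module_name_qconfigs == original.module_name_qconfigs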
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/quant_type.py b/MLPY/Lib/site-packages/torch/ao/quantization/quant_type.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1676d986388ac889ed97b40cb5e776f04dc96e0
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/quant_type.py
@@ -0,0 +1,30 @@
+import enum
+
+__all__ = [
+    "QuantType",
+]
+
+# Quantization type (dynamic quantization, static quantization).
+# Should match the c++ enum in quantization_type.h
+class QuantType(enum.IntEnum):
+    DYNAMIC = 0
+    STATIC = 1
+    QAT = 2
+    WEIGHT_ONLY = 3
+
+_quant_type_to_str = {
+    QuantType.STATIC: "static",
+    QuantType.DYNAMIC: "dynamic",
+    QuantType.QAT: "qat",
+    QuantType.WEIGHT_ONLY: "weight_only",
+}
+
+# TODO: make this private
+def _get_quant_type_to_str(quant_type: QuantType) -> str:
+    return _quant_type_to_str[quant_type]
+
+def _quant_type_from_str(name: str) -> QuantType:
+    for quant_type, s in _quant_type_to_str.items():
+        if name == s:
+            return quant_type
+    raise ValueError(f"Unknown QuantType name '{name}'")
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/quantization_mappings.py b/MLPY/Lib/site-packages/torch/ao/quantization/quantization_mappings.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f0afe0acd712e5740a3602f280a9386309cdeed
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/quantization_mappings.py
@@ -0,0 +1,348 @@
+import copy
+
+import torch
+from torch import nn
+
+import torch.nn.functional as F
+import torch.ao.nn.intrinsic as nni
+import torch.ao.nn.intrinsic.quantized as nniq
+import torch.ao.nn.intrinsic.quantized.dynamic as nniqd
+import torch.ao.nn.intrinsic.qat as nniqat
+import torch.ao.nn.quantized as nnq
+import torch.ao.nn.quantized.reference as nnqr
+import torch.ao.nn.quantized.dynamic as nnqd
+import torch.ao.nn.qat as nnqat
+import torch.ao.nn.qat.dynamic as nnqatd
+
+from typing import Optional, Union, Dict, Set, Callable, Any
+
+# Because `torch.ao.nn` uses lazy imports, we need to make
+# sure we import the contents explicitly here.
+import torch.ao.nn.sparse
+import torch.ao.nn as ao_nn
+from torch.ao.quantization.stubs import QuantStub, DeQuantStub
+from torch.ao.quantization.fake_quantize import (
+    default_fixed_qparams_range_0to1_fake_quant,
+    default_fixed_qparams_range_neg1to1_fake_quant,
+)
+from torch.ao.quantization.utils import get_combined_dict
+from torch.nn.utils.parametrize import type_before_parametrizations
+
+__all__ = [
+    "DEFAULT_REFERENCE_STATIC_QUANT_MODULE_MAPPINGS",
+    "DEFAULT_STATIC_QUANT_MODULE_MAPPINGS",
+    "DEFAULT_QAT_MODULE_MAPPINGS",
+    "DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS",
+    "DEFAULT_FLOAT_TO_QUANTIZED_OPERATOR_MAPPINGS",
+    "DEFAULT_MODULE_TO_ACT_POST_PROCESS",
+    "DEFAULT_STATIC_SPARSE_QUANT_MODULE_MAPPINGS",
+    "DEFAULT_DYNAMIC_SPARSE_QUANT_MODULE_MAPPINGS",
+    "no_observer_set",
+    "get_default_static_quant_module_mappings",
+    "get_default_static_quant_reference_module_mappings",
+    "get_embedding_static_quant_module_mappings",
+    "get_default_static_sparse_quant_module_mappings",
+    "get_static_quant_module_class",
+    "get_dynamic_quant_module_class",
+    "get_default_qat_module_mappings",
+    "get_embedding_qat_module_mappings",
+    "get_default_dynamic_quant_module_mappings",
+    "get_default_dynamic_sparse_quant_module_mappings",
+    "get_default_qconfig_propagation_list",
+    "get_default_compare_output_module_list",
+    "get_default_float_to_quantized_operator_mappings",
+    "get_quantized_operator",
+]
+
+# Default map for swapping float module to reference quantized modules
+DEFAULT_REFERENCE_STATIC_QUANT_MODULE_MAPPINGS : Dict[Callable, Any] = {
+    QuantStub: nnq.Quantize,
+    DeQuantStub: nnq.DeQuantize,
+    nn.Linear: nnqr.Linear,
+    nn.Conv1d: nnqr.Conv1d,
+    nn.Conv2d: nnqr.Conv2d,
+    nn.Conv3d: nnqr.Conv3d,
+    nn.ConvTranspose1d: nnqr.ConvTranspose1d,
+    nn.ConvTranspose2d: nnqr.ConvTranspose2d,
+    nn.ConvTranspose3d: nnqr.ConvTranspose3d,
+    nn.Embedding: nnqr.Embedding,
+    nn.EmbeddingBag: nnqr.EmbeddingBag,
+    nn.GRUCell: nnqr.GRUCell,
+    nn.LSTMCell: nnqr.LSTMCell,
+    nn.RNNCell: nnqr.RNNCell,
+    nn.LSTM: nnqr.LSTM,
+}
+
+# Default map for swapping float module to quantized ones
+DEFAULT_STATIC_QUANT_MODULE_MAPPINGS : Dict[Callable, Any] = {
+    QuantStub: nnq.Quantize,
+    DeQuantStub: nnq.DeQuantize,
+    nn.BatchNorm2d: nnq.BatchNorm2d,
+    nn.BatchNorm3d: nnq.BatchNorm3d,
+    nn.Dropout: nnq.Dropout,
+    nn.Conv1d: nnq.Conv1d,
+    nn.Conv2d: nnq.Conv2d,
+    nn.Conv3d: nnq.Conv3d,
+    nn.ConvTranspose1d: nnq.ConvTranspose1d,
+    nn.ConvTranspose2d: nnq.ConvTranspose2d,
+    nn.ConvTranspose3d: nnq.ConvTranspose3d,
+    nn.ELU: nnq.ELU,
+    nn.Embedding: nnq.Embedding,
+    nn.EmbeddingBag: nnq.EmbeddingBag,
+    nn.GroupNorm: nnq.GroupNorm,
+    nn.Hardswish: nnq.Hardswish,
+    nn.InstanceNorm1d: nnq.InstanceNorm1d,
+    nn.InstanceNorm2d: nnq.InstanceNorm2d,
+    nn.InstanceNorm3d: nnq.InstanceNorm3d,
+    nn.LayerNorm: nnq.LayerNorm,
+    nn.LeakyReLU: nnq.LeakyReLU,
+    nn.modules.linear.NonDynamicallyQuantizableLinear: nnq.Linear,
+    nn.Linear: nnq.Linear,
+    nn.ReLU6: nnq.ReLU6,
+    nn.Dropout: nnq.Dropout,
+    nn.PReLU: nnq.PReLU,
+    # Wrapper Modules:
+    nnq.FloatFunctional: nnq.QFunctional,
+    # Intrinsic modules:
+    nni.BNReLU2d: nniq.BNReLU2d,
+    nni.BNReLU3d: nniq.BNReLU3d,
+    nni.ConvReLU1d: nniq.ConvReLU1d,
+    nni.ConvReLU2d: nniq.ConvReLU2d,
+    nni.ConvReLU3d: nniq.ConvReLU3d,
+    nni.ConvAdd2d: nniq.ConvAdd2d,
+    nni.ConvAddReLU2d: nniq.ConvAddReLU2d,
+    nni.LinearReLU: nniq.LinearReLU,
+    nni.LinearLeakyReLU: nniq.LinearLeakyReLU,
+    nni.LinearTanh: nniq.LinearTanh,
+    nniqat.ConvBn1d: nnq.Conv1d,
+    nniqat.ConvBn2d: nnq.Conv2d,
+    nniqat.ConvBn3d: nnq.Conv3d,
+    nniqat.ConvBnReLU1d: nniq.ConvReLU1d,
+    nniqat.ConvBnReLU2d: nniq.ConvReLU2d,
+    nniqat.ConvBnReLU3d: nniq.ConvReLU3d,
+    nniqat.ConvReLU2d: nniq.ConvReLU2d,
+    nniqat.ConvReLU3d: nniq.ConvReLU3d,
+    nniqat.LinearReLU: nniq.LinearReLU,
+    nniqat.LinearBn1d: nnq.Linear,
+    # QAT modules:
+    nnqat.Linear: nnq.Linear,
+    nnqat.Conv2d: nnq.Conv2d,
+    nnqat.Conv3d: nnq.Conv3d,
+}
+
+# Default map for swapping float module to qat modules
+DEFAULT_QAT_MODULE_MAPPINGS : Dict[Callable, Any] = {
+    nn.Conv2d: nnqat.Conv2d,
+    nn.Conv3d: nnqat.Conv3d,
+    nn.Linear: nnqat.Linear,
+    nn.modules.linear.NonDynamicallyQuantizableLinear: nnqat.Linear,
+    # Intrinsic modules:
+    nni.ConvBn1d: nniqat.ConvBn1d,
+    nni.ConvBn2d: nniqat.ConvBn2d,
+    nni.ConvBn3d: nniqat.ConvBn3d,
+    nni.ConvBnReLU1d: nniqat.ConvBnReLU1d,
+    nni.ConvBnReLU2d: nniqat.ConvBnReLU2d,
+    nni.ConvBnReLU3d: nniqat.ConvBnReLU3d,
+    nni.ConvReLU2d: nniqat.ConvReLU2d,
+    nni.ConvReLU3d: nniqat.ConvReLU3d,
+    nni.LinearReLU: nniqat.LinearReLU,
+    nni.LinearBn1d: nniqat.LinearBn1d,
+}
+
+# Default map for swapping dynamic modules
+DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS : Dict[Callable, Any] = {
+    nn.GRUCell: nnqd.GRUCell,
+    nn.Linear: nnqd.Linear,
+    nnqatd.Linear: nnqd.Linear,
+    nn.modules.linear.NonDynamicallyQuantizableLinear: nnqd.Linear,
+    nn.LSTM: nnqd.LSTM,
+    nn.GRU: nnqd.GRU,
+    nn.LSTMCell: nnqd.LSTMCell,
+    nn.RNNCell: nnqd.RNNCell,
+    nni.LinearReLU: nniqd.LinearReLU,
+    nn.EmbeddingBag: nnq.EmbeddingBag,
+    nn.Embedding: nnq.Embedding,
+    # Don't want to enable these by default because the numerical
+    # accuracy is poor compared to other dynamic ops
+    # nn.Conv1d: nnqd.Conv1d,
+    # nn.Conv2d: nnqd.Conv2d,
+    # nn.Conv3d: nnqd.Conv3d,
+    # nn.ConvTranspose1d: nnqd.ConvTranspose1d,
+    # nn.ConvTranspose2d: nnqd.ConvTranspose2d,
+    # nn.ConvTranspose3d: nnqd.ConvTranspose3d,
+}
+
+# Allowlist for propagating the qconfig
+_INCLUDE_QCONFIG_PROPAGATE_LIST : Set[Callable] = {
+    nn.Sequential,
+}
+
+# Default mapping from floating point function or torch ops to quantized ops
+# TODO: merge with default static mapping
+DEFAULT_FLOAT_TO_QUANTIZED_OPERATOR_MAPPINGS : Dict[Union[Callable, str], Callable] = {
+    F.elu: torch.ops.quantized.elu,
+    F.hardswish: torch.ops.quantized.hardswish,
+    F.instance_norm: torch.ops.quantized.instance_norm,
+    F.layer_norm: torch.ops.quantized.layer_norm,
+    F.leaky_relu: torch.ops.quantized.leaky_relu,
+    F.dropout: torch.ops.quantized.dropout,
+}
+
+# mapping from module to output activation post process class
+DEFAULT_MODULE_TO_ACT_POST_PROCESS : Dict[Callable, Callable] = {
+    nn.Hardsigmoid: default_fixed_qparams_range_0to1_fake_quant,
+    nn.Sigmoid: default_fixed_qparams_range_0to1_fake_quant,
+    nn.Softmax: default_fixed_qparams_range_0to1_fake_quant,
+    nn.Tanh: default_fixed_qparams_range_neg1to1_fake_quant,
+}
+
+# Default map for swapping float module to static sparse quantized ones
+DEFAULT_STATIC_SPARSE_QUANT_MODULE_MAPPINGS : Dict[Callable, Any] = {
+    nn.Linear: ao_nn.sparse.quantized.Linear
+}
+
+# Default map for swapping float module to dynamic sparse quantized ones
+DEFAULT_DYNAMIC_SPARSE_QUANT_MODULE_MAPPINGS : Dict[Callable, Any] = {
+    nn.Linear: ao_nn.sparse.quantized.dynamic.Linear
+}
+
+def no_observer_set() -> Set[Any]:
+    r"""These modules cannot have observers inserted by default."""
+    no_observers = {
+        nn.quantizable.LSTM,
+        nn.quantizable.MultiheadAttention
+    }
+    return no_observers
+
+def get_default_static_quant_module_mappings() -> Dict[Callable, Any]:
+    ''' Get module mapping for post training static quantization
+    '''
+    return copy.deepcopy(DEFAULT_STATIC_QUANT_MODULE_MAPPINGS)
+
+def get_default_static_quant_reference_module_mappings() -> Dict[Callable, Any]:
+    ''' Get reference module mapping for post training static quantization
+    '''
+    return copy.deepcopy(DEFAULT_REFERENCE_STATIC_QUANT_MODULE_MAPPINGS)
+
+def get_embedding_static_quant_module_mappings() -> Dict[Callable, Any]:
+    ''' Get module mapping, including mapping for embedding QAT
+    '''
+    mapping = copy.deepcopy(DEFAULT_STATIC_QUANT_MODULE_MAPPINGS)
+    mapping[nnqat.EmbeddingBag] = nnq.EmbeddingBag
+    mapping[nnqat.Embedding] = nnq.Embedding
+    return mapping
+
+def get_default_static_sparse_quant_module_mappings() -> Dict[Callable, Any]:
+    ''' Get module mapping for post training static sparse quantization
+    '''
+    return copy.deepcopy(DEFAULT_STATIC_SPARSE_QUANT_MODULE_MAPPINGS)
+
+def get_static_quant_module_class(
+        float_module_class: Callable,
+        additional_static_quant_mapping: Optional[Dict[Callable, Any]] = None,
+        is_reference: bool = False) -> Any:
+    r"""n Get the statically quantized module class corresponding to
+    the floating point module class
+    """
+    if additional_static_quant_mapping is None:
+        additional_static_quant_mapping = {}
+    all_mappings = get_combined_dict(
+        DEFAULT_REFERENCE_STATIC_QUANT_MODULE_MAPPINGS if is_reference
+        else DEFAULT_STATIC_QUANT_MODULE_MAPPINGS, additional_static_quant_mapping)
+    static_quant_module_class = all_mappings.get(float_module_class, None)
+    assert static_quant_module_class is not None, \
+        f"Floating point module class {str(float_module_class)}" + \
+        " does not have a corresponding quantized module class"
+    return copy.deepcopy(static_quant_module_class)
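+
+# Illustrative sketch: looking up the quantized counterpart of a float module
+# class; with the default static mapping above, nn.Linear resolves to nnq.Linear.
+def _example_static_quant_lookup():
+    qlinear_cls = get_static_quant_module_class(nn.Linear)
+    return qlinear_cls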
+
+def get_dynamic_quant_module_class(
+        float_module_class: Callable,
+        additional_dynamic_quant_mapping: Optional[Dict[Callable, Any]] = None) -> Any:
+    r"""n Get the dynamically quantized module class corresponding to
+    the floating point module class
+    """
+    if additional_dynamic_quant_mapping is None:
+        additional_dynamic_quant_mapping = {}
+    all_mappings = get_combined_dict(DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS, additional_dynamic_quant_mapping)
+    dynamic_quant_module_class = all_mappings.get(float_module_class, None)
+    assert dynamic_quant_module_class is not None, \
+        f"Floating point module class {str(float_module_class)}" + \
+        " does not have a corresponding quantized module class"
+    return copy.deepcopy(dynamic_quant_module_class)
+
+def get_default_qat_module_mappings() -> Dict[Callable, Any]:
+    ''' Get default module mapping for quantization aware training
+    '''
+    return copy.deepcopy(DEFAULT_QAT_MODULE_MAPPINGS)
+
+def get_embedding_qat_module_mappings() -> Dict[Callable, Any]:
+    ''' Get module mapping for quantization aware training
+        This includes the default values in addition to
+        enabling qat for embeddings.
+    '''
+    mapping = copy.deepcopy(DEFAULT_QAT_MODULE_MAPPINGS)
+    mapping[nn.EmbeddingBag] = nnqat.EmbeddingBag
+    mapping[nn.Embedding] = nnqat.Embedding
+    return mapping
+
+def get_default_dynamic_quant_module_mappings() -> Dict[Callable, Any]:
+    ''' Get module mapping for post training dynamic quantization
+    '''
+    return DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS
+
+def get_default_dynamic_sparse_quant_module_mappings() -> Dict[Callable, Any]:
+    ''' Get module mapping for post training dynamic sparse quantization
+    '''
+    return DEFAULT_DYNAMIC_SPARSE_QUANT_MODULE_MAPPINGS
+
+def get_default_qconfig_propagation_list() -> Set[Callable]:
+    ''' Get the default list of module types that we'll attach qconfig
+    attribute to in prepare
+    '''
+    QCONFIG_PROPAGATE_MODULE_CLASS_LIST = (
+        set(DEFAULT_STATIC_QUANT_MODULE_MAPPINGS.keys()) |
+        set(DEFAULT_QAT_MODULE_MAPPINGS.keys()) |
+        set(DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS.keys()) |
+        _INCLUDE_QCONFIG_PROPAGATE_LIST
+    )
+    return copy.deepcopy(QCONFIG_PROPAGATE_MODULE_CLASS_LIST)
+
+def get_default_compare_output_module_list() -> Set[Callable]:
+    ''' Get the list of module class types whose outputs we will record
+    in the numeric suite
+    '''
+    NUMERIC_SUITE_COMPARE_MODEL_OUTPUT_MODULE_LIST = (
+        set(DEFAULT_STATIC_QUANT_MODULE_MAPPINGS.values())
+        | set(DEFAULT_QAT_MODULE_MAPPINGS.values())
+        | set(DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS.values())
+        | set(DEFAULT_STATIC_QUANT_MODULE_MAPPINGS.keys())
+        | set(DEFAULT_QAT_MODULE_MAPPINGS.keys())
+        | set(DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS.keys())
+        | _INCLUDE_QCONFIG_PROPAGATE_LIST
+    )
+    return copy.deepcopy(NUMERIC_SUITE_COMPARE_MODEL_OUTPUT_MODULE_LIST)
+
+def get_default_float_to_quantized_operator_mappings(
+) -> Dict[Union[Callable, str], Callable]:
+    return copy.deepcopy(DEFAULT_FLOAT_TO_QUANTIZED_OPERATOR_MAPPINGS)
+
+# TODO: merge with get_static_quant_module_class
+def get_quantized_operator(float_op: Union[Callable, str]) -> Callable:
+    ''' Get the quantized operator corresponding to the float operator
+    '''
+    quantized_op = DEFAULT_FLOAT_TO_QUANTIZED_OPERATOR_MAPPINGS.get(float_op, None)
+    assert quantized_op is not None, \
+        f'Operator {str(float_op)} does not have corresponding quantized op'
+    return quantized_op
+
+def _get_special_act_post_process(module: torch.nn.Module) -> Optional[Callable]:
+    r""" Get the special activation post process for `module`, this has
+    higher priority than the activation post process in `qconfig`
+    e.g.
+    input: torch.nn.Sigmoid
+    output: default_affine_fixed_qparam_fake_quant
+    """
+    return DEFAULT_MODULE_TO_ACT_POST_PROCESS.get(type_before_parametrizations(module), None)
+
+def _has_special_act_post_process(module: torch.nn.Module) -> bool:
+    return module.training and type(module) in DEFAULT_MODULE_TO_ACT_POST_PROCESS
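+
+# Illustrative sketch: a Sigmoid module in training mode picks up the fixed
+# qparams fake quant from DEFAULT_MODULE_TO_ACT_POST_PROCESS, which takes
+# priority over whatever activation observer its qconfig would provide.
+def _example_special_act_post_process():
+    sigmoid = nn.Sigmoid().train()
+    assert _has_special_act_post_process(sigmoid)
+    return _get_special_act_post_process(sigmoid)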
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/quantize.py b/MLPY/Lib/site-packages/torch/ao/quantization/quantize.py
new file mode 100644
index 0000000000000000000000000000000000000000..f02818a99eaf0bdd666b12864bb1f8c590f22842
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/quantize.py
@@ -0,0 +1,664 @@
+import copy
+import itertools
+import warnings
+
+import torch
+import torch.nn as nn
+import torch.ao.nn.quantized as nnq
+from torch.ao.nn.intrinsic import _FusedModule
+
+from torch.ao.quantization.quantization_mappings import (
+    get_default_dynamic_quant_module_mappings,
+    get_default_static_quant_module_mappings,
+    get_default_static_quant_reference_module_mappings,
+    get_default_qat_module_mappings,
+    get_default_qconfig_propagation_list,
+    no_observer_set,
+    _has_special_act_post_process,
+    _get_special_act_post_process,
+)
+from .utils import get_qparam_dict, has_no_children_ignoring_parametrizations
+from torch.ao.quantization.stubs import DeQuantStub, QuantWrapper
+from torch.ao.quantization.qconfig import (
+    _add_module_to_qconfig_obs_ctr,
+    default_dynamic_qconfig,
+    float16_dynamic_qconfig,
+    float_qparams_weight_only_qconfig,
+    float_qparams_weight_only_qconfig_4bit,
+    _activation_is_memoryless)
+from torch.nn.utils.parametrize import type_before_parametrizations
+from torch.ao.quantization.observer import _is_activation_post_process
+
+# TODO remove this once BC is no longer required to avoid a SEV
+from torch.ao.quantization.observer import (   # noqa: F401
+    _is_activation_post_process as is_activation_post_process
+)
+
+__all__ = [
+    "get_default_custom_config_dict",
+    "propagate_qconfig_",
+    "add_quant_dequant",
+    "prepare",
+    "quantize",
+    "quantize_dynamic",
+    "prepare_qat",
+    "quantize_qat",
+    "convert",
+    "swap_module",
+]
+
+_DEFAULT_CUSTOM_CONFIG_DICT = {
+    'float_to_observed_custom_module_class': {
+        nn.LSTM: nn.quantizable.LSTM,
+        nn.MultiheadAttention: nn.quantizable.MultiheadAttention,
+    },
+    'observed_to_quantized_custom_module_class': {
+        nn.quantizable.LSTM: nn.quantized.LSTM,
+        nn.quantizable.MultiheadAttention: nn.quantized.MultiheadAttention,
+    }
+}
+
+def get_default_custom_config_dict():
+    r"""Defines the default custom config dict.
+    """
+    return _DEFAULT_CUSTOM_CONFIG_DICT
+
+def _propagate_qconfig_helper(module, qconfig_dict,
+                              qconfig_parent=None, prefix='', prepare_custom_config_dict=None):
+    r"""This is a helper function for `propagate_qconfig_`
+
+    Args:
+        module: input module
+        qconfig_dict: dictionary that maps from name of submodule to quantization
+                     configuration
+        qconfig_parent: quantization config of the parent module; we will fall back to
+                       this config when there is no specified config for the current
+                       module
+        prefix: corresponding prefix of the current module, used as key in
+                qconfig_dict
+        prepare_custom_config_dict: dictionary for custom handling of modules
+                                    see docs for :func:`~torch.ao.quantization.prepare_fx`
+
+    Return:
+        None, module is modified inplace with qconfig attached
+    """
+
+    module_qconfig = qconfig_dict.get(type_before_parametrizations(module), qconfig_parent)
+    module_qconfig = qconfig_dict.get(prefix, module_qconfig)
+    module_qconfig = getattr(module, 'qconfig', module_qconfig)
+
+    torch.ao.quantization.qconfig._assert_valid_qconfig(module_qconfig, module)
+
+    qconfig_with_device_check = _add_module_to_qconfig_obs_ctr(module_qconfig, module)
+    module.qconfig = qconfig_with_device_check
+
+    for name, child in module.named_children():
+        module_prefix = prefix + '.' + name if prefix else name
+        # do not propagate qconfig to child if child is non-traceable
+        if prepare_custom_config_dict is None or not (
+            name in prepare_custom_config_dict.get("non_traceable_module_name", [])
+            or type(child) in prepare_custom_config_dict.get("non_traceable_module_class", [])
+        ):
+            _propagate_qconfig_helper(
+                child, qconfig_dict, qconfig_with_device_check, module_prefix
+            )
+
+def propagate_qconfig_(module, qconfig_dict=None, prepare_custom_config_dict=None):
+    r"""Propagate qconfig through the module hierarchy and assign `qconfig`
+    attribute on each leaf module
+
+    Args:
+        module: input module
+        qconfig_dict: dictionary that maps from name or type of submodule to
+            quantization configuration, qconfig applies to all submodules of a
+            given module unless qconfig for the submodules are specified (when
+            the submodule already has qconfig attribute)
+        prepare_custom_config_dict: dictionary for custom handling of modules
+            see docs for :func:`~torch.ao.quantization.prepare_fx`
+
+    Return:
+        None, module is modified inplace with qconfig attached
+    """
+    if qconfig_dict is None:
+        qconfig_dict = {}
+    if prepare_custom_config_dict is None:
+        prepare_custom_config_dict = {}
+    _propagate_qconfig_helper(module, qconfig_dict, prepare_custom_config_dict=prepare_custom_config_dict)
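+
+# Illustrative sketch (hypothetical two-layer model): after propagation, every
+# submodule carries a .qconfig attribute derived from the type-keyed dict below.
+def _example_propagate_qconfig():
+    model = nn.Sequential(nn.Linear(4, 4), nn.ReLU())
+    propagate_qconfig_(model, qconfig_dict={nn.Linear: default_dynamic_qconfig})
+    # Only the Linear submodule receives a non-None qconfig here.
+    return [getattr(m, 'qconfig', None) for m in model.modules()]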
+
+def _observer_forward_hook(self, input, output):
+    r"""Forward hook that calls observer on the output
+    """
+    return self.activation_post_process(output)
+
+def _observer_forward_pre_hook(self, input):
+    r"""Forward pre hook that calls observer on the output
+    """
+    return self.activation_post_process(input[0])
+
+def _register_activation_post_process_hook(module, pre_hook=False):
+    assert hasattr(module, 'activation_post_process'), \
+        'Expect activation_post_process attribute already attached to the module'
+    if pre_hook:
+        handle = module.register_forward_pre_hook(
+            _observer_forward_pre_hook, prepend=True
+        )
+    else:
+        handle = module.register_forward_hook(
+            _observer_forward_hook, prepend=True
+        )
+
+
+def _add_observer_(module, qconfig_propagation_list=None, non_leaf_module_list=None, device=None, custom_module_class_mapping=None):
+    r"""Add observer for the leaf child of the module.
+
+    This function inserts observer modules into all leaf child modules that
+    have a valid qconfig attribute.
+
+    Args:
+        module: input module with qconfig attributes for all the leaf modules that we want to quantize
+        qconfig_propagation_list: a list of quantizable modules that will have observers added to them
+            if they are leaf nodes
+        device: parent device, if any
+        non_leaf_module_list: list of non-leaf modules we want to add observer
+
+    Return:
+        None, module is modified inplace with added observer modules and forward_hooks
+    """
+    if qconfig_propagation_list is None:
+        qconfig_propagation_list = get_default_qconfig_propagation_list()
+
+    if custom_module_class_mapping is None:
+        custom_module_class_mapping = {}
+
+    # respect device affinity when adding observers
+    if device is None:
+        devices = _get_unique_devices_(module)
+        assert len(devices) <= 1, (
+            f"_add_observer_ only works with cpu or single-device CUDA modules, but got devices {devices}"
+        )
+        device = next(iter(devices)) if len(devices) > 0 else None
+
+    def get_activation_post_process(qconfig, device, special_act_post_process=None):
+        activation = qconfig.activation() if special_act_post_process is None else special_act_post_process()
+        if device is not None:
+            activation.to(device)
+        return activation
+
+    def needs_observation(m):
+        return hasattr(m, 'qconfig') and m.qconfig is not None
+
+    def insert_activation_post_process(m, special_act_post_process=None):
+        """ Adds an activation post process module and register
+        a pre or post hook that calls the module
+        """
+        # We don't insert observer/fake_quantize for DeQuantStub
+        if needs_observation(m) and not isinstance(m, DeQuantStub):
+            # observer and hook will be gone after we swap the module
+            m.add_module('activation_post_process', get_activation_post_process(
+                m.qconfig, device, special_act_post_process))
+            # Register observer as the first entry in the hook list
+            # All post forward hooks are preserved and will be executed after the observer before convert
+            _register_activation_post_process_hook(m, pre_hook=_activation_is_memoryless(m.qconfig))
+
+    for name, child in module.named_children():
+        # TODO remove Dropout special after codebase stable
+        if type_before_parametrizations(child) in [nn.Dropout]:
+            continue
+        elif issubclass(type_before_parametrizations(child), (nnq.FloatFunctional, nnq.QFunctional)):
+            if needs_observation(child):
+                assert hasattr(child, "activation_post_process"), (
+                    f"functional class {type_before_parametrizations(child)} has no pre-defined `activation_post_process`"
+                )
+                child.activation_post_process = get_activation_post_process(child.qconfig, device)
+        elif isinstance(child, _FusedModule):
+            # activation_post_process are now added directly to nn.Sequential/_FusedModule
+            if needs_observation(child):
+                insert_activation_post_process(child)
+        elif non_leaf_module_list is not None and type_before_parametrizations(child) in non_leaf_module_list:
+            if needs_observation(child):
+                insert_activation_post_process(child)
+        elif _has_special_act_post_process(child):
+            special_act_post_process = _get_special_act_post_process(child)
+            insert_activation_post_process(child, special_act_post_process)
+        elif needs_observation(child) and type_before_parametrizations(child) in custom_module_class_mapping:
+            observed_child = custom_module_class_mapping[type_before_parametrizations(child)].from_float(child)
+            setattr(module, name, observed_child)
+            # TODO: These are the modules that cannot be observed
+            #       Once there are more, we should move them to a separate list
+            if custom_module_class_mapping[type_before_parametrizations(child)] not in no_observer_set():
+                insert_activation_post_process(observed_child)
+        else:
+            _add_observer_(child, qconfig_propagation_list, non_leaf_module_list, device, custom_module_class_mapping)
+
+    # Insert observers only for leaf nodes. Note that this observer is for
+    # the output of the module; inputs are observed by QuantStub
+    if has_no_children_ignoring_parametrizations(module) and not isinstance(module, torch.nn.Sequential) \
+       and type_before_parametrizations(module) in qconfig_propagation_list:
+        insert_activation_post_process(module)
+
+def _get_unique_devices_(module):
+    return {p.device for p in module.parameters()} | \
+        {p.device for p in module.buffers()}
+
+def add_quant_dequant(module):
+    r"""Wrap the leaf child module in QuantWrapper if it has a valid qconfig
+    Note that this function will modify the children of module inplace and it
+    can return a new module which wraps the input module as well.
+
+    Args:
+        module: input module with qconfig attributes for all the leaf modules
+        that we want to quantize
+
+    Return:
+        Either the inplace modified module with submodules wrapped in
+        `QuantWrapper` based on qconfig or a new `QuantWrapper` module which
+        wraps the input module, the latter case only happens when the input
+        module is a leaf module and we want to quantize it.
+    """
+    if has_no_children_ignoring_parametrizations(module) and hasattr(module, 'qconfig') and module.qconfig:
+        return QuantWrapper(module)
+
+    for name, child in module.named_children():
+        module._modules[name] = add_quant_dequant(child)
+    return module
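+
+# Illustrative sketch (hypothetical qconfig choice): wrapping a leaf module that
+# already carries a qconfig produces a QuantWrapper around it.
+def _example_add_quant_dequant():
+    linear = nn.Linear(4, 4)
+    linear.qconfig = torch.ao.quantization.get_default_qconfig("fbgemm")
+    return add_quant_dequant(linear)   # a QuantWrapper containing linear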
+
+def prepare(model, inplace=False, allow_list=None,
+            observer_non_leaf_module_list=None,
+            prepare_custom_config_dict=None):
+    r"""Prepares a copy of the model for quantization calibration or quantization-aware training.
+
+    Quantization configuration should be assigned preemptively
+    to individual submodules in `.qconfig` attribute.
+
+    The model will be attached with observer or fake quant modules, and qconfig
+    will be propagated.
+
+    Args:
+        `model`: input model to be modified in-place
+        `inplace`: carry out model transformations in-place, the original module is mutated
+        `allow_list`: list of quantizable modules
+        `observer_non_leaf_module_list`: list of non-leaf modules we want to add observer
+        `prepare_custom_config_dict`: customization configuration dictionary for prepare function
+
+    .. code-block:: python
+
+       # Example of prepare_custom_config_dict:
+       prepare_custom_config_dict = {
+           # user will manually define the corresponding observed
+           # module class which has a from_float class method that converts
+           # float custom module to observed custom module
+           "float_to_observed_custom_module_class": {
+               CustomModule: ObservedCustomModule
+           }
+        }
+
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize.prepare")
+    if prepare_custom_config_dict is None:
+        prepare_custom_config_dict = get_default_custom_config_dict()
+    custom_module_class_mapping = prepare_custom_config_dict.get("float_to_observed_custom_module_class", {})
+
+    if not inplace:
+        model = copy.deepcopy(model)
+
+    # TODO: remove allow_list
+    qconfig_propagation_list = allow_list
+    if allow_list is None:
+        qconfig_propagation_list = get_default_qconfig_propagation_list()
+    propagate_qconfig_(model, qconfig_dict=None)
+
+    # sanity check common API misusage
+    if not any(hasattr(m, 'qconfig') and m.qconfig for m in model.modules()):
+        warnings.warn("None of the submodule got qconfig applied. Make sure you "
+                      "passed correct configuration through `qconfig_dict` or "
+                      "by assigning the `.qconfig` attribute directly on submodules")
+
+    _add_observer_(
+        model, qconfig_propagation_list, observer_non_leaf_module_list,
+        custom_module_class_mapping=custom_module_class_mapping)
+    return model
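+
+# Illustrative eager-mode sketch (hypothetical model and calibration data): the
+# usual prepare -> calibrate -> convert flow built from the APIs in this module.
+def _example_eager_static_quantization(calibration_batches):
+    model = nn.Sequential(
+        torch.ao.quantization.QuantStub(),
+        nn.Linear(4, 4),
+        torch.ao.quantization.DeQuantStub(),
+    )
+    model.eval()
+    model.qconfig = torch.ao.quantization.get_default_qconfig("fbgemm")
+    prepare(model, inplace=True)
+    for batch in calibration_batches:   # user-supplied calibration tensors
+        model(batch)
+    return convert(model, inplace=True)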
+
+def _remove_activation_post_process(module):
+    # TODO: maybe we should change activation_post_process to _activation_post_process
+    # to prevent it from being used by user
+    if hasattr(module, 'activation_post_process') and \
+       _is_activation_post_process(module.activation_post_process):
+        delattr(module, 'activation_post_process')
+
+    # remove activation_post_process pre and post hooks
+    def remove_hooks(pre_hook=False):
+        hook_map = module._forward_pre_hooks if pre_hook else module._forward_hooks
+        observer_hook = _observer_forward_pre_hook if pre_hook else _observer_forward_hook
+        handle_ids_to_remove = set()
+        for handle_id, hook_fn in hook_map.items():
+            if hook_fn is observer_hook:
+                handle_ids_to_remove.add(handle_id)
+        for handle_id in handle_ids_to_remove:
+            hook_map.pop(handle_id)
+
+    remove_hooks(pre_hook=True)
+    remove_hooks(pre_hook=False)
+
+# TODO: rename to something more general
+def _remove_qconfig(module):
+    r"""Clean up the qconfig left in the module so that new qconfig can be
+    propagated.
+
+    Args:
+        module: module to be cleaned up
+    """
+    for child in module.children():
+        _remove_qconfig(child)
+
+    if hasattr(module, "qconfig"):
+        del module.qconfig
+
+    _remove_activation_post_process(module)
+
+def quantize(model, run_fn, run_args, mapping=None, inplace=False):
+    r"""Quantize the input float model with post training static quantization.
+
+    First it will prepare the model for calibration, then it calls
+    `run_fn`, which runs the calibration step; after that the model is
+    converted to a quantized model.
+
+    Args:
+        model: input float model
+        run_fn: a calibration function for calibrating the prepared model
+        run_args: positional arguments for `run_fn`
+        inplace: carry out model transformations in-place, the original module is mutated
+        mapping: correspondence between original module types and quantized counterparts
+
+    Return:
+        Quantized model.
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize.quantize")
+    if mapping is None:
+        mapping = get_default_static_quant_module_mappings()
+    if not inplace:
+        model = copy.deepcopy(model)
+    model.eval()
+    prepare(model, inplace=True)
+    run_fn(model, *run_args)
+    convert(model, mapping, inplace=True)
+    return model
+
+def quantize_dynamic(model, qconfig_spec=None, dtype=torch.qint8,
+                     mapping=None, inplace=False):
+    r"""Converts a float model to dynamic (i.e. weights-only) quantized model.
+
+    Replaces specified modules with dynamic weight-only quantized versions and outputs the quantized model.
+
+    For the simplest usage, provide the `dtype` argument, which can be float16 or qint8. Weight-only quantization
+    is performed by default for layers with large weight sizes - i.e. Linear and RNN variants.
+
+    Fine grained control is possible with `qconfig` and `mapping` that act similarly to `quantize()`.
+    If `qconfig` is provided, the `dtype` argument is ignored.
+
+    Args:
+        model: input model
+        qconfig_spec: Either:
+
+            - A dictionary that maps from name or type of submodule to quantization
+              configuration, qconfig applies to all submodules of a given
+              module unless qconfig for the submodules are specified (when the
+              submodule already has qconfig attribute). Entries in the dictionary
+              need to be QConfig instances.
+
+            - A set of types and/or submodule names to apply dynamic quantization to,
+              in which case the `dtype` argument is used to specify the bit-width
+
+        inplace: carry out model transformations in-place, the original module is mutated
+        mapping: maps type of a submodule to a type of corresponding dynamically quantized version
+            with which the submodule needs to be replaced
+
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize.quantize_dynamic")
+    if qconfig_spec is None:
+        if dtype == torch.qint8:
+            qconfig_spec = {
+                nn.Linear : default_dynamic_qconfig,
+                nn.LSTM : default_dynamic_qconfig,
+                nn.GRU : default_dynamic_qconfig,
+                nn.LSTMCell : default_dynamic_qconfig,
+                nn.RNNCell : default_dynamic_qconfig,
+                nn.GRUCell : default_dynamic_qconfig,
+            }
+        elif dtype == torch.float16:
+            qconfig_spec = {
+                nn.Linear : float16_dynamic_qconfig,
+                nn.LSTM : float16_dynamic_qconfig,
+                nn.GRU : float16_dynamic_qconfig,
+                nn.LSTMCell : float16_dynamic_qconfig,
+                nn.RNNCell : float16_dynamic_qconfig,
+                nn.GRUCell : float16_dynamic_qconfig,
+            }
+        elif dtype == torch.quint8:
+            qconfig_spec = {
+                nn.EmbeddingBag : float_qparams_weight_only_qconfig,
+                nn.Embedding : float_qparams_weight_only_qconfig,
+            }
+        elif dtype == torch.quint4x2:
+            qconfig_spec = {
+                nn.EmbeddingBag : float_qparams_weight_only_qconfig_4bit,
+            }
+        else:
+            raise ValueError(
+                f"Don't know how to quantize with default settings for {dtype}. Provide full qconfig please")
+    elif isinstance(qconfig_spec, set):
+        if dtype is torch.qint8:
+            default_qconfig = default_dynamic_qconfig
+        elif dtype is torch.float16:
+            default_qconfig = float16_dynamic_qconfig
+        elif dtype is torch.quint8:
+            default_qconfig = float_qparams_weight_only_qconfig
+        elif dtype is torch.quint4x2:
+            default_qconfig = float_qparams_weight_only_qconfig_4bit
+        else:
+            raise RuntimeError('Unknown dtype specified for quantize_dynamic: ', str(dtype))
+        qconfig_spec = dict(zip(qconfig_spec, itertools.repeat(default_qconfig)))
+
+    if mapping is None:
+        mapping = get_default_dynamic_quant_module_mappings()
+
+    if not inplace:
+        model = copy.deepcopy(model)
+    model.eval()
+    propagate_qconfig_(model, qconfig_spec)
+    convert(model, mapping, inplace=True)
+    return model
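+
+# Illustrative sketch (hypothetical float model): dynamic weight-only
+# quantization of the Linear layers, selected by type via a set-based
+# qconfig_spec; everything else is left untouched.
+def _example_quantize_dynamic():
+    float_model = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 2))
+    return quantize_dynamic(float_model, {nn.Linear}, dtype=torch.qint8)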
+
+def prepare_qat(model, mapping=None, inplace=False):
+    r"""
+    Prepares a copy of the model for quantization calibration or
+    quantization-aware training and converts it to its fake-quantized version.
+
+    Quantization configuration should be assigned preemptively
+    to individual submodules in `.qconfig` attribute.
+
+    Args:
+        model: input model to be modified in-place
+        mapping: dictionary that maps float modules to quantized modules to be
+                 replaced.
+        inplace: carry out model transformations in-place, the original module
+                 is mutated
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize.prepare_qat")
+    assert model.training, "prepare_qat only works on models in training mode"
+    if mapping is None:
+        mapping = get_default_qat_module_mappings()
+
+    if not inplace:
+        model = copy.deepcopy(model)
+
+    propagate_qconfig_(model, qconfig_dict=None)
+    convert(model, mapping=mapping, inplace=True, remove_qconfig=False)
+    prepare(model, observer_non_leaf_module_list=set(mapping.values()), inplace=True)
+    return model
+
+def quantize_qat(model, run_fn, run_args, inplace=False):
+    r"""Do quantization aware training and output a quantized model
+
+    Args:
+        model: input model
+        run_fn: a function for evaluating the prepared model, can be a
+                function that simply runs the prepared model or a training
+                loop
+        run_args: positional arguments for `run_fn`
+
+    Return:
+        Quantized model.
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize.quantize_qat")
+    if not inplace:
+        model = copy.deepcopy(model)
+    model.train()
+    prepare_qat(model, inplace=True)
+    run_fn(model, *run_args)
+    convert(model, inplace=True)
+    return model
+
+def convert(
+        module, mapping=None, inplace=False, remove_qconfig=True,
+        is_reference=False, convert_custom_config_dict=None):
+    r"""Converts submodules in input module to a different module according to `mapping`
+    by calling `from_float` method on the target module class. And remove qconfig at the
+    end if remove_qconfig is set to True.
+
+    Args:
+        `module`: prepared and calibrated module
+        `mapping`: a dictionary that maps from source module type to target
+                   module type, can be overwritten to allow swapping user defined
+                   Modules
+        `inplace`: carry out model transformations in-place, the original module
+                   is mutated
+        `convert_custom_config_dict`: custom configuration dictionary for convert function
+
+    .. code-block:: python
+
+       # Example of convert_custom_config_dict:
+       convert_custom_config_dict = {
+           # user will manually define the corresponding quantized
+           # module class which has a from_observed class method that converts
+           # observed custom module to quantized custom module
+           "observed_to_quantized_custom_module_class": {
+               ObservedCustomModule: QuantizedCustomModule
+           }
+       }
+
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize.convert")
+    if not inplace:
+        module = copy.deepcopy(module)
+    _convert(
+        module, mapping, inplace=True, is_reference=is_reference,
+        convert_custom_config_dict=convert_custom_config_dict)
+    if remove_qconfig:
+        _remove_qconfig(module)
+    return module
+
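+# A minimal eager-mode static PTQ sketch that ends in convert() (illustrative only;
+# `MyModel` and `calibration_loader` are hypothetical placeholders; a complete example
+# would typically also wrap the model with QuantStub/DeQuantStub):
+#
+#   import torch
+#   from torch.ao.quantization import get_default_qconfig, prepare, convert
+#
+#   model = MyModel().eval()
+#   model.qconfig = get_default_qconfig('fbgemm')
+#   prepare(model, inplace=True)              # insert observers
+#   with torch.no_grad():
+#       for batch in calibration_loader:      # calibrate observers
+#           model(batch)
+#   quantized = convert(model)                # swap observed modules for quantized ones
+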
+def _convert(
+        module, mapping=None, inplace=False,
+        is_reference=False, convert_custom_config_dict=None):
+    r"""Converts submodules in input module to a different module according to `mapping`
+    by calling `from_float` method on the target module class
+
+    Args:
+        module: input module
+        mapping: a dictionary that maps from source module type to target
+                 module type, can be overwritten to allow swapping user defined
+                 Modules
+        inplace: carry out model transformations in-place, the original module
+                 is mutated
+        is_reference: a flag to enable quantized reference module
+
+    """
+    if mapping is None:
+        mapping = get_default_static_quant_reference_module_mappings() if is_reference \
+            else get_default_static_quant_module_mappings()
+    if convert_custom_config_dict is None:
+        convert_custom_config_dict = get_default_custom_config_dict()
+    custom_module_class_mapping = convert_custom_config_dict.get("observed_to_quantized_custom_module_class", {})
+
+    if not inplace:
+        module = copy.deepcopy(module)
+    reassign = {}
+    for name, mod in module.named_children():
+        # both fused modules and observed custom modules are
+        # swapped as one unit
+        if not isinstance(mod, _FusedModule) and \
+           type_before_parametrizations(mod) not in custom_module_class_mapping:
+            _convert(mod, mapping, True,  # inplace
+                     is_reference, convert_custom_config_dict)
+        reassign[name] = swap_module(mod, mapping, custom_module_class_mapping)
+
+    for key, value in reassign.items():
+        module._modules[key] = value
+
+    return module
+
+def swap_module(mod, mapping, custom_module_class_mapping):
+    r"""Swaps the module if it has a quantized counterpart and it has an
+    `observer` attached.
+
+    Args:
+        mod: input module
+        mapping: a dictionary that maps from nn module to nnq module
+
+    Return:
+        The corresponding quantized module of `mod`
+    """
+    new_mod = mod
+    if hasattr(mod, 'qconfig') and mod.qconfig is not None:
+        swapped = False
+        if type_before_parametrizations(mod) in custom_module_class_mapping:
+            new_mod = custom_module_class_mapping[type_before_parametrizations(mod)].from_observed(mod)
+            swapped = True
+        elif type_before_parametrizations(mod) in mapping:
+            qmod = mapping[type_before_parametrizations(mod)]
+            if hasattr(qmod, '_IS_REFERENCE') and qmod._IS_REFERENCE:
+                assert mod.qconfig is not None
+                weight_post_process = mod.qconfig.weight()
+                weight_post_process(mod.weight)
+                weight_qparams = get_qparam_dict(weight_post_process)
+                new_mod = qmod.from_float(mod, weight_qparams)
+            else:
+                new_mod = qmod.from_float(mod)
+            swapped = True
+
+        if swapped:
+            # Preserve module's pre forward hooks. They'll be called on quantized input
+            for pre_hook_fn in mod._forward_pre_hooks.values():
+                new_mod.register_forward_pre_hook(pre_hook_fn)
+            # Preserve module's post forward hooks except _observer_forward_hook
+            # After convert they'll work with quantized output
+            for hook_fn in mod._forward_hooks.values():
+                if hook_fn is not _observer_forward_hook:
+                    new_mod.register_forward_hook(hook_fn)
+
+            # respect device affinity when swapping modules
+            devices = _get_unique_devices_(mod)
+            assert len(devices) <= 1, (
+                f"swap_module only works with cpu or single-device CUDA modules, but got devices {devices}"
+            )
+            device = next(iter(devices)) if len(devices) > 0 else None
+            if device:
+                new_mod.to(device)
+    return new_mod
+
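+# Conceptually, swap_module performs replacements like the following (illustrative
+# sketch; assumes the default static quant mapping, where nn.Linear resolves to
+# torch.ao.nn.quantized.Linear):
+#
+#   import torch
+#   import torch.ao.nn.quantized as nnq
+#   from torch.ao.quantization import get_default_qconfig, prepare
+#
+#   model = torch.nn.Sequential(torch.nn.Linear(4, 4)).eval()
+#   model.qconfig = get_default_qconfig('fbgemm')
+#   prepare(model, inplace=True)
+#   model(torch.randn(1, 4))                    # run once so observers see data
+#   q_linear = nnq.Linear.from_float(model[0])  # what swap_module calls via the mapping
+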
+def _get_observer_dict(mod, target_dict, prefix=""):
+    r"""Traverse the modules and save all observers into dict.
+    This is mainly used for quantization accuracy debug
+    Args:
+        mod: the top module we want to save all observers
+        prefix: the prefix for the current module
+        target_dict: the dictionary used to save all the observers
+    """
+    def get_prefix(prefix):
+        return prefix if prefix == "" else prefix + '.'
+
+    if hasattr(mod, 'activation_post_process'):
+        target_dict[get_prefix(prefix) + 'activation_post_process'] = mod.activation_post_process
+    for name, child in mod.named_children():
+        module_prefix = get_prefix(prefix) + name if prefix else name
+        _get_observer_dict(child, target_dict, module_prefix)
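+
+# A small debugging sketch for _get_observer_dict (illustrative only; `prepared_model`
+# is assumed to be a model that already went through prepare() and calibration):
+#
+#   observers = {}
+#   _get_observer_dict(prepared_model, observers)
+#   for name, obs in observers.items():
+#       print(name, obs.calculate_qparams())   # inspect per-observer scale/zero_point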
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/quantize_fx.py b/MLPY/Lib/site-packages/torch/ao/quantization/quantize_fx.py
new file mode 100644
index 0000000000000000000000000000000000000000..b42fcee88c32e8e357ccf9e961e7774bfd52fe98
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/quantize_fx.py
@@ -0,0 +1,726 @@
+from typing import Any, Dict, Optional, Tuple, Union
+import warnings
+
+import torch
+import copy
+from torch.fx import GraphModule
+from torch.fx.graph_module import _USER_PRESERVED_ATTRIBUTES_KEY
+from .fx.tracer import QuantizationTracer
+from .fx.tracer import (  # noqa: F401
+    Scope,
+    ScopeContextManager
+)
+from .fx.fuse import fuse  # noqa: F401
+from .fx.prepare import prepare  # noqa: F401
+from .fx.convert import convert
+from .backend_config import (  # noqa: F401
+    BackendConfig,
+    get_tensorrt_backend_config,
+)
+from .fx.graph_module import ObservedGraphModule  # noqa: F401
+from .fx.custom_config import (
+    ConvertCustomConfig,
+    FuseCustomConfig,
+    PrepareCustomConfig,
+)
+from .fx.utils import get_custom_module_class_keys  # noqa: F401
+from .fx.utils import get_skipped_module_name_and_classes
+from .qconfig_mapping import QConfigMapping
+
+def attach_preserved_attrs_to_model(
+    model: Union[GraphModule, torch.nn.Module],
+    preserved_attrs: Dict[str, Any],
+) -> None:
+    """ Store preserved attributes to the model.meta so that it can be preserved during deepcopy
+    """
+    model.meta[_USER_PRESERVED_ATTRIBUTES_KEY] = copy.copy(preserved_attrs)  # type: ignore[operator, index, assignment]
+    # set the preserved attributes in the model so that user can call
+    # model.attr as they do before calling fx graph mode quantization
+    for attr_name, attr in model.meta[_USER_PRESERVED_ATTRIBUTES_KEY].items():  # type: ignore[index, union-attr]
+        setattr(model, attr_name, attr)
+
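+# How preserved attributes typically flow through the FX APIs (illustrative sketch;
+# `float_model` and `example_inputs` are hypothetical placeholders, and `version` is
+# just an example attribute name):
+#
+#   from torch.ao.quantization import get_default_qconfig_mapping
+#   from torch.ao.quantization.fx.custom_config import PrepareCustomConfig
+#   from torch.ao.quantization.quantize_fx import prepare_fx
+#
+#   prepare_custom_config = PrepareCustomConfig().set_preserved_attributes(["version"])
+#   prepared = prepare_fx(
+#       float_model,
+#       get_default_qconfig_mapping("fbgemm"),
+#       example_inputs,
+#       prepare_custom_config=prepare_custom_config,
+#   )
+#   assert prepared.version == float_model.version   # re-attached via model.meta
+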
+def _check_is_graph_module(model: torch.nn.Module) -> None:
+    if not isinstance(model, GraphModule):
+        raise ValueError(
+            "input model must be a GraphModule, "
+            + "Got type:"
+            + str(type(model))
+            + " Please make "
+            + "sure to follow the tutorials."
+        )
+
+def _attach_meta_to_node_if_not_exist(model: GraphModule) -> None:
+    """ Attach meta field to all nodes of the graph if it does not exist,
+    meta field is a field stores some meta information about the node, such
+    as dtype and shape information for output of the node, this only exists
+    if the program is captured by make_fx (used in quantize_pt2e flow), if
+    the program is captured by torch.fx symbolic tracing, this field may not exist,
+    so we add it here to avoid checking this all over the places
+    """
+    for node in model.graph.nodes:
+        if not hasattr(node, "meta"):
+            node.meta = {}
+
+def _swap_ff_with_fxff(model: torch.nn.Module) -> None:
+    r""" Swap FloatFunctional with FXFloatFunctional
+    """
+    modules_to_swap = []
+    for name, module in model.named_children():
+        if isinstance(module, torch.ao.nn.quantized.FloatFunctional):
+            modules_to_swap.append(name)
+        else:
+            _swap_ff_with_fxff(module)
+
+    for name in modules_to_swap:
+        del model._modules[name]
+        model._modules[name] = torch.ao.nn.quantized.FXFloatFunctional()
+
+
+def _fuse_fx(
+    model: GraphModule,
+    is_qat: bool,
+    fuse_custom_config: Union[FuseCustomConfig, Dict[str, Any], None] = None,
+    backend_config: Union[BackendConfig, Dict[str, Any], None] = None,
+) -> GraphModule:
+    r""" Internal helper function to fuse modules in preparation for quantization
+
+    Args:
+        model: GraphModule object from symbolic tracing (torch.fx.symbolic_trace)
+    """
+    _check_is_graph_module(model)
+    return fuse(
+        model, is_qat, fuse_custom_config, backend_config)  # type: ignore[operator]
+
+def _prepare_fx(
+    model: torch.nn.Module,
+    qconfig_mapping: Union[QConfigMapping, Dict[str, Any]],
+    is_qat: bool,
+    example_inputs: Tuple[Any, ...],
+    prepare_custom_config: Union[PrepareCustomConfig, Dict[str, Any], None] = None,
+    _equalization_config: Optional[Union[QConfigMapping, Dict[str, Any]]] = None,
+    backend_config: Union[BackendConfig, Dict[str, Any], None] = None,
+    is_standalone_module: bool = False,
+) -> GraphModule:
+    r""" Internal helper function for prepare_fx
+    Args:
+      `model`, `qconfig_mapping`, `prepare_custom_config`, `_equalization_config`:
+      see docs for :func:`~torch.ao.quantization.prepare_fx`
+      `is_standalone_module`: a boolean flag indicating whether we are
+      quantizing a standalone module. A standalone module is a submodule
+      of the parent module that is not inlined in the forward graph of
+      the parent module; the way we quantize a standalone module is
+      described in :func:`~torch.ao.quantization._prepare_standalone_module_fx`
+    """
+    if prepare_custom_config is None:
+        prepare_custom_config = PrepareCustomConfig()
+    if _equalization_config is None:
+        _equalization_config = QConfigMapping()
+
+    if isinstance(prepare_custom_config, Dict):
+        warnings.warn(
+            "Passing a prepare_custom_config_dict to prepare is deprecated and will not be supported "
+            "in a future version. Please pass in a PrepareCustomConfig instead.")
+        prepare_custom_config = PrepareCustomConfig.from_dict(prepare_custom_config)
+
+    # swap FloatFunctional with FXFloatFunctional
+    _swap_ff_with_fxff(model)
+
+    skipped_module_names, skipped_module_classes = \
+        get_skipped_module_name_and_classes(prepare_custom_config, is_standalone_module)
+    preserved_attr_names = prepare_custom_config.preserved_attributes
+    preserved_attrs = {attr: getattr(model, attr) for attr in preserved_attr_names if hasattr(model, attr)}
+    # symbolically trace the model
+    tracer = QuantizationTracer(skipped_module_names, skipped_module_classes)  # type: ignore[arg-type]
+    graph_module = GraphModule(model, tracer.trace(model))
+    _attach_meta_to_node_if_not_exist(graph_module)
+
+    fuse_custom_config = FuseCustomConfig().set_preserved_attributes(prepare_custom_config.preserved_attributes)
+    graph_module = _fuse_fx(
+        graph_module,
+        is_qat,
+        fuse_custom_config,
+        backend_config)
+    prepared = prepare(
+        graph_module,
+        qconfig_mapping,
+        is_qat,
+        tracer.node_name_to_scope,
+        example_inputs=example_inputs,
+        prepare_custom_config=prepare_custom_config,
+        _equalization_config=_equalization_config,
+        backend_config=backend_config,
+        is_standalone_module=is_standalone_module,
+    )  # type: ignore[operator]
+
+    attach_preserved_attrs_to_model(prepared, preserved_attrs)
+    return prepared
+
+
+def _prepare_standalone_module_fx(
+    model: torch.nn.Module,
+    qconfig_mapping: Union[QConfigMapping, Dict[str, Any]],
+    is_qat: bool,
+    example_inputs: Tuple[Any, ...],
+    prepare_custom_config: Union[PrepareCustomConfig, Dict[str, Any], None] = None,
+    backend_config: Union[BackendConfig, Dict[str, Any], None] = None,
+) -> GraphModule:
+    r""" [Internal use only] Prepare a standalone module, so that it can be used when quantizing the
+    parent module.
+    standalone_module means it is a submodule that is not inlined in the parent module,
+    and will be quantized separately as one unit.
+
+    How the standalone module is observed is specified by `input_quantized_idxs` and
+    `output_quantized_idxs` in the prepare_custom_config for the standalone module
+
+    Returns:
+
+        * model(GraphModule): prepared standalone module. It has these attributes in
+          model.meta:
+
+            * `standalone_module_input_quantized_idxs(List[Int])`: a list of
+              indexes for the graph inputs that are expected to be quantized,
+              same as the input_quantized_idxs configuration provided
+              for the standalone module
+            * `standalone_module_output_quantized_idxs(List[Int])`: a list of
+              indexes for the graph outputs that are quantized,
+              same as the output_quantized_idxs configuration provided
+              for the standalone module
+
+    """
+    return _prepare_fx(
+        model,
+        qconfig_mapping,
+        is_qat,
+        example_inputs,
+        prepare_custom_config,
+        backend_config=backend_config,
+        is_standalone_module=True,
+    )
+
+
+def fuse_fx(
+    model: torch.nn.Module,
+    fuse_custom_config: Union[FuseCustomConfig, Dict[str, Any], None] = None,
+    backend_config: Union[BackendConfig, Dict[str, Any], None] = None,
+) -> GraphModule:
+    r""" Fuse modules like conv+bn, conv+bn+relu etc, model must be in eval mode.
+    Fusion rules are defined in torch.ao.quantization.fx.fusion_pattern.py
+
+    Args:
+
+        * `model` (torch.nn.Module): a torch.nn.Module model
+        * `fuse_custom_config` (FuseCustomConfig): custom configurations for fuse_fx.
+            See :class:`~torch.ao.quantization.fx.custom_config.FuseCustomConfig` for more details
+    Example::
+
+        from torch.ao.quantization import fuse_fx
+        m = Model().eval()
+        m = fuse_fx(m)
+
+    """
+    if fuse_custom_config is None:
+        fuse_custom_config = FuseCustomConfig()
+
+    if isinstance(fuse_custom_config, Dict):
+        warnings.warn(
+            "Passing a fuse_custom_config_dict to fuse is deprecated and will not be supported "
+            "in a future version. Please pass in a FuseCustomConfig instead.")
+        fuse_custom_config = FuseCustomConfig.from_dict(fuse_custom_config)
+
+    torch._C._log_api_usage_once("quantization_api.quantize_fx.fuse_fx")
+    preserved_attr_names = fuse_custom_config.preserved_attributes
+    preserved_attrs = {attr: getattr(model, attr) for attr in preserved_attr_names if hasattr(model, attr)}
+
+    graph_module = torch.fx.symbolic_trace(model)
+    _attach_meta_to_node_if_not_exist(graph_module)
+    graph_module = _fuse_fx(graph_module, False, fuse_custom_config, backend_config)
+
+    attach_preserved_attrs_to_model(graph_module, preserved_attrs)
+    return graph_module
+
+def prepare_fx(
+    model: torch.nn.Module,
+    qconfig_mapping: Union[QConfigMapping, Dict[str, Any]],
+    example_inputs: Tuple[Any, ...],
+    prepare_custom_config: Union[PrepareCustomConfig, Dict[str, Any], None] = None,
+    _equalization_config: Optional[Union[QConfigMapping, Dict[str, Any]]] = None,
+    backend_config: Union[BackendConfig, Dict[str, Any], None] = None,
+) -> GraphModule:
+    r""" Prepare a model for post training quantization
+
+    Args:
+      * `model` (torch.nn.Module): torch.nn.Module model
+
+      * `qconfig_mapping` (QConfigMapping): QConfigMapping object to configure how a model is
+         quantized, see :class:`~torch.ao.quantization.qconfig_mapping.QConfigMapping`
+         for more details
+
+      * `example_inputs` (Tuple[Any, ...]): Example inputs for forward function of the model,
+         Tuple of positional args (keyword args can be passed as positional args as well)
+
+      * `prepare_custom_config` (PrepareCustomConfig): customization configuration for quantization tool.
+          See :class:`~torch.ao.quantization.fx.custom_config.PrepareCustomConfig` for more details
+
+      * `_equalization_config`: config for specifying how to perform equalization on the model
+
+      * `backend_config` (BackendConfig): config that specifies how operators are quantized
+         in a backend, this includes how the operators are observed,
+         supported fusion patterns, how quantize/dequantize ops are
+         inserted, supported dtypes etc. See :class:`~torch.ao.quantization.backend_config.BackendConfig` for more details
+
+    Return:
+      A GraphModule with observer (configured by qconfig_mapping), ready for calibration
+
+    Example::
+
+        import torch
+        from torch.ao.quantization import get_default_qconfig_mapping
+        from torch.ao.quantization.quantize_fx import prepare_fx
+
+        class Submodule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(5, 5)
+            def forward(self, x):
+                x = self.linear(x)
+                return x
+
+        class M(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(5, 5)
+                self.sub = Submodule()
+
+            def forward(self, x):
+                x = self.linear(x)
+                x = self.sub(x) + x
+                return x
+
+        # initialize a floating point model
+        float_model = M().eval()
+
+        # define calibration function
+        def calibrate(model, data_loader):
+            model.eval()
+            with torch.no_grad():
+                for image, target in data_loader:
+                    model(image)
+
+        # qconfig is the configuration for how we insert observers for a particular
+        # operator
+        # qconfig = get_default_qconfig("fbgemm")
+        # Example of customizing qconfig:
+        # qconfig = torch.ao.quantization.QConfig(
+        #    activation=MinMaxObserver.with_args(dtype=torch.qint8),
+        #    weight=MinMaxObserver.with_args(dtype=torch.qint8))
+        # `activation` and `weight` are constructors of observer module
+
+        # qconfig_mapping is a collection of quantization configurations, user can
+        # set the qconfig for each operator (torch op calls, functional calls, module calls)
+        # in the model through qconfig_mapping
+        # the following call will get the qconfig_mapping that works best for models
+        # that target "fbgemm" backend
+        qconfig_mapping = get_default_qconfig_mapping("fbgemm")
+
+        # We can customize qconfig_mapping in different ways.
+        # e.g. set the global qconfig, which means we will use the same qconfig for
+        # all operators in the model, this can be overwritten by other settings
+        # qconfig_mapping = QConfigMapping().set_global(qconfig)
+        # e.g. quantize the linear submodule with a specific qconfig
+        # qconfig_mapping = QConfigMapping().set_module_name("linear", qconfig)
+        # e.g. quantize all nn.Linear modules with a specific qconfig
+        # qconfig_mapping = QConfigMapping().set_object_type(torch.nn.Linear, qconfig)
+        # for a more complete list, please see the docstring for :class:`torch.ao.quantization.QConfigMapping`
+        # argument
+
+        # example_inputs is a tuple of inputs, that is used to infer the type of the
+        # outputs in the model
+        # currently it's not used, but please make sure model(*example_inputs) runs
+        example_inputs = (torch.randn(1, 3, 224, 224),)
+
+        # TODO: add backend_config after we split the backend_config for fbgemm and qnnpack
+        # e.g. backend_config = get_default_backend_config("fbgemm")
+        # `prepare_fx` inserts observers in the model based on qconfig_mapping and
+        # backend_config. If the configuration for an operator in qconfig_mapping
+        # is supported in the backend_config (meaning it's supported by the target
+        # hardware), we'll insert observer modules according to the qconfig_mapping
+        # otherwise the configuration in qconfig_mapping will be ignored
+        #
+        # Example:
+        # in qconfig_mapping, user sets linear module to be quantized with quint8 for
+        # activation and qint8 for weight:
+        # qconfig = torch.ao.quantization.QConfig(
+        #     activation=MinMaxObserver.with_args(dtype=torch.quint8),
+        #     weight=MinMaxObserver.with_args(dtype=torch.qint8))
+        # Note: current qconfig api does not support setting output observer, but
+        # we may extend this to support these more fine grained control in the
+        # future
+        #
+        # qconfig_mapping = QConfigMapping().set_object_type(torch.nn.Linear, qconfig)
+        # in the backend config, the linear module also supports this configuration:
+        # weighted_int8_dtype_config = DTypeConfig(
+        #   input_dtype=torch.quint8,
+        #   output_dtype=torch.quint8,
+        #   weight_dtype=torch.qint8,
+        #   bias_dtype=torch.float)
+
+        # linear_pattern_config = BackendPatternConfig(torch.nn.Linear) \
+        #    .set_observation_type(ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT) \
+        #    .add_dtype_config(weighted_int8_dtype_config) \
+        #    ...
+
+        # backend_config = BackendConfig().set_backend_pattern_config(linear_pattern_config)
+        # `prepare_fx` will check that the setting requested by the user in qconfig_mapping
+        # is supported by the backend_config and insert observers and fake quant modules
+        # in the model
+        prepared_model = prepare_fx(float_model, qconfig_mapping, example_inputs)
+        # Run calibration
+        calibrate(prepared_model, sample_inference_data)
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize_fx.prepare_fx")
+    return _prepare_fx(
+        model,
+        qconfig_mapping,
+        False,  # is_qat
+        example_inputs,
+        prepare_custom_config,
+        _equalization_config,
+        backend_config,
+    )
+
+
+def prepare_qat_fx(
+    model: torch.nn.Module,
+    qconfig_mapping: Union[QConfigMapping, Dict[str, Any]],
+    example_inputs: Tuple[Any, ...],
+    prepare_custom_config: Union[PrepareCustomConfig, Dict[str, Any], None] = None,
+    backend_config: Union[BackendConfig, Dict[str, Any], None] = None,
+) -> GraphModule:
+    r""" Prepare a model for quantization aware training
+
+    Args:
+      * `model` (torch.nn.Module): torch.nn.Module model
+      * `qconfig_mapping` (QConfigMapping): see :func:`~torch.ao.quantization.prepare_fx`
+      * `example_inputs` (Tuple[Any, ...]): see :func:`~torch.ao.quantization.prepare_fx`
+      * `prepare_custom_config` (PrepareCustomConfig): see :func:`~torch.ao.quantization.prepare_fx`
+      * `backend_config` (BackendConfig): see :func:`~torch.ao.quantization.prepare_fx`
+
+    Return:
+      A GraphModule with fake quant modules (configured by qconfig_mapping and backend_config), ready for
+      quantization aware training
+
+    Example::
+
+        import torch
+        from torch.ao.quantization import get_default_qat_qconfig_mapping
+        from torch.ao.quantization.quantize_fx import prepare_qat_fx
+
+        class Submodule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(5, 5)
+            def forward(self, x):
+                x = self.linear(x)
+                return x
+
+        class M(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(5, 5)
+                self.sub = Submodule()
+
+            def forward(self, x):
+                x = self.linear(x)
+                x = self.sub(x) + x
+                return x
+
+        # initialize a floating point model
+        float_model = M().train()
+        # (optional, but preferred) load the weights from pretrained model
+        # float_model.load_weights(...)
+
+        # define the training loop for quantization aware training
+        def train_loop(model, train_data):
+            model.train()
+            for image, target in train_data:
+                ...
+
+        # qconfig is the configuration for how we insert observers for a particular
+        # operator
+        # qconfig = get_default_qconfig("fbgemm")
+        # Example of customizing qconfig:
+        # qconfig = torch.ao.quantization.QConfig(
+        #    activation=FakeQuantize.with_args(observer=MinMaxObserver.with_args(dtype=torch.qint8)),
+        #    weight=FakeQuantize.with_args(observer=MinMaxObserver.with_args(dtype=torch.qint8)))
+        # `activation` and `weight` are constructors of observer module
+
+        # qconfig_mapping is a collection of quantization configurations, user can
+        # set the qconfig for each operator (torch op calls, functional calls, module calls)
+        # in the model through qconfig_mapping
+        # the following call will get the qconfig_mapping that works best for models
+        # that target "fbgemm" backend
+        qconfig_mapping = get_default_qat_qconfig_mapping("fbgemm")
+
+        # We can customize qconfig_mapping in different ways, please take a look at
+        # the docstring for :func:`~torch.ao.quantization.prepare_fx` for different ways
+        # to configure this
+
+        # example_inputs is a tuple of inputs, that is used to infer the type of the
+        # outputs in the model
+        # currently it's not used, but please make sure model(*example_inputs) runs
+        example_inputs = (torch.randn(1, 3, 224, 224),)
+
+        # TODO: add backend_config after we split the backend_config for fbgemm and qnnpack
+        # e.g. backend_config = get_default_backend_config("fbgemm")
+        # `prepare_qat_fx` inserts observers in the model based on qconfig_mapping and
+        # backend_config, if the configuration for an operator in qconfig_mapping
+        # is supported in the backend_config (meaning it's supported by the target
+        # hardware), we'll insert fake_quantize modules according to the qconfig_mapping
+        # otherwise the configuration in qconfig_mapping will be ignored
+        # see :func:`~torch.ao.quantization.prepare_fx` for a detailed explanation of
+        # how qconfig_mapping interacts with backend_config
+        prepared_model = prepare_qat_fx(float_model, qconfig_mapping, example_inputs)
+        # Run training
+        train_loop(prepared_model, train_data)
+
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize_fx.prepare_qat_fx")
+    return _prepare_fx(
+        model,
+        qconfig_mapping,
+        True,  # is_qat
+        example_inputs,
+        prepare_custom_config,
+        backend_config=backend_config,
+    )
+
+
+def _convert_fx(
+    graph_module: GraphModule,
+    is_reference: bool,
+    convert_custom_config: Union[ConvertCustomConfig, Dict[str, Any], None] = None,
+    is_standalone_module: bool = False,
+    _remove_qconfig: bool = True,
+    qconfig_mapping: Union[QConfigMapping, Dict[str, Any], None] = None,
+    backend_config: Union[BackendConfig, Dict[str, Any], None] = None,
+    is_decomposed: bool = False,
+) -> GraphModule:
+    """ `is_standalone_module`: see docs in :func:`~torch.ao.quantization.prepare_standalone_module_fx`
+    """
+    if convert_custom_config is None:
+        convert_custom_config = ConvertCustomConfig()
+
+    if isinstance(convert_custom_config, Dict):
+        warnings.warn(
+            "Passing a convert_custom_config_dict to convert is deprecated and will not be supported "
+            "in a future version. Please pass in a ConvertCustomConfig instead.")
+        convert_custom_config = ConvertCustomConfig.from_dict(convert_custom_config)
+
+    _check_is_graph_module(graph_module)
+    preserved_attr_names = convert_custom_config.preserved_attributes
+    preserved_attrs = {attr: getattr(graph_module, attr) for attr in preserved_attr_names if hasattr(graph_module, attr)}
+
+    quantized = convert(
+        graph_module,
+        is_reference,
+        convert_custom_config,
+        is_standalone_module,
+        _remove_qconfig_flag=_remove_qconfig,
+        qconfig_mapping=qconfig_mapping,
+        backend_config=backend_config,
+        is_decomposed=is_decomposed,
+    )
+
+    attach_preserved_attrs_to_model(quantized, preserved_attrs)
+    return quantized
+
+
+def convert_fx(
+    graph_module: GraphModule,
+    convert_custom_config: Union[ConvertCustomConfig, Dict[str, Any], None] = None,
+    _remove_qconfig: bool = True,
+    qconfig_mapping: Union[QConfigMapping, Dict[str, Any], None] = None,
+    backend_config: Union[BackendConfig, Dict[str, Any], None] = None,
+) -> GraphModule:
+    r""" Convert a calibrated or trained model to a quantized model
+
+    Args:
+        * `graph_module` (torch.fx.GraphModule): A prepared and calibrated/trained model (GraphModule)
+
+        * `convert_custom_config` (ConvertCustomConfig): custom configurations for convert function.
+            See :class:`~torch.ao.quantization.fx.custom_config.ConvertCustomConfig` for more details
+
+        * `_remove_qconfig` (bool): Option to remove the qconfig attributes in the model after convert.
+
+        * `qconfig_mapping` (QConfigMapping): config for specifying how to convert a model for quantization.
+
+           The keys must include the ones in the qconfig_mapping passed to `prepare_fx` or `prepare_qat_fx`,
+           with the same values or `None`. Additional keys can be specified with values set to `None`.
+
+          For each entry whose value is set to None, we skip quantizing that entry in the model::
+
+            qconfig_mapping = (QConfigMapping()
+                .set_global(qconfig_from_prepare)
+                .set_object_type(torch.add, None)  # skip quantizing torch.add
+                .set_object_type(torch.nn.functional.linear, qconfig_from_prepare)
+                .set_module_name("foo.bar", None)  # skip quantizing module "foo.bar"
+            )
+
+        * `backend_config` (BackendConfig): A configuration for the backend which describes how
+            operators should be quantized in the backend, this includes quantization
+            mode support (static/dynamic/weight_only), dtype support (quint8/qint8 etc.),
+            observer placement for each operators and fused operators.
+            See :class:`~torch.ao.quantization.backend_config.BackendConfig` for more details
+
+    Return:
+        A quantized model (torch.nn.Module)
+
+    Example::
+
+        # prepared_model: the model after prepare_fx/prepare_qat_fx and calibration/training
+        # convert_fx converts a calibrated/trained model to a quantized model for the
+        # target hardware, this includes converting the model first to a reference
+        # quantized model, and then lowering the reference quantized model to a backend.
+        # Currently, the supported backends are fbgemm (onednn) and qnnpack (xnnpack);
+        # they share the same set of quantized operators, so we use the same
+        # lowering procedure
+        #
+        # backend_config defines the corresponding reference quantized module for
+        # the weighted modules in the model, e.g. nn.Linear
+        # TODO: add backend_config after we split the backend_config for fbgemm and qnnpack
+        # e.g. backend_config = get_default_backend_config("fbgemm")
+        quantized_model = convert_fx(prepared_model)
+
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize_fx.convert_fx")
+    return _convert_fx(
+        graph_module,
+        is_reference=False,
+        convert_custom_config=convert_custom_config,
+        _remove_qconfig=_remove_qconfig,
+        qconfig_mapping=qconfig_mapping,
+        backend_config=backend_config,
+    )
+
+
+def convert_to_reference_fx(
+    graph_module: GraphModule,
+    convert_custom_config: Union[ConvertCustomConfig, Dict[str, Any], None] = None,
+    _remove_qconfig: bool = True,
+    qconfig_mapping: Union[QConfigMapping, Dict[str, Any], None] = None,
+    backend_config: Union[BackendConfig, Dict[str, Any], None] = None,
+) -> GraphModule:
+    r""" Convert a calibrated or trained model to a reference quantized model,
+    see https://github.com/pytorch/rfcs/blob/master/RFC-0019-Extending-PyTorch-Quantization-to-Custom-Backends.md for more details,
+    reference quantized model is a standard representation of a quantized model provided
+    by FX Graph Mode Quantization, it can be further lowered to run on the target
+    hardware, like accelerators
+
+    Args:
+        * `graph_module` (GraphModule): A prepared and calibrated/trained model (GraphModule)
+
+        * `convert_custom_config` (ConvertCustomConfig): custom configurations for convert function.
+            See :func:`~torch.ao.quantization.quantize_fx.convert_fx` for more details.
+
+        * `_remove_qconfig` (bool): Option to remove the qconfig attributes in the model after convert.
+
+        * `qconfig_mapping` (QConfigMapping): config for specifying how to convert a model for quantization.
+            See :func:`~torch.ao.quantization.quantize_fx.convert_fx` for more details.
+
+        * `backend_config` (BackendConfig): A configuration for the backend which describes how
+            operators should be quantized in the backend. See
+            :func:`~torch.ao.quantization.quantize_fx.convert_fx` for more details.
+
+    Return:
+        A reference quantized model (GraphModule)
+
+    Example::
+
+        # prepared_model: the model after prepare_fx/prepare_qat_fx and calibration/training
+        # TODO: add backend_config after we split the backend_config for fbgemm and qnnpack
+        # e.g. backend_config = get_default_backend_config("fbgemm")
+        reference_quantized_model = convert_to_reference_fx(prepared_model)
+
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize_fx.convert_to_reference_fx")
+    return _convert_fx(
+        graph_module,
+        is_reference=True,
+        convert_custom_config=convert_custom_config,
+        _remove_qconfig=_remove_qconfig,
+        qconfig_mapping=qconfig_mapping,
+        backend_config=backend_config,
+    )
+
+def _convert_to_reference_decomposed_fx(
+    graph_module: GraphModule,
+    convert_custom_config: Union[ConvertCustomConfig, Dict[str, Any], None] = None,
+    qconfig_mapping: Union[QConfigMapping, Dict[str, Any], None] = None,
+    backend_config: Union[BackendConfig, Dict[str, Any], None] = None,
+) -> GraphModule:
+    r""" Convert a calibrated or trained model to a reference quantized model, with
+    decomposed representation for quantized Tensor
+    see https://github.com/pytorch/rfcs/blob/master/RFC-0019-Extending-PyTorch-Quantization-to-Custom-Backends.md for more details,
+    reference quantized model is a standard representation of a quantized model provided
+    by FX Graph Mode Quantization, it can be further lowered to run on the target
+    hardware, like accelerators
+
+    Note: this is not public API
+
+    Args:
+        * `graph_module` (GraphModule): A prepared and calibrated/trained model (GraphModule)
+
+        * `convert_custom_config` (ConvertCustomConfig): custom configurations for convert function.
+            See :func:`~torch.ao.quantization.quantize_fx.convert_fx` for more details.
+
+        * `qconfig_mapping` (QConfigMapping): config for specifying how to convert a model for quantization.
+            See :func:`~torch.ao.quantization.quantize_fx.convert_fx` for more details.
+
+        * `backend_config` (BackendConfig): A configuration for the backend which describes how
+            operators should be quantized in the backend. See
+            :func:`~torch.ao.quantization.quantize_fx.convert_fx` for more details.
+
+    Return:
+        A reference quantized model (GraphModule) with operators working with decomposed quantized Tensor
+
+    Example::
+
+        # prepared_model: the model after prepare_fx/prepare_qat_fx and calibration/training
+        # TODO: add backend_config after we split the backend_config for fbgemm and qnnpack
+        # e.g. backend_config = get_default_backend_config("fbgemm")
+        reference_quantized_model = _convert_to_reference_decomposed_fx(prepared_model)
+
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize_fx._convert_to_reference_decomposed_fx")
+    return _convert_fx(
+        graph_module,
+        is_reference=True,
+        convert_custom_config=convert_custom_config,
+        _remove_qconfig=False,
+        qconfig_mapping=qconfig_mapping,
+        backend_config=backend_config,
+        is_decomposed=True,
+    )
+
+
+def _convert_standalone_module_fx(
+    graph_module: GraphModule,
+    is_reference: bool = False,
+    convert_custom_config: Union[ConvertCustomConfig, Dict[str, Any], None] = None,
+) -> GraphModule:
+    r""" [Internal use only] Convert a model produced by :func:`~torch.ao.quantization.prepare_standalone_module_fx`
+    and convert it to a quantized model
+
+    Returns a quantized standalone module, whether input/output is quantized is
+    specified by prepare_custom_config, with
+    input_quantized_idxs, output_quantized_idxs, please
+    see docs for prepare_fx for details
+    """
+    return _convert_fx(
+        graph_module,
+        is_reference,
+        convert_custom_config,
+        is_standalone_module=True,
+    )
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/quantize_jit.py b/MLPY/Lib/site-packages/torch/ao/quantization/quantize_jit.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3184b8f963133f57e76121ffa7656eb1b2f6af2
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/quantize_jit.py
@@ -0,0 +1,335 @@
+
+import torch
+from torch.ao.quantization.qconfig import QConfig
+from torch.ao.quantization.quant_type import QuantType
+from torch.jit._recursive import wrap_cpp_module
+
+__all__ = [
+    "script_qconfig",
+    "script_qconfig_dict",
+    "fuse_conv_bn_jit",
+    "prepare_jit",
+    "prepare_dynamic_jit",
+    "convert_jit",
+    "convert_dynamic_jit",
+    "quantize_jit",
+    "quantize_dynamic_jit",
+]
+
+def _check_is_script_module(model):
+    if not isinstance(model, torch.jit.ScriptModule):
+        raise ValueError('input must be a script module, got: ' + str(type(model)))
+
+def _check_forward_method(model):
+    if not model._c._has_method('forward'):
+        raise ValueError('input script module does not have forward method')
+
+def script_qconfig(qconfig):
+    r"""Instantiate the activation and weight observer modules and script
+    them; these observer module instances will be deepcopied during the
+    prepare_jit step.
+    """
+    return QConfig(
+        activation=torch.jit.script(qconfig.activation())._c,
+        weight=torch.jit.script(qconfig.weight())._c)
+
+def script_qconfig_dict(qconfig_dict):
+    r"""Helper function used by `prepare_jit`.
+    Apply `script_qconfig` to all entries in `qconfig_dict` that are
+    not None.
+    """
+    return {k: script_qconfig(v) if v else None for k, v in qconfig_dict.items()}
+
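+# A small sketch of how these helpers are used ahead of prepare_jit (illustrative only):
+#
+#   from torch.ao.quantization import get_default_qconfig
+#
+#   qconfig_dict = {'': get_default_qconfig('fbgemm')}
+#   scripted_qconfig_dict = script_qconfig_dict(qconfig_dict)
+#   # each value now holds scripted activation/weight observers, ready to be
+#   # deep-copied into the module during prepare_jit
+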
+def fuse_conv_bn_jit(model, inplace=False):
+    r""" Fuse conv - bn module
+    Works for eval model only.
+
+    Args:
+        model: TorchScript model from scripting or tracing
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize_jit.fuse_conv_bn_jit")
+    model_c = model._c
+    model_c = torch._C._jit_pass_fold_convbn(model_c)
+    if inplace:
+        model._reconstruct(model_c)
+    else:
+        model = wrap_cpp_module(model_c)
+    return model
+
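+# Minimal usage sketch for fuse_conv_bn_jit (illustrative only; `float_model` is a
+# hypothetical eval-mode model containing conv/bn pairs):
+#
+#   import torch
+#
+#   ts_model = torch.jit.script(float_model.eval())
+#   fused_model = fuse_conv_bn_jit(ts_model)   # returns a new module unless inplace=True
+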
+def _prepare_jit(model, qconfig_dict, inplace=False, quant_type=QuantType.STATIC):
+    _check_is_script_module(model)
+    _check_forward_method(model)
+    if not all(isinstance(x, str) for x in qconfig_dict.keys()):
+        raise ValueError('qconfig_dict should only contain names(str) as keys.')
+    scripted_qconfig_dict = script_qconfig_dict(qconfig_dict)
+    model = fuse_conv_bn_jit(model, inplace)
+    model_c = torch._C._jit_pass_insert_observers(model._c,
+                                                  'forward',
+                                                  scripted_qconfig_dict,
+                                                  inplace,
+                                                  quant_type)
+    if inplace:
+        model._reconstruct(model_c)
+    else:
+        model = wrap_cpp_module(model_c)
+    return model
+
+def _prepare_ondevice_jit(model, qconfig_dict, method_name='forward', inplace=False, quant_type=QuantType.STATIC):
+    _check_is_script_module(model)
+    if not all(isinstance(x, str) for x in qconfig_dict.keys()):
+        raise ValueError('qconfig_dict should only contain names(str) as keys.')
+    scripted_qconfig_dict = script_qconfig_dict(qconfig_dict)
+    method_graph = model._c._get_method(method_name).graph
+    torch._C._jit_pass_inline(method_graph)
+    model = fuse_conv_bn_jit(model, inplace)
+    model_c = torch._C._jit_pass_insert_observer_method_for_ondevice_ptq(model._c,
+                                                                         method_name,
+                                                                         scripted_qconfig_dict,
+                                                                         inplace,
+                                                                         quant_type)
+    if inplace:
+        model._reconstruct(model_c)
+    else:
+        model = wrap_cpp_module(model_c)
+    return model
+
+def prepare_jit(model, qconfig_dict, inplace=False):
+    torch._C._log_api_usage_once("quantization_api.quantize_jit.prepare_jit")
+    return _prepare_jit(model, qconfig_dict, inplace, quant_type=QuantType.STATIC)
+
+def prepare_dynamic_jit(model, qconfig_dict, inplace=False):
+    torch._C._log_api_usage_once("quantization_api.quantize_jit.prepare_dynamic_jit")
+    return _prepare_jit(model, qconfig_dict, inplace, quant_type=QuantType.DYNAMIC)
+
+
+def _prepare_ondevice_dynamic_jit(model, qconfig_dict, method_name='forward', inplace=False):
+    return _prepare_ondevice_jit(model, qconfig_dict, method_name, inplace, quant_type=QuantType.DYNAMIC)
+
+def _convert_jit(model, inplace=False, debug=False, quant_type=QuantType.STATIC,
+                 preserved_attrs=None):
+    _check_is_script_module(model)
+    model.eval()
+    model_c = model._c
+    model_c = torch._C._jit_pass_insert_quant_dequant(model_c, 'forward', inplace, debug, quant_type)
+    if not debug:
+        is_xpu = all(p.device.type == 'xpu' for p in model.parameters())
+        if not is_xpu:
+            # Moving model parameters to CPU since quantized operators
+            # are only supported on CPU and XPU right now
+            model.cpu()
+        if preserved_attrs is None:
+            preserved_attrs = []
+        model_c = torch._C._jit_pass_quant_finalize(model_c, quant_type, preserved_attrs)
+    if inplace:
+        model._reconstruct(model_c)
+    else:
+        model = wrap_cpp_module(model_c)
+    torch._C._jit_pass_constant_propagation(model.graph)
+    torch._C._jit_pass_dce(model.graph)
+    return model
+
+
+def _convert_ondevice_jit(model, method_name, inplace=False, debug=False, quant_type=QuantType.STATIC):
+    _check_is_script_module(model)
+    assert quant_type == QuantType.DYNAMIC, "This API, while it should work for static quant, is only tested for dynamic quant."
+    assert not method_name.startswith("observe_"), "Pass in valid method to be quantized, e.g. forward"
+    observe_method_name = "observe_" + method_name
+    quantize_method_name = "quantize_" + method_name
+    model_c = model._c
+    model_c = torch._C._jit_pass_insert_quant_dequant_for_ondevice_ptq(
+        model._c, observe_method_name, inplace, debug, QuantType.DYNAMIC)
+    model_c = torch._C._jit_pass_quant_finalize_for_ondevice_ptq(model_c, QuantType.DYNAMIC, quantize_method_name)
+    if inplace:
+        model._reconstruct(model_c)
+    else:
+        model = wrap_cpp_module(model_c)
+    return model
+
+def convert_jit(model, inplace=False, debug=False, preserved_attrs=None):
+    torch._C._log_api_usage_once("quantization_api.quantize_jit.convert_jit")
+    return _convert_jit(model, inplace, debug, quant_type=QuantType.STATIC, preserved_attrs=preserved_attrs)
+
+def convert_dynamic_jit(model, inplace=False, debug=False, preserved_attrs=None):
+    torch._C._log_api_usage_once("quantization_api.quantize_jit.convert_dynamic_jit")
+    return _convert_jit(model, inplace, debug, quant_type=QuantType.DYNAMIC, preserved_attrs=preserved_attrs)
+
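+# The prepare_*/convert_* pairs above can also be driven manually instead of going
+# through quantize_jit/quantize_dynamic_jit (illustrative sketch; `float_model` and
+# `data_loader` are hypothetical placeholders):
+#
+#   import torch
+#   from torch.ao.quantization import get_default_qconfig
+#
+#   ts_model = torch.jit.script(float_model.eval())
+#   prepared = prepare_jit(ts_model, {'': get_default_qconfig('fbgemm')})
+#   with torch.no_grad():
+#       for image, _ in data_loader:
+#           prepared(image)                    # calibration
+#   quantized = convert_jit(prepared)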
+
+def _convert_ondevice_dynamic_jit(model, method_name, inplace=False, debug=False):
+    return _convert_ondevice_jit(model, method_name, inplace, debug, quant_type=QuantType.DYNAMIC)
+
+
+def _quantize_ondevice_dynamic_jit_impl(model, qconfig_dict, method_name, inplace=False):
+    model = _prepare_ondevice_dynamic_jit(model, qconfig_dict, method_name, inplace)
+    model = _convert_ondevice_dynamic_jit(model, method_name, inplace)
+    return model
+
+def _quantize_jit(model, qconfig_dict, run_fn=None, run_args=None, inplace=False, debug=False, quant_type=QuantType.STATIC):
+    # Always do inplace convert because the Tensor is already
+    # copied in prepare_jit when inplace is False
+    if quant_type == QuantType.DYNAMIC:
+        model = prepare_dynamic_jit(model, qconfig_dict, inplace)
+        model = convert_dynamic_jit(model, True, debug)
+    else:
+        assert run_fn, "Must provide calibration function for post training static quantization"
+        assert run_args, "Must provide calibration dataset for post training static quantization"
+        model = prepare_jit(model, qconfig_dict, inplace)
+        run_fn(model, *run_args)
+        model = convert_jit(model, True, debug)
+
+    torch._C._jit_pass_constant_propagation(model.graph)
+    torch._C._jit_pass_dce(model.graph)
+    return model
+
+def quantize_jit(model, qconfig_dict, run_fn, run_args, inplace=False, debug=False):
+    r"""Quantize the input float TorchScript model with
+    post training static quantization.
+
+    First it will prepare the model for calibration, then it calls
+    `run_fn` which will run the calibration step, after that we will
+    convert the model to a quantized model.
+
+    Args:
+        `model`: input float TorchScript model
+        `qconfig_dict`: a dictionary with names of submodules as keys and the
+        qconfig for each module as values. An empty key means the qconfig will be applied
+        to the whole model unless it is overwritten by more specific configurations; the
+        qconfig for each module is either found in the dictionary or falls back to
+        the qconfig of the parent module.
+
+        Right now qconfig_dict is the only way to configure how the model is quantized,
+        and it is done at module granularity, that is, we only support one type
+        of qconfig for each torch.nn.Module, and the qconfig for a submodule will
+        override the qconfig for the parent module; an empty string means global configuration.
+        `run_fn`: a calibration function for calibrating the prepared model
+        `run_args`: positional arguments for `run_fn`
+        `inplace`: carry out model transformations in-place, the original module is
+        mutated
+        `debug`: flag for producing a debug friendly model (preserve weight attribute)
+
+    Return:
+        Quantized TorchScript model.
+
+    Example:
+    ```python
+    import torch
+    from torch.ao.quantization import get_default_qconfig
+    from torch.ao.quantization import quantize_jit
+
+    ts_model = torch.jit.script(float_model.eval())  # or torch.jit.trace(float_model, input)
+    qconfig = get_default_qconfig('fbgemm')
+    def calibrate(model, data_loader):
+        model.eval()
+        with torch.no_grad():
+            for image, target in data_loader:
+                model(image)
+
+    quantized_model = quantize_jit(
+        ts_model,
+        {'': qconfig},
+        calibrate,
+        [data_loader_test])
+    ```
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize_jit.quantize_jit")
+    return _quantize_jit(model, qconfig_dict, run_fn, run_args, inplace, debug, quant_type=QuantType.STATIC)
+
+def quantize_dynamic_jit(model, qconfig_dict, inplace=False, debug=False):
+    r"""Quantize the input float TorchScript model with
+    post training dynamic quantization.
+    Currently only qint8 quantization of torch.nn.Linear is supported.
+
+    Args:
+        `model`: input float TorchScript model
+        `qconfig_dict`: a dictionary with names of submodules as keys and the
+        qconfig for each module as values, please see detailed
+        descriptions in :func:`~torch.ao.quantization.quantize_jit`
+        `inplace`: carry out model transformations in-place, the original module is
+        mutated
+        `debug`: flag for producing a debug friendly model (preserve weight attribute)
+
+    Return:
+        Quantized TorchScript model.
+
+    Example:
+    ```python
+    import torch
+    from torch.ao.quantization import per_channel_dynamic_qconfig
+    from torch.ao.quantization import quantize_dynamic_jit
+
+    ts_model = torch.jit.script(float_model.eval())  # or torch.jit.trace(float_model, input)
+    qconfig = per_channel_dynamic_qconfig
+
+    quantized_model = quantize_dynamic_jit(
+        ts_model,
+        {'': qconfig})
+    ```
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize_jit.quantize_dynamic_jit")
+    return _quantize_jit(model, qconfig_dict, inplace=inplace, debug=debug, quant_type=QuantType.DYNAMIC)
+
+
+def _quantize_ondevice_dynamic_jit(model, qconfig_dict, method_name='forward', inplace=False):
+    r"""Prepares the input float TorchScript model with
+    *on-device* post training dynamic quantization.
+    Currently only qint8 quantization of torch.nn.Linear is supported.
+
+    Args:
+        `model`: input float TorchScript model
+        `qconfig_dict`: a dictionary with names of submodules as keys and the
+        qconfig for each module as values, please see detailed
+        descriptions in :func:`~torch.ao.quantization.quantize_jit`
+        `method_name`: name of the method within the model to be prepared for quantization
+        `inplace`: carry out model transformations in-place, the original module is
+        mutated
+
+    Return:
+        TorchScript model that is ready for on-device quantization.
+        This means that the returned model has:
+        - The method inlined.
+        - Observer modules inserted in the model.
+        - Packed params inserted in the model. However, they are empty, i.e. they don't
+          contain valid quantized weights.
+        - An observe_ method added that observes the values to be quantized.
+        - A reset_observers_ method added to reset observers.
+        - A quantize_ method added to the model.
+          - This method extracts scales and zero points.
+          - Quantizes observed weights.
+          - Creates packed params from them and updates the attributes of the model with the
+            new values for the packed params.
+          - Resets the original fp32 weights with an empty tensor using SetAttr.
+        - A quantized_ method added to the model.
+          - This method uses quantized weights and quantized linear ops instead of fp32 ops.
+          - This method should be used for inference post PTQ.
+        - Note that all methods' signatures should be the same as method_name.
+
+        Later on device:
+        - Run reset_observers_
+        - Run observe_
+        - Run quantize_
+        - Now model can be saved and loaded later.
+        - Run model with quantized_
+
+    Example:
+    ```python
+    import torch
+    from torch.ao.quantization import per_channel_dynamic_qconfig
+    from torch.ao.quantization.quantize_jit import _quantize_ondevice_dynamic_jit
+
+    ts_model = torch.jit.script(float_model.eval())  # or torch.jit.trace(float_model, input)
+    qconfig = per_channel_dynamic_qconfig
+    quant_ready_model = _quantize_ondevice_dynamic_jit(
+        ts_model,
+        {'': qconfig},
+        'forward',
+        True)
+    ```
+    """
+    return _quantize_ondevice_dynamic_jit_impl(model, qconfig_dict, method_name, inplace=inplace)
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/quantize_pt2e.py b/MLPY/Lib/site-packages/torch/ao/quantization/quantize_pt2e.py
new file mode 100644
index 0000000000000000000000000000000000000000..33267f8d81a9757aff3852d2444c9c8af6fed42a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/quantize_pt2e.py
@@ -0,0 +1,250 @@
+import torch
+from torch.fx import GraphModule
+from torch.fx import Node
+
+from .pt2e.prepare import prepare
+from .pt2e.qat_utils import (
+    _fuse_conv_bn_qat,
+    _fold_conv_bn_qat,
+)
+from .pt2e.utils import (
+    _get_node_name_to_scope,
+    _fuse_conv_bn_,
+    _disallow_eval_train,
+)
+from .pt2e.representation import reference_representation_rewrite
+from .quantize_fx import _convert_to_reference_decomposed_fx
+from torch.ao.quantization.quantizer import (  # noqa: F401
+    Quantizer,
+    QuantizationSpecBase,
+    QuantizationSpec,
+    FixedQParamsQuantizationSpec,
+    SharedQuantizationSpec,
+    DerivedQuantizationSpec,
+    QuantizationAnnotation,
+)
+from torch.fx.passes.infra.pass_manager import PassManager
+from torch.ao.quantization.pt2e.duplicate_dq_pass import DuplicateDQPass
+from torch.ao.quantization.pt2e.port_metadata_pass import PortNodeMetaForQDQ
+from torch._inductor.constant_folding import constant_fold
+
+__all__ = [
+    "prepare_pt2e",
+    "prepare_qat_pt2e",
+    "convert_pt2e",
+]
+
+
+def prepare_pt2e(
+    model: GraphModule,
+    quantizer: Quantizer,
+) -> GraphModule:
+    """Prepare a model for post training quantization
+
+    Args:
+      * `model` (torch.fx.GraphModule): a model captured by `torch.export` API
+        in the short term we are using `torch._export.capture_pre_autograd_graph`,
+        in the long term we'll migrate to some `torch.export` API
+      * `quantizer`: A backend specific quantizer that conveys how user want the
+        model to be quantized. Tutorial for how to write a quantizer can be found here:
+        https://pytorch.org/tutorials/prototype/pt2e_quantizer.html
+
+    Return:
+      A GraphModule with observer (based on quantizer annotation), ready for calibration
+
+    Example::
+
+        import torch
+        from torch.ao.quantization.quantize_pt2e import prepare_pt2e
+        from torch._export import capture_pre_autograd_graph
+        from torch.ao.quantization.quantizer.xnnpack_quantizer import (
+            XNNPACKQuantizer,
+            get_symmetric_quantization_config,
+        )
+
+        class M(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(5, 10)
+
+            def forward(self, x):
+                return self.linear(x)
+
+        # initialize a floating point model
+        float_model = M().eval()
+
+        # define calibration function
+        def calibrate(model, data_loader):
+            model.eval()
+            with torch.no_grad():
+                for image, target in data_loader:
+                    model(image)
+
+        # Step 1. program capture
+        # NOTE: this API will be updated to torch.export API in the future, but the captured
+        # result should mostly stay the same
+        example_inputs = (torch.randn(1, 5),)
+        m = capture_pre_autograd_graph(float_model, example_inputs)
+        # we get a model with aten ops
+
+        # Step 2. quantization
+        # backend developers will write their own Quantizer and expose methods to
+        # allow users to express how they want the model to be quantized
+        quantizer = XNNPACKQuantizer().set_global(get_symmetric_quantization_config())
+        m = prepare_pt2e(m, quantizer)
+
+        # run calibration
+        # calibrate(m, sample_inference_data)
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize_pt2e.prepare_pt2e")
+    original_graph_meta = model.meta
+    node_name_to_scope = _get_node_name_to_scope(model)
+    # TODO: check qconfig_mapping to make sure conv and bn are both configured
+    # to be quantized before fusion
+    # TODO: (maybe) rewrite this with subgraph_rewriter
+    _fuse_conv_bn_(model)
+    quantizer.transform_for_annotation(model)
+    quantizer.annotate(model)
+    quantizer.validate(model)
+    model = prepare(model, node_name_to_scope, is_qat=False)
+    model.meta.update(original_graph_meta)
+    model = _disallow_eval_train(model)
+    return model
+
+def prepare_qat_pt2e(
+    model: GraphModule,
+    quantizer: Quantizer,
+) -> GraphModule:
+    """Prepare a model for quantization aware training
+
+    Args:
+      * `model` (torch.fx.GraphModule): see :func:`~torch.ao.quantization.quantize_pt2e.prepare_pt2e`
+      * `quantizer`: see :func:`~torch.ao.quantization.quantize_pt2e.prepare_pt2e`
+
+    Return:
+      A GraphModule with fake quant modules (based on the quantizer annotation), ready for
+      quantization aware training
+
+    Example::
+
+        import torch
+        from torch.ao.quantization.quantize_pt2e import prepare_qat_pt2e
+        from torch._export import capture_pre_autograd_graph
+        from torch.ao.quantization.quantizer.xnnpack_quantizer import (
+            XNNPACKQuantizer,
+            get_symmetric_quantization_config,
+        )
+
+        class M(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(5, 10)
+
+            def forward(self, x):
+                return self.linear(x)
+
+        # initialize a floating point model
+        float_model = M().eval()
+
+        # define the training loop for quantization aware training
+        def train_loop(model, train_data):
+            model.train()
+            for image, target in train_data:
+                ...
+
+        # Step 1. program capture
+        # NOTE: this API will be updated to the torch.export API in the future, but the captured
+        # result should mostly stay the same
+        example_inputs = (torch.randn(1, 5),)
+        m = capture_pre_autograd_graph(float_model, *example_inputs)
+        # we get a model with aten ops
+
+        # Step 2. quantization
+        # backend developers will write their own Quantizer and expose methods to
+        # allow users to express how they want the model to be quantized
+        quantizer = XNNPACKQuantizer().set_global(get_symmetric_quantization_config())
+        m = prepare_qat_pt2e(m, quantizer)
+
+        # run quantization aware training
+        # train_loop(m, train_data)
+
+    """
+    torch._C._log_api_usage_once("quantization_api.quantize_pt2e.prepare_qat_pt2e")
+    original_graph_meta = model.meta
+    node_name_to_scope = _get_node_name_to_scope(model)
+    quantizer.transform_for_annotation(model)
+    quantizer.annotate(model)
+    quantizer.validate(model)
+    # Perform fusion after annotate to avoid quantizing ops in the new
+    # subgraph that don't need to be quantized
+    # TODO: only fuse if conv and bn are both configured to be quantized
+    _fuse_conv_bn_qat(model)
+    model = prepare(model, node_name_to_scope, is_qat=True)
+    model.meta.update(original_graph_meta)
+    model = _disallow_eval_train(model)
+    return model
+
+_QUANT_OPS = [
+    torch.ops.quantized_decomposed.quantize_per_tensor.default,
+    torch.ops.quantized_decomposed.quantize_per_tensor.tensor,
+    torch.ops.quantized_decomposed.quantize_per_channel.default,
+]
+def _quant_node_constraint(n: Node) -> bool:
+    """If there is any pure ops between get_attr and quantize op they will be const propagated
+    e.g. get_attr(weight) -> transpose -> quantize -> dequantize*
+    (Note: dequantize op is not going to be constant propagated)
+
+    This filter is added because we don't want to constant fold the things that are not
+    related to quantization
+    """
+    return n.op == "call_function" and n.target in _QUANT_OPS
+
+def convert_pt2e(
+    model: GraphModule,
+    use_reference_representation: bool = False,
+    fold_quantize: bool = True,
+) -> GraphModule:
+    """Convert a calibrated/trained model to a quantized model
+
+    Args:
+      * `model` (torch.fx.GraphModule): calibrated/trained model
+      * `use_reference_representation` (bool): whether to produce the reference representation or not
+      * `fold_quantize` (bool): whether to fold (constant-propagate) the quantize op or not
+
+    Returns:
+        quantized model, either in q/dq representation or reference representation
+
+    Example::
+
+        # prepared_model: the model produced by `prepare_pt2e`/`prepare_qat_pt2e` and calibration/training
+        # `convert_pt2e` produces a quantized model that represents quantized computation with
+        # quantize/dequantize ops and fp32 ops by default.
+        # Please refer to
+        # https://pytorch.org/tutorials/prototype/pt2e_quant_ptq_static.html#convert-the-calibrated-model-to-a-quantized-model
+        # for detailed explanation of output quantized model
+        quantized_model = convert_pt2e(prepared_model)
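+
+        # Illustrative sketch: the same call with both documented flags flipped,
+        # producing the reference representation and keeping the quantize ops unfolded
+        quantized_model_ref = convert_pt2e(
+            prepared_model,
+            use_reference_representation=True,
+            fold_quantize=False,
+        )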
+
+    """  # flake8: noqa
+    torch._C._log_api_usage_once("quantization_api.quantize_pt2e.convert_pt2e")
+    if not isinstance(use_reference_representation, bool):
+        raise ValueError(
+            "Unexpected argument type for `use_reference_representation`, "
+            f"please make sure you intend to pass argument {use_reference_representation} to convert_pt2e")
+    original_graph_meta = model.meta
+    model = _convert_to_reference_decomposed_fx(model)
+    model = _fold_conv_bn_qat(model)
+
+    pm = PassManager([DuplicateDQPass()])
+    model = pm(model).graph_module
+
+    pm = PassManager([PortNodeMetaForQDQ()])
+    model = pm(model).graph_module
+
+    if fold_quantize:
+        constant_fold(model, _quant_node_constraint)
+
+    if use_reference_representation:
+        model = reference_representation_rewrite(model)
+
+    model.meta.update(original_graph_meta)
+    model = _disallow_eval_train(model)
+    return model
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/__init__.py b/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..232f79dd3591caa2b70e15626031658c00cdff0a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/__init__.py
@@ -0,0 +1,21 @@
+from .quantizer import (
+    DerivedQuantizationSpec,
+    EdgeOrNode,
+    FixedQParamsQuantizationSpec,
+    QuantizationAnnotation,
+    QuantizationSpec,
+    QuantizationSpecBase,
+    Quantizer,
+    SharedQuantizationSpec,
+)
+
+__all__ = [
+    "EdgeOrNode",
+    "Quantizer",
+    "QuantizationSpecBase",
+    "QuantizationSpec",
+    "FixedQParamsQuantizationSpec",
+    "SharedQuantizationSpec",
+    "DerivedQuantizationSpec",
+    "QuantizationAnnotation",
+]
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7c683af5ac2a237cbf9f1aede59a716d8b5a02ea
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/__pycache__/composable_quantizer.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/__pycache__/composable_quantizer.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f4c0b4211813bb2d2830870e0b4b08acd676f75b
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/__pycache__/composable_quantizer.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/__pycache__/embedding_quantizer.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/__pycache__/embedding_quantizer.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6cfe70be517f579f5d5338872120fac86f06a3b7
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/__pycache__/embedding_quantizer.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/__pycache__/quantizer.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/__pycache__/quantizer.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..af219fd828a423715a9cf3c2adcf28043e2f8046
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/__pycache__/quantizer.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7c8dec6ba3c86f62e73e661e5dac06cefe9b3a65
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/__pycache__/utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/__pycache__/x86_inductor_quantizer.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/__pycache__/x86_inductor_quantizer.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..086552d922ce599503a26f50ae01fff650dbebcc
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/__pycache__/x86_inductor_quantizer.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/__pycache__/xnnpack_quantizer.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/__pycache__/xnnpack_quantizer.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d92d1b13b20525e1468c2e957dd3b44c78629bf3
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/__pycache__/xnnpack_quantizer.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/__pycache__/xnnpack_quantizer_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/__pycache__/xnnpack_quantizer_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f0186d8b8bc58fd47b2f065ed16faf8efc175eb5
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/__pycache__/xnnpack_quantizer_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/composable_quantizer.py b/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/composable_quantizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..34289a1bba9d1521e4decd6f3b79ca1f43a19c6a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/composable_quantizer.py
@@ -0,0 +1,78 @@
+from __future__ import annotations
+
+from typing import Dict, List
+
+import torch
+
+from torch.fx import Node
+
+from .quantizer import QuantizationAnnotation, Quantizer
+
+__all__ = [
+    "ComposableQuantizer",
+]
+
+
+class ComposableQuantizer(Quantizer):
+    """
+    ComposableQuantizer allows users to combine more than one quantizer into a single quantizer.
+    This allows users to quantize a model with multiple quantizers. E.g., embedding quantization
+    may be supported by one quantizer while linear layers and other ops might be supported by another
+    quantizer.
+
+    ComposableQuantizer is initialized with a list of `Quantizer` instances.
+    The order of the composition matters since that is the order in which the quantizers will be
+    applied.
+    Example:
+    ```
+    embedding_quantizer = EmbeddingQuantizer()
+    linear_quantizer = MyLinearQuantizer()
+    xnnpack_quantizer = XNNPACKQuantizer()  # to handle ops not quantized by the previous two quantizers
+    composed_quantizer = ComposableQuantizer([embedding_quantizer, linear_quantizer, xnnpack_quantizer])
+    prepared_m = prepare_pt2e(model, composed_quantizer)
+    ```
+    """
+
+    def __init__(self, quantizers: List[Quantizer]):
+        super().__init__()
+        self.quantizers = quantizers
+        self._graph_annotations: Dict[Node, QuantizationAnnotation] = {}
+
+    def _record_and_validate_annotations(
+        self, gm: torch.fx.GraphModule, quantizer: Quantizer
+    ) -> None:
+        for n in gm.graph.nodes:
+            if "quantization_annotation" in n.meta:
+                # check if the annotation has been changed by
+                # comparing QuantizationAnnotation object id
+                if n in self._graph_annotations and (
+                    id(self._graph_annotations[n])
+                    != id(n.meta["quantization_annotation"])
+                ):
+                    raise RuntimeError(
+                        f"Quantizer {quantizer.__class__.__name__} has changed annotations on node {n}"
+                    )
+                else:
+                    self._graph_annotations[n] = n.meta["quantization_annotation"]
+            else:
+                if n in self._graph_annotations:
+                    raise RuntimeError(
+                        f"Quantizer {quantizer.__class__.__name__} has removed annotations on node {n}"
+                    )
+
+    def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
+        """just handling global spec for now"""
+        for quantizer in self.quantizers:
+            quantizer.annotate(model)
+            self._record_and_validate_annotations(model, quantizer)
+        return model
+
+    def transform_for_annotation(
+        self, model: torch.fx.GraphModule
+    ) -> torch.fx.GraphModule:
+        for quantizer in self.quantizers:
+            model = quantizer.transform_for_annotation(model)
+        return model
+
+    def validate(self, model: torch.fx.GraphModule) -> None:
+        pass
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/embedding_quantizer.py b/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/embedding_quantizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fcbc6115f129394d441b470c2890acd6605f8de
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/embedding_quantizer.py
@@ -0,0 +1,96 @@
+from __future__ import annotations
+
+import copy
+from typing import List, Set
+
+import torch
+import torch.nn.functional as F
+from torch.ao.quantization.observer import PerChannelMinMaxObserver
+from torch.ao.quantization.quantizer.quantizer import (
+    QuantizationAnnotation,
+    QuantizationSpec,
+    Quantizer,
+)
+from torch.ao.quantization.quantizer.xnnpack_quantizer_utils import (
+    OperatorConfig,
+    OperatorPatternType,
+    QuantizationConfig,
+)
+
+__all__ = [
+    "get_embedding_operators_config",
+    "EmbeddingQuantizer",
+]
+
+
+def get_embedding_operators_config() -> OperatorConfig:
+    weight_quantization_spec = QuantizationSpec(
+        dtype=torch.uint8,
+        qscheme=torch.per_channel_affine_float_qparams,
+        ch_axis=0,
+        observer_or_fake_quant_ctr=PerChannelMinMaxObserver.with_args(eps=2**-12),
+    )
+    quantization_config = QuantizationConfig(None, None, weight_quantization_spec, None)
+    ops: List[OperatorPatternType] = [[torch.nn.Embedding]]
+    ops.append([F.embedding])
+    supported_config_and_operators = OperatorConfig(
+        config=quantization_config, operators=ops
+    )
+    return copy.deepcopy(supported_config_and_operators)
+
+
+class EmbeddingQuantizer(Quantizer):
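+    """Quantizer that annotates the weight input of `aten.embedding` nodes with the
+    per-channel affine float-qparams spec from `get_embedding_operators_config`.
+
+    Example (illustrative sketch; `captured_model` is a hypothetical module captured
+    with `capture_pre_autograd_graph` as in `prepare_pt2e`)::
+
+        from torch.ao.quantization.quantize_pt2e import prepare_pt2e
+
+        quantizer = EmbeddingQuantizer()
+        prepared_m = prepare_pt2e(captured_model, quantizer)
+    """
+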
+    def __init__(self):
+        super().__init__()
+
+    @classmethod
+    def get_supported_quantization_configs(cls) -> List[QuantizationConfig]:
+        op_configs: Set[QuantizationConfig] = set({})
+        for spec, _ in cls.get_supported_operators():
+            op_configs.add(spec)
+        return list(op_configs)
+
+    @classmethod
+    def get_supported_operator_for_quantization_config(
+        cls, quantization_config: QuantizationConfig
+    ) -> List[OperatorPatternType]:
+        for config, ops in cls.get_supported_operators():
+            # note: this assumes each entry in cls.get_supported_operators()
+            # corresponds to one spec, e.g. we don't have
+            # [(spec1, op_list1), (spec1, op_list2), (spec2, op_list3)]
+            # where the first and second entry have the same spec but did not
+            # merge the op list
+            if config == quantization_config:
+                return ops
+        return []
+
+    def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
+        """just handling global spec for now"""
+        self._annotate_embedding_ops(model.graph)
+        return model
+
+    def _annotate_embedding_ops(self, graph: torch.fx.Graph) -> None:
+        embedding_config: OperatorConfig = get_embedding_operators_config()
+        for node in graph.nodes:
+            # Keep node parsing based annotations instead of module partitioners
+            # just as an example of alternate ways of annotating
+            if (
+                node.op == "call_function"
+                and node.target == torch.ops.aten.embedding.default
+            ):
+                if embedding_config.config.weight is None:
+                    raise ValueError(
+                        "Embedding config must have a valid weight quantization spec."
+                    )
+                node.meta["quantization_annotation"] = QuantizationAnnotation(
+                    input_qspec_map={
+                        node.args[0]: embedding_config.config.weight,
+                    }
+                )
+
+    def validate(self, model: torch.fx.GraphModule) -> None:
+        pass
+
+    @classmethod
+    def get_supported_operators(cls) -> List[OperatorConfig]:
+        return [get_embedding_operators_config()]
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/quantizer.py b/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/quantizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..1be2acc70b340519dd8971a66eceeffea89890ae
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/quantizer.py
@@ -0,0 +1,158 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+from torch import Tensor
+from torch.ao.quantization import ObserverOrFakeQuantize
+from torch.ao.quantization.qconfig import _ObserverOrFakeQuantizeConstructor
+from torch.fx import Node
+
+__all__ = [
+    "Quantizer",
+    "QuantizationSpecBase",
+    "QuantizationSpec",
+    "FixedQParamsQuantizationSpec",
+    "EdgeOrNode",
+    "SharedQuantizationSpec",
+    "DerivedQuantizationSpec",
+    "QuantizationAnnotation",
+]
+
+
+class QuantizationSpecBase(ABC):  # noqa: B024
+    """Base class for different types of quantization specs that allows users to
+    specify how to quantize a Tensor (input/output of a Node) in the model
+    """
+
+    pass
+
+
+@dataclass(eq=True, frozen=True)
+class QuantizationSpec(QuantizationSpecBase):
+    """Quantization spec for common operators that allows user to specify how to
+    quantize a Tensor, this includes dtype, quant_min, quant_max etc.
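+
+    Example (illustrative sketch; assumes the default MinMaxObserver is acceptable
+    for the target backend)::
+
+        from torch.ao.quantization.observer import MinMaxObserver
+
+        act_qspec = QuantizationSpec(
+            dtype=torch.int8,
+            quant_min=-128,
+            quant_max=127,
+            qscheme=torch.per_tensor_symmetric,
+            observer_or_fake_quant_ctr=MinMaxObserver.with_args(eps=2**-12),
+        )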
+    """
+
+    dtype: torch.dtype
+    # observer or fake_quantize constructor such as
+    # MinMaxObserver, PerChannelHistogramObserver etc.
+    # or we can attach some custom args to them
+    # e.g. MinMaxObserver.with_args(eps=eps)
+    observer_or_fake_quant_ctr: _ObserverOrFakeQuantizeConstructor
+    quant_min: Optional[int] = None
+    quant_max: Optional[int] = None
+    qscheme: Optional[torch.qscheme] = None
+    ch_axis: Optional[int] = None
+    is_dynamic: bool = False
+
+    def __post_init__(self):
+        # quant_min must be less than or equal to quant_max
+        if (
+            self.quant_min is not None
+            and self.quant_max is not None
+            and self.quant_min > self.quant_max
+        ):
+            raise ValueError(
+                f"quant_min {self.quant_min} must be <= quant_max {self.quant_max}."
+            )
+
+        # ch_axis must be less than the number of channels,
+        # but there is no way to check that here. Just check that it is not < 0.
+        if self.ch_axis is not None and self.ch_axis < 0:
+            raise ValueError("Ch_axis is < 0.")
+
+
+@dataclass(eq=True, frozen=True)
+class FixedQParamsQuantizationSpec(QuantizationSpecBase):
+    dtype: torch.dtype
+    scale: float
+    zero_point: int
+    quant_min: Optional[int] = None
+    quant_max: Optional[int] = None
+    qscheme: Optional[torch.qscheme] = None
+
+
+"""
+The way we refer to other points of quantization in the graph is either
+an input edge or an output value.
+An input edge is the connection between an input node and the node consuming the input, so it's a Tuple[Node, Node].
+An output value is an fx Node.
+"""
+EdgeOrNode = Union[Tuple[Node, Node], Node]
+EdgeOrNode.__module__ = "torch.ao.quantization.quantizer.quantizer"
+
+
+@dataclass(eq=True, frozen=True)
+class SharedQuantizationSpec(QuantizationSpecBase):
+    """
+    Quantization spec for the Tensors whose quantization parameters are shared with other Tensors
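+
+    Example (illustrative sketch; `conv_node` and `add_node` are hypothetical fx
+    nodes in the graph being annotated)::
+
+        # share the observer with the (conv_node, add_node) input edge
+        shared_with_edge = SharedQuantizationSpec((conv_node, add_node))
+        # or share it with the output of conv_node
+        shared_with_output = SharedQuantizationSpec(conv_node)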
+    """
+
+    # the edge or node to share observer or fake quant instances with
+    edge_or_node: EdgeOrNode
+
+
+@dataclass(eq=True, frozen=True)
+class DerivedQuantizationSpec(QuantizationSpecBase):
+    """Quantization spec for the Tensors whose quantization parameters are derived from other Tensors"""
+
+    derived_from: List[EdgeOrNode]
+    derive_qparams_fn: Callable[[List[ObserverOrFakeQuantize]], Tuple[Tensor, Tensor]]
+    dtype: torch.dtype
+    quant_min: Optional[int] = None
+    quant_max: Optional[int] = None
+    qscheme: Optional[torch.qscheme] = None
+    ch_axis: Optional[int] = None
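+
+    # Illustrative sketch: deriving a bias qspec for a conv node from the activation
+    # and weight observers; `input_act_node`, `weight_node` and `conv_node` are
+    # hypothetical fx nodes in the annotated graph.
+    #
+    #   def _derive_bias_qparams(obs_or_fqs):
+    #       act_obs, weight_obs = obs_or_fqs
+    #       act_scale, _ = act_obs.calculate_qparams()
+    #       weight_scale, _ = weight_obs.calculate_qparams()
+    #       return act_scale * weight_scale, torch.zeros_like(weight_scale, dtype=torch.int32)
+    #
+    #   bias_qspec = DerivedQuantizationSpec(
+    #       derived_from=[(input_act_node, conv_node), (weight_node, conv_node)],
+    #       derive_qparams_fn=_derive_bias_qparams,
+    #       dtype=torch.int32,
+    #       quant_min=-(2**31),
+    #       quant_max=2**31 - 1,
+    #       qscheme=torch.per_tensor_symmetric,
+    #   )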
+
+
+@dataclass
+class QuantizationAnnotation:
+    """How are input arguemnt or output should be quantized,
+    expressed as QuantizationSpec, this corresponds to how a Tensor in the
+    operator Graph is observed (PTQ) or fake quantized (QAT)
+    """
+
+    # a map from torch.fx.Node to a type of QuantizationSpecBase
+    input_qspec_map: Dict[Node, Optional[QuantizationSpecBase]] = field(
+        default_factory=dict
+    )
+
+    # How the output of this node is quantized, expressed as QuantizationSpec
+    # TODO: change the value to QuantizationSpec in a separate PR
+    output_qspec: Optional[QuantizationSpecBase] = None
+
+    # For a Node node1 and an edge (node1, node2): since they are observing the same
+    # Tensor, we may want to implicitly share observers; this flag allows people to
+    # turn off this behavior for the output of the node
+    allow_implicit_sharing: bool = True
+
+    # whether the node is annotated or not
+    _annotated: bool = False
+
+
+class Quantizer(ABC):
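+    """Base class for quantizers.
+
+    A quantizer annotates nodes of a captured `torch.fx.GraphModule` with
+    `QuantizationAnnotation`s; `prepare_pt2e`/`prepare_qat_pt2e` then insert
+    observers or fake-quantize modules based on those annotations.
+
+    Example (illustrative sketch; `act_qspec` is a hypothetical `QuantizationSpec`
+    for the output activation)::
+
+        class MyLinearQuantizer(Quantizer):
+            def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
+                for n in model.graph.nodes:
+                    if n.op == "call_function" and n.target == torch.ops.aten.linear.default:
+                        n.meta["quantization_annotation"] = QuantizationAnnotation(
+                            output_qspec=act_qspec, _annotated=True
+                        )
+                return model
+
+            def validate(self, model: torch.fx.GraphModule) -> None:
+                pass
+    """
+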
+    def transform_for_annotation(
+        self, model: torch.fx.GraphModule
+    ) -> torch.fx.GraphModule:
+        """Allows for user defined transforms to run before annotating the graph.
+        This allows quantizer to allow quantizing part of the model that are otherwise not quantizable.
+        For example quantizer can
+        a) decompose a compound operator like scaled dot product attention,
+        into bmm and softmax if quantizer knows how to quantize bmm/softmax but not sdpa
+        or b) transform scalars to tensor to allow quantizing scalares.
+
+        Note: this is an optional method
+        """
+        return model
+
+    # annotate nodes in the graph with observer or fake quant constructors
+    # to convey the desired way of quantization
+    @abstractmethod
+    def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
+        pass
+
+    # validate the annotated graph is supported by the backend
+    @abstractmethod
+    def validate(self, model: torch.fx.GraphModule) -> None:
+        pass
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/utils.py b/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2553bb8faa43875e9c4673dbad157d1b5af313e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/utils.py
@@ -0,0 +1,49 @@
+from typing import List
+
+from torch.ao.quantization.pt2e.utils import _is_sym_size_node
+
+from torch.ao.quantization.quantizer.quantizer import QuantizationAnnotation
+from torch.fx import Node
+
+
+def _annotate_input_qspec_map(node: Node, input_node: Node, qspec):
+    quantization_annotation = node.meta.get(
+        "quantization_annotation", QuantizationAnnotation()
+    )
+    if quantization_annotation.input_qspec_map is None:
+        quantization_annotation.input_qspec_map = {}
+    quantization_annotation.input_qspec_map[input_node] = qspec
+    node.meta["quantization_annotation"] = quantization_annotation
+
+
+def _annotate_output_qspec(node: Node, qspec):
+    quantization_annotation = node.meta.get(
+        "quantization_annotation", QuantizationAnnotation()
+    )
+    quantization_annotation.output_qspec = qspec
+    node.meta["quantization_annotation"] = quantization_annotation
+
+
+def _node_only_used_for_sym_size(node: Node, partition_nodes: List[Node]):
+    """
+    This utility is used to handle cases where dynamic_shape=True tracing leads
+    to symint nodes in the pattern of a linear module. In those cases, we need to
+    distinguish between the nodes that are inputs only for extracting the value of
+    some dimensions (and symint nodes) vs. the one that is the activation.
+    For example:
+    graph(x, y, weight):
+       size_0 = torch.ops.aten.sym_size([x], [0])
+       size_1 = torch.ops.aten.sym_size([y], [1])
+       view_size = size_0 * size_1
+       size_3 = torch.ops.aten.sym_size([x], [2])
+       view_out = torch.ops.aten.view(x, [view_size, size_3])
+       return mm(view_out, weight)
+    In the example above, the y node is not an actual input. It exists only to extract size_1.
+    """
+    if _is_sym_size_node(node):
+        return True
+
+    return all(
+        ((user not in partition_nodes) or _is_sym_size_node(user))
+        for user in node.users
+    )
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/x86_inductor_quantizer.py b/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/x86_inductor_quantizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..620032293480d3b8d938f78fec4fd187c9ca6533
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/x86_inductor_quantizer.py
@@ -0,0 +1,1016 @@
+import copy
+import functools
+import itertools
+import operator
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Sequence, Set, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch.ao.quantization.fake_quantize import (
+    FakeQuantize,
+    FusedMovingAvgObsFakeQuantize,
+)
+from torch.ao.quantization.observer import (
+    HistogramObserver,
+    MovingAverageMinMaxObserver,
+    MovingAveragePerChannelMinMaxObserver,
+    PerChannelMinMaxObserver,
+    PlaceholderObserver,
+)
+from torch.ao.quantization.pt2e.graph_utils import find_sequential_partitions
+from torch.ao.quantization.qconfig import _ObserverOrFakeQuantizeConstructor
+from torch.ao.quantization.quantizer.quantizer import (
+    QuantizationAnnotation,
+    QuantizationSpec,
+    Quantizer,
+    SharedQuantizationSpec,
+)
+from torch.ao.quantization.quantizer.xnnpack_quantizer_utils import (
+    _is_annotated,
+    get_bias_qspec,
+    get_input_act_qspec,
+    get_output_act_qspec,
+    get_weight_qspec,
+    OperatorConfig,
+    OperatorPatternType,
+    QuantizationConfig,
+)
+from torch.fx import Node
+from torch.fx.passes.utils.source_matcher_utils import (
+    get_source_partitions,
+    SourcePartition,
+)
+
+__all__ = [
+    "X86InductorQuantizer",
+    "get_default_x86_inductor_quantization_config",
+]
+
+
+@dataclass
+class _X86InductorQuantizationAnnotation(QuantizationAnnotation):
+    # _is_output_of_quantized_pattern:
+    #  * Node as output node of a fusion pattern.
+    #  * The fusion pattern supports int8 data type.
+    #  * The fusion pattern has inputs annotated to insert observer.
+    _is_output_of_quantized_pattern: bool = False
+
+
+# Operations that:
+# 1. are optimized to run with int8 when an int8 input is provided, and
+# 2. otherwise (without an int8 input) run in fp32 and produce fp32 output.
+int8_in_int8_out_ops_pt2e: Set = {
+    torch.ops.aten.max_pool2d.default,
+    torch.ops.aten.cat.default,
+    torch.ops.aten.avg_pool2d.default,
+    torch.ops.aten.adaptive_avg_pool2d.default,
+    torch.ops.aten.flatten.using_ints,
+}
+
+
+# Operations that support the int8 data type, excluding operations such as conv and linear.
+# A superset of int8_in_int8_out_ops_pt2e incorporating additional operators.
+quantizable_ops_pt2e = copy.deepcopy(int8_in_int8_out_ops_pt2e)
+
+QUANT_ANNOTATION_KEY = "quantization_annotation"
+
+
+def _mark_nodes_as_annotated(nodes: List[Node]):
+    for node in nodes:
+        if node is not None:
+            if QUANT_ANNOTATION_KEY not in node.meta:
+                node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation()
+            node.meta[QUANT_ANNOTATION_KEY]._annotated = True
+
+
+def _is_node_annotated(_node):
+    """
+    return True if the node is annotated, otherwise return False
+    """
+    return (
+        QUANT_ANNOTATION_KEY in _node.meta
+        and _node.meta[QUANT_ANNOTATION_KEY]._annotated
+    )
+
+
+def _is_any_annotated(nodes: List[Node]):
+    """
+    Given a list of nodes (that represents an operator pattern),
+    return True if any of the nodes is annotated, otherwise return False.
+    """
+    return any(_is_node_annotated(node) for node in nodes)
+
+
+def _is_all_annotated(nodes: List[Node]):
+    """
+    Given a list of nodes (that represents an operator pattern),
+    return True if all of the nodes are annotated, otherwise return False.
+    """
+    return all(_is_node_annotated(node) for node in nodes)
+
+
+def _is_quantized_op_pt2e(node: torch.fx.Node):
+    """
+    Used for pt2e flow to check if the node is a quantized node:
+    Case1: the node has been annotated as output node of a fusion pattern.
+    Case2: the node has been annotated as single quantized node.
+    """
+    if not _is_any_annotated([node]):
+        # The node has not been annotated, directly return False
+        return False
+    quantization_annotation = node.meta.get(QUANT_ANNOTATION_KEY, None)
+    assert isinstance(quantization_annotation, _X86InductorQuantizationAnnotation)
+    return quantization_annotation._is_output_of_quantized_pattern
+
+
+def _supported_quantized_operators() -> Dict[str, List[OperatorPatternType]]:
+    # TODO: Add more supported operators here.
+    supported_operators: Dict[str, List[OperatorPatternType]] = {
+        "conv2d": [
+            [torch.nn.Conv2d],
+            [F.conv2d],
+        ],
+    }
+
+    # Append Conv Optional(Add) Optional(ReLU)
+    conv_add_relu_options = itertools.product(
+        [torch.nn.Conv2d, F.conv2d],
+        [torch.add, operator.add, None],  # add
+        [torch.nn.ReLU, F.relu, None],  # relu
+    )
+    for conv_op, add_op, relu_op in conv_add_relu_options:
+        if add_op is None and relu_op is None:
+            # Bare conv is already covered by the base "conv2d" entry above
+            continue
+        if add_op is None:
+            # Append Conv ReLU
+            supported_operators["conv2d"].append([conv_op, relu_op])  # type: ignore[list-item]
+        elif relu_op is None:
+            # Append Conv Add
+            supported_operators["conv2d"].append([conv_op, add_op])  # type: ignore[list-item]
+        else:
+            # Append Conv Add ReLU
+            supported_operators["conv2d"].append([conv_op, add_op, relu_op])  # type: ignore[list-item]
+
+    return copy.deepcopy(supported_operators)
+
+
+def _get_supported_x86_inductor_config_and_operators() -> List[OperatorConfig]:
+    supported_config_and_operators: List[OperatorConfig] = []
+    for quantization_config in [
+        get_default_x86_inductor_quantization_config(),
+    ]:
+        ops = _supported_quantized_operators()
+        for pattern_list in ops.values():
+            supported_config_and_operators.append(
+                OperatorConfig(quantization_config, pattern_list)
+            )
+    return copy.deepcopy(supported_config_and_operators)
+
+
+@functools.lru_cache
+def get_default_x86_inductor_quantization_config(
+    is_qat: bool = False,
+    is_dynamic: bool = False,
+):
+    extra_args: Dict[str, Any] = {"eps": 2**-12}
+    if is_qat:
+        if is_dynamic:
+            act_observer_or_fake_quant_ctr = FakeQuantize
+            dynamic_quant_observer = MovingAverageMinMaxObserver.with_args(
+                averaging_constant=1
+            )
+            extra_args["observer"] = dynamic_quant_observer
+        else:
+            act_observer_or_fake_quant_ctr = FusedMovingAvgObsFakeQuantize  # type: ignore[assignment]
+    else:
+        if is_dynamic:
+            act_observer_or_fake_quant_ctr = PlaceholderObserver  # type: ignore[assignment]
+        else:
+            act_observer_or_fake_quant_ctr = HistogramObserver  # type: ignore[assignment]
+
+    # Copied from the x86 default qconfig in torch/ao/quantization/qconfig.py
+    act_quantization_spec = QuantizationSpec(
+        dtype=torch.uint8,
+        quant_min=0,
+        quant_max=255,  # reduce_range=False
+        qscheme=torch.per_tensor_affine,
+        is_dynamic=is_dynamic,
+        observer_or_fake_quant_ctr=act_observer_or_fake_quant_ctr.with_args(
+            **extra_args
+        ),
+    )
+
+    weight_observer_or_fake_quant_ctr: _ObserverOrFakeQuantizeConstructor = (
+        FusedMovingAvgObsFakeQuantize if is_qat else PerChannelMinMaxObserver
+    )
+
+    if is_qat:
+        # Only support per channel quant for now
+        extra_args["observer"] = MovingAveragePerChannelMinMaxObserver  # type: ignore[dict-item]
+    weight_quantization_spec = QuantizationSpec(
+        dtype=torch.int8,
+        quant_min=-128,
+        quant_max=127,
+        qscheme=torch.per_channel_symmetric,
+        ch_axis=0,  # 0 corresponding to weight shape = (oc, ic, kh, kw) of conv
+        is_dynamic=False,
+        observer_or_fake_quant_ctr=weight_observer_or_fake_quant_ctr.with_args(
+            **extra_args
+        ),
+    )
+    bias_quantization_spec = None  # will use placeholder observer by default
+    quantization_config = QuantizationConfig(
+        act_quantization_spec,
+        act_quantization_spec,
+        weight_quantization_spec,
+        bias_quantization_spec,
+        is_qat,
+    )
+    return quantization_config
+
+
+def _get_supported_config_and_operators() -> List[OperatorConfig]:
+    return _get_supported_x86_inductor_config_and_operators()
+
+
+class X86InductorQuantizer(Quantizer):
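+    """Quantizer implementing the quantization recipe for the X86 Inductor backend.
+
+    Example (illustrative sketch; `captured_model` is a hypothetical module captured
+    with `capture_pre_autograd_graph` as in `prepare_pt2e`)::
+
+        from torch.ao.quantization.quantize_pt2e import prepare_pt2e
+
+        quantizer = X86InductorQuantizer().set_global(
+            get_default_x86_inductor_quantization_config()
+        )
+        prepared_m = prepare_pt2e(captured_model, quantizer)
+    """
+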
+    supported_config_and_operators = _get_supported_config_and_operators()
+
+    def __init__(self):
+        super().__init__()
+        self.global_config: QuantizationConfig = None  # type: ignore[assignment]
+        self.operator_type_config: Dict[str, Optional[QuantizationConfig]] = {}
+
+    @classmethod
+    def get_supported_quantization_configs(cls) -> List[QuantizationConfig]:
+        op_configs: Set[QuantizationConfig] = set({})
+        for spec, _ in cls.supported_config_and_operators:
+            op_configs.add(spec)
+        return list(op_configs)
+
+    @classmethod
+    def get_supported_operator_for_quantization_config(
+        cls, quantization_config: Optional[QuantizationConfig]
+    ) -> List[OperatorPatternType]:
+        if quantization_config is None:
+            all_ops = []
+            for _, ops in cls.supported_config_and_operators:
+                all_ops.extend(ops)
+            return all_ops
+
+        for config, ops in cls.supported_config_and_operators:
+            if config == quantization_config:
+                return ops
+        return []
+
+    def set_global(self, quantization_config: QuantizationConfig):
+        self.global_config = quantization_config
+        return self
+
+    def set_config_for_operator_type(
+        self, operator_type: str, quantization_config: QuantizationConfig
+    ):
+        self.operator_type_config[operator_type] = quantization_config
+        return self
+
+    def _annotate_conv_node_helper(
+        self,
+        conv_node: torch.fx.Node,
+        annotate_output: bool,
+        quantization_config: QuantizationConfig,
+    ) -> None:
+        """Helper function to annotate the conv node"""
+        input_qspec_map = {}
+        input_node = conv_node.args[0]
+        assert isinstance(input_node, Node)
+        input_qspec_map[input_node] = get_input_act_qspec(quantization_config)
+        weight_node = conv_node.args[1]
+        assert isinstance(weight_node, Node)
+        input_qspec_map[weight_node] = get_weight_qspec(quantization_config)
+        bias_node = None if len(conv_node.args) == 2 else conv_node.args[2]
+        if isinstance(bias_node, Node):
+            input_qspec_map[bias_node] = get_bias_qspec(quantization_config)
+        if annotate_output:
+            conv_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation(
+                input_qspec_map=input_qspec_map,
+                _annotated=True,
+                _is_output_of_quantized_pattern=True,
+            )
+        else:
+            conv_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation(
+                input_qspec_map=input_qspec_map,
+                _annotated=True,
+            )
+
+    def _annotate_linear_node_helper(
+        self,
+        linear_node: torch.fx.Node,
+        annotate_output: bool,
+        quantization_config: QuantizationConfig,
+    ) -> None:
+        """Helper function to annotate the linear node"""
+        input_qspec_map = {}
+        assert linear_node.target in (torch.ops.aten.linear.default,)
+        has_bias = len(linear_node.args) == 3
+        input_index = 0
+        weight_index = 1
+        bias_index = 2
+
+        input_node = linear_node.args[input_index]
+        assert isinstance(input_node, Node)
+        input_qspec_map[input_node] = get_input_act_qspec(quantization_config)
+
+        weight_node = linear_node.args[weight_index]
+        assert isinstance(weight_node, Node)
+        input_qspec_map[weight_node] = get_weight_qspec(quantization_config)
+
+        bias_node = linear_node.args[bias_index] if has_bias else None
+        if isinstance(bias_node, Node):
+            input_qspec_map[bias_node] = get_bias_qspec(quantization_config)
+
+        if annotate_output:
+            linear_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation(
+                input_qspec_map=input_qspec_map,
+                _annotated=True,
+                _is_output_of_quantized_pattern=True,
+            )
+        else:
+            linear_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation(
+                input_qspec_map=input_qspec_map, _annotated=True
+            )
+
+    def _get_output_nodes_of_partitions(
+        self,
+        partition_list: List[SourcePartition],
+    ) -> List[torch.fx.Node]:
+        """Helper function to get the output node list from partition list"""
+        output_node_list = []
+        for partition in partition_list:
+            if len(partition.output_nodes) > 1:
+                raise ValueError("Input partition has more than one output node")
+            output_node = partition.output_nodes[0]
+            assert isinstance(output_node, Node)
+            output_node_list.append(output_node)
+        if len(output_node_list) != len(partition_list):
+            raise ValueError(
+                "length of output_node_list should equal to length of partition_list"
+            )
+        return output_node_list
+
+    def _get_input_idx_for_binary_node(
+        self,
+        conv_gemm_node: torch.fx.Node,
+        binary_node: torch.fx.Node,
+    ):
+        """Helper function to check conv_gemm and extra input node index
+        for binary node fused with conv_gemm.
+        """
+        conv_gemm_node_idx = None
+        extra_input_node_idx = None
+        if (binary_node.args[0].op == "call_function") and (  # type: ignore[union-attr]
+            binary_node.args[0] == conv_gemm_node
+        ):
+            conv_gemm_node_idx = 0
+            extra_input_node_idx = 1
+        elif (binary_node.args[1].op == "call_function") and (  # type: ignore[union-attr]
+            binary_node.args[1] == conv_gemm_node
+        ):
+            conv_gemm_node_idx = 1
+            extra_input_node_idx = 0
+        extra_input_node = binary_node.args[extra_input_node_idx]  # type: ignore[index]
+        assert isinstance(extra_input_node, Node)
+        return conv_gemm_node_idx, extra_input_node_idx
+
+    def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
+        """just handling global spec for now"""
+        if self.global_config and self.global_config.input_activation.is_dynamic:  # type: ignore[union-attr]
+            model = self._annotate_for_dynamic_quantization_config(model)
+        else:
+            model = self._annotate_for_static_quantization_config(model)
+        return model
+
+    def _annotate_for_static_quantization_config(
+        self, model: torch.fx.GraphModule
+    ) -> torch.fx.GraphModule:
+        r"""
+        High-level description of the quantization recipe for the X86 Inductor Backend:
+        Step 1: Apply the quantization recipe to conv/linear fusion patterns to actively enable the int8 data type.
+        Step 2: Propagate quantization annotations for patterns other than conv/linear. Go through the patterns in
+        the model from start to end. If a pattern supports computation with the int8 data type and its inputs are
+        connected to quantized patterns, annotate its inputs as a quantized pattern.
+        Step 3: In step 2 we only annotate the inputs of quantized patterns. For some quantized patterns,
+        such as maxpool2d, which only support int8 output when the input is int8,
+        we also need to annotate the output of the pattern.
+        """
+
+        config = self.global_config
+
+        # Step 1: Recipe for fusion patterns like conv/linear.
+        if config.is_qat:
+            # Annotate QAT specific pattern: mainly due to BN not folded in prepare_qat
+            self._annotate_qat_conv2d_fusion_pattern(model, config)
+
+        self._annotate_conv2d_fusion_pattern(model, config)
+
+        # Step 2: Recipe to propagate annotations for patterns besides conv/linear.
+        # Go through all the nodes from start to end.
+        # Recipe refer to https://github.com/intel/intel-extension-for-pytorch/blob/
+        # 90d19323d96afc53fcc22ba5a7bb3fb07fdd6c1c/intel_extension_for_pytorch/quantization/_recipe.py#L538
+        for node in model.graph.nodes:
+            self._annotation_propagation_quantizable_pattern(node, config)
+
+        # Step 3: For quantizable ops, such as maxpool2d, we need to quantize their output if their
+        # inputs are quantized, so that we can fuse dq-operator-q into a quantized op.
+        # Refer to https://github.com/intel/intel-extension-for-pytorch/blob/
+        # 90d19323d96afc53fcc22ba5a7bb3fb07fdd6c1c/intel_extension_for_pytorch/quantization/_recipe.py#L487
+        for node in model.graph.nodes:
+            self._annotate_output_for_int8_in_int8_out_pattern(node, config)
+
+        return model
+
+    def _annotate_for_dynamic_quantization_config(
+        self, model: torch.fx.GraphModule
+    ) -> torch.fx.GraphModule:
+        config = self.global_config
+        self._annotate_linear(model, config)
+        return model
+
+    def _annotate_qat_conv2d_fusion_pattern(
+        self, model: torch.fx.GraphModule, config: QuantizationConfig
+    ):
+        # Annotate QAT Specific patterns
+        self._annotate_qat_conv2d_bn_binary_unary(model, config)
+        self._annotate_qat_conv2d_bn_binary(model, config)
+        self._annotate_qat_conv2d_bn_unary(model, config)
+        self._annotate_qat_conv2d_bn(model, config)
+
+    def _annotate_qat_conv2d_bn_binary_unary(
+        self, gm: torch.fx.GraphModule, quantization_config: QuantizationConfig
+    ) -> None:
+        fused_partitions = find_sequential_partitions(
+            gm, [torch.nn.Conv2d, torch.nn.BatchNorm2d, operator.add, torch.nn.ReLU]
+        )
+        for fused_partition in fused_partitions:
+            (
+                conv_partition,
+                bn_partition,
+                binary_partition,
+                unary_partition,
+            ) = fused_partition
+
+            (
+                conv_node,
+                bn_output_node,
+                binary_node,
+                unary_node,
+            ) = self._get_output_nodes_of_partitions(
+                [conv_partition, bn_partition, binary_partition, unary_partition]
+            )
+            if len(bn_output_node.users) != 1:
+                # Conv BN pattern should only have 1 user.
+                continue
+            (
+                bn_output_node_idx,
+                extra_input_node_idx,
+            ) = self._get_input_idx_for_binary_node(bn_output_node, binary_node)
+            if (bn_output_node_idx is None) or (extra_input_node_idx is None):
+                continue
+            if bn_output_node != binary_node.args[bn_output_node_idx]:
+                raise ValueError(f"{bn_output_node} doesn't match input of binary node")
+            extra_input_node = binary_node.args[extra_input_node_idx]
+
+            if (
+                conv_node.op != "call_function"
+                or conv_node.target != torch.ops.aten.conv2d.default
+            ):
+                continue
+
+            if _is_annotated([unary_node, binary_node, bn_output_node, conv_node]):
+                continue
+
+            self._annotate_conv_node_helper(conv_node, False, quantization_config)
+
+            binary_node_input_qspec_map = {}
+            binary_node_input_qspec_map[extra_input_node] = get_input_act_qspec(
+                quantization_config
+            )
+            binary_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation(
+                input_qspec_map=binary_node_input_qspec_map,
+                _annotated=True,
+            )
+            unary_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation(
+                # TODO: Remove the output annotation in QAT once the QAT util supports the pattern matcher.
+                output_qspec=get_output_act_qspec(quantization_config),  # type: ignore[arg-type]
+                _annotated=True,
+                _is_output_of_quantized_pattern=True,
+            )
+            nodes_to_mark_annotated = list(conv_partition.nodes)
+            nodes_to_mark_annotated.extend(list(bn_partition.nodes))
+            nodes_to_mark_annotated.extend(list(binary_partition.nodes))
+            nodes_to_mark_annotated.extend(list(unary_partition.nodes))
+            _mark_nodes_as_annotated(nodes_to_mark_annotated)
+
+    def _annotate_qat_conv2d_bn_binary(
+        self, gm: torch.fx.GraphModule, quantization_config: QuantizationConfig
+    ) -> None:
+        fused_partitions = find_sequential_partitions(
+            gm, [torch.nn.Conv2d, torch.nn.BatchNorm2d, operator.add]
+        )
+        for fused_partition in fused_partitions:
+            conv_partition, bn_partition, binary_partition = fused_partition
+            (
+                conv_node,
+                bn_output_node,
+                binary_node,
+            ) = self._get_output_nodes_of_partitions(
+                [conv_partition, bn_partition, binary_partition]
+            )
+            if len(bn_output_node.users) != 1:
+                # Conv BN pattern should only have 1 user.
+                continue
+            (
+                bn_output_node_idx,
+                extra_input_node_idx,
+            ) = self._get_input_idx_for_binary_node(bn_output_node, binary_node)
+            if (bn_output_node_idx is None) or (extra_input_node_idx is None):
+                continue
+            if bn_output_node != binary_node.args[bn_output_node_idx]:
+                raise ValueError(f"{bn_output_node} doesn't match input of binary node")
+
+            extra_input_node = binary_node.args[extra_input_node_idx]
+
+            if (
+                conv_node.op != "call_function"
+                or conv_node.target != torch.ops.aten.conv2d.default
+            ):
+                continue
+
+            if _is_annotated([binary_node, bn_output_node, conv_node]):
+                continue
+
+            self._annotate_conv_node_helper(conv_node, False, quantization_config)
+
+            binary_node_input_qspec_map = {}
+            binary_node_input_qspec_map[extra_input_node] = get_input_act_qspec(
+                quantization_config
+            )
+            binary_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation(
+                input_qspec_map=binary_node_input_qspec_map,
+                # TODO: Remove the output annotation in QAT once the QAT util supports the pattern matcher.
+                output_qspec=get_output_act_qspec(quantization_config),  # type: ignore[arg-type]
+                _annotated=True,
+                _is_output_of_quantized_pattern=True,
+            )
+            nodes_to_mark_annotated = list(conv_partition.nodes)
+            nodes_to_mark_annotated.extend(list(bn_partition.nodes))
+            nodes_to_mark_annotated.extend(list(binary_partition.nodes))
+            _mark_nodes_as_annotated(nodes_to_mark_annotated)
+
+    def _annotate_qat_conv2d_bn_unary(
+        self, gm: torch.fx.GraphModule, quantization_config: QuantizationConfig
+    ) -> None:
+        fused_partitions = []
+        unary_patterns = [
+            [torch.nn.Conv2d, torch.nn.BatchNorm2d, torch.nn.ReLU],
+            [torch.nn.Conv2d, torch.nn.BatchNorm2d, torch.nn.Hardtanh],
+            [torch.nn.Conv2d, torch.nn.BatchNorm2d, torch.nn.Hardswish],
+            [torch.nn.Conv2d, torch.nn.BatchNorm2d, torch.nn.ReLU6],
+        ]
+        for unary_pattern in unary_patterns:
+            partitions = find_sequential_partitions(gm, unary_pattern)
+            if partitions:
+                # Extend the fused_partitions if partitions is not empty
+                fused_partitions.extend(partitions)
+
+        for fused_partition in fused_partitions:
+            conv_partition, bn_partition, unary_partition = fused_partition
+            (
+                conv_node,
+                bn_output_node,
+                unary_node,
+            ) = self._get_output_nodes_of_partitions(
+                [conv_partition, bn_partition, unary_partition]
+            )
+
+            if (
+                conv_node.op != "call_function"
+                or conv_node.target != torch.ops.aten.conv2d.default
+            ):
+                continue
+
+            if _is_annotated([unary_node, bn_output_node, conv_node]):
+                continue
+
+            self._annotate_conv_node_helper(conv_node, False, quantization_config)
+            unary_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation(
+                # TODO: Remove the output annotation in QAT once the QAT util supports the pattern matcher.
+                output_qspec=get_output_act_qspec(quantization_config),  # type: ignore[arg-type]
+                _annotated=True,
+                _is_output_of_quantized_pattern=True,
+            )
+            nodes_to_mark_annotated = list(conv_partition.nodes)
+            nodes_to_mark_annotated.extend(list(bn_partition.nodes))
+            nodes_to_mark_annotated.extend(list(unary_partition.nodes))
+            _mark_nodes_as_annotated(nodes_to_mark_annotated)
+
+    def _annotate_qat_conv2d_bn(
+        self, gm: torch.fx.GraphModule, quantization_config: QuantizationConfig
+    ) -> None:
+        fused_partitions = find_sequential_partitions(
+            gm, [torch.nn.Conv2d, torch.nn.BatchNorm2d]
+        )
+        for fused_partition in fused_partitions:
+            conv_partition, bn_partition = fused_partition
+            conv_node, bn_output_node = self._get_output_nodes_of_partitions(
+                [conv_partition, bn_partition]
+            )
+
+            if (
+                conv_node.op != "call_function"
+                or conv_node.target != torch.ops.aten.conv2d.default
+            ):
+                continue
+
+            if _is_annotated([bn_output_node, conv_node]):
+                continue
+
+            self._annotate_conv_node_helper(conv_node, False, quantization_config)
+            bn_output_node.meta[
+                QUANT_ANNOTATION_KEY
+            ] = _X86InductorQuantizationAnnotation(
+                # TODO: Remove the output annotation in QAT once the QAT util supports the pattern matcher.
+                output_qspec=get_output_act_qspec(quantization_config),  # type: ignore[arg-type]
+                _annotated=True,
+                _is_output_of_quantized_pattern=True,
+            )
+            nodes_to_mark_annotated = list(conv_partition.nodes)
+            nodes_to_mark_annotated.extend(list(bn_partition.nodes))
+            _mark_nodes_as_annotated(nodes_to_mark_annotated)
+
+    def _annotate_conv2d_fusion_pattern(
+        self, model: torch.fx.GraphModule, config: QuantizationConfig
+    ):
+        self._annotate_conv2d_binary_unary(model, config)
+        self._annotate_conv2d_binary(model, config)
+        self._annotate_conv2d_unary(model, config)
+        self._annotate_conv2d(model, config)
+        self._annotate_linear_unary(model, config)
+        self._annotate_linear(model, config)
+
+    def _annotate_conv2d_binary_unary(
+        self, gm: torch.fx.GraphModule, quantization_config: QuantizationConfig
+    ) -> None:
+        # Conv2d + add + unary op
+        fused_partitions = find_sequential_partitions(
+            gm, [torch.nn.Conv2d, operator.add, torch.nn.ReLU]
+        )
+        for fused_partition in fused_partitions:
+            conv_partition, binary_partition, unary_partition = fused_partition
+            conv_node, binary_node, unary_node = self._get_output_nodes_of_partitions(
+                [conv_partition, binary_partition, unary_partition]
+            )
+            if len(conv_node.users) != 1:
+                # Conv Node should only have 1 user node
+                continue
+            conv_node_idx, extra_input_node_idx = self._get_input_idx_for_binary_node(
+                conv_node, binary_node
+            )
+            if (conv_node_idx is None) or (extra_input_node_idx is None):
+                continue
+            if conv_node != binary_node.args[conv_node_idx]:
+                raise ValueError(f"{conv_node} doesn't match input of binary node")
+            extra_input_node = binary_node.args[extra_input_node_idx]
+            if (
+                conv_node.op != "call_function"
+                or conv_node.target != torch.ops.aten.conv2d.default
+            ):
+                # No conv node found to be fused with add
+                continue
+            if _is_annotated([unary_node, binary_node, conv_node]):
+                continue
+            self._annotate_conv_node_helper(conv_node, False, quantization_config)
+            binary_node_input_qspec_map = {}
+            binary_node_input_qspec_map[extra_input_node] = get_input_act_qspec(
+                quantization_config
+            )
+            binary_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation(
+                input_qspec_map=binary_node_input_qspec_map,
+                _annotated=True,
+            )
+            unary_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation(
+                _annotated=True,
+                _is_output_of_quantized_pattern=True,
+            )
+
+    def _annotate_conv2d_binary(
+        self, gm: torch.fx.GraphModule, quantization_config: QuantizationConfig
+    ) -> None:
+        # Conv2d + add
+        fused_partitions = find_sequential_partitions(
+            gm, [torch.nn.Conv2d, operator.add]
+        )
+        for fused_partition in fused_partitions:
+            conv_partition, binary_partition = fused_partition
+            conv_node, binary_node = self._get_output_nodes_of_partitions(
+                [conv_partition, binary_partition]
+            )
+            if len(conv_node.users) != 1:
+                # The conv node should have exactly one user node
+                continue
+            conv_node_idx, extra_input_node_idx = self._get_input_idx_for_binary_node(
+                conv_node, binary_node
+            )
+            if (conv_node_idx is None) or (extra_input_node_idx is None):
+                continue
+            if conv_node != binary_node.args[conv_node_idx]:
+                raise ValueError(f"{conv_node} doesn't match input of binary node")
+            extra_input_node = binary_node.args[extra_input_node_idx]
+            assert isinstance(conv_node, Node)
+            if (
+                conv_node.op != "call_function"
+                or conv_node.target != torch.ops.aten.conv2d.default
+            ):
+                # No conv node found to be fused with add
+                continue
+            if _is_annotated([binary_node, conv_node]):
+                continue
+            self._annotate_conv_node_helper(conv_node, False, quantization_config)
+            binary_node_input_qspec_map = {}
+            binary_node_input_qspec_map[extra_input_node] = get_input_act_qspec(
+                quantization_config
+            )
+            binary_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation(
+                input_qspec_map=binary_node_input_qspec_map,
+                _annotated=True,
+                _is_output_of_quantized_pattern=True,
+            )
+
+    def _annotate_conv2d_unary(
+        self, gm: torch.fx.GraphModule, quantization_config: QuantizationConfig
+    ) -> None:
+        fused_partitions = []
+        unary_patterns = [
+            [torch.nn.Conv2d, torch.nn.ReLU],
+            [torch.nn.Conv2d, torch.nn.Hardtanh],
+            [torch.nn.Conv2d, torch.nn.Hardswish],
+            [torch.nn.Conv2d, torch.nn.ReLU6],
+        ]
+        for unary_pattern in unary_patterns:
+            partitions = find_sequential_partitions(gm, unary_pattern)
+            if partitions:
+                # Extend the fused_partitions if partitions is not empty
+                fused_partitions.extend(partitions)
+
+        for fused_partition in fused_partitions:
+            conv_partition, unary_partition = fused_partition
+            conv_node, unary_node = self._get_output_nodes_of_partitions(
+                [conv_partition, unary_partition]
+            )
+            if (
+                conv_node.op != "call_function"
+                or conv_node.target != torch.ops.aten.conv2d.default
+            ):
+                continue
+            if _is_annotated([unary_node, conv_node]):
+                continue
+            self._annotate_conv_node_helper(conv_node, False, quantization_config)
+            unary_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation(
+                _annotated=True,
+                _is_output_of_quantized_pattern=True,
+            )
+
+    def _annotate_conv2d(
+        self, gm: torch.fx.GraphModule, quantization_config: QuantizationConfig
+    ) -> None:
+        conv_partitions = get_source_partitions(
+            gm.graph, [torch.nn.Conv2d, torch.nn.functional.conv2d]
+        )
+        conv_partitions = list(itertools.chain.from_iterable(conv_partitions.values()))
+        for conv_partition in conv_partitions:
+            if len(conv_partition.output_nodes) > 1:
+                raise ValueError("conv partition has more than one output node")
+            conv_node = conv_partition.output_nodes[0]
+            if (
+                conv_node.op != "call_function"
+                or conv_node.target != torch.ops.aten.conv2d.default
+            ):
+                raise ValueError(f"{conv_node} is not an aten conv2d operator")
+            # skip annotation if it is already annotated
+            if _is_annotated([conv_node]):
+                continue
+            self._annotate_conv_node_helper(conv_node, True, quantization_config)
+
+    def _annotate_maxpool2d(
+        self, node: Node, quantization_config: QuantizationConfig
+    ) -> None:
+        if node.target is not torch.ops.aten.max_pool2d.default:
+            return
+        maxpool_node = node
+        if _is_any_annotated(
+            [
+                maxpool_node,
+            ]
+        ):
+            return
+        input_node = maxpool_node.args[0]
+        assert isinstance(input_node, Node)
+        input_qspec_map = {}
+        input_qspec_map[input_node] = get_input_act_qspec(quantization_config)
+        maxpool_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            _annotated=True,
+            _is_output_of_quantized_pattern=True,
+        )
+
+    def _annotate_cat(
+        self, node: Node, quantization_config: QuantizationConfig
+    ) -> None:
+        cat_node = node
+        input_nodes = cat_node.args[0]
+        assert isinstance(input_nodes, Sequence)
+        first_input_node = input_nodes[0]
+        input_qspec_map = {}
+        assert isinstance(first_input_node, Node)
+        assert isinstance(cat_node, Node)
+        input_qspec_map[first_input_node] = get_input_act_qspec(quantization_config)
+        share_qparams_with_input_act0_qspec = SharedQuantizationSpec(
+            (first_input_node, cat_node)
+        )
+
+        for input_node in input_nodes[1:]:
+            if input_node not in input_qspec_map:
+                # Handle the case where cat receives the same node more than once, e.g. torch.cat([input0, input0], 1)
+                assert isinstance(input_node, Node)
+                input_qspec_map[input_node] = share_qparams_with_input_act0_qspec
+
+        cat_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            _annotated=True,
+            _is_output_of_quantized_pattern=True,
+        )
+
+    def _annotation_propagation_quantizable_pattern(
+        self, node: Node, quantization_config: QuantizationConfig
+    ) -> None:
+        # Propagate annotation to quantizable patterns.
+        if (
+            (node.target in quantizable_ops_pt2e)
+            and (not _is_any_annotated([node]))
+            and (node.op == "call_function")
+        ):
+
+            def is_all_inputs_connected_to_quantized_op(input_nodes):
+                # Ensure every input connects to a fusion pattern or a quantized node
+                for input_node in input_nodes:
+                    if not _is_quantized_op_pt2e(input_node):
+                        return False
+                return True
+
+            if node.target is torch.ops.aten.max_pool2d.default:
+                # Recipe for maxpool2d: check whether input arg[0] of maxpool2d is quantized
+                input_nodes_to_check = [node.all_input_nodes[0]]
+                if not is_all_inputs_connected_to_quantized_op(input_nodes_to_check):
+                    return
+                self._annotate_maxpool2d(node, quantization_config)
+                return
+            elif node.target is torch.ops.aten.cat.default:
+                input_nodes_to_check = node.all_input_nodes
+                if not is_all_inputs_connected_to_quantized_op(input_nodes_to_check):
+                    return
+                self._annotate_cat(node, quantization_config)
+            else:
+                input_node = node.all_input_nodes[0]
+                if not is_all_inputs_connected_to_quantized_op(
+                    [
+                        input_node,
+                    ]
+                ):
+                    return
+                input_qspec_map = {}
+                input_qspec_map[input_node] = get_input_act_qspec(quantization_config)
+                node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation(
+                    input_qspec_map=input_qspec_map,
+                    _annotated=True,
+                    _is_output_of_quantized_pattern=True,
+                )
+        return
+
+    def _annotate_output_share_observer_as_input(
+        self, input_node: Node, source_node: Node
+    ):
+        source_node_quantization_annotation = (
+            source_node.meta[QUANT_ANNOTATION_KEY]
+            if QUANT_ANNOTATION_KEY in source_node.meta
+            else None
+        )
+        if (
+            source_node_quantization_annotation
+            and source_node_quantization_annotation._is_output_of_quantized_pattern
+        ):
+            edge_or_node = (input_node, source_node)
+            source_node_quantization_annotation.output_qspec = SharedQuantizationSpec(
+                edge_or_node
+            )
+        return
+
+    def _annotate_output_for_int8_in_int8_out_pattern(
+        self, node: Node, quantization_config: QuantizationConfig
+    ) -> None:
+        r"""
+        Check and insert observer at output of node in int8_in_int8_out_ops_pt2e if needed.
+        Recipe refers to https://github.com/intel/intel-extension-for-pytorch/blob/
+        90d19323d96afc53fcc22ba5a7bb3fb07fdd6c1c/intel_extension_for_pytorch/quantization/_utils.py#L495
+        """
+        edge_or_node: Tuple[Node, Node]
+        if (node.target in int8_in_int8_out_ops_pt2e) and (_is_any_annotated([node])):
+            if node.target == torch.ops.aten.max_pool2d.default:
+                maxpool_node = node
+                if not _is_all_annotated(
+                    [
+                        maxpool_node,
+                    ]
+                ):
+                    return
+                # Get the quantization_annotation from the maxpool node
+                maxpool_node_quantization_annotation = (
+                    maxpool_node.meta[QUANT_ANNOTATION_KEY]
+                    if QUANT_ANNOTATION_KEY in maxpool_node.meta
+                    else None
+                )
+                if (
+                    maxpool_node_quantization_annotation
+                    and maxpool_node_quantization_annotation._is_output_of_quantized_pattern
+                ):
+                    # Annotate the output_qspec of the maxpool node
+                    input_act = maxpool_node.args[0]
+                    assert isinstance(input_act, Node)
+                    assert isinstance(maxpool_node, Node)
+                    edge_or_node = (input_act, maxpool_node)
+                    maxpool_node_quantization_annotation.output_qspec = (
+                        SharedQuantizationSpec(edge_or_node)
+                    )
+            else:
+                input_node = node.all_input_nodes[0]
+                self._annotate_output_share_observer_as_input(input_node, node)
+        return
+
+    def _annotate_linear(
+        self, gm: torch.fx.GraphModule, quantization_config: QuantizationConfig
+    ) -> None:
+        linear_partitions = get_source_partitions(
+            gm.graph, [torch.nn.Linear, torch.nn.functional.linear]
+        )
+        linear_partitions = list(
+            itertools.chain.from_iterable(linear_partitions.values())
+        )
+        for partition in linear_partitions:
+            if len(partition.output_nodes) > 1:
+                raise ValueError(
+                    "Linear partition cannot have more than one output node"
+                )
+            linear_node = partition.output_nodes[0]
+            if linear_node.op != "call_function" or linear_node.target not in (
+                torch.ops.aten.linear.default,
+            ):
+                raise ValueError(f"{linear_node} is not an aten linear operator")
+            # skip annotation if it is already annotated
+            if _is_annotated([linear_node]):
+                continue
+            self._annotate_linear_node_helper(linear_node, True, quantization_config)
+
+    def _annotate_linear_unary(
+        self, gm: torch.fx.GraphModule, quantization_config: QuantizationConfig
+    ) -> None:
+        postop_list = [
+            torch.nn.ReLU,
+            torch.nn.LeakyReLU,
+            torch.nn.Tanh,
+        ]
+        fused_partitions: List[tuple] = []
+        for postop in postop_list:
+            fused_partitions = fused_partitions + find_sequential_partitions(
+                gm, [torch.nn.Linear, postop]
+            )
+        for fused_partition in fused_partitions:
+            linear_partition, unary_partition = fused_partition
+            linear_node, unary_node = self._get_output_nodes_of_partitions(
+                [linear_partition, unary_partition]
+            )
+            if linear_node.op != "call_function" or linear_node.target not in (
+                torch.ops.aten.linear.default,
+            ):
+                continue
+            if _is_annotated([unary_node, linear_node]):
+                continue
+            self._annotate_linear_node_helper(linear_node, False, quantization_config)
+            unary_node.meta[QUANT_ANNOTATION_KEY] = _X86InductorQuantizationAnnotation(
+                _annotated=True,
+                _is_output_of_quantized_pattern=True,
+            )
+
+    def validate(self, model: torch.fx.GraphModule) -> None:
+        pass
+
+    @classmethod
+    def get_supported_operators(cls) -> List[OperatorConfig]:
+        return cls.supported_config_and_operators
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/xnnpack_quantizer.py b/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/xnnpack_quantizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5ffdfc409b8795c8344b58b13296493f7855554
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/xnnpack_quantizer.py
@@ -0,0 +1,453 @@
+from __future__ import annotations
+
+import copy
+import functools
+
+from typing import Any, Callable, Dict, List, Optional, Set
+
+import torch
+import torch._dynamo as torchdynamo
+import torch.nn.functional as F
+from torch.ao.quantization.fake_quantize import (
+    FakeQuantize,
+    FusedMovingAvgObsFakeQuantize,
+)
+from torch.ao.quantization.observer import (
+    HistogramObserver,
+    MinMaxObserver,
+    MovingAverageMinMaxObserver,
+    MovingAveragePerChannelMinMaxObserver,
+    PerChannelMinMaxObserver,
+    PlaceholderObserver,
+)
+
+from torch.ao.quantization.qconfig import _ObserverOrFakeQuantizeConstructor
+
+from torch.ao.quantization.quantizer import QuantizationSpec, Quantizer
+
+from torch.ao.quantization.quantizer.xnnpack_quantizer_utils import (
+    _convert_scalars_to_attrs,
+    OP_TO_ANNOTATOR,
+    OperatorConfig,
+    OperatorPatternType,
+    propagate_annotation,
+    QuantizationConfig,
+)
+
+from torch.fx import Node
+
+
+__all__ = [
+    "XNNPACKQuantizer",
+    "get_symmetric_quantization_config",
+]
+
+
+def _get_dynamo_graph(function: Callable, inputs) -> torch.fx.Graph:
+    gm, _ = torchdynamo.export(function, aten_graph=True)(*inputs)
+    gm.graph.eliminate_dead_code()
+    return gm.graph
+
+
+def _get_linear_patterns(input_size: List[int]):
+    in_channels = input_size[-1]
+    out_channels = 8  # hard coding but this should not matter
+    weight = torch.ones((out_channels, in_channels))
+    bias = torch.ones((out_channels,))
+    act = torch.ones(input_size)
+
+    def linear_op(act, weight, bias=None):
+        return F.linear(act, weight, bias)
+
+    pattern_w_bias = _get_dynamo_graph(linear_op, (act, weight, bias))
+    pattern_wo_bias = _get_dynamo_graph(linear_op, (act, weight))
+    return [pattern_w_bias, pattern_wo_bias]
+
+
+def _supported_symmetric_quantized_operators() -> Dict[str, List[OperatorPatternType]]:
+    supported_operators: Dict[str, List[OperatorPatternType]] = {
+        # Both conv and linear should be able to handle relu + hardtanh fusion since
+        # those are clamp ops
+        "conv2d": [
+            [torch.nn.Conv2d, torch.nn.ReLU],
+            [torch.nn.Conv2d, F.relu],
+            [F.conv2d, torch.nn.ReLU],
+            [F.conv2d, F.relu],
+        ],
+        "linear": [[torch.nn.Linear], [F.linear]],
+        "add": [[torch.add]],
+        "max_pool2d": [[torch.nn.MaxPool2d], [F.max_pool2d]],
+        "adaptive_avg_pool2d": [
+            [torch.nn.AdaptiveAvgPool2d],
+            [F.adaptive_avg_pool2d],
+        ],
+    }
+    return copy.deepcopy(supported_operators)
+
+
+def _get_supported_symmetric_config_and_operators() -> List[OperatorConfig]:
+    supported_config_and_operators: List[OperatorConfig] = []
+    for quantization_config in [
+        get_symmetric_quantization_config(),
+        get_symmetric_quantization_config(is_qat=True),
+        get_symmetric_quantization_config(is_per_channel=True),
+        get_symmetric_quantization_config(is_per_channel=True, is_qat=True),
+    ]:
+        ops = _supported_symmetric_quantized_operators()
+        for pattern_list in ops.values():
+            supported_config_and_operators.append(
+                OperatorConfig(quantization_config, pattern_list)
+            )
+    return copy.deepcopy(supported_config_and_operators)
+
+
+@functools.lru_cache
+def get_symmetric_quantization_config(
+    is_per_channel: bool = False,
+    is_qat: bool = False,
+    is_dynamic: bool = False,
+    act_qmin: int = -128,
+    act_qmax: int = 127,
+    weight_qmin: int = -127,
+    weight_qmax: int = 127,
+):
+    extra_args: Dict[str, Any] = {"eps": 2**-12}
+    if is_qat:
+        if is_dynamic:
+            act_observer_or_fake_quant_ctr = FakeQuantize
+            dynamic_quant_observer = MovingAverageMinMaxObserver.with_args(
+                averaging_constant=1
+            )
+            extra_args["observer"] = dynamic_quant_observer
+        else:
+            act_observer_or_fake_quant_ctr = FusedMovingAvgObsFakeQuantize  # type: ignore[assignment]
+    else:
+        if is_dynamic:
+            act_observer_or_fake_quant_ctr = PlaceholderObserver  # type: ignore[assignment]
+        else:
+            act_observer_or_fake_quant_ctr = HistogramObserver  # type: ignore[assignment]
+
+    act_quantization_spec = QuantizationSpec(
+        dtype=torch.int8,
+        quant_min=act_qmin,
+        quant_max=act_qmax,
+        qscheme=torch.per_tensor_affine,
+        is_dynamic=is_dynamic,
+        observer_or_fake_quant_ctr=act_observer_or_fake_quant_ctr.with_args(
+            **extra_args,
+        ),
+    )
+    weight_qscheme = (
+        torch.per_channel_symmetric if is_per_channel else torch.per_tensor_symmetric
+    )
+    weight_observer_or_fake_quant_ctr: _ObserverOrFakeQuantizeConstructor = (
+        MinMaxObserver
+    )
+    if is_qat:
+        # TODO: qat + per channel?
+        weight_observer_or_fake_quant_ctr = FusedMovingAvgObsFakeQuantize
+    elif is_per_channel:
+        weight_observer_or_fake_quant_ctr = PerChannelMinMaxObserver
+
+    extra_args: Dict[str, Any] = {"eps": 2**-12}
+    if is_qat:
+        if weight_qscheme == torch.per_tensor_symmetric:
+            extra_args["observer"] = MovingAverageMinMaxObserver
+        else:
+            extra_args["observer"] = MovingAveragePerChannelMinMaxObserver  # type: ignore[dict-item]
+    weight_quantization_spec = QuantizationSpec(
+        dtype=torch.int8,
+        quant_min=weight_qmin,
+        quant_max=weight_qmax,
+        qscheme=weight_qscheme,
+        ch_axis=0,
+        is_dynamic=False,
+        observer_or_fake_quant_ctr=weight_observer_or_fake_quant_ctr.with_args(
+            **extra_args
+        ),
+    )
+
+    bias_quantization_spec = None
+    if is_dynamic:
+        quantization_config = QuantizationConfig(
+            act_quantization_spec,
+            None,
+            weight_quantization_spec,
+            bias_quantization_spec,
+            is_qat,
+        )
+    else:
+        quantization_config = QuantizationConfig(
+            act_quantization_spec,
+            act_quantization_spec,
+            weight_quantization_spec,
+            bias_quantization_spec,
+            is_qat,
+        )
+    return quantization_config
+
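+# Illustrative note (added): because of functools.lru_cache above, repeated calls with
+# the same flags return the same cached QuantizationConfig instance, e.g.:
+#
+#     ptq_config = get_symmetric_quantization_config()
+#     qat_per_channel_config = get_symmetric_quantization_config(
+#         is_per_channel=True, is_qat=True
+#     )
+#     dynamic_config = get_symmetric_quantization_config(is_dynamic=True)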
+
+def _get_supported_config_and_operators() -> List[OperatorConfig]:
+    return _get_supported_symmetric_config_and_operators()
+
+
+def _get_module_name_filter(module_name: str):
+    """Get the module_name_filter function for a given module name, the filter accepts
+    a node and checks if the node comes from a module that has certain module name
+
+    For example:
+        node: linear_op = call_function[...](...)  # comes from a module with name blocks.sub.linear1
+
+
+    >> module_name_filter = _get_module_name_filter("blocks.sub")
+    >> print(module_name_filter(node))
+    True  # the node is from "blocks.sub" based on the fully qualified name "blocks.sub.linear1"
+    """
+
+    def module_name_filter(n: Node) -> bool:
+        # example: {
+        #    'L__self___sub': ("L['self'].sub", ),
+        #    'L__self___sub_linear': ("L['self'].sub.linear", )
+        # }
+        # get_attr nodes don't have nn_module_stack?
+        nn_module_stack = n.meta.get("nn_module_stack", {})
+        names = [n[len("L['self'].") :] for n, klass in nn_module_stack.values()]
+        return module_name in names
+
+    return module_name_filter
+
+
+def _get_module_type_filter(tp: Callable):
+    """Get the module_type_filter function for a given module type, the filter accepts
+    a node and checks if the node comes from a module that has certain module type
+
+    For example:
+        node: linear_op = call_function[...](...)  # comes from a module with type Block -> Sub -> Linear
+
+
+    >> module_type_filter = _get_module_type_filter(Sub)  # submodule with type `Sub`, under the `Block` submodule
+    >> print(module_type_filter(node))
+    True  # the node is from the submodule `Sub` (same for `Block` and `Linear` as well)
+    """
+
+    def module_type_filter(n: Node) -> bool:
+        # example: {
+        #     'L__self___sub': ("L['self'].sub", ),
+        #     'L__self___sub_linear': ("L['self'].sub.linear", )
+        # }
+        nn_module_stack = n.meta.get("nn_module_stack", {})
+        types = [t for _, t in nn_module_stack.values()]
+        return tp in types
+
+    return module_type_filter
+
+
+def _get_not_module_type_or_name_filter(
+    tp_list: List[Callable], module_name_list: List[str]
+) -> Callable[[Node], bool]:
+    module_type_filters = [_get_module_type_filter(tp) for tp in tp_list]
+    module_name_list_filters = [_get_module_name_filter(m) for m in module_name_list]
+
+    def not_module_type_or_name_filter(n: Node) -> bool:
+        return not any(f(n) for f in module_type_filters + module_name_list_filters)
+
+    return not_module_type_or_name_filter
+
+
+class XNNPACKQuantizer(Quantizer):
+    supported_config_and_operators = _get_supported_config_and_operators()
+    STATIC_QAT_ONLY_OPS = [
+        "conv_bn_relu",
+        "conv_bn",
+    ]
+
+    # static quantization ops (both PTQ and QAT)
+    # Preserve the order so that fusion patterns come before the singular ops
+    STATIC_OPS = [
+        "linear_relu",
+        "linear",
+        "conv_relu",
+        "conv",
+        "adaptive_avg_pool2d",
+        # TODO: move this to BoltNNQuantizer?
+        "gru_io_only",
+        "max_pool2d",
+        "add_relu",
+        "add",
+        "mul_relu",
+        "mul",
+        "cat",
+    ]
+
+    DYNAMIC_OPS = [
+        "linear",
+    ]
+
+    def __init__(self):
+        super().__init__()
+        self.global_config: Optional[QuantizationConfig] = None
+        self.operator_type_config: Dict[
+            torch._ops.OpOverloadPacket, Optional[QuantizationConfig]
+        ] = {}
+        self.module_type_config: Dict[Callable, Optional[QuantizationConfig]] = {}
+        self.module_name_config: Dict[str, Optional[QuantizationConfig]] = {}
+
+    @classmethod
+    def get_supported_quantization_configs(cls) -> List[QuantizationConfig]:
+        op_configs: Set[QuantizationConfig] = set({})
+        for spec, _ in cls.supported_config_and_operators:
+            op_configs.add(spec)
+        return list(op_configs)
+
+    @classmethod
+    def get_supported_operator_for_quantization_config(
+        cls, quantization_config: Optional[QuantizationConfig]
+    ) -> List[OperatorPatternType]:
+        if quantization_config is None:
+            all_ops = []
+            for _, ops in cls.supported_config_and_operators:
+                all_ops.extend(ops)
+            return all_ops
+
+        for config, ops in cls.supported_config_and_operators:
+            # note: this assumes each entry in cls.supported_config_and_operators
+            # corresponds to one spec, e.g. we don't have
+            # [(spec1, op_list1), (spec1, op_list2), (spec2, op_list3)]
+            # where the first and second entry have the same spec but did not
+            # merge the op list
+            if config == quantization_config:
+                return ops
+        return []
+
+    def set_global(self, quantization_config: QuantizationConfig) -> XNNPACKQuantizer:
+        self.global_config = quantization_config
+        return self
+
+    def set_operator_type(
+        self,
+        operator_type: torch._ops.OpOverloadPacket,
+        quantization_config: QuantizationConfig,
+    ) -> XNNPACKQuantizer:
+        self.operator_type_config[operator_type] = quantization_config
+        return self
+
+    def set_module_type(
+        self, module_type: Callable, quantization_config: QuantizationConfig
+    ):
+        """Set quantization_config for a submodule with type: `module_type`, for example:
+        quantizer.set_module_name(Sub) or quantizer.set_module_name(nn.Linear), it will quantize all supported operator/operator
+        patterns in the submodule with this module type with the given `quantization_config`
+        """
+        self.module_type_config[module_type] = quantization_config
+        return self
+
+    def set_module_name(
+        self, module_name: str, quantization_config: Optional[QuantizationConfig]
+    ):
+        """Set quantization_config for a submodule with name: `module_name`, for example:
+        quantizer.set_module_name("blocks.sub"), it will quantize all supported operator/operator
+        patterns in the submodule with this module name with the given `quantization_config`
+        """
+        assert (
+            quantization_config is not None
+        ), " quantization_config == None is not supported yet"
+        self.module_name_config[module_name] = quantization_config
+        return self
+
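+    # Usage sketch (illustrative only; assumes the PT2E entry points
+    # `capture_pre_autograd_graph` and `prepare_pt2e` available in this torch build):
+    #
+    #     quantizer = XNNPACKQuantizer().set_global(
+    #         get_symmetric_quantization_config(is_per_channel=True)
+    #     )
+    #     quantizer.set_module_name("blocks.sub", get_symmetric_quantization_config())
+    #     exported = capture_pre_autograd_graph(model, example_inputs)
+    #     prepared = prepare_pt2e(exported, quantizer)  # invokes annotate() below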
+    def transform_for_annotation(
+        self, model: torch.fx.GraphModule
+    ) -> torch.fx.GraphModule:
+        """Transforms scalar values to tensor attributes"""
+        return _convert_scalars_to_attrs(model)
+
+    def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
+        """just handling global spec for now"""
+        # hacked for handling dynamic linear quant. will fix later.
+        if self.global_config and self.global_config.input_activation.is_dynamic:  # type: ignore[union-attr]
+            model = self._annotate_for_dynamic_quantization_config(model)
+        else:
+            model = self._annotate_for_static_quantization_config(model)
+        propagate_annotation(model)
+        return model
+
+    def _annotate_all_static_patterns(
+        self,
+        model: torch.fx.GraphModule,
+        quantization_config: Optional[QuantizationConfig],
+        filter_fn: Optional[Callable[[Node], bool]] = None,
+    ) -> torch.fx.GraphModule:
+        # TODO: implement the support for None to be canceling out previous annotations
+        if quantization_config is None:
+            return model
+
+        if quantization_config.is_qat:
+            for op in self.STATIC_QAT_ONLY_OPS:
+                OP_TO_ANNOTATOR[op](model, quantization_config, filter_fn)
+        for op in self.STATIC_OPS:
+            OP_TO_ANNOTATOR[op](model, quantization_config, filter_fn)
+        return model
+
+    def _annotate_all_dynamic_patterns(
+        self,
+        model: torch.fx.GraphModule,
+        quantization_config: Optional[QuantizationConfig],
+        filter_fn: Optional[Callable[[Node], bool]] = None,
+    ) -> torch.fx.GraphModule:
+        # TODO: implement the support for None to be canceling out previous annotations
+        if quantization_config is None:
+            return model
+
+        for op in self.DYNAMIC_OPS:
+            OP_TO_ANNOTATOR[op](model, quantization_config, filter_fn)
+        return model
+
+    def _annotate_for_static_quantization_config(
+        self, model: torch.fx.GraphModule
+    ) -> torch.fx.GraphModule:
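+        # Precedence: module-name configs are applied first, then module-type configs;
+        # the global config only annotates nodes excluded by both filters below.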
+        module_name_list = list(self.module_name_config.keys())
+        for module_name, config in self.module_name_config.items():
+            self._annotate_all_static_patterns(
+                model, config, _get_module_name_filter(module_name)
+            )
+
+        tp_list = list(self.module_type_config.keys())
+        for module_type, config in self.module_type_config.items():
+            self._annotate_all_static_patterns(
+                model, config, _get_module_type_filter(module_type)
+            )
+
+        self._annotate_all_static_patterns(
+            model,
+            self.global_config,
+            _get_not_module_type_or_name_filter(tp_list, module_name_list),
+        )
+        return model
+
+    def _annotate_for_dynamic_quantization_config(
+        self, model: torch.fx.GraphModule
+    ) -> torch.fx.GraphModule:
+        module_name_list = list(self.module_name_config.keys())
+        for module_name, config in self.module_name_config.items():
+            self._annotate_all_dynamic_patterns(
+                model, config, _get_module_name_filter(module_name)
+            )
+
+        tp_list = list(self.module_type_config.keys())
+        for module_type, config in self.module_type_config.items():
+            self._annotate_all_dynamic_patterns(
+                model, config, _get_module_type_filter(module_type)
+            )
+
+        self._annotate_all_dynamic_patterns(
+            model,
+            self.global_config,
+            _get_not_module_type_or_name_filter(tp_list, module_name_list),
+        )
+        return model
+
+    def validate(self, model: torch.fx.GraphModule) -> None:
+        pass
+
+    @classmethod
+    def get_supported_operators(cls) -> List[OperatorConfig]:
+        return cls.supported_config_and_operators
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/xnnpack_quantizer_utils.py b/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/xnnpack_quantizer_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..61a38a28c30a4fc1f970caa1523922ecc127bc69
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/quantizer/xnnpack_quantizer_utils.py
@@ -0,0 +1,1032 @@
+import itertools
+import operator
+from dataclasses import dataclass
+from typing import Callable, Dict, List, NamedTuple, Optional
+
+import torch
+import torch.nn.functional as F
+from torch._subclasses import FakeTensor
+from torch.ao.quantization.fx.utils import get_new_attr_name_with_prefix
+from torch.ao.quantization.pt2e.export_utils import _WrapperModule
+from torch.ao.quantization.pt2e.graph_utils import find_sequential_partitions
+from torch.ao.quantization.pt2e.utils import (
+    _conv1d_bn_example_inputs,
+    _conv2d_bn_example_inputs,
+    get_aten_graph_module,
+)
+from torch.ao.quantization.quantizer import (
+    QuantizationAnnotation,
+    QuantizationSpec,
+    QuantizationSpecBase,
+    SharedQuantizationSpec,
+)
+
+from torch.ao.quantization.quantizer.utils import (
+    _annotate_input_qspec_map,
+    _annotate_output_qspec,
+)
+from torch.fx import Node
+from torch.fx.passes.utils.matcher_with_name_node_map_utils import (
+    SubgraphMatcherWithNameNodeMap,
+)
+from torch.fx.passes.utils.source_matcher_utils import get_source_partitions
+
+
+__all__ = [
+    "OperatorConfig",
+    "OperatorPatternType",
+    "QuantizationConfig",
+    "get_input_act_qspec",
+    "get_output_act_qspec",
+    "get_weight_qspec",
+    "get_bias_qspec",
+    "OP_TO_ANNOTATOR",
+    "propagate_annotation",
+]
+
+
+# In the absence of a better name, just winging it with QuantizationConfig
+@dataclass(eq=True, frozen=True)
+class QuantizationConfig:
+    input_activation: Optional[QuantizationSpec]
+    output_activation: Optional[QuantizationSpec]
+    weight: Optional[QuantizationSpec]
+    bias: Optional[QuantizationSpec]
+    # TODO: remove, since we can use observer_or_fake_quant_ctr to express this
+    is_qat: bool = False
+
+
+OperatorPatternType = List[Callable]
+OperatorPatternType.__module__ = (
+    "torch.ao.quantization.quantizer.xnnpack_quantizer_utils"
+)
+
+AnnotatorType = Callable[
+    [
+        torch.fx.GraphModule,
+        Optional[QuantizationConfig],
+        Optional[Callable[[Node], bool]],
+    ],
+    Optional[List[List[Node]]],
+]
+OP_TO_ANNOTATOR: Dict[str, AnnotatorType] = {}
+
+
+def register_annotator(op: str):
+    def decorator(annotator: AnnotatorType):
+        OP_TO_ANNOTATOR[op] = annotator
+
+    return decorator
+
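+# Illustrative sketch (added): a hypothetical out-of-tree annotator could be registered
+# and dispatched through the same table, e.g.:
+#
+#     @register_annotator("my_custom_op")  # "my_custom_op" is a made-up key
+#     def _annotate_my_custom_op(gm, quantization_config, filter_fn=None):
+#         return []  # find matching nodes, annotate them, return annotated partitions
+#
+#     OP_TO_ANNOTATOR["my_custom_op"](gm, quantization_config, None)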
+
+class OperatorConfig(NamedTuple):
+    # TODO: replace List[str] with List[List[Union[nn.Module, FunctionType, BuiltinFunctionType]]]
+    # Basically we are mapping a quantization config to some list of patterns.
+    # a pattern is defined as a list of nn module, function or builtin function names
+    # e.g. [nn.Conv2d, torch.relu, torch.add]
+    # We have not resolved whether fusion can be considered an internal detail of the
+    # quantizer, in which case it would not need to be communicated to the user.
+    # Note this pattern is not really informative since it does not really
+    # tell us the graph structure resulting from the list of ops.
+    config: QuantizationConfig
+    operators: List[OperatorPatternType]
+
+
+def _is_annotated(nodes: List[Node]):
+    """
+    Given a list of nodes (that represents an operator pattern),
+    check if any of the node is annotated, return True if any of the node
+    is annotated, otherwise return False
+    """
+    annotated = False
+    for node in nodes:
+        annotated = annotated or (
+            "quantization_annotation" in node.meta
+            and node.meta["quantization_annotation"]._annotated
+        )
+    return annotated
+
+
+def _mark_nodes_as_annotated(nodes: List[Node]):
+    for node in nodes:
+        if node is not None:
+            if "quantization_annotation" not in node.meta:
+                node.meta["quantization_annotation"] = QuantizationAnnotation()
+            node.meta["quantization_annotation"]._annotated = True
+
+
+def get_input_act_qspec(quantization_config: Optional[QuantizationConfig]):
+    if quantization_config is None:
+        return None
+    if quantization_config.input_activation is None:
+        return None
+    quantization_spec: QuantizationSpec = quantization_config.input_activation
+    assert quantization_spec.qscheme in [
+        torch.per_tensor_affine,
+        torch.per_tensor_symmetric,
+    ]
+    return quantization_spec
+
+
+def get_output_act_qspec(quantization_config: Optional[QuantizationConfig]):
+    if quantization_config is None:
+        return None
+    if quantization_config.output_activation is None:
+        return None
+    quantization_spec: QuantizationSpec = quantization_config.output_activation
+    assert quantization_spec.qscheme in [
+        torch.per_tensor_affine,
+        torch.per_tensor_symmetric,
+    ]
+    return quantization_spec
+
+
+def get_weight_qspec(quantization_config: Optional[QuantizationConfig]):
+    if quantization_config is None:
+        return None
+    assert quantization_config is not None
+    if quantization_config.weight is None:
+        return None
+    quantization_spec: QuantizationSpec = quantization_config.weight
+    if quantization_spec.qscheme not in [
+        torch.per_tensor_symmetric,
+        torch.per_channel_symmetric,
+    ]:
+        raise ValueError(
+            f"Unsupported quantization_spec {quantization_spec} for weight"
+        )
+    return quantization_spec
+
+
+def get_bias_qspec(quantization_config: Optional[QuantizationConfig]):
+    if quantization_config is None:
+        return None
+    assert quantization_config is not None
+    if quantization_config.bias is None:
+        return None
+    quantization_spec: QuantizationSpec = quantization_config.bias
+    assert (
+        quantization_spec.dtype == torch.float
+    ), "Only float dtype for bias is supported for bias right now"
+    return quantization_spec
+
+
+@register_annotator("linear")
+def _annotate_linear(
+    gm: torch.fx.GraphModule,
+    quantization_config: Optional[QuantizationConfig],
+    filter_fn: Optional[Callable[[Node], bool]] = None,
+) -> Optional[List[List[Node]]]:
+    annotated_partitions = []
+    input_act_qspec = get_input_act_qspec(quantization_config)
+    output_act_qspec = get_output_act_qspec(quantization_config)
+    weight_qspec = get_weight_qspec(quantization_config)
+    bias_qspec = get_bias_qspec(quantization_config)
+    for node in gm.graph.nodes:
+        if node.op != "call_function" or node.target != torch.ops.aten.linear.default:
+            continue
+        if filter_fn and not filter_fn(node):
+            continue
+        act_node = node.args[0]
+        weight_node = node.args[1]
+        bias_node = None
+        if len(node.args) > 2:
+            bias_node = node.args[2]
+
+        if _is_annotated([node]) is False:  # type: ignore[list-item]
+            _annotate_input_qspec_map(
+                node,
+                act_node,
+                input_act_qspec,
+            )
+            _annotate_input_qspec_map(
+                node,
+                weight_node,
+                weight_qspec,
+            )
+            nodes_to_mark_annotated = [node, weight_node]
+            if bias_node:
+                _annotate_input_qspec_map(
+                    node,
+                    bias_node,
+                    bias_qspec,
+                )
+                nodes_to_mark_annotated.append(bias_node)
+            _annotate_output_qspec(node, output_act_qspec)
+            _mark_nodes_as_annotated(nodes_to_mark_annotated)
+            annotated_partitions.append(nodes_to_mark_annotated)
+
+    return annotated_partitions
+
+
+@register_annotator("linear_relu")
+def _annotate_linear_relu(
+    gm: torch.fx.GraphModule,
+    quantization_config: Optional[QuantizationConfig],
+    filter_fn: Optional[Callable[[Node], bool]] = None,
+) -> Optional[List[List[Node]]]:
+    annotated_partitions = []
+    input_act_qspec = get_input_act_qspec(quantization_config)
+    output_act_qspec = get_output_act_qspec(quantization_config)
+    weight_qspec = get_weight_qspec(quantization_config)
+    bias_qspec = get_bias_qspec(quantization_config)
+    for node in gm.graph.nodes:
+        if node.op != "call_function" or node.target not in [
+            torch.ops.aten.relu.default,
+            torch.ops.aten.relu_.default,
+        ]:
+            continue
+        relu_node = node
+        maybe_linear_node = node.args[0]
+        if (
+            not isinstance(maybe_linear_node, Node)
+            or maybe_linear_node.op != "call_function"
+            or maybe_linear_node.target != torch.ops.aten.linear.default
+        ):
+            continue
+
+        linear_node = maybe_linear_node
+        input_qspec_map = {}
+        input_act = linear_node.args[0]
+        assert isinstance(input_act, Node)
+        input_qspec_map[input_act] = input_act_qspec
+
+        weight = linear_node.args[1]
+        assert isinstance(weight, Node)
+        input_qspec_map[weight] = weight_qspec
+
+        # adding weight node to the partition as well
+        partition = [relu_node, linear_node, weight]
+        bias = linear_node.args[2] if len(linear_node.args) > 2 else None
+        if isinstance(bias, Node):
+            input_qspec_map[bias] = bias_qspec
+            partition.append(bias)
+
+        if _is_annotated(partition):
+            continue
+
+        if filter_fn and any(not filter_fn(n) for n in partition):
+            continue
+
+        linear_node.meta["quantization_annotation"] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            _annotated=True,
+        )
+        relu_node.meta["quantization_annotation"] = QuantizationAnnotation(
+            output_qspec=output_act_qspec,
+            _annotated=True,
+        )
+        _mark_nodes_as_annotated(partition)
+        annotated_partitions.append(partition)
+    return annotated_partitions
+
+
+@register_annotator("conv")
+def _annotate_conv(
+    gm: torch.fx.GraphModule,
+    quantization_config: Optional[QuantizationConfig],
+    filter_fn: Optional[Callable[[Node], bool]] = None,
+) -> Optional[List[List[Node]]]:
+    annotated_partitions = []
+    for n in gm.graph.nodes:
+        if n.op != "call_function" or n.target not in [
+            torch.ops.aten.conv1d.default,
+            torch.ops.aten.conv2d.default,
+        ]:
+            continue
+        conv_node = n
+
+        input_qspec_map = {}
+        input_act = conv_node.args[0]
+        assert isinstance(input_act, Node)
+        input_qspec_map[input_act] = get_input_act_qspec(quantization_config)
+
+        weight = conv_node.args[1]
+        assert isinstance(weight, Node)
+        input_qspec_map[weight] = get_weight_qspec(quantization_config)
+
+        # adding weight node to the partition as well
+        partition = [conv_node, conv_node.args[1]]
+
+        bias = conv_node.args[2] if len(conv_node.args) > 2 else None
+        if isinstance(bias, Node):
+            input_qspec_map[bias] = get_bias_qspec(quantization_config)
+            partition.append(bias)
+
+        if _is_annotated(partition):
+            continue
+
+        if filter_fn and any(not filter_fn(n) for n in partition):
+            continue
+
+        conv_node.meta["quantization_annotation"] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            output_qspec=get_output_act_qspec(quantization_config),
+            _annotated=True,
+        )
+        _mark_nodes_as_annotated(partition)
+        annotated_partitions.append(partition)
+    return annotated_partitions
+
+
+@register_annotator("conv_relu")
+def _annotate_conv_relu(
+    gm: torch.fx.GraphModule,
+    quantization_config: Optional[QuantizationConfig],
+    filter_fn: Optional[Callable[[Node], bool]] = None,
+) -> Optional[List[List[Node]]]:
+    annotated_partitions = []
+    for n in gm.graph.nodes:
+        if n.op != "call_function" or n.target not in [
+            torch.ops.aten.relu.default,
+            torch.ops.aten.relu_.default,
+        ]:
+            continue
+        relu_node = n
+        maybe_conv_node = n.args[0]
+        if (
+            not isinstance(maybe_conv_node, Node)
+            or maybe_conv_node.op != "call_function"
+            or maybe_conv_node.target
+            not in [
+                torch.ops.aten.conv1d.default,
+                torch.ops.aten.conv2d.default,
+            ]
+        ):
+            continue
+        conv_node = maybe_conv_node
+
+        input_qspec_map = {}
+        input_act = conv_node.args[0]
+        assert isinstance(input_act, Node)
+        input_qspec_map[input_act] = get_input_act_qspec(quantization_config)
+
+        weight = conv_node.args[1]
+        assert isinstance(weight, Node)
+        input_qspec_map[weight] = get_weight_qspec(quantization_config)
+
+        # adding weight node to the partition as well
+        partition = [relu_node, conv_node, conv_node.args[1]]
+        bias = conv_node.args[2] if len(conv_node.args) > 2 else None
+        if isinstance(bias, Node):
+            input_qspec_map[bias] = get_bias_qspec(quantization_config)
+            partition.append(bias)
+
+        if _is_annotated(partition):
+            continue
+
+        if filter_fn and any(not filter_fn(n) for n in partition):
+            continue
+
+        conv_node.meta["quantization_annotation"] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map, _annotated=True
+        )
+        relu_node.meta["quantization_annotation"] = QuantizationAnnotation(
+            output_qspec=get_output_act_qspec(quantization_config),  # type: ignore[arg-type]
+            _annotated=True,
+        )
+        _mark_nodes_as_annotated(partition)
+        annotated_partitions.append(partition)
+    return annotated_partitions
+
+
+@register_annotator("conv_bn")
+def _annotate_conv_bn(
+    gm: torch.fx.GraphModule,
+    quantization_config: Optional[QuantizationConfig],
+    filter_fn: Optional[Callable[[Node], bool]] = None,
+) -> Optional[List[List[Node]]]:
+    """
+    Find conv + batchnorm partitions.
+    Note: This is only used for QAT. In PTQ, batchnorm should already be fused into the conv.
+    """
+    return _do_annotate_conv_bn(gm, quantization_config, filter_fn, has_relu=False)
+
+
+@register_annotator("conv_bn_relu")
+def _annotate_conv_bn_relu(
+    gm: torch.fx.GraphModule,
+    quantization_config: Optional[QuantizationConfig],
+    filter_fn: Optional[Callable[[Node], bool]] = None,
+) -> Optional[List[List[Node]]]:
+    """
+    Find conv + batchnorm + relu partitions.
+    Note: This is only used for QAT. In PTQ, batchnorm should already be fused into the conv.
+    """
+    return _do_annotate_conv_bn(gm, quantization_config, filter_fn, has_relu=True)
+
+
+def _do_annotate_conv_bn(
+    gm: torch.fx.GraphModule,
+    quantization_config: Optional[QuantizationConfig],
+    filter_fn: Optional[Callable[[Node], bool]],
+    has_relu: bool,
+) -> List[List[Node]]:
+    """
+    Given a function that takes in a `conv_fn` and returns a conv-bn[-relu] pattern,
+    return a list of annotated partitions.
+
+    The output of the pattern must include a dictionary from string name to node
+    for the following names: "input", "conv", "weight", "bias", and "output".
+    """
+
+    def get_pattern(conv_fn: Callable, relu_is_inplace: bool):
+        def _conv_bn(x, conv_weight, conv_bias, bn_weight, bn_bias, bn_rm, bn_rv):
+            conv = conv_fn(x, conv_weight, conv_bias)
+            bn = F.batch_norm(conv, bn_rm, bn_rv, bn_weight, bn_bias, training=True)
+            if has_relu:
+                output = F.relu_(bn) if relu_is_inplace else F.relu(bn)
+            else:
+                output = bn
+            return output, {
+                "input": x,
+                "conv": conv,
+                "weight": conv_weight,
+                "bias": conv_bias,
+                "output": output,
+            }
+
+        return _WrapperModule(_conv_bn)
+
+    # Needed for matching, otherwise the matches get filtered out due to unused
+    # nodes returned by batch norm
+    gm.graph.eliminate_dead_code()
+    gm.recompile()
+
+    matches = []
+    combinations = [
+        (F.conv1d, _conv1d_bn_example_inputs),
+        (F.conv2d, _conv2d_bn_example_inputs),
+    ]
+
+    # Add `is_cuda` and `relu_is_inplace` dimensions
+    combinations = itertools.product(
+        combinations,
+        [True, False] if torch.cuda.is_available() else [False],  # is_cuda
+        [True, False] if has_relu else [False],  # relu_is_inplace
+    )
+
+    # Match against all conv dimensions and cuda variants
+    for (conv_fn, example_inputs), is_cuda, relu_is_inplace in combinations:
+        pattern = get_pattern(conv_fn, relu_is_inplace)
+        pattern = get_aten_graph_module(pattern, example_inputs, is_cuda)
+        pattern.graph.eliminate_dead_code()
+        pattern.recompile()
+        matcher = SubgraphMatcherWithNameNodeMap(pattern, ignore_literals=True)
+        matches.extend(matcher.match(gm.graph))
+
+    # Annotate nodes returned in the matches
+    annotated_partitions = []
+    for match in matches:
+        name_node_map = match.name_node_map
+        input_node = name_node_map["input"]
+        conv_node = name_node_map["conv"]
+        weight_node = name_node_map["weight"]
+        bias_node = name_node_map["bias"]
+        output_node = name_node_map["output"]
+
+        # TODO: annotate the uses of input, weight, and bias separately instead
+        # of assuming they come from a single conv node. This is not possible today
+        # because input may have multiple users, and we can't rely on the conv node
+        # always being the first user. This was the case in models with skip
+        # connections like resnet18
+
+        # Validate conv args
+        if conv_node.args[0] is not input_node:
+            raise ValueError(f"Conv arg did not contain input node {input_node}")
+        if conv_node.args[1] is not weight_node:
+            raise ValueError(f"Conv arg did not contain weight node {weight_node}")
+        if len(conv_node.args) > 2 and conv_node.args[2] is not bias_node:
+            raise ValueError(f"Conv arg did not contain bias node {bias_node}")
+
+        # Skip if the partition is already annotated or is filtered out by the user
+        partition = [conv_node, weight_node]
+        if bias_node is not None:
+            partition.append(bias_node)
+        if _is_annotated(partition):
+            continue
+        if filter_fn and any(not filter_fn(n) for n in partition):
+            continue
+
+        # Annotate conv inputs and pattern output
+        input_qspec_map = {}
+        input_qspec_map[input_node] = get_input_act_qspec(quantization_config)
+        input_qspec_map[weight_node] = get_weight_qspec(quantization_config)
+        if bias_node is not None:
+            input_qspec_map[bias_node] = get_bias_qspec(quantization_config)
+        conv_node.meta["quantization_annotation"] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            _annotated=True,
+        )
+        output_node.meta["quantization_annotation"] = QuantizationAnnotation(
+            output_qspec=get_output_act_qspec(quantization_config),  # type: ignore[arg-type]
+            _annotated=True,
+        )
+        _mark_nodes_as_annotated(partition)
+        annotated_partitions.append(partition)
+    return annotated_partitions
+
+
+@register_annotator("gru_io_only")
+def _annotate_gru_io_only(
+    gm: torch.fx.GraphModule,
+    quantization_config: Optional[QuantizationConfig],
+    filter_fn: Optional[Callable[[Node], bool]] = None,
+) -> Optional[List[List[Node]]]:
+    gru_partitions = get_source_partitions(gm.graph, [torch.nn.GRU], filter_fn)
+    gru_partitions = list(itertools.chain.from_iterable(gru_partitions.values()))
+    annotated_partitions = []
+    for gru_partition in gru_partitions:
+        annotated_partitions.append(gru_partition.nodes)
+        output_nodes = gru_partition.output_nodes
+        input_nodes = gru_partition.input_nodes
+        # skip annotation if it is already annotated
+        if _is_annotated(input_nodes + output_nodes):
+            continue
+        # inside each GRU partition, we should be able to annotate each linear
+        # subgraph
+        input_qspec_map: Dict[Node, QuantizationSpecBase] = {}
+        input_act = input_nodes[0]
+        input_act_user = next(iter(input_act.users.keys()))
+        assert isinstance(input_act, Node)
+        assert isinstance(input_act_user, Node)
+        input_act_user.meta["quantization_annotation"] = QuantizationAnnotation(
+            input_qspec_map={
+                input_act: get_input_act_qspec(quantization_config),
+            },
+            _annotated=True,
+        )
+
+        hidden_state = input_nodes[1]
+        hidden_state_user = next(iter(hidden_state.users.keys()))
+        assert isinstance(hidden_state, Node)
+        assert isinstance(hidden_state_user, Node)
+        hidden_state_user.meta["quantization_annotation"] = QuantizationAnnotation(
+            input_qspec_map={
+                hidden_state: get_input_act_qspec(quantization_config),
+            },
+            _annotated=True,
+        )
+
+        assert len(output_nodes) == 2, "expecting GRU to have two outputs"
+        for output in output_nodes:
+            output.meta["quantization_annotation"] = QuantizationAnnotation(
+                output_qspec=get_output_act_qspec(quantization_config),
+                _annotated=True,
+            )
+        nodes_to_mark_annotated = list(gru_partition.nodes)
+        _mark_nodes_as_annotated(nodes_to_mark_annotated)
+    return annotated_partitions
+
+
+@register_annotator("max_pool2d")
+def _annotate_max_pool2d(
+    gm: torch.fx.GraphModule,
+    quantization_config: Optional[QuantizationConfig],
+    filter_fn: Optional[Callable[[Node], bool]] = None,
+) -> Optional[List[List[Node]]]:
+    module_partitions = get_source_partitions(
+        gm.graph, [torch.nn.MaxPool2d, torch.nn.functional.max_pool2d], filter_fn
+    )
+    maxpool_partitions = list(itertools.chain.from_iterable(module_partitions.values()))
+    annotated_partitions = []
+    for maxpool_partition in maxpool_partitions:
+        annotated_partitions.append(maxpool_partition.nodes)
+        output_node = maxpool_partition.output_nodes[0]
+        maxpool_node = None
+        for n in maxpool_partition.nodes:
+            if n.target == torch.ops.aten.max_pool2d.default:
+                maxpool_node = n
+        assert maxpool_node is not None, (
+            "XNNPACKQuantizer only works with torch.ops.aten.max_pool2d.default, "
+            "please make sure you are exporting the model correctly"
+        )
+        if _is_annotated([output_node, maxpool_node]):  # type: ignore[list-item]
+            continue
+
+        input_act = maxpool_node.args[0]  # type: ignore[union-attr]
+        assert isinstance(input_act, Node)
+
+        # only annotate maxpool when the output of the input node is annotated
+        if (
+            "quantization_annotation" not in input_act.meta
+            or not input_act.meta["quantization_annotation"]._annotated
+            or input_act.meta["quantization_annotation"].output_qspec is None
+        ):
+            continue
+        # input and output of maxpool will share quantization parameter with input of maxpool
+        act_qspec = SharedQuantizationSpec(input_act)
+        # act_qspec = get_act_qspec(quantization_config)
+        maxpool_node.meta["quantization_annotation"] = QuantizationAnnotation(  # type: ignore[union-attr]
+            input_qspec_map={
+                input_act: act_qspec,
+            },
+            _annotated=True,
+        )
+        output_node.meta["quantization_annotation"] = QuantizationAnnotation(
+            output_qspec=act_qspec,
+            _annotated=True,
+        )
+    return annotated_partitions
+
+
+@register_annotator("adaptive_avg_pool2d")
+def _annotate_adaptive_avg_pool2d(
+    gm: torch.fx.GraphModule,
+    quantization_config: Optional[QuantizationConfig],
+    filter_fn: Optional[Callable[[Node], bool]] = None,
+) -> Optional[List[List[Node]]]:
+    """Always annotate adaptive_avg_pool2d op"""
+    module_partitions = get_source_partitions(
+        gm.graph, [torch.nn.AdaptiveAvgPool2d, F.adaptive_avg_pool2d], filter_fn
+    )
+    partitions = list(itertools.chain.from_iterable(module_partitions.values()))
+    annotated_partitions = []
+    for partition in partitions:
+        pool_node = partition.output_nodes[0]
+        if (
+            pool_node.op != "call_function"
+            or pool_node.target != torch.ops.aten.adaptive_avg_pool2d.default
+        ):
+            raise ValueError(f"{pool_node} is not an aten adaptive_avg_pool2d operator")
+
+        if _is_annotated([pool_node]):
+            continue
+
+        annotated_partitions.append(partition.nodes)
+        input_act = pool_node.args[0]
+        assert isinstance(input_act, Node)
+
+        # only annotate input output sharing operator
+        # when the output of the input node is annotated
+        if (
+            "quantization_annotation" not in input_act.meta
+            or not input_act.meta["quantization_annotation"]._annotated
+            or input_act.meta["quantization_annotation"].output_qspec is None
+        ):
+            input_act_qspec = get_input_act_qspec(quantization_config)
+        else:
+            input_act_qspec = SharedQuantizationSpec(input_act)
+
+        # output sharing with input
+        output_act_qspec = SharedQuantizationSpec((input_act, pool_node))
+        pool_node.meta["quantization_annotation"] = QuantizationAnnotation(
+            input_qspec_map={
+                input_act: input_act_qspec,
+            },
+            output_qspec=output_act_qspec,
+            _annotated=True,
+        )
+    return annotated_partitions
+
+
+def _is_input_large_scalar(node: Node, gm: torch.fx.GraphModule):
+    """Check if input is a large scalar value. So that we can skip quantization for the node
+    since histc op (in HistogramObserver) only works for values up to certain upper bound
+    """
+    if node.op == "get_attr":
+        tensor = getattr(gm, node.target)  # type: ignore[arg-type]
+        # torch.histc works until this upper bound
+        HISTC_UPPER_BOUND = 3.4028235e15
+        return tensor.numel() == 1 and abs(tensor.item()) > HISTC_UPPER_BOUND
+    return False
+
+
+def _is_input_non_float_tensor(node: Node):
+    """Check if the input is not a float tensor, so that we can skip quantization for the node
+    since observers only work with float Tensors
+    """
+    if "val" not in node.meta or not isinstance(node.meta["val"], FakeTensor):
+        return True
+    return node.meta["val"].dtype != torch.float32
+
+
+@register_annotator("add_relu")
+def _annotate_add_relu(
+    gm: torch.fx.GraphModule,
+    quantization_config: Optional[QuantizationConfig],
+    filter_fn: Optional[Callable[[Node], bool]] = None,
+) -> Optional[List[List[Node]]]:
+    fused_partitions = find_sequential_partitions(
+        gm, [torch.add, torch.nn.ReLU], filter_fn=filter_fn
+    )
+    annotated_partitions = []
+    for fused_partition in fused_partitions:
+        add_partition, relu_partition = fused_partition
+        annotated_partitions.append(add_partition.nodes + relu_partition.nodes)
+        if len(relu_partition.output_nodes) > 1:
+            raise ValueError("Relu partition has more than one output node")
+        relu_node = relu_partition.output_nodes[0]
+        if len(add_partition.output_nodes) > 1:
+            raise ValueError("add partition has more than one output node")
+        add_node = add_partition.output_nodes[0]
+
+        if _is_annotated([relu_node, add_node]):
+            continue
+
+        input_act_qspec = get_input_act_qspec(quantization_config)
+        output_act_qspec = get_output_act_qspec(quantization_config)
+
+        input_qspec_map = {}
+        input_act0 = add_node.args[0]
+        if isinstance(input_act0, Node):
+            if _is_input_large_scalar(input_act0, gm):
+                continue
+            if _is_input_non_float_tensor(input_act0):
+                continue
+            input_qspec_map[input_act0] = input_act_qspec
+
+        input_act1 = add_node.args[1]
+        if isinstance(input_act1, Node):
+            if _is_input_large_scalar(input_act1, gm):
+                continue
+            if _is_input_non_float_tensor(input_act1):
+                continue
+            input_qspec_map[input_act1] = input_act_qspec
+
+        add_node.meta["quantization_annotation"] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            _annotated=True,
+        )
+        relu_node.meta["quantization_annotation"] = QuantizationAnnotation(
+            output_qspec=output_act_qspec,
+            _annotated=True,
+        )
+    return annotated_partitions
+
+
+@register_annotator("add")
+def _annotate_add(
+    gm: torch.fx.GraphModule,
+    quantization_config: Optional[QuantizationConfig],
+    filter_fn: Optional[Callable[[Node], bool]] = None,
+) -> Optional[List[List[Node]]]:
+    add_partitions = get_source_partitions(
+        gm.graph, [operator.add, torch.add, operator.iadd], filter_fn
+    )
+    add_partitions = list(itertools.chain.from_iterable(add_partitions.values()))
+    annotated_partitions = []
+    for add_partition in add_partitions:
+        annotated_partitions.append(add_partition.nodes)
+        add_node = add_partition.output_nodes[0]
+        if _is_annotated([add_node]):
+            continue
+
+        input_act_qspec = get_input_act_qspec(quantization_config)
+        output_act_qspec = get_output_act_qspec(quantization_config)
+
+        input_qspec_map = {}
+        input_act0 = add_node.args[0]
+        if isinstance(input_act0, Node):
+            if _is_input_large_scalar(input_act0, gm):
+                continue
+            if _is_input_non_float_tensor(input_act0):
+                continue
+            input_qspec_map[input_act0] = input_act_qspec
+
+        input_act1 = add_node.args[1]
+        if isinstance(input_act1, Node):
+            if _is_input_large_scalar(input_act1, gm):
+                continue
+            if _is_input_non_float_tensor(input_act1):
+                continue
+            input_qspec_map[input_act1] = input_act_qspec
+
+        add_node.meta["quantization_annotation"] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            output_qspec=output_act_qspec,
+            _annotated=True,
+        )
+    return annotated_partitions
+
+
+@register_annotator("mul_relu")
+def _annotate_mul_relu(
+    gm: torch.fx.GraphModule,
+    quantization_config: Optional[QuantizationConfig],
+    filter_fn: Optional[Callable[[Node], bool]] = None,
+) -> Optional[List[List[Node]]]:
+    fused_partitions = find_sequential_partitions(
+        gm, [torch.mul, torch.nn.ReLU], filter_fn=filter_fn
+    )
+    annotated_partitions = []
+    for fused_partition in fused_partitions:
+        mul_partition, relu_partition = fused_partition
+        annotated_partitions.append(mul_partition.nodes + relu_partition.nodes)
+        if len(relu_partition.output_nodes) > 1:
+            raise ValueError("Relu partition has more than one output node")
+        relu_node = relu_partition.output_nodes[0]
+        if len(mul_partition.output_nodes) > 1:
+            raise ValueError("mul partition has more than one output node")
+        mul_node = mul_partition.output_nodes[0]
+
+        if _is_annotated([relu_node, mul_node]):
+            continue
+
+        input_act_qspec = get_input_act_qspec(quantization_config)
+        output_act_qspec = get_output_act_qspec(quantization_config)
+
+        input_qspec_map = {}
+        input_act0 = mul_node.args[0]
+        if isinstance(input_act0, Node):
+            if _is_input_large_scalar(input_act0, gm):
+                continue
+            if _is_input_non_float_tensor(input_act0):
+                continue
+            input_qspec_map[input_act0] = input_act_qspec
+
+        input_act1 = mul_node.args[1]
+        if isinstance(input_act1, Node):
+            if _is_input_large_scalar(input_act1, gm):
+                continue
+            if _is_input_non_float_tensor(input_act1):
+                continue
+            input_qspec_map[input_act1] = input_act_qspec
+
+        mul_node.meta["quantization_annotation"] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            _annotated=True,
+        )
+        relu_node.meta["quantization_annotation"] = QuantizationAnnotation(
+            output_qspec=output_act_qspec,
+            _annotated=True,
+        )
+    return annotated_partitions
+
+
+@register_annotator("mul")
+def _annotate_mul(
+    gm: torch.fx.GraphModule,
+    quantization_config: Optional[QuantizationConfig],
+    filter_fn: Optional[Callable[[Node], bool]] = None,
+) -> Optional[List[List[Node]]]:
+    mul_partitions = get_source_partitions(
+        gm.graph, ["mul", "mul_", operator.mul, torch.mul, operator.imul], filter_fn
+    )
+    mul_partitions = list(itertools.chain.from_iterable(mul_partitions.values()))
+    annotated_partitions = []
+    for mul_partition in mul_partitions:
+        annotated_partitions.append(mul_partition.nodes)
+        mul_node = mul_partition.output_nodes[0]
+        if _is_annotated([mul_node]):
+            continue
+
+        input_act_qspec = get_input_act_qspec(quantization_config)
+        output_act_qspec = get_output_act_qspec(quantization_config)
+
+        input_qspec_map = {}
+        input_act0 = mul_node.args[0]
+        if isinstance(input_act0, Node):
+            if _is_input_large_scalar(input_act0, gm):
+                continue
+            if _is_input_non_float_tensor(input_act0):
+                continue
+            input_qspec_map[input_act0] = input_act_qspec
+
+        input_act1 = mul_node.args[1]
+        if isinstance(input_act1, Node):
+            if _is_input_large_scalar(input_act1, gm):
+                continue
+            if _is_input_non_float_tensor(input_act1):
+                continue
+            input_qspec_map[input_act1] = input_act_qspec
+
+        mul_node.meta["quantization_annotation"] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            output_qspec=output_act_qspec,
+            _annotated=True,
+        )
+    return annotated_partitions
+
+
+# TODO: remove Optional in return type, fix annotated_partitions logic
+@register_annotator("cat")
+def _annotate_cat(
+    gm: torch.fx.GraphModule,
+    quantization_config: Optional[QuantizationConfig],
+    filter_fn: Optional[Callable[[Node], bool]] = None,
+) -> Optional[List[List[Node]]]:
+    cat_partitions = get_source_partitions(gm.graph, [torch.cat], filter_fn)
+    cat_partitions = list(itertools.chain.from_iterable(cat_partitions.values()))
+    annotated_partitions = []
+    for cat_partition in cat_partitions:
+        cat_node = cat_partition.output_nodes[0]
+        if _is_annotated([cat_node]):
+            continue
+
+        if cat_node.target != torch.ops.aten.cat.default:
+            # TODO: change this to AnnotationException
+            raise Exception(
+                f"Expected cat node: torch.ops.aten.cat.default, but found {cat_node.target}"
+                " please check if you are calling the correct capture API"
+            )
+
+        annotated_partitions.append(cat_partition.nodes)
+
+        input_act_qspec = get_input_act_qspec(quantization_config)
+        inputs = cat_node.args[0]
+
+        input_qspec_map = {}
+        input_act0 = inputs[0]
+        if isinstance(input_act0, Node):
+            input_qspec_map[input_act0] = input_act_qspec
+
+        shared_with_input0_qspec = SharedQuantizationSpec((input_act0, cat_node))
+        for input_act in inputs[1:]:
+            input_qspec_map[input_act] = shared_with_input0_qspec
+
+        output_act_qspec = shared_with_input0_qspec
+
+        cat_node.meta["quantization_annotation"] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            output_qspec=output_act_qspec,
+            _annotated=True,
+        )
+    return annotated_partitions
+
+
+def _is_share_obs_or_fq_op(op: Callable) -> bool:
+    return op in [
+        torch.ops.aten.hardtanh.default,
+        torch.ops.aten.hardtanh_.default,
+        torch.ops.aten.mean.default,
+        torch.ops.aten.mean.dim,
+        torch.ops.aten.permute.default,
+        torch.ops.aten.permute_copy.default,
+        torch.ops.aten.squeeze.dim,
+        torch.ops.aten.squeeze_copy.dim,
+        # TODO: remove?
+        torch.ops.aten.adaptive_avg_pool2d.default,
+        torch.ops.aten.view_copy.default,
+        torch.ops.aten.view.default,
+        torch.ops.aten.slice_copy.Tensor,
+        torch.ops.aten.flatten.using_ints,
+    ]
+
+
+def propagate_annotation(model: torch.fx.GraphModule) -> None:
+    for n in model.graph.nodes:
+        if n.op != "call_function" or not _is_share_obs_or_fq_op(n.target):
+            continue
+
+        prev_node = n.args[0]
+        if not isinstance(prev_node, Node):
+            continue
+
+        quantization_annotation = prev_node.meta.get("quantization_annotation", None)
+        if not quantization_annotation:
+            continue
+
+        output_qspec = quantization_annotation.output_qspec
+        if not output_qspec:
+            continue
+
+        # make sure current node is not annotated
+        if (
+            "quantization_annotation" in n.meta
+            and n.meta["quantization_annotation"]._annotated
+        ):
+            continue
+
+        shared_qspec = SharedQuantizationSpec(prev_node)
+        # propagate the previous output_qspec to the current node
+        n.meta["quantization_annotation"] = QuantizationAnnotation(
+            input_qspec_map={
+                prev_node: shared_qspec,
+            },
+            output_qspec=shared_qspec,
+            _annotated=True,
+        )
+
+
+# TODO: make the list of ops customizable
+def _convert_scalars_to_attrs(model: torch.fx.GraphModule) -> torch.fx.GraphModule:
+    for n in model.graph.nodes:
+        if n.op != "call_function" or n.target not in [
+            torch.ops.aten.add.Tensor,
+            torch.ops.aten.mul.Tensor,
+        ]:
+            continue
+        args = list(n.args)
+        new_args = []
+        for i in range(len(args)):
+            if isinstance(args[i], torch.fx.Node):
+                new_args.append(args[i])
+                continue
+            prefix = "_tensor_constant_"
+            get_new_attr_name = get_new_attr_name_with_prefix(prefix)
+            tensor_constant_name = get_new_attr_name(model)
+            float_tensor = torch.tensor(float(args[i]))
+            model.register_buffer(tensor_constant_name, float_tensor)
+            fake_mode = n.meta["val"].fake_mode
+            with model.graph.inserting_before(n):
+                get_attr_node = model.graph.create_node(
+                    "get_attr", tensor_constant_name, (), {}
+                )
+                get_attr_node.meta["val"] = fake_mode.from_tensor(
+                    float_tensor, static_shapes=True
+                )
+                new_args.append(get_attr_node)
+        n.args = tuple(new_args)
+    model.recompile()
+    return model
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/stubs.py b/MLPY/Lib/site-packages/torch/ao/quantization/stubs.py
new file mode 100644
index 0000000000000000000000000000000000000000..42a90c8e193e5f2b22874c721b7057a4ef022cf6
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/stubs.py
@@ -0,0 +1,64 @@
+
+from torch import nn
+
+class QuantStub(nn.Module):
+    r"""Quantize stub module, before calibration, this is same as an observer,
+    it will be swapped as `nnq.Quantize` in `convert`.
+
+    Args:
+        qconfig: quantization configuration for the tensor,
+            if qconfig is not provided, we will get qconfig from parent modules
+    """
+    def __init__(self, qconfig=None):
+        super().__init__()
+        if qconfig:
+            self.qconfig = qconfig
+
+    def forward(self, x):
+        return x
+
+
+class DeQuantStub(nn.Module):
+    r"""Dequantize stub module, before calibration, this is same as identity,
+    this will be swapped as `nnq.DeQuantize` in `convert`.
+
+    Args:
+        qconfig: quantization configuration for the tensor,
+            if qconfig is not provided, we will get qconfig from parent modules
+    """
+    def __init__(self, qconfig=None):
+        super().__init__()
+        if qconfig:
+            self.qconfig = qconfig
+
+    def forward(self, x):
+        return x
+
+
+class QuantWrapper(nn.Module):
+    r"""A wrapper class that wraps the input module, adds QuantStub and
+    DeQuantStub and surround the call to module with call to quant and dequant
+    modules.
+
+    This is used by the `quantization` utility functions to add the quant and
+    dequant modules, before `convert` function `QuantStub` will just be observer,
+    it observes the input tensor, after `convert`, `QuantStub`
+    will be swapped to `nnq.Quantize` which does actual quantization. Similarly
+    for `DeQuantStub`.
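+
+    Example (a minimal illustrative sketch; the wrapped module below is arbitrary)::
+
+    >> import torch
+    >> from torch.ao.quantization.stubs import QuantWrapper
+    >> wrapped = QuantWrapper(torch.nn.Linear(4, 4))
+    >> out = wrapped(torch.randn(2, 4))  # forward runs quant -> module -> dequant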
+    """
+    quant: QuantStub
+    dequant: DeQuantStub
+    module: nn.Module
+
+    def __init__(self, module):
+        super().__init__()
+        qconfig = getattr(module, "qconfig", None)
+        self.add_module('quant', QuantStub(qconfig))
+        self.add_module('dequant', DeQuantStub(qconfig))
+        self.add_module('module', module)
+        self.train(module.training)
+
+    def forward(self, X):
+        X = self.quant(X)
+        X = self.module(X)
+        return self.dequant(X)
diff --git a/MLPY/Lib/site-packages/torch/ao/quantization/utils.py b/MLPY/Lib/site-packages/torch/ao/quantization/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..8835b2ce281c5bd4f6b6be9087a2e83731051d55
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/ao/quantization/utils.py
@@ -0,0 +1,703 @@
+"""
+Utils shared by different modes of quantization (eager/graph)
+"""
+import functools
+import warnings
+from collections import OrderedDict
+from inspect import getfullargspec, signature
+from typing import Any, Callable, Dict, Optional, Tuple, Union
+
+import torch
+from torch.ao.quantization.quant_type import QuantType
+from torch.fx import Node
+from torch.nn.utils.parametrize import is_parametrized
+
+NodePattern = Union[Tuple[Node, Node], Tuple[Node, Tuple[Node, Node]], Any]
+NodePattern.__module__ = "torch.ao.quantization.utils"
+
+# This is the Quantizer class instance from torch/quantization/fx/quantize.py.
+# Define separately to prevent circular imports.
+# TODO(future PR): improve this.
+# make this public once fixed (can't be public as is because setting the module directly
+# doesn't work)
+QuantizerCls = Any
+
+# Type for fusion patterns; it can actually be more complicated than the following,
+# see pattern.md for docs
+# TODO: not sure if typing supports recursive data types
+Pattern = Union[
+    Callable, Tuple[Callable, Callable], Tuple[Callable, Tuple[Callable, Callable]], Any
+]
+Pattern.__module__ = "torch.ao.quantization.utils"
+
+# TODO: maybe rename this to MatchInputNode
+class MatchAllNode:
+    """ A node pattern that matches all nodes, used in defining
+    fusion patterns in FX Graph Mode Quantization
+    """
+    pass
+
+module_type_list = {
+    torch.nn.ReLU,
+    torch.nn.ReLU6,
+    torch.nn.AdaptiveAvgPool1d,
+    torch.nn.AdaptiveAvgPool2d,
+    torch.nn.AdaptiveAvgPool3d,
+    torch.nn.AvgPool1d,
+    torch.nn.AvgPool2d,
+    torch.nn.AvgPool3d,
+    torch.nn.MaxPool1d,
+    torch.nn.MaxPool2d,
+    torch.nn.MaxPool3d,
+    torch.nn.Identity,
+    torch.nn.Hardsigmoid,
+    torch.nn.Sigmoid,
+    torch.nn.Tanh,
+}
+func_list = {
+    torch.nn.functional.adaptive_avg_pool1d,
+    torch.nn.functional.adaptive_avg_pool2d,
+    torch.nn.functional.adaptive_avg_pool3d,
+    torch.nn.functional.elu,
+    torch.nn.functional.hardswish,
+    torch.nn.functional.instance_norm,
+    torch.nn.functional.layer_norm,
+    torch.nn.functional.leaky_relu,
+    torch.nn.functional.silu,
+    torch.nn.functional.mish,
+    torch.nn.functional.dropout,
+    torch.nn.functional.max_pool1d,
+    torch.nn.functional.max_pool2d,
+    torch.nn.functional.max_pool3d,
+    torch.nn.functional.relu,
+    torch.nn.functional.hardtanh,
+    torch.nn.functional.hardtanh_,
+    torch.nn.functional.hardsigmoid,
+    torch.nn.functional.sigmoid,
+    torch.transpose,
+    torch.repeat_interleave,
+    torch.sigmoid,
+    torch.squeeze,
+    torch.stack,
+    torch.sum,
+    torch.tanh,
+    torch.unsqueeze,
+    torch.cat,
+}
+method_list = {
+    torch.mean,
+    'relu',
+    'relu_',
+    'contiguous',
+    'detach',
+    'detach_',
+    'hardsigmoid',
+    'hardsigmoid_',
+    'permute',
+    'repeat',
+    'repeat_interleave',
+    'reshape',
+    'resize_',
+    'shape',
+    'sigmoid',
+    'sigmoid_',
+    'size',
+    'squeeze',
+    'squeeze_',
+    'tanh',
+    'tanh_',
+    'transpose',
+    'unsqueeze',
+    'unsqueeze_',
+    'view',
+}
+
+# TODO: not used now, remove
+def check_node(node, modules):
+    # TODO: reuse is_fixed_qparam_node after we move this function to _lower_to_native_backend.py
+    is_call_function = node.op == "call_function" and node.target in func_list
+    is_call_method = node.op == "call_method" and node.target in method_list
+    is_call_module = node.op == "call_module" and type(modules[str(node.target)]) in module_type_list
+    return is_call_function, is_call_method, is_call_module
+
+def get_combined_dict(default_dict, additional_dict):
+    d = default_dict.copy()
+    d.update(additional_dict)
+    return d
+
+def is_per_tensor(qscheme):
+    return qscheme == torch.per_tensor_affine or \
+        qscheme == torch.per_tensor_symmetric
+
+def is_per_channel(qscheme):
+    return qscheme in [torch.per_channel_affine,
+                       torch.per_channel_affine_float_qparams,
+                       torch.per_channel_symmetric]
+
+def getattr_from_fqn(obj: Any, fqn: str) -> Any:
+    """
+    Given an obj and a fqn such as "foo.bar.baz", returns obj.foo.bar.baz.
+    """
+    return functools.reduce(getattr, fqn.split("."), obj)
+
+def to_underlying_dtype(qdtype):
+    DTYPE_MAPPING = {
+        torch.quint8: torch.uint8,
+        torch.qint8: torch.int8,
+        torch.qint32: torch.int32,
+        torch.quint4x2: torch.uint8,
+        torch.quint2x4: torch.uint8,
+        torch.uint8: torch.uint8,
+        torch.int8: torch.int8,
+        torch.int16: torch.int16,
+        torch.int32: torch.int32,
+    }
+    assert qdtype in DTYPE_MAPPING, "Unsupported dtype: " + str(qdtype)
+    return DTYPE_MAPPING[qdtype]
+
+def get_qparam_dict(observer_or_fake_quant):
+    from torch.ao.quantization.observer import PlaceholderObserver
+
+    qscheme = getattr(observer_or_fake_quant, "qscheme", None)
+    dtype = observer_or_fake_quant.dtype
+    qparams = {"qscheme": qscheme, "dtype": dtype}
+
+    if not qscheme or isinstance(observer_or_fake_quant, PlaceholderObserver):
+        return {"qscheme": None, "dtype": dtype}
+
+    if is_per_tensor(qscheme):
+        qscheme = torch.per_tensor_affine
+    elif is_per_channel(qscheme):
+        # change symmetric to affine since we do not have symmetric
+        # quantized Tensor
+        if qscheme == torch.per_channel_symmetric:
+            qscheme = torch.per_channel_affine
+        qparams["axis"] = observer_or_fake_quant.ch_axis
+    else:
+        raise RuntimeError(f"Unrecognized qscheme: {qscheme}")
+    # update qscheme, since we don't have symmetric quant qscheme
+    # in quantized Tensor
+    qparams["qscheme"] = qscheme
+
+    scale, zero_point = observer_or_fake_quant.calculate_qparams()
+    qparams["scale"] = scale
+    qparams["zero_point"] = zero_point
+
+    if hasattr(observer_or_fake_quant, "quant_min"):
+        qparams["quant_min"] = observer_or_fake_quant.quant_min
+    if hasattr(observer_or_fake_quant, "quant_max"):
+        qparams["quant_max"] = observer_or_fake_quant.quant_max
+
+    return qparams
+
+
+def get_swapped_custom_module_class(custom_module, custom_module_class_mapping, qconfig):
+    """ Get the observed/quantized custom module class that we need
+    to swap `custom_module` to
+    Input:
+        custom_module: input, can be an instance of either a float or observed custom module
+        custom_module_class_mapping: the float to observed or observed to quantized custom module class mapping
+        qconfig: qconfig configured for the custom module
+
+    Output:
+        corresponding observed/quantized custom module class for input custom module instance
+    """
+    quant_type = get_quant_type(qconfig)
+    class_mapping = custom_module_class_mapping.get(quant_type, {})
+    assert type(custom_module) in class_mapping, "did not find corresponding observed " \
+        f"module class for {type(custom_module)} in mapping: {class_mapping}"
+    return class_mapping[type(custom_module)]
+
+def activation_dtype(qconfig):
+    assert qconfig is not None
+    activation = qconfig.activation()
+    return activation.dtype
+
+def weight_dtype(qconfig):
+    assert qconfig is not None
+    weight = qconfig.weight()
+    return weight.dtype
+
+def activation_is_statically_quantized(qconfig):
+    """ Given a qconfig, decide if the activation needs to be
+    quantized or not; this includes quantizing to quint8, qint8, qint32 and float16
+    """
+    return (
+        activation_dtype(qconfig) in [
+            torch.quint8,
+            torch.qint8,
+            torch.qint32,
+            torch.float16,
+            torch.uint8,
+            torch.int8,
+            torch.int16,
+            torch.int32
+        ]
+        and (not activation_is_dynamically_quantized(qconfig))
+    )
+
+def activation_is_dynamically_quantized(qconfig):
+    """ Given a qconfig, decide if the activation needs to be
+    dynamically quantized or not, this includes dynamically quantizing to
+    quint8, qint8 and float16
+    """
+    activation_dtype, _, activation_is_dynamic = \
+        get_qconfig_dtypes(qconfig)
+    return activation_is_dynamic
+
+def activation_is_int8_quantized(qconfig):
+    """ Given a qconfig, decide if the activation needs to be
+    quantized to int8 or not, this includes quantizing to quint8, qint8
+    """
+    return activation_dtype(qconfig) in [torch.quint8, torch.qint8, torch.uint8, torch.int8]
+
+def activation_is_int32_quantized(qconfig):
+    """ Given a qconfig, decide if the activation needs to be
+    quantized to int32 or not
+    """
+    return activation_dtype(qconfig) in [torch.qint32, torch.int32]
+
+def weight_is_quantized(qconfig):
+    """ Given a qconfig, decide if the weight needs to be
+    quantized or not
+    """
+    return weight_dtype(qconfig) in [
+        torch.quint8,
+        torch.qint8,
+        torch.float16,
+        torch.quint4x2,
+        torch.uint8,
+        torch.int8,
+        torch.int16,
+        torch.int32
+    ]
+
+def weight_is_statically_quantized(qconfig):
+    """ Given a qconfig, decide if the weight needs to be statically
+    quantized or not
+    """
+    return weight_dtype(qconfig) in [torch.quint8, torch.qint8, torch.uint8, torch.int8]
+
+def op_is_int8_dynamically_quantized(qconfig) -> bool:
+    """ Given a qconfig, returns True if this op is using int8 dynamic
+    quantization
+    """
+    activation_dtype, weight_dtype, activation_is_dynamic = \
+        get_qconfig_dtypes(qconfig)
+    return (
+        activation_dtype in [torch.quint8, torch.uint8] and
+        # for now, the lines below assume fbgemm or qnnpack
+        weight_dtype in [torch.qint8, torch.int8] and
+        activation_is_dynamic
+    )
+
+def get_qconfig_dtypes(qconfig):
+    r""" returns the qconfig tuple for qconfig:
+    (activation_dtype, weight_dtype, activation_is_dynamic)
+    """
+    assert qconfig is not None
+    activation = qconfig.activation()
+    weight = qconfig.weight()
+    act_is_dynamic = getattr(activation, "is_dynamic", False)
+    return (activation.dtype, weight.dtype, act_is_dynamic)
+
+def get_quant_type(qconfig):
+    assert qconfig is not None
+    activation = qconfig.activation()
+    weight = qconfig.weight()
+    static_dtypes = [torch.quint8, torch.qint8, torch.quint4x2, torch.qint32, torch.uint8, torch.int8, torch.int16, torch.int32]
+    if weight.dtype in static_dtypes:
+        if hasattr(activation, 'is_dynamic') and activation.is_dynamic:
+            return QuantType.DYNAMIC
+        elif activation.dtype in static_dtypes:
+            return QuantType.STATIC
+        else:
+            return QuantType.WEIGHT_ONLY
+
+    if weight.dtype == torch.float16:
+        if hasattr(activation, 'is_dynamic') and activation.is_dynamic:
+            return QuantType.DYNAMIC
+        elif activation.dtype == torch.float16:
+            return QuantType.STATIC
+
+    raise Exception(f"Unrecognized dtype combination in get_quant_type: activation({activation.dtype}),"
+                    f"weight({weight.dtype})")
+
+def check_min_max_valid(min_val: torch.Tensor, max_val: torch.Tensor) -> bool:
+    """ Checks if the given minimum and maximum values are valid, meaning that
+    they exist and the min value is not greater than the max value.
+    """
+    if min_val.numel() == 0 or max_val.numel() == 0:
+        warnings.warn(
+            "must run observer before calling calculate_qparams. " +
+            "Returning default values."
+        )
+        return False
+
+    if min_val.dim() == 0 or max_val.dim() == 0:
+        if min_val == float("inf") and max_val == float("-inf"):
+            warnings.warn(
+                "must run observer before calling calculate_qparams. " +
+                "Returning default values."
+            )
+
+            return False
+
+        assert min_val <= max_val, f"min {min_val} should be less than max {max_val}"
+    else:
+        assert torch.all(
+            min_val <= max_val
+        ), f"min {min_val} should be less than max {max_val}"
+
+    return True
+
+
+def calculate_qmin_qmax(quant_min: int, quant_max: int, has_customized_qrange: bool, dtype: torch.dtype,
+                        reduce_range: bool) -> Tuple[int, int]:
+    r"""Calculates actual qmin and qmax based on the quantization range,
+    observer datatype and whether the range is reduced.
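+
+    Example (a minimal illustrative sketch; the argument values below are arbitrary)::
+
+    >> calculate_qmin_qmax(0, 15, True, torch.qint8, False)   # customized 4-bit range
+    (0, 15)
+    >> calculate_qmin_qmax(0, 255, False, torch.quint8, True)  # default range, reduced
+    (0, 127)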
+    """
+    # TODO(jerryzh): Figure out why custom quant_min/quant_max are still adjusted.
+    if has_customized_qrange:
+        # This initialization is here to resolve TorchScript compilation issues and to allow
+        # the use of refinement to decouple initial_qmin and initial_qmax from the quantization range.
+        # The actual values of initial_qmin and initial_qmax will be reset below.
+        if dtype in [torch.qint32, torch.int32]:
+            initial_quant_min, initial_quant_max = 0, 2**32 - 1
+        else:
+            initial_quant_min, initial_quant_max = 0, 255
+        # The following assignment of quant_min and quant_max to local variables and the if check refine
+        # the attributes from Optional to concrete integers for use, based on TorchScript's requirements.
+        custom_quant_min, custom_quant_max = quant_min, quant_max
+        if custom_quant_min is not None and custom_quant_max is not None:
+            initial_quant_min, initial_quant_max = (
+                custom_quant_min,
+                custom_quant_max,
+            )
+
+        qrange_len = initial_quant_max - initial_quant_min + 1
+        if dtype in [torch.qint8, torch.int8]:
+            assert (
+                0 < qrange_len <= 256
+            ), "quantization range should be positive and not exceed the maximum bit range (=256)."
+        elif dtype in [torch.qint32, torch.int32]:
+            assert (
+                0 < qrange_len <= 2**32
+            ), "quantization range should be positive and not exceed the maximum bit range (=4294967296)."
+        if reduce_range:
+            quant_min, quant_max = quant_min // 2, quant_max // 2
+    else:
+        # Fallback onto default 8-bit qmin and qmax calculation if dynamic range is not used.
+        if dtype in [torch.qint8, torch.int8]:
+            if reduce_range:
+                quant_min, quant_max = -64, 63
+            else:
+                quant_min, quant_max = -128, 127
+        elif dtype in [torch.quint8, torch.uint8]:
+            if reduce_range:
+                quant_min, quant_max = 0, 127
+            else:
+                quant_min, quant_max = 0, 255
+        elif dtype in [torch.qint32, torch.int32]:
+            quant_min, quant_max = -1 * (2 ** 31), (2 ** 31) - 1
+        else:
+            quant_min, quant_max = 0, 15
+    return quant_min, quant_max
+
+
+def _parent_name(target):
+    """
+    Turn 'foo.bar' into ('foo', 'bar')
+    """
+    r = target.rsplit('.', 1)
+    if len(r) == 1:
+        return '', r[0]
+    else:
+        return r[0], r[1]
+
+def has_no_children_ignoring_parametrizations(module):
+    """
+    Checks whether module._modules is empty or, if the module is parametrized,
+    whether module._modules only contains the 'parametrizations' module.
+    """
+    if len(module._modules) == 0:
+        return True
+    elif is_parametrized(module):
+        return len(module._modules) == 1 and 'parametrizations' in module._modules
+    else:
+        return False
+
+def _get_path_of_module(root: torch.nn.Module, submodule: torch.nn.Module) -> Optional[str]:
+    """ Get the path (fully qualified name) of a submodule
+
+    Example::
+
+    >> class M(torch.nn.Module):
+           def __init__(self):
+               self.linear = torch.nn.Linear(5, 5)
+           def forward(self, x):
+               return self.linear(x)
+
+    >> m = M()
+    >> l = m.linear
+    >> _get_path_of_module(m, l)
+    "linear"
+    """
+    for n, p in root.named_modules():
+        if submodule is p:
+            return n
+    return None
+
+def _get_signature_locals(f: Callable, loc: Dict[str, Any]) -> Dict[str, Any]:
+    """ Get local keyword arguments
+
+    Example::
+
+    >> def f(self, a, b=9):
+           pass
+    >> loc = {"a": 6, "c": 7}
+    >> _get_signature_locals(f, loc)
+    {"a": 6}
+    """
+    return {k: v for k, v in loc.items() if k in signature(f).parameters}
+
+def _get_default_kwargs(f: Callable) -> "OrderedDict[str, Any]":
+    """ Get all default keyword arguments from function signature
+
+    Example::
+
+    >> def f(self, a, b=9):
+           pass
+    >> _get_default_kwargs(f)
+    {"b": 9}
+    """
+    kwargs = {}
+    for name, param in signature(f).parameters.items():
+        if param.default is not param.empty:
+            kwargs[name] = param.default
+        elif param.kind is param.VAR_POSITIONAL:
+            kwargs[name] = ()
+        elif param.kind is param.VAR_KEYWORD:
+            kwargs[name] = {}
+    return OrderedDict(kwargs)
+
+def _normalize_kwargs(func: Callable, loc: Dict[str, Any]) -> "OrderedDict[str, Any]":
+    """ Given a function and local function arguments, normalize the keyword
+    arguments by filling in default arguments from function signature
+
+    Example::
+
+    >> def f(self, key1=3, key2=3):
+           pass
+    >> loc = {"key2": 6}
+    >> _normalize_kwargs(f, loc)
+    {"key1": 3, "key2": 6}
+    """
+    default_kwargs = _get_default_kwargs(func)
+    local_kwargs = _get_signature_locals(func, loc)
+    normalized_kwargs = default_kwargs.copy()
+    for attr, val in local_kwargs.items():
+        if attr in normalized_kwargs:
+            # override the default keyword arguments
+            normalized_kwargs[attr] = val
+    return normalized_kwargs
+
+def validate_qmin_qmax(quant_min: int, quant_max: int) -> None:
+    r"""Validates that the user-specified quantization range is properly initialized
+    and within the given bound supported by the observer dtype.
+
+    To accommodate lower-bit quantization with respect to the existing torch.qint8 and
+    torch.quint8 datatypes, the user can choose to use dynamic quantization range by passing
+    in a tuple of initial qmin and qmax values. One use case is when these customized qmin and qmax
+    values are used to calculate static estimates of the scale and zero point for aggressive lower-bit
+    fake quantization. These estimates are compared against parameters learned through backpropagation.
+    The related literature on learning scale and zero point via backpropagation is as follows:
+
+    Learned Step Size Quantization: https://openreview.net/pdf?id=rkgO66VKDS
+    Trained Quantization Thresholds: https://arxiv.org/pdf/1903.08066.pdf
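+
+    Example (a minimal illustrative sketch)::
+
+    >> validate_qmin_qmax(-8, 7)  # passes: the range includes 0 and qmin < qmax
+    >> validate_qmin_qmax(1, 7)   # raises AssertionError: the range must include 0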
+    """
+    # The variable names are prefixed with "initial" because their values (qmin and qmax) might be adjusted
+    # based on whether quantization range is reduced and the datatype (signed/unsigned) used by the observer.
+    assert (
+        quant_min <= 0 <= quant_max
+    ), "Used-specified quantization range must include 0."
+    assert (
+        quant_min < quant_max
+    ), "qmin must be strictly less than qmax for user-specified quantization range."
+
+
+# Functionally equivalent to '_calculate_qparams' in observer.py. However, observers must be torchscriptable, and qscheme,
+# as far as I can tell, cannot be passed as a parameter in torchscript functions. This makes refactoring observer
+# to use this utility a massive pain. For now I'm opting to duplicate, as this code seems unlikely to change
+# (last update over 1 year ago), and when torchscript is fully deprecated we can refactor. TODO(jakeszwe, jerryzh168)
+def determine_qparams(
+        min_val: torch.Tensor, max_val: torch.Tensor, quant_min: int, quant_max: int,
+        dtype: torch.dtype, eps: torch.Tensor, has_customized_qrange: bool,
+        qscheme: torch.qscheme = torch.per_tensor_affine) -> Tuple[torch.Tensor, torch.Tensor]:
+    r"""Calculates the quantization parameters, given min and max
+    value tensors. Works for both per tensor and per channel cases
+
+    Args:
+        min_val: Minimum values per channel
+        max_val: Maximum values per channel
+
+    Returns:
+        scales: Scales tensor of shape (#channels,)
+        zero_points: Zero points tensor of shape (#channels,)
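+
+    Example (a minimal illustrative sketch; the values below are arbitrary)::
+
+    >> eps = torch.tensor(torch.finfo(torch.float32).eps)
+    >> scale, zero_point = determine_qparams(
+           torch.tensor(-1.0), torch.tensor(1.0), 0, 255,
+           torch.quint8, eps, has_customized_qrange=False)
+    >> # scale is roughly 2 / 255 and zero_point is 128 for this affine range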
+    """
+    if not check_min_max_valid(min_val, max_val):
+        return torch.tensor([1.0], device=min_val.device.type), torch.tensor([0], device=min_val.device.type)
+
+    min_val_neg = torch.min(min_val, torch.zeros_like(min_val))
+    max_val_pos = torch.max(max_val, torch.zeros_like(max_val))
+
+    device = min_val_neg.device
+    scale = torch.ones(min_val_neg.size(), dtype=torch.double, device=device)
+    zero_point = torch.zeros(min_val_neg.size(), dtype=torch.int64, device=device)
+
+    if (
+        qscheme == torch.per_tensor_symmetric
+        or qscheme == torch.per_channel_symmetric
+    ):
+        max_val_pos = torch.max(-min_val_neg, max_val_pos)
+        scale = max_val_pos / (float(quant_max - quant_min) / 2)
+        scale = torch.max(scale, eps)
+        if dtype in [torch.uint8, torch.quint8]:
+            if has_customized_qrange:
+                # When customized quantization range is used, down-rounded midpoint of the range is chosen.
+                zero_point = zero_point.new_full(
+                    zero_point.size(), (quant_min + quant_max) // 2
+                )
+            else:
+                zero_point = zero_point.new_full(zero_point.size(), 128)
+    elif qscheme == torch.per_channel_affine_float_qparams:
+        scale = (max_val - min_val) / float(quant_max - quant_min)
+        scale = torch.where(scale > eps, scale, torch.ones_like(scale))
+        # We use the quantize function
+        # xq = Round(Xf * inv_scale + zero_point),
+        # setting zero_point to (-1 * min *inv_scale) we get
+        # Xq = Round((Xf - min) * inv_scale)
+        zero_point = -1 * min_val / scale
+    else:
+        scale = (max_val_pos - min_val_neg) / float(quant_max - quant_min)
+        scale = torch.max(scale, eps)
+        zero_point = quant_min - torch.round(min_val_neg / scale).to(torch.int)
+        zero_point = torch.clamp(zero_point, quant_min, quant_max)
+
+    # For scalar values, cast them to Tensors of size 1 to keep the shape
+    # consistent with default values in FakeQuantize.
+    if len(scale.shape) == 0:
+        # TODO: switch to scale.item() after adding JIT support
+        scale = torch.tensor([float(scale)], dtype=scale.dtype, device=device)
+    if len(zero_point.shape) == 0:
+        # TODO: switch to zero_point.item() after adding JIT support
+        zero_point = torch.tensor(
+            [int(zero_point)], dtype=zero_point.dtype, device=device
+        )
+        if qscheme == torch.per_channel_affine_float_qparams:
+            zero_point = torch.tensor(
+                [float(zero_point)], dtype=zero_point.dtype, device=device
+            )
+
+    return scale.to(torch.double), zero_point.to(torch.int64)
+
+def _get_num_pos_args(f: Callable) -> int:
+    """ Get number of positional args for a function
+
+    Example::
+
+    >> def f(self, key1=3, key2=3):
+           pass
+    >> _get_num_pos_args(f)
+    3
+    """
+    return len(getfullargspec(f).args)
+
+def get_fqn_to_example_inputs(
+    model: torch.nn.Module,
+    example_inputs: Tuple[Any, ...]
+) -> Dict[str, Tuple[Any, ...]]:
+    """ Given a model and its example inputs, return a dictionary from
+    fully qualified name of submodules to example_inputs for that submodule,
+    e.g. {"linear1": (tensor1,), "linear2": (tensor2,), "sub": (tensor3,),
+          "sub.linear1": (tensor4,), ...}
+
+    Used to make quantizing submodules easier now that FX Graph Mode Quantization requires
+    example inputs.
+
+    Also works for keyword arguments with default values: we flatten keyword
+    arguments into positional arguments and fill in the missing keyword args with default
+    values, e.g. if we have a forward function:
+    def forward(self, x, key1=3, key2=3):
+        ...
+
+    and we call it with self.submodule(x, key2=6)
+    we'll get example_inputs: (x, 3, 6)
+
+    The user can also override `key1` with a positional argument:
+    for self.submodule(x, 5, key2=6)
+    we'll get: (x, 5, 6)
+
+    Variable positional arguments and variable keyword arguments in the forward
+    function are not supported currently, so please make sure no submodule is using
+    them.
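+
+    Example (a minimal illustrative sketch; the module structure below is arbitrary)::
+
+    >> class M(torch.nn.Module):
+           def __init__(self):
+               super().__init__()
+               self.linear = torch.nn.Linear(5, 5)
+           def forward(self, x, scale=1.0):
+               return self.linear(x) * scale
+    >> m = M()
+    >> fqn_to_inputs = get_fqn_to_example_inputs(m, (torch.randn(1, 5),))
+    >> fqn_to_inputs["linear"][0].shape
+    torch.Size([1, 5])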
+    """
+    root = model
+    fqn_to_example_inputs = {}
+
+    def _patched_module_call(self, *args, **kwargs):
+        submodule_example_inputs = list(args).copy()
+        normalized_kwargs = _normalize_kwargs(self.forward, kwargs)
+        # minus 1 to skip counting `self`
+        num_args = _get_num_pos_args(self.forward) - 1
+        num_to_pop = num_args - len(submodule_example_inputs)
+        while num_to_pop and normalized_kwargs:
+            normalized_kwargs.popitem(last=False)
+            num_to_pop -= 1
+        submodule_example_inputs.extend(normalized_kwargs.values())
+        submodule_example_inputs_tuple = tuple(submodule_example_inputs)
+        fqn = _get_path_of_module(root, self)
+        if fqn is not None:
+            fqn_to_example_inputs[fqn] = submodule_example_inputs_tuple
+        return orig_module_call(self, *args, **kwargs)
+
+    orig_module_call = torch.nn.Module.__call__
+    torch.nn.Module.__call__ = _patched_module_call  # type: ignore[method-assign]
+    try:
+        model(*example_inputs)
+    finally:
+        # restore the module call even if there is an exception
+        torch.nn.Module.__call__ = orig_module_call  # type: ignore[method-assign]
+    return fqn_to_example_inputs
+
+__all__ = [
+    "NodePattern",
+    "Pattern",
+    "MatchAllNode",
+    "check_node",
+    "get_combined_dict",
+    "is_per_tensor",
+    "is_per_channel",
+    "getattr_from_fqn",
+    "get_qparam_dict",
+    "get_swapped_custom_module_class",
+    "activation_dtype",
+    "weight_dtype",
+    "activation_is_statically_quantized",
+    "activation_is_dynamically_quantized",
+    "activation_is_int8_quantized",
+    "activation_is_int32_quantized",
+    "weight_is_quantized",
+    "weight_is_statically_quantized",
+    "op_is_int8_dynamically_quantized",
+    "get_qconfig_dtypes",
+    "get_quant_type",
+    "check_min_max_valid",
+    "calculate_qmin_qmax",
+    "has_no_children_ignoring_parametrizations",
+    "get_fqn_to_example_inputs",
+    "to_underlying_dtype",
+    "determine_qparams",
+    "validate_qmin_qmax",
+]
diff --git a/MLPY/Lib/site-packages/torch/autograd/__init__.py b/MLPY/Lib/site-packages/torch/autograd/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e30e24e6df2e2aee769cc029475679325bb5243b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/autograd/__init__.py
@@ -0,0 +1,515 @@
+"""
+``torch.autograd`` provides classes and functions implementing automatic
+differentiation of arbitrary scalar valued functions. It requires minimal
+changes to the existing code - you only need to declare :class:`Tensor` s
+for which gradients should be computed with the ``requires_grad=True`` keyword.
+As of now, we only support autograd for floating point :class:`Tensor` types (
+half, float, double and bfloat16) and complex :class:`Tensor` types (cfloat, cdouble).
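+
+A minimal illustrative example (the tensor values below are arbitrary)::
+
+    >> import torch
+    >> x = torch.ones(3, requires_grad=True)
+    >> y = (2 * x).sum()
+    >> y.backward()
+    >> x.grad
+    tensor([2., 2., 2.])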
+"""
+import warnings
+from typing import Any, Callable, cast, List, Optional, Sequence, Tuple, Union
+
+import torch
+
+from torch.types import _size, _TensorOrTensors, _TensorOrTensorsOrGradEdge
+from .. import _vmap_internals
+from ..overrides import handle_torch_function, has_torch_function, is_tensor_like
+from . import forward_ad, functional, graph
+from .anomaly_mode import detect_anomaly, set_detect_anomaly
+from .function import Function, NestedIOFunction
+from .grad_mode import (
+    _force_original_view_tracking,
+    _unsafe_preserve_version_counter,
+    enable_grad,
+    inference_mode,
+    no_grad,
+    set_grad_enabled,
+    set_multithreading_enabled,
+)
+from .gradcheck import gradcheck, gradgradcheck
+from .graph import _engine_run_backward
+
+from .variable import Variable
+
+__all__ = ["Variable", "Function", "backward", "grad_mode"]
+
+_OptionalTensor = Optional[torch.Tensor]
+_ShapeorNestedShape = Union[_size, Sequence[_size], torch.Tensor]
+
+
+def _calculate_shape(
+    output: torch.Tensor, grad: torch.Tensor, is_grads_batched: bool
+) -> Tuple[_ShapeorNestedShape, _ShapeorNestedShape]:
+    # is_same_size ensures that both tensors are either nested or non-nested
+    # circular import
+    from torch.nested._internal.nested_tensor import NestedTensor
+
+    if output.is_nested and not isinstance(output, NestedTensor):
+        if is_grads_batched:
+            raise RuntimeError("Batched grads are not supported with Nested Tensor.")
+        out_shape = output._nested_tensor_size()
+        grad_shape = grad._nested_tensor_size()
+
+        return out_shape, grad_shape
+
+    reg_out_shape = output.shape
+    reg_grad_shape = grad.shape if not is_grads_batched else grad.shape[1:]
+    return reg_out_shape, reg_grad_shape
+
+
+def _make_grads(
+    outputs: Sequence[torch.Tensor],
+    grads: Sequence[_OptionalTensor],
+    is_grads_batched: bool,
+) -> Tuple[_OptionalTensor, ...]:
+    new_grads: List[_OptionalTensor] = []
+    for out, grad in zip(outputs, grads):
+        if isinstance(grad, torch.Tensor):
+            from torch.fx.experimental.symbolic_shapes import expect_true, sym_eq
+
+            first_grad = grad if not is_grads_batched else grad[0]
+            # TODO: We can remove this conditional once we uniformly use
+            # singleton int to represent jagged dimension, so that size() call
+            # on nested tensor works
+            if out.is_nested or first_grad.is_nested:
+                shape_matches = torch.is_same_size(out, first_grad)
+            else:
+                # We need to do a regular size check, without going through
+                # the operator, to be able to handle unbacked symints
+                # (expect_true ensures we can deal with unbacked)
+                shape_matches = expect_true(sym_eq(out.size(), first_grad.size()))
+            if not shape_matches:
+                out_shape, grad_shape = _calculate_shape(
+                    out, first_grad, is_grads_batched
+                )
+                if is_grads_batched:
+                    raise RuntimeError(
+                        "If `is_grads_batched=True`, we interpret the first "
+                        "dimension of each grad_output as the batch dimension. "
+                        "The sizes of the remaining dimensions are expected to match "
+                        "the shape of corresponding output, but a mismatch "
+                        "was detected: grad_output["
+                        + str(grads.index(grad))
+                        + "] has a shape of "
+                        + str(grad_shape)
+                        + " and output["
+                        + str(outputs.index(out))
+                        + "] has a shape of "
+                        + str(out_shape)
+                        + ". "
+                        "If you only want some tensors in `grad_output` to be considered "
+                        "batched, consider using vmap."
+                    )
+                else:
+                    raise RuntimeError(
+                        "Mismatch in shape: grad_output["
+                        + str(grads.index(grad))
+                        + "] has a shape of "
+                        + str(grad_shape)
+                        + " and output["
+                        + str(outputs.index(out))
+                        + "] has a shape of "
+                        + str(out_shape)
+                        + "."
+                    )
+            if out.dtype.is_complex != grad.dtype.is_complex:
+                raise RuntimeError(
+                    "For complex Tensors, both grad_output and output"
+                    " are required to have the same dtype."
+                    " Mismatch in dtype: grad_output["
+                    + str(grads.index(grad))
+                    + "] has a dtype of "
+                    + str(grad.dtype)
+                    + " and output["
+                    + str(outputs.index(out))
+                    + "] has a dtype of "
+                    + str(out.dtype)
+                    + "."
+                )
+            new_grads.append(grad)
+        elif grad is None:
+            if out.requires_grad:
+                if out.numel() != 1:
+                    raise RuntimeError(
+                        "grad can be implicitly created only for scalar outputs"
+                    )
+                if not out.dtype.is_floating_point:
+                    msg = (
+                        "grad can be implicitly created only for real scalar outputs"
+                        f" but got {out.dtype}"
+                    )
+                    raise RuntimeError(msg)
+                new_grads.append(
+                    torch.ones_like(out, memory_format=torch.preserve_format)
+                )
+            else:
+                new_grads.append(None)
+        else:
+            raise TypeError(
+                "gradients can be either Tensors or None, but got "
+                + type(grad).__name__
+            )
+    return tuple(new_grads)
+
+
+def _tensor_or_tensors_to_tuple(
+    tensors: Optional[_TensorOrTensors], length: int
+) -> Tuple[_OptionalTensor, ...]:
+    if tensors is None:
+        return (None,) * length
+    if isinstance(tensors, torch.Tensor):
+        return (tensors,)
+    return tuple(tensors)
+
+
+def backward(
+    tensors: _TensorOrTensors,
+    grad_tensors: Optional[_TensorOrTensors] = None,
+    retain_graph: Optional[bool] = None,
+    create_graph: bool = False,
+    grad_variables: Optional[_TensorOrTensors] = None,
+    inputs: Optional[_TensorOrTensorsOrGradEdge] = None,
+) -> None:
+    r"""Computes the sum of gradients of given tensors with respect to graph
+    leaves.
+
+    The graph is differentiated using the chain rule. If any of ``tensors``
+    are non-scalar (i.e. their data has more than one element) and require
+    gradient, then the Jacobian-vector product would be computed; in this
+    case the function additionally requires specifying ``grad_tensors``.
+    It should be a sequence of matching length, that contains the "vector"
+    in the Jacobian-vector product, usually the gradient of the differentiated
+    function w.r.t. corresponding tensors (``None`` is an acceptable value for
+    all tensors that don't need gradient tensors).
+
+    This function accumulates gradients in the leaves - you might need to zero
+    ``.grad`` attributes or set them to ``None`` before calling it.
+    See :ref:`Default gradient layouts`
+    for details on the memory layout of accumulated gradients.
+
+    .. note::
+        Using this method with ``create_graph=True`` will create a reference cycle
+        between the parameter and its gradient which can cause a memory leak.
+        We recommend using ``autograd.grad`` when creating the graph to avoid this.
+        If you have to use this function, make sure to reset the ``.grad`` fields of your
+        parameters to ``None`` after use to break the cycle and avoid the leak.
+
+    .. note::
+
+        If you run any forward ops, create ``grad_tensors``, and/or call ``backward``
+        in a user-specified CUDA stream context, see
+        :ref:`Stream semantics of backward passes`.
+
+    .. note::
+
+        When ``inputs`` are provided and a given input is not a leaf,
+        the current implementation will call its grad_fn (even though it is not strictly needed to get these gradients).
+        It is an implementation detail on which the user should not rely.
+        See https://github.com/pytorch/pytorch/pull/60521#issuecomment-867061780 for more details.
+
+    Args:
+        tensors (Sequence[Tensor] or Tensor): Tensors of which the derivative will be
+            computed.
+        grad_tensors (Sequence[Tensor or None] or Tensor, optional): The "vector" in
+            the Jacobian-vector product, usually gradients w.r.t. each element of
+            corresponding tensors. None values can be specified for scalar Tensors or
+            ones that don't require grad. If a None value would be acceptable for all
+            grad_tensors, then this argument is optional.
+        retain_graph (bool, optional): If ``False``, the graph used to compute the grad
+            will be freed. Note that in nearly all cases setting this option to ``True``
+            is not needed and often can be worked around in a much more efficient
+            way. Defaults to the value of ``create_graph``.
+        create_graph (bool, optional): If ``True``, graph of the derivative will
+            be constructed, allowing higher order derivative products to be computed.
+            Defaults to ``False``.
+        inputs (Sequence[Tensor] or Tensor or Sequence[GradientEdge], optional): Inputs w.r.t. which the gradient
+            will be accumulated into ``.grad``. All other Tensors will be ignored. If
+            not provided, the gradient is accumulated into all the leaf Tensors that
+            were used to compute the :attr:`tensors`.
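+
+    A minimal illustrative example (the tensor values below are arbitrary)::
+
+        >> x = torch.tensor([2.0, 3.0], requires_grad=True)
+        >> loss = (x * x).sum()
+        >> torch.autograd.backward(loss)
+        >> x.grad
+        tensor([4., 6.])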
+    """
+    if torch._C._are_functorch_transforms_active():
+        raise RuntimeError(
+            "backward() called inside a functorch transform. This is not "
+            "supported, please use functorch.grad or functorch.vjp instead "
+            "or call backward() outside of functorch transforms."
+        )
+
+    if grad_variables is not None:
+        warnings.warn("'grad_variables' is deprecated. Use 'grad_tensors' instead.")
+        if grad_tensors is None:
+            grad_tensors = grad_variables
+        else:
+            raise RuntimeError(
+                "'grad_tensors' and 'grad_variables' (deprecated) "
+                "arguments both passed to backward(). Please only "
+                "use 'grad_tensors'."
+            )
+    if inputs is not None and len(inputs) == 0:
+        raise RuntimeError("'inputs' argument to backward() cannot be empty.")
+
+    tensors = (tensors,) if isinstance(tensors, torch.Tensor) else tuple(tensors)
+    inputs = (
+        (inputs,)
+        if isinstance(inputs, (torch.Tensor, graph.GradientEdge))
+        else tuple(inputs)
+        if inputs is not None
+        else tuple()
+    )
+
+    grad_tensors_ = _tensor_or_tensors_to_tuple(grad_tensors, len(tensors))
+    grad_tensors_ = _make_grads(tensors, grad_tensors_, is_grads_batched=False)
+    if retain_graph is None:
+        retain_graph = create_graph
+
+    # The reason we repeat the same comment below is that
+    # some Python versions print out the first line of a multi-line function
+    # call in the traceback and some print out the last line
+    _engine_run_backward(
+        tensors,
+        grad_tensors_,
+        retain_graph,
+        create_graph,
+        inputs,
+        allow_unreachable=True,
+        accumulate_grad=True,
+    )
+
+
+def grad(
+    outputs: _TensorOrTensors,
+    inputs: _TensorOrTensorsOrGradEdge,
+    grad_outputs: Optional[_TensorOrTensors] = None,
+    retain_graph: Optional[bool] = None,
+    create_graph: bool = False,
+    only_inputs: bool = True,
+    allow_unused: Optional[bool] = None,
+    is_grads_batched: bool = False,
+    materialize_grads: bool = False,
+) -> Tuple[torch.Tensor, ...]:
+    r"""Computes and returns the sum of gradients of outputs with respect to
+    the inputs.
+
+    ``grad_outputs`` should be a sequence of length matching ``outputs``,
+    containing the "vector" in the vector-Jacobian product, usually the pre-computed
+    gradients w.r.t. each of the outputs. If an output doesn't require_grad,
+    then the gradient can be ``None``.
+
+    .. note::
+
+        If you run any forward ops, create ``grad_outputs``, and/or call ``grad``
+        in a user-specified CUDA stream context, see
+        :ref:`Stream semantics of backward passes`.
+
+    .. note::
+
+        ``only_inputs`` argument is deprecated and is ignored now (defaults to ``True``).
+        To accumulate gradient for other parts of the graph, please use
+        ``torch.autograd.backward``.
+
+    Args:
+        outputs (sequence of Tensor): outputs of the differentiated function.
+        inputs (sequence of Tensor or GradientEdge): Inputs w.r.t. which the gradient will be
+            returned (and not accumulated into ``.grad``).
+        grad_outputs (sequence of Tensor): The "vector" in the vector-Jacobian product.
+            Usually gradients w.r.t. each output. None values can be specified for scalar
+            Tensors or ones that don't require grad. If a None value would be acceptable
+            for all grad_tensors, then this argument is optional. Default: None.
+        retain_graph (bool, optional): If ``False``, the graph used to compute the grad
+            will be freed. Note that in nearly all cases setting this option to ``True``
+            is not needed and often can be worked around in a much more efficient
+            way. Defaults to the value of ``create_graph``.
+        create_graph (bool, optional): If ``True``, graph of the derivative will
+            be constructed, allowing higher order derivative products to be computed.
+            Default: ``False``.
+        allow_unused (Optional[bool], optional): If ``False``, specifying inputs
+            that were not used when computing outputs (and therefore their grad is
+            always zero) is an error. Defaults to the value of ``materialize_grads``.
+        is_grads_batched (bool, optional): If ``True``, the first dimension of each
+            tensor in ``grad_outputs`` will be interpreted as the batch dimension.
+            Instead of computing a single vector-Jacobian product, we compute a
+            batch of vector-Jacobian products for each "vector" in the batch.
+            We use the vmap prototype feature as the backend to vectorize calls
+            to the autograd engine so that this computation can be performed in a
+            single call. This should lead to performance improvements when compared
+            to manually looping and performing backward multiple times. Note that
+            due to this feature being experimental, there may be performance
+            cliffs. Please use ``torch._C._debug_only_display_vmap_fallback_warnings(True)``
+            to show any performance warnings and file an issue on github if warnings exist
+            for your use case. Defaults to ``False``.
+        materialize_grads (bool, optional): If ``True``, set the gradient for unused inputs
+            to zero instead of None. This is useful when computing higher-order derivatives.
+            If ``materialize_grads`` is ``True`` and ``allow_unused`` is ``False``, an error
+            will be raised. Defaults to ``False``.
+
+    """
+    if materialize_grads and allow_unused is False:
+        raise ValueError(
+            "Expected allow_unused to be True or not passed when materialize_grads=True, "
+            "but got: allow_unused=False."
+        )
+    if allow_unused is None:
+        allow_unused = materialize_grads
+    t_outputs = cast(
+        Tuple[torch.Tensor, ...],
+        (outputs,) if is_tensor_like(outputs) else tuple(outputs),
+    )
+    if is_tensor_like(inputs) or isinstance(inputs, graph.GradientEdge):
+        inputs = cast(_TensorOrTensorsOrGradEdge, (inputs,))
+    else:
+        inputs = tuple(inputs)
+    t_inputs = tuple(i for i in inputs if is_tensor_like(i))
+    overridable_args = t_outputs + t_inputs
+    if has_torch_function(overridable_args):
+        return handle_torch_function(
+            grad,
+            overridable_args,
+            t_outputs,
+            inputs,
+            grad_outputs=grad_outputs,
+            retain_graph=retain_graph,
+            create_graph=create_graph,
+            only_inputs=only_inputs,
+            allow_unused=allow_unused,
+            is_grads_batched=is_grads_batched,
+            materialize_grads=materialize_grads,
+        )
+
+    if not only_inputs:
+        warnings.warn(
+            "only_inputs argument is deprecated and is ignored now "
+            "(defaults to True). To accumulate gradient for other "
+            "parts of the graph, please use torch.autograd.backward."
+        )
+
+    grad_outputs_ = _tensor_or_tensors_to_tuple(grad_outputs, len(t_outputs))
+    grad_outputs_ = _make_grads(
+        t_outputs, grad_outputs_, is_grads_batched=is_grads_batched
+    )
+
+    if retain_graph is None:
+        retain_graph = create_graph
+
+    # The reason we repeat the same comment several times below is because
+    # some Python versions print out the first line of multi-line function
+    # calls in the traceback and some print out the last line
+    if is_grads_batched:
+
+        def vjp(gO):
+            return _engine_run_backward(
+                t_outputs,
+                gO,
+                retain_graph,
+                create_graph,
+                inputs,
+                allow_unused,
+                accumulate_grad=False,
+            )
+
+        result = _vmap_internals._vmap(vjp, 0, 0, allow_none_pass_through=True)(
+            grad_outputs_
+        )
+    else:
+        result = _engine_run_backward(
+            t_outputs,
+            grad_outputs_,
+            retain_graph,
+            create_graph,
+            inputs,
+            allow_unused,
+            accumulate_grad=False,
+        )
+    if materialize_grads:
+        if any(
+            result[i] is None and not is_tensor_like(inputs[i])
+            for i in range(len(inputs))
+        ):
+            raise RuntimeError(
+                "materialize_grads cannot be used when the given input is a GradientEdge"
+            )
+        result = tuple(
+            output
+            if output is not None
+            else torch.zeros_like(input, requires_grad=True)
+            for (output, input) in zip(result, inputs)
+        )
+    return result
+
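+# A minimal usage sketch (illustrative only, not part of the upstream module):
+# unlike ``backward``, ``torch.autograd.grad`` returns the gradients instead of
+# accumulating them into ``.grad``.
+#
+# >>> import torch
+# >>> x = torch.randn(3, requires_grad=True)
+# >>> y = (x ** 2).sum()
+# >>> (gx,) = torch.autograd.grad(y, x)
+# >>> torch.allclose(gx, 2 * x)
+# True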
+
+# This function applies in case of gradient checkpointing for memory
+# optimization. Currently, gradient checkpointing is supported only if the
+# execution engine is invoked through torch.autograd.backward() and its
+# inputs argument is not passed. It is not supported for torch.autograd.grad().
+# This is because if inputs are specified, the gradient won't be calculated for
+# anything else e.g. model parameters like weights, bias etc.
+#
+# This function returns whether the checkpointing is valid i.e. torch.autograd.backward
+# or not i.e. torch.autograd.grad. The implementation works by maintaining a thread
+# local variable in torch/csrc/autograd/engine.cpp which looks at the NodeTask
+# in the stack and before a NodeTask is executed in evaluate_function, it
+# checks for whether reentrant backwards is imperative or not.
+# See https://github.com/pytorch/pytorch/pull/4594 for more discussion/context
+def _is_checkpoint_valid():
+    return Variable._execution_engine.is_checkpoint_valid()
+
+
+def variable(*args, **kwargs):
+    raise RuntimeError(
+        "torch.autograd.variable(...) is deprecated, use torch.tensor(...) instead"
+    )
+
+
+# Monkey patching variable.Variable to fix FX codegen. FX generates a call by roughly doing
+# f"{fn.__module__}.{fn.__name__}(...). This yields torch.autograd.variable.Variable(...) in the
+# output of an FX graph.  Unfortunately the module name torch.autograd.variable is shadowed by the
+# deprecated function - variable(...).
+variable.Variable = Variable  # type: ignore[attr-defined]
+
+if not torch._C._autograd_init():
+    raise RuntimeError("autograd initialization failed")
+
+# Import all native method/classes
+from torch._C._autograd import (
+    _add_metadata_json,
+    _disable_profiler,
+    _disable_profiler_legacy,
+    _enable_profiler,
+    _enable_profiler_legacy,
+    _enable_record_function,
+    _get_sequence_nr,
+    _kineto_step,
+    _KinetoEvent,
+    _pop_saved_tensors_default_hooks,
+    _prepare_profiler,
+    _profiler_enabled,
+    _ProfilerResult,
+    _push_saved_tensors_default_hooks,
+    _record_function_with_args_enter,
+    _record_function_with_args_exit,
+    _set_empty_test_observer,
+    _supported_activities,
+    DeviceType,
+    kineto_available,
+    ProfilerEvent,
+    SavedTensor,
+)
+
+from torch._C._profiler import ProfilerActivity, ProfilerConfig, ProfilerState
+
+from . import profiler
+
+
+def _register_py_tensor_class_for_device(device, cls):
+    if not isinstance(cls, type):
+        raise RuntimeError("cls isn't a typeinfo object")
+    torch._C._register_py_class_for_device(device, cls)
+
+
+is_multithreading_enabled = torch._C._is_multithreading_enabled
+torch._C._add_docstr(
+    is_multithreading_enabled, "Returns True if multithreading is currently enabled."
+)
+
+is_view_replay_enabled = torch._C._is_view_replay_enabled
+torch._C._add_docstr(
+    is_view_replay_enabled, "Returns True if view-replay is currently enabled."
+)
diff --git a/MLPY/Lib/site-packages/torch/autograd/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/autograd/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a091e221d0e2d5407d335c0654ced30565213732
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/autograd/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/autograd/__pycache__/anomaly_mode.cpython-39.pyc b/MLPY/Lib/site-packages/torch/autograd/__pycache__/anomaly_mode.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2380b3f4a0cb058384ed7e85e3a83d162a48d8a1
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/autograd/__pycache__/anomaly_mode.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/autograd/__pycache__/forward_ad.cpython-39.pyc b/MLPY/Lib/site-packages/torch/autograd/__pycache__/forward_ad.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..482794897f1220950a56b4186a0acace27a4b0b0
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/autograd/__pycache__/forward_ad.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/autograd/__pycache__/function.cpython-39.pyc b/MLPY/Lib/site-packages/torch/autograd/__pycache__/function.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f4e809bb299c836596338e96da94b6c5c366ffc6
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/autograd/__pycache__/function.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/autograd/__pycache__/functional.cpython-39.pyc b/MLPY/Lib/site-packages/torch/autograd/__pycache__/functional.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4a84d26aaab958f401e6b7622c9d058df6901569
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/autograd/__pycache__/functional.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/autograd/__pycache__/grad_mode.cpython-39.pyc b/MLPY/Lib/site-packages/torch/autograd/__pycache__/grad_mode.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..91f23ce2455c44462ef55bdd24f6513d88fd4798
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/autograd/__pycache__/grad_mode.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/autograd/__pycache__/gradcheck.cpython-39.pyc b/MLPY/Lib/site-packages/torch/autograd/__pycache__/gradcheck.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..640970ccfff440a9ab09f66bcd67c848b9743e40
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/autograd/__pycache__/gradcheck.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/autograd/__pycache__/graph.cpython-39.pyc b/MLPY/Lib/site-packages/torch/autograd/__pycache__/graph.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e2fefb1b21f8f4330a3d6974e0c48b95aab5ad58
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/autograd/__pycache__/graph.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/autograd/__pycache__/profiler.cpython-39.pyc b/MLPY/Lib/site-packages/torch/autograd/__pycache__/profiler.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..da7992f12a43fb4e76de133f059f5adaeca49da2
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/autograd/__pycache__/profiler.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/autograd/__pycache__/profiler_legacy.cpython-39.pyc b/MLPY/Lib/site-packages/torch/autograd/__pycache__/profiler_legacy.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aa2d42ae2ebd1f55441f20e5bf158da4762125f8
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/autograd/__pycache__/profiler_legacy.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/autograd/__pycache__/profiler_util.cpython-39.pyc b/MLPY/Lib/site-packages/torch/autograd/__pycache__/profiler_util.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b93b70b4f609e0ef9e94595e213cd3eda332fa06
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/autograd/__pycache__/profiler_util.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/autograd/__pycache__/variable.cpython-39.pyc b/MLPY/Lib/site-packages/torch/autograd/__pycache__/variable.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e2679d8e05cd940d0bec245bd095cfe367979123
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/autograd/__pycache__/variable.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/autograd/_functions/__init__.py b/MLPY/Lib/site-packages/torch/autograd/_functions/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc92d7c7fe74ad79a100e0233150f90becde55be
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/autograd/_functions/__init__.py
@@ -0,0 +1 @@
+from .tensor import *  # noqa: F403
diff --git a/MLPY/Lib/site-packages/torch/autograd/_functions/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/autograd/_functions/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0756615e91568c405180bdd12586fa5a8bddf8e1
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/autograd/_functions/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/autograd/_functions/__pycache__/tensor.cpython-39.pyc b/MLPY/Lib/site-packages/torch/autograd/_functions/__pycache__/tensor.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0e44c30532ca8fffe471ba586b373ae3164852fc
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/autograd/_functions/__pycache__/tensor.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/autograd/_functions/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/autograd/_functions/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..80751ea4f8aa4f040170f17b50aa068f3e60fcb2
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/autograd/_functions/__pycache__/utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/autograd/_functions/tensor.py b/MLPY/Lib/site-packages/torch/autograd/_functions/tensor.py
new file mode 100644
index 0000000000000000000000000000000000000000..5dab52745bf21c148b6705d71e3dc306ad57ba6e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/autograd/_functions/tensor.py
@@ -0,0 +1,63 @@
+import operator
+import warnings
+from functools import reduce
+
+import torch
+import torch._utils
+from ..function import Function
+
+
+class Type(Function):
+    @staticmethod
+    def forward(ctx, i, dest_type):
+        warnings.warn(
+            "torch.autograd._functions.Type is deprecated as of PyTorch 2.1, please use "
+            "torch.tensor.to(dtype=dtype) instead."
+        )
+        ctx.input_type = type(i)
+        ctx.input_device = -1 if not i.is_cuda else i.get_device()
+        return i.type(dest_type)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        if ctx.input_device == -1:
+            return grad_output.type(ctx.input_type), None
+        else:
+            with torch.cuda.device(ctx.input_device):
+                return grad_output.type(ctx.input_type), None
+
+
+# TODO: deprecate this
+class Resize(Function):
+    @staticmethod
+    def forward(ctx, tensor, sizes):
+        ctx.sizes = sizes
+        ctx.numel = reduce(operator.mul, sizes, 1)
+        if tensor.numel() != ctx.numel:
+            raise RuntimeError(
+                (
+                    "requested resize to {} ({} elements in total), "
+                    "but the given tensor has a size of {} ({} elements). "
+                    "autograd's resize can only change the shape of a given "
+                    "tensor, while preserving the number of elements. "
+                ).format(
+                    "x".join(map(str, sizes)),
+                    ctx.numel,
+                    "x".join(map(str, tensor.size())),
+                    tensor.numel(),
+                )
+            )
+        ctx.input_sizes = tensor.size()
+        if tensor.is_quantized:
+            tensor.copy_(tensor)
+            return tensor.contiguous().view(*sizes)
+        if tensor.is_contiguous():
+            result = tensor.new(tensor).contiguous().view(*sizes)
+            return result
+        else:
+            return tensor.contiguous().view(*sizes)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        assert grad_output.numel() == ctx.numel
+        return grad_output.contiguous().view(ctx.input_sizes), None
diff --git a/MLPY/Lib/site-packages/torch/autograd/_functions/utils.py b/MLPY/Lib/site-packages/torch/autograd/_functions/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..86fd64ed2a2baf88561cd116e0a6ca8cc61dfc15
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/autograd/_functions/utils.py
@@ -0,0 +1,62 @@
+import operator
+from functools import reduce
+
+
+def maybe_view(tensor, size, check_same_size=True):
+    if check_same_size and tensor.size() == size:
+        return tensor
+    return tensor.contiguous().view(size)
+
+
+def maybe_unexpand(tensor, old_size, check_same_size=True):
+    if check_same_size and tensor.size() == old_size:
+        return tensor
+    num_unsqueezed = tensor.dim() - len(old_size)
+    expanded_dims = [
+        dim
+        for dim, (expanded, original) in enumerate(
+            zip(tensor.size()[num_unsqueezed:], old_size)
+        )
+        if expanded != original
+    ]
+
+    for _ in range(num_unsqueezed):
+        tensor = tensor.sum(0, keepdim=False)
+    for dim in expanded_dims:
+        tensor = tensor.sum(dim, keepdim=True)
+    return tensor
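+
+# A small illustrative sketch (not part of the upstream module): maybe_unexpand
+# reverses a broadcast/expand by summing the gradient back down to ``old_size``.
+# >>> import torch
+# >>> g = torch.ones(2, 3)                       # gradient of an expanded tensor
+# >>> maybe_unexpand(g, torch.Size([1, 3])).shape
+# torch.Size([1, 3])
+# >>> maybe_unexpand(g, torch.Size([3])).shape
+# torch.Size([3])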
+
+
+# Check whether the op enables broadcasting, and whether it is supported by ONNX.
+# If dims1 and dims2 are different, then broadcast is True.
+# We always assume the combination of dims1 and dims2 is broadcastable.
+# The following types of broadcasting are supported in ONNX:
+#     1) Only one element in dims2, such as dims2 = [1, 1]
+#     2) dims2 is suffix of dims1, such as dims1 = [2, 3, 4], and dims2 = [3, 4]
+# Details can be found here: https://github.com/onnx/onnx/blob/master/docs/Operators.md#Gemm
+def check_onnx_broadcast(dims1, dims2):
+    broadcast = False
+    supported = True
+    len1 = len(dims1)
+    len2 = len(dims2)
+    numel1 = reduce(operator.mul, dims1)
+    numel2 = reduce(operator.mul, dims2)
+    if len1 < len2:
+        broadcast = True
+        if numel2 != 1:
+            supported = False
+    elif len1 > len2:
+        broadcast = True
+        if numel2 != 1 and dims1[len1 - len2 :] != dims2:
+            supported = False
+    else:
+        if dims1 != dims2:
+            broadcast = True
+            if numel2 != 1:
+                supported = False
+
+    if not supported:
+        raise ValueError(
+            f"Numpy style broadcasting is not supported in ONNX. Input dims are: {dims1}, {dims2}"
+        )
+    return broadcast
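+
+
+# A small illustrative sketch (not part of the upstream module) of the helper
+# above: it returns whether broadcasting occurs and raises if ONNX cannot
+# express the broadcast.
+# >>> check_onnx_broadcast([2, 3, 4], [3, 4])     # dims2 is a suffix of dims1
+# True
+# >>> check_onnx_broadcast([2, 3, 4], [1, 1])     # dims2 has a single element
+# True
+# >>> check_onnx_broadcast([2, 3, 4], [2, 3, 4])  # identical dims, no broadcast
+# False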
diff --git a/MLPY/Lib/site-packages/torch/autograd/anomaly_mode.py b/MLPY/Lib/site-packages/torch/autograd/anomaly_mode.py
new file mode 100644
index 0000000000000000000000000000000000000000..94f186c6527fd1b4273f1e1c3dccfde8c3f2d1e4
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/autograd/anomaly_mode.py
@@ -0,0 +1,119 @@
+import warnings
+
+import torch
+
+__all__ = ["detect_anomaly", "set_detect_anomaly"]
+
+
+class detect_anomaly:
+    r"""Context-manager that enable anomaly detection for the autograd engine.
+
+    This does two things:
+
+    - Running the forward pass with detection enabled will allow the backward
+      pass to print the traceback of the forward operation that created the failing
+      backward function.
+    - If ``check_nan`` is ``True``, any backward computation that generates "nan"
+      values will raise an error. Default ``True``.
+
+    .. warning::
+        This mode should be enabled only for debugging as the additional checks
+        will slow down your program execution.
+
+    Example:
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_ANOMALY)
+        >>> import torch
+        >>> from torch import autograd
+        >>> class MyFunc(autograd.Function):
+        ...     @staticmethod
+        ...     def forward(ctx, inp):
+        ...         return inp.clone()
+        ...     @staticmethod
+        ...     def backward(ctx, gO):
+        ...         # Error during the backward pass
+        ...         raise RuntimeError("Some error in backward")
+        ...         return gO.clone()
+        >>> def run_fn(a):
+        ...     out = MyFunc.apply(a)
+        ...     return out.sum()
+        >>> inp = torch.rand(10, 10, requires_grad=True)
+        >>> out = run_fn(inp)
+        >>> out.backward()
+            Traceback (most recent call last):
+              File "", line 1, in 
+              File "/your/pytorch/install/torch/_tensor.py", line 93, in backward
+                torch.autograd.backward(self, gradient, retain_graph, create_graph)
+              File "/your/pytorch/install/torch/autograd/__init__.py", line 90, in backward
+                allow_unreachable=True)  # allow_unreachable flag
+              File "/your/pytorch/install/torch/autograd/function.py", line 76, in apply
+                return self._forward_cls.backward(self, *args)
+              File "", line 8, in backward
+            RuntimeError: Some error in backward
+        >>> with autograd.detect_anomaly():
+        ...     inp = torch.rand(10, 10, requires_grad=True)
+        ...     out = run_fn(inp)
+        ...     out.backward()
+            Traceback of forward call that caused the error:
+              File "tmp.py", line 53, in 
+                out = run_fn(inp)
+              File "tmp.py", line 44, in run_fn
+                out = MyFunc.apply(a)
+            Traceback (most recent call last):
+              File "", line 4, in 
+              File "/your/pytorch/install/torch/_tensor.py", line 93, in backward
+                torch.autograd.backward(self, gradient, retain_graph, create_graph)
+              File "/your/pytorch/install/torch/autograd/__init__.py", line 90, in backward
+                allow_unreachable=True)  # allow_unreachable flag
+              File "/your/pytorch/install/torch/autograd/function.py", line 76, in apply
+                return self._forward_cls.backward(self, *args)
+              File "", line 8, in backward
+            RuntimeError: Some error in backward
+
+    """
+
+    def __init__(self, check_nan=True) -> None:
+        self.prev = torch.is_anomaly_enabled()
+        self.check_nan = check_nan
+        self.prev_check_nan = torch.is_anomaly_check_nan_enabled()
+        warnings.warn(
+            "Anomaly Detection has been enabled. "
+            "This mode will increase the runtime "
+            "and should only be enabled for debugging.",
+            stacklevel=2,
+        )
+
+    def __enter__(self) -> None:
+        torch.set_anomaly_enabled(True, self.check_nan)
+
+    def __exit__(self, *args: object) -> None:
+        torch.set_anomaly_enabled(self.prev, self.prev_check_nan)
+
+
+class set_detect_anomaly:
+    r"""Context-manager that sets the anomaly detection for the autograd engine on or off.
+
+    ``set_detect_anomaly`` will enable or disable the autograd anomaly detection
+    based on its argument :attr:`mode`.
+    It can be used as a context-manager or as a function.
+
+    See ``detect_anomaly`` above for details of the anomaly detection behaviour.
+
+    Args:
+        mode (bool): Flag whether to enable anomaly detection (``True``),
+                     or disable (``False``).
+        check_nan (bool): Flag whether to raise an error when the backward
+                          generate "nan"
+
+    """
+
+    def __init__(self, mode: bool, check_nan: bool = True) -> None:
+        self.prev = torch.is_anomaly_enabled()
+        self.prev_check_nan = torch.is_anomaly_check_nan_enabled()
+        torch.set_anomaly_enabled(mode, check_nan)
+
+    def __enter__(self) -> None:
+        pass
+
+    def __exit__(self, *args: object) -> None:
+        torch.set_anomaly_enabled(self.prev, self.prev_check_nan)
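+
+
+# A minimal usage sketch (illustrative only, not part of the upstream module):
+# ``set_detect_anomaly`` works both as a context manager and as a plain call.
+# >>> import torch
+# >>> with torch.autograd.set_detect_anomaly(True):
+# ...     y = (torch.randn(3, requires_grad=True) * 2).sum()
+# ...     y.backward()
+# >>> torch.is_anomaly_enabled()  # restored to the prior state (False by default)
+# False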
diff --git a/MLPY/Lib/site-packages/torch/autograd/forward_ad.py b/MLPY/Lib/site-packages/torch/autograd/forward_ad.py
new file mode 100644
index 0000000000000000000000000000000000000000..07f0c1400a6a1ff12614a2741b3f33030003b560
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/autograd/forward_ad.py
@@ -0,0 +1,227 @@
+import os
+from collections import namedtuple
+
+from typing import Any
+
+import torch
+from .grad_mode import _DecoratorContextManager
+
+__all__ = [
+    "UnpackedDualTensor",
+    "enter_dual_level",
+    "exit_dual_level",
+    "make_dual",
+    "unpack_dual",
+    "dual_level",
+]
+
+# Global variable used to make the python API simpler to use
+_current_level = -1
+
+
+def enter_dual_level():
+    r"""Enter a new forward grad level.
+
+    This level can be used to make and unpack dual Tensors to compute
+    forward gradients.
+
+    This function also updates the current level that is used by default
+    by the other functions in this API.
+    """
+    global _current_level
+    new_level = torch._C._enter_dual_level()
+    if new_level != _current_level + 1:
+        raise RuntimeError(
+            "Entering a new forward AD level but the current level "
+            "is not valid. Make sure you did not modified it directly."
+        )
+    _current_level = new_level
+    return new_level
+
+
+def exit_dual_level(*, level=None):
+    r"""Exit a forward grad level.
+
+    This function deletes all the gradients associated with this
+    level. Only deleting the latest entered level is allowed.
+
+    This function also updates the current level that is used by default
+    by the other functions in this API.
+    """
+    global _current_level
+    if level is None:
+        level = _current_level
+    if level != _current_level:
+        raise RuntimeError(
+            "Trying to exit a forward AD level that was not the last one "
+            "that was created. This is not supported."
+        )
+    torch._C._exit_dual_level(level=level)
+    _current_level = level - 1
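+
+# A small illustrative sketch (not part of the upstream module): when the
+# low-level API is used directly, every entered level should be exited again;
+# the ``dual_level`` context manager below does this automatically.
+# >>> level = enter_dual_level()
+# >>> try:
+# ...     pass  # make_dual / unpack_dual calls would go here
+# ... finally:
+# ...     exit_dual_level(level=level)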
+
+
+def make_dual(tensor, tangent, *, level=None):
+    r"""Associate a tensor value with its tangent to create a "dual tensor" for forward AD gradient computation.
+
+    The result is a new tensor aliased to :attr:`tensor` with :attr:`tangent` embedded
+    as an attribute as-is if it has the same storage layout or copied otherwise.
+    The tangent attribute can be recovered with :func:`unpack_dual`.
+
+    This function is backward differentiable.
+
+    Given a function `f` whose jacobian is `J`, it allows one to compute the Jacobian-vector product (`jvp`)
+    between `J` and a given vector `v` as follows.
+
+    Example::
+
+        >>> # xdoctest: +SKIP("Undefined variables")
+        >>> with dual_level():
+        ...     inp = make_dual(x, v)
+        ...     out = f(inp)
+        ...     y, jvp = unpack_dual(out)
+
+    Please see the `forward-mode AD tutorial <https://pytorch.org/tutorials/intermediate/forward_ad_usage.html>`__
+    for detailed steps on how to use this API.
+
+    """
+    # See NOTE: [forward-mode AD decompositions mechanism]
+    #
+    # Import from torch._decomp import decompositions_for_jvp to register
+    # decompositions for jvp to the jit registry
+    #
+    # FIXME: We specify that __debug__ must be True because
+    # if python is run with -OO or -O flags (i.e., __debug__ is False), we encounter the
+    # following error:
+    #
+    # Return value was annotated as having type Tuple[NoneType, NoneType] but is actually of
+    # type Tuple[Tensor, Tensor]:
+    #   File ".../torch/_decomp/__init__.py", line 1585
+    #     else:
+    #         buffer = z
+    #     return min - torch.log1p(z), buffer
+    #     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
+    if os.environ.get("PYTORCH_JIT", "1") == "1" and __debug__:
+        from torch._decomp import decompositions_for_jvp  # noqa: F401
+
+    if level is None:
+        level = _current_level
+
+    if level < 0:
+        raise RuntimeError(
+            "Trying to create a dual Tensor for forward AD but no level "
+            "exists, make sure to enter_dual_level() first."
+        )
+    if not (tensor.is_floating_point() or tensor.is_complex()):
+        raise ValueError(
+            f"Expected primal to be floating point or complex, but got: {tensor.dtype}"
+        )
+    if not (tangent.is_floating_point() or tangent.is_complex()):
+        raise ValueError(
+            f"Expected tangent to be floating point or complex, but got: {tangent.dtype}"
+        )
+
+    return torch._VF._make_dual(tensor, tangent, level=level)
+
+
+_UnpackedDualTensor = namedtuple("_UnpackedDualTensor", ["primal", "tangent"])
+
+
+class UnpackedDualTensor(_UnpackedDualTensor):
+    r"""Namedtuple returned by :func:`unpack_dual` containing the primal and tangent components of the dual tensor.
+
+    See :func:`unpack_dual` for more details.
+
+    """
+
+    pass
+
+
+def unpack_dual(tensor, *, level=None):
+    r"""Unpack a "dual tensor" to get both its Tensor value and its forward AD gradient.
+
+    The result is a namedtuple ``(primal, tangent)`` where ``primal`` is a view of
+    :attr:`tensor`'s primal and ``tangent`` is :attr:`tensor`'s tangent as-is.
+    Neither of these tensors can be a dual tensor of level :attr:`level`.
+
+    This function is backward differentiable.
+
+    Example::
+
+        >>> # xdoctest: +SKIP("Undefined variables")
+        >>> with dual_level():
+        ...     inp = make_dual(x, x_t)
+        ...     out = f(inp)
+        ...     y, jvp = unpack_dual(out)
+        ...     jvp = unpack_dual(out).tangent
+
+    Please see the `forward-mode AD tutorial <https://pytorch.org/tutorials/intermediate/forward_ad_usage.html>`__
+    for detailed steps on how to use this API.
+    """
+    if level is None:
+        level = _current_level
+
+    if level < 0:
+        return UnpackedDualTensor(tensor, None)
+
+    primal, dual = torch._VF._unpack_dual(tensor, level=level)
+
+    return UnpackedDualTensor(primal, dual)
+
+
+class dual_level(_DecoratorContextManager):
+    r"""Context-manager for forward AD, where all forward AD computation must occur within the ``dual_level`` context.
+
+    .. Note::
+
+        The ``dual_level`` context appropriately enters and exits the dual level to
+        control the current forward AD level, which is used by default by the other
+        functions in this API.
+
+        We currently don't plan to support nested ``dual_level`` contexts, however, so
+        only a single forward AD level is supported. To compute higher-order
+        forward grads, one can use :func:`torch.func.jvp`.
+
+    Example::
+
+        >>> # xdoctest: +SKIP("Undefined variables")
+        >>> x = torch.tensor([1])
+        >>> x_t = torch.tensor([1])
+        >>> with dual_level():
+        ...     inp = make_dual(x, x_t)
+        ...     # Do computations with inp
+        ...     out = your_fn(inp)
+        ...     _, grad = unpack_dual(out)
+        >>> grad is None
+        False
+        >>> # After exiting the level, the grad is deleted
+        >>> _, grad_after = unpack_dual(out)
+        >>> grad_after is None
+        True
+
+    Please see the `forward-mode AD tutorial <https://pytorch.org/tutorials/intermediate/forward_ad_usage.html>`__
+    for detailed steps on how to use this API.
+    """
+
+    def __enter__(self):
+        return enter_dual_level()
+
+    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
+        exit_dual_level()
+
+
+# Private helper functions
+_is_fwd_grad_enabled = torch._C._is_fwd_grad_enabled
+
+
+# Private helper function to enable or disable fwd grad.
+# If you're a user and want to use this, please file an issue to discuss the use case.
+class _set_fwd_grad_enabled(_DecoratorContextManager):
+    def __init__(self, mode: bool) -> None:
+        self.prev = _is_fwd_grad_enabled()
+        torch._C._set_fwd_grad_enabled(mode)
+
+    def __enter__(self) -> None:
+        pass
+
+    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
+        torch._C._set_fwd_grad_enabled(self.prev)
diff --git a/MLPY/Lib/site-packages/torch/autograd/function.py b/MLPY/Lib/site-packages/torch/autograd/function.py
new file mode 100644
index 0000000000000000000000000000000000000000..31ef625876de8ec2d9598e2c82c03b32b2e06f4f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/autograd/function.py
@@ -0,0 +1,883 @@
+import functools
+import inspect
+import itertools
+import warnings
+from collections import OrderedDict
+from typing import Any, List, Optional, Tuple
+
+import torch
+import torch._C as _C
+import torch._functorch as _functorch
+import torch.utils.hooks as hooks
+from torch._C import _functions
+from torch._functorch.autograd_function import custom_function_call
+
+__all__ = [
+    "FunctionCtx",
+    "BackwardCFunction",
+    "FunctionMeta",
+    "Function",
+    "once_differentiable",
+    "traceable",
+    "InplaceFunction",
+    "NestedIOFunction",
+]
+
+# Unique id provider for each class inheriting from Function
+# This is incremented in FunctionMeta during class definition
+AUTOGRAD_FUNCTION_COUNTER = itertools.count()
+
+
+# Formerly known as: _ContextMethodMixin
+class FunctionCtx:
+    def save_for_backward(self, *tensors: torch.Tensor):
+        r"""Save given tensors for a future call to :func:`~Function.backward`.
+
+        ``save_for_backward`` should be called at most once, only from inside the
+        :func:`forward` method, and only with tensors.
+
+        All tensors intended to be used in the backward pass should be saved
+        with ``save_for_backward`` (as opposed to directly on ``ctx``) to prevent
+        incorrect gradients and memory leaks, and enable the application of saved
+        tensor hooks. See :class:`torch.autograd.graph.saved_tensors_hooks`.
+
+        Note that if intermediary tensors, tensors that are neither inputs
+        nor outputs of :func:`forward`, are saved for backward, your custom Function
+        may not support double backward.
+        Custom Functions that do not support double backward should decorate their
+        :func:`backward` method with ``@once_differentiable`` so that performing
+        double backward raises an error. If you'd like to support double backward,
+        you can either recompute intermediaries based on the inputs during backward
+        or return the intermediaries as the outputs of the custom Function. See the
+        `double backward tutorial <https://pytorch.org/tutorials/intermediate/custom_function_double_backward_tutorial.html>`_
+        for more details.
+
+        In :func:`backward`, saved tensors can be accessed through the :attr:`saved_tensors`
+        attribute. Before returning them to the user, a check is made to ensure
+        they weren't used in any in-place operation that modified their content.
+
+        Arguments can also be ``None``. This is a no-op.
+
+        See :ref:`extending-autograd` for more details on how to use this method.
+
+        Example::
+            >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
+            >>> class Func(Function):
+            >>>     @staticmethod
+            >>>     def forward(ctx, x: torch.Tensor, y: torch.Tensor, z: int):
+            >>>         w = x * z
+            >>>         out = x * y + y * z + w * y
+            >>>         ctx.save_for_backward(x, y, w, out)
+            >>>         ctx.z = z  # z is not a tensor
+            >>>         return out
+            >>>
+            >>>     @staticmethod
+            >>>     @once_differentiable
+            >>>     def backward(ctx, grad_out):
+            >>>         x, y, w, out = ctx.saved_tensors
+            >>>         z = ctx.z
+            >>>         gx = grad_out * (y + y * z)
+            >>>         gy = grad_out * (x + z + w)
+            >>>         gz = None
+            >>>         return gx, gy, gz
+            >>>
+            >>> a = torch.tensor(1., requires_grad=True, dtype=torch.double)
+            >>> b = torch.tensor(2., requires_grad=True, dtype=torch.double)
+            >>> c = 4
+            >>> d = Func.apply(a, b, c)
+
+        """
+        self.to_save = tensors
+
+    def save_for_forward(self, *tensors: torch.Tensor):
+        r"""Save given tensors for a future call to :func:`~Function.jvp`.
+
+        ``save_for_forward`` should be called only once, from inside the :func:`forward`
+        method, and only with tensors.
+
+        In :func:`jvp`, saved objects can be accessed through the :attr:`saved_tensors`
+        attribute.
+
+        Arguments can also be ``None``. This is a no-op.
+
+        See :ref:`extending-autograd` for more details on how to use this method.
+
+        Example::
+            >>> # xdoctest: +SKIP
+            >>> class Func(torch.autograd.Function):
+            >>>     @staticmethod
+            >>>     def forward(ctx, x: torch.Tensor, y: torch.Tensor, z: int):
+            >>>         ctx.save_for_backward(x, y)
+            >>>         ctx.save_for_forward(x, y)
+            >>>         ctx.z = z
+            >>>         return x * y * z
+            >>>
+            >>>     @staticmethod
+            >>>     def jvp(ctx, x_t, y_t, _):
+            >>>         x, y = ctx.saved_tensors
+            >>>         z = ctx.z
+            >>>         return z * (y * x_t + x * y_t)
+            >>>
+            >>>     @staticmethod
+            >>>     def vjp(ctx, grad_out):
+            >>>         x, y = ctx.saved_tensors
+            >>>         z = ctx.z
+            >>>         return z * grad_out * y, z * grad_out * x, None
+            >>>
+            >>>     a = torch.tensor(1., requires_grad=True, dtype=torch.double)
+            >>>     t = torch.tensor(1., dtype=torch.double)
+            >>>     b = torch.tensor(2., requires_grad=True, dtype=torch.double)
+            >>>     c = 4
+            >>>
+            >>>     with fwAD.dual_level():
+            >>>         a_dual = fwAD.make_dual(a, t)
+            >>>         d = Func.apply(a_dual, b, c)
+
+        """
+        for tensor in tensors:
+            assert isinstance(tensor, torch.Tensor) or tensor is None, (
+                "save_for_forward expects all arguments to be tensors; you should "
+                "save non-tensors as attributes on ctx."
+            )
+
+        self.saved_for_forward = tensors
+
+    def mark_dirty(self, *args: torch.Tensor):
+        r"""Mark given tensors as modified in an in-place operation.
+
+        **This should be called at most once, only from inside the**
+        :func:`forward` **method, and all arguments should be inputs.**
+
+        Every tensor that's been modified in-place in a call to :func:`forward`
+        should be given to this function, to ensure correctness of our checks.
+        It doesn't matter whether the function is called before or after
+        modification.
+
+        Examples::
+            >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
+            >>> class Inplace(Function):
+            >>>     @staticmethod
+            >>>     def forward(ctx, x):
+            >>>         x_npy = x.numpy() # x_npy shares storage with x
+            >>>         x_npy += 1
+            >>>         ctx.mark_dirty(x)
+            >>>         return x
+            >>>
+            >>>     @staticmethod
+            >>>     @once_differentiable
+            >>>     def backward(ctx, grad_output):
+            >>>         return grad_output
+            >>>
+            >>> a = torch.tensor(1., requires_grad=True, dtype=torch.double).clone()
+            >>> b = a * a
+            >>> Inplace.apply(a)  # This would lead to wrong gradients!
+            >>>                   # but the engine would not know unless we mark_dirty
+            >>> # xdoctest: +SKIP
+            >>> b.backward() # RuntimeError: one of the variables needed for gradient
+            >>>              # computation has been modified by an inplace operation
+
+        """
+        self.dirty_tensors = args
+
+    def mark_shared_storage(self, *pairs):
+        warnings.warn(
+            "mark_shared_storage is deprecated. "
+            "Tensors with shared storages are automatically tracked. Note "
+            "that calls to `set_()` are not tracked"
+        )
+
+    def mark_non_differentiable(self, *args: torch.Tensor):
+        r"""Mark outputs as non-differentiable.
+
+        **This should be called at most once, only from inside the**
+        :func:`forward` **method, and all arguments should be tensor outputs.**
+
+        This will mark outputs as not requiring gradients, increasing the
+        efficiency of backward computation. You still need to accept a gradient
+        for each output in :meth:`~Function.backward`, but it's always going to
+        be a zero tensor with the same shape as the corresponding output.
+
+        This is used e.g. for indices returned from a sort. See example::
+            >>> class Func(Function):
+            >>>     @staticmethod
+            >>>     def forward(ctx, x):
+            >>>         sorted, idx = x.sort()
+            >>>         ctx.mark_non_differentiable(idx)
+            >>>         ctx.save_for_backward(x, idx)
+            >>>         return sorted, idx
+            >>>
+            >>>     @staticmethod
+            >>>     @once_differentiable
+            >>>     def backward(ctx, g1, g2):  # still need to accept g2
+            >>>         x, idx = ctx.saved_tensors
+            >>>         grad_input = torch.zeros_like(x)
+            >>>         grad_input.index_add_(0, idx, g1)
+            >>>         return grad_input
+
+        """
+        self.non_differentiable = args
+
+    def set_materialize_grads(self, value: bool):
+        r"""Set whether to materialize grad tensors. Default is ``True``.
+
+        **This should be called only from inside the** :func:`forward` **method**
+
+        If ``True``, undefined grad tensors will be expanded to tensors full of zeros
+        prior to calling the :func:`backward` and :func:`jvp` methods.
+
+        Example::
+            >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
+            >>> class SimpleFunc(Function):
+            >>>     @staticmethod
+            >>>     def forward(ctx, x):
+            >>>         return x.clone(), x.clone()
+            >>>
+            >>>     @staticmethod
+            >>>     @once_differentiable
+            >>>     def backward(ctx, g1, g2):
+            >>>         return g1 + g2  # No check for None necessary
+            >>>
+            >>> # We modify SimpleFunc to handle non-materialized grad outputs
+            >>> class Func(Function):
+            >>>     @staticmethod
+            >>>     def forward(ctx, x):
+            >>>         ctx.set_materialize_grads(False)
+            >>>         ctx.save_for_backward(x)
+            >>>         return x.clone(), x.clone()
+            >>>
+            >>>     @staticmethod
+            >>>     @once_differentiable
+            >>>     def backward(ctx, g1, g2):
+            >>>         x, = ctx.saved_tensors
+            >>>         grad_input = torch.zeros_like(x)
+            >>>         if g1 is not None:  # We must check for None now
+            >>>             grad_input += g1
+            >>>         if g2 is not None:
+            >>>             grad_input += g2
+            >>>         return grad_input
+            >>>
+            >>> a = torch.tensor(1., requires_grad=True)
+            >>> b, _ = Func.apply(a)  # induces g2 to be undefined
+
+        """
+        self.materialize_grads = value
+
+
+# DO NOT USE: This is only defined to be able to load old serialized models
+_ContextMethodMixin = FunctionCtx
+
+
+class _HookMixin:
+    @staticmethod
+    def _register_hook(backward_hooks, hook):
+        if backward_hooks is None:
+            backward_hooks = OrderedDict()
+        handle = hooks.RemovableHandle(backward_hooks)
+        backward_hooks[handle.id] = hook
+        return backward_hooks, handle
+
+
+class BackwardCFunction(_C._FunctionBase, FunctionCtx, _HookMixin):
+    r"""
+    This class is used for internal autograd work. Do not use.
+    """
+
+    def apply(self, *args):
+        r"""
+        Apply method used when executing this Node during the backward
+        """
+        # _forward_cls is defined by derived class
+        # The user should define either backward or vjp but never both.
+        backward_fn = self._forward_cls.backward  # type: ignore[attr-defined]
+        vjp_fn = self._forward_cls.vjp  # type: ignore[attr-defined]
+        if backward_fn is not Function.backward and vjp_fn is not Function.vjp:
+            raise RuntimeError(
+                "Implementing both 'backward' and 'vjp' for a custom "
+                "Function is not allowed. You should only implement one "
+                "of them."
+            )
+        user_fn = vjp_fn if vjp_fn is not Function.vjp else backward_fn
+        return user_fn(self, *args)
+
+    def apply_jvp(self, *args):
+        r"""
+        Apply method used when executing forward mode AD during the forward
+        """
+        # _forward_cls is defined by derived class
+        return self._forward_cls.jvp(self, *args)  # type: ignore[attr-defined]
+
+    def _compiled_autograd_key(self):
+        return self._forward_cls._compiled_autograd_key(self)  # type: ignore[attr-defined]
+
+
+def _warn_traceable_deprecated():
+    warnings.warn(
+        "The is_traceable field on torch.autograd.Function is deprecated "
+        "and will be removed in PyTorch 2.4.",
+        stacklevel=3,
+    )
+
+
+class FunctionMeta(type):
+    """Function metaclass.
+
+    This metaclass sets up the following properties:
+        _backward_cls: The Function class corresponding to the differentiated
+            version of this function (which is generated on the fly by this
+            metaclass).
+    """
+
+    def __init__(cls, name, bases, attrs):
+        backward_fn = type(
+            name + "Backward", (BackwardCFunction,), {"_forward_cls": cls}
+        )
+        backward_fn._autograd_function_id = next(AUTOGRAD_FUNCTION_COUNTER)  # type: ignore[attr-defined]
+        backward_fn._compiled_autograd_should_lift = attrs.get(  # type: ignore[attr-defined]
+            "_compiled_autograd_should_lift", True
+        )
+        cls._backward_cls = backward_fn
+
+        if "is_traceable" in attrs and attrs["is_traceable"] is True:
+            _warn_traceable_deprecated()
+
+        super().__init__(name, bases, attrs)
+
+    def __getattribute__(cls, name):
+        if name == "is_traceable":
+            _warn_traceable_deprecated()
+        return super().__getattribute__(name)
+
+    def __setattr__(cls, name, value):
+        if name == "is_traceable" and value is True:
+            warnings.warn(
+                "The is_traceable field on torch.autograd.Function is deprecated "
+                "and will be removed in PyTorch 2.4.",
+                stacklevel=2,
+            )
+        return super().__setattr__(name, value)
+
+
+class _SingleLevelFunction(
+    _C._FunctionBase, FunctionCtx, _HookMixin, metaclass=FunctionMeta
+):
+    @staticmethod
+    def forward(ctx: Any, *args: Any, **kwargs: Any) -> Any:
+        r"""Define the forward of the custom autograd Function.
+
+        This function is to be overridden by all subclasses.
+        There are two ways to define forward:
+
+        Usage 1 (Combined forward and ctx)::
+
+            @staticmethod
+            def forward(ctx: Any, *args: Any, **kwargs: Any) -> Any:
+                pass
+
+        - It must accept a context ctx as the first argument, followed by any
+          number of arguments (tensors or other types).
+        - See :ref:`combining-forward-context` for more details
+
+        Usage 2 (Separate forward and ctx)::
+
+            @staticmethod
+            def forward(*args: Any, **kwargs: Any) -> Any:
+                pass
+
+            @staticmethod
+            def setup_context(ctx: Any, inputs: Tuple[Any, ...], output: Any) -> None:
+                pass
+
+        - The forward no longer accepts a ctx argument.
+        - Instead, you must also override the :meth:`torch.autograd.Function.setup_context`
+          staticmethod to handle setting up the ``ctx`` object.
+          ``output`` is the output of the forward, ``inputs`` are a Tuple of inputs
+          to the forward.
+        - See :ref:`extending-autograd` for more details
+
+        The context can be used to store arbitrary data that can be then
+        retrieved during the backward pass. Tensors should not be stored
+        directly on `ctx` (though this is not currently enforced for
+        backward compatibility). Instead, tensors should be saved either with
+        :func:`ctx.save_for_backward` if they are intended to be used in
+        ``backward`` (equivalently, ``vjp``) or :func:`ctx.save_for_forward`
+        if they are intended to be used in ``jvp``.
+        """
+        raise NotImplementedError(
+            "You must implement the forward function for custom autograd.Function."
+        )
+
+    @staticmethod
+    def setup_context(ctx: Any, inputs: Tuple[Any, ...], output: Any) -> Any:
+        r"""There are two ways to define the forward pass of an autograd.Function.
+
+        Either:
+
+        1. Override forward with the signature ``forward(ctx, *args, **kwargs)``.
+           ``setup_context`` is not overridden. Setting up the ctx for backward
+           happens inside the ``forward``.
+        2. Override forward with the signature ``forward(*args, **kwargs)`` and
+           override ``setup_context``. Setting up the ctx for backward happens
+           inside ``setup_context`` (as opposed to inside the ``forward``)
+
+        See :meth:`torch.autograd.Function.forward` and :ref:`extending-autograd` for more details.
+        """
+        raise NotImplementedError("setup_context is not implemented.")
+
+    @staticmethod
+    def backward(ctx: Any, *grad_outputs: Any) -> Any:
+        r"""Define a formula for differentiating the operation with backward mode automatic differentiation.
+
+        This function is to be overridden by all subclasses.
+        (Defining this function is equivalent to defining the ``vjp`` function.)
+
+        It must accept a context :attr:`ctx` as the first argument, followed by
+        as many outputs as the :func:`forward` returned (None will be passed in
+        for non tensor outputs of the forward function),
+        and it should return as many tensors, as there were inputs to
+        :func:`forward`. Each argument is the gradient w.r.t the given output,
+        and each returned value should be the gradient w.r.t. the
+        corresponding input. If an input is not a Tensor or is a Tensor not
+        requiring grads, you can just pass None as a gradient for that input.
+
+        The context can be used to retrieve tensors saved during the forward
+        pass. It also has an attribute :attr:`ctx.needs_input_grad` as a tuple
+        of booleans representing whether each input needs gradient. E.g.,
+        :func:`backward` will have ``ctx.needs_input_grad[0] = True`` if the
+        first input to :func:`forward` needs gradient computed w.r.t. the
+        output.
+        """
+        raise NotImplementedError(
+            "You must implement either the backward or vjp method for "
+            "your custom autograd.Function to use it with backward "
+            "mode AD."
+        )
+
+    # vjp and backward are alias of each other
+    vjp = backward
+
+    @staticmethod
+    def jvp(ctx: Any, *grad_inputs: Any) -> Any:
+        r"""Define a formula for differentiating the operation with forward mode automatic differentiation.
+
+        This function is to be overridden by all subclasses.
+        It must accept a context :attr:`ctx` as the first argument, followed by
+        as many inputs as the :func:`forward` got (None will be passed in
+        for non tensor inputs of the forward function),
+        and it should return as many tensors as there were outputs to
+        :func:`forward`. Each argument is the gradient w.r.t the given input,
+        and each returned value should be the gradient w.r.t. the
+        corresponding output. If an output is not a Tensor or the function is not
+        differentiable with respect to that output, you can just pass None as a
+        gradient for that output.
+
+        You can use the :attr:`ctx` object to pass any value from the forward to this
+        function.
+        """
+        raise NotImplementedError(
+            "You must implement the jvp function for custom "
+            "autograd.Function to use it with forward mode AD."
+        )
+
+
+class Function(_SingleLevelFunction):
+    r"""Base class to create custom `autograd.Function`.
+
+    To create a custom `autograd.Function`, subclass this class and implement
+    the :meth:`forward` and :meth:`backward` static methods. Then, to use your custom
+    op in the forward pass, call the class method ``apply``. Do not call
+    :meth:`forward` directly.
+
+    To ensure correctness and best performance, make sure you are calling the
+    correct methods on ``ctx`` and validating your backward function using
+    :func:`torch.autograd.gradcheck`.
+
+    See :ref:`extending-autograd` for more details on how to use this class.
+
+    Examples::
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
+        >>> class Exp(Function):
+        >>>     @staticmethod
+        >>>     def forward(ctx, i):
+        >>>         result = i.exp()
+        >>>         ctx.save_for_backward(result)
+        >>>         return result
+        >>>
+        >>>     @staticmethod
+        >>>     def backward(ctx, grad_output):
+        >>>         result, = ctx.saved_tensors
+        >>>         return grad_output * result
+        >>>
+        >>> # Use it by calling the apply method:
+        >>> # xdoctest: +SKIP
+        >>> output = Exp.apply(input)
+    """
+
+    def __init__(self, *args, **kwargs):
+        cls = self.__class__
+        warnings.warn(
+            f"{cls} should not be instantiated. Methods on autograd functions"
+            "are all static, so you should invoke them on the class itself. "
+            "Instantiating an autograd function will raise an "
+            "error in a future version of PyTorch.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+    def __call__(self, *args, **kwargs):
+        raise RuntimeError(
+            "Legacy autograd function with non-static forward method is deprecated. "
+            "Please use new-style autograd function with static forward method. "
+            "(Example: https://pytorch.org/docs/stable/autograd.html#torch.autograd.Function)"
+        )
+
+    # for the tracer
+    is_traceable = False
+
+    """
+    Bool that specifies if PyTorch should attempt to autogenerate
+    :func:`torch.vmap` support for this autograd.Function. You may set this to
+    True only if this autograd.Function's forward, backward, and jvp (if they
+    exist) are written using PyTorch operations; otherwise, please override
+    :meth:`torch.autograd.Function.vmap` to add support for :func:`torch.vmap`.
+
+    Please see :ref:`func-autograd-function` for more details.
+    """
+    generate_vmap_rule = False
+
+    @staticmethod
+    def vmap(info, in_dims, *args):
+        r"""Define the behavior for this autograd.Function underneath :func:`torch.vmap`.
+
+        For a :func:`torch.autograd.Function` to support
+        :func:`torch.vmap`, you must either override this static method, or set
+        ``generate_vmap_rule`` to ``True`` (you may not do both).
+
+        If you choose to override this staticmethod: it must accept
+
+        - an ``info`` object as the first argument. ``info.batch_size``
+          specifies the size of the dimension being vmapped over,
+          while ``info.randomness`` is the randomness option passed to
+          :func:`torch.vmap`.
+        - an ``in_dims`` tuple as the second argument.
+          For each arg in ``args``, ``in_dims`` has a corresponding
+          ``Optional[int]``. It is ``None`` if the arg is not a Tensor or if
+          the arg is not being vmapped over, otherwise, it is an integer
+          specifying what dimension of the Tensor is being vmapped over.
+        - ``*args``, which is the same as the args to :meth:`~Function.forward`.
+
+        The return of the vmap staticmethod is a tuple of ``(output, out_dims)``.
+        Similar to ``in_dims``, ``out_dims`` should be of the same structure as
+        ``output`` and contain one ``out_dim`` per output that specifies if the
+        output has the vmapped dimension and what index it is in.
+
+        Please see :ref:`func-autograd-function` for more details.
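+
+        A minimal illustrative sketch for a simple elementwise op (the name
+        ``MyExp`` is a placeholder)::
+
+            >>> # xdoctest: +SKIP
+            >>> class MyExp(torch.autograd.Function):
+            >>>     @staticmethod
+            >>>     def forward(x):
+            >>>         return x.exp()
+            >>>
+            >>>     @staticmethod
+            >>>     def setup_context(ctx, inputs, output):
+            >>>         ctx.save_for_backward(output)
+            >>>
+            >>>     @staticmethod
+            >>>     def backward(ctx, grad_output):
+            >>>         y, = ctx.saved_tensors
+            >>>         return grad_output * y
+            >>>
+            >>>     @staticmethod
+            >>>     def vmap(info, in_dims, x):
+            >>>         # exp is elementwise, so the batched input can be used
+            >>>         # directly; the output keeps the same batched dim.
+            >>>         return x.exp(), in_dims[0]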
+        """
+        raise NotImplementedError(
+            "To use autograd.Function with vmap, you must either override the "
+            "vmap staticmethod or set generate_vmap_rule=True."
+        )
+
+    @classmethod
+    def apply(cls, *args, **kwargs):
+        def bind_default_args(func, *args, **kwargs):
+            signature = inspect.signature(func)
+            bound_args = signature.bind(*args, **kwargs)
+            bound_args.apply_defaults()
+
+            return bound_args.args
+
+        is_setup_ctx_defined = cls.setup_context != _SingleLevelFunction.setup_context
+        if is_setup_ctx_defined:
+            args = bind_default_args(cls.forward, *args, **kwargs)
+
+        if not torch._C._are_functorch_transforms_active():
+            # See NOTE: [functorch vjp and autograd interaction]
+            args = _functorch.utils.unwrap_dead_wrappers(args)
+            return super().apply(*args, **kwargs)  # type: ignore[misc]
+
+        if not is_setup_ctx_defined:
+            raise RuntimeError(
+                "In order to use an autograd.Function with functorch transforms "
+                "(vmap, grad, jvp, jacrev, ...), it must override the setup_context "
+                "staticmethod. For more details, please see "
+                "https://pytorch.org/docs/master/notes/extending.func.html"
+            )
+
+        return custom_function_call(cls, *args, **kwargs)
+
+    @staticmethod
+    def _compiled_autograd_key(ctx):
+        return (ctx._autograd_function_id,)
+
+
+def once_differentiable(fn):
+    @functools.wraps(fn)
+    def wrapper(ctx, *args):
+        with torch.no_grad():
+            outputs = fn(ctx, *args)
+
+        if not torch.is_grad_enabled():
+            return outputs
+
+        # If any of the inputs have requires_grad=True, we force the outputs
+        # to have requires_grad=True but point to a grad_fn which throws an
+        # error message during (double) back-propagation.
+        # XXX: this is only an approximation of requires_grad - there's no way
+        # to figure out if fn didn't use ctx.saved_tensors and as a result
+        # some Tensors might require grad, even if no args do.
+        # Unfortunately, this leads to unexpected error messages ("no nodes
+        # require computing gradients"), but I don't have a better idea.
+        # These functions would raise an error in backward anyway.
+        requires_grad = any(
+            isinstance(arg, torch.Tensor) and arg.requires_grad for arg in args
+        )
+        if not requires_grad:
+            return outputs
+
+        if not isinstance(outputs, tuple):
+            outputs = (outputs,)
+
+        err_fn = _functions.DelayedError(
+            b"trying to differentiate twice a function that was marked "
+            b"with @once_differentiable",
+            len(outputs),
+        )
+
+        # Create aliases of each output that has requires_grad=True. We need
+        # at least one of the inputs to err_fn to require grad so that the
+        # output will have a grad_fn.
+        def fake_requires_grad(var):
+            if var is not None:
+                var = var.detach()
+                var.requires_grad = True
+            return var
+
+        return err_fn(*[fake_requires_grad(v) for v in outputs])
+
+    return wrapper
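+
+
+# A minimal illustrative usage sketch (the name `Exp` is a placeholder):
+# `once_differentiable` is stacked under `@staticmethod` on a custom
+# Function's backward so that attempting a second differentiation raises a
+# clear error instead of silently producing wrong higher-order gradients:
+#
+#     class Exp(Function):
+#         @staticmethod
+#         def forward(ctx, x):
+#             y = x.exp()
+#             ctx.save_for_backward(y)
+#             return y
+#
+#         @staticmethod
+#         @once_differentiable
+#         def backward(ctx, grad_output):
+#             y, = ctx.saved_tensors
+#             return grad_output * y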
+
+
+def traceable(fn_cls):
+    r"""Mark Function as traceable for the JIT.
+
+    Traceable functions have additional restrictions - they can't pass any
+    data-dependent values to backward (e.g. Prod passes the output, which makes
+    it non-traceable), and their backward should be implemented entirely in terms
+    of operations on autograd Tensors in all cases.
+
+    DON'T USE THIS DECORATOR. IT IS FOR INTERNAL USE ONLY AND SHOULD BE HANDLED WITH
+    CARE (it can give incorrect results otherwise).
+    """
+    warnings.warn(
+        "torch.autograd.function.traceable is deprecated "
+        "and will be removed in PyTorch 2.4.",
+        stacklevel=2,
+    )
+    fn_cls.is_traceable = True
+    return fn_cls
+
+
+class InplaceFunction(Function):
+    r"""
+    This class is here only for backward compatibility reasons.
+    Use :class:`Function` instead of this for any new use case.
+    """
+
+    def __init__(self, inplace=False):
+        super().__init__()
+        self.inplace = inplace
+
+
+def _nested_map(condition, fn, condition_msg=None):
+    def _map(obj):
+        if condition(obj):
+            return fn(obj)
+        elif obj is None:
+            return None
+        elif isinstance(obj, (list, tuple)):
+            mapped = (_map(x) for x in obj)
+            if hasattr(obj, "_fields"):
+                # obj is namedtuple
+                return type(obj)(*mapped)
+            return type(obj)(mapped)
+        elif isinstance(obj, dict):
+            return {x: _map(obj[x]) for x in obj}
+        else:
+            raise ValueError(
+                "Auto nesting doesn't know how to process "
+                "an input object of type "
+                + torch.typename(obj)
+                + (
+                    ". Accepted types: " + condition_msg + ", or lists/tuples of them"
+                    if condition_msg
+                    else ""
+                )
+            )
+
+    return _map
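+
+
+# Illustrative example of the helper above (`a` and `b` stand for Tensors):
+#     _nested_map(lambda t: isinstance(t, torch.Tensor), lambda t: t * 2)([a, (b, None)])
+# returns [a * 2, (b * 2, None)], preserving the container structure, and raises
+# ValueError for any container type it does not recognize.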
+
+
+def _jit_unwrap_structured(obj):
+    if hasattr(obj, "_jit_unwrap"):
+        return obj._jit_unwrap()
+    return obj
+
+
+def _iter_filter(condition, allow_unknown=False, condition_msg=None, conversion=None):
+    def _iter(obj):
+        if conversion is not None:
+            obj = conversion(obj)
+        if condition(obj):
+            yield obj
+        elif obj is None:
+            return
+        elif isinstance(obj, (list, tuple)):
+            for o in obj:
+                yield from _iter(o)
+        elif isinstance(obj, dict):
+            # We only accept primitive key types, so we needn't inspect them
+            for o in obj.values():
+                yield from _iter(o)
+        elif allow_unknown:
+            yield obj
+        else:
+            raise ValueError(
+                "Auto nesting doesn't know how to process "
+                "an input object of type "
+                + torch.typename(obj)
+                + (
+                    ". Accepted types: " + condition_msg + ", or lists/tuples of them"
+                    if condition_msg
+                    else ""
+                )
+            )
+
+    return _iter
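+
+
+# Illustrative example of the helper above (`t1` and `t2` stand for Tensors):
+#     it = _iter_filter(lambda x: isinstance(x, torch.Tensor))
+#     list(it({"a": t1, "b": [t2, None]}))  # -> [t1, t2]
+# Leaves that fail the condition raise ValueError unless allow_unknown=True.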
+
+
+def _unflatten(input, proto):
+    # unflatten a list or tuple input into a nested list/tuple structure
+    # specified by proto
+    def unflatten_helper(input, proto):
+        res: List[Optional[torch.Tensor]] = []
+        if hasattr(proto, "_jit_wrap"):
+            return proto._jit_wrap(input)
+        if not isinstance(proto, (list, tuple)):
+            return input[0], input[1:]
+        for e in proto:
+            if e is None:
+                res.append(e)
+            else:
+                res_e, input = unflatten_helper(input, e)
+                res.append(res_e)
+        return type(proto)(res), input
+
+    return unflatten_helper(input, proto)[0]
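+
+
+# Illustrative example of the helper above (`t1`..`t3` and `ref_*` stand for Tensors):
+#     _unflatten((t1, t2, t3), [ref_a, (ref_b, ref_c)])  # -> [t1, (t2, t3)]
+# i.e. the flat sequence is re-nested to mirror the structure of `proto`.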
+
+
+_iter_jit_values = _iter_filter(
+    lambda o: o is None or isinstance(o, torch._C.Value),
+    condition_msg="jit's Values or None",
+)
+_iter_tensors = _iter_filter(
+    lambda x: isinstance(x, torch.Tensor),
+    condition_msg="Tensors",
+    conversion=_jit_unwrap_structured,
+)
+_iter_tensors_permissive = _iter_filter(
+    lambda x: isinstance(x, torch.Tensor),
+    allow_unknown=True,
+    condition_msg="Tensors (permissive)",
+)
+_iter_None_tensors = _iter_filter(
+    lambda o: o is None or isinstance(o, torch.Tensor), condition_msg="Tensors or None"
+)
+_map_tensor_data = _nested_map(
+    lambda x: isinstance(x, torch.Tensor), lambda o: o.data, condition_msg="Tensors"
+)
+
+
+class NestedIOFunction(Function):
+    r"""
+    This class is here only for backward compatibility reasons.
+    Use :class:`Function` instead of this for any new use case.
+    """
+    # The 'type: ignore' statements are needed here because these functions are declared as '@staticmethod' in the
+    # superclass (Function) but are instance methods here, which mypy reports as incompatible.
+
+    def _do_forward(self, *input):
+        self._nested_input = input
+        flat_input = tuple(_iter_tensors(input))
+        flat_output = super()._do_forward(*flat_input)  # type: ignore[misc]
+        nested_tensors = _unflatten(flat_output, self._nested_output)
+        return nested_tensors
+
+    def _do_backward(self, gradients, retain_variables):
+        self.retain_variables = retain_variables
+        result = super()._do_backward(gradients, retain_variables)  # type: ignore[misc]
+        if not retain_variables:
+            del self._nested_output
+            del self._to_save_nested
+        return result
+
+    def backward(self, *gradients: Any) -> Any:  # type: ignore[override]
+        r"""
+        Shared backward utility.
+        """
+        nested_gradients = _unflatten(gradients, self._nested_output)
+        result = self.backward_extended(*nested_gradients)  # type: ignore[func-returns-value]
+        return tuple(_iter_None_tensors(result))
+
+    __call__ = _do_forward
+
+    def forward(self, *args: Any) -> Any:  # type: ignore[override]
+        r"""
+        Shared forward utility.
+        """
+        nested_tensors = _map_tensor_data(self._nested_input)
+        result = self.forward_extended(*nested_tensors)  # type: ignore[func-returns-value]
+        del self._nested_input
+        self._nested_output = result
+        return tuple(_iter_tensors(result))
+
+    def save_for_backward(self, *args: Any) -> None:
+        r"""
+        See :meth:`Function.save_for_backward`.
+        """
+        self.to_save = tuple(_iter_tensors(args))
+        self._to_save_nested = args
+
+    @property
+    def saved_tensors(self):
+        r"""
+        See :meth:`Function.saved_tensors`.
+        """
+        flat_tensors = super().saved_tensors  # type: ignore[misc]
+        return _unflatten(flat_tensors, self._to_save_nested)
+
+    def mark_dirty(self, *args: Any, **kwargs: Any) -> None:
+        r"""
+        See :meth:`Function.mark_dirty`.
+        """
+        self.dirty_tensors = tuple(_iter_tensors((args, kwargs)))
+
+    def mark_non_differentiable(self, *args: Any, **kwargs: Any) -> None:
+        r"""
+        See :meth:`Function.mark_non_differentiable`.
+        """
+        self.non_differentiable = tuple(_iter_tensors((args, kwargs)))
+
+    def forward_extended(self, *input: Any) -> None:
+        r"""
+        User defined forward.
+        """
+        raise NotImplementedError
+
+    def backward_extended(self, *grad_output: Any) -> None:
+        r"""
+        User defined backward.
+        """
+        raise NotImplementedError
diff --git a/MLPY/Lib/site-packages/torch/autograd/functional.py b/MLPY/Lib/site-packages/torch/autograd/functional.py
new file mode 100644
index 0000000000000000000000000000000000000000..23d817dbef15407efc63e8397da87dd5fcc5b99a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/autograd/functional.py
@@ -0,0 +1,1182 @@
+from typing import List, Tuple
+
+import torch
+from torch._vmap_internals import _vmap
+from . import forward_ad as fwAD
+
+__all__ = ["vjp", "jvp", "jacobian", "hessian", "hvp", "vhp"]
+
+# Utility functions
+
+
+def _as_tuple_nocheck(x):
+    if isinstance(x, tuple):
+        return x
+    elif isinstance(x, list):
+        return tuple(x)
+    else:
+        return (x,)
+
+
+def _as_tuple(inp, arg_name=None, fn_name=None):
+    # Ensures that inp is a tuple of Tensors
+    # Returns whether or not the original inp was a tuple and the tupled version of the input
+    if arg_name is None and fn_name is None:
+        return _as_tuple_nocheck(inp)
+
+    is_inp_tuple = True
+    if not isinstance(inp, tuple):
+        inp = (inp,)
+        is_inp_tuple = False
+
+    for i, el in enumerate(inp):
+        if not isinstance(el, torch.Tensor):
+            if is_inp_tuple:
+                raise TypeError(
+                    f"The {arg_name} given to {fn_name} must be either a Tensor or a tuple of Tensors but the"
+                    f" value at index {i} has type {type(el)}."
+                )
+            else:
+                raise TypeError(
+                    f"The {arg_name} given to {fn_name} must be either a Tensor or a tuple of Tensors but the"
+                    f" given {arg_name} has type {type(el)}."
+                )
+
+    return is_inp_tuple, inp
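+
+
+# Illustrative example of the helper above (`t` stands for a Tensor):
+#     _as_tuple(t, "inputs", "vjp")       # -> (False, (t,))
+#     _as_tuple((t, t), "inputs", "vjp")  # -> (True, (t, t))
+# Non-Tensor entries raise a TypeError naming the offending argument and function.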
+
+
+def _tuple_postprocess(res, to_unpack):
+    # Unpacks a potentially nested tuple of Tensors
+    # to_unpack should be a single boolean or a tuple of two booleans.
+    # It is used to:
+    # - invert _as_tuple when res should match the inp given to _as_tuple
+    # - optionally remove nesting of two tuples created by multiple calls to _as_tuple
+    if isinstance(to_unpack, tuple):
+        assert len(to_unpack) == 2
+        if not to_unpack[1]:
+            res = tuple(el[0] for el in res)
+        if not to_unpack[0]:
+            res = res[0]
+    else:
+        if not to_unpack:
+            res = res[0]
+    return res
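+
+
+# Illustrative example of the helper above (`t` stands for a Tensor):
+#     _tuple_postprocess((t,), False)             # -> t (single element unpacked)
+#     _tuple_postprocess(((t,),), (True, False))  # -> (t,) (inner nesting removed)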
+
+
+def _grad_preprocess(inputs, create_graph, need_graph):
+    # Preprocess the inputs to make sure they require gradient
+    # inputs is a tuple of Tensors to preprocess
+    # create_graph specifies if the user wants gradients to flow back to the Tensors in inputs
+    # need_graph specifies if we internally want gradients to flow back to the Tensors in res
+    # Note that we *always* create a new Tensor object to be able to see the difference between
+    # inputs given as arguments and the same Tensors automatically captured by the user function.
+    # Check this issue for more details on how that can happen: https://github.com/pytorch/pytorch/issues/32576
+    res = []
+    for inp in inputs:
+        if create_graph and inp.requires_grad:
+            # Create at least a new Tensor object in a differentiable way
+            if not inp.is_sparse:
+                # Use .view_as() to get a shallow copy
+                res.append(inp.view_as(inp))
+            else:
+                # We cannot use view for sparse Tensors so we clone
+                res.append(inp.clone())
+        else:
+            res.append(inp.detach().requires_grad_(need_graph))
+    return tuple(res)
+
+
+def _grad_postprocess(inputs, create_graph):
+    # Postprocess the generated Tensors to avoid returning Tensors with history when the user did not
+    # request it.
+    if isinstance(inputs[0], torch.Tensor):
+        if not create_graph:
+            return tuple(inp.detach() for inp in inputs)
+        else:
+            return inputs
+    else:
+        return tuple(_grad_postprocess(inp, create_graph) for inp in inputs)
+
+
+def _validate_v(v, other, is_other_tuple):
+    # This assumes that other is the correct shape, and v should match
+    # Both are assumed to be tuples of Tensors
+    if len(other) != len(v):
+        if is_other_tuple:
+            raise RuntimeError(
+                f"v is a tuple of invalid length: should be {len(other)} but got {len(v)}."
+            )
+        else:
+            raise RuntimeError("The given v should contain a single Tensor.")
+
+    for idx, (el_v, el_other) in enumerate(zip(v, other)):
+        if el_v.size() != el_other.size():
+            prepend = ""
+            if is_other_tuple:
+                prepend = f"Entry {idx} in "
+            raise RuntimeError(
+                f"{prepend}v has invalid size: should be {el_other.size()} but got {el_v.size()}."
+            )
+
+
+def _check_requires_grad(inputs, input_type, strict):
+    # Used to make all the necessary checks to raise nice errors in strict mode.
+    if not strict:
+        return
+
+    if input_type not in ["outputs", "grad_inputs", "jacobian", "hessian"]:
+        raise RuntimeError("Invalid input_type to _check_requires_grad")
+    for i, inp in enumerate(inputs):
+        if inp is None:
+            # This can only be reached for grad_inputs.
+            raise RuntimeError(
+                f"The output of the user-provided function is independent of input {i}."
+                " This is not allowed in strict mode."
+            )
+        if not inp.requires_grad:
+            if input_type == "hessian":
+                raise RuntimeError(
+                    f"The hessian of the user-provided function with respect to input {i}"
+                    " is independent of the input. This is not allowed in strict mode."
+                    " You should ensure that your function is thrice differentiable and that"
+                    " the hessian depends on the inputs."
+                )
+            elif input_type == "jacobian":
+                raise RuntimeError(
+                    "While computing the hessian, found that the jacobian of the user-provided"
+                    f" function with respect to input {i} is independent of the input. This is not"
+                    " allowed in strict mode. You should ensure that your function is twice"
+                    " differentiable and that the jacobian depends on the inputs (this would be"
+                    " violated by a linear function for example)."
+                )
+            elif input_type == "grad_inputs":
+                raise RuntimeError(
+                    f"The gradient with respect to input {i} is independent of the inputs of the"
+                    " user-provided function. This is not allowed in strict mode."
+                )
+            else:
+                raise RuntimeError(
+                    f"Output {i} of the user-provided function does not require gradients."
+                    " The outputs must be computed in a differentiable manner from the input"
+                    " when running in strict mode."
+                )
+
+
+def _autograd_grad(
+    outputs,
+    inputs,
+    grad_outputs=None,
+    create_graph=False,
+    retain_graph=None,
+    is_grads_batched=False,
+):
+    # Version of autograd.grad that accepts `None` in outputs and do not compute gradients for them.
+    # This has the extra constraint that inputs has to be a tuple
+    assert isinstance(outputs, tuple)
+    if grad_outputs is None:
+        grad_outputs = (None,) * len(outputs)
+    assert isinstance(grad_outputs, tuple)
+    assert len(outputs) == len(grad_outputs)
+
+    new_outputs: Tuple[torch.Tensor, ...] = tuple()
+    new_grad_outputs: Tuple[torch.Tensor, ...] = tuple()
+    for out, grad_out in zip(outputs, grad_outputs):
+        if out is not None and out.requires_grad:
+            new_outputs += (out,)
+            new_grad_outputs += (grad_out,)
+
+    if len(new_outputs) == 0:
+        # No differentiable output, we don't need to call the autograd engine
+        return (None,) * len(inputs)
+    else:
+        return torch.autograd.grad(
+            new_outputs,
+            inputs,
+            new_grad_outputs,
+            allow_unused=True,
+            create_graph=create_graph,
+            retain_graph=retain_graph,
+            is_grads_batched=is_grads_batched,
+        )
+
+
+def _fill_in_zeros(grads, refs, strict, create_graph, stage):
+    # Used to detect None in the grads and depending on the flags, either replace them
+    # with Tensors full of 0s of the appropriate size based on the refs or raise an error.
+    # strict and create graph allow us to detect when it is appropriate to raise an error
+    # stage gives us information of which backward call we consider to give good error message
+    if stage not in ["back", "back_trick", "double_back", "double_back_trick"]:
+        raise RuntimeError(f"Invalid stage argument '{stage}' to _fill_in_zeros")
+
+    res: Tuple[torch.Tensor, ...] = tuple()
+    for i, grads_i in enumerate(grads):
+        if grads_i is None:
+            if strict:
+                if stage == "back":
+                    raise RuntimeError(
+                        "The output of the user-provided function is independent of "
+                        f"input {i}. This is not allowed in strict mode."
+                    )
+                elif stage == "back_trick":
+                    raise RuntimeError(
+                        f"The gradient with respect to the input is independent of entry {i}"
+                        " in the grad_outputs when using the double backward trick to compute"
+                        " forward mode gradients. This is not allowed in strict mode."
+                    )
+                elif stage == "double_back":
+                    raise RuntimeError(
+                        "The jacobian of the user-provided function is independent of "
+                        f"input {i}. This is not allowed in strict mode."
+                    )
+                else:
+                    raise RuntimeError(
+                        "The hessian of the user-provided function is independent of "
+                        f"entry {i} in the grad_jacobian. This is not allowed in strict "
+                        "mode as it prevents from using the double backward trick to "
+                        "replace forward mode AD."
+                    )
+
+            grads_i = torch.zeros_like(refs[i])
+        else:
+            if strict and create_graph and not grads_i.requires_grad:
+                if "double" not in stage:
+                    raise RuntimeError(
+                        "The jacobian of the user-provided function is independent of "
+                        f"input {i}. This is not allowed in strict mode when create_graph=True."
+                    )
+                else:
+                    raise RuntimeError(
+                        "The hessian of the user-provided function is independent of "
+                        f"input {i}. This is not allowed in strict mode when create_graph=True."
+                    )
+
+        res += (grads_i,)
+
+    return res
+
+
+# Public API
+
+
+def vjp(func, inputs, v=None, create_graph=False, strict=False):
+    r"""Compute the dot product between a vector ``v`` and the Jacobian of the given function at the point given by the inputs.
+
+    Args:
+        func (function): a Python function that takes Tensor inputs and returns
+            a tuple of Tensors or a Tensor.
+        inputs (tuple of Tensors or Tensor): inputs to the function ``func``.
+        v (tuple of Tensors or Tensor): The vector for which the vector
+            Jacobian product is computed.  Must be the same size as the output
+            of ``func``. This argument is optional when the output of ``func``
+            contains a single element and (if it is not provided) will be set
+            as a Tensor containing a single ``1``.
+        create_graph (bool, optional): If ``True``, both the output and result
+            will be computed in a differentiable way. Note that when ``strict``
+            is ``False``, the result may not require gradients or may be
+            disconnected from the inputs.  Defaults to ``False``.
+        strict (bool, optional): If ``True``, an error will be raised when we
+            detect that there exists an input such that all the outputs are
+            independent of it. If ``False``, we return a Tensor of zeros as the
+            vjp for said inputs, which is the expected mathematical value.
+            Defaults to ``False``.
+
+    Returns:
+        output (tuple): tuple with:
+            func_output (tuple of Tensors or Tensor): output of ``func(inputs)``
+
+            vjp (tuple of Tensors or Tensor): result of the dot product with
+            the same shape as the inputs.
+
+    Example:
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
+        >>> def exp_reducer(x):
+        ...     return x.exp().sum(dim=1)
+        >>> inputs = torch.rand(4, 4)
+        >>> v = torch.ones(4)
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> vjp(exp_reducer, inputs, v)
+        (tensor([5.7817, 7.2458, 5.7830, 6.7782]),
+         tensor([[1.4458, 1.3962, 1.3042, 1.6354],
+                [2.1288, 1.0652, 1.5483, 2.5035],
+                [2.2046, 1.1292, 1.1432, 1.3059],
+                [1.3225, 1.6652, 1.7753, 2.0152]]))
+
+        >>> vjp(exp_reducer, inputs, v, create_graph=True)
+        (tensor([5.7817, 7.2458, 5.7830, 6.7782], grad_fn=<SumBackward1>),
+         tensor([[1.4458, 1.3962, 1.3042, 1.6354],
+                [2.1288, 1.0652, 1.5483, 2.5035],
+                [2.2046, 1.1292, 1.1432, 1.3059],
+                [1.3225, 1.6652, 1.7753, 2.0152]], grad_fn=<MulBackward0>))
+
+        >>> def adder(x, y):
+        ...     return 2 * x + 3 * y
+        >>> inputs = (torch.rand(2), torch.rand(2))
+        >>> v = torch.ones(2)
+        >>> vjp(adder, inputs, v)
+        (tensor([2.4225, 2.3340]),
+         (tensor([2., 2.]), tensor([3., 3.])))
+    """
+    with torch.enable_grad():
+        is_inputs_tuple, inputs = _as_tuple(inputs, "inputs", "vjp")
+        inputs = _grad_preprocess(inputs, create_graph=create_graph, need_graph=True)
+
+        outputs = func(*inputs)
+        is_outputs_tuple, outputs = _as_tuple(
+            outputs, "outputs of the user-provided function", "vjp"
+        )
+        _check_requires_grad(outputs, "outputs", strict=strict)
+
+        if v is not None:
+            _, v = _as_tuple(v, "v", "vjp")
+            v = _grad_preprocess(v, create_graph=create_graph, need_graph=False)
+            _validate_v(v, outputs, is_outputs_tuple)
+        else:
+            if len(outputs) != 1 or outputs[0].nelement() != 1:
+                raise RuntimeError(
+                    "The vector v can only be None if the "
+                    "user-provided function returns "
+                    "a single Tensor with a single element."
+                )
+
+    enable_grad = True if create_graph else torch.is_grad_enabled()
+    with torch.set_grad_enabled(enable_grad):
+        grad_res = _autograd_grad(outputs, inputs, v, create_graph=create_graph)
+        vjp = _fill_in_zeros(grad_res, inputs, strict, create_graph, "back")
+
+    # Cleanup objects and return them to the user
+    outputs = _grad_postprocess(outputs, create_graph)
+    vjp = _grad_postprocess(vjp, create_graph)
+
+    return _tuple_postprocess(outputs, is_outputs_tuple), _tuple_postprocess(
+        vjp, is_inputs_tuple
+    )
+
+
+def jvp(func, inputs, v=None, create_graph=False, strict=False):
+    r"""Compute the dot product between the Jacobian of the given function at the point given by the inputs and a vector ``v``.
+
+    Args:
+        func (function): a Python function that takes Tensor inputs and returns
+            a tuple of Tensors or a Tensor.
+        inputs (tuple of Tensors or Tensor): inputs to the function ``func``.
+        v (tuple of Tensors or Tensor): The vector for which the Jacobian
+            vector product is computed. Must be the same size as the input of
+            ``func``. This argument is optional when the input to ``func``
+            contains a single element and (if it is not provided) will be set
+            as a Tensor containing a single ``1``.
+        create_graph (bool, optional): If ``True``, both the output and result
+            will be computed in a differentiable way. Note that when ``strict``
+            is ``False``, the result may not require gradients or may be
+            disconnected from the inputs.  Defaults to ``False``.
+        strict (bool, optional): If ``True``, an error will be raised when we
+            detect that there exists an input such that all the outputs are
+            independent of it. If ``False``, we return a Tensor of zeros as the
+            jvp for said inputs, which is the expected mathematical value.
+            Defaults to ``False``.
+
+    Returns:
+        output (tuple): tuple with:
+            func_output (tuple of Tensors or Tensor): output of ``func(inputs)``
+
+            jvp (tuple of Tensors or Tensor): result of the dot product with
+            the same shape as the output.
+
+    Note:
+        ``autograd.functional.jvp`` computes the jvp by using the backward of
+        the backward (sometimes called the double backwards trick). This is not
+        the most performant way of computing the jvp. Please consider using
+        :func:`torch.func.jvp` or the
+        :ref:`low-level forward-mode AD API <forward-mode-ad>` instead.
+
+    Example:
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
+        >>> def exp_reducer(x):
+        ...     return x.exp().sum(dim=1)
+        >>> inputs = torch.rand(4, 4)
+        >>> v = torch.ones(4, 4)
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> jvp(exp_reducer, inputs, v)
+        (tensor([6.3090, 4.6742, 7.9114, 8.2106]),
+         tensor([6.3090, 4.6742, 7.9114, 8.2106]))
+
+        >>> jvp(exp_reducer, inputs, v, create_graph=True)
+        (tensor([6.3090, 4.6742, 7.9114, 8.2106], grad_fn=<SumBackward1>),
+         tensor([6.3090, 4.6742, 7.9114, 8.2106], grad_fn=<SqueezeBackward1>))
+
+        >>> def adder(x, y):
+        ...     return 2 * x + 3 * y
+        >>> inputs = (torch.rand(2), torch.rand(2))
+        >>> v = (torch.ones(2), torch.ones(2))
+        >>> jvp(adder, inputs, v)
+        (tensor([2.2399, 2.5005]),
+         tensor([5., 5.]))
+
+    """
+    with torch.enable_grad():
+        is_inputs_tuple, inputs = _as_tuple(inputs, "inputs", "jvp")
+        inputs = _grad_preprocess(inputs, create_graph=create_graph, need_graph=True)
+
+        if v is not None:
+            _, v = _as_tuple(v, "v", "jvp")
+            v = _grad_preprocess(v, create_graph=create_graph, need_graph=False)
+            _validate_v(v, inputs, is_inputs_tuple)
+        else:
+            if len(inputs) != 1 or inputs[0].nelement() != 1:
+                raise RuntimeError(
+                    "The vector v can only be None if the input to "
+                    "the user-provided function is a single Tensor "
+                    "with a single element."
+                )
+
+        outputs = func(*inputs)
+        is_outputs_tuple, outputs = _as_tuple(
+            outputs, "outputs of the user-provided function", "jvp"
+        )
+        _check_requires_grad(outputs, "outputs", strict=strict)
+        # The backward is linear so the value of grad_outputs is not important as
+        # it won't appear in the double backward graph. We only need to ensure that
+        # it does not contain inf or nan.
+        grad_outputs = tuple(
+            torch.zeros_like(out, requires_grad=True) for out in outputs
+        )
+
+        grad_inputs = _autograd_grad(outputs, inputs, grad_outputs, create_graph=True)
+        _check_requires_grad(grad_inputs, "grad_inputs", strict=strict)
+
+    if create_graph:
+        with torch.enable_grad():
+            grad_res = _autograd_grad(
+                grad_inputs, grad_outputs, v, create_graph=create_graph
+            )
+            jvp = _fill_in_zeros(grad_res, outputs, strict, create_graph, "back_trick")
+    else:
+        grad_res = _autograd_grad(
+            grad_inputs, grad_outputs, v, create_graph=create_graph
+        )
+        jvp = _fill_in_zeros(grad_res, outputs, strict, create_graph, "back_trick")
+
+    # Cleanup objects and return them to the user
+    outputs = _grad_postprocess(outputs, create_graph)
+    jvp = _grad_postprocess(jvp, create_graph)
+
+    return _tuple_postprocess(outputs, is_outputs_tuple), _tuple_postprocess(
+        jvp, is_outputs_tuple
+    )
+
+
+def _construct_standard_basis_for(
+    tensors: Tuple[torch.Tensor, ...], tensor_numels: Tuple[int, ...]
+) -> Tuple[torch.Tensor, ...]:
+    # This function:
+    # - constructs a N=sum(tensor_numels) standard basis. i.e. an NxN identity matrix.
+    # - Splits the identity matrix into chunks with each chunk size determined by `tensor_numels`.
+    # - Each chunk corresponds to one tensor. The chunk has the same dtype and
+    #   device as the tensor
+    #
+    # For example, with tensor_numels = [1, 2, 1], this function returns:
+    # ( tensor([[1],     tensor([[0, 0],      tensor([[0],
+    #           [0],             [1, 0],              [0],
+    #           [0],             [0, 1],              [0],
+    #           [0]])  ,         [0, 0]])  ,          [1]])  )
+    #
+    # Precondition: tensor_numels == tuple(tensor.numel() for tensor in tensors)
+    # Precondition: tensors always has at least one element.
+    #
+    # See NOTE: [Computing jacobian with vmap and grad for multiple tensors]
+    # for context behind this function. All the pre-conditions are guarded for
+    # in torch.autograd.functional.jacobian.
+    assert len(tensors) == len(tensor_numels)
+    assert len(tensors) > 0
+    total_numel = sum(tensor_numels)
+    chunks = tuple(
+        tensor.new_zeros(total_numel, tensor_numel)
+        for tensor, tensor_numel in zip(tensors, tensor_numels)
+    )
+    diag_start_idx = 0
+    for chunk, numel in zip(chunks, tensor_numels):
+        chunk.diagonal(diag_start_idx).fill_(1)
+        diag_start_idx -= numel
+    return chunks
+
+
+def _jacfwd(func, inputs, strict=False, vectorize=False):
+    if strict:
+        raise RuntimeError(
+            "torch.autograd.functional.jacobian: `strict=True` "
+            'and `strategy="forward-mode"` are not supported together (yet). '
+            "Please either set `strict=False` or "
+            '`strategy="reverse-mode"`.'
+        )
+    is_inputs_tuple, inputs = _as_tuple(inputs, "inputs", "jacobian")
+    output_info = []
+
+    if vectorize:
+        # See NOTE: [Computing jacobian with vmap and grad for multiple outputs]
+        input_numels = tuple(input.numel() for input in inputs)
+
+        # Step 1: Prepare tangents
+        tangents = _construct_standard_basis_for(inputs, input_numels)
+
+        # Step 2: Compute vmap over computation with dual tensors
+        def jvp(tangents):
+            with fwAD.dual_level():
+                dual_inputs = tuple(
+                    fwAD.make_dual(input, tangent.view_as(input))
+                    for input, tangent in zip(inputs, tangents)
+                )
+                _is_outputs_tuple, dual_outputs = _as_tuple(
+                    func(*dual_inputs), "outputs"
+                )
+                output_info.append(_is_outputs_tuple)
+                jv = []
+                primal_outs = []
+                for dual_out in dual_outputs:
+                    primal, tangent = fwAD.unpack_dual(dual_out)
+                    primal_outs.append(primal)
+                    if tangent is not None:
+                        jv.append(tangent)
+                    else:
+                        jv.append(torch.zeros_like(primal))
+                output_info.append(primal_outs)
+                return tuple(jv)
+
+        outputs_before_split = _vmap(jvp)(tangents)
+        is_outputs_tuple, outputs = output_info
+        # Step 3: for each of the output tangents, split along dim 0
+        jacobian_input_output = []
+        for jac_output_i, output_i in zip(outputs_before_split, outputs):
+            jacobian_output_i_output = []
+            for jac, input_j in zip(jac_output_i.split(input_numels, dim=0), inputs):
+                # We need to transpose the Jacobian because in forward AD, the
+                # batch dimension represents that of the inputs
+                jacobian_input_i_output_j = jac.permute(*range(1, jac.ndim), 0).reshape(
+                    (*output_i.shape, *input_j.shape)
+                )  # noqa: C409
+
+                jacobian_output_i_output.append(jacobian_input_i_output_j)
+            jacobian_input_output.append(jacobian_output_i_output)
+
+        # Omit [Step 4] because everything is already transposed w/ forward AD
+        return _tuple_postprocess(
+            jacobian_input_output, (is_outputs_tuple, is_inputs_tuple)
+        )
+    else:
+        raise NotImplementedError(
+            "Computing Jacobian using forward-AD or forward-over-reverse Hessian is "
+            "only implemented for `vectorize=True`."
+        )
+
+
+def jacobian(
+    func,
+    inputs,
+    create_graph=False,
+    strict=False,
+    vectorize=False,
+    strategy="reverse-mode",
+):
+    r"""Compute the Jacobian of a given function.
+
+    Args:
+        func (function): a Python function that takes Tensor inputs and returns
+            a tuple of Tensors or a Tensor.
+        inputs (tuple of Tensors or Tensor): inputs to the function ``func``.
+        create_graph (bool, optional): If ``True``, the Jacobian will be
+            computed in a differentiable manner. Note that when ``strict`` is
+            ``False``, the result may not require gradients or may be disconnected
+            from the inputs.  Defaults to ``False``.
+        strict (bool, optional): If ``True``, an error will be raised when we
+            detect that there exists an input such that all the outputs are
+            independent of it. If ``False``, we return a Tensor of zeros as the
+            jacobian for said inputs, which is the expected mathematical value.
+            Defaults to ``False``.
+        vectorize (bool, optional): This feature is experimental.
+            Please consider using :func:`torch.func.jacrev` or
+            :func:`torch.func.jacfwd` instead if you are looking for something
+            less experimental and more performant.
+            When computing the jacobian, usually we invoke
+            ``autograd.grad`` once per row of the jacobian. If this flag is
+            ``True``, we perform only a single ``autograd.grad`` call with
+            ``batched_grad=True`` which uses the vmap prototype feature.
+            Though this should lead to performance improvements in many cases,
+            because this feature is still experimental, there may be performance
+            cliffs. See :func:`torch.autograd.grad`'s ``batched_grad`` parameter for
+            more information.
+        strategy (str, optional): Set to ``"forward-mode"`` or ``"reverse-mode"`` to
+            determine whether the Jacobian will be computed with forward or reverse
+            mode AD. Currently, ``"forward-mode"`` requires ``vectorize=True``.
+            Defaults to ``"reverse-mode"``. If ``func`` has more outputs than
+            inputs, ``"forward-mode"`` tends to be more performant. Otherwise,
+            prefer to use ``"reverse-mode"``.
+
+    Returns:
+        Jacobian (Tensor or nested tuple of Tensors): if there is a single
+        input and output, this will be a single Tensor containing the
+        Jacobian for the linearized inputs and output. If one of the two is
+        a tuple, then the Jacobian will be a tuple of Tensors. If both of
+        them are tuples, then the Jacobian will be a tuple of tuple of
+        Tensors where ``Jacobian[i][j]`` will contain the Jacobian of the
+        ``i``\th output and ``j``\th input and will have as size the
+        concatenation of the sizes of the corresponding output and the
+        corresponding input and will have same dtype and device as the
+        corresponding input. If strategy is ``forward-mode``, the dtype will be
+        that of the output; otherwise, the input.
+
+    Example:
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
+        >>> def exp_reducer(x):
+        ...     return x.exp().sum(dim=1)
+        >>> inputs = torch.rand(2, 2)
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> jacobian(exp_reducer, inputs)
+        tensor([[[1.4917, 2.4352],
+                 [0.0000, 0.0000]],
+                [[0.0000, 0.0000],
+                 [2.4369, 2.3799]]])
+
+        >>> jacobian(exp_reducer, inputs, create_graph=True)
+        tensor([[[1.4917, 2.4352],
+                 [0.0000, 0.0000]],
+                [[0.0000, 0.0000],
+                 [2.4369, 2.3799]]], grad_fn=<ViewBackward>)
+
+        >>> def exp_adder(x, y):
+        ...     return 2 * x.exp() + 3 * y
+        >>> inputs = (torch.rand(2), torch.rand(2))
+        >>> jacobian(exp_adder, inputs)
+        (tensor([[2.8052, 0.0000],
+                [0.0000, 3.3963]]),
+         tensor([[3., 0.],
+                 [0., 3.]]))
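+
+        A vectorized forward-mode call follows the same pattern (illustrative;
+        the numeric output is omitted since it depends on ``inputs``):
+
+        >>> # xdoctest: +SKIP
+        >>> jacobian(exp_adder, inputs, vectorize=True, strategy="forward-mode")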
+    """
+    assert strategy in ("forward-mode", "reverse-mode"), (
+        'Expected strategy to be either "forward-mode" or "reverse-mode". Hint: If your '
+        'function has more outputs than inputs, "forward-mode" tends to be more performant. '
+        'Otherwise, prefer to use "reverse-mode".'
+    )
+    if strategy == "forward-mode":
+        if create_graph:
+            raise NotImplementedError(
+                "torch.autograd.functional.jacobian: `create_graph=True` "
+                'and `strategy="forward-mode"` are not supported together (yet). '
+                "Please either set `create_graph=False` or "
+                '`strategy="reverse-mode"`.'
+            )
+        return _jacfwd(func, inputs, strict, vectorize)
+
+    with torch.enable_grad():
+        is_inputs_tuple, inputs = _as_tuple(inputs, "inputs", "jacobian")
+        inputs = _grad_preprocess(inputs, create_graph=create_graph, need_graph=True)
+
+        outputs = func(*inputs)
+        is_outputs_tuple, outputs = _as_tuple(
+            outputs, "outputs of the user-provided function", "jacobian"
+        )
+        _check_requires_grad(outputs, "outputs", strict=strict)
+
+        if vectorize:
+            if strict:
+                raise RuntimeError(
+                    "torch.autograd.functional.jacobian: `strict=True` "
+                    "and `vectorize=True` are not supported together. "
+                    "Please either set `strict=False` or "
+                    "`vectorize=False`."
+                )
+            # NOTE: [Computing jacobian with vmap and grad for multiple outputs]
+            #
+            # Let's consider f(x) = (x**2, x.sum()) and let x = torch.randn(3).
+            # It turns out we can compute the jacobian of this function with a single
+            # call to autograd.grad by using vmap over the correct grad_outputs.
+            #
+            # Firstly, one way to compute the jacobian is to stack x**2 and x.sum()
+            # into a 4D vector. E.g., use g(x) = torch.stack([x**2, x.sum()])
+            #
+            # To get the first row of the jacobian, we call
+            # >>> autograd.grad(g(x), x, grad_outputs=torch.tensor([1, 0, 0, 0]))
+            # To get the 2nd row of the jacobian, we call
+            # >>> autograd.grad(g(x), x, grad_outputs=torch.tensor([0, 1, 0, 0]))
+            # and so on.
+            #
+            # Using vmap, we can vectorize all 4 of these computations into one by
+            # passing the standard basis for R^4 as the grad_output.
+            # vmap(partial(autograd.grad, g(x), x))(torch.eye(4)).
+            #
+            # Now, how do we compute the jacobian *without stacking the output*?
+            # We can just split the standard basis across the outputs. So to
+            # compute the jacobian of f(x), we'd use
+            # >>> autograd.grad(f(x), x, grad_outputs=_construct_standard_basis_for(...))
+            # The grad_outputs looks like the following:
+            # ( torch.tensor([[1, 0, 0],
+            #                 [0, 1, 0],
+            #                 [0, 0, 1],
+            #                 [0, 0, 0]]),
+            #   torch.tensor([[0],
+            #                 [0],
+            #                 [0],
+            #                 [1]]) )
+            #
+            # But we're not done yet!
+            # >>> vmap(partial(autograd.grad(f(x), x, grad_outputs=...)))
+            # returns a Tensor of shape [4, 3]. We have to remember to split the
+            # jacobian of shape [4, 3] into two:
+            # - one of shape [3, 3] for the first output
+            # - one of shape [   3] for the second output
+
+            # Step 1: Construct grad_outputs by splitting the standard basis
+            output_numels = tuple(output.numel() for output in outputs)
+            grad_outputs = _construct_standard_basis_for(outputs, output_numels)
+            flat_outputs = tuple(output.reshape(-1) for output in outputs)
+
+            # Step 2: Call vmap + autograd.grad
+            def vjp(grad_output):
+                vj = list(
+                    _autograd_grad(
+                        flat_outputs,
+                        inputs,
+                        grad_output,
+                        create_graph=create_graph,
+                        is_grads_batched=True,
+                    )
+                )
+                for el_idx, vj_el in enumerate(vj):
+                    if vj_el is not None:
+                        continue
+                    vj[el_idx] = torch.zeros_like(inputs[el_idx]).expand(
+                        (sum(output_numels),) + inputs[el_idx].shape
+                    )
+                return tuple(vj)
+
+            jacobians_of_flat_output = vjp(grad_outputs)
+
+            # Step 3: The returned jacobian is one big tensor per input. In this step,
+            # we split each Tensor by output.
+            jacobian_input_output = []
+            for jac_input_i, input_i in zip(jacobians_of_flat_output, inputs):
+                jacobian_input_i_output = []
+                for jac, output_j in zip(
+                    jac_input_i.split(output_numels, dim=0), outputs
+                ):
+                    jacobian_input_i_output_j = jac.view(output_j.shape + input_i.shape)
+                    jacobian_input_i_output.append(jacobian_input_i_output_j)
+                jacobian_input_output.append(jacobian_input_i_output)
+
+            # Step 4: Right now, `jacobian` is a List[List[Tensor]].
+            # The outer List corresponds to the number of inputs,
+            # the inner List corresponds to the number of outputs.
+            # We need to exchange the order of these and convert to tuples
+            # before returning.
+            jacobian_output_input = tuple(zip(*jacobian_input_output))
+
+            jacobian_output_input = _grad_postprocess(
+                jacobian_output_input, create_graph
+            )
+            return _tuple_postprocess(
+                jacobian_output_input, (is_outputs_tuple, is_inputs_tuple)
+            )
+
+        jacobian: Tuple[torch.Tensor, ...] = tuple()
+
+        for i, out in enumerate(outputs):
+            # mypy complains that expression and variable have different types due to the empty list
+            jac_i: Tuple[List[torch.Tensor]] = tuple([] for _ in range(len(inputs)))  # type: ignore[assignment]
+            for j in range(out.nelement()):
+                vj = _autograd_grad(
+                    (out.reshape(-1)[j],),
+                    inputs,
+                    retain_graph=True,
+                    create_graph=create_graph,
+                )
+
+                for el_idx, (jac_i_el, vj_el, inp_el) in enumerate(
+                    zip(jac_i, vj, inputs)
+                ):
+                    if vj_el is not None:
+                        if strict and create_graph and not vj_el.requires_grad:
+                            msg = (
+                                "The jacobian of the user-provided function is "
+                                f"independent of input {i}. This is not allowed in "
+                                "strict mode when create_graph=True."
+                            )
+                            raise RuntimeError(msg)
+                        jac_i_el.append(vj_el)
+                    else:
+                        if strict:
+                            msg = (
+                                f"Output {i} of the user-provided function is "
+                                f"independent of input {el_idx}. This is not allowed in "
+                                "strict mode."
+                            )
+                            raise RuntimeError(msg)
+                        jac_i_el.append(torch.zeros_like(inp_el))
+
+            jacobian += (
+                tuple(
+                    torch.stack(jac_i_el, dim=0).view(
+                        out.size() + inputs[el_idx].size()  # type: ignore[operator]
+                    )
+                    for (el_idx, jac_i_el) in enumerate(jac_i)
+                ),
+            )
+
+        jacobian = _grad_postprocess(jacobian, create_graph)
+
+        return _tuple_postprocess(jacobian, (is_outputs_tuple, is_inputs_tuple))
+
+
+def hessian(
+    func,
+    inputs,
+    create_graph=False,
+    strict=False,
+    vectorize=False,
+    outer_jacobian_strategy="reverse-mode",
+):
+    r"""Compute the Hessian of a given scalar function.
+
+    Args:
+        func (function): a Python function that takes Tensor inputs and returns
+            a Tensor with a single element.
+        inputs (tuple of Tensors or Tensor): inputs to the function ``func``.
+        create_graph (bool, optional): If ``True``, the Hessian will be computed in
+            a differentiable manner. Note that when ``strict`` is ``False``, the result may not
+            require gradients or may be disconnected from the inputs.
+            Defaults to ``False``.
+        strict (bool, optional): If ``True``, an error will be raised when we detect that there exists an input
+            such that all the outputs are independent of it. If ``False``, we return a Tensor of zeros as the
+            hessian for said inputs, which is the expected mathematical value.
+            Defaults to ``False``.
+        vectorize (bool, optional): This feature is experimental.
+            Please consider using :func:`torch.func.hessian`
+            instead if you are looking for something less experimental and more performant.
+            When computing the hessian, usually we invoke
+            ``autograd.grad`` once per row of the hessian. If this flag is
+            ``True``, we use the vmap prototype feature as the backend to
+            vectorize calls to ``autograd.grad`` so we only invoke it once
+            instead of once per row. This should lead to performance
+            improvements in many use cases, however, due to this feature
+            being incomplete, there may be performance cliffs. Please
+            use `torch._C._debug_only_display_vmap_fallback_warnings(True)`
+            to show any performance warnings and file us issues if
+            warnings exist for your use case. Defaults to ``False``.
+        outer_jacobian_strategy (str, optional): The Hessian is computed by
+            computing the Jacobian of a Jacobian. The inner Jacobian is always
+            computed in reverse-mode AD. Setting strategy to ``"forward-mode"``
+            or ``"reverse-mode"`` determines whether the outer Jacobian will be
+            computed with forward or reverse mode AD. Currently, computing the outer
+            Jacobian in ``"forward-mode"`` requires ``vectorize=True``. Defaults
+            to ``"reverse-mode"``.
+
+    Returns:
+        Hessian (Tensor or a tuple of tuple of Tensors): if there is a single input,
+        this will be a single Tensor containing the Hessian for the input.
+        If it is a tuple, then the Hessian will be a tuple of tuples where
+        ``Hessian[i][j]`` will contain the Hessian of the ``i``\th input
+        and ``j``\th input with size the sum of the size of the ``i``\th input plus
+        the size of the ``j``\th input. ``Hessian[i][j]`` will have the same
+        dtype and device as the corresponding ``i``\th input.
+
+    Example:
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
+        >>> def pow_reducer(x):
+        ...     return x.pow(3).sum()
+        >>> inputs = torch.rand(2, 2)
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> hessian(pow_reducer, inputs)
+        tensor([[[[5.2265, 0.0000],
+                  [0.0000, 0.0000]],
+                 [[0.0000, 4.8221],
+                  [0.0000, 0.0000]]],
+                [[[0.0000, 0.0000],
+                  [1.9456, 0.0000]],
+                 [[0.0000, 0.0000],
+                  [0.0000, 3.2550]]]])
+
+        >>> hessian(pow_reducer, inputs, create_graph=True)
+        tensor([[[[5.2265, 0.0000],
+                  [0.0000, 0.0000]],
+                 [[0.0000, 4.8221],
+                  [0.0000, 0.0000]]],
+                [[[0.0000, 0.0000],
+                  [1.9456, 0.0000]],
+                 [[0.0000, 0.0000],
+                  [0.0000, 3.2550]]]], grad_fn=<ViewBackward>)
+
+
+        >>> def pow_adder_reducer(x, y):
+        ...     return (2 * x.pow(2) + 3 * y.pow(2)).sum()
+        >>> inputs = (torch.rand(2), torch.rand(2))
+        >>> hessian(pow_adder_reducer, inputs)
+        ((tensor([[4., 0.],
+                  [0., 4.]]),
+          tensor([[0., 0.],
+                  [0., 0.]])),
+         (tensor([[0., 0.],
+                  [0., 0.]]),
+          tensor([[6., 0.],
+                  [0., 6.]])))
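+
+        The outer Jacobian can also be computed with forward-mode AD
+        (illustrative; requires ``vectorize=True``, and the numeric output is
+        omitted here):
+
+        >>> # xdoctest: +SKIP
+        >>> hessian(pow_adder_reducer, inputs, vectorize=True,
+        ...         outer_jacobian_strategy="forward-mode")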
+    """
+    is_inputs_tuple, inputs = _as_tuple(inputs, "inputs", "hessian")
+    assert outer_jacobian_strategy in (
+        "forward-mode",
+        "reverse-mode",
+    ), 'Expected strategy to be either "forward-mode" or "reverse-mode".'
+
+    def ensure_single_output_function(*inp):
+        out = func(*inp)
+        is_out_tuple, t_out = _as_tuple(
+            out, "outputs of the user-provided function", "hessian"
+        )
+        _check_requires_grad(t_out, "outputs", strict=strict)
+
+        if is_out_tuple or not isinstance(out, torch.Tensor):
+            raise RuntimeError(
+                "The function given to hessian should return a single Tensor"
+            )
+
+        if out.nelement() != 1:
+            raise RuntimeError(
+                "The Tensor returned by the function given to hessian should contain a single element"
+            )
+
+        return out.squeeze()
+
+    def jac_func(*inp):
+        if outer_jacobian_strategy == "forward-mode":
+            # _grad_preprocess requires create_graph=True and input to require_grad
+            # or else the input will be detached
+            inp = tuple(t.requires_grad_(True) for t in inp)
+        jac = jacobian(ensure_single_output_function, inp, create_graph=True)
+        _check_requires_grad(jac, "jacobian", strict=strict)
+        return jac
+
+    res = jacobian(
+        jac_func,
+        inputs,
+        create_graph=create_graph,
+        strict=strict,
+        vectorize=vectorize,
+        strategy=outer_jacobian_strategy,
+    )
+    return _tuple_postprocess(res, (is_inputs_tuple, is_inputs_tuple))
+
+
+def vhp(func, inputs, v=None, create_graph=False, strict=False):
+    r"""Compute the dot product between a vector ``v`` and the Hessian of a given scalar function at a specified point.
+
+    Args:
+        func (function): a Python function that takes Tensor inputs and returns
+            a Tensor with a single element.
+        inputs (tuple of Tensors or Tensor): inputs to the function ``func``.
+        v (tuple of Tensors or Tensor): The vector for which the vector Hessian
+            product is computed. Must be the same size as the input of
+            ``func``. This argument is optional when ``func``'s input contains
+            a single element and (if it is not provided) will be set as a
+            Tensor containing a single ``1``.
+        create_graph (bool, optional): If ``True``, both the output and result
+            will be computed in a differentiable way. Note that when ``strict``
+            is ``False``, the result may not require gradients or may be
+            disconnected from the inputs.
+            Defaults to ``False``.
+        strict (bool, optional): If ``True``, an error will be raised when we
+            detect that there exists an input such that all the outputs are
+            independent of it. If ``False``, we return a Tensor of zeros as the
+            vhp for said inputs, which is the expected mathematical value.
+            Defaults to ``False``.
+
+    Returns:
+        output (tuple): tuple with:
+            func_output (tuple of Tensors or Tensor): output of ``func(inputs)``
+
+            vhp (tuple of Tensors or Tensor): result of the dot product with the
+            same shape as the inputs.
+
+    Example:
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
+        >>> def pow_reducer(x):
+        ...     return x.pow(3).sum()
+        >>> inputs = torch.rand(2, 2)
+        >>> v = torch.ones(2, 2)
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> vhp(pow_reducer, inputs, v)
+        (tensor(0.5591),
+         tensor([[1.0689, 1.2431],
+                 [3.0989, 4.4456]]))
+        >>> vhp(pow_reducer, inputs, v, create_graph=True)
+        (tensor(0.5591, grad_fn=<SumBackward0>),
+         tensor([[1.0689, 1.2431],
+                 [3.0989, 4.4456]], grad_fn=<MulBackward0>))
+        >>> def pow_adder_reducer(x, y):
+        ...     return (2 * x.pow(2) + 3 * y.pow(2)).sum()
+        >>> inputs = (torch.rand(2), torch.rand(2))
+        >>> v = (torch.zeros(2), torch.ones(2))
+        >>> vhp(pow_adder_reducer, inputs, v)
+        (tensor(4.8053),
+         (tensor([0., 0.]),
+          tensor([6., 6.])))
+    """
+    with torch.enable_grad():
+        is_inputs_tuple, inputs = _as_tuple(inputs, "inputs", "vhp")
+        inputs = _grad_preprocess(inputs, create_graph=create_graph, need_graph=True)
+
+        if v is not None:
+            _, v = _as_tuple(v, "v", "vhp")
+            v = _grad_preprocess(v, create_graph=create_graph, need_graph=False)
+            _validate_v(v, inputs, is_inputs_tuple)
+        else:
+            if len(inputs) != 1 or inputs[0].nelement() != 1:
+                raise RuntimeError(
+                    "The vector v can only be None if the input to the user-provided function "
+                    "is a single Tensor with a single element."
+                )
+        outputs = func(*inputs)
+        is_outputs_tuple, outputs = _as_tuple(
+            outputs, "outputs of the user-provided function", "vhp"
+        )
+        _check_requires_grad(outputs, "outputs", strict=strict)
+
+        if is_outputs_tuple or not isinstance(outputs[0], torch.Tensor):
+            raise RuntimeError(
+                "The function given to vhp should return a single Tensor"
+            )
+
+        if outputs[0].nelement() != 1:
+            raise RuntimeError(
+                "The Tensor returned by the function given to vhp should contain a single element"
+            )
+
+        jac = _autograd_grad(outputs, inputs, create_graph=True)
+        _check_requires_grad(jac, "jacobian", strict=strict)
+
+    enable_grad = True if create_graph else torch.is_grad_enabled()
+    with torch.set_grad_enabled(enable_grad):
+        grad_res = _autograd_grad(jac, inputs, v, create_graph=create_graph)
+        vhp = _fill_in_zeros(grad_res, inputs, strict, create_graph, "double_back")
+
+    outputs = _grad_postprocess(outputs, create_graph)
+    vhp = _grad_postprocess(vhp, create_graph)
+
+    return _tuple_postprocess(outputs, is_outputs_tuple), _tuple_postprocess(
+        vhp, is_inputs_tuple
+    )
+
+
+def hvp(func, inputs, v=None, create_graph=False, strict=False):
+    r"""Compute the dot product between the scalar function's Hessian and a vector ``v`` at a specified point.
+
+    Args:
+        func (function): a Python function that takes Tensor inputs and returns
+            a Tensor with a single element.
+        inputs (tuple of Tensors or Tensor): inputs to the function ``func``.
+        v (tuple of Tensors or Tensor): The vector for which the Hessian vector
+            product is computed. Must be the same size as the input of
+            ``func``. This argument is optional when ``func``'s input contains
+            a single element and (if it is not provided) will be set as a
+            Tensor containing a single ``1``.
+        create_graph (bool, optional): If ``True``, both the output and result will be
+            computed in a differentiable way. Note that when ``strict`` is
+            ``False``, the result can not require gradients or be disconnected
+            from the inputs.  Defaults to ``False``.
+        strict (bool, optional): If ``True``, an error will be raised when we
+            detect that there exists an input such that all the outputs are
+            independent of it. If ``False``, we return a Tensor of zeros as the
+            hvp for said inputs, which is the expected mathematical value.
+            Defaults to ``False``.
+    Returns:
+        output (tuple): tuple with:
+            func_output (tuple of Tensors or Tensor): output of ``func(inputs)``
+
+            hvp (tuple of Tensors or Tensor): result of the dot product with
+            the same shape as the inputs.
+
+    Example:
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
+        >>> def pow_reducer(x):
+        ...     return x.pow(3).sum()
+        >>> inputs = torch.rand(2, 2)
+        >>> v = torch.ones(2, 2)
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> hvp(pow_reducer, inputs, v)
+        (tensor(0.1448),
+         tensor([[2.0239, 1.6456],
+                 [2.4988, 1.4310]]))
+
+        >>> hvp(pow_reducer, inputs, v, create_graph=True)
+        (tensor(0.1448, grad_fn=<SumBackward0>),
+         tensor([[2.0239, 1.6456],
+                 [2.4988, 1.4310]], grad_fn=<MulBackward0>))
+
+
+        >>> def pow_adder_reducer(x, y):
+        ...     return (2 * x.pow(2) + 3 * y.pow(2)).sum()
+        >>> inputs = (torch.rand(2), torch.rand(2))
+        >>> v = (torch.zeros(2), torch.ones(2))
+        >>> hvp(pow_adder_reducer, inputs, v)
+        (tensor(2.3030),
+         (tensor([0., 0.]),
+          tensor([6., 6.])))
+
+    Note:
+
+        This function is significantly slower than `vhp` due to backward mode AD constraints.
+        If your function is twice continuously differentiable, then hvp = vhp.t(). So if you
+        know that your function satisfies this condition, you should use vhp instead, which is
+        much faster with the current implementation.
+
+    """
+    with torch.enable_grad():
+        is_inputs_tuple, inputs = _as_tuple(inputs, "inputs", "hvp")
+        inputs = _grad_preprocess(inputs, create_graph=create_graph, need_graph=True)
+
+        if v is not None:
+            _, v = _as_tuple(v, "v", "hvp")
+            v = _grad_preprocess(v, create_graph=create_graph, need_graph=False)
+            _validate_v(v, inputs, is_inputs_tuple)
+        else:
+            if len(inputs) != 1 or inputs[0].nelement() != 1:
+                raise RuntimeError(
+                    "The vector v can only be None if the input to the user-provided function "
+                    "is a single Tensor with a single element."
+                )
+        outputs = func(*inputs)
+        is_outputs_tuple, outputs = _as_tuple(
+            outputs, "outputs of the user-provided function", "hvp"
+        )
+        _check_requires_grad(outputs, "outputs", strict=strict)
+
+        if is_outputs_tuple or not isinstance(outputs[0], torch.Tensor):
+            raise RuntimeError(
+                "The function given to hvp should return a single Tensor"
+            )
+
+        if outputs[0].nelement() != 1:
+            raise RuntimeError(
+                "The Tensor returned by the function given to hvp should contain a single element"
+            )
+
+        jac = _autograd_grad(outputs, inputs, create_graph=True)
+        _check_requires_grad(jac, "jacobian", strict=strict)
+
+        grad_jac = tuple(torch.zeros_like(inp, requires_grad=True) for inp in inputs)
+
+        double_back = _autograd_grad(jac, inputs, grad_jac, create_graph=True)
+        _check_requires_grad(jac, "hessian", strict=strict)
+
+    enable_grad = True if create_graph else torch.is_grad_enabled()
+    with torch.set_grad_enabled(enable_grad):
+        grad_res = _autograd_grad(double_back, grad_jac, v, create_graph=create_graph)
+        hvp = _fill_in_zeros(
+            grad_res, inputs, strict, create_graph, "double_back_trick"
+        )
+
+    outputs = _grad_postprocess(outputs, create_graph)
+    hvp = _grad_postprocess(hvp, create_graph)
+
+    return _tuple_postprocess(outputs, is_outputs_tuple), _tuple_postprocess(
+        hvp, is_inputs_tuple
+    )
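+
+
+# Illustrative sketch, not part of the upstream module: as the `hvp` docstring
+# above notes, when `func` is twice continuously differentiable its Hessian is
+# symmetric, so `vhp` and `hvp` produce the same product and `vhp` is the
+# cheaper call. This helper exists only as documentation and is never invoked
+# by library code.
+def _demo_vhp_equals_hvp():
+    def pow_reducer(x):
+        return x.pow(3).sum()
+
+    inputs = torch.rand(2, 2)
+    v = torch.ones(2, 2)
+    # Both calls return (func_output, product); the products coincide because
+    # the Hessian of a C^2 scalar function is symmetric.
+    _, vhp_result = vhp(pow_reducer, inputs, v)
+    _, hvp_result = hvp(pow_reducer, inputs, v)
+    return torch.allclose(vhp_result, hvp_result)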
diff --git a/MLPY/Lib/site-packages/torch/autograd/grad_mode.py b/MLPY/Lib/site-packages/torch/autograd/grad_mode.py
new file mode 100644
index 0000000000000000000000000000000000000000..af6ecb5fcea193967d5fdf160f1cfce20b3c37cf
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/autograd/grad_mode.py
@@ -0,0 +1,396 @@
+from typing import Any
+
+import torch
+
+from torch.utils._contextlib import (
+    _DecoratorContextManager,
+    _NoParamDecoratorContextManager,
+    F,
+)
+
+__all__ = [
+    "no_grad",
+    "enable_grad",
+    "set_grad_enabled",
+    "inference_mode",
+    "set_multithreading_enabled",
+]
+
+
+class no_grad(_NoParamDecoratorContextManager):
+    r"""Context-manager that disables gradient calculation.
+
+    Disabling gradient calculation is useful for inference, when you are sure
+    that you will not call :meth:`Tensor.backward()`. It will reduce memory
+    consumption for computations that would otherwise have `requires_grad=True`.
+
+    In this mode, the result of every computation will have
+    `requires_grad=False`, even when the inputs have `requires_grad=True`.
+    There is an exception! All factory functions, or functions that create
+    a new Tensor and take a requires_grad kwarg, will NOT be affected by
+    this mode.
+
+    This context manager is thread local; it will not affect computation
+    in other threads.
+
+    Also functions as a decorator.
+
+    .. note::
+        No-grad is one of several mechanisms that can enable or
+        disable gradients locally; see :ref:`locally-disable-grad-doc` for
+        more information on how they compare.
+
+    .. note::
+        This API does not apply to :ref:`forward-mode AD <forward-mode-ad>`.
+        If you want to disable forward AD for a computation, you can unpack
+        your dual tensors.
+
+    Example::
+        >>> # xdoctest: +SKIP
+        >>> x = torch.tensor([1.], requires_grad=True)
+        >>> with torch.no_grad():
+        ...     y = x * 2
+        >>> y.requires_grad
+        False
+        >>> @torch.no_grad()
+        ... def doubler(x):
+        ...     return x * 2
+        >>> z = doubler(x)
+        >>> z.requires_grad
+        False
+        >>> @torch.no_grad
+        ... def tripler(x):
+        ...     return x * 3
+        >>> z = tripler(x)
+        >>> z.requires_grad
+        False
+        >>> # factory function exception
+        >>> with torch.no_grad():
+        ...     a = torch.nn.Parameter(torch.rand(10))
+        >>> a.requires_grad
+        True
+    """
+
+    def __init__(self) -> None:
+        if not torch._jit_internal.is_scripting():
+            super().__init__()
+        self.prev = False
+
+    def __enter__(self) -> None:
+        self.prev = torch.is_grad_enabled()
+        torch.set_grad_enabled(False)
+
+    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
+        torch.set_grad_enabled(self.prev)
+
+
+class enable_grad(_NoParamDecoratorContextManager):
+    r"""Context-manager that enables gradient calculation.
+
+    Enables gradient calculation, if it has been disabled via :class:`~no_grad`
+    or :class:`~set_grad_enabled`.
+
+    This context manager is thread local; it will not affect computation
+    in other threads.
+
+    Also functions as a decorator.
+
+    .. note::
+        enable_grad is one of several mechanisms that can enable or
+        disable gradients locally; see :ref:`locally-disable-grad-doc` for
+        more information on how they compare.
+
+    .. note::
+        This API does not apply to :ref:`forward-mode AD <forward-mode-ad>`.
+
+    Example::
+        >>> # xdoctest: +SKIP
+        >>> x = torch.tensor([1.], requires_grad=True)
+        >>> with torch.no_grad():
+        ...     with torch.enable_grad():
+        ...         y = x * 2
+        >>> y.requires_grad
+        True
+        >>> y.backward()
+        >>> x.grad
+        tensor([2.])
+        >>> @torch.enable_grad()
+        ... def doubler(x):
+        ...     return x * 2
+        >>> with torch.no_grad():
+        ...     z = doubler(x)
+        >>> z.requires_grad
+        True
+        >>> @torch.enable_grad
+        ... def tripler(x):
+        ...     return x * 3
+        >>> with torch.no_grad():
+        ...     z = tripler(x)
+        >>> z.requires_grad
+        True
+
+    """
+
+    def __enter__(self) -> None:
+        self.prev = torch.is_grad_enabled()
+        torch._C._set_grad_enabled(True)
+
+    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
+        torch._C._set_grad_enabled(self.prev)
+
+
+class set_grad_enabled(_DecoratorContextManager):
+    r"""Context-manager that sets gradient calculation on or off.
+
+    ``set_grad_enabled`` will enable or disable grads based on its argument :attr:`mode`.
+    It can be used as a context-manager or as a function.
+
+    This context manager is thread local; it will not affect computation
+    in other threads.
+
+    Args:
+        mode (bool): Flag whether to enable grad (``True``), or disable
+                     (``False``). This can be used to conditionally enable
+                     gradients.
+
+    .. note::
+        set_grad_enabled is one of several mechanisms that can enable or
+        disable gradients locally; see :ref:`locally-disable-grad-doc` for
+        more information on how they compare.
+
+    .. note::
+        This API does not apply to :ref:`forward-mode AD <forward-mode-ad>`.
+
+    Example::
+        >>> # xdoctest: +SKIP
+        >>> x = torch.tensor([1.], requires_grad=True)
+        >>> is_train = False
+        >>> with torch.set_grad_enabled(is_train):
+        ...     y = x * 2
+        >>> y.requires_grad
+        False
+        >>> _ = torch.set_grad_enabled(True)
+        >>> y = x * 2
+        >>> y.requires_grad
+        True
+        >>> _ = torch.set_grad_enabled(False)
+        >>> y = x * 2
+        >>> y.requires_grad
+        False
+
+    """
+
+    def __init__(self, mode: bool) -> None:
+        self.prev = torch.is_grad_enabled()
+        self.mode = mode
+        torch._C._set_grad_enabled(mode)
+
+    def __call__(self, orig_func: F) -> F:
+        torch._C._set_grad_enabled(self.prev)
+        return super().__call__(orig_func)
+
+    def __enter__(self) -> None:
+        torch._C._set_grad_enabled(self.mode)
+
+    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
+        torch._C._set_grad_enabled(self.prev)
+
+    def clone(self) -> "set_grad_enabled":
+        r"""
+        Create a copy of this class
+        """
+        return self.__class__(self.mode)
+
+
+class inference_mode(_DecoratorContextManager):
+    r"""Context-manager that enables or disables inference mode.
+
+    InferenceMode is a new context manager analogous to :class:`~no_grad`
+    to be used when you are certain your operations will have no interactions
+    with autograd (e.g., model training). Code run under this mode gets better
+    performance by disabling view tracking and version counter bumps. Note that
+    unlike some other mechanisms that locally enable or disable grad,
+    entering inference_mode also disables :ref:`forward-mode AD <forward-mode-ad>`.
+
+    This context manager is thread local; it will not affect computation
+    in other threads.
+
+    Also functions as a decorator.
+
+    .. note::
+        Inference mode is one of several mechanisms that can enable or
+        disable gradients locally; see :ref:`locally-disable-grad-doc` for
+        more information on how they compare.
+
+    Args:
+        mode (bool or function): Either a boolean flag whether to enable or
+            disable inference mode or a Python function to decorate with
+            inference mode enabled
+
+    Example::
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
+        >>> import torch
+        >>> x = torch.ones(1, 2, 3, requires_grad=True)
+        >>> with torch.inference_mode():
+        ...     y = x * x
+        >>> y.requires_grad
+        False
+        >>> # xdoctest: +SKIP("want string isnt quite right")
+        >>> y._version
+        Traceback (most recent call last):
+        File "", line 1, in 
+        RuntimeError: Inference tensors do not track version counter.
+        >>> @torch.inference_mode()
+        ... def func(x):
+        ...     return x * x
+        >>> out = func(x)
+        >>> out.requires_grad
+        False
+        >>> @torch.inference_mode
+        ... def doubler(x):
+        ...     return x * 2
+        >>> out = doubler(x)
+        >>> out.requires_grad
+        False
+
+    """
+
+    def __init__(self, mode: bool = True) -> None:
+        if not torch._jit_internal.is_scripting():
+            super().__init__()
+        self.mode = mode
+
+    def __new__(cls, mode=True):
+        if isinstance(mode, bool):
+            return super().__new__(cls)
+        return cls()(mode)
+
+    def __enter__(self) -> None:
+        self._inference_mode_context = torch._C._InferenceMode(self.mode)
+        self._inference_mode_context.__enter__()
+
+    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
+        self._inference_mode_context.__exit__(exc_type, exc_value, traceback)
+
+    def clone(self) -> "inference_mode":
+        r"""
+        Create a copy of this class
+        """
+        return self.__class__(self.mode)
+
+
+def _enter_inference_mode(mode):
+    mode_context = torch._C._InferenceMode(mode)
+    mode_context.__enter__()
+    return mode_context
+
+
+def _exit_inference_mode(mode):
+    mode.__exit__(None, None, None)
+
+
+class set_multithreading_enabled(_DecoratorContextManager):
+    r"""Context-manager that sets multithreaded backwards on or off.
+
+    ``set_multithreading_enabled`` will enable or disable multithreaded backwards based on its argument :attr:`mode`.
+    It can be used as a context-manager or as a function.
+
+    This context manager is thread local; it will not affect computation
+    in other threads.
+
+    Args:
+        mode (bool): Flag whether to enable multithreaded backwards (``True``), or disable
+                     (``False``).
+
+    .. note::
+        This API does not apply to :ref:`forward-mode AD <forward-mode-ad>`.
+
+    """
+
+    def __init__(self, mode: bool) -> None:
+        self.prev = torch._C._is_multithreading_enabled()
+        torch._C._set_multithreading_enabled(mode)
+        self.mode = mode
+
+    def __enter__(self) -> None:
+        pass
+
+    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
+        torch._C._set_multithreading_enabled(self.prev)
+
+    def clone(self) -> "set_multithreading_enabled":
+        r"""
+        Create a copy of this class
+        """
+        return self.__class__(self.mode)
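+
+
+# Illustrative sketch, not upstream code: `set_multithreading_enabled` works as
+# a context manager (or decorator) to force a single-threaded backward pass,
+# which can make debugging easier. Defined for documentation only and never
+# called by library code.
+def _demo_single_threaded_backward():
+    x = torch.ones(3, requires_grad=True)
+    with set_multithreading_enabled(False):
+        # Autograd does not spawn worker threads for the backward pass here.
+        (x * 2).sum().backward()
+    return x.grad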
+
+
+class _force_original_view_tracking(_DecoratorContextManager):
+    r"""Context-manager that sets whether or not to always enable view-replay in autograd.
+
+    ``set_view_replay_enabled`` will enable or disable view-replay based on its argument :attr:`mode`.
+    It can be used as a context-manager or as a function.
+
+    This context manager is thread local; it will not affect computation
+    in other threads.
+
+    When a tensor view is mutated, the autograd engine needs to decide whether or not
+    to regenerate the "updated view" by either replaying the chain of views from the updated base,
+    or with a single call to as_strided.
+
+    If set_view_replay_enabled is set to True, then autograd will always use view replay.
+    Otherwise, it will fall back to its existing logic.
+
+    Args:
+        mode (bool): Flag whether to enable view-replay (``True``), or disable
+                     (``False``).
+
+    """
+
+    def __init__(self, mode: bool) -> None:
+        self.prev = torch._C._is_view_replay_enabled()
+        torch._C._set_view_replay_enabled(mode)
+        self.mode = mode
+
+    def __enter__(self) -> None:
+        pass
+
+    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
+        torch._C._set_view_replay_enabled(self.prev)
+
+    def clone(self):
+        return self.__class__(self.mode)
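+
+
+# Illustrative sketch, not upstream code: using `_force_original_view_tracking`
+# so that the autograd graph for a mutated view is rebuilt via view replay
+# rather than a single `as_strided`, as described in the docstring above.
+# Defined for documentation only and never called by library code.
+def _demo_force_view_replay():
+    base = torch.randn(4, requires_grad=True).clone()  # non-leaf, so in-place on a view is allowed
+    view = base[:2]
+    with _force_original_view_tracking(True):
+        view.mul_(2)  # the updated view's grad_fn is regenerated by replaying the slicing view
+    return view.grad_fn is not None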
+
+
+class _unsafe_preserve_version_counter(_DecoratorContextManager):
+    r"""DO NOT USE THIS UNLESS YOU KNOW EXACTLY WHAT YOU'RE DOING.
+
+    This context manager can lead to arbitrary silent-correctness issues in any other part of your code
+    (even the ones not touched directly by the context manager)!
+
+    Ordinarily, autograd will track mutations to tensors by incrementing its `._version` attribute.
+    This is generally important for correctness, as for example, mutating a tensor that autograd has saved
+    for the backwards pass can result in incorrect gradients, and autograd uses the version counter to detect
+    and error out in this situation.
+
+    However, there are rare instances where it might be useful to hide mutations from autograd. For example:
+    if a tensor is very large, and you'd like to free its memory by storing it elsewhere, and re-populate
+    the tensor right before it is needed by autograd.
+
+    Args:
+        tensor (torch.Tensor): the tensor in question, that you would like to preserve the version counter of.
+
+    .. note::
+        This API does not apply to :ref:`forward-mode AD <forward-mode-ad>`.
+
+    """
+
+    def __init__(self, tensor: torch.Tensor) -> None:
+        self.tensor = tensor
+        self.prev_version = tensor._version
+
+    def __enter__(self) -> None:
+        pass
+
+    def __exit__(self, *args) -> None:
+        torch._C._autograd._unsafe_set_version_counter(self.tensor, self.prev_version)
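+
+
+# Illustrative sketch, not upstream code: the "hide a mutation from autograd"
+# pattern described in the `_unsafe_preserve_version_counter` docstring above.
+# This is only safe because the tensor holds identical values again before
+# backward runs. Defined for documentation only and never called by library code.
+def _demo_preserve_version_counter():
+    base = torch.randn(4, requires_grad=True)
+    out = base.pow(2)              # autograd saves `base` for pow's backward
+    stash = base.detach().clone()
+    with _unsafe_preserve_version_counter(base):
+        with torch.no_grad():
+            base.zero_()           # temporarily clobber the values
+            base.copy_(stash)      # restore them before they are needed
+    out.sum().backward()           # no "modified by an inplace operation" error
+    return base.grad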
diff --git a/MLPY/Lib/site-packages/torch/autograd/gradcheck.py b/MLPY/Lib/site-packages/torch/autograd/gradcheck.py
new file mode 100644
index 0000000000000000000000000000000000000000..7505c2fd5ff7d4112996d78863eadda8b184846c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/autograd/gradcheck.py
@@ -0,0 +1,2266 @@
+import collections
+import functools
+import warnings
+from itertools import product
+from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
+
+import torch
+import torch.testing
+from torch._vmap_internals import _vmap, vmap
+from torch.overrides import is_tensor_like
+from torch.types import _TensorOrTensors
+
+# Note: `get_*_jacobian` functions are added here even though we didn't intend to make them public
+# since they have been exposed since before we added `__all__` and we already maintain BC for them.
+# We should eventually deprecate them and remove them from `__all__`.
+__all__ = [
+    "gradcheck",
+    "gradgradcheck",
+    "GradcheckError",
+    "get_numerical_jacobian",
+    "get_analytical_jacobian",
+    "get_numerical_jacobian_wrt_specific_input",
+]
+
+
+class GradcheckError(RuntimeError):
+    r"""Error raised by :func:`gradcheck` and :func:`gradgradcheck`."""
+
+    pass
+
+
+def _is_sparse_compressed_tensor(obj: torch.Tensor):
+    return obj.layout in {
+        torch.sparse_csr,
+        torch.sparse_csc,
+        torch.sparse_bsr,
+        torch.sparse_bsc,
+    }
+
+
+def _is_sparse_any_tensor(obj: torch.Tensor):
+    return _is_sparse_compressed_tensor(obj) or obj.layout is torch.sparse_coo
+
+
+def _is_float_or_complex_tensor(obj):
+    return is_tensor_like(obj) and (obj.is_floating_point() or obj.is_complex())
+
+
+def _allocate_jacobians_with_inputs(
+    input_tensors: Tuple, numel_output
+) -> Tuple[torch.Tensor, ...]:
+    # Makes zero-filled tensors from inputs. If `numel_output` is not None, for
+    # each tensor in `input_tensors`, returns a new zero-filled tensor with height
+    # of `t.numel` and width of `numel_output`. Otherwise, for each tensor, returns
+    # a 1-d tensor with size `(t.numel,)`. Each new tensor will be strided and have
+    # the same dtype and device as those of the corresponding input.
+    out: List[torch.Tensor] = []
+    for t in input_tensors:
+        if _is_float_or_complex_tensor(t) and t.requires_grad:
+            out.append(t.new_zeros((t.numel(), numel_output), layout=torch.strided))
+    return tuple(out)
+
+
+def _allocate_jacobians_with_outputs(
+    output_tensors: Tuple, numel_input, dtype=None, device=None
+) -> Tuple[torch.Tensor, ...]:
+    # Makes zero-filled tensors from outputs. If `dim` is not None, for each tensor
+    # in `output_tensors`, returns a new zero-filled tensor with height of `dim` and
+    # width of `t.numel`. Otherwise, for each tensor, returns a 1-d tensor with size
+    # (t.numel,).
+    out: List[torch.Tensor] = []
+    options = {"dtype": dtype, "device": device, "layout": torch.strided}
+    for t in output_tensors:
+        if _is_float_or_complex_tensor(t):
+            out.append(t.new_zeros((numel_input, t.numel()), **options))
+    return tuple(out)
+
+
+def _iter_tensors(
+    x: Union[torch.Tensor, Iterable[torch.Tensor]], only_requiring_grad: bool = False
+) -> Iterable[torch.Tensor]:
+    if is_tensor_like(x):
+        # mypy doesn't narrow type of `x` to torch.Tensor
+        if x.requires_grad or not only_requiring_grad:  # type: ignore[union-attr]
+            yield x  # type: ignore[misc]
+    elif isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
+        for elem in x:
+            yield from _iter_tensors(elem, only_requiring_grad)
+
+
+def _densify(x):
+    # return a copy of sparse x with all unspecified elements
+    # "replaced" with zero-valued elements
+    if isinstance(x, (list, tuple)):
+        return type(x)(map(_densify, x))
+    elif not is_tensor_like(x) or x.layout in {torch.strided, torch._mkldnn}:  # type: ignore[attr-defined] # no attr _mkldnn
+        return x
+    elif x.layout is torch.sparse_coo:
+        device = x.device
+        indices_dtype = x._indices().dtype
+        tmp = torch.ones(x.shape[: x.sparse_dim()], dtype=torch.int8, device=device)
+        indices = tmp.nonzero().t().to(dtype=indices_dtype)
+        values = torch.zeros(
+            (tmp.numel(), *x.shape[x.sparse_dim() :]), dtype=x.dtype, device=device
+        )
+        x_coalesced = x.detach().coalesce()
+        if x_coalesced.numel() > 0:
+            stride = tmp.stride()
+            flat_indices = (
+                x_coalesced.indices()
+                .mul(
+                    torch.tensor(stride, dtype=indices_dtype, device=device).unsqueeze(
+                        1
+                    )
+                )
+                .sum(0)
+            )
+            values[flat_indices] = x_coalesced.values()
+        return (
+            torch.sparse_coo_tensor(indices, values, x.shape)
+            ._coalesced_(True)
+            .requires_grad_(x.requires_grad)
+        )
+    elif _is_sparse_compressed_tensor(x):
+        blocksize = (
+            x.values().shape[1:3]
+            if x.layout in {torch.sparse_bsr, torch.sparse_bsc}
+            else None
+        )
+        compressed_indices = (
+            x.crow_indices()
+            if x.layout in {torch.sparse_csr, torch.sparse_bsr}
+            else x.ccol_indices()
+        )
+        # We'll use intermediate sparse COO for simplicity
+        r = _densify(x.detach().to_sparse(layout=torch.sparse_coo)).to_sparse(
+            layout=x.layout, blocksize=blocksize
+        )
+        # Check that all elements are specified also after `to_sparse` op:
+        dense_numel = r.values().numel() // max(1, r.values().shape[0])
+        batch_numel = compressed_indices.numel() // compressed_indices.shape[-1]
+        sparse_numel = r.numel() // max(1, dense_numel * batch_numel)
+        if sparse_numel != r._nnz():
+            raise AssertionError(
+                f"{x.layout} densify failed: expected nnz={sparse_numel} but got {r._nnz()}"
+            )
+        return r.requires_grad_(x.requires_grad)
+    elif _is_sparse_any_tensor(x):
+        raise NotImplementedError(x.layout)
+    return x
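+
+
+# Illustrative sketch, not upstream code: `_densify` materializes unspecified
+# sparse elements as explicit zero-valued elements, so slow-mode gradcheck can
+# perturb every entry. Defined for documentation only; never called by library code.
+def _demo_densify():
+    i = torch.tensor([[0, 1], [1, 0]])
+    v = torch.tensor([3.0, 4.0])
+    sparse = torch.sparse_coo_tensor(i, v, (2, 2))
+    # The original tensor stores 2 specified values; the densified copy stores
+    # all 4 entries, two of them explicitly zero.
+    return sparse._nnz(), _densify(sparse)._nnz()  # (2, 4)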
+
+
+def _iter_tensor(x_tensor):
+    # (Only used for slow gradcheck) Returns a generator that yields the following
+    # elements at each iteration:
+    #  1) a tensor: the same tensor is returned across all iterations. The tensor
+    #     is not the same as the original x_tensor as given as input - it is
+    #     prepared so that it can be modified in-place. Depending on whether the
+    #     input tensor is strided, sparse, or dense, the returned tensor may or may
+    #     not share storage with x_tensor.
+    #  2) a tuple of indices that can be used with advanced indexing (yielded in
+    #     dictionary order)
+    #  3) flattened index that will be used to index into the Jacobian tensor
+    #
+    # For a tensor t with size (2, 2), _iter_tensor yields:
+    #     `x, (0, 0), 0`, `x, (0, 1), 1`, `x, (1, 0), 2`, `x, (1, 1), 3`
+    #
+    # where x is the t.data of the original tensor. Perturbing the entry of x
+    # at index (1, 1) yields the 3rd column of the overall Jacobian matrix.
+    if _is_sparse_any_tensor(x_tensor):
+
+        def get_stride(size):
+            dim = len(size)
+            tmp = 1
+            stride = [0] * dim
+            for i in reversed(range(dim)):
+                stride[i] = tmp
+                tmp *= size[i]
+            return stride
+
+        x_nnz = x_tensor._nnz()
+        x_size = list(x_tensor.size())
+        if x_tensor.layout is torch.sparse_coo:
+            x_indices = x_tensor._indices().t()
+            x_values = x_tensor._values()
+        elif x_tensor.layout is torch.sparse_csr:
+            x_indices = torch._convert_indices_from_csr_to_coo(
+                x_tensor.crow_indices(), x_tensor.col_indices()
+            ).t()
+            x_values = x_tensor.values()
+        elif x_tensor.layout is torch.sparse_csc:
+            x_indices = torch._convert_indices_from_csr_to_coo(
+                x_tensor.ccol_indices(), x_tensor.row_indices(), transpose=True
+            ).t()
+            x_values = x_tensor.values()
+        elif x_tensor.layout is torch.sparse_bsr:
+            x_block_values = x_tensor.values()
+            x_blocksize = x_block_values.size()[1:3]
+            x_indices = (
+                torch._convert_indices_from_csr_to_coo(
+                    x_tensor.crow_indices(), x_tensor.col_indices()
+                )
+                .repeat_interleave(x_blocksize[0] * x_blocksize[1], 1)
+                .mul_(torch.tensor(x_blocksize, device=x_tensor.device).reshape(2, 1))
+                .add_(
+                    torch.stack(
+                        torch.where(torch.ones(x_blocksize, device=x_tensor.device))
+                    ).repeat(1, x_nnz)
+                )
+                .t()
+            )
+            x_values = x_block_values.flatten(0, 2)
+            x_nnz = x_values.size(0)
+        elif x_tensor.layout is torch.sparse_bsc:
+            x_block_values = x_tensor.values()
+            x_blocksize = x_block_values.size()[1:3]
+            x_indices = (
+                torch._convert_indices_from_csr_to_coo(
+                    x_tensor.ccol_indices(), x_tensor.row_indices(), transpose=True
+                )
+                .repeat_interleave(x_blocksize[0] * x_blocksize[1], 1)
+                .mul_(torch.tensor(x_blocksize, device=x_tensor.device).reshape(2, 1))
+                .add_(
+                    torch.stack(
+                        torch.where(torch.ones(x_blocksize, device=x_tensor.device))
+                    ).repeat(1, x_nnz)
+                )
+                .t()
+            )
+            x_values = x_block_values.flatten(0, 2)
+            x_nnz = x_values.size(0)
+        else:
+            raise NotImplementedError(f"_iter_tensor for {x_tensor.layout} input")
+        x_stride = get_stride(x_size)
+        # Use .data here to get around the version check
+        x_values = x_values.data
+        for i in range(x_nnz):
+            x_value = x_values[i]
+            for x_idx in product(*[range(m) for m in x_values.size()[1:]]):
+                indices = x_indices[i].tolist() + list(x_idx)
+                d_idx = sum(indices[k] * x_stride[k] for k in range(len(x_size)))
+                yield x_value, x_idx, d_idx
+    elif x_tensor.layout == torch._mkldnn:  # type: ignore[attr-defined]
+        for d_idx, x_idx in enumerate(product(*[range(m) for m in x_tensor.size()])):
+            # this is really inefficient, but without indexing implemented, there's
+            # not really a better way than converting back and forth
+            x_tensor_dense = x_tensor.to_dense()
+            yield x_tensor_dense, x_idx, d_idx
+    else:
+        # Use .data here to get around the version check
+        x_tensor = x_tensor.data
+        for d_idx, x_idx in enumerate(product(*[range(m) for m in x_tensor.size()])):
+            yield x_tensor, x_idx, d_idx
+
+
+def _get_numerical_jacobian(
+    fn, inputs, outputs=None, target=None, eps=1e-3, is_forward_ad=False
+) -> List[Tuple[torch.Tensor, ...]]:
+    """Compute the numerical Jacobian of `fn(inputs)` with respect to `target`.
+
+    If not specified, targets are the input. Returns M * N Jacobians where N is the
+    number of tensors in target that require grad and M is the number of non-integral
+    outputs.
+
+    Args:
+        fn: the function to compute the jacobian for
+        inputs: inputs to `fn`
+        outputs: provide precomputed outputs to avoid one extra invocation of fn
+        target: the Tensors wrt which Jacobians are calculated (default=`inputs`)
+        eps: the magnitude of the perturbation during finite differencing
+             (default=`1e-3`)
+        is_forward_ad: if this numerical jacobian is computed to be checked wrt
+                       forward AD gradients (this is used for error checking only)
+
+    Returns:
+        A list of M N-tuples of tensors
+
+    Note that `target` may not even be part of `input` to `fn`, so please be
+    **very careful** in this function not to clone `target`.
+    """
+    jacobians: List[Tuple[torch.Tensor, ...]] = []
+    if outputs is None:
+        outputs = _as_tuple(fn(*_as_tuple(inputs)))
+    if not is_forward_ad and any(o.is_complex() for o in outputs):
+        raise ValueError(
+            "Expected output to be non-complex. get_numerical_jacobian no "
+            "longer supports functions that return complex outputs."
+        )
+    if target is None:
+        target = inputs
+    inp_indices = [
+        i for i, a in enumerate(target) if is_tensor_like(a) and a.requires_grad
+    ]
+    for i, (inp, inp_idx) in enumerate(zip(_iter_tensors(target, True), inp_indices)):
+        jacobians += [
+            get_numerical_jacobian_wrt_specific_input(
+                fn,
+                inp_idx,
+                inputs,
+                outputs,
+                eps,
+                input=inp,
+                is_forward_ad=is_forward_ad,
+            )
+        ]
+    return jacobians
+
+
+def get_numerical_jacobian(fn, inputs, target=None, eps=1e-3, grad_out=1.0):
+    """Compute the numerical Jacobian for a given fn and its inputs.
+
+    This is a Deprecated API.
+
+    Args:
+        fn: the function to compute the Jacobian for (must take inputs as a tuple)
+        inputs: inputs to `fn`
+        target: the Tensors wrt which Jacobians are calculated (default=`inputs`)
+        eps: the magnitude of the perturbation during finite differencing
+             (default=`1e-3`)
+
+    Returns:
+        A list of Jacobians of `fn` (restricted to its first output) with respect to
+        each input or target, if provided.
+
+    Note that `target` may not even be part of `input` to `fn`, so please be
+    **very careful** in this function not to clone `target`.
+    """
+    warnings.warn(
+        "get_numerical_jacobian was part of PyTorch's private API and not "
+        "meant to be exposed. We are deprecating it and it will be removed "
+        "in a future version of PyTorch. If you have a specific use for "
+        "this or feature request for this to be a stable API, please file "
+        "us an issue at https://github.com/pytorch/pytorch/issues/new"
+    )
+    if (
+        grad_out != 1.0
+    ):  # grad_out param is only kept for backward compatibility reasons
+        raise ValueError(
+            "Expected grad_out to be 1.0. get_numerical_jacobian no longer "
+            "supports values of grad_out != 1.0."
+        )
+
+    def fn_pack_inps(*inps):
+        return fn(inps)
+
+    jacobians = _get_numerical_jacobian(fn_pack_inps, inputs, None, target, eps)
+
+    return tuple(jacobian_for_each_output[0] for jacobian_for_each_output in jacobians)
+
+
+def _compute_numerical_gradient(fn, entry, v, norm_v, nbhd_checks_fn):
+    # Computes numerical directional derivative as finite difference
+    # of function `fn` at input `entry`, perturbed by vector `v`.
+    if _is_sparse_compressed_tensor(entry):
+        # sparse compressed tensors don't implement sub/add/copy_
+        # yet. However, in non-masked semantics context entry and v
+        # have the same sparse indices ...
+        assert entry.layout == v.layout, (entry.layout, v.layout)
+        assert entry._nnz() == v._nnz(), (entry._nnz(), v._nnz(), entry.shape)
+        # ... the finite differencing can be performed on values only:
+        entry = entry.values()
+        v = v.values()
+        # we'll detach to avoid backward computations that sparse
+        # tensors have limited support for.
+        entry = entry.detach()
+
+    orig = entry.clone()
+    entry.copy_(orig - v)
+    outa = fn()
+    entry.copy_(orig + v)
+    outb = fn()
+    entry.copy_(orig)
+
+    def compute(a, b):
+        nbhd_checks_fn(a, b)
+        ret = (b - a) / (2 * norm_v)  # use central difference approx
+        return ret.detach().reshape(-1)
+
+    return tuple(compute(a, b) for (a, b) in zip(outa, outb))
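+
+
+# Illustrative sketch, not upstream code: the central difference used above,
+# (f(x + v) - f(x - v)) / (2 * eps), shown on a scalar quadratic where it is
+# exact: for f(x) = x**2 at x = 3 it recovers f'(3) = 6 (up to rounding).
+def _demo_central_difference(eps=1e-3):
+    x = torch.tensor(3.0)
+    v = torch.tensor(eps)
+    return ((x + v).pow(2) - (x - v).pow(2)) / (2 * eps)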
+
+
+def _compute_numerical_jvps_wrt_specific_input(
+    jvp_fn, delta, input_is_complex, is_forward_ad=False
+) -> List[torch.Tensor]:
+    # Computing the jacobian only works for real delta
+    # For details on the algorithm used here, see Section 3.5.3 of
+    # https://arxiv.org/pdf/1701.00392.pdf
+    # s = fn(z) where z = x for real valued input
+    # and z = x + yj for complex valued input
+    jvps: List[torch.Tensor] = []
+    ds_dx_tup = jvp_fn(delta[0] if isinstance(delta, tuple) else delta)
+
+    if input_is_complex:  # C -> R
+        ds_dy_tup = (
+            jvp_fn(delta[1] * 1j) if isinstance(delta, tuple) else jvp_fn(delta * 1j)
+        )
+        for ds_dx, ds_dy in zip(ds_dx_tup, ds_dy_tup):
+            assert not ds_dx.is_complex()
+            # conjugate wirtinger derivative
+            conj_w_d = ds_dx + ds_dy * 1j
+            jvps.append(conj_w_d)
+    else:
+        for ds_dx in ds_dx_tup:  # R -> R or (R -> C for the forward AD case)
+            assert is_forward_ad or not ds_dx.is_complex()
+            jvps.append(ds_dx)
+    return jvps
+
+
+def _combine_jacobian_cols(
+    jacobians_cols: Dict[int, List[torch.Tensor]], outputs, input, numel
+) -> Tuple[torch.Tensor, ...]:
+    # jacobian_cols maps column_idx -> output_idx -> single column of jacobian Tensor
+    # we return a list that maps output_idx -> full jacobian Tensor
+    jacobians = _allocate_jacobians_with_outputs(
+        outputs, numel, dtype=input.dtype if input.dtype.is_complex else None
+    )
+    for i, jacobian in enumerate(jacobians):
+        for k, v in jacobians_cols.items():
+            jacobian[k] = v[i]
+    return jacobians
+
+
+def _prepare_input(
+    input: torch.Tensor, maybe_perturbed_input: Optional[torch.Tensor], fast_mode=False
+) -> torch.Tensor:
+    # Prepares the inputs to be passed into the function while including the new
+    # modified input.
+    if input.layout == torch._mkldnn:  # type: ignore[attr-defined] # no attr _mkldnn
+        # Convert back to mkldnn
+        if maybe_perturbed_input is not None:
+            return maybe_perturbed_input.to_mkldnn()
+        else:
+            return input
+    elif _is_sparse_any_tensor(input):
+        if fast_mode and maybe_perturbed_input is not None:
+            # entry is already a "cloned" version of the original tensor
+            # thus changes to entry are not reflected in the input
+            return maybe_perturbed_input
+        else:
+            return input
+    else:
+        # We cannot use entry (input.data) if we want gradgrad to work because
+        # fn (in the gradgrad case) needs to compute grad wrt input
+        return input
+
+
+def _check_outputs_same_dtype_and_shape(output1, output2, eps, idx=None) -> None:
+    # Check that the returned outputs don't have different dtype or shape when you
+    # perturb the input
+    on_index = "on index {idx} " if idx is not None else ""
+    assert output1.shape == output2.shape, (
+        f"Expected `func` to return outputs with the same shape"
+        f" when inputs are perturbed {on_index}by {eps}, but got:"
+        f" shapes {output1.shape} and {output2.shape}."
+    )
+    assert output1.dtype == output2.dtype, (
+        f"Expected `func` to return outputs with the same dtype"
+        f" when inputs are perturbed {on_index}by {eps}, but got:"
+        f" dtypes {output1.dtype} and {output2.dtype}."
+    )
+
+
+def get_numerical_jacobian_wrt_specific_input(
+    fn, input_idx, inputs, outputs, eps, input=None, is_forward_ad=False
+) -> Tuple[torch.Tensor, ...]:
+    # Computes the numerical jacobians wrt to a single input. Returns N jacobian
+    # tensors, where N is the number of outputs. We use a dictionary for
+    # jacobian_cols because indices aren't necessarily consecutive for sparse inputs
+    # When we perturb only a single element of the input tensor at a time, the jvp
+    # is equivalent to a single col of the Jacobian matrix of fn.
+    jacobian_cols: Dict[int, List[torch.Tensor]] = {}
+    input = inputs[input_idx] if input is None else input
+    assert input.requires_grad
+    for x, idx, d_idx in _iter_tensor(input):
+        wrapped_fn = _with_prepare_inputs(fn, inputs, input_idx, x)
+        input_to_perturb = x[idx]
+        nbhd_checks_fn = functools.partial(
+            _check_outputs_same_dtype_and_shape, idx=idx, eps=eps
+        )
+        jvp_fn = _get_numerical_jvp_fn(
+            wrapped_fn, input_to_perturb, eps, nbhd_checks_fn
+        )
+        jacobian_cols[d_idx] = _compute_numerical_jvps_wrt_specific_input(
+            jvp_fn, eps, x.is_complex(), is_forward_ad
+        )
+    return _combine_jacobian_cols(jacobian_cols, outputs, input, input.numel())
+
+
+def _get_analytical_jacobian_forward_ad(
+    fn, inputs, outputs, *, check_grad_dtypes=False, all_u=None
+) -> Tuple[Tuple[torch.Tensor, ...], ...]:
+    """Compute the analytical Jacobian using forward mode AD of `fn(inputs)` using forward mode AD with respect to `target`.
+
+    Return N * M Jacobians where N is the number of tensors in target that require grad and
+    M is the number of non-integral outputs.
+    Contrary to other functions here, this function requires "inputs" to actually be used by the function.
+    The computed value is expected to be wrong if the function captures the inputs by side effect instead of
+    using the passed ones (many torch.nn tests do this).
+
+    Args:
+        fn: the function to compute the jacobian for
+        inputs: inputs to `fn`
+        outputs: provide precomputed outputs to avoid one extra invocation of fn
+        check_grad_dtypes: if True, will check that the gradient dtype are valid
+        all_u (optional): if provided, the Jacobian will be right multiplied with this vector
+
+    Returns:
+        A tuple of M N-tuples of tensors
+    """
+    # To avoid early import issues
+    fwAD = torch.autograd.forward_ad
+
+    tensor_inputs = tuple(i for i in inputs if is_tensor_like(i) and i.requires_grad)
+
+    if any(i.is_complex() for i in tensor_inputs):
+        raise ValueError(
+            "Expected inputs to be non-complex for _get_analytical_jacobian_forward_ad."
+        )
+
+    if all_u:
+        jacobians = tuple(
+            _allocate_jacobians_with_outputs(outputs, 1) for i in tensor_inputs
+        )
+    else:
+        jacobians = tuple(
+            _allocate_jacobians_with_outputs(outputs, i.numel()) for i in tensor_inputs
+        )
+
+    with fwAD.dual_level():
+        fw_grads = []
+        dual_inputs = []
+        for i, inp in enumerate(inputs):
+            if is_tensor_like(inp) and inp.requires_grad:
+                if inp.layout == torch._mkldnn:  # type: ignore[attr-defined]
+                    raise ValueError(
+                        "MKLDNN inputs are not support for forward AD gradcheck."
+                    )
+
+                inp = fwAD.make_dual(inp.detach(), torch.zeros_like(inp))
+                # If inp is a differentiable view, the dual might not be the tangent given to
+                # make_dual, so read it explicitly from the dual tensor
+                fw_grads.append(fwAD.unpack_dual(inp)[1])
+            dual_inputs.append(inp)
+
+        if all_u:
+            # Do the full reduction in one pass
+            # To be consistent with numerical evaluation, we actually compute one reduction per input
+            for i, (fw_grad, u) in enumerate(zip(fw_grads, all_u)):
+                fw_grad.copy_(u.view_as(fw_grad))
+                raw_outputs = _as_tuple(fn(*dual_inputs))
+                dual_outputs = filter(_is_float_or_complex_tensor, raw_outputs)
+                for index_o, d_o in enumerate(dual_outputs):
+                    val, res = fwAD.unpack_dual(d_o)
+                    if (
+                        check_grad_dtypes
+                        and res is not None
+                        and val.is_complex() != res.is_complex()
+                    ):
+                        raise GradcheckError("Forward AD gradient has dtype mismatch.")
+
+                    # Remove extra dimension of size 1 corresponding to the reduced input
+                    jacobians[i][index_o].squeeze_(0)
+                    if res is None:
+                        jacobians[i][index_o].zero_()
+                    else:
+                        jacobians[i][index_o].copy_(res.reshape(-1))
+                fw_grad.zero_()
+        else:
+            # Reconstruct the full Jacobian column by column
+            for i, fw_grad in enumerate(fw_grads):
+                for lin_idx, grad_idx in enumerate(
+                    product(*[range(m) for m in fw_grad.size()])
+                ):
+                    fw_grad[grad_idx] = 1.0
+                    raw_outputs = _as_tuple(fn(*dual_inputs))
+                    dual_outputs = filter(_is_float_or_complex_tensor, raw_outputs)
+                    for index_o, d_o in enumerate(dual_outputs):
+                        val, res = fwAD.unpack_dual(d_o)
+                        if (
+                            check_grad_dtypes
+                            and res is not None
+                            and val.is_complex() != res.is_complex()
+                        ):
+                            raise GradcheckError(
+                                "Forward AD gradient has dtype mismatch."
+                            )
+
+                        if res is None:
+                            jacobians[i][index_o][lin_idx].zero_()
+                        else:
+                            jacobians[i][index_o][lin_idx].copy_(res.reshape(-1))
+                    fw_grad[grad_idx] = 0.0
+
+    return jacobians
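+
+
+# Illustrative sketch, not upstream code: the forward-AD primitive the function
+# above applies column by column, shown with the public `torch.autograd.forward_ad`
+# API. One evaluation with a dual input yields one Jacobian column (a JVP).
+# Defined for documentation only and never called by library code.
+def _demo_single_jvp_column():
+    fwAD = torch.autograd.forward_ad
+    x = torch.randn(3)
+    tangent = torch.zeros(3)
+    tangent[0] = 1.0                      # probe the first input coordinate
+    with fwAD.dual_level():
+        dual_x = fwAD.make_dual(x, tangent)
+        primal, jvp = fwAD.unpack_dual(dual_x.sin())
+    # `jvp` is cos(x) * tangent, i.e. the first column of the Jacobian of sin.
+    return primal, jvp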
+
+
+def _get_input_to_perturb(input):
+    # Prepare the input so that it can be modified in-place and do certain
+    # operations that require the tensor to have strides. If fast_mode=False,
+    # _iter_tensor would handle the below cases:
+    if input.layout == torch._mkldnn:  # type: ignore[attr-defined] # no attr _mkldnn
+        # Convert to dense so we can perform operations that require strided tensors
+        input_to_perturb = input.to_dense()
+    elif _is_sparse_any_tensor(input):
+        # Clone because input may require grad, and copy_ calls resize_,
+        # which is not allowed for .data
+        input_to_perturb = input.clone()
+    else:
+        input_to_perturb = input.data
+    return input_to_perturb
+
+
+def _with_prepare_inputs(fn, inputs, input_idx, input_to_perturb, fast_mode=False):
+    # Wraps `fn` so that its inputs are already supplied
+    def wrapped_fn():
+        inp = tuple(
+            _prepare_input(a, input_to_perturb if i == input_idx else None, fast_mode)
+            if is_tensor_like(a)
+            else a
+            for i, a in enumerate(_as_tuple(inputs))
+        )
+        return tuple(a.clone() for a in _as_tuple(fn(*inp)))
+
+    return wrapped_fn
+
+
+def _get_numerical_jvp_fn(wrapped_fn, input_to_perturb, eps, nbhd_checks_fn):
+    # Wraps jvp_fn so that certain arguments are already supplied
+    def jvp_fn(delta):
+        return _compute_numerical_gradient(
+            wrapped_fn, input_to_perturb, delta, eps, nbhd_checks_fn
+        )
+
+    return jvp_fn
+
+
+def _reshape_tensor_or_tuple(u, shape):
+    # We don't need to reshape when input corresponding to u is sparse
+    if isinstance(u, tuple):
+        if not _is_sparse_any_tensor(u[0]):
+            return (u[0].reshape(shape), u[1].reshape(shape))
+    else:
+        if not _is_sparse_any_tensor(u):
+            return u.reshape(shape)
+    return u
+
+
+def _mul_tensor_or_tuple(u, k):
+    if isinstance(u, tuple):
+        return (k * u[0], k * u[1])
+    else:
+        return k * u
+
+
+def _get_numerical_jvp_wrt_specific_input(
+    fn, input_idx, inputs, u, eps, is_forward_ad=False
+) -> List[torch.Tensor]:
+    input = inputs[input_idx]
+    input_to_perturb = _get_input_to_perturb(input)
+    wrapped_fn = _with_prepare_inputs(fn, inputs, input_idx, input_to_perturb, True)
+    nbhd_checks_fn = functools.partial(_check_outputs_same_dtype_and_shape, eps=eps)
+    jvp_fn = _get_numerical_jvp_fn(wrapped_fn, input_to_perturb, eps, nbhd_checks_fn)
+    u = _reshape_tensor_or_tuple(u, input_to_perturb.shape)
+    u = _mul_tensor_or_tuple(u, eps)
+    return _compute_numerical_jvps_wrt_specific_input(
+        jvp_fn, u, input.is_complex(), is_forward_ad
+    )
+
+
+def _get_numerical_vJu(
+    fn, inputs, inp_indices, func_out, all_u, all_v, eps, is_forward_ad
+):
+    # Note that all_v can also be None, in that case, this function only computes Ju.
+    reduced_jacobians: List[List[torch.Tensor]] = []
+    for i, (inp_idx, u) in enumerate(zip(inp_indices, all_u)):
+        all_Ju = _get_numerical_jvp_wrt_specific_input(
+            fn, inp_idx, inputs, u, eps, is_forward_ad
+        )
+        # Filter out the Ju for non floating point outputs
+        filtered_Ju = []
+        func_out = _as_tuple(func_out)
+        assert len(all_Ju) == len(func_out)
+        for Ju, output in zip(all_Ju, func_out):
+            if _is_float_or_complex_tensor(output):
+                filtered_Ju.append(Ju)
+            else:
+                # TODO: handle the other Ju
+                pass
+        if all_v is not None:
+            jacobian_scalars: List[torch.Tensor] = []
+            for v, Ju in zip(all_v, filtered_Ju):
+                jacobian_scalars.append(_dot_with_type_promotion(v, Ju))
+            reduced_jacobians.append(jacobian_scalars)
+        else:
+            reduced_jacobians.append(filtered_Ju)
+    return reduced_jacobians
+
+
+def _check_jacobians_equal(j1, j2, atol):
+    # Check whether the max difference between two Jacobian tensors are within some
+    # tolerance `atol`.
+    for j1_x, j2_x in zip(j1, j2):
+        if j1_x.numel() != 0 and (j1_x - j2_x).abs().max() > atol:
+            return False
+    return True
+
+
+def _stack_and_check_tensors(
+    list_of_list_of_tensors, inputs, numel_outputs
+) -> Tuple[Tuple[torch.Tensor, ...], bool, bool]:
+    # For the ith tensor in the inner list checks whether it has the same size and
+    # dtype as the ith differentiable input.
+    out_jacobians = _allocate_jacobians_with_inputs(inputs, numel_outputs)
+    diff_input_list = list(_iter_tensors(inputs, True))
+    correct_grad_sizes = True
+    correct_grad_types = True
+    for i, tensor_list in enumerate(list_of_list_of_tensors):
+        inp = diff_input_list[i]
+        out_jacobian = out_jacobians[i]
+        for j, tensor in enumerate(tensor_list):
+            if tensor is not None and tensor.size() != inp.size():
+                correct_grad_sizes = False
+            elif tensor is not None and tensor.dtype != inp.dtype:
+                correct_grad_types = False
+            if tensor is None:
+                out_jacobian[:, j].zero_()
+            else:
+                dense = (
+                    tensor.to_dense() if not tensor.layout == torch.strided else tensor
+                )
+                assert out_jacobian[:, j].numel() == dense.numel()
+                out_jacobian[:, j] = dense.reshape(-1)
+    return out_jacobians, correct_grad_sizes, correct_grad_types
+
+
+FAILED_NONDET_MSG = """\n
+NOTE: If your op relies on non-deterministic operations i.e., it is listed here:
+https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html
+this failure might be expected.
+
+If you are adding a new operator, please file an issue and then use one of the
+workarounds. The workaround depends on how your test invokes gradcheck/gradgradcheck.
+If the test
+- manually invokes gradcheck/gradgradcheck, then call gradcheck/gradgradcheck
+  with `nondet_tol=<tol>` as a keyword argument.
+- is OpInfo-based (e.g., in test_ops_gradients.py), then modify the OpInfo for the test
+  to have `gradcheck_nondet_tol=<tol>`.
+- is a Module test (e.g., in common_nn.py), then modify the corresponding
+  module_test entry to have `gradcheck_nondet_tol=<tol>`
+"""
+
+
+def _check_analytical_jacobian_attributes(
+    inputs, output, nondet_tol, check_grad_dtypes, fast_mode=False, v=None
+) -> Tuple[torch.Tensor, ...]:
+    # This is used by both fast and slow mode:
+    #  - For slow mode, vjps[i][j] is the jth row of the Jacobian wrt the ith
+    #    input.
+    #  - For fast mode, vjps[i][0] is a linear combination of the rows
+    #    of the Jacobian wrt the ith input
+    diff_input_list = list(_iter_tensors(inputs, True))
+
+    def vjp_fn(grad_output):
+        return torch.autograd.grad(
+            output, diff_input_list, grad_output, retain_graph=True, allow_unused=True
+        )
+
+    # Compute everything twice to check for nondeterminism (which we call reentrancy)
+    if fast_mode:
+        vjps1 = _get_analytical_vjps_wrt_specific_output(vjp_fn, output.clone(), v)
+        vjps2 = _get_analytical_vjps_wrt_specific_output(vjp_fn, output.clone(), v)
+    else:
+        vjps1 = _compute_analytical_jacobian_rows(vjp_fn, output.clone())
+        vjps2 = _compute_analytical_jacobian_rows(vjp_fn, output.clone())
+
+    output_numel = output.numel() if not fast_mode else 1
+    jacobians1, types_ok, sizes_ok = _stack_and_check_tensors(
+        vjps1, inputs, output_numel
+    )
+    jacobians2, _, _ = _stack_and_check_tensors(vjps2, inputs, output_numel)
+    reentrant = _check_jacobians_equal(jacobians1, jacobians2, nondet_tol)
+
+    if not types_ok and check_grad_dtypes:
+        raise GradcheckError("Gradient has dtype mismatch")
+    if not sizes_ok:
+        raise GradcheckError("Analytical gradient has incorrect size")
+    if not reentrant:
+        raise GradcheckError(
+            "Backward is not reentrant, i.e., running backward with "
+            "same input and grad_output multiple times gives different values, "
+            "although analytical gradient matches numerical gradient."
+            f"The tolerance for nondeterminism was {nondet_tol}." + FAILED_NONDET_MSG
+        )
+    return jacobians1
+
+
+def _get_analytical_vJu_backward_mode(
+    inputs, outputs, nondet_tol, check_grad_dtypes, all_v, all_u
+):
+    reduced_jacobians: List[List[torch.Tensor]] = []
+    for output, v in zip(outputs, all_v):
+        all_vJ = _check_analytical_jacobian_attributes(
+            inputs, output, nondet_tol, check_grad_dtypes, fast_mode=True, v=v
+        )
+        jacobian_scalars: List[torch.Tensor] = []
+        for vJ, u in zip(all_vJ, all_u):
+            # vJ is kept as a 2-d tensor so that we can reuse the error-checking
+            # logic from slow mode; squeeze it back down to 1-d here.
+            vJ = vJ.T.squeeze(0)
+            if vJ.is_complex():  # C -> R
+                tv = torch.view_as_real(vJ.resolve_conj())
+                tr = tv.select(-1, 0)
+                ti = tv.select(-1, 1)
+                jacobian_scalars.append(tr.dot(u[0]) + 1j * ti.dot(u[1]))
+            else:  # R -> R
+                jacobian_scalars.append(vJ.dot(u))
+        reduced_jacobians.append(jacobian_scalars)
+    return reduced_jacobians
+
+
+def get_analytical_jacobian(inputs, output, nondet_tol=0.0, grad_out=1.0):
+    # Replicates the behavior of the old get_analytical_jacobian before the refactor
+    # This shares much of its code with _check_analytical_jacobian_attributes
+    warnings.warn(
+        "get_analytical_jacobian was part of PyTorch's private API and not "
+        "meant to be exposed. We are deprecating it and it will be removed "
+        "in a future version of PyTorch. If you have a specific use for "
+        "this or feature request for this to be a stable API, please file "
+        "us an issue at https://github.com/pytorch/pytorch/issues/new"
+    )
+    if (
+        grad_out != 1.0
+    ):  # grad_out param is only kept for backward compatibility reasons
+        raise ValueError(
+            "Expected grad_out to be 1.0. get_analytical_jacobian no longer "
+            "supports values of grad_out != 1.0."
+        )
+    if output.is_complex():
+        raise ValueError(
+            "Expected output to be non-complex. get_analytical_jacobian no "
+            "longer supports functions that return complex outputs."
+        )
+    diff_input_list = list(_iter_tensors(inputs, True))
+
+    def vjp_fn(grad_output):
+        return torch.autograd.grad(
+            output, diff_input_list, grad_output, retain_graph=True, allow_unused=True
+        )
+
+    # Compute everything twice to check for nondeterminism (which we call reentrancy)
+    vjps1 = _compute_analytical_jacobian_rows(vjp_fn, output.clone())
+    vjps2 = _compute_analytical_jacobian_rows(vjp_fn, output.clone())
+
+    output_numel = output.numel()
+    jacobians1, types_ok, sizes_ok = _stack_and_check_tensors(
+        vjps1, inputs, output_numel
+    )
+    jacobians2, _, _ = _stack_and_check_tensors(vjps2, inputs, output_numel)
+    reentrant = _check_jacobians_equal(jacobians1, jacobians2, nondet_tol)
+
+    return jacobians1, reentrant, sizes_ok, types_ok
+
+
+def _get_analytical_jacobian(inputs, outputs, input_idx, output_idx):
+    # Computes the analytical Jacobian in slow mode for a single input-output pair.
+    # Forgoes performing checks on dtype, shape, and reentrancy.
+    jacobians = _check_analytical_jacobian_attributes(
+        inputs, outputs[output_idx], nondet_tol=float("inf"), check_grad_dtypes=False
+    )
+    return jacobians[input_idx]
+
+
+def _compute_analytical_jacobian_rows(
+    vjp_fn, sample_output
+) -> List[List[Optional[torch.Tensor]]]:
+    # Computes Jacobian row-by-row by projecting `vjp_fn` = v^T J on standard basis
+    # vectors: vjp_fn(e) = e^T J is a corresponding row of the Jacobian.
+    # NB: this function does not assume vjp_fn(v) to return tensors with the same
+    # number of elements for different v. This is checked when we later combine the
+    # rows into a single tensor.
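+    #
+    # A minimal illustrative sketch of the projection trick (hypothetical names,
+    # not executed here): for a differentiable output y = f(x),
+    #
+    #     e_j = torch.zeros_like(y).reshape(-1)
+    #     e_j[j] = 1.0
+    #     row_j = torch.autograd.grad(y, x, e_j.view_as(y), retain_graph=True)[0]
+    #
+    # yields the jth Jacobian row (shaped like x), which is what the loop below
+    # produces through `vjp_fn`.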
+    grad_out_base = torch.zeros_like(
+        sample_output, memory_format=torch.legacy_contiguous_format
+    )
+    flat_grad_out = grad_out_base.view(-1)
+    # jacobians_rows[i][j] is the Jacobian jth row for the ith input
+    jacobians_rows: List[List[Optional[torch.Tensor]]] = []
+    for j in range(flat_grad_out.numel()):
+        flat_grad_out.zero_()
+        flat_grad_out[j] = 1.0  # projection for jth row of Jacobian
+        grad_inputs = vjp_fn(grad_out_base)
+        for i, d_x in enumerate(grad_inputs):
+            if j == 0:
+                jacobians_rows.append([])
+            jacobians_rows[i] += [
+                d_x.clone() if isinstance(d_x, torch.Tensor) else None
+            ]
+    return jacobians_rows
+
+
+def _get_analytical_vjps_wrt_specific_output(
+    vjp_fn, sample_output, v
+) -> List[List[Optional[torch.Tensor]]]:
+    vjps: List[List[Optional[torch.Tensor]]] = []
+    grad_inputs = vjp_fn(v.reshape(sample_output.shape))
+    for vjp in grad_inputs:
+        vjps.append([vjp.clone() if isinstance(vjp, torch.Tensor) else None])
+    return vjps
+
+
+def _check_inputs(tupled_inputs) -> bool:
+    # Make sure that gradients are saved for at least one input
+    any_input_requiring_grad = False
+    for idx, inp in enumerate(tupled_inputs):
+        if is_tensor_like(inp) and inp.requires_grad:
+            if not (inp.dtype == torch.float64 or inp.dtype == torch.complex128):
+                warnings.warn(
+                    f"Input #{idx} requires gradient and "
+                    "is not a double precision floating point or complex. "
+                    "This check will likely fail if all the inputs are "
+                    "not of double precision floating point or complex. "
+                )
+            if inp.is_sparse:
+                content = inp._values()
+            elif _is_sparse_compressed_tensor(inp):
+                content = inp.values()
+            else:
+                content = inp
+            # TODO: To cover more problematic cases, replace stride = 0 check with
+            # "any overlap in memory" once we have a proper function to check it.
+            if content.layout is not torch._mkldnn:  # type: ignore[attr-defined]
+                if not all(
+                    st > 0 or sz <= 1
+                    for st, sz in zip(content.stride(), content.size())
+                ):
+                    raise RuntimeError(
+                        f"The {idx}th input has a dimension with stride 0. gradcheck only "
+                        "supports inputs that are non-overlapping to be able to "
+                        "compute the numerical gradients correctly. You should call "
+                        ".contiguous on the input before passing it to gradcheck."
+                    )
+            any_input_requiring_grad = True
+
+    if not any_input_requiring_grad:
+        raise ValueError(
+            "gradcheck expects at least one input tensor to require gradient, "
+            "but none of the them have requires_grad=True."
+        )
+    return True
+
+
+def _check_outputs(outputs) -> None:
+    if any(_is_sparse_any_tensor(t) for t in outputs if isinstance(t, torch.Tensor)):
+        # it is easier to call to_dense() on the sparse output than
+        # to modify analytical jacobian
+        raise ValueError(
+            "Sparse output is not supported at gradcheck yet. "
+            "Please call to_dense(masked_grad=...) on the output of fn for gradcheck."
+        )
+    if any(t.layout == torch._mkldnn for t in outputs if isinstance(t, torch.Tensor)):  # type: ignore[attr-defined]
+        raise ValueError(
+            "MKLDNN output is not supported at gradcheck yet. "
+            "Please call to_dense(masked_grad=...) on the output of fn for gradcheck."
+        )
+
+
+def _check_no_differentiable_outputs(
+    func, inputs, func_out, eps, *, is_forward_ad
+) -> bool:
+    # When there are no differentiable outputs, numerical gradient for a function is
+    # expected to be zero.
+    jacobians_all_inputs_outputs = _get_numerical_jacobian(
+        func, inputs, func_out, eps=eps, is_forward_ad=is_forward_ad
+    )
+    for jacobians_all_outputs_and_fixed_input in jacobians_all_inputs_outputs:
+        for jacobian in jacobians_all_outputs_and_fixed_input:
+            if torch.ne(jacobian, 0).sum() > 0:
+                raise GradcheckError(
+                    "Numerical gradient for function expected to be zero"
+                )
+    return True
+
+
+def _check_no_differentiable_outputs_fast(
+    func, func_out, all_inputs, inputs_indices, all_u, eps, nondet_tol
+):
+    for inp_idx, u in zip(inputs_indices, all_u):
+        jvps = _get_numerical_jvp_wrt_specific_input(func, inp_idx, all_inputs, u, eps)
+        for jvp in jvps:
+            if jvp.numel() == 0:
+                continue
+            if (jvp - torch.zeros_like(jvp)).abs().max() > nondet_tol:
+                raise GradcheckError(
+                    "Numerical gradient for function expected to be zero"
+                )
+    return True
+
+
+FAILED_BATCHED_GRAD_MSG = """
+gradcheck or gradgradcheck failed while testing batched gradient computation.
+This could have been invoked in a number of ways (via a test that calls
+gradcheck/gradgradcheck directly or via an autogenerated test).
+
+If you are adding a new operator, please file an issue and then use one of the
+workarounds. The workaround depends on how your test invokes gradcheck/gradgradcheck.
+If the test
+- manually invokes gradcheck/gradgradcheck, then call gradcheck/gradgradcheck
+  with `check_batched_grad=False` as a keyword argument.
+- is OpInfo-based (e.g., in test_ops_gradients.py), then modify the OpInfo for the test
+  to have `check_batched_grad=False` and/or `check_batched_gradgrad=False`.
+
+If you're modifying an existing operator that supports batched grad computation,
+or wish to make a new operator work with batched grad computation, please read
+the following.
+
+To compute batched grads (e.g., jacobians, hessians), we vmap over the backward
+computation. The most common failure case is if there is a 'vmap-incompatible
+operation' in the backward pass. Please see
+NOTE: [How to write vmap-compatible backward formulas]
+in the codebase for an explanation of how to fix this.
+""".strip()
+
+FAILED_BATCHED_GRAD_MSG_FWD_AD = """
+gradcheck failed while testing batched gradient computation with forward-mode AD.
+This test is enabled automatically when both `check_batched_grad=True`
+and `check_forward_ad=True`, but can be disabled in the following ways
+depending on how the test was invoked (via a test that calls gradcheck
+directly or via an autogenerated test).
+
+If you are adding a new operator, please file an issue and then use one of the
+workarounds. The workaround depends on how your test invokes gradcheck/gradgradcheck.
+If the test
+- manually invokes gradcheck/gradgradcheck, then call gradcheck/gradgradcheck
+  with `check_batched_forward_grad=False` as a keyword argument.
+- is OpInfo-based (e.g., in test_ops_gradients.py), then modify the OpInfo for the test
+  to have `check_batched_forward_grad=False`
+"""
+
+
+def _get_failed_batched_grad_test_msg(
+    output_idx, input_idx, res, exp, is_forward_ad=False
+):
+    return f"""
+For output {output_idx} and input {input_idx}:
+
+{FAILED_BATCHED_GRAD_MSG_FWD_AD if is_forward_ad else FAILED_BATCHED_GRAD_MSG}
+
+Got:
+{res}
+
+Expected:
+{exp}
+""".strip()
+
+
+def _test_batched_grad_forward_ad(func, inputs) -> bool:
+    fwAD = torch.autograd.forward_ad  # To avoid early import issues (do we need this?)
+    assert isinstance(inputs, tuple)
+
+    for input_idx, current_input in enumerate(inputs):
+        if not (is_tensor_like(current_input) and current_input.requires_grad):
+            continue
+
+        def jvp(tangent: torch.Tensor):
+            with fwAD.dual_level():
+                dual = fwAD.make_dual(current_input.detach(), tangent)
+                inputs_with_dual = tuple(
+                    dual
+                    if idx == input_idx
+                    else (inp.detach() if is_tensor_like(inp) else inp)
+                    for idx, inp in enumerate(inputs)
+                )
+                dual_outputs = _as_tuple(func(*inputs_with_dual))
+                ret = []
+                for dual_output in dual_outputs:
+                    if dual_output is None:
+                        continue
+                    primal_out, tangent_out = fwAD.unpack_dual(dual_output)
+                    if tangent_out is not None:
+                        ret.append(tangent_out)
+                    else:
+                        ret.append(
+                            torch.zeros(
+                                [], dtype=primal_out.dtype, device=primal_out.device
+                            ).expand(primal_out.shape)
+                        )
+                return tuple(ret)
+
+        if not _is_float_or_complex_tensor(current_input):
+            continue
+
+        tangents = [torch.randn_like(current_input) for _ in range(2)]
+        expected = [jvp(t) for t in tangents]
+        expected = [torch.stack(shards) for shards in zip(*expected)]
+
+        try:
+            result = _vmap(jvp)(torch.stack(tangents))
+        except RuntimeError as ex:
+            # Rethrow to provide a better error message
+            raise GradcheckError(
+                f"While computing batched gradients, got: {ex}\n\n{FAILED_BATCHED_GRAD_MSG_FWD_AD}"
+            ) from ex
+
+        for output_idx, (res, exp) in enumerate(zip(result, expected)):
+            if torch.allclose(res, exp):
+                continue
+            raise GradcheckError(
+                _get_failed_batched_grad_test_msg(
+                    output_idx, input_idx, res, exp, is_forward_ad=True
+                )
+            )
+    return True
+
+
+def _test_batched_grad(input, output, output_idx) -> bool:
+    # NB: _test_batched_grad compares two autograd.grad invocations with a single
+    # vmap(autograd.grad) invocation. It's not exactly a "gradcheck" in the
+    # sense that we're not comparing an analytical jacobian with a numeric one,
+    # but it is morally similar (we could have computed a full analytic jac
+    # via vmap, but that is potentially slow)
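+    #
+    # Illustrative sketch of the comparison being made (hypothetical names, not
+    # executed here): for two cotangents v1 and v2,
+    #
+    #     expected = torch.stack([vjp(v1)[0], vjp(v2)[0]])
+    #     batched = vmap(vjp)(torch.stack([v1, v2]))[0]
+    #
+    # and `batched` should match `expected` (here for the first differentiable input).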
+    diff_input_list = list(_iter_tensors(input, True))
+    grad = functools.partial(
+        torch.autograd.grad,
+        output,
+        diff_input_list,
+        retain_graph=True,
+        allow_unused=True,
+    )
+
+    def vjp(v):
+        results = grad(v)
+        results = tuple(
+            grad
+            if grad is not None
+            else torch.zeros([], dtype=inp.dtype, device=inp.device).expand(inp.shape)
+            for grad, inp in zip(results, diff_input_list)
+        )
+        return results
+
+    grad_outputs = [torch.randn_like(output) for _ in range(2)]
+
+    expected = [vjp(gO) for gO in grad_outputs]
+    expected = [torch.stack(shards) for shards in zip(*expected)]
+
+    # Squash warnings since these are expected to happen in most cases
+    # NB: this doesn't work for CUDA tests: https://github.com/pytorch/pytorch/issues/50209
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", message="There is a performance drop")
+        warnings.filterwarnings("ignore", message="Please use torch.vmap")
+        try:
+            result = vmap(vjp)(torch.stack(grad_outputs))
+        except RuntimeError as ex:
+            # It's OK that we're not raising the error at the correct callsite.
+            # That's because the callsite is always going to inside the Python
+            # autograd.grad instead of the C++ traceback of what line in the
+            # backward formula
+            raise GradcheckError(
+                f"While computing batched gradients, got: {ex}\n\n{FAILED_BATCHED_GRAD_MSG}"
+            ) from ex
+
+    for input_idx, (res, exp) in enumerate(zip(result, expected)):
+        if torch.allclose(res, exp):
+            continue
+        raise GradcheckError(
+            _get_failed_batched_grad_test_msg(output_idx, input_idx, res, exp)
+        )
+    return True
+
+
+def _test_backward_mul_by_grad_output(outputs, inputs, masked) -> bool:
+    # Tests that backward is multiplied by grad_output
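+    # The check feeds an all-zeros grad_output for every output; a backward
+    # formula that correctly scales by grad_output must then produce input
+    # gradients that are all zero (or undefined), with matching layout, dtype,
+    # device, and size.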
+    diff_input_list: List[torch.Tensor] = list(_iter_tensors(inputs, True))
+    if not diff_input_list:
+        raise GradcheckError("no Tensors requiring grad found in input")
+    grads_input = torch.autograd.grad(
+        outputs,
+        diff_input_list,
+        [
+            torch.zeros_like(o, memory_format=torch.legacy_contiguous_format)
+            for o in outputs
+        ],
+        allow_unused=True,
+    )
+    for gi, di in zip(grads_input, diff_input_list):
+        if gi is None:
+            continue
+        if isinstance(gi, torch.Tensor) and gi.layout != torch.strided:
+            if gi.layout != di.layout:
+                raise GradcheckError(
+                    "grad is incorrect layout ("
+                    + str(gi.layout)
+                    + " is not "
+                    + str(di.layout)
+                    + ")"
+                )
+            if _is_sparse_any_tensor(gi):
+                sparse_kind = str(gi.layout).replace("torch.", "").replace("_coo", "")
+                if gi.sparse_dim() != di.sparse_dim():
+                    raise GradcheckError(
+                        f"grad is {sparse_kind} tensor, but has incorrect sparse_dim"
+                        f" {gi.sparse_dim()}, expected {di.sparse_dim()}"
+                    )
+                if gi.dense_dim() != di.dense_dim():
+                    raise GradcheckError(
+                        f"grad is {sparse_kind} tensor, but has incorrect dense_dim"
+                        f" {gi.dense_dim()}, expected {di.dense_dim()}"
+                    )
+            gi = gi.to_dense()
+            di = di.to_dense()
+        if masked:
+            if not torch.allclose(gi, torch.zeros_like(gi)):
+                raise GradcheckError("backward not multiplied by grad_output")
+        elif not gi.eq(0).all():
+            raise GradcheckError("backward not multiplied by grad_output")
+        if gi.dtype != di.dtype:
+            raise GradcheckError("grad is incorrect type")
+        if gi.device != di.device:
+            raise GradcheckError("grad is incorrect device")
+        if gi.size() != di.size():
+            raise GradcheckError("grad is incorrect size")
+    return True
+
+
+def _test_undefined_forward_mode(func, outputs, inputs):
+    fwAD = torch.autograd.forward_ad
+
+    inp_tensors_idx, inp_tensors = _get_inp_tensors(inputs)
+    all_v, all_u, all_u_dense = _make_vectors(inp_tensors, outputs, use_forward_ad=True)
+
+    tensor_inputs = tuple(i for i in inputs if is_tensor_like(i) and i.requires_grad)
+
+    with fwAD.dual_level():
+        fw_grads = []
+        dual_inputs = []
+        tensor_indices = set()
+        for i, inp in enumerate(inputs):
+            if is_tensor_like(inp) and inp.requires_grad:
+                if inp.layout == torch._mkldnn:  # type: ignore[attr-defined]
+                    raise ValueError(
+                        "MKLDNN inputs are not support for forward AD gradcheck."
+                    )
+
+                inp = fwAD.make_dual(inp.detach(), torch.zeros_like(inp))
+                # If inp is a differentiable view, the dual might not be the tangent given to
+                # make_dual, so read it explicitly from the dual tensor
+                fw_grads.append(fwAD.unpack_dual(inp)[1])
+                tensor_indices.add(i)
+            dual_inputs.append(inp)
+
+        for i, (fw_grad, u) in enumerate(zip(fw_grads, all_u)):
+            fw_grad.copy_(u.view_as(fw_grad))
+
+        for idx, inp in enumerate(inputs):
+            if idx not in tensor_indices:
+                continue
+            dual_inp_obj = dual_inputs[idx]
+
+            # case 1 (Materialized Zero Tensor Tangent)
+            dual_inputs[idx] = fwAD.make_dual(inp.detach(), torch.zeros_like(inp))
+            raw_outputs = _as_tuple(func(*dual_inputs))
+            dual_outputs1 = filter(_is_float_or_complex_tensor, raw_outputs)
+
+            # case 2 (Efficient Zero Tensor Tangent since we don't make a dual object and pass a regular tensor)
+            dual_inputs[idx] = inp.detach()
+            raw_outputs = _as_tuple(func(*dual_inputs))
+            dual_outputs2 = filter(_is_float_or_complex_tensor, raw_outputs)
+
+            # reset
+            dual_inputs[idx] = dual_inp_obj
+
+            for index_o, (d_o1, d_o2) in enumerate(zip(dual_outputs1, dual_outputs2)):
+                val1, res1 = fwAD.unpack_dual(d_o1)
+                val2, res2 = fwAD.unpack_dual(d_o2)
+
+                if not (res1 is None or res2 is None):
+                    if not torch.allclose(res1, res2):
+                        raise GradcheckError(
+                            f"Mismatch in tangent values for output with index {index_o} "
+                            f"when input {inp} has an undefined tangent value. "
+                            f"Got {res1} but expected {res2}."
+                        )
+    return True
+
+
+def _test_undefined_backward_mode(func, outputs, inputs) -> bool:
+    diff_input_list: List[torch.Tensor] = list(_iter_tensors(inputs, True))
+    if not diff_input_list:
+        raise GradcheckError("no Tensors requiring grad found in input")
+
+    def warn_bc_breaking():
+        warnings.warn(
+            "Backwards compatibility: New undefined gradient support checking "
+            "feature is enabled by default, but it may break existing callers "
+            "of this function. If this is true for you, you can call this "
+            'function with "check_undefined_grad=False" to disable the feature'
+        )
+
+    def check_undefined_grad_support(output_to_check):
+        grads_output = [
+            torch.zeros_like(o, memory_format=torch.legacy_contiguous_format)
+            for o in output_to_check
+        ]
+        try:
+            grads_input = torch.autograd.grad(
+                output_to_check, diff_input_list, grads_output, allow_unused=True
+            )
+        except RuntimeError as e:
+            warn_bc_breaking()
+            raise GradcheckError(
+                "Expected backward function to handle undefined output grads. "
+                'Please look at "Notes about undefined output gradients" in '
+                '"tools/autograd/derivatives.yaml"'
+            ) from e
+
+        for gi, i in zip(grads_input, diff_input_list):
+            if (gi is not None) and (not gi.eq(0).all()):
+                warn_bc_breaking()
+                raise GradcheckError(
+                    "Expected all input grads to be undefined or zero when all output grads are undefined "
+                    'or zero. Please look at "Notes about undefined output gradients" in '
+                    '"tools/autograd/derivatives.yaml"'
+                )
+        return True
+
+    # All backward functions must work properly if all output grads are undefined
+    outputs_to_check = [
+        [
+            torch._C._functions.UndefinedGrad()(o)
+            for o in _differentiable_outputs(func(*inputs))
+            # This check filters out Tensor-likes that aren't instances of Tensor.
+            if isinstance(o, torch.Tensor)
+        ]
+    ]
+
+    # If there are multiple output grads, we should be able to undef one at a time without error
+    if len(outputs_to_check[0]) > 1:
+        for undef_grad_idx in range(len(outputs)):
+            output_to_check = _differentiable_outputs(func(*inputs))
+            outputs_to_check.append(
+                [
+                    torch._C._functions.UndefinedGrad()(o)
+                    if idx == undef_grad_idx
+                    else o
+                    for idx, o in enumerate(output_to_check)
+                ]
+            )
+
+    return all(check_undefined_grad_support(output) for output in outputs_to_check)
+
+
+def _as_tuple(x):
+    if isinstance(x, tuple):
+        return x
+    elif isinstance(x, list):
+        return tuple(x)
+    else:
+        return (x,)
+
+
+def _differentiable_outputs(x):
+    return tuple(o for o in _as_tuple(x) if o.requires_grad)
+
+
+def _get_notallclose_msg(
+    analytical,
+    numerical,
+    output_idx,
+    input_idx,
+    complex_indices,
+    test_imag=False,
+    is_forward_ad=False,
+) -> str:
+    out_is_complex = (
+        (not is_forward_ad) and complex_indices and output_idx in complex_indices
+    )
+    inp_is_complex = is_forward_ad and complex_indices and input_idx in complex_indices
+    part = "imaginary" if test_imag else "real"
+    element = "inputs" if is_forward_ad else "outputs"
+    prefix = (
+        ""
+        if not (out_is_complex or inp_is_complex)
+        else f"While considering the {part} part of complex {element} only, "
+    )
+    mode = "computed with forward mode " if is_forward_ad else ""
+    return (
+        prefix + "Jacobian %smismatch for output %d with respect to input %d,\n"
+        "numerical:%s\nanalytical:%s\n"
+        % (mode, output_idx, input_idx, numerical, analytical)
+    )
+
+
+def _transpose(matrix_of_tensors):
+    # returns list of tuples
+    return list(zip(*matrix_of_tensors))
+
+
+def _real_and_imag_output(fn):
+    # returns new functions real(fn), and imag(fn) where real(fn) and imag(fn) behave the same as
+    # the original fn, except torch.real or torch.imag are applied to the complex outputs
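+    # For example (illustrative): if fn(z) returns a complex tensor c, then
+    # real(fn)(z) returns torch.real(c) and imag(fn)(z) returns torch.imag(c),
+    # while outputs that are already real are passed through unchanged.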
+    def apply_to_c_outs(fn, fn_to_apply):
+        def wrapped_fn(*inputs):
+            outs = _as_tuple(fn(*inputs))
+            return tuple(fn_to_apply(o) if o.is_complex() else o for o in outs)
+
+        return wrapped_fn
+
+    return apply_to_c_outs(fn, torch.real), apply_to_c_outs(fn, torch.imag)
+
+
+def _real_and_imag_input(fn, complex_inp_indices, tupled_inputs):
+    # returns new functions that take real inputs instead of complex inputs as
+    # (x, y) -> fn(x + y * 1j). And it computes: inp -> fn(inp + y * 1j) and inp -> fn(x + inp * 1j).
+    # In each case, the other part is considered constant.
+    # We do not use 0 for the constant here to make sure we always call the user function with a valid input.
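+    # For example (illustrative): given a complex input z0 = x0 + y0*1j,
+    # real_fn(x) evaluates fn(x + y0*1j) and imag_fn(y) evaluates fn(x0 + y*1j),
+    # so each wrapped function receives a purely real tensor at that position.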
+    def apply_to_c_inps(fn, fn_to_apply):
+        def wrapped_fn(*inputs):
+            new_inputs = list(inputs)
+            for should_be_complex in complex_inp_indices:
+                new_inputs[should_be_complex] = fn_to_apply(
+                    new_inputs[should_be_complex], tupled_inputs[should_be_complex]
+                )
+            return _as_tuple(fn(*new_inputs))
+
+        return wrapped_fn
+
+    real_fn = apply_to_c_inps(fn, lambda inp, orig: inp + orig.imag * 1j)
+    imag_fn = apply_to_c_inps(fn, lambda inp, orig: orig.real + inp * 1j)
+    return real_fn, imag_fn
+
+
+def _gradcheck_real_imag(
+    gradcheck_fn,
+    func,
+    func_out,
+    tupled_inputs,
+    outputs,
+    eps,
+    rtol,
+    atol,
+    check_grad_dtypes,
+    check_forward_ad,
+    check_backward_ad,
+    nondet_tol,
+    check_undefined_grad,
+):
+    complex_out_indices = [i for i, o in enumerate(outputs) if o.is_complex()]
+    has_any_complex_output = any(o.is_complex() for o in _as_tuple(func_out))
+    if check_backward_ad:
+        if has_any_complex_output:
+            real_fn, imag_fn = _real_and_imag_output(func)
+
+            imag_func_out = imag_fn(*tupled_inputs)
+            imag_outputs = _differentiable_outputs(imag_func_out)
+            gradcheck_fn(
+                imag_fn,
+                imag_func_out,
+                tupled_inputs,
+                imag_outputs,
+                eps,
+                rtol,
+                atol,
+                check_grad_dtypes,
+                nondet_tol,
+                complex_indices=complex_out_indices,
+                test_imag=True,
+            )
+
+            real_func_out = real_fn(*tupled_inputs)
+            real_outputs = _differentiable_outputs(real_func_out)
+            gradcheck_fn(
+                real_fn,
+                real_func_out,
+                tupled_inputs,
+                real_outputs,
+                eps,
+                rtol,
+                atol,
+                check_grad_dtypes,
+                nondet_tol,
+                complex_indices=complex_out_indices,
+            )
+        else:
+            gradcheck_fn(
+                func,
+                func_out,
+                tupled_inputs,
+                outputs,
+                eps,
+                rtol,
+                atol,
+                check_grad_dtypes,
+                nondet_tol,
+            )
+
+    if check_forward_ad:
+        complex_inp_indices = [
+            i
+            for i, inp in enumerate(tupled_inputs)
+            if is_tensor_like(inp) and inp.is_complex()
+        ]
+        if complex_inp_indices:
+            real_fn, imag_fn = _real_and_imag_input(
+                func, complex_inp_indices, tupled_inputs
+            )
+
+            imag_inputs = [
+                inp.imag if is_tensor_like(inp) and inp.is_complex() else inp
+                for inp in tupled_inputs
+            ]
+            imag_func_out = imag_fn(*imag_inputs)
+            diff_imag_func_out = _differentiable_outputs(imag_func_out)
+            gradcheck_fn(
+                imag_fn,
+                imag_func_out,
+                imag_inputs,
+                diff_imag_func_out,
+                eps,
+                rtol,
+                atol,
+                check_grad_dtypes,
+                nondet_tol,
+                complex_indices=complex_inp_indices,
+                test_imag=True,
+                use_forward_ad=True,
+            )
+
+            real_inputs = [
+                inp.real if is_tensor_like(inp) and inp.is_complex() else inp
+                for inp in tupled_inputs
+            ]
+            real_func_out = real_fn(*real_inputs)
+            diff_real_func_out = _differentiable_outputs(real_func_out)
+            gradcheck_fn(
+                real_fn,
+                real_func_out,
+                real_inputs,
+                diff_real_func_out,
+                eps,
+                rtol,
+                atol,
+                check_grad_dtypes,
+                nondet_tol,
+                complex_indices=complex_inp_indices,
+                use_forward_ad=True,
+            )
+            if check_undefined_grad:
+                _test_undefined_forward_mode(imag_fn, imag_func_out, imag_inputs)
+                _test_undefined_forward_mode(real_fn, real_func_out, real_inputs)
+        else:
+            gradcheck_fn(
+                func,
+                func_out,
+                tupled_inputs,
+                outputs,
+                eps,
+                rtol,
+                atol,
+                check_grad_dtypes,
+                nondet_tol,
+                use_forward_ad=True,
+            )
+            if check_undefined_grad:
+                _test_undefined_forward_mode(func, outputs, tupled_inputs)
+
+
+def _slow_gradcheck(
+    func,
+    func_out,
+    tupled_inputs,
+    outputs,
+    eps,
+    rtol,
+    atol,
+    check_grad_dtypes,
+    nondet_tol,
+    *,
+    use_forward_ad=False,
+    complex_indices=None,
+    test_imag=False,
+    masked=False,
+):
+    func_out = _as_tuple(func_out)
+    if not outputs:
+        return _check_no_differentiable_outputs(
+            func, tupled_inputs, func_out, eps=eps, is_forward_ad=use_forward_ad
+        )
+    tupled_inputs_numerical = tupled_inputs if masked else _densify(tupled_inputs)
+
+    numerical = _transpose(
+        _get_numerical_jacobian(
+            func,
+            tupled_inputs_numerical,
+            func_out,
+            eps=eps,
+            is_forward_ad=use_forward_ad,
+        )
+    )
+    # Note: [numerical vs analytical output length]
+    # The numerical path returns jacobian quantity for all outputs, even if requires_grad of that
+    # output is False. This behavior is necessary for _check_no_differentiable_outputs to work.
+    numerical = [nj for o, nj in zip(func_out, numerical) if o.requires_grad]
+    if use_forward_ad:
+        analytical_forward = _get_analytical_jacobian_forward_ad(
+            func, tupled_inputs, func_out, check_grad_dtypes=check_grad_dtypes
+        )
+
+        for i, n_per_out in enumerate(numerical):
+            for j, n in enumerate(n_per_out):
+                a = analytical_forward[j][i]
+                if not _allclose_with_type_promotion(a, n.to(a.device), rtol, atol):
+                    raise GradcheckError(
+                        _get_notallclose_msg(
+                            a, n, i, j, complex_indices, test_imag, is_forward_ad=True
+                        )
+                    )
+    else:
+        for i, o in enumerate(outputs):
+            analytical = _check_analytical_jacobian_attributes(
+                tupled_inputs, o, nondet_tol, check_grad_dtypes
+            )
+
+            for j, (a, n) in enumerate(zip(analytical, numerical[i])):
+                if not _allclose_with_type_promotion(a, n.to(a.device), rtol, atol):
+                    raise GradcheckError(
+                        _get_notallclose_msg(a, n, i, j, complex_indices, test_imag)
+                    )
+
+    return True
+
+
+def _dot_with_type_promotion(u, v):
+    assert u.dim() == 1 and v.dim() == 1
+    return (u * v).sum()
+
+
+def _allclose_with_type_promotion(a, b, rtol, atol):
+    promoted_type = torch.promote_types(a.dtype, b.dtype)
+    a = a.to(dtype=promoted_type)
+    b = b.to(dtype=promoted_type)
+    return torch.allclose(a, b, rtol, atol)
+
+
+def _to_real_dtype(dtype):
+    if dtype == torch.complex128:
+        return torch.float64
+    elif dtype == torch.complex64:
+        return torch.float32
+    else:
+        return dtype
+
+
+def _vec_from_tensor(x, generator, downcast_complex=False):
+    # Create a random vector with the same number of elements as x and the same
+    # dtype/device. If x is complex and downcast_complex is False, we create a
+    # complex tensor with only real component.
+    if x.layout == torch.sparse_coo:
+        # For sparse, create a random sparse vec with random values in the same
+        # indices. Make sure size is set so that it isn't inferred to be smaller.
+        x_values = x._values()
+        dtype = _to_real_dtype(x.dtype) if downcast_complex else x.dtype
+        values = (
+            torch.rand(x_values.numel(), generator=generator)
+            .to(dtype=dtype, device=x.device)
+            .view(x_values.shape)
+        )
+        values /= values.norm()
+        vec = torch.sparse_coo_tensor(x._indices(), values, x.size(), device=x.device)
+    elif _is_sparse_compressed_tensor(x):
+        if x.layout in {torch.sparse_csr, torch.sparse_bsr}:
+            compressed_indices, plain_indices = x.crow_indices(), x.col_indices()
+        else:
+            compressed_indices, plain_indices = x.ccol_indices(), x.row_indices()
+        x_values = x.values()
+        dtype = _to_real_dtype(x.dtype) if downcast_complex else x.dtype
+        values = (
+            torch.rand(x_values.numel(), generator=generator)
+            .to(dtype=dtype, device=x.device)
+            .view(x_values.shape)
+        )
+        values /= values.norm()
+        vec = torch.sparse_compressed_tensor(
+            compressed_indices,
+            plain_indices,
+            values,
+            x.size(),
+            layout=x.layout,
+            device=x.device,
+        )
+    else:
+        dtype = _to_real_dtype(x.dtype) if downcast_complex else x.dtype
+        vec = torch.rand(x.numel(), generator=generator).to(
+            dtype=dtype, device=x.device
+        )
+        vec /= vec.norm()
+    return vec
+
+
+def _get_inp_tensors(tupled_inputs):
+    inp_idx_tup = [
+        (i, t)
+        for i, t in enumerate(tupled_inputs)
+        if is_tensor_like(t) and t.requires_grad
+    ]
+    return [tup[0] for tup in inp_idx_tup], [tup[1] for tup in inp_idx_tup]
+
+
+def _adjusted_atol(atol, u, v):
+    # In slow gradcheck, we compare A and B element-wise, i.e., for some a, b we
+    # allow: |a - b| < atol + rtol * b. But since we now compare q1 = v^T A u and
+    # q2 = v^T B u, we must allow |q1 - q2| < v^T E u + rtol * v^T B u, where E is
+    # the correctly sized matrix in which each entry is atol.
+    #
+    # We see that atol needs to be scaled by v^T M u (where M is an all-ones
+    # matrix of the appropriate size): v^T M u = \sum_{i} \sum_{j} u_i * v_j = (\sum_{i} u_i)(\sum_{j} v_j)
+    # TODO: properly handle case when u is tuple instead of only taking first element
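+    #
+    # Numeric sanity check (illustrative): with atol = 1e-5, u = [1., 1., 1.]
+    # and v = [0.5, 0.5], the adjusted tolerance is 1e-5 * 3.0 * 1.0 = 3e-5.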
+    u = u[0] if isinstance(u, tuple) else u
+    sum_u = u.sum()
+    sum_v = 1.0 if v is None else v.sum()
+    return atol * float(sum_u) * float(sum_v)
+
+
+FAST_FAIL_SLOW_OK_MSG = """
+Fast gradcheck failed but element-wise differences are small. This means that the
+test might have passed in slow mode!
+
+If you are adding a new operator, please file an issue and then use one of the
+workarounds. The workaround depends on how your test invokes gradcheck/gradgradcheck:
+
+If the test
+- manually invokes gradcheck/gradgradcheck, then call gradcheck/gradgradcheck
+  with `fast_mode=False` as a keyword argument.
+- is OpInfo-based (e.g., in test_ops_gradients.py), then modify the OpInfo for the test
+  to have `gradcheck_fast_mode=False`
+- is a Module test (e.g., in common_nn.py), then modify the corresponding
+  module_test entry to have `gradcheck_fast_mode=False`
+""".strip()
+
+
+def _run_slow_mode_and_get_error(
+    func, tupled_inputs, outputs, input_idx, output_idx, rtol, atol, eps, is_forward_ad
+):
+    # Compute jacobians in slow mode for better error message
+    slow_numerical = _get_numerical_jacobian(
+        func, tupled_inputs, outputs, eps=eps, is_forward_ad=is_forward_ad
+    )[input_idx][output_idx]
+    if is_forward_ad:
+
+        def new_fn(inp):
+            new_inputs = list(tupled_inputs)
+            new_inputs[input_idx] = inp
+            return _as_tuple(func(*new_inputs))[output_idx]
+
+        slow_analytical = _get_analytical_jacobian_forward_ad(
+            new_fn, (tupled_inputs[input_idx],), (outputs[output_idx],)
+        )[0][0]
+    else:
+        slow_analytical = _get_analytical_jacobian(
+            tupled_inputs, outputs, input_idx, output_idx
+        )
+
+    # Assume jacobians are non-empty and have the same shape
+    slow_max_diff = (slow_numerical - slow_analytical).abs().max()
+
+    slow_allclose = torch.allclose(slow_analytical, slow_numerical, rtol, atol)
+    msg = (
+        "\nThe above quantities relating the numerical and analytical jacobians are computed \n"
+        "in fast mode. See: https://github.com/pytorch/pytorch/issues/53876 for more background \n"
+        "about fast mode. Below, we recompute numerical and analytical jacobians in slow mode:\n\n"
+        f"Numerical:\n {slow_numerical}\n"
+        f"Analytical:\n{slow_analytical}\n\n"
+        f"The max per-element difference (slow mode) is: {slow_max_diff}.\n"
+    )
+    if slow_allclose:
+        # Slow gradcheck would've passed!
+        msg += FAST_FAIL_SLOW_OK_MSG
+    return msg
+
+
+def _to_flat_dense_if_sparse(tensor):
+    if _is_sparse_any_tensor(tensor):
+        return tensor.to_dense().reshape(-1)
+    else:
+        return tensor
+
+
+def _make_vectors(inp_tensors, outputs, *, use_forward_ad):
+    # Use our own generator to avoid messing with the user's RNG state
+    g_cpu = torch.Generator()
+
+    def _vec_from_tensor_cpu(*args):
+        # Allocate all tensors on CPU by default so that they are on the same
+        # device as the generator, even if the user has set a different default device
+        with torch.device("cpu"):
+            return _vec_from_tensor(*args)
+
+    all_u = []
+    all_u_dense = []
+    for inp in inp_tensors:
+        ur = _vec_from_tensor_cpu(inp, g_cpu, True)
+        ur_dense = _to_flat_dense_if_sparse(ur)
+        if inp.is_complex():
+            ui = _vec_from_tensor_cpu(inp, g_cpu, True)
+            all_u.append((ur, ui))
+            ui_dense = _to_flat_dense_if_sparse(ui)
+            all_u_dense.append((ur_dense, ui_dense))
+        else:
+            all_u.append(ur)
+            all_u_dense.append(ur_dense)
+    all_v = (
+        None
+        if use_forward_ad
+        else [_vec_from_tensor_cpu(out, g_cpu) for out in outputs]
+    )
+    return all_v, all_u, all_u_dense
+
+
+def _check_analytical_numerical_equal(
+    all_analytical,
+    all_numerical,
+    complex_indices,
+    tupled_inputs,
+    outputs,
+    func,
+    all_v,
+    all_u,
+    rtol,
+    atol,
+    eps,
+    test_imag,
+    *,
+    is_forward_ad=False,
+):
+    for i, all_numerical_for_input_i in enumerate(all_numerical):
+        for j, n in enumerate(all_numerical_for_input_i):
+            # Forward AD generates the transpose of what this function expects
+            if is_forward_ad:
+                a = all_analytical[i][j]
+            else:
+                a = all_analytical[j][i]
+            n = n.to(device=a.device)
+            updated_atol = _adjusted_atol(atol, all_u[i], all_v[j] if all_v else None)
+            if not _allclose_with_type_promotion(a, n.to(a.device), rtol, updated_atol):
+                jacobians_str = _run_slow_mode_and_get_error(
+                    func, tupled_inputs, outputs, i, j, rtol, atol, eps, is_forward_ad
+                )
+                raise GradcheckError(
+                    _get_notallclose_msg(
+                        a, n, j, i, complex_indices, test_imag, is_forward_ad
+                    )
+                    + jacobians_str
+                )
+
+
+def _fast_gradcheck(
+    func,
+    func_out,
+    inputs,
+    outputs,
+    eps,
+    rtol,
+    atol,
+    check_grad_dtypes,
+    nondet_tol,
+    *,
+    use_forward_ad=False,
+    complex_indices=None,
+    test_imag=False,
+    masked=False,
+):
+    # See https://github.com/pytorch/pytorch/issues/53876 for details
+    inp_tensors_idx, inp_tensors = _get_inp_tensors(inputs)
+    # Backward mode computes v^T * J (VJP)
+    # Since we compute J * u (JVP) numerically via finite differences, we check
+    # that v^T (J u) equals (v^T J) u, i.e., the numerical JVP contracted with v
+    # against the analytical VJP contracted with u
+    # ----
+    # Forward mode computes J * u (JVP)
+    # Since we already compute JVP through finite difference method,
+    # we don't need v for correctness check here as asserted below
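+    #
+    # A tiny self-contained sketch of the identity behind this check
+    # (hypothetical example, not executed here):
+    #
+    #     J = torch.randn(4, 3, dtype=torch.double)
+    #     u = torch.randn(3, dtype=torch.double)
+    #     v = torch.randn(4, dtype=torch.double)
+    #     assert torch.allclose(v.dot(J @ u), (J.T @ v).dot(u))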
+    all_v, all_u, all_u_dense = _make_vectors(
+        inp_tensors, outputs, use_forward_ad=use_forward_ad
+    )
+
+    inputs_numerical, all_u_numerical, all_v_numerical = (
+        (inputs, all_u, all_v) if masked else _densify((inputs, all_u, all_v))
+    )
+
+    numerical_vJu = _get_numerical_vJu(
+        func,
+        inputs_numerical,
+        inp_tensors_idx,
+        func_out,
+        all_u_numerical,
+        all_v_numerical,
+        eps,
+        is_forward_ad=use_forward_ad,
+    )
+    # TODO: replicate https://github.com/pytorch/pytorch/pull/77743 for fast gradcheck as well
+    if use_forward_ad:
+        assert all_v is None
+        analytical_vJu = _get_analytical_jacobian_forward_ad(
+            func,
+            inputs,
+            _as_tuple(func_out),
+            all_u=all_u,
+            check_grad_dtypes=check_grad_dtypes,
+        )
+    else:
+        if not outputs:
+            _check_no_differentiable_outputs_fast(
+                func, func_out, inputs, inp_tensors_idx, all_u, eps, nondet_tol
+            )
+
+        analytical_vJu = _get_analytical_vJu_backward_mode(
+            inputs, outputs, nondet_tol, check_grad_dtypes, all_v, all_u_dense
+        )
+
+    _check_analytical_numerical_equal(
+        analytical_vJu,
+        numerical_vJu,
+        complex_indices,
+        inputs,
+        outputs,
+        func,
+        all_v,
+        all_u,
+        rtol,
+        atol,
+        eps,
+        test_imag,
+        is_forward_ad=use_forward_ad,
+    )
+
+    return True
+
+
+# Note [VarArg of Tensors]
+# ~~~~~~~~~~~~~~~~~~~~~~~~
+# 'func' accepts a vararg of tensors, which isn't expressible in the type system at the moment.
+# If https://mypy.readthedocs.io/en/latest/additional_features.html?highlight=callable#extended-callable-types is accepted,
+# the '...' first argument of Callable can be replaced with VarArg(Tensor).
+# For now, we permit any input.
+def gradcheck(
+    func: Callable[..., Union[_TensorOrTensors]],  # See Note [VarArg of Tensors]
+    inputs: _TensorOrTensors,
+    *,
+    eps: float = 1e-6,
+    atol: float = 1e-5,
+    rtol: float = 1e-3,
+    raise_exception: bool = True,
+    nondet_tol: float = 0.0,
+    check_undefined_grad: bool = True,
+    check_grad_dtypes: bool = False,
+    check_batched_grad: bool = False,
+    check_batched_forward_grad: bool = False,
+    check_forward_ad: bool = False,
+    check_backward_ad: bool = True,
+    fast_mode: bool = False,
+    masked: Optional[bool] = None,
+) -> bool:  # noqa: D400,D205
+    r"""Check gradients computed via small finite differences against analytical
+    gradients wrt tensors in :attr:`inputs` that are of floating point or complex type
+    and with ``requires_grad=True``.
+
+    The check between numerical and analytical gradients uses :func:`~torch.allclose`.
+
+    For most of the complex functions we consider for optimization purposes, no notion of
+    Jacobian exists. Instead, gradcheck verifies if the numerical and analytical values of
+    the Wirtinger and Conjugate Wirtinger derivatives are consistent. Because the gradient
+    computation is done under the assumption that the overall function has a real-valued
+    output, we treat functions with complex output in a special way. For these functions,
+    gradcheck is applied to two real-valued functions corresponding to taking the real
+    components of the complex outputs for the first, and taking the imaginary components
+    of the complex outputs for the second. For more details, check out
+    :ref:`complex_autograd-doc`.
+
+    .. note::
+        The default values are designed for :attr:`input` of double precision.
+        This check will likely fail if :attr:`input` is of less precision, e.g.,
+        ``FloatTensor``.
+
+    .. note::
+        Gradcheck may fail when evaluated on non-differentiable points
+        because the numerically computed gradients via finite differencing may differ
+        those computed analytically (not necessarily because either is incorrect).
+        For more context, see :ref:`non-differentiable-func-grad`.
+
+    .. warning::
+       If any checked tensor in :attr:`input` has overlapping memory, i.e.,
+       different indices pointing to the same memory address (e.g., from
+       :func:`torch.expand`), this check will likely fail because the numerical
+       gradients computed by point perturbation at such indices will change
+       values at all other indices that share the same memory address.
+
+    Args:
+        func (function): a Python function that takes Tensor inputs and returns
+            a Tensor or a tuple of Tensors
+        inputs (tuple of Tensor or Tensor): inputs to the function
+        eps (float, optional): perturbation for finite differences
+        atol (float, optional): absolute tolerance
+        rtol (float, optional): relative tolerance
+        raise_exception (bool, optional): indicating whether to raise an exception if
+            the check fails. The exception gives more information about the
+            exact nature of the failure. This is helpful when debugging gradchecks.
+        nondet_tol (float, optional): tolerance for non-determinism. When running
+            identical inputs through the differentiation, the results must either match
+            exactly (default, 0.0) or be within this tolerance.
+        check_undefined_grad (bool, optional): if ``True``, check if undefined output grads
+            are supported and treated as zeros, for ``Tensor`` outputs.
+        check_batched_grad (bool, optional): if ``True``, check if we can compute
+            batched gradients using prototype vmap support. Defaults to False.
+        check_batched_forward_grad (bool, optional): if ``True``, checks if we can compute
+            batched forward gradients using forward ad and prototype vmap support. Defaults to ``False``.
+        check_forward_ad (bool, optional): if ``True``, check that the gradients computed with forward
+            mode AD match the numerical ones. Defaults to ``False``.
+        check_backward_ad (bool, optional): if ``False``, do not perform any checks that rely on
+            backward mode AD to be implemented. Defaults to ``True``.
+        fast_mode (bool, optional): Fast mode for gradcheck and gradgradcheck is currently only
+            implemented for R to R functions. If none of the inputs and outputs are complex,
+            a faster implementation of gradcheck that no longer computes the entire jacobian
+            is run; otherwise, we fall back to the slow implementation.
+        masked (bool, optional): if ``True``, the gradients of unspecified elements of
+            sparse tensors are ignored. Defaults to ``False``.
+    Returns:
+        ``True`` if all differences satisfy allclose condition
+
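+    Example::
+
+        >>> # A minimal usage sketch (a double-precision input is assumed so
+        >>> # that the default tolerances apply):
+        >>> x = torch.randn(3, dtype=torch.double, requires_grad=True)
+        >>> torch.autograd.gradcheck(torch.sigmoid, (x,), eps=1e-6, atol=1e-4)
+        True
+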
+    """
+    assert (
+        check_forward_ad or check_backward_ad
+    ), "Expected at least one of check_forward_ad or check_backward_ad to be True"
+    assert not (
+        check_batched_grad and not check_backward_ad
+    ), "Setting check_batched_grad=True requires check_backward_ad to be True"
+    assert not (
+        check_batched_forward_grad and not check_forward_ad
+    ), "Setting check_batched_forward_grad=True requires check_forward_ad to be True"
+    args = locals().copy()
+    args.pop("raise_exception")
+    if not raise_exception:
+        try:
+            return _gradcheck_helper(**args)
+        except GradcheckError:
+            return False
+    else:
+        return _gradcheck_helper(**args)
+
+
+def _gradcheck_helper(
+    func,
+    inputs,
+    eps,
+    atol,
+    rtol,
+    nondet_tol,
+    check_undefined_grad,
+    check_grad_dtypes,
+    check_batched_grad,
+    check_batched_forward_grad,
+    check_forward_ad,
+    check_backward_ad,
+    fast_mode,
+    masked,
+):
+    tupled_inputs = _as_tuple(inputs)
+    _check_inputs(tupled_inputs)
+
+    func_out = func(*tupled_inputs)
+    outputs = _differentiable_outputs(func_out)
+    _check_outputs(outputs)
+
+    gradcheck_fn = functools.partial(
+        _fast_gradcheck if fast_mode else _slow_gradcheck, masked=masked
+    )
+    _gradcheck_real_imag(
+        gradcheck_fn,
+        func,
+        func_out,
+        tupled_inputs,
+        outputs,
+        eps,
+        rtol,
+        atol,
+        check_grad_dtypes,
+        check_forward_ad=check_forward_ad,
+        check_backward_ad=check_backward_ad,
+        nondet_tol=nondet_tol,
+        check_undefined_grad=check_undefined_grad,
+    )
+
+    if check_batched_forward_grad:
+        _test_batched_grad_forward_ad(func, tupled_inputs)
+
+    # Short circuit because remaining tests rely on backward AD to be implemented
+    if not check_backward_ad:
+        return True
+
+    for i, o in enumerate(outputs):
+        if check_batched_grad:
+            _test_batched_grad(tupled_inputs, o, i)
+
+    _test_backward_mul_by_grad_output(outputs, tupled_inputs, masked)
+
+    if check_undefined_grad and check_backward_ad:
+        _test_undefined_backward_mode(func, outputs, tupled_inputs)
+    return True
+
+
+def gradgradcheck(
+    func: Callable[..., _TensorOrTensors],  # See Note [VarArg of Tensors]
+    inputs: _TensorOrTensors,
+    grad_outputs: Optional[_TensorOrTensors] = None,
+    *,
+    eps: float = 1e-6,
+    atol: float = 1e-5,
+    rtol: float = 1e-3,
+    gen_non_contig_grad_outputs: bool = False,
+    raise_exception: bool = True,
+    nondet_tol: float = 0.0,
+    check_undefined_grad: bool = True,
+    check_grad_dtypes: bool = False,
+    check_batched_grad: bool = False,
+    check_fwd_over_rev: bool = False,
+    check_rev_over_rev: bool = True,
+    fast_mode: bool = False,
+    masked: bool = False,
+) -> bool:  # noqa: D400,D205
+    r"""Check gradients of gradients computed via small finite differences
+    against analytical gradients wrt tensors in :attr:`inputs` and
+    :attr:`grad_outputs` that are of floating point or complex type and with
+    ``requires_grad=True``.
+
+    This function checks that backpropagating through the gradients computed
+    with the given :attr:`grad_outputs` is correct.
+
+    The check between numerical and analytical gradients uses :func:`~torch.allclose`.
+
+    .. note::
+        The default values are designed for :attr:`input` and
+        :attr:`grad_outputs` of double precision. This check will likely fail if
+        they are of less precision, e.g., ``FloatTensor``.
+
+    .. warning::
+       If any checked tensor in :attr:`input` and :attr:`grad_outputs` has
+       overlapping memory, i.e., different indices pointing to the same memory
+       address (e.g., from :func:`torch.expand`), this check will likely fail
+       because the numerical gradients computed by point perturbation at such
+       indices will change values at all other indices that share the same
+       memory address.
+
+    Args:
+        func (function): a Python function that takes Tensor inputs and returns
+            a Tensor or a tuple of Tensors
+        inputs (tuple of Tensor or Tensor): inputs to the function
+        grad_outputs (tuple of Tensor or Tensor, optional): The gradients with
+            respect to the function's outputs.
+        eps (float, optional): perturbation for finite differences
+        atol (float, optional): absolute tolerance
+        rtol (float, optional): relative tolerance
+        gen_non_contig_grad_outputs (bool, optional): if :attr:`grad_outputs` is
+            ``None`` and :attr:`gen_non_contig_grad_outputs` is ``True``, the
+            randomly generated gradient outputs are made to be noncontiguous
+        raise_exception (bool, optional): indicating whether to raise an exception if
+            the check fails. The exception gives more information about the
+            exact nature of the failure. This is helpful when debugging gradchecks.
+        nondet_tol (float, optional): tolerance for non-determinism. When running
+            identical inputs through the differentiation, the results must either match
+            exactly (default, 0.0) or be within this tolerance. Note that a small amount
+            of nondeterminism in the gradient will lead to larger inaccuracies in
+            the second derivative.
+        check_undefined_grad (bool, optional): if True, check if undefined output grads
+            are supported and treated as zeros
+        check_batched_grad (bool, optional): if True, check if we can compute
+            batched gradients using prototype vmap support. Defaults to False.
+        fast_mode (bool, optional): if True, run a faster implementation of gradgradcheck that
+            no longer computes the entire jacobian.
+        masked (bool, optional): if True, the gradients of unspecified elements of
+            sparse tensors are ignored (default, False).
+    Returns:
+        True if all differences satisfy allclose condition
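+
+    Example::
+
+        >>> # A minimal usage sketch (a double-precision input is assumed):
+        >>> x = torch.randn(3, dtype=torch.double, requires_grad=True)
+        >>> torch.autograd.gradgradcheck(lambda t: (t * t).sum(), (x,))
+        True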
+    """
+    assert (
+        check_fwd_over_rev or check_rev_over_rev
+    ), "Expected at least one of check_fwd_over_rev or check_rev_over_rev to be True"
+    assert not (
+        check_undefined_grad and not check_rev_over_rev
+    ), "Setting check_undefined_grad=True requires check_rev_over_rev to be True"
+    assert not (
+        check_batched_grad and not check_rev_over_rev
+    ), "Setting check_batched_grad=True requires check_rev_over_rev to be True"
+    # TODO: do we want to test this too?
+    # assert not (check_batched_forward_grad and not check_fwd_over_rev), (
+    #     "Setting check_batched_forward_grad=True requires check_fwd_over_rev to be True")
+    tupled_inputs = _as_tuple(inputs)
+
+    if grad_outputs is None:
+        # If grad_outputs is not specified, create random Tensors of the same shape, type, and device as the outputs
+
+        outputs = _differentiable_outputs(func(*tupled_inputs))
+        tupled_grad_outputs = tuple(
+            torch.testing.make_tensor(
+                x.shape,
+                dtype=x.dtype
+                if x.is_floating_point() or x.is_complex()
+                else torch.double,
+                device=x.device,
+                low=-1,
+                high=1,
+                requires_grad=True,
+                noncontiguous=gen_non_contig_grad_outputs,
+            )
+            for x in outputs
+        )
+    else:
+        tupled_grad_outputs = _as_tuple(grad_outputs)
+
+    num_outputs = len(tupled_grad_outputs)
+
+    # NB: We need to save the requires_grad information about the inputs here because gradcheck detaches inputs
+    #     before running forward mode AD
+    diff_input_args_indices = {
+        i for i, x in enumerate(tupled_inputs) if is_tensor_like(x) and x.requires_grad
+    }
+    diff_grad_output_indices = {
+        i for i, x in enumerate(tupled_grad_outputs) if x.requires_grad
+    }
+
+    def new_func(*args):
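+        # ``new_func`` maps (inputs..., grad_outputs...) to the first-order
+        # gradients, so running gradcheck on it checks the second-order gradients.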
+        # Restore the requires_grad information
+        input_args = tuple(
+            x.requires_grad_() if i in diff_input_args_indices else x
+            for i, x in enumerate(args[:-num_outputs])
+        )
+        outputs = _differentiable_outputs(func(*input_args))
+        grad_outputs = tuple(
+            x.requires_grad_() if i in diff_grad_output_indices else x
+            for i, x in enumerate(args[-num_outputs:])
+        )
+        diff_input_args = tuple(
+            x for i, x in enumerate(input_args) if i in diff_input_args_indices
+        )
+        grad_inputs = torch.autograd.grad(
+            outputs, diff_input_args, grad_outputs, create_graph=True, allow_unused=True
+        )
+        grad_inputs = tuple(g for g in grad_inputs if g is not None)
+        return grad_inputs
+
+    return gradcheck(
+        new_func,
+        tupled_inputs + tupled_grad_outputs,
+        eps=eps,
+        atol=atol,
+        rtol=rtol,
+        raise_exception=raise_exception,
+        nondet_tol=nondet_tol,
+        check_undefined_grad=check_undefined_grad,
+        check_grad_dtypes=check_grad_dtypes,
+        check_batched_grad=check_batched_grad,
+        fast_mode=fast_mode,
+        check_forward_ad=check_fwd_over_rev,
+        check_backward_ad=check_rev_over_rev,
+        masked=masked,
+    )
diff --git a/MLPY/Lib/site-packages/torch/autograd/graph.py b/MLPY/Lib/site-packages/torch/autograd/graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6db9f086ee0851a57563bdf265bf7e2f636ef34
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/autograd/graph.py
@@ -0,0 +1,749 @@
+import abc
+import collections
+import contextlib
+import functools
+import logging
+import threading
+import weakref
+from collections import defaultdict, namedtuple
+from typing import (
+    Any,
+    Callable,
+    cast,
+    Deque,
+    Dict,
+    List,
+    Optional,
+    Sequence,
+    Set,
+    Tuple,
+    Union,
+)
+
+import torch
+from torch.autograd.variable import Variable
+from torch.utils._python_dispatch import TorchDispatchMode
+from torch.utils.hooks import RemovableHandle
+
+log = logging.getLogger(__name__)
+
+
+__all__ = [
+    "saved_tensors_hooks",
+    "save_on_cpu",
+    "disable_saved_tensors_hooks",
+    "register_multi_grad_hook",
+    "allow_mutation_on_saved_tensors",
+    "Node",
+    "GradientEdge",
+    "get_gradient_edge",
+    "increment_version",
+]
+
+
+class Node(abc.ABC):
+    @abc.abstractmethod
+    def name(self) -> str:
+        r"""Return the name.
+
+        Example::
+
+            >>> import torch
+            >>> a = torch.tensor([0., 0., 0.], requires_grad=True)
+            >>> b = a.clone()
+            >>> assert isinstance(b.grad_fn, torch.autograd.graph.Node)
+            >>> print(b.grad_fn.name())
+            CloneBackward0
+        """
+        ...
+
+    @property
+    @abc.abstractmethod
+    def next_functions(self) -> Tuple[Tuple[Optional["Node"], int], ...]:
+        ...
+
+    @abc.abstractmethod
+    def metadata(self) -> dict:
+        r"""Return the metadata."""
+        ...
+
+    @abc.abstractmethod
+    def _register_hook_dict(self, tensor: torch.Tensor) -> None:
+        ...
+
+    @abc.abstractmethod
+    def register_hook(self, fn: Callable[..., Any]) -> RemovableHandle:
+        r"""Register a backward hook.
+
+        The hook will be called every time a gradient with respect to the
+        Node is computed. The hook should have the following signature::
+
+            hook(grad_inputs: Tuple[Tensor], grad_outputs: Tuple[Tensor]) -> Tuple[Tensor] or None
+
+
+        The hook should not modify its argument, but it can optionally return
+        a new gradient which will be used in place of :attr:`grad_inputs`.
+
+        This function returns a handle with a method ``handle.remove()``
+        that removes the hook from the Node.
+
+        .. note::
+            See :ref:`backward-hooks-execution` for more information on how and when this
+            hook is executed, and how its execution is ordered relative to other hooks.
+
+        Example::
+
+            >>> import torch
+            >>> a = torch.tensor([0., 0., 0.], requires_grad=True)
+            >>> b = a.clone()
+            >>> assert isinstance(b.grad_fn, torch.autograd.graph.Node)
+            >>> handle = b.grad_fn.register_hook(lambda gI, gO: (gO[0] * 2,))
+            >>> b.sum().backward(retain_graph=True)
+            >>> print(a.grad)
+            tensor([2., 2., 2.])
+            >>> handle.remove() # Removes the hook
+            >>> a.grad = None
+            >>> b.sum().backward(retain_graph=True)
+            >>> print(a.grad)
+            tensor([1., 1., 1.])
+        """
+        ...
+
+    @abc.abstractmethod
+    def register_prehook(self, fn: Callable[..., Any]) -> RemovableHandle:
+        r"""Register a backward pre-hook.
+
+        The hook will be called every time a gradient with respect to the
+        Node is computed. The hook should have the following signature::
+
+            hook(grad_outputs: Tuple[Tensor]) -> Tuple[Tensor] or None
+
+        The hook should not modify its argument, but it can optionally return
+        a new gradient which will be used in place of :attr:`grad_outputs`.
+
+        This function returns a handle with a method ``handle.remove()``
+        that removes the hook from the Node.
+
+        .. note::
+            See :ref:`backward-hooks-execution` for more information on how and when this
+            hook is executed, and how its execution is ordered relative to other hooks.
+
+        Example::
+
+            >>> a = torch.tensor([0., 0., 0.], requires_grad=True)
+            >>> b = a.clone()
+            >>> assert isinstance(b.grad_fn, torch.autograd.graph.Node)
+            >>> handle = b.grad_fn.register_prehook(lambda gI: (gI[0] * 2,))
+            >>> b.sum().backward(retain_graph=True)
+            >>> print(a.grad)
+            tensor([2., 2., 2.])
+            >>> handle.remove()
+            >>> a.grad = None
+            >>> b.sum().backward(retain_graph=True)
+            >>> print(a.grad)
+            tensor([1., 1., 1.])
+        """
+        ...
+
+    @classmethod
+    def __subclasshook__(cls, C):
+        if cls is Node:
+            if (
+                C is not None and C is getattr(torch._C._functions, C.__name__, None)
+            ) or issubclass(C, torch.autograd.function.BackwardCFunction):
+                return True
+        return NotImplemented
+
+
+def _get_grad_fn_or_grad_acc(t):
+    if t.requires_grad and t.grad_fn is None:
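+        # Leaf tensors have no grad_fn; take a no-op view so that the view's
+        # grad_fn leads back to the leaf's AccumulateGrad node.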
+        return t.view_as(t).grad_fn.next_functions[0][0]
+    else:
+        return t.grad_fn
+
+
+GradientEdge = namedtuple("GradientEdge", ("node output_nr"))
+GradientEdge.__doc__ = """\
+Object representing a given gradient edge within the autograd graph.
+To get the gradient edge where a given Tensor gradient will be computed,
+you can do ``edge = autograd.graph.get_gradient_edge(tensor)``.
+"""
+
+
+def get_gradient_edge(tensor):
+    """Get the gradient edge for computing the gradient of the given Tensor.
+
+    In particular, calling ``g = autograd.grad(loss, input)`` is equivalent to calling
+    ``g = autograd.grad(loss, get_gradient_edge(input))``.
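+
+    A minimal sketch of that equivalence (``x`` and ``loss`` are illustrative
+    names, not part of this module)::
+
+        >>> import torch
+        >>> x = torch.randn(3, requires_grad=True)
+        >>> loss = (x * 2).sum()
+        >>> edge = torch.autograd.graph.get_gradient_edge(x)
+        >>> (g1,) = torch.autograd.grad(loss, x, retain_graph=True)
+        >>> (g2,) = torch.autograd.grad(loss, (edge,))  # same values as ``g1``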
+    """
+    if not tensor.requires_grad:
+        raise RuntimeError(
+            "It is not possible to get the gradient edge for a Tensor that does not require gradients"
+        )
+    grad_fn = _get_grad_fn_or_grad_acc(tensor)
+
+    # Note that output_nr defaults to 0, which is the right value
+    # for the AccumulateGrad node.
+    return GradientEdge(grad_fn, tensor.output_nr)
+
+
+def increment_version(tensor):
+    """Update autograd metadata tracking whether the given Tensor was modified in place.
+
+    This is to enable more accurate error checking within the autograd engine.
+    It is already done automatically by PyTorch functions and within custom Function
+    when mark_dirty() is called appropriately, so you only need to call this explicitly
+    if you are doing an in-place operation on the Tensor data in a way that PyTorch doesn't
+    know about; for example, a custom kernel that reads the Tensor's data_ptr and modifies
+    the memory in place based on this pointer.
+
+    Note that incrementing the version counter multiple times for a single in-place operation
+    is not problematic.
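+
+    A minimal sketch (``t`` is an illustrative name; the NumPy write below stands
+    in for an external kernel that autograd cannot see)::
+
+        >>> import torch
+        >>> t = torch.zeros(3, requires_grad=True).clone()
+        >>> t.detach().numpy()[0] = 1.0  # in-place write outside of autograd's view
+        >>> torch.autograd.graph.increment_version(t)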
+    """
+    torch._C._increment_version(tensor)
+
+
+class saved_tensors_hooks:
+    """Context-manager that sets a pair of pack / unpack hooks for saved tensors.
+
+    Use this context-manager to define how intermediary results of an operation
+    should be packed before saving, and unpacked on retrieval.
+
+    In that context, the ``pack_hook`` function will be called every time an
+    operation saves a tensor for backward (this includes intermediary results
+    saved using
+    :func:`~torch.autograd.function._ContextMethodMixin.save_for_backward` but
+    also those recorded by a PyTorch-defined operation). The output of
+    ``pack_hook`` is then stored in the computation graph instead of the
+    original tensor.
+
+    The ``unpack_hook`` is called when the saved tensor needs to be accessed,
+    namely when executing :func:`torch.Tensor.backward()` or
+    :func:`torch.autograd.grad()`. It takes as argument the *packed* object
+    returned by ``pack_hook`` and should return a tensor which has the same
+    content as the original tensor (passed as input to the corresponding
+    ``pack_hook``).
+
+    The hooks should have the following signatures:
+
+        pack_hook(tensor: Tensor) -> Any
+
+        unpack_hook(Any) -> Tensor
+
+    where the return value of ``pack_hook`` is a valid input to ``unpack_hook``.
+
+    In general, you want ``unpack_hook(pack_hook(t))`` to be equal to ``t`` in terms
+    of value, size, dtype and device.
+
+    Example::
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
+        >>> def pack_hook(x):
+        ...     print("Packing", x)
+        ...     return x
+        >>>
+        >>> def unpack_hook(x):
+        ...     print("Unpacking", x)
+        ...     return x
+        >>>
+        >>> a = torch.ones(5, requires_grad=True)
+        >>> b = torch.ones(5, requires_grad=True) * 2
+        >>> with torch.autograd.graph.saved_tensors_hooks(pack_hook, unpack_hook):
+        ...     y = a * b
+        Packing tensor([1., 1., 1., 1., 1.], requires_grad=True)
+        Packing tensor([2., 2., 2., 2., 2.], grad_fn=<MulBackward0>)
+        >>> y.sum().backward()
+        Unpacking tensor([1., 1., 1., 1., 1.], requires_grad=True)
+        Unpacking tensor([2., 2., 2., 2., 2.], grad_fn=<MulBackward0>)
+
+    .. warning::
+        Performing an in-place operation on the input to either hook may lead
+        to undefined behavior.
+
+    .. warning::
+        Only one pair of hooks is allowed at a time. When recursively nesting this
+        context-manager, only the inner-most pair of hooks will be applied.
+    """
+
+    def __init__(
+        self,
+        pack_hook: Callable[[torch.Tensor], Any],
+        unpack_hook: Callable[[Any], torch.Tensor],
+    ):
+        self.pack_hook = pack_hook
+        self.unpack_hook = unpack_hook
+
+    def __enter__(self):
+        torch._C._autograd._push_saved_tensors_default_hooks(
+            self.pack_hook, self.unpack_hook
+        )
+
+    def __exit__(self, *args: object):
+        torch._C._autograd._pop_saved_tensors_default_hooks()
+
+
+class save_on_cpu(saved_tensors_hooks):
+    """Context manager under which tensors saved by the forward pass will be stored on cpu, then retrieved for backward.
+
+    When performing operations within this context manager, intermediary
+    results saved in the graph during the forward pass will be moved to CPU,
+    then copied back to the original device when needed for the backward pass.
+    If the graph was already on CPU, no tensor copy is performed.
+
+    Use this context-manager to trade compute for GPU memory usage (e.g.
+    when your model doesn't fit in GPU memory during training).
+
+    Args:
+        pin_memory (bool): If ``True`` tensors will be saved to CPU pinned memory
+                           during packing and copied to GPU asynchronously during unpacking.
+                           Defaults to ``False``.
+                           Also see :ref:`cuda-memory-pinning`.
+
+
+    Example::
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
+        >>> a = torch.randn(5, requires_grad=True, device="cuda")
+        >>> b = torch.randn(5, requires_grad=True, device="cuda")
+        >>> c = torch.randn(5, requires_grad=True, device="cuda")
+        >>>
+        >>> def f(a, b, c):
+        ...     prod_1 = a * b           # a and b are saved on GPU
+        ...     with torch.autograd.graph.save_on_cpu():
+        ...         prod_2 = prod_1 * c  # prod_1 and c are saved on CPU
+        ...     y = prod_2 * a           # prod_2 and a are saved on GPU
+        ...     return y
+        >>>
+        >>> y = f(a, b, c)
+        >>> del a, b, c  # for illustration only
+        >>> # the content of a, b, and prod_2 are still alive on GPU
+        >>> # the content of prod_1 and c only live on CPU
+        >>> y.sum().backward()  # all CPU tensors are moved back to GPU, for backward
+        >>> # all intermediary tensors are released (deleted) after the call to backward
+
+    """
+
+    def __init__(self, pin_memory=False, device_type="cuda"):
+        device_module = getattr(torch, device_type, torch.cuda)
+
+        def pack_to_cpu(tensor):
+            if not pin_memory:
+                return (tensor.device, tensor.cpu())
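+            # Pinned-memory path: stage the tensor in a page-locked CPU buffer so
+            # the copy back to the device in ``unpack_from_cpu`` can be asynchronous.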
+            packed = torch.empty(
+                tensor.size(),
+                dtype=tensor.dtype,
+                layout=tensor.layout,
+                pin_memory=(device_module.is_available() and not tensor.is_sparse),
+            )
+            packed.copy_(tensor)
+            return (tensor.device, packed)
+
+        def unpack_from_cpu(packed):
+            device, tensor = packed
+            return tensor.to(device, non_blocking=pin_memory)
+
+        super().__init__(pack_to_cpu, unpack_from_cpu)
+
+
+@contextlib.contextmanager
+def disable_saved_tensors_hooks(error_message):
+    """Context-manager that disables the saved tensors default hooks feature.
+
+    Useful if you are creating a feature that does not work with saved
+    tensors default hooks.
+
+    Args:
+        error_message (str): When saved tensors default hooks are used while they
+                             are disabled, a RuntimeError with this
+                             error message is raised.
+
+    Example::
+
+        >>> # xdoctest: +SKIP(failing)
+        >>> message = "saved tensors default hooks are disabled"
+        >>> with torch.autograd.graph.disable_saved_tensors_hooks(message):
+        ...     # Raises RuntimeError: saved tensors default hooks are disabled
+        ...     with torch.autograd.graph.save_on_cpu():
+        ...         pass
+
+    """
+    try:
+        maybe_prev_message = (
+            torch._C._autograd._saved_tensors_hooks_get_disabled_error_message()
+        )
+        torch._C._autograd._saved_tensors_hooks_disable(error_message)
+        yield
+    finally:
+        # See NOTE: [disabled_error_message invariant]
+        if maybe_prev_message is None:
+            torch._C._autograd._saved_tensors_hooks_enable()
+        else:
+            torch._C._autograd._saved_tensors_hooks_disable(maybe_prev_message)
+
+
+def register_multi_grad_hook(
+    tensors: Sequence[torch.Tensor],
+    fn: Union[
+        Callable[[Sequence[Optional[torch.Tensor]]], None],
+        Callable[[torch.Tensor], None],
+    ],
+    *,
+    mode: str = "all",
+):
+    r"""Register a multi-grad backward hook.
+
+    There are two supported modes: ``"all"`` and ``"any"``.
+
+    Under the ``"all"`` mode, the hook will be called after gradients with respect to every tensor in
+    :attr:`tensors` have been computed. If a tensor is in :attr:`tensors` but
+    is not part of the graph, or if a tensor is not needed to compute the gradients
+    for any ``inputs`` specified for the current ``.backward()`` or ``.grad()`` call,
+    this tensor will be ignored and the hook will not wait for its gradient to be
+    computed.
+
+    After every non-ignored tensor's gradient has been computed, :attr:`fn` will be
+    called with those gradients. ``None`` will be passed for tensors that did not
+    have their gradients computed.
+
+    Under the ``"any"`` mode, the hook will be called after the first gradient
+    with respect to a tensor in :attr:`tensors` has been computed. The hook
+    will be called with that gradient as its argument.
+
+    The hook should not modify its arguments.
+
+    This function returns a handle with a method ``handle.remove()`` that removes the hook.
+
+    .. note::
+        See :ref:`backward-hooks-execution` for more information on how and when this
+        hook is executed, and how its execution is ordered relative to other hooks.
+
+    Example::
+
+        >>> import torch
+        >>>
+        >>> a = torch.rand(2, 3, requires_grad=True)
+        >>> b = torch.rand(2, 3, requires_grad=True)
+        >>> c = a * b
+        >>> d = a * b
+        >>>
+        >>> def fn(grads):
+        ...     print([g is not None for g in grads])
+        ...
+        >>> torch.autograd.graph.register_multi_grad_hook((a, b, c, d), fn)
+        >>>
+        >>> c.sum().backward(retain_graph=True)
+        [True, True, True, False]
+        >>> c.sum().backward(inputs=(a,), retain_graph=True)
+        [True, False, True, False]
+        >>>
+    """
+    supported_modes = ("all", "any")
+    if mode not in supported_modes:
+        raise ValueError(f"Expects mode to be one of {supported_modes} but got {mode}")
+
+    class Handle(RemovableHandle):
+        handles: Tuple[RemovableHandle, ...]
+
+        def __init__(self, handles: Tuple[RemovableHandle, ...]):
+            self.handles = handles
+
+        def remove(self):
+            for handle in self.handles:
+                handle.remove()
+
+        def __getstate__(self):
+            return self.handles
+
+        def __setstate__(self, state):
+            self.handles = state
+
+    if mode == "all":
+        count: Dict[int, int] = dict()
+        nb_calls = None
+        buffer: Dict[int, List[Optional[torch.Tensor]]] = dict()
+
+        grad_fns = list(map(_get_grad_fn_or_grad_acc, tensors))
+        len_tensors = len(tensors)
+
+        def get_inner_hook(idx):
+            def inner_hook(grad: torch.Tensor):
+                nonlocal count, nb_calls, buffer, fn
+                id = torch._C._current_graph_task_id()
+                assert (
+                    id != -1
+                ), "expected this hook to be called inside a backward call"
+                count[id] = count.get(id, 0)
+                buffer[id] = buffer.get(id, [None] * len_tensors)
+
+                if count[id] == 0:
+                    # On the first call, compute the actual nb_calls and buffer
+                    nb_calls = sum(torch._C._will_engine_execute_node(g) for g in grad_fns)  # type: ignore[attr-defined]
+
+                buffer[id][idx] = grad
+                count[id] += 1
+
+                if count[id] == nb_calls:
+                    fn = cast(Callable[[Sequence[Optional[torch.Tensor]]], None], fn)
+                    fn(buffer[id])
+                    del count[id]
+                    del buffer[id]
+
+            return inner_hook
+
+        handles: Tuple[RemovableHandle, ...] = tuple(
+            t.register_hook(get_inner_hook(i)) for i, t in enumerate(tensors)
+        )
+    elif mode == "any":
+        fn = cast(Callable[[torch.Tensor], None], fn)
+        lock = threading.Lock()
+        ran_hook: Dict[int, bool] = defaultdict(bool)
+
+        @functools.wraps(fn)
+        def wrapped_fn(grad: torch.Tensor):
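+            # Run ``fn`` at most once per backward graph task: the first hook to
+            # fire for a given task id runs it, later hooks return early.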
+            nonlocal ran_hook
+            id = torch._C._current_graph_task_id()
+            assert id != -1, "expected this hook to be called inside a backward call"
+            with lock:
+                prev, ran_hook[id] = ran_hook[id], True
+            if prev:
+                return
+            fn(grad)
+
+        handles = tuple(
+            tensor.register_hook(wrapped_fn)
+            for tensor in tensors
+            if tensor.requires_grad
+        )
+
+    return Handle(handles)  # type: ignore[possibly-undefined]
+
+
+# NOTE [Allow mutation on tensors saved for backward]
+#
+# 1. Tensor gets saved for backward
+#    - remember the python object id and the version of the tensor
+#    - remember aliasing information (data_ptr of base + version)
+#    - save the original so we control its lifetime
+# 2. Any time a tensor gets in-placed
+#    - for each tensor aliased to it:
+#      - check using its object id and version to see if it has been saved
+#      - if it has been saved, clone it
+#      - delete the reference to the original
+# 3. during backward
+#    - if the clone exists, the tensor must've been modified in-place
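+#
+# A minimal sketch of the scenario this handles (names are illustrative only):
+#
+# >>> with torch.autograd.graph.allow_mutation_on_saved_tensors():
+# ...     a = torch.rand(2, requires_grad=True).clone()
+# ...     out = (a ** 2).sum()  # step 1: ``a`` is saved for backward
+# ...     a.mul_(2)             # step 2: ``a`` is cloned before the in-place op
+# ...     out.backward()        # step 3: backward uses the saved clone of ``a``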
+_allow_mutation_on_saved_tensors_enabled = False
+
+
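+# A "tid" identifies a particular tensor object at a particular version, while a
+# "sid" identifies the underlying storage (data_ptr) at a particular version and
+# is therefore shared by all aliases of that storage.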
+def _get_tid(t) -> Tuple[int, int, int]:
+    return (id(t), t.data_ptr(), t._version)
+
+
+def _get_sid(t) -> Tuple[int, int]:
+    return (t.data_ptr(), t._version)
+
+
+class _Handle:
+    pass
+
+
+class _swap_with_cloned(saved_tensors_hooks):
+    def __init__(self, ctx):
+        def pack_hook(t):
+            tid = _get_tid(t)
+            sid = _get_sid(t)
+            # Tensors saved for backward have an entry in ctx.tid_to_weakhandle
+            handle: Optional[_Handle] = None
+
+            # Save aliasing information
+            ctx.sid_to_tid[sid].add(tid)
+
+            # NB: The same tensor (of the same version) can be saved multiple times
+            if tid not in ctx.tid_to_weakhandle:
+                handle = _Handle()
+                ctx.tid_to_weakhandle[tid] = handle
+                ctx.original[handle] = t
+            else:
+                # Store an additional strong reference to the handle
+                handle = ctx.tid_to_weakhandle[tid]
+            return handle
+
+        def unpack_hook(tup):
+            handle = tup
+            error_msg = (
+                "Trying to backward outside of the 'allow_mutation_on_saved_tensors' context"
+                "in which the graph was originally recorded."
+            )
+            assert _allow_mutation_on_saved_tensors_enabled, error_msg
+            if handle in ctx.cloned:
+                res = ctx.cloned[handle]
+            else:
+                assert handle in ctx.original, error_msg
+                res = ctx.original[handle]
+            return res
+
+        super().__init__(pack_hook, unpack_hook)
+
+
+class _CloneArgBeforeMutateMode(TorchDispatchMode):
+    def __init__(self, ctx):
+        self.ctx = ctx
+
+    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
+        kwargs = kwargs or {}
+
+        for idx, arg in enumerate(func._schema.arguments):
+            if arg.alias_info is not None and arg.alias_info.is_write:
+                t = kwargs["out"] if arg.is_out else args[idx]
+                tid = _get_tid(t)
+                sid = _get_sid(t)
+                ctx = self.ctx
+                if sid in ctx.sid_to_tid:
+                    for tid in ctx.sid_to_tid[sid]:
+                        if tid not in ctx.tid_to_weakhandle:
+                            # We know that if tid is in sid_to_tid, then it must also be in
+                            # tid_to_weakhandle. However, it is possible for the tensor to be
+                            # saved at one point, but cleared by backward before it is modified
+                            # in-place. Consider the following example:
+                            #
+                            # >>> a = torch.randn(2, 3, requires_grad=True).clone()
+                            # >>> out = (a**2).sum()
+                            # >>> out.backward()
+                            # >>> a.sin_()
+                            continue
+                        handle = ctx.tid_to_weakhandle[tid]
+                        if handle in ctx.cloned:
+                            # The same exact tensor has been cloned already
+                            continue
+                        ctx.cloned[handle] = ctx.original[handle].clone()
+                        del ctx.original[handle]
+
+        rs = func(*args, **kwargs)
+        return rs
+
+
+class _AllowMutationOnSavedContext:
+    def __init__(self):
+        self.cloned: weakref.WeakKeyDictionary = weakref.WeakKeyDictionary()
+        self.original: weakref.WeakKeyDictionary = weakref.WeakKeyDictionary()
+        self.tid_to_weakhandle: weakref.WeakValueDictionary = (
+            weakref.WeakValueDictionary()
+        )
+        self.sid_to_tid: Dict[Tuple[int, int], Set[Tuple[int, int, int]]] = defaultdict(
+            set
+        )
+
+    def clear(self):
+        self.cloned.clear()
+        self.original.clear()
+        self.tid_to_weakhandle.clear()
+        self.sid_to_tid.clear()
+
+
+@contextlib.contextmanager
+def allow_mutation_on_saved_tensors():
+    """Context manager under which mutating tensors saved for backward is allowed.
+
+    Under this context manager, tensors saved for backward are cloned on mutation,
+    so the original version can still be used during backward. Normally, mutating a tensor
+    saved for backward will result in an error raised when it's used during backward.
+
+    To ensure the correct behavior, both the forward and backward should be run under
+    the same context manager.
+
+    Returns:
+        An _AllowMutationOnSavedContext object storing the state managed by this
+        context manager. This object can be useful for debugging purposes. The state
+        managed by the context manager is automatically cleared upon exiting.
+
+    Example::
+
+        >>> import torch
+        >>> with torch.autograd.graph.allow_mutation_on_saved_tensors():
+        ...     # forward
+        ...     a = torch.ones(2, 3, requires_grad=True)
+        ...     b = a.clone()
+        ...     out = (b**2).sum()
+        ...     b.sin_()
+        ...     # backward
+        ...     out.sum().backward()
+        ...
+        tensor([[0.8415, 0.8415, 0.8415],
+                [0.8415, 0.8415, 0.8415]], grad_fn=<SinBackward0>)
+    """
+    global _allow_mutation_on_saved_tensors_enabled
+
+    ctx = _AllowMutationOnSavedContext()
+
+    with _swap_with_cloned(ctx), _CloneArgBeforeMutateMode(ctx):
+        try:
+            if _allow_mutation_on_saved_tensors_enabled:
+                raise RuntimeError(
+                    "allow_mutation_on_saved_tensors contexts cannot be nested"
+                )
+            _allow_mutation_on_saved_tensors_enabled = True
+            yield ctx
+        finally:
+            ctx.clear()
+            _allow_mutation_on_saved_tensors_enabled = False
+
+
+def _register_logging_hooks_on_whole_graph(t_outputs: List[torch.Tensor]):
+    grad_fns = list(map(_get_grad_fn_or_grad_acc, t_outputs))
+
+    def iter_graph(roots):
+        if not roots:
+            return
+        seen = set()
+        q: Deque = collections.deque()
+        for node in roots:
+            if node is not None:
+                seen.add(node)
+                q.append(node)
+
+        while q:
+            node = q.popleft()
+            for fn, _idx in node.next_functions:
+                if fn in seen or fn is None:
+                    continue
+                seen.add(fn)
+                q.append(fn)
+
+            yield node
+
+    def fmt(t):
+        # Avoid circular import
+        from torch.testing._internal.common_utils import dtype_abbrs
+
+        if t is None:
+            return "None"
+        return f"{dtype_abbrs[t.dtype]}[{', '.join(map(str, t.shape))}]"
+
+    def prehook(grad_outputs):
+        node = torch._C._current_autograd_node()
+        grad_outputs_str = f"[{','.join(fmt(t) for t in grad_outputs)}]"
+        log_str = f"Executing: {node} with grad_outputs: {grad_outputs_str}"
+        log.debug(log_str)
+
+    handles = []
+    for node in iter_graph(grad_fns):
+        handles.append(node.register_prehook(prehook))
+
+    def unregister_hooks():
+        for handle in handles:
+            handle.remove()
+
+    return unregister_hooks
+
+
+def _engine_run_backward(t_outputs, *args, **kwargs):
+    attach_logging_hooks = log.getEffectiveLevel() <= logging.DEBUG
+    if attach_logging_hooks:
+        unregister_hooks = _register_logging_hooks_on_whole_graph(t_outputs)
+    try:
+        return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
+            t_outputs, *args, **kwargs
+        )
+    finally:
+        if attach_logging_hooks:
+            unregister_hooks()  # type: ignore[possibly-undefined]
diff --git a/MLPY/Lib/site-packages/torch/autograd/profiler.py b/MLPY/Lib/site-packages/torch/autograd/profiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..5daeb6d250ffa5982e6ddaca21d4a7a689f8e56b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/autograd/profiler.py
@@ -0,0 +1,1042 @@
+from collections import defaultdict
+from typing import Any, Dict, List, Optional
+from warnings import warn
+
+import torch
+
+import torch.cuda
+from torch._C import _get_privateuse1_backend_name
+from torch._C._profiler import _ExperimentalConfig
+
+from torch.autograd import (
+    _disable_profiler,
+    _enable_profiler,
+    _kineto_step,
+    _prepare_profiler,
+    _ProfilerResult,
+    _supported_activities,
+    DeviceType,
+    kineto_available,
+    ProfilerActivity,
+    ProfilerConfig,
+    ProfilerState,
+)
+from torch.autograd.profiler_util import (
+    _filter_name,
+    _filter_stack_entry,
+    _rewrite_name,
+    EventList,
+    FunctionEvent,
+    MEMORY_EVENT_NAME,
+    MemRecordsAcc,
+    OUT_OF_MEMORY_EVENT_NAME,
+)
+from torch.futures import Future
+
+__all__ = [
+    "profile",
+    "record_function",
+    "emit_itt",
+    "emit_nvtx",
+    "load_nvprof",
+    "EnforceUnique",
+    "parse_nvprof_trace",
+    "KinetoStepTracker",
+    "EventList",
+    "FunctionEvent",
+    "MemRecordsAcc",
+]
+
+try:
+    # Available in Python >= 3.2
+    from contextlib import ContextDecorator as _ContextDecorator
+except ImportError:
+    import functools
+
+    class _ContextDecorator:  # type: ignore[no-redef]
+        def __enter__(self):
+            raise NotImplementedError
+
+        def __exit__(self, exc_type, exc_val, exc_tb):
+            raise NotImplementedError
+
+        def __call__(self, func):
+            @functools.wraps(func)
+            def wrapped(*args, **kwargs):
+                with self:
+                    return func(*args, **kwargs)
+
+            return wrapped
+
+
+# global python state - whether profiler is currently enabled
+# useful for fast python checks to reduce latency
+_is_profiler_enabled: bool = False
+
+
+def _set_is_profiler_enabled(enable: bool):
+    global _is_profiler_enabled
+    _is_profiler_enabled = enable
+
+
+def _run_on_profiler_start():
+    _set_is_profiler_enabled(True)
+
+
+def _run_on_profiler_stop():
+    _set_is_profiler_enabled(False)
+
+
+class profile:
+    """Context manager that manages autograd profiler state and holds a summary of results.
+
+    Under the hood it just records events of functions being executed in C++ and
+    exposes those events to Python. You can wrap any code in it, and it will
+    only report the runtime of PyTorch functions.
+    Note: the profiler is thread local and is automatically propagated into async tasks.
+
+    Args:
+        enabled (bool, optional): Setting this to False makes this context manager a no-op.
+
+        use_cuda (bool, optional): Enables timing of CUDA events as well using the cudaEvent API.
+            Adds approximately 4us of overhead to each tensor operation.
+
+        record_shapes (bool, optional): If shapes recording is set, information
+            about input dimensions will be collected. This allows one to see which
+            dimensions have been used under the hood and further group by them
+            using prof.key_averages(group_by_input_shape=True). Please note that
+            shape recording might skew your profiling data. It is recommended to
+            use separate runs with and without shape recording to validate the timing.
+            Most likely the skew will be negligible for the bottommost events (in the case
+            of nested function calls), but for higher-level functions the total
+            self CPU time might be artificially increased because of the shape
+            collection.
+
+        with_flops (bool, optional): If with_flops is set, the profiler will estimate
+            the FLOPs (floating point operations) value using the operator's input shape.
+            This allows one to estimate the hardware performance. Currently,
+            this option only works for the matrix multiplication and 2D convolution operators.
+
+        profile_memory (bool, optional): track tensor memory allocation/deallocation.
+
+        with_stack (bool, optional): record source information (file and line number) for the ops.
+
+        with_modules (bool): record module hierarchy (including function names)
+            corresponding to the callstack of the op. For example, if module A's forward calls
+            module B's forward, which contains an aten::add op,
+            then aten::add's module hierarchy is A.B.
+            Note that this support exists, at the moment, only for TorchScript models
+            and not eager mode models.
+
+        use_kineto (bool, optional): experimental, enable profiling with Kineto profiler.
+
+        use_cpu (bool, optional): profile CPU events; setting to ``False`` requires
+            ``use_kineto=True`` and can be used to lower the overhead for GPU-only profiling.
+
+        experimental_config (_ExperimentalConfig): A set of experimental options
+            used by profiler libraries like Kineto. Note, backward compatibility is not guaranteed.
+
+
+    .. warning::
+        Enabling memory profiling or source attribution incurs additional profiler
+        overhead.
+
+    .. warning::
+        This context manager should not be called recursively, i.e. no nested
+        instances are allowed.
+
+    .. warning::
+        Due to some CUDA multiprocessing limitations (multiprocessing-cuda-note_),
+        one cannot use the profiler with ``use_cuda = True`` to benchmark
+        DataLoaders with ``num_workers > 0``. If you wish to benchmark data loading,
+        please use ``use_cuda = False`` or ``num_workers = 0``.
+
+    Example:
+        >>> # xdoctest: +SKIP
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD_PROFILER)
+        >>> x = torch.randn((1, 1), requires_grad=True)
+        >>> with torch.autograd.profiler.profile() as prof:
+        >>>     for _ in range(100):  # any normal python code, really!
+        >>>         y = x ** 2
+        >>>         y.backward()
+        >>> # NOTE: some columns were removed for brevity
+        >>> print(prof.key_averages().table(sort_by="self_cpu_time_total"))
+        -----------------------------------  ---------------  ---------------  ---------------
+        Name                                 Self CPU total   CPU time avg     Number of Calls
+        -----------------------------------  ---------------  ---------------  ---------------
+        mul                                  32.048ms         32.048ms         200
+        pow                                  27.041ms         27.041ms         200
+        PowBackward0                         9.727ms          55.483ms         100
+        torch::autograd::AccumulateGrad      9.148ms          9.148ms          100
+        torch::autograd::GraphRoot           691.816us        691.816us        100
+        -----------------------------------  ---------------  ---------------  ---------------
+
+    """
+
+    def __init__(
+        self,
+        enabled=True,
+        *,
+        use_cuda=False,
+        use_device=None,
+        record_shapes=False,
+        with_flops=False,
+        profile_memory=False,
+        with_stack=False,
+        with_modules=False,
+        use_kineto=False,
+        use_cpu=True,
+        use_mtia=False,
+        experimental_config=None,
+    ):
+        self.enabled: bool = enabled
+        if not self.enabled:
+            return
+        self.use_cuda = use_cuda
+        self.use_device: Optional[str] = (
+            use_device if use_device != "privateuseone" else None
+        )
+        self.function_events: Optional[EventList] = None
+        self.entered = False
+        self.record_shapes = record_shapes
+        self.with_flops = with_flops
+        self.record_shapes |= self.with_flops
+        self.profile_memory = profile_memory
+        self.with_stack = with_stack
+        self.with_modules = with_modules
+        self.use_cpu = use_cpu
+        self.use_mtia = use_mtia
+        if experimental_config is None:
+            experimental_config = _ExperimentalConfig()
+        self.experimental_config = experimental_config
+        self.kineto_results: Optional[_ProfilerResult] = None
+
+        if not self.use_cpu:
+            assert (
+                use_kineto
+            ), "Device-only events supported only with Kineto (use_kineto=True)"
+
+        if self.use_device == "cuda":
+            self.use_device = None
+            self.use_cuda = True
+
+        if self.use_device and self.use_device != _get_privateuse1_backend_name():
+            warn(f"{self.use_device} doesn't support profile.")
+            self.use_device = None
+
+        if self.use_cuda and not torch.cuda.is_available():
+            warn("CUDA is not available, disabling CUDA profiling")
+            self.use_cuda = False
+
+        self.kineto_activities = set()
+        if self.use_cpu:
+            self.kineto_activities.add(ProfilerActivity.CPU)
+        if self.use_mtia:
+            self.kineto_activities.add(ProfilerActivity.MTIA)
+
+        self.profiler_kind = ProfilerState.KINETO
+        if self.use_cuda:
+            if not use_kineto or ProfilerActivity.CUDA not in _supported_activities():
+                assert self.use_cpu, "Legacy CUDA profiling requires use_cpu=True"
+                self.profiler_kind = ProfilerState.KINETO_GPU_FALLBACK
+            else:
+                self.kineto_activities.add(ProfilerActivity.CUDA)
+
+        if self.use_device:
+            if (
+                not use_kineto
+                or ProfilerActivity.PrivateUse1 not in _supported_activities()
+            ):
+                assert (
+                    self.use_cpu
+                ), "Legacy custombackend profiling requires use_cpu=True"
+                self.profiler_kind = ProfilerState.KINETO_PRIVATEUSE1_FALLBACK
+            else:
+                self.kineto_activities.add(ProfilerActivity.PrivateUse1)
+                self.profiler_kind = ProfilerState.KINETO_PRIVATEUSE1
+
+        assert (
+            len(self.kineto_activities) > 0
+        ), "No activities specified for the profiler"
+
+    def config(self):
+        return ProfilerConfig(
+            self.profiler_kind,
+            self.record_shapes,
+            self.profile_memory,
+            self.with_stack,
+            self.with_flops,
+            self.with_modules,
+            self.experimental_config,
+        )
+
+    def __enter__(self):
+        if not self.enabled:
+            return
+        if self.entered:
+            raise RuntimeError("Profiler context manager is not reentrant")
+        self._prepare_trace()
+        self._start_trace()
+        return self
+
+    def _prepare_trace(self):
+        self.entered = True
+        _prepare_profiler(self.config(), self.kineto_activities)
+
+    def _start_trace(self):
+        self.entered = True
+        _run_on_profiler_start()
+        _enable_profiler(self.config(), self.kineto_activities)
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if not self.enabled:
+            return
+        if self.use_cuda:
+            torch.cuda.synchronize()
+        self.kineto_results = _disable_profiler()
+        _run_on_profiler_stop()
+        parsed_results = self._parse_kineto_results(self.kineto_results)
+        self.function_events = EventList(
+            parsed_results,
+            use_cuda=self.use_cuda,
+            use_device=self.use_device,
+            profile_memory=self.profile_memory,
+            with_flops=self.with_flops,
+        )
+        self.function_events._build_tree()
+        return False
+
+    def __repr__(self):
+        if self.function_events is None:
+            return ""
+        return repr(self.function_events)
+
+    def __str__(self):
+        if self.function_events is None:
+            return ""
+        return str(self.function_events)
+
+    def _check_finish(self):
+        if self.function_events is None:
+            raise RuntimeError("Profiler didn't finish running")
+
+    def table(
+        self,
+        sort_by=None,
+        row_limit=100,
+        max_src_column_width=75,
+        max_name_column_width=55,
+        max_shapes_column_width=80,
+        header=None,
+        top_level_events_only=False,
+    ):
+        self._check_finish()
+        assert self.function_events is not None
+        return self.function_events.table(
+            sort_by=sort_by,
+            row_limit=row_limit,
+            max_src_column_width=max_src_column_width,
+            max_name_column_width=max_name_column_width,
+            max_shapes_column_width=max_shapes_column_width,
+            header=header,
+            top_level_events_only=top_level_events_only,
+        )
+
+    table.__doc__ = EventList.table.__doc__
+
+    def export_chrome_trace(self, path):
+        self._check_finish()
+        if kineto_available():
+            self.kineto_results.save(path)  # type: ignore[union-attr]
+        else:
+            return self.function_events.export_chrome_trace(path)  # type: ignore[union-attr]
+
+    export_chrome_trace.__doc__ = EventList.export_chrome_trace.__doc__
+
+    def export_stacks(self, path: str, metric: str = "self_cpu_time_total"):
+        self._check_finish()
+        assert self.function_events is not None, "Expected profiling results"
+        assert self.with_stack, "export_stacks() requires with_stack=True"
+        return self.function_events.export_stacks(path, metric)
+
+    def key_averages(self, group_by_input_shape=False, group_by_stack_n=0):
+        self._check_finish()
+        assert self.function_events is not None, "Expected profiling results"
+        return self.function_events.key_averages(group_by_input_shape, group_by_stack_n)
+
+    key_averages.__doc__ = EventList.key_averages.__doc__
+
+    def total_average(self):
+        self._check_finish()
+        assert self.function_events is not None, "Expected profiling results"
+        return self.function_events.total_average()
+
+    total_average.__doc__ = EventList.total_average.__doc__
+
+    @property
+    def self_cpu_time_total(self):
+        """Returns total time spent on CPU.
+
+        The total time is a sum of all self times across all the events.
+        """
+        self._check_finish()
+        assert self.function_events is not None
+        return self.function_events.self_cpu_time_total
+
+    def _parse_kineto_results(self, result: _ProfilerResult):
+        # result.events() has most of the events - PyTorch op-level and device-level events
+
+        trace_start_us = result.trace_start_us()
+        mem_records = [
+            [evt, False] for evt in result.events() if evt.name() == MEMORY_EVENT_NAME
+        ]
+        oom_records = [
+            evt for evt in result.events() if evt.name() == OUT_OF_MEMORY_EVENT_NAME
+        ]
+        mem_records_acc = MemRecordsAcc(mem_records)
+
+        def _cpu_memory_usage(mem_record):
+            return (
+                mem_record.nbytes()
+                if mem_record.device_type()
+                in [DeviceType.CPU, DeviceType.MKLDNN, DeviceType.IDEEP]
+                else 0
+            )
+
+        def _cuda_memory_usage(mem_record):
+            return (
+                mem_record.nbytes()
+                if mem_record.device_type() in [DeviceType.CUDA, DeviceType.HIP]
+                else 0
+            )
+
+        def _privateuse1_memory_usage(mem_record):
+            return (
+                mem_record.nbytes()
+                if mem_record.device_type() in [DeviceType.PrivateUse1]
+                else 0
+            )
+
+        # Create and return FunctionEvent list
+        function_events = []
+        device_corr_map: Dict[int, List[FunctionEvent]] = {}
+        max_evt_id = 0
+        for kineto_event in result.events():
+            if _filter_name(kineto_event.name()):
+                continue
+            rel_start_us = kineto_event.start_us() - trace_start_us
+            rel_end_us = rel_start_us + kineto_event.duration_us()
+            abs_end_us = kineto_event.start_us() + kineto_event.duration_us()
+
+            cpu_memory_usage = 0
+            cuda_memory_usage = 0
+            privateuse1_memory_usage = 0
+            if kineto_event.device_type() == DeviceType.CPU:
+                # find the corresponding memory allocation events
+                for mem_record in mem_records_acc.in_interval(
+                    kineto_event.start_us(), abs_end_us
+                ):
+                    cpu_memory_usage += _cpu_memory_usage(mem_record[0])
+                    cuda_memory_usage += _cuda_memory_usage(mem_record[0])
+                    privateuse1_memory_usage += _privateuse1_memory_usage(mem_record[0])
+                    mem_record[1] = True
+
+            is_async = kineto_event.is_async() or (
+                kineto_event.start_thread_id() != kineto_event.end_thread_id()
+            )
+
+            fe = FunctionEvent(
+                id=kineto_event.correlation_id(),
+                name=_rewrite_name(name=kineto_event.name(), with_wildcard=True),
+                trace_name=_rewrite_name(name=kineto_event.name(), with_wildcard=False),
+                thread=kineto_event.start_thread_id(),
+                start_us=rel_start_us,
+                end_us=rel_end_us,
+                fwd_thread=kineto_event.fwd_thread_id(),
+                input_shapes=kineto_event.shapes(),
+                concrete_inputs=kineto_event.concrete_inputs(),
+                stack=[
+                    entry
+                    for entry in kineto_event.stack()
+                    if _filter_stack_entry(entry)
+                ],
+                scope=kineto_event.scope(),
+                use_device=self.use_device,
+                cpu_memory_usage=cpu_memory_usage,
+                cuda_memory_usage=cuda_memory_usage,
+                privateuse1_memory_usage=privateuse1_memory_usage,
+                is_async=is_async,
+                sequence_nr=kineto_event.sequence_nr(),
+                device_type=kineto_event.device_type(),
+                device_index=kineto_event.device_index(),
+                flops=kineto_event.flops(),
+            )
+            max_evt_id = max(max_evt_id, fe.id)
+            if fe.device_type == DeviceType.CPU and not fe.is_async:
+                if self.use_device:
+                    privateuse1_time = kineto_event.privateuse1_elapsed_us()
+                    if privateuse1_time > 0:
+                        fe.append_kernel(fe.name, fe.device_index, privateuse1_time)
+                        fe.is_legacy = True
+                else:
+                    # Check if we have CUDA time as a fallback
+                    cuda_time = kineto_event.cuda_elapsed_us()
+                    if cuda_time > 0:
+                        fe.append_kernel(fe.name, fe.device_index, cuda_time)
+                        fe.is_legacy = True
+            function_events.append(fe)
+            corr_id = kineto_event.linked_correlation_id()
+            if corr_id > 0:
+                if corr_id not in device_corr_map:
+                    device_corr_map[corr_id] = []
+                device_corr_map[corr_id].append(fe)
+
+        # associate CUDA kernels and CUDA runtime (CPU) with CPU events
+        for fe in function_events:
+            if (
+                fe.device_type == DeviceType.CPU
+                and not fe.is_async
+                and fe.id in device_corr_map
+            ):
+                for f_evt in device_corr_map[fe.id]:
+                    if f_evt.device_type == DeviceType.CUDA:
+                        fe.append_kernel(
+                            f_evt.name,
+                            f_evt.device_index,
+                            f_evt.time_range.end - f_evt.time_range.start,
+                        )
+                    elif f_evt.device_type == DeviceType.CPU:
+                        # make sure that 'thread' of a CPU Kineto (e.g. CUDA Runtime) event is associated
+                        # with the 'thread' of the corresponding linked PyTorch event to properly track
+                        # parents and children
+                        f_evt.thread = fe.thread
+
+        def createFunctionEventForMemoryEvents(evt):
+            rel_start_us = evt.start_us() - trace_start_us
+            fe = FunctionEvent(
+                id=max_evt_id,
+                name=evt.name(),
+                trace_name=None,  # not outputting in the trace
+                thread=evt.start_thread_id(),
+                start_us=rel_start_us,
+                end_us=rel_start_us,  # no duration
+                fwd_thread=evt.start_thread_id(),
+                input_shapes=[],
+                stack=[],
+                scope=0,  # RecordScope::FUNCTION
+                use_device=self.use_device,
+                cpu_memory_usage=_cpu_memory_usage(evt),
+                cuda_memory_usage=_cuda_memory_usage(evt),
+                privateuse1_memory_usage=_privateuse1_memory_usage(evt),
+                is_async=False,
+                sequence_nr=-1,
+                device_type=DeviceType.CPU,
+                device_index=0,
+            )
+            return fe
+
+        # output top-level memory events
+        for mem_record in mem_records:
+            if not mem_record[1]:
+                max_evt_id += 1
+                fe = createFunctionEventForMemoryEvents(mem_record[0])
+                function_events.append(fe)
+
+        for oom_record in oom_records:
+            max_evt_id += 1
+            fe = createFunctionEventForMemoryEvents(oom_record)
+            function_events.append(fe)
+
+        function_events.sort(
+            key=lambda evt: [evt.time_range.start, -evt.time_range.end]
+        )
+        return function_events
+
+
+class record_function(_ContextDecorator):
+    """Context manager/function decorator that adds a label to a code block/function when running autograd profiler.
+
+    It is useful for attributing time in a trace to a specific region of code.
+
+    Args:
+        name (str): Label assigned to the block of code.
+        node_id (int): ID of the node, for distributed profiling. Unset in
+            non-distributed cases.
+
+    Example:
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD_PROFILER)
+        >>> x = torch.randn((1, 1), requires_grad=True)
+        >>> with torch.autograd.profiler.profile() as prof:
+        ...     y = x ** 2
+        ...     with torch.autograd.profiler.record_function("label-z"): # label the block
+        ...         z = y ** 3
+        ...     y.backward()
+        ...
+        >>> # xdoctest: +IGNORE_WANT
+        >>> # NOTE: some columns were removed for brevity
+        >>> print(prof.key_averages().table(sort_by="self_cpu_time_total"))
+        -----------------------------------  ---------------  ---------------  ---------------
+        Name                                 Self CPU total %  CPU time avg     Number of Calls
+        -----------------------------------  ---------------  ---------------  ---------------
+        pow                                  60.77%           47.470us         3
+        mul                                  21.73%           25.465us         2
+        PowBackward0                         12.03%           121.891us        1
+        torch::autograd::AccumulateGrad      2.70%            6.324us          1
+        label-z                              2.13%            12.421us         1
+        torch::autograd::GraphRoot           0.64%            1.503us          1
+        -----------------------------------  ---------------  ---------------  ---------------
+        Self CPU time total: 234.344us
+        CUDA time total: 0.000us
+
+    """
+
+    def __init__(self, name: str, args: Optional[str] = None):
+        self.name: str = name
+        self.args: Optional[str] = args
+        # Whether or not we should run record function's end callbacks when exiting.
+        self.run_callbacks_on_exit: bool = True
+        # TODO: TorchScript ignores standard type annotation here
+        # self.record: Optional["torch.classes.profiler._RecordFunction"] = None
+        self.record = torch.jit.annotate(
+            Optional["torch.classes.profiler._RecordFunction"], None
+        )
+
+    def __enter__(self):
+        self.record = torch.ops.profiler._record_function_enter_new(
+            self.name, self.args
+        )
+        return self
+
+    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any):
+        if not self.run_callbacks_on_exit:
+            return
+
+        # Local variable is needed by TorchScript to refine Optional[T] to T
+        record = self.record
+        assert record is not None
+
+        # TODO: Too slow with __torch_function__ handling enabled
+        # See https://github.com/pytorch/pytorch/issues/76410
+        if not torch.jit.is_scripting():
+            with torch._C.DisableTorchFunctionSubclass():
+                torch.ops.profiler._record_function_exit._RecordFunction(record)
+        else:
+            torch.ops.profiler._record_function_exit(record)
+
+    def _call_end_callbacks_on_future(self, fut: Future[Any]) -> Future[Any]:
+        """Use for profiling async calls that return a future.
+
+        Calling this function will extend recording beyond this scope, until the future is
+        satisfied. It is useful for profiling the end-to-end time of asynchronous calls.
+        This function should only be called once to attach the callback onto the future, and
+        will throw if called multiple times.
+
+        Args:
+            fut (torch._C.Future): future for which to schedule the
+                callback.
+
+        Returns:
+            A future that completes with the value of the passed-in future when
+            the profiling callbacks have run.
+
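+        Example (a minimal sketch; the future here is completed by hand purely
+        for illustration)::
+
+            >>> # xdoctest: +SKIP
+            >>> fut = torch.futures.Future()
+            >>> with torch.autograd.profiler.record_function("async-label") as rf:
+            ...     profiled_fut = rf._call_end_callbacks_on_future(fut)
+            >>> fut.set_result(torch.ones(1))  # recording for "async-label" ends here
+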
+        """
+        # Throw if we have already attached a callback onto the future.
+        if not self.run_callbacks_on_exit:
+            raise RuntimeError("_call_end_callbacks_on_future can only be called once.")
+
+        # We are scheduling to run this RecordFunction's end callbacks when the
+        # passed in future completes, so don't run end callbacks on exit.
+        self.run_callbacks_on_exit = False
+
+        # Local variable is needed by TorchScript to refine Optional[T] to T
+        record = self.record
+        assert record is not None
+
+        # TODO: Too slow with __torch_function__ handling enabled
+        # See https://github.com/pytorch/pytorch/issues/76410
+        if not torch.jit.is_scripting():
+            with torch._C.DisableTorchFunctionSubclass():
+                profiled_future = (
+                    torch.ops.profiler._call_end_callbacks_on_jit_fut._RecordFunction(
+                        record, fut
+                    )
+                )
+        else:
+            profiled_future = torch.ops.profiler._call_end_callbacks_on_jit_fut(
+                record, fut
+            )
+        return profiled_future
+
+
+class emit_itt:
+    """Context manager that makes every autograd operation emit an ITT range.
+
+    It is useful when running the program under Intel(R) VTune Profiler::
+
+        vtune <--vtune-flags> <regular command here>
+
+    The Instrumentation and Tracing Technology (ITT) API enables your application to generate and
+    control the collection of trace data during its execution across different Intel tools.
+    This context manager is used to annotate the Intel(R) VTune Profiler trace. With the help of this
+    context manager, you will be able to see labeled ranges in the Intel(R) VTune Profiler GUI.
+
+    .. warning::
+        This context manager should not be called recursively, i.e. at most one
+        instance should be enabled at any given time.
+
+    Args:
+        enabled (bool, optional): Setting ``enabled=False`` makes this context manager a no-op.
+            Default: ``True``.
+        record_shapes (bool, optional): If ``record_shapes=True``, the itt range wrapping
+            each autograd op will append information about the sizes of Tensor arguments received
+            by that op, in the following format:
+            ``[[arg0.size(0), arg0.size(1), ...], [arg1.size(0), arg1.size(1), ...], ...]``
+            Non-tensor arguments will be represented by ``[]``.
+            Arguments will be listed in the order they are received by the backend op.
+            Please note that this order may not match the order in which those arguments were passed
+            on the Python side.  Also note that shape recording may increase the overhead of itt range creation.
+            Default: ``False``
+
+    Example:
+        >>> # xdoctest: +SKIP("Undefined variables")
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD_PROFILER)
+        >>> with torch.autograd.profiler.emit_itt():
+        ...     model(x)
+
+    """
+
+    def __init__(self, enabled=True, record_shapes=False):
+        self.enabled = enabled
+        self.entered = False
+        self.record_shapes = record_shapes
+
+    def __enter__(self):
+        if not self.enabled:
+            return
+        if self.entered:
+            raise RuntimeError("ITT annotation context manager is not reentrant")
+        self.entered = True
+        _run_on_profiler_start()
+        _enable_profiler(
+            ProfilerConfig(
+                ProfilerState.ITT,
+                self.record_shapes,
+                False,
+                False,
+                False,
+                False,
+                _ExperimentalConfig(),
+            ),
+            set(),
+        )
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if not self.enabled:
+            return
+        _disable_profiler()
+        _run_on_profiler_stop()
+        return False
+
+
+class emit_nvtx:
+    """Context manager that makes every autograd operation emit an NVTX range.
+
+    It is useful when running the program under nvprof::
+
+        nvprof --profile-from-start off -o trace_name.prof -- <regular command here>
+
+    Unfortunately, there's no way to force nvprof to flush the data it collected
+    to disk, so for CUDA profiling one has to use this context manager to annotate
+    nvprof traces and wait for the process to exit before inspecting them.
+    Then, either NVIDIA Visual Profiler (nvvp) can be used to visualize the timeline, or
+    :func:`torch.autograd.profiler.load_nvprof` can load the results for inspection
+    e.g. in Python REPL.
+
+    .. warning::
+        This context manager should not be called recursively, i.e. at most one
+        instance should be enabled at any given time.
+
+    Args:
+        enabled (bool, optional): Setting ``enabled=False`` makes this context manager a no-op.
+            Default: ``True``.
+        record_shapes (bool, optional): If ``record_shapes=True``, the nvtx range wrapping
+            each autograd op will append information about the sizes of Tensor arguments received
+            by that op, in the following format:
+            ``[[arg0.size(0), arg0.size(1), ...], [arg1.size(0), arg1.size(1), ...], ...]``
+            Non-tensor arguments will be represented by ``[]``.
+            Arguments will be listed in the order they are received by the backend op.
+            Please note that this order may not match the order in which those arguments were passed
+            on the Python side.  Also note that shape recording may increase the overhead of nvtx range creation.
+            Default: ``False``
+
+    Example:
+        >>> # xdoctest: +SKIP("undefined variables")
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD_PROFILER)
+        >>> with torch.cuda.profiler.profile():
+        ...     model(x)  # Warmup CUDA memory allocator and profiler
+        ...     with torch.autograd.profiler.emit_nvtx():
+        ...         model(x)
+
+    **Forward-backward correlation**
+
+    When viewing a profile created using :class:`emit_nvtx` in the Nvidia Visual Profiler,
+    correlating each backward-pass op with the corresponding forward-pass op can be difficult.
+    To ease this task, :class:`emit_nvtx` appends sequence number information to the ranges it
+    generates.
+
+    During the forward pass, each function range is decorated with ``seq=<N>``.  ``seq`` is a running
+    counter, incremented each time a new backward Function object is created and stashed for backward.
+    Thus, the ``seq=<N>`` annotation associated with each forward function range tells you that
+    if a backward Function object is created by this forward function,
+    the backward object will receive sequence number N.
+    During the backward pass, the top-level range wrapping each C++ backward Function's
+    ``apply()`` call is decorated with ``stashed seq=<M>``.  ``M`` is the sequence number that
+    the backward object was created with.  By comparing ``stashed seq`` numbers in backward with ``seq``
+    numbers in forward, you can track down which forward op created each backward Function.
+
+    Any functions executed during the backward pass are also decorated with ``seq=<N>``.  During
+    default backward (with ``create_graph=False``) this information is irrelevant, and in fact,
+    ``N`` may simply be 0 for all such functions.  Only the top-level ranges associated with
+    backward Function objects' ``apply()`` methods are useful, as a way to correlate these Function
+    objects with the earlier forward pass.
+
+    **Double-backward**
+
+    If, on the other hand, a backward pass with ``create_graph=True`` is underway (in other words,
+    if you are setting up for a double-backward), each function's execution during backward
+    is given a nonzero, useful ``seq=<N>``.  Those functions may themselves create Function objects
+    to be executed later during double-backward, just as the original functions in the forward pass did.
+    The relationship between backward and double-backward is conceptually the same as the relationship
+    between forward and backward: The functions still emit current-sequence-number-tagged ranges,
+    the Function objects they create still stash those sequence numbers, and during the eventual
+    double-backward, the Function objects' ``apply()`` ranges are still tagged with ``stashed seq``
+    numbers, which can be compared to `seq` numbers from the backward pass.
+
+    .. warning::
+        The sequence number is thread-local, and some forward functions don't create an associated
+        backward Function object (instead delegating that to sub-functions further down the call chain).
+        For these reasons, the correspondence of stashed sequence numbers in
+        backward Function ``apply()`` ranges with `seq` numbers in forward-pass ranges is
+        not guaranteed to be 1 to 1.  The sequence numbers alone may not be enough to fully
+        disambiguate which forward function created which
+        backward Function object.  You may need to make a judgment based on analytic knowledge of what
+        the expected correspondence should be.
+    """
+
+    def __init__(self, enabled=True, record_shapes=False):
+        self.enabled = enabled
+        self.entered = False
+        self.record_shapes = record_shapes
+
+    def __enter__(self):
+        if not self.enabled:
+            return
+        if self.entered:
+            raise RuntimeError("NVTX annotation context manager is not reentrant")
+        self.entered = True
+        torch.cuda.synchronize()
+        _run_on_profiler_start()
+        _enable_profiler(
+            ProfilerConfig(
+                ProfilerState.NVTX,
+                self.record_shapes,
+                False,
+                False,
+                False,
+                False,
+                _ExperimentalConfig(),
+            ),
+            set(),
+        )
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if not self.enabled:
+            return
+        torch.cuda.synchronize()
+        _disable_profiler()
+        _run_on_profiler_stop()
+        return False
+
+
+def load_nvprof(path):
+    """Open an nvprof trace file and parses autograd annotations.
+
+    Args:
+        path (str): path to nvprof trace
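+
+    Example (an illustrative sketch; assumes a trace previously captured with
+    ``nvprof -o trace_name.prof`` under :class:`emit_nvtx`):
+        >>> # xdoctest: +SKIP("requires an existing nvprof trace file")
+        >>> events = torch.autograd.profiler.load_nvprof("trace_name.prof")
+        >>> print(events.table(sort_by="cpu_time_total"))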
+    """
+    return EventList(parse_nvprof_trace(path))
+
+
+class EnforceUnique:
+    """Raises an error if a key is seen more than once."""
+
+    def __init__(self):
+        self.seen = set()
+
+    def see(self, *key):
+        r"""
+        Observe a key and raise an error if it is seen multiple times.
+        """
+        if key in self.seen:
+            raise RuntimeError("duplicate key: " + str(key))
+        self.seen.add(key)
+
+
+def parse_nvprof_trace(path):
+    import sqlite3
+
+    conn = sqlite3.connect(path)
+    conn.row_factory = sqlite3.Row
+
+    # Parse strings table
+    strings = {}
+    for r in conn.execute("SELECT _id_ as id, value FROM StringTable"):
+        strings[r["id"]] = torch._C._demangle(r["value"])
+
+    # First, find all functions and create FunctionEvents for them
+    marker_query = """
+    SELECT
+        start.id AS marker_id, start.name, start.timestamp AS start_time, end.timestamp AS end_time
+    FROM
+        CUPTI_ACTIVITY_KIND_MARKER AS start INNER JOIN CUPTI_ACTIVITY_KIND_MARKER AS end
+        ON start.id = end.id
+    WHERE
+        start.name != 0 AND end.name = 0
+    """
+    functions = []
+    functions_map = {}
+    unique = EnforceUnique()
+    for row in conn.execute(marker_query):
+        unique.see(row["marker_id"])
+        evt = FunctionEvent(
+            id=row["marker_id"],
+            node_id=0,  # node_id is not available when parsing an nvprof trace; use 0
+            # so that creating the FunctionEvent() object does not fail
+            name=strings[row["name"]],
+            start_us=row["start_time"],
+            end_us=row["end_time"],
+            thread=0,
+        )  # TODO: find in sqlite database
+        functions.append(evt)
+        functions_map[evt.id] = evt
+
+    # Now, correlate all kernels with FunctionEvents
+    kernel_query = """
+    SELECT
+        start.id AS marker_id, start.name, start.timestamp, end.timestamp,
+        runtime._id_ AS runtime_id, runtime.cbid, runtime.start AS runtime_start, runtime.end AS runtime_end,
+        kernel.start AS kernel_start, kernel.end AS kernel_end, kernel.name AS kernel_name
+    FROM
+        CUPTI_ACTIVITY_KIND_MARKER AS start
+        INNER JOIN CUPTI_ACTIVITY_KIND_MARKER AS end
+            ON start.id = end.id
+        INNER JOIN CUPTI_ACTIVITY_KIND_RUNTIME as runtime
+            ON (start.timestamp < runtime.start AND runtime.end < end.timestamp)
+        INNER JOIN CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL AS kernel
+            ON kernel.correlationId = runtime.correlationId
+    """
+    unique = EnforceUnique()
+    for row in conn.execute(kernel_query):
+        unique.see(row["marker_id"], row["runtime_id"])
+        # 211 is cudaKernelLaunch for cuda >= 9.2
+        assert row["cbid"] == 211
+        evt = functions_map[row["marker_id"]]
+        evt.append_kernel(
+            row["kernel_name"], 0, row["kernel_end"] - row["kernel_start"]
+        )
+
+    functions.sort(key=lambda evt: evt.time_range.start)
+    return functions
+
+
+class KinetoStepTracker:
+    """Provides an abstraction for incrementing the step count globally.
+
+    Previously, we only had one place to mark that a step() has occurred
+    in the program, via the pytorch profiler step(). We now also add step hooks
+    in the Optimizer class (https://github.com/pytorch/pytorch/issues/88446).
+
+    - This could mean programs that already call profiler.step() every
+      iteration can end up double-incrementing the step count.
+    - If a model uses multiple optimizers, the step can also be counted
+      twice or more.
+
+    We fix this by adding a layer of abstraction before calling step()
+    to the kineto library. The idea is to maintain steps per requester in a dict:
+
+    .. code-block::
+
+        {
+           "ProfilerStep": 100,  # triggered by profiler step() call
+           "Optimizer1Step": 100,   # Optimizer 1 or 2 are just examples, could be SGD, Adam etc
+           "Optimizer2Step": 100,
+        }
+
+    To figure out the global step count, just take the max of the dict values (100).
+
+    If one of the counts increments, the max will go up.
+
+    .. code-block::
+
+        {
+           "ProfilerStep": 100,
+           "Optimizer1Step": 101,   # Optimizer1 got incremented first say
+           "Optimizer2Step": 100,
+        }
+
+    Then the global step count is 101.
+    We only call the kineto step() function when the global count increments.
+
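+    For example, a hypothetical requester could drive the tracker like this
+    (a minimal illustrative sketch):
+
+    .. code-block::
+
+        from torch.autograd.profiler import KinetoStepTracker
+
+        KinetoStepTracker.init_step_count("MyRequester")
+        KinetoStepTracker.increment_step("MyRequester")  # bumps the global step if this is now the max
+        print(KinetoStepTracker.current_step())
+        KinetoStepTracker.erase_step_count("MyRequester")
+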
+    NOTE: Please do not use the KinetoStepTracker in modules besides the Optimizer
+    for now. Doing so could result in incorrect increments of the step count.
+    """
+
+    _current_step = 0
+    _step_dict: Dict[str, int] = defaultdict(int)
+
+    @classmethod
+    def init_step_count(cls, requester: str):
+        r"""
+        Initialize for a given requester.
+        """
+        cls._step_dict[requester] = cls._current_step
+
+    @classmethod
+    def erase_step_count(cls, requester: str) -> bool:
+        r"""
+        Remove a given requester.
+        """
+        return cls._step_dict.pop(requester, None) is not None
+
+    @classmethod
+    def increment_step(cls, requester: str) -> int:
+        """Increments the step count for the requester.
+
+        Additionally if the max over all step counts has incremented then
+        trigger the _kineto_step() returns global step count
+        """
+        if requester not in cls._step_dict:
+            cls.init_step_count(requester)
+        cls._step_dict[requester] += 1
+
+        new_step = max(cls._step_dict.values())
+        if new_step > cls._current_step:
+            delta = new_step - cls._current_step
+            if delta > 1:
+                warn(
+                    "Profiler step count has increased more than 1 - "
+                    f"current_step = {cls._current_step} step dict =  {cls._step_dict}"
+                )
+            for _ in range(0, delta):
+                _kineto_step()
+            cls._current_step = new_step
+        return cls._current_step
+
+    @classmethod
+    def current_step(cls) -> int:
+        r"""
+        Get the latest step for any requester
+        """
+        return cls._current_step
diff --git a/MLPY/Lib/site-packages/torch/autograd/profiler_legacy.py b/MLPY/Lib/site-packages/torch/autograd/profiler_legacy.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbccb7e69d26ce2e95bad4e64c33eea51594ce7b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/autograd/profiler_legacy.py
@@ -0,0 +1,303 @@
+import itertools
+from warnings import warn
+
+import torch
+import torch.cuda
+
+from torch.autograd import (
+    _disable_profiler_legacy,
+    _enable_profiler_legacy,
+    DeviceType,
+    ProfilerConfig,
+    ProfilerState,
+)
+from torch.autograd.profiler_util import (
+    _filter_name,
+    _filter_stack_entry,
+    _rewrite_name,
+    EventList,
+    FunctionEvent,
+    MEMORY_EVENT_NAME,
+)
+
+__all__ = ["profile"]
+
+
+class profile:
+    """DEPRECATED: use torch.profiler instead."""
+
+    def __init__(
+        self,
+        enabled=True,
+        *,
+        use_cuda=False,
+        record_shapes=False,
+        with_flops=False,
+        profile_memory=False,
+        with_stack=False,
+        with_modules=False,
+    ):
+        self.enabled: bool = enabled
+        if not self.enabled:
+            return
+        self.use_cuda = use_cuda
+        self.function_events = None
+        self.entered = False
+        self.record_shapes = record_shapes
+        self.with_flops = with_flops
+        self.record_shapes |= self.with_flops
+        self.profile_memory = profile_memory
+        self.with_stack = with_stack
+        self.with_modules = with_modules
+
+        if self.use_cuda and not torch.cuda.is_available():
+            warn("CUDA is not available, disabling CUDA profiling")
+            self.use_cuda = False
+
+        if self.use_cuda:
+            self.profiler_kind = ProfilerState.CUDA
+        else:
+            self.profiler_kind = ProfilerState.CPU
+
+    def config(self):
+        return ProfilerConfig(
+            self.profiler_kind,
+            self.record_shapes,
+            self.profile_memory,
+            self.with_stack,
+            self.with_flops,
+            self.with_modules,
+            # avoid exposing _ExperimentalConfig in the legacy public API
+            torch._C._profiler._ExperimentalConfig(),
+        )
+
+    def __enter__(self):
+        if not self.enabled:
+            return
+        if self.entered:
+            raise RuntimeError("Profiler context manager is not reentrant")
+        self.entered = True
+        self._start_trace()
+        return self
+
+    def _start_trace(self):
+        _enable_profiler_legacy(self.config())
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if not self.enabled:
+            return
+        if self.use_cuda:
+            torch.cuda.synchronize()
+
+        records = _disable_profiler_legacy()
+        parsed_results = _parse_legacy_records(records)
+        self.function_events = EventList(
+            parsed_results,
+            use_cuda=self.use_cuda,
+            profile_memory=self.profile_memory,
+            with_flops=self.with_flops,
+        )
+        self.function_events._build_tree()
+        return False
+
+    def __repr__(self):
+        if self.function_events is None:
+            return ""
+        return repr(self.function_events)
+
+    def __str__(self):
+        if self.function_events is None:
+            return ""
+        return str(self.function_events)
+
+    def _check_finish(self):
+        if self.function_events is None:
+            raise RuntimeError("Profiler didn't finish running")
+
+    def table(
+        self,
+        sort_by=None,
+        row_limit=100,
+        max_src_column_width=75,
+        max_name_column_width=55,
+        max_shapes_column_width=80,
+        header=None,
+        top_level_events_only=False,
+    ):
+        self._check_finish()
+        assert self.function_events is not None
+        return self.function_events.table(
+            sort_by=sort_by,
+            row_limit=row_limit,
+            max_src_column_width=max_src_column_width,
+            max_name_column_width=max_name_column_width,
+            max_shapes_column_width=max_shapes_column_width,
+            header=header,
+            top_level_events_only=top_level_events_only,
+        )
+
+    table.__doc__ = EventList.table.__doc__
+
+    def export_chrome_trace(self, path):
+        self._check_finish()
+        assert self.function_events is not None
+        return self.function_events.export_chrome_trace(path)
+
+    export_chrome_trace.__doc__ = EventList.export_chrome_trace.__doc__
+
+    def export_stacks(self, path: str, metric: str = "self_cpu_time_total"):
+        self._check_finish()
+        assert self.function_events is not None, "Expected profiling results"
+        assert self.with_stack, "export_stacks() requires with_stack=True"
+        return self.function_events.export_stacks(path, metric)
+
+    def key_averages(self, group_by_input_shape=False, group_by_stack_n=0):
+        self._check_finish()
+        assert self.function_events is not None, "Expected profiling results"
+        return self.function_events.key_averages(group_by_input_shape, group_by_stack_n)
+
+    key_averages.__doc__ = EventList.key_averages.__doc__
+
+    def total_average(self):
+        self._check_finish()
+        assert self.function_events is not None, "Expected profiling results"
+        return self.function_events.total_average()
+
+    total_average.__doc__ = EventList.total_average.__doc__
+
+    @property
+    def self_cpu_time_total(self):
+        """Return CPU time as the sum of self times across all events."""
+        self._check_finish()
+        assert self.function_events is not None
+        return self.function_events.self_cpu_time_total
+
+
+def _parse_legacy_records(thread_records):
+    def _get_record_key(record):
+        """Return a tuple for correlating start and end records in `_parse_legacy_records`."""
+        return (record.handle(), record.node_id())
+
+    next_id = 0
+    start_record = None
+    functions = []
+    record_stack = []
+
+    # '__start_profile' is not guaranteed to be first, so we must find it here
+    for record in itertools.chain.from_iterable(thread_records):
+        name = record.name()
+        if start_record is None and name == "__start_profile":
+            start_record = record
+
+    assert start_record is not None and not start_record.is_remote()
+
+    for thread_record_list in thread_records:
+        # accumulated memory allocations per handle
+        cpu_memory_allocs = {}
+        cuda_memory_allocs = {}
+        # ranges per handle
+        range_starts = {}
+
+        filtered_handles = set()
+        prev_record = None
+        for record in thread_record_list:
+            record_key = _get_record_key(record)
+            if _filter_name(record.name()) or record_key in filtered_handles:
+                filtered_handles.add(record_key)
+                continue
+
+            if record.kind() == "push":
+                # workaround to reduce double logging from operator
+                # wrappers and redispatch
+                if prev_record is not None:
+                    duplicate = (
+                        prev_record.name() == record.name()
+                        and prev_record.kind() == record.kind()
+                        and prev_record.node_id() == record.node_id()
+                    )
+                    if duplicate:
+                        filtered_handles.add(record_key)
+                        continue
+
+                range_starts[record_key] = record
+                cpu_memory_allocs[record_key] = 0
+                cuda_memory_allocs[record_key] = 0
+            elif record.kind() == "pop":
+                assert (
+                    record_key in range_starts
+                ), f"""Expected record with key {record_key} to exist in range_starts.
+                    This means that the pop event did not have a corresponding push."""
+
+                start = range_starts[record_key]
+
+                cpu_memory_usage = cpu_memory_allocs[record_key]
+                cuda_memory_usage = cuda_memory_allocs[record_key]
+                is_async = start.is_async() or (start.thread_id() != record.thread_id())
+                is_remote_event = record.is_remote()
+                start_flops = start.flops()
+
+                fe = FunctionEvent(
+                    id=record.handle(),
+                    node_id=record.node_id(),
+                    name=_rewrite_name(name=start.name(), with_wildcard=True),
+                    trace_name=_rewrite_name(name=start.name(), with_wildcard=False),
+                    thread=start.thread_id(),
+                    start_us=start_record.cpu_elapsed_us(start),
+                    end_us=start_record.cpu_elapsed_us(record),
+                    fwd_thread=start.fwd_thread_id(),
+                    input_shapes=start.shapes(),
+                    stack=[
+                        entry for entry in start.stack() if _filter_stack_entry(entry)
+                    ],
+                    scope=start.scope(),
+                    cpu_memory_usage=cpu_memory_usage,
+                    cuda_memory_usage=cuda_memory_usage,
+                    is_async=is_async,
+                    is_remote=is_remote_event,
+                    sequence_nr=start.sequence_nr(),
+                    device_type=DeviceType.CPU,
+                    is_legacy=True,
+                    flops=start_flops,
+                )
+                # note: async events have only cpu total time
+                if not is_async and start.has_cuda():
+                    duration = start.cuda_elapsed_us(record)
+                    if duration > 0:
+                        fe.append_kernel(start.name(), start.device(), duration)
+                functions.append(fe)
+                del range_starts[record_key]
+                del cpu_memory_allocs[record_key]
+                del cuda_memory_allocs[record_key]
+            elif record.kind() == "memory_alloc":
+                num_open_handles_cpu = len(cpu_memory_allocs)
+                num_open_handles_cuda = len(cuda_memory_allocs)
+                assert num_open_handles_cpu == num_open_handles_cuda
+                for handle in cpu_memory_allocs.keys():
+                    cpu_memory_allocs[handle] += record.cpu_memory_usage()
+                for handle in cuda_memory_allocs.keys():
+                    cuda_memory_allocs[handle] += record.cuda_memory_usage()
+                if num_open_handles_cpu == 0:
+                    # output event as a top-level memory event
+                    fe = FunctionEvent(
+                        id=0,
+                        name=MEMORY_EVENT_NAME,
+                        trace_name=None,
+                        thread=0,
+                        start_us=0,
+                        end_us=0,
+                        stack=[],
+                        cpu_memory_usage=record.cpu_memory_usage(),
+                        cuda_memory_usage=record.cuda_memory_usage(),
+                        is_legacy=True,
+                    )
+                    functions.append(fe)
+            prev_record = record
+
+    # Sort functions by start time then by end time ascending.
+    # This ensures that--in the case of nested events which
+    # have the same start time (which may happen due to the
+    # granularity of the given clock tick)--we always show
+    # the outermost nested call first. This adds stability
+    # in how FunctionEvents appear.
+    functions.sort(key=lambda evt: [evt.time_range.start, -evt.time_range.end])
+    return functions
diff --git a/MLPY/Lib/site-packages/torch/autograd/profiler_util.py b/MLPY/Lib/site-packages/torch/autograd/profiler_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..43dbab2b490092c96b16a3cdce49f8f0745b5847
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/autograd/profiler_util.py
@@ -0,0 +1,1178 @@
+import bisect
+import itertools
+import math
+
+from collections import defaultdict, namedtuple
+from operator import attrgetter
+
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+from torch.autograd import DeviceType
+
+__all__ = [
+    "EventList",
+    "FormattedTimesMixin",
+    "Interval",
+    "Kernel",
+    "FunctionEvent",
+    "FunctionEventAvg",
+    "StringTable",
+    "MemRecordsAcc",
+]
+
+
+class EventList(list):
+    """A list of Events (for pretty printing)."""
+
+    def __init__(self, *args, **kwargs):
+        use_cuda = kwargs.pop("use_cuda", True)
+        use_device = kwargs.pop("use_device", None)
+        profile_memory = kwargs.pop("profile_memory", False)
+        with_flops = kwargs.pop("with_flops", False)
+        super().__init__(*args, **kwargs)
+        self._use_cuda = use_cuda
+        self._use_device = use_device
+        self._profile_memory = profile_memory
+        self._tree_built = False
+        self._with_flops = with_flops
+
+    def _build_tree(self):
+        self._populate_cpu_children()
+        self._remove_dup_nodes()
+        self._set_backward_stacktraces()
+        self._tree_built = True
+
+    def __str__(self):
+        return self.table()
+
+    def _remove_dup_nodes(self):
+        while True:
+            to_delete = set()
+            for idx in range(len(self)):
+                if (
+                    self[idx].cpu_parent is not None
+                    and self[idx].cpu_parent.name == self[idx].name
+                    and len(self[idx].cpu_parent.cpu_children) == 1
+                ):
+                    self[idx].cpu_parent.cpu_children = self[idx].cpu_children
+                    self[idx].cpu_parent.kernels = self[idx].kernels  # lift kernels up
+                    for ch in self[idx].cpu_children:
+                        ch.cpu_parent = self[idx].cpu_parent
+                    to_delete.add(idx)
+            if len(to_delete) == 0:
+                break
+            new_evts = [ev for ind, ev in enumerate(self) if ind not in to_delete]
+            self.clear()
+            self.extend(new_evts)
+
+    def _populate_cpu_children(self):
+        """Populate child events into each underlying FunctionEvent object.
+
+        One event is a child of another if [s1, e1) is inside [s2, e2), where
+        s1 and e1 are the start and end of the child event's interval, and
+        s2 and e2 are the start and end of the parent event's interval.
+
+        Example: in the event list [[0, 10], [1, 3], [3, 4]], the interval [0, 10]
+        would be made the parent of the two other intervals.
+
+        If for any reason two intervals intersect only partially, this function
+        will not record a parent-child relationship between them.
+        """
+        # Some events can be async (i.e. start and end on different threads).
+        # Since it's generally undefined how to attribute children ranges to
+        # async ranges, we do not use them when calculating nested ranges and stats
+        sync_events = [
+            evt
+            for evt in self
+            if not evt.is_async and evt.device_type == DeviceType.CPU
+        ]
+        events = sorted(
+            sync_events,
+            key=attrgetter("thread"),
+        )
+        # Group by both thread and node_id, so that events that happen to have
+        # the same thread_id but are from different nodes aren't incorrectly
+        # grouped together.
+        threads = itertools.groupby(
+            events, key=lambda event: (event.thread, event.node_id)
+        )
+
+        # For each thread we keep a stack of current nested parents.
+        # We maintain the invariant that each interval is a subset of all other
+        # intervals lower in the stack.
+        #
+        # First we sort the intervals by their start time. Then we iterate over them.
+        # Every time we see a new interval we remove several parents from
+        # the top until we restore the invariant. Then a parent-child relationship
+        # is recorded if the stack is not empty.
+        # Finally, we push the new interval onto the stack.
+        #
+        # The algorithm has O(N * log(N)) complexity, where N is the number of
+        # intervals.
+        for thread_id, thread_events in threads:
+            thread_events_ = sorted(
+                thread_events,
+                key=lambda event: [event.time_range.start, -event.time_range.end],
+            )
+            current_events: List[FunctionEvent] = []
+            cur_end = 0
+            for event in thread_events_:
+                while len(current_events) > 0:
+                    parent = current_events[-1]
+                    if (
+                        event.time_range.start >= parent.time_range.end
+                        or event.time_range.end > parent.time_range.end
+                    ):
+                        # this can't be a parent
+                        current_events.pop()
+                    else:
+                        parent.append_cpu_child(event)
+                        assert (
+                            event.cpu_parent is None
+                        ), f"There is already a CPU parent event for {event.key}"
+                        event.set_cpu_parent(parent)
+                        break
+
+                current_events.append(event)
+
+    def _set_backward_stacktraces(self):
+        def bw_parent(evt):
+            if evt is None:
+                return None
+            elif evt.scope == 1:  # BACKWARD_FUNCTION
+                return evt
+            else:
+                return bw_parent(evt.cpu_parent)
+
+        fwd_stacks = {}
+        for evt in self:
+            if bw_parent(evt) is None and evt.stack is not None:
+                t = (evt.sequence_nr, evt.thread)
+                if t not in fwd_stacks:
+                    fwd_stacks[t] = evt.stack
+
+        for evt in self:
+            p = bw_parent(evt)
+            if p is not None:
+                assert p.fwd_thread is not None
+                t = (p.sequence_nr, p.fwd_thread)
+                if t in fwd_stacks:
+                    evt.stack = fwd_stacks[t]
+                else:
+                    evt.stack = []
+
+    @property
+    def self_cpu_time_total(self):
+        return sum([event.self_cpu_time_total for event in self])
+
+    def table(
+        self,
+        sort_by=None,
+        row_limit=100,
+        max_src_column_width=75,
+        max_name_column_width=55,
+        max_shapes_column_width=80,
+        header=None,
+        top_level_events_only=False,
+    ):
+        """Print an EventList as a nicely formatted table.
+
+        Args:
+            sort_by (str, optional): Attribute used to sort entries. By default
+                they are printed in the same order as they were registered.
+                Valid keys include: ``cpu_time``, ``cuda_time``, ``cpu_time_total``,
+                ``cuda_time_total``, ``cpu_memory_usage``, ``cuda_memory_usage``,
+                ``self_cpu_memory_usage``, ``self_cuda_memory_usage``, ``count``.
+            top_level_events_only(bool, optional): Boolean flag to determine the
+                selection of events to display. If true, the profiler will only
+                display top-level events, such as the top-level invocation of python
+                `lstm`, python `add` or other functions; nested events such as low-level
+                cpu/cuda ops are omitted to keep the profiler output readable.
+
+        Returns:
+            A string containing the table.
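+
+        Example (an illustrative sketch; ``events`` is assumed to be a populated
+        EventList from a finished profiling run):
+            >>> # xdoctest: +SKIP("illustrative sketch")
+            >>> print(events.table(sort_by="cpu_time_total", row_limit=10))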
+        """
+        return _build_table(
+            self,
+            sort_by=sort_by,
+            row_limit=row_limit,
+            max_src_column_width=max_src_column_width,
+            max_name_column_width=max_name_column_width,
+            max_shapes_column_width=max_shapes_column_width,
+            header=header,
+            profile_memory=self._profile_memory,
+            with_flops=self._with_flops,
+            top_level_events_only=top_level_events_only,
+        )
+
+    def export_chrome_trace(self, path):
+        """Export an EventList as a Chrome tracing tools file.
+
+        The checkpoint can be later loaded and inspected under ``chrome://tracing`` URL.
+
+        Args:
+            path (str): Path where the trace will be written.
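+
+        Example (an illustrative sketch; the EventList is typically reached through
+        a finished ``torch.autograd.profiler.profile`` run):
+            >>> # xdoctest: +SKIP("illustrative sketch")
+            >>> with torch.autograd.profiler.profile() as prof:
+            ...     torch.ones(2, 2).add_(1)
+            >>> prof.export_chrome_trace("trace.json")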
+        """
+        import os
+
+        device_name = "cuda" if not self._use_device else self._use_device
+        with open(path, "w") as f:
+            chrome_events = []
+            next_id = 0
+            # Use direct file IO instead of json.dump, since JSON dumping is very slow
+            # and this technique is proven to give a 4x speedup.
+            f.write("[")
+            for evt in self:
+                if evt.trace_name is None:
+                    continue
+                f.write(
+                    '{{"name": "{}", '
+                    '"ph": "X", '
+                    '"ts": {}, '
+                    '"dur": {}, '
+                    '"tid": {}, '
+                    '"pid": "CPU functions", '
+                    '"args": {{}}}}, '.format(
+                        evt.trace_name,
+                        evt.time_range.start,
+                        evt.time_range.elapsed_us(),
+                        evt.thread
+                        if not evt.is_remote
+                        else f'" node_id:{evt.node_id}, thread_id:{evt.thread} "',
+                    )
+                )
+                for k in evt.kernels:
+                    # 's' and 'f' draw Flow arrows from
+                    # the CPU launch to the GPU kernel
+                    f.write(
+                        f'{{"name": "{evt.trace_name}", '
+                        '"ph": "s", '
+                        f'"ts": {evt.time_range.start}, '
+                        f'"tid": {evt.thread}, '
+                        '"pid": "CPU functions", '
+                        f'"id": {next_id}, '
+                        f'"cat": "cpu_to_{device_name}", '
+                        '"args": {}}, '
+                    )
+                    # Note: use torch.profiler to get device kernel trace
+                    next_id += 1
+            if len(self) > 0:
+                # remove trailing whitespace and comma
+                f.seek(f.tell() - 2, os.SEEK_SET)
+                f.truncate()
+            f.write("]")
+
+    def supported_export_stacks_metrics(self):
+        return [
+            "self_cpu_time_total",
+            "self_cuda_time_total",
+            "self_privateuse1_time_total",
+        ]
+
+    def export_stacks(self, path: str, metric: str):
+        if metric not in self.supported_export_stacks_metrics():
+            raise ValueError(
+                "metric should be one of: "
+                + str(self.supported_export_stacks_metrics())
+            )
+        translate_table = str.maketrans(" ;\t\n", "____")
+        with open(path, "w") as f:
+            for evt in self:
+                if evt.stack and len(evt.stack) > 0:
+                    metric_value = getattr(evt, metric)
+                    if int(metric_value) > 0:
+                        stack_str = ""
+                        for entry in reversed(evt.stack):
+                            stack_str += entry.translate(translate_table)
+                            stack_str += ";"
+                        stack_str = stack_str[:-1] + " " + str(int(metric_value))
+                        f.write(stack_str + "\n")
+
+    def key_averages(self, group_by_input_shapes=False, group_by_stack_n=0):
+        """Averages all function events over their keys.
+
+        Args:
+            group_by_input_shapes: group entries by
+                (event name, input shapes) rather than just event name.
+                This is useful to see which input shapes contribute to the runtime
+                the most and may help with size-specific optimizations or
+                choosing the best candidates for quantization (aka fitting a roof line)
+
+            group_by_stack_n: group by top n stack trace entries
+
+        Returns:
+            An EventList containing FunctionEventAvg objects.
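+
+        Example (an illustrative sketch; ``events`` is assumed to be an EventList
+        whose call tree has already been built):
+            >>> # xdoctest: +SKIP("illustrative sketch")
+            >>> averages = events.key_averages(group_by_input_shapes=True)
+            >>> print(averages.table(sort_by="cpu_time_total"))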
+        """
+        assert self._tree_built
+        stats: Dict[Tuple[str, ...], FunctionEventAvg] = defaultdict(FunctionEventAvg)
+
+        def get_key(event, group_by_input_shapes, group_by_stack_n) -> Tuple[str, ...]:
+            key = [
+                str(event.key),
+                str(event.node_id),
+                str(event.device_type),
+                str(event.is_legacy),
+            ]
+            if group_by_input_shapes:
+                key.append(str(event.input_shapes))
+            if group_by_stack_n > 0:
+                key += event.stack[:group_by_stack_n]
+            return tuple(key)
+
+        for evt in self:
+            stats[get_key(evt, group_by_input_shapes, group_by_stack_n)].add(evt)
+
+        avg_list = EventList(
+            stats.values(),
+            use_cuda=self._use_cuda,
+            use_device=self._use_device,
+            profile_memory=self._profile_memory,
+            with_flops=self._with_flops,
+        )
+        for evt in avg_list:
+            evt.stack = evt.stack[:group_by_stack_n]
+            if not group_by_input_shapes:
+                evt.input_shapes = ""
+        return avg_list
+
+    def total_average(self):
+        """Averages all events.
+
+        Returns:
+            A FunctionEventAvg object.
+        """
+        total_stat = FunctionEventAvg()
+        for evt in self:
+            total_stat += evt
+            total_stat.key = None
+        total_stat.key = "Total"
+        return total_stat
+
+
+def _format_time(time_us):
+    """Define how to format time in FunctionEvent."""
+    US_IN_SECOND = 1000.0 * 1000.0
+    US_IN_MS = 1000.0
+    if time_us >= US_IN_SECOND:
+        return f"{time_us / US_IN_SECOND:.3f}s"
+    if time_us >= US_IN_MS:
+        return f"{time_us / US_IN_MS:.3f}ms"
+    return f"{time_us:.3f}us"
+
+
+def _format_time_share(time_us, total_time_us):
+    """Define how to format time in FunctionEvent."""
+    if total_time_us == 0:
+        assert time_us == 0, f"Expected time_us == 0 but got {time_us}"
+        return "NaN"
+    return f"{time_us * 100.0 / total_time_us:.2f}%"
+
+
+def _format_memory(nbytes):
+    """Return a formatted memory size string."""
+    KB = 1024
+    MB = 1024 * KB
+    GB = 1024 * MB
+    if abs(nbytes) >= GB:
+        return f"{nbytes * 1.0 / GB:.2f} Gb"
+    elif abs(nbytes) >= MB:
+        return f"{nbytes * 1.0 / MB:.2f} Mb"
+    elif abs(nbytes) >= KB:
+        return f"{nbytes * 1.0 / KB:.2f} Kb"
+    else:
+        return str(nbytes) + " b"
+
+
+def _attr_formatter(name):
+    return property(lambda self: _format_time(getattr(self, name)))
+
+
+class FormattedTimesMixin:
+    """Helpers for FunctionEvent and FunctionEventAvg.
+
+    The subclass should define `*_time_total` and `count` attributes.
+    """
+
+    cpu_time_str = _attr_formatter("cpu_time")
+    cuda_time_str = _attr_formatter("cuda_time")
+    privateuse1_time_str = _attr_formatter("privateuse1_time")
+    cpu_time_total_str = _attr_formatter("cpu_time_total")
+    cuda_time_total_str = _attr_formatter("cuda_time_total")
+    privateuse1_time_total_str = _attr_formatter("privateuse1_time_total")
+    self_cpu_time_total_str = _attr_formatter("self_cpu_time_total")
+    self_cuda_time_total_str = _attr_formatter("self_cuda_time_total")
+    self_privateuse1_time_total_str = _attr_formatter("self_privateuse1_time_total")
+
+    @property
+    def cpu_time(self):
+        return 0.0 if self.count == 0 else 1.0 * self.cpu_time_total / self.count  # type: ignore[attr-defined]
+
+    @property
+    def cuda_time(self):
+        return 0.0 if self.count == 0 else 1.0 * self.cuda_time_total / self.count  # type: ignore[attr-defined]
+
+    @property
+    def privateuse1_time(self):
+        return 0.0 if self.count == 0 else 1.0 * self.privateuse1_time_total / self.count  # type: ignore[attr-defined]
+
+
+class Interval:
+    def __init__(self, start, end):
+        self.start = start
+        self.end = end
+
+    def elapsed_us(self):
+        r"""
+        Returns the length of the interval
+        """
+        return self.end - self.start
+
+
+Kernel = namedtuple("Kernel", ["name", "device", "duration"])
+
+
+class FunctionEvent(FormattedTimesMixin):
+    """Profiling information about a single function."""
+
+    def __init__(
+        self,
+        id,
+        name,
+        thread,
+        start_us,
+        end_us,
+        fwd_thread=None,
+        input_shapes=None,
+        stack=None,
+        scope=0,
+        use_device=None,
+        cpu_memory_usage=0,
+        cuda_memory_usage=0,
+        privateuse1_memory_usage=0,
+        is_async=False,
+        is_remote=False,
+        sequence_nr=-1,
+        node_id=-1,
+        device_type=DeviceType.CPU,
+        device_index=0,
+        is_legacy=False,
+        flops=None,
+        trace_name=None,
+        concrete_inputs=None,
+    ):
+        self.id: int = id
+        self.node_id: int = node_id
+        self.name: str = name
+        self.trace_name: str = trace_name
+        self.time_range: Interval = Interval(start_us, end_us)
+        self.thread: int = thread
+        self.fwd_thread: Optional[int] = fwd_thread
+        self.kernels: List[Kernel] = []
+        self.count: int = 1
+        self.cpu_children: List[FunctionEvent] = []
+        self.cpu_parent: Optional[FunctionEvent] = None
+        self.input_shapes: Tuple[int, ...] = input_shapes
+        self.concrete_inputs: List[Any] = concrete_inputs
+        self.stack: List = stack
+        self.scope: int = scope
+        self.use_device: Optional[str] = use_device
+        self.cpu_memory_usage: int = cpu_memory_usage
+        self.cuda_memory_usage: int = cuda_memory_usage
+        self.privateuse1_memory_usage: int = privateuse1_memory_usage
+        self.is_async: bool = is_async
+        self.is_remote: bool = is_remote
+        self.sequence_nr: int = sequence_nr
+        self.device_type: DeviceType = device_type
+        self.device_index: int = device_index
+        self.is_legacy: bool = is_legacy
+        self.flops: Optional[int] = flops
+
+    def append_kernel(self, name, device, duration):
+        assert self.device_type == DeviceType.CPU
+        self.kernels.append(Kernel(name, device, duration))
+
+    def append_cpu_child(self, child):
+        """Append a CPU child of type FunctionEvent.
+
+        One is supposed to append only direct children to the event so that
+        the self cpu time is reported correctly.
+        """
+        assert self.device_type == DeviceType.CPU
+        assert isinstance(child, FunctionEvent)
+        assert child.device_type == DeviceType.CPU
+        self.cpu_children.append(child)
+
+    def set_cpu_parent(self, parent):
+        """Set the immediate CPU parent of type FunctionEvent.
+
+        One profiling FunctionEvent should have only one CPU parent such that
+        the child's range interval is completely inside the parent's. We use
+        this connection to determine whether the event is from a top-level op or not.
+        """
+        assert self.device_type == DeviceType.CPU
+        assert isinstance(parent, FunctionEvent)
+        assert parent.device_type == DeviceType.CPU
+        self.cpu_parent = parent
+
+    # Note: async events don't have children, are not used when computing 'self'
+    # metrics of other events, and have only total cpu time
+    @property
+    def self_cpu_memory_usage(self):
+        if self.is_async or self.device_type != DeviceType.CPU:
+            return 0
+        return self.cpu_memory_usage - sum(
+            [child.cpu_memory_usage for child in self.cpu_children]
+        )
+
+    @property
+    def self_cuda_memory_usage(self):
+        if self.is_async or self.device_type != DeviceType.CPU:
+            return 0
+        return self.cuda_memory_usage - sum(
+            [child.cuda_memory_usage for child in self.cpu_children]
+        )
+
+    @property
+    def self_privateuse1_memory_usage(self):
+        if self.is_async or self.device_type != DeviceType.CPU:
+            return 0
+        return self.privateuse1_memory_usage - sum(
+            [child.privateuse1_memory_usage for child in self.cpu_children]
+        )
+
+    @property
+    def self_cpu_time_total(self):
+        if self.is_async or self.device_type != DeviceType.CPU:
+            return 0
+        return self.cpu_time_total - sum(
+            [child.cpu_time_total for child in self.cpu_children]
+        )
+
+    @property
+    def cuda_time_total(self):
+        if self.is_async or self.use_device:
+            return 0
+        if self.device_type == DeviceType.CPU:
+            if not self.is_legacy:
+                # account for the kernels in the children ops
+                return sum(kinfo.duration for kinfo in self.kernels) + sum(
+                    ch.cuda_time_total for ch in self.cpu_children
+                )
+            else:
+                # each legacy cpu event has a single (fake) kernel
+                return sum(kinfo.duration for kinfo in self.kernels)
+        else:
+            assert self.device_type == DeviceType.CUDA
+            return self.time_range.elapsed_us()
+
+    @property
+    def self_cuda_time_total(self):
+        if self.is_async or self.use_device:
+            return 0
+        if self.device_type == DeviceType.CPU:
+            return self.cuda_time_total - sum(
+                [child.cuda_time_total for child in self.cpu_children]
+            )
+        else:
+            assert self.device_type == DeviceType.CUDA
+            return self.cuda_time_total
+
+    @property
+    def cpu_time_total(self):
+        if self.device_type == DeviceType.CPU:
+            return self.time_range.elapsed_us()
+        else:
+            return 0
+
+    @property
+    def self_privateuse1_time_total(self):
+        if self.is_async or not self.use_device:
+            return 0
+        if self.device_type == DeviceType.CPU:
+            return self.privateuse1_time_total - sum(
+                [child.privateuse1_time_total for child in self.cpu_children]
+            )
+        else:
+            assert self.device_type == DeviceType.CUDA
+            return self.privateuse1_time_total
+
+    @property
+    def privateuse1_time_total(self):
+        if self.is_async or not self.use_device:
+            return 0
+        if self.device_type == DeviceType.CPU:
+            if not self.is_legacy:
+                # account for the kernels in the children ops
+                return sum(kinfo.duration for kinfo in self.kernels) + sum(
+                    ch.privateuse1_time_total for ch in self.cpu_children
+                )
+            else:
+                # each legacy cpu event has a single (fake) kernel
+                return sum(kinfo.duration for kinfo in self.kernels)
+        else:
+            assert self.device_type == DeviceType.PrivateUse1
+            return self.time_range.elapsed_us()
+
+    @property
+    def key(self):
+        return self.name
+
+    def __repr__(self):
+        device_name = "cuda" if not self.use_device else self.use_device
+        device_time = (
+            self.cuda_time_str if not self.use_device else self.privateuse1_time_str
+        )
+        device_memory_usage = (
+            self.cuda_memory_usage
+            if not self.use_device
+            else self.privateuse1_memory_usage
+        )
+        return (
+            "".format(
+                self.id,
+                self.name,
+                self.device_type,
+                self.node_id,
+                self.cpu_time_str,
+                self.time_range.start,
+                self.time_range.end,
+                str([child.id for child in self.cpu_children]),
+                device_name,
+                device_time,
+                self.name,
+                self.thread,
+                str(self.input_shapes),
+                self.cpu_memory_usage,
+                device_name,
+                device_memory_usage,
+                self.is_async,
+                self.is_remote,
+                self.sequence_nr,
+                self.is_legacy,
+            )
+        )
+
+
+class FunctionEventAvg(FormattedTimesMixin):
+    """Used to average stats over multiple FunctionEvent objects."""
+
+    def __init__(self):
+        self.key: Optional[str] = None
+        self.count: int = 0
+        self.node_id: int = 0
+        self.is_async: bool = False
+        self.is_remote: bool = False
+        self.use_device: Optional[str] = None
+        self.cpu_time_total: int = 0
+        self.cuda_time_total: int = 0
+        self.privateuse1_time_total: int = 0
+        self.self_cpu_time_total: int = 0
+        self.self_cuda_time_total: int = 0
+        self.self_privateuse1_time_total: int = 0
+        self.input_shapes: Optional[List[List[int]]] = None
+        self.stack: Optional[List] = None
+        self.scope: Optional[int] = None
+        self.cpu_memory_usage: int = 0
+        self.cuda_memory_usage: int = 0
+        self.privateuse1_memory_usage: int = 0
+        self.self_cpu_memory_usage: int = 0
+        self.self_cuda_memory_usage: int = 0
+        self.self_privateuse1_memory_usage: int = 0
+        self.cpu_children: Optional[List[FunctionEvent]] = None
+        self.cpu_parent: Optional[FunctionEvent] = None
+        self.device_type: DeviceType = DeviceType.CPU
+        self.is_legacy: bool = False
+        self.flops: int = 0
+
+    def add(self, other):
+        if self.key is None:
+            # First function being recorded as part of FunctionEventAvg, propagate
+            # fields.
+            self.key = other.key
+            self.node_id = other.node_id
+            self.is_async = other.is_async
+            self.is_remote = other.is_remote
+            self.cpu_parent = other.cpu_parent
+            self.cpu_children = other.cpu_children
+
+            self.input_shapes = other.input_shapes
+            self.stack = other.stack
+            self.scope = other.scope
+            self.device_type = other.device_type
+            self.is_legacy = other.is_legacy
+            self.use_device = other.use_device
+
+        assert isinstance(other, (FunctionEvent, FunctionEventAvg))
+        assert other.key == self.key
+        self.cpu_time_total += other.cpu_time_total
+        self.cuda_time_total += other.cuda_time_total
+        self.privateuse1_time_total += other.privateuse1_time_total
+        self.self_cpu_time_total += other.self_cpu_time_total
+        self.self_cuda_time_total += other.self_cuda_time_total
+        self.self_privateuse1_time_total += other.self_privateuse1_time_total
+        self.cpu_memory_usage += other.cpu_memory_usage
+        self.cuda_memory_usage += other.cuda_memory_usage
+        self.privateuse1_memory_usage += other.privateuse1_memory_usage
+        self.self_cpu_memory_usage += other.self_cpu_memory_usage
+        self.self_cuda_memory_usage += other.self_cuda_memory_usage
+        self.self_privateuse1_memory_usage += other.self_privateuse1_memory_usage
+        self.count += other.count
+        if self.flops is None:
+            self.flops = other.flops
+        elif other.flops is not None:
+            self.flops += other.flops
+        return self
+
+    def __iadd__(self, other):
+        return self.add(other)
+
+    def __repr__(self):
+        device_name = "cuda" if not self.use_device else self.use_device
+        self_device_time = (
+            self.self_cuda_time_total_str
+            if not self.use_device
+            else self.self_privateuse1_time_total_str
+        )
+        device_time = (
+            self.cuda_time_str if not self.use_device else self.privateuse1_time_str
+        )
+        device_memory = (
+            self.cuda_memory_usage
+            if not self.use_device
+            else self.privateuse1_memory_usage
+        )
+        return (
+            "".format(
+                self.key,
+                self.self_cpu_time_total_str,
+                self.cpu_time_str,
+                device_name,
+                self_device_time,
+                device_name,
+                device_time,
+                str(self.input_shapes),
+                self.cpu_memory_usage,
+                device_name,
+                device_memory,
+            )
+        )
+
+
+class StringTable(defaultdict):
+    def __missing__(self, key):
+        # manage cases like 't' (demangled to 'unsigned short') separately,
+        # for now simply check the length to avoid unexpected results for
+        # the short sequences
+        self[key] = torch._C._demangle(key) if len(key) > 1 else key
+        return self[key]
+
+
+class MemRecordsAcc:
+    """Acceleration structure for accessing mem_records in interval."""
+
+    def __init__(self, mem_records):
+        self._mem_records = mem_records
+        self._start_uses: List[int] = []
+        self._indices: List[int] = []
+        if len(mem_records) > 0:
+            tmp = sorted([(r[0].start_us(), i) for i, r in enumerate(mem_records)])
+            self._start_uses, self._indices = zip(*tmp)  # type: ignore[assignment]
+
+    def in_interval(self, start_us, end_us):
+        r"""
+        Return all records in the given interval
+        """
+        start_idx = bisect.bisect_left(self._start_uses, start_us)
+        end_idx = bisect.bisect_right(self._start_uses, end_us)
+        for i in range(start_idx, end_idx):
+            yield self._mem_records[self._indices[i]]
+
+
+def _filter_stack_entry(entry):
+    filtered_entries = [
+        ("autograd/__init__", "_make_grads"),
+        ("autograd/__init__", "backward"),
+        ("torch/tensor", "backward"),
+        ("_internal/common_utils", "prof_callable"),
+        ("_internal/common_utils", "prof_func_call"),
+        ("_internal/common_utils", "prof_meth_call"),
+    ]
+    return all(not (f[0] in entry and f[1] in entry) for f in filtered_entries)
+
+
+MEMORY_EVENT_NAME = "[memory]"
+OUT_OF_MEMORY_EVENT_NAME = "[OutOfMemory]"
+
+
+def _filter_name(name):
+    # ignoring the following utility ops
+    filtered_out_names = [
+        MEMORY_EVENT_NAME,  # used only for the top-level memory events
+        OUT_OF_MEMORY_EVENT_NAME,
+        "profiler::_record_function_enter",
+        "profiler::_record_function_enter_new",
+        "profiler::_record_function_exit",
+        "aten::is_leaf",
+        "aten::output_nr",
+        "aten::_version",
+    ]
+    return name in filtered_out_names
+
+
+# Demangles and optionally rewrites the provided event name,
+# with_wildcard - whether to replace certain numbered event names
+# with a wildcard name to aggregate them together in the profiler table
+# output
+def _rewrite_name(name, with_wildcard=False):
+    string_table = StringTable()
+    name = string_table[name]
+    if with_wildcard:
+        if name.startswith("ProfilerStep#"):
+            name = "ProfilerStep*"
+    return name
+
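+
+# Worked example (illustrative, not part of the vendored file): with the
+# wildcard enabled, numbered profiler steps collapse into one aggregated name.
+def _example_rewrite_name():
+    assert _rewrite_name("ProfilerStep#12", with_wildcard=True) == "ProfilerStep*"
+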
+
+def _build_table(
+    events,
+    sort_by=None,
+    header=None,
+    row_limit=100,
+    max_src_column_width=75,
+    max_name_column_width=55,
+    max_shapes_column_width=80,
+    with_flops=False,
+    profile_memory=False,
+    top_level_events_only=False,
+):
+    """Print a summary of events (which can be a list of FunctionEvent or FunctionEventAvg)."""
+    if len(events) == 0:
+        return ""
+
+    has_cuda_time = any(event.self_cuda_time_total > 0 for event in events)
+    has_cuda_mem = any(event.self_cuda_memory_usage > 0 for event in events)
+    has_privateuse1_time = any(
+        event.self_privateuse1_time_total > 0 for event in events
+    )
+    has_privateuse1_mem = any(
+        event.self_privateuse1_memory_usage > 0 for event in events
+    )
+    use_device = events[0].use_device
+    if not use_device and (has_privateuse1_mem or has_privateuse1_time):
+        raise RuntimeError(
+            "use_device is None, but there is private device performance data."
+        )
+
+    has_input_shapes = any(
+        (event.input_shapes is not None and len(event.input_shapes) > 0)
+        for event in events
+    )
+
+    if sort_by is not None:
+        events = EventList(
+            sorted(events, key=lambda evt: getattr(evt, sort_by), reverse=True),
+            use_cuda=has_cuda_time,
+            use_device=use_device,
+            profile_memory=profile_memory,
+            with_flops=with_flops,
+        )
+
+    name_column_width = max([len(evt.key) for evt in events]) + 4
+    if max_name_column_width is not None:
+        name_column_width = min(name_column_width, max_name_column_width)
+
+    shapes_column_width = max([len(str(evt.input_shapes)) for evt in events]) + 4
+    if max_shapes_column_width is not None:
+        shapes_column_width = min(shapes_column_width, max_shapes_column_width)
+
+    DEFAULT_COLUMN_WIDTH = 12
+    flops_column_width = DEFAULT_COLUMN_WIDTH
+
+    src_column_width = None
+    stacks = []
+    for evt in events:
+        if evt.stack is not None and len(evt.stack) > 0:
+            stacks.append(evt.stack)
+    has_stack = len(stacks) > 0
+    if has_stack:
+        src_column_width = (
+            max([max([len(entry) for entry in stack]) for stack in stacks]) + 4
+        )
+        if max_src_column_width is not None:
+            src_column_width = min(src_column_width, max_src_column_width)
+
+    headers = [
+        "Name",
+        "Self CPU %",
+        "Self CPU",
+        "CPU total %",
+        "CPU total",
+        "CPU time avg",
+    ]
+    if has_cuda_time:
+        headers.extend(
+            [
+                "Self CUDA",
+                "Self CUDA %",
+                "CUDA total",
+                "CUDA time avg",
+            ]
+        )
+    if has_privateuse1_time:
+        privateuse1 = use_device.upper()
+        headers.extend(
+            [
+                f"Self {privateuse1}",
+                f"Self {privateuse1} %",
+                f"{privateuse1} total",
+                f"{privateuse1} time avg",
+            ]
+        )
+    if profile_memory:
+        headers.extend(
+            [
+                "CPU Mem",
+                "Self CPU Mem",
+            ]
+        )
+        if has_cuda_mem:
+            headers.extend(
+                [
+                    "CUDA Mem",
+                    "Self CUDA Mem",
+                ]
+            )
+        if has_privateuse1_mem:
+            privateuse1 = use_device.upper()
+            headers.extend(
+                [
+                    f"{privateuse1} Mem",
+                    f"Self {privateuse1} Mem",
+                ]
+            )
+    headers.append("# of Calls")
+    # Only append Node ID if any event has a valid (>= 0) Node ID
+    append_node_id = any(evt.node_id != -1 for evt in events)
+    if append_node_id:
+        headers.append("Node ID")
+
+    # Build the row format, separator and line length incrementally; keep them
+    # in one-element lists so the nested add_column helper can mutate them.
+    SPACING_SIZE = 2
+    row_format_lst = [""]
+    header_sep_lst = [""]
+    line_length_lst = [-SPACING_SIZE]
+    MAX_STACK_ENTRY = 5
+
+    def add_column(padding, text_dir=">"):
+        row_format_lst[0] += (
+            "{: " + text_dir + str(padding) + "}" + (" " * SPACING_SIZE)
+        )
+        header_sep_lst[0] += "-" * padding + (" " * SPACING_SIZE)
+        line_length_lst[0] += padding + SPACING_SIZE
+
+    def auto_scale_flops(flops):
+        flop_headers = [
+            "FLOPs",
+            "KFLOPs",
+            "MFLOPs",
+            "GFLOPs",
+            "TFLOPs",
+            "PFLOPs",
+        ]
+        assert flops > 0
+        log_flops = max(0, min(math.log10(flops) / 3, float(len(flop_headers) - 1)))
+        assert log_flops >= 0 and log_flops < len(flop_headers)
+        return (pow(10, (math.floor(log_flops) * -3.0)), flop_headers[int(log_flops)])
+
+    add_column(name_column_width)
+    for _ in headers[1:]:
+        add_column(DEFAULT_COLUMN_WIDTH)
+
+    if has_input_shapes:
+        headers.append("Input Shapes")
+        add_column(shapes_column_width)
+
+    if has_stack:
+        headers.append("Source Location")
+        add_column(src_column_width, text_dir="<")
+
+    if with_flops:
+        # Auto-scaling of flops header
+        raw_flops = []
+        for evt in events:
+            if evt.flops > 0:
+                raw_flops.append(evt.flops)
+        if len(raw_flops) != 0:
+            (flops_scale, flops_header) = auto_scale_flops(min(raw_flops))
+            headers.append(f"Total {flops_header}")
+            add_column(flops_column_width)
+        else:
+            with_flops = False  # can't find any valid flops
+
+    row_format = row_format_lst[0]
+    header_sep = header_sep_lst[0]
+    line_length = line_length_lst[0]
+    add_column = None  # type: ignore[assignment]
+
+    # Accumulate the output lines in a list and join them at the end.
+    result = []
+
+    def append(s):
+        result.append(s)
+        result.append("\n")  # Yes, newline after the end as well
+
+    sum_self_cpu_time_total = sum([event.self_cpu_time_total for event in events])
+    sum_self_cuda_time_total = 0
+    sum_self_privateuse1_time_total = 0
+    for evt in events:
+        if evt.device_type == DeviceType.CPU:
+            # in legacy profiler, kernel info is stored in cpu events
+            if evt.is_legacy:
+                if not use_device:
+                    sum_self_cuda_time_total += evt.self_cuda_time_total
+                else:
+                    sum_self_privateuse1_time_total += evt.self_privateuse1_time_total
+        elif evt.device_type == DeviceType.CUDA:
+            # in kineto profiler, there are events with the correct device type (e.g. CUDA)
+            sum_self_cuda_time_total += evt.self_cuda_time_total
+        elif evt.device_type == DeviceType.PrivateUse1:
+            sum_self_privateuse1_time_total += evt.self_privateuse1_time_total
+
+    # Actual printing
+    if header is not None:
+        append("=" * line_length)
+        append(header)
+    if top_level_events_only:
+        append("=" * line_length)
+        append("This report only display top-level ops statistics")
+    append(header_sep)
+    append(row_format.format(*headers))
+
+    append(header_sep)
+
+    def trim_path(path, src_column_width):
+        if len(path) > src_column_width:
+            offset = len(path) - src_column_width
+            path = path[offset:]
+            if len(path) > 3:
+                path = "..." + path[3:]
+        return path
+
+    event_limit = 0
+    for evt in events:
+        if event_limit == row_limit:
+            break
+        if top_level_events_only and evt.cpu_parent is not None:
+            continue
+        else:
+            event_limit += 1
+        name = evt.key
+        if max_name_column_width is not None and len(name) >= max_name_column_width - 3:
+            name = name[: (max_name_column_width - 3)] + "..."
+        row_values = [
+            name,
+            # Self CPU total %, 0 for async events.
+            _format_time_share(evt.self_cpu_time_total, sum_self_cpu_time_total),
+            evt.self_cpu_time_total_str,  # Self CPU total
+            # CPU total %, 0 for async events.
+            _format_time_share(evt.cpu_time_total, sum_self_cpu_time_total)
+            if not evt.is_async
+            else 0,
+            evt.cpu_time_total_str,  # CPU total
+            evt.cpu_time_str,  # CPU time avg
+        ]
+        if has_cuda_time:
+            row_values.extend(
+                [
+                    evt.self_cuda_time_total_str,
+                    # CUDA time total %
+                    _format_time_share(
+                        evt.self_cuda_time_total, sum_self_cuda_time_total
+                    ),
+                    evt.cuda_time_total_str,
+                    evt.cuda_time_str,  # Cuda time avg
+                ]
+            )
+        if has_privateuse1_time:
+            row_values.extend(
+                [
+                    evt.self_privateuse1_time_total_str,
+                    # PrivateUse1 time total %
+                    _format_time_share(
+                        evt.self_privateuse1_time_total, sum_self_privateuse1_time_total
+                    ),
+                    evt.privateuse1_time_total_str,
+                    evt.privateuse1_time_str,  # PrivateUse1 time avg
+                ]
+            )
+        if profile_memory:
+            row_values.extend(
+                [
+                    # CPU Mem Total
+                    _format_memory(evt.cpu_memory_usage),
+                    # Self CPU Mem Total
+                    _format_memory(evt.self_cpu_memory_usage),
+                ]
+            )
+            if has_cuda_mem:
+                row_values.extend(
+                    [
+                        # CUDA Mem Total
+                        _format_memory(evt.cuda_memory_usage),
+                        # Self CUDA Mem Total
+                        _format_memory(evt.self_cuda_memory_usage),
+                    ]
+                )
+            if has_privateuse1_mem:
+                row_values.extend(
+                    [
+                        # PrivateUse1 Mem Total
+                        _format_memory(evt.privateuse1_memory_usage),
+                        # Self PrivateUse1 Mem Total
+                        _format_memory(evt.self_privateuse1_memory_usage),
+                    ]
+                )
+        row_values.append(
+            evt.count,  # Number of calls
+        )
+
+        if append_node_id:
+            row_values.append(evt.node_id)
+        if has_input_shapes:
+            row_values.append(str(evt.input_shapes)[:shapes_column_width])
+        if with_flops:
+            if evt.flops <= 0:
+                row_values.append("--")
+            else:
+                row_values.append(f"{evt.flops * flops_scale:8.3f}")  # type: ignore[possibly-undefined]
+        if has_stack:
+            src_field = ""
+            if len(evt.stack) > 0:
+                src_field = trim_path(evt.stack[0], src_column_width)
+            row_values.append(src_field)
+        append(row_format.format(*row_values))
+
+        if has_stack:
+            empty_headers = [""] * (len(headers) - 1)
+            for entry in evt.stack[1:MAX_STACK_ENTRY]:
+                append(
+                    row_format.format(
+                        *(empty_headers + [trim_path(entry, src_column_width)])
+                    )
+                )
+            empty_headers.append("")
+            append(row_format.format(*empty_headers))
+
+    append(header_sep)
+    append(f"Self CPU time total: {_format_time(sum_self_cpu_time_total)}")
+    if has_cuda_time:
+        append(f"Self CUDA time total: {_format_time(sum_self_cuda_time_total)}")
+    if has_privateuse1_time:
+        append(
+            f"Self {use_device.upper()} time total: {_format_time(sum_self_privateuse1_time_total)}"
+        )
+    return "".join(result)
diff --git a/MLPY/Lib/site-packages/torch/autograd/variable.py b/MLPY/Lib/site-packages/torch/autograd/variable.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6c74819c79090f1926cc83a0b15e2aff322e304
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/autograd/variable.py
@@ -0,0 +1,14 @@
+import torch
+from torch._C import _ImperativeEngine as ImperativeEngine
+
+
+__all__ = ["VariableMeta", "Variable"]
+
+
+class VariableMeta(type):
+    def __instancecheck__(cls, other):
+        return isinstance(other, torch.Tensor)
+
+
+class Variable(torch._C._LegacyVariableBase, metaclass=VariableMeta):  # type: ignore[misc]
+    _execution_engine = ImperativeEngine()
diff --git a/MLPY/Lib/site-packages/torch/backends/__init__.py b/MLPY/Lib/site-packages/torch/backends/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0fdfe3ff1655ad581775d2761660c0713292fa15
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/backends/__init__.py
@@ -0,0 +1,70 @@
+import types
+from contextlib import contextmanager
+
+# The idea for this parameter is that we forbid bare assignment
+# to torch.backends.<cudnn|mkldnn>.enabled and friends when running our
+# test suite, where it's very easy to forget to undo the change
+# later.
+__allow_nonbracketed_mutation_flag = True
+
+
+def disable_global_flags():
+    global __allow_nonbracketed_mutation_flag
+    __allow_nonbracketed_mutation_flag = False
+
+
+def flags_frozen():
+    return not __allow_nonbracketed_mutation_flag
+
+
+@contextmanager
+def __allow_nonbracketed_mutation():
+    global __allow_nonbracketed_mutation_flag
+    old = __allow_nonbracketed_mutation_flag
+    __allow_nonbracketed_mutation_flag = True
+    try:
+        yield
+    finally:
+        __allow_nonbracketed_mutation_flag = old
+
+
+class ContextProp:
+    def __init__(self, getter, setter):
+        self.getter = getter
+        self.setter = setter
+
+    def __get__(self, obj, objtype):
+        return self.getter()
+
+    def __set__(self, obj, val):
+        if not flags_frozen():
+            self.setter(val)
+        else:
+            raise RuntimeError(
+                "not allowed to set %s flags "
+                "after disable_global_flags; please use flags() context manager instead"
+                % obj.__name__
+            )
+
+
+class PropModule(types.ModuleType):
+    def __init__(self, m, name):
+        super().__init__(name)
+        self.m = m
+
+    def __getattr__(self, attr):
+        return self.m.__getattribute__(attr)
+
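+
+# Illustrative sketch (not part of the vendored file): how a backend submodule
+# typically exposes a flag via ContextProp inside a PropModule subclass, so
+# that bare assignment is rejected once disable_global_flags() has run.  The
+# getter/setter pair and module names here are hypothetical.
+def _example_prop_module(real_module, name, getter, setter):
+    class _ExampleBackendModule(PropModule):
+        enabled = ContextProp(getter, setter)
+
+    return _ExampleBackendModule(real_module, name)
+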
+
+from torch.backends import (
+    cpu as cpu,
+    cuda as cuda,
+    cudnn as cudnn,
+    mha as mha,
+    mkl as mkl,
+    mkldnn as mkldnn,
+    mps as mps,
+    nnpack as nnpack,
+    openmp as openmp,
+    quantized as quantized,
+)
diff --git a/MLPY/Lib/site-packages/torch/backends/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/backends/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0e579111fd3cd5dcdf8e5ce62e8ca0f211348556
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/backends/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/backends/_coreml/__init__.py b/MLPY/Lib/site-packages/torch/backends/_coreml/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/backends/_coreml/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/backends/_coreml/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c6e7be639b24e1913a24176f8d87ad6d9f7b1e30
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/backends/_coreml/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/backends/_coreml/__pycache__/preprocess.cpython-39.pyc b/MLPY/Lib/site-packages/torch/backends/_coreml/__pycache__/preprocess.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e1d080d40419b6f4a6b9f878bb2fbbe670a85748
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/backends/_coreml/__pycache__/preprocess.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/backends/_coreml/preprocess.py b/MLPY/Lib/site-packages/torch/backends/_coreml/preprocess.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4ab3d64edcf1b8e69ef00ae754a587c31aaf69c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/backends/_coreml/preprocess.py
@@ -0,0 +1,146 @@
+import hashlib
+import json
+from typing import Dict, Tuple
+
+import coremltools as ct  # type: ignore[import]
+from coremltools.converters.mil.input_types import TensorType  # type: ignore[import]
+from coremltools.converters.mil.mil import types  # type: ignore[import]
+from coremltools.models.neural_network import quantization_utils  # type: ignore[import]
+
+import torch
+
+CT_METADATA_VERSION = "com.github.apple.coremltools.version"
+CT_METADATA_SOURCE = "com.github.apple.coremltools.source"
+
+
+class ScalarType:
+    Float = 0
+    Double = 1
+    Int = 2
+    Long = 3
+    Undefined = 4
+
+
+# Supported Tensor types in coremltools:
+# https://github.com/apple/coremltools/blob/main/coremltools/converters/mil/frontend/torch/converter.py#L28
+torch_to_mil_types = {
+    ScalarType.Float: types.fp32,
+    ScalarType.Double: types.fp64,
+    ScalarType.Int: types.int32,
+    ScalarType.Long: types.int64,
+}
+
+
+class CoreMLComputeUnit:
+    CPU = "cpuOnly"
+    CPUAndGPU = "cpuAndGPU"
+    ALL = "all"
+
+
+class CoreMLQuantizationMode:
+    LINEAR = "linear"
+    LINEAR_SYMMETRIC = "linear_symmetric"
+    NONE = "none"
+
+
+def TensorSpec(shape, dtype=ScalarType.Float):
+    return (shape, dtype)
+
+
+def CompileSpec(
+    inputs,
+    outputs,
+    backend=CoreMLComputeUnit.CPU,
+    allow_low_precision=True,
+    quantization_mode=CoreMLQuantizationMode.NONE,
+    mlmodel_export_path=None,
+):
+    return (
+        inputs,
+        outputs,
+        backend,
+        allow_low_precision,
+        quantization_mode,
+        mlmodel_export_path,
+    )
+
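+
+# Illustrative sketch (not part of the vendored file): a compile spec for a
+# hypothetical classifier whose forward() takes one float32 (1, 3, 224, 224)
+# input and returns one float32 (1, 1000) output.  All shapes are made up.
+def _example_compile_spec():
+    return {
+        "forward": CompileSpec(
+            inputs=(TensorSpec(shape=[1, 3, 224, 224]),),
+            outputs=(TensorSpec(shape=[1, 1000]),),
+            backend=CoreMLComputeUnit.ALL,
+            allow_low_precision=True,
+        ),
+    }
+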
+
+def _check_enumerated_shape(shape):
+    for s in shape:
+        if not isinstance(s, (list, tuple)):
+            return False
+    return True
+
+
+def _convert_to_mil_type(shape, dtype, name: str):
+    mil_shape = shape
+    if _check_enumerated_shape(shape):
+        mil_shape = ct.EnumeratedShapes(shape)
+    ml_type = TensorType(shape=mil_shape, dtype=torch_to_mil_types[dtype])
+    ml_type.name = name
+    return ml_type
+
+
+def preprocess(script_module: torch._C.ScriptObject, compile_spec: Dict[str, Tuple]):
+    spec = compile_spec["forward"]
+    (
+        input_specs,
+        output_specs,
+        backend,
+        allow_low_precision,
+        quantization_mode,
+        mlmodel_export_path,
+    ) = spec
+    mil_inputs = []
+    inputs = []
+    for index, input in enumerate(input_specs):
+        shape, dtype = input
+        name = "input_" + str(index)
+        inputs.append([name, str(dtype), str(shape)])
+        ml_type = _convert_to_mil_type(shape, dtype, name)
+        mil_inputs.append(ml_type)
+    model = torch.jit.RecursiveScriptModule._construct(script_module, lambda x: None)
+    mlmodel = ct.convert(model, inputs=mil_inputs)
+
+    if quantization_mode != CoreMLQuantizationMode.NONE:
+        quant_model_spec = quantization_utils.quantize_weights(
+            mlmodel, nbits=8, quantization_mode=quantization_mode
+        )
+        mlmodel = ct.models.MLModel(quant_model_spec)
+
+    spec = mlmodel.get_spec()
+    assert len(spec.description.output) == len(output_specs)  # type: ignore[attr-defined]
+    outputs = []
+    for index, output in enumerate(output_specs):
+        shape, dtype = output
+        name = spec.description.output[index].name  # type: ignore[attr-defined]
+        outputs.append([name, str(dtype), str(shape)])
+    mlmodel = ct.models.model.MLModel(spec)
+    print(mlmodel)
+
+    if mlmodel_export_path is not None:
+        print(f"Saving CoreML .mlmodel file to {mlmodel_export_path}")
+        mlmodel.save(mlmodel_export_path)
+
+    config = {
+        "spec_ver": str(spec.specificationVersion),  # type: ignore[attr-defined]
+        "backend": backend,
+        "allow_low_precision": str(allow_low_precision),
+    }
+    metadata = {
+        "coremltool_ver": mlmodel.user_defined_metadata[CT_METADATA_VERSION],
+        "torch_ver": mlmodel.user_defined_metadata[CT_METADATA_SOURCE],
+    }
+    coreml_compile_spec = {
+        "inputs": inputs,
+        "outputs": outputs,
+        "config": config,
+        "metadata": metadata,
+    }
+    mlmodel = spec.SerializeToString()  # type: ignore[attr-defined]
+
+    return {
+        "model": mlmodel,
+        "hash": str(hashlib.sha256(mlmodel).hexdigest()),
+        "extra": json.dumps(coreml_compile_spec),
+    }
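+
+
+# Illustrative sketch (hedged, not part of the vendored file): this preprocess
+# function is normally reached through the to_backend flow, roughly
+# torch._C._jit_to_backend("coreml", scripted_model, compile_spec), where both
+# arguments are placeholders built by the caller.
+def _example_lower_to_coreml(scripted_model, compile_spec):
+    return torch._C._jit_to_backend("coreml", scripted_model, compile_spec)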
diff --git a/MLPY/Lib/site-packages/torch/backends/_nnapi/__init__.py b/MLPY/Lib/site-packages/torch/backends/_nnapi/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/backends/_nnapi/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/backends/_nnapi/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4352ed061e292ea69fbe696366620e3ef8bbef0c
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/backends/_nnapi/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/backends/_nnapi/__pycache__/prepare.cpython-39.pyc b/MLPY/Lib/site-packages/torch/backends/_nnapi/__pycache__/prepare.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8530b9989253ef1bd9803c4b61a836b815fd64c4
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/backends/_nnapi/__pycache__/prepare.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/backends/_nnapi/__pycache__/serializer.cpython-39.pyc b/MLPY/Lib/site-packages/torch/backends/_nnapi/__pycache__/serializer.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1dabe7e27254c02cffec7b38d58eef24d6752e54
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/backends/_nnapi/__pycache__/serializer.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/backends/_nnapi/prepare.py b/MLPY/Lib/site-packages/torch/backends/_nnapi/prepare.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c377f149c8e223bdbfd1c258f65ef1ad61d0388
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/backends/_nnapi/prepare.py
@@ -0,0 +1,198 @@
+from typing import List, Optional
+
+import torch
+from torch.backends._nnapi.serializer import _NnapiSerializer
+
+ANEURALNETWORKS_PREFER_LOW_POWER = 0
+ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER = 1
+ANEURALNETWORKS_PREFER_SUSTAINED_SPEED = 2
+
+
+class NnapiModule(torch.nn.Module):
+    """Torch Module that wraps an NNAPI Compilation.
+
+    This module handles preparing the weights, initializing the
+    NNAPI TorchBind object, and adjusting the memory formats
+    of all inputs and outputs.
+    """
+
+    # _nnapi.Compilation is defined
+    comp: Optional[torch.classes._nnapi.Compilation]  # type: ignore[name-defined]
+    weights: List[torch.Tensor]
+    out_templates: List[torch.Tensor]
+
+    def __init__(
+        self,
+        shape_compute_module: torch.nn.Module,
+        ser_model: torch.Tensor,
+        weights: List[torch.Tensor],
+        inp_mem_fmts: List[int],
+        out_mem_fmts: List[int],
+        compilation_preference: int,
+        relax_f32_to_f16: bool,
+    ):
+        super().__init__()
+        self.shape_compute_module = shape_compute_module
+        self.ser_model = ser_model
+        self.weights = weights
+        self.inp_mem_fmts = inp_mem_fmts
+        self.out_mem_fmts = out_mem_fmts
+        self.out_templates = []
+        self.comp = None
+        self.compilation_preference = compilation_preference
+        self.relax_f32_to_f16 = relax_f32_to_f16
+
+    @torch.jit.export
+    def init(self, args: List[torch.Tensor]):
+        assert self.comp is None
+        self.out_templates = self.shape_compute_module.prepare(self.ser_model, args)  # type: ignore[operator]
+        self.weights = [w.contiguous() for w in self.weights]
+        comp = torch.classes._nnapi.Compilation()
+        comp.init2(
+            self.ser_model,
+            self.weights,
+            self.compilation_preference,
+            self.relax_f32_to_f16,
+        )
+
+        self.comp = comp
+
+    def forward(self, args: List[torch.Tensor]) -> List[torch.Tensor]:
+        if self.comp is None:
+            self.init(args)
+        comp = self.comp
+        assert comp is not None
+        outs = [torch.empty_like(out) for out in self.out_templates]
+
+        assert len(args) == len(self.inp_mem_fmts)
+        fixed_args = []
+        for idx in range(len(args)):
+            fmt = self.inp_mem_fmts[idx]
+            # These constants match the values in DimOrder in serializer.py
+            # TODO: See if it's possible to use those directly.
+            if fmt == 0:
+                fixed_args.append(args[idx].contiguous())
+            elif fmt == 1:
+                fixed_args.append(args[idx].permute(0, 2, 3, 1).contiguous())
+            else:
+                raise Exception("Invalid mem_fmt")
+        comp.run(fixed_args, outs)
+        assert len(outs) == len(self.out_mem_fmts)
+        for idx in range(len(self.out_templates)):
+            fmt = self.out_mem_fmts[idx]
+            # These constants match the values in DimOrder in serializer.py
+            # TODO: See if it's possible to use those directly.
+            if fmt in (0, 2):
+                pass
+            elif fmt == 1:
+                outs[idx] = outs[idx].permute(0, 3, 1, 2)
+            else:
+                raise Exception("Invalid mem_fmt")
+        return outs
+
+
+def convert_model_to_nnapi(
+    model,
+    inputs,
+    serializer=None,
+    return_shapes=None,
+    use_int16_for_qint16=False,
+    compilation_preference=ANEURALNETWORKS_PREFER_SUSTAINED_SPEED,
+    relax_f32_to_f16=False,
+):
+    (
+        shape_compute_module,
+        ser_model_tensor,
+        used_weights,
+        inp_mem_fmts,
+        out_mem_fmts,
+        retval_count,
+    ) = process_for_nnapi(
+        model, inputs, serializer, return_shapes, use_int16_for_qint16
+    )
+
+    nnapi_model = NnapiModule(
+        shape_compute_module,
+        ser_model_tensor,
+        used_weights,
+        inp_mem_fmts,
+        out_mem_fmts,
+        compilation_preference,
+        relax_f32_to_f16,
+    )
+
+    class NnapiInterfaceWrapper(torch.nn.Module):
+        """NNAPI list-ifying and de-list-ifying wrapper.
+
+        NNAPI always expects a list of inputs and provides a list of outputs.
+        This module allows us to accept inputs as separate arguments.
+        It returns results as either a single tensor or tuple,
+        matching the original module.
+        """
+
+        def __init__(self, mod):
+            super().__init__()
+            self.mod = mod
+
+    wrapper_model_py = NnapiInterfaceWrapper(nnapi_model)
+    wrapper_model = torch.jit.script(wrapper_model_py)
+    # TODO: Maybe make these names match the original.
+    arg_list = ", ".join(f"arg_{idx}" for idx in range(len(inputs)))
+    if retval_count < 0:
+        ret_expr = "retvals[0]"
+    else:
+        ret_expr = "".join(f"retvals[{idx}], " for idx in range(retval_count))
+    wrapper_model.define(
+        f"def forward(self, {arg_list}):\n"
+        f"    retvals = self.mod([{arg_list}])\n"
+        f"    return {ret_expr}\n"
+    )
+    return wrapper_model
+
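+
+# Illustrative sketch (not part of the vendored file): the usual entry point is
+# a traced/scripted module plus one representative input; `model` and
+# `example_input` are placeholders.
+def _example_convert_to_nnapi(model, example_input):
+    traced = torch.jit.trace(model, example_input)
+    # NNAPI wants contiguous (or channels-last) inputs with known shapes.
+    return convert_model_to_nnapi(traced, example_input.contiguous())
+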
+
+def process_for_nnapi(
+    model, inputs, serializer=None, return_shapes=None, use_int16_for_qint16=False
+):
+    model = torch.jit.freeze(model)
+
+    if isinstance(inputs, torch.Tensor):
+        inputs = [inputs]
+
+    serializer = serializer or _NnapiSerializer(
+        config=None, use_int16_for_qint16=use_int16_for_qint16
+    )
+    (
+        ser_model,
+        used_weights,
+        inp_mem_fmts,
+        out_mem_fmts,
+        shape_compute_lines,
+        retval_count,
+    ) = serializer.serialize_model(model, inputs, return_shapes)
+    ser_model_tensor = torch.tensor(ser_model, dtype=torch.int32)
+
+    # We have to create a new class here every time this function is called
+    # because module.define adds a method to the *class*, not the instance.
+    class ShapeComputeModule(torch.nn.Module):
+        """Code-gen-ed module for tensor shape computation.
+
+        module.prepare will mutate ser_model according to the computed operand
+        shapes, based on the shapes of args.  Returns a list of output templates.
+        """
+
+        pass
+
+    shape_compute_module = torch.jit.script(ShapeComputeModule())
+    real_shape_compute_lines = [
+        "def prepare(self, ser_model: torch.Tensor, args: List[torch.Tensor]) -> List[torch.Tensor]:\n",
+    ] + [f"    {line}\n" for line in shape_compute_lines]
+    shape_compute_module.define("".join(real_shape_compute_lines))
+
+    return (
+        shape_compute_module,
+        ser_model_tensor,
+        used_weights,
+        inp_mem_fmts,
+        out_mem_fmts,
+        retval_count,
+    )
diff --git a/MLPY/Lib/site-packages/torch/backends/_nnapi/serializer.py b/MLPY/Lib/site-packages/torch/backends/_nnapi/serializer.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e42ab22fa5cd101778e9096903c6a3a25bacc8d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/backends/_nnapi/serializer.py
@@ -0,0 +1,2188 @@
+import array
+import enum
+import functools
+import logging
+import operator
+import struct
+import sys
+from typing import List, NamedTuple, Optional, Tuple
+
+import torch
+
+
+# TODO: Add type annotations
+# TODO: Check tensor types for ops
+
+
+LOG = logging.getLogger("nnapi_serialize")
+
+
+class NNAPI_OperandCode:
+    FLOAT32 = 0
+    INT32 = 1
+    UINT32 = 2
+    TENSOR_FLOAT32 = 3
+    TENSOR_INT32 = 4
+    TENSOR_QUANT8_ASYMM = 5
+    BOOL = 6
+    TENSOR_QUANT16_SYMM = 7
+    TENSOR_FLOAT16 = 8
+    TENSOR_BOOL8 = 9
+    FLOAT16 = 10
+    TENSOR_QUANT8_SYMM_PER_CHANNEL = 11
+    TENSOR_QUANT16_ASYMM = 12
+
+
+class NNAPI_OperationCode:
+    ADD = 0
+    AVERAGE_POOL_2D = 1
+    CONCATENATION = 2
+    CONV_2D = 3
+    DEPTHWISE_CONV_2D = 4
+    DEPTH_TO_SPACE = 5
+    DEQUANTIZE = 6
+    EMBEDDING_LOOKUP = 7
+    FLOOR = 8
+    FULLY_CONNECTED = 9
+    HASHTABLE_LOOKUP = 10
+    L2_NORMALIZATION = 11
+    L2_POOL_2D = 12
+    LOCAL_RESPONSE_NORMALIZATION = 13
+    LOGISTIC = 14
+    LSH_PROJECTION = 15
+    LSTM = 16
+    MAX_POOL_2D = 17
+    MUL = 18
+    RELU = 19
+    RELU1 = 20
+    RELU6 = 21
+    RESHAPE = 22
+    RESIZE_BILINEAR = 23
+    RNN = 24
+    SOFTMAX = 25
+    SPACE_TO_DEPTH = 26
+    SVDF = 27
+    TANH = 28
+    BATCH_TO_SPACE_ND = 29
+    DIV = 30
+    MEAN = 31
+    PAD = 32
+    SPACE_TO_BATCH_ND = 33
+    SQUEEZE = 34
+    STRIDED_SLICE = 35
+    SUB = 36
+    TRANSPOSE = 37
+    ABS = 38
+    ARGMAX = 39
+    ARGMIN = 40
+    AXIS_ALIGNED_BBOX_TRANSFORM = 41
+    BIDIRECTIONAL_SEQUENCE_LSTM = 42
+    BIDIRECTIONAL_SEQUENCE_RNN = 43
+    BOX_WITH_NMS_LIMIT = 44
+    CAST = 45
+    CHANNEL_SHUFFLE = 46
+    DETECTION_POSTPROCESSING = 47
+    EQUAL = 48
+    EXP = 49
+    EXPAND_DIMS = 50
+    GATHER = 51
+    GENERATE_PROPOSALS = 52
+    GREATER = 53
+    GREATER_EQUAL = 54
+    GROUPED_CONV_2D = 55
+    HEATMAP_MAX_KEYPOINT = 56
+    INSTANCE_NORMALIZATION = 57
+    LESS = 58
+    LESS_EQUAL = 59
+    LOG = 60
+    LOGICAL_AND = 61
+    LOGICAL_NOT = 62
+    LOGICAL_OR = 63
+    LOG_SOFTMAX = 64
+    MAXIMUM = 65
+    MINIMUM = 66
+    NEG = 67
+    NOT_EQUAL = 68
+    PAD_V2 = 69
+    POW = 70
+    PRELU = 71
+    QUANTIZE = 72
+    QUANTIZED_16BIT_LSTM = 73
+    RANDOM_MULTINOMIAL = 74
+    REDUCE_ALL = 75
+    REDUCE_ANY = 76
+    REDUCE_MAX = 77
+    REDUCE_MIN = 78
+    REDUCE_PROD = 79
+    REDUCE_SUM = 80
+    ROI_ALIGN = 81
+    ROI_POOLING = 82
+    RSQRT = 83
+    SELECT = 84
+    SIN = 85
+    SLICE = 86
+    SPLIT = 87
+    SQRT = 88
+    TILE = 89
+    TOPK_V2 = 90
+    TRANSPOSE_CONV_2D = 91
+    UNIDIRECTIONAL_SEQUENCE_LSTM = 92
+    UNIDIRECTIONAL_SEQUENCE_RNN = 93
+    RESIZE_NEAREST_NEIGHBOR = 94
+
+
+class NNAPI_FuseCode:
+    FUSED_NONE = 0
+    FUSED_RELU = 1
+    FUSED_RELU1 = 2
+    FUSED_RELU6 = 3
+
+
+class OperandValueSourceType:
+    IMMEDIATE = 0
+    NUMBERED_BUFFER = 2
+    NUMBERED_MEMORY = 3
+
+
+# Scalar types that appear explicitly in models.
+# These must be kept in sync with
+# AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS.
+# TODO: Expose these directly to Python to avoid maintaining this list.
+class TorchScalarTypes(enum.Enum):
+    QUINT8 = 13
+
+
+def approx_equal(lhs, rhs, tolerance=1e-6):
+    return abs(lhs - rhs) <= tolerance * min(lhs, rhs)
+
+
+def tensor_size(op_type, dims):
+    ITEM_SIZES = {
+        NNAPI_OperandCode.TENSOR_FLOAT32: 4,
+        NNAPI_OperandCode.TENSOR_INT32: 4,
+        NNAPI_OperandCode.TENSOR_QUANT8_ASYMM: 1,
+        NNAPI_OperandCode.TENSOR_QUANT16_SYMM: 2,
+        NNAPI_OperandCode.TENSOR_QUANT16_ASYMM: 2,
+    }
+    size = ITEM_SIZES[op_type]
+    for d in dims:
+        size *= d
+    return size
+
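+
+# Worked example (illustrative, not part of the vendored file): a float32
+# tensor of shape (1, 3, 224, 224) occupies 4 * 3 * 224 * 224 = 602112 bytes.
+def _example_tensor_size():
+    assert tensor_size(NNAPI_OperandCode.TENSOR_FLOAT32, (1, 3, 224, 224)) == 602112
+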
+
+def change_element(tup, index, value):
+    ls = list(tup)
+    ls[index] = value
+    return tuple(ls)
+
+
+class ConvPoolArgs2d(NamedTuple):
+    """Configuration arguments for a convolution."""
+
+    kernel_h: int
+    kernel_w: int
+    stride_h: int
+    stride_w: int
+    pad_t: int
+    pad_b: int
+    pad_l: int
+    pad_r: int
+    dilation_h: int
+    dilation_w: int
+    group: int
+
+
+class DimOrder(enum.Enum):
+    PRESUMED_CONTIGUOUS = 0
+    CHANNELS_LAST = 1
+    SCALAR_OR_VECTOR = 2
+    UNKNOWN_CONSTANT = 999
+
+
+class Operand(NamedTuple):
+    """Represenation of an NNAPI operand."""
+
+    # NNAPI operand type.  One of NNAPI_OperandCode.
+    # TODO: Make this an enum.
+    op_type: int
+
+    # This is always the PyTorch shape, which is NCHW for feature maps.
+    # The actual NNAPI operand might have a transposed shape.
+    # we use 0 for load time dynamic shapes & -1 for runtime dynamic shapes
+    shape: Tuple[int, ...]
+
+    # Specifies how the shape of the operand that we define in NNAPI
+    # relates to the shape we track above.
+    # - PRESUMED_CONTIGUOUS: physical NNAPI operand will exactly match
+    #   the shape of the PyTorch tensor.
+    # - CHANNELS_LAST: The PyTorch tensor is expected to be NCHW, and
+    #   the NNAPI operand will be represented explicitly as NHWC.
+    dim_order: DimOrder
+
+    # Quantization params
+    scale: float
+    zero_point: int
+
+    def use_nchw(self):
+        if self.dim_order is DimOrder.PRESUMED_CONTIGUOUS:
+            return True
+        if self.dim_order is DimOrder.CHANNELS_LAST:
+            return False
+        raise Exception("Unknown dim order")
+
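+
+# Illustrative sketch (not part of the vendored file): an Operand for a float32
+# feature map that PyTorch tracks as NCHW (1, 32, 28, 28) but that NNAPI will
+# store as NHWC.  The shape is made up.
+def _example_feature_map_operand():
+    return Operand(
+        op_type=NNAPI_OperandCode.TENSOR_FLOAT32,
+        shape=(1, 32, 28, 28),
+        dim_order=DimOrder.CHANNELS_LAST,
+        scale=0.0,
+        zero_point=0,
+    )
+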
+
+def broadcast_shapes(shape1, shape2):
+    assert len(shape1) > 0
+    assert len(shape2) > 0
+    s1 = list(shape1)
+    s2 = list(shape2)
+    # TODO: Support non-equal-rank broadcast where semantics match.
+    # This can be tricky for NHWC tensors because dimension orders
+    # don't match between PT and NNAPI, even though semantics match.
+    if len(s1) > len(s2):
+        # s2 = [1] * (len(s1) - len(s2)) + s2
+        raise Exception("Non-equal-rank broadcast is not supported yet.")
+    if len(s2) > len(s1):
+        # s3 = [1] * (len(s2) - len(s1)) + s1
+        raise Exception("Non-equal-rank broadcast is not supported yet.")
+    ret = []
+    for d1, d2 in zip(s1, s2):
+        if d1 == 1:
+            ret.append(d2)
+        elif d2 == 1:
+            ret.append(d1)
+        elif d1 == d2:
+            ret.append(d1)
+        else:
+            raise Exception(f"Cannot broadcast shapes: {shape1} and {shape2}")
+    return tuple(ret)
+
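+
+# Worked example (illustrative, not part of the vendored file): equal-rank
+# broadcasting of a per-channel scale against a feature map.
+def _example_broadcast_shapes():
+    assert broadcast_shapes((1, 4, 1, 1), (1, 4, 8, 8)) == (1, 4, 8, 8)
+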
+
+def get_conv_pool_shape(image_shape, args, out_ch, transpose):
+    batch, in_c, in_h, in_w = image_shape
+
+    # TODO: Handle dilation
+    if args.dilation_h != 1 or args.dilation_w != 1:
+        raise Exception("Dilation not supported yet.")
+
+    if transpose:
+        out_h = (in_h - 1) * args.stride_h + args.kernel_h - args.pad_t - args.pad_b
+        out_w = (in_w - 1) * args.stride_w + args.kernel_w - args.pad_l - args.pad_r
+    else:
+        out_h = (in_h - args.kernel_h + args.pad_t + args.pad_b) // args.stride_h + 1
+        out_w = (in_w - args.kernel_w + args.pad_l + args.pad_r) // args.stride_w + 1
+
+    # Handle variable-sized tensors.
+    if in_h == 0:
+        out_h = 0
+    if in_w == 0:
+        out_w = 0
+
+    out_shape = (batch, out_ch, out_h, out_w)
+    return out_shape
+
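+
+# Worked example (illustrative, not part of the vendored file): a 3x3
+# convolution with stride 1 and padding 1 preserves spatial size, so a
+# (1, 8, 32, 32) input with 16 output channels maps to (1, 16, 32, 32).
+def _example_conv_shape():
+    args = ConvPoolArgs2d(
+        kernel_h=3, kernel_w=3,
+        stride_h=1, stride_w=1,
+        pad_t=1, pad_b=1, pad_l=1, pad_r=1,
+        dilation_h=1, dilation_w=1,
+        group=1,
+    )
+    assert get_conv_pool_shape((1, 8, 32, 32), args, 16, False) == (1, 16, 32, 32)
+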
+
+def fix_shape(shape, dim_order):
+    # Return the actual shape that an operand should have in NNAPI,
+    # given a PyTorch shape and dimension order.  This is where we
+    # convert from PyTorch's "always NCHW" shape to explicit NHWC.
+    if dim_order is DimOrder.PRESUMED_CONTIGUOUS:
+        return shape
+    if dim_order is DimOrder.CHANNELS_LAST:
+        return tuple([shape[0]] + list(shape[2:]) + [shape[1]])
+    if dim_order is DimOrder.SCALAR_OR_VECTOR:
+        assert len(shape) == 0 or len(shape) == 1
+        return shape
+    if dim_order is DimOrder.UNKNOWN_CONSTANT:
+        # XXX think this through
+        return shape
+    raise Exception(f"Bad dim_order: {dim_order!r}.")
+
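+
+# Worked example (illustrative, not part of the vendored file): an NCHW shape
+# is reported to NNAPI as NHWC for a CHANNELS_LAST operand.
+def _example_fix_shape():
+    assert fix_shape((1, 4, 8, 8), DimOrder.CHANNELS_LAST) == (1, 8, 8, 4)
+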
+
+def reverse_map_dim(dim_order, d):
+    # Return the original PyTorch dimension position for a given dimension.
+    # d should be the dimension that NNAPI will see.
+    # reverse_map_dim(PRESUMED_CONTIGUOUS, x) == x
+    # reverse_map_dim(CHANNELS_LAST, 3) == 1
+    if dim_order in (DimOrder.PRESUMED_CONTIGUOUS, DimOrder.SCALAR_OR_VECTOR):
+        return d
+    assert dim_order is DimOrder.CHANNELS_LAST
+    return [0, 2, 3, 1][d]
+
+
+def flex_name(op_id, dim):
+    # Return the local variable name for the computed flexible size
+    # for a given op and dimension.
+    return f"s_{op_id}_{dim}"
+
+
+class _NnapiSerializer:
+    def __init__(self, config, use_int16_for_qint16=False):
+        self.operands = []
+        self.values = []
+        self.operations = []
+        self.value_data = []
+        self.operation_args = []
+        self.inputs = []
+        self.outputs = []
+        self.flexible_shape_computation_lines = []
+
+        self.modules = {}
+        self.constants = {}
+        self.tensor_sequences = {}
+        self.jitval_operand_map = {}
+        self.cached_immediates = {}
+        self.used_weights = []
+        self.weight_offset = 0
+        self.use_int16_for_qint16 = use_int16_for_qint16
+
+        if config is None:
+            config = {}
+
+    def get_next_operand_id(self):
+        return len(self.operands)
+
+    # Add a tensor operand corresponding to a JIT Value.
+    # Returns the NNAPI operand ID.  Can be looked up later with
+    # get_tensor_operand_by_jitval.
+    def add_tensor_operand(self, jitval, oper):
+        assert isinstance(oper, Operand)
+        if jitval in self.jitval_operand_map:
+            raise Exception(f"Duplicate tensor: {jitval!r}")
+
+        operand_id = self.get_next_operand_id()
+        self.operands.append(oper)
+        self.jitval_operand_map[jitval] = operand_id
+        return operand_id
+
+    # Add a tensor operand that does not correspond to a JIT Value.
+    # Useful for cases where multiple NNAPI operands are required
+    # to implement one JIT IR node.  Returns the NNAPI operand ID.
+    def add_anonymous_tensor_operand(self, oper):
+        assert isinstance(oper, Operand)
+        operand_id = self.get_next_operand_id()
+        self.operands.append(oper)
+        return operand_id
+
+    def torch_tensor_to_operand(self, tensor, dim_order):
+        dtype = str(tensor.dtype).replace("torch.", "")
+        scale = 0.0
+        zero_point = 0
+        if dtype == "float32":
+            op_type = NNAPI_OperandCode.TENSOR_FLOAT32
+        elif dtype == "int32":
+            op_type = NNAPI_OperandCode.TENSOR_INT32
+        elif dtype == "quint8":
+            op_type = NNAPI_OperandCode.TENSOR_QUANT8_ASYMM
+            scale = tensor.q_scale()
+            zero_point = tensor.q_zero_point()
+        elif dtype == "qint32":
+            op_type = NNAPI_OperandCode.TENSOR_INT32
+            scale = tensor.q_scale()
+            zero_point = tensor.q_zero_point()
+            assert zero_point == 0
+        elif dtype == "int16":
+            if self.use_int16_for_qint16:
+                nnapi_dtype = getattr(tensor, "nnapi_dtype", None)
+                op_codes = (
+                    NNAPI_OperandCode.TENSOR_QUANT16_SYMM,
+                    NNAPI_OperandCode.TENSOR_QUANT16_ASYMM,
+                )
+                if nnapi_dtype in op_codes:
+                    op_type = nnapi_dtype
+                    scale = tensor.nnapi_scale
+                    zero_point = tensor.nnapi_zero_point
+                else:
+                    raise Exception(
+                        f"`nnapi_type` needs to be one of {op_codes} for `int16`"
+                    )
+            else:
+                raise Exception(
+                    "`int16` isn't supported. If you're trying to represent NNAPI"
+                    " qint16 with Pytorch int16, set `use_int16_for_qint16 = True`"
+                )
+        else:
+            raise Exception(f"Can't handle input with dtype '{tensor.dtype}'")
+        return Operand(
+            shape=tuple(tensor.shape),
+            op_type=op_type,
+            dim_order=dim_order,
+            scale=scale,
+            zero_point=zero_point,
+        )
+
+    def add_tensor_operand_for_input(self, arg_idx, jitval, tensor):
+        dim_order = (
+            DimOrder.CHANNELS_LAST
+            if getattr(tensor, "nnapi_nhwc", False)
+            else DimOrder.PRESUMED_CONTIGUOUS
+        )
+        toper = self.torch_tensor_to_operand(tensor, dim_order)
+        operand_id = self.add_tensor_operand(jitval, toper)
+        self.inputs.append(operand_id)
+        for dim, size in enumerate(tensor.shape):
+            if size == 0:
+                self.compute_operand_shape(
+                    operand_id, dim, f"args[{arg_idx}].shape[{dim}]"
+                )
+        return operand_id
+
+    def add_tensor_operand_for_weight(
+        self, tensor, dim_order=DimOrder.UNKNOWN_CONSTANT
+    ):
+        toper = self.torch_tensor_to_operand(tensor, dim_order)
+        operand_id = len(self.operands)
+        self.operands.append(toper)
+        tsize = tensor_size(toper.op_type, toper.shape)
+        psize = ((tsize - 1) | 0x3) + 1
+        self.values.append((operand_id, OperandValueSourceType.NUMBERED_BUFFER))
+        buf_num = len(self.used_weights)
+        offset = 0
+        self.value_data.append(struct.pack("iii", buf_num, offset, tsize))
+        # For NHWC NNAPI op, lay out data in the same dim order by permuting torch tensor
+        if dim_order == DimOrder.CHANNELS_LAST:
+            tensor = tensor.permute(0, 2, 3, 1)
+        self.used_weights.append(tensor)
+        return operand_id
+
+    def add_immediate_operand(self, code, value, dims):
+        assert isinstance(dims, tuple)
+        cache_key = (code, value)
+        if cache_key not in self.cached_immediates:
+            operand_id = len(self.operands)
+            self.operands.append(Operand(code, dims, DimOrder.SCALAR_OR_VECTOR, 0.0, 0))
+            self.values.append((operand_id, OperandValueSourceType.IMMEDIATE))
+            self.value_data.append(value)
+            self.cached_immediates[cache_key] = operand_id
+        return self.cached_immediates[cache_key]
+
+    def add_immediate_int_scalar(self, value):
+        return self.add_immediate_operand(
+            NNAPI_OperandCode.INT32, struct.pack("i", value), ()
+        )
+
+    def add_immediate_float_scalar(self, value):
+        return self.add_immediate_operand(
+            NNAPI_OperandCode.FLOAT32, struct.pack("f", value), ()
+        )
+
+    def add_immediate_bool_scalar(self, value):
+        return self.add_immediate_operand(
+            NNAPI_OperandCode.BOOL, b"\x01" if value else b"\x00", ()
+        )
+
+    def add_immediate_int_vector(self, value):
+        return self.add_immediate_operand(
+            NNAPI_OperandCode.TENSOR_INT32,
+            array.array("i", value).tobytes(),
+            (len(value),),
+        )
+
+    def has_operand_for_jitval(self, jitval):
+        return jitval in self.jitval_operand_map
+
+    def get_tensor_operand_by_jitval(self, jitval):
+        operand_id = self.jitval_operand_map[jitval]
+        return (operand_id, self.operands[operand_id])
+
+    def get_tensor_operand_by_jitval_fixed_size(self, jitval):
+        op_id, oper = self.get_tensor_operand_by_jitval(jitval)
+        for s in oper.shape:
+            if s == 0:
+                # TODO: Improve this error message, possibly after converting
+                # many callsites to support flexible size.
+                raise Exception("Flexible size is not supported for this operand.")
+            if s < 0:
+                # runtime flex
+                LOG.warning("Operand %s has runtime flex shape", oper)
+        return op_id, oper
+
+    def get_tensor_operand_or_constant(
+        self, jitval, dim_order=DimOrder.PRESUMED_CONTIGUOUS
+    ):
+        operand_id = self.jitval_operand_map.get(jitval)
+        if operand_id is None:
+            _, value = self.get_constant_value(jitval, "TensorType")
+            operand_id = self.add_tensor_operand_for_weight(value, dim_order)
+        return (operand_id, self.operands[operand_id])
+
+    def get_tensor_operand_for_weight(self, jitval):
+        _, value = self.get_constant_value(jitval, "TensorType")
+        operand_id = self.add_tensor_operand_for_weight(value)
+        return (operand_id, self.operands[operand_id])
+
+    def add_operation(self, opcode, inputs, outputs):
+        self.operations.append((opcode, len(inputs), len(outputs)))
+        self.operation_args.extend(inputs + outputs)
+
+    def add_tensor_sequence(self, jitval, values):
+        assert jitval not in self.tensor_sequences
+        self.tensor_sequences[jitval] = values
+
+    def add_constant_value(self, jitval, ctype, value):
+        assert jitval not in self.constants
+        self.constants[jitval] = (ctype, value)
+
+    def get_constant_value(self, jitval, typekind=None):
+        record = self.constants.get(jitval)
+        if record is None:
+            raise Exception(f"Could not find constant value for '{jitval!r}'.")
+        ctype, _ = record
+        if typekind is not None and ctype.kind() != typekind:
+            raise Exception(
+                f"Expected constant value of type {typekind}, but got {ctype.kind()} for value '{jitval!r}'"
+            )
+        return record
+
+    def operand_to_template_torchscript(self, op_id, oper, shape=None):
+        """Return a TorchScript expression to build a template for a given operand."""
+        if shape is None:
+            shape = oper.shape
+        else:
+            assert len(shape) == len(oper.shape)
+
+        shape_parts = ["("]
+        for d, s in enumerate(shape):
+            if s > 0:
+                # Fixed shape dimension: just add the value.
+                shape_parts.append(str(s))
+            elif s == 0:
+                # Load time flexible shape dimension: it should have been computed in a variable.
+                shape_parts.append(flex_name(op_id, d))
+            elif s == -1:
+                # Runtime flexible shape
+                shape_parts.append("0")
+            else:
+                raise Exception("Unknown dim value, dimensions should be >= -1")
+            shape_parts.append(",")
+        shape_parts.append(")")
+        shape_code = "".join(shape_parts)
+        if oper.op_type == NNAPI_OperandCode.TENSOR_FLOAT32:
+            return f"torch.zeros({shape_code}, dtype=torch.float32)"
+        elif oper.op_type == NNAPI_OperandCode.TENSOR_INT32:
+            return f"torch.zeros({shape_code}, dtype=torch.int32)"
+        elif oper.op_type == NNAPI_OperandCode.TENSOR_QUANT8_ASYMM:
+            return (
+                f"torch.quantize_per_tensor("
+                f"torch.zeros(1), scale={oper.scale}, zero_point={oper.zero_point}, dtype=torch.quint8)"
+                f".expand({shape_code}).contiguous()"
+            )
+        elif oper.op_type in (
+            NNAPI_OperandCode.TENSOR_QUANT16_ASYMM,
+            NNAPI_OperandCode.TENSOR_QUANT16_SYMM,
+        ):
+            if self.use_int16_for_qint16:
+                return f"torch.zeros({shape_code}, dtype=torch.int16)"
+            else:
+                raise Exception(
+                    "`int16` isn't supported. If you're trying to represent NNAPI"
+                    " qint16 with Pytorch int16, set `use_int16_for_qint16 = True`"
+                )
+
+        raise Exception(f"Unsupported output operand type: {oper.op_type}")
+
+    def forward_operand_shape(self, out_op_id, out_dim, in_op_id, in_dim):
+        self.compute_operand_shape(out_op_id, out_dim, flex_name(in_op_id, in_dim))
+
+    def compute_operand_shape(self, op_id, dim, expr):
+        self.flexible_shape_computation_lines.append(
+            f"{flex_name(op_id, dim)} = {expr}"
+        )
+
+    def transpose_to_nhwc(self, in_id, oper):
+        if oper.shape[2:] != (1, 1):
+            raise Exception("Automatic transpose only supported for H,W == 1,1")
+
+        out_oper = oper._replace(dim_order=DimOrder.CHANNELS_LAST)
+
+        inputs = [None] * 2
+        inputs[0] = in_id
+        inputs[1] = self.add_immediate_int_vector([0, 2, 3, 1])
+
+        outputs = [None] * 1
+        outputs[0] = self.add_anonymous_tensor_operand(out_oper)
+
+        self.add_operation(NNAPI_OperationCode.TRANSPOSE, inputs, outputs)
+
+        return outputs[0], out_oper
+
+    # Transpose inputs as necessary to allow broadcasting.
+    def transpose_for_broadcast(self, in0_id, in0_oper, in1_id, in1_oper):
+        if in0_oper.dim_order == in1_oper.dim_order:
+            return in0_id, in0_oper, in1_id, in1_oper
+
+        # Assume NHWC is preferred if there is a mismatch.
+        orders = (in0_oper.dim_order, in1_oper.dim_order)
+        if orders == (DimOrder.PRESUMED_CONTIGUOUS, DimOrder.CHANNELS_LAST):
+            return self.transpose_to_nhwc(in0_id, in0_oper) + (in1_id, in1_oper)
+        if orders == (DimOrder.CHANNELS_LAST, DimOrder.PRESUMED_CONTIGUOUS):
+            return (in0_id, in0_oper) + self.transpose_to_nhwc(in1_id, in1_oper)
+
+        raise Exception(
+            f"Automatic transpose not supported for dim_orders: {in0_oper.dim_order!r}, {in1_oper.dim_order!r}"
+        )
+
+    def get_size_arg(self, jitval):
+        ctype, value = self.get_constant_value(jitval)
+        if ctype.kind() == "ListType":
+            assert ctype.getElementType().kind() == "IntType"
+            return value
+        raise Exception(f"Can't handle size arg of type '{ctype!r}' for '{jitval!r}'")
+
+    def get_conv_pool_args_2d_from_pack(self, kernel_size, packed_config):
+        pc = [i.item() for i in packed_config]
+        assert pc[0] == 2
+        strides = [pc[1], pc[2]]
+        paddings = [pc[3], pc[4]]
+        dilations = [pc[5], pc[6]]
+        output_padding = [pc[7], pc[8]]
+        group_num = pc[9]
+
+        assert len(pc) == 11
+        assert output_padding == [0, 0]
+
+        return self.get_conv_pool_args_2d_common(
+            kernel_size, strides, paddings, dilations, group_num
+        )
+
+    def get_conv_pool_args_2d_from_jit(
+        self, kernel_size, stride, padding, dilation=None, group=None
+    ):
+        strides = self.get_size_arg(stride)
+        paddings = self.get_size_arg(padding)
+        if dilation is None:
+            dilations = [1, 1]
+        else:
+            dilations = self.get_size_arg(dilation)
+        if group is not None:
+            _, group_num = self.get_constant_value(group, "IntType")
+        else:
+            group_num = None
+        return self.get_conv_pool_args_2d_common(
+            kernel_size, strides, paddings, dilations, group_num
+        )
+
+    def get_conv_pool_args_2d_common(
+        self, kernel_size, strides, paddings, dilations, group_num
+    ):
+        kernels = list(kernel_size)
+
+        assert len(kernels) == 2
+        assert len(strides) == 2
+        assert len(paddings) == 2
+        assert len(dilations) == 2
+
+        # NNAPI uses 4 values for padding.
+        ph, pw = paddings
+        real_paddings = [ph, ph, pw, pw]
+
+        return ConvPoolArgs2d(
+            *(kernels + strides + real_paddings + dilations + [group_num])
+        )
+
+    def serialize_model(self, model, inputs, return_shapes=None):
+        self.add_immediate_bool_scalar(False)
+        self.add_immediate_bool_scalar(True)
+
+        inp_dim_orders = []
+        out_dim_orders = []
+
+        self_jitval = next(model.graph.inputs())
+        self.add_constant_value(self_jitval, self_jitval.type(), model)
+
+        for arg_idx, (input_value, input_tensor) in enumerate(
+            zip(list(model.graph.inputs())[1:], inputs)
+        ):
+            op_id = self.add_tensor_operand_for_input(
+                arg_idx, input_value, input_tensor
+            )
+            inp_dim_orders.append(self.operands[op_id].dim_order.value)
+
+        for idx, node in enumerate(model.graph.nodes()):
+            LOG.debug("Processing node #%d: %r", idx, node)
+            self.add_node(node)
+
+        retn = model.graph.return_node()
+        assert retn.inputsSize() == 1
+        assert retn.outputsSize() == 0
+        retn_input = retn.inputsAt(0)
+        template_return_lines = ["return ["]
+        if retn_input.type().kind() == "TensorType":
+            return_values = [retn_input]
+            retval_count = -1
+        elif retn_input.type().kind() == "TupleType":
+            return_values = self.tensor_sequences[retn_input]
+            retval_count = len(return_values)
+        else:
+            raise Exception(f"Unsupported return type: {retn_input.type()}")
+
+        if return_shapes is not None:
+            assert len(return_shapes) == len(return_values)
+        for i, v in enumerate(return_values):
+            op_id = self.jitval_operand_map[v]
+            self.outputs.append(op_id)
+            out_dim_orders.append(self.operands[op_id].dim_order.value)
+            shape = return_shapes[i] if return_shapes else None
+            template_return_lines.append(
+                self.operand_to_template_torchscript(op_id, self.operands[op_id], shape)
+                + ","
+            )
+        template_return_lines.append("]")
+
+        model = []
+
+        version = 1
+        header = struct.pack(
+            "iiiiii",
+            version,
+            len(self.operands),
+            len(self.values),
+            len(self.operations),
+            len(self.inputs),
+            len(self.outputs),
+        )
+        model.append(header)
+
+        serialized_values, serialized_value_data = self.serialize_values()
+
+        model.extend(
+            struct.pack("iifi", t, len(d), s, z) for (t, d, _m, s, z) in self.operands
+        )
+        model.extend(serialized_values)
+        model.extend(struct.pack("iii", *x) for x in self.operations)
+
+        # Compact the model so we can get its length so far.
+        model = [b"".join(model)]
+        model_offset = len(model[0])
+        # Model offset is the index into the model (in 32-bit words, not bytes)
+        # of the next dimension we're about to serialize.  If that dimension is
+        # flexible (0), generate code to patch it in before passing to NNAPI.
+        assert model_offset % 4 == 0
+        model_offset = int(model_offset / 4)
+
+        for op_id, (_, dims, dim_order, _, _) in enumerate(self.operands):
+            shape = fix_shape(dims, dim_order)
+            for d, s in enumerate(shape):
+                if s == 0:
+                    pt_d = reverse_map_dim(dim_order, d)
+                    self.flexible_shape_computation_lines.append(
+                        f"ser_model[{model_offset}] = {flex_name(op_id, pt_d)}"
+                    )
+                model_offset += 1
+
+            # convert runtime flex shape from -1 to 0
+            shape = tuple(d if d != -1 else 0 for d in shape)
+            model.append(self.serialize_ints(shape))
+
+        model.extend(serialized_value_data)
+        model.append(self.serialize_ints(self.operation_args))
+        model.append(self.serialize_ints(self.inputs))
+        model.append(self.serialize_ints(self.outputs))
+
+        self.flexible_shape_computation_lines.extend(template_return_lines)
+
+        return (
+            array.array("i", b"".join(model)),
+            self.used_weights,
+            inp_dim_orders,
+            out_dim_orders,
+            self.flexible_shape_computation_lines,
+            retval_count,
+        )
+
+    def serialize_values(self):
+        serialized_values = []
+        serialized_value_data = []
+        assert len(self.values) == len(self.value_data)
+        for (op_index, source_type), data in zip(self.values, self.value_data):
+            source_length = len(data)
+
+            # Pad with 0 bytes out to a multiple of 4 for alignment.
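+            # e.g. 1-4 bytes pad to 4, 5-8 bytes pad to 8.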
+            physical_length = ((source_length - 1) | 0x3) + 1
+            padded_data = data + (b"\0" * (physical_length - source_length))
+
+            serialized_values.append(
+                struct.pack("iii", op_index, source_type, source_length)
+            )
+            serialized_value_data.append(padded_data)
+
+        return serialized_values, serialized_value_data
+
+    @staticmethod
+    def serialize_ints(ints):
+        return array.array("i", ints).tobytes()
+
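+    # Dispatch table from TorchScript node kind to the handler that serializes it.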
+    ADDER_MAP = {
+        "prim::GetAttr": lambda self, node: self.add_getattr(node),
+        "prim::Constant": lambda self, node: self.add_constant_node(node),
+        "prim::ListConstruct": lambda self, node: self.add_list_construct(node),
+        "prim::TupleConstruct": lambda self, node: self.add_tuple_construct(node),
+        "aten::unsqueeze": lambda self, node: self.add_unsqueeze(node),
+        "aten::to": lambda self, node: self.add_to(node),
+        "aten::detach": lambda self, node: self._identity(node),
+        "aten::reshape": lambda self, node: self.add_reshape(node),
+        "aten::flatten": lambda self, node: self.add_flatten(node),
+        "aten::slice": lambda self, node: self.add_slice(node),
+        "aten::size": lambda self, node: self.add_size(node),
+        "aten::cat": lambda self, node: self.add_cat(node),
+        "aten::mean": lambda self, node: self.add_mean(node),
+        "aten::quantize_per_tensor": lambda self, node: self.add_quantize(node),
+        "aten::dequantize": lambda self, node: self.add_dequantize(node),
+        "aten::add": lambda self, node: self.add_add_sub_op(
+            node, NNAPI_OperationCode.ADD, NNAPI_FuseCode.FUSED_NONE
+        ),
+        "aten::sub": lambda self, node: self.add_add_sub_op(
+            node, NNAPI_OperationCode.SUB, NNAPI_FuseCode.FUSED_NONE
+        ),
+        "aten::mul": lambda self, node: self.add_pointwise_simple_binary_broadcast_op(
+            node, NNAPI_OperationCode.MUL, NNAPI_FuseCode.FUSED_NONE
+        ),
+        "aten::div": lambda self, node: self.add_pointwise_simple_binary_broadcast_op(
+            node, NNAPI_OperationCode.DIV, NNAPI_FuseCode.FUSED_NONE
+        ),
+        "aten::relu": lambda self, node: self.add_pointwise_simple_unary_op(
+            node, NNAPI_OperationCode.RELU
+        ),
+        "aten::sigmoid": lambda self, node: self.add_pointwise_simple_unary_op(
+            node, NNAPI_OperationCode.LOGISTIC
+        ),
+        "aten::softmax": lambda self, node: self.add_softmax(node),
+        "aten::hardtanh": lambda self, node: self.add_hardtanh(node),
+        "aten::avg_pool2d": lambda self, node: self.add_avg_pool2d(node),
+        "aten::max_pool2d": lambda self, node: self.add_pool2d_node(
+            node, NNAPI_OperationCode.MAX_POOL_2D
+        ),
+        "aten::adaptive_avg_pool2d": lambda self, node: self.add_adaptive_avg_pool2d(
+            node
+        ),
+        "aten::upsample_nearest2d": lambda self, node: self.add_upsample_nearest2d(
+            node
+        ),
+        "aten::prelu": lambda self, node: self.add_prelu_op(node),
+        "aten::addmm": lambda self, node: self.add_addmm(node),
+        "aten::linear": lambda self, node: self.add_linear(node),
+        "aten::_convolution": lambda self, node: self.add_conv_underscore(node),
+        "aten::conv2d": lambda self, node: self.add_conv2d(node),
+        "aten::log_softmax": lambda self, node: self.add_log_softmax(node),
+        "quantized::linear": lambda self, node: self.add_qlinear(node),
+        "quantized::conv2d": lambda self, node: self.add_qconv2d(
+            node, NNAPI_FuseCode.FUSED_NONE
+        ),
+        "quantized::conv2d_relu": lambda self, node: self.add_qconv2d(
+            node, NNAPI_FuseCode.FUSED_RELU
+        ),
+        "quantized::conv_transpose2d": lambda self, node: self.add_qconv2d(
+            node, NNAPI_FuseCode.FUSED_NONE, transpose=True
+        ),
+        "quantized::add": lambda self, node: self.add_qadd(
+            node, NNAPI_OperationCode.ADD, NNAPI_FuseCode.FUSED_NONE
+        ),
+        "quantized::add_relu": lambda self, node: self.add_qadd(
+            node, NNAPI_OperationCode.ADD, NNAPI_FuseCode.FUSED_RELU
+        ),
+        "quantized::mul": lambda self, node: self.add_qadd(
+            node, NNAPI_OperationCode.MUL, NNAPI_FuseCode.FUSED_NONE
+        ),
+    }
+
+    def add_node(self, node):
+        adder = self.ADDER_MAP.get(node.kind())
+        if not adder:
+            raise Exception(f"Unsupported node kind ({node.kind()!r}) in node {node!r}")
+        adder(self, node)
+
+    def _identity(self, node):
+        in_id, in_oper = self.get_tensor_operand_by_jitval(node.inputsAt(0))
+        jitval = node.outputsAt(0)
+        self.jitval_operand_map[jitval] = in_id
+
+    def add_getattr(self, node):
+        assert node.inputsSize() == 1
+        assert node.outputsSize() == 1
+        obj_ctype, obj = self.get_constant_value(node.inputsAt(0))
+        assert str(obj_ctype).startswith("__torch__.")
+        name = node.s("name")
+        value = getattr(obj, name)
+        output = node.outputsAt(0)
+        ctype = output.type()
+        self.add_constant_value(output, ctype, value)
+
+    def add_constant_node(self, node):
+        assert node.inputsSize() == 0
+        assert node.outputsSize() == 1
+        output = node.outputsAt(0)
+        ctype = output.type()
+        value = output.toIValue()
+        self.add_constant_value(output, ctype, value)
+
+    def add_list_construct(self, node):
+        assert node.outputsSize() == 1
+        output = node.outputsAt(0)
+        ctype = output.type()
+        const_vals: Optional[List] = []
+        tensors: Optional[List] = []
+        for inp in node.inputs():
+            if const_vals is not None and inp in self.constants:
+                _, val = self.get_constant_value(inp)
+                const_vals.append(val)
+            else:
+                const_vals = None
+            if tensors is not None and inp.type().kind() == "TensorType":
+                tensors.append(inp)
+            else:
+                tensors = None
+
+        if const_vals is not None:
+            # NOTE: Now that TorchScript supports list constants,
+            # this code path might not be used anymore.
+            self.add_constant_value(output, ctype, const_vals)
+        if tensors is not None:
+            self.add_tensor_sequence(output, tensors)
+        if const_vals is None and tensors is None:
+            raise Exception(
+                f"Unable to handle ListConstruct node.  Neither all constants nor all tensors. {node!r}"
+            )
+
+    def add_tuple_construct(self, node):
+        assert node.outputsSize() == 1
+        output = node.outputsAt(0)
+        values = list(node.inputs())
+        self.add_tensor_sequence(output, values)
+
+    def add_unsqueeze(self, node):
+        assert node.inputsSize() == 2
+        assert node.outputsSize() == 1
+
+        in_id, in_oper = self.get_tensor_operand_by_jitval_fixed_size(node.inputsAt(0))
+
+        _, dim = self.get_constant_value(node.inputsAt(1), "IntType")
+        assert in_oper.dim_order == DimOrder.PRESUMED_CONTIGUOUS
+
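+        # A negative dim counts from the end of the output shape, which has one extra dim.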
+        real_dim = dim if dim >= 0 else dim + len(in_oper.shape) + 1
+        out_shape_list = list(in_oper.shape)
+        out_shape_list.insert(real_dim, 1)
+        out_shape = tuple(out_shape_list)
+        out_oper = in_oper._replace(shape=out_shape)
+
+        inputs = [None] * 2
+        inputs[0] = in_id
+        inputs[1] = self.add_immediate_int_scalar(dim)
+
+        outputs = [None] * 1
+        outputs[0] = self.add_tensor_operand(node.outputsAt(0), out_oper)
+
+        self.add_operation(NNAPI_OperationCode.EXPAND_DIMS, inputs, outputs)
+
+    def add_to(self, node):
+        # Handle to("cpu") / to("gpu") case
+        self._identity(node)
+
+    def add_reshape(self, node):
+        assert node.inputsSize() == 2
+        assert node.outputsSize() == 1
+
+        in_id, in_oper = self.get_tensor_operand_by_jitval_fixed_size(node.inputsAt(0))
+
+        shape_ctype, shape = self.get_constant_value(node.inputsAt(1))
+        assert shape_ctype.kind() == "ListType"
+        assert shape_ctype.getElementType().kind() == "IntType"
+        is_trivial_reshape = len(shape) == 2 and shape[1] == -1
+
+        if in_oper.dim_order != DimOrder.PRESUMED_CONTIGUOUS and not is_trivial_reshape:
+            raise Exception(
+                "Currently, reshape is only supported on NHWC tensors if the target size is [X, -1]."
+            )
+
+        # Bit of a hack here.  Use a real tensor to infer the output shape.
+        out_shape = torch.zeros(1).expand(in_oper.shape).reshape(shape).shape
+        out_oper = in_oper._replace(
+            shape=out_shape, dim_order=DimOrder.PRESUMED_CONTIGUOUS
+        )
+
+        inputs = [None] * 2
+        inputs[0] = in_id
+        inputs[1] = self.add_immediate_int_vector(shape)
+
+        outputs = [None] * 1
+        outputs[0] = self.add_tensor_operand(node.outputsAt(0), out_oper)
+
+        self.add_operation(NNAPI_OperationCode.RESHAPE, inputs, outputs)
+
+    def add_flatten(self, node):
+        assert node.inputsSize() == 3
+        assert node.outputsSize() == 1
+
+        in_id, in_oper = self.get_tensor_operand_by_jitval(node.inputsAt(0))
+
+        start_ctype, start_dim = self.get_constant_value(node.inputsAt(1), "IntType")
+        end_ctype, end_dim = self.get_constant_value(node.inputsAt(2), "IntType")
+
+        # channels last with channels == 1 or (height & width both 1)
+        is_trivial_flatten = len(in_oper.shape) == 4 and (
+            in_oper.shape[1] == 1 or (in_oper.shape[2] == 1 and in_oper.shape[3] == 1)
+        )
+        if in_oper.dim_order != DimOrder.PRESUMED_CONTIGUOUS and not is_trivial_flatten:
+            raise Exception(
+                "Currently, flatten is not supported on NHWC tensors unless C=1 or H=W=1"
+            )
+
+        if start_dim < 0:
+            start_dim += len(in_oper.shape)
+        if end_dim < 0:
+            end_dim += len(in_oper.shape)
+
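+        # Collapse dims [start_dim, end_dim] into a single dim equal to their product.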
+        out_shape = (
+            in_oper.shape[:start_dim]
+            + (functools.reduce(operator.mul, in_oper.shape[start_dim : end_dim + 1]),)
+            + in_oper.shape[end_dim + 1 :]
+        )
+
+        if any(dim == 0 for dim in in_oper.shape[start_dim : end_dim + 1]):
+            raise Exception("Flattening flexible dims is not supported yet")
+        non_flattened_dims = in_oper.shape[:start_dim] + in_oper.shape[end_dim + 1 :]
+        if non_flattened_dims.count(0) > 1:
+            raise Exception("Only 1 dim can be flexible")
+
+        out_oper = in_oper._replace(
+            shape=out_shape, dim_order=DimOrder.PRESUMED_CONTIGUOUS
+        )
+        out_id = self.add_tensor_operand(node.outputsAt(0), out_oper)
+
+        for idx, dim in enumerate(out_shape):
+            if dim == 0:
+                self.forward_operand_shape(out_id, idx, in_id, in_oper.shape.index(0))
+
+        inputs_1 = tuple(dim if dim != 0 else -1 for dim in out_shape)
+        inputs = [None] * 2
+        inputs[0] = in_id
+        inputs[1] = self.add_immediate_int_vector(inputs_1)
+
+        outputs = [None] * 1
+        outputs[0] = out_id
+
+        self.add_operation(NNAPI_OperationCode.RESHAPE, inputs, outputs)
+
+    def add_slice(self, node):
+        assert node.inputsSize() == 5
+        assert node.outputsSize() == 1
+
+        in_id, in_oper = self.get_tensor_operand_by_jitval(node.inputsAt(0))
+        _, dim_value = self.get_constant_value(node.inputsAt(1))
+        _, start_value = self.get_constant_value(node.inputsAt(2))
+        _, stop_value = self.get_constant_value(node.inputsAt(3))
+        _, step_value = self.get_constant_value(node.inputsAt(4))
+
+        if start_value is None:
+            start_value = 0
+        if stop_value is None:
+            stop_value = sys.maxsize
+
+        if start_value < 0:
+            start_value += in_oper.shape[dim_value]
+        elif start_value == sys.maxsize:
+            start_value = 0
+
+        if start_value == 0 and stop_value == sys.maxsize:
+            self._identity(node)
+            return
+
+        if in_oper.shape[dim_value] == 0:
+            raise Exception("Unable to slice with flexible shape")
+
+        if stop_value < 0:
+            stop_value += in_oper.shape[dim_value]
+        elif stop_value == sys.maxsize:
+            stop_value = in_oper.shape[dim_value]
+
+        if start_value >= stop_value:
+            raise Exception("Slice start value should be less than stop value")
+
+        out_len = (stop_value - start_value) // step_value
+        out_shape = tuple(
+            out_len if i == dim_value else dim for i, dim in enumerate(in_oper.shape)
+        )
+        out_id = self.add_tensor_operand(
+            node.outputsAt(0), in_oper._replace(shape=out_shape)
+        )
+
+        # flex inputs
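+        # For flexible (size-0) dims, forward the input's runtime size and set the
+        # corresponding end_mask bit so the slice runs to the end of that dim.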
+        end_mask = 0
+        for idx, dim in enumerate(out_shape):
+            if dim == 0:
+                self.forward_operand_shape(out_id, idx, in_id, idx)
+                end_mask |= 1 << idx
+
+        inputs = [None] * 7
+        inputs[0] = in_id
+        inputs[1] = self.add_immediate_int_vector(
+            [start_value if i == dim_value else 0 for i in range(len(in_oper.shape))]
+        )
+        inputs[2] = self.add_immediate_int_vector(
+            [
+                stop_value if i == dim_value else dim
+                for i, dim in enumerate(in_oper.shape)
+            ]
+        )
+        inputs[3] = self.add_immediate_int_vector(
+            [step_value if i == dim_value else 1 for i in range(len(in_oper.shape))]
+        )
+        inputs[4] = self.add_immediate_int_scalar(0)  # begin mask
+        inputs[5] = self.add_immediate_int_scalar(end_mask)
+        inputs[6] = self.add_immediate_int_scalar(0)  # shrink axis mask
+
+        outputs = [None] * 1
+        outputs[0] = out_id
+
+        self.add_operation(NNAPI_OperationCode.STRIDED_SLICE, inputs, outputs)
+
+    def add_size(self, node):
+        assert node.inputsSize() == 2
+        assert node.outputsSize() == 1
+
+        _, in_oper = self.get_tensor_operand_by_jitval_fixed_size(node.inputsAt(0))
+        _, value = self.constants[node.inputsAt(1)]
+        res = in_oper.shape[value]
+        output = node.outputsAt(0)
+        self.add_constant_value(output, output.type(), res)
+
+    def add_cat(self, node):
+        assert node.inputsSize() == 2
+        assert node.outputsSize() == 1
+
+        tensors = self.tensor_sequences[node.inputsAt(0)]
+        _, dim = self.get_constant_value(node.inputsAt(1), "IntType")
+
+        assert len(tensors) > 0
+        in_ids = []
+        out_oper = None
+        out_dim_size = 0
+        for inp in tensors:
+            in_id, in_oper = self.get_tensor_operand_by_jitval(inp)
+            if out_oper is None:
+                out_shape = change_element(in_oper.shape, dim, -1)
+                out_oper = in_oper._replace(shape=out_shape)
+            assert in_oper.op_type == out_oper.op_type
+            assert in_oper.dim_order == out_oper.dim_order
+            assert change_element(in_oper.shape, dim, -1) == change_element(
+                out_oper.shape, dim, -1
+            )
+            # TODO: Possibly check scale and zero point.
+            in_ids.append(in_id)
+            # TODO: Possibly support variable-sized inputs.
+            out_dim_size += in_oper.shape[dim]
+
+        assert out_oper is not None
+        out_oper = out_oper._replace(
+            shape=change_element(out_oper.shape, dim, out_dim_size)
+        )
+
+        if in_oper.dim_order == DimOrder.CHANNELS_LAST:  # type: ignore[possibly-undefined]
+            assert len(out_oper.shape) == 4
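+            # Map the PyTorch NCHW dim index onto NNAPI's NHWC layout.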
+            nnapi_dim = [0, 3, 1, 2][dim]
+        else:
+            nnapi_dim = dim
+
+        out_id = self.add_tensor_operand(node.outputsAt(0), out_oper)
+        for idx, d in enumerate(out_oper.shape):
+            if d == 0:
+                if idx == dim:
+                    shape = " + ".join(flex_name(ip_id, dim) for ip_id in in_ids)
+                    self.compute_operand_shape(out_id, idx, shape)
+                else:
+                    self.forward_operand_shape(out_id, idx, in_ids[0], idx)
+
+        inputs = in_ids + [self.add_immediate_int_scalar(nnapi_dim)]
+
+        outputs = [None] * 1
+        outputs[0] = out_id
+
+        self.add_operation(NNAPI_OperationCode.CONCATENATION, inputs, outputs)
+
+    def add_mean(self, node):
+        assert node.inputsSize() == 4
+        assert node.outputsSize() == 1
+
+        in_id, in_oper = self.get_tensor_operand_by_jitval_fixed_size(node.inputsAt(0))
+        dim_ctype, dim = self.get_constant_value(node.inputsAt(1))
+        assert dim_ctype.kind() == "ListType"
+        assert dim_ctype.getElementType().kind() == "IntType"
+        _, keep_dim = self.get_constant_value(node.inputsAt(2), "BoolType")
+        # Expect None for dtype
+        self.get_constant_value(node.inputsAt(3), "NoneType")
+
+        if in_oper.dim_order == DimOrder.CHANNELS_LAST:
+            assert len(in_oper.shape) == 4
+            nnapi_dim = [[0, 3, 1, 2][d] for d in dim]
+        else:
+            nnapi_dim = dim
+
+        collapsed_dims = set()
+        for d in dim:
+            if d < 0:
+                d += len(in_oper.shape)
+            collapsed_dims.add(d)
+
+        if in_oper.dim_order == DimOrder.CHANNELS_LAST and not keep_dim:
+            assert collapsed_dims.issuperset({2, 3})
+            out_dim_order = DimOrder.PRESUMED_CONTIGUOUS
+        else:
+            out_dim_order = in_oper.dim_order
+
+        out_shape = []
+        for i, s in enumerate(in_oper.shape):
+            if i not in collapsed_dims:
+                out_shape.append(s)
+            elif keep_dim:
+                out_shape.append(1)
+
+        out_oper = in_oper._replace(shape=out_shape, dim_order=out_dim_order)
+
+        inputs = [None] * 3
+        inputs[0] = in_id
+        inputs[1] = self.add_immediate_int_vector(nnapi_dim)
+        inputs[2] = self.add_immediate_int_scalar(keep_dim)
+
+        outputs = [None] * 1
+        outputs[0] = self.add_tensor_operand(node.outputsAt(0), out_oper)
+
+        self.add_operation(NNAPI_OperationCode.MEAN, inputs, outputs)
+
+    def add_quantize(self, node):
+        assert node.inputsSize() == 4
+        assert node.outputsSize() == 1
+
+        in_id, in_oper = self.get_tensor_operand_by_jitval_fixed_size(node.inputsAt(0))
+        if in_oper.dim_order != DimOrder.CHANNELS_LAST:
+            raise Exception(
+                "Most hardware backends prefer NHWC quantized tensors.  "
+                "Try setting `t.nnapi_nhwc = True` on your tensor inputs.  "
+            )
+        _, scale = self.get_constant_value(node.inputsAt(1), "FloatType")
+        _, zero_point = self.get_constant_value(node.inputsAt(2), "IntType")
+        _, scalar_type = self.get_constant_value(node.inputsAt(3), "IntType")
+        if scalar_type != TorchScalarTypes.QUINT8.value:
+            raise Exception(
+                "PyTorch NNAPI export only supports quantized tensors "
+                "with the quint8 dtype."
+            )
+        op_type = NNAPI_OperandCode.TENSOR_QUANT8_ASYMM
+
+        out_oper = in_oper._replace(
+            op_type=op_type,
+            scale=scale,
+            zero_point=zero_point,
+        )
+
+        inputs = [None] * 1
+        inputs[0] = in_id
+
+        outputs = [None] * 1
+        outputs[0] = self.add_tensor_operand(node.outputsAt(0), out_oper)
+
+        self.add_operation(NNAPI_OperationCode.QUANTIZE, inputs, outputs)
+
+    def add_dequantize(self, node):
+        assert node.inputsSize() == 1
+        assert node.outputsSize() == 1
+
+        in_id, in_oper = self.get_tensor_operand_by_jitval_fixed_size(node.inputsAt(0))
+        out_oper = in_oper._replace(
+            op_type=NNAPI_OperandCode.TENSOR_FLOAT32,
+            scale=0.0,
+            zero_point=0,
+        )
+
+        inputs = [None] * 1
+        inputs[0] = in_id
+
+        outputs = [None] * 1
+        outputs[0] = self.add_tensor_operand(node.outputsAt(0), out_oper)
+
+        self.add_operation(NNAPI_OperationCode.DEQUANTIZE, inputs, outputs)
+
+    def add_pointwise_simple_unary_op(self, node, opcode):
+        assert node.inputsSize() == 1
+        assert node.outputsSize() == 1
+
+        in_id, in_oper = self.get_tensor_operand_by_jitval(node.inputsAt(0))
+
+        out_oper = in_oper
+        if opcode == NNAPI_OperationCode.LOGISTIC:
+            # NNAPI docs: For ANEURALNETWORKS_TENSOR_QUANT8_ASYMM, the scale
+            # must be 1.f / 256 and the zeroPoint must be 0.
+            # https://fburl.com/h52stoog
+            if in_oper.op_type == NNAPI_OperandCode.TENSOR_QUANT8_ASYMM:
+                out_oper = in_oper._replace(zero_point=0, scale=1.0 / 256)
+
+        out_id = self.add_tensor_operand(node.outputsAt(0), out_oper)
+
+        for idx, dim in enumerate(in_oper.shape):
+            if dim == 0:
+                self.forward_operand_shape(out_id, idx, in_id, idx)
+
+        inputs = [None] * 1
+        inputs[0] = in_id
+
+        outputs = [None] * 1
+        outputs[0] = out_id
+
+        self.add_operation(opcode, inputs, outputs)
+
+    def _do_add_binary(self, node, opcode, fuse_code, *, qparams=None):  # noqa: D401
+        """Helper for pointwise binary broadcast ops with superfluous extra args."""
+        assert node.outputsSize() == 1
+
+        assert node.inputsAt(0).type().kind() == "TensorType"
+        assert node.inputsAt(1).type().kind() == "TensorType"
+
+        if self.has_operand_for_jitval(node.inputsAt(0)):
+            in0_id, in0_oper = self.get_tensor_operand_by_jitval(node.inputsAt(0))
+            in1_id, in1_oper = self.get_tensor_operand_or_constant(
+                node.inputsAt(1), in0_oper.dim_order
+            )
+        elif self.has_operand_for_jitval(node.inputsAt(1)):
+            in1_id, in1_oper = self.get_tensor_operand_by_jitval(node.inputsAt(1))
+            in0_id, in0_oper = self.get_tensor_operand_or_constant(
+                node.inputsAt(0), in1_oper.dim_order
+            )
+        else:
+            raise Exception(f"Can't do a NNAPI binary op: {opcode} on two constants")
+
+        assert in0_oper.op_type == in1_oper.op_type
+        in0_id, in0_oper, in1_id, in1_oper = self.transpose_for_broadcast(
+            in0_id, in0_oper, in1_id, in1_oper
+        )
+        # NOTE: PyTorch and NNAPI have the same broadcast semantics.
+        out_shape = broadcast_shapes(in0_oper.shape, in1_oper.shape)
+        out_oper = in0_oper._replace(shape=out_shape)
+        if qparams is not None:
+            scale, zp = qparams
+            out_oper = out_oper._replace(scale=scale, zero_point=zp)
+
+        out_id = self.add_tensor_operand(node.outputsAt(0), out_oper)
+        for idx, (d0, d1) in enumerate(zip(in0_oper.shape, in1_oper.shape)):
+            if d0 == 1 and d1 == 0:
+                self.forward_operand_shape(out_id, idx, in1_id, idx)
+            elif d0 == 0 and d1 == 1:
+                self.forward_operand_shape(out_id, idx, in0_id, idx)
+            elif d0 == 0 and d1 == 0:
+                self.flexible_shape_computation_lines.append(
+                    f"assert {flex_name(in0_id, idx)} == {flex_name(in1_id, idx)}"
+                )
+                self.forward_operand_shape(out_id, idx, in0_id, idx)
+
+        inputs = [None] * 3
+        inputs[0] = in0_id
+        inputs[1] = in1_id
+        inputs[2] = self.add_immediate_int_scalar(fuse_code)
+
+        outputs = [None] * 1
+        outputs[0] = out_id
+
+        self.add_operation(opcode, inputs, outputs)
+
+    def add_pointwise_simple_binary_broadcast_op(self, node, opcode, fuse_code):
+        assert node.inputsSize() == 2
+        self._do_add_binary(node, opcode, fuse_code)
+
+    def add_add_sub_op(self, node, opcode, fuse_code):
+        assert node.inputsSize() == 3
+
+        _, alpha = self.get_constant_value(node.inputsAt(2), "IntType")
+        if alpha != 1:
+            raise Exception("NNAPI does not support add/sub with alpha.")
+
+        self._do_add_binary(node, opcode, fuse_code)
+
+    def add_qadd(self, node, opcode, fuse_code):
+        assert node.inputsSize() == 4
+
+        _, scale = self.get_constant_value(node.inputsAt(2), "FloatType")
+        _, zero_point = self.get_constant_value(node.inputsAt(3), "IntType")
+
+        self._do_add_binary(node, opcode, fuse_code, qparams=(scale, zero_point))
+
+    def add_softmax(self, node):
+        assert node.inputsSize() == 3
+        in_id, in_oper = self.get_tensor_operand_by_jitval(node.inputsAt(0))
+
+        _, softmax_dim = self.get_constant_value(node.inputsAt(1), "IntType")
+
+        out_id = self.add_tensor_operand(node.outputsAt(0), in_oper)
+        for dim, size in enumerate(in_oper.shape):
+            if size == 0:
+                self.forward_operand_shape(out_id, dim, in_id, dim)
+
+        inputs = [None] * 3
+        inputs[0] = in_id
+        inputs[1] = self.add_immediate_float_scalar(
+            1.0
+        )  # positive scaling factor of exponent, beta
+        inputs[2] = self.add_immediate_int_scalar(softmax_dim)
+
+        outputs = [None] * 1
+        outputs[0] = out_id
+
+        self.add_operation(NNAPI_OperationCode.SOFTMAX, inputs, outputs)
+
+    def add_hardtanh(self, node):
+        assert node.inputsSize() == 3
+        assert node.outputsSize() == 1
+
+        in_id, in_oper = self.get_tensor_operand_by_jitval_fixed_size(node.inputsAt(0))
+        _, min_val = self.get_constant_value(node.inputsAt(1), "FloatType")
+        _, max_val = self.get_constant_value(node.inputsAt(2), "FloatType")
+
+        op_map = {
+            (-1, 1): NNAPI_OperationCode.RELU1,
+            (0, 6): NNAPI_OperationCode.RELU6,  # noqa: E201
+        }
+
+        opcode = op_map.get((min_val, max_val))
+        if opcode is None:
+            raise Exception("NNAPI only supports hardtanh with args (-1, 1) or (0, 6).")
+
+        inputs = [None] * 1
+        inputs[0] = in_id
+
+        outputs = [None] * 1
+        outputs[0] = self.add_tensor_operand(node.outputsAt(0), in_oper)
+
+        self.add_operation(opcode, inputs, outputs)
+
+    def add_prelu_op(self, node):
+        assert node.inputsSize() == 2
+        assert node.outputsSize() == 1
+
+        assert node.inputsAt(0).type().kind() == "TensorType"
+        assert node.inputsAt(1).type().kind() == "TensorType"
+
+        in_id, in_oper = self.get_tensor_operand_by_jitval(node.inputsAt(0))
+        w_id, w_oper = self.get_tensor_operand_for_weight(node.inputsAt(1))
+        assert len(w_oper.shape) == 1
+        assert w_oper.shape[0] > 0
+        if w_oper.shape[0] > 1:
+            if in_oper.use_nchw():
+                # TODO: Support this by adding trailing 1 dims.
+                raise Exception(
+                    "Per-channel PReLU only supports channels_last right now."
+                )
+
+        out_id = self.add_tensor_operand(node.outputsAt(0), in_oper)
+        for dim, size in enumerate(in_oper.shape):
+            if size > 0:
+                pass
+            elif dim <= 1:
+                raise Exception("PReLU requires fixed size for dim 0 and dim 1.")
+            else:
+                self.forward_operand_shape(out_id, dim, in_id, dim)
+
+        inputs = [None] * 2
+        inputs[0] = in_id
+        inputs[1] = w_id
+
+        outputs = [None] * 1
+        outputs[0] = out_id
+
+        self.add_operation(NNAPI_OperationCode.PRELU, inputs, outputs)
+
+    def add_pool2d_node(self, node, opcode):
+        assert node.inputsSize() == 6
+        assert node.outputsSize() == 1
+        image, kernel, stride, padding, dilation, ceil_mode = node.inputs()
+
+        stride = stride or kernel
+
+        # TODO: Validate ceil_mode semantics.
+
+        args = self.get_conv_pool_args_2d_from_jit(
+            self.get_size_arg(kernel), stride, padding, dilation
+        )
+        if args.dilation_h != 1 or args.dilation_w != 1:
+            raise Exception("NNAPI does not support dilated pooling.")
+
+        image_id, image_oper = self.get_tensor_operand_by_jitval_fixed_size(image)
+        assert len(image_oper.shape) == 4
+
+        out_shape = get_conv_pool_shape(
+            image_oper.shape, args, image_oper.shape[1], False
+        )
+        use_nchw = image_oper.use_nchw()
+
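+        # Eleven inputs: image, pad l/r/t/b, stride w/h, kernel w/h, fuse code, NCHW layout flag.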
+        inputs = [None] * 11
+        inputs[0] = image_id
+        inputs[1] = self.add_immediate_int_scalar(args.pad_l)
+        inputs[2] = self.add_immediate_int_scalar(args.pad_r)
+        inputs[3] = self.add_immediate_int_scalar(args.pad_t)
+        inputs[4] = self.add_immediate_int_scalar(args.pad_b)
+        inputs[5] = self.add_immediate_int_scalar(args.stride_w)
+        inputs[6] = self.add_immediate_int_scalar(args.stride_h)
+        inputs[7] = self.add_immediate_int_scalar(args.kernel_w)
+        inputs[8] = self.add_immediate_int_scalar(args.kernel_h)
+        inputs[9] = self.add_immediate_int_scalar(NNAPI_FuseCode.FUSED_NONE)
+        inputs[10] = self.add_immediate_bool_scalar(use_nchw)
+
+        outputs = [None] * 1
+        outputs[0] = self.add_tensor_operand(
+            node.outputsAt(0), image_oper._replace(shape=out_shape)
+        )
+
+        self.add_operation(opcode, inputs, outputs)
+
+    def add_avg_pool2d(self, node):
+        assert node.inputsSize() == 7
+        assert node.outputsSize() == 1
+        (
+            image,
+            kernel,
+            stride,
+            padding,
+            ceil_mode,
+            count_include_pad,
+            divisor_override,
+        ) = node.inputs()
+
+        _, count_include_pad_value = self.get_constant_value(count_include_pad)
+        _, divisor_override_value = self.get_constant_value(divisor_override)
+        if not count_include_pad_value or divisor_override_value:
+            raise Exception(
+                "NNAPI doesn't support count_include_pad=False or divisor_override"
+            )
+
+        args = self.get_conv_pool_args_2d_from_jit(
+            self.get_size_arg(kernel), stride, padding
+        )
+
+        image_id, image_oper = self.get_tensor_operand_by_jitval(image)
+        assert len(image_oper.shape) == 4
+
+        out_shape = get_conv_pool_shape(
+            image_oper.shape, args, image_oper.shape[1], False
+        )
+        use_nchw = image_oper.use_nchw()
+
+        inputs = [None] * 11
+        inputs[0] = image_id
+        inputs[1] = self.add_immediate_int_scalar(args.pad_l)
+        inputs[2] = self.add_immediate_int_scalar(args.pad_r)
+        inputs[3] = self.add_immediate_int_scalar(args.pad_t)
+        inputs[4] = self.add_immediate_int_scalar(args.pad_b)
+        inputs[5] = self.add_immediate_int_scalar(args.stride_w)
+        inputs[6] = self.add_immediate_int_scalar(args.stride_h)
+        inputs[7] = self.add_immediate_int_scalar(args.kernel_w)
+        inputs[8] = self.add_immediate_int_scalar(args.kernel_h)
+        inputs[9] = self.add_immediate_int_scalar(NNAPI_FuseCode.FUSED_NONE)
+        inputs[10] = self.add_immediate_bool_scalar(use_nchw)
+
+        outputs = [None] * 1
+        out_id = self.add_tensor_operand(
+            node.outputsAt(0), image_oper._replace(shape=out_shape)
+        )
+        self._handle_conv_pool_flexible_input(out_id, image, args, False)
+        outputs[0] = out_id
+
+        self.add_operation(NNAPI_OperationCode.AVERAGE_POOL_2D, inputs, outputs)
+
+    def add_adaptive_avg_pool2d(self, node):
+        assert node.inputsSize() == 2
+        assert node.outputsSize() == 1
+
+        image_id, image_oper = self.get_tensor_operand_by_jitval_fixed_size(
+            node.inputsAt(0)
+        )
+        assert len(image_oper.shape) == 4
+
+        size_ctype, size_arg = self.get_constant_value(node.inputsAt(1))
+        assert size_ctype.kind() == "ListType"
+        assert size_ctype.getElementType().kind() == "IntType"
+        if size_arg != [1, 1]:
+            raise Exception(
+                "NNAPI only supports adaptive_avg_pool2d with output size (1, 1)."
+            )
+
+        out_shape = image_oper.shape[0:2] + tuple(size_arg)
+        use_nchw = image_oper.use_nchw()
+
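+        # Lower adaptive (1, 1) pooling to a plain average pool whose kernel covers the full H and W.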
+        inputs = [None] * 11
+        inputs[0] = image_id
+        inputs[1] = self.add_immediate_int_scalar(0)
+        inputs[2] = self.add_immediate_int_scalar(0)
+        inputs[3] = self.add_immediate_int_scalar(0)
+        inputs[4] = self.add_immediate_int_scalar(0)
+        inputs[5] = self.add_immediate_int_scalar(1)
+        inputs[6] = self.add_immediate_int_scalar(1)
+        inputs[7] = self.add_immediate_int_scalar(image_oper.shape[3])
+        inputs[8] = self.add_immediate_int_scalar(image_oper.shape[2])
+        inputs[9] = self.add_immediate_int_scalar(NNAPI_FuseCode.FUSED_NONE)
+        inputs[10] = self.add_immediate_bool_scalar(use_nchw)
+
+        outputs = [None] * 1
+        outputs[0] = self.add_tensor_operand(
+            node.outputsAt(0), image_oper._replace(shape=out_shape)
+        )
+
+        self.add_operation(NNAPI_OperationCode.AVERAGE_POOL_2D, inputs, outputs)
+
+    def add_upsample_nearest2d(self, node):
+        assert node.inputsSize() == 3 or node.inputsSize() == 4
+        assert node.outputsSize() == 1
+        if node.inputsSize() == 3:
+            image, size_jit, scale_jit = node.inputs()
+        else:
+            image, size_jit, scale_h_jit, scale_w_jit = node.inputs()
+        size_ctype, size_arg = self.get_constant_value(size_jit)
+
+        if node.inputsSize() == 3:
+            scale_ctype, scale_arg = self.get_constant_value(scale_jit)  # type: ignore[possibly-undefined]
+        else:
+            scale_h_ctype, scale_h_arg = self.get_constant_value(scale_h_jit)  # type: ignore[possibly-undefined]
+            scale_w_ctype, scale_w_arg = self.get_constant_value(scale_w_jit)  # type: ignore[possibly-undefined]
+
+            # The only way for the 4-argument overload of upsample_nearest2d to
+            # have been added to the graph without error is if the scale_h and
+            # scale_w arguments are None
+            assert scale_h_ctype.kind() == "NoneType"
+            assert scale_w_ctype.kind() == "NoneType"
+
+            scale_ctype = scale_h_ctype
+            scale_arg = scale_h_arg
+
+        image_id, image_oper = self.get_tensor_operand_by_jitval(image)
+        assert len(image_oper.shape) == 4
+
+        if size_ctype.kind() != "NoneType" and scale_ctype.kind() != "NoneType":
+            raise Exception("Size and scale cannot both be non-None.")
+        elif size_ctype.kind() != "NoneType":
+            assert size_ctype.kind() == "ListType"
+            assert size_ctype.getElementType().kind() == "IntType"
+            assert scale_ctype.kind() == "NoneType"
+            assert scale_arg is None
+            assert isinstance(size_arg, list)
+            assert size_arg
+            assert all(isinstance(val, int) for val in size_arg)
+            if len(size_arg) == 1:
+                size_arg = size_arg * 2
+            assert len(size_arg) == 2
+            out_h = size_arg[0]
+            out_w = size_arg[1]
+            arg_h = self.add_immediate_int_scalar(out_h)
+            arg_w = self.add_immediate_int_scalar(out_w)
+        elif scale_ctype.kind() != "NoneType":
+            assert scale_ctype.kind() == "ListType"
+            assert scale_ctype.getElementType().kind() == "FloatType"
+            assert size_ctype.kind() == "NoneType"
+            assert size_arg is None
+            assert isinstance(scale_arg, list)
+            assert scale_arg
+            assert all(isinstance(val, float) for val in scale_arg)
+            if len(scale_arg) == 1:
+                scale_arg = scale_arg * 2
+            assert len(scale_arg) == 2
+            out_h = int(scale_arg[0] * image_oper.shape[2])
+            out_w = int(scale_arg[1] * image_oper.shape[3])
+            arg_h = self.add_immediate_float_scalar(scale_arg[0])
+            arg_w = self.add_immediate_float_scalar(scale_arg[1])
+        else:
+            raise Exception("Size and scale cannot both be None.")
+
+        out_shape = (image_oper.shape[0], image_oper.shape[1], out_h, out_w)
+        use_nchw = image_oper.use_nchw()
+        out_id = self.add_tensor_operand(
+            node.outputsAt(0), image_oper._replace(shape=out_shape)
+        )
+
+        if image_oper.shape[0] == 0 or image_oper.shape[1] == 0:
+            raise Exception("Flexible batch or channels not supported")
+
+        # Handle variable input size
+        for dim in (2, 3):  # h, w indices
+            if image_oper.shape[dim] == 0:
+                if size_ctype.kind() != "NoneType":
+                    self.compute_operand_shape(out_id, dim, size_arg[dim - 2])
+                elif scale_ctype.kind() != "NoneType":
+                    self.compute_operand_shape(
+                        out_id,
+                        dim,
+                        f"int({scale_arg[dim - 2]} * {flex_name(image_id, dim)})",
+                    )
+                else:
+                    raise Exception("Size and scale cannot both be None.")
+
+        inputs = [None] * 4
+        inputs[0] = image_id
+        inputs[1] = arg_w
+        inputs[2] = arg_h
+        inputs[3] = self.add_immediate_bool_scalar(use_nchw)
+
+        outputs = [None] * 1
+        outputs[0] = out_id
+
+        self.add_operation(NNAPI_OperationCode.RESIZE_NEAREST_NEIGHBOR, inputs, outputs)
+
+    def add_addmm(self, node):
+        assert node.inputsSize() == 5
+        assert node.outputsSize() == 1
+        jit_bias, jit_input, jit_weight, jit_beta, jit_alpha = node.inputs()
+
+        for jitval in (jit_beta, jit_alpha):
+            scale_ctype, scale_value = self.get_constant_value(jitval)
+            assert scale_ctype.kind() in ("IntType", "FloatType")
+            if scale_value != 1:
+                raise Exception(
+                    "NNAPI Fully-Connected does not support alpha and beta."
+                )
+
+        self.add_addmm_or_linear(node, True, jit_input, jit_weight, jit_bias)
+
+    def add_linear(self, node):
+        assert node.inputsSize() == 3
+        assert node.outputsSize() == 1
+        jit_input, jit_weight, jit_bias = node.inputs()
+
+        self.add_addmm_or_linear(node, False, jit_input, jit_weight, jit_bias)
+
+    def add_addmm_or_linear(
+        self, node, transpose_weight, jit_input, jit_weight, jit_bias
+    ):
+        input_id, input_oper = self.get_tensor_operand_by_jitval(jit_input)
+        bias_id, bias_oper = self.get_tensor_operand_for_weight(jit_bias)
+
+        assert len(input_oper.shape) == 2
+        assert len(bias_oper.shape) == 1
+
+        # TODO: Transform at load time to share weights with CPU model.
+        _, weight_tensor = self.get_constant_value(jit_weight, "TensorType")
+        assert len(weight_tensor.shape) == 2
+        if transpose_weight:
+            nnapi_weight_tensor = weight_tensor.t().contiguous()
+        else:
+            nnapi_weight_tensor = weight_tensor.contiguous()
+        weight_id = self.add_tensor_operand_for_weight(nnapi_weight_tensor)
+        weight_oper = self.operands[weight_id]
+
+        out_shape = (input_oper.shape[0], weight_oper.shape[0])
+        out_id = self.add_tensor_operand(
+            node.outputsAt(0), input_oper._replace(shape=out_shape)
+        )
+
+        if input_oper.shape[0] == 0:
+            self.forward_operand_shape(out_id, 0, input_id, 0)
+
+        inputs = [None] * 4
+        inputs[0] = input_id
+        inputs[1] = weight_id
+        inputs[2] = bias_id
+        inputs[3] = self.add_immediate_int_scalar(NNAPI_FuseCode.FUSED_NONE)
+
+        outputs = [None] * 1
+        outputs[0] = out_id
+
+        self.add_operation(NNAPI_OperationCode.FULLY_CONNECTED, inputs, outputs)
+
+    def add_qlinear(self, node):
+        assert node.inputsSize() == 4
+        assert node.outputsSize() == 1
+        (
+            jit_input,
+            jit_packed_weight,
+            jit_scale,
+            jit_zero_point,
+        ) = node.inputs()
+
+        input_id, input_oper = self.get_tensor_operand_by_jitval_fixed_size(jit_input)
+        # TODO: Support automatic reshape
+        assert len(input_oper.shape) == 2
+
+        _, out_scale = self.get_constant_value(jit_scale, "FloatType")
+        _, out_zero_point = self.get_constant_value(jit_zero_point, "IntType")
+        weight_ctype, packed_weight = self.get_constant_value(jit_packed_weight)
+        assert weight_ctype.name() == "LinearPackedParamsBase"
+        raw_weight, raw_bias = packed_weight.__getstate__()[0]
+        assert raw_bias is not None
+
+        assert len(raw_weight.shape) == 2
+        assert len(raw_bias.shape) == 1
+        assert raw_bias.shape[0] == raw_weight.shape[0]
+        assert raw_weight.shape[1] == input_oper.shape[1]
+
+        assert raw_weight.qscheme() == torch.per_tensor_affine
+        if raw_weight.dtype == torch.quint8:
+            unsigned_weight = raw_weight
+        else:
+            assert raw_weight.dtype == torch.qint8
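+            # NNAPI uses unsigned asymmetric quantization, so shift the qint8 values
+            # and zero point by +128 to re-express the weight as quint8.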
+            unsigned_weight = torch._make_per_tensor_quantized_tensor(
+                (raw_weight.int_repr().int() + 128).to(torch.uint8),
+                scale=raw_weight.q_scale(),
+                zero_point=raw_weight.q_zero_point() + 128,
+            )
+        weight_scale = unsigned_weight.q_scale()
+        bias_scale = input_oper.scale * weight_scale
+        int_bias = torch.quantize_per_tensor(raw_bias, bias_scale, 0, torch.qint32)
+        bias_id = self.add_tensor_operand_for_weight(int_bias)
+
+        multiplier = input_oper.scale * weight_scale / out_scale
+        assert multiplier > 0
+        if multiplier >= 1:
+            raise Exception(
+                "Quantized convolution multiplier is greater than 1.  "
+                "This is supported by NNAPI, but not by most hardware backends.  "
+                "Try training a model without quantization-aware training.  "
+            )
+
+        # TODO: Transform at load time to share weights with CPU model.
+        nnapi_weight_tensor = unsigned_weight.contiguous()
+        weight_id = self.add_tensor_operand_for_weight(nnapi_weight_tensor)
+        weight_oper = self.operands[weight_id]
+
+        out_shape = (input_oper.shape[0], weight_oper.shape[0])
+        out_oper = input_oper._replace(
+            shape=out_shape,
+            scale=out_scale,
+            zero_point=out_zero_point,
+        )
+
+        inputs = [None] * 4
+        inputs[0] = input_id
+        inputs[1] = weight_id
+        inputs[2] = bias_id
+        inputs[3] = self.add_immediate_int_scalar(NNAPI_FuseCode.FUSED_NONE)
+
+        outputs = [None] * 1
+        outputs[0] = self.add_tensor_operand(node.outputsAt(0), out_oper)
+
+        self.add_operation(NNAPI_OperationCode.FULLY_CONNECTED, inputs, outputs)
+
+    def get_optional_bias(self, jit_bias, weight_tensor, transpose=False):
+        ctype, value = self.get_constant_value(jit_bias)
+        if ctype.kind() == "NoneType":
+            bias_idx = 1 if transpose else 0
+            nnapi_bias_tensor = torch.zeros(
+                weight_tensor.size()[bias_idx], dtype=weight_tensor.dtype
+            )
+            bias_id = self.add_tensor_operand_for_weight(nnapi_bias_tensor)
+            bias_oper = self.operands[bias_id]
+            return bias_id, bias_oper
+        else:
+            return self.get_tensor_operand_for_weight(jit_bias)
+
+    def add_conv2d(self, node):
+        assert node.inputsSize() == 7
+        assert node.outputsSize() == 1
+
+        (
+            jit_image,
+            jit_weight,
+            jit_bias,
+            jit_stride,
+            jit_pad,
+            jit_dilation,
+            jit_groups,
+        ) = node.inputs()
+
+        _, weight_tensor = self.get_constant_value(jit_weight, "TensorType")
+        bias_id, bias_oper = self.get_optional_bias(jit_bias, weight_tensor)
+        args = self.get_conv_pool_args_2d_from_jit(
+            weight_tensor.shape[2:4], jit_stride, jit_pad, jit_dilation, jit_groups
+        )
+
+        return self.add_conv2d_common(
+            node.outputsAt(0),
+            0.0,
+            0,
+            jit_image,
+            weight_tensor,
+            bias_id,
+            args,
+            False,  # transpose
+            NNAPI_FuseCode.FUSED_NONE,
+        )
+
+    def add_conv_underscore(self, node):
+        assert node.inputsSize() == 13
+        assert node.outputsSize() == 1
+
+        (
+            jit_image,
+            jit_weight,
+            jit_bias,
+            jit_stride,
+            jit_pad,
+            jit_dilation,
+            jit_transpose,
+            _,
+            jit_groups,
+            _,
+            _,
+            _,
+            _,
+        ) = node.inputs()
+
+        _, weight_tensor = self.get_constant_value(jit_weight, "TensorType")
+        _, transpose = self.get_constant_value(jit_transpose)
+        bias_id, bias_oper = self.get_optional_bias(jit_bias, weight_tensor, transpose)
+        args = self.get_conv_pool_args_2d_from_jit(
+            weight_tensor.shape[2:4], jit_stride, jit_pad, jit_dilation, jit_groups
+        )
+
+        return self.add_conv2d_common(
+            node.outputsAt(0),
+            0.0,
+            0,
+            jit_image,
+            weight_tensor,
+            bias_id,
+            args,
+            transpose,
+            NNAPI_FuseCode.FUSED_NONE,
+        )
+
+    def add_log_softmax(self, node):
+        assert node.inputsSize() == 3
+        assert node.outputsSize() == 1
+
+        (jit_input, jit_dim, jit_half_to_float) = node.inputs()
+        input_id, input_oper = self.get_tensor_operand_by_jitval_fixed_size(jit_input)
+        _, dim = self.get_constant_value(jit_dim, "IntType")
+
+        out_shape = input_oper.shape
+
+        inputs = [None] * 3
+        inputs[0] = input_id
+        # specifying 1 as the scaling factor for the exponent, beta
+        inputs[1] = self.add_immediate_float_scalar(1)
+        inputs[2] = self.add_immediate_int_scalar(dim)
+
+        outputs = [None] * 1
+        outputs[0] = self.add_tensor_operand(
+            node.outputsAt(0), input_oper._replace(shape=out_shape)
+        )
+        self.add_operation(NNAPI_OperationCode.LOG_SOFTMAX, inputs, outputs)
+
+    def add_qconv2d(self, node, fuse_code, transpose=False):
+        assert node.inputsSize() == 4
+        assert node.outputsSize() == 1
+
+        (
+            jit_image,
+            jit_packed_weight,
+            jit_scale,
+            jit_zero_point,
+        ) = node.inputs()
+
+        _, out_scale = self.get_constant_value(jit_scale, "FloatType")
+        _, out_zero_point = self.get_constant_value(jit_zero_point, "IntType")
+        weight_ctype, packed_weight = self.get_constant_value(jit_packed_weight)
+        assert weight_ctype.name() == "Conv2dPackedParamsBase"
+        (
+            pack_version,
+            tensors,
+            opt_tensors,
+        ) = packed_weight.__getstate__()[0]
+        assert pack_version == "2"
+        packed_config, raw_weight = tensors
+        (raw_bias,) = opt_tensors
+        assert raw_bias is not None
+        args = self.get_conv_pool_args_2d_from_pack(
+            raw_weight.shape[2:4], packed_config
+        )
+
+        assert raw_weight.qscheme() == torch.per_tensor_affine
+        if raw_weight.dtype == torch.quint8:
+            unsigned_weight = raw_weight
+        else:
+            assert raw_weight.dtype == torch.qint8
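+            # As in add_qlinear: re-express the signed qint8 weight as quint8 by
+            # shifting the values and zero point by +128.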
+            unsigned_weight = torch._make_per_tensor_quantized_tensor(
+                (raw_weight.int_repr().int() + 128).to(torch.uint8),
+                scale=raw_weight.q_scale(),
+                zero_point=raw_weight.q_zero_point() + 128,
+            )
+        weight_scale = unsigned_weight.q_scale()
+        _, image_oper = self.get_tensor_operand_by_jitval(jit_image)
+        bias_scale = image_oper.scale * weight_scale
+        int_bias = torch.quantize_per_tensor(raw_bias, bias_scale, 0, torch.qint32)
+        bias_id = self.add_tensor_operand_for_weight(int_bias)
+
+        multiplier = image_oper.scale * weight_scale / out_scale
+        assert multiplier > 0
+        if multiplier >= 1:
+            raise Exception(
+                "Quantized convolution multiplier is greater than 1.  "
+                "This is supported by NNAPI, but not by most hardware backends.  "
+                "Try training a model without quantization-aware training.  "
+            )
+
+        return self.add_conv2d_common(
+            node.outputsAt(0),
+            out_scale,
+            out_zero_point,
+            jit_image,
+            unsigned_weight,
+            bias_id,
+            args,
+            transpose,
+            fuse_code,
+        )
+
+    def add_conv2d_common(
+        self,
+        jit_out,
+        out_scale,
+        out_zero_point,
+        jit_image,
+        weight_tensor,
+        bias_id,
+        args,
+        transpose,
+        fuse_code,
+    ):
+        image_id, image_oper = self.get_tensor_operand_by_jitval(jit_image)
+        in_c = image_oper.shape[1]
+
+        if args.group == 1:
+            # Full convolution
+            depthwise = False
+            if transpose:
+                weight_permutation = (1, 2, 3, 0)
+            else:
+                weight_permutation = (0, 2, 3, 1)
+        elif args.group == in_c:
+            # Depthwise convolution
+            depthwise = True
+            weight_permutation = (1, 2, 3, 0)
+        else:
+            raise Exception("Group convolution not supported yet.")
+
+        # TODO: Transform at load time to share weights with CPU model.
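+        # Permute the PyTorch OIHW weight into the NNAPI layout selected above.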
+        nnapi_weight_tensor = weight_tensor.permute(*weight_permutation).contiguous()
+        weight_id = self.add_tensor_operand_for_weight(nnapi_weight_tensor)
+        weight_oper = self.operands[weight_id]
+
+        bias_oper = self.operands[bias_id]
+
+        if image_oper.op_type == NNAPI_OperandCode.TENSOR_FLOAT32:
+            assert weight_oper.op_type == NNAPI_OperandCode.TENSOR_FLOAT32
+            assert bias_oper.op_type == NNAPI_OperandCode.TENSOR_FLOAT32
+        elif image_oper.op_type == NNAPI_OperandCode.TENSOR_QUANT8_ASYMM:
+            assert weight_oper.op_type == NNAPI_OperandCode.TENSOR_QUANT8_ASYMM
+            assert bias_oper.op_type == NNAPI_OperandCode.TENSOR_INT32
+            assert approx_equal(image_oper.scale * weight_oper.scale, bias_oper.scale)
+            assert bias_oper.zero_point == 0
+        else:
+            raise Exception(f"Unsupported input type for conv2d: {image_oper.op_type}")
+
+        assert len(image_oper.shape) == 4
+        assert len(weight_oper.shape) == 4
+        assert len(bias_oper.shape) == 1
+
+        if depthwise:
+            # Depthwise convolution
+            one, kern_h, kern_w, out_c = weight_oper.shape
+            assert one == 1
+            assert out_c % in_c == 0
+            channel_multiplier = out_c // in_c
+            assert channel_multiplier == 1  # Don't support multiplier
+            assert out_c == in_c
+        else:
+            # Full convolution
+            out_c, kern_h, kern_w, kern_d = weight_oper.shape
+            assert kern_d == in_c
+
+        assert out_c == bias_oper.shape[0]
+
+        use_nchw = image_oper.use_nchw()
+
+        if depthwise:
+            num_args = 12
+            opcode = NNAPI_OperationCode.DEPTHWISE_CONV_2D
+        else:
+            num_args = 11
+            if transpose:
+                opcode = NNAPI_OperationCode.TRANSPOSE_CONV_2D
+            else:
+                opcode = NNAPI_OperationCode.CONV_2D
+
+        inputs = [None] * num_args
+        inputs[0] = image_id
+        inputs[1] = weight_id
+        inputs[2] = bias_id
+        inputs[3] = self.add_immediate_int_scalar(args.pad_l)
+        inputs[4] = self.add_immediate_int_scalar(args.pad_r)
+        inputs[5] = self.add_immediate_int_scalar(args.pad_t)
+        inputs[6] = self.add_immediate_int_scalar(args.pad_b)
+        inputs[7] = self.add_immediate_int_scalar(args.stride_w)
+        inputs[8] = self.add_immediate_int_scalar(args.stride_h)
+        if depthwise:
+            inputs[9] = self.add_immediate_int_scalar(1)
+            inputs[10] = self.add_immediate_int_scalar(fuse_code)
+            inputs[11] = self.add_immediate_bool_scalar(use_nchw)
+        else:
+            inputs[9] = self.add_immediate_int_scalar(fuse_code)
+            inputs[10] = self.add_immediate_bool_scalar(use_nchw)
+
+        outputs = [None] * 1
+        out_shape = get_conv_pool_shape(image_oper.shape, args, out_c, transpose)
+        out_oper = image_oper._replace(
+            shape=out_shape,
+            scale=out_scale,
+            zero_point=out_zero_point,
+        )
+        out_id = self.add_tensor_operand(jit_out, out_oper)
+        self._handle_conv_pool_flexible_input(out_id, jit_image, args, transpose)
+
+        outputs[0] = out_id
+        self.add_operation(opcode, inputs, outputs)
+
+    def _handle_conv_pool_flexible_input(self, out_id, jit_image, args, transpose):
+        image_id, image_oper = self.get_tensor_operand_by_jitval(jit_image)
+        batch, in_ch, in_h, in_w = image_oper.shape
+
+        if batch == 0:
+            self.forward_operand_shape(out_id, 0, image_id, 0)
+        if in_ch == 0:
+            raise Exception("Input channels can't be flexible")
+        # H & W
+        if transpose:
+            if in_h == 0:
+                self.compute_operand_shape(
+                    out_id,
+                    2,
+                    f"({flex_name(image_id, 2)} - 1) * {args.stride_h} + {args.kernel_h} - {args.pad_t} - {args.pad_b}",
+                )
+            if in_w == 0:
+                self.compute_operand_shape(
+                    out_id,
+                    3,
+                    f"({flex_name(image_id, 3)} - 1) * {args.stride_w} + {args.kernel_w} - {args.pad_l} - {args.pad_r}",
+                )
+        else:
+            if in_h == 0:
+                self.compute_operand_shape(
+                    out_id,
+                    2,
+                    f"({flex_name(image_id, 2)} - {args.kernel_h} + {args.pad_t} + {args.pad_b}) // {args.stride_h} + 1",
+                )
+            if in_w == 0:
+                self.compute_operand_shape(
+                    out_id,
+                    3,
+                    f"({flex_name(image_id, 3)} - {args.kernel_w} + {args.pad_l} + {args.pad_r}) // {args.stride_w} + 1",
+                )
+
+
+def serialize_model(
+    module, inputs, *, config=None, return_shapes=None, use_int16_for_qint16=False
+):
+    """Convert to NNAPI and serialize torchscript module.
+
+    Parameters:
+        module: Torchscript module to convert
+        inputs: Tensors used to specify input details for NNAPI
+        config (optional): Optional config to attach to module
+        return_shapes (optional): Specify the output shapes if
+            your module uses runtime-flexible shapes, so the output
+            buffer sizes can be set for NNAPI
+        use_int16_for_qint16 (optional): Use PyTorch int16 to represent NNAPI qint16 values
+    """
+    return _NnapiSerializer(config, use_int16_for_qint16).serialize_model(
+        module, inputs, return_shapes
+    )
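As a usage sketch only: assuming this serializer is importable as `torch.backends._nnapi.serializer` (the file path is not shown in this hunk) and that the scripted module uses only NNAPI-supported ops, `serialize_model` is driven roughly as below; the exact layout of the returned value is defined by `_NnapiSerializer.serialize_model`, which this hunk does not show.

import torch
from torch.backends._nnapi.serializer import serialize_model  # assumed import path

class TinyConv(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(3, 8, kernel_size=3, padding=1)

    def forward(self, x):
        return self.conv(x)

model = torch.jit.script(TinyConv().eval())
# The example input fixes dtype, shape, and memory format for NNAPI.
example = torch.zeros(1, 3, 224, 224)
result = serialize_model(model, [example])  # whether a module converts depends on NNAPI op coverage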
diff --git a/MLPY/Lib/site-packages/torch/backends/cpu/__init__.py b/MLPY/Lib/site-packages/torch/backends/cpu/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..923fbd6401673d7569efc5ddb2f4edd9101b7860
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/backends/cpu/__init__.py
@@ -0,0 +1,19 @@
+import torch
+
+__all__ = [
+    "get_cpu_capability",
+]
+
+
+def get_cpu_capability() -> str:
+    r"""Return cpu capability as a string value.
+
+    Possible values:
+    - "DEFAULT"
+    - "VSX"
+    - "Z VECTOR"
+    - "NO AVX"
+    - "AVX2"
+    - "AVX512"
+    """
+    return torch._C._get_cpu_capability()
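As a usage sketch (assuming this module is importable as ``torch.backends.cpu``, per the path above), the capability string can be queried and branched on directly:

import torch

cap = torch.backends.cpu.get_cpu_capability()
print(f"Detected CPU capability: {cap}")
# Branch on the reported ISA level, e.g. to choose a kernel variant.
has_wide_simd = cap in ("AVX2", "AVX512")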
diff --git a/MLPY/Lib/site-packages/torch/backends/cpu/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/backends/cpu/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ba1f81ddc28b41a7035c7550f5a7f3ef6b766234
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/backends/cpu/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/backends/cuda/__init__.py b/MLPY/Lib/site-packages/torch/backends/cuda/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d29a25146a9b88243b4ded4b648b11725fb9a272
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/backends/cuda/__init__.py
@@ -0,0 +1,371 @@
+import contextlib
+import warnings
+
+from typing import Union
+
+import torch
+
+__all__ = [
+    "is_built",
+    "cuFFTPlanCacheAttrContextProp",
+    "cuFFTPlanCache",
+    "cuFFTPlanCacheManager",
+    "cuBLASModule",
+    "preferred_linalg_library",
+    "cufft_plan_cache",
+    "matmul",
+    "SDPBackend",
+    "SDPAParams",
+    "enable_cudnn_sdp",
+    "cudnn_sdp_enabled",
+    "enable_flash_sdp",
+    "flash_sdp_enabled",
+    "enable_mem_efficient_sdp",
+    "mem_efficient_sdp_enabled",
+    "math_sdp_enabled",
+    "enable_math_sdp",
+    "can_use_flash_attention",
+    "can_use_efficient_attention",
+    "sdp_kernel",
+]
+
+
+def is_built():
+    r"""
+    Return whether PyTorch is built with CUDA support.
+
+    Note that this doesn't necessarily mean CUDA is available; just that if this PyTorch
+    binary were run on a machine with working CUDA drivers and devices, we would be able to use it.
+    """
+    return torch._C._has_cuda
+
+
+class cuFFTPlanCacheAttrContextProp:
+    # Like regular ContextProp, but uses the `.device_index` attribute from the
+    # calling object as the first argument to the getter and setter.
+    def __init__(self, getter, setter):
+        self.getter = getter
+        self.setter = setter
+
+    def __get__(self, obj, objtype):
+        return self.getter(obj.device_index)
+
+    def __set__(self, obj, val):
+        if isinstance(self.setter, str):
+            raise RuntimeError(self.setter)
+        self.setter(obj.device_index, val)
+
+
+class cuFFTPlanCache:
+    r"""
+    Represent a specific plan cache for a specific `device_index`.
+
+    The attributes `size` and `max_size`, and the method `clear`, can fetch and/or
+    change properties of the C++ cuFFT plan cache.
+    """
+
+    def __init__(self, device_index):
+        self.device_index = device_index
+
+    size = cuFFTPlanCacheAttrContextProp(
+        torch._cufft_get_plan_cache_size,
+        ".size is a read-only property showing the number of plans currently in the "
+        "cache. To change the cache capacity, set cufft_plan_cache.max_size.",
+    )
+
+    max_size = cuFFTPlanCacheAttrContextProp(
+        torch._cufft_get_plan_cache_max_size, torch._cufft_set_plan_cache_max_size
+    )
+
+    def clear(self):
+        return torch._cufft_clear_plan_cache(self.device_index)
+
+
+class cuFFTPlanCacheManager:
+    r"""
+    Represent all cuFFT plan caches, returning the cuFFTPlanCache for a given device when indexed.
+
+    When this object is used directly as a `cuFFTPlanCache` object (e.g., setting
+    the `.max_size` attribute), the current device's cuFFT plan cache is used.
+    """
+
+    __initialized = False
+
+    def __init__(self):
+        self.caches = []
+        self.__initialized = True
+
+    def __getitem__(self, device):
+        index = torch.cuda._utils._get_device_index(device)
+        if index < 0 or index >= torch.cuda.device_count():
+            raise RuntimeError(
+                f"cufft_plan_cache: expected 0 <= device index < {torch.cuda.device_count()}, but got "
+                f"device with index {index}"
+            )
+        if len(self.caches) == 0:
+            self.caches.extend(
+                cuFFTPlanCache(index) for index in range(torch.cuda.device_count())
+            )
+        return self.caches[index]
+
+    def __getattr__(self, name):
+        return getattr(self[torch.cuda.current_device()], name)
+
+    def __setattr__(self, name, value):
+        if self.__initialized:
+            return setattr(self[torch.cuda.current_device()], name, value)
+        else:
+            return super().__setattr__(name, value)
+
+
+class cuBLASModule:
+    def __getattr__(self, name):
+        if name == "allow_tf32":
+            return torch._C._get_cublas_allow_tf32()
+        elif name == "allow_fp16_reduced_precision_reduction":
+            return torch._C._get_cublas_allow_fp16_reduced_precision_reduction()
+        elif name == "allow_bf16_reduced_precision_reduction":
+            return torch._C._get_cublas_allow_bf16_reduced_precision_reduction()
+        raise AttributeError("Unknown attribute " + name)
+
+    def __setattr__(self, name, value):
+        if name == "allow_tf32":
+            return torch._C._set_cublas_allow_tf32(value)
+        elif name == "allow_fp16_reduced_precision_reduction":
+            return torch._C._set_cublas_allow_fp16_reduced_precision_reduction(value)
+        elif name == "allow_bf16_reduced_precision_reduction":
+            return torch._C._set_cublas_allow_bf16_reduced_precision_reduction(value)
+        raise AttributeError("Unknown attribute " + name)
+
+
+_LinalgBackends = {
+    "default": torch._C._LinalgBackend.Default,
+    "cusolver": torch._C._LinalgBackend.Cusolver,
+    "magma": torch._C._LinalgBackend.Magma,
+}
+_LinalgBackends_str = ", ".join(_LinalgBackends.keys())
+
+
+def preferred_linalg_library(
+    backend: Union[None, str, torch._C._LinalgBackend] = None
+) -> torch._C._LinalgBackend:
+    r"""
+    Override the heuristic PyTorch uses to choose between cuSOLVER and MAGMA for CUDA linear algebra operations.
+
+    .. warning:: This flag is experimental and subject to change.
+
+    When PyTorch runs a CUDA linear algebra operation it often uses the cuSOLVER or MAGMA libraries,
+    and if both are available it decides which to use with a heuristic.
+    This flag (a :class:`str`) allows overriding those heuristics.
+
+    * If `"cusolver"` is set then cuSOLVER will be used wherever possible.
+    * If `"magma"` is set then MAGMA will be used wherever possible.
+    * If `"default"` (the default) is set then heuristics will be used to pick between
+      cuSOLVER and MAGMA if both are available.
+    * When no input is given, this function returns the currently preferred library.
+    * User may use the environment variable TORCH_LINALG_PREFER_CUSOLVER=1 to set the preferred library to cuSOLVER
+      globally.
+      This flag only sets the initial value of the preferred library and the preferred library
+      may still be overridden by this function call later in your script.
+
+    Note: When a library is preferred, other libraries may still be used if the preferred library
+    doesn't implement the operation(s) called.
+    This flag may achieve better performance if PyTorch's heuristic library selection is incorrect
+    for your application's inputs.
+
+    Currently supported linalg operators:
+
+    * :func:`torch.linalg.inv`
+    * :func:`torch.linalg.inv_ex`
+    * :func:`torch.linalg.cholesky`
+    * :func:`torch.linalg.cholesky_ex`
+    * :func:`torch.cholesky_solve`
+    * :func:`torch.cholesky_inverse`
+    * :func:`torch.linalg.lu_factor`
+    * :func:`torch.linalg.lu`
+    * :func:`torch.linalg.lu_solve`
+    * :func:`torch.linalg.qr`
+    * :func:`torch.linalg.eigh`
+    * :func:`torch.linalg.eigvalsh`
+    * :func:`torch.linalg.svd`
+    * :func:`torch.linalg.svdvals`
+    """
+    if backend is None:
+        pass
+    elif isinstance(backend, str):
+        if backend not in _LinalgBackends:
+            raise RuntimeError(
+                "Unknown input value. " f"Choose from: {_LinalgBackends_str}."
+            )
+        torch._C._set_linalg_preferred_backend(_LinalgBackends[backend])
+    elif isinstance(backend, torch._C._LinalgBackend):
+        torch._C._set_linalg_preferred_backend(backend)
+    else:
+        raise RuntimeError("Unknown input value type.")
+
+    return torch._C._get_linalg_preferred_backend()
+
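A brief sketch of using the override above; the query form (no argument) and the setter form can be combined to save and restore the preference:

import torch

previous = torch.backends.cuda.preferred_linalg_library()  # query only
torch.backends.cuda.preferred_linalg_library("cusolver")   # prefer cuSOLVER

if torch.cuda.is_available():
    a = torch.randn(64, 64, device="cuda")
    q, r = torch.linalg.qr(a)  # dispatched under the cuSOLVER preference where possible

torch.backends.cuda.preferred_linalg_library(previous)     # restore the old preference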
+
+from torch._C import _SDPAParams as SDPAParams, _SDPBackend as SDPBackend
+
+# Set the __module__ attribute
+SDPAParams.__module__ = "torch.backends.cuda"
+SDPAParams.__name__ = "SDPAParams"
+
+
+def flash_sdp_enabled():
+    r"""
+    .. warning:: This flag is beta and subject to change.
+
+    Returns whether flash scaled dot product attention is enabled or not.
+    """
+    return torch._C._get_flash_sdp_enabled()
+
+
+def enable_flash_sdp(enabled: bool):
+    r"""
+    .. warning:: This flag is beta and subject to change.
+
+    Enables or disables flash scaled dot product attention.
+    """
+    torch._C._set_sdp_use_flash(enabled)
+
+
+def mem_efficient_sdp_enabled():
+    r"""
+    .. warning:: This flag is beta and subject to change.
+
+    Returns whether memory efficient scaled dot product attention is enabled or not.
+    """
+    return torch._C._get_mem_efficient_sdp_enabled()
+
+
+def enable_mem_efficient_sdp(enabled: bool):
+    r"""
+    .. warning:: This flag is beta and subject to change.
+
+    Enables or disables memory efficient scaled dot product attention.
+    """
+    torch._C._set_sdp_use_mem_efficient(enabled)
+
+
+def math_sdp_enabled():
+    r"""
+    .. warning:: This flag is beta and subject to change.
+
+    Returns whether math scaled dot product attention is enabled or not.
+    """
+    return torch._C._get_math_sdp_enabled()
+
+
+def enable_math_sdp(enabled: bool):
+    r"""
+    .. warning:: This flag is beta and subject to change.
+
+    Enables or disables math scaled dot product attention.
+    """
+    torch._C._set_sdp_use_math(enabled)
+
+
+def can_use_flash_attention(params: SDPAParams, debug: bool = False) -> bool:
+    r"""Check if FlashAttention can be utilized in scaled_dot_product_attention.
+
+    Args:
+        params: An instance of SDPAParams containing the tensors for query,
+                key, value, an optional attention mask, dropout rate, and
+                a flag indicating if the attention is causal.
+        debug: Whether to log (via ``logging.warn``) debug information explaining why FlashAttention could not be run.
+            Defaults to False.
+
+    Returns:
+        True if FlashAttention can be used with the given parameters; otherwise, False.
+
+    Note:
+        This function is dependent on a CUDA-enabled build of PyTorch. It will return False
+        in non-CUDA environments.
+    """
+    return torch._C._can_use_flash_attention(params, debug)
+
+
+def can_use_efficient_attention(params: SDPAParams, debug: bool = False) -> bool:
+    r"""Check if efficient_attention can be utilized in scaled_dot_product_attention.
+
+    Args:
+        params: An instance of SDPAParams containing the tensors for query,
+                key, value, an optional attention mask, dropout rate, and
+                a flag indicating if the attention is causal.
+        debug: Whether to log (via ``logging.warn``) information explaining why efficient_attention could not be run.
+            Defaults to False.
+
+    Returns:
+        True if efficient_attention can be used with the given parameters; otherwise, False.
+
+    Note:
+        This function is dependent on a CUDA-enabled build of PyTorch. It will return False
+        in non-CUDA environments.
+    """
+    return torch._C._can_use_mem_efficient_attention(params, debug)
+
+
+def cudnn_sdp_enabled():
+    r"""
+    .. warning:: This flag is beta and subject to change.
+
+    Returns whether cuDNN scaled dot product attention is enabled or not.
+    """
+    return torch._C._get_cudnn_sdp_enabled()
+
+
+def enable_cudnn_sdp(enabled: bool):
+    r"""
+    .. warning:: This flag is beta and subject to change.
+
+    Enables or disables cuDNN scaled dot product attention.
+    """
+    torch._C._set_sdp_use_cudnn(enabled)
+
+
+@contextlib.contextmanager
+def sdp_kernel(
+    enable_flash: bool = True,
+    enable_math: bool = True,
+    enable_mem_efficient: bool = True,
+    enable_cudnn: bool = True,
+):
+    r"""
+    .. warning:: This flag is beta and subject to change.
+
+    This context manager can be used to temporarily enable or disable any of the four backends for scaled dot product attention.
+    Upon exiting the context manager, the previous state of the flags will be restored.
+    """
+    warnings.warn(
+        (
+            "torch.backends.cuda.sdp_kernel() "
+            "is deprecated. In the future, this context manager will be removed. "
+            "Please see, torch.nn.attention.sdpa_kernel() for the new context manager, with updated "
+            "signature."
+        ),
+        FutureWarning,
+    )
+    from torch.nn.attention import sdpa_kernel, SDPBackend
+
+    backend_list = []
+    if enable_flash:
+        backend_list.append(SDPBackend.FLASH_ATTENTION)
+    if enable_mem_efficient:
+        backend_list.append(SDPBackend.EFFICIENT_ATTENTION)
+    if enable_math:
+        backend_list.append(SDPBackend.MATH)
+    if enable_cudnn:
+        backend_list.append(SDPBackend.CUDNN_ATTENTION)
+
+    with sdpa_kernel(backend_list) as context:
+        try:
+            yield context
+        finally:
+            pass
+
+
+cufft_plan_cache = cuFFTPlanCacheManager()
+matmul = cuBLASModule()
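The module-level objects created above are the public handles. A short sketch, guarded so it is a no-op without CUDA, including the non-deprecated context manager that `sdp_kernel` forwards to:

import torch

if torch.backends.cuda.is_built() and torch.cuda.is_available():
    # cuBLAS TF32 toggle exposed through the cuBLASModule proxy above.
    torch.backends.cuda.matmul.allow_tf32 = True

    # Per-device cuFFT plan cache, indexed like a sequence.
    torch.backends.cuda.cufft_plan_cache[0].max_size = 32
    print(torch.backends.cuda.cufft_plan_cache[0].size)

    # Preferred replacement for the deprecated sdp_kernel wrapper above.
    from torch.nn.attention import sdpa_kernel, SDPBackend

    q = k = v = torch.randn(2, 4, 8, 16, device="cuda", dtype=torch.float16)
    with sdpa_kernel([SDPBackend.FLASH_ATTENTION, SDPBackend.MATH]):
        out = torch.nn.functional.scaled_dot_product_attention(q, k, v)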
diff --git a/MLPY/Lib/site-packages/torch/backends/cuda/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/backends/cuda/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6cf7fdc2911d65ae5b785f1f7685f2fd29c7392e
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/backends/cuda/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/backends/cudnn/__init__.py b/MLPY/Lib/site-packages/torch/backends/cudnn/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2939fa72b4fa2a6293e6eba9ba3385fd4ea34045
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/backends/cudnn/__init__.py
@@ -0,0 +1,206 @@
+import os
+import sys
+import warnings
+from contextlib import contextmanager
+from typing import Optional
+
+import torch
+from torch.backends import __allow_nonbracketed_mutation, ContextProp, PropModule
+
+try:
+    from torch._C import _cudnn
+except ImportError:
+    _cudnn = None  # type: ignore[assignment]
+
+# Write:
+#
+#   torch.backends.cudnn.enabled = False
+#
+# to globally disable CuDNN/MIOpen
+
+__cudnn_version: Optional[int] = None
+
+if _cudnn is not None:
+
+    def _init():
+        global __cudnn_version
+        if __cudnn_version is None:
+            __cudnn_version = _cudnn.getVersionInt()
+            runtime_version = _cudnn.getRuntimeVersion()
+            compile_version = _cudnn.getCompileVersion()
+            runtime_major, runtime_minor, _ = runtime_version
+            compile_major, compile_minor, _ = compile_version
+            # Different major versions are always incompatible
+            # Starting with cuDNN 7, minor versions are backwards-compatible
+            # Not sure about MIOpen (ROCm), so always do a strict check
+            if runtime_major != compile_major:
+                cudnn_compatible = False
+            elif runtime_major < 7 or not _cudnn.is_cuda:
+                cudnn_compatible = runtime_minor == compile_minor
+            else:
+                cudnn_compatible = runtime_minor >= compile_minor
+            if not cudnn_compatible:
+                if os.environ.get("PYTORCH_SKIP_CUDNN_COMPATIBILITY_CHECK", "0") == "1":
+                    return True
+                base_error_msg = (
+                    f"cuDNN version incompatibility: "
+                    f"PyTorch was compiled  against {compile_version} "
+                    f"but found runtime version {runtime_version}. "
+                    f"PyTorch already comes bundled with cuDNN. "
+                    f"One option to resolving this error is to ensure PyTorch "
+                    f"can find the bundled cuDNN. "
+                )
+
+                if "LD_LIBRARY_PATH" in os.environ:
+                    ld_library_path = os.environ.get("LD_LIBRARY_PATH", "")
+                    if any(
+                        substring in ld_library_path for substring in ["cuda", "cudnn"]
+                    ):
+                        raise RuntimeError(
+                            f"{base_error_msg}"
+                            f"Looks like your LD_LIBRARY_PATH contains incompatible version of cudnn. "
+                            f"Please either remove it from the path or install cudnn {compile_version}"
+                        )
+                    else:
+                        raise RuntimeError(
+                            f"{base_error_msg}"
+                            f"one possibility is that there is a "
+                            f"conflicting cuDNN in LD_LIBRARY_PATH."
+                        )
+                else:
+                    raise RuntimeError(base_error_msg)
+
+        return True
+
+else:
+
+    def _init():
+        return False
+
+
+def version():
+    """Return the version of cuDNN."""
+    if not _init():
+        return None
+    return __cudnn_version
+
+
+CUDNN_TENSOR_DTYPES = {
+    torch.half,
+    torch.float,
+    torch.double,
+}
+
+
+def is_available():
+    r"""Return a bool indicating if CUDNN is currently available."""
+    return torch._C._has_cudnn
+
+
+def is_acceptable(tensor):
+    if not torch._C._get_cudnn_enabled():
+        return False
+    if tensor.device.type != "cuda" or tensor.dtype not in CUDNN_TENSOR_DTYPES:
+        return False
+    if not is_available():
+        warnings.warn(
+            "PyTorch was compiled without cuDNN/MIOpen support. To use cuDNN/MIOpen, rebuild "
+            "PyTorch making sure the library is visible to the build system."
+        )
+        return False
+    if not _init():
+        warnings.warn(
+            "cuDNN/MIOpen library not found. Check your {libpath}".format(
+                libpath={"darwin": "DYLD_LIBRARY_PATH", "win32": "PATH"}.get(
+                    sys.platform, "LD_LIBRARY_PATH"
+                )
+            )
+        )
+        return False
+    return True
+
+
+def set_flags(
+    _enabled=None,
+    _benchmark=None,
+    _benchmark_limit=None,
+    _deterministic=None,
+    _allow_tf32=None,
+):
+    orig_flags = (
+        torch._C._get_cudnn_enabled(),
+        torch._C._get_cudnn_benchmark(),
+        None if not is_available() else torch._C._cuda_get_cudnn_benchmark_limit(),
+        torch._C._get_cudnn_deterministic(),
+        torch._C._get_cudnn_allow_tf32(),
+    )
+    if _enabled is not None:
+        torch._C._set_cudnn_enabled(_enabled)
+    if _benchmark is not None:
+        torch._C._set_cudnn_benchmark(_benchmark)
+    if _benchmark_limit is not None and is_available():
+        torch._C._cuda_set_cudnn_benchmark_limit(_benchmark_limit)
+    if _deterministic is not None:
+        torch._C._set_cudnn_deterministic(_deterministic)
+    if _allow_tf32 is not None:
+        torch._C._set_cudnn_allow_tf32(_allow_tf32)
+    return orig_flags
+
+
+@contextmanager
+def flags(
+    enabled=False,
+    benchmark=False,
+    benchmark_limit=10,
+    deterministic=False,
+    allow_tf32=True,
+):
+    with __allow_nonbracketed_mutation():
+        orig_flags = set_flags(
+            enabled, benchmark, benchmark_limit, deterministic, allow_tf32
+        )
+    try:
+        yield
+    finally:
+        # recover the previous values
+        with __allow_nonbracketed_mutation():
+            set_flags(*orig_flags)
+
+
+# The magic here is to allow us to intercept code like this:
+#
+#   torch.backends.cudnn.enabled = True
+
+
+class CudnnModule(PropModule):
+    def __init__(self, m, name):
+        super().__init__(m, name)
+
+    enabled = ContextProp(torch._C._get_cudnn_enabled, torch._C._set_cudnn_enabled)
+    deterministic = ContextProp(
+        torch._C._get_cudnn_deterministic, torch._C._set_cudnn_deterministic
+    )
+    benchmark = ContextProp(
+        torch._C._get_cudnn_benchmark, torch._C._set_cudnn_benchmark
+    )
+    benchmark_limit = None
+    if is_available():
+        benchmark_limit = ContextProp(
+            torch._C._cuda_get_cudnn_benchmark_limit,
+            torch._C._cuda_set_cudnn_benchmark_limit,
+        )
+    allow_tf32 = ContextProp(
+        torch._C._get_cudnn_allow_tf32, torch._C._set_cudnn_allow_tf32
+    )
+
+
+# This is the sys.modules replacement trick, see
+# https://stackoverflow.com/questions/2447353/getattr-on-a-module/7668273#7668273
+sys.modules[__name__] = CudnnModule(sys.modules[__name__], __name__)
+
+# Add type annotation for the replaced module
+enabled: bool
+deterministic: bool
+benchmark: bool
+allow_tf32: bool
+benchmark_limit: int
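A minimal sketch of the two ways the flags above are typically driven; attribute assignments are global, while `flags` scopes and restores them (the convolution runs only when a GPU is present):

import torch

# Global, attribute-style configuration via the module replacement above.
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = False

# Scoped configuration: the previous values are restored on exit.
with torch.backends.cudnn.flags(enabled=True, benchmark=False, deterministic=True):
    if torch.cuda.is_available():
        x = torch.randn(8, 3, 32, 32, device="cuda")
        conv = torch.nn.Conv2d(3, 16, 3).cuda()
        y = conv(x)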
diff --git a/MLPY/Lib/site-packages/torch/backends/cudnn/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/backends/cudnn/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3dd8d20bf0ee32606c91a9086f90582ca9033cbe
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/backends/cudnn/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/backends/cudnn/__pycache__/rnn.cpython-39.pyc b/MLPY/Lib/site-packages/torch/backends/cudnn/__pycache__/rnn.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b4976c7f091ea415da0e0034f1318ed92ebb3543
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/backends/cudnn/__pycache__/rnn.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/backends/cudnn/rnn.py b/MLPY/Lib/site-packages/torch/backends/cudnn/rnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..7bbbe5676413715ecf551326cc392584ae5b356a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/backends/cudnn/rnn.py
@@ -0,0 +1,62 @@
+import torch.cuda
+
+try:
+    from torch._C import _cudnn
+except ImportError:
+    # Uses of all the functions below should be guarded by torch.backends.cudnn.is_available(),
+    # so it's safe to not emit any checks here.
+    _cudnn = None  # type: ignore[assignment]
+
+
+def get_cudnn_mode(mode):
+    if mode == "RNN_RELU":
+        return int(_cudnn.RNNMode.rnn_relu)
+    elif mode == "RNN_TANH":
+        return int(_cudnn.RNNMode.rnn_tanh)
+    elif mode == "LSTM":
+        return int(_cudnn.RNNMode.lstm)
+    elif mode == "GRU":
+        return int(_cudnn.RNNMode.gru)
+    else:
+        raise Exception(f"Unknown mode: {mode}")
+
+
+# NB: We don't actually need this class anymore (in fact, we could serialize the
+# dropout state for even better reproducibility), but it is kept for backwards
+# compatibility for old models.
+class Unserializable:
+    def __init__(self, inner):
+        self.inner = inner
+
+    def get(self):
+        return self.inner
+
+    def __getstate__(self):
+        # Note: can't return {}, because python2 won't call __setstate__
+        # if the value evaluates to False
+        return ""
+
+    def __setstate__(self, state):
+        self.inner = None
+
+
+def init_dropout_state(dropout, train, dropout_seed, dropout_state):
+    dropout_desc_name = "desc_" + str(torch.cuda.current_device())
+    dropout_p = dropout if train else 0
+    if (dropout_desc_name not in dropout_state) or (
+        dropout_state[dropout_desc_name].get() is None
+    ):
+        if dropout_p == 0:
+            dropout_state[dropout_desc_name] = Unserializable(None)
+        else:
+            dropout_state[dropout_desc_name] = Unserializable(
+                torch._cudnn_init_dropout_state(  # type: ignore[call-arg]
+                    dropout_p,
+                    train,
+                    dropout_seed,
+                    self_ty=torch.uint8,
+                    device=torch.device("cuda"),
+                )
+            )
+    dropout_ts = dropout_state[dropout_desc_name].get()
+    return dropout_ts
diff --git a/MLPY/Lib/site-packages/torch/backends/mha/__init__.py b/MLPY/Lib/site-packages/torch/backends/mha/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..48cd8ce957cdf149ce7fd4608710b303261b3dda
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/backends/mha/__init__.py
@@ -0,0 +1,24 @@
+# Config options to enable/disable C++ kernel for nn.functional.MHA
+# and nn.TransformerEncoder
+import torch
+
+_is_fastpath_enabled: bool = True
+
+
+def get_fastpath_enabled() -> bool:
+    """Returns whether fast path for TransformerEncoder and MultiHeadAttention
+    is enabled, or ``True`` if jit is scripting.
+
+    .. note::
+        The fastpath might not be run even if ``get_fastpath_enabled`` returns
+        ``True`` unless all conditions on inputs are met.
+    """
+    if not torch.jit.is_scripting():
+        return _is_fastpath_enabled
+    return True
+
+
+def set_fastpath_enabled(value: bool) -> None:
+    """Sets whether fast path is enabled"""
+    global _is_fastpath_enabled
+    _is_fastpath_enabled = value
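A sketch of temporarily disabling the fastpath around a call and restoring the previous setting:

import torch
from torch.backends.mha import get_fastpath_enabled, set_fastpath_enabled

prev = get_fastpath_enabled()
set_fastpath_enabled(False)   # force the regular (non-fastpath) implementation
try:
    layer = torch.nn.TransformerEncoderLayer(d_model=16, nhead=4, batch_first=True)
    encoder = torch.nn.TransformerEncoder(layer, num_layers=1).eval()
    with torch.no_grad():
        out = encoder(torch.randn(2, 5, 16))
finally:
    set_fastpath_enabled(prev)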
diff --git a/MLPY/Lib/site-packages/torch/backends/mha/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/backends/mha/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..31882d6906a9d030a5d900ad69d2ce8d92360e52
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/backends/mha/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/backends/mkl/__init__.py b/MLPY/Lib/site-packages/torch/backends/mkl/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1a79bf8d184428242e76805185007878dd356c6
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/backends/mkl/__init__.py
@@ -0,0 +1,56 @@
+import torch
+
+
+def is_available():
+    r"""Return whether PyTorch is built with MKL support."""
+    return torch._C.has_mkl
+
+
+VERBOSE_OFF = 0
+VERBOSE_ON = 1
+
+
+class verbose:
+    """
+    On-demand oneMKL verbose functionality.
+
+    To make it easier to debug performance issues, oneMKL can dump verbose
+    messages containing execution information such as kernel duration. The
+    verbose functionality can be enabled via an environment variable named
+    `MKL_VERBOSE`. However, that approach dumps messages for every step,
+    producing a large volume of output, while verbose messages from a single
+    iteration are usually enough to investigate a performance issue. This
+    on-demand verbose functionality makes it possible to control the scope of
+    verbose message dumping. In the following example, verbose messages will
+    be dumped out for the second inference only.
+
+    .. highlight:: python
+    .. code-block:: python
+
+        import torch
+        model(data)
+        with torch.backends.mkl.verbose(torch.backends.mkl.VERBOSE_ON):
+            model(data)
+
+    Args:
+        level: Verbose level
+            - ``VERBOSE_OFF``: Disable verbosing
+            - ``VERBOSE_ON``:  Enable verbosing
+    """
+
+    def __init__(self, enable):
+        self.enable = enable
+
+    def __enter__(self):
+        if self.enable == VERBOSE_OFF:
+            return
+        st = torch._C._verbose.mkl_set_verbose(self.enable)
+        assert (
+            st
+        ), "Failed to set MKL into verbose mode. Please consider to disable this verbose scope."
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        torch._C._verbose.mkl_set_verbose(VERBOSE_OFF)
+        return False
diff --git a/MLPY/Lib/site-packages/torch/backends/mkl/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/backends/mkl/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..83c9399aaf34fe6256e2a8bb26d336eb2dc92fa4
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/backends/mkl/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/backends/mkldnn/__init__.py b/MLPY/Lib/site-packages/torch/backends/mkldnn/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0d9f83f95d8d147f60e3bde84c1dc191b1c49e0
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/backends/mkldnn/__init__.py
@@ -0,0 +1,97 @@
+import sys
+from contextlib import contextmanager
+
+from typing import TYPE_CHECKING
+
+import torch
+from torch.backends import __allow_nonbracketed_mutation, ContextProp, PropModule
+
+
+def is_available():
+    r"""Return whether PyTorch is built with MKL-DNN support."""
+    return torch._C._has_mkldnn
+
+
+VERBOSE_OFF = 0
+VERBOSE_ON = 1
+VERBOSE_ON_CREATION = 2
+
+
+class verbose:
+    """
+    On-demand oneDNN (formerly MKL-DNN) verbose functionality.
+
+    To make it easier to debug performance issues, oneDNN can dump verbose
+    messages containing information such as kernel size, input data size and
+    execution duration while executing the kernel. The verbose functionality
+    can be enabled via an environment variable named `DNNL_VERBOSE`. However,
+    that approach dumps messages for every step, producing a large volume of
+    output, while verbose messages from a single iteration are usually enough
+    to investigate a performance issue. This on-demand verbose functionality
+    makes it possible to control the scope of verbose message dumping. In the
+    following example, verbose messages will be dumped out for the second
+    inference only.
+
+    .. highlight:: python
+    .. code-block:: python
+
+        import torch
+        model(data)
+        with torch.backends.mkldnn.verbose(torch.backends.mkldnn.VERBOSE_ON):
+            model(data)
+
+    Args:
+        level: Verbose level
+            - ``VERBOSE_OFF``: Disable verbosing
+            - ``VERBOSE_ON``:  Enable verbosing
+            - ``VERBOSE_ON_CREATION``: Enable verbosing, including oneDNN kernel creation
+    """
+
+    def __init__(self, level):
+        self.level = level
+
+    def __enter__(self):
+        if self.level == VERBOSE_OFF:
+            return
+        st = torch._C._verbose.mkldnn_set_verbose(self.level)
+        assert (
+            st
+        ), "Failed to set MKLDNN into verbose mode. Please consider to disable this verbose scope."
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        torch._C._verbose.mkldnn_set_verbose(VERBOSE_OFF)
+        return False
+
+
+def set_flags(_enabled):
+    orig_flags = (torch._C._get_mkldnn_enabled(),)
+    torch._C._set_mkldnn_enabled(_enabled)
+    return orig_flags
+
+
+@contextmanager
+def flags(enabled=False):
+    with __allow_nonbracketed_mutation():
+        orig_flags = set_flags(enabled)
+    try:
+        yield
+    finally:
+        with __allow_nonbracketed_mutation():
+            set_flags(orig_flags[0])
+
+
+class MkldnnModule(PropModule):
+    def __init__(self, m, name):
+        super().__init__(m, name)
+
+    enabled = ContextProp(torch._C._get_mkldnn_enabled, torch._C._set_mkldnn_enabled)
+
+
+if TYPE_CHECKING:
+    enabled: ContextProp
+
+
+# Cool stuff from torch/backends/cudnn/__init__.py and
+# https://stackoverflow.com/questions/2447353/getattr-on-a-module/7668273#7668273
+sys.modules[__name__] = MkldnnModule(sys.modules[__name__], __name__)
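Once the module is replaced by `MkldnnModule`, the flag can be read as an attribute or scoped with `flags`; a minimal sketch:

import torch

# Attribute-style global toggle (intercepted by MkldnnModule above).
print(torch.backends.mkldnn.enabled)

# Scoped disabling of oneDNN-backed kernels; the flag is restored on exit.
with torch.backends.mkldnn.flags(enabled=False):
    x = torch.randn(1, 3, 8, 8)
    y = torch.nn.functional.conv2d(x, torch.randn(4, 3, 3, 3))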
diff --git a/MLPY/Lib/site-packages/torch/backends/mkldnn/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/backends/mkldnn/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..debe775ccc8916b83ab91d237fd1089c53523774
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/backends/mkldnn/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/backends/mps/__init__.py b/MLPY/Lib/site-packages/torch/backends/mps/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..84f2a190303c90d8a73a9825835a2d0a3db5cd04
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/backends/mps/__init__.py
@@ -0,0 +1,54 @@
+from functools import lru_cache as _lru_cache
+
+from typing import Optional
+
+import torch
+from ...library import Library as _Library
+
+__all__ = ["is_built", "is_available", "is_macos13_or_newer", "is_macos_or_newer"]
+
+
+def is_built() -> bool:
+    r"""Return whether PyTorch is built with MPS support.
+
+    Note that this doesn't necessarily mean MPS is available; just that
+    if this PyTorch binary were run on a machine with working MPS drivers
+    and devices, we would be able to use it.
+    """
+    return torch._C._has_mps
+
+
+@_lru_cache
+def is_available() -> bool:
+    r"""Return a bool indicating if MPS is currently available."""
+    return torch._C._mps_is_available()
+
+
+@_lru_cache
+def is_macos_or_newer(major: int, minor: int) -> bool:
+    r"""Return a bool indicating whether MPS is running on given MacOS or newer."""
+    return torch._C._mps_is_on_macos_or_newer(major, minor)
+
+
+@_lru_cache
+def is_macos13_or_newer(minor: int = 0) -> bool:
+    r"""Return a bool indicating whether MPS is running on MacOS 13 or newer."""
+    return torch._C._mps_is_on_macos_or_newer(13, minor)
+
+
+_lib: Optional[_Library] = None
+
+
+def _init():
+    r"""Register prims as implementation of var_mean and group_norm."""
+    global _lib
+    if is_built() is False or _lib is not None:
+        return
+    from ..._decomp.decompositions import (
+        native_group_norm_backward as _native_group_norm_backward,
+    )
+    from ..._refs import native_group_norm as _native_group_norm
+
+    _lib = _Library("aten", "IMPL")
+    _lib.impl("native_group_norm", _native_group_norm, "MPS")
+    _lib.impl("native_group_norm_backward", _native_group_norm_backward, "MPS")
diff --git a/MLPY/Lib/site-packages/torch/backends/mps/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/backends/mps/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3c6991284813119e55b744feee9075ce387aa0f3
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/backends/mps/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/backends/nnpack/__init__.py b/MLPY/Lib/site-packages/torch/backends/nnpack/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..938ed29a44572a9f529dda1632dfed915c006a28
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/backends/nnpack/__init__.py
@@ -0,0 +1,30 @@
+from contextlib import contextmanager
+
+import torch
+from torch.backends import __allow_nonbracketed_mutation, ContextProp, PropModule
+
+__all__ = ["is_available", "flags", "set_flags"]
+
+
+def is_available():
+    r"""Return whether PyTorch is built with NNPACK support."""
+    return torch._nnpack_available()
+
+
+def set_flags(_enabled):
+    r"""Set if nnpack is enabled globally"""
+    orig_flags = (torch._C._get_nnpack_enabled(),)
+    torch._C._set_nnpack_enabled(_enabled)
+    return orig_flags
+
+
+@contextmanager
+def flags(enabled=False):
+    r"""Context manager for setting if nnpack is enabled globally"""
+    with __allow_nonbracketed_mutation():
+        orig_flags = set_flags(enabled)
+    try:
+        yield
+    finally:
+        with __allow_nonbracketed_mutation():
+            set_flags(orig_flags[0])
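Usage mirrors the other backend modules; a minimal sketch that temporarily disables NNPACK-backed kernels:

import torch

if torch.backends.nnpack.is_available():
    # The flag is restored to its previous value on exit.
    with torch.backends.nnpack.flags(enabled=False):
        x = torch.randn(1, 1, 16, 16)
        w = torch.randn(4, 1, 3, 3)
        y = torch.nn.functional.conv2d(x, w)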
diff --git a/MLPY/Lib/site-packages/torch/backends/nnpack/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/backends/nnpack/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..119488f907ce85e1c0e72803bda834455e3e5912
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/backends/nnpack/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/backends/openmp/__init__.py b/MLPY/Lib/site-packages/torch/backends/openmp/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6e0afd5a0e58f82d010db5775cae6bf46c48336
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/backends/openmp/__init__.py
@@ -0,0 +1,6 @@
+import torch
+
+
+def is_available():
+    r"""Return whether PyTorch is built with OpenMP support."""
+    return torch._C.has_openmp
diff --git a/MLPY/Lib/site-packages/torch/backends/openmp/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/backends/openmp/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..95f885e201da04103a00da70fa3d1d38eaa0d12b
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/backends/openmp/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/backends/opt_einsum/__init__.py b/MLPY/Lib/site-packages/torch/backends/opt_einsum/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..26903bfe75b143c34d00dca97e9a03b0b45f4c29
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/backends/opt_einsum/__init__.py
@@ -0,0 +1,110 @@
+import sys
+import warnings
+from contextlib import contextmanager
+from functools import lru_cache as _lru_cache
+from typing import Any
+
+from torch.backends import __allow_nonbracketed_mutation, ContextProp, PropModule
+
+try:
+    import opt_einsum as _opt_einsum  # type: ignore[import]
+except ImportError:
+    _opt_einsum = None
+
+
+@_lru_cache
+def is_available() -> bool:
+    r"""Return a bool indicating if opt_einsum is currently available."""
+    return _opt_einsum is not None
+
+
+def get_opt_einsum() -> Any:
+    r"""Return the opt_einsum package if opt_einsum is currently available, else None."""
+    return _opt_einsum
+
+
+def _set_enabled(_enabled: bool) -> None:
+    if not is_available() and _enabled:
+        raise ValueError(
+            f"opt_einsum is not available, so setting `enabled` to {_enabled} will not reap "
+            "the benefits of calculating an optimal path for einsum. torch.einsum will "
+            "fall back to contracting from left to right. To enable this optimal path "
+            "calculation, please install opt-einsum."
+        )
+    global enabled
+    enabled = _enabled
+
+
+def _get_enabled() -> bool:
+    return enabled
+
+
+def _set_strategy(_strategy: str) -> None:
+    if not is_available():
+        raise ValueError(
+            f"opt_einsum is not available, so setting `strategy` to {_strategy} will not be meaningful. "
+            "torch.einsum will bypass path calculation and simply contract from left to right. "
+            "Please install opt_einsum or unset `strategy`."
+        )
+    if not enabled:
+        raise ValueError(
+            f"opt_einsum is not enabled, so setting a `strategy` to {_strategy} will not be meaningful. "
+            "torch.einsum will bypass path calculation and simply contract from left to right. "
+            "Please set `enabled` to `True` as well or unset `strategy`."
+        )
+    if _strategy not in ["auto", "greedy", "optimal"]:
+        raise ValueError(
+            f"`strategy` must be one of the following: [auto, greedy, optimal] but is {_strategy}"
+        )
+    global strategy
+    strategy = _strategy
+
+
+def _get_strategy() -> str:
+    return strategy
+
+
+def set_flags(_enabled=None, _strategy=None):
+    orig_flags = (enabled, None if not is_available() else strategy)
+    if _enabled is not None:
+        _set_enabled(_enabled)
+    if _strategy is not None:
+        _set_strategy(_strategy)
+    return orig_flags
+
+
+@contextmanager
+def flags(enabled=None, strategy=None):
+    with __allow_nonbracketed_mutation():
+        orig_flags = set_flags(enabled, strategy)
+    try:
+        yield
+    finally:
+        # recover the previous values
+        with __allow_nonbracketed_mutation():
+            set_flags(*orig_flags)
+
+
+# The magic here is to allow us to intercept code like this:
+#
+#   torch.backends.opt_einsum.enabled = True
+
+
+class OptEinsumModule(PropModule):
+    def __init__(self, m, name):
+        super().__init__(m, name)
+
+    global enabled
+    enabled = ContextProp(_get_enabled, _set_enabled)
+    global strategy
+    strategy = None
+    if is_available():
+        strategy = ContextProp(_get_strategy, _set_strategy)
+
+
+# This is the sys.modules replacement trick, see
+# https://stackoverflow.com/questions/2447353/getattr-on-a-module/7668273#7668273
+sys.modules[__name__] = OptEinsumModule(sys.modules[__name__], __name__)
+
+enabled = True if is_available() else False
+strategy = "auto" if is_available() else None
diff --git a/MLPY/Lib/site-packages/torch/backends/opt_einsum/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/backends/opt_einsum/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d3ad4df3e554ed64270eaafd6dcfd265b0a67987
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/backends/opt_einsum/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/backends/quantized/__init__.py b/MLPY/Lib/site-packages/torch/backends/quantized/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d715f6d4acc1750e76482ab06f3792974b22591d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/backends/quantized/__init__.py
@@ -0,0 +1,65 @@
+import sys
+import types
+from typing import List
+
+import torch
+
+
+# This function should correspond to the enums present in c10/core/QEngine.h
+def _get_qengine_id(qengine: str) -> int:
+    if qengine == "none" or qengine == "" or qengine is None:
+        ret = 0
+    elif qengine == "fbgemm":
+        ret = 1
+    elif qengine == "qnnpack":
+        ret = 2
+    elif qengine == "onednn":
+        ret = 3
+    elif qengine == "x86":
+        ret = 4
+    else:
+        ret = -1
+        raise RuntimeError(f"{qengine} is not a valid value for quantized engine")
+    return ret
+
+
+# This function should correspond to the enums present in c10/core/QEngine.h
+def _get_qengine_str(qengine: int) -> str:
+    all_engines = {0: "none", 1: "fbgemm", 2: "qnnpack", 3: "onednn", 4: "x86"}
+    return all_engines.get(qengine, "*undefined")
+
+
+class _QEngineProp:
+    def __get__(self, obj, objtype) -> str:
+        return _get_qengine_str(torch._C._get_qengine())
+
+    def __set__(self, obj, val: str) -> None:
+        torch._C._set_qengine(_get_qengine_id(val))
+
+
+class _SupportedQEnginesProp:
+    def __get__(self, obj, objtype) -> List[str]:
+        qengines = torch._C._supported_qengines()
+        return [_get_qengine_str(qe) for qe in qengines]
+
+    def __set__(self, obj, val) -> None:
+        raise RuntimeError("Assignment not supported")
+
+
+class QuantizedEngine(types.ModuleType):
+    def __init__(self, m, name):
+        super().__init__(name)
+        self.m = m
+
+    def __getattr__(self, attr):
+        return self.m.__getattribute__(attr)
+
+    engine = _QEngineProp()
+    supported_engines = _SupportedQEnginesProp()
+
+
+# This is the sys.modules replacement trick, see
+# https://stackoverflow.com/questions/2447353/getattr-on-a-module/7668273#7668273
+sys.modules[__name__] = QuantizedEngine(sys.modules[__name__], __name__)
+engine: str
+supported_engines: List[str]
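A short sketch of reading and switching the quantized engine through the replaced module:

import torch

# The replacement module exposes both properties defined above.
print(torch.backends.quantized.supported_engines)  # e.g. ['none', 'fbgemm', ...]
if "fbgemm" in torch.backends.quantized.supported_engines:
    torch.backends.quantized.engine = "fbgemm"
print(torch.backends.quantized.engine)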
diff --git a/MLPY/Lib/site-packages/torch/backends/quantized/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/backends/quantized/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fdae660aae7aa884c6cda8f709c7c6f52885bc72
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/backends/quantized/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/backends/xeon/__init__.py b/MLPY/Lib/site-packages/torch/backends/xeon/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/backends/xeon/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/backends/xeon/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..59cceff859ff5e0bec1d99334db3f7748b4c3835
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/backends/xeon/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/backends/xeon/__pycache__/run_cpu.cpython-39.pyc b/MLPY/Lib/site-packages/torch/backends/xeon/__pycache__/run_cpu.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3970883ac193e06ca1de56dd1beeb5fa98d887d8
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/backends/xeon/__pycache__/run_cpu.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/backends/xeon/run_cpu.py b/MLPY/Lib/site-packages/torch/backends/xeon/run_cpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..cdfe445abad86c8140f4b85f176b8fa1d0c799b0
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/backends/xeon/run_cpu.py
@@ -0,0 +1,929 @@
+"""
+This is a script for launching PyTorch inference on Intel(R) Xeon(R) Scalable Processors with optimal configurations.
+
+Both single-instance and multi-instance inference are enabled.
+
+Note: term "instance" here doesn't refer to a cloud instance. This script is executed as a single process. It invokes
+multiple "instances" which are formed from multiple threads for each. "instance" is kind of group of threads in this
+context.
+
+Illustrated as below:
+
+::
+
+    +-----------------------------+----------------------+-------+
+    |            process          |        thread        | core  |
+    +=============================+======================+=======+
+    | torch.backends.xeon.run_cpu | instance 0: thread 0 |   0   |
+    |                             |             thread 1 |   1   |
+    |                             +----------------------+-------+
+    |                             | instance 1: thread 0 |   2   |
+    |                             |             thread 1 |   3   |
+    |                             +----------------------+-------+
+    |                             | ...                  |  ...  |
+    |                             +----------------------+-------+
+    |                             | instance N: thread 0 |   M   |
+    |                             |             thread 1 |  M+1  |
+    +-----------------------------+----------------------+-------+
+
+To get the peak performance on Intel(R) Xeon(R) Scalable Processors, the script optimizes the configuration of thread and memory
+management. For thread management, the script configures thread affinity and the preload of the Intel OMP library.
+For memory management, it configures NUMA binding and preloads an optimized memory allocation library (e.g. tcmalloc, jemalloc).
+
+Environment variables that will be set by this script:
+
++------------------+-------------------------------------------------------------------------------------------------+
+| Environ Variable |                                             Value                                               |
++==================+=================================================================================================+
+|    LD_PRELOAD    | Depending on knobs you set, <lib>/libiomp5.so, <lib>/libjemalloc.so, <lib>/libtcmalloc.so might |
+|                  | be appended to LD_PRELOAD.                                                                      |
++------------------+-------------------------------------------------------------------------------------------------+
+|   KMP_AFFINITY   | If libiomp5.so is preloaded, KMP_AFFINITY could be set to "granularity=fine,compact,1,0".       |
++------------------+-------------------------------------------------------------------------------------------------+
+|   KMP_BLOCKTIME  | If libiomp5.so is preloaded, KMP_BLOCKTIME is set to "1".                                       |
++------------------+-------------------------------------------------------------------------------------------------+
+|  OMP_NUM_THREADS | value of ncores_per_instance                                                                    |
++------------------+-------------------------------------------------------------------------------------------------+
+|    MALLOC_CONF   | If libjemalloc.so is preloaded, MALLOC_CONF will be set to                                      |
+|                  | "oversize_threshold:1,background_thread:true,metadata_thp:auto".                                |
++------------------+-------------------------------------------------------------------------------------------------+
+
+*Note*: This script respects environment variables that are already set. I.e., if you set the environment variables
+mentioned above before running the script, the script will not overwrite them.
+
+How to use this module:
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Single instance inference
+-------------------------
+
+1. Run single-instance inference on a single node with all CPU nodes.
+
+::
+
+   python -m torch.backends.xeon.run_cpu --throughput-mode script.py args
+
+2. Run single-instance inference on a single CPU node.
+
+::
+
+   python -m torch.backends.xeon.run_cpu --node-id 1 script.py args
+
+Multi-instance inference
+------------------------
+
+1. Multi-instance
+   By default this tool runs one process per node. If you want to set the number of instances and cores per instance,
+   --ninstances and --ncores-per-instance should be set.
+
+::
+
+   python -m torch.backends.xeon.run_cpu -- python_script args
+
+   eg: on an Intel(R) Xeon(R) Scalable Processor with 14 instances, 4 cores per instance
+
+::
+
+   python -m torch.backends.xeon.run_cpu --ninstances 14 --ncores-per-instance 4 python_script args
+
+2. Run single-instance inference among multiple instances.
+   By default, all ninstances are run. To run a single instance independently among the ninstances, specify --rank.
+
+   eg: run the 0th instance on an Intel(R) Xeon(R) Scalable Processor with 2 instances (i.e., numactl -C 0-27)
+
+::
+
+   python -m torch.backends.xeon.run_cpu --ninstances 2 --rank 0 python_script args
+
+   eg: run the 1st instance on an Intel(R) Xeon(R) Scalable Processor with 2 instances (i.e., numactl -C 28-55)
+
+::
+
+   python -m torch.backends.xeon.run_cpu --ninstances 2 --rank 1 python_script args
+
+   eg: run the 0th instance on an Intel(R) Xeon(R) Scalable Processor with 2 instances, 2 cores per instance,
+   using the first four cores (i.e., numactl -C 0-1)
+
+::
+
+   python -m torch.backends.xeon.run_cpu --core-list "0, 1, 2, 3" --ninstances 2 --ncores-per-instance 2
+   --rank 0 python_script args
+
+3. To look up what optional arguments this module offers:
+
+::
+
+    python -m torch.backends.xeon.run_cpu --help
+
+Memory allocator
+----------------
+
+"--enable-tcmalloc" and "--enable-jemalloc" can be used to enable different memory allcator.
+
+"""
+
+import glob
+import logging
+import os
+import platform
+import re
+import subprocess
+import sys
+from argparse import ArgumentParser, RawTextHelpFormatter, REMAINDER
+from os.path import expanduser
+from typing import Dict, List
+
+from torch.distributed.elastic.multiprocessing import (
+    DefaultLogsSpecs,
+    start_processes,
+    Std,
+)
+
+format_str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+logging.basicConfig(level=logging.INFO, format=format_str)
+logger = logging.getLogger(__name__)
+
+
+class _CPUinfo:
+    """Get CPU information, such as cores list and NUMA information."""
+
+    def __init__(self, test_input=""):
+        self.cpuinfo = []
+        if platform.system() in ["Windows", "Darwin"]:
+            raise RuntimeError(f"{platform.system()} is not supported!!!")
+        elif platform.system() == "Linux":
+            # Sample output of: `lscpu --parse=CPU,Core,Socket,Node`
+            #
+            # # The following is the parsable format, which can be fed to other
+            # # programs. Each different item in every column has an unique ID
+            # # starting from zero.
+            # # CPU,Core,Socket,Node
+            # 0,0,0,0
+            # 1,1,0,0
+            # ...
+            if test_input == "":
+                lscpu_cmd = ["lscpu", "--parse=CPU,Core,Socket,Node"]
+                lscpu_info = subprocess.check_output(
+                    lscpu_cmd, universal_newlines=True
+                ).split("\n")
+            else:
+                lscpu_info = test_input.split("\n")
+
+            # Get information about CPU, core, socket and node
+            for line in lscpu_info:
+                pattern = r"^([\d]+,[\d]+,[\d]+,[\d]?)"
+                regex_out = re.search(pattern, line)
+                if regex_out:
+                    self.cpuinfo.append(regex_out.group(1).strip().split(","))
+
+            # physical cores := core column in lscpu output
+            #  logical cores := CPU column in lscpu output
+            self.node_nums = int(max([line[3] for line in self.cpuinfo])) + 1
+            self.node_physical_cores: List[List[int]] = []  # node_id is index
+            self.node_logical_cores: List[List[int]] = []  # node_id is index
+            self.physical_core_node_map = {}  # physical core to numa node id
+            self.logical_core_node_map = {}  # logical core to numa node id
+
+            for node_id in range(self.node_nums):
+                cur_node_physical_core = []
+                cur_node_logical_core = []
+                for cpuinfo in self.cpuinfo:
+                    nid = cpuinfo[3] if cpuinfo[3] != "" else "0"
+                    if node_id == int(nid):
+                        if int(cpuinfo[1]) not in cur_node_physical_core:
+                            cur_node_physical_core.append(int(cpuinfo[1]))
+                            self.physical_core_node_map[int(cpuinfo[1])] = int(node_id)
+                        cur_node_logical_core.append(int(cpuinfo[0]))
+                        self.logical_core_node_map[int(cpuinfo[0])] = int(node_id)
+                self.node_physical_cores.append(cur_node_physical_core)
+                self.node_logical_cores.append(cur_node_logical_core)
+
+    def _physical_core_nums(self):
+        return len(self.node_physical_cores) * len(self.node_physical_cores[0])
+
+    def _logical_core_nums(self):
+        return len(self.node_logical_cores) * len(self.node_logical_cores[0])
+
+    def get_node_physical_cores(self, node_id):
+        if node_id < 0 or node_id > self.node_nums - 1:
+            raise ValueError(
+                f"Invalid node id: {node_id}. Valid node ids: {list(range(len(self.node_physical_cores)))}"
+            )
+        return self.node_physical_cores[node_id]
+
+    def get_node_logical_cores(self, node_id):
+        if node_id < 0 or node_id > self.node_nums - 1:
+            raise ValueError(
+                f"Invalid node id: {node_id}. Valid node ids: {list(range(len(self.node_physical_cores)))}"
+            )
+        return self.node_logical_cores[node_id]
+
+    def get_all_physical_cores(self):
+        all_cores = []
+        for cores in self.node_physical_cores:
+            all_cores.extend(cores)
+        return all_cores
+
+    def get_all_logical_cores(self):
+        all_cores = []
+        for cores in self.node_logical_cores:
+            all_cores.extend(cores)
+        return all_cores
+
+    def numa_aware_check(self, core_list):
+        """
+        Check whether all cores in core_list are in the same NUMA node.
+
+        Crossing NUMA nodes will reduce performance.
+        We strongly advise against using cores on different nodes.
+        """
+        cores_numa_map = self.logical_core_node_map
+        numa_ids = []
+        for core in core_list:
+            numa_id = cores_numa_map[core]
+            if numa_id not in numa_ids:
+                numa_ids.append(numa_id)
+        if len(numa_ids) > 1:
+            logger.warning(
+                "Numa Aware: cores:%s on different NUMA nodes:%s. To avoid \
+this behavior, please use --ncores-per-instance knob to make sure number of cores is divisible by --ncores-per-\
+instance. Alternatively, please use --skip-cross-node-cores knob.",
+                str(core_list),
+                str(numa_ids),
+            )
+        if len(numa_ids) == 0:
+            raise RuntimeError(
+                "invalid number of NUMA nodes; please make sure numa_ids >= 1"
+            )
+        return numa_ids
+
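+# A minimal sketch of how the parsed topology can be inspected, assuming the
+# constructor exposes the ``test_input`` string used above and given a
+# hypothetical single-node box with two physical cores and two hyper-threads
+# per core:
+#
+#     info = _CPUinfo(test_input="0,0,0,0\n1,0,0,0\n2,1,0,0\n3,1,0,0")
+#     info.get_node_physical_cores(0)   # -> [0, 1]
+#     info.get_node_logical_cores(0)    # -> [0, 1, 2, 3]
+#     info.numa_aware_check([0, 1])     # -> [0]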
+
+class _Launcher:
+    r"""Class for launcher."""
+
+    msg_lib_notfound = f"Unable to find the {{0}} library file lib{{1}}.so in $CONDA_PREFIX/lib or $VIRTUAL_ENV/lib \
+or /.local/lib/ or /usr/local/lib/ or /usr/local/lib64/ or /usr/lib or /usr/lib64 or \
+{expanduser('~')}/.local/lib/ so the LD_PRELOAD environment variable will not be set."
+
+    def __init__(self):
+        self.cpuinfo = _CPUinfo()
+
+    def add_lib_preload(self, lib_type):
+        """Enable TCMalloc/JeMalloc/intel OpenMP."""
+        library_paths = []
+        if "CONDA_PREFIX" in os.environ:
+            library_paths.append(f"{os.environ['CONDA_PREFIX']}/lib")
+        if "VIRTUAL_ENV" in os.environ:
+            library_paths.append(f"{os.environ['VIRTUAL_ENV']}/lib")
+
+        library_paths += [
+            f"{expanduser('~')}/.local/lib",
+            "/usr/local/lib",
+            "/usr/local/lib64",
+            "/usr/lib",
+            "/usr/lib64",
+        ]
+
+        lib_find = False
+        lib_set = False
+        for item in os.getenv("LD_PRELOAD", "").split(":"):
+            if item.endswith(f"lib{lib_type}.so"):
+                lib_set = True
+                break
+        if not lib_set:
+            for lib_path in library_paths:
+                library_file = os.path.join(lib_path, f"lib{lib_type}.so")
+                matches = glob.glob(library_file)
+                if len(matches) > 0:
+                    ld_preloads = [f"{matches[0]}", os.getenv("LD_PRELOAD", "")]
+                    os.environ["LD_PRELOAD"] = os.pathsep.join(
+                        [p.strip(os.pathsep) for p in ld_preloads if p]
+                    )
+                    lib_find = True
+                    break
+        return lib_set or lib_find
+
+    def is_numactl_available(self):
+        numactl_available = False
+        try:
+            cmd = ["numactl", "-C", "0", "-m", "0", "hostname"]
+            r = subprocess.run(
+                cmd,
+                env=os.environ,
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL,
+                check=False,
+            )
+            if r.returncode == 0:
+                numactl_available = True
+        except Exception:
+            pass
+        return numactl_available
+
+    def set_memory_allocator(
+        self, enable_tcmalloc=True, enable_jemalloc=False, use_default_allocator=False
+    ):
+        """
+        Enable TCMalloc/JeMalloc with LD_PRELOAD and set configuration for JeMalloc.
+
+        By default, PyTorch uses PTMalloc, but TCMalloc and JeMalloc can achieve better
+        memory reuse and reduce page faults, improving performance.
+        """
+        if enable_tcmalloc and enable_jemalloc:
+            raise RuntimeError(
+                "Unable to enable TCMalloc and JEMalloc at the same time."
+            )
+
+        if enable_tcmalloc:
+            find_tc = self.add_lib_preload(lib_type="tcmalloc")
+            if not find_tc:
+                msg = f'{self.msg_lib_notfound} you can use "conda install -c conda-forge gperftools" to install {{0}}'
+                logger.warning(msg.format("TCmalloc", "tcmalloc"))  # noqa: G001
+            else:
+                logger.info("Use TCMalloc memory allocator")
+
+        elif enable_jemalloc:
+            find_je = self.add_lib_preload(lib_type="jemalloc")
+            if not find_je:
+                msg = f'{self.msg_lib_notfound} you can use "conda install -c conda-forge jemalloc" to install {{0}}'
+                logger.warning(msg.format("Jemalloc", "jemalloc"))  # noqa: G001
+            else:
+                logger.info("Use JeMalloc memory allocator")
+                self.set_env(
+                    "MALLOC_CONF",
+                    "oversize_threshold:1,background_thread:true,metadata_thp:auto",
+                )
+
+        elif use_default_allocator:
+            pass
+
+        else:
+            find_tc = self.add_lib_preload(lib_type="tcmalloc")
+            if find_tc:
+                logger.info("Use TCMalloc memory allocator")
+                return
+            find_je = self.add_lib_preload(lib_type="jemalloc")
+            if find_je:
+                logger.info("Use JeMalloc memory allocator")
+                return
+            logger.warning(
+                """Neither TCMalloc nor JeMalloc is found in $CONDA_PREFIX/lib or $VIRTUAL_ENV/lib
+                            or /.local/lib/ or /usr/local/lib/ or /usr/local/lib64/ or /usr/lib or /usr/lib64 or
+                           %s/.local/lib/ so the LD_PRELOAD environment variable will not be set.
+                           This may drop the performance""",
+                expanduser("~"),
+            )
+
+    def log_env_var(self, env_var_name=""):
+        if env_var_name in os.environ:
+            logger.info("%s=%s", env_var_name, os.environ[env_var_name])
+
+    def set_env(self, env_name, env_value):
+        if not env_value:
+            logger.warning("%s is None", env_name)
+        if env_name not in os.environ:
+            os.environ[env_name] = env_value
+        elif os.environ[env_name] != env_value:
+            logger.warning(
+                "Overriding value with the one set in environment variable: %s. \
+Value applied: %s. Value ignored: %s",
+                env_name,
+                os.environ[env_name],
+                env_value,
+            )
+        self.log_env_var(env_name)
+
+    # set_kmp_affinity controls whether KMP_AFFINITY is set or not.
+    # In the scenario that uses all cores on all nodes, including logical cores,
+    # setting KMP_AFFINITY disables the logical cores, so KMP_AFFINITY should not be set.
+    def set_multi_thread_and_allocator(
+        self,
+        ncores_per_instance,
+        disable_iomp=False,
+        set_kmp_affinity=True,
+        enable_tcmalloc=True,
+        enable_jemalloc=False,
+        use_default_allocator=False,
+    ):
+        """
+        Set the multi-thread configuration and enable Intel OpenMP and TCMalloc/JeMalloc.
+
+        By default, PyTorch uses GNU OpenMP and PTMalloc, but Intel OpenMP and TCMalloc/JeMalloc
+        are better alternatives for performance.
+        """
+        self.set_memory_allocator(
+            enable_tcmalloc, enable_jemalloc, use_default_allocator
+        )
+        self.set_env("OMP_NUM_THREADS", str(ncores_per_instance))
+        if not disable_iomp:
+            find_iomp = self.add_lib_preload(lib_type="iomp5")
+            if not find_iomp:
+                msg = f'{self.msg_lib_notfound} you can use "conda install mkl" to install {{0}}'
+                logger.warning(msg.format("iomp", "iomp5"))  # noqa: G001
+            else:
+                logger.info("Using Intel OpenMP")
+                if set_kmp_affinity:
+                    self.set_env("KMP_AFFINITY", "granularity=fine,compact,1,0")
+                self.set_env("KMP_BLOCKTIME", "1")
+        self.log_env_var("LD_PRELOAD")
+
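+    # A rough sketch of the environment that set_multi_thread_and_allocator()
+    # produces for a 14-core instance when Intel OpenMP and TCMalloc are found
+    # (library paths are illustrative only):
+    #
+    #   OMP_NUM_THREADS=14
+    #   KMP_AFFINITY=granularity=fine,compact,1,0
+    #   KMP_BLOCKTIME=1
+    #   LD_PRELOAD=.../libiomp5.so:.../libtcmalloc.so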
+    r"""
+     Launcher for single instance and multi-instance
+     """
+
+    def launch(self, args):
+        cores = []
+        set_kmp_affinity = True
+        enable_taskset = False
+        if args.core_list:  # the user specified which cores to use via --core-list
+            cores = [int(x) for x in args.core_list.split(",")]
+            if args.ncores_per_instance == -1:
+                raise RuntimeError(
+                    'please specify "--ncores-per-instance" if you have passed the --core-list parameter'
+                )
+            elif (
+                args.ninstances > 1
+                and args.ncores_per_instance * args.ninstances < len(cores)
+            ):
+                logger.warning(
+                    "only first %s cores will be used, \
+but you specify %s cores in core_list",
+                    args.ncores_per_instance * args.ninstances,
+                    len(cores),
+                )
+            else:
+                args.ninstances = len(cores) // args.ncores_per_instance
+
+        else:
+            if args.use_logical_core:
+                if args.node_id != -1:
+                    cores = self.cpuinfo.get_node_logical_cores(args.node_id)
+                else:
+                    cores = self.cpuinfo.get_all_logical_cores()
+                    # When using all cores on all nodes, including logical cores,
+                    # setting KMP_AFFINITY disables logical cores. Thus, KMP_AFFINITY should not be set.
+                    set_kmp_affinity = False
+            else:
+                if args.node_id != -1:
+                    cores = self.cpuinfo.get_node_physical_cores(args.node_id)
+                else:
+                    cores = self.cpuinfo.get_all_physical_cores()
+            if (
+                not args.multi_instance
+                and args.ninstances == -1
+                and args.ncores_per_instance == -1
+            ):
+                args.ninstances = 1
+                args.ncores_per_instance = len(cores)
+            elif (
+                args.multi_instance
+                and args.ninstances == -1
+                and args.ncores_per_instance == -1
+            ):
+                args.throughput_mode = True
+            elif args.ncores_per_instance == -1 and args.ninstances != -1:
+                if args.ninstances > len(cores):
+                    raise RuntimeError(
+                        f"there are {len(cores)} total cores but you specify {args.ninstances} ninstances; \
+please make sure ninstances <= total_cores)"
+                    )
+                else:
+                    args.ncores_per_instance = len(cores) // args.ninstances
+            elif args.ncores_per_instance != -1 and args.ninstances == -1:
+                if not args.skip_cross_node_cores:
+                    args.ninstances = len(cores) // args.ncores_per_instance
+                else:
+                    ncore_per_node = len(self.cpuinfo.node_physical_cores[0])
+                    num_leftover_cores = ncore_per_node % args.ncores_per_instance
+                    if args.ncores_per_instance > ncore_per_node:
+                        # too many ncores_per_instance to skip cross-node cores
+                        logger.warning(
+                            "there are %s core(s) per socket, but you specify %s ncores_per_instance and \
+skip_cross_node_cores. Please make sure --ncores-per-instance < core(s) per \
+socket",
+                            ncore_per_node,
+                            args.ncores_per_instance,
+                        )
+                        sys.exit(-1)
+                    elif num_leftover_cores == 0:
+                        # aren't any cross-node cores
+                        logger.info(
+                            "--skip-cross-node-cores is set, but there are no cross-node cores."
+                        )
+                        args.ninstances = len(cores) // args.ncores_per_instance
+                    else:
+                        # skip cross-node cores
+                        if args.ninstances != -1:
+                            logger.warning(
+                                "--skip-cross-node-cores is exclusive to --ninstances. --ninstances \
+won't take effect even if it is set explicitly."
+                            )
+
+                        i = 1
+                        leftover_cores = set()
+                        while ncore_per_node * i <= len(cores):
+                            leftover_cores.update(
+                                cores[
+                                    ncore_per_node * i
+                                    - num_leftover_cores : ncore_per_node * i
+                                ]
+                            )
+                            i += 1
+                        cores = list(set(cores) - leftover_cores)
+                        assert len(cores) % args.ncores_per_instance == 0
+                        args.ninstances = len(cores) // args.ncores_per_instance
+            else:
+                if args.ninstances * args.ncores_per_instance > len(cores):
+                    raise RuntimeError(
+                        "Please make sure ninstances * ncores_per_instance <= total_cores"
+                    )
+            if args.latency_mode:
+                logger.warning(
+                    "--latency-mode is exclusive to --ninstances, --ncores-per-instance, --node-id and \
+--use-logical-core. They won't take effect even they are set explicitly."
+                )
+                args.ncores_per_instance = 4
+                cores = self.cpuinfo.get_all_physical_cores()
+                args.ninstances = len(cores) // args.ncores_per_instance
+
+            if args.throughput_mode:
+                logger.warning(
+                    "--throughput-mode is exclusive to --ninstances, --ncores-per-instance, --node-id and \
+--use-logical-core. They won't take effect even they are set explicitly."
+                )
+                args.ninstances = self.cpuinfo.node_nums
+                cores = self.cpuinfo.get_all_physical_cores()
+                args.ncores_per_instance = len(cores) // args.ninstances
+
+        if args.ninstances > 1 and args.rank != -1:
+            logger.info(
+                "assigning %s cores for instance %s",
+                args.ncores_per_instance,
+                args.rank,
+            )
+
+        if not args.disable_numactl:
+            numactl_available = self.is_numactl_available()
+            if not numactl_available:
+                if not args.disable_taskset:
+                    logger.warning(
+                        "Core binding with numactl is not available. Disabling numactl and using taskset instead. \
+                    This may affect performance in multi-socket system; please use numactl if memory binding is needed."
+                    )
+                    args.disable_numactl = True
+                    enable_taskset = True
+                else:
+                    logger.warning(
+                        "Core binding with numactl is not available, and --disable_taskset is set. \
+                    Please unset --disable_taskset to use taskset instead of numactl."
+                    )
+                    sys.exit(-1)
+
+        if not args.disable_taskset:
+            enable_taskset = True
+
+        self.set_multi_thread_and_allocator(
+            args.ncores_per_instance,
+            args.disable_iomp,
+            set_kmp_affinity,
+            args.enable_tcmalloc,
+            args.enable_jemalloc,
+            args.use_default_allocator,
+        )
+        entrypoint = ""
+        launch_args = {}
+        launch_envs: Dict[int, Dict] = {}
+        launch_tee = {}
+        for i in range(args.ninstances):
+            cmd = []
+            cur_process_cores = ""
+            if not args.disable_numactl or enable_taskset:
+                if not args.disable_numactl:
+                    cmd = ["numactl"]
+                elif enable_taskset:
+                    cmd = ["taskset"]
+                cores = sorted(cores)
+                if (
+                    args.rank == -1
+                ):  # sequentially assign ncores_per_instance to ninstances
+                    core_list = cores[
+                        i
+                        * args.ncores_per_instance : (i + 1)
+                        * args.ncores_per_instance
+                    ]
+                else:  # assign ncores_per_instance from rank
+                    core_list = cores[
+                        args.rank
+                        * args.ncores_per_instance : (args.rank + 1)
+                        * args.ncores_per_instance
+                    ]
+
+                core_ranges: List[Dict] = []
+                for core in core_list:
+                    if len(core_ranges) == 0:
+                        range_elem = {"start": core, "end": core}
+                        core_ranges.append(range_elem)
+                    else:
+                        if core - core_ranges[-1]["end"] == 1:
+                            core_ranges[-1]["end"] = core
+                        else:
+                            range_elem = {"start": core, "end": core}
+                            core_ranges.append(range_elem)
+                for r in core_ranges:
+                    cur_process_cores = f"{cur_process_cores}{r['start']}-{r['end']},"
+                cur_process_cores = cur_process_cores[:-1]
+                if not args.disable_numactl:
+                    numa_params = f"-C {cur_process_cores} "
+                    numa_ids = ",".join(
+                        [
+                            str(numa_id)
+                            for numa_id in self.cpuinfo.numa_aware_check(core_list)
+                        ]
+                    )
+                    numa_params += f"-m {numa_ids}"
+                    cmd.extend(numa_params.split())
+                elif enable_taskset:
+                    taskset_params = f"-c {cur_process_cores} "
+                    cmd.extend(taskset_params.split())
+            with_python = not args.no_python
+            if with_python:
+                cmd.append(sys.executable)
+                cmd.append("-u")
+            if args.module:
+                cmd.append("-m")
+            cmd.append(args.program)
+            cmd.extend(args.program_args)
+            cmd_s = " ".join(cmd)
+            logger.info(cmd_s)
+            if entrypoint == "":
+                entrypoint = cmd[0]
+            del cmd[0]
+            launch_args[i] = tuple(cmd)
+            launch_envs[i] = {}
+            launch_tee[i] = Std.ALL
+
+            if args.rank != -1:  # launch only the single instance for this rank
+                break
+
+        ctx = start_processes(
+            name=args.log_file_prefix,
+            entrypoint=entrypoint,
+            args=launch_args,
+            envs=launch_envs,
+            logs_specs=DefaultLogsSpecs(log_dir=args.log_path, tee=launch_tee),
+        )
+        ctx.wait()
+
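+# A rough sketch of the per-instance commands that launch() assembles (core
+# ranges and the script name are illustrative only):
+#
+#   numactl -C 0-13 -m 0 <python> -u your_script.py args...
+#   numactl -C 14-27 -m 0 <python> -u your_script.py args...
+#
+# or, when numactl is unavailable and taskset is used instead:
+#
+#   taskset -c 0-13 <python> -u your_script.py args...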
+
+def _add_memory_allocator_params(parser):
+    group = parser.add_argument_group("Memory Allocator Parameters")
+    # allocator control
+    group.add_argument(
+        "--enable-tcmalloc",
+        "--enable_tcmalloc",
+        action="store_true",
+        default=False,
+        help="Enable tcmalloc allocator",
+    )
+    group.add_argument(
+        "--enable-jemalloc",
+        "--enable_jemalloc",
+        action="store_true",
+        default=False,
+        help="Enable jemalloc allocator",
+    )
+    group.add_argument(
+        "--use-default-allocator",
+        "--use_default_allocator",
+        action="store_true",
+        default=False,
+        help="Use default memory allocator",
+    )
+
+
+def _add_multi_instance_params(parser):
+    group = parser.add_argument_group("Multi-instance Parameters")
+    # multi-instance control
+    group.add_argument(
+        "--ncores-per-instance",
+        "--ncores_per_instance",
+        metavar="\b",
+        default=-1,
+        type=int,
+        help="Cores per instance",
+    )
+    group.add_argument(
+        "--ninstances",
+        metavar="\b",
+        default=-1,
+        type=int,
+        help="For multi-instance, you should give the cores number you used for per instance.",
+    )
+    group.add_argument(
+        "--skip-cross-node-cores",
+        "--skip_cross_node_cores",
+        action="store_true",
+        default=False,
+        help="If specified --ncores-per-instance, skips cross-node cores.",
+    )
+    group.add_argument(
+        "--rank",
+        metavar="\b",
+        default="-1",
+        type=int,
+        help="Specify instance index to assign ncores_per_instance for rank; \
+otherwise ncores_per_instance will be assigned sequentially to ninstances. Please refer to \
+https://github.com/intel/intel-extension-for-pytorch/blob/master/docs/tutorials/performance_tuning/launch_script.md",
+    )
+    group.add_argument(
+        "--latency-mode",
+        "--latency_mode",
+        action="store_true",
+        default=False,
+        help="By default 4 core per instance and use all physical cores",
+    )
+    group.add_argument(
+        "--throughput-mode",
+        "--throughput_mode",
+        action="store_true",
+        default=False,
+        help="By default one instance per node and use all physical cores",
+    )
+    group.add_argument(
+        "--node-id",
+        "--node_id",
+        metavar="\b",
+        default=-1,
+        type=int,
+        help="node id for multi-instance, by default all nodes will be used",
+    )
+    group.add_argument(
+        "--use-logical-core",
+        "--use_logical_core",
+        action="store_true",
+        default=False,
+        help="Whether only use physical cores",
+    )
+    group.add_argument(
+        "--disable-numactl",
+        "--disable_numactl",
+        action="store_true",
+        default=False,
+        help="Disable numactl",
+    )
+    group.add_argument(
+        "--disable-taskset",
+        "--disable_taskset",
+        action="store_true",
+        default=False,
+        help="Disable taskset",
+    )
+    group.add_argument(
+        "--core-list",
+        "--core_list",
+        metavar="\b",
+        default=None,
+        type=str,
+        help='Specify the core list as "core_id,core_id,..."; otherwise, all the cores will be used.',
+    )
+    group.add_argument(
+        "--log-path",
+        "--log_path",
+        metavar="\b",
+        default="",
+        type=str,
+        help="The log file directory. Default path is "
+        ", which means disable logging to files.",
+    )
+    group.add_argument(
+        "--log-file-prefix",
+        "--log_file_prefix",
+        metavar="\b",
+        default="run",
+        type=str,
+        help="log file prefix",
+    )
+
+
+def _add_kmp_iomp_params(parser):
+    group = parser.add_argument_group("IOMP Parameters")
+    group.add_argument(
+        "--disable-iomp",
+        "--disable_iomp",
+        action="store_true",
+        default=False,
+        help="By default, we use Intel OpenMP and libiomp5.so will be add to LD_PRELOAD",
+    )
+
+
+def create_args(parser=None):
+    """
+    Add the command-line options to the given ArgumentParser.
+    """
+    parser.add_argument(
+        "--multi-instance",
+        "--multi_instance",
+        action="store_true",
+        default=False,
+        help="Enable multi-instance, by default one instance per node",
+    )
+
+    parser.add_argument(
+        "-m",
+        "--module",
+        default=False,
+        action="store_true",
+        help="Changes each process to interpret the launch script "
+        "as a python module, executing with the same behavior as"
+        '"python -m".',
+    )
+
+    parser.add_argument(
+        "--no-python",
+        "--no_python",
+        default=False,
+        action="store_true",
+        help='Do not prepend the --program script with "python" - just exec '
+        "it directly. Useful when the script is not a Python script.",
+    )
+
+    _add_memory_allocator_params(parser)
+    _add_kmp_iomp_params(parser)
+
+    _add_multi_instance_params(parser)
+    # positional
+    parser.add_argument(
+        "program",
+        type=str,
+        help="The full path to the program/script to be launched. "
+        "followed by all the arguments for the script",
+    )
+
+    # rest from the training program
+    parser.add_argument("program_args", nargs=REMAINDER)
+
+
+def main(args):
+    env_before = set(os.environ.keys())
+    if platform.system() in ["Windows", "Darwin"]:
+        raise RuntimeError(f"{platform.system()} is not supported!!!")
+
+    if args.log_path:
+        os.makedirs(args.log_path, exist_ok=True)
+    else:
+        args.log_path = os.devnull
+
+    if args.latency_mode and args.throughput_mode:
+        raise RuntimeError(
+            "Either args.latency_mode or args.throughput_mode should be set"
+        )
+
+    if not args.no_python and not args.program.endswith(".py"):
+        raise RuntimeError(
+            'For a non-Python script, you should use the "--no-python" parameter.'
+        )
+
+    # Verify LD_PRELOAD
+    if "LD_PRELOAD" in os.environ:
+        lst_valid = []
+        tmp_ldpreload = os.environ["LD_PRELOAD"]
+        for item in tmp_ldpreload.split(":"):
+            matches = glob.glob(item)
+            if len(matches) > 0:
+                lst_valid.append(item)
+            else:
+                logger.warning("%s doesn't exist. Removing it from LD_PRELOAD.", item)
+        if len(lst_valid) > 0:
+            os.environ["LD_PRELOAD"] = ":".join(lst_valid)
+        else:
+            os.environ["LD_PRELOAD"] = ""
+
+    launcher = _Launcher()
+    launcher.launch(args)
+    for x in sorted(set(os.environ.keys()) - env_before):
+        logger.debug("%s=%s", x, os.environ[x])
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser(
+        description="This is a script for launching PyTorch inference on Intel(R) Xeon(R) Scalable "
+        "Processors with optimal configurations. Single instance inference, "
+        "multi-instance inference are enable. To get the peak performance on Intel(R) "
+        "Xeon(R) Scalable Processors, the script optimizes the configuration "
+        "of thread and memory management. For thread management, the script configures thread "
+        "affinity and the preload of Intel OMP library. For memory management, it configures "
+        "NUMA binding and preload optimized memory allocation library (e.g. tcmalloc, jemalloc) "
+        "\n################################# Basic usage ############################# \n"
+        "\n 1. single instance\n"
+        "\n   >>> python -m torch.backends.xeon.run_cpu python_script args \n"
+        "\n2. multi-instance \n"
+        "\n   >>> python -m torch.backends.xeon.run_cpu --ninstances xxx "
+        "--ncores-per-instance xx python_script args\n"
+        "\n############################################################################# \n",
+        formatter_class=RawTextHelpFormatter,
+    )
+    create_args(parser)
+    args = parser.parse_args()
+    main(args)
diff --git a/MLPY/Lib/site-packages/torch/backends/xnnpack/__init__.py b/MLPY/Lib/site-packages/torch/backends/xnnpack/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4153d7fc4c81126af6fec996619fedc255c264fc
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/backends/xnnpack/__init__.py
@@ -0,0 +1,28 @@
+import sys
+import types
+
+import torch
+
+
+class _XNNPACKEnabled:
+    def __get__(self, obj, objtype):
+        return torch._C._is_xnnpack_enabled()
+
+    def __set__(self, obj, val):
+        raise RuntimeError("Assignment not supported")
+
+
+class XNNPACKEngine(types.ModuleType):
+    def __init__(self, m, name):
+        super().__init__(name)
+        self.m = m
+
+    def __getattr__(self, attr):
+        return self.m.__getattribute__(attr)
+
+    enabled = _XNNPACKEnabled()
+
+
+# This is the sys.modules replacement trick, see
+# https://stackoverflow.com/questions/2447353/getattr-on-a-module/7668273#7668273
+sys.modules[__name__] = XNNPACKEngine(sys.modules[__name__], __name__)
diff --git a/MLPY/Lib/site-packages/torch/backends/xnnpack/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/backends/xnnpack/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9ce784e99210a42c6b922fc17495b530f54ae8ee
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/backends/xnnpack/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/bin/asmjit.dll b/MLPY/Lib/site-packages/torch/bin/asmjit.dll
new file mode 100644
index 0000000000000000000000000000000000000000..75d2557ec3e06bd90e8fc51199ca6dd1c73879d4
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/bin/asmjit.dll differ
diff --git a/MLPY/Lib/site-packages/torch/bin/fbgemm.dll b/MLPY/Lib/site-packages/torch/bin/fbgemm.dll
new file mode 100644
index 0000000000000000000000000000000000000000..c2bafc25ee4fa9f7621a9a9b004d732ac61fab4f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/bin/fbgemm.dll
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e18ad86dae8caa56ebfd9da655f9c8b81d324a35586caf78734d9d0a48aa0518
+size 4961280
diff --git a/MLPY/Lib/site-packages/torch/bin/protoc.exe b/MLPY/Lib/site-packages/torch/bin/protoc.exe
new file mode 100644
index 0000000000000000000000000000000000000000..bc9a6b57f9b243fd57866b26fcedc9fc2bfd1fae
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/bin/protoc.exe
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f10551c6cbc7187ee90ece18ffc24635dc1d308479718919a4807fee6c41551
+size 2812416
diff --git a/MLPY/Lib/site-packages/torch/compiler/__init__.py b/MLPY/Lib/site-packages/torch/compiler/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d17c5fcd57b44644193f2d7cd2519087d041ab68
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/compiler/__init__.py
@@ -0,0 +1,193 @@
+import torch
+from typing import List
+
+__all__ = [
+    "compile",
+    "assume_constant_result",
+    "reset",
+    "allow_in_graph",
+    "list_backends",
+    "disable",
+    "cudagraph_mark_step_begin",
+    "wrap_numpy",
+    "is_compiling",
+    "is_dynamo_compiling",
+]
+
+def compile(*args, **kwargs):
+    """
+    See :func:`torch.compile` for details on the arguments for this function.
+    """
+    return torch.compile(*args, **kwargs)
+
+def reset() -> None:
+    """
+    This function clears all compilation caches and restores the system to its initial state.
+    It is recommended to call this function after using operations like `torch.compile(...)`
+    to ensure a clean state before another, unrelated compilation.
+    """
+    import torch._dynamo
+
+    torch._dynamo.reset()
+
+def allow_in_graph(fn):
+    """
+    Customize which functions compilation will include in the generated graph.
+    It bypasses all introspection of the symbolic Python code in favor of
+    writing it directly to the graph.
+    If ``fn`` is a list or tuple of callables, it recursively applies :func:`allow_in_graph()`
+    to each function and returns a new list or tuple containing the modified functions.
+
+    Args:
+        fn: A callable representing the function to be included in the graph.
+
+    .. warning::
+
+        :func:`allow_in_graph` skips TorchDynamo completely on the decorated function,
+        skipping all TorchDynamo safety checks (graph breaks, handling closures, etc.).
+        Therefore, one has to be very careful with :func:`allow_in_graph`, since subsystems
+        like AOT Autograd rely on TorchDynamo.
+        If not careful, this could lead to soundness and really hard-to-debug issues.
+
+    """
+    import torch._dynamo
+
+    return torch._dynamo.allow_in_graph(fn)
+
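+# A minimal usage sketch (``my_helper`` is a hypothetical function): the allowed
+# helper is written into the captured graph as-is instead of being traced
+# through by TorchDynamo.
+#
+#     def my_helper(x):
+#         return x + 1
+#
+#     torch.compiler.allow_in_graph(my_helper)
+#
+#     @torch.compile(fullgraph=True)
+#     def fn(x):
+#         return my_helper(x) * 2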
+
+def list_backends(exclude_tags=("debug", "experimental")) -> List[str]:
+    """
+    Return valid strings that can be passed to `torch.compile(..., backend="name")`.
+
+    Args:
+        exclude_tags(optional): A tuple of strings representing tags to exclude.
+    """
+    import torch._dynamo
+
+    return torch._dynamo.list_backends(exclude_tags)
+
+def assume_constant_result(fn):
+    """
+    This function is used to mark a function `fn` as having a constant result.
+    This allows the compiler to optimize away your function.
+    Returns the same function `fn`.
+
+    Args:
+        fn: The function to be marked as having a constant result.
+
+    .. warning::
+        `assume_constant_result` can, if the assumption is invalid, cause safety and soundness issues;
+        :func:`torch.compile` will not attempt to validate whether the constant assumption is true or not.
+
+    """
+    import torch._dynamo
+
+    return torch._dynamo.assume_constant_result(fn)
+
+def disable(fn=None, recursive=True):
+    """
+    This function provides both a decorator and a context manager to disable compilation on a function.
+    It also provides the option of recursively disabling called functions.
+
+    Args:
+        fn (optional): The function to disable
+        recursive (optional): A boolean value indicating whether the disabling should be recursive.
+    """
+    import torch._dynamo
+
+    return torch._dynamo.disable(fn, recursive)
+
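+# A minimal usage sketch (``debug_dump`` is a hypothetical function): the
+# decorated function, and anything it calls (``recursive=True`` is the default),
+# is skipped by the compiler.
+#
+#     @torch.compiler.disable
+#     def debug_dump(x):
+#         print(x.shape)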
+def cudagraph_mark_step_begin():
+    """
+    Indicates that a new iteration of inference or training is about to begin.
+
+    CUDA Graphs will free tensors of a prior iteration. A new iteration is started on each invocation of
+    torch.compile, so long as there is not a pending backward that has not been called.
+
+    If that heuristic is wrong, such as in the following example, manually mark it with this API.
+
+    .. code-block:: python
+
+        @torch.compile(mode="reduce-overhead")
+        def rand_foo():
+            return torch.rand([4], device="cuda")
+
+        for _ in range(5):
+            torch.compiler.cudagraph_mark_step_begin()
+            rand_foo() + rand_foo()
+
+    For more details, see the ``torch.compiler_cudagraph_trees`` note in the PyTorch documentation.
+    """
+    from torch._inductor import cudagraph_trees
+
+    cudagraph_trees.mark_step_begin()
+
+def wrap_numpy(fn):
+    r"""Decorator that turns a function from ``np.ndarray``s to ``np.ndarray``s into a function
+    from ``torch.Tensor``s to ``torch.Tensor``s.
+
+    It is designed to be used with :func:`torch.compile` with ``fullgraph=True``. It allows you to
+    compile a NumPy function as if it were a PyTorch function. This lets you run NumPy code
+    on CUDA or compute its gradients.
+
+    .. note::
+
+        This decorator does not work without :func:`torch.compile`.
+
+    Example::
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
+        >>> # Compile a NumPy function as a Tensor -> Tensor function
+        >>> @torch.compile(fullgraph=True)
+        >>> @torch.compiler.wrap_numpy
+        >>> def fn(a: np.ndarray):
+        >>>     return np.sum(a * a)
+        >>> # Execute the NumPy function using Tensors on CUDA and compute the gradients
+        >>> x = torch.arange(6, dtype=torch.float32, device="cuda", requires_grad=True)
+        >>> out = fn(x)
+        >>> out.backward()
+        >>> print(x.grad)
+        tensor([ 0.,  2.,  4.,  6.,  8., 10.], device='cuda:0')
+    """
+    from torch._dynamo.external_utils import wrap_numpy as wrap
+    return wrap(fn)
+
+_is_compiling_flag: bool = False
+
+def is_compiling() -> bool:
+    """
+    Indicates whether a graph is executed/traced as part of torch.compile() or torch.export().
+
+    Note that there are 2 other related flags that should be deprecated eventually:
+      * torch._dynamo.external_utils.is_compiling()
+      * torch._utils.is_compiling()
+
+    Example::
+
+        >>> def forward(self, x):
+        >>>     if not torch.compiler.is_compiling():
+        >>>        ...logic that is not needed in a compiled/traced graph...
+        >>>
+        >>>     ...rest of the function...
+    """
+    if torch.jit.is_scripting():
+        return False
+    else:
+        return _is_compiling_flag
+
+def is_dynamo_compiling() -> bool:
+    """
+    Indicates whether a graph is traced via TorchDynamo.
+
+    It's stricter than the is_compiling() flag, as it is only set to True when
+    TorchDynamo is used.
+
+    Example::
+
+        >>> def forward(self, x):
+        >>>     if not torch.compiler.is_dynamo_compiling():
+        >>>        ...logic that is not needed in a TorchDynamo-traced graph...
+        >>>
+        >>>     ...rest of the function...
+    """
+    return False
diff --git a/MLPY/Lib/site-packages/torch/compiler/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/compiler/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..543c074dd38a49922b284074b3950afa64900790
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/compiler/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/contrib/__init__.py b/MLPY/Lib/site-packages/torch/contrib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/contrib/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/contrib/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9cfe12a556b18ba0d2e6d3940602ceea738971b3
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/contrib/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/contrib/__pycache__/_tensorboard_vis.cpython-39.pyc b/MLPY/Lib/site-packages/torch/contrib/__pycache__/_tensorboard_vis.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c607878f3640d37db6ab12af9567a38d725c7f0d
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/contrib/__pycache__/_tensorboard_vis.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/contrib/_tensorboard_vis.py b/MLPY/Lib/site-packages/torch/contrib/_tensorboard_vis.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b411bb3fd2b52ccf7bd0fe176516789d521fa02
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/contrib/_tensorboard_vis.py
@@ -0,0 +1,142 @@
+import time
+from collections import defaultdict
+from functools import partial
+from typing import DefaultDict
+
+import torch
+
+
+# Unfortunately, there doesn't seem to be any way to get TensorBoard to do
+# anything without having TF installed, so this file has a hard dependency on it
+# as well. Since this is only a debugging tool, that is acceptable.
+try:
+    from tensorflow.core.util import event_pb2
+    from tensorflow.core.framework import graph_pb2
+    from tensorflow.python.summary.writer.writer import FileWriter
+except ImportError:
+    raise ImportError("TensorBoard visualization of GraphExecutors requires having "
+                      "TensorFlow installed") from None
+
+
+def dump_tensorboard_summary(graph_executor, logdir):
+    with FileWriter(logdir) as w:
+        pb_graph = visualize(graph_executor)
+        evt = event_pb2.Event(wall_time=time.time(), graph_def=pb_graph.SerializeToString())
+        w.add_event(evt)
+
+
+def visualize(graph, name_prefix='', pb_graph=None, executors_it=None):
+    """Visualizes an independent graph, or a graph executor."""
+    value_map = {}
+    pb_graph = pb_graph or graph_pb2.GraphDef()
+
+    if isinstance(graph, torch._C.GraphExecutorState):
+        visualize_graph_executor(graph, name_prefix, pb_graph,
+                                 partial(visualize, pb_graph=pb_graph))
+        return pb_graph
+
+    # Set up an input node
+    input_node = pb_graph.node.add(op='input', name=name_prefix + 'input')
+    for i, value in enumerate(graph.param_node().outputs()):
+        value_map[value.unique()] = name_prefix + 'input:' + str(i)
+
+    visualize_rec(graph, value_map, name_prefix, pb_graph, executors_it)
+
+    # Gather all outputs
+    return_node = pb_graph.node.add(op='output', name=name_prefix + 'output')
+    for value in graph.return_node().inputs():
+        return_node.input.append(value_map[value.unique()])
+
+    return pb_graph
+
+
+def visualize_graph_executor(state, name_prefix, pb_graph, inline_graph):
+    """Append the state of a given GraphExecutor to the graph protobuf.
+
+    Args:
+        state (GraphExecutor or GraphExecutorState): GraphExecutor to display.
+        name_prefix (str): Name prefix of the containing subgraph.
+        pb_graph (GraphDef): graph to append to.
+        inline_graph (Callable): a function that handles setting up a value_map,
+            so that some graphs in here can be inlined. This is necessary, because
+            this will simply be `visualize` for the top-level GraphExecutor,
+            or `inline_graph` for all nested ones.
+
+            The signature should look like (Graph, name_prefix) -> ().
+            It will be called exactly once.
+
+    The strategy is to embed all different configurations as independent subgraphs,
+    while inlining the original graph as the one that actually produces the values.
+    """
+    if state.autograd_fallback_graph is not None:
+        visualize(graph=state.autograd_fallback_graph,
+                  name_prefix=name_prefix + 'autograd_fallback/',
+                  pb_graph=pb_graph,
+                  executors_it=iter(state.autograd_fallback.executors()))
+
+    for i, (arg_spec, plan) in enumerate(state.execution_plans.items()):
+        subgraph_name = name_prefix + f'plan{i}/'
+
+        # Create a disconnected node that will keep information regarding the input
+        # types of this trace. This is unfortunately a bit too verbose to be included
+        # in the subgraph name.
+        input_kinds = pb_graph.node.add(op='INPUT_KIND', name=subgraph_name)
+        input_kinds.attr['inputs'].s = repr(arg_spec).encode('ascii')
+
+        visualize(plan.graph, subgraph_name, pb_graph, iter(plan.code.executors()))
+
+        # Show gradient as an independent subgraph of this plan
+        if plan.grad_executor is not None:
+            grad_subgraph_name = subgraph_name + 'grad/'
+            visualize(plan.grad_executor, grad_subgraph_name, pb_graph)
+
+    return inline_graph(state.graph, name_prefix + 'original/')
+
+
+def visualize_rec(graph, value_map, name_prefix, pb_graph, executors_it=None):
+    """Recursive part of visualize (basically skips setting up the input and output nodes)."""
+    def inline_graph(subgraph, name, node):
+        rec_value_map = {inp.unique(): value_map[val.unique()]
+                         for inp, val in zip(subgraph.inputs(), node.inputs())}
+        visualize_rec(graph=subgraph,
+                      value_map=rec_value_map,
+                      name_prefix=name,
+                      pb_graph=pb_graph)
+        for out, val in zip(subgraph.outputs(), node.outputs()):
+            value_map[val.unique()] = rec_value_map[out.unique()]
+
+    op_id_counter: DefaultDict[str, int] = defaultdict(int)
+
+    def name_for(node):
+        kind = node.kind()[node.kind().index('::') + 2:]
+        op_id_counter[kind] += 1
+        return kind, name_prefix + kind + '_' + str(op_id_counter[kind])
+
+    def add_fusion_group(node):
+        op, name = name_for(node)
+        inline_graph(node.g('Subgraph'), name + '/', node)
+
+    def add_graph_executor(node):
+        op, name = name_for(node)
+        if executors_it is None:
+            add_node(node)
+        else:
+            ge = next(executors_it)
+            visualize_graph_executor(ge, name + '/', pb_graph,
+                                     partial(inline_graph, node=node))
+
+    def add_node(node):
+        if node.kind() == 'prim::FusionGroup':
+            return add_fusion_group(node)
+        elif node.kind() == 'prim::GraphExecutor':
+            return add_graph_executor(node)
+        op, name = name_for(node)
+        pb_node = pb_graph.node.add(op=op, name=name)
+        for value in node.inputs():
+            pb_node.input.append(value_map[value.unique()])
+        # TODO: handle attrs
+        for i, value in enumerate(node.outputs()):
+            value_map[value.unique()] = name + ':' + str(i)
+
+    for node in graph.nodes():
+        add_node(node)
diff --git a/MLPY/Lib/site-packages/torch/cpu/__init__.py b/MLPY/Lib/site-packages/torch/cpu/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..02bf9e7b7eb5f8e9a7e9440c70e9bbeec1ebc3ab
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/cpu/__init__.py
@@ -0,0 +1,157 @@
+r"""
+This package implements abstractions found in ``torch.cuda``
+to facilitate writing device-agnostic code.
+"""
+
+from contextlib import AbstractContextManager
+from typing import Any, Optional, Union
+
+import torch
+
+from .. import device as _device
+from . import amp
+
+__all__ = [
+    "is_available",
+    "synchronize",
+    "current_device",
+    "current_stream",
+    "stream",
+    "set_device",
+    "device_count",
+    "Stream",
+    "StreamContext",
+    "Event",
+]
+
+_device_t = Union[_device, str, int, None]
+
+
+def _is_cpu_support_vnni() -> bool:
+    r"""Returns a bool indicating if CPU supports VNNI."""
+    return torch._C._cpu._is_cpu_support_vnni()
+
+
+def is_available() -> bool:
+    r"""Returns a bool indicating if CPU is currently available.
+
+    N.B. This function only exists to facilitate device-agnostic code
+
+    """
+    return True
+
+
+def synchronize(device: _device_t = None) -> None:
+    r"""Waits for all kernels in all streams on the CPU device to complete.
+
+    Args:
+        device (torch.device or int, optional): ignored, there's only one CPU device.
+
+    N.B. This function only exists to facilitate device-agnostic code.
+    """
+    pass
+
+
+class Stream:
+    """
+    N.B. This class only exists to facilitate device-agnostic code
+    """
+
+    def __init__(self, priority: int = -1):
+        pass
+
+    def wait_stream(self, stream) -> None:
+        pass
+
+
+class Event:
+    def query(self) -> bool:
+        return True
+
+    def record(self, stream=None):
+        pass
+
+    def synchronize(self):
+        pass
+
+    def wait(self, stream=None):
+        pass
+
+
+_default_cpu_stream = Stream()
+_current_stream = _default_cpu_stream
+
+
+def current_stream(device: _device_t = None) -> Stream:
+    r"""Returns the currently selected :class:`Stream` for a given device.
+
+    Args:
+        device (torch.device or int, optional): Ignored.
+
+    N.B. This function only exists to facilitate device-agnostic code
+
+    """
+    return _current_stream
+
+
+class StreamContext(AbstractContextManager):
+    r"""Context-manager that selects a given stream.
+
+    N.B. This class only exists to facilitate device-agnostic code
+
+    """
+    cur_stream: Optional[Stream]
+
+    def __init__(self, stream):
+        self.stream = stream
+        self.prev_stream = _default_cpu_stream
+
+    def __enter__(self):
+        cur_stream = self.stream
+        if cur_stream is None:
+            return
+
+        global _current_stream
+        self.prev_stream = _current_stream
+        _current_stream = cur_stream
+
+    def __exit__(self, type: Any, value: Any, traceback: Any):
+        cur_stream = self.stream
+        if cur_stream is None:
+            return
+
+        global _current_stream
+        _current_stream = self.prev_stream
+
+
+def stream(stream: Stream) -> AbstractContextManager:
+    r"""Wrapper around the Context-manager StreamContext that
+    selects a given stream.
+
+    N.B. This function only exists to facilitate device-agnostic code
+    """
+    return StreamContext(stream)
+
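+# A small usage sketch: on CPU these stream objects are no-ops, but they let
+# code written against the ``torch.cuda``-style stream API run unchanged here.
+#
+#     s = torch.cpu.Stream()
+#     with torch.cpu.stream(s):
+#         y = torch.ones(2) * 2
+#     torch.cpu.synchronize()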
+
+def device_count() -> int:
+    r"""Returns number of CPU devices (not cores). Always 1.
+
+    N.B. This function only exists to facilitate device-agnostic code
+    """
+    return 1
+
+
+def set_device(device: _device_t) -> None:
+    r"""Sets the current device, in CPU we do nothing.
+
+    N.B. This function only exists to facilitate device-agnostic code
+    """
+    pass
+
+
+def current_device() -> str:
+    r"""Returns current device for cpu. Always 'cpu'.
+
+    N.B. This function only exists to facilitate device-agnostic code
+    """
+    return "cpu"
diff --git a/MLPY/Lib/site-packages/torch/cpu/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/cpu/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..774468c28dd9106645502203c00db2ee3a944f8a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/cpu/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/cpu/amp/__init__.py b/MLPY/Lib/site-packages/torch/cpu/amp/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..973717d653ba9c47a5b63be6aa699dfe0d25b58f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/cpu/amp/__init__.py
@@ -0,0 +1,2 @@
+from .autocast_mode import autocast
+from .grad_scaler import GradScaler
diff --git a/MLPY/Lib/site-packages/torch/cpu/amp/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/cpu/amp/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b88b8da0941e5b4e3b98d14f104c44f4d2a2ee1f
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/cpu/amp/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/cpu/amp/__pycache__/autocast_mode.cpython-39.pyc b/MLPY/Lib/site-packages/torch/cpu/amp/__pycache__/autocast_mode.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fa58d22a8bc16cb885d29fbe34eebca7c06fb6b1
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/cpu/amp/__pycache__/autocast_mode.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/cpu/amp/__pycache__/grad_scaler.cpython-39.pyc b/MLPY/Lib/site-packages/torch/cpu/amp/__pycache__/grad_scaler.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bc612338165d7539d56018a7375085ed5ecd146e
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/cpu/amp/__pycache__/grad_scaler.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/cpu/amp/autocast_mode.py b/MLPY/Lib/site-packages/torch/cpu/amp/autocast_mode.py
new file mode 100644
index 0000000000000000000000000000000000000000..03075f923746c3a7f625a50aea4ed1bb9eef6403
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/cpu/amp/autocast_mode.py
@@ -0,0 +1,43 @@
+from typing import Any
+
+import torch
+
+__all__ = ["autocast"]
+
+
+class autocast(torch.amp.autocast_mode.autocast):
+    r"""
+    See :class:`torch.autocast`.
+    ``torch.cpu.amp.autocast(args...)`` is equivalent to ``torch.autocast("cpu", args...)``
+    """
+
+    def __init__(
+        self,
+        enabled: bool = True,
+        dtype: torch.dtype = torch.bfloat16,
+        cache_enabled: bool = True,
+    ):
+        if torch._jit_internal.is_scripting():
+            self._enabled = enabled
+            self.device = "cpu"
+            self.fast_dtype = dtype
+            return
+        super().__init__(
+            "cpu", enabled=enabled, dtype=dtype, cache_enabled=cache_enabled
+        )
+
+    def __enter__(self):
+        if torch._jit_internal.is_scripting():
+            return self
+        return super().__enter__()
+
+    # TODO: discuss a unified TorchScript-friendly API for autocast
+    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any):  # type: ignore[override]
+        if torch._jit_internal.is_scripting():
+            return
+        return super().__exit__(exc_type, exc_val, exc_tb)
+
+    def __call__(self, func):
+        if torch._jit_internal.is_scripting():
+            return func
+        return super().__call__(func)
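+
+
+# A minimal usage sketch (``model`` and ``inp`` are illustrative); per the class
+# docstring this is equivalent to ``torch.autocast("cpu", dtype=torch.bfloat16)``:
+#
+#     with autocast(dtype=torch.bfloat16):
+#         out = model(inp)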
diff --git a/MLPY/Lib/site-packages/torch/cpu/amp/grad_scaler.py b/MLPY/Lib/site-packages/torch/cpu/amp/grad_scaler.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5d935371df1f66ee93675c259a51361d8caa903
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/cpu/amp/grad_scaler.py
@@ -0,0 +1,27 @@
+import torch
+
+__all__ = ["GradScaler"]
+
+
+class GradScaler(torch.amp.GradScaler):
+    r"""
+    See :class:`torch.amp.GradScaler`.
+    ``torch.cpu.amp.GradScaler(args...)`` is equivalent to ``torch.amp.GradScaler("cpu", args...)``
+    """
+
+    def __init__(
+        self,
+        init_scale: float = 2.0**16,
+        growth_factor: float = 2.0,
+        backoff_factor: float = 0.5,
+        growth_interval: int = 2000,
+        enabled: bool = True,
+    ) -> None:
+        super().__init__(
+            "cpu",
+            init_scale=init_scale,
+            growth_factor=growth_factor,
+            backoff_factor=backoff_factor,
+            growth_interval=growth_interval,
+            enabled=enabled,
+        )
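+
+
+# A minimal training-step sketch (``model``, ``optimizer``, ``loss_fn`` and the
+# tensors are illustrative); scale()/step()/update() come from the
+# torch.amp.GradScaler base class:
+#
+#     scaler = GradScaler()
+#     with torch.autocast("cpu", dtype=torch.bfloat16):
+#         loss = loss_fn(model(inp), target)
+#     scaler.scale(loss).backward()
+#     scaler.step(optimizer)
+#     scaler.update()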
diff --git a/MLPY/Lib/site-packages/torch/cuda/__init__.py b/MLPY/Lib/site-packages/torch/cuda/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d49f1a4c4417ad0856bcf5d6a99c71a6afbf0363
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/cuda/__init__.py
@@ -0,0 +1,1412 @@
+r"""
+This package adds support for CUDA tensor types.
+
+It implements the same functions as CPU tensors, but they utilize
+GPUs for computation.
+
+It is lazily initialized, so you can always import it, and use
+:func:`is_available()` to determine if your system supports CUDA.
+
+:ref:`cuda-semantics` has more details about working with CUDA.
+"""
+
+
+import contextlib
+import importlib
+import os
+import sys
+import threading
+import traceback
+import warnings
+from functools import lru_cache
+from typing import Any, Callable, cast, List, Optional, Tuple, Union
+
+import torch
+import torch._C
+from torch.types import Device
+from .. import device as _device
+from .._utils import _dummy_type, _LazySeedTracker, classproperty
+from ._utils import _get_device_index
+from .graphs import (
+    CUDAGraph,
+    graph,
+    graph_pool_handle,
+    is_current_stream_capturing,
+    make_graphed_callables,
+)
+from .streams import Event, ExternalStream, Stream
+
+try:
+    from torch._C import _cudart  # type: ignore[attr-defined]
+except ImportError:
+    _cudart = None
+
+_initialized = False
+_tls = threading.local()
+_initialization_lock = threading.Lock()
+_queued_calls: List[
+    Tuple[Callable[[], None], List[str]]
+] = []  # don't invoke these until initialization occurs
+_is_in_bad_fork = getattr(torch._C, "_cuda_isInBadFork", lambda: False)
+_device_t = Union[_device, str, int, None]
+
+_HAS_PYNVML = False
+_PYNVML_ERR = None
+try:
+    import pynvml  # type: ignore[import]
+
+    _HAS_PYNVML = True
+except ImportError as err:
+    _PYNVML_ERR = err  # sometimes a lib is installed but the import fails for some other reason, so we log the error for later
+
+_lazy_seed_tracker = _LazySeedTracker()
+
+# Define dummy _CudaDeviceProperties type if PyTorch was compiled without CUDA
+if hasattr(torch._C, "_CudaDeviceProperties"):
+    _CudaDeviceProperties = torch._C._CudaDeviceProperties
+else:
+    _CudaDeviceProperties = _dummy_type("_CudaDeviceProperties")  # type: ignore[assignment, misc]
+
+if hasattr(torch._C, "_cuda_exchangeDevice"):
+    _exchange_device = torch._C._cuda_exchangeDevice
+else:
+
+    def _exchange_device(device: int) -> int:
+        if device < 0:
+            return -1
+        raise RuntimeError("PyTorch was compiled without CUDA support")
+
+
+if hasattr(torch._C, "_cuda_maybeExchangeDevice"):
+    _maybe_exchange_device = torch._C._cuda_maybeExchangeDevice
+else:
+
+    def _maybe_exchange_device(device: int) -> int:
+        if device < 0:
+            return -1
+        raise RuntimeError("PyTorch was compiled without CUDA support")
+
+
+has_half: bool = True
+has_magma: bool = torch._C._has_magma
+
+default_generators: Tuple[torch._C.Generator] = ()  # type: ignore[assignment]
+
+
+def _is_compiled() -> bool:
+    r"""Return true if compile with CUDA support."""
+    return hasattr(torch._C, "_cuda_getDeviceCount")
+
+
+def _nvml_based_avail() -> bool:
+    return os.getenv("PYTORCH_NVML_BASED_CUDA_CHECK") == "1"
+
+
+def is_available() -> bool:
+    r"""Return a bool indicating if CUDA is currently available."""
+    if not _is_compiled():
+        return False
+    if _nvml_based_avail():
+        # The user has set an env variable to request this availability check that attempts to avoid fork poisoning by
+        # using NVML at the cost of a weaker CUDA availability assessment. Note that if NVML discovery/initialization
+        # fails, this assessment falls back to the default CUDA Runtime API assessment (`cudaGetDeviceCount`)
+        return device_count() > 0
+    else:
+        # The default availability inspection never throws and returns 0 if the driver is missing or can't
+        # be initialized. This uses the CUDA Runtime API `cudaGetDeviceCount` which in turn initializes the CUDA Driver
+        # API via `cuInit`
+        return torch._C._cuda_getDeviceCount() > 0
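+
+
+# A minimal usage sketch for is_available(), assuming a script that should run on
+# both CUDA and CPU-only machines: pick the device once and pass it everywhere.
+#
+#     device = "cuda" if torch.cuda.is_available() else "cpu"
+#     x = torch.ones(2, 2, device=device)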
+
+
+def is_bf16_supported():
+    r"""Return a bool indicating if the current CUDA/ROCm device supports dtype bfloat16."""
+    # Check for ROCm first: if true, return True. No ROCM_VERSION check is
+    # required, since bfloat16 is supported on AMD GPU archs.
+    if torch.version.hip:
+        return True
+
+    device = torch.cuda.current_device()
+
+    # Check for CUDA version and device compute capability.
+    # This is a fast way to check for it.
+    cuda_version = torch.version.cuda
+    if (
+        cuda_version is not None
+        and int(cuda_version.split(".")[0]) >= 11
+        and torch.cuda.get_device_properties(device).major >= 8
+    ):
+        return True
+
+    # Finally, try to create a bfloat16 tensor on the device.
+    return _check_bf16_tensor_supported(device)
+
+
+@lru_cache(maxsize=16)
+def _check_bf16_tensor_supported(device: _device_t):
+    try:
+        torch.tensor([1.0], dtype=torch.bfloat16, device=device)
+        return True
+    except Exception:
+        return False
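+
+
+# A short sketch of how is_bf16_supported() is typically consumed: fall back to
+# float16 on devices without bfloat16 support.  `model` and `inputs` are
+# placeholders; assumes CUDA is available.
+#
+#     amp_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
+#     with torch.autocast("cuda", dtype=amp_dtype):
+#         out = model(inputs)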
+
+
+def _sleep(cycles):
+    torch._C._cuda_sleep(cycles)
+
+
+def _check_capability():
+    incorrect_binary_warn = """
+    Found GPU%d %s which requires CUDA_VERSION >= %d to
+     work properly, but your PyTorch was compiled
+     with CUDA_VERSION %d. Please install the correct PyTorch binary
+     using instructions from https://pytorch.org
+    """
+
+    old_gpu_warn = """
+    Found GPU%d %s which is of cuda capability %d.%d.
+    PyTorch no longer supports this GPU because it is too old.
+    The minimum cuda capability supported by this library is %d.%d.
+    """
+
+    if torch.version.cuda is not None:  # on ROCm we don't want this check
+        CUDA_VERSION = torch._C._cuda_getCompiledVersion()
+        for d in range(device_count()):
+            capability = get_device_capability(d)
+            major = capability[0]
+            minor = capability[1]
+            name = get_device_name(d)
+            current_arch = major * 10 + minor
+            min_arch = min(
+                (int(arch.split("_")[1]) for arch in torch.cuda.get_arch_list()),
+                default=35,
+            )
+            if current_arch < min_arch:
+                warnings.warn(
+                    old_gpu_warn
+                    % (d, name, major, minor, min_arch // 10, min_arch % 10)
+                )
+
+
+def _check_cubins():
+    incompatible_device_warn = """
+{} with CUDA capability sm_{} is not compatible with the current PyTorch installation.
+The current PyTorch install supports CUDA capabilities {}.
+If you want to use the {} GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/
+"""
+    if torch.version.cuda is None:  # on ROCm we don't want this check
+        return
+    arch_list = get_arch_list()
+    if len(arch_list) == 0:
+        return
+    supported_sm = [int(arch.split("_")[1]) for arch in arch_list if "sm_" in arch]
+    for idx in range(device_count()):
+        cap_major, cap_minor = get_device_capability(idx)
+        # NVIDIA GPU compute architectures are backward compatible within the same major version
+        supported = any(sm // 10 == cap_major for sm in supported_sm)
+        if not supported:
+            device_name = get_device_name(idx)
+            capability = cap_major * 10 + cap_minor
+            warnings.warn(
+                incompatible_device_warn.format(
+                    device_name, capability, " ".join(arch_list), device_name
+                )
+            )
+
+
+def is_initialized():
+    r"""Return whether PyTorch's CUDA state has been initialized."""
+    return _initialized and not _is_in_bad_fork()
+
+
+def _lazy_call(callable, **kwargs):
+    if is_initialized():
+        callable()
+    else:
+        # TODO(torch_deploy): this accesses linecache, which attempts to read the
+        # file system to get traceback info. Patch linecache or do something
+        # else here if this ends up being important.
+        global _lazy_seed_tracker
+        if kwargs.get("seed_all", False):
+            _lazy_seed_tracker.queue_seed_all(callable, traceback.format_stack())
+        elif kwargs.get("seed", False):
+            _lazy_seed_tracker.queue_seed(callable, traceback.format_stack())
+        else:
+            # Don't store the actual traceback to avoid memory cycle
+            _queued_calls.append((callable, traceback.format_stack()))
+
+
+_lazy_call(_check_capability)
+_lazy_call(_check_cubins)
+
+
+class DeferredCudaCallError(Exception):
+    pass
+
+
+OutOfMemoryError = torch._C._OutOfMemoryError
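+
+# A hedged sketch of catching allocation failures via the OutOfMemoryError alias
+# above; retrying with a smaller batch is just one possible recovery strategy, and
+# `batch` is a placeholder.
+#
+#     try:
+#         y = torch.empty(batch, 4096, 4096, device="cuda")
+#     except torch.cuda.OutOfMemoryError:
+#         torch.cuda.empty_cache()
+#         batch //= 2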
+
+
+def init():
+    r"""Initialize PyTorch's CUDA state.
+
+    You may need to call this explicitly if you are interacting with
+    PyTorch via its C API, as Python bindings for CUDA functionality
+    will not be available until this initialization takes place.
+    Ordinary users should not need this, as all of PyTorch's CUDA methods
+    automatically initialize CUDA state on-demand.
+
+    Does nothing if the CUDA state is already initialized.
+    """
+    _lazy_init()
+
+
+def _lazy_init():
+    global _initialized, _queued_calls
+    if is_initialized() or hasattr(_tls, "is_initializing"):
+        return
+    with _initialization_lock:
+        # This is double-checked locking.  This is OK because
+        # the above test was GIL protected anyway.  The inner test
+        # is for when a thread blocked on some other thread which was
+        # doing the initialization; when they get the lock, they will
+        # find there is nothing left to do.
+        if is_initialized():
+            return
+        # It is important to prevent other threads from entering _lazy_init
+        # immediately, while we are still guaranteed to have the GIL, because some
+        # of the C calls we make below will release the GIL
+        if _is_in_bad_fork():
+            raise RuntimeError(
+                "Cannot re-initialize CUDA in forked subprocess. To use CUDA with "
+                "multiprocessing, you must use the 'spawn' start method"
+            )
+        if not hasattr(torch._C, "_cuda_getDeviceCount"):
+            raise AssertionError("Torch not compiled with CUDA enabled")
+        if _cudart is None:
+            raise AssertionError(
+                "libcudart functions unavailable. It looks like you have a broken build?"
+            )
+        # This function throws if there's a driver initialization error, no GPUs
+        # are found or any other error occurs
+        if "CUDA_MODULE_LOADING" not in os.environ:
+            os.environ["CUDA_MODULE_LOADING"] = "LAZY"
+        torch._C._cuda_init()
+        # Some of the queued calls may reentrantly call _lazy_init();
+        # we need to just return without initializing in that case.
+        # However, we must not let any *other* threads in!
+        _tls.is_initializing = True
+
+        for calls in _lazy_seed_tracker.get_calls():
+            if calls:
+                _queued_calls.append(calls)
+
+        try:
+            for queued_call, orig_traceback in _queued_calls:
+                try:
+                    queued_call()
+                except Exception as e:
+                    msg = (
+                        f"CUDA call failed lazily at initialization with error: {str(e)}\n\n"
+                        f"CUDA call was originally invoked at:\n\n{''.join(orig_traceback)}"
+                    )
+                    raise DeferredCudaCallError(msg) from e
+        finally:
+            delattr(_tls, "is_initializing")
+        _initialized = True
+
+
+def cudart():
+    _lazy_init()
+    return _cudart
+
+
+class cudaStatus:
+    SUCCESS: int = 0
+    ERROR_NOT_READY: int = 34
+
+
+class CudaError(RuntimeError):
+    def __init__(self, code: int) -> None:
+        msg = _cudart.cudaGetErrorString(_cudart.cudaError(code))
+        super().__init__(f"{msg} ({code})")
+
+
+def check_error(res: int) -> None:
+    if res != _cudart.cudaError.success:
+        raise CudaError(res)
+
+
+class _DeviceGuard:
+    def __init__(self, index: int):
+        self.idx = index
+        self.prev_idx = -1
+
+    def __enter__(self):
+        self.prev_idx = torch.cuda._exchange_device(self.idx)
+
+    def __exit__(self, type: Any, value: Any, traceback: Any):
+        self.idx = torch.cuda._maybe_exchange_device(self.prev_idx)
+        return False
+
+
+class device:
+    r"""Context-manager that changes the selected device.
+
+    Args:
+        device (torch.device or int): device index to select. It's a no-op if
+            this argument is a negative integer or ``None``.
+    """
+
+    def __init__(self, device: Any):
+        self.idx = _get_device_index(device, optional=True)
+        self.prev_idx = -1
+
+    def __enter__(self):
+        self.prev_idx = torch.cuda._exchange_device(self.idx)
+
+    def __exit__(self, type: Any, value: Any, traceback: Any):
+        self.idx = torch.cuda._maybe_exchange_device(self.prev_idx)
+        return False
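+
+
+# A minimal sketch of the `device` context manager above (assumes at least two
+# GPUs): allocations inside the block land on the selected device, and the
+# previous device is restored on exit.
+#
+#     with torch.cuda.device(1):
+#         a = torch.randn(3, device="cuda")   # allocated on GPU 1
+#     b = torch.randn(3, device="cuda")       # back on the previous device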
+
+
+class device_of(device):
+    r"""Context-manager that changes the current device to that of given object.
+
+    You can use both tensors and storages as arguments. If a given object is
+    not allocated on a GPU, this is a no-op.
+
+    Args:
+        obj (Tensor or Storage): object allocated on the selected device.
+    """
+
+    def __init__(self, obj):
+        idx = obj.get_device() if obj.is_cuda else -1
+        super().__init__(idx)
+
+
+def set_device(device: _device_t) -> None:
+    r"""Set the current device.
+
+    Usage of this function is discouraged in favor of :any:`device`. In most
+    cases it's better to use the ``CUDA_VISIBLE_DEVICES`` environment variable.
+
+    Args:
+        device (torch.device or int): selected device. This function is a no-op
+            if this argument is negative.
+    """
+    device = _get_device_index(device)
+    if device >= 0:
+        torch._C._cuda_setDevice(device)
+
+
+def get_device_name(device: Optional[_device_t] = None) -> str:
+    r"""Get the name of a device.
+
+    Args:
+        device (torch.device or int, optional): device for which to return the
+            name. This function is a no-op if this argument is a negative
+            integer. It uses the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    Returns:
+        str: the name of the device
+    """
+    return get_device_properties(device).name
+
+
+def get_device_capability(device: Optional[_device_t] = None) -> Tuple[int, int]:
+    r"""Get the cuda capability of a device.
+
+    Args:
+        device (torch.device or int, optional): device for which to return the
+            device capability. This function is a no-op if this argument is
+            a negative integer. It uses the current device, given by
+            :func:`~torch.cuda.current_device`, if :attr:`device` is ``None``
+            (default).
+
+    Returns:
+        tuple(int, int): the major and minor CUDA capability of the device
+    """
+    prop = get_device_properties(device)
+    return prop.major, prop.minor
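+
+
+# A short sketch using the capability tuple from get_device_capability(), e.g. to
+# gate features that need an Ampere-or-newer GPU (compute capability 8.0+);
+# assumes CUDA is available.
+#
+#     major, minor = torch.cuda.get_device_capability()
+#     if (major, minor) >= (8, 0):
+#         torch.backends.cuda.matmul.allow_tf32 = True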
+
+
+def get_device_properties(device: _device_t) -> _CudaDeviceProperties:
+    r"""Get the properties of a device.
+
+    Args:
+        device (torch.device or int or str): device for which to return the
+            properties.
+
+    Returns:
+        _CudaDeviceProperties: the properties of the device
+    """
+    _lazy_init()  # will define _get_device_properties
+    device = _get_device_index(device, optional=True)
+    if device < 0 or device >= device_count():
+        raise AssertionError("Invalid device id")
+    return _get_device_properties(device)  # type: ignore[name-defined]
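+
+
+# A brief sketch reading a few fields off the properties struct returned by
+# get_device_properties() (assumes CUDA is available); `total_memory` is in bytes.
+#
+#     props = torch.cuda.get_device_properties(0)
+#     print(props.name, f"sm_{props.major}{props.minor}",
+#           props.total_memory // 2**20, "MiB")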
+
+
+def can_device_access_peer(device: _device_t, peer_device: _device_t) -> bool:
+    r"""Check if peer access between two devices is possible."""
+    _lazy_init()
+    device = _get_device_index(device, optional=True)
+    peer_device = _get_device_index(peer_device)
+    if device < 0 or device >= device_count():
+        raise AssertionError("Invalid device id")
+    if peer_device < 0 or peer_device >= device_count():
+        raise AssertionError("Invalid peer device id")
+    return torch._C._cuda_canDeviceAccessPeer(device, peer_device)
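+
+
+# A sketch of checking peer access before relying on direct GPU-to-GPU copies
+# (assumes two visible devices; `src` is a placeholder tensor already on cuda:0).
+#
+#     if torch.cuda.can_device_access_peer(0, 1):
+#         dst = src.to("cuda:1", non_blocking=True)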
+
+
+class StreamContext:
+    r"""Context-manager that selects a given stream.
+
+    All CUDA kernels queued within its context will be enqueued on a selected
+    stream.
+
+    Args:
+        Stream (Stream): selected stream. This manager is a no-op if it's
+            ``None``.
+    .. note:: Streams are per-device.
+    """
+    cur_stream: Optional["torch.cuda.Stream"]
+
+    def __init__(self, stream: Optional["torch.cuda.Stream"]):
+        self.stream = stream
+        self.idx = _get_device_index(None, True)
+        if not torch.jit.is_scripting():
+            if self.idx is None:
+                self.idx = -1
+
+        self.src_prev_stream = (
+            None if not torch.jit.is_scripting() else torch.cuda.default_stream(None)
+        )
+        self.dst_prev_stream = (
+            None if not torch.jit.is_scripting() else torch.cuda.default_stream(None)
+        )
+
+    def __enter__(self):
+        # Local cur_stream variable for type refinement
+        cur_stream = self.stream
+        # Return if stream is None or CUDA device not available
+        if cur_stream is None or self.idx == -1:
+            return
+        self.src_prev_stream = torch.cuda.current_stream(None)
+
+        # If the stream is not on the current device, then
+        # set the current stream on the device
+        if self.src_prev_stream.device != cur_stream.device:
+            with device(cur_stream.device):
+                self.dst_prev_stream = torch.cuda.current_stream(cur_stream.device)
+        torch.cuda.set_stream(cur_stream)
+
+    def __exit__(self, type: Any, value: Any, traceback: Any):
+        # Local cur_stream variable for type refinement
+        cur_stream = self.stream
+        # If stream is None or no CUDA device available, return
+        if cur_stream is None or self.idx == -1:
+            return
+
+        # Reset the stream on the original device
+        # and destination device
+        if self.src_prev_stream.device != cur_stream.device:  # type: ignore[union-attr]
+            torch.cuda.set_stream(self.dst_prev_stream)  # type: ignore[arg-type]
+        torch.cuda.set_stream(self.src_prev_stream)  # type: ignore[arg-type]
+
+
+def stream(stream: Optional["torch.cuda.Stream"]) -> StreamContext:
+    r"""Wrap around the Context-manager StreamContext that selects a given stream.
+
+    Arguments:
+        stream (Stream): selected stream. This manager is a no-op if it's
+            ``None``.
+    .. note:: In eager mode ``stream`` is of type ``Stream``, while in JIT it is
+        an object of the custom class ``torch.classes.cuda.Stream``.
+    """
+    return StreamContext(stream)
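+
+
+# A minimal sketch of overlapping work on a side stream via torch.cuda.stream();
+# `x` is a placeholder CUDA tensor, and CUDA is assumed to be available.  The
+# wait_stream call orders later default-stream work after the side-stream kernels.
+#
+#     s = torch.cuda.Stream()
+#     with torch.cuda.stream(s):
+#         y = x * 2                               # queued on stream `s`
+#     torch.cuda.current_stream().wait_stream(s)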
+
+
+def _set_stream_by_id(stream_id, device_index, device_type):
+    r"""set stream specified by the stream id, device index and
+        device type
+
+    Args: stream_id (int): stream id in stream pool
+          device_index (int): device index in topo
+          device_type (int): enum device type
+    """
+    torch._C._cuda_setStream(
+        stream_id=stream_id,
+        device_index=device_index,
+        device_type=device_type,
+    )
+
+
+def set_stream(stream: Stream):
+    r"""Set the current stream.This is a wrapper API to set the stream.
+        Usage of this function is discouraged in favor of the ``stream``
+        context manager.
+
+    Args:
+        stream (Stream): selected stream. This function is a no-op
+            if this argument is ``None``.
+    """
+    if stream is None:
+        return
+    _set_stream_by_id(
+        stream_id=stream.stream_id,
+        device_index=stream.device_index,
+        device_type=stream.device_type,
+    )
+
+
+def _parse_visible_devices() -> Union[List[int], List[str]]:
+    r"""Parse CUDA_VISIBLE_DEVICES environment variable."""
+    var = os.getenv("CUDA_VISIBLE_DEVICES")
+    if var is None:
+        return list(range(64))
+
+    def _strtoul(s: str) -> int:
+        """Return -1 or positive integer sequence string starts with."""
+        if not s:
+            return -1
+        for idx, c in enumerate(s):
+            if not (c.isdigit() or (idx == 0 and c in "+-")):
+                break
+            if idx + 1 == len(s):
+                idx += 1
+        return int(s[:idx]) if idx > 0 else -1
+
+    def parse_list_with_prefix(lst: str, prefix: str) -> List[str]:
+        rcs: List[str] = []
+        for elem in lst.split(","):
+            # Repeated id results in empty set
+            if elem in rcs:
+                return cast(List[str], [])
+            # Anything that does not start with the prefix stops the parsing
+            if not elem.startswith(prefix):
+                break
+            rcs.append(elem)
+        return rcs
+
+    if var.startswith("GPU-"):
+        return parse_list_with_prefix(var, "GPU-")
+    if var.startswith("MIG-"):
+        return parse_list_with_prefix(var, "MIG-")
+    # CUDA_VISIBLE_DEVICES uses something like strtoul,
+    # which makes `1gpu2,2ampere` equivalent to `1,2`
+    rc: List[int] = []
+    for elem in var.split(","):
+        x = _strtoul(elem.strip())
+        # Repeated ordinal results in empty set
+        if x in rc:
+            return cast(List[int], [])
+        # Negative value aborts the sequence
+        if x < 0:
+            break
+        rc.append(x)
+    return rc
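+
+
+# Illustrative results of _parse_visible_devices() for a few CUDA_VISIBLE_DEVICES
+# values, derived from the parsing rules above:
+#
+#     "0,2"           -> [0, 2]
+#     "1gpu2,2ampere" -> [1, 2]            # strtoul-like parsing keeps leading digits
+#     "0,2,-1,3"      -> [0, 2]            # a negative value aborts the sequence
+#     "0,0"           -> []                # repeated ordinals yield an empty set
+#     "GPU-deadbeef"  -> ["GPU-deadbeef"]  # UUID-prefix form is kept as strings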
+
+
+def _raw_device_count_nvml() -> int:
+    r"""Return number of devices as reported by NVML or negative value if NVML discovery/initialization failed."""
+    from ctypes import byref, c_int, CDLL
+
+    nvml_h = CDLL("libnvidia-ml.so.1")
+    rc = nvml_h.nvmlInit()
+    if rc != 0:
+        warnings.warn("Can't initialize NVML")
+        return -1
+    dev_count = c_int(-1)
+    rc = nvml_h.nvmlDeviceGetCount_v2(byref(dev_count))
+    if rc != 0:
+        warnings.warn("Can't get nvml device count")
+        return -1
+    del nvml_h
+    return dev_count.value
+
+
+def _raw_device_uuid_nvml() -> Optional[List[str]]:
+    r"""Return list of device UUID as reported by NVML or None if NVM discovery/initialization failed."""
+    from ctypes import byref, c_int, c_void_p, CDLL, create_string_buffer
+
+    nvml_h = CDLL("libnvidia-ml.so.1")
+    rc = nvml_h.nvmlInit()
+    if rc != 0:
+        warnings.warn("Can't initialize NVML")
+        return None
+    dev_count = c_int(-1)
+    rc = nvml_h.nvmlDeviceGetCount_v2(byref(dev_count))
+    if rc != 0:
+        warnings.warn("Can't get nvml device count")
+        return None
+    uuids: List[str] = []
+    for idx in range(dev_count.value):
+        dev_id = c_void_p()
+        rc = nvml_h.nvmlDeviceGetHandleByIndex_v2(idx, byref(dev_id))
+        if rc != 0:
+            warnings.warn("Can't get device handle")
+            return None
+        buf_len = 96
+        buf = create_string_buffer(buf_len)
+        rc = nvml_h.nvmlDeviceGetUUID(dev_id, buf, buf_len)
+        if rc != 0:
+            warnings.warn("Can't get device UUID")
+            return None
+        uuids.append(buf.raw.decode("ascii").strip("\0"))
+    del nvml_h
+    return uuids
+
+
+def _transform_uuid_to_ordinals(candidates: List[str], uuids: List[str]) -> List[int]:
+    r"""Given the set of partial uuids and list of known uuids builds a set of ordinals excluding ambiguous partials IDs."""
+
+    def uuid_to_ordinal(candidate: str, uuids: List[str]) -> int:
+        best_match = -1
+        for idx, uuid in enumerate(uuids):
+            if not uuid.startswith(candidate):
+                continue
+            # Ambiguous candidate
+            if best_match != -1:
+                return -1
+            best_match = idx
+        return best_match
+
+    rc: List[int] = []
+    for candidate in candidates:
+        idx = uuid_to_ordinal(candidate, uuids)
+        # First invalid ordinal stops parsing
+        if idx < 0:
+            break
+        # Duplicates result in empty set
+        if idx in rc:
+            return cast(List[int], [])
+        rc.append(idx)
+    return rc
+
+
+def _device_count_nvml() -> int:
+    r"""Return number of devices as reported by NVML taking CUDA_VISIBLE_DEVICES into account.
+
+    Negative value is returned if NVML discovery or initialization has failed.
+    """
+    visible_devices = _parse_visible_devices()
+    if not visible_devices:
+        return 0
+    try:
+        if type(visible_devices[0]) is str:
+            # Skip MIG parsing
+            if visible_devices[0].startswith("MIG-"):
+                return -1
+            uuids = _raw_device_uuid_nvml()
+            if uuids is None:
+                return -1
+            visible_devices = _transform_uuid_to_ordinals(
+                cast(List[str], visible_devices), uuids
+            )
+        else:
+            raw_cnt = _raw_device_count_nvml()
+            if raw_cnt <= 0:
+                return raw_cnt
+            # Trim the list up to a maximum available device
+            for idx, val in enumerate(visible_devices):
+                if cast(int, val) >= raw_cnt:
+                    return idx
+    except OSError:
+        return -1
+    except AttributeError:
+        return -1
+    return len(visible_devices)
+
+
+def _get_nvml_device_index(device: Optional[Union[int, Device]]) -> int:
+    r"""Return the NVML index of the device, taking CUDA_VISIBLE_DEVICES into account."""
+    idx = _get_device_index(device, optional=True)
+    visible_devices = _parse_visible_devices()
+    if type(visible_devices[0]) is str:
+        uuids = _raw_device_uuid_nvml()
+        if uuids is None:
+            raise RuntimeError("Can't get device UUIDs")
+        visible_devices = _transform_uuid_to_ordinals(
+            cast(List[str], visible_devices), uuids
+        )
+    visible_devices = cast(List[int], visible_devices)
+    if idx < 0 or idx >= len(visible_devices):
+        raise RuntimeError(
+            f"device {idx} is not visible (CUDA_VISIBLE_DEVICES={visible_devices})"
+        )
+    return visible_devices[idx]
+
+
+@lru_cache(maxsize=1)
+def device_count() -> int:
+    r"""Return the number of GPUs available."""
+    if not _is_compiled():
+        return 0
+    # bypass _device_count_nvml() if rocm (not supported)
+    nvml_count = -1 if torch.version.hip else _device_count_nvml()
+    return torch._C._cuda_getDeviceCount() if nvml_count < 0 else nvml_count
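+
+
+# A small sketch enumerating visible devices; device_count() is lru_cache'd, so it
+# is cheap to call repeatedly.
+#
+#     for i in range(torch.cuda.device_count()):
+#         print(i, torch.cuda.get_device_name(i))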
+
+
+def get_arch_list() -> List[str]:
+    r"""Return list CUDA architectures this library was compiled for."""
+    if not is_available():
+        return []
+    arch_flags = torch._C._cuda_getArchFlags()
+    if arch_flags is None:
+        return []
+    return arch_flags.split()
+
+
+def get_gencode_flags() -> str:
+    r"""Return NVCC gencode flags this library was compiled with."""
+    arch_list = get_arch_list()
+    if len(arch_list) == 0:
+        return ""
+    arch_list_ = [arch.split("_") for arch in arch_list]
+    return " ".join(
+        [
+            f"-gencode compute=compute_{arch},code={kind}_{arch}"
+            for (kind, arch) in arch_list_
+        ]
+    )
+
+
+def current_device() -> int:
+    r"""Return the index of a currently selected device."""
+    _lazy_init()
+    return torch._C._cuda_getDevice()
+
+
+def synchronize(device: _device_t = None) -> None:
+    r"""Wait for all kernels in all streams on a CUDA device to complete.
+
+    Args:
+        device (torch.device or int, optional): device for which to synchronize.
+            It uses the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+    """
+    _lazy_init()
+    with torch.cuda.device(device):
+        return torch._C._cuda_synchronize()
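+
+
+# A minimal timing sketch: kernel launches are asynchronous, so wall-clock
+# measurements need a synchronize() before reading the clock.  `x` is a
+# placeholder CUDA tensor.
+#
+#     import time
+#     torch.cuda.synchronize()
+#     t0 = time.perf_counter()
+#     y = x @ x
+#     torch.cuda.synchronize()
+#     print(time.perf_counter() - t0)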
+
+
+def ipc_collect():
+    r"""Force collects GPU memory after it has been released by CUDA IPC.
+
+    .. note::
+        Checks if any sent CUDA tensors could be cleaned from the memory. Force
+        closes shared memory file used for reference counting if there is no
+        active counters. Useful when the producer process stopped actively sending
+        tensors and want to release unused memory.
+    """
+    _lazy_init()
+    return torch._C._cuda_ipc_collect()
+
+
+def current_stream(device: Optional[_device_t] = None) -> Stream:
+    r"""Return the currently selected :class:`Stream` for a given device.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            the currently selected :class:`Stream` for the current device, given
+            by :func:`~torch.cuda.current_device`, if :attr:`device` is ``None``
+            (default).
+    """
+    _lazy_init()
+    streamdata = torch._C._cuda_getCurrentStream(
+        _get_device_index(device, optional=True)
+    )
+    return Stream(
+        stream_id=streamdata[0], device_index=streamdata[1], device_type=streamdata[2]
+    )
+
+
+def default_stream(device: Optional[_device_t] = None) -> Stream:
+    r"""Return the default :class:`Stream` for a given device.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            the default :class:`Stream` for the current device, given by
+            :func:`~torch.cuda.current_device`, if :attr:`device` is ``None``
+            (default).
+    """
+    _lazy_init()
+    streamdata = torch._C._cuda_getDefaultStream(
+        _get_device_index(device, optional=True)
+    )
+    return Stream(
+        stream_id=streamdata[0], device_index=streamdata[1], device_type=streamdata[2]
+    )
+
+
+def current_blas_handle():
+    r"""Return cublasHandle_t pointer to current cuBLAS handle"""
+    _lazy_init()
+    return torch._C._cuda_getCurrentBlasHandle()
+
+
+def set_sync_debug_mode(debug_mode: Union[int, str]) -> None:
+    r"""Set the debug mode for cuda synchronizing operations.
+
+    Args:
+        debug_mode(str or int): if "default" or 0, don't error or warn on synchronizing operations,
+            if "warn" or 1, warn on synchronizing operations, if "error" or 2, error out synchronizing operations.
+
+    Warning:
+        This is an experimental feature, and not all synchronizing operations will trigger a warning or an error. In
+        particular, operations in the torch.distributed and torch.sparse namespaces are not covered yet.
+    """
+    _lazy_init()
+    if isinstance(debug_mode, str):
+        if debug_mode == "default":
+            debug_mode = 0
+        elif debug_mode == "warn":
+            debug_mode = 1
+        elif debug_mode == "error":
+            debug_mode = 2
+        else:
+            raise RuntimeError(
+                "invalid value of debug_mode, expected one of `default`, `warn`, `error`"
+            )
+
+    torch._C._cuda_set_sync_debug_mode(debug_mode)
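+
+
+# A short sketch for hunting accidental host/device synchronizations with
+# set_sync_debug_mode(): "warn" makes synchronizing eager ops emit warnings.
+# `loss` is a placeholder CUDA scalar tensor.
+#
+#     torch.cuda.set_sync_debug_mode("warn")
+#     loss.item()                     # .item() synchronizes, so a warning is emitted
+#     torch.cuda.set_sync_debug_mode("default")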
+
+
+def get_sync_debug_mode() -> int:
+    r"""Return current value of debug mode for cuda synchronizing operations."""
+    _lazy_init()
+    return torch._C._cuda_get_sync_debug_mode()
+
+
+def _get_pynvml_handler(device: Optional[Union[Device, int]] = None):
+    if not _HAS_PYNVML:
+        raise ModuleNotFoundError(
+            "pynvml does not seem to be installed or it can't be imported."
+        ) from _PYNVML_ERR
+    from pynvml import NVMLError_DriverNotLoaded
+
+    try:
+        pynvml.nvmlInit()
+    except NVMLError_DriverNotLoaded as e:
+        raise RuntimeError("cuda driver can't be loaded, is cuda enabled?") from e
+
+    device = _get_nvml_device_index(device)
+    handle = pynvml.nvmlDeviceGetHandleByIndex(device)
+    return handle
+
+
+def memory_usage(device: Optional[Union[Device, int]] = None) -> int:
+    r"""Return the percent of time over the past sample period during which global (device)
+    memory was being read or written as given by `nvidia-smi`.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    Warning: Each sample period may be between 1 second and 1/6 second,
+    depending on the product being queried.
+    """
+    handle = _get_pynvml_handler()
+
+    device = _get_nvml_device_index(device)
+    handle = pynvml.nvmlDeviceGetHandleByIndex(device)
+    return pynvml.nvmlDeviceGetUtilizationRates(handle).memory
+
+
+def utilization(device: Optional[Union[Device, int]] = None) -> int:
+    r"""Return the percent of time over the past sample period during which one or
+    more kernels was executing on the GPU as given by `nvidia-smi`.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    Warning: Each sample period may be between 1 second and 1/6 second,
+    depending on the product being queried.
+    """
+    handle = _get_pynvml_handler(device)
+    device = _get_nvml_device_index(device)
+    handle = pynvml.nvmlDeviceGetHandleByIndex(device)
+    return pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
+
+
+def temperature(device: Optional[Union[Device, int]] = None) -> int:
+    r"""Return the average temperature of the GPU sensor in Degrees C (Centigrades).
+
+    The average temperature is computed based on past sample period as given by `nvidia-smi`.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    Warning: Each sample period may be between 1 second and 1/6 second,
+    depending on the product being queried.
+    """
+    handle = _get_pynvml_handler(device)
+    # 0 refers to the temperature sensor for the GPU die.
+    return pynvml.nvmlDeviceGetTemperature(handle, 0)
+
+
+def power_draw(device: Optional[Union[Device, int]] = None) -> int:
+    r"""Return the average power draw of the GPU sensor in mW (MilliWatts)
+        over the past sample period as given by `nvidia-smi` for Fermi or newer fully supported devices.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    Warning: Each sample period may be between 1 second and 1/6 second,
+    depending on the product being queried.
+    """
+    handle = _get_pynvml_handler(device)
+    return pynvml.nvmlDeviceGetPowerUsage(handle)
+
+
+def clock_rate(device: Optional[Union[Device, int]] = None) -> int:
+    r"""Return the clock speed of the GPU SM in Hz Hertz over the past sample period as given by `nvidia-smi`.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    Warning: Each sample period may be between 1 second and 1/6 second,
+    depending on the product being queried.
+    """
+    handle = _get_pynvml_handler(device)
+    return pynvml.nvmlDeviceGetClockInfo(handle, 1)
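+
+
+# A sketch polling the NVML-backed metrics above (assumes `pynvml` is installed
+# and an NVIDIA driver is loaded); each accepts an optional device argument.
+#
+#     print("util %  :", torch.cuda.utilization())
+#     print("mem %   :", torch.cuda.memory_usage())
+#     print("temp C  :", torch.cuda.temperature())
+#     print("power mW:", torch.cuda.power_draw())
+#     print("clock   :", torch.cuda.clock_rate())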
+
+
+def _get_device(device: Union[int, str, torch.device]) -> torch.device:
+    r"""Return the torch.device type object from the passed in device.
+
+    Args:
+        device (torch.device or int): selected device.
+    """
+    if isinstance(device, str):
+        device = torch.device(device)
+    elif isinstance(device, int):
+        device = torch.device("cuda", device)
+    return device
+
+
+def _get_generator(device: torch.device) -> torch._C.Generator:
+    r"""Return the CUDA Generator object for the given device.
+
+    Args:
+        device (torch.device): selected device.
+    """
+    idx = device.index
+    if idx is None:
+        idx = current_device()
+    return torch.cuda.default_generators[idx]
+
+
+def _set_rng_state_offset(
+    offset: int, device: Union[int, str, torch.device] = "cuda"
+) -> None:
+    r"""Set the random number generator state offset of the specified GPU.
+
+    Args:
+        offset (int): The desired offset
+        device (torch.device or int, optional): The device to set the RNG state.
+            Default: ``'cuda'`` (i.e., ``torch.device('cuda')``, the current CUDA device).
+    """
+    final_device = _get_device(device)
+
+    def cb():
+        default_generator = _get_generator(final_device)
+        default_generator.set_offset(offset)
+
+    _lazy_call(cb)
+
+
+def _get_rng_state_offset(device: Union[int, str, torch.device] = "cuda") -> int:
+    r"""Return the random number generator state offset of the specified GPU.
+
+    Args:
+        device (torch.device or int, optional): The device to return the RNG state offset of.
+            Default: ``'cuda'`` (i.e., ``torch.device('cuda')``, the current CUDA device).
+
+    .. warning::
+        This function eagerly initializes CUDA.
+    """
+    _lazy_init()
+    final_device = _get_device(device)
+    default_generator = _get_generator(final_device)
+    return default_generator.get_offset()
+
+
+from .memory import *  # noqa: F403
+
+
+from .random import *  # noqa: F403
+
+################################################################################
+# Define Storage and Tensor classes
+################################################################################
+
+
+@staticmethod  # type: ignore[misc]
+def _lazy_new(cls, *args, **kwargs):
+    _lazy_init()
+    # We may need to call lazy init again if we are a forked child
+    # del _CudaBase.__new__
+    return super(_CudaBase, cls).__new__(cls, *args, **kwargs)
+
+
+class _CudaBase:
+    is_cuda = True
+    is_sparse = False
+
+    def type(self, *args, **kwargs):
+        # We could use a Protocol here to tell mypy that self has `get_device` method
+        # but it is only available in the typing module on Python >= 3.8
+        # or on typing_extensions module on Python >= 3.6
+        with device(self.get_device()):  # type: ignore[attr-defined]
+            return super().type(*args, **kwargs)  # type: ignore[misc]
+
+    __new__ = _lazy_new
+
+
+from torch.storage import _LegacyStorage, _warn_typed_storage_removal
+
+
+class _CudaLegacyStorage(_LegacyStorage):
+    @classmethod
+    def from_buffer(cls, *args, **kwargs):
+        _warn_typed_storage_removal()
+        raise RuntimeError("from_buffer: Not available for CUDA storage")
+
+    @classmethod
+    def _new_with_weak_ptr(cls, *args, **kwargs):
+        raise RuntimeError("_new_with_weak_ptr: Not available for CUDA storage")
+
+    @classmethod
+    def _new_shared_filename(cls, manager, obj, size, *, device=None, dtype=None):
+        raise RuntimeError("_new_shared_filename: Not available for CUDA storage")
+
+
+class ByteStorage(_CudaLegacyStorage):
+    @classproperty
+    def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
+        return torch.uint8
+
+
+class DoubleStorage(_CudaLegacyStorage):
+    @classproperty
+    def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
+        return torch.double
+
+
+class FloatStorage(_CudaLegacyStorage):
+    @classproperty
+    def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
+        return torch.float
+
+
+class HalfStorage(_CudaLegacyStorage):
+    @classproperty
+    def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
+        return torch.half
+
+
+class LongStorage(_CudaLegacyStorage):
+    @classproperty
+    def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
+        return torch.long
+
+
+class IntStorage(_CudaLegacyStorage):
+    @classproperty
+    def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
+        return torch.int
+
+
+class ShortStorage(_CudaLegacyStorage):
+    @classproperty
+    def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
+        return torch.short
+
+
+class CharStorage(_CudaLegacyStorage):
+    @classproperty
+    def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
+        return torch.int8
+
+
+class BoolStorage(_CudaLegacyStorage):
+    @classproperty
+    def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
+        return torch.bool
+
+
+class BFloat16Storage(_CudaLegacyStorage):
+    @classproperty
+    def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
+        return torch.bfloat16
+
+
+class ComplexDoubleStorage(_CudaLegacyStorage):
+    @classproperty
+    def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
+        return torch.cdouble
+
+
+class ComplexFloatStorage(_CudaLegacyStorage):
+    @classproperty
+    def dtype(self):
+        _warn_typed_storage_removal()
+        return self._dtype
+
+    @classproperty
+    def _dtype(self):
+        return torch.cfloat
+
+
+del _LegacyStorage
+del _CudaLegacyStorage
+
+torch._storage_classes.add(DoubleStorage)
+torch._storage_classes.add(FloatStorage)
+torch._storage_classes.add(LongStorage)
+torch._storage_classes.add(IntStorage)
+torch._storage_classes.add(ShortStorage)
+torch._storage_classes.add(CharStorage)
+torch._storage_classes.add(ByteStorage)
+torch._storage_classes.add(HalfStorage)
+torch._storage_classes.add(BoolStorage)
+torch._storage_classes.add(BFloat16Storage)
+torch._storage_classes.add(ComplexDoubleStorage)
+torch._storage_classes.add(ComplexFloatStorage)
+
+
+class _WrappedTritonKernel:
+    """Just a simple wrapper to store some metadata for testing purposes."""
+
+    def __init__(self, kernel):
+        self.kernel = kernel
+        self.kernel_invoked = False
+
+    def __call__(self, *args, **kwargs):
+        res = self.kernel(*args, **kwargs)
+        self.kernel_invoked = True
+        return res
+
+
+def _register_triton_kernels():
+    if torch._running_with_deploy():
+        return
+
+    @_WrappedTritonKernel
+    def kernel_impl(*args, **kwargs):
+        from torch.sparse._triton_ops import bsr_dense_mm
+
+        return bsr_dense_mm(*args, skip_checks=True, **kwargs)
+
+    @_WrappedTritonKernel
+    def addmm_kernel_impl(*args, **kwargs):
+        from torch.sparse._triton_ops import bsr_dense_addmm
+
+        return bsr_dense_addmm(*args, skip_checks=True, **kwargs)
+
+    has_triton = importlib.util.find_spec("triton") is not None
+    if has_triton:
+        torch._TritonLibrary.registerOp(
+            "_triton_bsr_dense_mm_out",
+            "_triton_bsr_dense_mm_out(Tensor bsr, Tensor dense, *, Tensor(a!) out) -> Tensor(a!)",
+            kernel_impl,
+            "SparseCsrCUDA",
+        )
+
+        torch._TritonLibrary.registerOp(
+            "_triton_bsr_dense_addmm_out",
+            (
+                "_triton_bsr_dense_addmm_out(Tensor input, Tensor bsr, Tensor dense,"
+                " *, Scalar beta, Scalar alpha, Tensor(a!) out) -> Tensor(a!)"
+            ),
+            addmm_kernel_impl,
+            "SparseCsrCUDA",
+        )
+
+
+_lazy_call(_register_triton_kernels)
+
+
+from . import amp, jiterator, nvtx, profiler, sparse
+
+__all__ = [
+    # Typed storage and tensors
+    "BFloat16Storage",
+    "BFloat16Tensor",
+    "BoolStorage",
+    "BoolTensor",
+    "ByteStorage",
+    "ByteTensor",
+    "CharStorage",
+    "CharTensor",
+    "ComplexDoubleStorage",
+    "ComplexFloatStorage",
+    "DoubleStorage",
+    "DoubleTensor",
+    "FloatStorage",
+    "FloatTensor",
+    "HalfStorage",
+    "HalfTensor",
+    "IntStorage",
+    "IntTensor",
+    "LongStorage",
+    "LongTensor",
+    "ShortStorage",
+    "ShortTensor",
+    "CUDAGraph",
+    "CudaError",
+    "DeferredCudaCallError",
+    "Event",
+    "ExternalStream",
+    "OutOfMemoryError",
+    "Stream",
+    "StreamContext",
+    "amp",
+    "caching_allocator_alloc",
+    "caching_allocator_delete",
+    "can_device_access_peer",
+    "check_error",
+    "cudaStatus",
+    "cudart",
+    "current_blas_handle",
+    "current_device",
+    "current_stream",
+    "default_generators",
+    "default_stream",
+    "device",
+    "device_count",
+    "device_of",
+    "empty_cache",
+    "get_allocator_backend",
+    "CUDAPluggableAllocator",
+    "change_current_allocator",
+    "get_arch_list",
+    "get_device_capability",
+    "get_device_name",
+    "get_device_properties",
+    "get_gencode_flags",
+    "get_rng_state",
+    "get_rng_state_all",
+    "get_sync_debug_mode",
+    "graph",
+    "graph_pool_handle",
+    "graphs",
+    "has_half",
+    "has_magma",
+    "init",
+    "initial_seed",
+    "ipc_collect",
+    "is_available",
+    "is_bf16_supported",
+    "is_current_stream_capturing",
+    "is_initialized",
+    "jiterator",
+    "list_gpu_processes",
+    "make_graphed_callables",
+    "manual_seed",
+    "manual_seed_all",
+    "max_memory_allocated",
+    "max_memory_cached",
+    "max_memory_reserved",
+    "mem_get_info",
+    "memory",
+    "memory_allocated",
+    "memory_cached",
+    "memory_reserved",
+    "memory_snapshot",
+    "memory_stats",
+    "memory_stats_as_nested_dict",
+    "memory_summary",
+    "memory_usage",
+    "temperature",
+    "power_draw",
+    "clock_rate",
+    "nccl",
+    "nvtx",
+    "profiler",
+    "random",
+    "reset_accumulated_memory_stats",
+    "reset_max_memory_allocated",
+    "reset_max_memory_cached",
+    "reset_peak_memory_stats",
+    "seed",
+    "seed_all",
+    "set_device",
+    "set_per_process_memory_fraction",
+    "set_rng_state",
+    "set_rng_state_all",
+    "set_stream",
+    "set_sync_debug_mode",
+    "sparse",
+    "stream",
+    "streams",
+    "synchronize",
+    "utilization",
+]
diff --git a/MLPY/Lib/site-packages/torch/cuda/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/cuda/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d3096f4e8512a1c5cc37d7328d5bb79cc24640b4
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/cuda/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/cuda/__pycache__/_memory_viz.cpython-39.pyc b/MLPY/Lib/site-packages/torch/cuda/__pycache__/_memory_viz.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..52ac78d5adff2e7ce813288779dec57ce5ce2954
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/cuda/__pycache__/_memory_viz.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/cuda/__pycache__/_sanitizer.cpython-39.pyc b/MLPY/Lib/site-packages/torch/cuda/__pycache__/_sanitizer.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1a79d4618c950de3b366622fbcc6a1cbbc9da95e
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/cuda/__pycache__/_sanitizer.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/cuda/__pycache__/_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/cuda/__pycache__/_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..227d58e8b6a28b41e3123278d85d87bdfefbe0c3
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/cuda/__pycache__/_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/cuda/__pycache__/comm.cpython-39.pyc b/MLPY/Lib/site-packages/torch/cuda/__pycache__/comm.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5fb856cc5f5a44403612ec678de1174d2eda150f
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/cuda/__pycache__/comm.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/cuda/__pycache__/error.cpython-39.pyc b/MLPY/Lib/site-packages/torch/cuda/__pycache__/error.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..70c4e65a59eee7949c863c25f7425f82262da24e
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/cuda/__pycache__/error.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/cuda/__pycache__/graphs.cpython-39.pyc b/MLPY/Lib/site-packages/torch/cuda/__pycache__/graphs.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..19caddd9980441d869e4befd4b346ec28d336a54
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/cuda/__pycache__/graphs.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/cuda/__pycache__/jiterator.cpython-39.pyc b/MLPY/Lib/site-packages/torch/cuda/__pycache__/jiterator.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b8cc9465913ef891167f69289b0371b7c132e0f6
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/cuda/__pycache__/jiterator.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/cuda/__pycache__/memory.cpython-39.pyc b/MLPY/Lib/site-packages/torch/cuda/__pycache__/memory.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..48d451960c45f566a189beaf1ab4038444303c06
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/cuda/__pycache__/memory.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/cuda/__pycache__/nccl.cpython-39.pyc b/MLPY/Lib/site-packages/torch/cuda/__pycache__/nccl.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..66e77a67e7a99b038c21600a9dca06d76a902af1
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/cuda/__pycache__/nccl.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/cuda/__pycache__/nvtx.cpython-39.pyc b/MLPY/Lib/site-packages/torch/cuda/__pycache__/nvtx.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..09bde3a7d236c6828e79c62ffa106452795c2b9e
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/cuda/__pycache__/nvtx.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/cuda/__pycache__/profiler.cpython-39.pyc b/MLPY/Lib/site-packages/torch/cuda/__pycache__/profiler.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9842e1e926c3ba9e5cd8bbe52e294ecf3ab17672
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/cuda/__pycache__/profiler.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/cuda/__pycache__/random.cpython-39.pyc b/MLPY/Lib/site-packages/torch/cuda/__pycache__/random.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f1611416ee848d252856cdc44efbeb42679b620f
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/cuda/__pycache__/random.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/cuda/__pycache__/sparse.cpython-39.pyc b/MLPY/Lib/site-packages/torch/cuda/__pycache__/sparse.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..162da8c0e2dff0ee9c526bad6fc0a9455e1b7f62
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/cuda/__pycache__/sparse.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/cuda/__pycache__/streams.cpython-39.pyc b/MLPY/Lib/site-packages/torch/cuda/__pycache__/streams.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e15f5665ef01a2b6c455ff6d3eb2fc1bf20a5f49
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/cuda/__pycache__/streams.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/cuda/_memory_viz.py b/MLPY/Lib/site-packages/torch/cuda/_memory_viz.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c959803f4d01d04b8590fe678df2124c4ba9c68
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/cuda/_memory_viz.py
@@ -0,0 +1,626 @@
+import pickle
+import sys
+import os
+import io
+import subprocess
+import json
+from functools import lru_cache
+from typing import Any
+from itertools import groupby
+import base64
+import warnings
+
+cache = lru_cache(None)
+
+__all__ = ["format_flamegraph", "segments", "memory", "compare"]
+
+def _frame_fmt(f, full_filename=False):
+    i = f['line']
+    fname = f['filename']
+    if not full_filename:
+        fname = fname.split('/')[-1]
+    func = f['name']
+    return f'{fname}:{i}:{func}'
+
+@cache
+def _frame_filter(name, filename):
+    omit_functions = [
+        "unwind::unwind",
+        "CapturedTraceback::gather",
+        "gather_with_cpp",
+        "_start",
+        "__libc_start_main",
+        "PyEval_",
+        "PyObject_",
+        "PyFunction_",
+    ]
+    omit_filenames = [
+        "core/boxing",
+        "/Register",
+        "/Redispatch",
+        "pythonrun.c",
+        "Modules/main.c",
+        "Objects/call.c",
+        "Objects/methodobject.c",
+        "pycore_ceval.h",
+        "ceval.c",
+        "cpython/abstract.h",
+    ]
+    for of in omit_functions:
+        if of in name:
+            return False
+    for of in omit_filenames:
+        if of in filename:
+            return False
+    return True
+
+def _frames_fmt(frames, full_filename=False, reverse=False):
+    if reverse:
+        frames = reversed(frames)
+    return [_frame_fmt(f, full_filename) for f in frames if _frame_filter(f['name'], f['filename'])]
+
+def _block_extra_legacy(b):
+    if 'history' in b:
+        frames = b['history'][0].get('frames', [])
+        real_size = b['history'][0]['real_size']
+    else:
+        real_size = b.get('requested_size', b['size'])
+        frames = []
+    return frames, real_size
+
+def _block_extra(b):
+    if 'frames' not in b:
+        # old snapshot format made it more complicated to get frames/allocated size
+        return _block_extra_legacy(b)
+    return b['frames'], b['requested_size']
+
+def format_flamegraph(flamegraph_lines, flamegraph_script=None):
+    if flamegraph_script is None:
+        flamegraph_script = f'/tmp/{os.getuid()}_flamegraph.pl'
+    if not os.path.exists(flamegraph_script):
+        import urllib.request
+        print(f"Downloading flamegraph.pl to: {flamegraph_script}")
+        urllib.request.urlretrieve(
+            'https://raw.githubusercontent.com/brendangregg/FlameGraph/master/flamegraph.pl', flamegraph_script)
+        subprocess.check_call(['chmod', '+x', flamegraph_script])
+    args = [flamegraph_script, '--countname', 'bytes']
+    p = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, encoding='utf-8')
+    assert p.stdin is not None
+    assert p.stdout is not None
+    p.stdin.write(flamegraph_lines)
+    p.stdin.close()
+    result = p.stdout.read()
+    p.stdout.close()
+    assert p.wait() == 0
+    return result
+
+def _write_blocks(f, prefix, blocks):
+    def frames_fragment(frames):
+        if not frames:
+            return ""
+        return ';'.join(_frames_fmt(frames, reverse=True))
+    for b in blocks:
+        if 'history' not in b:
+            frames, accounted_for_size = _block_extra(b)
+            f.write(f'{prefix};{b["state"]};{frames_fragment(frames)} {accounted_for_size}\n')
+        else:
+            accounted_for_size = 0
+            for h in b['history']:
+                sz = h['real_size']
+                accounted_for_size += sz
+                if 'frames' in h:
+                    frames = h['frames']
+                    f.write(f'{prefix};{b["state"]};{frames_fragment(frames)} {sz}\n')
+                else:
+                    f.write(f'{prefix};{b["state"]}; {sz}\n')
+        gaps = b['size'] - accounted_for_size
+        if gaps:
+            f.write(f'{prefix};{b["state"]}; {gaps}\n')
+
+def segments(snapshot, format_flamegraph=format_flamegraph):
+    f = io.StringIO()
+    for seg in snapshot['segments']:
+        prefix = f'stream_{seg["stream"]};seg_{seg["address"]}'
+        _write_blocks(f, prefix, seg['blocks'])
+    return format_flamegraph(f.getvalue())
+
+def memory(snapshot, format_flamegraph=format_flamegraph):
+    f = io.StringIO()
+    for seg in snapshot['segments']:
+        prefix = f'stream_{seg["stream"]}'
+        _write_blocks(f, prefix, seg['blocks'])
+    return format_flamegraph(f.getvalue())
+
+def compare(before, after, format_flamegraph=format_flamegraph):
+    def _seg_key(seg):
+        return (seg['address'], seg['total_size'])
+
+    def _seg_info(seg):
+        return f'stream_{seg["stream"]};seg_{seg["address"]}'
+
+    f = io.StringIO()
+
+    before_segs = {_seg_key(seg) for seg in before}
+    after_segs = {_seg_key(seg) for seg in after}
+
+    print(f'only_before = {[a for a,_ in (before_segs - after_segs)]}')
+    print(f'only_after = {[a for a,_ in (after_segs - before_segs)]}')
+
+    for seg in before:
+        if _seg_key(seg) not in after_segs:
+            _write_blocks(f, f'only_before;{_seg_info(seg)}', seg['blocks'])
+
+    for seg in after:
+        if _seg_key(seg) not in before_segs:
+            _write_blocks(f, f'only_after;{_seg_info(seg)}', seg['blocks'])
+
+    return format_flamegraph(f.getvalue())
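+
+# A hedged usage sketch: these helpers consume the snapshot dictionaries produced
+# by torch.cuda.memory._snapshot() (a private API) and return flamegraph SVG text.
+#
+#     snapshot = torch.cuda.memory._snapshot()
+#     with open("memory_flamegraph.svg", "w") as f:
+#         f.write(memory(snapshot))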
+
+def _format_size(num):
+    # https://stackoverflow.com/questions/1094841/get-human-readable-version-of-file-size
+    for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]:
+        if abs(num) < 1024.0:
+            return f"{num:3.1f}{unit}B"
+        num /= 1024.0
+    return f"{num:.1f}YiB"
+
+class Bytes:
+    def __init__(self, value):
+        self.value = value
+
+    def __add__(self, rhs):
+        return Bytes(self.value + rhs)
+
+    def __repr__(self):
+        return _format_size(self.value)
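+
+# A couple of illustrative values for the size-formatting helpers above:
+#
+#     _format_size(1536)        -> '1.5KiB'
+#     repr(Bytes(2048) + 1024)  -> '3.0KiB'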
+
+def calc_active(seg):
+    return sum(b['size'] for b in seg['blocks'] if b['state'] == 'active_allocated')
+
+def _report_free(free_external, free_internal):
+    total = free_external + free_internal
+    suffix = ''
+    if total != 0:
+        pct = (free_internal / total) * 100
+        suffix = f' ({pct:.1f}% internal)'
+    return f'{Bytes(total)}{suffix}'
+
+PAGE_SIZE = 1024 * 1024 * 20
+legend = f"""\
+
+Legend:
+    [a     ] - a segment in the allocator
+     ^-- a page {Bytes(PAGE_SIZE)} of memory in the segment
+    a-z: pages filled with a single block's content
+    ' ': page is completely free
+    *: page is completely full with multiple blocks
+    0-9: page is partially full with tensors of multiple blocks (9 == 90% full)
+    (X% internal) - of the free memory, X% is free because we rounded the size of the allocation.
+"""
+
+def segsum(data):
+    r"""Visually reports how the allocator has filled its segments.
+
+    This printout can help debug fragmentation issues since free fragments
+    will appear as gaps in this printout.  The amount of free space is reported
+    for each segment.
+    We distinguish between internal free memory, which occurs because the
+    allocator rounds the allocation size, and external free memory, which is
+    the gaps between allocations in a segment.
+    Args:
+        data: snapshot dictionary created from _snapshot()
+    """
+    segments = []
+    out = io.StringIO()
+    out.write(f"Summary of segments >= {Bytes(PAGE_SIZE)} in size\n")
+    total_reserved = 0
+    total_allocated = 0
+    free_external = 0
+    free_internal = 0
+    for seg in sorted(data['segments'], key=lambda x: (x['total_size'], calc_active(x))):
+        total_reserved += seg['total_size']
+
+        seg_free_external = 0
+        seg_free_internal = 0
+        seg_allocated = 0
+        all_ranges = []
+        boffset = 0
+        for b in seg['blocks']:
+            active = b['state'] == 'active_allocated'
+            if active:
+                _, allocated_size = _block_extra(b)
+                all_ranges.append((boffset, allocated_size, True))
+                seg_allocated += allocated_size
+                seg_free_internal += b['size'] - allocated_size
+            else:
+                seg_free_external += b['size']
+
+            boffset += b['size']
+
+        total_allocated += seg_allocated
+        free_external += seg_free_external
+        free_internal += seg_free_internal
+
+        nseg = (seg['total_size'] - 1) // PAGE_SIZE + 1
+        occupied = [' ' for _ in range(nseg)]
+        frac = [0.0 for _ in range(nseg)]
+        active_size = 0
+        for i, (start_, size, active) in enumerate(all_ranges):
+            active_size += size
+            finish_ = (start_ + size)
+            start = start_ // PAGE_SIZE
+            finish = (finish_ - 1) // PAGE_SIZE + 1
+            m = chr(ord('a' if active else 'A') + (i % 26))
+            for j in range(start, finish):
+                s = max(start_, j * PAGE_SIZE)
+                e = min(finish_, (j + 1) * PAGE_SIZE)
+                frac[j] += (e - s) / PAGE_SIZE
+                if occupied[j] != ' ':
+                    occupied[j] = '0123456789*'[int(frac[j] * 10)]
+                else:
+                    occupied[j] = m
+        body = ''.join(occupied)
+        assert seg_free_external + seg_free_internal + seg_allocated == seg['total_size']
+        stream = f' stream_{seg["stream"]}' if seg['stream'] != 0 else ''
+        if seg['total_size'] >= PAGE_SIZE:
+            out.write(f'[{body}] {Bytes(seg["total_size"])} allocated, '
+                      f'{_report_free(seg_free_external, seg_free_internal)} free{stream}\n')
+    out.write(f'segments: {len(data["segments"])}\n')
+    out.write(f'total_reserved: {Bytes(total_reserved)}\n')
+    out.write(f'total_allocated: {Bytes(total_allocated)}\n')
+    out.write(f'total_free: {_report_free(free_external, free_internal)}\n')
+    out.write(legend)
+    assert free_internal + free_external + total_allocated == total_reserved
+    return out.getvalue()
+
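+# Illustrative usage sketch, not part of the original module: feeding segsum()
+# with a live snapshot.  Assumes a CUDA device and that memory history
+# recording was enabled before the allocations of interest.
+def _example_segsum_usage():
+    import torch
+    torch.cuda.memory._record_memory_history()
+    x = torch.randn(1024, 1024, device="cuda")    # some workload allocations
+    y = x @ x
+    snapshot = torch.cuda.memory._snapshot()      # same dict the CLI unpickles
+    print(segsum(snapshot))                       # per-segment occupancy map
+    return y
+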
+def trace(data):
+    out = io.StringIO()
+
+    def format(entries):
+        segment_intervals : list = []
+        segment_addr_to_name = {}
+        allocation_addr_to_name = {}
+
+        free_names : list = []
+        next_name = 0
+
+        def _name():
+            nonlocal next_name
+            if free_names:
+                return free_names.pop()
+            r, m = next_name // 26, next_name % 26
+            next_name += 1
+            return f'{chr(ord("a") + m)}{"" if r == 0 else r}'
+
+        def find_segment(addr):
+            for name, saddr, size in segment_intervals:
+                if addr >= saddr and addr < saddr + size:
+                    return name, saddr
+            for i, seg in enumerate(data['segments']):
+                saddr = seg['address']
+                size = seg['allocated_size']
+                if addr >= saddr and addr < saddr + size:
+                    return f'seg_{i}', saddr
+            return None, None
+        count = 0
+        out.write(f'{len(entries)} entries\n')
+
+
+        total_reserved = 0
+        for seg in data['segments']:
+            total_reserved += seg['total_size']
+
+        for count, e in enumerate(entries):
+            if e['action'] == 'alloc':
+                addr, size = e['addr'], e['size']
+                n = _name()
+                seg_name, seg_addr = find_segment(addr)
+                if seg_name is None:
+                    seg_name = "MEM"
+                    offset = addr
+                else:
+                    offset = addr - seg_addr
+                out.write(f'{n} = {seg_name}[{offset}:{Bytes(size)}]\n')
+                allocation_addr_to_name[addr] = (n, size, count)
+                count += size
+            elif e['action'] == 'free_requested':
+                addr, size = e['addr'], e['size']
+                name, _, _ = allocation_addr_to_name.get(addr, (addr, None, None))
+                out.write(f'del {name} # {Bytes(size)}\n')
+            elif e['action'] == 'free_completed':
+                addr, size = e['addr'], e['size']
+                count -= size
+                name, _, _ = allocation_addr_to_name.get(addr, (addr, None, None))
+                out.write(f'# free completed for {name} {Bytes(size)}\n')
+                if name in allocation_addr_to_name:
+                    free_names.append(name)
+                    del allocation_addr_to_name[name]
+            elif e['action'] == 'segment_alloc':
+                addr, size = e['addr'], e['size']
+                name = _name()
+                out.write(f'{name} = cudaMalloc({addr}, {Bytes(size)})\n')
+                segment_intervals.append((name, addr, size))
+                segment_addr_to_name[addr] = name
+            elif e['action'] == 'segment_free':
+                addr, size = e['addr'], e['size']
+                name = segment_addr_to_name.get(addr, addr)
+                out.write(f'cudaFree({name}) # {Bytes(size)}\n')
+                if name in segment_addr_to_name:
+                    free_names.append(name)
+                    del segment_addr_to_name[name]
+            elif e['action'] == 'oom':
+                size = e['size']
+                free = e['device_free']
+                out.write(f'raise OutOfMemoryError() # {Bytes(size)} requested, {Bytes(free)} free in CUDA\n')
+            else:
+                out.write(f'{e}\n')
+        out.write(f"TOTAL MEM: {Bytes(count)}")
+    for i, d in enumerate(data['device_traces']):
+        if d:
+            out.write(f'Device {i} ----------------\n')
+            format(d)
+    return out.getvalue()
+
+
+# Minimal HTML shell for the interactive viewer; the $VIZ_KIND and $SNAPSHOT
+# placeholders are substituted in _format_viz below.  The CDN path to
+# MemoryViz.js follows the upstream torch/cuda/_memory_viz.py template.
+_memory_viz_template = r"""
+<!DOCTYPE html>
+<html>
+<head>
+</head>
+<body>
+<script type="module">
+import {add_local_files} from "https://cdn.jsdelivr.net/gh/pytorch/pytorch@main/torch/utils/viz/MemoryViz.js"
+const local_files = $SNAPSHOT
+add_local_files(local_files, $VIZ_KIND)
+</script>
+</body>
+</html>
+"""
+
+def _format_viz(data, viz_kind, device):
+    if device is not None:
+        warnings.warn('device argument is deprecated, plots now contain all devices')
+    buffer = pickle.dumps(data)
+    buffer += b'\x00' * (3 - len(buffer) % 3)
+    # Encode the buffer with base64
+    encoded_buffer = base64.b64encode(buffer).decode('utf-8')
+
+    json_format = json.dumps([{"name": 'snapshot.pickle', "base64": encoded_buffer}])
+    return _memory_viz_template.replace('$VIZ_KIND', repr(viz_kind)) \
+                               .replace('$SNAPSHOT', json_format)
+
+def trace_plot(data, device=None, plot_segments=False):
+    """Generate a visualization over time of the memory usage recorded by the trace as an html file.
+
+    Args:
+        data: Memory snapshot as generated from torch.cuda.memory._snapshot()
+        device (torch.device, optional): Generate the trace for this device, needed if multiple devices have allocations.
+        plot_segments (bool, optional): Plots memory returned from cudaMalloc, rather than individual allocations.
+                                        Defaults to False.
+
+    Returns:
+        str: HTML of visualization
+    """
+    return _format_viz(data, 'Active Memory Timeline' if not plot_segments else 'Active Cached Memory Timeline', device)
+
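+# Illustrative usage sketch, not part of the original module: writing the
+# timeline produced by trace_plot() to an HTML file.  Assumes memory history
+# recording is on so the snapshot contains device traces.
+def _example_trace_plot_usage(path="memory_timeline.html"):
+    import torch
+    torch.cuda.memory._record_memory_history()
+    torch.randn(4096, 4096, device="cuda").sum().item()   # produce some traffic
+    snapshot = torch.cuda.memory._snapshot()
+    with open(path, "w") as f:
+        f.write(trace_plot(snapshot))
+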
+
+def _profile_to_snapshot(profile):
+    import torch
+    from torch.profiler._memory_profiler import Action, TensorKey
+    from torch._C._profiler import _EventType
+    memory_profile = profile._memory_profile()
+
+    allocation_stacks = {}
+    for event in memory_profile._op_tree.sorted_nodes:
+        if event.tag == _EventType.Allocation:
+            parent = event.parent
+            python_parents = []
+            while parent:
+                if parent.tag in (_EventType.PyCall, _EventType.PyCCall):
+                    python_parents.append(parent)
+                parent = parent.parent
+            key = TensorKey.from_allocation(event.extra_fields)
+
+            # Corner case: If allocation doesn't have an ID (can't prove it was used as a Tensor)
+            #              key will be None. I should add some way to identify these, I just haven't yet.
+            if key and event.extra_fields.alloc_size > 0:
+                allocation_stacks[key] = python_parents
+
+
+    device_count = torch.cuda.device_count()
+    snapshot = {
+        'device_traces': [[] for _ in range(device_count + 1)],
+        'segments': [{'device': device,
+                      'address': None,
+                      'total_size': 0,
+                      'stream': 0,
+                      'blocks': []} for device in range(device_count + 1)]
+    }
+
+    def to_device(device):
+        if device.type == 'cuda':
+            return device.index
+        else:
+            return device_count
+
+    def allocate(size, tensor_key, version, during_trace=True):
+        device = to_device(tensor_key.device)
+        addr = tensor_key.storage.ptr
+
+        seg = snapshot['segments'][device]  # type: ignore[index]
+        if seg['address'] is None or seg['address'] > addr:
+            seg['address'] = addr
+        seg['total_size'] = max(seg['total_size'], addr + size)  # record max addr for now, we will make it the size later
+        category = memory_profile._categories.get(tensor_key, version)
+        category = category.name.lower() if category is not None else "unknown"
+        stack = allocation_stacks.get(tensor_key, ())
+        stack = [{'filename': 'none', 'line': 0, 'name': p.name} for p in stack]
+        r = {'action': 'alloc', 'addr': addr, 'size': size, 'stream': 0, 'frames': stack, 'category': category}
+        if during_trace:
+            snapshot['device_traces'][device].append(r)  # type: ignore[index]
+        return r
+
+    def free(alloc, device):
+        for e in ('free_requested', 'free_completed'):
+            snapshot['device_traces'][device].append({'action': e,  # type: ignore[index]
+                                                      'addr': alloc['addr'],
+                                                      'size': alloc['size'],
+                                                      'stream': 0,
+                                                      'frames': alloc['frames']})
+
+    kv_to_elem = {}
+
+
+
+    # create the device trace
+    for time, action, (tensor_key, version), size in memory_profile.timeline:
+        if not isinstance(tensor_key, TensorKey):
+            continue
+        if action == Action.CREATE:
+            kv_to_elem[(tensor_key, version)] = allocate(size, tensor_key, version)
+        elif action == Action.DESTROY:
+            free(kv_to_elem.pop((tensor_key, version)), to_device(tensor_key.device))
+        elif action == Action.INCREMENT_VERSION:
+            free(kv_to_elem.pop((tensor_key, version)), to_device(tensor_key.device))
+            kv_to_elem[(tensor_key, version + 1)] = allocate(size, tensor_key, version + 1)
+        elif action == Action.PREEXISTING:
+            kv_to_elem[(tensor_key, version)] = allocate(size, tensor_key, version, during_trace=False)
+
+
+    # create the final snapshot state
+    blocks_at_end = [(to_device(tensor_key.device), event['addr'], event['size'], event['frames'])
+                     for (tensor_key, version), event in kv_to_elem.items()]
+    for device, blocks in groupby(sorted(blocks_at_end), key=lambda x: x[0]):
+        seg = snapshot['segments'][device]  # type: ignore[index]
+        last_addr = seg['address']
+        for _, addr, size, frames in blocks:
+            if last_addr < addr:
+                seg['blocks'].append({'size': addr - last_addr, 'state': 'inactive'})
+            seg['blocks'].append({'size': size, 'state': 'active_allocated', 'requested_size': size, 'frames': frames})
+            last_addr = addr + size
+        if last_addr < seg['total_size']:
+            seg['blocks'].append({'size': seg['total_size'] - last_addr, 'state': 'inactive'})
+
+    snapshot['segments'] = [seg for seg in snapshot['segments'] if seg['blocks']]  # type: ignore[attr-defined]
+    for seg in snapshot['segments']:  # type: ignore[attr-defined, name-defined, no-redef]
+        seg['total_size'] -= seg['address']
+        if not seg['blocks']:
+            seg['blocks'].append({'size': seg['total_size'], 'state': 'inactive'})
+
+    return snapshot
+
+def profile_plot(profile, device=None):
+    """Generate a visualization over time of the memory usage recorded by kineto memory profiling as an html file.
+
+    Args:
+        profile: profile as generated by `torch.profiler.profile(profile_memory=True)`
+        device (torch.device, optional): Generate the trace for this device, needed if multiple devices have allocations.
+
+    Returns:
+        str: HTML of visualization
+    """
+    snapshot = _profile_to_snapshot(profile)
+    return _format_viz(snapshot, 'Active Memory Timeline', device)
+
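+# Illustrative usage sketch, not part of the original module: building the HTML
+# view from a kineto memory profile.  ``model`` and ``inputs`` are placeholders
+# supplied by the caller; a CUDA device is assumed.
+def _example_profile_plot_usage(model, inputs, path="profile_timeline.html"):
+    from torch.profiler import profile
+    with profile(profile_memory=True, record_shapes=True, with_stack=True) as prof:
+        model(inputs)
+    with open(path, "w") as f:
+        f.write(profile_plot(prof))
+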
+
+def segment_plot(data: Any, device=None):
+    return _format_viz(data, 'Allocator State History', device)
+
+if __name__ == "__main__":
+    import os.path
+    thedir = os.path.realpath(os.path.dirname(__file__))
+    if thedir in sys.path:
+        # otherwise we find cuda/random.py as random...
+        sys.path.remove(thedir)
+    import argparse
+
+    fn_name = 'torch.cuda.memory._snapshot()'
+    pickled = f'pickled memory statistics from {fn_name}'
+    parser = argparse.ArgumentParser(description=f'Visualize memory dumps produced by {fn_name}')
+
+    subparsers = parser.add_subparsers(dest='action')
+
+    def _output(p):
+        p.add_argument('-o', '--output', default='output.svg', help='flamegraph svg (default: output.svg)')
+
+    description = 'Prints overall allocation statistics and a visualization of how the allocator\'s segments are currently filled.'
+    stats_a = subparsers.add_parser('stats', description=description)
+    stats_a.add_argument('input', help=pickled)
+
+    description = 'Prints the buffer of the most recent allocation events embedded in the snapshot in a Pythonic style.'
+    trace_a = subparsers.add_parser('trace', description=description)
+    trace_a.add_argument('input', help=pickled)
+
+    description = 'Generate a flamegraph that visualizes what memory is stored in each allocator segment (aka block)'
+    segments_a = subparsers.add_parser('segments', description=description)
+    segments_a.add_argument('input', help=pickled)
+    _output(segments_a)
+
+    description = "Generate a flamegraph the program locations contributing to CUDA memory usage."
+    memory_a = subparsers.add_parser('memory', description=description)
+    memory_a.add_argument('input', help=pickled)
+    _output(memory_a)
+
+    description = 'Generate a flamegraph that shows segments (aka blocks) that have been added ' \
+        'or removed between two different memory snapshots.'
+    compare_a = subparsers.add_parser('compare', description=description)
+    compare_a.add_argument('before', help=pickled)
+    compare_a.add_argument('after', help=pickled)
+    _output(compare_a)
+
+    plots = (
+        ("trace_plot", "Generate a visualization over time of the memory usage recorded by the trace as an html file."),
+        ("segment_plot", "Visualize how allocations are packed into allocator segments at each point in a trace as an html file.")
+    )
+    for cmd, description in plots:
+        trace_plot_a = subparsers.add_parser(cmd, description=description)
+        trace_plot_a.add_argument('input', help=pickled)
+        help = 'visualize trace from this device (default: chooses the only device with trace info or errors)'
+        trace_plot_a.add_argument('-d', '--device', type=int, default=None, help=help)
+        help = 'path to save the visualization (default: output.html)'
+        trace_plot_a.add_argument('-o', '--output', default='output.html', help=help)
+        if cmd == "trace_plot":
+            help = 'visualize change to segments rather than individual allocations'
+            trace_plot_a.add_argument('-s', '--segments', action='store_true', help=help)
+
+
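+    # Typical invocations (the snapshot path is a placeholder):
+    #   python -m torch.cuda._memory_viz stats snapshot.pickle
+    #   python -m torch.cuda._memory_viz trace snapshot.pickle
+    #   python -m torch.cuda._memory_viz memory snapshot.pickle -o memory.svg
+    #   python -m torch.cuda._memory_viz trace_plot snapshot.pickle -o timeline.html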
+    args = parser.parse_args()
+
+    def _read(name):
+        if name == '-':
+            f = sys.stdin.buffer
+        else:
+            f = open(name, 'rb')
+        data = pickle.load(f)
+        if isinstance(data, list):  # segments only...
+            data = {'segments': data, 'traces': []}
+        return data
+
+    def _write(name, data):
+        with open(name, 'w') as f:
+            f.write(data)
+
+    if args.action == 'segments':
+        data = _read(args.input)
+        _write(args.output, segments(data))
+    elif args.action == 'memory':
+        data = _read(args.input)
+        _write(args.output, memory(data))
+    elif args.action == 'stats':
+        data = _read(args.input)
+        print(segsum(data))
+    elif args.action == 'trace':
+        data = _read(args.input)
+        print(trace(data))
+    elif args.action == 'compare':
+        before = _read(args.before)
+        after = _read(args.after)
+        _write(args.output, compare(before, after))
+    elif args.action == 'trace_plot':
+        data = _read(args.input)
+        _write(args.output, trace_plot(data, device=args.device, plot_segments=args.segments))
+    elif args.action == 'segment_plot':
+        data = _read(args.input)
+        _write(args.output, segment_plot(data, device=args.device))
diff --git a/MLPY/Lib/site-packages/torch/cuda/_sanitizer.py b/MLPY/Lib/site-packages/torch/cuda/_sanitizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..e01f8a87a43f1643c8bcfe2577e8977e7fb4e3e4
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/cuda/_sanitizer.py
@@ -0,0 +1,622 @@
+r"""
+This module introduces CUDA Sanitizer, a tool for detecting synchronization errors between kernels run on different streams.
+
+It stores information on accesses to tensors to determine if they are synchronized
+or not. When enabled in a Python program and a possible data race is detected, a
+detailed warning will be printed and the program will exit.
+
+It can be enabled either by importing this module and calling
+:func:`enable_cuda_sanitizer()` or by exporting the ``TORCH_CUDA_SANITIZER``
+environment variable.
+"""
+
+import enum
+import functools
+import inspect
+import io
+import logging
+import sys
+import textwrap
+import traceback
+from dataclasses import dataclass, field
+from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, TypeVar
+
+import torch
+import torch.utils._cuda_trace as cuda_trace
+from torch.utils import _pytree as pytree
+from torch.utils._python_dispatch import TorchDispatchMode
+
+
+DEFAULT_STREAM_ID = 0
+
+TK = TypeVar("TK")
+TVa = TypeVar("TVa")
+TVb = TypeVar("TVb")
+
+DataPtr = int
+StreamId = int
+EventId = int
+SeqNum = int
+
+logger = logging.getLogger(__name__)
+
+
+class AccessType(enum.Enum):
+    READ = enum.auto()
+    WRITE = enum.auto()
+
+    def __str__(self):
+        return "reading from" if self is AccessType.READ else "writing to"
+
+
+@dataclass
+class Access:
+    r"""Stores information about a single access to a tensor by a kernel.
+
+    Args:
+        type: either AccessType.READ or AccessType.WRITE.
+        seq_num: the sequential number of the kernel performing the access.
+        stream: the stream id of the stream executing the kernel.
+        operator: the schema of the launched kernel, which lists the
+            arguments and return type.
+        aliases: the arguments in the schema this access corresponds to.
+        is_output: Whether the tensor was an output of the kernel.
+        stack_trace: the stack summary object captured during access.
+    """
+
+    type: AccessType
+    seq_num: SeqNum
+    stream: StreamId
+    operator: str
+    aliases: List[str]
+    is_output: bool
+    stack_trace: traceback.StackSummary
+
+
+class SynchronizationError(Exception):
+    """Base class for errors detected by CUDA Sanitizer."""
+
+    pass
+
+
+class UnsynchronizedAccessError(SynchronizationError):
+    """Stores information about two unsynchronized accesses to one data pointer."""
+
+    def __init__(
+        self,
+        data_ptr: DataPtr,
+        allocation_stack_trace: Optional[traceback.StackSummary],
+        current_access: Access,
+        previous_access: Access,
+    ):
+        self.data_ptr = data_ptr
+        self.allocation_stack_trace = allocation_stack_trace
+        self.current_access = current_access
+        self.previous_access = previous_access
+
+    def __str__(self):
+        def format_access(access: Access):
+            message.write(f"{access.operator}\n{access.type}")
+            if access.aliases:
+                message.write(" argument(s) " + ", ".join(access.aliases))
+                if access.is_output:
+                    message.write(", and to")
+            if access.is_output:
+                message.write(" the output")
+            message.write(
+                f"\nWith stack trace:\n{''.join(access.stack_trace.format())}\n"
+            )
+
+        with io.StringIO() as message:
+            message.write(
+                textwrap.dedent(
+                    f"""\
+                    ============================
+                    CSAN detected a possible data race on tensor with data pointer {self.data_ptr}
+                    Access by stream {self.current_access.stream} during kernel:
+                    """
+                )
+            )
+            format_access(self.current_access)
+
+            message.write(
+                f"Previous access by stream {self.previous_access.stream} during kernel:\n"
+            )
+            format_access(self.previous_access)
+
+            if self.allocation_stack_trace:
+                message.write(
+                    "Tensor was allocated with stack trace:\n"
+                    f"{''.join(self.allocation_stack_trace.format())}"
+                )
+            else:
+                message.write("Trace for tensor allocation not found.")
+            return message.getvalue()
+
+
+class CUDASanitizerErrors(Exception):
+    """Wrapper class for errors reported by CUDA Sanitizer."""
+
+    def __init__(self, errors: List[SynchronizationError]):
+        self.errors = errors
+
+    def __str__(self):
+        return f"detected {len(self.errors)} errors"
+
+
+@dataclass
+class TensorInfo:
+    r"""Stores information about a single tensor and recent accesses to it.
+
+    Args:
+        allocation_stack_trace: the stack summary object captured during tensor
+            allocation. Can be ``None`` if the allocation wasn't caught by CSAN.
+        reads: list of read accesses to the tensor that were performed since
+            the last write.
+        write: the last write access to the tensor.
+    """
+
+    allocation_stack_trace: Optional[traceback.StackSummary]
+    reads: List[Access] = field(default_factory=list)
+    write: Optional[Access] = None
+
+
+class _TensorsAccessed:
+    def __init__(self):
+        self.accesses: Dict[DataPtr, TensorInfo] = {}
+
+    def ensure_tensor_exists(self, data_ptr: DataPtr) -> None:
+        if data_ptr not in self.accesses:
+            logger.info(
+                "Found tensor with pointer: %s, but no matching tensor "
+                "allocation in the trace. Backfilling the trace now. "
+                "Perhaps the sanitizer was enabled after some torch operations?",
+                data_ptr,
+            )
+            self.create_tensor(data_ptr, None)
+
+    def ensure_tensor_does_not_exist(self, data_ptr: DataPtr) -> None:
+        if data_ptr in self.accesses:
+            logger.info(
+                "Found duplicate tensor allocation in the trace for tensor with "
+                "pointer: %s. Assuming the trace for tensor deallocation "
+                "wasn't caught and backfilling it now. "
+                "Perhaps the sanitizer was enabled after some torch operations?",
+                data_ptr,
+            )
+            self.delete_tensor(data_ptr)
+
+    def create_tensor(
+        self, data_ptr: DataPtr, stack_trace: Optional[traceback.StackSummary]
+    ) -> None:
+        self.accesses[data_ptr] = TensorInfo(stack_trace)
+
+    def delete_tensor(self, data_ptr: DataPtr) -> None:
+        del self.accesses[data_ptr]
+
+    def were_there_reads_since_last_write(self, data_ptr: DataPtr) -> bool:
+        return bool(self.accesses[data_ptr].reads)
+
+    def get_allocation_stack_trace(
+        self, data_ptr: DataPtr
+    ) -> Optional[traceback.StackSummary]:
+        return self.accesses[data_ptr].allocation_stack_trace
+
+    def get_write(self, data_ptr: DataPtr) -> Optional[Access]:
+        return self.accesses[data_ptr].write
+
+    def get_reads(self, data_ptr: DataPtr) -> List[Access]:
+        return self.accesses[data_ptr].reads
+
+    def add_read(self, data_ptr: DataPtr, access: Access) -> None:
+        self.accesses[data_ptr].reads.append(access)
+
+    def set_write(self, data_ptr: DataPtr, access: Access) -> None:
+        self.accesses[data_ptr].write = access
+        self.accesses[data_ptr].reads = []
+
+
+class StreamSynchronizations:
+    def __init__(self):
+        self.current_sync_states: Dict[StreamId, Dict[StreamId, SeqNum]] = {}
+        self.recorded_sync_states: Dict[EventId, Dict[StreamId, SeqNum]] = {}
+        self.host_sync_state: Dict[StreamId, SeqNum] = {}
+        self.create_stream(DEFAULT_STREAM_ID)
+
+    def _ensure_stream_exists(self, stream: StreamId) -> None:
+        if stream not in self.current_sync_states:
+            logger.info(
+                "Found Stream with id: %s, but no matching stream "
+                "creation in the trace. Backfilling the trace now. "
+                "Perhaps the sanitizer was enabled after some torch operations?",
+                stream,
+            )
+            self.create_stream(stream)
+
+    def _ensure_event_exists(self, event: EventId) -> None:
+        if event not in self.recorded_sync_states:
+            logger.info(
+                "Found Event with id: %s, but no matching event "
+                "creation in the trace. Backfilling the trace now. "
+                "Perhaps the sanitizer was enabled after some torch operations?",
+                event,
+            )
+            self.create_event(event)
+
+    def _ensure_event_does_not_exist(self, event: EventId) -> None:
+        if event in self.recorded_sync_states:
+            logger.info(
+                "Found duplicate event creation in the trace for event with "
+                "id: %s. Assuming the trace for event deletion wasn't caught "
+                "and backfilling it now. "
+                "Perhaps the sanitizer was enabled after some torch operations?",
+                event,
+            )
+            self.delete_event(event)
+
+    def create_stream(self, stream: StreamId) -> None:
+        if stream in self.current_sync_states:
+            logger.info(
+                "Found duplicate Stream creation in the trace for Stream with "
+                "id: %s. PyTorch Streams are only created once, so this "
+                "trace entry is ignored.",
+                stream,
+            )
+        else:
+            self.host_sync_state[stream] = 0
+            self.current_sync_states[stream] = self.host_sync_state.copy()
+
+    def create_event(self, event: EventId) -> None:
+        self._ensure_event_does_not_exist(event)
+        self.recorded_sync_states[event] = {}
+
+    def delete_event(self, event: EventId) -> None:
+        self._ensure_event_exists(event)
+        del self.recorded_sync_states[event]
+
+    def update_seq_num(self, stream: StreamId, seq_num: SeqNum) -> None:
+        self._ensure_stream_exists(stream)
+        self.current_sync_states[stream][stream] = seq_num
+
+    def record_state(self, event: EventId, stream: StreamId) -> None:
+        self._ensure_event_exists(event)
+        self._ensure_stream_exists(stream)
+        self.recorded_sync_states[event] = self.current_sync_states[stream].copy()
+
+    def _state_wait_for_other(
+        self, state: Dict[StreamId, SeqNum], other: Dict[StreamId, SeqNum]
+    ) -> None:
+        for stream, seq_num in other.items():
+            state[stream] = max(state.get(stream, -1), seq_num)
+
+    def stream_wait_for_event(self, stream: StreamId, event: EventId) -> None:
+        self._ensure_stream_exists(stream)
+        self._ensure_event_exists(event)
+        self._state_wait_for_other(
+            self.current_sync_states[stream], self.recorded_sync_states[event]
+        )
+
+    def all_streams_wait_for_event(self, event: EventId) -> None:
+        self._ensure_event_exists(event)
+        for stream in self.current_sync_states.keys():
+            self.stream_wait_for_event(stream, event)
+
+        self._state_wait_for_other(
+            self.host_sync_state, self.recorded_sync_states[event]
+        )
+
+    def all_streams_wait_for_stream(self, stream: StreamId) -> None:
+        self._ensure_stream_exists(stream)
+        for state in self.current_sync_states.values():
+            self._state_wait_for_other(state, self.current_sync_states[stream])
+
+        self._state_wait_for_other(
+            self.host_sync_state, self.current_sync_states[stream]
+        )
+
+    def sync_all_streams(self) -> None:
+        for stream, state in self.current_sync_states.items():
+            self.host_sync_state[stream] = state[stream]
+
+        for state in self.current_sync_states.values():
+            self._state_wait_for_other(state, self.host_sync_state)
+
+    def is_ordered_after(
+        self, current_stream: StreamId, seq_num: SeqNum, other_stream: StreamId
+    ) -> bool:
+        self._ensure_stream_exists(current_stream)
+        self._ensure_stream_exists(other_stream)
+        return seq_num <= self.current_sync_states[current_stream].get(other_stream, -1)
+
+
+class EventHandler:
+    """Analyzes CSAN trace for synchronization errors.
+
+    Stores information on each stream's synchronizations with other streams as well
+    as tensor accesses to determine whether a given kernel launch might cause a
+    data race.
+    """
+
+    def __init__(self):
+        self.tensors_accessed = _TensorsAccessed()
+        self.syncs = StreamSynchronizations()
+        self.seq_num: SeqNum = 0
+
+    def _handle_kernel_launch(
+        self,
+        stream: StreamId,
+        read_only: Set[DataPtr],
+        read_write: Set[DataPtr],
+        outputs: Set[DataPtr],
+        operator: str,
+        tensor_aliases: Dict[int, List[str]],
+    ) -> List[SynchronizationError]:
+        def check_conflict(
+            data_ptr: DataPtr, current_access: Access, previous_access: Optional[Access]
+        ) -> None:
+            if previous_access is None:
+                return
+            if not self.syncs.is_ordered_after(
+                current_access.stream, previous_access.seq_num, previous_access.stream
+            ):
+                error_list.append(
+                    UnsynchronizedAccessError(
+                        data_ptr,
+                        self.tensors_accessed.get_allocation_stack_trace(data_ptr),
+                        current_access,
+                        previous_access,
+                    )
+                )
+
+        error_list: List[SynchronizationError] = []
+        self.seq_num += 1
+        self.syncs.update_seq_num(stream, self.seq_num)
+        stack_trace = traceback.StackSummary.extract(
+            traceback.walk_stack(inspect.currentframe()), lookup_lines=False
+        )
+        # The stack trace generated in this way is in the inverse order, so it must be
+        # reversed.
+        stack_trace.reverse()
+
+        for data_ptr in read_only:
+            self.tensors_accessed.ensure_tensor_exists(data_ptr)
+            current_access = Access(
+                AccessType.READ,
+                self.seq_num,
+                stream,
+                operator,
+                tensor_aliases[data_ptr],
+                data_ptr in outputs,
+                stack_trace,
+            )
+            check_conflict(
+                data_ptr, current_access, self.tensors_accessed.get_write(data_ptr)
+            )
+            self.tensors_accessed.add_read(data_ptr, current_access)
+
+        for data_ptr in read_write:
+            self.tensors_accessed.ensure_tensor_exists(data_ptr)
+            current_access = Access(
+                AccessType.WRITE,
+                self.seq_num,
+                stream,
+                operator,
+                tensor_aliases[data_ptr],
+                data_ptr in outputs,
+                stack_trace,
+            )
+            if self.tensors_accessed.were_there_reads_since_last_write(data_ptr):
+                for previous_access in self.tensors_accessed.get_reads(data_ptr):
+                    check_conflict(data_ptr, current_access, previous_access)
+            else:
+                check_conflict(
+                    data_ptr, current_access, self.tensors_accessed.get_write(data_ptr)
+                )
+            self.tensors_accessed.set_write(data_ptr, current_access)
+
+        return error_list
+
+    def _handle_event_creation(self, event: EventId) -> None:
+        self.syncs.create_event(event)
+
+    def _handle_event_deletion(self, event: EventId) -> None:
+        self.syncs.delete_event(event)
+
+    def _handle_event_record(self, event: EventId, stream: StreamId) -> None:
+        self.syncs.record_state(event, stream)
+
+    def _handle_event_wait(self, event: EventId, stream: StreamId) -> None:
+        self.syncs.stream_wait_for_event(stream, event)
+
+    def _handle_memory_allocation(self, data_ptr: DataPtr) -> None:
+        self.tensors_accessed.ensure_tensor_does_not_exist(data_ptr)
+        stack_trace = traceback.StackSummary.extract(
+            traceback.walk_stack(inspect.currentframe()), lookup_lines=False
+        )
+        # The stack trace generated in this way is in the inverse order, so it must be
+        # reversed.
+        stack_trace.reverse()
+        self.tensors_accessed.create_tensor(
+            data_ptr,
+            stack_trace,
+        )
+
+    def _handle_memory_deallocation(self, data_ptr: DataPtr) -> None:
+        self.tensors_accessed.ensure_tensor_exists(data_ptr)
+        self.tensors_accessed.delete_tensor(data_ptr)
+
+    def _handle_stream_creation(self, stream: StreamId) -> None:
+        self.syncs.create_stream(stream)
+
+    def _handle_device_synchronization(self) -> None:
+        self.syncs.sync_all_streams()
+
+    def _handle_stream_synchronization(self, stream: StreamId) -> None:
+        self.syncs.all_streams_wait_for_stream(stream)
+
+    def _handle_event_synchronization(self, event: EventId) -> None:
+        self.syncs.all_streams_wait_for_event(event)
+
+
+def zip_by_key(a: Dict[TK, TVa], b: Dict[TK, TVb]) -> Iterator[Tuple[TK, TVa, TVb]]:
+    for arg, value in a.items():
+        if arg in b:
+            yield arg, value, b[arg]
+
+
+def zip_arguments(
+    schema: torch.FunctionSchema, args: Tuple[Any, ...], kwargs: Dict[str, Any]
+) -> Iterator[Tuple[torch.Argument, Any]]:
+    schema_args = schema.arguments[: len(args)]
+    schema_kwargs = {arg.name: arg for arg in schema.arguments[len(args) :]}
+
+    yield from zip(schema_args, args)
+
+    for _, argument, value in zip_by_key(schema_kwargs, kwargs):
+        yield (argument, value)
+
+
+class ArgumentHandler:
+    def __init__(self):
+        self.dataptrs_read: Set[DataPtr] = set()
+        self.dataptrs_written: Set[DataPtr] = set()
+        self.tensor_aliases: Dict[DataPtr, List[str]] = dict()
+        self.outputs: Set[DataPtr] = set()
+
+    def _handle_argument(
+        self,
+        value: Any,
+        is_write: bool,
+        name: Optional[str] = None,
+        is_output: bool = False,
+    ) -> None:
+        if isinstance(value, torch.Tensor) and value.is_cuda:
+            data_ptr = value.data_ptr()
+            if is_write:
+                self.dataptrs_written.add(data_ptr)
+            else:
+                self.dataptrs_read.add(data_ptr)
+
+            self.tensor_aliases.setdefault(data_ptr, [])
+            if name is not None:
+                self.tensor_aliases[data_ptr].append(name)
+            if is_output:
+                self.outputs.add(data_ptr)
+
+    def parse_inputs(
+        self,
+        schema: torch.FunctionSchema,
+        args: Tuple[Any, ...],
+        kwargs: Dict[str, Any],
+    ) -> None:
+        for argument, value in zip_arguments(schema, args, kwargs):
+            is_write = argument.alias_info is not None and argument.alias_info.is_write
+            pytree.tree_map_(
+                functools.partial(
+                    self._handle_argument, is_write=is_write, name=argument.name
+                ),
+                value,
+            )
+
+    def parse_outputs(self, outputs: Any) -> None:
+        pytree.tree_map_(
+            functools.partial(self._handle_argument, is_write=True, is_output=True),
+            outputs,
+        )
+
+
+class CUDASanitizerDispatchMode(TorchDispatchMode):
+    def __init__(self):
+        self.event_handler = EventHandler()
+        torch._C._activate_cuda_trace()
+        cuda_trace.register_callback_for_cuda_event_creation(
+            self.event_handler._handle_event_creation
+        )
+        cuda_trace.register_callback_for_cuda_event_deletion(
+            self.event_handler._handle_event_deletion
+        )
+        cuda_trace.register_callback_for_cuda_event_record(
+            self.event_handler._handle_event_record
+        )
+        cuda_trace.register_callback_for_cuda_event_wait(
+            self.event_handler._handle_event_wait
+        )
+        cuda_trace.register_callback_for_cuda_memory_allocation(
+            self.event_handler._handle_memory_allocation
+        )
+        cuda_trace.register_callback_for_cuda_memory_deallocation(
+            self.event_handler._handle_memory_deallocation
+        )
+        cuda_trace.register_callback_for_cuda_stream_creation(
+            self.event_handler._handle_stream_creation
+        )
+        cuda_trace.register_callback_for_cuda_device_synchronization(
+            self.event_handler._handle_device_synchronization
+        )
+        cuda_trace.register_callback_for_cuda_stream_synchronization(
+            self.event_handler._handle_stream_synchronization
+        )
+        cuda_trace.register_callback_for_cuda_event_synchronization(
+            self.event_handler._handle_event_synchronization
+        )
+
+    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
+        if kwargs is None:
+            kwargs = {}
+
+        argument_handler = ArgumentHandler()
+        argument_handler.parse_inputs(func._schema, args, kwargs)
+
+        outputs = func(*args, **kwargs)
+
+        argument_handler.parse_outputs(outputs)
+        errors = self.event_handler._handle_kernel_launch(
+            torch.cuda.current_stream().cuda_stream,
+            argument_handler.dataptrs_read - argument_handler.dataptrs_written,
+            argument_handler.dataptrs_written,
+            argument_handler.outputs,
+            func._schema,
+            argument_handler.tensor_aliases,
+        )
+        if errors:
+            for error in errors:
+                print(error, file=sys.stderr)
+            raise CUDASanitizerErrors(errors)
+
+        return outputs
+
+
+class CUDASanitizer:
+    """Manages the lifetime of a CUDASanitizer dispatch mode object.
+
+    The CUDASanitizer class wraps the entering/exiting functions of the dispatch mode
+    context manager in the enable function/destructor, respectively. This is to
+    explicitly set the lifetime of the dispatch mode object to that of the application.
+    This approach was deemed more elegant than using the atexit module.
+    """
+
+    def __init__(self):
+        self.dispatch = CUDASanitizerDispatchMode()
+        self.enabled = False
+
+    def enable(self):
+        self.dispatch.__enter__()
+        self.enabled = True
+
+    def __del__(self):
+        if self.enabled:
+            self.dispatch.__exit__(None, None, None)
+
+
+def enable_cuda_sanitizer():
+    """Enable CUDA Sanitizer.
+
+    The sanitizer will begin to analyze low-level CUDA calls invoked by torch functions
+    for synchronization errors. All data races found will be printed to the standard
+    error output along with stack traces of suspected causes. For best results, the
+    sanitizer should be enabled at the very beginning of the program.
+    """
+    cuda_sanitizer.enable()
+
+
+cuda_sanitizer = CUDASanitizer()
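+
+
+# Minimal usage sketch, not part of the original module: with the sanitizer
+# enabled, an unsynchronized second access to a tensor from a different stream
+# is reported and raises CUDASanitizerErrors.  Assumes a CUDA device.
+def _example_enable_cuda_sanitizer():
+    enable_cuda_sanitizer()
+    a = torch.rand(4, 2, device="cuda")      # written on the default stream
+    with torch.cuda.stream(torch.cuda.Stream()):
+        # Unsynchronized write on a side stream: CSAN prints a detailed report
+        # and raises CUDASanitizerErrors at this call.
+        torch.mul(a, 5, out=a)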
diff --git a/MLPY/Lib/site-packages/torch/cuda/_utils.py b/MLPY/Lib/site-packages/torch/cuda/_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..a745084612a099771abb4d587b40c1b8b447b2e2
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/cuda/_utils.py
@@ -0,0 +1,38 @@
+from typing import Any
+
+import torch
+
+# The _get_device_index has been moved to torch.utils._get_device_index
+from torch._utils import _get_device_index as _torch_get_device_index
+
+
+def _get_device_index(
+    device: Any, optional: bool = False, allow_cpu: bool = False
+) -> int:
+    r"""Get the device index from :attr:`device`, which can be a torch.device object, a Python integer, or ``None``.
+
+    If :attr:`device` is a torch.device object, returns the device index if it
+    is a CUDA device. Note that for a CUDA device without a specified index,
+    i.e., ``torch.device('cuda')``, this will return the current default CUDA
+    device if :attr:`optional` is ``True``. If :attr:`allow_cpu` is ``True``,
+    CPU devices will be accepted and ``-1`` will be returned in this case.
+
+    If :attr:`device` is a Python integer, it is returned as is.
+
+    If :attr:`device` is ``None``, this will return the current default CUDA
+    device if :attr:`optional` is ``True``.
+    """
+    if isinstance(device, int):
+        return device
+    if isinstance(device, str):
+        device = torch.device(device)
+    if isinstance(device, torch.device):
+        if allow_cpu:
+            if device.type not in ["cuda", "cpu"]:
+                raise ValueError(f"Expected a cuda or cpu device, but got: {device}")
+        elif device.type != "cuda":
+            raise ValueError(f"Expected a cuda device, but got: {device}")
+    if not torch.jit.is_scripting():
+        if isinstance(device, torch.cuda.device):
+            return device.idx
+    return _torch_get_device_index(device, optional, allow_cpu)
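+
+
+# Minimal usage sketch, not part of the original module: the helper normalizes
+# several device spellings to a plain index.  The ``optional=True`` call assumes
+# an initialized CUDA context.
+def _example_get_device_index():
+    assert _get_device_index(1) == 1
+    assert _get_device_index("cuda:2") == 2
+    assert _get_device_index(torch.device("cpu"), allow_cpu=True) == -1
+    # With device=None and optional=True the current default CUDA device is used.
+    return _get_device_index(None, optional=True)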
diff --git a/MLPY/Lib/site-packages/torch/cuda/amp/__init__.py b/MLPY/Lib/site-packages/torch/cuda/amp/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d45cd029f10483aab9f4e849efb7f667a84e12a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/cuda/amp/__init__.py
@@ -0,0 +1,11 @@
+from .autocast_mode import autocast, custom_bwd, custom_fwd
+from .common import amp_definitely_not_available
+from .grad_scaler import GradScaler
+
+__all__ = [
+    "amp_definitely_not_available",
+    "autocast",
+    "custom_bwd",
+    "custom_fwd",
+    "GradScaler",
+]
diff --git a/MLPY/Lib/site-packages/torch/cuda/amp/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/cuda/amp/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c26ad5600930d1974e62808cff6007a31c79d0bf
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/cuda/amp/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/cuda/amp/__pycache__/autocast_mode.cpython-39.pyc b/MLPY/Lib/site-packages/torch/cuda/amp/__pycache__/autocast_mode.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f40c5b4dbe2990d9b6f898c8faef4723adbda2ed
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/cuda/amp/__pycache__/autocast_mode.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/cuda/amp/__pycache__/common.cpython-39.pyc b/MLPY/Lib/site-packages/torch/cuda/amp/__pycache__/common.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..03f7ebd515964176055ed41f6aa652a5d54bad75
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/cuda/amp/__pycache__/common.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/cuda/amp/__pycache__/grad_scaler.cpython-39.pyc b/MLPY/Lib/site-packages/torch/cuda/amp/__pycache__/grad_scaler.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fb598ef5ca81e283f48da43819295fc7a053148a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/cuda/amp/__pycache__/grad_scaler.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/cuda/amp/autocast_mode.py b/MLPY/Lib/site-packages/torch/cuda/amp/autocast_mode.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2e86429596b43ce6be3a97da4e07e3bdb2221ea
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/cuda/amp/autocast_mode.py
@@ -0,0 +1,144 @@
+import collections
+import functools
+
+import torch
+
+try:
+    import numpy as np
+
+    HAS_NUMPY = True
+except ModuleNotFoundError:
+    np = None  # type: ignore[assignment]
+from typing import Any
+
+__all__ = ["autocast", "custom_fwd", "custom_bwd"]
+
+
+class autocast(torch.amp.autocast_mode.autocast):
+    r"""See :class:`torch.autocast`.
+
+    ``torch.cuda.amp.autocast(args...)`` is equivalent to ``torch.autocast("cuda", args...)``
+    """
+
+    def __init__(
+        self,
+        enabled: bool = True,
+        dtype: torch.dtype = torch.float16,
+        cache_enabled: bool = True,
+    ):
+        if torch._jit_internal.is_scripting():
+            self._enabled = enabled
+            self.device = "cuda"
+            self.fast_dtype = dtype
+            return
+        super().__init__(
+            "cuda", enabled=enabled, dtype=dtype, cache_enabled=cache_enabled
+        )
+
+    def __enter__(self):
+        if torch._jit_internal.is_scripting():
+            return self
+        return super().__enter__()
+
+    # TODO: discuss a unified TorchScript-friendly API for autocast
+    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any):  # type: ignore[override]
+        if torch._jit_internal.is_scripting():
+            return
+        return super().__exit__(exc_type, exc_val, exc_tb)
+
+    def __call__(self, func):
+        if torch._jit_internal.is_scripting():
+            return func
+        return super().__call__(func)
+
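+# Minimal usage sketch, not part of the original module: the usual
+# mixed-precision forward pass under ``torch.cuda.amp.autocast``.  ``model``
+# and ``loss_fn`` are placeholders supplied by the caller.
+def _example_autocast_forward(model, loss_fn, inputs, targets):
+    with autocast():
+        outputs = model(inputs)              # eligible ops run in float16
+        loss = loss_fn(outputs, targets)     # computed under the same autocast state
+    return loss
+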
+
+# Casts Tensors and containers of Tensors.  Special-cases passthroughs for strings and np.ndarrays, which
+# may be falsely detected as "Iterables."
+def _cast(value, dtype):
+    if isinstance(value, torch.Tensor):
+        is_eligible = (
+            value.is_floating_point()
+            and value.is_cuda
+            and (value.dtype is not torch.float64)
+        )
+        return value.to(dtype) if is_eligible else value
+    elif isinstance(value, (str, bytes)):
+        return value
+    elif HAS_NUMPY and isinstance(value, np.ndarray):
+        return value
+    elif isinstance(value, collections.abc.Mapping):
+        return {_cast(k, dtype): _cast(v, dtype) for k, v in value.items()}
+    elif isinstance(value, collections.abc.Iterable):
+        iterable = (_cast(v, dtype) for v in value)
+        if isinstance(value, (list, tuple)):
+            return type(value)(iterable)
+        else:
+            return iterable
+    else:
+        return value
+
+
+# custom_fwd is a decorator that may or may not be used with arguments, following
+# https://github.com/dabeaz/python-cookbook/tree/master/src/9/defining_a_decorator_that_takes_an_optional_argument.
+# this works:
+#     @custom_fwd
+#     def forward(...):
+# this also works:
+#     @custom_fwd(cast_inputs=torch.float)
+#     def forward(...):
+def custom_fwd(fwd=None, *, cast_inputs=None):
+    """
+    Create a helper decorator for ``forward`` methods of custom autograd functions.
+
+    Autograd functions are subclasses of :class:`torch.autograd.Function`.
+    See the :ref:`example page` for more detail.
+
+    Args:
+        cast_inputs (:class:`torch.dtype` or None, optional, default=None):  If not ``None``,
+            when ``forward`` runs in an autocast-enabled region, casts incoming
+            floating-point CUDA Tensors to the target dtype (non-floating-point Tensors are not affected),
+            then executes ``forward`` with autocast disabled.
+            If ``None``, ``forward``'s internal ops execute with the current autocast state.
+
+    .. note::
+        If the decorated ``forward`` is called outside an autocast-enabled region,
+        :func:`custom_fwd` is a no-op and ``cast_inputs`` has no effect.
+    """
+    if fwd is None:
+        return functools.partial(custom_fwd, cast_inputs=cast_inputs)
+
+    @functools.wraps(fwd)
+    def decorate_fwd(*args, **kwargs):
+        args[0]._dtype = torch.get_autocast_gpu_dtype()
+        if cast_inputs is None:
+            args[0]._fwd_used_autocast = torch.is_autocast_enabled()
+            return fwd(*args, **kwargs)
+        else:
+            autocast_context = torch.is_autocast_enabled()
+            args[0]._fwd_used_autocast = False
+            if autocast_context:
+                with autocast(enabled=False):
+                    return fwd(*_cast(args, cast_inputs), **_cast(kwargs, cast_inputs))
+            else:
+                return fwd(*args, **kwargs)
+
+    return decorate_fwd
+
+
+# Autograd ensures incoming gradients are the same type as forward outputs.  Allowing a separate
+# cast_inputs argument on custom_bwd is unnecessary and could cause errors if it doesn't match
+# cast_inputs supplied to custom_fwd.
+def custom_bwd(bwd):
+    """Create a helper decorator for backward methods of custom autograd functions.
+
+    Autograd functions are subclasses of :class:`torch.autograd.Function`.
+    Ensures that ``backward`` executes with the same autocast state as ``forward``.
+    See the :ref:`example page` for more detail.
+    """
+
+    @functools.wraps(bwd)
+    def decorate_bwd(*args, **kwargs):
+        with autocast(enabled=args[0]._fwd_used_autocast, dtype=args[0]._dtype):
+            return bwd(*args, **kwargs)
+
+    return decorate_bwd
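+
+
+# Minimal usage sketch, not part of the original module: a custom autograd
+# Function whose forward is forced to float32 inside autocast regions and whose
+# backward reruns under the same autocast state.
+class _ExampleScaledMM(torch.autograd.Function):
+    @staticmethod
+    @custom_fwd(cast_inputs=torch.float32)
+    def forward(ctx, a, b):
+        ctx.save_for_backward(a, b)
+        return a.mm(b)
+
+    @staticmethod
+    @custom_bwd
+    def backward(ctx, grad):
+        a, b = ctx.saved_tensors
+        return grad.mm(b.t()), a.t().mm(grad)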
diff --git a/MLPY/Lib/site-packages/torch/cuda/amp/common.py b/MLPY/Lib/site-packages/torch/cuda/amp/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f03685281b1b053af152301f6bc5d1981ef32b5
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/cuda/amp/common.py
@@ -0,0 +1,9 @@
+from importlib.util import find_spec
+
+import torch
+
+__all__ = ["amp_definitely_not_available"]
+
+
+def amp_definitely_not_available():
+    return not (torch.cuda.is_available() or find_spec("torch_xla"))
diff --git a/MLPY/Lib/site-packages/torch/cuda/amp/grad_scaler.py b/MLPY/Lib/site-packages/torch/cuda/amp/grad_scaler.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed9cac2ecba674640d92a9a04a81e3740549c911
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/cuda/amp/grad_scaler.py
@@ -0,0 +1,28 @@
+import torch
+from torch.amp.grad_scaler import OptState
+
+__all__ = ["GradScaler", "OptState"]
+
+
+class GradScaler(torch.amp.GradScaler):
+    r"""
+    See :class:`torch.amp.GradScaler`.
+    ``torch.cuda.amp.GradScaler(args...)`` is equivalent to ``torch.amp.GradScaler("cuda", args...)``
+    """
+
+    def __init__(
+        self,
+        init_scale: float = 2.0**16,
+        growth_factor: float = 2.0,
+        backoff_factor: float = 0.5,
+        growth_interval: int = 2000,
+        enabled: bool = True,
+    ) -> None:
+        super().__init__(
+            "cuda",
+            init_scale=init_scale,
+            growth_factor=growth_factor,
+            backoff_factor=backoff_factor,
+            growth_interval=growth_interval,
+            enabled=enabled,
+        )
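+
+
+# Minimal usage sketch, not part of the original module: one scaled training
+# step.  ``model``, ``optimizer``, ``loss_fn`` and ``scaler`` are placeholders
+# supplied by the caller.
+def _example_scaled_step(model, optimizer, loss_fn, inputs, targets, scaler):
+    optimizer.zero_grad(set_to_none=True)
+    with torch.autocast("cuda"):
+        loss = loss_fn(model(inputs), targets)
+    scaler.scale(loss).backward()   # scale the loss to avoid float16 gradient underflow
+    scaler.step(optimizer)          # unscales gradients and skips the step on inf/nan
+    scaler.update()                 # adjusts the scale factor for the next iteration
+    return loss.detach()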
diff --git a/MLPY/Lib/site-packages/torch/cuda/comm.py b/MLPY/Lib/site-packages/torch/cuda/comm.py
new file mode 100644
index 0000000000000000000000000000000000000000..51c124cbf6f8932cfb9a27cd4276ce7b6c4c7cd6
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/cuda/comm.py
@@ -0,0 +1,18 @@
+# The functions here have been moved to torch.nn.parallel.comm
+from torch.nn.parallel.comm import (
+    broadcast,
+    broadcast_coalesced,
+    gather,
+    reduce_add,
+    reduce_add_coalesced,
+    scatter,
+)
+
+__all__ = [
+    "broadcast",
+    "broadcast_coalesced",
+    "reduce_add",
+    "reduce_add_coalesced",
+    "scatter",
+    "gather",
+]
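+
+
+# Minimal usage sketch, not part of the original module: broadcast a tensor to
+# two GPUs, do independent per-device work, then gather the results back.
+# Assumes at least two CUDA devices.
+def _example_broadcast_gather():
+    import torch
+    src = torch.randn(4, device="cuda:0")
+    copies = broadcast(src, devices=[0, 1])       # one copy per listed device
+    doubled = [t * 2 for t in copies]             # per-device computation
+    return gather(doubled, dim=0, destination=0)  # concatenated on cuda:0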
diff --git a/MLPY/Lib/site-packages/torch/cuda/error.py b/MLPY/Lib/site-packages/torch/cuda/error.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/cuda/graphs.py b/MLPY/Lib/site-packages/torch/cuda/graphs.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5ead56cf61306ab549623c24a39951e5972d371
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/cuda/graphs.py
@@ -0,0 +1,479 @@
+import gc
+from typing import Optional
+
+import torch
+from torch.utils import _pytree
+from .._utils import _dummy_type
+
+if not hasattr(torch._C, "_CudaStreamBase"):
+    # Define dummy base classes
+    torch._C.__dict__["_CUDAGraph"] = _dummy_type("_CUDAGraph")
+    torch._C.__dict__["_graph_pool_handle"] = _dummy_type("_graph_pool_handle")
+    torch._C.__dict__["_cuda_isCurrentStreamCapturing"] = _dummy_type(
+        "_cuda_isCurrentStreamCapturing"
+    )
+
+from torch._C import (  # noqa: F401
+    _cuda_isCurrentStreamCapturing,
+    _CUDAGraph,
+    _graph_pool_handle,
+)
+
+
+def is_current_stream_capturing():
+    r"""Return True if CUDA graph capture is underway on the current CUDA stream, False otherwise.
+
+    If a CUDA context does not exist on the current device, returns False without initializing the context.
+    """
+    return _cuda_isCurrentStreamCapturing()
+
+
+# Python shim helps Sphinx process docstrings more reliably.
+def graph_pool_handle():
+    r"""Return an opaque token representing the id of a graph memory pool.
+
+    See :ref:`Graph memory management`.
+
+    .. warning::
+        This API is in beta and may change in future releases.
+    """
+    return _graph_pool_handle()
+
+
+# Python shim helps Sphinx process docstrings more reliably.
+class CUDAGraph(torch._C._CUDAGraph):
+    r"""Wrapper around a CUDA graph.
+
+    .. warning::
+        This API is in beta and may change in future releases.
+    """
+
+    def __new__(cls):
+        return super().__new__(cls)
+
+    def capture_begin(self, pool=None, capture_error_mode="global"):
+        r"""Begin capturing CUDA work on the current stream.
+
+        Typically, you shouldn't call ``capture_begin`` yourself.
+        Use :class:`~torch.cuda.graph` or :func:`~torch.cuda.make_graphed_callables`,
+        which call ``capture_begin`` internally.
+
+        Arguments:
+            pool (optional): Token (returned by :func:`~torch.cuda.graph_pool_handle` or
+                :meth:`other_Graph_instance.pool()`) that hints this graph may share memory
+                with the indicated pool.  See :ref:`Graph memory management`.
+            capture_error_mode (str, optional): specifies the cudaStreamCaptureMode for the graph capture stream.
+                Can be "global", "thread_local" or "relaxed". During cuda graph capture, some actions, such as cudaMalloc,
+                may be unsafe. "global" will error on actions in other threads, "thread_local" will only error for
+                actions in the current thread, and "relaxed" will not error on these actions. Do NOT change this setting
+                unless you're familiar with `cudaStreamCaptureMode <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85>`_
+        """  # noqa: B950
+        super().capture_begin(pool=pool, capture_error_mode=capture_error_mode)
+
+    def capture_end(self):
+        r"""End CUDA graph capture on the current stream.
+
+        After ``capture_end``, ``replay`` may be called on this instance.
+
+        Typically, you shouldn't call ``capture_end`` yourself.
+        Use :class:`~torch.cuda.graph` or :func:`~torch.cuda.make_graphed_callables`,
+        which call ``capture_end`` internally.
+        """
+        super().capture_end()
+
+    def replay(self):
+        r"""Replay the CUDA work captured by this graph."""
+        super().replay()
+
+    def reset(self):
+        r"""Delete the graph currently held by this instance."""
+        super().reset()
+
+    def pool(self):
+        r"""Return an opaque token representing the id of this graph's memory pool.
+
+        This id can optionally be passed to another graph's ``capture_begin``,
+        which hints the other graph may share the same memory pool.
+        """
+        return super().pool()
+
+    def enable_debug_mode(self):
+        r"""Enable debugging mode for CUDAGraph.debug_dump."""
+        return super().enable_debug_mode()
+
+    def debug_dump(self, debug_path):
+        r"""
+        Arguments:
+            debug_path (required): Path to dump the graph to.
+
+        Calls a debugging function to dump the graph if the debugging is
+        enabled via CUDAGraph.enable_debug_mode()
+        """
+        return super().debug_dump(debug_path)
+
+
+class graph:
+    r"""Context-manager that captures CUDA work into a :class:`torch.cuda.CUDAGraph` object for later replay.
+
+    See :ref:`CUDA Graphs <cuda-graph-semantics>` for a general introduction,
+    detailed use, and constraints.
+
+    Arguments:
+        cuda_graph (torch.cuda.CUDAGraph): Graph object used for capture.
+        pool (optional): Opaque token (returned by a call to :func:`~torch.cuda.graph_pool_handle()` or
+            :meth:`other_Graph_instance.pool()`) hinting this graph's capture
+            may share memory from the specified pool. See :ref:`Graph memory management`.
+        stream (torch.cuda.Stream, optional): If supplied, will be set as the current stream in the context.
+            If not supplied, ``graph`` sets its own internal side stream as the current stream in the context.
+        capture_error_mode (str, optional): specifies the cudaStreamCaptureMode for the graph capture stream.
+            Can be "global", "thread_local" or "relaxed". During cuda graph capture, some actions, such as cudaMalloc,
+            may be unsafe. "global" will error on actions in other threads, "thread_local" will only error for
+            actions in the current thread, and "relaxed" will not error on these actions. Do NOT change this setting
+            unless you're familiar with `cudaStreamCaptureMode`_
+
+    .. note::
+        For effective memory sharing, if you pass a ``pool`` used by a previous capture and the previous capture
+        used an explicit ``stream`` argument, you should pass the same ``stream`` argument to this capture.
+
+    .. warning::
+        This API is in beta and may change in future releases.
+
+    .. _cudaStreamCaptureMode:
+        https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85
+    """  # noqa: B950
+
+    default_capture_stream: Optional["torch.cuda.Stream"] = None
+
+    def __init__(
+        self,
+        cuda_graph,
+        pool=None,
+        stream=None,
+        capture_error_mode: str = "global",
+    ):
+        # Lazy-init of default_capture_stream helps avoid circular-import errors.
+        # Not thread safe, but graphs already have the general (explicitly documented)
+        # restriction that only one capture may be underway at a time in the process.
+        if self.__class__.default_capture_stream is None:
+            self.__class__.default_capture_stream = torch.cuda.Stream()
+
+        self.pool = () if pool is None else (pool,)
+        self.capture_stream = (
+            stream if stream is not None else self.__class__.default_capture_stream
+        )
+        assert self.capture_stream is not None
+        self.stream_ctx = torch.cuda.stream(self.capture_stream)
+        self.cuda_graph = cuda_graph
+        self.capture_error_mode = capture_error_mode
+
+    def __enter__(self):
+        # Free as much memory as we can for the graph
+        torch.cuda.synchronize()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+        # Stackoverflow seems comfortable with this pattern
+        # https://stackoverflow.com/questions/26635684/calling-enter-and-exit-manually#39172487
+        self.stream_ctx.__enter__()
+
+        self.cuda_graph.capture_begin(
+            *self.pool, capture_error_mode=self.capture_error_mode
+        )
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.cuda_graph.capture_end()
+        self.stream_ctx.__exit__(exc_type, exc_value, traceback)
+        # returning None should propagate exceptions from either capture_end or stream_ctx.__exit__()
+
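+
+# Illustrative sketch (not part of the original module): a minimal capture/replay
+# round trip using the ``graph`` context manager above. It assumes a CUDA-capable
+# device; the shapes and values are arbitrary.
+def _example_graph_capture():  # pragma: no cover - documentation sketch
+    static_input = torch.zeros(8, device="cuda")
+    g = CUDAGraph()
+    with graph(g):
+        # Work launched here is captured, not executed eagerly.
+        static_output = static_input * 2
+    # Copy real data into the placeholder input, then replay the captured work.
+    static_input.copy_(torch.ones(8, device="cuda"))
+    g.replay()
+    return static_output  # now holds the doubled values
+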
+
+def make_graphed_callables(
+    callables, sample_args, num_warmup_iters=3, allow_unused_input=False, pool=None
+):
+    r"""Accept callables (functions or :class:`nn.Module`\ s) and returns graphed versions.
+
+    Each graphed callable's forward pass runs its source callable's
+    forward CUDA work as a CUDA graph inside a single autograd node.
+
+    The graphed callable's forward pass also appends
+    a backward node to the autograd graph. During backward, this node runs the
+    callable's backward work as a CUDA graph.
+
+    Therefore, each graphed callable should be a drop-in replacement for its source callable
+    in an autograd-enabled training loop.
+
+    See :ref:`Partial-network capture` for detailed use and constraints.
+
+    If you pass a tuple of several callables, their captures will use the same memory pool.
+    See :ref:`Graph memory management` for when this is appropriate.
+
+    Arguments:
+        callables (torch.nn.Module or Python function, or tuple of these): Callable or callables to graph.
+            See :ref:`Graph memory management` for when passing a tuple of callables
+            is appropriate.  If you pass a tuple of callables, their order in the tuple must be the same order
+            they'll run in the live workload.
+        sample_args (tuple of Tensors, or tuple of tuples of Tensors): Samples args for each callable.
+            If a single callable was passed, ``sample_args`` must be a single tuple of argument Tensors.
+            If a tuple of callables was passed, ``sample_args`` must be a tuple of tuples of argument Tensors.
+        num_warmup_iters (int): The number of warmup iterations. Currently, ``DistributedDataParallel`` needs
+            11 iterations for warm-up. Default: ``3``.
+        allow_unused_input (bool): If False, specifying inputs that were not used when computing outputs
+            (and therefore their grad is always zero) is an error. Defaults to False.
+        pool (optional): Token (returned by :func:`~torch.cuda.graph_pool_handle` or
+            :meth:`other_Graph_instance.pool()`) that hints this graph may share memory
+            with the indicated pool.  See :ref:`Graph memory management`.
+    .. note::
+        The ``requires_grad`` state of each Tensor in ``sample_args`` must match the state
+        that's expected for the corresponding real input in the training loop.
+
+    .. warning::
+        This API is in beta and may change in future releases.
+
+    .. warning::
+        ``sample_args`` for each callable must contain only Tensors. Other types are not allowed.
+
+    .. warning::
+        Returned callables do not support higher order differentiation (e.g., double backward).
+
+    .. warning::
+        In any :class:`~torch.nn.Module` passed to :func:`~make_graphed_callables`, only parameters
+        may be trainable. Buffers must have ``requires_grad=False``.
+
+    .. warning::
+        After you pass a :class:`torch.nn.Module` through :func:`~make_graphed_callables`,
+        you may not add or remove any of that Module's parameters or buffers.
+
+    .. warning::
+        :class:`torch.nn.Module`\s passed to :func:`~torch.cuda.make_graphed_callables` must not have module hooks
+        registered on them at the time they are passed. However, registering hooks on modules *after* passing them
+        through :func:`~torch.cuda.make_graphed_callables` is allowed.
+
+    .. warning::
+        When running a graphed callable, you must pass its arguments in the same order and format
+        they appeared in that callable's ``sample_args``.
+
+    .. warning::
+        Automatic mixed precision is supported in :func:`~torch.cuda.make_graphed_callables` only with caching
+        disabled. The context manager `torch.cuda.amp.autocast()` must have `cache_enabled=False`.
+    """
+    if torch.is_autocast_enabled() and torch.is_autocast_cache_enabled():
+        raise RuntimeError(
+            "make_graphed_callables does not support the autocast caching. Please set `cache_enabled=False`."
+        )
+
+    just_one_callable = False
+
+    if not isinstance(callables, tuple):
+        just_one_callable = True
+        callables = (callables,)
+        sample_args = (sample_args,)
+
+    flatten_sample_args = []
+
+    for c, args in zip(callables, sample_args):
+        if isinstance(c, torch.nn.Module):
+            assert (
+                len(c._backward_hooks) == 0
+                and len(c._forward_hooks) == 0
+                and len(c._forward_pre_hooks) == 0
+            ), (
+                "Modules must not have hooks registered at the time they are passed. However, registering hooks "
+                + "on modules after passing them through make_graphed_callables is allowed."
+            )
+            assert all(b.requires_grad is False for b in c.buffers()), (
+                "In any :class:`~torch.nn.Module` passed to "
+                + ":func:`~make_graphed_callables`, only parameters may be trainable. All buffers must have "
+                + "``requires_grad=False``."
+            )
+        flatten_arg = _pytree.arg_tree_leaves(*args)
+        flatten_sample_args.append(tuple(flatten_arg))
+        assert all(isinstance(arg, torch.Tensor) for arg in flatten_arg), (
+            "In the beta API, sample_args "
+            + "for each callable must contain only Tensors. Other types are not allowed."
+        )
+
+    # If a callable is an nn.Module, its graph's full input surface is the args the user explicitly
+    # passes to forward (ie, its sample_args) AND the module's parameter attributes.
+    per_callable_len_user_args = [len(args) for args in flatten_sample_args]
+    per_callable_module_params = [
+        tuple(c.parameters()) if isinstance(c, torch.nn.Module) else ()
+        for c in callables
+    ]
+    per_callable_static_input_surfaces = [
+        flatten_sample_args[i] + per_callable_module_params[i]
+        for i in range(len(callables))
+    ]
+
+    fwd_graphs = [torch.cuda.CUDAGraph() for _ in range(len(callables))]
+    bwd_graphs = [torch.cuda.CUDAGraph() for _ in range(len(callables))]
+
+    mempool = graph_pool_handle() if pool is None else pool
+
+    # Warmup
+    # Hopefully prevents cudnn benchmarking and other lazy-initialization cuda work
+    # from ending up in any captures.
+    torch.cuda.synchronize()
+    with torch.cuda.stream(torch.cuda.Stream()):
+        for func, args, static_input_surface in zip(
+            callables, sample_args, per_callable_static_input_surfaces
+        ):
+            for _ in range(num_warmup_iters):
+                outputs = _pytree.tree_leaves(func(*args))
+                grad_inputs = torch.autograd.grad(
+                    outputs=tuple(o for o in outputs if o.requires_grad),
+                    inputs=tuple(i for i in static_input_surface if i.requires_grad),
+                    grad_outputs=tuple(
+                        torch.empty_like(o) for o in outputs if o.requires_grad
+                    ),
+                    only_inputs=True,
+                    allow_unused=allow_unused_input,
+                )
+            del outputs, grad_inputs  # type: ignore[possibly-undefined]
+    torch.cuda.synchronize()
+
+    # All captures here share a mempool. To avoid replays corrupting each other's memory,
+    # the safest approach is to capture all passes in the same order they'll run:
+    # fwd 1, fwd 2, ... fwd N, then bwd N, bwd N-1, ... bwd 1.
+
+    # Capture forward graphs
+    per_callable_static_outputs = []
+    per_callable_output_unflatten_spec = []
+    for func, args, fwd_graph in zip(callables, sample_args, fwd_graphs):
+        with torch.cuda.graph(fwd_graph, pool=mempool):
+            outputs = func(*args)
+
+        flatten_outputs, spec = _pytree.tree_flatten(outputs)
+        per_callable_static_outputs.append(tuple(flatten_outputs))
+        per_callable_output_unflatten_spec.append(spec)
+
+    # Capture backward graphs in reverse order
+    per_callable_static_grad_outputs = []
+    per_callable_static_grad_inputs = []
+    for static_input_surface, static_outputs, bwd_graph, module_params in zip(
+        reversed(per_callable_static_input_surfaces),
+        reversed(per_callable_static_outputs),
+        reversed(bwd_graphs),
+        reversed(per_callable_module_params),
+    ):
+        # For now, assumes all static_outputs require grad
+        # assert all(o.requires_grad for o in static_outputs), "Outputs of graphed callables must require grad."
+        static_grad_outputs = tuple(
+            torch.empty_like(o) if o.requires_grad else None for o in static_outputs
+        )
+
+        with torch.cuda.graph(bwd_graph, pool=mempool):
+            grad_inputs = torch.autograd.grad(
+                outputs=tuple(o for o in static_outputs if o.requires_grad),
+                inputs=tuple(i for i in static_input_surface if i.requires_grad),
+                grad_outputs=tuple(o for o in static_grad_outputs if o is not None),
+                only_inputs=True,
+                allow_unused=allow_unused_input,
+            )
+
+        # Constructs a tuple suitable for returning from Graphed.backward:
+        # Pads out the actually-needed grads with Nones in gradient slots for inputs that don't require grad.
+        # I couldn't think of a slick one-liner for this pattern.
+        static_grad_inputs = []
+        grad_idx = 0
+        for arg in static_input_surface:
+            if arg.requires_grad:
+                static_grad_inputs.append(grad_inputs[grad_idx])
+                grad_idx += 1
+            else:
+                static_grad_inputs.append(None)  # type: ignore[arg-type]
+        static_grad_inputs = tuple(static_grad_inputs)  # type: ignore[assignment]
+
+        per_callable_static_grad_outputs.append(static_grad_outputs)
+        per_callable_static_grad_inputs.append(static_grad_inputs)
+
+    # Reverses the most recent two lists
+    per_callable_static_grad_outputs.reverse()
+    per_callable_static_grad_inputs.reverse()
+    # Now for every per_callable list, per_callable_*[i] holds the stuff for the ith callable.
+
+    def make_graphed_autograd_function(
+        fwd_graph,
+        bwd_graph,
+        module_params,
+        len_user_args,
+        output_unflatten_spec,
+        static_input_surface,
+        static_outputs,
+        static_grad_outputs,
+        static_grad_inputs,
+    ):
+        class Graphed(torch.autograd.Function):
+            @staticmethod
+            def forward(ctx, *inputs):
+                # At this stage, only the user args may (potentially) be new tensors.
+                for i in range(len_user_args):
+                    if static_input_surface[i].data_ptr() != inputs[i].data_ptr():
+                        static_input_surface[i].copy_(inputs[i])
+                fwd_graph.replay()
+                assert isinstance(static_outputs, tuple)
+                return tuple(o.detach() for o in static_outputs)
+
+            @staticmethod
+            @torch.autograd.function.once_differentiable
+            def backward(ctx, *grads):
+                assert len(grads) == len(static_grad_outputs)
+                for g, grad in zip(static_grad_outputs, grads):
+                    if g is not None:
+                        # don't copy if autograd gods have been kind and the
+                        # incoming grad is already in the right place
+                        if g.data_ptr() != grad.data_ptr():
+                            g.copy_(grad)
+                bwd_graph.replay()
+
+                # Input args that didn't require grad expect a None gradient.
+                assert isinstance(static_grad_inputs, tuple)
+                return tuple(
+                    b.detach() if b is not None else b for b in static_grad_inputs
+                )
+
+        def functionalized(*user_args):
+            # Runs the autograd function with inputs == all inputs to the graph that might require grad
+            # (explicit user args + module parameters)
+            # Assumes module params didn't change since capture.
+            flatten_user_args = _pytree.arg_tree_leaves(*user_args)
+            out = Graphed.apply(*(tuple(flatten_user_args) + module_params))
+            return _pytree.tree_unflatten(out, output_unflatten_spec)
+
+        return functionalized
+
+    # Put together the final graphed callables
+    ret = []
+    for i, func in enumerate(callables):
+        graphed = make_graphed_autograd_function(
+            fwd_graphs[i],
+            bwd_graphs[i],
+            per_callable_module_params[i],
+            per_callable_len_user_args[i],
+            per_callable_output_unflatten_spec[i],
+            per_callable_static_input_surfaces[i],
+            per_callable_static_outputs[i],
+            per_callable_static_grad_outputs[i],
+            per_callable_static_grad_inputs[i],
+        )
+
+        if isinstance(func, torch.nn.Module):
+
+            def make_graphed_forward(func, graph_training_state, graphed, orig_fwd):
+                def new_fwd(*user_args):
+                    # If the module's training-or-eval state matches what we graphed,
+                    # run the graph, otherwise run the original forward method
+                    if func.training == graph_training_state:
+                        return graphed(*user_args)
+                    else:
+                        return orig_fwd(*user_args)
+
+                return new_fwd
+
+            func.forward = make_graphed_forward(func, func.training, graphed, func.forward)  # type: ignore[assignment]
+            ret.append(func)
+        else:
+            ret.append(graphed)
+
+    if just_one_callable:
+        return ret[0]
+
+    return tuple(ret)
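+
+
+# Illustrative sketch (not part of the original module): graphing a single small
+# module with make_graphed_callables. Assumes a CUDA device; the layer sizes and
+# batch size are arbitrary.
+def _example_make_graphed_callables():  # pragma: no cover - documentation sketch
+    model = torch.nn.Linear(16, 16).cuda()
+    sample_args = (torch.randn(4, 16, device="cuda", requires_grad=True),)
+    graphed_model = make_graphed_callables(model, sample_args)
+    # The graphed module is a drop-in replacement inside a training loop.
+    out = graphed_model(torch.randn(4, 16, device="cuda", requires_grad=True))
+    out.sum().backward()
+    return out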
diff --git a/MLPY/Lib/site-packages/torch/cuda/jiterator.py b/MLPY/Lib/site-packages/torch/cuda/jiterator.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee6ddab5b6cc3f5b13456daae3972a318699aad3
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/cuda/jiterator.py
@@ -0,0 +1,185 @@
+import re
+from typing import Callable, List
+
+import torch
+from torch import Tensor
+
+__all__: List[str] = []
+
+
+class _CodeParser:
+    def __init__(self, code_string: str):
+        optional_ws = r"\s*"
+        required_ws = r"\s+"
+        template_params = r"(?P<template_params>\<.+\>)"
+        return_type = r"(?P<return_type>\w+)"
+        function_name = r"(?P<function_name>\w+)"
+        function_params = r"(?P<function_params>\(.+\))"
+        function_body = r"(?P<function_body>\{.+\})"
+
+        pattern = (
+            optional_ws
+            + "template"
+            + optional_ws
+            + template_params
+            + optional_ws
+            + return_type
+            + required_ws
+            + function_name
+            + optional_ws
+            + function_params
+            + optional_ws
+            + function_body
+            + optional_ws
+        )
+
+        result = re.match(
+            pattern, code_string, re.DOTALL
+        )  # DOTALL for matching multiline
+
+        if result is None:
+            raise Exception(
+                f"Couldn't parse code, please check correctness:\n {code_string}"
+            )
+
+        self.template_params = result["template_params"]
+        self.return_type = result["return_type"]
+        self.function_name = result["function_name"]
+        self.function_params = result["function_params"]
+        self.function_body = result["function_body"]
+
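+# Illustrative sketch (not part of the original module): what _CodeParser
+# extracts from a well-formed kernel string. The kernel name is hypothetical.
+def _example_code_parser():  # pragma: no cover - documentation sketch
+    parsed = _CodeParser(
+        "template <typename T> T my_op(T x, T y) { return x + y; }"
+    )
+    assert parsed.function_name == "my_op"
+    assert parsed.return_type == "T"
+    return parsed
+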
+
+class _JittedFunction:
+    def __init__(
+        self, code_string: str, return_by_ref: bool, num_outputs: int, **kwargs
+    ):
+        self.code_string = code_string
+
+        assert (
+            return_by_ref or num_outputs == 1
+        ), "Return by value only works for single output. "
+        self.return_by_ref = return_by_ref
+        self.num_outputs = num_outputs
+
+        parsed_code = _CodeParser(code_string)
+        self.kernel_name = parsed_code.function_name
+
+        self.kwargs_dict = kwargs
+        self.is_cuda_available = torch.cuda.is_available()
+
+    def __call__(self, *tensors: Tensor, **kwargs):
+        # Jiterator follows torch.cuda's lazy initialization behavior;
+        # defer checking CUDA availability until function invocation time.
+        assert (
+            self.is_cuda_available
+        ), "Jiterator is only supported on CUDA and ROCm GPUs, none are available."
+
+        assert len(tensors) <= 8, "jiterator only supports up to 8 tensor inputs."
+
+        expanded_kwargs = self.kwargs_dict.copy()
+        for key, value in kwargs.items():
+            if key in self.kwargs_dict:
+                expanded_kwargs[key] = value
+            else:
+                raise KeyError(f"{key} is not declared in function definition")
+
+        return torch._C._cuda_jiterator_compile_and_launch_kernel(
+            self.code_string,
+            self.kernel_name,
+            self.return_by_ref,
+            self.num_outputs,
+            tensors,
+            expanded_kwargs,
+        )
+
+
+def _create_jit_fn(code_string: str, **kwargs) -> Callable:
+    """
+    Create a jiterator-generated cuda kernel for an elementwise op.
+
+    The code string has to be a valid CUDA function that describes the computation for a single element. The code
+    string has to follow the C++ template pattern, as shown in the example below. This function will be inlined
+    into an elementwise kernel template and compiled on the fly. The compiled kernel will be cached in memory, as
+    well as in a local temp dir.
+
+    Jiterator-generated kernels accept noncontiguous tensors, and support broadcasting and type promotion.
+
+    Args:
+        code_string (str): CUDA code string to be compiled by jiterator. The entry functor must return by value.
+        kwargs (Dict, optional): Keyword arguments for generated function
+
+    Example::
+
+        code_string = "template <typename T> T my_kernel(T x, T y, T alpha) { return -x + alpha * y; }"
+        jitted_fn = create_jit_fn(code_string, alpha=1.0)
+        a = torch.rand(3, device='cuda')
+        b = torch.rand(3, device='cuda')
+        # invoke jitted function like a regular python function
+        result = jitted_fn(a, b, alpha=3.14)
+
+    code_string also allows multiple function definitions, and the last function will be treated as the entry function.
+
+    Example::
+
+        code_string = "template <typename T> T util_fn(T x, T y) { return ::sin(x) + ::cos(y); }"
+        code_string += "template <typename T> T my_kernel(T x, T y, T val) { return ::min(val, util_fn(x, y)); }"
+        jitted_fn = create_jit_fn(code_string, val=0.0)
+        a = torch.rand(3, device='cuda')
+        b = torch.rand(3, device='cuda')
+        # invoke jitted function like a regular python function
+        result = jitted_fn(a, b)  # using default val=0.0
+
+    Jiterator can be used together with Python registration to override an operator's CUDA kernel.
+    The following example overrides gelu's CUDA kernel with relu.
+
+    Example::
+
+        code_string = "template <typename T> T my_gelu(T a) { return a > 0 ? a : 0; }"
+        my_gelu = create_jit_fn(code_string)
+        my_lib = torch.library.Library("aten", "IMPL")
+        my_lib.impl('aten::gelu', my_gelu, "CUDA")
+        # torch.nn.GELU and torch.nn.functional.gelu are now overridden
+        a = torch.rand(3, device='cuda')
+        torch.allclose(torch.nn.functional.gelu(a), torch.nn.functional.relu(a))
+
+    .. warning::
+        This API is in beta and may change in future releases.
+
+    .. warning::
+        This API only supports up to 8 inputs and 1 output
+
+    .. warning::
+        All input tensors must be on a CUDA device
+    """
+    return _JittedFunction(code_string, return_by_ref=False, num_outputs=1, **kwargs)
+
+
+def _create_multi_output_jit_fn(
+    code_string: str, num_outputs: int, **kwargs
+) -> Callable:
+    """
+    Create a jiterator-generated cuda kernel for an elementwise op that supports returning one or more outputs.
+
+    Args:
+        code_string (str): CUDA code string to be compiled by jiterator. The entry functor must return values by reference.
+        num_outputs (int): number of outputs returned by the kernel
+        kwargs (Dict, optional): Keyword arguments for generated function
+
+    Example::
+
+        code_string = "template <typename T> void my_kernel(T x, T y, T alpha, T& out) { out = -x + alpha * y; }"
+        jitted_fn = _create_multi_output_jit_fn(code_string, num_outputs=1, alpha=1.0)
+        a = torch.rand(3, device='cuda')
+        b = torch.rand(3, device='cuda')
+        # invoke jitted function like a regular python function
+        result = jitted_fn(a, b, alpha=3.14)
+
+    .. warning::
+        This API is in beta and may change in future releases.
+
+    .. warning::
+        This API only supports up to 8 inputs and 8 outputs
+    """
+    return _JittedFunction(
+        code_string, return_by_ref=True, num_outputs=num_outputs, **kwargs
+    )
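+
+
+# Illustrative sketch (not part of the original module): a kernel that returns
+# two outputs by reference via _create_multi_output_jit_fn. Assumes a CUDA
+# device; the kernel name and shapes are hypothetical.
+def _example_multi_output_jit_fn():  # pragma: no cover - documentation sketch
+    code_string = (
+        "template <typename T> void sum_and_diff(T x, T y, T& s, T& d) "
+        "{ s = x + y; d = x - y; }"
+    )
+    fn = _create_multi_output_jit_fn(code_string, num_outputs=2)
+    a = torch.rand(8, device="cuda")
+    b = torch.rand(8, device="cuda")
+    s, d = fn(a, b)
+    return s, d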
diff --git a/MLPY/Lib/site-packages/torch/cuda/memory.py b/MLPY/Lib/site-packages/torch/cuda/memory.py
new file mode 100644
index 0000000000000000000000000000000000000000..34fcedc5994cb2cd1b0bb05d7edbe1a58ec8b514
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/cuda/memory.py
@@ -0,0 +1,914 @@
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+
+from typing import Any, Dict, Optional, Tuple, Union
+
+import torch
+from torch import _C
+
+from torch.types import Device
+from .._utils import _dummy_type
+from . import _get_device_index, _get_nvml_device_index, _lazy_init, is_initialized
+
+from ._memory_viz import memory as _memory, segments as _segments
+
+__all__ = [
+    "caching_allocator_alloc",
+    "caching_allocator_delete",
+    "set_per_process_memory_fraction",
+    "empty_cache",
+    "memory_stats",
+    "memory_stats_as_nested_dict",
+    "reset_accumulated_memory_stats",
+    "reset_peak_memory_stats",
+    "reset_max_memory_allocated",
+    "reset_max_memory_cached",
+    "memory_allocated",
+    "max_memory_allocated",
+    "memory_reserved",
+    "max_memory_reserved",
+    "memory_cached",
+    "max_memory_cached",
+    "memory_snapshot",
+    "memory_summary",
+    "list_gpu_processes",
+    "mem_get_info",
+    "get_allocator_backend",
+    "CUDAPluggableAllocator",
+    "change_current_allocator",
+]
+
+
+if not hasattr(torch._C, "_cuda_CUDAAllocator"):
+    # Define dummy base classes
+    torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator")
+
+
+def _host_allocator():
+    _lazy_init()
+    return torch._C._cuda_cudaHostAllocator()
+
+
+@contextlib.contextmanager
+def _free_mutex():
+    torch._C._cuda_lock_mutex()
+    try:
+        yield
+    finally:
+        torch._C._cuda_unlock_mutex()
+
+
+def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None):
+    r"""Perform a memory allocation using the CUDA memory allocator.
+
+    Memory is allocated for a given device and a stream, this
+    function is intended to be used for interoperability with other
+    frameworks. Allocated memory is released through
+    :func:`~torch.cuda.caching_allocator_delete`.
+
+    Args:
+        size (int): number of bytes to be allocated.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+        stream (torch.cuda.Stream or int, optional): selected stream. If it is ``None`` then
+            the default stream for the selected device is used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if stream is None:
+        stream = torch.cuda.current_stream(device)
+    if isinstance(stream, torch.cuda.streams.Stream):
+        stream = stream.cuda_stream
+    if not isinstance(stream, int):
+        raise TypeError(
+            "Invalid type for stream argument, must be "
+            "`torch.cuda.Stream` or `int` representing a pointer "
+            "to a existing stream"
+        )
+    with torch.cuda.device(device):
+        return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream)
+
+
+def caching_allocator_delete(mem_ptr):
+    r"""Delete memory allocated using the CUDA memory allocator.
+
+    Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`
+    is freed here. The associated device and stream are tracked inside
+    the allocator.
+
+    Args:
+        mem_ptr (int): memory address to be freed by the allocator.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr)
+
+
+def set_per_process_memory_fraction(
+    fraction, device: Union[Device, int] = None
+) -> None:
+    r"""Set memory fraction for a process.
+
+    The fraction is used to limit the caching allocator's memory usage on a CUDA device.
+    The allowed value equals the total visible memory multiplied by the fraction.
+    If a process tries to allocate more than the allowed value, the allocator raises an
+    out-of-memory error.
+
+    Args:
+        fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    .. note::
+        In general, the total available free memory is less than the total capacity.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if not isinstance(fraction, float):
+        raise TypeError("Invalid type for fraction argument, must be `float`")
+    if fraction < 0 or fraction > 1:
+        raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1")
+
+    torch._C._cuda_setMemoryFraction(fraction, device)
+
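+# Illustrative sketch (not part of the original module): cap this process at
+# roughly half of the device's visible memory. Assumes CUDA is available; the
+# fraction is arbitrary.
+def _example_memory_fraction():  # pragma: no cover - documentation sketch
+    set_per_process_memory_fraction(0.5)
+    # Allocations that would push usage past total_memory * 0.5 now raise an
+    # out-of-memory error from the caching allocator.
+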
+
+def empty_cache() -> None:
+    r"""Release all unoccupied cached memory currently held by the caching
+    allocator so that it can be used by other GPU applications and is visible in
+    `nvidia-smi`.
+
+    .. note::
+        :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU
+        memory available for PyTorch. However, it may help reduce fragmentation
+        of GPU memory in certain cases. See :ref:`cuda-memory-management` for
+        more details about GPU memory management.
+    """
+    if is_initialized():
+        torch._C._cuda_emptyCache()
+
+
+def memory_stats(device: Union[Device, int] = None) -> Dict[str, Any]:
+    r"""Return a dictionary of CUDA memory allocator statistics for a given device.
+
+    The return value of this function is a dictionary of statistics, each of
+    which is a non-negative integer.
+
+    Core statistics:
+
+    - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
+      number of allocation requests received by the memory allocator.
+    - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
+      amount of allocated memory.
+    - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
+      number of reserved segments from ``cudaMalloc()``.
+    - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
+      amount of reserved memory.
+    - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
+      number of active memory blocks.
+    - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
+      amount of active memory.
+    - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
+      number of inactive, non-releasable memory blocks.
+    - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
+      amount of inactive, non-releasable memory.
+
+    For these core statistics, values are broken down as follows.
+
+    Pool type:
+
+    - ``all``: combined statistics across all memory pools.
+    - ``large_pool``: statistics for the large allocation pool
+      (as of October 2019, for size >= 1MB allocations).
+    - ``small_pool``: statistics for the small allocation pool
+      (as of October 2019, for size < 1MB allocations).
+
+    Metric type:
+
+    - ``current``: current value of this metric.
+    - ``peak``: maximum value of this metric.
+    - ``allocated``: historical total increase in this metric.
+    - ``freed``: historical total decrease in this metric.
+
+    In addition to the core statistics, we also provide some simple event
+    counters:
+
+    - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that
+      result in a cache flush and retry.
+    - ``"num_ooms"``: number of out-of-memory errors thrown.
+
+    The caching allocator can be configured via ENV to not split blocks larger than a
+    defined size (see the Memory Management section of the CUDA Semantics documentation).
+    This helps avoid memory fragmentation but may have a performance
+    penalty. Additional outputs to assist with tuning and evaluating impact:
+
+    - ``"max_split_size"``: blocks above this size will not be split.
+    - ``"oversize_allocations.{current,peak,allocated,freed}"``:
+      number of over-size allocation requests received by the memory allocator.
+    - ``"oversize_segments.{current,peak,allocated,freed}"``:
+      number of over-size reserved segments from ``cudaMalloc()``.
+
+    The caching allocator can be configured via ENV to round memory allocations in order
+    to reduce fragmentation. Sometimes the overhead from rounding can be higher than
+    the fragmentation it helps reduce. The following stat can be used to check if
+    rounding adds too much overhead:
+
+    - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
+      memory requested by client code, compare this with allocated_bytes to check if
+      allocation rounding adds too much overhead.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistics for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+
+    .. note::
+        With :ref:`backend:cudaMallocAsync`, some stats are not
+        meaningful, and are always reported as zero.
+    """
+    result = []
+
+    def _recurse_add_to_result(prefix, obj):
+        if isinstance(obj, dict):
+            if len(prefix) > 0:
+                prefix += "."
+            for k, v in obj.items():
+                _recurse_add_to_result(prefix + k, v)
+        else:
+            result.append((prefix, obj))
+
+    stats = memory_stats_as_nested_dict(device=device)
+    _recurse_add_to_result("", stats)
+    result.sort()
+
+    return collections.OrderedDict(result)
+
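+# Illustrative sketch (not part of the original module): reading a few of the
+# flattened allocator statistics for the current device.
+def _example_memory_stats():  # pragma: no cover - documentation sketch
+    stats = memory_stats()
+    current = stats.get("allocated_bytes.all.current", 0)
+    peak = stats.get("allocated_bytes.all.peak", 0)
+    retries = stats.get("num_alloc_retries", 0)
+    return current, peak, retries
+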
+
+def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> Dict[str, Any]:
+    r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary."""
+    if not is_initialized():
+        return {}
+    device = _get_device_index(device, optional=True)
+    return torch._C._cuda_memoryStats(device)
+
+
+def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None:
+    r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator.
+
+    See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to
+    the `"allocated"` and `"freed"` keys in each individual stat dict, as well as
+    `"num_alloc_retries"` and `"num_ooms"`.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    device = _get_device_index(device, optional=True)
+    return torch._C._cuda_resetAccumulatedMemoryStats(device)
+
+
+def reset_peak_memory_stats(device: Union[Device, int] = None) -> None:
+    r"""Reset the "peak" stats tracked by the CUDA memory allocator.
+
+    See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the
+    `"peak"` key in each individual stat dict.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    device = _get_device_index(device, optional=True)
+    return torch._C._cuda_resetPeakMemoryStats(device)
+
+
+def reset_max_memory_allocated(device: Union[Device, int] = None) -> None:
+    r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device.
+
+    See :func:`~torch.cuda.max_memory_allocated` for details.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. warning::
+        This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets
+        /all/ peak memory stats.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    warnings.warn(
+        "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, "
+        "which resets /all/ peak memory stats.",
+        FutureWarning,
+    )
+    return reset_peak_memory_stats(device=device)
+
+
+def reset_max_memory_cached(device: Union[Device, int] = None) -> None:
+    r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device.
+
+    See :func:`~torch.cuda.max_memory_cached` for details.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. warning::
+        This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets
+        /all/ peak memory stats.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    warnings.warn(
+        "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, "
+        "which resets /all/ peak memory stats.",
+        FutureWarning,
+    )
+    return reset_peak_memory_stats(device=device)
+
+
+def memory_allocated(device: Union[Device, int] = None) -> int:
+    r"""Return the current GPU memory occupied by tensors in bytes for a given device.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        This is likely less than the amount shown in `nvidia-smi` since some
+        unused memory can be held by the caching allocator and some context
+        needs to be created on GPU. See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    return memory_stats(device=device).get("allocated_bytes.all.current", 0)
+
+
+def max_memory_allocated(device: Union[Device, int] = None) -> int:
+    r"""Return the maximum GPU memory occupied by tensors in bytes for a given device.
+
+    By default, this returns the peak allocated memory since the beginning of
+    this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to
+    reset the starting point in tracking this metric. For example, these two
+    functions can measure the peak allocated memory usage of each iteration in a
+    training loop.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    return memory_stats(device=device).get("allocated_bytes.all.peak", 0)
+
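+# Illustrative sketch (not part of the original module): measuring the peak
+# allocation of each iteration, as described in the docstring above.
+# ``train_step`` is a hypothetical zero-argument callable.
+def _example_peak_per_iteration(train_step, num_iters=3):  # pragma: no cover - documentation sketch
+    peaks = []
+    for _ in range(num_iters):
+        reset_peak_memory_stats()
+        train_step()
+        peaks.append(max_memory_allocated())
+    return peaks
+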
+
+def memory_reserved(device: Union[Device, int] = None) -> int:
+    r"""Return the current GPU memory managed by the caching allocator in bytes for a given device.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    return memory_stats(device=device).get("reserved_bytes.all.current", 0)
+
+
+def max_memory_reserved(device: Union[Device, int] = None) -> int:
+    r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device.
+
+    By default, this returns the peak cached memory since the beginning of this
+    program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset
+    the starting point in tracking this metric. For example, these two functions
+    can measure the peak cached memory amount of each iteration in a training
+    loop.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    return memory_stats(device=device).get("reserved_bytes.all.peak", 0)
+
+
+def memory_cached(device: Union[Device, int] = None) -> int:
+    r"""Deprecated; see :func:`~torch.cuda.memory_reserved`."""
+    warnings.warn(
+        "torch.cuda.memory_cached has been renamed to torch.cuda.memory_reserved",
+        FutureWarning,
+    )
+    return memory_reserved(device=device)
+
+
+def max_memory_cached(device: Union[Device, int] = None) -> int:
+    r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`."""
+    warnings.warn(
+        "torch.cuda.max_memory_cached has been renamed to torch.cuda.max_memory_reserved",
+        FutureWarning,
+    )
+    return max_memory_reserved(device=device)
+
+
+def memory_snapshot():
+    r"""Return a snapshot of the CUDA memory allocator state across all devices.
+
+    Interpreting the output of this function requires familiarity with the
+    memory allocator internals.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    return torch._C._cuda_memorySnapshot()["segments"]
+
+
+def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str:
+    r"""Return a human-readable printout of the current memory allocator statistics for a given device.
+
+    This can be useful to display periodically during training, or when
+    handling out-of-memory exceptions.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            printout for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+        abbreviated (bool, optional): whether to return an abbreviated summary
+            (default: False).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    device = _get_device_index(device, optional=True)
+    stats = memory_stats(device=device)
+
+    def _format_size(sz, pref_sz):
+        prefixes = ["B  ", "KiB", "MiB", "GiB", "TiB", "PiB"]
+        prefix = prefixes[0]
+        for new_prefix in prefixes[1:]:
+            if pref_sz < 768 * 1024:
+                break
+            prefix = new_prefix
+            sz //= 1024
+            pref_sz /= 1024
+        return f"{sz:6d} {prefix}"
+
+    def _format_count(cnt, pref_cnt):
+        prefixes = [" ", "K", "M"]
+        prefix = prefixes[0]
+        for new_prefix in prefixes[1:]:
+            if pref_cnt < 750 * 1000:
+                break
+            prefix = new_prefix
+            cnt //= 1000
+            pref_cnt /= 1000
+        return f"{cnt:7d} {prefix} "
+
+    metrics_to_display = [
+        ("allocated_bytes", "Allocated memory", _format_size),
+        ("active_bytes", "Active memory", _format_size),
+        ("requested_bytes", "Requested memory", _format_size),
+        ("reserved_bytes", "GPU reserved memory", _format_size),
+        ("inactive_split_bytes", "Non-releasable memory", _format_size),
+        ("allocation", "Allocations", _format_count),
+        ("active", "Active allocs", _format_count),
+        ("segment", "GPU reserved segments", _format_count),
+        ("inactive_split", "Non-releasable allocs", _format_count),
+    ]
+
+    lines = []
+    lines.append("=" * 75)
+    lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ")
+    lines.append("-" * 75)
+    lines.append(
+        "  {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d}  "
+    )
+    lines.append("=" * 75)
+    lines.append(
+        "        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  "
+    )
+
+    for metric_key, metric_name, formatter in metrics_to_display:
+        lines.append("-" * 75)
+        submetrics = [("all", metric_name)]
+        if not abbreviated:
+            submetrics.append(("large_pool", "      from large pool"))
+            submetrics.append(("small_pool", "      from small pool"))
+
+        current_prefval, peak_prefval, allocated_prefval, freed_prefval = (
+            None,
+            None,
+            None,
+            None,
+        )
+
+        for submetric_key, submetric_name in submetrics:
+            prefix = metric_key + "." + submetric_key + "."
+
+            current = stats[prefix + "current"]
+            peak = stats[prefix + "peak"]
+            allocated = stats[prefix + "allocated"]
+            freed = stats[prefix + "freed"]
+
+            if current_prefval is None:
+                current_prefval = current
+                peak_prefval = peak
+                allocated_prefval = allocated
+                freed_prefval = freed
+
+            lines.append(
+                " {:<21} | {} | {} | {} | {} ".format(
+                    submetric_name,
+                    formatter(current, current_prefval),
+                    formatter(peak, peak_prefval),
+                    formatter(allocated, allocated_prefval),
+                    formatter(freed, freed_prefval),
+                ),
+            )
+
+    metrics_to_display = [
+        ("oversize_allocations", "Oversize allocations", _format_count),
+        ("oversize_segments", "Oversize GPU segments", _format_count),
+    ]
+
+    for metric_key, metric_name, formatter in metrics_to_display:
+        lines.append("-" * 75)
+
+        prefix = metric_key + "."
+
+        current = stats[prefix + "current"]
+        peak = stats[prefix + "peak"]
+        allocated = stats[prefix + "allocated"]
+        freed = stats[prefix + "freed"]
+
+        lines.append(
+            " {:<21} | {} | {} | {} | {} ".format(
+                metric_name,
+                formatter(current, current),
+                formatter(peak, peak),
+                formatter(allocated, allocated),
+                formatter(freed, freed),
+            ),
+        )
+
+    lines.append("=" * 75)
+
+    fmt_dict = {"_": "", "device": device}
+    for k, v in stats.items():
+        fmt_dict[k.replace(".", "-")] = v
+    return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n"
+
+
+def list_gpu_processes(device: Union[Device, int] = None) -> str:
+    r"""Return a human-readable printout of the running processes and their GPU memory use for a given device.
+
+    This can be useful to display periodically during training, or when
+    handling out-of-memory exceptions.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            printout for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+    """
+    try:
+        import pynvml  # type: ignore[import]
+    except ModuleNotFoundError:
+        return "pynvml module not found, please install pynvml"
+    from pynvml import NVMLError_DriverNotLoaded
+
+    try:
+        pynvml.nvmlInit()
+    except NVMLError_DriverNotLoaded:
+        return "cuda driver can't be loaded, is cuda enabled?"
+    device = _get_nvml_device_index(device)
+    handle = pynvml.nvmlDeviceGetHandleByIndex(device)
+    procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        mem = p.usedGpuMemory / (1024 * 1024)
+        lines.append(f"process {p.pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> Tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
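+# Illustrative sketch (not part of the original module): report free and total
+# device memory in GiB for the current device.
+def _example_mem_get_info():  # pragma: no cover - documentation sketch
+    free_bytes, total_bytes = mem_get_info()
+    gib = 1024**3
+    return free_bytes / gib, total_bytes / gib
+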
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+    )
+
+
+def _record_memory_history(enabled="all", *args, **kwargs):
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currenly allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int #  cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large'] # 'large' (>1MB)
+            allocated_size: int # size of memory in use
+            active_size: int # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int # size requested during malloc, may be smaller than
+                                # size due to rounding
+            address: int
+            state: Literal['active_allocated', # used by a tensor
+                        'active_awaiting_free', # waiting for another stream to finish using
+                                                # this, then it will become free
+                        'inactive',] # free for reuse
+            frames: List[Frame] # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+                filename: str
+                line: int
+                name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc'  # memory allocated
+            'free_requested', # a free was requested for the allocated memory
+            'free_completed', # the memory that was requested to be freed is now
+                            # able to be used in future allocation calls
+            'segment_alloc', # the caching allocator asked cudaMalloc for more memory
+                            # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                            # to cuda, possibly trying to free up memory to
+                            # allocate more segments or because empty_cache was called
+            'oom',          # the allocator threw an OOM exception. 'size' is
+                            # the requested number of bytes that did not succeed
+            'snapshot'      # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int # only present for OOM, the amount of
+                            # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
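+
+    Example (editor's sketch; the returned value depends on how
+    ``PYTORCH_CUDA_ALLOC_CONF`` is set in the environment)::
+
+        >>> # xdoctest: +SKIP
+        >>> import torch
+        >>> torch.cuda.memory.get_allocator_backend()
+        'native'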
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator, use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
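+
+        Example (editor's sketch; ``alloc.so`` and the exported symbols
+        ``my_malloc``/``my_free`` are hypothetical names)::
+
+            >>> # xdoctest: +SKIP
+            >>> import torch
+            >>> new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
+            ...     "alloc.so", "my_malloc", "my_free")
+            >>> torch.cuda.memory.change_current_allocator(new_alloc)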
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
diff --git a/MLPY/Lib/site-packages/torch/cuda/nccl.py b/MLPY/Lib/site-packages/torch/cuda/nccl.py
new file mode 100644
index 0000000000000000000000000000000000000000..439651a0492431383f358641119b26651d532dda
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/cuda/nccl.py
@@ -0,0 +1,137 @@
+import collections
+import warnings
+from typing import Optional, Sequence, Union
+
+import torch.cuda
+
+
+__all__ = ["all_reduce", "reduce", "broadcast", "all_gather", "reduce_scatter"]
+
+SUM = 0  # ncclRedOp_t
+
+
+def is_available(tensors):
+    if not hasattr(torch._C, "_nccl_all_reduce"):
+        warnings.warn("PyTorch is not compiled with NCCL support")
+        return False
+
+    devices = set()
+    for tensor in tensors:
+        if tensor.is_sparse:
+            return False
+        if not tensor.is_contiguous():
+            return False
+        if not tensor.is_cuda:
+            return False
+        device = tensor.get_device()
+        if device in devices:
+            return False
+        devices.add(device)
+
+    return True
+
+
+def version():
+    ver = torch._C._nccl_version()
+    major = ver >> 32
+    minor = (ver >> 16) & 65535
+    patch = ver & 65535
+    suffix = torch._C._nccl_version_suffix().decode("utf-8")
+    if suffix == "":
+        return (major, minor, patch)
+    else:
+        return (major, minor, patch, suffix)
+
+
+def unique_id():
+    return torch._C._nccl_unique_id()
+
+
+def init_rank(num_ranks, uid, rank):
+    return torch._C._nccl_init_rank(num_ranks, uid, rank)
+
+
+def _check_sequence_type(inputs: Union[torch.Tensor, Sequence[torch.Tensor]]) -> None:
+    if not isinstance(inputs, collections.abc.Container) or isinstance(
+        inputs, torch.Tensor
+    ):
+        raise TypeError("Inputs should be a collection of tensors")
+
+
+def all_reduce(inputs, outputs=None, op=SUM, streams=None, comms=None):
+    _check_sequence_type(inputs)
+    if outputs is None:
+        outputs = inputs
+    _check_sequence_type(outputs)
+    torch._C._nccl_all_reduce(inputs, outputs, op, streams, comms)
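+
+
+# Editor's example (sketch, not part of the original module): a minimal
+# all_reduce across two visible GPUs, assuming PyTorch was built with NCCL
+# support. Kept as comments so nothing runs at import time.
+#
+#   import torch
+#   from torch.cuda import nccl
+#   tensors = [torch.ones(4, device=f"cuda:{i}") for i in range(2)]
+#   if nccl.is_available(tensors):
+#       nccl.all_reduce(tensors)  # each tensor now holds the elementwise sum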
+
+
+# `output` used to be `outputs`, taking in a list of tensors. So we have two
+# arguments for BC reasons.
+def reduce(
+    inputs: Sequence[torch.Tensor],
+    output: Optional[Union[torch.Tensor, Sequence[torch.Tensor]]] = None,
+    root: int = 0,
+    op: int = SUM,
+    streams: Optional[Sequence[torch.cuda.Stream]] = None,
+    comms=None,
+    *,
+    outputs: Optional[Sequence[torch.Tensor]] = None,
+) -> None:
+    _check_sequence_type(inputs)
+    _output: torch.Tensor
+    if outputs is not None:
+        if output is not None:
+            raise ValueError(
+                "'output' and 'outputs' can not be both specified. 'outputs' is deprecated in "
+                "favor of 'output', taking in a single output tensor. The signature of reduce is: "
+                "reduce(inputs, output=None, root=0, op=SUM, streams=None, comms=None)."
+            )
+        else:
+            warnings.warn(
+                "nccl.reduce with an output tensor list is deprecated. "
+                "Please specify a single output tensor with argument 'output' instead."
+            )
+            _output = outputs[root]
+    elif not isinstance(output, torch.Tensor) and isinstance(
+        output, collections.abc.Sequence
+    ):
+        # User called old API with positional arguments of list of output tensors.
+        warnings.warn(
+            "nccl.reduce with an output tensor list is deprecated. "
+            "Please specify a single output tensor."
+        )
+        _output = output[root]
+    else:
+        _output = inputs[root] if output is None else output
+    torch._C._nccl_reduce(inputs, _output, root, op, streams, comms)
+
+
+def broadcast(
+    inputs: Sequence[torch.Tensor], root: int = 0, streams=None, comms=None
+) -> None:
+    _check_sequence_type(inputs)
+    torch._C._nccl_broadcast(inputs, root, streams, comms)
+
+
+def all_gather(
+    inputs: Sequence[torch.Tensor],
+    outputs: Sequence[torch.Tensor],
+    streams=None,
+    comms=None,
+) -> None:
+    _check_sequence_type(inputs)
+    _check_sequence_type(outputs)
+    torch._C._nccl_all_gather(inputs, outputs, streams, comms)
+
+
+def reduce_scatter(
+    inputs: Sequence[torch.Tensor],
+    outputs: Sequence[torch.Tensor],
+    op: int = SUM,
+    streams=None,
+    comms=None,
+) -> None:
+    _check_sequence_type(inputs)
+    _check_sequence_type(outputs)
+    torch._C._nccl_reduce_scatter(inputs, outputs, op, streams, comms)
diff --git a/MLPY/Lib/site-packages/torch/cuda/nvtx.py b/MLPY/Lib/site-packages/torch/cuda/nvtx.py
new file mode 100644
index 0000000000000000000000000000000000000000..58713d06e713a3c5fe4ad9f7eedf0a533a0343d0
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/cuda/nvtx.py
@@ -0,0 +1,91 @@
+r"""This package adds support for NVIDIA Tools Extension (NVTX) used in profiling."""
+
+from contextlib import contextmanager
+
+try:
+    from torch._C import _nvtx
+except ImportError:
+
+    class _NVTXStub:
+        @staticmethod
+        def _fail(*args, **kwargs):
+            raise RuntimeError(
+                "NVTX functions not installed. Are you sure you have a CUDA build?"
+            )
+
+        rangePushA = _fail
+        rangePop = _fail
+        markA = _fail
+
+    _nvtx = _NVTXStub()  # type: ignore[assignment]
+
+__all__ = ["range_push", "range_pop", "range_start", "range_end", "mark", "range"]
+
+
+def range_push(msg):
+    """
+    Push a range onto a stack of nested range spans. Returns the zero-based depth of the range that is started.
+
+    Args:
+        msg (str): ASCII message to associate with range
+    """
+    return _nvtx.rangePushA(msg)
+
+
+def range_pop():
+    """Pop a range off of a stack of nested range spans.  Returns the  zero-based depth of the range that is ended."""
+    return _nvtx.rangePop()
+
+
+def range_start(msg) -> int:
+    """
+    Mark the start of a range with a string message. It returns a unique handle
+    for this range to pass to the corresponding call to range_end().
+
+    A key difference between this and range_push/range_pop is that the
+    range_start/range_end version supports range across threads (start on one
+    thread and end on another thread).
+
+    Returns: A range handle (uint64_t) that can be passed to range_end().
+
+    Args:
+        msg (str): ASCII message to associate with the range.
+    """
+    return _nvtx.rangeStartA(msg)
+
+
+def range_end(range_id) -> None:
+    """
+    Mark the end of a range for a given range_id.
+
+    Args:
+        range_id (int): a unique handle for the start range.
+    """
+    _nvtx.rangeEnd(range_id)
+
+
+def mark(msg):
+    """
+    Describe an instantaneous event that occurred at some point.
+
+    Args:
+        msg (str): ASCII message to associate with the event.
+    """
+    return _nvtx.markA(msg)
+
+
+@contextmanager
+def range(msg, *args, **kwargs):
+    """
+    Context manager / decorator that pushes an NVTX range at the beginning
+    of its scope, and pops it at the end. If extra arguments are given,
+    they are passed as arguments to msg.format().
+
+    Args:
+        msg (str): message to associate with the range
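+
+    Example (editor's sketch; requires a CUDA build with NVTX support)::
+
+        >>> # xdoctest: +SKIP
+        >>> import torch
+        >>> with torch.cuda.nvtx.range("iteration {}", 0):
+        ...     torch.cuda.synchronize()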
+    """
+    range_push(msg.format(*args, **kwargs))
+    try:
+        yield
+    finally:
+        range_pop()
diff --git a/MLPY/Lib/site-packages/torch/cuda/profiler.py b/MLPY/Lib/site-packages/torch/cuda/profiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..010590d5d87e681edd650bea98100165dcd7e772
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/cuda/profiler.py
@@ -0,0 +1,61 @@
+import contextlib
+import tempfile
+
+import torch
+from . import check_error, cudart
+
+__all__ = ["init", "start", "stop", "profile"]
+
+DEFAULT_FLAGS = [
+    "gpustarttimestamp",
+    "gpuendtimestamp",
+    "gridsize3d",
+    "threadblocksize",
+    "streamid",
+    "enableonstart 0",
+    "conckerneltrace",
+]
+
+
+def init(output_file, flags=None, output_mode="key_value"):
+    rt = cudart()
+    if not hasattr(rt, "cudaOutputMode"):
+        raise AssertionError("HIP does not support profiler initialization!")
+    if (
+        hasattr(torch.version, "cuda")
+        and torch.version.cuda is not None
+        and int(torch.version.cuda.split(".")[0]) >= 12
+    ):
+        # Check https://github.com/pytorch/pytorch/pull/91118
+        # cudaProfilerInitialize is no longer needed after CUDA 12
+        raise AssertionError("CUDA12+ does not need profiler initialization!")
+    flags = DEFAULT_FLAGS if flags is None else flags
+    if output_mode == "key_value":
+        output_mode_enum = rt.cudaOutputMode.KeyValuePair
+    elif output_mode == "csv":
+        output_mode_enum = rt.cudaOutputMode.CSV
+    else:
+        raise RuntimeError(
+            "supported CUDA profiler output modes are: key_value and csv"
+        )
+    with tempfile.NamedTemporaryFile(delete=True) as f:
+        f.write(b"\n".join(f.encode("ascii") for f in flags))
+        f.flush()
+        check_error(rt.cudaProfilerInitialize(f.name, output_file, output_mode_enum))
+
+
+def start():
+    check_error(cudart().cudaProfilerStart())
+
+
+def stop():
+    check_error(cudart().cudaProfilerStop())
+
+
+@contextlib.contextmanager
+def profile():
+    try:
+        start()
+        yield
+    finally:
+        stop()
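+
+
+# Editor's example (sketch, not part of the original module): wrapping a region
+# with cudaProfilerStart/Stop markers so an external profiler (nvprof/nsys)
+# only captures that region. Assumes a CUDA build; kept as comments so nothing
+# runs at import time.
+#
+#   import torch
+#   import torch.cuda.profiler as profiler
+#   a = torch.randn(1024, 1024, device="cuda")
+#   with profiler.profile():
+#       (a @ a).sum().item()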
diff --git a/MLPY/Lib/site-packages/torch/cuda/random.py b/MLPY/Lib/site-packages/torch/cuda/random.py
new file mode 100644
index 0000000000000000000000000000000000000000..31f831b112b2a92b6d51c5dfa1dafe8c0125e6e7
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/cuda/random.py
@@ -0,0 +1,179 @@
+from typing import Iterable, List, Union
+
+import torch
+from .. import Tensor
+from . import _lazy_call, _lazy_init, current_device, device_count
+
+__all__ = [
+    "get_rng_state",
+    "get_rng_state_all",
+    "set_rng_state",
+    "set_rng_state_all",
+    "manual_seed",
+    "manual_seed_all",
+    "seed",
+    "seed_all",
+    "initial_seed",
+]
+
+
+def get_rng_state(device: Union[int, str, torch.device] = "cuda") -> Tensor:
+    r"""Return the random number generator state of the specified GPU as a ByteTensor.
+
+    Args:
+        device (torch.device or int, optional): The device to return the RNG state of.
+            Default: ``'cuda'`` (i.e., ``torch.device('cuda')``, the current CUDA device).
+
+    .. warning::
+        This function eagerly initializes CUDA.
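+
+    Example (editor's sketch; requires an available CUDA device)::
+
+        >>> # xdoctest: +SKIP
+        >>> state = torch.cuda.get_rng_state()
+        >>> torch.cuda.set_rng_state(state)  # restore later to replay the same random stream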
+    """
+    _lazy_init()
+    if isinstance(device, str):
+        device = torch.device(device)
+    elif isinstance(device, int):
+        device = torch.device("cuda", device)
+    idx = device.index
+    if idx is None:
+        idx = current_device()
+    default_generator = torch.cuda.default_generators[idx]
+    return default_generator.get_state()
+
+
+def get_rng_state_all() -> List[Tensor]:
+    r"""Return a list of ByteTensor representing the random number states of all devices."""
+    results = []
+    for i in range(device_count()):
+        results.append(get_rng_state(i))
+    return results
+
+
+def set_rng_state(
+    new_state: Tensor, device: Union[int, str, torch.device] = "cuda"
+) -> None:
+    r"""Set the random number generator state of the specified GPU.
+
+    Args:
+        new_state (torch.ByteTensor): The desired state
+        device (torch.device or int, optional): The device to set the RNG state.
+            Default: ``'cuda'`` (i.e., ``torch.device('cuda')``, the current CUDA device).
+    """
+    with torch._C._DisableFuncTorch():
+        new_state_copy = new_state.clone(memory_format=torch.contiguous_format)
+    if isinstance(device, str):
+        device = torch.device(device)
+    elif isinstance(device, int):
+        device = torch.device("cuda", device)
+
+    def cb():
+        idx = device.index
+        if idx is None:
+            idx = current_device()
+        default_generator = torch.cuda.default_generators[idx]
+        default_generator.set_state(new_state_copy)
+
+    _lazy_call(cb)
+
+
+def set_rng_state_all(new_states: Iterable[Tensor]) -> None:
+    r"""Set the random number generator state of all devices.
+
+    Args:
+        new_states (Iterable of torch.ByteTensor): The desired state for each device.
+    """
+    for i, state in enumerate(new_states):
+        set_rng_state(state, i)
+
+
+def manual_seed(seed: int) -> None:
+    r"""Set the seed for generating random numbers for the current GPU.
+
+    It's safe to call this function if CUDA is not available; in that
+    case, it is silently ignored.
+
+    Args:
+        seed (int): The desired seed.
+
+    .. warning::
+        If you are working with a multi-GPU model, this function is insufficient
+        to get determinism.  To seed all GPUs, use :func:`manual_seed_all`.
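+
+    Example (editor's sketch; requires an available CUDA device)::
+
+        >>> # xdoctest: +SKIP
+        >>> torch.cuda.manual_seed(42)
+        >>> torch.randn(3, device="cuda")  # reproducible across runs with the same seed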
+    """
+    seed = int(seed)
+
+    def cb():
+        idx = current_device()
+        default_generator = torch.cuda.default_generators[idx]
+        default_generator.manual_seed(seed)
+
+    _lazy_call(cb, seed=True)
+
+
+def manual_seed_all(seed: int) -> None:
+    r"""Set the seed for generating random numbers on all GPUs.
+
+    It's safe to call this function if CUDA is not available; in that
+    case, it is silently ignored.
+
+    Args:
+        seed (int): The desired seed.
+    """
+    seed = int(seed)
+
+    def cb():
+        for i in range(device_count()):
+            default_generator = torch.cuda.default_generators[i]
+            default_generator.manual_seed(seed)
+
+    _lazy_call(cb, seed_all=True)
+
+
+def seed() -> None:
+    r"""Set the seed for generating random numbers to a random number for the current GPU.
+
+    It's safe to call this function if CUDA is not available; in that
+    case, it is silently ignored.
+
+    .. warning::
+        If you are working with a multi-GPU model, this function will only initialize
+        the seed on one GPU.  To initialize all GPUs, use :func:`seed_all`.
+    """
+
+    def cb():
+        idx = current_device()
+        default_generator = torch.cuda.default_generators[idx]
+        default_generator.seed()
+
+    _lazy_call(cb)
+
+
+def seed_all() -> None:
+    r"""Set the seed for generating random numbers to a random number on all GPUs.
+
+    It's safe to call this function if CUDA is not available; in that
+    case, it is silently ignored.
+    """
+
+    def cb():
+        random_seed = 0
+        seeded = False
+        for i in range(device_count()):
+            default_generator = torch.cuda.default_generators[i]
+            if not seeded:
+                default_generator.seed()
+                random_seed = default_generator.initial_seed()
+                seeded = True
+            else:
+                default_generator.manual_seed(random_seed)
+
+    _lazy_call(cb)
+
+
+def initial_seed() -> int:
+    r"""Return the current random seed of the current GPU.
+
+    .. warning::
+        This function eagerly initializes CUDA.
+    """
+    _lazy_init()
+    idx = current_device()
+    default_generator = torch.cuda.default_generators[idx]
+    return default_generator.initial_seed()
diff --git a/MLPY/Lib/site-packages/torch/cuda/sparse.py b/MLPY/Lib/site-packages/torch/cuda/sparse.py
new file mode 100644
index 0000000000000000000000000000000000000000..702def052945ad1bd54ab221ae517a9c01361e34
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/cuda/sparse.py
@@ -0,0 +1 @@
+# The Tensor classes are added to this module by python_tensor.cpp
diff --git a/MLPY/Lib/site-packages/torch/cuda/streams.py b/MLPY/Lib/site-packages/torch/cuda/streams.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ee98beab003d9f5bb7692db6efb969d49b26ea5
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/cuda/streams.py
@@ -0,0 +1,241 @@
+import ctypes
+
+import torch
+from torch._streambase import _EventBase, _StreamBase
+from .._utils import _dummy_type
+
+
+if not hasattr(torch._C, "_CudaStreamBase"):
+    # Define dummy base classes
+    torch._C.__dict__["_CudaStreamBase"] = _dummy_type("_CudaStreamBase")
+    torch._C.__dict__["_CudaEventBase"] = _dummy_type("_CudaEventBase")
+
+
+class Stream(torch._C._CudaStreamBase, _StreamBase):
+    r"""Wrapper around a CUDA stream.
+
+    A CUDA stream is a linear sequence of execution that belongs to a specific
+    device, independent from other streams.  See :ref:`cuda-semantics` for
+    details.
+
+    Args:
+        device(torch.device or int, optional): a device on which to allocate
+            the stream. If :attr:`device` is ``None`` (default) or a negative
+            integer, this will use the current device.
+        priority(int, optional): priority of the stream, should be 0 or
+            negative, where negative numbers indicate higher priority. By default,
+            streams have priority 0.
+
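+    Example (editor's sketch; assumes at least one CUDA device)::
+
+        >>> # xdoctest: +SKIP
+        >>> s = torch.cuda.Stream()
+        >>> with torch.cuda.stream(s):
+        ...     y = torch.randn(8, device="cuda") * 2
+        >>> torch.cuda.current_stream().wait_stream(s)  # order later work after s
+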
+    """
+
+    def __new__(cls, device=None, priority=0, **kwargs):
+        # setting device manager is expensive, so we avoid it unless necessary
+        if device is None or ("stream_id" in kwargs and "device_index" in kwargs):
+            return super().__new__(cls, priority=priority, **kwargs)
+        else:
+            with torch.cuda.device(device):
+                return super().__new__(cls, priority=priority, **kwargs)
+
+    def wait_event(self, event):
+        r"""Make all future work submitted to the stream wait for an event.
+
+        Args:
+            event (torch.cuda.Event): an event to wait for.
+
+        .. note:: This is a wrapper around ``cudaStreamWaitEvent()``: see
+           `CUDA Stream documentation`_ for more info.
+
+           This function returns without waiting for :attr:`event`: only future
+           operations are affected.
+
+        .. _CUDA Stream documentation:
+           https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html
+        """
+        event.wait(self)
+
+    def wait_stream(self, stream):
+        r"""Synchronize with another stream.
+
+        All future work submitted to this stream will wait until all kernels
+        submitted to a given stream at the time of call complete.
+
+        Args:
+            stream (Stream): a stream to synchronize.
+
+        .. note:: This function returns without waiting for currently enqueued
+           kernels in :attr:`stream`: only future operations are affected.
+        """
+        self.wait_event(stream.record_event())
+
+    def record_event(self, event=None):
+        r"""Record an event.
+
+        Args:
+            event (torch.cuda.Event, optional): event to record. If not given, a new one
+                will be allocated.
+
+        Returns:
+            Recorded event.
+        """
+        if event is None:
+            event = Event()
+        event.record(self)
+        return event
+
+    def query(self):
+        r"""Check if all the work submitted has been completed.
+
+        Returns:
+            A boolean indicating if all kernels in this stream are completed.
+        """
+        return super().query()
+
+    def synchronize(self):
+        r"""Wait for all the kernels in this stream to complete.
+
+        .. note:: This is a wrapper around ``cudaStreamSynchronize()``: see
+           `CUDA Stream documentation`_ for more info.
+        """
+        super().synchronize()
+
+    @property
+    def _as_parameter_(self):
+        return ctypes.c_void_p(self.cuda_stream)
+
+    def __eq__(self, o):
+        if isinstance(o, Stream):
+            return super().__eq__(o)
+        return False
+
+    def __hash__(self):
+        return hash((self.cuda_stream, self.device))
+
+    def __repr__(self):
+        return f""
+
+
+class ExternalStream(Stream):
+    r"""Wrapper around an externally allocated CUDA stream.
+
+    This class is used to wrap streams allocated in other libraries in order
+    to facilitate data exchange and multi-library interactions.
+
+    .. note:: This class doesn't manage the stream life-cycle; it is the user's
+       responsibility to keep the referenced stream alive while this class is
+       being used.
+
+    Args:
+        stream_ptr(int): Integer representation of the ``cudaStream_t`` value
+            allocated externally.
+        device(torch.device or int, optional): the device where the stream
+            was originally allocated. If device is specified incorrectly,
+            subsequent launches using this stream may fail.
+    """
+
+    def __new__(cls, stream_ptr, device=None, **kwargs):
+        with torch.cuda.device(device):
+            return super().__new__(cls, stream_ptr=stream_ptr, **kwargs)
+
+
+class Event(torch._C._CudaEventBase, _EventBase):
+    r"""Wrapper around a CUDA event.
+
+    CUDA events are synchronization markers that can be used to monitor the
+    device's progress, to accurately measure timing, and to synchronize CUDA
+    streams.
+
+    The underlying CUDA events are lazily initialized when the event is first
+    recorded or exported to another process. After creation, only streams on the
+    same device may record the event. However, streams on any device can wait on
+    the event.
+
+    Args:
+        enable_timing (bool, optional): indicates if the event should measure time
+            (default: ``False``)
+        blocking (bool, optional): if ``True``, :meth:`wait` will be blocking (default: ``False``)
+        interprocess (bool): if ``True``, the event can be shared between processes
+            (default: ``False``)
+
+    .. _CUDA Event Documentation:
+       https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html
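+
+    Example (editor's sketch; ``elapsed_time`` requires ``enable_timing=True``)::
+
+        >>> # xdoctest: +SKIP
+        >>> start = torch.cuda.Event(enable_timing=True)
+        >>> end = torch.cuda.Event(enable_timing=True)
+        >>> start.record()
+        >>> torch.randn(1024, 1024, device="cuda").sum()
+        >>> end.record()
+        >>> torch.cuda.synchronize()
+        >>> elapsed_ms = start.elapsed_time(end)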
+    """
+
+    def __new__(cls, enable_timing=False, blocking=False, interprocess=False):
+        return super().__new__(
+            cls,
+            enable_timing=enable_timing,
+            blocking=blocking,
+            interprocess=interprocess,
+        )
+
+    @classmethod
+    def from_ipc_handle(cls, device, handle):
+        r"""Reconstruct an event from an IPC handle on the given device."""
+        return super().from_ipc_handle(device, handle)
+
+    def record(self, stream=None):
+        r"""Record the event in a given stream.
+
+        Uses ``torch.cuda.current_stream()`` if no stream is specified. The
+        stream's device must match the event's device.
+        """
+        if stream is None:
+            stream = torch.cuda.current_stream()
+        super().record(stream)
+
+    def wait(self, stream=None):
+        r"""Make all future work submitted to the given stream wait for this event.
+
+        Uses ``torch.cuda.current_stream()`` if no stream is specified.
+
+        .. note:: This is a wrapper around ``cudaStreamWaitEvent()``: see
+            `CUDA Event documentation`_ for more info.
+        """
+        if stream is None:
+            stream = torch.cuda.current_stream()
+        super().wait(stream)
+
+    def query(self):
+        r"""Check if all work currently captured by event has completed.
+
+        Returns:
+            A boolean indicating if all work currently captured by event has
+            completed.
+        """
+        return super().query()
+
+    def elapsed_time(self, end_event):
+        r"""Return the time elapsed.
+
+        Time reported in milliseconds after the event was recorded and
+        before the end_event was recorded.
+        """
+        return super().elapsed_time(end_event)
+
+    def synchronize(self):
+        r"""Wait for the event to complete.
+
+        Waits until the completion of all work currently captured in this event.
+        This prevents the CPU thread from proceeding until the event completes.
+
+        .. note:: This is a wrapper around ``cudaEventSynchronize()``: see
+            `CUDA Event documentation`_ for more info.
+        """
+        super().synchronize()
+
+    def ipc_handle(self):
+        r"""Return an IPC handle of this event.
+
+        If not recorded yet, the event will use the current device.
+        """
+        return super().ipc_handle()
+
+    @property
+    def _as_parameter_(self):
+        return ctypes.c_void_p(self.cuda_event)
+
+    def __repr__(self):
+        if self.cuda_event:
+            return f"<torch.cuda.Event {self._as_parameter_.value:#x}>"
+        else:
+            return "<torch.cuda.Event uninitialized>"
diff --git a/MLPY/Lib/site-packages/torch/distributed/__init__.py b/MLPY/Lib/site-packages/torch/distributed/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..851314d31f3ad16d7e3e617582f9e93fc981b47f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/__init__.py
@@ -0,0 +1,132 @@
+import os
+import sys
+from enum import Enum
+import pdb
+import io
+
+import torch
+
+def is_available() -> bool:
+    """
+    Return ``True`` if the distributed package is available.
+
+    Otherwise, ``torch.distributed`` does not expose any other APIs.
+    Currently, ``torch.distributed`` is available on Linux, MacOS and Windows.
+    Set ``USE_DISTRIBUTED=1`` to enable it when building PyTorch from source.
+    Currently, the default value is ``USE_DISTRIBUTED=1`` for Linux and Windows,
+    and ``USE_DISTRIBUTED=0`` for MacOS.
+    """
+    return hasattr(torch._C, "_c10d_init")
+
+
+if is_available() and not torch._C._c10d_init():
+    raise RuntimeError("Failed to initialize torch.distributed")
+
+# Custom Runtime Errors thrown from the distributed package
+DistError = torch._C._DistError
+DistBackendError = torch._C._DistBackendError
+DistNetworkError = torch._C._DistNetworkError
+DistStoreError = torch._C._DistStoreError
+
+if is_available():
+    from torch._C._distributed_c10d import (
+        Store,
+        FileStore,
+        TCPStore,
+        ProcessGroup as ProcessGroup,
+        Backend as _Backend,
+        PrefixStore,
+        Reducer,
+        Logger,
+        BuiltinCommHookType,
+        GradBucket,
+        Work as _Work,
+        _DEFAULT_FIRST_BUCKET_BYTES,
+        _register_comm_hook,
+        _register_builtin_comm_hook,
+        _broadcast_coalesced,
+        _compute_bucket_assignment_by_size,
+        _verify_params_across_processes,
+        _test_python_store,
+        DebugLevel,
+        get_debug_level,
+        set_debug_level,
+        set_debug_level_from_env,
+        _make_nccl_premul_sum,
+    )
+
+    class _DistributedPdb(pdb.Pdb):
+        """
+        Supports using PDB from inside a multiprocessing child process.
+
+        Usage:
+        _DistributedPdb().set_trace()
+        """
+        def interaction(self, *args, **kwargs):
+            _stdin = sys.stdin
+            try:
+                sys.stdin = open('/dev/stdin')
+                pdb.Pdb.interaction(self, *args, **kwargs)
+            finally:
+                sys.stdin = _stdin
+
+    def breakpoint(rank: int = 0):
+        """
+        Set a breakpoint, but only on a single rank.  All other ranks will wait for you to be
+        done with the breakpoint before continuing.
+
+        Args:
+            rank (int): Which rank to break on.  Default: ``0``
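+
+        Example (editor's sketch; run under ``torchrun`` after the process
+        group has been initialized)::
+
+            >>> # xdoctest: +SKIP
+            >>> import torch.distributed as dist
+            >>> dist.breakpoint(rank=0)  # rank 0 drops into pdb; other ranks wait at the barrier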
+        """
+        if get_rank() == rank:
+            pdb = _DistributedPdb()
+            pdb.message(
+                "\n!!! ATTENTION !!!\n\n"
+                f"Type 'up' to get to the frame that called dist.breakpoint(rank={rank})\n"
+            )
+            pdb.set_trace()
+        barrier()
+
+    if sys.platform != "win32":
+        from torch._C._distributed_c10d import (
+            HashStore,
+            _round_robin_process_groups,
+        )
+
+    from .distributed_c10d import *  # noqa: F403
+
+    # Variables prefixed with underscore are not auto imported
+    # See the comment in `distributed_c10d.py` above `_backend` on why we expose
+    # this.
+
+    from .distributed_c10d import (
+        _all_gather_base,
+        _reduce_scatter_base,
+        _create_process_group_wrapper,
+        _rank_not_in_group,
+        _coalescing_manager,
+        _CoalescingManager,
+        _get_process_group_name,
+    )
+
+    from .rendezvous import (
+        rendezvous,
+        _create_store_from_options,
+        register_rendezvous_handler,
+    )
+
+    from .remote_device import _remote_device
+
+    set_debug_level_from_env()
+
+else:
+    # This stub is sufficient to get
+    #   python test/test_public_bindings.py -k test_correct_module_names
+    # working even when USE_DISTRIBUTED=0.  Feel free to add more
+    # stubs as necessary.
+    # We cannot define stubs directly because they confuse pyre
+
+    class _ProcessGroupStub:
+        pass
+    sys.modules["torch.distributed"].ProcessGroup = _ProcessGroupStub  # type: ignore[attr-defined]
diff --git a/MLPY/Lib/site-packages/torch/distributed/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ab1ddd5cedae6a0898fa9dc8ee39dadce4b8f8d5
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/__pycache__/_composable_state.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/__pycache__/_composable_state.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3e77e20e45e637eb46d2d1ac07ff7d6dc674c746
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/__pycache__/_composable_state.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/__pycache__/_functional_collectives.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/__pycache__/_functional_collectives.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..007cff9806eb5843947378fdb49d79f1985edf96
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/__pycache__/_functional_collectives.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/__pycache__/_functional_collectives_impl.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/__pycache__/_functional_collectives_impl.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..29c917f07a9c66243005ab70bd0bc0f76c7583aa
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/__pycache__/_functional_collectives_impl.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/__pycache__/_state_dict_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/__pycache__/_state_dict_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..42b9a0e5d38593433a5baaf505473a56e6ccf50e
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/__pycache__/_state_dict_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/__pycache__/argparse_util.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/__pycache__/argparse_util.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6da77a3da7a953f3d1463a03cc70ebadd550950a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/__pycache__/argparse_util.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/__pycache__/c10d_logger.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/__pycache__/c10d_logger.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6fe57d80b08716e41f1e930f2c4ebb40b669e8bd
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/__pycache__/c10d_logger.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/__pycache__/collective_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/__pycache__/collective_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d4211a6a598096e1b2f41cb1443ef399ce8b6f32
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/__pycache__/collective_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/__pycache__/constants.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/__pycache__/constants.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4805ea9eda967be2acce919b5c52f7d399613ca5
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/__pycache__/constants.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/__pycache__/device_mesh.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/__pycache__/device_mesh.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b381162f583c9911cba4f5d038d4ee38f8f0c4fd
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/__pycache__/device_mesh.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/__pycache__/distributed_c10d.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/__pycache__/distributed_c10d.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..33ff4f0d1389642dbd440c3c371be113838a0dd4
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/__pycache__/distributed_c10d.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/__pycache__/launch.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/__pycache__/launch.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..edd29282eb7c6b3c2f2ded3f4ac5ab443a322e16
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/__pycache__/launch.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/__pycache__/logging_handlers.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/__pycache__/logging_handlers.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1d8bdf18f81fb8729311eb3e20dd9589c7a6694f
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/__pycache__/logging_handlers.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/__pycache__/remote_device.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/__pycache__/remote_device.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..35a3fc1468f72b8e19911a0a2b29aa9aa9cbe505
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/__pycache__/remote_device.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/__pycache__/rendezvous.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/__pycache__/rendezvous.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..40338e0ec17d6cd21ecd6e2fea4ad237dff30981
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/__pycache__/rendezvous.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/__pycache__/run.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/__pycache__/run.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5bf3a4d780165ac6fcf14c1396f745661e110c09
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/__pycache__/run.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2d17b5af17a584045b80a54605896bafbdc49475
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/__pycache__/utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_composable/__init__.py b/MLPY/Lib/site-packages/torch/distributed/_composable/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f347203e6c3c9e4aa92a4ea796d35349c0a548f3
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_composable/__init__.py
@@ -0,0 +1,4 @@
+from .checkpoint_activation import checkpoint
+from .contract import _get_registry, contract
+from .fully_shard import fully_shard
+from .replicate import replicate
diff --git a/MLPY/Lib/site-packages/torch/distributed/_composable/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_composable/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bc39e11aa4a38567d710273c8def068435547b2f
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_composable/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_composable/__pycache__/checkpoint_activation.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_composable/__pycache__/checkpoint_activation.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..526120cbaa99e26966e00fbdf80951e4b9725713
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_composable/__pycache__/checkpoint_activation.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_composable/__pycache__/contract.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_composable/__pycache__/contract.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bc761259a0c303e6e0d347d1f7a88f6dc56e529b
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_composable/__pycache__/contract.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_composable/__pycache__/fully_shard.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_composable/__pycache__/fully_shard.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f2869920de64190b8bfedf87864aad54aea6f5e0
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_composable/__pycache__/fully_shard.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_composable/__pycache__/replicate.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_composable/__pycache__/replicate.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..85a2d0da1c626c3359c67af35a9580358f39c326
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_composable/__pycache__/replicate.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_composable/checkpoint_activation.py b/MLPY/Lib/site-packages/torch/distributed/_composable/checkpoint_activation.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e398d87eb0af4716fce33d122f722e20b1f152
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_composable/checkpoint_activation.py
@@ -0,0 +1,94 @@
+from contextlib import contextmanager, nullcontext
+from typing import Any, Tuple
+
+import torch
+import torch.nn as nn
+from torch.utils.checkpoint import (
+    _checkpoint_without_reentrant_generator,
+    _DEFAULT_DETERMINISM_MODE,
+)
+
+from .contract import contract
+
+
+@contextmanager
+def _no_hook(module: nn.Module):
+    r"""
+    Disable hooks installed by checkpoint to avoid unintentional recursion
+    during backward recomputation.
+    """
+    orig_enable_hook = checkpoint.state(module).enable_hook
+    checkpoint.state(module).enable_hook = False
+    try:
+        yield
+    finally:
+        checkpoint.state(module).enable_hook = orig_enable_hook
+
+
+@contract()
+def checkpoint(module: nn.Module) -> nn.Module:
+    r"""
+    This is a composable activation checkpointing API. Unlike functional
+    activation checkpointing APIs, this one does not require changing model
+    source code. Unlike ``nn.Module`` wrapper activation checkpointing APIs,
+    this one does not modify model structure or fully-qualified names either.
+    Under the hood, it registers activation checkpointing logic as pre- and
+    post-forward hooks. Hence, this API can be easily applied to any model or
+    sub-modules in the model.
+
+    Args:
+        module (nn.Module): the target model or sub-module to apply activation
+            checkpointing.
+
+    Example::
+        >>> # xdoctest: +SKIP
+        >>> import torch.nn as nn
+        >>>
+        >>> class MyModel(nn.Module):
+        >>>     def __init__(self):
+        >>>         super().__init__()
+        >>>         self.l1 = nn.Linear(10, 10)
+        >>>         self.l2 = nn.Linear(10, 10)
+        >>>
+        >>>     def forward(self, x):
+        >>>         return self.l2(self.l1(x))
+        >>>
+        >>> model = MyModel()
+        >>> checkpoint(model.l1)  # apply activation checkpointing only to l1
+        >>> model(torch.zeros(2, 10)).sum().backward()
+
+    """
+    torch._C._log_api_usage_once("torch.distributed.checkpoint")
+
+    def forward_pre_hook(module: nn.Module, inputs: Tuple[Any, ...]) -> None:
+        if checkpoint.state(module).enable_hook:
+
+            def context_fns():
+                return nullcontext(), _no_hook(module)
+
+            checkpoint.state(
+                module
+            )._ac_generator = _checkpoint_without_reentrant_generator(
+                module, True, context_fns, _DEFAULT_DETERMINISM_MODE, False, *inputs
+            )
+            next(checkpoint.state(module)._ac_generator)
+
+    def forward_hook(module: nn.Module, inputs: Tuple[Any, ...], output: Any) -> Any:
+        if checkpoint.state(module).enable_hook:
+            try:
+                next(checkpoint.state(module)._ac_generator)
+            except StopIteration:
+                pass
+            else:
+                raise RuntimeError(
+                    "Expected non-reentrant activation checkpoint generator to be exhausted, but it was not!"
+                )
+
+        #  Ensure that we no longer hold on to the generator. always_call=True helps ensure we
+        # clear this even in the case of exception in fwd pass.
+        checkpoint.state(module)._ac_generator = None
+
+    checkpoint.state(module).enable_hook = True
+    module.register_forward_pre_hook(forward_pre_hook)
+    module.register_forward_hook(forward_hook, prepend=True, always_call=True)
+    return module
diff --git a/MLPY/Lib/site-packages/torch/distributed/_composable/contract.py b/MLPY/Lib/site-packages/torch/distributed/_composable/contract.py
new file mode 100644
index 0000000000000000000000000000000000000000..72b54b2f55970a9638803b6b32d03212bac26271
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_composable/contract.py
@@ -0,0 +1,194 @@
+import uuid
+from collections import OrderedDict
+from functools import wraps
+from typing import Callable, Dict, List, Optional, Type
+
+import torch.nn as nn
+from torch.distributed._composable_state import _State
+
+
+def generate_state_key(string="__composable_api_state_key"):
+    return f"{string}_{str(uuid.uuid4())}"
+
+
+STATE_KEY = generate_state_key()
+REGISTRY_KEY = generate_state_key()
+
+
+# TODO: we can add additional info to RegistryItem to share across APIs. E.g.,
+# we can add args and kwargs here, and then we can detect whether fully_shard
+# is combined with reentrant activation checkpointing and error out with a clear
+# message.
+class RegistryItem:
+    pass
+
+
+def contract(state_cls: Type[_State] = _State):
+    r"""
+    Decorate a function as a composable distributed API, where the first
+    argument of the function must be an :class:`nn.Module` instance. The
+    decorator verifies that the wrapped function does not modify parameter,
+    buffer or sub-module fully-qualified names (FQN).
+
+    When a function ``func`` is decorated by ``@contract()``, a
+    ``.state(module: nn.Module)`` method will be installed to the decorated
+    function. Then you can retrieve and modify the state on a module by calling
+    ``func.state(module)``.
+
+    Example::
+        >>> # xdoctest: +SKIP
+        >>> import torch.nn as nn
+        >>>
+        >>> class MyModel(nn.Module):
+        >>>     def __init__(self):
+        >>>         super().__init__()
+        >>>         self.l1 = nn.Linear(10, 10)
+        >>>         self.l2 = nn.Linear(10, 10)
+        >>>
+        >>>     def forward(self, x):
+        >>>         return self.l2(self.l1(x))
+        >>>
+        >>> @contract()
+        >>> def my_feature(module: nn.Module) -> nn.Module:
+        >>>     my_feature.state(module).some_state = "any value"
+        >>>     return module
+        >>>
+        >>> model = MyModel()
+        >>> my_feature(model.l1)
+        >>> assert my_feature.state(model.l1).some_state == "any value"
+        >>> my_feature(model.l2)
+        >>> model(torch.randn(2, 10)).sum().backward()
+    """
+
+    # wraps will make functions decorated with contract() pickleable - needed for integration with torch.package
+    @wraps(state_cls)
+    def inner(func):
+        @wraps(func)
+        def wrapper(module: nn.Module, *args, **kwargs) -> Optional[nn.Module]:
+            # get existing global states
+            default_all_state: Dict[Callable, _State] = OrderedDict()
+            all_state: Dict[Callable, _State] = module.__dict__.setdefault(  # type: ignore[call-overload]
+                STATE_KEY, default_all_state
+            )
+            assert isinstance(
+                all_state, dict
+            ), "Distributed composable API states corrupted"
+
+            # get global registry
+            default_registry: Dict[str, RegistryItem] = OrderedDict()
+            registry: Dict[str, RegistryItem] = module.__dict__.setdefault(  # type: ignore[call-overload]
+                REGISTRY_KEY, default_registry
+            )
+
+            assert isinstance(
+                registry, dict
+            ), "Distributed composable API registry corrupted"
+
+            # make sure the API func has not been applied to the input module yet.
+            assert func not in all_state and func.__name__ not in registry, (
+                "Each distinct composable distributed API can only be applied to a "
+                f"module once. {func.__name__} has already been applied to the "
+                f"following module.\n{module}"
+            )
+
+            # install states specific to the wrapped ``func``
+            all_state.setdefault(func, state_cls())
+            # register ``func`` in the global registry by name
+            registry.setdefault(func.__name__, RegistryItem())
+
+            orig_named_params = OrderedDict(module.named_parameters())
+            orig_named_buffers = OrderedDict(
+                module.named_buffers(remove_duplicate=False)
+            )
+            orig_named_modules = OrderedDict(
+                module.named_modules(remove_duplicate=False)
+            )
+
+            updated = func(module, *args, **kwargs)
+
+            if updated is None:
+                updated = module
+
+            new_named_params = OrderedDict(updated.named_parameters())
+            new_named_buffers = OrderedDict(
+                updated.named_buffers(remove_duplicate=False)
+            )
+            new_named_modules = OrderedDict(
+                updated.named_modules(remove_duplicate=False)
+            )
+
+            assert isinstance(updated, nn.Module), (
+                "Output of composable distributed APIs must be either None or "
+                f"nn.Module, but got {type(updated)}"
+            )
+
+            def check_fqn(orig_fqns: List[str], new_fqns: List[str], check_key: str):
+                if orig_fqns == new_fqns:
+                    return
+
+                orig_fqn_set, new_fqn_set = set(orig_fqns), set(new_fqns)
+                orig_only = orig_fqn_set - new_fqn_set
+                new_only = new_fqn_set - orig_fqn_set
+                if len(orig_only) or len(new_only):
+                    raise RuntimeError(
+                        f"{check_key}"
+                        "Composable distributed API implementations cannot modify "
+                        "FQNs.\n"
+                        f"Only in original FQNs: {orig_only},\n"
+                        f"Only in new FQNs: {new_only}"
+                    )
+                else:
+                    raise RuntimeError(
+                        f"{check_key}"
+                        "Composable distributed API implementations cannot modify "
+                        "the order of FQNs.\n"
+                        f"Original FQNs: {orig_fqns}\n"
+                        f"New FQNs: {new_fqns}"
+                    )
+
+            check_fqn(
+                list(orig_named_params.keys()),
+                list(new_named_params.keys()),
+                "Check parameters, ",
+            )
+            check_fqn(
+                list(orig_named_buffers.keys()),
+                list(new_named_buffers.keys()),
+                "Check buffer, ",
+            )
+            check_fqn(
+                list(orig_named_modules.keys()),
+                list(new_named_modules.keys()),
+                "Check modules, ",
+            )
+
+            # TODO: a stricter verification should also reject changing module
+            # types and monkey-patching forward() method implementations.
+
+            # TODO: verify that installed distributed paradigms are compatible with
+            # each other.
+
+            return updated
+
+        def get_state(module: nn.Module) -> Optional[_State]:
+            return module.__dict__.setdefault(  # type: ignore[call-overload]
+                STATE_KEY,
+                {},  # TODO(@yhcharles): this is a temporary fix, need a better way
+            ).get(
+                func
+            )  # type: ignore[call-overload]
+
+        wrapper.state = get_state  # type: ignore[attr-defined]
+
+        return wrapper
+
+    return inner
+
+
+def _get_registry(module: nn.Module) -> Optional[Dict[str, RegistryItem]]:
+    r"""
+    Get an ``OrderedDict`` of composable APIs that have been applied to the
+    ``module``, indexed by the API name. If no API has been applied, then this
+    returns ``None``.
+    """
+    return getattr(module, REGISTRY_KEY, None)
diff --git a/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/__init__.py b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c81a56b625cd924f8d314a020caef977591546d4
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/__init__.py
@@ -0,0 +1,2 @@
+from ._fsdp_api import MixedPrecisionPolicy
+from .fully_shard import FSDP, fully_shard
diff --git a/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..43ce0c9e6237ab960bffbfdea099f320e062db24
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/__pycache__/_fsdp_api.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/__pycache__/_fsdp_api.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6440f0f061625c891ae0fe9705fdb58ad824a4bf
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/__pycache__/_fsdp_api.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/__pycache__/_fsdp_collectives.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/__pycache__/_fsdp_collectives.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e80f3b71fe2fe0ef4ac84a2d311c705f92d9ec6d
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/__pycache__/_fsdp_collectives.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/__pycache__/_fsdp_common.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/__pycache__/_fsdp_common.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..07d72953511b917fc74f74daac3f274f2df30be0
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/__pycache__/_fsdp_common.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/__pycache__/_fsdp_init.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/__pycache__/_fsdp_init.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..409ab5af3bf15462e45dfd46d7fe7c567f5a2d4a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/__pycache__/_fsdp_init.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/__pycache__/_fsdp_param.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/__pycache__/_fsdp_param.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ecf73d8a7e0deb95db984d865ac66c9f96c176a2
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/__pycache__/_fsdp_param.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/__pycache__/_fsdp_param_group.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/__pycache__/_fsdp_param_group.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3e26b3aecf74603ecabf1a0605dd6c8e60193752
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/__pycache__/_fsdp_param_group.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/__pycache__/_fsdp_state.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/__pycache__/_fsdp_state.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d99be15325fa4b7e1ff788f5bd32d60eb0bd4bcf
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/__pycache__/_fsdp_state.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/__pycache__/fully_shard.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/__pycache__/fully_shard.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ea8c9d31a0eda95195557daff3fbaa98fc459a94
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/__pycache__/fully_shard.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/_fsdp_api.py b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/_fsdp_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d77e93513228c74f2defba8270e1e40354cdf8c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/_fsdp_api.py
@@ -0,0 +1,52 @@
+from dataclasses import dataclass
+from typing import Optional
+
+import torch
+
+
+@dataclass(frozen=True)
+class MixedPrecisionPolicy:
+    """
+    This configures FSDP's mixed precision. Unlike autocast, this applies mixed
+    precision at the module level, not op level, which means low-precision
+    activations are saved for backward and high-to-low-precision casts are
+    incurred only at module boundaries.
+
+    FSDP works well with module-level mixed precision since it keeps the
+    high-precision sharded parameters in memory anyway. In other words, FSDP
+    does not require any extra memory to keep a high-precision copy of the
+    parameters for the optimizer step.
+
+    Attributes:
+        param_dtype (Optional[torch.dtype]): This specifies the dtype for
+            the unsharded parameter and hence the dtype for forward/backward
+            computation and the parameter all-gather. If this is ``None``, then
+            the unsharded parameter uses the original dtype. The optimizer step
+            uses the sharded parameter in the original dtype. (Default:
+            ``None``)
+        reduce_dtype (Optional[torch.dtype]): This specifies the dtype for
+            gradient reduction (i.e. reduce-scatter or all-reduce). If this is
+            ``None`` but ``param_dtype`` is not ``None``, then the reduction
+            uses the compute dtype. This can be used to run gradient reduction
+            in full precision while using low precision for compute. (Default:
+            ``None``)
+        output_dtype (Optional[torch.dtype]): This specifies the dtype for
+            casting floating-point forward outputs. This can be used to
+            help implement cases where different modules have different mixed
+            precision policies. (Default: ``None``)
+        cast_forward_inputs (bool): This specifies whether FSDP should cast the
+            forward's floating-point input tensors to ``param_dtype`` or not.
+            (Default: ``True``)
+    """
+
+    param_dtype: Optional[torch.dtype] = None
+    reduce_dtype: Optional[torch.dtype] = None
+    output_dtype: Optional[torch.dtype] = None
+    cast_forward_inputs: bool = True
+
+    def __post_init__(self):
+        # Clamp `reduce_dtype` to `None` if no casting is required: since
+        # gradients are computed in `param_dtype`, if `reduce_dtype` matches,
+        # then we do not need extra casting
+        if self.param_dtype == self.reduce_dtype:
+            # Bypass the frozen dataclass checks
+            object.__setattr__(self, "reduce_dtype", None)
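
Since `MixedPrecisionPolicy` is a frozen dataclass holding only configuration, the clamping performed in `__post_init__` can be observed without any distributed setup. A small sketch, assuming only that this vendored build exposes the import path shown:

import torch
from torch.distributed._composable.fsdp import MixedPrecisionPolicy

# Compute (all-gather, forward/backward) in bf16 but reduce gradients in fp32.
mixed = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.float32)
assert mixed.reduce_dtype is torch.float32

# When reduce_dtype equals param_dtype, __post_init__ clamps it to None:
# gradients are already computed in param_dtype, so no extra cast is needed.
uniform = MixedPrecisionPolicy(param_dtype=torch.bfloat16, reduce_dtype=torch.bfloat16)
assert uniform.reduce_dtype is None
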
diff --git a/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/_fsdp_collectives.py b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/_fsdp_collectives.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e1859f42e2a1a8ff7f19d4c76d57b2400f1c008
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/_fsdp_collectives.py
@@ -0,0 +1,217 @@
+from typing import List, NamedTuple, Optional, Tuple
+
+import torch
+import torch.distributed as dist
+from torch.distributed.distributed_c10d import ReduceOp
+from ._fsdp_common import (
+    _get_dim0_padded_size,
+    _raise_assert_with_print,
+    _to_dtype_if_needed,
+)
+from ._fsdp_param import FSDPParam
+
+
+class AllGatherResult(NamedTuple):
+    all_gather_output: torch.Tensor
+    all_gather_event: Optional[torch.cuda.Event]
+    all_gather_work: Optional[dist.distributed_c10d.Work]
+    all_gather_input_numels: List[int]
+
+
+@torch.no_grad()
+def foreach_all_gather(
+    fsdp_params: List[FSDPParam],
+    group: dist.ProcessGroup,
+    async_op: bool,
+    all_gather_copy_in_stream: torch.cuda.Stream,
+    all_gather_stream: torch.cuda.Stream,
+    device: torch.device,
+) -> Optional[AllGatherResult]:
+    world_size, rank = group.size(), group.rank()
+    # - Copy in
+    with torch.cuda.stream(all_gather_copy_in_stream):
+        param_all_gather_inputs = [
+            fsdp_param.all_gather_input for fsdp_param in fsdp_params
+        ]
+        dtype = param_all_gather_inputs[0].dtype
+        if not all(t.dtype == dtype for t in param_all_gather_inputs):
+            raise NotImplementedError(
+                f"Mixed dtype not supported yet: {[t.dtype for t in param_all_gather_inputs]}"
+            )
+        inp_split_sizes = [inp.numel() for inp in param_all_gather_inputs]
+        all_gather_input_numel = sum(inp_split_sizes)
+        all_gather_output = torch.empty(
+            (all_gather_input_numel * world_size,), dtype=dtype, device=device
+        )
+        all_gather_input = all_gather_output.narrow(
+            0, all_gather_input_numel * rank, all_gather_input_numel
+        )
+        foreach_copy_dsts = torch.split(all_gather_input, inp_split_sizes)
+        torch._foreach_copy_(foreach_copy_dsts, param_all_gather_inputs)
+        del param_all_gather_inputs
+    all_gather_stream.wait_stream(all_gather_copy_in_stream)
+    with torch.cuda.stream(all_gather_stream):
+        # - All-gather
+        all_gather_work = dist.all_gather_into_tensor(
+            output_tensor=all_gather_output,
+            input_tensor=all_gather_input,
+            group=group,
+            async_op=async_op,
+        )
+        all_gather_event = all_gather_stream.record_event()
+        return AllGatherResult(
+            all_gather_output, all_gather_event, all_gather_work, inp_split_sizes
+        )
+
+
+@torch.no_grad()
+def foreach_all_gather_copy_out(
+    all_gather_result: AllGatherResult,
+    fsdp_params: List[FSDPParam],
+    group: dist.ProcessGroup,
+) -> None:
+    (
+        all_gather_output,
+        all_gather_event,
+        all_gather_work,
+        all_gather_input_numels,
+    ) = all_gather_result
+    if all_gather_event is not None:  # sync op
+        torch.cuda.current_stream().wait_event(all_gather_event)
+    if all_gather_work is not None:  # async op
+        all_gather_work.wait()
+    world_size = group.size()
+    dtype, device = all_gather_output.dtype, all_gather_output.device
+    for all_gather_input_numel, fsdp_param in zip(all_gather_input_numels, fsdp_params):
+        fsdp_param.init_all_gather_output(
+            all_gather_input_numel, world_size, dtype, device
+        )  # no-op after 1st call
+        fsdp_param.alloc_all_gather_output()
+    all_gather_output = all_gather_output.view(world_size, -1)
+    out = [
+        fsdp_param.all_gather_output.view(world_size, -1) for fsdp_param in fsdp_params
+    ]
+    torch.split_with_sizes_copy(
+        all_gather_output, all_gather_input_numels, dim=1, out=out
+    )
+
+
+@torch.no_grad()
+def foreach_reduce_scatter(
+    fsdp_params: List[FSDPParam],
+    unsharded_grads: List[torch.Tensor],
+    group: dist.ProcessGroup,
+    reduce_scatter_stream: torch.cuda.Stream,
+    orig_dtype: torch.dtype,
+    reduce_dtype: Optional[torch.dtype],
+    device: torch.device,
+    divide_factors: Optional[Tuple[float, float]],
+) -> torch.cuda.Event:
+    """
+    ``unsharded_grads`` owns the references to the gradients computed by
+    autograd, so clearing the list frees the gradients.
+    """
+    grad_dtypes = {grad.dtype for grad in unsharded_grads}
+    if len(grad_dtypes) != 1:
+        # Check this at runtime since it could be a real runtime error if e.g.
+        # fp8 weights do not produce the correct higher precision gradients
+        _raise_assert_with_print(
+            f"FSDP reduce-scatter expects uniform gradient dtype but got {grad_dtypes}"
+        )
+    grad_dtype = unsharded_grads[0].dtype
+    reduce_dtype = reduce_dtype or grad_dtype
+    world_size = group.size()
+    padded_unsharded_sizes = tuple(
+        _get_dim0_padded_size(grad.size(), world_size) for grad in unsharded_grads
+    )
+    reduce_scatter_input_numel = sum(s.numel() for s in padded_unsharded_sizes)
+    reduce_scatter_output_numel = reduce_scatter_input_numel // world_size
+    current_stream = torch.cuda.current_stream()
+    reduce_scatter_stream.wait_stream(current_stream)
+    with torch.cuda.stream(reduce_scatter_stream):
+        reduce_scatter_input = torch.empty(
+            (reduce_scatter_input_numel,), dtype=reduce_dtype, device=device
+        )
+        foreach_reduce_scatter_copy_in(
+            unsharded_grads, reduce_scatter_input, world_size
+        )
+        # Only after the copy-in finishes can we free the gradients, which were
+        # computed in the default stream
+        current_stream.wait_stream(reduce_scatter_stream)
+        unsharded_grads.clear()
+        reduce_scatter_output = reduce_scatter_input.new_empty(
+            (reduce_scatter_output_numel,)
+        )
+        _reduce_scatter(
+            reduce_scatter_output, reduce_scatter_input, group, divide_factors
+        )
+        reduce_scatter_output = _to_dtype_if_needed(reduce_scatter_output, orig_dtype)
+        # - View out and accumulate
+        flat_grad_offset = 0  # [0, reduce_scatter_output_numel - 1]
+        for padded_unsharded_size, fsdp_param in zip(
+            padded_unsharded_sizes, fsdp_params
+        ):
+            new_sharded_grad = torch.as_strided(
+                reduce_scatter_output,
+                size=fsdp_param.sharded_size,
+                stride=fsdp_param.contiguous_sharded_stride,
+                storage_offset=flat_grad_offset,
+            )
+            to_accumulate_grad = fsdp_param.sharded_param.grad is not None
+            new_sharded_dtensor_grad = fsdp_param.to_sharded_dtensor(new_sharded_grad)
+            if to_accumulate_grad:
+                fsdp_param.sharded_param.grad += new_sharded_dtensor_grad
+            else:
+                fsdp_param.sharded_param.grad = new_sharded_dtensor_grad
+            padded_sharded_numel = padded_unsharded_size.numel() // world_size
+            flat_grad_offset += padded_sharded_numel
+        reduce_scatter_view_out_event = reduce_scatter_stream.record_event()
+    # The reduce-scatter output is allocated in the reduce-scatter stream and
+    # used in the default stream (for the optimizer step). No extra
+    # synchronization is needed to keep its memory from being reused by later
+    # reduce-scatters, since the sharded parameters hold references to it
+    # through the end of backward.
+    return reduce_scatter_view_out_event
+
+
+def foreach_reduce_scatter_copy_in(
+    unsharded_grads: List[torch.Tensor],
+    reduce_scatter_input: torch.Tensor,
+    world_size: int,
+) -> None:
+    grad_views: List[torch.Tensor] = []
+    grads_to_copy: List[torch.Tensor] = []
+    padded_grad_slices: List[torch.Tensor] = []
+    for grad in unsharded_grads:
+        grad_size = grad.size()
+        dim0_padded_size = _get_dim0_padded_size(grad_size, world_size)
+        if dim0_padded_size != grad_size:
+            padded_grad = grad.new_empty(dim0_padded_size)
+            padded_grad_slices.append(padded_grad[: grad.size(0)])
+            grads_to_copy.append(grad)
+            grad = padded_grad
+        grad_views.append(grad.view(world_size, -1))
+    if padded_grad_slices:
+        torch._foreach_copy_(padded_grad_slices, grads_to_copy)
+    torch.cat(grad_views, dim=-1, out=reduce_scatter_input.view(world_size, -1))
+
+
+def _reduce_scatter(
+    output: torch.Tensor,
+    input: torch.Tensor,
+    group: dist.ProcessGroup,
+    divide_factors: Optional[Tuple[float, float]],
+) -> None:
+    if divide_factors:
+        predivide_factor, postdivide_factor = divide_factors
+        _div_if_needed(input, predivide_factor)
+        dist.reduce_scatter_tensor(output, input, group=group)
+        _div_if_needed(output, postdivide_factor)
+    else:
+        # Using NCCL's reduce-scatter to do the division by world size saves
+        # extra memory read/write from a separate division kernel
+        dist.reduce_scatter_tensor(output, input, op=ReduceOp.AVG, group=group)
+
+
+def _div_if_needed(tensor: torch.Tensor, div_factor: float) -> None:
+    if div_factor > 1:
+        tensor.div_(div_factor)
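
The collectives above rely on one flat buffer per all-gather: on copy-in each rank packs its per-parameter shards into its contiguous slice, and on copy-out the `(world_size, -1)` view is split back into per-parameter outputs. The sketch below reproduces just that layout on a single process with plain CPU tensors; no process group or real all-gather is involved, so the slice for the other rank stays zero.

import torch

world_size = 2
rank = 0  # pretend to be rank 0 of the hypothetical group

# Two parameters' shards on this rank, already flattened to 1D.
shards = [torch.arange(6.0), torch.arange(4.0)]
split_sizes = [s.numel() for s in shards]
numel = sum(split_sizes)

# Copy-in: pack this rank's shards into its slice of one flat output buffer.
all_gather_output = torch.zeros(numel * world_size)
my_slice = all_gather_output.narrow(0, numel * rank, numel)
torch._foreach_copy_(list(torch.split(my_slice, split_sizes)), shards)

# (A real run would now call dist.all_gather_into_tensor on the flat buffer.)

# Copy-out: view as (world_size, per_rank_numel) and split per parameter.
out = [torch.empty(world_size, n) for n in split_sizes]
torch.split_with_sizes_copy(
    all_gather_output.view(world_size, -1), split_sizes, dim=1, out=out
)
assert out[0][rank].equal(shards[0]) and out[1][rank].equal(shards[1])
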
diff --git a/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/_fsdp_common.py b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/_fsdp_common.py
new file mode 100644
index 0000000000000000000000000000000000000000..5aca875bedd6d154a18fc52db30861861566c025
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/_fsdp_common.py
@@ -0,0 +1,151 @@
+import math
+import traceback
+
+from dataclasses import dataclass
+from enum import auto, Enum
+from typing import Any, cast, List, Optional, Tuple
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from torch.distributed._composable.contract import _get_registry
+from torch.distributed._tensor import DeviceMesh, DTensor, Placement
+
+
+@dataclass
+class DataParallelMeshInfo:
+    mesh: DeviceMesh
+    shard_mesh_dim: Optional[int] = None
+    replicate_mesh_dim: Optional[int] = None
+
+    def __post_init__(self):
+        if self.shard_mesh_dim is None and self.replicate_mesh_dim is None:
+            raise AssertionError(
+                "At least one of shard_mesh_dim and replicate_mesh_dim must not be None"
+            )
+
+
+@dataclass
+class FSDPMeshInfo(DataParallelMeshInfo):
+    def __post_init__(self):
+        super().__post_init__()
+        if self.shard_mesh_dim is None:
+            raise AssertionError("Expects non-None shard_mesh_dim")
+        self.shard_mesh_size: int = self.mesh.size(self.shard_mesh_dim)
+        self.shard_process_group = cast(
+            dist.ProcessGroup, self.mesh.get_group(self.shard_mesh_dim)
+        )
+        self.shard_mesh_rank: int = self.shard_process_group.rank()
+
+
+@dataclass
+class DDPMeshInfo(DataParallelMeshInfo):
+    def __post_init__(self):
+        super().__post_init__()
+        if self.replicate_mesh_dim is None:
+            raise AssertionError("Expects non-None replicate_mesh_dim")
+        self.replicate_mesh_size: int = self.mesh.size(self.replicate_mesh_dim)
+        self.replicate_process_group = cast(
+            dist.ProcessGroup, self.mesh.get_group(self.replicate_mesh_dim)
+        )
+        self.replicate_mesh_rank: int = self.replicate_process_group.rank()
+
+
+@dataclass
+class HSDPMeshInfo(FSDPMeshInfo, DDPMeshInfo):
+    def __post_init__(self):
+        # Calls `FSDPMeshInfo` -> `DDPMeshInfo` -> `DataParallelMeshInfo`
+        super().__post_init__()
+
+
+class TrainingState(Enum):
+    """Describes the training state of one FSDP state / parameter group."""
+
+    # Transition to forward starting pre-forward until post-forward
+    FORWARD = auto()
+    # Transition to pre-backward when unsharding in backward
+    PRE_BACKWARD = auto()
+    # Transition to post-backward when resharding and reducing gradients
+    POST_BACKWARD = auto()
+    # Idle before/after forward or before pre-backward/after post-backward
+    IDLE = auto()
+
+
+def _raise_assert_with_print(*args: Any, **kwargs: Any):
+    print(f"[Rank {dist.get_rank()}] ", end="")
+    print(*args, **kwargs)
+    traceback.print_stack()
+    raise AssertionError(*args, **kwargs)
+
+
+def _is_composable_with_fsdp(module: nn.Module) -> bool:
+    registry = _get_registry(module)
+    if registry is None:
+        return True
+    # The registry is keyed by the composable API's function name (e.g. "replicate")
+    return "replicate" not in registry
+
+
+def _get_dim0_padded_size(tensor_size: torch.Size, dim0_factor: int) -> torch.Size:
+    padded_dim0 = math.ceil(tensor_size[0] / dim0_factor) * dim0_factor
+    return cast(torch.Size, torch.Size([padded_dim0]) + tensor_size[1:])
+
+
+def _chunk_with_empty(
+    tensor: torch.Tensor, num_chunks: int, dim: int
+) -> List[torch.Tensor]:
+    chunks = list(torch.chunk(tensor, num_chunks, dim=dim))
+    while len(chunks) < num_chunks:
+        chunks.append(chunks[0].new_empty(0))
+    return chunks
+
+
+def _get_dim0_chunked_size(
+    chunk: torch.Tensor, unchunked_size: torch.Size
+) -> torch.Size:
+    if chunk.numel() > 0:
+        return chunk.size()
+    # For 0 numel, we need to preserve trailing dims for DTensor APIs
+    return cast(torch.Size, torch.Size([0]) + unchunked_size[1:])
+
+
+def _from_local_no_grad(
+    local_tensor: torch.Tensor,
+    device_mesh: DeviceMesh,
+    placements: Tuple[Placement, ...],
+    global_size: torch.Size,
+    global_stride: Tuple[int, ...],
+) -> DTensor:
+    """
+    This method is similar to ``DTensor.from_local()`` except it avoids some
+    CPU overhead by avoiding default args and not being differentiable.
+    """
+    return DTensor(
+        # Use the local tensor directly instead of constructing a new tensor
+        # variable, e.g. with `view_as()`, since this is not differentiable
+        local_tensor,
+        device_mesh,
+        placements,
+        shape=global_size,
+        dtype=local_tensor.dtype,
+        requires_grad=local_tensor.requires_grad,
+        stride=global_stride,
+    )
+
+
+def _to_dtype_if_needed(
+    tensor: torch.Tensor, dtype: Optional[torch.dtype]
+) -> torch.Tensor:
+    if dtype is not None and tensor.dtype != dtype:
+        return tensor.to(dtype)
+    return tensor
+
+
+def _cast_fp_tensor(dtype: torch.dtype, x: torch.Tensor) -> torch.Tensor:
+    if (
+        not isinstance(x, torch.Tensor)
+        or not torch.is_floating_point(x)
+        or x.dtype == dtype
+    ):
+        return x
+    return x.to(dtype)
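
The dim-0 padding and chunking helpers above are pure tensor-shape logic, so they can be exercised without a process group. A short sketch, assuming this vendored build is importable under the path shown:

import torch
from torch.distributed._composable.fsdp._fsdp_common import (
    _chunk_with_empty,
    _get_dim0_chunked_size,
    _get_dim0_padded_size,
)

world_size = 4
param = torch.randn(5, 3)  # dim-0 not divisible by the shard world size

# Padded size rounds dim-0 up to a multiple of the shard world size.
assert _get_dim0_padded_size(param.size(), world_size) == torch.Size([8, 3])

# Chunking pads with empty chunks so every rank gets exactly one chunk.
chunks = _chunk_with_empty(param, world_size, dim=0)
assert [c.size(0) for c in chunks] == [2, 2, 1, 0]

# The 0-numel chunk still reports trailing dims for DTensor bookkeeping.
assert _get_dim0_chunked_size(chunks[3], param.size()) == torch.Size([0, 3])
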
diff --git a/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/_fsdp_init.py b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/_fsdp_init.py
new file mode 100644
index 0000000000000000000000000000000000000000..80b6453309cd139c262bb28dba8ddaf6599b66fe
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/_fsdp_init.py
@@ -0,0 +1,144 @@
+import itertools
+from typing import List, Optional, Set, Tuple, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+
+from torch.distributed._tensor import DeviceMesh, DTensor, init_device_mesh
+from torch.distributed.device_mesh import _get_device_handle
+from ._fsdp_common import _is_composable_with_fsdp, FSDPMeshInfo, HSDPMeshInfo
+from ._fsdp_state import _get_module_fsdp_state
+
+
+def _get_post_forward_mesh_info(
+    reshard_after_forward: Union[bool, int], mesh_info: FSDPMeshInfo
+) -> Optional[FSDPMeshInfo]:
+    shard_mesh_size = mesh_info.shard_mesh_size
+    if not isinstance(reshard_after_forward, (bool, int)):
+        raise ValueError(
+            "reshard_after_forward should be a bool or an int representing the "
+            f"group size to reshard to, not {reshard_after_forward}"
+        )
+    # NOTE: `isinstance(False, int)` returns `True`.
+    if not isinstance(reshard_after_forward, bool) and isinstance(
+        reshard_after_forward, int
+    ):
+        if (
+            reshard_after_forward < 1
+            or reshard_after_forward > shard_mesh_size
+            or shard_mesh_size % reshard_after_forward != 0
+        ):
+            raise ValueError(
+                "If passing reshard_after_forward as an int, it should be a "
+                f"factor of {shard_mesh_size}, not {reshard_after_forward}"
+            )
+        elif reshard_after_forward == 1:
+            reshard_after_forward = False
+        elif reshard_after_forward == shard_mesh_size:
+            reshard_after_forward = True
+    post_forward_mesh_info = None
+    if reshard_after_forward is True:
+        post_forward_mesh_info = mesh_info
+    elif reshard_after_forward is not False:  # int case
+        # For HSDP, we can flatten the two replicate dims into the 0th dim
+        post_forward_mesh_tensor = mesh_info.mesh.mesh.view(-1, reshard_after_forward)
+        post_forward_mesh = DeviceMesh(
+            mesh_info.mesh.device_type, post_forward_mesh_tensor
+        )
+        post_forward_mesh_info = HSDPMeshInfo(
+            post_forward_mesh, shard_mesh_dim=1, replicate_mesh_dim=0
+        )
+    return post_forward_mesh_info
+
+
+def _init_default_fully_shard_mesh() -> DeviceMesh:
+    """Default to global CUDA mesh if possible else global CPU mesh."""
+    if not dist.distributed_c10d.is_initialized():
+        dist.distributed_c10d.init_process_group()
+    default_pg = dist.distributed_c10d._get_default_group()
+    device_type = "cuda" if torch.cuda.is_available() else "cpu"
+    mesh = init_device_mesh(device_type, mesh_shape=(default_pg.size(),))
+    return mesh
+
+
+def _get_device_from_mesh(mesh: DeviceMesh) -> torch.device:
+    if mesh.device_type == "cpu":
+        return torch.device("cpu")
+    device_handle = _get_device_handle(mesh.device_type)
+    return torch.device(mesh.device_type, device_handle.current_device())
+
+
+def _get_managed_modules(root_module: nn.Module) -> List[nn.Module]:
+    modules: List[nn.Module] = []
+    # Track visited modules to avoid visiting shared modules multiple times
+    visited_modules: Set[nn.Module] = set()
+
+    def dfs(module: nn.Module) -> None:
+        """
+        Runs a DFS to collect managed modules, not recursing into modules with
+        a non-composable API or ``fully_shard`` already applied.
+        """
+        if not _is_composable_with_fsdp(module):
+            return
+        elif module is not root_module and _get_module_fsdp_state(module) is not None:
+            return  # nested `fully_shard` module
+        visited_modules.add(module)
+        for submodule in module.children():
+            if submodule not in visited_modules:
+                dfs(submodule)
+        modules.append(module)
+
+    dfs(root_module)
+    return modules
+
+
+def _get_managed_states(
+    modules: List[nn.Module],
+) -> Tuple[List[nn.Parameter], List[torch.Tensor]]:
+    params: List[nn.Parameter] = []
+    buffers: List[torch.Tensor] = []
+    # Track visited parameters/buffers to avoid visiting shared parameters and
+    # buffers multiple times
+    visited_params: Set[nn.Parameter] = set()
+    visited_buffers: Set[torch.Tensor] = set()
+    for module in modules:
+        for param in module.parameters(recurse=False):
+            if param not in visited_params:
+                params.append(param)
+                visited_params.add(param)
+        for buffer in module.buffers(recurse=False):
+            if buffer not in visited_buffers:
+                buffers.append(buffer)
+                visited_buffers.add(buffer)
+    return params, buffers
+
+
+def _move_states_to_device(
+    params: List[nn.Parameter],
+    buffers: List[torch.Tensor],
+    device: torch.device,
+    mesh_info: FSDPMeshInfo,
+) -> None:
+    """
+    We have FSDP move states to device for simpler and faster initialization
+    since FSDP almost always uses CUDA for training. We move parameters/buffers
+    rather than modules to support ignoring parameters/buffers in the future.
+    """
+    # TODO: De-duplicate with `_apply` after `swap_tensors` path lands:
+    # https://github.com/pytorch/pytorch/issues/115792
+    for tensor in itertools.chain(params, buffers):
+        if tensor.device == device or tensor.device.type == "meta":
+            # Keep meta-device tensors on meta device for deferred init
+            continue
+        if isinstance(tensor, DTensor):
+            if (dtensor_mesh_type := tensor._spec.mesh.device_type) != device.type:
+                raise ValueError(
+                    "Requires DTensor to have mesh of the same type as the FSDP mesh "
+                    f"but got {dtensor_mesh_type} for DTensor and {device.type} for FSDP"
+                )
+            raise AssertionError(
+                f"Expects DTensor to be moved to {dtensor_mesh_type} but got {tensor.device}"
+            )
+        tensor.data = tensor.to(device)
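
`_get_post_forward_mesh_info` above folds `reshard_after_forward` into three cases: `1` behaves like `False` (keep the parameters unsharded after forward), the shard mesh size behaves like `True` (reshard back to the full shard mesh), and any other factor reshards to a smaller mesh of that size. The helper below is a hypothetical, pure-Python mirror of just that normalization, for illustration only; it is not part of this package.

from typing import Union


def normalize_reshard_after_forward(
    value: Union[bool, int], shard_mesh_size: int
) -> Union[bool, int]:
    if isinstance(value, bool):
        return value
    if value < 1 or value > shard_mesh_size or shard_mesh_size % value != 0:
        raise ValueError(f"{value} is not a factor of {shard_mesh_size}")
    if value == 1:
        return False  # keep parameters unsharded after forward
    if value == shard_mesh_size:
        return True  # reshard back to the full shard mesh
    return value  # reshard to a smaller mesh of this size


assert normalize_reshard_after_forward(1, 8) is False
assert normalize_reshard_after_forward(8, 8) is True
assert normalize_reshard_after_forward(4, 8) == 4
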
diff --git a/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/_fsdp_param.py b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/_fsdp_param.py
new file mode 100644
index 0000000000000000000000000000000000000000..275e2f4f980a54e9f7d9f71243e5ba3b3c8f9a67
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/_fsdp_param.py
@@ -0,0 +1,438 @@
+from dataclasses import dataclass, field
+from enum import auto, Enum
+from typing import cast, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+
+from torch._prims_common import make_contiguous_strides_for
+from torch.distributed._functional_collectives import AsyncCollectiveTensor
+from torch.distributed._tensor import DTensor, Placement, Replicate, Shard
+from torch.distributed._tensor.device_mesh import _mesh_resources
+from torch.distributed._tensor.placement_types import DTensorSpec
+from ._fsdp_api import MixedPrecisionPolicy
+from ._fsdp_common import (
+    _chunk_with_empty,
+    _from_local_no_grad,
+    _get_dim0_chunked_size,
+    _raise_assert_with_print,
+    _to_dtype_if_needed,
+    FSDPMeshInfo,
+    HSDPMeshInfo,
+)
+
+"""
+[Note: FSDP tensors]
+FSDP considers the following tensors:
+- Original parameter: parameter passed to :class:`FSDPParam`, i.e. the one
+  on the module when applying FSDP
+- Sharded parameter: sharding the original parameter on dim-0 as a DTensor
+  over the main mesh
+- All-gather input: the ``torch.Tensor`` passed to all-gather, derived from the
+  sharded parameter
+- All-gather output: the ``torch.Tensor`` resulting from all-gathering the
+  all-gather input
+- Unsharded parameter: parameter used for forward/backward computation, derived
+  from the all-gather output; autograd leaf
+
+We define these tensors to describe the general framework that can accommodate
+extensions, where:
+- all-gather-input = pre-all-gather-transform(sharded-parameter)
+- unsharded-parameter = post-all-gather-transform(all-gather-output)
+
+For the default ``torch.Tensor`` case, the sharded parameter and all-gather
+input share the same underlying tensor data, meaning that they can be thought
+of as the same tensors. The same applies for the all-gather output and
+unsharded parameter. For non-``torch.Tensor`` extensions, these equivalences
+may no longer hold due to the pre/post-all-gather transforms.
+
+[Note: FSDP and autograd]
+FSDP dynamically frees and allocates the unsharded parameter. Since autograd
+can pack a reference to it or a view to save for backward, we use storage
+resizing to implement the freeing/allocation since that preserves the aliasing.
+This implies that we construct the unsharded parameter object once and write to
+it in-place thereafter. For the default ``torch.Tensor`` original parameter
+case, the all-gather output and unsharded parameter share the same
+data, so we use storage resizing on the all-gather output.
+"""
+
+
+class ShardedState(Enum):
+    """
+    - ``SHARDED``: The sharded parameter is registered to the module. It is the
+      only contributor to parameter memory.
+    - ``SHARDED_POST_FORWARD``: The unsharded parameter is resharded to a
+      smaller world size. Since this data should not be used for computation,
+      we do not register it to the module. Users should reshard the module
+      before any in-place modifications. Both it and the sharded parameter
+      contribute to parameter memory.
+    - ``UNSHARDED``: The unsharded parameter is registered to the module. Both
+      it and the sharded parameter contribute to parameter memory.
+    """
+
+    SHARDED = auto()
+    SHARDED_POST_FORWARD = auto()
+    UNSHARDED = auto()
+
+
+@dataclass
+class ParamModuleInfo:
+    """
+    For a parameter, this stores the module and the parameter name to be able
+    to do a parameter swap via ``setattr(module, param_name, ...)`` or to get
+    the parameter via ``getattr(module, param_name)``. We additionally save
+    shared modules and shared parameter names to update them accordingly.
+    """
+
+    # Parameter names are unprefixed, e.g. "weight", not "lin.weight"
+    module: nn.Module
+    param_name: str
+    shared_modules: List[nn.Module] = field(default_factory=list)
+    shared_param_names: List[str] = field(default_factory=list)
+
+
+class FSDPParam:
+    """
+    This class manages a parameter with FSDP or FSDP variants applied,
+    implementing dim-0 per-parameter sharding.
+    """
+
+    orig_dtype: torch.dtype
+    param_dtype: Optional[torch.dtype]
+    reduce_dtype: Optional[torch.dtype]
+    _orig_size: torch.Size  # ND
+    _contiguous_orig_stride: Tuple[int, ...]
+    sharded_size: torch.Size  # ND
+    contiguous_sharded_stride: Tuple[int, ...]
+    padded_sharded_param_size: torch.Size  # ND
+    sharded_post_forward_size: torch.Size  # ND
+    contiguous_sharded_post_forward_stride: Tuple[int, ...]
+    _sharded_param_data: torch.Tensor  # 1D
+    sharded_param: nn.Parameter  # ND
+    _sharded_post_forward_param_data: Optional[torch.Tensor]  # 1D
+    _sharded_post_forward_param: Optional[nn.Parameter]  # ND
+    _unsharded_param: nn.Parameter  # ND
+    _global_placements: Tuple[Placement, ...]
+    _global_size: torch.Size
+    _global_stride: Tuple[int, ...]
+    # DTensor attributes (only defined for DTensor `param`):
+    _tp_spec: DTensorSpec
+
+    def __init__(
+        self,
+        param: nn.Parameter,
+        module_info: ParamModuleInfo,
+        mesh_info: FSDPMeshInfo,
+        post_forward_mesh_info: Optional[FSDPMeshInfo],
+        device: torch.device,
+        mp_policy: MixedPrecisionPolicy,
+    ):
+        self._module_info: ParamModuleInfo = module_info
+        self.mesh_info = mesh_info
+        self.post_forward_mesh_info = post_forward_mesh_info
+        self.device = device
+        self._init_sharded_param(param, device)
+        if self.post_forward_mesh_info:
+            self._init_sharded_post_forward_param_metadata(param)
+        self.all_gather_output = torch.empty(0)
+        self._param_fqn: Optional[str] = None  # prefixed from root module
+
+    @torch.no_grad()
+    def _init_sharded_param(self, param: nn.Parameter, device: torch.device):
+        if param.device != device and param.device.type != "meta":
+            raise AssertionError(
+                f"Expects the parameter to already be moved to device {device} but got {param.device}"
+            )
+        # TODO: Replace the sharded DTensor parameter construction logic with
+        # `distribute_tensor` after https://github.com/pytorch/pytorch/issues/116101
+        # TODO: Simplify the following sharded parameter padding logic after
+        # https://github.com/pytorch/pytorch/issues/113045
+        self.is_dtensor = isinstance(param, DTensor)
+        if self.is_dtensor:
+            self._tp_spec = cast(DTensor, param)._spec
+            if (
+                self.mesh_info.shard_mesh_dim != 0
+                or self.mesh_info.replicate_mesh_dim is not None
+            ):
+                raise NotImplementedError("Using TP with HSDP is not supported")
+            dp_mesh, tp_mesh = (self.mesh_info.mesh, self._tp_spec.mesh)
+            dp_global_mesh = _mesh_resources.get_parent_mesh(dp_mesh)
+            tp_global_mesh = _mesh_resources.get_parent_mesh(tp_mesh)
+            if dp_global_mesh != tp_global_mesh or (
+                dp_global_mesh is None or tp_global_mesh is None
+            ):
+                raise AssertionError(
+                    "FSDP requires the DP and TP mesh to have the same parent mesh but got: \n"
+                    f"DP's global mesh: {dp_global_mesh}\nTP's global mesh: {tp_global_mesh}"
+                )
+            self._global_mesh = dp_global_mesh
+            if len(self._tp_spec.placements) != 1:
+                raise NotImplementedError(
+                    f"FSDP only supports 1D TP, not {self._tp_spec.placements}"
+                )
+            global_placements: List[Placement] = [Replicate(), Replicate()]
+            global_dp_mesh_dim = _mesh_resources.get_parent_mesh_dim(dp_mesh)
+            global_tp_mesh_dim = _mesh_resources.get_parent_mesh_dim(tp_mesh)
+            assert global_dp_mesh_dim is not None  # mypy
+            assert global_tp_mesh_dim is not None  # mypy
+            # TODO: Hard code FSDP + TP; need to support HSDP + TP
+            global_placements[global_dp_mesh_dim] = Shard(0)
+            global_placements[global_tp_mesh_dim] = self._tp_spec.placements[0]
+            self._global_placements = tuple(global_placements)
+            self._global_size = param.size()
+            self._global_stride = param.stride()
+            param_data = cast(DTensor, param)._local_tensor
+        else:
+            self._global_mesh = self.mesh_info.mesh
+            self._global_placements = (Shard(0),)
+            self._global_size = param.size()
+            self._global_stride = param.stride()
+            param_data = param
+        self._orig_size = param_data.size()
+        self._contiguous_orig_stride = make_contiguous_strides_for(self._orig_size)
+        shard_rank = self.mesh_info.shard_mesh_rank
+        shard_world_size = self.mesh_info.shard_mesh_size
+        chunks = _chunk_with_empty(param_data, shard_world_size, dim=0)
+        sharded_param = chunks[shard_rank]
+        self.sharded_size = _get_dim0_chunked_size(sharded_param, param_data.size())
+        self.contiguous_sharded_stride = make_contiguous_strides_for(self.sharded_size)
+        padded_sharded_size = chunks[0].size()  # 0th always padded
+        padded_sharded_param = param_data.new_zeros(padded_sharded_size)
+        self.padded_sharded_param_size = padded_sharded_param.size()
+        if sharded_param.numel() > 0:
+            padded_sharded_param[: sharded_param.size(0)].copy_(sharded_param)
+        self._sharded_param_data = padded_sharded_param.view(-1)
+        self.sharded_param = nn.Parameter(
+            self.to_sharded_dtensor(padded_sharded_param[: sharded_param.size(0)])
+        )
+        self.sharded_param.requires_grad_(param.requires_grad)
+        # Let `param_data` be freed normally once its ref count reaches 0 (i.e.
+        # when the `fully_shard` call returns) to allow provided parameters to alias
+        self._setattr_on_modules(self.sharded_param)
+        self.sharded_state = ShardedState.SHARDED
+
+    def _init_sharded_post_forward_param_metadata(self, param: torch.Tensor) -> None:
+        mesh_info = self.post_forward_mesh_info
+        assert mesh_info is not None  # mypy
+        param_data = param._local_tensor if isinstance(param, DTensor) else param
+        chunks = _chunk_with_empty(param_data, mesh_info.shard_mesh_size, dim=0)
+        self.sharded_post_forward_size = _get_dim0_chunked_size(
+            chunks[mesh_info.shard_mesh_rank], param_data.size()
+        )
+        self.contiguous_sharded_post_forward_stride = make_contiguous_strides_for(
+            self.sharded_post_forward_size
+        )
+
+    def init_dtype_attrs(self, mp_policy: MixedPrecisionPolicy):
+        param_dtype, reduce_dtype = (mp_policy.param_dtype, mp_policy.reduce_dtype)
+        self.orig_dtype = self.sharded_param.dtype
+        # Clamp `param_dtype` to `None` if no casting is required
+        if param_dtype == self.orig_dtype:
+            param_dtype = None
+        self.param_dtype = param_dtype
+        self.reduce_dtype = reduce_dtype
+        # `None` indicates that mixed precision is not enabled
+
+    def init_all_gather_output(
+        self,
+        all_gather_input_numel: int,
+        world_size: int,
+        dtype: torch.dtype,
+        device: torch.device,
+    ):
+        if self.all_gather_output.numel() > 0:
+            return  # already initialized
+        all_gather_output_size = torch.Size([all_gather_input_numel * world_size])
+        self.all_gather_output = torch.empty(
+            all_gather_output_size, dtype=dtype, device=device
+        )
+
+    def init_unsharded_param(self):
+        if hasattr(self, "_unsharded_param"):
+            return  # already initialized
+        # For the default path (no post-all-gather), the all-gather output
+        # gives the unsharded parameter data directly
+        unsharded_param = torch.as_strided(
+            self.all_gather_output,
+            self._orig_size,
+            self._contiguous_orig_stride,
+            storage_offset=0,
+        )
+        if self.is_dtensor:
+            unsharded_param = _from_local_no_grad(
+                unsharded_param,
+                self._tp_spec.mesh,
+                self._tp_spec.placements,
+                self._global_size,
+                self._global_stride,
+            )
+        self._unsharded_param = nn.Parameter(unsharded_param)
+        self._unsharded_param.requires_grad_(self.sharded_param.requires_grad)
+
+    def to_sharded(self) -> None:
+        self._setattr_on_modules(self.sharded_param)
+        self.free_all_gather_output()
+        self.sharded_state = ShardedState.SHARDED
+
+    def to_sharded_post_forward(self) -> None:
+        if self.is_dtensor:
+            raise NotImplementedError(
+                "Resharding to smaller mesh with TP is not supported yet"
+            )
+        self._assert_in_states(ShardedState.UNSHARDED)
+        assert self.post_forward_mesh_info is not None  # mypy
+        shard_world_size = self.post_forward_mesh_info.shard_mesh_size
+        if (numel := self.all_gather_output.numel()) % shard_world_size != 0:
+            _raise_assert_with_print(
+                f"All-gather output size ({numel}) must be divisible by the shard "
+                f"world size ({shard_world_size})"
+            )
+        shard_rank = self.post_forward_mesh_info.shard_mesh_rank
+        sharded_numel = numel // shard_world_size
+        self._sharded_post_forward_param_data = (
+            self.all_gather_output.narrow(0, sharded_numel * shard_rank, sharded_numel)
+        ).clone()  # clone to be able to free all-gather output
+        sharded_post_forward_tensor = torch.as_strided(
+            self._sharded_post_forward_param_data,
+            size=self.sharded_post_forward_size,
+            stride=self.contiguous_sharded_post_forward_stride,
+            storage_offset=0,
+        )
+        self._sharded_post_forward_param = nn.Parameter(
+            self.to_sharded_post_forward_dtensor(sharded_post_forward_tensor)
+        )
+        self._setattr_on_modules(self._sharded_post_forward_param)
+        self.free_all_gather_output()
+        self.sharded_state = ShardedState.SHARDED_POST_FORWARD
+
+    def to_unsharded(self) -> None:
+        # Assume that the data has been allocated and all-gathered
+        set_requires_grad_if_needed(self.sharded_param, self._unsharded_param)
+        self._setattr_on_modules(self._unsharded_param)
+        if self.sharded_state == ShardedState.SHARDED_POST_FORWARD:
+            # The data is allocated in the default stream via the post-forward
+            # reshard and must be kept alive for the next all-gather copy-in.
+            # Since we call this method after the copy-out, the data's lifetime
+            # is ensured without further synchronization.
+            self._sharded_post_forward_param = None
+            self._sharded_post_forward_param_data = None  # free
+        self.sharded_state = ShardedState.UNSHARDED
+
+    def _setattr_on_modules(self, param: nn.Parameter) -> None:
+        unsafe_setattr_param(
+            self._module_info.module, self._module_info.param_name, param
+        )
+        for shared_module, shared_param_name in zip(
+            self._module_info.shared_modules, self._module_info.shared_param_names
+        ):
+            unsafe_setattr_param(shared_module, shared_param_name, param)
+
+    def to_sharded_dtensor(self, tensor: torch.Tensor) -> DTensor:
+        """
+        Converts a local tensor representing either the sharded parameter or
+        sharded gradient to DTensor.
+        """
+        if tensor.shape != self.sharded_size:
+            _raise_assert_with_print(
+                f"Expects size {self.sharded_size} but got {tensor.shape}"
+            )
+        return _from_local_no_grad(
+            tensor,
+            self._global_mesh,
+            self._global_placements,
+            self._global_size,
+            self._global_stride,
+        )
+
+    def to_sharded_post_forward_dtensor(self, tensor: torch.Tensor) -> DTensor:
+        if tensor.shape != self.sharded_post_forward_size:
+            _raise_assert_with_print(
+                f"Expects size {self.sharded_post_forward_size} but got {tensor.shape}"
+            )
+        assert isinstance(self.post_forward_mesh_info, HSDPMeshInfo)
+        # TODO: Prefer this DTensor to be read-only and generalize the
+        # placement once we support TP.
+        return _from_local_no_grad(
+            tensor,
+            self.post_forward_mesh_info.mesh,
+            (Replicate(), Shard(0)),
+            self._global_size,
+            self._global_stride,
+        )
+
+    def alloc_all_gather_output(self) -> None:
+        unsafe_alloc_storage(self.all_gather_output)
+
+    def free_all_gather_output(self) -> None:
+        unsafe_free_storage(self.all_gather_output)
+
+    @property
+    def all_gather_input(self) -> torch.Tensor:  # 1D
+        self._assert_in_states(ShardedState.SHARDED, ShardedState.SHARDED_POST_FORWARD)
+        if self.sharded_state == ShardedState.SHARDED:
+            return _to_dtype_if_needed(self._sharded_param_data, self.param_dtype)
+        elif self.sharded_state == ShardedState.SHARDED_POST_FORWARD:
+            return _to_dtype_if_needed(
+                cast(torch.Tensor, self._sharded_post_forward_param_data),
+                self.param_dtype,
+            )
+        return torch.empty(0)  # mypy
+
+    @property
+    def unsharded_param(self) -> nn.Parameter:  # ND
+        self._assert_in_states(ShardedState.UNSHARDED)
+        return self._unsharded_param
+
+    @property
+    def unsharded_grad_data(self) -> torch.Tensor:
+        grad = self.unsharded_param.grad
+        assert grad is not None, "Expects unsharded_param.grad to not be None"
+        return self._get_grad_inner_tensor(grad)
+
+    def _get_grad_inner_tensor(self, grad: torch.Tensor) -> torch.Tensor:
+        if self.is_dtensor:
+            if isinstance(grad, AsyncCollectiveTensor):
+                grad = grad.wait()
+            grad = cast(DTensor, grad)._local_tensor
+        return grad
+
+    def _assert_in_states(self, *states: ShardedState) -> None:
+        if self.sharded_state not in states:
+            _raise_assert_with_print(
+                f"Expects to be in one of {states}, not {self.sharded_state}"
+            )
+
+
+# NOTE: Unsafe here refers to not checking whether the storage is already
+# allocated or freed, respectively. We should be safe to use them since we
+# explicitly manage the state transition.
+def unsafe_alloc_storage(tensor: torch.Tensor) -> None:
+    # Skip the already-allocated check and assume that `tensor` is the base
+    # tensor to save CPU overhead
+    tensor.untyped_storage().resize_(tensor.numel() * tensor.itemsize)
+
+
+def unsafe_free_storage(tensor: torch.Tensor) -> None:
+    # Skip the already-freed check to save CPU overhead
+    tensor.untyped_storage().resize_(0)
+
+
+# NOTE: When the module has not overridden `__setattr__`, these bypass the
+# `nn.Module.__setattr__` checks, which incur non-trivial CPU overhead. For
+# FSDP, we know we do not need those checks when transitioning between
+# sharded/unsharded parameters.
+def unsafe_setattr_param(
+    module: nn.Module, param_name: str, param: nn.Parameter
+) -> None:
+    if getattr(module.__setattr__, "__func__", None) is nn.Module.__setattr__:
+        module._parameters[param_name] = param
+    else:  # slow path
+        setattr(module, param_name, param)
+
+
+def set_requires_grad_if_needed(
+    src_tensor: torch.Tensor, dst_tensor: torch.Tensor
+) -> None:
+    # Only call `requires_grad_` if needed to avoid the Python <> C++ context
+    # switch overhead
+    if src_tensor.requires_grad != dst_tensor.requires_grad:
+        dst_tensor.requires_grad_(src_tensor.requires_grad)
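
The `unsafe_alloc_storage` / `unsafe_free_storage` helpers above implement the storage-resizing scheme from [Note: FSDP and autograd]: resizing the backing storage frees or re-allocates the bytes while keeping the tensor object, and hence all of its aliases, intact. A minimal CPU-only sketch of that behavior:

import torch

buf = torch.empty(4, 4)
view = buf.view(-1)  # an alias that must survive the free/alloc cycle

buf.untyped_storage().resize_(0)  # "free": drop the backing bytes
assert buf.untyped_storage().size() == 0  # shape and aliasing metadata remain

buf.untyped_storage().resize_(buf.numel() * buf.itemsize)  # "allocate" again
buf.fill_(1.0)
assert view.sum().item() == 16.0  # the old alias sees the rewritten data
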
diff --git a/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/_fsdp_param_group.py b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/_fsdp_param_group.py
new file mode 100644
index 0000000000000000000000000000000000000000..51546afa3ae277fa7fe6df4281f9e5a3fa22688b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/_fsdp_param_group.py
@@ -0,0 +1,506 @@
+import contextlib
+
+from typing import Any, cast, Dict, List, NamedTuple, Optional, Set, Tuple
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+
+from torch.autograd.graph import Node
+from torch.distributed.fsdp._common_utils import _named_parameters_with_duplicates
+from torch.utils._pytree import tree_flatten, tree_unflatten
+from torch.utils.hooks import RemovableHandle
+from ._fsdp_api import MixedPrecisionPolicy
+from ._fsdp_collectives import (
+    AllGatherResult,
+    foreach_all_gather,
+    foreach_all_gather_copy_out,
+    foreach_reduce_scatter,
+)
+from ._fsdp_common import FSDPMeshInfo, HSDPMeshInfo, TrainingState
+from ._fsdp_param import FSDPParam, ParamModuleInfo, ShardedState
+
+_ModuleToHandleDict = Dict[nn.Module, RemovableHandle]  # for state dict
+
+
+"""
+[Note: Overlapping all-gather copy-in and all-gather]
+For implicit forward prefetching, we want to overlap the next copy-in with the
+current all-gather. We do so using a separate copy-in stream. However, since
+we have the all-gather input as a view into the output, we must make sure to
+copy into different memory from the current all-gather's output. Thus, we keep
+a reference to the current all-gather's output and have the next FSDP parameter
+group free it after its copy-in. Finally, we have the last FSDP state flush the
+reference to avoid holding onto memory after forward.
+"""
+
+
+class FSDPCommContext:
+    """This has the communication state shared across FSDP states/parameter groups."""
+
+    def init(self):
+        # Setting the all-gather/reduce-scatter streams to be higher priority
+        # can help avoid some issues where their copies in/out are delayed and
+        # block computation
+        high_priority = -1
+        # All-gather state and copy-in stream allow overlapping the next
+        # copy-in with the current all-gather in forward; copy-in overlaps with
+        # reduce-scatter in backward without the separate copy-in stream
+        self.all_gather_copy_in_stream = torch.cuda.Stream(priority=high_priority)
+        self.all_gather_state: Optional[AllGatherState] = None
+        # All-gather stream allows overlapping next all-gather with current
+        # forward compute
+        self.all_gather_stream = torch.cuda.Stream(priority=high_priority)
+        # The reduce-scatter stream gives a separate execution "thread" for
+        # post-backward logic such as pre/post-gradient division and the
+        # reduce-scatter itself
+        self.reduce_scatter_stream = torch.cuda.Stream(priority=high_priority)
+        # Post-forward order for explicit backward prefetching
+        self.post_forward_order: List[FSDPParamGroup] = []  # will cause ref cycles
+
+    def get_all_gather_streams(
+        self, training_state: TrainingState
+    ) -> Tuple[torch.cuda.Stream, torch.cuda.Stream]:
+        if training_state in (TrainingState.FORWARD, TrainingState.PRE_BACKWARD):
+            # Use separate streams for implicit prefetching
+            return self.all_gather_copy_in_stream, self.all_gather_stream
+        current_stream = torch.cuda.current_stream()
+        return current_stream, current_stream
+
+
+# See [Note: Overlapping all-gather copy-in and all-gather]
+class AllGatherState(NamedTuple):
+    all_gather_result: AllGatherResult
+    event: torch.cuda.Event  # all-gather copy-out
+
+
+class FSDPParamGroup:
+    """This class represents a parameter group to communicate together."""
+
+    _orig_dtype: torch.dtype
+    _reduce_dtype: Optional[torch.dtype]
+
+    def __init__(
+        self,
+        params: List[nn.Parameter],
+        module: nn.Module,
+        mesh_info: FSDPMeshInfo,
+        post_forward_mesh_info: Optional[FSDPMeshInfo],
+        device: torch.device,
+        mp_policy: MixedPrecisionPolicy,
+    ):
+        self.module = module  # permit ref cycle because 1:1 lifetime
+        param_module_infos = _get_param_module_infos(params, module)
+        self.fsdp_params = [
+            FSDPParam(
+                param, module_info, mesh_info, post_forward_mesh_info, device, mp_policy
+            )
+            for param, module_info in zip(params, param_module_infos)
+        ]
+        self.mesh_info = mesh_info
+        self.post_forward_mesh_info = post_forward_mesh_info
+        self.device = device
+        self.mp_policy = mp_policy
+        self._training_state = TrainingState.IDLE
+        # Group's sharded state always matches its parameters' sharded states
+        self._sharded_state = ShardedState.SHARDED
+        self._module_fqn: Optional[str] = None  # prefixed from root module
+
+        # - Hook state
+        self._module_to_pre_save_state_dict_hook_handle: _ModuleToHandleDict = {}
+        self._module_to_pre_load_state_dict_hook_handle: _ModuleToHandleDict = {}
+
+        # - Communication and communication/computation overlap
+        self.comm_ctx = FSDPCommContext()
+        # Group's indices in the shared post-forward order
+        self._post_forward_indices: List[int] = []
+        # Used to avoid mistargeted backward prefetches when the module is used
+        # in forward but not in backward: for each forward, we record a tuple
+        # of the output's grad fns and later query the autograd engine for
+        # whether any of those grad fns will execute in the current backward to
+        # decide whether to prefetch.
+        self.all_forward_output_grad_fns: Set[Tuple[Node, ...]] = set()
+        # Whether to reduce-scatter or all-reduce gradients, respectively
+        # (can be set to false to save communication during gradient
+        # accumulation); all-reducing without reduce-scatter is disallowed
+        self.reduce_scatter_grads: bool = True
+        self.all_reduce_grads: bool = True
+
+        # - CUDA events for stream synchronization
+        # Holds the all-gather output buffer, sync objects, and metadata
+        self._all_gather_result: Optional[AllGatherResult] = None
+        # Holds the reduce-scatter view-out CUDA event that marks the end of
+        # the group's post-backward (e.g. reduce-scatter and div), which should
+        # be waited on at the end of backward
+        self._reduce_scatter_view_out_event: Optional[torch.cuda.Event] = None
+        # Holds the reshard-after-forward CUDA event when resharding to a
+        # different world size, which should be waited on in the next unshard
+        self._reshard_after_forward_event: Optional[torch.cuda.Event] = None
+
+    # Initialization #
+    def _init_mp_dtypes(self) -> None:
+        for fsdp_param in self.fsdp_params:
+            fsdp_param.init_dtype_attrs(self.mp_policy)
+        orig_dtypes = {fsdp_param.orig_dtype for fsdp_param in self.fsdp_params}
+        if len(orig_dtypes) != 1:
+            # This can be relaxed if we copy-out for the reduce-scatter
+            raise AssertionError(
+                f"FSDP expects uniform original parameter dtype but got {orig_dtypes}"
+            )
+        self._orig_dtype = next(iter(orig_dtypes))
+        reduce_dtypes = {fsdp_param.reduce_dtype for fsdp_param in self.fsdp_params}
+        if len(reduce_dtypes) != 1:
+            # This can be relaxed if we issue one reduce-scatter per reduce
+            # dtype (but we would need a way for users to specify multiple
+            # reduce dtypes)
+            raise AssertionError(
+                f"FSDP expects uniform reduce dtype but got {reduce_dtypes}"
+            )
+        self._reduce_dtype = next(iter(reduce_dtypes))
+
+    def _init_grad_divide_factors(self):
+        data_parallel_world_size = 1
+        data_parallel_world_size *= self.mesh_info.shard_mesh_size
+        if isinstance(self.mesh_info, HSDPMeshInfo):
+            data_parallel_world_size *= self.mesh_info.replicate_mesh_size
+        if self._reduce_dtype == torch.float32:
+            # Use NCCL's AVG op to divide after reduction since it is more
+            # performant and fp32 has sufficient precision
+            self._grad_divide_factors: Optional[Tuple[float, float]] = None
+            return
+        # For N data parallel workers, each worker computes g_i, and they
+        # collectively reduce (g_1 + ... + g_N) / N. To avoid overflow and
+        # underflow, we divide by ~sqrt(N) before and after the reduction.
+        factor: int = 1
+        while (
+            data_parallel_world_size % factor == 0
+            and data_parallel_world_size / factor > factor
+        ):
+            factor *= 2
+        factor = float(factor)
+        self._grad_divide_factors = (factor, data_parallel_world_size / factor)
+
+    def lazy_init(self):
+        param_names_on_meta = [
+            fsdp_param._param_fqn
+            for fsdp_param in self.fsdp_params
+            if fsdp_param.sharded_param.device.type == "meta"
+        ]
+        if param_names_on_meta:
+            raise RuntimeError(
+                "FSDP parameters should be materialized from meta device before training, "
+                f"but the following were still on meta device: {param_names_on_meta}\n"
+                "For example, call module.to_empty(device) to materialize to device and "
+                "call module.reset_parameters() on each module to initialize values."
+            )
+        # Initialize mixed precision attributes lazily in case the user changes
+        # the parameter dtypes after construction time but before forward
+        self._init_mp_dtypes()
+        self._init_grad_divide_factors()
+        self._register_state_dict_hooks()
+
+    # Runtime #
+    def unshard(self, async_op: bool = False):
+        if self._all_gather_result is not None:  # already called, pending wait
+            return
+        if self.is_unsharded:
+            return  # no-op
+        if self._reshard_after_forward_event is not None:
+            # Resharded parameter data is allocated in the default stream and
+            # used in the all-gather streams
+            self._wait_all_gather_streams_on_event(self._reshard_after_forward_event)
+            self._reshard_after_forward_event = None
+        self._all_gather_result = foreach_all_gather(
+            self.fsdp_params,
+            self._all_gather_process_group,
+            async_op,
+            *self.comm_ctx.get_all_gather_streams(self._training_state),
+            self.device,
+        )
+
+    def wait_for_unshard(self):
+        """
+        1. In forward with implicit prefetching, to overlap the current copy-out
+        with the next all-gather, we save a reference to the current all-gather
+        result to free after the next copy-out.
+        2. Otherwise (explicit prefetching or in backward), we free the
+        all-gather result immediately after the current copy-out since we can
+        already overlap the current copy-out with the previous reduce-scatter.
+        """
+        if not self._all_gather_result:
+            return  # no preceding unshard
+        if self._training_state == TrainingState.FORWARD:  # implicit prefetch
+            if prev_all_gather_state := self.comm_ctx.all_gather_state:
+                self._wait_all_gather_streams_on_event(prev_all_gather_state.event)
+                self.comm_ctx.all_gather_state = None  # free the all-gather result
+        foreach_all_gather_copy_out(
+            self._all_gather_result, self.fsdp_params, self._all_gather_process_group
+        )
+        for fsdp_param in self.fsdp_params:
+            fsdp_param.init_unsharded_param()  # no-op after 1st call
+        self._to_unsharded()
+        all_gather_copy_out_event = torch.cuda.Event()
+        all_gather_copy_out_event.record()
+        if self._training_state == TrainingState.FORWARD:
+            self.comm_ctx.all_gather_state = AllGatherState(
+                self._all_gather_result, all_gather_copy_out_event
+            )
+        else:
+            self._wait_all_gather_streams_on_event(all_gather_copy_out_event)
+        self._all_gather_result = None  # free unless saved in `all_gather_state`
+
+    def _wait_all_gather_streams_on_event(self, event: torch.cuda.Event):
+        self.comm_ctx.all_gather_copy_in_stream.wait_event(event)
+        self.comm_ctx.all_gather_stream.wait_event(event)
+
+    def reshard(self):
+        if self._training_state == TrainingState.FORWARD:
+            if not self._reshard_after_forward:
+                return
+            if self._use_post_forward_mesh:
+                self._to_sharded_post_forward()
+                self._reshard_after_forward_event = torch.cuda.Event()
+                self._reshard_after_forward_event.record()
+                return
+        self._to_sharded()
+
+    def pre_forward(
+        self, module: nn.Module, args: Tuple[Any, ...], kwargs: Dict[str, Any]
+    ) -> Tuple[Tuple[Any, ...], Dict[str, Any]]:
+        with torch.profiler.record_function("FSDP::pre_forward"):
+            self._training_state = TrainingState.FORWARD
+            self.unshard()
+            self.wait_for_unshard()
+            args, kwargs = self._register_post_backward_hook(args, kwargs)
+            return args, kwargs
+
+    def post_forward(self, module: nn.Module, input: Any, output: Any):
+        with torch.profiler.record_function("FSDP::post_forward"):
+            self.reshard()
+            self._record_post_forward()
+            self._training_state = TrainingState.IDLE
+            return output
+
+    def _record_post_forward(self) -> None:
+        # Since a group has one pre-backward unshard for each forward call
+        # before the backward, we record each usage (with multiplicity)
+        post_forward_index = len(self.comm_ctx.post_forward_order)
+        self.comm_ctx.post_forward_order.append(self)
+        self._post_forward_indices.append(post_forward_index)
+
+    def pre_backward(self, forward_grad_fns: Tuple[Any, ...], *unused: Any):
+        with torch.profiler.record_function("FSDP::pre_backward"):
+            self._training_state = TrainingState.PRE_BACKWARD
+            self.unshard()  # no-op if prefetched
+            self.wait_for_unshard()
+            # Can be already removed if running multiple `backward`s
+            self.all_forward_output_grad_fns.discard(forward_grad_fns)
+            self._prefetch_unshard()
+
+    def post_backward(self, *unused: Any):
+        self._training_state = TrainingState.POST_BACKWARD
+        with torch.profiler.record_function("FSDP::post_backward_reshard"):
+            if not self.reduce_scatter_grads:
+                self.reshard()
+                return
+            # Save the autograd-computed gradients before resharding to only
+            # access the unsharded parameters when their data is present
+            fsdp_params_with_grad: List[FSDPParam] = []
+            unsharded_grads: List[torch.Tensor] = []
+            for fsdp_param in self.fsdp_params:
+                if fsdp_param.unsharded_param.grad is not None:
+                    fsdp_params_with_grad.append(fsdp_param)
+                    unsharded_grads.append(fsdp_param.unsharded_grad_data)
+                    fsdp_param.unsharded_param.grad = None
+            self.reshard()
+        if len(fsdp_params_with_grad) == 0:
+            return
+        with torch.profiler.record_function("FSDP::post_backward_reduce"):
+            self._reduce_scatter_view_out_event = foreach_reduce_scatter(
+                fsdp_params_with_grad,
+                unsharded_grads,
+                self._reduce_scatter_process_group,
+                self.comm_ctx.reduce_scatter_stream,
+                self._orig_dtype,
+                self._reduce_dtype,
+                self.device,
+                self._grad_divide_factors,
+            )
+
+    def finalize_backward(self):
+        if self._reduce_scatter_view_out_event is not None:
+            torch.cuda.current_stream().wait_event(self._reduce_scatter_view_out_event)
+            self._reduce_scatter_view_out_event = None
+        self._training_state = TrainingState.IDLE
+        self._post_forward_indices.clear()
+        self.all_forward_output_grad_fns.clear()
+
+    def _prefetch_unshard(self):
+        if self._training_state == TrainingState.PRE_BACKWARD:
+            if not self._post_forward_indices:
+                # Can be cleared if running multiple `backward`s
+                return
+            curr_index = self._post_forward_indices.pop()
+            if (target_index := curr_index - 1) < 0:
+                return
+            target_fsdp_param_group = self.comm_ctx.post_forward_order[target_index]
+            if any(
+                torch._C._will_engine_execute_node(grad_fn)  # type: ignore[attr-defined]
+                for grad_fns in target_fsdp_param_group.all_forward_output_grad_fns
+                for grad_fn in grad_fns
+            ):
+                with torch.profiler.record_function(
+                    "FSDP::backward_prefetch"
+                ), target_fsdp_param_group.use_training_state(
+                    TrainingState.PRE_BACKWARD
+                ):
+                    target_fsdp_param_group.unshard()
+
+    # Utilities #
+    def _to_sharded(self):
+        if not self.is_sharded:
+            for fsdp_param in self.fsdp_params:
+                fsdp_param.to_sharded()
+            self._sharded_state = ShardedState.SHARDED
+
+    def _to_sharded_post_forward(self):
+        if not self.is_sharded_post_forward:
+            for fsdp_param in self.fsdp_params:
+                fsdp_param.to_sharded_post_forward()
+            self._sharded_state = ShardedState.SHARDED_POST_FORWARD
+
+    def _to_unsharded(self):
+        if not self.is_unsharded:
+            for fsdp_param in self.fsdp_params:
+                fsdp_param.to_unsharded()
+            self._sharded_state = ShardedState.UNSHARDED
+
+    @property
+    def is_sharded(self) -> bool:
+        return self._sharded_state == ShardedState.SHARDED
+
+    @property
+    def is_sharded_post_forward(self) -> bool:
+        return self._sharded_state == ShardedState.SHARDED_POST_FORWARD
+
+    @property
+    def is_unsharded(self) -> bool:
+        return self._sharded_state == ShardedState.UNSHARDED
+
+    @contextlib.contextmanager
+    def use_training_state(self, training_state: TrainingState):
+        old_training_state = self._training_state
+        self._training_state = training_state
+        try:
+            yield
+        finally:
+            self._training_state = old_training_state
+
+    # Hook Registration #
+    def _register_post_backward_hook(
+        self, args: Tuple[Any, ...], kwargs: Dict[str, Any]
+    ) -> Tuple[Tuple[Any, ...], Dict[str, Any]]:
+        if not torch.is_grad_enabled():
+            return args, kwargs
+        args_list, args_spec = tree_flatten(args)
+        kwargs_list, kwargs_spec = tree_flatten(kwargs)
+        args_kwargs_list = list(args_list) + list(kwargs_list)
+        inp_tensor_indices: List[int] = []
+        inp_tensors: List[torch.Tensor] = []
+        for i, obj in enumerate(args_kwargs_list):
+            if torch.is_tensor(obj) and obj.requires_grad:
+                inp_tensor_indices.append(i)
+                inp_tensors.append(obj)
+        if len(inp_tensors) == 0:
+            return args, kwargs  # no tensors that require gradients
+        inp_tensors = RegisterPostBackwardFunction.apply(self, *inp_tensors)
+        for inp_tensor_idx, inp_tensor in zip(inp_tensor_indices, inp_tensors):
+            args_kwargs_list[inp_tensor_idx] = inp_tensor
+        args_list = args_kwargs_list[: len(args_list)]
+        kwargs_list = args_kwargs_list[len(args_list) :]
+        args = tree_unflatten(args_list, args_spec)
+        kwargs = tree_unflatten(kwargs_list, kwargs_spec)
+        return args, kwargs
+
+    def _register_state_dict_hooks(self) -> None:
+        assert len(self._module_to_pre_save_state_dict_hook_handle) == 0
+        assert len(self._module_to_pre_load_state_dict_hook_handle) == 0
+        modules_with_fsdp_params: Set[nn.Module] = {
+            fsdp_param._module_info.module for fsdp_param in self.fsdp_params
+        }
+
+        def to_sharded_hook(*args: Any, **kwargs: Any) -> None:
+            self._to_sharded()
+
+        for module in modules_with_fsdp_params:
+            self._module_to_pre_save_state_dict_hook_handle[
+                module
+            ] = module.register_state_dict_pre_hook(to_sharded_hook)
+            self._module_to_pre_load_state_dict_hook_handle[
+                module
+            ] = module._register_load_state_dict_pre_hook(to_sharded_hook)
+
+    # Properties #
+    @property
+    def _reshard_after_forward(self) -> bool:
+        return self.post_forward_mesh_info is not None
+
+    @property
+    def _use_post_forward_mesh(self) -> bool:
+        return (
+            self._reshard_after_forward
+            and self.mesh_info != self.post_forward_mesh_info
+        )
+
+    @property
+    def _all_gather_process_group(self) -> dist.ProcessGroup:
+        mesh_info = (
+            cast(FSDPMeshInfo, self.post_forward_mesh_info)
+            if self.is_sharded_post_forward
+            else self.mesh_info
+        )
+        assert isinstance(mesh_info, FSDPMeshInfo)
+        return mesh_info.shard_process_group
+
+    @property
+    def _reduce_scatter_process_group(self) -> dist.ProcessGroup:
+        mesh_info = self.mesh_info
+        assert isinstance(mesh_info, FSDPMeshInfo)
+        return mesh_info.shard_process_group
+
+
+def _get_param_module_infos(
+    params: List[nn.Parameter], module: nn.Module
+) -> List[ParamModuleInfo]:
+    """
+    Shared parameter: lin1.weight = lin2.weight
+    Shared module: mlp.lin1 = mlp.lin2
+    We do not remove duplicates when traversing both modules and parameters to
+    find shared modules' parameters and shared parameters within a module.
+    """
+    params_set = set(params)
+    param_to_module_info: Dict[nn.Parameter, ParamModuleInfo] = {}
+    for _, submodule in module.named_modules(remove_duplicate=False):
+        for param_name, param in _named_parameters_with_duplicates(
+            submodule, recurse=False
+        ):
+            if param in params_set:
+                if param not in param_to_module_info:
+                    param_to_module_info[param] = ParamModuleInfo(submodule, param_name)
+                else:
+                    param_to_module_info[param].shared_modules.append(submodule)
+                    param_to_module_info[param].shared_param_names.append(param_name)
+    if len(param_to_module_info) != len(params):
+        raise AssertionError(f"Some parameters are not in the module tree of {module}")
+    return [param_to_module_info[param] for param in params]
+
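+# Illustrative note: for a hypothetical module where ``mlp.lin1.weight`` and
+# ``mlp.lin2.weight`` are the same ``nn.Parameter`` (weight tying), the first
+# visited (module, name) pair becomes ``ParamModuleInfo.module``/``param_name``
+# and the second is appended to ``shared_modules``/``shared_param_names``.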
+
+class RegisterPostBackwardFunction(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, param_group: FSDPParamGroup, *inputs: torch.Tensor):
+        # All tensors in `inputs` should require gradient
+        ctx.param_group = param_group
+        return inputs
+
+    @staticmethod
+    def backward(ctx, *grads: torch.Tensor):
+        ctx.param_group.post_backward()
+        return (None,) + grads
diff --git a/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/_fsdp_state.py b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/_fsdp_state.py
new file mode 100644
index 0000000000000000000000000000000000000000..06f839ee429cdef4bac0761a1cfc45746c0765e7
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/_fsdp_state.py
@@ -0,0 +1,246 @@
+import functools
+
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from torch.autograd import Variable
+from torch.autograd.graph import Node, register_multi_grad_hook
+from torch.distributed._composable_state import (
+    _get_module_state,
+    _insert_module_state,
+    _State,
+)
+from torch.distributed.utils import _to_kwargs
+from torch.utils._pytree import tree_flatten, tree_map
+from torch.utils.hooks import RemovableHandle
+from ._fsdp_api import MixedPrecisionPolicy
+from ._fsdp_common import _cast_fp_tensor, TrainingState
+from ._fsdp_param import FSDPParam
+from ._fsdp_param_group import FSDPCommContext, FSDPParamGroup
+
+
+class FSDPStateContext:
+    """This has state shared across FSDP states."""
+
+    def __init__(self):
+        # All FSDP states in the root state's module tree
+        self.all_states: List[FSDPState] = []
+        # Iteration's forward root runs the once-per-forward logic; this root
+        # may not be the overall root set by lazy initialization in cases where
+        # only a submodule runs forward (e.g. encoder-only for eval)
+        self.iter_forward_root: Optional[FSDPState] = None
+        # Final callback should only be queued once per backward
+        self.post_backward_final_callback_queued: bool = False
+        # Whether to finalize backward in this backward's final callback
+        self.is_last_backward: bool = True
+
+
+class FSDPState(_State):
+    def __init__(self):
+        super().__init__()
+        self._fsdp_param_group: Optional[FSDPParamGroup] = None
+        self._is_root: Optional[bool] = None  # root set during lazy init
+        self._state_ctx = FSDPStateContext()
+        self._comm_ctx = FSDPCommContext()
+        self._training_state: TrainingState = TrainingState.IDLE
+        self._pre_backward_hook_handles: List[RemovableHandle] = []
+
+    # Define a separate init since `__init__` is called in the contract
+    def init(
+        self, module: nn.Module, device: torch.device, mp_policy: MixedPrecisionPolicy
+    ) -> None:
+        _insert_module_state(module, self)
+        self._module = module
+        self._device = device
+        self._mp_policy = mp_policy
+        self._pre_forward_hook_handle = module.register_forward_pre_hook(
+            self._pre_forward, prepend=True, with_kwargs=True
+        )
+        self._post_forward_hook_handle = module.register_forward_hook(
+            self._post_forward, prepend=False
+        )
+
+    def _root_pre_forward(
+        self, module: nn.Module, args: Tuple[Any, ...], kwargs: Dict[str, Any]
+    ) -> Tuple[Tuple[Any, ...], Dict[str, Any]]:
+        self._lazy_init()
+        if self._state_ctx.iter_forward_root is not None:
+            return args, kwargs
+        self._state_ctx.iter_forward_root = self
+        with torch.profiler.record_function("FSDP::root_pre_forward"):
+            # Wait for optimizer before implicitly prefetched all-gathers
+            current_stream = torch.cuda.current_stream()
+            self._comm_ctx.all_gather_copy_in_stream.wait_stream(current_stream)
+            self._comm_ctx.all_gather_stream.wait_stream(current_stream)
+            if self._device.type == "cuda":
+                with torch.profiler.record_function("FSDP::inputs_to_device"):
+                    args_tuple, kwargs_tuple = _to_kwargs(
+                        args, kwargs, self._device, False
+                    )  # same as DDP
+                args, kwargs = args_tuple[0], kwargs_tuple[0]
+        return args, kwargs
+
+    def _lazy_init(self) -> None:
+        """
+        Lazy initialization represents when all modules' parallelisms have
+        finalized (e.g. FSDP has been applied to all desired modules). This
+        means that we can determine which state is the root, and we do so by
+        the 1st state to run forward.
+        """
+        if self._is_root is not None:
+            return  # no-op: already initialized
+        self._is_root = True
+        root_module = self._module
+        for module_name, module in root_module.named_modules():
+            if (state := _get_module_fsdp_state(module)) is None:
+                continue
+            if module is not root_module:
+                if state._is_root is not None:
+                    raise RuntimeError(
+                        "FSDP state has already been lazily initialized for "
+                        f"{module_name}\nFSDP requires running forward through "
+                        "the root module first"
+                    )
+                state._is_root = False
+            self._state_ctx.all_states.append(state)
+        if self._fsdp_param_group:
+            # For the root, do not reshard after forward since for training,
+            # the parameters would be freed and all-gathered immediately
+            self._fsdp_param_group.post_forward_mesh_info = None
+        self._init_fqns()
+        self._init_shared_state()
+        # Run parameter group lazy inits after initializing FQNs for improved
+        # error messages
+        for state in self._state_ctx.all_states:
+            if state._fsdp_param_group:
+                state._fsdp_param_group.lazy_init()
+
+    def _init_shared_state(self) -> None:
+        self._comm_ctx.init()
+        for state in self._state_ctx.all_states:
+            state._state_ctx = self._state_ctx
+            state._comm_ctx = self._comm_ctx
+            if fsdp_param_group := state._fsdp_param_group:
+                fsdp_param_group.comm_ctx = self._comm_ctx
+
+    def _init_fqns(self) -> None:
+        """Sets module and parameter FQN attributes for debugging."""
+        assert self._is_root
+        root_module = self._module
+        param_to_fsdp_param: Dict[nn.Parameter, FSDPParam] = {}
+        module_to_fsdp_param_group: Dict[nn.Module, FSDPParamGroup] = {}
+        for state in self._state_ctx.all_states:
+            if fsdp_param_group := state._fsdp_param_group:
+                for fsdp_param in fsdp_param_group.fsdp_params:
+                    param_to_fsdp_param[fsdp_param.sharded_param] = fsdp_param
+                module_to_fsdp_param_group[fsdp_param_group.module] = fsdp_param_group
+        for param_name, param in root_module.named_parameters():
+            if param in param_to_fsdp_param:
+                param_to_fsdp_param[param]._param_fqn = param_name
+        for module_name, module in root_module.named_modules():
+            if module in module_to_fsdp_param_group:
+                module_to_fsdp_param_group[module]._module_fqn = module_name
+
+    def _pre_forward(
+        self, module: nn.Module, args: Tuple[Any, ...], kwargs: Dict[str, Any]
+    ) -> Tuple[Tuple[Any, ...], Dict[str, Any]]:
+        # When composing with module-hook-based activation checkpointing, the
+        # pre-backward hook is responsible for the unshard
+        if self._training_state == TrainingState.PRE_BACKWARD:
+            return args, kwargs
+        self._training_state = TrainingState.FORWARD
+        args, kwargs = self._root_pre_forward(module, args, kwargs)
+        if self._mp_policy.cast_forward_inputs and self._mp_policy.param_dtype:
+            with torch.profiler.record_function("FSDP::cast_forward_inputs"):
+                cast_fn = functools.partial(
+                    _cast_fp_tensor, self._mp_policy.param_dtype
+                )
+                args, kwargs = tree_map(cast_fn, args), tree_map(cast_fn, kwargs)
+        if self._fsdp_param_group:
+            args, kwargs = self._fsdp_param_group.pre_forward(module, args, kwargs)
+        return args, kwargs
+
+    def _post_forward(self, module: nn.Module, input: Any, output: Any) -> Any:
+        # When composing with module-hook-based activation checkpointing, the
+        # post-backward hook is responsible for the reshard
+        if self._training_state == TrainingState.PRE_BACKWARD:
+            return output
+        if self._fsdp_param_group:
+            output = self._fsdp_param_group.post_forward(module, input, output)
+        output = self._register_pre_backward_hook(output)
+        self._training_state = TrainingState.IDLE
+        if self._state_ctx.iter_forward_root is self:
+            if all_gather_state := self._comm_ctx.all_gather_state:
+                # Free the last all-gather result if needed; refer to
+                # [Note: Overlapping all-gather copy-in and all-gather]
+                self._comm_ctx.all_gather_copy_in_stream.wait_event(
+                    all_gather_state.event
+                )
+                self._comm_ctx.all_gather_stream.wait_event(all_gather_state.event)
+                self._comm_ctx.all_gather_state = None  # free the all-gather result
+            self._state_ctx.iter_forward_root = None
+        if self._mp_policy.output_dtype is not None:
+            with torch.profiler.record_function("FSDP::cast_forward_outputs"):
+                output = tree_map(
+                    functools.partial(_cast_fp_tensor, self._mp_policy.output_dtype),
+                    output,
+                )
+        return output
+
+    def _pre_backward(self, forward_grad_fns: Tuple[Node, ...], *unused: Any) -> None:
+        self._training_state = TrainingState.PRE_BACKWARD
+        self._register_root_post_backward_final_callback()
+        if self._fsdp_param_group:
+            self._fsdp_param_group.pre_backward(forward_grad_fns, *unused)
+
+    def _root_post_backward_final_callback(self) -> None:
+        with torch.profiler.record_function("FSDP::root_post_backward_callback"):
+            for state in self._state_ctx.all_states:
+                if state._fsdp_param_group and state._fsdp_param_group.is_unsharded:
+                    # Run post-backward in case forward inputs did not require
+                    # gradient so the autograd backward did not run
+                    state._fsdp_param_group.post_backward()
+                if self._state_ctx.is_last_backward:
+                    state._finalize_backward()
+            if self._state_ctx.is_last_backward:
+                self._comm_ctx.post_forward_order.clear()
+            self._state_ctx.post_backward_final_callback_queued = False
+
+    def _finalize_backward(self) -> None:
+        self._training_state = TrainingState.IDLE
+        for handle in self._pre_backward_hook_handles:
+            handle.remove()
+        self._pre_backward_hook_handles.clear()
+        if self._fsdp_param_group:
+            self._fsdp_param_group.finalize_backward()
+
+    def _register_pre_backward_hook(self, output: Any) -> Any:
+        if not torch.is_grad_enabled():
+            return output
+
+        flat_outputs, _ = tree_flatten(output)
+        tensors = tuple(t for t in flat_outputs if t.requires_grad)
+        if tensors:
+            grad_fns = tuple(t.grad_fn for t in tensors if t.grad_fn is not None)
+            pre_backward = functools.partial(self._pre_backward, grad_fns)
+            handle = register_multi_grad_hook(tensors, pre_backward, mode="any")
+            self._pre_backward_hook_handles.append(handle)
+            if self._fsdp_param_group:
+                self._fsdp_param_group.all_forward_output_grad_fns.add(grad_fns)
+        return output
+
+    def _register_root_post_backward_final_callback(self):
+        if self._state_ctx.post_backward_final_callback_queued:
+            return
+        self._state_ctx.post_backward_final_callback_queued = True
+        Variable._execution_engine.queue_callback(
+            self._root_post_backward_final_callback
+        )
+
+
+def _get_module_fsdp_state(module: nn.Module) -> Optional[FSDPState]:
+    state = _get_module_state(module)
+    if isinstance(state, FSDPState):
+        return state
+    return None
diff --git a/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/fully_shard.py b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/fully_shard.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9962d01d7ca7877079bc4daacc5a203987fa484
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_composable/fsdp/fully_shard.py
@@ -0,0 +1,246 @@
+from typing import Any, cast, Optional, Union
+
+import typing_extensions
+
+import torch
+import torch.nn as nn
+
+from torch.distributed._composable import contract
+from torch.distributed._tensor import DeviceMesh, DTensor
+
+from ._fsdp_api import MixedPrecisionPolicy
+from ._fsdp_common import FSDPMeshInfo, HSDPMeshInfo
+from ._fsdp_init import (
+    _get_device_from_mesh,
+    _get_managed_modules,
+    _get_managed_states,
+    _get_post_forward_mesh_info,
+    _init_default_fully_shard_mesh,
+    _move_states_to_device,
+)
+from ._fsdp_param_group import FSDPParamGroup
+from ._fsdp_state import _get_module_fsdp_state, FSDPState
+
+
+# The decorator adds a state object to `module` that can be accessed via
+# `fully_shard.state(module)`. The state object and module are 1:1.
+@contract(state_cls=FSDPState)
+def fully_shard(
+    module: nn.Module,
+    *,
+    mesh: Optional[DeviceMesh] = None,
+    reshard_after_forward: Union[bool, int] = True,
+    mp_policy: MixedPrecisionPolicy = MixedPrecisionPolicy(),
+):
+    """
+    Shard module parameters across data parallel workers.
+
+    This function applies fully sharded data parallelism (FSDP) or a variant to
+    ``module``, a technique that saves memory at the cost of communication.
+    Parameters are sharded across ``mesh``, and in turn, so are their gradients
+    and optimizer states.
+
+    The sharded parameters are all-gathered to construct the unsharded
+    parameters for forward or backward computation. The unsharded parameters
+    are freed after computation to save memory. The gradients are reduced
+    across the mesh and divided by the mesh size for data parallelism. The
+    optimizer step runs on the sharded parameters.
+
+    Each call to ``fully_shard`` constructs one communication group that
+    includes the parameters in ``module.parameters()`` except those already
+    assigned to a group from a nested call. Each group's parameters are
+    communicated together in one collective, and likewise for its gradients.
+    Constructing multiple groups across the model (e.g. "layer by layer")
+    allows for peak memory savings and communication/computation overlap.
+
+    Implementation-wise, the sharded parameters are represented as
+    :class:`DTensor` s, sharded on dim-0, and the unsharded parameters are
+    represented as :class:`Tensor` s. A module forward pre-hook all-gathers the
+    parameters, and a module forward hook frees them. Similar backward hooks
+    gather parameters and later free parameters/reduce gradients.
+
+    Args:
+        mesh (Optional[DeviceMesh]): This data parallel mesh defines the
+            sharding and device. If 1D, then parameters are fully sharded
+            across the 1D mesh (FSDP). If 2D, then parameters are sharded
+            across the 0th dim and replicated across the 1st dim (HSDP). The
+            mesh's device type gives the device type used for communication;
+            if a CUDA or CUDA-like device type, then we use the current device.
+        reshard_after_forward (Union[bool, int]): This controls the parameter
+            behavior after forward and can trade off memory and communication:
+            - If ``True``, then this reshards parameters after forward and
+            all-gathers in backward.
+            - If ``False``, then this keeps the unsharded parameters in memory
+            after forward and avoids the all-gather in backward.
+            - If an ``int``, then this represents the world size to reshard to
+            after forward. It should be a non-trivial divisor of the ``mesh``
+            shard dim size (i.e. excluding 1 and the dim size itself). A choice
+            may be the intra-node size (e.g. ``torch.cuda.device_count()``).
+            This allows the all-gather in backward to be over a smaller world
+            size at the cost of higher memory usage than setting to ``True``.
+            - The root FSDP state has its value specially set to ``False`` as a
+            heuristic since its parameters would typically be immediately
+            all-gathered for backward.
+            - After forward, the parameters registered to the module depend on
+            this: The registered parameters are the sharded parameters if
+            ``True``; unsharded parameters if ``False``; and the parameters
+            resharded to the smaller mesh otherwise. To modify the parameters
+            between forward and backward, the registered parameters must be the
+            sharded parameters. For ``False`` or an ``int``, this can be done
+            by manually resharding via :meth:`reshard`.
+        mp_policy (MixedPrecisionPolicy): This controls the mixed precision
+            policy, which offers parameter/reduction mixed precision for this
+            module. See :class:`MixedPrecisionPolicy` for details.
+    """
+    if isinstance(module, (nn.ModuleList, nn.ModuleDict)):
+        raise ValueError(
+            f"fully_shard does not support containers that do not implement forward: {module}"
+        )
+    mesh = mesh or _init_default_fully_shard_mesh()
+    if mesh.ndim not in (1, 2):
+        raise ValueError(f"fully_shard expects a 1D or 2D DeviceMesh but got {mesh}")
+    elif mesh.ndim == 1:
+        mesh_info = FSDPMeshInfo(mesh, shard_mesh_dim=0)
+    else:
+        mesh_info = HSDPMeshInfo(mesh, shard_mesh_dim=1, replicate_mesh_dim=0)
+    device = _get_device_from_mesh(mesh)
+    post_forward_mesh_info = _get_post_forward_mesh_info(
+        reshard_after_forward, mesh_info
+    )
+
+    state = fully_shard.state(module)
+    state.init(module, device, mp_policy)
+
+    managed_modules = _get_managed_modules(module)
+    params, buffers = _get_managed_states(managed_modules)
+    _move_states_to_device(params, buffers, device, mesh_info)
+    if params:
+        state._fsdp_param_group = FSDPParamGroup(
+            params, module, mesh_info, post_forward_mesh_info, device, mp_policy
+        )
+
+    # for dynamo
+    for module in managed_modules:
+        module._is_fsdp_managed_module = True  # type: ignore[assignment]
+        module._fsdp_use_orig_params = True  # type: ignore[assignment]
+
+    # Place FSDP leftmost for highest priority in the method resolution order
+    cls = module.__class__
+    dct = {"__deepcopy__": unimplemented_deepcopy}
+    new_cls = type(f"FSDP{cls.__name__}", (FSDP, cls), dct)
+    module.__class__ = new_cls
+    return module
+
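+# Illustrative usage sketch (``Transformer``, ``dataloader``, and the training
+# loop below are placeholders, not part of this module): apply ``fully_shard``
+# per block and then to the root so that each block forms its own
+# communication group:
+#
+#     model = Transformer()
+#     for block in model.blocks:
+#         fully_shard(block, reshard_after_forward=True)
+#     fully_shard(model)
+#     optim = torch.optim.AdamW(model.parameters(), lr=1e-3)
+#     for batch in dataloader:
+#         loss = model(batch).sum()
+#         loss.backward()
+#         optim.step()
+#         optim.zero_grad()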
+
+def unimplemented_deepcopy(*args: Any, **kwargs: Any) -> typing_extensions.Never:
+    raise AssertionError(
+        "FSDP does not support deepcopy. Please use state dict for serialization."
+    )
+
+
+class FSDP:
+    def __new__(cls, *args, **kwargs):
+        """
+        Override ``__new__`` to remove the FSDP class and directly construct
+        the original class for cases like indexing into a container module.
+        """
+        # Use index 2 since 0 is the dynamically constructed `FSDP<...>` class
+        # and index 1 is the `FSDP` class itself
+        orig_cls = cls.__mro__[2]
+        self = orig_cls.__new__(orig_cls, *args, **kwargs)
+        self.__init__(*args, **kwargs)
+        return self
+
+    def reshard(self) -> None:
+        """
+        Reshards the module's parameters, registering the sharded parameters
+        to the module and freeing the unsharded parameters if needed. This
+        method is *not* recursive.
+        """
+        state = self._get_fsdp_state()
+        if fsdp_param_group := state._fsdp_param_group:
+            fsdp_param_group.reshard()
+
+    def set_is_last_backward(self, is_last_backward: bool) -> None:
+        """
+        Sets whether the next backward is the last one, meaning that FSDP
+        should wait for gradient reduction to finish and clear internal data
+        structures used for explicit prefetching.
+        """
+        state = self._get_fsdp_state()
+        state._state_ctx.is_last_backward = is_last_backward
+
+    def set_requires_gradient_sync(
+        self, requires_gradient_sync: bool, recurse: bool = True
+    ) -> None:
+        """
+        Sets whether the module should sync gradients. This can be used to implement
+        gradient accumulation without communication. For HSDP, this controls
+        both reduce-scatter and all-reduce together.
+
+        Args:
+            requires_gradient_sync (bool): Whether to reduce gradients for the
+                module's parameters.
+            recurse (bool): Whether to set for all submodules or just the
+                passed-in module.
+        """
+        for module in cast(nn.Module, self).modules():
+            if isinstance(module, FSDP):
+                state = module._get_fsdp_state()
+                if fsdp_param_group := state._fsdp_param_group:
+                    fsdp_param_group.reduce_scatter_grads = requires_gradient_sync
+                    fsdp_param_group.all_reduce_grads = requires_gradient_sync
+
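+    # Illustrative gradient-accumulation sketch (``model`` and ``microbatches``
+    # are placeholders): skip gradient reduction on all but the last
+    # microbatch so gradients accumulate locally without communication:
+    #
+    #     for i, microbatch in enumerate(microbatches):
+    #         model.set_requires_gradient_sync(i == len(microbatches) - 1)
+    #         model(microbatch).sum().backward()
+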
+    def set_requires_all_reduce(self, requires_all_reduce: bool, recurse: bool = True):
+        """
+        Sets whether the module should all-reduce gradients. This can be used to
+        implement gradient accumulation with only reduce-scatter but not
+        all-reduce for HSDP.
+        """
+        for module in cast(nn.Module, self).modules():
+            if isinstance(module, FSDP):
+                state = module._get_fsdp_state()
+                if fsdp_param_group := state._fsdp_param_group:
+                    fsdp_param_group.all_reduce_grads = requires_all_reduce
+
+    def _get_fsdp_state(self) -> FSDPState:
+        if (state := _get_module_fsdp_state(cast(nn.Module, self))) is None:
+            raise AssertionError(f"No FSDP state found on {self}")
+        return state
+
+    def _apply(self, *args: Any, **kwargs: Any) -> Any:
+        # Reshard to ensure that sharded parameters are registered
+        self.reshard()
+        ret = super()._apply(*args, **kwargs)  # type: ignore[misc]
+        state = self._get_fsdp_state()
+        if not (fsdp_param_group := state._fsdp_param_group):
+            return ret
+        # TODO: Remove this padding logic once DTensor pads the local tensor:
+        # https://github.com/pytorch/pytorch/issues/113045
+        with torch.no_grad():
+            for fsdp_param in fsdp_param_group.fsdp_params:
+                module_info = fsdp_param._module_info
+                new_param = getattr(module_info.module, module_info.param_name)
+                if new_param is not fsdp_param.sharded_param:
+                    if torch.__future__.get_swap_module_params_on_conversion():
+                        raise AssertionError(
+                            "Expects swap_tensors to preserve object but got "
+                            f"{new_param} instead of {fsdp_param.sharded_param}"
+                        )
+                    else:
+                        raise AssertionError(
+                            "Please set torch.__future__.set_swap_module_params_on_conversion(True) "
+                            "to use _apply methods with FSDP"
+                        )
+                local_tensor = new_param._local_tensor
+                padded_sharded_size = fsdp_param.padded_sharded_param_size
+                if local_tensor.size() != padded_sharded_size:
+                    padded_local_tensor = local_tensor.new_zeros(padded_sharded_size)
+                    padded_local_tensor[: local_tensor.size(0)].copy_(local_tensor)
+                    local_tensor = padded_local_tensor
+                fsdp_param._sharded_param_data = local_tensor.view(-1)
+                assert isinstance(fsdp_param.sharded_param, DTensor)  # mypy
+                fsdp_param.sharded_param._local_tensor = local_tensor[
+                    : fsdp_param.sharded_size[0]
+                ]
+        return ret
diff --git a/MLPY/Lib/site-packages/torch/distributed/_composable/fully_shard.py b/MLPY/Lib/site-packages/torch/distributed/_composable/fully_shard.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4ee3b54fbb101861fb0886a32369ba6cfb40c43
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_composable/fully_shard.py
@@ -0,0 +1,133 @@
+import warnings
+from typing import Callable, Iterable, Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from torch.distributed._composable.contract import contract
+from torch.distributed._composable_state import _get_module_state, _insert_module_state
+from torch.distributed.fsdp._common_utils import _FSDPState
+from torch.distributed.fsdp._dynamo_utils import _annotate_modules_for_dynamo
+
+from torch.distributed.fsdp._init_utils import (
+    _init_buffer_state,
+    _init_core_state,
+    _init_device_handle,
+    _init_ignored_module_states,
+    _init_param_handle_from_module,
+    _init_prefetching_state,
+    _init_process_group_state,
+    _init_runtime_state,
+    _init_state_dict_state,
+    HYBRID_SHARDING_STRATEGIES,
+)
+from torch.distributed.fsdp._runtime_utils import (
+    _register_post_forward_hook,
+    _register_pre_forward_hook,
+    _register_root_pre_forward_hook,
+)
+from torch.distributed.fsdp._state_dict_utils import _register_all_state_dict_hooks
+from torch.distributed.fsdp._wrap_utils import _auto_wrap
+from torch.distributed.fsdp.api import (
+    BackwardPrefetch,
+    CPUOffload,
+    MixedPrecision,
+    ShardingStrategy,
+)
+from torch.distributed.fsdp.wrap import _Policy
+
+
+@contract(state_cls=_FSDPState)
+def fully_shard(
+    module: nn.Module,
+    *,
+    process_group: Optional[dist.ProcessGroup] = None,
+    policy: Optional[_Policy] = None,
+    strategy: Optional[ShardingStrategy] = None,
+    mixed_precision: Optional[MixedPrecision] = None,
+    cpu_offload: Optional[CPUOffload] = None,
+    ignored_modules: Optional[Iterable[torch.nn.Module]] = None,
+    device_id: Optional[Union[int, torch.device]] = None,
+    param_init_fn: Optional[Callable[[nn.Module], None]] = None,
+    sync_module_states: bool = False,
+    forward_prefetch: bool = False,
+    ignored_states: Union[
+        Optional[Iterable[torch.nn.Parameter]], Optional[Iterable[torch.nn.Module]]
+    ] = None,
+) -> nn.Module:
+    """
+    Applies ``FullyShardedDataParallel`` (FSDP) semantics to ``module``.
+    """
+    warnings.warn(
+        "``torch.distributed._composable.fully_shard`` is being deprecated."
+        "You can contintue to use the wrapper based FSDP."
+        "See usage in: https://github.com/pytorch/pytorch/blob/main/torch/distributed/fsdp/fully_sharded_data_parallel.py."
+        "``torch.distributed._composable.fully_shard`` will be removed after PyTorch 2.5."
+    )
+
+    torch._C._log_api_usage_once("torch.distributed.fully_shard")
+    # Enforce the new auto wrap policy
+    if policy is not None and not isinstance(policy, _Policy):
+        raise ValueError(f"Expects a `_Policy` but got {policy}")
+    state = fully_shard.state(module)
+    state = _init_ignored_module_states(state, module, ignored_modules, ignored_states)
+    state = _init_device_handle(state, module, state._ignored_params, device_id)
+    _annotate_modules_for_dynamo(module, state._ignored_modules, True)
+    state = _init_process_group_state(state, process_group, strategy, policy)
+    if policy is not None:
+        root_kwargs = {
+            "process_group": process_group,
+            "strategy": strategy,
+            "mixed_precision": mixed_precision,
+            "cpu_offload": cpu_offload,
+            "ignored_modules": ignored_modules,
+            "device_id": device_id,
+            "param_init_fn": param_init_fn,
+            "sync_module_states": sync_module_states,
+            "forward_prefetch": forward_prefetch,
+            "ignored_states": ignored_states,
+        }
+        if strategy in HYBRID_SHARDING_STRATEGIES:
+            root_kwargs["process_group"] = (state.process_group, state._inter_node_pg)
+        _auto_wrap(
+            module,
+            policy,
+            state._ignored_modules,
+            state._ignored_params,
+            root_kwargs,
+            fully_shard,
+        )
+    state = _init_core_state(
+        state,
+        strategy or ShardingStrategy.FULL_SHARD,
+        mixed_precision,
+        cpu_offload,
+        limit_all_gathers=True,
+        use_orig_params=True,
+        backward_prefetch_limit=1,
+        forward_prefetch_limit=1,
+    )
+    state = _init_runtime_state(state)
+    state = _init_prefetching_state(
+        state, BackwardPrefetch.BACKWARD_PRE, forward_prefetch=forward_prefetch
+    )
+    state = _init_buffer_state(state, module)
+    state = _init_param_handle_from_module(
+        state, module, device_id, param_init_fn, sync_module_states
+    )
+    state = _init_state_dict_state(state)
+    _register_all_state_dict_hooks(state)
+    _register_pre_forward_hook(state, module)
+    _register_post_forward_hook(state, module)
+    _register_root_pre_forward_hook(state, module)  # prepend last
+    # Always insert the state for the passed-in module even if it has no
+    # managed parameters, in which case it has no handles and does not appear
+    # in `_fully_sharded_module_to_handles`
+    _insert_module_state(module, state)
+    for submodule in module.modules():
+        if (
+            submodule in state._fully_sharded_module_to_handle
+            and _get_module_state(submodule) is None
+        ):
+            _insert_module_state(submodule, state)
+    return module
diff --git a/MLPY/Lib/site-packages/torch/distributed/_composable/replicate.py b/MLPY/Lib/site-packages/torch/distributed/_composable/replicate.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d67f9f4201ba3bf1450f2f9fa59920cb783dcca
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_composable/replicate.py
@@ -0,0 +1,154 @@
+import weakref
+from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
+
+import torch
+import torch.nn as nn
+from torch.distributed._composable_state import _State
+from torch.nn.parallel import DistributedDataParallel
+
+from .contract import _get_registry, contract
+
+_ROOT_MODULE_PREFIX = ""
+
+
+class _ReplicateState(_State):
+    def __init__(self) -> None:
+        super().__init__()
+        self.module: nn.Module = nn.ParameterList()
+        self.has_initialized: bool = False
+        self._param_list: nn.ParameterList = nn.ParameterList()
+        # TODO(@fegin): this variable was originally created for testing; we
+        # should remove it if possible.
+        self._param_names: List[str] = []
+
+    def _collect_params(
+        self,
+        module: nn.Module,
+        ignored_modules: Set[nn.Module],
+        ignored_params: Set[nn.Parameter],
+        prefix: str = _ROOT_MODULE_PREFIX,
+    ) -> None:
+        # skip if managed by fully_sharded API
+        if _is_fully_sharded(module):
+            return
+
+        # if a module is ignored, all descendants of the module are ignored.
+        if module in ignored_modules:
+            return
+
+        recurse_prefix = (
+            f"{prefix}." if prefix != _ROOT_MODULE_PREFIX else _ROOT_MODULE_PREFIX
+        )
+
+        for n, p in module.named_parameters(recurse=False):
+            if p not in ignored_params:
+                self._param_list.append(p)
+                self._param_names.append(f"{recurse_prefix}{n}")
+
+        for name, child_module in module.named_children():
+            self._collect_params(
+                child_module,
+                ignored_modules,
+                ignored_params,
+                prefix=f"{recurse_prefix}{name}",
+            )
+
+    def init(
+        self,
+        module: nn.Module,
+        ignored_modules: Set[nn.Module],
+        **kwargs,
+    ) -> None:
+        if _is_fully_sharded(module):
+            raise RuntimeError(
+                "Cannot apply `replicate()` on a Module already managed by `fully_shard`"
+            )
+
+        if self.has_initialized:
+            return
+
+        self.has_initialized = True
+        self.module = module
+        ignored_params = {p for m in ignored_modules for p in m.parameters()}
+        self._collect_params(module, ignored_modules, ignored_params)
+        module.register_forward_pre_hook(self.forward_pre_hook, with_kwargs=True)
+        module.register_forward_hook(self.forward_post_hook)  # type: ignore[arg-type]
+
+        if "device_id" in kwargs:
+            # replicate() supports a small usability enhancement where the
+            # user can pass in device_id as a Union[int, torch.device] even for
+            # CPU devices, so users don't have to change code for CPU/GPU runs.
+            # We derive the right device_ids to feed into DDP to support this.
+            if kwargs["device_id"] is not None:
+                device_id = kwargs["device_id"]
+                # Convert to device_ids that DDP expects.
+                if isinstance(device_id, torch.device) and device_id.type == "cpu":
+                    # CPU modules receive device_ids None
+                    kwargs["device_ids"] = None
+                else:
+                    # GPU modules expect device_ids=[cuda_device]
+                    kwargs["device_ids"] = [device_id]
+            else:
+                kwargs["device_ids"] = None
+            kwargs.pop("device_id")
+
+        self._ddp = DistributedDataParallel(self._param_list, **kwargs)
+        # Weakref to the DDP instance is currently only used for testing.
+        replicate.state(self.module)._ddp_weakref = weakref.ref(self._ddp)
+
+    def forward_pre_hook(
+        self, module: nn.Module, args: Tuple[Any, ...], kwargs: Dict[str, Any]
+    ) -> Any:
+        return self._ddp._pre_forward(*args, **kwargs)
+
+    def forward_post_hook(
+        self,
+        module: nn.Module,
+        input: Tuple[torch.Tensor],
+        output: torch.Tensor,
+    ) -> torch.Tensor:
+        return self._ddp._post_forward(output)
+
+
+@contract(state_cls=_ReplicateState)
+def replicate(
+    module: nn.Module,
+    ignored_modules: Optional[Iterable[torch.nn.Module]] = None,
+    **kwargs,
+) -> nn.Module:
+    r"""Replicates a module
+
+    Args:
+        module (torch.nn.Module): module to replicate
+
+    Example::
+        >>> # xdoctest: +REQUIRES(module:torch._C._distributed_c10d)
+        >>> module = nn.Linear(3, 3)
+        >>> replicate(module)
+    """
+    torch._C._log_api_usage_once("torch.distributed.replicate")
+
+    # TODO(fegin): using kwargs is not a good idea if we would like to make
+    # replicate a formal API to replace DDP.
+    if "device_id" in kwargs:
+        if not isinstance(kwargs["device_id"], (int, torch.device)):
+            raise RuntimeError(
+                "Expected device_id to be int or torch.device, "
+                f"but got {type(kwargs['device_id'])}"
+            )
+
+    if ignored_modules is None:
+        ignored_modules = {}
+    else:
+        ignored_modules = set(ignored_modules)
+    replicate.state(module).init(module, ignored_modules, **kwargs)
+
+    return module
+
+
+def _is_fully_sharded(module: nn.Module) -> bool:
+    r"""Check if module is marked with fully_shard."""
+    registry = _get_registry(module)
+    if registry is None:
+        return False
+    return "fully_shard" in registry
diff --git a/MLPY/Lib/site-packages/torch/distributed/_composable_state.py b/MLPY/Lib/site-packages/torch/distributed/_composable_state.py
new file mode 100644
index 0000000000000000000000000000000000000000..5095fd4424838902d80594f4c9b4e53852990ace
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_composable_state.py
@@ -0,0 +1,37 @@
+from typing import cast, Dict, Optional
+
+import torch.nn as nn
+
+
+class _State:
+    pass
+
+
+_module_state_mapping: Dict[nn.Module, _State] = {}
+
+
+def _insert_module_state(module: nn.Module, state: _State) -> None:
+    global _module_state_mapping
+    assert module not in _module_state_mapping, f"Inserting {module} more than once."
+    _module_state_mapping[module] = state
+
+
+def _get_module_state(module: nn.Module) -> Optional[_State]:
+    """
+    Return the ``_State`` associated with ``module``.
+
+    Given a ``module``, this API finds out if the module is also a ``_State``
+    instance or if the module is managed by a composable API. If the module
+    is also a ``_State``, ``module`` will be cast to ``_State`` and returned.
+    If it is managed by a composable API, the corresponding ``_State`` will
+    be returned.
+    """
+    global _module_state_mapping
+    if isinstance(module, _State):
+        return cast(_State, module)
+    else:
+        # https://github.com/pytorch/pytorch/issues/107054
+        if module in _module_state_mapping:
+            return _module_state_mapping[module]
+        else:
+            return None
diff --git a/MLPY/Lib/site-packages/torch/distributed/_functional_collectives.py b/MLPY/Lib/site-packages/torch/distributed/_functional_collectives.py
new file mode 100644
index 0000000000000000000000000000000000000000..84c154f90b8889a80116d779e75bd95341e43d14
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_functional_collectives.py
@@ -0,0 +1,1084 @@
+import sys
+import warnings
+from typing import cast, List, Optional, Tuple, TYPE_CHECKING, Union
+
+import torch
+import torch.distributed as dist
+import torch.distributed.distributed_c10d as c10d
+from torch._custom_ops import impl_abstract
+from torch.distributed.device_mesh import DeviceMesh
+from torch.fx.experimental.proxy_tensor import get_innermost_proxy_mode
+
+from . import _functional_collectives_impl as fun_col_impl
+from ._functional_collectives_impl import (  # noqa: F401
+    _register_tensor_wrapper,
+    native_funcol_enabled,
+)
+
+try:
+    from torch.utils._cxx_pytree import tree_map_only
+except ImportError:
+    from torch.utils._pytree import tree_map_only  # type: ignore[no-redef]
+
+
+if torch._running_with_deploy():
+
+    def is_torchdynamo_compiling():
+        """Can't import torchdynamo in torchdeploy builds currently."""
+        return False
+
+else:
+    try:
+        from torch.compiler import is_dynamo_compiling as is_torchdynamo_compiling
+    except Exception:
+        warnings.warn(
+            "Unable to import torchdynamo util `is_torchdynamo_compiling`, so won't support torchdynamo correctly"
+        )
+
+        def is_torchdynamo_compiling():
+            return False
+
+
+"""
+New traceable, functional collectives.
+RFC: https://github.com/pytorch/pytorch/issues/93173
+
+  compiler: trace these ops with plain-old-data schemas, then choose how to lower them.
+  eager: execute these 'functional' ops which in eager return AsyncCollectiveTensor subclasses,
+         automatically calling .wait() on underlying/hidden async 'work' obj only when fed to
+         a downstream op.
+
+Issues:
+* Where should these ops live? Couldn't `import torch` if putting these ops in existing torch.distributed files
+* Proper support for eager requires inplace ops. We should explore having it as an option for the API.
+"""
+
+"""
+Functional collectives are asynchronous only and we perform implicit stream synchronization
+on behalf of the user.
+
+We use AsyncCollectiveTensor to wrap the result tensor of a collective and it lets us witness
+first usage of the tensor and insert cross stream sync at the right place.
+
+The above are the easy bits; the hard one is how we match the Work object returned by
+c10d and the tensor AsyncCollectiveTensor wraps. We alloc the tensor inside the collective
+op implementation (see ``clone()`` call in ``_all_reduce``) and then it's handled by the
+dispatcher which might call other implementations that are allowed to change the returned
+tensor - even return a tensor with a different shape (see ``torch.vmap``).
+
+This means the caller of our ops receives a Tensor that is not guaranteed to be the same one
+allocated by our implementations, and that makes pairing the AsyncTensor with the original
+tensor a lot harder. This pairing is needed so we can look up the Work object to use.
+
+Originally, we tried WeakKeyDictionary to map from Tensor to Work, but because Tensor's
+identity is not stable across dispatch, the op caller would end up with a different Tensor
+instance that would not match any in the dictionary.
+
+With Tensor identity out of the question, we decided to use the tensor data pointer, which
+should be stable across all the Tensor changes done during dispatch.
+
+We have a dictionary of tensor::data_ptr -> Work that we insert right after we call into c10d.
+
+We use this dictionary when AsyncCollectiveTensor is used to invoke Work::wait()
+
+Finally, we set up a finalizer against the tensor wrapper to observe it getting collected so we
+can clean up stale entries in the dictionary.
+
+To eliminate the possibility of races we have a global version counter that is used by the finalizer.
+
+As a wise man said once: Don't cross the streams (https://www.youtube.com/watch?v=wyKQe_i9yyo)
+
+"""
+
+"""
+Functional collectives can accept any of these types to describe the ranks participating in collectives.
+
+The different types will be desugared to a canonical format
+"""
+RANK_TYPES = Union[
+    List[int],
+    List[List[int]],
+    dist.ProcessGroup,
+    DeviceMesh,
+    Tuple["dist._tensor.DeviceMesh", int],
+    str,
+]
+
+
+"""
+User facing APIs for functional collectives
+-------------------------------------------
+
+These apis are called by user code and expected to work both in eager execution and compilation,
+but there are significant differences to how the two modes are implemented underneath.
+
+Eager execution is 'optimized' using a tensor subclass that schedules the synchronization (via wait_tensor() op)
+just before the tensor is first used.  Compiled tracing currently relies on the compiler to perform this optimization,
+and cannot yet correctly trace the AsyncTensor wrapper class.  In the future, these paths may be unified
+if sufficient subclass support is added in dynamo.
+
+Example: all_reduce is an entrypoint API, and other collectives follow a similar pattern.
+
+Here's how it works under torch.compile/dynamo:
+all_reduce(...)
+  |--> _expand_group(...)               - desugars processgroup into canonical/traceable format
+  |--> c10d_functional.all_reduce(...)  - dynamo captures this op call, doesn't trace deeper
+  |--> _maybe_wrap_tensor(...)          - wait_tensor() op is immediately called, no AsyncTensor subclass needed
+
+And under eager execution:
+all_reduce(...)
+  |--> _expand_group(...)               - same as above, but less critical for eager
+  |--> c10d_functional.all_reduce(...)  - dispatches to real kernel OR records op in trace
+  |--> _maybe_wrap_tensor(...)          - AsyncTensor wrapper applied to returned tensor,
+                                          which issues wait_tensor() at the time of first use
+"""
+
+
+def wait_tensor(tensor):
+    """
+    Wait on a tensor returned by the collectives ops.
+
+    Waiting follows device semantics, which means blocking on CPU and synchronizing streams on CUDA.
+    """
+    if native_funcol_enabled():
+        return torch.ops._c10d_functional.wait_tensor(tensor)  # type: ignore[attr-defined]
+    else:
+        return torch.ops.c10d_functional.wait_tensor(tensor)  # type: ignore[attr-defined]
+
+
+def broadcast(self: torch.Tensor, src: int, group: RANK_TYPES, tag: str = ""):
+    """
+    Broadcasts the tensor to all processes in the given process group.
+
+    Args:
+        src (int): Source rank
+        group (ProcessGroup or List[int]): The process group to work on.
+        tag (str, optional): A unique identifier for the collective. Default: empty string
+    """
+    if native_funcol_enabled():
+        group_name = _resolve_group_name(group, tag)
+        tensor = torch.ops._c10d_functional.broadcast(self, src, group_name)
+    else:
+        tag, rankset, group_size = _expand_group(group, tag)
+        tensor = torch.ops.c10d_functional.broadcast(
+            self, src, tag, rankset, group_size
+        )
+    return _maybe_wrap_tensor(tensor)
+
+
+def all_reduce(self: torch.Tensor, reduceOp: str, group: RANK_TYPES, tag: str = ""):
+    """
+    Reduces the tensor data across all machines in such a way that all get
+    the final result.
+
+    The input tensor is left unmodified.
+
+    Group can be one of:
+        List[int]: ranks participating in the collective.
+        List[List[int]]: 2D mesh of ranks taking part in this collective in MPMD.
+        ProcessGroup: Will perform a collective using the ranks and tag of the PG.
+        DeviceMesh: Do a SPMD collective over all ranks of the mesh
+        (DeviceMesh, int): Do a MPMD collective over one dimension of the DeviceMesh
+
+    :: N.B. If you pass a PG or a 1D list to perform a MPMD collective, the compiler won't be able to recover
+    that information and perform collective algebraic optimization. Use other forms of input for that.
+    """
+    if native_funcol_enabled():
+        group_name = _resolve_group_name(group, tag)
+        tensor = torch.ops._c10d_functional.all_reduce(
+            self, reduceOp.lower(), group_name
+        )
+    else:
+        tag, rankset, group_size = _expand_group(group, tag)
+        tensor = torch.ops.c10d_functional.all_reduce(  # type: ignore[attr-defined]
+            self,
+            reduceOp,
+            tag,
+            rankset,
+            group_size,
+        )
+    return _maybe_wrap_tensor(tensor)
+
+
+def all_gather_tensor(
+    self: torch.Tensor,
+    gather_dim: int,
+    group: RANK_TYPES,
+    tag: str = "",
+):
+    """
+    Gather tensor data from all machines and concatenate the result over ``gather_dim``.
+
+    Note that it currently only supports gather_dim = 0.
+
+    The input tensor is left unmodified.
+    Group can be one of:
+        List[int]: ranks participating in the collective.
+        List[List[int]]: 2D mesh of ranks taking part in this collective in MPMD.
+        ProcessGroup: Will perform a collective using the ranks and tag of the PG.
+        DeviceMesh: Do a SPMD collective over all ranks of the mesh
+        (DeviceMesh, int): Do a MPMD collective over one dimension of the DeviceMesh
+
+    :: N.B. If you pass a PG or a 1D list to perform a MPMD collective, the compiler won't be able to recover
+    that information and perform collective algebraic optimization. Use other forms of input for that.
+    """
+    assert self.is_contiguous()
+    if native_funcol_enabled():
+        group_name = _resolve_group_name(group, tag)
+        group_size = c10d._get_group_size_by_name(group_name)
+        tensor = torch.ops._c10d_functional.all_gather_into_tensor(
+            self, group_size, group_name
+        )
+    else:
+        tag, rankset, group_size = _expand_group(group, tag)
+        tensor = torch.ops.c10d_functional.all_gather_into_tensor(  # type: ignore[attr-defined]
+            self,
+            tag,
+            rankset,
+            group_size,
+        )
+    res = _maybe_wrap_tensor(tensor)
+    # TODO this should be done inside AsyncCollectiveTensor to delay the wait() call
+    if gather_dim != 0:
+        # torch.cat accesses the data, so we need to wait here anyway; waiting first
+        # and then doing chunk + cat avoids going through ACT dispatching logic again
+        if isinstance(res, AsyncCollectiveTensor):
+            res = res.wait()  # type: ignore[attr-defined]
+        res = torch.cat(torch.chunk(res, group_size, dim=0), dim=gather_dim)
+    return res
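+
+
+# Illustrative sketch (not part of the library): gathering a contiguous per-rank
+# shard and concatenating along a non-zero dimension, which exercises the
+# chunk + cat path above. Assumes an initialized default group; the helper name
+# is hypothetical.
+def _example_all_gather_dim1(local_shard: torch.Tensor) -> torch.Tensor:
+    ws = dist.get_world_size()
+    ranks = list(range(ws))
+    # A (r, c) input per rank yields a (r, c * ws) output on every rank.
+    return all_gather_tensor(local_shard, gather_dim=1, group=ranks)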
+
+
+def reduce_scatter_tensor(
+    self: torch.Tensor,
+    reduceOp: str,
+    scatter_dim: int,
+    group: RANK_TYPES,
+    tag: str = "",
+):
+    """
+    Reduces the tensor data across all machines in such a way that all get
+    the final result, then scatters the results to the corresponding ranks.
+
+
+    The input tensor is left unmodified.
+    Group can be one of:
+        List[int]: ranks participating in the collective.
+        List[List[int]]: 2D mesh of ranks taking part in this collective in MPMD.
+        ProcessGroup: Will perform a collective using the ranks and tag of the PG.
+        DeviceMesh: Do a SPMD collective over all ranks of the mesh
+        (DeviceMesh, int): Do a MPMD collective over one dimension of the DeviceMesh
+    :: N.B. If you pass a PG or a 1D list to perform a MPMD collective, the compiler won't be able to recover
+    that information and perform collective algebraic optimization. Use other forms of input for that.
+    """
+    if native_funcol_enabled():
+        group_name = _resolve_group_name(group, tag)
+        group_size = c10d._get_group_size_by_name(group_name)
+    else:
+        tag, rankset, group_size = _expand_group(group, tag)
+
+    assert (
+        self.size(scatter_dim) % group_size == 0
+    ), f"input dimension {scatter_dim} ({self.size(scatter_dim)}) must be a multiple of group_size {group_size}"
+    if scatter_dim != 0:
+        tensor_list = torch.chunk(self, group_size, dim=scatter_dim)
+        self = torch.cat(tensor_list)
+
+    if native_funcol_enabled():
+        tensor = torch.ops._c10d_functional.reduce_scatter_tensor(
+            self,
+            reduceOp.lower(),
+            group_size,
+            group_name,  # type: ignore[possibly-undefined]
+        )
+    else:
+        tensor = torch.ops.c10d_functional.reduce_scatter_tensor(  # type: ignore[attr-defined]
+            self,
+            reduceOp,
+            tag,
+            rankset,  # type: ignore[possibly-undefined]
+            group_size,
+        )
+    res = _maybe_wrap_tensor(tensor)
+    return res
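+
+
+# Illustrative sketch (not part of the library): reduce_scatter_tensor requires
+# the scatter dimension to be divisible by the group size, as asserted above.
+# Assumes an initialized default group; the helper name is hypothetical.
+def _example_reduce_scatter(t: torch.Tensor) -> torch.Tensor:
+    ws = dist.get_world_size()
+    ranks = list(range(ws))
+    # t.size(0) must be a multiple of ws; each rank receives a slice of size
+    # t.size(0) // ws holding the element-wise sum across all ranks.
+    return reduce_scatter_tensor(t, "sum", scatter_dim=0, group=ranks)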
+
+
+def all_reduce_coalesced(
+    self: List[torch.Tensor], reduceOp: str, group: RANK_TYPES, tag: str = ""
+) -> List[torch.Tensor]:
+    """
+    Reduces a list of tensors across all machines in such a way that all get
+    the final result.
+
+    All tensors in the input list are left unmodified.
+
+    Group can be one of:
+        List[int]: ranks participating in the collective.
+        List[List[int]]: 2D mesh of ranks taking part in this collective in MPMD.
+        ProcessGroup: Will perform a collective using the ranks and tag of the PG.
+        DeviceMesh: Do a SPMD collective over all ranks of the mesh
+        (DeviceMesh, int): Do a MPMD collective over one dimension of the DeviceMesh
+
+    :: N.B. If you pass a PG or a 1D list to perform a MPMD collective, the compiler won't be able to recover
+    that information and perform collective algebraic optimization. Use other forms of input for that.
+    """
+    if native_funcol_enabled():
+        group_name = _resolve_group_name(group, tag)
+        tensor_list = torch.ops._c10d_functional.all_reduce_coalesced(  # type: ignore[attr-defined]
+            self,
+            reduceOp.lower(),
+            group_name,
+        )
+    else:
+        tag, rankset, group_size = _expand_group(group, tag)
+        tensor_list = torch.ops.c10d_functional.all_reduce_coalesced(  # type: ignore[attr-defined]
+            self,
+            reduceOp,
+            tag,
+            rankset,
+            group_size,
+        )
+    return list(map(_maybe_wrap_tensor, tensor_list))
+
+
+def all_gather_into_tensor_coalesced(
+    self: List[torch.Tensor], group: RANK_TYPES, tag: str = ""
+) -> List[torch.Tensor]:
+    """
+    Gather a list of tensors from all machines.
+
+    Note that it currently only supports gather_dim = 0.
+
+    The input tensors are left unmodified.
+    Group can be one of:
+        List[int]: ranks participating in the collective.
+        List[List[int]]: 2D mesh of ranks taking part in this collective in MPMD.
+        ProcessGroup: Will perform a collective using the ranks and tag of the PG.
+        DeviceMesh: Do a SPMD collective over all ranks of the mesh
+        (DeviceMesh, int): Do a MPMD collective over one dimension of the DeviceMesh
+
+    :: N.B. If you pass a PG or a 1D list to perform a MPMD collective, the compiler won't be able to recover
+    that information and perform collective algebraic optimization. Use other forms of input for that.
+    """
+    if native_funcol_enabled():
+        group_name = _resolve_group_name(group, tag)
+        group_size = c10d._get_group_size_by_name(group_name)
+        tensor_list = torch.ops._c10d_functional.all_gather_into_tensor_coalesced(  # type: ignore[attr-defined]
+            self,
+            group_size,
+            group_name,
+        )
+    else:
+        tag, rankset, group_size = _expand_group(group, tag)
+        tensor_list = torch.ops.c10d_functional.all_gather_into_tensor_coalesced(  # type: ignore[attr-defined]
+            self,
+            tag,
+            rankset,
+            group_size,
+        )
+    return list(map(_maybe_wrap_tensor, tensor_list))
+
+
+def reduce_scatter_tensor_coalesced(
+    inputs: List[torch.Tensor],
+    reduceOp: str,
+    scatter_dim: List[int],
+    group: RANK_TYPES,
+    tag: str = "",
+) -> List[torch.Tensor]:
+    """
+    Reduces a list of tensors across all machines in such a way that all get
+    the final result, then scatters the results to the corresponding ranks.
+
+    The input tensors are left unmodified.
+    Group can be one of:
+        List[int]: ranks participating in the collective.
+        List[List[int]]: 2D mesh of ranks taking part in this collective in MPMD.
+        ProcessGroup: Will perform a collective using the ranks and tag of the PG.
+        DeviceMesh: Do a SPMD collective over all ranks of the mesh
+        (DeviceMesh, int): Do a MPMD collective over one dimension of the DeviceMesh
+
+    :: N.B. If you pass a PG or a 1D list to perform a MPMD collective, the compiler won't be able to recover
+    that information and perform collective algebraic optimization. Use other forms of input for that.
+    """
+    if native_funcol_enabled():
+        group_name = _resolve_group_name(group, tag)
+        group_size = c10d._get_group_size_by_name(group_name)
+    else:
+        tag, rankset, group_size = _expand_group(group, tag)
+
+    assert len(scatter_dim) == len(inputs)
+    for idx, (dim, tensor) in enumerate(zip(scatter_dim, inputs)):
+        assert (
+            tensor.size(dim) % group_size == 0
+        ), f"input dimension {dim} ({tensor.size(dim)}) must be a multiple of group_size {group_size} for tensor at index {idx}"
+        if dim != 0:
+            tensor_list = torch.chunk(tensor, group_size, dim=dim)
+            inputs[idx] = torch.cat(tensor_list)
+
+    if native_funcol_enabled():
+        tensor_list = torch.ops._c10d_functional.reduce_scatter_tensor_coalesced(  # type: ignore[attr-defined]
+            inputs,
+            reduceOp.lower(),
+            group_size,
+            group_name,  # type: ignore[possibly-undefined]
+        )
+    else:
+        tensor_list = torch.ops.c10d_functional.reduce_scatter_tensor_coalesced(  # type: ignore[attr-defined]
+            inputs,
+            reduceOp,
+            tag,
+            rankset,  # type: ignore[possibly-undefined]
+            group_size,
+        )
+
+    return list(map(_maybe_wrap_tensor, tensor_list))
+
+
+# This is a bit unsafe: it checks if the first argument in the schema reports as a non-mutable alias.
+# Today, this maps 1:1 with "aten ops that are views".
+def _is_view_op(tgt):
+    assert isinstance(tgt, torch._ops.OpOverload)
+    schema = tgt._schema
+    if len(schema.arguments) > 0:
+        first_arg = schema.arguments[0]
+        # check if op is a view
+        return first_arg.alias_info is not None and not first_arg.alias_info.is_write
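+
+
+# Illustrative sketch (not part of the library): how the alias-info check above
+# classifies two well-known aten overloads; the helper name is hypothetical.
+def _example_is_view_op_check():
+    # aten.view aliases its first argument without writing to it, so it counts as a view.
+    assert _is_view_op(torch.ops.aten.view.default)
+    # aten.add.Tensor does not alias its first argument, so it does not.
+    assert not _is_view_op(torch.ops.aten.add.Tensor)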
+
+
+def all_to_all_single(
+    self: torch.Tensor,
+    output_split_sizes: Optional[List[int]],
+    input_split_sizes: Optional[List[int]],
+    group: RANK_TYPES,
+    tag: str = "",
+) -> torch.Tensor:
+    """
+    Each process splits its input tensor and scatters the resulting chunks
+    to all processes in the group, then concatenates the chunks received from
+    all processes in the group into a single output tensor, which is returned.
+
+    Group can be one of:
+        List[int]: ranks participating in the collective.
+        List[List[int]]: 2D mesh of ranks taking part in this collective in MPMD.
+        ProcessGroup: Will perform a collective using the ranks and tag of the PG.
+        DeviceMesh: Do a SPMD collective over all ranks of the mesh
+        (DeviceMesh, int): Do a MPMD collective over one dimension of the DeviceMesh
+
+    :: N.B. If you pass a PG or a 1D list to perform a MPMD collective, the compiler won't be able to recover
+    that information and perform collective algebraic optimization. Use other forms of input for that.
+    """
+    if output_split_sizes is not None:
+        assert all(
+            isinstance(size, (int, torch.SymInt)) for size in output_split_sizes
+        ), output_split_sizes
+    if input_split_sizes is not None:
+        assert all(
+            isinstance(size, (int, torch.SymInt)) for size in input_split_sizes
+        ), input_split_sizes
+    if native_funcol_enabled():
+        group_name = _resolve_group_name(group, tag)
+        group_size = c10d._get_group_size_by_name(group_name)
+        if output_split_sizes is None or input_split_sizes is None:
+            assert output_split_sizes is None and input_split_sizes is None, (
+                "output_split_sizes and input_split_sizes must either be "
+                "specified together or both set to None"
+            )
+            output_split_sizes = [self.shape[0] // group_size] * group_size
+            input_split_sizes = output_split_sizes
+        tensor = torch.ops._c10d_functional.all_to_all_single(  # type: ignore[attr-defined]
+            self,
+            output_split_sizes,
+            input_split_sizes,
+            group_name,
+        )
+    else:
+        tag, rankset, group_size = _expand_group(group, tag)
+        tensor = torch.ops.c10d_functional.all_to_all_single(  # type: ignore[attr-defined]
+            self,
+            output_split_sizes,
+            input_split_sizes,
+            tag,
+            rankset,
+            group_size,
+        )
+    return _maybe_wrap_tensor(tensor)
+
+
+def permute_tensor(
+    self: torch.Tensor,
+    src_dst: List[int],
+    group: RANK_TYPES,
+    tag: str = "",
+) -> torch.Tensor:
+    """
+    Permutes the elements of the tensor according to the given source/destination pairs. `src_dst` should
+    be defined such that src_dst[m] == n means m sends to n.
+
+    Group can be one of:
+        List[int]: ranks participating in the collective.
+        List[List[int]]: 2D mesh of ranks taking part in this collective in MPMD.
+        ProcessGroup: Will perform a collective using the ranks and tag of the PG.
+        DeviceMesh: Do a SPMD collective over all ranks of the mesh
+        (DeviceMesh, int): Do a MPMD collective over one
+    """
+    t, rankset, group_size = _expand_group(group, tag)
+    local_pg = c10d._find_or_create_pg_by_ranks_and_tag(t, rankset, group_size)
+
+    output_split_sizes = [0] * group_size
+    input_split_sizes = [0] * group_size
+    for src, dst in enumerate(src_dst):
+        if src == dist.get_rank(local_pg):
+            input_split_sizes[dst] = self.numel()
+        if dst == dist.get_rank(local_pg):
+            output_split_sizes[src] = self.numel()
+
+    return all_to_all_single(self, output_split_sizes, input_split_sizes, group, tag)
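+
+
+# Illustrative sketch (not part of the library): using permute_tensor for a ring
+# shift in which each rank m sends its tensor to rank (m + 1) % world_size.
+# Assumes an initialized default group; the helper name is hypothetical.
+def _example_ring_shift(t: torch.Tensor) -> torch.Tensor:
+    ws = dist.get_world_size()
+    ranks = list(range(ws))
+    # src_dst[m] == n means rank m sends to rank n.
+    src_dst = [(m + 1) % ws for m in range(ws)]
+    return permute_tensor(t, src_dst, ranks)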
+
+
+class AsyncCollectiveTensor(torch.Tensor):
+    r"""
+    A Tensor wrapper subclass that is used to trigger a call to wait
+    prior to first use of the underlying tensor.
+    Use it inside functional collective pytorch wrappers like the following:
+    def functional_collective(self, group, tag):
+        tag, rankset, group_size = _expand_group(group, tag)
+        tensor = torch.ops.c10d_functional.{collective}(self, tag, rankset, group_size)
+        return _maybe_wrap_tensor(tensor)
+    """
+    elem: torch.Tensor
+    completed: bool
+
+    __slots__ = ["elem", "completed"]
+
+    @staticmethod
+    def __new__(cls, elem: torch.Tensor):
+        r = torch.Tensor._make_wrapper_subclass(  # type: ignore[attr-defined]
+            cls,
+            elem.size(),
+            strides=elem.stride(),
+            storage_offset=elem.storage_offset(),
+            dtype=elem.dtype,
+            layout=elem.layout,
+            device=elem.device,
+            requires_grad=False,
+        )
+        r.elem = elem
+        r.completed = False
+        return r
+
+    def __tensor_flatten__(self):
+        return ["elem"], None
+
+    def tolist(self):
+        self.trigger_wait()
+        return self.elem.tolist()
+
+    @staticmethod
+    def __tensor_unflatten__(inner_tensors, meta, outer_size, outer_stride):
+        assert meta is None
+        elem = inner_tensors["elem"]
+        return AsyncCollectiveTensor(elem)
+
+    def __repr__(self):
+        self.trigger_wait()
+        return f"AsyncCollectiveTensor({self.elem})"
+
+    def trigger_wait(self):
+        if not self.completed:
+            wait_tensor(self.elem)
+            self.completed = True
+        return self.elem
+
+    def wait(self) -> torch.Tensor:
+        wait_tensor(self.elem)
+        return self.elem
+
+    def _get_acs_underlying_tensor(self):
+        """This method enables _functional_collectives_impl to test whether a tensor is an AsyncCollectiveTensor."""
+        return self.elem
+
+    @classmethod
+    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
+        if func == torch.ops.aten.view.default:
+            # Fast-path aten.view, since a lot of view-related ops eventually go
+            # to aten.view; this avoids the pytree slowdown
+            res = func(args[0].elem, args[1])
+            wrapper_res = AsyncCollectiveTensor(res)
+            _register_tensor_wrapper(wrapper_res)
+            return wrapper_res
+
+        is_view_op = _is_view_op(func)
+
+        def unwrap(e: AsyncCollectiveTensor):
+            # wait_tensor is idempotent and will do stream sync only once
+            if not is_view_op:
+                e.trigger_wait()
+            return e.elem
+
+        def wrap(e: torch.Tensor):
+            # wait_tensor is idempotent and will do stream sync only once
+            assert not isinstance(e, AsyncCollectiveTensor)
+            res = AsyncCollectiveTensor(e)
+            _register_tensor_wrapper(res)
+            return res
+
+        unwrapped_args = tree_map_only(AsyncCollectiveTensor, unwrap, args)
+        unwrapped_kwargs = tree_map_only(AsyncCollectiveTensor, unwrap, kwargs)
+
+        # we don't wrap the result as it doesn't need to be waited on.
+        out = func(*unwrapped_args, **unwrapped_kwargs)
+
+        # View ops don't require a sync, so we should re-wrap the outputs.
+        if is_view_op:
+            out = tree_map_only(torch.Tensor, wrap, out)
+
+        return out
+
+    def numpy(self):
+        return self.wait().numpy()
+
+
+"""
+Utils and infrastructure for tracing support
+"""
+
+
+def _expand_group(group: RANK_TYPES, tag: str = "") -> Tuple[str, List[int], int]:
+    """
+    _expand_group desugars the different RANK_TYPES types into a canonical format that is traceable.
+
+    By having this be part of the explicit eager codepath, we avoid having to specialize behavior inside
+    torchdynamo and can still interoperate with processgroup objects or other untraceable forms.
+    """
+    # had to define this hack _inside_ expand_group to avoid
+    # graph_break [('torch.* op returned non-Tensor int
+    # caused by `cast_*` functions being treated as 'torch.*' ops (iiuc)
+    if TYPE_CHECKING:
+
+        def cast_listlistint(x):
+            return cast(List[List[int]], x)
+
+        def cast_listint(x):
+            return cast(List[int], x)
+
+    else:
+        # fake cast op for use at runtime since dynamo doesn't support real cast
+        # also, dynamo didn't like encountering 'typing' objects ()
+        # NotImplementedError: argument of type: 
+        def cast_listlistint(x):
+            return x
+
+        def cast_listint(x):
+            return x
+
+    rankset: List[int]
+    if isinstance(group, list):
+        if isinstance(group[0], list):
+            nested_list = cast_listlistint(group)
+            rankset = []
+            group_size = -1
+            for rs in nested_list:
+                rankset.extend(rs)
+                if group_size != -1 and group_size != len(rs):
+                    raise ValueError(
+                        f"group sizes must be identical found {group_size} and {len(rs)}"
+                    )
+                group_size = len(rs)
+        else:
+            rankset = cast_listint(group)
+            group_size = len(rankset)
+    elif isinstance(group, dist.ProcessGroup):
+        rankset = dist.get_process_group_ranks(group)
+        group_size = len(rankset)
+        tag = tag or c10d._get_group_tag(group)
+    elif isinstance(group, DeviceMesh):
+        assert (
+            group.ndim == 1
+        ), "Only 1D mesh is supported, pass in (DeviceMesh, int) together if mesh > 1D"
+        # TODO: it should run collective in the whole mesh instead of dim 0
+        tag, rankset, _ = group._dim_group_infos[0]
+        group_size = len(rankset)
+    elif isinstance(group, tuple):
+        if (
+            len(group) == 2
+            and isinstance(group[0], DeviceMesh)
+            and isinstance(group[1], int)
+        ):
+            dmesh = group[0]
+            dim = group[1]
+            tag, rankset, _ = dmesh._dim_group_infos[dim]
+            group_size = len(rankset)
+        else:
+            raise ValueError("Invalid tuple for group must be (DeviceMesh, int)")
+    else:
+        raise ValueError(
+            "Invalid type for group, must be one of List, Processgroup, DeviceMesh or (DeviceMesh, int)."
+        )
+
+    return (tag, rankset, group_size)
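+
+
+# Illustrative sketch (not part of the library): what _expand_group produces for
+# the two list-based descriptors; the helper name is hypothetical.
+def _example_expand_group():
+    # A flat rank list becomes (tag, rankset, group_size) with the tag unchanged.
+    assert _expand_group([0, 1, 2, 3]) == ("", [0, 1, 2, 3], 4)
+    # A 2D mesh is flattened into a single rankset; group_size is the size of one sub-group.
+    assert _expand_group([[0, 1], [2, 3]]) == ("", [0, 1, 2, 3], 2)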
+
+
+def _resolve_group_name(group: RANK_TYPES, tag: str = "") -> str:
+    """
+    Given group in RANK_TYPES, return the group name.
+    """
+    # `tag` will be deprecated. See details in:
+    # https://github.com/pytorch/pytorch/issues/93173#issuecomment-1907095208
+    if isinstance(group, dist.ProcessGroup):
+        return group.group_name
+    elif isinstance(group, str):
+        return group
+    elif isinstance(group, DeviceMesh):
+        assert (
+            group.ndim == 1
+        ), "Only 1D mesh is supported, pass in (DeviceMesh, int) together if mesh > 1D"
+        return group._dim_group_infos[0][2]
+    elif isinstance(group, tuple):
+        if (
+            len(group) == 2
+            and isinstance(group[0], DeviceMesh)
+            and isinstance(group[1], int)
+        ):
+            dmesh = group[0]
+            dim = group[1]
+            return dmesh._dim_group_infos[dim][2]
+        else:
+            raise ValueError("Invalid tuple for group must be (DeviceMesh, int)")
+    elif isinstance(group, list):
+        if not is_torchdynamo_compiling():
+            warnings.warn(
+                "The combination of ranks + tag as process group "
+                "identifier has been deprecated. Please switch to "
+                "using ProcessGroup, DeviceMesh, or group name instead."
+            )
+        return c10d._resolve_group_name_by_ranks_and_tag(cast(List[int], group), tag)
+    else:
+        raise ValueError(f"Unsupported group type: {type(group)}, {group}")
+
+
+def _are_we_tracing() -> bool:
+    if is_torchdynamo_compiling():
+        return True
+    # If functionalization is turned on, we are almost definitely compiling/tracing.
+    # (In particular, AOTAutograd traces a model once with functionalization on
+    #  but proxy tracing turned off, so this is how we detect it).
+    if (
+        torch._C._get_dispatch_mode(torch._C._TorchDispatchModeKey.FUNCTIONAL)
+        is not None
+    ):
+        return True
+    mode = get_innermost_proxy_mode()
+    if mode is None:
+        return False
+    return mode.tracer is not None
+
+
+def _maybe_wrap_tensor(self) -> torch.Tensor:
+    if _are_we_tracing():
+        return wait_tensor(self)
+    res = AsyncCollectiveTensor(self)
+    _register_tensor_wrapper(res)
+    return cast(torch.Tensor, res)
+
+
+def _all_gather_into_tensor_coalesced_meta(self, tag, rankset, group_size):
+    def mk_out_tensor(shard):
+        out_size = list(shard.size())
+        out_size[0] *= group_size
+        out_tensor = shard.new_empty(out_size)
+        return out_tensor
+
+    return [mk_out_tensor(t) for t in self]
+
+
+# We now register meta kernels to deal with tracing
+def _broadcast_meta(self, *args):
+    return torch.empty_like(self)
+
+
+def _all_reduce_meta(self, *args):
+    return torch.empty_like(self)
+
+
+def _wait_tensor_meta(self, *args):
+    return torch.empty_like(self)
+
+
+def _all_gather_into_tensor_meta(shard, tag, rankset, group_size):
+    out_size = list(shard.size())
+    out_size[0] *= group_size
+    return shard.new_empty(out_size)
+
+
+def _reduce_scatter_tensor_meta(input, reduce_op, tag, rankset, group_size):
+    out_size = list(input.size())
+    out_size[0] //= group_size
+    return input.new_empty(out_size)
+
+
+def _all_reduce_coalesced_meta(self, *args):
+    return [torch.empty_like(t) for t in self]
+
+
+def _all_reduce__meta(inp, *args):
+    return inp
+
+
+def _broadcast__meta(inp, *args):
+    return inp
+
+
+def _all_reduce_coalesced__meta(inputs, *args):
+    return inputs
+
+
+def _reduce_scatter_tensor_coalesced_meta(inputs, reduceOp, tag, rankset, group_size):
+    def mk_out_tensor(input):
+        out_size = list(input.size())
+        out_size[0] //= group_size
+        out_tensor = input.new_empty(out_size)
+        return out_tensor
+
+    return [mk_out_tensor(t) for t in inputs]
+
+
+# NB: We often say all_to_all has dynamic output size, but this is not
+# technically true: instead, what typically happens is you manually
+# communicate the output_split_sizes ahead of time (which is dynamic),
+# but then you pass those sizes explicitly, and the all to all itself
+# isn't dynamic, it just follows the specified output splits
+def _all_to_all_single_meta(
+    input, output_split_sizes, input_split_sizes, *args, **kwargs
+):
+    if output_split_sizes is None:
+        return input.new_empty(input.size())
+    else:
+        for s in output_split_sizes:
+            torch._check_is_size(s)
+        out_size = list(input.size())
+        out_size[0] = sum(output_split_sizes)
+        return input.new_empty(out_size)
+
+
+def _all_gather_into_tensor_native_meta(input, group_size, group_name):
+    shape = list(input.size())
+    shape[0] *= group_size
+    return input.new_empty(shape)
+
+
+def _all_gather_into_tensor_coalesced_native_meta(inputs, group_size, group_name):
+    return [
+        _all_gather_into_tensor_native_meta(input, group_size, group_name)
+        for input in inputs
+    ]
+
+
+def _reduce_scatter_tensor_native_meta(inp, reduce_op, group_size, group_name):
+    shape = list(inp.size())
+    shape[0] //= group_size
+    return inp.new_empty(shape)
+
+
+def _reduce_scatter_tensor_coalesced_native_meta(
+    inputs, reduce_op, group_size, group_name
+):
+    return [
+        _reduce_scatter_tensor_native_meta(inp, reduce_op, group_size, group_name)
+        for inp in inputs
+    ]
+
+
+def _register_ops():
+    ops_defs = [
+        "broadcast(Tensor self, int src, str tag, int[] ranks, int group_size) -> Tensor",
+        "all_reduce(Tensor self, str reduceOp, str tag, int[] ranks, int group_size) -> Tensor",
+        "all_reduce_coalesced(Tensor[] self, str reduceOp, str tag, int[] ranks, int group_size) -> Tensor[]",
+        "wait_tensor(Tensor self) -> Tensor",
+        "all_gather_into_tensor(Tensor shard, str tag, int[] ranks, int group_size) -> Tensor",
+        "all_gather_into_tensor_coalesced(Tensor[] input, str tag, int[] ranks, int group_size) -> Tensor[]",
+        "reduce_scatter_tensor(Tensor input, str reduceOp, str tag, int[] ranks, int group_size) -> Tensor",
+        "reduce_scatter_tensor_coalesced(Tensor[] inputs, str reduceOp, str tag, int[] ranks, int group_size) -> Tensor[]",
+        "all_to_all_single(Tensor input, SymInt[]? output_split_sizes, SymInt[]? input_split_sizes, str tag, int[] ranks, int group_size) -> Tensor",  # noqa: B950
+    ]
+
+    my_module = sys.modules[__name__]
+    for op_def in ops_defs:
+        op_name = op_def[0 : op_def.index("(")]
+        backend_impl = getattr(fun_col_impl, f"_{op_name}")
+        meta_impl = getattr(my_module, f"_{op_name}_meta")
+        c10_lib.define(op_def, tags=torch.Tag.pt2_compliant_tag)
+        c10_lib_impl.impl(op_name, backend_impl, "CompositeExplicitAutograd")
+        impl_abstract(f"c10d_functional::{op_name}")(meta_impl)
+
+
+if not torch._running_with_deploy():
+    # Library MUST be defined at module scope or it doesn't work
+    # Creating a "DEF" Library always crashes torch::deploy so we create our Library instances here
+    #   guarded against running inside it
+    c10_lib = torch.library.Library("c10d_functional", "DEF")
+    c10_lib_impl = torch.library.Library("c10d_functional", "IMPL")
+    _register_ops()
+
+    _c10_lib_impl = torch.library.Library("_c10d_functional", "IMPL")
+    _c10_lib_impl.impl("all_reduce", _all_reduce_meta, "Meta")
+    _c10_lib_impl.impl("all_reduce_", _all_reduce__meta, "Meta")
+    _c10_lib_impl.impl("all_reduce_coalesced", _all_reduce_coalesced_meta, "Meta")
+    _c10_lib_impl.impl("all_reduce_coalesced_", _all_reduce_coalesced__meta, "Meta")
+    _c10_lib_impl.impl("wait_tensor", _wait_tensor_meta, "Meta")
+    _c10_lib_impl.impl(
+        "all_gather_into_tensor", _all_gather_into_tensor_native_meta, "Meta"
+    )
+    _c10_lib_impl.impl(
+        "all_gather_into_tensor_coalesced",
+        _all_gather_into_tensor_coalesced_native_meta,
+        "Meta",
+    )
+    _c10_lib_impl.impl(
+        "reduce_scatter_tensor", _reduce_scatter_tensor_native_meta, "Meta"
+    )
+    _c10_lib_impl.impl(
+        "reduce_scatter_tensor_coalesced",
+        _reduce_scatter_tensor_coalesced_native_meta,
+        "Meta",
+    )
+    _c10_lib_impl.impl("all_to_all_single", _all_to_all_single_meta, "Meta")
+    _c10_lib_impl.impl("broadcast", _broadcast_meta, "Meta")
+    _c10_lib_impl.impl("broadcast_", _broadcast__meta, "Meta")
+else:
+    warnings.warn(
+        "PyTorch Distributed functional collectives do not work with torch::deploy."
+    )
+
+
+"""
+Dynamo remappings allow seamless translation from non-functional collectives of a supportable form into
+functional collective calls followed by inplace copy ops, allowing them to be traced into a functional graph.
+
+We implement this by writing a decomposition and teaching dynamo how to associate it to a corresponding op via
+the mapping dict below.
+
+These schemas intentionally match torch.distributed.distributed_c10d.* ops that we are trying to remap from
+"""
+
+
+def all_gather_tensor_inplace(
+    output_tensor: torch.Tensor,
+    input_tensor: torch.Tensor,
+    group,  # TODO add a type,
+    async_op: bool = False,
+    tag: str = "",
+    gather_dim: int = 0,
+):
+    assert (
+        not async_op
+    ), "Can't remap async version of inplace op to functional collective"
+    return output_tensor.copy_(all_gather_tensor(input_tensor, gather_dim, group, tag))
+
+
+def reduce_scatter_tensor_inplace(
+    output: torch.Tensor,
+    input: torch.Tensor,
+    op: str = "sum",  # TODO type is actually c10d ReduceOp. is this ok?
+    group=None,  # TODO add a type
+    async_op: bool = False,
+    scatter_dim: int = 0,
+    tag: str = "",
+):
+    assert (
+        not async_op
+    ), "Can't remap async version of inplace op to functional collective"
+    return output.copy_(reduce_scatter_tensor(input, op, scatter_dim, group, tag))
+
+
+REDUCE_OP_TO_STR = {
+    dist.ReduceOp.SUM: "sum",
+    dist.ReduceOp.AVG: "avg",
+    dist.ReduceOp.PRODUCT: "product",
+    dist.ReduceOp.MIN: "min",
+    dist.ReduceOp.MAX: "max",
+    dist.ReduceOp.BAND: "band",
+    dist.ReduceOp.BOR: "bor",
+    dist.ReduceOp.BXOR: "bxor",
+}
+
+
+def all_reduce_inplace(
+    tensor: torch.Tensor,
+    op: str = "sum",
+    group=None,
+    async_op: bool = False,
+    tag: str = "",
+):
+    assert (
+        not async_op
+    ), "Can't remap async version of inplace op to functional collective"
+
+    return tensor.copy_(all_reduce(tensor, op, group, tag))
+
+
+def all_to_all_inplace(
+    output: torch.Tensor,
+    input: torch.Tensor,
+    output_split_sizes=None,
+    input_split_sizes=None,
+    group=None,
+    async_op=False,
+    tag: str = "",
+):
+    assert (
+        not async_op
+    ), "Can't remap async version of inplace op to functional collective"
+    return output.copy_(
+        all_to_all_single(input, output_split_sizes, input_split_sizes, group, tag)
+    )
+
+
+def all_gather_inplace(
+    tensor_list: List[torch.Tensor],
+    tensor: torch.Tensor,
+    group=None,
+    async_op=False,
+    tag: str = "",
+):
+    assert (
+        not async_op
+    ), "Can't remap async version of inplace op to functional collective"
+    assert all(
+        t.size(0) == tensor.size(0) for t in tensor_list
+    ), "Remapping variable size all_gather is not yet supported"
+
+    output = all_gather_tensor(tensor, 0, group, tag)
+
+    # Use aten.slice instead of aten.split because the latter causes
+    # tensor.shape(0) to be unnecessarily baked in when it's a SymInt.
+    output_splits = []
+    offset = 0
+    for t in tensor_list:
+        output_splits.append(output[offset : offset + t.size(0)])
+        offset += t.size(0)
+    for dst, src in zip(tensor_list, output_splits):
+        dst.copy_(src)
+    return tensor_list
+
+
+from torch.distributed.distributed_c10d import (
+    _all_gather_base as legacy_all_gather_base,
+    _reduce_scatter_base as legacy_reduce_scatter_base,
+    all_gather as legacy_all_gather,
+    all_gather_into_tensor as legacy_allgather,
+    all_reduce as legacy_allreduce,
+    all_to_all_single as legacy_all_to_all_single,
+    reduce_scatter_tensor as legacy_reducescatter,
+)
+
+# This dict should contain sets of functions that dynamo is allowed to remap.
+# Functions in this set should accept the same args/kwargs 1:1 as their mapping.
+traceable_collective_remaps = {
+    legacy_allgather: all_gather_tensor_inplace,
+    legacy_reducescatter: reduce_scatter_tensor_inplace,
+    legacy_allreduce: all_reduce_inplace,
+    legacy_all_to_all_single: all_to_all_inplace,
+    legacy_all_gather: all_gather_inplace,
+    legacy_reduce_scatter_base: reduce_scatter_tensor_inplace,
+    legacy_all_gather_base: all_gather_tensor_inplace,
+}
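+
+
+# Illustrative sketch (not part of the library): the remap table above lets dynamo
+# trace a call such as dist.all_reduce(t) as if it were written in the functional
+# form below (an out-of-place collective followed by an in-place copy). Assumes an
+# initialized default group; the helper name is hypothetical.
+def _example_remapped_all_reduce(t: torch.Tensor) -> torch.Tensor:
+    # This mirrors what all_reduce_inplace (the remap target of dist.all_reduce) does.
+    ranks = list(range(dist.get_world_size()))
+    return t.copy_(all_reduce(t, "sum", ranks))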
diff --git a/MLPY/Lib/site-packages/torch/distributed/_functional_collectives_impl.py b/MLPY/Lib/site-packages/torch/distributed/_functional_collectives_impl.py
new file mode 100644
index 0000000000000000000000000000000000000000..5cf4c223dc4305ede128bed965ed3fa914e2e894
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_functional_collectives_impl.py
@@ -0,0 +1,409 @@
+import logging
+import os
+import warnings
+import weakref
+from typing import cast, Dict, List, Optional
+
+import torch
+import torch.distributed as dist
+import torch.distributed.distributed_c10d as c10d
+
+"""
+Eager kernel implementations were moved to a separate file, partly for readability and partly because it is
+currently easier in dynamo to set tracing policy on a file-by-file level.
+
+Do not put code in this file that Dynamo is expected to trace into, as dynamo may disallow this whole file.
+
+DEBUG/TESTING HELPERS:
+
+This module includes some helpers that are quite useful when debugging or testing functional collectives:
+
+_tensor_needs_wait
+_outstanding_wait_count
+_wait_all
+
+"""
+
+_use_native_funcol: Optional[bool] = None
+
+
+if torch._running_with_deploy():
+
+    def native_funcol_enabled():
+        return False
+
+else:
+    from torch._dynamo import assume_constant_result
+
+    @assume_constant_result
+    def native_funcol_enabled():
+        global _use_native_funcol
+        if _use_native_funcol is None:
+            try:
+                # Disable native funcol when torch_xla is installed. This check
+                # will be removed once torch_xla adopts the native_funcol IR.
+                import torch_xla  # noqa: F401
+
+                _use_native_funcol = False
+            except Exception:
+                # When TORCH_DISABLE_NATIVE_FUNCOL is set, fallback to py funcol
+                _use_native_funcol = (
+                    os.environ.get("TORCH_DISABLE_NATIVE_FUNCOL") != "1"
+                )
+
+        return _use_native_funcol
+
+
+logger = logging.getLogger(__name__)
+
+data_ptr_to_work: Dict[int, "_WaitRegistration"] = dict()
+work_version = 0
+
+
+class _WaitRegistration:
+    def __init__(self, work):
+        global work_version
+        self.work = work
+        self.version = work_version
+        self.ptrs = set()
+        self.ptr_alias_count = {}
+        self.cleanup_count = 0
+        work_version += 1
+
+    def _register_tensor_ptr(self, data_ptr):
+        global data_ptr_to_work
+        data_ptr_to_work[data_ptr] = self
+        self.ptrs.add(data_ptr)
+
+    def _record_wrapper(self, ptr):
+        self._register_tensor_ptr(ptr)
+        self.ptr_alias_count.setdefault(ptr, 0)
+        self.ptr_alias_count[ptr] += 1
+        self.cleanup_count += 1
+
+    def wait(self):
+        if self.work is not None:
+            self.work.wait()
+            self.work = None
+        self.cleanup()
+
+    def decrement_live_tensor(self, ptr):
+        self.cleanup_count -= 1
+        if self.cleanup_count == 0:
+            self.wait()
+        else:
+            self.ptr_alias_count[ptr] -= 1
+            if (
+                self.ptr_alias_count[ptr] < 1
+                and data_ptr_to_work.get(ptr, None) == self
+            ):
+                del data_ptr_to_work[ptr]
+
+    def cleanup(self):
+        for ptr in self.ptrs:
+            if data_ptr_to_work.get(ptr, None) == self:
+                del data_ptr_to_work[ptr]
+
+
+def _register_tensor_work(tensor_or_list, work_or_list):
+    if not isinstance(tensor_or_list, list):
+        tensor_or_list = [tensor_or_list]
+    if not isinstance(work_or_list, list):
+        reg = _WaitRegistration(work_or_list)
+        for tensor in tensor_or_list:
+            reg._register_tensor_ptr(tensor.data_ptr())
+    else:
+        for tensor, work in zip(tensor_or_list, work_or_list):
+            reg = _WaitRegistration(work)
+            reg._register_tensor_ptr(tensor.data_ptr())
+
+
+def _wait_reg_dec(ptr, wait_reg):
+    wait_reg.decrement_live_tensor(ptr)
+
+
+def _register_tensor_wrapper(tensor) -> None:
+    if native_funcol_enabled():
+        # Tensor storage -> work mapping is maintained in C++
+        return
+    global data_ptr_to_work
+    data_ptr = tensor.elem.data_ptr()
+    # Note: we should NEVER try to trace this, bc it registers runtime stuff during trace.
+    # Instead, backends must call this themselves when implementing traced collectives.
+    wait_reg = data_ptr_to_work.get(data_ptr, None)
+    if wait_reg is None:
+        warnings.warn(
+            "Trying to register finalizer to AsyncCollectiveTensor but the inner tensor is already gone"
+        )
+    else:
+        # We force the collective to be waited on if this tensor goes away, to reduce the chance of deadlocks.
+        # NOTE: we register the callback to the ACT wrapper class, for the following reasons:
+        # 1. The inner tensor is referenced by the associated Work object, so it cannot be collected until we
+        #  release the associated work object
+        # 2. There's an n-to-1 relationship between wrappers and inner tensor due to non-waitable ops like view()
+        wait_reg._record_wrapper(data_ptr)
+        weakref.finalize(tensor, _wait_reg_dec, data_ptr, wait_reg)
+
+
+def _wait_tensor(tensor: torch.Tensor) -> torch.Tensor:
+    global data_ptr_to_work
+    data_ptr = tensor.data_ptr()
+    wait_reg = data_ptr_to_work.get(data_ptr)
+    if wait_reg is not None:
+        wait_reg.wait()
+    return tensor
+
+
+def _tensor_needs_wait(tensor: torch.Tensor) -> bool:
+    """Returns True if ``tensor`` needs to be waited on. Works with both AsyncCollectiveTensor wrappers and inner tensors."""
+    if hasattr(tensor, "_get_acs_underlying_tensor"):
+        tensor = tensor._get_acs_underlying_tensor()
+    data_ptr = tensor.data_ptr()
+    wait_reg = data_ptr_to_work.get(data_ptr)
+    return wait_reg is not None and wait_reg.work is not None
+
+
+def _outstanding_wait_count() -> int:
+    """Returns the number of outstanding work objects waiting to be waited (sic)."""
+    return len(data_ptr_to_work)
+
+
+def _wait_all() -> None:
+    """Wait for all outstanding collectives."""
+    for work_reg in list(data_ptr_to_work.values()):
+        work_reg.wait()
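+
+
+# Illustrative sketch (not part of the library): using the debug/testing helpers
+# listed in the module docstring to confirm no collective is left outstanding;
+# the helper name is hypothetical.
+def _example_debug_wait_state(t: torch.Tensor) -> None:
+    # Waiting on everything clears the registry, so nothing should remain
+    # outstanding and no tensor should still need a wait.
+    _wait_all()
+    assert _outstanding_wait_count() == 0
+    assert not _tensor_needs_wait(t)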
+
+
+def _str_to_reduce_op(reduceOp: str) -> dist.ReduceOp:
+    reduceOp = reduceOp.upper()
+    op = dist.ReduceOp.RedOpType.__members__.get(reduceOp)
+    if op is None:
+        raise ValueError(f"Invalid reduce operation {reduceOp}")
+    return cast(dist.ReduceOp, op)
+
+
+"""
+Kernel implementations (for eager runtime only) - should never be traced by torch.compile
+
+These functions should all be bound to dispatcher ops.  During tracing, the op itself should be
+captured in the graph and the backend should implement the op however it prefers.
+"""
+
+
+def _broadcast(self, src, tag, ranks, group_size):
+    group = c10d._find_or_create_pg_by_ranks_and_tag(tag, ranks, group_size)
+    assert group is not None
+
+    inplace_tensor = self.clone(memory_format=torch.contiguous_format)
+    work = dist.broadcast(inplace_tensor, src, group=group, async_op=True)
+    _register_tensor_work(inplace_tensor, work)
+
+    return inplace_tensor
+
+
+# TODO assert if ranks has duplicated entries
+def _all_reduce(self, reduceOp, tag, ranks, group_size):
+    op = _str_to_reduce_op(reduceOp)
+    group = c10d._find_or_create_pg_by_ranks_and_tag(tag, ranks, group_size)
+    assert group is not None
+
+    inplace_tensor = self.clone(memory_format=torch.contiguous_format)
+    work = dist.all_reduce(inplace_tensor, op=op, group=group, async_op=True)
+    _register_tensor_work(inplace_tensor, work)
+
+    return inplace_tensor
+
+
+def _all_reduce_coalesced(self, reduceOp, tag, ranks, group_size):
+    op = _str_to_reduce_op(reduceOp)
+    group = c10d._find_or_create_pg_by_ranks_and_tag(tag, ranks, group_size)
+    assert group is not None
+
+    inplace_tensor_list = [t.clone(memory_format=torch.contiguous_format) for t in self]
+    work = dist.all_reduce_coalesced(
+        inplace_tensor_list, op=op, group=group, async_op=True
+    )
+    _register_tensor_work(inplace_tensor_list, work)
+
+    return inplace_tensor_list
+
+
+def _all_gather_into_tensor(shard, tag, ranks, group_size):
+    # TODO add dim support?
+    group = c10d._find_or_create_pg_by_ranks_and_tag(tag, ranks, group_size)
+    assert group is not None
+    out_size = list(shard.size())
+    out_size[0] *= group_size
+    out_tensor = shard.new_empty(out_size)
+    assert out_tensor.is_contiguous()
+    # FIXME gloo doesn't support _allgather_base
+    if dist.get_backend(group) == dist.Backend.GLOO or shard.is_cpu:
+        tensor_list = list(torch.chunk(out_tensor, group_size))
+        work = dist.all_gather(tensor_list, shard, group=group, async_op=True)
+    else:
+        work = dist.all_gather_into_tensor(
+            out_tensor, shard, group=group, async_op=True
+        )
+    _register_tensor_work(out_tensor, work)
+
+    return out_tensor
+
+
+def _all_gather_into_tensor_coalesced(self, tag, rankset, group_size):
+    group = c10d._find_or_create_pg_by_ranks_and_tag(tag, rankset, group_size)
+    assert group is not None
+
+    def mk_out_tensor(shard):
+        out_size = list(shard.size())
+        out_size[0] *= group_size
+        out_tensor = shard.new_empty(out_size)
+        assert out_tensor.is_contiguous()
+        return out_tensor
+
+    out_tensors = [mk_out_tensor(t) for t in self]
+
+    work_list = _all_gather_into_tensor_coalesced_fallback(
+        output_tensors=out_tensors, input_tensors=self, group=group, async_op=True
+    )
+
+    _register_tensor_work(out_tensors, work_list)
+    return out_tensors
+
+
+def _reduce_scatter_tensor(
+    input: torch.Tensor,
+    reduceOp: str,
+    tag: str,
+    ranks: List[int],
+    group_size: int,
+):
+    # TODO add dim support?
+    group = c10d._find_or_create_pg_by_ranks_and_tag(tag, ranks, group_size)
+    assert group is not None
+    op = _str_to_reduce_op(reduceOp)
+
+    if dist.get_backend(group) == dist.Backend.GLOO or input.is_cpu:
+        # The cpu::gloo backend does not have reduce_scatter, so we fall back to all_reduce
+        # + a local chunk
+        logger.warning(
+            "ProcessGroupGloo does not support reduce_scatter, falling back with all reduce!"
+        )
+        reduction_input = input.clone()
+        group_rank = dist.get_rank(group)
+        work = dist.all_reduce(reduction_input, op=op, group=group, async_op=True)
+        out_tensor = reduction_input.chunk(group_size, dim=0)[group_rank]
+        _register_tensor_work(out_tensor, work)
+    else:
+        out_size = list(input.size())
+        out_size[0] //= group_size
+        out_tensor = input.new_empty(out_size)
+        work = dist.reduce_scatter_tensor(
+            out_tensor, input, op=op, group=group, async_op=True
+        )
+        _register_tensor_work(out_tensor, work)
+
+    return out_tensor
+
+
+def _reduce_scatter_tensor_coalesced(
+    inputs: List[torch.Tensor],
+    reduce_op: str,
+    tag: str,
+    ranks: List[int],
+    group_size: int,
+):
+    group = c10d._find_or_create_pg_by_ranks_and_tag(tag, ranks, group_size)
+    assert group is not None
+    op = _str_to_reduce_op(reduce_op)
+
+    def mk_out_tensor(shard):
+        out_size = list(shard.size())
+        out_size[0] //= group_size
+        out_tensor = shard.new_empty(out_size)
+        assert out_tensor.is_contiguous()
+        return out_tensor
+
+    out_tensors = [mk_out_tensor(t) for t in inputs]
+
+    work_list = _reduce_scatter_tensor_coalesced_fallback(
+        output_tensors=out_tensors,
+        input_tensors=inputs,
+        op=op,
+        group=group,
+        async_op=False,
+    )
+
+    _register_tensor_work(out_tensors, work_list)
+    return out_tensors
+
+
+def _all_gather_into_tensor_coalesced_fallback(
+    output_tensors, input_tensors, group, async_op=False
+):
+    # all_gather_coalesced is useless: it doesn't work under NCCL and does lots of copies under Gloo.
+    # all_gather is useless too because it only takes a single tensor.
+    # NCCL's PG::all_gather with multiple tensors is broken, it only works for the multi-device setting
+    #  and fails if you mix same-size with different-size tensor lists.
+    # _coalescing_manager crashed NCCL when used with all_gather_into_tensor.
+    if input_tensors[0].is_cpu or not async_op:
+        work_list = []
+        out_tensors_sliced = [
+            list(torch.chunk(out_tensor, dist.get_world_size(group)))
+            for out_tensor in output_tensors
+        ]
+        for shard, out_tensor in zip(input_tensors, out_tensors_sliced):
+            work = c10d.all_gather(out_tensor, shard, group=group, async_op=async_op)
+            work_list.append(work)
+        return work_list
+    else:
+        with c10d._coalescing_manager(group=group, async_ops=True) as cm:
+            for in_t, out_t in zip(input_tensors, output_tensors):
+                dist.all_gather_into_tensor(out_t, in_t, group=group, async_op=True)
+        return cm
+
+
+def _reduce_scatter_tensor_coalesced_fallback(
+    output_tensors, input_tensors, op, group, async_op=False
+):
+    # All the same reasons as the all_gather fallback
+    work_list = []
+    for shard, out_tensor in zip(input_tensors, output_tensors):
+        work = c10d.reduce_scatter_tensor(
+            out_tensor, shard, op=op, group=group, async_op=async_op
+        )
+        work_list.append(work)
+    return work_list
+
+
+def _all_to_all_single(
+    input: torch.Tensor,
+    output_split_sizes: Optional[List[int]],
+    input_split_sizes: Optional[List[int]],
+    tag: str,
+    ranks: List[int],
+    group_size: int,
+):
+    group = c10d._find_or_create_pg_by_ranks_and_tag(tag, ranks, group_size)
+
+    if output_split_sizes is not None:
+        torch._check(
+            input.dim() >= 1,
+            lambda: f"Expected input to have at least 1 dim but got {input.dim()} dim",
+        )
+        out_size = list(input.size())
+        out_size[0] = sum(output_split_sizes)
+        out_tensor = input.new_empty(out_size)
+    else:
+        out_tensor = input.new_empty(input.size())
+
+    work = c10d.all_to_all_single(
+        out_tensor,
+        input,
+        output_split_sizes=output_split_sizes,
+        input_split_sizes=input_split_sizes,
+        group=group,
+        async_op=True,
+    )
+    _register_tensor_work(out_tensor, work)
+
+    return out_tensor
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/__init__.py b/MLPY/Lib/site-packages/torch/distributed/_shard/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..76091e1460e37564ff79fdd09869fcb09b498741
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/__init__.py
@@ -0,0 +1,6 @@
+from .api import (
+    _shard_tensor,
+    load_with_process_group,
+    shard_module,
+    shard_parameter,
+)
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a11d4156d02c2611a48ffbcdca9b9f85170f14e2
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/__pycache__/_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/__pycache__/_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d2828a73b8dd5afc285b5842f57e69719198636c
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/__pycache__/_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/__pycache__/api.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/__pycache__/api.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e7f8aaa59fc41145ef5f812e79dfcf259ddd9bdc
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/__pycache__/api.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/__pycache__/common_op_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/__pycache__/common_op_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..92f6eba7bd40dc6d9f842cbf63074d9f18d38b7e
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/__pycache__/common_op_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/__pycache__/metadata.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/__pycache__/metadata.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e1ddda13b7bf9106a9b8c4ef069173a9bbb05fbd
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/__pycache__/metadata.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/__pycache__/op_registry_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/__pycache__/op_registry_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..00aa3fbddb11c963f289169c88c4b46326a7ce0c
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/__pycache__/op_registry_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/__pycache__/sharder.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/__pycache__/sharder.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1c07e800c7b956ca727b1d88fe267d1401a6003a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/__pycache__/sharder.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/_utils.py b/MLPY/Lib/site-packages/torch/distributed/_shard/_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..1aedb7c8e6f2d4fa0d33dd79a3efddfe35c2ebfd
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/_utils.py
@@ -0,0 +1,28 @@
+import torch
+from torch.distributed._shard.metadata import ShardMetadata
+from typing import Sequence
+
+DEPRECATE_MSG = "Please use DTensor instead and we are deprecating ShardedTensor."
+
+def narrow_tensor_by_index(tensor: torch.Tensor, offsets: Sequence[int], sizes: Sequence[int]) -> torch.Tensor:
+    """
+    Narrow the tensor according to ``offsets`` and ``sizes``.
+    """
+    narrowed_tensor = tensor
+    for idx, (offset, size) in enumerate(zip(offsets, sizes)):
+        if size < tensor.size(idx):
+            # Narrow to get the shard for this rank. We don't want autograd to
+            # record the narrow op here, and 'local_shard' should be a leaf
+            # variable in the autograd graph.
+            narrowed_tensor = narrowed_tensor.narrow(
+                idx,
+                offset,
+                size
+            )
+    return narrowed_tensor
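+
+# Illustrative sketch (not part of the library): narrowing a 4x4 tensor down to
+# the 2x2 block starting at row 2, column 0; the helper name is hypothetical.
+def _example_narrow_by_index() -> torch.Tensor:
+    full = torch.arange(16).reshape(4, 4)
+    # offsets/sizes are given per dimension: rows [2, 4), columns [0, 2).
+    shard = narrow_tensor_by_index(full, offsets=(2, 0), sizes=(2, 2))
+    assert shard.shape == (2, 2)
+    return shard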
+
+def narrow_tensor(tensor: torch.Tensor, metadata: ShardMetadata) -> torch.Tensor:
+    """
+    Narrow the tensor according to the metadata.
+    """
+    return narrow_tensor_by_index(tensor, metadata.shard_offsets, metadata.shard_sizes)
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/api.py b/MLPY/Lib/site-packages/torch/distributed/_shard/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..6867cd0bd5b8a3cb1734314c4051d3b82b7e68c7
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/api.py
@@ -0,0 +1,290 @@
+from contextlib import contextmanager
+from typing import Optional
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from torch.distributed import distributed_c10d
+from torch.distributed._shard.sharded_tensor import (
+    ShardedTensor,
+)
+from .sharding_spec import (
+    ShardingSpec,
+    ChunkShardingSpec
+)
+from .sharding_plan import (
+    ShardingPlan
+)
+from .sharder import Sharder
+
+def _shard_tensor(
+    tensor: torch.Tensor, sharding_spec: ShardingSpec, src_rank=0, process_group=None
+) -> ShardedTensor:
+    """
+    Given a :class:`torch.Tensor`, it shards that tensor according to the provided
+    ``sharding_spec``. ``src_rank`` denotes the source rank which would be
+    used as the ground truth of the data which would be scattered as shards
+    across the rest of the ranks.
+
+    Args:
+        tensor (:class:`torch.Tensor`): Tensor needs to be sharded.
+        sharding_spec (:class:`torch.distributed._shard.sharding_spec.ShardingSpec`): The specification
+            describing how to shard the Tensor.
+
+    Keyword args:
+        src_rank (int, optional): The source rank which is used as the ground truth of
+            the data for the parameter that would be sharded and scattered
+            across the rest of the ranks.
+            Default: 0.
+        process_group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+
+    Returns:
+        A :class:`ShardedTensor` sharded from the given tensor.
+
+    .. warning::
+        Only :class:`torch.distributed._shard.sharding_spec.ChunkShardingSpec` is
+        currently supported as the ``sharding_spec``.
+    """
+    if not tensor.is_contiguous():
+        raise ValueError('input tensor is not a contiguous Tensor')
+
+    pg = process_group if process_group is not None else distributed_c10d._get_default_group()
+    world_size = dist.get_world_size(pg)
+    current_rank = dist.get_rank(pg)
+
+    # Validate src_rank and sharding_spec are same across all ranks.
+    gathered_list = [None] * world_size
+    dist.all_gather_object(gathered_list, (src_rank, sharding_spec), group=pg)
+
+    for idx, entry in enumerate(gathered_list):
+        if src_rank != entry[0]:  # type: ignore[index]
+            raise ValueError(
+                f'src_rank={src_rank} on rank: {current_rank} does not '  # type: ignore[index]
+                f'match with src_rank={entry[0]} on rank: {idx}')
+        if sharding_spec != entry[1]:  # type: ignore[index]
+            raise ValueError(
+                f'sharding_spec={sharding_spec} on rank: {current_rank} does not '  # type: ignore[index]
+                f'match with sharding_spec={entry[1]} on rank: {idx}')
+
+    st = sharding_spec.shard(tensor, src_rank=src_rank, process_group=process_group)
+
+    return st
+
+def shard_parameter(
+        module: torch.nn.Module,
+        param_name: str,
+        sharding_spec: ShardingSpec,
+        src_rank=0,
+        process_group=None):
+    """
+    Given a :class:`torch.nn.Module` and a ``param_name`` for a parameter in that
+    module, it shards that parameter according to the provided
+    ``sharding_spec``. ``src_rank`` denotes the source rank which would be
+    used as the ground truth of the data which would be scattered as shards
+    across the rest of the ranks.
+
+    This method replaces ``module.param_name`` with a
+    :class:`torch.distributed._shard.sharded_tensor.ShardedTensor`.
+
+    Args:
+        module (:class:`torch.nn.Module`): Module whose parameter needs to be sharded.
+        param_name (str): Name of the parameter of ``module`` that needs to be sharded.
+        sharding_spec (:class:`torch.distributed._shard.sharding_spec.ShardingSpec`): The specification
+            describing how to shard the Tensor.
+
+    Keyword args:
+        src_rank (int, optional): The source rank which is used as the ground truth of
+            the data for the parameter that would be sharded and scattered
+            across the rest of the ranks.
+            Default: 0.
+        process_group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+
+    .. warning::
+        Only :class:`torch.distributed._shard.sharding_spec.ChunkShardingSpec` is
+        currently supported as the ``sharding_spec``.
+    """
+    # Perform some validation first.
+    if not hasattr(module, param_name):
+        raise AttributeError(f'{module._get_name()} has no attribute `{param_name}`')
+
+    tensor = getattr(module, param_name)
+    if not isinstance(tensor, torch.Tensor):
+        raise ValueError(f'Expected {type(module).__name__}.{param_name} to be a Tensor, but found {type(tensor).__name__}')
+
+    if not tensor.is_contiguous():
+        raise ValueError(f'param: {param_name} is not a contiguous Tensor')
+
+    st = _shard_tensor(tensor, sharding_spec, src_rank, process_group)
+
+    # Replace param with ShardedTensor.
+    module.register_parameter(param_name, nn.Parameter(st))
+
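+# A minimal usage sketch for ``shard_parameter``, assuming the default process
+# group is already initialized with two ranks; ``fc`` and the placement strings
+# are illustrative only:
+#
+#     from torch.distributed._shard.sharding_spec import ChunkShardingSpec
+#
+#     spec = ChunkShardingSpec(
+#         dim=0,
+#         placements=["rank:0/cuda:0", "rank:1/cuda:1"],
+#     )
+#     fc = torch.nn.Linear(16, 16).cuda()
+#     shard_parameter(fc, "weight", spec)  # fc.weight is now a ShardedTensor
+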
+# Tracks the current process group in the load context manager.
+_CURRENT_PROCESS_GROUP: Optional[dist.ProcessGroup] = None
+
+@contextmanager
+def load_with_process_group(process_group):
+    """
+    Context manager to set the process group with which to load a ShardedTensor.
+    """
+    global _CURRENT_PROCESS_GROUP
+    if _CURRENT_PROCESS_GROUP is not None:
+        raise RuntimeError(
+            'ProcessGroup already set by previous "load_with_process_group" '
+            'context manager')
+    _CURRENT_PROCESS_GROUP = process_group
+    try:
+        yield process_group
+    finally:
+        _CURRENT_PROCESS_GROUP = None
+
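+# A minimal usage sketch, assuming ``pg`` is a ProcessGroup created by the
+# caller and ``path`` points to a checkpoint containing ShardedTensors; both
+# names are illustrative only:
+#
+#     with load_with_process_group(pg):
+#         state_dict = torch.load(path)  # ShardedTensors are attached to ``pg``
+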
+def _get_current_process_group():
+    """
+    Retrieves the current process group set by ``load_with_process_group``.
+    If not set, it just returns the default group.
+    """
+    global _CURRENT_PROCESS_GROUP
+    if _CURRENT_PROCESS_GROUP is None:
+        return distributed_c10d._get_default_group()
+    else:
+        return _CURRENT_PROCESS_GROUP
+
+def _reshard_output(
+        module: torch.nn.Module,
+        resharding_spec: ShardingSpec) -> torch.nn.Module:
+    """
+    Hook a module with output resharding in the forward pass according
+    to the given ``resharding_spec``.
+
+    Args:
+        module (:class:`torch.nn.Module`): Module whose output needs to be resharded.
+        resharding_spec (:class:`torch.distributed._shard.sharding_spec.ShardingSpec`):
+            The specification describing how the output of the module will be resharded.
+
+    Returns:
+        A :class:`torch.nn.Module` object with reshard API hooked.
+    """
+    def hook_func(_module, _input, output):
+        if isinstance(output, ShardedTensor):
+            return output.reshard(resharding_spec)
+        return output
+    module.register_forward_hook(hook_func)
+    return module
+
+def _collect_local_shard(module: torch.nn.Module) -> torch.nn.Module:
+    """
+    Hook a module with local shards collection in the forward pass.
+
+    This API is typically used to convert a sharded representation back to data parallel
+    representation. In particular, it returns the local tensor for this Shard. If the
+    size along the sharding dimension for the local tensor is 1, this dimension is removed
+    from the final result. For example a [4, 16] ShardedTensor across 4 ranks is typically
+    a local Tensor of size [16] across each rank and not [1, 16] across each rank.
+
+    Args:
+        module (:class:`torch.nn.Module`): Module whose output is ShardedTensor and the
+            local tensor value needs to be returned.
+
+    Returns:
+        A :class:`torch.nn.Module` object with collection API hooked.
+    """
+
+    def hook_func(_module, _input, output):
+        if isinstance(output, ShardedTensor):
+            local_tensor = output.local_tensor()
+            # Squeeze the # of dimensions manually, only applicable to ChunkShardingSpec
+            sharding_spec = output._sharding_spec
+            if isinstance(sharding_spec, ChunkShardingSpec) \
+               and local_tensor.size(sharding_spec.dim) == 1:  # type: ignore[attr-defined, arg-type]
+                local_tensor = local_tensor.squeeze(
+                    output._sharding_spec.dim  # type: ignore[attr-defined]
+                )
+            return local_tensor
+    module.register_forward_hook(hook_func)
+    return module
+
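+# A minimal wiring sketch for the two hooks above, assuming ``submodule``
+# returns a ShardedTensor from its forward pass and ``reshard_spec`` is a
+# ChunkShardingSpec; both names are illustrative. ``shard_module`` below applies
+# the same hooks when driven by a ShardingPlan:
+#
+#     _reshard_output(submodule, reshard_spec)
+#     _collect_local_shard(submodule)  # forward() now returns a local Tensor
+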
+def shard_module(
+    module: nn.Module,
+    plan: ShardingPlan,
+    src_rank=0,
+    process_group=None
+):
+    """
+    Shards a given module according to the provided sharding `plan`. This method
+    first shards all the parameters according to the given sharding `plan`. Then if
+    `output_plan` and `return_local_tensor` are specified in the sharding `plan`, it
+    will reshard the output of those modules according to `output_plan` and convert
+    the module's output back to data parallel according to `return_local_tensor`.
+
+    Needs to be called on all ranks in an SPMD fashion.
+
+    Args:
+        module (:class:`torch.nn.Module`): The module to apply sharding to
+        plan (:class:`torch.distributed._shard.sharding_plan.ShardingPlan`):
+            The ShardingPlan which specifies the param name to ShardingSpec mapping
+            to apply to each parameter.
+
+    Keyword args:
+        src_rank (int, optional): The source rank which is used as the ground truth of
+            the data for the module that would be sharded and scattered across the rest
+            of the ranks.
+            Default: 0.
+        process_group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+    """
+    # record Sharder paths for a sanity check on the plan, to ensure items in the plan
+    # do not conflict with the submodule tree that the Sharder is working with
+    sharder_paths = []
+    for name, spec in plan.plan.items():
+        if isinstance(spec, Sharder):
+            sharder_paths.append(name)
+
+    # shard the parameter according to the ShardingPlan
+    for name, spec in plan.plan.items():
+        if isinstance(spec, ShardingSpec):
+            # if found a sharding spec, try to shard the parameter
+            module_path, _, param_name = name.rpartition(".")
+
+            for sharder_path in sharder_paths:
+                if module_path.startswith(sharder_path):
+                    raise RuntimeError(f"ShardingPlan is in-valid, trying to shard a parameter: {name},"
+                                       f" but there's already a Sharder entry for module {sharder_path},"
+                                       f" parameter sharding should not conflict with the submodule tree"
+                                       f" that a Sharder is working with!")
+
+            mod = module.get_submodule(module_path)
+            shard_parameter(
+                mod,
+                param_name,
+                spec,
+                src_rank=src_rank,
+                process_group=process_group
+            )
+        elif isinstance(spec, Sharder):
+            parent_mod_path, _, mod_name = name.rpartition(".")
+            if name == "":
+                raise KeyError("Module path must not be empty for custom sharder!")
+            mod = module.get_submodule(name)
+            parent_mod = module.get_submodule(parent_mod_path)
+            sharded_mod = spec.shard(mod)
+            # swap this submodule with the sharded module
+            setattr(parent_mod, mod_name, sharded_mod)
+        else:
+            raise TypeError(f"Only `ShardingSpec` and `Sharder` are supported to shard '{name}'")
+
+    # reshard module outputs if there's an entry in the plan's `output_plan`
+    if plan.output_plan is not None:
+        for module_path, output_spec in plan.output_plan.items():
+            if isinstance(output_spec, ShardingSpec):
+                mod = module.get_submodule(module_path)
+                _reshard_output(mod, output_spec)
+            else:
+                raise TypeError(f"Only `ShardingSpec` is supported as output_plan for '{module_path}'")
+    # convert the output back to data parallel for the modules that appear in
+    # `return_local_tensor` of the plan; we call `_collect_local_shard`
+    # to collect the local tensor for the output of those modules
+    if plan.return_local_tensor is not None:
+        for module_path in plan.return_local_tensor:
+            mod = module.get_submodule(module_path)
+            _collect_local_shard(mod)
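+
+# A minimal end-to-end sketch for ``shard_module``, assuming two ranks and a
+# ``model`` with an ``fc`` submodule; the plan keys and placements are
+# illustrative only:
+#
+#     spec = ChunkShardingSpec(dim=0, placements=["rank:0/cuda:0", "rank:1/cuda:1"])
+#     plan = ShardingPlan(
+#         plan={"fc.weight": spec},
+#         output_plan={"fc": spec},
+#         return_local_tensor=["fc"],
+#     )
+#     shard_module(model, plan)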
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/checkpoint/__init__.py b/MLPY/Lib/site-packages/torch/distributed/_shard/checkpoint/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec64ad79bd2a9f35b7fcb82d285f2eef57ff1831
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/checkpoint/__init__.py
@@ -0,0 +1,12 @@
+# Keep the old package for BC purposes; this file should be removed once
+# everything moves to the `torch.distributed.checkpoint` package.
+import sys
+import torch
+import warnings
+
+from torch.distributed.checkpoint import *  # noqa: F403
+warnings.warn(
+    "torch.distributed._shard.checkpoint will be deprecated, use torch.distributed.checkpoint instead",
+    DeprecationWarning
+)
+sys.modules['torch.distributed._shard.checkpoint'] = torch.distributed.checkpoint
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/checkpoint/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/checkpoint/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cd852c86a58024a1ac5eda84b2adf01d8fa55907
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/checkpoint/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/common_op_utils.py b/MLPY/Lib/site-packages/torch/distributed/_shard/common_op_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..001c1f312224af64ed1d1ca2230990cec2eee08c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/common_op_utils.py
@@ -0,0 +1,61 @@
+import torch
+from torch.utils import _pytree as pytree
+from typing import Optional
+
+def _basic_validation(op, args=(), kwargs=None):
+    """
+    Common validation across all ops goes here.
+    """
+    from torch.distributed._shard.sharded_tensor import ShardedTensor
+
+    if len(args) == 0 and (kwargs is None or len(kwargs) == 0):
+        raise ValueError(f" No input for '{op.__name__}'!")
+
+    # Validate types
+    has_distributed_tensor = False
+
+    def is_distributed_tensor(e):
+        nonlocal has_distributed_tensor
+        if isinstance(e, ShardedTensor):
+            has_distributed_tensor = True
+
+    pytree.tree_map_(is_distributed_tensor, args)
+    pytree.tree_map_(is_distributed_tensor, kwargs)
+
+    if not has_distributed_tensor:
+        raise TypeError(
+            f"torch function '{op.__name__}', with args: {args} and "
+            f"kwargs: {kwargs} are called without any distributed tensor!"
+        )
+
+    # Validate all distributed tensors use the same PG.
+    cur_pg: Optional[torch.distributed.ProcessGroup] = None
+
+    def validate_pg(e):
+        nonlocal cur_pg
+        if isinstance(e, ShardedTensor):
+            if cur_pg is not None and e._process_group is not cur_pg:
+                raise RuntimeError(
+                    'All distributed tensors should use the '
+                    'same ProcessGroup if used together in an op.'
+                )
+            cur_pg = e._process_group
+
+    pytree.tree_map_(validate_pg, args)
+    pytree.tree_map_(validate_pg, kwargs)
+
+def _register_default_op(op, decorator):
+    @decorator(op)
+    def tensor_default_op(types, args=(), kwargs=None, pg=None):
+        """
+        Handles ``__torch_function__`` dispatch for the default tensor ops that
+        behave the same as ``torch.Tensor`` such as ``torch.Tensor.shape`` or
+        ``torch.Tensor.dtype``. We simply lower to the real op call with the
+        DisableTorchFunctionSubclass context, like ``torch.Tensor.__torch_function__``,
+        to avoid recursion.
+        """
+        if kwargs is None:
+            kwargs = {}
+
+        with torch._C.DisableTorchFunctionSubclass():
+            return op(*args, **kwargs)
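+
+# A minimal sketch of how ``_register_default_op`` is typically combined with a
+# registration decorator such as ``_sharded_op_impl`` from
+# ``torch.distributed._shard.sharded_tensor``; the op chosen here is
+# illustrative only:
+#
+#     from torch.distributed._shard.sharded_tensor import _sharded_op_impl
+#
+#     _register_default_op(torch.Tensor.contiguous, _sharded_op_impl)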
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/metadata.py b/MLPY/Lib/site-packages/torch/distributed/_shard/metadata.py
new file mode 100644
index 0000000000000000000000000000000000000000..057f5a06fd7211ab0c749c6deb7f9dce45790616
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/metadata.py
@@ -0,0 +1,61 @@
+from dataclasses import dataclass
+from typing import List, Union, Optional
+from functools import reduce
+
+from torch.distributed.remote_device import _remote_device
+
+@dataclass
+class ShardMetadata:
+    """
+    Represents a shard of the overall Tensor including its
+    offsets, lengths and device placement.
+
+    Args:
+        shard_offsets(List[int]): Offsets in the original tensor indicating
+            the start offsets for this shard. Should have the same rank as
+            the original tensor.
+        shard_sizes(List[int]): Integers indicating the size of each
+            dimension for this shard. Should have the same rank as the
+            original tensor.
+        placement(:class:`torch.distributed._remote_device`):
+            Specifies the placement of this shard.
+    """
+
+    __slots__ = ['shard_offsets', 'shard_sizes', 'placement']
+
+    shard_offsets: List[int]
+    shard_sizes: List[int]
+    placement: Optional[_remote_device]
+
+    def __init__(
+        self,
+        shard_offsets: List[int],
+        shard_sizes: List[int],
+        placement: Optional[Union[str, _remote_device]] = None
+    ):
+        self.shard_offsets = shard_offsets
+        self.shard_sizes = shard_sizes
+        if isinstance(placement, str):
+            self.placement = _remote_device(placement)
+        else:
+            self.placement = placement
+        if len(self.shard_offsets) != len(self.shard_sizes):
+            raise ValueError(
+                f'shard_offsets and shard_sizes should have '
+                f'the same number of elements, found {len(self.shard_offsets)} '
+                f'and {len(self.shard_sizes)} respectively')
+
+        for i in range(len(self.shard_offsets)):
+            if self.shard_offsets[i] < 0:
+                raise ValueError('shard_offsets should be >=0')
+            if self.shard_sizes[i] < 0:
+                raise ValueError('shard_sizes should be >= 0')
+
+    def __hash__(self):
+        def _hash_reduce(a, b):
+            return (a << 8) + hash(b)
+
+        res = reduce(_hash_reduce, self.shard_offsets, 37)
+        res = reduce(_hash_reduce, self.shard_sizes, res)
+        res = _hash_reduce(res, self.placement)
+        return res
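+
+# A minimal construction sketch: the offsets, sizes and placement below describe
+# one illustrative (5, 5) shard of a (10, 5) tensor placed on rank 0:
+#
+#     md = ShardMetadata(
+#         shard_offsets=[0, 0],
+#         shard_sizes=[5, 5],
+#         placement="rank:0/cuda:0",
+#     )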
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/op_registry_utils.py b/MLPY/Lib/site-packages/torch/distributed/_shard/op_registry_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..af4a17ecb722383d089a8d445ed14850a3292f3e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/op_registry_utils.py
@@ -0,0 +1,35 @@
+import functools
+from inspect import signature
+from .common_op_utils import _basic_validation
+
+"""
+Common utilities to register ops on ShardedTensor
+and PartialTensor.
+"""
+
+def _register_op(op, func, op_table):
+    """
+    Performs basic validation and registers the provided op in the given
+    op_table.
+    """
+    if len(signature(func).parameters) != 4:
+        raise TypeError(
+            f'Custom sharded op function expects signature: '
+            f'(types, args, kwargs, process_group), but received '
+            f'signature: {signature(func)}')
+
+    op_table[op] = func
+
+def _decorator_func(wrapped_func, op, op_table):
+    """
+    Decorator function to register the given ``op`` in the provided
+    ``op_table``
+    """
+
+    @functools.wraps(wrapped_func)
+    def wrapper(types, args, kwargs, process_group):
+        _basic_validation(op, args, kwargs)
+        return wrapped_func(types, args, kwargs, process_group)
+
+    _register_op(op, wrapper, op_table)
+    return wrapper
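+
+# A minimal sketch of how ``_decorator_func`` is bound into a registration
+# decorator via ``functools.partial`` (as done for the public decorators in
+# ``sharded_tensor/__init__.py``); ``_MY_OPS`` and the op are illustrative only:
+#
+#     import torch
+#
+#     _MY_OPS: dict = {}
+#
+#     @functools.partial(_decorator_func, op=torch.add, op_table=_MY_OPS)
+#     def sharded_add(types, args, kwargs, process_group):
+#         ...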
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_optim/__init__.py b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_optim/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..94d65d720c8b229037f983bf1ac8d175f19f6155
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_optim/__init__.py
@@ -0,0 +1,54 @@
+from typing import Iterator, Tuple, Union
+from .api import ShardedOptimizer
+
+import torch.nn as nn
+
+from torch.distributed._shard.sharded_tensor import (
+    ShardedTensor
+)
+
+def named_params_with_sharded_tensor(
+    module: nn.Module,
+    prefix: str = '',
+    recurse: bool = True,
+) -> Iterator[Tuple[str, Union[nn.Parameter, ShardedTensor]]]:
+
+    r"""Returns an iterator over module parameters (together with the
+    ShardedTensor parameters), yielding both the name of the parameter
+    as well as the parameter itself. This is typically passed to a
+    :class:`torch.distributed._shard.sharded_optim.ShardedOptimizer`.
+
+    Args:
+        module (nn.Module): Module whose parameters (and ShardedTensor
+            parameters) should be yielded.
+        prefix (str): prefix to prepend to all parameter names.
+        recurse (bool): if True, then yields parameters of this module
+            and all submodules. Otherwise, yields only parameters that
+            are direct members of this module.
+
+    Yields:
+        (str, Union[Tensor, ShardedTensor]): Tuple containing
+            the name and parameter (or ShardedTensor parameter)
+
+    Example::
+
+        >>> # xdoctest: +SKIP
+        >>> model = torch.nn.Linear(*linear_size)
+        >>> shard_parameter(model, "weight", spec)
+        >>> for name, param in named_params_with_sharded_tensor(model):
+        >>>    if name in ['weight']:
+        >>>        print(param.size())
+
+    """
+    modules = module.named_modules(prefix=prefix) if recurse else [(prefix, module)]
+
+    memo = set()
+    for mod_prefix, mod in modules:
+        # find all sharded tensor params
+        for name, val in vars(mod).items():
+            if isinstance(val, ShardedTensor) and val not in memo:
+                memo.add(val)
+                name = mod_prefix + ('.' if mod_prefix else '') + name
+                yield name, val
+
+    # find all nn.Parameters
+    for name, val in module.named_parameters():
+        yield name, val
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_optim/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_optim/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6d3dcc94d3d5230bf33e8a667de0b08fea10f2b5
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_optim/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_optim/__pycache__/api.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_optim/__pycache__/api.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2b1008d0e12d1df25b38df6a52d582de29b512d4
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_optim/__pycache__/api.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_optim/api.py b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_optim/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..e14be300e67888aceb78f3124d004e37deea489b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_optim/api.py
@@ -0,0 +1,97 @@
+from typing import List, Union, Mapping, Dict, Any
+
+import torch.optim as optim
+from torch import Tensor
+from torch.distributed._shard.sharded_tensor import ShardedTensor
+
+
+class ShardedOptimizer(optim.Optimizer):
+    def __init__(
+        self,
+        named_params: Mapping[str, Union[Tensor, ShardedTensor]],
+        optimizer_class,
+        *optimizer_args,
+        **optimizer_kwargs
+    ):
+        """
+        ShardedOptimizer collects all tensors and local shard tensors of
+        ShardedTensor, then uses these tensors as ``params`` for the optimizer.
+
+        Args:
+            named_params (Dict[str, Union[Tensor, ShardedTensor]]) : a Dict
+                of parameters, where key is the parameter key, value is either
+                Tensor or ShardedTensor parameter.
+            optimizer_class (torch.optim.Optimizer): the Optimizer to use
+                locally, e.g. torch.optim.SGD, torch.optim.Adagrad, etc.
+            *optimizer_args: the arguments to initialize the optimizer.
+            **optimizer_kwargs: the key-word arguments to initialize the optimizer.
+
+        """
+        tensors: List[Tensor] = []
+        for value in named_params.values():
+            if isinstance(value, ShardedTensor):
+                for local_shard in value.local_shards():
+                    tensors.append(local_shard.tensor)
+            else:
+                tensors.append(value)
+
+        self.named_params = named_params
+        self._optim = optimizer_class(tensors, *optimizer_args, **optimizer_kwargs)
+        self.param_groups = self._optim.param_groups
+        self.state = self._optim.state
+
+    def zero_grad(self, set_to_none: bool = True):  # type: ignore[override]
+        r"""Resets the gradients of all optimized :class:`torch.Tensor` s.
+
+        Args:
+            set_to_none (bool): instead of setting to zero, set the grads to None.
+                This will in general have a lower memory footprint, and can modestly improve performance.
+                However, it changes certain behaviors. For example:
+                1. When the user tries to access a gradient and perform manual ops on it,
+                a None attribute or a Tensor full of 0s will behave differently.
+                2. If the user requests ``zero_grad(set_to_none=True)`` followed by a backward pass, ``.grad``\ s
+                are guaranteed to be None for params that did not receive a gradient.
+                3. ``torch.optim`` optimizers have a different behavior if the gradient is 0 or None
+                (in one case it does the step with a gradient of 0 and in the other it skips
+                the step altogether).
+        """
+        self._optim.zero_grad(set_to_none)
+
+    def step(self, closure=None):
+        r"""Performs a single optimization step (parameter update).
+
+        Args:
+            closure (Callable): A closure that reevaluates the model and
+                returns the loss. Optional for most optimizers.
+
+        .. note::
+            Unless otherwise specified, this function should not modify the
+            ``.grad`` field of the parameters.
+        """
+        self._optim.step(closure)
+
+    def state_dict(self) -> Dict[str, Any]:
+        """
+        Returned state and param_groups will contain parameter keys
+        instead of parameter indices (as in torch.optim.Optimizer).
+        This allows for advanced functionality like optimizer re-sharding to be implemented.
+        """
+        # TODO: implement state_dict
+        raise NotImplementedError("ShardedOptimizer state_dict not implemented yet!")
+
+
+    def load_state_dict(self, state_dict: Mapping[str, Any]):
+        r"""Loads the ShardedOptimizer state.
+
+        Args:
+            state_dict (dict): ShardedOptimizer state. Should be an object returned
+                from a call to :meth:`state_dict`.
+        """
+        # TODO: implement load_state_dict
+        raise NotImplementedError("ShardedOptimizer load_state_dict not implemented yet!")
+
+    def add_param_group(self, param_group: Any):
+        r"""Add a new param group
+        """
+        # TODO: implement add_param_group
+        raise NotImplementedError("ShardedOptimizer add_param_group not implemented yet!")
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/__init__.py b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..183854860b6899c398c0aa082197c24c29764b4d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/__init__.py
@@ -0,0 +1,469 @@
+import functools
+from typing import List, TYPE_CHECKING
+
+import torch
+
+if TYPE_CHECKING:
+    from torch.distributed._shard.sharding_spec import ShardingSpec
+else:
+    ShardingSpec = "ShardingSpec"
+
+from .api import (
+    _CUSTOM_SHARDED_OPS,
+    _SHARDED_OPS,
+    Shard,
+    ShardedTensorBase,
+    ShardedTensor,
+    ShardedTensorMetadata,
+    TensorProperties,
+)
+from .metadata import ShardMetadata  # noqa: F401
+from torch.distributed._shard.op_registry_utils import _decorator_func
+
+
+def empty(sharding_spec: ShardingSpec,
+          *size,
+          dtype=None,
+          layout=torch.strided,
+          requires_grad=False,
+          pin_memory=False,
+          memory_format=torch.contiguous_format,
+          process_group=None,
+          init_rrefs=False) -> ShardedTensor:
+    """
+    Returns a :class:`ShardedTensor` filled with uninitialized data.
+        Needs to be called on all ranks in an SPMD fashion.
+
+    Args:
+        sharding_spec (:class:`torch.distributed._shard.sharding_spec.ShardingSpec`): The specification
+            describing how to shard the Tensor.
+        size (int...): a sequence of integers defining the shape of the output
+            tensor. Can be a variable number of arguments or a collection like a list or tuple.
+
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+        memory_format (:class:`torch.memory_format`, optional): the desired memory format of
+            returned Tensor. Default: ``torch.contiguous_format``.
+        process_group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+        init_rrefs (bool, optional): Whether or not to initialize
+            :class:`torch.distributed.rpc.RRef`s pointing to remote shards.
+            Need to initialize the RPC Framework if specified as ``True``.
+            Default: ``False``.
+
+    Returns:
+        A :class:`ShardedTensor` object on each rank
+    """
+    return ShardedTensor(
+        sharding_spec,
+        *size,
+        dtype=dtype,
+        layout=layout,
+        requires_grad=requires_grad,
+        pin_memory=pin_memory,
+        memory_format=memory_format,
+        process_group=process_group,
+        init_rrefs=init_rrefs,
+    )
+
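+# A minimal usage sketch for the creation ops in this module, assuming the
+# default process group is initialized with two ranks; the placements are
+# illustrative only:
+#
+#     from torch.distributed._shard.sharding_spec import ChunkShardingSpec
+#
+#     spec = ChunkShardingSpec(dim=0, placements=["rank:0/cuda:0", "rank:1/cuda:1"])
+#     st = empty(spec, 10, 5)      # uninitialized (10, 5) ShardedTensor
+#     st_ones = ones(spec, 10, 5)  # same layout, filled with 1
+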
+def ones(sharding_spec: ShardingSpec,
+         *size,
+         dtype=None,
+         layout=torch.strided,
+         requires_grad=False,
+         pin_memory=False,
+         memory_format=torch.contiguous_format,
+         process_group=None,
+         init_rrefs=False) -> ShardedTensor:
+    """
+    Returns a :class:`ShardedTensor` filled with the scalar value 1.
+        Needs to be called on all ranks in an SPMD fashion.
+
+    Args:
+        sharding_spec (:class:`torch.distributed._shard.sharding_spec.ShardingSpec`): The specification
+            describing how to shard the Tensor.
+        size (int...): a sequence of integers defining the shape of the output
+            tensor. Can be a variable number of arguments or a collection like a list or tuple.
+
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+        process_group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+        init_rrefs (bool, optional): Whether or not to initialize
+            :class:`torch.distributed.rpc.RRef`s pointing to remote shards.
+            Need to initialize the RPC Framework if specified as ``True``.
+            Default: ``False``.
+
+    Returns:
+        A :class:`ShardedTensor` object on each rank
+    """
+    return full(
+        sharding_spec,
+        size,
+        fill_value=1,
+        dtype=dtype,
+        layout=layout,
+        requires_grad=requires_grad,
+        pin_memory=pin_memory,
+        memory_format=memory_format,
+        process_group=process_group,
+        init_rrefs=init_rrefs
+    )
+
+def zeros(sharding_spec: ShardingSpec,
+          *size,
+          dtype=None,
+          layout=torch.strided,
+          requires_grad=False,
+          pin_memory=False,
+          memory_format=torch.contiguous_format,
+          process_group=None,
+          init_rrefs=False) -> ShardedTensor:
+    """
+    Returns a :class:`ShardedTensor` filled with the scalar value 0.
+        Needs to be called on all ranks in an SPMD fashion.
+
+    Args:
+        sharding_spec (:class:`torch.distributed._shard.sharding_spec.ShardingSpec`): The specification
+            describing how to shard the Tensor.
+        size (int...): a sequence of integers defining the shape of the output
+            tensor. Can be a variable number of arguments or a collection like a list or tuple.
+
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+        process_group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+        init_rrefs (bool, optional): Whether or not to initialize
+            :class:`torch.distributed.rpc.RRef`s pointing to remote shards.
+            Need to initialize the RPC Framework if specified as ``True``.
+            Default: ``False``.
+
+    Returns:
+        A :class:`ShardedTensor` object on each rank
+    """
+    return full(
+        sharding_spec,
+        size,
+        fill_value=0,
+        dtype=dtype,
+        layout=layout,
+        requires_grad=requires_grad,
+        pin_memory=pin_memory,
+        memory_format=memory_format,
+        process_group=process_group,
+        init_rrefs=init_rrefs
+    )
+
+def full(sharding_spec: ShardingSpec,
+         size,
+         fill_value,
+         *,
+         dtype=None,
+         layout=torch.strided,
+         requires_grad=False,
+         pin_memory=False,
+         memory_format=torch.contiguous_format,
+         process_group=None,
+         init_rrefs=False) -> ShardedTensor:
+    """
+    Creates a :class:`ShardedTensor` filled with fill_value. The tensor’s dtype
+        is inferred from fill_value. If dtype is specified, it will override the
+        inferred type from fill_value. Needs to be called on all ranks in an SPMD fashion.
+    Args:
+        sharding_spec (:class:`torch.distributed._shard.sharding_spec.ShardingSpec`): The specification
+            describing how to shard the Tensor.
+        size (int...): a list, tuple, or `torch.Size` of integers defining the shape of the
+            output tensor.
+        fill_value (Scalar): the value to fill the output tensor with.
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+        process_group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+        init_rrefs (bool, optional): Whether or not to initialize
+            :class:`torch.distributed.rpc.RRef`s pointing to remote shards.
+            Need to initialize the RPC Framework if specified as ``True``.
+            Default: ``False``.
+    Returns:
+        A :class:`ShardedTensor` object on each rank
+    """
+    sharded_tensor = ShardedTensor(
+        sharding_spec,
+        *size,
+        dtype=dtype,
+        layout=layout,
+        requires_grad=requires_grad,
+        pin_memory=pin_memory,
+        memory_format=memory_format,
+        process_group=process_group,
+        init_rrefs=init_rrefs,
+    )
+    torch.nn.init.constant_(sharded_tensor, fill_value)  # type: ignore[arg-type]
+    return sharded_tensor
+
+def rand(sharding_spec: ShardingSpec,
+         *size,
+         dtype=None,
+         layout=torch.strided,
+         requires_grad=False,
+         pin_memory=False,
+         memory_format=torch.contiguous_format,
+         process_group=None,
+         init_rrefs=False) -> ShardedTensor:
+    """
+    Creates a :class:`ShardedTensor` filled with random numbers from a uniform distribution
+        on the interval :math:`[0, 1)`. The shape of the tensor is defined by the
+        variable argument `size`. Needs to be called on all ranks in an SPMD fashion.
+
+    Args:
+        sharding_spec (:class:`torch.distributed._shard.sharding_spec.ShardingSpec`): The specification
+            describing how to shard the Tensor.
+        size (int...):  a list, tuple, or `torch.Size` of integers defining the shape of the
+            output tensor.
+
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+        process_group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+        init_rrefs (bool, optional): Whether or not to initialize
+            :class:`torch.distributed.rpc.RRef`s pointing to remote shards.
+            Need to initialize the RPC Framework if specified as ``True``.
+            Default: ``False``.
+
+    Returns:
+        A :class:`ShardedTensor` object on each rank
+    """
+    sharded_tensor = ShardedTensor(
+        sharding_spec,
+        *size,
+        dtype=dtype,
+        layout=layout,
+        requires_grad=requires_grad,
+        pin_memory=pin_memory,
+        memory_format=memory_format,
+        process_group=process_group,
+        init_rrefs=init_rrefs,
+    )
+    torch.nn.init.uniform_(sharded_tensor, 0, 1)  # type: ignore[arg-type]
+    return sharded_tensor
+
+def randn(sharding_spec: ShardingSpec,
+          *size,
+          dtype=None,
+          layout=torch.strided,
+          requires_grad=False,
+          pin_memory=False,
+          memory_format=torch.contiguous_format,
+          process_group=None,
+          init_rrefs=False) -> ShardedTensor:
+    """
+    Creates a :class:`ShardedTensor` filled with random numbers from a normal distribution
+        with mean `0` and variance `1` (also called the standard normal distribution). The shape
+        of the tensor is defined by the variable argument `size`. Needs to be called on all ranks
+        in an SPMD fashion.
+
+    Args:
+        sharding_spec (:class:`torch.distributed._shard.sharding_spec.ShardingSpec`): The specification
+            describing how to shard the Tensor.
+        size (int...):  a list, tuple, or `torch.Size` of integers defining the shape of the
+            output tensor.
+
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+        process_group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+        init_rrefs (bool, optional): Whether or not to initialize
+            :class:`torch.distributed.rpc.RRef`s pointing to remote shards.
+            Need to initialize the RPC Framework if specified as ``True``.
+            Default: ``False``.
+
+    Returns:
+        A :class:`ShardedTensor` object on each rank
+    """
+    sharded_tensor = ShardedTensor(
+        sharding_spec,
+        *size,
+        dtype=dtype,
+        layout=layout,
+        requires_grad=requires_grad,
+        pin_memory=pin_memory,
+        memory_format=memory_format,
+        process_group=process_group,
+        init_rrefs=init_rrefs,
+    )
+    torch.nn.init.normal_(sharded_tensor, 0, 1)  # type: ignore[arg-type]
+    return sharded_tensor
+
+def init_from_local_shards(
+        local_shards: List[Shard],
+        *global_size,
+        process_group=None,
+        init_rrefs=False) -> ShardedTensor:
+    """
+    Creates a :class:`ShardedTensor` from local shards and the overall global size.
+    Needs to be called on all ranks in an SPMD fashion.
+
+    Args:
+        local_shards (List[:class `torch.distributed._shard.sharded_tensor.Shard`]): A list
+            of shards that represent the local shards on this rank.
+        global_size (int...):  a list, tuple, or `torch.Size` of integers defining the
+            shape of the overall sharded tensor.
+
+    Keyword args:
+        process_group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+        init_rrefs (bool, optional): Whether or not to initialize
+            :class:`torch.distributed.rpc.RRef`s pointing to remote shards.
+            Need to initialize the RPC Framework if specified as ``True``.
+            Default: ``False``.
+
+    Returns:
+        A :class:`ShardedTensor` object handle on this rank
+
+
+    Examples:
+        Suppose we want to construct a sharded tensor on two ranks, with global size (10, 5)
+        and a (5, 5) local tensor on each rank; we can do it as below:
+
+        on rank 0:
+        >>> # xdoctest: +SKIP("not distributed")
+        >>> local_shard_metadata = ShardMetadata(
+        >>>     shard_offsets=[0, 0],
+        >>>     shard_sizes=[5, 5],
+        >>>     placement="rank:0/cuda:0"
+        >>> )
+        >>> local_shards = [Shard(torch.randn(5, 5), local_shard_metadata)]
+        >>> sharded_tensor = init_from_local_shards(local_shards, [10, 5])
+
+        on rank 1:
+        >>> # xdoctest: +SKIP("not distributed")
+        >>> local_shard_metadata = ShardMetadata(
+        >>>     shard_offsets=[5, 0],
+        >>>     shard_sizes=[5, 5],
+        >>>     placement="rank:1/cuda:1"
+        >>> )
+        >>> local_shards = [Shard(torch.randn(5, 5), local_shard_metadata)]
+        >>> sharded_tensor = init_from_local_shards(local_shards, [10, 5])
+    """
+    return ShardedTensor._init_from_local_shards(
+        local_shards,
+        *global_size,
+        process_group=process_group,
+        init_rrefs=init_rrefs
+    )
+
+def state_dict_hook(module, destination, prefix, local_metadata):
+    """
+    Hook to add ShardedTensor to Module's ``state_dict``. Needs to be
+    registered to the Module using
+    :meth:`torch.nn.Module._register_state_dict_hook`.
+    """
+    for submodule_name, submodule in module.named_modules():
+        for attr_name, attr in submodule.__dict__.items():
+            if isinstance(attr, ShardedTensor):
+                mod_prefix = prefix + submodule_name
+                key = mod_prefix + ('.' if mod_prefix else '') + attr_name
+                destination[key] = attr
+
+def pre_load_state_dict_hook(module, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
+    """
+    Pre-load state dict hook to add ShardedTensor to the module.
+    """
+    for submodule_name, submodule in module.named_modules():
+        for attr_name in submodule.__dict__.keys():
+            mod_prefix = prefix + submodule_name
+            key = mod_prefix + ('.' if mod_prefix else '') + attr_name
+            if key in state_dict:
+                if isinstance(state_dict[key], ShardedTensor):
+                    setattr(submodule, attr_name, state_dict[key])
+
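+# A minimal wiring sketch for the two hooks above; ``model`` is illustrative and
+# the ``_register_*`` helpers are private ``nn.Module`` methods, so this is an
+# assumption about their current signatures rather than a stable API:
+#
+#     model._register_state_dict_hook(state_dict_hook)
+#     model._register_load_state_dict_pre_hook(pre_load_state_dict_hook, with_module=True)
+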
+def custom_sharded_op_impl(func):
+    """
+    Provides a way for users to write their own custom sharded operator. This
+    can be used to override existing ShardedTensor operators or write a new
+    one not supported by ShardedTensor. If the operator in question is covered
+    by ``__torch_function__`` dispatch and has a ShardedTensor as any of its
+    parameters, the function provided will be invoked for that operator.
+
+    Example::
+        >>> # xdoctest: +SKIP
+        >>> @custom_sharded_op_impl(torch.nn.functional.linear)
+        >>> def my_custom_sharded_linear(types, args, kwargs, process_group):
+        >>>     ...
+        >>> # xdoctest: +SKIP("Undefined variables")
+        >>> input = torch.rand(10, 32)
+        >>> weight = sharded_tensor.rand(32, 16)
+        >>> bias = torch.rand(16)
+        >>> # This will call 'my_custom_sharded_linear'
+        >>> torch.nn.functional.linear(input, weight, bias)
+
+    The types, args and kwargs parameters are the same parameters that are
+    passed to the ``__torch_function__`` dispatch API
+    (https://pytorch.org/docs/stable/notes/extending.html#extending-torch).
+    There is an additional ``process_group`` parameter which is the
+    process_group used for the ShardedTensor and can be used by
+    implementations for communications within a sharded implementation.
+
+    Args:
+        func(Callable): Torch function for which we want to provide a sharded
+            implementation (ex: torch.nn.functional.linear)
+    """
+    return functools.partial(
+        _decorator_func,
+        op=func,
+        op_table=_CUSTOM_SHARDED_OPS
+    )
+
+def _sharded_op_impl(func):
+    """
+    Decorator to register a default sharded op.
+    """
+    return functools.partial(
+        _decorator_func,
+        op=func,
+        op_table=_SHARDED_OPS
+    )
+
+# Import all builtin sharded ops
+from ._ops import *  # noqa: F403
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1c605dc866f3baf727b34be264e1e083cba7f309
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/__pycache__/api.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/__pycache__/api.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b37316e2b1f0720ef634aa67634376b38a49ee28
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/__pycache__/api.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/__pycache__/logger.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/__pycache__/logger.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5088979651b8348215a5e61cd8e397719b4bc870
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/__pycache__/logger.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/__pycache__/logging_handlers.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/__pycache__/logging_handlers.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a926eaf91a0c857eeb46c87c16979e8845e1bc1b
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/__pycache__/logging_handlers.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/__pycache__/metadata.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/__pycache__/metadata.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e31044bd80026f8934bbcf2e22384b47b178184c
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/__pycache__/metadata.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/__pycache__/reshard.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/__pycache__/reshard.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cc13fdf4927482a6b41284775ad92e86c783045f
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/__pycache__/reshard.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/__pycache__/shard.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/__pycache__/shard.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8b5fc69baf094b5d5bf772c17b7f00a4323a6385
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/__pycache__/shard.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..70c89a491bd6137aa2548c76bd39d5f80dc2c0cf
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/__pycache__/utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/__init__.py b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a994f8e214a17e29349c94e7c823977e5d0962b6
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/__init__.py
@@ -0,0 +1,9 @@
+import torch.distributed._shard.sharded_tensor._ops.misc_ops
+import torch.distributed._shard.sharded_tensor._ops.tensor_ops
+
+from .binary_cmp import equal, allclose
+from .init import kaiming_uniform_, normal_, uniform_, constant_
+
+# Import all ChunkShardingSpec ops
+from torch.distributed._shard.sharding_spec.chunk_sharding_spec_ops.embedding import sharded_embedding
+from torch.distributed._shard.sharding_spec.chunk_sharding_spec_ops.embedding_bag import sharded_embedding_bag
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..57293da37571eeaf444b1096816891f2b1aacfb1
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/__pycache__/_common.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/__pycache__/_common.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c3eff78092a9e142b3a4d6273f86e14693f33809
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/__pycache__/_common.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/__pycache__/binary_cmp.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/__pycache__/binary_cmp.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6ca60197de7ceae3f09a1814d94acf5aeab0b971
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/__pycache__/binary_cmp.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/__pycache__/init.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/__pycache__/init.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e395827cb657597984fe1a47413a390ff4bbc562
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/__pycache__/init.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/__pycache__/misc_ops.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/__pycache__/misc_ops.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5138841b244eda2ba1b33bbf38a538ce305bd90d
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/__pycache__/misc_ops.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/__pycache__/tensor_ops.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/__pycache__/tensor_ops.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d2cbbb5a40da7578c5fc24f0b5e70daa07b52738
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/__pycache__/tensor_ops.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/_common.py b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/_common.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d094dda16c6aee25c4c498ba217fb3dc76b0342
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/_common.py
@@ -0,0 +1,107 @@
+import functools
+from torch.distributed._shard.sharded_tensor import (
+    _sharded_op_impl,
+    Shard,
+    ShardedTensor,
+)
+from torch.distributed._shard.common_op_utils import _basic_validation
+
+def _sharded_op_common(op, early_stop_func, extra_check):
+    """
+    Inject sharded tensor op registration with common logic executed before
+    the different behaviors are applied to either local shards or a local tensor.
+
+    Example::
+        >>> # xdoctest: +SKIP("Undefined variables")
+        >>> op = torch.transpose
+        >>> @_sharded_op_impl(op)
+        >>> @_sharded_op_common(op, early_stop_func, extra_check)
+        >>> def sharded_tensor_op(types, args, kwargs, process_group):
+        >>>   ...
+        >>>
+        >>> st = sharded_tensor.rand(32, 16)
+        >>> st.transpose(1, 2)
+        >>> # This will call '_sharded_op_common'
+
+    Args:
+        op: The op to be registered and applied to all shards of the st.
+        early_stop_func (Callable, optional): the func for early stop.
+            Default: if ``None``, no early stop.
+        extra_check (Callable, optional): the func for extra condition check.
+            Default: if ``None``, no extra check.
+
+    Return:
+        decorator (Callable): A decorator which wraps a sharded op implementation
+            with the common validation, extra check and early-stop logic above.
+    """
+    def decorator_sharded_func(wrapped_func):
+        @functools.wraps(wrapped_func)
+        def wrapper(types, args=(), kwargs=None, pg=None):
+            _basic_validation(op, args, kwargs)
+
+            st = args[0]
+            if kwargs is None:
+                kwargs = {}
+            if extra_check:
+                extra_check(*args, **kwargs)
+            if early_stop_func:
+                early_stop = early_stop_func(*args, **kwargs)
+                if early_stop:
+                    return st
+            return wrapped_func(types, args, kwargs, pg)
+
+        return wrapper
+
+    return decorator_sharded_func
+
+def _register_sharded_op_on_local_shards(
+    op, early_stop_func=None, extra_check=None, customized_func=None
+):
+    """
+    Handles ``__torch_function__`` dispatch for ops which are performed on
+    each shard of the sharded tensor, such as elementwise ops like
+    ``torch.nn.functional.gelu`` or ``torch.nn.functional.relu``.
+
+    For more complicated ops, a customized func can be used to generate
+    the new shards and sharded tensor size.
+
+    This function expects that the original ShardingSpec for the ShardedTensor
+    is preserved irrespective of whether or not a customized function is used.
+
+    Args:
+        op: The op to be registered and applied to all shards of the st.
+        early_stop_func (Callable, optional): the func for early stop.
+            Default: if ``None``, no early stop.
+        extra_check (Callable, optional): the func for extra condition check.
+            Default: if ``None``, no extra check.
+        customized_func (Callable, optional): the func for customized logic
+            to generate new shards and sharded tensor size.
+            Default: if ``None``, we simply lower to the real op call with
+                all local shards of the st.
+
+    Return:
+        None. The sharded op implementation is registered for
+        ``__torch_function__`` dispatch as a side effect.
+    """
+    @_sharded_op_impl(op)
+    @_sharded_op_common(op, early_stop_func, extra_check)
+    def sharded_tensor_op_on_local_shards(types, args=(), kwargs=None, pg=None):
+        st = args[0]
+        st_metadata = st.metadata()
+        local_shards = st.local_shards()
+        local_shards_new = []
+        if customized_func:
+            local_shards_new, st_metadata = customized_func(args, kwargs, pg)
+        else:
+            for local_shard in local_shards:
+                args = (local_shard.tensor, *args[1:])
+                local_shards_new.append(
+                    Shard(op(*args, **kwargs), local_shard.metadata)
+                )
+        return ShardedTensor._init_from_local_shards_and_global_metadata(
+            local_shards_new,
+            st_metadata,
+            process_group=pg,
+            init_rrefs=st._init_rrefs,
+            sharding_spec=st.sharding_spec()
+        )
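+
+
+# --- Editorial usage sketch (not part of the upstream module) ----------------
+# A minimal, hedged example of how `_register_sharded_op_on_local_shards`
+# might be used to register an elementwise op so it runs shard-by-shard.
+# The chosen op (torch.nn.functional.gelu) is illustrative only; this helper
+# is never called here, so importing the module has no side effects.
+def _example_register_elementwise_gelu():
+    import torch
+
+    # Each local shard is passed through the real op and the results are
+    # re-wrapped into a new ShardedTensor with the original sharding spec.
+    _register_sharded_op_on_local_shards(torch.nn.functional.gelu)
+
+    # After registration, calling torch.nn.functional.gelu(st) on a
+    # ShardedTensor `st` dispatches through __torch_function__ to the
+    # handler defined above.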
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/binary_cmp.py b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/binary_cmp.py
new file mode 100644
index 0000000000000000000000000000000000000000..9809e70d718399e99fbd20a47779456f4238692f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/binary_cmp.py
@@ -0,0 +1,68 @@
+import torch
+import torch.distributed as dist
+import torch.distributed.distributed_c10d as distributed_c10d
+from torch.distributed._shard.sharded_tensor import (
+    ShardedTensor,
+    _sharded_op_impl
+)
+
+def _communicate_result(result, pg):
+    # Gather results from all ranks.
+    if result:
+        result_tensor = torch.ones(1, device=torch.device(torch.cuda.current_device()))
+    else:
+        result_tensor = torch.zeros(1, device=torch.device(torch.cuda.current_device()))
+
+    dist.all_reduce(result_tensor, group=pg)
+
+    expected_result = torch.ones(1, device=torch.device(torch.cuda.current_device())) * dist.get_world_size(pg)
+
+    return torch.equal(result_tensor, expected_result)
+
+def binary_cmp(cmp_fun, types, args, kwargs=None, process_group=None):
+    if len(args) != 2:
+        raise ValueError(f'Expected two arguments for torch.{cmp_fun.__name__}')
+
+    result = True
+    st1 = args[0]
+    st2 = args[1]
+    if not (isinstance(st1, ShardedTensor) and isinstance(st2, ShardedTensor)):
+        raise TypeError(f'Both arguments to torch.{cmp_fun.__name__} need to be of type ShardedTensor')
+
+    # Verify same PG
+    if st1._process_group != st2._process_group:
+        return False
+
+    if distributed_c10d._rank_not_in_group(st1._process_group) or distributed_c10d._rank_not_in_group(st2._process_group):
+        return distributed_c10d._rank_not_in_group(st1._process_group) == distributed_c10d._rank_not_in_group(st2._process_group)
+
+    # Verify metadata
+    if st1.metadata() != st2.metadata():
+        return _communicate_result(False, st1._process_group)
+
+    # Verify number of local shards
+    st1_local_shards = st1.local_shards()
+    st2_local_shards = st2.local_shards()
+    if len(st1_local_shards) != len(st2_local_shards):
+        return _communicate_result(False, st1._process_group)
+
+    # kwargs must be dict-like
+    if kwargs is None:
+        kwargs = {}
+    # Verify each local shard
+    for idx in range(len(st1_local_shards)):
+        if st1_local_shards[idx].metadata != st2_local_shards[idx].metadata:
+            return _communicate_result(False, st1._process_group)
+        if not cmp_fun(st1_local_shards[idx].tensor, st2_local_shards[idx].tensor, **kwargs):
+            return _communicate_result(False, st1._process_group)
+
+    return _communicate_result(True, st1._process_group)
+
+@_sharded_op_impl(torch.equal)
+def equal(types, args, kwargs, process_group):
+    return binary_cmp(torch.equal, types, args, kwargs, process_group)
+
+@_sharded_op_impl(torch.allclose)
+def allclose(types, args, kwargs, process_group):
+    return binary_cmp(torch.allclose, types, args, kwargs, process_group)
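+
+
+# --- Editorial usage sketch (not part of the upstream module) ----------------
+# Illustrates how the registered comparison ops behave: the call must be made
+# collectively on every rank, and the per-rank result is all-reduced so each
+# rank sees the same boolean. Note that `_communicate_result` above places the
+# reduction tensor on the current CUDA device, so a CUDA-capable setup is
+# assumed. `st_a` and `st_b` are ShardedTensors built elsewhere on an
+# initialized process group; this helper is never called at import time.
+def _example_compare_sharded_tensors(st_a, st_b):
+    # Dispatches to `equal` above; True only if every local shard matches
+    # on every rank.
+    exact_match = torch.equal(st_a, st_b)
+
+    # Keyword arguments are forwarded to the per-shard comparison.
+    approx_match = torch.allclose(st_a, st_b, rtol=1e-5, atol=1e-8)
+    return exact_match, approx_match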
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/init.py b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/init.py
new file mode 100644
index 0000000000000000000000000000000000000000..bcd9bfaa083b4f8dccc0372c4577446bc88ac972
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/init.py
@@ -0,0 +1,143 @@
+import torch
+import torch.distributed._shard.sharded_tensor as sharded_tensor
+from torch.distributed._shard.sharded_tensor import (
+    _sharded_op_impl,
+)
+
+def validate_param(param, param_name):
+    if param is None:
+        raise ValueError(f"param: {param_name} shouldn't be None!")
+
+@_sharded_op_impl(torch.nn.init.uniform_)
+def uniform_(types, args=(), kwargs=None, pg=None):
+    r"""
+    Fills the Tensors in tensor.local_shards with values drawn from the uniform
+    distribution :math:`\mathcal{U}(a, b)`.
+    Args:
+        tensor: tensor sharded across devices
+        a: the lower bound of the uniform distribution
+        b: the upper bound of the uniform distribution
+    """
+    validate_param(kwargs, "kwargs")
+    sharded_tensor = kwargs["tensor"]
+    validate_param(sharded_tensor, "tensor")
+    a = kwargs['a']
+    validate_param(a, "a")
+    b = kwargs['b']
+    validate_param(b, "b")
+
+    for shard in sharded_tensor.local_shards():
+        torch.nn.init.uniform_(shard.tensor, a=a, b=b)
+    return sharded_tensor
+
+@_sharded_op_impl(torch.nn.init.normal_)
+def normal_(types, args=(), kwargs=None, pg=None):
+    r"""
+    Fills the Tensors in tensor.local_shards with values drawn from the normal
+    distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`.
+    Args:
+        tensor: tensor sharded across devices
+        mean: the mean of the normal distribution
+        std: the standard deviation of the normal distribution
+    """
+    validate_param(kwargs, "kwargs")
+    sharded_tensor = kwargs["tensor"]
+    validate_param(sharded_tensor, "tensor")
+    mean = kwargs['mean']
+    validate_param(mean, "mean")
+    std = kwargs['std']
+    validate_param(std, "std")
+
+    for shard in sharded_tensor.local_shards():
+        torch.nn.init.normal_(shard.tensor, mean=mean, std=std)
+    return sharded_tensor
+
+@_sharded_op_impl(torch.nn.init.kaiming_uniform_)
+def kaiming_uniform_(types, args=(), kwargs=None, pg=None):
+    r"""
+    Fills the Tensors in tensor.local_shards with values according to the method
+    described in `Delving deep into rectifiers: Surpassing human-level
+    performance on ImageNet classification` - He, K. et al. (2015), using a
+    uniform distribution. The resulting tensor will have values sampled from
+    :math:`\mathcal{U}(-\text{bound}, \text{bound})` where
+    .. math::
+        \text{bound} = \text{gain} \times \sqrt{\frac{3}{\text{fan\_mode}}}
+    Also known as He initialization.
+    Args:
+        tensor: tensor sharded across devices
+        a: the negative slope of the rectifier used after this layer (only
+            used with ``'leaky_relu'``)
+        mode: either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'``
+            preserves the magnitude of the variance of the weights in the
+            forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the
+            backwards pass.
+        nonlinearity: the non-linear function (`nn.functional` name),
+            recommended to use only with ``'relu'`` or ``'leaky_relu'`` (default).
+    """
+    validate_param(kwargs, "kwargs")
+    sharded_tensor = kwargs["tensor"]
+    validate_param(sharded_tensor, "tensor")
+    a = kwargs['a']
+    validate_param(a, "a")
+    mode = kwargs['mode']
+    validate_param(mode, "mode")
+    nonlinearity = kwargs['nonlinearity']
+    validate_param(nonlinearity, "nonlinearity")
+
+    for shard in sharded_tensor.local_shards():
+        torch.nn.init.kaiming_uniform_(shard.tensor, a=a, mode=mode, nonlinearity=nonlinearity)
+    return sharded_tensor
+
+@_sharded_op_impl(torch.nn.init.constant_)
+def constant_(types, args=(), kwargs=None, pg=None):
+    r"""
+    Fills the input ShardedTensor with the value ``val``.
+    Args:
+        tensor: tensor sharded across devices
+        val: the value to fill the tensor with
+    """
+    validate_param(kwargs, "kwargs")
+    sharded_tensor = kwargs["tensor"]
+    validate_param(sharded_tensor, "tensor")
+    val = kwargs['val']
+    validate_param(val, "val")
+    for shard in sharded_tensor.local_shards():
+        torch.nn.init.constant_(shard.tensor, val=val)
+    return sharded_tensor
+
+tensor_like_creation_op_map = {
+    torch.full_like: sharded_tensor.full,
+    torch.empty_like: sharded_tensor.empty,
+    torch.zeros_like: sharded_tensor.zeros,
+    torch.ones_like: sharded_tensor.ones,
+    torch.rand_like: sharded_tensor.rand,
+    torch.randn_like: sharded_tensor.randn,
+}
+
+# Tensor creation ops that behave the same as on a regular dense tensor
+def register_tensor_creation_op(op):
+    @_sharded_op_impl(op)
+    def tensor_creation_op(types, args=(), kwargs=None, pg=None):
+        """
+        Handles ``__torch_function__`` dispatch for tensor creation ops that
+        takes a ShardedTensor as argument, such as ``torch.zeros_like`` or
+        ``torch.full_like``.
+        """
+        creation_op = tensor_like_creation_op_map.get(op, None)
+        if creation_op is None:
+            raise RuntimeError(f"Tensor creation {op} not supported!")
+        if kwargs is None:
+            kwargs = {}
+
+        st = args[0]
+
+        new_st = creation_op(st.sharding_spec(), st.size(), *args[1:], **kwargs)  # type: ignore[operator]
+        return new_st
+
+
+register_tensor_creation_op(torch.full_like)
+register_tensor_creation_op(torch.empty_like)
+register_tensor_creation_op(torch.zeros_like)
+register_tensor_creation_op(torch.ones_like)
+register_tensor_creation_op(torch.rand_like)
+register_tensor_creation_op(torch.randn_like)
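+
+
+# --- Editorial usage sketch (not part of the upstream module) ----------------
+# The init handlers above read their inputs from ``kwargs``, which matches how
+# torch.nn.init forwards its arguments through __torch_function__. `st` is
+# assumed to be a ShardedTensor created elsewhere on an initialized process
+# group; this helper is never called at import time.
+def _example_init_and_create_like(st):
+    # In-place init ops run independently on each local shard.
+    torch.nn.init.uniform_(tensor=st, a=0.0, b=1.0)
+    torch.nn.init.constant_(tensor=st, val=0.5)
+
+    # Creation ops such as torch.zeros_like are remapped to their
+    # sharded_tensor counterparts (see tensor_like_creation_op_map) and
+    # return a new ShardedTensor with the same sharding spec and size.
+    return torch.zeros_like(st)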
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/misc_ops.py b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/misc_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..409e6495e803eba46b045f1cdf9a395fac858085
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/misc_ops.py
@@ -0,0 +1,12 @@
+import torch
+from torch.distributed._shard.sharded_tensor import (
+    _sharded_op_impl,
+)
+
+# This is used by `_apply()` within module.py to set new
+# parameters after applying a certain method; we should follow
+# the future behavior of overwriting the existing tensor
+# instead of doing an in-place change using `.data = `.
+@_sharded_op_impl(torch._has_compatible_shallow_copy_type)
+def tensor_has_compatible_shallow_copy_type(types, args=(), kwargs=None, pg=None):
+    return False
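+
+
+# --- Editorial usage sketch (not part of the upstream module) ----------------
+# Returning False above makes `torch.nn.Module._apply` replace a sharded
+# parameter object outright rather than assigning into `param.data`, which
+# would not be meaningful for a wrapper tensor. Illustrative only; never
+# called at import time.
+def _example_shallow_copy_check(st, plain_tensor):
+    # Dispatches to `tensor_has_compatible_shallow_copy_type` and returns False.
+    return torch._has_compatible_shallow_copy_type(st, plain_tensor)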
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/tensor_ops.py b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/tensor_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a6b7dfdba1f8b85caf7007ac0ce5ffc3f6e8922
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/_ops/tensor_ops.py
@@ -0,0 +1,215 @@
+import copy
+import torch
+from torch.distributed._shard.sharded_tensor import (
+    _sharded_op_impl,
+    Shard,
+    ShardedTensor,
+)
+from ._common import (
+    _register_sharded_op_on_local_shards,
+)
+from torch.distributed._shard.common_op_utils import _register_default_op
+
+
+# Tensor properties access
+_register_default_op(torch.Tensor.shape.__get__, _sharded_op_impl)  # type: ignore[attr-defined]
+_register_default_op(torch.Tensor.dtype.__get__, _sharded_op_impl)  # type: ignore[attr-defined]
+_register_default_op(torch.Tensor.layout.__get__, _sharded_op_impl)  # type: ignore[attr-defined]
+_register_default_op(torch.Tensor.size, _sharded_op_impl)
+_register_default_op(torch.Tensor.dim, _sharded_op_impl)
+_register_default_op(torch.Tensor.ndim.__get__, _sharded_op_impl)  # type: ignore[attr-defined]
+_register_default_op(torch.Tensor.is_contiguous, _sharded_op_impl)
+_register_default_op(torch.Tensor.contiguous, _sharded_op_impl)
+_register_default_op(torch.Tensor.is_floating_point, _sharded_op_impl)
+
+# __reduce_ex__ to dispatch to get_state/set_state
+_register_default_op(torch.Tensor.__reduce_ex__, _sharded_op_impl)
+
+# autograd related properties
+_register_default_op(torch.Tensor.requires_grad.__get__, _sharded_op_impl)  # type: ignore[attr-defined]
+# TODO: set grad with a ShardedTensor that consists of all local grads
+_register_default_op(torch.Tensor.grad.__get__, _sharded_op_impl)  # type: ignore[union-attr]
+_register_default_op(torch.Tensor.grad_fn.__get__, _sharded_op_impl)  # type: ignore[union-attr]
+_register_default_op(torch.Tensor.is_leaf.__get__, _sharded_op_impl)  # type: ignore[attr-defined]
+
+# The device property is ambiguous: from a global perspective,
+# ShardedTensor.device spans multiple devices (possibly even across hosts).
+# We choose to return the device of the local tensor to represent
+# the device property on each rank.
+@_sharded_op_impl(torch.Tensor.device.__get__)
+def tensor_device(types, args=(), kwargs=None, pg=None):
+    self_st = args[0]
+    # Validate types
+    if not isinstance(self_st, ShardedTensor):
+        raise TypeError("input needs to be a ShardedTensor")
+    dev: torch.device
+    if self_st._local_shards:
+        dev = self_st._local_shards[0].tensor.device
+    elif pg and pg._get_backend_name() == "gloo":
+        dev = torch.device("cpu")
+    else:
+        dev = torch.device(torch.cuda.current_device())
+    return dev
+
+@_sharded_op_impl(torch.Tensor.is_meta.__get__)  # type: ignore[attr-defined]
+def st_is_meta(types, args=(), kwargs=None, pg=None):
+    return args[0].local_tensor().is_meta
+
+
+def sharded_type_as_check(*args, **kwargs):
+    """
+    Perform extra checks for the sharded_type_as op, such as verifying that the
+    input is either a Tensor or a ShardedTensor.
+
+    Args: same as ``torch.Tensor.type_as``.
+
+    Return: None
+    """
+    if len(args) < 2:
+        raise ValueError("Needs to give a tensor to cast type as!")
+    if not isinstance(args[1], torch.Tensor) and not isinstance(args[1], ShardedTensor):
+        raise ValueError("Needs to give a Tensor or ShardedTensor to cast type as!")
+
+
+def same_dtype(*args, **kwargs):
+    """
+    When the dtype is the same, return the original ShardedTensor.
+
+    Args: same as ``torch.Tensor.type_as``.
+
+    Return (bool): Whether to return early or not.
+    """
+    return args[0].dtype == args[1].dtype
+
+
+def sharded_type_as(args, kwargs, pg):
+    """
+    Handles ``__torch_function__`` dispatch for the ``torch.Tensor.type_as`` op.
+
+    Args: same as ``torch.Tensor.type_as``.
+
+    Return:
+        new_local_shards (List[Shard]): Local shards for the new sharded tensor.
+        st_meta (ShardedTensorMetadata): Metadata of the new sharded tensor.
+    """
+    st = args[0]
+    tensor = args[1]
+    if isinstance(tensor, ShardedTensor):
+        tensor = tensor.local_tensor()
+    new_local_shards = []
+    for shard in st.local_shards():
+        new_local_shards.append(Shard(shard.tensor.type_as(tensor), shard.metadata))
+    st_meta = copy.deepcopy(st._metadata)
+    st_meta.tensor_properties.dtype = tensor.dtype
+    return new_local_shards, st_meta
+
+
+_register_sharded_op_on_local_shards(
+    torch.Tensor.type_as,
+    early_stop_func=same_dtype,
+    extra_check=sharded_type_as_check,
+    customized_func=sharded_type_as,
+)
+
+
+def sharded_deepcopy(args, kwargs, pg):
+    # NOTE: we implement the deepcopy magic method directly
+    # instead of relying on the default tensor.__deepcopy__
+    # plus clone(). This is because the default tensor
+    # deepcopy copies every attribute, but the process_group
+    # in ShardedTensor cannot be deep copied.
+    self_st = args[0]
+    new_local_shards = copy.deepcopy(self_st.local_shards())
+    new_metadata = copy.deepcopy(self_st.metadata())
+    return new_local_shards, new_metadata
+
+
+_register_sharded_op_on_local_shards(
+    torch.Tensor.__deepcopy__,
+    customized_func=sharded_deepcopy,
+)
+
+
+@_sharded_op_impl(torch.Tensor.copy_)
+def sharded_inplace_copy(types, args, kwargs, pg):
+    # NOTE: in-place ops don't need to be re-wrapped
+    kwargs = {} if kwargs is None else kwargs
+    self_st = args[0]
+    new_st = args[1]
+    nonblocking = kwargs.get("non_blocking", False)
+    for local_shard, new_shard in zip(self_st.local_shards(), new_st.local_shards()):
+        if local_shard.metadata != new_shard.metadata:
+            raise RuntimeError(
+                "inplace copy can only happen between two ShardedTensor with same metadata!"
+            )
+    for local_shard, new_shard in zip(self_st.local_shards(), new_st.local_shards()):
+        local_shard.tensor.copy_(new_shard.tensor, nonblocking)
+
+    return self_st
+
+
+def sharded_clone(args, kwargs, pg):
+    self_st = args[0]
+    desire_memory_format = kwargs.get("memory_format", None)
+    if desire_memory_format and desire_memory_format != torch.preserve_format:
+        raise RuntimeError("Only support torch.preserve_format for ShardedTensor!")
+    cloned_local_shards = [
+        Shard(
+            local_shard.tensor.clone(memory_format=desire_memory_format),
+            metadata=copy.deepcopy(local_shard.metadata),
+        )
+        for local_shard in self_st.local_shards()
+    ]
+    new_metadata = copy.deepcopy(self_st.metadata())
+    return cloned_local_shards, new_metadata
+
+
+_register_sharded_op_on_local_shards(
+    torch.Tensor.clone,
+    customized_func=sharded_clone,
+)
+
+
+def sharded_detach(args, kwargs, pg):
+    self_st = args[0]
+    detached_local_shards = [
+        Shard(
+            local_shard.tensor.detach(),
+            metadata=copy.deepcopy(local_shard.metadata),
+        )
+        for local_shard in self_st.local_shards()
+    ]
+    new_metadata = copy.deepcopy(self_st.metadata())
+    new_metadata.tensor_properties.requires_grad = False
+    return detached_local_shards, new_metadata
+
+
+_register_sharded_op_on_local_shards(
+    torch.Tensor.detach,
+    customized_func=sharded_detach,
+)
+
+
+@_sharded_op_impl(torch.Tensor.requires_grad_)
+def tensor_requires_grad_set(types, args=(), kwargs=None, pg=None):
+    self_st = args[0]
+    # Validate types
+    if not isinstance(self_st, ShardedTensor):
+        raise TypeError("input needs to be a ShardedTensor")
+
+    if kwargs is None:
+        kwargs = {}
+
+    requires_grad = args[1] if len(args) > 1 else kwargs.get("requires_grad", True)
+    if requires_grad == self_st.requires_grad:
+        return self_st
+
+    for local_shard in self_st.local_shards():
+        local_shard.tensor.requires_grad_(requires_grad)
+
+    # update the wrapper class property
+    with torch._C.DisableTorchFunctionSubclass():
+        self_st.requires_grad_(requires_grad)
+    # also update the metadata
+    self_st._metadata.tensor_properties.requires_grad = requires_grad
+    return self_st
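+
+
+# --- Editorial usage sketch (not part of the upstream module) ----------------
+# Ties together a few of the registrations above. `st` is assumed to be a
+# ShardedTensor built elsewhere on an initialized process group; this helper
+# is never called at import time.
+def _example_tensor_ops(st):
+    # Property access goes through the default handlers registered at the top.
+    shape, ndim = st.shape, st.ndim
+
+    # clone/detach produce new ShardedTensors via the customized funcs above.
+    st_clone = st.clone()
+    st_detached = st.detach()
+
+    # type_as returns `st` unchanged when the dtypes already match (the
+    # early-stop path); otherwise each local shard is cast.
+    st_cast = st.type_as(torch.empty(1, dtype=torch.float64, device=st.device))
+
+    # requires_grad_ updates the local shards, the wrapper, and the metadata.
+    st.requires_grad_(True)
+    return shape, ndim, st_clone, st_detached, st_cast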
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/api.py b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..a072dade94e429c2b96279d8e2c7a73015cf5e5c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/api.py
@@ -0,0 +1,1253 @@
+from __future__ import annotations  # type: ignore[attr-defined]
+from dataclasses import dataclass
+from typing import (
+    Callable,
+    Dict,
+    List,
+    Optional,
+    Sequence,
+    Tuple,
+    cast,
+)
+import copy
+import warnings
+from functools import reduce
+import weakref
+
+import threading
+import torch
+import torch.distributed as dist
+from torch.distributed import rpc
+from torch.distributed import distributed_c10d
+from torch.distributed._shard.metadata import ShardMetadata
+import torch.distributed._shard.sharding_spec as shard_spec
+from torch.distributed._shard.sharding_spec.api import (
+    _dispatch_custom_op,
+    _has_custom_op,
+)
+from torch.distributed._shard.sharding_spec._internals import (
+    check_tensor,
+    validate_non_overlapping_shards_metadata,
+)
+from torch.distributed._shard._utils import (
+    DEPRECATE_MSG,
+)
+
+from .metadata import TensorProperties, ShardedTensorMetadata
+from .shard import Shard
+from .reshard import reshuffle_local_shard, reshard_local_shard
+from .utils import (
+    _flatten_tensor_size,
+    _parse_and_validate_remote_device,
+    _validate_output_tensor_for_gather,
+    build_metadata_from_local_shards,
+    build_global_metadata
+)
+from torch.distributed.remote_device import _remote_device
+from torch.utils import _pytree as pytree
+import operator
+
+# Tracking for sharded tensor objects.
+_sharded_tensor_lock = threading.Lock()
+_sharded_tensor_current_id = 0
+_sharded_tensor_map: Dict[int, weakref.ReferenceType[ShardedTensor]] = {}
+
+# Default sharded ops
+_SHARDED_OPS: Dict[Callable, Callable] = {}
+
+# Customized user ops
+_CUSTOM_SHARDED_OPS: Dict[Callable, Callable] = {}
+
+def _register_remote_shards(sharded_tensor_id: int, rrefs: List[rpc.RRef[Shard]], rpc_rank: int):
+    with _sharded_tensor_lock:
+        if sharded_tensor_id not in _sharded_tensor_map:
+            raise RuntimeError(
+                f'Could not find sharded_tensor_id: {sharded_tensor_id} in map: {_sharded_tensor_map.keys()}')
+
+        sharded_tensor = _sharded_tensor_map[sharded_tensor_id]()
+        if sharded_tensor is None:
+            raise RuntimeError('ShardedTensor weakref has been deallocated')
+        else:
+            sharded_tensor._register_remote_shards(rrefs, rpc_rank)
+
+class ShardedTensorBase(torch.Tensor):
+    _sharding_spec: shard_spec.ShardingSpec
+    _metadata: ShardedTensorMetadata
+    _local_shards: List[Shard]
+
+    def __new__(cls, sharding_spec: shard_spec.ShardingSpec, *size, **kwargs):
+        # Use __new__ to construct a wrapper tensor, for recording tensor
+        # properties and logging purposes.
+        torch._C._log_api_usage_once("torch.distributed._shard.sharded_tensor")
+
+        # check sharding spec and build sharded tensor metadata
+        if not isinstance(sharding_spec, shard_spec.ShardingSpec):
+            raise ValueError(f"Expecting ShardingSpec but got: {type(sharding_spec)}")
+
+        sizes = _flatten_tensor_size(size)
+        dtype = kwargs["dtype"]
+        layout = kwargs["layout"]
+        pin_memory = kwargs["pin_memory"]
+        requires_grad = kwargs["requires_grad"]
+
+        if dtype is None:
+            dtype = torch.get_default_dtype()
+
+        tensor_properties = TensorProperties(
+            dtype, layout, requires_grad, pin_memory=pin_memory
+        )
+        sharded_tensor_metadata = sharding_spec.build_metadata(
+            sizes, tensor_properties=tensor_properties
+        )
+
+        r = torch.Tensor._make_wrapper_subclass(  # type: ignore[attr-defined]
+            cls,
+            sizes,
+            dtype=dtype,
+            layout=layout,
+            pin_memory=pin_memory,
+            requires_grad=requires_grad,
+        )
+        # set sharding spec
+        r._sharding_spec = sharding_spec
+        # set metadata
+        r._metadata = sharded_tensor_metadata
+        # set local shards
+        r._local_shards = []
+        return r
+
+    def metadata(self) -> ShardedTensorMetadata:
+        """
+        Returns a :class:`ShardedTensorMetadata` object corresponding to the
+        metadata for the entire tensor.
+        """
+        return self._metadata
+
+    def local_shards(self) -> List[Shard]:
+        """
+        Returns a list of :class:`Shard` corresponding to the
+        local shards for this rank. Returns an empty list if the current rank
+        does not host any shards for this Tensor.
+        """
+        return self._local_shards
+
+    @classmethod
+    def _init_from_local_shards_and_global_metadata(
+        cls,
+        local_shards: List[Shard],
+        sharded_tensor_metadata: ShardedTensorMetadata,
+        sharding_spec=None,
+    ) -> ShardedTensorBase:
+        """
+        Initialize a ShardedTensorBase with local shards and a global
+        ShardedTensorMetadata built on each rank.
+        Warning: This API is experimental and subject to change. It does
+                 not do cross-rank validation and fully relies on the user
+                 for the correctness of sharded_tensor_metadata on each rank.
+        """
+        shards_metadata = sharded_tensor_metadata.shards_metadata
+        tensor_properties = sharded_tensor_metadata.tensor_properties
+
+        if len(shards_metadata) == 0:
+            raise ValueError("shards_metadata must not be empty!")
+
+        if tensor_properties.layout != torch.strided:
+            raise ValueError("Only torch.strided layout is currently supported")
+
+        if sharding_spec is None:
+            spec = shard_spec._infer_sharding_spec_from_shards_metadata(shards_metadata)
+        else:
+            spec = sharding_spec
+
+        sharded_tensor_base = ShardedTensorBase.__new__(
+            ShardedTensor,
+            spec,
+            sharded_tensor_metadata.size,
+            dtype=tensor_properties.dtype,
+            layout=tensor_properties.layout,
+            pin_memory=tensor_properties.pin_memory,
+            requires_grad=tensor_properties.requires_grad,
+        )
+
+        # check if shards_metadata have overlap shards
+        validate_non_overlapping_shards_metadata(shards_metadata)
+
+        # check if the shards_metadata is compatible with overall size of the sharded tensor.
+        check_tensor(shards_metadata, list(sharded_tensor_metadata.size))
+
+        # done validation, add local_shards
+        sharded_tensor_base._local_shards = local_shards
+        return sharded_tensor_base
+
+    @classmethod
+    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
+        raise RuntimeError(
+            f"A {cls.__name__} object is being used from c++ while calling {func.__module__}.{func.__name__} "
+            "but the there is no custom __torch_dispatch__ implementation for it."
+        )
+
+class ShardedTensor(ShardedTensorBase):
+    """
+    ShardedTensor is a torch.Tensor subclass representing Tensors that are sharded
+    across multiple devices and multiple processes.
+
+    ShardedTensor is initialized in an SPMD-like fashion where each rank
+    initializes the ShardedTensor. The ShardedTensor object on each rank
+    then only stores the local shard for the Tensor and provides global
+    metadata for all the shards.
+
+    ShardedTensor doesn't provide any Tensor-like operations itself but is a
+    wrapper exposing the Tensor representing the local shard and the global
+    metadata. Using these, users can build their custom distributed._sharded
+    computations on top of this primitive. The local shards are all initialized
+    using the create_op specified by tensor_init_params.create_op, e.g.
+    torch.ones or torch.empty.
+
+    Args:
+        sharding_spec (:class:`torch.distributed._shard.sharding_spec.ShardingSpec`): The specification
+            describing how to shard the Tensor.
+        size (int...): a sequence of integers defining the shape of the output
+            tensor. Can be a variable number of arguments or a collection like a list or tuple.
+
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor.
+                Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned Tensor.
+            Default: ``torch.strided``.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned tensor. Default: ``False``.
+        pin_memory (bool, optional): If set, returned tensor would be allocated in
+            the pinned memory. Works only for CPU tensors. Default: ``False``.
+        memory_format (:class:`torch.memory_format`, optional): the desired memory format of
+            returned Tensor. Default: ``torch.contiguous_format``.
+        init_rrefs (bool, optional): Whether or not to initialize
+            :class:`torch.distributed.rpc.RRef`s pointing to remote shards.
+            Need to initialize the RPC Framework if specified as ``True``.
+            Default: ``False``.
+
+    .. note:: ShardedTensor uses collectives to do various operations, e.g. it
+        uses all_gather to do cross-rank validation. For NCCL-based process
+        groups, internal tensor representations of objects must be moved to the
+        GPU device before communication takes place. In this case, the device
+        used is given by ``torch.cuda.current_device()`` and it is the user's
+        responsibility to ensure that this is set so that each rank has an
+        individual GPU, via ``torch.cuda.set_device()``.
+
+    """
+    def __new__(cls, sharding_spec: shard_spec.ShardingSpec, *size, **kwargs):
+        self = super().__new__(cls, sharding_spec, *size, **kwargs)
+        return self
+
+    def __init__(
+        self,
+        sharding_spec: shard_spec.ShardingSpec,
+        *size,
+        dtype=None,
+        layout=torch.strided,
+        requires_grad=False,
+        pin_memory=False,
+        memory_format=torch.contiguous_format,
+        process_group=None,
+        init_rrefs=False,
+    ):
+        # prepare initialization, initialize fields like
+        # _process_group, _local_shards, etc.
+        self._prepare_init(process_group=process_group, init_rrefs=init_rrefs)
+
+        if layout != torch.strided:
+            raise ValueError('Only torch.strided layout is currently supported')
+
+        if memory_format != torch.contiguous_format:
+            raise ValueError('Only torch.contiguous_format memory_format is currently supported')
+
+        self._metadata.tensor_properties.memory_format = memory_format
+
+        current_rank = dist.get_rank(self._process_group)
+
+        for shard_metadata in self._metadata.shards_metadata:
+            rank, device = _parse_and_validate_remote_device(self._process_group, shard_metadata.placement)
+            if rank == current_rank:
+                local_tensor = _create_tensor_from_params(
+                    shard_metadata.shard_sizes,
+                    local_device=device,
+                    tensor_properties=self._metadata.tensor_properties
+                )
+                self._local_shards.append(Shard(local_tensor, shard_metadata))
+
+        # do post initialization (i.e. register sharded_tensor_id, initialize_rpc)
+        self._post_init()
+
+    def _prepare_init(self, process_group=None, init_rrefs=False):
+        self._init_rrefs = init_rrefs
+        self._sharded_tensor_id = None
+
+        self._process_group = (
+            process_group
+            if process_group is not None
+            else distributed_c10d._get_default_group()
+        )
+
+        self._remote_shards: Dict[int, List[rpc.RRef[Shard]]] = {}
+
+    def _post_init(self):
+        # Initialize RPC if available.
+        if self._init_rrefs:
+            with _sharded_tensor_lock:
+                global _sharded_tensor_current_id, _sharded_tensor_map
+                self._sharded_tensor_id = _sharded_tensor_current_id
+                _sharded_tensor_map[self._sharded_tensor_id] = weakref.ref(self)
+                _sharded_tensor_current_id += 1
+
+            if not rpc._is_current_rpc_agent_set():
+                raise RuntimeError(
+                    'RPC Framework needs to be initialized using'
+                    ' torch.distributed.rpc.init_rpc if init_rrefs is set to True')
+            self._init_rpc()
+
+    def __del__(self):
+        # Clean up the global map.
+        with _sharded_tensor_lock:
+            global _sharded_tensor_current_id, _sharded_tensor_map
+            if (
+                hasattr(self, "_sharded_tensor_id")
+                and self._sharded_tensor_id in _sharded_tensor_map
+            ):
+                _sharded_tensor_map.pop(self._sharded_tensor_id)  # type: ignore[call-overload]
+
+    def _init_rpc(self):
+        # Validate PG and RPC ranks match.
+        pg_rank = dist.get_rank()
+        rpc_rank = rpc.get_worker_info().id
+        if pg_rank != rpc_rank:
+            raise ValueError(
+                f'Default ProcessGroup and RPC ranks must be '
+                f'the same for ShardedTensor, found process group rank: '
+                f'{pg_rank} and RPC rank: {rpc_rank}'
+            )
+
+        self._remote_shards = {}
+
+        # Gather all the sharded tensor ids.
+        worker_infos = rpc._get_current_rpc_agent().get_worker_infos()
+        rank_to_name = {}
+        name_to_rank = {}
+
+        for worker_info in worker_infos:
+            rank_to_name[worker_info.id] = worker_info.name
+            name_to_rank[worker_info.name] = worker_info.id
+
+        all_tensor_ids = rpc.api._all_gather(self._sharded_tensor_id)
+
+        # Share the local shards to the entire world.
+        futs = []
+        rpc_rank = rpc.get_worker_info().id
+        for rank in range(dist.get_world_size()):
+            # Skip self.
+            if rank == dist.get_rank():
+                continue
+
+            if len(self.local_shards()) != 0:
+                rrefs: List[rpc.RRef[Shard]] = [rpc.RRef(shard) for shard in self.local_shards()]
+                fut = rpc.rpc_async(
+                    rank,
+                    _register_remote_shards,
+                    args=(all_tensor_ids[rank_to_name[rank]], rrefs, rpc_rank))
+                futs.append(fut)
+
+        torch.futures.wait_all(futs)
+
+        # Barrier for all RPCs to finish on all ranks.
+        rpc.api._all_gather(None)
+
+    def _get_preferred_device(self) -> torch.device:
+        """
+        Return the preferred device to be used when creating tensors for collectives.
+        This method takes into account the associated process group.
+        """
+        if dist.get_backend(self._process_group) == dist.Backend.NCCL:
+            return torch.device(torch.cuda.current_device())
+        return torch.device("cpu")
+
+    def gather(  # type: ignore[override]
+        self,
+        dst: int = 0,
+        out: Optional[torch.Tensor] = None,
+        enforce_dtype: bool = False,
+        dtype: Optional[torch.dtype] = None,
+    ) -> None:
+        """
+        Creates a full :class:`Tensor` on rank ``dst`` by gathering all shards of the
+        sharded tensor.
+
+        The API needs to be called on all ranks in SPMD fashion. All ranks should have
+        the same ``dst``. ``out`` should be a tensor of the same size as the overall
+        size of the sharded tensor on ``dst`` and ``None`` on all other ranks.
+
+        Args:
+            dst(int): The rank where full tensor is constructed.
+                Default: 0
+            out (:class:`torch.Tensor`, optional): The output full tensor.
+                Must be provided ONLY on the ``dst`` rank.
+                Default: ``None``
+            enforce_dtype (bool): Deprecated, please use dtype instead.  Force the
+                gathered tensors to be the same type as input and output.
+            dtype (torch.dtype): Force the gathered tensors to be this dtype.
+                Default: ``None``
+        """
+        def shard_size(shard_md):
+            return reduce(operator.mul, shard_md.shard_sizes)  # type: ignore[attr-defined]
+
+        if enforce_dtype:
+            warnings.warn("enforce_dtype is deprecated.  Please use dtype instead.")
+
+        rank = dist.get_rank(self._process_group)
+        full_size = self.metadata().size
+        _validate_output_tensor_for_gather(rank, dst, full_size, out)
+
+        local_shards = self.local_shards()
+        world_size = dist.get_world_size(self._process_group)
+        rank_sizes = [0 for _ in range(world_size)]
+        max_rank_size = 0
+        shard_placement: Dict[ShardMetadata, Tuple[int, int]] = {}
+        # collect sizes
+        for shard_md in self.metadata().shards_metadata:
+            shard_rank = cast(_remote_device, shard_md.placement).rank()
+            assert shard_rank is not None
+
+            shard_placement[shard_md] = (shard_rank, rank_sizes[shard_rank])
+            rank_sizes[shard_rank] += shard_size(shard_md)
+            max_rank_size = max(max_rank_size, rank_sizes[shard_rank])
+
+        gather_list: Optional[List[torch.Tensor]]
+        if rank == dst:
+            assert out is not None
+            if enforce_dtype:
+                # enforce_dtype is deprecated.  Do it for backward compatibility.
+                dtype = out.dtype
+            # TODO make it as a view of out tensor
+            gather_list = [torch.empty((max_rank_size,), device=out.device, dtype=dtype) for _ in range(world_size)]
+        else:
+            gather_list = None
+
+        with torch.no_grad():
+            if enforce_dtype and len(local_shards) > 0:
+                # enforce_dtype is deprecated.  Do it for backward compatibility.
+                dtype = local_shards[0].tensor.dtype
+            data = torch.empty(max_rank_size, device=self._get_preferred_device(), dtype=dtype)
+
+            for shard in local_shards:
+                src = shard.tensor.flatten()
+                if src.nelement() == 0:
+                    warnings.warn("Gathering a tensor with zero elements on rank " + str(rank))
+                    return
+                shard_offset = shard_placement[shard.metadata][1]
+                data[shard_offset: shard_offset + src.numel()].copy_(src)
+
+        dist.gather(
+            tensor=data,
+            gather_list=gather_list,
+            dst=dst,
+            group=self._process_group,
+        )
+        if rank != dst:
+            return
+        # In _validate_output_tensor_for_gather, we raise if out == None and rank == dst
+        out = cast(torch.Tensor, out)
+        assert gather_list is not None
+
+        full_size = self.metadata().size
+        dims = len(full_size)
+        for shard_md in self.metadata().shards_metadata:
+            rank, rank_offset = shard_placement[shard_md]
+            tensor = gather_list[rank]
+            tensor = tensor[rank_offset : rank_offset + shard_size(shard_md)]
+            tensor = tensor.view(shard_md.shard_sizes)
+
+            out_narrow_view = out
+            for dim in range(dims):
+                out_narrow_view = out_narrow_view.narrow(
+                    dim,
+                    shard_md.shard_offsets[dim],
+                    shard_md.shard_sizes[dim],
+                )
+
+            out_narrow_view.copy_(tensor)
+
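+    # --- Editorial usage sketch (not part of the upstream class) -------------
+    # gather() is a collective: every rank calls it with the same ``dst``, and
+    # only ``dst`` allocates the output. A hedged sketch, assuming an
+    # initialized process group and a ShardedTensor ``st``:
+    #
+    #     out = torch.empty(st.size()) if dist.get_rank() == 0 else None
+    #     st.gather(dst=0, out=out)  # rank 0 now holds the full tensor
+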
+    def cpu(
+        self,
+        memory_format=torch.preserve_format,
+        process_group=None
+    ) -> ShardedTensor:
+        """
+        Returns a copy of this object in CPU memory.
+
+        If this ShardedTensor is already on CPU memory, then no copy is
+        performed and original object is returned.
+
+        .. note:: When moving a ShardedTensor from GPU to CPU, the ShardedTensor might
+            need to be managed by a different type of ProcessGroup (i.e. ProcessGroupGloo);
+            it is the user's responsibility to explicitly pass in a new process_group that
+            is compatible with CPU.
+        """
+        # TODO: make this a __torch_function__ op once ShardedTensor becomes a
+        # torch.Tensor subclass, see https://github.com/pytorch/pytorch/issues/75402
+        if memory_format != torch.preserve_format and \
+                memory_format != torch.contiguous_format:
+            raise RuntimeError("Only `torch.contiguous_format` or "
+                               "`torch.preserve_format` is supported!")
+        all_on_cpu = True
+        for meta in self.metadata().shards_metadata:
+            all_on_cpu &= (meta.placement.device().type == "cpu")  # type: ignore[union-attr]
+
+        # if every shard is already on CPU, return the original object
+        if all_on_cpu:
+            return self
+
+        # if not, returns a copy of this object on CPU
+        list_shards: List[Shard] = []
+        # move all local shards to cpu, and change metadata
+        for shard in self._local_shards:
+            cpu_tensor = shard.tensor.cpu(memory_format=memory_format)  # type: ignore[call-arg]
+            metadata = copy.deepcopy(shard.metadata)
+            metadata.placement._device = torch.device("cpu")  # type: ignore[union-attr]
+            list_shards.append(
+                Shard(cpu_tensor, metadata)
+            )
+
+        st_meta = copy.deepcopy(self.metadata())
+        for meta in st_meta.shards_metadata:
+            if meta.placement.device().type != "cpu":  # type: ignore[union-attr]
+                meta.placement._device = torch.device("cpu")  # type: ignore[union-attr]
+
+        pg = self._process_group if process_group is None else process_group
+        st_cpu = ShardedTensor._init_from_local_shards_and_global_metadata(
+            list_shards,
+            sharded_tensor_metadata=st_meta,
+            process_group=pg,
+            init_rrefs=self._init_rrefs
+        )
+        return st_cpu
+
+    def cuda(
+        self,
+        device=None,
+        non_blocking=False,
+        memory_format=torch.preserve_format,
+        process_group=None
+    ) -> ShardedTensor:
+        """
+        Returns a copy of this object in CUDA memory. If the original ShardedTensor
+        is on CPU, we move the local shard to the current GPU device of each
+        process in an SPMD fashion.
+        If this ShardedTensor is already in CUDA memory and the local shards on each rank
+        are already on the current device, we still return a new ShardedTensor object with
+        new metadata, but no underlying data movement is performed.
+        .. note:: When moving a ShardedTensor from CPU to GPU, the ShardedTensor might
+            need to be managed by a different type of ProcessGroup (i.e. ProcessGroupNCCL);
+            it is the user's responsibility to explicitly pass in a new process_group that
+            is compatible with GPU.
+        """
+        if memory_format != torch.preserve_format and \
+                memory_format != torch.contiguous_format:
+            raise RuntimeError("Only `torch.contiguous_format` or "
+                               "`torch.preserve_format` is supported!")
+
+        if device is not None:
+            device = torch.device(device) if isinstance(device, str) else device
+            assert isinstance(device, torch.device) and device.index == torch.cuda.current_device(), \
+                '''Only device without device id (e.g. "cpu" or "cuda") is expected for ShardedTensor!'''
+
+        current_device = torch.device(torch.cuda.current_device())
+        # returns a copy of ShardedTensor on CUDA current device
+        list_shards: List[Shard] = []
+        # move all local shards to current device, and change metadata
+        # if local shards already on the current device, there's no
+        # real data movement, only the metadata are copied.
+        for shard in self._local_shards:
+            cuda_tensor = shard.tensor.cuda(
+                device=current_device,
+                non_blocking=non_blocking,
+                memory_format=memory_format
+            )  # type: ignore[call-arg]
+            metadata = copy.deepcopy(shard.metadata)
+            metadata.placement._device = current_device  # type: ignore[union-attr]
+
+            list_shards.append(
+                Shard(cuda_tensor, metadata)
+            )
+
+        st_meta = copy.deepcopy(self.metadata())
+        for meta in st_meta.shards_metadata:
+            if meta.placement.device().type != "cuda":  # type: ignore[union-attr]
+                meta.placement._device = current_device  # type: ignore[union-attr]
+
+        pg = self._process_group if process_group is None else process_group
+        # we need to use `init_from_local_shards` to communicate between ranks
+        # and update the sharding spec/shards metadata.
+        st_cuda = ShardedTensor._init_from_local_shards_and_global_metadata(
+            list_shards,
+            sharded_tensor_metadata=st_meta,
+            process_group=pg,
+            init_rrefs=self._init_rrefs
+        )
+        return st_cuda
+
+    def to(self, *args, **kwargs) -> ShardedTensor:
+        current_device: torch.device
+        if self._local_shards:
+            current_device = self._local_shards[0].tensor.device
+        elif self._process_group._get_backend_name() == "gloo":
+            current_device = torch.device("cpu")
+        else:
+            current_device = torch.device(torch.cuda.current_device())
+        current_dtype = self.dtype
+        device_to = current_device
+        dtype_to = current_dtype
+        if len(args) == 1:
+            if isinstance(args[0], torch.dtype):
+                dtype_to = args[0]
+            elif isinstance(args[0], torch.device):
+                device_to = args[0]
+            elif isinstance(args[0], (str, int)):
+                device_to = torch.device(args[0])
+            elif isinstance(args[0], torch.Tensor):
+                dtype_to = args[0].dtype
+                device_to = args[0].device
+            else:
+                raise RuntimeError(f"ShardedTensor.to() have wrong arguments: {args}")
+        elif len(args) == 2:
+            device_to, dtype_to = args
+        else:
+            dtype_to = kwargs.get("dtype", current_dtype)
+            device_to = kwargs.get("device", current_device)
+
+        device_to = torch.device(device_to) if isinstance(device_to, (str, int)) else device_to
+
+        if device_to.type == "cuda":
+            # if device_to is set to cuda, use the current device even
+            # if the user specified a device index.
+            current_idx = torch.cuda.current_device()
+            if device_to.index != current_idx:
+                warnings.warn("ShardedTensor.to only move tensor to its current device"
+                              "If you want to put to different device, use `reshard` instead.")
+            device_to = torch.device(current_idx)
+
+        copy_tensor = kwargs.get("copy", False)
+        non_blocking = kwargs.get("non_blocking", False)
+        memory_format = kwargs.get("memory_format", torch.preserve_format)
+        process_group = kwargs.get("process_group", None)
+
+        if not copy_tensor and dtype_to == current_dtype and device_to == current_device:
+            # already have correct dtype and device, return itself
+            return self
+
+        # returns a copy of ShardedTensor on CUDA current device
+        list_shards: List[Shard] = []
+
+        for shard in self._local_shards:
+            new_tensor = shard.tensor.to(  # type: ignore[call-overload]
+                device=device_to,
+                dtype=dtype_to,
+                non_blocking=non_blocking,
+                copy=copy_tensor,
+                memory_format=memory_format
+            )
+            metadata = copy.deepcopy(shard.metadata)
+            if metadata.placement is not None:
+                metadata.placement._device = device_to
+            list_shards.append(Shard(new_tensor, metadata))
+
+        # update metadata
+        st_meta = copy.deepcopy(self.metadata())
+        st_meta.tensor_properties.dtype = dtype_to
+        for meta in st_meta.shards_metadata:
+            meta.placement._device = device_to  # type: ignore[union-attr]
+
+        pg = self._process_group if process_group is None else process_group
+        # we need to use `init_from_local_shards` to communicate between ranks
+        # and update the sharding spec/shards metadata.
+        st_to = ShardedTensor._init_from_local_shards_and_global_metadata(
+            list_shards,
+            sharded_tensor_metadata=st_meta,
+            process_group=pg,
+            init_rrefs=self._init_rrefs
+        )
+        return st_to
+
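+    # --- Editorial usage sketch (not part of the upstream class) -------------
+    # to() above accepts a dtype, a device, or both, mirroring torch.Tensor.to.
+    # A hedged sketch, assuming a ShardedTensor ``st``:
+    #
+    #     st_f64 = st.to(torch.float64)   # casts every local shard
+    #     st_same = st.to(st.device)      # nothing changes, returns st itself
+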
+
+    @classmethod
+    def _init_from_local_shards(
+        cls,
+        local_shards: List[Shard],
+        *global_size,
+        process_group=None,
+        init_rrefs=False,
+    ):
+        # STEP 1: Validate the ShardMetadatas locally
+        process_group = (
+            process_group
+            if process_group is not None
+            else distributed_c10d._get_default_group()
+        )
+        current_rank = dist.get_rank(process_group)
+        world_size = dist.get_world_size(process_group)
+
+        local_sharded_tensor_metadata: Optional[ShardedTensorMetadata] = None
+        global_tensor_size = _flatten_tensor_size(global_size)
+
+        if len(local_shards) > 0:
+            local_sharded_tensor_metadata = \
+                build_metadata_from_local_shards(local_shards, global_tensor_size, current_rank, process_group)
+
+        # STEP 2. Validate metadata across ranks, and build a global sharded tensor
+        # metadata by gathering local ShardedTensorMetadata
+        gathered_metadatas: List[Optional[ShardedTensorMetadata]] = []
+        if world_size > 1:
+            gathered_metadatas = [None for _ in range(world_size)]
+
+            dist.all_gather_object(
+                gathered_metadatas,
+                local_sharded_tensor_metadata,
+                group=process_group
+            )
+        else:
+            gathered_metadatas = [local_sharded_tensor_metadata]
+
+        global_sharded_tensor_metadata = build_global_metadata(gathered_metadatas)
+        tensor_properties = global_sharded_tensor_metadata.tensor_properties
+
+        # STEP 3: Validation done, create the actual ShardedTensor and populate fields
+        # prepare initialization
+        spec = shard_spec._infer_sharding_spec_from_shards_metadata(
+            global_sharded_tensor_metadata.shards_metadata
+        )
+        sharded_tensor = cls.__new__(cls,
+                                     spec,
+                                     global_sharded_tensor_metadata.size,
+                                     dtype=tensor_properties.dtype,
+                                     layout=tensor_properties.layout,
+                                     pin_memory=tensor_properties.pin_memory,
+                                     requires_grad=tensor_properties.requires_grad)
+        sharded_tensor._prepare_init(process_group=process_group, init_rrefs=init_rrefs)
+
+        # attach local_shards to the ShardedTensor created
+        sharded_tensor._local_shards = local_shards
+
+        # run post initialization, i.e. map registration, rpc initialization
+        sharded_tensor._post_init()
+        return sharded_tensor
+
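+    # --- Editorial usage sketch (not part of the upstream class) -------------
+    # _init_from_local_shards above is also a collective: each rank passes only
+    # its own shards plus the global size, and the metadata is all-gathered to
+    # build the global view. A hedged sketch for a 1-D float tensor chunked
+    # into one shard of length 4 per rank (all names are illustrative):
+    #
+    #     rank = dist.get_rank()
+    #     shard_md = ShardMetadata(
+    #         shard_offsets=[rank * 4], shard_sizes=[4],
+    #         placement=f"rank:{rank}/cpu",
+    #     )
+    #     shard = Shard(torch.rand(4), shard_md)
+    #     st = ShardedTensor._init_from_local_shards(
+    #         [shard], dist.get_world_size() * 4
+    #     )
+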
+    @classmethod
+    def _init_from_local_tensor(
+        cls,
+        local_tensor: torch.Tensor,
+        sharding_spec: shard_spec.ShardingSpec,
+        *global_size: Sequence[int],
+        process_group: Optional[dist.ProcessGroup] = None,
+        init_rrefs=False,
+    ) -> ShardedTensor:
+        """
+        Initialize a ShardedTensor given only one local tensor, global sharded tensor
+        size and sharding spec on each rank.
+
+        Args:
+            local_tensor (Tensor): Single tensor of local shard stored in each rank.
+            sharding_spec (:class:`torch.distributed._shard.sharding_spec.ShardingSpec`):
+                The specification describing how to shard the Tensor.
+            global_size (Sequence[int]): Size of the sharded tensor.
+            process_group (ProcessGroup, optional): The process group to aggregate on.
+                Default: None
+            init_rrefs (bool, optional): Whether or not to initialize
+                :class:`torch.distributed.rpc.RRef`s pointing to remote shards.
+                Need to initialize the RPC Framework if specified as ``True``.
+                Default: ``False``.
+
+        Returns:
+            A :class:`ShardedTensor` sharded based on the given sharding_spec with local
+                tensor stored in the current rank.
+
+        Examples:
+            >>> # xdoctest: +SKIP
+            >>> # All tensors below are of torch.int64 type.
+            >>> # We have 2 process groups, 2 ranks.
+            >>> tensor = torch.arange(2, dtype=torch.int64) + 1 + 2 * rank
+            >>> local_tensor = torch.unsqueeze(torch.cat([tensor, tensor + 2]))
+            >>> local_tensor
+            tensor([[1, 2, 3, 4]]) # Rank 0
+            tensor([[3, 4, 5, 6]]) # Rank 1
+            >>> sharding_dim = 0
+            >>> sharding_spec = ChunkShardingSpec(
+                    dim=sharding_dim,
+                    placements=[
+                        "rank:0/cuda:0",
+                        "rank:1/cuda:1",
+                    ],
+                )
+            >>> st = ShardedTensor._init_from_local_tensor(local_tensor, sharding_spec, [2, 4])
+            >>> st
+            ShardedTensor(
+                ShardedTensorMetadata(
+                    shards_metadata=[
+                        ShardMetadata(shard_offsets=[0, 0], shard_sizes=[1, 4], placement=rank:0/cuda:0),
+                        ShardMetadata(shard_offsets=[1, 0], shard_sizes=[1, 4], placement=rank:1/cuda:1),
+                    ],
+                    size=torch.Size([2, 4])
+            )
+            >>> st.local_tensor()
+            tensor([1, 2, 3, 4]) # Rank 0
+            tensor([3, 4, 5, 6]) # Rank 1
+
+        Warning: This API is experimental and subject to change. It lacks full
+                 cross-rank validation; we only validate the local shard on the
+                 current rank and fully rely on the user to ensure the local
+                 tensor is sharded based on the sharding spec.
+        """
+        warnings.warn(DEPRECATE_MSG)
+
+        if not local_tensor.is_contiguous():
+            raise ValueError('local_tensor is not a contiguous Tensor.')
+
+        global_tensor_size = _flatten_tensor_size(global_size)
+        tensor_properties = TensorProperties(
+            dtype=local_tensor.dtype,
+            layout=local_tensor.layout,
+            requires_grad=local_tensor.requires_grad,
+            memory_format=torch.contiguous_format,
+            pin_memory=local_tensor.is_pinned())
+        sharded_tensor_metadata = sharding_spec.build_metadata(
+            global_tensor_size,
+            tensor_properties
+        )
+
+        process_group = (
+            process_group
+            if process_group is not None
+            else distributed_c10d._get_default_group()
+        )
+        current_rank = dist.get_rank(process_group)
+
+        local_shards: List[Shard] = []
+        for shard_metadata in sharded_tensor_metadata.shards_metadata:
+            rank, device = _parse_and_validate_remote_device(process_group, shard_metadata.placement)
+            if rank == current_rank:
+                local_shards.append(Shard(local_tensor, shard_metadata))
+
+        # TODO: figure out how the API should behave when some ranks have no shards
+        # see https://github.com/pytorch/pytorch/issues/7313
+        return ShardedTensor._init_from_local_shards_and_global_metadata(
+            local_shards,
+            sharded_tensor_metadata,
+            process_group=process_group,
+            init_rrefs=init_rrefs,
+            sharding_spec=sharding_spec,
+        )
+
+    @classmethod
+    def _init_from_local_shards_and_global_metadata(  # type: ignore[override]
+        cls,
+        local_shards: List[Shard],
+        sharded_tensor_metadata: ShardedTensorMetadata,
+        process_group=None,
+        init_rrefs=False,
+        sharding_spec=None,
+    ) -> ShardedTensor:
+        """
+        Initialize a ShardedTensor with local shards and a global
+        ShardedTensorMetadata built on each rank.
+
+        Warning: This API is experimental and subject to change. It does
+                 not do cross-rank validation and fully relies on the user
+                 for the correctness of sharded_tensor_metadata on each rank.
+        """
+        process_group = (
+            process_group
+            if process_group is not None
+            else distributed_c10d._get_default_group()
+        )
+        current_rank = dist.get_rank(process_group)
+
+        shards_metadata = sharded_tensor_metadata.shards_metadata
+
+        local_shard_metadatas = []
+
+        # collect local shard metadatas from the global sharded_tensor_metadata
+        for shard_metadata in shards_metadata:  # type: ignore[attr-defined]
+            rank, local_device = _parse_and_validate_remote_device(process_group, shard_metadata.placement)
+
+            if current_rank == rank:
+                local_shard_metadatas.append(shard_metadata)
+
+        if len(local_shards) != len(local_shard_metadatas):
+            raise RuntimeError(
+                f'Number of local shards ({len(local_shards)}) does not match number of local '
+                f'shards metadata in sharded_tensor_metadata ({len(local_shard_metadatas)}) '
+                f'on rank ({current_rank}) '
+            )
+
+        shards_metadata = sharded_tensor_metadata.shards_metadata
+        tensor_properties = sharded_tensor_metadata.tensor_properties
+
+        if len(shards_metadata) == 0:
+            raise ValueError("shards_metadata must not be empty!")
+
+        if tensor_properties.layout != torch.strided:
+            raise ValueError("Only torch.strided layout is currently supported")
+
+        if sharding_spec is None:
+            spec = shard_spec._infer_sharding_spec_from_shards_metadata(shards_metadata)
+        else:
+            spec = sharding_spec
+
+        sharded_tensor = ShardedTensor.__new__(
+            ShardedTensor,
+            spec,
+            sharded_tensor_metadata.size,
+            dtype=tensor_properties.dtype,
+            layout=tensor_properties.layout,
+            pin_memory=tensor_properties.pin_memory,
+            requires_grad=tensor_properties.requires_grad,
+        )
+
+        def _raise_if_mismatch(expected, actual, prop_name, rank, is_property=False):
+            tensor_property_or_metadata = (
+                "tensor property" if is_property else "local ShardMetadata"
+            )
+            if expected != actual:
+                raise ValueError(
+                    f"Local shards' tensor {prop_name} property is incompatible with "
+                    f"{tensor_property_or_metadata} on rank {rank}: "
+                    f"{tensor_property_or_metadata} {prop_name}={expected}, "
+                    f"local shard tensor {prop_name}={actual}."
+                )
+
+        for shard in local_shards:
+            shard_meta = shard.metadata
+            local_shard_tensor = shard.tensor
+            placement = shard_meta.placement
+            assert placement is not None, "Must specify placement for `Shard`!"
+            rank = placement.rank()
+            local_device = placement.device()
+
+            _raise_if_mismatch(
+                tensor_properties.layout,
+                local_shard_tensor.layout,
+                "layout",
+                rank,
+                True,
+            )
+            if not local_shard_tensor.is_contiguous():
+                raise ValueError(
+                    "Only torch.contiguous_format memory_format is currently supported"
+                )
+
+            _raise_if_mismatch(
+                shard_meta.shard_sizes,
+                list(local_shard_tensor.size()),
+                "size",
+                rank,
+            )
+            _raise_if_mismatch(
+                tensor_properties.pin_memory,
+                local_shard_tensor.is_pinned(),
+                "pin_memory",
+                rank,
+                True,
+            )
+            _raise_if_mismatch(local_device, local_shard_tensor.device, "device", rank)
+            _raise_if_mismatch(
+                tensor_properties.dtype,
+                local_shard_tensor.dtype,
+                "dtype",
+                rank,
+                True,
+            )
+            _raise_if_mismatch(
+                tensor_properties.requires_grad,
+                local_shard_tensor.requires_grad,
+                "requires_grad",
+                rank,
+                True,
+            )
+
+        # check if shards_metadata have overlap shards
+        validate_non_overlapping_shards_metadata(shards_metadata)
+
+        # check if the shards_metadata is compatible with overall size of the sharded tensor.
+        check_tensor(shards_metadata, list(sharded_tensor_metadata.size))
+
+        # done validation, add local_shards
+        sharded_tensor._local_shards = local_shards
+        sharded_tensor._prepare_init(process_group=process_group, init_rrefs=init_rrefs)
+
+        # run post initialization, i.e. map registration, rpc initialization
+        sharded_tensor._post_init()
+        return sharded_tensor
+
+    def sharding_spec(self) -> shard_spec.ShardingSpec:
+        """
+        Returns the ShardingSpec for the tensor.
+        """
+        return self._sharding_spec
+
+    def reshard(self, resharding_spec: shard_spec.ShardingSpec) -> ShardedTensor:
+        """
+        Reshard a sharded tensor given the ``resharding_spec``. For now, we only support
+        a single local shard.
+
+        If ``resharding_spec`` is the same as the original one, this becomes a no-op.
+        If ``resharding_spec`` only shares the sharding dim with the original one,
+        we swap the local shards directly.
+        For more generic cases, we merge different shards across different ranks and split
+        the local shards based on the ``resharding_spec`` via `all_to_all` collective API.
+
+        Args:
+            resharding_spec (:class:`torch.distributed._shard.sharding_spec.ShardingSpec`): The
+                specification describing how the tensor is sharded.
+
+        Returns:
+            A :class:`ShardedTensor` object whose local shards are resharded.
+
+        Examples:
+            >>> # xdoctest: +SKIP
+            >>> # We have one process group with 4 ranks.
+            >>> tensor = torch.arange(4, dtype=torch.int64) + 1 + 2 * rank
+            >>> tensor = torch.stack([tensor, tensor])
+            >>> tensor
+            tensor([[1, 2, 3, 4], [1, 2, 3, 4]]) # Rank 0
+            tensor([[3, 4, 5, 6], [3, 4, 5, 6]]) # Rank 1
+            tensor([[5, 6, 7, 8], [5, 6, 7, 8]]) # Rank 2
+            tensor([[7, 8, 9, 10], [7, 8, 9, 10]]) # Rank 3
+            >>> sharding_dim = 0
+            >>> spec = ChunkShardingSpec(
+                    dim=sharding_dim,
+                    placements=[
+                        "rank:0/cuda:0",
+                        "rank:1/cuda:1",
+                        "rank:2/cuda:2",
+                        "rank:3/cuda:3",
+                    ],
+                )
+            >>> current_offsets = [0] * 2
+            >>> current_offsets[0] = rank * 2
+            >>> shard_metadata = ShardMetadata(
+                    shard_offsets=copy.deepcopy(current_offsets),
+                    shard_sizes=tensor.size(),
+                    placement=spec.placements[rank],
+                )
+            >>> local_shards = [
+                    Shard(
+                        tensor=tensor,
+                        metadata=shard_metadata,
+                    )
+                ]
+            >>> st = ShardedTensor._init_from_local_shards(local_shards, tensor.size())
+            >>> sharding_dim = 1
+            >>> resharding_spec = ChunkShardingSpec(
+                    dim=sharding_dim,
+                    placements=[
+                        "rank:0/cuda:0",
+                        "rank:1/cuda:1",
+                        "rank:2/cuda:2",
+                        "rank:3/cuda:3",
+                    ],
+                )
+            >>> st.reshard(resharding_spec)
+            >>> tensor = st.local_shards()[0].tensor
+            >>> tensor
+            tensor([[1], [1], [3], [3], [5], [5], [7], [7]]) # Rank 0
+            tensor([[2], [2], [4], [4], [6], [6], [8], [8]]) # Rank 1
+            tensor([[3], [3], [5], [5], [7], [7], [9], [9]]) # Rank 2
+            tensor([[4], [4], [6], [6], [8], [8], [10], [10]]) # Rank 3
+        """
+        warnings.warn(DEPRECATE_MSG)
+
+        if (
+            not isinstance(resharding_spec, shard_spec.ChunkShardingSpec) or
+            not isinstance(self._sharding_spec, shard_spec.ChunkShardingSpec)
+        ):
+            raise NotImplementedError("Only ChunkShardingSpec supported for reshard.")
+        if (len(self.local_shards()) != 1):
+            raise NotImplementedError("Only single local shard supported for reshard.")
+
+        if self._sharding_spec.dim == resharding_spec.dim:  # type: ignore[attr-defined]
+            if self._sharding_spec.placements == resharding_spec.placements:  # type: ignore[attr-defined]
+                return self
+            else:
+                local_shards, shards_metadata = reshuffle_local_shard(
+                    self.local_tensor(),
+                    self.size(),  # type: ignore[arg-type]
+                    self._sharding_spec,
+                    resharding_spec,
+                    self._process_group,
+                )
+        else:
+            local_shards, shards_metadata = reshard_local_shard(
+                self.local_tensor(),
+                self.size(),  # type: ignore[arg-type]
+                self._sharding_spec,
+                resharding_spec,
+                self._process_group,
+            )
+        self._local_shards = local_shards
+        self._metadata.shards_metadata = shards_metadata
+        self._sharding_spec = resharding_spec
+        return self
+
+    def local_tensor(self) -> torch.Tensor:
+        """
+        Return the local tensor of a sharded_tensor. For now, we only support a single local shard.
+
+        Returns:
+            A :class:`torch.Tensor` of the local shard.
+        """
+        if len(self.local_shards()) != 1:
+            raise NotImplementedError("Only single local shard is supported.")
+        return self.local_shards()[0].tensor
+
+    @classmethod
+    def __torch_function__(cls, func, types, args=(), kwargs=None):
+        def dispatch(st: ShardedTensor, func: Callable):
+            # Dispatch to custom user provided op first if it exists.
+            if func in _CUSTOM_SHARDED_OPS:
+                return _CUSTOM_SHARDED_OPS[func](types, args, kwargs, st._process_group)
+
+            # Dispatch to custom sharding spec op if it has one.
+            if _has_custom_op(st._sharding_spec, func):
+                return _dispatch_custom_op(
+                    st._sharding_spec,
+                    func,
+                    types,
+                    args,
+                    kwargs,
+                    st._process_group
+                )
+
+            if func in _SHARDED_OPS:
+                return _SHARDED_OPS[func](types, args, kwargs, st._process_group)
+
+            raise RuntimeError(
+                f"torch function '{func.__name__}', with args: {args} and "
+                f"kwargs: {kwargs} not supported for ShardedTensor!")
+
+        warnings.warn(DEPRECATE_MSG)
+        # Find ShardedTensor instance to get process_group and sharding_spec.
+        st_instance = None
+
+        def find_sharded_tensor(e):
+            nonlocal st_instance
+            if st_instance is None and isinstance(e, ShardedTensor):
+                st_instance = e
+
+        pytree.tree_map_(find_sharded_tensor, args)
+        pytree.tree_map_(find_sharded_tensor, kwargs)
+
+        if st_instance is not None:
+            return dispatch(st_instance, func)
+
+        raise RuntimeError(
+            f"torch function '{func.__name__}', with args: {args} and "
+            f"kwargs: {kwargs} not supported for ShardedTensor!")
+
+    def is_pinned(self) -> bool:  # type: ignore[override]
+        """
+        Returns True if the sharded tensor (each local shard) resides in pinned memory.
+        """
+        return self._metadata.tensor_properties.pin_memory
+
+    def _register_remote_shards(self, remote_shards: List[rpc.RRef[Shard]], rpc_rank: int):
+        self._remote_shards[rpc_rank] = remote_shards
+
+    def remote_shards(self) -> Dict[int, List[rpc.RRef[Shard]]]:
+        """
+        Returns a Dict[int, RRef] with keys being the RPC rank and values
+        being RRefs to shards on that rank. The RPC framework needs to be
+        initialized for this functionality.
+
+        Raises an exception if ShardedTensor was created with ``init_rrefs=False``
+        """
+        if not self._init_rrefs:
+            raise RuntimeError(
+                'ShardedTensor created with init_rrefs=False, no RRefs to remote shards available'
+            )
+        return self._remote_shards
+
+    def __hash__(self):
+        return id(self)
+
+    def __repr__(self):
+        return f'ShardedTensor({self._metadata})'
+
+    @dataclass
+    class ProcessGroupState:
+        """
+        State used for serialization/deserialization of the process group.
+        """
+        local_rank: int
+        global_rank: int
+        local_world_size: int
+        global_world_size: int
+
+    def __getstate__(self):
+        pg_state = ShardedTensor.ProcessGroupState(
+            distributed_c10d.get_rank(self._process_group),
+            distributed_c10d.get_rank(),
+            distributed_c10d.get_world_size(self._process_group),
+            distributed_c10d.get_world_size(),
+        )
+
+        return self._local_shards, self._metadata, pg_state, self._sharding_spec, self._init_rrefs
+
+    def __setstate__(self, state):
+        self._sharded_tensor_id = None
+        if not distributed_c10d.is_initialized():
+            raise RuntimeError(
+                'Need to initialize default process group using '
+                '"init_process_group" before loading ShardedTensor')
+
+        self._local_shards, self._metadata, pg_state, self._sharding_spec, self._init_rrefs = state
+
+        # Setup process group
+        from torch.distributed._shard.api import _get_current_process_group
+        self._process_group = _get_current_process_group()
+
+        # Validate process group.
+        local_rank = distributed_c10d.get_rank(self._process_group)
+        if pg_state.local_rank != local_rank:
+            raise RuntimeError(
+                f'Local rank at save time was {pg_state.local_rank}, but at '
+                f'load time was {local_rank}')
+
+        global_rank = distributed_c10d.get_rank()
+        if pg_state.global_rank != global_rank:
+            raise RuntimeError(
+                f'Global rank at save time was {pg_state.global_rank}, but at '
+                f'load time was {global_rank}')
+
+        local_world_size = distributed_c10d.get_world_size(self._process_group)
+        if pg_state.local_world_size != local_world_size:
+            raise RuntimeError(
+                f'Local world size at save time was {pg_state.local_world_size}, '
+                f'but at load time was {local_world_size}')
+
+        global_world_size = distributed_c10d.get_world_size()
+        if pg_state.global_world_size != global_world_size:
+            raise RuntimeError(
+                f'Global world size at save time was {pg_state.global_world_size}, '
+                f'but at load time was {global_world_size}')
+
+        self._post_init()
+
+
+def _create_tensor_from_params(*size, local_device, tensor_properties: TensorProperties):
+    """ Helper to construct tensor from size, device and common params. """
+    dtype = tensor_properties.dtype
+    layout = tensor_properties.layout
+    requires_grad = tensor_properties.requires_grad
+    memory_format = tensor_properties.memory_format
+    pin_memory = tensor_properties.pin_memory
+
+    return torch.empty(
+        *size, dtype=dtype, layout=layout,
+        device=local_device, requires_grad=requires_grad,
+        memory_format=memory_format, pin_memory=pin_memory
+    )
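+
+
+# A minimal, single-process sketch of how a TensorProperties bundle drives
+# _create_tensor_from_params: the helper simply forwards dtype/layout/device/
+# memory_format/pin_memory to torch.empty. The shape and CPU device below are
+# illustrative assumptions, not a required configuration.
+if __name__ == "__main__":
+    _example_props = TensorProperties(dtype=torch.float32, requires_grad=True)
+    _example_tensor = _create_tensor_from_params(
+        2, 4, local_device=torch.device("cpu"), tensor_properties=_example_props
+    )
+    assert _example_tensor.shape == (2, 4) and _example_tensor.requires_grad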
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/logger.py b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..99e4a17b6a205060684617d12da849fccc1eee1a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/logger.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from typing import List, Tuple
+
+from torch.distributed._shard.sharded_tensor.logging_handlers import (
+    _log_handlers,
+)
+
+__all__: List[str] = []
+
+
+def _get_or_create_logger() -> logging.Logger:
+    logging_handler, log_handler_name = _get_logging_handler()
+    logger = logging.getLogger(f"sharding-spec-{log_handler_name}")
+    logger.setLevel(logging.DEBUG)
+    formatter = logging.Formatter(
+        "%(asctime)s %(filename)s:%(lineno)s %(levelname)s p:%(processName)s t:%(threadName)s: %(message)s"
+    )
+    logging_handler.setFormatter(formatter)
+    logger.propagate = False
+    logger.addHandler(logging_handler)
+    return logger
+
+
+def _get_logging_handler(
+    destination: str = "default",
+) -> Tuple[logging.Handler, str]:
+    log_handler = _log_handlers[destination]
+    log_handler_name = type(log_handler).__name__
+    return (log_handler, log_handler_name)
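+
+
+# A minimal sketch of how these helpers fit together: the logger name is
+# derived from the handler type, and with the default NullHandler destination
+# the record below is simply discarded. The message text is illustrative.
+if __name__ == "__main__":
+    _logger = _get_or_create_logger()
+    _logger.debug("sharded tensor logging is wired up")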
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/logging_handlers.py b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/logging_handlers.py
new file mode 100644
index 0000000000000000000000000000000000000000..a775863e0b06b2f7597cd9cae85d19110271a1f6
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/logging_handlers.py
@@ -0,0 +1,16 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from typing import Dict, List
+
+__all__: List[str] = []
+
+_log_handlers: Dict[str, logging.Handler] = {
+    "default": logging.NullHandler(),
+}
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/metadata.py b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/metadata.py
new file mode 100644
index 0000000000000000000000000000000000000000..e917f1156eb3fb8c26f9bfbbd1a42850224bc639
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/metadata.py
@@ -0,0 +1,82 @@
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import List
+
+import torch
+from torch.distributed._shard.metadata import ShardMetadata
+
+class MEM_FORMAT_ENCODING(Enum):
+    TORCH_CONTIGUOUS_FORMAT = 0
+    TORCH_CHANNELS_LAST = 1
+    TORCH_PRESERVE_FORMAT = 2
+
+@dataclass
+class TensorProperties:
+    """ Properties used to create :class:`Tensor` """
+
+    # Regular tensor fields
+    dtype: torch.dtype = field(default=torch.get_default_dtype())
+    layout: torch.layout = field(default=torch.strided)
+    requires_grad: bool = False
+    memory_format: torch.memory_format = field(default=torch.contiguous_format)
+    pin_memory: bool = False
+
+    def __getstate__(self):
+        # torch.memory_format cannot be pickled, so encode it as an enum value.
+        memory_format = self.memory_format
+        if memory_format == torch.contiguous_format:
+            mem_format_encoding = MEM_FORMAT_ENCODING.TORCH_CONTIGUOUS_FORMAT
+        elif memory_format == torch.channels_last:
+            mem_format_encoding = MEM_FORMAT_ENCODING.TORCH_CHANNELS_LAST
+        elif memory_format == torch.preserve_format:
+            mem_format_encoding = MEM_FORMAT_ENCODING.TORCH_PRESERVE_FORMAT
+        else:
+            raise RuntimeError(f'Invalid torch.memory_format: {memory_format}')
+
+        return (
+            self.dtype,
+            self.layout,
+            self.requires_grad,
+            mem_format_encoding,
+            self.pin_memory,
+        )
+
+    def __setstate__(
+        self,
+        state,
+    ):
+        (self.dtype, self.layout, self.requires_grad, mem_format_encoding, self.pin_memory) = state
+
+        if mem_format_encoding == MEM_FORMAT_ENCODING.TORCH_CONTIGUOUS_FORMAT:
+            memory_format = torch.contiguous_format
+        elif mem_format_encoding == MEM_FORMAT_ENCODING.TORCH_CHANNELS_LAST:
+            memory_format = torch.channels_last
+        elif mem_format_encoding == MEM_FORMAT_ENCODING.TORCH_PRESERVE_FORMAT:
+            memory_format = torch.preserve_format
+        else:
+            raise RuntimeError(f'Invalid torch.memory_format encoding: {mem_format_encoding}')
+
+        self.memory_format = memory_format
+
+    @staticmethod
+    def create_from_tensor(tensor: torch.Tensor) -> "TensorProperties":
+        return TensorProperties(
+            dtype=tensor.dtype,
+            layout=tensor.layout,
+            requires_grad=tensor.requires_grad,
+            memory_format=torch.contiguous_format,
+            pin_memory=tensor.is_pinned()
+        )
+
+@dataclass
+class ShardedTensorMetadata:
+    """
+    Represents metadata for :class:`ShardedTensor`
+    """
+
+    # Metadata about each shard of the Tensor
+    shards_metadata: List[ShardMetadata] = field(default_factory=list)
+
+    # Size of each dim of the overall Tensor.
+    size: torch.Size = field(default=torch.Size([]))
+
+    tensor_properties: TensorProperties = field(default_factory=TensorProperties)
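+
+
+# A minimal, single-process sketch of why TensorProperties defines
+# __getstate__/__setstate__: torch.memory_format is not picklable, so it is
+# round-tripped through MEM_FORMAT_ENCODING. The dtype and memory_format
+# chosen below are illustrative assumptions.
+if __name__ == "__main__":
+    import pickle
+
+    _props = TensorProperties(dtype=torch.float16, memory_format=torch.channels_last)
+    _restored = pickle.loads(pickle.dumps(_props))
+    assert _restored.dtype == torch.float16
+    assert _restored.memory_format == torch.channels_last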
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/reshard.py b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/reshard.py
new file mode 100644
index 0000000000000000000000000000000000000000..91a1bd254327e83b6812ba6d5b2484913394c027
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/reshard.py
@@ -0,0 +1,248 @@
+import copy
+from typing import List, Tuple
+
+import torch
+import torch.distributed as dist
+from torch._C._distributed_c10d import (
+    ProcessGroup,
+)
+import torch.distributed._shard.sharding_spec as shard_spec
+from torch.distributed._shard.sharding_spec._internals import (
+    get_split_size,
+    get_chunked_dim_size,
+)
+from torch.distributed.nn.functional import (
+    all_to_all,
+    all_to_all_single,
+)
+from torch.distributed._shard.metadata import ShardMetadata
+
+from .shard import Shard
+
+
+def get_idx_from_placements(placements, current_rank) -> int:
+    """
+    Return the position of the current rank in the given placements.
+
+    Args:
+        placements(List[Union[_remote_device, str]]):
+            Specifies the placement of each shard of the Tensor. The size of
+            the list represents the number of shards to be created. This could
+            be a list of
+            :class:`torch.distributed._remote_device`'s. This list
+            could also contain a string which represents remote
+            device as accepted by
+            :class:`torch.distributed._remote_device`
+        current_rank (int): rank of the current process.
+
+    Returns:
+        An int which is the position of the current rank in the placement list.
+    """
+    for idx, placement in enumerate(placements):  # type: ignore[attr-defined]
+        if current_rank == placement.rank():  # type: ignore[union-attr]
+            return idx
+    raise RuntimeError('current_rank not in the placement.')
+
+
+def build_reshard_metadata(
+    st_size: torch.Size,
+    sharding_spec: shard_spec.ShardingSpec,
+    world_size: int,
+) -> Tuple[List[ShardMetadata], List[int]]:
+    """
+    Based on the given sharding spec, we calculate the offset and local shard size.
+    We then build a ShardMetadata on top of the calculation result.
+
+    Args:
+        st_size (torch.Size): The size of the sharded tensor.
+        sharding_spec (:class:`torch.distributed._shard.sharding_spec.ShardingSpec`): The
+            specification describing how the tensor is sharded.
+        world_size (int): number of ranks.
+
+    Returns:
+        A Tuple of the following:
+            A List[`ShardMetadata`] which contains the metadata for the shard, including
+                offsets, lengths and device placement.
+            A List[int] which contains the ranks in the order of placement.
+    """
+    shard_dim = int(sharding_spec.dim)  # type: ignore[attr-defined]
+    shards_metadata = [None] * world_size
+    ranks = []
+    offsets = [0] * len(st_size)
+    split_size = get_split_size(st_size[shard_dim], world_size)
+    for idx, placement in enumerate(sharding_spec.placements):  # type: ignore[attr-defined]
+        ranks.append(placement.rank())
+        sharded_dim_size = get_chunked_dim_size(st_size[shard_dim], split_size, idx)
+        local_tensor_size = list(st_size)
+        local_tensor_size[shard_dim] = sharded_dim_size
+        shards_metadata[placement.rank()] = ShardMetadata(  # type: ignore[call-overload]
+            shard_offsets=copy.deepcopy(offsets),
+            shard_sizes=local_tensor_size,
+            placement=placement,
+        )
+        offsets[shard_dim] += sharded_dim_size
+    return shards_metadata, ranks  # type: ignore[return-value]
+
+
+def reshuffle_local_shard(
+    local_shard: torch.Tensor,
+    st_size: torch.Size,
+    sharding_spec: shard_spec.ShardingSpec,
+    resharding_spec: shard_spec.ShardingSpec,
+    pg: ProcessGroup,
+) -> Tuple[List[Shard], List[ShardMetadata]]:
+    """
+    Reshuffle the local shard directly when the reshard dim is the same as the original
+    sharding dim. Logically, we do this in two steps:
+    1. Collect all shards based on the original sharding spec.
+    2. Reshard the tensor based on the given resharding spec.
+
+    In reality, we consolidate the two steps into one by sending the local tensor to
+    the new shard directly based on the resharding spec.
+
+    Args:
+        local_shard (Tensor): Local tensor stored in the current rank.
+        st_size (torch.Size): The size of the sharded tensor.
+        sharding_spec (:class:`torch.distributed._shard.sharding_spec.ShardingSpec`): The
+            specification describing how the tensor is sharded originally.
+        resharding_spec (:class:`torch.distributed._shard.sharding_spec.ShardingSpec`): The
+            specification describing how the tensor will be resharded.
+        pg (ProcessGroup): The process group to aggregate on.
+
+    Returns:
+        A Tuple of the following:
+            A List[`Shard`] which contains the local tensor and its metadata.
+            A List[`ShardMetadata`] which contains the metadata for the shard, including
+                offsets, lengths and device placement.
+    """
+    current_rank = dist.get_rank(pg)
+    world_size = dist.get_world_size(pg)
+    # Build shards_metadata first.
+    shards_metadata, ranks = build_reshard_metadata(
+        st_size, resharding_spec, world_size
+    )
+    # Get input split size for all2all.
+    reshard_dim = int(resharding_spec.dim)  # type: ignore[attr-defined]
+    split_size = get_split_size(st_size[reshard_dim], world_size)
+    input_split_sizes = [0] * world_size
+    idx = get_idx_from_placements(sharding_spec.placements, current_rank)  # type: ignore[attr-defined]
+    new_rank = resharding_spec.placements[idx].rank()  # type: ignore[union-attr, attr-defined]
+    input_split_sizes[new_rank] = local_shard.size(reshard_dim)
+    # Get output split size for all2all.
+    output_split_sizes = [0] * world_size
+    new_idx = ranks.index(current_rank)
+    sharded_dim_size = get_chunked_dim_size(st_size[reshard_dim], split_size, new_idx)
+    output_split_sizes[new_rank] = sharded_dim_size
+    # Get gathered_input for all2all.
+    local_shard = local_shard.transpose(0, reshard_dim).contiguous()
+    gathered_input_size = list(local_shard.size())
+    gathered_input_size[0] = sharded_dim_size
+    gathered_input = torch.empty(gathered_input_size, device=local_shard.device, dtype=local_shard.dtype)
+    # all2all.
+    local_shard = all_to_all_single(
+        gathered_input,
+        local_shard,
+        input_split_sizes=input_split_sizes,
+        output_split_sizes=output_split_sizes,
+        group=pg,
+    )
+    local_tensor = local_shard.transpose(0, reshard_dim).contiguous()
+    local_shards = [Shard(local_tensor, shards_metadata[current_rank])]
+    return local_shards, shards_metadata
+
+
+def reshard_local_shard(
+    local_tensor: torch.Tensor,
+    st_size: torch.Size,
+    sharding_spec: shard_spec.ShardingSpec,
+    resharding_spec: shard_spec.ShardingSpec,
+    pg: ProcessGroup,
+) -> Tuple[List[Shard], List[ShardMetadata]]:
+    """
+    Reshard a sharded tensor given the ``resharding_spec``. When the reshard dim is
+    different from the original sharding dim, we need to do two steps logically:
+    1. Collect all shards based on the original sharding spec.
+    2. Reshard the tensor based on the given resharding spec.
+
+    In reality, we consolidate the two steps into one by sending each rank the new
+    shard based on the resharding spec.
+
+    Args:
+        local_tensor (Tensor): Local tensor stored in the current rank.
+        st_size (torch.Size): The size of the sharded tensor.
+        sharding_spec (:class:`torch.distributed._shard.sharding_spec.ShardingSpec`): The
+            specification describing how the tensor is sharded originally.
+        resharding_spec (:class:`torch.distributed._shard.sharding_spec.ShardingSpec`): The
+            specification describing how the tensor will be resharded.
+        pg (ProcessGroup): The process group to aggregate on.
+
+    Returns:
+        A Tuple of the following:
+            A List[`Shard`] which contains the local tensor and its metadata.
+            A List[`ShardMetadata`] which contains the metadata for the shard, including
+                offsets, lengths and device placement.
+    """
+    current_rank = dist.get_rank(pg)
+    world_size = dist.get_world_size(pg)
+    current_sharding_dim = int(sharding_spec.dim)  # type: ignore[attr-defined]
+    reshard_dim = int(resharding_spec.dim)  # type: ignore[attr-defined]
+
+    # Build shards_metadata first.
+    shards_metadata, ranks = build_reshard_metadata(
+        st_size, resharding_spec, world_size
+    )
+
+    # Compute expected size
+    input_split_sizes = []
+    for metadata in shards_metadata:
+        input_split_sizes.append(metadata.shard_sizes[reshard_dim])
+    rearrange_input = any(ranks[i] > ranks[i + 1] for i in range(len(ranks) - 1))
+
+    if rearrange_input:
+        # Need to re-arrange reshard_dim of local_tensor before all2all.
+        indices: List[int] = []
+        for metadata in shards_metadata:
+            offset_start_idx = metadata.shard_offsets[reshard_dim]
+            split_size = metadata.shard_sizes[reshard_dim]
+            indices += range(offset_start_idx, offset_start_idx + split_size)
+        local_tensor = local_tensor.index_select(
+            reshard_dim, torch.tensor(indices, device=local_tensor.device)
+        )
+
+    # Because reshard_dim != original shard_dim. We need to compute the
+    # size of tensor from each rank.
+    output_tensor_list = [torch.tensor(1)] * world_size
+    split_size = get_split_size(st_size[current_sharding_dim], world_size)
+    rearrange_output_list = False
+    indices = []
+    for idx, placement in enumerate(sharding_spec.placements):  # type: ignore[attr-defined]
+        sharded_dim_size = get_chunked_dim_size(
+            st_size[current_sharding_dim], split_size, idx
+        )
+        output_tensor_size = list(st_size)
+        output_tensor_size[current_sharding_dim] = sharded_dim_size
+        output_tensor_size[reshard_dim] = input_split_sizes[current_rank]
+        output_tensor_list[
+            placement.rank()
+        ] = torch.empty(  # type: ignore[union-attr, index]
+            output_tensor_size, device=local_tensor.device, dtype=local_tensor.dtype
+        )
+        indices.append(placement.rank())  # type: ignore[union-attr, index, arg-type]
+        if idx != placement.rank():  # type: ignore[union-attr]
+            rearrange_output_list = True
+
+    # Perform autograd enabled all2all.
+    input_tensor_tuple = torch.split(local_tensor, input_split_sizes, dim=reshard_dim)
+    input_tensor_list = [tensor.contiguous() for tensor in input_tensor_tuple]
+    output_tensor_list = all_to_all(
+        output_tensor_list,
+        input_tensor_list,
+        group=pg,
+    )
+
+    if rearrange_output_list:
+        # Need to re-arrange original shard_dim of output_tensor_list.
+        output_tensor_list = [output_tensor_list[idx] for idx in indices]  # type: ignore[call-overload]
+    local_tensor = torch.cat(output_tensor_list, dim=current_sharding_dim)
+    local_shards = [Shard(local_tensor, shards_metadata[current_rank])]
+    return local_shards, shards_metadata
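+
+
+# A minimal, single-process sketch (no collectives are issued) of what
+# build_reshard_metadata computes: for a hypothetical 8x4 tensor chunked on
+# dim 0 across two CPU placements, each rank gets a 4x4 shard and the offsets
+# advance by the chunk size. The placements below are illustrative assumptions.
+if __name__ == "__main__":
+    _spec = shard_spec.ChunkShardingSpec(dim=0, placements=["rank:0/cpu", "rank:1/cpu"])
+    _metas, _ranks = build_reshard_metadata(torch.Size([8, 4]), _spec, world_size=2)
+    assert _ranks == [0, 1]
+    assert [m.shard_offsets for m in _metas] == [[0, 0], [4, 0]]
+    assert [m.shard_sizes for m in _metas] == [[4, 4], [4, 4]]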
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/shard.py b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/shard.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d0768bc9a05f1e9a159f8602e33b26c18c253ba
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/shard.py
@@ -0,0 +1,58 @@
+from dataclasses import dataclass
+from typing import List
+
+import torch
+from torch.distributed._shard.metadata import ShardMetadata
+from torch.distributed.remote_device import _remote_device
+
+
+@dataclass
+class Shard:
+    """
+    Container which holds the data for a shard as a Tensor and also
+    the associated metadata for that shard.
+
+    Args:
+        tensor(torch.Tensor): Local tensor for the shard.
+        metadata(:class `torch.distributed._shard.sharded_tensor.ShardMetadata`):
+            The metadata for the shard, including offsets, lengths and device placement.
+    """
+    __slots__ = ['tensor', 'metadata']
+    tensor: torch.Tensor
+    metadata: ShardMetadata
+
+    def __post_init__(self):
+        # verification between local tensor and metadata
+        if list(self.tensor.size()) != self.metadata.shard_sizes:
+            raise ValueError(
+                "Shard tensor size does not match with metadata.shard_lengths! "
+                f"Found shard tensor size: {list(self.tensor.size())}, "
+                f"metadata.shard_lengths: {self.metadata.shard_sizes}, "
+            )
+        placement_device = self.metadata.placement
+        if placement_device is not None and placement_device.device() != self.tensor.device:
+            raise ValueError(
+                f"Local shard tensor device does not match with local Shard's placement! "
+                f"Found local shard tensor device: {self.tensor.device}, "
+                f"local shard metadata placement device: {placement_device.device()}"
+            )
+
+    @classmethod
+    def from_tensor_and_offsets(cls, tensor: torch.Tensor, shard_offsets: List[int], rank: int):
+        """
+        Creates a Shard of a ShardedTensor from a local torch.Tensor, shard_offsets and rank.
+
+        Args:
+            tensor(torch.Tensor): Local tensor for the shard.
+            shard_offsets(List[int]): List of integers specifying the offset
+                of the shard on each dimension.
+            rank(int): The rank of the shard.
+        """
+        shard_sizes = list(tensor.size())
+        placement = _remote_device(f"rank:{rank}/{str(tensor.device)}")
+        shard_meta = ShardMetadata(
+            shard_offsets=shard_offsets,
+            shard_sizes=shard_sizes,
+            placement=placement
+        )
+        return Shard(tensor, shard_meta)
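+
+
+# A minimal, single-process sketch of Shard.from_tensor_and_offsets: the
+# ShardMetadata is derived from the tensor's size and device plus the given
+# offsets and rank. The rank-0 CPU shard below is an illustrative assumption.
+if __name__ == "__main__":
+    _local = torch.zeros(2, 4)
+    _shard = Shard.from_tensor_and_offsets(_local, shard_offsets=[0, 0], rank=0)
+    assert _shard.metadata.shard_sizes == [2, 4]
+    assert _shard.metadata.placement.rank() == 0
+    assert _shard.metadata.placement.device() == torch.device("cpu")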
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/utils.py b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..343af02c96600212fa98ec0f5b773e719b909228
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/sharded_tensor/utils.py
@@ -0,0 +1,211 @@
+import collections.abc
+import copy
+from typing import Optional, List, Sequence
+
+import torch
+from torch.distributed import distributed_c10d
+from torch.distributed import rpc
+from torch.distributed._shard.sharding_spec._internals import (
+    check_tensor,
+    validate_non_overlapping_shards_metadata,
+)
+
+from torch.distributed._shard.metadata import ShardMetadata
+from .metadata import TensorProperties, ShardedTensorMetadata
+from .shard import Shard
+
+def _parse_and_validate_remote_device(pg, remote_device):
+    if remote_device is None:
+        raise ValueError("remote device is None")
+
+    worker_name = remote_device.worker_name()
+    rank = remote_device.rank()
+    device = remote_device.device()
+
+    # Validate rank, skip validation if rank is not part of process group.
+    if not distributed_c10d._rank_not_in_group(pg):
+        if rank is not None and (rank < 0 or rank >= distributed_c10d.get_world_size(pg)):
+            raise ValueError(f'Invalid rank: {rank}')
+
+    if worker_name is not None:
+        if not rpc._is_current_rpc_agent_set():
+            raise RuntimeError(f'RPC framework needs to be initialized for using worker names: {worker_name}')
+
+        workers = rpc._get_current_rpc_agent().get_worker_infos()
+        for worker in workers:
+            if worker.name == worker_name:
+                return worker.id, device
+
+        raise ValueError(f'Invalid worker name: {worker_name}')
+
+    return rank, device
+
+def _validate_output_tensor_for_gather(
+    my_rank: int,
+    dst_rank: int,
+    size: torch.Size,
+    dst_tensor: Optional[torch.Tensor],
+) -> None:
+    if dst_rank == my_rank:
+        if dst_tensor is None:
+            raise ValueError(
+                f"Argument ``dst_tensor`` must be specified on destination rank {dst_rank}"
+            )
+        if tuple(size) != tuple(dst_tensor.size()):
+            raise ValueError(
+                f"Argument ``dst_tensor`` has size {tuple(dst_tensor.size())}, "
+                f"but should be {tuple(size)}"
+            )
+    elif dst_tensor is not None:
+        raise ValueError(
+            "Argument ``dst_tensor`` must NOT be specified "
+            "on non-destination ranks."
+        )
+
+def _flatten_tensor_size(size) -> torch.Size:
+    """
+    Checks if the tensor size is valid, then flattens it and returns a torch.Size object.
+    """
+    if len(size) == 1 and isinstance(size[0], collections.abc.Sequence):
+        dims = list(*size)
+    else:
+        dims = list(size)
+
+    for dim in dims:
+        if not isinstance(dim, int):
+            raise TypeError(f'size has to be a sequence of ints, found: {dims}')
+
+    return torch.Size(dims)
+
+def _raise_if_mismatch(expected, actual, prop_name, ranks, is_local=True):
+    if is_local:
+        assert isinstance(ranks, int)
+        if expected != actual:
+            raise ValueError(f"Local shards' tensor {prop_name} property need to be the same on rank:{ranks}! "
+                             f"Found one local shard tensor {prop_name}={expected}, "
+                             f"the other local shard tensor {prop_name}={actual}.")
+    else:
+        # cross-rank comparison; the ranks list should contain exactly two ranks
+        assert len(ranks) == 2
+        if expected != actual:
+            raise ValueError(f"ShardedTensor {prop_name} property does not match from different ranks! "
+                             f"Found {prop_name}={expected} on rank:{ranks[0]}, "
+                             f"and {prop_name}={actual} on rank:{ranks[1]}.")
+
+
+def build_metadata_from_local_shards(
+    local_shards: List[Shard],
+    global_size: torch.Size,
+    current_rank: int,
+    pg: distributed_c10d.ProcessGroup
+) -> ShardedTensorMetadata:
+
+    assert len(local_shards) > 0, "must have local shards!"
+    local_shard_metadatas: List[ShardMetadata] = []
+
+    first_shard_dtype = local_shards[0].tensor.dtype
+    first_shard_layout = local_shards[0].tensor.layout
+    first_shard_requires_grad = local_shards[0].tensor.requires_grad
+    first_shard_is_pinned = local_shards[0].tensor.is_pinned()
+
+    # 1). Validate local tensors and associated metadatas
+    for local_shard in local_shards:
+        local_shard_tensor = local_shard.tensor
+        local_shard_meta = local_shard.metadata
+        local_shard_metadatas.append(local_shard_meta)
+        rank, local_device = _parse_and_validate_remote_device(pg, local_shard_meta.placement)
+
+        if local_shard_tensor.layout != torch.strided or local_shard_tensor.layout != first_shard_layout:
+            raise ValueError(
+                f'Only torch.strided layout is currently supported, but found '
+                f'{local_shard_tensor.layout} on rank:{current_rank}!'
+            )
+
+        if not local_shard_tensor.is_contiguous():
+            raise ValueError('Only torch.contiguous_format memory_format is currently supported!')
+
+        if rank != current_rank:
+            raise ValueError(
+                f"Local shard metadata's rank does not match with the rank in its process group! "
+                f'Found current rank in the process group: {current_rank}, '
+                f"local ShardMetadata placement's rank: {rank}"
+            )
+        if local_shard_tensor.device != local_device:
+            raise ValueError(
+                f"Local shard tensor device does not match with local Shard's placement! "
+                f"Found local shard tensor device: {local_shard_tensor.device}, "
+                f"local shard metadata placement device: {local_device}"
+            )
+
+        _raise_if_mismatch(local_shard_meta.shard_sizes, list(local_shard_tensor.size()), "size", current_rank)
+        _raise_if_mismatch(local_shard_tensor.is_pinned(), first_shard_is_pinned, "pin_memory", current_rank)
+        _raise_if_mismatch(local_shard_tensor.dtype, first_shard_dtype, "dtype", current_rank)
+        _raise_if_mismatch(local_shard_tensor.requires_grad, first_shard_requires_grad, "requires_grad", current_rank)
+
+    # 2). Build a "local" ShardedTensorMetadata with all local shards on this rank, then
+    #    do all_gather to collect local_sharded_tensor_metadata from all ranks
+    local_tensor_properties = TensorProperties(
+        dtype=first_shard_dtype,
+        layout=first_shard_layout,
+        requires_grad=first_shard_requires_grad,
+        memory_format=torch.contiguous_format,
+        pin_memory=first_shard_is_pinned
+    )
+
+    local_sharded_tensor_metadata = ShardedTensorMetadata(
+        shards_metadata=local_shard_metadatas,
+        size=global_size,
+        tensor_properties=local_tensor_properties)
+
+    return local_sharded_tensor_metadata
+
+
+def build_global_metadata(gathered_metadatas: Sequence[Optional[ShardedTensorMetadata]]):
+    global_sharded_tensor_metadata = None
+    global_metadata_rank = 0
+
+    for rank, rank_metadata in enumerate(gathered_metadatas):
+        if rank_metadata is None:
+            continue
+
+        if global_sharded_tensor_metadata is None:
+            global_sharded_tensor_metadata = copy.deepcopy(rank_metadata)
+            global_metadata_rank = rank
+        else:
+            _raise_if_mismatch(global_sharded_tensor_metadata.size,
+                               rank_metadata.size,
+                               "global_size",
+                               [global_metadata_rank, rank],
+                               is_local=False)
+
+            # don't need to check layout and memory format as we already checked in local shards validation stage
+            _raise_if_mismatch(global_sharded_tensor_metadata.tensor_properties.dtype,
+                               rank_metadata.tensor_properties.dtype,
+                               "dtype",
+                               [global_metadata_rank, rank],
+                               is_local=False)
+
+            _raise_if_mismatch(global_sharded_tensor_metadata.tensor_properties.requires_grad,
+                               rank_metadata.tensor_properties.requires_grad,
+                               "requires_grad",
+                               [global_metadata_rank, rank],
+                               is_local=False)
+
+            _raise_if_mismatch(global_sharded_tensor_metadata.tensor_properties.pin_memory,
+                               rank_metadata.tensor_properties.pin_memory,
+                               "pin_memory",
+                               [global_metadata_rank, rank],
+                               is_local=False)
+            # pass all validations, extend shards metadata
+            global_sharded_tensor_metadata.shards_metadata.extend(rank_metadata.shards_metadata)
+
+    if global_sharded_tensor_metadata is not None:
+        # check if shards_metadata have overlap shards
+        validate_non_overlapping_shards_metadata(global_sharded_tensor_metadata.shards_metadata)
+
+        # check if the shards_metadata is compatible with global size of the sharded tensor.
+        check_tensor(global_sharded_tensor_metadata.shards_metadata, global_sharded_tensor_metadata.size)
+    else:
+        raise ValueError("ShardedTensor have no local shards on all ranks!")
+
+    return global_sharded_tensor_metadata
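+
+
+# A minimal, single-process sketch of build_global_metadata: two per-rank
+# metadata entries (hypothetical rank:0/cpu and rank:1/cpu shards of a 4x4
+# tensor) are merged, checked for overlap, and validated against the global
+# size. No collectives run here; the shapes and placements are assumptions.
+if __name__ == "__main__":
+    _meta_rank0 = ShardedTensorMetadata(
+        shards_metadata=[
+            ShardMetadata(shard_offsets=[0, 0], shard_sizes=[2, 4], placement="rank:0/cpu")
+        ],
+        size=torch.Size([4, 4]),
+    )
+    _meta_rank1 = ShardedTensorMetadata(
+        shards_metadata=[
+            ShardMetadata(shard_offsets=[2, 0], shard_sizes=[2, 4], placement="rank:1/cpu")
+        ],
+        size=torch.Size([4, 4]),
+    )
+    _global = build_global_metadata([_meta_rank0, _meta_rank1])
+    assert len(_global.shards_metadata) == 2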
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharder.py b/MLPY/Lib/site-packages/torch/distributed/_shard/sharder.py
new file mode 100644
index 0000000000000000000000000000000000000000..e732208b557377bdcae044f6b8176e6a275fd092
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/sharder.py
@@ -0,0 +1,27 @@
+import abc
+import torch.nn as nn
+
+class Sharder(abc.ABC):
+    """
+    This is an interface which allows users to create more advanced
+    sharding strategies that cannot easily be composed with a
+    `ShardingSpec`.
+
+    :class:`torch.distributed._shard.sharding_plan.ShardingPlan` can
+    take a `Sharder` object and call `shard` to shard the module,
+    then replace the original module with the returned sharded module.
+    """
+    @abc.abstractmethod
+    def shard(self, module: nn.Module) -> nn.Module:
+        """
+        Shard a module based on the implementation of this method, and
+        return the sharded version of the module.
+
+        Args:
+            module (:class:`torch.nn.Module`):
+                The module to apply sharding to.
+        Returns:
+            A :class:`torch.nn.Module` object that represents a module
+            that's already been sharded.
+        """
+        pass
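+
+
+# A minimal sketch of the Sharder contract: a hypothetical no-op sharder that
+# hands the module back unchanged, only to illustrate that `shard` consumes an
+# nn.Module and returns the nn.Module that should replace it. A real Sharder
+# would return a module whose parameters are ShardedTensors.
+class _IdentitySharder(Sharder):
+    def shard(self, module: nn.Module) -> nn.Module:
+        # No resharding is performed; the original module is returned as-is.
+        return module
+
+
+if __name__ == "__main__":
+    _linear = nn.Linear(4, 4)
+    assert _IdentitySharder().shard(_linear) is _linear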
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_plan/__init__.py b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_plan/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..263e9d538f9a7f4442b12c086ef4620a21d49edd
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_plan/__init__.py
@@ -0,0 +1,4 @@
+from .api import (
+    ShardingPlan,
+    ShardingPlanner
+)
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_plan/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_plan/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3d4b60a2af0f8be1461dc5d68a492863dc28f785
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_plan/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_plan/__pycache__/api.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_plan/__pycache__/api.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..93ce3151d3c243d80e6bd1f9898115922183c856
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_plan/__pycache__/api.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_plan/api.py b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_plan/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..91b04a51fb3f1d15a05dd96fd6dbb561b76a5a01
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_plan/api.py
@@ -0,0 +1,86 @@
+import abc
+import torch.nn as nn
+
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Union
+
+from torch.distributed._shard.sharder import Sharder
+from torch.distributed._shard.sharding_spec import ShardingSpec
+
+@dataclass
+class ShardingPlan:
+    """
+    Representation of a sharding plan, which describes how to shard a module
+    across hosts. `plan` is used to shard module parameters according to the spec provided.
+    `output_plan` and `return_local_tensor` are optional; they are used to specify the output
+    layout of a module with a spec and when to convert back to the data parallel fashion.
+
+    Args:
+        plan (Dict[str, Union[:class:`torch.distributed._shard.sharding_spec.ShardingSpec`,
+              :class:`torch.distributed._shard.sharder.Sharder`]]):
+            a dict that describes how to shard a module; there are currently two ways to shard a module:
+                1. directly shard a module parameter by a `ShardingSpec`, keyed by the name of
+                   a parameter to a `ShardingSpec`.
+                2. shard a submodule by applying a `Sharder` on it, keyed by the name of a module
+                   to a `Sharder` object.
+        output_plan (Dict[str, :class:`torch.distributed._shard.sharding_spec.ShardingSpec`], optional):
+            a dict that specifies the layout of a module's output which produces a ShardedTensor,
+            keyed by the name of the module to a ShardingSpec ("" as the key means the root module).
+            Default: `None`
+        return_local_tensor (List[str], optional): a list of strings; each element enables
+            a module's sharded output to be returned as a Tensor built from its local shards,
+            allowing further processing in a data parallel fashion ("" in the list means the
+            root module).
+            Default: None
+    Example:
+      Suppose we want to shard a module with two linear layers and then run it with DDP; we also
+      want to convert the output of the second linear layer back to DDP. We can do it as follows:
+
+        >>> # xdoctest: +REQUIRES(module:torch._C._distributed_c10d)
+        >>> class MyModule(nn.Module):
+        >>>     def __init__(self):
+        >>>        super().__init__()
+        >>>        self.fc1 = nn.Linear()
+        >>>        self.gelu = nn.GELU()
+        >>>        self.fc2 = nn.Linear()
+        >>>        self.relu = nn.ReLU()
+        >>>
+        >>>     def forward(self, input):
+        >>>         return self.relu(self.fc2(self.gelu(self.fc1(input))))
+
+
+        >>> # xdoctest: +SKIP("Undefined spec1, spec2")
+        >>> sharding_plan = ShardingPlan(
+        >>>    plan={
+        >>>        "fc1.weight": spec1,
+        >>>        "fc2.weight": spec2
+        >>>    },
+        >>>    output_plan={
+        >>>        "fc2": output_spec
+        >>>    },
+        >>>    return_local_tensor=["fc2"]
+        >>> )
+    """
+    plan: Dict[str, Union[ShardingSpec, Sharder]]
+    output_plan: Optional[Dict[str, ShardingSpec]] = None
+    return_local_tensor: Optional[List[str]] = None
+
+
+class ShardingPlanner(abc.ABC):
+    """
+    Default ShardingPlanner interface; it can be extended to
+    implement advanced sharding strategies.
+    """
+    @abc.abstractmethod
+    def build_plan(self, module: nn.Module) -> ShardingPlan:
+        """
+        Given an nn.Module, define how to shard the module across
+        ranks and return a ShardingPlan.
+        Args:
+            module (:class:`torch.nn.Module`):
+                The module to apply sharding to.
+        Returns:
+            A :class:`torch.distributed._shard.sharding_plan.ShardingPlan` object that
+            represents how to shard the module.
+        """
+        pass
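+
+
+# A minimal sketch of a ShardingPlanner: a hypothetical planner that chunks
+# every 2-D parameter on dim 0 across two CPU placements. The placements and
+# the "chunk everything" policy are illustrative assumptions, not a
+# recommended layout.
+class _ChunkEverythingPlanner(ShardingPlanner):
+    def build_plan(self, module: nn.Module) -> ShardingPlan:
+        # Imported lazily so the sketch stays self-contained at call time.
+        from torch.distributed._shard.sharding_spec import ChunkShardingSpec
+
+        spec = ChunkShardingSpec(dim=0, placements=["rank:0/cpu", "rank:1/cpu"])
+        return ShardingPlan(
+            plan={
+                name: spec
+                for name, param in module.named_parameters()
+                if param.dim() == 2
+            }
+        )
+
+
+if __name__ == "__main__":
+    _plan = _ChunkEverythingPlanner().build_plan(nn.Linear(4, 4))
+    assert set(_plan.plan) == {"weight"}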
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/__init__.py b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..79f7a90f5156144a940d79475f65a94bc0ee49f1
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/__init__.py
@@ -0,0 +1,12 @@
+from .api import (
+    DevicePlacementSpec,
+    EnumerableShardingSpec,
+    PlacementSpec,
+    ShardingSpec,
+    _infer_sharding_spec_from_shards_metadata,
+)
+from .chunk_sharding_spec import (
+    ChunkShardingSpec as ChunkShardingSpec,
+)
+
+from torch.distributed._shard.metadata import ShardMetadata
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b06e7528523dd3fd083f00ef18b368857eb057b1
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/__pycache__/_internals.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/__pycache__/_internals.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1285ef08f4eb974f152ade69c3f64301a767f725
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/__pycache__/_internals.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/__pycache__/api.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/__pycache__/api.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6f2a2f3d0d6d1f070bf6384796010634747bf229
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/__pycache__/api.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/__pycache__/chunk_sharding_spec.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/__pycache__/chunk_sharding_spec.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5d9ef84e97e415ce9410c9b68baa51eec1b1741b
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/__pycache__/chunk_sharding_spec.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/_internals.py b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/_internals.py
new file mode 100644
index 0000000000000000000000000000000000000000..62ae66fe5e0332a00e01c38cef297bc630db846b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/_internals.py
@@ -0,0 +1,209 @@
+from typing import List, Optional, Tuple
+
+from torch.distributed._shard.metadata import ShardMetadata
+
+
+def _check_shard_metadata_pair_overlap(shard1: ShardMetadata, shard2: ShardMetadata):
+    """
+    Checks if two shards overlap.
+    """
+
+    # For each dim of each shard, check if one shard resides on the other
+    # end of second shard with respect to that dim. As an example for a 2D
+    # shard, we would check if one shard is above or on the left of the
+    # other shard.
+    ndims = len(shard1.shard_offsets)
+    for i in range(ndims):
+        if shard1.shard_offsets[i] >= shard2.shard_offsets[i] + shard2.shard_sizes[i]:
+            return False
+        if shard2.shard_offsets[i] >= shard1.shard_offsets[i] + shard1.shard_sizes[i]:
+            return False
+
+    return True
+
+
+def _find_nd_overlapping_shards(
+    shards: List[ShardMetadata], sharded_dims: List[int]
+) -> Optional[Tuple[int, int]]:
+    # Each shard has len(sharded_dims) tuples. Each tuple represents the
+    # [begin, end] (inclusive) range of that shard in that dimension.
+    shard_intervals = [
+        [
+            (s.shard_offsets[dim], s.shard_offsets[dim] + s.shard_sizes[dim] - 1)
+            for dim in sharded_dims
+        ]
+        for s in shards
+    ]
+
+    for i in range(len(shards)):
+        shard_i = shard_intervals[i]
+        for j in range(i + 1, len(shards)):
+            shard_j = shard_intervals[j]
+            # For each dim of each shard, check if one shard resides on the other
+            # end of second shard with respect to that dim. As an example for a 2D
+            # shard, we would check if one shard is above or on the left of the
+            # other shard.
+            overlap = True
+            for interval_i, interval_j in zip(shard_i, shard_j):
+                if interval_i[0] > interval_j[1] or interval_j[0] > interval_i[1]:
+                    overlap = False
+                    break
+            if overlap:
+                return (i, j)
+    return None
+
+
+def _find_1d_overlapping_shards(
+    shards: List[ShardMetadata], dim: int
+) -> Optional[Tuple[int, int]]:
+    # (begin, end, index_in_shards). Begin and end are inclusive.
+    intervals = [
+        (s.shard_offsets[dim], s.shard_offsets[dim] + s.shard_sizes[dim] - 1, i)
+        for i, s in enumerate(shards)
+    ]
+    intervals.sort()
+    for i in range(len(shards) - 1):
+        if intervals[i][1] >= intervals[i + 1][0]:
+            return (intervals[i][2], intervals[i + 1][2])
+    return None
+
+
+def validate_non_overlapping_shards_metadata(shards: List[ShardMetadata]):
+    """
+    Ensures none of the shards overlap with each other.
+
+    Args:
+        shards(List[ShardMetadata]): List of :class:`ShardMetadata` objects representing
+            each shard.
+    Raises:
+        ``ValueError`` if there's overlap in any two shards.
+    """
+    if not shards or len(shards) == 1:
+        return
+
+    sharded_dims: List[int] = []
+    for dim in range(len(shards[0].shard_offsets)):
+        for i in range(1, len(shards)):
+            if (
+                shards[i].shard_offsets[dim] != shards[0].shard_offsets[dim] or
+                shards[i].shard_sizes[dim] != shards[0].shard_sizes[dim]
+            ):
+                sharded_dims.append(dim)
+                break
+
+    pair: Optional[Tuple[int, int]] = None
+    if len(sharded_dims) == 0:
+        # All shards are identical (no dim is partitioned), so any two overlap.
+        pair = (0, 1)
+    elif len(sharded_dims) == 1:
+        # Shards are partitioned over only one dimension. Overlap can be found
+        # using an O(n log n) interval-overlap check.
+        pair = _find_1d_overlapping_shards(shards, sharded_dims[0])
+    else:
+        # Shards are partitioned over more than one dimension. Fall back to a
+        # pair-wise check. Even though O(n log n) algorithms (line sweep) exist
+        # for 2D overlap, the implementation is not trivial and may not justify
+        # the time savings in most cases.
+        pair = _find_nd_overlapping_shards(shards, sharded_dims)
+
+    if pair:
+        raise ValueError(f'Shards {shards[pair[0]]} and {shards[pair[1]]} overlap')
+
+
+def check_tensor(shards_metadata, tensor_dims) -> None:
+    """
+    Checks if the shards_metadata is compatible with the provided tensor dims.
+
+    Args:
+        shards_metadata(List[ShardMetadata]): List of :class:`ShardMetadata`
+            objects representing each shard of the tensor.
+        tensor_dims(Sequence of int): Dimensions of tensor to verify
+    Raises:
+        ``ValueError`` if not compatible.
+    """
+
+    # If the tensor's volume matches the total volume of all shards and
+    # all shard boundaries are within tensor dims, we have a compatible
+    # sharding spec for this tensor. Note that we have already verified
+    # we don't have overlapping shards.
+    tensor_rank = len(tensor_dims)
+    shards_rank = len(shards_metadata[0].shard_offsets)
+    if tensor_rank != shards_rank:
+        raise ValueError(f'Rank of tensor is {tensor_rank}, but shards rank is {shards_rank}')
+
+    total_shard_volume = 0
+    for shard in shards_metadata:
+        shard_volume = 1
+        for i, shard_length in enumerate(shard.shard_sizes):
+            shard_volume *= shard_length
+            if shard.shard_offsets[i] + shard.shard_sizes[i] > tensor_dims[i]:
+                raise ValueError(
+                    f'Shard offset {shard.shard_offsets[i]} and length '
+                    f'{shard.shard_sizes[i]} exceeds tensor dim: {tensor_dims[i]} for shard {shard}')
+        total_shard_volume += shard_volume
+
+    tensor_volume = 1
+    for size in tensor_dims:
+        tensor_volume *= size
+
+    if total_shard_volume != tensor_volume:
+        # TODO: Can we improve this error message to point out the gaps?
+        raise ValueError(
+            f'Total volume of shards: {total_shard_volume} '
+            f'does not match tensor volume: {tensor_volume}, in other words '
+            f'all the individual shards do not cover the entire tensor')
+
+def get_split_size(dim_size, chunks):
+    """
+    Computes the split size in line with ``torch.chunk``.
+
+    Args:
+        dim_size(int): Size of the dimension being chunked.
+        chunks(int): Number of chunks to create for ``dim_size``.
+
+    Returns:
+        An int indicating the split size to use.
+    """
+    return (dim_size + chunks - 1) // chunks
+
+def get_chunked_dim_size(dim_size, split_size, idx):
+    """
+    Computes the dim size of the chunk at the provided ``idx`` given ``dim_size``
+    and ``split_size``.
+
+    Args:
+        dim_size(int): Size of the dimension being chunked.
+        split_size(int): The chunk size for each chunk of ``dim_size``.
+        idx(int): The index of chunk whose dim size is being requested.
+
+    Returns:
+        An int indicating the dim size of the chunk.
+    """
+    return max(min(dim_size, split_size * (idx + 1)) - split_size * idx, 0)
+
+def get_chunk_sharding_params(sharding_dim_size, world_size, spec, rank):
+    """
+    Generate the start position and chunk size for the given rank for
+    chunk sharding.
+
+    Args:
+        sharding_dim_size(int): The dimension length which we shard on.
+        world_size(int): number of ranks.
+        spec (:class:`torch.distributed._shard.sharding_spec.ChunkShardingSpec`):
+            sharding spec.
+        rank(int): rank of the current process.
+
+    Returns:
+        start_pos(int): start position of sharded tensor on the given rank.
+        chunk_size(int): chunk size of sharded tensor on the given rank.
+    """
+    split_size = get_split_size(sharding_dim_size, world_size)
+    current_offsets = 0
+    start_pos = current_offsets
+    for idx, placement in enumerate(spec.placements):
+        chunk_size = get_chunked_dim_size(sharding_dim_size, split_size, idx)
+        if rank == placement.rank():
+            start_pos = current_offsets
+            break
+        current_offsets += chunk_size
+    return start_pos, chunk_size  # type: ignore[possibly-undefined]
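The split helpers above mirror ``torch.chunk``: ``get_split_size`` is a ceiling division, and ``get_chunked_dim_size`` shrinks (or zeroes out) the trailing chunks once the dimension is exhausted. Below is a minimal standalone sketch of that arithmetic; the helpers are re-implemented locally rather than imported from the vendored module path, so this is only an illustration.

# Standalone sketch of the torch.chunk-style split arithmetic used above.
def get_split_size(dim_size, chunks):
    # Ceiling division: the size of every chunk except possibly the last.
    return (dim_size + chunks - 1) // chunks

def get_chunked_dim_size(dim_size, split_size, idx):
    # Size of chunk `idx`, clamped to 0 once the dimension is exhausted.
    return max(min(dim_size, split_size * (idx + 1)) - split_size * idx, 0)

if __name__ == "__main__":
    dim_size, chunks = 10, 4
    split = get_split_size(dim_size, chunks)                       # 3
    sizes = [get_chunked_dim_size(dim_size, split, i) for i in range(chunks)]
    print(split, sizes)                                            # 3 [3, 3, 3, 1]
    assert sum(sizes) == dim_size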
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/api.py b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..e97286f092bda3e5b50901cbda3f4fd2c363e249
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/api.py
@@ -0,0 +1,242 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+import functools
+from typing import Callable, Dict, List, TYPE_CHECKING
+
+import torch
+
+from ._internals import (
+    check_tensor,
+    get_chunked_dim_size,
+    get_split_size,
+    validate_non_overlapping_shards_metadata
+)
+from torch.distributed._shard.metadata import ShardMetadata
+
+import torch.distributed._shard.sharded_tensor.metadata as sharded_tensor_meta
+from torch.distributed._shard.op_registry_utils import _decorator_func
+
+if TYPE_CHECKING:
+    # Only import ShardedTensor during type checking; exclude it
+    # at run-time to avoid a circular dependency.
+    from torch.distributed._shard.sharded_tensor import ShardedTensor
+
+class PlacementSpec(ABC):  # noqa: B024
+    """
+    Base class representing the placement of an entity. Subclasses of this
+    class can be used to specify customized placements which might not be
+    covered by existing APIs.
+    """
+    pass
+
+
+@dataclass
+class DevicePlacementSpec(PlacementSpec):
+    """
+    Associates placement of an entity with a single device.
+
+    Args:
+        device(:class:`torch.distributed._remote_device`): The device to place the entity on.
+    """
+
+    device: torch.distributed._remote_device
+
+    def __post_init__(self):
+        if not isinstance(self.device, torch.distributed._remote_device):
+            self.device = torch.distributed._remote_device(self.device)
+
+class ShardingSpec(ABC):
+    """
+    Base class representing sharding specifications.
+    """
+    @abstractmethod
+    def build_metadata(self,
+                       tensor_sizes: torch.Size,
+                       tensor_properties: sharded_tensor_meta.TensorProperties,
+                       ) -> sharded_tensor_meta.ShardedTensorMetadata:
+        """
+        Given a global tensor size, define how to shard a tensor of this shape
+        across ranks and return a ShardedTensorMetadata.
+        Args:
+            tensor_sizes (:class:`torch.Size`):
+                The tensor shape to shard on, a `torch.Size` object that represents the
+                tensor shape to be sharded according to the ShardingSpec.
+            tensor_properties(:class:`torch.distributed._shard.sharded_tensor.TensorProperties`):
+                Tensor properties used to create a ShardedTensor.
+        Returns:
+            A :class:`ShardedTensorMetadata` object that encodes the information about
+            the layout of the ShardedTensor and its properties.
+        """
+
+    @abstractmethod
+    def shard(self, tensor: torch.Tensor, src_rank: int = 0, process_group=None) -> "ShardedTensor":
+        """
+        Given a global tensor on src_rank, shard this tensor
+        across ranks within the process group and return a ShardedTensor.
+        Args:
+            tensor (:class:`torch.Tensor`): The tensor to be sharded.
+        Keyword args:
+            src_rank (int, optional): The source rank which is used as the ground truth of
+                the data for the parameter that would be sharded and scattered
+                across the rest of the ranks.
+                Default: 0.
+            process_group (ProcessGroup, optional): The process group to work on. If None,
+                the default process group will be used.
+        Returns:
+            A :class:`ShardedTensor` sharded from the given tensor.
+        """
+
+# Ops customized for a particular ShardingSpec.
+_CUSTOM_SHARDING_SPEC_OPS: Dict[str, Dict[Callable, Callable]] = {}
+
+def _has_custom_op(sharding_spec, op):
+    """
+    Returns whether or not the ShardingSpec has a custom op implementation.
+    """
+    class_name = type(sharding_spec).__qualname__
+    return class_name in _CUSTOM_SHARDING_SPEC_OPS and op in _CUSTOM_SHARDING_SPEC_OPS[class_name]
+
+def _dispatch_custom_op(sharding_spec, op: Callable, types, args, kwargs, process_group):
+    """
+    Calls the custom op for this ShardingSpec if it exists.
+    """
+    class_name = type(sharding_spec).__qualname__
+    if not _has_custom_op(sharding_spec, op):
+        raise RuntimeError(f'Custom op: {op} not registered for {class_name}')
+    func = _CUSTOM_SHARDING_SPEC_OPS[class_name][op]
+    return func(types, args, kwargs, process_group)
+
+def custom_sharding_spec_op(sharding_spec_class, func):
+    """
+    Decorator to allow custom registration of ops.
+    Args:
+        sharding_spec_class(type): The ShardingSpec for which we need to add this custom op.
+        func(Callable): The op to override (ex: torch.bmm)
+    """
+    class_name = sharding_spec_class.__qualname__
+    if class_name not in _CUSTOM_SHARDING_SPEC_OPS:
+        _CUSTOM_SHARDING_SPEC_OPS[class_name] = {}
+    return functools.partial(
+        _decorator_func,
+        op=func,
+        op_table=_CUSTOM_SHARDING_SPEC_OPS[class_name]
+    )
+
+
+@dataclass
+class EnumerableShardingSpec(ShardingSpec):
+    """
+    This is a type of ShardingSpec that allows users to specify a generic
+    sharding scheme by enumerating exactly how each shard is laid out.
+
+    Args:
+        shards(List[ShardMetadata]): List of :class:`ShardMetadata` objects representing
+            each shard. Note that none of the shards should overlap.
+    """
+
+    shards: List[ShardMetadata]
+
+    def __post_init__(self):
+        if len(self.shards) == 0:
+            raise ValueError(f'Empty shard list provided: {self.shards}')
+
+        # Validate that each shard has the same rank (number of dimensions).
+        rank = -1
+        for shard in self.shards:
+            if rank != -1 and rank != len(shard.shard_offsets):
+                raise ValueError(f'Found inconsistent ranks for shards: {rank} and {len(shard.shard_offsets)}')
+            rank = len(shard.shard_offsets)
+
+        validate_non_overlapping_shards_metadata(self.shards)
+
+    def build_metadata(self,
+                       tensor_sizes: torch.Size,
+                       tensor_properties: sharded_tensor_meta.TensorProperties,
+                       ) -> sharded_tensor_meta.ShardedTensorMetadata:
+        # check if shards form a valid tensor
+        check_tensor(self.shards, tensor_sizes)
+        return sharded_tensor_meta.ShardedTensorMetadata(
+            self.shards,
+            tensor_sizes,
+            tensor_properties
+        )
+
+    def shard(self, tensor: torch.Tensor, src_rank: int = 0, process_group=None) -> "ShardedTensor":
+        # TODO: figure out a generic and efficient way to scatter the shards for EnumerableShardingSpec
+        raise NotImplementedError("EnumerableShardingSpec.shard not implemented yet!")
+
+
+def _infer_sharding_spec_from_shards_metadata(shards_metadata):
+    """
+    Infer the sharding spec from the metadata of each shard of a ShardedTensor.
+    If the tensor is sharded along only one dimension, we then verify whether it
+    matches a ChunkShardingSpec. To verify, we first get the total length along that
+    dimension and perform a chunk split with the given placements to see whether we
+    get the same chunk sizes as the given shards_metadata. If not, we assume it is enum sharded.
+
+    Args:
+        shards_metadata (List[ShardMetadata]): List of Metadata of local shards.
+
+    Returns:
+        A :class:`torch.distributed._shard.sharding_spec.ShardingSpec` object of sharding
+            spec for one sharded tensor.
+    """
+    placements = []
+    chunk_sharding_dim = None
+    chunk_offset_list = []
+    shard_size_list = []
+    shard_offset_list = []
+    # collect local shard metadatas from the global sharded_tensor_metadata
+    for shard_metadata in shards_metadata:  # type: ignore[attr-defined]
+        placements.append(shard_metadata.placement)
+        local_offsets = shard_metadata.shard_offsets
+        chunk_offset_list.append(sum(local_offsets))
+        shard_size_list.append(shard_metadata.shard_sizes)
+        shard_offset_list.append(shard_metadata.shard_offsets)
+        shard_dims = [idx for idx, e in enumerate(local_offsets) if e != 0]
+        # If the offset is [0, 0, ..., 0] (all zeros),
+        # we cannot decide how the tensor is sharded.
+        if len(shard_dims) == 0:
+            continue
+        # If the offset is [0, N, ..., 0, M, 0, ..., 0],
+        # we are sure it's sharded by more than one dimension.
+        if len(shard_dims) != 1:
+            chunk_sharding_dim = None
+            break
+        # If the offset is [0, 0, ..., 0, M, 0, ..., 0], i.e. it's sharded along just
+        # one dimension, we need to make sure all ranks share the same dimension.
+        if not chunk_sharding_dim:
+            chunk_sharding_dim = shard_dims[0]
+        elif chunk_sharding_dim != shard_dims[0]:
+            chunk_sharding_dim = None
+            break
+
+    if chunk_sharding_dim is not None:
+        # Ensure we infer the correct placement order from offsets
+        placements = [
+            x for _, x in sorted(zip(chunk_offset_list, placements), key=lambda e: e[0])
+        ]
+
+        from .chunk_sharding_spec import ChunkShardingSpec
+        chunk_spec = ChunkShardingSpec(
+            dim=chunk_sharding_dim,
+            placements=placements,
+        )
+
+        shard_sizes = sorted([x[chunk_sharding_dim] for x in shard_size_list])
+        shard_total_length = sum(shard_sizes)
+        shard_offsets = sorted([x[chunk_sharding_dim] for x in shard_offset_list])
+
+        chunks = len(placements)
+        split_size = get_split_size(shard_total_length, chunks)
+        chunk_shard_sizes = sorted(
+            [
+                get_chunked_dim_size(shard_total_length, split_size, idx)
+                for idx in range(chunks)
+            ]
+        )
+        # Should match ChunkShardingSpec offsets calculation
+        chunk_shard_offsets = [split_size * idx for idx in range(chunks)]
+        if shard_sizes == chunk_shard_sizes and shard_offsets == chunk_shard_offsets:
+            return chunk_spec
+    return EnumerableShardingSpec(shards_metadata)
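``_infer_sharding_spec_from_shards_metadata`` decides between ``ChunkShardingSpec`` and ``EnumerableShardingSpec`` mostly from the shard offsets: chunk sharding is only possible when every non-zero offset touches the same single dimension. Below is a simplified, self-contained sketch of that offset heuristic, using plain lists of offsets instead of ``ShardMetadata``; the function name is illustrative, not part of the module above.

# Simplified sketch: infer the single chunked dimension from shard offsets,
# or return None when the layout cannot be a ChunkShardingSpec.
from typing import List, Optional

def infer_chunk_dim(shard_offsets: List[List[int]]) -> Optional[int]:
    chunk_dim = None
    for offsets in shard_offsets:
        nonzero_dims = [d for d, off in enumerate(offsets) if off != 0]
        if not nonzero_dims:          # all-zero offsets carry no information
            continue
        if len(nonzero_dims) != 1:    # sharded along more than one dim
            return None
        if chunk_dim is None:
            chunk_dim = nonzero_dims[0]
        elif chunk_dim != nonzero_dims[0]:
            return None               # shards disagree on the sharded dim
    return chunk_dim

# Four shards of a (12, 8) tensor chunked along dim 0 -> inferred dim is 0.
print(infer_chunk_dim([[0, 0], [3, 0], [6, 0], [9, 0]]))   # 0
# Offsets touching two dims -> cannot be a ChunkShardingSpec.
print(infer_chunk_dim([[0, 0], [3, 4]]))                   # None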
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/chunk_sharding_spec.py b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/chunk_sharding_spec.py
new file mode 100644
index 0000000000000000000000000000000000000000..723908b94f494a595b0fb4209ed6db4a5073c85c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/chunk_sharding_spec.py
@@ -0,0 +1,202 @@
+from dataclasses import dataclass
+import torch
+import torch.distributed._shard.sharded_tensor.metadata as sharded_tensor_meta
+from torch.distributed._shard.metadata import ShardMetadata
+from torch.distributed._shard.sharded_tensor.shard import Shard
+from torch.distributed._shard.sharded_tensor.utils import (
+    _parse_and_validate_remote_device
+)
+from torch.distributed._shard._utils import narrow_tensor
+import torch.distributed as dist
+import torch.distributed.distributed_c10d as distributed_c10d
+from typing import List, Union, TYPE_CHECKING
+from ._internals import (
+    get_chunked_dim_size,
+    get_split_size,
+)
+
+from .api import ShardingSpec
+
+if TYPE_CHECKING:
+    # Only import ShardedTensor during type checking; exclude it
+    # at run-time to avoid a circular dependency.
+    from torch.distributed._shard.sharded_tensor import ShardedTensor
+
+@dataclass
+class ChunkShardingSpec(ShardingSpec):
+    """
+    This is a type of ShardingSpec that defines the placement as being sharded
+    across multiple devices. In particular, it represents sharding a Tensor
+    along a single dimension into equal chunks (similar to :meth:`torch.chunk`).
+
+    The semantics of how a tensor is partitioned are in line with
+    :meth:`torch.chunk`, where ``dim`` in torch.chunk corresponds to the
+    specified ``dim`` and ``chunks`` in torch.chunk is the number of elements
+    in the specified ``placements``.
+
+    Args:
+        dim (int or str):
+            The dimension to shard on, could be an integer representing the
+            dimension or a string in case of named tensors where dimensions are
+            named. Note that named tensor support is not added yet.
+        placements(List[Union[_remote_device, str]]):
+            Specifies the placement of each shard of the Tensor. The size of
+            the list represents the number of shards to be created. This can
+            be a list of
+            :class:`torch.distributed._remote_device`'s. The list
+            can also contain strings, which represent a remote
+            device as accepted by
+            :class:`torch.distributed._remote_device`
+    """
+
+    ShardingDim = Union[int, str]
+
+    dim: ShardingDim
+    placements: List[Union[torch.distributed._remote_device, str]]
+
+    def __post_init__(self):
+        self._verify_dim(self.dim)
+        for i, remote_device in enumerate(self.placements):
+            if not isinstance(remote_device, torch.distributed._remote_device):
+                self.placements[i] = torch.distributed._remote_device(remote_device)
+
+    @staticmethod
+    def _verify_dim(dim):
+        # Validate the sharding spec.
+        # TODO: support named dimension
+        if isinstance(dim, str):
+            raise NotImplementedError(
+                "ChunkShardingSpec does not support named dimension yet!"
+            )
+
+        if not isinstance(dim, int):
+            raise ValueError(
+                f"Sharding dim needs to be an integer, found: {dim}"
+            )
+
+    def build_metadata(self,
+                       tensor_sizes: torch.Size,
+                       tensor_properties: sharded_tensor_meta.TensorProperties,
+                       ) -> sharded_tensor_meta.ShardedTensorMetadata:
+        tensor_num_dim = len(tensor_sizes)
+
+        self._verify_dim(self.dim)
+        if self.dim >= tensor_num_dim or self.dim < -tensor_num_dim:  # type: ignore[operator]
+            raise ValueError(f"Invalid sharding dim: {self.dim}")
+
+        shards_metadata = []
+        sharding_dim_size = tensor_sizes[self.dim]  # type: ignore[index]
+        chunks = len(self.placements)
+        split_size = get_split_size(sharding_dim_size, chunks)
+        for idx, placement in enumerate(self.placements):
+            # generate ShardMetadata for each placement device
+            chunked_dim_size = get_chunked_dim_size(sharding_dim_size, split_size, idx)
+            shard_size = list(tensor_sizes)
+            current_offsets = [0] * tensor_num_dim
+            current_offsets[self.dim] = split_size * idx  # type: ignore[index]
+            shard_size[self.dim] = chunked_dim_size  # type: ignore[index]
+
+            shard_metadata = ShardMetadata(
+                shard_offsets=current_offsets,
+                shard_sizes=shard_size,
+                placement=placement,
+            )
+            shards_metadata.append(shard_metadata)
+
+        return sharded_tensor_meta.ShardedTensorMetadata(
+            shards_metadata,
+            tensor_sizes,
+            tensor_properties
+        )
+
+
+    def shard(self, tensor: torch.Tensor, src_rank: int = 0, process_group=None) -> "ShardedTensor":
+        """
+        Args:
+            src_rank: group rank relative to ``process_group``
+
+            N.B. If ``process_group`` is None, ``src_rank`` is a global rank.
+        """
+        # relative imports to avoid circular dependency
+        from torch.distributed._shard.sharded_tensor import (
+            ShardedTensor
+        )
+        tensor_properties = sharded_tensor_meta.TensorProperties(
+            dtype=tensor.dtype,
+            layout=tensor.layout,
+            requires_grad=tensor.requires_grad,
+            memory_format=torch.contiguous_format,
+            pin_memory=tensor.is_pinned()
+        )
+        current_rank = dist.get_rank(process_group)
+        tensor_meta = self.build_metadata(tensor.size(), tensor_properties)
+        local_shards = []
+        local_tensor = None
+        local_metadata = None
+        tensors_to_scatter = [None] * dist.get_world_size(process_group)
+
+        sharding_dim_size = tensor.size()[self.dim]  # type: ignore[index]
+        chunks = len(self.placements)
+        split_size = get_split_size(sharding_dim_size, chunks)
+        scatter_shape = list(tensor.size())
+        scatter_shape[self.dim] = split_size  # type: ignore[index]
+
+        for shard_meta in tensor_meta.shards_metadata:
+            rank, device = _parse_and_validate_remote_device(process_group, shard_meta.placement)
+            if current_rank == src_rank:
+                # Narrow to get the shard for this rank. We don't want autograd
+                # recording here for the narrow op, and 'local_shard' should be a
+                # leaf variable in the autograd graph.
+                narrowed_tensor = narrow_tensor(tensor, shard_meta)
+                if shard_meta.shard_sizes[self.dim] < split_size:  # type: ignore[index]
+                    # For the last shard, which might be smaller than the other shards,
+                    # resize the narrowed tensor to the same size and use it for
+                    # the scatter collective, as dist.scatter requires same-size
+                    # inputs on every rank
+                    tensor_to_scatter = narrowed_tensor.detach().clone().resize_(scatter_shape)
+                else:
+                    tensor_to_scatter = narrowed_tensor.detach().clone().contiguous()
+
+                tensors_to_scatter[rank] = tensor_to_scatter
+
+            if current_rank == rank:
+                local_tensor = torch.empty(
+                    scatter_shape, dtype=tensor.dtype, layout=tensor.layout, device=device)
+                local_metadata = shard_meta
+
+        # Each rank should have local_tensor and local_metadata initialized if we built
+        # the metadata list correctly.
+        assert local_tensor is not None
+        assert local_metadata is not None
+
+        # Scatter the shards to all ranks in the pg
+        # scatter takes the global rank as ``src``
+        src_for_scatter = src_rank
+        if process_group is not None and process_group is not distributed_c10d._get_default_group():
+            src_for_scatter = distributed_c10d.get_global_rank(process_group, src_for_scatter)
+
+        dist.scatter(
+            local_tensor,
+            scatter_list=tensors_to_scatter if current_rank == src_rank else None,
+            src=src_for_scatter,
+            group=process_group
+        )
+
+        if list(local_tensor.size()) != local_metadata.shard_sizes:
+            # detach again after receiving to ensure local shards remain a leaf node
+            local_tensor = local_tensor.resize_(local_metadata.shard_sizes).detach()
+
+        # Sync requires_grad to local_shard.
+        local_tensor.requires_grad = tensor.requires_grad
+
+        local_shards.append(Shard(tensor=local_tensor, metadata=local_metadata))
+
+        st = ShardedTensor._init_from_local_shards_and_global_metadata(
+            local_shards,
+            tensor_meta,
+            process_group=process_group)
+
+        # Manually set sharding_spec
+        st._sharding_spec = self
+
+        return st
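``ChunkShardingSpec.build_metadata`` places shard ``idx`` at offset ``split_size * idx`` along the sharding dim and sizes it with ``get_chunked_dim_size``. Below is a small standalone sketch of the resulting layout, assuming placements are given in rank order; the helper name is illustrative only.

# Sketch of the shard layout produced by ChunkShardingSpec.build_metadata:
# offsets advance by split_size along the sharding dim, and the last
# shard(s) shrink (or become empty) once the dim is exhausted.
def chunk_layout(tensor_sizes, dim, num_placements):
    split_size = (tensor_sizes[dim] + num_placements - 1) // num_placements
    layout = []
    for idx in range(num_placements):
        offsets = [0] * len(tensor_sizes)
        offsets[dim] = split_size * idx
        size = list(tensor_sizes)
        size[dim] = max(min(tensor_sizes[dim], split_size * (idx + 1)) - split_size * idx, 0)
        layout.append((offsets, size))
    return layout

# A (13, 4) tensor sharded on dim 0 across 4 placements:
for offsets, size in chunk_layout([13, 4], dim=0, num_placements=4):
    print(offsets, size)
# [0, 0] [4, 4]
# [4, 0] [4, 4]
# [8, 0] [4, 4]
# [12, 0] [1, 4]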
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/__init__.py b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3c2ff6960456ccaf873b1b21d46a740c37a69788
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/__pycache__/_common.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/__pycache__/_common.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b725af33c32ef205fee58422a021e9c37f7e3ecc
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/__pycache__/_common.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/__pycache__/embedding.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/__pycache__/embedding.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5a24d3e2d9719147b563ba12a798317a1fe4bb14
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/__pycache__/embedding.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/__pycache__/embedding_bag.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/__pycache__/embedding_bag.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4858d14bed6274985f07b8bc8b667dc929cfec53
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/__pycache__/embedding_bag.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/_common.py b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/_common.py
new file mode 100644
index 0000000000000000000000000000000000000000..083bd959a9dc9ddf3b7c7dc12e384194f98dced0
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/_common.py
@@ -0,0 +1,349 @@
+
+import torch
+import torch.distributed as dist
+from torch.distributed._shard.sharded_tensor import ShardedTensor
+from torch.distributed._shard.sharded_tensor._ops._common import _sharded_op_common
+from torch.distributed._shard.sharding_spec import ChunkShardingSpec
+from torch.distributed._shard.sharding_spec._internals import (
+    get_chunk_sharding_params,
+    get_chunked_dim_size,
+    get_split_size,
+)
+from torch.distributed._shard.sharding_spec.api import custom_sharding_spec_op
+from torch.distributed.nn.functional import (
+    _all_gather_base,
+    all_reduce,
+    all_to_all_single,
+)
+
+
+def _chunk_sharding_spec_check(spec, op):
+    """
+    For the given op implementation check if the sharding spec is ChunkShardingSpec.
+    """
+    if not isinstance(spec, ChunkShardingSpec):
+        raise NotImplementedError(
+            f"Only ChunkShardingSpec supported for '{op.__name__}'."
+        )
+
+
+def _register_sharded_op_on_local_tensor(
+    op, early_stop_func=None, extra_check=None, customized_func=None
+):
+    """
+    Handles ``__torch_function__`` dispatch for ops which are performed on
+    the single local tensor of the sharded tensor, such as
+    ``torch.nn.functional.softmax`` or ``torch.Tensor.view``.
+
+    For more complicated ops, a customized func can be used to generate
+    the new local tensor, sharding spec and sharded tensor size.
+
+    Args:
+        op: The op to be registered and applied to all shards of the st.
+        early_stop_func (Callable, optional): the func for early stop.
+            Default: if ``None``, no early stop.
+        extra_check (Callable, optional): the func for extra condition check.
+            Default: if ``None``, no extra check.
+        customized_func (Callable, optional): the func for customized logic
+            to generate the new local tensor, sharding spec and sharded tensor size.
+            Default: if ``None``, we simply lower to the real op call with
+                the single local tensor of the st.
+
+    Return:
+        func (Callable): registered implementation for sharded op for
+        ``__torch_function__`` dispatch.
+    """
+
+    @custom_sharding_spec_op(ChunkShardingSpec, op)
+    @_sharded_op_common(op, early_stop_func, extra_check)
+    def sharded_tensor_op_on_local_tensor(types, args=(), kwargs=None, pg=None):
+        st = args[0]
+        sharding_spec = st.sharding_spec()
+        if len(st.local_shards()) != 1:
+            raise TypeError(
+                f"torch function '{op.__name__}', with args: {args} and "
+                f"kwargs: {kwargs} only supported for single local tensor!"
+            )
+        st_size = st.size()
+        if customized_func:
+            local_tensor, sharding_spec, st_size = customized_func(args, kwargs, pg)
+        else:
+            args = (st.local_tensor(), *args[1:])
+            local_tensor = op(*args, **kwargs)
+        return ShardedTensor._init_from_local_tensor(
+            local_tensor.contiguous(),
+            sharding_spec,
+            st_size,  # type: ignore[arg-type]
+            process_group=pg,
+            init_rrefs=st._init_rrefs,
+        )
+
+
+def _handle_col_wise_sharding_base(
+    op_func,
+    col_dim,
+    input,
+    world_size,
+    weight,
+    local_shard,
+    pg,
+    gathered_inputs,
+    mode=None,
+    gathered_per_sample_weights=None,
+    gathered_offsets=None,
+    padding_idx=None,
+):
+    """
+    For col-wise sharding of weight, much of the logic is common,
+    so we extract it into this function:
+    Step 1. Get the input from each rank.
+    Step 2. Perform the op on the concatenated tensor.
+    Step 3. Distribute results to each rank with column rearrangement.
+    Step 4. Concatenate all results from all ranks.
+
+    Args:
+        op_func: operator which is applied to the input tensor.
+        col_dim: dim of result tensor after the operation.
+        input: tensor to be applied op on.
+        world_size: number of ranks.
+        weight: sharded weight tensor.
+        local_shard: col-wise sharded weight tensor.
+        pg: process group.
+        gathered_inputs: list of inputs from all ranks. If specified, we
+            don't need to communicate with each rank any more.
+        mode: aggregation mode of EmbeddingBag.
+        gathered_per_sample_weights: per_sample_weights across all ranks.
+        gathered_offsets: offsets across all ranks.
+        padding_idx: If specified, the entries at padding_idx do
+            not contribute to the gradient; therefore, the embedding
+            vector at padding_idx is not updated during training,
+            i.e. it remains as a fixed “pad”.
+            Note that the embedding vector at padding_idx is
+            excluded from the reduction.
+
+    Return: final result of applying the op to the input.
+    """
+    # run the operator's function for all the inputs.
+    results = []
+    for i, inp in enumerate(gathered_inputs):
+        if op_func == torch.nn.functional.embedding_bag:
+            result = op_func(
+                inp,
+                local_shard,
+                offsets=gathered_offsets[i] if gathered_offsets is not None else None,
+                mode=mode,
+                per_sample_weights=gathered_per_sample_weights[i]
+                if gathered_per_sample_weights is not None
+                else None,
+                padding_idx=padding_idx,
+            )
+        elif op_func == torch.nn.functional.embedding:
+            result = op_func(
+                inp,
+                local_shard,
+                padding_idx=padding_idx,
+            )
+        else:
+            result = op_func(inp, local_shard)
+        results.append(torch.transpose(result, 0, col_dim))
+
+    # Distribute results to each rank with col rearrangement.
+    output = _result_distribute_with_col_rearrange(
+        results, input, world_size, weight, pg
+    )
+
+    # transpose the output and return result.
+    return torch.transpose(output, 0, col_dim)
+
+
+def _result_distribute_with_col_rearrange(results, input, world_size, weight, pg):
+    """
+    For col-wise sharding of weight, we need to distribute
+    results to each rank. We do them in this function.
+    Note that, if the index in the Sharding Spec is not equal to
+    the rank number, we need to do the rearrangement based on the
+    order given by the Sharding Spec (placement).
+
+    Args:
+        results: results from ops applied to inputs from all ranks.
+            We need to distribute them back to their original ranks.
+        input: tensor to be applied op to.
+        world_size: number of ranks.
+        weight: sharded weight tensor.
+        pg: process group.
+
+    Return: column rearranged result.
+    """
+    # Process results and outputs for all2all.
+    sharding_dim = weight._sharding_spec.dim
+    sharding_dim_size = weight.size(sharding_dim)
+    dims = list(results[0].size())
+    dims[0] = sharding_dim_size
+    combined_results = torch.cat(results)
+    output = torch.empty(
+        *dims, device=combined_results.device, dtype=combined_results.dtype
+    )
+
+    # Compute output splits
+    split_size = get_split_size(sharding_dim_size, world_size)
+    output_split_sizes = [0] * world_size
+    for idx, placement in enumerate(weight._sharding_spec.placements):
+        output_split_sizes[placement.rank()] = get_chunked_dim_size(
+            sharding_dim_size, split_size, idx
+        )
+
+    # distribute the outputs using all2all.
+    output = all_to_all_single(
+        output, combined_results, output_split_sizes=output_split_sizes, group=pg
+    )
+
+    # Check if we need to rearrange columns appropriately for output.
+    rearrange_columns = any(
+        idx != placement.rank()
+        for idx, placement in enumerate(weight._sharding_spec.placements)
+    )
+    if not rearrange_columns:
+        return output
+
+    indices = []
+    for placement in weight._sharding_spec.placements:
+        dim_size = output_split_sizes[placement.rank()]
+        start = sum(
+            [
+                split_size if i < placement.rank() else 0
+                for i, split_size in enumerate(output_split_sizes)
+            ]
+        )
+        indices += list(range(start, start + dim_size))
+
+    return output.index_select(0, torch.tensor(indices, device=output.device))
+
+
+def _handle_max_norm_col_wise(
+    max_norm,
+    norm_type,
+    local_shard,
+    input,
+    world_size,
+    gathered_inputs,
+    pg,
+):
+    """
+    For col-wise sharding of weight, we need to aggregate the
+    norm across all ranks before we can perform the proper re-norm.
+    Note that the max_norm logic is applied only to the embedding
+    indices that are looked up, not to the whole shard.
+
+    Args:
+        max_norm: If given, each embedding vector with norm larger
+            than max_norm is renormalized to have norm max_norm.
+            Note: this will modify weight in-place.
+        norm_type: The p in the p-norm to compute for the max_norm option.
+        local_shard: col-wise shared local weight used for lookup.
+        input: tensor to be applied op to.
+        world_size: number of ranks.
+        gathered_inputs: list of inputs from all ranks.
+        pg: process group.
+
+    Return:
+        local_shard_norm_renormed: local_shard re-normed to max_norm if the norm is larger
+            than it.
+
+    """
+    norm_type = norm_type if norm_type is not None else 2.0
+    unique_inp = torch.unique(torch.cat(gathered_inputs))
+    local_shard_sum = torch.sum(
+        torch.pow(torch.abs(local_shard), norm_type), dim=1, dtype=local_shard.dtype
+    )
+    # For col-wise sharding, we need to first aggregate the powered sum
+    # from each rank first and then calculate the norm.
+    local_shard_sum = all_reduce(local_shard_sum, group=pg)
+    local_shard_norm = torch.pow(local_shard_sum, 1.0 / norm_type)
+    max_norm_tensor = torch.full(
+        (local_shard.size(0),),
+        float("inf"),
+        dtype=local_shard.dtype,
+        device=input.device,
+    )
+    max_norm_tensor[unique_inp] = max_norm
+    local_shard_t = local_shard.t().contiguous()
+    normalized_tensor = torch.where(
+        local_shard_norm > max_norm_tensor, max_norm_tensor, local_shard_norm
+    )
+    # Make sure divisor is not zero.
+    local_shard_norm[local_shard_norm == 0.0] = 1.0
+    local_shard_norm_renormed = (
+        torch.div(torch.mul(local_shard_t, normalized_tensor), local_shard_norm)
+        .t()
+        .contiguous()
+    )
+    return local_shard_norm_renormed
+
+
+def _all_gather_base_input(input, pg):
+    """
+    Use _all_gather_base to get a concatenated input from each rank.
+
+    Args:
+        input: tensor to be applied op on.
+        pg: process group.
+
+    Returns:
+        gathered_inputs: input gathered from each rank and concat by dim 0.
+    """
+    # allgather the inputs first.
+    gather_inp_size = list(input.size())
+    gather_inp_size[0] = input.size(0) * dist.get_world_size(pg)
+    gather_inp = torch.empty(gather_inp_size, device=input.device, dtype=input.dtype)
+    return _all_gather_base(gather_inp, input, group=pg)
+
+
+def _handle_row_wise_mask(gather_inp, padding_idx, weight, world_size, rank):
+    """
+    Mask the input for embedding look-up for IDs which are not stored
+    on the current rank. This function also adjusts the ``padding_idx``
+    so that it is only used on the rank where the corresponding row is
+    stored.
+
+    Note that, with the ``max_norm`` flag on, only the weights of rows being
+    looked up will be re-normed. So we need an extra row for masked IDs
+    so that they do not affect the final result or ``max_norm``.
+
+    Args:
+        gather_inp: tensor to be applied op on gathered from all ranks.
+        padding_idx: If specified, the entries at padding_idx do
+            not contribute to the gradient; therefore, the embedding
+            vector at padding_idx is not updated during training,
+            i.e. it remains as a fixed “pad”.
+            Note that the embedding vector at padding_idx is
+            excluded from the reduction.
+        weight: weight tensor of Embedding look-up table.
+        world_size: number of ranks.
+        rank: rank of the current process.
+
+    Returns:
+        lookup_input: Tensor of masked input.
+        padding_idx: adjusted padding_idx.
+        padding_row: The extra row we used during lookup so that
+            looking up does not affect ``max_norm``.
+    """
+    (start_pos, chunk_size) = get_chunk_sharding_params(
+        weight.size(0), world_size, weight._sharding_spec, rank
+    )
+    mask = (gather_inp < start_pos) | (gather_inp >= start_pos + chunk_size)
+    lookup_input = gather_inp.clone() - start_pos
+    lookup_input[mask] = chunk_size
+    if (
+        padding_idx is not None
+        and padding_idx >= start_pos
+        and padding_idx < (start_pos + chunk_size)
+    ):
+        padding_idx = padding_idx - start_pos
+    else:
+        padding_idx = None
+
+    # When max_norm is set, it will only re-norm the row being looked up.
+    padding_row = torch.zeros(
+        1, weight.size(1), device=gather_inp.device, dtype=weight.dtype
+    )
+    return lookup_input, padding_idx, padding_row
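The core of ``_handle_row_wise_mask`` is remapping the gathered IDs into the local shard's index space: IDs owned by the rank become ``id - start_pos``, and everything else is pointed at the extra padding row at index ``chunk_size``. Below is a hedged, single-tensor sketch of that remapping; the function name is illustrative and not part of the module above.

# Sketch of the row-wise masking idea: out-of-range IDs are remapped to an
# extra index (chunk_size) that later points at an appended all-zero row.
import torch

def mask_ids_for_rank(ids: torch.Tensor, start_pos: int, chunk_size: int) -> torch.Tensor:
    mask = (ids < start_pos) | (ids >= start_pos + chunk_size)
    local_ids = ids - start_pos
    local_ids[mask] = chunk_size
    return local_ids

ids = torch.tensor([6, 5, 2, 9, 0, 3])
# Rank 1 of a 10-row table split across 4 ranks owns rows 3..5 (chunk_size 3).
print(mask_ids_for_rank(ids, start_pos=3, chunk_size=3))
# tensor([3, 2, 3, 3, 3, 0])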
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/embedding.py b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b2bca833975d425b4189398642ab2bf75a33b93
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/embedding.py
@@ -0,0 +1,293 @@
+
+import torch
+import torch.distributed as dist
+from torch.distributed._shard.sharded_tensor import ShardedTensor
+from torch.distributed._shard.sharding_spec import ChunkShardingSpec
+from torch.distributed._shard.sharding_spec.api import custom_sharding_spec_op
+from torch.distributed.nn.functional import all_gather, reduce_scatter
+
+from ._common import (
+    _all_gather_base_input,
+    _handle_col_wise_sharding_base,
+    _handle_max_norm_col_wise,
+    _handle_row_wise_mask,
+)
+
+
+@custom_sharding_spec_op(ChunkShardingSpec, torch.nn.functional.embedding)
+def sharded_embedding(types, args, kwargs, pg):
+    """
+    Handles ``__torch_function__`` dispatch for ``torch.nn.functional.embedding``.
+    This method computes a sharded embedding lookup and has the following limitations:
+
+    1. Supports only sharding of ``weight``.
+    2. Supports only ``ChunkShardingSpec``.
+    3. Supports only a single local shard per rank.
+    4. Supports all other kwargs except for scale_grad_by_freq, sparse, etc.
+
+    Based on the dimension that the weight is sharded on, there are two
+    algorithms:
+
+    ROWWISE SHARDING
+    ================
+    For row-wise sharding the weight is sharded on dimension 0.
+
+    The overall algorithm can be best explained with an example. Let's assume
+    the dims of the input are (4 x 6) and W is (10 x 17), and W is sharded across
+    4 GPUs creating 3 shards of (3 x 17) and 1 shard of (1 x 17).
+    The algorithm is as follows:
+
+    1. First the input is all-gathered to all ranks, since this is SPMD and
+       the input is actually sharded across all ranks. The inputs then become
+       four (4 x 6) tensors on each rank. For example if the given input is
+       tensor([[6, 5, 2, 9, 6, 3],
+               [3, 1, 2, 4, 7, 6],
+               [4, 0, 4, 9, 8, 9],
+               [8, 6, 6, 4, 6, 1]])
+       on rank 0.
+       Then on every rank, we will have this tensor.
+       If input itself is already replicated, no all-gather will be done.
+    2. Next, we mask the IDs which are not stored on that rank.
+       For example on rank 0, we store IDs [0, 1, 2]. We only keep the IDs
+       inside that set of numbers. The rest of them will be masked to an extra row.
+       The masked matrix will be used for the embedding look-up and looks like:
+       tensor([[4, 4, 2, 4, 4, 4],
+               [4, 1, 2, 4, 4, 4],
+               [4, 0, 4, 4, 4, 4],
+               [4, 4, 4, 4, 4, 1]])
+       The reason for having an extra row (number 4 in the example) is that
+       when max_norm is specified only weights which have been looked up will
+       be re-normed, so masking IDs whose embeddings are not stored on the
+       current rank to an extra row ensures max_norm still works as expected.
+    3. If max_norm is specified, the extra row guarantees that the masked IDs will
+       not affect the behavior of the weight re-norm.
+
+    COLWISE SHARDING
+    ================
+    For col-wise sharding the weight is sharded on dimension 1.
+
+    The overall algorithm can be best explained with an example. Let's assume
+    the dims for input are (4 x 6) and W are (16 x 17) and W is sharded across
+    4 GPUs creating 3 shards of (16 x 5) and 1 shard of (16 x 2).
+    The algorithm is as follows:
+
+    1. First the input is broadcasted to all ranks, since this is SPMD we
+       actually do an all_gather for all the inputs resulting in 4 (4 x 6)
+       inputs on each rank.
+    2. Next we perform a local embedding lookup by applying each
+       input (4 x 6) to the local shard (16 x 5) ((16 x 2) for the last).
+       This results in 4 (5 x 6 x 4) ((2 x 6 x 4) for the last) matrices
+       on each rank. We transpose dim 0 and dim 2.
+    3. Next, we concat these 4 matrices and perform an all2all to share the
+       appropriate (5 x 6 x 4) or (2 x 6 x 4) matrices to each rank.
+    4. Now, each rank receives a (17 x 6 x 4) matrix which is basically the
+       size of the result we need.
+    5. If placements are not in order, the appropriate rearrangement of columns
+       is done for the (17 x 6 x 4) matrix, and finally we transpose
+       dim 0 and dim 2 again.
+    6. If max_norm is specified, we manually sum up the norm and renorm. Because
+       the renorm must be in place, we need to override the local_shard to mimic
+       this behavior.
+    """
+    # Validate input params
+    _validate_embedding_param(args, kwargs)
+
+    input = args[0]
+    weight = args[1]
+    max_norm = kwargs.get("max_norm")
+    norm_type = kwargs.get("norm_type")
+    padding_idx = kwargs.get("padding_idx")
+
+    local_shard = weight.local_tensor().contiguous()
+    sharding_dim = weight._sharding_spec.dim
+    world_size = dist.get_world_size(pg)
+    rank = dist.get_rank(pg)
+
+    if sharding_dim == 1:
+        output, local_shard = _handle_col_wise_sharding(
+            input, world_size, weight, local_shard, max_norm, norm_type, padding_idx, pg
+        )
+        weight.local_shards()[0].tensor = local_shard
+        return output
+    elif sharding_dim == 0:
+        return _handle_row_wise_sharding(
+            input,
+            world_size,
+            weight,
+            local_shard,
+            max_norm,
+            norm_type,
+            padding_idx,
+            rank,
+            pg,
+        )
+    else:
+        raise RuntimeError(
+            f"nn.Embedding weight sharded on dim {sharding_dim} not supported!"
+        )
+
+
+def _validate_embedding_param(args, kwargs):
+    """
+    Validate input params of sharded embedding op.
+
+    Args:
+        input: list of ID used for lookup.
+        weight: sharded weight tensor.
+        kwargs: same as normal Embedding.
+
+    Return: None.
+    """
+
+    input = args[0]
+    weight = args[1]
+    max_norm = kwargs.get("max_norm")
+    scale_grad_by_freq = kwargs.get("scale_grad_by_freq")
+    sparse = kwargs.get("sparse")
+
+    # Validate types
+    if not isinstance(input, torch.Tensor):
+        raise TypeError("input need to be torch.Tensor")
+    if not isinstance(weight, ShardedTensor):
+        raise TypeError("weight needs to be ShardedTensor")
+    weight_size = weight.size()
+    if len(weight_size) != 2:
+        raise ValueError("Weight needs to have exactly 2 dims")
+    if int(torch.min(input).item()) < 0:
+        raise ValueError(
+            "Index out of range in Input %d %d",
+            int(torch.min(input).item()),
+            weight_size[1],
+        )
+    if int(torch.max(input).item()) >= weight_size[0]:
+        raise ValueError(
+            "Index out of range in Input %d %d",
+            int(torch.max(input).item()),
+            weight_size[1],
+        )
+    if scale_grad_by_freq:
+        raise RuntimeError(
+            'nn.Embedding weight sharded with flag on "scale_grad_by_freq" not supported!'
+        )
+    if sparse:
+        raise RuntimeError(
+            'nn.Embedding weight sharded with flag on "sparse" not supported!'
+        )
+    if max_norm and max_norm <= 0.0:
+        raise ValueError('"max_norm" must be larger than zero!')
+
+    if not isinstance(weight._sharding_spec, ChunkShardingSpec):
+        raise ValueError("Only ChunkShardingSpec supported for ShardedTensor ops!")
+    if len(weight.local_shards()) != 1:
+        raise ValueError("Only one local shard supported!")
+
+
+def _handle_col_wise_sharding(
+    input, world_size, weight, local_shard, max_norm, norm_type, padding_idx, pg
+):
+    """
+    Entry-point function to handle the logic of col-wise sharding of weight
+    for embedding. (Detailed explanations of the logic can be found in
+    the comment for sharded_embedding.)
+
+    Args:
+        input: list of ID used for lookup and aggregation.
+        world_size: number of ranks.
+        weight: sharded weight tensor.
+        local_shard: col-wise shared local weight used for lookup.
+        max_norm: If given, each embedding vector with norm larger
+            than max_norm is renormalized to have norm max_norm.
+            Note: this will modify weight in-place.
+        norm_type: The p in the p-norm to compute for the max_norm option.
+        padding_idx: If specified, the entries at padding_idx do
+            not contribute to the gradient; therefore, the embedding
+            vector at padding_idx is not updated during training,
+            i.e. it remains as a fixed “pad”.
+        pg: process group.
+
+    Returns: final result of lookup.
+    """
+    # allgather the inputs first for non Replicated Tensor.
+    gathered_inputs = all_gather(input, group=pg)
+
+    if max_norm is not None:
+        # max_norm changes the weight in-place
+        local_shard = _handle_max_norm_col_wise(
+            max_norm, norm_type, local_shard, input, world_size, gathered_inputs, pg
+        )
+
+    output = _handle_col_wise_sharding_base(
+        torch.nn.functional.embedding,
+        len(input.size()),
+        input,
+        world_size,
+        weight,
+        local_shard,
+        pg,
+        gathered_inputs,
+        padding_idx=padding_idx,
+    )
+    return (output, local_shard)
+
+
+def _handle_row_wise_sharding(
+    input, world_size, weight, local_shard, max_norm, norm_type, padding_idx, rank, pg
+):
+    """
+    Entry-point function to handle the logic of row-wise sharding of weight
+    for embedding. (Detailed explanations of the logic can be found in
+    the comment for sharded_embedding.)
+
+    Args:
+        input: list of ID used for lookup and aggregation.
+        world_size: number of ranks.
+        weight: sharded weight tensor.
+        local_shard: row-wise shared local weight used for lookup.
+        max_norm: If given, each embedding vector with norm larger
+            than max_norm is renormalized to have norm max_norm.
+            Note: this will modify weight in-place.
+        norm_type: The p in the p-norm to compute for the max_norm option.
+        padding_idx: If specified, the entries at padding_idx do
+            not contribute to the gradient; therefore, the embedding
+            vector at padding_idx is not updated during training,
+            i.e. it remains as a fixed “pad”.
+        rank: rank of the current process.
+        pg: process group.
+
+    Returns: final result of lookup.
+    """
+    # allgather the inputs first for non Replicated Tensor.
+    gather_inp = _all_gather_base_input(input, pg)
+
+    # Mask the input according to sharding spec.
+    lookup_input, padding_idx, padding_row = _handle_row_wise_mask(
+        gather_inp, padding_idx, weight, world_size, rank
+    )
+
+    # When the input is a large tensor, the value of weight is changed.
+    # This is a workaround for now. GH issue: #81717
+    if max_norm is not None:
+        torch.nn.functional.embedding(
+            torch.unique(lookup_input)[:-1],
+            local_shard,
+            padding_idx=padding_idx,
+            max_norm=max_norm,
+            norm_type=norm_type,
+        )
+        max_norm = None
+
+    local_input_embeddings = torch.nn.functional.embedding(
+        lookup_input,
+        torch.cat([local_shard, padding_row]),
+        padding_idx=padding_idx,
+        max_norm=max_norm,
+        norm_type=norm_type,
+    )
+
+    # TODO: Make the result a PartialTensor.
+    local_shards = local_input_embeddings.chunk(pg.size())
+    return reduce_scatter(
+        torch.empty_like(local_shards[0]),
+        list(local_shards),
+        group=pg,
+    )
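Below is a single-process sketch of the row-wise path in ``sharded_embedding``: each simulated rank looks up only the rows it owns (masked IDs hit an appended all-zero row), and summing the partial results reproduces the unsharded lookup. This is a simplified illustration under local assumptions: it ignores process groups, ``reduce_scatter``, ``max_norm`` and ``padding_idx``.

# Single-process simulation of row-wise sharded embedding lookup.
import torch
import torch.nn.functional as F

torch.manual_seed(0)
num_rows, dim, world_size = 10, 17, 4
weight = torch.randn(num_rows, dim)          # the "global" embedding table
ids = torch.randint(0, num_rows, (4, 6))     # lookup IDs

split_size = (num_rows + world_size - 1) // world_size
partial_sum = torch.zeros(*ids.shape, dim)
for rank in range(world_size):
    start = split_size * rank
    chunk = max(min(num_rows, split_size * (rank + 1)) - start, 0)
    local_shard = weight[start:start + chunk]
    # Remap IDs into the local index space; masked IDs point at the pad row.
    mask = (ids < start) | (ids >= start + chunk)
    local_ids = ids - start
    local_ids[mask] = chunk
    padding_row = torch.zeros(1, dim)
    partial_sum += F.embedding(local_ids, torch.cat([local_shard, padding_row]))

# Each ID is owned by exactly one rank, so the sum equals the full lookup.
assert torch.allclose(partial_sum, F.embedding(ids, weight))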
diff --git a/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/embedding_bag.py b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/embedding_bag.py
new file mode 100644
index 0000000000000000000000000000000000000000..b95f5334750e74348e2f0bfd52359d64e2ef899d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/embedding_bag.py
@@ -0,0 +1,476 @@
+
+from typing import cast, List
+
+import torch
+import torch.distributed as dist
+from torch._C._distributed_c10d import ReduceOp
+from torch.distributed._shard.sharded_tensor import ShardedTensor
+from torch.distributed._shard.sharding_spec import ChunkShardingSpec
+from torch.distributed._shard.sharding_spec.api import custom_sharding_spec_op
+from torch.distributed.nn.functional import all_gather, reduce_scatter
+
+from ._common import (
+    _all_gather_base_input,
+    _handle_col_wise_sharding_base,
+    _handle_max_norm_col_wise,
+    _handle_row_wise_mask,
+)
+
+
+@custom_sharding_spec_op(ChunkShardingSpec, torch.nn.functional.embedding_bag)
+def sharded_embedding_bag(types, args, kwargs, pg):
+    """
+    Handles ``__torch_function__`` dispatch for ``torch.nn.functional.embedding_bag``.
+    This method computes a sharded embedding bag aggregation and has the following limitations:
+
+    1. Supports only sharding of ``weight``.
+    2. Supports only ``ChunkShardingSpec``.
+    3. Supports only a single local shard per rank.
+    4. Supports all other kwargs except for scale_grad_by_freq, sparse, etc.
+
+    Based on the dimension that the weight is sharded on, there are two
+    algorithms:
+
+    ROWWISE SHARDING
+    ================
+    For row-wise sharding the weight is sharded on dimension 0.
+
+    The overall algorithm can be best explained with an example. Let's assume
+    the dims of the input are (4 x 6) and W is (16 x 17), and W is sharded across
+    4 GPUs creating 4 shards of (4 x 17).
+    The algorithm is as follows:
+
+    1. First the input is all-gathered to all ranks, since this is SPMD and
+       the input is actually sharded across all ranks. The inputs then become
+       four (4 x 6) tensors on each rank. For example if the given input is
+       tensor([[6, 5, 2, 9, 6, 3],
+               [3, 1, 2, 4, 7, 6],
+               [4, 0, 4, 9, 8, 9],
+               [8, 6, 6, 4, 6, 1]])
+       on rank 0.
+       Then on every rank, we will have this tensor.
+       If input itself is already replicated, no all-gather will be done.
+    2. Next, we mask the IDs which are not stored on that rank.
+       For example on rank 0, we store IDs [0, 1, 2]. We only keep the IDs
+       inside that set of numbers. The rest of them will be masked to an extra row.
+       The masked matrix will be used for embedding look up and is like:
+       tensor([[4, 4, 2, 4, 4, 4],
+               [4, 1, 2, 4, 4, 4],
+               [4, 0, 4, 4, 4, 4],
+               [4, 4, 4, 4, 4, 1]])
+    3. If ``max_norm`` is specified, the extra row guarantees that the masked IDs will
+       not affect the behavior of the weight re-norm.
+    4. The example above only shows one rank, and each rank does a very similar thing.
+       For "Mean" mode we need to divide by either the column size (2D) or the interval
+       length defined by the offsets (excluding the row specified in ``padding_idx``).
+       We also need to mask the nonexistent rows to negative infinity so that negative
+       values do not get wiped out in the "Max" mode.
+
+    COLWISE SHARDING
+    ================
+    For col-wise sharding the weight is sharded on dimension 1.
+
+    The overall algorithm can be best explained with an example. Let's assume
+    the dims for input are (4 x 6) and W are (16 x 17) and W is sharded across
+    4 GPUs creating 3 shards of (16 x 5) and 1 shard of (16 x 2).
+    The algorithm is as follows:
+
+    1. First the input is broadcast to all ranks: since this is SPMD, we
+       actually do an all_gather of all the inputs, resulting in 4 (4 x 6)
+       inputs on each rank.
+    2. Next we perform a local embedding bag operation under the given mode by
+       applying each input (4 x 6) to the local shard (16 x 5) ((16 x 2) for the last).
+       This results in 4 (5 x 4) ((2 x 4) for the last) matrices on each rank.
+       We transpose the aggregation result.
+    3. Next, we concatenate these 4 matrices and perform an all2all to share the
+       appropriate (5 x 4) or (2 x 4) matrices with each rank.
+    4. Now, each rank receives a (17 x 4) matrix, which is exactly the
+       size of the result we need.
+    5. If the placements are not in order, the columns of the (17 x 4) matrix are
+       rearranged appropriately, and finally we transpose the output again.
+    6. If ``max_norm`` is specified, we manually sum up the partial norms across
+       shards and renormalize. Because the renorm must happen in place, we need to
+       override the local_shard to mimic this behavior.
+    """
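+    # A hedged usage sketch (illustrative only; ``w``, ``inp`` and ``offsets`` are
+    # assumed names): with ``w`` a ShardedTensor built from a
+    # ``ChunkShardingSpec(dim=0, placements=[...])``, a call like
+    # ``torch.nn.functional.embedding_bag(inp, w, offsets=offsets, mode="sum")``
+    # is routed to this handler through ``__torch_function__`` dispatch, with ``pg``
+    # being the process group backing the sharded weight.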
+    # Validate input params
+    _validate_embedding_bag_param(args, kwargs)
+
+    input = args[0]
+    weight = args[1]
+    offsets = kwargs.get("offsets")
+    per_sample_weights = kwargs.get("per_sample_weights")
+    mode = kwargs.get("mode")
+    max_norm = kwargs.get("max_norm")
+    norm_type = kwargs.get("norm_type")
+    include_last_offset = kwargs.get("include_last_offset")
+    padding_idx = kwargs.get("padding_idx")
+
+    local_shard = weight.local_tensor().contiguous()
+    sharding_dim = weight._sharding_spec.dim
+    world_size = dist.get_world_size(pg)
+    rank = dist.get_rank(pg)
+    if include_last_offset:
+        offsets = offsets[:-1]
+
+    if sharding_dim == 1:
+        output, local_shard = _handle_col_wise_sharding(
+            input,
+            world_size,
+            weight,
+            local_shard,
+            offsets,
+            per_sample_weights,
+            mode,
+            max_norm,
+            norm_type,
+            padding_idx,
+            pg,
+        )
+        weight.local_shards()[0].tensor = local_shard
+        return output
+    elif sharding_dim == 0:
+        return _handle_row_wise_sharding(
+            input,
+            world_size,
+            weight,
+            local_shard,
+            offsets,
+            per_sample_weights,
+            mode,
+            max_norm,
+            norm_type,
+            padding_idx,
+            rank,
+            pg,
+        )
+    else:
+        raise RuntimeError(
+            f"nn.EmbeddingBag weight sharded on dim {sharding_dim} not supported!"
+        )
+
+
+def _validate_embedding_bag_param(args, kwargs):
+    """
+    Validate input params of sharded embeddingBag op.
+
+    Args:
+        input: list of ID used for lookup and aggregation.
+        weight: sharded weight tensor.
+        kwargs: same as normal EmbeddingBag.
+
+    Return: None.
+    """
+
+    input = args[0]
+    weight = args[1]
+    offsets = kwargs.get("offsets")
+    per_sample_weights = kwargs.get("per_sample_weights")
+    mode = kwargs.get("mode")
+    max_norm = kwargs.get("max_norm")
+    scale_grad_by_freq = kwargs.get("scale_grad_by_freq")
+    sparse = kwargs.get("sparse")
+    include_last_offset = kwargs.get("include_last_offset")
+
+    # Validate types
+    if not isinstance(input, torch.Tensor):
+        raise TypeError("input need to be torch.Tensor")
+    if offsets is not None and not isinstance(offsets, torch.Tensor):
+        raise TypeError("offsets need to be torch.Tensor")
+    if per_sample_weights is not None and not isinstance(
+        per_sample_weights, torch.Tensor
+    ):
+        raise TypeError("per_sample_weights need to be torch.Tensor")
+    if not isinstance(weight, ShardedTensor):
+        raise TypeError("weight needs to be ShardedTensor")
+    if len(input.size()) > 2:
+        raise ValueError("Input more than 2 dims not supported")
+    weight_size = weight.size()
+    if len(weight_size) != 2:
+        raise ValueError("Weight needs to have exactly 2 dims")
+    if int(torch.min(input).item()) < 0:
+        raise ValueError(
+            f"Index out of range in Input {int(torch.min(input).item())} {weight_size[1]}"
+        )
+    if int(torch.max(input).item()) >= weight_size[0]:
+        raise ValueError(
+            f"Index out of range in Input {int(torch.max(input).item())} {weight_size[1]}"
+        )
+    if offsets is not None and len(input.size()) != 1:
+        raise ValueError("Input dimension needs to be exactly 1 dim")
+    if len(input.size()) == 1 and offsets is None:
+        raise ValueError("offsets is required for 1D input")
+    if per_sample_weights is not None and per_sample_weights.size() != input.size():
+        raise ValueError(
+            f"per_sample_weights size {per_sample_weights.size()} not equal to input size {input.size()}"
+        )
+    if mode is None:
+        mode = "mean"
+    if mode not in ["sum", "mean", "max"]:
+        raise ValueError(f"mode '{mode}' is not supported")
+    if scale_grad_by_freq:
+        raise RuntimeError(
+            'nn.Embedding weight sharded with flag on "scale_grad_by_freq" not supported!'
+        )
+    if sparse:
+        raise RuntimeError(
+            'nn.Embedding weight sharded with flag on "sparse" not supported!'
+        )
+    if include_last_offset and offsets is None:
+        raise ValueError('offsets is required for flag "include_last_offset"!')
+    if include_last_offset and cast(List[int], offsets)[-1] != input.size(0):
+        raise ValueError(
+            'offsets need to have the input size in the end when the flag "include_last_offset" is on!'
+        )
+
+    if max_norm and max_norm <= 0.0:
+        raise ValueError('"max_norm" must be larger than zero!')
+
+    if not isinstance(weight._sharding_spec, ChunkShardingSpec):
+        raise ValueError("Only ChunkShardingSpec supported for ShardedTensor ops!")
+    if len(weight.local_shards()) != 1:
+        raise ValueError("Only one local shard supported!")
+
+
+def _handle_col_wise_sharding(
+    input,
+    world_size,
+    weight,
+    local_shard,
+    offsets,
+    per_sample_weights,
+    mode,
+    max_norm,
+    norm_type,
+    padding_idx,
+    pg,
+):
+    """
+    Entry-point function to handle the logic of col-wise sharding of weight
+    for embeddingBag. (Detailed explanations of the logic can be found in
+    the comment for sharded_embedding_bag.)
+
+    Args:
+        input: list of ID used for lookup and aggregation.
+        world_size: number of ranks.
+        weight: sharded weight tensor.
+        local_shard: col-wise sharded local weight used for lookup.
+        offsets: list of start positions of each bag for 1D input.
+        per_sample_weights: weights for weighted sum mode.
+        mode: aggregation method of each bag.
+        max_norm: If given, each embedding vector with norm larger
+            than max_norm is renormalized to have norm max_norm.
+            Note: this will modify weight in-place.
+        norm_type: The p in the p-norm to compute for the max_norm option.
+        padding_idx: If specified, the entries at padding_idx do
+            not contribute to the gradient; therefore, the embedding
+            vector at padding_idx is not updated during training,
+            i.e. it remains as a fixed “pad”.
+            Note that the embedding vector at padding_idx is
+            excluded from the reduction.
+        pg: process group.
+
+    Return:
+        output: final result of lookup and aggregation.
+        local_shard: col-wise sharded local weight used for lookup.
+            If max_norm is set, this will be the renormed weight.
+    """
+    # allgather the special input of embedding bag first.
+    (
+        gathered_inputs,
+        gathered_per_sample_weights,
+        gathered_offsets,
+    ) = _all_gather_embedding_bag_input(input, per_sample_weights, offsets, pg)
+
+    if max_norm is not None:
+        # max_norm changes the weight in-place
+        local_shard = _handle_max_norm_col_wise(
+            max_norm, norm_type, local_shard, input, world_size, gathered_inputs, pg
+        )
+
+    output = _handle_col_wise_sharding_base(
+        torch.nn.functional.embedding_bag,
+        1,
+        input,
+        world_size,
+        weight,
+        local_shard,
+        pg,
+        gathered_inputs,
+        mode=mode,
+        gathered_per_sample_weights=gathered_per_sample_weights,
+        gathered_offsets=gathered_offsets,
+        padding_idx=padding_idx,
+    )
+    return (output, local_shard)
+
+
+def _handle_row_wise_sharding(
+    input,
+    world_size,
+    weight,
+    local_shard,
+    offsets,
+    per_sample_weights,
+    mode,
+    max_norm,
+    norm_type,
+    padding_idx,
+    rank,
+    pg,
+):
+    """
+    Entry-point function to handle the logic of row-wise sharding of weight
+    for embeddingBag. (Detailed explanations of the logic can be found in
+    the comment for sharded_embedding_bag.)
+
+    Args:
+        input: list of ID used for lookup and aggregation.
+        world_size: number of ranks.
+        weight: sharded weight tensor.
+        local_shard: row-wise sharded local weight used for lookup.
+        offsets: list of start positions of each bag for 1D input.
+        per_sample_weights: weights for weighted sum mode.
+        mode: aggregation method of each bag.
+        max_norm: If given, each embedding vector with norm larger
+            than max_norm is renormalized to have norm max_norm.
+            Note: this will modify weight in-place.
+        norm_type: The p in the p-norm to compute for the max_norm option.
+        padding_idx: If specified, the entries at padding_idx do
+            not contribute to the gradient; therefore, the embedding
+            vector at padding_idx is not updated during training,
+            i.e. it remains as a fixed “pad”.
+            Note that the embedding vector at padding_idx is
+            excluded from the reduction.
+        rank: rank of the current process within the process group.
+        pg: process group.
+
+    Returns:
+        gathered_output: final result of lookup and aggregation.
+    """
+    if input.dim() > 1 and per_sample_weights is None:
+        # all-gather the inputs first for non-replicated tensors.
+        gather_inp = _all_gather_base_input(input, pg)
+    else:
+        (
+            gathered_inputs,
+            gathered_per_sample_weights,
+            gathered_offsets,
+        ) = _all_gather_embedding_bag_input(input, per_sample_weights, offsets, pg)
+        cat_dim = 0 if input.dim() != 1 else -1
+        gather_inp = torch.cat(gathered_inputs, dim=cat_dim)
+        if per_sample_weights is not None:
+            per_sample_weights = torch.cat(gathered_per_sample_weights, dim=cat_dim)
+        offset_add = 0 if input.dim() > 1 else input.size(0)
+        if offsets is not None:
+            offsets_list = torch.cat(
+                [gathered_offsets[i] + (offset_add * i) for i in range(pg.size())],
+                dim=cat_dim,
+            )
+
+    # Mask the input according to sharding spec.
+    lookup_input, padding_local, padding_row = _handle_row_wise_mask(
+        gather_inp, padding_idx, weight, world_size, rank
+    )
+    if mode == "max":
+        padding_row[:] = -float("Inf")
+
+    # When the input is a large tensor, the max_norm renorm changes the weight
+    # in-place; as a workaround for now (GH issue: #81717), apply the renorm once
+    # here and skip it in the actual lookup below.
+    if max_norm is not None:
+        torch.nn.functional.embedding_bag(
+            torch.unique(lookup_input)[:-1],
+            local_shard,
+            offsets=torch.tensor([0], device=local_shard.device, dtype=torch.long),
+            mode=mode,
+            per_sample_weights=None,
+            max_norm=max_norm,
+            norm_type=norm_type,
+            padding_idx=padding_local,
+        )
+        max_norm = None
+    result = torch.nn.functional.embedding_bag(
+        lookup_input,
+        torch.cat([local_shard, padding_row]),
+        offsets=offsets_list if offsets is not None else offsets,  # type: ignore[possibly-undefined]
+        mode=mode if mode != "mean" else "sum",
+        per_sample_weights=per_sample_weights,
+        max_norm=max_norm,
+        norm_type=norm_type,
+        padding_idx=padding_local,
+    )
+
+    op = ReduceOp.SUM if mode != "max" else ReduceOp.MAX
+    # TODO: Make the result a PartialTensor and move the logic below there.
+    local_shards = result.chunk(pg.size())
+    result = reduce_scatter(
+        torch.empty_like(local_shards[0]),
+        list(local_shards),
+        op=op,
+        group=pg,
+    )
+
+    # For "mean" mode, we cannot do the division until the very end because the sum
+    # of the per-rank means is not equal to the mean of the sum (the divisors differ).
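+    # e.g. if one rank contributes [1, 2] and another [3, 4, 5] to the same bag,
+    # mean([1, 2]) + mean([3, 4, 5]) = 1.5 + 4.0 = 5.5, while the true mean is
+    # (1 + 2 + 3 + 4 + 5) / 5 = 3.0; hence we reduce with SUM above and divide by
+    # the per-bag element counts below.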
+    if mode == "mean":
+        if input.dim() > 1:
+            padding_idx = padding_idx if padding_idx is not None else -1
+            split_sizes = torch.sum(
+                torch.ne(input, padding_idx), dim=-1, dtype=local_shard.dtype
+            )
+        else:
+            split_sizes = torch.cat(
+                (
+                    offsets[1 : offsets.size(0)] - offsets[0:-1],
+                    (input.size(0) - offsets[-1]).unsqueeze(0),
+                ),
+                dim=-1,
+            )
+        return torch.div(result, split_sizes.unsqueeze(1))
+
+    # Return the appropriate local result.
+    return result
+
+
+def _all_gather_embedding_bag_input(input, per_sample_weights, offsets, pg):
+    """
+    When we need to gather the input and all other parameters of the
+    embedding_bag op, we stack them together so that the ``all_gather``
+    collective communication is performed just once.
+
+    Note that since offsets does not have the same size as input and
+    is always smaller than input, we resize it during the communication.
+
+    Args:
+        input: tensor to be applied op on.
+        per_sample_weights: weights for weighted sum mode.
+        offsets: when input is 1D. offsets determines the starting
+            index position of each bag (sequence) in input.
+        pg: process group.
+
+    Returns:
+        gathered_inputs: list of input tensor gathered from each rank.
+        gathered_per_sample_weights: list of per_sample_weights from each rank.
+        gathered_offsets: list of offsets from each rank.
+    """
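+    # Illustrative layout (assuming a 1D input with both per_sample_weights and
+    # offsets present): the three tensors are stacked into a single (3, N) tensor so
+    # that one all_gather moves input (row 0), per_sample_weights (row 1) and the
+    # resized offsets (row 2) together; the rows are split back apart below.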
+    input_to_gather = [input]
+    if per_sample_weights is not None:
+        input_to_gather.append(per_sample_weights)
+    if offsets is not None:
+        input_to_gather.append(offsets.clone().resize_(input.size()))
+    gathered_inputs = all_gather(torch.stack(input_to_gather), group=pg)
+
+    gathered_per_sample_weights = None
+    if per_sample_weights is not None:
+        gathered_per_sample_weights = [t[1] for t in gathered_inputs]
+    gathered_offsets = None
+    if offsets is not None:
+        idx = 2 if per_sample_weights is not None else 1
+        gathered_offsets = [
+            t[idx].resize_(offsets.size()).to(offsets.dtype) for t in gathered_inputs
+        ]
+    gathered_inputs = [t[0].to(input.dtype) for t in gathered_inputs]
+    return gathered_inputs, gathered_per_sample_weights, gathered_offsets
diff --git a/MLPY/Lib/site-packages/torch/distributed/_sharded_tensor/__init__.py b/MLPY/Lib/site-packages/torch/distributed/_sharded_tensor/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..30f121173c4c9c80ff09903da42b010f9ade2855
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_sharded_tensor/__init__.py
@@ -0,0 +1,12 @@
+# Keep old package for BC purposes, this file should be removed once
+# everything moves to the `torch.distributed._shard` package.
+import sys
+import torch
+import warnings
+
+from torch.distributed._shard.sharded_tensor import *  # noqa: F403
+warnings.warn(
+    "torch.distributed._sharded_tensor will be deprecated, use torch.distributed._shard.sharded_tensor instead",
+    DeprecationWarning
+)
+sys.modules['torch.distributed._sharded_tensor'] = torch.distributed._shard.sharded_tensor
diff --git a/MLPY/Lib/site-packages/torch/distributed/_sharded_tensor/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_sharded_tensor/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dde1b44ada39079d3e8161c1fc4a415c20b414b3
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_sharded_tensor/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_sharding_spec/__init__.py b/MLPY/Lib/site-packages/torch/distributed/_sharding_spec/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d1890c6b6f0930426a543a5d19cf8e02710b384
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_sharding_spec/__init__.py
@@ -0,0 +1,14 @@
+# Keep old package for BC purposes, this file should be removed once
+# everything moves to the `torch.distributed._shard` package.
+import sys
+import torch
+import warnings
+
+from torch.distributed._shard.sharding_spec import *  # noqa: F403
+warnings.warn(
+    "torch.distributed._sharding_spec will be deprecated, use torch.distributed._shard.sharding_spec instead",
+    DeprecationWarning
+)
+
+import torch.distributed._shard.sharding_spec as _sharding_spec
+sys.modules['torch.distributed._sharding_spec'] = _sharding_spec
diff --git a/MLPY/Lib/site-packages/torch/distributed/_sharding_spec/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_sharding_spec/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8febcfddb3aab538e86322657d3c8a84cfa88a97
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_sharding_spec/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_spmd/__init__.py b/MLPY/Lib/site-packages/torch/distributed/_spmd/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..475f93d71f7611d7e3d380ff85b8ce086762ecba
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/api.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/api.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d65d75533dea9100b698005c491561aac3adaf4f
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/api.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/batch_dim_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/batch_dim_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..163218563efb422d983239a80d165b945e8e637e
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/batch_dim_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/comm_tensor.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/comm_tensor.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a6f038c243e6a3aee558e82dc72017d7c307ef24
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/comm_tensor.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/config.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/config.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..267dafadde99bbfc1c499fdac25703b1c8f6c35d
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/config.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/data_parallel.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/data_parallel.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9604372e55ee7e061916557636aa03409ded11ed
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/data_parallel.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/distribute.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/distribute.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d148a32c448c7046c1fec54a5daf1e94ddb066cd
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/distribute.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/experimental_ops.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/experimental_ops.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a96b738f51bbdf497bd47356304bf6f8728bdc11
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/experimental_ops.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/gm_transformation.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/gm_transformation.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..831bf4aa72395d114df3d10e6ef027b033c8bc99
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/gm_transformation.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/graph_optimization.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/graph_optimization.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4a6c4ca0c35b5111ba9850745254bb136c5f1ebf
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/graph_optimization.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/graph_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/graph_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1ac2c4a1a0884b991a268f277e116ecbb2de534e
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/graph_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/iter_graph_module.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/iter_graph_module.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..36697f4b787571d879ac84fe548c792c01780698
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/iter_graph_module.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/log_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/log_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..caf9751d168b571530219ad6e59bb9b2cf012a52
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/log_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/parallel_mode.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/parallel_mode.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..484a8d38eae67e36a20429070e10c366d100624e
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/parallel_mode.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/partial_lower.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/partial_lower.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b56f7e12e69ea40fba9a83f52068fe7aca8b2dad
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_spmd/__pycache__/partial_lower.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_spmd/api.py b/MLPY/Lib/site-packages/torch/distributed/_spmd/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..77953fee4310e2a26c0afc86bd06da93739dd49e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_spmd/api.py
@@ -0,0 +1,575 @@
+from abc import ABC, abstractmethod
+from contextlib import contextmanager, nullcontext
+from copy import copy
+from dataclasses import dataclass
+from functools import partial, wraps
+from typing import Any, Callable, cast, Dict, List, Optional, Set, Tuple, Union
+
+from functorch import make_fx
+
+import torch
+import torch.distributed as dist
+
+# We need to import _functional_collectives to trigger op registration
+import torch.distributed._functional_collectives
+import torch.nn as nn
+import torch.utils._pytree as pytree
+
+from torch import fx
+from torch._decomp.decompositions import native_layer_norm_backward
+
+from torch._subclasses.fake_tensor import FakeTensorMode
+from torch.distributed._spmd.data_parallel import gradients_tagging
+from torch.distributed._spmd.parallel_mode import (
+    DataParallel,
+    DTensorExpandMode,
+    ParallelMode,
+)
+from torch.distributed._tensor import Placement
+from torch.fx.graph import _PyTreeCodeGen, _PyTreeInfo, CodeGen
+from torch.nn.utils import stateless
+from torch.nn.utils._named_member_accessor import NamedMemberAccessor
+
+
+class Override(ABC):
+    r"""Override the tracing and transformation behavior of :meth:`~torch.distributed._spmd.compile`.
+
+    This is useful when part of the model is not traceable or when you prefer
+    not to trace it for any reason. More specifically, users can implement
+    :meth:`torch.distributed._spmd.Override.replacement` to replace an original
+    submodule with the returned new submodule. The new submodule contains
+    operations that users prefer to have traced, which may simply be a dummy
+    placeholder operator. After tracing, users can implement
+    :meth:`torch.distributed._spmd.Override.transform` to transform the traced
+    graph, where the dummy placeholder operator serves as an anchor to insert
+    new sub-graphs.
+    """
+
+    @abstractmethod
+    def replacement(self, fqn: str, orig_submodule: torch.nn.Module) -> torch.nn.Module:
+        r"""Implement this method to return a new :class:`nn.Module` instance to replace the ``orig_submodule``
+        argument in the model.
+
+        This helps if ``orig_submodule`` is not traceable or should not be traced.
+
+        Args:
+            fqn (str): fully qualified name of the submodule.
+            orig_submodule (class:`nn.Module`): original submodule instance to replace.
+
+        Returns:
+            A new :class:`nn.Module` instance to replace the original one.
+
+        """
+        pass
+
+    @abstractmethod
+    def transform(
+        self,
+        gm: fx.GraphModule,
+        flat_state: List[torch.Tensor],
+    ) -> fx.GraphModule:
+        r"""
+        Given a DTensor-expanded graph and sharding schema for every node,
+        conduct additional transformation for the sub-graph from the :class:`nn.Module`
+        returned by :meth:`torch.distributed._spmd.Override.replacement` if
+        necessary.
+
+        Args:
+            gm (:class:`fx.Graph`): a DTensor-expanded graph.
+            flat_state (List[:class:`Tensor`]): a reference to the list of
+                flattened state. The elements in ``flat_state`` map to the first
+                ``len(flat_state)`` placeholders in the graph. The transformation
+                can add state to or remove state from ``flat_state`` as long as
+                it keeps ``flat_state`` and the placeholders consistent.
+
+        Returns:
+            The :class:`fx.Graph` after transformation.
+
+        """
+        pass
+
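+# A hedged sketch of a concrete Override (``MyMoEOverride`` and ``DummySubmodule`` are
+# illustrative assumptions, not part of this module):
+#
+#   class MyMoEOverride(Override):
+#       def replacement(self, fqn, orig_submodule):
+#           # return a traceable stand-in that emits a dummy placeholder op
+#           return DummySubmodule(orig_submodule)
+#
+#       def transform(self, gm, flat_state):
+#           # locate the dummy placeholder nodes in ``gm`` and splice in the real
+#           # sub-graph, keeping ``flat_state`` and the placeholders consistent
+#           return gm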
+
+class _PyTreeCodeGenOutputsOnly(_PyTreeCodeGen):
+    # pyre-ignore[3]
+    def process_inputs(self, *args: Any) -> Any:
+        return args
+
+    # pyre-ignore[2, 3]
+    def gen_fn_def(self, free_vars, maybe_return_annotation):
+        return CodeGen.gen_fn_def(self, free_vars, maybe_return_annotation)
+
+
+def _to_caller_flattened_graph_module(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
+    """Move the responsibility of flattening the input arguments from the graph module to the caller.
+
+    Example:
+
+        output = gm(my_struct)
+
+        gm = _to_caller_flattened_graph_module(gm)
+
+        output = gm(*pytree.tree_flatten(my_struct)[0])
+
+    """
+    # pyre-ignore[16]
+    gm._graph._codegen = _PyTreeCodeGenOutputsOnly(
+        pytree_info=_PyTreeInfo(
+            # pyre-ignore[6]
+            orig_args=None,  # type: ignore[arg-type]
+            # pyre-ignore[6]
+            in_spec=None,  # type: ignore[arg-type]
+            # pyre-ignore[16]
+            out_spec=gm._graph._codegen.pytree_info.out_spec,
+        )
+    )
+    gm.recompile()
+    return gm
+
+
+# Use a dtensor expand mode for now to preserve the old behavior
+# and avoid breaking existing code
+dtensor_expand_mode = DTensorExpandMode()
+
+
+def _override_placements(t: torch.Tensor, placements: List[Placement]):
+    global dtensor_expand_mode
+    dtensor_expand_mode._placements_override[id(t)] = placements
+
+
+@contextmanager
+def _rematerialize_optimizer(
+    opt: torch.optim.Optimizer,
+    named_states: Dict[str, Any],
+    params: Dict[str, nn.Parameter],
+):
+    assert opt is not None
+
+    # update opt.state with proxy tensors
+    orig_states = copy(opt.state)
+    for n in named_states:
+        # named_states is keyed by parameter name (str), but the optimizer keys its state by Parameter
+        opt.state[params[n]] = named_states[n]  # type: ignore[index]
+
+    # FIXME: support multiple parameter groups
+    param_group = opt.param_groups[0]
+    orig_params = param_group["params"]
+    param_group["params"] = params.values()
+
+    try:
+        yield
+    finally:
+        param_group["params"] = orig_params
+        opt.state = orig_states
+
+
+aten = torch.ops.aten  # pyre-ignore
+
+
+@contextmanager
+def _enable_compile():
+    # The return value of torch._utils.is_compiling changes optimizer behavior.
+    # We need that function to return True to include optimizer in the graph.
+    # See: https://github.com/pytorch/pytorch/blob/a524123c91ab399c9dd6882c1189596dd77e7734/torch/optim/optimizer.py#L41
+    def f_true():
+        return True
+
+    orig_is_compiling_code = torch._utils.is_compiling.__code__
+    torch._utils.is_compiling.__code__ = f_true.__code__
+    try:
+        yield
+    finally:
+        torch._utils.is_compiling.__code__ = orig_is_compiling_code
+
+
+def _foreach_add_decomp(self, other, alpha=1):
+    self_updated = aten._foreach_add.List(self, other, alpha=alpha)
+    for s, s_u in zip(self, self_updated):
+        s.copy_(s_u)
+
+
+def _foreach_unaop_decomp(op, self):
+    self_updated = op(self)
+    for s, s_u in zip(self, self_updated):
+        s.copy_(s_u)
+
+
+def _foreach_binop_list_decomp(op, self, other):
+    self_updated = op(self, other)
+    for s, s_u in zip(self, self_updated):
+        s.copy_(s_u)
+
+
+def _foreach_binop_scalar_decomp(op, self, scalar=1):
+    self_updated = op(self, scalar)
+    for s, s_u in zip(self, self_updated):
+        s.copy_(s_u)
+
+
+def _foreach_addcop_scalar_decomp(op, self, tensor1, tensor2, scalar=1):
+    self_updated = op(self, tensor1, tensor2, scalar)
+    for s, s_u in zip(self, self_updated):
+        s.copy_(s_u)
+
+
+def _fused_adam_decomp(
+    self,
+    grads,
+    exp_avgs,
+    exp_avg_sqs,
+    max_exp_avg_sqs,
+    state_steps,
+    *,
+    lr=1,
+    beta1=1,
+    beta2=1,
+    weight_decay=1,
+    eps=1,
+    amsgrad=True,
+    maximize=True,
+    grad_scale=None,
+    found_inf=None,
+):
+    orig_tuple = (self, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs)
+    updated_tuple = aten._fused_adam.default(
+        self,
+        grads,
+        exp_avgs,
+        exp_avg_sqs,
+        max_exp_avg_sqs,
+        state_steps,
+        lr=lr,
+        beta1=beta1,
+        beta2=beta2,
+        weight_decay=weight_decay,
+        eps=eps,
+        amsgrad=amsgrad,
+        maximize=maximize,
+        grad_scale=grad_scale,
+        found_inf=found_inf,
+    )
+
+    for idx, (orig, updated) in enumerate(zip(orig_tuple, updated_tuple)):
+        if idx == 1:
+            # skip gradient copying as we don't need to copy gradients back
+            continue
+        for o, u in zip(orig, updated):
+            o.copy_(u)
+
+
+SPMD_DECOMP_TABLE = {
+    aten._foreach_add_.List: _foreach_add_decomp,
+    aten._foreach_add_.Scalar: partial(
+        _foreach_binop_scalar_decomp, aten._foreach_add.Scalar
+    ),
+    aten._foreach_addcdiv_.Scalar: partial(
+        _foreach_addcop_scalar_decomp, aten._foreach_addcdiv.Scalar
+    ),
+    aten._foreach_addcmul_.Scalar: partial(
+        _foreach_addcop_scalar_decomp, aten._foreach_addcmul.Scalar
+    ),
+    aten._foreach_div_.List: partial(
+        _foreach_binop_list_decomp, aten._foreach_div.List
+    ),
+    aten._foreach_mul_.Scalar: partial(
+        _foreach_binop_scalar_decomp, aten._foreach_mul.Scalar
+    ),
+    aten._foreach_div_.Scalar: partial(
+        _foreach_binop_scalar_decomp, aten._foreach_div.Scalar
+    ),
+    aten._foreach_neg_.default: partial(
+        _foreach_unaop_decomp, aten._foreach_neg.default
+    ),
+    aten._foreach_reciprocal_.default: partial(
+        _foreach_unaop_decomp, aten._foreach_reciprocal.default
+    ),
+    aten._foreach_sqrt_.default: partial(
+        _foreach_unaop_decomp, aten._foreach_sqrt.default
+    ),
+    aten._foreach_sub_.Scalar: partial(
+        _foreach_binop_scalar_decomp, aten._foreach_sub.Scalar
+    ),
+    aten._fused_adam_.default: _fused_adam_decomp,
+    aten.native_layer_norm_backward.default: native_layer_norm_backward,
+}
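+# Hedged note: the decompositions above rewrite in-place foreach/fused optimizer ops
+# into their out-of-place counterparts followed by explicit ``copy_`` calls, so the
+# traced graph avoids in-place foreach ops that functionalization cannot yet handle
+# (see the comment in ``_compile`` below).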
+
+
+DEDUP_TARGETS: Set[torch._ops.OpOverload] = {
+    torch.ops.c10d_functional.all_reduce.default,
+    torch.ops.c10d_functional.wait_tensor.default,
+}
+
+
+def _dedup_collectives(gm: fx.GraphModule) -> fx.GraphModule:
+    args_to_node: Dict[Tuple[Any, ...], fx.Node] = {}
+
+    for node in gm.graph.nodes:
+        # replace all args with the results from the first unique comm op
+        args = pytree.arg_tree_leaves(*node.args)
+
+        if node.target in DEDUP_TARGETS:
+            args_key = (node.target, *args)
+            unique_node = args_to_node.get(args_key, None)
+            if unique_node is None:
+                # first time seeing this combination, remember it
+                args_to_node[args_key] = node
+            else:
+                # the current node is a duplicate, replace it
+                node.replace_all_uses_with(unique_node)
+                gm.graph.erase_node(node)
+
+    gm.recompile()
+
+    return gm
+
+
+@dataclass
+class _CompiledResult:
+    gm: fx.GraphModule
+    mod: nn.Module
+    opt: Optional[torch.optim.Optimizer]
+    flat_state: List[torch.Tensor]
+
+
+def _compile(
+    func: Callable,
+    module_override: Optional[List[Override]],
+    parallel_mode: ParallelMode,
+    *args: Any,
+    **kwargs: Any,
+) -> _CompiledResult:
+    # 1. Extract nn.Module and Optimizer from args and kwargs
+    # FIXME(@mrshenli): support multiple nn.Module instances
+    # FIXME(@mrshenli): support multiple Optimizer instances
+    # FIXME(@mrshenli): need to broadcast model to sync parameters
+    mod, opt = None, None
+    for arg in pytree.arg_tree_leaves(*args, **kwargs):
+        if isinstance(arg, nn.Module):
+            assert mod is None, "Only support single nn.Module for now"
+            mod = arg
+        if isinstance(arg, torch.optim.Optimizer):
+            assert opt is None, "Only support single Optimizer for now"
+            opt = arg
+
+    assert mod is not None, "Couldn't find nn.Module instances from the arguments."
+
+    # 2. Override target submodules (e.g., MoE) with dummy replacements
+    if module_override:
+        accessor = NamedMemberAccessor(mod)
+
+        def swap(fqn_prefix: str, module: torch.nn.Module) -> None:
+            for override in module_override:  # type: ignore[union-attr]
+                for name, child in module.named_children():
+                    if len(name) == 0:
+                        continue
+                    fqn = fqn_prefix + "." + name if fqn_prefix != "" else name
+                    new_child = override.replacement(fqn, child)
+                    if id(new_child) == id(child):
+                        swap(fqn, new_child)
+                    else:
+                        accessor.swap_submodule(fqn, new_child)
+
+        swap("", mod)
+
+    # 3. Trace the stateless version of the train_step
+    params = dict(mod.named_parameters(remove_duplicate=False))
+    buffers = dict(mod.named_buffers(remove_duplicate=False))
+
+    named_states = {}
+    if opt is not None:
+        # Pass named_states instead of opt.state to stateless_func, because
+        # the latter uses nn.Parameter as keys. During tracing, we need to
+        # make sure optimizers can find the states using proxy tensors.
+        for n, p in params.items():
+            if p in opt.state:
+                # named_states is keyed by parameter name (str), while the
+                # optimizer keys its state by Parameter
+                named_states[n] = opt.state[p]  # type: ignore[index]
+
+    is_data_parallel_mode = isinstance(parallel_mode, DataParallel)
+
+    # Lift states and parameters as function arguments so that make_fx
+    # can trace operations applied to them.
+    def stateless_func(func, params, buffers, named_states, args, kwargs):
+        with stateless._reparametrize_module(
+            mod, {**params, **buffers}
+        ), _rematerialize_optimizer(
+            opt, named_states, params
+        ) if opt else nullcontext():
+            # For DataParallel mode, install hooks first to tag the gradients
+            with gradients_tagging(params) if is_data_parallel_mode else nullcontext():
+                ret = func(*args, **kwargs)
+
+            # make sure updated parameters are returned
+            return ret, list(mod.parameters()), list(named_states.values())  # type: ignore[union-attr]
+
+    # FIXME: Use symbolic tracing as a workaround in DTensor expand mode.
+    # Otherwise it hits shape mismatch error, as we use local inputs to
+    # trace local graph and use DTensor to expand operators, where
+    # DTensor's shape is the global shape.
+    tracing_mode = "fake" if is_data_parallel_mode else "symbolic"
+
+    if is_data_parallel_mode:
+        fake_mode = FakeTensorMode()
+        data_parallel_mode = cast(DataParallel, parallel_mode)
+
+        def _get_full_batch_arg(arg: torch.Tensor) -> torch.Tensor:
+            # Since compilation happens in the first iteration and we
+            # receive mini-batch inputs, convert them to full-batch
+            # fake tensor inputs first for data parallel sharding
+            # propagation.
+            fake_arg = fake_mode.from_tensor(arg)
+            arg_dims = [1] * arg.ndim
+            # expand the tensor to full batch size on its batch dim
+            arg_dims[data_parallel_mode.input_batch_dim] *= dist.get_world_size()
+            return fake_arg.repeat(arg_dims)
+
+        args = pytree.tree_map_only(
+            torch.Tensor,
+            _get_full_batch_arg,
+            args,
+        )
+        kwargs = pytree.tree_map_only(
+            torch.Tensor,
+            _get_full_batch_arg,
+            kwargs,
+        )
+
+    with _enable_compile(), torch.autograd.detect_anomaly(check_nan=False):
+        # FIXME(@mrshenli): functionalization does not work for our use
+        # case yet. Use explicit decompositions for foreach ops.
+        # Remove this when the following issue is addressed.
+        # Issue: https://github.com/pytorch/pytorch/issues/97852
+        gm = make_fx(
+            partial(stateless_func, func),
+            tracing_mode=tracing_mode,
+            decomposition_table=SPMD_DECOMP_TABLE,
+            _allow_non_fake_inputs=False,
+        )(params, buffers, named_states, args, kwargs)
+
+    params_and_buffers: Dict[str, Union[torch.Tensor, nn.Parameter]] = {
+        **params,
+        **buffers,
+    }
+
+    # 4. parallel mode to expand a single device graph to a distributed graph
+    gm = parallel_mode.partition(
+        gm,
+        mod,
+        opt,
+        params_and_buffers,
+        named_states,
+        args,
+        kwargs,
+    )
+
+    # 5. Move the responsibility of flattening the input arguments from the
+    # graph module to the caller. This serves two purposes:
+    #   - Transformations that add/remove state need to manipulate a state
+    #   container that maintains the state tensors in the same order as they
+    #   appear in graph placeholders.
+    #   - Reduced runtime cost. The state container is only flattened once upfront.
+    flat_state = pytree.tree_leaves([params_and_buffers, named_states])
+    gm = _to_caller_flattened_graph_module(gm)
+
+    # 6. dedup comm operators.
+    # The duplication could come from DTensor args and kwargs redistribution.
+    # Suppose one operator produces a Partial gradient tensor and model
+    # parameters are replicated. In this case, every optimizer operation using
+    # that Partial gradient tensor would trigger an allreduce. This is because
+    # DTensor only has local information on individual tensor/operator, which is
+    # not sufficient to detect duplications in the graph. This situation can
+    # also happen when inserting FSDP allgather if a parameter is used multiple
+    # times in the forward method.
+    # TODO(@mrshenli): @yifuwang has a suggestion of conducting expansion and
+    # dedup at tracer-level to avoid multiple graph passes.
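+    # Illustrative example (an assumed but typical pattern): if two optimizer ops
+    # both redistribute the same Partial gradient, DTensor expansion emits two
+    # ``c10d_functional.all_reduce`` nodes with identical args; the pass keeps the
+    # first, replaces uses of the duplicate with it, and the duplicate's
+    # ``wait_tensor`` then collapses the same way.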
+    gm = _dedup_collectives(gm)
+
+    # 7. Replace previously inserted dummy ones with real graphs.
+    if module_override:
+        for override in module_override:
+            gm = override.transform(gm, flat_state)
+
+    return _CompiledResult(gm, mod, opt, flat_state)
+
+
+# Note that the Python convention of __dict__ requires the key to be str.
+# TODO: ensure the key is unique.
+COMPILED_OBJECT_KEY = "_compiled_obj"
+
+
+def compile(
+    module_override: Optional[List[Override]] = None,
+    gm_transformation: Optional[Callable[[fx.GraphModule], fx.GraphModule]] = None,
+    parallel_mode: Optional[ParallelMode] = None,
+):
+    r"""Compile and optimize a callable, which can be a train step within a training loop.
+
+    This method will extract :class:`nn.Module` and :class:`torch.optim.Optimizer`
+    instances from the input arguments and trace operations applied to their
+    parameters and states.
+
+    Args:
+        module_override (Optional[List[Override]]): a list of Override instances
+            that will be applied to the module in order. The :class:`Override`
+            objects provide :class:`nn.Module` replacements during tracing and a
+            graph transformation function after tracing. (Default: ``None``)
+        gm_transformation (Optional[Callable[fx.GraphModule, fx.GraphModule]]):
+            a callback that will be called after the original callable is
+            compiled and distributed (usually after the first iteration) to
+            transform the compiled GraphModule into a new optimized one.
+        parallel_mode (Optional[ParallelMode]): a :class:`ParallelMode` object
+            that specifies how to parallelize the callable. Each ParallelMode
+            would have its own strategy to partition the model and the captured
+            graph (Default: ``None``)
+
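+    Example (a minimal, hedged sketch assuming a ``train_step`` that runs the
+    forward pass, the backward pass, and the optimizer step)::
+
+        @compile()
+        def train_step(mod, opt, inp):
+            mod(inp).sum().backward()
+            opt.step()
+
+        # The first call traces and expands the step; later calls reuse the
+        # compiled graph.
+        train_step(model, optimizer, batch)
+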
+    """
+
+    def inner(func: Callable):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            last_train_step = kwargs.pop("last_train_step", False) if kwargs else False
+            first_iter = False
+            # Put the COMPILED_OBJECT_KEY in ``wrapper`` instead of ``func`` as
+            # ``wrapper`` is the one that users will get.
+            compiled_obj = wrapper.__dict__.get(COMPILED_OBJECT_KEY, None)
+            if compiled_obj is None:
+                first_iter = True
+                global dtensor_expand_mode
+                mode: ParallelMode = (
+                    dtensor_expand_mode if parallel_mode is None else parallel_mode
+                )
+
+                compiled_obj = _compile(func, module_override, mode, *args, **kwargs)
+                wrapper.__dict__[COMPILED_OBJECT_KEY] = compiled_obj
+
+            flat_inps = compiled_obj.flat_state + pytree.arg_tree_leaves(
+                *args, **kwargs
+            )
+
+            with torch.no_grad():
+                # N.B.: we don't need autograd as backward has already been
+                # captured in the graph.
+                if first_iter and gm_transformation:
+                    # TODO: SPMD should provide a default and configurable
+                    # transformation.
+                    compiled_obj.gm = gm_transformation(compiled_obj.gm)
+                if not last_train_step:
+                    output = compiled_obj.gm(*flat_inps)[0]
+                else:
+                    # This is the last train step. Call IterGraphModule.forward()
+                    # with the `last_iter` argument and catch the exception in
+                    # case the compiled_obj is not wrapped with IterGraphModule.
+                    try:
+                        output = compiled_obj.gm(*flat_inps, last_iter=last_train_step)[
+                            0
+                        ]
+                    except TypeError as e:
+                        if "last_iter" not in str(e):
+                            raise e
+                        output = compiled_obj.gm(*flat_inps)[0]
+
+                return output
+
+        return wrapper
+
+    return inner
diff --git a/MLPY/Lib/site-packages/torch/distributed/_spmd/batch_dim_utils.py b/MLPY/Lib/site-packages/torch/distributed/_spmd/batch_dim_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..8277ae25cdf1d8d7df61e04ad9acf40f62b86b40
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_spmd/batch_dim_utils.py
@@ -0,0 +1,179 @@
+from typing import Callable, Dict, List, Set
+
+import torch
+
+import torch.fx as fx
+
+import torch.utils._pytree as pytree
+
+from torch import Tensor
+
+from torch.distributed._tensor import DeviceMesh, Replicate, Shard
+from torch.distributed._tensor.ops.view_ops import (
+    DimSpec,
+    InputDim,
+    ops as view_op_rules,
+)
+from torch.distributed._tensor.placement_types import _Partial, DTensorSpec
+
+aten = torch.ops.aten
+
+
+class BatchDimAnalyzer:
+    """This class is used to analyze the batch dimension of each tensor/node in the graph.
+
+    We need to know the batch dimension of each tensor/node so that we know
+    exactly the sharding layout of intermediate tensors.
+
+    We should possibly evaluate using symbolic shapes to track the batch dimension.
+    We can experiment with this later via dynamo integration (as dynamo has a
+    mark_dynamic API that allows marking only the batch dimension) or try to use
+    FakeTensorMode to mark the batch dimension. For now, let's just use the batch
+    dimension of the first input tensor as the hint to track the batch dimension of
+    all tensors/nodes in the graph.
+    """
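+
+    # A hedged usage sketch (the surrounding flow is an assumption): construct
+    # ``BatchDimAnalyzer(batch_dim=0)``, call ``init_batch_dim_size`` once with the
+    # first input's batch size, then call ``compute_act_spec(node, mesh)`` per fx node
+    # to obtain a Shard/Replicate/_Partial ``DTensorSpec`` for that activation.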
+
+    def __init__(self, batch_dim: int = 0) -> None:
+        self.batch_dim = batch_dim
+
+        self.batch_dim_map: Dict[fx.Node, int] = {}
+        # batch dim size is used to track the batch dim size of the input tensor
+        self.batch_dim_size = -1
+
+        self.dim_rule_map: Dict[torch._ops.OpOverload, Callable[..., torch.Tensor]] = {
+            aten.squeeze.default: torch.squeeze,
+            aten.squeeze.dim: torch.squeeze,
+            aten.view.default: Tensor.view,
+            aten.reshape.default: torch.reshape,
+            aten._unsafe_view.default: Tensor.view,
+            aten.unsqueeze.default: torch.unsqueeze,
+            aten.expand.default: Tensor.expand,
+            aten.permute.default: torch.permute,
+            aten.repeat.default: Tensor.repeat,
+            aten.transpose.int: torch.transpose,
+        }
+
+    def init_batch_dim_size(self, batch_dim_size: int) -> None:
+        """Initialize batch dim size based on the first input batch size."""
+        if self.batch_dim_size != -1 and self.batch_dim_size != batch_dim_size:
+            raise RuntimeError(
+                f"batch dim size is already initialized! "
+                f"Found new batch size: {batch_dim_size} not "
+                f"matching existing batch dim size: {self.batch_dim_size}!"
+            )
+        self.batch_dim_size = batch_dim_size
+
+    def set_batch_dim(self, node: fx.Node, batch_dim: int) -> None:
+        self.batch_dim_map[node] = batch_dim
+
+    def get_batch_dim(self, node: fx.Node) -> int:
+        if node not in self.batch_dim_map:
+            raise RuntimeError(f"batch dim analysis failed on node: {node}!")
+        return self.batch_dim_map[node]
+
+    def compute_batch_dim(self, node: fx.Node, full_reduction=False) -> int:
+        """Compute the batch dimension for the `node`."""
+        assert self.batch_dim_size != -1, "batch dim size is not initialized!"
+
+        if node in self.batch_dim_map:
+            # if batch dim already computed, simply return it
+            return self.batch_dim_map[node]
+
+        if node.target in self.dim_rule_map:
+            view_op_rule = view_op_rules[self.dim_rule_map[node.target]]  # type: ignore[index]
+            args_val = pytree.tree_map_only(fx.Node, lambda n: n.meta["val"], node.args)
+            kwargs_val = pytree.tree_map_only(
+                fx.Node, lambda n: n.meta["val"], node.kwargs
+            )
+            output_dim_rules = view_op_rule.dim_map(*args_val, **kwargs_val)
+
+            def collect_input_dim(cmd: DimSpec, input_dims: Set[int]):
+                if isinstance(cmd, InputDim):
+                    input_dims.add(cmd.input_dim)
+                for inp in cmd.inputs():
+                    collect_input_dim(inp, input_dims)
+
+            output_dim_to_input_dims: List[Set[int]] = []
+            for inp in output_dim_rules:
+                input_dims: Set[int] = set()
+                collect_input_dim(inp, input_dims=input_dims)
+                output_dim_to_input_dims.append(input_dims)
+
+            operand = node.all_input_nodes[0]
+            operand_batch_dim = self.get_batch_dim(operand)
+            for output_dim, input_dims in enumerate(output_dim_to_input_dims):
+                if operand_batch_dim in input_dims:
+                    self.set_batch_dim(node, output_dim)
+                    # Update the batch dim size before returning, because the batch
+                    # dim size might change partway through the graph.
+                    self.batch_dim_size = node.meta["val"].shape[output_dim]
+                    return output_dim
+
+        # If there are no hints from the output_dim_rules, we infer from the output
+        # shape whether there is a batch dim, and shard correspondingly.
+        node_val = node.meta["val"]
+        if isinstance(node_val, (list, tuple)):
+            shapes = [val.shape for val in node_val]
+        else:
+            shapes = [node_val.shape]
+
+        # For a reduction op that reduces over the sharded batch dim we don't
+        # generate a partial placement but rather a shard. This is because the
+        # intention of data parallel is to never do a full reduction across the
+        # batch dimension; the reduced activation stays sharded.
+        full_reduction = False
+        # loop through the dim size to find the output batch dim
+        for shape in shapes:
+            if len(shape) == 0:
+                full_reduction = True
+
+            for i, dim_size in enumerate(shape):
+                if dim_size == self.batch_dim_size:
+                    self.set_batch_dim(node, i)
+                    return i
+
+        operands = node.all_input_nodes
+        if not operands:
+            # If there are no operands, it must be a factory op producing a tensor
+            # generated for computation, which should be marked as replicated.
+            self.set_batch_dim(node, -1)
+            # -1 means replicated
+            return -1
+        else:
+            # If there are operands, check whether they carry a batch dim. If an
+            # operand has a batch dim but the output does not, it's either a full
+            # reduction, where we should stay sharded, or a reduction over the batch
+            # dim only, where we should produce a partial placement.
+            operand_batch_dim = -1
+            for operand in operands:
+                if operand in self.batch_dim_map:
+                    operand_batch_dim = self.get_batch_dim(operand)
+            # self.get_batch_dim(operands[0])
+            if operand_batch_dim < 0:
+                # if operand does not have batch dim, we also don't have batch dim
+                self.set_batch_dim(node, operand_batch_dim)
+                return operand_batch_dim
+            elif full_reduction:
+                self.set_batch_dim(node, operand_batch_dim)
+                return operand_batch_dim
+            else:
+                # if operand have batch dim but output does not, it should
+                # produce partial, we use -2 to indicate partial
+                self.set_batch_dim(node, -2)
+                return -2
+
+    def compute_act_spec(self, node: fx.Node, mesh: DeviceMesh) -> DTensorSpec:
+        """Compute the batch dimension for the current node, then generate the sharding spec that shards on the batch dimension."""
+        node_batch_dim = self.compute_batch_dim(node)
+        if node_batch_dim == -1:
+            # indicate this activation is replicated
+            act_spec = DTensorSpec(mesh=mesh, placements=(Replicate(),))
+        elif node_batch_dim == -2:
+            # indicate this activation is partial
+            act_spec = DTensorSpec(mesh=mesh, placements=(_Partial(),))
+        else:
+            # indicate this activation is Shard
+            act_spec = DTensorSpec(mesh=mesh, placements=(Shard(node_batch_dim),))
+
+        return act_spec
diff --git a/MLPY/Lib/site-packages/torch/distributed/_spmd/comm_tensor.py b/MLPY/Lib/site-packages/torch/distributed/_spmd/comm_tensor.py
new file mode 100644
index 0000000000000000000000000000000000000000..5369c2d4f628ecdd9b2a7344cb8d1ab155ef11dd
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_spmd/comm_tensor.py
@@ -0,0 +1,247 @@
+from dataclasses import dataclass
+from functools import partial
+from typing import Any, List, Optional, Tuple
+
+import torch
+from torch._C import _disabled_torch_function_impl
+from torch.fx.experimental.proxy_tensor import (
+    _ProxyTensor,
+    fetch_object_proxy,
+    get_innermost_proxy_mode,
+    get_proxy_slot,
+    set_proxy_slot,
+    track_tensor_tree,
+)
+from torch.utils import _pytree as pytree
+from torch.utils._mode_utils import no_dispatch
+from torch.utils._pytree import tree_flatten, tree_map, tree_map_only
+
+
+@dataclass
+class _CommResult:
+    # a custom type wrapping both inplace output tensor and work handle
+    _tensor: torch.Tensor
+    _work: torch.distributed._Work
+
+
+def _wait_comm(comm_result: _CommResult):
+    # This function is only used by tracing mode as a call_function node right
+    # before consuming a collective result tensor.
+    comm_result._work.wait()
+    return comm_result._tensor
+
+
+def _wrap_comm_result(result: Tuple[Any, Any]) -> Tuple[Any, Any]:
+    def wrap(work, e):
+        assert isinstance(e, torch.Tensor), (
+            "Excepting collection of tensors as the first element in the "
+            "return value of communication operations."
+        )
+
+        return _CommResult(e, work)
+
+    # E.g.,
+    # allreduce_ returns ([tensor], work)
+    # allgather_ returns ([[tensor1, tensor2]], work)
+    work = result[1]
+    return (tree_map(partial(wrap, work), result[0]), work)
+
+
+def _get_tracer() -> Optional[torch.fx.Tracer]:
+    mode = get_innermost_proxy_mode()
+    if mode is None:
+        return None
+    return mode.tracer
+
+
+class CommTensor(torch.Tensor):
+    r"""
+    A Tensor subclass to wrap input tensors for collective communications.
+
+    This Tensor subclass works for both eager and tracing mode.
+    In eager mode, it will record whether the inplace collective communication
+    has been launched using this Tensor and remember the corresponding work
+    handle. If yes, it will explicitly call wait() in the ``__torch_dispatch__``
+    function before subsequent operations consuming the value of the Tensor.
+
+    In tracing mode, ``CommTensor`` inserts two nodes into the graph using the
+    ``__torch_dispatch__`` function.
+    1. The first node is inserted right after the
+    communication, wrapping both the inplace output tensor and the returned
+    work handle into a custom ``_CommResult`` type. We have to do this because
+    ``ProxyTorchDispatchMode`` only handles ``torch.Tensor``, ``_ProxyTensor``,
+    and ``torch.nn.Parameter`` objects and will treat the work handle
+    as a constant and embed that into the graph. As a result, during execution,
+    it will use the work handle created during tracing, which leads to wrong
+    results. The solution here is to manually create a proxy on the
+    return value of ``allreduce_``, which is ``([tensor], work)``, and wrap that
+    into ``[(_CommResult(tensor, work)), work]``. In this way, subsequent nodes can
+    directly consume ``_CommResult``.
+    2. The second node is inserted right before any subsequent node reads from
+    ``_CommResult``. It will call ``wait()`` on the stashed work handle to ensure
+    that computation waits for communication.
+    """
+
+    _supported_comms: List[str] = [
+        "_allgather_base_",
+        "_reduce_scatter_base_",
+        "allreduce_",
+        "allgather_",
+        "alltoall_",
+        "broadcast_",
+        "reduce_scatter_",
+        "scatter_",
+    ]
+
+    _tensor: torch.Tensor
+    _work: Optional[torch.distributed._Work]
+
+    @staticmethod
+    def __new__(cls, tensor: torch.Tensor):
+        t = tensor._tensor if isinstance(tensor, CommTensor) else tensor
+        if get_innermost_proxy_mode() is None:
+            # noop for eager mode
+            return tensor
+
+        # Use non-CommTensor to avoid nested CommTensor Wrapping
+        r = torch.Tensor._make_subclass(cls, t, require_grad=t.requires_grad)
+        # The tensor object wrapped by this CommTensor
+        # NB: THIS CAN BE A CommTensor; see test_nested_comm_tensor_wrapping
+        r._tensor = tensor  # type: ignore[attr-defined]
+        # Record the LAST `work` object returned by collective communication
+        # operations. If this is None, it means no collectives have been called
+        # since the last time a tensor was wrapped by CommTensor
+        r._work = None  # type: ignore[attr-defined]
+        return r
+
+    def __repr__(self):
+        return f"CommTensor({self._tensor}, work={self._work})"
+
+    # disable __torch_function__ so that CommTensor can recursively dispatch
+    # with ProxyTorchDispatchMode in make_fx
+    __torch_function__ = _disabled_torch_function_impl
+
+    @classmethod
+    def _is_supported(cls, op_name):
+        return any(comm in op_name for comm in cls._supported_comms)
+
+    @classmethod
+    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
+        # shared states when unwrapping args
+        tracer: Optional[torch.fx.Tracer] = None
+        work: Optional[torch.distributed._Work] = None
+
+        # unwrap to ._tensor if this is a CommTensor, and insert/call wait()
+        # if communication has been launched on this tensor.
+        def unwrap(e: Any):
+            if isinstance(e, CommTensor):
+                nonlocal tracer, work
+
+                work = e._work
+                # TODO(ezyang): I don't really understand what's going on
+                # here, but it seems that tracer doesn't reflect whether or
+                # not there is ambient tracing going on, but rather, whether
+                # or not we will trace THIS particular invocation.  If we
+                # have a nested CommTensor, the outer layer doesn't actually
+                # trace and we only trace the inner layer
+                if not isinstance(e._tensor, CommTensor):
+                    tracer = _get_tracer()
+
+                if work is not None:
+                    if tracer is not None:
+                        # insert a node to the traced graph.
+                        proxy_res = tracer.create_proxy(  # type: ignore[union-attr]
+                            "call_function",
+                            _wait_comm,
+                            (get_proxy_slot(e._tensor, tracer).proxy,),
+                            {},
+                            name="wait_comm",
+                        )
+                        # HACK: update the proxy for the inplace output
+                        set_proxy_slot(e._tensor, tracer, proxy_res)
+                    # For eager mode, simply wait.
+                    # During tracing, still need to wait here, to make sure the
+                    # execution during tracing is correct.
+                    work.wait()
+
+                # the communication has been waited on, stop propagating CommTensor
+                return e._tensor
+            else:
+                return e
+
+        def wrap(e: Any):
+            return CommTensor(e) if isinstance(e, torch.Tensor) else e
+
+        def set_work(work: torch.distributed._Work, e: Any):
+            if isinstance(e, CommTensor):
+                e._work = work  # type: ignore[attr-defined]
+            elif isinstance(e, torch.Tensor):
+                raise RuntimeError(
+                    "Type of output tensors from collective communication during "
+                    "tracing should always be CommTensor instead of torch.Tensor"
+                )
+            return e
+
+        unwrapped_args = tree_map(unwrap, args)
+        unwrapped_kwargs = tree_map(unwrap, kwargs)
+
+        if cls._is_supported(func.__name__):
+            if tracer is not None:
+                # in tracing mode, get proxies for args
+                proxy_args, proxy_kwargs = tree_map_only(
+                    _ProxyTensor,
+                    lambda e: e.proxy,
+                    tree_map_only(
+                        torch.Tensor,
+                        fetch_object_proxy(tracer),
+                        (unwrapped_args, unwrapped_kwargs),
+                    ),
+                )
+
+                # get proxy for output tuple
+                proxy_res = func(*proxy_args, **proxy_kwargs)
+                assert isinstance(proxy_res, torch.fx.Proxy)
+                # insert a node that wraps the output tuple into
+                # _CommResult(tensor, work)
+                comm_result_proxy = tracer.create_proxy(  # type: ignore[union-attr]
+                    "call_function",
+                    _wrap_comm_result,
+                    (proxy_res,),
+                    {},
+                    name="comm_result",
+                )
+
+                with no_dispatch():
+                    # disable dispatch to avoid trigger ProxyTorchDispatchMode logic
+                    out = func(*unwrapped_args, **unwrapped_kwargs)
+
+                # wrap output with the proxy of _CommResult, so that subsequent
+                # ops can link to it.
+                track_tensor_tree(out, comm_result_proxy, constant=None, tracer=tracer)
+
+                # N.B.: we still need to remember the work handle here, and wait
+                # for it later to make sure the execution during tracing is
+                # correct. Also, remember that the comm has already been launched.
+                # args[0] is always the collection of output tensors
+                pytree.tree_map_(partial(set_work, out[1]), args[0])
+
+                # HACK: update the proxy on the input argument as this is an
+                # inplace collective communication.
+                flat_args, args_spec = tree_flatten(unwrapped_args[0])
+                flat_out, out_spec = tree_flatten(out[0])
+                for a, o in zip(flat_args, flat_out):
+                    set_proxy_slot(a, tracer, get_proxy_slot(o, tracer))
+
+                return out
+            else:
+                # in eager mode, simply remember work handle as an attribute
+                out = func(*unwrapped_args, **unwrapped_kwargs)
+                pytree.tree_map_(partial(set_work, out[1]), args[0])
+                return out
+        else:
+            if work is not None:
+                return func(*unwrapped_args, **unwrapped_kwargs)
+            else:
+                # we need to propagate CommTensor wrapping until the first
+                # subsequent operation has waited for it.
+                return tree_map(wrap, func(*unwrapped_args, **unwrapped_kwargs))
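+
+
+# Minimal sketch (illustration only, not part of the upstream module): shows how
+# _wrap_comm_result pairs each in-place output tensor with the returned work
+# handle, and how _wait_comm later unwraps it after waiting. _FakeWork is a
+# hypothetical stand-in for a real torch.distributed work handle.
+class _FakeWork:
+    def wait(self) -> None:
+        # a real work handle would block here until the collective finishes
+        pass
+
+
+def _example_wrap_and_wait() -> torch.Tensor:
+    out = torch.ones(2)
+    wrapped, work = _wrap_comm_result(([out], _FakeWork()))
+    # wrapped == [_CommResult(out, work)]; reading it goes through _wait_comm
+    return _wait_comm(wrapped[0])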
diff --git a/MLPY/Lib/site-packages/torch/distributed/_spmd/config.py b/MLPY/Lib/site-packages/torch/distributed/_spmd/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..31dbcb1f1e7594439b7363cf43230389f48d76c0
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_spmd/config.py
@@ -0,0 +1,27 @@
+import logging
+import sys
+from types import ModuleType
+from typing import Optional, Set
+
+# log level (each level prints its own messages plus everything from the levels
+# listed below it)
+# DEBUG print full traces <-- lowest level + print tracing of every instruction
+# INFO print compiler functions + distributed graphs
+# WARN print warnings
+# ERROR print exceptions
+log_level: int = logging.DEBUG
+# Verbose will print full stack traces on warnings and errors
+verbose = False
+
+# the name of a file to write the logs to
+log_file_name: Optional[str] = None
+
+
+class _AccessLimitingConfig(ModuleType):
+    def __setattr__(self, name, value) -> None:
+        if name not in _allowed_config_names:
+            raise AttributeError(f"{__name__}.{name} does not exist")
+        return object.__setattr__(self, name, value)
+
+
+_allowed_config_names: Set[str] = {*globals().keys()}
+sys.modules[__name__].__class__ = _AccessLimitingConfig
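+
+
+# Hedged usage sketch (illustration only): because the module class is swapped
+# to _AccessLimitingConfig above, external assignments to names that were not
+# already defined in this module raise AttributeError, which catches config
+# typos early.
+def _example_access_limiting() -> None:
+    import torch.distributed._spmd.config as spmd_config
+
+    spmd_config.verbose = True  # ok: 'verbose' is an existing knob
+    try:
+        spmd_config.verbosity = True  # hypothetical typo: raises AttributeError
+    except AttributeError:
+        pass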
diff --git a/MLPY/Lib/site-packages/torch/distributed/_spmd/data_parallel.py b/MLPY/Lib/site-packages/torch/distributed/_spmd/data_parallel.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f6b92c0494db3541365ab434a0e7d2b5e7b3831
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_spmd/data_parallel.py
@@ -0,0 +1,824 @@
+import operator
+from contextlib import contextmanager
+from enum import Enum
+
+from typing import Any, cast, Dict, List, Optional, Tuple
+
+import torch
+
+import torch.distributed.distributed_c10d as c10d
+import torch.fx as fx
+import torch.library
+import torch.nn as nn
+
+import torch.utils._pytree as pytree
+
+from torch.distributed._spmd.batch_dim_utils import BatchDimAnalyzer
+from torch.distributed._tensor import DeviceMesh, distribute_tensor, Replicate, Shard
+
+from torch.distributed._tensor._utils import compute_local_shape
+from torch.distributed._tensor.op_schema import (
+    OpStrategy,
+    PlacementStrategy,
+    StrategyType,
+    TupleStrategy,
+)
+from torch.distributed._tensor.placement_types import _Partial, DTensorSpec, Placement
+from torch.distributed._tensor.redistribute import redistribute_local_tensor
+from torch.fx import GraphModule
+from torch.fx.experimental.proxy_tensor import make_fx
+from torch.fx.passes.shape_prop import _extract_tensor_metadata
+from torch.nn.utils._named_member_accessor import NamedMemberAccessor
+
+aten = torch.ops.aten
+
+# Dummy op used by data parallel to tag gradients.
+_spmd_lib_def = torch.library.Library("_spmd", "DEF")
+_spmd_lib_def.define("tag_grad(Tensor self) -> Tensor")
+
+_spmd_lib_impl = torch.library.Library("_spmd", "IMPL")
+_spmd_lib_impl.impl("tag_grad", lambda x: x, "CompositeExplicitAutograd")
+
+
+class DataParallelStyle(Enum):
+    """This enum represents the style of the data-parallel operation.
+
+    We have three types of Data Parallel style:
+    1. DEFAULT: the default data parallel style, which is to represent a mixed
+                replicate and fully shard behavior. For each parameter that is able
+                to be sharded evenly, we shard it, otherwise we would replicate the
+                parameter. This style avoids potential padding if the parameters
+                cannot be sharded evenly, but it would generate a mix of all_reduce
+                and reduce_scatter.
+    2. REPLICATE: the data parallel style that replicates all model parameters.
+                  This is similar to the behavior of DistributedDataParallel.
+    3. FULLY_SHARD: the data parallel style that shards all model parameters. This
+                    is similar to the behavior of FullyShardedDataParallel (ZeRO-3);
+                    the difference is that FullyShardedDataParallel shards the model
+                    using FlatParameter-based sharding, while this style shards
+                    each parameter into a DTensor.
+    """
+
+    DEFAULT = 0
+    REPLICATE = 1
+    FULLY_SHARD = 2
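+
+    # Hedged note (illustration only): mark_data_parallel_shardings and
+    # partition_data_parallel below map REPLICATE to a Replicate() placement on
+    # each parameter (DDP-like) and FULLY_SHARD to Shard(0) (per-parameter
+    # FSDP-like); DEFAULT is not implemented yet in this module.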
+
+
+class NodeType(Enum):
+    """NodeType is an enum that records the type of the tensors in the graph.
+
+    This is used to determine the data parallel strategy.
+    """
+
+    PARAM = 0
+    ACT = 1
+    GRAD = 2
+    STATE = 3
+    NON_TENSOR = 4  # NON_TENSOR tags non-tensor nodes (e.g. the graph output)
+
+
+class DataParallelStrategy(OpStrategy):
+    """DataParallelStrategy is a special case of OpStrategy that only records the "data parallel style" placement
+    strategy for each fx Node.
+
+    It takes a list of PlacementStrategy, where each PlacementStrategy describes
+    one way to distribute the tensor and computation. In the DataParallel case,
+    there are two possible ways to distribute the parameters:
+        1. replicate the parameter over a set of devices (DDP like behavior)
+        2. shard the parameter on its tensor dimension 0 over a set of devices
+           (FSDP like behavior).
+
+    In addition to the strategy list, we also need to:
+    1. `node_type`: record the type of each node in the graph, so that we can
+        determine how to propagate in a data parallel fashion.
+    2. `reduction_over_batch` is specifically tied to data parallel, as the loss
+        calculation usually results in a scalar tensor that comes from a
+        reduction over the batch dimension. We need to know this information
+        so that we can keep the output sharded.
+    """
+
+    def __init__(
+        self,
+        node_type: NodeType,
+        strategy_list: List[PlacementStrategy],
+        reduction_over_batch: bool = False,
+    ):
+        super().__init__(strategy_list)
+        self.node_type = node_type
+        self.reduction_over_batch = reduction_over_batch
+
+    def __str__(self) -> str:
+        return f"type: {self.node_type}, {super().__str__()}"
+
+
+@contextmanager
+def gradients_tagging(params: Dict[str, torch.Tensor]):
+    """Tag the gradient of the parameters with a special tag, so that we can identify them during SPMD expansion.
+
+    It's safe to trace those hooks and we would remove those nodes later.
+    """
+    tagging_hooks = []
+    try:
+        for p in params.values():
+            h = p.register_hook(torch.ops._spmd.tag_grad)
+            tagging_hooks.append(h)
+        yield
+    finally:
+        # remove those hooks after tracing
+        for h in tagging_hooks:
+            h.remove()
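+
+
+# Hedged usage sketch (illustration only, not part of the upstream module): a
+# hypothetical caller would wrap the backward pass so that every parameter
+# gradient flows through the traceable torch.ops._spmd.tag_grad op while the
+# hooks are alive.
+def _example_gradients_tagging(model: nn.Module, loss: torch.Tensor) -> None:
+    params = dict(model.named_parameters())
+    with gradients_tagging(params):
+        loss.backward()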
+
+
+def _gen_shard_strategy(
+    mesh: DeviceMesh, shard_dim: int, input_specs: Optional[List[DTensorSpec]] = None
+) -> PlacementStrategy:
+    """Util function to generate a shard strategy on shard_dim."""
+    return PlacementStrategy(
+        output_specs=DTensorSpec(mesh=mesh, placements=(Shard(shard_dim),)),
+        input_specs=input_specs,
+    )
+
+
+def _gen_replicate_strategy(
+    mesh: DeviceMesh, input_specs: Optional[List[DTensorSpec]] = None
+) -> PlacementStrategy:
+    """Util function to generate a replicate strategy."""
+    return PlacementStrategy(
+        output_specs=DTensorSpec(mesh=mesh, placements=(Replicate(),)),
+        input_specs=input_specs,
+    )
+
+
+def _gen_partial_strategy(mesh: DeviceMesh) -> PlacementStrategy:
+    """Util function to generate a partial strategy."""
+    # NOTE: we use AVG by default. Whether avg reduction is needed depends on
+    # the loss function; for most loss functions it should do
+    # gradient averaging. There are certain cases where it should
+    # not do gradient averaging (i.e. sum), but they are pretty rare.
+    # TODO: Only NCCL supports AVG, so using a backend like Gloo would
+    # crash; we should figure out a way to support avg reduction
+    # for non-NCCL backends
+    reduce_op = c10d.ReduceOp.AVG  # type: ignore[attr-defined]
+    return PlacementStrategy(
+        output_specs=DTensorSpec(mesh=mesh, placements=(_Partial(reduce_op),)),
+    )
+
+
+def build_data_parallel_strategies(
+    train_step_graph: GraphModule,
+    num_params: int,
+    num_states: int,
+    mesh: DeviceMesh,
+    batch_dim: int = 0,
+) -> Dict[fx.Node, StrategyType]:
+    """Loop through the train step graph and build the data parallel strategy for each fx Node."""
+    activation_idx = num_params + num_states
+    non_compute_ops = [
+        aten.clone.default,
+        aten.detach.default,
+        aten.ones_like.default,
+        aten.reshape.default,
+        aten.t.default,
+        aten.view.default,
+        torch.ops._spmd.tag_grad.default,
+        operator.getitem,
+    ]
+
+    tuple_strategy_ops = [aten._fused_adam.default]
+
+    dp_strategy_map: Dict[fx.Node, StrategyType] = {}
+    batch_dim_analyzer = BatchDimAnalyzer(batch_dim)
+    placeholder_idx = 0
+    num_param_grad = 0
+
+    # first we propagate backward to mark the param gradient sharding with the
+    # help of the tag_grad nodes, and then delete the tag_grad nodes
+    for node in reversed(list(train_step_graph.graph.nodes)):
+        # find a param_grad node via the tagging
+        if node.target == torch.ops._spmd.tag_grad.default:
+            cur_node = node
+            while cur_node.target in non_compute_ops:
+                cur_node = cur_node.args[0]
+                partial_strategy = _gen_partial_strategy(mesh)
+                dp_strategy_map[cur_node] = DataParallelStrategy(
+                    NodeType.GRAD, [partial_strategy]
+                )
+            num_param_grad += 1
+            # remove the tag_grad node from graph
+            node.replace_all_uses_with(node.args[0])
+            train_step_graph.graph.erase_node(node)
+
+            if num_param_grad == num_params:
+                # early break if we have already processed all param_grads
+                break
+
+    # next we propagate forward to mark all the shardings
+    for node in train_step_graph.graph.nodes:
+        if node.op == "placeholder":
+            if "val" not in node.meta:
+                # NOTE: There are certain cases where the placeholder nodes do
+                # not have real tensor values:
+                # 1. optimizer states can sometimes be None, e.g. SGD with
+                #    no momentum populates the `momentum` state as None, so the
+                #    full graph we get from `compile` would have None as the
+                #    placeholder value
+                # 2. function args might not only contain params or activations,
+                #    but also other non-tensor inputs, e.g. the model and
+                #    optimizer instances baked in as placeholders, and there might
+                #    also be scalar arguments which are not tensors
+                #
+                # For the above cases, we create a NON_TENSOR strategy so that we
+                # know it's not a tensor and we don't need to shard it
+                dp_strategy_map[node] = DataParallelStrategy(NodeType.NON_TENSOR, [])
+
+            elif placeholder_idx < num_params:
+                # during compilation there's an assumption that the first num_params
+                # placeholders should be parameters
+                shard_strategy = _gen_shard_strategy(mesh, 0)
+                replica_strategy = _gen_replicate_strategy(mesh)
+                dp_strategy_map[node] = DataParallelStrategy(
+                    NodeType.PARAM, [replica_strategy, shard_strategy]
+                )
+
+            elif placeholder_idx < activation_idx:
+                # optimizer states follow the same strategy as
+                # the corresponding parameters
+                replica_strategy = _gen_replicate_strategy(mesh)
+                shard_strategy = _gen_shard_strategy(mesh, 0)
+
+                dp_strategy_map[node] = DataParallelStrategy(
+                    NodeType.STATE, [replica_strategy, shard_strategy]
+                )
+            else:
+                activation_batch_dim_size = node.meta["val"].shape[batch_dim]
+                # find the first activation node and use its batch dim size
+                if batch_dim_analyzer.batch_dim_size == -1:
+                    batch_dim_analyzer.init_batch_dim_size(activation_batch_dim_size)
+
+                batch_dim_analyzer.set_batch_dim(node, batch_dim)
+                shard_strategy = _gen_shard_strategy(mesh, batch_dim)
+                dp_strategy_map[node] = DataParallelStrategy(
+                    NodeType.ACT, [shard_strategy]
+                )
+            placeholder_idx += 1
+        elif node.op == "call_function":
+            # Annotate node types for the computation graph
+            # Data Parallel node propagation logic:
+            # param (non-compute) -> out: param
+            # grad (non-compute before/after) -> out: grad
+            # state -> output: state
+            #
+            # param + activation (param must be replicate, act be sharded) -> out: activation
+            # param/state + grad (param/state/grad be the same spec) -> out: param/state
+            # param + state -> out: param
+
+            if node.target in non_compute_ops:
+                # At this point, we should have removed all the `tag_grad` nodes in the graph
+                assert node.target != torch.ops._spmd.tag_grad.default
+
+                input_nodes = node.all_input_nodes
+                assert (
+                    len(input_nodes) == 1
+                ), f"non-compute op only support one input now, found node: {node} with length of inputs: {len(node.args)}"
+                arg_strategy = dp_strategy_map[input_nodes[0]]
+
+                if node.target == operator.getitem:
+                    # for getitem call, just forward the strategy from the input
+                    getitem_idx = node.args[1]
+                    if isinstance(arg_strategy, TupleStrategy):
+                        # for tuple strategy, we need to get the child strategy from the tuple
+                        dp_strategy_map[node] = arg_strategy.childs[getitem_idx]
+                    else:
+                        # if it's not a tuple strategy, we just forward the arg strategy
+                        dp_strategy_map[node] = arg_strategy
+                else:
+                    assert isinstance(arg_strategy, DataParallelStrategy)
+                    arg_node_type = arg_strategy.node_type
+                    if arg_node_type == NodeType.PARAM:
+                        replica_strategy = _gen_replicate_strategy(mesh)
+                        dp_strategy_map[node] = DataParallelStrategy(
+                            NodeType.PARAM, [replica_strategy]
+                        )
+                    elif arg_node_type == NodeType.GRAD:
+                        partial_sig = _gen_partial_strategy(mesh)
+                        dp_strategy_map[node] = DataParallelStrategy(
+                            NodeType.GRAD, [partial_sig]
+                        )
+                    elif arg_node_type == NodeType.ACT:
+                        arg_node_spec = batch_dim_analyzer.compute_act_spec(
+                            input_nodes[0], mesh
+                        )
+
+                        output_spec = batch_dim_analyzer.compute_act_spec(node, mesh)
+
+                        shard_strategy = PlacementStrategy(
+                            output_specs=output_spec, input_specs=[arg_node_spec]
+                        )
+                        dp_strategy_map[node] = DataParallelStrategy(
+                            NodeType.ACT, [shard_strategy]
+                        )
+                    else:
+                        raise RuntimeError(
+                            f"non compute op not supporting {arg_node_type}! "
+                        )
+
+                # finished processing this non-compute node
+                continue
+
+            # for computation nodes, we need to check all the inputs
+            input_args = node.all_input_nodes
+            input_specs = []
+            if node in dp_strategy_map:
+                # found a param_grad node that already has a pre-filled output spec;
+                # fill in the expected input specs for the pre-filled strategy
+                node_strategy = dp_strategy_map[node]
+                assert isinstance(node_strategy, DataParallelStrategy)
+                node_type = node_strategy.node_type
+                assert node_type == NodeType.GRAD
+                produce_param_grad_strat = node_strategy.strategies
+                has_activation = False
+                for arg in input_args:
+                    arg_strategy = dp_strategy_map[arg]
+                    assert isinstance(arg_strategy, DataParallelStrategy)
+                    arg_node_type = arg_strategy.node_type
+                    if arg_node_type == NodeType.ACT:
+                        # activation sharded
+                        has_activation = True
+                        act_spec = batch_dim_analyzer.compute_act_spec(arg, mesh)
+
+                        input_specs.append(act_spec)
+
+                if has_activation:
+                    assert len(produce_param_grad_strat) == 1
+                    produce_param_grad_strat[0].input_specs = input_specs
+            elif node.target in tuple_strategy_ops:
+                # ops that need to build tuple strategy instead of normal strategy
+                # This should happen rarely and is only needed when we need to generate
+                # different node strategies for multiple outputs (e.g. the fused_adam op)
+                # TODO: Currently this specializes to fused optimizer ops, but we need
+                # to see how to generalize this strategy building logic
+                output_strategy_len = len(node.args) - 1
+                tuple_strategies = []
+                for i in range(output_strategy_len):
+                    if not isinstance(node.args[i], list):
+                        raise RuntimeError(
+                            f"Expecting list as arg to build Tuple Strategy, but found type {type(node.args[i])}!"
+                        )
+                    # for list/tuple arg, use the first one to find out the node type
+                    if len(node.args[i]) > 0:
+                        arg_strategy = dp_strategy_map[node.args[i][0]]
+                        assert isinstance(arg_strategy, DataParallelStrategy)
+                        assert arg_strategy.node_type in [
+                            NodeType.PARAM,
+                            NodeType.GRAD,
+                            NodeType.STATE,
+                        ], "Expecting param/grad/state as arg to build Tuple Strategy!"
+                        replica_strategy = _gen_replicate_strategy(mesh)
+                        shard_strategy = _gen_shard_strategy(mesh, shard_dim=0)
+                        out_node_strategy: StrategyType = DataParallelStrategy(
+                            arg_strategy.node_type, [replica_strategy, shard_strategy]
+                        )
+
+                        tuple_strategies.append(out_node_strategy)
+
+                output_tuple_strategy = TupleStrategy(tuple(tuple_strategies))
+                dp_strategy_map[node] = output_tuple_strategy
+            else:
+                # NOTE: This is the common region for all regular computation ops
+
+                input_node_types = [
+                    cast(DataParallelStrategy, dp_strategy_map[arg]).node_type
+                    for arg in input_args
+                    if isinstance(dp_strategy_map[arg], DataParallelStrategy)
+                ]
+                if NodeType.GRAD in input_node_types:
+                    # param/state + grad, build up acceptable strategy
+                    # the strategy should be the same for all the inputs/outputs
+                    # TODO: optimizer parts should follow the dtensor prop logic
+                    # to support more general cases that allow optimizer states
+                    # to have different shardings compared to the params
+                    replica_strategy = _gen_replicate_strategy(mesh)
+                    shard_strategy = _gen_shard_strategy(mesh, shard_dim=0)
+                    output_node_type = NodeType.PARAM
+
+                    non_grad_types = [t for t in input_node_types if t != NodeType.GRAD]
+
+                    output_node_type = non_grad_types[0]
+                    for non_grad_type in non_grad_types:
+                        assert (
+                            non_grad_type == output_node_type
+                        ), f"Found more than one non grad types! Expect {output_node_type} but found {non_grad_type}!"
+                    assert output_node_type in [
+                        NodeType.PARAM,
+                        NodeType.STATE,
+                    ], f"Expecting output node type to be either state or param, but found {output_node_type}!"
+
+                    dp_strategy_map[node] = DataParallelStrategy(
+                        output_node_type, [replica_strategy, shard_strategy]
+                    )
+                elif NodeType.STATE in input_node_types:
+                    # either param + state or state + state
+                    replica_strategy = _gen_replicate_strategy(mesh)
+                    shard_strategy = _gen_shard_strategy(mesh, shard_dim=0)
+                    output_node_type = (
+                        NodeType.PARAM
+                        if NodeType.PARAM in input_node_types
+                        else NodeType.STATE
+                    )
+
+                    dp_strategy_map[node] = DataParallelStrategy(
+                        output_node_type, [replica_strategy, shard_strategy]
+                    )
+                elif NodeType.PARAM in input_node_types:
+                    if NodeType.ACT in input_node_types:
+                        # param + activation, build up acceptable strategy
+                        # param must be replicated, activation must be sharded
+                        for arg in input_args:
+                            arg_strategy = dp_strategy_map[arg]
+                            assert isinstance(arg_strategy, DataParallelStrategy)
+                            node_type = arg_strategy.node_type
+                            if node_type == NodeType.ACT:
+                                # compute activation spec
+                                act_spec = batch_dim_analyzer.compute_act_spec(
+                                    arg, mesh
+                                )
+
+                                input_specs.append(act_spec)
+                            elif node_type == NodeType.PARAM:
+                                # param must be replicated
+                                input_specs.append(
+                                    DTensorSpec(mesh=mesh, placements=(Replicate(),))
+                                )
+                            else:
+                                raise RuntimeError(
+                                    f"Expecting node with parameter and activation, but found {input_node_types}! "
+                                )
+                        # produce activation type sharding for output
+                        output_spec = batch_dim_analyzer.compute_act_spec(node, mesh)
+
+                        act_strategy = PlacementStrategy(
+                            output_specs=output_spec, input_specs=input_specs
+                        )
+
+                        dp_strategy_map[node] = DataParallelStrategy(
+                            NodeType.ACT, [act_strategy]
+                        )
+                    else:
+                        # If inputs only have parameters, the
+                        # strategy of this node should follow input
+                        dp_strategy_map[node] = dp_strategy_map[input_args[0]]
+                else:
+                    # If the input nodes have no PARAM/GRAD/STATE, then this
+                    # should be a pure activation computation and it should
+                    # produce an activation output.
+                    # Activations are usually sharded, unless the model creates
+                    # new tensors during computation; depending on whether such a
+                    # new tensor is associated with a batch dim or not, it could
+                    # be shard/replicate/partial, and the batch dim analyzer tells
+                    # us the correct sharding.
+                    for arg in input_args:
+                        arg_strategy = dp_strategy_map[arg]
+                        assert isinstance(arg_strategy, DataParallelStrategy)
+                        input_spec = batch_dim_analyzer.compute_act_spec(arg, mesh)
+
+                        input_specs.append(input_spec)
+
+                    act_spec = batch_dim_analyzer.compute_act_spec(node, mesh)
+                    op_strategy = PlacementStrategy(
+                        output_specs=act_spec, input_specs=input_specs
+                    )
+                    dp_strategy_map[node] = DataParallelStrategy(
+                        NodeType.ACT, [op_strategy]
+                    )
+
+        elif node.op == "output":
+            dp_strategy_map[node] = DataParallelStrategy(NodeType.NON_TENSOR, [])
+        else:
+            raise RuntimeError(f"op code {node.op} not supported")
+
+    return dp_strategy_map  # type: ignore[return-value]
+
+
+def mark_data_parallel_shardings(
+    train_step_graph: GraphModule,
+    num_parameters: int,
+    num_states: int,
+    dp_strategy_map: Dict[fx.Node, StrategyType],
+    parallel_mode: DataParallelStyle = DataParallelStyle.FULLY_SHARD,
+) -> None:
+    """Mark the sharding for the nodes in the train_step_graph."""
+    activation_idx = num_parameters + num_states
+    placeholder_idx = 0
+    for node in train_step_graph.graph.nodes:
+        node_strategy = dp_strategy_map[node]
+        if node.op == "placeholder":
+            assert isinstance(node_strategy, DataParallelStrategy)
+            node_type = node_strategy.node_type
+            node_strategies = node_strategy.strategies
+            if node_type == NodeType.NON_TENSOR:
+                # set node sharding to None
+                node_sharding = None
+            elif placeholder_idx < activation_idx:
+                assert len(node_strategies) > 0, "node_strategies should not be empty"
+                if parallel_mode == DataParallelStyle.REPLICATE:
+                    # set to replicate for replicate style
+                    node_sharding = node_strategies[0]
+                elif parallel_mode == DataParallelStyle.FULLY_SHARD:
+                    # set to shard for fully shard style
+                    if len(node_strategies) == 1:
+                        # only one strategy, use that one,
+                        # e.g. optimizer state steps can only be replicated
+                        node_sharding = node_strategies[0]
+                    else:
+                        # use the full sharding strategy
+                        node_sharding = node_strategies[1]
+                elif parallel_mode == DataParallelStyle.DEFAULT:
+                    # TODO: add support for default mode
+                    # default mode would generate either replicate or shard
+                    raise NotImplementedError("default mode not implemented")
+            else:
+                assert len(node_strategies) > 0, "node_strategies should not be empty"
+                # mark activation as sharded on batch dim
+                node_sharding = node_strategies[0]
+
+            node.meta["sharding"] = node_sharding  # type: ignore[possibly-undefined]
+
+            placeholder_idx += 1
+        elif node.op == "call_function":
+            if isinstance(node_strategy, TupleStrategy):
+                # A tuple strategy in data parallel mode should have the same strategy
+                # for all tuple elements; assert that, then use the first element's strategy as the sharding
+                first_strategy = cast(DataParallelStrategy, node_strategy.childs[0])
+                for child_strategy in node_strategy.childs:
+                    assert isinstance(child_strategy, DataParallelStrategy)
+                    assert child_strategy.strategies == first_strategy.strategies
+
+                node_strategies = first_strategy.strategies
+            else:
+                assert isinstance(node_strategy, DataParallelStrategy)
+                node_strategies = node_strategy.strategies
+
+            assert (
+                len(node_strategies) <= 2
+            ), "data parallel should have at most 2 strategies"
+            if len(node_strategies) == 1:
+                node.meta["sharding"] = node_strategies[0]
+            elif len(node_strategies) == 2:
+                if parallel_mode == DataParallelStyle.REPLICATE:
+                    # set to replicate for replicate style
+                    node.meta["sharding"] = node_strategies[0]
+                elif parallel_mode == DataParallelStyle.FULLY_SHARD:
+                    # set to shard for fully shard style
+                    node.meta["sharding"] = node_strategies[1]
+                else:
+                    raise RuntimeError("default mode not supported yet!")
+            else:
+                raise RuntimeError(
+                    f"node {node} strategy length {len(node_strategies)} is not expected!"
+                )
+        elif node.op == "output":
+            assert (
+                isinstance(node_strategy, DataParallelStrategy)
+                and node_strategy.node_type == NodeType.NON_TENSOR
+            ), "output node should not be tensor"
+            node.meta["sharding"] = None
+        else:
+            raise RuntimeError(f"op code {node.op} not supported")
+
+
+def _partition_val(val: Any, spec: DTensorSpec) -> Any:
+    """Util function to convert a full tensor val to its local component."""
+    if isinstance(val, torch.Tensor):
+        local_shard = val
+        if val.ndim == 0:
+            # If it's already a scalar tensor, it is already local, we don't
+            # need to do anything
+            return local_shard
+
+        for idx, placement in enumerate(spec.placements):
+            if placement.is_shard():
+                placement = cast(Shard, placement)
+                num_chunks = spec.mesh.size(mesh_dim=idx)
+                my_coord = spec.mesh.get_coordinate()
+                assert my_coord is not None, "current rank not in mesh!"
+                my_coord_on_mesh_dim = my_coord[idx]
+                local_shard = placement._split_tensor(
+                    local_shard, num_chunks, with_padding=False, contiguous=False
+                )[0][my_coord_on_mesh_dim]
+        return local_shard
+    elif isinstance(val, (tuple, list)):
+        return val.__class__(_partition_val(v, spec) for v in val)
+    else:
+        raise RuntimeError(f"val type {type(val)} not supported")
+
+
+def partitioner(graph: GraphModule) -> GraphModule:
+    """Graph partitioner that partitions the single device graph to distributed graph."""
+    shape_adjustment_ops = {
+        aten._unsafe_view.default: 1,
+        aten.expand.default: 1,
+        aten.new_zeros.default: 1,
+        aten.ones.default: 0,
+        aten.reshape.default: 1,
+        aten.view.default: 1,
+        aten.zeros.default: 0,
+    }
+    # partition the graph to distributed
+    for node in graph.graph.nodes:
+        node_sharding = node.meta["sharding"]
+        # None sharding means this node doesn't need sharding
+        if node_sharding is None:
+            continue
+
+        if node.op == "placeholder":
+            out_spec = node_sharding.output_spec
+            if not hasattr(out_spec, "from_local"):
+                local_val = _partition_val(node.meta["val"], out_spec)
+                # update node value
+                node.meta["val"] = local_val
+        elif node.op == "call_function":
+            out_spec = node_sharding.output_spec
+
+            # check if there's misaligned sharding, insert reshard if there is
+            expected_input_specs = node_sharding.input_specs
+            for idx, input_arg in enumerate(node.all_input_nodes):
+                input_arg_sharding = input_arg.meta["sharding"]
+
+                input_arg_spec = input_arg_sharding.output_spec
+                desired_spec = (
+                    out_spec
+                    if expected_input_specs is None
+                    else expected_input_specs[idx]
+                )
+                if input_arg_spec != desired_spec:
+                    input_arg_spec.tensor_meta = input_arg.meta["tensor_meta"]
+                    desired_spec.tensor_meta = input_arg.meta["tensor_meta"]
+                    input_arg_tensor = input_arg.meta["val"]
+
+                    # insert reshard operation
+                    def reshard_fn(local_tensor: torch.Tensor) -> torch.Tensor:
+                        return redistribute_local_tensor(
+                            local_tensor,
+                            input_arg_spec,
+                            desired_spec,
+                        )
+
+                    reshard_gm = make_fx(reshard_fn)(input_arg_tensor)
+                    reshard_gm_nodes = list(reshard_gm.graph.nodes)
+                    input_node = reshard_gm_nodes[0]
+                    with graph.graph.inserting_before(node):
+                        output_node = graph.graph.graph_copy(
+                            reshard_gm.graph,
+                            val_map={
+                                input_node: input_arg,
+                            },
+                        )
+                    node.replace_input_with(input_arg, output_node)
+
+            output_val = node.meta["val"]
+
+            if node.target == torch.ops.aten.repeat.default:
+                # for repeat op, we need to infer the repeat sizes
+                assert isinstance(output_val, torch.Tensor)
+                local_shape = compute_local_shape(
+                    output_val.shape, out_spec.mesh, out_spec.placements
+                )
+                input_shape = node.args[0].meta["val"].shape
+
+                def infer_repeat_sizes(repeated_shape, input_shape):
+                    repeated_size = [1] * len(repeated_shape)
+                    padded_length = len(repeated_shape) - len(input_shape)
+                    for i in range(len(repeated_shape)):
+                        if i < padded_length:
+                            repeated_size[i] = repeated_shape[i]
+                        else:
+                            repeated_size[i] = (
+                                repeated_shape[i] // input_shape[i - padded_length]
+                            )
+
+                    return repeated_size
+
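+                # Worked example (illustration only): infer_repeat_sizes([2, 6], [2, 3])
+                # returns [1, 2] (2 // 2 and 6 // 3), while a padded call such as
+                # infer_repeat_sizes([5, 2, 6], [2, 3]) returns [5, 1, 2], taking the
+                # extra leading dim verbatim.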
+                node.update_arg(1, infer_repeat_sizes(local_shape, input_shape))
+
+            elif node.target in shape_adjustment_ops:
+                # for view-related ops that take a shape argument, adjust the shape to the local shape if needed
+                assert isinstance(output_val, torch.Tensor)
+                local_shape = compute_local_shape(
+                    output_val.shape, out_spec.mesh, out_spec.placements
+                )
+                shape_arg_num = shape_adjustment_ops[node.target]
+                node.update_arg(shape_arg_num, local_shape)
+
+            # convert output val to its local component
+            node.meta["val"] = _partition_val(output_val, out_spec)
+
+        elif node.op == "output":
+            break
+        else:
+            raise RuntimeError(f"op code {node} not supported")
+
+    # clean up the graph by removing sharding and partitioning related metadata
+    for node in graph.graph.nodes:
+        if "sharding" in node.meta:
+            del node.meta["sharding"]
+        if "val" in node.meta and isinstance(node.meta["val"], torch.Tensor):
+            local_tensor_meta = _extract_tensor_metadata(node.meta["val"])
+            node.meta["tensor_meta"] = local_tensor_meta
+
+    graph.graph.lint()
+    graph.recompile()
+    return graph
+
+
+def partition_data_parallel(
+    graph: GraphModule,
+    model: nn.Module,
+    optimizer: Optional[torch.optim.Optimizer],
+    params_buffers: Dict[str, torch.Tensor],
+    named_states: Dict[str, Any],
+    args: Tuple[Any, ...],
+    kwargs: Dict[str, Any],
+    mesh: DeviceMesh,
+    parallel_style: DataParallelStyle,
+    input_batch_dim: int,
+) -> GraphModule:
+    """Partition the graph to into a data parallel graph.
+
+    This function also shards/replicates the model parameters and optimizer states to DTensors.
+    """
+    num_params_buffers = len(params_buffers)
+    flattened_states = pytree.tree_leaves(named_states)
+    num_states = len(flattened_states)
+
+    changed = graph.graph.eliminate_dead_code()
+    if changed:
+        graph.recompile()
+
+    # 1. First build up data parallel strategies for the whole graph
+    strategy_map = build_data_parallel_strategies(
+        graph, num_params_buffers, num_states, mesh=mesh, batch_dim=input_batch_dim
+    )
+
+    # 2. Next we mark the data parallel strategy for each node based on
+    #    the parallel_style
+    mark_data_parallel_shardings(
+        graph,
+        num_parameters=num_params_buffers,
+        num_states=num_states,
+        dp_strategy_map=strategy_map,
+        parallel_mode=parallel_style,
+    )
+
+    # 3. Partition the single-machine graph into the distributed graph
+    partitioned_graph = partitioner(graph)
+
+    # preserve node types for the expanded graph
+    for node in partitioned_graph.graph.nodes:
+        if node in strategy_map:
+            node_strategy = strategy_map[node]
+            if isinstance(node_strategy, DataParallelStrategy):
+                node.meta["node_type"] = node_strategy.node_type
+            elif isinstance(node_strategy, TupleStrategy):
+                node.meta["node_type"] = NodeType.NON_TENSOR
+            else:
+                raise RuntimeError(f"Unknown node strategy {node_strategy}")
+        else:
+            # if the nodes are expanded nodes (collectives), we mark them
+            # with the same type as the input node.
+            input_node = node.all_input_nodes[0]
+            node.meta["node_type"] = input_node.meta["node_type"]
+
+    # 4. Last, in-place partition the weights and optim states into
+    #    DTensors based on the parallel style
+    accessor = NamedMemberAccessor(model)
+    for param_key, param in params_buffers.items():
+        placement: Placement = Replicate()
+        if parallel_style == DataParallelStyle.FULLY_SHARD:
+            placement = Shard(0)
+        elif parallel_style != DataParallelStyle.REPLICATE:
+            raise RuntimeError(f"parallel style {parallel_style} not supported yet")
+
+        dtensor_param = distribute_tensor(param, mesh, [placement])
+        # update re-parameterized module param dict and optim states dict to DTensor
+        params_buffers[param_key] = dtensor_param.to_local()
+        # update module parameters to DTensor
+        accessor.set_tensor(param_key, dtensor_param)
+
+        # update the optimizer state key and values to DTensor
+        if optimizer is not None and param in optimizer.state:
+            param_states = named_states[param_key]
+            param_dtensor_states = {}
+            for state_key, state_val in param_states.items():
+                if isinstance(state_val, torch.Tensor) and state_val.ndim > 0:
+                    # shard/replicate non-scalar tensors; for scalar tensors we
+                    # don't do anything
+                    dtensor_state = distribute_tensor(state_val, mesh, [placement])
+                    param_dtensor_states[state_key] = dtensor_state
+                    param_states[state_key] = dtensor_state.to_local()
+                else:
+                    param_dtensor_states[state_key] = state_val
+
+            optimizer.state.pop(param)  # type: ignore[call-overload]
+            optimizer.state[dtensor_param] = param_dtensor_states  # type: ignore[index]
+
+    return partitioned_graph
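+
+
+# Hedged end-to-end sketch (illustration only): a caller such as the _spmd
+# compile path would (1) trace a train step into a GraphModule, (2) call
+# partition_data_parallel(graph, model, optimizer, params_buffers, named_states,
+# args, kwargs, mesh, DataParallelStyle.FULLY_SHARD, input_batch_dim=0) to get a
+# per-rank graph, and (3) execute that graph with local tensors; parameters and
+# optimizer states on the module/optimizer are swapped to DTensors in place.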
diff --git a/MLPY/Lib/site-packages/torch/distributed/_spmd/distribute.py b/MLPY/Lib/site-packages/torch/distributed/_spmd/distribute.py
new file mode 100644
index 0000000000000000000000000000000000000000..daf2ed4b9daec85b0634fefb11de32dd9fa59f2b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_spmd/distribute.py
@@ -0,0 +1,783 @@
+import logging
+import operator
+from dataclasses import dataclass
+from enum import auto, Enum
+from functools import partial
+from typing import Any, Callable, cast, Dict, List, Optional, Sequence, Tuple, Union
+
+import torch
+import torch.distributed._spmd.experimental_ops
+import torch.fx as fx
+
+from torch.distributed._spmd.comm_tensor import _get_tracer
+from torch.distributed._spmd.graph_utils import OP
+from torch.distributed._spmd.log_utils import get_logger
+
+from torch.distributed._tensor import DeviceMesh, DTensor
+from torch.distributed._tensor.op_schema import OpSchema
+from torch.distributed._tensor.placement_types import (
+    _Partial,
+    DTensorSpec,
+    Placement,
+    Replicate,
+    Shard,
+    TensorMeta,
+)
+from torch.distributed._tensor.redistribute import redistribute_local_tensor
+from torch.fx.experimental.proxy_tensor import make_fx, proxy_slot
+from torch.utils import _pytree as pytree
+from torch.utils._pytree import tree_flatten, tree_map, tree_map_only, tree_unflatten
+
+
+logger: Optional[logging.Logger] = None
+
+aten = torch.ops.aten
+
+
+class TrainingPhase(Enum):
+    FORWARD = auto()
+    BACKWARD = auto()
+
+
+@dataclass
+class Schema:
+    mesh: DeviceMesh
+    placements: List[Placement]
+
+
+@dataclass
+class DSymInt:
+    """DSymInt represents a value retrieved by a SymInt op from a DTensor.
+
+    DSymInt helps View and Factory ops to determine the placement and shape of the
+    output tensor, as those operators either do not have an input DTensor or
+    the input DTensor is insufficient to determine the output tensor's placement.
+    """
+
+    global_value: int  # value that the SymInt evaluates to
+    local_value: int  # value that this SymInt evaluates to on the local shard
+    mesh: DeviceMesh  # device mesh of the DTensor where this SymInt is retrieved from
+
+    def is_shard(self) -> bool:
+        return self.local_value != self.global_value
+
+    @classmethod
+    def from_node(cls, node: fx.Node, dtensor: DTensor) -> "DSymInt":
+        dim: int = 0
+        if node.target == aten.sym_size:
+            dim = cast(int, node.args[1])
+            return cls(
+                global_value=dtensor.size(dim),
+                local_value=dtensor.to_local().size(dim),
+                mesh=dtensor.device_mesh,
+            )
+        elif node.target == aten.sym_numel:
+            return cls(
+                global_value=dtensor.numel(),
+                local_value=dtensor.to_local().numel(),
+                mesh=dtensor.device_mesh,
+            )
+        elif node.target == aten.sym_stride:
+            dim = cast(int, node.args[1])
+            return cls(
+                global_value=dtensor.stride(dim),
+                local_value=dtensor.to_local().stride(dim),
+                mesh=dtensor.device_mesh,
+            )
+        else:
+            raise NotImplementedError(f"DSymInt does not support {node.target}")
+
+
+def _is_partial_dtensor(obj: Any) -> bool:
+    """Check if object is 1) DTensor and  2) with any placement of _Partial."""
+    if not isinstance(obj, DTensor):
+        return False
+
+    is_partial = False
+    for placement in obj.placements:
+        if isinstance(placement, _Partial):
+            is_partial = True
+            break
+
+    return is_partial
+
+
+def _dispatch_with_local_tensors(
+    op: torch._ops.OpOverload,
+    local_args: Tuple[Any, ...],
+    kwargs: Optional[Dict[str, Any]] = None,
+    specs: Optional[
+        Dict[
+            torch.Tensor,
+            Tuple[torch.Size, DeviceMesh, Sequence[Placement], Sequence[Placement]],
+        ]
+    ] = None,
+) -> Any:
+    if kwargs is None:
+        kwargs = {}
+    if specs is None:
+        specs = {}
+
+    def redistribute(arg: Any) -> Any:
+        tensor_shape, mesh, current_placement, target_placement = specs[arg]
+        tensor_meta = TensorMeta(
+            tensor_shape,
+            stride=arg.stride(),
+            dtype=arg.dtype,
+        )
+        current_spec = DTensorSpec(
+            mesh, tuple(current_placement), tensor_meta=tensor_meta
+        )
+        target_spec = DTensorSpec(
+            mesh, tuple(target_placement), tensor_meta=tensor_meta
+        )
+
+        return (
+            redistribute_local_tensor(arg, current_spec, target_spec)  # type: ignore[index]
+            if isinstance(arg, torch.Tensor) and arg in specs  # type: ignore[operator]
+            else arg
+        )
+
+    # TODO: this is broken because it won't redistribute potential tensors in the kwargs
+    return op(*tree_map(redistribute, local_args), **kwargs)
+
+
+# Figure out how to specify a type spec for the return specs value
+# without the entire structure.
+# pyre-fixme
+def _update_specs_for_redistribute(args, target_schema, redistribute):
+    # Code adapted from pack_args_kwargs_with_local_tensor
+    flatten_args, args_tree_spec = tree_flatten(args)
+    flatten_args_schema = pytree.tree_leaves(target_schema.args_schema)
+
+    specs: Dict[
+        torch.Tensor,
+        Tuple[
+            torch.Size,
+            DeviceMesh,
+            Sequence[Placement],
+            Sequence[Placement],
+        ],
+    ] = {}
+    for i, arg in enumerate(flatten_args):
+        if isinstance(arg, DTensor):
+            if redistribute:
+                specs[arg._local_tensor] = (
+                    arg.size(),
+                    flatten_args_schema[i].mesh,
+                    arg.placements,
+                    flatten_args_schema[i].placements,
+                )
+            flatten_args_schema[i] = arg._local_tensor
+
+    unflattened_args = tree_unflatten(flatten_args_schema, args_tree_spec)
+    return specs, unflattened_args
+
+
+# When no tensor redistribution is required, we only need to update non-tensor args
+# of the node according to op_schema and avoid building a GraphModule just for the
+# node.
+def _update_node_from_op_schema(node: torch.fx.Node, op_schema: OpSchema) -> None:
+    flat_args, args_tree_spec = tree_flatten(node.args)
+    flat_args_schema = pytree.tree_leaves(op_schema.args_schema)
+
+    def is_sym_int_or_int(arg: Union[int, torch.fx.Node]) -> bool:
+        if isinstance(arg, torch.fx.Node):
+            return arg.target in [
+                aten.sym_size,
+                aten.sym_numel,
+                aten.sym_stride,
+            ]
+        return isinstance(arg, int)
+
+    assert len(flat_args) == len(flat_args_schema)
+    for i, (arg, arg_schema) in enumerate(zip(flat_args, flat_args_schema)):
+        if is_sym_int_or_int(arg) and isinstance(arg_schema, int):
+            flat_args[i] = arg_schema
+
+    args = tree_unflatten(flat_args, args_tree_spec)
+    for idx, arg in enumerate(args):
+        node.update_arg(idx, arg)
+    return None
+
+
+def _remap_arg(node_to_obj: Dict[fx.Node, Any], arg: Any) -> Any:
+    if isinstance(arg, torch.fx.Node):
+        obj = node_to_obj[arg]
+        if _get_tracer():
+            # This is a shared arg, already has a tracer from previous
+            # tracing. Delete the tracer.
+            del cast(Dict[Any, Any], obj.__dict__)[proxy_slot]
+        return obj
+    else:
+        return arg
+
+
+def unpack_sizes_and_dims(
+    sizes: List[Union[DSymInt, int]], mesh: DeviceMesh
+) -> Tuple[List[int], List[Placement]]:
+    local_sizes: List[int] = [
+        s.local_value if isinstance(s, DSymInt) else s for s in sizes
+    ]
+    placements: List[Placement] = [
+        Shard(i)
+        for i, a in enumerate(sizes)
+        if (isinstance(a, DSymInt) and a.is_shard())
+    ] or [Replicate()]
+
+    assert len(placements) == mesh.ndim, (
+        f"The number of sharded dimensions ({len(placements)}) must "
+        f"match number of dimensions in device mesh ({mesh.ndim})."
+    )
+
+    return local_sizes, placements
+
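+# An illustrative sketch (not part of the original module) of what
+# unpack_sizes_and_dims above produces. Assume a 1-D mesh and a size list
+# whose first entry, ``dsym_batch``, is a hypothetical DSymInt sharded on
+# mesh dim 0 with local value 4:
+#
+#   local_sizes, placements = unpack_sizes_and_dims([dsym_batch, 16], mesh)
+#   # local_sizes == [4, 16]
+#   # placements  == [Shard(0)]   (falls back to [Replicate()] if nothing is sharded)
+#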
+
+def binop_sym_int_consumer_rule(node: fx.Node, args: Tuple[Any, ...]) -> DTensor:
+    assert len(args) == 2, f"Expect two args but got op {node.target} with args {args}"
+    assert isinstance(
+        args[0], DTensor
+    ), f"Expect 1st argument to be DTensor but got {args[0]}"
+    assert isinstance(args[1], list), f"Expect 2nd argument as list but got {args[1]}"
+
+    # extract sharded dimensions in the size list, the output DTensor should
+    # follow these placements.
+    local_sizes, placements = unpack_sizes_and_dims(args[1], args[0].device_mesh)
+
+    # set node args to real int sizes.
+    node.args = (node.args[0], local_sizes)
+    op = cast(torch._ops.OpOverload, node.target)
+    return DTensor.from_local(
+        local_tensor=op(args[0]._local_tensor, local_sizes),
+        device_mesh=args[0].device_mesh,
+        placements=placements,
+        run_check=False,
+    )
+
+
+def slice_backward_sym_int_consumer_rule(
+    node: fx.Node, args: Tuple[Any, ...]
+) -> DTensor:
+    grad_output, input_sizes, dim, start, end, step = args
+
+    local_sizes: List[int] = [
+        s.local_value if isinstance(s, DSymInt) else s for s in input_sizes
+    ]
+
+    input_tensor = torch.zeros(
+        local_sizes, device=grad_output.device, dtype=grad_output.dtype
+    )
+    return DTensor.from_local(
+        local_tensor=torch.slice_scatter(
+            input_tensor, grad_output.to_local(), dim, start, end, step
+        ),
+        device_mesh=grad_output.device_mesh,
+        placements=grad_output.placements,
+        run_check=False,
+    )
+
+
+def factory_with_sizes_rule(
+    node: fx.Node,
+    args: Tuple[Any, ...],
+    kwargs: Dict[str, Any],
+    default_mesh: DeviceMesh,
+) -> DTensor:
+    flat_args = pytree.arg_tree_leaves(*args)
+    assert not any(isinstance(a, DTensor) for a in flat_args), (
+        f"Not expect DTensor argument for factory op, but got {node.target} "
+        f"with arguments {args}."
+    )
+    assert isinstance(args[0], list), f"Expect 2nd argument as list but got {args[1]}"
+
+    local_sizes, placements = unpack_sizes_and_dims(args[0], default_mesh)
+    node.args = (local_sizes, *args[1:])
+    op = cast(torch._ops.OpOverload, node.target)
+    return DTensor.from_local(
+        local_tensor=op(*node.args, **kwargs),
+        device_mesh=default_mesh,
+        placements=placements,
+        run_check=False,
+    )
+
+
+def factory_arange_rule(
+    node: fx.Node,
+    args: Tuple[Any, ...],
+    kwargs: Dict[str, Any],
+    default_mesh: DeviceMesh,
+) -> DTensor:
+    node.args = tree_map(lambda a: a.local_value if isinstance(a, DSymInt) else a, args)
+    op = cast(torch._ops.OpOverload, node.target)
+    return DTensor.from_local(
+        local_tensor=op(*node.args, **kwargs),
+        device_mesh=default_mesh,
+        placements=[Replicate()],
+        run_check=False,
+    )
+
+
+def default_factory_op_rule(
+    node: fx.Node,
+    args: Tuple[Any, ...],
+    kwargs: Dict[str, Any],
+    default_mesh: DeviceMesh,
+) -> DTensor:
+    node.args, node.kwargs = args, kwargs
+    op = cast(torch._ops.OpOverload, node.target)
+    return DTensor.from_local(
+        local_tensor=op(*node.args, **node.kwargs),
+        device_mesh=default_mesh,
+        placements=[Replicate()],
+        run_check=False,
+    )
+
+
+# Dispatch override for view and factory ops that consume SymInt arguments,
+# where the output spec should follow dimension placement where the SymInt comes
+# from.
+VIEW_SYM_INT_CONSUMERS: Dict[torch._ops.OpOverload, Callable] = {
+    aten._unsafe_view.default: binop_sym_int_consumer_rule,
+    aten.expand.default: binop_sym_int_consumer_rule,
+    aten.slice_backward.default: slice_backward_sym_int_consumer_rule,
+    aten.view.default: binop_sym_int_consumer_rule,
+}
+
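+# An illustrative sketch (not part of the original module): with a hypothetical
+# DSymInt ``dsym_batch`` sharded on mesh dim 0, a traced call such as
+#
+#   aten.view.default(dtensor_input, [dsym_batch, 16])
+#
+# is routed to binop_sym_int_consumer_rule, which rewrites the size list to
+# local values and places the output DTensor as [Shard(0)], i.e. the output
+# follows the dimension the sharded SymInt came from.
+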
+FACTORY_SYM_INT_CONSUMERS: Dict[torch._ops.OpOverload, Callable] = {
+    aten.full.default: factory_with_sizes_rule,
+    aten.arange.default: factory_arange_rule,
+    aten.arange.start: factory_arange_rule,
+}
+
+
+# Dispatch override for factory ops, as DTensor cannot propagate sharding spec
+# without DTensor inputs.
+FACTORY_OPS: Dict[torch._ops.OpOverload, Callable] = {
+    aten.scalar_tensor.default: default_factory_op_rule,
+    aten.arange.start: default_factory_op_rule,
+    aten.zeros.default: default_factory_op_rule,
+}
+
+
+def _get_dtensor_dispatch_graph(
+    node: fx.Node,
+    node_to_obj: Dict[fx.Node, Any],
+    *,
+    force_make_fx: bool = False,
+    default_mesh: Optional[DeviceMesh] = None,
+) -> Optional[fx.GraphModule]:
+    with torch.no_grad():
+        # Args should be a list of objects post remapping.
+        args = tree_map(partial(_remap_arg, node_to_obj), node.args)
+        kwargs = tree_map(partial(_remap_arg, node_to_obj), node.kwargs)
+
+        op_overload = cast(torch._ops.OpOverload, node.target)
+
+        if any(
+            a.is_shard()
+            for a in pytree.arg_tree_leaves(*args)
+            if isinstance(a, DSymInt)
+        ):
+            if op_overload in VIEW_SYM_INT_CONSUMERS:
+                assert len(kwargs) == 0, f"Expect empty kwargs, but got {kwargs}"
+                node_to_obj[node] = VIEW_SYM_INT_CONSUMERS[op_overload](node, args)
+                return None
+            elif op_overload in FACTORY_SYM_INT_CONSUMERS:
+                assert default_mesh is not None, "Requires default mesh for factory ops"
+                node_to_obj[node] = FACTORY_SYM_INT_CONSUMERS[op_overload](
+                    node, args, kwargs, default_mesh
+                )
+                return None
+            else:
+                assert isinstance(logger, logging.Logger)
+                logger.warning(
+                    "Assuming using local_value from SymInt for %s"
+                    "is mathematically correct. Full args are %s.",
+                    op_overload,
+                    args,
+                )
+
+        if node.target == aten.view.default:
+            # HACK: get around the fact that some view operations on a
+            # "global" tensor are invalid usage, but the view operation on
+            # the batch input might still hit them, so we convert the view
+            # op to reshape before calling DTensor.
+            op_overload = aten.reshape.default
+
+        # DSymInt args are not sharded on any dimension, local value and global
+        # value should be the same
+        args = tree_map(lambda a: a.local_value if isinstance(a, DSymInt) else a, args)
+        kwargs = tree_map(
+            lambda a: a.local_value if isinstance(a, DSymInt) else a, kwargs
+        )
+
+        if op_overload in FACTORY_OPS:
+            # Don't pass factory ops to DTensor dispatch, as DTensor cannot
+            # propagate sharding spec without DTensor inputs.
+            node_to_obj[node] = FACTORY_OPS[op_overload](
+                node, args, kwargs, default_mesh
+            )
+            return None
+
+        dispatch = partial(
+            _dispatch_with_local_tensors,
+            op_overload,
+            kwargs=kwargs,
+            specs=args,
+        )
+
+        gm = make_fx(dispatch, _allow_non_fake_inputs=False)(args)
+        # FIXME(@wanchaol, @mrshenli): the above seems to accidentally capture
+        # DeviceMesh tensor ops when handling inplace operators? The ``_to_copy``
+        # is not connected to the graph output, so we use DCE to get rid of it,
+        # but this doesn't look correct.
+        #
+        # The following operators appear in the captured graph, where the dtype is
+        # torch.int64.
+        #
+        # get_attr       _tensor_constant0  _tensor_constant0         ()
+        # call_function  transpose          aten.transpose.int        (_tensor_constant0, -1, 0)
+        # call_function  view               aten.view.default         (transpose, [-1, 2])
+        # call_function  view_1             aten.view.default         (view, [2])
+        # call_function  _to_copy           aten._to_copy.default     (view_1,)
+        gm.graph.eliminate_dead_code()
+
+        return gm
+
+
+def _build_dummy_add_graph(
+    dt: DTensor, node_to_obj: Dict[fx.Node, Any]
+) -> Tuple[fx.GraphModule, Any]:
+    """Create a graph for a dummy add function from a partial DTensor.
+
+    This dummy add is used for triggering all_reduce on a Partial DTensor
+    during the DTensor expansion of the traced graph.
+    Also returns the actual DTensor after resharding.
+    """
+
+    def dummy_add(grad: torch.Tensor, zero: torch.Tensor) -> torch.Tensor:
+        return grad + zero
+
+    grad: torch.Tensor = dt._local_tensor
+    zero: torch.Tensor = torch.zeros_like(dt._local_tensor)
+
+    traced_add = make_fx(dummy_add)(grad, zero)
+
+    placeholders = [n for n in traced_add.graph.nodes if n.op == OP.PLACEHOLDER]
+    call_functions = [n for n in traced_add.graph.nodes if n.op == OP.CALL_FUNCTION]
+    assert len(placeholders) == 2
+    assert len(call_functions) == 1
+    node_to_obj[placeholders[0]] = dt
+    node_to_obj[placeholders[1]] = DTensor.from_local(
+        zero, dt.device_mesh, [Replicate()], run_check=False
+    )
+
+    traced_dispatch = _get_dtensor_dispatch_graph(
+        call_functions[0], node_to_obj, force_make_fx=True
+    )
+    assert traced_dispatch is not None
+
+    # TODO(anj): This depends on the call function node -> actual DTensor output
+    # mapping that we want to avoid for SPMD expansion
+    return traced_dispatch, node_to_obj[call_functions[0]]
+
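+# An illustrative sketch (not part of the original module) of the dummy-add
+# trick above: adding a replicated zeros tensor to a partial DTensor forces a
+# reduction of the partial operand, so the traced subgraph contains the
+# collective and its wait node (roughly):
+#
+#   partial_dt + zeros_like(partial_dt)        # Partial + Replicate
+#   -> all_reduce(local_grad); wait_tensor(...); add(...)
+#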
+
+def _convert_output(
+    gm: fx.GraphModule,
+    node: fx.Node,
+    node_to_obj: Dict[fx.Node, Any],
+) -> fx.Node:
+    new_args = []
+    has_partial = False
+    for argument in node.args[0]:  # type: ignore[union-attr]
+        if not isinstance(argument, fx.Node):
+            new_args.append(argument)
+            continue
+
+        obj = node_to_obj[argument]
+
+        if not _is_partial_dtensor(obj):
+            new_args.append(argument)
+            continue
+
+        has_partial = True
+
+        # we know it's a DTensor from the is-partial DTensor check above
+        dt = cast(DTensor, obj)
+
+        traced_dispatch, result_obj = _build_dummy_add_graph(dt, node_to_obj)
+
+        wait = [
+            n
+            for n in traced_dispatch.graph.nodes
+            if n.name == "wait_comm" or n.name == "wait_tensor"
+        ]
+        add = [n for n in traced_dispatch.graph.nodes if n.name == "add"]
+        assert len(wait) == 1 and len(add) == 1
+
+        # remove add node and replace it with wait node
+        add[0].replace_all_uses_with(wait[0])
+        traced_dispatch.graph.eliminate_dead_code()
+        # also update the actual DTensor corresponding to the node
+        # TODO(anj): We require mapping of the final DTensor output to the wait
+        # comm node.
+        node_to_obj[wait[0]] = result_obj
+
+        value_remap: Dict[fx.Node, fx.Node] = {}
+        for dtn in traced_dispatch.graph.nodes:
+            if dtn.op == OP.PLACEHOLDER:
+                # map the placeholders of the dummy-add subgraph back to the
+                # original argument node in the outer graph
+                value_remap[dtn] = argument
+            elif dtn.op == OP.OUTPUT:
+                assert (
+                    len(dtn.args) == 1 and len(dtn.args[0]) == 1
+                ), f"Expecting single output, but got {dtn.args} {len(dtn.args)}"
+                new_args.append(value_remap[dtn.args[0][0]])
+                # the concrete DTensor value of output was added when creating the
+                # inner graph (in _build_dummy_add_graph). Just add it to the final
+                # output node so that we can report the final output specs correctly.
+                # TODO(anj): We are depending on the concrete DTensor output of the dummy add.
+                node_to_obj[value_remap[dtn.args[0][0]]] = node_to_obj[dtn.args[0][0]]
+
+            else:
+                if dtn.op == OP.GET_ATTR:
+                    setattr(
+                        gm,
+                        dtn.target,
+                        getattr(traced_dispatch, dtn.target),
+                    )
+                with gm.graph.inserting_before(node):
+                    value_remap[dtn] = gm.graph.node_copy(dtn, lambda n: value_remap[n])
+    if has_partial:
+        gm.graph.erase_node(node)
+        return gm.graph.output(new_args)
+    else:
+        return node
+
+
+def _rebuild_graph(
+    gm: fx.GraphModule,
+    node_replacements: Dict[torch.fx.Node, torch.fx.GraphModule],
+) -> None:
+    # replace nodes in local traced graph with DTensor's dispatch graph
+    for node in gm.graph.nodes:
+        if node not in node_replacements:
+            continue
+
+        traced_dispatch = node_replacements[node]
+        # Map DT's dispatch graph input placeholder nodes to the ones in
+        # the local traced graph. It uses index-based accessing, which is
+        # brittle and intended for testing purposes only.
+        flatten_args = pytree.arg_tree_leaves(*node.args)
+        i, value_remap = 0, {}
+        for dtn in traced_dispatch.graph.nodes:
+            if dtn.op == OP.PLACEHOLDER:
+                value_remap[dtn] = flatten_args[i]
+                i += 1
+
+        # insert DT's dispatch graph to traced local graph.
+        with gm.graph.inserting_before(node):
+            for dtn in traced_dispatch.graph.nodes:
+                if dtn.op == OP.PLACEHOLDER:
+                    # do nothing, ignore placeholders, as they have already
+                    # been prepared in value_remap
+                    pass
+                elif dtn.op == OP.OUTPUT:
+                    assert (
+                        len(dtn.args) == 1
+                    ), f"Expecting single output, but got {dtn.args} {len(dtn.args[0])}"
+                    outputs = dtn.args[0]
+                    # we currently support two very specific types of output
+                    # 1. single output
+                    # 2. multiple outputs resulting from getitem of all elements of tuple
+                    if len(outputs) == 1:
+                        # for single output, we replace the node with the single node
+                        output = outputs[0]
+                    else:
+                        # for multiple outputs, we check that these outputs correspond
+                        # to all elements of a tuple. In that case, we replace
+                        # uses of the output directly with the original tuple
+                        source = None
+                        for i, out in enumerate(outputs):
+                            # we allow None outputs for certain items in the tuple
+                            if out is None:
+                                continue
+                            assert out.op == "call_function"
+                            assert out.target.__module__ == "_operator"
+                            assert out.target.__name__ == "getitem"
+                            assert source is None or source == out.args[0]
+                            source = out.args[0]
+                            assert out.args[1] == i
+                        assert source is not None
+                        output = source
+
+                    new_node = value_remap[output]
+                    node.replace_all_uses_with(new_node)
+                else:
+                    value_remap[dtn] = gm.graph.node_copy(dtn, lambda n: value_remap[n])
+                    if all(
+                        isinstance(n.target, torch._ops.OpOverload)
+                        and n.target._schema.name.startswith(
+                            ("aten::_foreach", "aten::_fused_adam")
+                        )
+                        for n in [dtn, node]
+                    ):
+                        # FIXME(@mrshenli): This is a temporary solution to enable
+                        # foreach ops. The problem is that foreach ops return
+                        # List[Tensor], but make_fx will flatten that before
+                        # passing those tensors to the output node, which will
+                        # introduce additional getitem nodes. These redundant
+                        # getitem nodes break graph correctness as we cannot do
+                        # getitem(getitem(foreach_out, 0), 0). This temporary
+                        # solution skips getitem nodes in DTensor-expanded
+                        # subgraphs.
+                        node.replace_all_uses_with(value_remap[dtn])
+                        break
+            # explicitly erase node instead of relying on DCE, as DCE does not
+            # remove inplace copy_ correctly.
+            gm.graph.erase_node(node)
+
+    gm.graph.eliminate_dead_code()
+    gm.recompile()
+
+
+def _get_last_consumer_to_nodes(
+    graph: fx.Graph,
+) -> Dict[fx.Node, List[fx.Node]]:
+    # Run through reverse nodes and record the first instance of a use
+    # of a given node. This represents the *last* use of the node in the
+    # execution order of the program, which we will use to free unused
+    # values
+    node_to_last_consumer: Dict[fx.Node, fx.Node] = {}
+    last_consumer_to_nodes: Dict[fx.Node, List[fx.Node]] = {}
+
+    def _register_final_consumer(arg_node: fx.Node, consumer: fx.Node) -> None:
+        if arg_node not in node_to_last_consumer:
+            node_to_last_consumer[arg_node] = consumer
+            last_consumer_to_nodes.setdefault(consumer, []).append(arg_node)
+
+    for node in reversed(graph.nodes):
+        fx.node.map_arg(
+            node.args, lambda arg_node: _register_final_consumer(arg_node, node)
+        )
+        fx.node.map_arg(
+            node.kwargs,
+            lambda kwarg_node: _register_final_consumer(kwarg_node, node),
+        )
+
+    return last_consumer_to_nodes
+
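+# An illustrative sketch (not part of the original module) of the mapping
+# returned above. For a graph where ``b = f(a)`` and ``c = g(b, a)`` and ``c``
+# is the last user of both values, the result is
+#
+#   {c: [b, a]}
+#
+# so the expansion loop below can drop the cached objects for ``b`` and ``a``
+# right after ``c`` has been processed.
+#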
+
+def _convert_to_distributed(
+    gm: fx.GraphModule,
+    inps: List[torch.Tensor],
+    schemas: List[Schema],
+    default_mesh: Optional[DeviceMesh] = None,
+    _allow_partial: bool = False,
+) -> Tuple[fx.GraphModule, Dict[str, Schema]]:
+    """Transform a graph module to a distributed graph module.
+
+    Returns:
+        - transformed graph module
+        - map from output name to the output's Schema (mesh and placements)
+
+    """
+    global logger
+    logger = get_logger("spmd_exp")
+    operators = {getattr(operator, name) for name in operator.__all__}
+    node_to_obj: Dict[fx.Node, Any] = {}
+    # map local op node in traced_f to its corresponding subgraph of
+    # DTensor ops.
+    node_replacements: Dict[torch.fx.Node, torch.fx.GraphModule] = {}
+
+    last_consumer_to_nodes = _get_last_consumer_to_nodes(gm.graph)
+
+    output_schemas: Dict[str, Schema] = {}
+    for i, node in enumerate(gm.graph.nodes):
+        assert logger is not None
+        logger.info("node%s: op=%s target=%s", i, node.op, node.target)
+        if node.op == OP.PLACEHOLDER:
+            assert i < len(
+                inps
+            ), f"got more placeholder nodes ({i + 1}) than inputs ({len(inps)})"
+
+            # our example inputs are local shards. Create DTensors from them.
+            node_to_obj[node] = DTensor.from_local(
+                inps[i].clone(),  # use clone to avoid modifications from inplace ops
+                schemas[i].mesh,
+                schemas[i].placements,
+                # prevent running this collective in backwards pass
+                run_check=False,
+            )
+        elif isinstance(node.target, torch._ops.OpOverloadPacket):
+            dtensor = cast(DTensor, node_to_obj[node.args[0]])
+            node_to_obj[node] = DSymInt.from_node(node, dtensor)
+        elif isinstance(node.target, torch._ops.OpOverload):
+            replacement = _get_dtensor_dispatch_graph(
+                node, node_to_obj, default_mesh=default_mesh
+            )
+            if replacement is not None:
+                node_replacements[node] = replacement
+        elif node.op == OP.OUTPUT:
+            if not _allow_partial:
+                # Returns an expanded dummy add node that ensures
+                # that the partial output tensor has been converted
+                # to a replicated tensor.
+                node = _convert_output(gm, node, node_to_obj)
+
+            # Save output sharding for the inputs to backward pass.
+            # TODO(anj): Pipe the output schema for the BW pass
+            # instead of requiring the full output DTensor to be
+            # materialized.
+            for inp_arg in node.args[0]:
+                if isinstance(inp_arg, fx.Node):
+                    obj = node_to_obj[inp_arg]
+                    if isinstance(obj, DTensor):
+                        output_schemas[inp_arg.name] = Schema(
+                            obj.device_mesh, obj.placements  # type: ignore[arg-type]
+                        )
+        elif node.op == OP.CALL_FUNCTION:
+            args = tree_map(partial(_remap_arg, node_to_obj), node.args)
+            kwargs = tree_map(partial(_remap_arg, node_to_obj), node.kwargs)
+
+            dsymints = list(
+                filter(lambda a: isinstance(a, DSymInt), args + tuple(kwargs.values()))
+            )
+
+            if node.target in operators and len(dsymints) > 0:
+                assert all(
+                    dsymints[0].mesh == d.mesh for d in dsymints
+                ), "all DSymInts must have the same mesh. "
+
+                local_args = tree_map_only(DSymInt, lambda a: a.local_value, args)
+                local_kwargs = tree_map_only(DSymInt, lambda a: a.local_value, kwargs)
+
+                global_args = tree_map_only(DSymInt, lambda a: a.global_value, args)
+                global_kwargs = tree_map_only(DSymInt, lambda a: a.global_value, kwargs)
+
+                node.args = local_args
+                node.kwargs = local_kwargs
+
+                node_to_obj[node] = DSymInt(
+                    local_value=node.target(*local_args, **local_kwargs),
+                    global_value=node.target(*global_args, **global_kwargs),
+                    mesh=dsymints[0].mesh,
+                )
+            else:
+                assert len(dsymints) == 0, (
+                    "SPMD expansion does not support SymInt in non-operator "
+                    f"nodes, got {node.target}."
+                )
+                node_to_obj[node] = node.target(*args, **kwargs)
+        else:
+            raise ValueError(f"Unrecognized node.op type {node.op}")
+
+        if node in last_consumer_to_nodes:
+            # Save memory by deleting objs that won't be used anymore.
+            for arg_node in last_consumer_to_nodes[node]:
+                del node_to_obj[arg_node]
+
+    _rebuild_graph(gm, node_replacements)
+
+    return gm, output_schemas
diff --git a/MLPY/Lib/site-packages/torch/distributed/_spmd/experimental_ops.py b/MLPY/Lib/site-packages/torch/distributed/_spmd/experimental_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f42c52e2bf88a0d56feec928a4b157806bb480c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_spmd/experimental_ops.py
@@ -0,0 +1,455 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+from typing import cast, List, Optional, Sequence, Tuple
+
+import torch
+from torch.distributed._tensor.op_schema import OpSchema, OutputSharding
+from torch.distributed._tensor.ops.common_rules import pointwise_rule
+from torch.distributed._tensor.ops.utils import register_prop_rule
+
+from torch.distributed._tensor.placement_types import (
+    _Partial,
+    DTensorSpec,
+    Placement,
+    Replicate,
+    Shard,
+    TensorMeta,
+)
+
+aten = torch.ops.aten  # pyre-ignore
+
+
+@register_prop_rule(  # pyre-ignore
+    [
+        aten._foreach_neg.default,
+        aten._foreach_reciprocal.default,
+        aten._foreach_sqrt.default,
+    ]
+)
+def _prop__foreach_unaop(op_schema: OpSchema) -> OutputSharding:
+    self = op_schema.args_schema[0]
+    assert isinstance(self, list) and all(isinstance(s, DTensorSpec) for s in self)
+    # FIXME(@mrshenli): for sqrt, this is only mathematically correct for
+    # Replicate and Shard tensor.
+    return OutputSharding(output_spec=self)
+
+
+@register_prop_rule(  # pyre-ignore
+    [
+        aten._foreach_add.List,
+        aten._foreach_div.List,
+        aten._foreach_mul.List,
+    ]
+)
+def _prop__foreach_binop_list(op_schema: OpSchema) -> OutputSharding:
+    self, other = op_schema.args_schema[:2]
+    scalar = None if len(op_schema.args_schema) < 3 else op_schema.args_schema[2]
+    assert isinstance(self, list) and all(
+        isinstance(s, DTensorSpec) for s in self
+    ), f"Expect a List[DTensorSpec] but got {self}"
+    assert isinstance(other, list) and all(
+        isinstance(o, DTensorSpec) for o in other
+    ), f"Expect a List[DTensorSpec] but got {other}"
+    assert len(self) == len(other), (
+        "Two tensor lists must match in length, "
+        f"but got {len(self)} and {len(other)}"
+    )
+
+    if any(s != o for s, o in zip(self, other)):
+        # If the DTensorSpecs for the two operands do not match, suggest using
+        # self's DTensorSpec. This will trigger an allreduce if other is partial
+        # and self is replicated.
+        return OutputSharding(
+            output_spec=None,
+            schema_suggestions=[
+                OpSchema(
+                    op=op_schema.op,
+                    args_schema=(self, self, scalar) if scalar else (self, self),
+                    kwargs_schema=op_schema.kwargs_schema,
+                )
+            ],
+        )
+    else:
+        return OutputSharding(output_spec=self)
+
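+# An illustrative sketch (not part of the original module) of the suggestion
+# mechanism used above: when the two spec lists disagree, the rule returns no
+# output_spec and instead suggests re-running the op with ``other`` forced to
+# follow ``self``'s specs, e.g.
+#
+#   OutputSharding(
+#       output_spec=None,
+#       schema_suggestions=[OpSchema(op, args_schema=(self, self), kwargs_schema=kwargs)],
+#   )
+#
+# which makes DTensor redistribute (e.g. allreduce a partial gradient) before
+# retrying the op.
+#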
+
+@register_prop_rule(  # pyre-ignore
+    [
+        aten._foreach_add.Scalar,
+        aten._foreach_div.Scalar,
+        aten._foreach_mul.Scalar,
+        aten._foreach_sub.Scalar,
+    ]
+)
+def _prop__foreach_binop_scalar(op_schema: OpSchema) -> OutputSharding:
+    self, scalar = op_schema.args_schema
+    assert isinstance(self, list) and all(isinstance(s, DTensorSpec) for s in self)
+    assert not isinstance(scalar, list)
+    return OutputSharding(output_spec=self)
+
+
+@register_prop_rule(  # pyre-ignore
+    [
+        aten._foreach_addcdiv.Scalar,
+        aten._foreach_addcmul.Scalar,
+    ]
+)
+def _prop__foreach_addcop_scalar(op_schema: OpSchema):
+    self, tensor1, tensor2 = op_schema.args_schema[:3]
+    scalar = None if len(op_schema.args_schema) < 4 else op_schema.args_schema[3]
+    assert isinstance(self, list) and all(isinstance(s, DTensorSpec) for s in self)
+    assert isinstance(tensor1, list) and all(isinstance(s, DTensorSpec) for s in tensor1)
+    assert isinstance(tensor2, list) and all(isinstance(s, DTensorSpec) for s in tensor2)
+    if any(s != t1 or s != t2 for s, t1, t2 in zip(self, tensor1, tensor2)):
+        # If the DTensorSpecs for the operands do not match, suggest using
+        # self's DTensorSpec. This will trigger an allreduce if an operand is
+        # partial and self is replicated.
+        return OutputSharding(
+            output_spec=None,
+            schema_suggestions=[
+                OpSchema(
+                    op=op_schema.op,
+                    args_schema=(self, self, self, scalar)
+                    if scalar
+                    else (self, self, self),
+                    kwargs_schema=op_schema.kwargs_schema,
+                )
+            ],
+        )
+    else:
+        return OutputSharding(output_spec=self)
+
+
+@register_prop_rule([aten._foreach_pow.ScalarAndTensor])  # pyre-ignore
+def _prop__foreach_pow_scalar_and_tensor(op_schema: OpSchema):
+    scalar, exponent = op_schema.args_schema
+    assert isinstance(exponent, list) and all(
+        isinstance(s, DTensorSpec) for s in exponent
+    )
+    return OutputSharding(output_spec=exponent)
+
+
+@register_prop_rule([aten._fused_adam.default])  # pyre-ignore
+def _prop__fused_adam(op_schema: OpSchema):
+    NT = 5
+    tensor_list_args: Tuple[List[DTensorSpec]] = op_schema.args_schema[:NT]  # type: ignore[assignment]
+
+    assert all(isinstance(schema, list) for schema in tensor_list_args)
+    assert all(
+        isinstance(s, DTensorSpec) for schema in tensor_list_args for s in schema
+    )
+
+    tensor_schemas: Tuple[List[DTensorSpec]] = [  # type: ignore[assignment]
+        schema for schema in tensor_list_args if len(schema)
+    ]
+
+    assert all(len(s) == len(tensor_schemas[0]) for s in tensor_schemas), (
+        "expect the same number of gradients and states, but got "
+        f"{[len(s) for s in tensor_schemas]}."
+    )
+
+    if any(any(t != ts[0] for t in ts) for ts in zip(*tensor_schemas)):
+        new_schemas: Tuple[List[DTensorSpec]] = tuple(  # type: ignore[assignment]
+            op_schema.args_schema[0] if len(s) else s for s in tensor_list_args
+        )
+        return OutputSharding(
+            output_spec=None,
+            schema_suggestions=[
+                OpSchema(
+                    op=op_schema.op,
+                    args_schema=new_schemas + op_schema.args_schema[NT:],
+                    kwargs_schema=op_schema.kwargs_schema,
+                )
+            ],
+        )
+    else:
+        return OutputSharding(output_spec=(op_schema.args_schema[0],) * NT)  # type: ignore[arg-type]
+
+
+@register_prop_rule(aten.nll_loss_forward.default)  # pyre-ignore
+def _prop_nll_loss_forward(op_schema: OpSchema) -> OutputSharding:
+    self, target = op_schema.args_schema[:2]
+    assert isinstance(self, DTensorSpec)
+    assert isinstance(target, DTensorSpec)
+    if self.placements != target.placements:
+        # Self and target must match in placements, which should be sharded along
+        # the batch dimension in data parallel use cases. Force redistribute.
+
+        # need to create a new self instead of returning (target, target), as
+        # target and self might not match in shape.
+        new_self = DTensorSpec(
+            mesh=self.mesh,
+            placements=target.placements,
+            tensor_meta=self.tensor_meta,
+        )
+        return OutputSharding(
+            output_spec=None,
+            schema_suggestions=[
+                OpSchema(
+                    op=op_schema.op,
+                    args_schema=(new_self, target) + op_schema.args_schema[2:],
+                    kwargs_schema=op_schema.kwargs_schema,
+                )
+            ],
+        )
+    else:
+        return OutputSharding(
+            output_spec=(
+                # by default, nll_loss_forward conducts a reduction and returns
+                # a scalar tensor, and hence the _Partial placements.
+                DTensorSpec(mesh=self.mesh, placements=(_Partial(),)),
+                # the 2nd output total_weight is always a scalar tensor
+                DTensorSpec(mesh=self.mesh, placements=(Replicate(),)),
+            )
+        )
+
+
+@register_prop_rule(aten.nll_loss_backward.default)  # pyre-ignore
+def _prop_nll_loss_backward(op_schema: OpSchema) -> OutputSharding:
+    grad_output, self = op_schema.args_schema[:2]
+    assert isinstance(grad_output, DTensorSpec)
+    assert isinstance(self, DTensorSpec)
+    return OutputSharding(output_spec=self)
+
+
+@register_prop_rule(aten.stack.default)
+def _prop_stack(op_schema: OpSchema) -> OutputSharding:
+    tensors = op_schema.args_schema[0]
+    dim = 0 if len(op_schema.args_schema) == 1 else cast(int, op_schema.args_schema[1])
+    assert (
+        isinstance(tensors, list) and len(tensors) > 0
+    ), "expect at least one tensor to stack"
+    assert all(
+        isinstance(t, DTensorSpec) for t in tensors
+    ), f"expect a list of DTensorSpecs, but got {tensors}"
+    assert all(
+        t.shape == tensors[0].shape for t in tensors
+    ), f"expect all tensors to have the same shape, but got {tensors}."
+    # TODO: provide schema_suggestions when placements do not match
+    assert all(
+        t.placements == tensors[0].placements for t in tensors
+    ), f"expect all tensors to have the same placements, but got {tensors}."
+    assert all(
+        not p.is_shard(dim) for p in tensors[0].placements
+    ), "DTensor does not support stack on sharded dimension."
+
+    return OutputSharding(
+        output_spec=DTensorSpec(mesh=tensors[0].mesh, placements=tensors[0].placements)
+    )
+
+
+@register_prop_rule(aten.select.int)
+def _prop_select(op_schema: OpSchema) -> OutputSharding:
+    tensor, dim = op_schema.args_schema[:2]
+    assert isinstance(tensor, DTensorSpec)
+    assert isinstance(dim, int)
+    placements: Sequence[Placement] = tensor.placements
+    assert all(
+        not p.is_shard(dim) for p in placements
+    ), "DTensor does not support select on sharded dimension."
+
+    # select will remove one dimension, decrement dim of Shard placements by 1
+    # if they are larger than dim.
+    new_placements: List[Placement] = []
+    for p in placements:
+        # Using isinstance instead of is_shard so that mypy won't complain
+        # about accessing dim attribute.
+        if isinstance(p, Shard) and p.dim > dim:
+            new_placements.append(Shard(p.dim - 1))
+        else:
+            new_placements.append(p)
+
+    return OutputSharding(
+        output_spec=DTensorSpec(mesh=tensor.mesh, placements=tuple(new_placements))
+    )
+
+
+@register_prop_rule(aten.native_layer_norm.default)  # pyre-ignore
+def _prop_native_layer_norm(op_schema: OpSchema) -> OutputSharding:
+    input, normalized_shape, weight, bias, eps = op_schema.args_schema
+    assert isinstance(input, DTensorSpec)
+    assert isinstance(normalized_shape, (tuple, list))
+    if weight is not None:
+        assert isinstance(weight, DTensorSpec)
+        assert all(isinstance(p, Replicate) for p in weight.placements)
+    if bias is not None:
+        assert isinstance(bias, DTensorSpec)
+        assert all(isinstance(p, Replicate) for p in bias.placements)
+    # only the left-most (non-normalized) dimensions of the input can be sharded
+    batch_ndim = len(input.shape) - len(normalized_shape)
+    assert all(
+        isinstance(p, Replicate) or (isinstance(p, Shard) and p.dim < batch_ndim)
+        for p in input.placements
+    )
+    stats_spec = DTensorSpec(
+        mesh=input.mesh,
+        placements=input.placements,
+    )
+    return OutputSharding(output_spec=(input, stats_spec, stats_spec))
+
+
+@register_prop_rule(aten.native_layer_norm_backward.default)  # pyre-ignore
+def _prop_native_layer_norm_backward(op_schema: OpSchema) -> OutputSharding:
+    (
+        grad,
+        input,
+        normalized_shape,
+        result1,
+        result2,
+        weight,
+        bias,
+        grad_input_mask,
+    ) = op_schema.args_schema
+    assert isinstance(grad, DTensorSpec)
+    assert isinstance(grad_input_mask, (list, tuple))
+    if weight is not None:
+        assert isinstance(weight, DTensorSpec)
+        assert all(isinstance(s, Replicate) for s in weight.placements)
+    if bias is not None:
+        assert isinstance(bias, DTensorSpec)
+        assert all(isinstance(s, Replicate) for s in bias.placements)
+    # ensure sharding on dim 0, which will trigger the "Partial" output on
+    # weight and bias grads
+    assert any(
+        isinstance(s, Shard) and s.dim == 0 for s in grad.placements
+    ), f"Got {grad.placements}"
+    weight_grad = (
+        DTensorSpec(
+            mesh=weight.mesh,
+            placements=tuple([_Partial()] * weight.mesh.ndim),
+        )
+        if weight
+        else None
+    )
+    bias_grad = (
+        DTensorSpec(
+            mesh=bias.mesh,
+            placements=tuple([_Partial()] * bias.mesh.ndim),
+        )
+        if bias
+        else None
+    )
+    return OutputSharding(
+        # NOTE: type errors below are legit. This is because DTensor currently
+        # doesn't support Optional return values. Need to be fixed in DTensor repo.
+        output_spec=(
+            grad if grad_input_mask[0] else None,
+            weight_grad if grad_input_mask[1] else None,
+            bias_grad if grad_input_mask[2] else None,
+        ),
+    )
+
+
+def _refine_sharding(
+    op_schema: OpSchema, active_dim: Optional[int]
+) -> Sequence[Placement]:
+    """Considers 2 first inputs of op_schema as having same shape, and returns suggested placement for a pointwise operation."""
+    # consider the operating dimension as a singleton to prevent sharding on it
+    # however, if active_dim is None, this means the input and output shapes are equal and
+    # we'll apply exactly the pointwise rule.
+
+    args_schema = []
+    for s in op_schema.args_schema[:2]:
+        assert isinstance(s, DTensorSpec) and s.tensor_meta is not None
+        args_schema.append(
+            DTensorSpec(
+                mesh=s.mesh,  # type: ignore[attr-defined]
+                placements=s.placements,  # type: ignore[attr-defined]
+                tensor_meta=TensorMeta(
+                    shape=torch.Size(
+                        s.shape[0:active_dim] + (1,) + s.shape[active_dim + 1 :]
+                    )
+                    if active_dim is not None
+                    else s.shape,
+                    stride=s.tensor_meta.stride,
+                    dtype=s.tensor_meta.dtype,
+                ),
+            )
+        )
+
+    op_schema = OpSchema(
+        op=op_schema.op,
+        args_schema=args_schema,  # type: ignore[arg-type]
+        kwargs_schema={},
+    )
+    output_sharding = pointwise_rule(op_schema, linearity=False)
+    if output_sharding.output_spec:
+        assert isinstance(output_sharding.output_spec, DTensorSpec)
+        return output_sharding.output_spec.placements
+    else:
+        assert output_sharding.schema_suggestions is not None
+        out_schema = output_sharding.schema_suggestions[0].args_schema[0]
+        assert isinstance(out_schema, DTensorSpec)
+        return tuple(out_schema.placements)
+
+
+@register_prop_rule(aten.slice_scatter.default)  # pyre-ignore
+def prop_slice_scatter(op_schema: OpSchema) -> OutputSharding:
+    # 1. number of dimensions in input and src need to match.
+    # 2. number of elements on all non-scatter dimensions needs to match between input and src.
+    # 3. number of elements in src along dim needs to match the slice size.
+    # Given the above:
+    # - We suggest for src to follow the sharding of input, except on the scatter dimension,
+    #   where our best bet for now is to make them replicated as a fall-back.
+    #   TODO: Ideally we'd like to make sure the output is re-sharded afterwards to keep input sharding.
+
+    defaults = (None, None, 0, None, None, 1)
+    input, src, dim, start, end, step = (
+        op_schema.args_schema + defaults[len(op_schema.args_schema) :]
+    )
+    assert isinstance(input, DTensorSpec)
+    assert isinstance(src, DTensorSpec)
+    assert isinstance(dim, int)
+
+    if dim < 0:
+        dim += input.ndim
+
+    # if the input shape and the output shape are the same on the operating dimension,
+    # this is effectively a no-op, so we just propagate sharding as we would do for
+    # pointwise, no exceptions.
+    if input.shape[dim] == src.shape[dim]:
+        assert start == 0
+        assert end >= src.shape[dim]  # type: ignore[operator]
+        dim = None
+
+    # apply sharding refinement as implemented in pointwise_rule
+    input_suggestion = list(_refine_sharding(op_schema, dim))
+    # apply the exception -- disallow sharding on the operating dimension.
+    for i, p in enumerate(input_suggestion):
+        if isinstance(p, Shard) and p.dim == dim:
+            input_suggestion[i] = Replicate()
+    input_suggestion = tuple(input_suggestion)  # type: ignore[assignment]
+
+    if input_suggestion == tuple(input.placements) and src.placements == tuple(
+        input.placements
+    ):
+        # if our sharding is correct, the output sharding will be the same as the input.
+        return OutputSharding(
+            output_spec=DTensorSpec(
+                mesh=input.mesh,
+                placements=input.placements,
+            )
+        )
+    else:
+        # otherwise, return the suggestion.
+        return OutputSharding(
+            output_spec=None,
+            schema_suggestions=[
+                OpSchema(
+                    op=op_schema.op,
+                    args_schema=(
+                        DTensorSpec(
+                            mesh=input.mesh,
+                            placements=input_suggestion,
+                            tensor_meta=input.tensor_meta,
+                        ),
+                        DTensorSpec(
+                            mesh=src.mesh,
+                            placements=input_suggestion,
+                            tensor_meta=src.tensor_meta,
+                        ),
+                    )
+                    + op_schema.args_schema[2:],
+                    kwargs_schema=op_schema.kwargs_schema,
+                )
+            ],
+        )
diff --git a/MLPY/Lib/site-packages/torch/distributed/_spmd/gm_transformation.py b/MLPY/Lib/site-packages/torch/distributed/_spmd/gm_transformation.py
new file mode 100644
index 0000000000000000000000000000000000000000..77247616251521f20d0dba36c2137fa55866781d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_spmd/gm_transformation.py
@@ -0,0 +1,51 @@
+from typing import Callable
+
+from torch import fx
+from torch.distributed._spmd.graph_optimization import (
+    comm_fusion_with_concat,
+    enable_graph_optimization_dump,
+    remove_copy_from_optimizer,
+    schedule_comm_wait,
+)
+from torch.distributed._spmd.graph_utils import dump_graphs_to_files
+from torch.distributed._spmd.iter_graph_module import IterGraphModule
+
+
+class GraphModuleTransformation:
+    def __init__(
+        self,
+        *,
+        enable_graph_optimization: bool = False,
+        enable_inductor: bool = False,
+        dump_graphs: bool = False,
+    ) -> None:
+        self.enable_graph_optimization = enable_graph_optimization
+        self.enable_inductor = enable_inductor
+        self.dump_graphs = dump_graphs
+
+    def __call__(self, gm: fx.GraphModule) -> Callable:
+        if self.dump_graphs:
+            graph_folder = dump_graphs_to_files(
+                {"before_transformation_gm": gm.print_readable(False)}
+            )
+            enable_graph_optimization_dump(graph_folder)
+
+        iter_gm = IterGraphModule(gm, enable_inductor=self.enable_inductor)
+        if self.enable_graph_optimization:
+            comm_fusion_with_concat(iter_gm, 100)
+            schedule_comm_wait(iter_gm)
+            remove_copy_from_optimizer(iter_gm)
+        # Must be called once we are done moving nodes across the graphs.
+        iter_gm.finalize_setup()
+
+        if self.dump_graphs:
+            dump_graphs_to_files(
+                {
+                    "iter_graph_setup_gm": iter_gm.setup_gm.print_readable(False),
+                    "iter_graph_main_gm": iter_gm.main_gm.print_readable(False),
+                    "iter_graph_cleanup_gm": iter_gm.cleanup_gm.print_readable(False),
+                },
+                graph_folder,  # type: ignore[possibly-undefined]
+            )
+
+        return iter_gm
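+
+
+# An illustrative usage sketch (not part of the original module). The instance
+# is meant to be used as a graph-module transformation callback; ``traced_gm``
+# below is a hypothetical fx.GraphModule produced by an earlier tracing step.
+#
+#   transform = GraphModuleTransformation(
+#       enable_graph_optimization=True,
+#       dump_graphs=False,
+#   )
+#   iter_gm = transform(traced_gm)   # returns an IterGraphModule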
diff --git a/MLPY/Lib/site-packages/torch/distributed/_spmd/graph_optimization.py b/MLPY/Lib/site-packages/torch/distributed/_spmd/graph_optimization.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd96e7cf246f6fbcaa6715894e5e817ab4ce8d46
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_spmd/graph_optimization.py
@@ -0,0 +1,986 @@
+# Owner(s): ["oncall: distributed"]
+import collections
+import itertools
+import logging
+import operator
+import tempfile
+import time
+from dataclasses import dataclass, field
+from functools import wraps
+from typing import (
+    Any,
+    Callable,
+    cast,
+    DefaultDict,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Set,
+    Tuple,
+    Union,
+)
+
+import torch
+import torch.fx as fx
+from torch._subclasses.fake_tensor import FakeTensor, FakeTensorMode
+from torch.distributed._spmd.graph_utils import (
+    CommType,
+    dump_graphs_to_files,
+    find_node,
+    get_output,
+    OP,
+)
+from torch.distributed._spmd.iter_graph_module import IterGraphModule
+from torch.fx.passes.shape_prop import TensorMetadata
+from torch.utils import _pytree as pytree
+from torch.utils._pytree import tree_flatten, tree_unflatten
+
+logger: logging.Logger = logging.getLogger("graph_optimization")
+aten = torch.ops.aten
+fake_tensor_mode = FakeTensorMode()
+
+_optimized_func: Set[str] = set()
+# The key is the target pass and the value is the prerequisites of the pass.
+_prerequisite_sets: DefaultDict[str, Set[str]] = collections.defaultdict(set)
+# The key is the target pass and the value is the passes that must applied before
+# the key.
+_apply_before_sets: DefaultDict[str, Set[str]] = collections.defaultdict(set)
+_dump_graph_folder: str = ""
+
+
+def enable_graph_optimization_dump(folder: str = ""):
+    global _dump_graph_folder
+    if not folder:
+        folder = tempfile.mkdtemp()
+    _dump_graph_folder = folder
+
+
+# TODO(@fegin): Support multiple runs of graph optimization
+# TODO(@fegin): With this design, circular imports will happen when a pass
+# developer accidentally creates a pass dependency cycle. As a result, we need to
+# break this file into finer-grained modules to avoid incorrect circular imports.
+def graph_optimization_pass(
+    prerequisites: Iterable[Callable],
+    apply_after: Iterable[Callable],
+) -> Callable:
+    """Define the contract of a graph optimization pass.
+
+    All the passes should be wrapped with this decorator.
+    `prerequisites` is used to annotate the prerequisite passes of this pass.
+    `apply_after` means that this wrapped pass must be applied after the passes
+    in `apply_after`. The difference between `prerequisites` and `apply_after`
+    is that all the passes in `prerequisites` must be applied to the graph and
+    must be applied before the wrapped pass, while the passes in `apply_after`
+    are optional. But if a pass in `apply_after` is applied to the graph, it has
+    to be done before the wrapped pass.
+    Optimizer pass developers are required to add these fields accordingly and
+    users need to follow the restrictions to avoid the assert.
+
+    The current design has one limitation: users can only apply the optimizations
+    once. In some cases, we may need to run the same optimization multiple
+    times, e.g., optimization passes -> profiling the result -> apply
+    optimization passes with the profiling result again. This limitation will be
+    addressed in the future.
+
+    Args:
+        prerequisites (Iterable[Callable]): the passes that are the
+            prerequisites of this pass.
+        apply_after (Iterable[Callable]): the passes that, if applied to the
+            graph, must be applied before the wrapped pass.
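+
+    Example (an illustrative sketch only; ``my_prerequisite_pass`` and
+    ``my_pass`` are hypothetical passes, not part of this module)::
+
+        @graph_optimization_pass(
+            prerequisites=[my_prerequisite_pass],
+            apply_after=[],
+        )
+        def my_pass(gm: IterGraphModule) -> None:
+            ...  # mutate gm.graph; lint, DCE, and recompile run in the wrapper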
+    """
+
+    def inner(func: Callable) -> Callable:
+        def make_key(func: Callable) -> str:
+            return f"{func.__module__}.{func.__name__}"
+
+        func_key = make_key(func)
+        _prerequisite_sets[func_key] = {make_key(f) for f in prerequisites}
+        for apply_after_pass in apply_after:
+            _apply_before_sets[make_key(apply_after_pass)].add(func_key)
+
+        @wraps(func)
+        def pass_wrapper(
+            gm: Union[fx.GraphModule, IterGraphModule], *args: Any, **kwargs: Any
+        ) -> None:
+            begin = time.time()
+            assert isinstance(gm, (fx.GraphModule, IterGraphModule)), (
+                "The first argument of the pass must be either "
+                "fx.GraphModule or IterGraphModule."
+            )
+            assert func_key not in _optimized_func, f"Cannot apply {func_key} twice."
+            invalid_passes = _apply_before_sets[func_key].intersection(_optimized_func)
+            assert (
+                not invalid_passes
+            ), f"{invalid_passes} must be applied after {func_key}."
+            assert _prerequisite_sets[func_key].issubset(_optimized_func), (
+                f"{_prerequisite_sets[func_key] - _optimized_func} are the "
+                f"prerequisites of {func_key} but are not applified. "
+                f"Applied passes are {_optimized_func}."
+            )
+
+            func(gm, *args, **kwargs)
+            gm.graph.lint()
+            gm.graph.eliminate_dead_code()
+            gm.recompile()
+            _optimized_func.add(func_key)
+
+            prefix = f"after_{func.__name__}"
+            if _dump_graph_folder:
+                if isinstance(gm, IterGraphModule):
+                    dump_graphs_to_files(
+                        {
+                            f"{prefix}_setup_gm": gm.setup_gm,
+                            f"{prefix}_main_gm": gm.main_gm,
+                            f"{prefix}_cleanup_gm": gm.cleanup_gm,
+                        },
+                        _dump_graph_folder,
+                    )
+                else:
+                    dump_graphs_to_files({prefix: gm}, _dump_graph_folder)
+
+            logger.info("Spent %f seconds applying %s", time.time() - begin, func_key)
+
+        return pass_wrapper
+
+    return inner
+
+
+@dataclass(unsafe_hash=True)
+class CommBlock:
+    shape: Optional[torch.Size]
+    node_list: List[fx.Node]
+    inputs: List[fx.Node]
+    wait_nodes: List[fx.Node]
+    comm_node: fx.Node
+    outputs: Set[fx.Node]
+
+
+def get_comm_block(comm_node: fx.Node) -> CommBlock:
+    """Find out all the nodes belong to this communcation given a collective node (e.g., allreduce).
+
+    Args:
+        comm_node(fx.Node): The target communication/collective node.
+
+    Returns:
+        The CommBlock that encapsulates the related nodes (e.g., wait_node) of
+        the given comm_node.
+    """
+    # We choose 5 to prevent accidents that could cause an infinite loop. But
+    # with functional collectives, the distance is 1.
+    MAX_WAIT_DISTANCE = 5
+    node_list = []
+    wait_nodes = []
+    inputs = pytree.arg_tree_leaves(*comm_node.args, **comm_node.kwargs)
+    input_nodes = [inp for inp in inputs if isinstance(inp, fx.Node)]
+    distance = 0
+    wait_prefixes = ("wait_comm", "wait_tensor")
+    non_end_users_nodes = ("split", "reshape", "getitem", "detach", "alias")
+
+    nodes = collections.deque([comm_node, None])
+    while nodes and distance < MAX_WAIT_DISTANCE:
+        node = nodes.popleft()
+        if node is None:
+            distance += 1
+            if nodes:
+                nodes.append(None)
+            continue
+        node_list.append(node)
+        if node.name.startswith(wait_prefixes):
+            wait_nodes.append(node)
+        else:
+            for child in node.users:
+                if isinstance(child, fx.Node):
+                    nodes.append(child)
+
+    if not wait_nodes:
+        raise RuntimeError(
+            "The wait nodes are too far away from the comm node {comm_node}."
+        )
+
+    # Identify all the outputs of this collective block.
+    outputs: Set[fx.Node] = set()
+    nodes = collections.deque(wait_nodes)
+    while nodes:
+        node = nodes.popleft()
+        assert node is not None
+        for user in node.users:
+            if isinstance(user, fx.Node) and user.name.startswith(non_end_users_nodes):
+                nodes.append(user)
+                node_list.append(user)
+            else:
+                outputs.add(node)
+                break
+
+    # TODO: populate all the tensor metadata and remove the default.
+    tensor_meta = input_nodes[0].meta.get("tensor_meta", None)
+    return CommBlock(
+        # TODO: support symbolic shapes
+        shape=torch.Size(int(s) for s in tensor_meta.shape) if tensor_meta else None,
+        node_list=node_list,
+        wait_nodes=wait_nodes,
+        comm_node=comm_node,
+        inputs=input_nodes,
+        outputs=outputs,
+    )
+
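+# An illustrative sketch (not part of the original module) of the CommBlock a
+# functional allreduce typically yields; node names are indicative only.
+#
+#   allreduce_ -> wait_tensor -> (downstream users...)
+#
+#   CommBlock(
+#       comm_node=allreduce_,
+#       wait_nodes=[wait_tensor],
+#       inputs=[grad_clone],       # tensor input(s) of the collective
+#       outputs={wait_tensor},     # nodes whose users are real consumers
+#       ...
+#   )
+#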
+
+def get_all_comm_blocks(
+    gm: IterGraphModule, comm_ops: Union[Tuple[str, ...], str]
+) -> List[CommBlock]:
+    return [
+        get_comm_block(node)
+        for node in gm.graph.nodes
+        if node.name.startswith(comm_ops)
+    ]
+
+
+def _create_meta_val(
+    fake_tensor_mode: FakeTensorMode,
+    val: FakeTensor,
+) -> FakeTensor:
+    # TODO: fix the memory_format
+    return FakeTensor(
+        fake_tensor_mode,
+        torch.empty(
+            val.shape,
+            dtype=val.dtype,
+            device="meta",
+            requires_grad=val.requires_grad,
+        ),
+        val.device,
+    )
+
+
+def _create_meta_tensor_meta(
+    fake_tensor_mode: FakeTensorMode,
+    val: FakeTensor,
+) -> TensorMetadata:
+    return TensorMetadata(
+        shape=val.shape,
+        dtype=val.dtype,
+        requires_grad=val.requires_grad,
+        stride=val.stride,  # type: ignore[arg-type]
+        # TODO: fix these value
+        memory_format=None,
+        is_quantized=False,
+        qparams={},
+    )
+
+
+def _call_function(
+    gm: IterGraphModule,
+    fake_tensor_mode: FakeTensorMode,
+    meta_val: Optional[FakeTensor],
+    function: Any,
+    *args: Any,
+    **kwargs: Any,
+) -> fx.Node:
+    node = gm.graph.call_function(function, args, kwargs)
+
+    if meta_val is None:
+        flat_args, spec = tree_flatten((args, kwargs))
+        new_flat_args = []
+        memory_format = None
+        for arg in flat_args:
+            if not isinstance(arg, fx.Node):
+                new_flat_args.append(arg)
+                continue
+            val = arg.meta["val"]
+            new_flat_args.append(_create_meta_val(fake_tensor_mode, val))
+
+        fake_args, fake_kwargs = tree_unflatten(new_flat_args, spec)
+        new_meta_val = function(*fake_args, **fake_kwargs)
+    else:
+        new_meta_val = meta_val
+    node.meta["val"] = new_meta_val
+    node.meta["tensor_meta"] = _create_meta_tensor_meta(fake_tensor_mode, new_meta_val)
+    return node
+
+
+def _scatter_wait_result(
+    gm: IterGraphModule,
+    fused_comm_block: CommBlock,
+    comm_blocks: List[CommBlock],
+    node_indices: Dict[fx.Node, int],
+) -> None:
+    """Scatter the result of the fused communication node to the original users -- splitting the output and reshape each subitem."""
+    last_wait_node_idx = 0
+    for node in gm.graph.nodes:
+        if node == fused_comm_block.comm_node:
+            break
+        last_wait_node_idx = max(
+            node_indices.get(node, last_wait_node_idx), last_wait_node_idx
+        )
+
+    fused_comm_node = fused_comm_block.comm_node
+    fused_wait_node = fused_comm_block.wait_nodes[0]
+
+    with gm.graph.inserting_after(fused_wait_node):
+        split_node = gm.graph.call_function(
+            aten.split,
+            (
+                fused_wait_node,
+                # TODO(@fegin): support symbolic shapes
+                [int(cast(torch.Size, cb.shape).numel()) for cb in comm_blocks],
+            ),
+        )
+
+    # Scatter the split result.
+    need_sort_nodes = []
+    last_split_reshape_node = split_node
+    with gm.graph.inserting_after(split_node):
+        for idx, comm_block in enumerate(comm_blocks):
+            # Some users of the original allreduce and wait are scheduled
+            # before the fused allreduce. We must move these users to a
+            # correct topological sort order -- right after the last fused
+            # allreduce result, the `last_split_reshape_node` variable.
+            orig_wait = comm_block.wait_nodes[0]
+            nodes = collections.deque(list(orig_wait.users))
+            while nodes:
+                user_node = nodes.popleft()
+                if not isinstance(user_node, fx.Node):
+                    continue
+                if node_indices[user_node] < last_wait_node_idx:
+                    need_sort_nodes.append(user_node)
+                    nodes.extend(list(user_node.users))
+
+            split_idx_node = gm.graph.call_function(operator.getitem, (split_node, idx))
+            with gm.graph.inserting_after(split_idx_node):
+                wait_output_node = gm.graph.call_function(
+                    aten.reshape, (split_idx_node, comm_block.shape)
+                )
+            gm.graph.node_replace_all_uses_with(orig_wait, wait_output_node)
+
+        if last_split_reshape_node == split_node:
+            last_split_reshape_node = wait_output_node  # type: ignore[possibly-undefined]
+
+    need_sort_nodes = sorted(need_sort_nodes, key=lambda node: node_indices[node])
+    gm.graph.move_after(need_sort_nodes, last_split_reshape_node)
+
+    gm.graph.eliminate_dead_code()
+
+
+def _fuse_with_cat(
+    gm: IterGraphModule,
+    comm_blocks: List[CommBlock],
+    node_indices: Dict[fx.Node, int],
+) -> CommBlock:
+    """Fuse the CommBlocks using concat given a list of CommBlock (only allreduce)."""
+    # Find the last input node.
+    last_input_node = comm_blocks[0].inputs[0]
+    last_input_index = -1
+    all_input_nodes = []
+    for comm_block in comm_blocks:
+        input_node = comm_block.inputs[0]
+        # If the input node is a clone, this is the CommTensor-based implementation.
+        if input_node.name.startswith("clone"):
+            input_node = cast(fx.Node, input_node.args[0])
+        all_input_nodes.append(input_node)
+        index = node_indices[input_node]
+        if index >= last_input_index:
+            assert index != last_input_index
+            last_input_node = input_node
+            last_input_index = index
+
+    # Flatten all the inputs right after the last input is ready.
+    with gm.graph.inserting_after(last_input_node):
+        cat_inputs = []
+        for input_node in all_input_nodes:
+            cat_inputs.append(
+                _call_function(
+                    gm, fake_tensor_mode, None, aten.flatten.using_ints, input_node
+                )
+            )
+
+    with gm.graph.inserting_after(cat_inputs[0]):
+        cat_node = _call_function(gm, fake_tensor_mode, None, aten.cat, cat_inputs)
+
+    # Create a new Comm node.
+    last_comm = comm_blocks[-1]
+    last_comm_node = last_comm.comm_node
+    last_wait_node = last_comm.wait_nodes[0]
+    with gm.graph.inserting_after(cat_node):
+        flatten_args, spec = tree_flatten((last_comm_node.args, last_comm_node.kwargs))
+        flatten_args[0] = cat_node
+        args, kwargs = tree_unflatten(flatten_args, spec)
+        fused_comm_node = _call_function(
+            gm,
+            fake_tensor_mode,
+            cat_node.meta["val"],
+            last_comm_node.target,
+            *args,
+            **kwargs,
+        )
+
+    # Create a new Wait node.
+    with gm.graph.inserting_after(fused_comm_node):
+        flatten_args, spec = tree_flatten((last_wait_node.args, last_wait_node.kwargs))
+        flatten_args[0] = fused_comm_node
+        args, kwargs = tree_unflatten(flatten_args, spec)
+        fused_wait_node = _call_function(
+            gm,
+            fake_tensor_mode,
+            cat_node.meta["val"],
+            last_wait_node.target,
+            *args,
+            **kwargs,
+        )
+
+    # Move the fused_comm_node and its args to right after the source node
+    nodes_to_move = cat_inputs + [cat_node, fused_comm_node, fused_wait_node]
+    gm.graph.move_after(nodes_to_move, last_input_node)
+
+    tensor_meta = cat_node.meta.get("tensor_meta")
+    fused_comm_block = CommBlock(
+        shape=tensor_meta.shape,  # type: ignore[union-attr]
+        node_list=[fused_comm_node, fused_wait_node],
+        wait_nodes=[fused_wait_node],
+        comm_node=fused_comm_node,
+        inputs=[cat_node],
+        outputs={fused_wait_node},
+    )
+
+    _scatter_wait_result(gm, fused_comm_block, comm_blocks, node_indices)
+
+    return fused_comm_block
+
+
+def _expedite_comm_ops(gm: IterGraphModule, comm_blocks: List[CommBlock]) -> None:
+    node_indices = {node: i for i, node in enumerate(gm.graph.nodes)}
+    for comm_block in comm_blocks:
+        last_input = comm_block.comm_node
+        last_input_idx = -1
+        for input in comm_block.inputs:
+            input_idx = node_indices[input]
+            if input_idx > last_input_idx:
+                last_input = input
+                last_input_idx = input_idx
+        gm.graph.node_append(last_input, comm_block.comm_node)
+
+
+@graph_optimization_pass(
+    prerequisites=[],
+    apply_after=[],
+)
+def comm_fusion_with_concat(
+    gm: IterGraphModule,
+    bucket_size_mb: int,
+) -> None:
+    """Run fuse communication with concat.
+
+    This implementation uses concat to concat the bucketed gradients.
+    """
+    comm_blocks = get_all_comm_blocks(gm, (CommType.ALLREDUCE, "all_reduce"))
+    # First ensure the allreduces are scheduled immediately after the gradients.
+    _expedite_comm_ops(gm, comm_blocks)
+    # Get the comm_blocks based on the new order.
+    comm_blocks = get_all_comm_blocks(gm, (CommType.ALLREDUCE, "all_reduce"))
+    node_indices = {node: i for i, node in enumerate(gm.graph.nodes)}
+
+    bucket_size = 1 * 1024**2
+    bucket_cap_size = bucket_size_mb * 1024**2
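+    # Sizes below are estimated assuming 4-byte (fp32) elements; see the dtype
+    # TODO. For example (illustration only), with bucket_size_mb=25 a gradient
+    # of shape (1024, 1024) contributes 1024 * 1024 * 4 bytes = 4 MiB toward
+    # the 25 MiB cap once the initial 1 MiB bucket has been flushed.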
+    begin = end = curr_size = 0
+    while end < len(comm_blocks):
+        # TODO: determine the dtype
+        curr_size += cast(torch.Size, comm_blocks[end].shape).numel() * 4
+        end += 1
+        if curr_size < bucket_size:
+            continue
+        _fuse_with_cat(gm, comm_blocks[begin:end], node_indices)
+        bucket_size = bucket_cap_size
+        begin = end
+        curr_size = 0
+    else:
+        if begin < len(comm_blocks):
+            _fuse_with_cat(gm, comm_blocks[begin:end], node_indices)
+
+
+@graph_optimization_pass(
+    prerequisites=[comm_fusion_with_concat],
+    apply_after=[],
+)
+def schedule_comm_wait(gm: IterGraphModule) -> None:
+    """Delay the execution of wait tensors of allreduce until its first user."""
+    comm_blocks = get_all_comm_blocks(gm, (CommType.ALLREDUCE, "all_reduce"))
+
+    # Find all the end users.
+    allreduce_users: Set[fx.Node] = set()
+    for allreduce in comm_blocks:
+        for output in allreduce.outputs:
+            allreduce_users.update(output.users)
+
+    node_indices = {node: i for i, node in enumerate(gm.graph.nodes)}
+    for allreduce in comm_blocks:
+        # Find the earliest users.
+        assert (
+            len(allreduce.outputs) >= 1
+        ), f"Found a allreduce that has zero outputs/users -- {allreduce}."
+        # Initialize the target_node to be the first user of the first output.
+        target_node = next(iter(next(iter(allreduce.outputs)).users))
+        target_node_index = 2**31
+        for user in (user for output in allreduce.outputs for user in output.users):
+            index = node_indices[user]
+            if index < target_node_index:
+                target_node = user
+                target_node_index = index
+
+        # Move wait nodes and all the subsequent output nodes before the
+        # earliest user.
+        wait_idx = -1
+        for wait_idx, node in enumerate(allreduce.node_list):
+            if node == allreduce.wait_nodes[0]:
+                break
+        assert wait_idx >= 0
+        gm.graph.move_before(allreduce.node_list[wait_idx:], target_node)
+
+
+@graph_optimization_pass(
+    prerequisites=[],
+    apply_after=[],
+)
+def remove_copy_from_optimizer(gm: IterGraphModule) -> None:
+    """Erase the orphant copy_ that generated when tracing optimizer.
+
+    Two reasons why we could not simply use the DCE of fx.Graph.
+    1. fx.Graph treats copy_ as a side-effect node and does not erase it.
+    2. Users may want to preserve some orphan `copy_` that is not from the
+       optimizer.
+    If the second reason does not hold, this pass can be rewritten as using
+    DCE from fx.Graph (with the overwrite to the side-effect node list).
+    """
+    MAX_COPY_DISTANCE = 5
+    remove_candidates: Set[fx.Node] = set()
+    for node in reversed(gm.graph.nodes):
+        if node.users:
+            continue
+        if node.op != OP.CALL_FUNCTION or node.target != aten.copy_.default:
+            continue
+
+        copy_ancestors: Set[fx.Node] = set()
+        nodes = collections.deque([node, None])
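+        # The deque holds a BFS frontier; the None entry is a level marker so
+        # that `distance` counts how many hops we are from the orphan copy_.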
+        distance = 0
+        should_remove = False
+        while nodes and distance < MAX_COPY_DISTANCE:
+            visiting = nodes.popleft()
+            if visiting is None:
+                distance += 1
+                if nodes:
+                    nodes.append(None)
+                continue
+            copy_ancestors.add(visiting)
+            if visiting.op == OP.CALL_FUNCTION and str(visiting.target).startswith(
+                ("aten._foreach_", "aten._fused_")
+            ):
+                should_remove = True
+            parents = pytree.arg_tree_leaves(*visiting.args, **visiting.kwargs)
+            for parent in parents:
+                if isinstance(parent, fx.Node):
+                    nodes.append(parent)
+        if should_remove:
+            # We add all ancestors to the list and it is okay as not all of
+            # them will be erased -- only those nodes with zero users will be
+            # erased.
+            remove_candidates.update(copy_ancestors)
+
+    for node in reversed(gm.graph.nodes):
+        if node.users:
+            continue
+        if node not in remove_candidates:
+            continue
+        gm.graph.erase_node(node)
+
+
+# The args list of the fused_adam function. We don't care about kwargs.
+AdamArgs = collections.namedtuple(
+    "AdamArgs",
+    ["params", "grads", "exp_avgs", "exp_avg_sqs", "max_exp_avg_sqs", "state_steps"],
+)
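+# For illustration only (a sketch, not part of any traced graph): given an fx
+# node `optim_node` that calls aten._fused_adam.default,
+# `AdamArgs(*optim_node.args)` names its positional tensor-list arguments, e.g.
+#   AdamArgs(params=[p0, p1], grads=[g0, g1], exp_avgs=[m0, m1],
+#            exp_avg_sqs=[v0, v1], max_exp_avg_sqs=[], state_steps=[s0, s1])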
+
+
+# TODO(fegin): Have a template class for all Block class.
+@dataclass(unsafe_hash=True)
+class FusedAdamBlock:
+    optim_node: fx.Node
+    generate_output: bool
+    # The output list of the copy nodes. The order follows the argument order.
+    param_outputs: List[fx.Node] = field(default_factory=list)
+    grad_outputs: List[fx.Node] = field(default_factory=list)
+    exp_avgs_outputs: List[fx.Node] = field(default_factory=list)
+    exp_avg_sqs_outputs: List[fx.Node] = field(default_factory=list)
+    # TODO(fegin): populate/generate the max_exp_avg_sqs if exists
+    max_exp_avg_sqs: List[fx.Node] = field(default_factory=list)
+
+    def generate_outputs(self):
+        # Iterate all the args and generate the corresponding output lists.
+        # Assuming the corresponding output nodes are not created yet.
+        def _generate_outputs(arg_idx, output_list):
+            graph = self.optim_node.graph
+            with graph.inserting_after(self.optim_node):
+                optim_getitem = graph.call_function(
+                    operator.getitem, (self.optim_node, arg_idx)
+                )
+            for i, arg in enumerate(self.optim_node.args[arg_idx]):
+                with graph.inserting_after(optim_getitem):
+                    updated_arg = graph.call_function(
+                        operator.getitem, (optim_getitem, i)
+                    )
+                with graph.inserting_after(updated_arg):
+                    output_copy = graph.call_function(aten.copy_, (arg, updated_arg))
+                output_list.append(output_copy)
+
+        _generate_outputs(0, self.param_outputs)
+        # Do not generate the gradient output list as it is not used.
+        _generate_outputs(2, self.exp_avgs_outputs)
+        _generate_outputs(3, self.exp_avg_sqs_outputs)
+
+    def populate_outputs(self):
+        # Populate the existing output lists from the graph.
+        def _populate_outputs(args_idx, output_list):
+            optim_getitem = self.optim_node
+            for user in self.optim_node.users:
+                assert (
+                    user.target == operator.getitem
+                ), f"The user of {self.optim_node} is not getitem."
+                if user.args[1] == args_idx:
+                    optim_getitem = user
+                    break
+            assert (
+                optim_getitem != self.optim_node
+            ), f"Cannot find the getitem node for {self.optim_node}"
+            output_list.extend(
+                [self.optim_node] * len(cast(List[fx.Node], self.optim_node.args[0]))
+            )
+            for updated_arg in optim_getitem.users:
+                assert (
+                    updated_arg.target == operator.getitem
+                ), f"Unexpected node target {updated_arg.target}."
+                idx = updated_arg.args[1]
+                output_copy = next(iter(updated_arg.users))
+                assert str(output_copy.target).startswith(
+                    "aten.copy_"
+                ), f"Unexpected node target {output_copy.target}."
+                output_list[idx] = output_copy
+            for i, output in enumerate(output_list):
+                assert output != self.optim_node, f"{i}th output is not replaced."
+
+            assert output_list, f"The output for {self.optim_node} is empty."
+
+        _populate_outputs(0, self.param_outputs)
+        _populate_outputs(2, self.exp_avgs_outputs)
+        _populate_outputs(3, self.exp_avg_sqs_outputs)
+
+    def __post_init__(self):
+        if self.param_outputs:
+            return
+        if self.generate_output:
+            self.generate_outputs()
+        else:
+            self.populate_outputs()
+
+
+@dataclass(unsafe_hash=True)
+class ForeachAddBlock:
+    add_node: fx.Node
+    generate_output: bool
+    # The output list of the copy nodes. The order follows the argument order.
+    outputs: List[fx.Node] = field(default_factory=list)
+
+    def generate_outputs(self):
+        # Iterate all the args and generate the corresponding output lists
+        # Assuming the corresponding output nodes are not created yet.
+        graph = self.add_node.graph
+        for i, arg in enumerate(cast(Tuple[Any, ...], self.add_node.args[0])):
+            with graph.inserting_after(self.add_node):
+                updated_arg = graph.call_function(operator.getitem, (self.add_node, i))
+            with graph.inserting_after(updated_arg):
+                output_copy = graph.call_function(aten.copy_, (arg, updated_arg))
+            self.outputs.append(output_copy)
+        assert self.outputs, f"The output for {self.add_node} is empty."
+
+    def populate_outputs(self):
+        # Populate the existing output lists from the graph.
+        self.outputs = [
+            self.add_node for _ in cast(Tuple[Any, ...], self.add_node.args[0])
+        ]
+        for updated_arg in self.add_node.users:
+            assert (
+                updated_arg.target == operator.getitem
+            ), f"Unexpected node target {updated_arg.target}"
+            idx = cast(int, updated_arg.args[1])
+            output_copy = next(iter(updated_arg.users))
+            assert str(output_copy.target).startswith(
+                "aten.copy_"
+            ), f"The execpted output node is different, {str(output_copy.target)}"
+            self.outputs[idx] = output_copy
+        for i, output in enumerate(self.outputs):
+            assert output != self.add_node, f"{i}th output is not replaced."
+
+    def __post_init__(self):
+        if self.outputs:
+            return
+
+        if self.generate_output:
+            self.generate_outputs()
+        else:
+            self.populate_outputs()
+
+
+@dataclass(unsafe_hash=True)
+class FusedOptimizerBlock:
+    step: ForeachAddBlock
+    optim: FusedAdamBlock
+
+
+def get_fused_optimizer_block(optim_node: fx.Node) -> FusedOptimizerBlock:
+    """Given a fused optimizer node and return the FusedOptimizerBlock."""
+    MAX_STEP_DISTANCE = 5
+    # Find the step (foreach_add)
+    nodes = collections.deque([optim_node, None])
+    step_node = optim_node
+    distance = 0
+    while nodes and distance < MAX_STEP_DISTANCE:
+        node = nodes.popleft()
+        if node is None:
+            distance += 1
+            if nodes:
+                nodes.append(None)
+            continue
+        elif node.op == OP.CALL_FUNCTION and str(node.target).startswith(
+            "aten._foreach_add"
+        ):
+            step_node = node
+            break
+        else:
+            nodes.extend(
+                a
+                for a in pytree.arg_tree_leaves(*node.args, **node.kwargs)
+                if isinstance(a, fx.Node)
+            )
+    if step_node == optim_node:
+        raise RuntimeError(
+            "Cannot find step node (foreach_add) for the optimizer node "
+            f"{optim_node} with {MAX_STEP_DISTANCE} BFS distance. "
+            "The API design does not match the tracing graph."
+        )
+
+    step = ForeachAddBlock(step_node, generate_output=False)
+    optim = FusedAdamBlock(optim_node, generate_output=False)
+    return FusedOptimizerBlock(step, optim)
+
+
+def get_all_fused_optimizer_blocks(
+    gm: IterGraphModule, optim_ops: Union[Tuple[str, ...], str]
+) -> List[FusedOptimizerBlock]:
+    """Find all the FusedOptimizerBlock that the optimizer operators are in `optim_ops`."""
+    return [
+        get_fused_optimizer_block(node)
+        for node in gm.graph.nodes
+        if node.name.startswith(optim_ops)
+    ]
+
+
+def _split_fused_adam(
+    gm: IterGraphModule,
+    orig_optim_block: FusedOptimizerBlock,
+    split_gradients: Set[fx.Node],
+) -> Tuple[FusedOptimizerBlock, FusedOptimizerBlock]:
+    """Split the `orig_optim_block` into two FusedOptimizerBlock.
+
+    The first one will be the optimizer that optimize `split_gradients`. The second one is
+    used to optimize the remaining gradients.
+    An assert will be raised if one of the optimizer optimize zero gradients.
+    """
+    orig_optim_args = AdamArgs(*orig_optim_block.optim.optim_node.args)
+    optim_args = (AdamArgs([], [], [], [], [], []), AdamArgs([], [], [], [], [], []))
+    # The only hint we can use to split the optimizer is the order/indices.
+    orig_optim_indices: Tuple[List[int], List[int]] = ([], [])
+    orig_step_indices: Tuple[List[int], List[int]] = ([], [])
+
+    for idx, gradient in enumerate(orig_optim_args.grads):
+        group_idx = 0 if gradient in split_gradients else 1
+        orig_optim_indices[group_idx].append(idx)
+        # Get the argument for idx-th gradient from orig_optim_args
+        for orig_arg, optim_arg in zip(orig_optim_args, optim_args[group_idx]):
+            # Only add the argument to the list if the original argument list
+            # is not empty. If the original argument list is empty, the new
+            # one must be an empty list as well.
+            if orig_arg:
+                optim_arg.append(orig_arg[idx])
+
+        # If the argument order of step is the same as that of the optimizer,
+        # nothing has to be done. However, it is risky to rely on this
+        # assumption, so we populate orig_step_indices.
+        orig_step_output = optim_args[group_idx].state_steps[-1]
+        assert str(orig_step_output.target).startswith(
+            "aten.copy_"
+        ), f"The copy output is {orig_step_output.target}, expect aten.copy_"
+        orig_step_getitem = orig_step_output.args[1]
+        assert "getitem" in str(
+            orig_step_getitem.target
+        ), f"The copy getitem is {orig_step_getitem.target}, expect operator.getitem"
+        orig_step_idx = orig_step_getitem.args[1]
+        orig_step_indices[group_idx].append(orig_step_idx)
+
+    if not all(l for l in (orig_step_indices + orig_optim_indices)):
+        raise ValueError("At least one split optimizer does not have input.")
+
+    output = get_output(gm.graph)
+    results: List[FusedOptimizerBlock] = []
+    flatten_output_args, spec = tree_flatten((output.args, output.kwargs))
+    flatten_output_args_indices: DefaultDict[
+        fx.Node, Set[int]
+    ] = collections.defaultdict(set)
+    for idx, output_arg in enumerate(flatten_output_args):
+        if isinstance(output_arg, fx.Node):
+            flatten_output_args_indices[output_arg].add(idx)
+
+    def replace_flatten_output_args(orig_node: fx.Node, new_node: fx.Node):
+        for idx in flatten_output_args_indices[orig_node]:
+            flatten_output_args[idx] = new_node
+
+    # Create the new step and optim nodes and blocks.
+    for group_idx in range(2):
+        step_args: List[fx.Node] = []
+        orig_step_outputs: List[fx.Node] = []
+        # We have to create the new step node and block first because it is used
+        # for the new optim node as the input.
+        with gm.graph.inserting_after(orig_optim_block.optim.optim_node):
+            for idx in orig_step_indices[group_idx]:
+                step_args.append(
+                    cast(Tuple[fx.Node, ...], orig_optim_block.step.add_node.args[0])[
+                        idx
+                    ]
+                )
+                orig_step_outputs.append(orig_optim_block.step.outputs[idx])
+            step = gm.graph.call_function(
+                aten._foreach_add.Scalar,
+                (step_args, 1),
+            )
+        step_block = ForeachAddBlock(step, generate_output=True)
+        for i, step_output in enumerate(step_block.outputs):
+            # Replace the original step output in the graph output node with
+            # the new one.
+            orig_step_output = orig_step_outputs[i]
+            replace_flatten_output_args(orig_step_output, step_output)
+            # Also need to replace the step output used for the new optimizer.
+            assert optim_args[group_idx].state_steps[i] == orig_step_output, (
+                f"The expected step output node mismatched, {orig_step_output} "
+                f"{optim_args[group_idx].state_steps[i]}"
+            )
+            optim_args[group_idx].state_steps[i] = step_output
+
+        # Insert the optimizer node after the first step output because its
+        # topo sort order is the last.
+        with gm.graph.inserting_after(step_block.outputs[0]):
+            optim = gm.graph.call_function(
+                aten._fused_adam.default,
+                optim_args[group_idx],
+                orig_optim_block.optim.optim_node.kwargs,
+            )
+        optim_block = FusedAdamBlock(optim, generate_output=True)
+        for curr_idx, orig_idx in enumerate(orig_optim_indices[group_idx]):
+            list_names = ("param_outputs", "exp_avgs_outputs", "exp_avg_sqs_outputs")
+            for name in list_names:
+                orig_list = getattr(orig_optim_block.optim, name)
+                curr_list = getattr(optim_block, name)
+                replace_flatten_output_args(orig_list[orig_idx], curr_list[curr_idx])
+
+        results.append(FusedOptimizerBlock(step_block, optim_block))
+
+    # Optimizer is used as the output of the train_step. Therefore, we have to
+    # update the output node of the graph.
+    output_args, output_kwargs = tree_unflatten(flatten_output_args, spec)
+    gm.graph.node_set_args(output, output_args)
+    gm.graph.node_set_kwargs(output, output_kwargs)
+    # Remove the original copy_ nodes as they won't be removed by DCE.
+    for copy_output in itertools.chain(
+        orig_optim_block.optim.param_outputs,
+        orig_optim_block.optim.exp_avgs_outputs,
+        orig_optim_block.optim.exp_avg_sqs_outputs,
+    ):
+        gm.graph.erase_node(copy_output)
+    # Call DCE once to get rid of the old optimizer. By doing so, we will be
+    # able to erase the copy_ nodes of step later.
+    gm.graph.eliminate_dead_code()
+    for copy_output in orig_optim_block.step.outputs:
+        gm.graph.erase_node(copy_output)
+    # This is not strictly required, but we call it for consistency.
+    gm.graph.eliminate_dead_code()
+
+    return results[0], results[1]
+
+
+def split_fused_optimizer(
+    gm: IterGraphModule,
+    optim_block: FusedOptimizerBlock,
+    split_gradients: Set[fx.Node],
+) -> Tuple[FusedOptimizerBlock, FusedOptimizerBlock]:
+    if not split_gradients:
+        raise ValueError("The given split_gradients is empty.")
+    if str(optim_block.optim.optim_node.target).startswith("aten._fused_adam"):
+        return _split_fused_adam(gm, optim_block, split_gradients)
+    else:
+        raise NotImplementedError("Only fused_adam is supported now")
+
+
+# TODO(fegin): The API only supports fused adam for now. It should be extended
+# to support foreach as well.
+@graph_optimization_pass(
+    prerequisites=[remove_copy_from_optimizer],
+    apply_after=[schedule_comm_wait],
+)
+def iter_move_grads_and_optimizers(
+    gm: IterGraphModule,
+    target_comm_node: str,
+    target_dest_node: str,
+) -> None:
+    """Extract a comm block and split out a new optimizer and step for it.
+
+    This subgraph is then moved to the forward graph.
+    """
+    for comm_block in get_all_comm_blocks(gm, "all_reduce"):
+        if comm_block.comm_node.name == target_comm_node:
+            break
+    else:
+        raise ValueError(f"Cannot find {target_comm_node}")
+
+    optim_blocks = get_all_fused_optimizer_blocks(gm, "_fused_adam")
+    for optim_block in optim_blocks:
+        optim_args = AdamArgs(*optim_block.optim.optim_node.args)
+        one_output = next(iter(comm_block.outputs))
+        if one_output in optim_args.grads:
+            break
+    else:
+        raise ValueError(f"{target_comm_node} is not used by any fused optimizer.")
+
+    move_optim, _ = split_fused_optimizer(gm, optim_block, comm_block.outputs)
+
+    move_nodes = find_all_descendants(
+        gm, [comm_block.comm_node, move_optim.step.add_node]
+    )
+
+    stop_node = find_node(gm.graph, lambda n: n.name == target_dest_node)[0]
+
+    gm.graph.move_to_next_iter_before(move_nodes, stop_node)
+
+
+def find_all_descendants(
+    gm: IterGraphModule,
+    parent_nodes: List[fx.Node],
+) -> List[fx.Node]:
+    """Identify the list of nodes to move during FX graph transformation."""
+    assert len(parent_nodes) > 0, "No parent nodes are given."
+
+    output = get_output(gm.graph)
+    dq_parent_nodes = collections.deque(parent_nodes)
+    move_node_set = set()
+    while dq_parent_nodes:
+        node = dq_parent_nodes.popleft()
+        move_node_set.add(node)
+        dq_parent_nodes += [
+            u for u in node.users if isinstance(u, fx.Node) and u != output
+        ]
+    move_nodes = [node for node in gm.graph.nodes if node in move_node_set]
+
+    return move_nodes
diff --git a/MLPY/Lib/site-packages/torch/distributed/_spmd/graph_utils.py b/MLPY/Lib/site-packages/torch/distributed/_spmd/graph_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..92ee71ce69c2cf09b287dcfbd5f322df11e47ed3
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_spmd/graph_utils.py
@@ -0,0 +1,145 @@
+import logging
+import os
+import tempfile
+from enum import Enum
+from typing import Callable, cast, Dict, Iterable, List, Set
+
+import torch.fx as fx
+from torch.fx.passes.shape_prop import TensorMetadata
+from torch.utils import _pytree as pytree
+from torch.utils._pytree import tree_flatten, tree_unflatten
+
+
+logger: logging.Logger = logging.getLogger("graph_utils")
+
+
+class OP(str, Enum):
+    CALL_FUNCTION = "call_function"
+    CALL_MODULE = "call_module"
+    CALL_METHOD = "call_method"
+    GET_ATTR = "get_attr"
+    OUTPUT = "output"
+    PLACEHOLDER = "placeholder"
+
+
+class CommType(str, Enum):
+    ALLREDUCE = "allreduce_"
+    ALLGATHER = "allgather_"
+    BROADCAST = "broadcast_"
+    REDUCESCATTER = "reduce_scatter_"
+    SCATTER = "scatter_"
+
+
+def get_node_tensor_metadata(node: fx.Node, is_required: bool = True) -> TensorMetadata:
+    metadata = node.meta.get("tensor_meta", None)
+    if is_required and metadata is None:
+        raise RuntimeError(
+            f"Callsite expects that ``tensor_meta`` exists in ``{node.name}``, "
+            f"but got None instead. Node: {node.op} {node.name} {node.target}"
+        )
+    return metadata
+
+
+def get_output(graph: fx.Graph) -> fx.Node:
+    """Take a graphmodule and return the graph output node.
+
+    We traverse in reverse to expedite it, with the idea that last node should be output
+    """
+    for node in reversed(graph.nodes):
+        if node.op == OP.OUTPUT:
+            return node
+    raise RuntimeError(f"Cannot find the output node in {graph}")
+
+
+def find_node(
+    graph: fx.Graph, predicate: Callable, reverse_order: bool = False
+) -> List[fx.Node]:
+    """Take a predicate and return all the nodes in the `graph` where the predicate holds."""
+    nodes = cast(Iterable[fx.Node], graph.nodes)
+    if reverse_order:
+        nodes = cast(Iterable[fx.Node], iter(reversed(nodes)))  # type: ignore[call-overload]
+    return [node for node in nodes if predicate(node)]
+
+
+def is_leaf_subgraph(graph: fx.Graph, subgraph: List[fx.Node]) -> bool:
+    """Ensure nodes in ``subgraph`` satisfy one of the following rules.
+
+    1. The user of the node is in ``subgraph``.
+    2. The user of the node is output.
+    3. There are no users -- the node is a side-effect node.
+    """
+    all_nodes: Set[fx.Node] = set(subgraph)
+    output = get_output(graph)
+    for node in subgraph:
+        for user in node.users:
+            if not isinstance(user, fx.Node):
+                continue
+            if user not in all_nodes and user != output:
+                return False
+    return True
+
+
+def clone_subgraph(
+    graph: fx.Graph, subgraph: List[fx.Node], target: fx.Node
+) -> List[fx.Node]:
+    """Clone the given subgraph and insert it before ``target``.
+
+    This API currently does not support inserting after ``target``.
+    """
+    all_nodes = set(subgraph)
+    mapping: Dict[fx.Node, fx.Node] = dict()
+    cloned_subgraph = []
+    with graph.inserting_before(target):
+        for node in subgraph:
+            cloned_node = graph.call_function(
+                node.target, node.args, node.kwargs, node.type
+            )
+            # TODO: there are many flatten/unflatten in IterGraph that
+            # can be simplified with tree_map. Will simplify this in
+            # a follow-up PR.
+            original_input = pytree.arg_tree_leaves(*node.args, **node.kwargs)
+            cloned_input, spec = tree_flatten((cloned_node.args, cloned_node.kwargs))
+            mapped_cloned_input = []
+            for original_input_node, cloned_input_node in zip(
+                original_input, cloned_input
+            ):
+                if (
+                    isinstance(original_input_node, fx.Node)
+                    and original_input_node in all_nodes
+                ):
+                    assert original_input_node in mapping
+                    mapped_cloned_input.append(mapping[original_input_node])
+                else:
+                    mapped_cloned_input.append(cloned_input_node)
+            cloned_node.args, cloned_node.kwargs = tree_unflatten(
+                mapped_cloned_input, spec
+            )
+            mapping[node] = cloned_node
+            cloned_subgraph.append(cloned_node)
+
+    return cloned_subgraph
+
+
+def rebuild_graph(gm: fx.GraphModule, remove_dead_code: bool = True) -> None:
+    """Run the required steps to ensure production-ready graph.
+
+    Note - per the fx docs, elimination of dead code is not very precise.
+    Hence, the flag to make this step optional.
+    """
+    gm.graph.lint()
+    if remove_dead_code:
+        gm.graph.eliminate_dead_code()
+    gm.recompile()
+
+
+def dump_graphs_to_files(graphs: Dict[str, fx.GraphModule], folder: str = "") -> str:
+    if not folder:
+        folder = tempfile.mkdtemp()
+
+    for prefix, gm in graphs.items():
+        with open(os.path.join(folder, f"{prefix}.graph"), "w") as fp:
+            fp.write(str(gm))
+
+    logger.warning("Dump graphs to %s", folder)
+
+    return folder
diff --git a/MLPY/Lib/site-packages/torch/distributed/_spmd/iter_graph_module.py b/MLPY/Lib/site-packages/torch/distributed/_spmd/iter_graph_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..11576db9161daad1e7c669cead76a33b7f5bc397
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_spmd/iter_graph_module.py
@@ -0,0 +1,762 @@
+import copy
+import inspect
+import logging
+from typing import Any, Callable, cast, Dict, List, Optional, Set, Tuple, Type
+
+import torch.nn as nn
+from torch import fx
+from torch.distributed._spmd.graph_utils import (
+    clone_subgraph,
+    get_output,
+    is_leaf_subgraph,
+)
+from torch.distributed._spmd.partial_lower import partial_lower
+from torch.fx.graph import _PyTreeCodeGen, PythonCode
+from torch.fx.node import Argument
+from torch.profiler import record_function
+from torch.utils import _pytree as pytree
+from torch.utils._pytree import tree_flatten, tree_map, tree_map_only, tree_unflatten
+
+
+logger: logging.Logger = logging.getLogger("IterGraphModule")
+
+
+class IterGraph(fx.Graph):
+    """``IterGraph`` is used to perform cross-iteration optimization.
+
+    ``IterGraph`` keeps track of 3 graphs: self (the original graph), the setup
+    graph, and the cleanup graph. The 3 graphs should be identical copies of a
+    ``fx.Graph``.
+
+    IterGraph subclasses fx.Graph to override the necessary APIs that are used
+    when constructing an optimization, e.g., communication fusion. IterGraph also
+    provides APIs that originally belong to fx.Node; all these APIs have the
+    ``node_`` prefix. For example, ``IterGraph.node_prepend`` is the equivalent
+    of ``fx.Node.prepend``. Note that all the optimizations must be constructed
+    using these APIs.
+    """
+
+    def __init__(
+        self,
+        orig_graph: fx.Graph,
+        setup_graph: fx.Graph,
+        cleanup_graph: fx.Graph,
+        owning_module: Optional[fx.GraphModule] = None,
+        tracer_cls: Optional[Type["fx.Tracer"]] = None,
+        tracer_extras: Optional[Dict[str, Any]] = None,
+    ):
+        super().__init__(owning_module, tracer_cls, tracer_extras)
+
+        output_vals = self.graph_copy(orig_graph, {}, return_output_node=True)
+        # TODO: if we do ``deepcopy(_codegen)`` and the input argument contains
+        # a dictionary with the form of Dict[torch.Tensor, Any], the
+        # torch.fx._pytree.tree_flatten_spec will not be able to flatten the
+        # dict -- the torch.Tensor will be duplicated because the _input_spec
+        # will save the ``keys`` of a dictionary (the values are not saved).
+        self._codegen = copy.deepcopy(orig_graph._codegen)
+        assert isinstance(output_vals, tuple)
+        output_val, old_output_val = output_vals
+        super().output(output_val, type_expr=getattr(old_output_val, "type", None))
+
+        self.setup_graph = setup_graph
+        self.cleanup_graph = cleanup_graph
+        self._all_graphs: Tuple[fx.Graph, ...] = (
+            self.setup_graph,
+            self.cleanup_graph,
+            cast(fx.Graph, super()),
+        )
+
+        self._setup_mapping: Dict[fx.Node, fx.Node] = {}
+        self._cleanup_mapping: Dict[fx.Node, fx.Node] = {}
+        self._freeze_cross_iter_movement = False
+        self._cross_iter_block_count = 0
+
+        for node, setup_node, cleanup_node in zip(
+            self.nodes, self.setup_graph.nodes, self.cleanup_graph.nodes
+        ):
+            self._setup_mapping[node] = setup_node
+            self._cleanup_mapping[node] = cleanup_node
+
+        self.num_extra_output = 0
+
+    def _lookup_node(self, node: fx.Node, graph: fx.Graph) -> Optional[fx.Node]:
+        if graph == self.setup_graph:
+            return self._setup_mapping.get(node, None)
+        elif graph == self.cleanup_graph:
+            return self._cleanup_mapping.get(node, None)
+        return node
+
+    def _fx_graph_call(
+        self, graph: fx.Graph, func: str, *args: Any, **kwargs: Any
+    ) -> Any:
+        fx_graph: fx.Graph = graph if graph != self else cast(fx.Graph, super())
+        return getattr(fx_graph, func)(*args, **kwargs)
+
+    def _insert_context(self, func: str, node: fx.Node):
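+        # Open the insertion context (e.g. inserting_after) on all three graphs
+        # at once and return a single context manager that closes them together.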
+        class _InsertPoint:
+            def __init__(self, insert_points: List[Any]):
+                self.insert_points = insert_points
+
+            def __enter__(self):
+                pass
+
+            def __exit__(self, type, value, tb):
+                for insert_point in self.insert_points:
+                    insert_point.__exit__(type, value, tb)
+
+        insert_points = []
+        for graph in self._all_graphs:
+            if node:
+                actual_node = self._lookup_node(node, graph)
+                assert actual_node is not None, "Cannot handle None case now."
+            else:
+                actual_node = node
+            insert_points.append(getattr(graph, func)(actual_node))
+
+        return _InsertPoint(insert_points)
+
+    def inserting_after(self, node):
+        if self._freeze_cross_iter_movement:
+            return super().inserting_after(node)
+        return self._insert_context("inserting_after", node)
+
+    def inserting_before(self, node):
+        if self._freeze_cross_iter_movement:
+            return super().inserting_before(node)
+        return self._insert_context("inserting_before", node)
+
+    def _forward_subgraph_inputs(
+        self, subgraph: List[fx.Node], graph: fx.Graph, erase_node: bool
+    ) -> int:
+        """Turn the inputs of a subgraph into the extra output of the entire graph.
+
+        If ``erase_node`` is True, the subgraph will be erased from the graph -- essentially forwarding
+        the inputs of the subgraph to the output of the graph.
+        """
+        output = get_output(graph)
+        inputs = []
+        all_nodes: Set[fx.Node] = set(subgraph)
+
+        for node in subgraph:
+            node_inputs = pytree.arg_tree_leaves(*node.args, **node.kwargs)
+            for _input in node_inputs:
+                if not isinstance(_input, fx.Node):
+                    continue
+                if _input in all_nodes:
+                    continue
+                inputs.append(_input)
+
+        if erase_node:
+            # We have to remove the nodes in reverse order to ensure each
+            # node has zero users when it is erased.
+            erased = set()
+            for node in reversed(subgraph):
+                if len(node.users) == 1:
+                    key = next(iter(node.users.keys()))
+                    if key == output:
+                        flatten_args, spec = tree_flatten((output.args, output.kwargs))
+                        if node not in flatten_args:
+                            # This optimizer node comes from the legacy _SPMD tracing.
+                            node.users.clear()
+                        elif str(node.target).startswith("aten.copy_"):
+                            # This is the case where the optimizer is
+                            # functionalized with copy_.
+                            for i in range(len(flatten_args)):
+                                if flatten_args[i] == node:
+                                    flatten_args[i] = node.args[0]
+                        else:
+                            # We have not figured out semantics of forwarding
+                            # all diff ops.
+                            raise RuntimeError(
+                                f"IterGraph does not how to forward the output of {node}"
+                            )
+                        output.args, output.kwargs = tree_unflatten(flatten_args, spec)
+
+                # This is the step case where there is a virtual data dependency
+                # (in-place update) between step and optimizer, and
+                # functionalize_optim adds this dependency.
+                for user in list(node.users.keys()):
+                    if user in erased:
+                        node.users.pop(user)
+                if node.users:
+                    raise RuntimeError(
+                        "IterGraph has not supported moving the nodes that "
+                        "produce users output result. "
+                        f"Error node: {node}."
+                    )
+                self._fx_graph_call(graph, "erase_node", node)
+                erased.add(node)
+
+        # Add all the extra output nodes into a list and append the list to
+        # the original output.args[0].
+        if self.num_extra_output:
+            # If the extra-output list already exists, just use it.
+            cast(List[fx.Node], output.args[0][-1]).extend(inputs)  # type: ignore[index]
+            new_output = output.args[0]
+        else:
+            # When adding the extra-output list, out_spec of _PyTreeCodeGen
+            # must be updated accordingly.
+            if isinstance(graph._codegen, _PyTreeCodeGen):
+                codegen = graph._codegen
+                new_output = list(output.args[0])  # type: ignore[arg-type]
+                new_output.append(inputs)
+                assert codegen.pytree_info.out_spec is not None
+                original_tree_out = tree_unflatten(
+                    cast(List[Any], output.args[0]), codegen.pytree_info.out_spec
+                )
+                # Use None as a placeholder. If we used the extra-output list,
+                # the list would be flattened as well and put into out_spec.
+                _, out_spec = tree_flatten((original_tree_out, None))
+                codegen.pytree_info = codegen.pytree_info._replace(out_spec=out_spec)
+            else:
+                new_output = (output.args[0], inputs)
+        self._fx_graph_call(graph, "erase_node", output)
+        self._fx_graph_call(graph, "output", new_output)
+
+        logger.info("Extended outputs from the subgraph inputs: %s", str(inputs))
+        return len(inputs)
+
+    def _forward_inputs_to_subgraph(
+        self, subgraph: List[fx.Node], graph: fx.Graph, extra_input: int
+    ) -> None:
+        """Create extra input nodes and forward the input nodes to the ``subgraph``.
+
+        The external input nodes of ``subgraph`` (nodes that are not in ``subgraph``) will be replaced
+        by the newly created input nodes.
+        """
+        placeholders = [node for node in graph.nodes if str(node.op) == "placeholder"]
+        assert placeholders, "No placeholders are found"
+        # Append the extra input nodes to the current input nodes.
+        with self._fx_graph_call(graph, "inserting_after", placeholders[-1]):
+            new_input_nodes = list(
+                reversed(
+                    [
+                        self._fx_graph_call(
+                            graph,
+                            "placeholder",
+                            f"cross_iter_input_{self._cross_iter_block_count}_{i}",
+                        )
+                        for i in reversed(range(extra_input))
+                    ]
+                )
+            )
+
+        # Update the inputs of subgraph to use the newly created input nodes.
+        all_nodes = set(subgraph)
+        new_input_index = 0
+        for node in subgraph:
+            node_inputs, spec = tree_flatten((node.args, node.kwargs))
+            new_node_inputs = []
+            for input_node in node_inputs:
+                if not isinstance(input_node, fx.Node) or input_node in all_nodes:
+                    new_node_inputs.append(input_node)
+                else:
+                    new_node_inputs.append(new_input_nodes[new_input_index])
+                    new_input_index += 1
+            node.args, node.kwargs = tree_unflatten(new_node_inputs, spec)
+        assert new_input_index == len(
+            new_input_nodes
+        ), f"More inputs than needed {len(new_input_nodes)} > {new_input_index}"
+
+        # Update the in_spec of _PyTreeCodeGen if in_spec is not None (the new
+        # SPMD makes in_spec as None).
+        if (
+            isinstance(graph._codegen, _PyTreeCodeGen)
+            and graph._codegen.pytree_info.in_spec is not None
+        ):
+            codegen = graph._codegen
+            original_tree_in = tree_unflatten(placeholders, codegen.pytree_info.in_spec)
+            _, in_spec = tree_flatten(tuple(list(original_tree_in) + new_input_nodes))
+            codegen.pytree_info = codegen.pytree_info._replace(in_spec=in_spec)
+            for new_input in new_input_nodes:
+                codegen.pytree_info.orig_args.append(new_input.name)
+            codegen.pytree_info = codegen.pytree_info._replace(in_spec=in_spec)
+
+    def move_to_next_iter_before(
+        self, subgraph: List[fx.Node], target_node: fx.Node
+    ) -> None:
+        """Move the ``subgraph`` to the next iteration before ``target_node``.
+
+        The ``subgraph`` is a list of fx.Node and must satisfy the following
+        restrictions:
+            1. The order of the nodes in ``subgraph`` must obey the topological
+               sort order.
+            2. The users of the node in ``subgraph`` must be one of the following:
+                a.) the user is also a node in ``subgraph``.
+                b.) the user is the output of the full graph.
+                c.) the node has no users (it is a side-effect node).
+        """
+        if self._freeze_cross_iter_movement:
+            raise RuntimeError(
+                "The cross-iteration movement has been frozen for the given "
+                "IterGraph."
+            )
+
+        if not is_leaf_subgraph(self, subgraph):
+            raise ValueError(
+                "The target nodes for ``move_to_next_iter_before`` must "
+                "satisfy one of the following conditions: 1) the user of the "
+                "node is in the target nodes, 2) the user is the output of the "
+                "graph, 3) there are no users -- the node is a side-effect node. "
+            )
+
+        self._cross_iter_block_count += 1
+        # The main graph must be the last one to be modified. Otherwise, the
+        # mapping may change and hence introduce incorrect mapping for setup
+        # and cleanup graphs.
+
+        # For the setup graph, no additional input is needed but additional
+        # outputs will be created. The additional output represents the input of
+        # the action to be moved to the next iteration -- main graph.
+        setup_subgraph: List[fx.Node] = []
+        for node in subgraph:
+            mapped_node = self._lookup_node(node, self.setup_graph)
+            assert mapped_node is not None
+            setup_subgraph.append(mapped_node)
+        setup_extra_input = self._forward_subgraph_inputs(
+            subgraph=setup_subgraph,
+            graph=self.setup_graph,
+            erase_node=True,
+        )
+
+        # For the cleanup graph, additional input is required to get the output
+        # from the last iteration -- main graph. Additional nodes are also
+        # needed to perform the action moved from the last iteration.
+        target_cleanup_node = self._lookup_node(target_node, self.cleanup_graph)
+        assert target_cleanup_node is not None, "The target_cleanup_node is None."
+        cleanup_subgraph: List[fx.Node] = []
+        for node in subgraph:
+            mapped_node = self._lookup_node(node, self.cleanup_graph)
+            assert mapped_node is not None
+            cleanup_subgraph.append(mapped_node)
+        cloned_subgraph = clone_subgraph(
+            self.cleanup_graph,
+            cleanup_subgraph,
+            target=target_cleanup_node,
+        )
+        self._forward_inputs_to_subgraph(
+            cloned_subgraph, self.cleanup_graph, setup_extra_input
+        )
+
+        # For the main graph, additional input will be created to represent
+        # the output from the last iteration -- main graph or setup graph.
+        # Additional output will also be generated to represent the input for
+        # the next iteration -- the main graph or the cleanup graph.
+        main_extra_input = self._forward_subgraph_inputs(
+            subgraph=subgraph, graph=self, erase_node=False
+        )
+        assert main_extra_input == setup_extra_input
+        for node in subgraph:
+            target_node.prepend(node)
+        self._forward_inputs_to_subgraph(subgraph, self, main_extra_input)
+
+        # TODO: This is a temporary solution. We are going to remove DCE usage
+        # or have something to replace fx DCE.
+        for node in self.cleanup_graph.nodes:
+            if len(node.users) == 0:
+                node.users["__hold__"] = None  # type: ignore[index]
+        for node in self.nodes:
+            if len(node.users) == 0:
+                node.users["__hold__"] = None  # type: ignore[index]
+        self.num_extra_output += main_extra_input
+
+    def move_before(self, nodes: List[fx.Node], target_node: fx.Node) -> None:
+        for graph in self._all_graphs:
+            actual_nodes = [self._lookup_node(node, graph) for node in nodes]
+            actual_target_node = self._lookup_node(target_node, graph)
+            assert actual_target_node is not None
+            for actual_node in actual_nodes:
+                actual_target_node.prepend(actual_node)
+
+    def move_after(self, nodes: List[fx.Node], target_node: fx.Node) -> None:
+        for graph in self._all_graphs:
+            actual_nodes = [self._lookup_node(node, graph) for node in nodes]
+            actual_target_node = self._lookup_node(target_node, graph)
+            for actual_node in actual_nodes:
+                assert actual_target_node is not None
+                actual_target_node.append(actual_node)
+                actual_target_node = actual_node
+
+    def call_function(
+        self,
+        the_function: Callable[..., Any],
+        args: Optional[Tuple[Argument, ...]] = None,
+        kwargs: Optional[Dict[str, Argument]] = None,
+        type_expr: Optional[Any] = None,
+    ) -> fx.Node:
+        if self._freeze_cross_iter_movement:
+            return super().call_function(the_function, args, kwargs, type_expr)
+
+        setup_args = tree_map(
+            lambda arg: self._lookup_node(arg, self.setup_graph)
+            if isinstance(arg, fx.Node)
+            else arg,
+            args,
+        )
+        setup_kwargs = tree_map(
+            lambda arg: self._lookup_node(arg, self.setup_graph)
+            if isinstance(arg, fx.Node)
+            else arg,
+            kwargs,
+        )
+        cleanup_args = tree_map(
+            lambda arg: self._lookup_node(arg, self.cleanup_graph)
+            if isinstance(arg, fx.Node)
+            else arg,
+            args,
+        )
+        cleanup_kwargs = tree_map(
+            lambda arg: self._lookup_node(arg, self.cleanup_graph)
+            if isinstance(arg, fx.Node)
+            else arg,
+            kwargs,
+        )
+
+        setup_node = self.setup_graph.call_function(
+            the_function, setup_args, setup_kwargs, type_expr
+        )
+        main_node = super().call_function(the_function, args, kwargs, type_expr)
+        cleanup_node = self.cleanup_graph.call_function(
+            the_function, cleanup_args, cleanup_kwargs, type_expr
+        )
+        self._setup_mapping[main_node] = setup_node
+        self._cleanup_mapping[main_node] = cleanup_node
+        return main_node
+
+    def erase_node(self, to_erase: fx.Node) -> None:
+        if self._freeze_cross_iter_movement:
+            return super().erase_node(to_erase)
+
+        setup_node = self._lookup_node(to_erase, self.setup_graph)
+        assert setup_node is not None, "setup_node is None"
+        self.setup_graph.erase_node(setup_node)
+        super().erase_node(to_erase)
+        cleanup_node = self._lookup_node(to_erase, self.cleanup_graph)
+        self.cleanup_graph.erase_node(cleanup_node)
+
+    def placeholder(
+        self,
+        name: str,
+        type_expr: Optional[Any] = None,
+        default_value: Any = inspect.Signature.empty,
+    ) -> fx.Node:
+        if self._freeze_cross_iter_movement:
+            return super().placeholder(name, type_expr, default_value)
+
+        main_placeholder = super().placeholder(name, type_expr, default_value)
+        setup_placeholder = self.setup_graph.placeholder(name, type_expr, default_value)
+        cleanup_placeholder = self.cleanup_graph.placeholder(
+            name, type_expr, default_value
+        )
+        self._setup_mapping[main_placeholder] = setup_placeholder
+        self._cleanup_mapping[main_placeholder] = cleanup_placeholder
+        return main_placeholder
+
+    def output(self, result: Argument, type_expr: Optional[Any] = None) -> fx.Node:
+        if self._freeze_cross_iter_movement:
+            return super().output(result, type_expr)
+
+        main_output = super().output(result, type_expr)
+        setup_result = tree_map(
+            lambda _result: self._lookup_node(_result, self.setup_graph)
+            if isinstance(_result, fx.Node)
+            else _result,
+            result,
+        )
+        cleanup_result = tree_map(
+            lambda _result: self._lookup_node(_result, self.cleanup_graph)
+            if isinstance(_result, fx.Node)
+            else _result,
+            result,
+        )
+        self.setup_graph.output(setup_result, type_expr)
+        self.cleanup_graph.output(cleanup_result, type_expr)
+
+        return main_output
+
+    def lint(self) -> None:
+        self.setup_graph.lint()
+        super().lint()
+        self.cleanup_graph.lint()
+
+    def node_prepend(self, target_node: fx.Node, node: fx.Node) -> None:
+        """Prepend node to target_node."""
+        if self._freeze_cross_iter_movement:
+            target_node.prepend(node)
+            return
+
+        for graph in self._all_graphs:
+            actual_node = self._lookup_node(node, graph)
+            assert actual_node is not None, "The node is None"
+            actual_target_node = self._lookup_node(target_node, graph)
+            assert actual_target_node is not None, "The target node is None"
+            actual_target_node.prepend(actual_node)
+
+    def node_append(self, target_node: fx.Node, node: fx.Node) -> None:
+        """Append node to target_node."""
+        if self._freeze_cross_iter_movement:
+            target_node.append(node)
+            return
+
+        for graph in self._all_graphs:
+            actual_node = self._lookup_node(node, graph)
+            assert actual_node is not None, f"The actual node is None, {node}."
+            actual_target_node = self._lookup_node(target_node, graph)
+            assert (
+                actual_target_node is not None
+            ), f"The actual target node is None, {target_node}."
+            actual_target_node.append(actual_node)
+
+    def node_set_args(self, node: fx.Node, args: Tuple[Argument, ...]) -> None:
+        if self._freeze_cross_iter_movement:
+            node.args = args
+            return
+
+        setup_args = tree_map_only(
+            fx.Node, lambda _arg: self._lookup_node(_arg, self.setup_graph), args
+        )
+        setup_node = self._lookup_node(node, self.setup_graph)
+        assert setup_node is not None
+        setup_node.args = setup_args
+        cleanup_args = tree_map_only(
+            fx.Node, lambda _arg: self._lookup_node(_arg, self.cleanup_graph), args
+        )
+        cleanup_node = self._lookup_node(node, self.cleanup_graph)
+        assert cleanup_node is not None
+        cleanup_node.args = cleanup_args
+        node.args = args
+
+    def node_set_kwargs(self, node: fx.Node, kwargs: Dict[str, Argument]) -> None:
+        if self._freeze_cross_iter_movement:
+            node.kwargs = kwargs
+            return
+
+        setup_kwargs = tree_map_only(
+            fx.Node, lambda _arg: self._lookup_node(_arg, self.setup_graph), kwargs
+        )
+        setup_node = self._lookup_node(node, self.setup_graph)
+        assert setup_node is not None
+        setup_node.kwargs = setup_kwargs
+        cleanup_kwargs = tree_map_only(
+            fx.Node, lambda _arg: self._lookup_node(_arg, self.cleanup_graph), kwargs
+        )
+        cleanup_node = self._lookup_node(node, self.cleanup_graph)
+        assert cleanup_node is not None
+        cleanup_node.kwargs = cleanup_kwargs
+        node.kwargs = kwargs
+
+    def node_replace_all_uses_with(
+        self,
+        node: fx.Node,
+        replace_with: fx.Node,
+        delete_user_cb: Callable[[fx.Node], bool] = lambda user: True,
+        *,
+        propagate_meta=False,
+    ) -> List[fx.Node]:
+        for graph in self._all_graphs:
+            actual_node = self._lookup_node(node, graph)
+            actual_replace_with = self._lookup_node(replace_with, graph)
+            assert actual_node is not None
+            ret = actual_node.replace_all_uses_with(
+                actual_replace_with,
+                delete_user_cb,
+                propagate_meta=propagate_meta,
+            )
+        return ret  # type: ignore[possibly-undefined]
+
+    def node_add_user(self, node: fx.Node, user: Any) -> None:
+        for graph in self._all_graphs:
+            actual_node = self._lookup_node(node, graph)
+            if isinstance(user, fx.Node):
+                actual_user_node = self._lookup_node(user, graph)
+            else:
+                actual_user_node = user
+            assert actual_node is not None
+            actual_node.users[actual_user_node] = None  # type: ignore[index]
+
+    def node_remove_user(self, node: fx.Node, user: Any) -> None:
+        for graph in self._all_graphs:
+            actual_node = self._lookup_node(node, graph)
+            if isinstance(user, fx.Node):
+                actual_user_node = self._lookup_node(user, graph)
+            else:
+                actual_user_node = user
+            assert actual_node is not None
+            del actual_node.users[actual_user_node]  # type: ignore[arg-type]
+
+    def keep_unused_nodes(self) -> None:
+        for node in self.nodes:
+            if len(node.users) == 0 and str(node.op) != "output":
+                self.node_add_user(node, "__hold__")
+
+    def functionalize_optim(self) -> None:
+        # IterGraph only supports full graphs (fwd+bwd+optim). Because the
+        # optimizer is not a functional call (it is an in-place op), this
+        # method adds explicit users to the optimizer and step calls so that
+        # they are not dropped. It makes strong assumptions about the optimizer
+        # and may not always work; it is intended as a temporary solution only.
+
+        # TODO: remove this API after DCE is removed
+        for node in reversed(self.nodes):
+            if node.name.startswith("output"):
+                output_node = node
+            elif node.name.startswith(
+                "_fused_adam_",
+            ):
+                optim_node = node
+            elif node.name.startswith(
+                "_foreach_add_",
+            ):
+                step_node = node
+                self.node_add_user(optim_node, output_node)  # type: ignore[possibly-undefined]
+                self.node_add_user(step_node, optim_node)  # type: ignore[possibly-undefined]
+
+    def defunctionalize_optim(self) -> None:
+        # TODO: remove this API after DCE is not used with IterGraph
+        for graph in self._all_graphs:
+            for node in reversed(graph.nodes):
+                if node.name.startswith("output"):
+                    output_node = node
+                elif node.name.startswith(
+                    "_fused_adam_",
+                ):
+                    optim_node = node
+                elif node.name.startswith(
+                    "_foreach_add_",
+                ):
+                    step_node = node
+                    optim_node.users.pop(output_node, None)  # type: ignore[possibly-undefined]
+                    step_node.users.pop(optim_node, None)  # type: ignore[possibly-undefined]
+
+    def freeze_cross_iter_movement(self) -> None:
+        self._freeze_cross_iter_movement = True
+
+
+class IterGraphModule(nn.Module):
+    """``IterGraphModule`` provides the ability to do cross-iteration optimization.
+
+    Given a ``fx.GraphModule``, main_gm, ``IterGraphModule`` internally
+    duplicates it into three copies and redirects each ``forward`` call to one
+    of them based on the iteration count. This allows users to perform graph
+    optimizations that span iterations (e.g., moving a collective wait from the
+    backward pass to the forward pass of the next iteration).
+
+    Note that users must call the APIs provided by ``IterGraphModule`` or
+    ``IterGraph`` to rewrite the graph so that ``IterGraphModule`` can keep the
+    data dependencies consistent across all three graphs.
+    """
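+
+    # Usage sketch (illustrative only, not part of the class): ``traced_gm`` is
+    # assumed to be an ``fx.GraphModule`` produced by the _spmd frontend and
+    # ``batches`` an iterable of positional-argument tuples.
+    #
+    #     igm = IterGraphModule(traced_gm, max_iters=len(batches))
+    #     igm.finalize_setup()
+    #     for i, batch in enumerate(batches):
+    #         out = igm(*batch, last_iter=(i == len(batches) - 1))
+    #
+    # Iteration 1 runs the setup graph, the final iteration runs the cleanup
+    # graph, and every iteration in between runs the (optimized) main graph.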
+
+    def __init__(
+        self,
+        main_gm: fx.GraphModule,
+        max_iters: int = -1,
+        enable_inductor: bool = False,
+    ) -> None:
+        super().__init__()
+
+        def _copy_gm(src: fx.GraphModule, graph: fx.Graph) -> fx.GraphModule:
+            gm = fx.GraphModule(src, graph)
+            gm.meta = getattr(graph, "meta", {})
+            return gm
+
+        self.setup_gm = _copy_gm(main_gm, copy.deepcopy(main_gm.graph))
+        self.cleanup_gm = _copy_gm(main_gm, copy.deepcopy(main_gm.graph))
+        self.main_gm = _copy_gm(
+            main_gm,
+            IterGraph(main_gm.graph, self.setup_gm.graph, self.cleanup_gm.graph),
+        )
+
+        self._iter = 0
+        self._max_iters = max_iters
+        self._previous_output: Tuple[Any, ...] = tuple()
+        self._num_extra_output = 0
+        self._is_frozen = False
+        self._enable_inductor = enable_inductor
+
+    def finalize_setup(self) -> None:
+        """Set up the internal states; this is also the signal from the user that the maximum iteration count has been decided.
+
+        This method must be called before ``forward()`` is called.
+        """
+        if not self._is_frozen:
+            self.graph.freeze_cross_iter_movement()
+            self._num_extra_output = self.graph.num_extra_output
+            if self._enable_inductor:
+                self.main_gm = partial_lower(self.main_gm)
+            self._is_frozen = True
+
+        self._iter = 0
+
+    def _run(self, gm: fx.GraphModule, last_iter: bool, *args, **kwargs) -> Any:
+        if self._num_extra_output > 0:
+            new_args = args + self._previous_output
+            output = gm(*new_args, **kwargs)
+            if not last_iter:
+                assert len(output) == 2
+                self._previous_output = tuple(output[-1])
+                assert (
+                    len(self._previous_output) > 0
+                ), "There should be at least one extra output."
+                output = output[0]
+        else:
+            # No cross-iteration optimization is done. Simply call the
+            # GraphModule.
+            output = gm(*args, **kwargs)
+        return output
+
+    def forward(self, *args: Any, last_iter: bool = False, **kwargs: Any) -> Any:
+        self._iter += 1
+        last_iter = last_iter or self._iter == self._max_iters
+        if last_iter:
+            logger.info("Using the cleanup graph")
+            gm = self.cleanup_gm
+            profiler_string = "## IterGraphModule: Cleanup Graph ##"
+            self._iter = 0
+        elif self._iter == 1:
+            logger.info("Using the setup graph")
+            gm = self.setup_gm
+            profiler_string = "## IterGraphModule: Setup Graph ##"
+        else:
+            gm = self.main_gm
+            if self._iter == 2:
+                logger.info("Using the main graph")
+                profiler_string = "## IterGraphModule -- Maybe Compiling ##"
+            else:
+                profiler_string = "## IterGraphModule ##"
+
+        with record_function(profiler_string):
+            return self._run(gm, last_iter, *args, **kwargs)
+
+    @property
+    def graph(self) -> IterGraph:
+        return cast(IterGraph, self.main_gm.graph)
+
+    def recompile(self) -> PythonCode:
+        self.setup_gm.recompile()
+        self.cleanup_gm.recompile()
+        return self.main_gm.recompile()
+
+    def freeze_cross_iter_movement(self) -> None:
+        # TODO: remove this API once it is not used.
+        self.graph.freeze_cross_iter_movement()
+        self._num_extra_output = self.graph.num_extra_output
+
+    def print_readable(self, print_output: bool = True) -> str:
+        return self.main_gm.print_readable(print_output)
+
+    def print_all_graphs(self) -> None:
+        logger.info("Printing the three fx.Graphs:")
+        logger.info("1. Setup fx.Graph:")
+        logger.info("%s", self.setup_gm.graph)
+        logger.info("2. Main fx.Graph:")
+        logger.info("%s", self.main_gm.graph)
+        logger.info("3. Cleanup fx.Graph:")
+        logger.info("%s", self.cleanup_gm.graph)
+
+    def print_all_graph_modules(self) -> None:
+        logger.info("Printing the three fx.GraphModules:")
+        logger.info("1. Setup fx.GraphModule:")
+        logger.info("%s", self.setup_gm.print_readable(False))
+        logger.info("2. Main fx.GraphModule:")
+        logger.info("%s", self.main_gm.print_readable(False))
+        logger.info("3. Cleanup fx.GraphModule:")
+        logger.info("%s", self.cleanup_gm.print_readable(False))
diff --git a/MLPY/Lib/site-packages/torch/distributed/_spmd/log_utils.py b/MLPY/Lib/site-packages/torch/distributed/_spmd/log_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..c792e8649c96851ffd7c9ba1df12d4dce67a9bbe
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_spmd/log_utils.py
@@ -0,0 +1,78 @@
+import logging
+import logging.config
+import os
+from typing import Optional
+
+import torch.distributed as dist
+
+
+LOGGING_CONFIG = {
+    "version": 1,
+    "formatters": {
+        "spmd_format": {"format": "%(name)s: [%(levelname)s] %(message)s"},
+        "graph_opt_format": {"format": "%(name)s: [%(levelname)s] %(message)s"},
+    },
+    "handlers": {
+        "spmd_console": {
+            "class": "logging.StreamHandler",
+            "level": "DEBUG",
+            "formatter": "spmd_format",
+            "stream": "ext://sys.stdout",
+        },
+        "graph_opt_console": {
+            "class": "logging.StreamHandler",
+            "level": "DEBUG",
+            "formatter": "graph_opt_format",
+            "stream": "ext://sys.stdout",
+        },
+        "null_console": {
+            "class": "logging.NullHandler",
+        },
+    },
+    "loggers": {
+        "spmd_exp": {
+            "level": "DEBUG",
+            "handlers": ["spmd_console"],
+            "propagate": False,
+        },
+        "graph_opt": {
+            "level": "DEBUG",
+            "handlers": ["graph_opt_console"],
+            "propagate": False,
+        },
+        "null_logger": {
+            "handlers": ["null_console"],
+            "propagate": False,
+        },
+        # TODO(anj): Add loggers for MPMD
+    },
+    "disable_existing_loggers": False,
+}
+
+
+def get_logger(log_type: str) -> Optional[logging.Logger]:
+    from torch.distributed._spmd import config
+
+    if "PYTEST_CURRENT_TEST" not in os.environ:
+        logging.config.dictConfig(LOGGING_CONFIG)
+        avail_loggers = list(LOGGING_CONFIG["loggers"].keys())  # type: ignore[attr-defined]
+        assert (
+            log_type in avail_loggers
+        ), f"Unable to find {log_type} in the available list of loggers {avail_loggers}"
+
+        if not dist.is_initialized():
+            return logging.getLogger(log_type)
+
+        if dist.get_rank() == 0:
+            logger = logging.getLogger(log_type)
+            logger.setLevel(config.log_level)
+            if config.log_file_name is not None:
+                log_file = logging.FileHandler(config.log_file_name)
+                log_file.setLevel(config.log_level)
+                logger.addHandler(log_file)
+        else:
+            logger = logging.getLogger("null_logger")
+
+        return logger
+
+    return logging.getLogger("null_logger")
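+
+
+if __name__ == "__main__":
+    # Illustrative sketch (not part of the module), assuming a torch build with
+    # distributed support: fetch one of the pre-configured loggers. Without an
+    # initialized process group this returns a plain stdout logger; under
+    # pytest, or on non-zero ranks of an initialized job, the null logger is
+    # returned instead.
+    example_logger = get_logger("graph_opt")
+    if example_logger is not None:
+        example_logger.info("graph_opt logging is configured")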
diff --git a/MLPY/Lib/site-packages/torch/distributed/_spmd/parallel_mode.py b/MLPY/Lib/site-packages/torch/distributed/_spmd/parallel_mode.py
new file mode 100644
index 0000000000000000000000000000000000000000..da719c8086eb48750b900fe8d969099b87d7ef1e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_spmd/parallel_mode.py
@@ -0,0 +1,216 @@
+from abc import ABC, abstractmethod
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+import torch
+import torch.distributed as dist
+import torch.utils._pytree as pytree
+from torch._subclasses import FakeTensorMode
+from torch.distributed._spmd.data_parallel import (
+    DataParallelStyle,
+    partition_data_parallel,
+)
+from torch.distributed._spmd.distribute import _convert_to_distributed, Schema
+from torch.distributed._tensor import DeviceMesh, Placement, Replicate, Shard
+
+from torch.fx import GraphModule
+
+
+class ParallelMode(ABC):
+    """
+    Basic Parallel Mode interface. Each parallelism pattern should implement
+    this interface to describe how to partition and compile the graph in the
+    spmd compiler.
+    """
+
+    @abstractmethod
+    def partition(
+        self,
+        gm: GraphModule,
+        model: torch.nn.Module,
+        optimizer: Optional[torch.optim.Optimizer],
+        params_and_buffers: Dict[str, Any],
+        named_states: Dict[str, Any],
+        args: Tuple[Any, ...],
+        kwargs: Dict[str, Any],
+    ) -> GraphModule:
+        """
+        Partition a single device graph to a distributed graph.
+
+        TODO(@wanchaol): some of these arguments are not necessary for
+        partitioning, remove the unnecessary ones later.
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def transform_and_compile(self, gm: GraphModule) -> GraphModule:
+        """
+        Transform and compile a distributed graph with a set of graph
+        transformation and optimization passes for each parallel mode.
+
+        The returned result should be a compiled executable graph in
+        the distributed environment.
+        """
+        # TODO: add more necessary arguments to this interface.
+        raise NotImplementedError()
+
+
+class DataParallel(ParallelMode):
+    """Data Parallelism mode."""
+
+    def __init__(
+        self,
+        parallel_style: str = "replicate",
+        *,
+        input_batch_dim: int = 0,
+        custom_passes: Optional[Callable[[GraphModule], GraphModule]] = None,
+    ):
+        """
+        DataParallel mode that partitions the model and graph into data parallel
+        style parallelism (i.e. DDP/FSDP/ZeRO-3). It currently supports three different
+        parallel styles: "replicate", "fully_shard", and "default". See
+        :class:`DataParallelStyle` for more details.
+
+        Args:
+            parallel_style (str): parallel style to use. Currently supports
+                "replicate", "fully_shard", and "default".
+
+        Keyword args:
+            input_batch_dim (int): the batch dimension of the input tensor.
+                 default: 0
+            custom_passes (Callable[[GraphModule], GraphModule], optional):
+                A custom callable that overrides the default graph transformation
+                and optimization passes.
+        """
+        if parallel_style == "replicate":
+            self.parallel_style = DataParallelStyle.REPLICATE
+        elif parallel_style == "fully_shard":
+            self.parallel_style = DataParallelStyle.FULLY_SHARD
+        elif parallel_style == "default":
+            self.parallel_style = DataParallelStyle.DEFAULT
+        else:
+            raise RuntimeError(f"Unknown parallel style: {parallel_style}")
+
+        # TODO: what if the user passes in an incorrect `input_batch_dim`? How
+        # should we detect that and do proper error handling?
+        self.input_batch_dim = input_batch_dim
+
+        if custom_passes is not None:
+            self._gm_passes: Callable[[GraphModule], GraphModule] = custom_passes
+        else:
+            # TODO: add a few default passes here.
+            self._gm_passes = lambda gm: gm
+
+    def partition(
+        self,
+        gm: GraphModule,
+        model: torch.nn.Module,
+        optimizer: Optional[torch.optim.Optimizer],
+        params_and_buffers: Dict[str, Any],
+        named_states: Dict[str, Any],
+        args: Tuple[Any, ...],
+        kwargs: Dict[str, Any],
+    ) -> GraphModule:
+        # TODO: figure out a way to avoid explicit "cuda" mesh.
+        mesh = DeviceMesh("cuda", torch.arange(dist.get_world_size()))
+
+        gm = partition_data_parallel(
+            gm,
+            model,
+            optimizer,
+            params_and_buffers,
+            named_states,
+            args,
+            kwargs,
+            mesh,
+            self.parallel_style,
+            self.input_batch_dim,
+        )
+        return gm
+
+    def transform_and_compile(self, gm: GraphModule) -> GraphModule:
+        """optimize a distributed graph with a set of optimization passes"""
+        # TODO: add more necessary arguments to this interface.
+        return self._gm_passes(gm)
+
+
+class DTensorExpandMode(ParallelMode):
+    """
+    The DTensor Expand mode. It replicates the parameters and shards the
+    inputs to represent DDP-like behavior. It is currently a transient mode
+    used before we move to the new data parallel expansion.
+    """
+
+    def __init__(
+        self, custom_passes: Optional[Callable[[GraphModule], GraphModule]] = None
+    ):
+        self._placements_override: Dict[int, List[Placement]] = {}
+        if custom_passes is not None:
+            self._gm_passes: Callable[[GraphModule], GraphModule] = custom_passes
+        else:
+            # TODO: add a few default passes here.
+            self._gm_passes = lambda gm: gm
+
+    def partition(
+        self,
+        gm: GraphModule,
+        model: torch.nn.Module,
+        optimizer: Optional[torch.optim.Optimizer],
+        params_and_buffers: Dict[str, Any],
+        named_states: Dict[str, Any],
+        args: Tuple[Any, ...],
+        kwargs: Dict[str, Any],
+    ) -> GraphModule:
+        flat_args = pytree.arg_tree_leaves(*args, **kwargs)
+
+        mesh = DeviceMesh("cuda", torch.arange(dist.get_world_size()).cuda())
+        shard_schema: Schema = Schema(mesh=mesh, placements=[Shard(0)])
+        # FIXME: allow other sharding schemas
+        replicate_schema: Schema = Schema(mesh=mesh, placements=[Replicate()])
+
+        inps, schemas = [], []
+
+        for p in pytree.tree_leaves(params_and_buffers):
+            assert isinstance(p, torch.Tensor), f"expecting Tensor but got {type(p)}"
+            inps.append(p)
+            schemas.append(replicate_schema)
+
+        for o in pytree.tree_leaves(named_states):
+            if isinstance(o, torch.Tensor):
+                inps.append(o)
+                schemas.append(replicate_schema)
+            else:
+                inps.append(torch.empty(0))
+                schemas.append(replicate_schema)
+
+        for a in flat_args:
+            if isinstance(a, torch.Tensor):
+                inps.append(a)
+                if id(a) in self._placements_override:
+                    schemas.append(
+                        Schema(mesh=mesh, placements=self._placements_override[id(a)])
+                    )
+                else:
+                    schemas.append(shard_schema)
+            else:
+                # Create dummy tensor and schema for non-tensor inputs for
+                # the purpose of dtensor expansion. Non-tensor inputs are
+                # guaranteed unused in dispatcher graphs produced by make_fx.
+                # However, we still need to respect them so that tensor inputs
+                # match with their placeholders.
+                inps.append(torch.empty(0))
+                schemas.append(shard_schema)
+
+        with FakeTensorMode(allow_non_fake_inputs=True):
+            fake_inps = [torch.empty_like(inp) for inp in inps]
+
+        return _convert_to_distributed(
+            gm, fake_inps, schemas, default_mesh=mesh, _allow_partial=False
+        )[0]
+
+    def transform_and_compile(self, gm: GraphModule) -> GraphModule:
+        """
+        Transform and compile a distributed graph with a set of graph transformation
+        and optimization passes for the dtensor fallback parallel mode.
+        """
+        # TODO: move the transformation passes to this function
+        return self._gm_passes(gm)
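+
+
+if __name__ == "__main__":
+    # Illustrative sketch (not part of the module): constructing a parallel
+    # mode only records its configuration. ``partition`` and
+    # ``transform_and_compile`` are driven later by the _spmd compiler with a
+    # traced graph and an initialized process group, so they are not called here.
+    replicate_mode = DataParallel("replicate", input_batch_dim=0)
+    shard_mode = DataParallel("fully_shard")
+    fallback_mode = DTensorExpandMode()
+    print(replicate_mode.parallel_style, shard_mode.parallel_style, type(fallback_mode).__name__)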
diff --git a/MLPY/Lib/site-packages/torch/distributed/_spmd/partial_lower.py b/MLPY/Lib/site-packages/torch/distributed/_spmd/partial_lower.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bdd6fd85fb425b43728e88586090feffec05a84
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_spmd/partial_lower.py
@@ -0,0 +1,268 @@
+# This file is copied from Meta internal repo and is not synced with the
+# internal version. Once the internal version is fully mature, we should
+# upstream again and retire the internal version. @yifuwang
+
+import logging
+import operator
+from typing import Callable, List, Optional, Set, Tuple
+
+from functorch import make_fx
+
+import torch
+
+from torch._inductor.compile_fx import compile_fx_inner
+from torch._inductor.decomposition import select_decomp_table
+
+MIN_ATEN_OPS_TO_LOWER = 10
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+def _create_subgraph_module(
+    inputs: List[torch.fx.Node], body: List[torch.fx.Node], outputs: List[torch.fx.Node]
+) -> torch.fx.GraphModule:
+    subgraph: torch.fx.Graph = torch.fx.Graph()
+    node_to_subgraph_node = {}
+    for idx, inp in enumerate(inputs):
+        subgraph_inp = subgraph.placeholder(name=f"arg_{idx}")
+        subgraph_inp.meta = inp.meta
+        node_to_subgraph_node[inp] = subgraph_inp
+
+    for node in body:
+        subgraph_node = subgraph.node_copy(
+            node, arg_transform=lambda x: node_to_subgraph_node[x]
+        )
+        node_to_subgraph_node[node] = subgraph_node
+
+    subgraph.output(result=tuple(node_to_subgraph_node[x] for x in outputs))
+    subgraph.eliminate_dead_code()
+    subgraph.lint()
+    return torch.fx.GraphModule(root={}, graph=subgraph)
+
+
+def _is_container_node(node: torch.fx.Node) -> bool:
+    if any(user.target == operator.getitem for user in node.users):
+        assert all(user.target == operator.getitem for user in node.users), (
+            "Malformed graph: a container node is used as input for non-getitem nodes."
+            "\nNode: {fmt_node}\nUsers: {fmt_users}".format(
+                fmt_node=node.format_node(),
+                fmt_users="\n".join(u.format_node() for u in node.users),
+            )
+        )
+        return True
+    return False
+
+
+def _lower_subgraph_nodes(
+    gm: torch.fx.GraphModule,
+    subgraph_name: str,
+    subgraph_nodes: List[torch.fx.Node],
+    dumper: Callable[[str], str],
+) -> None:
+    prologue: List[torch.fx.Node] = []
+    inputs: List[torch.fx.Node] = []
+    body: List[torch.fx.Node] = []
+    visible: Set[torch.fx.Node] = set()
+
+    # Inductor requires all graph inputs to be tensors. When adding a container
+    # node as a subgraph input, add its descendant getitem nodes to the subgraph
+    # prologue and add its leaf getitem nodes to the subgraph inputs.
+    def add_input(arg: torch.fx.Node) -> None:
+        stack = [arg]
+        while len(stack) != 0:
+            node = stack.pop()
+            if _is_container_node(node):
+                # We should only prepone nodes within subgraph_nodes
+                prologue.extend(user for user in node.users if user in subgraph_nodes)
+                stack.extend(node.users)
+            else:
+                if node not in visible:
+                    inputs.append(node)
+                    visible.add(node)
+
+    for node in subgraph_nodes:
+        if node.op == "get_attr":
+            # Prepone get_attr to avoid having to copy
+            # the attribute to the subgraph module.
+            inputs.append(node)
+            visible.add(node)
+            continue
+
+        for arg in node.all_input_nodes:
+            if arg not in visible:
+                add_input(arg)
+
+        if node not in prologue:
+            body.append(node)
+            visible.add(node)
+
+    outputs: List[torch.fx.Node] = []
+
+    # Inductor requires all graph outputs to be tensors. When adding a container
+    # node as a subgraph output, add its descendant getitem nodes to the subgraph
+    # body and add its leaf getitem nodes to the subgraph outputs.
+    def add_output(output: torch.fx.Node) -> None:
+        stack = [output]
+        while len(stack) != 0:
+            node = stack.pop()
+            if _is_container_node(node):
+                body.extend(node.users)
+                stack.extend(node.users)
+            elif not all(user in visible for user in node.users):
+                if node not in outputs:
+                    outputs.append(node)
+
+    for node in body:
+        if not all(user in visible for user in node.users):
+            add_output(node)
+
+    assert len(inputs) == len(set(inputs))
+    assert len(outputs) == len(set(outputs))
+
+    subgraph_module = _create_subgraph_module(inputs, body, outputs)
+    readable_tag = dumper(str(subgraph_module.graph))
+    setattr(gm, subgraph_name, _InductorModule(subgraph_module))
+
+    insertion_point = subgraph_nodes[-1].next
+    for node in prologue:
+        insertion_point.prepend(node)
+
+    with gm.graph.inserting_before(insertion_point):
+        # Insert subgraph call
+        subgraph_call = gm.graph.create_node(
+            op="call_module",
+            target=subgraph_name,
+            args=tuple(inputs),
+            kwargs={"tag": readable_tag},
+        )
+        # Replace parent graph nodes with their corresponding subgraph outputs
+        for idx, output in enumerate(outputs):
+            new_output = gm.graph.create_node(
+                op="call_function",
+                target=operator.getitem,
+                args=(subgraph_call, idx),
+            )
+            new_output.meta = output.meta
+            output.replace_all_uses_with(new_output)
+
+    # Erase lowered nodes from the parent graph
+    for node in reversed(body + outputs):
+        if len(node.users) == 0:
+            gm.graph.erase_node(node)
+
+
+class _InductorModule(torch.nn.Module):
+    def __init__(self, gm: torch.fx.GraphModule) -> None:
+        super().__init__()
+        self.gm = gm
+        self.compiled: Optional[
+            Callable[[List[torch.Tensor]], List[torch.Tensor]]
+        ] = None
+
+    def forward(self, *args: torch.Tensor, tag: str) -> List[torch.Tensor]:
+        if self.compiled is None:
+            inductor_decompositions = select_decomp_table()
+            # TODO: figure out why turning on cudagraphs causes exceptions.
+            decomp_gm = make_fx(self.gm, decomposition_table=inductor_decompositions)(
+                *args
+            )
+            logger.info("Lowering subgraph (%s) to Inductor...", tag)
+            self.compiled = compile_fx_inner(
+                decomp_gm,
+                list(args),
+                cudagraphs=False,
+            )
+            logger.info("Completed lowering subgraph (%s) to Inductor", tag)
+        with torch.profiler.record_function(tag):
+            assert self.compiled is not None
+            return self.compiled(list(args))
+
+
+def _is_inductor_compatible(node: torch.fx.Node) -> Tuple[bool, str]:
+    # `has_tag` is not supported yet
+    # if has_tag(node, "non_lowerable"):
+
+    if node.target in (
+        torch.ops.aten._fused_adam_.default,
+        torch.ops.aten._fused_adam.default,
+        torch.ops.aten._foreach_add_.Scalar,
+        torch.ops.aten._foreach_add.Scalar,
+    ):
+        return False, "fused adam is not supported yet"
+
+    # TODO(yifu): apparently having a meta kernel is not a necessary
+    # condition for Inductor compatibility. We should refine the check.
+    # Sneaking this one in for now to support comm_fusion_with_cat.
+    if node.target == torch.ops.aten.flatten.using_ints:
+        return True, ""
+
+    if isinstance(node.target, torch._ops.OpOverload):
+        if not node.target.has_kernel_for_dispatch_key(torch._C.DispatchKey.Meta):
+            return False, f"{node.target} doesn't have a meta kernel registered"
+    return True, ""
+
+
+def _subgraph_predicate(nodes: List[torch.fx.Node]) -> bool:
+    num_aten_ops = len([n for n in nodes if str(n.target).startswith("aten.")])
+    return num_aten_ops >= MIN_ATEN_OPS_TO_LOWER
+
+
+def partial_lower(
+    gm: torch.fx.GraphModule,
+    node_predicate: Callable[[torch.fx.Node], bool] = lambda x: True,
+    subgraph_predicate: Callable[[List[torch.fx.Node]], bool] = lambda x: True,
+    dumper: Callable[[str], str] = lambda x: "subgraph",
+) -> torch.fx.GraphModule:
+    """
+    Lower Inductor-compatible portions of the graph module to Inductor.
+
+    Args:
+        node_predicate: user predicate for determining whether to consider a node for
+            lowering.
+        subgraph_predicate: user predicate for determining whether to consider a list of
+            candidate nodes for lowering.
+        dumper: a callback for dumping subgraphs for human inspection. For example, it
+            can be a function that writes to disk/blob storage and returns the
+            path/handle. The returned path/handle for each subgraph is made
+            available in the subgraph call node in the parent graph and is also
+            used as the label of the profiler block for the subgraph.
+    """
+    nodes_per_subgraph: List[List[torch.fx.Node]] = [[]]
+    ptr = next(iter(gm.graph.nodes))
+
+    def _node_predicate(node: torch.fx.Node) -> Tuple[bool, str]:
+        should_lower, reason = _is_inductor_compatible(node)
+        if not should_lower:
+            return should_lower, reason
+        if not node_predicate(node):
+            return False, "user predicate"
+        return True, ""
+
+    while ptr.op != "output":
+        if ptr.op == "placeholder":
+            ptr = ptr.next
+            continue
+        should_lower, reason = _node_predicate(ptr)
+        if should_lower:
+            nodes_per_subgraph[-1].append(ptr)
+        else:
+            if len(nodes_per_subgraph[-1]) > 0:
+                logger.warning(
+                    "partial_lower: graph break at %s. Reason: %s", str(ptr), reason
+                )
+            nodes_per_subgraph.append([])
+        ptr = ptr.next
+
+    nodes_per_subgraph = [
+        nodes
+        for nodes in nodes_per_subgraph
+        if subgraph_predicate(nodes) and _subgraph_predicate(nodes)
+    ]
+
+    for idx, subgraph_nodes in enumerate(nodes_per_subgraph):
+        subgraph_name = f"subgraph_{idx}"
+        _lower_subgraph_nodes(gm, subgraph_name, subgraph_nodes, dumper)
+
+    gm.graph.lint()
+    gm.recompile()
+    return gm
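+
+
+# Usage sketch (illustrative only): given an aten-level ``fx.GraphModule``
+# ``gm`` (e.g. produced by ``make_fx``), the call below replaces every
+# Inductor-compatible region containing at least MIN_ATEN_OPS_TO_LOWER aten
+# ops with a ``call_module`` node targeting an ``_InductorModule``; the actual
+# Inductor compilation is deferred until that subgraph is first executed.
+#
+#     lowered_gm = partial_lower(
+#         gm,
+#         node_predicate=lambda n: True,        # consider every node
+#         dumper=lambda graph_str: "subgraph",  # or persist graph_str and return a path
+#     )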
diff --git a/MLPY/Lib/site-packages/torch/distributed/_state_dict_utils.py b/MLPY/Lib/site-packages/torch/distributed/_state_dict_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ce134241ae2a1b067553402aff8783b40cc6934
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_state_dict_utils.py
@@ -0,0 +1,385 @@
+import io
+import math
+from typing import Any, Callable, Dict, Optional, Tuple, TYPE_CHECKING
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+from torch.distributed._functional_collectives import AsyncCollectiveTensor
+
+if dist.is_available() or TYPE_CHECKING:
+    from torch.distributed import distributed_c10d
+    from torch.distributed._shard.sharded_tensor import ShardedTensor
+    from torch.distributed._tensor import DTensor, Replicate
+
+
+def _identity_func(
+    obj: torch.Tensor,
+    pg: Optional[dist.ProcessGroup],
+    device: Optional[torch.device],
+    companion_obj: Any,
+) -> torch.Tensor:
+    return obj
+
+
+def _all_gather_sharded_tensor(
+    sharded_tensor: "ShardedTensor",
+    pg: Optional[dist.ProcessGroup] = None,
+    device: Optional[torch.device] = None,
+) -> torch.Tensor:
+    if pg is None:
+        pg = distributed_c10d._get_default_group()
+    world_size = dist.get_world_size(pg)
+    shards = sharded_tensor.local_shards()
+    dim_0_size = sharded_tensor.size()[0]  # type: ignore[index]
+    tensor_numel = sharded_tensor.size().numel()  # type: ignore[union-attr]
+    chunk_size = math.ceil(dim_0_size / world_size) * tensor_numel // dim_0_size
+    pg_device = (
+        distributed_c10d._get_pg_default_device(pg) if device is None else device
+    )
+    if shards:
+        local_tensor = shards[0].tensor.flatten()
+        if local_tensor.device.type != pg_device.type:
+            local_tensor = local_tensor.to(pg_device)
+        num_padding = chunk_size - local_tensor.numel()
+        if num_padding > 0:
+            local_tensor = F.pad(local_tensor, [0, num_padding])
+    else:
+        local_tensor = torch.zeros(
+            chunk_size, dtype=sharded_tensor.dtype, device=pg_device
+        )
+
+    tensor = torch.empty(
+        chunk_size * world_size,
+        dtype=local_tensor.dtype,
+        device=pg_device,
+    )
+    dist.all_gather_into_tensor(tensor, local_tensor, group=pg)
+
+    tensor = tensor.narrow(0, 0, tensor_numel).reshape(sharded_tensor.size())
+    return tensor
+
+
+class CompanionMismatch(Exception):
+    ...
+
+
+def _iterate_state_dict(
+    iter_object: Any,
+    sharded_tensor_func: Callable,
+    dtensor_func: Callable,
+    tensor_func: Callable,
+    *,
+    pg: Optional[dist.ProcessGroup] = None,
+    device: Optional[torch.device] = None,
+    cpu_offload: bool = False,
+    companion_obj: Any = None,
+    ranks_only: Tuple[int, ...] = tuple(),
+    type_check: bool = True,
+) -> Dict[str, Any]:
+    # TODO: should we use pytree?
+    cpu_device = torch.device("cpu")
+    if isinstance(iter_object, ShardedTensor):
+        ret = sharded_tensor_func(iter_object, pg, device, companion_obj)
+    elif isinstance(iter_object, DTensor):
+        ret = dtensor_func(iter_object, pg, device, companion_obj)
+    elif isinstance(iter_object, torch.Tensor):
+        ret = tensor_func(iter_object, pg, device, companion_obj)
+    elif (
+        isinstance(iter_object, (int, float, str, bytes, io.BytesIO))
+        or iter_object is None
+    ):
+        ret = iter_object
+    elif isinstance(iter_object, dict):
+        if companion_obj is not None and (
+            not isinstance(companion_obj, dict)
+            or set(companion_obj.keys()) != set(iter_object.keys())
+        ):
+            raise CompanionMismatch()
+
+        ret = {
+            key: _iterate_state_dict(
+                value,
+                sharded_tensor_func,
+                dtensor_func,
+                tensor_func,
+                pg=pg,
+                device=device,
+                cpu_offload=cpu_offload,
+                companion_obj=companion_obj[key] if companion_obj is not None else None,
+                ranks_only=ranks_only,
+                type_check=type_check,
+            )
+            for key, value in iter_object.items()
+        }
+    elif isinstance(iter_object, (list, tuple)):
+        if companion_obj is not None and (
+            not isinstance(companion_obj, (list, tuple))
+            or len(companion_obj) != len(iter_object)
+        ):
+            raise CompanionMismatch()
+
+        ret = [
+            _iterate_state_dict(
+                v,
+                sharded_tensor_func,
+                dtensor_func,
+                tensor_func,
+                pg=pg,
+                device=device,
+                cpu_offload=cpu_offload,
+                companion_obj=companion_obj[idx] if companion_obj is not None else None,
+                ranks_only=ranks_only,
+                type_check=type_check,
+            )
+            for idx, v in enumerate(iter_object)
+        ]
+        if isinstance(iter_object, tuple):
+            ret = tuple(ret)
+    elif not type_check:
+        ret = iter_object
+    else:
+        raise ValueError(f"Unexpected value type {type(iter_object)}")
+
+    if not ranks_only or dist.get_rank(pg) in ranks_only:
+        if isinstance(ret, torch.Tensor) and cpu_offload:
+            if companion_obj is None:
+                ret = ret.to(cpu_device)
+            else:
+                # TODO: support DTensor
+                companion_obj.copy_(ret, non_blocking=True)
+                ret = companion_obj
+    else:
+        ret = {} if isinstance(ret, dict) else None
+
+    return ret
+
+
+def _gather_state_dict(
+    state_dict: Dict[str, Any],
+    *,
+    pg: Optional[dist.ProcessGroup] = None,
+    device: Optional[torch.device] = None,
+    cpu_offload: bool = False,
+    ranks_only: Tuple[int, ...] = tuple(),
+    type_check: bool = True,
+) -> Dict[str, Any]:
+    """
+    Given a state_dict, this API gathers all the ShardedTensors or DTensors in
+    the state_dict.
+
+    Args:
+        state_dict (Dict[str, Any]): the target sharded state_dict.
+        pg (Optional[dist.ProcessGroup]): the process group that is used to
+            gather ShardedTensor. Note that gathering a DTensor will use
+            the DeviceMesh. So this argument will be ignored when gathering a
+            DTensor.
+        device: (Optional[torch.device]): the device that is used to
+            perform allgather for ShardedTensor. Note that gathering a DTensor
+            will use the DeviceMesh. So this argument will be ignored when
+            gathering a DTensor.
+        cpu_offload (bool): whether to offload the tensors to CPU memory. The
+            default value is False.
+        ranks_only: (Tuple[int, ...]): if this tuple is empty, all ranks will
+            have the same state_dicts. Otherwise only ranks in ``ranks_only``
+            have the same state_dicts. Other ranks will get empty state_dicts.
+        type_check: (bool): check if the instance data type is a supported type
+            that can be saved by DCP.  The current supported data types are
+            torch.Tensor, DTensor, int, float, str, list, dict, None.
+
+    Returns:
+        The gathered state dictionary.
+    """
+
+    def sharded_tensor_func(value, pg, device, companion_obj):
+        # ShardedTensor does not seem to record the original device type.
+        # So if the tensor is moved to CPU, we won't know the original type.
+        # As a result, we have to rely on the user to tell us the correct one.
+        cpu_device = torch.device("cpu")
+        output_tensor = _all_gather_sharded_tensor(value, pg, device)
+        local_shard_device = (
+            value.local_shards()[0].tensor.device
+            if value.local_shards()
+            else cpu_device
+        )
+        if output_tensor.device != local_shard_device:
+            value = output_tensor.to(local_shard_device)
+        else:
+            value = output_tensor
+        return value
+
+    def dtensor_func(value, pg, device, companion_obj):
+        if value.device != value.device_mesh.device_type:
+            value = value.to(value.device_mesh.device_type)
+        # FSDP all_gather: [Shard(0)] -> [Replicate()]
+        # HSDP all_gather: [Replicate(), Shard(0)] -> [Replicate(), Replicate()]
+        # 2D FSDP + TP all_gather:
+        # - [Shard(0), Shard(n)] -> [Replicate(), Replicate()]
+        # - [Shard(0), Replicate()] -> [Replicate(), Replicate()]
+        placements = [Replicate() for _ in value.placements]
+        value = value.redistribute(
+            device_mesh=value.device_mesh,
+            placements=placements,
+        )
+        # Call `wait()` to force the tensor to be synchronous with respect
+        # to the main stream.
+        # See the discussion in https://github.com/pytorch/pytorch/pull/117799.
+        value = value.to_local()
+        if isinstance(value, AsyncCollectiveTensor):
+            value = value.wait()
+        return value
+
+    return _iterate_state_dict(
+        state_dict,
+        sharded_tensor_func,
+        dtensor_func,
+        _identity_func,
+        pg=pg,
+        device=device,
+        cpu_offload=cpu_offload,
+        ranks_only=ranks_only,
+        type_check=type_check,
+    )
+
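+# Usage sketch (illustrative only): with an initialized process group and a
+# ``model`` whose state_dict contains ShardedTensor/DTensor values (e.g. an
+# FSDP-wrapped module), the full state_dict can be materialized on rank 0 as:
+#
+#     full_sd = _gather_state_dict(
+#         model.state_dict(), cpu_offload=True, ranks_only=(0,)
+#     )
+#     # rank 0 holds the unsharded state_dict on CPU; other ranks get {}.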
+
+def _offload_state_dict_to_cpu(
+    state_dict: Dict[str, Any],
+    *,
+    ranks_only: Tuple[int, ...] = tuple(),
+    cpu_offload_state_dict: Optional[Dict[str, Any]] = None,
+    cpu_offload_sync: bool = True,
+    type_check: bool = True,
+) -> Dict[str, Any]:
+    """
+    Given a state_dict, this API offloads all the tensors to CPU memory.
+
+    Args:
+        state_dict (Dict[str, Any]): the target state_dict.
+        ranks_only: (Tuple[int, ...]): if this tuple is empty, all ranks will
+            have the same state_dicts. Otherwise only ranks in ``ranks_only``
+            have the same state_dicts. Other ranks will get empty state_dicts.
+        cpu_offload_state_dict (Optional[Dict[str, Any]]): the CPU state_dict
+            that will be returned. If this is not None, this API will use
+            `copy_` to copy the GPU tensor to the tensor in this CPU state_dict.
+            This CPU state_dict must have exactly the same structure as the
+            `state_dict` the only difference is that all the tensors in this
+            CPU state_dict are on CPU memory.
+        cpu_offload_sync: (bool): flag to decide whether to call `synchronize()`
+            before this API returns.
+        type_check: (bool): check if the instance data type is a supported type
+            that can be saved by DCP.  The current supported data types are
+            torch.Tensor, DTensor, int, float, str, list, dict, None.
+
+    Returns:
+        The offloaded state dictionary.
+    """
+
+    ret = _iterate_state_dict(
+        state_dict,
+        _identity_func,
+        _identity_func,
+        _identity_func,
+        pg=None,
+        device=None,
+        cpu_offload=True,
+        ranks_only=ranks_only,
+        companion_obj=cpu_offload_state_dict,
+        type_check=type_check,
+    )
+    if cpu_offload_state_dict is not None and cpu_offload_sync:
+        torch.cuda.synchronize()
+    return ret
+
+
+def _create_cpu_state_dict(
+    state_dict: Dict[str, Any], pin_memory: bool = False, share_memory: bool = False
+) -> Dict[str, Any]:
+    """
+    Given a state_dict, create another state_dict with the same structure and elements.
+    However, all tensors in the returned state_dict are new tensors on CPU. These
+    tensors can be placed in pinned memory or shared memory based on the provided arguments.
+    """
+
+    if pin_memory and share_memory:
+        raise ValueError(
+            "Cannot allocate memory with both pin_memory and share_memory set"
+        )
+
+    def tensor_func(
+        obj: torch.Tensor,
+        pg: Optional[dist.ProcessGroup],
+        device: Optional[torch.device],
+        companion_obj: Any,
+    ) -> torch.Tensor:
+        if len(obj.size()) == 0:
+            return torch.tensor(0, dtype=obj.dtype)
+
+        if share_memory:
+            return torch.empty(
+                *tuple(companion_obj.size()), dtype=companion_obj.dtype
+            ).share_memory_()
+        else:
+            t = torch.empty(
+                *tuple(companion_obj.size()), dtype=companion_obj.dtype
+            )
+            # Only pin the memory when explicitly requested; a plain CPU tensor
+            # is returned otherwise.
+            return t.pin_memory() if pin_memory else t
+
+    ret = _iterate_state_dict(
+        state_dict,
+        _identity_func,
+        _identity_func,
+        tensor_func,
+        pg=None,
+        device=None,
+        cpu_offload=False,
+        ranks_only=tuple(),
+        companion_obj=state_dict,
+        type_check=False,
+    )
+    return ret
+
+
+def _check_state_dict_similarity(
+    state_dict: Dict[str, Any],
+    compared_state_dict: Dict[str, Any],
+) -> bool:
+    """
+    Given two state_dicts, check whether the structures are the same: if a
+    [key, tensor] pair exists in one state_dict, there must be a corresponding
+    pair, [key, other_tensor], in the other state_dict, where tensor and
+    other_tensor have the same size and dtype.
+
+    Return the check result.
+    """
+
+    def tensor_func(
+        obj: torch.Tensor,
+        pg: Optional[dist.ProcessGroup],
+        device: Optional[torch.device],
+        companion_obj: Any,
+    ) -> torch.Tensor:
+        if companion_obj.dtype != obj.dtype or companion_obj.size() != obj.size():
+            raise CompanionMismatch()
+        return obj
+
+    try:
+        _iterate_state_dict(
+            state_dict,
+            _identity_func,
+            _identity_func,
+            tensor_func,
+            pg=None,
+            device=None,
+            cpu_offload=False,
+            ranks_only=tuple(),
+            companion_obj=compared_state_dict,
+            type_check=False,
+        )
+    except CompanionMismatch:
+        return False
+
+    return True
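+
+
+if __name__ == "__main__":
+    # Illustrative sketch (not part of the module): build a shared-memory CPU
+    # copy of a plain state_dict and check that the two dicts have the same
+    # structure, sizes, and dtypes. Runs on a single process; no process group
+    # or accelerator is required.
+    source_sd = {"weight": torch.randn(4, 4), "bias": torch.randn(4), "step": 3}
+    cpu_sd = _create_cpu_state_dict(source_sd, share_memory=True)
+    assert _check_state_dict_similarity(source_sd, cpu_sd)
+    print("keys:", sorted(cpu_sd.keys()))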
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/__init__.py b/MLPY/Lib/site-packages/torch/distributed/_tensor/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c67c146cc9c40dd9b0d697a3dd9724bb693ac39
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tensor/__init__.py
@@ -0,0 +1,342 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+from typing import Optional, Sequence
+
+# Import all builtin dist tensor ops
+import torch
+import torch.distributed._tensor.ops
+import torch.distributed._tensor.random as random
+from torch.distributed._tensor._utils import compute_local_shape
+from torch.distributed._tensor.api import distribute_module, distribute_tensor, DTensor
+from torch.distributed._tensor.ops.utils import normalize_to_torch_size
+from torch.distributed._tensor.placement_types import Placement, Replicate, Shard
+from torch.distributed.device_mesh import _mesh_resources, DeviceMesh, init_device_mesh
+
+# All public APIs from dtensor package
+__all__ = [
+    "DTensor",
+    "DeviceMesh",
+    "distribute_tensor",
+    "distribute_module",
+    "init_device_mesh",
+    "Shard",
+    "Replicate",
+]
+
+
+def _dtensor_init_helper(
+    init_op,
+    size: torch.Size,
+    device_mesh=None,
+    placements=None,
+    **kwargs,
+) -> DTensor:
+    # if device_mesh is None, use the one from mesh resources
+    device_mesh = device_mesh or _mesh_resources.get_current_mesh()
+    kwargs["device"] = device_mesh.device_type
+
+    # set default placements to replicated if not specified
+    placements = placements or tuple(Replicate() for _ in range(device_mesh.ndim))
+
+    # check device_mesh against placements
+    assert device_mesh.ndim == len(
+        placements
+    ), "mesh dimension does not match the length of placements"
+
+    assert kwargs["layout"] == torch.strided, "layout value not supported!"
+    torch_stride = torch._prims_common.make_contiguous_strides_for(size)
+
+    # get local tensor shape
+    local_shape = compute_local_shape(size, device_mesh, placements)
+    # initialize the local tensor
+    if init_op == torch.full:
+        fill_value = kwargs.pop("fill_value", 0)
+        local_tensor = init_op(local_shape, fill_value, **kwargs)
+    elif init_op == torch.rand or init_op == torch.randn:
+        # this tensor meta is not used except `shape`
+        dtype = kwargs.get("dtype", torch.get_default_dtype())
+
+        from torch.distributed._tensor.placement_types import DTensorSpec, TensorMeta
+
+        tensor_meta = TensorMeta(size, (0,), dtype)
+        spec = DTensorSpec(device_mesh, placements, tensor_meta=tensor_meta)
+
+        if random.is_rng_supported_mesh(device_mesh) and not random._rng_tracker:
+            random._rng_tracker = random.OffsetBasedRNGTracker()
+
+        assert random._rng_tracker is not None
+        with random._rng_tracker._distribute_region(spec):
+            local_tensor = init_op(local_shape, **kwargs)
+    else:
+        local_tensor = init_op(local_shape, **kwargs)
+
+    return DTensor(
+        local_tensor=local_tensor,
+        device_mesh=device_mesh,
+        placements=tuple(placements),
+        shape=size,
+        dtype=local_tensor.dtype,
+        stride=torch_stride,
+        requires_grad=kwargs["requires_grad"],
+    )
+
+
+def ones(
+    *size,
+    dtype: Optional[torch.dtype] = None,
+    layout: torch.layout = torch.strided,
+    requires_grad: bool = False,
+    device_mesh: Optional[DeviceMesh] = None,
+    placements: Optional[Sequence[Placement]] = None,
+) -> DTensor:
+    """
+    Returns a :class:`DTensor` filled with the scalar value 1, with the shape defined
+    by the variable argument ``size``.
+
+    Args:
+        size (int...): a sequence of integers defining the shape of the output :class:`DTensor`.
+            Can be a variable number of arguments or a collection like a list or tuple.
+            E.g.: ones(1,2,3..) or ones([1,2,3..]) or ones((1,2,3..))
+
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned :class:`DTensor`.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned DTensor.
+            Default: ``torch.strided``.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned :class:`DTensor`. Default: ``False``.
+        device_mesh: :class:`DeviceMesh` type, contains the mesh info of ranks
+        placements: a sequence of :class:`Placement` type: ``Shard``, ``Replicate``
+
+    Returns:
+        A :class:`DTensor` object on each rank
+    """
+    torch_size = normalize_to_torch_size(size)
+
+    return _dtensor_init_helper(
+        torch.ones,
+        torch_size,
+        dtype=dtype,
+        layout=layout,
+        requires_grad=requires_grad,
+        device_mesh=device_mesh,
+        placements=placements,
+    )
+
+
+def empty(
+    *size,
+    dtype: Optional[torch.dtype] = None,
+    layout: torch.layout = torch.strided,
+    requires_grad: bool = False,
+    device_mesh: Optional[DeviceMesh] = None,
+    placements: Optional[Sequence[Placement]] = None,
+) -> DTensor:
+    """
+    Returns a :class:`DTensor` filled with uninitialized data. The shape of the :class:`DTensor`
+    is defined by the variable argument ``size``.
+
+    Args:
+        size (int...): a sequence of integers defining the shape of the output :class:`DTensor`.
+            Can be a variable number of arguments or a collection like a list or tuple.
+            E.g.: empty(1,2,3..) or empty([1,2,3..]) or empty((1,2,3..))
+
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned :class:`DTensor`.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned :class:`DTensor`.
+            Default: ``torch.strided``.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned :class:`DTensor`. Default: ``False``.
+        device_mesh: :class:`DeviceMesh` type, contains the mesh info of ranks
+        placements: a sequence of :class:`Placement` type: ``Shard``, ``Replicate``
+
+    Returns:
+        A :class:`DTensor` object on each rank
+    """
+    torch_size = normalize_to_torch_size(size)
+
+    return _dtensor_init_helper(
+        torch.empty,
+        torch_size,
+        dtype=dtype,
+        layout=layout,
+        requires_grad=requires_grad,
+        device_mesh=device_mesh,
+        placements=placements,
+    )
+
+
+def full(
+    size,
+    fill_value,
+    *,
+    dtype: Optional[torch.dtype] = None,
+    layout: torch.layout = torch.strided,
+    requires_grad: bool = False,
+    device_mesh: Optional[DeviceMesh] = None,
+    placements: Optional[Sequence[Placement]] = None,
+) -> DTensor:
+    """
+    Returns a :class:`DTensor` filled with ``fill_value``. The scalar value type should match
+        ``device_mesh.device_type``.
+
+    Args:
+        size (int...): a sequence of integers defining the shape of the output :class:`DTensor`.
+            Can be a variable number of arguments or a collection like a list or tuple.
+            E.g.: full((1,2,3..), fill_value) or full([1,2,3..], fill_value)
+        fill_value(Scalar): the value to fill the output tensor with.
+
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned :class:`DTensor`.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned DTensor.
+            Default: ``torch.strided``.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned :class:`DTensor`. Default: ``False``.
+        device_mesh: :class:`DeviceMesh` type, contains the mesh info of ranks.
+        placements: a sequence of :class:`Placement` type: ``Shard``, ``Replicate``
+
+    Returns:
+        A :class:`DTensor` object on each rank
+    """
+    torch_size = normalize_to_torch_size(size)
+
+    return _dtensor_init_helper(
+        torch.full,
+        torch_size,
+        fill_value=fill_value,
+        dtype=dtype,
+        layout=layout,
+        requires_grad=requires_grad,
+        device_mesh=device_mesh,
+        placements=placements,
+    )
+
+
+def rand(
+    *size,
+    requires_grad: bool = False,
+    dtype: Optional[torch.dtype] = None,
+    layout: torch.layout = torch.strided,
+    device_mesh: Optional[DeviceMesh] = None,
+    placements: Optional[Sequence[Placement]] = None,
+) -> DTensor:
+    """
+    Returns a :class:`DTensor` filled with random numbers from a uniform distribution
+        on the interval ``[0, 1)``. The shape of the tensor is defined by the variable
+        argument ``size``.
+
+    Args:
+        size (int...): a sequence of integers defining the shape of the output :class:`DTensor`.
+            Can be a variable number of arguments or a collection like a list or tuple.
+            E.g.: rand(1,2,3..) or rand([1,2,3..]) or rand((1,2,3..))
+
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned :class:`DTensor`.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned DTensor.
+            Default: ``torch.strided``.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned :class:`DTensor`. Default: ``False``.
+        device_mesh: :class:`DeviceMesh` type, contains the mesh info of ranks.
+        placements: a sequence of :class:`Placement` type: ``Shard``, ``Replicate``
+
+    Returns:
+        A :class:`DTensor` object on each rank
+    """
+    torch_size = normalize_to_torch_size(size)
+
+    return _dtensor_init_helper(
+        torch.rand,
+        torch_size,
+        dtype=dtype,
+        layout=layout,
+        requires_grad=requires_grad,
+        device_mesh=device_mesh,
+        placements=placements,
+    )
+
+
+def randn(
+    *size,
+    requires_grad: bool = False,
+    dtype: Optional[torch.dtype] = None,
+    layout: torch.layout = torch.strided,
+    device_mesh: Optional[DeviceMesh] = None,
+    placements: Optional[Sequence[Placement]] = None,
+) -> DTensor:
+    """
+    Returns a :class:`DTensor` filled with random numbers from a normal distribution
+        with mean 0 and variance 1. The shape of the tensor is defined by the variable
+        argument ``size``.
+
+    Args:
+        size (int...): a sequence of integers defining the shape of the output :class:`DTensor`.
+            Can be a variable number of arguments or a collection like a list or tuple.
+            E.g.: randn(1,2,3..) or randn([1,2,3..]) or randn((1,2,3..))
+
+    Keyword args:
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned :class:`DTensor`.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned DTensor.
+            Default: ``torch.strided``.
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned :class:`DTensor`. Default: ``False``.
+        device_mesh: :class:`DeviceMesh` type, contains the mesh info of ranks.
+        placements: a sequence of :class:`Placement` type: ``Shard``, ``Replicate``
+
+    Returns:
+        A :class:`DTensor` object on each rank
+    """
+    torch_size = normalize_to_torch_size(size)
+
+    return _dtensor_init_helper(
+        torch.randn,
+        torch_size,
+        dtype=dtype,
+        layout=layout,
+        requires_grad=requires_grad,
+        device_mesh=device_mesh,
+        placements=placements,
+    )
+
+
+def zeros(
+    *size,
+    requires_grad: bool = False,
+    dtype: Optional[torch.dtype] = None,
+    layout: torch.layout = torch.strided,
+    device_mesh: Optional[DeviceMesh] = None,
+    placements: Optional[Sequence[Placement]] = None,
+) -> DTensor:
+    """
+    Returns a :class:`DTensor` filled with the scalar value 0.
+
+    Args:
+        size (int...): a sequence of integers defining the shape of the output :class:`DTensor`.
+            Can be a variable number of arguments or a collection like a list or tuple.
+            E.g.: zeros(1,2,3..) or zeros([1,2,3..]) or zeros((1,2,3..))
+
+    Keyword args:
+        requires_grad (bool, optional): If autograd should record operations on the
+            returned :class:`DTensor`. Default: ``False``.
+        dtype (:class:`torch.dtype`, optional): the desired data type of returned :class:`DTensor`.
+            Default: if ``None``, uses a global default (see :func:`torch.set_default_dtype`).
+        layout (:class:`torch.layout`, optional): the desired layout of returned :class:`DTensor`.
+            Default: ``torch.strided``.
+        device_mesh: :class:`DeviceMesh` type, contains the mesh info of ranks
+        placements: a sequence of :class:`Placement` type: ``Shard``, ``Replicate``
+
+    Returns:
+        A :class:`DTensor` object on each rank
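+
+    Example (an illustrative sketch, not part of the upstream docstring; it assumes
+    a 2-rank job with an initialized default process group)::
+
+        >>> from torch.distributed._tensor import Replicate, zeros
+        >>> from torch.distributed.device_mesh import init_device_mesh
+        >>> mesh = init_device_mesh("cuda", (2,))
+        >>> # every rank holds a full (4, 4) tensor of zeros
+        >>> dt = zeros(4, 4, device_mesh=mesh, placements=[Replicate()])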
+    """
+    torch_size = normalize_to_torch_size(size)
+
+    return _dtensor_init_helper(
+        torch.zeros,
+        torch_size,
+        dtype=dtype,
+        layout=layout,
+        requires_grad=requires_grad,
+        device_mesh=device_mesh,
+        placements=placements,
+    )
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3b7f1a97ddde89cdf5e16281d7642924ee8daeca
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/_collective_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/_collective_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..053e8ffe80ea0f086a7cf87bee54693836a0bbf1
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/_collective_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bd29b73947615309db4837987dcf569a46b18ae9
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/api.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/api.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9dd4c3bd3fc81197e9e16ea5ef1fbb16b540f18b
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/api.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/device_mesh.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/device_mesh.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..84decc8de8cc2fbfeb326d783af0ef2f6563a149
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/device_mesh.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/dispatch.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/dispatch.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..16b21f5945b282c0ebe0fbcdebc7f68c453b1bb8
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/dispatch.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/op_schema.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/op_schema.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2b9cb1d58cf93db23b444f25add94c746ed0fda4
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/op_schema.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/placement_types.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/placement_types.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..05400981c736888571e4e4b0e250b60be723c333
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/placement_types.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/random.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/random.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..729142ab3d3140c73f7e7e1ce5c9be15eca71176
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/random.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/redistribute.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/redistribute.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..864abcc7f4510e0ea4e22ba526f0aab218fdfbfb
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/redistribute.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/sharding_prop.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/sharding_prop.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b038dd055692de693e3d7f743a9256ca454fcb54
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/sharding_prop.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/tp_conv.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/tp_conv.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f4aa1da6df197117d1b7e58beda383c8ad0f812b
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tensor/__pycache__/tp_conv.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/_collective_utils.py b/MLPY/Lib/site-packages/torch/distributed/_tensor/_collective_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..2eb5ba70d71991f1344daad89b22d8df57b54802
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tensor/_collective_utils.py
@@ -0,0 +1,313 @@
+import logging
+import math
+from dataclasses import dataclass
+from functools import lru_cache
+
+from typing import List, Optional
+
+import torch
+import torch.distributed._tensor.placement_types as placement_types
+from torch.distributed.device_mesh import _mesh_resources, DeviceMesh
+from torch.distributed.distributed_c10d import (
+    all_to_all,
+    broadcast,
+    get_global_rank,
+    get_rank,
+    get_world_size,
+    GroupMember,
+    ProcessGroup,
+    scatter,
+    Work,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# TODO: we need to migrate these APIs to be functional collectives
+
+
+def mesh_scatter(
+    output: torch.Tensor,
+    scatter_list: List[torch.Tensor],
+    mesh: DeviceMesh,
+    mesh_dim: int = 0,
+    async_op: bool = False,
+) -> Optional[Work]:
+    """
+    Scatter a list of tensors across a device mesh dimension. By default we
+    use the first rank of the mesh dimension as the source of truth, i.e.,
+    for a 2d mesh [[0, 1], [2, 3]], if we scatter on mesh_dim = 1, we
+    scatter the tensor list on rank 0 to ranks 0/1, and the tensor list on
+    rank 2 to ranks 2/3.
+
+    Args:
+        output (torch.Tensor): the tensor to receive the scattered result.
+        scatter_list (List[torch.Tensor]): the tensor list to be scattered.
+        mesh (DeviceMesh): the device mesh to scatter on.
+        mesh_dim (int, optional): the mesh dimension to scatter on; the first
+            rank on that mesh dimension is used as the source of truth.
+        async_op (bool, optional): whether to run the collective asynchronously.
+
+    Returns:
+        A :class:`Work` handle if ``async_op`` is True, otherwise ``None``
+        (``None`` is also returned for meta tensors, which skip communication).
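+
+    Example (an illustrative sketch, not part of the upstream docstring; it assumes
+    a 4-rank job with an initialized backend and one GPU per rank)::
+
+        >>> mesh = DeviceMesh("cuda", list(range(4)))
+        >>> output = torch.empty(2, device="cuda")
+        >>> to_scatter = [torch.full((2,), float(r), device="cuda") for r in range(4)]
+        >>> mesh_scatter(output, to_scatter, mesh, mesh_dim=0)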
+    """
+    # TODO: Ideally we should use the meta tensor way
+    # (to register a meta kernel for the collective op)
+    # so that it would avoid the communication. Need to
+    # remove the check below once that is done.
+    if output.is_meta:
+        return None
+    dim_group = mesh.get_group(mesh_dim)
+    assert isinstance(dim_group, ProcessGroup)
+    # src needs to be a global rank
+    src_for_dim = 0
+
+    if dim_group is not GroupMember.WORLD:
+        src_for_dim = get_global_rank(dim_group, 0)
+
+    if src_for_dim == get_rank():
+        fut = scatter(
+            output,
+            scatter_list=scatter_list,
+            src=src_for_dim,
+            group=dim_group,
+            async_op=async_op,
+        )
+    else:
+        fut = scatter(
+            output,
+            scatter_list=None,
+            src=src_for_dim,
+            group=dim_group,
+            async_op=async_op,
+        )
+
+    return fut
+
+
+def mesh_broadcast(
+    tensor: torch.Tensor,
+    mesh: DeviceMesh,
+    mesh_dim: int = 0,
+    async_op: bool = False,
+) -> Optional[Work]:
+    """
+    Broadcast the tensor to a device mesh dimension. By default we use the
+    first rank of the mesh dimension as the source of truth, i.e., for a
+    2d mesh [[0, 1], [2, 3]], if we broadcast on mesh_dim = 1, we broadcast
+    the tensor on rank 0 to ranks 0/1, and the tensor on rank 2 to ranks 2/3.
+
+    Args:
+        tensor (torch.Tensor): tensor to broadcast.
+        mesh (DeviceMesh): the device mesh to broadcast on.
+        mesh_dim (int, optional): the mesh dimension to broadcast on; the first
+            rank on that mesh dimension is used as the source of truth.
+        async_op (bool, optional): whether to run the collective asynchronously.
+
+    Returns:
+        A :class:`Work` handle if ``async_op`` is True, otherwise ``None``
+        (``None`` is also returned for meta tensors, which skip communication).
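+
+    Example (an illustrative sketch, not part of the upstream docstring; it assumes
+    a 4-rank job with an initialized backend and one GPU per rank)::
+
+        >>> mesh = DeviceMesh("cuda", list(range(4)))
+        >>> t = torch.empty(2, device="cuda") if get_rank() != 0 else torch.arange(2.0, device="cuda")
+        >>> # after the call, every rank on the mesh dimension holds rank 0's data in `t`
+        >>> mesh_broadcast(t, mesh, mesh_dim=0)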
+    """
+    # TODO: Ideally we should use the meta tensor way
+    # (to register a meta kernel for the collective op)
+    # so that it would avoid the communication. Need to
+    # remove the check below once that is done.
+    if tensor.is_meta:
+        return None
+    dim_group = mesh.get_group(mesh_dim)
+    assert isinstance(dim_group, ProcessGroup)
+    # src needs to be a global rank
+    src_for_dim = 0
+    if dim_group is not GroupMember.WORLD:
+        src_for_dim = get_global_rank(dim_group, 0)
+
+    return broadcast(tensor, src=src_for_dim, group=dim_group, async_op=async_op)
+
+
+# TODO: test uneven split on GLOO and NCCL
+def mesh_all_to_all(
+    output_tensor_list: List[torch.Tensor],
+    input_tensor_list: List[torch.Tensor],
+    mesh: DeviceMesh,
+    mesh_dim: int = 0,
+    async_op: bool = False,
+) -> Optional[Work]:
+    dim_group = mesh.get_group(mesh_dim)
+    assert isinstance(dim_group, ProcessGroup)
+
+    work = None
+    # no direct dist.all_to_all support on 'gloo' so we manually do scatters
+    if mesh.device_type == "cpu":
+        logger.warning(
+            "ProcessGroupGloo does not support all_to_all, falling back with scatters!"
+        )
+        # TODO: pull the handle of uneven case in #492
+        dim_group_size = get_world_size(dim_group)
+        for i in range(dim_group_size):
+            # src needs to be a global rank
+            src_for_dim = i
+            if dim_group is not GroupMember.WORLD:
+                src_for_dim = get_global_rank(dim_group, i)
+
+            work = scatter(
+                output_tensor_list[i],
+                input_tensor_list if mesh.get_rank() == src_for_dim else [],
+                group=dim_group,
+                src=src_for_dim,
+                async_op=async_op,
+            )
+    else:
+        work = all_to_all(
+            output_tensor_list,
+            input_tensor_list,
+            dim_group,
+            async_op=async_op,
+        )
+    return work
+
+
+def spec_to_bytes(spec: "placement_types.DTensorSpec") -> int:
+    assert spec.tensor_meta is not None, "spec should have tensor meta defined!"
+    return spec.tensor_meta.dtype.itemsize * math.prod(spec.shape)
+
+
+@dataclass
+class MeshTopoInfo:
+    """
+    Mesh information for collective cost estimation
+    """
+
+    mesh: DeviceMesh
+    mesh_dim_devices: List[int]
+    mesh_dim_bandwidth: List[float]
+    mesh_dim_latency: List[float]
+
+    @staticmethod
+    @lru_cache(None)
+    def build_from_mesh(mesh: DeviceMesh) -> "MeshTopoInfo":
+        # Generate mesh topology info for intra-host/inter-host communication pattern
+        # Note that we make a bunch of simplifying assumptions:
+        # 1. the mesh is homogeneous and uses the GPU/NCCL model
+        # 2. the GPU arch is Ampere or Hopper
+        # 3. all collectives use ring-based algorithms for now
+        num_devices_per_host = _mesh_resources.num_devices_per_host(mesh.device_type)
+        # the base bw number (intra-node), GB/s
+        base_bw = 87.7
+        mesh_dim_bandwidth = [base_bw] * mesh.ndim
+        # the latency in terms of us (intra-node, nv-link)
+        mesh_dim_latency = [0.6] * mesh.ndim
+        mesh_dim_devices = [1] * mesh.ndim
+
+        total_num_devices = 1
+        for mesh_dim in reversed(range(mesh.ndim)):
+            num_devices = mesh.size(mesh_dim)
+            mesh_dim_devices[mesh_dim] = num_devices
+            total_num_devices *= num_devices
+            if total_num_devices > num_devices_per_host:
+                # magic number for inter-host communication bandwidth/latency factor
+                # This number assumes latest GPU arch, i.e. Ampere or Hopper
+                # TODO: see if we need to tweak this or offer a way for user
+                # to specify the bandwidths/latency
+                mesh_dim_bandwidth[mesh_dim] *= 0.22
+                # set to ethernet latency for inter-host
+                mesh_dim_latency[mesh_dim] = 2.7
+
+        return MeshTopoInfo(
+            mesh, mesh_dim_devices, mesh_dim_bandwidth, mesh_dim_latency
+        )
+
+
+def allgather_cost(bytes_gb: float, mesh_topo: MeshTopoInfo, mesh_dim: int) -> float:
+    num_devices_on_mesh_dim = mesh_topo.mesh_dim_devices[mesh_dim]
+    mesh_dim_bandwidth = mesh_topo.mesh_dim_bandwidth[mesh_dim]
+    num_hops = num_devices_on_mesh_dim - 1
+    # base latency + comm latency
+    latency = 6.6 + num_hops * mesh_topo.mesh_dim_latency[mesh_dim]  # us
+    bw = (bytes_gb * num_hops / num_devices_on_mesh_dim) / mesh_dim_bandwidth  # s
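+    # Worked example with illustrative numbers (not from the original source):
+    # gathering 1 GB across 4 intra-host devices gives num_hops = 3,
+    # latency = 6.6 + 3 * 0.6 = 8.4 us, bw = (1 * 3 / 4) / 87.7 ~= 0.00855 s,
+    # so the returned cost is roughly 8.4 + 8551.9 ~= 8560 us.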
+    return latency + bw * 1e6  # rescale to us
+
+
+def allreduce_cost(bytes_gb: float, mesh_topo: MeshTopoInfo, mesh_dim: int) -> float:
+    num_devices_on_mesh_dim = mesh_topo.mesh_dim_devices[mesh_dim]
+    mesh_dim_bandwidth = mesh_topo.mesh_dim_bandwidth[mesh_dim]
+    # allreduce has almost 2x the comm bytes compared to allgather/reduce_scatter
+    num_hops = 2 * num_devices_on_mesh_dim - 1
+
+    latency = 6.6 + num_hops * mesh_topo.mesh_dim_latency[mesh_dim]
+    bw = (bytes_gb * num_hops / num_devices_on_mesh_dim) / mesh_dim_bandwidth
+    return latency + bw * 1e6
+
+
+def reduce_scatter_cost(
+    bytes_gb: float,
+    mesh_topo: MeshTopoInfo,
+    mesh_dim: int,
+) -> float:
+    num_devices_on_mesh_dim = mesh_topo.mesh_dim_devices[mesh_dim]
+    mesh_dim_bandwidth = mesh_topo.mesh_dim_bandwidth[mesh_dim]
+    num_hops = num_devices_on_mesh_dim - 1
+    # base latency + comm latency
+    latency = 6.6 + num_hops * mesh_topo.mesh_dim_latency[mesh_dim]
+    bw = (bytes_gb * num_hops / num_devices_on_mesh_dim) / mesh_dim_bandwidth
+    return latency + bw * 1e6
+
+
+def redistribute_cost(
+    current_spec: "placement_types.DTensorSpec",
+    target_spec: "placement_types.DTensorSpec",
+) -> float:
+    """
+    This function returns the estimated cost of redistributing from the current to the target DTensorSpec.
+
+    NOTE:
+    1. Only communication cost is considered here, since the computation cost of a
+       redistribute is quite trivial (i.e., only narrowing or a simple division).
+    2. Only redistribution on the same mesh is considered; cross-mesh communication
+       cost is not needed for operator strategy estimation/selection.
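+
+    Example (an illustrative sketch, not part of the upstream docstring): redistributing
+    a Shard(0) spec to Replicate on a single mesh dimension is costed as one allgather
+    of the full (unsharded) tensor bytes on that dimension.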
+    """
+    if current_spec.mesh != target_spec.mesh:
+        # return infinite cost if the meshes are not the same
+        # TODO: see if we want to support this once there's cross mesh communication
+        return float("inf")
+
+    if current_spec.is_replicated():
+        # short-cut:
+        # comm cost is 0 if current spec is already full replication
+        return 0.0
+
+    mesh_topo = MeshTopoInfo.build_from_mesh(current_spec.mesh)
+    cost = 0.0
+    comm_bytes_gb = (
+        spec_to_bytes(current_spec) / current_spec.num_shards / 1024 / 1024 / 1024
+    )
+    # Transformations considered for the redistribute cost:
+    # 1. allgather 2. alltoall
+    # 3. allreduce 4. reduce_scatter
+    for i, (current, target) in enumerate(
+        zip(current_spec.placements, target_spec.placements)
+    ):
+        if current == target:
+            continue
+
+        num_devices_on_mesh_dim = mesh_topo.mesh_dim_devices[i]
+        if current.is_shard() and target.is_replicate():
+            # allgather gives larger comm bytes
+            comm_bytes_gb *= num_devices_on_mesh_dim
+            # add up allgather comm cost
+            cost += allgather_cost(comm_bytes_gb, mesh_topo, i)
+        elif current.is_shard() and target.is_shard():
+            # this should be an alltoall comm; since we haven't implemented it yet,
+            # add a penalty to favor allgather instead
+            cost += allgather_cost(comm_bytes_gb, mesh_topo, i) + 1.0
+        elif current.is_partial() and target.is_replicate():
+            # add up allreduce comm cost
+            cost += allreduce_cost(comm_bytes_gb, mesh_topo, i)
+        elif current.is_partial() and target.is_shard():
+            # add up reduce_scatter comm cost
+            cost += reduce_scatter_cost(comm_bytes_gb, mesh_topo, i)
+            # after reduce_scatter, the comm bytes for further collectives shrink by the mesh-dim size.
+            comm_bytes_gb /= num_devices_on_mesh_dim
+        elif current.is_shard() and target.is_partial():
+            # ban shard -> partial as it does not make sense to perform
+            # this redistribute
+            return float("inf")
+
+    return cost
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/_utils.py b/MLPY/Lib/site-packages/torch/distributed/_tensor/_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0ee7b188f1fc34fdbcfd8c4c0fc015c99886362
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tensor/_utils.py
@@ -0,0 +1,204 @@
+from typing import cast, List, Sequence, Tuple
+
+import torch
+import torch.distributed._tensor.api as dtensor
+from torch._prims_common import ShapeType
+from torch.distributed._tensor.placement_types import (
+    _Partial,
+    DTensorSpec,
+    Placement,
+    Replicate,
+    Shard,
+)
+from torch.distributed.device_mesh import DeviceMesh
+
+
+# TODO: audit existing code base to see if we can safely remove this API.
+def compute_local_shape(
+    global_shape: ShapeType, mesh: DeviceMesh, placements: Sequence[Placement]
+) -> Tuple[int, ...]:
+    """
+    Compute the shape of a local shard of the given DTensor on its current
+    coordinate of the mesh.
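+
+    Example (an illustrative sketch, not part of the upstream docstring): a global shape
+    of (8, 4) on a 1-D mesh of 4 devices with placements [Shard(0)] gives a local shape
+    of (2, 4) on every rank.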
+    """
+    my_coordinate = mesh.get_coordinate()
+
+    if my_coordinate is None:
+        # if rank not in the mesh, return empty shape
+        return (0,)
+    else:
+        local_shape = list(global_shape)  # start with global shape
+        ndim = len(global_shape)
+        for idx, placement in enumerate(placements):
+            mesh_dim_size = mesh.size(idx)
+            if isinstance(placement, Shard):
+                shard_dim = placement.dim
+                assert (
+                    shard_dim < ndim
+                ), f"Sharding dim {shard_dim} greater than tensor ndim {ndim}"
+                local_shard_size, _ = placement._local_shard_size_on_dim(
+                    local_shape[shard_dim], mesh_dim_size, my_coordinate[idx]
+                )
+                assert isinstance(local_shard_size, int)
+                local_shape[shard_dim] = local_shard_size
+
+        return tuple(local_shape)
+
+
+def compute_local_shape_and_global_offset(
+    global_shape: ShapeType, mesh: DeviceMesh, placements: Sequence[Placement]
+) -> Tuple[Tuple[int, ...], Tuple[int, ...]]:
+    """
+    Compute the local tensor shape and the global offsets into the original tensor
+    of a DTensor on its current global rank. This is useful for checkpointing purposes.
+
+    Example (2 hosts with 4 GPUs each):
+    # Below is a DeviceMesh with mesh_shape of (2, 4)
+    mesh = DeviceMesh(device_type="cuda",
+                        mesh=[
+                        [0, 1, 2, 3],
+                        [4, 5, 6, 7]
+                        ],
+    )
+
+    Let's say we distribute a global_tensor of shape (8,4) over the above DeviceMesh
+    with placements of [Shard(0), Shard(0)].
+    The local shape and global offset will be as follows:
+    rank0 -- local_shape:[1, 4], global_offset:[0, 0]
+    rank1 -- local_shape:[1, 4], global_offset:[1, 0]
+    rank2 -- local_shape:[1, 4], global_offset:[2, 0]
+    rank5 -- local_shape:[1, 4], global_offset:[5, 0]
+    rank3 -- local_shape:[1, 4], global_offset:[3, 0]
+    rank4 -- local_shape:[1, 4], global_offset:[4, 0]
+    rank6 -- local_shape:[1, 4], global_offset:[6, 0]
+    rank7 -- local_shape:[1, 4], global_offset:[7, 0]
+
+    Let's say we distribute a global_tensor of shape (2,) over the above DeviceMesh with
+    placements of [Shard(0)]. Not all ranks will have a non-empty local tensor.
+    The local shape and global offset will be as follows:
+    rank0 -- local_shape:[1,], global_offset:[0,]
+    rank1 -- local_shape:[1,], global_offset:[1,]
+    rank2 -- local_shape:[0,], global_offset:[2,]
+    rank5 -- local_shape:[0,], global_offset:[2,]
+    rank3 -- local_shape:[0,], global_offset:[2,]
+    rank4 -- local_shape:[0,], global_offset:[2,]
+    rank6 -- local_shape:[0,], global_offset:[2,]
+    rank7 -- local_shape:[0,], global_offset:[2,]
+    """
+    my_coordinate = mesh.get_coordinate()
+
+    if my_coordinate is None:
+        # if rank not in the mesh, return empty offset
+        return ((), ())
+    else:
+        local_shape = list(global_shape)
+        global_offset = [0] * len(global_shape)
+
+        for idx, placement in enumerate(placements):
+            mesh_dim_size = mesh.size(idx)
+            if isinstance(placement, Shard):
+                shard_dim = placement.dim
+                local_offset = [0] * len(global_shape)
+                assert shard_dim < len(
+                    local_shape
+                ), f"Sharding dim {shard_dim} greater than tensor ndim {len(local_shape)}"
+                shard_size, shard_offset = placement._local_shard_size_on_dim(
+                    local_shape[shard_dim],
+                    mesh_dim_size,
+                    my_coordinate[idx],
+                    return_offset=True,
+                )
+
+                local_shape[shard_dim] = shard_size
+                local_offset[shard_dim] = shard_offset
+
+                # On a given dimension, if local_offset[shard_dim] is smaller than global_offset[shard_dim],
+                # this dimension has already been sharded by a previous placement.
+                # Therefore, we cannot simply replace global_offset[shard_dim] with local_offset[shard_dim].
+                # Instead, for the given shard_dim, we need to add local_offset[shard_dim] to the existing global_offset[shard_dim].
+                if global_offset[shard_dim] <= local_offset[shard_dim]:
+                    global_offset[shard_dim] = local_offset[shard_dim]
+                else:
+                    global_offset[shard_dim] += local_offset[shard_dim]
+
+        return tuple(local_shape), tuple(global_offset)
+
+
+def compute_global_tensor_info(
+    tensor: torch.Tensor, mesh: DeviceMesh, placements: Sequence[Placement]
+) -> Tuple[List[int], List[int]]:
+    """
+    Compute the global size and stride of a DTensor from the given local tensor.
+    The local size is multiplied by `world_size` per sharding dim.
+    The local stride is multiplied by `world_size` per sharding dim, for every other
+    dimension whose stride is at least as large as the stride of the sharding dim.
+
+    For example, suppose we have a local tensor with size (4, 8, 2) and stride (16, 1, 8).
+    If the DTensor placements are [Shard(2)] and the world_size is 2,
+    then the global size is (4, 8, 4) and stride is (16 * 2, 1, 8).
+
+    Args:
+        tensor (:class:`torch.Tensor`):
+            Local tensor which DTensor will be constructed from.
+        mesh (:class:`DeviceMesh`):
+            Object which describes the mesh topology
+            of devices for the DTensor.
+        placements (Sequence[:class:`Placement`]]):
+            The attribute of the DTensor that describes its layout
+            on the mesh topology.
+
+    Return:
+        tensor_shape: A List of int which specifies the size of the DTensor built
+            on top of the local tensor.
+        tensor_stride: A List of int which specifies the stride of DTensor.
+    """
+    tensor_shape = list(tensor.size())
+    tensor_stride = list(tensor.stride())
+    for idx, placement in enumerate(placements):
+        mesh_dim_size = mesh.size(idx)
+        if placement.is_shard():
+            shard_placement = cast(Shard, placement)
+            if shard_placement.dim < 0:
+                raise AssertionError(
+                    "Shard placements should have negative dims normalized in "
+                    f"the user-facing APIs: {shard_placement}"
+                )
+            shard_dim = shard_placement.dim
+
+            assert (
+                shard_dim < tensor.ndim
+            ), f"Sharding dim {shard_dim} greater than tensor ndim {tensor.ndim} for placement number {idx}."
+
+            local_dim_size = tensor_shape[shard_dim]
+            tensor_shape[shard_dim] = local_dim_size * mesh_dim_size
+
+            # recover the tensor stride by scaling every other stride that is at
+            # least as large as the current stride on the shard_dim
+            for i in range(len(tensor_stride)):
+                if i != shard_dim and tensor_stride[i] >= tensor_stride[shard_dim]:
+                    # rescale the stride by the shard size
+                    tensor_stride[i] = tensor_stride[i] * mesh_dim_size
+        elif not isinstance(placement, (Replicate, _Partial)):
+            raise RuntimeError(f"placement type {type(placement)} not supported!")
+    return tensor_shape, tensor_stride
+
+
+def try_find_mesh_from_args(
+    op_call: torch._ops.OpOverload, args: Sequence[object]
+) -> DeviceMesh:
+    """
+    Find the device mesh object from args.
+    Raises a ValueError if no mesh is found.
+    NOTE: we can optimize this search if needed
+    """
+    for arg in args:
+        if isinstance(arg, (dtensor.DTensor, DTensorSpec)):
+            return arg.device_mesh
+        elif (
+            isinstance(arg, (list, tuple))
+            and len(arg) > 0
+            and isinstance(arg[0], (dtensor.DTensor, DTensorSpec))
+        ):
+            return arg[0].device_mesh
+
+    raise ValueError(f"Cannot find device mesh from args for op : {op_call}.")
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/api.py b/MLPY/Lib/site-packages/torch/distributed/_tensor/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..feabc57d2fd75d7e90b9c32215f4f40bb4a07928
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tensor/api.py
@@ -0,0 +1,760 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+import inspect
+import warnings
+from typing import Any, Callable, cast, Optional, Sequence, Tuple
+
+import torch
+
+import torch.distributed._tensor.dispatch as op_dispatch
+import torch.distributed._tensor.random as random
+import torch.nn as nn
+from torch.distributed._tensor._collective_utils import mesh_broadcast
+from torch.distributed._tensor._utils import compute_global_tensor_info
+from torch.distributed._tensor.placement_types import (
+    DTensorSpec,
+    Placement,
+    Replicate,
+    Shard,
+    TensorMeta,
+)
+from torch.distributed._tensor.random import (
+    is_rng_supported_mesh,
+    OffsetBasedRNGTracker,
+)
+from torch.distributed._tensor.redistribute import (
+    Redistribute,
+    redistribute_local_tensor,
+)
+from torch.distributed.device_mesh import _mesh_resources, DeviceMesh
+
+
+__all__ = ["DTensor", "distribute_tensor", "distribute_module"]
+
+aten = torch.ops.aten
+
+
+# NOTE [Autograd interaction between torch.Tensor]
+#
+# The autograd functions defined below are being used by the public
+# facing APIs (i.e. from_local, to_local) to ensure our DTensor
+# works together with torch.Tensor within autograd engine. This
+# allows DistributedTensor to exist on part of the module hierarchy
+# and still be able to calculate gradients across the torch.Tensor and
+# DistributedTensor boundary.
+# As an example, we have a module that consists of submodules
+# A, B, and C, the execution flow would be like:
+#  input(torch.Tensor) -> Module A -> Module B -> Module C -> output (torch.Tensor)
+#
+# Suppose we only want to make Module B a sharded module with
+# DistributedTensor params; we would need to make the following
+# flow to work:
+#
+#  input(torch.Tensor) -> Module A
+#       -> DTensor input -> Sharded Module B -> DTensor output
+#           -> output (torch.Tensor) -> Module C -> output (torch.Tensor)
+#
+# We need the conversion from Module A to DTensor input, which is
+# `from_local`, and conversion from DTensor output to output, which
+# is `to_local`, thus these two functions must be Autograd functions.
+#
+class _ToTorchTensor(torch.autograd.Function):
+    @staticmethod
+    def forward(  # type: ignore[override]
+        ctx,
+        input: "DTensor",
+        grad_placements: Optional[Sequence[Placement]],
+    ):
+        ctx.dtensor_spec = input._spec
+        ctx.grad_placements = grad_placements
+        local_tensor = input._local_tensor
+
+        # We need to return a fresh Tensor object here as autograd metadata
+        # will be written into it in place, and we don't want to pollute the Tensor
+        # object stored in the _local_tensor of this DTensor.
+        return local_tensor.view_as(local_tensor)
+
+    @staticmethod
+    def backward(ctx, grad_output: torch.Tensor):  # type: ignore[override]
+        dtensor_spec = ctx.dtensor_spec
+        mesh = dtensor_spec.mesh
+        grad_placements = ctx.grad_placements
+        dtensor_meta = dtensor_spec.tensor_meta
+
+        _, tensor_stride = compute_global_tensor_info(
+            grad_output, mesh, dtensor_spec.placements
+        )
+        tensor_stride = tuple(tensor_stride)
+        grad_placements = grad_placements or dtensor_spec.placements
+
+        return (
+            DTensor(
+                grad_output,
+                mesh,
+                grad_placements,
+                shape=dtensor_meta.shape,
+                dtype=dtensor_meta.dtype,
+                requires_grad=grad_output.requires_grad,
+                stride=tensor_stride,
+            ),
+            None,
+        )
+
+
+class _FromTorchTensor(torch.autograd.Function):
+    @staticmethod
+    def forward(  # type: ignore[override]
+        ctx,  # pyre-ignore[2]: Parameter must be annotated.
+        input: torch.Tensor,
+        device_mesh: DeviceMesh,
+        placements: Tuple[Placement, ...],
+        run_check: bool,
+        shape: Optional[torch.Size] = None,
+        stride: Optional[Tuple[int, ...]] = None,
+    ) -> "DTensor":
+        ctx.previous_placement = placements
+        ctx.previous_device_mesh = device_mesh
+
+        if shape and stride:
+            tensor_shape, tensor_stride = shape, stride
+        elif not shape and not stride:
+            # if shape and stride are not provided, we assume the user is certain
+            # that each rank has the same tensor shape, and we just use that to
+            # calculate the global shape
+            global_shape, global_stride = compute_global_tensor_info(
+                input, device_mesh, placements
+            )
+            tensor_shape, tensor_stride = torch.Size(global_shape), tuple(global_stride)
+        else:
+            raise RuntimeError(
+                f"Found shape:{shape}, stride:{stride}.",
+                "Please pass both shape and stride at the same time.",
+            )
+
+        if device_mesh.get_coordinate() is None:
+            # if the global rank is not participating in the device mesh, we
+            # simply set the local tensor to an empty tensor
+            input = input.new_empty(0, requires_grad=input.requires_grad)
+        elif run_check:
+            # TODO: by default check tensor metas across rank
+            # TODO: See if we need to make this run_check logic
+            # have a corresponding backward.
+            for idx, placement in enumerate(placements):
+                if placement.is_replicate():
+                    # broadcast rank 0 tensor to all ranks
+                    # only broadcast if run_check is True
+                    input = input.contiguous()
+                    mesh_broadcast(input, device_mesh, mesh_dim=idx)
+
+        # We want a fresh Tensor object that shares memory with the input tensor
+        dist_tensor = DTensor(
+            input.view_as(input),
+            device_mesh,
+            placements,
+            shape=tensor_shape,
+            dtype=input.dtype,
+            # requires_grad of the dist tensor depends on if input
+            # requires_grad or not
+            requires_grad=input.requires_grad,
+            stride=tensor_stride,
+        )
+        return dist_tensor
+
+    @staticmethod
+    def backward(ctx, grad_output: "DTensor"):  # type: ignore[override]
+        previous_placement = ctx.previous_placement
+        previous_device_mesh = ctx.previous_device_mesh
+
+        # reshard to the placement when creating DistributedTensor
+        # so that the gradient layout matches, and we could return
+        # local gradients directly
+        if grad_output.placements != previous_placement:
+            current_spec = grad_output._spec
+            target_spec = DTensorSpec(
+                previous_device_mesh,
+                previous_placement,
+                tensor_meta=grad_output._spec.tensor_meta,
+            )
+            local_tensor = grad_output._local_tensor
+            output = redistribute_local_tensor(
+                local_tensor, current_spec, target_spec, is_backward=True
+            )
+            # TODO: return the redistributed local tensor directly without
+            # differentiable backward. See if this makes sense for all cases.
+            return output, None, None, None, None, None
+
+        # TODO: backward is also differentiable now, add a test
+        # to test higher level gradients.
+        return grad_output.to_local(), None, None, None, None, None
+
+
+class DTensor(torch.Tensor):  # pyre-ignore[13]: pyre is bad at __new__
+    _local_tensor: torch.Tensor
+    _spec: DTensorSpec
+    __slots__ = ["_local_tensor", "_spec"]
+
+    # class attribute that handles operator placements propagation
+    # rules, keyed by aten op name, value is propagation func
+    _op_dispatcher: op_dispatch.OpDispatcher = op_dispatch.OpDispatcher()
+
+    @staticmethod
+    def __new__(
+        cls,
+        local_tensor: torch.Tensor,
+        device_mesh: DeviceMesh,
+        placements: Tuple[Placement, ...],
+        *,
+        shape: torch.Size,
+        dtype: torch.dtype,
+        requires_grad: bool,
+        stride: Tuple[int, ...],
+    ) -> "DTensor":
+        """
+        Construct a DTensor from a local tensor, device mesh, and placements, plus
+        other tensor properties (i.e., shape, dtype, requires_grad, and stride).
+        Note: This is not a public API and it's only supposed to be used by the
+            operator implementations and internals. If you want to construct a
+            DTensor from a local tensor, consider using `DTensor.from_local`, if
+            you want to construct a DTensor from a "global" tensor (where you
+            already have tensor initialized and want to shard this tensor),
+            consider using `distribute_tensor`.
+        """
+        if local_tensor.requires_grad and not requires_grad:
+            warnings.warn(
+                "To construct DTensor from torch.Tensor, it's recommended to "
+                "use local_tensor.detach() and make requires_grad consistent."
+            )
+
+        # construct the wrapper tensor subclass from local_tensor and attach the
+        # placement spec; this does not do any actual distribution
+        r = torch.Tensor._make_wrapper_subclass(  # type: ignore[attr-defined]
+            cls,
+            shape,
+            strides=stride,
+            dtype=dtype,
+            device=local_tensor.device,
+            layout=local_tensor.layout,
+            requires_grad=requires_grad,
+        )
+
+        tensor_meta = TensorMeta(shape, stride, dtype)
+        # set the spec and the local tensor on the wrapper
+        r._spec = DTensorSpec(device_mesh, placements, tensor_meta=tensor_meta)
+        r._local_tensor = local_tensor
+        return r
+
+    # pyre-fixme[14]: `__repr__` overrides method defined in `DTensor` inconsistently.
+    # pyre-fixme[3]: Return type must be annotated.
+    def __repr__(self):
+        # TODO: consider all_gather the local tensors for better debugging
+        return f"DTensor(local_tensor={self._local_tensor}, device_mesh={self._spec.mesh}, placements={self._spec.placements})"
+
+    def __tensor_flatten__(self):
+        """
+        protocol to inform how to flatten a DTensor to local tensor
+        for PT2 tracing
+        """
+        return ["_local_tensor"], (self._spec, self.requires_grad)
+
+    @staticmethod
+    def __tensor_unflatten__(inner_tensors, flatten_spec, outer_size, outer_stride):
+        assert (
+            flatten_spec is not None
+        ), "Expecting spec to be not None from `__tensor_flatten__` return value!"
+        local_tensor = inner_tensors["_local_tensor"]
+        spec, requires_grad = flatten_spec
+        return DTensor(
+            local_tensor,
+            spec.mesh,
+            spec.placements,
+            shape=outer_size,
+            dtype=spec.tensor_meta.dtype,
+            requires_grad=requires_grad,
+            stride=outer_stride,
+        )
+
+    @classmethod
+    # pyre-fixme[3]: Return type must be annotated.
+    # pyre-fixme[2]: Parameter must be annotated.
+    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
+        return DTensor._op_dispatcher.dispatch(
+            func,
+            args,
+            kwargs or {},
+        )
+
+    @staticmethod
+    def from_local(
+        local_tensor: torch.Tensor,
+        device_mesh: Optional[DeviceMesh] = None,
+        placements: Optional[Sequence[Placement]] = None,
+        *,
+        run_check: bool = True,
+        shape: Optional[torch.Size] = None,
+        stride: Optional[Tuple[int, ...]] = None,
+    ) -> "DTensor":
+        """
+        Create a :class:`DTensor` from a local torch.Tensor on each rank
+        according to the `device_mesh` and `placements` specified.
+
+        Args:
+            local_tensor (torch.Tensor): local torch.Tensor on each rank.
+            device_mesh (:class:`DeviceMesh`, optional): DeviceMesh to place the
+                tensor, if not specified, must be called under a DeviceMesh
+                context manager, default: None
+            placements (List[:class:`Placement`], optional): the placements that
+                describes how to place the local torch.Tensor on DeviceMesh, must
+                have the same number of elements as `device_mesh.ndim`. If not
+                specified, we will by default replicate the tensor across the
+                `device_mesh` from the first rank of each dimension of the `device_mesh`.
+
+        Keyword args:
+            run_check (bool, optional): indicate whether to run checks across ranks
+                to verify meta information and data. If there is a :class:`Replicate`
+                placement in `placements`, the data on the first rank of that device
+                mesh dimension will be broadcast to the other ranks.
+            shape (torch.Size, optional): A list of int which specifies the size of the
+                DTensor built on top of `local_tensor`. Note this needs to be
+                provided if the shape of `local_tensor` differs across the ranks.
+                If not provided, `shape` will be computed assuming the given distributed
+                tensor is evenly sharded across ranks.
+            stride (tuple, optional): A List of int which specifies the stride of DTensor.
+                If not provided, `stride` will be computed assuming the given distributed
+                tensor is evenly sharded across ranks.
+
+        Returns:
+            A :class:`DTensor` object
+
+        .. note:: `from_local` is differentiable, the `requires_grad` of the created
+            `DTensor` object will depend on if `local_tensor` requires_grad or not.
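+
+        Example (an illustrative sketch, not part of the upstream docstring; it assumes
+        a 2-rank job with an initialized default process group and a 1-D mesh)::
+
+            >>> from torch.distributed.device_mesh import init_device_mesh
+            >>> mesh = init_device_mesh("cuda", (2,))
+            >>> local = torch.randn(4, 8)  # each rank supplies its own local shard
+            >>> dt = DTensor.from_local(local, mesh, [Shard(0)])
+            >>> dt.shape  # the logical shape concatenates the shards along dim 0
+            torch.Size([8, 8])
+            >>> dt.to_local().shape  # back to the rank-local shard
+            torch.Size([4, 8])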
+        """
+        # if shape/dtype are the same across ranks, there is no need to run_check;
+        # if not, we must allgather the metadata to check the size/dtype across ranks.
+        # There should be no data communication unless there's a replication
+        # strategy, where we broadcast the replicated data from the first rank
+        # in the mesh dimension
+        device_mesh = device_mesh or _mesh_resources.get_current_mesh()
+        device_type = device_mesh.device_type
+
+        # convert the local tensor to the desired device based on the device mesh's device_type
+        if device_type != local_tensor.device.type and not local_tensor.is_meta:
+            local_tensor = local_tensor.to(device_type)
+
+        # set default placements to replicated if not specified
+        if placements is None:
+            placements = [Replicate() for _ in range(device_mesh.ndim)]
+        else:
+            placements = list(placements)
+            for idx, placement in enumerate(placements):
+                # normalize shard dim to be positive
+                if placement.is_shard():
+                    placement = cast(Shard, placement)
+                    if placement.dim < 0:
+                        placements[idx] = Shard(placement.dim + local_tensor.ndim)
+
+        # `from_local` is differentiable, and the gradient of the dist tensor this function
+        # creates should flow back to the local_tensor, so we call an autograd
+        # function to construct the dist tensor instead.
+        return _FromTorchTensor.apply(  # pyre-ignore[16]: autograd func
+            local_tensor,
+            device_mesh,
+            tuple(placements),
+            run_check,
+            shape,
+            stride,
+        )
+
+    def to_local(
+        self, *, grad_placements: Optional[Sequence[Placement]] = None
+    ) -> torch.Tensor:
+        """
+        Get the local tensor of this DTensor on its current rank. For sharding it returns
+        a local shard of the logical tensor view; for replication it returns the replica on
+        its current rank.
+
+        Keyword args:
+            grad_placements (List[:class:`Placement`], optional): the placements that describe
+                the desired layout of the gradients of the Tensor returned from this
+                function.
+                `to_local` converts DTensor to a local tensor and the returned local tensor
+                might not be used with the original DTensor layout later in the code. This
+                argument is the hint that the user can give to autograd in case the gradient
+                layout of the returned tensor does not match the original DTensor layout.
+                If not specified, we will assume the gradient layout remains the same
+                as the original DTensor and use that for gradient computation.
+
+        Returns:
+            A :class:`torch.Tensor` or `AsyncCollectiveTensor` object. It represents the
+            local tensor on its current rank.
+
+        .. note:: `to_local` is differentiable, the `requires_grad` of the local tensor returned
+            will depend on if the `DTensor` requires_grad or not.
+        """
+        if grad_placements is not None and not isinstance(grad_placements, tuple):
+            grad_placements = tuple(grad_placements)
+        return _ToTorchTensor.apply(
+            self, grad_placements
+        )  # pyre-ignore[16]: autograd func
+
+    def redistribute(
+        self,
+        device_mesh: Optional[DeviceMesh] = None,
+        placements: Optional[Sequence[Placement]] = None,
+        *,
+        async_op: bool = False,
+    ) -> "DTensor":
+        """
+        `redistribute` performs the collective operations needed to redistribute the current
+        DTensor from its current placements to new placements, or from its current DeviceMesh
+        to a new DeviceMesh, i.e., we can turn a sharded DTensor into a replicated DTensor by
+        specifying a Replicate placement for each dimension of the DeviceMesh.
+
+        Args:
+            device_mesh (:class:`DeviceMesh`, optional): DeviceMesh to place the
+                DTensor, if not specified, must be called under a DeviceMesh
+                context manager, default: None
+            placements (List[:class:`Placement`], optional): the new placements that
+                describes how to place the DTensor into the DeviceMesh, must
+                have the same number of elements as `device_mesh.ndim`.
+
+        Keyword args:
+            async_op (bool, optional): whether to perform the DTensor redistribute operation
+                asynchronously or not. Default: False
+
+        Returns:
+            A :class:`DTensor` object
+
+        .. note:: `redistribute` is differentiable.
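+
+        Example (an illustrative sketch, not part of the upstream docstring; it assumes
+        ``dt`` is a DTensor sharded on dim 0 over a 1-D mesh)::
+
+            >>> replicated = dt.redistribute(placements=[Replicate()])
+            >>> sharded_again = replicated.redistribute(placements=[Shard(0)])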
+        """
+        # NOTE: This redistribute API currently only supports out-of-place
+        # redistribution, i.e., it always creates a new DTensor object and
+        # leaves the original one unchanged.
+
+        # if device_mesh is not specified, use the current device_mesh
+        device_mesh = device_mesh or self.device_mesh
+        # raise error if new placements not specified
+        if placements is None:
+            raise RuntimeError("placements is needed for redistribute!")
+
+        placements = list(placements)
+        for i, placement in enumerate(placements):
+            if placement.is_partial():
+                raise RuntimeError(
+                    "Can not redistribute to _Partial, _Partial is for internal use only!"
+                )
+            elif isinstance(placement, Shard) and placement.dim < 0:
+                # normalize shard dim to be positive
+                placements[i] = Shard(placement.dim + self.ndim)
+        placements = tuple(placements)
+
+        # Early return the original DTensor if the placements are the same.
+        if self._spec.placements == placements:
+            return self
+
+        # pyre-fixme[16]: `Redistribute` has no attribute `apply`.
+        return Redistribute.apply(self, device_mesh, placements, async_op)
+
+    def full_tensor(
+        self, *, grad_placements: Optional[Sequence[Placement]] = None
+    ) -> torch.Tensor:
+        """
+        Return the full tensor of this DTensor. It will perform the necessary collectives
+        to gather the local tensors from other ranks in its DeviceMesh and concatenate
+        them together. It is syntactic sugar for the following code:
+
+        `dtensor.redistribute(placements=[Replicate()] * mesh.ndim).to_local()`
+
+        Keyword args:
+            grad_placements (List[:class:`Placement`], optional): the placements that describe
+                the desired layout of the gradients of the full Tensor returned from this
+                function.
+                `full_tensor` converts DTensor to a full torch.Tensor and the returned torch.Tensor
+                might not be used with the original replicated DTensor layout later in the code. This
+                argument is the hint that the user can give to autograd in case the gradient
+                layout of the returned tensor does not match the original replicated DTensor layout.
+                If not specified, we will assume the gradient layout of the full tensor to be replicated.
+
+        Returns:
+            A :class:`torch.Tensor` object that represents the full tensor of this DTensor.
+
+        .. note:: `full_tensor` is differentiable.
+        """
+
+        redist_res = self.redistribute(
+            placements=[Replicate()] * self.device_mesh.ndim, async_op=False
+        )
+        return _ToTorchTensor.apply(redist_res, grad_placements)
+
+    @property
+    def device_mesh(self) -> DeviceMesh:
+        """
+        The :class:`DeviceMesh` attribute that associates with this DTensor object.
+
+        .. note:: device_mesh is a read-only property, it can not be set.
+        """
+        return self._spec.mesh
+
+    @property
+    def placements(self) -> Sequence[Placement]:
+        """
+        The placements attribute of this DTensor that describes the layout of this
+        DTensor on its DeviceMesh.
+
+        .. note:: placements is a read-only property, it can not be set.
+        """
+        return self._spec.placements
+
+
+def distribute_tensor(
+    tensor: torch.Tensor,
+    device_mesh: Optional[DeviceMesh] = None,
+    placements: Optional[Sequence[Placement]] = None,
+) -> DTensor:
+    """
+    Distribute a torch.Tensor to the `device_mesh` according to the `placements`
+    specified. The number of dimensions of `device_mesh` must match the length of `placements`.
+
+    Args:
+        tensor (torch.Tensor): torch.Tensor to be distributed. Note that if you
+            want to shard a tensor on a dimension whose size is not evenly divisible by
+            the number of devices in that mesh dimension, we use `torch.chunk`
+            semantics to shard the tensor and scatter the shards.
+        device_mesh (:class:`DeviceMesh`, optional): DeviceMesh to distribute the
+            tensor, if not specified, must be called under a DeviceMesh context
+            manager, default: None
+        placements (List[:class:`Placement`], optional): the placements that
+            describes how to place the tensor on DeviceMesh, must have the same
+            number of elements as `device_mesh.ndim`. If not specified, we will
+            by default replicate the tensor across the `device_mesh` from the
+            first rank of each dimension of the `device_mesh`.
+
+    Returns:
+        A :class:`DTensor` or `XLAShardedTensor` object.
+
+    Note:
+        When the DeviceMesh is initialized with the `xla` device_type, `distribute_tensor`
+        returns an `XLAShardedTensor` instead. See [link](https://github.com/pytorch/pytorch/issues/92909)
+        for more details. The XLA integration is experimental and subject to change.
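+
+    Example (an illustrative sketch, not part of the upstream docstring; it assumes
+    a 4-rank job with an initialized default process group)::
+
+        >>> from torch.distributed._tensor import Shard, distribute_tensor
+        >>> from torch.distributed.device_mesh import init_device_mesh
+        >>> mesh = init_device_mesh("cuda", (4,))
+        >>> big = torch.randn(16, 16)
+        >>> # each rank ends up holding a (4, 16) local shard of `big`
+        >>> dt = distribute_tensor(big, mesh, placements=[Shard(0)])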
+    """
+
+    torch._C._log_api_usage_once("torch.dtensor.distribute_tensor")
+
+    # get default device mesh if there's nothing specified
+    device_mesh = device_mesh or _mesh_resources.get_current_mesh()
+    device_type = device_mesh.device_type
+    if device_type == "xla":
+        try:
+            # call PyTorch/XLA SPMD for `xla` backend type device mesh.
+            # This returns XLAShardedTensor
+            from torch_xla.distributed.spmd import (  # type:ignore[import]
+                xla_distribute_tensor,
+            )
+
+            return xla_distribute_tensor(
+                tensor, device_mesh, placements
+            )  # type:ignore[return-value]
+        except ImportError as e:
+            msg = "To use DTensor API with xla, you must install the torch_xla package!"
+            raise ImportError(msg) from e
+
+    # instantiate an RNG tracker if we haven't already. By default DTensor uses an
+    # OffsetBasedRNGTracker to perform random operators.
+    # TODO: the value assignment to global variable is not the ideal solution
+    # we can replace it in future.
+    if is_rng_supported_mesh(device_mesh) and not random._rng_tracker:
+        random._rng_tracker = OffsetBasedRNGTracker(device_type)
+
+    if not tensor.is_leaf:
+        raise RuntimeError(
+            "`distribute_tensor` should be used to distribute leaf tensors! but found non-leaf tensor!"
+        )
+
+    # convert tensor to the corresponding device type if it's not in that device type
+    if device_type != tensor.device.type and not tensor.is_meta:
+        tensor = tensor.to(device_type)
+
+    # set default placements to replicated if not specified
+    if placements is None:
+        placements = [Replicate() for _ in range(device_mesh.ndim)]
+
+    if len(placements) != device_mesh.ndim:
+        raise ValueError(
+            f"`placements` must have the same length as `device_mesh.ndim`! "
+            f"Found placements length: {len(placements)}, and device_mesh.ndim: {device_mesh.ndim}."
+        )
+    if isinstance(tensor, DTensor):
+        # if the tensor is already a DTensor, we just need to check if the
+        # device mesh and placements are the same
+        if tensor.device_mesh != device_mesh:
+            raise ValueError(
+                f"Cannot distribute a DTensor with device mesh {tensor.device_mesh} "
+                f"to a different device mesh {device_mesh}."
+            )
+        if tensor.placements != tuple(placements):
+            raise ValueError(
+                f"Cannot distribute a DTensor with placements {tensor.placements} "
+                f"to a different placements {placements}. do you want to call "
+                f"`redistribute` instead?"
+            )
+        return tensor
+
+    local_tensor = tensor
+
+    # distribute the tensor according to the placements.
+    placements = list(placements)
+    for idx, placement in enumerate(placements):
+        if placement.is_shard():
+            placement = cast(Shard, placement)
+            if placement.dim < 0:
+                # normalize shard placement dim
+                placement = Shard(placement.dim + tensor.ndim)
+                placements[idx] = placement
+            local_tensor = placement._shard_tensor(local_tensor, device_mesh, idx)
+        elif placement.is_replicate():
+            placement = cast(Replicate, placement)
+            local_tensor = placement._replicate_tensor(local_tensor, device_mesh, idx)
+        else:
+            raise RuntimeError(
+                f"Trying to distribute tensor with unsupported placements {placement} on device mesh dimension {idx}!"
+            )
+    placements = tuple(placements)
+
+    assert local_tensor is not None, "distributing a tensor should not be None"
+    # detach the local tensor passed to DTensor since after the construction
+    # of DTensor, autograd would work on top of DTensor instead of local tensor
+    return DTensor(
+        local_tensor.detach().requires_grad_(tensor.requires_grad),
+        device_mesh,
+        placements,
+        shape=tensor.size(),
+        dtype=tensor.dtype,
+        requires_grad=tensor.requires_grad,
+        stride=tensor.stride(),
+    )
+
+
+def distribute_module(
+    module: nn.Module,
+    device_mesh: Optional[DeviceMesh] = None,
+    partition_fn: Optional[Callable[[str, nn.Module, DeviceMesh], None]] = None,
+    input_fn: Optional[Callable[[nn.Module, Any, DeviceMesh], None]] = None,
+    output_fn: Optional[Callable[[nn.Module, Any, DeviceMesh], None]] = None,
+) -> nn.Module:
+    """
+    This function converts all module parameters to :class:`DTensor` parameters
+    according to the `partition_fn` specified. It can also control the input or
+    output of the module by specifying the `input_fn` and `output_fn` (i.e. convert
+    the input to :class:`DTensor`, convert the output back to torch.Tensor).
+
+    Args:
+        module (:class:`nn.Module`): user module to be partitioned.
+        device_mesh (:class:`DeviceMesh`): the device mesh to place the module.
+        partition_fn (Callable): the function to partition parameters (i.e. shard certain
+            parameters across the `device_mesh`). If `partition_fn` is not specified,
+            by default we replicate all module parameters of `module` across the mesh.
+        input_fn (Callable): specify the input distribution, i.e. it can control how the
+            input of the module is sharded. `input_fn` will be installed as a module
+            `forward_pre_hook` (pre forward hook).
+        output_fn (Callable): specify the output distribution, i.e. it can control how the
+            output is sharded, or convert it back to torch.Tensor. `output_fn` will be
+            installed as a module `forward_hook` (post forward hook).
+
+    Returns:
+        A module that contains parameters/buffers that are all `DTensor`s.
+
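+    Example (an illustrative sketch, not part of the original docstring; the mesh
+    shape and ``MyModule`` are placeholders)::
+
+        mesh = init_device_mesh("cuda", (4,))
+
+        def shard_params(name, module, device_mesh):
+            # a simple partition_fn: shard every nn.Linear parameter on dim 0
+            if isinstance(module, nn.Linear):
+                for pname, param in module.named_parameters():
+                    dist_param = nn.Parameter(
+                        distribute_tensor(param, device_mesh, [Shard(0)])
+                    )
+                    module.register_parameter(pname, dist_param)
+
+        sharded_module = distribute_module(MyModule(), mesh, partition_fn=shard_params)
+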
+    Note:
+        When initializing the DeviceMesh with the `xla` device_type, `distribute_module`
+        returns an nn.Module with PyTorch/XLA SPMD annotated parameters. See [link](https://github.com/pytorch/pytorch/issues/92909)
+        for more details. The XLA integration is experimental and subject to change.
+    """
+
+    torch._C._log_api_usage_once("torch.dtensor.distribute_module")
+
+    device_mesh = device_mesh or _mesh_resources.get_current_mesh()
+    device_type = device_mesh.device_type
+    if device_type == "xla":
+        try:
+            # This function annotates all module parameters for auto-partitioning with
+            # PyTorch/XLA SPMD, or explicitly partitions them into :class:`XLAShardedTensor`
+            # parameters according to the `partition_fn` specified.
+            from torch_xla.distributed.spmd import (  # type:ignore[import]
+                xla_distribute_module,
+            )
+
+            return xla_distribute_module(
+                module, device_mesh, partition_fn, input_fn, output_fn
+            )  # type:ignore[return-value]
+        except ImportError as e:
+            msg = "To use DTensor API with xla, you must install the torch_xla package!"
+            raise ImportError(msg) from e
+
+    def replicate_module_params_buffers(m: nn.Module, mesh: DeviceMesh) -> None:
+        # This function loops over the immediate module parameters and
+        # buffers and replicates all non-DTensor params/buffers to DTensor
+        # parameters/buffers, if they have not been partitioned in the
+        # partition_fn. We can't easily use `module._apply` here because
+        # we don't know what happened inside partition_fn; the user could
+        # do anything (e.g. install hooks), and we want to preserve those.
+        full_replicate = [Replicate()] * mesh.ndim
+        for key, param in m._parameters.items():
+            if param is not None and not isinstance(param, DTensor):
+                m.register_parameter(
+                    key,
+                    nn.Parameter(distribute_tensor(param.data, mesh, full_replicate)),
+                )
+        for key, buffer in m._buffers.items():
+            if buffer is not None and not isinstance(buffer, DTensor):
+                m._buffers[key] = distribute_tensor(buffer, mesh, full_replicate)
+
+    if partition_fn is None:
+        # if partition_fn not specified, we by default replicate
+        # all module params/buffers
+        for name, submod in module.named_modules():
+            replicate_module_params_buffers(submod, device_mesh)
+    else:
+        # apply partition_fn to submodules
+        for name, submod in module.named_modules():
+            partition_fn(name, submod, device_mesh)
+            replicate_module_params_buffers(submod, device_mesh)
+
+    # register input_fn as module forward pre hook
+    if input_fn is not None:
+        # check the input_fn signature
+        num_args = len(inspect.signature(input_fn).parameters)
+        if num_args == 2:
+            # input_fn only takes in inputs and device mesh
+            warnings.warn(
+                "Deprecating input_fn that takes two arguments (inputs, device_mesh), "
+                "please use input_fn that takes in (module, inputs, device_mesh) instead!",
+            )
+            module.register_forward_pre_hook(lambda _, inputs: input_fn(inputs, device_mesh))  # type: ignore[call-arg]
+        elif num_args == 3:
+            # input_fn takes in module, inputs, device mesh
+            module.register_forward_pre_hook(
+                lambda mod, inputs: input_fn(mod, inputs, device_mesh)
+            )
+        else:
+            raise ValueError(
+                f"input_fn should take in 3 arguments, but got {num_args} arguments!"
+            )
+    # register output_fn as module forward hook
+    if output_fn is not None:
+        num_args = len(inspect.signature(output_fn).parameters)
+        if num_args == 2:
+            # output_fn only takes in outputs and device mesh
+            warnings.warn(
+                "Deprecating output_fn that takes two arguments (outputs, device_mesh), "
+                "please use output_fn that takes in (module, outputs, device_mesh) instead!",
+            )
+            module.register_forward_hook(
+                lambda mod, inputs, outputs: output_fn(outputs, device_mesh)  # type: ignore[call-arg]
+            )
+        elif num_args == 3:
+            module.register_forward_hook(
+                lambda mod, inputs, outputs: output_fn(mod, outputs, device_mesh)
+            )
+        else:
+            raise ValueError(
+                f"output_fn should take in 3 arguments, but got {num_args} arguments!"
+            )
+
+    return module
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/debug/__init__.py b/MLPY/Lib/site-packages/torch/distributed/_tensor/debug/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e3cdb8683b076258ab805257e61e8fecd71f67f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tensor/debug/__init__.py
@@ -0,0 +1,14 @@
+from torch.distributed._tensor.api import DTensor
+
+from torch.distributed._tensor.debug.comm_mode import CommDebugMode
+
+
+def get_sharding_prop_cache_info():
+    """
+    Get the cache info for the sharding propagation cache, used for debugging
+    purposes only. This returns a named tuple showing the hits, misses, maxsize
+    and currsize of the sharding propagator cache.
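+
+    Example (illustrative; the field values below are placeholders)::
+
+        >>> get_sharding_prop_cache_info()  # doctest: +SKIP
+        CacheInfo(hits=12, misses=3, maxsize=None, currsize=3)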
+    """
+    return (
+        DTensor._op_dispatcher.sharding_propagator.propagate_op_sharding.cache_info()  # type:ignore[attr-defined]
+    )
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/debug/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tensor/debug/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b889ca5f95fd25df7ce14965cc051043eef1e609
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tensor/debug/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/debug/__pycache__/comm_mode.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tensor/debug/__pycache__/comm_mode.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cf25caf053dacd986616669fa8947db1ea16e7cb
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tensor/debug/__pycache__/comm_mode.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/debug/__pycache__/op_coverage.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tensor/debug/__pycache__/op_coverage.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..05207625ac97c44dc52c95cacb58b2a11dc05660
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tensor/debug/__pycache__/op_coverage.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/debug/__pycache__/visualize_sharding.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tensor/debug/__pycache__/visualize_sharding.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5df149e8441907e9f084f28ae13e88d368313d47
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tensor/debug/__pycache__/visualize_sharding.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/debug/comm_mode.py b/MLPY/Lib/site-packages/torch/distributed/_tensor/debug/comm_mode.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8adf0e338418717ac6c2436178ebe2f6463af69
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tensor/debug/comm_mode.py
@@ -0,0 +1,91 @@
+from collections import defaultdict
+from typing import Any, Dict
+
+import torch
+from torch.distributed._tensor.api import DTensor
+from torch.utils._python_dispatch import TorchDispatchMode
+
+
+funcol_native = torch.ops._c10d_functional
+funcol_py = torch.ops.c10d_functional
+
+NATIVE_TO_PY_MAPPING = {
+    funcol_native.all_gather_into_tensor: funcol_py.all_gather_into_tensor,
+    funcol_native.all_gather_into_tensor_coalesced: funcol_py.all_gather_into_tensor_coalesced,
+    funcol_native.all_reduce: funcol_py.all_reduce,
+    funcol_native.all_to_all_single: funcol_py.all_to_all_single,
+    funcol_native.broadcast: funcol_py.broadcast,
+    funcol_native.reduce_scatter_tensor: funcol_py.reduce_scatter_tensor,
+    funcol_native.reduce_scatter_tensor_coalesced: funcol_py.reduce_scatter_tensor_coalesced,
+}
+
+
+class CommDebugMode(TorchDispatchMode):
+    """
+    ``CommDebugMode`` is a context manager that counts the number of
+    functional collectives within its context. It does this using a
+    ``TorchDispatchMode``.
+
+    NOTE: this mode currently only works for functional collectives; the
+    distributed_c10d collectives are not supported yet.
+
+    Example usage
+
+    .. code-block:: python
+
+        mod = ...
+        comm_mode = CommDebugMode()
+        with comm_mode:
+            mod.sum().backward()
+
+    """
+
+    def __init__(self):
+        self.comm_counts: Dict[Any, int] = defaultdict(int)
+        self.comm_registry = set()
+        for native_op, py_op in NATIVE_TO_PY_MAPPING.items():
+            self.comm_registry.add(native_op)
+            self.comm_registry.add(py_op)
+
+    def get_total_counts(self) -> int:
+        return sum(self.comm_counts.values())
+
+    def get_comm_counts(self) -> Dict[Any, int]:
+        """Returns the communication counts as a dictionary.
+
+        Returns:
+            Dict[Any, int]: The communication counts as a dictionary.
+        """
+        return self.comm_counts
+
+    def __enter__(self):
+        self.comm_counts.clear()
+        super().__enter__()
+        return self
+
+    def __exit__(self, *args):
+        super().__exit__(*args)
+
+    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
+        # When running this mode with DTensor, ordinarily all modes will
+        # run **before** subclasses get a chance to run.
+        # Returning NotImplemented here gives us a chance to let DTensor
+        # run and desugar into comms ops, before CommDebugMode sees them.
+        if any(t == DTensor for t in types):
+            return NotImplemented
+        kwargs = kwargs if kwargs else {}
+        out = func(*args, **kwargs)
+        func_packet = func._overloadpacket
+        # We have many tests that use CommDebugMode to verify the occurrence of
+        # collectives. These tests do so by querying comm_counts with legacy
+        # funcol ops as key. For the purpose of native funcol migration, we
+        # need these tests to work for both legacy and native funcol. To avoid
+        # the need to modify all tests to accommodate the two implementations,
+        # we make CommDebugMode translate native funcol ops into legacy funcol
+        # ops until the migration finishes.
+        if func_packet in self.comm_registry:
+            if func_packet in NATIVE_TO_PY_MAPPING:
+                func_packet = NATIVE_TO_PY_MAPPING[func_packet]
+            self.comm_counts[func_packet] += 1
+
+        return out
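+
+
+# Illustrative note (not part of the original source): after exiting the context
+# manager, per-op counts can be read back keyed by the *legacy* functional
+# collective ops (native funcol ops are translated above), e.g.:
+#
+#     comm_mode = CommDebugMode()
+#     with comm_mode:
+#         mod.sum().backward()  # `mod` is a hypothetical sharded module
+#     n_all_reduce = comm_mode.get_comm_counts()[torch.ops.c10d_functional.all_reduce]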
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/debug/op_coverage.py b/MLPY/Lib/site-packages/torch/distributed/_tensor/debug/op_coverage.py
new file mode 100644
index 0000000000000000000000000000000000000000..a66eddcfecd9c99c29447d65a7680fd92bd5e902
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tensor/debug/op_coverage.py
@@ -0,0 +1,105 @@
+from operator import itemgetter
+from typing import List
+
+from functorch.compile import make_boxed_func
+
+import torch
+import torch.fx
+import torch.nn as nn
+from torch._functorch.compilers import aot_module
+from torch._inductor.decomposition import select_decomp_table
+from torch.distributed._tensor import DTensor
+
+
+inductor_decomps = select_decomp_table()
+
+graphs: List[torch.fx.GraphModule] = []
+
+
+def fwd_bwd_compiler(fx_g, _):
+    graphs.append(fx_g)
+    return make_boxed_func(fx_g)
+
+
+def get_inductor_decomp_graphs(model: nn.Module, args, kwargs):
+    """
+    Obtain forward and backward graphs of a model with inductor decompositions using tracing and aot_module.
+
+    Convenient util to get the fwd and bwd graphs of an arbitrary model
+    with inductor decompositions. Note that this simply does tracing with
+    aot_module and does not ensure correctness. This is useful to track
+    the ops needed in DTensor.
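+
+    Example (illustrative; ``model`` and ``inp`` are hypothetical)::
+
+        fwd_graph, bwd_graph = get_inductor_decomp_graphs(model, [inp], {})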
+    """
+    compiled_mod = aot_module(
+        model, fw_compiler=fwd_bwd_compiler, decompositions=inductor_decomps
+    )
+    output = compiled_mod(*args, **kwargs)
+
+    if output.ndim != 0:
+        # if output is not a scalar tensor, by default sum it in order to
+        # run backward
+        output = output.sum()
+
+    output.backward()
+
+    # one fwd, one bwd graph
+    assert len(graphs) == 2
+    return graphs
+
+
+def print_op_coverage_summary(model: nn.Module, args, kwargs, *, output_csv=False):
+    """
+    Util to print the operator coverage summary of a given model with tabulate.
+
+    Requires the tabulate module to be installed.
+    """
+    # python module required for summary
+    import csv
+
+    from tabulate import tabulate
+
+    fwd_graph, bwd_graph = get_inductor_decomp_graphs(model, args, kwargs)
+
+    op_counts = {}
+
+    for node in fwd_graph.graph.nodes:
+        if node.op == "call_function" and isinstance(
+            node.target, torch._ops.OpOverload
+        ):
+            if node.target not in op_counts:
+                op_counts[node.target] = 0
+
+            op_counts[node.target] += 1
+
+    for node in bwd_graph.graph.nodes:
+        if node.op == "call_function" and isinstance(
+            node.target, torch._ops.OpOverload
+        ):
+            if node.target not in op_counts:
+                op_counts[node.target] = 0
+
+            op_counts[node.target] += 1
+
+    op_infos = []
+
+    for op, count in op_counts.items():
+        supported = op in DTensor._op_dispatcher.sharding_propagator.op_to_rules
+        op_infos.append([op, str(op._schema), count, supported])
+
+    # sort the op info based on the total count index
+    count_idx = 2
+    op_infos.sort(key=itemgetter(count_idx), reverse=True)
+
+    headers = ["Operator", "Schema", "Total Count", "Supported"]
+    print(tabulate(op_infos, headers=headers))
+
+    if output_csv:
+        # Open a CSV file for writing
+        with open("op_summary.csv", "w", newline="") as csv_file:
+            # Create a CSV writer object
+            csv_writer = csv.writer(csv_file)
+
+            csv_writer.writerow(headers)
+            # Write each table row to the CSV file
+            for row in op_infos:
+                csv_writer.writerow(row)
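+
+
+# Illustrative usage sketch (not part of the original source); `model` and `inp`
+# are hypothetical:
+#
+#     print_op_coverage_summary(model, [inp], {}, output_csv=True)
+#     # prints the per-operator count table and also writes "op_summary.csv"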
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/debug/visualize_sharding.py b/MLPY/Lib/site-packages/torch/distributed/_tensor/debug/visualize_sharding.py
new file mode 100644
index 0000000000000000000000000000000000000000..69a775b1a0f256c9de6a27801b52d222f5b8c7da
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tensor/debug/visualize_sharding.py
@@ -0,0 +1,176 @@
+from typing import List, Sequence, Tuple
+
+import numpy as np
+
+from torch._prims_common import ShapeType
+from torch.distributed._tensor import DeviceMesh
+
+from torch.distributed._tensor.placement_types import Placement, Shard
+
+
+def _mesh_to_coordinate(mesh, device_type):
+    """
+    Given an n-dimensional device mesh, this function creates a map from each
+    device to its coordinate.
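+    For example (illustrative): a 2x2 mesh ``[[0, 1], [2, 3]]`` with device type
+    "cuda" maps "cuda:3" to ``[1, 1]``.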
+    """
+    # Convert the n-dimensional list to a NumPy array
+    np_mesh = np.array(mesh.mesh.tolist())
+
+    # Create a dictionary to map each value to its coordinate
+    device_to_coordinate_map = {}
+    for coord, value in np.ndenumerate(np_mesh):
+        # device is unique in device_mesh
+        device_to_coordinate_map[f"{device_type}:{str(value)}"] = list(coord)
+
+    return device_to_coordinate_map
+
+
+def _convert_offset_to_ranges(all_offsets):
+    """
+    Creating a table with the tabulate package is easier when we specify row and
+    column ranges, so this function converts offsets to such ranges.
+    """
+    converted_blocks = []
+
+    for offset in all_offsets:
+        shape, offset, value = offset
+
+        # Calculate row_range and column_range
+        row_range = (offset[0], offset[0] + shape[0] - 1)
+        column_range = (offset[1], offset[1] + shape[1] - 1)
+
+        # Convert value to string to match your desired format
+        converted_block = {
+            "row_range": row_range,
+            "column_range": column_range,
+            "value": str(value),
+        }
+        converted_blocks.append(converted_block)
+
+    return converted_blocks
+
+
+def _create_table(blocks):
+    """
+    Creates a tabulate table given row and column ranges with device name
+    """
+    try:
+        from tabulate import tabulate
+    except ImportError as e:
+        raise ImportError("tabulate package is required to visualize sharding") from e
+
+    # Extract unique row and column ranges
+    row_ranges = sorted({block["row_range"] for block in blocks})
+    col_ranges = sorted({block["column_range"] for block in blocks})
+
+    # Create a matrix initialized with empty strings
+    matrix = [["" for _ in col_ranges] for _ in row_ranges]
+
+    # Fill the matrix with values
+    for block in blocks:
+        row_index = row_ranges.index(block["row_range"])
+        col_index = col_ranges.index(block["column_range"])
+        if matrix[row_index][col_index] == "":
+            matrix[row_index][col_index] = block["value"]
+        else:
+            matrix[row_index][col_index] += ", " + block["value"]
+
+    # Prepare headers
+    row_headers = [f"Row {r[0]}-{r[1]}" for r in row_ranges]
+    col_headers = [f"Col {c[0]}-{c[1]}" for c in col_ranges]
+
+    return tabulate(matrix, headers=col_headers, showindex=row_headers)
+
+
+def compute_local_shape_and_global_offset(
+    global_shape: ShapeType,
+    mesh: DeviceMesh,
+    placements: Sequence[Placement],
+    my_coordinate: List[int],
+) -> Tuple[Tuple[int, ...], Tuple[int, ...]]:
+    """
+    Same as torch.distributed._tensor._utils.compute_local_shape_and_global_offset, but
+    with a custom my_coordinate input. This is a modified implementation used by visualize_sharding.
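+
+    Example (illustrative): for a global shape of ``(8, 8)`` on a 1-D mesh of
+    size 4 with ``placements=[Shard(0)]``, ``my_coordinate=[2]`` yields a local
+    shape of ``(2, 8)`` and a global offset of ``(4, 0)``.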
+    """
+
+    if my_coordinate is None:
+        # if rank not in the mesh, return empty offset
+        return ((), ())
+    else:
+        local_shape = list(global_shape)
+        global_offset = [0] * len(global_shape)
+
+        for idx, placement in enumerate(placements):
+            mesh_dim_size = mesh.size(idx)
+            if isinstance(placement, Shard):
+                shard_dim = placement.dim
+                local_offset = [0] * len(global_shape)
+                assert shard_dim < len(
+                    local_shape
+                ), f"Sharding dim {shard_dim} greater than tensor ndim {len(local_shape)}"
+                shard_size, shard_offset = placement._local_shard_size_on_dim(
+                    local_shape[shard_dim],
+                    mesh_dim_size,
+                    my_coordinate[idx],
+                    return_offset=True,
+                )
+
+                local_shape[shard_dim] = shard_size
+                local_offset[shard_dim] = shard_offset
+
+                # On a given dimension, if the local_offset[shard_dim] is smaller than global_offset[shard_dim],
+                # it means that this dimension has already been sharded by a previous placement.
+                # Therefore, we cannot simply replace the global_offset[shard_dim] with local_offset[shard_dim].
+                # Instead, for the given shard_dim, we need to add local_offset[shard_dim] to existing global_offset[shard_dim].
+                if global_offset[shard_dim] <= local_offset[shard_dim]:
+                    global_offset[shard_dim] = local_offset[shard_dim]
+                else:
+                    global_offset[shard_dim] += local_offset[shard_dim]
+
+        return tuple(local_shape), tuple(global_offset)
+
+
+def visualize_sharding(dtensor, header=""):
+    """
+    Visualizes the sharding of 1D or 2D DTensors.
+    Requires the tabulate package; install it with `pip install tabulate`.
+
+    Note: no sharding info will be printed for empty tensors.
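+
+    Example (illustrative; assumes ``dt`` is an existing 1D or 2D DTensor)::
+
+        visualize_sharding(dt, header="my_param")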
+    """
+    if dtensor.numel() == 0:  # we do not print for empty dtensors
+        return
+
+    if len(dtensor.shape) >= 3:
+        raise RuntimeError(
+            "visualize sharding is only implemented for 1D or 2D dtensor"
+        )
+    placements = dtensor.placements
+    device_mesh = dtensor.device_mesh
+    device_type = dtensor.device_mesh.device_type
+
+    if device_mesh.get_coordinate() is None:  # current rank is not in the mesh
+        return
+
+    # Only display the visualization once for each DTensor, on the rank whose
+    # coordinate is 0 on all dimensions. For example, if the mesh is a full mesh,
+    # we will only print on rank 0.
+    local_rank_zero_on_all_dim = all(
+        device_mesh.get_local_rank(mesh_dim=dim) == 0 for dim in range(device_mesh.ndim)
+    )
+    if not local_rank_zero_on_all_dim:
+        return
+
+    device_map = _mesh_to_coordinate(device_mesh, device_type)
+    all_offsets = []
+    for device in device_map:
+        local_shape, global_offset = compute_local_shape_and_global_offset(
+            dtensor.shape, device_mesh, placements, device_map[device]
+        )
+        all_offsets.append([local_shape, global_offset, device])
+
+    # Convert offsets to blocks with row_ranges for tabulate
+    blocks = _convert_offset_to_ranges(all_offsets)
+
+    # Print the table
+    print(header)
+    print(_create_table(blocks))
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/device_mesh.py b/MLPY/Lib/site-packages/torch/distributed/_tensor/device_mesh.py
new file mode 100644
index 0000000000000000000000000000000000000000..45c9c01ff186892d6b61097317d24cad9cc2c0cf
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tensor/device_mesh.py
@@ -0,0 +1,6 @@
+from torch.distributed.device_mesh import (  # noqa: F401
+    _get_device_handle,
+    _mesh_resources,
+    DeviceMesh,
+    init_device_mesh,
+)
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/dispatch.py b/MLPY/Lib/site-packages/torch/distributed/_tensor/dispatch.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7686dc5c2e762851b2f9a9dfdf5cfc5c2cd3267
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tensor/dispatch.py
@@ -0,0 +1,393 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+import functools
+import operator
+from typing import cast, Dict, List, Optional, Sequence, Tuple
+
+import torch
+
+import torch.distributed as dist
+import torch.distributed._tensor.api as dtensor
+import torch.distributed._tensor.random as random
+from torch.distributed._tensor._utils import try_find_mesh_from_args
+from torch.distributed._tensor.op_schema import (
+    _is_inplace_op,
+    _is_out_variant_op,
+    OpInfo,
+    OpSchema,
+    OutputSpecType,
+)
+from torch.distributed._tensor.placement_types import DTensorSpec, Replicate, TensorMeta
+from torch.distributed._tensor.random import is_rng_supported_mesh
+from torch.distributed._tensor.redistribute import redistribute_local_tensor
+from torch.distributed._tensor.sharding_prop import ShardingPropagator
+from torch.distributed._tensor.tp_conv import (
+    convolution_backward_handler,
+    convolution_handler,
+)
+from torch.distributed.device_mesh import DeviceMesh
+
+try:
+    from torch.utils import _cxx_pytree as pytree
+except ImportError:
+    from torch.utils import _pytree as pytree  # type: ignore[no-redef]
+
+aten = torch.ops.aten
+
+
+def decompose_handler(
+    op_call: torch._ops.OpOverload,
+    args: Tuple[object, ...],
+    kwargs: Dict[str, object],
+) -> object:
+    """
+    Decomposes an op into core ATen ops; this handler is mostly here
+    for inference-mode usage where the ops are not core ATen ops.
+    """
+    r = op_call.decompose(*args, **kwargs)
+    if r is not NotImplemented:
+        return r
+    else:
+        raise RuntimeError("Decomposition failed")
+
+
+def is_same_size_handler(
+    op_call: torch._ops.OpOverload,
+    args: Tuple[object, ...],
+    kwargs: Dict[str, object],
+) -> bool:
+    lhs = cast(torch.Tensor, args[0])
+    rhs = cast(torch.Tensor, args[1])
+    return lhs.shape == rhs.shape
+
+
+class OpDispatcher:
+    """
+    Op dispatching class that handles args/kwargs pre-processing (unwrapping), sharding
+    propagation, redistribution of local args, local compute, and post-processing (re-wrapping).
+    It also handles any op-specific logic if necessary.
+    """
+
+    def __init__(self) -> None:
+        self.sharding_propagator = ShardingPropagator()
+        self._random_ops = {
+            aten.native_dropout.default,
+            aten.normal_.default,
+            aten.rand_like.default,
+            aten.randn_like.default,
+            aten.randint_like.default,
+            aten.randint_like.low_dtype,
+            aten.randint_like.low_dtype_out,
+            aten.uniform_.default,
+            aten.bernoulli.default,
+            aten.bernoulli_.float,
+        }
+        self._custom_op_handlers = {
+            aten.linear.default: decompose_handler,
+            aten.is_same_size.default: is_same_size_handler,
+            aten.convolution.default: convolution_handler,
+            aten.convolution_backward.default: convolution_backward_handler,
+        }
+
+        # This flag is used internally to control whether we treat torch.Tensor (non-DTensor)
+        # arguments as implicitly replicated or throw an error to the user.
+        # NOTE: It is EXTREMELY UNSAFE to turn this flag on by default, so we intentionally leave
+        # it as False by default.
+        self._allow_implicit_replication = False
+
+    def dispatch(
+        self,
+        op_call: torch._ops.OpOverload,
+        args: Tuple[object, ...],
+        kwargs: Dict[str, object],
+    ) -> object:
+        """
+        Main dispatching logic
+        """
+        # operators that do not need to go through sharding propagation
+        if op_call in self._custom_op_handlers:
+            return self._custom_op_handlers[op_call](op_call, args, kwargs)  # type: ignore[operator]
+
+        # extract local tensors and sharding info into an OpInfo
+        op_info = self.unwrap_to_op_info(op_call, args, kwargs)
+
+        self.sharding_propagator.propagate(op_info)
+        output_sharding = op_info.output_sharding
+        assert output_sharding is not None, "output sharding should not be None"
+
+        mesh = op_info.mesh
+        if mesh.get_coordinate() is None:
+            # For a non-participating device, we do:
+            #   1. if the return type is scalar, set the local result to None.
+            #   The local results from all devices will then be all-gathered
+            #   and a reduce op will be performed on the list of results
+            #   with appropriate operators:
+            #       for bool type, we by default use AND to reduce;
+            #       we can extend for more ops if necessary.
+            #   2. if the return type is Tensor or List[Tensor], return empty
+            #   tensor(s) with correct dtype.
+            spec = output_sharding.output_spec
+            ret_list = op_info.schema.op._schema.returns
+
+            if spec is None:
+                # For a scalar return type, the non-participating device has None
+                # as its local result
+                local_results: object = None
+            else:
+
+                def default_tensor(spec: DTensorSpec) -> torch.Tensor:
+                    if spec.tensor_meta is not None:
+                        shape = spec.tensor_meta.shape
+                        dtype = spec.tensor_meta.dtype
+                        if len(shape) == 0:
+                            # scalar tensor
+                            return torch.zeros((), dtype=dtype)
+                        else:
+                            # non-scalar tensor
+                            return torch.tensor([], dtype=dtype)
+                    else:
+                        raise RuntimeError(f"{spec} has no tensor metadata.")
+
+                if isinstance(spec, DTensorSpec):
+                    # return a Tensor value
+                    local_results = default_tensor(spec)
+                elif isinstance(spec, Sequence):
+                    # return a List[Tensor] value
+                    local_results = [
+                        default_tensor(s) if s is not None else None for s in spec
+                    ]
+                    assert isinstance(local_results, List)
+                    if None in local_results:
+                        ret_type = str(ret_list[0].type)
+                        raise NotImplementedError(
+                            f"return type {ret_type} in DTensor op is not supported"
+                        )
+        else:
+            if output_sharding.needs_redistribute:
+                # compute locally with redistribute first if needed
+                assert output_sharding.schema_suggestions is not None
+                self.redistribute_local_args(
+                    op_info, output_sharding.schema_suggestions[0]
+                )
+
+            local_tensor_args = (
+                pytree.tree_unflatten(
+                    cast(List[object], op_info.local_args), op_info.args_tree_spec
+                )
+                if op_info.args_tree_spec
+                else op_info.local_args
+            )
+
+            # run local op computation with potentially modified args/kwargs
+            local_tensor_args = cast(Tuple[object, ...], local_tensor_args)
+            if op_call in self._random_ops and is_rng_supported_mesh(mesh):
+                if not random._rng_tracker:
+                    # Default to `OffsetBasedRNGTracker` if the parallelism API
+                    # did not already construct one
+                    random._rng_tracker = random.OffsetBasedRNGTracker(mesh.device_type)
+                # For DTensor random operator, run it within a distribute region
+                with random._rng_tracker._distribute_region(
+                    cast(dtensor.DTensor, args[0])._spec
+                ):
+                    local_results = op_call(*local_tensor_args, **op_info.local_kwargs)
+            else:
+                local_results = op_call(*local_tensor_args, **op_info.local_kwargs)
+
+        # communicate the result to all ranks for some operators that return scalar value
+        if output_sharding.output_spec is None:
+            if op_call == aten.equal.default:
+                obj_list = [None for _ in range(dist.get_world_size())]
+                dist.all_gather_object(obj_list, local_results)  # type: ignore[possibly-undefined]
+                obj_list = list(filter(lambda x: x is not None, obj_list))
+                # perform reduce on the collection with AND op
+                local_results = functools.reduce(operator.and_, obj_list, True)
+
+        if _is_inplace_op(op_call):
+            # inplace op should return self instead of re-wrapping
+            if output_sharding.output_spec is not None:
+                return args[0]
+            else:
+                return None
+        elif _is_out_variant_op(op_call):
+            # out variant could possibly have multiple out args (i.e. lu_unpack.out)
+            output_specs = (
+                (output_sharding.output_spec,)
+                if not isinstance(output_sharding.output_spec, tuple)
+                else output_sharding.output_spec
+            )
+            out_dts = []
+            spec_idx = 0
+            for argument in op_call._schema.arguments:
+                if argument.is_out:
+                    out_dt = cast(dtensor.DTensor, kwargs[argument.name])
+                    out_dt._spec = cast(DTensorSpec, output_specs[spec_idx])
+                    out_dts.append(out_dt)
+                    spec_idx += 1
+
+            assert len(out_dts) >= 1, "out variant should have at least one out arg"
+            return tuple(out_dts) if len(out_dts) > 1 else out_dts[0]
+        else:
+            return self.wrap(local_results, output_sharding.output_spec)  # type: ignore[possibly-undefined]
+
+    @staticmethod
+    def redistribute_local_args(
+        op_info: OpInfo,
+        suggested_input_schema: OpSchema,
+    ) -> None:
+        # NOTE: it's very rare that we need to reshard kwargs so we intentionally skip it
+
+        # TODO: the op schema should probably just remain flattened so that we can avoid this tree flatten
+        # Need to fix all the ops before doing this.
+        if op_info.args_tree_spec is not None:
+            flatten_args_schema_to_reshard = tuple(
+                pytree.tree_leaves(suggested_input_schema.args_schema)
+            )
+        else:
+            flatten_args_schema_to_reshard = suggested_input_schema.args_schema
+
+        new_local_args: List[object] = []
+        for i, arg_spec in enumerate(op_info.flat_args_schema):
+            reshard_arg_spec = flatten_args_schema_to_reshard[i]
+            if isinstance(arg_spec, DTensorSpec):
+                local_tensor = cast(torch.Tensor, op_info.local_args[i])
+                if arg_spec != reshard_arg_spec:
+                    resharded_local_tensor = redistribute_local_tensor(
+                        local_tensor, arg_spec, reshard_arg_spec
+                    )
+                    new_local_args.append(resharded_local_tensor)
+                else:
+                    new_local_args.append(local_tensor)
+            else:
+                new_local_args.append(reshard_arg_spec)
+
+        op_info.local_args = tuple(new_local_args)
+
+    def unwrap_to_op_info(
+        self,
+        op_call: torch._ops.OpOverload,
+        args: Tuple[object, ...],
+        kwargs: Dict[str, object],
+    ) -> OpInfo:
+        # get runtime schema to determine whether to use pytree to flatten inputs
+        runtime_schema_info = self.sharding_propagator.op_to_schema_info.get(
+            op_call, None
+        )
+
+        if runtime_schema_info is not None and runtime_schema_info.needs_pytree:
+            # flatten args/kwargs when necessary
+            tree_args, args_spec = pytree.tree_flatten(args)
+            args_list: Sequence[object] = tree_args
+        else:
+            args_list, args_spec = args, None
+
+        args_schema: List[object] = []
+        kwargs_schema: Dict[str, object] = {}
+        local_args: List[object] = []
+        local_kwargs: Dict[str, object] = {}
+        mesh: Optional[DeviceMesh] = None
+
+        for arg in args_list:
+            if isinstance(arg, dtensor.DTensor):
+                args_schema.append(arg._spec)
+                local_args.append(arg._local_tensor)
+                if mesh is not None:
+                    if mesh != arg.device_mesh:
+                        raise NotImplementedError(
+                            f"{op_call}: DTensor does not support cross-mesh operation yet!"
+                        )
+                else:
+                    mesh = arg.device_mesh
+            elif isinstance(arg, torch.Tensor):
+                if arg.ndim == 0 or self._allow_implicit_replication:
+                    mesh = mesh or try_find_mesh_from_args(op_call, args_list)
+                    # scalar tensor can be safely treated as replicated
+                    args_schema.append(
+                        DTensorSpec(
+                            mesh,
+                            (Replicate(),) * mesh.ndim,
+                            tensor_meta=TensorMeta(
+                                shape=arg.shape, stride=arg.stride(), dtype=arg.dtype
+                            ),
+                        )
+                    )
+                    local_args.append(arg)
+                else:
+                    raise RuntimeError(
+                        f"{op_call}: got mixed torch.Tensor and DTensor, need to convert all"
+                        " torch.Tensor to DTensor before calling distributed operators!"
+                    )
+            else:
+                args_schema.append(arg)
+                local_args.append(arg)
+
+        for k, v in kwargs.items():
+            if isinstance(v, dtensor.DTensor):
+                kwargs_schema[k] = v._spec
+                local_kwargs[k] = v._local_tensor
+                if mesh is not None:
+                    if mesh != v.device_mesh:
+                        raise NotImplementedError(
+                            f"{op_call}: DTensor does not support cross-mesh operation yet!"
+                        )
+                else:
+                    mesh = v.device_mesh
+            elif isinstance(v, torch.Tensor):
+                raise RuntimeError(
+                    f"{op_call}: got mixed torch.Tensor and DTensor, need to convert all"
+                    " torch.Tensor to DTensor before calling distributed operators!"
+                )
+            else:
+                kwargs_schema[k] = v
+                local_kwargs[k] = v
+
+        assert mesh is not None, f"found no DeviceMesh from dtensor args for {op_call}!"
+        op_info = OpInfo(
+            mesh,
+            OpSchema(
+                op_call,
+                pytree.tree_unflatten(args_schema, args_spec)
+                if args_spec
+                else tuple(args_schema),
+                kwargs_schema,
+                schema_info=runtime_schema_info,
+            ),
+            args_schema,
+            tuple(local_args),
+            local_kwargs,
+            args_spec,
+        )
+        return op_info
+
+    @staticmethod
+    def wrap(res: object, spec: OutputSpecType) -> object:
+        if isinstance(res, torch.Tensor):
+            if spec is not None:
+                assert isinstance(
+                    spec, DTensorSpec
+                ), f"output spec does not match with output! Expected DTensorSpec, got {spec}."
+                assert spec.tensor_meta is not None
+                return dtensor.DTensor(
+                    res,
+                    spec.mesh,
+                    spec.placements,
+                    shape=spec.tensor_meta.shape,
+                    dtype=spec.tensor_meta.dtype,
+                    requires_grad=res.requires_grad,
+                    stride=spec.tensor_meta.stride,
+                )
+            else:
+                # if output does not have a DTensorSpec due to specific ops, it must be a scalar tensor
+                assert res.ndim == 0, "output tensor should be scalar!"
+                return res
+        elif isinstance(res, (list, tuple)):
+            assert spec is not None and isinstance(
+                spec, (list, tuple)
+            ), f"output spec does not match with output! Expected list/tuple, got {spec}."
+            res_list = []
+            for e, s in zip(res, spec):
+                res_list.append(OpDispatcher.wrap(e, s))
+
+            return tuple(res_list) if isinstance(res, tuple) else res_list
+        else:
+            # if the res contains only non tensor values (i.e. int/float/none), we simply return it
+            # without rewrapping to DTensor.
+            return res
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/experimental/__init__.py b/MLPY/Lib/site-packages/torch/distributed/_tensor/experimental/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d488e14db16c07674d9652f410326ec9fe23fe9f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tensor/experimental/__init__.py
@@ -0,0 +1,12 @@
+from contextlib import contextmanager
+
+from torch.distributed._tensor.api import DTensor
+
+
+@contextmanager
+def implicit_replication():
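+    """
+    Experimental context manager that lets DTensor operators treat plain
+    ``torch.Tensor`` (non-DTensor) arguments as implicitly replicated instead
+    of raising an error; the flag is reset to ``False`` when the context exits.
+    """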
+    try:
+        DTensor._op_dispatcher._allow_implicit_replication = True
+        yield
+    finally:
+        DTensor._op_dispatcher._allow_implicit_replication = False
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/experimental/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tensor/experimental/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8d7102a38dab4967db55f50def645a2c80878b1f
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tensor/experimental/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/experimental/__pycache__/tp_transform.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tensor/experimental/__pycache__/tp_transform.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8c0212a12848abbacfedb1505583e87f8da5ea7f
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tensor/experimental/__pycache__/tp_transform.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/experimental/tp_transform.py b/MLPY/Lib/site-packages/torch/distributed/_tensor/experimental/tp_transform.py
new file mode 100644
index 0000000000000000000000000000000000000000..df100c4696341823e30dd131c07aa763e2c010e9
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tensor/experimental/tp_transform.py
@@ -0,0 +1,547 @@
+import copy
+import operator
+from typing import Any, cast, Dict, List, Optional, Sequence, Tuple
+
+import torch
+from torch._subclasses.fake_tensor import FakeTensor
+from torch.distributed._tensor import DeviceMesh, distribute_tensor, DTensor
+from torch.distributed._tensor.op_schema import (
+    DTensorSpec,
+    OpSchema,
+    OutputSharding,
+    OutputSpecType,
+    PlacementStrategy,
+)
+from torch.distributed._tensor.placement_types import (
+    Placement,
+    Replicate,
+    Shard,
+    TensorMeta,
+)
+from torch.distributed._tensor.redistribute import redistribute_local_tensor
+from torch.distributed.tensor.parallel.style import ColwiseParallel, ParallelStyle
+from torch.export import ExportedProgram
+from torch.export.exported_program import ExportGraphSignature
+from torch.fx import GraphModule
+from torch.fx.experimental.proxy_tensor import make_fx
+from torch.fx.node import Node
+from torch.fx.passes.infra.pass_base import PassBase, PassResult
+from torch.fx.passes.shape_prop import _extract_tensor_metadata
+from torch.utils import _pytree as pytree
+
+
+aten = torch.ops.aten
+
+
+def tensor_parallel_transformation(
+    exported_program: ExportedProgram,
+    rank: int,
+    world_size: int,
+    device_type: str,
+    parallel_strategies: Dict[str, ParallelStyle],
+) -> ExportedProgram:
+    """
+    The entry point function for performing graph transformations on an exported
+    program, turning a single-device graph into a tensor parallel graph.
+
+    .. warning::
+        This API is experimental and subject to change.
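+
+    Example (an illustrative sketch, not from the original docstring; ``model``,
+    ``example_input``, the module FQN and the mesh size are placeholders)::
+
+        ep = torch.export.export(model, (example_input,))
+        tp_ep = tensor_parallel_transformation(
+            ep,
+            rank=0,
+            world_size=8,
+            device_type="cuda",
+            parallel_strategies={"net.linear1": ColwiseParallel},
+        )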
+    """
+
+    gm = exported_program.graph_module
+    sig = copy.deepcopy(exported_program.graph_signature)
+    state_dict = copy.copy(exported_program.state_dict)
+
+    with gm._set_replace_hook(sig.get_replace_hook()):
+        res = TensorParallelTransformPass(
+            rank,
+            world_size,
+            device_type,
+            state_dict,
+            exported_program.graph_signature,
+            parallel_strategies,
+        )(gm)
+        assert res is not None
+        gm = res.graph_module
+
+    return exported_program._update(gm, sig, state_dict)
+
+
+class TensorParallelTransformPass(PassBase):
+    """
+    This pass is responsible for transforming a single-device graph into a tensor parallel
+    graph. It will mark the placement strategy of each node in the graph,
+    partition the graph into distributed graph, then shard the parameters/buffers accordingly.
+    """
+
+    def __init__(
+        self,
+        rank: int,
+        world_size: int,
+        device_type: str,
+        state_dict: Dict[str, torch.Tensor],
+        graph_signature: ExportGraphSignature,
+        parallel_strategies: Dict[str, ParallelStyle],
+    ) -> None:
+        super().__init__()
+        self.rank = rank
+        self.mesh = DeviceMesh(device_type, torch.arange(world_size))
+        self.state_dict: Dict[str, torch.Tensor] = state_dict
+        self.graph_signature = graph_signature
+        self.parallel_strategies = parallel_strategies
+
+    def call(self, graph_module) -> PassResult:
+        gm = copy.deepcopy(graph_module)
+
+        parameter_placements = _generate_parameter_and_buffer_placements(
+            list(self.state_dict.keys()), self.parallel_strategies
+        )
+        placement_strategies = _mark_sharding(
+            gm, self.graph_signature, self.mesh, parameter_placements
+        )
+        _partitioner(gm)
+        _shard_state_dict(
+            self.state_dict, placement_strategies, self.graph_signature, self.mesh
+        )
+        return PassResult(gm, True)
+
+
+def _generate_parameter_and_buffer_placements(
+    params_and_buffers: List[str],
+    parallel_strategies: Dict[str, ParallelStyle],
+) -> Dict[str, Placement]:
+    """
+    Build parameter placements based on the given parallel style of the linear layers.
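+
+    For example (illustrative): ``{"fc": ColwiseParallel}`` maps ``fc.weight`` to
+    ``Shard(0)`` and ``fc.bias`` (if present) to ``Shard(0)`` as well, while a
+    non-colwise style maps the weight to ``Shard(1)`` and the bias to ``Replicate()``.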
+    """
+    parameter_placements: Dict[str, Placement] = {}
+    for linear_fqn, parallel_style in parallel_strategies.items():
+        weight_fqn = f"{linear_fqn}.weight"
+        bias_fqn = f"{linear_fqn}.bias"
+        assert weight_fqn in params_and_buffers
+        parameter_placements[weight_fqn] = (
+            Shard(0) if parallel_style == ColwiseParallel else Shard(1)
+        )
+        if bias_fqn in params_and_buffers:
+            parameter_placements[bias_fqn] = (
+                Shard(0) if parallel_style == ColwiseParallel else Replicate()
+            )
+    return parameter_placements
+
+
+def _mark_tensor_parallel_shardings(
+    gm: GraphModule,
+    graph_signature: ExportGraphSignature,
+    mesh: DeviceMesh,
+    parameter_placements: Dict[str, Placement],
+) -> Dict[Node, PlacementStrategy]:
+    """
+    Mark the placement strategies of the parameter and buffer placeholder nodes.
+    """
+    placement_strategies: Dict[Node, PlacementStrategy] = {}
+    num_params_and_buffers = len(graph_signature.inputs_to_parameters) + len(
+        graph_signature.inputs_to_buffers
+    )
+    placeholder_idx: int = 0
+    for node in gm.graph.nodes:
+        if node.op == "placeholder":
+            if placeholder_idx < num_params_and_buffers:
+                fqn: str = _get_input_node_fqn(node.name, graph_signature)
+                placement: Placement = (
+                    parameter_placements[fqn]
+                    if fqn in parameter_placements
+                    else Replicate()
+                )
+                placement_strategies[node] = _create_placement_strategy(
+                    node,
+                    mesh,
+                    placements=(placement,),
+                )
+                placeholder_idx += 1
+            else:
+                placement_strategies[node] = _create_placement_strategy(
+                    node,
+                    mesh,
+                    placements=(Replicate(),),
+                )
+    return placement_strategies
+
+
+def _get_input_node_fqn(input_name: str, graph_signature: ExportGraphSignature) -> str:
+    """
+    Return the FQN of an input node.
+    """
+    if input_name in graph_signature.inputs_to_parameters:
+        return graph_signature.inputs_to_parameters[input_name]
+    elif input_name in graph_signature.inputs_to_buffers:
+        return graph_signature.inputs_to_buffers[input_name]
+    else:
+        raise ValueError(
+            f"{input_name} not found in inputs_to_parameters or inputs_to_buffers"
+        )
+
+
+def _mark_sharding(
+    gm: GraphModule,
+    graph_signature: ExportGraphSignature,
+    mesh: DeviceMesh,
+    parameter_placements: Dict[str, Placement],
+) -> Dict[Node, PlacementStrategy]:
+    """
+    Mark the sharding strategy for each node in the graph module.
+    """
+    placement_strategies: Dict[
+        Node, PlacementStrategy
+    ] = _mark_tensor_parallel_shardings(gm, graph_signature, mesh, parameter_placements)
+
+    for node in gm.graph.nodes:
+        if node.op == "placeholder":
+            if node not in placement_strategies:
+                placement_strategies[node] = _create_placement_strategy(
+                    node, mesh, placements=(Replicate(),)
+                )
+            node.meta["sharding"] = placement_strategies[node]
+        elif node.op == "call_function":
+            if node.target == operator.getitem:
+                input_nodes = node.all_input_nodes
+                assert (
+                    len(input_nodes) == 1
+                ), f"non-compute op only supports one input for now, found node: {node} with {len(node.args)} inputs"
+                arg_strategy = placement_strategies[input_nodes[0]]
+                placement_strategies[node] = _create_placement_strategy(
+                    node,
+                    mesh,
+                    placements=arg_strategy.output_spec.placements,
+                    input_specs=_get_input_node_specs(node, placement_strategies),
+                )
+                node.meta["sharding"] = placement_strategies[node]
+            else:
+                op_schema = _get_op_schema(node, placement_strategies)
+
+                # get DTensor specs for inputs and outputs
+                if (
+                    op_schema.op
+                    not in DTensor._op_dispatcher.sharding_propagator.op_strategy_funcs
+                    and op_schema.op
+                    not in DTensor._op_dispatcher.sharding_propagator.op_to_rules
+                ):
+                    # Mark all as replicated
+                    output_sharding = _generate_default_output_sharding(
+                        node,
+                        mesh,
+                        op_schema,
+                    )
+                else:
+                    output_sharding = DTensor._op_dispatcher.sharding_propagator.propagate_op_sharding(
+                        op_schema,
+                    )
+                placement_strategies[node] = PlacementStrategy(
+                    output_specs=_get_output_spec_from_output_sharding(output_sharding),
+                    input_specs=output_sharding.schema_suggestions[0].args_spec
+                    if output_sharding.schema_suggestions is not None
+                    else _get_input_node_specs(node, placement_strategies),
+                )
+                node.meta["sharding"] = placement_strategies[node]
+        elif node.op == "output":
+            node.meta["sharding"] = None
+        else:
+            raise RuntimeError(f"op code {node.op} not supported")
+    return placement_strategies
+
+
+def _get_output_spec_from_output_sharding(
+    output_sharding: OutputSharding,
+) -> DTensorSpec:
+    """
+    Util function to extract output spec from output sharding.
+    """
+    if isinstance(output_sharding.output_spec, DTensorSpec):
+        return output_sharding.output_spec
+    else:
+        # For ops that return multiple outputs, the outputs should have the same output spec
+        assert isinstance(output_sharding.output_spec, Sequence)
+        assert output_sharding.output_spec[0] is not None
+        output_sharding.output_spec[0].tensor_meta = None
+        return output_sharding.output_spec[0]
+
+
+def _create_placement_strategy(
+    node: Node,
+    mesh: DeviceMesh,
+    placements: Tuple[Placement, ...],
+    input_specs: Optional[Sequence[DTensorSpec]] = None,
+) -> PlacementStrategy:
+    """
+    Util function to construct a placement strategy for a given node.
+    """
+    placement = PlacementStrategy(
+        input_specs=input_specs,
+        output_specs=DTensorSpec(
+            mesh=mesh,
+            placements=placements,
+        ),
+    )
+    _populate_tensor_meta(node, placement.output_specs)
+    return placement
+
+
+def _populate_tensor_meta(node: Node, output_spec: OutputSpecType) -> None:
+    """
+    Util function to populate tensor meta of output_spec based on node metadata.
+    """
+    if isinstance(node.meta["val"], Sequence):
+        assert isinstance(output_spec, Sequence)
+        for spec, fake_tensor in zip(output_spec, node.meta["val"]):
+            assert spec is not None
+            spec.tensor_meta = TensorMeta(
+                shape=fake_tensor.shape,
+                stride=fake_tensor.stride(),
+                dtype=fake_tensor.dtype,
+            )
+    else:
+        assert isinstance(output_spec, DTensorSpec)
+        output_spec.tensor_meta = TensorMeta(
+            shape=node.meta["val"].shape,
+            stride=node.meta["val"].stride(),
+            dtype=node.meta["val"].dtype,
+        )
+
+
+def _generate_default_output_sharding(
+    node: Node,
+    mesh: DeviceMesh,
+    op_schema: OpSchema,
+) -> OutputSharding:
+    """
+    Util function to create a default output sharding that suggests Replicate placement for both args and outputs.
+    """
+
+    def update_arg_spec(arg_spec: DTensorSpec) -> DTensorSpec:
+        return DTensorSpec(
+            mesh=arg_spec.mesh,
+            placements=(Replicate(),),
+            tensor_meta=arg_spec.tensor_meta,
+        )
+
+    new_op_schema = OpSchema(
+        op=op_schema.op,
+        args_schema=pytree.tree_map_only(
+            DTensorSpec, update_arg_spec, op_schema.args_schema
+        ),
+        kwargs_schema=op_schema.kwargs_schema,
+    )
+
+    def create_output_spec(tensor: FakeTensor) -> DTensorSpec:
+        return DTensorSpec(
+            mesh=mesh,
+            placements=(Replicate(),),
+            tensor_meta=TensorMeta(
+                shape=tensor.shape,
+                stride=tensor.stride(),
+                dtype=tensor.dtype,
+            ),
+        )
+
+    return OutputSharding(
+        output_spec=pytree.tree_map_only(
+            FakeTensor, create_output_spec, node.meta["val"]
+        ),
+        schema_suggestions=[new_op_schema],
+        failed_reason=f"{node.op} does not have sharding strategy registered",
+        needs_redistribute=True,
+    )
+
+
+def _partitioner(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
+    """
+    Graph partitioner that partitions the single-device graph
+    into a distributed graph
+    """
+    for node in gm.graph.nodes:
+        node_sharding = node.meta["sharding"]
+        if node.op == "placeholder":
+            out_spec = node_sharding.output_spec
+            local_val = _partition_val(node.meta["val"], out_spec)
+            # update node value
+            node.meta["val"] = local_val
+        elif node.op == "call_function":
+            out_spec = node_sharding.output_spec
+            # check if there's misaligned sharding, insert reshard if there is
+            expected_input_specs = node_sharding.input_specs
+            for idx, input_arg in enumerate(node.all_input_nodes):
+                input_arg_sharding = input_arg.meta["sharding"]
+                input_arg_spec = input_arg_sharding.output_spec
+                desired_spec = (
+                    out_spec
+                    if expected_input_specs is None
+                    else expected_input_specs[idx]
+                )
+                if input_arg_spec != desired_spec:
+                    _insert_reshard_gm(
+                        gm, node, input_arg, input_arg_spec, desired_spec
+                    )
+            # convert output val to its local component
+            output_val = node.meta["val"]
+            node.meta["val"] = _partition_val(output_val, out_spec)
+        elif node.op == "output":
+            for input_arg in node.all_input_nodes:
+                # input args of output should be Replicate, otherwise redistribution is needed.
+                input_args_to_check: Sequence[Node] = (
+                    input_arg if isinstance(input_arg, Sequence) else [input_arg]
+                )
+                for arg in input_args_to_check:
+                    arg_sharding = arg.meta["sharding"]
+                    arg_spec = arg_sharding.output_spec
+                    desired_spec = copy.copy(arg_spec)
+                    desired_spec.placements = (Replicate(),)
+                    if arg_spec != desired_spec:
+                        _insert_reshard_gm(gm, node, arg, arg_spec, desired_spec)
+        else:
+            raise RuntimeError(f"op code {node} not supported")
+
+    _clean_up_graph_metadata(gm)
+    gm.graph.lint()
+    gm.recompile()
+    return gm
+
+
+def _partition_val(val: Any, spec: DTensorSpec) -> Any:
+    """
+    util function to convert a full tensor val to its local component
+    """
+    if isinstance(val, torch.Tensor):
+        local_shard = val
+        if val.ndim == 0:
+            # If it's already a scalar tensor, it is already local; we don't
+            # need to do anything
+            return local_shard
+
+        for idx, placement in enumerate(spec.placements):
+            if placement.is_shard():
+                placement = cast(Shard, placement)
+                num_chunks = spec.mesh.size(mesh_dim=idx)
+                my_coord = spec.mesh.get_coordinate()
+                assert my_coord is not None, "current rank not in mesh!"
+                my_coord_on_mesh_dim = my_coord[idx]
+                local_shard = placement._split_tensor(
+                    local_shard, num_chunks, with_padding=False, contiguous=True
+                )[0][my_coord_on_mesh_dim]
+        return local_shard
+    elif isinstance(val, (list, tuple)):
+        return val.__class__(_partition_val(v, spec) for v in val)
+    else:
+        raise RuntimeError(f"val type {type(val)} not supported")
+
+
+def _insert_reshard_gm(
+    gm: torch.fx.GraphModule,
+    node: Node,
+    input_arg: Node,
+    input_arg_spec: DTensorSpec,
+    desired_spec: DTensorSpec,
+) -> None:
+    """
+    Transform the graph for tensor redistribution.
+    """
+    input_arg_spec.tensor_meta = input_arg.meta["tensor_meta"]
+    desired_spec.tensor_meta = input_arg.meta["tensor_meta"]
+    input_arg_tensor = input_arg.meta["val"]
+
+    # insert reshard operation
+    def reshard_fn(local_tensor: torch.Tensor) -> torch.Tensor:
+        return redistribute_local_tensor(
+            local_tensor,
+            input_arg_spec,
+            desired_spec,
+        )
+
+    reshard_gm = make_fx(reshard_fn)(input_arg_tensor)
+    reshard_gm_nodes = list(reshard_gm.graph.nodes)
+    input_node = reshard_gm_nodes[0]
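+    # the first node of the traced reshard graph is its placeholder; map it to the
+    # existing input node so the copied subgraph consumes the original value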
+    with gm.graph.inserting_before(node):
+        output_node = gm.graph.graph_copy(
+            reshard_gm.graph,
+            val_map={
+                input_node: input_arg,
+            },
+        )
+    node.replace_input_with(input_arg, output_node)
+
+
+def _clean_up_graph_metadata(gm: torch.fx.GraphModule) -> None:
+    """
+    Clean up the graph by removing sharding and partitioning related metadata
+    """
+    for node in gm.graph.nodes:
+        if "sharding" in node.meta:
+            del node.meta["sharding"]
+        if "val" in node.meta and isinstance(node.meta["val"], torch.Tensor):
+            local_tensor_meta = _extract_tensor_metadata(node.meta["val"])
+            node.meta["tensor_meta"] = local_tensor_meta
+
+
+def _get_input_node_specs(
+    node: Node, placement_strategies: Dict[Node, PlacementStrategy]
+) -> Tuple[DTensorSpec, ...]:
+    """
+    Get the input specs of a node.
+    """
+    input_specs_list: List[DTensorSpec] = []
+    for input_arg in node.all_input_nodes:
+        if input_arg in placement_strategies:
+            output_spec = placement_strategies[input_arg].output_specs
+            assert isinstance(output_spec, DTensorSpec)
+            input_specs_list.append(output_spec)
+        else:
+            raise ValueError(f"{input_arg} does not have output_spec populated.")
+    return tuple(input_specs_list)
+
+
+def _get_op_schema(
+    node: Node, placement_strategies: Dict[Node, PlacementStrategy]
+) -> OpSchema:
+    """
+    Util function to construct the operator schema of a node.
+    """
+    args_schema_list = pytree.tree_map_only(
+        Node, lambda arg: placement_strategies[arg].output_specs, node.args
+    )
+    op_schema = OpSchema(
+        op=cast(torch._ops.OpOverload, node.target),
+        args_schema=tuple(args_schema_list),
+        kwargs_schema=cast(Dict[str, object], node.kwargs),
+    )
+    return op_schema
+
+
+def _shard_state_dict(
+    state_dict: Dict[str, torch.Tensor],
+    placement_strategies: Dict[Node, PlacementStrategy],
+    graph_signature: ExportGraphSignature,
+    mesh: DeviceMesh,
+) -> None:
+    """
+    Inplace partition the weights based on the placement strategy
+    """
+    for node, placement_strategy in placement_strategies.items():
+        if node.op != "placeholder":
+            continue
+        if node.name in graph_signature.inputs_to_parameters:
+            fqn = graph_signature.inputs_to_parameters[node.name]
+        elif node.name in graph_signature.inputs_to_buffers:
+            fqn = graph_signature.inputs_to_buffers[node.name]
+        else:
+            continue
+        assert fqn in state_dict, f"{fqn} not found in state dict: {state_dict.keys()}"
+
+        original_param = state_dict[fqn]
+        dtensor_param = distribute_tensor(
+            original_param,
+            mesh,
+            placement_strategy.output_spec.placements,
+        )
+        local_param = dtensor_param.to_local()
+        state_dict[fqn] = (
+            torch.nn.Parameter(local_param)
+            if isinstance(original_param, torch.nn.Parameter)
+            else local_param
+        )
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/op_schema.py b/MLPY/Lib/site-packages/torch/distributed/_tensor/op_schema.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4ff8c6db88de8b3a0fbbeb3bd1f0c97cec77f5e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tensor/op_schema.py
@@ -0,0 +1,427 @@
+from dataclasses import dataclass
+from functools import cached_property
+from typing import Dict, List, Optional, Sequence, Tuple, Union
+
+import torch
+from torch._ops import OpOverload
+from torch.distributed._tensor.placement_types import DTensorSpec
+from torch.distributed.device_mesh import DeviceMesh
+
+try:
+    from torch.utils._cxx_pytree import tree_map_only, TreeSpec
+except ImportError:
+    from torch.utils._pytree import (  # type: ignore[no-redef, assignment]
+        tree_map_only,
+        TreeSpec,
+    )
+
+
+# Common type aliases
+ArgsType = Tuple[object, ...]
+KwargsType = Dict[str, object]
+# ATen op schemas could have Tensor, Tuple[Tensor] and List[Tensor], so the output type
+# should cover the same set of possibilities.
+OutputSpecType = Optional[Union[DTensorSpec, Sequence[Optional[DTensorSpec]]]]
+
+
+def _rebuild_tensor_from_dtensor_meta(arg) -> object:
+    """
+    This is used to propagate tensor metadata; it must be called under fake mode.
+    """
+    assert arg.tensor_meta is not None, "DTensorSpec does not contain tensor_meta."
+    return torch.empty_strided(
+        arg.tensor_meta.shape,
+        arg.tensor_meta.stride,
+        dtype=arg.tensor_meta.dtype,
+    )
+
+
+def _is_inplace_op(op: OpOverload):
+    # simple analysis of the function schema to determine
+    # if this is an inplace variant; it might not
+    # be entirely correct, but it's good enough for now.
+    return op._schema.name[-1] == "_"
+
+
+def _is_out_variant_op(op: OpOverload):
+    # simple analysis of the function schema to determine
+    # if this is an out variant; it might not
+    # be entirely correct, but it's good enough for now.
+    return "out" in op._schema.overload_name
+
+
+def _pretty_print_spec(spec: object) -> str:
+    if spec is None:
+        return "None"
+    elif isinstance(spec, DTensorSpec):
+        return "".join([str(p) for p in spec.placements])
+    elif isinstance(spec, Sequence):
+        return "(" + ", ".join([_pretty_print_spec(s) for s in spec]) + ")"
+    else:
+        raise RuntimeError(f"Unknown spec type to print: spec={spec}")
+
+
+@dataclass
+class PlacementStrategy:
+    """
+    A placement strategy describes acceptable sharding placements of the output
+    and the tensor arguments of an operation.
+
+    note: when the op return value is a single DTensor object, output_specs is
+    DTensorSpec; when the return value is a tuple of Optional[DTensor],
+    output_specs is a tuple of Optional[DTensorSpec].
+    """
+
+    output_specs: Union[DTensorSpec, Tuple[Optional[DTensorSpec], ...]]
+    input_specs: Optional[Sequence[DTensorSpec]] = None
+
+    # redistribute costs for this op placement strategy
+    # we need a nested list to record the cost for each
+    # operand of this operator, and for each operand of
+    # this operator it might have multiple placement strategies
+    redistribute_cost: Optional[List[List[float]]] = None
+
+    @cached_property
+    def output_spec(self) -> DTensorSpec:
+        """
+        This function requires that the strategy have exactly one DTensorSpec as the
+        output spec. If the output_specs is a tuple, we throw an exception.
+        """
+        if isinstance(self.output_specs, DTensorSpec):
+            return self.output_specs
+        else:
+            raise ValueError(
+                f"function output_spec expects a single DTensorSpec but got: {self.output_specs}"
+            )
+
+    def input_spec(self, index: int = 0) -> DTensorSpec:
+        assert self.input_specs is not None, "input_specs of PlacementStrategy is None!"
+        assert len(self.input_specs) > index, (
+            f"Invalid index {index} for input_specs of length "
+            f"{len(self.input_specs)}: {self.input_specs}"
+        )
+        return self.input_specs[index]
+
+    def __str__(self) -> str:
+        input_specs_str = _pretty_print_spec(self.input_specs)
+        output_spec_str = _pretty_print_spec(self.output_specs)
+        return f"{input_specs_str} -> {output_spec_str}"
+
+
+class StrategyType:
+    """
+    Base class for op strategies. There are two StrategyTypes:
+        OpStrategy and TupleStrategy
+    """
+
+    pass
+
+
+class OpStrategy(StrategyType):
+    """
+    OpStrategy that consists of a list of placement strategies associated with the op
+    """
+
+    def __init__(self, strategies: List[PlacementStrategy]) -> None:
+        super().__init__()
+        self.strategies: List[PlacementStrategy] = strategies
+
+    def __str__(self) -> str:
+        strategy_list_str = ", ".join([str(strategy) for strategy in self.strategies])
+        mesh_shape = self.output_mesh_shape
+        return f"OpStrategy:[{strategy_list_str}] @ mesh: {mesh_shape}"
+
+    def max_num_shards(self) -> int:
+        """
+        Returns the max number of shards across all placement strategies
+        """
+        return max([strategy.output_spec.num_shards for strategy in self.strategies])
+
+    @property
+    def output_mesh_shape(self):
+        output_spec = self.strategies[0].output_specs
+        if isinstance(output_spec, DTensorSpec):
+            return output_spec.mesh.shape
+        else:
+            assert isinstance(
+                output_spec, tuple
+            ), "found no DTensorSpec in the OpStrategy!"
+            assert output_spec[0] is not None
+            return output_spec[0].mesh.shape
+
+    @property
+    def output_ndim(self):
+        return self.strategies[0].output_spec.ndim
+
+    @property
+    def output_shape(self):
+        return self.strategies[0].output_spec.shape
+
+
+class TupleStrategy(StrategyType):
+    """
+    TupleStrategy represents the case where the output strategy of an op is a tuple
+    of strategies, i.e. if the output of the op is a tuple or list of tensors with
+    possibly different placement strategies, we should return a TupleStrategy that
+    contains a tuple of OpStrategy, where each child represents the sharding strategy
+    of each element of the tuple/list of tensors the op returns.
+
+    NOTE: if the output of the op is a List[Tensor] and they share the same placement
+    strategy, then we should return a single OpStrategy instead of a TupleStrategy
+    """
+
+    def __init__(self, childs: Sequence[StrategyType]) -> None:
+        super().__init__()
+        self.childs: Sequence[StrategyType] = childs
+
+    def __str__(self) -> str:
+        child_strategies_str = ", ".join(
+            [f"{str(strat)}" for idx, strat in enumerate(self.childs)]
+        )
+        return f"TupleStrategy({child_strategies_str})"
+
+
+@dataclass
+class RuntimeSchemaInfo:
+    """
+    RuntimeSchemaInfo stores the operator schema related information for runtime (eager)
+    execution. This is mainly used for two ways: 1. to generate hash for args to determine
+    whether to re-run sharding prop or not 2. to determine if we need pytree
+    """
+
+    # This static_argnum records static arg "starting index" for ops that have non-tensor
+    # args/kwargs which would affect sharding propagation results. All args starting from
+    # this index would be hashed to our sharding cache.
+    # Note that only a few ops need this information, e.g. view, transpose, var.dim, etc.
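+    # e.g. a view-like op whose first arg is the tensor and whose remaining args are
+    # static sizes would typically be registered with static_argnum=1 (illustrative example)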
+    static_argnum: int = 100
+    # This static_kwargkey records static kwarg names which would affect sharding prop
+    static_kwargkey: Optional[List[str]] = None
+    # each op can decide if it wants to use pytree flatten/unflatten during operator
+    # eager execution. By default we don't do flatten/unflatten; we only do so if the
+    # op indicates it needs to. This is to accelerate eager performance.
+    needs_pytree: bool = False
+
+
+@dataclass
+class OpSchema:
+    """
+    OpSchema is a data class that describes an operator's input schema; it
+    includes the DTensorSpecs of the DTensor args and the non-tensor args/kwargs
+    (positional order preserved). It is mainly used by the dispatching logic below
+    to run things like sharding propagation.
+
+    NOTE: this should be used as a read only data class
+    TODO: make this a frozen dataclass
+
+    Args:
+        op: the operator overload we are intercepting
+        args_schema: contains the args, except that the DTensor args have been replaced
+            with their DTensorSpecs
+        kwargs_schema: contains the kwargs, except that the DTensor kwargs have been replaced
+            with their DTensorSpecs
+    """
+
+    op: OpOverload
+    args_schema: ArgsType
+    kwargs_schema: KwargsType
+
+    schema_info: Optional[RuntimeSchemaInfo] = None
+
+    @property
+    def args_spec(self) -> Tuple[DTensorSpec, ...]:
+        """
+        args_spec: Tuple[DTensorSpec, ...]: a clean tuple of arg specs with NO
+            non-DTensor positional arguments (i.e. int/float/tuple, etc.),
+            mainly used by sharding propagation to propagate the output spec
+        """
+        # filter out non-relevant values from args schema to get a clean spec list
+        # this would mainly be used by sharding propagation rules
+        return tuple(item for item in self.args_schema if isinstance(item, DTensorSpec))
+
+    def __repr__(self) -> str:
+        return (
+            f"OpSchema(op={self.op},"
+            f" args_schema={self.args_schema},"
+            f" kwargs_schema={self.kwargs_schema})"
+        )
+
+    def __str__(self) -> str:
+        args_sharding: List[str] = []
+        mesh_shape = None
+        for arg in self.args_schema:
+            if isinstance(arg, DTensorSpec):
+                args_sharding.append(str(arg))
+                mesh_shape = arg.mesh.shape
+            elif isinstance(arg, OpStrategy):
+                assert len(arg.strategies) == 1
+                args_sharding.append(_pretty_print_spec(arg.strategies[0].output_specs))
+                mesh_shape = arg.output_mesh_shape
+            elif isinstance(arg, TupleStrategy):
+                first_op_strtgy = arg.childs[0]
+                assert isinstance(first_op_strtgy, OpStrategy)
+                mesh_shape = first_op_strtgy.output_mesh_shape
+                args_sharding.append(str(arg))
+            else:
+                args_sharding.append(str(arg))
+        return f"Op(op={self.op}, args_sharding={', '.join(args_sharding)} @ mesh: {mesh_shape})"
+
+    def __post_init__(self) -> None:
+        has_symints = False
+        for a in self.args_schema:
+            if isinstance(a, DTensorSpec) and a.tensor_meta is not None:
+                if any(isinstance(s, torch.SymInt) for s in a.tensor_meta.shape):
+                    has_symints = True
+                    break
+        self.has_symints = has_symints
+
+    def arg_type_tensor_or_tensor_list_like(self, arg_idx: int) -> bool:
+        arg = self.args_schema[arg_idx]
+        is_tensor = isinstance(arg, DTensorSpec)
+        if is_tensor:
+            return True
+
+        if not isinstance(arg, list):
+            return False
+
+        return all(isinstance(e, DTensorSpec) or e is None for e in arg)
+
+    def return_type_tuple_tensor_like(self) -> bool:
+        # all dispatch ops could only return Tuple[Tensor] or have None/ints/floats
+        # in the tuple, but the first element must be a Tensor, so this check is enough
+        return_types = self.op._schema.returns
+        return len(return_types) > 1 and isinstance(
+            return_types[0].type, torch.TensorType
+        )
+
+    def return_type_tensor(self) -> bool:
+        return_types = self.op._schema.returns
+        # all dispatch ops only return Tensor or Tuple[Tensor] for tensor like
+        # return types, so this check is enough for tensor like types
+        return isinstance(return_types[0].type, torch.TensorType)
+
+    def __hash__(self) -> int:
+        # Only hash args and kwargs that op indicates to hash
+        if not self.schema_info:
+            static_argnum = len(self.args_schema)
+            static_kwargkey = None
+        else:
+            static_argnum = self.schema_info.static_argnum
+            static_kwargkey = self.schema_info.static_kwargkey
+
+        args_to_hash = tuple(
+            tuple(e) if isinstance(e, list) else e
+            for i, e in enumerate(self.args_schema)
+            if self.arg_type_tensor_or_tensor_list_like(i) or i >= static_argnum
+        )
+        if static_kwargkey is not None:
+            kwargs_to_hash = tuple(
+                self.kwargs_schema.get(k, None) for k in static_kwargkey
+            )
+            return hash((self.op, args_to_hash, kwargs_to_hash))
+        else:
+            return hash((self.op, args_to_hash))
+
+    def __eq__(self, other: object) -> bool:
+        # early return checks
+        if not isinstance(other, OpSchema):
+            return False
+
+        if self.op != other.op:
+            return False
+
+        if len(self.args_schema) != len(other.args_schema):
+            return False
+
+        # compare each element and early return if any of them is different
+        if not self.schema_info:
+            static_argnum = len(self.args_schema)
+            static_kwargkey = None
+        else:
+            static_argnum = self.schema_info.static_argnum
+            static_kwargkey = self.schema_info.static_kwargkey
+
+        for i, (self_arg, other_arg) in enumerate(
+            zip(self.args_schema, other.args_schema)
+        ):
+            if isinstance(self_arg, DTensorSpec) and self_arg != other_arg:
+                return False
+            elif i >= static_argnum and self_arg != other_arg:
+                return False
+
+        # check kwarg equality when there's a static kwarg key
+        if static_kwargkey:
+            for key in static_kwargkey:
+                if self.kwargs_schema.get(key, None) != other.kwargs_schema.get(
+                    key, None
+                ):
+                    return False
+
+        return True
+
+    def gen_fake_args(self) -> ArgsType:
+        """
+        gen_fake_args: generate fake args for the operator; this is mainly used
+            by sharding propagation rules to run the local tensor operator and
+            get the output spec.
+        """
+        return tree_map_only(
+            DTensorSpec, _rebuild_tensor_from_dtensor_meta, self.args_schema
+        )
+
+    def gen_fake_kwargs(self) -> KwargsType:
+        """
+        gen_fake_kwargs: generate fake kwargs for the operator; this is mainly used
+            by sharding propagation rules to run the local tensor operator and
+            get the output spec.
+        """
+        return tree_map_only(
+            DTensorSpec, _rebuild_tensor_from_dtensor_meta, self.kwargs_schema
+        )
+
+    def _inplace_rewrap_schema_suggestion(self, origin_schema: "OpSchema") -> None:
+        suggestion_args_spec = self.args_spec
+        new_arg_schema: List[object] = []
+        idx_of_args_spec = 0
+        for arg in origin_schema.args_schema:
+            if isinstance(arg, DTensorSpec):
+                new_arg_schema.append(suggestion_args_spec[idx_of_args_spec])
+                idx_of_args_spec += 1
+            else:
+                new_arg_schema.append(arg)
+        self.args_schema = tuple(new_arg_schema)
+        self.kwargs_schema = origin_schema.kwargs_schema
+
+
+@dataclass
+class OutputSharding:
+    """
+    OutputSharding is a data class used by the sharding propagation rules. It
+    sets the output_spec upon successful propagation; if propagation failed,
+    output_spec becomes None and the sharding propagation rules can give a list
+    of suggestions for how to reshard the inputs.
+
+    NOTE: the schema_suggestions generated by sharding propagation should be
+    exactly the same as the operator's OpSchema, except for the DTensorSpecs of the DTensor args.
+    """
+
+    output_spec: OutputSpecType
+    schema_suggestions: Optional[List[OpSchema]] = None
+    failed_reason: Optional[str] = None
+    needs_redistribute: bool = False
+
+
+@dataclass
+class OpInfo:
+    """
+    All runtime op execution info is packed here
+    """
+
+    mesh: DeviceMesh
+    schema: OpSchema
+    flat_args_schema: List[object]
+    local_args: Sequence[object]
+    local_kwargs: Dict[str, object]
+    args_tree_spec: Optional[TreeSpec] = None
+
+    # the output sharding info
+    output_sharding: Optional[OutputSharding] = None
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__init__.py b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9112fa0b61e8bf65a5fc7076fb38bf00c8fcc62f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__init__.py
@@ -0,0 +1,10 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+from .embedding_ops import *  # noqa: F403
+from .matrix_ops import *  # noqa: F403
+from .math_ops import *  # noqa: F403
+from .tensor_ops import *  # noqa: F403
+from .pointwise_ops import *  # noqa: F403
+from .random_ops import *  # noqa: F403
+from .view_ops import *  # noqa: F403
+from .conv_ops import *  # noqa: F403
+from .experimental_ops import *  # noqa: F403
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b7f704e9afe04142b176c5e9c34c3d0219ec93a6
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/basic_strategy.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/basic_strategy.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d8139bf72256c9a508f394cc5927329a42852b88
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/basic_strategy.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/common_rules.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/common_rules.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..28a276c5632895e6e5c3c6c72676cd81f09672af
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/common_rules.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/conv_ops.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/conv_ops.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f6bd98092828a1470515d2c253a3c21d6963d6b4
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/conv_ops.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/embedding_ops.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/embedding_ops.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7c73017724fb778dc83fc06cd91f75ab79adc392
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/embedding_ops.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/experimental_ops.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/experimental_ops.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0922ba345ac4389d1eabd0b88691e9f51b381ef0
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/experimental_ops.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/math_ops.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/math_ops.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ec85984ef9d3b488a6b8e5daa16f7b70a9c969f7
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/math_ops.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/matrix_ops.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/matrix_ops.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..365099bc3b6bfcefa6de29d6bad77ea00247ff92
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/matrix_ops.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/pointwise_ops.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/pointwise_ops.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..da315038137f27976d39e6270b77a90fc95bbc8a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/pointwise_ops.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/random_ops.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/random_ops.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..993c31f8da2f4c27211d87134034acea5d0a8562
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/random_ops.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/tensor_ops.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/tensor_ops.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..678e418acdf02dd422c2b68d9ebcf6a3a659bce9
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/tensor_ops.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..13e4fb2ff4643509626dc9b5c7e3469a2d29e237
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/view_ops.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/view_ops.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0ace9bf611ec98e31822368cba9d4b616fff9562
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/__pycache__/view_ops.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/basic_strategy.py b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/basic_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd3ea85fd3f1551669d8298090fc6e57bae1fbf8
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/basic_strategy.py
@@ -0,0 +1,184 @@
+import itertools
+from dataclasses import dataclass
+
+from typing import List, Tuple
+
+from torch.distributed._tensor.op_schema import OpStrategy, PlacementStrategy
+from torch.distributed._tensor.placement_types import (
+    _Partial,
+    DTensorSpec,
+    Placement,
+    Replicate,
+    Shard,
+)
+
+from torch.distributed.device_mesh import DeviceMesh
+
+
+@dataclass
+class EinsumDims:
+    contracting_dims: List[str]
+    batch_dims: List[str]
+    lhs_out_only_dims: List[str]
+    rhs_out_only_dims: List[str]
+
+    @classmethod
+    def parse_equation(cls, equation: str) -> Tuple[List[str], str]:
+        # parse einop equation and extract arg specs
+        """
+        Parse the einsum equation str to input dim chars and output dim char
+        """
+        inputs, outputs = equation.split("->")
+        input_dims, output_dims = inputs.split(","), outputs.split(",")
+
+        # NOTE: only support at most two inputs, and single output
+        # extend to support more inputs if needed in future
+        assert len(input_dims) <= 2, "Only support at most two inputs"
+        assert len(output_dims) == 1, "Only support single output"
+        output_dim = output_dims[0]
+        return input_dims, output_dim
+
+    @classmethod
+    def parse_dims(cls, input_dims: List[str], output_dim: str) -> "EinsumDims":
+        """
+        Parse the dims and extract the contracting, batch, and free dimensions
+        for the left and right hand sides.
+        """
+        dim_char_set = set()
+        for input_dim in input_dims:
+            for input_char in list(input_dim):
+                dim_char_set.add(input_char)
+
+        # get a deterministic order of all dim chars
+        all_dim_chars = sorted(dim_char_set)
+
+        # parse input and output dimensions
+        lhs_out_only_dims, rhs_out_only_dims = [], []
+        batch_dims, contracting_dims = [], []
+
+        for dim_char in all_dim_chars:
+            if dim_char not in output_dim:
+                contracting_dims.append(dim_char)
+            else:
+                is_batch_dim = True
+                for input_dim in input_dims:
+                    is_batch_dim = is_batch_dim and dim_char in input_dim
+
+                if is_batch_dim:
+                    batch_dims.append(dim_char)
+                else:
+                    assert (
+                        len(input_dims) == 2
+                    ), "free dimension only supported for two inputs!"
+                    lhs, rhs = input_dims
+                    if dim_char in lhs:
+                        lhs_out_only_dims.append(dim_char)
+                    elif dim_char in rhs:
+                        rhs_out_only_dims.append(dim_char)
+                    else:
+                        raise RuntimeError("Invalid dimension character")
+
+        return cls(
+            contracting_dims=contracting_dims,
+            batch_dims=batch_dims,
+            lhs_out_only_dims=lhs_out_only_dims,
+            rhs_out_only_dims=rhs_out_only_dims,
+        )
+
+
+def gen_einsum_strategies(
+    equation: str,
+    mesh: DeviceMesh,
+    *,
+    linearity: bool = False,
+) -> OpStrategy:
+    """
+    Generate a strategy list for the ops that follow einsum style notation.
+    """
+    # parse einop equation and extract dims
+    input_dims, output_dim = EinsumDims.parse_equation(equation)
+    edims = EinsumDims.parse_dims(input_dims, output_dim)
+
+    all_mesh_dim_strategies = []
+
+    # generate strategies for each mesh dim
+    for mesh_dim in range(mesh.ndim):
+        mesh_dim_strategies = []
+
+        # placement list stores placements of [output, input1, input2, ...]
+        # first we always have replicate all for inputs and output
+        placement_list: List[Placement] = [Replicate()] * (len(input_dims) + 1)
+        mesh_dim_strategies.append(placement_list)
+
+        if mesh.size(mesh_dim) <= 1:
+            # only replicate strategy for mesh dim with size 1
+            # TODO: see if this is valid for the submesh case
+            continue
+
+        # split batch dim
+        for batch_dim in edims.batch_dims:
+            output_batch_dim = output_dim.index(batch_dim)
+            placement_list = [Shard(output_batch_dim)]
+            for input_dim in input_dims:
+                input_batch_dim = input_dim.index(batch_dim)
+                placement_list.append(Shard(input_batch_dim))
+
+            mesh_dim_strategies.append(placement_list)
+
+        # split contracting dim
+        for contracting_dim in edims.contracting_dims:
+            placement_list = [_Partial()]
+            for input_dim in input_dims:
+                input_contracting_dim = input_dim.index(contracting_dim)
+                placement_list.append(Shard(input_contracting_dim))
+
+            mesh_dim_strategies.append(placement_list)
+
+        # split lhs free dim
+        for lhs_dim in edims.lhs_out_only_dims:
+            lhs_free_dim = output_dim.index(lhs_dim)
+            # this means split the lhs input and output
+            # i.e. S(0), R -> S(0)
+            lhs_placement_list: List[Placement] = [
+                Shard(lhs_free_dim),
+                Shard(lhs_free_dim),
+                Replicate(),
+            ]
+            mesh_dim_strategies.append(lhs_placement_list)
+
+        # split rhs free dim
+        for rhs_dim in edims.rhs_out_only_dims:
+            rhs_free_dim = output_dim.index(rhs_dim)
+            rhs_placement_list: List[Placement] = [
+                Shard(rhs_free_dim),
+                Replicate(),
+                Shard(rhs_free_dim),
+            ]
+            mesh_dim_strategies.append(rhs_placement_list)
+
+        # linearity strategy
+        if linearity:
+            linearity_placement_list: List[Placement] = [_Partial()]
+            for input_dim in input_dims:
+                linearity_placement_list.append(_Partial())
+            mesh_dim_strategies.append(linearity_placement_list)
+
+        all_mesh_dim_strategies.append(mesh_dim_strategies)
+
+    # generate strategies for entire mesh
+    strategy_combs = itertools.product(*all_mesh_dim_strategies)
+
+    # TODO: filter out invalid strategies. At this point we generate
+    # all possible strategies without considering whether the tensor
+    # dim could be sharded or not; we would need to filter out invalid
+    # strategies based on the actual tensor shape
+    # (i.e. for Shard, the tensor dim size must be > mesh size)
+    all_strategies = []
+    for strategy_comb in strategy_combs:
+        spec_list = []
+        for specs in zip(*strategy_comb):
+            spec_list.append(DTensorSpec(mesh, tuple(specs)))
+        strat = PlacementStrategy(output_specs=spec_list[0], input_specs=spec_list[1:])
+        all_strategies.append(strat)
+
+    return OpStrategy(all_strategies)
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/common_rules.py b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/common_rules.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4add50dee169646009c706316a531070570913a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/common_rules.py
@@ -0,0 +1,289 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+from typing import cast, Dict, List, Optional, Tuple
+
+import torch
+from torch.distributed._tensor._utils import compute_local_shape
+from torch.distributed._tensor.op_schema import (
+    _is_inplace_op,
+    _is_out_variant_op,
+    OpSchema,
+    OutputSharding,
+)
+from torch.distributed._tensor.ops.utils import prod
+from torch.distributed._tensor.placement_types import DTensorSpec, TensorMeta
+
+
+def _replace_char_in_str(string: str, new_char: str, idx: int) -> str:
+    return string[:idx] + new_char + string[idx + 1 :]
+
+
+def _gen_reshard_suggestions(
+    op_schema: OpSchema,
+    input_dims: List[str],
+    input_specs: Tuple[DTensorSpec, ...],
+    dim_to_sharding: Dict[str, int],
+    pending_sum: List[int],
+) -> OutputSharding:
+    suggested_arg_specs: List[DTensorSpec] = []
+    for input_dim, input_spec in zip(input_dims, input_specs):
+        dim_map = [dim_to_sharding[dim] for dim in input_dim]
+        suggested_arg_specs.append(
+            DTensorSpec.from_dim_map(
+                mesh=input_spec.mesh,
+                dim_map=dim_map,
+                sums=pending_sum,
+                tensor_meta=input_spec.tensor_meta,
+            )
+        )
+    suggested_schema = OpSchema(op_schema.op, tuple(suggested_arg_specs), {})
+    suggested_schema._inplace_rewrap_schema_suggestion(op_schema)
+    return OutputSharding(
+        None,
+        schema_suggestions=[suggested_schema],
+        failed_reason="Input placements op sharding propagation failed, need to reshard!",
+    )
+
+
+def einop_rule(
+    equation: str,
+    op_schema: OpSchema,
+    *,
+    linearity: bool = False,
+    enforce_sharding: Optional[Dict[str, int]] = None,
+) -> OutputSharding:
+    """
+    Propagate the sharding of inputs to output for ops whose data moves according to einsum notation.
+
+    This is mostly borrowed from @zdevito's sharding simulator. Examples:
+        mk,kn->mn - einsum
+        ij,ij->ij - addition
+        ij,j->ij - broadcasted addition
+        ij->i - reduction
+    Other ops could use this propagation algorithm when applicable; note
+    that einsum propagation only deals with lists of specs (DTensor specs)
+    as it only works on lists of tensors!
+
+    linearity in einop_rule means that the calling op `f` follows this rule:
+        f(a + b) = f(a) + f(b)
+
+    In this case we can propagate the partial sum; note that linearity in einop
+    only applies to partial sum, not other operations like min/max (which are
+    associative but not linear).
+    """
+    # parse einop equation and extract arg specs
+    inputs, outputs = equation.split("->")
+    input_dims, output_dims = inputs.split(","), outputs.split(",")
+    input_specs = op_schema.args_spec
+    # NOTE: only support single output unless needed in future
+    output_dim = output_dims[0]
+
+    dim_to_sharding: Dict[str, int] = {}
+    dim_to_size: Dict[str, int] = {}
+    # record pending sum, key is mesh dimension, value is pending sum
+    # counter across input specs
+    pending_sums_counter: Dict[int, int] = {}
+    seen_shardings: Dict[int, str] = {}
+    needs_reshard = False
+
+    def merge_sharding(dim: str, a: int, b: int) -> int:
+        # merge the sharding of inputs if they are mergeable, i.e. we can merge
+        # replicate and shard to shard, but this will trigger a reshard operation
+        if a != b:
+            if a == -1 or b == -1:
+                # reshard the replicate to match the sharded one
+                nonlocal needs_reshard
+                needs_reshard = True
+                return a if a != -1 else b
+            else:
+                # TODO: further merge the sharding properly (i.e. reshard one input to replicate)
+                raise RuntimeError(
+                    f"{equation}: dim {dim} sharded two different ways: {a} and {b}"
+                )
+        else:
+            return a
+
+    for input_dim, input_spec in zip(input_dims, input_specs):
+        # deal with partial sums
+        input_sums = input_spec.sums
+        for sum_dim in input_sums:
+            if sum_dim not in pending_sums_counter:
+                seen_shardings[sum_dim] = "+"
+            # update pending sum counter for pending sum mesh
+            # dimension with the occurrence from each input
+            pending_sums_counter[sum_dim] = pending_sums_counter.get(sum_dim, 0) + 1
+
+        for idx, (dim, mesh_dim) in enumerate(zip(input_dim, input_spec.dim_map)):
+            if enforce_sharding and dim in enforce_sharding:
+                if enforce_sharding[dim] != mesh_dim:
+                    needs_reshard = True
+                dim_to_sharding[dim] = enforce_sharding[dim]
+                dim_to_size[dim] = input_spec.shape[idx]
+            elif dim not in dim_to_sharding:
+                dim_to_sharding[dim] = mesh_dim
+                dim_to_size[dim] = input_spec.shape[idx]
+            else:
+                dim_to_sharding[dim] = merge_sharding(
+                    dim, dim_to_sharding[dim], mesh_dim
+                )
+                assert dim_to_size[dim] == input_spec.shape[idx]
+
+            # after merging the sharding, we check if there are multiple
+            # shardings on the same mesh dim.
+            merged_sharding_for_dim = dim_to_sharding[dim]
+            if merged_sharding_for_dim != -1:
+                if (
+                    merged_sharding_for_dim in seen_shardings
+                    and dim != seen_shardings[merged_sharding_for_dim]
+                ):
+                    needs_reshard = True
+                    seen_shardings[merged_sharding_for_dim] += dim
+                else:
+                    seen_shardings[merged_sharding_for_dim] = dim
+
+    if pending_sums_counter and not linearity:
+        # return a reshard suggestion with no pending sum; because we already properly
+        # merged the sharding, this reshard suggestion is legit to use
+        return _gen_reshard_suggestions(
+            op_schema, input_dims, input_specs, dim_to_sharding, []
+        )
+    else:
+        # It's an op that supports linearity, but not all input arguments are partial;
+        # we fail the sharding propagation with a suggestion to make all inputs
+        # partial on the corresponding mesh dim (all inputs should be partial for
+        # the mesh dims in order to execute locally and delay the sum reduction)
+        for value in pending_sums_counter.values():
+            if value != len(input_specs):
+                needs_reshard = True
+
+    for mesh_dim, dims in seen_shardings.items():
+        if len(dims) > 1:
+            # we found different input dims being sharded on the same mesh dim;
+            # in order to perform local op computation, we need to reshard inputs
+            # based on some simple heuristics. For now we simply pick the one with the
+            # least comm volume (i.e. the input with the least size)
+            # TODO: consider a more advanced heuristic to pick the best sharding
+            costs = []
+            for d in dims:
+                cost = 0
+                for input_dim, input_spec in zip(input_dims, input_specs):
+                    if (
+                        d in input_dim
+                        and input_spec.dim_map[input_dim.index(d)] == mesh_dim
+                    ):
+                        assert input_spec.tensor_meta is not None
+                        global_shape = input_spec.tensor_meta.shape
+                        local_shape = compute_local_shape(
+                            global_shape, input_spec.mesh, input_spec.placements
+                        )
+                        cost += prod(local_shape) * input_spec.mesh.size(mesh_dim)
+                costs.append(cost)
+            d_to_keep_sharding = dims[costs.index(max(costs))]
+            for d in dims:
+                # update dim_to_sharding to keep the sharding of the dim with the
+                # highest comm cost and make the rest of the dims replicate
+                if d != d_to_keep_sharding:
+                    dim_to_sharding[d] = -1
+
+    pending_sums = list(pending_sums_counter.keys())
+    if needs_reshard:
+        return _gen_reshard_suggestions(
+            op_schema, input_dims, input_specs, dim_to_sharding, pending_sums
+        )
+
+    # generate output pending sum if a dim is sharded, and it appears in input
+    # but not output
+    for dim, shard_on_mesh in dim_to_sharding.items():
+        if dim not in output_dims[0] and shard_on_mesh != -1:
+            pending_sums.append(shard_on_mesh)
+
+    # if no need to reshard, we directly generate the output sharding
+    output_dim_map = []
+    output_shape = []
+    for dim in output_dim:
+        if dim == "1":
+            # find output dim that is a singleton dimension, mark sharding and shape
+            output_dim_map.append(-1)
+            output_shape.append(1)
+        else:
+            output_dim_map.append(dim_to_sharding[dim])
+            output_shape.append(dim_to_size[dim])
+
+    # XXX: since we still need to have intermediate shape calculation, we need
+    # to pass in the shape here. We should remove this once sharding decomp works
+    # for ops like addmm
+    assert input_specs[0].tensor_meta is not None
+    tensor_meta = TensorMeta(
+        torch.Size(output_shape),
+        input_specs[0].tensor_meta.stride,
+        input_specs[0].tensor_meta.dtype,
+    )
+    return OutputSharding(
+        DTensorSpec.from_dim_map(
+            input_specs[0].mesh,
+            output_dim_map,
+            pending_sums,
+            tensor_meta=tensor_meta,
+        )
+    )
+
+
+def pointwise_rule(op_schema: OpSchema, linearity: bool = False) -> OutputSharding:
+    """
+    Propagate the sharding for pointwise operations.
+
+    Examples:
+        ij,ij->ij - addition/mul
+        ij,j->ij - broadcasted addition
+    """
+    alphabet = "abcdefghijklmnopqrstuvwxyz"
+    # find the max_dim first in case we need broadcasting
+    input_specs = op_schema.args_spec
+    max_dim = max(input.ndim for input in input_specs)
+    dimchars = []
+    singleton_counter: List[int] = [0] * max_dim
+    for input in input_specs:
+        start_dim = max_dim - input.ndim
+        p = alphabet[start_dim:max_dim]
+        # handle the "broadcasting to a common shape case"
+        # see https://pytorch.org/docs/stable/notes/broadcasting.html
+        # If any of the dimensions is singleton dimension (i.e. 1).
+        # we mark the dim char as a special "1" to distinguish with
+        # the non-singleton dimension, so that sharding propagation
+        # should just ignore the singleton dimension.
+        if len(input_specs) > 1:
+            for i in range(max_dim):
+                if i < start_dim:
+                    # treat the leading missing dim chars as singleton
+                    singleton_counter[i] += 1
+                elif input.shape[i - start_dim] == 1:
+                    # mark singleton dim char as a special "1" in einop rule
+                    singleton_counter[i] += 1
+                    p = _replace_char_in_str(p, "1", (i - start_dim))
+
+        dimchars.append(p)
+    out_dimchars = alphabet[:max_dim]
+    # check if we replaced the dim char of all inputs with the singleton dimension;
+    # if we replaced all inputs, we also need to replace the output dimension.
+    for output_dim_idx in range(len(out_dimchars)):
+        out_dimchar = out_dimchars[output_dim_idx]
+        if singleton_counter[output_dim_idx] == len(input_specs):
+            out_dimchars = _replace_char_in_str(out_dimchars, "1", output_dim_idx)
+
+    fmt = f"{','.join(p for p in dimchars)}->{out_dimchars}"
+
+    enforce_sharding: Dict[str, int] = {}
+    if _is_inplace_op(op_schema.op):
+        # inplace op should keep the input sharding it writes to
+        for out_dimchar, mesh_dim in zip(out_dimchars, input_specs[0].dim_map):
+            enforce_sharding[out_dimchar] = mesh_dim
+    elif _is_out_variant_op(op_schema.op):
+        out_spec = cast(DTensorSpec, op_schema.kwargs_schema["out"])
+        for out_dimchar, mesh_dim in zip(out_dimchars, out_spec.dim_map):
+            enforce_sharding[out_dimchar] = mesh_dim
+
+    return einop_rule(
+        fmt,
+        op_schema,
+        linearity=linearity,
+        enforce_sharding=enforce_sharding,
+    )
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/conv_ops.py b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/conv_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6f901f4d2edc7ac52aa14d3c966a9ce42c41544
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/conv_ops.py
@@ -0,0 +1,108 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# implement convolution related ops for distributed tensor
+from typing import List
+
+import torch
+from torch.distributed._tensor.op_schema import OpSchema, OutputSharding
+from torch.distributed._tensor.ops.utils import register_prop_rule
+from torch.distributed._tensor.placement_types import DTensorSpec, TensorMeta
+
+aten = torch.ops.aten
+
+
+@register_prop_rule(aten.convolution.default)
+def convolution_rules(op_schema: OpSchema) -> OutputSharding:
+    (
+        input_spec,
+        weight_spec,
+        bias_spec,
+        stride,
+        padding,
+        dilation,
+        transposed,
+        output_padding,
+        groups,
+    ) = op_schema.args_schema
+
+    assert isinstance(input_spec, DTensorSpec)
+    assert isinstance(weight_spec, DTensorSpec)
+    assert isinstance(bias_spec, DTensorSpec)
+    assert input_spec.tensor_meta is not None
+    assert weight_spec.tensor_meta is not None
+    in_shape = input_spec.tensor_meta.shape
+    weight_shape = weight_spec.tensor_meta.shape
+    assert isinstance(stride, List)
+    assert isinstance(padding, List)
+    assert isinstance(dilation, List)
+    assert isinstance(weight_shape, torch.Size)
+    N, C_in, H_in, W_in = in_shape[0], in_shape[1], in_shape[2], in_shape[3]
+    C_out = weight_shape[0]
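+    # standard conv2d output size: floor((in + 2*pad - dilation*(kernel-1) - 1) / stride) + 1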
+    H_out = (H_in + 2 * padding[0] - dilation[0] * (weight_shape[2] - 1) - 1) // stride[
+        0
+    ] + 1
+    W_out = (W_in + 2 * padding[1] - dilation[1] * (weight_shape[3] - 1) - 1) // stride[
+        1
+    ] + 1
+    output_shape = [N, C_out, H_out, W_out]
+    output_stride = (C_out * H_out * W_out, H_out * W_out, W_out, 1)
+    output_dim_map = input_spec.dim_map
+    pending_sums = input_spec.sums
+
+    tensor_meta = TensorMeta(
+        torch.Size(output_shape),
+        output_stride,
+        input_spec.tensor_meta.dtype,
+    )
+    return OutputSharding(
+        DTensorSpec.from_dim_map(
+            input_spec.mesh,
+            output_dim_map,
+            pending_sums,
+            tensor_meta=tensor_meta,
+        )
+    )
+
+
+@register_prop_rule(aten.convolution_backward.default)
+def convolution_backward_rules(op_schema: OpSchema) -> OutputSharding:
+    input_spec = op_schema.args_schema[0]
+    (
+        grad_output_spec,
+        input_spec,
+        weight_spec,
+        bias_shape_opt,
+        stride,
+        padding,
+        dilation,
+        transposed,
+        output_padding,
+        groups,
+        output_mask,
+    ) = op_schema.args_schema
+
+    assert isinstance(grad_output_spec, DTensorSpec)
+    assert isinstance(input_spec, DTensorSpec)
+    assert isinstance(weight_spec, DTensorSpec)
+    assert isinstance(bias_shape_opt, List)
+    assert input_spec.tensor_meta is not None
+    weight_tensor_meta = weight_spec.tensor_meta
+    bias_tensor_meta = TensorMeta(
+        torch.Size(bias_shape_opt),
+        (1,),
+        input_spec.tensor_meta.dtype,
+    )
+
+    grad_input_spec = input_spec
+    grad_weight_spec = DTensorSpec.from_dim_map(
+        input_spec.mesh,
+        [-1, -1, -1, -1],
+        [0],
+        tensor_meta=weight_tensor_meta,
+    )
+    grad_bias_spec = DTensorSpec.from_dim_map(
+        input_spec.mesh,
+        [-1],
+        [0],
+        tensor_meta=bias_tensor_meta,
+    )
+    return OutputSharding([grad_input_spec, grad_weight_spec, grad_bias_spec])
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/embedding_ops.py b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/embedding_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a8dcd9679884c10851dd69db5bbde52598ed8fd
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/embedding_ops.py
@@ -0,0 +1,313 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# implement embedding related ops for distributed tensor
+import itertools
+from dataclasses import dataclass, field
+from typing import cast, List, Optional
+
+import torch
+import torch.distributed._functional_collectives as funcol
+from torch.distributed._tensor.op_schema import (
+    OpSchema,
+    OpStrategy,
+    PlacementStrategy,
+    StrategyType,
+)
+from torch.distributed._tensor.ops.utils import (
+    generate_redistribute_costs,
+    is_tensor_shardable,
+    register_op_strategy,
+)
+
+from torch.distributed._tensor.placement_types import (
+    _Partial,
+    DTensorSpec,
+    Placement,
+    Replicate,
+    Shard,
+)
+
+from torch.distributed.device_mesh import DeviceMesh
+
+aten = torch.ops.aten
+
+
+@dataclass
+class MaskBuffer:
+    data: Optional[torch.Tensor] = None
+
+    def materialize_mask(self, mask):
+        if self.data is not None:
+            raise RuntimeError("MaskBuffer has already been materialized")
+        self.data = mask
+
+    def release_mask(self):
+        # TODO: evaluate if we need to release the mask buffer or the buffer
+        # can just have the same lifetime as the _Partial placement
+        if self.data is None:
+            raise RuntimeError("MaskBuffer has not been materialized")
+        self.data = None
+
+    def apply_mask(self, tensor):
+        if self.data is None:
+            raise RuntimeError("MaskBuffer has not been materialized")
+
+        # NOTE: _MaskPartial is being used by the embedding op and the gather op.
+        # For gather, the mask has the same dimension as the output tensor, whereas
+        # the output of the embedding op has an additional dimension compared to the input,
+        # hence the output masking logic below has two different cases.
+        if tensor.ndim == self.data.ndim:
+            tensor[self.data] = 0.0
+        else:
+            tensor[self.data, :] = 0.0
+
+
+@dataclass(frozen=True)
+class _MaskPartial(_Partial):
+    """
+    A partial mask placement devised for the rowwise sharded embedding op, where we need
+    to mask and adjust the indices to the local embedding shard; embedding masking
+    is a special type of the Partial placement.
+
+    NOTE: the lifecycle of this MaskPartial placement follows the corresponding DTensor
+    lifecycle, i.e. the indices_mask would only be alive during the lifetime of the DTensor.
+    """
+
+    logical_dim_size: int = -1
+    mask_buffer: MaskBuffer = field(default_factory=MaskBuffer)
+
+    def _partition_value(
+        self, tensor: torch.Tensor, mesh: DeviceMesh, mesh_dim: int
+    ) -> torch.Tensor:
+        # override parent logic to perform partial mask for embedding
+        num_chunks = mesh.size(mesh_dim)
+        # get local shard size and offset on the embedding_dim
+        local_shard_size, local_offset_on_dim = Shard._local_shard_size_on_dim(
+            self.logical_dim_size,
+            num_chunks,
+            mesh.get_local_rank(mesh_dim),
+            return_offset=True,
+        )
+        # Build the input mask and save it for the current partial placement;
+        # this is so that the output of the embedding op can reuse the mask saved on
+        # the same partial placement to perform mask + reduction
+        mask = (tensor < local_offset_on_dim) | (
+            tensor >= local_offset_on_dim + local_shard_size
+        )
+        # mask the input tensor
+        masked_tensor = tensor.clone() - local_offset_on_dim
+        masked_tensor[mask] = 0
+        # materialize the mask buffer to be used for reduction
+        self.mask_buffer.materialize_mask(mask)
+        return masked_tensor
+
+    def _reduce_value(
+        self, tensor: torch.Tensor, mesh: DeviceMesh, mesh_dim: int
+    ) -> torch.Tensor:
+        # by the time we need the reduction, we should have already saved the mask
+        assert self.mask_buffer.data is not None
+
+        # apply the mask to the tensor that is pending reduction
+        self.mask_buffer.apply_mask(tensor)
+
+        # clear the mask buffer
+        self.mask_buffer.release_mask()
+
+        # perform sum reduction
+        return funcol.all_reduce(
+            tensor, reduceOp=self.reduce_op.name, group=(mesh, mesh_dim)
+        )
+
+    def _reduce_shard_value(
+        self,
+        tensor: torch.Tensor,
+        mesh: DeviceMesh,
+        mesh_dim: int,
+        shard_spec: Placement,
+    ) -> torch.Tensor:
+        # by the time we need reduction, we should have already saved the mask
+        assert self.mask_buffer.data is not None
+
+        # apply the mask to the tensor that is pending reduction
+        self.mask_buffer.apply_mask(tensor)
+
+        # clear the mask buffer
+        self.mask_buffer.release_mask()
+
+        # call reduce_shard_tensor of the shard_spec.
+        shard_spec = cast(Shard, shard_spec)
+        return shard_spec._reduce_shard_tensor(tensor, mesh, self.reduce_op, mesh_dim)
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, _MaskPartial):
+            return False
+
+        # if either data is not None, we invalidate the sharding cache, as this indicates
+        # the current MaskPartial placement is still in use and should not be used for a cache hit.
+        if self.mask_buffer.data is not None or other.mask_buffer.data is not None:
+            return False
+
+        return (
+            self.reduce_op == other.reduce_op
+            and self.logical_dim_size == other.logical_dim_size
+        )
+
+    def __hash__(self) -> int:
+        return 1 + hash(
+            (self.logical_dim_size, id(self.mask_buffer.data), self.reduce_op)
+        )
+
+    def __repr__(self) -> str:
+        """
+        machine readable representation of the MaskPartial placement
+        """
+        return f"_MaskPartial(logical_dim_size={self.logical_dim_size})"
+
+    def __str__(self) -> str:
+        """
+        human readable representation of the MaskPartial placement
+        """
+        return "MaskP"
+
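+# Illustrative sketch (assuming a 1-D mesh with 2 ranks and logical_dim_size=8, so rank 0
+# owns embedding rows [0, 4) and rank 1 owns rows [4, 8)): for indices = tensor([1, 5]),
+#   rank 0: mask = [False, True], masked local indices = [1, 0]
+#   rank 1: mask = [True, False], masked local indices = [0, 1]
+# Each rank looks up its local weight shard with the masked indices, apply_mask() zeroes
+# the rows that were out of range locally, and the all_reduce in _reduce_value sums the
+# two partial outputs into the final embedding result.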
+
+@register_op_strategy(aten.embedding.default)
+def embedding_strategy(mesh: DeviceMesh, op_schema: OpSchema) -> StrategyType:
+    """
+    This strategy handles the embedding op. For each mesh dimension we enumerate
+    replication, colwise sharding, rowwise sharding (via _MaskPartial), and batch-dim
+    sharding.
+    """
+    weight_strategy = cast(OpStrategy, op_schema.args_schema[0])
+    indices_strategy = cast(OpStrategy, op_schema.args_schema[1])
+
+    weight_shape = weight_strategy.output_shape
+    indices_shape = indices_strategy.output_shape
+    output_emd_dim = len(indices_shape)
+
+    all_mesh_dim_strategies = []
+
+    for mesh_dim in range(mesh.ndim):
+        single_mesh_dim_strategies = []
+
+        # placement list stores placements of [output, weight, input_indices]
+        # first we always have replicate all for inputs and output
+        all_replicate: List[Placement] = [Replicate()] * 3
+        single_mesh_dim_strategies.append(all_replicate)
+
+        # colwise sharding, output shard on last dim, weight shard on dim 1, input replicate
+        colwise_sharding = [Shard(output_emd_dim), Shard(1), Replicate()]
+        single_mesh_dim_strategies.append(colwise_sharding)
+
+        # rowwise sharding, output is embedding partial, weight shard on dim 0, input accepts embedding partial
+        embedding_partial_placement = _MaskPartial(logical_dim_size=weight_shape[0])
+
+        # NOTE: we want to reuse the same mask partial placement so that we can reuse the
+        # same mask generated from the input indices for the output reduction
+        rowwise_sharding = [
+            embedding_partial_placement,
+            Shard(0),
+            embedding_partial_placement,
+        ]
+        single_mesh_dim_strategies.append(rowwise_sharding)
+
+        # batch dim sharding, weight replicated, input can shard on any dim, output follows input
+        for input_dim in range(len(indices_shape)):
+            batch_sharding = [Shard(input_dim), Replicate(), Shard(input_dim)]
+            single_mesh_dim_strategies.append(batch_sharding)
+
+        all_mesh_dim_strategies.append(single_mesh_dim_strategies)
+
+    strategy_combs = itertools.product(*all_mesh_dim_strategies)
+
+    all_strategies = []
+    for strategy_comb in strategy_combs:
+        spec_list = []
+        for specs in zip(*strategy_comb):
+            spec_list.append(DTensorSpec(mesh, tuple(specs)))
+
+        if is_tensor_shardable(weight_shape, spec_list[1]) and is_tensor_shardable(
+            indices_shape, spec_list[2]
+        ):
+            # only add to the strategy list when both weight and indices are shardable
+            weight_spec, indices_spec = spec_list[1:]
+            redistribute_cost = [
+                generate_redistribute_costs(weight_strategy, weight_spec),
+                generate_redistribute_costs(indices_strategy, indices_spec),
+            ]
+            strat = PlacementStrategy(
+                output_specs=spec_list[0],
+                input_specs=spec_list[1:],
+                redistribute_cost=redistribute_cost,
+            )
+            all_strategies.append(strat)
+
+    return OpStrategy(all_strategies)
+
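+# Illustrative sketch (hypothetical sizes): for a 2-D mesh and 2-D indices, each mesh
+# dimension contributes 5 candidate placement lists above (replicate-all, colwise,
+# rowwise, and one batch sharding per indices dim), so itertools.product enumerates
+# 5 * 5 = 25 combinations before the shardability check filters out the invalid ones.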
+
+@register_op_strategy(aten.embedding_dense_backward.default)
+def embedding_dense_backward_strategy(
+    mesh: DeviceMesh, op_schema: OpSchema
+) -> StrategyType:
+    """
+    This strategy handles the embedding dense backward op. We have two possible
+    embedding shardings: rowwise and colwise.
+    # TODO: implement rowwise sharding backward
+    """
+    grad_out_strategy = cast(OpStrategy, op_schema.args_schema[0])
+    indices_strategy = cast(OpStrategy, op_schema.args_schema[1])
+
+    grad_out_shape = grad_out_strategy.output_shape
+    indices_shape = indices_strategy.output_shape
+    grad_out_ndim = len(grad_out_shape)
+
+    all_mesh_dim_strategies = []
+
+    for mesh_dim in range(mesh.ndim):
+        single_mesh_dim_strategies = []
+
+        # placement list stores placements of [weight_grad (output), grad_out, input_indices]
+        # first we always have replicate all for inputs and output
+        all_replicate: List[Placement] = [Replicate()] * 3
+        single_mesh_dim_strategies.append(all_replicate)
+
+        # colwise sharding backward, grad_out shard on last dim, input replicate,
+        # weight grad shard colwise
+        colwise_sharding = [Shard(1), Shard(grad_out_ndim - 1), Replicate()]
+        single_mesh_dim_strategies.append(colwise_sharding)
+
+        # batch dim sharding: grad_out/input share the same sharding, which can be on
+        # any input dim, and the weight grad becomes partial
+        for input_dim in range(len(indices_shape)):
+            batch_sharding = [_Partial(), Shard(input_dim), Shard(input_dim)]
+            single_mesh_dim_strategies.append(batch_sharding)
+
+        # grad_out partial, input replicate, weight grad keep partial
+        partial_sharding = [_Partial(), _Partial(), Replicate()]
+        single_mesh_dim_strategies.append(partial_sharding)
+
+        all_mesh_dim_strategies.append(single_mesh_dim_strategies)
+
+    strategy_combs = itertools.product(*all_mesh_dim_strategies)
+
+    all_strategies = []
+    for strategy_comb in strategy_combs:
+        spec_list = []
+        for specs in zip(*strategy_comb):
+            spec_list.append(DTensorSpec(mesh, tuple(specs)))
+
+        if is_tensor_shardable(grad_out_shape, spec_list[1]) and is_tensor_shardable(
+            indices_shape, spec_list[2]
+        ):
+            # only add to the strategy list when both grad_out and indices are shardable
+            grad_out_spec, indices_spec = spec_list[1:]
+            redistribute_cost = [
+                generate_redistribute_costs(grad_out_strategy, grad_out_spec),
+                generate_redistribute_costs(indices_strategy, indices_spec),
+            ]
+            strat = PlacementStrategy(
+                output_specs=spec_list[0],
+                input_specs=spec_list[1:],
+                redistribute_cost=redistribute_cost,
+            )
+            all_strategies.append(strat)
+
+    return OpStrategy(all_strategies)
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/experimental_ops.py b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/experimental_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..c011ba28381280556ac03a923ddf5ddd95c14137
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/experimental_ops.py
@@ -0,0 +1,49 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# implement experimental ops (slice_backward, bernoulli) for distributed tensor
+from typing import List
+
+try:
+    import numpy as np
+except ModuleNotFoundError:
+    np = None  # type: ignore[assignment]
+
+import torch
+from torch.distributed._tensor.op_schema import OpSchema, OutputSharding
+from torch.distributed._tensor.ops.utils import register_prop_rule
+from torch.distributed._tensor.placement_types import DTensorSpec, TensorMeta
+
+aten = torch.ops.aten
+
+
+@register_prop_rule(aten.slice_backward.default)
+def slice_backward_rules(op_schema: OpSchema) -> OutputSharding:
+    grad_output_spec, input_sizes, dim, start, end, step = op_schema.args_schema
+    assert isinstance(grad_output_spec, DTensorSpec)
+    assert isinstance(input_sizes, List)
+    assert grad_output_spec.tensor_meta is not None
+    grad_input_stride = list(np.cumprod(input_sizes[::-1])[:-1][::-1])
+    grad_input_stride.append(1)
+    dim_map = grad_output_spec.dim_map
+    sums = grad_output_spec.sums
+
+    grad_input_tensor_meta = TensorMeta(
+        torch.Size(input_sizes),
+        tuple(grad_input_stride),
+        grad_output_spec.tensor_meta.dtype,
+    )
+    grad_input_spec = DTensorSpec.from_dim_map(
+        grad_output_spec.mesh,
+        dim_map,
+        sums,
+        tensor_meta=grad_input_tensor_meta,
+    )
+
+    return OutputSharding(grad_input_spec)
+
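+# Illustrative sketch: for input_sizes = [2, 3, 4], input_sizes[::-1] is [4, 3, 2], its
+# cumprod is [4, 12, 24], dropping the last entry and reversing gives [12, 4], and
+# appending 1 yields [12, 4, 1], i.e. the contiguous strides that grad_input_stride
+# reconstructs for the grad_input tensor meta.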
+
+@register_prop_rule(aten.bernoulli.default)
+@register_prop_rule(aten.bernoulli_.float)
+def bernoulli_rules(op_schema: OpSchema) -> OutputSharding:
+    input_spec = op_schema.args_schema[0]
+    assert isinstance(input_spec, DTensorSpec)
+    return OutputSharding(input_spec)
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/math_ops.py b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/math_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..b76a08038875da5847db32ae8bb45cde25c25b92
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/math_ops.py
@@ -0,0 +1,957 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+from dataclasses import dataclass
+from enum import Enum
+from typing import cast, List, Optional, Sequence, Tuple, Union
+
+import torch
+
+import torch.distributed.distributed_c10d as c10d
+from torch.distributed._tensor.op_schema import (
+    OpSchema,
+    OpStrategy,
+    PlacementStrategy,
+    RuntimeSchemaInfo,
+    TupleStrategy,
+)
+from torch.distributed._tensor.ops.utils import (
+    as_list,
+    generate_redistribute_costs,
+    is_tensor_evenly_shardable,
+    normalize_dim,
+    normalize_dims,
+    normalize_to_torch_size,
+    register_op_strategy,
+)
+from torch.distributed._tensor.placement_types import (
+    _Partial,
+    DTensorSpec,
+    Placement,
+    Replicate,
+    Shard,
+)
+from torch.distributed.device_mesh import DeviceMesh
+
+
+aten = torch.ops.aten
+
+
+class Reduction(Enum):
+    NONE = 0
+    MEAN = 1
+    SUM = 2
+
+
+@dataclass(frozen=True)
+class NormReduction:
+    norm_type: Union[int, float, str]
+
+
+ReductionOpType = Union[NormReduction, c10d.ReduceOp.RedOpType]
+
+
+@dataclass(frozen=True)
+class _NormPartial(_Partial):
+    """
+    This placement is used for partial vector norm.
+
+    For p-norms (where p is not inf or -inf), the p-norm over n elements computes
+        (sum_i x_i^p)^(1/p)
+    where the sum is from i=1 to n. The reduction op is the p-norm itself.
+    For example, consider 2 ranks, a (4,) tensor sharded on dim-0, and the 2-norm:
+        Rank 0: [t1, t2] | Rank 1: [t3, t4]
+    After computing the local 2-norm on each shard (partial placement):
+        Rank 0: [sqrt(t1^2 + t2^2)] | Rank 1: [sqrt(t3^2 + t4^2)]
+    Converting from partial to replicate should ultimately yield:
+        Rank 0/1: [sqrt(t1^2 + t2^2 + t3^2 + t4^2)]
+    This can be achieved by computing the 2-norm over each rank's local result. This
+    holds similarly for the inf and -inf norms. For the 0-norm, the reduction op is sum.
+    """
+
+    norm_type: Union[int, float, str] = 2
+
+    def __post_init__(self):
+        """Set the appropriate reduce op based on the norm type."""
+        # Use `object.__setattr__` to bypass frozen checks
+        if self.norm_type in (float("inf"), "inf"):
+            object.__setattr__(self, "reduce_op", c10d.ReduceOp.MAX)
+        elif self.norm_type in (float("-inf"), "-inf"):
+            object.__setattr__(self, "reduce_op", c10d.ReduceOp.MIN)
+        elif isinstance(self.norm_type, (int, float)):
+            object.__setattr__(self, "reduce_op", c10d.ReduceOp.SUM)
+        else:
+            raise NotImplementedError(f"Unsupported norm type: {self.norm_type}")
+
+    def _partition_value(
+        self, tensor: torch.Tensor, mesh: DeviceMesh, mesh_dim: int
+    ) -> torch.Tensor:
+        if self.reduce_op in (c10d.ReduceOp.MAX, c10d.ReduceOp.MIN):
+            return tensor
+        elif self.reduce_op == c10d.ReduceOp.SUM:
+            return tensor / mesh.size(mesh_dim=mesh_dim)
+        raise NotImplementedError(self.reduce_op)
+
+    def _reduce_shard_value(
+        self,
+        tensor: torch.Tensor,
+        mesh: DeviceMesh,
+        mesh_dim: int,
+        shard_spec: Placement,
+    ) -> torch.Tensor:
+        assert isinstance(shard_spec, Shard), f"{shard_spec}"
+        tensor = self._pre_reduce_transform(tensor)
+        reduced_tensor = super()._reduce_shard_value(tensor, mesh, mesh_dim, shard_spec)
+        return self._post_reduce_transform(reduced_tensor)
+
+    def _reduce_value(
+        self, tensor: torch.Tensor, mesh: DeviceMesh, mesh_dim: int
+    ) -> torch.Tensor:
+        tensor = self._pre_reduce_transform(tensor)
+        reduced_tensor = super()._reduce_value(tensor, mesh, mesh_dim)
+        return self._post_reduce_transform(reduced_tensor)
+
+    def _pre_reduce_transform(self, tensor: torch.Tensor) -> torch.Tensor:
+        if self.reduce_op == c10d.ReduceOp.SUM:
+            assert isinstance(self.norm_type, (int, float)), f"{self.norm_type}"
+            if self.norm_type != 0 and self.norm_type != 1:
+                return tensor**self.norm_type
+        return tensor
+
+    def _post_reduce_transform(self, tensor: torch.Tensor) -> torch.Tensor:
+        if self.reduce_op == c10d.ReduceOp.SUM:
+            assert isinstance(self.norm_type, (int, float)), f"{self.norm_type}"
+            if self.norm_type != 0 and self.norm_type != 1:
+                return tensor ** (1.0 / self.norm_type)
+        return tensor
+
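+# Illustrative sketch (2 ranks, 2-norm, local shards [t1, t2] and [t3, t4]): each rank
+# first holds its local norm sqrt(t1^2 + t2^2) or sqrt(t3^2 + t4^2);
+# _pre_reduce_transform raises it back to the power p (undoing the root), the SUM
+# all-reduce yields t1^2 + t2^2 + t3^2 + t4^2 on every rank, and _post_reduce_transform
+# applies ** (1 / p) to recover the global norm. For the inf/-inf norms the MAX/MIN
+# reduce op is already associative, so no pre/post transform is needed.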
+
+def _infer_reduction_dims(dims_arg: object, ndim: int) -> Optional[List[int]]:
+    if dims_arg is None:
+        return None
+    dims = cast(List[int], as_list(dims_arg))
+    dims = cast(List[int], normalize_dims(dims, ndim))
+    empty_dims = [[0], [-1], []]
+    if ndim == 0 and dims_arg in empty_dims:
+        return None
+    return dims
+
+
+def _infer_reduce_dims_map(
+    reduction_dims: List[int], input_ndim: int, keep_dim=False
+) -> List[int]:
+    reduction_dims_map = []
+    new_dim_count = 0
+    for input_dim in range(input_ndim):
+        if input_dim in reduction_dims and not keep_dim:
+            # if input dim in reduction dims, mark it as -1
+            reduction_dims_map.append(-1)
+        else:
+            # otherwise mark it as the new dim
+            reduction_dims_map.append(new_dim_count)
+            new_dim_count += 1
+
+    return reduction_dims_map
+
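+# Illustrative sketch: for input_ndim=3 and reduction_dims=[1], keep_dim=False maps the
+# input dims to [0, -1, 1] (dim 1 collapses and the later dims shift down), while
+# keep_dim=True maps them to [0, 1, 2] since no output dim disappears.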
+
+def replicate_reduction_dims(
+    placements: Tuple[Placement, ...], reduction_dims: List[int]
+) -> Tuple[Placement, ...]:
+    # replicate the reduction dims if not reduction_linear
+    new_placements: List[Placement] = []
+
+    for p in placements:
+        if p.is_partial():
+            new_placements.append(Replicate())
+        elif isinstance(p, Shard) and p.dim in reduction_dims:
+            new_placements.append(Replicate())
+        else:
+            new_placements.append(p)
+
+    return tuple(new_placements)
+
+
+def map_placements_after_reduction(
+    placements: Tuple[Placement, ...],
+    reduction_dims: List[int],
+    reduction_dims_map: List[int],
+    reduction_op: ReductionOpType,
+) -> Tuple[Placement, ...]:
+    """
+    Map each placement based on the output shape after reduction.
+    """
+    new_placements: List[Placement] = []
+    for placement in placements:
+        if isinstance(placement, (Replicate, _Partial)):
+            new_placements.append(placement)
+        else:
+            assert isinstance(placement, Shard)
+            shard_dim = placement.dim
+            new_shard_dim = reduction_dims_map[shard_dim]
+            if new_shard_dim == -1 or shard_dim in reduction_dims:
+                # if the new shard dim collapsed or it's in the reduction dims
+                # (i.e. for the case where keepdim=True), we generate partial
+                new_placements.append(get_placement_from_reduction_op(reduction_op))
+            else:
+                new_placements.append(Shard(new_shard_dim))
+    return tuple(new_placements)
+
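+# Illustrative sketch: reducing dims=[1] (keep_dim=False) on a 2-D tensor placed as
+# (Shard(0), Shard(1)) over a 2-D mesh uses reduction_dims_map=[0, -1], so Shard(0)
+# stays Shard(0) while Shard(1) collapses into the partial placement produced by
+# get_placement_from_reduction_op below (e.g. _Partial(SUM) for a sum reduction).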
+
+def get_placement_from_reduction_op(reduction_op: ReductionOpType) -> Placement:
+    if isinstance(reduction_op, NormReduction):
+        return _NormPartial(norm_type=reduction_op.norm_type)
+    return _Partial(reduction_op)
+
+
+def common_reduction_strategy(
+    mesh: DeviceMesh,
+    input_strategy: OpStrategy,
+    reduce_dims: List[int],
+    keep_dim: bool = False,
+    reduction_linear: bool = True,
+    reduction_op: ReductionOpType = c10d.ReduceOp.SUM,
+) -> OpStrategy:
+    """
+    reduction_linear means that the reduction `f` follows this rule:
+        f([f(a), f(b)]) = f([a, b])
+
+    Reduction-linearity is a superset of linearity.
+    """
+    # by default follow reduction input strategy
+    reduction_strategy = OpStrategy([])
+
+    for strtg in input_strategy.strategies:
+        if not reduction_linear:
+            # input placements for this strategy should clear out pending sum and sharding
+            # on the reduction dimension
+            input_placements = replicate_reduction_dims(
+                strtg.output_spec.placements, reduce_dims
+            )
+        else:
+            input_placements = strtg.output_spec.placements
+
+        input_spec = DTensorSpec(
+            mesh=mesh,
+            placements=input_placements,
+            tensor_meta=strtg.output_spec.tensor_meta,
+        )
+
+        reduce_dims_map = _infer_reduce_dims_map(reduce_dims, input_spec.ndim, keep_dim)
+        out_placements = map_placements_after_reduction(
+            input_spec.placements, reduce_dims, reduce_dims_map, reduction_op
+        )
+        redistribute_cost = [generate_redistribute_costs(input_strategy, input_spec)]
+        reduction_strategy.strategies.append(
+            PlacementStrategy(
+                output_specs=DTensorSpec(
+                    mesh=mesh,
+                    placements=out_placements,
+                ),
+                input_specs=(input_spec,),
+                redistribute_cost=redistribute_cost,
+            )
+        )
+
+    return reduction_strategy
+
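+# Illustrative sketch: sum is reduction-linear, since summing the per-shard sums equals
+# the sum over the whole tensor, so sharded or partial inputs can be kept as-is and the
+# mesh dims sharded on reduced tensor dims simply become partial in the output.
+# Variance is not reduction-linear (per-shard variances cannot be combined by applying
+# var again), which is why var_reduction_strategy below passes reduction_linear=False
+# and forces the reduction dims to be replicated first.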
+
+LINEAR_REDUCTION_OP_MAP = {
+    aten.all.default: c10d.ReduceOp.SUM,
+    aten.all.dim: c10d.ReduceOp.SUM,
+    aten.sum.default: c10d.ReduceOp.SUM,
+    aten.sum.dim_IntList: c10d.ReduceOp.SUM,
+    aten.prod.default: c10d.ReduceOp.PRODUCT,
+    aten.prod.dim_int: c10d.ReduceOp.PRODUCT,
+    aten.prod.int_out: c10d.ReduceOp.PRODUCT,
+    aten.mean.default: c10d.ReduceOp.AVG,
+    aten.mean.dim: c10d.ReduceOp.AVG,
+    aten.mean.out: c10d.ReduceOp.AVG,
+    aten.max.default: c10d.ReduceOp.MAX,
+    aten.max.dim: c10d.ReduceOp.MAX,
+    aten.max.out: c10d.ReduceOp.MAX,
+    aten.min.default: c10d.ReduceOp.MIN,
+    aten.min.dim: c10d.ReduceOp.MIN,
+    aten.min.out: c10d.ReduceOp.MIN,
+}
+
+
+@register_op_strategy(
+    list(LINEAR_REDUCTION_OP_MAP.keys()), schema_info=RuntimeSchemaInfo(1)
+)
+def linear_reduction_strategy(mesh: DeviceMesh, op_schema: OpSchema) -> OpStrategy:
+    args_schema = op_schema.args_schema
+    input_strategy = args_schema[0]
+    assert isinstance(input_strategy, OpStrategy)
+    dims = None
+    if len(op_schema.args_schema) > 1:
+        dims = _infer_reduction_dims(args_schema[1], input_strategy.output_ndim)
+
+    reduce_dims = list(range(input_strategy.output_ndim)) if dims is None else dims
+
+    keep_dim = len(op_schema.args_schema) > 2 and bool(op_schema.args_schema[2])
+    reduction_op = LINEAR_REDUCTION_OP_MAP[op_schema.op]
+    return common_reduction_strategy(
+        mesh,
+        input_strategy,
+        reduce_dims,
+        keep_dim=keep_dim,
+        reduction_linear=True,
+        reduction_op=reduction_op,
+    )
+
+
+@register_op_strategy(
+    [aten.var.correction, aten.var.correction_out],
+    schema_info=RuntimeSchemaInfo(1, ["keepdim"]),
+)
+def var_reduction_strategy(mesh: DeviceMesh, op_schema: OpSchema) -> OpStrategy:
+    args_schema = op_schema.args_schema
+    input_strategy = args_schema[0]
+    assert isinstance(input_strategy, OpStrategy)
+    dims = None
+    if len(op_schema.args_schema) > 1:
+        dims = _infer_reduction_dims(args_schema[1], input_strategy.output_ndim)
+
+    reduce_dims = list(range(input_strategy.output_ndim)) if dims is None else dims
+
+    keep_dim = cast(bool, op_schema.kwargs_schema.get("keepdim", False))
+    return common_reduction_strategy(
+        mesh, input_strategy, reduce_dims, keep_dim=keep_dim, reduction_linear=False
+    )
+
+
+@register_op_strategy(
+    [aten.linalg_vector_norm.default], schema_info=RuntimeSchemaInfo(1)
+)
+def vector_norm_strategy(mesh: DeviceMesh, op_schema: OpSchema) -> OpStrategy:
+    args_schema = op_schema.args_schema
+    input_strategy = args_schema[0]
+    assert isinstance(input_strategy, OpStrategy)
+    norm_type = args_schema[1] if len(args_schema) > 1 else 2
+    assert isinstance(norm_type, (int, float, str)), f"{norm_type}"
+    dim = args_schema[2] if len(args_schema) > 2 else None
+    keepdim = args_schema[3] if len(args_schema) > 3 else False
+    dims = _infer_reduction_dims(dim, input_strategy.output_ndim)
+    reduce_dims = list(range(input_strategy.output_ndim)) if dims is None else dims
+    return common_reduction_strategy(
+        mesh,
+        input_strategy,
+        reduce_dims,
+        keep_dim=cast(bool, keepdim),
+        reduction_linear=True,
+        reduction_op=NormReduction(norm_type),
+    )
+
+
+@register_op_strategy(
+    [aten._foreach_norm.Scalar], schema_info=RuntimeSchemaInfo(1, needs_pytree=True)
+)
+def foreach_norm_strategy(mesh: DeviceMesh, op_schema: OpSchema) -> TupleStrategy:
+    args_schema = op_schema.args_schema
+    input_tuple_strategy = args_schema[0]
+    assert isinstance(input_tuple_strategy, TupleStrategy)
+    norm_type = args_schema[1]
+    assert isinstance(norm_type, (int, float, str)), f"{norm_type}"
+    output_tuple_strategy_childs: List[OpStrategy] = []
+    for op_strategy in input_tuple_strategy.childs:
+        assert isinstance(op_strategy, OpStrategy), f"{op_strategy}"
+        reduce_dims = list(range(op_strategy.output_ndim))
+        output_strategy = common_reduction_strategy(
+            mesh,
+            op_strategy,
+            reduce_dims,
+            reduction_linear=True,
+            reduction_op=NormReduction(norm_type),
+        )
+        output_tuple_strategy_childs.append(output_strategy)
+    return TupleStrategy(output_tuple_strategy_childs)
+
+
+@register_op_strategy(
+    [aten._log_softmax.default, aten._softmax.default], schema_info=RuntimeSchemaInfo(1)
+)
+def softmax_strategy(mesh: DeviceMesh, op_schema: OpSchema) -> OpStrategy:
+    input_strategy, softmax_dim, _ = op_schema.args_schema
+    input_strategy = cast(OpStrategy, input_strategy)
+    softmax_dim = cast(int, softmax_dim)
+    softmax_dim = normalize_dim(softmax_dim, input_strategy.output_ndim)
+
+    output_strategy = OpStrategy([])
+    for idx, input_placement_strategy in enumerate(input_strategy.strategies):
+        redistribute_costs = []
+        input_src_spec = input_placement_strategy.output_spec
+
+        # make sure input is replicated along the softmax dim
+        input_target_spec = DTensorSpec(
+            mesh=mesh,
+            placements=replicate_reduction_dims(
+                input_src_spec.placements, [softmax_dim]
+            ),
+            tensor_meta=input_src_spec.tensor_meta,
+        )
+        redistribute_costs.append(
+            generate_redistribute_costs(input_strategy, input_target_spec)
+        )
+        output_target_spec = input_target_spec
+        output_strategy.strategies.append(
+            PlacementStrategy(
+                output_specs=output_target_spec,
+                input_specs=[input_target_spec],
+                redistribute_cost=redistribute_costs,
+            )
+        )
+
+    return output_strategy
+
+
+@register_op_strategy(
+    [
+        aten._log_softmax_backward_data.default,
+        aten._softmax_backward_data.default,
+    ],
+    schema_info=RuntimeSchemaInfo(2),
+)
+def softmax_backward_strategy(mesh: DeviceMesh, op_schema: OpSchema) -> OpStrategy:
+    grad_out_strategy, out_strategy, softmax_dim, _ = op_schema.args_schema
+    grad_out_strategy = cast(OpStrategy, grad_out_strategy)
+    out_strategy = cast(OpStrategy, out_strategy)
+    softmax_dim = cast(int, softmax_dim)
+    softmax_dim = normalize_dim(softmax_dim, grad_out_strategy.output_ndim)
+
+    grad_in_strategy = OpStrategy([])
+    for grad_out_placement_strat, out_placement_strat in zip(
+        grad_out_strategy.strategies, out_strategy.strategies
+    ):
+        # follow the sharding of the grad_out or out depending on which has more shards
+        grad_out_src_spec = grad_out_placement_strat.output_spec
+        out_src_spec = out_placement_strat.output_spec
+        src_spec = (
+            grad_out_src_spec
+            if grad_out_src_spec.num_shards >= out_src_spec.num_shards
+            else out_src_spec
+        )
+
+        # make sure inputs are replicated along the softmax dim
+        tgt_spec = DTensorSpec(
+            mesh=mesh,
+            placements=replicate_reduction_dims(src_spec.placements, [softmax_dim]),
+        )
+        redist_grad_out_cost = generate_redistribute_costs(grad_out_strategy, tgt_spec)
+        redist_out_cost = generate_redistribute_costs(out_strategy, tgt_spec)
+        grad_in_strategy.strategies.append(
+            PlacementStrategy(
+                output_specs=tgt_spec,
+                redistribute_cost=[redist_grad_out_cost, redist_out_cost],
+            )
+        )
+
+    return grad_in_strategy
+
+
+@register_op_strategy(
+    [aten.nll_loss_forward.default, aten.nll_loss2d_forward.default],
+    schema_info=RuntimeSchemaInfo(3),
+)
+def nll_loss_forward_strategy(mesh: DeviceMesh, op_schema: OpSchema) -> OpStrategy:
+    assert len(op_schema.args_schema) == 5
+    (
+        input_strategy,
+        target_strategy,
+        weight_strategy,
+        reduction,
+        _,
+    ) = op_schema.args_schema
+    input_strategy = cast(OpStrategy, input_strategy)
+    target_strategy = cast(OpStrategy, target_strategy)
+    reduction = cast(int, reduction)
+
+    input_shape = input_strategy.output_shape
+    channel_dim = 1 if len(input_shape) >= 2 else 0
+
+    output_strategy = OpStrategy([])
+    for idx, input_placement_strategy in enumerate(input_strategy.strategies):
+        op_args_target_specs = []
+        redistribute_costs = []
+
+        # make sure input is replicated along the channel dim
+        input_src_spec = input_placement_strategy.output_spec
+        input_expected_spec = DTensorSpec(
+            mesh=mesh,
+            placements=replicate_reduction_dims(
+                input_src_spec.placements, [channel_dim]
+            ),
+            tensor_meta=input_src_spec.tensor_meta,
+        )
+        op_args_target_specs.append(input_expected_spec)
+        redistribute_costs.append(
+            generate_redistribute_costs(input_strategy, input_expected_spec)
+        )
+
+        # target doesn't have channel dim, and it follows input on other dims
+        target_src_spec = target_strategy.strategies[idx].output_spec
+        target_expected_spec = DTensorSpec(
+            mesh=mesh,
+            placements=_skip_dim(input_expected_spec.placements, channel_dim),
+            tensor_meta=target_src_spec.tensor_meta,
+        )
+        op_args_target_specs.append(target_expected_spec)
+        redistribute_costs.append(
+            generate_redistribute_costs(target_strategy, target_expected_spec)
+        )
+
+        # weight tensor, if given, has to be a Tensor of size input_shape[channel_dim]
+        # make sure it is replicated
+        if weight_strategy is not None:
+            assert isinstance(weight_strategy, OpStrategy)
+            weight_src_spec = weight_strategy.strategies[idx].output_spec
+            weight_expected_spec = DTensorSpec(
+                mesh=mesh,
+                placements=_replicate_dims_start_at(weight_src_spec.placements),
+                tensor_meta=weight_src_spec.tensor_meta,
+            )
+            op_args_target_specs.append(weight_expected_spec)
+            redistribute_costs.append(
+                generate_redistribute_costs(weight_strategy, weight_expected_spec)
+            )
+
+        if reduction == Reduction.NONE.value:
+            output_expected_spec = target_expected_spec
+            total_weight_expected_spec = DTensorSpec(
+                mesh=mesh, placements=tuple([Replicate()] * mesh.ndim)
+            )
+        else:
+            if reduction == Reduction.MEAN.value:
+                reduction_op = c10d.ReduceOp.AVG
+                if not is_tensor_evenly_shardable(
+                    target_expected_spec.shape, target_expected_spec
+                ):
+                    raise ValueError(
+                        "The intermediate results of nll_loss cannot be evenly sharded, \
+                        resulting in biased mean result."
+                    )
+            else:  # reduction == Reduction.SUM.value:
+                reduction_op = c10d.ReduceOp.SUM
+            reduce_dims = list(range(target_expected_spec.ndim))
+            reduce_dims_map = _infer_reduce_dims_map(
+                reduce_dims, target_expected_spec.ndim, keep_dim=False
+            )
+            out_placements = map_placements_after_reduction(
+                target_expected_spec.placements,
+                reduce_dims,
+                reduce_dims_map,
+                reduction_op,
+            )
+            output_expected_spec = DTensorSpec(
+                mesh=mesh,
+                placements=out_placements,
+            )
+
+            # whether reduction is sum or mean, the total weight has to be summed up if not replicated
+            total_weight_placements = map_placements_after_reduction(
+                target_expected_spec.placements,
+                reduce_dims,
+                reduce_dims_map,
+                c10d.ReduceOp.SUM,
+            )
+            total_weight_expected_spec = DTensorSpec(
+                mesh=mesh,
+                placements=total_weight_placements,
+            )
+
+        output_strategy.strategies.append(
+            PlacementStrategy(
+                output_specs=(output_expected_spec, total_weight_expected_spec),
+                input_specs=op_args_target_specs,
+                redistribute_cost=redistribute_costs,
+            )
+        )
+
+    return output_strategy
+
+
+@register_op_strategy(
+    [aten.nll_loss_backward.default, aten.nll_loss2d_backward.default],
+    schema_info=RuntimeSchemaInfo(4),
+)
+def nll_loss_backward_strategy(mesh: DeviceMesh, op_schema: OpSchema) -> OpStrategy:
+    assert len(op_schema.args_schema) == 7
+    (
+        grad_out_strategy,
+        input_strategy,
+        target_strategy,
+        weight_strategy,
+        reduction,
+        _,
+        total_weight_strategy,
+    ) = op_schema.args_schema
+    grad_out_strategy = cast(OpStrategy, grad_out_strategy)
+    input_strategy = cast(OpStrategy, input_strategy)
+    target_strategy = cast(OpStrategy, target_strategy)
+    reduction = cast(int, reduction)
+    total_weight_strategy = cast(OpStrategy, total_weight_strategy)
+
+    input_shape = input_strategy.output_shape
+    channel_dim = 1 if len(input_shape) >= 2 else 0
+
+    grad_in_strategy = OpStrategy([])
+    for idx, input_placement_strategy in enumerate(input_strategy.strategies):
+        op_args_target_specs = []
+        redistribute_costs = []
+
+        # make sure input is replicated along the channel dim
+        input_src_spec = input_placement_strategy.output_spec
+        input_expected_spec = DTensorSpec(
+            mesh=mesh,
+            placements=replicate_reduction_dims(
+                input_src_spec.placements, [channel_dim]
+            ),
+            tensor_meta=input_src_spec.tensor_meta,
+        )
+        op_args_target_specs.append(input_expected_spec)
+        redistribute_costs.append(
+            generate_redistribute_costs(input_strategy, input_expected_spec)
+        )
+
+        # target doesn't have channel dim, and it follows input on other dims
+        target_src_spec = target_strategy.strategies[idx].output_spec
+        target_expected_spec = DTensorSpec(
+            mesh=mesh,
+            placements=_skip_dim(input_expected_spec.placements, channel_dim),
+            tensor_meta=target_src_spec.tensor_meta,
+        )
+        op_args_target_specs.append(target_expected_spec)
+        redistribute_costs.append(
+            generate_redistribute_costs(target_strategy, target_expected_spec)
+        )
+
+        # grad_out follows target if there is no reduction;
+        # otherwise, it should be a replicated scalar.
+        grad_out_src_spec = grad_out_strategy.strategies[idx].output_spec
+        if reduction == Reduction.NONE.value:
+            grad_out_expected_spec = target_expected_spec
+        else:
+            grad_out_expected_spec = DTensorSpec(
+                mesh=mesh,
+                placements=_replicate_dims_start_at(grad_out_src_spec.placements),
+                tensor_meta=grad_out_src_spec.tensor_meta,
+            )
+        op_args_target_specs.insert(0, grad_out_expected_spec)
+        redistribute_costs.insert(
+            0, generate_redistribute_costs(grad_out_strategy, grad_out_expected_spec)
+        )
+
+        # weight tensor, if given, has to be a Tensor of size input_shape[channel_dim]
+        # make sure it is replicated
+        if weight_strategy is not None:
+            assert isinstance(weight_strategy, OpStrategy)
+            weight_src_spec = weight_strategy.strategies[idx].output_spec
+            weight_expected_spec = DTensorSpec(
+                mesh=mesh,
+                placements=_replicate_dims_start_at(weight_src_spec.placements),
+                tensor_meta=weight_src_spec.tensor_meta,
+            )
+            op_args_target_specs.append(weight_expected_spec)
+            redistribute_costs.append(
+                generate_redistribute_costs(weight_strategy, weight_expected_spec)
+            )
+
+        # total_weight should always be replicated
+        total_weight_src_spec = total_weight_strategy.strategies[idx].output_spec
+        total_weight_expected_spec = DTensorSpec(
+            mesh=mesh,
+            placements=_replicate_dims_start_at(total_weight_src_spec.placements),
+            tensor_meta=total_weight_src_spec.tensor_meta,
+        )
+        op_args_target_specs.append(total_weight_expected_spec)
+        redistribute_costs.append(
+            generate_redistribute_costs(
+                total_weight_strategy, total_weight_expected_spec
+            )
+        )
+
+        grad_in_expected_spec = input_expected_spec
+        grad_in_strategy.strategies.append(
+            PlacementStrategy(
+                output_specs=grad_in_expected_spec,
+                input_specs=op_args_target_specs,
+                redistribute_cost=redistribute_costs,
+            )
+        )
+
+    return grad_in_strategy
+
+
+@register_op_strategy(
+    [aten.native_layer_norm.default],
+    schema_info=RuntimeSchemaInfo(1),
+)
+def layer_norm_strategy(mesh: DeviceMesh, op_schema: OpSchema) -> OpStrategy:
+    # args must be: input, normalized_shape, weight, bias, eps
+    # when weight or bias is not provided, the corresponding arg is
+    # None. layer_norm_strategy returns one OpStrategy
+    # for the triple return values (out, mean, rstd).
+    assert len(op_schema.args_schema) == 5
+    (
+        input_strategy,
+        normalized_shape,
+        weight_strategy,
+        bias_strategy,
+        _,
+    ) = op_schema.args_schema
+
+    # the current layer norm implementation requires that all
+    # input DTensor's sharding must be in form of OpStrategy
+    assert isinstance(input_strategy, OpStrategy)
+    assert isinstance(normalized_shape, (int, Sequence, torch.Size))
+    normalized_size = normalize_to_torch_size(normalized_shape)
+
+    input_ndim = input_strategy.output_ndim
+    axis = input_ndim - len(normalized_size)
+
+    # we use OpStrategy because the output (out, mean, rstd)
+    # should have the same placements
+    output_strategy = OpStrategy([])
+    for idx, input_placement_strategy in enumerate(input_strategy.strategies):
+        op_args_target_specs = []
+        redistribute_costs = []
+        input_src_spec = input_placement_strategy.output_spec
+
+        # for the input tensor, we replicate it on the inner dims if necessary
+        # TODO: we can avoid forcing the redistribution once we figure out
+        # how to decompose layer norm
+        input_target_spec = DTensorSpec(
+            mesh=mesh,
+            placements=_replicate_dims_start_at(input_src_spec.placements, axis),
+            tensor_meta=input_src_spec.tensor_meta,
+        )
+        op_args_target_specs.append(input_target_spec)
+        redistribute_costs.append(
+            generate_redistribute_costs(input_strategy, input_target_spec)
+        )
+
+        if weight_strategy is not None:
+            assert isinstance(weight_strategy, OpStrategy)
+            weight_src_spec = weight_strategy.strategies[idx].output_spec
+
+            # for the weight tensor, we replicate it on all dims if necessary
+            # TODO: we can avoid forcing the redistribution once we figure out
+            # how to decompose layer norm
+            weight_target_spec = DTensorSpec(
+                mesh=mesh,
+                placements=_replicate_dims_start_at(weight_src_spec.placements),
+                tensor_meta=weight_src_spec.tensor_meta,
+            )
+            op_args_target_specs.append(weight_target_spec)
+            redistribute_costs.append(
+                generate_redistribute_costs(weight_strategy, weight_target_spec)
+            )
+
+        if bias_strategy is not None:
+            assert isinstance(bias_strategy, OpStrategy)
+            bias_src_spec = bias_strategy.strategies[idx].output_spec
+
+            # for the bias tensor, we replicate it on all dims if necessary
+            # TODO: we can avoid forcing the redistribution once we figure out
+            # how to decompose layer norm
+            bias_target_spec = DTensorSpec(
+                mesh=mesh,
+                placements=_replicate_dims_start_at(bias_src_spec.placements),
+                tensor_meta=bias_src_spec.tensor_meta,
+            )
+            op_args_target_specs.append(bias_target_spec)
+            redistribute_costs.append(
+                generate_redistribute_costs(bias_strategy, bias_target_spec)
+            )
+
+        # the output spec is the same as input spec
+        output_target_spec = input_target_spec
+        output_strategy.strategies.append(
+            PlacementStrategy(
+                output_specs=output_target_spec,
+                input_specs=op_args_target_specs,
+                redistribute_cost=redistribute_costs,
+            )
+        )
+
+    return output_strategy
+
+
+@register_op_strategy(
+    [aten.native_layer_norm_backward.default],
+    schema_info=RuntimeSchemaInfo(2),
+)
+def layer_norm_bwd_strategy(mesh: DeviceMesh, op_schema: OpSchema) -> OpStrategy:
+    # args must be: grad_out, input, normalized_shape, mean, rstd,
+    # weight, bias, output_mask. When weight or bias is not provided,
+    # the corresponding arg is None.
+    assert len(op_schema.args_schema) == 8
+    (
+        grad_out_strategy,
+        input_strategy,
+        normalized_shape,
+        mean_strategy,
+        rstd_strategy,
+        weight_strategy,
+        bias_strategy,
+        output_mask,
+    ) = op_schema.args_schema
+
+    assert isinstance(grad_out_strategy, OpStrategy)
+    assert isinstance(input_strategy, OpStrategy)
+    assert isinstance(mean_strategy, OpStrategy)
+    assert isinstance(rstd_strategy, OpStrategy)
+
+    assert isinstance(normalized_shape, (int, Sequence, torch.Size))
+    normalized_size = normalize_to_torch_size(normalized_shape)
+    input_ndim = input_strategy.output_ndim
+    axis = input_ndim - len(normalized_size)
+    outer_dims = list(range(axis))
+
+    assert isinstance(output_mask, List) and len(output_mask) == 3
+
+    # output triple: (d_input, d_weight, d_bias)
+    out_tuple_strategy = OpStrategy([])
+    for idx, input_placement_strategy in enumerate(input_strategy.strategies):
+        # args for PlacementStrategy
+        output_specs_list: List[Optional[DTensorSpec]] = []
+        op_args_target_specs = []
+        redistribute_costs = []
+
+        input_src_spec = input_placement_strategy.output_spec
+        # arg: grad_out
+        # TODO: change the strategy to the following rule.
+        # d_input is basically a product of element-wise mul of
+        # grad_out, rstd, and normalized input, among which rstd
+        # and normalized input (x_hat) should have the same sharding
+        # placements, and grad_out's sharding is determined by the
+        # pointwise result of x_hat and weight/bias.
+        if output_mask[0]:
+            # TODO: now grad_out spec follows input spec. we may need
+            # to change it to apply a pointwise rule over grad_out,
+            # input, and weight.
+            grad_out_target_spec = DTensorSpec(
+                mesh=mesh,
+                placements=_replicate_dims_start_at(input_src_spec.placements, axis),
+                tensor_meta=input_src_spec.tensor_meta,
+            )
+            op_args_target_specs.append(grad_out_target_spec)
+            redistribute_costs.append(
+                generate_redistribute_costs(grad_out_strategy, grad_out_target_spec)
+            )
+            output_specs_list.append(grad_out_target_spec)
+        else:
+            output_specs_list.append(None)
+
+        # arg: input
+        input_target_spec = DTensorSpec(
+            mesh=mesh,
+            placements=_replicate_dims_start_at(input_src_spec.placements, axis),
+            tensor_meta=input_src_spec.tensor_meta,
+        )
+        op_args_target_specs.append(input_target_spec)
+        redistribute_costs.append(
+            generate_redistribute_costs(input_strategy, input_target_spec)
+        )
+
+        # arg: mean, rstd
+        mean_src_spec = mean_strategy.strategies[idx].output_spec
+        op_args_target_specs.append(mean_src_spec)
+        redistribute_costs.append([0.0 for _ in mean_strategy.strategies])
+        rstd_src_spec = rstd_strategy.strategies[idx].output_spec
+        op_args_target_specs.append(rstd_src_spec)
+        redistribute_costs.append([0.0 for _ in rstd_strategy.strategies])
+
+        # arg: weight
+        # d_weight = sum(grad_out * (input - mean) * rstd, outer_dim, keepdim=False)
+        if output_mask[1]:
+            assert isinstance(weight_strategy, OpStrategy)
+            weight_src_spec = weight_strategy.strategies[idx].output_spec
+            # no need to redistribute weight since it should be replicated
+            # in the forward pass
+            op_args_target_specs.append(weight_src_spec)
+            redistribute_costs.append([0.0 for _ in weight_strategy.strategies])
+            # TODO: now d_weight spec follows input spec w/ a reduction.
+            # we may need to change to a pointwise rule over grad_out and
+            # input, then apply a reduction.
+            inp_placements = _replicate_dims_start_at(input_src_spec.placements, axis)
+            reduce_dims_map = _infer_reduce_dims_map(
+                outer_dims, input_src_spec.ndim, False
+            )
+            out_placements = map_placements_after_reduction(
+                inp_placements, outer_dims, reduce_dims_map, c10d.ReduceOp.SUM
+            )
+            output_specs_list.append(
+                DTensorSpec(
+                    mesh=mesh,
+                    placements=out_placements,
+                    tensor_meta=weight_src_spec.tensor_meta,
+                )
+            )
+        else:
+            output_specs_list.append(None)
+
+        # arg: bias
+        # d_bias = sum(grad_out, outer_dim, keepdim=False)
+        if output_mask[2]:
+            assert isinstance(bias_strategy, OpStrategy)
+            bias_src_spec = bias_strategy.strategies[idx].output_spec
+            # no need to redistribute bias since it should be replicated
+            # in the forward pass
+            op_args_target_specs.append(bias_src_spec)
+            redistribute_costs.append([0.0 for _ in bias_strategy.strategies])
+            # Currently we do not support the case where output_mask[0] is False while
+            # output_mask[2] is True. It would be easy to support by accessing
+            # grad_out_spec via a local variable rather than through output_specs_list;
+            # we just have not seen that case in practice.
+            grad_out_spec = output_specs_list[0]
+            assert isinstance(grad_out_spec, DTensorSpec)
+            # d_bias spec follows a reduction over grad_out
+            inp_placements = _replicate_dims_start_at(grad_out_spec.placements, axis)
+            reduce_dims_map = _infer_reduce_dims_map(
+                outer_dims, grad_out_spec.ndim, False
+            )
+            out_placements = map_placements_after_reduction(
+                inp_placements, outer_dims, reduce_dims_map, c10d.ReduceOp.SUM
+            )
+            output_specs_list.append(
+                DTensorSpec(
+                    mesh=mesh,
+                    placements=out_placements,
+                    tensor_meta=bias_src_spec.tensor_meta,
+                )
+            )
+        else:
+            output_specs_list.append(None)
+
+        out_tuple_strategy.strategies.append(
+            PlacementStrategy(
+                output_specs=tuple(output_specs_list),
+                input_specs=op_args_target_specs,
+                redistribute_cost=redistribute_costs,
+            )
+        )
+
+    return out_tuple_strategy
+
+
+def _replicate_dims_start_at(
+    placements: Sequence[Placement], start_dim: int = 0
+) -> Tuple[Placement, ...]:
+    new_placements: List[Placement] = []
+    for p in placements:
+        if p.is_partial() or (isinstance(p, Shard) and p.dim >= start_dim):
+            new_placements.append(Replicate())  # make it replicate
+        else:
+            new_placements.append(p)  # keep the placement
+    return tuple(new_placements)
+
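+# Illustrative sketch: _replicate_dims_start_at((Shard(0), Shard(2)), start_dim=1)
+# returns (Shard(0), Replicate()), since only shards on dims >= start_dim (and any
+# partial placements) are forced back to Replicate.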
+
+# return new_placements which align with placements but skip the skipped_dim
+def _skip_dim(
+    placements: Tuple[Placement, ...], skipped_dim: int
+) -> Tuple[Placement, ...]:
+    new_placements: List[Placement] = []
+    for p in placements:
+        if isinstance(p, Shard) and p.dim >= skipped_dim:
+            new_placements.append(Shard(p.dim - 1))
+        else:
+            new_placements.append(p)
+    return tuple(new_placements)
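+# Illustrative sketch: _skip_dim((Shard(0), Shard(2)), skipped_dim=1) returns
+# (Shard(0), Shard(1)), re-indexing shards past the skipped dim so that, for example,
+# the nll_loss target placements line up with an input that is sharded across a channel
+# dim the target does not have.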
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/matrix_ops.py b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/matrix_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac9f718ec6afc2d7f76acf0457a0654bb365d9f3
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/matrix_ops.py
@@ -0,0 +1,226 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# implement matrix related ops for distributed tensor
+import itertools
+from typing import List, Optional
+
+import torch
+from torch.distributed._tensor.op_schema import (
+    OpSchema,
+    OpStrategy,
+    OutputSharding,
+    PlacementStrategy,
+)
+from torch.distributed._tensor.ops.basic_strategy import gen_einsum_strategies
+from torch.distributed._tensor.ops.common_rules import einop_rule
+from torch.distributed._tensor.ops.utils import (
+    generate_redistribute_costs,
+    infer_broadcast_dims_map,
+    is_tensor_shardable,
+    map_placements_after_broadcast,
+    register_op_strategy,
+    register_prop_rule,
+)
+from torch.distributed._tensor.placement_types import (
+    DTensorSpec,
+    Placement,
+    Replicate,
+    Shard,
+)
+
+from torch.distributed.device_mesh import DeviceMesh
+
+aten = torch.ops.aten
+
+
+@register_prop_rule(aten.t.default)
+def transpose_rule(op_schema: OpSchema) -> OutputSharding:
+    return einop_rule("ij->ji", op_schema, linearity=True)
+
+
+def _mm_like_strategy(
+    mm_equation: str, mesh: DeviceMesh, op_schema: OpSchema
+) -> OpStrategy:
+    self_strategy, mat2_strategy = op_schema.args_schema
+    assert isinstance(self_strategy, OpStrategy)
+    assert isinstance(mat2_strategy, OpStrategy)
+    # generate all possible strategies for mm
+    mm_strategy = gen_einsum_strategies(mm_equation, mesh)
+    # filter out invalid strategies and associate costs
+    strategies = mm_strategy.strategies
+    filtered_strategies = []
+    for strtg in strategies:
+        assert strtg.input_specs is not None
+        self_spec = strtg.input_specs[0]
+        mat2_spec = strtg.input_specs[1]
+        if is_tensor_shardable(
+            self_strategy.output_shape, self_spec
+        ) and is_tensor_shardable(mat2_strategy.output_shape, mat2_spec):
+            redistribute_cost = [
+                generate_redistribute_costs(self_strategy, self_spec),
+                generate_redistribute_costs(mat2_strategy, mat2_spec),
+            ]
+            strtg.redistribute_cost = redistribute_cost
+            filtered_strategies.append(strtg)
+
+    mm_strategy.strategies = filtered_strategies
+
+    return mm_strategy
+
+
+def _addmm_like_strategy(
+    mm_equation: str, mesh: DeviceMesh, op_schema: OpSchema
+) -> OpStrategy:
+    self_strategy, mat1_strategy, mat2_strategy = op_schema.args_schema
+    assert isinstance(self_strategy, OpStrategy)
+    assert isinstance(mat1_strategy, OpStrategy)
+    assert isinstance(mat2_strategy, OpStrategy)
+    self_shape = self_strategy.output_shape
+    mm_out_shape = torch.Size(
+        [
+            mat2_strategy.output_shape[-1]
+            if i == len(mat1_strategy.output_shape) - 1
+            else dim_size
+            for i, dim_size in enumerate(mat1_strategy.output_shape)
+        ]
+    )
+    # generate all possible strategies for mm
+    mm_strategy = gen_einsum_strategies(mm_equation, mesh)
+    # filter out invalid strategies and associate costs
+    strategies = mm_strategy.strategies
+    filtered_strategies = []
+    for strtg in strategies:
+        # construct a new strategy by considering the self arg
+        assert strtg.input_specs is not None
+        mat1_spec = strtg.input_specs[0]
+        mat2_spec = strtg.input_specs[1]
+        out_spec = strtg.output_spec
+
+        # the self arg's spec should follow the output of mm, but we need
+        # to account for broadcasting of the self arg
+        broadcast_dims_map = infer_broadcast_dims_map(mm_out_shape, self_shape)
+        self_placements = map_placements_after_broadcast(
+            out_spec.placements, mm_out_shape, broadcast_dims_map
+        )
+        self_spec = DTensorSpec(mesh=mesh, placements=self_placements)
+
+        if is_tensor_shardable(
+            mat1_strategy.output_shape, mat1_spec
+        ) and is_tensor_shardable(mat2_strategy.output_shape, mat2_spec):
+            # update input specs with new self spec
+            strtg.input_specs = (self_spec, mat1_spec, mat2_spec)
+
+            # associate costs
+            redistribute_cost = [
+                generate_redistribute_costs(self_strategy, self_spec),
+                generate_redistribute_costs(mat1_strategy, mat1_spec),
+                generate_redistribute_costs(mat2_strategy, mat2_spec),
+            ]
+            strtg.redistribute_cost = redistribute_cost
+            filtered_strategies.append(strtg)
+
+    mm_strategy.strategies = filtered_strategies
+
+    return mm_strategy
+
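+# Illustrative sketch (hypothetical shapes): for addmm with mat1 of shape (4, 3) and
+# mat2 of shape (3, 5), mm_out_shape is (4, 5); a self arg of shape (5,) broadcasts
+# against it, and infer_broadcast_dims_map / map_placements_after_broadcast translate
+# the mm output placements into a compatible placement for self (roughly, Shard(1) on
+# the (4, 5) output corresponds to Shard(0) on the 1-D self).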
+
+@register_op_strategy(aten.mm.default)
+def mm_strategy(mesh: DeviceMesh, op_schema: OpSchema) -> OpStrategy:
+    return _mm_like_strategy("mk,kn->mn", mesh, op_schema)
+
+
+@register_op_strategy(aten.addmm.default)
+def addmm_strategy(mesh: DeviceMesh, op_schema: OpSchema) -> OpStrategy:
+    return _addmm_like_strategy("mk,kn->mn", mesh, op_schema)
+
+
+@register_op_strategy(aten.bmm.default)
+def bmm_strategy(mesh: DeviceMesh, op_schema: OpSchema) -> OpStrategy:
+    return _mm_like_strategy("bmk,bkn->bmn", mesh, op_schema)
+
+
+@register_op_strategy(aten.baddbmm.default)
+def baddmm_strategy(mesh: DeviceMesh, op_schema: OpSchema) -> OpStrategy:
+    return _addmm_like_strategy("bmk,bkn->bmn", mesh, op_schema)
+
+
+@register_op_strategy(aten._scaled_dot_product_flash_attention.default)
+def scaled_dot_product_attention_strategy(
+    mesh: DeviceMesh, op_schema: OpSchema
+) -> OpStrategy:
+    # NOTE: currently we only support a few simple strategies for tensor parallelism
+    # TODO: sdpa might be a good candidate for exploring decomposed sharding propagation
+    # as it involves matmul, pointwise, and reduction ops together.
+    return_debug_mask = len(op_schema.args_schema) >= 6 and op_schema.args_schema[5]
+    q_input_strategy = op_schema.args_schema[0]
+    assert isinstance(q_input_strategy, OpStrategy)
+    # q/k/v have the same shape
+    qkv_shape = q_input_strategy.output_shape
+
+    all_mesh_dim_strategies = []
+
+    for mesh_dim in range(mesh.ndim):
+        single_mesh_dim_strategies = []
+
+        # placement list stores placements of [outputs, inputs]
+        # in the sdpa case, we have 3 valid tensor outputs and 3 tensor inputs
+        # first we can always accept full replication for inputs and output
+        all_replicate: List[Placement] = [Replicate()] * 6
+        single_mesh_dim_strategies.append(all_replicate)
+
+        # second we can accept the sharding pattern of tensor parallelism, which
+        # shards on the num-heads dim
+        qkv_sharding = Shard(1)  # num head dim
+        output_sharding = Shard(1)  # num head dim
+        logsumexp_sharding = Shard(1)  # num head dim
+        if return_debug_mask:
+            debug_attn_mask_sharding: Placement = Shard(1)  # num head dim
+        else:
+            # empty debug mask, replicated
+            debug_attn_mask_sharding = Replicate()
+
+        num_heads_dim_sharding = [
+            output_sharding,
+            logsumexp_sharding,
+            debug_attn_mask_sharding,
+            qkv_sharding,
+            qkv_sharding,
+            qkv_sharding,
+        ]
+        single_mesh_dim_strategies.append(num_heads_dim_sharding)
+
+        all_mesh_dim_strategies.append(single_mesh_dim_strategies)
+
+    strategy_combs = itertools.product(*all_mesh_dim_strategies)
+
+    all_strategies = []
+    for strategy_comb in strategy_combs:
+        spec_list = []
+        for specs in zip(*strategy_comb):
+            spec_list.append(DTensorSpec(mesh, tuple(specs)))
+
+        assert len(spec_list) == 6
+        input_expected_specs = spec_list[3:]
+        output_specs: List[Optional[DTensorSpec]] = list(spec_list[:3])
+        # fix up output_specs and fill in None for the int and empty tensor return values
+        for i in range(2, 8):
+            output_specs.insert(i, None)
+        if all(is_tensor_shardable(qkv_shape, spec) for spec in input_expected_specs):
+            # only add to the strategy list when all inputs are shardable
+            redistribute_cost = []
+            for input_idx, spec in enumerate(input_expected_specs):
+                qkv_strategy = op_schema.args_schema[input_idx]
+                assert isinstance(qkv_strategy, OpStrategy)
+                qkv_tensor_meta = qkv_strategy.strategies[0].output_spec.tensor_meta
+                spec.tensor_meta = qkv_tensor_meta
+                redistribute_cost.append(
+                    generate_redistribute_costs(qkv_strategy, spec)
+                )
+
+            strat = PlacementStrategy(
+                output_specs=tuple(output_specs),
+                input_specs=tuple(input_expected_specs),
+                redistribute_cost=redistribute_cost,
+            )
+            all_strategies.append(strat)
+
+    return OpStrategy(all_strategies)
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/pointwise_ops.py b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/pointwise_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..187dd1b04a613ee69aedac27b58b05a8dd588366
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/pointwise_ops.py
@@ -0,0 +1,629 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+from typing import List, Sequence, Tuple
+
+import torch
+
+from torch.distributed._tensor.op_schema import (
+    _is_inplace_op,
+    _is_out_variant_op,
+    OpSchema,
+    OpStrategy,
+    PlacementStrategy,
+    RuntimeSchemaInfo,
+    StrategyType,
+    TupleStrategy,
+)
+
+from torch.distributed._tensor.ops.utils import (
+    generate_redistribute_costs,
+    infer_broadcast_dims_map,
+    map_placements_after_broadcast,
+    normalize_dim,
+    register_op_strategy,
+)
+from torch.distributed._tensor.placement_types import (
+    _Partial,
+    DTensorSpec,
+    Placement,
+    Replicate,
+    Shard,
+)
+from torch.distributed.device_mesh import DeviceMesh
+
+
+aten = torch.ops.aten
+# leave the remaining pointwise_ops list here for convenience:
+# the ops below are pointwise ops that are yet to be supported,
+# and they might not be a complete list.
+# pointwise_ops = [
+#     "fake_quantize_per_channel_affine",
+#     "fake_quantize_per_tensor_affine",
+#     "floor_divide",  # floor_divide is deprecated
+#     "frexp",  # multiple output pointwise op, need to add support
+#     "gradient",  #  need investigation on this op
+#     "imag",  # complex data type only
+#     "quantized_batch_norm",
+#     "quantized_max_pool1d",
+#     "quantized_max_pool2d",
+#     "real",  # complex data type only
+# ]
+
+
+linear_pointwise_ops = [
+    aten.div.Scalar,  # this op is linear on the first argument, and the second argument is scalar, so it fits as a linear op.
+    aten.div_.Scalar,  # this op is linear on the first argument, and the second argument is scalar, so it fits as a linear op.
+    aten.to.dtype,
+    aten.add.Tensor,
+    aten.add_.Tensor,
+]
+
+
+pointwise_ops = [
+    # please keep the entries below alphabetically sorted
+    aten.abs.default,
+    aten.abs.out,
+    aten.abs_.default,
+    aten.acos.default,
+    aten.acos.out,
+    aten.acos_.default,
+    aten.acosh.default,
+    aten.acosh.out,
+    aten.acosh_.default,
+    aten.add.Scalar,
+    aten.add.out,
+    aten.add_.Scalar,
+    aten.addcdiv.default,
+    aten.addcdiv.out,
+    aten.addcdiv_.default,
+    aten.addcmul.default,
+    aten.addcmul.out,
+    aten.addcmul_.default,
+    aten.angle.default,
+    aten.angle.out,
+    aten.asin.default,
+    aten.asin.out,
+    aten.asin_.default,
+    aten.asinh.default,
+    aten.asinh.out,
+    aten.asinh_.default,
+    aten.atan.default,
+    aten.atan.out,
+    aten.atan2.default,
+    aten.atan2.out,
+    aten.atan2_.default,
+    aten.atan_.default,
+    aten.atanh.default,
+    aten.atanh.out,
+    aten.atanh_.default,
+    aten.bitwise_and.Scalar,
+    aten.bitwise_and.Scalar_Tensor,
+    aten.bitwise_and.Scalar_out,
+    aten.bitwise_and.Tensor,
+    aten.bitwise_and.Tensor_out,
+    aten.bitwise_and_.Scalar,
+    aten.bitwise_and_.Tensor,
+    aten.bitwise_left_shift.Scalar_Tensor,
+    aten.bitwise_left_shift.Tensor,
+    aten.bitwise_left_shift.Tensor_Scalar,
+    aten.bitwise_left_shift.Tensor_Scalar_out,
+    aten.bitwise_left_shift.Tensor_out,
+    aten.bitwise_left_shift_.Tensor,
+    aten.bitwise_left_shift_.Tensor_Scalar,
+    aten.bitwise_not.default,
+    aten.bitwise_not.out,
+    aten.bitwise_not_.default,
+    aten.bitwise_or.Scalar,
+    aten.bitwise_or.Scalar_Tensor,
+    aten.bitwise_or.Scalar_out,
+    aten.bitwise_or.Tensor,
+    aten.bitwise_or.Tensor_out,
+    aten.bitwise_or_.Scalar,
+    aten.bitwise_or_.Tensor,
+    aten.bitwise_right_shift.Scalar_Tensor,
+    aten.bitwise_right_shift.Tensor,
+    aten.bitwise_right_shift.Tensor_Scalar,
+    aten.bitwise_right_shift.Tensor_Scalar_out,
+    aten.bitwise_right_shift.Tensor_out,
+    aten.bitwise_right_shift_.Tensor,
+    aten.bitwise_right_shift_.Tensor_Scalar,
+    aten.bitwise_xor.Scalar,
+    aten.bitwise_xor.Scalar_Tensor,
+    aten.bitwise_xor.Scalar_out,
+    aten.bitwise_xor.Tensor,
+    aten.bitwise_xor.Tensor_out,
+    aten.bitwise_xor_.Scalar,
+    aten.bitwise_xor_.Tensor,
+    aten.ceil.default,
+    aten.ceil.out,
+    aten.ceil_.default,
+    aten.clamp.default,
+    aten.clamp.out,
+    aten.clamp_.default,
+    aten.clip.default,
+    aten.clip.out,
+    aten.clip_.default,
+    aten.conj_physical.default,
+    aten.conj_physical.out,
+    aten.conj_physical_.default,
+    aten.copysign.Scalar,
+    aten.copysign.Scalar_out,
+    aten.copysign.Tensor,
+    aten.copysign.out,
+    aten.copysign_.Scalar,
+    aten.copysign_.Tensor,
+    aten.cos.default,
+    aten.cos.out,
+    aten.cos_.default,
+    aten.cosh.default,
+    aten.cosh.out,
+    aten.cosh_.default,
+    aten.deg2rad.default,
+    aten.deg2rad.out,
+    aten.deg2rad_.default,
+    aten.digamma.default,
+    aten.digamma.out,
+    aten.digamma_.default,
+    aten.div.Tensor,
+    aten.div.Tensor_mode,
+    aten.div.out,
+    aten.div.out_mode,
+    aten.div_.Tensor,
+    aten.div_.Tensor_mode,
+    aten.eq.Tensor,
+    aten.eq.Tensor_out,
+    aten.eq.Scalar,
+    aten.eq.Scalar_out,
+    aten.erf.default,
+    aten.erf.out,
+    aten.erf_.default,
+    aten.erfc.default,
+    aten.erfc.out,
+    aten.erfc_.default,
+    aten.erfinv.default,
+    aten.erfinv.out,
+    aten.erfinv_.default,
+    aten.exp.default,
+    aten.exp.out,
+    aten.exp2.default,
+    aten.exp2.out,
+    aten.exp2_.default,
+    aten.exp_.default,
+    aten.expm1.default,
+    aten.expm1.out,
+    aten.expm1_.default,
+    aten.float_power.Scalar,
+    aten.float_power.Scalar_out,
+    aten.float_power.Tensor_Scalar,
+    aten.float_power.Tensor_Scalar_out,
+    aten.float_power.Tensor_Tensor,
+    aten.float_power.Tensor_Tensor_out,
+    aten.float_power_.Scalar,
+    aten.float_power_.Tensor,
+    aten.floor.default,
+    aten.floor.out,
+    aten.floor_.default,
+    aten.fmod.Scalar,
+    aten.fmod.Scalar_out,
+    aten.fmod.Tensor,
+    aten.fmod.Tensor_out,
+    aten.fmod_.Scalar,
+    aten.fmod_.Tensor,
+    aten.frac.default,
+    aten.frac.out,
+    aten.frac_.default,
+    aten.ge.Scalar,
+    aten.ge.Tensor,
+    aten.gelu.default,
+    aten.gt.Tensor,
+    aten.gt.Tensor_out,
+    aten.gt.Scalar,
+    aten.gt.Scalar_out,
+    aten.hypot.default,
+    aten.hypot.out,
+    aten.hypot_.default,
+    aten.i0.default,
+    aten.i0.out,
+    aten.i0_.default,
+    aten.igamma.default,
+    aten.igamma.out,
+    aten.igamma_.default,
+    aten.igammac.default,
+    aten.igammac.out,
+    aten.igammac_.default,
+    aten.isnan.default,
+    aten.ldexp.default,
+    aten.ldexp.out,
+    aten.ldexp_.default,
+    aten.lt.Tensor,
+    aten.lt.Tensor_out,
+    aten.lt.Scalar,
+    aten.lt.Scalar_out,
+    aten.le.Scalar,
+    aten.le.Tensor,
+    aten.lerp.Scalar,
+    aten.lerp.Scalar_out,
+    aten.lerp.Tensor,
+    aten.lerp.Tensor_out,
+    aten.lerp_.Scalar,
+    aten.lerp_.Tensor,
+    aten.lgamma.default,
+    aten.lgamma.out,
+    aten.lgamma_.default,
+    aten.log.default,
+    aten.log.out,
+    aten.log10.default,
+    aten.log10.out,
+    aten.log10_.default,
+    aten.log1p.default,
+    aten.log1p.out,
+    aten.log1p_.default,
+    aten.log2.default,
+    aten.log2.out,
+    aten.log2_.default,
+    aten.log_.default,
+    aten.logaddexp.default,
+    aten.logaddexp.out,
+    aten.logaddexp2.default,
+    aten.logaddexp2.out,
+    aten.logical_and.default,
+    aten.logical_and.out,
+    aten.logical_and_.default,
+    aten.logical_not.default,
+    aten.logical_not.out,
+    aten.logical_not_.default,
+    aten.logical_or.default,
+    aten.logical_or.out,
+    aten.logical_or_.default,
+    aten.logical_xor.default,
+    aten.logical_xor.out,
+    aten.logical_xor_.default,
+    aten.logit.default,
+    aten.logit.out,
+    aten.logit_.default,
+    aten.masked_fill.Scalar,
+    aten.maximum.out,
+    aten.mul.Scalar,
+    aten.mul.Tensor,
+    aten.mul.out,
+    aten.mul_.Scalar,
+    aten.mul_.Tensor,
+    aten.mvlgamma.default,
+    aten.mvlgamma.out,
+    aten.mvlgamma_.default,
+    aten.native_dropout_backward.default,
+    aten.native_dropout_backward.out,
+    aten.nan_to_num.default,
+    aten.nan_to_num.out,
+    aten.nan_to_num_.default,
+    aten.ne.Scalar,
+    aten.neg.default,
+    aten.neg.out,
+    aten.neg_.default,
+    aten.nextafter.default,
+    aten.nextafter.out,
+    aten.nextafter_.default,
+    aten.polygamma.default,
+    aten.polygamma.out,
+    aten.polygamma_.default,
+    aten.positive.default,
+    aten.pow.Scalar,
+    aten.pow.Scalar_out,
+    aten.pow.Tensor_Scalar,
+    aten.pow.Tensor_Scalar_out,
+    aten.pow.Tensor_Tensor,
+    aten.pow.Tensor_Tensor_out,
+    aten.pow_.Scalar,
+    aten.pow_.Tensor,
+    aten.reciprocal.default,
+    aten.reciprocal.out,
+    aten.reciprocal_.default,
+    aten.rad2deg.default,
+    aten.rad2deg.out,
+    aten.rad2deg_.default,
+    aten.relu.default,
+    aten.relu_.default,
+    aten.remainder.Scalar,
+    aten.remainder.Scalar_Tensor,
+    aten.remainder.Scalar_out,
+    aten.remainder.Tensor,
+    aten.remainder.Tensor_out,
+    aten.remainder_.Scalar,
+    aten.remainder_.Tensor,
+    aten.round.decimals,
+    aten.round.decimals_out,
+    aten.round.default,
+    aten.round.out,
+    aten.round_.decimals,
+    aten.round_.default,
+    aten.rsqrt.default,
+    aten.rsqrt.out,
+    aten.rsqrt_.default,
+    aten.rsub.Scalar,
+    aten.sgn.default,
+    aten.sgn.out,
+    aten.sgn_.default,
+    aten.sigmoid.default,
+    aten.sigmoid.out,
+    aten.sigmoid_.default,
+    aten.sign.default,
+    aten.sign.out,
+    aten.sign_.default,
+    aten.signbit.default,
+    aten.signbit.out,
+    aten.silu.default,
+    aten.silu.out,
+    aten.sin.default,
+    aten.sin.out,
+    aten.sin_.default,
+    aten.sinc.default,
+    aten.sinc.out,
+    aten.sinc_.default,
+    aten.sinh.default,
+    aten.sinh.out,
+    aten.sinh_.default,
+    aten.sqrt.default,
+    aten.sqrt.out,
+    aten.sqrt_.default,
+    aten.square.default,
+    aten.square.out,
+    aten.square_.default,
+    aten.sub.Scalar,
+    aten.sub.Tensor,
+    aten.sub.out,
+    aten.sub_.Scalar,
+    aten.sub_.Tensor,
+    aten.tan.default,
+    aten.tan.out,
+    aten.tan_.default,
+    aten.tanh.default,
+    aten.tanh.out,
+    aten.tanh_.default,
+    aten.true_divide.Tensor,
+    aten.trunc.default,
+    aten.trunc.out,
+    aten.trunc_.default,
+    aten.where.self,
+    aten.where.self_out,
+    aten.xlogy.OutScalar_Self,
+    aten.xlogy.OutScalar_Other,
+    aten.xlogy.OutTensor,
+    aten.xlogy.Scalar_Other,
+    aten.xlogy.Scalar_Self,
+    aten.xlogy.Tensor,
+    aten.xlogy_.Scalar_Other,
+    aten.xlogy_.Tensor,
+    # backward point-wise ops
+    # please keep the entries below alphabetically sorted
+    aten.gelu_backward.default,
+    aten.sigmoid_backward.default,
+    aten.silu_backward.default,
+    aten.tanh_backward.default,
+    aten.threshold_backward.default,
+]
+
+
+def pointwise_strategy(
+    mesh: DeviceMesh, op_schema: OpSchema, linearity: bool = False
+) -> OpStrategy:
+    max_shards_strategy_index = -1
+    max_shards = -1
+
+    if _is_inplace_op(op_schema.op):
+        # inplace op should follow the first arg strategy
+        followed_strategy = op_schema.args_schema[0]
+    elif _is_out_variant_op(op_schema.op):
+        # out variant op should follow the out kwarg strategy
+        followed_strategy = op_schema.kwargs_schema["out"]
+    else:
+        # normal pointwise op, we choose to follow the arg with
+        # the max number of shards in case the operands need resharding
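+        # e.g. for aten.mul.Tensor(a, b) where a is sharded across 4 devices and b
+        # is replicated, we follow a's strategy and redistribute b to match it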
+        for idx, arg_strategy in enumerate(op_schema.args_schema):
+            if not isinstance(arg_strategy, OpStrategy):
+                continue
+
+            arg_max_shards = arg_strategy.max_num_shards()
+            if arg_max_shards > max_shards:
+                max_shards_strategy_index = idx
+                max_shards = arg_max_shards
+
+        followed_strategy = op_schema.args_schema[max_shards_strategy_index]
+
+    assert isinstance(
+        followed_strategy, OpStrategy
+    ), f"no strategy to follow for {op_schema}!"
+    return common_pointwise_strategy(
+        mesh, op_schema.args_schema, followed_strategy, linearity
+    )
+
+
+def common_pointwise_strategy(
+    mesh: DeviceMesh,
+    args_schema: Sequence[object],
+    followed_strategy: OpStrategy,
+    linearity: bool,
+) -> OpStrategy:
+    # handle broadcasting
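+    # e.g. broadcasting shapes (4, 6) and (8, 4, 6) gives common_shape (8, 4, 6); a
+    # Shard(0) placement on the followed (4, 6) operand is remapped to Shard(1) of
+    # the broadcasted output by the shard-dim adjustment below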
+    common_shape = torch.broadcast_shapes(
+        *[arg.output_shape for arg in args_schema if isinstance(arg, OpStrategy)]
+    )
+    pointwise_strategy = OpStrategy([])
+
+    for placement_strategy in followed_strategy.strategies:
+        spec_to_follow = placement_strategy.output_spec
+        out_placements: List[Placement] = []
+        for placement in spec_to_follow.placements:
+            if isinstance(placement, Shard):
+                shard_dim = normalize_dim(placement.dim, len(spec_to_follow.shape))
+                common_ndim = len(common_shape)
+                new_shard_dim = common_ndim - len(spec_to_follow.shape) + shard_dim
+                out_placements.append(Shard(new_shard_dim))
+            elif isinstance(placement, _Partial) and not linearity:
+                # clear the partial placement if the op does not support linearity;
+                # by default we just replicate the partial, need to see if this
+                # is optimal for all cases
+                out_placements.append(Replicate())
+            else:
+                out_placements.append(placement)
+
+        input_specs: List[DTensorSpec] = []
+        redistribute_costs: List[List[float]] = []
+        for idx, input_arg in enumerate(args_schema):
+            if isinstance(input_arg, OpStrategy):
+                # every arg follows the out_placements, but we need to handle broadcasting
+                input_arg_spec = input_arg.strategies[0].output_spec
+                input_arg_dims_map = infer_broadcast_dims_map(
+                    common_shape, input_arg_spec.shape
+                )
+                input_target_placements = map_placements_after_broadcast(
+                    tuple(out_placements),
+                    common_shape,
+                    input_arg_dims_map,
+                )
+                input_arg_target_spec = DTensorSpec(
+                    mesh=mesh,
+                    placements=input_target_placements,
+                    tensor_meta=input_arg_spec.tensor_meta,
+                )
+                input_specs.append(input_arg_target_spec)
+                redistribute_costs.append(
+                    generate_redistribute_costs(input_arg, input_arg_target_spec)
+                )
+
+        pointwise_strategy.strategies.append(
+            PlacementStrategy(
+                output_specs=DTensorSpec(
+                    mesh=mesh,
+                    placements=tuple(out_placements),
+                ),
+                input_specs=input_specs,
+                redistribute_cost=redistribute_costs,
+            )
+        )
+    return pointwise_strategy
+
+
+def linear_pointwise_strategy(mesh: DeviceMesh, op_schema: OpSchema) -> StrategyType:
+    """
+    Linear pointwise operators can propagate pending reductions.
+    For example, c = add(a, b); if a is pending sum, then c will be
+    pending sum as well without any communication overhead.
+    """
+    return pointwise_strategy(mesh, op_schema, linearity=True)
+
+
+for op in linear_pointwise_ops:
+    register_op_strategy(op, schema_info=RuntimeSchemaInfo(static_kwargkey=["out"]))(
+        linear_pointwise_strategy
+    )
+
+for op in pointwise_ops:
+    register_op_strategy(op, schema_info=RuntimeSchemaInfo(static_kwargkey=["out"]))(
+        pointwise_strategy
+    )
+
+
+# TODO: add all for_each ops
+for_each_ops = [
+    aten._foreach_abs_.default,
+    aten._foreach_addcdiv_.Scalar,
+    aten._foreach_addcdiv_.ScalarList,
+    aten._foreach_addcdiv_.Tensor,
+    aten._foreach_addcmul.Scalar,
+    aten._foreach_addcmul_.Scalar,
+    aten._foreach_addcmul_.ScalarList,
+    aten._foreach_addcmul_.Tensor,
+    aten._foreach_div_.List,
+    aten._foreach_div_.ScalarList,
+    aten._foreach_lerp_.Scalar,
+    aten._foreach_maximum_.List,
+    aten._foreach_mul.Scalar,
+    aten._foreach_mul.List,
+    aten._foreach_mul_.Scalar,
+    aten._foreach_mul_.ScalarList,
+    aten._foreach_mul_.Tensor,
+    aten._foreach_mul_.List,
+    aten._foreach_neg.default,
+    aten._foreach_neg_.default,
+    aten._foreach_reciprocal_.default,
+    aten._foreach_sub_.Scalar,
+    aten._foreach_sqrt.default,
+    aten._foreach_sqrt_.default,
+    aten._foreach_zero_.default,
+]
+
+for_each_linearity_ops = [
+    aten._foreach_add.Scalar,
+    aten._foreach_add_.Scalar,
+    aten._foreach_add_.ScalarList,
+    aten._foreach_add.List,
+    aten._foreach_add_.List,
+]
+
+
+def foreach_list_pointwise_strategy(
+    mesh: DeviceMesh, op_schema: OpSchema, linearity: bool = False
+) -> StrategyType:
+    """
+    Apply the pointwise strategy to the zipped arguments. For example, if we
+    run a foreach add of two lists l1 and l2, then we apply the pointwise
+    strategy on each pair (l1[i], l2[i]). If the first argument is a list but
+    the second (or later) one is a tensor, then we broadcast the tensor by
+    replicating it into a list with the length of the first argument.
+    """
+
+    def args_tuple_strategies(args_schema: Tuple[object, ...]) -> List[TupleStrategy]:
+        first_arg = args_schema[0]
+        assert isinstance(first_arg, TupleStrategy)
+        strategy_len = len(first_arg.childs)
+        tuple_strategies: List[TupleStrategy] = []
+        for arg_idx, arg in enumerate(args_schema):
+            if isinstance(arg, TupleStrategy):
+                # every tuple strategy should have the same length
+                assert len(arg.childs) == strategy_len
+                tuple_strategies.append(arg)
+            elif isinstance(arg, OpStrategy):
+                if arg_idx > 0:  # implicitly broadcast
+                    tuple_strategies.append(
+                        TupleStrategy([arg for _ in range(strategy_len)])
+                    )
+                else:
+                    raise RuntimeError(
+                        f"foreach list op only supports tuple strategy! {op_schema}"
+                    )
+        return tuple_strategies
+
+    args_strategies = args_tuple_strategies(op_schema.args_schema)
+    follow_strategy: TupleStrategy = args_strategies[0]
+    foreach_strategy_list: List[OpStrategy] = []
+    for child_idx, child_strtgy in enumerate(follow_strategy.childs):
+        assert isinstance(child_strtgy, OpStrategy)
+        args_schema: List[StrategyType] = [
+            arg_strategy.childs[child_idx] for arg_strategy in args_strategies
+        ]
+        pointwise_strategy: OpStrategy = common_pointwise_strategy(
+            mesh, args_schema, child_strtgy, linearity
+        )
+        foreach_strategy_list.append(pointwise_strategy)
+    return TupleStrategy(foreach_strategy_list)
+
+
+def foreach_list_linear_pointwise_strategy(
+    mesh: DeviceMesh, op_schema: OpSchema
+) -> StrategyType:
+    """
+    for each list op stratgy that supports linearity
+    """
+    return foreach_list_pointwise_strategy(mesh, op_schema, linearity=True)
+
+
+for op in for_each_ops:
+    register_op_strategy(op, schema_info=RuntimeSchemaInfo(needs_pytree=True))(
+        foreach_list_pointwise_strategy
+    )
+
+for op in for_each_linearity_ops:
+    register_op_strategy(op, schema_info=RuntimeSchemaInfo(needs_pytree=True))(
+        foreach_list_linear_pointwise_strategy
+    )
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/random_ops.py b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..3db7e2f4c0295e9d9c25c871065f44856c58c7bc
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/random_ops.py
@@ -0,0 +1,30 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+import torch
+from torch.distributed._tensor.op_schema import (
+    OpSchema,
+    OpStrategy,
+    PlacementStrategy,
+    StrategyType,
+)
+from torch.distributed._tensor.ops.utils import is_tensor_partial, register_op_strategy
+from torch.distributed.device_mesh import DeviceMesh
+
+aten = torch.ops.aten
+
+
+@register_op_strategy(
+    [aten.normal_.default, aten.uniform_.default, aten.native_dropout.default]
+)
+def random_op_strategy(mesh: DeviceMesh, op_schema: OpSchema) -> StrategyType:
+    self_strategy = op_schema.args_schema[0]
+    assert isinstance(self_strategy, OpStrategy)
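+    # the input placement is preserved (e.g. a Shard(0) input stays Shard(0));
+    # partial inputs are rejected below since their behavior is not yet defined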
+
+    random_strategy = OpStrategy([])
+    for arg_strategy in self_strategy.strategies:
+        arg_spec = arg_strategy.output_spec
+        if is_tensor_partial(arg_spec):
+            # TODO: figure out how inplace random op should behave when it's partial
+            raise RuntimeError(f"{op_schema.op} with _Partial is not supported yet!")
+        random_strategy.strategies.append(PlacementStrategy(output_specs=arg_spec))
+
+    return random_strategy
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/tensor_ops.py b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/tensor_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..2af0814e1778405e12b65bb91cd9972dc6bc8d28
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/tensor_ops.py
@@ -0,0 +1,826 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+import itertools
+from typing import cast, List, Optional, Sequence, Tuple
+
+import torch
+
+from torch.distributed._tensor._utils import compute_local_shape
+from torch.distributed._tensor.op_schema import (
+    OpSchema,
+    OpStrategy,
+    OutputSharding,
+    PlacementStrategy,
+    RuntimeSchemaInfo,
+    StrategyType,
+    TupleStrategy,
+)
+from torch.distributed._tensor.ops.common_rules import pointwise_rule
+from torch.distributed._tensor.ops.embedding_ops import _MaskPartial
+from torch.distributed._tensor.ops.utils import (
+    generate_redistribute_costs,
+    is_tensor_dim_sharded,
+    is_tensor_partial,
+    is_tensor_shardable,
+    normalize_dim,
+    prod,
+    register_op_strategy,
+    register_prop_rule,
+)
+from torch.distributed._tensor.placement_types import (
+    _Partial,
+    DTensorSpec,
+    Placement,
+    Replicate,
+    Shard,
+)
+from torch.distributed.device_mesh import DeviceMesh
+
+
+aten = torch.ops.aten
+
+
+def default_strategy(mesh: DeviceMesh, op_schema: OpSchema) -> StrategyType:
+    # the default strategy just propagates the first input strategy
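+    # e.g. aten.clone.default on a Shard(1) input keeps Shard(1) on the output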
+    select_strategy = op_schema.args_schema[0]
+    assert isinstance(select_strategy, OpStrategy)
+    default_strategy = []
+    for strategy in select_strategy.strategies:
+        # we create new DTensorSpecs even for the default strategy to ensure that
+        # the tensor metas are distinct between the arguments and outputs
+        default_strategy.append(
+            PlacementStrategy(
+                output_specs=DTensorSpec(
+                    mesh=strategy.output_spec.mesh,
+                    placements=strategy.output_spec.placements,
+                )
+            )
+        )
+    return OpStrategy(default_strategy)
+
+
+register_op_strategy(
+    [
+        aten.clone.default,
+        aten.contiguous.default,
+        aten.copy_.default,
+        aten.detach.default,
+        aten.fill_.Scalar,
+        aten.zero_.default,
+    ]
+)(default_strategy)
+
+register_op_strategy(
+    aten._to_copy.default, schema_info=RuntimeSchemaInfo(static_kwargkey=["dtype"])
+)(default_strategy)
+
+
+@register_op_strategy(
+    [
+        aten.equal.default,
+        aten.is_same_size.default,
+    ]
+)
+def equal_strategy(mesh: DeviceMesh, op_schema: OpSchema) -> StrategyType:
+    # equal_strategy deals with ops that compare two tensors; we need to make sure the
+    # sharding layout is the same for both operands, so we choose to follow the arg with
+    # the max num of shards. We still keep is_same_size here for completeness as they
+    # share the same strategy in theory.
+    self_strategy, other_strategy = op_schema.args_schema
+    assert isinstance(self_strategy, OpStrategy)
+    assert isinstance(other_strategy, OpStrategy)
+
+    select_strategy = (
+        self_strategy
+        if self_strategy.max_num_shards() >= other_strategy.max_num_shards()
+        else other_strategy
+    )
+    equal_strategy = OpStrategy([])
+
+    for arg_strategy in select_strategy.strategies:
+        arg_spec = arg_strategy.output_spec
+        if is_tensor_partial(arg_spec):
+            # if the arg_spec has partial placements, reshard to replicate;
+            # otherwise the local shard tensor comparison would be invalid
+            output_spec = DTensorSpec(
+                mesh=arg_spec.mesh,
+                placements=tuple(
+                    Replicate() if isinstance(p, _Partial) else p
+                    for p in arg_spec.placements
+                ),
+            )
+            equal_strategy.strategies.append(
+                PlacementStrategy(output_specs=output_spec)
+            )
+        else:
+            equal_strategy.strategies.append(PlacementStrategy(arg_spec))
+    return equal_strategy
+
+
+@register_op_strategy(
+    [
+        aten.empty_like.default,
+        aten.ones_like.default,
+        aten.rand_like.default,
+        aten.randn_like.default,
+        aten.zeros_like.default,
+    ],
+    schema_info=RuntimeSchemaInfo(1, ["dtype"]),
+)
+@register_op_strategy(
+    [aten.full_like.default],
+    schema_info=RuntimeSchemaInfo(2, ["dtype"]),
+)
+@register_op_strategy(
+    [
+        aten.randint_like.default,
+        aten.randint_like.low_dtype,
+        aten.randint_like.low_dtype_out,
+    ],
+    schema_info=RuntimeSchemaInfo(3, ["dtype"]),
+)
+def create_like_strategy(mesh: DeviceMesh, op_schema: OpSchema) -> StrategyType:
+    # create_like_strategy deals with ops that create tensors with the same
+    # shape as the input, but with specific content that does not depend on
+    # the input. We can propagate sharding, but we have to make sure we
+    # move from partial to replicated.
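+    # e.g. aten.ones_like.default on a _Partial input keeps the partial input spec but
+    # outputs Replicate; a partial output of ones would otherwise be summed across ranks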
+    select_strategy = op_schema.args_schema[0]
+    create_like_strategy = OpStrategy([])
+    assert isinstance(select_strategy, OpStrategy)
+    for arg_strategy in select_strategy.strategies:
+        arg_spec = arg_strategy.output_spec
+        if is_tensor_partial(arg_spec):
+            # if the arg_spec has partial placements, accept partial
+            # in the input_specs but output replicate for
+            # the corresponding mesh dims
+            output_spec = DTensorSpec(
+                mesh=arg_spec.mesh,
+                placements=tuple(
+                    Replicate() if isinstance(p, _Partial) else p
+                    for p in arg_spec.placements
+                ),
+            )
+            create_like_strategy.strategies.append(
+                PlacementStrategy(output_specs=output_spec, input_specs=(arg_spec,))
+            )
+
+        else:
+            create_like_strategy.strategies.append(PlacementStrategy(arg_spec))
+
+    return create_like_strategy
+
+
+@register_op_strategy(
+    [
+        aten.new_empty.default,
+        aten.new_full.default,
+        aten.new_ones.default,
+        aten.new_zeros.default,
+        aten.new_empty_strided.default,  # TODO: re-think new_empty_strided
+    ],
+    schema_info=RuntimeSchemaInfo(1, ["dtype"]),
+)
+def new_factory_strategy(mesh: DeviceMesh, op_schema: OpSchema) -> StrategyType:
+    # TODO: maybe we should generate all possible shardings instead of just staying
+    # replicated for new factory methods
+    input_strategy = op_schema.args_schema[0]
+    new_factory_strategy = OpStrategy([])
+    assert isinstance(input_strategy, OpStrategy)
+    for arg_strategy in input_strategy.strategies:
+        input_spec = arg_strategy.output_spec
+        replica_spec = DTensorSpec(mesh, tuple([Replicate()] * mesh.ndim))
+        new_factory_strategy.strategies.append(
+            PlacementStrategy(output_specs=replica_spec, input_specs=(input_spec,))
+        )
+
+    return new_factory_strategy
+
+
+@register_op_strategy(aten.bucketize.Tensor)
+def gen_bucketize_strategy(mesh: DeviceMesh, op_schema: OpSchema) -> StrategyType:
+    """Just propagate input sharding, but expect replicated for boundaries input."""
+    input_strategy = op_schema.args_schema[0]
+    bucketize_strategy = OpStrategy([])
+    assert isinstance(input_strategy, OpStrategy)
+    for arg_strategy in input_strategy.strategies:
+        arg_spec = DTensorSpec(mesh, arg_strategy.output_spec.placements)
+        replica_spec = DTensorSpec(mesh, tuple([Replicate()] * mesh.ndim))
+        bucketize_strategy.strategies.append(
+            PlacementStrategy(
+                output_specs=arg_spec, input_specs=(arg_spec, replica_spec)
+            )
+        )
+
+    return bucketize_strategy
+
+
+@register_op_strategy(aten.slice.Tensor, schema_info=RuntimeSchemaInfo(1))
+def gen_slice_strategy(mesh: DeviceMesh, op_schema: OpSchema) -> StrategyType:
+    """Forward all shardings except the slice dimension."""
+    defaults = (None, 0, None, None, 1)
+    input_strategy, dim, start, end, step = (
+        op_schema.args_schema + defaults[len(op_schema.args_schema) :]
+    )
+    assert isinstance(input_strategy, OpStrategy)
+    input_shape = input_strategy.output_shape
+    input_ndim = input_strategy.output_ndim
+    assert isinstance(dim, int)
+    if start is None:
+        start = 0
+    if end is None or end > input_shape[dim]:
+        end = input_shape[dim]
+    assert isinstance(start, int)
+    assert isinstance(end, int)
+    assert isinstance(step, int)
+
+    # normalize args
+    slice_dim = normalize_dim(dim, input_ndim)
+    start = normalize_dim(start, input_shape[dim])
+    end = normalize_dim(end, input_shape[dim])
+
+    redundant_slice = start == 0 and end == input_shape[dim] and step == 1
+
+    slice_strategy = OpStrategy([])
+
+    for arg_strategy in input_strategy.strategies:
+        arg_spec = arg_strategy.output_spec
+        if not is_tensor_dim_sharded(arg_spec, dim=slice_dim) or redundant_slice:
+            # only add the strategy if the slice dim is not sharded
+            out_spec = DTensorSpec(mesh, arg_spec.placements)
+            slice_strategy.strategies.append(PlacementStrategy(output_specs=out_spec))
+    if not slice_strategy.strategies:
+        # if all strategies are filtered out, unshard all specs on the slice dim
+        # of the input strategy and use that as the op strategy
+        for arg_strategy in input_strategy.strategies:
+            arg_spec = arg_strategy.output_spec
+            unshard_spec = DTensorSpec(
+                mesh, unshard_tensor_dim(arg_spec.placements, dim=slice_dim)
+            )
+            slice_strategy.strategies.append(
+                PlacementStrategy(output_specs=unshard_spec)
+            )
+    return slice_strategy
+
+
+def unshard_tensor_dim(
+    placements: Sequence[Placement], dim: int
+) -> Tuple[Placement, ...]:
+    """Disallow the given tensor dimension to be sharded."""
+    return tuple(
+        p if (not isinstance(p, Shard) or p.dim != dim) else Replicate()
+        for p in placements
+    )
+
+
+def replicate_tensor_dim(
+    placements: Sequence[Placement], dim: int
+) -> Tuple[Placement, ...]:
+    """Force the given tensor dimension to be replicated."""
+    # Not using p.is_shard() to avoid mypy complain about Placement not having
+    # attribute dim.
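+    # e.g. replicate_tensor_dim((Shard(0), _Partial()), dim=0) == (Replicate(), Replicate())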
+    return tuple(
+        Replicate() if p.is_partial() or isinstance(p, Shard) and p.dim == dim else p
+        for p in placements
+    )
+
+
+@register_op_strategy(aten.slice_scatter.default, schema_info=RuntimeSchemaInfo(2))
+def gen_slice_scatter_strategy(mesh: DeviceMesh, op_schema: OpSchema) -> StrategyType:
+    # 1. The number of dimensions in input and src needs to match.
+    # 2. The number of elements on all non-scatter dims needs to match between input and src.
+    # 3. The number of elements in src on the scatter dim needs to match the slice size.
+    # Given the above:
+    # - We suggest that src follow the sharding of input, except on the scatter dimension,
+    #   where our best bet for now is to make them replicated as a fall-back.
+    #   TODO: Ideally we'd like to make sure the output is re-sharded afterwards to keep input sharding.
+
+    input_strategy = op_schema.args_schema[0]
+    assert isinstance(input_strategy, OpStrategy)
+    input_ndim = input_strategy.output_ndim
+    slice_dim = (
+        cast(int, op_schema.args_schema[2]) if len(op_schema.args_schema) > 2 else 0
+    )
+    slice_dim = normalize_dim(slice_dim, input_ndim)
+
+    slice_scatter_strategy = OpStrategy([])
+    # by default follow the input strategy for both input and src
+    for arg_strategy in input_strategy.strategies:
+        arg_spec = arg_strategy.output_spec
+        if not (
+            is_tensor_dim_sharded(arg_spec, dim=slice_dim)
+            or is_tensor_partial(arg_spec)
+        ):
+            # only add the strategy if the slice_scatter dim is not sharded or partial
+            slice_scatter_strategy.strategies.append(
+                PlacementStrategy(output_specs=arg_spec)
+            )
+
+    if not slice_scatter_strategy.strategies:
+        # if all strategies are filtered out, replicate all specs on the slice_scatter dim
+        # of the input strategy and use that as the op strategy
+        for arg_strategy in input_strategy.strategies:
+            arg_spec = arg_strategy.output_spec
+            replicate_spec = DTensorSpec(
+                mesh, replicate_tensor_dim(arg_spec.placements, dim=slice_dim)
+            )
+            slice_scatter_strategy.strategies.append(
+                PlacementStrategy(output_specs=replicate_spec)
+            )
+    return slice_scatter_strategy
+
+
+@register_op_strategy(aten._local_scalar_dense.default)
+def replica_only_strategy(mesh: DeviceMesh, op_schema: OpSchema) -> StrategyType:
+    """Only allow replication on the input/output."""
+    replicate_spec = DTensorSpec(mesh, tuple([Replicate()] * mesh.ndim))
+    return OpStrategy([PlacementStrategy(replicate_spec)])
+
+
+@register_op_strategy(aten.gather.default)
+def gather_strategy(mesh: DeviceMesh, op_schema: OpSchema) -> StrategyType:
+    input_strategy = cast(OpStrategy, op_schema.args_schema[0])
+    dim = cast(int, op_schema.args_schema[1])
+    index_strategy = cast(OpStrategy, op_schema.args_schema[2])
+
+    input_shape = input_strategy.output_shape
+    index_shape = index_strategy.output_shape
+
+    all_mesh_dim_strategies = []
+
+    for mesh_dim in range(mesh.ndim):
+        single_mesh_dim_strategies = []
+
+        # placement list stores placements of [output, input, index]
+        # first we can always accept full replication for inputs and output
+        all_replicate: List[Placement] = [Replicate()] * 3
+        single_mesh_dim_strategies.append(all_replicate)
+
+        # input sharding, input sharded, index accepts mask partial, output follows index
+        # this only works when the input is sharded on the gather dimension, and
+        # index has size 1 on the gather dimension
+        if index_shape[dim] == 1:
+            index_partial_placement = _MaskPartial(logical_dim_size=input_shape[dim])
+            input_sharding = [
+                index_partial_placement,
+                Shard(dim),
+                index_partial_placement,
+            ]
+            single_mesh_dim_strategies.append(input_sharding)
+
+        # index sharding, input replicated, index sharded, output follows index
+        # this only works when the sharding dimension is the gather dimension
+        index_sharding = [Shard(dim), Replicate(), Shard(dim)]
+        single_mesh_dim_strategies.append(index_sharding)
+
+        all_mesh_dim_strategies.append(single_mesh_dim_strategies)
+
+    strategy_combs = itertools.product(*all_mesh_dim_strategies)
+
+    all_strategies = []
+    for strategy_comb in strategy_combs:
+        spec_list = []
+        for specs in zip(*strategy_comb):
+            spec_list.append(DTensorSpec(mesh, tuple(specs)))
+
+        if is_tensor_shardable(input_shape, spec_list[1]) and is_tensor_shardable(
+            index_shape, spec_list[2]
+        ):
+            input_spec, index_spec = spec_list[1:]
+            redistribute_cost = [
+                generate_redistribute_costs(input_strategy, input_spec),
+                generate_redistribute_costs(index_strategy, index_spec),
+            ]
+            strat = PlacementStrategy(
+                output_specs=spec_list[0],
+                input_specs=spec_list[1:],
+                redistribute_cost=redistribute_cost,
+            )
+            all_strategies.append(strat)
+
+    return OpStrategy(all_strategies)
+
+
+@register_op_strategy(aten.stack.default, RuntimeSchemaInfo(1, needs_pytree=True))
+def stack_strategy(mesh: DeviceMesh, op_schema: OpSchema) -> StrategyType:
+    args_schema = op_schema.args_schema
+    input_tuple_strategy = args_schema[0]
+    assert isinstance(input_tuple_strategy, TupleStrategy), f"{input_tuple_strategy}"
+    dim = cast(int, args_schema[1]) if len(args_schema) > 1 else 0
+
+    # Follow the 1st child strategy's placement strategies
+    child_strategy = input_tuple_strategy.childs[0]
+    assert isinstance(child_strategy, OpStrategy), f"{child_strategy}"
+    strategies: List[PlacementStrategy] = []
+
+    # For each arg strategy of the child to follow, we check if every other
+    # child has an equal strategy. If so, then that is a valid strategy. If
+    # there are no such valid strategies, then we replicate.
+    for arg_strategy in child_strategy.strategies:
+        arg_spec = arg_strategy.output_spec
+        # For each arg strategy (whether the one to follow or other), we
+        # replicate the stack dim since we cannot stack on a sharded dim
+        if is_tensor_dim_sharded(arg_spec, dim):
+            arg_spec = DTensorSpec(
+                mesh, unshard_tensor_dim(arg_spec.placements, dim=dim)
+            )
+        all_compatible = True
+        for other_child_strategy in input_tuple_strategy.childs[1:]:
+            has_compatible_strategy = False
+            assert isinstance(
+                other_child_strategy, OpStrategy
+            ), f"{other_child_strategy}"
+            for other_arg_strategy in other_child_strategy.strategies:
+                other_arg_spec = other_arg_strategy.output_spec
+                if is_tensor_dim_sharded(other_arg_spec, dim):
+                    other_arg_spec = DTensorSpec(
+                        mesh, unshard_tensor_dim(other_arg_spec.placements, dim=dim)
+                    )
+                if other_arg_spec.placements == arg_spec.placements:
+                    has_compatible_strategy = True
+                    break
+            if not has_compatible_strategy:
+                all_compatible = False
+                break
+        if all_compatible:
+            input_specs = tuple(
+                arg_spec for _ in range(len(input_tuple_strategy.childs))
+            )
+            strategies.append(
+                PlacementStrategy(
+                    output_specs=DTensorSpec(mesh, arg_spec.placements),
+                    input_specs=input_specs,
+                )
+            )
+    if not strategies:
+        # Arbitrarily use each child strategy's 0th strategy's output spec
+        input_specs = tuple(
+            cast(OpStrategy, child_strategy).strategies[0].output_spec
+            for child_strategy in input_tuple_strategy.childs
+        )
+        replicate_spec = DTensorSpec(mesh, tuple(Replicate() for _ in range(mesh.ndim)))
+        strategies.append(PlacementStrategy(output_specs=replicate_spec))
+    return OpStrategy(strategies)
+
+
+@register_prop_rule(aten.index_select.default, schema_info=RuntimeSchemaInfo(1))
+def prop_index_select(op_schema: OpSchema) -> OutputSharding:
+    values_spec, dim, indices_spec = op_schema.args_schema
+
+    assert isinstance(values_spec, DTensorSpec)
+    assert isinstance(dim, int)
+    assert isinstance(indices_spec, DTensorSpec)
+
+    all_indices_spec: List[Optional[DTensorSpec]] = [
+        indices_spec if dim == i else None for i in range(values_spec.ndim)
+    ]
+
+    result = prop_index(
+        OpSchema(
+            op=op_schema.op,
+            args_schema=(values_spec, all_indices_spec),
+            kwargs_schema=op_schema.kwargs_schema,
+        )
+    )
+    if result.schema_suggestions:
+        result.schema_suggestions = [
+            OpSchema(
+                op=op_schema.op,
+                args_schema=(s.args_schema[0], dim, s.args_schema[1][dim]),
+                kwargs_schema=op_schema.kwargs_schema,
+            )
+            for s in result.schema_suggestions
+        ]
+    return result
+
+
+@register_prop_rule(aten.index.Tensor, schema_info=RuntimeSchemaInfo(needs_pytree=True))
+def prop_index(op_schema: OpSchema) -> OutputSharding:
+    """
+    Expect replicated on the first input; _mostly_ pointwise on the second input.
+
+    TODO: exception: when the dtype of second input is "bool", then a torch.nonzero needs to be triggered first.
+    """
+    # Current sharding constraints:
+    # For values:
+    #   1. We currently require that the dimensions of values_spec be replicated or partial
+    #      if they are being indexed on.
+    #   2. Other dimensions of values_spec can remain sharded if they are so.
+    # For indices:
+    #   Indices can be either sharded or replicated. All index tensors need to be sharded
+    #   in a compatible way, following the pointwise rule (including resolving _Partial
+    #   into either sharded or replicated)
+
+    values_spec, multi_indices_spec = op_schema.args_schema
+    assert isinstance(values_spec, DTensorSpec)
+    assert isinstance(multi_indices_spec, list)
+    multi_indices_spec = cast(List[Optional[DTensorSpec]], multi_indices_spec)
+    valid_indices_spec: List[Tuple[int, DTensorSpec]] = [
+        (i, a) for i, a in enumerate(multi_indices_spec) if a is not None
+    ]
+
+    # 1. All indices have to be sharded equally. Moreover, indices can be broadcast.
+    #    Here, we piggyback on the pointwise sharding rule for indices.
+    indices_out = pointwise_rule(
+        OpSchema(
+            op=op_schema.op,
+            args_schema=tuple(v[1] for v in valid_indices_spec),
+            kwargs_schema={},
+        )
+    )
+    need_reshard_on_indices = indices_out.output_spec is None
+
+    if not need_reshard_on_indices:
+        # this means that our inputs are already sharded properly and we will use that as our indices_spec
+        assert isinstance(indices_out.output_spec, DTensorSpec)
+        indices_spec: DTensorSpec = indices_out.output_spec
+    else:
+        assert indices_out.schema_suggestions is not None
+        valid_indices_suggestion = indices_out.schema_suggestions[0]
+        for i, v in enumerate(valid_indices_suggestion.args_spec):
+            multi_indices_spec[valid_indices_spec[i][0]] = v
+        # we'll need to call pointwise_rule again to see what our ideal indices_spec is and then
+        # use that to compute our ideal values_spec
+        indices_output_spec = pointwise_rule(valid_indices_suggestion).output_spec
+        assert isinstance(indices_output_spec, DTensorSpec)
+        indices_spec = indices_output_spec
+
+    lookup_dims = {v[0] for v in valid_indices_spec}
+
+    need_reshard_on_values = tuple(
+        (isinstance(vp, Shard) and (vp.dim in lookup_dims or isinstance(ip, Shard)))
+        for vp, ip in zip(values_spec.placements, indices_spec.placements)
+    )
+
+    if not need_reshard_on_indices and not any(need_reshard_on_values):
+        value_placements = values_spec.placements
+
+        all_dims_consecutive = all(
+            b[0] - a[0] == 1
+            for b, a in zip(valid_indices_spec[1:], valid_indices_spec[:-1])
+        )
+        if all_dims_consecutive:
+            # if all index vectors are consecutive, insert at the dimension of the first index
+            insert_dim: int = valid_indices_spec[0][0]
+        else:
+            # else, insert on the first dimension
+            insert_dim = 0
+
+        def place(vp: Placement, ip: Placement) -> Placement:
+            if isinstance(vp, Shard):
+                return Shard(
+                    vp.dim
+                    if vp.dim < insert_dim
+                    # accounts for the offset in output dimensions
+                    else vp.dim
+                    + indices_spec.ndim
+                    - sum(1 if vp.dim > v[0] else 0 for v in valid_indices_spec)
+                )
+            if isinstance(ip, Shard):
+                return Shard(ip.dim + insert_dim)
+            # _Partial or Replicated
+            return vp
+
+        value_placements = tuple(
+            place(vp, ip)
+            for vp, ip in zip(values_spec.placements, indices_spec.placements)
+        )
+        result = OutputSharding(
+            output_spec=DTensorSpec(
+                mesh=values_spec.mesh,
+                placements=value_placements,
+            )
+        )
+        return result
+    else:
+        result = OutputSharding(
+            output_spec=None,
+            schema_suggestions=[
+                OpSchema(
+                    op=op_schema.op,
+                    args_schema=(
+                        DTensorSpec(
+                            mesh=values_spec.mesh,
+                            placements=tuple(
+                                [
+                                    Replicate() if need_reshard_on_values[i] else v
+                                    for i, v in enumerate(values_spec.placements)
+                                ]
+                            ),
+                            tensor_meta=values_spec.tensor_meta,
+                        ),
+                        multi_indices_spec,
+                    ),
+                    kwargs_schema=op_schema.kwargs_schema,
+                )
+            ],
+        )
+        return result
+
+
+@register_prop_rule(
+    aten.cat.default, schema_info=RuntimeSchemaInfo(1, needs_pytree=True)
+)
+def cat_rule(op_schema: OpSchema) -> OutputSharding:
+    # torch.cat requires that all tensors either have the same shape (except
+    # in the concatenating dimension) or be "empty". "Empty" here strictly means
+    # tensor.shape is torch.Size([0]). When tensor.ndim > 1, it will be treated
+    # as a non-empty tensor and the shape must match on non-cat dimensions.
+    def is_empty(spec: DTensorSpec) -> bool:
+        return list(spec.shape) == [0]
+
+    # the first arg is a list of input tensor specs
+    tensor_list_specs = cast(List[DTensorSpec], op_schema.args_schema[0])
+    assert len(tensor_list_specs) > 0, "torch.cat expects a non-empty list of tensors"
+    non_empty_specs = [spec for spec in tensor_list_specs if not is_empty(spec)]
+
+    if len(non_empty_specs) == 0:
+        # all tensors are empty, we can return any output sharding
+        return OutputSharding(
+            output_spec=DTensorSpec(
+                mesh=tensor_list_specs[0].mesh,
+                placements=tensor_list_specs[0].placements,
+            )
+        )
+
+    assert all(
+        spec.ndim == non_empty_specs[0].ndim for spec in non_empty_specs
+    ), f"Expect all tensors to have same shape or empty, but got {tensor_list_specs}"
+    assert all(
+        spec.mesh == tensor_list_specs[0].mesh for spec in tensor_list_specs
+    ), f"Expect all tensors to have same mesh, but got {tensor_list_specs}"
+
+    # ndim will also be the result's ndim
+    ndim = 1
+    for spec in tensor_list_specs:
+        ndim = max(ndim, spec.ndim)
+
+    dim = 0  # default dim = 0
+    if len(op_schema.args_schema) > 1:
+        dim = cast(int, op_schema.args_schema[1])
+    dim = normalize_dim(dim, ndim)
+
+    # Make sure all tensors are replicated on cat dimension
+    need_reshard = False
+    tensor_list_specs_after: List[DTensorSpec] = []
+    for spec in tensor_list_specs:
+        if not is_empty(spec) and (
+            is_tensor_dim_sharded(spec, dim=dim) or is_tensor_partial(spec)
+        ):
+            need_reshard = True
+            tensor_list_specs_after.append(
+                DTensorSpec(
+                    mesh=spec.mesh,
+                    placements=replicate_tensor_dim(spec.placements, dim=dim),
+                    tensor_meta=spec.tensor_meta,
+                )
+            )
+        else:
+            tensor_list_specs_after.append(spec)
+
+    tensor_list_specs = tensor_list_specs_after
+
+    # align non-cat dimensions placements based on reshard cost
+    non_empty_specs = [spec for spec in tensor_list_specs if not is_empty(spec)]
+    mesh = non_empty_specs[0].mesh
+    ndim = non_empty_specs[0].ndim
+    new_placements: List[Placement] = []
+    for mesh_dim in range(mesh.ndim):
+        # compute the minimum cost of resharding on this mesh_dim
+        if any(
+            spec.placements[mesh_dim] != non_empty_specs[0].placements[mesh_dim]
+            for spec in non_empty_specs
+        ):
+            # only reshard if there is a mismatch
+            need_reshard = True
+            reshard_cost = []
+            for shard_dim in range(ndim):
+                # compute the cost of resharding on this shard_dim
+                cost: float = 0.0
+                for spec in non_empty_specs:
+                    global_shape = spec.shape
+                    if global_shape[shard_dim] < mesh.size(mesh_dim):
+                        # found one tensor whose size along shard_dim is smaller than
+                        # the mesh_dim size. In this case, we cannot shard on this
+                        # shard_dim, and hence set the cost to infinity.
+                        cost = +float("inf")
+                    elif (
+                        is_tensor_dim_sharded(spec, dim=shard_dim)
+                        or prod(global_shape) == 0
+                    ):
+                        continue
+                    else:
+                        local_shape = compute_local_shape(
+                            global_shape, spec.mesh, spec.placements
+                        )
+                        cost += prod(local_shape) * spec.mesh.size(mesh_dim)
+                reshard_cost.append(cost)
+            best_dim = reshard_cost.index(min(reshard_cost))
+            new_placements.append(Shard(best_dim))
+        else:
+            # no mismatch, keep the original placement
+            new_placements.append(non_empty_specs[0].placements[mesh_dim])
+
+    if need_reshard:
+        tensor_list_specs_after = []
+        for spec in tensor_list_specs:
+            if is_empty(spec):
+                tensor_list_specs_after.append(spec)
+            else:
+                tensor_list_specs_after.append(
+                    DTensorSpec(
+                        mesh=spec.mesh,
+                        placements=tuple(new_placements),
+                        tensor_meta=spec.tensor_meta,
+                    )
+                )
+
+        return OutputSharding(
+            output_spec=None,
+            schema_suggestions=[
+                OpSchema(
+                    op=op_schema.op,
+                    args_schema=(
+                        tuple(tensor_list_specs_after),
+                        *op_schema.args_schema[1:],
+                    ),
+                    kwargs_schema=op_schema.kwargs_schema,
+                ),
+            ],
+        )
+    else:
+        # at this point, the cat dim is not sharded and no reshard is needed
+        return OutputSharding(
+            output_spec=DTensorSpec(
+                mesh=non_empty_specs[0].mesh,
+                placements=non_empty_specs[0].placements,
+            ),
+        )
+
+
+@register_prop_rule(
+    [
+        aten.split.Tensor,
+        aten.split_with_sizes.default,
+        aten.split_with_sizes_copy.default,
+    ],
+    schema_info=RuntimeSchemaInfo(1),
+)
+def split_rule(op_schema: OpSchema) -> OutputSharding:
+    output_spec_list: List[DTensorSpec] = []
+    input_spec = cast(DTensorSpec, op_schema.args_schema[0])
+    ndim = input_spec.ndim
+    split_size_or_sections = op_schema.args_schema[1]
+    dim = cast(int, op_schema.args_schema[2]) if len(op_schema.args_schema) > 2 else 0
+    dim = normalize_dim(dim, ndim)
+
+    # TODO: the tensor to split cannot have _Partial
+    # in its placements for now; this will need to be
+    # supported in the future.
+    if input_spec.sums:
+        raise NotImplementedError(
+            f"splitting distributed tensor with "
+            f"_Partial placement is not implemented!\n"
+            f"DTensorSpec={input_spec}"
+        )
+
+    # TODO: just like slice op, split replicates before
+    # splitting on a sharded dimension
+    need_reshard = False
+    if is_tensor_dim_sharded(input_spec, dim=dim):
+        need_reshard = True
+        input_spec = DTensorSpec(
+            mesh=input_spec.mesh,
+            placements=unshard_tensor_dim(input_spec.placements, dim=dim),
+            tensor_meta=input_spec.tensor_meta,
+        )
+
+    if need_reshard:
+        return OutputSharding(
+            None,
+            schema_suggestions=[
+                OpSchema(
+                    op=op_schema.op,
+                    args_schema=(input_spec,) + op_schema.args_schema[1:],
+                    kwargs_schema=op_schema.kwargs_schema,
+                ),
+            ],
+        )
+
+    def size_split(N, i):
+        # Last chunk will be smaller if the tensor size N
+        # along the given dimension dim is not divisible by i.
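+        # e.g. size_split(5, 2) == [2, 2, 1]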
+        assert i > 0
+        return [i] * (N // i) + ([N % i] if N % i != 0 else [])
+
+    output_size_list = (
+        size_split(input_spec.shape[dim], split_size_or_sections)
+        if isinstance(split_size_or_sections, int)
+        else split_size_or_sections
+    )
+    output_spec_list = [
+        DTensorSpec(
+            mesh=input_spec.mesh,
+            placements=input_spec.placements,
+        )
+        for _ in range(len(output_size_list))
+    ]
+    return OutputSharding(output_spec_list)
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/utils.py b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..559ba483efdfa278d116fac89a02f44d0ceef01a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/utils.py
@@ -0,0 +1,226 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+import functools
+import operator
+from typing import cast, Iterable, List, Sequence, Tuple, Union
+
+import torch
+from torch.distributed._tensor._collective_utils import redistribute_cost
+from torch.distributed._tensor.api import DTensor
+from torch.distributed._tensor.op_schema import OpStrategy, RuntimeSchemaInfo
+from torch.distributed._tensor.placement_types import (
+    _Partial,
+    DTensorSpec,
+    Placement,
+    Replicate,
+    Shard,
+)
+
+
+# convenient wrapper to register sharding propagation rules
+# pyre-fixme[3]: Return type must be annotated.
+# pyre-fixme[2]: Parameter must be annotated.
+def register_prop_rule(op, schema_info=None):
+    # pyre-fixme[53]: Captured variable `func` is not annotated.
+    # pyre-fixme[3]: Return type must be annotated.
+    # pyre-fixme[2]: Parameter must be annotated.
+    def wrapper(impl):
+        overloads = op if isinstance(op, list) else [op]
+        for overload in overloads:
+            DTensor._op_dispatcher.sharding_propagator.register_sharding_prop_rule(
+                overload, impl, schema_info
+            )
+        return impl
+
+    return wrapper
+
+
+def register_op_strategy(op, schema_info=None):
+    # pyre-fixme[53]: Captured variable `func` is not annotated.
+    # pyre-fixme[3]: Return type must be annotated.
+    # pyre-fixme[2]: Parameter must be annotated.
+
+    # For every ATen op that accepts any args in this list,
+    # the arg itself can impact the strides (and potentially the sharding strategy)
+    # of the output tensor.
+    # thus, we will detect ATen schemas with any of these args and ensure
+    # that they get specialized here.
+    arg_names_that_require_specializing_cache_strategy = [
+        "memory_format",
+    ]
+
+    def wrapper(impl):
+        if isinstance(op, list):
+            overloads = op
+        else:
+            overloads = [op]
+
+        for overload in overloads:
+            curr_schema_info = None
+            if schema_info is None:
+                specialized_args = [
+                    a.name
+                    for a in overload._schema.arguments
+                    if a.name in arg_names_that_require_specializing_cache_strategy
+                ]
+                if any(specialized_args):
+                    curr_schema_info = RuntimeSchemaInfo(
+                        static_kwargkey=specialized_args
+                    )
+            else:
+                curr_schema_info = schema_info
+            DTensor._op_dispatcher.sharding_propagator.register_op_strategy(
+                overload, impl, curr_schema_info
+            )
+        return impl
+
+    return wrapper
+
+
+def as_list(
+    x: Union[List[object], object]
+    # pyre-fixme[11]: Annotation `immutable_list` is not defined as a type.
+) -> Union[List[object], torch.fx.immutable_collections.immutable_list]:  # type: ignore[valid-type]
+    # During tracing, `aten.sum.dim_IntList` uses `immutable_list` for its args,
+    # which is an object but treated as a list by the tracer. Therefore, keep
+    # `immutable_list` intact here as well.
+    if type(x) is list or isinstance(x, torch.fx.immutable_collections.immutable_list):
+        return x
+    else:
+        return [x]
+
+
+def normalize_dim(dim: int, ndim: int) -> int:
+    return dim if dim >= 0 else dim + ndim
+
+
+def normalize_dims(dims: Union[int, Sequence[int]], ndim: int) -> Sequence[int]:
+    """Normalize a dim or a sequence of dims, so that they are all positive."""
+    if isinstance(dims, int):
+        dims = (normalize_dim(dims, ndim),)
+    elif isinstance(dims, list):
+        dims = [normalize_dim(dim, ndim) for dim in dims]
+    elif isinstance(dims, tuple):
+        dims = tuple([normalize_dim(dim, ndim) for dim in dims])
+    return dims
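+
+# For illustration (assumed values), the dim-normalization helpers above map
+# negative dims to their positive counterparts:
+#   normalize_dim(-1, ndim=3)       -> 2
+#   normalize_dims((0, -1), ndim=3) -> (0, 2)
+#   normalize_dims(-2, ndim=4)      -> (2,)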
+
+
+def normalize_to_torch_size(size) -> torch.Size:
+    """
+    Unify variable types of size argument to torch.Size
+    Acceptable types include:
+        int, Sequence[int], Tuple[int], Tuple[Sequence[int]],
+        or torch.Size
+    """
+    if isinstance(size, torch.Size):
+        return size
+
+    if isinstance(size, int):
+        torch_size = [size]
+    elif len(size) == 1 and isinstance(size[0], Sequence):
+        torch_size = list(size[0])
+    else:
+        torch_size = list(size)
+    return torch.Size(torch_size)
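+
+# For illustration (assumed inputs), the size argument variants are unified as:
+#   normalize_to_torch_size(4)         -> torch.Size([4])
+#   normalize_to_torch_size((2, 3))    -> torch.Size([2, 3])
+#   normalize_to_torch_size(((2, 3),)) -> torch.Size([2, 3])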
+
+
+def prod(xs: Iterable[int]) -> int:
+    return functools.reduce(operator.mul, xs, 1)
+
+
+def is_tensor_shardable(shape: Sequence[int], spec: DTensorSpec) -> bool:
+    """Check if the shape is shardable according to the spec."""
+    # number of shards in each tensor dimension
+    shards_map = [1] * len(shape)
+    for i, placement in enumerate(spec.placements):
+        if placement.is_shard():
+            shard_dim = cast(Shard, placement).dim
+            shards_map[shard_dim] *= spec.mesh.size(i)
+
+    for i, dim_size in enumerate(shape):
+        # TODO: maybe we should determine is_shardable based on
+        #       whether it's evenly sharded or not
+        if shards_map[i] > 1 and dim_size < shards_map[i]:
+            return False
+
+    return True
+
+
+def is_tensor_evenly_shardable(shape: Sequence[int], spec: DTensorSpec) -> bool:
+    """Check if the shape is evenly shardable according to the spec."""
+    # number of shards in each tensor dimension
+    shards_map = [1] * len(shape)
+    for i, placement in enumerate(spec.placements):
+        if placement.is_shard():
+            shard_dim = cast(Shard, placement).dim
+            shards_map[shard_dim] *= spec.mesh.size(i)
+
+    for i, dim_size in enumerate(shape):
+        if shards_map[i] > 1 and (dim_size % shards_map[i] != 0):
+            return False
+
+    return True
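+
+# For illustration (hypothetical spec): a tensor of shape (4, 6) with a single
+# Shard(0) placement on a 1-D mesh of size 3 is shardable (4 >= 3) but not
+# evenly shardable (4 % 3 != 0), so is_tensor_shardable returns True while
+# is_tensor_evenly_shardable returns False.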
+
+
+def is_tensor_dim_sharded(spec: DTensorSpec, dim: int) -> bool:
+    """Return True if tensor dim is sharded."""
+    return any(p.is_shard(dim) for p in spec.placements)
+
+
+def is_tensor_partial(spec: DTensorSpec) -> bool:
+    """Return True if tensor is partial on the mesh."""
+    return any(p.is_partial() for p in spec.placements)
+
+
+def infer_broadcast_dims_map(
+    common_shape: torch.Size, input_shape: torch.Size
+) -> List[int]:
+    # infer the broadcast dims map, where it maps from the common shape dim to the input shape dim
+    # this is aligned with the broadcast semantics
+    common_ndim = len(common_shape)
+    input_ndim = len(input_shape)
+    broadcast_dims_map = [-1] * common_ndim
+    for idx in range(-1, -1 - input_ndim, -1):
+        if input_shape[idx] == common_shape[idx]:
+            broadcast_dims_map[common_ndim + idx] = input_ndim + idx
+    return broadcast_dims_map
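+
+# For example (assumed shapes), broadcasting an input of shape (3, 4) to a
+# common shape of (2, 3, 4) yields a dims map of [-1, 0, 1]: common dim 0 has
+# no corresponding input dim, while common dims 1 and 2 map to input dims 0
+# and 1 respectively.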
+
+
+def map_placements_after_broadcast(
+    placements: Tuple[Placement, ...],
+    shape: torch.Size,
+    broadcast_dims_map: List[int],
+) -> Tuple[Placement, ...]:
+    """Map each placement based on the output shape after broadcast."""
+    new_placements: List[Placement] = []
+    for placement in placements:
+        if isinstance(placement, (Replicate, _Partial)):
+            new_placements.append(placement)
+        else:
+            assert isinstance(placement, Shard)
+            shard_dim = normalize_dim(placement.dim, len(shape))
+            new_shard_dim = broadcast_dims_map[shard_dim]
+            if new_shard_dim != -1:
+                # there's a map from the common shape shard dim to
+                # the input shape shard dim before broadcasting,
+                # use that instead
+                new_placements.append(Shard(new_shard_dim))
+            else:
+                # there's no map between common shape shard dim and
+                # the input shape shard dim before broadcasting,
+                # in this case it means implicit broadcasting happen
+                # in this dim, so we can just mark it as replicate
+                # and implict broadcast will broadcast automatically
+                # to the sharded shape
+                new_placements.append(Replicate())
+
+    return tuple(new_placements)
+
+
+def generate_redistribute_costs(
+    src_strategy: OpStrategy, dst_spec: DTensorSpec
+) -> List[float]:
+    redistribute_costs: List[float] = []
+    for strat in src_strategy.strategies:
+        redistribute_costs.append(redistribute_cost(strat.output_spec, dst_spec))
+
+    return redistribute_costs
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/view_ops.py b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/view_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbb567f220a6c337c8e6804837d391c8c4b028a1
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tensor/ops/view_ops.py
@@ -0,0 +1,717 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+from dataclasses import dataclass
+from typing import Callable, cast, Dict, Iterable, Optional, Sequence, Set, Tuple, Union
+
+import torch
+
+from torch import Tensor
+from torch._subclasses.fake_tensor import unset_fake_temporarily
+from torch.distributed._tensor._utils import compute_local_shape
+from torch.distributed._tensor.api import Shard
+from torch.distributed._tensor.op_schema import (
+    OpSchema,
+    OutputSharding,
+    RuntimeSchemaInfo,
+)
+from torch.distributed._tensor.ops.utils import (
+    normalize_dim,
+    normalize_dims,
+    prod,
+    register_prop_rule,
+)
+
+from torch.distributed._tensor.placement_types import DTensorSpec, Placement, Replicate
+from torch.fx.experimental.proxy_tensor import disable_proxy_modes_tracing
+
+aten = torch.ops.aten
+
+Shape = Tuple[int, ...]
+
+
+@dataclass
+class DimSpec:
+    """Specifies how an output dimension maps to an input dimension."""
+
+    def inputs(self) -> Iterable["DimSpec"]:
+        return ()
+
+
+# Rules that map each dimension of the output to dimensions of the input tensor
+DimMap = Tuple[DimSpec, ...]
+
+
+@dataclass
+class Singleton(DimSpec):
+    """Output dimension is a singleton."""
+
+    pass
+
+
+@dataclass
+class InputDim(DimSpec):
+    """Output dimension maps directly to an input dimension."""
+
+    input_dim: int
+
+
+@dataclass
+class Broadcast(DimSpec):
+    """Output is the broadcast of a singleton input dimension."""
+
+    dim: DimSpec
+    dim_size: int
+
+    @classmethod
+    def new(cls, dim: DimSpec, dim_size: int) -> DimSpec:
+        return Broadcast(dim, dim_size)
+
+    def inputs(self) -> Iterable[DimSpec]:
+        return (self.dim,)
+
+
+@dataclass
+class NewDim(DimSpec):
+    """This is a new dimension created by the op."""
+
+    size: int
+
+    @classmethod
+    def new(cls, size: int) -> DimSpec:
+        return Singleton() if size == 1 else NewDim(size)
+
+
+@dataclass
+class Repeat(DimSpec):
+    """Output dimension is the input dimension repeated n-times."""
+
+    input_dim: DimSpec
+    times: int
+
+    @classmethod
+    def new(cls, dim: DimSpec, times: int) -> DimSpec:
+        if times == 1:
+            return dim
+        elif isinstance(dim, Singleton):
+            # repeating a singleton is the same as broadcasting it
+            return Broadcast(dim, times)
+        else:
+            return Repeat(dim, times)
+
+    def inputs(self) -> Iterable[DimSpec]:
+        return (self.input_dim,)
+
+
+@dataclass
+class Flatten(DimSpec):
+    """Flatten a set of input dimensions, ensuring right-most adjacent elements remain adjacent in the output."""
+
+    input_dims: Sequence[DimSpec]
+
+    @classmethod
+    def new(cls, dims: Sequence[DimSpec]) -> DimSpec:
+        if len(dims) == 0:
+            # flattening a scalar leads to a singleton
+            return Singleton()
+        elif len(dims) == 1:
+            # flattening a single dimension is no-op
+            return dims[0]
+        else:
+            return Flatten(dims)
+
+    def inputs(self) -> Iterable[DimSpec]:
+        return self.input_dims
+
+
+@dataclass
+class Split(DimSpec):
+    """
+    This dimension is a member of a decomposition of the input dim.
+
+    Note that input_dim itself could be a Flattened set of input dims.
+    """
+
+    input_dim: DimSpec
+    group_shape: Shape
+    split_id: int
+
+    @classmethod
+    def new(cls, dim: DimSpec, group_shape: Tuple[int, ...], idx: int) -> DimSpec:
+        assert len(group_shape) > 0
+        if len(group_shape) == 1:
+            # not really a group, just return the input dim back
+            assert idx == 0
+            return dim
+        elif group_shape[idx] == 1:
+            return Singleton()
+        else:
+            # remove singletons from group
+            # group_mapping = [(new_index, (shape, old_index)) ...]
+            group_mapping = list(
+                enumerate((s, i) for i, s in enumerate(group_shape) if s != 1)
+            )
+            new_group_shape = tuple(m[1][0] for m in group_mapping)
+            new_idx = next(filter(lambda x: x[1][1] == idx, group_mapping))[0]
+            return Split(dim, new_group_shape, new_idx)
+
+    def inputs(self) -> Iterable[DimSpec]:
+        return (self.input_dim,)
+
+
+def dim_pad_left(ndim: int, min_dims: int) -> DimMap:
+    return (Singleton(),) * max(0, min_dims - ndim) + tuple(
+        InputDim(i) for i in range(ndim)
+    )
+
+
+def dim_atleast_3d(ndim: int) -> DimMap:
+    if ndim == 0:
+        return (Singleton(), Singleton(), Singleton())
+    elif ndim == 1:
+        return (Singleton(), InputDim(0), Singleton())
+    elif ndim == 2:
+        return (InputDim(0), InputDim(1), Singleton())
+    else:
+        return tuple(InputDim(i) for i in range(ndim))
+
+
+def expand(input_shape: Shape, shape: Shape) -> DimMap:
+    """Implement broadcast on multiple dimensions."""
+    assert len(shape) >= len(input_shape)
+
+    # 1. create padded input dimensions
+    padded_input = dim_pad_left(len(input_shape), len(shape))
+    # 2. check that input shapes are compatible
+    mapping = []
+    for p, desired_s in zip(padded_input, shape):
+        if isinstance(p, Singleton):
+            actual_s = 1
+            assert desired_s >= 0
+        else:
+            assert isinstance(p, InputDim), f"DimSpec not supported in expand: {p}"
+            actual_s = input_shape[p.input_dim]
+            assert actual_s == 1 or desired_s == -1 or desired_s == actual_s
+        mapping.append(
+            p
+            if desired_s in (1, -1) or desired_s == actual_s
+            else Broadcast.new(p, desired_s)
+        )
+    return tuple(mapping)
+
+
+def normalize_sizes(sizes: Union[Shape, Tuple[Shape]]) -> Shape:
+    if isinstance(sizes[0], int):
+        return cast(Shape, sizes)
+    elif len(sizes) == 1:
+        return cast(Shape, sizes[0])  # type: ignore[redundant-cast]
+    else:
+        raise RuntimeError("Size must be int... or tuple")
+
+
+def dim_flatten(ndim: int) -> DimMap:
+    if ndim == 0:
+        return (Singleton(),)
+    elif ndim == 1:
+        return (InputDim(0),)
+    else:
+        return (Flatten.new(tuple(InputDim(i) for i in range(ndim))),)
+
+
+def dim_movedim(
+    ndim: int,
+    input: Union[int, Sequence[int]],
+    destination: Union[int, Sequence[int]],
+) -> DimMap:
+    input = normalize_dims(input, ndim)
+    destination = normalize_dims(destination, ndim)
+
+    assert len(input) == len(destination)
+    input_set = set(input)
+    assert len(input_set) == len(input), "Found repeated input dims"
+    assert len(set(destination)) == len(destination), "Found repeated output dims"
+    assert max(input) < ndim
+    assert max(destination) < ndim
+
+    dest = [-1] * ndim
+    for i, d in zip(input, destination):
+        dest[d] = i
+
+    unused_inputs_iter = iter(i for i in range(ndim) if i not in input_set)
+    for i in range(ndim):
+        if dest[i] == -1:
+            dest[i] = next(unused_inputs_iter)
+
+    return tuple(InputDim(i) for i in dest)
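+
+# For illustration (assumed arguments): dim_movedim(3, 0, 2) moves input dim 0
+# to output position 2, producing (InputDim(1), InputDim(2), InputDim(0)).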
+
+
+def dim_repeat(ndim: int, sizes: Shape) -> DimMap:
+    sizes = normalize_sizes(sizes)
+    assert (
+        len(sizes) >= ndim
+    ), f"Number of dimensions of repeat dims {sizes} can not be smaller than number of dimensions of tensor {ndim}."
+    pad = len(sizes) - ndim
+    return tuple(Repeat.new(Singleton(), s) for s in sizes[:pad]) + tuple(
+        Repeat.new(InputDim(i), s) for i, s in enumerate(sizes[pad:])
+    )
+
+
+def infer_size(total_size: int, sizes: Shape) -> Shape:
+    """
+    One dimension input to view may be "-1".
+
+    Infer the size of this dimension given the total_size.
+    """
+    infers = [i for i, s in enumerate(sizes) if s == -1]
+    size = prod(sizes)
+    assert len(infers) <= 1, "can only infer one size"
+    if infers:
+        size = -size
+        missing_size = total_size // size
+        assert (
+            total_size % size == 0
+        ), f"size inferred for -1 is not integral {sizes} should have {total_size} elements."
+        return tuple(s if s != -1 else missing_size for s in sizes)
+    assert size == total_size, f"sizes do not match {total_size} vs {size}"
+    return sizes
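+
+# For illustration (assumed arguments):
+#   infer_size(12, (-1, 4)) -> (3, 4)   # the -1 entry is inferred
+#   infer_size(12, (3, 4))  -> (3, 4)   # sizes already match total_size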
+
+
+def view_groups(from_size: Shape, to_size: Shape) -> DimMap:
+    """
+    Decompose a reshape operation into forwarding, flattening, or splitting dimensions for each output dimension.
+
+    A view or reshape operation can be decomposed into a set of 3 types of smaller operations:
+    1) Forward a dimension from input to output
+    2) Flatten a set of dimensions into a single dimension
+    3) Split one dimension into multiple dimensions
+
+    view_groups identifies these operations and returns, for each output dimension, what
+    operation was performed on the input dimensions. For example:
+
+        view_groups([2, 3, 4], [2, 12]) -> (
+            InputDim(0),
+            Flatten((InputDim(1), InputDim(2)))
+        )
+
+    - output dimension 0 maps to input dimension 0
+    - output dimension 1 maps to the flattened input dimensions 1 and 2
+
+
+        view_groups([2, 3], [3, 2]) -> (
+            Split(Flatten((InputDim(0), InputDim(1))), (3, 2), 0),
+            Split(Flatten((InputDim(0), InputDim(1))), (3, 2), 1),
+        )
+
+    - in the above, input is flattened into a single dimension and then split
+      into two separate dimensions with different sizes from the input.
+    """
+    from_nelem = prod(from_size)
+    to_size = infer_size(from_nelem, normalize_sizes(to_size))
+
+    assert from_nelem == prod(to_size), "Total view shape does not add up"
+
+    from_idx = 0
+    to_idx = 0
+    from_len = len(from_size)
+    to_len = len(to_size)
+
+    result_pp = []
+
+    while from_idx < from_len or to_idx < to_len:
+        from_group_dim, to_group_shape = [], []
+
+        if from_idx >= from_len:
+            f = 1
+        else:
+            f = from_size[from_idx]
+            from_group_dim.append(from_idx)
+            from_idx += 1
+
+        if to_idx >= to_len:
+            t = 1
+        else:
+            t = to_size[to_idx]
+            to_group_shape.append(t)
+            to_idx += 1
+
+        # if any of the groups is singleton, great, we need to backtrack though
+        if f == 1 and t != 1:
+            # produces ([1], [])
+            to_idx -= 1
+            to_group_shape = []
+        elif f != 1 and t == 1:
+            # produces ([], [1])
+            from_idx -= 1
+            from_group_dim = []
+        else:
+            # produces ([1], [1]),  ([2], [2]), ([2,3], [6])
+            while f != t:
+                if f < t:
+                    nf = from_size[from_idx]
+                    from_group_dim.append(from_idx)
+                    from_idx += 1
+                    f *= nf
+                else:
+                    nt = to_size[to_idx]
+                    to_group_shape.append(nt)
+                    to_idx += 1
+                    t *= nt
+
+        if len(to_group_shape) > 0:
+            flattened = Flatten.new(
+                tuple(InputDim(fi) for fi in from_group_dim if from_size[fi] > 1)
+            )
+            result_pp += [
+                Split.new(flattened, tuple(to_group_shape), i)
+                for i in range(len(to_group_shape))
+            ]
+
+    return tuple(result_pp)
+
+
+def dim_tile(ndim: int, dims: Tuple[int, ...]) -> DimMap:
+    if len(dims) < ndim:
+        dims = (1,) * (ndim - len(dims)) + dims
+    return dim_repeat(ndim, dims)
+
+
+def dim_transpose(ndim: int, dim1: int, dim2: int) -> DimMap:
+    dim1 = normalize_dim(dim1, ndim)
+    dim2 = normalize_dim(dim2, ndim)
+    assert dim1 < ndim
+    assert dim2 < ndim
+    dimmap = [InputDim(i) for i in range(ndim)]
+    swapdim = dimmap[dim1]
+    dimmap[dim1] = dimmap[dim2]
+    dimmap[dim2] = swapdim
+    return tuple(dimmap)
+
+
+def dim_squeeze(shape: Shape, dim: Optional[int] = None) -> DimMap:
+    # FIXME: this is wrong when dim=None and one of the dimensions
+    # equals size of the mesh. For example squeeze(DTensor(tensor(4), Shard[0])) could
+    # end up as squeeze(tensor(1)) if we have 4 devices; this would lead to
+    # removal of a dimension that is not actually a singleton.
+    return tuple(
+        InputDim(i)
+        for i, s in enumerate(shape)
+        if s > 1 or (dim is not None and i != normalize_dim(dim, len(shape)))
+    )
+
+
+def dim_unsqueeze(ndim: int, dim: int) -> DimMap:
+    dims = tuple(InputDim(i) for i in range(ndim))
+    if dim < 0:
+        dim += ndim + 1
+    return dims[:dim] + (Singleton(),) + dims[dim:]
+
+
+def dim_reduction(
+    ndim: int, dim_or_dims: Optional[Union[int, Sequence[int]]], keepdim: bool
+) -> DimMap:
+    """
+    General fallback for reduction ops where _Partial() does not apply.
+
+    This will cause incoming tensor to be replicated on the reducing dimensions.
+    """
+    if dim_or_dims is None:
+        dim_or_dims = tuple(range(ndim))
+    if isinstance(dim_or_dims, int):
+        dim_or_dims = (dim_or_dims,)
+    dim_or_dims = tuple(d if d >= 0 else d + ndim for d in dim_or_dims)
+    return tuple(
+        InputDim(i) if i not in dim_or_dims else Singleton()
+        for i in range(ndim)
+        if i not in dim_or_dims or keepdim
+    )
+
+
+@dataclass
+class Op:
+    dim_map: Callable[..., DimMap]
+    shape_argnum: Optional[int] = None
+
+
+ops: Dict[Callable[..., torch.Tensor], Op] = {
+    torch.atleast_1d: Op(dim_map=lambda x: dim_pad_left(x.ndim, 1)),
+    torch.atleast_2d: Op(dim_map=lambda x: dim_pad_left(x.ndim, 2)),
+    torch.atleast_3d: Op(dim_map=lambda x: dim_atleast_3d(x.ndim)),
+    torch.broadcast_to: Op(
+        dim_map=lambda input, shape: expand(input.shape, shape), shape_argnum=1
+    ),
+    Tensor.expand: Op(
+        dim_map=lambda self, *sizes: expand(self.shape, normalize_sizes(sizes)),
+        shape_argnum=1,
+    ),
+    torch.flatten: Op(dim_map=lambda tensor: dim_flatten(tensor.ndim)),
+    torch.movedim: Op(
+        dim_map=lambda input, source, destination: dim_movedim(
+            input.ndim, source, destination
+        )
+    ),
+    torch.permute: Op(
+        dim_map=lambda input, dims: tuple(
+            InputDim(i) for i in normalize_dims(dims, input.ndim)
+        )
+    ),
+    torch.ravel: Op(dim_map=lambda tensor: dim_flatten(tensor.ndim)),
+    Tensor.repeat: Op(dim_map=lambda self, *sizes: dim_repeat(self.ndim, sizes)),
+    torch.reshape: Op(
+        dim_map=lambda input, shape: view_groups(input.shape, shape),
+        shape_argnum=1,
+    ),
+    torch.squeeze: Op(dim_map=lambda input, dim=None: dim_squeeze(input.shape, dim)),
+    torch.tile: Op(dim_map=lambda input, dims: dim_tile(input.ndim, dims)),
+    torch.transpose: Op(
+        dim_map=lambda input, dim0, dim1: dim_transpose(input.ndim, dim0, dim1)
+    ),
+    torch.unsqueeze: Op(dim_map=lambda input, dim: dim_unsqueeze(input.ndim, dim)),
+    Tensor.view: Op(
+        dim_map=lambda input, *shape: view_groups(input.shape, shape),
+        shape_argnum=1,
+    ),
+}
+
+
+def propagate_shape_and_sharding(
+    in_shard: Sequence[Placement],
+    local_in_shape: Shape,
+    rule: DimMap,
+    mesh_sizes: Shape,
+) -> Tuple[Shape, Optional[Sequence[Placement]], torch.Tensor]:
+    """
+    Determine output sharding and tensor shape based on given global tensor shape and input sharding.
+
+    Takes as input the global shape of the tensor, and the input sharding,
+    and produce corresponding output sharding and shape of the output tensor.
+
+    Sharding propagation follows mapped dimensions:
+    - An output dimension that maps directly to an input dimension is sharded equally
+    - An output dimension that is a flattened set of input dimensions can only be
+      sharded if only the leftmost flattened dimension is sharded.
+    - An output dimension that is a split of the input dimension can only be sharded
+      if the leftmost split size is divisible by the mesh dimension
+    """
+    assert len(in_shard) == len(mesh_sizes)
+    sharded_in_dims: Set[int] = {s.dim for s in in_shard if isinstance(s, Shard)}
+    # for each input dim, for each mesh dim, provides a list of possible shardable dimensions
+    shardable_dims: torch.Tensor = torch.ones(
+        (len(local_in_shape), len(mesh_sizes)), dtype=torch.bool
+    )
+
+    # in case an input dimension disappears (e.g. collapsing, reduction)
+    # we cannot shard in that dimension (we need a replication fall-back rule)
+
+    seen_input_dims: Set[int] = set()
+
+    def collect_used_inputs(cmd: DimSpec) -> None:
+        if isinstance(cmd, InputDim):
+            seen_input_dims.add(cmd.input_dim)
+        for inp in cmd.inputs():
+            collect_used_inputs(inp)
+
+    for cmd in rule:
+        collect_used_inputs(cmd)
+    for dim in range(len(local_in_shape)):
+        shardable_dims[dim, :] = dim in seen_input_dims
+
+    def get_dim_size(cmd: DimSpec) -> Tuple[int, Optional[InputDim]]:
+        if isinstance(cmd, InputDim):
+            seen_input_dims.add(cmd.input_dim)
+            return (
+                local_in_shape[cmd.input_dim],
+                cmd if cmd.input_dim in sharded_in_dims else None,
+            )
+        elif isinstance(cmd, Flatten):
+            for dim in cmd.input_dims[1:]:
+                if isinstance(dim, InputDim):
+                    shardable_dims[dim.input_dim, :] = False
+            dim0 = cmd.input_dims[0]
+            return (
+                prod(get_dim_size(a)[0] for a in cmd.input_dims),
+                dim0
+                if isinstance(dim0, InputDim) and dim0.input_dim in sharded_in_dims
+                else None,
+            )
+        elif isinstance(cmd, Split):
+            _, in_dim = get_dim_size(cmd.input_dim)
+            out_size = cmd.group_shape[cmd.split_id]
+            if cmd.split_id == 0 and in_dim is not None:
+                # we need to check that the input dimension is divisible
+                # by the size of the submesh we're sharding it on
+                # NOTE: it would be possible to shard the same input dimension
+                # on more than one mesh dimension. In that case, the dimension
+                # needs to be divisible by the product of mesh sizes.
+                # In order to keep the problem more tractable, we will not consider
+                # double resharding as a suggestion (e.g. [Shard(0), Shard(0) ])
+                # but we will allow it if that's the input and it's compatible
+
+                # 1. is this dimension shardable on each individual mesh dim?
+                for mesh_dim, mesh_dim_size in enumerate(mesh_sizes):
+                    shardable_dims[in_dim.input_dim, mesh_dim] = (
+                        out_size % mesh_dim_size == 0
+                    )
+
+                # 2. here we special case things like [Shard(0), Shard(0)]
+                submesh_size = 1
+                for size, shard in zip(mesh_sizes, in_shard):
+                    if isinstance(shard, Shard) and shard.dim == in_dim:
+                        submesh_size *= size
+                assert (
+                    out_size % submesh_size == 0
+                ), f"Resulting dimension size {out_size} is not divisible by its mesh dimension {submesh_size}."
+
+            # we will only shard our first component of the split
+            return out_size, in_dim if cmd.split_id == 0 else None
+        elif isinstance(cmd, Singleton):
+            return 1, None
+        elif isinstance(cmd, Broadcast):
+            return cmd.dim_size, None
+        elif isinstance(cmd, NewDim):
+            return cmd.size, None
+        elif isinstance(cmd, Repeat):
+            size, in_dim = get_dim_size(cmd.input_dim)
+            if in_dim is not None:
+                shardable_dims[in_dim.input_dim, :] = False
+            return size * cmd.times, None
+        else:
+            raise RuntimeError(f"cmd not found: {cmd}, in rule: {rule}")
+
+    dim_map = {}
+    out_shape = []
+    for dim, cmd in enumerate(rule):
+        out_size, in_dim = get_dim_size(cmd)
+        out_shape.append(out_size)
+        if in_dim is not None:
+            dim_map[in_dim.input_dim] = dim
+
+    needs_reshard = any(
+        isinstance(placement, Shard) and not shardable_dims[placement.dim][mesh_dim]
+        for mesh_dim, placement in enumerate(in_shard)
+    )
+
+    output_placements = (
+        None
+        if needs_reshard
+        else [Shard(dim_map[s.dim]) if isinstance(s, Shard) else s for s in in_shard]
+    )
+
+    return (tuple(out_shape), output_placements, shardable_dims)
+
+
+def register_prop_rule_map(
+    aten_op_overload: torch._ops.OpOverload,
+    local_op_name: Callable[..., torch.Tensor],
+    schema_info: Optional[RuntimeSchemaInfo] = None,
+) -> None:
+    spec: Op = ops[local_op_name]
+
+    @register_prop_rule(aten_op_overload, schema_info=schema_info)
+    def reshape_prop(op_schema: OpSchema) -> OutputSharding:
+        rules = spec.dim_map(*op_schema.args_schema, **op_schema.kwargs_schema)
+        input_dtensor_spec = cast(DTensorSpec, op_schema.args_schema[0])
+        mesh = input_dtensor_spec.mesh
+
+        assert isinstance(
+            input_dtensor_spec, DTensorSpec
+        ), "Expected first input to be a DTensorSpec"
+        global_in_shape = input_dtensor_spec.shape
+        assert global_in_shape is not None, "Shape required."
+
+        with disable_proxy_modes_tracing(), unset_fake_temporarily():
+            (
+                global_out_shape,
+                shard_out,
+                shardable_dims,
+            ) = propagate_shape_and_sharding(
+                input_dtensor_spec.placements,
+                tuple(global_in_shape),
+                rules,
+                mesh.shape,
+            )
+
+        if shard_out is not None:
+            # no reshard needed
+            output_dtensor_spec = DTensorSpec(mesh=mesh, placements=tuple(shard_out))
+
+            # We only need the local shape to lower the call into the local op
+            args = op_schema.args_schema
+            shape_argnum = spec.shape_argnum
+            if shape_argnum is not None:
+                # compute the local shape from the global shape, then return
+                # a resharding even if we don't really reshard, the only reason
+                # for this type of resharding is to lower the global shape to
+                # local shape
+                local_out_shape = compute_local_shape(
+                    list(global_out_shape), mesh, shard_out
+                )
+
+                suggested_schema = OpSchema(
+                    op=op_schema.op,
+                    args_schema=args[:shape_argnum]
+                    + (tuple(local_out_shape),)
+                    + args[shape_argnum + 1 :],
+                    kwargs_schema=op_schema.kwargs_schema,
+                )
+                return OutputSharding(
+                    output_spec=output_dtensor_spec,
+                    schema_suggestions=[suggested_schema],
+                    needs_redistribute=True,
+                )
+
+            return OutputSharding(output_spec=output_dtensor_spec)
+
+        else:
+            # TODO: optimize this. we shouldn't simply blindly replicate
+            #       unshardable dims ...
+            # FIXME: this can be wrong for situations where we have
+            #        [Shard(0), Shard(0)]
+            suggested_placements = [
+                p
+                if not isinstance(p, Shard) or shardable_dims[p.dim][mesh_dim]
+                else Replicate()
+                for mesh_dim, p in enumerate(input_dtensor_spec.placements)
+            ]
+            return OutputSharding(
+                output_spec=None,
+                schema_suggestions=[
+                    OpSchema(
+                        op=op_schema.op,
+                        args_schema=(
+                            DTensorSpec(
+                                placements=tuple(suggested_placements),
+                                mesh=input_dtensor_spec.mesh,
+                                tensor_meta=input_dtensor_spec.tensor_meta,
+                            ),
+                        )
+                        + op_schema.args_schema[1:],
+                        kwargs_schema=op_schema.kwargs_schema,
+                    )
+                ],
+            )
+
+
+register_prop_rule_map(aten.squeeze.default, torch.squeeze)
+register_prop_rule_map(
+    aten.squeeze.dim, torch.squeeze, schema_info=RuntimeSchemaInfo(1)
+)
+register_prop_rule_map(aten.view.default, Tensor.view, schema_info=RuntimeSchemaInfo(1))
+register_prop_rule_map(
+    aten.reshape.default, torch.reshape, schema_info=RuntimeSchemaInfo(1)
+)
+register_prop_rule_map(
+    aten._unsafe_view.default, Tensor.view, schema_info=RuntimeSchemaInfo(1)
+)
+register_prop_rule_map(
+    aten.unsqueeze.default, torch.unsqueeze, schema_info=RuntimeSchemaInfo(1)
+)
+register_prop_rule_map(
+    aten.expand.default, Tensor.expand, schema_info=RuntimeSchemaInfo(1)
+)
+register_prop_rule_map(
+    aten.permute.default, torch.permute, schema_info=RuntimeSchemaInfo(1)
+)
+register_prop_rule_map(
+    aten.repeat.default, Tensor.repeat, schema_info=RuntimeSchemaInfo(1)
+)
+register_prop_rule_map(
+    aten.transpose.int, torch.transpose, schema_info=RuntimeSchemaInfo(1)
+)
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/placement_types.py b/MLPY/Lib/site-packages/torch/distributed/_tensor/placement_types.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b5424a028eb120c25d20fe524a49f4ce4df9fa4
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tensor/placement_types.py
@@ -0,0 +1,620 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+
+from dataclasses import dataclass
+from typing import Any, cast, List, NamedTuple, Optional, Tuple
+
+import torch
+import torch.distributed._functional_collectives as funcol
+import torch.distributed.distributed_c10d as c10d
+
+from torch.distributed._tensor._collective_utils import mesh_broadcast, mesh_scatter
+from torch.distributed.device_mesh import DeviceMesh
+
+
+class Placement:
+    # base class Placement type
+
+    # convenient utils to check for placement types
+    def is_shard(self, dim: Optional[int] = None) -> bool:
+        is_shard_instance = isinstance(self, Shard)
+        if dim is not None and is_shard_instance:
+            return cast(Shard, self).dim == dim
+        else:
+            return is_shard_instance
+
+    def is_replicate(self) -> bool:
+        return isinstance(self, Replicate)
+
+    def is_partial(self) -> bool:
+        return isinstance(self, _Partial)
+
+
+@dataclass(frozen=True)
+class Shard(Placement):
+    # shard placement, shard on a dim
+    dim: int
+
+    def _split_tensor(
+        self,
+        tensor: torch.Tensor,
+        num_chunks: int,
+        *,
+        with_padding: bool = True,
+        contiguous: bool = True,
+    ) -> Tuple[List[torch.Tensor], List[int]]:
+        """
+        This function uses torch.chunk to split a tensor into num_chunks shards along
+        the Shard placement dimension, and returns a list of shards with their pad sizes.
+
+        Keyword args:
+            with_padding (bool, optional): when True, we pad the tensor on the last
+            few ranks before calling the collectives (i.e. scatter/all_gather, etc.).
+            This is because collectives usually require equal size tensor inputs
+        """
+        assert (
+            self.dim <= tensor.ndim
+        ), f"Sharding dim {self.dim} greater than tensor ndim {tensor.ndim}"
+
+        # chunk tensor over dimension `dim` into n slices with padding if necessary
+        tensor_list = list(torch.chunk(tensor, num_chunks, dim=self.dim))
+        # compute the chunk size inline with ``torch.chunk``
+        full_chunk_size = (tensor.size(self.dim) + num_chunks - 1) // num_chunks
+
+        # Compute chunk size for each chunk for ``self.dim``
+        chunk_sizes = [
+            tensor_list[idx].size(self.dim) if idx < len(tensor_list) else 0
+            for idx in range(num_chunks)
+        ]
+        # Compute pad size on each chunk
+        pad_sizes = [full_chunk_size - chunk_size for chunk_size in chunk_sizes]
+
+        # Reuse tensor to fill empty chunk with empty tensor
+        num_empty_tensors = num_chunks - len(tensor_list)
+        tensor_size = list(tensor_list[0].size())
+        tensor_size = [
+            size if idx != self.dim else 0 for idx, size in enumerate(tensor_size)
+        ]
+        tensor = tensor.new_zeros(tensor_size)
+        for _ in range(num_empty_tensors):
+            tensor_list.append(tensor)
+
+        if with_padding or contiguous:
+            shard_list = []
+            for shard, pad_size in zip(tensor_list, pad_sizes):
+                # Pad the shard with zeros if needed.
+                if with_padding and pad_size > 0:
+                    shard = self._pad_tensor(shard, pad_size)
+                shard = shard.contiguous() if contiguous else shard
+                shard_list.append(shard)
+            return shard_list, pad_sizes
+        else:
+            return tensor_list, pad_sizes
+
+    def _pad_tensor(
+        self,
+        tensor: torch.Tensor,
+        pad_size: int,
+    ) -> torch.Tensor:
+        if pad_size == 0:
+            return tensor
+        pad = [0, 0] * (tensor.ndim - self.dim)
+        pad[-1] = pad_size
+        return torch.nn.functional.pad(tensor, pad)
+
+    def _unpad_tensor(
+        self,
+        tensor: torch.Tensor,
+        pad_size: int,
+    ) -> torch.Tensor:
+        if pad_size == 0:
+            return tensor
+        return tensor.narrow(
+            self.dim,
+            start=0,
+            length=tensor.size(self.dim) - pad_size,
+        )
+
+    @staticmethod
+    def _local_shard_size_on_dim(
+        size_on_dim: int,
+        num_chunks: int,
+        rank: int,
+        return_offset: bool = False,
+    ) -> Tuple[int, int]:
+        """
+        returns the local shard size and offset on a given tensor dim
+        """
+        # Compute the chunk size inline with ``torch.chunk``
+        if size_on_dim % num_chunks == 0:
+            full_chunk_size = size_on_dim // num_chunks
+            return full_chunk_size, full_chunk_size * rank if return_offset else -1
+
+        # uneven sharding case
+        full_chunk_size = (size_on_dim + num_chunks - 1) // num_chunks
+        shard_starting_idx = full_chunk_size * rank
+
+        if size_on_dim < shard_starting_idx:
+            return 0, size_on_dim if return_offset else -1
+        else:
+            local_shard_size = (
+                min(size_on_dim, shard_starting_idx + full_chunk_size)
+                - shard_starting_idx
+            )
+            return local_shard_size, shard_starting_idx if return_offset else -1
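+
+    # For illustration (assumed values), splitting a dimension of size 5 over
+    # 2 ranks mirrors torch.chunk's uneven split:
+    #   _local_shard_size_on_dim(5, 2, rank=0, return_offset=True) -> (3, 0)
+    #   _local_shard_size_on_dim(5, 2, rank=1, return_offset=True) -> (2, 3)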
+
+    def _shard_tensor(
+        self, tensor: torch.Tensor, mesh: DeviceMesh, mesh_dim: int
+    ) -> torch.Tensor:
+        """
+        shard and scatter a tensor on a mesh dimension (use coordinate
+        0 on the mesh dimension as source of truth)
+        """
+        my_coordinate = mesh.get_coordinate()
+        num_chunks = mesh.size(mesh_dim=mesh_dim)
+
+        if my_coordinate is None:
+            # if rank is not part of mesh, we simply return an empty tensor
+            return tensor.new_empty(0, requires_grad=tensor.requires_grad)
+
+        scatter_list, pad_sizes = self._split_tensor(
+            tensor, num_chunks, with_padding=True, contiguous=True
+        )
+
+        output = torch.empty_like(scatter_list[my_coordinate[mesh_dim]])
+        mesh_scatter(output, scatter_list, mesh, mesh_dim=mesh_dim)
+
+        # Only unpad if the local_tensor was padded on the dimension.
+        pad_size = pad_sizes[my_coordinate[mesh_dim]]
+        if pad_size > 0:
+            output = self._unpad_tensor(output, pad_size)
+        return output
+
+    def _reduce_shard_tensor(
+        self,
+        tensor: torch.Tensor,
+        mesh: DeviceMesh,
+        reduce_op: c10d.ReduceOp.RedOpType,
+        mesh_dim: int,
+    ) -> torch.Tensor:
+        """
+        reduce and scatter a tensor on a mesh dimension
+        """
+        my_coordinate = mesh.get_coordinate()
+        num_chunks = mesh.size(mesh_dim=mesh_dim)
+
+        if my_coordinate is None:
+            # if rank is not part of mesh, we simply return local_tensor,
+            # which should be an empty tensor
+            return tensor
+
+        is_padded = tensor.size(self.dim) % num_chunks != 0
+        if is_padded:
+            scattered_list, pad_sizes = self._split_tensor(
+                tensor, num_chunks, with_padding=True, contiguous=True
+            )
+            tensor = torch.cat(scattered_list, dim=self.dim)
+        elif not tensor.is_contiguous():
+            tensor = tensor.contiguous()
+
+        output = funcol.reduce_scatter_tensor(
+            tensor, reduce_op.name, scatter_dim=self.dim, group=(mesh, mesh_dim)
+        )
+
+        if is_padded:
+            output = self._unpad_tensor(output, pad_sizes[my_coordinate[mesh_dim]])  # type: ignore[possibly-undefined]
+        return output
+
+    def _to_replicate_tensor(
+        self,
+        local_tensor: torch.Tensor,
+        mesh: DeviceMesh,
+        mesh_dim: int,
+        current_logical_shape: List[int],
+    ) -> torch.Tensor:
+        """
+        This function all_gathers all shards and returns a tensor that
+        is replicated on the previously sharded mesh dimension
+        """
+        num_chunks = mesh.size(mesh_dim=mesh_dim)
+        # check if it's uneven, so we need to pad input tensor before all_gather
+        local_shape = list(local_tensor.size())
+
+        logical_dim_size = current_logical_shape[self.dim]
+        is_padded = logical_dim_size % num_chunks != 0
+
+        if is_padded:
+            full_chunk_size = (logical_dim_size + num_chunks - 1) // num_chunks
+            pad_size = full_chunk_size - local_shape[self.dim]
+            local_tensor = self._pad_tensor(local_tensor, pad_size)
+
+        if not local_tensor.is_contiguous():
+            local_tensor = local_tensor.contiguous()
+
+        result = funcol.all_gather_tensor(
+            local_tensor,
+            gather_dim=self.dim,
+            group=(mesh, mesh_dim),
+        )
+        if is_padded:
+            unpad_size = full_chunk_size * num_chunks - logical_dim_size  # type: ignore[possibly-undefined]
+            result = self._unpad_tensor(result, unpad_size)
+        return result
+
+    def _replicate_to_shard(
+        self,
+        local_tensor: torch.Tensor,
+        mesh: DeviceMesh,
+        mesh_dim: int,
+        shard_index: int,
+    ) -> torch.Tensor:
+        """
+        transform a replicated tensor into a sharded tensor on
+        the current rank by performing a local chunk
+        """
+        num_chunks = mesh.size(mesh_dim=mesh_dim)
+        shards, _ = self._split_tensor(
+            local_tensor,
+            num_chunks,
+            with_padding=False,
+            contiguous=False,
+        )
+        return shards[shard_index].clone()
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, Shard):
+            return False
+        return self.dim == other.dim
+
+    def __hash__(self) -> int:
+        return hash(self.dim)
+
+    def __repr__(self) -> str:
+        """
+        machine readable representation of the Shard placement
+        """
+        return f"Shard(dim={self.dim})"
+
+    def __str__(self) -> str:
+        """human readable representation of the Shard placement"""
+        return f"S({self.dim})"
+
+
+@dataclass(frozen=True)
+class Replicate(Placement):
+    # replicate placement
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, Replicate):
+            return False
+        return True
+
+    def __hash__(self) -> int:
+        # every replicate placement is the same
+        return -1
+
+    def __repr__(self) -> str:
+        """
+        machine readable representation of the Replicate placement
+        """
+        return "Replicate()"
+
+    def __str__(self) -> str:
+        """
+        human readable representation of the Replicate placement
+        """
+        return "R"
+
+    def _replicate_tensor(
+        self, tensor: torch.Tensor, mesh: DeviceMesh, mesh_dim: int
+    ) -> torch.Tensor:
+        """
+        Replicate (broadcast) a torch.Tensor on a mesh dimension (use
+        the first coordinate on the mesh dimension as source of truth)
+        """
+        my_coordinate = mesh.get_coordinate()
+        if my_coordinate is None:
+            # if rank is not part of mesh, we simply return an empty tensor
+            return tensor.new_empty(0, requires_grad=tensor.requires_grad)
+
+        tensor = tensor.contiguous()
+        mesh_broadcast(tensor, mesh, mesh_dim=mesh_dim)
+        return tensor
+
+
+@dataclass(frozen=True)
+class _Partial(Placement):
+    # This is a default _Partial placement with element-wise reduce op
+    # _Partial defines three contracts:
+    # 1. _reduce_value: reduce the value of the tensor on the mesh dimension
+    # 2. _reduce_shard_value: reduce_scatter the value of the tensor on the mesh dimension
+    # 3. _partition_value: partition the value of a replicated tensor on the mesh dimension
+    # We can implement custom reductions as needed by subclassing this
+    # class and overriding those contracts.
+    reduce_op: c10d.ReduceOp.RedOpType = c10d.ReduceOp.SUM
+
+    def _reduce_value(
+        self, tensor: torch.Tensor, mesh: DeviceMesh, mesh_dim: int
+    ) -> torch.Tensor:
+        return funcol.all_reduce(
+            tensor, reduceOp=self.reduce_op.name, group=(mesh, mesh_dim)
+        )
+
+    def _reduce_shard_value(
+        self,
+        tensor: torch.Tensor,
+        mesh: DeviceMesh,
+        mesh_dim: int,
+        shard_spec: Placement,
+    ) -> torch.Tensor:
+        # by default call reduce_shard_tensor of the shard_spec.
+        shard_spec = cast(Shard, shard_spec)
+        return shard_spec._reduce_shard_tensor(tensor, mesh, self.reduce_op, mesh_dim)
+
+    def _partition_value(
+        self, tensor: torch.Tensor, mesh: DeviceMesh, mesh_dim: int
+    ) -> torch.Tensor:
+        # _partition_value is the conjugate operation of _reduce_value
+        # - i.e. _partition_value on a sum reduce op is just a division operation
+        # - the _reduce_value on a sum reduce op would just be a sum(allreduce) operation
+        # TODO: if the reduce_op is min/max, etc. the _partition_value should be a
+        # different operation
+        assert (
+            self.reduce_op == c10d.ReduceOp.SUM
+        ), "only support replicate to PartialSUM for now!"
+        num_chunks = mesh.size(mesh_dim=mesh_dim)
+        return tensor / num_chunks
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, _Partial):
+            return False
+        return self.reduce_op == other.reduce_op
+
+    def __hash__(self) -> int:
+        return 1 + hash(self.reduce_op)
+
+    def __repr__(self) -> str:
+        """
+        machine readable representation of the Partial placement
+        """
+        return f"_Partial(reduce_op={self.reduce_op})"
+
+    def __str__(self) -> str:
+        """
+        human readable representation of the Partial placement
+        """
+        return "P"
+
+
+class TensorMeta(NamedTuple):
+    # simple named tuple to represent tensor metadata
+    # intentionally to stay simple only for sharding
+    # propagation purposes.
+    shape: torch.Size
+    stride: Tuple[int, ...]
+    dtype: torch.dtype
+
+
+# used internally to propagate the placements
+@dataclass
+class DTensorSpec:
+    mesh: DeviceMesh
+    placements: Tuple[Placement, ...]
+
+    # tensor meta will only be set during sharding propagation
+    tensor_meta: Optional[TensorMeta] = None
+
+    def __post_init__(self):
+        if not isinstance(self.placements, tuple):
+            self.placements = tuple(self.placements)
+        self._hash: Optional[int] = None
+
+    def __setattr__(self, attr: str, value: Any):
+        super().__setattr__(attr, value)
+        # Make sure to recompute the hash in case any of the hashed attributes
+        # change (though we do not expect `mesh` or `placements` to change)
+        if hasattr(self, "_hash") and attr in ("mesh", "placements", "tensor_meta"):
+            self._hash = None
+
+    def _hash_impl(self) -> int:
+        # hashing and equality check for DTensorSpec are used to cache the sharding
+        # propagation results. We only need to consider the mesh, placements, shape
+        # dtype and stride.
+        # Caveat: we need to keep this in mind and sync hash and eq if we add more
+        # fields to them.
+        if self.tensor_meta is not None:
+            return hash(
+                (
+                    self.mesh,
+                    self.placements,
+                    self.tensor_meta.shape,
+                    self.tensor_meta.stride,
+                    self.tensor_meta.dtype,
+                )
+            )
+        return hash((self.mesh, self.placements))
+
+    def __hash__(self) -> int:
+        # We lazily cache the spec to avoid recomputing the hash upon each
+        # use, where we make sure to update the hash when the `tensor_meta`
+        # changes by overriding `__setattr__`. This must be lazy so that Dynamo
+        # does not try to hash non-singleton `SymInt`s for the stride.
+        if self._hash is None:
+            self._hash = self._hash_impl()
+        return self._hash
+
+    def __eq__(self, __o: object) -> bool:
+        if not (
+            isinstance(__o, DTensorSpec)
+            and self.mesh == __o.mesh
+            and self.placements == __o.placements
+        ):
+            return False
+        if self.tensor_meta is None or __o.tensor_meta is None:
+            return self.tensor_meta == __o.tensor_meta
+
+        return (
+            self.tensor_meta.shape == __o.tensor_meta.shape  # type: ignore[union-attr]
+            and self.tensor_meta.stride == __o.tensor_meta.stride  # type: ignore[union-attr]
+            and self.tensor_meta.dtype == __o.tensor_meta.dtype  # type: ignore[union-attr]
+        )
+
+    def __str__(self) -> str:
+        """
+        human readable representation of the DTensorSpec
+        """
+        if len(self.placements) == 1:
+            placement_str = str(self.placements[0])
+        else:
+            placement_str = str(self.placements)
+
+        if self.tensor_meta is not None:
+            tensor_shape = str(tuple(self.tensor_meta.shape))
+        else:
+            tensor_shape = "unknown shape"
+
+        return f"Spec({placement_str} on {tensor_shape})"
+
+    @property
+    def shape(self) -> torch.Size:
+        if self.tensor_meta is None:
+            raise ValueError("tensor_meta is not set")
+        return self.tensor_meta.shape
+
+    @property
+    def stride(self) -> Tuple[int, ...]:
+        if self.tensor_meta is None:
+            raise ValueError("tensor_meta is not set")
+        return self.tensor_meta.stride
+
+    @property
+    def ndim(self) -> int:
+        if self.tensor_meta is None:
+            raise ValueError("tensor_meta is not set")
+        return len(self.tensor_meta.shape)
+
+    @property
+    def num_shards(self) -> int:
+        num_shards = 1
+        for i, placement in enumerate(self.placements):
+            if placement.is_shard():
+                num_shards *= self.mesh.size(i)
+        return num_shards
+
+    @property
+    def device_mesh(self) -> DeviceMesh:
+        # simple aliasing for the mesh field, makes some
+        # checks that mix DTensor/DTensorSpec easier
+        return self.mesh
+
+    @property
+    def dim_map(self) -> List[int]:
+        """
+        dim_map is a property we derive from `placements` of
+        the distributed tensor. It simply returns a list of ints
+        where dim_map[i] denotes the sharding mapping to the mesh
+        dimension, and len(dim_map) == dist_tensor.ndim
+        dim_map[i] = -1: means tensor dim i replicate on mesh
+        dim_map[i] = j: means tensor dim i shard on mesh dim j
+
+        For example, given a dist tensor with shape [18, 20, 30], a
+        device_mesh([0, 1, 2, 3]), and placements [Shard(1)], the dim_map of
+        this placement would be [-1, 0, -1]. This representation is helpful
+        during sharding propagation because it tells us exactly whether each
+        tensor dimension is sharded or not.
+
+        Note that if placements contains `_Partial`, we have to
+        explicitly deal with it, so that when we create a DTensorSpec
+        with dim_map, we could properly record the pending sums.
+        """
+        # dims mapping of dist tensor sharding
+        # returns a list of length tensor ndim, where -1 represents replicate
+        # and an int >= 0 represents shard on that device mesh dim
+        r = [-1] * self.ndim
+        for i, placement in enumerate(self.placements):
+            if placement.is_shard():
+                shard_dim = cast(Shard, placement).dim
+                if r[shard_dim] > -1:
+                    raise ValueError(
+                        f"Tensor dim {shard_dim} is already sharded on mesh dim {r[shard_dim]},"
+                        " DTensor operator implementation does not support things like hybrid"
+                        " sharding strategies yet (i.e. [Shard(0), Shard(0)])"
+                    )
+                r[shard_dim] = i
+        return r
+
+    @property
+    def sums(self) -> List[int]:
+        """
+        sums is a property we derive from `placements` of the
+        distributed tensor. It simply returns a list of ints where
+        sums[i] denotes the pending sum (partial) on mesh dim i
+        """
+        return [
+            idx
+            for idx, placement in enumerate(self.placements)
+            if placement.is_partial()
+        ]
+
+    @classmethod
+    def from_dim_map(
+        cls,
+        mesh: DeviceMesh,
+        dim_map: List[int],
+        sums: List[int],
+        tensor_meta: Optional[TensorMeta] = None,
+    ) -> "DTensorSpec":
+        """
+        Construct a DTensorSpec from dim_map list and pending sum.
+
+        Args:
+            mesh (:class:`DeviceMesh`): device mesh to be used in the DTensorSpec
+            dim_map (List[int]): a list of integers that represents the sharding on
+                each tensor dimension, see the `dim_map` property doc for details
+            sums (List[int]): a list of integers indicating the device mesh dimensions
+                on which the dist tensor has pending sums.
+            tensor_meta (TensorMeta): DTensor metadata
+
+        Returns:
+            a :class:`DTensorSpec` object
+        """
+        # by default replicate on device mesh dims
+        placements: List[Placement] = [Replicate() for _ in range(mesh.ndim)]
+
+        # find all mesh dims that need pending reductions
+        for s in sums:
+            placements[s] = _Partial()
+
+        for i, m in enumerate(dim_map):
+            if m >= 0:
+                placement = placements[m]
+                if placement.is_shard():
+                    placement = cast(Shard, placement)
+                    raise RuntimeError(
+                        f"DeviceMesh dimension cannot be mapped to two dimensions of the same tensor: {i} and {placement.dim}"
+                    )
+                elif placement.is_partial():
+                    raise RuntimeError(
+                        f"DeviceMesh dimension {m} cannot be both shard and partial!"
+                    )
+                placements[m] = Shard(i)
+
+        return cls(mesh, tuple(placements), tensor_meta=tensor_meta)
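+
+    # For illustration (hypothetical 2-D mesh): from_dim_map(mesh,
+    # dim_map=[-1, 0, -1], sums=[1]) produces placements
+    # (Shard(1), _Partial()), i.e. tensor dim 1 is sharded on mesh dim 0 and
+    # mesh dim 1 carries a pending sum.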
+
+    def is_replicated(self):
+        """
+        return True if the current DTensorSpec replicates on all mesh dims (devices)
+        """
+        return all(placement.is_replicate() for placement in self.placements)
+
+    def shallow_copy_with_tensor_meta(
+        self, tensor_meta: Optional[TensorMeta]
+    ) -> "DTensorSpec":
+        """
+        Shallow copy the DTensorSpec with a new tensor_meta.
+        """
+        assert tensor_meta is not None, "shallow copy with no tensor_meta!"
+        return DTensorSpec(
+            self.mesh,
+            self.placements,
+            tensor_meta=tensor_meta,
+        )
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/random.py b/MLPY/Lib/site-packages/torch/distributed/_tensor/random.py
new file mode 100644
index 0000000000000000000000000000000000000000..22d0cf9ca708fe5ab026f8130ed8a396b1f2ed57
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tensor/random.py
@@ -0,0 +1,372 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+import contextlib
+import warnings
+from typing import Dict, List, Optional
+
+import torch
+import torch.distributed as dist
+
+from torch import Tensor
+from torch.distributed._tensor.placement_types import DTensorSpec, Shard
+from torch.distributed.device_mesh import _get_device_handle, DeviceMesh
+
+
+_rng_tracker: Optional["RNGStateTracker"] = None
+
+
+def is_rng_supported_mesh(device_mesh: DeviceMesh) -> bool:
+    """Checks if the current device of `device_mesh` supports DTensor's random APIs.
+    Currently DTensor random APIs only support cuda/cuda-like devices. We suggest
+    users call this API to check availability before using our random APIs.
+
+    Args:
+        device_mesh (:class:`DeviceMesh`): The device mesh on which we check if the
+            random ops APIs are supported.
+
+    Returns:
+        A bool value. True if `device_mesh` supports DTensor Random APIs; False otherwise.
+
+    .. warning::
+        Currently we only support correct RNG on cuda/cuda-like devices.
+    """
+    device_handle = _get_device_handle(device_mesh.device_type)
+    if device_handle and hasattr(device_handle, "set_rng_state"):
+        return True
+    else:
+        warnings.warn(
+            f"DTensor random operators may not have complete support on {device_mesh.device_type} device mesh"
+        )
+        return False
+
+
+def manual_seed(seed: int, device_mesh: DeviceMesh) -> None:
+    """Sets the seed for generating random numbers for the calling rank.
+
+    Args:
+        seed (int): The desired seed.
+        device_mesh (:class:`DeviceMesh`): The device mesh to set the seed.
+
+    Returns:
+        None
+
+    .. warning::
+        :func:`manual_seed` must be called from all ranks of the default `ProcessGroup`
+        with the same `seed` value, even if some ranks are not a part of the `device_mesh`.
+        If ``device_mesh`` is a sub-mesh and the calling rank is not a part of it,
+        `manual_seed` will not set its GPU device's generator seed.
+        The current implementation only supports a GPU device mesh.
+    """
+    device_handle = _get_device_handle(device_mesh.device_type)
+    if not device_handle:
+        raise NotImplementedError(
+            f"DTensor randomness only supports cuda/cuda-like device type, but got {device_mesh.device_type}"
+        )
+
+    # allgather the seed over the default PG
+    object_list = [seed] * dist.get_world_size()
+    dist.all_gather_object(object_list, seed)
+    for rank, object in enumerate(object_list):
+        if seed != int(object):
+            raise RuntimeError(
+                f"calling manual_seed function over {device_mesh} but received different seed values on ranks:",
+                f"seed on rank {dist.get_rank()} is {seed}, and seed on rank {rank} is {object}!",
+            )
+    # instantiate an RNG tracker if we haven't already. By default DTensor uses an
+    # OffsetBasedRNGTracker to perform random operators.
+    global _rng_tracker
+    if not _rng_tracker:
+        _rng_tracker = OffsetBasedRNGTracker(device_mesh.device_type)
+
+    # the current rank is in mesh
+    if device_mesh.get_coordinate() is not None:
+        if isinstance(_rng_tracker, TensorParallelRNGTracker):
+            _rng_tracker._manual_seed(device_mesh, seed)
+        elif isinstance(_rng_tracker, OffsetBasedRNGTracker):
+            _rng_tracker._manual_seed(seed)
+        else:
+            raise RuntimeError(
+                f"Unknown type of cuda RNG state tracker: _rng_tracker = {_rng_tracker}"
+            )
+
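+# Usage sketch (illustrative): every rank is expected to call manual_seed with
+# the same seed over the same mesh, e.g.
+#
+#   mesh = DeviceMesh("cuda", list(range(dist.get_world_size())))
+#   manual_seed(42, mesh)
+#
+# The all_gather_object check above raises if ranks disagree on the seed value.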
+
+class RNGStateTracker:
+    """
+    RNGStateTracker stores Random Number Generator (RNG) state (a ByteTensor object)
+    in a dict, mapping from a corresponding tag to each state tensor. It also provides
+    a set of convenient utility methods to help access/modify the state tensors. The most
+    important interface is _distribute_region which will be used when DTensor executes
+    a random op (an operator that calls RNG).
+    """
+
+    def __init__(self, device_type: str = "cuda"):
+        self._device_type = device_type
+        self._device_handle = _get_device_handle(device_type)
+        if not (self._device_handle and self._device_handle.is_available()):
+            raise RuntimeError(
+                f"{self.__class__.__name__} instantiation requires the presence of CUDA/CUDA-like device"
+            )
+
+        self._states: Dict[str, Tensor] = {}
+        self._devices = [self._device_handle.current_device()]
+        self._use_distribute_region = True
+
+    @property
+    def rng_states(self) -> Dict[str, Tensor]:
+        return self._states
+
+    @property
+    def distribute_region_enabled(self) -> bool:
+        return self._use_distribute_region
+
+    @distribute_region_enabled.setter
+    def distribute_region_enabled(self, value) -> None:
+        self._use_distribute_region = value
+
+    def rng_state_is_sync(self, name) -> bool:
+        return name in self.rng_states
+
+    def get_seed(self, name: str) -> int:
+        if name not in self.rng_states:
+            raise RuntimeError(
+                f"{self.__class__.__name__} does not have random state for {name}"
+            )
+
+        seed_tensor = (self.rng_states[name])[0:8].view(dtype=torch.int64)
+        return int(seed_tensor.item())
+
+    def set_seed(self, name: str, seed: int) -> None:
+        seed_tensor = torch.tensor([seed]).view(torch.uint8)
+        offset_tensor = torch.tensor([0]).view(torch.uint8)
+        self.rng_states[name] = torch.cat([seed_tensor, offset_tensor])
+
+    def _distribute_region(self, spec: DTensorSpec):
+        pass
+
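+# State layout note (illustrative): the trackers below treat each entry in
+# RNGStateTracker.rng_states as a 16-byte uint8 tensor, bytes [0:8] holding the
+# seed and bytes [8:16] holding the offset, each read back via .view(torch.int64):
+#
+#   state = torch.cat(
+#       [torch.tensor([42]).view(torch.uint8), torch.tensor([0]).view(torch.uint8)]
+#   )
+#   assert int(state[0:8].view(torch.int64).item()) == 42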
+
+class OffsetBasedRNGTracker(RNGStateTracker):
+    """
+    This subclass of `RNGStateTracker` defines the default policy of how RNG states
+    should be shared and synchronized among all ranks to respect the semantics of DTensor
+    random operators.
+    """
+
+    def __init__(self, device_type: str = "cuda"):
+        super().__init__(device_type)
+        # synchronize RNG state using rank 0's current one
+        rng_state = self._device_handle.get_rng_state().to(device_type)
+        dist.broadcast(rng_state, 0)
+        self.rng_states["parallel-rng"] = rng_state.to("cpu")
+
+    def _manual_seed(self, parallel_seed: int) -> None:
+        self.set_seed("parallel-rng", parallel_seed)
+
+    @contextlib.contextmanager
+    def _distribute_region(self, spec: DTensorSpec):
+        # check if the parallel rng state has been synchronized or not
+        if not self.rng_state_is_sync("parallel-rng"):
+            raise RuntimeError(
+                "OffsetBasedRNGTracker requires the random state to be synchronized "
+                "before entering into a distribute region!"
+            )
+
+        if self.distribute_region_enabled:
+            old_offset = self.get_offset("parallel-rng")
+            self._set_pre_op_offset(spec)
+            with torch.random.fork_rng(self._devices, device_type=self._device_type):
+                self._device_handle.set_rng_state(self.rng_states["parallel-rng"])
+                try:
+                    yield  # execute the region code
+                finally:
+                    # update offset to synchronize among ranks
+                    self._set_post_op_offset(spec, old_offset)
+        else:
+            yield
+
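+    # Usage sketch (illustrative, not the literal dispatcher wiring): a DTensor
+    # random op runs its local computation inside this region so that every rank
+    # draws from the same seed at a shard-specific offset, e.g.
+    #
+    #   with _rng_tracker._distribute_region(dtensor_spec):
+    #       local_tensor = torch.rand(local_shape, device="cuda")
+    #
+    # `dtensor_spec` and `local_shape` are placeholders for illustration.
+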
+    def get_offset(self, name: str) -> int:
+        if name not in self.rng_states:
+            raise RuntimeError(
+                f"{self.__class__.__name__} does not have random state for {name}"
+            )
+
+        offset_tensor = (self.rng_states[name])[8:].view(dtype=torch.int64)
+        return int(offset_tensor.item())
+
+    def set_offset(self, name: str, offset: int) -> None:
+        if name not in self.rng_states:
+            raise RuntimeError(
+                f"{self.__class__.__name__} does not have random state for {name}"
+            )
+
+        seed_tensor = (self.rng_states[name])[0:8]
+        offset_tensor = torch.tensor([offset]).view(torch.uint8)
+        self.rng_states[name] = torch.cat([seed_tensor, offset_tensor])
+
+    def _set_pre_op_offset(self, spec: DTensorSpec) -> None:
+        """Set the starting RNG offset for current device's local shard before actual
+        op execution. The pre_op_offset value should start from the current RNG offset
+        and increment by the size of local shard until it reaches the size of the whole
+        DTensor. For different ranks that hold the same DTensor shard, their pre_op_offset
+        will be the same.
+
+        Args:
+            spec (:class:`DTensorSpec`): the spec of the DTensor object on which
+                we prepare the offset for running random ops.
+
+        Returns:
+            None
+
+        .. warning::
+            Note that the current implementation does not consider DTensor's contiguity.
+
+        Example:
+            take a DTensor of shape [8, 16] as an example. Assume that the DTensor
+            is placed on a device mesh with placements ([Shard(1), Replicate(), Shard(0)]),
+            and the mesh is:
+                [[[0, 1], [2, 3]], [[4, 5], [6, 7]]]
+            ``spec.mesh.get_coordinate()`` provides the coordinate of the current rank
+            in the mesh. For example, the coordinate of rank 5 is (1, 0, 1).
+
+            Another concept to introduce besides rank coordinate is shard coordinate.
+            Each rank holds a local shard of the DTensor. In the example, the DTensor
+            is partitioned into 4 [4, 8] shards. The first shard has 2 replicas: rank 0
+            (coord (0, 0, 0)) and rank 2 (coord (0, 1, 0)) each hold one of them.
+            In other words, the local shards on rank 0 and rank 2 correspond to the same
+            shard of the DTensor. To denote each DTensor shard, we use a shard coordinate
+            (in the example, it will be a tuple (i, j) where shard (i, j) has the slice
+            DTensor[4 * i : 4 * (i + 1), 8 * j : 8 * (j + 1)], 0 <= i < 2, 0 <= j < 2).
+
+            Once we have rank coordinate and shard coordinate, we can calculate on each rank
+            what shard of the DTensor the rank holds, with the help of dim_map. The dim_map
+            of the above DTensor is [2, 0] so the shard coordinate of a rank with rank coord
+            (x, y, z) is simply (z, x), obtained by taking (rank_coord[dim_map[0]], rank_coord[dim_map[1]]).
+            Following this calculation,
+            rank 0 and rank 2 hold the shard of coord (0, 0);
+            rank 1 and rank 3 hold the shard of coord (0, 1);
+            rank 4 and rank 6 hold the shard of coord (1, 0);
+            rank 5 and rank 7 hold the shard of coord (1, 1).
+
+            The last value to calculate before obtaining the starting offset is the shard linear index.
+            The starting offset for each rank will be its shard_linear_index * local_tensor_numel.
+        """
+        dtensor_shape = spec.shape
+        mesh = spec.mesh
+        dim_map = spec.dim_map
+
+        # Compute shard coordinate:
+        # The coordinate on each tensor dim is a tuple (idx, range)
+        # If a DTensor is partitioned on its dim i into n shards, and the current rank
+        # holds the j-th, then its shard coordinate will be (idx=j, range=n) on dim i
+        coordinate = mesh.get_coordinate()
+        assert coordinate is not None
+        shard_coord = [
+            coordinate[mesh_dim] if mesh_dim >= 0 else 0 for mesh_dim in dim_map
+        ]
+        shard_size = [
+            mesh.size(mesh_dim) if mesh_dim >= 0 else 1 for mesh_dim in dim_map
+        ]
+
+        # compute shard linear index
+        shard_linear_idx = self._calc_shard_linear_idx(shard_coord, shard_size)
+
+        # compute starting offset using the first shard's size
+        local_size_on_rank_0 = list(dtensor_shape)
+        for idx, placement in enumerate(spec.placements):
+            if isinstance(placement, Shard):
+                mesh_dim_size = mesh.size(idx)
+                shard_dim = placement.dim
+                local_size_on_rank_0[shard_dim] = placement._local_shard_size_on_dim(
+                    dtensor_shape[shard_dim],
+                    mesh_dim_size,
+                    0,
+                    return_offset=False,
+                )[0]
+
+        from torch.distributed._tensor.ops.utils import prod
+
+        local_size = prod(local_size_on_rank_0)
+
+        # get current RNG offset
+        current_offset = self.get_offset("parallel-rng")
+
+        # pytorch: offset must be multiple of 4
+        # source: aten/src/ATen/cuda/CUDAGeneratorImpl.cpp
+        offset_incr = (shard_linear_idx * local_size + 3) // 4 * 4
+        self.set_offset("parallel-rng", current_offset + offset_incr)
+
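+    # Worked example (illustrative, following the docstring above): for the [8, 16]
+    # DTensor with placements [Shard(1), Replicate(), Shard(0)] on a 2x2x2 mesh,
+    # rank 5 has mesh coordinate (1, 0, 1) and dim_map = [2, 0], so
+    #
+    #   shard_coord = [1, 1], shard_size = [2, 2]  ->  shard_linear_idx = 3
+    #   local_size = 4 * 8 = 32
+    #   offset_incr = (3 * 32 + 3) // 4 * 4 = 96
+    #
+    # i.e. rank 5 advances its RNG offset by 96 before running the random op.
+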
+    def _set_post_op_offset(self, spec: DTensorSpec, old_offset: int) -> None:
+        """Sets the RNG to a synchronized state after running the local random op. Every
+        rank should set its RNG offset to `old_offset + DTensor.numel()` where old_offset is
+        the offset before calling `set_pre_op_offset` i.e. the offset before running DTensor
+        random ops.
+
+        Args:
+            spec (:class:`DTensorSpec`): the spec of the DTensor object on which
+                we post-process the offset for running random ops.
+
+        Returns:
+            None
+        """
+        dtensor_shape = spec.shape
+
+        from torch.distributed._tensor.ops.utils import prod
+
+        numel = prod(dtensor_shape)
+        # pytorch: offset must be multiple of 4
+        # source: aten/src/ATen/cuda/CUDAGeneratorImpl.cpp
+        numel = (numel + 3) // 4 * 4
+        self.set_offset("parallel-rng", old_offset + numel)
+
+    def _calc_shard_linear_idx(
+        self, shard_coord: List[int], shard_size: List[int]
+    ) -> int:
+        # compute shard linear index
+        shard_linear_idx = 0
+        shard_coord_stride = 1
+        for idx, size in zip(reversed(shard_coord), reversed(shard_size)):
+            shard_linear_idx += idx * shard_coord_stride
+            shard_coord_stride *= size
+
+        return shard_linear_idx
+
+
+class TensorParallelRNGTracker(RNGStateTracker):
+    def __init__(self, device_type: str = "cuda"):
+        super().__init__(device_type)
+        # copy the default RNG state
+        self.rng_states["tensor-parallel-rng"] = self._device_handle.get_rng_state()
+
+    def _manual_seed(
+        self,
+        tp_mesh: DeviceMesh,
+        base_seed: int = 1234,
+    ):
+        tensor_parallel_rank = tp_mesh.get_local_rank()
+        # this magic number 2718 comes from Megatron's code
+        # (https://github.com/NVIDIA/Megatron-LM/blob/060415572f4365a2e895f8036c4e37dad0efbdf5/megatron/core/tensor_parallel/random.py#L162-L163)
+        MegatronMagicNum = 2718
+        tensor_parallel_seed = base_seed + MegatronMagicNum + tensor_parallel_rank
+        self.set_seed("tensor-parallel-rng", tensor_parallel_seed)
+
+    @contextlib.contextmanager
+    def _distribute_region(self, spec: DTensorSpec):
+        # check if the tensor parallel rng state has been synchronized or not
+        if not self.rng_state_is_sync("tensor-parallel-rng"):
+            raise RuntimeError(
+                "TensorParallelRNGTracker requires the random state to be synchronized "
+                "before entering into a distribute region!"
+            )
+
+        if self.distribute_region_enabled:
+            with torch.random.fork_rng(self._devices, device_type=self._device_type):
+                self._device_handle.set_rng_state(
+                    self.rng_states["tensor-parallel-rng"]
+                )
+                try:
+                    yield
+                finally:
+                    self.rng_states[
+                        "tensor-parallel-rng"
+                    ] = self._device_handle.get_rng_state()
+        else:
+            yield
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/redistribute.py b/MLPY/Lib/site-packages/torch/distributed/_tensor/redistribute.py
new file mode 100644
index 0000000000000000000000000000000000000000..021c0adeac5c100de121f5ded51316e898c1d2aa
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tensor/redistribute.py
@@ -0,0 +1,337 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+from functools import lru_cache
+from typing import cast, Dict, List, NamedTuple, Tuple
+
+import torch
+import torch.distributed._functional_collectives as funcol
+import torch.distributed._tensor.api as dtensor
+from torch.distributed._tensor.device_mesh import DeviceMesh
+from torch.distributed._tensor.placement_types import (
+    _Partial,
+    DTensorSpec,
+    Placement,
+    Replicate,
+    Shard,
+)
+
+
+class _TransformInfo(NamedTuple):
+    mesh_dim: int
+    src_dst_placements: Tuple[Placement, Placement]
+    # logical_shape on this mesh dimension
+    logical_shape: List[int]
+
+
+def _replicate_then_shard(val: _TransformInfo) -> int:
+    """
+    This is a helper function used as a sort key to reorder the _TransformInfo list. The
+    high level idea is that we want to reorder the sharding redistributions so that the
+    DTensor redistribution is consistent with its full tensor. This is built on top of
+    two simple assumptions:
+    1. Replication happens from inner to outer dimension, i.e. Shard -> Replicate
+    2. Sharding happens from outer to inner dimension, i.e. Replicate -> Shard
+
+    So we always put the replication first and the sharding later.
+    """
+    mesh_dim = val.mesh_dim
+    src, dst = val.src_dst_placements
+    if (dst.is_replicate() or dst.is_partial()) and src.is_shard():
+        return -mesh_dim
+    elif (src.is_replicate() or src.is_partial()) and dst.is_shard():
+        return mesh_dim
+    else:
+        return 0
+
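+# Sort-key example (illustrative): for the two transforms
+#   _TransformInfo(mesh_dim=0, src_dst_placements=(Replicate(), Shard(0)), logical_shape=[8])  # key = 0
+#   _TransformInfo(mesh_dim=1, src_dst_placements=(Shard(0), Replicate()), logical_shape=[8])  # key = -1
+# sorting with this key runs the Shard -> Replicate step on mesh dim 1 before the
+# Replicate -> Shard step on mesh dim 0, i.e. replication first, sharding later.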
+
+@lru_cache(maxsize=None)
+def _gen_transform_infos(
+    src_spec: DTensorSpec,
+    dst_spec: DTensorSpec,
+) -> List[_TransformInfo]:
+    """
+    Generate the transform infos from the source placements to the target placements.
+    Transforming from source to target placement may take multiple steps, i.e. it may
+    decompose Si -> Sj into Si -> R -> Sj.
+    This also detects whether there are mis-aligned shardings between src/dst placements,
+    e.g. (Shard(0), Shard(0)) -> (Replicate(), Shard(0)): here Shard(0) -> Shard(0) on
+    mesh dimension 1 actually needs a reshard, because in the first case it is a
+    sub-sharding of an already sharded tensor dimension 0, while in the second case it
+    is the first sharding on tensor dimension 0.
+
+    Note that we also currently handle sharding on different tensor dimensions, e.g.
+    Shard(0) -> Shard(1), in this pass.
+    """
+    src_dim_counts: Dict[int, int] = {}
+    dst_dim_counts: Dict[int, int] = {}
+    transform_infos: List[_TransformInfo] = []
+
+    src_placements = src_spec.placements
+    dst_placements = dst_spec.placements
+    device_mesh = src_spec.device_mesh
+    my_coordinate = device_mesh.get_coordinate()
+    assert my_coordinate is not None
+
+    # logical shape records the logical tensor shape on the mesh dimension
+    # this is useful to ensure uneven sharding gets correct output shape
+    initial_logical_shape = list(src_spec.shape)
+    mesh_dims_to_logical_shape = [initial_logical_shape]
+    mesh_ndim = len(src_placements)
+
+    for i, (src, dst) in enumerate(zip(src_placements, dst_placements)):
+        # detect mis-aligned sharding and build logical shapes
+        current_logical_shape = mesh_dims_to_logical_shape[i]
+        if isinstance(src, Shard):
+            src_dim_counts[src.dim] = src_dim_counts.get(src.dim, 0) + 1
+
+            if i < mesh_ndim - 1:
+                # calculate and save the logical shape for this sharding
+                mesh_dim_size = device_mesh.size(mesh_dim=i)
+                local_shard_size, _ = src._local_shard_size_on_dim(
+                    current_logical_shape[src.dim],
+                    mesh_dim_size,
+                    my_coordinate[i],
+                )
+                new_logical_shape = list(current_logical_shape)
+                new_logical_shape[src.dim] = local_shard_size
+                mesh_dims_to_logical_shape.append(new_logical_shape)
+        else:
+            mesh_dims_to_logical_shape.append(current_logical_shape)
+
+        if isinstance(dst, Shard):
+            dst_dim_counts[dst.dim] = dst_dim_counts.get(dst.dim, 0) + 1
+
+        if (
+            isinstance(src, Shard)
+            and isinstance(dst, Shard)
+            and (
+                src.dim != dst.dim or src_dim_counts[src.dim] != dst_dim_counts[dst.dim]
+            )
+        ):
+            # decompose Shard(i) -> Shard(j) into Shard(i) -> Replicate() -> Shard(j)
+            transform_infos.append(
+                _TransformInfo(
+                    mesh_dim=i,
+                    src_dst_placements=(src, Replicate()),
+                    logical_shape=mesh_dims_to_logical_shape[i],
+                )
+            )
+            transform_infos.append(
+                _TransformInfo(
+                    mesh_dim=i,
+                    src_dst_placements=(Replicate(), dst),
+                    logical_shape=mesh_dims_to_logical_shape[i],
+                )
+            )
+        else:
+            transform_infos.append(
+                _TransformInfo(
+                    mesh_dim=i,
+                    src_dst_placements=(src, dst),
+                    logical_shape=mesh_dims_to_logical_shape[i],
+                )
+            )
+
+    # sort the pairs by first perform replication then sharding
+    transform_infos.sort(key=_replicate_then_shard)
+    return transform_infos
+
+
+def redistribute_local_tensor(
+    local_tensor: torch.Tensor,
+    current_spec: DTensorSpec,
+    target_spec: DTensorSpec,
+    *,
+    async_op: bool = False,
+    is_backward: bool = False,
+) -> torch.Tensor:
+    """
+    This redistributes the local tensor (torch.Tensor) from the current DTensorSpec to
+    the target DTensorSpec, which involves the necessary collective calls to transform
+    the local shard of the DTensor from its current spec to the target spec.
+    """
+
+    if current_spec.mesh != target_spec.mesh:
+        # TODO: alltoall/permute reshuffling to change device_mesh if they are not the same
+        raise NotImplementedError("Cross device mesh comm not supported yet!")
+
+    new_local_tensor = None
+    device_mesh = current_spec.mesh
+
+    my_coordinate = device_mesh.get_coordinate()
+
+    if my_coordinate is None:
+        # if rank is not part of mesh, we skip redistribute and simply return local_tensor,
+        # which should be an empty tensor
+        return local_tensor
+
+    transform_infos = _gen_transform_infos(current_spec, target_spec)
+
+    for transform_info in transform_infos:
+        i = transform_info.mesh_dim
+        current, target = transform_info.src_dst_placements
+        num_chunks = device_mesh.size(mesh_dim=i)
+
+        if current == target:
+            # short cut, just use the original local tensor
+            new_local_tensor = local_tensor
+            continue
+
+        if target.is_replicate():
+            # Case 1: target is Replicate
+            if current.is_partial():
+                partial_spec = cast(_Partial, current)
+                new_local_tensor = partial_spec._reduce_value(
+                    local_tensor, device_mesh, i
+                )
+            elif current.is_shard():
+                current_placement = cast(Shard, current)
+                new_local_tensor = current_placement._to_replicate_tensor(
+                    local_tensor, device_mesh, i, transform_info.logical_shape
+                )
+            else:
+                raise RuntimeError(
+                    f"redistribute from {current} to {target} not supported yet"
+                )
+        elif target.is_shard():
+            # Case 2: target is Shard
+            target_placement = cast(Shard, target)
+            target_dim = target_placement.dim
+            if current.is_partial():
+                partial_spec = cast(_Partial, current)
+                new_local_tensor = partial_spec._reduce_shard_value(
+                    local_tensor, device_mesh, i, target_placement
+                )
+            elif current.is_replicate():
+                # split the tensor and return the corresponding cloned local shard
+                new_local_tensor = target_placement._replicate_to_shard(
+                    local_tensor, device_mesh, i, my_coordinate[i]
+                )
+            else:
+                # NOTE: we don't support this case efficiently yet, the fallback path we take here is
+                # to decompose Shard(0) -> Shard(1) into Shard(0) -> Replicate -> Shard(1)
+                # TODO: enable this with all_to_all
+                assert (
+                    current.is_shard()
+                ), f"Current placement should be shard but found {current}"
+                shard_spec = cast(Shard, current)
+                if shard_spec.dim != target_placement.dim:
+                    new_local_tensor = shard_spec._to_replicate_tensor(
+                        local_tensor, device_mesh, i, transform_info.logical_shape
+                    )
+                    shards, _ = target_placement._split_tensor(
+                        new_local_tensor,
+                        num_chunks,
+                        with_padding=False,
+                        contiguous=False,
+                    )
+                    new_local_tensor = shards[my_coordinate[i]]
+        elif target.is_partial():
+            if current.is_replicate():
+                partial_spec = cast(_Partial, target)
+                # Skip the replicate-to-partial transformation when we are in the backward pass.
+                # In this case we keep the grad as replicate, because we don't want to convert
+                # the replicated gradients back to partial: although that logically conforms to
+                # the same layout, converting the gradients back to partial is useless since you
+                # would have to do a reduce later, which is more expensive than keeping them
+                # replicated. For this reason, we keep the replicated grad here.
+                new_local_tensor = (
+                    partial_spec._partition_value(local_tensor, device_mesh, i)
+                    if not is_backward
+                    else local_tensor
+                )
+            elif current.is_shard():
+                if not is_backward:
+                    raise RuntimeError(
+                        f"redistribute from {current} to {target} not supported yet"
+                    )
+                # for backward shard -> partial, we just need to convert the shard to replicate
+                current_placement = cast(Shard, current)
+                new_local_tensor = current_placement._to_replicate_tensor(
+                    local_tensor, device_mesh, i, transform_info.logical_shape
+                )
+            else:
+                # partial -> partial no op, should never hit
+                new_local_tensor = local_tensor
+
+        assert new_local_tensor is not None
+        local_tensor = new_local_tensor
+
+    assert new_local_tensor is not None, "redistribute failed!"
+
+    if not async_op and isinstance(new_local_tensor, funcol.AsyncCollectiveTensor):
+        new_local_tensor = new_local_tensor.wait()
+
+    return new_local_tensor
+
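+# Usage sketch (illustrative; `mesh`, `meta` and `local_shard` are placeholders):
+#
+#   current = DTensorSpec(mesh, (Shard(0),), tensor_meta=meta)
+#   target = DTensorSpec(mesh, (Replicate(),), tensor_meta=meta)
+#   full_tensor = redistribute_local_tensor(local_shard, current, target)
+#
+# For this Shard(0) -> Replicate() transform, the single generated transform step
+# results in an all-gather performed by Shard._to_replicate_tensor on mesh dim 0.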
+
+class Redistribute(torch.autograd.Function):
+    @staticmethod
+    def forward(  # type: ignore[override]
+        # pyre-fixme[2]: Parameter must be annotated.
+        ctx,
+        input: "dtensor.DTensor",
+        device_mesh: DeviceMesh,
+        placements: Tuple[Placement, ...],
+        async_op: bool = False,
+    ):
+        current_spec = input._spec
+        ctx.current_spec = current_spec
+        ctx.async_op = async_op
+        target_spec = DTensorSpec(
+            device_mesh, placements, tensor_meta=input._spec.tensor_meta
+        )
+
+        local_tensor = input._local_tensor
+        output = redistribute_local_tensor(
+            local_tensor, current_spec, target_spec, async_op=async_op
+        )
+
+        return dtensor.DTensor(
+            output,
+            device_mesh,
+            target_spec.placements,
+            shape=input.shape,
+            dtype=input.dtype,
+            requires_grad=input.requires_grad,
+            stride=input.stride(),
+        )
+
+    @staticmethod
+    def backward(ctx, grad_output: "dtensor.DTensor"):  # type: ignore[override]
+        previous_spec = ctx.current_spec
+        current_spec = grad_output._spec
+        async_op = ctx.async_op
+
+        local_tensor = grad_output._local_tensor
+        output = redistribute_local_tensor(
+            local_tensor,
+            current_spec,
+            previous_spec,
+            async_op=async_op,
+            is_backward=True,
+        )
+        # normalize the target placement to replicate if it is partial
+        normalized_placements: List[Placement] = []
+        for previous_placement in previous_spec.placements:
+            if previous_placement.is_partial():
+                # keep the target placement as replicate instead of partial in this case
+                normalized_placements.append(Replicate())
+            else:
+                normalized_placements.append(previous_placement)
+        output_dtensor = dtensor.DTensor(
+            output,
+            previous_spec.mesh,
+            tuple(normalized_placements),
+            shape=grad_output.shape,
+            dtype=grad_output.dtype,
+            requires_grad=grad_output.requires_grad,
+            stride=grad_output.stride(),
+        )
+
+        return (
+            output_dtensor,
+            None,
+            None,
+            None,
+        )
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/sharding_prop.py b/MLPY/Lib/site-packages/torch/distributed/_tensor/sharding_prop.py
new file mode 100644
index 0000000000000000000000000000000000000000..a106c1aeb73f4005688d9e157a43c55303a37b36
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tensor/sharding_prop.py
@@ -0,0 +1,410 @@
+from functools import lru_cache
+from itertools import chain
+from typing import Callable, cast, Dict, List, Optional, Sequence, Union
+
+import torch
+from torch._ops import OpOverload
+from torch._subclasses import FakeTensorMode
+from torch.distributed._tensor._utils import try_find_mesh_from_args
+from torch.distributed._tensor.op_schema import (
+    DTensorSpec,
+    OpInfo,
+    OpSchema,
+    OpStrategy,
+    OutputSharding,
+    OutputSpecType,
+    PlacementStrategy,
+    RuntimeSchemaInfo,
+    StrategyType,
+    TupleStrategy,
+)
+from torch.distributed._tensor.placement_types import TensorMeta
+from torch.distributed.device_mesh import DeviceMesh
+
+aten = torch.ops.aten
+
+
+def _length(obj) -> int:
+    if obj is None:
+        return 0
+    if not isinstance(obj, Sequence):
+        return 1
+    return len(obj)
+
+
+class ShardingPropagator:
+    def __init__(self) -> None:
+        self.op_to_rules: Dict[OpOverload, Callable[[OpSchema], OutputSharding]] = {}
+        self.op_strategy_funcs: Dict[
+            OpOverload,
+            Callable[[DeviceMesh, OpSchema], StrategyType],
+        ] = {}
+        # op map that saves the static argnums used to decide whether to reuse the
+        # sharding prop cache or re-run sharding prop
+        self.op_to_schema_info: Dict[OpOverload, RuntimeSchemaInfo] = {}
+        self.propagate_op_sharding = lru_cache(None)(self.propagate_op_sharding_non_cached)  # type: ignore[method-assign]
+
+    def register_sharding_prop_rule(
+        self,
+        op_overload: OpOverload,
+        rule_func: Callable[[OpSchema], OutputSharding],
+        schema_info: Optional[RuntimeSchemaInfo] = None,
+    ):
+        """
+        Register a sharding propagation rule for an operator.
+        """
+        self.op_to_rules[op_overload] = rule_func
+        if schema_info is not None:
+            self.op_to_schema_info[op_overload] = schema_info
+
+    def register_op_strategy(
+        self,
+        op_overload: OpOverload,
+        strategy_func: Callable[[DeviceMesh, OpSchema], StrategyType],
+        schema_info: Optional[RuntimeSchemaInfo] = None,
+    ):
+        """
+        Register a sharding strategy generator for an operator.
+        """
+        self.op_strategy_funcs[op_overload] = strategy_func
+        if schema_info is not None:
+            self.op_to_schema_info[op_overload] = schema_info
+
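+    # Registration sketch (illustrative; `propagator` and `my_strategy` are
+    # hypothetical names): a strategy function receives the mesh and an OpSchema
+    # whose tensor args have been wrapped into OpStrategy objects, e.g.
+    #
+    #   def my_strategy(mesh: DeviceMesh, op_schema: OpSchema) -> StrategyType:
+    #       arg = cast(OpStrategy, op_schema.args_schema[0])
+    #       return OpStrategy([PlacementStrategy(arg.strategies[0].output_spec)])
+    #
+    #   propagator.register_op_strategy(aten.clone.default, my_strategy)
+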
+    @lru_cache
+    def _propagate_tensor_meta(
+        self, op_schema: OpSchema
+    ) -> Union[None, TensorMeta, Sequence[Optional[TensorMeta]]]:
+        """
+        Propagate the tensor metadata. It could return either a TensorMeta
+        or a list/tuple of TensorMetas.
+        """
+        if op_schema.op == aten.equal.default:
+            # data dependent ops can't be used for fake propagation
+            return None
+
+        # NOTE: We must call the tracing in fake tensor mode so that it
+        # avoids materializing memory
+        with FakeTensorMode():
+            fake_args = op_schema.gen_fake_args()
+            fake_kwargs = op_schema.gen_fake_kwargs()
+            fake_out = op_schema.op(*fake_args, **fake_kwargs)
+
+        if isinstance(fake_out, torch.Tensor):
+            return TensorMeta(
+                shape=fake_out.shape, stride=fake_out.stride(), dtype=fake_out.dtype
+            )
+
+        elif isinstance(fake_out, (tuple, list)):
+            tensor_meta_list: List[Optional[TensorMeta]] = []
+            for fake_out_item in fake_out:
+                if isinstance(fake_out_item, torch.Tensor):
+                    tensor_meta_list.append(
+                        TensorMeta(
+                            shape=fake_out_item.shape,
+                            stride=fake_out_item.stride(),
+                            dtype=fake_out_item.dtype,
+                        )
+                    )
+                else:
+                    tensor_meta_list.append(None)
+            return (
+                tuple(tensor_meta_list)
+                if isinstance(fake_out, tuple)
+                else tensor_meta_list
+            )
+        else:
+            # if the fake output is not a tensor or a tuple/list of tensors, return None
+            return None
+
+    def _wrap_output_spec_tensor_meta(
+        self,
+        op: OpOverload,
+        output_specs: OutputSpecType,
+        output_tensor_meta: Union[None, TensorMeta, Sequence[Optional[TensorMeta]]],
+    ) -> None:
+        """
+        Wrap the output_specs with the tensor metadata from the output.
+        """
+
+        if isinstance(output_specs, DTensorSpec):
+            if not isinstance(output_tensor_meta, TensorMeta):
+                # Either error due to ShardingPropagator or due to incorrect OutputSpec
+                if not isinstance(output_tensor_meta, (tuple, list)):
+                    raise ValueError(
+                        "ShardingPropagator error: output does not have an associated TensorMeta"
+                    )
+                raise ValueError(
+                    f"For the op {op.name()}, `output_specs` has 1 output which does not equal the "
+                    f"number of op outputs: {len(output_tensor_meta)}."
+                )
+            output_specs.tensor_meta = output_tensor_meta
+        elif isinstance(output_specs, (tuple, list)):
+            if not isinstance(output_tensor_meta, (tuple, list)) or len(
+                output_specs
+            ) != len(output_tensor_meta):
+                raise ValueError(
+                    f"For the op {op.name()}, `output_specs` has {len(output_specs)} outputs which does not equal the "
+                    f"number of op outputs {_length(output_tensor_meta)}."
+                )
+            for i, spec in enumerate(output_specs):
+                if isinstance(spec, DTensorSpec):
+                    output_tensor_meta_i = output_tensor_meta[i]
+                    if not isinstance(output_tensor_meta_i, TensorMeta):
+                        raise ValueError(
+                            f"ShardingPropagator error: output {i} does not have an associated TensorMeta"
+                        )
+                    spec.tensor_meta = output_tensor_meta_i
+
+    def propagate(self, op_info: OpInfo) -> None:
+        # We cannot use an lru cache if we know that inputs will have dynamic shapes,
+        # because SymInts are not hashable.
+        # This is generally ok because this only happens during tracing in torch.compile,
+        # and tracing does not need to be as fast as eagermode DTensor usages.
+        if op_info.schema.has_symints:
+            output_sharding = self.propagate_op_sharding_non_cached(op_info.schema)
+        else:
+            output_sharding = self.propagate_op_sharding(op_info.schema)
+        op_info.output_sharding = output_sharding
+
+    def propagate_op_sharding_non_cached(self, op_schema: OpSchema) -> OutputSharding:
+        """
+        Propagate the sharding for an operator given the op_schema.
+        """
+        # special case op, we don't need to propagate for local
+        # scalar. TODO: figure out a better way to handle this
+        if op_schema.op is aten._local_scalar_dense.default:
+            return OutputSharding(None, [op_schema])
+
+        out_tensor_meta = self._propagate_tensor_meta(op_schema)
+
+        def spec_to_strategy(spec: object) -> object:
+            if isinstance(spec, DTensorSpec):
+                return OpStrategy([PlacementStrategy(spec)])
+            elif (
+                isinstance(spec, (list, tuple))
+                and len(spec) > 0
+                and isinstance(spec[0], DTensorSpec)
+            ):
+                # a tensor list creates a tuple strategy
+                tuple_strategy = [spec_to_strategy(s) for s in spec]
+                tuple_strategy = cast(Sequence[StrategyType], tuple_strategy)
+                return TupleStrategy(
+                    tuple(tuple_strategy) if isinstance(spec, tuple) else tuple_strategy
+                )
+            else:
+                return spec
+
+        if op_schema.op in self.op_strategy_funcs:
+            # generate op strategy for the op.
+            mesh = try_find_mesh_from_args(op_schema.op, op_schema.args_schema)
+            # swap the args spec with args strategies
+            args_op_strategy = [spec_to_strategy(i) for i in op_schema.args_schema]
+
+            kwargs_op_strategy = {
+                k: spec_to_strategy(v) for k, v in op_schema.kwargs_schema.items()
+            }
+
+            # construct a new OpSchema on args for strategy based propagation
+            strategy_schema: OpSchema = OpSchema(
+                op=op_schema.op,
+                args_schema=tuple(args_op_strategy),
+                kwargs_schema=kwargs_op_strategy,
+            )
+
+            op_strategy = self.op_strategy_funcs[op_schema.op](mesh, strategy_schema)
+
+            if isinstance(op_strategy, OpStrategy):
+                # single Op strategy
+                output_strategy = self._select_strategy(op_strategy)
+
+                # check if we need to redistribute the input
+                needs_redistribute = False
+                expected_input_specs = []
+
+                # in the case where the op does not specify input_specs and output_specs
+                # is a DTensorSpec, we use output_specs as the spec for each DTensor
+                # input arg.
+                if output_strategy.input_specs is None:
+                    assert isinstance(output_strategy.output_specs, DTensorSpec)
+
+                for idx, input_spec in enumerate(op_schema.args_spec):
+                    desired_spec = (
+                        output_strategy.output_spec
+                        if output_strategy.input_specs is None
+                        else output_strategy.input_specs[idx]
+                    )
+                    expected_input_specs.append(desired_spec)
+                    if input_spec.placements != desired_spec.placements:
+                        needs_redistribute = True
+
+                suggestion_schema = None
+                if needs_redistribute:
+                    reshard_schema = OpSchema(
+                        op_schema.op, tuple(expected_input_specs), {}
+                    )
+                    reshard_schema._inplace_rewrap_schema_suggestion(op_schema)
+                    suggestion_schema = [reshard_schema]
+
+                # construct output spec for the op
+                if op_schema.return_type_tuple_tensor_like():
+                    # for ops that return multiple tensors and the output_specs is not
+                    # a tuple, we use a tuple of that single output spec as the new
+                    # output_specs
+                    output_specs: OutputSpecType = output_strategy.output_specs
+                    if isinstance(output_specs, DTensorSpec):
+                        output_specs = tuple(
+                            [
+                                # create a new DTensorSpec with the same placement as the
+                                # output_specs in output_strategy
+                                DTensorSpec(
+                                    mesh=output_specs.mesh,
+                                    placements=output_specs.placements,
+                                    tensor_meta=output_specs.tensor_meta,
+                                )
+                                for _ in range(len(op_schema.op._schema.returns))
+                            ]
+                        )
+                elif op_schema.return_type_tensor():
+                    output_specs = output_strategy.output_specs
+                else:
+                    output_specs = None
+
+                output_sharding = OutputSharding(
+                    output_specs,
+                    suggestion_schema,
+                    needs_redistribute=needs_redistribute,
+                )
+            elif isinstance(op_strategy, TupleStrategy):
+                # tuple strategy output sharding processing
+                # runtime selected placement strategy for each TupleStrategy input arg
+                selected_strategies: List[PlacementStrategy] = []
+                out_spec_list: List[DTensorSpec] = []
+                for strategy in op_strategy.childs:
+                    assert isinstance(strategy, OpStrategy)
+                    selected_strategy = self._select_strategy(strategy)
+                    selected_strategies.append(selected_strategy)
+                    out_spec_list.append(selected_strategy.output_spec)
+
+                needs_redistribute = False
+                suggestion_args: List[object] = []
+                for arg_idx, arg in enumerate(op_schema.args_schema):
+                    if isinstance(arg, (list, tuple)) and isinstance(
+                        arg[0], DTensorSpec
+                    ):
+                        expected_input_spec_list: List[DTensorSpec] = []
+                        for idx, arg_spec in enumerate(arg):
+                            expected_input_spec = selected_strategies[idx].input_spec(
+                                arg_idx
+                            )
+                            expected_input_spec = (
+                                expected_input_spec.shallow_copy_with_tensor_meta(
+                                    arg_spec.tensor_meta
+                                )
+                            )
+                            if arg_spec.placements != expected_input_spec.placements:
+                                needs_redistribute = True
+                            expected_input_spec_list.append(expected_input_spec)
+                        suggestion_args.append(
+                            tuple(expected_input_spec_list)
+                            if isinstance(arg, tuple)
+                            else expected_input_spec_list
+                        )
+                    elif isinstance(arg, DTensorSpec):
+                        expected_input_spec = selected_strategies[0].input_spec(arg_idx)
+                        expected_input_spec = (
+                            expected_input_spec.shallow_copy_with_tensor_meta(
+                                arg.tensor_meta
+                            )
+                        )
+                        if arg.placements != expected_input_spec.placements:
+                            needs_redistribute = True
+                        suggestion_args.append(expected_input_spec)
+                    else:
+                        suggestion_args.append(arg)
+
+                suggestion_schema = None
+                if needs_redistribute:
+                    reshard_schema = OpSchema(
+                        op_schema.op, tuple(suggestion_args), op_schema.kwargs_schema
+                    )
+                    suggestion_schema = [reshard_schema]
+
+                output_sharding = OutputSharding(
+                    tuple(out_spec_list) if out_tensor_meta is not None else None,
+                    suggestion_schema,
+                    needs_redistribute=needs_redistribute,
+                )
+            else:
+                raise ValueError("Unsupported op strategy type")
+
+            # associate the output sharding with the output tensor metadata
+            self._wrap_output_spec_tensor_meta(
+                op_schema.op, output_sharding.output_spec, out_tensor_meta
+            )
+            return output_sharding
+        elif op_schema.op in self.op_to_rules:
+            # propagate the sharding with rule
+            sharding_prop_func = self.op_to_rules[op_schema.op]
+
+            # step 1. there is a sharding propagation rule; run
+            # sharding propagation to get the output sharding
+            try:
+                output_sharding = sharding_prop_func(op_schema)
+            except NotImplementedError as e:
+                raise e
+            except Exception as e:
+                raise RuntimeError(
+                    f"Sharding propagation failed on op {op_schema}.\n" f"Error: {e}"
+                ) from e
+
+            # step 2. if we can't get output_spec from sharding
+            # propagation (i.e. no rules apply for input
+            # placements), we return the output sharding
+            # with schema suggestions, which can be used to
+            # decide how to do redistribute on inputs
+            if output_sharding.output_spec is None:
+                if output_sharding.schema_suggestions is None:
+                    if output_sharding.failed_reason is not None:
+                        raise RuntimeError(
+                            f"Sharding propagation failed on op {op_schema}!"
+                            f"Failed reason: {output_sharding.failed_reason}"
+                        )
+                else:
+                    # we do auto redistribute on inputs if necessary
+                    # to get an eligible input, for which we pick a
+                    # schema suggestion based on the redistribute cost.
+                    # For now we simply pick the first suggestion.
+                    suggested_input_schema = output_sharding.schema_suggestions[0]
+                    # run sharding propagation again with suggested schema
+                    propagation_res = sharding_prop_func(suggested_input_schema)
+                    # we set the output sharding with the new propagation result
+                    # so that dispatching know both output_spec and schema_suggestions
+                    # exist, which indicates a reshard is needed
+                    output_sharding.output_spec = propagation_res.output_spec
+                    output_sharding.needs_redistribute = True
+
+            # associate the output sharding with the output tensor metadata
+            self._wrap_output_spec_tensor_meta(
+                op_schema.op, output_sharding.output_spec, out_tensor_meta
+            )
+
+            return output_sharding
+        else:
+            raise NotImplementedError(
+                f"Operator {op_schema.op} does not have a sharding strategy registered."
+            )
+
+    def _select_strategy(self, strategy: OpStrategy) -> PlacementStrategy:
+        if len(strategy.strategies) == 1:
+            # short cut with only one possible strategy
+            return strategy.strategies[0]
+
+        strategy_costs: List[float] = []
+        for strtg in strategy.strategies:
+            assert (
+                strtg.redistribute_cost is not None
+            ), "must set redistribute cost each strategy!"
+            redistribute_cost = sum(chain.from_iterable(strtg.redistribute_cost))
+            strategy_costs.append(redistribute_cost)
+
+        # for eager execution, we just select the one with the minimal redistribute cost
+        return strategy.strategies[strategy_costs.index(min(strategy_costs))]
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tensor/tp_conv.py b/MLPY/Lib/site-packages/torch/distributed/_tensor/tp_conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..230b1f2c0974e5f04710c996b7c6a3d6f14a85d7
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tensor/tp_conv.py
@@ -0,0 +1,277 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+# implement convolution related ops for distributed tensor
+from typing import cast, Dict, List, Tuple
+
+import torch
+import torch.distributed as dist
+import torch.distributed._tensor.api as dtensor
+
+aten = torch.ops.aten
+
+
+def _requires_data_exchange(padding):
+    # TODO: whether data exchange is required is currently determined by padding
+    return padding[1] != 0
+
+
+def _is_supported(input_size, kernel_size, stride, padding, dilation):
+    if dilation[1] != 1:
+        raise RuntimeError("Dilation must be 1 for tensor parallel convolution.")
+    if padding[1] != 0:
+        if stride[1] != 1:
+            raise RuntimeError(
+                "Stride must be 1 when there is padding for tensor parallel convolution."
+            )
+        if kernel_size[3] // 2 > input_size[3]:
+            raise RuntimeError(
+                "kernel_size[3] // 2 should be less than or equal to input_size[3] for tensor parallel convolution."
+            )
+    else:
+        if not (input_size[3] % stride[1] == 0 and stride[1] == kernel_size[3]):
+            raise RuntimeError(
+                "It requires that input_size[3] is divisible by stride[1] and stride[1] equals kernel_size[3] "
+                "when there is padding for tensor parallel convolution."
+            )
+    return True
+
+
+def _ring_send_recv_construct(in_tensor, d1, d2, left, right, rank, size):
+    # dist comms and reconstruct local input tensor
+    send_to_right = in_tensor[:, :, :, -d1:].contiguous()
+    send_to_left = in_tensor[:, :, :, :d2].contiguous()
+    recv_from_right = torch.zeros_like(send_to_left)
+    recv_from_left = torch.zeros_like(send_to_right)
+
+    send_op_right = dist.P2POp(dist.isend, send_to_right, right)
+    send_op_left = dist.P2POp(dist.isend, send_to_left, left)
+    recv_op_right = dist.P2POp(dist.irecv, recv_from_right, right)
+    recv_op_left = dist.P2POp(dist.irecv, recv_from_left, left)
+
+    reqs = dist.batch_isend_irecv(
+        [send_op_right, send_op_left, recv_op_left, recv_op_right]
+    )
+    for req in reqs:
+        req.wait()
+
+    if rank == 0:
+        in_tensor = torch.cat([in_tensor, recv_from_right], dim=-1)
+    elif rank == size - 1:
+        in_tensor = torch.cat([recv_from_left, in_tensor], dim=-1)
+    else:
+        in_tensor = torch.cat([recv_from_left, in_tensor, recv_from_right], dim=-1)
+
+    return in_tensor
+
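+# Halo-size example (illustrative): for a kernel of width k, tp_convolution splits
+# the overlap as
+#
+#   d = k - 1
+#   d1, d2 = d // 2, d - d1   # d1 columns go to the right neighbor, d2 to the left
+#
+# e.g. k = 3 gives d1 = d2 = 1, so interior ranks receive one extra column on each
+# side before running the local convolution.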
+
+def _ring_send_recv_aggregate(grad_in_tensor, d1, d2, left, right, rank, size):
+    # dist comms and aggregate gradients for edge pixels
+    send_to_right = grad_in_tensor[:, :, :, -d2:].contiguous()
+    send_to_left = grad_in_tensor[:, :, :, :d1].contiguous()
+    recv_from_right = torch.zeros_like(send_to_left)
+    recv_from_left = torch.zeros_like(send_to_right)
+
+    send_op_right = dist.P2POp(dist.isend, send_to_right, right)
+    send_op_left = dist.P2POp(dist.isend, send_to_left, left)
+    recv_op_right = dist.P2POp(dist.irecv, recv_from_right, right)
+    recv_op_left = dist.P2POp(dist.irecv, recv_from_left, left)
+
+    reqs = dist.batch_isend_irecv(
+        [send_op_right, send_op_left, recv_op_left, recv_op_right]
+    )
+    for req in reqs:
+        req.wait()
+
+    if rank == 0:
+        grad_in_tensor = grad_in_tensor[:, :, :, :-d2]
+        grad_in_tensor[:, :, :, -d1:] = torch.add(
+            grad_in_tensor[:, :, :, -d1:], recv_from_right
+        )
+    elif rank == size - 1:
+        grad_in_tensor = grad_in_tensor[:, :, :, d1:]
+        grad_in_tensor[:, :, :, :d2] = torch.add(
+            grad_in_tensor[:, :, :, :d2], recv_from_left
+        )
+    else:
+        grad_in_tensor = grad_in_tensor[:, :, :, d1:-d2]
+        grad_in_tensor[:, :, :, -d1:] = torch.add(
+            grad_in_tensor[:, :, :, -d1:], recv_from_right
+        )
+        grad_in_tensor[:, :, :, :d2] = torch.add(
+            grad_in_tensor[:, :, :, :d2], recv_from_left
+        )
+
+
+def tp_convolution(
+    op_call: torch._ops.OpOverload,
+    local_tensor_args: Tuple[object, ...],
+    local_tensor_kwargs: Dict[str, object],
+) -> object:
+    assert op_call == aten.convolution.default
+    assert len(local_tensor_args) == 9
+
+    rank = dist.get_rank()
+    size = dist.get_world_size()
+    in_tensor = cast(torch.Tensor, local_tensor_args[0])
+    weight = cast(torch.Tensor, local_tensor_args[1])
+    stride, padding, dilation = local_tensor_args[3:6]
+
+    assert _is_supported(in_tensor.shape, weight.shape, stride, padding, dilation)
+    assert isinstance(padding, List)
+
+    if not _requires_data_exchange(padding):
+        local_results = op_call(*local_tensor_args, **local_tensor_kwargs)
+        return local_results
+    else:
+        # step 0 compute the overlap pixels of the input tensor
+        d = weight.shape[3] - 1
+        d1 = d // 2
+        d2 = d - d1
+        assert d1 + d2 == d
+        right = (rank + 1) % size
+        left = (rank - 1 + size) % size
+
+        # step1 reconstruct local input tensor
+        in_tensor = _ring_send_recv_construct(
+            in_tensor, d1, d2, left, right, rank, size
+        )
+
+        # step2 feed local input tensor to op_call
+        local_tensor_args_list = list(local_tensor_args)
+        local_tensor_args_list[0] = in_tensor
+        local_tensor_args = cast(Tuple[object, ...], local_tensor_args_list)
+        local_results = op_call(*local_tensor_args, **local_tensor_kwargs)
+
+        # step3 remove extra outputs from the results
+        padding_w = padding[1]
+        w = local_results.size(3)
+        if rank == 0:
+            local_results = local_results[:, :, :, : w - padding_w]
+        elif rank == size - 1:
+            local_results = local_results[:, :, :, padding_w:]
+        else:
+            local_results = local_results[:, :, :, padding_w : w - padding_w]
+
+        return local_results
+
+
+def tp_convolution_backward(
+    op_call: torch._ops.OpOverload,
+    local_tensor_args: Tuple[object, ...],
+    local_tensor_kwargs: Dict[str, object],
+) -> object:
+    assert op_call == aten.convolution_backward.default
+    assert len(local_tensor_args) == 11
+
+    rank = dist.get_rank()
+    size = dist.get_world_size()
+    grad_out_tensor = cast(torch.Tensor, local_tensor_args[0])
+    in_tensor = cast(torch.Tensor, local_tensor_args[1])
+    weight = cast(torch.Tensor, local_tensor_args[2])
+    stride, padding, dilation = local_tensor_args[4:7]
+
+    assert _is_supported(in_tensor.shape, weight.shape, stride, padding, dilation)
+    assert isinstance(padding, List)
+
+    if not _requires_data_exchange(padding):
+        local_results = op_call(*local_tensor_args, **local_tensor_kwargs)
+        return local_results
+    else:
+        # step 0 compute the overlap pixels of the input tensor
+        d = weight.shape[3] - 1
+        d1 = d // 2
+        d2 = d - d1
+        assert d1 + d2 == d
+        right = (rank + 1) % size
+        left = (rank - 1 + size) % size
+
+        # step1 reconstruct local input tensor
+        in_tensor = _ring_send_recv_construct(
+            in_tensor, d1, d2, left, right, rank, size
+        )
+
+        # step2 reconstruct local gradient output tensor
+        N, C_out, H_out, _ = grad_out_tensor.shape
+        padding_w = padding[1]
+        if rank == 0:
+            grad_out_tensor = torch.nn.functional.pad(
+                grad_out_tensor, (0, padding_w), "constant", 0
+            )
+        elif rank == size - 1:
+            grad_out_tensor = torch.nn.functional.pad(
+                grad_out_tensor, (padding_w, 0), "constant", 0
+            )
+        else:
+            grad_out_tensor = torch.nn.functional.pad(
+                grad_out_tensor, (padding_w, padding_w), "constant", 0
+            )
+
+        # step 3 feed local input tensor to op_call
+        local_tensor_args_list = list(local_tensor_args)
+        local_tensor_args_list[0] = grad_out_tensor
+        local_tensor_args_list[1] = in_tensor
+        local_tensor_args = cast(Tuple[object, ...], local_tensor_args_list)
+        local_results = op_call(*local_tensor_args, **local_tensor_kwargs)
+
+        # step 4 aggregate gradients for edge pixels
+        grad_in_tensor = local_results[0]
+        grad_in_tensor = _ring_send_recv_aggregate(
+            grad_in_tensor, d1, d2, left, right, rank, size
+        )
+
+        local_results = list(local_results)
+        local_results[0] = grad_in_tensor
+        local_results = cast(Tuple[object, ...], local_results)
+
+        return local_results
+
+
+def convolution_handler(
+    op_call: torch._ops.OpOverload,
+    args: Tuple[object, ...],
+    kwargs: Dict[str, object],
+) -> object:
+    # extract local tensor and sharding info into an OpInfo
+    op_info = dtensor.DTensor._op_dispatcher.unwrap_to_op_info(op_call, args, kwargs)
+
+    # sharding propagation
+    dtensor.DTensor._op_dispatcher.sharding_propagator.propagate(op_info)
+    output_sharding = op_info.output_sharding
+    assert output_sharding is not None, "output sharding should not be None"
+
+    # local propagation
+    local_results = tp_convolution(
+        op_call, tuple(op_info.local_args), op_info.local_kwargs
+    )
+
+    return dtensor.DTensor._op_dispatcher.wrap(
+        local_results, output_sharding.output_spec
+    )
+
+
+def convolution_backward_handler(
+    op_call: torch._ops.OpOverload,
+    args: Tuple[object, ...],
+    kwargs: Dict[str, object],
+) -> object:
+    # Redistribute grad_output tensor to the same placement as input tensor
+    args = list(args)
+    assert isinstance(args[0], dtensor.DTensor) and isinstance(args[1], dtensor.DTensor)
+    args[0] = args[0].redistribute(args[1].device_mesh, args[1].placements)
+    args = tuple(args)
+
+    # extract local tensor and sharding info into an OpInfo
+    op_info = dtensor.DTensor._op_dispatcher.unwrap_to_op_info(op_call, args, kwargs)
+
+    # sharding propagation
+    dtensor.DTensor._op_dispatcher.sharding_propagator.propagate(op_info)
+    output_sharding = op_info.output_sharding
+    assert output_sharding is not None, "output sharding should not be None"
+
+    # local propagation
+    local_results = tp_convolution_backward(
+        op_call, tuple(op_info.local_args), op_info.local_kwargs
+    )
+
+    return dtensor.DTensor._op_dispatcher.wrap(
+        local_results, output_sharding.output_spec
+    )
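+
+
+# Minimal sketch (an assumption, not part of this file): the two handlers above
+# are intended to be looked up by DTensor's op dispatcher for the convolution
+# ops before the default sharding path, e.g. via a table along the lines of
+#
+#     _CONV_HANDLERS = {
+#         aten.convolution.default: convolution_handler,
+#         aten.convolution_backward.default: convolution_backward_handler,
+#     }
+#
+# The table name and the exact registration mechanism are hypothetical here.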
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tools/__init__.py b/MLPY/Lib/site-packages/torch/distributed/_tools/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d1cf0c563abd9886a50a6ea78d257d4a4d26b03
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tools/__init__.py
@@ -0,0 +1 @@
+from .memory_tracker import MemoryTracker
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tools/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tools/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b0569f337dbfbe171e2de064b919ace6aa8a2354
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tools/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tools/__pycache__/memory_tracker.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/_tools/__pycache__/memory_tracker.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..491dea1a476294087acadff886be05872fc72f1c
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/_tools/__pycache__/memory_tracker.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/_tools/memory_tracker.py b/MLPY/Lib/site-packages/torch/distributed/_tools/memory_tracker.py
new file mode 100644
index 0000000000000000000000000000000000000000..86d736703e581fb182955acc86b75b838e0dbb8a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/_tools/memory_tracker.py
@@ -0,0 +1,299 @@
+from collections import defaultdict
+
+from itertools import chain
+
+import pickle
+
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    List,
+    no_type_check,
+    Sequence,
+)
+
+import torch
+import torch.nn as nn
+from torch.utils.hooks import RemovableHandle
+from torch.utils._python_dispatch import TorchDispatchMode
+
+
+BYTES_PER_MB = 1024 * 1024.0
+
+
+class MemoryProfileDispatchMode(TorchDispatchMode):
+    """Run in ``TorchDispatchMode`` to get memory stats at operator level."""
+
+    def __init__(self, memory_tracker) -> None:
+        self.memory_tracker = memory_tracker
+
+    def __torch_dispatch__(self, func, types, args=..., kwargs=None):
+        rs = func(*args, **kwargs)
+        if func == torch.ops.aten.detach.default:
+            return rs
+        func_name: str = (
+            self.memory_tracker._cur_module_name
+            + "."
+            + func.__name__
+            + "_"
+            + str(self.memory_tracker._operator_names[func.__name__])
+        )
+        self.memory_tracker._operator_names[func.__name__] = (
+            self.memory_tracker._operator_names[func.__name__] + 1
+        )
+        self.memory_tracker._record_memory_stats(func_name)
+
+        return rs
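+
+    # Descriptive note (added): each recorded key has the form
+    # "<module name>.forward.<op name>_<call index>" (or ".backward." once the
+    # backward prefix is set), e.g. something like "layer1.forward.addmm_0".
+    # The exact op string depends on ``func.__name__`` and is only illustrative.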
+
+
+class MemoryTracker:
+    """
+    Collect and plot the memory stats at operator level.
+
+    Includes ``memories_allocated``, ``memories_active`` and ``memories_reserved``.
+    It also prints a summary for the top 20 operators that generate the most memory.
+
+    Example usage:
+
+        >>> # xdoctest: +SKIP(failing)
+        >>> net.cuda()
+        >>> input = input.cuda()
+
+        >>> mem_tracker = MemoryTracker()
+        >>> mem_tracker.start_monitor(net)
+
+        >>> net.zero_grad(True)
+        >>> loss = net(input)
+        >>> if isinstance(loss, dict):
+        >>>    loss = loss['out']
+        >>> loss.sum().backward()
+        >>> net.zero_grad(set_to_none=True)
+
+        >>> mem_tracker.stop()
+        >>> mem_tracker.summary()
+        >>> mem_tracker.show_traces()
+    """
+
+    def __init__(self) -> None:
+        torch._C._log_api_usage_once("torch.distributed.memory_tracker")
+        self._hooks: List[RemovableHandle] = []
+        self._operator_names: Dict[str, int] = defaultdict(int)
+        self.memories_allocated: Dict[int, Dict[str, float]] = defaultdict()
+        self.memories_active: Dict[int, Dict[str, float]] = defaultdict()
+        self.memories_reserved: Dict[int, Dict[str, float]] = defaultdict()
+        self._markers: Dict[str, int] = defaultdict(int)
+        self._cur_module_name: str = ""
+        self._op_index: int = 0
+        self._num_cuda_retries: int = 0
+
+    @no_type_check
+    def start_monitor(self, root_module: nn.Module) -> None:
+        """
+        Register module hooks and enter ``MemoryProfileDispatchMode``.
+
+        This enables operator-level memory stats to be tracked during module runtime.
+        """
+        self._clear_state()
+        root_module.__setattr__("_memory_tracker_is_root", True)
+        for name, m in root_module.named_modules():
+            if m is not root_module:
+                m.__setattr__("_memory_tracker_is_root", False)
+            # fused_proxy_group does not support hooks
+            if ".fused_proxy_grouped_embedding_bag" in name:
+                continue
+            # hook ordering with other hooks added by users is not managed, so
+            # the memory stats tracked here may not be completely accurate.
+            h1 = m.register_forward_pre_hook(self._create_pre_forward_hook(name))
+            h2 = m.register_forward_hook(self._create_post_forward_hook(name))
+            # the backward hook does not work well with jagged tensors and the root cause
+            # is not clear; remove it for now as it does not really capture important info.
+            # h3 = m.register_backward_hook(self._create_backward_hook(name))
+            self._hooks.extend([h1, h2])
+        torch.cuda.empty_cache()
+        assert getattr(self, "profile_mode", None) is None
+        self.profile_mode = MemoryProfileDispatchMode(self)
+        self.profile_mode.__enter__()
+
+    @no_type_check
+    def stop(self) -> None:
+        """
+        Remove module hooks and exit ``MemoryProfileDispatchMode`` to stop tracking memory stats at operator level.
+
+        Also collect aggregated stats from the tracking session, such as the CUDA ``num_alloc_retries``.
+        """
+        self._num_cuda_retries = torch.cuda.memory_stats().get("num_alloc_retries", 0)
+
+        for h in self._hooks:
+            h.remove()
+        self._hooks.clear()
+        assert getattr(self, "profile_mode", None) is not None
+        self.profile_mode.__exit__(None, None, None)
+        self.profile_mode = None
+
+    @no_type_check
+    def summary(self, top: int = 20) -> None:
+        """
+        Print out the top operators that generate the most memory.
+
+        The number of the top operators can be configured.
+        """
+        op_diff: Dict[str, float] = defaultdict(float)
+        op_name, previous_allocated_memory = self.memories_allocated[0]
+        for i in range(1, self._op_index):
+            op_name, current_allocated_memory = self.memories_allocated[i]
+            op_diff[op_name] = current_allocated_memory - previous_allocated_memory
+            previous_allocated_memory = current_allocated_memory
+
+        print("------------------------------------------------")
+        print(f"The number of cuda retries are: {self._num_cuda_retries}")
+        print(f"Top {top} ops that generates memory are:")
+        for k, v in sorted(op_diff.items(), key=lambda item: item[1], reverse=True)[
+            :top
+        ]:
+            print(f"{k}: {v}MB")
+        print("------------------------------------------------")
+
+    @no_type_check
+    def show_traces(self, path: str = "") -> None:
+        import matplotlib.pyplot as plt
+
+        def _plot_figure(x, y_values, labels):
+            min_val = min(list(chain(*y_values))) * 0.999
+            max_val = max(list(chain(*y_values))) * 1.001
+            plt.figure()
+            for y, label in zip(y_values, labels):
+                plt.plot(x, y, label=label)
+            plt.xlabel("# Operator Calls")
+            plt.ylabel("Memory (MB)")
+            plt.legend()
+            for marker_name, marker in self._markers.items():
+                if marker_name == "fw_bw_boundary":
+                    plt.plot(
+                        [marker, marker],
+                        [min_val, max_val],
+                        "r",
+                        lw=2,
+                        label=marker_name,
+                    )
+                else:
+                    plt.plot(
+                        [marker, marker],
+                        [min_val, max_val],
+                        "k-",
+                        lw=2,
+                        label=marker_name,
+                    )
+
+        if path != "":
+            self.load(path)
+
+        y_1 = [mb for (name, mb) in self.memories_allocated.values()]
+        y_2 = [mb for (name, mb) in self.memories_active.values()]
+        y_3 = [mb for (name, mb) in self.memories_reserved.values()]
+        x = list(range(len(y_1)))
+        # Split figures when there is a big difference between
+        # "reserved_memory" and "allocated_memory" or "active_memory".
+        _plot_figure(
+            x,
+            [list(y_1), list(y_2), list(y_3)],
+            ["allocated_memory", "active_memory", "reserved_memory"],
+        )
+        _plot_figure(x, [list(y_1)], ["allocated_memory"])
+        _plot_figure(x, [list(y_2)], ["active_memory"])
+        _plot_figure(x, [list(y_3)], ["reserved_memory"])
+
+    def save_stats(self, path: str) -> None:
+        """Save the stats using pickle during runtime if users want to plot the traces in other places like notebook."""
+        stats = {
+            "memories_allocated": self.memories_allocated,
+            "memories_active": self.memories_active,
+            "memories_reserved": self.memories_reserved,
+            "markers": self._markers,
+            "num_alloc_retries": self._num_cuda_retries,
+        }
+
+        with open(path, "wb") as f:
+            pickle.dump(stats, f, pickle.HIGHEST_PROTOCOL)
+
+    def load(self, path: str) -> None:
+        """Load the pickled memory stats to plot the traces or print the summary."""
+        with open(path, "rb") as f:
+            stats = pickle.load(f)
+
+        self.memories_allocated = stats["memories_allocated"]
+        self.memories_active = stats["memories_active"]
+        self.memories_reserved = stats["memories_reserved"]
+        self._markers = stats["markers"]
+        self._num_cuda_retries = stats["num_alloc_retries"]
+
+    def _create_pre_forward_hook(self, name: str) -> Callable:
+        """Prefix operator name with current module and 'forward', and insert 'fw_start' marker at forward pass start."""
+        def _pre_forward_hook(module: nn.Module, inputs: Any) -> None:
+            self._cur_module_name = f"{name}.forward"
+            if (
+                hasattr(module, "_memory_tracker_is_root")
+                and module._memory_tracker_is_root
+            ):
+                self._add_marker("fw_start")
+
+        return _pre_forward_hook
+
+    def _create_post_forward_hook(self, name: str) -> Callable:
+        """Insert the marker 'fw_bw_boundary' at the boundary of forward and backward pass."""
+
+        def _post_forward_hook(
+            module: nn.Module,
+            inputs: Sequence[torch.Tensor],
+            outputs: Sequence[torch.Tensor],
+        ) -> None:
+            if (
+                hasattr(module, "_memory_tracker_is_root")
+                and module._memory_tracker_is_root
+            ):
+                self._add_marker("fw_bw_boundary")
+
+        return _post_forward_hook
+
+    def _create_backward_hook(self, name: str) -> Callable:
+        """Insert the current module name with backward prefix for the operator name."""
+
+        def _backward_hook(
+            module: nn.Module, grad_input: torch.Tensor, grad_output: torch.Tensor
+        ) -> None:
+            self._cur_module_name = f"{name}.backward"
+
+        return _backward_hook
+
+    @no_type_check
+    def _record_memory_stats(self, fn_name: str) -> None:
+        """
+        Record current memory allocated, current memory active and current memory reserved.
+
+        The memory stats dict is indexed with ``self._op_index``.
+        """
+        memory_allocated: float = torch.cuda.memory_allocated() / BYTES_PER_MB
+        memory_reserved: float = torch.cuda.memory_reserved() / BYTES_PER_MB
+        memory_active: float = (
+            torch.cuda.memory_stats().get("active_bytes.all.current", 0) / BYTES_PER_MB
+        )
+        self.memories_allocated[self._op_index] = (fn_name, memory_allocated)
+        self.memories_reserved[self._op_index] = (fn_name, memory_reserved)
+        self.memories_active[self._op_index] = (fn_name, memory_active)
+        self._op_index += 1
+
+    def _add_marker(self, marker_name: str) -> None:
+        """Set the marker's x-axis value."""
+        marker_val = len(self.memories_allocated.values())
+        self._markers[marker_name] = marker_val
+
+    def _clear_state(self) -> None:
+        """Clear states when start_monitor() is called."""
+        self._operator_names.clear()
+        self.memories_allocated.clear()
+        self.memories_active.clear()
+        self.memories_reserved.clear()
+        self._markers.clear()
+        self._cur_module_name = ""
+        self._op_index = 0
+        self._num_cuda_retries = 0
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/__init__.py b/MLPY/Lib/site-packages/torch/distributed/algorithms/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2322e7967783425573d7a1e887cb2249d76a095
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/algorithms/__init__.py
@@ -0,0 +1,3 @@
+from .join import Join
+from .join import Joinable
+from .join import JoinHook
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/algorithms/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..72ac83944d2f698e1692540d231f254d4754eb2e
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/algorithms/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/__pycache__/join.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/algorithms/__pycache__/join.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fb8199f162ae4bcaa57dac3b950129894cec244e
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/algorithms/__pycache__/join.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/_checkpoint/__init__.py b/MLPY/Lib/site-packages/torch/distributed/algorithms/_checkpoint/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/_checkpoint/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/algorithms/_checkpoint/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2447d96f6aeedc2b58c3026efb27368865fb6157
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/algorithms/_checkpoint/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/_checkpoint/__pycache__/checkpoint_wrapper.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/algorithms/_checkpoint/__pycache__/checkpoint_wrapper.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bff6259c23cdb42546a94ff812a9ec9a98a19518
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/algorithms/_checkpoint/__pycache__/checkpoint_wrapper.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py b/MLPY/Lib/site-packages/torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..32ccaf64b3162bccb88e095f01b1f099ca0b2a0c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py
@@ -0,0 +1,314 @@
+import warnings
+from enum import auto, Enum
+from functools import partial
+from typing import Any, Callable, Dict, Iterator, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from torch.autograd.graph import save_on_cpu
+from torch.distributed.utils import _pack_kwargs, _replace_by_prefix, _unpack_kwargs
+from torch.utils.checkpoint import checkpoint as torch_utils_checkpoint
+
+_CHECKPOINT_WRAPPED_MODULE = "_checkpoint_wrapped_module"
+_CHECKPOINT_PREFIX = _CHECKPOINT_WRAPPED_MODULE + "."
+
+
+class CheckpointImpl(Enum):
+    REENTRANT = auto()
+    NO_REENTRANT = auto()
+
+
+class ActivationWrapper(torch.nn.Module):
+    """
+    Base class for Activation Checkpoint and Activation Offload.
+
+    Not meant to be instantiated directly.
+    """
+
+    def __init__(self, mod):
+        super().__init__()
+        self._checkpoint_wrapped_module = mod
+        # state_dict post hook to remove prefix to allow loading into a
+        # non-checkpoint wrapped module.
+        self._register_state_dict_hook(self._post_state_dict_hook)
+        # load_state_dict pre-hook to allow loading back into
+        # checkpoint-wrapped module.
+        self._register_load_state_dict_pre_hook(
+            self._pre_load_state_dict_hook, with_module=True
+        )
+
+    def forward(self, *args, **kwargs):
+        raise ValueError("Subclasses should implement forward().")
+
+    def __getattr__(self, name: str) -> Any:
+        """Forward missing attributes to wrapped module."""
+        try:
+            return super().__getattr__(name)  # defer to nn.Module's logic
+        except AttributeError:
+            return getattr(self._checkpoint_wrapped_module, name)
+
+    def __getitem__(self, key: int) -> Any:
+        """Forward indexing calls in case the module is a nn.Sequential."""
+        return self._checkpoint_wrapped_module.__getitem__(key)  # type: ignore[operator]
+
+    def named_parameters(
+        self,
+        *args,
+        **kwargs,
+    ) -> Iterator[Tuple[str, torch.nn.Parameter]]:
+        """
+        Override :meth:`named_parameters()` to intercept parameter names.
+
+        Remove all occurrences of ``_CHECKPOINT_PREFIX`` from the returned names.
+        """
+        for param_name, param in super().named_parameters(*args, **kwargs):
+            yield param_name.replace(_CHECKPOINT_PREFIX, ""), param
+
+    @staticmethod
+    def _post_state_dict_hook(
+        module: nn.Module,
+        state_dict: Dict[str, Any],
+        prefix: str,
+        *args: Any,
+    ) -> Dict[str, Any]:
+        """
+        ``_post_state_dict_hook()`` is called after ``state_dict()`` of this module has executed.
+
+        For ``checkpoint_wrapper``, it strips the checkpoint-wrapped module prefix
+        so that this module can be loaded into non-checkpointed modules.
+        The state dict can still be loaded into checkpoint-wrapped modules because
+        this class adds the prefix back before loading the state_dict.
+        """
+        _replace_by_prefix(state_dict, f"{prefix}{_CHECKPOINT_PREFIX}", prefix)
+        return state_dict
+
+    @staticmethod
+    def _pre_load_state_dict_hook(
+        module: nn.Module,
+        state_dict: Dict[str, Any],
+        prefix: str,
+        *args: Any,
+    ) -> None:
+        """
+        ``_pre_load_state_dict_hook()`` is called before ``self._load_from_state_dict()`` is called.
+
+        For ``checkpoint_wrapper``, it will add back the module
+        prefix so that non-checkpointed modules can be loaded into
+        checkpoint_wrapper modules properly.
+        """
+        _replace_by_prefix(state_dict, prefix, prefix + f"{_CHECKPOINT_PREFIX}")
+
+
+class OffloadWrapper(ActivationWrapper):
+    def __init__(self, mod):
+        super().__init__(mod)
+
+    def forward(self, *args, **kwargs):
+        with save_on_cpu(pin_memory=True):
+            return self._checkpoint_wrapped_module(*args, **kwargs)
+
+
+class CheckpointWrapper(ActivationWrapper):
+    """
+    An ``nn.Module`` that wraps another ``nn.Module`` with checkpointing.
+
+    Note that this module is not meant to be used directly but instead,
+    it is to be used through the ``checkpoint_wrapper`` function.
+    """
+
+    def __init__(
+        self,
+        mod: torch.nn.Module,
+        checkpoint_impl: CheckpointImpl = CheckpointImpl.NO_REENTRANT,
+        checkpoint_fn=None,
+        **checkpoint_fn_kwargs,
+    ):
+        super().__init__(mod)
+        self.checkpoint_impl = checkpoint_impl
+        if checkpoint_fn is None:
+            # use torch.utils.checkpoint
+            self.checkpoint_fn = partial(
+                torch_utils_checkpoint,
+                use_reentrant=(self.checkpoint_impl == CheckpointImpl.REENTRANT),
+                **checkpoint_fn_kwargs,
+            )
+        else:
+            # Construct user-specified checkpoint function.
+            self.checkpoint_fn = partial(
+                checkpoint_fn,
+                **checkpoint_fn_kwargs,
+            )
+
+    def forward(self, *args, **kwargs):
+        # Support keyword arguments for reentrant checkpoint. Note that this
+        # only works if the user specified ``CheckpointImpl.REENTRANT`` and is
+        # not using a custom checkpoint_fn.
+        if self.checkpoint_impl == CheckpointImpl.REENTRANT and kwargs != {}:
+            # Pack the args and kwargs
+            flat_args, kwarg_keys = _pack_kwargs(*args, **kwargs)
+
+            # Function that only takes (packed) args, but can unpack them
+            # into the original args and kwargs for the checkpointed
+            # function, and runs that function.
+            def my_function(*inputs):
+                # unpack back into args and kwargs
+                unpacked_args, unpacked_kwargs = _unpack_kwargs(inputs, kwarg_keys)
+                # run original module
+                return self._checkpoint_wrapped_module(
+                    *unpacked_args, **unpacked_kwargs
+                )
+
+            # Pass the function that only takes packed args into reentrant
+            # checkpoint API.
+            return self.checkpoint_fn(  # type: ignore[misc]
+                my_function,
+                *flat_args,
+            )
+        else:
+            return self.checkpoint_fn(  # type: ignore[misc]
+                self._checkpoint_wrapped_module, *args, **kwargs
+            )
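+
+    # Illustrative comment (added): conceptually the packing above round-trips as
+    #
+    #     flat_args, keys = _pack_kwargs(1, 2, a=3)        # -> (1, 2, 3), ("a",)
+    #     args, kwargs = _unpack_kwargs(flat_args, keys)   # -> (1, 2), {"a": 3}
+    #
+    # The exact return shapes shown here are an assumption based on how the
+    # helpers are used in ``forward`` above.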
+
+
+def offload_wrapper(module: torch.nn.Module) -> torch.nn.Module:
+    """
+    Wrap a module for activation offloading to CPU.
+
+    Offloads intermediate activations to the CPU for modules wrapped with this function.
+    Wrappers with activation offload can be composed with ones that do recomputation-based
+    checkpoint to trade off increased compute versus increased CPU
+    memory usage and additional H2D transfers.
+
+    Usage::
+        offloaded_module = offload_wrapper(module)
+        outputs = offloaded_module(inputs)
+    Args:
+        module (nn.Module):
+            The module to be wrapped
+    Returns:
+        (nn.Module):
+            Wrapped module
+    """
+    return OffloadWrapper(module)
+
+
+def checkpoint_wrapper(
+    module: torch.nn.Module,
+    checkpoint_impl: CheckpointImpl = CheckpointImpl.NO_REENTRANT,
+    checkpoint_fn=None,
+    **checkpoint_fn_kwargs,
+) -> torch.nn.Module:
+    """
+    Wrap a module for activation checkpointing.
+
+    If the module is wrapped with this function, all subsequent calls to the module will
+    automatically perform checkpointing without the user having to explicitly call the ``checkpoint`` function.
+
+    Usage::
+        checkpointed_module = checkpoint_wrapper(module)
+        outputs = checkpointed_module(inputs)
+    Args:
+        module (nn.Module):
+            The module to be wrapped
+        checkpoint_impl (Optional[CheckpointImpl]):
+            The checkpointing implementation to use. Note that this will only
+            be passed into the ``torch.utils.checkpoint.checkpoint``
+            implementation, and is ignored if a custom ``checkpoint_fn`` is
+            specified. Note that for implementations using reentrant checkpoint
+            from ``torch.utils.checkpoint``, keyword arguments will only be
+            supported if ``checkpoint_impl`` is passed as ``CheckpointImpl.REENTRANT``.
+        checkpoint_fn (Optional[Callable]):
+            Functional checkpoint implementation to use. If this is specified,
+            it will be used over the default ``torch.utils.checkpoint.checkpoint``
+            implementation and the `checkpoint_impl` argument will be ignored.
+        **checkpoint_fn_kwargs: (Dict[str, Any]): Keyword arguments to pass into `checkpoint_fn`.
+
+    Returns:
+        (nn.Module):
+            Wrapped module
+    """
+
+    if checkpoint_impl == CheckpointImpl.REENTRANT:
+        warnings.warn(
+            f"Please specify {CheckpointImpl.NO_REENTRANT} as "
+            f"{CheckpointImpl.REENTRANT} will soon be removed as "
+            "the default and eventually deprecated.",
+            stacklevel=1,
+        )
+    return CheckpointWrapper(
+        module,
+        checkpoint_impl,
+        checkpoint_fn,
+        **checkpoint_fn_kwargs,
+    )
+
+
+def apply_activation_checkpointing(
+    model,
+    checkpoint_wrapper_fn=checkpoint_wrapper,
+    check_fn=lambda _: True,
+    auto_wrap_policy: Optional[Callable[[nn.Module, bool, int], bool]] = None,
+):
+    """
+    Apply :func:`checkpoint_wrapper` to modules within `model` based on a user-defined configuration.
+
+    For each module within `model`, the `check_fn` is used to decide
+    whether `module` should be wrapped with :func:`checkpoint_wrapper` or not.
+
+    Note::
+        This function modifies `model` in place and replaces appropriate layers with
+        their checkpoint-wrapped modules.
+    Note::
+        This function will not wrap the overall root module. If this is needed, please directly use
+        :func:`checkpoint_wrapper` or :func:`offload_wrapper`.
+    Usage::
+        model = nn.Sequential(
+            nn.Linear(10, 10), nn.Linear(10, 10), nn.Linear(10, 10)
+        )
+        check_fn = lambda l: isinstance(l, nn.Linear)
+        # checkpoint activations
+        apply_activation_checkpointing(model, checkpoint_wrapper_fn=checkpoint_wrapper, check_fn=check_fn)
+        # Or offload activations to CPU
+        apply_activation_checkpointing(model, checkpoint_wrapper_fn=offload_wrapper, check_fn=check_fn)
+    Args:
+        model (nn.Module):
+            The model whose submodules should be wrapped with activation checkpointing.
+        checkpoint_wrapper_fn (Optional[Callable[nn.Module]]):
+            A ``Callable`` which will wrap modules.
+        check_fn (Optional[Callable[nn.Module, nn.Module]]):
+            A lambda function which will be passed each child submodule of ``model`` and returns
+            ``True`` or ``False`` depending on whether the submodule should be wrapped.
+        auto_wrap_policy (Optional[Callable[[nn.Module, bool, int], bool]]): A policy to wrap model's
+            submodules with AC. Note that if this is specified, it takes precedence over ``check_fn``.
+    Returns: None (`model` is modified in place)
+    """
+    # TODO: Importing inside function to avoid circular import issue between FSDP and
+    # checkpoint_wrapper. This can be resolved once wrap() APIs are decoupled from FSDP code.
+    from torch.distributed.fsdp.wrap import _recursive_wrap, lambda_auto_wrap_policy, _Policy
+    from torch.distributed.fsdp._wrap_utils import _construct_wrap_fn, _post_order_apply
+
+    policy = (
+        auto_wrap_policy
+        if auto_wrap_policy is not None
+        else partial(lambda_auto_wrap_policy, lambda_fn=check_fn)
+    )
+    if not callable(policy):
+        if not isinstance(policy, _Policy):
+            raise ValueError(
+                f"Expected {policy} to be callable or be a pre-defined wrap policy"
+            )
+        target_module_to_kwargs = policy._run_policy(
+            model, ignored_modules=set(), root_kwargs={}
+        )
+        wrap_fn = _construct_wrap_fn(model, target_module_to_kwargs, checkpoint_wrapper_fn)
+        _post_order_apply(model, wrap_fn)
+        return
+
+    _recursive_wrap(
+        module=model,
+        auto_wrap_policy=policy,  # type: ignore[arg-type]
+        wrapper_cls=checkpoint_wrapper_fn,
+        ignored_modules=set(),
+        ignored_params=set(),
+        only_wrap_children=True,
+    )
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/_comm_hooks/__init__.py b/MLPY/Lib/site-packages/torch/distributed/algorithms/_comm_hooks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..990bfd9dcf09c9fc8029739f6c0191a614f87404
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/algorithms/_comm_hooks/__init__.py
@@ -0,0 +1,7 @@
+
+from . import default_hooks as default
+
+LOW_PRECISION_HOOKS = [
+    default.fp16_compress_hook,
+    default.bf16_compress_hook,
+]
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/_comm_hooks/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/algorithms/_comm_hooks/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ebba0369be6abbb629d2a66d62e2805782de79d2
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/algorithms/_comm_hooks/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/_comm_hooks/__pycache__/default_hooks.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/algorithms/_comm_hooks/__pycache__/default_hooks.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5b687f74b8acee7c1bb09313ee86bf9ad494bedb
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/algorithms/_comm_hooks/__pycache__/default_hooks.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/_comm_hooks/default_hooks.py b/MLPY/Lib/site-packages/torch/distributed/algorithms/_comm_hooks/default_hooks.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ebd5b4bd3e65b3e09300f171fe18a55d634ff09
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/algorithms/_comm_hooks/default_hooks.py
@@ -0,0 +1,165 @@
+import functools
+import torch
+import torch.distributed as dist
+from typing import Optional
+
+
+class DefaultState:
+    r"""
+    Stores state needed to perform the default communication algorithm within a communication hook.
+
+    Args:
+        process_group (ProcessGroup): The process group to be used.
+    """
+
+    __slots__ = [
+        "process_group",
+        "world_size",
+        "gradient_predivide_factor",
+        "gradient_postdivide_factor"
+    ]
+
+    def __init__(
+        self,
+        process_group: dist.ProcessGroup
+    ):
+        if process_group is None:
+            raise ValueError(f"Expected to pass in an explicit ProcessGroup to {self}.")
+        self.process_group = process_group
+        self.world_size = dist.get_world_size(process_group)
+        # Set two factors, `self.gradient_predivide_factor` and
+        # `self.gradient_postdivide_factor`, to avoid underflow and overflow.
+        self.gradient_predivide_factor = self._get_gradient_predivide_factor(
+            self.world_size
+        )
+        self.gradient_postdivide_factor = self.world_size / self.gradient_predivide_factor
+
+    @staticmethod
+    def _get_gradient_predivide_factor(world_size: int) -> float:
+        factor: int = 1
+        while world_size % factor == 0 and world_size / factor > factor:
+            factor *= 2
+        return float(factor)
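+
+    # Worked example (added comment): for power-of-two world sizes the loop above
+    # yields a predivide factor close to sqrt(world_size), e.g.
+    #   world_size=4  -> predivide 2, postdivide 2
+    #   world_size=8  -> predivide 4, postdivide 2
+    #   world_size=16 -> predivide 4, postdivide 4
+    # Splitting the overall 1/world_size averaging into two divisions keeps
+    # intermediate gradient values in range for low-precision communication.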
+
+class LowPrecisionState(DefaultState):
+    r"""
+    Stores state needed to perform gradient communication in a lower precision within a communication hook.
+
+    The communication hook will cast gradients back to the original
+    parameter precision specified by ``parameter_type`` (default: ``torch.float32``).
+    Builds on top of the :class:`DefaultState`.
+
+    Args:
+        parameter_type (torch.dtype): The precision of the model's parameters.
+            Required for a hook to cast gradients back to a parameter's precision.
+    """
+
+    __slots__ = [
+        "parameter_type",
+    ]
+
+    def __init__(
+        self,
+        process_group,
+        parameter_type=torch.float32,
+    ):
+        super().__init__(process_group)
+        self.parameter_type = parameter_type
+
+
+def _decompress(state: LowPrecisionState, grad: torch.Tensor):
+    """
+    Casts gradients back to full parameter precision so that further computation happens in full precision.
+    """
+    orig_grad_data = grad.data
+    grad.data = grad.data.to(state.parameter_type)
+    # Don't let this memory get reused until after the transfer.
+    orig_grad_data.record_stream(torch.cuda.current_stream())  # type: ignore[arg-type]
+
+def allreduce_hook(state: DefaultState, grad: torch.Tensor):
+    r"""
+    Implement the FSDP communication hook for the ``all_reduce`` algorithm and the necessary pre- and post-division of gradients.
+
+    Args:
+        state (DefaultState): State information, configures pre- and post-division factors.
+        grad (torch.Tensor): A gradient for the local batch that needs to be communicated across ranks.
+    """
+    # Average grad by pre-division factor. Together pre- and post-division factors
+    # lead to an overall averaging by world_size, required for consistency with PyTorch DDP.
+    # This is a two-step process to avoid potential underflow and overflow.
+    if state.gradient_predivide_factor > 1:
+        grad.div_(state.gradient_predivide_factor)
+    dist.all_reduce(grad, group=state.process_group)
+    # Average grad by post-division factor.
+    if state.gradient_postdivide_factor > 1:
+        grad.div_(state.gradient_postdivide_factor)
+
+def reduce_scatter_hook(state: DefaultState, grad: torch.Tensor, output: torch.Tensor):
+    r"""
+    Implement the FSDP communication hook for the ``reduce_scatter`` algorithm.
+
+    Used by sharded FSDP strategies, with the necessary pre- and post-division of gradients.
+
+    Args:
+        state (DefaultState): State information, configures pre- and post-division factors.
+        grad (torch.Tensor): An unsharded gradient for the local batch that needs to be
+        communicated across ranks.
+        output (torch.Tensor): Stores a single shard of the gradient after ``reduce_scatter``.
+    """
+    # Average grad by pre-division factor.
+    if state.gradient_predivide_factor > 1:
+        grad.div_(state.gradient_predivide_factor)
+    dist.reduce_scatter_tensor(
+        output, grad, group=state.process_group
+    )
+    # Average grad's shard by post-division factor.
+    if state.gradient_postdivide_factor > 1:
+        output.div_(state.gradient_postdivide_factor)
+
+def _low_precision_hook(prec: torch.dtype, state: LowPrecisionState, grad: torch.Tensor, output: torch.Tensor):
+    if grad.dtype != prec:
+        grad.data = grad.data.to(prec)
+    if output is not None:
+        if output.dtype != prec:
+            output.data = output.data.to(prec)
+        reduce_scatter_hook(state, grad, output)
+        _decompress(state, output)
+    else:
+        allreduce_hook(state, grad)
+        _decompress(state, grad)
+
+def fp16_compress_hook(state: LowPrecisionState, grad: torch.Tensor, output: Optional[torch.Tensor] = None):
+    r"""
+    Implement the FSDP communication hook for a simple gradient compression approach.
+    Casts ``grad`` to half-precision floating-point format (``torch.float16``).
+
+    It also averages gradients by ``world_size`` in two steps: first it pre-divides gradients by a
+    ``state.gradient_predivide_factor``, and after a communication step (``all_reduce`` or ``reduce_scatter``)
+    gradients are averaged by a ``state.gradient_postdivide_factor``.
+    Once post-division is done, compressed gradients are cast back to the parameters' precision.
+
+    Args:
+        state (LowPrecisionState): State information, configures pre- and post-division factors, parameters' precision.
+        grad (torch.Tensor): A gradient for the local batch that needs to be communicated across ranks in a lower precision.
+        output (torch.Tensor): Stores a single shard of the gradient after ``reduce_scatter``.
+    """
+    fp16_hook = functools.partial(_low_precision_hook, torch.float16)
+    return fp16_hook(state, grad, output)
+
+def bf16_compress_hook(state: LowPrecisionState, grad: torch.Tensor, output: Optional[torch.Tensor] = None):
+    r"""
+    Implement the FSDP communication hook for a simple gradient compression approach.
+    Casts ``grad`` to brain floating-point format (``torch.bfloat16``).
+
+    It also averages gradients by ``world_size`` in two steps: first it pre-divides gradients by a
+    ``state.gradient_predivide_factor``, and after a communication step (``all_reduce`` or ``reduce_scatter``)
+    gradients are averaged by a ``state.gradient_postdivide_factor``.
+    Once post-division is done, compressed gradients are cast back to the parameters' precision.
+
+    Args:
+        state (LowPrecisionState): State information, configures pre- and post-division factors, parameters' precision.
+        grad (torch.Tensor): A gradient for the local batch that needs to be communicated across ranks in a lower precision.
+        output (torch.Tensor): Stores a single shard of the gradient after ``reduce_scatter``.
+    """
+    bf16_hook = functools.partial(_low_precision_hook, torch.bfloat16)
+    return bf16_hook(state, grad, output)
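+
+
+# Usage sketch (added; illustrative only). Assuming an FSDP-wrapped model that
+# exposes ``register_comm_hook`` and an already-initialized process group:
+#
+#     state = LowPrecisionState(process_group=dist.group.WORLD)
+#     fsdp_model.register_comm_hook(state, fp16_compress_hook)
+#
+# ``fsdp_model`` is a placeholder name; whether a given FSDP version accepts
+# these hooks directly is an assumption here, not something this file defines.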
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/_optimizer_overlap/__init__.py b/MLPY/Lib/site-packages/torch/distributed/algorithms/_optimizer_overlap/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9460c12ce8abc076e6d22570e65a773039aa145f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/algorithms/_optimizer_overlap/__init__.py
@@ -0,0 +1 @@
+from .optimizer_overlap import _as_overlapped_optim
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/_optimizer_overlap/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/algorithms/_optimizer_overlap/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a4400d9acc55d75450755425a11a329bb1005ed1
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/algorithms/_optimizer_overlap/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/_optimizer_overlap/__pycache__/optimizer_overlap.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/algorithms/_optimizer_overlap/__pycache__/optimizer_overlap.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a1250da9aa6e7bc12384bbab9b8e1ffcb6fcc52a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/algorithms/_optimizer_overlap/__pycache__/optimizer_overlap.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/_optimizer_overlap/optimizer_overlap.py b/MLPY/Lib/site-packages/torch/distributed/algorithms/_optimizer_overlap/optimizer_overlap.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5d824897049d10df995e7b471bb8c077adfa434
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/algorithms/_optimizer_overlap/optimizer_overlap.py
@@ -0,0 +1,93 @@
+from abc import ABC, abstractmethod
+import inspect
+from typing import Dict, Type
+
+from torch.distributed.fsdp import FullyShardedDataParallel
+from torch.nn.parallel import DistributedDataParallel
+from torch.optim import Optimizer
+from torch.distributed.optim import as_functional_optim
+
+from torch.distributed.algorithms.ddp_comm_hooks.default_hooks import allreduce_hook
+
+from torch.distributed.algorithms.ddp_comm_hooks.optimizer_overlap_hooks import (
+    _OptimizerHookState,
+    _hook_then_optimizer
+)
+
+# Contains the mappings between the regular and overlapped optimizer types.
+_registered_overlapped_optims: Dict[Type, Type] = {}
+
+
+def register_overlapped(optim_cls):
+    def decorator(target_overlapped_optim_cls):
+        if target_overlapped_optim_cls in _registered_overlapped_optims:
+            raise ValueError(
+                f"{target_overlapped_optim_cls} already registered with optim_cls "
+                f"{_registered_overlapped_optims[optim_cls]} {optim_cls}, trying to"
+                f"re-register it for {optim_cls} is not supported."
+            )
+        _registered_overlapped_optims[optim_cls] = target_overlapped_optim_cls
+        return target_overlapped_optim_cls
+    return decorator
+
+
+class OverlappedOptimizer(ABC):
+    def __init__(self, optim_cls: Type) -> None:
+        """
+        Initialize the OverlappedOptimizer.
+
+        ``OverlappedOptimizer`` is a base class that child classes can implement to
+        specify how different optimizers will register themselves with DDP.
+        """
+        self.optim_cls = optim_cls
+
+    @abstractmethod
+    def register_ddp(self, ddp: DistributedDataParallel) -> None:
+        """Registers the overlapped optimizer with DDP."""
+        raise NotImplementedError(
+            f"{self.__class__.__name__} does not support overlapped DDP."
+        )
+
+    @abstractmethod
+    def register_fsdp(self, fsdp: FullyShardedDataParallel) -> None:
+        """Registers the overlapped optimizer with FSDP."""
+        raise NotImplementedError(
+            f"{self.__class__.__name__} does not support overlapped FSDP."
+        )
+
+
+@register_overlapped(Optimizer)
+class _OverlappedStandardOptimizer(OverlappedOptimizer):
+    """Overlaps a regular ``Optimizer``."""
+
+    def __init__(self, optim_cls: Type, params, *optim_args, **optim_kwargs) -> None:
+        super().__init__(optim_cls)
+        f_optim = as_functional_optim(self.optim_cls, *optim_args, **optim_kwargs)
+        self._opt_hook_state = _OptimizerHookState(f_optim, params)
+
+    def register_ddp(self, ddp_inst: DistributedDataParallel):
+        # NOTE: using a custom communication hook and fused optimizer is not
+        # yet supported.
+        ddp_inst.register_comm_hook(  # type: ignore[operator]
+            None,  # wrapped hook state
+            _hook_then_optimizer(allreduce_hook, self._opt_hook_state)
+        )
+
+    # TODO: register_fsdp once FSDP supports communication hook.
+    def register_fsdp(self, fsdp: FullyShardedDataParallel) -> None:
+        """Register the overlapped optimizer with FSDP."""
+        raise NotImplementedError(
+            f"{self.__class__.__name__} does not support overlapped FSDP."
+        )
+
+def _as_overlapped_optim(optim_cls: Type, params, *args, **kwargs):
+    """Return a new ``OverlappedOptimizer`` instance that supports ``optim_cls``."""
+    for clz in inspect.getmro(optim_cls):
+        try:
+            return _registered_overlapped_optims[clz](optim_cls, params, *args, **kwargs)
+        except KeyError:
+            pass
+
+    # Fall back to the standard overlapped optimizer, which will raise errors if
+    # the user attempts to use an unsupported optimizer.
+    return _OverlappedStandardOptimizer(optim_cls, params, *args, **kwargs)
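+
+
+# Usage sketch (added; illustrative only). Given a DDP-wrapped model named
+# ``ddp_model`` (a placeholder), an overlapped optimizer could be built and
+# attached roughly as:
+#
+#     overlapped = _as_overlapped_optim(torch.optim.SGD, ddp_model.parameters(), lr=0.01)
+#     overlapped.register_ddp(ddp_model)
+#
+# This relies on a functional counterpart of the optimizer being available via
+# ``as_functional_optim``; treat the exact call sequence as an assumption.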
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/_quantization/__init__.py b/MLPY/Lib/site-packages/torch/distributed/algorithms/_quantization/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/_quantization/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/algorithms/_quantization/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..407d34d949887c4c9a26c30519776db912243bc7
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/algorithms/_quantization/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/_quantization/__pycache__/quantization.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/algorithms/_quantization/__pycache__/quantization.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..88009dd7c40b2e87e6c5201b41f7d047c3264b3f
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/algorithms/_quantization/__pycache__/quantization.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/_quantization/quantization.py b/MLPY/Lib/site-packages/torch/distributed/algorithms/_quantization/quantization.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbb6862c46b9596069eec65a91b354046dc6d0ef
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/algorithms/_quantization/quantization.py
@@ -0,0 +1,144 @@
+import functools
+import torch
+import torch.distributed as dist
+
+
+from enum import Enum
+
+
+TORCH_HALF_MIN = torch.finfo(torch.float16).min
+TORCH_HALF_MAX = torch.finfo(torch.float16).max
+
+class DQuantType(Enum):
+    """
+    Different quantization methods for auto_quantize API are identified here.
+
+    auto_quantize API currently supports fp16 and bfp16 methods.
+    """
+    FP16 = "fp16",
+    BFP16 = "bfp16"
+
+    def __str__(self) -> str:
+        return self.value
+
+
+def _fp32_to_fp16_with_clamp(tensor: torch.Tensor) -> torch.Tensor:
+    return torch.clamp(tensor, TORCH_HALF_MIN, TORCH_HALF_MAX).half()
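+
+# Added note: torch.finfo(torch.float16).max is 65504, so the clamp above keeps
+# values within roughly +/-65504 before the .half() cast; without it, larger
+# magnitudes would overflow to +/-inf in float16.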
+
+def _quantize_tensor(tensor, qtype):
+    if not isinstance(tensor, torch.Tensor):
+        raise RuntimeError(
+            f"_quantize_tensor expecting torch.Tensor as input but found {type(tensor)}"
+        )
+    if qtype == DQuantType.FP16:
+        return _fp32_to_fp16_with_clamp(tensor)
+    elif qtype == DQuantType.BFP16:
+        return torch.ops.quantization._FloatToBfloat16Quantized(tensor)
+    else:
+        raise RuntimeError(
+            f'Quantization type {qtype} is not supported'
+        )
+
+def _quantize_tensor_list(tensor_list, qtype):
+    if not isinstance(tensor_list, list) or not all(
+        isinstance(p, torch.Tensor) for p in tensor_list
+    ):
+        raise RuntimeError(
+            f"_quantize_tensor_list expecting list of torch.Tensor as input but found {type(tensor_list)}"
+        )
+    quantized_tensor_list = [_quantize_tensor(t, qtype) for t in tensor_list]
+    return quantized_tensor_list
+
+def _dequantize_tensor(tensor, qtype, quant_loss=None):
+    if not isinstance(tensor, torch.Tensor):
+        raise RuntimeError(
+            f"_dequantize_tensor expecting torch.Tensor as input but found {type(tensor)}"
+        )
+    if qtype == DQuantType.FP16:
+        if tensor.dtype != torch.float16:
+            raise RuntimeError(
+                f"tensor dtype is {tensor.dtype} while expected to be FP16."
+            )
+        elif tensor.dtype == torch.float16 and quant_loss is None:
+            return tensor.float()
+        else:
+            return tensor.float() / quant_loss
+    elif qtype == DQuantType.BFP16:
+        if tensor.dtype != torch.float16:
+            raise RuntimeError(
+                f"tensor dtype is {tensor.dtype} while expected to be FP16."
+            )
+        else:
+            return torch.ops.quantization._Bfloat16QuantizedToFloat(tensor)
+    else:
+        raise RuntimeError(
+            f'Quantization type {qtype} is not supported'
+        )
+
+
+def _dequantize_tensor_list(tensor_list, qtype, quant_loss=None):
+    if not isinstance(tensor_list, list) or not all(
+        isinstance(p, torch.Tensor) for p in tensor_list
+    ):
+        raise RuntimeError(
+            f"_dequantize_tensor_list expecting list of torch.Tensor as input but found {type(tensor_list)}"
+        )
+    dequantized_tensor_list = [_dequantize_tensor(t, qtype) for t in tensor_list]
+    return dequantized_tensor_list
+
+
+def auto_quantize(func, qtype, quant_loss=None):
+    """
+    Quantize the input tensors, choose the precision type, pass through the other necessary arguments, and then dequantize the output.
+
+    Currently it only supports:
+        . FP16 and BFP16 quantization methods for the gloo and nccl backends
+        . all_gather, all_to_all, and all_to_all_single collective ops
+    Note: BFP16 only supports 2D tensors.
+    Args:
+        func (Callable): A function representing collective operations.
+        qtype (DQuantType): Quantization method.
+        quant_loss (float, optional): This can be used to improve accuracy in the dequantization.
+    Returns:
+        (Callable): the same collective as ``func`` but with automatic quantization/dequantization applied.
+    """
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        group = kwargs.get('group', None)
+        async_op = kwargs.get('async_op', False)
+        if async_op is True:
+            raise RuntimeError(
+                'The async_op=True mode is not supported yet.'
+            )
+        if func == dist.all_gather:
+            tensors = args[0]
+            input_tensors = _quantize_tensor(args[1], qtype)
+            out_tensors = _quantize_tensor_list(tensors, qtype)
+            dist.all_gather(out_tensors, input_tensors, group=group, async_op=async_op)
+            for i, t in enumerate(_dequantize_tensor_list(out_tensors, qtype, quant_loss=quant_loss)):
+                tensors[i] = t
+
+        elif func == dist.all_to_all:
+            tensors = args[0]
+            input_tensors = _quantize_tensor_list(args[1], qtype)
+            out_tensors = _quantize_tensor_list(tensors, qtype)
+            dist.all_to_all(out_tensors, input_tensors, group=group, async_op=async_op)
+            for i, t in enumerate(_dequantize_tensor_list(out_tensors, qtype, quant_loss=quant_loss)):
+                tensors[i] = t
+
+        elif func == dist.all_to_all_single:
+            tensors = args[0]
+            out_splits = kwargs.get('out_splits', None)
+            in_splits = kwargs.get('in_splits', None)
+            # Quantizing the input/output tensor
+            input_tensors = _quantize_tensor(args[1], qtype)
+            out_tensors = _quantize_tensor(tensors, qtype)
+            dist.all_to_all_single(out_tensors, input_tensors, out_splits, in_splits, group=group)
+            for i, t in enumerate(_dequantize_tensor(out_tensors, qtype, quant_loss=quant_loss)):
+                tensors[i] = t
+        else:
+            raise RuntimeError(
+                f"The collective op {func} is not supported yet"
+            )
+
+    return wrapper
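+
+
+# Usage sketch (added; illustrative only). Wrap a collective once and call the
+# wrapper with the same signature as the original op, assuming the process
+# group is already initialized and tensors are on the right device:
+#
+#     quantized_all_gather = auto_quantize(dist.all_gather, DQuantType.FP16)
+#     quantized_all_gather(gathered_list, local_tensor)
+#
+# ``gathered_list`` and ``local_tensor`` are placeholder names for the usual
+# ``dist.all_gather`` arguments.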
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/__init__.py b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc837ee0abcfeb481cc60d2e74aab92230c859d6
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/__init__.py
@@ -0,0 +1,108 @@
+from enum import Enum
+from functools import partial
+
+import torch.distributed as dist
+
+from . import (
+    debugging_hooks as debugging,
+    default_hooks as default,
+    powerSGD_hook as powerSGD,
+    quantization_hooks as quantization,
+    optimizer_overlap_hooks as optimizer_overlap,
+)
+
+__all__ = ['DDPCommHookType', 'register_ddp_comm_hook']
+
+def _ddp_comm_hook_wrapper(comm_hook, model, state):
+    model.register_comm_hook(state, comm_hook)
+
+
+def _powerSGD_comm_hook_wrapper(
+    comm_hook,
+    model,
+    state,
+    matrix_approximation_rank,
+    start_powerSGD_iter=1_000,
+):
+    """
+    Wrap PowerSGD communication hook.
+
+    To be consistent with the wrappers of other DDP comm hooks, the input state only needs to be a process group,
+    which will be wrapped up with other state info.
+    """
+    powerSGD_state = powerSGD.PowerSGDState(
+        process_group=state,
+        matrix_approximation_rank=matrix_approximation_rank,
+        start_powerSGD_iter=start_powerSGD_iter,
+    )
+    model.register_comm_hook(powerSGD_state, comm_hook)
+
+
+class DDPCommHookType(Enum):
+    """
+    Enumerate ``ddp_comm_hooks`` and ``ddp_comm_hook_wrapper`` communication hook types.
+
+    DDPCommHookType enumerates the hooks of ``torch.distributed.algorithms.ddp_comm_hooks``
+    as names and ``ddp_comm_hook_wrapper`` partials with hook specified. As an example,
+    you can register allreduce hook by
+    ``DDPCommHookType.ALLREDUCE.value(model=model, state=process_group)``.
+    """
+
+    ALLREDUCE = partial(_ddp_comm_hook_wrapper, comm_hook=default.allreduce_hook)
+    FP16_COMPRESS = partial(
+        _ddp_comm_hook_wrapper, comm_hook=default.fp16_compress_hook
+    )
+    BF16_COMPRESS = partial(
+        _ddp_comm_hook_wrapper, comm_hook=default.bf16_compress_hook
+    )
+    QUANTIZE_PER_TENSOR = partial(
+        _ddp_comm_hook_wrapper, comm_hook=quantization.quantization_pertensor_hook
+    )
+    QUANTIZE_PER_CHANNEL = partial(
+        _ddp_comm_hook_wrapper, comm_hook=quantization.quantization_perchannel_hook
+    )
+    POWER_SGD = partial(
+        _powerSGD_comm_hook_wrapper,
+        comm_hook=powerSGD.powerSGD_hook,
+        matrix_approximation_rank=1,
+    )
+    # Rank-2 PowerSGD can give a higher accuracy than the default rank-1 version,
+    # but it runs slower and consumes more memory.
+    POWER_SGD_RANK2 = partial(
+        _powerSGD_comm_hook_wrapper,
+        comm_hook=powerSGD.powerSGD_hook,
+        matrix_approximation_rank=2,
+    )
+    # Batching can lead to a faster training at the cost of accuracy.
+    BATCHED_POWER_SGD = partial(
+        _powerSGD_comm_hook_wrapper,
+        comm_hook=powerSGD.batched_powerSGD_hook,
+        matrix_approximation_rank=1,
+    )
+    BATCHED_POWER_SGD_RANK2 = partial(
+        _powerSGD_comm_hook_wrapper,
+        comm_hook=powerSGD.batched_powerSGD_hook,
+        matrix_approximation_rank=2,
+    )
+    NOOP = partial(
+        _ddp_comm_hook_wrapper, comm_hook=debugging.noop_hook,
+    )
+
+
+def register_ddp_comm_hook(
+    comm_hook_type: DDPCommHookType, model, state=None
+):
+    """
+    Register a ``ddp_comm_hooks`` hook on a DDP model.
+
+    Registers one of the hooks from ``torch.distributed.algorithms.ddp_comm_hooks``
+    on the DDP model. The type of hook is selected with the ``comm_hook_type``
+    enum (``DDPCommHookType``), and the ``state`` input is passed through to the
+    registered hook.
+    Uses the Python comm hook implementations.
+
+    Example::
+        >>> # xdoctest: +SKIP
+        >>> register_ddp_comm_hook(DDPCommHookType.FP16_COMPRESS, model, state)
+    """
+    comm_hook_type.value(model=model, state=state)
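+
+
+# Illustrative sketch (documentation aid, not part of the upstream module): a
+# minimal example of how ``DDPCommHookType`` and ``register_ddp_comm_hook`` fit
+# together, assuming the process group has already been initialized by the
+# launcher. The model, sizes, and device handling are hypothetical placeholders.
+def _example_register_comm_hook(local_rank: int) -> None:
+    import torch.nn as nn
+    from torch.nn.parallel import DistributedDataParallel as DDP
+
+    # Hypothetical one-GPU-per-process setup.
+    ddp_model = DDP(nn.Linear(16, 16).to(local_rank), device_ids=[local_rank])
+
+    # Enum-based registration; equivalent to calling
+    # ``ddp_model.register_comm_hook(dist.group.WORLD, default.fp16_compress_hook)``.
+    register_ddp_comm_hook(DDPCommHookType.FP16_COMPRESS, ddp_model, dist.group.WORLD)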
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..079216591f84f3a729b3ff218b8fd6cf2b041489
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/__pycache__/ddp_zero_hook.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/__pycache__/ddp_zero_hook.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..48be6b606c19ebb1f06b00b1db19617346810a62
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/__pycache__/ddp_zero_hook.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/__pycache__/debugging_hooks.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/__pycache__/debugging_hooks.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9c6ab9522d1a6af5c686170637675f42b0411d59
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/__pycache__/debugging_hooks.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/__pycache__/default_hooks.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/__pycache__/default_hooks.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bead8fc933c58257733aec8bd629399370f12142
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/__pycache__/default_hooks.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/__pycache__/mixed_precision_hooks.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/__pycache__/mixed_precision_hooks.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..047f95f726191a11b33663cd604f709935110b7a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/__pycache__/mixed_precision_hooks.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/__pycache__/optimizer_overlap_hooks.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/__pycache__/optimizer_overlap_hooks.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5accb84c6e515a2c5c419316c9edd41c2d9d54ac
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/__pycache__/optimizer_overlap_hooks.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/__pycache__/post_localSGD_hook.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/__pycache__/post_localSGD_hook.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a318bc3eeda9f98fcd26150bf9690afba1c5b0c0
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/__pycache__/post_localSGD_hook.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/__pycache__/powerSGD_hook.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/__pycache__/powerSGD_hook.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e05d4889ca8a686ac412c18c07304ebd0de4a0a7
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/__pycache__/powerSGD_hook.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/__pycache__/quantization_hooks.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/__pycache__/quantization_hooks.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..95a65ac700c6d9ebbb526903bbf1757a5907420a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/__pycache__/quantization_hooks.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/ddp_zero_hook.py b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/ddp_zero_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..95bfdec9500e95369439a26f878b20a8f6e6417b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/ddp_zero_hook.py
@@ -0,0 +1,448 @@
+import weakref
+from typing import Any, Callable, List, Optional
+
+import torch
+import torch.distributed as dist
+from torch.distributed.optim import ZeroRedundancyOptimizer
+from torch.distributed.optim.zero_redundancy_optimizer import (
+    _OverlapStatus,
+)
+from torch.nn.parallel.distributed import DistributedDataParallel
+
+__all__ = ["hook_with_zero_step", "hook_with_zero_step_interleaved"]
+
+# Functional optimizers require passing a list of gradients to their `step()`
+# method, and ZeRO requires a functional optimizer to overlap with DDP.
+# Passing `None` instead of an actual gradient indicates to the optimizer
+# that the corresponding parameter should not be updated.
+_NO_PARAM_UPDATE: None = None
+
+
+def _perform_local_step(
+    bucket: dist.GradBucket,
+    zero: ZeroRedundancyOptimizer,
+    rank: int,
+):
+    r"""
+    Perform a local optimizer step using the gradients provided by ``bucket``.
+
+    Arguments:
+        bucket (dist.GradBucket): the bucket providing the gradients.
+        zero (ZeroRedundancyOptimizer): the :class:`ZeroRedundancyOptimizer`
+            instance to perform the :meth:`_local_step`.
+        rank (int): the calling process's rank.
+
+    .. warning::
+        This function assumes that appropriate synchronization has taken place
+        so that the bucket's gradients can be used.
+    """
+    overlap_info = zero._overlap_info
+    bucket_index = bucket.index()
+    assert len(zero.optim.param_groups) == 1, \
+        "Overlapping DDP with ZeRO only supports a single parameter group"
+
+    # Construct the `gradients` input for the local optimizer step, which
+    # expects `None` in a list position to indicate that the corresponding
+    # parameter should not be updated
+    num_local_optim_params = len(zero.optim.param_groups[0]["params"])
+    gradients: List[Optional[torch.Tensor]] = \
+        [_NO_PARAM_UPDATE for _ in range(num_local_optim_params)]
+    assert bucket_index in overlap_info.offsets, \
+        f"Bucket index {bucket_index} was not assigned to rank {rank}"
+    gradients_offset = overlap_info.offsets[bucket_index]
+    bucket_assignment = zero._bucket_assignments_per_rank[rank][bucket_index]
+    bucket_offset = bucket_assignment.offset
+    length = len(bucket_assignment.parameters)
+    bucket_gradients = bucket.gradients()[bucket_offset:bucket_offset + length]
+    for i, grad in enumerate(bucket_gradients):
+        gradients[gradients_offset + i] = grad
+
+    zero._local_step(gradients)
+
+
+def _broadcast_bucket(
+    bucket_index: int,
+    zero: ZeroRedundancyOptimizer,
+):
+    r"""
+    Broadcast a bucket's parameters.
+
+    Arguments:
+        bucket_index (int): the index of the bucket corresponding to the
+            parameters to broadcast.
+        zero (ZeroRedundancyOptimizer): the calling process's
+            :class:`ZeroRedundancyOptimizer` instance.
+    """
+    overlap_info = zero._overlap_info
+    assert len(overlap_info.assigned_ranks_per_bucket) > bucket_index, \
+        "`assigned_ranks_per_bucket` is not fully constructed"
+    # Sort to ensure the same ordering across ranks
+    assigned_ranks = sorted(overlap_info.assigned_ranks_per_bucket[bucket_index])
+    assert len(assigned_ranks) > 0, f"Bucket {bucket_index} should be " \
+        "assigned to at least one rank"
+    for assigned_rank in assigned_ranks:
+        bucket_assignments = zero._bucket_assignments_per_rank[assigned_rank]
+        if bucket_index in bucket_assignments:
+            overlap_info.broadcast_handles.append(
+                dist.broadcast(
+                    bucket_assignments[bucket_index].tensor,
+                    src=dist.get_global_rank(zero.process_group, assigned_rank),
+                    group=zero.process_group,
+                    async_op=True,
+                )
+            )
+
+
+def _save_ddp_bucket_info(
+    bucket: dist.GradBucket,
+    zero: ZeroRedundancyOptimizer,
+):
+    r"""
+    Save :class:`DistributedDataParallel` gradient bucket information for :class:`ZeroRedundancyOptimizer` instance ``zero``.
+
+    In particular, this function is meant to be called once for each gradient
+    bucket used when overlapping; it does not save or compute any global
+    information.
+
+    Arguments:
+        bucket (dist.GradBucket): the current gradient bucket.
+        zero (ZeroRedundancyOptimizer): the calling process's
+            :class:`ZeroRedundancyOptimizer` instance.
+    """
+    overlap_info = zero._overlap_info
+    bucket_params = bucket.parameters()
+    assert len(bucket_params) > 0, "Empty bucket"
+
+    # Save the parameters in the bucket
+    overlap_info.params_per_bucket.append(bucket_params)
+    if overlap_info.shard_buckets:
+        # Additionally save the bucket size for the assignment heuristic to use
+        bucket_size = 0
+        for param in bucket_params:
+            bucket_size += param.numel()
+        assert overlap_info.total_size is not None
+        overlap_info.total_size += bucket_size
+
+
+def _hook_with_zero_step_setup(
+    ddp_ref: weakref.ReferenceType,
+    zero: ZeroRedundancyOptimizer,
+    bucket: dist.GradBucket,
+):
+    r"""
+    Encapsulate the setup logic for :func:`hook_with_zero_step` and :func:`hook_with_zero_step_interleaved`.
+
+    That is, the logic that runs in the hook before the backward pass and the
+    optimizer step can actually be overlapped. It is factored out since it is
+    common to both
+    :func:`hook_with_zero_step` and :func:`hook_with_zero_step_interleaved`.
+
+    Arguments:
+        ddp_ref (weakref.ReferenceType): weak reference to the process's
+            :class:`DistributedDataParallel` instance.
+        zero (ZeroRedundancyOptimizer): the calling process's
+            :class:`ZeroRedundancyOptimizer` instance.
+        bucket (dist.GradBucket): the current gradient bucket.
+    """
+    # Proceed as normal until the DDP buckets have been rebuilt
+    if not ddp_ref()._has_rebuilt_buckets:  # type: ignore[union-attr]
+        assert zero._overlap_info.status == _OverlapStatus.UNINITIALIZED
+        return
+
+    bucket_index = bucket.index()
+    overlap_info = zero._overlap_info
+    if overlap_info.status == _OverlapStatus.UNINITIALIZED:
+        overlap_info.status = _OverlapStatus.DDP_HAS_REBUILT_BUCKETS
+
+    if overlap_info.status == _OverlapStatus.DDP_HAS_REBUILT_BUCKETS:
+        if bucket_index == 0 and len(overlap_info.params_per_bucket) > 0:
+            # This corresponds to the first bucket of the backward pass
+            # immediately after all information has been saved, so we
+            # can perform the delayed ZeRO initialization
+            zero._init_zero_for_overlap()
+        else:
+            # Once DDP buckets have been rebuilt but ZeRO has not been
+            # properly initialized yet, save the information needed
+            _save_ddp_bucket_info(bucket, zero)
+
+
+def hook_with_zero_step(
+    hook: Callable[[Any, dist.GradBucket], torch.futures.Future],
+    ddp: DistributedDataParallel,
+    zero: ZeroRedundancyOptimizer,
+    shard_buckets: bool = False,
+) -> Callable[[Any, dist.GradBucket], torch.futures.Future[torch.Tensor]]:
+    r"""
+    Modify ``hook`` to overlap :class:`ZeroRedundancyOptimizer` optimizer step with :class:`DistributedDataParallel` backward pass.
+
+    This approach overlaps the optimizer computation and communication with the
+    backward communication. In particular, the backward computation proceeds
+    contiguously, and the optimizer computation follows, overlapping with
+    outstanding backward communication (i.e. all-reduces) and possibly other
+    optimizer communication (i.e. broadcasts).
+    The optimizer step computation begins after the last gradient bucket computation has finished.
+
+    This approach may be preferred over :meth:`hook_with_zero_step_interleaved`
+    if communication is relatively slow compared to computation.
+
+    Arguments:
+        hook (Callable[[Any, dist.GradBucket], torch.futures.Future]): the hook
+            to modify.
+        ddp (DistributedDataParallel): the :class:`DistributedDataParallel`
+            instance to use.
+        zero (ZeroRedundancyOptimizer): the :class:`ZeroRedundancyOptimizer`
+            instance to use.
+        shard_buckets (bool): if ``True``, then the assignment of each
+            :class:`DistributedDataParallel` bucket is partitioned across
+            possibly multiple :class:`ZeroRedundancyOptimizer` instances (i.e.
+            across possibly multiple ranks) to approximate uniformity; if
+            ``False``, then each bucket is wholly assigned to a single
+            :class:`ZeroRedundancyOptimizer` instance (i.e. to a single rank).
+
+    Returns:
+        The modified hook.
+
+    Raises:
+        ValueError: if ``zero`` was constructed with ``overlap_with_ddp=False``.
+        RuntimeError: if using any backend other than NCCL/HCCL since currently
+            Gloo may hang.
+
+    .. warning::
+        Given the way that overlapping :class:`DistributedDataParallel` with
+        :class:`ZeroRedundancyOptimizer` is currently implemented, the first
+        two or three training iterations do not perform parameter updates in
+        the optimizer step, depending on if ``static_graph=False`` or
+        ``static_graph=True``, respectively. This is because it needs
+        information about the gradient bucketing strategy used by
+        :class:`DistributedDataParallel`, which is not finalized until the
+        second forward pass if ``static_graph=False`` or until the third
+        forward pass if ``static_graph=True``.
+    """
+    if not zero._overlap_with_ddp:
+        raise ValueError(
+            "ZeroRedundancyOptimizer must be constructed with "
+            "`overlap_with_ddp=True` to use this hook properly"
+        )
+    ddp_ref = weakref.ref(ddp)
+
+    # NOTE: Gloo may hang with this overlapping approach, so we require
+    # NCCL/HCCL backend for now; see https://github.com/pytorch/pytorch/issues/62300
+    pg = dist.get_backend(ddp_ref().process_group)  # type: ignore[union-attr]
+    if ((pg != dist.Backend.NCCL) and (pg != 'hccl')):
+        raise RuntimeError(
+            "Overlapping DDP with ZeRO using this approach currently requires "
+            "NCCL/HCCL backend to avoid hangs"
+        )
+
+    if shard_buckets:
+        zero._overlap_info.shard_buckets = True
+        zero._overlap_info.total_size = 0
+
+    def hook_with_zero_fn(
+        state: Any,
+        bucket: dist.GradBucket,
+    ) -> torch.futures.Future[torch.Tensor]:
+        r"""
+        Return :class:`Future` that runs the optimizer step if this corresponds to the last gradient bucket.
+
+        Perform the equivalent of a :class:`ZeroRedundancyOptimizer` :meth:`step` if ``bucket`` is the last gradient bucket.
+        The returned future gives the gradient bucket tensor; on the iteration in
+        which the :class:`DistributedDataParallel` buckets are rebuilt, the hook
+        also performs additional computation to collect the
+        information used to implement the modified hook.
+
+        Arguments:
+            state (Any): any state for the hook.
+            bucket (dist.GradBucket): the :class:`DistributedDataParallel`
+                gradient bucket.
+        """
+        fut = hook(state, bucket)
+        _hook_with_zero_step_setup(ddp_ref, zero, bucket)
+        if zero._overlap_info.status != _OverlapStatus.INITIALIZED:
+            return fut
+
+        overlap_info = zero._overlap_info
+        bucket_index = bucket.index()
+        rank = zero.global_rank
+
+        assert overlap_info.status == _OverlapStatus.INITIALIZED
+        assert len(overlap_info.assigned_ranks_per_bucket) > bucket_index, \
+            "`assigned_ranks_per_bucket` is not fully constructed"
+        assigned_to_bucket = rank in overlap_info.assigned_ranks_per_bucket[bucket_index]
+
+        # Save the bucket reference and all-reduce future for the final bucket
+        if assigned_to_bucket:
+            overlap_info.bucket_index_to_bucket[bucket_index] = bucket
+            overlap_info.bucket_index_to_future[bucket_index] = fut
+
+        # Check that buckets are indexed incrementally starting from 0 in the
+        # order of their autograd hooks firing
+        if len(overlap_info.bucket_indices_seen) > 0:
+            assert overlap_info.bucket_indices_seen[-1] == bucket_index - 1, \
+                "Bucket indices are not in incremental order"
+        else:
+            assert bucket_index == 0, "Bucket indices do not start from 0"
+        overlap_info.bucket_indices_seen.append(bucket_index)
+
+        # Directly return the future without any optimizer computation if this
+        # is not the last bucket
+        num_buckets = len(overlap_info.params_per_bucket)
+        is_last_bucket = bucket_index == num_buckets - 1
+        if not is_last_bucket:
+            return fut
+
+        # Perform partial optimizer step on all buckets after the final
+        # bucket has been computed
+        # NOTE: This should not be chained as a callback to the last bucket's
+        # all-reduce future since that would add synchronization that delays
+        # all optimizer computation to wait for that last all-reduce
+        for bucket_index in range(num_buckets):
+            assigned_ranks = overlap_info.assigned_ranks_per_bucket[bucket_index]
+            if rank in assigned_ranks:
+                # Wait on the bucket's all-reduce future to ensure correct
+                # gradients
+                assert bucket_index in overlap_info.bucket_index_to_future, \
+                    f"All-reduce future for bucket {bucket_index} not saved " \
+                    f"on rank {rank}"
+                allreduce_future = overlap_info.bucket_index_to_future[bucket_index]
+                allreduce_future.wait()
+
+                # Perform the partial optimizer step
+                curr_bucket = overlap_info.bucket_index_to_bucket[bucket_index]
+                _perform_local_step(curr_bucket, zero, rank)
+
+            _broadcast_bucket(bucket_index, zero)
+
+        # Ensure that all parameter updates are finished before the
+        # next forward pass
+        overlap_info.wait_for_broadcasts()
+        overlap_info.clear_per_iter_info()
+
+        return fut
+
+    return hook_with_zero_fn
+
+
+def hook_with_zero_step_interleaved(
+    hook: Callable[[Any, dist.GradBucket], torch.futures.Future],
+    ddp: DistributedDataParallel,
+    zero: ZeroRedundancyOptimizer,
+    shard_buckets: bool = False,
+) -> Callable[[Any, dist.GradBucket], torch.futures.Future[torch.Tensor]]:
+    r"""
+    Modify ``hook`` to overlap :class:`ZeroRedundancyOptimizer` optimizer step with :class:`DistributedDataParallel` backward pass.
+
+    This approach overlaps the optimizer computation and communication with the
+    backward computation and communication. In particular, once a bucket's
+    gradients have been computed, the optimizer computation using those
+    gradients is launched (though the actual computation must wait for the
+    bucket's all-reduce to complete). This yields an interleaving of all-
+    reduces and broadcasts in the communication stream.
+
+    This approach may be preferred over :meth:`hook_with_zero_step` if
+    communication is relatively fast compared to computation.
+
+    Arguments:
+        hook (Callable[[Any, dist.GradBucket], torch.futures.Future]): the hook
+            to modify.
+        ddp (DistributedDataParallel): the :class:`DistributedDataParallel`
+            instance to use.
+        zero (ZeroRedundancyOptimizer): the :class:`ZeroRedundancyOptimizer`
+            instance to use.
+        shard_buckets (bool): if ``True``, then the assignment of each
+            :class:`DistributedDataParallel` bucket is partitioned across
+            possibly multiple :class:`ZeroRedundancyOptimizer` instances (i.e.
+            across possibly multiple ranks) to approximate uniformity; if
+            ``False``, then each bucket is wholly assigned to a single
+            :class:`ZeroRedundancyOptimizer` instance (i.e. to a single rank).
+
+    Returns:
+        The modified hook.
+
+    Raises:
+        ValueError: if ``zero`` was constructed with ``overlap_with_ddp=False``.
+        RuntimeError: if using any backend other than NCCL/HCCL since currently
+            Gloo may hang.
+
+    .. warning::
+        Given the way that overlapping :class:`DistributedDataParallel` with
+        :class:`ZeroRedundancyOptimizer` is currently implemented, the first
+        two or three training iterations do not perform parameter updates in
+        the optimizer step, depending on if ``static_graph=False`` or
+        ``static_graph=True``, respectively. This is because it needs
+        information about the gradient bucketing strategy used by
+        :class:`DistributedDataParallel`, which is not finalized until the
+        second forward pass if ``static_graph=False`` or until the third
+        forward pass if ``static_graph=True``.
+    """
+    if not zero._overlap_with_ddp:
+        raise ValueError(
+            "ZeroRedundancyOptimizer must be constructed with "
+            "`overlap_with_ddp=True` to use this hook properly"
+        )
+    ddp_ref = weakref.ref(ddp)
+
+    # NOTE: Gloo may hang with this overlapping approach, so we require
+    # NCCL/HCCL backend for now; see https://github.com/pytorch/pytorch/issues/62300
+    pg = dist.get_backend(ddp_ref().process_group)  # type: ignore[union-attr]
+    if ((pg != dist.Backend.NCCL) and (pg != 'hccl')):
+        raise RuntimeError(
+            "Overlapping DDP with ZeRO using this approach currently requires "
+            "NCCL/HCCL backend to avoid hangs"
+        )
+
+    if shard_buckets:
+        zero._overlap_info.shard_buckets = True
+        zero._overlap_info.total_size = 0
+
+    def hook_with_zero_interleaved_fn(
+        state,
+        bucket: dist.GradBucket,
+    ) -> torch.futures.Future[torch.Tensor]:
+        r"""
+        Return :class:`Future` that gives gradient bucket tensor and performs partial :class:`ZeroRedundancyOptimizer` :meth:`step`.
+
+        This function uses the gradients in the given bucket to perform a partial
+        :class:`ZeroRedundancyOptimizer` :meth:`step`.
+
+        Arguments:
+            state: any state for the hook.
+            bucket (dist.GradBucket): the :class:`DistributedDataParallel`
+                gradient bucket.
+        """
+        fut = hook(state, bucket)
+        _hook_with_zero_step_setup(ddp_ref, zero, bucket)
+        if zero._overlap_info.status != _OverlapStatus.INITIALIZED:
+            return fut
+
+        def zero_step(fut: torch.futures.Future) -> torch.Tensor:
+            r"""
+            Perform a partial :class:`ZeroRedundancyOptimizer` :meth:`step` using the gradients in the :class:`DistributedDataParallel` gradient bucket.
+
+            Returns:
+                A :class:`torch.Tensor` representing the contents of the
+                gradient bucket.
+            """
+            overlap_info = zero._overlap_info
+            bucket_index = bucket.index()
+            rank = zero.global_rank
+
+            assigned_ranks = overlap_info.assigned_ranks_per_bucket[bucket_index]
+            overlap_info.bucket_indices_seen.append(bucket_index)
+            if rank in assigned_ranks:
+                _perform_local_step(bucket, zero, rank)
+
+            _broadcast_bucket(bucket_index, zero)
+
+            num_buckets = len(overlap_info.params_per_bucket)
+            if len(overlap_info.bucket_indices_seen) == num_buckets:
+                # Ensure that all parameter updates are finished before the
+                # next forward pass
+                overlap_info.wait_for_broadcasts()
+                overlap_info.clear_per_iter_info()
+
+            return bucket.buffer()
+
+        return fut.then(zero_step)
+
+    return hook_with_zero_interleaved_fn
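+
+
+# Illustrative sketch (documentation aid, not part of the upstream module): one
+# plausible way to wire ``hook_with_zero_step`` together with DDP and
+# ``ZeroRedundancyOptimizer``. The model, learning rate, and device handling are
+# hypothetical placeholders; an NCCL process group is assumed to be initialized.
+# ``hook_with_zero_step_interleaved`` can be substituted in the same position.
+def _example_overlap_ddp_with_zero(local_rank: int) -> None:
+    import torch.nn as nn
+    from torch.distributed.algorithms.ddp_comm_hooks.default_hooks import (
+        allreduce_hook,
+    )
+
+    ddp_model = DistributedDataParallel(
+        nn.Linear(32, 32).to(local_rank), device_ids=[local_rank]
+    )
+    # ``overlap_with_ddp=True`` is required; otherwise the hook raises ValueError.
+    zero = ZeroRedundancyOptimizer(
+        ddp_model.parameters(),
+        optimizer_class=torch.optim.SGD,
+        overlap_with_ddp=True,
+        lr=0.01,
+    )
+    # ``allreduce_hook`` accepts a ``None`` state (it falls back to the default
+    # process group), so no extra hook state is needed here.
+    ddp_model.register_comm_hook(
+        None, hook_with_zero_step(allreduce_hook, ddp_model, zero)
+    )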
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/debugging_hooks.py b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/debugging_hooks.py
new file mode 100644
index 0000000000000000000000000000000000000000..79673f5e297c0c1053361c09817c447cbd54b439
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/debugging_hooks.py
@@ -0,0 +1,28 @@
+from typing import Any
+
+import torch
+from torch.distributed import GradBucket
+
+__all__ = ["noop_hook"]
+
+
+def noop_hook(_: Any, bucket: GradBucket) -> torch.futures.Future[torch.Tensor]:
+    """
+    Return a future that wraps the input, so it is a no-op that does not incur any communication overheads.
+
+    This hook should **only** be used for headroom analysis of allreduce optimization,
+    instead of the normal gradient synchronization.
+    For example, if less than a 10% speedup in training time is observed after this hook is registered,
+    it usually implies that allreduce is not a performance bottleneck for this case.
+    Such instrumentation can be particularly useful
+    if GPU traces cannot be easily retrieved or the trace analysis is complicated
+    by factors such as the overlap between allreduce and computation or the desynchronization across ranks.
+
+    Example::
+        >>> # xdoctest: +SKIP
+        >>> ddp_model.register_comm_hook(None, noop_hook)
+    """
+    fut: torch.futures.Future[torch.Tensor] = torch.futures.Future()
+    fut.set_result(bucket.buffer())
+
+    return fut
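+
+
+# Illustrative sketch (documentation aid, not part of the upstream module): the
+# kind of headroom analysis the docstring above refers to. ``ddp_model``,
+# ``loss_fn``, and ``data`` are hypothetical placeholders; in a real experiment
+# the baseline (no hook) and the no-op run are two separate runs, since a comm
+# hook can only be registered once per DDP instance.
+def _example_headroom_analysis(ddp_model, loss_fn, data, iters: int = 20) -> float:
+    import time
+
+    ddp_model.register_comm_hook(None, noop_hook)
+
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+    start = time.perf_counter()
+    for inputs, targets in list(data)[:iters]:
+        ddp_model.zero_grad()
+        loss_fn(ddp_model(inputs), targets).backward()
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+    # Compare against the same loop without the hook; a small gap suggests that
+    # allreduce is not the bottleneck.
+    return time.perf_counter() - start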
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/default_hooks.py b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/default_hooks.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4f5dfb185cd22e6e8537f45da3f2c0c42a93fb1
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/default_hooks.py
@@ -0,0 +1,223 @@
+from typing import Any, Callable, cast, Tuple
+
+import torch
+import torch.distributed as dist
+
+__all__ = [
+    "allreduce_hook",
+    "fp16_compress_hook",
+    "bf16_compress_hook",
+    "fp16_compress_wrapper",
+    "bf16_compress_wrapper",
+]
+
+
+def _allreduce_fut(
+    process_group: dist.ProcessGroup, tensor: torch.Tensor
+) -> torch.futures.Future[torch.Tensor]:
+    """Average the input gradient tensor by allreduce and returns a future."""
+    group_to_use = process_group if process_group is not None else dist.group.WORLD
+
+    # Apply the division first to avoid overflow, especially for FP16.
+    tensor.div_(group_to_use.size())
+
+    return (
+        dist.all_reduce(tensor, group=group_to_use, async_op=True)
+        .get_future()
+        .then(lambda fut: fut.value()[0])
+    )
+
+
+def allreduce_hook(
+    process_group: dist.ProcessGroup, bucket: dist.GradBucket
+) -> torch.futures.Future[torch.Tensor]:
+    """
+    Call ``allreduce`` using ``GradBucket`` tensors.
+
+    Once gradient tensors are aggregated across all workers, its ``then``
+    callback takes the mean and returns the result.
+
+    If a user registers this DDP communication hook,
+    the DDP results are expected to be the same as when no hook is registered.
+    Hence, this does not change the behavior of DDP, and a user can use it as a
+    reference or modify it to log useful information or for other purposes,
+    without affecting DDP behavior.
+
+    Example::
+        >>> # xdoctest: +SKIP
+        >>> ddp_model.register_comm_hook(process_group, allreduce_hook)
+    """
+    return _allreduce_fut(process_group, bucket.buffer())
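+
+
+# Illustrative sketch (documentation aid, not part of the upstream module): the
+# docstring above suggests using ``allreduce_hook`` as a reference for custom
+# hooks. This wrapper only logs the bucket size and then delegates to
+# ``allreduce_hook`` unchanged, so DDP results stay the same as with no hook.
+def _example_logging_allreduce_hook(
+    process_group: dist.ProcessGroup, bucket: dist.GradBucket
+) -> torch.futures.Future[torch.Tensor]:
+    import logging
+
+    logging.getLogger(__name__).info(
+        "allreducing bucket %d with %d elements",
+        bucket.index(),
+        bucket.buffer().numel(),
+    )
+    return allreduce_hook(process_group, bucket)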
+
+
+def fp16_compress_hook(
+    process_group: dist.ProcessGroup,
+    bucket: dist.GradBucket,
+) -> torch.futures.Future[torch.Tensor]:
+    """
+    Compress by casting the ``GradBucket`` tensor to ``torch.float16`` and dividing it by the process group size.
+
+    This DDP communication hook implements a simple gradient compression
+    approach that casts ``GradBucket`` tensor to half-precision floating-point format (``torch.float16``)
+    and then divides it by the process group size.
+    It allreduces those ``float16`` gradient tensors. Once compressed gradient
+    tensors are allreduced, the chained callback ``decompress`` casts it back to the input data type (such as ``float32``).
+
+    Example::
+        >>> # xdoctest: +SKIP
+        >>> ddp_model.register_comm_hook(process_group, fp16_compress_hook)
+    """
+    group_to_use = process_group if process_group is not None else dist.group.WORLD
+    world_size = group_to_use.size()
+
+    buffer = (
+        cast(Tuple[torch.Tensor, ...], bucket)[0]
+        if isinstance(bucket, tuple)
+        else bucket.buffer()
+    )
+    compressed_tensor = buffer.to(torch.float16).div_(world_size)
+
+    def decompress(fut):
+        decompressed_tensor = buffer
+        # Decompress in place to reduce the peak memory.
+        # See: https://github.com/pytorch/pytorch/issues/45968
+        value = fut if isinstance(fut, torch.Tensor) else fut.value()[0]
+        decompressed_tensor.copy_(value)
+        return decompressed_tensor
+
+    if torch._utils.is_compiling():
+        grad = dist._functional_collectives.all_reduce(
+            compressed_tensor, "sum", group_to_use
+        )
+        return decompress(grad)
+    else:
+        fut = dist.all_reduce(
+            compressed_tensor, group=group_to_use, async_op=True
+        ).get_future()
+        return fut.then(decompress)
+
+
+# TODO: create an internal helper function and extract the duplicate code in FP16_compress and BF16_compress.
+def bf16_compress_hook(
+    process_group: dist.ProcessGroup,
+    bucket: dist.GradBucket,
+) -> torch.futures.Future[torch.Tensor]:
+    """
+    Warning: This API is experimental, and it requires NCCL version later than 2.9.6.
+
+    This DDP communication hook implements a simple gradient compression
+    approach that casts the ``GradBucket`` tensor to half-precision
+    Brain floating point format (``torch.bfloat16``)
+    and then divides it by the process group size.
+    It allreduces those ``bfloat16`` gradient tensors. Once compressed gradient
+    tensors are allreduced, the chained callback ``decompress`` casts it back to the input data type (such as ``float32``).
+
+    Example::
+        >>> # xdoctest: +SKIP
+        >>> ddp_model.register_comm_hook(process_group, bf16_compress_hook)
+    """
+    group_to_use = process_group if process_group is not None else dist.group.WORLD
+    world_size = group_to_use.size()
+
+    buffer = (
+        cast(Tuple[torch.Tensor, ...], bucket)[0]
+        if isinstance(bucket, tuple)
+        else bucket.buffer()
+    )
+    compressed_tensor = buffer.to(torch.bfloat16).div_(world_size)
+
+    def decompress(fut):
+        decompressed_tensor = buffer
+        # Decompress in place to reduce the peak memory.
+        # See: https://github.com/pytorch/pytorch/issues/45968
+        value = fut if isinstance(fut, torch.Tensor) else fut.value()[0]
+        decompressed_tensor.copy_(value)
+        return decompressed_tensor
+
+    if torch._utils.is_compiling():
+        grad = dist._functional_collectives.all_reduce(
+            compressed_tensor, "sum", group_to_use
+        )
+        return decompress(grad)
+    else:
+        fut = dist.all_reduce(
+            compressed_tensor, group=group_to_use, async_op=True
+        ).get_future()
+        return fut.then(decompress)
+
+
+def fp16_compress_wrapper(
+    hook: Callable[[Any, dist.GradBucket], torch.futures.Future[torch.Tensor]]
+) -> Callable[[Any, dist.GradBucket], torch.futures.Future[torch.Tensor]]:
+    """
+    Cast input tensor to ``torch.float16``, cast result of hook back to input dtype.
+
+    This wrapper casts the input gradient tensor of a given DDP communication hook to half-precision
+    floating point format (``torch.float16``), and casts the resulting tensor of the given hook back to
+    the input data type, such as ``float32``.
+    Therefore, ``fp16_compress_hook`` is equivalent to ``fp16_compress_wrapper(allreduce_hook)``.
+
+    Example::
+        >>> # xdoctest: +SKIP
+        >>> state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1, start_powerSGD_iter=10)
+        >>> ddp_model.register_comm_hook(state, fp16_compress_wrapper(powerSGD_hook))
+    """
+
+    def fp16_compress_wrapper_hook(
+        hook_state, bucket: dist.GradBucket
+    ) -> torch.futures.Future[torch.Tensor]:
+        # Cast bucket tensor to FP16.
+        bucket.set_buffer(bucket.buffer().to(torch.float16))
+
+        fut = hook(hook_state, bucket)
+
+        def decompress(fut):
+            decompressed_tensor = bucket.buffer()
+            # Decompress in place to reduce the peak memory.
+            # See: https://github.com/pytorch/pytorch/issues/45968
+            decompressed_tensor.copy_(fut.value())
+            return decompressed_tensor
+
+        # Decompress after hook has run.
+        return fut.then(decompress)
+
+    return fp16_compress_wrapper_hook
+
+
+def bf16_compress_wrapper(
+    hook: Callable[[Any, dist.GradBucket], torch.futures.Future[torch.Tensor]]
+) -> Callable[[Any, dist.GradBucket], torch.futures.Future[torch.Tensor]]:
+    """
+    Warning: This API is experimental, and it requires NCCL version later than 2.9.6.
+
+    This wrapper casts the input gradient tensor of a given DDP communication hook to half-precision
+    Brain floating point format (``torch.bfloat16``),
+    and casts the resulting tensor of the given hook back to the input data type, such as ``float32``.
+
+    Therefore, ``bf16_compress_hook`` is equivalent to ``bf16_compress_wrapper(allreduce_hook)``.
+
+    Example::
+        >>> # xdoctest: +SKIP
+        >>> state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1, start_powerSGD_iter=10)
+        >>> ddp_model.register_comm_hook(state, bf16_compress_wrapper(powerSGD_hook))
+    """
+
+    def bf16_compress_wrapper_hook(
+        hook_state, bucket: dist.GradBucket
+    ) -> torch.futures.Future[torch.Tensor]:
+        # Cast bucket tensor to BF16.
+        bucket.set_buffer(bucket.buffer().to(torch.bfloat16))
+
+        fut = hook(hook_state, bucket)
+
+        def decompress(fut):
+            decompressed_tensor = bucket.buffer()
+            # Decompress in place to reduce the peak memory.
+            # See: https://github.com/pytorch/pytorch/issues/45968
+            decompressed_tensor.copy_(fut.value())
+            return decompressed_tensor
+
+        # Decompress after hook has run.
+        return fut.then(decompress)
+
+    return bf16_compress_wrapper_hook
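+
+
+# Illustrative sketch (documentation aid, not part of the upstream module): it
+# restates the equivalence noted above, where registering ``fp16_compress_hook``
+# directly and registering ``fp16_compress_wrapper(allreduce_hook)`` compress
+# gradients the same way. ``ddp_model`` and ``process_group`` are placeholders.
+def _example_equivalent_fp16_registrations(ddp_model, process_group, use_wrapper: bool) -> None:
+    if use_wrapper:
+        # Wrapper form: compress any inner hook; with the plain allreduce as the
+        # inner hook, this matches ``fp16_compress_hook``.
+        ddp_model.register_comm_hook(process_group, fp16_compress_wrapper(allreduce_hook))
+    else:
+        ddp_model.register_comm_hook(process_group, fp16_compress_hook)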
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/mixed_precision_hooks.py b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/mixed_precision_hooks.py
new file mode 100644
index 0000000000000000000000000000000000000000..253a902e004f90bc3f1d7232e4efb250f8b207f4
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/mixed_precision_hooks.py
@@ -0,0 +1,85 @@
+import torch
+import torch.distributed as dist
+from torch.autograd import Variable
+
+from dataclasses import dataclass
+from typing import Any, no_type_check
+from torch.distributed.utils import _free_storage
+
+@dataclass
+class _AllreduceUpcastHookState:
+    """
+    State to manage DDP mixed precision in backward / gradient communication.
+
+    This contains a weakref to the DDP module for access to reducer and process
+    group, and a stream to run parameter and gradient upcasts.
+    """
+
+    ddp_weakref: Any
+    upcast_stream: torch.cuda.Stream
+    wait_for_stream_enqueued: bool = False
+
+@no_type_check
+def _reducer_allreduce_and_upcast_hook(
+    hook_state: _AllreduceUpcastHookState, bucket: dist.GradBucket
+) -> torch.futures.Future[torch.Tensor]:
+    """
+    Perform allreduce in precision ``reduce_dtype``, upcast to prepare for optimizer.
+
+    Performs allreduce in the reduced precision given by DDP's mixed precision
+    reduce_dtype, and upcasts parameters and gradients to fp32 in preparation
+    to run the optimizer.
+    """
+    ddp_weakref = hook_state.ddp_weakref
+    reducer, process_group = ddp_weakref().reducer, ddp_weakref().process_group
+    gradient_is_bucket_view = ddp_weakref().gradient_as_bucket_view
+    # Cast bucket if different than param_dtype.
+    if (
+        ddp_weakref().mixed_precision.param_dtype != ddp_weakref().mixed_precision.reduce_dtype
+    ):
+        # Cast bucket tensor to reduce_dtype
+        bucket.set_buffer(bucket.buffer().to(ddp_weakref().mixed_precision.reduce_dtype))
+    fut = reducer._run_allreduce_hook(bucket)
+    ret_fut = torch.futures.Future()
+    stream = hook_state.upcast_stream
+    with torch.cuda.stream(stream):
+        fut.wait()
+        bucket.buffer().div_(process_group.size())
+        ret_fut.set_result(bucket.buffer())
+
+        # Upcast parameters and gradients so optimizer step can run in fp32.
+        params, grads = bucket.parameters(), bucket.gradients()
+        for p, g in zip(params, grads):
+            p.data = p._fp_param
+            # free storage for mp param as it will be allocated again in next
+            # forward pass.
+            _free_storage(p._mp_param)
+            p.grad.data = p.grad.to(p.data.dtype)
+
+    # enqueue a callback to wait for this stream at end of backward
+    def wait_for_stream_cb():
+        torch.cuda.current_stream().wait_stream(stream)
+        # Remove post-backward hooks since they are re-installed in next
+        # iteration, similar to FSDP.
+        # Parameters that don't require grad still need to be cast since
+        # they may participate in computation. However, they would not be recast
+        # by the hook above as they don't have a grad hook installed, so cast them
+        # back here.
+        for n, p in ddp_weakref().module.named_parameters():
+            if hasattr(p, '_ddp_mp_hook_state'):
+                p._ddp_mp_hook_state[1].remove()
+                delattr(p, '_ddp_mp_hook_state')
+            if not p.requires_grad and not hasattr(p, '_ddp_ignored'):
+                p.data = p._fp_param
+
+        # reset for next backward pass
+        hook_state.wait_for_stream_enqueued = False
+
+    if not hook_state.wait_for_stream_enqueued:
+        Variable._execution_engine.queue_callback(
+            wait_for_stream_cb
+        )
+        # mark that the callback is enqueued
+        hook_state.wait_for_stream_enqueued = True
+
+    return ret_fut
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/optimizer_overlap_hooks.py b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/optimizer_overlap_hooks.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e62f70122b83e6b7c3481b08d7888282b799955
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/optimizer_overlap_hooks.py
@@ -0,0 +1,154 @@
+from typing import Any, Callable, List, no_type_check
+
+import torch
+import torch.distributed as dist
+from torch.autograd import Variable
+from functools import partial
+from dataclasses import dataclass
+
+__all__: List[str] = []
+
+_FUNCTIONAL_OPTIM_STEP_METHOD_NAME = "step_param"
+
+class _OptimizerHookState:
+    """
+    Holds state for running optimizer in-line after DDP communication hook.
+
+    Currently contains only the functional optimizer, which must have a ``step_param`` method.
+    """
+
+    __slots__ = ["functional_optimizer", "params_to_optimize"]
+
+    def __init__(self, functional_optim, params=None):
+        self.functional_optimizer = functional_optim
+        self._check_valid_functional_optim()
+        self._set_params_to_optimize(params)
+
+    def _set_params_to_optimize(self, params):
+        if params is not None:
+            self.params_to_optimize = set(params)
+
+    def _check_valid_functional_optim(self):
+        if not hasattr(self.functional_optimizer, _FUNCTIONAL_OPTIM_STEP_METHOD_NAME):
+            raise ValueError(
+                f"Class {type(self.functional_optimizer)} must implement method "
+                f"{_FUNCTIONAL_OPTIM_STEP_METHOD_NAME}."
+            )
+
+
+@dataclass
+class _OptimInBackwardHookState:
+    optim_stream: torch.cuda.Stream
+    wait_for_optim_stream_enqueued: bool
+
+@no_type_check
+def _apply_optim_in_backward_hook(
+    gradient_is_bucket_view: bool
+) -> Callable[[Any, dist.GradBucket], torch.futures.Future[torch.Tensor]]:
+    r"""
+    Register hook to apply the optimizer in backward.
+
+    If torch.distributed.optim._apply_optimizer_in_backward is used to overlap
+    optimizer with backward pass, DDP will run the below hook to run optimizer
+    step for parameters after gradient communication has taken place.
+    """
+    optim_in_bwd_state = _OptimInBackwardHookState(
+        optim_stream=torch.cuda.Stream(),
+        wait_for_optim_stream_enqueued=False,
+    )
+
+    def apply_optim_in_backward_hook(
+        hook_state: Any, bucket: dist.GradBucket, optim_stream_state,
+    ) -> torch.futures.Future[torch.Tensor]:
+        # Run original hook
+        ddp_weakref = hook_state
+        ddp_inst = ddp_weakref()
+        reducer, process_group = ddp_inst.reducer, ddp_inst.process_group
+        fut = reducer._run_allreduce_hook(bucket)
+        optimizer_stream = optim_stream_state.optim_stream
+        with torch.cuda.stream(optimizer_stream):
+            fut.wait()
+            # Apply gradient division since C++ side only allreduces and does
+            # not average. TODO: (rohan-varma) the div factor may be different
+            # when running with join hook
+            bucket.buffer().div_(process_group.size())
+            model_params = bucket.parameters()
+            grads = bucket.gradients()
+            # TODO (rohan-varma): upcast as needed for DDP mixed precision,
+            # once optimizer in backward + DDP mixed precision is supported.
+            for p, g in zip(model_params, grads):
+                if hasattr(p, '_in_backward_optimizers'):
+                    # Note: need to set grad to the bucket's grad, because
+                    # running allreduce results in the bucket's grad being
+                    # reduced, but not grad field.
+                    if not gradient_is_bucket_view:
+                        p.grad = g
+                    for optim in p._in_backward_optimizers:
+                        optim.step()
+
+        # Need to return a Future[Tensor] to obey comm hook API contract.
+        ret_fut = torch.futures.Future()
+        ret_fut.set_result(bucket.buffer())
+
+        # enqueue a callback to wait for this optimizer stream at the end of
+        # backward and set all DDP managed grads to None.
+        def wait_for_optim_stream_callback():
+            torch.cuda.current_stream().wait_stream(
+                optim_stream_state.optim_stream
+            )
+            # Set DDP managed grads to None
+            for param in ddp_inst._get_data_parallel_params(ddp_inst.module):
+                if hasattr(param, '_in_backward_optimizers'):
+                    param.grad = None
+
+            # reset for the next backwards pass
+            optim_stream_state.wait_for_optim_stream_enqueued = False
+
+        if not optim_stream_state.wait_for_optim_stream_enqueued:
+            Variable._execution_engine.queue_callback(
+                wait_for_optim_stream_callback
+            )
+            # mark that the callback is enqueued
+            optim_stream_state.wait_for_optim_stream_enqueued = True
+
+        return ret_fut
+
+    comm_hook = partial(
+        apply_optim_in_backward_hook, optim_stream_state=optim_in_bwd_state
+    )
+    # These are needed for DDP's logging of comm hooks
+    comm_hook.__name__ = apply_optim_in_backward_hook.__name__
+    comm_hook.__qualname__ = apply_optim_in_backward_hook.__qualname__
+
+    return comm_hook
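+
+
+# Illustrative sketch (documentation aid, not part of the upstream module):
+# ``_apply_optim_in_backward_hook`` is installed by DDP itself when parameters
+# carry in-backward optimizers, so users do not register it directly. The
+# sketch assumes the private helper
+# ``torch.distributed.optim._apply_optimizer_in_backward(optimizer_class,
+# params, optimizer_kwargs)`` available in this snapshot; the model and
+# hyperparameters are hypothetical placeholders.
+def _example_optimizer_in_backward(local_rank: int) -> None:
+    import torch.nn as nn
+    from torch.distributed.optim import _apply_optimizer_in_backward
+    from torch.nn.parallel import DistributedDataParallel as DDP
+
+    model = nn.Linear(8, 8).to(local_rank)
+    # Attach a per-parameter optimizer that runs during backward; DDP then
+    # wires in the hook defined above automatically.
+    _apply_optimizer_in_backward(torch.optim.SGD, model.parameters(), {"lr": 0.01})
+    ddp_model = DDP(model, device_ids=[local_rank])  # noqa: F841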
+
+def _hook_then_optimizer(
+    hook: Callable[[Any, dist.GradBucket], torch.futures.Future[torch.Tensor]],
+    optimizer_state: _OptimizerHookState,
+) -> Callable[[Any, dist.GradBucket], torch.futures.Future[torch.Tensor]]:
+    r"""Run optimizer in a functional fashion after DDP communication hook."""
+    has_set_params = (
+        hasattr(optimizer_state, 'params_to_optimize')
+        and optimizer_state.params_to_optimize is not None
+    )
+
+    def hook_then_optimizer_wrapper(
+        hook_state, bucket: dist.GradBucket
+    ) -> torch.futures.Future[torch.Tensor]:
+        # Run original hook
+        fut = hook(hook_state, bucket)
+
+        def optimizer_step(fut):
+            gradient_tensors = bucket.gradients()
+            model_params = bucket.parameters()
+            for grad_tensor, model_param in zip(gradient_tensors, model_params):
+                if not has_set_params or model_param in optimizer_state.params_to_optimize:
+                    optimizer_state.functional_optimizer.step_param(
+                        model_param,
+                        grad_tensor,
+                    )
+            return bucket.buffer()
+
+        return fut.then(optimizer_step)
+
+    return hook_then_optimizer_wrapper
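+
+
+# Illustrative sketch (documentation aid, not part of the upstream module):
+# exercising the private helpers above. ``_ExampleFunctionalSGD`` is a
+# hypothetical stand-in for a functional optimizer; the only contract
+# ``_OptimizerHookState`` checks is a ``step_param`` method. ``ddp_model`` and
+# ``process_group`` are placeholders.
+class _ExampleFunctionalSGD:
+    def __init__(self, lr: float = 0.01) -> None:
+        self.lr = lr
+
+    def step_param(self, param: torch.Tensor, grad: torch.Tensor) -> None:
+        # Plain in-place SGD update using the bucket-provided gradient.
+        if grad is not None:
+            param.data.add_(grad, alpha=-self.lr)
+
+
+def _example_hook_then_optimizer(ddp_model, process_group) -> None:
+    from torch.distributed.algorithms.ddp_comm_hooks.default_hooks import (
+        allreduce_hook,
+    )
+
+    optim_state = _OptimizerHookState(_ExampleFunctionalSGD(lr=0.01))
+    ddp_model.register_comm_hook(
+        process_group, _hook_then_optimizer(allreduce_hook, optim_state)
+    )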
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/post_localSGD_hook.py b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/post_localSGD_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4ca0d2868095307a10669168379c109d5fe2990
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/post_localSGD_hook.py
@@ -0,0 +1,123 @@
+import logging
+
+import torch
+import torch.distributed as dist
+
+from . import default_hooks as default
+
+logger = logging.getLogger(__name__)
+
+
+class PostLocalSGDState:
+    r"""
+    Store state for all-reducing gradients globally until given step, then locally after.
+
+    Stores the state for all-reducing gradients globally using ``process_group`` until step ``start_localSGD_iter``,
+    and all-reducing gradients locally using ``subgroup`` afterwards.
+
+    If ``process_group`` is ``None``, the global process group will be used.
+    If ``subgroup`` is ``None``, the intra-node process group on each machine will be used.
+
+    Additionally, ``post_local_gradient_allreduce`` may be worth tuning,
+    because either value may lead to faster convergence depending on the workload.
+    """
+
+    __slots__ = [
+        "process_group",
+        "subgroup",
+        "start_localSGD_iter",
+        "post_local_gradient_allreduce",
+        "iter",
+    ]
+
+    def __init__(
+        self,
+        process_group,
+        subgroup,
+        start_localSGD_iter,
+        post_local_gradient_allreduce=True,
+    ):
+        """Initialize state object with given parameters and log when localSGD start."""
+        logger.info(
+            "Local SGD will be started after %s iterations", start_localSGD_iter
+        )
+
+        # The group used for all-reducing gradients globally.
+        self.process_group = process_group
+        # The group used for all-reducing gradients locally.
+        self.subgroup = subgroup
+        self.start_localSGD_iter = start_localSGD_iter
+        # Allreduce gradients locally starting from iteration `start_localSGD_iter`.
+        # This may help with the convergence efficiency at the cost of relatively cheap intra-subgroup communication.
+        self.post_local_gradient_allreduce = post_local_gradient_allreduce
+        # Iteration/step in the training loop.
+        self.iter = 0
+
+    def maybe_increase_iter(self, bucket):
+        """Track iterations and trigger log message at start of local SGD."""
+        # Since bucket 0 is the last bucket to be allreduced in an iteration,
+        # only increase `iter` when bucket 0 is processed.
+        if bucket.is_last():
+            self.iter += 1
+
+        if self.iter == self.start_localSGD_iter:
+            logger.info(
+                "Start to apply local SGD after %s iterations.", self.iter
+            )
+
+def post_localSGD_hook(
+    state: PostLocalSGDState, bucket: dist.GradBucket
+) -> torch.futures.Future[torch.Tensor]:
+    """
+    Run post-localSGD algorithm.
+
+    This DDP communication hook is used to run the post-localSGD algorithm,
+    in combination with a model averaging component (e.g.,
+    :class:`~torch.distributed.algorithms.model_averaging.averagers.PeriodicModelAverager`)
+    that runs after the optimizer step.
+
+    Args:
+        state (PostLocalSGDState): State information to run post-localSGD.
+            Users mainly need to tune ``start_localSGD_iter`` to determine when to start local SGD.
+        bucket (dist.GradBucket): Bucket that stores a 1D flattened gradient tensor that batches multiple per-variable tensors.
+            Note that since DDP comm hook only supports single process single device mode,
+            only exactly one tensor is stored in this bucket.
+
+    Returns:
+        Future handler of the communication, which updates the gradients in place.
+
+    Example::
+        >>> # xdoctest: +SKIP
+        >>> state = PostLocalSGDState(process_group=process_group, subgroup=subgroup,
+                                  start_localSGD_iter=10)
+        >>> ddp_model.register_comm_hook(state, post_localSGD_hook)
+        >>> # Also need to establish a model averaging module and run model averaging after ``optimizer.step()``.
+        >>> # Please refer to the examples in ``torch.distributed.algorithms.model_averaging.averagers`` module.
+    """
+    global_group_to_use = (
+        state.process_group if state.process_group is not None else dist.group.WORLD
+    )
+
+    # The input tensor is a flattened 1D tensor.
+    input_tensor = bucket.buffer()
+
+    # Run allreduce using `global_group_to_use` in the first `start_localSGD_iter` iterations.
+    if state.iter < state.start_localSGD_iter:
+        state.maybe_increase_iter(bucket)
+        return default._allreduce_fut(global_group_to_use, input_tensor)
+
+    # If `post_local_gradient_allreduce` is not set,
+    # then no gradient synchronization is performed after the first `start_localSGD_iter` iterations.
+    if not state.post_local_gradient_allreduce:
+        fut: torch.futures.Future[torch.Tensor] = torch.futures.Future()
+        fut.set_result(input_tensor)
+        return fut
+
+    # Run allreduce using `subgroup` after the first `start_localSGD_iter` iterations.
+    # Note that by default, a separate subgroup for each node is created which
+    # causes an intra-node allreduce to be done at each training step.
+    # From this moment, model averaging should run after the optimizer step,
+    # to globally allreduce all the parameters.
+    if state.subgroup is None:
+        state.subgroup, _ = dist.new_subgroups()
+    return default._allreduce_fut(state.subgroup, input_tensor)
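+
+
+# Illustrative sketch (documentation aid, not part of the upstream module): one
+# plausible way to combine the pieces referenced in the docstring above. It
+# assumes ``PeriodicModelAverager`` takes ``period``/``warmup_steps`` arguments
+# as in this snapshot; the model, optimizer, loss, and data loop are
+# hypothetical placeholders.
+def _example_post_localSGD_training(ddp_model, optimizer, loss_fn, data) -> None:
+    from torch.distributed.algorithms.model_averaging.averagers import (
+        PeriodicModelAverager,
+    )
+
+    state = PostLocalSGDState(
+        process_group=None, subgroup=None, start_localSGD_iter=100
+    )
+    ddp_model.register_comm_hook(state, post_localSGD_hook)
+    # Average parameters across the global group every 4 steps once the
+    # warm-up (global allreduce) phase has ended.
+    averager = PeriodicModelAverager(period=4, warmup_steps=100)
+
+    for inputs, targets in data:
+        optimizer.zero_grad()
+        loss_fn(ddp_model(inputs), targets).backward()
+        optimizer.step()
+        averager.average_parameters(ddp_model.parameters())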
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..364cccec56fe28a0bb4451fd0d4f3a1565f49d10
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py
@@ -0,0 +1,850 @@
+from collections import defaultdict
+import logging
+import math
+from typing import Dict
+
+import torch
+import torch.distributed as dist
+
+from . import default_hooks as default
+from torch.distributed import distributed_c10d
+
+__all__ = [
+    "PowerSGDState", "powerSGD_hook", "batched_powerSGD_hook"
+]
+
+logger = logging.getLogger(__name__)
+
+
+def _orthogonalize(matrices, epsilon=0):
+    """
+    Decide between Gram-Schmidt and QR factorization to orthogonalize a batch of matrices.
+
+    QR factorization doesn't work with half-precision, but it is usually faster with a rank > 2.
+    """
+    assert len(matrices.shape) == 3 and matrices.shape[2] <= matrices.shape[1]
+
+    num_matrices = matrices.shape[0]
+    rank = matrices.shape[2]
+    dtype = matrices.dtype
+    if rank <= 2 or dtype in [torch.float16, torch.bfloat16]:
+        _orthogonalize_gram_schmidt(matrices, epsilon=epsilon)
+    else:
+        torch.linalg.qr(
+            matrices,
+            out=(
+                matrices,
+                torch.empty(num_matrices, rank, rank, device=matrices.device, dtype=dtype)
+            )
+        )
+
+def _orthogonalize_gram_schmidt(matrices, epsilon=0):
+    """
+    Apply Gram-Schmidt procedure to orthogonalize a batch of matrices.
+
+    If epsilon is 0, this is equivalent to `torch.qr(matrices, out=(matrices, _))`.
+    """
+    num_cols = matrices.shape[2]
+    for i in range(num_cols):
+        # Normalize the i'th column.
+        col = matrices[:, :, i : i + 1]
+        # If no epsilon is added here, division by zero may be caused by vanishing gradients.
+        # This epsilon is not needed if the input batch of matrices covers the gradients of at least one entire layer
+        # in the neural network.
+        if epsilon == 0:
+            # Note that col ** 2 can underflow/overflow if we use FP16.
+            # May need to consider multiplying a scaling factor and dividing it later, or using bfloat16 instead.
+            try:
+                col /= torch.norm(col, dim=1, keepdim=True)
+            except ZeroDivisionError:
+                logger.error(
+                    "The matrices to be orthogonalized has at least a column of all 0s. Please set a small value such as 1e-8 "
+                    "as `orthogonalization_epsilon` in PowerSGD state."
+                )
+                # Recover the values from NaNs to 0s.
+                col.fill_(0.0)
+        else:
+            col /= torch.norm(col, dim=1, keepdim=True) + epsilon
+        # Project it on the rest and remove it.
+        if i + 1 < num_cols:
+            rest = matrices[:, :, i + 1 :]
+            rest -= torch.sum(col * rest, dim=1, keepdim=True) * col
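+
+
+# Illustrative sketch (documentation aid, not part of the upstream module): a
+# quick sanity check of the Gram-Schmidt routine above. After orthogonalization,
+# each matrix Q in the batch should satisfy Q^T Q ~= I up to numerical tolerance.
+def _example_check_orthogonalization() -> bool:
+    matrices = torch.randn(4, 64, 8)  # batch of 4 matrices with 64 rows, rank 8
+    _orthogonalize_gram_schmidt(matrices)
+    identity = torch.eye(8).expand(4, 8, 8)
+    return bool(
+        torch.allclose(matrices.transpose(1, 2) @ matrices, identity, atol=1e-4)
+    )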
+
+
+def _should_compress(
+    num_rows, num_cols, matrix_approximation_rank, min_compression_rate
+):
+    """
+    Recommend whether the given tensor is worth compressing.
+
+    Returns a recommendation as to whether the 2D tensor described by the arguments is worth compressing,
+    including statistics describing the expected savings from compression.  We consider a tensor worth
+    compressing when ``min_compression_rate`` < uncompressed size / compressed size, where
+    uncompressed size = ``num_rows`` * ``num_cols``,
+    and compressed size = (``num_rows`` + ``num_cols``) * ``matrix_approximation_rank``.
+
+    The result of this function is a tuple of the form (compression_recommendation, uncompressed_el_count, compressed_el_count), where:
+
+    compression_recommendation is true if the tensor is worth compressing, and false otherwise (see above);
+
+    uncompressed_el_count is the uncompressed element count, i.e. ``num_rows`` * ``num_cols``; and,
+
+    compressed_el_count is the element count after compression, i.e. (``num_rows`` + ``num_cols``) * ``matrix_approximation_rank``.
+    """  # noqa: B950
+    uncompressed_size = num_rows * num_cols
+    compressed_size = (num_rows + num_cols) * matrix_approximation_rank
+    return (
+        compressed_size * min_compression_rate < uncompressed_size,
+        uncompressed_size,
+        compressed_size,
+    )
+
+
+def _report_compression_stats(bucket, state):
+    """Report compression stats at frequency of ``compression_stats_logging_frequency`` specified in PowerSGD state."""
+    if (
+        bucket.is_last()
+        and state.iter >= state.next_stats_report
+    ):
+        stats = state.compression_stats()
+        logger.info(
+            "Compression stats: iter %s, total before compression %s, total after compression %s, "
+            "rate %s", state.iter, stats[1], stats[2], stats[0]
+        )
+        state.next_stats_report = state.iter + state.compression_stats_logging_frequency
+
+
+class PowerSGDState:
+    r"""
+    Store both the algorithm's hyperparameters and internal state for all gradients during training.
+
+    Particularly, ``matrix_approximation_rank`` and ``start_powerSGD_iter`` are the main hyperparameters that should be tuned by the user.
+    For performance, we suggest keeping the binary hyperparameters ``use_error_feedback`` and ``warm_start`` on.
+
+    1. ``matrix_approximation_rank`` controls the size of compressed low-rank tensors, which determines the compression rate. The lower the rank, the stronger the compression.
+
+        1.1. If ``matrix_approximation_rank`` is too low, the full model quality will need more training steps to reach, or will never be reached, yielding a loss in accuracy.
+
+        1.2. Increasing ``matrix_approximation_rank`` can substantially increase the computation costs of the compression, and the accuracy may not improve further beyond a certain ``matrix_approximation_rank`` threshold.
+
+    To tune ``matrix_approximation_rank``, we suggest starting from 1 and increasing by factors of 2 (like an exponential grid search: 1, 2, 4, ...), until a satisfactory accuracy is reached. Typically only a small value in 1-4 is used. For some NLP tasks (as shown in Appendix D of the original paper), this value has been increased to 32.
+
+    2. ``start_powerSGD_iter`` defers PowerSGD compression until step ``start_powerSGD_iter``, and vanilla allreduce runs prior to step ``start_powerSGD_iter``. This hybrid scheme of **vanilla allreduce + PowerSGD** can effectively improve the accuracy, even when a relatively small ``matrix_approximation_rank`` is used. This is because the beginning of the training phase is usually very sensitive to inaccurate gradients, and compressing gradients too early may make the training quickly take a suboptimal trajectory, which can result in an irrecoverable impact on the accuracy.
+
+    To tune ``start_powerSGD_iter``, we suggest starting with 10% of the total training steps and increasing it until a satisfactory accuracy is reached. If there is a warm-up stage in the training, ``start_powerSGD_iter`` typically should be no less than the number of warm-up steps.
+
+    3. ``min_compression_rate`` is the minimum compression rate required when a layer is compressed. Due to the computation overheads incurred by the compression, a tensor is worth compressing only if there can be sufficient saving in bandwidth, where ``(num_rows + num_cols) * matrix_approximation_rank * min_compression_rate < num_rows * num_cols``. If the specified compression rate threshold cannot be satisfied, the tensor will be directly allreduced without compression.
+
+    Compression statistics are logged every ``compression_stats_logging_frequency`` iterations once PowerSGD compression starts.
+
+    4. ``orthogonalization_epsilon`` can be a very small value (e.g., 1e-8) added to every normalized matrix column in orthogonalization step, to prevent div-by-zero error if any column has all 0s. If this can already be prevented (e.g., by batch normalization), an epsilon of 0 is recommended for accuracy.
+
+    5. ``batch_tensors_with_same_shape`` controls whether to compress and decompress tensors with the same shape in a batched operation to achieve higher parallelism. Note that you should also increase the bucket size (i.e., the ``bucket_cap_mb`` arg in the DDP constructor) to make more same-shaped tensors appear in the same bucket; however, this may reduce the overlap between computation and communication, and increase the memory footprint due to stacking the tensors of the same shape. Set to ``True`` if the compression / decompression computation is a bottleneck.
+
+    .. warning ::
+        If error feedback or warm-up is enabled, the minimum value of ``start_powerSGD_iter`` allowed in DDP is 2.
+        This is because there is another internal optimization that rebuilds buckets at iteration 1 in DDP,
+        and this can conflict with any tensor memorized before the rebuild process.
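+
+    Example::
+        >>> # xdoctest: +SKIP
+        >>> # A configuration sketch following the tuning guidance above (the values are
+        >>> # illustrative, not prescriptive): small rank, compression deferred past the
+        >>> # warm-up phase, error feedback and warm start kept on.
+        >>> state = PowerSGDState(
+        ...     process_group=None,  # the hooks fall back to the world process group
+        ...     matrix_approximation_rank=2,
+        ...     start_powerSGD_iter=1_000,
+        ...     min_compression_rate=2,
+        ...     use_error_feedback=True,
+        ...     warm_start=True,
+        ... )
+        >>> ddp_model.register_comm_hook(state, powerSGD_hook)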
+    """  # noqa: B950
+
+    __slots__ = [
+        "process_group",
+        # The fields below are the hyperparameters that often need to be tuned by the user.
+        "matrix_approximation_rank",
+        "start_powerSGD_iter",
+        # The fields below are the hyperparameters that seldom need to be tuned by the user.
+        "min_compression_rate",
+        "orthogonalization_epsilon",
+        # The fields below are the binary hyperparameters recommended to be turned on for performance and accuracy.
+        "use_error_feedback",
+        "warm_start",
+        "batch_tensors_with_same_shape",
+        # The fields below are internal state.
+        "rng",
+        "error_dict",
+        "p_memory_dict",
+        "q_memory_dict",
+        "iter",
+        # The fields below are for recording compression stats.
+        "total_numel_before_compression",
+        "total_numel_after_compression",
+        "compression_stats_logging_frequency",
+        "next_stats_report",
+    ]
+
+    def __init__(
+        self,
+        process_group,
+        matrix_approximation_rank=1,
+        start_powerSGD_iter=1_000,
+        min_compression_rate=2,
+        use_error_feedback=True,
+        warm_start=True,
+        orthogonalization_epsilon=0,
+        random_seed=0,
+        compression_stats_logging_frequency=10_000,
+        batch_tensors_with_same_shape: bool = False,
+    ):
+        logger.info(
+            "PowerSGD config: matrix_approximation_rank = %s; start_powerSGD_iter = %s; "
+            "min_compression_rate = %s; orthogonalization_epsilon = %s; use_error_feedback = %s; warm_start = %s; "
+            "random_seed = %s; compression_stats_logging_frequency = %s; batch_tensors_with_same_shape = %s",
+            matrix_approximation_rank,
+            start_powerSGD_iter,
+            min_compression_rate,
+            orthogonalization_epsilon,
+            use_error_feedback,
+            warm_start,
+            random_seed,
+            compression_stats_logging_frequency,
+            batch_tensors_with_same_shape,
+        )
+
+        self.process_group = process_group
+        self.matrix_approximation_rank = matrix_approximation_rank
+        # Deferring PowerSGD compression until step 'start_powerSGD_iter' can have two advantages:
+        # 1) It turns out that PowerSGD may lead to a non-trivial accuracy loss,
+        # even if the matrix approximation rank is increased to a large value.
+        # To mitigate the accuracy loss, a simple yet effective way is mixing vanilla allreduce
+        # (or a more conservative compression such as FP16 compression) with PowerSGD.
+        # 2) There is an internal optimization of rebuilding buckets process in DDP,
+        # in order to save the memory space.
+        # This step takes place after the first iteration.
+        # However, this means that the shape of input bucketized tensors is subject to change,
+        # which will complicate the implementations of error feedback and warm-up.
+        # Running vanilla allreduce in the first few iterations can avoid this complexity.
+        if (use_error_feedback or warm_start) and start_powerSGD_iter <= 1:
+            raise ValueError(
+                "Expect `start_powerSGD_iter` > 1 if `use_error_feedback` or `warm_start` is enabled, "
+                "because PowerSGD can only be applied after the first two iterations in DDP."
+            )
+        self.start_powerSGD_iter = start_powerSGD_iter
+        self.min_compression_rate = min_compression_rate
+        # Error feedback is usually crucial for both convergence and generalization,
+        # because PowerSGD is a biased compressor,
+        # i.e., compressing and decompressing a random gradient does not yield the original in expectation.
+        # This mechanism requires a temporary copy of the input gradients,
+        # so it increases the peak memory consumption by the size of the gradient tensor.
+        # However, if the target matrices are known to be exactly low-ranked (instead of just low stable rank),
+        # sometimes it is possible to converge to the optima without error feedback.
+        # See: http://proceedings.mlr.press/v54/yurtsever17a/yurtsever17a.pdf
+        self.use_error_feedback = use_error_feedback
+        # Warm-start reuses P(s) and Q(s) from the previous iteration.
+        # This can improve the approximation quality and hence improve the accuracy.
+        # Additionally, by avoiding the initialization of these low-rank tensors at every step,
+        # this can also accelerate training.
+        # However, this is at the cost of extra memory.
+        self.warm_start = warm_start
+        # Can use a very small value to prevent div-by-zero error caused by orthogonalization of vanishing gradients.
+        self.orthogonalization_epsilon = orthogonalization_epsilon
+        # The purpose of this RNG is to generate different random seeds for initializing Q across iterations,
+        # but in the same order for all the DDP replicas.
+        # Different random seeds across iterations indicate different 'projections' of the gradients at different SGD steps.
+        # If the same random projection is used,
+        # there will be differences between the gradients that are never synchronized.
+        import numpy as np
+        self.rng = np.random.RandomState(random_seed)
+        # Since there is only a single state instance for all the input buckets,
+        # need to maintain a dictionary that maps each bucket index to the local error.
+        self.error_dict: Dict[int, torch.Tensor] = {}
+        self.p_memory_dict: Dict[int, torch.Tensor] = {}
+        self.q_memory_dict: Dict[int, torch.Tensor] = {}
+        # Iteration/step in the training loop.
+        self.iter = 0
+        # Compression stats accumulators
+        self.total_numel_before_compression = 0
+        self.total_numel_after_compression = 0
+        # We'll report compression stats every 'compression_stats_logging_frequency' iterations
+        # Note that we always report compression stats at least once.
+        self.compression_stats_logging_frequency = max(
+            1, compression_stats_logging_frequency
+        )
+        self.next_stats_report = 0
+        # Batching tensors with same shape can increase parallelism in compression / decompression computation.
+        # This requires a larger bucket size to make more same-shaped tensors appear in one bucket; however,
+        # this may reduce the overlap between computation and communication, and increase the memory footprint
+        # due to stacking tensors.
+        # Turn on if compression / decompression computation is a bottleneck.
+        self.batch_tensors_with_same_shape = batch_tensors_with_same_shape
+
+    def __getstate__(self):
+        r"""
+        Return a ``Dict[str, Any]`` which will be pickled and saved.
+
+        ``process_group`` is not serializable and is excluded from
+        the returned state.
+        """
+        logger.warning(
+            "NOTE: Process group is not serializable and excluded from a saved state."
+        )
+        return {
+            slot: getattr(self, slot)
+            for slot in self.__slots__ if slot != "process_group"
+        }
+
+    def __setstate__(self, state):
+        r"""
+        Take a provided ``state`` and set it on this ``PowerSGDState`` instance.
+
+        ``process_group`` is set to the default group.
+        """
+        self.process_group = distributed_c10d._get_default_group()
+        logger.warning(
+            "NOTE: Process group will be set to a default group (i.e. the world size).\
+                If a different group is desired, please set `self.process_group` after PowerSGD state is loaded."
+        )
+        for slot, value in state.items():
+            setattr(self, slot, value)
+
+    def maybe_increase_iter(self, bucket):
+        """Track iterations and trigger log message at start of local SGD."""
+        # Since bucket 0 is the last bucket to allreduce in an iteration.
+        # Only increase `iter` when bucket 0 is processed.
+        if bucket.is_last():
+            self.iter += 1
+
+        if self.iter == self.start_powerSGD_iter:
+            logger.info(
+                "Start to apply PowerSGD after %s iterations.", self.iter
+            )
+
+    def compression_stats(self):
+        r"""
+        Return the latest compression statistics as a tuple.
+
+        Returns a tuple of the form (compress_rate, numel_before_compression, numel_after_compression), where:
+
+        compress_rate is the effective compression rate i.e. (number of elements before compression) / (number of elements after compression);
+
+        numel_before_compression is the total number of elements before compression was applied; and,
+
+        numel_after_compression is the total number of elements after compression was applied.
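+
+        Example::
+            >>> # xdoctest: +SKIP
+            >>> # Illustrative usage: unpack the running statistics once PowerSGD has started.
+            >>> compress_rate, numel_before, numel_after = state.compression_stats()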
+        """  # noqa: B950
+        compress_rate = (
+            self.total_numel_before_compression / self.total_numel_after_compression
+            if self.total_numel_after_compression > 0
+            else 0
+        )
+        return (
+            compress_rate,
+            self.total_numel_before_compression,
+            self.total_numel_after_compression,
+        )
+
+
+def powerSGD_hook(
+    state: PowerSGDState, bucket: dist.GradBucket
+) -> torch.futures.Future[torch.Tensor]:
+    r"""
+    Implement PowerSGD algorithm.
+
+    This DDP communication hook implements PowerSGD gradient compression
+    algorithm described in the `paper <https://arxiv.org/abs/1905.13727>`_.
+    Once gradient tensors are aggregated across all workers, this hook applies
+    compression as follows:
+
+    1. Views the input flattened 1D gradient tensor as a list of per-parameter tensors, and divides all the tensors into two groups:
+
+        1.1 The tensors that should be compressed before allreduce, because the compression can give enough savings in bandwidth.
+
+        1.2 The rest of the tensors will be allreduced directly without compression, including all the vector tensors (for biases).
+
+    2. Handles uncompressed tensors:
+
+        2.1. Allocates contiguous memory for those uncompressed tensors, and allreduces them as a batch, without compression;
+
+        2.2. Copies the individual uncompressed tensors from the contiguous memory back to the input tensor.
+
+    3. Handles the tensors that should be compressed by PowerSGD compression:
+
+        3.1. For each tensor M, creates two low-rank tensors P and Q for decomposing M,
+        such that M = PQ^T, where Q is initialized from a standard normal distribution and orthogonalized;
+
+        3.2. Computes each P in Ps, which is equal to MQ;
+
+        3.3. Allreduces Ps as a batch;
+
+        3.4. Orthogonalizes each P in Ps;
+
+        3.5. Computes each Q in Qs, which is approximately equal to M^TP;
+
+        3.6. Allreduces Qs as a batch;
+
+        3.7. Computes each M among all the compressed tensors, which is approximately equal to PQ^T.
+
+    Note that this communication hook enforces vanilla allreduce for the first ``state.start_powerSGD_iter`` iterations.
+    This not only gives the user more control over the tradeoff between speedup and accuracy,
+    but also helps abstract away some complexity of the internal optimization of DDP for future communication hook developers.
+
+    Args:
+        state (PowerSGDState): State information to configure the compression rate and support error feedback, warm start, etc.
+            To tune the compression configs, mainly need to tune ``matrix_approximation_rank``, ``start_powerSGD_iter``
+            and ``min_compression_rate``.
+        bucket (dist.GradBucket): Bucket that stores a 1D flattened gradient tensor that batches multiple per-variable tensors.
+            Note that since DDP comm hook only supports single process single device mode,
+            only exactly one tensor is stored in this bucket.
+
+    Returns:
+        Future handler of the communication, which updates the gradients in place.
+
+    Example::
+        >>> # xdoctest: +SKIP
+        >>> state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1,
+                                  start_powerSGD_iter=10, min_compression_rate=0.5)
+        >>> ddp_model.register_comm_hook(state, powerSGD_hook)
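+        >>>
+        >>> # Conceptual single-process sketch of steps 3.1-3.7 above for one matrix M
+        >>> # (communication omitted; illustrative only, not the hook's actual code path):
+        >>> M = torch.randn(64, 32)
+        >>> Q = torch.randn(32, 1)          # matrix_approximation_rank = 1
+        >>> _orthogonalize(Q.unsqueeze(0))  # 3.1: orthogonalize the random Q
+        >>> P = M @ Q                       # 3.2: P = MQ (3.3: allreduce Ps)
+        >>> _orthogonalize(P.unsqueeze(0))  # 3.4: orthogonalize P
+        >>> Q = M.t() @ P                   # 3.5: Q ~= M^T P (3.6: allreduce Qs)
+        >>> M_approx = P @ Q.t()            # 3.7: M ~= P Q^T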
+    """  # noqa: B950
+    process_group = state.process_group
+    group_to_use = process_group if process_group is not None else dist.group.WORLD
+    world_size = group_to_use.size()
+
+    # The input tensor is a flattened 1D tensor.
+    input_tensor = bucket.buffer()
+
+    # Run vanilla allreduce in the first `start_powerSGD_iter` iterations.
+    if state.iter < state.start_powerSGD_iter:
+        state.maybe_increase_iter(bucket)
+        return default._allreduce_fut(group_to_use, input_tensor)
+
+    # Apply PowerSGD after `start_powerSGD_iter` iterations.
+    device = input_tensor.device
+    dtype = input_tensor.dtype
+
+    # Incorporate the error from the previous state into the gradients.
+    bucket_index = bucket.index()
+    input_tensor_cp = None
+    total_length = input_tensor.shape[0]
+    if state.use_error_feedback:
+        if bucket_index in state.error_dict:
+            input_tensor.add_(state.error_dict[bucket_index])
+        else:
+            logger.info(
+                "A zero tensor of length %s that represents local error is created.",
+                total_length
+            )
+            state.error_dict[bucket_index] = torch.zeros(
+                total_length, device=device, dtype=dtype
+            )
+
+        # Keep a copy of the input tensor,
+        # so that we can compute the local error caused by compression later,
+        # by comparing this copy and the input tensor updated after decompression.
+        input_tensor_cp = torch.clone(input_tensor).detach()
+
+    # Unflatten the input tensor into per-parameter tensors, for layer-wise compression.
+    tensors = bucket.gradients()
+
+    # Step I: Divide all the tensors into two groups,
+    # one will be compressed before allreduce and the other will be directly allreduced without compression.
+    tensors_to_compress, uncompressed_tensors = [], []
+    total_Ps_size = 0
+    total_Qs_size = 0
+    for tensor in tensors:
+        matrix = tensor.view(tensor.shape[0], -1)
+        n, m = matrix.shape
+        matrix_approximation_rank = min(n, m, state.matrix_approximation_rank)
+        compress_test = _should_compress(
+            n, m, matrix_approximation_rank, state.min_compression_rate
+        )
+        state.total_numel_before_compression += compress_test[1]
+        if compress_test[0]:
+            tensors_to_compress.append(matrix)
+            total_Ps_size += n * matrix_approximation_rank
+            total_Qs_size += m * matrix_approximation_rank
+            state.total_numel_after_compression += compress_test[2]
+        else:
+            uncompressed_tensors.append(tensor)
+            state.total_numel_after_compression += compress_test[1]
+
+    _report_compression_stats(bucket, state)
+
+    # Step II: Handle uncompressed tensors.
+    # Allocate contiguous memory for these tensors to allreduce efficiently.
+    uncompressed_tensors_memory = (
+        torch.cat([tensor.view(-1) for tensor in uncompressed_tensors])
+        if uncompressed_tensors
+        else torch.tensor([], device=device, dtype=dtype)
+    )
+
+    # Step III: Handle the tensors that should be compressed.
+    # Allocate contiguous memory for Ps and Qs to allreduce efficiently.
+    # If warm-start is enabled, reuse Ps and Qs from the previous iteration if possible.
+    # The memory spaces of Ps and Qs need to be allocated in the first iteration when PowerSGD is applied.
+    need_randomize_qs = False
+    if not state.warm_start or bucket_index not in state.p_memory_dict:
+        need_randomize_qs = True
+        # If warm-start is disabled, low-rank tensors will be initialized at every step.
+        # Only log this if warm-start is enabled, to avoid spamming.
+        if state.warm_start:
+            logger.info(
+                "Allocating contiguous memory of length %s for Ps, and of length %s for Qs, respectively.",
+                total_Ps_size, total_Qs_size
+            )
+        state.p_memory_dict[bucket_index] = torch.empty(
+            total_Ps_size, device=device, dtype=dtype
+        )
+        state.q_memory_dict[bucket_index] = torch.empty(
+            total_Qs_size, device=device, dtype=dtype
+        )
+
+    # Batch tensors to compress by shape.
+    shape_to_tensors = defaultdict(list)
+    for tensor in tensors_to_compress:
+        shape_to_tensors[tensor.shape].append(tensor)
+
+    # This function decides whether to batch tensors with the same shape according to `state.batch_tensors_with_same_shape`,
+    # so that the following process can share the same code.
+    def maybe_batched_tensors_to_compress():
+        for tensors in shape_to_tensors.values():
+            if state.batch_tensors_with_same_shape:
+                batch_size = len(tensors)
+                if batch_size == 1:
+                    # Use the original tensor to avoid copy.
+                    yield tensors[0].unsqueeze(0)
+                else:
+                    yield torch.stack(tensors)
+            else:
+                for tensor in tensors:
+                    yield tensor.unsqueeze(0)
+
+    # Create Ps and Qs that point to the allocated memory.
+    tensors_to_compress = []
+    ps = []
+    qs = []
+    p_idx = 0
+    q_idx = 0
+    for tensor in maybe_batched_tensors_to_compress():
+        batch_size, n, m = tensor.shape
+        matrix_approximation_rank = min(n, m, state.matrix_approximation_rank)
+        tensors_to_compress.append(tensor)
+        ps.append(
+            state.p_memory_dict[bucket_index][
+                p_idx : p_idx + batch_size * n * matrix_approximation_rank
+            ].view(batch_size, n, matrix_approximation_rank)
+        )
+        qs.append(
+            state.q_memory_dict[bucket_index][
+                q_idx : q_idx + batch_size * m * matrix_approximation_rank
+            ].view(batch_size, m, matrix_approximation_rank)
+        )
+        p_idx += batch_size * n * matrix_approximation_rank
+        q_idx += batch_size * m * matrix_approximation_rank
+
+    # If warm-start is enabled, reuse Qs from the previous iteration if possible and skip filling random values.
+    # The exception is the first iteration when PowerSGD is applied.
+    if not need_randomize_qs:
+        for q in qs:
+            _orthogonalize(q, state.orthogonalization_epsilon)
+    else:
+        with torch.random.fork_rng(devices=[]):
+            # Fork this RNG to avoid changing the seed globally and affecting the random sampling anywhere else in the training.
+            # The seed makes sure that the initial random values are the same across all the DDP replicas.
+            # This seed should differ at every step.
+            # Since it is very slow to fork RNG state across all the CUDA devices,
+            # only fork on CPU and then move the generated tensor to the CUDA device (by overwriting q).
+            torch.manual_seed(state.rng.randint(1_000_000_000))
+            for q in qs:
+                q.copy_(
+                    torch.randn(
+                        *q.shape,
+                        device="cpu",
+                        dtype=dtype,
+                    )
+                )
+                _orthogonalize(q, state.orthogonalization_epsilon)
+
+    # Compute Ps.
+    for tensor, q, p in zip(tensors_to_compress, qs, ps):
+        torch.bmm(tensor, q, out=p)
+
+    # This allreduce is only applied to uncompressed tensors,
+    # so it should have been kicked off before the above computation on the compressed tensors to hide more communication costs.
+    # However, this somehow requires a separate future chain at this time.
+    allreduce_contiguous_uncompressed_tensors_fut = dist.all_reduce(
+        uncompressed_tensors_memory, group=group_to_use, async_op=True
+    ).get_future()
+
+    def unpack_uncompressed_tensors_and_allreduce_ps(fut):
+        uncompressed_tensors_memory = fut.value()[0].div_(world_size)
+        idx = 0
+        for tensor in uncompressed_tensors:
+            tensor.copy_(
+                uncompressed_tensors_memory[idx : idx + tensor.numel()].view_as(tensor)
+            )
+            idx += tensor.numel()
+
+        # Since these Ps will be orthogonalized later, no need to divide them by world size.
+        return (
+            dist.all_reduce(
+                state.p_memory_dict[bucket_index], group=group_to_use, async_op=True
+            )
+            .get_future()
+            .wait()[0]
+        )
+
+    def compute_qs(fut):
+        state.p_memory_dict[bucket_index] = fut.value()
+        for p in ps:
+            _orthogonalize(p, state.orthogonalization_epsilon)
+
+        # Compute Qs.
+        for tensor, p, q in zip(tensors_to_compress, ps, qs):
+            torch.bmm(tensor.transpose(1, 2), p, out=q)
+
+        # TODO: The above procedure does two matmul+allreduce steps per iteration --
+        # one left multiplication and one right multiplication.
+        # For warm-start, can take one such step at a time, and alternate between them.
+
+        # Allreduce Qs.
+        return (
+            dist.all_reduce(
+                state.q_memory_dict[bucket_index], group=group_to_use, async_op=True
+            )
+            .get_future()
+            .wait()[0]
+        )
+
+    def decompress(fut):
+        state.q_memory_dict[bucket_index] = fut.value().div_(world_size)
+
+        for p, q, tensor in zip(ps, qs, tensors_to_compress):
+            torch.bmm(p, q.transpose(1, 2), out=tensor)
+
+        # Copy batched tensors back to original buffer.
+        if state.batch_tensors_with_same_shape:
+            for tensor in tensors_to_compress:
+                if tensor.shape[0] == 1:
+                    # Skip a tensor with batch_size == 1, since it is a view of the original tensor.
+                    continue
+                original_tensors = shape_to_tensors[tensor.shape[1:]]
+                for i, original_tensor in enumerate(original_tensors):
+                    original_tensor.copy_(tensor[i])
+
+        if torch.cuda.is_available():
+            torch.cuda.synchronize(device)
+
+        if state.use_error_feedback:
+            # Memorize the local errors.
+            state.error_dict[bucket_index] = input_tensor_cp - input_tensor
+        if not state.warm_start:
+            state.p_memory_dict.clear()
+            state.q_memory_dict.clear()
+
+        state.maybe_increase_iter(bucket)
+
+        return input_tensor
+
+    return (
+        allreduce_contiguous_uncompressed_tensors_fut.then(
+            unpack_uncompressed_tensors_and_allreduce_ps
+        )
+        .then(compute_qs)
+        .then(decompress)
+    )
+
+
+def batched_powerSGD_hook(
+    state: PowerSGDState, bucket: dist.GradBucket
+) -> torch.futures.Future[torch.Tensor]:
+    r"""
+    Implement simplified PowerSGD algorithm.
+
+    This DDP communication hook implements a simplified PowerSGD gradient compression
+    algorithm described in the `paper <https://arxiv.org/abs/1905.13727>`_.
+    This variant does not compress the gradients layer by layer,
+    but instead compresses the flattened input tensor that batches all the gradients.
+    Therefore, it is **faster** than :meth:`powerSGD_hook`,
+    but usually results in a **much lower accuracy**, unless ``matrix_approximation_rank`` is 1.
+
+    .. warning ::
+        Increasing ``matrix_approximation_rank`` here may not necessarily increase the accuracy,
+        because batching per-parameter tensors without column/row alignment can destroy low-rank structure.
+        Therefore, the user should always consider :meth:`powerSGD_hook` first,
+        and only consider this variant when a satisfactory accuracy can be achieved when ``matrix_approximation_rank`` is 1.
+
+    Once gradient tensors are aggregated across all workers, this hook applies
+    compression as follows:
+
+    1. Views the input flattened 1D gradient tensor as a square-shaped tensor M with 0 paddings;
+
+    2. Creates two low-rank tensors P and Q for decomposing M, such that M = PQ^T, where Q is initialized from a standard normal distribution and orthogonalized;
+
+    3. Computes P, which is equal to MQ;
+
+    4. Allreduces P;
+
+    5. Orthogonalizes P;
+
+    6. Computes Q, which is approximately equal to M^TP;
+
+    7. Allreduces Q;
+
+    8. Computes M, which is approximately equal to PQ^T.
+
+    9. Truncates the input tensor to the original length.
+
+    Note that this communication hook enforces vanilla allreduce for the first ``state.start_powerSGD_iter`` iterations.
+    This not only gives the user more control over the tradeoff between speedup and accuracy,
+    but also helps abstract away some complexity of the internal optimization of DDP for future communication hook developers.
+
+    Args:
+        state (PowerSGDState): State information to configure the compression rate and support error feedback, warm start, etc.
+            To tune the compression configs, mainly need to tune ``matrix_approximation_rank`` and ``start_powerSGD_iter``.
+        bucket (dist.GradBucket): Bucket that stores a 1D flattened gradient tensor that batches multiple per-variable tensors.
+            Note that since DDP comm hook only supports single process single device mode,
+            only exactly one tensor is stored in this bucket.
+
+    Returns:
+        Future handler of the communication, which updates the gradients in place.
+
+    Example::
+        >>> # xdoctest: +SKIP
+        >>> state = PowerSGDState(process_group=process_group, matrix_approximation_rank=1)
+        >>> ddp_model.register_comm_hook(state, batched_powerSGD_hook)
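+        >>>
+        >>> # Illustrative sketch of the padding in steps 1 and 9 (not the hook's actual code):
+        >>> # a flattened gradient of length 10 is viewed as a 4 x 4 matrix with 6 zero pads,
+        >>> # compressed via P (4 x rank) and Q (4 x rank), then truncated back to length 10.
+        >>> grad = torch.randn(10)
+        >>> total_length = grad.numel()
+        >>> side = math.ceil(math.sqrt(total_length))  # 4
+        >>> grad.resize_(side * side)
+        >>> grad[total_length:].fill_(0)
+        >>> matrix = grad.view(side, side)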
+    """  # noqa: B950
+    process_group = state.process_group
+    group_to_use = process_group if process_group is not None else dist.group.WORLD
+    world_size = group_to_use.size()
+
+    # The input tensor is a flattened 1D tensor.
+    input_tensor = bucket.buffer()
+
+    # Run vanilla allreduce in the first `start_powerSGD_iter` iterations.
+    if state.iter < state.start_powerSGD_iter:
+        state.maybe_increase_iter(bucket)
+        return default._allreduce_fut(group_to_use, input_tensor)
+
+    # Apply PowerSGD after `start_powerSGD_iter` iterations.
+    device = input_tensor.device
+    total_length = input_tensor.shape[0]
+    state.total_numel_before_compression += total_length
+
+    # View the input tensor as a 2D square-shaped tensor, and pad 0s if necessary.
+    square_side_length = math.ceil(math.sqrt(total_length))
+    state.total_numel_after_compression += (
+        square_side_length * state.matrix_approximation_rank * 2
+    )
+    padded_total_length = square_side_length ** 2
+    input_tensor.resize_(padded_total_length)
+    input_tensor[total_length:padded_total_length].fill_(0)
+
+    _report_compression_stats(bucket, state)
+
+    # Incorporate the error from the previous state into the gradients.
+    bucket_index = bucket.index()
+    input_tensor_cp = None
+    if state.use_error_feedback:
+        if bucket_index in state.error_dict:
+            input_tensor.add_(state.error_dict[bucket_index])
+        else:
+            logger.info(
+                "A zero tensor of length %s that represents local error is created.",
+                padded_total_length
+            )
+            state.error_dict[bucket_index] = torch.zeros(
+                padded_total_length, device=device, dtype=input_tensor.dtype
+            )
+
+        # Keep a copy of the input tensor,
+        # so that we can compute the local error caused by compression later,
+        # by comparing this copy and the input tensor updated after decompression.
+        input_tensor_cp = torch.clone(input_tensor).detach()
+    matrix = input_tensor.view(square_side_length, square_side_length)
+
+    # Reuse P and Q from the previous iteration if possible.
+    # The memory spaces of P and Q need to be allocated in the first iteration when PowerSGD is applied.
+    if not state.warm_start or bucket_index not in state.p_memory_dict:
+        # If warm-start is disabled, low-rank tensors will be initialized at every step.
+        # Only log this if warm-start is enabled, to avoid spamming.
+        if state.warm_start:
+            logger.info(
+                "Initializing low-rank tensors P and Q, each of which has a shape of %s x %s.",
+                square_side_length, state.matrix_approximation_rank
+            )
+
+        def create_low_rank_tensor(fill_random_values, rng):
+            """Return a low-rank 2D tensor of square_side_length * matrix_approximation_rank."""
+            if fill_random_values:
+                with torch.random.fork_rng(devices=[]):
+                    # Fork this RNG to avoid changing the seed globally and affecting the random sampling
+                    # anywhere else in the training.
+                    # The seed makes sure that the initial random values are the same across all the DDP replicas.
+                    # This seed should differ at every step.
+                    # Since it is very slow to fork RNG state across all the CUDA devices,
+                    # only fork on CPU and then move the generated tensor to the CUDA device.
+                    torch.manual_seed(rng.randint(1_000_000_000))
+                    return torch.randn(
+                        square_side_length,
+                        state.matrix_approximation_rank,
+                        device="cpu",
+                        dtype=input_tensor.dtype,
+                    ).to(device)
+            else:
+                return torch.empty(
+                    square_side_length,
+                    state.matrix_approximation_rank,
+                    device=device,
+                    dtype=input_tensor.dtype,
+                )
+
+        state.p_memory_dict[bucket_index] = create_low_rank_tensor(
+            fill_random_values=False, rng=state.rng
+        )
+        state.q_memory_dict[bucket_index] = create_low_rank_tensor(
+            fill_random_values=True, rng=state.rng
+        )
+    _orthogonalize(state.q_memory_dict[bucket_index])
+
+    torch.matmul(
+        matrix, state.q_memory_dict[bucket_index], out=state.p_memory_dict[bucket_index]
+    )
+    allreduce_p_fut = dist.all_reduce(
+        state.p_memory_dict[bucket_index], group=group_to_use, async_op=True
+    ).get_future()
+
+    def compute_q(fut):
+        state.p_memory_dict[bucket_index] = fut.value()[0]
+        _orthogonalize(state.p_memory_dict[bucket_index])
+
+        torch.matmul(
+            matrix.t(),
+            state.p_memory_dict[bucket_index],
+            out=state.q_memory_dict[bucket_index],
+        )
+
+        # TODO: The above procedure does two matmul+allreduce steps per iteration --
+        # one left multiplication and one right multiplication.
+        # For warm-start, can take one such step at a time, and alternate between them.
+
+        return (
+            dist.all_reduce(
+                state.q_memory_dict[bucket_index], group=group_to_use, async_op=True
+            )
+            .get_future()
+            .wait()[0]
+        )
+
+    def decompress(fut):
+        state.q_memory_dict[bucket_index] = fut.value().div_(world_size)
+        torch.matmul(
+            state.p_memory_dict[bucket_index],
+            state.q_memory_dict[bucket_index].t(),
+            out=matrix,
+        )
+
+        if state.use_error_feedback:
+            # Memorize the local errors.
+            state.error_dict[bucket_index] = input_tensor_cp - input_tensor
+        # Removing this seemingly unnecessary sync somehow may cause failures.
+        # See: https://github.com/pytorch/pytorch/pull/54838
+        if torch.cuda.is_available():
+            torch.cuda.synchronize(device)
+        if not state.warm_start:
+            state.p_memory_dict.clear()
+            state.q_memory_dict.clear()
+        ret = input_tensor.resize_(total_length)
+
+        state.maybe_increase_iter(bucket)
+
+        return ret
+
+    return allreduce_p_fut.then(compute_q).then(decompress)
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/quantization_hooks.py b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/quantization_hooks.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5091f9cbe918217aa135a46729c191b5320cd82
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/algorithms/ddp_comm_hooks/quantization_hooks.py
@@ -0,0 +1,217 @@
+import torch
+import torch.distributed as dist
+from torch import nn
+
+
+def _quantize_per_tensor_cuda(x, scale, zero_point):
+    y = torch.round(x / scale) + zero_point
+    y = torch.clamp(y, 0, 255).to(torch.uint8)
+    return y
+
+
+def _dequantize_per_tensor_cuda(y, scale, zero_point):
+    x = scale * (y.to(torch.float32) - zero_point)
+    return x
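+
+# Illustrative note (not part of the upstream logic): the two helpers above form an
+# affine round trip.  For a tensor ``x`` with values in [0.0, 2.55], scale=0.01 and
+# zero_point=0, quantization maps each value to the uint8 code round(x / scale), and
+# dequantization recovers it to within half a step:
+#
+#     y = _quantize_per_tensor_cuda(x, 0.01, 0)        # codes clamped to [0, 255]
+#     x_hat = _dequantize_per_tensor_cuda(y, 0.01, 0)  # |x_hat - x| <= 0.5 * scale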
+
+
+def _quantize_per_channel_cuda(x, scale, zero_point):
+    y = torch.zeros(x.size(), device=x.device)
+    for i in range(x.size()[0]):
+        y[i, :] = torch.round(x[i, :] / scale[i]) + zero_point[i]
+    y = torch.clamp(y, 0, 255).to(torch.uint8)
+    return y
+
+
+def _dequantize_per_channel_cuda(y, scale, zero_point):
+    y = y.to(torch.float32).cuda(y.device)
+    x = torch.zeros_like(y, device=y.device)
+    for i in range(x.size()[0]):
+        x[i, :] = scale[i] * (y[i, :] - zero_point[i])
+    return x
+
+
+def _get_allgather_out_list(all_gather_in_list, world_size):
+    out_list = [
+        torch.zeros_like(
+            all_gather_in_list,
+            device=all_gather_in_list.device,
+            dtype=all_gather_in_list.dtype,
+        )
+        for _ in range(world_size)
+    ]
+    return out_list
+
+
+def quantization_pertensor_hook(
+    process_group: dist.ProcessGroup, bucket: dist.GradBucket
+) -> torch.futures.Future[torch.Tensor]:
+    """
+    Apply ``torch.quantize_per_tensor`` logic to DDP using ``allgather`` protocol.
+
+    Workers first allgather the scale and zero point of their own
+    ``GradBucket`` prior to the quantization. After all workers have that information,
+    the first ``then`` callback, ``quantize_and_allgather``, quantizes the worker's
+    own gradient tensor and uses ``allgather`` to communicate these across all workers.
+    The final ``then`` callback, ``dequantize_and_aggregate``, dequantizes and
+    aggregates each quantized gradient tensor locally and returns the mean.
+
+    .. warning ::
+        This is experimental, and uses ``allgather`` protocol which is considerably slower than
+        ``allreduce`` protocol. It works only with flattened grads.
+
+    Example::
+        >>> # xdoctest: +SKIP
+        >>> ddp_model.register_comm_hook(process_group, quantization_pertensor_hook)
+    """
+    group_to_use = process_group if process_group is not None else dist.group.WORLD
+    rank = process_group.rank() if process_group is not None else dist.get_rank()
+    world_size = group_to_use.size()
+
+    tensor = bucket.buffer()
+
+    myObserver = torch.ao.quantization.MinMaxObserver().cuda(tensor.device)
+    myObserver(tensor)
+
+    s, z = myObserver.calculate_qparams()
+    s_and_z = torch.FloatTensor([s, z]).cuda(tensor.device)
+
+    all_ranks_s_and_z = _get_allgather_out_list(s_and_z, world_size)
+
+    # First, allgather scale and zeros.
+    fut = dist.all_gather(
+        all_ranks_s_and_z, s_and_z, group=group_to_use, async_op=True
+    ).get_future()
+
+    def quantize_and_allgather(fut):
+        # Store scale and zeros across all workers.
+        all_ranks_s_and_z = fut.wait()[0]
+        # All workers quantize their own ``GradBucket`` tensors.
+        quantized_tensor = _quantize_per_tensor_cuda(
+            tensor, all_ranks_s_and_z[rank][0], all_ranks_s_and_z[rank][1]
+        )
+        # Allgather quantized tensors.
+        fut = dist.all_gather(
+            _get_allgather_out_list(quantized_tensor, world_size),
+            quantized_tensor,
+            group=group_to_use,
+            async_op=True,
+        ).get_future()
+
+        return fut.wait()
+
+    def dequantize_and_aggregate(fut):
+        all_ranks_quantized_tensor = fut.wait()[0]
+
+        aggregated_dequantized_tensor = torch.zeros_like(
+            all_ranks_quantized_tensor[0], device=tensor.device, dtype=torch.float32
+        )
+        # Using previously allgathered scales and zeros, dequantize gradient tensors
+        # locally and then aggregate them.
+        for r, quantized_tensor in enumerate(all_ranks_quantized_tensor):
+            aggregated_dequantized_tensor += _dequantize_per_tensor_cuda(
+                quantized_tensor, all_ranks_s_and_z[r][0], all_ranks_s_and_z[r][1]
+            )
+
+        return aggregated_dequantized_tensor / world_size
+
+    return fut.then(quantize_and_allgather).then(dequantize_and_aggregate)
+
+
+def quantization_perchannel_hook(
+    process_group: dist.ProcessGroup, bucket: dist.GradBucket, bucket_size=512
+) -> torch.futures.Future[torch.Tensor]:
+    """
+    Apply ``torch.quantize_per_channel`` logic to DDP using ``allgather`` protocol.
+
+    Compared to per-tensor quantization, the main motivation of per-channel quantization is
+    that for considerably large tensors (e.g., a tensor that contains 6 million elements),
+    quantizing per bucket of 512 (or 128) elements may significantly
+    increase the resolution.
+
+    It first splits ``GradBucket`` tensor into multiple chunks (channels) of ``bucket_size``
+    elements. Then, workers allgather the scales and zero points of their own
+    ``GradBucket`` prior to the quantization. After all workers have that information,
+    the first ``then`` callback, ``quantize_and_allgather``, quantizes the worker's
+    own gradient tensor and uses ``allgather`` to communicate these across all workers.
+    The final ``then`` callback, ``dequantize_and_aggregate``, dequantizes, flattens, and
+    aggregates each quantized gradient tensor locally and returns the mean.
+
+    .. warning ::
+        This is experimental, and uses ``allgather`` protocol which is considerably slower than
+        ``allreduce`` protocol. It works only with flattened grads.
+
+    Example::
+        >>> # xdoctest: +SKIP
+        >>> ddp_model.register_comm_hook(process_group, quantization_perchannel_hook)
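+        >>>
+        >>> # Illustrative arithmetic: with the default bucket_size of 512, a gradient
+        >>> # bucket of 6_000_000 elements is zero-padded and split into
+        >>> # ceil(6_000_000 / 512) = 11_719 channels, each quantized with its own
+        >>> # scale and zero point.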
+    """
+    group_to_use = process_group if process_group is not None else dist.group.WORLD
+    rank = process_group.rank() if process_group is not None else dist.get_rank()
+    world_size = group_to_use.size()
+
+    tensor = bucket.buffer()
+
+    tensor_in_channels = (
+        nn.functional.pad(
+            input=tensor,
+            pad=(0, bucket_size - len(tensor) % bucket_size),
+            mode="constant",
+            value=0,
+        )
+        .view(-1, bucket_size)
+        .cuda(tensor.device)
+    )
+
+    myPerChannelObserver = torch.ao.quantization.PerChannelMinMaxObserver().cuda(
+        tensor.device
+    )
+    myPerChannelObserver(tensor_in_channels)
+
+    s_ch, z_ch = myPerChannelObserver.calculate_qparams()
+    s_and_z = torch.stack((s_ch, z_ch)).cuda(tensor.device)
+
+    all_ranks_s_and_z = _get_allgather_out_list(s_and_z, world_size)
+    # First, allgather scale and zeros.
+    fut = dist.all_gather(
+        all_ranks_s_and_z, s_and_z, group=group_to_use, async_op=True
+    ).get_future()
+
+    def quantize_and_allgather(fut):
+        # Store scale and zeros across all workers.
+        all_ranks_s_and_z = fut.wait()[0]
+        # All workers quantize their corresponding ``GradBucket`` tensors.
+        quantized_tensor = _quantize_per_channel_cuda(
+            tensor_in_channels,
+            all_ranks_s_and_z[rank, 0, :],
+            all_ranks_s_and_z[rank, 1, :],
+        )
+        # Allgather quantized tensors.
+        fut = dist.all_gather(
+            _get_allgather_out_list(quantized_tensor, world_size),
+            quantized_tensor,
+            group=group_to_use,
+            async_op=True,
+        ).get_future()
+
+        return fut.wait()
+
+    def dequantize_and_aggregate(fut):
+        all_ranks_quantized_tensor = fut.wait()[0]
+
+        aggregated_dequantized_tensor = torch.zeros_like(
+            all_ranks_quantized_tensor[0], device=tensor.device, dtype=torch.float32
+        )
+        # Using previously allgathered scales and zeros, dequantize gradient tensors
+        # locally and then aggregate them.
+        for r, quantized_tensor in enumerate(all_ranks_quantized_tensor):
+            aggregated_dequantized_tensor += _dequantize_per_channel_cuda(
+                quantized_tensor, all_ranks_s_and_z[r][0], all_ranks_s_and_z[r][1]
+            )
+
+        return (
+            torch.flatten(aggregated_dequantized_tensor).cuda(tensor.device)[
+                : tensor.size()[0]
+            ]
+            / world_size
+        )
+
+    return fut.then(quantize_and_allgather).then(dequantize_and_aggregate)
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/join.py b/MLPY/Lib/site-packages/torch/distributed/algorithms/join.py
new file mode 100644
index 0000000000000000000000000000000000000000..bab93d0479bcad17779e74dcf79f0db1ef4a85d8
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/algorithms/join.py
@@ -0,0 +1,346 @@
+import warnings
+from abc import ABC, abstractmethod
+from types import TracebackType
+from typing import Any, List, NamedTuple, Optional, Type
+
+import torch
+import torch.distributed as dist
+
+__all__ = ['JoinHook', 'Joinable', 'Join']
+
+class JoinHook:
+    r"""
+    This defines a join hook, which provides two entry points in the join context manager.
+
+    Entry points: a main hook, which is called repeatedly while there exists a non-joined
+    process, and a post-hook, which is called once all processes have joined.
+
+    To implement a join hook for the generic join context manager, define a
+    class that inherits from :class:`JoinHook` and override ``main_hook()`` and
+    ``post_hook()`` as appropriate.
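+
+    Example::
+        >>> # xdoctest: +SKIP
+        >>> # A minimal illustrative hook (hypothetical class, not part of this module):
+        >>> # its main hook shadows one all-reduce that non-joined processes issue each
+        >>> # iteration, and its post-hook does nothing.
+        >>> class _ShadowAllreduceJoinHook(JoinHook):
+        ...     def __init__(self, device, process_group):
+        ...         self.device = device
+        ...         self.process_group = process_group
+        ...     def main_hook(self) -> None:
+        ...         dist.all_reduce(torch.zeros(1, device=self.device), group=self.process_group)
+        ...     def post_hook(self, is_last_joiner: bool) -> None:
+        ...         pass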
+    """
+
+    def main_hook(self) -> None:
+        r"""Call this hook while there exists a non-joined process to shadow collective communications in a training iteration.
+
+        A training iteration comprises one forward pass, one backward pass, and an optimizer step.
+        """
+        ...
+
+    def post_hook(self, is_last_joiner: bool) -> None:
+        r"""
+        Call hook after all processes have joined.
+
+        It is passed an additional ``bool`` argument ``is_last_joiner``, which indicates if the rank is one of the last to join.
+
+        Arguments:
+            is_last_joiner (bool): ``True`` if the rank is one of the last to
+                join; ``False`` otherwise.
+        """
+        ...
+
+
+class Joinable(ABC):
+    r"""
+    This defines an abstract base class for joinable classes.
+
+    A joinable class
+    (inheriting from :class:`Joinable`) should implement :meth:`join_hook`,
+    which returns a :class:`JoinHook` instance, in addition to
+    :meth:`join_device` and :meth:`join_process_group` that return device and
+    process group information, respectively.
+    """
+
+    @abstractmethod
+    def __init__(self):
+        super().__init__()
+        self._join_config = _JoinConfig.construct_disabled_join_config()
+
+    @abstractmethod
+    def join_hook(self, **kwargs) -> JoinHook:
+        r"""
+        Return a :class:`JoinHook` instance for the given :class:`Joinable`.
+
+        Arguments:
+            kwargs (dict): a :class:`dict` containing any keyword arguments
+                to modify the behavior of the join hook at run time; all
+                :class:`Joinable` instances sharing the same join context
+                manager are forwarded the same value for ``kwargs``.
+        """
+        ...
+
+    @property
+    @abstractmethod
+    def join_device(self) -> torch.device:
+        r"""Return the device from which to perform collective communications needed by the join context manager."""
+        ...
+
+    @property
+    @abstractmethod
+    def join_process_group(self) -> Any:
+        r"""Returns the process group for the collective communications needed by the join context manager itself."""
+        ...
+
+
+class _JoinConfig(NamedTuple):
+    r"""This includes all fields needed from a :class:`Joinable` instance for the join context manager side."""
+
+    enable: bool
+    throw_on_early_termination: bool
+    is_first_joinable: bool
+
+    @staticmethod
+    def construct_disabled_join_config():
+        r"""Return a :class:`_JoinConfig` instance indicating that join-related logic should be disabled.
+
+        e.g. if the caller is not in a join context manager.
+        """
+        return _JoinConfig(
+            enable=False,
+            throw_on_early_termination=False,
+            is_first_joinable=False
+        )
+
+
+class Join:
+    r"""
+    This class defines the generic join context manager, which allows custom hooks to be called after a process joins.
+
+    These hooks should shadow the
+    collective communications of non-joined processes to prevent hanging and
+    erroring and to ensure algorithmic correctness. Refer to :class:`JoinHook`
+    for details about the hook definition.
+
+    .. warning::
+        The context manager requires each participating :class:`Joinable` to
+        call the method :meth:`notify_join_context()` before its own per-
+        iteration collective communications to ensure correctness.
+
+    .. warning::
+        The context manager requires that all ``process_group`` attributes in
+        the :class:`JoinHook` objects are the same. If there are multiple
+        :class:`JoinHook` objects, then the ``device`` of the first is used.
+        The process group and device information is used for checking for non-
+        joined processes and for notifying processes to throw an exception if
+        ``throw_on_early_termination`` is enabled, both of which use an all-
+        reduce.
+
+    Arguments:
+        joinables (List[Joinable]): a list of the participating
+            :class:`Joinable` s; their hooks are iterated over in the given
+            order.
+
+        enable (bool): a flag enabling uneven input detection; setting to
+            ``False`` disables the context manager's functionality and should
+            only be set when the user knows the inputs will not be uneven
+            (default: ``True``).
+
+        throw_on_early_termination (bool): a flag controlling whether to throw an
+            exception upon detecting uneven inputs (default: ``False``).
+
+    Example::
+
+        >>> import os
+        >>> import torch
+        >>> import torch.distributed as dist
+        >>> import torch.multiprocessing as mp
+        >>> # xdoctest: +SKIP
+        >>> import torch.nn.parallel.DistributedDataParallel as DDP
+        >>> import torch.distributed.optim.ZeroRedundancyOptimizer as ZeRO
+        >>> from torch.distributed.algorithms.join import Join
+        >>>
+        >>> # On each spawned worker
+        >>> def worker(rank):
+        >>>     dist.init_process_group("nccl", rank=rank, world_size=2)
+        >>>     model = DDP(torch.nn.Linear(1, 1).to(rank), device_ids=[rank])
+        >>>     optim = ZeRO(model.parameters(), torch.optim.Adam, lr=0.01)
+        >>>     # Rank 1 gets one more input than rank 0
+        >>>     inputs = [torch.tensor([1.]).to(rank) for _ in range(10 + rank)]
+        >>>     with Join([model, optim]):
+        >>>         for input in inputs:
+        >>>             loss = model(input).sum()
+        >>>             loss.backward()
+        >>>             optim.step()
+        >>>     # All ranks reach here without hanging/erroring
+    """
+
+    def __init__(
+        self,
+        joinables: List[Joinable],
+        enable: bool = True,
+        throw_on_early_termination: bool = False,
+        **kwargs,
+    ):
+        if len(joinables) == 0:
+            raise ValueError("The join context manager requires at least one joinable")
+        self._joinables = joinables
+        self._join_hooks = [joinable.join_hook(**kwargs) for joinable in self._joinables]
+        self._enable = enable
+        self._throw_on_early_termination = throw_on_early_termination
+        self._set_joinable_configs()
+        self._extract_dist_info()
+
+    def _set_joinable_configs(self) -> None:
+        r"""Set the :class:`_JoinConfig` of each participating :class:`Joinable`."""
+        assert len(self._joinables) > 0
+        is_first_joinable = True
+        for joinable in self._joinables:
+            joinable._join_config = _JoinConfig(
+                enable=self._enable,
+                throw_on_early_termination=self._throw_on_early_termination,
+                is_first_joinable=is_first_joinable
+            )
+            is_first_joinable = False
+
+    def _extract_dist_info(self) -> None:
+        r"""
+        Extract the process group and device information from the joinables.
+
+        If there are multiple joinables, then the context manager uses the
+        first specified device.
+
+        Preconditions:
+            ``self._joinables`` is not ``None`` and is non-empty.
+
+        Raises:
+            ValueError
+                If there are multiple conflicting ``process_group`` attributes
+                among the ``Joinable`` objects.
+        """
+        process_group = None
+        device = None
+        for joinable in self._joinables:
+            if process_group is None:
+                process_group = joinable.join_process_group
+            elif process_group != joinable.join_process_group:
+                raise ValueError("Using join context manager with multiple process groups")
+            if device is None:
+                device = joinable.join_device
+        self._process_group = process_group
+        self._rank = dist.get_rank(self._process_group)
+        self._device = device
+
+    def __enter__(self):
+        ...
+
+    def __exit__(
+        self,
+        type: Optional[Type[BaseException]],
+        value: Optional[BaseException],
+        traceback: Optional[TracebackType]
+    ):
+        r"""
+        Repeatedly runs the main hooks until all processes join; then, runs the post-hooks.
+
+        Raises:
+            RuntimeError
+                If ``throw_on_early_termination=True`` and uneven inputs are detected.
+        """
+        if not self._enable or type:
+            return  # propagate the exception directly if one was raised
+
+        all_procs_joined = False
+        is_last_joiner = True
+
+        i = 0
+        WARN_THRESHOLD = 1000
+        warnings.simplefilter("once")
+
+        while not all_procs_joined:
+            if i > WARN_THRESHOLD:
+                warnings.warn(
+                    "Detected uneven input skew of greater than "
+                    f"{WARN_THRESHOLD}. This means that rank "
+                    f"{self._rank} has at least {WARN_THRESHOLD} "
+                    f"fewer inputs than other currently-active ranks. "
+                    "This level of skew could lead to performance "
+                    "degradation during training."
+                )
+            # Shadow the all-reduce in non-joined processes
+            num_nonjoined_procs = self._get_num_nonjoined_procs()
+            if num_nonjoined_procs == 0:
+                all_procs_joined = True
+            else:
+                if self._throw_on_early_termination:
+                    self._notify_procs_to_terminate()
+
+                # Run main hooks
+                for join_hook in self._join_hooks:
+                    join_hook.main_hook()
+
+                is_last_joiner = False
+                i += 1
+
+        # Run post-hooks
+        for join_hook in self._join_hooks:
+            join_hook.post_hook(is_last_joiner)
+
+    def _get_num_nonjoined_procs(self):
+        r"""Return the number of non-joined processes by shadowing an all-reduce in the non-joined processes."""
+        num_nonjoined_procs = torch.zeros(1, device=self._device)
+        dist.all_reduce(num_nonjoined_procs, group=self._process_group)
+        return num_nonjoined_procs.item()
+
+    def _notify_procs_to_terminate(self):
+        r"""Schedule an all-reduce to notify non-joined processes to terminate.
+
+        Also raise a ``RuntimeError`` indicating that the current process has exhausted its inputs.
+        """
+        ones = torch.ones(1, device=self._device)
+        dist.all_reduce(ones, group=self._process_group)
+        raise RuntimeError(f"Rank {self._rank} exhausted all inputs.")
+
+    @staticmethod
+    def notify_join_context(joinable: Joinable):
+        r"""
+        Notifies the join context manager that the calling process has not yet joined.
+
+        Then, if ``throw_on_early_termination=True``, checks if uneven inputs have been detected
+        (i.e. if one process has already joined) and throws an exception if so.
+
+        This method should be called from a :class:`Joinable` object before
+        its per-iteration collective communications. For example, this should
+        be called at the beginning of the forward pass in
+        :class:`DistributedDataParallel`.
+
+        Only the first :class:`Joinable` object passed into the context
+        manager performs the collective communications in this method, and
+        for the others, this method is vacuous.
+
+        Arguments:
+            joinable (Joinable): the :class:`Joinable` object calling this
+                method.
+
+        Returns:
+            An async work handle for the all-reduce meant to notify the context
+            manager that the process has not yet joined if ``joinable`` is the
+            first one passed into the context manager; ``None`` otherwise.
+        """
+        assert hasattr(joinable, "_join_config"), \
+            f"Check that the {type(joinable)} constructor calls the " \
+            "``Joinable`` constructor"
+
+        join_config = joinable._join_config
+        # First joinable is responsible for the collective communications
+        if not join_config.is_first_joinable or not join_config.enable:
+            return None
+
+        device = joinable.join_device
+        process_group = joinable.join_process_group
+
+        # Schedule an all-reduce to indicate that the caller has not yet joined
+        ones = torch.ones(1, device=device)
+        work = dist.all_reduce(ones, group=process_group, async_op=True)
+
+        if join_config.throw_on_early_termination:
+            # Check if uneven inputs have been detected
+            zeros = torch.zeros(1, device=device)
+            dist.all_reduce(zeros, group=process_group)
+            should_throw = zeros.item()
+            if should_throw:
+                raise RuntimeError(
+                    "Detected at least one rank that exhausted inputs. "
+                    "Throwing across all ranks."
+                )
+        return work
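A brief usage sketch for this join context manager with uneven per-rank inputs; the model, the per-rank batch counts, and the torchrun-style launch are illustrative assumptions, not part of the file above.

```python
# Illustrative sketch: each rank iterates a different number of batches, and the
# Join context manager shadows the missing collectives so no rank hangs.
import torch
import torch.distributed as dist
from torch.distributed.algorithms.join import Join
from torch.nn.parallel import DistributedDataParallel as DDP


def train(rank: int) -> None:
    model = DDP(torch.nn.Linear(1, 1))                      # DDP is a Joinable
    inputs = [torch.rand(16, 1) for _ in range(5 + rank)]   # uneven inputs per rank
    with Join([model]):
        for x in inputs:
            model(x).sum().backward()


if __name__ == "__main__":
    # Assumes a torchrun-style launch that sets MASTER_ADDR/PORT, RANK, WORLD_SIZE.
    dist.init_process_group("gloo", init_method="env://")
    train(dist.get_rank())
```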
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/model_averaging/__init__.py b/MLPY/Lib/site-packages/torch/distributed/algorithms/model_averaging/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/model_averaging/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/algorithms/model_averaging/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..413b55e18a23afb658c8c77434db638495cd5b8e
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/algorithms/model_averaging/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/model_averaging/__pycache__/averagers.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/algorithms/model_averaging/__pycache__/averagers.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2eb37830a301cbac35cd3c3a5c84f26aa529068f
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/algorithms/model_averaging/__pycache__/averagers.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/model_averaging/__pycache__/hierarchical_model_averager.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/algorithms/model_averaging/__pycache__/hierarchical_model_averager.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ec641f28a37e86cfe44bae54118c16afbcf6fbf2
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/algorithms/model_averaging/__pycache__/hierarchical_model_averager.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/model_averaging/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/algorithms/model_averaging/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fa1a0e015b2f3054bb285f36b99f62b79129d683
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/algorithms/model_averaging/__pycache__/utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/model_averaging/averagers.py b/MLPY/Lib/site-packages/torch/distributed/algorithms/model_averaging/averagers.py
new file mode 100644
index 0000000000000000000000000000000000000000..1085d37563fd2b73efdcaea75e44136195c1e6f7
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/algorithms/model_averaging/averagers.py
@@ -0,0 +1,120 @@
+import warnings
+from abc import ABC, abstractmethod
+from typing import Union, Iterable, Dict
+import torch
+import torch.distributed as dist
+import torch.distributed.algorithms.model_averaging.utils as utils
+
+__all__ = ['ModelAverager', 'PeriodicModelAverager']
+
+class ModelAverager(ABC):
+    r"""Base class for all model averagers.
+
+    Args:
+        process_group: The process group to be used for all-reduce.
+                       If ``None``, the default process group, which
+                       is created by :func:`torch.distributed.init_process_group`,
+                       will be used. (default: ``None``)
+    """
+
+    def __init__(self, process_group=None):
+        self.process_group = (
+            process_group if process_group is not None else dist.group.WORLD
+        )
+        self.step = 0
+
+    @abstractmethod
+    def average_parameters(self, params):
+        raise NotImplementedError
+
+
+class PeriodicModelAverager(ModelAverager):
+    r"""
+    Averages parameters periodically after the warm-up stage.
+
+    This can be used for running `post-local SGD `_,
+    by running :class:`~torch.nn.DistributedDataParallel` (DDP)
+    using the subgroups created by :meth:`~torch.distributed.new_subgroups`.
+
+    Args:
+        period (int): The number of steps per model averaging.
+                      Usually the period should be greater than ``1`` to reduce the communication cost.
+                      Otherwise, plain DDP is sufficient.
+        warmup_steps (int): The number of warm-up steps. During this stage,
+                            model averaging is skipped.
+        process_group: The process group to be used for all-reduce.
+                       If ``None``, the default process group, which
+                       is created by :func:`torch.distributed.init_process_group`,
+                       will be used. (default: ``None``)
+
+    Example::
+
+        >>> # xdoctest: +SKIP("undefined variables")
+        >>> import torch
+        >>> import torch.distributed as dist
+        >>> import torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook as post_localSGD
+        >>> import torch.distributed.algorithms.model_averaging.averagers as averagers
+        >>> import torch.nn as nn
+        >>>
+        >>> dist.init_process_group("nccl", rank=rank, world_size=16)
+        >>> torch.cuda.set_device(rank)
+        >>> module = nn.Linear(1, 1, bias=False).cuda()
+        >>> model = nn.parallel.DistributedDataParallel(
+        >>>    module, device_ids=[rank], output_device=rank
+        >>> )
+        >>> # Register a post-localSGD communication hook.
+        >>> state = PostLocalSGDState(process_group=None, subgroup=None, start_localSGD_iter=100)
+        >>> model.register_comm_hook(state, post_localSGD_hook)
+        >>>
+        >>> # In the first 100 steps, run global gradient averaging like normal DDP at every step.
+        >>> # After 100 steps, run model averaging every 4 steps.
+        >>> # Note that ``warmup_steps`` must be the same as ``start_localSGD_iter`` used in ``PostLocalSGDState``.
+        >>> averager = averagers.PeriodicModelAverager(period=4, warmup_steps=100)
+        >>> for step in range(0, 200):
+        >>>    optimizer.zero_grad()
+        >>>    loss = loss_fn(output, labels)
+        >>>    loss.backward()
+        >>>    optimizer.step()
+        >>>    # Will average model parameters globally every 4 steps. Thus,
+        >>>    # inter-node communication only occurs every 4 iterations after
+        >>>    # the initial ``warmup_steps`` period.
+        >>>    averager.average_parameters(model.parameters())
+    """
+
+    def __init__(
+        self,
+        period,
+        warmup_steps=0,
+        process_group=None
+    ):
+        super().__init__(process_group)
+        if warmup_steps < 0:
+            raise ValueError("Arg ``warmup_steps`` must be a non-negative number.")
+        self.warmup_steps = warmup_steps
+        if period < 1:
+            raise ValueError("Arg ``period`` must be a positive value.")
+        elif period == 1:
+            warnings.warn(
+                "When period is 1, there is no need to use model averaging because the communication cost "
+                "of all-reducing parameters will be no less than the cost of all-reducing gradients "
+                "by DistributedDataParallel in the backward pass. Therefore, only "
+                "DistributedDataParallel should be used for this case."
+            )
+        self.period = period
+
+    def average_parameters(self, params: Union[Iterable[torch.nn.Parameter], Iterable[Dict[str, torch.nn.Parameter]]]):
+        """
+        Averages parameters or parameter groups of an optimizer.
+
+        Averaging only occurs if ``step`` is no less than ``warmup_steps`` and
+        ``step - warmup_steps`` is divisible by ``period``, where ``step`` is
+        increased by 1 at each iteration in the training loop.
+
+        Args:
+            params: The parameters of a model or parameter groups of an optimizer.
+
+        """
+        if (
+            self.step >= self.warmup_steps
+            and (self.step - self.warmup_steps) % self.period == 0
+        ):
+            utils.average_parameters_or_parameter_groups(params, self.process_group)
+        self.step += 1
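The gating condition in ``average_parameters`` above can be isolated into a short, dependency-free sketch; the helper name ``should_average`` is hypothetical and exists only for illustration.

```python
def should_average(step: int, warmup_steps: int, period: int) -> bool:
    # Mirrors the gating logic of PeriodicModelAverager.average_parameters:
    # skip the warm-up stage, then average every `period` steps afterwards.
    return step >= warmup_steps and (step - warmup_steps) % period == 0


# With warmup_steps=100 and period=4, averaging fires at steps 100, 104, 108, ...
assert [s for s in range(98, 110) if should_average(s, 100, 4)] == [100, 104, 108]
```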
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/model_averaging/hierarchical_model_averager.py b/MLPY/Lib/site-packages/torch/distributed/algorithms/model_averaging/hierarchical_model_averager.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6246f4c669dc75b011338ba4591b418fdb17fa8
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/algorithms/model_averaging/hierarchical_model_averager.py
@@ -0,0 +1,167 @@
+# Copyright 2022 Cruise LLC
+import logging
+import warnings
+from collections import OrderedDict
+from typing import Union, Iterable, Dict
+
+import torch
+import torch.distributed as dist
+import torch.distributed.algorithms.model_averaging.averagers as averagers
+import torch.distributed.algorithms.model_averaging.utils as utils
+
+logger = logging.getLogger(__name__)
+
+
+class HierarchicalModelAverager(averagers.ModelAverager):
+    r"""
+    Runs hierarchical model averaging (`hierarchical SGD `_).
+
+    Process groups of different sizes are organized in a hierarchy, and they average parameters
+    by using different periods concurrently after the warm-up stage.
+    This is an extension of :class:`~torch.distributed.algorithms.model_averaging.averagers.PeriodicModelAverager`
+    that supports `post-local SGD `_, which essentially only supports
+    a two-level hierarchy: the intra-machine level and the global level, where the intra-machine
+    level is usually embedded in :meth:`~torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook`.
+    Similarly, the process groups within this class do not have such an intra-machine process
+    subgroup, which should be embedded by the post-local SGD communication hook instead.
+
+    Args:
+        period_group_size_dict: An ordered dict mapping keys of model averaging period to
+                                process group size, used for initializing process groups of
+                                different sizes in a hierarchy to average parameters concurrently.
+                                In particular, at each iteration there will be at most a single
+                                process group that runs averaging -- namely, the group whose period
+                                is the largest period that divides the current step.
+                                For example, if the dict has three keys: 2, 4, and 8,
+                                then this means totally three process groups will be created to
+                                average parameters every 2, 4, and 8 iterations, respectively.
+                                At the 4th iteration, only the second process group will run
+                                averaging, because the first process group should be a
+                                subset of the second process group, so there is no need to run
+                                the first process group redundantly.
+                                On the other hand, the third process group can only be triggered
+                                every 8 iterations, so it will not be triggered at the 4th iteration.
+        warmup_steps (int): The number of warm-up steps. During this stage, model averaging is skipped.
+        process_group (ProcessGroup, optional): The overall process group containing all the processes that run model averaging.
+                                                If ``None``, the default process group, which is created
+                                                by :func:`torch.distributed.init_process_group`, will be used.
+                                                (default: ``None``)
+
+    Example::
+        >>> # xdoctest: +SKIP('undefined rank')
+        >>> from collections import OrderedDict
+        >>> import torch
+        >>> import torch.distributed as dist
+        >>> from torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook import (
+        >>>     PostLocalSGDState,
+        >>>     post_localSGD_hook,
+        >>> )
+        >>> import torch.distributed.algorithms.model_averaging.hierarchical_model_averager as hierarchicalSGD
+        >>> import torch.nn as nn
+        >>>
+        >>> dist.init_process_group("nccl", rank=rank, world_size=16)
+        >>> torch.cuda.set_device(rank)
+        >>> module = nn.Linear(1, 1, bias=False).to(rank)
+        >>> model = nn.parallel.DistributedDataParallel(
+        >>>    module, device_ids=[rank], output_device=rank
+        >>> )
+        >>> # Register a post-localSGD communication hook.
+        >>> # Assume that each machine has 4 GPUs, then each intra-machine subgroup has a size of 4.
+        >>> subgroup, _ = dist.new_subgroups()
+        >>> state = PostLocalSGDState(process_group=None, subgroup=subgroup, start_localSGD_iter=100)
+        >>> model.register_comm_hook(state, post_localSGD_hook)
+        >>>
+        >>> # Average parameters among each group of 8 processes every 4 iterations, and among all
+        >>> # the 16 processes every 16 iterations.
+        >>> averager = hierarchicalSGD.HierarchicalModelAverager(
+        >>>     period_group_size_dict=OrderedDict([(4, 8), (16, 16)]), warmup_steps=100)
+        >>> # Note that ``warmup_steps`` must be the same as ``start_localSGD_iter`` used in ``PostLocalSGDState``.
+        >>> # In the first 100 steps, run global gradient averaging like normal DDP at every step.
+        >>> # After 100 steps, run model averaging at two levels.
+        >>> for step in range(0, 200):
+        >>>    optimizer.zero_grad()
+        >>>    loss = loss_fn(output, labels)
+        >>>    loss.backward()
+        >>>    optimizer.step()
+        >>>    # Average parameters after ``optimizer.step()``.
+        >>>    # Thus, the inter-node communication only occurs periodically after ``warmup_steps``.
+        >>>    averager.average_parameters(model.parameters())
+
+    .. warning ::
+        The last group size in the dict must be the size of the provided ``process_group``,
+        which indicates model averaging at the highest level of the hierarchy.
+        If ``process_group`` is not provided, then the last group size should be equal to the world size.
+
+    .. warning ::
+        `HierarchicalModelAverager` is experimental and subject to change.
+    """
+
+    def __init__(self, period_group_size_dict=None, warmup_steps=0, process_group=None):
+        super().__init__(process_group)
+        if not period_group_size_dict:
+            raise ValueError("Arg ``period_group_size_dict`` must not be empty.")
+        self._periods = list(period_group_size_dict.keys())
+        if self._periods[0] <= 0:
+            raise ValueError("The minimum period in arg ``period_group_size_dict`` must be a positive value.")
+        elif self._periods[-1] == 1:
+            warnings.warn(
+                "When the maximum period in arg ``period_group_size_dict`` is 1, "
+                "there is no need to use model averaging because the communication cost "
+                "of all-reducing parameters will be no less than the cost of all-reducing gradients "
+                "by DistributedDataParallel in the backward pass. Therefore, only "
+                "DistributedDataParallel should be used for this case."
+            )
+        overall_group_size = dist.get_world_size(group=self.process_group)
+        if list(period_group_size_dict.values())[-1] != overall_group_size:
+            raise ValueError(
+                f"The last value in arg ``period_group_size_dict`` {list(period_group_size_dict.values())[-1]} "
+                f"must be equal to the size of arg ``process_group`` {overall_group_size}."
+            )
+
+        self.period_process_group_dict = OrderedDict()
+        logger.info("Model averaging hierarchy:")
+        for period, group_size in period_group_size_dict.items():
+            logger.info(
+                "\tEach group of %s processes averages parameters every %s iterations, "
+                "unless a higher-level group also runs averaging at that step.", group_size, period)
+            if group_size != overall_group_size:
+                self.period_process_group_dict[period], _ = dist.new_subgroups(
+                    group_size=group_size, group=self.process_group)
+            else:
+                self.period_process_group_dict[period] = self.process_group
+
+        if warmup_steps < 0:
+            raise ValueError("Arg ``warmup_steps`` must be a non-negative number.")
+        self.warmup_steps = warmup_steps
+
+    def _find_process_group(self):
+        """
+        Return the process group from ``period_process_group_dict`` to use at the current ``step``.
+
+        If ``step`` is divisible by multiple periods in the keys of ``period_process_group_dict``,
+        then the returned process group corresponds to the largest such period, since that group
+        is used for averaging parameters at this ``step``.
+        Returns ``None`` if no period divides ``step``.
+        """
+        for period in reversed(self._periods):
+            if self.step % period == 0:
+                return self.period_process_group_dict[period]
+        return None
+
+    def average_parameters(self, params: Union[Iterable[torch.nn.Parameter], Iterable[Dict[str, torch.nn.Parameter]]]):
+        """
+        Averages parameters or parameter groups of an optimizer.
+
+        Averaging only occurs if ``step`` is no less than ``warmup_steps`` and ``step`` is
+        divisible by a period in the keys of ``period_process_group_dict``, where ``step``
+        is increased by 1 at each iteration in the training loop.
+        If ``step`` is divisible by multiple periods in the keys of ``period_process_group_dict``,
+        only the largest such period is used, and its corresponding process group performs the averaging.
+
+        Args:
+            params: The parameters of a model or parameter groups of an optimizer.
+        """
+        if self.step >= self.warmup_steps:
+            group = self._find_process_group()
+            if group is not None:
+                utils.average_parameters_or_parameter_groups(params, group)
+        self.step += 1
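A stand-alone sketch of the period-selection rule that ``_find_process_group`` implements: only the largest period dividing the current step fires. The group labels below are placeholders, not real process groups.

```python
from collections import OrderedDict

# Periods map to (hypothetical) group labels instead of real process groups.
period_to_group = OrderedDict([(2, "size-4 subgroup"), (4, "size-8 subgroup"), (8, "world")])


def find_group(step: int):
    # Same rule as HierarchicalModelAverager._find_process_group: scan periods
    # from largest to smallest and return the first one that divides `step`.
    for period in reversed(list(period_to_group)):
        if step % period == 0:
            return period_to_group[period]
    return None


assert find_group(4) == "size-8 subgroup"   # 4 divides 4, but 8 does not
assert find_group(8) == "world"             # the largest period wins
assert find_group(3) is None                # no period divides 3
```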
diff --git a/MLPY/Lib/site-packages/torch/distributed/algorithms/model_averaging/utils.py b/MLPY/Lib/site-packages/torch/distributed/algorithms/model_averaging/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..34444b3478e253a50ac71151c44d334f8aefb890
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/algorithms/model_averaging/utils.py
@@ -0,0 +1,72 @@
+# flake8: noqa C101
+import itertools
+from typing import Union, Iterable, Dict, Iterator
+
+import torch
+import torch.distributed as dist
+# The two imports below are not always available depending on the
+# USE_DISTRIBUTED compile flag. Make sure they raise import error
+# if we're trying to use them.
+from torch.distributed import ProcessGroup, group
+
+__all__ = ["average_parameters", "get_params_to_average", "average_parameters_or_parameter_groups"]
+
+def average_parameters(
+    params: Iterator[torch.nn.Parameter], process_group: ProcessGroup
+):
+    """
+    Averages all the given parameters.
+
+    For allreduce efficiency, all the parameters are flattened into a contiguous buffer.
+    Thus, it requires extra memory of the same size as the given parameters.
+    """
+    group_to_use = process_group if process_group is not None else group.WORLD
+    # Do not update any parameter if not in the process group.
+    if dist._rank_not_in_group(group_to_use):
+        return
+
+    params_it1, params_it2 = itertools.tee(params)
+    # If the input parameters have different data types,
+    # packing these parameters will trigger an implicit type up-casting.
+    # The original parameter data types will be restored during the subsequent unpacking.
+    flat_params = torch.cat([p.data.reshape(-1) for p in params_it1])
+    flat_params /= dist.get_world_size(group_to_use)
+    # Make sure the allreduce will not conflict with any other ongoing process group.
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+    dist.all_reduce(flat_params, group=group_to_use)
+
+    offset = 0
+    for p in params_it2:
+        p.data = flat_params[offset : offset + p.numel()].view_as(p).type_as(p)
+        offset += p.numel()
+
+
+def get_params_to_average(params: Union[Iterable[torch.nn.Parameter], Iterable[Dict[str, torch.nn.Parameter]]]):
+    """
+    Return a list of parameters that need to be averaged.
+
+    Parameters that do not have any gradients are filtered out.
+
+    Args:
+        params: The parameters of a model or parameter groups of an optimizer.
+    """
+    filtered_params = []
+    for param in params:
+        if isinstance(param, torch.nn.Parameter):
+            # model.parameters() input
+            param_data = param
+            if param_data.grad is not None:
+                filtered_params.append(param_data)
+        elif isinstance(param, dict):
+            # optimizer.param_groups input
+            for param_data in param["params"]:
+                if param_data.grad is not None:
+                    filtered_params.append(param_data)
+        else:
+            raise NotImplementedError(f"Parameter input of type {type(param)} is not supported")
+    return filtered_params
+
+
+def average_parameters_or_parameter_groups(params: Union[Iterable[torch.nn.Parameter], Iterable[Dict[str, torch.nn.Parameter]]], process_group: ProcessGroup):
+    """Averages parameters of a model or parameter groups of an optimizer."""
+    average_parameters(iter(get_params_to_average(params)), process_group)
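The flatten → all-reduce → scatter-back pattern of ``average_parameters`` can be mimicked locally without a process group. In this sketch, summing pre-divided flat copies stands in for ``dist.all_reduce``; the function name ``average_locally`` is made up for illustration.

```python
import torch


def average_locally(param_sets, world_size):
    # Stand-in for the collective: each "rank" contributes a flattened copy divided
    # by world_size, we sum them, then unpack back into parameter-shaped views.
    flats = [torch.cat([p.reshape(-1) for p in params]) / world_size for params in param_sets]
    averaged = torch.stack(flats).sum(dim=0)
    out, offset = [], 0
    for p in param_sets[0]:
        out.append(averaged[offset : offset + p.numel()].view_as(p))
        offset += p.numel()
    return out


a = [torch.zeros(2, 2), torch.zeros(3)]
b = [torch.ones(2, 2), torch.ones(3)]
avg = average_locally([a, b], world_size=2)
assert torch.allclose(avg[0], torch.full((2, 2), 0.5))
```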
diff --git a/MLPY/Lib/site-packages/torch/distributed/argparse_util.py b/MLPY/Lib/site-packages/torch/distributed/argparse_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..668ebc8c1f89d2952652a68d433d7ef9e019fd9a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/argparse_util.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import os
+from argparse import Action
+
+
+class env(Action):
+    """
+    Get argument values from ``PET_{dest}`` before defaulting to the given ``default`` value.
+
+    For flags (e.g. ``--standalone``)
+    use ``check_env`` instead.
+
+    .. note:: when multiple option strings are specified, ``dest`` is
+              the longest option string (e.g. for ``"-f", "--foo"``
+              the env var to set is ``PET_FOO`` not ``PET_F``)
+
+    Example:
+    ::
+
+     parser.add_argument("-f", "--foo", action=env, default="bar")
+
+     ./program                                      -> args.foo="bar"
+     ./program -f baz                               -> args.foo="baz"
+     ./program --foo baz                            -> args.foo="baz"
+     PET_FOO="env_bar" ./program -f baz    -> args.foo="baz"
+     PET_FOO="env_bar" ./program --foo baz -> args.foo="baz"
+     PET_FOO="env_bar" ./program           -> args.foo="env_bar"
+
+     parser.add_argument("-f", "--foo", action=env, required=True)
+
+     ./program                                      -> fails
+     ./program -f baz                               -> args.foo="baz"
+     PET_FOO="env_bar" ./program           -> args.foo="env_bar"
+     PET_FOO="env_bar" ./program -f baz    -> args.foo="baz"
+    """
+
+    def __init__(self, dest, default=None, required=False, **kwargs) -> None:
+        env_name = f"PET_{dest.upper()}"
+        default = os.environ.get(env_name, default)
+
+        # ``required`` means that the option NEEDS to be present in the command-line args,
+        # rather than "this option requires a value (either set explicitly or by default)".
+        # So if a default was found (from the env var or the caller), we no longer
+        # "require" it on the command line and set ``required`` to False.
+        if default:
+            required = False
+
+        super().__init__(dest=dest, default=default, required=required, **kwargs)
+
+    def __call__(self, parser, namespace, values, option_string=None):
+        setattr(namespace, self.dest, values)
+
+
+class check_env(Action):
+    """
+    Check whether the env var ``PET_{dest}`` exists before defaulting to the given ``default`` value.
+
+    Equivalent to
+    ``store_true`` argparse built-in action except that the argument can
+    be omitted from the commandline if the env var is present and has a
+    non-zero value.
+
+    .. note:: it is redundant to pass ``default=True`` for arguments
+              that use this action because a flag should be ``True``
+              when present and ``False`` otherwise.
+
+    Example:
+    ::
+
+     parser.add_argument("--verbose", action=check_env)
+
+     ./program                                  -> args.verbose=False
+     ./program --verbose                        -> args.verbose=True
+     PET_VERBOSE=1 ./program           -> args.verbose=True
+     PET_VERBOSE=0 ./program           -> args.verbose=False
+     PET_VERBOSE=0 ./program --verbose -> args.verbose=True
+
+    Anti-pattern (don't do this):
+
+    ::
+
+     parser.add_argument("--verbose", action=check_env, default=True)
+
+     ./program                                  -> args.verbose=True
+     ./program --verbose                        -> args.verbose=True
+     PET_VERBOSE=1 ./program           -> args.verbose=True
+     PET_VERBOSE=0 ./program           -> args.verbose=False
+
+    """
+
+    def __init__(self, dest, default=False, **kwargs) -> None:
+        env_name = f"PET_{dest.upper()}"
+        default = bool(int(os.environ.get(env_name, "1" if default else "0")))
+        super().__init__(dest=dest, const=True, default=default, nargs=0, **kwargs)
+
+    def __call__(self, parser, namespace, values, option_string=None):
+        setattr(namespace, self.dest, self.const)
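A short usage sketch for the two actions above; the option names and values are arbitrary, and the behavior follows the docstring examples.

```python
from argparse import ArgumentParser

from torch.distributed.argparse_util import check_env, env

parser = ArgumentParser()
parser.add_argument("--node_rank", action=env, type=int, default=0)
parser.add_argument("--standalone", action=check_env)

# Note: the PET_* environment variables are read when add_argument() runs,
# so they must already be set for them to influence the defaults.
args = parser.parse_args([])                                   # falls back to defaults / PET_* env vars
args = parser.parse_args(["--node_rank", "3", "--standalone"])  # explicit flags win over env vars
assert args.node_rank == 3 and args.standalone is True
```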
diff --git a/MLPY/Lib/site-packages/torch/distributed/autograd/__init__.py b/MLPY/Lib/site-packages/torch/distributed/autograd/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..85f017e96de63905fe36edba3045269a853805b4
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/autograd/__init__.py
@@ -0,0 +1,52 @@
+
+import sys
+import torch
+
+
+def is_available():
+    return hasattr(torch._C, "_dist_autograd_init")
+
+
+if is_available() and not torch._C._dist_autograd_init():
+    raise RuntimeError("Failed to initialize torch.distributed.autograd")
+
+if is_available():
+    from torch._C._distributed_autograd import (
+        get_gradients,
+        backward,
+        _init,
+        _new_context,
+        _release_context,
+        _get_max_id,
+        _is_valid_context,
+        _retrieve_context,
+        _current_context,
+        _get_debug_info,
+        DistAutogradContext,
+    )
+
+
+class context:
+    '''
+    Context object to wrap forward and backward passes when using
+    distributed autograd. The ``context_id`` generated in the ``with``
+    statement is required to uniquely identify a distributed backward pass
+    on all workers. Each worker stores metadata associated with this
+    ``context_id``, which is required to correctly execute a distributed
+    autograd pass.
+
+    Example::
+        >>> # xdoctest: +SKIP
+        >>> import torch.distributed.autograd as dist_autograd
+        >>> with dist_autograd.context() as context_id:
+        >>>     t1 = torch.rand((3, 3), requires_grad=True)
+        >>>     t2 = torch.rand((3, 3), requires_grad=True)
+        >>>     loss = rpc.rpc_sync("worker1", torch.add, args=(t1, t2)).sum()
+        >>>     dist_autograd.backward(context_id, [loss])
+    '''
+    def __enter__(self):
+        self.autograd_context = _new_context()
+        return self.autograd_context._context_id()
+
+    def __exit__(self, type, value, traceback):
+        _release_context(self.autograd_context._context_id())
diff --git a/MLPY/Lib/site-packages/torch/distributed/autograd/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/autograd/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..64dfdaaa4fa7ae0dfea7199a2408efb524a8fa6c
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/autograd/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/c10d_logger.py b/MLPY/Lib/site-packages/torch/distributed/c10d_logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b4e312a7de9cb4eeb696718f6d71392a4d6b8c1
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/c10d_logger.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import functools
+import logging
+import time
+from typing import Any, Callable, Dict, List, Tuple, TypeVar
+from typing_extensions import ParamSpec
+
+import torch
+import torch.distributed as dist
+
+from torch.distributed.logging_handlers import _log_handlers
+
+__all__: List[str] = []
+
+
+def _get_or_create_logger() -> logging.Logger:
+    logging_handler, log_handler_name = _get_logging_handler()
+    logger = logging.getLogger(f"c10d-{log_handler_name}")
+    logger.setLevel(logging.DEBUG)
+    formatter = logging.Formatter(
+        "%(asctime)s %(filename)s:%(lineno)s %(levelname)s p:%(processName)s t:%(threadName)s: %(message)s"
+    )
+    logging_handler.setFormatter(formatter)
+    logger.propagate = False
+    logger.addHandler(logging_handler)
+    return logger
+
+
+def _get_logging_handler(destination: str = "default") -> Tuple[logging.Handler, str]:
+    log_handler = _log_handlers[destination]
+    log_handler_name = type(log_handler).__name__
+    return (log_handler, log_handler_name)
+
+
+global _c10d_logger
+_c10d_logger = _get_or_create_logger()
+
+
+def _get_msg_dict(func_name, *args, **kwargs) -> Dict[str, Any]:
+    if dist.is_initialized():
+        msg_dict = {
+            "func_name": f"{func_name}",
+            "args": f"{args}, {kwargs}",
+            "pg_name": f"{dist._get_process_group_name(kwargs.get('pg'))}",  # type: ignore[arg-type]
+            "backend": f"{dist.get_backend(kwargs.get('group'))}",
+            "world_size": f"{dist.get_world_size()}",
+            "group_size": f"{dist.get_world_size(kwargs.get('group'))}",
+            "global_rank": f"{dist.get_rank()}",
+            "local_rank": f"{dist.get_rank(kwargs.get('group'))}",
+        }
+        if msg_dict["backend"] == "nccl":
+            nccl_version = torch.cuda.nccl.version()
+            msg_dict["nccl_version"] = ".".join(str(v) for v in nccl_version)
+    else:
+        msg_dict = {
+            "func_name": f"{func_name}",
+            "args": f"{args}, {kwargs}",
+        }
+    return msg_dict
+
+_T = TypeVar('_T')
+_P = ParamSpec('_P')
+
+def _exception_logger(func: Callable[_P, _T]) -> Callable[_P, _T]:
+    @functools.wraps(func)
+    def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _T:
+        try:
+            return func(*args, **kwargs)
+        except Exception as error:
+            msg_dict = _get_msg_dict(func.__name__, *args, **kwargs)
+            msg_dict["error"] = f"{error}"
+            _c10d_logger.debug(msg_dict)
+            raise
+
+    return wrapper
+
+
+def _time_logger(func: Callable[_P, _T]) -> Callable[_P, _T]:
+    @functools.wraps(func)
+    def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _T:
+        t1 = time.time_ns()
+        func_return = func(*args, **kwargs)
+        time_spent = time.time_ns() - t1
+
+        msg_dict = _get_msg_dict(func.__name__, *args, **kwargs)
+        msg_dict["time_spent"] = f"{time_spent}ns"
+        _c10d_logger.debug(msg_dict)
+
+        return func_return
+
+    return wrapper
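A hedged sketch of wrapping an ordinary function with the timing decorator above. These helpers are private (underscore-prefixed), so this is illustrative only; whether anything is actually emitted depends on the handler registered in ``torch.distributed.logging_handlers``.

```python
from torch.distributed.c10d_logger import _time_logger


@_time_logger
def busy_work(n: int) -> int:
    # An arbitrary function; the wrapper times it and logs func_name, args,
    # and "time_spent" through the c10d logger at DEBUG level.
    return sum(i * i for i in range(n))


busy_work(10_000)
```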
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/__init__.py b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..007a207618e469e90e30bde67d8e530855decf27
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__init__.py
@@ -0,0 +1,15 @@
+from .api import CheckpointException
+from .default_planner import DefaultLoadPlanner, DefaultSavePlanner
+from .filesystem import FileSystemReader, FileSystemWriter
+from .fsspec import FsspecReader, FsspecWriter
+from .metadata import (
+    BytesStorageMetadata,
+    ChunkStorageMetadata,
+    Metadata,
+    TensorStorageMetadata,
+)
+from .optimizer import load_sharded_optimizer_state_dict
+from .planner import LoadPlan, LoadPlanner, ReadItem, SavePlan, SavePlanner, WriteItem
+from .state_dict_loader import load, load_state_dict
+from .state_dict_saver import async_save, save, save_state_dict
+from .storage import StorageReader, StorageWriter
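A minimal save/load sketch using the entry points re-exported here. It assumes a single-rank ``gloo`` process group and a writable ``/tmp/ckpt`` directory; the names are taken from the imports above (``save``, ``load``, ``FileSystemWriter``, ``FileSystemReader``), but treat this as an illustrative sketch rather than canonical usage.

```python
import os

import torch
import torch.distributed as dist
import torch.distributed.checkpoint as dcp

# Single-process group purely for illustration.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("gloo", rank=0, world_size=1)

state_dict = {"weight": torch.rand(4, 4)}
dcp.save(state_dict, storage_writer=dcp.FileSystemWriter("/tmp/ckpt"))

# load() restores in place into a pre-allocated state_dict.
restored = {"weight": torch.empty(4, 4)}
dcp.load(restored, storage_reader=dcp.FileSystemReader("/tmp/ckpt"))
```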
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..661dd428dfdcea39f30d89f9eccd05e86f21ae31
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/_dedup_save_plans.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/_dedup_save_plans.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2a962f272b00714088e3b9a5b62944aaec92fe2a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/_dedup_save_plans.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/_dedup_tensors.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/_dedup_tensors.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c37ce3e477abe7dee3d85398e3d59331473dd947
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/_dedup_tensors.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/_fsspec_filesystem.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/_fsspec_filesystem.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b7ddb9a6928414779a4067081ebd58ddb96ba948
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/_fsspec_filesystem.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/_nested_dict.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/_nested_dict.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..72f5999482663ffbd30fe00232ccc0610e0d936c
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/_nested_dict.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/_sharded_tensor_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/_sharded_tensor_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9b1e001c54e9548c2bfd4a1e0ef049c8f54581d9
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/_sharded_tensor_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/_storage_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/_storage_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b523088c2348af8378d8be453745d20a44c35df7
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/_storage_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/_traverse.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/_traverse.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..92f01881408b1408c3cdd559ec1e13c9aaa2b772
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/_traverse.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/api.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/api.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0397a4d7cff9af5cc46d60e62421b049d6a5dd80
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/api.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/default_planner.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/default_planner.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2df7b45d6d8b2ab383fffcc662348c3ea58e0870
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/default_planner.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/filesystem.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/filesystem.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a0eb51f9af8213dfbbb163c3fee2eb13333a735f
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/filesystem.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/format_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/format_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d2e7c7a28459aab2a6eabf3c39e8a7ab2d97ecd9
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/format_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/fsspec.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/fsspec.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..027b9716b408f951788bea65c3ffa855e0a95fbf
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/fsspec.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/metadata.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/metadata.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3d36f2e08568645a02e48c37941c8ad3df863ffd
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/metadata.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/optimizer.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/optimizer.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..da8606b2455412855713415d44690cdaf240c51f
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/optimizer.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/planner.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/planner.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..31c58dd3e1a4224129a4144f40b29829c6adaa82
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/planner.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/planner_helpers.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/planner_helpers.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c04ab860496f72c7f20a981d483ceb82f91ddbbc
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/planner_helpers.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/resharding.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/resharding.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aca818516da5ee71258a77c4b251b34b79509f02
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/resharding.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/state_dict.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/state_dict.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7ccac23b666a7d4e89bbb06fa6f1869383414ffd
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/state_dict.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/state_dict_loader.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/state_dict_loader.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9ddbf726b933d44bc9993f0872c1041cef94877f
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/state_dict_loader.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/state_dict_saver.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/state_dict_saver.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1c7555b08a1238f073d003aefa47239add11ba0b
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/state_dict_saver.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/stateful.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/stateful.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aa89a3972e2bc24112c078dca22e8c968baffdb1
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/stateful.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/storage.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/storage.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d8b448ad2bdd7f3973ef4d411384ecd31f762fe0
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/storage.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7e5ee7b56e2c1d5e6134013fb5819b5d849548d2
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/checkpoint/__pycache__/utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/_dedup_save_plans.py b/MLPY/Lib/site-packages/torch/distributed/checkpoint/_dedup_save_plans.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4a7ddce6de0aad680c775db1a6d88ad3849aa13
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/checkpoint/_dedup_save_plans.py
@@ -0,0 +1,49 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+import dataclasses
+from collections import defaultdict
+from typing import Dict, List, Set
+
+from torch.distributed.checkpoint.metadata import MetadataIndex
+from torch.distributed.checkpoint.planner import SavePlan, WriteItem
+
+__all__ = ["dedup_save_plans"]
+
+
+def dedup_save_plans(all_plans: List[SavePlan]) -> List[SavePlan]:
+    """
+    Remove duplicate entries that appear in multiple SavePlans. For each duplicate across
+    a set of SavePlans, only the SavePlan with the smallest planned storage keeps the entry.
+    """
+
+    write_item_to_plan_indices: Dict[MetadataIndex, Set[int]] = defaultdict(set)
+    write_item_idx_to_write_item: Dict[MetadataIndex, WriteItem] = {}
+    for plan_idx, plan in enumerate(all_plans):
+        for write_item in plan.items:
+            # map each write item to its plan
+            write_item_to_plan_indices[write_item.index].add(plan_idx)
+            write_item_idx_to_write_item[write_item.index] = write_item
+
+    # put item in the plan with the smallest size and remove it from the other plan_indices
+    to_remove: List[Set] = [set() for _ in range(len(all_plans))]
+    plan_to_size = [0] * len(all_plans)
+    for write_item_idx, plan_indices in write_item_to_plan_indices.items():
+        select_plan_idx = min(plan_indices, key=lambda plan_idx: plan_to_size[plan_idx])
+
+        write_item = write_item_idx_to_write_item[write_item_idx]
+        # essentially ignores the storage size of anything that is not a tensor, since
+        # we don't know how much storage they represent
+        plan_to_size[select_plan_idx] += write_item.tensor_storage_size() or 1
+
+        plan_indices.remove(select_plan_idx)
+        for plan_idx in plan_indices:
+            to_remove[plan_idx].add(write_item_idx)
+
+    for plan_idx, remove_set in enumerate(to_remove):
+        new_items = [
+            write_item
+            for write_item in all_plans[plan_idx].items
+            if write_item.index not in remove_set
+        ]
+        all_plans[plan_idx] = dataclasses.replace(all_plans[plan_idx], items=new_items)
+
+    return all_plans
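The assignment rule above (each duplicated item stays only in the plan with the smallest accumulated storage) can be demonstrated with plain dictionaries; the item names and sizes below are made up.

```python
from collections import defaultdict

# plan index -> {item key: size in bytes}; key "b" is duplicated across plans 0 and 1.
plans = [{"a": 100, "b": 10}, {"b": 10, "c": 1}]

owner = {}                    # item key -> chosen plan index
plan_size = defaultdict(int)  # accumulated storage per plan
for key in sorted({k for plan in plans for k in plan}):
    candidates = [i for i, plan in enumerate(plans) if key in plan]
    pick = min(candidates, key=lambda i: plan_size[i])  # smallest plan so far wins
    owner[key] = pick
    plan_size[pick] += plans[pick][key]

deduped = [{k: v for k, v in plan.items() if owner[k] == i} for i, plan in enumerate(plans)]
assert deduped == [{"a": 100}, {"b": 10, "c": 1}]
```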
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/_dedup_tensors.py b/MLPY/Lib/site-packages/torch/distributed/checkpoint/_dedup_tensors.py
new file mode 100644
index 0000000000000000000000000000000000000000..1eea84f6333bc93cc84b863d85f139bdb2b34fb2
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/checkpoint/_dedup_tensors.py
@@ -0,0 +1,59 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+import dataclasses
+import logging
+from typing import Dict, List
+
+from torch.distributed.checkpoint.metadata import MetadataIndex
+from torch.distributed.checkpoint.planner import SavePlan
+
+__all__ = ["dedup_tensors"]
+
+
+def init_logger() -> logging.Logger:
+    logger = logging.getLogger(__name__)
+    level = logging.INFO
+    logger.setLevel(level)
+    console = logging.StreamHandler()
+    formatter = logging.Formatter(
+        "%(asctime)s %(filename)s:%(lineno)s %(levelname)s p:%(processName)s t:%(threadName)s: %(message)s"
+    )
+    console.setFormatter(formatter)
+    console.setLevel(level)
+    logger.addHandler(console)
+    logger.propagate = False
+    return logger
+
+
+logger = init_logger()
+
+
+# TODO add docstring for dedup_tensors
+def dedup_tensors(all_plans: List[SavePlan]) -> List[SavePlan]:
+    all_plans = list(all_plans)
+    key_to_plan: Dict[MetadataIndex, List[int]] = {}
+    for plan_idx, plan in enumerate(all_plans):
+        for write_item in plan.items:
+            key_to_plan.setdefault(write_item.index, []).append(plan_idx)
+
+    replicated_items = {k: v for k, v in key_to_plan.items() if len(v) > 1}
+
+    # Remove duplicates by always keeping the first entry.
+    # Compute the per-rank remove set.
+    plan_to_keys: Dict[int, List[MetadataIndex]] = {}
+    for key, plans in replicated_items.items():
+        for plan_idx in plans[1:]:
+            plan_to_keys.setdefault(plan_idx, []).append(key)
+    if len(plan_to_keys) > 0:
+        logger.info("Duplicate keys to remove: %s", plan_to_keys)
+
+    for plan_idx, keys in plan_to_keys.items():
+        key_set = set(keys)
+        # rewrite items and remove elements
+        new_items = [
+            write_item
+            for write_item in all_plans[plan_idx].items
+            if write_item.index not in key_set
+        ]
+        all_plans[plan_idx] = dataclasses.replace(all_plans[plan_idx], items=new_items)
+
+    return all_plans
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/_fsspec_filesystem.py b/MLPY/Lib/site-packages/torch/distributed/checkpoint/_fsspec_filesystem.py
new file mode 100644
index 0000000000000000000000000000000000000000..8087bd3e0bad21fcd4b80206bf1e8b5e4d029445
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/checkpoint/_fsspec_filesystem.py
@@ -0,0 +1,15 @@
+# Mypy will not try inferring the types of any 3rd party libraries installed.
+# mypy: ignore-errors
+
+import logging
+
+from torch.distributed.checkpoint.fsspec import (  # noqa: F401
+    FsspecReader,
+    FsspecWriter,
+)
+
+log = logging.getLogger(__name__)
+log.warning(
+    "FSSpec Filesystem has been made public, please update your "
+    "import to torch.distributed.checkpoint"
+)
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/_nested_dict.py b/MLPY/Lib/site-packages/torch/distributed/checkpoint/_nested_dict.py
new file mode 100644
index 0000000000000000000000000000000000000000..7eb2bb77cf4db45a26e51613f61fd2dff2b24f10
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/checkpoint/_nested_dict.py
@@ -0,0 +1,53 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+from typing import Dict, Tuple
+
+from torch.distributed.checkpoint.metadata import STATE_DICT_TYPE
+
+from ._traverse import OBJ_PATH, set_element, STATE_DICT_ITEM, traverse_state_dict
+
+"""
+TODO:
+Need to add ability to handle tuple, OrderedDict, NamedTuple.
+Update mappings from dict to a class.
+Change set_element to recreate the right type for tuple, OrderedDict, and NamedTuple.
+"""
+
+
+FLATTEN_MAPPING = Dict[str, OBJ_PATH]
+
+
+# TODO: Update Docstring for nested_dict.py
+def flatten_state_dict(
+    state_dict: STATE_DICT_TYPE,
+) -> Tuple[STATE_DICT_TYPE, FLATTEN_MAPPING]:
+    """
+    Flatten ``state_dict`` made of nested dicts and lists into a top level dictionary.
+
+    Use ``unflatten_state_dict`` to revert this process.
+    Returns:
+        A tuple of the flattened state_dict and a mapping from each new key back to its original object path.
+    N.B. The new keys are derived from the object paths, joined by dot.
+        For example: ``{ 'a': {'b':...}}`` results in the key `a.b`.
+    """
+    flattened: STATE_DICT_TYPE = {}
+    mappings: FLATTEN_MAPPING = {}
+
+    def flat_copy(path: OBJ_PATH, value: STATE_DICT_ITEM) -> None:
+        new_fqn = ".".join(map(str, path))
+        if new_fqn in flattened:
+            raise ValueError(f"duplicated flatten key {new_fqn}")
+        flattened[new_fqn] = value
+        mappings[new_fqn] = path
+
+    traverse_state_dict(state_dict, flat_copy)
+    return flattened, mappings
+
+
+def unflatten_state_dict(
+    state_dict: STATE_DICT_TYPE, mapping: FLATTEN_MAPPING
+) -> STATE_DICT_TYPE:
+    """Restore the original nested state_dict according to ``mapping`` and the flattened ``state_dict``."""
+    nested: STATE_DICT_TYPE = {}
+    for key, value in state_dict.items():
+        set_element(nested, mapping[key], value)
+    return nested
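A dependency-free sketch of the dot-joined flattening these helpers perform. It handles only nested dicts, whereas the real implementation also traverses lists via ``traverse_state_dict``; the function names ``flatten``/``unflatten`` are local stand-ins.

```python
def flatten(d, prefix=()):
    # Walk the nested dict, recording dot-joined keys and their original paths.
    out, mapping = {}, {}
    for k, v in d.items():
        path = prefix + (k,)
        if isinstance(v, dict):
            sub, sub_map = flatten(v, path)
            out.update(sub)
            mapping.update(sub_map)
        else:
            key = ".".join(map(str, path))
            out[key] = v
            mapping[key] = path
    return out, mapping


def unflatten(flat, mapping):
    # Rebuild the nested structure from the recorded paths.
    nested = {}
    for key, value in flat.items():
        node = nested
        *parents, leaf = mapping[key]
        for p in parents:
            node = node.setdefault(p, {})
        node[leaf] = value
    return nested


flat, mapping = flatten({"a": {"b": 1}, "c": 2})
assert flat == {"a.b": 1, "c": 2}
assert unflatten(flat, mapping) == {"a": {"b": 1}, "c": 2}
```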
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/_sharded_tensor_utils.py b/MLPY/Lib/site-packages/torch/distributed/checkpoint/_sharded_tensor_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..27807cfc768979c6c23b53bd6facd4c254c7ecbd
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/checkpoint/_sharded_tensor_utils.py
@@ -0,0 +1,103 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+
+import copy
+
+import torch.distributed as dist
+from torch.distributed._shard.sharded_tensor import Shard, ShardedTensor, ShardMetadata
+from torch.distributed._shard.sharded_tensor.metadata import ShardedTensorMetadata
+from torch.distributed.checkpoint.metadata import STATE_DICT_TYPE
+from torch.distributed.remote_device import _remote_device
+
+from ._traverse import OBJ_PATH, set_element, STATE_DICT_ITEM, traverse_state_dict
+from .utils import _element_wise_add, _normalize_device_info
+
+
+# TODO: We need to refactor this code.
+def _flatten_sharded_tensors(state_dict: STATE_DICT_TYPE) -> STATE_DICT_TYPE:
+    r"""
+    Transform ``state_dict`` by flattening all nested ShardedTensor instances found.
+
+    The resulting ShardedTensor instances are only correct with respect to the local shard and
+    MUST NOT be used for anything other than checkpointing, as no operator will work with them.
+
+    This function should be used in conjunction with a state_dict produced by FSDP's
+    StateDictType.SHARDED_STATE_DICT methods.
+    """
+    new_state_dict: STATE_DICT_TYPE = {}
+
+    def rewrite_dict(path: OBJ_PATH, value: STATE_DICT_ITEM) -> None:
+        if not isinstance(value, ShardedTensor):
+            set_element(new_state_dict, path, value)
+            return
+        shards = value.local_shards()
+
+        if len(shards) == 0:
+            return
+        if len(shards) != 1:
+            set_element(new_state_dict, path, value)
+            return
+
+        outer_shard = shards[0]
+
+        inner_st = outer_shard.tensor
+        if not isinstance(inner_st, ShardedTensor):
+            set_element(new_state_dict, path, value)
+            return
+
+        if len(inner_st.local_shards()) != 1:
+            raise ValueError("Cannot handle inner tensor with more than 1 shard")
+        inner_shard = inner_st.local_shards()[0]
+
+        local_shards = [
+            Shard(
+                tensor=inner_shard.tensor,
+                metadata=ShardMetadata(
+                    shard_offsets=_element_wise_add(
+                        outer_shard.metadata.shard_offsets,
+                        inner_shard.metadata.shard_offsets,
+                    ),
+                    shard_sizes=inner_shard.metadata.shard_sizes,
+                    placement=f"rank:{dist.get_rank()}/{inner_shard.tensor.device}",
+                ),
+            )
+        ]
+
+        st_meta: ShardedTensorMetadata = copy.deepcopy(value.metadata())
+        other_rank = 0 if dist.get_rank() > 0 else 1
+        device_info = _normalize_device_info(inner_shard.tensor.device.type, 0)
+
+        # Remove the outer ST shard the inner ST covers
+        for i, shard_md in enumerate(st_meta.shards_metadata):
+            if shard_md.shard_offsets == outer_shard.metadata.shard_offsets:
+                st_meta.shards_metadata.pop(i)
+                break
+
+        # Attribute the remaining shards to the other rank
+        for shard_md in st_meta.shards_metadata:
+            shard_md.placement = _remote_device(f"rank:{other_rank}/{device_info}")
+
+        # Add other inner shards from the inner tensor
+        for inner_md in inner_st.metadata().shards_metadata:
+            if inner_md.shard_offsets != inner_shard.metadata.shard_offsets:
+                st_meta.shards_metadata.append(
+                    ShardMetadata(
+                        shard_offsets=_element_wise_add(
+                            outer_shard.metadata.shard_offsets,
+                            inner_md.shard_offsets,
+                        ),
+                        shard_sizes=inner_md.shard_sizes,
+                        placement=f"rank:{other_rank}/{device_info}",
+                    )
+                )
+
+        # Finally add this shard
+        st_meta.shards_metadata.append(local_shards[0].metadata)
+
+        st = ShardedTensor._init_from_local_shards_and_global_metadata(
+            local_shards=local_shards,
+            sharded_tensor_metadata=st_meta,
+        )
+        set_element(new_state_dict, path, st)
+
+    traverse_state_dict(state_dict, rewrite_dict)
+    return new_state_dict
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/_storage_utils.py b/MLPY/Lib/site-packages/torch/distributed/checkpoint/_storage_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5d66e1a605270766893c21235efc63930cc7b79
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/checkpoint/_storage_utils.py
@@ -0,0 +1,50 @@
+import os
+from typing import List, Type, Union
+
+from .filesystem import FileSystemReader, FileSystemWriter
+
+from .storage import StorageReader, StorageWriter
+
+
+def _storage_setup(
+    storage: Union[StorageReader, StorageWriter, None],
+    checkpoint_id: Union[str, os.PathLike, None],
+    reader: bool = False,
+) -> Union[None, StorageReader, StorageWriter]:
+    if storage:
+        if checkpoint_id is not None:
+            storage.reset(checkpoint_id)
+        return storage
+
+    if not checkpoint_id:
+        raise RuntimeError(
+            "`checkpoint_id` must be specified if "
+            "storage_reader/storage_writer is None."
+        )
+
+    targets: List[Type[Union[StorageReader, StorageWriter]]] = []
+    if reader:
+        targets = [
+            FileSystemReader,
+        ]
+    else:
+        targets = [
+            FileSystemWriter,
+        ]
+    try:
+        from .fsspec import FsspecReader, FsspecWriter
+
+        targets.append(FsspecReader if reader else FsspecWriter)
+    except Exception:
+        pass
+
+    for target in targets:
+        if target.validate_checkpoint_id(checkpoint_id):
+            storage = target(checkpoint_id)  # type: ignore[call-arg]
+            storage.reset(checkpoint_id)
+            return storage
+
+    raise RuntimeError(
+        "Cannot detect which StorageReader or StorageWriter to use. "
+        "Please specify the storage_reader/storage_writer."
+    )
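+
+
+# Editor's sketch (not part of the upstream module): _storage_setup resolves a reader or
+# writer from a plain filesystem path when no explicit storage object is given. The
+# checkpoint path below is hypothetical and assumed writable.
+#
+#     writer = _storage_setup(None, checkpoint_id="/tmp/ckpt", reader=False)
+#     assert isinstance(writer, FileSystemWriter)
+#
+#     reader = _storage_setup(None, checkpoint_id="/tmp/ckpt", reader=True)
+#     assert isinstance(reader, FileSystemReader)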
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/_traverse.py b/MLPY/Lib/site-packages/torch/distributed/checkpoint/_traverse.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b40729b282424b127cf009cde3532abaf3792ff
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/checkpoint/_traverse.py
@@ -0,0 +1,167 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+from typing import (
+    Callable,
+    cast,
+    Collection,
+    List,
+    Mapping,
+    MutableMapping,
+    Optional,
+    Tuple,
+    TypeVar,
+    Union,
+)
+
+import torch
+from torch.distributed._shard.sharded_tensor.api import ShardedTensor
+from torch.distributed._tensor import DTensor
+from torch.distributed.checkpoint.metadata import STATE_DICT_TYPE
+
+PATH_ITEM = Union[str, int]
+OBJ_PATH = Tuple[PATH_ITEM, ...]
+T = TypeVar("T")
+
+STATE_DICT_ITEM = object
+CONTAINER_TYPE = MutableMapping[PATH_ITEM, STATE_DICT_ITEM]
+
+__all__ = ["traverse_state_dict", "set_element", "get_element", "print_tensor"]
+
+
+def _keep_visiting_tensors(value: STATE_DICT_ITEM) -> bool:
+    return isinstance(value, torch.Tensor)
+
+
+# TODO: update docstring for traverse.py
+def traverse_state_dict(
+    state_dict: STATE_DICT_TYPE,
+    visitor: Callable[[OBJ_PATH, STATE_DICT_ITEM], None],
+    keep_traversing: Callable[[STATE_DICT_ITEM], bool] = _keep_visiting_tensors,
+) -> None:
+    """
+    Invoke ``visitor`` for each value recursively in ``state_dict``.
+
+    Traversal is short-circuited when it finds a collection for which ``keep_traversing`` evaluates
+    to false for all elements.
+    By default, all collections with at least one ``torch.Tensor`` element are traversed.
+    ``visitor`` takes a path argument that is a tuple of the keys used to reach it.
+    """
+
+    # a value is terminal if it has no container values inside it
+    def _is_terminal(value: STATE_DICT_ITEM) -> bool:
+        values: Collection[STATE_DICT_ITEM]
+        if isinstance(value, Mapping):
+            values = value.values()
+        elif isinstance(value, list):
+            values = value
+        else:
+            return True
+
+        for entry in values:
+            if isinstance(entry, (Mapping, list)) and not _is_terminal(entry):
+                return False
+            if keep_traversing is not None and keep_traversing(entry):
+                return False
+        return True
+
+    def _traverse_obj(path: OBJ_PATH, value: STATE_DICT_ITEM) -> None:
+        if _is_terminal(value):
+            visitor(path, value)
+        elif isinstance(value, Mapping):
+            for k, v in value.items():
+                _traverse_obj(path + (str(k),), v)
+        elif isinstance(value, list):
+            for i, v in enumerate(value):
+                _traverse_obj(path + (i,), v)
+
+    for key, value in state_dict.items():
+        _traverse_obj((str(key),), value)
+
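+# Editor's sketch (not part of the upstream module): collecting every leaf that
+# traverse_state_dict visits, together with its object path.
+#
+#     found = {}
+#     traverse_state_dict(
+#         {"opt": {"step": torch.tensor(3)}, "lr": 0.1},
+#         lambda path, value: found.setdefault(path, value),
+#     )
+#     # found == {("opt", "step"): tensor(3), ("lr",): 0.1}
+#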
+
+def set_element(
+    root_dict: STATE_DICT_TYPE, path: OBJ_PATH, value: STATE_DICT_ITEM
+) -> None:
+    """Set ``value`` in ``root_dict`` along the ``path`` object path."""
+    cur_container = cast(CONTAINER_TYPE, root_dict)
+
+    def extend_list(lst: List[STATE_DICT_ITEM], idx: int) -> None:
+        while len(lst) <= idx:
+            lst.append(None)
+
+    for i in range(1, len(path)):
+        prev_key = path[i - 1]
+        key = path[i]
+        def_val = cast(STATE_DICT_ITEM, {} if type(key) == str else [])
+
+        if isinstance(cur_container, Mapping):
+            cur_container = cast(
+                CONTAINER_TYPE, cur_container.setdefault(prev_key, def_val)
+            )
+        else:
+            extend_list(cur_container, prev_key)
+            if cur_container[prev_key] is None:
+                cur_container[prev_key] = def_val
+            cur_container = cur_container[prev_key]
+
+    key = path[-1]
+    if type(key) == int:
+        extend_list(cast(List[STATE_DICT_ITEM], cur_container), key)
+
+    cur_container[key] = value
+
+
+def get_element(
+    root_dict: STATE_DICT_TYPE,
+    path: OBJ_PATH,
+    default_value: Optional[T] = None,
+) -> Optional[T]:
+    """Retrieve the value at ``path`` from ``root_dict``, returning ``default_value`` if not found."""
+    cur_value = cast(CONTAINER_TYPE, root_dict)
+    for part in path:
+        if type(part) is int:
+            if not isinstance(cur_value, list) or len(cur_value) <= part:
+                return default_value
+        elif not isinstance(cur_value, Mapping) or part not in cur_value:
+            return default_value
+
+        cur_value = cast(CONTAINER_TYPE, cur_value[part])
+    return cast(Optional[T], cur_value)
+
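+# Editor's sketch (not part of the upstream module): building and querying a nested
+# structure with set_element/get_element; integer path components index into lists.
+#
+#     d: STATE_DICT_TYPE = {}
+#     set_element(d, ("a", "b", 1), 42)
+#     # d == {"a": {"b": [None, 42]}}
+#     assert get_element(d, ("a", "b", 1)) == 42
+#     assert get_element(d, ("a", "missing"), default_value=0) == 0
+#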
+
+def _print_nested(
+    value: STATE_DICT_ITEM,
+    prefix: str = "",
+    print_fun: Callable[[str], None] = print,
+) -> None:
+    if type(value) is ShardedTensor:
+        print_fun(f"{prefix} ShardedTensor size: {value.size()}")
+        for shard in value.local_shards():
+            _print_nested(
+                shard.tensor,
+                f"{shard.metadata.shard_offsets} ",
+                print_fun=print_fun,
+            )
+    elif type(value) is DTensor:
+        print_fun(f"{prefix} DistributedTensor size: {value.size()}")
+        # TODO: add local offset for _local_tensor in print_nested.
+        _print_nested(
+            value._local_tensor,
+            print_fun=print_fun,
+        )
+    elif isinstance(value, torch.Tensor):
+        print_fun(f"{prefix} Tensor size: {value.size()}")
+    else:
+        print_fun(f"{prefix} Type: {type(value)}")
+
+
+def print_tensor(
+    path: OBJ_PATH,
+    value: STATE_DICT_ITEM,
+    print_fun: Callable[[str], None] = print,
+) -> None:
+    """
+    Use this callback with traverse_state_dict to print its content.
+
+    By default the content is printed using the builtin ``print``, but this can
+    be changed by passing a different ``print_fun`` callable.
+    """
+    _print_nested(value, prefix=str(path), print_fun=print_fun)
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/api.py b/MLPY/Lib/site-packages/torch/distributed/checkpoint/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..a08215d4751ae7b3594f4e4ac67d72c3d805a3b4
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/checkpoint/api.py
@@ -0,0 +1,41 @@
+import traceback as tb
+from typing import Any, Dict, Tuple
+
+WRAPPED_EXCEPTION = Tuple[BaseException, tb.StackSummary]
+
+__all__ = ["CheckpointException"]
+
+
+def _wrap_exception(exc: BaseException) -> WRAPPED_EXCEPTION:
+    return (exc, tb.extract_tb(exc.__traceback__))
+
+
+def _is_wrapped_exception(obj: Any) -> bool:
+    if not isinstance(obj, tuple):
+        return False
+    if len(obj) != 2:
+        return False
+    return isinstance(obj[0], BaseException) and isinstance(obj[1], tb.StackSummary)
+
+
+class CheckpointException(BaseException):
+    """Exception raised if failure was detected as part of a checkpoint load or save."""
+
+    def __init__(self, msg: str, failures: Dict[int, WRAPPED_EXCEPTION]):
+        super().__init__(msg, failures)
+        self._failures = failures
+
+    @property
+    def failures(self) -> Dict[int, WRAPPED_EXCEPTION]:
+        """Return a dictionary mapping node ranks to their associated exceptions in case of failure."""
+        return self._failures
+
+    def __str__(self):
+        msg = f"CheckpointException ranks:{self._failures.keys()}\n"
+        for rank, exc_pair in self._failures.items():
+            exc, trace = exc_pair
+            msg += f"Traceback (most recent call last): (RANK {rank})\n"
+            if trace is not None:
+                msg += "".join(tb.format_list(trace))
+            msg += "".join(tb.format_exception_only(type(exc), value=exc))
+        return msg
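+
+
+# Editor's sketch (not part of the upstream module): how a per-rank failure is typically
+# wrapped and surfaced through CheckpointException.
+#
+#     try:
+#         raise RuntimeError("storage write failed")
+#     except RuntimeError as e:
+#         failures = {0: _wrap_exception(e)}  # rank 0 failed
+#     raise CheckpointException("checkpoint save failed", failures)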
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/default_planner.py b/MLPY/Lib/site-packages/torch/distributed/checkpoint/default_planner.py
new file mode 100644
index 0000000000000000000000000000000000000000..df49dfca05ec1c6e1c2a5ec20669df7b64194ebc
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/checkpoint/default_planner.py
@@ -0,0 +1,420 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+
+import dataclasses
+import io
+import logging
+import operator
+from collections import ChainMap
+from functools import reduce
+from typing import Any, cast, Dict, List, Optional, Tuple, Union
+
+import torch
+from torch.distributed._shard._utils import narrow_tensor_by_index
+from torch.distributed._tensor import DTensor
+from torch.distributed.checkpoint._dedup_save_plans import dedup_save_plans
+from torch.distributed.checkpoint._nested_dict import (
+    FLATTEN_MAPPING,
+    flatten_state_dict,
+)
+from torch.distributed.checkpoint._sharded_tensor_utils import _flatten_sharded_tensors
+from torch.distributed.checkpoint._traverse import set_element
+from torch.distributed.checkpoint.metadata import (
+    BytesStorageMetadata,
+    ChunkStorageMetadata,
+    Metadata,
+    MetadataIndex,
+    STATE_DICT_TYPE,
+    STORAGE_TYPES,
+    TensorStorageMetadata,
+)
+from torch.distributed.checkpoint.planner import (
+    LoadPlan,
+    LoadPlanner,
+    ReadItem,
+    SavePlan,
+    SavePlanner,
+    WriteItem,
+    WriteItemType,
+)
+from torch.distributed.checkpoint.planner_helpers import (
+    _create_default_metadata_only_plan,
+    _create_read_items,
+    _create_write_items,
+    _init_state_dict,
+)
+from torch.distributed.checkpoint.utils import find_state_dict_object
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+__all__ = [
+    "DefaultSavePlanner",
+    "DefaultLoadPlanner",
+    "create_default_local_load_plan",
+    "create_default_global_load_plan",
+    "create_default_local_save_plan",
+    "create_default_global_save_plan",
+]
+
+
+# TODO: Update docstrings for default_planner.py
+class DefaultSavePlanner(SavePlanner):
+    mappings: FLATTEN_MAPPING
+
+    def __init__(
+        self,
+        flatten_state_dict: bool = True,
+        flatten_sharded_tensors: bool = True,
+        dedup_replicated_tensors: Optional[bool] = None,
+    ) -> None:
+        self.flatten_state_dict = flatten_state_dict
+        self.flatten_sharded_tensors = flatten_sharded_tensors
+        self.mappings = {}
+
+        if dedup_replicated_tensors is not None:
+            logger.warning(
+                "DefaultSavePlanner's `dedup_replicated_tensors` argument is being "
+                "deprecated, and no longer has any effect. Please remove this argument "
+                "from your call."
+            )
+
+    def set_up_planner(self, state_dict: STATE_DICT_TYPE, is_coordinator: bool) -> None:
+        if self.flatten_state_dict:
+            state_dict, self.mappings = flatten_state_dict(state_dict)
+        if self.flatten_sharded_tensors:
+            state_dict = _flatten_sharded_tensors(state_dict)
+        self.state_dict = state_dict
+        self.is_coordinator = is_coordinator
+
+    def create_local_plan(self) -> SavePlan:
+        plan = create_default_local_save_plan(self.state_dict, self.is_coordinator)
+        if self.flatten_state_dict:
+            plan = dataclasses.replace(plan, planner_data=self.mappings)
+        self.plan = plan
+
+        return self.plan
+
+    def create_global_plan(
+        self, all_plans: List[SavePlan]
+    ) -> Tuple[List[SavePlan], Metadata]:
+        all_plans = dedup_save_plans(all_plans)
+
+        global_plan, metadata = create_default_global_save_plan(all_plans)
+
+        if self.flatten_state_dict:
+            # | does not work for Python 3.8 or older versions.
+            # merged_mappings = reduce(
+            #     lambda x, y: x | y, (p.planner_data for p in global_plan)
+            # )
+            planner_data_dict = [p.planner_data for p in global_plan]
+            merged_mappings = dict(ChainMap(*planner_data_dict))
+            metadata = dataclasses.replace(metadata, planner_data=merged_mappings)
+
+        if not _validate_global_plan(global_plan, metadata):
+            raise ValueError("Failed to validate global plan")
+
+        self.global_plan = global_plan
+        self.metadata = metadata
+
+        return self.global_plan, self.metadata
+
+    def finish_plan(self, new_plan: SavePlan) -> SavePlan:
+        self.plan = new_plan
+        return new_plan
+
+    def resolve_data(self, write_item: WriteItem) -> Union[torch.Tensor, io.BytesIO]:
+        object = self.lookup_object(write_item.index)
+        return self.transform_object(write_item, object)
+
+    def lookup_object(self, index: MetadataIndex) -> Any:
+        """Extension from the planner interface to make it easy to extend the default planner."""
+        return find_state_dict_object(self.state_dict, index)
+
+    def transform_object(self, write_item: WriteItem, object: Any):
+        """Extension from the planner interface to make it easy to extend the default planner."""
+        if write_item.type == WriteItemType.BYTE_IO:
+            bytes = io.BytesIO()
+            torch.save(object, bytes)
+            object = bytes
+        return object
+
+
+class DefaultLoadPlanner(LoadPlanner):
+    """
+    DefaultLoadPlanner that adds multiple features on top of LoadPlanner.
+
+    In particular it adds the following:
+
+    flatten_state_dict: Handle state_dict with nested dicts
+    flatten_sharded_tensors: For FSDP in 2D parallel mode
+    """
+
+    original_state_dict: STATE_DICT_TYPE
+    mappings: FLATTEN_MAPPING
+
+    def __init__(
+        self,
+        flatten_state_dict: bool = True,
+        flatten_sharded_tensors: bool = True,
+    ) -> None:
+        self.flatten_state_dict = flatten_state_dict
+        self.flatten_sharded_tensors = flatten_sharded_tensors
+        self.original_state_dict = {}
+        self.mappings = {}
+
+    def set_up_planner(
+        self,
+        state_dict: STATE_DICT_TYPE,
+        metadata: Metadata,
+        is_coordinator: bool,
+    ) -> None:
+        _init_state_dict(state_dict)
+        self.original_state_dict = state_dict
+
+        if self.flatten_sharded_tensors:
+            state_dict = _flatten_sharded_tensors(state_dict)
+
+        if self.flatten_state_dict:
+            state_dict, self.mappings = flatten_state_dict(state_dict)
+
+        self.state_dict = state_dict
+        self.metadata = metadata
+        self.is_coordinator = is_coordinator
+
+    def create_local_plan(self) -> LoadPlan:
+        return create_default_local_load_plan(self.state_dict, self.metadata)
+
+    def create_global_plan(self, global_plan: List[LoadPlan]) -> List[LoadPlan]:
+        return create_default_global_load_plan(global_plan)
+
+    def finish_plan(self, new_plan: LoadPlan) -> LoadPlan:
+        return new_plan
+
+    def load_bytes(self, read_item: ReadItem, value: io.BytesIO) -> None:
+        if self.flatten_state_dict:
+            set_element(
+                self.original_state_dict,
+                self.mappings[read_item.dest_index.fqn],
+                torch.load(value),
+            )
+        else:
+            self.state_dict[read_item.dest_index.fqn] = torch.load(value)
+
+    def resolve_tensor(self, read_item: ReadItem):
+        tensor = self.lookup_tensor(read_item.dest_index)
+        return self.transform_tensor(read_item, tensor)
+
+    def commit_tensor(self, read_item: ReadItem, tensor: torch.Tensor) -> None:
+        pass
+
+    def lookup_tensor(self, index: MetadataIndex) -> torch.Tensor:
+        """Extension from the planner interface to make it easy to extend the default planner."""
+        return find_state_dict_object(self.state_dict, index)
+
+    def transform_tensor(self, read_item: ReadItem, tensor: torch.Tensor):
+        """Extension from the planner interface to make it easy to extend the default planner."""
+        return narrow_tensor_by_index(tensor, read_item.dest_offsets, read_item.lengths)
+
+
+def create_default_local_load_plan(
+    state_dict: Dict[str, Any],
+    metadata: Metadata,
+) -> LoadPlan:
+    """
+    Create the ``LoadPlan`` used by DefaultLoadPlanner.
+
+    It produces one read item per value in ``state_dict`` using the metadata in ``metadata``.
+
+    The default behavior is to match keys exactly between state_dict and metadata.
+    It handles resharding by issuing multiple read requests against storage in order to match
+    load requirements.
+    """
+    requests = []
+
+    for fqn, obj in state_dict.items():
+        md = metadata.state_dict_metadata[fqn]
+        # Since DTensor supports submesh, add an extra check to ensure _create_read_items()
+        # gets called only when the current rank is part of the mesh for the corresponding DTensor.
+        if isinstance(obj, DTensor):
+            if obj.device_mesh.get_coordinate() is not None:
+                requests += _create_read_items(fqn, md, obj)
+        else:
+            requests += _create_read_items(fqn, md, obj)
+
+    return LoadPlan(requests)
+
+
+def create_default_global_load_plan(
+    all_plans: List[LoadPlan],
+) -> List[LoadPlan]:
+    """
+    Create global load plan used by DefaultLoadPlanner.
+
+    The default load behavior involves no global coordination, and this function
+    currently doesn't change the local plans.
+    """
+    return all_plans
+
+
+def create_default_local_save_plan(
+    state_dict: Dict[str, Any], is_coordinator: bool
+) -> SavePlan:
+    """
+    Create the ``SavePlan`` used by DefaultSavePlanner.
+
+    On non-coordinator ranks, this function produces write items only for tensor values
+    (including ShardedTensor and DTensor) and skips non-tensor objects.
+
+    On the coordinator rank, it produces write items for all values.
+    """
+    requests = []
+    for fqn, obj in state_dict.items():
+        # Since DTensor supports submesh, add an extra check to ensure _create_write_items()
+        # gets called only when the current rank is part of the mesh for the corresponding DTensor.
+        if isinstance(obj, DTensor):
+            if obj.device_mesh.get_coordinate() is not None:
+                requests += _create_write_items(fqn, obj)
+        elif isinstance(obj, (torch.Tensor)) or is_coordinator:
+            requests += _create_write_items(fqn, obj)
+
+    return SavePlan(requests)
+
+
+def create_default_global_save_plan(
+    all_plans: List[SavePlan],
+    rewrite_index_hints: bool = True,
+) -> Tuple[List[SavePlan], Metadata]:
+    """
+    Create the global plan and metadata used by DefaultSavePlanner.
+
+    Metadata is produced by concatenating the metadata of all ``WriteItem`` from the supplied plans.
+
+    The only global planning change is to update index hints in all ``MetadataIndex`` objects if
+    ``rewrite_index_hints`` is True.
+    """
+    md: Dict[str, STORAGE_TYPES] = {}
+    new_plans = []
+    for plan in all_plans:
+        new_items = []
+        for item in plan.items:
+            if item.type != WriteItemType.SHARD:
+                assert item.index.fqn not in md
+
+            if item.type == WriteItemType.BYTE_IO:
+                md[item.index.fqn] = BytesStorageMetadata()
+                new_items.append(item)
+            else:
+                assert item.tensor_data is not None
+                tensor_md = cast(
+                    TensorStorageMetadata,
+                    md.setdefault(
+                        item.index.fqn,
+                        TensorStorageMetadata(
+                            properties=item.tensor_data.properties,
+                            size=item.tensor_data.size,
+                            chunks=[],
+                        ),
+                    ),
+                )
+                new_item = item
+                if rewrite_index_hints:
+                    new_index = dataclasses.replace(
+                        item.index, index=len(tensor_md.chunks)
+                    )
+                    new_item = dataclasses.replace(item, index=new_index)
+                new_items.append(new_item)
+
+                assert (
+                    item.tensor_data.chunk is not None
+                ), f"""
+                    Cannot create MD for tensor without bounds.
+                    FQN: {item.index.fqn}
+                """
+                tensor_md.chunks.append(item.tensor_data.chunk)
+        new_plans.append(dataclasses.replace(plan, items=new_items))
+    return (new_plans, Metadata(md))
+
+
+def _create_default_local_metadata(state_dict: STATE_DICT_TYPE) -> Metadata:
+    """Return the ``Metadata`` if DefaultSavePlanner was used to checkpoint ``state_dict``."""
+    plan = _create_default_metadata_only_plan(state_dict)
+    _, md = create_default_global_save_plan([plan])
+    return md
+
+
+def _check_box_overlap(box0: ChunkStorageMetadata, box1: ChunkStorageMetadata) -> bool:
+    """Check whether two boxes, described by their ``offsets`` and ``sizes``, overlap."""
+    # For each dim of each shard, check if one shard resides on the other
+    # end of second shard with respect to that dim. As an example for a 2D
+    # shard, we would check if one shard is above or on the left of the
+    # other shard.
+    ndims = len(box0.offsets)
+    for i in range(ndims):
+        if box0.offsets[i] >= box1.offsets[i] + box1.sizes[i]:
+            return False
+        if box1.offsets[i] >= box0.offsets[i] + box0.sizes[i]:
+            return False
+
+    return True
+
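+# Editor's sketch (not part of the upstream module): two 2x2 chunks that share the cell
+# at (1, 1) overlap, while a chunk starting where the other ends in every dim does not.
+#
+#     a = ChunkStorageMetadata(offsets=torch.Size([0, 0]), sizes=torch.Size([2, 2]))
+#     b = ChunkStorageMetadata(offsets=torch.Size([1, 1]), sizes=torch.Size([2, 2]))
+#     c = ChunkStorageMetadata(offsets=torch.Size([2, 2]), sizes=torch.Size([2, 2]))
+#     assert _check_box_overlap(a, b) is True
+#     assert _check_box_overlap(a, c) is False
+#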
+
+def _check_box_bounds(
+    outer_box_size: torch.Size, inner_box: ChunkStorageMetadata
+) -> bool:
+    for i in range(len(outer_box_size)):
+        if inner_box.offsets[i] < 0:
+            return False
+        if inner_box.sizes[i] < 0:
+            return False
+        if inner_box.offsets[i] + inner_box.sizes[i] > outer_box_size[i]:
+            return False
+
+    return True
+
+
+def _validate_global_plan(global_plan: List[SavePlan], metadata: Metadata) -> bool:
+    all_good = True
+    for key, value in metadata.state_dict_metadata.items():
+        if isinstance(value, BytesStorageMetadata):
+            continue
+        if len(value.size) == 0:
+            continue
+        chunks_volume = 0
+        for chunk_idx, chunk0 in enumerate(value.chunks):
+            # Compute the volume
+            if not _check_box_bounds(value.size, chunk0):
+                logger.warning(
+                    """
+                        key:%s has out of bounds chunk:
+                        tensor-size:%s chunk: %s
+                    """,
+                    key,
+                    value.size,
+                    chunk0,
+                )
+                all_good = False
+            chunks_volume += reduce(operator.mul, chunk0.sizes, 1)
+
+            # Check for overlap
+            for chunk1 in value.chunks[chunk_idx + 1 :]:
+                if _check_box_overlap(chunk0, chunk1):
+                    logger.warning(
+                        "key:%s has overlapping chunks: %s %s", key, chunk0, chunk1
+                    )
+                    all_good = False
+
+        # Check whether the combined chunks cover the whole tensor
+        tensor_volume = reduce(operator.mul, value.size, 1)
+        if chunks_volume != tensor_volume:
+            logger.warning(
+                """
+                    key:%s invalid fill tensor-volume:
+                    %s chunks-volume: %s
+                """,
+                key,
+                tensor_volume,
+                chunks_volume,
+            )
+            all_good = False
+
+    return all_good
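+
+
+# Editor's sketch (not part of the upstream module): saving a state_dict with the default
+# planner through the package-level entry point. Assumes torch.distributed.checkpoint is
+# importable as ``dcp`` in this release; the checkpoint directory below is hypothetical,
+# and the call falls back to no-dist mode when no process group is initialized.
+#
+#     import torch
+#     import torch.distributed.checkpoint as dcp
+#
+#     dcp.save(
+#         {"weight": torch.zeros(4)},
+#         storage_writer=dcp.FileSystemWriter("/tmp/ckpt"),
+#         planner=DefaultSavePlanner(),
+#     )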
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/filesystem.py b/MLPY/Lib/site-packages/torch/distributed/checkpoint/filesystem.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3f7b081ee6408bebed6974c670429683a46784b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/checkpoint/filesystem.py
@@ -0,0 +1,618 @@
+import collections
+import dataclasses
+import io
+import os
+import pickle
+import queue
+import threading
+from abc import ABC, abstractmethod
+from contextlib import contextmanager
+from dataclasses import dataclass
+from pathlib import Path
+from typing import (
+    Callable,
+    cast,
+    Dict,
+    Generator,
+    IO,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+    Union,
+)
+
+import torch
+from torch import Tensor
+from torch._utils import _get_available_device_type, _get_device_module
+from torch.distributed._shard._utils import narrow_tensor_by_index
+from torch.futures import Future
+
+from .metadata import Metadata, MetadataIndex
+from .planner import (
+    LoadItemType,
+    LoadPlan,
+    LoadPlanner,
+    ReadItem,
+    SavePlan,
+    SavePlanner,
+    WriteItem,
+    WriteItemType,
+)
+from .storage import StorageReader, StorageWriter, WriteResult
+from .utils import _create_file_view
+
+__all__ = ["FileSystemWriter", "FileSystemReader"]
+
+
+@dataclass
+class _StorageInfo:
+    """This is the per entry storage info."""
+
+    relative_path: str
+    offset: int
+    length: int
+
+
+@dataclass
+class _StoragePrefix:
+    prefix: str
+
+
+DEFAULT_SUFFIX = ".distcp"
+
+
+class _TensorLoader(ABC):
+    @abstractmethod
+    def add(self, size: int, obj: object) -> None:
+        pass
+
+    @abstractmethod
+    def start_loading(self) -> None:
+        pass
+
+    @abstractmethod
+    def values(self) -> Iterator[Tuple[torch.Tensor, object]]:
+        pass
+
+
+class _SerialCpuLoader(_TensorLoader):
+    def __init__(self, resolve_fun: Callable) -> None:
+        self.resolve_fun = resolve_fun
+        self.items: List[Tuple[int, object]] = []
+
+    def add(self, size: int, obj: object) -> None:
+        self.items.append((size, obj))
+
+    def start_loading(self) -> None:
+        pass
+
+    def values(self) -> Iterator[Tuple[torch.Tensor, object]]:
+        for _, obj in self.items:
+            tensor = self.resolve_fun(obj).detach()
+            tensor = tensor.cpu()
+            if tensor.storage().size() != tensor.numel():
+                tensor = tensor.clone()
+            yield (
+                tensor,
+                obj,
+            )
+
+
+class _OverlappingCpuLoader(_TensorLoader):
+    def __init__(
+        self,
+        resolve_fun: Callable,
+        stream: Optional[torch.Stream] = None,
+        inflight_threshhold: int = 1_000_000,
+    ) -> None:
+        self.resolve_fun = resolve_fun
+        self.items: List[Tuple[int, object]] = []
+        self.inflight_threshhold = inflight_threshhold
+        self.in_flight_data = 0
+        self.current_items: collections.deque = collections.deque()
+        self.idx = 0
+        self.started = False
+        self.device_type = (
+            stream.device_type if stream else _get_available_device_type()
+        )
+        self.device_module = _get_device_module(self.device_type)
+        self.stream = cast(
+            torch.cuda.Stream, stream or self.device_module.current_stream()
+        )
+        if self.stream != self.device_module.current_stream():
+            self.stream.wait_stream(self.device_module.current_stream())
+
+    @property
+    def _done(self) -> bool:
+        return self.idx >= len(self.items)
+
+    def _drain(self) -> List[Tuple[torch.Tensor, object]]:
+        drained = []
+        if self.in_flight_data >= self.inflight_threshhold:
+            self.stream.synchronize()
+        while self.in_flight_data >= self.inflight_threshhold:
+            val = self.current_items.popleft()
+            self.in_flight_data -= val[0].numel() * val[0].element_size()
+            drained.append(val)
+        return drained
+
+    def _refill(self) -> None:
+        with self.device_module.stream(self.stream):
+            while not self._done and self.in_flight_data < self.inflight_threshhold:
+                _, obj = self.items[self.idx]
+                self.idx += 1
+                tensor = self.resolve_fun(obj).detach()
+                if tensor.device.type == self.device_type:
+                    tensor = tensor.to(device="cpu", non_blocking=True)
+                elif tensor.device == torch.device("cpu"):
+                    if (
+                        tensor.untyped_storage().size()
+                        != tensor.numel() * tensor.itemsize
+                    ):
+                        # this forces the tensor to be both contiguous and with minimal storage
+                        tensor = tensor.clone()
+
+                self.current_items.append(
+                    (
+                        tensor,
+                        obj,
+                    )
+                )
+                self.in_flight_data += tensor.numel() * tensor.element_size()
+
+    def _finish(self) -> Iterable[Tuple[torch.Tensor, object]]:
+        assert self._done
+        if len(self.current_items) > 0:
+            self.stream.synchronize()
+        return self.current_items
+
+    def add(self, size: int, obj: object) -> None:
+        if self.started:
+            raise RuntimeError("cannot add items after loading started")
+        self.items.append((size, obj))
+
+    def start_loading(self) -> None:
+        if self.started:
+            return
+        self.started = True
+        self.items.sort(key=lambda x: x[0])
+        self._refill()
+
+    def values(self) -> Iterator[Tuple[torch.Tensor, object]]:
+        self.start_loading()
+        while not self._done:
+            drained = self._drain()
+            self._refill()
+            yield from drained
+
+        yield from self._finish()
+
+
+def _item_size(item: WriteItem) -> int:
+    size = 1
+    assert item.tensor_data is not None
+    # can't use math.prod as PT needs to support older python
+    for s in item.tensor_data.size:
+        size *= s
+
+    dtype = item.tensor_data.properties.dtype
+    return size * torch._utils._element_size(dtype)
+
+
+def _split_by_size_and_type(bins: int, items: List[WriteItem]) -> List[List[WriteItem]]:
+    if bins == 1:
+        return [items]
+
+    bytes_w = [wi for wi in items if wi.type == WriteItemType.BYTE_IO]
+    tensor_w = [wi for wi in items if wi.type != WriteItemType.BYTE_IO]
+
+    buckets: List[List[WriteItem]] = [[] for _ in range(bins)]
+    bucket_sizes = [0 for _ in range(bins)]
+
+    tensor_w.sort(key=_item_size, reverse=True)
+
+    for i, wi in enumerate(bytes_w):
+        buckets[i % bins].append(wi)
+
+    for wi in tensor_w:
+        # TODO replace with heapq
+        idx = min(enumerate(bucket_sizes), key=lambda x: x[1])[0]
+        buckets[idx].append(wi)
+        bucket_sizes[idx] += _item_size(wi)
+
+    return buckets
+
+
+def _write_item(
+    stream: io.IOBase,
+    data: Union[io.BytesIO, torch.Tensor],
+    write_item: WriteItem,
+    storage_key: str,
+) -> WriteResult:
+    offset = stream.tell()
+
+    if write_item.type == WriteItemType.BYTE_IO:
+        assert isinstance(data, io.BytesIO)
+        stream.write(data.getbuffer())
+    else:
+        assert isinstance(data, torch.Tensor)
+        assert data.device == torch.device("cpu")
+        torch.save(data, cast(IO[bytes], stream))
+    length = stream.tell() - offset
+
+    return WriteResult(
+        index=write_item.index,
+        size_in_bytes=length,
+        storage_data=_StorageInfo(storage_key, offset, length),
+    )
+
+
+def _write_files_from_queue(
+    create_stream: Callable,
+    file_queue: queue.Queue,
+    result_queue: queue.Queue,
+    planner: SavePlanner,
+    inflight_threshhold: int,
+    use_fsync: bool,
+    thread_count: int,
+) -> None:
+    try:
+        while True:
+            file_name, storage_key, write_items = file_queue.get_nowait()
+            loader: _TensorLoader
+
+            custom_backend_name = torch._C._get_privateuse1_backend_name()
+            custom_device_mod = getattr(torch, custom_backend_name, None)
+
+            # TODO: Using the OverlappingCpuLoader with multiple threads creates significant
+            # performance degradation, observed as being related to cuda stream syncs. We
+            # should try to fix this and use _OverlappingCpuLoader for all threaded cases
+            if (
+                thread_count == 1
+                and (
+                    torch.cuda.is_available()
+                    or (custom_device_mod and custom_device_mod.is_available())
+                )
+                and inflight_threshhold > 0
+            ):
+                loader = _OverlappingCpuLoader(
+                    planner.resolve_data,
+                    inflight_threshhold=inflight_threshhold,
+                )
+            else:
+                loader = _SerialCpuLoader(
+                    planner.resolve_data,
+                )
+
+            tensor_w = [wi for wi in write_items if wi.type != WriteItemType.BYTE_IO]
+            for write_item in tensor_w:
+                loader.add(_item_size(write_item), write_item)
+            loader.start_loading()
+
+            bytes_w = [wi for wi in write_items if wi.type == WriteItemType.BYTE_IO]
+            write_results = []
+
+            with create_stream(file_name, "wb") as stream:
+                for write_item in bytes_w:
+                    data = planner.resolve_data(write_item)
+                    write_results.append(
+                        _write_item(stream, data, write_item, storage_key)
+                    )
+
+                for tensor, write_item in loader.values():
+                    assert tensor.is_cpu
+                    write_results.append(
+                        _write_item(stream, tensor, write_item, storage_key)
+                    )
+
+                if use_fsync:
+                    try:
+                        os.fsync(stream.fileno())
+                    except AttributeError:
+                        os.sync()
+            result_queue.put(write_results)
+    except queue.Empty:
+        pass
+
+
+class FileSystemBase(ABC):
+    @contextmanager
+    @abstractmethod
+    def create_stream(
+        self, path: Union[str, os.PathLike], mode: str
+    ) -> Generator[io.IOBase, None, None]:
+        ...
+
+    @abstractmethod
+    def concat_path(
+        self, path: Union[str, os.PathLike], suffix: str
+    ) -> Union[str, os.PathLike]:
+        ...
+
+    @abstractmethod
+    def rename(
+        self, path: Union[str, os.PathLike], new_path: Union[str, os.PathLike]
+    ) -> None:
+        ...
+
+    @abstractmethod
+    def init_path(self, path: Union[str, os.PathLike]) -> Union[str, os.PathLike]:
+        ...
+
+    @abstractmethod
+    def mkdir(self, path: Union[str, os.PathLike]) -> None:
+        ...
+
+    @classmethod
+    @abstractmethod
+    def validate_checkpoint_id(cls, checkpoint_id: Union[str, os.PathLike]) -> bool:
+        ...
+
+
+class FileSystem(FileSystemBase):
+    @contextmanager
+    def create_stream(
+        self, path: Union[str, os.PathLike], mode: str
+    ) -> Generator[io.IOBase, None, None]:
+        with cast(Path, path).open(mode) as stream:
+            yield cast(io.IOBase, stream)
+
+    def concat_path(
+        self, path: Union[str, os.PathLike], suffix: str
+    ) -> Union[str, os.PathLike]:
+        return cast(Path, path) / suffix
+
+    def init_path(self, path: Union[str, os.PathLike]) -> Union[str, os.PathLike]:
+        if not isinstance(path, Path):
+            path = Path(path)
+        return path
+
+    def rename(
+        self, path: Union[str, os.PathLike], new_path: Union[str, os.PathLike]
+    ) -> None:
+        cast(Path, path).rename(cast(Path, new_path))
+
+    def mkdir(self, path: Union[str, os.PathLike]) -> None:
+        cast(Path, path).mkdir(parents=True, exist_ok=True)
+
+    @classmethod
+    def validate_checkpoint_id(cls, checkpoint_id: Union[str, os.PathLike]) -> bool:
+        if isinstance(checkpoint_id, Path):
+            return True
+
+        if "://" in str(checkpoint_id):
+            return False
+
+        for p in Path(checkpoint_id).parents:
+            if p.exists() and os.access(str(p), os.W_OK):
+                return True
+
+        return False
+
+
+class FileSystemWriter(StorageWriter):
+    """
+    Basic implementation of StorageWriter using file IO.
+
+    This implementation makes the following assumptions and simplifications:
+
+    * The checkpoint path is an empty or non-existing directory.
+    * File creation is atomic
+
+    The checkpoint consists of one file per write request plus
+    a `.metadata` file with the serialized metadata.
+
+    """
+
+    def __init__(
+        self,
+        path: Union[str, os.PathLike],
+        single_file_per_rank: bool = True,
+        sync_files: bool = True,
+        thread_count: int = 1,
+        per_thread_copy_ahead: int = 10_000_000,
+    ) -> None:
+        """
+        Initialize the writer pointing to `path`.
+
+        Args:
+            path: directory where the checkpoint will be written to.
+            single_file_per_rank: Produce one file per rank instead of one file per tensor/blob. Defaults to True.
+            sync_files: Force files to be synced to permanent storage. Defaults to True.
+            thread_count: Number of IO threads to use to write. Defaults to 1.
+            per_thread_copy_ahead: How many bytes to copy from the GPU ahead of saving them. Defaults to 10 MB.
+
+        N.B. If sync_files is disabled, there's no guarantee that the checkpoint will be consistent in the case of a failure.
+        """
+        super().__init__()
+        self.fs = FileSystem()
+        self.path = self.fs.init_path(path)
+        self.single_file_per_rank = single_file_per_rank
+        self.sync_files = sync_files
+        self.thread_count = thread_count
+        self.per_thread_copy_ahead = per_thread_copy_ahead
+
+    def reset(self, checkpoint_id: Union[str, os.PathLike, None] = None) -> None:
+        if checkpoint_id:
+            self.path = self.fs.init_path(checkpoint_id)
+
+    def set_up_storage_writer(self, is_coordinator: bool) -> None:
+        pass
+
+    def prepare_local_plan(self, plan: SavePlan) -> SavePlan:
+        self.fs.mkdir(self.path)
+        return plan
+
+    def prepare_global_plan(self, global_plan: List[SavePlan]) -> List[SavePlan]:
+        new_plans = [
+            dataclasses.replace(plan, storage_data=_StoragePrefix(f"__{i}_"))
+            for i, plan in enumerate(global_plan)
+        ]
+        return new_plans
+
+    def write_data(
+        self,
+        plan: SavePlan,
+        planner: SavePlanner,
+    ) -> Future[List[WriteResult]]:
+        storage_plan: _StoragePrefix = plan.storage_data
+        file_count = 0
+
+        def gen_file():
+            nonlocal file_count
+            file_name = f"{storage_plan.prefix}{file_count}{DEFAULT_SUFFIX}"
+            file_count += 1
+            return file_name
+
+        file_queue: queue.Queue = queue.Queue()
+        if self.single_file_per_rank:
+            for bucket in _split_by_size_and_type(self.thread_count, plan.items):
+                file_name = gen_file()
+                path = self.fs.concat_path(self.path, file_name)
+                file_queue.put((path, file_name, bucket))
+        else:
+            for item in plan.items:
+                file_name = gen_file()
+                path = self.fs.concat_path(self.path, file_name)
+                file_queue.put((path, file_name, [item]))
+
+        result_queue: queue.Queue = queue.Queue()
+
+        threads = []
+        for _ in range(1, self.thread_count):
+            t = threading.Thread(
+                target=_write_files_from_queue,
+                args=(
+                    self.fs.create_stream,
+                    file_queue,
+                    result_queue,
+                    planner,
+                    self.per_thread_copy_ahead,
+                    self.sync_files,
+                    self.thread_count,
+                ),
+            )
+            t.start()
+            threads.append(t)
+
+        _write_files_from_queue(
+            create_stream=self.fs.create_stream,
+            file_queue=file_queue,
+            result_queue=result_queue,
+            planner=planner,
+            inflight_threshhold=self.per_thread_copy_ahead,
+            use_fsync=self.sync_files,
+            thread_count=self.thread_count,
+        )
+
+        for t in threads:
+            t.join()
+
+        res = []
+        try:
+            while True:
+                res += result_queue.get_nowait()
+        except queue.Empty:
+            pass
+
+        fut: Future[List[WriteResult]] = Future()
+        fut.set_result(res)
+        return fut
+
+    def finish(self, metadata: Metadata, results: List[List[WriteResult]]) -> None:
+        storage_md = dict()
+        for wr_list in results:
+            storage_md.update({wr.index: wr.storage_data for wr in wr_list})
+        metadata.storage_data = storage_md
+        tmp_path = cast(Path, self.fs.concat_path(self.path, ".metadata.tmp"))
+        meta_path = cast(Path, self.fs.concat_path(self.path, ".metadata"))
+        with self.fs.create_stream(tmp_path, "wb") as metadata_file:
+            pickle.dump(metadata, metadata_file)
+            if self.sync_files:
+                try:
+                    os.fsync(metadata_file.fileno())
+                except AttributeError:
+                    os.sync()
+
+        self.fs.rename(tmp_path, meta_path)
+
+    @classmethod
+    def validate_checkpoint_id(cls, checkpoint_id: Union[str, os.PathLike]) -> bool:
+        return FileSystem.validate_checkpoint_id(checkpoint_id)
+
+
+class FileSystemReader(StorageReader):
+    def __init__(self, path: Union[str, os.PathLike]) -> None:
+        super().__init__()
+        self.fs = FileSystem()
+        self.path = self.fs.init_path(path)
+        self.storage_data: Dict[MetadataIndex, _StorageInfo] = dict()
+
+    def _slice_file(self, file, sinfo: _StorageInfo) -> io.IOBase:
+        return _create_file_view(file, sinfo.offset, sinfo.length)
+
+    def reset(self, checkpoint_id: Union[str, os.PathLike, None] = None) -> None:
+        self.storage_data = dict()
+        if checkpoint_id:
+            self.path = self.fs.init_path(checkpoint_id)
+
+    def read_data(self, plan: LoadPlan, planner: LoadPlanner) -> Future[None]:
+        # group requests by file
+        per_file: Dict[str, List[ReadItem]] = dict()
+        for read_item in plan.items:
+            item_md = self.storage_data[read_item.storage_index]
+            path = item_md.relative_path
+            per_file.setdefault(path, []).append(read_item)
+
+        for relative_path, reqs in per_file.items():
+            new_path = self.fs.concat_path(self.path, relative_path)
+            with self.fs.create_stream(new_path, "rb") as stream:
+                # TODO sort by offset and cache the reading
+                for req in reqs:
+                    item_md = self.storage_data[req.storage_index]
+                    file_slice = self._slice_file(stream, item_md)
+                    if req.type == LoadItemType.BYTE_IO:
+                        read_bytes = io.BytesIO(file_slice.read(item_md.length))
+                        read_bytes.seek(0)
+                        planner.load_bytes(req, read_bytes)
+                    else:
+                        tensor = cast(
+                            Tensor,
+                            torch.load(cast(IO[bytes], file_slice), map_location="cpu"),
+                        )
+                        tensor = narrow_tensor_by_index(
+                            tensor, req.storage_offsets, req.lengths
+                        )
+                        target_tensor = planner.resolve_tensor(req).detach()
+
+                        assert (
+                            target_tensor.size() == tensor.size()
+                        ), f"req {req.storage_index} mismatch sizes {target_tensor.size()} vs {tensor.size()}"
+                        target_tensor.copy_(tensor)
+                        planner.commit_tensor(req, target_tensor)
+
+        fut: Future = Future()
+        fut.set_result(None)
+        return fut
+
+    # Implementing the abstract function in StorageReader
+    def read_metadata(self) -> Metadata:
+        path = self.fs.concat_path(self.path, ".metadata")
+        with self.fs.create_stream(path, "rb") as metadata_file:
+            return pickle.load(metadata_file)
+
+    def set_up_storage_reader(self, metadata: Metadata, is_coordinator: bool) -> None:
+        self.storage_data = metadata.storage_data
+        assert self.storage_data is not None
+
+    def prepare_local_plan(self, plan: LoadPlan) -> LoadPlan:
+        return plan
+
+    def prepare_global_plan(self, global_plan: List[LoadPlan]) -> List[LoadPlan]:
+        return global_plan
+
+    @classmethod
+    def validate_checkpoint_id(cls, checkpoint_id: Union[str, os.PathLike]) -> bool:
+        return FileSystem.validate_checkpoint_id(checkpoint_id)
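+
+
+# Editor's sketch (not part of the upstream module): loading a checkpoint written by
+# FileSystemWriter back into a pre-allocated state_dict. Assumes the package-level
+# ``dcp.load`` entry point; the checkpoint directory below is hypothetical.
+#
+#     import torch
+#     import torch.distributed.checkpoint as dcp
+#
+#     state = {"weight": torch.empty(4)}
+#     dcp.load(state, storage_reader=FileSystemReader("/tmp/ckpt"))
+#     # state["weight"] is filled in place from the checkpoint shards.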
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/format_utils.py b/MLPY/Lib/site-packages/torch/distributed/checkpoint/format_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..754ae7b5cb73e841e07f036d47e4f5ecf04e9257
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/checkpoint/format_utils.py
@@ -0,0 +1,311 @@
+import argparse
+import os
+from enum import Enum
+from typing import cast, Dict, List, Optional, Union
+
+import torch
+import torch.distributed as dist
+from torch.distributed._shard._utils import narrow_tensor_by_index
+from torch.distributed.checkpoint import FileSystemReader, FileSystemWriter
+from torch.distributed.checkpoint._nested_dict import flatten_state_dict
+from torch.distributed.checkpoint._traverse import set_element
+from torch.distributed.checkpoint.default_planner import DefaultLoadPlanner
+from torch.distributed.checkpoint.metadata import (
+    Metadata,
+    STATE_DICT_TYPE,
+    STORAGE_TYPES,
+    TensorProperties,
+    TensorStorageMetadata,
+)
+from torch.distributed.checkpoint.planner import LoadItemType, LoadPlan, LoadPlanner
+from torch.distributed.checkpoint.planner_helpers import _create_chunk_list
+from torch.distributed.checkpoint.state_dict_loader import _load_state_dict
+from torch.distributed.checkpoint.state_dict_saver import _save_state_dict
+from torch.distributed.checkpoint.storage import StorageReader
+from torch.futures import Future
+
+
+__all__ = [
+    "dcp_to_torch_save",
+    "torch_save_to_dcp",
+    "BroadcastingTorchSaveReader",
+    "DynamicMetaLoadPlanner",
+]
+
+
+class _EmptyStateDictLoadPlanner(DefaultLoadPlanner):
+    """
+    Extension of DefaultLoadPlanner, which rebuilds state_dict from the saved metadata.
+    Useful for loading in state_dict without first initializing a model, such as
+    when converting a DCP checkpoint into a Torch save file.
+
+    N.B. ``state_dict`` must be an empty dictionary when used with this LoadPlanner.
+
+    .. warning::
+        Because the entire state dict is initialized, it's recommended to only utilize
+        this LoadPlanner on a single rank or process to avoid OOM.
+
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def set_up_planner(
+        self,
+        state_dict: STATE_DICT_TYPE,
+        metadata: Metadata,
+        is_coordinator: bool,
+    ) -> None:
+        assert not state_dict
+
+        # rebuild the state dict from the metadata
+        for k, v in metadata.state_dict_metadata.items():
+            if isinstance(v, TensorStorageMetadata):
+                v = torch.empty(v.size, dtype=v.properties.dtype)  # type: ignore[assignment]
+            if k in metadata.planner_data:
+                set_element(state_dict, metadata.planner_data[k], v)
+            else:
+                state_dict[k] = v
+
+        super().set_up_planner(state_dict, metadata, is_coordinator)
+
+
+class BroadcastingTorchSaveReader(StorageReader):
+    """
+    StorageReader for reading a Torch Save file. This reader will read the entire checkpoint
+    on the coordinator rank, and then broadcast and shard each tensor to all ranks.
+
+    N.B. Intended to be used with DynamicMetaLoadPlanner.
+
+    .. warning::
+        Current implementation only supports loading Tensors.
+
+    >>> # xdoctest: +SKIP("undefined vars")
+    >>> sd = {"model": model}
+    >>> dcp.load(
+    >>>    sd,
+    >>>    storage_reader=BroadcastingTorchSaveReader(),
+    >>>    planner=DynamicMetaLoadPlanner(),
+    >>>    checkpoint_id="path_to_model.pt"
+    >>> )
+    """
+
+    def __init__(
+        self,
+        checkpoint_id: Optional[Union[str, os.PathLike]] = None,
+        coordinator_rank: int = 0,
+    ) -> None:
+        self.checkpoint_id = checkpoint_id
+        self.coordinator_rank = coordinator_rank
+
+    def read_metadata(self) -> Metadata:
+        """Extends the default StorageReader to support building the metadata file"""
+        # Metadata is built in planner.set_up_planner, since we are not actually reading metadata from
+        # the disk
+        return Metadata(state_dict_metadata={})
+
+    def read_data(self, plan: LoadPlan, planner: LoadPlanner) -> Future[None]:
+        """
+        Reads torch save data on the coordinator rank and broadcasts it afterwards;
+        this incurs a communication cost, but avoids having to load
+        the entire checkpoint on each rank, hopefully preventing OOM issues.
+        """
+        planner = cast(DefaultLoadPlanner, planner)
+
+        # data is read in on the coordinator rank, and broadcast afterwards
+        # this incurs a communication cost, but it avoids having to load
+        # the entire checkpoint on each rank, hopefully preventing OOM issues
+        # TODO: read on each host, instead of only the coordinator
+        if self.is_coordinator:
+            assert self.checkpoint_id is not None
+            torch_state_dict = torch.load(self.checkpoint_id, map_location="cpu")
+            if planner.flatten_state_dict:
+                torch_state_dict, _ = flatten_state_dict(torch_state_dict)
+        else:
+            torch_state_dict = None
+
+        for req in plan.items:
+            if req.type == LoadItemType.BYTE_IO:
+                raise RuntimeError(
+                    f"Non-tensor value identified at {req.storage_index.fqn}. "
+                    f"At this time {type(self).__name__} only supports loading Tensors."
+                )
+
+            #  Broadcast the tensor from the coordinator rank
+            if self.is_coordinator:
+                tensor = torch_state_dict[req.storage_index.fqn].cuda()
+            else:
+                tensor = torch.empty_like(planner.state_dict[req.storage_index.fqn])
+
+            dist.broadcast(tensor, src=self.coordinator_rank, async_op=False)
+
+            tensor = narrow_tensor_by_index(tensor, req.storage_offsets, req.lengths)
+            target_tensor = planner.resolve_tensor(req).detach()
+            assert target_tensor.size() == tensor.size(), (
+                f"req {req.storage_index} mismatch sizes, "
+                f"{target_tensor.size()} vs {tensor.size()}"
+            )
+            target_tensor.copy_(tensor)
+            planner.commit_tensor(req, target_tensor)
+
+        fut: Future = Future()
+        fut.set_result(None)
+        return fut
+
+    def set_up_storage_reader(self, metadata: Metadata, is_coordinator: bool) -> None:
+        """Implementation of the StorageReader method"""
+        self.is_coordinator = is_coordinator
+        if self.is_coordinator:
+            assert dist.get_rank() == self.coordinator_rank
+
+        assert self.checkpoint_id is not None
+
+    def prepare_local_plan(self, plan: LoadPlan) -> LoadPlan:
+        """Implementation of the StorageReader method"""
+        return plan
+
+    def prepare_global_plan(self, global_plan: List[LoadPlan]) -> List[LoadPlan]:
+        """Implementation of the StorageReader method"""
+        return global_plan
+
+    def reset(self, checkpoint_id: Union[str, os.PathLike, None] = None) -> None:
+        """Implementation of the StorageReader method"""
+        self.checkpoint_id = checkpoint_id
+
+    @classmethod
+    def validate_checkpoint_id(cls, checkpoint_id: Union[str, os.PathLike]) -> bool:
+        """Implementation of the StorageReader method"""
+        return os.path.isfile(checkpoint_id)
+
+
+class DynamicMetaLoadPlanner(DefaultLoadPlanner):
+    """
+    Extension of DefaultLoadPlanner, which creates a new Metadata object based on the passed in state dict,
+    avoiding the need to read metadata from disk. This is useful when reading formats which don't have a
+    metadata file, like Torch Save files.
+
+    N.B. Intended to be used with BroadcastingTorchSaveReader.
+
+    .. warning::
+        Current implementation only supports loading Tensors.
+
+    >>> # xdoctest: +SKIP("undefined vars")
+    >>> sd = {"model": model}
+    >>> dcp.load(
+    >>>    sd,
+    >>>    storage_reader=BroadcastingTorchSaveReader(),
+    >>>    planner=DynamicMetaLoadPlanner(),
+    >>>    checkpoint_id="path_to_model.pt"
+    >>> )
+    """
+
+    def set_up_planner(
+        self,
+        state_dict: STATE_DICT_TYPE,
+        metadata: Metadata,
+        is_coordinator: bool,
+    ) -> None:
+        """Setups of the planner, extnding default behavior by creating the Metadata object from the state dict"""
+        super().set_up_planner(state_dict, metadata, is_coordinator)
+
+        state_dict_metadata: Dict[str, STORAGE_TYPES] = {}
+        for key, tensor in self.state_dict.items():
+            if not torch.is_tensor(tensor):
+                raise RuntimeError(
+                    f"Non-tensor value identified at {key}. "
+                    f"At this time {type(self).__name__} only supports loading Tensors."
+                )
+
+            state_dict_metadata[key] = TensorStorageMetadata(
+                TensorProperties(dtype=tensor.dtype),
+                tensor.size(),
+                _create_chunk_list(tensor),
+            )
+        self.metadata = Metadata(state_dict_metadata=state_dict_metadata)
+
+
+def dcp_to_torch_save(
+    dcp_checkpoint_dir: Union[str, os.PathLike],
+    torch_save_path: Union[str, os.PathLike],
+):
+    """
+    Given a directory containing a DCP checkpoint, this function will convert it into a
+    Torch save file.
+
+    Args:
+        dcp_checkpoint_dir: Directory containing the DCP checkpoint.
+        torch_save_path: Filename to store the converted Torch save file.
+
+    .. warning::
+        To avoid OOM, it's recommended to only run this function on a single rank.
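+
+    A minimal illustrative sketch (the paths below are hypothetical):
+
+    >>> # xdoctest: +SKIP("requires an existing DCP checkpoint")
+    >>> dcp_to_torch_save("dcp_checkpoint_dir/", "model.pt")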
+    """
+    sd: STATE_DICT_TYPE = {}
+    _load_state_dict(
+        sd,
+        storage_reader=FileSystemReader(dcp_checkpoint_dir),
+        planner=_EmptyStateDictLoadPlanner(),
+        no_dist=True,
+    )
+    torch.save(sd, torch_save_path)
+
+
+def torch_save_to_dcp(
+    torch_save_path: Union[str, os.PathLike],
+    dcp_checkpoint_dir: Union[str, os.PathLike],
+):
+    """
+    Given the location of a torch save file, converts it into a DCP checkpoint.
+
+    Args:
+        torch_save_path: Filename of the Torch save file to convert.
+        dcp_checkpoint_dir: Directory to store the converted DCP checkpoint.
+
+    .. warning::
+        To avoid OOM, it's recommended to only run this function on a single rank.
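+
+    A minimal illustrative sketch (the paths below are hypothetical):
+
+    >>> # xdoctest: +SKIP("requires an existing torch.save checkpoint")
+    >>> torch_save_to_dcp("model.pt", "dcp_checkpoint_dir/")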
+    """
+
+    state_dict = torch.load(torch_save_path)
+    # We don't need stateful behavior here because the expectation is that anything
+    # loaded by torch.load will not contain stateful objects.
+    _save_state_dict(
+        state_dict, storage_writer=FileSystemWriter(dcp_checkpoint_dir), no_dist=True
+    )
+
+
+if __name__ == "__main__":
+
+    class FormatMode(Enum):
+        TORCH_TO_DCP = "torch_to_dcp"
+        DCP_TO_TORCH = "dcp_to_torch"
+
+    # Parse command-line arguments
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "mode",
+        type=str,
+        help="Conversion mode",
+        choices=[m.value for m in FormatMode],
+        default=FormatMode.TORCH_TO_DCP.value,
+    )
+    parser.add_argument("src", type=str, help="Path to the source model")
+    parser.add_argument("dst", type=str, help="Path to the destination model")
+    args = parser.parse_args()
+
+    print(
+        f"Converting checkpoint from {args.src} to {args.dst} using method: '{args.mode}'"
+    )
+    checkpoint_missing_warning = (
+        f"No checkpoint found at {args.src}. Skipping conversion."
+    )
+    if args.mode == FormatMode.TORCH_TO_DCP.value:
+        if os.path.isfile(args.src):
+            torch_save_to_dcp(args.src, args.dst)
+        else:
+            print(checkpoint_missing_warning)
+    elif args.mode == FormatMode.DCP_TO_TORCH.value:
+        if os.path.isdir(args.src):
+            dcp_to_torch_save(args.src, args.dst)
+        else:
+            print(checkpoint_missing_warning)
+    else:
+        raise ValueError(f"Unknown conversion mode: {args.mode}")
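+
+    # Example invocation (hypothetical paths; the exact module path of this script
+    # under torch.distributed.checkpoint is assumed):
+    #   python -m torch.distributed.checkpoint.format_utils torch_to_dcp model.pt dcp_dir/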
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/fsspec.py b/MLPY/Lib/site-packages/torch/distributed/checkpoint/fsspec.py
new file mode 100644
index 0000000000000000000000000000000000000000..f471fba89ada2a5aadf1530a3b6aed40bd68ac44
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/checkpoint/fsspec.py
@@ -0,0 +1,122 @@
+# Mypy will not try inferring the types of any 3rd party libraries installed.
+# mypy: ignore-errors
+
+import io
+import os
+from contextlib import contextmanager
+from pathlib import Path
+from typing import Generator, Optional, Union
+
+import fsspec
+from fsspec import AbstractFileSystem
+from fsspec.core import url_to_fs
+
+from torch.distributed.checkpoint.filesystem import (
+    FileSystemBase,
+    FileSystemReader,
+    FileSystemWriter,
+)
+
+__all__ = [
+    "FsspecWriter",
+    "FsspecReader",
+]
+
+
+class FileSystem(FileSystemBase):
+    def __init__(self) -> None:
+        self.fs: Optional[AbstractFileSystem] = None
+
+    @contextmanager
+    def create_stream(
+        self, path: Union[str, os.PathLike], mode: str
+    ) -> Generator[io.IOBase, None, None]:
+        assert self.fs is not None
+        with self.fs.transaction:
+            with fsspec.open(str(path), mode) as stream:
+                yield stream
+
+    def concat_path(
+        self, path: Union[str, os.PathLike], suffix: str
+    ) -> Union[str, os.PathLike]:
+        return os.path.join(path, suffix)
+
+    def init_path(self, path: Union[str, os.PathLike]) -> Union[str, os.PathLike]:
+        self.fs, _ = url_to_fs(path)
+        return path
+
+    def rename(
+        self, path: Union[str, os.PathLike], new_path: Union[str, os.PathLike]
+    ) -> None:
+        self.fs.rename(path, new_path)
+
+    def mkdir(self, path: Union[str, os.PathLike]) -> None:
+        self.fs.makedirs(path, exist_ok=True)
+
+    @classmethod
+    def validate_checkpoint_id(cls, checkpoint_id: Union[str, os.PathLike]) -> bool:
+        if isinstance(checkpoint_id, Path):
+            return False
+
+        try:
+            url_to_fs(checkpoint_id)
+        except ValueError:
+            return False
+
+        return True
+
+
+class FsspecWriter(FileSystemWriter):
+    """
+    Basic implementation of StorageWriter using fsspec.
+
+    This implementation makes the following assumptions and simplifications:
+
+    * The checkpoint path is an empty or non-existing directory.
+    * File creation is atomic.
+
+    The checkpoint consists of one file per write request plus
+    a `.metadata` file with the serialized metadata.
+
+    """
+
+    def __init__(
+        self,
+        path: Union[str, os.PathLike],
+        single_file_per_rank: bool = True,
+        sync_files: bool = True,
+        thread_count: int = 1,
+        per_thread_copy_ahead: int = 10_000_000,
+    ) -> None:
+        """
+        Initialize the writer pointing to `path`.
+
+        Args:
+            path: directory where the checkpoint will be written to.
+            single_file_per_rank: Produce one file per rank instead of one file per tensor/blob. Defaults to True.
+            sync_files: force files to be synced to permanent storage. Defaults to True.
+            thread_count: Number of IO threads to use to write. Defaults to 1.
+            per_thread_copy_ahead: How many bytes to copy from the GPU ahead of saving them. Defaults to 10MB.
+
+        N.B. If sync_files is disabled, there's no guarantee that the checkpoint will be consistent in the case of a failure.
+        """
+        super().__init__(
+            path, single_file_per_rank, sync_files, thread_count, per_thread_copy_ahead
+        )
+        self.fs = FileSystem()
+        self.path = self.fs.init_path(path)
+
+    @classmethod
+    def validate_checkpoint_id(cls, checkpoint_id: Union[str, os.PathLike]) -> bool:
+        return FileSystem.validate_checkpoint_id(checkpoint_id)
+
+
+class FsspecReader(FileSystemReader):
+    def __init__(self, path: Union[str, os.PathLike]) -> None:
+        super().__init__(path)
+        self.fs = FileSystem()
+        self.path = self.fs.init_path(path)
+
+    @classmethod
+    def validate_checkpoint_id(cls, checkpoint_id: Union[str, os.PathLike]) -> bool:
+        return FileSystem.validate_checkpoint_id(checkpoint_id)
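+
+
+# A hedged usage sketch: the fsspec URL and ``state_dict`` below are hypothetical;
+# the entry points mirror the torch.distributed.checkpoint save/load API.
+#
+#   import torch.distributed.checkpoint as dcp
+#   dcp.save(state_dict, storage_writer=FsspecWriter("s3://bucket/checkpoint"))
+#   dcp.load(state_dict, storage_reader=FsspecReader("s3://bucket/checkpoint"))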
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/metadata.py b/MLPY/Lib/site-packages/torch/distributed/checkpoint/metadata.py
new file mode 100644
index 0000000000000000000000000000000000000000..2da2237807a90fcdab6dec485b3a9382b8707236
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/checkpoint/metadata.py
@@ -0,0 +1,170 @@
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Any, Dict, List, Optional, Sequence, Union
+
+import torch
+from torch.distributed.checkpoint.stateful import StatefulT
+
+__all__ = [
+    "ChunkStorageMetadata",
+    "TensorStorageMetadata",
+    "BytesStorageMetadata",
+    "Metadata",
+    "MetadataIndex",
+    "TensorProperties",
+]
+
+
+@dataclass
+class ChunkStorageMetadata:
+    """
+    Each chunk is expected to have the same properties as the TensorStorageMetadata
+    that includes it.
+    """
+
+    offsets: torch.Size
+    sizes: torch.Size
+
+
+class _MEM_FORMAT_ENCODING(Enum):
+    """Describe the memory format of a tensor."""
+
+    TORCH_CONTIGUOUS_FORMAT = 0
+    TORCH_CHANNELS_LAST = 1
+    TORCH_PRESERVE_FORMAT = 2
+
+
+@dataclass
+class TensorProperties:
+    """Properties used to create :class:`Tensor`"""
+
+    # Regular tensor fields
+    dtype: torch.dtype = field(default_factory=torch.get_default_dtype)
+    # This field is deprecated.
+    layout: torch.layout = field(default=torch.strided)
+    # This field is deprecated.
+    requires_grad: bool = False
+    # This field is deprecated.
+    memory_format: torch.memory_format = field(default=torch.contiguous_format)
+    # This field is deprecated.
+    pin_memory: bool = False
+
+    def __getstate__(self):
+        # torch.memory_format cannot be pickled, so encode it as an enum.
+        memory_format = self.memory_format
+        if memory_format == torch.contiguous_format:
+            mem_format_encoding = _MEM_FORMAT_ENCODING.TORCH_CONTIGUOUS_FORMAT
+        elif memory_format == torch.channels_last:
+            mem_format_encoding = _MEM_FORMAT_ENCODING.TORCH_CHANNELS_LAST
+        elif memory_format == torch.preserve_format:
+            mem_format_encoding = _MEM_FORMAT_ENCODING.TORCH_PRESERVE_FORMAT
+        else:
+            raise RuntimeError(f"Invalid torch.memory_format: {memory_format}")
+
+        return (
+            self.dtype,
+            self.layout,
+            self.requires_grad,
+            mem_format_encoding,
+            self.pin_memory,
+        )
+
+    def __setstate__(
+        self,
+        state,
+    ):
+        (
+            self.dtype,
+            self.layout,
+            self.requires_grad,
+            mem_format_encoding,
+            self.pin_memory,
+        ) = state
+
+        if mem_format_encoding == _MEM_FORMAT_ENCODING.TORCH_CONTIGUOUS_FORMAT:
+            memory_format = torch.contiguous_format
+        elif mem_format_encoding == _MEM_FORMAT_ENCODING.TORCH_CHANNELS_LAST:
+            memory_format = torch.channels_last
+        elif mem_format_encoding == _MEM_FORMAT_ENCODING.TORCH_PRESERVE_FORMAT:
+            memory_format = torch.preserve_format
+        else:
+            raise RuntimeError(
+                f"Invalid torch.memory_format encoding: {mem_format_encoding}"
+            )
+
+        self.memory_format = memory_format
+
+    @staticmethod
+    def create_from_tensor(tensor: torch.Tensor) -> "TensorProperties":
+        return TensorProperties(
+            dtype=tensor.dtype,
+            layout=tensor.layout,
+            requires_grad=tensor.requires_grad,
+            memory_format=torch.contiguous_format,
+            pin_memory=tensor.is_pinned(),
+        )
+
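+# A hedged sketch of why __getstate__/__setstate__ exist above: torch.memory_format
+# is not picklable, so it is round-tripped through _MEM_FORMAT_ENCODING.
+#
+#   import pickle
+#   props = TensorProperties(dtype=torch.float16, memory_format=torch.channels_last)
+#   restored = pickle.loads(pickle.dumps(props))
+#   assert restored.memory_format == torch.channels_last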
+
+@dataclass
+class TensorStorageMetadata:
+    properties: TensorProperties
+    size: torch.Size
+    chunks: List[ChunkStorageMetadata]
+
+
+@dataclass
+class BytesStorageMetadata:
+    pass
+
+
+STORAGE_TYPES = Union[TensorStorageMetadata, BytesStorageMetadata]
+STATE_DICT_TYPE = Dict[str, Union[StatefulT, Any]]
+
+
+@dataclass
+class Metadata:
+    """This class represents the metadata of the checkpoint."""
+
+    # Keys are the same from the `state_dict` used.
+    state_dict_metadata: Dict[str, STORAGE_TYPES]
+    # It is the responsibility of the planner and storage plugins to ensure
+    # backward compatibility of the planner_data and storage_data. DCP will
+    # also ensure the backward compatibility of the metadata in this file and
+    # the metadata of the built-in planner and storage plugins.
+    planner_data: Any = None
+    storage_data: Any = None
+
+
+@dataclass(frozen=True)
+class MetadataIndex:
+    """This class represents a lookup key for items in a state dict or Metadata."""
+
+    fqn: str
+    """Fully Qualified Name of the object"""
+
+    offset: Optional[torch.Size] = None
+    """If the object is a tensor, offset into the tensor we're looking for"""
+
+    index: Optional[int] = field(hash=False, compare=False, default=None)
+    """
+    Index hint when searching for tensor chunk to speed up lookups (optional)
+
+    A common representation of a sharded tensor is a list of chunks, so finding
+    the index in such a list requires a linear search.
+
+    When constructing an instance of MetadataIndex that points to that list,
+    one can provide the index as a hint; it will be probed before falling back
+    to the linear search, making lookups significantly faster.
+    """
+
+    def __init__(
+        self,
+        fqn: str,
+        offset: Optional[Sequence[int]] = None,
+        index: Optional[int] = None,
+    ):
+        # We must use object.__setattr__ due to frozen=True
+        object.__setattr__(self, "fqn", fqn)
+        object.__setattr__(self, "index", index)
+        if offset is not None:
+            object.__setattr__(self, "offset", torch.Size(offset))
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/optimizer.py b/MLPY/Lib/site-packages/torch/distributed/checkpoint/optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..92b969a8266dec88cc79d70e12d874cc80cef871
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/checkpoint/optimizer.py
@@ -0,0 +1,348 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+
+import dataclasses
+from typing import cast, Dict, List, Optional, Sequence, Tuple, Union
+
+import torch
+import torch.distributed as dist
+from torch._utils import _get_device_module
+from torch.distributed._shard.sharded_tensor.api import ShardedTensor
+from torch.distributed._shard.sharded_tensor.metadata import (
+    TensorProperties as ShardTensorProperties,
+)
+from torch.distributed._shard.sharded_tensor.shard import Shard
+from torch.distributed._shard.sharding_spec.chunk_sharding_spec import ChunkShardingSpec
+from torch.distributed._tensor import DTensor
+from torch.distributed.checkpoint._nested_dict import unflatten_state_dict
+from torch.distributed.checkpoint.default_planner import DefaultLoadPlanner
+from torch.distributed.checkpoint.metadata import (
+    BytesStorageMetadata,
+    ChunkStorageMetadata,
+    Metadata,
+    MetadataIndex,
+    STATE_DICT_TYPE,
+    TensorProperties,
+    TensorStorageMetadata,
+)
+from torch.distributed.checkpoint.planner import LoadPlan, LoadPlanner
+from torch.distributed.checkpoint.planner_helpers import (
+    _create_read_items,
+    create_read_items_for_chunk_list,
+)
+from torch.distributed.checkpoint.state_dict_loader import load_state_dict
+from torch.distributed.checkpoint.storage import StorageReader
+from torch.distributed.checkpoint.utils import (
+    _element_wise_add,
+    _element_wise_sub,
+    _normalize_device_info,
+)
+from torch.distributed.distributed_c10d import _get_default_group
+from torch.distributed.fsdp._shard_utils import _create_chunk_sharded_tensor
+from torch.distributed.remote_device import _remote_device
+
+STATE_DICT_2D_LAYOUT = Dict[str, Tuple[Optional[Sequence[int]], Sequence[int]]]
+
+
+# TODO: Update docstrings for optimizer.py
+__all__ = [
+    "load_sharded_optimizer_state_dict",
+]
+
+
+def _gen_rank_device(global_rank: int, device_type: str = "cuda") -> str:
+    if device_type == "cpu":
+        return "cpu"
+    device_module = _get_device_module(device_type)
+    if device_module.is_available():
+        return _normalize_device_info(
+            device_type, global_rank % device_module.device_count()
+        )
+    return "cpu"
+
+
+def _create_colwise_spec(
+    pg: Optional[dist.ProcessGroup] = None,
+) -> ChunkShardingSpec:
+    pg_device_type = dist.distributed_c10d._get_pg_default_device(pg).type
+    if pg is None:
+        placements = [
+            f"rank:{idx}/{_gen_rank_device(idx, pg_device_type)}"
+            for idx in range(dist.get_world_size())
+        ]
+    else:
+        placements = [
+            f"rank:{idx}/{_gen_rank_device(dist.get_global_rank(pg, idx), pg_device_type)}"
+            for idx in range(pg.size())
+        ]
+    return ChunkShardingSpec(
+        dim=0,
+        placements=cast(List[Union[_remote_device, str]], placements),
+    )
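+
+
+# Illustrative result (assuming two ranks on a single host with two CUDA devices):
+# _create_colwise_spec(None) yields a dim-0 ChunkShardingSpec with placements like
+# ["rank:0/cuda:0", "rank:1/cuda:1"].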
+
+
+def _is_nested_tensor(val: torch.Tensor) -> bool:
+    if type(val) is ShardedTensor:
+        if len(val.local_shards()) == 0:
+            return False
+        if type(val.local_shards()[0].tensor) is ShardedTensor:
+            return True
+        if type(val.local_shards()[0].tensor) is DTensor:
+            raise ValueError("Cannot handle DTensor nested insided ShardedTensor")
+    elif type(val) is DTensor and (
+        type(val._local_tensor) is DTensor or type(val._local_tensor) is ShardedTensor
+    ):
+        raise ValueError("Cannot handle nested DTensor")
+    return False
+
+
+def _alloc_tensor(
+    props: TensorProperties, size: Sequence[int], device_type: str = "cuda"
+) -> torch.Tensor:
+    return torch.empty(
+        size=size,
+        dtype=props.dtype,
+        layout=props.layout,
+        requires_grad=props.requires_grad,
+        pin_memory=props.pin_memory,
+        device=cast(torch.device, _get_device_module(device_type).current_device()),
+    )
+
+
+def _get_state_dict_2d_layout(
+    state_dict: STATE_DICT_TYPE,
+) -> Tuple[STATE_DICT_2D_LAYOUT, Optional[dist.ProcessGroup]]:
+    """
+    Load the right TP slice of the optimizer state.
+
+    This is not easy since the per-tensor slicing can't be inferred from checkpoint metadata.
+    We take advantage of the model state_dict producing a sliced ST to figure out what we need to load.
+    This is pretty fragile and it might be easier for FSDP to compute this info for us.
+    Returns a dictionary where the keys are the same as those of the state_dict and the
+    value is a tuple of (offset, size) for the current rank's TP slice.
+    N.B. The state_dict *MUST* come from FSDP.sharded_state_dict.
+    """
+    specs: STATE_DICT_2D_LAYOUT = {}
+    dp_pg: Optional[dist.ProcessGroup] = None
+    for key, value in state_dict.items():
+        specs[key] = (None, value.size())
+        if _is_nested_tensor(value):
+            assert (
+                len(value.local_shards()) == 1
+            ), "Cannot handle ST with multiple shards"
+            assert isinstance(
+                value, ShardedTensor
+            ), "Can only handle nested ShardedTensor"
+            shard = value.local_shards()[0]
+            specs[key] = (
+                shard.metadata.shard_offsets,
+                shard.metadata.shard_sizes,
+            )
+            dp_pg = shard.tensor._process_group  # type: ignore[attr-defined]
+
+    return (
+        specs,
+        dp_pg,
+    )
+
+
+class _ReaderWithOffset(DefaultLoadPlanner):
+    translation: Dict[MetadataIndex, MetadataIndex]
+    state_dict: STATE_DICT_TYPE
+    metadata: Metadata
+
+    def __init__(self, fqn_to_offset: Dict[str, Sequence[int]]) -> None:
+        super().__init__()
+        self.fqn_to_offset = fqn_to_offset
+        self.metadata = Metadata({})
+        self.state_dict = {}
+        self.translation = {}
+
+    def create_local_plan(self) -> LoadPlan:
+        requests = []
+        self.translation = {}
+        for fqn, obj in self.state_dict.items():
+            md = self.metadata.state_dict_metadata[fqn]
+            if not isinstance(obj, ShardedTensor):
+                requests += _create_read_items(fqn, md, obj)
+                continue
+
+            if fqn not in self.fqn_to_offset:
+                requests += _create_read_items(fqn, md, obj)
+                continue
+
+            offset = self.fqn_to_offset[fqn]
+
+            assert len(obj.local_shards()) == 1
+            original_shard = obj.local_shards()[0]
+            local_chunks = [
+                ChunkStorageMetadata(
+                    offsets=torch.Size(
+                        _element_wise_add(original_shard.metadata.shard_offsets, offset)
+                    ),
+                    sizes=torch.Size(original_shard.metadata.shard_sizes),
+                )
+            ]
+
+            reqs = create_read_items_for_chunk_list(
+                fqn, cast(TensorStorageMetadata, md), local_chunks
+            )
+            # TODO: The ReadItems will have a displaced MetadataIndex, fix it.
+            # TODO: we should change _create_sharded_read_items to have more ergonomic API
+            for ri in reqs:
+                assert ri.dest_index.offset is not None
+                original_offset = _element_wise_sub(ri.dest_index.offset, offset)
+                original_index = dataclasses.replace(
+                    ri.dest_index, offset=torch.Size(original_offset)
+                )
+                self.translation[ri.dest_index] = original_index
+
+            requests += reqs
+        return LoadPlan(requests)
+
+    def lookup_tensor(self, index: MetadataIndex) -> torch.Tensor:
+        return super().lookup_tensor(self.translation.get(index, index))
+
+
+def load_sharded_optimizer_state_dict(
+    model_state_dict: STATE_DICT_TYPE,
+    optimizer_key: str,
+    storage_reader: StorageReader,
+    planner: Optional[LoadPlanner] = None,
+) -> STATE_DICT_TYPE:
+    """
+    Load a state_dict in conjunction with FSDP sharded optimizer state.
+
+    This is the current recommended way to checkpoint FSDP.
+    >>> # xdoctest: +SKIP
+    >>> import torch.distributed.checkpoint as dist_cp
+    >>> # Save
+    >>> model: torch.nn.Module
+    >>> optim_params = model.parameters()
+    >>> optim = torch.optim.SGD(optim_params, lr=0.01)
+    >>> # Save
+    >>> with FSDP.state_dict_type(model, StateDictType.SHARDED_STATE_DICT):
+    >>>     state_dict = {
+    >>>         "optimizer": FSDP.optim_state_dict(model, optim),
+    >>>         "model": model.state_dict()
+    >>>     }
+    >>>     dist_cp.save_state_dict(
+    >>>         state_dict=state_dict,
+    >>>         storage_writer=dist_cp.FileSystemWriter("checkpoint"),
+    >>>         planner=dist_cp.DefaultSavePlanner(),
+    >>>     )
+    >>>
+    >>> # Load
+    >>> with FSDP.state_dict_type(model_tp, StateDictType.SHARDED_STATE_DICT):
+    >>>     model_state_dict = model_tp.state_dict()
+    >>>     checkpoint = {
+    >>>         "model": model_state_dict
+    >>>     }
+    >>>     dist_cp.load_state_dict(
+    >>>         state_dict=checkpoint,
+    >>>         storage_reader=dist_cp.FileSystemReader(checkpoint_file),
+    >>>         planner=dist_cp.DefaultLoadPlanner(),
+    >>>     )
+    >>>     model_tp.load_state_dict(checkpoint["model"])
+    >>>
+    >>>     optim_state = dist_cp.load_sharded_optimizer_state_dict(
+    >>>         model_state_dict,
+    >>>         optimizer_key="optimizer",
+    >>>         storage_reader=dist_cp.FileSystemReader("checkpoint"),
+    >>>     )
+    >>>
+    >>>     flattened_osd = FSDP.optim_state_dict_to_load(
+    >>>        model, optim, optim_state["optimizer"]
+    >>>     )
+    >>>
+    >>>     optim.load_state_dict(flattened_osd)
+    """
+    metadata = storage_reader.read_metadata()
+
+    layout_specs, dp_pg = _get_state_dict_2d_layout(model_state_dict)
+    dp_pg_device_type = dist.distributed_c10d._get_pg_default_device(dp_pg).type
+    device_module = _get_device_module(dp_pg_device_type)
+
+    if dp_pg is None:
+        placements = []
+        for i in range(dist.get_world_size()):
+            device_info = _normalize_device_info(
+                dp_pg_device_type, i % device_module.device_count()
+            )
+            placements.append(f"rank:{i}/{device_info}")
+        sharding_spec = ChunkShardingSpec(dim=0, placements=placements)  # type: ignore[arg-type]
+    else:
+        sharding_spec = _create_colwise_spec(dp_pg)
+
+    # Create a state_dict for optimizer state
+    state_dict: STATE_DICT_TYPE = {}
+
+    fqn_to_offset: Dict[str, Sequence[int]] = {}
+    for key, value in metadata.state_dict_metadata.items():
+        key_path = metadata.planner_data[key]
+        if key_path[0] != optimizer_key:
+            continue
+
+        if isinstance(value, BytesStorageMetadata):
+            state_dict[key] = ""
+            continue
+
+        # value: TensorStorageMetadata
+        if value.size.numel() == 1:
+            state_dict[key] = _alloc_tensor(
+                value.properties, value.size, dp_pg_device_type
+            )
+        elif dp_pg is None:
+            state_dict[key] = _create_chunk_sharded_tensor(
+                _alloc_tensor(value.properties, value.size, dp_pg_device_type),
+                rank=dist.get_rank(),
+                world_size=dist.get_world_size(),
+                num_devices_per_node=device_module.device_count(),
+                pg=_get_default_group(),
+            )
+        else:
+            spec_key = key_path[2]
+            alloc_size = layout_specs.get(spec_key, (None, value.size))[1]
+
+            properties = ShardTensorProperties(
+                dtype=value.properties.dtype,
+                layout=value.properties.layout,
+                requires_grad=value.properties.requires_grad,
+                memory_format=value.properties.memory_format,
+                pin_memory=value.properties.pin_memory,
+            )
+
+            st_md = sharding_spec.build_metadata(torch.Size(alloc_size), properties)
+            local_shards = []
+            current_rank = dist.get_rank(dp_pg)
+            for shard_md in st_md.shards_metadata:
+                if cast(_remote_device, shard_md.placement).rank() != current_rank:
+                    continue
+                local_shards.append(
+                    Shard(
+                        tensor=_alloc_tensor(
+                            value.properties, shard_md.shard_sizes, dp_pg_device_type
+                        ),
+                        metadata=shard_md,
+                    )
+                )
+
+            st = ShardedTensor._init_from_local_shards_and_global_metadata(
+                local_shards, st_md, process_group=dp_pg
+            )
+
+            if spec_key in layout_specs and layout_specs[spec_key][0] is not None:
+                fqn_to_offset[key] = cast(Sequence[int], layout_specs[spec_key][0])
+
+            state_dict[key] = st
+
+    # Whether we unflatten before or after doesn't matter
+    load_state_dict(
+        state_dict=state_dict,
+        storage_reader=storage_reader,
+        # FIXME the type of planner is wrong in load_state_dict
+        planner=_ReaderWithOffset(fqn_to_offset) if dp_pg is not None else planner,
+    )
+
+    state_dict = unflatten_state_dict(state_dict, metadata.planner_data)
+
+    return state_dict
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/planner.py b/MLPY/Lib/site-packages/torch/distributed/checkpoint/planner.py
new file mode 100644
index 0000000000000000000000000000000000000000..8992e3915b96cae3e9d3f65fafa2561c9ee521e6
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/checkpoint/planner.py
@@ -0,0 +1,403 @@
+import abc
+import io
+from dataclasses import dataclass
+from enum import auto, Enum
+from functools import reduce
+from typing import Any, List, Optional, Tuple, Union
+
+import torch
+
+from .metadata import (
+    ChunkStorageMetadata,
+    Metadata,
+    MetadataIndex,
+    STATE_DICT_TYPE,
+    TensorProperties,
+)
+
+
+__all__ = [
+    "WriteItemType",
+    "LoadItemType",
+    "TensorWriteData",
+    "WriteItem",
+    "ReadItem",
+    "SavePlan",
+    "LoadPlan",
+    "SavePlanner",
+    "LoadPlanner",
+]
+
+
+class WriteItemType(Enum):
+    TENSOR = auto()
+    SHARD = auto()
+    BYTE_IO = auto()
+
+
+class LoadItemType(Enum):
+    TENSOR = auto()
+    BYTE_IO = auto()
+
+
+@dataclass(frozen=True)
+class TensorWriteData:
+    chunk: ChunkStorageMetadata
+    properties: TensorProperties
+    size: torch.Size
+
+
+@dataclass(frozen=True)
+class WriteItem:
+    """Dataclass which holds information about what needs to be written to storage."""
+
+    index: MetadataIndex
+    type: WriteItemType
+
+    # Value present if it's a tensor write
+    tensor_data: Optional[TensorWriteData] = None
+
+    def tensor_storage_size(self) -> Optional[int]:
+        """
+        Calculates the storage size of the underlying tensor, or None if this is not a tensor write.
+
+        Returns:
+            Optional[int]: storage size in bytes of the underlying tensor, if any.
+        """
+        if self.tensor_data is None:
+            return None
+
+        numels = reduce(lambda x, y: x * y, self.tensor_data.size, 1)
+        dtype_size = torch._utils._element_size(self.tensor_data.properties.dtype)
+        return numels * dtype_size
+
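+# Worked example (hypothetical numbers): a WriteItem for a float32 tensor of size
+# (1024, 1024) reports tensor_storage_size() == 1024 * 1024 * 4 == 4_194_304 bytes.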
+
+@dataclass(frozen=True)
+class ReadItem:
+    # Read Item
+    type: LoadItemType
+
+    # Index into the state_dict
+    dest_index: MetadataIndex
+    # Offsets into destination tensor
+    dest_offsets: torch.Size
+
+    # Index into the checkpoint
+    storage_index: MetadataIndex
+    # Offset into the checkpoint data
+    storage_offsets: torch.Size
+
+    # Size of the hypercube to copy
+    lengths: torch.Size
+
+
+@dataclass(frozen=True)
+class SavePlan:
+    items: List[WriteItem]
+    storage_data: Any = None
+    planner_data: Any = None
+
+
+@dataclass
+class LoadPlan:
+    items: List[ReadItem]
+    storage_data: Any = None
+    planner_data: Any = None
+
+
+class SavePlanner(abc.ABC):
+    """
+    Abstract class defining the protocol used by save_state_dict to plan the save process.
+
+    SavePlanners are stateful objects that can be used to customize the whole save process.
+
+    SavePlanner acts as an access proxy to the state_dict, so any transformation done to it
+    will be visible to the whole process.
+
+    A planner subclass can expect the following sequence of calls during save_state_dict:
+
+    1) set_up_planner - called on all ranks.
+        Signals the start of a checkpoint save.
+
+    2) create_local_plan - called on all ranks.
+        Process the state_dict and produces a `SavePlan` that will be sent for global planning.
+
+    3) create_global_plan - called on the coordinator rank only.
+        Takes the SavePlan from all ranks and makes any global decisions.
+
+    4) finish_plan - called on all ranks.
+        This gives each rank a chance to adjust to global planning decisions.
+
+    5) resolve_data - called multiple times on each rank
+        Looks up a value in the `state_dict` for the storage layer to write.
+
+    Users are recommended to extend DefaultSavePlanner instead of this interface directly as
+    most changes can be expressed by changes in a single method.
+
+    There are 3 usual patterns of extension:
+
+    Rewriting state_dict. This is the simplest way to extend the save process as it
+    doesn't require understanding the intricacies of how SavePlan works:
+
+    >>> # xdoctest: +SKIP("undefined vars")
+    >>> class RenamePlanner(DefaultSavePlanner):
+    >>>     def set_up_planner(self, state_dict, is_coordinator):
+    >>>         # prefix all keys with ``foo_``
+    >>>         super().set_up_planner({"foo_" + k: v for k, v in state_dict.items()}, is_coordinator)
+
+    Modifying the local plan and lookup in tandem. This is useful when fine control over how data is persisted is needed:
+
+    >>> # xdoctest: +SKIP("undefined vars")
+    >>> class FP16Planner(DefaultSavePlanner):
+    >>>     def create_local_plan(self):
+    >>>         plan = super().create_local_plan()
+    >>>         for p in plan:
+    >>>             if p.tensor_data is not None:
+    >>>                 p.tensor_data.properties.dtype = torch.float16
+    >>>         return plan
+    >>>
+    >>>     def resolve_data(self, write_item):
+    >>>         item = super().resolve_data(write_item)
+    >>>         return item if write_item.type == WriteItemType.BYTE_IO else item.to(torch.float16)
+
+    Using the global planning step to make central decisions that can't be made individually by each rank
+
+    >>> # xdoctest: +SKIP("undefined vars")
+    >>> from itertools import islice
+    >>> from dataclasses import replace
+    >>> class DDPLoadBalancingPlanner(DefaultSavePlanner):
+    >>>     # This uses the default local plan behavior of having all non-sharded writes in rank 0
+    >>>     # This sample doesn't handle ShardedTensors
+    >>>     def create_global_plan(self, all_plans):
+    >>>         def chunk(it, size):
+    >>>             it = iter(it)
+    >>>             return list(iter(lambda: tuple(islice(it, size)), ()))
+    >>>         all_plans = [
+    >>>             replace(plan, items=items) for plan, items in
+    >>>                 zip(all_plans, chunk(all_plans[0].items, len(all_plans)))
+    >>>         ]
+    >>>         return super().create_global_plan(all_plans)
+
+    Finally, some planners need to save additional metadata in the checkpoint, this is
+    accomplished by having each rank contribute their data items in the local plan and
+    the global planner aggregate them:
+
+    >>> # xdoctest: +SKIP("undefined vars")
+    >>> class SaveExtraDataPlanner(DefaultSavePlanner):
+    >>>     def create_local_plan(self) -> SavePlan:
+    >>>         plan = super().create_local_plan()
+    >>>         return replace(plan, planner_data="per-rank-data")
+    >>>
+    >>>     def create_global_plan(self, all_plans: List[SavePlan]) -> Tuple[List[SavePlan], Metadata]:
+    >>>         global_plan, metadata = super().create_global_plan(all_plans)
+    >>>         merged_data = [p.planner_data for p in global_plan]
+    >>>         metadata = replace(metadata, planner_data=merged_data)
+    >>>         return global_plan, metadata
+    """
+
+    @abc.abstractmethod
+    def set_up_planner(self, state_dict: STATE_DICT_TYPE, is_coordinator: bool) -> None:
+        """
+        Initialize this planner to save ``state_dict``.
+
+        Implementations should save those values as they won't be provided later in the save process.
+
+        This is called on all ranks.
+        """
+        pass
+
+    @abc.abstractmethod
+    def create_local_plan(self) -> SavePlan:
+        """
+        Compute the save plan for the current rank.
+
+        This will be aggregated and passed to create_global_plan.
+        Planner specific data can be passed through SavePlan::planner_data.
+
+        This is called on all ranks.
+        """
+        pass
+
+    @abc.abstractmethod
+    def create_global_plan(
+        self, all_plans: List[SavePlan]
+    ) -> Tuple[List[SavePlan], Metadata]:
+        """
+        Compute the global checkpoint plan and return the local plan of each rank.
+
+        This is called on the coordinator rank only.
+        """
+        pass
+
+    @abc.abstractmethod
+    def finish_plan(self, new_plan: SavePlan) -> SavePlan:
+        """
+        Merge the plan created by `create_local_plan` and the result of `create_global_plan`.
+
+        This is called on all ranks.
+        """
+        pass
+
+    @abc.abstractmethod
+    def resolve_data(self, write_item: WriteItem) -> Union[torch.Tensor, io.BytesIO]:
+        """
+        Transform and prepare ``write_item`` from ``state_dict`` for storage, ensuring idempotency and thread-safety.
+
+        Lookup the object associated with ``write_item`` in ``state_dict`` and apply any
+        transformation (such as serialization) prior to the storage layer consuming it.
+
+        Called on each rank multiple times, at least once per WriteItem in the final SavePlan.
+
+        This method should be idempotent and thread-safe. StorageWriter implementations
+        are free to call it as frequently as they need.
+
+        Any transformation that allocates memory should be lazily done when this method
+        is called in order to reduce the peak memory required by checkpointing.
+
+        Returned tensors can be on any device and in any format, and they can be views.
+        It is the storage layer's responsibility to figure out how to save them.
+        """
+        pass
+
+
+class LoadPlanner:
+    """
+    Abstract class defining the protocol used by load_state_dict to plan the load process.
+
+    LoadPlanners are stateful objects that can be used to customize the whole load process.
+
+    LoadPlanner acts as an access proxy to the state_dict, so any transformation done to it
+    will be visible to the whole process.
+
+    A planner subclass can expect the following sequence of calls during load_state_dict:
+
+    1) set_up_planner - called on all ranks.
+        Signals the start of loading a checkpoint.
+
+    2) create_local_plan - called on all ranks.
+        Process the state_dict and produces a `LoadPlan` that will be sent for global planning.
+
+    3) create_global_plan - called on the coordinator rank only.
+        Takes the LoadPlan from all ranks and makes any global decisions.
+
+    4) load_bytes - called multiple times on each rank
+        This is called once per non-tensor value in state_dict.
+
+    5) resolve_tensor and commit_tensor - called multiple times on each rank
+        They are called in pair for each Tensor value in state_dict.
+
+    Users are recommended to extend DefaultLoadPlanner instead of this interface directly as
+    most changes can be expressed by changes in a single method.
+
+    There are two usual patterns of extension:
+
+    Rewriting state_dict. This is the simplest way to extend the load process as it
+    doesn't require understanding the intricacies of how LoadPlan works. Loading
+    happens in place, so we need to keep a reference to the original state_dict
+    in order to write back into it:
+
+    >>> # xdoctest: +SKIP("undefined vars")
+    >>> class RenamePlanner(DefaultLoadPlanner):
+    >>>     def set_up_planner(self, state_dict, metadata, is_coordinator):
+    >>>         self.original_state_dict = state_dict
+    >>>         state_dict = {"foo_" + k: v for k, v in state_dict.items()}
+    >>>
+    >>>         if self.flatten_sharded_tensors:
+    >>>             state_dict = _flatten_sharded_tensors(state_dict)
+    >>>
+    >>>         if self.flatten_state_dict:
+    >>>             state_dict, self.mappings = flatten_state_dict(state_dict)
+    >>>
+    >>>         self.state_dict = state_dict
+    >>>         self.metadata = metadata
+    >>>         self.is_coordinator = is_coordinator
+    >>>
+    >>>     def load_bytes(self, read_item, value):
+    >>>         # Remove the "foo_" prefix
+    >>>         self.original_state_dict[read_item.dest_index.fqn[4:]] = torch.load(value)
+
+
+    Modifying resolve_tensor and commit_tensor to handle load time transformation.
+
+    >>> # xdoctest: +SKIP("undefined vars")
+    >>> class MetaModelMaterialize(DefaultLoadPlanner):
+    >>>     def resolve_tensor(self, read_item):
+    >>>         tensor = super().resolve_tensor(read_item)
+    >>>         return torch.empty_like(tensor, device="cpu")
+    >>>
+    >>>     def commit_tensor(self, read_item, tensor):
+    >>>         self.state_dict[read_item.dest_index.fqn] = tensor
+    """
+
+    @abc.abstractmethod
+    def set_up_planner(
+        self,
+        state_dict: STATE_DICT_TYPE,
+        metadata: Metadata,
+        is_coordinator: bool,
+    ) -> None:
+        """
+        Initialize this instance to load data into ``state_dict``.
+
+        N.B. This is called on every rank.
+        """
+        pass
+
+    @abc.abstractmethod
+    def create_local_plan(self) -> LoadPlan:
+        """
+        Create a LoadPlan based on state_dict and metadata provided by set_up_planner.
+
+        N.B. This is called on every rank.
+        """
+        pass
+
+    @abc.abstractmethod
+    def create_global_plan(self, global_plan: List[LoadPlan]) -> List[LoadPlan]:
+        """
+        Compute the global load plan and return plans for each rank.
+
+        N.B. This is called on the coordinator rank only.
+        """
+        pass
+
+    @abc.abstractmethod
+    def finish_plan(self, central_plan: LoadPlan) -> LoadPlan:
+        """Accept the plan from coordinator and return final LoadPlan."""
+        pass
+
+    @abc.abstractmethod
+    def load_bytes(self, read_item: ReadItem, value: io.BytesIO) -> None:
+        """
+        Load the item described by ``read_item`` and ``value``.
+
+        This method is expected to modify in-place the underlying state_dict.
+
+        The contents of ``value`` are defined by the SavePlanner used to produce
+        the checkpoint being loaded.
+        """
+        pass
+
+    @abc.abstractmethod
+    def resolve_tensor(self, read_item: ReadItem) -> torch.Tensor:
+        """
+        Return the tensor described by ``read_item`` to be used by the StorageReader to load ``read_item``.
+
+        The tensor should alias with one in the underlying state_dict as the StorageReader will replace its contents.
+        If, for any reason, that's not possible, the planner can use the ``commit_tensor`` method to copy the data
+        back to the one in state_dict.
+        """
+        pass
+
+    @abc.abstractmethod
+    def commit_tensor(self, read_item: ReadItem, tensor: torch.Tensor) -> None:
+        """
+        Called once the StorageReader has finished loading data into ``tensor``.
+
+        The provided tensor is the same one returned by the call to ``resolve_tensor``.
+        This method is only needed if this LoadPlanner needs to post process ``tensor`` prior to
+        copying it back to the one in the state_dict.
+
+        The contents of tensor will follow its device synchronization model.
+        """
+        pass
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/planner_helpers.py b/MLPY/Lib/site-packages/torch/distributed/checkpoint/planner_helpers.py
new file mode 100644
index 0000000000000000000000000000000000000000..99ee60d95830fb0a76e50199dff7917213653107
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/checkpoint/planner_helpers.py
@@ -0,0 +1,325 @@
+from typing import Any, cast, List
+
+import torch
+import torch.distributed as dist
+from torch._utils import _get_device_module
+
+from torch.distributed._shard.metadata import ShardMetadata
+from torch.distributed._shard.sharded_tensor import ShardedTensor
+from torch.distributed._tensor import DTensor
+from torch.distributed._tensor._utils import compute_local_shape_and_global_offset
+
+from torch.utils._pytree import tree_map_only
+
+from .metadata import (
+    BytesStorageMetadata,
+    ChunkStorageMetadata,
+    MetadataIndex,
+    STATE_DICT_TYPE,
+    STORAGE_TYPES,
+    TensorProperties,
+    TensorStorageMetadata,
+)
+from .planner import (
+    LoadItemType,
+    ReadItem,
+    SavePlan,
+    TensorWriteData,
+    WriteItem,
+    WriteItemType,
+)
+from .resharding import (
+    _check_shard_metadata_pair_overlap,
+    _shards_get_overlap_region_wrt_saved_tensor,
+)
+
+__all__: List[str] = ["create_read_items_for_chunk_list"]
+
+
+def _create_chunk_from_tensor(tensor: torch.Tensor) -> ChunkStorageMetadata:
+    return ChunkStorageMetadata(
+        offsets=torch.Size([0] * len(tensor.size())), sizes=tensor.size()
+    )
+
+
+def _chunk_for_shard(shard_md: ShardMetadata) -> ChunkStorageMetadata:
+    return ChunkStorageMetadata(
+        offsets=torch.Size(shard_md.shard_offsets),
+        sizes=torch.Size(shard_md.shard_sizes),
+    )
+
+
+def _sharded_tensor_metadata(
+    sharded_tensor: ShardedTensor, shard_md: ShardMetadata
+) -> TensorWriteData:
+    shard_properties = sharded_tensor.metadata().tensor_properties
+
+    properties = TensorProperties(
+        dtype=shard_properties.dtype,
+        layout=shard_properties.layout,
+        requires_grad=shard_properties.requires_grad,
+        memory_format=shard_properties.memory_format,
+        pin_memory=shard_properties.pin_memory,
+    )
+
+    return TensorWriteData(
+        chunk=_chunk_for_shard(shard_md),
+        properties=properties,
+        size=sharded_tensor.metadata().size,
+    )
+
+
+def _create_write_items_for_dtensor(fqn: str, tensor: DTensor) -> WriteItem:
+    sizes, offsets = compute_local_shape_and_global_offset(
+        tensor.shape, tensor.device_mesh, tensor.placements
+    )
+    sizes, offsets = torch.Size(sizes), torch.Size(offsets)
+
+    return WriteItem(
+        index=MetadataIndex(fqn, offsets),
+        type=WriteItemType.SHARD,
+        tensor_data=TensorWriteData(
+            chunk=ChunkStorageMetadata(
+                offsets=offsets,
+                sizes=sizes,
+            ),
+            properties=TensorProperties.create_from_tensor(tensor.to_local()),
+            size=tensor.size(),
+        ),
+    )
+
+
+def _create_write_item_for_shard(
+    fqn: str, sharded_tensor: ShardedTensor, shard_md: ShardMetadata
+) -> WriteItem:
+    offsets = torch.Size(shard_md.shard_offsets)
+    return WriteItem(
+        index=MetadataIndex(fqn, offsets),
+        type=WriteItemType.SHARD,
+        tensor_data=_sharded_tensor_metadata(sharded_tensor, shard_md),
+    )
+
+
+def _create_write_item_for_tensor(fqn: str, tensor: torch.Tensor) -> WriteItem:
+    offsets = torch.Size([0] * len(tensor.size()))
+    return WriteItem(
+        index=MetadataIndex(fqn, offsets),
+        type=WriteItemType.TENSOR,
+        tensor_data=TensorWriteData(
+            chunk=ChunkStorageMetadata(offsets=offsets, sizes=tensor.size()),
+            properties=TensorProperties.create_from_tensor(tensor),
+            size=tensor.size(),
+        ),
+    )
+
+
+def _create_write_item_for_bytesio(fqn: str, bytes: Any):
+    return WriteItem(
+        index=MetadataIndex(fqn),
+        type=WriteItemType.BYTE_IO,
+    )
+
+
+def _create_read_item_for_byteio(
+    dest_index, dest_offset, storage_index, storage_offset, length
+):
+    return ReadItem(
+        type=LoadItemType.BYTE_IO,
+        dest_index=dest_index,
+        dest_offsets=torch.Size((dest_offset,)),
+        storage_index=storage_index,
+        storage_offsets=torch.Size((storage_offset,)),
+        lengths=torch.Size((length,)),
+    )
+
+
+def _create_read_item_for_tensor(
+    dest_index, dest_offsets, storage_index, storage_offsets, lengths
+):
+    return ReadItem(
+        type=LoadItemType.TENSOR,
+        dest_index=dest_index,
+        dest_offsets=torch.Size(dest_offsets),
+        storage_index=storage_index,
+        storage_offsets=torch.Size(storage_offsets),
+        lengths=torch.Size(lengths),
+    )
+
+
+def create_read_items_for_chunk_list(
+    fqn: str,
+    checkpoint_md: TensorStorageMetadata,
+    local_chunks: List[ChunkStorageMetadata],
+) -> List[ReadItem]:
+    """
+    Create a list of ``ReadItem`` based on the checkpoint and local chunks.
+
+    This applies the resharding algorithm and computes the reads needed
+    to satisfy ``local_chunks`` with a checkpoint described by ``checkpoint_md``.
+
+    Args:
+        fqn (str) : The state_dict FQN to pass to ``ReadItem``.
+        checkpoint_md (TensorStorageMetadata): metadata for a given tensor
+            from a checkpoint.
+        local_chunks (List[ChunkStorageMetadata]): Local chunks that need to be
+            loaded.
+
+    Returns:
+        A list of ``ReadItem`` that will satisfy all input chunks.
+    """
+    read_items = []
+    # this is a naive quadratic algo that can be optimized later
+    for idx, shard in enumerate(local_chunks):
+        for storage_idx, storage_md in enumerate(checkpoint_md.chunks):
+            if not _check_shard_metadata_pair_overlap(shard, storage_md):
+                continue
+
+            storage_offsets = []
+            dest_offsets = []
+            lengths = []
+            for (
+                dim,
+                offset_for_saved_tensor,
+                offset_for_current_tensor,
+                length,
+            ) in _shards_get_overlap_region_wrt_saved_tensor(
+                saved_shard=storage_md, current_shard=shard
+            ):
+                storage_offsets.append(offset_for_saved_tensor)
+                dest_offsets.append(offset_for_current_tensor)
+                lengths.append(length)
+
+            read_items.append(
+                _create_read_item_for_tensor(
+                    dest_index=MetadataIndex(fqn, shard.offsets, idx),
+                    dest_offsets=dest_offsets,
+                    storage_index=MetadataIndex(fqn, storage_md.offsets, storage_idx),
+                    storage_offsets=storage_offsets,
+                    lengths=lengths,
+                )
+            )
+    return read_items
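+
+
+# A hedged worked example (1-D shapes made up for illustration): a saved chunk
+# covering [0, 10) and a local chunk covering [5, 10) overlap on [5, 10), so a
+# single ReadItem with storage_offsets=(5,), dest_offsets=(0,), lengths=(5,) is
+# produced.
+#
+#   ckpt_md = TensorStorageMetadata(
+#       properties=TensorProperties(dtype=torch.float32),
+#       size=torch.Size([10]),
+#       chunks=[ChunkStorageMetadata(offsets=torch.Size([0]), sizes=torch.Size([10]))],
+#   )
+#   local = [ChunkStorageMetadata(offsets=torch.Size([5]), sizes=torch.Size([5]))]
+#   reads = create_read_items_for_chunk_list("weight", ckpt_md, local)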
+
+
+def _create_default_metadata_only_plan(state_dict: STATE_DICT_TYPE) -> SavePlan:
+    requests = []
+    for fqn, obj in state_dict.items():
+        if isinstance(obj, DTensor):
+            requests.append(_create_write_items_for_dtensor(fqn, obj))
+        elif isinstance(obj, ShardedTensor):
+            for shard_md in obj.metadata().shards_metadata:
+                requests.append(_create_write_item_for_shard(fqn, obj, shard_md))
+        elif isinstance(obj, torch.Tensor):
+            requests.append(_create_write_item_for_tensor(fqn, obj))
+        else:
+            requests.append(_create_write_item_for_bytesio(fqn, obj))
+    return SavePlan(requests)
+
+
+def _create_write_items(fqn: str, object: Any) -> List[WriteItem]:
+    if isinstance(object, DTensor):
+        return [_create_write_items_for_dtensor(fqn, object)]
+    elif isinstance(object, ShardedTensor):
+        return [
+            _create_write_item_for_shard(fqn, object, shard.metadata)
+            for shard in object.local_shards()
+        ]
+    elif isinstance(object, torch.Tensor):
+        return [_create_write_item_for_tensor(fqn, object)]
+    else:
+        return [_create_write_item_for_bytesio(fqn, object)]
+
+
+def _create_chunk_from_dtensor(tensor: DTensor) -> ChunkStorageMetadata:
+    sizes, offsets = compute_local_shape_and_global_offset(
+        tensor.shape, tensor.device_mesh, tensor.placements
+    )
+    sizes, offsets = torch.Size(sizes), torch.Size(offsets)
+    return ChunkStorageMetadata(
+        offsets=offsets,
+        sizes=sizes,
+    )
+
+
+def _create_chunk_list(tensor: torch.Tensor) -> List[ChunkStorageMetadata]:
+    if isinstance(tensor, DTensor):
+        local_chunks = [_create_chunk_from_dtensor(tensor)]
+    elif isinstance(tensor, ShardedTensor):
+        local_chunks = [
+            _chunk_for_shard(shard.metadata) for shard in tensor.local_shards()
+        ]
+    elif isinstance(tensor, torch.Tensor):
+        local_chunks = [_create_chunk_from_tensor(tensor)]
+    else:
+        raise ValueError(
+            "Unsupported type, expecting one of [Tensor, DTensor, ShardedTensor], "
+            f"but got {type(tensor)}"
+        )
+
+    return local_chunks
+
+
+def _create_read_items(fqn: str, md: STORAGE_TYPES, obj: Any) -> List[ReadItem]:
+    if not isinstance(md, BytesStorageMetadata):
+        try:
+            local_chunks = _create_chunk_list(obj)
+        except ValueError as ex:
+            raise ValueError(
+                f"Invalid checkpoint metadata for {fqn}, "
+                + f"expected BytesStorageMetadata but found {type(md)}",
+            ) from ex
+
+        return create_read_items_for_chunk_list(fqn, md, local_chunks)
+    else:
+        return [
+            _create_read_item_for_byteio(
+                dest_index=MetadataIndex(fqn),
+                dest_offset=0,
+                storage_index=MetadataIndex(fqn),
+                storage_offset=0,
+                length=0,
+            )
+        ]
+
+
+def _init_state_dict(state_dict: STATE_DICT_TYPE) -> None:
+    state_dict_assigned_storage = tree_map_only(
+        torch.Tensor, lambda v: _init_meta_tensor(v), state_dict
+    )
+    # The in-place version of tree_map_only, tree_map_only_, doesn't seem to work,
+    # so we temporarily update each element in the state dict with the initialized tensor.
+    for k in state_dict.keys():
+        state_dict[k] = state_dict_assigned_storage[k]
+
+
+def _init_meta_tensor(value: Any) -> Any:
+    """
+    Initializes tensor, moves it to device for torch.Tensor/DTensor on meta device.
+    """
+
+    device = getattr(value, "device", None)
+    # DCP does the initialization if it's meta tensor/DTensor.
+    if device == torch.device("meta"):
+        device_type = dist.distributed_c10d._get_pg_default_device().type
+        device = cast(torch.device, _get_device_module(device_type).current_device())
+        if isinstance(value, DTensor):
+            new_local_tensor = torch.empty_like(value.to_local(), device=device)
+            # We need to pass shape and stride explicitly, since DTensor might be
+            # sharded unevenly.
+            dtensor = DTensor.from_local(
+                new_local_tensor,
+                device_mesh=value.device_mesh,
+                placements=value.placements,
+                shape=value.size(),
+                stride=value.stride(),
+            )
+            return dtensor
+        elif isinstance(value, torch.Tensor):
+            tensor = torch.empty_like(value, device=device)
+            return tensor
+        else:
+            raise RuntimeError(
+                f"Found unsupported type {type(value)} for meta device loading."
+            )
+    else:
+        return value
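+
+
+# A hedged sketch of _init_state_dict (the state_dict is hypothetical and an
+# initialized process group is assumed): meta-device tensors are swapped for empty
+# tensors on the process group's default device; other values are left untouched.
+#
+#   sd = {"w": torch.empty(4, 4, device="meta"), "step": torch.tensor(0)}
+#   _init_state_dict(sd)  # "w" now lives on a real device, "step" is unchanged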
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/resharding.py b/MLPY/Lib/site-packages/torch/distributed/checkpoint/resharding.py
new file mode 100644
index 0000000000000000000000000000000000000000..16ce4138ce11bc1a45ebd275068a5b043297b870
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/checkpoint/resharding.py
@@ -0,0 +1,70 @@
+from typing import List, Tuple
+
+from torch.distributed.checkpoint.metadata import ChunkStorageMetadata
+
+__all__: List[str] = []
+
+
+def _check_shard_metadata_pair_overlap(
+    shard1: ChunkStorageMetadata, shard2: ChunkStorageMetadata
+):
+    """Check if two shards overlap."""
+    # For each dim of each shard, check if one shard resides on the other
+    # end of second shard with respect to that dim. As an example for a 2D
+    # shard, we would check if one shard is above or on the left of the
+    # other shard.
+    ndims = len(shard1.offsets)
+    for i in range(ndims):
+        if shard1.offsets[i] >= shard2.offsets[i] + shard2.sizes[i]:
+            return False
+        if shard2.offsets[i] >= shard1.offsets[i] + shard1.sizes[i]:
+            return False
+
+    return True
+
+
+def _shards_get_overlap_region_wrt_saved_tensor(
+    saved_shard: ChunkStorageMetadata, current_shard: ChunkStorageMetadata
+) -> List[Tuple[int, int, int, int]]:
+    """
+    Return the overlapping region between saved_shard and current_shard.
+
+    The returned list has the same number of elements as the tensor's number of dimensions.
+    For each element, we produce a tuple with the following contents:
+        (dimension, `saved_shard` offset, `current_shard` offset, length)
+
+    Offsets are relative to each shard.
+    """
+    narrows = []
+    for dim, (
+        saved_shard_offset,
+        current_shard_offset,
+        saved_shard_size,
+        current_shard_size,
+    ) in enumerate(
+        zip(
+            saved_shard.offsets,
+            current_shard.offsets,
+            saved_shard.sizes,
+            current_shard.sizes,
+        )
+    ):
+        min_range_end = min(
+            saved_shard_offset + saved_shard_size,
+            current_shard_offset + current_shard_size,
+        )
+
+        length = min_range_end - max(current_shard_offset, saved_shard_offset)
+
+        if saved_shard_offset > current_shard_offset:
+            offset_for_saved_tensor = 0
+            offset_for_current_tensor = saved_shard_offset - current_shard_offset
+        else:
+            offset_for_saved_tensor = current_shard_offset - saved_shard_offset
+            offset_for_current_tensor = 0
+
+        narrows.append(
+            (dim, offset_for_saved_tensor, offset_for_current_tensor, length)
+        )
+
+    return narrows
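+
+
+# A hedged worked example (offsets and sizes are made up; assumes ``import torch``
+# for torch.Size): a saved chunk covering [0, 8) and a current chunk covering
+# [4, 12) overlap on [4, 8), i.e. a 4-element copy starting at offset 4 of the
+# saved chunk and offset 0 of the current chunk.
+#
+#   saved = ChunkStorageMetadata(offsets=torch.Size([0]), sizes=torch.Size([8]))
+#   current = ChunkStorageMetadata(offsets=torch.Size([4]), sizes=torch.Size([8]))
+#   assert _check_shard_metadata_pair_overlap(saved, current)
+#   assert _shards_get_overlap_region_wrt_saved_tensor(saved, current) == [(0, 4, 0, 4)]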
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/state_dict.py b/MLPY/Lib/site-packages/torch/distributed/checkpoint/state_dict.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a115f5e349bcd5101eebdfe7b2ad54da1262dd2
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/checkpoint/state_dict.py
@@ -0,0 +1,1115 @@
+import contextlib
+import functools
+import gc
+from dataclasses import asdict, dataclass, field
+from itertools import chain
+from typing import (
+    Any,
+    Callable,
+    cast,
+    Dict,
+    Generator,
+    Iterable,
+    List,
+    no_type_check,
+    Optional,
+    Set,
+    Tuple,
+    Union,
+)
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from torch.distributed._shard.sharded_tensor import ShardedTensor
+from torch.distributed._state_dict_utils import (
+    _gather_state_dict,
+    _offload_state_dict_to_cpu,
+)
+from torch.distributed._tensor import DTensor
+from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
+    _CHECKPOINT_PREFIX,
+)
+from torch.distributed.fsdp import (
+    FullOptimStateDictConfig,
+    FullStateDictConfig,
+    FullyShardedDataParallel as FSDP,
+    OptimStateDictConfig,
+    ShardedOptimStateDictConfig,
+    ShardedStateDictConfig,
+    StateDictConfig,
+    StateDictType,
+)
+from torch.distributed.fsdp._common_utils import (
+    _get_module_fsdp_state_if_fully_sharded_module,
+    FSDP_WRAPPED_MODULE,
+)
+from torch.nn.modules.module import _IncompatibleKeys
+from torch.nn.parallel import DistributedDataParallel as DDP
+
+
+FLAT_PARAM = "_flat_param"
+PG = "param_groups"
+PG_PREFIX = f"{PG}."
+STATE = "state"
+STATE_PREFIX = f"{STATE}."
+PARAMS = "params"
+FQNS_T = Set[str]
+
+_patched_state_dict: Set[Callable] = set()
+
+
+PrimitiveType = Union[DTensor, ShardedTensor, torch.Tensor, int, float, str]
+ValueType = Union[
+    PrimitiveType, List[PrimitiveType], Tuple[PrimitiveType], Dict[str, "ValueType"]
+]
+DictValueType = Dict[str, ValueType]
+ListDictValueType = List[DictValueType]
+OptimizerStateType = Dict[str, Union[DictValueType, ListDictValueType]]
+
+
+@contextlib.contextmanager
+def gc_context():
+    is_enabled = gc.isenabled()
+    gc.disable()
+    try:
+        yield
+    finally:
+        # TODO: add logging for the gc details/time
+        gc.collect()
+        if is_enabled:
+            gc.enable()
+
+
+@dataclass
+class StateDictOptions:
+    """
+    This dataclass specifies how get_state_dict/set_state_dict will work.
+
+    - ``full_state_dict``: if this is set to True, all the tensors in the
+      returned state_dict will be gathered; no ShardedTensor or DTensor will
+      appear in the returned state_dict.
+
+    - ``cpu_offload``: offload all the tensors to CPU. To prevent CPU OOM, if
+      ``full_state_dict`` is also True, then only rank0 will get the
+      state_dict and all other ranks will get empty state_dicts.
+
+    - ``ignore_frozen_params``: if the value is True, the returned state_dict
+      won't contain any frozen parameters -- parameters whose ``requires_grad``
+      is False. The default value is False.
+
+    - ``keep_submodule_prefixes``: when ``submodules`` is not None, this option
+      indicates whether to keep the submodule prefixes from the state_dict keys.
+      For example, if the submodule is ``module.pretrain`` and the full FQN of
+      the parameter is ``pretrain.layer1.weight``, then when this option is
+      True, the parameter's key in the returned state_dict will be
+      ``pretrain.layer1.weight``. If the option is False, the key will be
+      ``layer1.weight``.
+      Note that if ``keep_submodule_prefixes`` is False, there may be
+      conflicting FQNs, hence there should be only one submodule in ``submodules``.
+
+    - ``strict``: the ``strict`` option when ``set_state_dict`` calls
+      model.load_state_dict().
+      The default value is True.
+    """
+
+    full_state_dict: bool = False
+    cpu_offload: bool = False
+    ignore_frozen_params: bool = False
+    keep_submodule_prefixes: bool = True
+    strict: bool = True
+
+
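+# A minimal sketch (an illustration, not part of the upstream API): the option
+# combination typically used to materialize one full, CPU-resident state_dict
+# on rank0 only, e.g. before writing a single consolidated checkpoint file.
+def _example_full_cpu_offload_options() -> StateDictOptions:
+    return StateDictOptions(full_state_dict=True, cpu_offload=True)
+
+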
+@dataclass
+class _StateDictInfo(StateDictOptions):
+    fqn_param_mapping: Dict[
+        Union[str, torch.Tensor], Union[FQNS_T, torch.Tensor]
+    ] = field(default_factory=dict)
+    all_fqns: Set[str] = field(default_factory=set)
+    submodule_prefixes: Set[str] = field(default_factory=set)
+    handle_model: bool = True
+    handle_optim: bool = True
+    fsdp_context: Callable = contextlib.nullcontext
+    fsdp_modules: List[nn.Module] = field(default_factory=list)
+
+
+def _get_fqns(
+    model: nn.Module,
+    name: str,
+    skip_ddp_prefix: bool = True,
+    skip_compiler_prefix: bool = True,
+) -> FQNS_T:
+    """
+    This API is used to convert the name of a parameter to the FQNs. For FSDP
+    without `use_orig_params`, the name of FlatParameter can be mapped to
+    multiple original parameters. As a result, the return type of this function
+    is `Set[str]`.
+
+    Args:
+        model (nn.Module): the root model.
+        name (str): the name of the parameter.
+        skip_ddp_prefix (bool): whether to skip DDP's `module` prefix.
+        skip_compiler_prefix (bool): whether to skip the compiler's `_orig_mod` prefix.
+
+    Returns:
+        The canonical FQNs based on the model traversal.
+    """
+
+    # Remove the checkpoint prefix, if it exists.
+    name = name.replace(_CHECKPOINT_PREFIX, "")
+    if "." not in name:
+        return {name}
+
+    obj_names = name.split(".")
+    fqn_obj_names = []
+    curr_obj = model
+    for i, curr_obj_name in enumerate(obj_names):
+        if isinstance(curr_obj, DDP):
+            assert curr_obj_name == "module"
+            curr_obj = curr_obj.module
+            if not skip_ddp_prefix:
+                fqn_obj_names.append(curr_obj_name)
+        elif isinstance(curr_obj, FSDP):
+            if i < len(obj_names) - 1 and obj_names[i + 1] == FLAT_PARAM:
+                prefix = ".".join(fqn_obj_names)
+                flat_param = getattr(curr_obj, FLAT_PARAM)
+                if prefix:
+                    prefix = f"{prefix}."
+                return {f"{prefix}{fqn}" for fqn in flat_param._fqns}
+            curr_obj = getattr(curr_obj, FSDP_WRAPPED_MODULE)
+            if curr_obj_name != FSDP_WRAPPED_MODULE:
+                fqn_obj_names.append(curr_obj_name)
+                curr_obj = getattr(curr_obj, curr_obj_name)
+        elif isinstance(curr_obj, torch._dynamo.eval_frame.OptimizedModule):
+            assert curr_obj_name == "_orig_mod"
+            curr_obj = curr_obj._orig_mod
+            if not skip_compiler_prefix:
+                fqn_obj_names.append(curr_obj_name)
+        else:
+            fqn_obj_names.append(curr_obj_name)
+            if curr_obj_name == nn.modules.module._EXTRA_STATE_KEY_SUFFIX:
+                if i != len(obj_names) - 1:
+                    raise RuntimeError("Expect `_extra_state` to be the last obj name")
+            else:
+                curr_obj = getattr(curr_obj, curr_obj_name)
+
+    return {".".join(fqn_obj_names).replace(_CHECKPOINT_PREFIX, "")}
+
+
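+# A minimal sketch (an illustration, not part of the upstream API): for a plain,
+# non-parallelized module the canonical FQN is simply the name reported by
+# ``named_parameters()``; the branches above strip wrapper prefixes such as
+# DDP's ``module.`` or the compiler's ``_orig_mod.`` when they are present.
+def _example_get_fqns_sketch() -> FQNS_T:
+    plain_model = nn.Sequential(nn.Linear(2, 2))
+    fqns = _get_fqns(plain_model, "0.weight")
+    assert fqns == {"0.weight"}
+    return fqns
+
+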
+class _EXTRA_STATE:
+    pass
+
+
+def _iterate_valid_model_state(model):
+    visited_modules: Set[nn.Module] = set()
+
+    def recurse(module: nn.Module, curr_fqn: str) -> Generator:
+        visited_modules.add(module)
+
+        curr_fqn = f"{curr_fqn}." if curr_fqn else ""
+        for name, submodule in module.named_children():
+            if submodule in visited_modules:
+                continue
+            new_fqn = f"{curr_fqn}{name}"
+            yield from recurse(submodule, new_fqn)
+
+        for name, obj in chain(
+            module.named_buffers(recurse=False), module.named_parameters(recurse=False)
+        ):
+            if name in module._non_persistent_buffers_set:
+                continue
+            new_fqn = f"{curr_fqn}{name}"
+            yield new_fqn, obj
+
+        if (
+            getattr(module.__class__, "get_extra_state", nn.Module.get_extra_state)
+            != nn.Module.get_extra_state
+        ):
+            new_fqn = f"{curr_fqn}{nn.modules.module._EXTRA_STATE_KEY_SUFFIX}"
+            yield new_fqn, _EXTRA_STATE()
+
+    yield from recurse(model, "")
+
+
+def _verify_options(
+    model: nn.Module,
+    optims: Tuple[torch.optim.Optimizer, ...],
+    optim_only: bool,
+    *,
+    submodules: Optional[Set[nn.Module]] = None,
+    options: Optional[StateDictOptions] = None,
+) -> _StateDictInfo:
+    """
+    Verify the model and options passed by the user and generate _StateDictInfo.
+    """
+    if optim_only and not optims:
+        raise RuntimeError(
+            "Optimizers are not passed in but optim_only is set to True."
+        )
+
+    options = options or StateDictOptions()
+
+    fqn_param_mapping: Dict[
+        Union[str, torch.Tensor], Union[Set[str], torch.Tensor]
+    ] = {}
+    all_fqns = set()
+    for name, param in _iterate_valid_model_state(model):
+        fqns = _get_fqns(model, name)
+        if not isinstance(param, _EXTRA_STATE):
+            fqn_param_mapping[param] = fqns
+        for fqn in fqns:
+            if not isinstance(param, _EXTRA_STATE):
+                fqn_param_mapping[fqn] = param
+            all_fqns.add(fqn)
+
+    submodule_prefixes = set()
+    if submodules:
+        submodules = set(submodules)
+        for name, module in model.named_modules():
+            if module not in submodules:
+                continue
+            fqns = _get_fqns(model, name)
+            assert len(fqns) == 1, "Submodule FQN should only have 1 instance"
+            for fqn in fqns:
+                submodule_prefixes.add(f"{fqn}.")
+
+    fsdp_modules = FSDP.fsdp_modules(model)
+    state_dict_config: StateDictConfig
+    optim_state_dict_config: OptimStateDictConfig
+    fsdp_context: Callable
+    if fsdp_modules:
+        # The FSDP APIs only work if at least one FSDP instance exists.
+        if options.full_state_dict:
+            state_dict_config = FullStateDictConfig(
+                offload_to_cpu=options.cpu_offload, rank0_only=options.cpu_offload
+            )
+            optim_state_dict_config = FullOptimStateDictConfig(
+                offload_to_cpu=options.cpu_offload, rank0_only=options.cpu_offload
+            )
+            state_dict_type = StateDictType.FULL_STATE_DICT
+        else:
+            state_dict_config = ShardedStateDictConfig(
+                offload_to_cpu=options.cpu_offload,
+            )
+            optim_state_dict_config = ShardedOptimStateDictConfig(
+                offload_to_cpu=options.cpu_offload,
+            )
+            state_dict_type = StateDictType.SHARDED_STATE_DICT
+
+        fsdp_context = functools.partial(
+            FSDP.state_dict_type,
+            module=model,
+            state_dict_type=state_dict_type,
+            state_dict_config=state_dict_config,
+            optim_state_dict_config=optim_state_dict_config,
+        )
+    else:
+        fsdp_context = contextlib.nullcontext
+
+    return _StateDictInfo(
+        **asdict(options),
+        fqn_param_mapping=fqn_param_mapping,
+        all_fqns=all_fqns,
+        submodule_prefixes=submodule_prefixes,
+        fsdp_context=fsdp_context,
+        fsdp_modules=cast(List[nn.Module], fsdp_modules),
+        handle_model=not optim_only,
+        handle_optim=(len(optims) > 0),
+    )
+
+
+def _verify_state_dict(
+    model_state_dict: Dict[str, ValueType],
+    optim_state_dict: OptimizerStateType,
+    info: _StateDictInfo,
+) -> None:
+    for module in info.fsdp_modules:
+        fsdp_state = _get_module_fsdp_state_if_fully_sharded_module(module)
+        assert fsdp_state is not None, "Expected an fsdp_state for an FSDP module."
+
+    # Verify if the model_state_dict and optim_state_dict are valid. This API
+    # should give the users an explicit error message to debug or report.
+    if (
+        info.handle_model
+        and not model_state_dict
+        and not info.submodule_prefixes
+        and not info.ignore_frozen_params
+        and not (info.cpu_offload and info.full_state_dict)
+        and info.strict
+    ):
+        raise RuntimeError(
+            "The option indicates that model state_dict is required to save "
+            "or load, but model state_dict is empty."
+            f"rank = {dist.get_rank()=}."
+        )
+
+    if info.handle_optim:
+        if not (optim_state_dict and optim_state_dict[STATE]) and not (
+            info.cpu_offload and info.full_state_dict
+        ):
+            raise RuntimeError(
+                "The option indicates that model state_dict is required to save, "
+                f"or load but optim state_dict is empty. {optim_state_dict}"
+            )
+
+    for key in model_state_dict.keys():
+        if FLAT_PARAM in key:
+            raise RuntimeError(
+                f"{key} contains {FLAT_PARAM}. This can happen if the model "
+                "is not the root module."
+            )
+
+
+def _state_dict_fn(obj: Union[nn.Module, torch.optim.Optimizer], api: str) -> Callable:
+    call = getattr(obj, api)
+    if call in _patched_state_dict:
+        call = functools.partial(getattr(obj.__class__, api), self=obj)
+    return call
+
+
+def _get_model_state_dict(
+    model: nn.Module, info: _StateDictInfo
+) -> Dict[str, ValueType]:
+    if not info.handle_model:
+        return {}
+
+    with info.fsdp_context():
+        state_dict = _state_dict_fn(model, "state_dict")()
+
+    for key in list(state_dict.keys()):
+        fqns = _get_fqns(model, key)
+        assert len(fqns) == 1
+        fqn = next(iter(fqns))
+        if fqn != key:
+            # As we only support FSDP, DDP, and TP, the only cases are
+            # wrapper-based DDP and compiler. Verify if the assumption
+            # is correct.
+            def verify(key, fqn) -> bool:
+                if len(fqn) >= len(key):
+                    return False
+                fqn_split = fqn.split(".")
+                key_split = key.split(".")
+                fqn_idx = 0
+                for key_idx, key_name in enumerate(key_split):
+                    if key_name == fqn_split[fqn_idx]:
+                        fqn_idx += 1
+                        if fqn_idx == len(fqn_split):
+                            return key_idx == len(key_split) - 1
+                    elif key_name in ("module", "_orig_mod"):
+                        continue
+                    else:
+                        return False
+                return True
+
+            if not verify(key, fqn):
+                raise RuntimeError(f"An unexpected key, {key}, exists. FQN is {fqn}")
+            state_dict[fqn] = state_dict.pop(key)
+
+    if info.submodule_prefixes:
+        new_state_dict: Dict[str, ValueType] = {}
+        # TODO: make this faster.
+        for fqn in state_dict.keys():
+            for prefix in info.submodule_prefixes:
+                if not fqn.startswith(prefix):
+                    continue
+                if info.keep_submodule_prefixes:
+                    new_state_dict[fqn] = state_dict[fqn]
+                else:
+                    new_fqn = fqn[len(prefix) :]
+                    new_state_dict[new_fqn] = state_dict[fqn]
+        state_dict = new_state_dict
+
+    if info.ignore_frozen_params:
+        for key, param in model.named_parameters():
+            if param.requires_grad:
+                continue
+            fqns = _get_fqns(model, key)
+            for fqn in fqns:
+                state_dict.pop(fqn)
+
+    for key, p in list(state_dict.items()):
+        if torch.is_tensor(p) and p.is_meta:
+            state_dict.pop(key)
+
+    if info.full_state_dict:
+        ranks_only = tuple() if not info.cpu_offload else (0,)
+        return _gather_state_dict(
+            state_dict, cpu_offload=info.cpu_offload, ranks_only=ranks_only
+        )
+    elif info.cpu_offload:
+        return _offload_state_dict_to_cpu(state_dict)
+    else:
+        return state_dict
+
+
+def _load_model_state_dict(
+    model: nn.Module,
+    state_dict: Dict[str, ValueType],
+    info: _StateDictInfo,
+) -> _IncompatibleKeys:
+    if not info.handle_model or not state_dict:
+        return _IncompatibleKeys({}, {})
+
+    for key, _ in _iterate_valid_model_state(model):
+        fqns = _get_fqns(model, key)
+        fqns_with_prefix = _get_fqns(
+            model, key, skip_ddp_prefix=False, skip_compiler_prefix=False
+        )
+        for fqn, fqn_with_prefix in zip(fqns, fqns_with_prefix):
+            if fqn != fqn_with_prefix:
+                state_dict[fqn_with_prefix] = state_dict.pop(fqn)
+
+    with info.fsdp_context():
+        return cast(
+            _IncompatibleKeys,
+            _state_dict_fn(model, "load_state_dict")(
+                state_dict=state_dict, strict=info.strict
+            ),
+        )
+
+
+def _init_optim_state(optim: torch.optim.Optimizer) -> None:
+    """
+    Initialize optim states by calling the step() with zero grads.
+    """
+    if optim.state:
+        # The optimizer state is initialized.
+        return
+
+    for param_group in optim.param_groups:
+        for param in param_group[PARAMS]:
+            if param.grad is not None:
+                raise RuntimeError(
+                    "state_dict can only be used if the optimizer "
+                    "states are initialized (usually after one step() with "
+                    "gradients) or gradients are None. For the later case, "
+                    "state_dict will fake the gradients as zero "
+                    "to initialize the optimizer states. However, the "
+                    "gradients are not None."
+                )
+            if param.requires_grad:
+                param.grad = torch.zeros_like(param)
+    optim.step(closure=None)
+    optim.zero_grad(set_to_none=True)
+
+
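+# A minimal local sketch (an illustration, not part of the upstream API): a fresh
+# optimizer has an empty ``state`` until its first step, so ``state_dict()`` would
+# have nothing to report; ``_init_optim_state`` fakes zero gradients and runs one
+# step purely to materialize the per-parameter state.
+def _example_init_optim_state_sketch() -> None:
+    module = nn.Linear(4, 4)
+    adam = torch.optim.Adam(module.parameters(), lr=1e-3)
+    assert not adam.state
+    _init_optim_state(adam)
+    assert adam.state  # exp_avg/exp_avg_sq entries now exist for every parameter
+
+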
+def _get_optim_state_dict(
+    model: nn.Module,
+    optimizers: Tuple[torch.optim.Optimizer, ...],
+    info: _StateDictInfo,
+) -> OptimizerStateType:
+    if not info.handle_optim:
+        return {}
+
+    optim_state_dict: OptimizerStateType = {STATE: {}, PG: []}
+    for optim in optimizers:
+        _init_optim_state(optim)
+        osd = _state_dict_fn(optim, "state_dict")()
+        if info.fsdp_modules:
+            with info.fsdp_context():
+                osd = FSDP.optim_state_dict(model, optim, osd)
+
+            # We need to specially handle FlatParameter FSDP as
+            # FlatParameter FSDP converts the FQNs.
+            # There are no easy ways to do this conversion systematically.
+            # We can only use a string replacement without a correctness check.
+            if not osd:
+                continue
+            for k in list(osd[STATE].keys()):
+                if "_orig_mod" in k:
+                    osd[STATE][k.replace("_orig_mod.", "")] = osd[STATE].pop(k)
+            for g in osd[PG]:
+                params = [k.replace("_orig_mod.", "") for k in g[PARAMS]]
+                g[PARAMS] = params
+        else:
+            params = list(chain.from_iterable(g[PARAMS] for g in optim.param_groups))
+            param_pid_mapping = dict(zip(params, range(len(params))))
+            fqn_pid_mapping = {}
+            for key, param in model.named_parameters():
+                fqns = _get_fqns(model, key)
+                assert len(fqns) == 1
+                fqn = next(iter(fqns))
+                if param not in param_pid_mapping:
+                    continue
+                pid = param_pid_mapping[param]
+                fqn_pid_mapping[fqn] = pid
+                fqn_pid_mapping[pid] = fqn
+
+            for key in list(osd[STATE].keys()):
+                fqn = fqn_pid_mapping[key]
+                osd[STATE][fqn] = osd[STATE].pop(key)
+
+            for group in osd[PG]:
+                group[PARAMS] = [fqn_pid_mapping[pid] for pid in group[PARAMS]]
+
+        if not osd:
+            continue
+
+        cast(DictValueType, optim_state_dict[STATE]).update(osd[STATE])
+        cast(ListDictValueType, optim_state_dict[PG]).extend(osd[PG])
+
+    if info.full_state_dict:
+        ranks_only = tuple() if not info.cpu_offload else (0,)
+        return _gather_state_dict(
+            optim_state_dict, cpu_offload=info.cpu_offload, ranks_only=ranks_only
+        )
+    elif info.cpu_offload:
+        return _offload_state_dict_to_cpu(optim_state_dict)
+    else:
+        return optim_state_dict
+
+
+def _split_optim_state_dict(
+    model: nn.Module,
+    optim: torch.optim.Optimizer,
+    optim_state_dict: OptimizerStateType,
+    info: _StateDictInfo,
+) -> OptimizerStateType:
+    """
+    Extract the corresponding optim state_dict from ``optim_state_dict`` for
+    ``optim`` and return the result optim state_dict.
+
+    Args:
+        model (nn.Module): the root model.
+        optim (torch.optim.Optimizer): the optimizer.
+        optim_state_dict (Dict[str, ValueType]): the superset optim state_dict that
+            contains the optim state_dict of ``optim``.
+        info (_StateDictInfo): state dict information.
+
+    Returns:
+        The optim state_dict of ``optim``.
+    """
+
+    state: DictValueType = {}
+    pg_state: ListDictValueType = []
+    return_osd: OptimizerStateType = {STATE: state, PG: pg_state}
+    pg_mapping: Dict[int, int] = {}
+
+    for param_group in optim.param_groups:
+        pg_state.append({PARAMS: []})
+        for param in param_group[PARAMS]:
+            for fqn in info.fqn_param_mapping[param]:
+                params = pg_state[-1][PARAMS]
+                assert isinstance(params, list)
+                params.append(fqn)
+                if param.requires_grad:
+                    state[fqn] = cast(DictValueType, optim_state_dict[STATE])[fqn]
+                for loaded_param_group in cast(ListDictValueType, optim_state_dict[PG]):
+                    params = loaded_param_group[PARAMS]
+                    assert isinstance(params, list)
+                    if fqn in params:
+                        pg_mapping[id(loaded_param_group)] = len(return_osd[PG]) - 1
+
+    for param_group in cast(ListDictValueType, optim_state_dict[PG]):
+        idx = pg_mapping.get(id(param_group), -1)
+        if idx == -1:
+            continue
+        for key, value in param_group.items():
+            if key == PARAMS:
+                continue
+            # TODO: check if value is the same if exists.
+            pg_state[idx][key] = value
+
+    return return_osd
+
+
+def _load_optim_state_dict(
+    model: nn.Module,
+    optimizers: Tuple[torch.optim.Optimizer, ...],
+    state_dict: OptimizerStateType,
+    info: _StateDictInfo,
+) -> None:
+    if not info.handle_optim:
+        return
+
+    for optim in optimizers:
+        optim_state_dict = _split_optim_state_dict(model, optim, state_dict, info)
+        if info.fsdp_modules:
+            # We need to specially handle FlatParameter FSDP as
+            # FlatParameter FSDP converts the FQNs.
+            for original_fqn, _ in model.named_parameters():
+                fqns = _get_fqns(model, original_fqn)
+                fqns_with_compiler = _get_fqns(
+                    model, original_fqn, skip_compiler_prefix=False
+                )
+                if fqns == fqns_with_compiler:
+                    continue
+
+                assert len(fqns) == 1
+                fqn = fqns.pop()
+                fqn_with_compiler = fqns_with_compiler.pop()
+                for g in optim_state_dict[PG]:
+                    val = cast(Dict[str, Any], g)
+                    params = [
+                        key.replace(fqn, fqn_with_compiler) for key in val[PARAMS]
+                    ]
+                    val[PARAMS] = params
+                osd_state = cast(DictValueType, optim_state_dict[STATE])
+                for k in list(osd_state.keys()):
+                    if fqn in k:
+                        osd_state[k.replace(fqn, fqn_with_compiler)] = osd_state.pop(k)
+
+            with info.fsdp_context():
+                optim_state_dict = FSDP.optim_state_dict_to_load(
+                    model, optim, optim_state_dict
+                )
+
+        # Note that we do not have to convert the FQN back to param id here if
+        # order in optim.param_groups[idx][PARAMS] is the same as the one in
+        # optim_state_dict[PG][idx][PARAMS].
+        _init_optim_state(optim)
+        _state_dict_fn(optim, "load_state_dict")(state_dict=optim_state_dict)
+
+
+def get_model_state_dict(
+    model: nn.Module,
+    *,
+    submodules: Optional[Set[nn.Module]] = None,
+    options: Optional[StateDictOptions] = None,
+) -> Dict[str, ValueType]:
+    """
+    Return the model state_dict of ``model``.
+
+    See ``get_state_dict`` for the detail usage.
+
+    Args:
+        model (nn.Module): the root nn.Module of the model.
+        submodules: Optional[Set[nn.Module]]: only return the model parameters
+            that belong to the submodules.
+        options (StateDictOptions): the options to control how
+            model state_dict and optimizer state_dict should be returned. See
+            `StateDictOptions` for the details.
+
+    Returns:
+        The state_dict for ``model``.
+
+    :rtype: typing.Dict[str, ValueType]
+    """
+    with gc_context():
+        info = _verify_options(
+            model,
+            tuple(),
+            optim_only=False,
+            submodules=submodules,
+            options=options,
+        )
+        model_state_dict = _get_model_state_dict(model, info)
+        _verify_state_dict(model_state_dict, {}, info)
+        return model_state_dict
+
+
+def get_optimizer_state_dict(
+    model: nn.Module,
+    optimizers: Union[torch.optim.Optimizer, Iterable[torch.optim.Optimizer]],
+    *,
+    submodules: Optional[Set[nn.Module]] = None,
+    options: Optional[StateDictOptions] = None,
+) -> OptimizerStateType:
+    """
+    Return the combined state_dict for optimizers.
+
+    See ``get_state_dict`` for the detail usage.
+
+    Args:
+        model (nn.Module): the root nn.Module of the model.
+        optimizers (Union[None, Optimizer, Iterable[Optimizer]]):
+            The optimizers that are used to optimize ``model``.
+        submodules: Optional[Set[nn.Module]]: only return the model parameters
+            that belong to the submodules.
+        options (StateDictOptions): the options to control how
+            model state_dict and optimizer state_dict should be returned. See
+            `StateDictOptions` for the details.
+
+    Returns:
+        The state_dict for ``optimizers``.
+
+    :rtype: OptimizerStateType
+    """
+    with gc_context():
+        optimizers = (
+            (optimizers,)
+            if isinstance(optimizers, torch.optim.Optimizer)
+            else tuple(optimizers)
+        )
+        info = _verify_options(
+            model,
+            optimizers,
+            optim_only=True,
+            submodules=submodules,
+            options=options,
+        )
+        optim_state_dict = _get_optim_state_dict(model, optimizers, info)
+        _verify_state_dict({}, optim_state_dict, info)
+        return optim_state_dict
+
+
+def get_state_dict(
+    model: nn.Module,
+    optimizers: Union[torch.optim.Optimizer, Iterable[torch.optim.Optimizer]],
+    *,
+    submodules: Optional[Set[nn.Module]] = None,
+    options: Optional[StateDictOptions] = None,
+) -> Tuple[Dict[str, ValueType], OptimizerStateType]:
+    """
+    Return the model state_dict and optimizers state_dict.
+
+    ``get_state_dict`` can process any module that is parallelized by PyTorch
+    FSDP/fully_shard, DDP/replicate, tensor_parallel/parallelize_module, and any
+    combination of these parallelisms. The main functions of ``get_state_dict``
+    are: 1.) returning a model and optimizer state_dict that can be resharded
+    with a different number of trainers and/or different parallelisms.
+    2.) hiding the parallelism-specific state_dict APIs. Users don't have to call
+    these APIs.
+    3.) sanity checking the result state_dict.
+
+    The keys of the result state dictionary are the canonical FQNs (Fully
+    Qualified Names).  A canonical FQN refers to the FQN based on a parameter's
+    position in an nn.Module hierarchy. More specifically, a canonical FQN to a
+    parameter is the FQN returned by ``module.named_parameters()`` or
+    ``module.named_buffers()`` when the module is not distributed by any
+    parallelisms. Since the optimizer internally uses parameter IDs to represent
+    a parameter, there will be a conversion from the parameter IDs to the
+    canonical FQNs when calling this API.
+
+    ``get_state_dict`` can also process a module that is not parallelized. In
+    such a case, ``get_state_dict`` only performs one function -- converting the
+    optimizer parameter IDs to the canonical FQNs.
+
+    Example:
+        >>> # xdoctest: +SKIP
+        >>> import copy
+        >>> import torch
+        >>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+        >>> from torch.nn.parallel import DistributedDataParallel as DDP
+        >>> from torch.distributed.checkpoint.state_dict import get_state_dict
+
+        >>> fsdp_model = FSDP(copy.deepcopy(model))
+        >>> fsdp_optim = torch.optim.Adam(fsdp_model.parameters(), lr=1e-3)
+        >>> ddp_model = DDP(copy.deepcopy(model))
+        >>> ddp_optim = torch.optim.Adam(ddp_model.parameters(), lr=1e-3)
+
+
+        >>> ddp_state_dict, ddp_optim_state_dict = get_state_dict(ddp_model, ddp_optim)
+        >>> fsdp_state_dict, fsdp_optim_state_dict = get_state_dict(fsdp_model, fsdp_optim)
+
+        >>> # if we simply call ddp_model.state_dict() and fsdp_model.state_dict(),
+        >>> # the asserts will fail.
+        >>> assert ddp_state_dict == fsdp_state_dict
+        >>> assert ddp_optim_state_dict == fsdp_optim_state_dict
+
+
+    Args:
+        model (nn.Module): the root nn.Module of the model.
+        optimizers (Union[None, Optimizer, Iterable[Optimizer]]):
+            The optimizers that are used to optimize ``model``.
+        submodules: Optional[Set[nn.Module]]: only return the model parameters
+            that belong to the submodules.
+        options (StateDictOptions): the options to control how
+            model state_dict and optimizer state_dict should be returned. See
+            `StateDictOptions` for the details.
+
+    Returns:
+        ``Tuple`` that contain model state_dict and optimizer state_dict.
+
+    :rtype: typing.Tuple[typing.Dict[str, ValueType], OptimizerStateType]
+    """
+
+    with gc_context():
+        optimizers = (
+            (optimizers,)
+            if isinstance(optimizers, torch.optim.Optimizer)
+            else tuple(optimizers)
+        )
+        info = _verify_options(
+            model,
+            optimizers,
+            optim_only=False,
+            submodules=submodules,
+            options=options,
+        )
+        model_state_dict = _get_model_state_dict(model, info)
+        optim_state_dict = _get_optim_state_dict(model, optimizers, info)
+        _verify_state_dict(model_state_dict, optim_state_dict, info)
+        return model_state_dict, optim_state_dict
+
+
+def _unflatten_model_state_dict(
+    model: nn.Module,
+    state_dict: Union[Dict[nn.Module, Dict[str, ValueType]], Dict[str, ValueType]],
+) -> Dict[str, ValueType]:
+    if not state_dict:
+        return {}
+
+    if isinstance(next(iter(state_dict.keys())), nn.Module):
+        cast_state_dict = cast(Dict[nn.Module, Dict[str, ValueType]], state_dict)
+        new_state_dict: Dict[str, ValueType] = {}
+        for submodule, sub_state_dict in cast_state_dict.items():
+            for name, m in model.named_modules():
+                if m != submodule:
+                    continue
+
+                fqns = _get_fqns(model, name)
+                assert len(fqns) == 1, "FQNs for a submodule should only have 1 element"
+                prefix = f"{next(iter(fqns))}."
+                new_state_dict.update(
+                    {prefix + subfqn: value for subfqn, value in sub_state_dict.items()}
+                )
+        return new_state_dict
+    else:
+        return cast(Dict[str, ValueType], state_dict)
+
+
+def set_model_state_dict(
+    model: nn.Module,
+    model_state_dict: Dict[str, ValueType],
+    *,
+    options: Optional[StateDictOptions] = None,
+) -> _IncompatibleKeys:
+    """Load the model state_dict.
+
+    The counterpart of ``get_model_state_dict`` to set the state_dict to the
+    model. See ``set_state_dict`` for the detail usage.
+
+    Args:
+        model (nn.Module): the root nn.Module of the model.
+        model_state_dict: (Dict[str, ValueType]):
+           the model state_dict to load. If the key of the ``model_state_dict``
+           is nn.Module, the key is a submodule of ``model`` and the value should
+           be the state_dict of the submodule. When loading the state_dict,
+           the prefix of the submodule will be appended to the state_dict.
+        options (StateDictOptions): the options to control how
+            model state_dict and optimizer state_dict should be loaded. See
+            `StateDictOptions` for the details.
+
+    Returns:
+        ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
+            * **missing_keys** is a list of str containing the missing keys
+            * **unexpected_keys** is a list of str containing the unexpected keys
+
+    :type model_state_dict: typing.Dict[str, ValueType]
+    """
+    model_state_dict: Dict[str, ValueType] = _unflatten_model_state_dict(
+        model, model_state_dict
+    )
+    with gc_context():
+        info = _verify_options(model, tuple(), optim_only=False, options=options)
+
+        _verify_state_dict(model_state_dict, {}, info)
+        return _load_model_state_dict(model, model_state_dict, info)
+
+
+def set_optimizer_state_dict(
+    model: nn.Module,
+    optimizers: Union[torch.optim.Optimizer, Iterable[torch.optim.Optimizer]],
+    *,
+    optim_state_dict: OptimizerStateType,
+    options: Optional[StateDictOptions] = None,
+) -> None:
+    """Load the optimizers state_dict.
+
+    The counterpart of ``get_optimizer_state_dict`` to set the state_dict to the
+    optimizers. See ``set_state_dict`` for the detail usage.
+
+    Args:
+        model (nn.Module): the root nn.Module of the model.
+        optimizers (Union[Optimizer, Iterable[Optimizer]]):
+            The optimizers that are used to optimize ``model``.
+        optim_state_dict: OptimizerStateType:
+            the optimizer state_dict to load.
+        options (StateDictOptions): the options to control how
+            model state_dict and optimizer state_dict should be loaded. See
+            `StateDictOptions` for the details.
+
+    Returns:
+        None
+
+    :type optim_state_dict: typing.OptimizerStateType
+    """
+    with gc_context():
+        optimizers = (
+            (optimizers,)
+            if isinstance(optimizers, torch.optim.Optimizer)
+            else tuple(optimizers)
+        )
+        info = _verify_options(model, optimizers, optim_only=True, options=options)
+
+        _verify_state_dict({}, optim_state_dict, info)
+        _load_optim_state_dict(model, optimizers, optim_state_dict, info)
+
+
+def set_state_dict(
+    model: nn.Module,
+    optimizers: Union[torch.optim.Optimizer, Iterable[torch.optim.Optimizer]],
+    *,
+    model_state_dict: Dict[str, ValueType],
+    optim_state_dict: OptimizerStateType,
+    options: Optional[StateDictOptions] = None,
+) -> _IncompatibleKeys:
+    """Load the model state_dict and optimizers state_dict.
+
+    The counterpart of ``get_state_dict`` to set the state_dict to the model and
+    optimizers.  The given ``model_state_dict`` and ``optim_state_dict`` do not
+    have to be returned by ``get_state_dict`` but must meet the following
+    requirements: 1) all FQNs are canonical FQNs as defined in ``get_state_dict``,
+    2) if a tensor is sharded, it must be either a ShardedTensor or DTensor,
+    3) optimizer state_dict cannot contain the parameter IDs; the keys should be
+    the canonical FQNs.
+
+    Args:
+        model (nn.Module): the root nn.Module of the model.
+        optimizers (Union[Optimizer, Iterable[Optimizer]]):
+            The optimizers that are used to optimize ``model``.
+        model_state_dict: (Union[Dict[nn.Module, Dict[str, ValueType]], Dict[str, ValueType]]):
+           the model state_dict to load. If the key of the ``model_state_dict``
+           is nn.Module, the key is a submodule of ``model`` and the value should
+           be the state_dict of the submodule. When loading the state_dict,
+           the prefix of the submodule will be appended to the state_dict.
+        optim_state_dict: OptimizerStateType:
+            the optimizer state_dict to load.
+        options (StateDictOptions): the options to control how
+            model state_dict and optimizer state_dict should be loaded. See
+            `StateDictOptions` for the details.
+
+    Returns:
+        ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
+            * **missing_keys** is a list of str containing the missing keys of the model state_dict.
+            * **unexpected_keys** is a list of str containing the unexpected keys of the model state_dict.
+
+    :type model_state_dict: typing.Dict[str, ValueType]
+    :type optim_state_dict: typing.OptimizerStateType
+    """
+
+    model_state_dict: Dict[str, ValueType] = _unflatten_model_state_dict(
+        model, model_state_dict
+    )
+    with gc_context():
+        optimizers = (
+            (optimizers,)
+            if isinstance(optimizers, torch.optim.Optimizer)
+            else tuple(optimizers)
+        )
+        info = _verify_options(
+            model, optimizers, optim_only=not model_state_dict, options=options
+        )
+
+        _verify_state_dict(model_state_dict, optim_state_dict, info)
+        _load_optim_state_dict(model, optimizers, optim_state_dict, info)
+        return _load_model_state_dict(model, model_state_dict, info)
+
+
+# TODO: correct the state_dict function signature.
+# TODO: this API is not yet fully tested. Make it private
+@no_type_check
+def _patch_model_state_dict(
+    model: nn.Module,
+    *,
+    options: Optional[StateDictOptions] = None,
+) -> None:
+    """Patch the ``state_dict`` and ``load_state_dict`` attributes of ``model``.
+
+    Patch the ``state_dict`` and ``load_state_dict`` attributes of ``model`` to
+    be a partial function to call ``get_state_dict`` and ``set_state_dict``.
+
+    Example:
+        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+        from torch.distributed.checkpoint.state_dict import _patch_model_state_dict
+
+        model = FSDP(model)
+        _patch_model_state_dict(model)
+
+    Args:
+        model (nn.Module): the root nn.Module of the model.
+        options (StateDictOptions): the options to control how
+            model state_dict and optimizer state_dict should be loaded. See
+            `StateDictOptions` for the details.
+    Returns:
+        None
+    """
+
+    _state_dict_call = functools.partial(
+        get_model_state_dict,
+        model=model,
+        options=options,
+    )
+
+    def state_dict_call():
+        return _state_dict_call()
+
+    model.state_dict = state_dict_call
+
+    _load_state_dict_call = functools.partial(
+        set_model_state_dict,
+        model=model,
+        options=options,
+    )
+
+    def load_state_dict_call(state_dict: Dict[str, Any]):
+        _load_state_dict_call(model_state_dict=state_dict)
+
+    model.load_state_dict = load_state_dict_call
+
+    _patched_state_dict.add(state_dict_call)
+    _patched_state_dict.add(load_state_dict_call)
+
+
+# TODO: correct the load_state_dict function signature.
+# TODO: this API is not yet fully tested. Make it private
+@no_type_check
+def _patch_optimizer_state_dict(
+    model: nn.Module,
+    *,
+    optimizers: Tuple[torch.optim.Optimizer, ...],
+    options: Optional[StateDictOptions] = None,
+) -> None:
+    """Patch the ``state_dict`` and ``load_state_dict`` attributes of ``optimizers``.
+
+    Patch the ``state_dict`` and ``load_state_dict`` attributes of ``optimizers`` to
+    be a partial function to call ``get_state_dict`` and ``set_state_dict``.
+
+    Note that if there are multiple optimizers, all of the optimizers will be patched.
+    So users only need to call one of the state_dict() to get the full result.
+
+    Example:
+        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+        from torch.distributed.checkpoint.state_dict import _patch_optimizer_state_dict
+
+        model = FSDP(model)
+        optim = torch.optim.Adam(model.parameters(), lr=1e-3)
+        _patch_optimizer_state_dict(model, optimizers=(optim,))
+
+    Args:
+        model (nn.Module): the root nn.Module of the model.
+        optimizers (Tuple[torch.optim.Optimizer, ...]): the optimizers whose
+            ``state_dict`` and ``load_state_dict`` attributes will be patched.
+        options (StateDictOptions): the options to control how
+            model state_dict and optimizer state_dict should be loaded. See
+            `StateDictOptions` for the details.
+    Returns:
+        None
+    """
+
+    _state_dict_call = functools.partial(
+        get_optimizer_state_dict,
+        model=model,
+        optimizers=optimizers,
+        options=options,
+    )
+
+    def state_dict_call():
+        return _state_dict_call()
+
+    _load_state_dict_call = functools.partial(
+        set_optimizer_state_dict,
+        model=model,
+        optimizers=optimizers,
+        options=options,
+    )
+
+    def load_state_dict_call(state_dict: Dict[str, Any]):
+        _load_state_dict_call(optim_state_dict=state_dict)
+
+    _patched_state_dict.add(state_dict_call)
+    _patched_state_dict.add(load_state_dict_call)
+    optimizers = (
+        (optimizers,)
+        if isinstance(optimizers, torch.optim.Optimizer)
+        else tuple(optimizers)
+    )
+    for optim in optimizers:
+        optim.state_dict = state_dict_call
+        optim.load_state_dict = load_state_dict_call
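+
+
+if __name__ == "__main__":
+    # A minimal single-process sketch (illustrative, not part of the upstream
+    # module): for a non-parallelized model, ``get_state_dict`` returns FQN-keyed
+    # model and optimizer state_dicts, and ``set_state_dict`` loads them back.
+    # The model, optimizer, and names below are assumptions for the example.
+    example_model = nn.Linear(8, 8)
+    example_optim = torch.optim.Adam(example_model.parameters(), lr=1e-3)
+    msd, osd = get_state_dict(example_model, example_optim)
+    assert set(msd.keys()) == {"weight", "bias"}
+    set_state_dict(
+        example_model, example_optim, model_state_dict=msd, optim_state_dict=osd
+    )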
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/state_dict_loader.py b/MLPY/Lib/site-packages/torch/distributed/checkpoint/state_dict_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..c28738bc7772012d1ad2f24b484ccfe1c6b69381
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/checkpoint/state_dict_loader.py
@@ -0,0 +1,218 @@
+import os
+import warnings
+from typing import Any, cast, Dict, Optional, Union
+
+import torch
+import torch.distributed as dist
+from torch.distributed.checkpoint.stateful import Stateful
+
+from ._storage_utils import _storage_setup
+from .default_planner import DefaultLoadPlanner
+from .planner import LoadPlanner
+from .storage import StorageReader
+from .utils import _all_gather_keys, _api_bc_check, _DistWrapper, _profile
+
+__all__ = ["load_state_dict", "load"]
+
+
+def load_state_dict(
+    state_dict: Dict[str, Any],
+    storage_reader: StorageReader,
+    process_group: Optional[dist.ProcessGroup] = None,
+    coordinator_rank: int = 0,
+    no_dist: bool = False,
+    planner: Optional[LoadPlanner] = None,
+) -> None:
+    """This method is deprecated. Please switch to 'load'."""
+    warnings.warn(
+        "'load_state_dict' is deprecated and will be removed in future versions. "
+        "Please use 'load' instead."
+    )
+    storage_reader.reset()
+    with _profile():
+        # TODO: test returning `load` here instead.
+        return _load_state_dict(
+            state_dict,
+            storage_reader,
+            process_group,
+            coordinator_rank,
+            no_dist,
+            planner,
+        )
+
+
+@_api_bc_check
+def load(
+    state_dict: Dict[str, Any],
+    *,
+    checkpoint_id: Union[str, os.PathLike, None] = None,
+    storage_reader: Optional[StorageReader] = None,
+    planner: Optional[LoadPlanner] = None,
+    process_group: Optional[dist.ProcessGroup] = None,
+) -> None:
+    """
+    Load a distributed ``state_dict`` in SPMD style.
+
+    Each rank will try to read the least amount of data necessary
+    to fulfill the requested `state_dict`. When loading :class:`ShardedTensor`
+    or :class:`DTensor` instances, each rank only reads data for their local shards.
+
+    For each ``Stateful`` object (having both a ``state_dict`` and a ``load_state_dict``),
+    load will first call ``state_dict`` before attempting deserialization, followed by
+    ``load_state_dict`` once the deserialization is complete.
+
+    .. warning::
+        All tensors in ``state_dict`` must be allocated on their
+        destination device *prior to* calling this function.
+
+        All non-tensor data is loaded using `torch.load()` and modified in place
+        on state_dict.
+
+    .. warning::
+        Users must call `load_state_dict` on the root module to ensure load
+        post-processing and non-tensor data properly propagate.
+
+    .. note:
+        If no process group is initialized, this function assumes the intent
+        is to load a checkpoint into the local process. This can be useful in the
+        case of local inference, and when using regular Tensors (as opposed to
+        DTensor or ShardedTensor).
+
+    .. note:
+        Rank 0 is assumed to be the coordinator rank.
+
+    Args:
+        state_dict (Dict[str, Any]): The state_dict to load the checkpoint into, modified in place.
+        checkpoint_id (Union[str, os.PathLike, None]):
+            The ID of this checkpoint instance. The meaning of the checkpoint_id
+            depends on the storage. It can be a path to a folder or to a file.
+            It can also be a key if the storage is a key-value store.
+            (Default: ``None``)
+        storage_reader (Optional[StorageReader]):
+            Instance of StorageReader used to perform reads. If this is not
+            specified, DCP will automatically infer the reader based on the
+            checkpoint_id. If checkpoint_id is also None, an exception will
+            be raised. (Default: ``None``)
+        planner (Optional[LoadPlanner]):
+            Instance of LoadPlanner. If this is not specified, the default
+            planner will be used. (Default: ``None``)
+        process_group (Optional[ProcessGroup]):
+            ProcessGroup to be used for cross-rank synchronization.
+            (Default: ``None``)
+
+    Returns:
+        None.
+
+    Examples
+        >>> # xdoctest: +SKIP
+        >>> my_model = MyModule()
+        >>> optimizer = Adagrad(my_model.parameters())
+        >>> model_state_dict = my_model.state_dict()
+        >>> fs_storage_reader = torch.distributed.checkpoint.FileSystemReader("/checkpoint/1")
+
+        >>> torch.distributed.checkpoint.load(
+        >>>     state_dict=model_state_dict,
+        >>>     storage_reader=fs_storage_reader,
+        >>> )
+
+        >>> # The module.load_state_dict() function might have customized steps
+        >>> # to flush the state_dict; it must be called to
+        >>> # ensure correct behavior.
+        >>> my_model.load_state_dict(model_state_dict)
+
+    .. note::
+        ``load`` uses collectives to coordinate reads across ranks.
+        For NCCL-based process groups, internal tensor representations of
+        objects must be moved to the GPU device before communication takes place.
+        In this case, the device used is given by ``torch.cuda.current_device()``
+        and it is the user's responsibility to ensure that this is set so that each
+        rank has an individual GPU, via ``torch.cuda.set_device()``.
+    """
+
+    no_dist = not (dist.is_available() and dist.is_initialized())
+    if no_dist:
+        warnings.warn(
+            "torch.distributed is unavailable or uninitialized, assuming the intent is to load in a single process."
+        )
+
+    with _profile():
+        storage_reader = cast(
+            StorageReader, _storage_setup(storage_reader, checkpoint_id, reader=True)
+        )
+
+        if no_dist:
+            keys = list(state_dict.keys())
+        else:
+            keys = _all_gather_keys(state_dict, process_group)
+            if keys != sorted(state_dict.keys()):
+                warnings.warn(
+                    "Detected mismatched keys in state dict after all gather!"
+                    " This behavior is unsupported and may cause errors may cause errors."
+                )
+
+        stateful_sd = {}
+        for key in keys:
+            if key not in state_dict:
+                continue
+            elem = state_dict[key]
+            stateful_sd[key] = (
+                elem.state_dict() if isinstance(elem, Stateful) else elem
+            )
+
+        _load_state_dict(
+            state_dict=stateful_sd,
+            storage_reader=storage_reader,
+            process_group=process_group,
+            no_dist=no_dist,
+            planner=planner,
+        )
+        for key in keys:
+            if key not in state_dict:
+                continue
+            elem = state_dict[key]
+            if isinstance(elem, Stateful):
+                elem.load_state_dict(stateful_sd[key])
+            state_dict[key] = elem
+
+
+def _load_state_dict(
+    state_dict: Dict[str, Any],
+    storage_reader: StorageReader,
+    process_group: Optional[dist.ProcessGroup] = None,
+    coordinator_rank: int = 0,
+    no_dist: bool = False,
+    planner: Optional[LoadPlanner] = None,
+) -> None:
+    torch._C._log_api_usage_once("torch.distributed.checkpoint.load_state_dict")
+
+    distW = _DistWrapper(process_group, not no_dist, coordinator_rank)
+    if planner is None:
+        planner = DefaultLoadPlanner()
+
+    def local_step():
+        assert planner is not None
+        metadata = storage_reader.read_metadata()
+        planner.set_up_planner(state_dict, metadata, distW.is_coordinator)
+        storage_reader.set_up_storage_reader(metadata, distW.is_coordinator)
+
+        local_plan = planner.create_local_plan()
+        local_plan = storage_reader.prepare_local_plan(local_plan)
+        return local_plan
+
+    def global_step(all_local_plans):
+        assert planner is not None
+        all_local_plans = planner.create_global_plan(all_local_plans)
+        all_local_plans = storage_reader.prepare_global_plan(all_local_plans)
+        return all_local_plans
+
+    central_plan = distW.reduce_scatter("plan", local_step, global_step)
+
+    def read_data():
+        assert planner is not None
+        final_local_plan = planner.finish_plan(central_plan)
+        all_reads = storage_reader.read_data(final_local_plan, planner)
+
+        all_reads.wait()
+        return None
+
+    _ = distW.all_gather("read", read_data)
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/state_dict_saver.py b/MLPY/Lib/site-packages/torch/distributed/checkpoint/state_dict_saver.py
new file mode 100644
index 0000000000000000000000000000000000000000..280ffdbe518dcf0b0e8f72c64805d170ec2d393b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/checkpoint/state_dict_saver.py
@@ -0,0 +1,288 @@
+import os
+import warnings
+from concurrent.futures import Future, ThreadPoolExecutor
+from typing import cast, Optional, Union
+
+import torch
+import torch.distributed as dist
+from torch.distributed._state_dict_utils import _offload_state_dict_to_cpu
+from torch.distributed.checkpoint.stateful import Stateful
+from torch.distributed.distributed_c10d import _get_default_group
+
+from ._storage_utils import _storage_setup
+from .default_planner import DefaultSavePlanner
+from .metadata import Metadata, STATE_DICT_TYPE
+from .planner import SavePlanner
+from .storage import StorageWriter
+from .utils import _api_bc_check, _DistWrapper, _profile
+
+
+__all__ = ["save_state_dict", "save", "async_save"]
+
+
+def save_state_dict(
+    state_dict: STATE_DICT_TYPE,
+    storage_writer: StorageWriter,
+    process_group: Optional[dist.ProcessGroup] = None,
+    coordinator_rank: int = 0,
+    no_dist: bool = False,
+    planner: Optional[SavePlanner] = None,
+) -> Metadata:
+    """This method is deprecated. Please switch to 'save'."""
+    warnings.warn(
+        "'save_state_dict' is deprecated and will be removed in future versions."
+        "Please use 'save' instead."
+    )
+
+    storage_writer.reset()
+
+    # TODO: test returning `save` here instead.
+    with _profile():
+        return _save_state_dict(
+            state_dict,
+            storage_writer,
+            process_group,
+            coordinator_rank,
+            no_dist,
+            planner,
+        )
+
+
+@_api_bc_check
+def save(
+    state_dict: STATE_DICT_TYPE,
+    *,
+    checkpoint_id: Union[str, os.PathLike, None] = None,
+    storage_writer: Optional[StorageWriter] = None,
+    planner: Optional[SavePlanner] = None,
+    process_group: Optional[dist.ProcessGroup] = None,
+) -> Metadata:
+    """
+    Save a distributed model in SPMD style.
+
+    This function is different from ``torch.save()`` as it handles
+    ``ShardedTensor`` , and ``DTensor`` by having each rank only save their local shards.
+
+    For each ``Stateful`` object (having both a ``state_dict`` and a ``load_state_dict``),
+    save will call ``state_dict`` before serialization.
+
+    .. warning::
+        There are no guarantees of backwards compatibility across PyTorch versions
+        for saved state_dicts.
+
+    .. warning::
+        If using the `process_group` argument, make sure that only its ranks
+        call `save_state_dict` and that all data in state_dict belong to it.
+
+    .. note::
+        When saving checkpoint for FSDP's `ShardingStrategy.HYBRID_SHARD`, only one of
+        the shard_group should be calling `save_state_dict` and the corresponding process
+        group needs to be passed in.
+
+    .. note::
+        If no process group is available, this function assumes the intention is to save the
+        state_dict in the local process.
+
+    .. note:
+        Rank 0 is assumed to be the coordinator rank.
+
+
+    Args:
+        state_dict (Dict[str, Any]): The state_dict to save.
+        checkpoint_id (Union[str, os.PathLike, None]):
+            The ID of this checkpoint instance. The meaning of the checkpoint_id
+            depends on the storage. It can be a path to a folder or to a file.
+            It can also be a key if the storage is a key-value store.
+            (Default: ``None``)
+        storage_writer (Optional[StorageWriter]):
+            Instance of StorageWriter used to perform writes. If this is not
+            specified, DCP will automatically infer the writer based on the
+            checkpoint_id. If checkpoint_id is also None, an exception will
+            be raised. (Default: ``None``)
+        planner (Optional[SavePlanner]):
+            Instance of SavePlanner. If this is not specified, the default
+            planner will be used. (Default: ``None``)
+        process_group (Optional[ProcessGroup]):
+            ProcessGroup to be used for cross-rank synchronization.
+            (Default: ``None``)
+
+    Returns:
+        Metadata: Metadata object for the saved checkpoint.
+
+    Example:
+        >>> # xdoctest: +SKIP
+        >>> my_model = MyModule()
+
+        >>> state_dict = {"model": my_model}
+
+        >>> fs_storage_writer = torch.distributed.checkpoint.FileSystemWriter("/checkpoint/1")
+        >>> torch.distributed.checkpoint.save(
+        >>>     state_dict=state_dict,
+        >>>     storage_writer=fs_storage_writer,
+        >>> )
+
+    .. note::
+        ``save`` uses collectives to coordinate writes across ranks.
+        For NCCL-based process groups, internal tensor representations of
+        objects must be moved to the GPU device before communication takes place.
+        In this case, the device used is given by ``torch.cuda.current_device()``
+        and it is the user's responsibility to ensure that this is set so that
+        each rank has an individual GPU, via ``torch.cuda.set_device()``.
+    """
+    torch._C._log_api_usage_once("torch.distributed.checkpoint.save")
+
+    no_dist = not (dist.is_available() and dist.is_initialized())
+    if no_dist:
+        warnings.warn(
+            "torch.distributed is unavailable or uninitialized, assuming the intent is to save in a single process."
+        )
+
+    with _profile():
+        storage_writer = cast(
+            StorageWriter, _storage_setup(storage_writer, checkpoint_id, reader=False)
+        )
+
+        return _save_state_dict(
+            state_dict=_stateful_to_state_dict(state_dict),
+            storage_writer=storage_writer,
+            process_group=process_group,
+            no_dist=no_dist,
+            planner=planner,
+        )
+
+
+def async_save(
+    state_dict: STATE_DICT_TYPE,
+    *,
+    checkpoint_id: Union[str, os.PathLike, None] = None,
+    storage_writer: Optional[StorageWriter] = None,
+    planner: Optional[SavePlanner] = None,
+    process_group: Optional[dist.ProcessGroup] = None,
+) -> Future:
+    """Asynchronous version of ``save_state_dict``. This code first de-stages the state_dict on CPU, and then calls
+    `save` in a separate thread.
+
+    .. warning::
+        This feature is experimental and subject to change.
+
+    Args:
+        state_dict (Dict[str, Any]): The state_dict to save.
+        checkpoint_id (Union[str, os.PathLike, None]):
+            The ID of this checkpoint instance. The meaning of the checkpoint_id
+            depends on the storage. It can be a path to a folder or to a file.
+            It can also be a key if the storage is a key-value store.
+            (Default: ``None``)
+        storage_writer (Optional[StorageWriter]):
+            Instance of StorageWriter used to perform writes. If this is not
+            specified, DCP will automatically infer the writer based on the
+            checkpoint_id. If checkpoint_id is also None, an exception will
+            be raised. (Default: ``None``)
+        planner (Optional[SavePlanner]):
+            Instance of SavePlanner. If this is not specified, the default
+            planner will be used. (Default: ``None``)
+        process_group (Optional[ProcessGroup]):
+            ProcessGroup to be used for cross-rank synchronization.
+            (Default: ``None``)
+
+    Returns:
+        Future: A future holding the resultant Metadata object from `save`.
+
+    Example:
+        >>> # xdoctest: +SKIP
+        >>> my_model = MyModule()
+
+        >>> state_dict = {"model": my_model}
+
+        >>> fs_storage_writer = torch.distributed.checkpoint.FileSystemWriter("/checkpoint/1")
+        >>> checkpoint_future = torch.distributed.checkpoint.async_save(
+        >>>     state_dict=state_dict,
+        >>>     storage_writer=fs_storage_writer,
+        >>> )
+        >>>
+        >>> # ... do some work ...
+        >>>
+        >>> checkpoint_future.result()
+
+    """
+    torch._C._log_api_usage_once("torch.distributed.checkpoint.async_save")
+
+    pg = process_group or _get_default_group()
+    assert (
+        torch.device("cpu") in pg._device_types  # type: ignore[attr-defined]
+    ), "A CPU backend must be enabled for async save; try initializing process group with 'cpu:gloo,cuda:ncc'"
+
+    cpu_state_dict = _offload_state_dict_to_cpu(_stateful_to_state_dict(state_dict))
+
+    executor = ThreadPoolExecutor(max_workers=1)
+    f = executor.submit(
+        save,
+        cpu_state_dict,
+        checkpoint_id=checkpoint_id,
+        storage_writer=storage_writer,
+        planner=planner,
+        process_group=process_group,
+    )
+    f.add_done_callback(lambda f: executor.shutdown(wait=False))
+
+    return f
+
+
+def _stateful_to_state_dict(state_dict: STATE_DICT_TYPE) -> STATE_DICT_TYPE:
+    """Creates a shallow copy of `state_dict` where `state_dict` is called for each Stateful object."""
+    stateful_state_dict = {}
+    for key, elem in state_dict.items():
+        stateful_state_dict[key] = (
+            elem.state_dict() if isinstance(elem, Stateful) else elem
+        )
+    return stateful_state_dict
+
+
+def _save_state_dict(
+    state_dict: STATE_DICT_TYPE,
+    storage_writer: StorageWriter,
+    process_group: Optional[dist.ProcessGroup] = None,
+    coordinator_rank: int = 0,
+    no_dist: bool = False,
+    planner: Optional[SavePlanner] = None,
+) -> Metadata:
+    torch._C._log_api_usage_once("torch.distributed.checkpoint.save_state_dict")
+
+    distW = _DistWrapper(process_group, not no_dist, coordinator_rank)
+    if planner is None:
+        planner = DefaultSavePlanner()
+    assert planner is not None
+
+    global_metadata = None
+
+    def local_step():
+        assert planner is not None
+        planner.set_up_planner(state_dict, distW.is_coordinator)
+        storage_writer.set_up_storage_writer(distW.is_coordinator)
+        local_plan = planner.create_local_plan()
+        local_plan = storage_writer.prepare_local_plan(local_plan)
+        return local_plan
+
+    def global_step(all_local_plans):
+        nonlocal global_metadata
+
+        assert planner is not None
+        all_local_plans, global_metadata = planner.create_global_plan(all_local_plans)
+        all_local_plans = storage_writer.prepare_global_plan(all_local_plans)
+        return all_local_plans
+
+    central_plan = distW.reduce_scatter("plan", local_step, global_step)
+
+    def write_data():
+        assert planner is not None
+        final_local_plan = planner.finish_plan(central_plan)
+        all_writes = storage_writer.write_data(final_local_plan, planner)
+
+        all_writes.wait()
+        return all_writes.value()
+
+    def finish_checkpoint(all_results):
+        assert global_metadata is not None
+        storage_writer.finish(metadata=global_metadata, results=all_results)
+        return global_metadata
+
+    return distW.all_reduce("write", write_data, finish_checkpoint)
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/stateful.py b/MLPY/Lib/site-packages/torch/distributed/checkpoint/stateful.py
new file mode 100644
index 0000000000000000000000000000000000000000..c09cb8fffd1379c4b8c3bf22dd41c68b93200c49
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/checkpoint/stateful.py
@@ -0,0 +1,43 @@
+from typing import Any, Dict, runtime_checkable, TypeVar
+
+from typing_extensions import Protocol
+
+
+__all__ = ["Stateful", "StatefulT"]
+
+
+@runtime_checkable
+class Stateful(Protocol):
+    """
+    Stateful protocol for objects that can be checkpointed and restored.
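+
+    Example (a minimal sketch; the ``Counter`` class below is illustrative and not
+    part of this module)::
+
+        >>> # xdoctest: +SKIP
+        >>> from typing import Any, Dict
+        >>> class Counter:
+        ...     def __init__(self) -> None:
+        ...         self.steps = 0
+        ...     def state_dict(self) -> Dict[str, Any]:
+        ...         return {"steps": self.steps}
+        ...     def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
+        ...         self.steps = state_dict["steps"]
+        >>> isinstance(Counter(), Stateful)  # True, thanks to @runtime_checkable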
+    """
+
+    def state_dict(self) -> Dict[str, Any]:
+        """
+        Objects should return their state_dict representation as a dictionary.
+        The output of this function will be checkpointed, and later restored in
+        `load_state_dict()`.
+
+        .. warning::
+            Because of the inplace nature of restoring a checkpoint, this function
+            is also called during `torch.distributed.checkpoint.load`.
+
+
+        Returns:
+            Dict: The objects state dict
+        """
+
+        ...
+
+    def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
+        """
+        Restore the object's state from the provided state_dict.
+
+        Args:
+            state_dict: The state dict to restore from
+        """
+
+        ...
+
+
+StatefulT = TypeVar("StatefulT", bound=Stateful)
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/storage.py b/MLPY/Lib/site-packages/torch/distributed/checkpoint/storage.py
new file mode 100644
index 0000000000000000000000000000000000000000..98b828c0b9cda33e233947473ab07753cad64346
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/checkpoint/storage.py
@@ -0,0 +1,279 @@
+import abc
+import os
+from dataclasses import dataclass
+from typing import Any, List, Union
+
+from torch.futures import Future
+
+from .metadata import Metadata, MetadataIndex
+from .planner import LoadPlan, LoadPlanner, SavePlan, SavePlanner
+
+__all__ = ["WriteResult", "StorageWriter", "StorageReader"]
+
+
+@dataclass(frozen=True)
+class WriteResult:
+    index: MetadataIndex
+
+    size_in_bytes: int
+    storage_data: Any
+
+
+class StorageWriter(abc.ABC):
+    """
+    Interface used by ``save_state_dict`` to write to storage.
+
+    One StorageWriter instance acts as both the coordinator and the follower
+    in a distributed checkpoint. As part of initialization, each instance
+    is told its role.
+
+    A subclass should expect the following sequence of calls.
+
+    0) (all ranks) set checkpoint_id if users pass a valid checkpoint_id.
+    1) (all ranks) set_up_storage_writer()
+    2) (all ranks) prepare_local_plan()
+    3) (coordinator) prepare_global_plan()
+    4) (all ranks) write_data()
+    5) (coordinator) finish()
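+
+    A skeletal subclass, shown purely as an illustrative sketch (the
+    ``MyStorageWriter`` name and the trivial method bodies are assumptions,
+    not a reference implementation)::
+
+        >>> # xdoctest: +SKIP
+        >>> class MyStorageWriter(StorageWriter):
+        ...     def reset(self, checkpoint_id=None): ...
+        ...     def set_up_storage_writer(self, is_coordinator): ...
+        ...     def prepare_local_plan(self, plan): return plan
+        ...     def prepare_global_plan(self, plans): return plans
+        ...     def write_data(self, plan, planner): ...
+        ...     def finish(self, metadata, results): ...
+        ...     @classmethod
+        ...     def validate_checkpoint_id(cls, checkpoint_id): return True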
+    """
+
+    @abc.abstractmethod
+    def reset(self, checkpoint_id: Union[str, os.PathLike, None] = None) -> None:
+        """
+        Called to indicate that a brand-new checkpoint write is about to happen.
+        A checkpoint_id may be present if users set the checkpoint_id for
+        this checkpoint write. The meaning of the checkpoint_id is
+        storage-dependent. It can be a path to a folder/file or a key for
+        a key-value storage.
+
+        Args:
+            checkpoint_id (Union[str, os.PathLike, None]):
+                The ID of this checkpoint instance. The meaning of the checkpoint_id
+                depends on the storage. It can be a path to a folder or to a file.
+                It can also be a key if the storage is a key-value store.
+                (Default: ``None``)
+        """
+        ...
+
+    @abc.abstractmethod
+    def set_up_storage_writer(self, is_coordinator: bool) -> None:
+        """
+        Initialize this instance.
+
+        Args:
+            is_coordinator (bool): Whether this instance is responsible for coordinating
+              the checkpoint.
+        """
+        pass
+
+    @abc.abstractmethod
+    def prepare_local_plan(self, plan: SavePlan) -> SavePlan:
+        """
+        Perform storage-specific local planning.
+
+        While this method can produce a completely different plan, the recommended
+        way is to store storage specific data in SavePlan::storage_data.
+
+        Args:
+            plan (SavePlan): The local plan from the ``SavePlanner`` in use.
+
+        Returns:
+            A transformed ``SavePlan`` after storage local planning
+        """
+        pass
+
+    @abc.abstractmethod
+    def prepare_global_plan(self, plans: List[SavePlan]) -> List[SavePlan]:
+        """
+        Perform centralized planning of storage.
+
+        This method is only called on the coordinator instance.
+
+        While this method can produce a completely different plan, the preferred
+        way is to store storage specific data in SavePlan::storage_data.
+
+        Args:
+            plans: A list of ``SavePlan`` instances, one for each rank.
+
+        Returns:
+            A list of transformed ``SavePlan`` after storage global planning
+        """
+        pass
+
+    @abc.abstractmethod
+    def write_data(
+        self, plan: SavePlan, planner: SavePlanner
+    ) -> Future[List[WriteResult]]:
+        """
+        Write all items from ``plan`` using ``planner`` to resolve the data.
+
+        A subclass should call ``SavePlanner::resolve_data`` on each item
+        from the plan to get access to the underlying object to write.
+
+        Subclasses should lazily call `resolve_data` as it can allocate memory.
+        In the case of tensors, make the following assumptions:
+
+        - They might be on any device, including not matching the one on ``WriteItem::tensor_data``
+        - They might be views or not contiguous. Only the projection needs to be saved.
+
+        Args:
+            plan (SavePlan): The save plan to execute.
+            planner (SavePlanner): Planner object to be used to resolve items to data.
+
+        Returns:
+            A future that completes to a list of WriteResult
+        """
+        pass
+
+    @abc.abstractmethod
+    def finish(self, metadata: Metadata, results: List[List[WriteResult]]) -> None:
+        """
+        Write the metadata and mark the current checkpoint as successful.
+
+        The actual format/schema used for serializing `metadata` is an
+        implementation detail. The only requirement is that it can be restored
+        into the same object graph.
+
+        Args:
+            metadata (Metadata): metadata for the new checkpoint
+            results: A list of WriteResults from all ranks.
+
+        Returns:
+            None
+        """
+        pass
+
+    @classmethod
+    @abc.abstractmethod
+    def validate_checkpoint_id(cls, checkpoint_id: Union[str, os.PathLike]) -> bool:
+        """
+        Check if the given checkpoint_id is supported by the storage. This allows
+        us to enable automatic storage selection.
+        """
+        ...
+
+
+class StorageReader(abc.ABC):
+    """
+    Interface used by ``load_state_dict`` to read from storage.
+
+    One StorageReader instance acts as both the coordinator and the follower
+    in a distributed checkpoint. As part of initialization, each instance
+    is told its role.
+
+    A subclass should expect the following sequence of calls from ``load_state_dict``:
+
+    0) (all ranks) set checkpoint_id if users pass a valid checkpoint_id.
+    1) (all ranks) read_metadata()
+    2) (all ranks) set_up_storage_reader()
+    3) (all ranks) prepare_local_plan()
+    4) (coordinator) prepare_global_plan()
+    5) (all ranks) read_data()
+    """
+
+    @abc.abstractmethod
+    def reset(self, checkpoint_id: Union[str, os.PathLike, None] = None) -> None:
+        """
+        Called to indicate that a brand-new checkpoint read is about to happen.
+        A checkpoint_id may be present if users set the checkpoint_id for
+        this checkpoint read. The meaning of the checkpoint_id is
+        storage-dependent. It can be a path to a folder/file or a key for
+        a key-value storage.
+
+        Args:
+            checkpoint_id (Union[str, os.PathLike, None]):
+                The ID of this checkpoint instance. The meaning of the checkpoint_id
+                depends on the storage. It can be a path to a folder or to a file.
+                It can also be a key if the storage is more like a key-value store.
+                (Default: ``None``)
+        """
+        ...
+
+    @abc.abstractmethod
+    def read_metadata(self) -> Metadata:
+        """
+        Read the checkpoint metadata.
+
+        Returns:
+            The metadata object associated with the checkpoint being loaded.
+
+        """
+        pass
+
+    @abc.abstractmethod
+    def set_up_storage_reader(self, metadata: Metadata, is_coordinator: bool) -> None:
+        """
+        Initialize this instance.
+
+        Args:
+            metadata (Metadata): The metadata schema to use.
+            is_coordinator (bool): Whether this instance is responsible for coordinating
+              the checkpoint.
+        """
+        pass
+
+    @abc.abstractmethod
+    def prepare_local_plan(self, plan: LoadPlan) -> LoadPlan:
+        """
+        Perform storage-specific local planning.
+
+        While this method can produce a completely different plan, the recommended
+        way is to store storage specific data in LoadPlan::storage_data.
+
+        Args:
+            plan (LoadPlan): The local plan from the ``LoadPlanner`` in use.
+
+        Returns:
+            A transformed ``LoadPlan`` after storage local planning
+        """
+        pass
+
+    @abc.abstractmethod
+    def prepare_global_plan(self, plans: List[LoadPlan]) -> List[LoadPlan]:
+        """
+        Perform centralized planning of storage loading.
+
+        This method is only called on the coordinator instance.
+
+        While this method can produce a completely different plan, the preferred
+        way is to store storage specific data in LoadPlan::storage_data.
+
+        Args:
+            plans: A list of ``LoadPlan`` instances, one for each rank.
+
+        Returns:
+            A list of transformed ``LoadPlan`` after storage global planning
+        """
+        pass
+
+    @abc.abstractmethod
+    def read_data(self, plan: LoadPlan, planner: LoadPlanner) -> Future[None]:
+        """
+        Read all items from ``plan`` using ``planner`` to resolve the data.
+
+        A subclass should call ``LoadPlanner::load_bytes`` to deserialize a BytesIO
+        object into the right place.
+
+        A subclass should call ``LoadPlanner::resolve_tensor`` to get access to the
+        tensors that it should load data into.
+
+        It is the StorageLayer's responsibility to properly schedule any cross-device
+        copies required.
+
+        Args:
+            plan (LoadPlan): The local plan to execute on
+            planner (LoadPlanner): The planner object to use to resolve items.
+
+        Returns:
+            A future that completes once all reads are finished.
+        """
+        pass
+
+    @classmethod
+    @abc.abstractmethod
+    def validate_checkpoint_id(cls, checkpoint_id: Union[str, os.PathLike]) -> bool:
+        """
+        Check if the given checkpoint_id is supported by the storage. This allows
+        us to enable automatic storage selection.
+        """
+        ...
diff --git a/MLPY/Lib/site-packages/torch/distributed/checkpoint/utils.py b/MLPY/Lib/site-packages/torch/distributed/checkpoint/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed0817736a957549f059d40b585c450d37ea6373
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/checkpoint/utils.py
@@ -0,0 +1,429 @@
+import cProfile
+import inspect
+import io
+import itertools
+import os
+import warnings
+from contextlib import contextmanager
+from functools import wraps
+from pstats import Stats
+from typing import Any, Callable, cast, Dict, List, Optional, Sequence, TypeVar, Union
+
+import torch
+import torch.distributed as dist
+from torch.distributed._shard.sharded_tensor import ShardedTensor
+from torch.distributed._shard.sharded_tensor.shard import Shard
+from torch.distributed._tensor import DTensor
+
+from .api import (
+    _is_wrapped_exception,
+    _wrap_exception,
+    CheckpointException,
+    WRAPPED_EXCEPTION,
+)
+from .metadata import MetadataIndex, STATE_DICT_TYPE
+
+__all__ = ["find_tensor_shard", "find_state_dict_object"]
+
+T = TypeVar("T")
+R = TypeVar("R")
+
+
+def _get_failure_dict(
+    results: List[Union[T, WRAPPED_EXCEPTION]]
+) -> Dict[int, WRAPPED_EXCEPTION]:
+    return cast(
+        Dict[int, WRAPPED_EXCEPTION],
+        {i: err for i, err in enumerate(results) if _is_wrapped_exception(err)},
+    )
+
+
+def _all_gather_keys(
+    local_dict: Dict[Any, Any], group: Optional[dist.ProcessGroup] = None
+) -> List[Any]:
+    """Gathers all keys, and returns them sorted."""
+    keys = list(local_dict.keys())
+    gathered_keys: List[List[Any]] = [None] * dist.get_world_size()  # type: ignore[list-item]
+
+    dist.all_gather_object(gathered_keys, keys, group=group)
+    return sorted(set(itertools.chain.from_iterable(gathered_keys)))
+
+
+class _DistWrapper:
+    """
+    This is a wrapper around a process group that provides a series of features around object collectives.
+
+    It works even when torch.distributed is not initialized, in which case most collectives turn into no-ops.
+
+    All variants that take functions are exception-robust, meaning that if one or more
+    ranks raise errors, all ranks will observe those errors.
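+
+    Usage sketch (``make_local_plan`` and ``merge_plans`` are hypothetical callables,
+    shown only to illustrate the calling pattern)::
+
+        >>> # xdoctest: +SKIP
+        >>> dw = _DistWrapper(group=None, use_dist=dist.is_initialized(), coordinator_rank=0)
+        >>> central_plan = dw.reduce_scatter("plan", make_local_plan, merge_plans)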
+    """
+
+    def __init__(
+        self,
+        group: Optional[dist.ProcessGroup],
+        use_dist: bool,
+        coordinator_rank: int,
+    ):
+        self.group = group
+        self.use_dist = use_dist
+        self.coordinator_rank = coordinator_rank
+        if self.use_dist:
+            self.rank = dist.get_rank(group)
+            self.is_coordinator = self.rank == coordinator_rank
+        else:
+            self.rank = 0
+            self.is_coordinator = True
+
+    def get_rank(self) -> int:
+        return self.rank
+
+    def get_world_size(self) -> int:
+        if self.use_dist:
+            return dist.get_world_size(self.group)
+        return 1
+
+    def broadcast_object(self, object: Optional[T]) -> T:
+        """Implement functionality similar to c10d::broadcast_object_list but without distributed enabled."""
+        object_list = [object]
+        if self.use_dist:
+            dist.broadcast_object_list(
+                object_list=object_list,
+                group=self.group,
+                src=self.coordinator_rank,
+            )
+        return cast(T, object_list[0])
+
+    def gather_object(self, object: T) -> Optional[List[T]]:
+        """Implement functionality similar to c10d::gather_object but without distributed enabled."""
+        if self.use_dist:
+            gather_objs = (
+                cast(List[T], [None] * dist.get_world_size(self.group))
+                if self.is_coordinator
+                else None
+            )
+
+            dist.gather_object(
+                obj=object,
+                object_gather_list=gather_objs if self.is_coordinator else None,
+                dst=self.coordinator_rank,
+                group=self.group,
+            )
+            result = gather_objs
+        else:
+            result = [object]
+        return result
+
+    def all_gather_object(self, object: T) -> List[T]:
+        """Implement functionality similar to c10d::all_gather_object but without distributed enabled."""
+        if self.use_dist:
+            gather_objs = cast(List[T], [None] * dist.get_world_size(self.group))
+
+            dist.all_gather_object(
+                object_list=gather_objs, obj=object, group=self.group
+            )
+        else:
+            gather_objs = [object]
+        return gather_objs
+
+    def scatter_object(self, object_list: Optional[List[T]]) -> T:
+        """Implement functionality similar to c10d::scatter_object but without distributed enabled."""
+        if self.use_dist:
+            gather_result = cast(List[T], [None])
+            dist.scatter_object_list(
+                scatter_object_output_list=gather_result,
+                scatter_object_input_list=object_list if self.is_coordinator else None,
+                src=self.coordinator_rank,
+                group=self.group,
+            )
+
+            local_reply = gather_result[0]
+        else:
+            assert object_list is not None
+            local_reply = object_list[0]
+        return local_reply
+
+    def reduce_scatter(
+        self,
+        step: str,
+        map_fun: Callable[[], T],
+        reduce_fun: Callable[[List[T]], List[R]],
+    ) -> R:
+        """
+        Compute a value on each rank, then do centralized reduce on a single rank, followed by a scatter.
+
+        This method operates in the following way:
+            Run ``map_fun`` on all ranks
+            Gather results on rank 0
+            Call ``reduce_fun`` on all those values
+            Scatter to each rank part of the result.
+        """
+        local_data: Union[WRAPPED_EXCEPTION, T]
+        try:
+            local_data = map_fun()
+        except BaseException as e:
+            local_data = _wrap_exception(e)
+
+        all_data = self.gather_object(local_data)
+        all_results: Optional[List[Union[R, CheckpointException]]] = None
+        if self.is_coordinator:
+            assert all_data is not None
+            node_failures = _get_failure_dict(all_data)
+
+            if len(node_failures) == 0:
+                try:
+                    # N.B. why can't mypy cast List[R] to List[Union[R, WRAPPED_EXCEPTION]]?
+                    all_results = cast(
+                        List[Union[R, CheckpointException]],
+                        reduce_fun(cast(List[T], all_data)),
+                    )
+                except BaseException as e:
+                    node_failures[self.rank] = _wrap_exception(e)
+
+            if len(node_failures) > 0:
+                all_results = [
+                    CheckpointException(step, node_failures)
+                ] * self.get_world_size()
+
+        result = self.scatter_object(all_results)
+        if isinstance(result, CheckpointException):
+            raise result
+        return result
+
+    def all_reduce(
+        self,
+        step: str,
+        map_fun: Callable[[], T],
+        reduce_fun: Callable[[List[T]], R],
+    ) -> R:
+        """
+        Compute a value on each rank, then do centralized reduce on a single rank, followed by a broadcast.
+
+        This method operates in the following way:
+            Run ``map_fun`` on all ranks
+            Gather results on rank 0
+            Call ``reduce_fun`` on all those values
+            Broadcast the reduced value to all ranks.
+        """
+        local_data: Union[T, WRAPPED_EXCEPTION]
+        try:
+            local_data = map_fun()
+        except BaseException as e:
+            local_data = _wrap_exception(e)
+
+        all_data = self.gather_object(local_data)
+        result: Optional[Union[R, CheckpointException]] = None
+        if self.is_coordinator:
+            assert all_data is not None
+            node_failures = _get_failure_dict(all_data)
+            if len(node_failures) == 0:
+                try:
+                    result = reduce_fun(cast(List[T], all_data))
+                except BaseException as e:
+                    node_failures[self.rank] = _wrap_exception(e)
+
+            if len(node_failures) > 0:
+                result = CheckpointException(step, node_failures)
+
+        final_result = self.broadcast_object(result)
+        if isinstance(final_result, CheckpointException):
+            raise final_result
+        return cast(R, final_result)
+
+    def all_gather(
+        self,
+        step: str,
+        map_fun: Callable[[], T],
+    ) -> List[T]:
+        """
+        Compute a value on each rank, then all_gather them.
+
+        This method operates in the following way:
+            Run ``map_fun`` on all ranks
+            all_gather the values to all ranks
+        """
+        result: Union[T, WRAPPED_EXCEPTION]
+        try:
+            result = map_fun()
+        except BaseException as e:
+            result = _wrap_exception(e)
+
+        all_results = self.all_gather_object(result)
+
+        node_failures = _get_failure_dict(all_results)
+        if len(node_failures) > 0:
+            raise CheckpointException(step, node_failures)
+        return cast(List[T], all_results)
+
+    def broadcast(
+        self,
+        step: str,
+        map_fun: Callable[[], T],
+    ) -> T:
+        """
+        Compute a value on rank 0 and broadcast it.
+
+        This method operates in the following way:
+            Run ``map_fun`` on rank 0
+            broadcast the value
+        """
+        result: Optional[Union[T, CheckpointException]] = None
+        if self.is_coordinator:
+            try:
+                result = map_fun()
+            except BaseException as e:
+                result = CheckpointException(step, {self.rank: _wrap_exception(e)})
+        final_result = self.broadcast_object(result)
+        if isinstance(final_result, CheckpointException):
+            raise final_result
+        return cast(T, final_result)
+
+
+def _find_shard(tensor: ShardedTensor, index: MetadataIndex) -> Shard:
+    if index.offset is None:
+        raise ValueError(
+            f"Cannot lookup {index.fqn} since its a ShardedTensor and no offset was provided"
+        )
+
+    shards = tensor.local_shards()
+    # index fast path
+    if index.index is not None:
+        if (
+            len(shards) > index.index
+            and torch.Size(shards[index.index].metadata.shard_offsets) == index.offset
+        ):
+            return shards[index.index]
+
+    for shard in shards:
+        if torch.Size(shard.metadata.shard_offsets) == index.offset:
+            return shard
+    raise ValueError(f"Could not find shard at '{index.offset}' for FQN: '{index.fqn}'")
+
+
+def find_tensor_shard(tensor: torch.Tensor, index: MetadataIndex) -> torch.Tensor:
+    if isinstance(tensor, DTensor):
+        return tensor.to_local()
+    if isinstance(tensor, ShardedTensor):
+        return _find_shard(tensor, index).tensor
+    if index.offset is not None:
+        # special case looking up a tensor by origin
+        if index.offset == torch.Size([0] * len(tensor.size())):
+            return tensor
+        raise ValueError(
+            f"FQN: '{index.fqn}' is not a ShardedTensor, can't find by offset: '{index.offset}'"
+        )
+    return tensor
+
+
+def find_state_dict_object(state_dict: STATE_DICT_TYPE, index: MetadataIndex) -> Any:
+    if index.fqn not in state_dict:
+        raise ValueError(f"Could not find FQN: '{index.fqn}'")
+    obj = state_dict[index.fqn]
+
+    if isinstance(obj, torch.Tensor):
+        return find_tensor_shard(obj, index)
+    elif index.offset is not None:
+        raise ValueError(
+            f"FQN: '{index.fqn}' is not a ShardedTensor, can't find by offset: '{index.offset}'"
+        )
+    return obj
+
+
+def _element_wise_add(a: Sequence[int], b: Sequence[int]) -> List[int]:
+    return [i_a + i_b for i_a, i_b in zip(a, b)]
+
+
+def _element_wise_sub(a: Sequence[int], b: Sequence[int]) -> List[int]:
+    return [i_a - i_b for i_a, i_b in zip(a, b)]
+
+
+class _ReaderView(io.IOBase):
+    def __init__(self, base_stream: io.IOBase, offset: int, len: int):
+        super().__init__()
+        self.offset = offset
+        self.len = len
+        self.base_stream = base_stream
+        self.seek(0)
+
+    def seek(self, __offset: int, __whence: int = os.SEEK_SET) -> int:
+        if __whence == os.SEEK_SET:
+            __offset = self.offset + __offset
+        elif __whence == os.SEEK_END:
+            __whence = os.SEEK_SET
+            __offset = (self.offset + self.len) - __offset
+        return self.base_stream.seek(__offset, __whence)
+
+    def tell(self) -> int:
+        return self.base_stream.tell() - self.offset
+
+    def readable(self) -> bool:
+        return self.base_stream.readable()
+
+    def seekable(self) -> bool:
+        return self.base_stream.seekable()
+
+    def readinto(self, b):
+        return self.base_stream.readinto(b)  # type: ignore[attr-defined]
+
+    def read(self, size=-1):
+        return self.base_stream.read(size)
+
+
+def _create_file_view(file: io.IOBase, offset: int, length: int) -> io.IOBase:
+    # FIXME (kumpera) torch.load fails if we wrap with io.BufferedReader
+    return _ReaderView(file, offset, length)
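+
+
+# Usage sketch (illustrative only; "ckpt.bin" is a made-up file name): expose a
+# 64-byte window starting at byte 128 of an open binary stream as its own
+# read-only stream:
+#
+#     view = _create_file_view(open("ckpt.bin", "rb"), offset=128, length=64)
+#     payload = view.read(64)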
+
+
+def _normalize_device_info(device_type: str, device_id: int) -> str:
+    """Device info normalization."""
+    if device_type == "cpu":
+        return "cpu"
+    return f"{device_type}:{device_id}"
+
+
+# TODO: integrate with distributed logging flag
+ENABLE_PROFILE = False
+
+
+@contextmanager
+def _profile():
+    # Only log the profiling when it is enabled and we are on rank 0, or dist is not
+    # available.
+    if ENABLE_PROFILE and (not dist.is_available() or dist.get_rank() == 0):
+        profiler = cProfile.Profile()
+        profiler.enable()
+        try:
+            yield
+        finally:
+            profiler.disable()
+            stats = Stats(profiler)
+            stats.sort_stats("time").print_stats(10)
+    else:
+        yield
+
+
+def _api_bc_check(func):
+    @wraps(func)
+    def inner_func(*args, **kwargs) -> Any:
+        if len(args) == 2:
+            warnings.warn(
+                f"The argument order of {func.__name__} has been changed. "
+                "Please check the document to avoid future breakages."
+            )
+            sig = inspect.signature(func)
+            kwonlyargs = [
+                p.name for p in sig.parameters.values() if p.kind == p.KEYWORD_ONLY
+            ]
+            if "storage_writer" in kwonlyargs:
+                assert "storage_writer" not in kwargs, (args, kwargs)
+                kwargs["storage_writer"] = args[1]
+            elif "storage_reader" in kwonlyargs:
+                assert "storage_reader" not in kwargs, (args, kwargs)
+                kwargs["storage_reader"] = args[1]
+            else:
+                raise RuntimeError(f"Unexpected kwonlyargs = {kwonlyargs}")
+            return func(args[0], **kwargs)
+        else:
+            return func(*args, **kwargs)
+
+    return inner_func
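+
+
+# Usage sketch for ``_api_bc_check`` (illustrative only): a function declared as
+#
+#     @_api_bc_check
+#     def save(state_dict, *, storage_writer=None): ...
+#
+# still accepts the legacy positional call ``save(state_dict, writer)``; the wrapper
+# rewrites it to ``save(state_dict, storage_writer=writer)`` and emits a warning.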
diff --git a/MLPY/Lib/site-packages/torch/distributed/collective_utils.py b/MLPY/Lib/site-packages/torch/distributed/collective_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7dbe0310e63433d15a957388282eac55431ed270
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/collective_utils.py
@@ -0,0 +1,211 @@
+#!/usr/bin/env python3
+
+
+"""
+A set of primitive functions for performing collective ops.
+
+Each should also handle single rank scenario.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any, Callable, cast, Generic, List, Optional, Tuple, TypeVar, Union
+
+import torch.distributed as dist
+
+T = TypeVar("T")
+
+@dataclass
+class SyncPayload(Generic[T]):
+    stage_name: Optional[str]
+    success: bool
+    payload: T
+    exception: Optional[Exception] = None
+
+def broadcast(
+    data_or_fn: Union[T, Callable[[], T]],
+    *,
+    success: bool = True,
+    stage_name: Optional[str] = None,
+    rank: int = 0,
+    pg: Optional[dist.ProcessGroup] = None,
+) -> T:
+    """
+    Broadcasts the data payload from the given rank to all other ranks.
+    Or, if a function is passed, executes it on that rank and broadcasts the result to all other ranks.
+
+    Can be used to broadcast a failure signal to stop all ranks.
+
+    If the function raises an exception, all ranks will raise.
+
+    Args:
+        data_or_fn: the data to broadcast or function to execute and broadcast result.
+        success: False to stop all ranks.
+        stage_name: the name of the logical stage for synchronization and debugging
+        rank: rank to broadcast the data from, or to execute the function on and broadcast its result.
+        pg: the process group for sync
+    Throws:
+        RuntimeError from original exception trace
+    Returns:
+        the value after synchronization
+
+    Example usage:
+    >> id = broadcast(data_or_fn=allocate_id, rank=0, pg=ext_pg.my_pg)
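+
+    To broadcast a failure signal from the broadcasting rank (a sketch; the stage
+    name is illustrative):
+    >> broadcast(data_or_fn=None, success=False, stage_name="allocate_id", pg=ext_pg.my_pg)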
+    """
+
+    if not success and data_or_fn is not None:
+        raise AssertionError("Data or Function is expected to be None if not successful")
+
+    payload: Optional[T] = None
+    exception : Optional[Exception] = None
+    # if no pg is passed then execute if rank is 0
+    if (pg is None and rank == 0) or (pg is not None and pg.rank() == rank):
+        # determine if it is an executable function or data payload only
+        if callable(data_or_fn):
+            try:
+                payload = data_or_fn()
+            except Exception as e:
+                success = False
+                exception = e
+        else:
+            payload = data_or_fn
+
+    # broadcast the exception type if any to all ranks for failure categorization
+    sync_obj = SyncPayload(
+        stage_name=stage_name,
+        success=success,
+        payload=payload,
+        exception=exception,
+    )
+
+    if pg is not None:
+        broadcast_list = [sync_obj]
+        dist.broadcast_object_list(broadcast_list, src=rank, group=pg)
+        assert len(broadcast_list) == 1
+        sync_obj = broadcast_list[0]
+
+    # failure in any rank will trigger a throw in every rank.
+    if not sync_obj.success:
+        error_msg = f"Rank {rank} failed"
+        if stage_name is not None:
+            error_msg += f": stage {sync_obj.stage_name}"
+        if sync_obj.exception is not None:
+            error_msg += f": exception {sync_obj.exception}"
+        raise RuntimeError(error_msg) from sync_obj.exception
+
+    return cast(T, sync_obj.payload)
+
+
+def all_gather(
+    data_or_fn: Union[T, Callable[[], T]],
+    stage_name: Optional[str] = None,
+    pg: Optional[dist.ProcessGroup] = None,
+) -> List[T]:
+    """
+    A simple all_gather primitive with basic synchronization guard logic,
+    which checks that the payload from all ranks has the same stage name.
+
+    Args:
+        data_or_fn: the data to be all gathered across ranks or function to be executed
+        stage_name: the sync stage name for out-of-sync protection
+        pg: the process group for sync
+    Throws:
+        RuntimeError from original exception trace
+    Returns:
+        a list of synced data from all ranks
+
+    Example usage:
+    >> all_ids = all_gather(data_or_fn=allocate_id, pg=ext_pg.my_pg)
+    """
+    payload: Optional[T] = None
+    exception : Optional[Exception] = None
+    success = True
+    # determine if it is an executable function or data payload only
+    if callable(data_or_fn):
+        try:
+            payload = data_or_fn()
+        except Exception as e:
+            success = False
+            exception = e
+    else:
+        payload = data_or_fn
+
+    sync_obj = SyncPayload(
+        stage_name=stage_name,
+        success=success,
+        payload=payload,
+        exception=exception,
+    )
+
+    if pg is not None:
+        # List of success/failure across all ranks.
+        total_list = [None] * dist.get_world_size(pg)
+        all_gather_object_enforce_type(pg, total_list, sync_obj)
+        # Each rank will throw RuntimeError in case of failure on any rank.
+        stage_name = cast(SyncPayload[T], total_list[0]).stage_name
+        exception_list: List[Tuple[int, Exception]] = []
+        ret_list: List[T] = []
+        error_msg: str = ""
+
+        for i, sp in enumerate(cast(List[SyncPayload[T]], total_list)):
+            if sp.stage_name != stage_name:
+                error_msg += (
+                    f"Unexpected stage name received from rank {i}: {sp.stage_name} "
+                )
+                continue
+            if not sp.success and sp.exception is not None:
+                exception_list.append((i, sp.exception))
+                continue
+            ret_list.append(sp.payload)
+
+        if len(exception_list) > 0:
+            raise RuntimeError(  # type: ignore[misc]
+                error_msg, exception_list) from exception_list[0][1]
+        return ret_list
+    else:
+        if not sync_obj.success:
+            raise RuntimeError(
+                f"all_gather failed with exception {sync_obj.exception}",
+            ) from sync_obj.exception
+        return [sync_obj.payload]  # type: ignore[list-item]
+
+
+# Note: use Any for typing for now so users can pass in
+# either a list of None or target type placeholders
+# otherwise pyre would complain
+def all_gather_object_enforce_type(
+    pg: dist.ProcessGroup,
+    # pyre-fixme[2]: Parameter must have a type that does not contain `Any`
+    object_list: List[Any],
+    # pyre-fixme[2]: Parameter must have a type other than `Any`
+    obj: Any,
+    # pyre-fixme[2]: Parameter must have a type that does not contain `Any`
+    type_checker: Callable[[Any, Any], bool] = lambda x, y: type(x) == type(y),
+) -> None:
+    """
+    Similar to plain all_gather_object but with additional type checking
+    AFTER the gather is done to ensure basic consistency.
+    If the check does not pass, all ranks will fail with an exception.
+
+    This is generally meant to prevent conditional logic leading to
+    unexpected messages being received. This is considered a fatal code error,
+    but due to layered logic it might happen implicitly in practice.
+
+    The default check does not consider subtypes (considered different)
+    or covariance (considered the same), but users can pass in a custom checker
+    if a more complicated check is needed.
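+
+    Example sketch (the gathered value and the checker are illustrative only)::
+
+        >>> # xdoctest: +SKIP
+        >>> out = [None] * dist.get_world_size(pg)
+        >>> all_gather_object_enforce_type(
+        ...     pg, out, {"rank": dist.get_rank(pg)},
+        ...     type_checker=lambda x, y: isinstance(x, dict) and isinstance(y, dict),
+        ... )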
+    """
+    dist.all_gather_object(object_list, obj, group=pg)
+
+    # conservative check
+    list_len = len(object_list)
+    if list_len == 0:
+        return
+    first_obj = object_list[0]
+    for i in range(1, list_len):
+        if not type_checker(first_obj, object_list[i]):
+            raise TypeError(
+                f"Object type at index {i} is {type(object_list[i])}, "
+                f"while first object type is {type(first_obj)}"
+            )
diff --git a/MLPY/Lib/site-packages/torch/distributed/constants.py b/MLPY/Lib/site-packages/torch/distributed/constants.py
new file mode 100644
index 0000000000000000000000000000000000000000..102d7bf100080087a867ada7421dc96aef8b972d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/constants.py
@@ -0,0 +1,23 @@
+from torch._C._distributed_c10d import _DEFAULT_PG_TIMEOUT
+from datetime import timedelta
+from typing import Optional
+
+__all__ = ['default_pg_timeout', 'default_pg_nccl_timeout']
+
+# Default process group wide timeout, if applicable.
+# This only applies to the non-nccl backends
+# To make an attempt at backwards compatibility with THD, we use an
+# extraordinarily high default timeout, given that THD did not have timeouts.
+default_pg_timeout: timedelta = _DEFAULT_PG_TIMEOUT
+# Separate timeout for PGNCCL mainly because it's always been that way in the C++ layer, but until recently
+# there was one default that applied across all backends in the python layer.
+# Later, we could consider merging them back together at the c++ layer if we can align on the same value.
+# (only if TORCH_NCCL_BLOCKING_WAIT or TORCH_NCCL_ASYNC_ERROR_HANDLING is set to 1).
+
+try:
+    from torch._C._distributed_c10d import _DEFAULT_PG_NCCL_TIMEOUT
+    default_pg_nccl_timeout: Optional[timedelta] = _DEFAULT_PG_NCCL_TIMEOUT
+except ImportError:
+    # if C++ NCCL support is not compiled, we don't have access to the default nccl value.
+    # if anyone is actually trying to use nccl in this state, it should error.
+    default_pg_nccl_timeout = None
diff --git a/MLPY/Lib/site-packages/torch/distributed/device_mesh.py b/MLPY/Lib/site-packages/torch/distributed/device_mesh.py
new file mode 100644
index 0000000000000000000000000000000000000000..192516aa033ee10533b498d1e649982f3a2beac6
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/device_mesh.py
@@ -0,0 +1,567 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+import logging
+import math
+from typing import Dict, List, Optional, Tuple, TYPE_CHECKING, Union
+
+import torch
+
+from torch.distributed import is_available
+
+from ..utils._typing_utils import not_none
+
+__all__ = ["init_device_mesh", "DeviceMesh"]
+
+
+if not is_available():
+    import sys
+
+    # We need to create the stubs when distributed is not available.
+    # Otherwise, we would fail the doc tests (```./.ci/pytorch/docs-test.sh```),
+    # since it would try to import ``torch.distributed.device_mesh`` or
+    # ``torch.distributed.init_device_mesh`` but cannot find them.
+
+    class _DeviceMeshStub:
+        pass
+
+    def _init_device_mesh_stub():
+        pass
+
+    sys.modules["torch.distributed.device_mesh"].DeviceMesh = _DeviceMeshStub  # type: ignore[attr-defined]
+    sys.modules[
+        "torch.distributed.device_mesh"
+    ].init_device_mesh = _init_device_mesh_stub  # type: ignore[attr-defined]
+
+
+else:
+    from torch.distributed.distributed_c10d import (
+        _find_pg_by_ranks_and_tag,
+        _get_default_group,
+        _get_group_tag,
+        get_rank,
+        get_world_size,
+        init_process_group,
+        is_initialized,
+        new_group,
+        ProcessGroup,
+    )
+
+    logger = logging.getLogger(__name__)
+
+    # only import numpy typing when type checking
+    if TYPE_CHECKING:
+        try:
+            from numpy.typing import ArrayLike
+        except ImportError:
+            logger.warning(
+                "DeviceMesh requires numpy >= 1.21 to be installed for type checking"
+            )
+
+    class _MeshEnv:
+        def __init__(self) -> None:
+            self.mesh_stack: List[DeviceMesh] = []
+            self.child_to_parent_mapping: Dict[DeviceMesh, DeviceMesh] = {}
+            self.parent_to_child_mapping: Dict[DeviceMesh, Dict[str, DeviceMesh]] = {}
+
+        def get_current_mesh(self) -> "DeviceMesh":
+            if len(self.mesh_stack) == 0:
+                raise RuntimeError("No device mesh is currently active!")
+            return self.mesh_stack[-1]
+
+        def create_child_mesh(
+            self, device_mesh: "DeviceMesh", mesh_dim: int, mesh_dim_name: str
+        ) -> "DeviceMesh":
+            # Directly return the child mesh if it is already created.
+            child_mesh_mappings = self.parent_to_child_mapping.get(device_mesh)
+            if child_mesh_mappings:
+                sub_mesh = child_mesh_mappings.get(mesh_dim_name)
+                if sub_mesh:
+                    return sub_mesh
+
+            # swap the current dim to the last dim then reshape to flatten out other
+            # dims, so we can just extract the list of ranks which contains cur_rank.
+            cur_rank = device_mesh.get_rank()
+            pg_ranks_by_dim = device_mesh.mesh.swapdims(-1, mesh_dim).reshape(
+                -1, device_mesh.mesh.size(mesh_dim)
+            )
+
+            for mesh_1d in pg_ranks_by_dim:
+                sub_mesh = DeviceMesh(
+                    device_mesh.device_type,
+                    mesh_1d,
+                    mesh_dim_names=(mesh_dim_name,),
+                    _init_backend=False,
+                )
+                if cur_rank in mesh_1d:
+                    res_sub_mesh = sub_mesh
+
+            res_sub_mesh._dim_group_infos = [device_mesh._dim_group_infos[mesh_dim]]  # type: ignore[possibly-undefined]
+            # Assign the current DeviceMesh as the parent of the child DeviceMesh.
+            self.child_to_parent_mapping[res_sub_mesh] = device_mesh
+            self.parent_to_child_mapping.setdefault(device_mesh, {})[
+                mesh_dim_name
+            ] = res_sub_mesh
+            return res_sub_mesh
+
+        def get_parent_mesh(self, device_mesh: "DeviceMesh") -> Optional["DeviceMesh"]:
+            return self.child_to_parent_mapping.get(device_mesh, None)
+
+        def get_parent_mesh_dim(self, device_mesh: "DeviceMesh") -> Optional[int]:
+            """
+            Return the index of the mesh dim in the parent mesh.
+            The device_mesh passed in needs to be sliced out from a parent mesh.
+            """
+            parent_mesh = self.get_parent_mesh(device_mesh)
+            child_mesh_dim_names = device_mesh.mesh_dim_names
+            if parent_mesh and child_mesh_dim_names:
+                assert (
+                    len(child_mesh_dim_names) == 1
+                ), "The child mesh can only be a 1D mesh."
+                child_mesh_dim_name = child_mesh_dim_names[0]
+                return self.get_mesh_dim_by_name(parent_mesh, child_mesh_dim_name)
+            return None
+
+        @staticmethod
+        def num_devices_per_host(device_type: str) -> int:
+            return _get_device_handle(device_type).device_count()
+
+        @staticmethod
+        def num_hosts(device_type: str) -> int:
+            # ProcessGroup can't tell us this info so we have to infer it, assume
+            # homogeneous hardware for now
+            return get_world_size() // _MeshEnv.num_devices_per_host(device_type)
+
+        def get_mesh_dim_by_name(
+            self, device_mesh: "DeviceMesh", mesh_dim_name: str
+        ) -> int:
+            if (
+                device_mesh.mesh_dim_names is None
+                or len(device_mesh.mesh_dim_names) == 0
+            ):
+                raise KeyError(
+                    "No `mesh_dim_names` found.",
+                )
+            if mesh_dim_name not in device_mesh.mesh_dim_names:
+                raise KeyError(
+                    f"Mesh dimension '{mesh_dim_name}' does not exist.",
+                    f"Available mesh dimensions are: mesh_dim_names={device_mesh.mesh_dim_names}",
+                )
+            return not_none(device_mesh.mesh_dim_names.index(mesh_dim_name))
+
+    _mesh_resources: _MeshEnv = _MeshEnv()
+
+    def _get_device_handle(device_type: str = "cuda"):
+        """
+        Get the module corresponding to the device_type which is cuda or cuda-like device.
+        For example, when the device_type is cuda, the module `torch.cuda` is returned.
+        Return None when there is no corresponding module for device_type, otherwise
+        return the corresponding module.
+        """
+        return getattr(torch, device_type, None)
+
+    class DeviceMesh:
+        """
+        DeviceMesh represents a mesh of devices, where the layout of devices can be
+        represented as an n-dimensional array, and each value of the n-dimensional
+        array is the global ID of a rank in the default process group.
+
+        DeviceMesh could be used to describe the layout of devices across the cluster,
+        and serves as a proxy for communication among the device lists within the cluster.
+
+        DeviceMesh can be used as a context manager.
+
+        .. note::
+            DeviceMesh follows the SPMD programming model, which means the same PyTorch Python program
+            runs on all processes/ranks in the cluster. Therefore, users need to make sure the
+            `mesh` array (which describes the layout of devices) is identical across all ranks.
+            An inconsistent `mesh` will lead to silent hangs.
+
+        Args:
+            device_type (str): The device type of the mesh. Currently supports: "cpu", "cuda/cuda-like".
+            mesh (ndarray): A multi-dimensional array or an integer tensor describing the layout
+                of devices, where the IDs are global IDs of the default process group.
+
+        Returns:
+            DeviceMesh: A :class:`DeviceMesh` object representing the device layout.
+
+        The following program runs on each process/rank in an SPMD manner. In this example, we have 2
+        hosts with 4 GPUs each.
+        A reduction over the first dimension of mesh will reduce across
+        columns (0, 4), ..., (3, 7); a reduction over the second dimension
+        of mesh reduces across rows (0, 1, 2, 3) and (4, 5, 6, 7).
+
+        Example::
+            >>> # xdoctest: +SKIP("no rank")
+            >>> from torch.distributed.device_mesh import DeviceMesh
+            >>>
+            >>> # Initialize device mesh as (2, 4) to represent the topology
+            >>> # of cross-host(dim 0), and within-host (dim 1).
+            >>> mesh = DeviceMesh(device_type="cuda", mesh=[[0, 1, 2, 3],[4, 5, 6, 7]])
+        """
+
+        device_type: str
+        mesh: torch.Tensor
+        mesh_dim_names: Optional[Tuple[str, ...]]
+
+        def __init__(
+            self,
+            device_type: str,
+            mesh: Union[torch.Tensor, "ArrayLike"],
+            *,
+            mesh_dim_names: Optional[Tuple[str, ...]] = None,
+            _init_backend: bool = True,
+        ) -> None:
+            self.device_type = device_type
+            if isinstance(mesh, torch.Tensor) and mesh.device.type != "cpu":
+                raise ValueError(f"`mesh` must be a CPU tensor, got {mesh}")
+            self.mesh = (
+                mesh.detach().cpu()
+                if isinstance(mesh, torch.Tensor)
+                else torch.tensor(mesh, dtype=torch.int)
+            )
+            self.mesh_dim_names = mesh_dim_names
+
+            # private field to pre-generate DeviceMesh's hash
+            self._flatten_mesh_list = tuple(self.mesh.flatten().tolist())
+            self._hash = hash((self._flatten_mesh_list, self.mesh.shape, id(self)))
+
+            # Skip process group initialization if xla device or init backend is False
+            # TODO(yeounoh) implement DeviceMesh backend and register XLA backend.
+            if device_type != "xla":
+                # always try to create default (world) pg, even if it is not initialized
+                # already. The world pg is used for device mesh identity (rank) on each
+                # process (we need to know if the current global rank is in the mesh or not).
+                if _init_backend:
+                    self._get_or_create_default_group()
+                    self._init_process_groups()
+
+                # calculate the coordinates of the current global rank on the mesh
+                rank_coords = (self.mesh == get_rank()).nonzero()
+                assert rank_coords.size(0) in (0, 1)
+                self._coordinate_on_dim: Optional[List[int]] = (
+                    rank_coords[0].tolist() if rank_coords.size(0) > 0 else None
+                )
+
+        def _get_or_create_default_group(self):
+            default_initialized = is_initialized()
+            if not default_initialized:
+                init_process_group()
+
+            world_size = get_world_size()
+            if self.mesh.numel() > world_size:
+                raise RuntimeError(
+                    f"Mesh should not be bigger than default world size, but found {self.mesh.numel()} ranks!"
+                )
+
+            device_handle = _get_device_handle(self.device_type)
+            # TODO: if user want to pass pg_options, offer a way to do it
+            if not default_initialized and device_handle:
+                # automatically set the current cuda/cuda-like device based on the number of GPU devices available on each host
+                # NOTE: This device selection would only work for homogeneous hardware.
+                num_devices_per_host = device_handle.device_count()
+                if (
+                    world_size > num_devices_per_host
+                    and world_size % num_devices_per_host != 0
+                ):
+                    raise RuntimeError(
+                        f"DeviceMesh only supports homogeneous hardware, but found "
+                        f"{world_size} ranks and {num_devices_per_host} {self.device_type} devices!"
+                    )
+                device_handle.set_device(get_rank() % num_devices_per_host)
+
+            return _get_default_group()
+
+        def _init_process_groups(self):
+            # tag/ranks/group_name associated with each mesh dimension, each
+            # mesh dimension should have one sub-group per rank
+            #
+            # TODO(yifu): remove tag and ranks once we fully migrate to native
+            # functional collectives. See details in:
+            # https://github.com/pytorch/pytorch/issues/93173#issuecomment-1907095208
+            dim_group_infos: List[Tuple[str, List[int], str]] = []
+
+            if self.mesh.ndim == 1 and self.mesh.numel() == get_world_size():
+                # if the mesh is the same as world_pg, we just append the default
+                # pg to the first dim groups, as new_group cannot have the exact
+                # same ranks as world
+                dim_group_infos.append(
+                    (
+                        _get_group_tag(_get_default_group()),
+                        list(range(get_world_size())),
+                        _get_default_group().group_name,
+                    )
+                )
+            else:
+                # create sub pgs based on the mesh argument specified
+                for dim in range(self.mesh.ndim):
+                    # swap the current dim to the last dim
+                    # then reshape to flatten out other dims
+                    pg_ranks_by_dim = self.mesh.swapdims(-1, dim).reshape(
+                        -1, self.mesh.size(dim)
+                    )
+                    # multi-dim mesh, create subgroups by looping over the pg_ranks
+                    # for each dim and append the groups
+                    for dim_mesh in pg_ranks_by_dim:
+                        subgroup_ranks = dim_mesh.tolist()
+
+                        # We temporarily revert the re-use subgroup, since it breaks two internal tests.
+                        # Temporarily reverting to resolve test timeout while root-causing.
+                        # TODO: Add two tests to cover internal tests scenarios and re-enable reuse subgroup if exists.
+                        dim_group = new_group(ranks=subgroup_ranks)
+
+                        # only add to dim_groups if the current rank in the subgroup
+                        if self.get_rank() in subgroup_ranks:
+                            if len(dim_group_infos) > dim:
+                                raise RuntimeError(
+                                    f"Each device mesh dimension should get only one process group, but got {self.get_rank()} "
+                                    f"in {subgroup_ranks}!"
+                                )
+                            dim_group_infos.append(
+                                (
+                                    _get_group_tag(not_none(dim_group)),
+                                    subgroup_ranks,
+                                    dim_group.group_name,
+                                )
+                            )
+            self._dim_group_infos = dim_group_infos
+
+        def __enter__(self) -> "DeviceMesh":
+            # set this mesh as the current mesh in mesh env
+            _mesh_resources.mesh_stack.append(self)
+            return self
+
+        # pyre-fixme[2]: Parameter must be annotated.
+        def __exit__(self, exc_type, exc_value, exc_traceback) -> None:
+            # pop this mesh from mesh env
+            _mesh_resources.mesh_stack.pop()
+
+        def __repr__(self) -> str:
+            device_mesh_repr = (
+                f"DeviceMesh({self.mesh.tolist()})"
+                if not self.mesh_dim_names
+                else f"DeviceMesh({self.mesh.tolist()}, mesh_dim_names={self.mesh_dim_names})"
+            )
+            return device_mesh_repr
+
+        def __hash__(self):
+            return self._hash
+
+        def __eq__(self, other: object) -> bool:
+            if not isinstance(other, DeviceMesh):
+                return False
+            if id(self.mesh) == id(other.mesh):
+                return True
+            return (
+                self.mesh.shape == other.mesh.shape
+                and self._flatten_mesh_list == other._flatten_mesh_list
+            )
+
+        def __getitem__(self, mesh_dim_name: str) -> "DeviceMesh":
+            """
+            Slice the current DeviceMesh based on the mesh_dim_name given to create a child
+            DeviceMesh.
+
+            Args:
+                mesh_dim_name (str): the name of the mesh dimension of the parent DeviceMesh
+                to create a child DeviceMesh for.
+            Returns:
+                A :class:`DeviceMesh` object
+
+            The following program runs on each process/rank in an SPMD manner. In this example, we have 2
+            hosts with 4 GPUs each.
+            Calling mesh["tp"] on rank 0, 1, 2, 3 would return a 1D child DeviceMesh:([0, 1, 2, 3]).
+            Calling mesh["tp"] on rank 4, 5, 6, 7 would return a 1D child DeviceMesh:([4, 5, 6, 7]).
+            Calling mesh["dp"] on rank 0, 4 would return a 1D child DeviceMesh:([0, 4]).
+            Calling mesh["dp"] on rank 1, 5 would return a 1D child DeviceMesh:([1, 5]).
+            Calling mesh["dp"] on rank 2, 6 would return a 1D child DeviceMesh:([2, 6]).
+            Calling mesh["dp"] on rank 3, 7 would return a 1D child DeviceMesh:([3, 7]).
+
+            Example::
+                >>> # xdoctest: +SKIP("no rank")
+                >>> from torch.distributed.device_mesh import DeviceMesh
+                >>>
+                >>> # Initialize device mesh as (2, 4) to represent the topology
+                >>> # of cross-host (dim 0), and within-host (dim 1).
+                >>> mesh = DeviceMesh(device_type="cuda", mesh=[[0, 1, 2, 3], [4, 5, 6, 7]], mesh_dim_names=("dp", "tp"))
+                >>> tp_mesh = mesh["tp"]  # 1D child mesh over this host's ranks
+            """
+            if self.mesh.ndim == 1:
+                if self.mesh_dim_names and mesh_dim_name == self.mesh_dim_names[0]:
+                    return self
+                else:
+                    raise RuntimeError(
+                        f"Invalid mesh_dim_name {mesh_dim_name} specified."
+                    )
+
+            mesh_dim = _mesh_resources.get_mesh_dim_by_name(self, mesh_dim_name)
+            submesh = _mesh_resources.create_child_mesh(self, mesh_dim, mesh_dim_name)
+            return submesh
+
+        def get_group(
+            self, mesh_dim: Optional[Union[int, str]] = None
+        ) -> Union[ProcessGroup, List[ProcessGroup]]:
+            """
+            Returns a list of ProcessGroups corresponding to the mesh dimensions, or
+            returns a single ProcessGroup if mesh_dim is specified or the given mesh has
+            only one mesh dimension.
+
+            Args:
+                mesh_dim (str/int, optional): it can be the name of the mesh dimension or the index
+                of the mesh dimension. Default is None.
+
+            Returns:
+                A list of :class:`ProcessGroup` objects when `mesh_dim` is not specified for
+                a DeviceMesh with more than 1 dimension; otherwise, returns a single
+                :class:`ProcessGroup` object.
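+
+            Example::
+                >>> # xdoctest: +SKIP("no rank")
+                >>> # A hedged sketch assuming a 2D mesh named ("dp", "tp") over 8 GPUs.
+                >>> from torch.distributed.device_mesh import init_device_mesh
+                >>> mesh_2d = init_device_mesh("cuda", (2, 4), mesh_dim_names=("dp", "tp"))
+                >>> tp_group = mesh_2d.get_group("tp")  # a single ProcessGroup for the "tp" dim
+                >>> all_groups = mesh_2d.get_group()  # a list with one ProcessGroup per dim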
+            """
+            if not hasattr(self, "_dim_group_infos"):
+                raise RuntimeError("DeviceMesh process groups not initialized!")
+
+            if self.mesh.ndim == 1:
+                return not_none(
+                    _find_pg_by_ranks_and_tag(*self._dim_group_infos[0][:2])
+                )
+
+            if mesh_dim is not None:
+                if isinstance(mesh_dim, str):
+                    mesh_dim = _mesh_resources.get_mesh_dim_by_name(self, mesh_dim)
+                return not_none(
+                    _find_pg_by_ranks_and_tag(*self._dim_group_infos[mesh_dim][:2])
+                )
+            else:
+                dim_groups = []
+                for ith_dim in range(self.mesh.ndim):
+                    dim_groups.append(
+                        not_none(
+                            _find_pg_by_ranks_and_tag(
+                                *self._dim_group_infos[ith_dim][:2]
+                            )
+                        )
+                    )
+                return dim_groups
+
+        def size(self, mesh_dim: Optional[int] = None) -> int:
+            return self.mesh.numel() if mesh_dim is None else self.mesh.size(mesh_dim)
+
+        @property
+        def ndim(self) -> int:
+            return self.mesh.ndim
+
+        @property
+        def shape(self) -> Tuple[int, ...]:
+            return tuple(self.mesh.shape)
+
+        def get_rank(self) -> int:
+            """
+            Returns the current global rank.
+            """
+            return get_rank()
+
+        def get_local_rank(self, mesh_dim: Optional[Union[int, str]] = None) -> int:
+            """
+            Returns the local rank of the given mesh_dim of the DeviceMesh.
+
+            Args:
+                mesh_dim (str/int, optional): it can be the name of the mesh dimension or the index
+                of the mesh dimension. Default is None.
+
+            Returns:
+                An integer denoting the local rank.
+
+            The following program runs on each process/rank in an SPMD manner. In this example, we have 2
+            hosts with 4 GPUs each.
+            Calling mesh_2d.get_local_rank(mesh_dim=0) on rank 0, 1, 2, 3 would return 0.
+            Calling mesh_2d.get_local_rank(mesh_dim=0) on rank 4, 5, 6, 7 would return 1.
+            Calling mesh_2d.get_local_rank(mesh_dim=1) on rank 0, 4 would return 0.
+            Calling mesh_2d.get_local_rank(mesh_dim=1) on rank 1, 5 would return 1.
+            Calling mesh_2d.get_local_rank(mesh_dim=1) on rank 2, 6 would return 2.
+            Calling mesh_2d.get_local_rank(mesh_dim=1) on rank 3, 7 would return 3.
+
+            Example::
+                >>> # xdoctest: +SKIP("no rank")
+                >>> from torch.distributed.device_mesh import DeviceMesh
+                >>>
+                >>> # Initialize device mesh as (2, 4) to represent the topology
+                >>> # of cross-host (dim 0), and within-host (dim 1).
+                >>> mesh_2d = DeviceMesh(device_type="cuda", mesh=[[0, 1, 2, 3], [4, 5, 6, 7]], mesh_dim_names=("dp", "tp"))
+                >>> local_rank = mesh_2d.get_local_rank(mesh_dim=1)
+            """
+            if self.ndim > 1 and mesh_dim is None:
+                raise RuntimeError(
+                    f"Found the DeviceMesh has {self.mesh.ndim} dimensions",
+                    "Optional kwarg `mesh_dim` needs to be specified when device_mesh.ndim > 1.",
+                )
+            elif mesh_dim is None:
+                mesh_dim = 0
+
+            mesh_dim_group = not_none(self.get_group(mesh_dim))
+            assert isinstance(
+                mesh_dim_group, ProcessGroup
+            ), "We expect ProcessGroup before calling `get_rank`!"
+            return not_none(get_rank(mesh_dim_group))
+
+        def get_coordinate(self) -> Optional[List[int]]:
+            """
+            Return this rank's relative index along each dimension of the mesh.
+            If this rank is not part of the mesh, return None.
+            """
+            return self._coordinate_on_dim if self._coordinate_on_dim else None
+
+    def init_device_mesh(
+        device_type: str,
+        mesh_shape: Tuple[int, ...],
+        *,
+        mesh_dim_names: Optional[Tuple[str, ...]] = None,
+    ) -> DeviceMesh:
+        """
+        Initializes a `DeviceMesh` based on `device_type`, `mesh_shape`, and `mesh_dim_names` parameters.
+
+        This creates a DeviceMesh with an n-dimensional array layout, where `n` is the length of `mesh_shape`.
+        If `mesh_dim_names` is provided, each dimension is labeled as `mesh_dim_names[i]`.
+
+        .. note::
+            `init_device_mesh` follows SPMD programming model, meaning the same PyTorch Python program
+            runs on all processes/ranks in the cluster. Ensure `mesh_shape` (the dimensions of the nD array
+            describing device layout) is identical across all ranks. Inconsistent `mesh_shape` may lead to hanging.
+
+        .. note::
+            If no process group is found, init_device_mesh will initialize the distributed
+            process group(s) required for distributed communications behind the scenes.
+
+        Args:
+            device_type (str): The device type of the mesh. Currently supports: "cpu", "cuda/cuda-like".
+            mesh_shape (Tuple[int]): A tuple defining the dimensions of the multi-dimensional array
+                describing the layout of devices.
+            mesh_dim_names (Tuple[str], optional): A tuple of mesh dimension names to assign to each dimension
+                of the multi-dimensional array describing the layout of devices. Its length must match the length
+                of `mesh_shape`. Each string in `mesh_dim_names` must be unique.
+
+        Returns:
+            DeviceMesh: A :class:`DeviceMesh` object representing the device layout.
+
+        Example::
+            >>> # xdoctest: +SKIP("no rank")
+            >>> from torch.distributed.device_mesh import init_device_mesh
+            >>>
+            >>> mesh_1d = init_device_mesh("cuda", mesh_shape=(8,))
+            >>> mesh_2d = init_device_mesh("cuda", mesh_shape=(2, 8), mesh_dim_names=("dp", "tp"))
+
+        """
+        if mesh_dim_names is not None:
+            if len(set(mesh_dim_names)) != len(mesh_dim_names):
+                raise RuntimeError(
+                    "Each mesh_dim_name must be unique.",
+                    f"Found repeated mesh_dim_name in mesh_dim_names {mesh_dim_names}",
+                )
+
+            if len(mesh_shape) != len(mesh_dim_names):
+                raise RuntimeError(
+                    "mesh_shape and mesh_dim_names should have the same length!",
+                    f"Found len(mesh_dim_names): {len(mesh_dim_names)} and len(mesh_shape):{len(mesh_shape)}.",
+                )
+
+        mesh = torch.arange(math.prod(mesh_shape)).view(mesh_shape)
+        device_mesh = DeviceMesh(
+            device_type=device_type,
+            mesh=mesh,
+            mesh_dim_names=mesh_dim_names,
+        )
+
+        return device_mesh
diff --git a/MLPY/Lib/site-packages/torch/distributed/distributed_c10d.py b/MLPY/Lib/site-packages/torch/distributed/distributed_c10d.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a2485e8c4ad09b6cc2845a8c4d77f6c6d85dc6c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/distributed_c10d.py
@@ -0,0 +1,4264 @@
+"""Distributed Collective Communication (c10d)."""
+
+import itertools
+import collections.abc
+import contextlib
+import hashlib
+import io
+import logging
+import os
+import pickle
+import sys
+import time
+import warnings
+from collections import namedtuple
+from datetime import timedelta
+from typing import Any, Callable, Dict, Optional, Tuple, Union, List
+
+import torch
+from torch._C._distributed_c10d import (
+    AllgatherOptions,
+    AllreduceCoalescedOptions,
+    AllreduceOptions,
+    AllToAllOptions,
+    _DistributedBackendOptions,
+    BarrierOptions,
+    BroadcastOptions,
+    GatherOptions,
+    PrefixStore,
+    ProcessGroup,
+    ReduceOp,
+    ReduceOptions,
+    ReduceScatterOptions,
+    ScatterOptions,
+    Store,
+    DebugLevel,
+    get_debug_level,
+    Work,
+    _register_process_group,
+    _resolve_process_group,
+    _unregister_all_process_groups,
+    _unregister_process_group,
+)
+from torch._utils_internal import set_pytorch_distributed_envs_from_justknobs
+from .constants import default_pg_timeout, default_pg_nccl_timeout
+from .c10d_logger import _exception_logger, _time_logger
+from .rendezvous import register_rendezvous_handler, rendezvous  # noqa: F401
+from ..utils._typing_utils import not_none
+DistStoreError = torch._C._DistStoreError
+
+__all__ = [
+    'Backend', 'BackendConfig', 'GroupMember', 'P2POp', 'all_gather', 'all_gather_coalesced',
+    'all_gather_object', 'all_reduce',
+    'all_reduce_coalesced', 'all_to_all',
+    'all_to_all_single', 'barrier', 'batch_isend_irecv', 'broadcast',
+    'broadcast_object_list', 'destroy_process_group',
+    'gather', 'gather_object', 'get_backend_config', 'get_backend', 'get_rank',
+    'get_world_size', 'get_pg_count', 'group', 'init_process_group', 'irecv',
+    'is_gloo_available', 'is_initialized', 'is_mpi_available', 'is_backend_available',
+    'is_nccl_available', 'is_torchelastic_launched', 'is_ucc_available',
+    'isend', 'monitored_barrier', 'new_group', 'new_subgroups',
+    'new_subgroups_by_enumeration', 'recv', 'reduce',
+    'reduce_scatter', 'scatter',
+    'scatter_object_list', 'send', 'supports_complex',
+    'AllreduceCoalescedOptions', 'AllreduceOptions', 'AllToAllOptions',
+    'BarrierOptions', 'BroadcastOptions', 'GatherOptions', 'PrefixStore',
+    'ProcessGroup', 'ReduceOp', 'ReduceOptions', 'ReduceScatterOptions',
+    'ScatterOptions', 'Store', 'DebugLevel', 'get_debug_level', 'Work',
+    'default_pg_timeout', 'get_group_rank', 'get_global_rank', 'get_process_group_ranks',
+    'reduce_op', 'all_gather_into_tensor', 'reduce_scatter_tensor',
+]
+
+_MPI_AVAILABLE = True
+_NCCL_AVAILABLE = True
+_GLOO_AVAILABLE = True
+_UCC_AVAILABLE = True
+
+_pickler = pickle.Pickler
+_unpickler = pickle.Unpickler
+
+# Change __module__ of all imported types from torch._C._distributed_c10d that are public
+def _export_c_types() -> None:
+    _public_types_to_change_module = [
+        AllreduceCoalescedOptions,
+        AllreduceOptions,
+        AllToAllOptions,
+        BarrierOptions,
+        BroadcastOptions,
+        GatherOptions,
+        PrefixStore,
+        ProcessGroup,
+        ReduceOp,
+        ReduceOptions,
+        ReduceScatterOptions,
+        ScatterOptions,
+        Store,
+        DebugLevel,
+        get_debug_level,
+        Work
+    ]
+    for type in _public_types_to_change_module:
+        type.__module__ = "torch.distributed.distributed_c10d"
+_export_c_types()
+
+try:
+    from torch._C._distributed_c10d import ProcessGroupMPI
+    ProcessGroupMPI.__module__ = "torch.distributed.distributed_c10d"
+    __all__ += ["ProcessGroupMPI"]
+except ImportError:
+    _MPI_AVAILABLE = False
+
+try:
+    from torch._C._distributed_c10d import ProcessGroupNCCL
+    ProcessGroupNCCL.__module__ = "torch.distributed.distributed_c10d"
+    __all__ += ["ProcessGroupNCCL"]
+except ImportError:
+    _NCCL_AVAILABLE = False
+
+try:
+    from torch._C._distributed_c10d import ProcessGroupGloo
+    from torch._C._distributed_c10d import _ProcessGroupWrapper
+    ProcessGroupGloo.__module__ = "torch.distributed.distributed_c10d"
+    __all__ += ["ProcessGroupGloo"]
+except ImportError:
+    _GLOO_AVAILABLE = False
+
+try:
+    from torch._C._distributed_c10d import ProcessGroupUCC
+    ProcessGroupUCC.__module__ = "torch.distributed.distributed_c10d"
+    __all__ += ["ProcessGroupUCC"]
+except ImportError:
+    _UCC_AVAILABLE = False
+
+logger = logging.getLogger(__name__)
+
+PG_WRAPPER_STORE_PREFIX = "pg_wrapper"
+
+
+# Some reduce ops are not supported by complex numbers and will result in an error.
+# We currently provide complex support to the distributed API by viewing
+# complex tensors as real (torch.view_as_real), meaning that calling
+# these unsupported ops will return garbage values rather than error out.
+# (e.g. max(2+3i, 3+2i) = 3+3i)
+# We'd like calls to unsupported ops to error out accordingly,
+# rather than returning garbage values.
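+# A hedged illustration (comment only): before reducing a complex tensor, callers can
+# check the op first, e.g. supports_complex(ReduceOp.SUM) is True while
+# supports_complex(ReduceOp.MAX) is False, and raise early instead of silently
+# reducing over the real view.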
+def supports_complex(reduceOp: ReduceOp) -> bool:
+    """Return True if the reduce op is supported for complex tensors, False otherwise."""
+    denyList = [
+        ReduceOp.MAX,
+        ReduceOp.MIN,
+        ReduceOp.PRODUCT,
+        ReduceOp.BAND,
+        ReduceOp.BOR,
+        ReduceOp.BXOR,
+    ]
+    return reduceOp not in denyList
+
+
+class Backend(str):
+    """
+    An enum-like class for backends.
+
+    Available backends: GLOO, NCCL, UCC, MPI, and other registered backends.
+
+    The values of this class are lowercase strings, e.g., ``"gloo"``. They can
+    be accessed as attributes, e.g., ``Backend.NCCL``.
+
+    This class can be directly called to parse the string, e.g.,
+    ``Backend(backend_str)`` will check if ``backend_str`` is valid, and
+    return the parsed lowercase string if so. It also accepts uppercase strings,
+    e.g., ``Backend("GLOO")`` returns ``"gloo"``.
+
+    .. note:: The entry ``Backend.UNDEFINED`` is present but only used as
+              initial value of some fields. Users should neither use it directly
+              nor assume its existence.
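+
+    Example::
+        >>> # A hedged sketch of the parsing behavior described above.
+        >>> Backend("GLOO")
+        'gloo'
+        >>> Backend.NCCL
+        'nccl'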
+    """
+
+    UNDEFINED = "undefined"
+    GLOO = "gloo"
+    NCCL = "nccl"
+    UCC = "ucc"
+    MPI = "mpi"
+
+    _BackendPlugin = namedtuple("_BackendPlugin", ["creator_fn", "extended_api"])
+
+    _plugins: Dict[str, _BackendPlugin] = {}
+
+    backend_list = [UNDEFINED, GLOO, NCCL, UCC, MPI]
+
+    default_device_backend_map: Dict[str, str] = {
+        'cpu' : GLOO,
+        'cuda' : NCCL,
+    }
+
+    backend_capability: Dict[str, List[str]] = {
+        GLOO : ["cpu", "cuda"],
+        NCCL : ["cuda"],
+        UCC : ["cpu", "cuda"],
+        MPI : ["cpu", "cuda"],
+    }
+
+    backend_type_map: Dict[str, ProcessGroup.BackendType] = {
+        UNDEFINED: ProcessGroup.BackendType.UNDEFINED,
+        GLOO : ProcessGroup.BackendType.GLOO,
+        NCCL: ProcessGroup.BackendType.NCCL,
+        UCC: ProcessGroup.BackendType.UCC,
+    }
+
+    def __new__(cls, name: str):
+        """Create and return a new instance of the class."""
+        if not isinstance(name, str):
+            raise ValueError("Backend constructor parameter must be string-ish")
+        value = getattr(Backend, name.upper(), Backend.UNDEFINED)
+
+        if value == Backend.UNDEFINED:
+            value = name.lower()
+        return value
+
+    @classmethod
+    def register_backend(cls, name, func, extended_api=False, devices: Optional[Union[str, List[str]]] = None) -> None:
+        """
+        Register a new backend with the given name and instantiating function.
+
+        This class method is used by 3rd party ``ProcessGroup`` extensions to
+        register new backends.
+
+        Args:
+            name (str): Backend name of the ``ProcessGroup`` extension. It
+                        should match the one in ``init_process_group()``.
+            func (function): Function handler that instantiates the backend.
+                             The function should be implemented in the backend
+                             extension and takes four arguments, including
+                             ``store``, ``rank``, ``world_size``, and ``timeout``.
+            extended_api (bool, optional): Whether the backend supports extended argument structure.
+                                           Default: ``False``. If set to ``True``, the backend
+                                           will get an instance of ``c10d::DistributedBackendOptions``, and
+                                           a process group options object as defined by the backend implementation.
+            devices (str or list of str, optional): device types this backend
+                            supports, e.g. "cpu", "cuda", etc. If ``None``,
+                            both "cpu" and "cuda" are assumed.
+
+        .. note:: Support for third-party backends is experimental and subject to change.
+
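+        Example::
+            >>> # xdoctest: +SKIP
+            >>> # Hedged sketch: ``_make_dummy_pg`` is a hypothetical creator function
+            >>> # supplied by a third-party extension; it is not defined in this module.
+            >>> Backend.register_backend("dummy", _make_dummy_pg, devices=["cpu"])
+            >>> Backend.DUMMY
+            'dummy'
+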
+        """
+        # Allow UCC plugin if Pytorch is not built with native support.
+        # TODO: remove this exception once UCC plugin is fully deprecated.
+        if (name != Backend.UCC or (name == Backend.UCC and is_ucc_available())):
+            assert not hasattr(Backend, name.upper()), (
+                f"{name.upper()} c10d backend already exists"
+            )
+        assert name.upper() not in Backend._plugins, (
+            f"{name.upper()} c10d backend creator function already exists"
+        )
+
+        setattr(Backend, name.upper(), name.lower())
+        Backend.backend_list.append(name.lower())
+        if devices is not None:
+            for device in devices:
+                if device != 'cpu' and device != 'cuda':
+                    Backend.default_device_backend_map[device] = name.lower()
+        Backend.backend_type_map[name.lower()] = ProcessGroup.BackendType.CUSTOM
+
+        # Update device capability matrix in Backend class
+        if devices is None:
+            # This is mostly for backward compatibility with backends like `threaded`:
+            # assume the default devices "cpu" and "cuda", but warn.
+            warnings.warn(
+                f"Device capability of {name} unspecified, assuming `cpu` and "
+                "`cuda`. Please specify it via the `devices` argument of "
+                "`register_backend`."
+            )
+            Backend.backend_capability[name.lower()] = ["cpu", "cuda"]
+        elif isinstance(devices, str):
+            # Single device string specified. Simply convert to list.
+            Backend.backend_capability[name.lower()] = [devices]
+        else:
+            Backend.backend_capability[name.lower()] = devices
+
+        Backend._plugins[name.upper()] = Backend._BackendPlugin(func, extended_api)
+
+class BackendConfig:
+    """Backend configuration class."""
+
+    def __init__(self, backend: Backend):
+        """Init."""
+        self.device_backend_map: Dict[str, Backend] = {}
+        backend = str(backend)
+
+        if backend == Backend.UNDEFINED:
+            # default config when backend is not specified
+            # supported since PyTorch 2.0
+            for device, default_backend in Backend.default_device_backend_map.items():
+                if is_backend_available(default_backend):
+                    if default_backend == Backend.NCCL and not torch.cuda.is_available():
+                        continue
+                    self.device_backend_map[device] = Backend(default_backend)
+        elif backend.lower() in Backend.backend_list:
+            # Cases for when backend is a single string (without device types)
+            # e.g. "nccl", "gloo", "ucc", "mpi"
+            supported_devices = Backend.backend_capability[backend.lower()]
+            backend_val = Backend(backend)
+            self.device_backend_map = dict.fromkeys(supported_devices, backend_val)
+        elif ":" in backend.lower():
+            # Backend specified in "device:backend" format
+            # make sure the backend string is in the correct format
+            # "{device_type1}:{backend1},{device_type2}:{backend2}"
+            # e.g. "cpu:gloo,cuda:nccl"
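+            # Hedged illustration: assuming both backends are available,
+            # BackendConfig("cpu:gloo,cuda:nccl").get_device_backend_map() would
+            # yield {"cpu": "gloo", "cuda": "nccl"}.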
+            backend_str_error_message = f"""The custom backend string argument is invalid: {backend}.
+                Custom backend string is an experimental feature where the backend string must be in the format:
+                "<device_type1>:<backend1>,<device_type2>:<backend2>...". e.g. 'cpu:gloo,cuda:nccl'"""
+
+            # parse the backend string and populate the device_backend_map
+            for device_backend_pair_str in backend.lower().split(","):
+                device_backend_pair = device_backend_pair_str.split(":")
+                if len(device_backend_pair) != 2:
+                    raise ValueError(f"Invalid device:backend pairing: \
+                                     {device_backend_pair_str}. {backend_str_error_message}")
+                device, backend = device_backend_pair
+                if device in self.device_backend_map:
+                    raise ValueError(f"Duplicate device type {device} \
+                                     in backend string: {backend}. {backend_str_error_message}")
+                self.device_backend_map[device] = Backend(backend)
+        else:
+            # User specified a single backend name whose device capability is
+            # unknown, assuming it can support the default devices of PyTorch
+            # (cpu and cuda)
+            warnings.warn(
+                f"Device capability of {backend} unknown, assuming `cpu` and "
+                "`cuda`. You can specify it in `device:backend` format in "
+                "`init_process_group` call."
+            )
+            backend_val = Backend(backend)
+            self.device_backend_map = {
+                "cpu" : backend_val,
+                "cuda" : backend_val,
+                "xpu" : backend_val,
+            }
+
+        logger.info(
+            f"Using backend config: {self.device_backend_map}"  # noqa: G004
+        )
+
+    def __repr__(self):
+        """Return all the device:backend pairs separated by commas."""
+        return ",".join(f"{device}:{backend}" for device, backend in self.device_backend_map.items())
+
+    def get_device_backend_map(self) -> Dict[str, Backend]:
+        """Return backend map of the device."""
+        return self.device_backend_map
+
+class _reduce_op:
+    r"""
+    Deprecated enum-like class.
+
+    For reduction operations: ``SUM``, ``PRODUCT``, ``MIN``, and ``MAX``.
+
+    :class:`~torch.distributed.ReduceOp` is recommended instead.
+    """
+
+    def __init__(self):
+        # __members__ is a dict storing key-value pairs for enum classes
+        for k, v in ReduceOp.RedOpType.__members__.items():
+            setattr(self, k, v)
+        self.__members__ = ReduceOp.RedOpType.__members__
+
+    def __getattribute__(self, key):
+        warnings.warn(
+            "torch.distributed.reduce_op is deprecated, please use "
+            "torch.distributed.ReduceOp instead"
+        )
+        return object.__getattribute__(self, key)
+
+
+reduce_op = _reduce_op()
+
+
+class P2POp:
+    """
+    A class to build point-to-point operations for ``batch_isend_irecv``.
+
+    This class builds the type of P2P operation, communication buffer, peer rank,
+    Process Group, and tag. Instances of this class will be passed to
+    ``batch_isend_irecv`` for point-to-point communications.
+
+    Args:
+        op (Callable): A function to send data to or receive data from a peer process.
+            The type of ``op`` is either ``torch.distributed.isend`` or
+            ``torch.distributed.irecv``.
+        tensor (Tensor): Tensor to send or receive.
+        peer (int): Destination or source rank.
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+        tag (int, optional): Tag to match send with recv.
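+
+    Example::
+        >>> # xdoctest: +SKIP("no rank")
+        >>> # Hedged sketch assuming ``torch.distributed`` is imported as ``dist``, the
+        >>> # default group is initialized, and ``rank``/``world_size`` describe this job.
+        >>> send_tensor = torch.ones(2)
+        >>> recv_tensor = torch.zeros(2)
+        >>> send_op = dist.P2POp(dist.isend, send_tensor, (rank + 1) % world_size)
+        >>> recv_op = dist.P2POp(dist.irecv, recv_tensor, (rank - 1 + world_size) % world_size)
+        >>> reqs = dist.batch_isend_irecv([send_op, recv_op])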
+    """
+
+    def __init__(self, op: Callable, tensor: torch.Tensor, peer: int,
+                 group: Optional[ProcessGroup] = None, tag: int = 0):
+        """Init."""
+        self.op = op
+        self.tensor = tensor
+        self.peer = peer
+        self.group = group
+        self.tag = tag
+
+    def __new__(cls, op: Callable, tensor: torch.Tensor, peer: int,
+                group: Optional[ProcessGroup] = None, tag: int = 0):
+        """Create and return a new instance of the class."""
+        _check_op(op)
+        _check_single_tensor(tensor, "tensor")
+        return object.__new__(cls)
+
+
+class _CollOp:
+    """
+    A class to capture collective operations.
+
+    Args:
+        op (Callable): A collective function, e.g. ``torch.distributed.all_reduce``.
+        tensor (Tensor): Tensor to operate on.
+        dst_tensor (Tensor, optional): Provided when source and destination tensors are not the same.
+        redop (ReduceOp, optional): reduce operation.
+        root (int, optional): root of broadcast or reduce.
+    """
+
+    def __init__(self, op: Callable, tensor: torch.Tensor, dst_tensor: Optional[torch.Tensor] = None,
+                 redop: Optional[ReduceOp] = None, root: Optional[int] = None):
+        self.op = op
+        self.tensor = tensor
+        self.dst_tensor = dst_tensor
+        self.redop = redop
+        self.root = root
+
+
+# DO NOT USE THESE FIELDS DIRECTLY.
+# Use them through the _world object to make sure the _world override mechanism is respected.
+_pg_map: Dict[ProcessGroup, Tuple[str, Store]] = {}
+_pg_names: Dict[ProcessGroup, str] = {}
+_pg_group_ranks: Dict[ProcessGroup, Dict[int, int]] = {}
+# For a pg, it is a map from ProcessGroup to BackendConfig
+_pg_backend_config: Dict[ProcessGroup, str] = {}
+_group_count = 0
+_tags_to_pg: Dict[str, List[ProcessGroup]] = {}
+_pg_to_tag: Dict[ProcessGroup, str] = {}
+_backend: Optional[str] = None
+
+class _World:
+    """
+    Container class for c10d process group state.
+
+    This is used during registration and lookup of PG state.
+
+    .. warning:: This is an experimental API intended to expose the inner workings
+       of c10d and is subject to change.
+    """
+
+    def __init__(self):
+        self._default_pg = None
+        self._pg_coalesce_state: Dict[ProcessGroup, List[_CollOp]] = {}
+        self._pg_default_device: Dict[ProcessGroup, torch.device] = {}
+
+    @property
+    def default_pg(self) -> Optional[ProcessGroup]:
+        """
+        Process group that includes all ranks of the cluster.
+
+        This default ProcessGroup is used by c10d APIs when a ProcessGroup is needed
+        but None is provided.
+        """
+        return self._default_pg
+
+    @default_pg.setter
+    def default_pg(self, value) -> None:
+        self._default_pg = value
+
+    @property
+    def pg_map(self) -> Dict[ProcessGroup, Tuple[str, Store]]:
+        """
+        Provide the mapping from ProcessGroup to backend name and store.
+
+        For NCCL and GLOO pg, it is a map from ProcessGroup to (Backend, Store)
+        For MPI pg, it is a map from ProcessGroup to (Backend, None)
+
+        TODO don't expose the map, expose fine grained ops
+        """
+        global _pg_map
+        return _pg_map
+
+    @property
+    def pg_names(self) -> Dict[ProcessGroup, str]:
+        """
+        Process group's names, map from ProcessGroup to str.
+
+        TODO don't expose the map, expose fine grained ops
+        """
+        global _pg_names
+        return _pg_names
+
+    @property
+    def pg_group_ranks(self) -> Dict[ProcessGroup, Dict[int, int]]:
+        """
+        Process group's global rank to local rank mapping.
+
+        TODO don't expose the map, expose fine grained ops
+        """
+        global _pg_group_ranks
+        return _pg_group_ranks
+
+    @property
+    def pg_backend_config(self) -> Dict[ProcessGroup, str]:
+        """
+        Process group's backend config.
+
+        TODO don't expose the map, expose fine grained ops
+        """
+        global _pg_backend_config
+        return _pg_backend_config
+
+    @property
+    def group_count(self) -> int:
+        """
+        Process group count for default naming.
+
+        TODO don't expose group_count, use something else instead
+        """
+        global _group_count
+        return _group_count
+
+    @group_count.setter
+    def group_count(self, value: int) -> None:
+        """Use to compute the name of ProcessGroups when using global synchronization."""
+        global _group_count
+        _group_count = value
+
+    @property
+    def tags_to_pg(self) -> Dict[str, List[ProcessGroup]]:
+        global _tags_to_pg
+        return _tags_to_pg
+
+    @property
+    def pg_to_tag(self) -> Dict[ProcessGroup, str]:
+        global _pg_to_tag
+        return _pg_to_tag
+
+    @property
+    def pg_coalesce_state(self) -> Dict[ProcessGroup, List[_CollOp]]:
+        return self._pg_coalesce_state
+
+    @property
+    def pg_default_device(self) -> Dict[ProcessGroup, torch.device]:
+        return self._pg_default_device
+
+    @property
+    def pg_config_info(self) -> List[Dict[str, Any]]:
+        """
+        Return a list of dict with process groups and backends.
+
+        Along with their unique IDs and configurations (types and ranks).
+        """
+        config_info: List[Dict[str, Any]] = []
+        default_pg_size = _get_group_size(None)
+        for pg in self.pg_map.keys():
+            ranks = self.pg_group_ranks[pg]
+            config_info.append(
+                {
+                    "pg_name": self.pg_names[pg],
+                    "uid": _get_process_group_uid(pg),
+                    "backend_config": self.pg_backend_config[pg],
+                    "ranks": list(ranks.keys())
+                    if len(ranks) != default_pg_size
+                    else [],  # 'ranks' is an empty list when all ranks are involved in a pg
+                    "group_size": len(ranks),
+                    "group_count": self.group_count,
+                }
+            )
+        return config_info
+
+
+_world = _World()
+"""Holds the singleton instance of ``_World`` used by c10d. Experimental extension point to override it."""
+
+class _WorldMeta(type):
+    """
+    Meta class of ``group`` and ``GroupMember``.
+
+    Allows them to have the class property ``WORLD``.
+    """
+
+    # Points to the default PG once initialized.
+    @property
+    def WORLD(cls) -> Optional[ProcessGroup]:
+        return _world.default_pg
+
+    @WORLD.setter
+    def WORLD(cls, pg: Optional[ProcessGroup]):
+        _world.default_pg = pg
+
+class group(metaclass=_WorldMeta):
+    """Group class. Placeholder."""
+
+    pass
+
+class GroupMember(metaclass=_WorldMeta):
+    """Group member class."""
+
+    NON_GROUP_MEMBER = -100
+
+
+def _get_default_timeout(backend: Backend) -> timedelta:
+    # see note on nccl vs other backend timeout (constants.py)
+    if backend == Backend.NCCL:
+        if not isinstance(default_pg_nccl_timeout, timedelta):
+            # TODO moco benchmark on CPU initializes pgnccl backend today, triggered this assert in CI before it was
+            # changed to be a warning.  We should fix the moco model.
+            warnings.warn("Attempted to get default timeout for nccl backend, but NCCL support is not compiled")
+            return default_pg_timeout
+        return default_pg_nccl_timeout
+    else:
+        return default_pg_timeout
+
+def _check_valid_timeout(timeout: Any) -> None:
+    if not isinstance(timeout, timedelta):
+        raise TypeError(
+            f"Expected timeout argument to be of type datetime.timedelta, got {timeout}"
+        )
+
+# Default process group state
+_default_pg_init_method: Optional[str] = None
+
+STORE_BASED_BARRIER_PREFIX = "store_based_barrier_key"
+
+def _get_pg_default_device(group: Optional[ProcessGroup] = None) -> torch.device:
+    """
+    Return the device to use with ``group`` for control flow usage (object collectives, barrier).
+
+    There are selection rules:
+        1. If user specifies exactly one backend in ``init_process_group`` call:
+            use that backend
+        2. Else if user specifies multiple "device:backend" pairs in init_process_group:
+            If "cpu" is among those pairs, use "cpu" (because the object is in cpu memory);
+            Otherwise, use the first backend (sort of a random pick).
+
+    Args:
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+
+    Returns:
+        torch.device: The device to use with ``group``.
+
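+    Example::
+        >>> # xdoctest: +SKIP("no rank")
+        >>> # Hedged sketch: with a "cpu:gloo,cuda:nccl" default group, cpu is preferred
+        >>> # for object collectives because the objects live in cpu memory.
+        >>> _get_pg_default_device()
+        device(type='cpu')
+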
+    """
+    group = group or _get_default_group()
+    if group in _world.pg_default_device:
+        # Previously searched and cached; just return
+        return _world.pg_default_device[group]
+
+    if not isinstance(group, ProcessGroup):
+        # Provide backward compatibility to cases where `group` passed in is
+        # actually a Backend (like `ProcessGroupGloo`) rather than a
+        # `ProcessGroup` in PT 2.0 sense
+        warnings.warn(
+            f"You are using a Backend {type(group)} as a ProcessGroup. "
+            "This usage is deprecated since PyTorch 2.0. Please use a public API "
+            "of PyTorch Distributed instead."
+        )
+        # Most users create Gloo with private API for object collectives
+        _world.pg_default_device[group] = torch.device("cpu")
+        return _world.pg_default_device[group]
+
+    """
+    ``group._device_types`` is a property pybind that returns the devices
+    ("cpu", "cuda", etc) supported by ``group``. Can be multiple if the
+    ``group`` supports multiple devices.
+    """
+    devices = group._device_types
+
+    if len(devices) == 1:
+        # User fixed exactly one backend in `init_process_group`
+        _world.pg_default_device[group] = devices[0]
+    elif len(devices) == 0:
+        # No backend has been registered with this PG (maybe because no
+        # collective has been run yet?). We pick cpu as the default and hope
+        # this will lazily init Gloo or another available cpu backend.
+        _world.pg_default_device[group] = torch.device("cpu")
+    elif torch.device("cpu") in devices:
+        # There are multiple backends in this PG and cpu is among them.
+        # cpu is preferred as the object is in cpu memory. No need for device
+        # copy.
+        _world.pg_default_device[group] = torch.device("cpu")
+    else:
+        # No cpu in the backend list. Randomly pick the first backend
+        _world.pg_default_device[group] = devices[0]
+
+    logger.info(
+        f"Using device {_world.pg_default_device[group]} for object "  # noqa: G004
+        "collectives."
+    )
+    return _world.pg_default_device[group]
+
+
+@_time_logger
+def _store_based_barrier(rank, store, group_name, rendezvous_count, timeout, logging_interval=timedelta(seconds=10)) -> None:
+    """
+    Store based barrier for synchronizing processes.
+
+    Barrier based on store which is used for synchronizing processes after
+    ``init_process_group`` or ``new_group``. Intended to be used only with
+    those two methods and is not a generic alternative to ``barrier()``.
+    """
+    store_key = f"{STORE_BASED_BARRIER_PREFIX}:{group_name}"
+    store.add(store_key, 1)
+    logger.info("Added key: %s to store for rank: %s", store_key, rank)
+
+    # Now wait for all workers to check in with the store.
+    world_size = rendezvous_count
+    worker_count = store.add(store_key, 0)
+
+    last_worker_key = f"{store_key}:last_worker"
+    if worker_count == world_size:
+        store.set(last_worker_key, "1")
+
+    # adjust the timeout to be at least 10secs + 1sec per thousand ranks to reduce the odds of timeout
+    # this value was empirically found while scale testing.
+    logging_interval = max(logging_interval, timedelta(seconds=10 + world_size / 1000))
+
+    start = time.time()
+    while True:
+        try:
+            # store.wait() raises after logging_interval; we then either log the current
+            # status of the group and keep waiting, or time out officially and raise.
+            store.wait([last_worker_key], logging_interval)
+            break
+        except RuntimeError as e:
+            worker_count = store.add(store_key, 0)
+            # Print status periodically to keep track.
+            logger.info(
+                "Waiting in store based barrier to initialize process group for "
+                "rank: %s, key: %s (world_size=%s, num_workers_joined=%s, timeout=%s)",
+                rank, store_key, world_size, worker_count, timeout
+            )
+
+            if timedelta(seconds=(time.time() - start)) > timeout:
+                raise DistStoreError(  # noqa: TRY200
+                    "Timed out initializing process group in store based barrier on "
+                    "rank {}, for key: {} (world_size={}, num_workers_joined={}, timeout={})".format(
+                        rank, store_key, world_size, worker_count, timeout
+                    )
+                )
+
+    logger.info(
+        "Rank %s: Completed store-based barrier for key:%s with %s nodes.", rank, store_key, world_size
+    )
+
+
+def _rank_not_in_group(group: Optional[ProcessGroup]) -> bool:
+    """Check if the current process's rank is not in a given group."""
+    if group is None:
+        return False
+    return group == GroupMember.NON_GROUP_MEMBER
+
+
+def _warn_not_in_group(op_name) -> None:
+    global_rank = -1 if GroupMember.WORLD is None else GroupMember.WORLD.rank()
+    warnings.warn(
+        f"Running {op_name} on global rank {global_rank} which does not "
+        "belong to the given group."
+    )
+
+
+def get_group_rank(group: ProcessGroup, global_rank: int) -> int:
+    """
+    Translate a global rank into a group rank.
+
+    ``global_rank`` must be part of ``group``, otherwise this raises a ValueError.
+
+    Args:
+        group (ProcessGroup): ProcessGroup to find the relative rank.
+        global_rank (int): Global rank to query.
+
+    Returns:
+        Group rank of ``global_rank`` relative to ``group``
+
+    N.B. calling this function on the default process group returns identity
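+
+    Example::
+        >>> # xdoctest: +SKIP("no rank")
+        >>> # Hedged sketch: an 8-rank job where every rank creates the same subgroup
+        >>> # (``dist`` is assumed to be ``torch.distributed``).
+        >>> subgroup = dist.new_group([4, 5, 6, 7])
+        >>> dist.get_group_rank(subgroup, 6)  # global rank 6 is the third member
+        2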
+    """
+    if group is GroupMember.WORLD:
+        return global_rank
+    if group not in _world.pg_group_ranks:
+        raise ValueError(f"Group {group} is not registered, please create group with torch.distributed.new_group API")
+    group_ranks = _world.pg_group_ranks[group]
+    if global_rank not in group_ranks:
+        raise ValueError(f"Global rank {global_rank} is not part of group {group}")
+
+    return group_ranks[global_rank]
+
+def get_global_rank(group: ProcessGroup, group_rank: int) -> int:
+    """
+    Translate a group rank into a global rank.
+
+    ``group_rank`` must be part of ``group``, otherwise this raises a ValueError.
+
+    Args:
+        group (ProcessGroup): ProcessGroup to find the global rank from.
+        group_rank (int): Group rank to query.
+
+    Returns:
+        Global rank of ``group_rank`` relative to ``group``
+
+    N.B. calling this function on the default process group returns identity
+    """
+    if group is GroupMember.WORLD:
+        return group_rank
+    if group not in _world.pg_group_ranks:
+        raise ValueError(f"Group {group} is not registered, please create group with torch.distributed.new_group API")
+    for rank, grp_rank in _world.pg_group_ranks[group].items():
+        if grp_rank == group_rank:
+            return rank
+    raise ValueError(f"Group rank {group_rank} is not part of group {group}")
+
+# TODO: remove this once the ecosystem moves away from it.
+def _get_global_rank(group, rank) -> int:
+    """Use get_global_rank as this method is deprecated."""
+    warnings.warn(
+        "torch.distributed.distributed_c10d._get_global_rank is deprecated "
+        "please use torch.distributed.distributed_c10d.get_global_rank instead"
+    )
+    return get_global_rank(group, rank)
+
+
+def get_process_group_ranks(group: ProcessGroup) -> List[int]:
+    """
+    Get all ranks associated with ``group``.
+
+    Args:
+        group (ProcessGroup): ProcessGroup to get all ranks from.
+
+    Returns:
+        List of global ranks ordered by group rank.
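+
+    Example::
+        >>> # xdoctest: +SKIP("no rank")
+        >>> # Hedged sketch: a 4-rank job with a subgroup spanning ranks 0 and 2
+        >>> # (``dist`` is assumed to be ``torch.distributed``).
+        >>> pg = dist.new_group([0, 2])
+        >>> dist.get_process_group_ranks(pg)
+        [0, 2]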
+    """
+    return list(_world.pg_group_ranks[group].keys())
+
+def _get_group_size(group) -> int:
+    """Get a given group's world size."""
+    if group is GroupMember.WORLD or group is None:
+        default_pg = _get_default_group()
+        return default_pg.size()
+    return group.size()
+
+
+def _get_group_size_by_name(group_name: str) -> int:
+    group = _resolve_process_group(group_name)
+    return group.size()
+
+
+def _resolve_group_name_by_ranks_and_tag(ranks: List[int], tag: str) -> str:
+    # TODO(yifu): remove this function once ranks + tag is not a supported
+    # identifier for process group for functional collectives.
+    group = _find_pg_by_ranks_and_tag(tag, ranks)
+    if group is None:
+        raise ValueError(f"Could not find a process group for ranks {ranks} and tag '{tag}'.")
+    return group.group_name
+
+
+def _check_single_tensor(param, param_name) -> None:
+    """Check that the parameter ``param_name`` is a single tensor."""
+    if not isinstance(param, torch.Tensor):
+        raise TypeError(
+            f"""Invalid function argument. Expected parameter `{param_name}` of type torch.Tensor
+             but got {type(param)} instead."""
+        )
+
+
+def _check_tensor_list(param, param_name) -> None:
+    """Check that the parameter ``param_name`` is a list of tensors."""
+    if not isinstance(param, list):
+        raise TypeError(
+            f"""Invalid function argument. Expected parameter `{param_name}` of type List[torch.Tensor]
+             but got {type(param)} instead."""
+        )
+    elif not all(isinstance(p, torch.Tensor) for p in param):
+        raise TypeError(
+            f"""Invalid function argument. Expected parameter `{param_name}` of type List[torch.Tensor]
+             but got {type(param)} with elements of type {[type(p) for p in param]}."""
+        )
+
+
+def _as_iterable(obj) -> collections.abc.Iterable:
+    return obj if isinstance(obj, list) else (obj,)
+
+def _ensure_all_tensors_same_dtype(*tensors) -> None:
+    last_dtype = None
+    for tensor in itertools.chain.from_iterable(map(_as_iterable, tensors)):
+        tensor_dtype = tensor.dtype
+        # Mixing complex and its element type is allowed
+        if tensor_dtype.is_complex:
+            tensor_dtype = torch.float32 if tensor_dtype == torch.complex64 else torch.complex128
+
+        if last_dtype is None:
+            last_dtype = tensor_dtype
+        else:
+            if last_dtype != tensor_dtype:
+                raise ValueError(
+                    "Invalid usage of tensors with different dtypes. "
+                    f"Found {last_dtype} and {tensor.dtype}."
+                )
+
+
+def _check_op(op) -> None:
+    """Check that the ``op`` is either isend or irecv."""
+    if op not in [isend, irecv]:
+        raise ValueError(
+            "Invalid ``op``. Expected ``op`` "
+            "to be of type ``torch.distributed.isend`` or "
+            "``torch.distributed.irecv``."
+        )
+
+
+def _check_p2p_op_list(p2p_op_list) -> None:
+    """
+    Check that the ``p2p_op_list`` is a list of P2POp instances.
+
+    Also, check that all ops use the same group.
+    """
+    if not isinstance(p2p_op_list, list) or not all(
+        isinstance(p2p_op, P2POp) for p2p_op in p2p_op_list
+    ):
+        raise ValueError(
+            "Invalid ``p2p_op_list``. Each op is expected "
+            "to be of type ``torch.distributed.P2POp``."
+        )
+
+    group = p2p_op_list[0].group
+    if not all(group == p2p_op.group for p2p_op in p2p_op_list):
+        raise ValueError("All ops need to use the same group.")
+
+
+def is_mpi_available() -> bool:
+    """Check if the MPI backend is available."""
+    return _MPI_AVAILABLE
+
+
+def is_nccl_available() -> bool:
+    """Check if the NCCL backend is available."""
+    return _NCCL_AVAILABLE
+
+
+def is_gloo_available() -> bool:
+    """Check if the Gloo backend is available."""
+    return _GLOO_AVAILABLE
+
+
+def is_ucc_available() -> bool:
+    """Check if the UCC backend is available."""
+    return _UCC_AVAILABLE
+
+
+def is_backend_available(backend: str) -> bool:
+    """
+    Check backend availability.
+
+    Checks whether the given backend is available, covering both the built-in backends
+    and third-party backends registered through ``Backend.register_backend``.
+
+    Args:
+        backend (str): Backend name.
+    Returns:
+        bool: Returns True if the backend is available, otherwise False.
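+
+    Example::
+        >>> # A hedged sketch; the result depends on how this torch build was compiled.
+        >>> torch.distributed.is_backend_available("gloo")
+        True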
+    """
+    # If torch.distributed exposes an ``is_<backend>_available`` helper for this backend, return its result directly
+    available_func = getattr(torch.distributed, f"is_{backend.lower()}_available", None)
+    if available_func:
+        return available_func()
+
+    return backend.lower() in Backend.backend_list
+
+
+def is_initialized() -> bool:
+    """Check if the default process group has been initialized."""
+    return GroupMember.WORLD is not None
+
+
+def is_torchelastic_launched() -> bool:
+    """
+    Check whether this process was launched with ``torch.distributed.elastic`` (aka torchelastic).
+
+    The existence of ``TORCHELASTIC_RUN_ID`` environment
+    variable is used as a proxy to determine whether the current process
+    was launched with torchelastic. This is a reasonable proxy since
+    ``TORCHELASTIC_RUN_ID`` maps to the rendezvous id which is always a
+    non-null value indicating the job id for peer discovery purposes.
+    """
+    return os.getenv("TORCHELASTIC_RUN_ID") is not None
+
+
+def _is_barrier_after_init() -> int:
+    # Environment variable to control whether process group should perform a
+    # barrier after its init. Default value is 0, i.e. no barrier. If you
+    # experience issue with this setting, you may set
+    # `TORCH_DIST_INIT_BARRIER=1` to add the barrier.
+    return int(os.getenv("TORCH_DIST_INIT_BARRIER", "0"))
+
+
+def _abort_in_destroy_pg() -> bool:
+    # Environment variable to control whether to abort the communicators when users call destroy_process_group()
+    env = os.getenv("TORCH_NCCL_ABORT_IN_DESTROY_PG", "0")
+    return env == "1" or env.lower() == "true"
+
+
+def _get_default_group() -> ProcessGroup:
+    """Get the default process group created by init_process_group."""
+    if not is_initialized():
+        raise ValueError(
+            "Default process group has not been initialized, "
+            "please make sure to call init_process_group."
+        )
+    return not_none(GroupMember.WORLD)
+
+
+def _get_default_store() -> Store:
+    """Get the default store created by init_process_group."""
+    if not is_initialized():
+        raise ValueError(
+            "Default process group has not been initialized, "
+            "please make sure to call init_process_group."
+        )
+    default_pg = _get_default_group()
+    _, default_store = _world.pg_map[default_pg]
+    return default_store
+
+
+def _update_default_pg(pg) -> None:
+    _world.default_pg = pg
+    rank = pg.rank() if pg is not None and pg != GroupMember.NON_GROUP_MEMBER else -1
+    torch._C._distributed_c10d._set_global_rank(rank)
+
+def get_backend_config(group: Optional[ProcessGroup] = None) -> str:
+    """
+    Return the backend configuration of the given process group.
+
+    Args:
+        group (ProcessGroup, optional): The process group to work on. The
+            default is the general main process group. If another specific group
+            is specified, the calling process must be part of :attr:`group`.
+
+    Returns:
+        The backend configuration of the given process group as a lower case string.
+
+    """
+    if group is None:
+        pg = _get_default_group()
+    else:
+        pg = group
+    if _rank_not_in_group(pg):
+        raise ValueError("Invalid process group specified")
+    backend_config = _world.pg_backend_config.get(pg)
+    return str(not_none(backend_config))
+
+def get_backend(group: Optional[ProcessGroup] = None) -> Backend:
+    """
+    Return the backend of the given process group.
+
+    Args:
+        group (ProcessGroup, optional): The process group to work on. The
+            default is the general main process group. If another specific group
+            is specified, the calling process must be part of :attr:`group`.
+
+    Returns:
+        The backend of the given process group as a lower case string.
+
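+    Example::
+        >>> # xdoctest: +SKIP("no rank")
+        >>> # Hedged sketch assuming the default group was initialized with the gloo
+        >>> # backend (``dist`` is assumed to be ``torch.distributed``).
+        >>> dist.get_backend()
+        'gloo'
+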
+    """
+    if group is None:
+        pg = _get_default_group()
+    else:
+        pg = group
+    if _rank_not_in_group(pg):
+        raise ValueError("Invalid process group specified")
+    pg_store = _world.pg_map[pg] if pg in _world.pg_map else None
+    return Backend(not_none(pg_store)[0])
+
+def _get_process_group_uid(pg: ProcessGroup) -> int:
+    backend = None
+    try:
+        backend = pg._get_backend(torch.device("cuda"))
+    except RuntimeError:
+        pass
+    if is_nccl_available() and isinstance(backend, ProcessGroupNCCL):
+        return backend.uid
+    return -1
+
+def _get_pg_config(group: Optional[ProcessGroup] = None) -> Dict[str, Any]:
+    """
+    Return the pg configuration of the given process group.
+
+    """
+    if group is None:
+        pg = _get_default_group()
+    else:
+        pg = group
+    return {
+        "pg_name": _get_process_group_name(pg),
+        "uid": _get_process_group_uid(pg),
+        "backend_config": get_backend_config(pg),
+        "pg_size": _get_group_size(pg),
+        "ranks": get_process_group_ranks(pg),
+    }
+
+def _get_all_pg_configs() -> List[Dict[str, Any]]:
+    """
+    Return the pg configuration of all the process groups.
+
+    """
+    config_info: List[Dict[str, Any]] = []
+    for pg in _world.pg_map.keys():
+        config_info.append(_get_pg_config(pg))
+    return config_info
+
+def get_pg_count() -> int:
+    """
+    Return the number of process groups.
+
+    """
+    return _world.group_count
+
+def _set_pg_timeout(timeout: timedelta, group: Optional[ProcessGroup] = None) -> None:
+    """
+    Set the timeout for the given process group when users want to use a different timeout instead of
+    default values.
+
+    Args:
+        timeout (timedelta): Timeout for operations executed against the process group which
+            users want to set. Default value is 10 minutes for NCCL and 30 minutes for other backends.
+            This is the duration after which collectives will be aborted asynchronously and the process will crash.
+            This is done since CUDA execution is async and it is no longer safe to continue executing user code since
+            failed async NCCL operations might result in subsequent CUDA operations running on corrupted data.
+            When TORCH_NCCL_BLOCKING_WAIT is set, the process will block and wait for this timeout.
+
+        group (ProcessGroup, optional): The process group to work on. The
+            default is the general main process group. If another specific group
+            is specified, the calling process must be part of :attr:`group`.
+
+    Returns:
+        None
+    """
+    if group is None:
+        group = _get_default_group()
+    if _rank_not_in_group(group):
+        raise ValueError("Invalid process group specified")
+    assert isinstance(group, ProcessGroup)
+    devices = group._device_types
+    backends = set()
+    if torch.device("cpu") in devices and is_gloo_available():
+        backend = group._get_backend(torch.device("cpu"))
+        if isinstance(backend, ProcessGroupGloo):
+            backends.add(backend)
+    if torch.device("cuda") in devices:
+        backend = group._get_backend(torch.device("cuda"))
+        if is_nccl_available() and isinstance(backend, ProcessGroupNCCL):
+            backends.add(backend)  # type: ignore[arg-type]
+        elif is_gloo_available() and isinstance(backend, ProcessGroupGloo):
+            backends.add(backend)  # type: ignore[arg-type]
+    if len(backends) == 0:
+        warnings.warn("Setting a timeout is currently only supported for the nccl and gloo backends.")
+    for backend in backends:
+        backend._set_default_timeout(timeout)
+
+
+@_exception_logger
+@_time_logger
+def init_process_group(
+    backend: Optional[str] = None,
+    init_method: Optional[str] = None,
+    timeout: Optional[timedelta] = None,
+    world_size: int = -1,
+    rank: int = -1,
+    store: Optional[Store] = None,
+    group_name: str = "",
+    pg_options: Optional[Any] = None,
+    device_id: Optional[torch.device] = None,
+) -> None:
+    """
+    Initialize the default distributed process group.
+
+    This will also initialize the distributed package.
+
+    There are 2 main ways to initialize a process group:
+        1. Specify ``store``, ``rank``, and ``world_size`` explicitly.
+        2. Specify ``init_method`` (a URL string) which indicates where/how
+           to discover peers. Optionally specify ``rank`` and ``world_size``,
+           or encode all required parameters in the URL and omit them.
+
+    If neither is specified, ``init_method`` is assumed to be "env://".
+
+
+    Args:
+        backend (str or Backend, optional): The backend to use. Depending on
+            build-time configurations, valid values include ``mpi``, ``gloo``,
+            ``nccl``, and ``ucc``. If the backend is not provided, then both a ``gloo``
+            and ``nccl`` backend will be created, see notes below for how multiple
+            backends are managed. This field can be given as a lowercase string
+            (e.g., ``"gloo"``), which can also be accessed via
+            :class:`Backend` attributes (e.g., ``Backend.GLOO``). If using
+            multiple processes per machine with ``nccl`` backend, each process
+            must have exclusive access to every GPU it uses, as sharing GPUs
+            between processes can result in deadlocks. ``ucc`` backend is
+            experimental.
+        init_method (str, optional): URL specifying how to initialize the
+                                     process group. Default is "env://" if no
+                                     ``init_method`` or ``store`` is specified.
+                                     Mutually exclusive with ``store``.
+        world_size (int, optional): Number of processes participating in
+                                    the job. Required if ``store`` is specified.
+        rank (int, optional): Rank of the current process (it should be a
+                              number between 0 and ``world_size``-1).
+                              Required if ``store`` is specified.
+        store(Store, optional): Key/value store accessible to all workers, used
+                                to exchange connection/address information.
+                                Mutually exclusive with ``init_method``.
+        timeout (timedelta, optional): Timeout for operations executed against
+            the process group. Default value is 10 minutes for NCCL and 30 minutes for other backends.
+            This is the duration after which collectives will be aborted asynchronously and the process will crash.
+            This is done because CUDA execution is async and it is no longer safe to continue executing user code, since
+            failed async NCCL operations might result in subsequent CUDA operations running on corrupted data.
+            When TORCH_NCCL_BLOCKING_WAIT is set, the process will block and wait for this timeout.
+
+        group_name (str, optional, deprecated): Group name. This argument is ignored.
+        pg_options (ProcessGroupOptions, optional): process group options
+            specifying what additional options need to be passed in during
+            the construction of specific process groups. As of now, the only
+            option we support is ``ProcessGroupNCCL.Options`` for the ``nccl``
+            backend; ``is_high_priority_stream`` can be specified so that
+            the nccl backend picks up high-priority CUDA streams when
+            there are compute kernels waiting.
+        device_id (torch.device, optional): a single, specific device
+            to "bind" this process to, allowing for backend-specific
+            optimizations.  Currently this has two effects, only under
+            NCCL: the communicator is immediately formed (calling
+            ``ncclCommInit*`` immediately rather than the normal lazy
+            call) and sub-groups will use ``ncclCommSplit`` when
+            possible to avoid the unnecessary overhead of group creation. This
+            field can also be used to surface NCCL initialization errors early.
+
+    .. note:: To enable ``backend == Backend.MPI``, PyTorch needs to be built from source
+        on a system that supports MPI.
+
+    .. note:: Support for multiple backends is experimental. Currently when no backend is
+        specified, both ``gloo`` and ``nccl`` backends will be created. The ``gloo`` backend
+        will be used for collectives with CPU tensors and the ``nccl`` backend will be used
+        for collectives with CUDA tensors. A custom backend can be specified by passing in
+        a string with format "<device_type>:<backend_name>,<device_type>:<backend_name>", e.g.
+        "cpu:gloo,cuda:custom_backend".
+
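+    Example::
+        >>> # xdoctest: +SKIP("need process group init")
+        >>> # Minimal sketch, assuming the launcher (e.g. ``torchrun``) has already
+        >>> # exported MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE.
+        >>> import torch.distributed as dist
+        >>> dist.init_process_group(backend="gloo")  # picks up init_method="env://"
+        >>> dist.get_rank(), dist.get_world_size()
+        (0, 2)  # on rank 0 of a 2-process job
+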
+    """
+
+    global _world
+
+    global _backend
+    global _default_pg_init_method
+
+    if GroupMember.WORLD is not None:
+        raise ValueError("trying to initialize the default process group twice!")
+
+    set_pytorch_distributed_envs_from_justknobs()
+
+    assert (store is None) or (
+        init_method is None
+    ), "Cannot specify both init_method and store."
+
+    if store is not None:
+        assert world_size > 0, "world_size must be positive if using store"
+        assert rank >= 0, "rank must be non-negative if using store"
+    elif init_method is None:
+        init_method = "env://"
+
+    if backend:
+        backend = Backend(backend)
+    else:
+        backend = Backend("undefined")
+
+    if timeout is None:
+        timeout = _get_default_timeout(backend)
+
+    _check_valid_timeout(timeout)
+
+    """
+    Group name is not visible to users unless they access
+    internals of c10d. This means we can ignore the value
+    they provide as it is not exposed in a public way.
+    """
+    group_name = _process_group_name([], use_hashed_name=False)
+    if backend == Backend.MPI:
+        if world_size != -1 or rank != -1:
+            warnings.warn(
+                f"For MPI backend, world_size ({world_size}) and rank ({rank}) "
+                "are ignored since they are assigned by the "
+                "MPI runtime."
+            )
+
+        default_pg, _ = _new_process_group_helper(
+            -1, -1, [], backend, None, group_name, timeout=timeout
+        )
+        _update_default_pg(default_pg)
+    else:
+        # backward compatible API
+        if store is None:
+            rendezvous_iterator = rendezvous(
+                not_none(init_method), rank, world_size, timeout=timeout
+            )
+            store, rank, world_size = next(rendezvous_iterator)
+            store.set_timeout(timeout)
+
+            # Use a PrefixStore to avoid accidental overrides of keys used by
+            # different systems (e.g. RPC) in case the store is multi-tenant.
+            store = PrefixStore("default_pg", store)
+
+        default_pg, _ = _new_process_group_helper(
+            world_size,
+            rank,
+            [],
+            backend,
+            store,
+            group_name,
+            pg_options=pg_options,
+            timeout=timeout,
+            device_id=device_id,
+        )
+        _update_default_pg(default_pg)
+
+    _world.pg_group_ranks[GroupMember.WORLD] = {i: i for i in range(GroupMember.WORLD.size())}  # type: ignore[attr-defined, index]
+    _backend = _world.pg_map[not_none(GroupMember.WORLD)][0]
+    _default_pg_init_method = init_method
+
+    old_hook = sys.excepthook
+
+    def _distributed_excepthook(*args):
+        old_stderr = sys.stderr
+        sys.stderr = buf = io.StringIO()
+        try:
+            old_hook(*args)
+        finally:
+            sys.stderr = old_stderr
+        msg = buf.getvalue()
+        prefix = f"[rank{get_rank()}]"
+        msg = "\n".join(f"{prefix}: {s}" if s != "" else "" for s in msg.split("\n"))
+        sys.stderr.write(msg)
+        sys.stderr.flush()
+
+    sys.excepthook = _distributed_excepthook
+
+    if _is_barrier_after_init() == 1:
+        # barrier at the end to ensure that once we return from this method, all
+        # process groups including global variables (if any) are updated
+        # correctly on all ranks.
+        # Update 04/2023: for large-scale runs, this barrier (esp. store-based
+        # barrier) may be costly and/or unscalable. Also, in a lot of cases,
+        # these barriers may be unnecessary, as proven by a green CI after
+        # removal. An environment variable `TORCH_DIST_INIT_BARRIER` has been
+        # added which enables this barrier only when set to 1.
+        logger.info(
+            "Performing barrier after ProcessGroup initialization since "
+            "TORCH_DIST_INIT_BARRIER = 1"
+        )
+        if backend == Backend.MPI:
+            # MPI backend doesn't use store.
+            barrier()
+        else:
+            # Use store based barrier here since barrier() used a bunch of
+            # default devices and messes up NCCL internal state.
+            _store_based_barrier(rank, store, group_name, world_size, timeout)
+
+def _get_split_source(pg):
+    split_from = None
+    if pg.bound_device_id:
+        split_from = pg._get_backend(pg.bound_device_id)
+    elif pg is _world.default_pg:
+        try:
+            split_from = pg._get_backend(torch.device("cuda"))
+        except RuntimeError:
+            # no cuda device associated with this backend
+            pass
+
+    if not split_from or not split_from.supports_splitting:
+        return None
+
+    # If necessary, find a backend to split from by peeling process
+    # group wrappers from our potentially wrapped process group.
+    while isinstance(split_from, _ProcessGroupWrapper):
+        split_from = split_from.wrapped_pg
+
+    return split_from
+
+def _shutdown_backend(pg):
+    """
+    Try to shut down the backend of a process group.
+    Currently, only ProcessGroupNCCL backend is supported.
+    No op for other backends.
+    """
+    backend = None
+    try:
+        backend = pg._get_backend(torch.device("cuda"))
+    except RuntimeError:
+        pass
+    if isinstance(backend, ProcessGroupNCCL):
+        # explicitly call shutdown to ensure that NCCL resources are released
+        backend._shutdown()
+
+def _new_process_group_helper(
+    group_size,
+    group_rank,
+    global_ranks_in_group,
+    backend,
+    store,
+    group_name,
+    pg_options=None,
+    timeout=None,
+    pg_tag=None,
+    device_id=None,
+):
+    """
+    Create a new distributed process group.
+
+    This function must be called by ALL processes in the global group, even if
+    the calling process is not part of the newly created group. In that case,
+    this function returns GroupMember.NON_GROUP_MEMBER.
+
+    This function is called with ``global_ranks_in_group == []`` for the default group.
+    """
+    global _world
+
+    if group_name in _world.pg_names.values():
+        raise ValueError(
+            "The specified group name has already been "
+            "created, please use a different group name"
+        )
+
+    if device_id is not None and (device_id.index is None or device_id.type != 'cuda'):
+        raise ValueError("init_process_group device_id parameter must be a cuda device with an "
+                         "id, e.g. cuda:0, not just cuda or cpu")
+
+    # Note: _new_process_group_helper is only called from init_process_group, which always provides a timeout value
+    _check_valid_timeout(timeout)
+
+    if pg_tag not in [None, ""]:
+        # creating with the same tag and rank set results in the same underlying PG
+        existing_group = _find_pg_by_ranks_and_tag(pg_tag, global_ranks_in_group)
+        if existing_group:
+            _, prefix_store = _world.pg_map[existing_group]
+            return existing_group, prefix_store
+
+    # The list of group ranks is empty if we're creating the default group.
+    is_default_group = len(global_ranks_in_group) == 0
+
+    # nccl and potentially other backends allow creation of
+    # communicators based on pre-existing ones, which can save
+    # initialization time.  Due to lazy initialization of
+    # communicators in some backends, we have to be careful and only
+    # split when we *know* the backends already are connected _on all
+    # ranks_.  We can only know this if the group we are making is the
+    # entire world or if we have bound a device id to the world (which
+    # causes early connection initialization).
+    if (is_initialized() and
+            (len(global_ranks_in_group) == _get_default_group().size() or _get_default_group().bound_device_id)):
+        split_from = _get_split_source(_get_default_group())
+    else:
+        split_from = None
+
+    # If this is a subgroup (which means group_ranks is specified),
+    # we check if the current process is a member of the new group.
+    if not is_default_group:
+        global_rank = _get_default_group().rank()
+        if global_rank not in global_ranks_in_group:
+            # If we are using `ncclCommSplit` (or similar split from
+            # other APIs) to create the communicator, we will need to
+            # call `ncclCommSplit` on *all* ranks in this new group's
+            # parent group, even those not in the new group.  This is
+            # a requirement of the NCCL API as otherwise we would get
+            # out of sync.
+            if split_from:
+                split_from.perform_nocolor_split(_get_default_group().bound_device_id)
+            return GroupMember.NON_GROUP_MEMBER, None
+
+    prefix_store = PrefixStore(f"{group_name}/", store)
+    base_pg_options = ProcessGroup.Options(backend=str(backend))
+    base_pg_options._timeout = timeout
+    pg: ProcessGroup = ProcessGroup(prefix_store, group_rank, group_size, base_pg_options)
+    if device_id:
+        pg.bound_device_id = device_id
+    backend_config = BackendConfig(backend)
+    backend_class: torch._C._distributed_c10d.Backend
+    for device, backend_str in backend_config.get_device_backend_map().items():
+        # Use the group name as prefix in the default store, such that
+        # a single store can be reused by multiple groups.
+        backend_prefix_store = PrefixStore(f"{device}/", prefix_store)
+
+        if backend_str == Backend.MPI:
+            if not is_mpi_available():
+                raise RuntimeError(
+                    "Distributed package doesn't have MPI built in."
+                    " MPI is only included if you build PyTorch from"
+                    " source on a host that has MPI installed."
+                )
+            backend_class = ProcessGroupMPI.create(global_ranks_in_group)
+            backend_type = ProcessGroup.BackendType.MPI
+            if not backend_class:
+                return GroupMember.NON_GROUP_MEMBER, None
+            # create new process group with accurate rank and size
+            if pg.rank() == -1 and pg.size() == -1:
+                pg = ProcessGroup(backend_prefix_store, backend_class.rank(), backend_class.size(), base_pg_options)
+        elif backend_str == Backend.GLOO:
+            # TODO: remove this check after lazy initialization is supported
+            # if pg_options is not None:
+            #     raise RuntimeError("GLOO options not supported")
+            backend_class = ProcessGroupGloo(backend_prefix_store, group_rank, group_size, timeout=timeout)
+            backend_type = ProcessGroup.BackendType.GLOO
+        elif backend_str == Backend.NCCL:
+            if not is_nccl_available():
+                raise RuntimeError("Distributed package doesn't have NCCL built in")
+            if pg_options is not None:
+                assert isinstance(
+                    pg_options, ProcessGroupNCCL.Options
+                ), "Expected pg_options argument to be of type ProcessGroupNCCL.Options"
+                if pg_options._timeout != timeout:
+                    warnings.warn(
+                        "pg_options._timeout was specified, "
+                        "but timeout kwarg has a default value that will always override it. "
+                    )
+            else:
+                # default pg_options for NCCL
+                pg_options = ProcessGroupNCCL.Options()
+                pg_options.is_high_priority_stream = False
+            pg_options._timeout = timeout
+
+            if split_from:
+                pg_options.split_from = split_from
+                pg_options.split_color = _process_group_color(global_ranks_in_group)
+            pg_options.global_ranks_in_group = global_ranks_in_group
+            backend_class = ProcessGroupNCCL(
+                backend_prefix_store, group_rank, group_size, pg_options)
+            backend_type = ProcessGroup.BackendType.NCCL
+        elif backend_str == Backend.UCC and is_ucc_available():
+            # TODO: once UCC plugin is fully deprecated, remove
+            # is_ucc_available() from above elif-condition and raise
+            # RuntimeError if is_ucc_available() returns false.
+
+            backend_class = ProcessGroupUCC(backend_prefix_store, group_rank, group_size, timeout=timeout)
+            backend_type = ProcessGroup.BackendType.UCC
+        else:
+            assert backend_str.upper() in Backend._plugins, (
+                f"Unknown c10d backend type {backend_str.upper()}"
+            )
+
+            backend_plugin = Backend._plugins[backend_str.upper()]
+            creator_fn = backend_plugin.creator_fn
+            extended_api = backend_plugin.extended_api
+            backend_type = ProcessGroup.BackendType.CUSTOM
+
+            if not extended_api:
+                backend_class = creator_fn(backend_prefix_store, group_rank, group_size, timeout)
+            else:
+                dist_backend_opts = _DistributedBackendOptions()
+                dist_backend_opts.store = backend_prefix_store
+                dist_backend_opts.group_rank = group_rank
+                dist_backend_opts.group_size = group_size
+                dist_backend_opts.timeout = timeout
+                dist_backend_opts.group_id = group_name
+                dist_backend_opts.global_ranks_in_group = global_ranks_in_group
+
+                backend_class = creator_fn(dist_backend_opts, pg_options)
+
+        # Set sequence numbers for gloo and nccl backends.
+        if backend_str == Backend.GLOO:
+            assert isinstance(backend_class, ProcessGroupGloo)
+            backend_class._set_sequence_number_for_group()
+        elif backend_str == Backend.NCCL:
+            assert isinstance(backend_class, ProcessGroupNCCL)
+            backend_class._set_sequence_number_for_group()
+
+        # If the type is a subclass of ProcessGroup then return this process group immediately
+        # TODO: This defaults to the old behavior for PythonProcessGroups which overwrites the
+        # ProcessGroup instance
+        if issubclass(type(backend_class), ProcessGroup):
+            pg = backend_class  # type: ignore[assignment]
+            break
+
+        # Process group wrapper initialization for supported PGs when TORCH_DISTRIBUTED_DEBUG is set
+        if backend_str in [Backend.GLOO, Backend.NCCL, Backend.UCC]:
+            # In debug mode and if GLOO is available, wrap in a wrapper PG that
+            # enables enhanced collective checking for debuggability.
+            if get_debug_level() == DebugLevel.DETAIL:
+                if not _GLOO_AVAILABLE:
+                    logger.info(
+                        """TORCH_DISTRIBUTED_DEBUG was set to DETAIL, but
+                                GLOO is not available. Build with Gloo to
+                                create a wrapper process group in debug mode
+                                to aid collective desynchronization debugging."""
+                    )
+                else:
+                    backend_class = _create_process_group_wrapper(
+                        wrapped_pg=backend_class,
+                        store_prefix=group_name,
+                        store=backend_prefix_store,
+                        rank=group_rank,
+                        world_size=group_size,
+                        timeout=timeout,
+                    )
+
+        # register only a single backend when all get_device_backend_map values are the same
+        if len(set(backend_config.get_device_backend_map().values())) == 1:
+            for device in backend_config.get_device_backend_map().keys():
+                pg._register_backend(torch.device(device), backend_type, backend_class)
+
+            # break out of outer loop to not create any more backends
+            break
+
+        pg._register_backend(torch.device(device), backend_type, backend_class)
+
+    if device_id and pg._get_backend(device_id).supports_splitting:
+        eager_backend = pg._get_backend(device_id)
+        eager_backend.eager_connect_single_device(device_id)
+
+    # update global state
+    assert group_name is not None
+    _world.pg_map[pg] = (backend, prefix_store)
+    _world.pg_names[pg] = group_name
+    pg._set_group_name(group_name)
+    _register_process_group(group_name, pg)
+
+    _world.pg_backend_config[pg] = str(backend_config)
+    # "" is the default tag for user PGs
+    if pg_tag in [None, ""]:
+        pg_tag = f"ptd:{group_name}"
+        _world.tags_to_pg.setdefault("", []).append(pg)
+    else:
+        pg_tag = f"user:{pg_tag}"
+
+    _world.tags_to_pg.setdefault(pg_tag, []).append(pg)
+    _world.pg_to_tag[pg] = pg_tag
+    return pg, prefix_store
+
+def destroy_process_group(group: Optional[ProcessGroup] = None):
+    """
+    Destroy a given process group, and deinitialize the distributed package.
+
+    Args:
+        group (ProcessGroup, optional): The process group to be destroyed, if
+                                        group.WORLD is given, all process
+                                        groups including the default one will
+                                        be destroyed.
+    """
+    global _world
+
+    if group == GroupMember.NON_GROUP_MEMBER:
+        return
+
+    if group is None:
+        pg = GroupMember.WORLD
+    else:
+        pg = group
+
+    assert pg is not None
+    if _world.pg_map.get(pg, None) is None:
+        raise ValueError("Invalid process group specified")
+
+    # When users register Python onCompletion hooks, those hooks will run on a
+    # different thread than the main thread. Today, the ProcessGroup dtor does
+    # wait for that thread. However, the dtor might finish after the Python
+    # Interpreter exits. After that grabbing the GIL for the Python hook will crash.
+    # We can either revive the interpreter when running hooks or keep the main one
+    # alive until all works and hooks are done. The current implementation does the
+    # latter. Therefore, we explicitly call _wait_for_pending_works() here to wait
+    # for the pending hooks to finish.
+    if pg.name().lower() == "nccl" and pg._has_hooks():
+        pg._wait_for_pending_works()
+
+    if group is None or group == GroupMember.WORLD:
+        if _abort_in_destroy_pg():
+            # shutdown all backends in the order of pg names. shutting down in order because
+            # ncclCommAbort() was a 'collective' call in some versions of NCCL.
+            for pg_to_shutdown in sorted(_world.pg_names, key=lambda x: _world.pg_names[x], reverse=True):
+                _shutdown_backend(pg_to_shutdown)
+
+        _update_default_pg(None)
+        _world.pg_map.clear()
+        _world.pg_names.clear()
+        _world.pg_group_ranks.clear()
+        _world.pg_backend_config.clear()
+        _world.pg_to_tag.clear()
+        _world.tags_to_pg.clear()
+        _world.pg_coalesce_state.clear()
+        _world.pg_default_device.clear()
+        _unregister_all_process_groups()
+
+        # when process group doesn't have an explicit name (only WORLD (default)
+        # process group can have an explicit name), we use global _world.group_count
+        # to generate the name. We need to reset the counter on destruction to
+        # allow consistent value to be generated when we re-create process
+        # groups after some trainers recover from failure
+        #
+        # We only reset this when WORLD is being destroyed because if this
+        # process group is in good state, we aren't dealing with failures.
+        _world.group_count = 0
+    else:
+        if _abort_in_destroy_pg():
+            _shutdown_backend(pg)
+        del _world.pg_map[pg]
+        del _world.pg_names[pg]
+        del _world.pg_group_ranks[pg]
+        del _world.pg_backend_config[pg]
+        if pg in _world.pg_default_device:
+            del _world.pg_default_device[pg]
+        if pg in _world.pg_coalesce_state.keys():
+            warnings.warn(
+                "Some coalesced collectives haven't been launched when "
+                "ProcessGroup is destroyed. They will be cleaned."
+            )
+            del _world.pg_coalesce_state[pg]
+
+        tag = _world.pg_to_tag.get(pg)
+        del _world.pg_to_tag[pg]
+        if tag is not None:
+            try:
+                _world.tags_to_pg[tag].remove(pg)
+                if tag.startswith("ptd:"):
+                    _world.tags_to_pg[""].remove(pg)
+            except Exception:
+                pass
+        _unregister_process_group(pg.group_name)
+
+
+def get_rank(group: Optional[ProcessGroup] = None) -> int:
+    """
+    Return the rank of the current process in the provided ``group``, default otherwise.
+
+    Rank is a unique identifier assigned to each process within a distributed
+    process group. They are always consecutive integers ranging from 0 to
+    ``world_size - 1``.
+
+    Args:
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+
+    Returns:
+        The rank of the process group
+        -1, if not part of the group
+
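+    Example::
+        >>> # xdoctest: +SKIP("need process group init")
+        >>> # Illustrative sketch, assuming an initialized default group with
+        >>> # world_size=2, run on the first process.
+        >>> import torch.distributed as dist
+        >>> dist.get_rank()
+        0
+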
+    """
+    if _rank_not_in_group(group):
+        return -1
+
+    default_pg = _get_default_group()
+    if group is None or group is GroupMember.WORLD:
+        return default_pg.rank()
+
+    return get_group_rank(group, default_pg.rank())
+
+
+def get_world_size(group: Optional[ProcessGroup] = None) -> int:
+    """
+    Return the number of processes in the current process group.
+
+    Args:
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+
+    Returns:
+        The world size of the process group
+        -1, if not part of the group
+
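+    Example::
+        >>> # xdoctest: +SKIP("need process group init")
+        >>> # Illustrative sketch, assuming a job launched with 2 processes and
+        >>> # an initialized default process group.
+        >>> import torch.distributed as dist
+        >>> dist.get_world_size()
+        2
+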
+    """
+    if _rank_not_in_group(group):
+        return -1
+
+    return _get_group_size(group)
+
+
+def isend(tensor: torch.Tensor, dst: int, group: Optional[ProcessGroup] = None, tag: int = 0) -> Optional[Work]:
+    """
+    Send a tensor asynchronously.
+
+    .. warning::
+        Modifying ``tensor`` before the request completes causes undefined
+        behavior.
+
+    .. warning::
+        ``tag`` is not supported with the NCCL backend.
+
+    Args:
+        tensor (Tensor): Tensor to send.
+        dst (int): Destination rank on global process group (regardless of ``group`` argument)
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+        tag (int, optional): Tag to match send with remote recv
+
+    Returns:
+        A distributed request object.
+        None, if not part of the group
+
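+    Example::
+        >>> # xdoctest: +SKIP("no rank")
+        >>> # Minimal sketch, assuming 2 ranks and an initialized default group;
+        >>> # ``rank`` is assumed to be provided by the launcher.
+        >>> tensor = torch.ones(2)
+        >>> if rank == 0:
+        >>>     req = dist.isend(tensor, dst=1)
+        >>> else:
+        >>>     req = dist.irecv(tensor, src=0)
+        >>> req.wait()  # block until the transfer has completed
+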
+    """
+    _check_single_tensor(tensor, "tensor")
+    if _rank_not_in_group(group):
+        _warn_not_in_group("isend")
+        return None
+
+    if tensor.is_complex():
+        tensor = torch.view_as_real(tensor)
+
+    if group is None or group is GroupMember.WORLD:
+        pg = _get_default_group()
+    else:
+        pg = group
+        dst = get_group_rank(pg, dst)
+
+    return pg.send([tensor], dst, tag)
+
+def irecv(tensor: torch.Tensor, src: Optional[int] = None, group: Optional[ProcessGroup] = None, tag: int = 0) -> Optional[Work]:
+    """
+    Receives a tensor asynchronously.
+
+    .. warning::
+        ``tag`` is not supported with the NCCL backend.
+
+    Args:
+        tensor (Tensor): Tensor to fill with received data.
+        src (int, optional): Source rank on global process group (regardless of ``group`` argument).
+            Will receive from any process if unspecified.
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+        tag (int, optional): Tag to match recv with remote send
+
+    Returns:
+        A distributed request object.
+        None, if not part of the group
+
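+    Example::
+        >>> # xdoctest: +SKIP("no rank")
+        >>> # Minimal sketch, assuming 2 ranks where rank 0 posts a matching
+        >>> # ``isend``; ``rank`` is assumed to be provided by the launcher.
+        >>> buf = torch.zeros(2)
+        >>> if rank == 1:
+        >>>     req = dist.irecv(buf, src=0)
+        >>>     req.wait()
+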
+    """
+    _check_single_tensor(tensor, "tensor")
+    if _rank_not_in_group(group):
+        _warn_not_in_group("irecv")
+        return None
+
+    if tensor.is_complex():
+        tensor = torch.view_as_real(tensor)
+
+    if group is None or group is GroupMember.WORLD:
+        pg = _get_default_group()
+    else:
+        pg = group
+
+    if src is None:
+        return pg.recv_anysource([tensor], tag)
+    else:
+        if pg is GroupMember.WORLD:
+            return pg.recv([tensor], src, tag)
+        else:
+            group_src_rank = get_group_rank(pg, src)
+            return pg.recv([tensor], group_src_rank, tag)
+
+@_exception_logger
+def send(tensor: torch.Tensor, dst: int, group: Optional[ProcessGroup] = None, tag: int = 0) -> None:
+    """
+    Send a tensor synchronously.
+
+    Args:
+        tensor (Tensor): Tensor to send.
+        dst (int): Destination rank on global process group (regardless of ``group`` argument).
+            Destination rank should not be the same as the rank of the current process.
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+        tag (int, optional): Tag to match send with remote recv
+
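+    Example::
+        >>> # xdoctest: +SKIP("no rank")
+        >>> # Minimal sketch, assuming 2 ranks with an initialized default group;
+        >>> # ``rank`` is assumed to come from the launcher.
+        >>> tensor = torch.arange(4, dtype=torch.float32)
+        >>> if rank == 0:
+        >>>     dist.send(tensor, dst=1)
+        >>> else:
+        >>>     dist.recv(tensor, src=0)
+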
+    """
+    if get_rank() == dst:
+        raise ValueError(
+            "Invalid destination rank: destination rank should not be the same as "
+            "the rank of the current process."
+        )
+
+    _check_single_tensor(tensor, "tensor")
+    if _rank_not_in_group(group):
+        _warn_not_in_group("send")
+        return None
+
+    if tensor.is_complex():
+        tensor = torch.view_as_real(tensor)
+
+    if group is None or group is GroupMember.WORLD:
+        default_pg = _get_default_group()
+        default_pg.send([tensor], dst, tag).wait()
+    else:
+        group_dst_rank = get_group_rank(group, dst)
+        group.send([tensor], group_dst_rank, tag).wait()
+
+@_exception_logger
+def recv(tensor: torch.Tensor, src: Optional[int] = None, group: Optional[ProcessGroup] = None, tag: int = 0) -> int:
+    """
+    Receives a tensor synchronously.
+
+    Args:
+        tensor (Tensor): Tensor to fill with received data.
+        src (int, optional): Source rank on global process group (regardless of ``group`` argument).
+            Will receive from any process if unspecified.
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+        tag (int, optional): Tag to match recv with remote send
+
+    Returns:
+        Sender rank
+        -1, if not part of the group
+
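+    Example::
+        >>> # xdoctest: +SKIP("no rank")
+        >>> # Minimal sketch, assuming another rank has posted a matching send;
+        >>> # omitting ``src`` accepts a message from any sender and the return
+        >>> # value identifies that sender.
+        >>> buf = torch.zeros(4)
+        >>> sender = dist.recv(buf)
+        >>> sender
+        0  # e.g. when rank 0 sent the tensor
+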
+    """
+    _check_single_tensor(tensor, "tensor")
+    if _rank_not_in_group(group):
+        _warn_not_in_group("recv")
+        return -1
+
+    if tensor.is_complex():
+        tensor = torch.view_as_real(tensor)
+
+    if group is None:
+        pg = _get_default_group()
+    else:
+        pg = group
+
+    if src is None:
+        work = pg.recv_anysource([tensor], tag)
+        work.wait()
+        src_rank = work._source_rank()
+        if group is None or group is GroupMember.WORLD:
+            return src_rank
+        else:
+            return get_global_rank(pg, src_rank)
+    else:
+        if group is None or group is GroupMember.WORLD:
+            pg.recv([tensor], src, tag).wait()
+        else:
+            group_src_rank = get_group_rank(pg, src)
+            pg.recv([tensor], group_src_rank, tag).wait()
+        return src
+
+
+class _IllegalWork(Work):
+    def __getattribute__(self, name):
+        if name in ["is_success", "exception", "wait", "source_rank", "_source_rank", "result", "synchronize"]:
+            raise ValueError(f"Illegal to call {name} on IllegalWork object")
+
+
+class _CoalescingManager:
+    def __init__(self):
+        self.works: List[Work] = []
+
+    def append(self, work: Work):
+        if work:
+            self.works.append(work)
+
+    def wait(self):
+        for work in self.works:
+            work.wait()
+
+
+@contextlib.contextmanager
+def _coalescing_manager(
+    group: Optional[ProcessGroup] = None,
+    device: Optional[torch.device] = None,
+    async_ops: Optional[bool] = False,
+):
+    """
+    Context manager used to coalesce collectives or P2P operations when possible.
+
+    Args:
+        group (`ProcessGroup`, optional): The process group to work on. If None,
+            the default process group will be used.
+        device (`torch.device`, optional): Default is None, set to a device if
+            there isn't a `**_coalesced` implementation by the backend.
+        async_ops (`bool`, optional): whether the coalesced ops are async ops.
+
+    Examples:
+        >>> # xdoctest: +SKIP("no rank")
+        >>> # Synchronous ops
+        >>> with _coalescing_manager():
+        >>>     for i in range(num_colls):
+        >>>         dist.all_reduce(tensors[i])
+        >>> # Asynchronous ops
+        >>> with _coalescing_manager(async_ops=True) as cm:
+        >>>     for i in range(num_colls):
+        >>>         dist.all_reduce(tensors[i])
+        >>> cm.wait()
+
+    .. warning::
+       :func:`_coalescing_manager` currently does not support coalescing
+       all-reduces with different reduce operators, e.g. `ReduceOp.SUM` mixed
+       with `ReduceOp.PRODUCT`.
+    """
+    group = group or _get_default_group()
+    op_list = _world.pg_coalesce_state.setdefault(group, [])
+    if op_list:
+        raise ValueError("ProcessGroup has non-empty op list at the start of coalescing")
+    if device:
+        group._start_coalescing(device)
+    cm = _CoalescingManager()
+    yield cm
+    op_list = _world.pg_coalesce_state.pop(group)
+    if op_list:
+        # Collectives supporting "Fast Path" coalescing are captured.
+        # See implementation in corresponding collective APIs.
+        # Currently supported:
+        # - coalesced `all_reduce`
+        # - coalesced `all_gather_into_tensor`
+        # - coalesced `reduce_scatter_tensor`
+        op0 = op_list[0].op
+        if op0 == all_reduce:
+            tensors = []
+            for op in op_list:
+                tensors.append(op.tensor)
+            all_reduce_opts = AllreduceCoalescedOptions()
+            all_reduce_opts.reduceOp = not_none(op_list[0].redop)
+            work = group.allreduce_coalesced(tensors, all_reduce_opts)
+        elif op0 == all_gather_into_tensor:
+            inputs = []
+            outputs = []
+            for op in op_list:
+                inputs.append(op.tensor)
+                outputs.append(not_none(op.dst_tensor))
+            work = group.allgather_into_tensor_coalesced(outputs, inputs)
+        elif op0 == reduce_scatter_tensor:
+            inputs = []
+            outputs = []
+            for op in op_list:
+                inputs.append(op.tensor)
+                outputs.append(not_none(op.dst_tensor))
+            reduce_opts = ReduceScatterOptions()
+            reduce_opts.reduceOp = not_none(op_list[0].redop)
+            work = group.reduce_scatter_tensor_coalesced(outputs, inputs, reduce_opts)
+        else:
+            raise AssertionError(
+                f"Coalescing manager does not support fast-path coalescing of {op0}, "
+                f"yet {op0} is still recorded in op list. This is an internal error of c10d."
+            )
+
+    if device:
+        # Old style: let each collective inside the context manager call into its C++ counterpart via the Python binding
+        work = group._end_coalescing(device)
+
+    if async_ops:
+        cm.append(work)  # type: ignore[possibly-undefined]
+    else:
+        work.wait()  # type: ignore[possibly-undefined]
+
+
+def batch_isend_irecv(p2p_op_list):
+    """
+    Send or Receive a batch of tensors asynchronously and return a list of requests.
+
+    Process each of the operations in ``p2p_op_list`` and return the corresponding
+    requests. The NCCL, Gloo, and UCC backends are currently supported.
+
+    Args:
+        p2p_op_list: A list of point-to-point operations (the type of each operator is
+            ``torch.distributed.P2POp``). The order of the isend/irecv in the list
+            matters and it needs to match with corresponding isend/irecv on the
+            remote end.
+
+    Returns:
+        A list of distributed request objects returned by calling the corresponding
+        op in the op_list.
+
+    Examples:
+        >>> # xdoctest: +SKIP("no rank")
+        >>> send_tensor = torch.arange(2, dtype=torch.float32) + 2 * rank
+        >>> recv_tensor = torch.randn(2, dtype=torch.float32)
+        >>> send_op = dist.P2POp(dist.isend, send_tensor, (rank + 1)%world_size)
+        >>> recv_op = dist.P2POp(dist.irecv, recv_tensor, (rank - 1 + world_size)%world_size)
+        >>> reqs = batch_isend_irecv([send_op, recv_op])
+        >>> for req in reqs:
+        >>>     req.wait()
+        >>> recv_tensor
+        tensor([2, 3])     # Rank 0
+        tensor([0, 1])     # Rank 1
+
+    .. note:: Note that when this API is used with the NCCL PG backend, users must set
+        the current GPU device with `torch.cuda.set_device`, otherwise it will
+        lead to unexpected hang issues.
+
+        In addition, if this API is the first collective call in the ``group``
+        passed to ``dist.P2POp``, all ranks of the ``group`` must participate in
+        this API call; otherwise, the behavior is undefined. If this API call is
+        not the first collective call in the ``group``, batched P2P operations
+        involving only a subset of ranks of the ``group`` are allowed.
+    """
+    _check_p2p_op_list(p2p_op_list)
+    group = p2p_op_list[0].group
+    device = p2p_op_list[0].tensor.device
+    if device.type == "cuda":
+        # NCCL style coalescing
+        with _coalescing_manager(group, device, async_ops=True) as cm:
+            for p2p_op in p2p_op_list:
+                p2p_op.op(p2p_op.tensor, p2p_op.peer, p2p_op.group, p2p_op.tag)
+        return cm.works
+    else:
+        # Backward support for Gloo
+        reqs = []
+        for p2p_op in p2p_op_list:
+            work = p2p_op.op(p2p_op.tensor, p2p_op.peer, p2p_op.group, p2p_op.tag)
+            if work:
+                reqs.append(work)
+        return reqs
+
+
+@_exception_logger
+def broadcast(tensor, src, group=None, async_op=False):
+    """
+    Broadcasts the tensor to the whole group.
+
+    ``tensor`` must have the same number of elements in all processes
+    participating in the collective.
+
+    Args:
+        tensor (Tensor): Data to be sent if ``src`` is the rank of current
+            process, and tensor to be used to save received data otherwise.
+        src (int): Source rank on global process group (regardless of ``group`` argument).
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+        async_op (bool, optional): Whether this op should be an async op
+
+    Returns:
+        Async work handle, if async_op is set to True.
+        None, if not async_op or if not part of the group
+
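+    Example::
+        >>> # xdoctest: +SKIP("no rank")
+        >>> # Minimal sketch, assuming 2 ranks; ``rank`` is assumed to come from
+        >>> # the launcher. After the call every rank holds rank 0's data.
+        >>> if rank == 0:
+        >>>     tensor = torch.tensor([1.0, 2.0])
+        >>> else:
+        >>>     tensor = torch.zeros(2)
+        >>> dist.broadcast(tensor, src=0)
+        >>> tensor
+        tensor([1., 2.])  # on every rank
+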
+    """
+    _check_single_tensor(tensor, "tensor")
+    if _rank_not_in_group(group):
+        _warn_not_in_group("broadcast")
+        return
+
+    opts = BroadcastOptions()
+    opts.rootRank = src
+    opts.rootTensor = 0
+    opts.asyncOp = async_op
+
+    if group is None or group is GroupMember.WORLD:
+        default_pg = _get_default_group()
+        work = default_pg.broadcast([tensor], opts)
+    else:
+        group_src_rank = get_group_rank(group, src)
+        opts.rootRank = group_src_rank
+        work = group.broadcast([tensor], opts)
+    if async_op:
+        return work
+    else:
+        work.wait()
+
+@_exception_logger
+def all_reduce(tensor, op=ReduceOp.SUM, group=None, async_op=False):
+    """
+    Reduces the tensor data across all machines in a way that all get the final result.
+
+    After the call ``tensor`` is going to be bitwise identical in all processes.
+
+    Complex tensors are supported.
+
+    Args:
+        tensor (Tensor): Input and output of the collective. The function
+            operates in-place.
+        op (optional): One of the values from
+            ``torch.distributed.ReduceOp``
+            enum.  Specifies an operation used for element-wise reductions.
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+        async_op (bool, optional): Whether this op should be an async op
+
+    Returns:
+        Async work handle, if async_op is set to True.
+        None, if not async_op or if not part of the group
+
+    Examples:
+        >>> # xdoctest: +SKIP("no rank")
+        >>> # All tensors below are of torch.int64 type.
+        >>> # We have 2 process groups, 2 ranks.
+        >>> device = torch.device(f'cuda:{rank}')
+        >>> tensor = torch.arange(2, dtype=torch.int64, device=device) + 1 + 2 * rank
+        >>> tensor
+        tensor([1, 2], device='cuda:0') # Rank 0
+        tensor([3, 4], device='cuda:1') # Rank 1
+        >>> dist.all_reduce(tensor, op=ReduceOp.SUM)
+        >>> tensor
+        tensor([4, 6], device='cuda:0') # Rank 0
+        tensor([4, 6], device='cuda:1') # Rank 1
+
+        >>> # All tensors below are of torch.cfloat type.
+        >>> # We have 2 process groups, 2 ranks.
+        >>> tensor = torch.tensor([1+1j, 2+2j], dtype=torch.cfloat, device=device) + 2 * rank * (1+1j)
+        >>> tensor
+        tensor([1.+1.j, 2.+2.j], device='cuda:0') # Rank 0
+        tensor([3.+3.j, 4.+4.j], device='cuda:1') # Rank 1
+        >>> dist.all_reduce(tensor, op=ReduceOp.SUM)
+        >>> tensor
+        tensor([4.+4.j, 6.+6.j], device='cuda:0') # Rank 0
+        tensor([4.+4.j, 6.+6.j], device='cuda:1') # Rank 1
+
+    """
+    _check_single_tensor(tensor, "tensor")
+    if _rank_not_in_group(group):
+        _warn_not_in_group("all_reduce")
+        return
+
+    if tensor.is_complex():
+        if not supports_complex(op):
+            raise ValueError(f"all_reduce does not support {op} on complex tensors")
+        tensor = torch.view_as_real(tensor)
+
+    opts = AllreduceOptions()
+    opts.reduceOp = op
+    if group is None:
+        group = _get_default_group()
+
+    if group in _world.pg_coalesce_state.keys():
+        # We are in coalescing context, do not issue single operation, just append a collective representation
+        coll = _CollOp(all_reduce, tensor, None, op, None)
+        _world.pg_coalesce_state[group].append(coll)
+        if async_op:
+            return _IllegalWork()
+        else:
+            return None
+
+    work = group.allreduce([tensor], opts)
+
+    if async_op:
+        return work
+    else:
+        work.wait()
+
+@_exception_logger
+def all_reduce_coalesced(tensors, op=ReduceOp.SUM, group=None, async_op=False):
+    """
+    WARNING: at this time individual shape checking is not implemented across nodes.
+
+    For example, if the rank 0 node passes [torch.rand(4), torch.rand(2)] and the
+    rank 1 node passes [torch.rand(2), torch.rand(2), torch.rand(2)], the allreduce
+    operation will proceed without complaint and return erroneous outputs. This lack
+    of shape checking results in significant performance improvements but users of this
+    function should take extra care to ensure that each node passes in tensors whose
+    shapes match across nodes.
+
+    Reduces each tensor in tensors (residing on the same device) across all machines
+    in such a way that all get the final result.
+
+    After the call each tensor in tensors is going to be bitwise identical
+    in all processes.
+
+    Complex tensors are supported.
+
+    Args:
+        tensors (Union[List[Tensor], Tensor]): Input and output of the collective.
+            The function operates in-place.
+        op (Optional[ReduceOp]): One of the values from
+            ``torch.distributed.ReduceOp`` enum. Specifies an operation used for
+            element-wise reductions.
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+        async_op (Optional[bool]): Whether this op should be an async op.
+
+    Returns:
+        Async work handle, if async_op is set to True.
+        None, if not async_op or if not part of the group.
+
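+    Example::
+        >>> # xdoctest: +SKIP("no rank")
+        >>> # Minimal sketch, assuming 2 ranks; both tensors are reduced in one
+        >>> # coalesced call. Shapes must match across ranks (see warning above).
+        >>> tensors = [torch.ones(2) * (rank + 1), torch.ones(3) * (rank + 1)]
+        >>> dist.all_reduce_coalesced(tensors, op=dist.ReduceOp.SUM)
+        >>> tensors[0]
+        tensor([3., 3.])  # on every rank
+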
+    """
+    warnings.warn(
+        "torch.distributed.all_reduce_coalesced will be deprecated. If you must "
+        "use it, please revisit our documentation later at "
+        "https://pytorch.org/docs/master/distributed.html#collective-functions"
+    )
+    if isinstance(tensors, torch.Tensor):
+        tensors = [tensors]
+    _check_tensor_list(tensors, "tensor")
+    _ensure_all_tensors_same_dtype(tensors)
+    if _rank_not_in_group(group):
+        _warn_not_in_group("all_reduce_coalesced")
+        return
+
+    if any(t.is_complex() for t in tensors) and not supports_complex(op):
+        raise ValueError(f"all_reduce does not support {op} on complex tensors")
+
+    tensors = [t if not t.is_complex() else torch.view_as_real(t) for t in tensors]
+
+    opts = AllreduceCoalescedOptions()
+    opts.reduceOp = op
+    if group is None:
+        default_pg = _get_default_group()
+        work = default_pg.allreduce_coalesced(tensors, opts)
+    else:
+        work = group.allreduce_coalesced(tensors, opts)
+
+    if async_op:
+        return work.get_future()
+    else:
+        work.wait()
+
+@_exception_logger
+def reduce(tensor, dst, op=ReduceOp.SUM, group=None, async_op=False):
+    """
+    Reduces the tensor data across all machines.
+
+    Only the process with rank ``dst`` is going to receive the final result.
+
+    Args:
+        tensor (Tensor): Input and output of the collective. The function
+            operates in-place.
+        dst (int): Destination rank on global process group (regardless of ``group`` argument)
+        op (optional): One of the values from
+            ``torch.distributed.ReduceOp``
+            enum.  Specifies an operation used for element-wise reductions.
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+        async_op (bool, optional): Whether this op should be an async op
+
+    Returns:
+        Async work handle, if async_op is set to True.
+        None, if not async_op or if not part of the group
+
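+    Example::
+        >>> # xdoctest: +SKIP("no rank")
+        >>> # Minimal sketch, assuming 2 ranks; per the semantics above, only
+        >>> # rank ``dst`` is guaranteed to receive the reduced result.
+        >>> tensor = torch.ones(2) * (rank + 1)
+        >>> dist.reduce(tensor, dst=0, op=dist.ReduceOp.SUM)
+        >>> tensor  # on rank 0
+        tensor([3., 3.])
+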
+    """
+    _check_single_tensor(tensor, "tensor")
+    if _rank_not_in_group(group):
+        _warn_not_in_group("reduce")
+        return
+
+    opts = ReduceOptions()
+    opts.reduceOp = op
+    opts.rootRank = dst
+
+    if group is None or group is GroupMember.WORLD:
+        default_pg = _get_default_group()
+        work = default_pg.reduce([tensor], opts)
+    else:
+        group_dst_rank = get_group_rank(group, dst)
+        opts.rootRank = group_dst_rank
+        work = group.reduce([tensor], opts)
+
+    if async_op:
+        return work
+    else:
+        work.wait()
+
+def _object_to_tensor(obj, device, group):
+    f = io.BytesIO()
+    _pickler(f).dump(obj)
+    byte_storage = torch.ByteStorage._from_buffer(f.getvalue())  # type: ignore[attr-defined]
+    # Do not replace `torch.ByteTensor` or `torch.LongTensor` with torch.tensor and specifying dtype.
+    # Otherwise, it will cause a 100X slowdown.
+    # See: https://github.com/pytorch/pytorch/issues/65696
+    byte_tensor = torch.ByteTensor(byte_storage).to(device)
+    if get_debug_level() == DebugLevel.DETAIL and is_nccl_available():
+        backend = get_backend(group)
+        if backend == Backend.NCCL:
+            hash = torch._C._distributed_c10d._hash_tensors([byte_tensor])
+            logger.warning(f"_object_to_tensor size: {byte_tensor.numel()} hash value: {hash}")  # noqa: G004
+    local_size = torch.LongTensor([byte_tensor.numel()]).to(device)
+    return byte_tensor, local_size
+
+
+def _tensor_to_object(tensor, tensor_size, group):
+    if get_debug_level() == DebugLevel.DETAIL and is_nccl_available():
+        backend = get_backend(group)
+        if backend == Backend.NCCL:
+            hash = torch._C._distributed_c10d._hash_tensors([tensor])
+            logger.warning(f"_tensor_to_object size: {tensor.numel()} hash value: {hash}")  # noqa: G004
+    tensor = tensor.cpu()
+    buf = tensor.numpy().tobytes()[:tensor_size]
+    return _unpickler(io.BytesIO(buf)).load()
+
+
+@_exception_logger
+def all_gather_object(object_list, obj, group=None):
+    """
+    Gathers picklable objects from the whole group into a list.
+
+    Similar to :func:`all_gather`, but Python objects can be passed in.
+    Note that the object must be picklable in order to be gathered.
+
+    Args:
+        object_list (list[Any]): Output list. It should be correctly sized as the
+            size of the group for this collective and will contain the output.
+        obj (Any): Picklable Python object to be broadcast from current process.
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used. Default is ``None``.
+
+    Returns:
+        None. If the calling rank is part of this group, the output of the
+        collective will be populated into the input ``object_list``. If the
+        calling rank is not part of the group, the passed in ``object_list`` will
+        be unmodified.
+
+    .. note:: Note that this API differs slightly from the :func:`all_gather`
+        collective since it does not provide an ``async_op`` handle and thus
+        will be a blocking call.
+
+    .. note:: For NCCL-based process groups, internal tensor representations
+        of objects must be moved to the GPU device before communication takes
+        place. In this case, the device used is given by
+        ``torch.cuda.current_device()`` and it is the user's responsibility to
+        ensure that this is set so that each rank has an individual GPU, via
+        ``torch.cuda.set_device()``.
+
+    .. warning::
+        :func:`all_gather_object` uses ``pickle`` module implicitly, which is
+        known to be insecure. It is possible to construct malicious pickle data
+        which will execute arbitrary code during unpickling. Only call this
+        function with data you trust.
+
+    .. warning::
+        Calling :func:`all_gather_object` with GPU tensors is not well supported
+        and inefficient as it incurs GPU -> CPU transfer since tensors would be
+        pickled. Please consider using :func:`all_gather` instead.
+
+    Example::
+        >>> # xdoctest: +SKIP("need process group init")
+        >>> # Note: Process group initialization omitted on each rank.
+        >>> import torch.distributed as dist
+        >>> # Assumes world_size of 3.
+        >>> gather_objects = ["foo", 12, {1: 2}] # any picklable object
+        >>> output = [None for _ in gather_objects]
+        >>> dist.all_gather_object(output, gather_objects[dist.get_rank()])
+        >>> output
+        ['foo', 12, {1: 2}]
+    """
+    if _rank_not_in_group(group):
+        _warn_not_in_group("all_gather_object")
+        return
+
+    current_device = _get_pg_default_device(group)
+    input_tensor, local_size = _object_to_tensor(obj, current_device, group)
+
+    # Gather all local sizes. This is so that we can find the max size, and index
+    # until the correct size when deserializing the tensors.
+    group_size = get_world_size(group=group)
+    object_sizes_tensor = torch.zeros(
+        group_size, dtype=torch.long, device=current_device
+    )
+    object_size_list = [
+        object_sizes_tensor[i].unsqueeze(dim=0) for i in range(group_size)
+    ]
+    # Allgather tensor sizes
+    all_gather(object_size_list, local_size, group=group)
+    max_object_size = int(max(object_size_list).item())  # type: ignore[type-var]
+    # Resize tensor to max size across all ranks.
+    input_tensor.resize_(max_object_size)
+    coalesced_output_tensor = torch.empty(
+        max_object_size * group_size, dtype=torch.uint8, device=current_device
+    )
+    # Output tensors are nonoverlapping views of coalesced_output_tensor
+    output_tensors = [
+        coalesced_output_tensor[max_object_size * i : max_object_size * (i + 1)]
+        for i in range(group_size)
+    ]
+    all_gather(output_tensors, input_tensor, group=group)
+    # Deserialize outputs back to object.
+    for i, tensor in enumerate(output_tensors):
+        tensor = tensor.type(torch.uint8)
+        tensor_size = object_size_list[i]
+        object_list[i] = _tensor_to_object(tensor, tensor_size, group)
+
+
+@_exception_logger
+def gather_object(obj, object_gather_list=None, dst=0, group=None):
+    """
+    Gathers picklable objects from the whole group in a single process.
+
+    Similar to :func:`gather`, but Python objects can be passed in. Note that the
+    object must be picklable in order to be gathered.
+
+    Args:
+        obj (Any): Input object. Must be picklable.
+        object_gather_list (list[Any]): Output list. On the ``dst`` rank, it
+            should be correctly sized as the size of the group for this
+            collective and will contain the output. Must be ``None`` on non-dst
+            ranks. (default is ``None``)
+        dst (int, optional): Destination rank on global process group (regardless of ``group`` argument). (default is 0)
+        group: (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used. Default is ``None``.
+
+    Returns:
+        None. On the ``dst`` rank, ``object_gather_list`` will contain the
+        output of the collective.
+
+    .. note:: Note that this API differs slightly from the gather collective
+        since it does not provide an async_op handle and thus will be a blocking
+        call.
+
+    .. note:: For NCCL-based process groups, internal tensor representations
+        of objects must be moved to the GPU device before communication takes
+        place. In this case, the device used is given by
+        ``torch.cuda.current_device()`` and it is the user's responsibility to
+        ensure that this is set so that each rank has an individual GPU, via
+        ``torch.cuda.set_device()``.
+
+    .. warning::
+        :func:`gather_object` uses ``pickle`` module implicitly, which is
+        known to be insecure. It is possible to construct malicious pickle data
+        which will execute arbitrary code during unpickling. Only call this
+        function with data you trust.
+
+    .. warning::
+        Calling :func:`gather_object` with GPU tensors is not well supported
+        and inefficient as it incurs GPU -> CPU transfer since tensors would be
+        pickled. Please consider using :func:`gather` instead.
+
+    Example::
+        >>> # xdoctest: +SKIP("need process group init")
+        >>> # Note: Process group initialization omitted on each rank.
+        >>> import torch.distributed as dist
+        >>> # Assumes world_size of 3.
+        >>> gather_objects = ["foo", 12, {1: 2}] # any picklable object
+        >>> output = [None for _ in gather_objects]
+        >>> dist.gather_object(
+        ...     gather_objects[dist.get_rank()],
+        ...     output if dist.get_rank() == 0 else None,
+        ...     dst=0
+        ... )
+        >>> # On rank 0
+        >>> output
+        ['foo', 12, {1: 2}]
+    """
+    if _rank_not_in_group(group):
+        _warn_not_in_group("gather_object")
+        return
+
+    # Ensure object_gather_list is specified appropriately.
+    my_rank = get_rank()
+    _validate_output_list_for_rank(my_rank, dst, object_gather_list)
+    current_device = _get_pg_default_device(group)
+    input_tensor, local_size = _object_to_tensor(obj, current_device, group)
+
+    # Gather all local sizes. This is so that we can find the max size, and index
+    # until the correct size when deserializing the tensors.
+    group_size = get_world_size(group=group)
+    object_sizes_tensor = torch.zeros(
+        group_size, dtype=torch.long, device=current_device
+    )
+    object_size_list = [
+        object_sizes_tensor[i].unsqueeze(dim=0) for i in range(group_size)
+    ]
+    # Allgather tensor sizes. An all-gather is needed here despite this being a
+    # gather, since each rank needs to broadcast a tensor of the same (maximal)
+    # size.
+    all_gather(object_size_list, local_size, group=group)
+    max_object_size = int(max(object_size_list).item())  # type: ignore[type-var]
+    # Resize tensor to max size across all ranks.
+    input_tensor.resize_(max_object_size)
+    # Avoid populating output tensors if the result won't be gathered on this rank.
+    if my_rank == dst:
+        coalesced_output_tensor = torch.empty(
+            max_object_size * group_size, dtype=torch.uint8, device=current_device
+        )
+        # Output tensors are nonoverlapping views of coalesced_output_tensor
+        output_tensors = [
+            coalesced_output_tensor[max_object_size * i : max_object_size * (i + 1)]
+            for i in range(group_size)
+        ]
+    # All ranks call gather with equal-sized tensors.
+    gather(
+        input_tensor,
+        gather_list=output_tensors if my_rank == dst else None,  # type: ignore[possibly-undefined]
+        dst=dst,
+        group=group,
+    )
+    if my_rank != dst:
+        return
+    for i, tensor in enumerate(output_tensors):
+        tensor = tensor.type(torch.uint8)
+        tensor_size = object_size_list[i]
+        object_gather_list[i] = _tensor_to_object(tensor, tensor_size, group)
+
+
+@_exception_logger
+def broadcast_object_list(object_list, src=0, group=None, device=None):
+    """
+    Broadcasts picklable objects in ``object_list`` to the whole group.
+
+    Similar to :func:`broadcast`, but Python objects can be passed in.
+    Note that all objects in ``object_list`` must be picklable in order to be
+    broadcasted.
+
+    Args:
+        object_list (List[Any]): List of input objects to broadcast.
+            Each object must be picklable. Only objects on the ``src`` rank will
+            be broadcast, but each rank must provide lists of equal sizes.
+        src (int): Source rank from which to broadcast ``object_list``.
+            Source rank is based on global process group (regardless of ``group`` argument)
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used. Default is ``None``.
+        device (``torch.device``, optional): If not None, the objects are
+            serialized and converted to tensors which are moved to the
+            ``device`` before broadcasting. Default is ``None``.
+
+    Returns:
+        ``None``. If rank is part of the group, ``object_list`` will contain the
+        broadcasted objects from ``src`` rank.
+
+    .. note:: For NCCL-based process groups, internal tensor representations
+        of objects must be moved to the GPU device before communication takes
+        place. In this case, the device used is given by
+        ``torch.cuda.current_device()`` and it is the user's responsibility to
+        ensure that this is set so that each rank has an individual GPU, via
+        ``torch.cuda.set_device()``.
+
+    .. note:: Note that this API differs slightly from the :func:`all_gather`
+        collective since it does not provide an ``async_op`` handle and thus
+        will be a blocking call.
+
+    .. warning::
+        :func:`broadcast_object_list` uses ``pickle`` module implicitly, which
+        is known to be insecure. It is possible to construct malicious pickle
+        data which will execute arbitrary code during unpickling. Only call this
+        function with data you trust.
+
+    .. warning::
+        Calling :func:`broadcast_object_list` with GPU tensors is not well supported
+        and inefficient as it incurs GPU -> CPU transfer since tensors would be
+        pickled. Please consider using :func:`broadcast` instead.
+
+    Example::
+        >>> # xdoctest: +SKIP("need process group init")
+        >>> # Note: Process group initialization omitted on each rank.
+        >>> import torch.distributed as dist
+        >>> if dist.get_rank() == 0:
+        >>>     # Assumes world_size of 3.
+        >>>     objects = ["foo", 12, {1: 2}] # any picklable object
+        >>> else:
+        >>>     objects = [None, None, None]
+        >>> # Assumes backend is not NCCL
+        >>> device = torch.device("cpu")
+        >>> dist.broadcast_object_list(objects, src=0, device=device)
+        >>> objects
+        ['foo', 12, {1: 2}]
+    """
+    if _rank_not_in_group(group):
+        _warn_not_in_group("broadcast_object_list")
+        return
+
+    # Current device selection.
+    # To preserve backwards compatibility, ``device`` defaults to ``None``,
+    # in which case we run the existing device-selection logic, i.e.
+    # ``current_device`` is CUDA if the backend is NCCL, otherwise the CPU device.
+    # If ``device`` is not ``None``, the size and object tensors to be
+    # broadcasted are moved to that device.
+    current_device = device or _get_pg_default_device(group)
+    my_rank = get_rank()
+    # Serialize object_list elements to tensors on src rank.
+    if my_rank == src:
+        tensor_list, size_list = zip(*[_object_to_tensor(obj, current_device, group) for obj in object_list])
+        object_sizes_tensor = torch.cat(size_list)
+    else:
+        object_sizes_tensor = torch.empty(len(object_list), dtype=torch.long, device=current_device)
+
+    # Broadcast object sizes
+    broadcast(object_sizes_tensor, src=src, group=group)
+
+    # Concatenate and broadcast serialized object tensors
+    # Note: torch.cat will do an extra memory copy to the current device, if the tensor_list
+    # has only one element, we can skip the copy.
+    if my_rank == src:
+        if len(tensor_list) == 1:  # type: ignore[possibly-undefined]
+            object_tensor = tensor_list[0]
+        else:
+            object_tensor = torch.cat(tensor_list)
+    else:
+        object_tensor = torch.empty(  # type: ignore[call-overload]
+            torch.sum(object_sizes_tensor).item(),  # type: ignore[arg-type]
+            dtype=torch.uint8,
+            device=current_device
+        )
+
+    broadcast(object_tensor, src=src, group=group)
+    # Deserialize objects using their stored sizes.
+    offset = 0
+    if my_rank != src:
+        for i, obj_size in enumerate(object_sizes_tensor):
+            obj_view = object_tensor[offset : offset + obj_size]
+            obj_view = obj_view.type(torch.uint8)
+            offset += obj_size
+            object_list[i] = _tensor_to_object(obj_view, obj_size, group)
+
+
+@_exception_logger
+def scatter_object_list(
+    scatter_object_output_list, scatter_object_input_list, src=0, group=None
+):
+    """
+    Scatters picklable objects in ``scatter_object_input_list`` to the whole group.
+
+    Similar to :func:`scatter`, but Python objects can be passed in. On
+    each rank, the scattered object will be stored as the first element of
+    ``scatter_object_output_list``. Note that all objects in
+    ``scatter_object_input_list`` must be picklable in order to be scattered.
+
+    Args:
+        scatter_object_output_list (List[Any]): Non-empty list whose first
+            element will store the object scattered to this rank.
+        scatter_object_input_list (List[Any]): List of input objects to scatter.
+            Each object must be picklable. Only objects on the ``src`` rank will
+            be scattered, and the argument can be ``None`` for non-src ranks.
+        src (int): Source rank from which to scatter ``scatter_object_input_list``.
+            Source rank is based on global process group (regardless of ``group`` argument).
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used. Default is ``None``.
+
+    Returns:
+        ``None``. If rank is part of the group, ``scatter_object_output_list``
+        will have its first element set to the scattered object for this rank.
+
+    .. note:: Note that this API differs slightly from the scatter collective
+        since it does not provide an ``async_op`` handle and thus will be a
+        blocking call.
+
+    .. warning::
+        :func:`scatter_object_list` uses ``pickle`` module implicitly, which
+        is known to be insecure. It is possible to construct malicious pickle
+        data which will execute arbitrary code during unpickling. Only call this
+        function with data you trust.
+
+    .. warning::
+        Calling :func:`scatter_object_list` with GPU tensors is not well supported
+        and inefficient as it incurs GPU -> CPU transfer since tensors would be
+        pickled. Please consider using :func:`scatter` instead.
+
+    Example::
+        >>> # xdoctest: +SKIP("need process group init")
+        >>> # Note: Process group initialization omitted on each rank.
+        >>> import torch.distributed as dist
+        >>> if dist.get_rank() == 0:
+        >>>     # Assumes world_size of 3.
+        >>>     objects = ["foo", 12, {1: 2}] # any picklable object
+        >>> else:
+        >>>     # Can be any list on non-src ranks, elements are not used.
+        >>>     objects = [None, None, None]
+        >>> output_list = [None]
+        >>> dist.scatter_object_list(output_list, objects, src=0)
+        >>> # Rank i gets objects[i]. For example, on rank 2:
+        >>> output_list
+        [{1: 2}]
+    """
+    if _rank_not_in_group(group):
+        _warn_not_in_group("scatter_object_list")
+        return
+
+    if (
+        not isinstance(scatter_object_output_list, list)
+        or len(scatter_object_output_list) < 1
+    ):
+        raise ValueError(
+            "Expected argument scatter_object_output_list to be a list of size at least 1."
+        )
+
+    my_rank = get_rank()
+    pg_device = _get_pg_default_device(group)
+    if my_rank == src:
+        tensor_list, tensor_sizes = zip(
+            *[_object_to_tensor(obj, pg_device, group) for obj in scatter_object_input_list]
+        )
+        tensor_list, tensor_sizes = list(tensor_list), list(tensor_sizes)
+
+    # Src rank broadcasts the maximum tensor size. This is because all ranks are
+    # expected to call into scatter() with equal-sized tensors.
+    if my_rank == src:
+        max_tensor_size = max(tensor_sizes)  # type: ignore[possibly-undefined]
+        for tensor in tensor_list:  # type: ignore[possibly-undefined]
+            tensor.resize_(max_tensor_size)
+    else:
+        max_tensor_size = torch.tensor([0], dtype=torch.long, device=pg_device)
+    broadcast(max_tensor_size, src=src, group=group)
+
+    # Scatter actual serialized objects
+    output_tensor = torch.empty(max_tensor_size.item(), dtype=torch.uint8, device=pg_device)
+    scatter(
+        output_tensor,
+        scatter_list=None if my_rank != src else tensor_list,  # type: ignore[possibly-undefined]
+        src=src,
+        group=group,
+    )
+
+    # Scatter per-object sizes to trim tensors when deserializing back to object
+    obj_tensor_size = torch.tensor([0], dtype=torch.long, device=pg_device)
+    scatter(
+        obj_tensor_size,
+        scatter_list=None if my_rank != src else tensor_sizes,  # type: ignore[possibly-undefined]
+        src=src,
+        group=group,
+    )
+
+    # Deserialize back to object
+    scatter_object_output_list[0] = _tensor_to_object(output_tensor, obj_tensor_size, group)
+
+
+@_exception_logger
+def all_gather(tensor_list, tensor, group=None, async_op=False):
+    """
+    Gathers tensors from the whole group in a list.
+
+    Complex tensors are supported.
+
+    Args:
+        tensor_list (list[Tensor]): Output list. It should contain
+            correctly-sized tensors to be used for output of the collective.
+        tensor (Tensor): Tensor to be broadcast from current process.
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+        async_op (bool, optional): Whether this op should be an async op
+
+    Returns:
+        Async work handle, if async_op is set to True.
+        None, if not async_op or if not part of the group
+
+    Examples:
+        >>> # xdoctest: +SKIP("need process group init")
+        >>> # All tensors below are of torch.int64 dtype.
+        >>> # We have 2 process groups, 2 ranks.
+        >>> device = torch.device(f'cuda:{rank}')
+        >>> tensor_list = [torch.zeros(2, dtype=torch.int64, device=device) for _ in range(2)]
+        >>> tensor_list
+        [tensor([0, 0], device='cuda:0'), tensor([0, 0], device='cuda:0')] # Rank 0
+        [tensor([0, 0], device='cuda:1'), tensor([0, 0], device='cuda:1')] # Rank 1
+        >>> tensor = torch.arange(2, dtype=torch.int64, device=device) + 1 + 2 * rank
+        >>> tensor
+        tensor([1, 2], device='cuda:0') # Rank 0
+        tensor([3, 4], device='cuda:1') # Rank 1
+        >>> dist.all_gather(tensor_list, tensor)
+        >>> tensor_list
+        [tensor([1, 2], device='cuda:0'), tensor([3, 4], device='cuda:0')] # Rank 0
+        [tensor([1, 2], device='cuda:1'), tensor([3, 4], device='cuda:1')] # Rank 1
+
+        >>> # All tensors below are of torch.cfloat dtype.
+        >>> # We have 2 process groups, 2 ranks.
+        >>> tensor_list = [torch.zeros(2, dtype=torch.cfloat, device=device) for _ in range(2)]
+        >>> tensor_list
+        [tensor([0.+0.j, 0.+0.j], device='cuda:0'), tensor([0.+0.j, 0.+0.j], device='cuda:0')] # Rank 0
+        [tensor([0.+0.j, 0.+0.j], device='cuda:1'), tensor([0.+0.j, 0.+0.j], device='cuda:1')] # Rank 1
+        >>> tensor = torch.tensor([1+1j, 2+2j], dtype=torch.cfloat, device=device) + 2 * rank * (1+1j)
+        >>> tensor
+        tensor([1.+1.j, 2.+2.j], device='cuda:0') # Rank 0
+        tensor([3.+3.j, 4.+4.j], device='cuda:1') # Rank 1
+        >>> dist.all_gather(tensor_list, tensor)
+        >>> tensor_list
+        [tensor([1.+1.j, 2.+2.j], device='cuda:0'), tensor([3.+3.j, 4.+4.j], device='cuda:0')] # Rank 0
+        [tensor([1.+1.j, 2.+2.j], device='cuda:1'), tensor([3.+3.j, 4.+4.j], device='cuda:1')] # Rank 1
+
+    """
+    _check_tensor_list(tensor_list, "tensor_list")
+    _check_single_tensor(tensor, "tensor")
+    _ensure_all_tensors_same_dtype(tensor_list, tensor)
+    if _rank_not_in_group(group):
+        _warn_not_in_group("all_gather")
+        return
+
+    tensor_list = [
+        t if not t.is_complex() else torch.view_as_real(t) for t in tensor_list
+    ]
+    tensor = tensor if not tensor.is_complex() else torch.view_as_real(tensor)
+
+    if group is None:
+        default_pg = _get_default_group()
+        work = default_pg.allgather([tensor_list], [tensor])
+    else:
+        work = group.allgather([tensor_list], [tensor])
+
+    if async_op:
+        return work
+    else:
+        work.wait()
+
+
+@_exception_logger
+def all_gather_into_tensor(output_tensor, input_tensor, group=None, async_op=False):
+    """
+    Gather tensors from all ranks and put them in a single output tensor.
+
+    Args:
+        output_tensor (Tensor): Output tensor to accommodate tensor elements
+            from all ranks. It must be correctly sized to have one of the
+            following forms:
+            (i) a concatenation of all the input tensors along the primary
+            dimension; for definition of "concatenation", see ``torch.cat()``;
+            (ii) a stack of all the input tensors along the primary dimension;
+            for definition of "stack", see ``torch.stack()``.
+            Examples below may better explain the supported output forms.
+        input_tensor (Tensor): Tensor to be gathered from current rank.
+            Different from the ``all_gather`` API, the input tensors in this
+            API must have the same size across all ranks.
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+        async_op (bool, optional): Whether this op should be an async op
+
+    Returns:
+        Async work handle, if async_op is set to True.
+        None, if not async_op or if not part of the group
+
+    Examples:
+        >>> # xdoctest: +SKIP("need process group init")
+        >>> # All tensors below are of torch.int64 dtype and on CUDA devices.
+        >>> # We have two ranks.
+        >>> device = torch.device(f'cuda:{rank}')
+        >>> tensor_in = torch.arange(2, dtype=torch.int64, device=device) + 1 + 2 * rank
+        >>> tensor_in
+        tensor([1, 2], device='cuda:0') # Rank 0
+        tensor([3, 4], device='cuda:1') # Rank 1
+        >>> # Output in concatenation form
+        >>> tensor_out = torch.zeros(world_size * 2, dtype=torch.int64, device=device)
+        >>> dist.all_gather_into_tensor(tensor_out, tensor_in)
+        >>> tensor_out
+        tensor([1, 2, 3, 4], device='cuda:0') # Rank 0
+        tensor([1, 2, 3, 4], device='cuda:1') # Rank 1
+        >>> # Output in stack form
+        >>> tensor_out2 = torch.zeros(world_size, 2, dtype=torch.int64, device=device)
+        >>> dist.all_gather_into_tensor(tensor_out2, tensor_in)
+        >>> tensor_out2
+        tensor([[1, 2],
+                [3, 4]], device='cuda:0') # Rank 0
+        tensor([[1, 2],
+                [3, 4]], device='cuda:1') # Rank 1
+
+    .. warning::
+        The Gloo backend does not support this API.
+
+    """
+    _check_single_tensor(input_tensor, "input_tensor")
+    _check_single_tensor(output_tensor, "output_tensor")
+    if _rank_not_in_group(group):
+        _warn_not_in_group("all_gather_into_tensor")
+        return
+
+    output_tensor = (
+        output_tensor
+        if not output_tensor.is_complex()
+        else torch.view_as_real(output_tensor)
+    )
+    input_tensor = (
+        input_tensor
+        if not input_tensor.is_complex()
+        else torch.view_as_real(input_tensor)
+    )
+
+    opts = AllgatherOptions()
+    opts.asyncOp = async_op
+
+    group = group or _get_default_group()
+
+    if group in _world.pg_coalesce_state.keys():
+        # We are in a coalescing context; do not issue a single operation, just append a collective representation
+        coll = _CollOp(all_gather_into_tensor, input_tensor, output_tensor)
+        _world.pg_coalesce_state[group].append(coll)
+        if async_op:
+            return _IllegalWork()
+        else:
+            return None
+
+    work = group._allgather_base(output_tensor, input_tensor, opts)
+
+    if async_op:
+        return work
+    else:
+        work.wait()
+
+
+@_exception_logger
+def _all_gather_base(output_tensor, input_tensor, group=None, async_op=False):
+    """
+    Single tensor all gather. Gathers a single tensor from all ranks, and puts them in a single output tensor.
+
+    Args:
+        output_tensor (Tensor): Output tensor. It should contain
+            correctly-sized tensors to be used for output of the collective.
+        input_tensor (Tensor): Tensor to be broadcast from current process.
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+        async_op (bool, optional): Whether this op should be an async op
+
+    Returns:
+        Async work handle, if async_op is set to True.
+        None, if not async_op or if not part of the group
+
+    .. warning::
+        `_all_gather_base` is a private function. Users should use
+        `all_gather_into_tensor` instead.
+
+    """
+    warnings.warn(
+        "torch.distributed._all_gather_base is a private function and will be "
+        "deprecated. Please use torch.distributed.all_gather_into_tensor "
+        "instead."
+    )
+    return all_gather_into_tensor(output_tensor, input_tensor, group, async_op)
+
+
+@_exception_logger
+def all_gather_coalesced(
+    output_tensor_lists, input_tensor_list, group=None, async_op=False
+):
+    """
+    Gathers input tensors from the whole group in a list in a coalesced manner.
+
+    Complex tensors are supported.
+
+    Args:
+        output_tensor_lists (list[list[Tensor]]): Output list. It should contain
+            correctly-sized tensors to be used for output of the collective.
+        input_tensor_list (list[Tensor]): Tensors to be broadcast from
+            current process. At least one tensor has to be non-empty.
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+        async_op (bool, optional): Whether this op should be an async op.
+
+    Returns:
+        Async work handle, if async_op is set to True.
+        None, if not async_op or if not part of the group
+
+    Example:
+        we have 2 process groups, 2 ranks.
+        rank 0 passes:
+            input_tensor_list = [[[1, 1], [1, 1]], [2], [3, 3]]
+            output_tensor_lists =
+               [[[[-1, -1], [-1, -1]], [-1], [-1, -1]],
+                [[[-1, -1], [-1, -1]], [-1], [-1, -1]]]
+        rank 1 passes:
+            input_tensor_list = [[[3, 3], [3, 3]], [5], [1, 1]]
+            output_tensor_lists =
+               [[[[-1, -1], [-1, -1]], [-1], [-1, -1]],
+                [[[-1, -1], [-1, -1]], [-1], [-1, -1]]]
+        both rank 0 and 1 get:
+            output_tensor_lists =
+               [[[[1, 1], [1, 1]], [2], [3, 3]],
+                [[[3, 3], [3, 3]], [5], [1, 1]]].
+
+    WARNING: at this time individual shape checking is not implemented across nodes.
+    For example, if the rank 0 node passes [torch.rand(4), torch.rand(2)] and the
+    rank 1 node passes [torch.rand(2), torch.rand(2), torch.rand(2)], the
+    all_gather_coalesced operation will proceed without complaint and return
+    erroneous outputs. This lack of shape checking results in significant
+    performance improvements but users of this function should take extra care
+    to ensure that each node passes in tensors whose shapes match across nodes.
+    """
+    warnings.warn(
+        "torch.distributed.all_gather_coalesced will be deprecated. If you must "
+        "use it, please revisit our documentation later at "
+        "https://pytorch.org/docs/master/distributed.html#collective-functions"
+    )
+    # We only check basic compatibility with C++ params here, C++ code will
+    # do shape and type checking.
+    if _rank_not_in_group(group):
+        _warn_not_in_group("all_gather_coalesced")
+        return
+    _check_tensor_list(input_tensor_list, "input_tensor_list")
+    _ensure_all_tensors_same_dtype(input_tensor_list)
+    if not isinstance(output_tensor_lists, list):
+        raise TypeError(
+            "Invalid function argument: output_tensor_lists should be a list"
+        )
+    for output_tensor_list in output_tensor_lists:
+        _check_tensor_list(output_tensor_list, "output_tensor_lists")
+        _ensure_all_tensors_same_dtype(output_tensor_list)
+
+    output_tensor_lists = [
+        [t if not t.is_complex() else torch.view_as_real(t) for t in l]
+        for l in output_tensor_lists
+    ]
+    input_tensor_list = [
+        t if not t.is_complex() else torch.view_as_real(t) for t in input_tensor_list
+    ]
+
+    if group is None:
+        default_pg = _get_default_group()
+        work = default_pg.allgather_coalesced(output_tensor_lists, input_tensor_list)
+    else:
+        work = group.allgather_coalesced(output_tensor_lists, input_tensor_list)
+
+    if async_op:
+        return work.get_future()
+    else:
+        work.wait()
+
+
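+# Helper shared by ``gather`` and ``gather_object``: the output list must be
+# supplied on the destination rank and must be left unspecified (None or empty)
+# on every other rank.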
+def _validate_output_list_for_rank(my_rank, dst, gather_list):
+    if dst == my_rank:
+        if not gather_list:
+            raise ValueError(
+                "Argument ``gather_list`` must be specified on destination rank."
+            )
+    elif gather_list:
+        raise ValueError(
+            "Argument ``gather_list`` must NOT be specified "
+            "on non-destination ranks."
+        )
+
+
+@_exception_logger
+def gather(tensor, gather_list=None, dst=0, group=None, async_op=False):
+    """
+    Gathers a list of tensors in a single process.
+
+    Args:
+        tensor (Tensor): Input tensor.
+        gather_list (list[Tensor], optional): List of appropriately-sized
+            tensors to use for gathered data (default is None, must be specified
+            on the destination rank)
+        dst (int, optional): Destination rank on global process group (regardless of ``group`` argument). (default is 0)
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+        async_op (bool, optional): Whether this op should be an async op
+
+    Returns:
+        Async work handle, if async_op is set to True.
+        None, if not async_op or if not part of the group
+
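+    A minimal illustrative sketch (assuming a world size of 2 and an already
+    initialized default process group; the tensor values below are only an
+    example):
+
+    Example::
+        >>> # xdoctest: +SKIP("need process group init")
+        >>> # Note: Process group initialization omitted on each rank.
+        >>> import torch.distributed as dist
+        >>> tensor_size = 2
+        >>> t = torch.ones(tensor_size) * (dist.get_rank() + 1)
+        >>> # gather_list is only required on the destination rank.
+        >>> gather_list = [torch.zeros(tensor_size) for _ in range(2)] if dist.get_rank() == 0 else None
+        >>> dist.gather(t, gather_list, dst=0)
+        >>> # On rank 0
+        >>> gather_list
+        [tensor([1., 1.]), tensor([2., 2.])]
+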
+    """
+    _check_single_tensor(tensor, "tensor")
+
+    # Parameter ``gather_list`` may be left unspecified on non-dst ranks.
+    if gather_list:
+        _check_tensor_list(gather_list, "gather_list")
+    else:
+        gather_list = []
+    _ensure_all_tensors_same_dtype(tensor, gather_list)
+
+    if _rank_not_in_group(group):
+        _warn_not_in_group("gather")
+        return
+
+    my_rank = get_rank()
+    _validate_output_list_for_rank(my_rank, dst, gather_list)
+    output_tensors = [gather_list] if dst == my_rank else []
+    input_tensors = [tensor]
+
+    opts = GatherOptions()
+    opts.rootRank = dst
+
+    if group is None or group is GroupMember.WORLD:
+        default_pg = _get_default_group()
+        work = default_pg.gather(output_tensors, input_tensors, opts)
+    else:
+        group_dst_rank = get_group_rank(group, dst)
+        opts.rootRank = group_dst_rank
+        work = group.gather(output_tensors, input_tensors, opts)
+
+    if async_op:
+        return work
+    else:
+        work.wait()
+
+
+@_exception_logger
+def scatter(tensor, scatter_list=None, src=0, group=None, async_op=False):
+    """
+    Scatters a list of tensors to all processes in a group.
+
+    Each process will receive exactly one tensor and store its data in the
+    ``tensor`` argument.
+
+    Complex tensors are supported.
+
+    Args:
+        tensor (Tensor): Output tensor.
+        scatter_list (list[Tensor]): List of tensors to scatter (default is
+            None, must be specified on the source rank)
+        src (int): Source rank on global process group (regardless of ``group`` argument).
+            Default is 0
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+        async_op (bool, optional): Whether this op should be an async op
+
+    Returns:
+        Async work handle, if async_op is set to True.
+        None, if not async_op or if not part of the group
+
+    .. note:: Note that all Tensors in scatter_list must have the same size.
+
+    Example::
+        >>> # xdoctest: +SKIP("need process group init")
+        >>> # Note: Process group initialization omitted on each rank.
+        >>> import torch.distributed as dist
+        >>> tensor_size = 2
+        >>> t_ones = torch.ones(tensor_size)
+        >>> t_fives = torch.ones(tensor_size) * 5
+        >>> output_tensor = torch.zeros(tensor_size)
+        >>> if dist.get_rank() == 0:
+        >>>     # Assumes world_size of 2.
+        >>>     # Only tensors, all of which must be the same size.
+        >>>     scatter_list = [t_ones, t_fives]
+        >>> else:
+        >>>     scatter_list = None
+        >>> dist.scatter(output_tensor, scatter_list, src=0)
+        >>> # Rank i gets scatter_list[i]. For example, on rank 1:
+        >>> output_tensor
+        tensor([5., 5.])
+
+    """
+    _check_single_tensor(tensor, "tensor")
+
+    # Parameter ``scatter_list`` may be left unspecified on non-src ranks.
+    if scatter_list:
+        _check_tensor_list(scatter_list, "scatter_list")
+    else:
+        scatter_list = []
+    _ensure_all_tensors_same_dtype(tensor, scatter_list)
+
+    if _rank_not_in_group(group):
+        _warn_not_in_group("scatter")
+        return
+    scatter_list = [
+        t if not t.is_complex() else torch.view_as_real(t) for t in scatter_list
+    ]
+    tensor = tensor if not tensor.is_complex() else torch.view_as_real(tensor)
+
+    my_rank = get_rank()
+    if src == my_rank:
+        if not scatter_list:
+            raise ValueError(
+                "Argument ``scatter_list`` must be specified on source rank."
+            )
+        input_tensors = [scatter_list]
+        output_tensors = [tensor]
+    else:
+        if scatter_list:
+            raise ValueError(
+                "Argument ``scatter_list`` must NOT be specified "
+                "on non-source ranks."
+            )
+        input_tensors = []
+        output_tensors = [tensor]
+
+    opts = ScatterOptions()
+    opts.rootRank = src
+    opts.asyncOp = async_op
+
+    if group is None or group is GroupMember.WORLD:
+        default_pg = _get_default_group()
+        work = default_pg.scatter(output_tensors, input_tensors, opts)
+    else:
+        group_src_rank = get_group_rank(group, src)
+        opts.rootRank = group_src_rank
+        work = group.scatter(output_tensors, input_tensors, opts)
+
+    if async_op:
+        return work
+    else:
+        work.wait()
+
+
+@_exception_logger
+def reduce_scatter(output, input_list, op=ReduceOp.SUM, group=None, async_op=False):
+    """
+    Reduces, then scatters a list of tensors to all processes in a group.
+
+    Args:
+        output (Tensor): Output tensor.
+        input_list (list[Tensor]): List of tensors to reduce and scatter.
+        op (optional): One of the values from
+            ``torch.distributed.ReduceOp``
+            enum.  Specifies an operation used for element-wise reductions.
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+        async_op (bool, optional): Whether this op should be an async op.
+
+    Returns:
+        Async work handle, if async_op is set to True.
+        None, if not async_op or if not part of the group.
+
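+    A minimal illustrative sketch (assuming a world size of 2 and an already
+    initialized default process group; values are only an example):
+
+    Example::
+        >>> # xdoctest: +SKIP("need process group init")
+        >>> # Each rank contributes one input tensor per rank; rank i receives
+        >>> # the element-wise sum of every rank's i-th input tensor.
+        >>> import torch.distributed as dist
+        >>> rank = dist.get_rank()
+        >>> input_list = [torch.tensor([rank + 1.0]), torch.tensor([(rank + 1.0) * 10])]
+        >>> output = torch.zeros(1)
+        >>> dist.reduce_scatter(output, input_list, op=dist.ReduceOp.SUM)
+        >>> output
+        tensor([3.])   # Rank 0: 1.0 + 2.0
+        tensor([30.])  # Rank 1: 10.0 + 20.0
+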
+    """
+    _check_single_tensor(output, "output")
+    _check_tensor_list(input_list, "input_list")
+    _ensure_all_tensors_same_dtype(output, input_list)
+    if _rank_not_in_group(group):
+        _warn_not_in_group("reduce_scatter")
+        return
+
+    opts = ReduceScatterOptions()
+    opts.reduceOp = op
+
+    if group is None:
+        default_pg = _get_default_group()
+        work = default_pg.reduce_scatter([output], [input_list], opts)
+    else:
+        work = group.reduce_scatter([output], [input_list], opts)
+
+    if async_op:
+        return work
+    else:
+        work.wait()
+
+
+@_exception_logger
+def reduce_scatter_tensor(output, input, op=ReduceOp.SUM, group=None, async_op=False):
+    """
+    Reduces, then scatters a tensor to all ranks in a group.
+
+    Args:
+        output (Tensor): Output tensor. It should have the same size across all
+            ranks.
+        input (Tensor): Input tensor to be reduced and scattered. Its size
+            should be output tensor size times the world size. The input tensor
+            can have one of the following shapes:
+            (i) a concatenation of the output tensors along the primary
+            dimension, or
+            (ii) a stack of the output tensors along the primary dimension.
+            For definition of "concatenation", see ``torch.cat()``.
+            For definition of "stack", see ``torch.stack()``.
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+        async_op (bool, optional): Whether this op should be an async op.
+
+    Returns:
+        Async work handle, if async_op is set to True.
+        None, if not async_op or if not part of the group.
+
+    Examples:
+        >>> # xdoctest: +SKIP("need process group init")
+        >>> # All tensors below are of torch.int64 dtype and on CUDA devices.
+        >>> # We have two ranks.
+        >>> device = torch.device(f'cuda:{rank}')
+        >>> tensor_out = torch.zeros(2, dtype=torch.int64, device=device)
+        >>> # Input in concatenation form
+        >>> tensor_in = torch.arange(world_size * 2, dtype=torch.int64, device=device)
+        >>> tensor_in
+        tensor([0, 1, 2, 3], device='cuda:0') # Rank 0
+        tensor([0, 1, 2, 3], device='cuda:1') # Rank 1
+        >>> dist.reduce_scatter_tensor(tensor_out, tensor_in)
+        >>> tensor_out
+        tensor([0, 2], device='cuda:0') # Rank 0
+        tensor([4, 6], device='cuda:1') # Rank 1
+        >>> # Input in stack form
+        >>> tensor_in = torch.reshape(tensor_in, (world_size, 2))
+        >>> tensor_in
+        tensor([[0, 1],
+                [2, 3]], device='cuda:0') # Rank 0
+        tensor([[0, 1],
+                [2, 3]], device='cuda:1') # Rank 1
+        >>> dist.reduce_scatter_tensor(tensor_out, tensor_in)
+        >>> tensor_out
+        tensor([0, 2], device='cuda:0') # Rank 0
+        tensor([4, 6], device='cuda:1') # Rank 1
+
+    .. warning::
+        The Gloo backend does not support this API.
+
+    """
+    _check_single_tensor(output, "output")
+    _check_single_tensor(input, "input")
+
+    if _rank_not_in_group(group):
+        _warn_not_in_group("reduce_scatter_tensor")
+        return
+
+    opts = ReduceScatterOptions()
+    opts.reduceOp = op
+    opts.asyncOp = async_op
+
+    group = group or _get_default_group()
+
+    # Check if we are in a coalescing context.
+    # If we are, do not issue a single operation; just append a collective representation.
+    if group in _world.pg_coalesce_state.keys():
+        coll = _CollOp(reduce_scatter_tensor, input, output, op, None)
+        _world.pg_coalesce_state[group].append(coll)
+        if async_op:
+            return _IllegalWork()
+        else:
+            return None
+
+    work = group._reduce_scatter_base(output, input, opts)
+
+    if async_op:
+        return work
+    else:
+        work.wait()
+
+
+def _reduce_scatter_base(output, input, op=ReduceOp.SUM, group=None, async_op=False):
+    """
+    Reduces, then scatters a flattened tensor to all processes in a group.
+
+    Args:
+        output (Tensor): Output tensor.
+        input (Tensor): Input tensor that is of size output tensor size times world size
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+        async_op (bool, optional): Whether this op should be an async op.
+
+    Returns:
+        Async work handle, if async_op is set to True.
+        None, if not async_op or if not part of the group.
+
+    .. warning::
+        `_reduce_scatter_base` is a private function. Users should use
+        `reduce_scatter_tensor` instead.
+
+    """
+    warnings.warn(
+        "torch.distributed._reduce_scatter_base is a private function and will "
+        "be deprecated. Please use torch.distributed.reduce_scatter_tensor "
+        "instead."
+    )
+    return reduce_scatter_tensor(output, input, op, group, async_op)
+
+
+@_exception_logger
+def all_to_all_single(
+    output,
+    input,
+    output_split_sizes=None,
+    input_split_sizes=None,
+    group=None,
+    async_op=False,
+):
+    """
+    Split input tensor and then scatter the split list to all processes in a group.
+
+    Later the received tensors are concatenated from all the processes in the group
+    and returned as a single output tensor.
+
+    Complex tensors are supported.
+
+    Args:
+        output (Tensor): Gathered concatenated output tensor.
+        input (Tensor): Input tensor to scatter.
+        output_split_sizes (list[Int], optional): Output split sizes for dim 0.
+            If None or empty, dim 0 of the ``output`` tensor must be evenly
+            divisible by ``world_size``.
+        input_split_sizes (list[Int], optional): Input split sizes for dim 0.
+            If None or empty, dim 0 of the ``input`` tensor must be evenly
+            divisible by ``world_size``.
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+        async_op (bool, optional): Whether this op should be an async op.
+
+    Returns:
+        Async work handle, if async_op is set to True.
+        None, if not async_op or if not part of the group.
+
+    .. warning::
+        `all_to_all_single` is experimental and subject to change.
+
+    Examples:
+        >>> # xdoctest: +SKIP("Undefined rank")
+        >>> input = torch.arange(4) + rank * 4
+        >>> input
+        tensor([0, 1, 2, 3])     # Rank 0
+        tensor([4, 5, 6, 7])     # Rank 1
+        tensor([8, 9, 10, 11])   # Rank 2
+        tensor([12, 13, 14, 15]) # Rank 3
+        >>> output = torch.empty([4], dtype=torch.int64)
+        >>> dist.all_to_all_single(output, input)
+        >>> output
+        tensor([0, 4, 8, 12])    # Rank 0
+        tensor([1, 5, 9, 13])    # Rank 1
+        tensor([2, 6, 10, 14])   # Rank 2
+        tensor([3, 7, 11, 15])   # Rank 3
+
+        >>> # Essentially, it is similar to following operation:
+        >>> scatter_list = list(input.chunk(world_size))
+        >>> gather_list  = list(output.chunk(world_size))
+        >>> for i in range(world_size):
+        >>>     dist.scatter(gather_list[i], scatter_list if i == rank else [], src = i)
+
+        >>> # Another example with uneven split
+        >>> input
+        tensor([0, 1, 2, 3, 4, 5])                                       # Rank 0
+        tensor([10, 11, 12, 13, 14, 15, 16, 17, 18])                     # Rank 1
+        tensor([20, 21, 22, 23, 24])                                     # Rank 2
+        tensor([30, 31, 32, 33, 34, 35, 36])                             # Rank 3
+        >>> input_splits
+        [2, 2, 1, 1]                                                     # Rank 0
+        [3, 2, 2, 2]                                                     # Rank 1
+        [2, 1, 1, 1]                                                     # Rank 2
+        [2, 2, 2, 1]                                                     # Rank 3
+        >>> output_splits
+        [2, 3, 2, 2]                                                     # Rank 0
+        [2, 2, 1, 2]                                                     # Rank 1
+        [1, 2, 1, 2]                                                     # Rank 2
+        [1, 2, 1, 1]                                                     # Rank 3
+        >>> output = ...
+        >>> dist.all_to_all_single(output, input, output_splits, input_splits)
+        >>> output
+        tensor([ 0,  1, 10, 11, 12, 20, 21, 30, 31])                     # Rank 0
+        tensor([ 2,  3, 13, 14, 22, 32, 33])                             # Rank 1
+        tensor([ 4, 15, 16, 23, 34, 35])                                 # Rank 2
+        tensor([ 5, 17, 18, 24, 36])                                     # Rank 3
+
+
+        >>> # Another example with tensors of torch.cfloat type.
+        >>> input = torch.tensor([1+1j, 2+2j, 3+3j, 4+4j], dtype=torch.cfloat) + 4 * rank * (1+1j)
+        >>> input
+        tensor([1+1j, 2+2j, 3+3j, 4+4j])                                # Rank 0
+        tensor([5+5j, 6+6j, 7+7j, 8+8j])                                # Rank 1
+        tensor([9+9j, 10+10j, 11+11j, 12+12j])                          # Rank 2
+        tensor([13+13j, 14+14j, 15+15j, 16+16j])                        # Rank 3
+        >>> output = torch.empty([4], dtype=torch.cfloat)
+        >>> dist.all_to_all_single(output, input)
+        >>> output
+        tensor([1+1j, 5+5j, 9+9j, 13+13j])                              # Rank 0
+        tensor([2+2j, 6+6j, 10+10j, 14+14j])                            # Rank 1
+        tensor([3+3j, 7+7j, 11+11j, 15+15j])                            # Rank 2
+        tensor([4+4j, 8+8j, 12+12j, 16+16j])                            # Rank 3
+    """
+    if _rank_not_in_group(group):
+        _warn_not_in_group("all_to_all_single")
+        return
+
+    opts = AllToAllOptions()
+    _check_single_tensor(output, "output")
+    _check_single_tensor(input, "input")
+    _ensure_all_tensors_same_dtype(output, input)
+
+    if input.is_complex():
+        input = torch.view_as_real(input)
+    if output.is_complex():
+        output = torch.view_as_real(output)
+
+    output_split_sizes = [] if output_split_sizes is None else output_split_sizes
+    input_split_sizes = [] if input_split_sizes is None else input_split_sizes
+
+    if group is None:
+        default_pg = _get_default_group()
+        work = default_pg.alltoall_base(
+            output, input, output_split_sizes, input_split_sizes, opts
+        )
+    else:
+        work = group.alltoall_base(
+            output, input, output_split_sizes, input_split_sizes, opts
+        )
+
+    if async_op:
+        return work
+    else:
+        work.wait()
+
+
+@_exception_logger
+def all_to_all(output_tensor_list, input_tensor_list, group=None, async_op=False):
+    """
+    Scatters list of input tensors to all processes in a group and return gathered list of tensors in output list.
+
+    Complex tensors are supported.
+
+    Args:
+        output_tensor_list (list[Tensor]): List of tensors to be gathered one
+            per rank.
+        input_tensor_list (list[Tensor]): List of tensors to scatter one per rank.
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+        async_op (bool, optional): Whether this op should be an async op.
+
+    Returns:
+        Async work handle, if async_op is set to True.
+        None, if not async_op or if not part of the group.
+
+    .. warning::
+        `all_to_all` is experimental and subject to change.
+
+    Examples:
+        >>> # xdoctest: +SKIP("Undefined rank")
+        >>> input = torch.arange(4) + rank * 4
+        >>> input = list(input.chunk(4))
+        >>> input
+        [tensor([0]), tensor([1]), tensor([2]), tensor([3])]     # Rank 0
+        [tensor([4]), tensor([5]), tensor([6]), tensor([7])]     # Rank 1
+        [tensor([8]), tensor([9]), tensor([10]), tensor([11])]   # Rank 2
+        [tensor([12]), tensor([13]), tensor([14]), tensor([15])] # Rank 3
+        >>> output = list(torch.empty([4], dtype=torch.int64).chunk(4))
+        >>> dist.all_to_all(output, input)
+        >>> output
+        [tensor([0]), tensor([4]), tensor([8]), tensor([12])]    # Rank 0
+        [tensor([1]), tensor([5]), tensor([9]), tensor([13])]    # Rank 1
+        [tensor([2]), tensor([6]), tensor([10]), tensor([14])]   # Rank 2
+        [tensor([3]), tensor([7]), tensor([11]), tensor([15])]   # Rank 3
+
+        >>> # Essentially, it is similar to following operation:
+        >>> scatter_list = input
+        >>> gather_list  = output
+        >>> for i in range(world_size):
+        >>>     dist.scatter(gather_list[i], scatter_list if i == rank else [], src=i)
+
+        >>> input
+        tensor([0, 1, 2, 3, 4, 5])                                       # Rank 0
+        tensor([10, 11, 12, 13, 14, 15, 16, 17, 18])                     # Rank 1
+        tensor([20, 21, 22, 23, 24])                                     # Rank 2
+        tensor([30, 31, 32, 33, 34, 35, 36])                             # Rank 3
+        >>> input_splits
+        [2, 2, 1, 1]                                                     # Rank 0
+        [3, 2, 2, 2]                                                     # Rank 1
+        [2, 1, 1, 1]                                                     # Rank 2
+        [2, 2, 2, 1]                                                     # Rank 3
+        >>> output_splits
+        [2, 3, 2, 2]                                                     # Rank 0
+        [2, 2, 1, 2]                                                     # Rank 1
+        [1, 2, 1, 2]                                                     # Rank 2
+        [1, 2, 1, 1]                                                     # Rank 3
+        >>> input = list(input.split(input_splits))
+        >>> input
+        [tensor([0, 1]), tensor([2, 3]), tensor([4]), tensor([5])]                   # Rank 0
+        [tensor([10, 11, 12]), tensor([13, 14]), tensor([15, 16]), tensor([17, 18])] # Rank 1
+        [tensor([20, 21]), tensor([22]), tensor([23]), tensor([24])]                 # Rank 2
+        [tensor([30, 31]), tensor([32, 33]), tensor([34, 35]), tensor([36])]         # Rank 3
+        >>> output = ...
+        >>> dist.all_to_all(output, input)
+        >>> output
+        [tensor([0, 1]), tensor([10, 11, 12]), tensor([20, 21]), tensor([30, 31])]   # Rank 0
+        [tensor([2, 3]), tensor([13, 14]), tensor([22]), tensor([32, 33])]           # Rank 1
+        [tensor([4]), tensor([15, 16]), tensor([23]), tensor([34, 35])]              # Rank 2
+        [tensor([5]), tensor([17, 18]), tensor([24]), tensor([36])]                  # Rank 3
+
+        >>> # Another example with tensors of torch.cfloat type.
+        >>> input = torch.tensor([1+1j, 2+2j, 3+3j, 4+4j], dtype=torch.cfloat) + 4 * rank * (1+1j)
+        >>> input = list(input.chunk(4))
+        >>> input
+        [tensor([1+1j]), tensor([2+2j]), tensor([3+3j]), tensor([4+4j])]            # Rank 0
+        [tensor([5+5j]), tensor([6+6j]), tensor([7+7j]), tensor([8+8j])]            # Rank 1
+        [tensor([9+9j]), tensor([10+10j]), tensor([11+11j]), tensor([12+12j])]      # Rank 2
+        [tensor([13+13j]), tensor([14+14j]), tensor([15+15j]), tensor([16+16j])]    # Rank 3
+        >>> output = list(torch.empty([4], dtype=torch.cfloat).chunk(4))
+        >>> dist.all_to_all(output, input)
+        >>> output
+        [tensor([1+1j]), tensor([5+5j]), tensor([9+9j]), tensor([13+13j])]          # Rank 0
+        [tensor([2+2j]), tensor([6+6j]), tensor([10+10j]), tensor([14+14j])]        # Rank 1
+        [tensor([3+3j]), tensor([7+7j]), tensor([11+11j]), tensor([15+15j])]        # Rank 2
+        [tensor([4+4j]), tensor([8+8j]), tensor([12+12j]), tensor([16+16j])]        # Rank 3
+
+    """
+    if _rank_not_in_group(group):
+        _warn_not_in_group("all_to_all")
+        return
+
+    opts = AllToAllOptions()
+    _check_tensor_list(output_tensor_list, "output_tensor_list")
+    _check_tensor_list(input_tensor_list, "input_tensor_list")
+    _ensure_all_tensors_same_dtype(output_tensor_list, input_tensor_list)
+
+    input_tensor_list = [
+        t if not t.is_complex() else torch.view_as_real(t) for t in input_tensor_list
+    ]
+    output_tensor_list = [
+        t if not t.is_complex() else torch.view_as_real(t) for t in output_tensor_list
+    ]
+
+    if group is None:
+        default_pg = _get_default_group()
+        work = default_pg.alltoall(output_tensor_list, input_tensor_list, opts)
+    else:
+        work = group.alltoall(output_tensor_list, input_tensor_list, opts)
+
+    if async_op:
+        return work
+    else:
+        work.wait()
+
+@_exception_logger
+def barrier(group=GroupMember.WORLD, async_op=False, device_ids=None):
+    """
+    Synchronize all processes.
+
+    This collective blocks processes until the whole group enters this function
+    (if ``async_op`` is ``False``), or until ``wait()`` is called on the returned
+    async work handle (if ``async_op`` is ``True``).
+
+    Args:
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+        async_op (bool, optional): Whether this op should be an async op
+        device_ids ([int], optional): List of device/GPU ids.
+
+    Returns:
+        Async work handle, if async_op is set to True.
+        None, if not async_op or if not part of the group
+
+    .. note:: `ProcessGroupNCCL` now relies on stream synchronization instead of
+              device synchronization to block the CPU. Thus, please do not assume that
+              `barrier()` would perform a device synchronization.
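+
+    A minimal illustrative sketch (assuming an already initialized default
+    process group):
+
+    Example::
+        >>> # xdoctest: +SKIP("need process group init")
+        >>> import torch.distributed as dist
+        >>> # Every rank blocks here until all ranks in the group have arrived.
+        >>> dist.barrier()
+        >>> # Async variant: wait() on the returned handle blocks instead.
+        >>> work = dist.barrier(async_op=True)
+        >>> work.wait()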
+    """
+    if _rank_not_in_group(group):
+        _warn_not_in_group("barrier")
+        return
+
+    opts = BarrierOptions()
+    opts.device = _get_pg_default_device(group)
+    if device_ids is not None:
+        if isinstance(device_ids, list):
+            opts.device_ids = device_ids
+        else:
+            raise TypeError(
+                "Invalid function argument: device_ids type should be List[int]"
+            )
+
+    if group is None:
+        default_pg = _get_default_group()
+        work = default_pg.barrier(opts=opts)
+    else:
+        work = group.barrier(opts=opts)
+
+    if async_op:
+        return work
+    else:
+        work.wait()
+
+
+def monitored_barrier(group=GroupMember.WORLD, timeout=None, wait_all_ranks=False):
+    """
+    Synchronize processes similar to ``torch.distributed.barrier``, but consider a configurable timeout.
+
+    It is able to report ranks that did not pass this barrier within the provided timeout.
+    Specifically, for non-zero ranks, it will block until a send/recv is processed from rank 0.
+    Rank 0 will block until all send/recv from other ranks are processed, and will report
+    failures for ranks that failed to respond in time. Note that if one rank does not reach the
+    monitored_barrier (for example due to a hang), all other ranks would fail in monitored_barrier.
+
+    This collective will block all processes/ranks in the group, until the
+    whole group exits the function successfully, making it useful for debugging
+    and synchronizing. However, it can have a performance impact and should only
+    be used for debugging or scenarios that require full synchronization points
+    on the host-side. For debugging purposes, this barrier can be inserted
+    before the application's collective calls to check if any ranks are
+    desynchronized.
+
+    .. note:: Note that this collective is only supported with the GLOO backend.
+
+    Args:
+        group (ProcessGroup, optional): The process group to work on. If
+            ``None``, the default process group will be used.
+        timeout (datetime.timedelta, optional): Timeout for monitored_barrier.
+            If ``None``, the default process group timeout will be used.
+        wait_all_ranks (bool, optional): Whether to collect all failed ranks or
+            not. By default, this is ``False`` and ``monitored_barrier`` on rank 0
+            will throw on the first failed rank it encounters in order to fail
+            fast. By setting ``wait_all_ranks=True`` ``monitored_barrier`` will
+            collect all failed ranks and throw an error containing information
+            about all failed ranks.
+
+    Returns:
+        ``None``.
+
+    Example::
+        >>> # xdoctest: +SKIP("need process group init")
+        >>> # Note: Process group initialization omitted on each rank.
+        >>> import torch.distributed as dist
+        >>> if dist.get_rank() != 1:
+        >>>     dist.monitored_barrier() # Raises exception indicating that
+        >>> # rank 1 did not call into monitored_barrier.
+        >>> # Example with wait_all_ranks=True
+        >>> if dist.get_rank() == 0:
+        >>>     dist.monitored_barrier(wait_all_ranks=True) # Raises exception
+        >>> # indicating that ranks 1, 2, ... world_size - 1 did not call into
+        >>> # monitored_barrier.
+    """
+    # Need to call rank not in group before using the group, otherwise
+    # "Invalid process group" error is raised.
+    if _rank_not_in_group(group):
+        _warn_not_in_group("monitored_barrier")
+        return
+
+    if get_backend(group) != Backend.GLOO:
+        raise ValueError("monitored_barrier is only implemented for GLOO backend.")
+
+    if timeout is None:
+        timeout = _get_default_timeout(get_backend(group))
+    elif isinstance(timeout, float):
+        # TODO(whc) apparently some existing test case for monitored_barrier passes in a timeout in float format?
+        warnings.warn(
+            "Please specify timeout arg as a timedelta. "
+            f"Converting current value of {timeout} assuming it represents seconds",
+        )
+        timeout = timedelta(seconds=timeout)
+
+    _check_valid_timeout(timeout)
+
+    group_to_use = _get_default_group() if group is None else group
+    return group_to_use.monitored_barrier(timeout, wait_all_ranks=wait_all_ranks)
+
+
+def _create_process_group_wrapper(
+    wrapped_pg: torch._C._distributed_c10d.Backend,
+    store_prefix: str,
+    store: Store,
+    rank: int,
+    world_size: int,
+    timeout: timedelta = default_pg_timeout,
+):
+    # (whc) this appears to be just for the gloo backend? if so, `default_pg_timeout` is appropriate...
+
+    # Create a separate prefix store for the helper process group.
+    prefix = f"{PG_WRAPPER_STORE_PREFIX}:{store_prefix}"
+    store = PrefixStore(prefix, store)
+    helper_pg = ProcessGroupGloo(store, rank, world_size, timeout=timeout)
+    # Wrap the underlying pg with ProcessGroupWrapper.
+    wrapped_pg = _ProcessGroupWrapper(wrapped_pg, helper_pg)
+    return wrapped_pg
+
+# helper function for deterministically hashing a list of ranks
+def _hash_ranks(ranks: List[int]):
+    return hashlib.sha1(bytes("_".join(map(str, ranks)), "utf-8")).hexdigest()
+
+# Takes a list of ranks and computes an integer color
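+# e.g. ranks [0, 1, 2] are hashed as the string "0_1_2" by ``_hash_ranks``; the
+# hex digest is interpreted as a base-16 integer and reduced modulo
+# ``sys.maxsize >> 1`` so the color is a stable, non-negative value.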
+def _process_group_color(ranks: List[int]) -> int:
+    # Convert our hash to an int, but avoid negative numbers by shifting a bit.
+    return int(_hash_ranks(ranks), 16) % (sys.maxsize >> 1)
+
+def _process_group_name(ranks, use_hashed_name):
+    global _world
+    if use_hashed_name:
+        pg_name = _hash_ranks(ranks)
+        while pg_name in _world.pg_names.values():
+            pg_name = hashlib.sha1(bytes(pg_name + "_", "utf-8")).hexdigest()
+    else:
+        pg_name = str(_world.group_count)
+        _world.group_count += 1
+    return pg_name
+
+def _get_backend_from_str(backend: Optional[str] = None) -> Backend:
+    # Default to the same backend as the global process group
+    #  if backend is not specified.
+    if not backend:
+        backend = get_backend(_get_default_group())
+    return Backend(backend)
+
+
+@_time_logger
+def new_group(ranks=None, timeout=None, backend=None, pg_options=None, use_local_synchronization=False):
+    """
+    Create a new distributed group.
+
+    This function requires that all processes in the main group (i.e. all
+    processes that are part of the distributed job) enter this function, even
+    if they are not going to be members of the group. Additionally, groups
+    should be created in the same order in all processes.
+
+    .. warning::
+        Using multiple process groups with the ``NCCL`` backend concurrently
+        is not safe and the user should perform explicit synchronization in
+        their application to ensure only one process group is used at a time.
+        This means collectives from one process group should have completed
+        execution on the device (not just enqueued since CUDA execution is
+        async) before collectives from another process group are enqueued.
+        See `Using multiple NCCL communicators concurrently <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/communicators.html#using-multiple-nccl-communicators-concurrently>`_ for more details.
+
+    Args:
+        ranks (list[int]): List of ranks of group members. If ``None``, will be
+            set to all ranks. Default is ``None``.
+        timeout (timedelta, optional): see `init_process_group` for details and default value.
+        backend (str or Backend, optional): The backend to use. Depending on
+            build-time configurations, valid values are ``gloo`` and ``nccl``.
+            By default uses the same backend as the global group. This field
+            should be given as a lowercase string (e.g., ``"gloo"``), which can
+            also be accessed via :class:`Backend` attributes (e.g.,
+            ``Backend.GLOO``). If ``None`` is passed in, the backend
+            corresponding to the default process group will be used. Default is
+            ``None``.
+        pg_options (ProcessGroupOptions, optional): process group options
+            specifying what additional options need to be passed in during
+            the construction of specific process groups. i.e. for the ``nccl``
+            backend, ``is_high_priority_stream`` can be specified so that
+            process group can pick up high priority cuda streams.
+        use_local_synchronization (bool, optional): perform a group-local
+            barrier at the end of the process group creation. This is different
+            in that non-member ranks don't need to call into this API and don't
+            join the barrier.
+
+    Returns:
+        A handle of distributed group that can be given to collective calls or None if the rank is not part of ``ranks``.
+
+    N.B. use_local_synchronization doesn't work with MPI.
+
+    N.B. While use_local_synchronization=True can be significantly faster with larger
+    clusters and small process groups, care must be taken since it changes cluster behavior
+    as non-member ranks don't join the group barrier().
+
+    N.B. use_local_synchronization=True can lead to deadlocks when each rank creates
+    multiple overlapping process groups. To avoid that, make sure all ranks follow the
+    same global creation order.
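+
+    A minimal illustrative sketch (assuming a world size of at least 2 and an
+    already initialized default process group):
+
+    Example::
+        >>> # xdoctest: +SKIP("need process group init")
+        >>> import torch.distributed as dist
+        >>> # Every rank must call new_group, even ranks not listed in ``ranks``.
+        >>> group = dist.new_group(ranks=[0, 1])
+        >>> if dist.get_rank() in (0, 1):
+        >>>     t = torch.ones(1) * dist.get_rank()
+        >>>     # Collectives on ``group`` only involve ranks 0 and 1.
+        >>>     dist.all_reduce(t, group=group)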
+    """
+    return _new_group_with_tag(ranks, timeout, backend, pg_options, None, use_local_synchronization=use_local_synchronization)
+
+def _new_group_with_tag(
+    ranks=None,
+    timeout=None,
+    backend=None,
+    pg_options=None,
+    pg_tag=None,
+    use_local_synchronization=False
+):
+    """
+    Variant of ``new_group`` that exposes tag creation.
+
+    :: N.B. The mechanism is experimental and tied to the functional collectives effort, see
+    ``torch.distributed._functional_collectives`` for reference on how to use it.
+    """
+    global _world
+
+    default_pg = _get_default_group()
+    default_backend, default_store = _world.pg_map[default_pg]
+    global_rank = default_pg.rank()
+    global_world_size = default_pg.size()
+
+
+    # Default to the same backend as the global process group
+    # if the backend is not specified.
+    if not backend:
+        backend = default_backend
+    backend = Backend(backend)
+
+    # this timeout defaulting/validation is used for all the new_groups/new_subgroups variants,
+    # which may just pass their timeout value (or None)
+    if timeout is None:
+        timeout = _get_default_timeout(backend)
+    _check_valid_timeout(timeout)
+
+    if use_local_synchronization:
+        # MPI backend doesn't have a way for us to perform a partial sync
+        if backend == Backend.MPI:
+            raise ValueError("MPI backend doesn't support use_local_synchronization=True")
+        if ranks is not None and get_rank() not in ranks:
+            return None
+
+    # checks the input ranks
+    if ranks is not None:
+        ranks = sorted(ranks)
+        group_world_size = len(ranks)
+        if group_world_size > global_world_size:
+            raise ValueError(
+                "the new group's world size should be less or "
+                "equal to the world size set by "
+                "init_process_group"
+            )
+        # check ranks' sanity
+        for rank in ranks:
+            if rank < 0 or rank >= global_world_size:
+                raise ValueError(
+                    "The new group's rank should be within "
+                    "the world_size set by init_process_group"
+                )
+        if global_rank in ranks:
+            group_rank = ranks.index(global_rank)
+        else:
+            group_rank = None
+    else:
+        ranks = list(range(global_world_size))
+        group_world_size = global_world_size
+        group_rank = global_rank
+
+    group_name = _process_group_name(ranks, use_hashed_name=use_local_synchronization)
+
+    pg, pg_store = _new_process_group_helper(
+        group_world_size,
+        group_rank,
+        ranks,
+        backend,
+        default_store,
+        group_name,
+        pg_options=pg_options,
+        timeout=timeout,
+        pg_tag=pg_tag
+    )
+
+    # Create the global rank to group rank mapping
+    _world.pg_group_ranks[pg] = {
+        global_rank: group_rank for group_rank, global_rank in enumerate(ranks)
+    }
+
+    if _is_barrier_after_init() == 1:
+        # barrier at the end to ensure that once we return from this method, all
+        # process groups including global variables (if any) are updated
+        # correctly on all ranks.
+        # Update 04/2023: for large-scale runs, this barrier (esp. store-based
+        # barrier) may be costly and/or unscalable. Also, in a lot of cases,
+        # these barriers may be unnecessary, as proven by a green CI after
+        # removal. An environment variable `TORCH_DIST_INIT_BARRIER` has been
+        # added which enables this barrier only when set to 1.
+        logger.info(
+            "Performing barrier after ProcessGroup initialization since "
+            "TORCH_DIST_INIT_BARRIER = 1"
+        )
+        if backend == Backend.MPI:
+            # MPI doesn't have store.
+            barrier()
+        else:
+            barrier_store = pg_store if use_local_synchronization else default_store
+            world_size = len(ranks) if use_local_synchronization else get_world_size()
+            # Use store based barrier here since barrier() used a bunch of
+            # default devices and messes up NCCL internal state.
+            _store_based_barrier(global_rank, barrier_store, group_name, world_size, timeout)
+
+    return pg
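+# A standalone sketch of the global-rank -> group-rank mapping built above
+# (plain Python, hypothetical rank values):
+#
+#     ranks = [2, 5, 7]                                # sorted global ranks in the group
+#     mapping = {g: i for i, g in enumerate(ranks)}    # {2: 0, 5: 1, 7: 2}
+#     assert mapping[5] == 1                           # global rank 5 is group rank 1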
+
+
+def new_subgroups(
+    group_size=None,
+    group=None,
+    timeout=None,
+    backend=None,
+    pg_options=None,
+):
+    """
+    Create subgroups of equal size.
+
+    By default, it creates intra-machine subgroups,
+    each of which contains all the ranks of a machine, based on the assumption
+    that each machine has the same number of devices.
+
+    This is a convenience API that calls ``new_group`` to generate multiple subgroups.
+    It requires that all processes in the main group (i.e. all
+    processes that are part of the distributed job) enter this function, even
+    if they are not going to be members of the group.
+
+    .. warning::
+        If ``group_size`` is passed in, the world size must be divisible by ``group_size``.
+        If no ``group_size`` is passed in, it is assumed that you are creating groups based
+        on CUDA, with the group size determined by the number of CUDA devices; if not all
+        the machines have the same number of devices, the subgroup division will be
+        different across nodes and can cause unexpected behaviors. Therefore, if you are
+        creating a subgroup that does not depend on CUDA (such as Gloo on CPU), please
+        pass in ``group_size`` correctly.
+
+    .. warning::
+        Using multiple process groups with the ``NCCL`` backend concurrently
+        is not safe and the user should perform explicit synchronization in
+        their application to ensure only one process group is used at a time.
+        This means collectives from one process group should have completed
+        execution on the device (not just enqueued since CUDA execution is
+        async) before collectives from another process group are enqueued.
+        See `Using multiple NCCL communicators concurrently `_ for more details.
+
+    Args:
+        group_size (int, optional): The size of each subgroup. If ``None``,
+            the default subgroup size is equal to the number of devices on each machine,
+            based on the assumption that each machine has exactly the same
+            number of devices. Default is ``None``.
+        timeout (timedelta, optional): see `init_process_group` for details and default value.
+        backend (str or Backend, optional): The backend to use. Depending on
+            build-time configurations, valid values are ``gloo`` and ``nccl``.
+            By default uses the same backend as the global group. This field
+            should be given as a lowercase string (e.g., ``"gloo"``), which can
+            also be accessed via :class:`Backend` attributes (e.g.,
+            ``Backend.GLOO``). If ``None`` is passed in, the backend
+            corresponding to the default process group will be used. Default is
+            ``None``.
+        pg_options (ProcessGroupOptions, optional): process group options
+            specifying what additional options need to be passed in during
+            the construction of specific process groups. i.e. for the ``nccl``
+            backend, ``is_high_priority_stream`` can be specified so that
+            process group can pick up high priority cuda streams.
+
+    Returns:
+        The subgroup containing the current rank, and all the subgroups used for cleanup.
+
+    Examples:
+        >>> # Create intra-machine subgroups.
+        >>> # xdoctest: +SKIP("need process group init")
+        >>> cur_subgroup, subgroups = dist.new_subgroups()
+        >>> # Allreduce within the machine.
+        >>> rank = dist.get_rank()
+        >>> tensor = torch.ones(1, device=rank) * rank
+        >>> dist.all_reduce(tensor, group=cur_subgroup)
+        >>> tensor
+        tensor([8])     # Assume 8 is the number of CUDA devices per machine.
+        >>> # Cleanup.
+        >>> for subgroup in subgroups:
+        >>>     dist.destroy_process_group(subgroup)
+    """
+    if group_size is None:
+        if not torch.cuda.is_available():
+            raise ValueError("Default group size only takes effect when CUDA is available."
+                             "If your subgroup using a backend that does not depend on CUDA,"
+                             "please pass in 'group_size' correctly.")
+        group_size = torch.cuda.device_count()
+    if group_size <= 0:
+        raise ValueError(f"The arg 'group_size' ({group_size}) must be positive")
+
+    world_size = get_world_size()
+    if world_size < group_size:
+        raise ValueError(f"The arg 'group_size' ({group_size}) must not exceed the world size ({world_size})")
+    if world_size % group_size != 0:
+        raise ValueError("The world size must be divisible by 'group_size'")
+
+    subgroups = []
+    cur_subgroup = None
+
+    for subgroup_id in range(world_size // group_size):
+        start_rank = subgroup_id * group_size
+        end_rank = start_rank + group_size
+        ranks_in_subgroup = list(range(start_rank, end_rank))
+        subgroup = new_group(
+            ranks=ranks_in_subgroup,
+            timeout=timeout,
+            backend=backend,
+            pg_options=pg_options,
+        )
+        subgroups.append(subgroup)
+
+        rank = get_rank()
+        if rank in ranks_in_subgroup:
+            cur_subgroup = subgroup
+            logger.info(
+                "Rank %s is assigned to subgroup %s",
+                rank, ranks_in_subgroup
+            )
+
+    return cur_subgroup, subgroups
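+# A standalone sketch of how ``new_subgroups`` partitions ranks into contiguous
+# blocks (pure arithmetic, illustrative values; no process group is needed):
+#
+#     world_size, group_size = 8, 4
+#     blocks = [list(range(i * group_size, (i + 1) * group_size))
+#               for i in range(world_size // group_size)]
+#     assert blocks == [[0, 1, 2, 3], [4, 5, 6, 7]]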
+
+
+def new_subgroups_by_enumeration(
+    ranks_per_subgroup_list,
+    timeout=None,
+    backend=None,
+    pg_options=None,
+):
+    """
+    Create subgroups by dividing the global world.
+
+    The division is specified by a nested list of ranks. The subgroups cannot have
+    overlap, and some ranks may be left out of all subgroups.
+
+    This is a convenience API that calls ``new_group`` to generate multiple subgroups.
+    It requires that all processes in the main group (i.e. all
+    processes that are part of the distributed job) enter this function, even
+    if they are not going to be members of the group.
+
+    .. warning::
+        Using multiple process groups with the ``NCCL`` backend concurrently
+        is not safe and the user should perform explicit synchronization in
+        their application to ensure only one process group is used at a time.
+        This means collectives from one process group should have completed
+        execution on the device (not just enqueued since CUDA execution is
+        async) before collectives from another process group are enqueued.
+        See `Using multiple NCCL communicators concurrently `_ for more details.
+
+    Args:
+        ranks_per_subgroup_list (list[list[int]]): A nested list of ranks of
+            group members.
+        timeout (timedelta, optional): see `init_process_group` for details and default value.
+        backend (str or Backend, optional): The backend to use. Depending on
+             build-time configurations, valid values are ``gloo`` and ``nccl``.
+             By default uses the same backend as the global group. This field
+             should be given as a lowercase string (e.g., ``"gloo"``), which can
+             also be accessed via :class:`Backend` attributes (e.g.,
+             ``Backend.GLOO``). If ``None`` is passed in, the backend
+             corresponding to the default process group will be used. Default is
+             ``None``.
+        pg_options (ProcessGroupOptions, optional): process group options
+            specifying what additional options need to be passed in during
+            the construction of specific process groups. i.e. for the ``nccl``
+            backend, ``is_high_priority_stream`` can be specified so that
+            process group can pick up high priority cuda streams.
+
+    Returns:
+        The subgroup containing the current rank, and all the subgroups used for cleanup.
+
+    Examples:
+        >>> # Create two subgroups, where each has 2 processes.
+        >>> # xdoctest: +SKIP("need process group init")
+        >>> cur_subgroup, subgroups = dist.new_subgroups_by_enumeration([[0, 2], [1, 3]])
+        >>> rank = dist.get_rank()
+        >>> tensor = torch.ones(1, device=rank) * rank
+        >>> dist.all_reduce(tensor, group=cur_subgroup)
+        >>> tensor
+        tensor([2])     # Subgroup 0: ranks 0 and 2
+        tensor([4])     # Subgroup 1: ranks 1 and 3
+    """
+    if ranks_per_subgroup_list is None or len(ranks_per_subgroup_list) == 0:
+        raise ValueError("The arg 'ranks_per_subgroup_list' cannot be empty")
+
+    subgroups = []
+    cur_subgroup = None
+    # Create a mapping from rank to subgroup to check if there is any subgroup overlap.
+    rank_to_ranks_dict = {}  # type: ignore[var-annotated]
+    for ranks in ranks_per_subgroup_list:
+        subgroup = new_group(
+            ranks=ranks,
+            timeout=timeout,
+            backend=backend,
+            pg_options=pg_options,
+        )
+        subgroups.append(subgroup)
+        my_rank = get_rank()
+        for rank in ranks:
+            if rank in rank_to_ranks_dict:
+                raise ValueError(
+                    f"Rank {rank} has appeared in both subgroup {rank_to_ranks_dict[rank]} and {ranks}"
+                )
+            rank_to_ranks_dict[rank] = ranks
+            if my_rank == rank:
+                cur_subgroup = subgroup
+                logger.info("Rank %s is assigned to subgroup %s", rank, ranks)
+
+    return cur_subgroup, subgroups
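+# A minimal usage sketch for ``new_subgroups_by_enumeration`` (illustrative only;
+# assumes an already-initialized job with 4 ranks, and every rank makes the same
+# call in the same order):
+#
+#     cur_subgroup, subgroups = new_subgroups_by_enumeration([[0, 2], [1, 3]])
+#     # ``cur_subgroup`` is the group containing this rank; overlapping lists such as
+#     # [[0, 1], [1, 2]] would raise ValueError because rank 1 appears twice.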
+
+
+def _find_pg_by_ranks_and_tag(tag: str, ranks: List[int]) -> Optional[ProcessGroup]:
+    if len(tag) > 0 and not tag.startswith("ptd:") and not tag.startswith("user:"):
+        tag = f"user:{tag}"
+
+    for group in _world.tags_to_pg.get(tag, []):
+        if group.size() != len(ranks):
+            continue
+
+        group_ranks = get_process_group_ranks(group)
+        good = all(r in group_ranks for r in ranks)
+        if good:
+            return group
+    return None
+
+def _find_or_create_pg_by_ranks_and_tag(tag: str, ranks: List[int], stride: int) -> ProcessGroup:
+    assert len(ranks) % stride == 0, f"Ranks length ({len(ranks)}) must be divisible by stride ({stride})"
+
+    my_rank = get_rank()
+    my_ranks = None
+
+    if stride == len(ranks):
+        my_ranks = ranks.copy()
+        assert my_rank in my_ranks, "rankset doesn't include the current node"
+    else:
+        for i in range(0, len(ranks), stride):
+            rank_set = ranks[i : i + stride]
+            if my_rank in rank_set:
+                my_ranks = rank_set
+        assert my_ranks is not None, "rankset doesn't include the current node"
+
+    my_ranks.sort()
+
+    pg = _find_pg_by_ranks_and_tag(tag, my_ranks)
+    if pg is not None:
+        return pg
+    if tag == "":
+        raise ValueError("Cannot automatically create PG with empty tag")
+    # TODO copy settings and timeout from default PG
+    return _new_group_with_tag(my_ranks, pg_tag=tag)
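+# A standalone sketch of the stride-based chunking used above (plain Python,
+# illustrative values):
+#
+#     ranks, stride = [0, 1, 2, 3, 4, 5], 3
+#     chunks = [ranks[i:i + stride] for i in range(0, len(ranks), stride)]
+#     assert chunks == [[0, 1, 2], [3, 4, 5]]
+#     # the current rank's chunk is the one that contains get_rank()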
+
+def _get_group_tag(pg: ProcessGroup) -> str:
+    """Return the tag associated with ``pg``."""
+    tag = _world.pg_to_tag[pg]
+    if tag.startswith("user:"):
+        tag = tag[5:]
+    return tag
+
+def _get_process_group_name(pg: ProcessGroup) -> str:
+    return _world.pg_names.get(pg, "None")
+
+def _get_process_group_store(pg: ProcessGroup) -> Store:
+    return _world.pg_map[pg][1]
+
+# These ops are not friendly to TorchDynamo, so we disallow them in the FX graph
+# and let them run eagerly under torch.compile.
+dynamo_unsupported_distributed_c10d_ops = [
+    recv,
+    all_gather_object,
+    all_gather_coalesced,
+    all_to_all_single,
+    all_reduce,
+    gather_object,
+    all_to_all,
+    all_reduce_coalesced,
+    gather,
+    broadcast_object_list,
+    barrier,
+    scatter,
+    scatter_object_list,
+    reduce,
+    all_gather,
+    reduce_scatter,
+    all_gather_into_tensor,
+    broadcast,
+    reduce_scatter_tensor,
+    send,
+]
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/__init__.py b/MLPY/Lib/site-packages/torch/distributed/elastic/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..427e1745c4a2631cd006e0c856c248d7e2968c11
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/__init__.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+
+Torchelastic agent and user worker failover contract:
+
+**TL;DR;**:
+
+* TE (torchelastic) expects user workers to finish within a 5 minute drift of each other.
+* It is better to design a DDP app to fail for all workers rather than for a single one.
+* TE does not synchronize the number of restarts between agents.
+* A TE re-rendezvous does not trigger a restart decrease.
+* When a single agent finishes its job (successfully or not), it will close the rendezvous.
+  If other agents still have workers in progress, they will be terminated.
+* Based on the above, scale down does not work if at least a single agent finishes the job.
+* When scale up is detected by the agents, they will not decrease ``max_restarts``.
+
+
+In general TE (torchelastic) can launch arbitrary user code, but some clarification
+is needed around what failover mechanism torchelastic provides and what failover
+mechanism it expects from user workers.
+
+Torchelastic currently supports DDP style applications.  That means that
+TE expects *ALL* workers to finish at approximately the same time. In practice,
+it is nearly impossible to guarantee that all workers in an arbitrary
+DDP application finish at the same time, so TE provides a finalization barrier
+that waits for TIMEOUT (5 minutes) for worker finalization.
+
+**Worker Failure**
+
+When a worker fails, TE will check the number of restarts
+available; if more than 0 restarts remain, TE will start a new rendezvous
+round and restart the worker process. The new rendezvous round will cause other
+TE agents to terminate their workers.
+
+.. note:: TE agents do not synchronize restarts between themselves.
+          When a single agent performs a restart, it will trigger a local ``max_restarts``
+          decrease; other agents will not decrease their ``max_restarts``.
+
+A single worker failure can cause the whole cluster to fail:
+If a single worker is constantly failing, it will cause the TE agent
+``max_restarts``  to go to zero. This will cause an agent to finish its
+work and close rendezvous. If there are any other workers on different
+agents, they will be terminated.
+
+
+**Re-Rendezvous**
+
+Re-rendezvous occurs when TE agents detect a new node
+trying to join the cluster. TE will not decrease ``max_restarts``. TE agents
+will terminate their workers and start a new rendezvous round.
+
+Note about DynamicRendezvous (etcd-v2, c10d-experimental): if the rendezvous
+already has max_nodes, the new node won't be added to the wait list right
+away since there is no need to tear down a rendezvous that is already fully
+utilized. The new node will wait until its timeout (600 secs by default)
+and periodically check the number of participants. If the number becomes
+less than max_nodes, it will be added to the wait list; otherwise, it will time out after 600 secs.
+
+*Scale up event*. When a scale up event happens, the torchelastic rendezvous
+will detect that there are new nodes trying to join. The torchelastic agent
+will stop all workers and perform a re-rendezvous. Note: when a scale up event
+happens, *``max_restarts``* will *not* decrease.
+
+*Scale down event*. When a scale down event happens, the rendezvous will not
+notify the torchelastic agent about it. If the TE agent was launched with ``max_restarts=0``,
+it relies on the underlying scheduler to handle the job restart. If ``max_restarts>0``, the
+TE agent will terminate its workers and start a new rdzv round, which is a *scale up event*.
+
+"""
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cf1d28816b3dba643896be4bb77447933eef2572
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/agent/__init__.py b/MLPY/Lib/site-packages/torch/distributed/elastic/agent/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/agent/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/agent/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..52b18e9d0fbf56ad260176bcf1f4f5baed037d79
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/agent/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/agent/server/__init__.py b/MLPY/Lib/site-packages/torch/distributed/elastic/agent/server/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..21d5992f195597fd71aa809fe1b2dcedb4d6a8ef
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/agent/server/__init__.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+The elastic agent is the control plane of torchelastic.
+
+It is a process that launches and manages underlying worker processes.
+The agent is responsible for:
+
+1. Working with distributed torch: the workers are started with all the
+   necessary information to successfully and trivially call
+   ``torch.distributed.init_process_group()``.
+
+2. Fault tolerance: monitors workers and upon detecting worker failures
+   or unhealthiness, tears down all workers and restarts everyone.
+
+3. Elasticity: Reacts to membership changes and restarts workers with the new
+   members.
+
+The simplest agents are deployed per node and work with local processes.
+A more advanced agent can launch and manage workers remotely. Agents can
+be completely decentralized, making decisions based on the workers they manage,
+or they can be coordinated, communicating with other agents (that manage workers
+in the same job) to make a collective decision.
+"""
+
+from .api import (  # noqa: F401
+    ElasticAgent,
+    RunResult,
+    SimpleElasticAgent,
+    Worker,
+    WorkerGroup,
+    WorkerSpec,
+    WorkerState,
+)
+from .local_elastic_agent import TORCHELASTIC_ENABLE_FILE_TIMER, TORCHELASTIC_TIMER_FILE
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/agent/server/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/agent/server/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4acc402160714660e11bd9ca87abe070907f1ea4
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/agent/server/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/agent/server/__pycache__/api.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/agent/server/__pycache__/api.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9d4fa7de067a2dd8a14354b4e6e24f51e7016da3
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/agent/server/__pycache__/api.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/agent/server/__pycache__/local_elastic_agent.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/agent/server/__pycache__/local_elastic_agent.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e0ba513f9458a8d57a9b51f8fac58b3e866d488c
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/agent/server/__pycache__/local_elastic_agent.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/agent/server/api.py b/MLPY/Lib/site-packages/torch/distributed/elastic/agent/server/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d684d1aac0db50e2dcd634314952df2b3e5e8a6
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/agent/server/api.py
@@ -0,0 +1,954 @@
+# mypy: ignore-errors
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import abc
+import functools
+import json
+import os
+import signal
+import socket
+import time
+import traceback
+import warnings
+from contextlib import closing
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch.distributed.elastic.rendezvous as rdzv
+import torch.distributed.elastic.utils.store as store_util
+from torch.distributed.elastic.rendezvous import RendezvousGracefulExitError
+from torch.distributed import Store
+from torch.distributed.elastic.events import Event, EventSource, record
+from torch.distributed.elastic.metrics import prof, put_metric
+from torch.distributed.elastic.multiprocessing import (
+    ProcessFailure,
+    SignalException,
+)
+from torch.distributed.elastic.utils.logging import get_logger
+
+__all__ = [
+    "WorkerSpec",
+    "Worker",
+    "WorkerState",
+    "WorkerGroup",
+    "RunResult",
+    "ElasticAgent",
+    "SimpleElasticAgent",
+]
+_TERMINAL_STATE_SYNC_ID = "torchelastic/agent/terminal_state"
+
+DEFAULT_ROLE = "default"
+log = get_logger(__name__)
+
+
+@dataclass
+class WorkerSpec:
+    """Blueprint information about a particular type of worker.
+
+    For a given role, there must exist only a single worker spec.
+    The worker spec is expected to be homogeneous across all nodes (machines),
+    that is, each node runs the same number of workers for a particular spec.
+
+    Args:
+        role: user-defined role for the workers with this spec
+        local_world_size: number of local workers to run
+        fn: (deprecated, use ``entrypoint`` instead)
+        entrypoint: worker function or command
+        args: arguments to pass to ``entrypoint``
+        rdzv_handler: handles rdzv for this set of workers
+        max_restarts: number of max retries for the workers
+        monitor_interval: monitor status of workers every ``n`` seconds
+        master_port: fixed port to run the c10d store on rank 0;
+                     if not specified, a random free port will be chosen
+        master_addr: fixed master_addr to run the c10d store on rank 0;
+                     if not specified, the hostname of the agent on rank 0 will be used
+        redirects: redirect std streams to a file,
+                   selectively redirect for a particular
+                   local rank by passing a map
+        tee: tees the specified std stream(s) to console + file,
+             selectively tee for a particular local rank by passing a map,
+             takes precedence over ``redirects`` settings.
+
+    """
+
+    role: str
+    local_world_size: int
+    rdzv_handler: rdzv.RendezvousHandler
+    fn: Optional[Callable] = None
+    # TODO @kiuk - make entrypoint a required field
+    entrypoint: Union[Callable, str, None] = None
+    args: Tuple = ()
+    max_restarts: int = 3
+    monitor_interval: float = 30.0
+    master_port: Optional[int] = None
+    master_addr: Optional[str] = None
+    local_addr: Optional[str] = None
+
+    def __post_init__(self):
+        assert self.local_world_size > 0
+        assert self.monitor_interval > 0
+
+        if self.fn:
+            warnings.warn(
+                "WorkerSpec.fn will be deprecated,"
+                " please use WorkerSpec.entrypoint instead",
+                category=DeprecationWarning,
+            )
+            self.entrypoint = self.fn
+        assert self.entrypoint
+
+    def get_entrypoint_name(self):
+        """Get the entry point name.
+
+        If the entrypoint is a function (e.g. ``Callable``) returns its ``__qualname__``
+        else if the entrypoint is a binary (e.g. ``str``), returns the binary name.
+        """
+        if isinstance(self.entrypoint, str):
+            return os.path.basename(self.entrypoint)
+        else:
+            assert self.entrypoint is not None
+            return self.entrypoint.__qualname__
+
+
+class Worker:
+    """A worker instance.
+
+    Contrast this with ``WorkerSpec`` that represents the specifications of a
+    worker. A ``Worker`` is created from a ``WorkerSpec``. A ``Worker`` is to
+    a ``WorkerSpec`` as an object is to a class.
+
+    The ``id`` of the worker is interpreted
+    by the specific implementation of ``ElasticAgent``. For a local
+    agent, it could be the ``pid (int)`` of the worker, for a remote
+    agent it could be encoded as ``host:port (string)``.
+
+    Args:
+        id (Any): uniquely identifies a worker (interpreted by the agent)
+        local_rank (int): local rank of the worker
+        global_rank (int): global rank of the worker
+        role_rank (int): rank of the worker across all workers that have the same role
+        world_size (int): number of workers (globally)
+        role_world_size (int): number of workers that have the same role
+    """
+
+    __slots__ = [
+        "id",
+        "local_rank",
+        "global_rank",
+        "role_rank",
+        "world_size",
+        "role_world_size",
+    ]
+
+    def __init__(
+        self,
+        local_rank: int,
+        global_rank: int = -1,
+        role_rank: int = -1,
+        world_size: int = -1,
+        role_world_size: int = -1,
+    ):
+        # unique identifier for this worker
+        self.id: Any = None
+
+        # rank of the worker among workers with the same role being monitored
+        # by the same ``agent`` instance.
+        self.local_rank: int = local_rank
+
+        #  rank of the worker among all the workers across all roles
+        #  across all ``agent`` instances.
+        #  Global rank is not stable between re-rendezvous.
+        self.global_rank: int = global_rank
+
+        #  rank of the worker among all the workers with the same role
+        #  across all ``agent`` instances.
+        #  Role rank is not stable between re-rendezvous.
+        self.role_rank: int = role_rank
+
+        # total number of workers (globally). Due to elasticity
+        # the world size may change between re-rendezvous.
+        self.world_size: int = world_size
+
+        # total number of workers that share the same role. Due to elasticity
+        # the role world size may change between re-rendezvous.
+        self.role_world_size: int = role_world_size
+
+    def __str__(self):
+        return (
+            f"local_rank={self.local_rank},global_rank={self.global_rank}"
+            f",role_rank={self.role_rank},world_size={self.world_size}"
+            f",role_world_size={self.role_world_size}"
+        )
+
+    def __repr__(self):
+        return str(self)
+
+
+class WorkerState(str, Enum):
+    """A state of the ``WorkerGroup``.
+
+    Workers in a worker group change state as a unit. If a single worker
+    in a worker group fails the entire set is considered failed::
+
+      UNKNOWN - agent lost track of worker group state, unrecoverable
+      INIT - worker group object created not yet started
+      HEALTHY - workers running and healthy
+      UNHEALTHY - workers running and unhealthy
+      STOPPED - workers stopped (interrupted) by the agent
+      SUCCEEDED - workers finished running (exit 0)
+      FAILED - workers failed to successfully finish (exit !0)
+
+
+    A worker group starts from an initial ``INIT`` state,
+    then progresses to ``HEALTHY`` or ``UNHEALTHY`` states,
+    and finally reaches a terminal ``SUCCEEDED`` or ``FAILED`` state.
+
+    Worker groups can be interrupted and temporarily put into ``STOPPED`` state
+    by the agent. Workers in ``STOPPED`` state are scheduled to be restarted
+    in the near future by the agent. Some examples of workers being put into
+    ``STOPPED`` state are:
+
+    1. Worker group failure|unhealthy observed
+    2. Membership change detected
+
+    When an action (start, stop, rdzv, retry, etc.) on the worker group fails
+    and results in the action being partially applied to the worker group,
+    the state will be ``UNKNOWN``. Typically this happens on uncaught/unhandled
+    exceptions during state change events on the agent. The agent is not
+    expected to recover worker groups in the ``UNKNOWN`` state and is better off
+    self-terminating and allowing the job manager to retry the node.
+    """
+
+    UNKNOWN = "UNKNOWN"
+    INIT = "INIT"
+    HEALTHY = "HEALTHY"
+    UNHEALTHY = "UNHEALTHY"
+    STOPPED = "STOPPED"
+    SUCCEEDED = "SUCCEEDED"
+    FAILED = "FAILED"
+
+    @staticmethod
+    def is_running(state: "WorkerState") -> bool:
+        """Return the state of the Worker.
+
+        Returns:
+             True if the worker state represents workers still running
+             (e.g. that the process exists but not necessarily healthy).
+        """
+        return state in {WorkerState.HEALTHY, WorkerState.UNHEALTHY}
+
+
+class WorkerGroup:
+    """A set of ``Worker`` instances.
+
+    The class defines a set of ``Worker`` instances for the given ``WorkerSpec`` managed by ``ElasticAgent``. Whether the worker
+    group contains cross instance workers or not depends on the implementation of the agent.
+    """
+
+    __slots__ = ["spec", "workers", "store", "group_rank", "group_world_size", "state"]
+
+    def __init__(self, spec: WorkerSpec):
+        self.spec = spec
+        self.workers = [Worker(local_rank=i) for i in range(self.spec.local_world_size)]
+
+        # assigned after rdzv
+        self.store = None
+        self.group_rank = None
+        self.group_world_size = None
+
+        self.state = WorkerState.INIT
+
+
+class _RoleInstanceInfo:
+    """The class is used by the agent to exchange the information with other agents.
+
+    The information is used to determine the rank of the workers that agent
+    manages in heterogeneous environments, where different agents can have
+    different number of workers.
+    """
+
+    __slots__ = ["role", "rank", "local_world_size"]
+
+    def __init__(self, role: str, rank: int, local_world_size: int):
+        r"""Initialize the agent class instance.
+
+        Args:
+            role (str): user-defined role for the workers with this spec
+            rank (int): the rank of the agent
+            local_world_size (int): number of local workers to run
+        """
+        self.role = role
+        self.rank = rank
+        self.local_world_size = local_world_size
+
+    def serialize(self) -> bytes:
+        dict_data = {
+            "role": self.role,
+            "rank": self.rank,
+            "local_world_size": self.local_world_size,
+        }
+        return json.dumps(dict_data).encode(encoding="UTF-8")
+
+    @staticmethod
+    def deserialize(data: bytes):
+        dict_data = json.loads(data.decode(encoding="UTF-8"))
+        return _RoleInstanceInfo(
+            dict_data["role"], dict_data["rank"], dict_data["local_world_size"]
+        )
+
+    @staticmethod
+    def compare(obj1, obj2) -> int:
+        if obj1.role == obj2.role:
+            return obj1.rank - obj2.rank
+        elif obj1.role > obj2.role:
+            return 1
+        else:
+            return -1
+
+    @staticmethod
+    def find_role_boundaries(roles_infos: List, role: str) -> Tuple[int, int]:
+        start_idx, end_idx = -1, -1
+        for idx, role_info in enumerate(roles_infos):
+            if role_info.role == role:
+                if start_idx == -1:
+                    start_idx = idx
+                end_idx = idx
+        return (start_idx, end_idx)
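+    # A standalone sketch of how ``_RoleInstanceInfo`` objects round-trip through the
+    # store and how they are ordered (illustrative values only):
+    #
+    #     info = _RoleInstanceInfo(role="trainer", rank=1, local_world_size=8)
+    #     assert _RoleInstanceInfo.deserialize(info.serialize()).rank == 1
+    #     # two-level (role, rank) ordering:
+    #     # sorted(infos, key=functools.cmp_to_key(_RoleInstanceInfo.compare))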
+
+
+@dataclass
+class RunResult:
+    """Return results of the worker executions.
+
+    Run results follow an "all-or-nothing" policy where the run is successful if and
+    only if ALL local workers managed by this agent complete successfully.
+
+    If the result is successful (e.g. ``is_failed() = False``) then the ``return_values``
+    field contains the outputs (return values) of the workers managed by THIS agent mapped
+    by their GLOBAL ranks. That is ``result.return_values[0]`` is the return value of
+    global rank 0.
+
+    .. note:: ``return_values`` are only meaningful when the worker entrypoint
+              is a function. Workers specified as a binary entrypoint do not canonically
+              have a return value and the ``return_values`` field is meaningless and
+              may be empty.
+
+    If ``is_failed()`` returns ``True`` then the ``failures`` field contains the
+    failure information, again, mapped by the GLOBAL rank of the worker that failed.
+
+    The keys in ``return_values`` and ``failures`` are mutually exclusive, that is,
+    a worker's final state can only be one of: succeeded, failed. Workers intentionally
+    terminated by the agent according to the agent's restart policy are not represented
+    in either ``return_values`` or ``failures``.
+    """
+
+    state: WorkerState
+    return_values: Dict[int, Any] = field(default_factory=dict)
+    failures: Dict[int, ProcessFailure] = field(default_factory=dict)
+
+    def is_failed(self) -> bool:
+        return self.state == WorkerState.FAILED
+
+
+def _get_socket_with_port() -> socket.socket:
+    """Return a free port on localhost.
+
+    The free port is "reserved" by binding a temporary socket on it.
+    Close the socket before passing the port to the entity that
+    requires it. Usage example::
+
+    sock = _get_socket_with_port()
+    with closing(sock):
+        port = sock.getsockname()[1]
+        sock.close()
+        # there is still a race-condition that some other process
+        # may grab this port before func() runs
+        func(port)
+    """
+    addrs = socket.getaddrinfo(
+        host="localhost", port=None, family=socket.AF_UNSPEC, type=socket.SOCK_STREAM
+    )
+    for addr in addrs:
+        family, type, proto, _, _ = addr
+        s = socket.socket(family, type, proto)
+        try:
+            s.bind(("localhost", 0))
+            s.listen(0)
+            return s
+        except OSError as e:
+            s.close()
+            log.info("Socket creation attempt failed.", exc_info=e)
+    raise RuntimeError("Failed to create a socket")
+
+
+def _get_fq_hostname() -> str:
+    return socket.getfqdn(socket.gethostname())
+
+
+class ElasticAgent(abc.ABC):
+    """An agent process responsible for managing one or more worker processes.
+
+    The worker processes are assumed to be regular distributed PyTorch scripts.
+    When the worker process is created by the agent, the agent provides the
+    necessary information for the worker processes to properly initialize
+    a torch process group.
+
+    The exact deployment topology and ratio of agent-to-worker is dependent
+    on the specific implementation of the agent and the user's job placement
+    preferences. For instance, to run a distributed training job on GPU with
+    8 trainers (one per GPU) one can:
+
+    1. Use 8 x single GPU instances, place an agent per instance, managing
+       1 worker per agent.
+    2. Use 4 x double GPU instances, place an agent per instance, managing
+       2 workers per agent.
+    3. Use 2 x quad GPU instances, place an agent per instance, managing
+       4 workers per agent.
+    4. Use 1 x 8 GPU instance, place an agent per instance, managing
+       8 workers per agent.
+
+    Usage
+    ::
+
+     group_result = agent.run()
+     if group_result.is_failed():
+         # workers failed
+         failure = group_result.failures[0]
+         log.exception("worker 0 failed with exit code : %s", failure.exit_code)
+     else:
+         return group_result.return_values[0]  # return rank 0's results
+
+    """
+
+    @abc.abstractmethod
+    def run(self, role: str = DEFAULT_ROLE) -> RunResult:
+        """Run the agent.
+
+        Supports retrying the worker group on failures up to ``max_restarts``.
+
+        Returns:
+            The result of the execution, containing the return values or
+            failure details for each worker mapped by the worker's global rank.
+
+        Raises:
+            Exception - any other failures NOT related to worker process
+        """
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def get_worker_group(self, role: str = DEFAULT_ROLE) -> WorkerGroup:
+        """Return the ``WorkerGroup`` for the given ``role``.
+
+        Note that the worker group is a mutable object and hence in a
+        multi-threaded/process environment it may change state.
+        Implementors are encouraged (but not required) to return
+        a defensive read-only copy.
+        """
+        raise NotImplementedError()
+
+
+class SimpleElasticAgent(ElasticAgent):
+    """An ``ElasticAgent`` that manages one particular type of worker role.
+
+    An ``ElasticAgent`` that manages workers (``WorkerGroup``) for a single ``WorkerSpec``
+    such as one particular type of worker role.
+    """
+
+    def __init__(self, spec: WorkerSpec, exit_barrier_timeout: float = 300):
+        self._worker_group = WorkerGroup(spec)
+        self._remaining_restarts = self._worker_group.spec.max_restarts
+        self._store = None
+        self._exit_barrier_timeout = exit_barrier_timeout
+        self._total_execution_time = 0
+
+    def get_worker_group(self, role: str = DEFAULT_ROLE) -> WorkerGroup:
+        return self._worker_group
+
+    @abc.abstractmethod
+    def _start_workers(self, worker_group: WorkerGroup) -> Dict[int, Any]:
+        r"""Start ``worker_group.spec.local_world_size`` number of workers.
+
+        This is according to worker spec for the worker group .
+        Returns a map of ``local_rank`` to worker ``id``.
+        """
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def _stop_workers(self, worker_group: WorkerGroup) -> None:
+        r"""Stop all workers in the given worker group.
+
+        Implementors must deal with workers in all states defined by
+        ``WorkerState``. That is, it must gracefully handle stopping
+        non-existent workers, unhealthy (stuck) workers, etc.
+        """
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def _monitor_workers(self, worker_group: WorkerGroup) -> RunResult:
+        r"""Check on the workers for the ``worker_group``.
+
+        This function also returns the new state of the worker group.
+        """
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def _shutdown(self, death_sig: signal.Signals = signal.SIGTERM) -> None:
+        """Clean up any resources that were allocated during the agent's work.
+
+        Args:
+            death_sig: Signal to send to the child process, SIGTERM is default
+        """
+        raise NotImplementedError()
+
+    @staticmethod
+    def _set_master_addr_port(
+        store: Store,
+        master_addr: Optional[str],
+        master_port: Optional[int],
+        local_addr: Optional[str],
+    ):
+        if master_port is None:
+            sock = _get_socket_with_port()
+            with closing(sock):
+                master_port = sock.getsockname()[1]
+
+        if master_addr is None:
+            # If the user specified an address for the local node, use it as the master addr
+            if local_addr:
+                master_addr = local_addr
+            else:
+                master_addr = _get_fq_hostname()
+
+        store.set("MASTER_ADDR", master_addr.encode(encoding="UTF-8"))
+        store.set("MASTER_PORT", str(master_port).encode(encoding="UTF-8"))
+
+    @staticmethod
+    def _get_master_addr_port(store: Store) -> Tuple[str, int]:
+        master_addr = store.get("MASTER_ADDR").decode(encoding="UTF-8")
+        master_port = int(store.get("MASTER_PORT").decode(encoding="UTF-8"))
+        return (master_addr, master_port)
+
+    # pyre-fixme[56]: Pyre was not able to infer the type of the decorator
+    #  `torch.distributed.elastic.metrics.prof`.
+    @prof
+    def _rendezvous(self, worker_group: WorkerGroup) -> None:
+        r"""Run rendezvous for the workers specified by the worker spec.
+
+        Assigns workers a new global rank and world size.
+        Updates the rendezvous store for the worker group.
+        """
+        spec = worker_group.spec
+
+        store, group_rank, group_world_size = spec.rdzv_handler.next_rendezvous()
+        self._store = store
+
+        workers = self._assign_worker_ranks(store, group_rank, group_world_size, spec)
+        worker_group.workers = workers
+        worker_group.store = store
+        worker_group.group_rank = group_rank
+        worker_group.group_world_size = group_world_size
+
+        if group_rank == 0:
+            self._set_master_addr_port(
+                store,
+                spec.master_addr,
+                spec.master_port,
+                spec.local_addr,
+            )
+
+        master_addr, master_port = self._get_master_addr_port(store)
+        restart_count = spec.max_restarts - self._remaining_restarts
+
+        log.info(
+            "[%(role)s] Rendezvous complete for workers. Result:\n"
+            "  restart_count=%(restart_count)s\n"
+            "  master_addr=%(master_addr)s\n"
+            "  master_port=%(master_port)s\n"
+            "  group_rank=%(group_rank)s\n"
+            "  group_world_size=%(group_world_size)s\n"
+            "  local_ranks=%(local_ranks)s\n"
+            "  role_ranks=%(role_ranks)s\n"
+            "  global_ranks=%(global_ranks)s\n"
+            "  role_world_sizes=%(role_world_sizes)s\n"
+            "  global_world_sizes=%(global_world_sizes)s\n",
+            {
+                "role": spec.role,
+                "restart_count": restart_count,
+                "master_addr": master_addr,
+                "master_port": master_port,
+                "group_rank": group_rank,
+                "group_world_size": group_world_size,
+                "local_ranks": [worker.local_rank for worker in workers],
+                "role_ranks": [worker.role_rank for worker in workers],
+                "global_ranks": [worker.global_rank for worker in workers],
+                "role_world_sizes": [worker.role_world_size for worker in workers],
+                "global_world_sizes": [worker.world_size for worker in workers]
+            }
+        )
+
+    def _get_ranks(
+        self,
+        role_infos: List[_RoleInstanceInfo],
+        role_idx: int,
+        start_idx: int = 0,
+        end_idx: int = -1,
+    ) -> Tuple[int, List[int]]:
+        if end_idx == -1:
+            end_idx = len(role_infos)
+        prefix_sum = 0
+        total_sum = 0
+        for idx in range(start_idx, end_idx):
+            if role_idx > idx:
+                prefix_sum += role_infos[idx].local_world_size
+            total_sum += role_infos[idx].local_world_size
+        return (
+            total_sum,
+            list(range(prefix_sum, prefix_sum + role_infos[role_idx].local_world_size)),
+        )
+
+    # pyre-fixme[56]: Pyre was not able to infer the type of the decorator
+    #  `torch.distributed.elastic.metrics.prof`.
+    @prof
+    def _assign_worker_ranks(
+        self, store, group_rank: int, group_world_size: int, spec: WorkerSpec
+    ) -> List[Worker]:
+        """Determine proper ranks for worker processes.
+
+        The rank assignment is done according to the following algorithm:
+
+        1. Each agent writes its configuration (group_rank, group_world_size,
+           num_workers) to the common store.
+        2. Each agent retrieves the configuration for all agents
+           and performs a two-level sort using role and rank.
+        3. Determine the global rank: the global rank of the workers for the current
+           agent is the offset into the infos array up to the group_rank of the agent.
+           The offset is computed as the sum of the local_world_size of all agents that
+           have a rank less than the group_rank. The workers would have the ranks:
+           [offset, offset+local_world_size)
+        4. Determine the role rank: the role rank is determined using the algorithm
+           in point 3, with the exception that the offset starts from the first
+           agent that has the same role as the current one and has the minimum group rank.
+        """
+        role_infos = self._share_and_gather(store, group_rank, group_world_size, spec)
+        my_role_info = role_infos[group_rank]
+        worker_world_size, worker_global_ranks = self._get_ranks(role_infos, group_rank)
+        role_infos = sorted(
+            role_infos, key=functools.cmp_to_key(_RoleInstanceInfo.compare)
+        )
+        role_start_idx, role_end_idx = _RoleInstanceInfo.find_role_boundaries(
+            role_infos, my_role_info.role
+        )
+        role_pos = next(
+            idx
+            for idx, role_info in enumerate(role_infos)
+            if _RoleInstanceInfo.compare(role_info, my_role_info) == 0
+        )
+        role_world_size, role_ranks = self._get_ranks(
+            role_infos, role_pos, role_start_idx, role_end_idx + 1
+        )
+        workers = []
+        for ind in range(spec.local_world_size):
+            worker = Worker(
+                local_rank=ind,
+                global_rank=worker_global_ranks[ind],
+                role_rank=role_ranks[ind],
+                world_size=worker_world_size,
+                role_world_size=role_world_size,
+            )
+            workers.append(worker)
+        return workers
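+    # A worked example of the rank-offset computation in ``_get_ranks`` above
+    # (pure arithmetic, illustrative values): with three agents whose local_world_size
+    # values are [2, 4, 2], the agent at group_rank=1 gets
+    #
+    #     prefix_sum = 2                          # workers of agents with a lower rank
+    #     total_sum = 8                           # global world size
+    #     global_ranks = list(range(2, 2 + 4))    # -> [2, 3, 4, 5]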
+
+    def _share_and_gather(
+        self, store, group_rank: int, group_world_size: int, spec: WorkerSpec
+    ) -> List:
+        agent_role_info = _RoleInstanceInfo(
+            spec.role, group_rank, spec.local_world_size
+        )
+        key_prefix = "torchelastic/role_info"
+        agent_config_enc = agent_role_info.serialize()
+        role_infos_bytes = store_util.synchronize(
+            store, agent_config_enc, group_rank, group_world_size, key_prefix
+        )
+        role_infos = [
+            _RoleInstanceInfo.deserialize(role_info_bytes)
+            for role_info_bytes in role_infos_bytes
+        ]
+        return role_infos
+
+    # pyre-fixme[56]: Pyre was not able to infer the type of the decorator
+    #  `torch.distributed.elastic.metrics.prof`.
+    @prof
+    def _initialize_workers(self, worker_group: WorkerGroup) -> None:
+        r"""Start a fresh set of workers for the worker_group.
+
+        Essentially, a rendezvous followed by a ``start_workers``.
+        The caller should first call ``_stop_workers()`` to stop running workers
+        prior to calling this method.
+
+        Optimistically sets the state of the worker group that
+        just started as ``HEALTHY`` and delegates the actual monitoring
+        of state to the ``_monitor_workers()`` method.
+        """
+        role = worker_group.spec.role
+        log.info("[%s] Rendezvous'ing worker group", role)
+
+        # TODO after stopping workers, wait at least monitor_interval*2 for
+        # workers on different nodes to fail on a collective op before waiting
+        # on the rdzv barrier, this way we ensure that nodes enter rdzv
+        # at around the same time and reduce false positive rdzv timeout errors
+        self._rendezvous(worker_group)
+
+        log.info("[%s] Starting worker group", role)
+        worker_ids = self._start_workers(worker_group)
+        for local_rank, w_id in worker_ids.items():
+            worker = worker_group.workers[local_rank]
+            worker.id = w_id
+
+        worker_group.state = WorkerState.HEALTHY
+
+    # pyre-fixme[56]: Pyre was not able to infer the type of the decorator
+    #  `torch.distributed.elastic.metrics.prof`.
+    @prof
+    def _restart_workers(self, worker_group: WorkerGroup) -> None:
+        """Restart (stops, rendezvous, starts) all local workers in the group."""
+        role = worker_group.spec.role
+        log.info("[%s] Stopping worker group", role)
+        self._stop_workers(worker_group)
+        worker_group.state = WorkerState.STOPPED
+        self._initialize_workers(worker_group)
+
+    # pyre-fixme[56]: Pyre was not able to infer the type of the decorator
+    #  `torch.distributed.elastic.metrics.prof`.
+    @prof
+    def run(self, role: str = DEFAULT_ROLE) -> RunResult:
+        start_time = time.monotonic()
+        shutdown_called: bool = False
+        try:
+            result = self._invoke_run(role)
+            self._total_execution_time = int(time.monotonic() - start_time)
+            self._record_metrics(result)
+            self._record_worker_events(result)
+            return result
+        except RendezvousGracefulExitError as e:
+            log.info("Rendezvous gracefully exited: %s", e)
+        except SignalException as e:
+            log.warning("Received %s death signal, shutting down workers", e.sigval)
+            self._shutdown(e.sigval)
+            shutdown_called = True
+            raise
+        finally:
+            if not shutdown_called:
+                self._shutdown()
+            # record the execution time in case there were any exceptions during run.
+            self._total_execution_time = int(time.monotonic() - start_time)
+
+    def get_event_failed(self) -> Event:
+        return self._construct_event(
+            state="FAILED",
+            source=EventSource.AGENT,
+            raw_error=traceback.format_exc(),
+        )
+
+    def get_event_succeeded(self) -> Event:
+        return self._construct_event(
+            state="SUCCEEDED",
+            source=EventSource.AGENT,
+        )
+
+    def _record_worker_events(self, result: RunResult) -> None:
+        for worker in self._worker_group.workers:
+            failure = result.failures.get(worker.global_rank)
+            state: str = self._get_worker_state(worker, result)
+            raw_error = json.dumps(failure.error_file_data) if failure else None
+            record(self._construct_event(state, EventSource.WORKER, worker, raw_error))
+
+    def _get_worker_state(self, worker: Worker, result: RunResult) -> str:
+        failure = result.failures.get(worker.global_rank)
+        if result.state in {WorkerState.UNHEALTHY, WorkerState.FAILED} and not failure:
+            # The worker got terminated by the torchelastic agent via SIGTERM signal
+            return "TERMINATED"
+        elif failure or worker.global_rank in result.return_values:
+            return result.state.value
+        else:
+            raise ValueError(f"Unknown worker: {worker.global_rank}")
+
+    def _construct_event(
+        self,
+        state: str,
+        source: EventSource,
+        worker: Optional[Worker] = None,
+        raw_error: Optional[str] = None,
+    ) -> Event:
+        wg = self._worker_group
+        spec = wg.spec
+        md = {
+            "group_world_size": wg.group_world_size,
+            "entry_point": spec.get_entrypoint_name(),
+        }
+        if worker:
+            md["local_rank"] = (worker.local_rank,)
+            md["role_rank"] = (worker.role_rank,)
+            md["role_world_size"] = (worker.role_world_size,)
+            global_rank = worker.global_rank
+            worker_id = str(worker.id)
+        else:
+            global_rank = None
+            worker_id = None
+        md_str = json.dumps(md)
+        metadata = {
+            "run_id": spec.rdzv_handler.get_run_id(),
+            "global_rank": global_rank,
+            "group_rank": wg.group_rank,
+            "worker_id": worker_id,
+            "role": spec.role,
+            "hostname": _get_fq_hostname(),
+            "state": state,
+            "total_run_time": self._total_execution_time,
+            "rdzv_backend": spec.rdzv_handler.get_backend(),
+            "raw_error": raw_error,
+            "metadata": md_str,
+            "agent_restarts": spec.max_restarts - self._remaining_restarts,
+        }
+        return Event(
+            f"torchelastic.worker.status.{state}", source=source, metadata=metadata
+        )
+
+    def _record_metrics(self, group_results: RunResult):
+        is_failed = group_results.is_failed()
+        self._record_flakiness_metric(is_failed)
+        spec = self._worker_group.spec
+        restarts_happened = self._remaining_restarts != spec.max_restarts
+        put_metric(f"workers.{spec.role}.run_total", 1)
+        self._record_metric_with_condition(
+            "run_success_with_retries", not is_failed and restarts_happened
+        )
+        self._record_metric_with_condition(
+            "run_success_no_retries", not is_failed and not restarts_happened
+        )
+        self._record_metric_with_condition(
+            "run_failed_with_retries", is_failed and restarts_happened
+        )
+        self._record_metric_with_condition(
+            "run_failed_no_retries", is_failed and not restarts_happened
+        )
+
+    def _record_metric_with_condition(self, metric_name, condition):
+        spec = self._worker_group.spec
+        if condition:
+            put_metric(f"workers.{spec.role}.{metric_name}", 1)
+        else:
+            put_metric(f"workers.{spec.role}.{metric_name}", 0)
+
+    def _record_flakiness_metric(self, is_failed: bool = False):
+        spec = self._worker_group.spec
+        if is_failed:
+            flakiness = 100.0
+        else:
+            flakiness = 100.0 - 100.0 * (self._remaining_restarts + 1) / (
+                spec.max_restarts + 1
+            )
+
+        put_metric(f"workers.{spec.role}.flakiness", int(flakiness))
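+        # Worked example (assumed numbers): with max_restarts=3 and one restart consumed
+        # (remaining_restarts=2), a non-failed run reports flakiness = 100 - 100*3/4 = 25.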
+
+    def _invoke_run(self, role: str = DEFAULT_ROLE) -> RunResult:
+        # NOTE: currently only works for a single role
+
+        spec = self._worker_group.spec
+        role = spec.role
+
+        log.info(
+            "[%s] starting workers for entrypoint: %s", role, spec.get_entrypoint_name()
+        )
+
+        self._initialize_workers(self._worker_group)
+        monitor_interval = spec.monitor_interval
+        rdzv_handler = spec.rdzv_handler
+
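+        # Summary of the monitor loop below:
+        #   SUCCEEDED           -> run the exit barrier and return the result
+        #   UNHEALTHY / FAILED  -> restart while retries remain, otherwise stop and fail
+        #   HEALTHY + waiters   -> restart workers so waiting nodes can join the group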
+        while True:
+            assert self._worker_group.state != WorkerState.INIT
+            time.sleep(monitor_interval)
+            run_result = self._monitor_workers(self._worker_group)
+            state = run_result.state
+            self._worker_group.state = state
+
+            put_metric(f"workers.{role}.remaining_restarts", self._remaining_restarts)
+            put_metric(f"workers.{role}.{state.name.lower()}", 1)
+
+            if state == WorkerState.SUCCEEDED:
+                log.info(
+                    "[%s] worker group successfully finished."
+                    " Waiting %s seconds for other agents to finish.",
+                    role, self._exit_barrier_timeout
+                )
+                self._exit_barrier()
+                return run_result
+            elif state in {WorkerState.UNHEALTHY, WorkerState.FAILED}:
+                if self._remaining_restarts > 0:
+                    log.info(
+                        "[%s] Worker group %s. "
+                        "%s/%s attempts left;"
+                        " will restart worker group",
+                        role, state.name, self._remaining_restarts, spec.max_restarts
+                    )
+                    self._remaining_restarts -= 1
+                    self._restart_workers(self._worker_group)
+                else:
+                    self._stop_workers(self._worker_group)
+                    self._worker_group.state = WorkerState.FAILED
+                    return run_result
+            elif state == WorkerState.HEALTHY:
+                # membership changes do not count as retries
+                num_nodes_waiting = rdzv_handler.num_nodes_waiting()
+                group_rank = self._worker_group.group_rank
+                if num_nodes_waiting > 0:
+                    log.info(
+                        "[%s] Detected %s "
+                        "new nodes from group_rank=%s; "
+                        "will restart worker group",
+                        role, num_nodes_waiting, group_rank
+                    )
+                    self._restart_workers(self._worker_group)
+            else:
+                raise Exception(f"[{role}] Worker group in {state.name} state")
+
+    def _exit_barrier(self):
+        """
+        Define a barrier that keeps the agent process alive until all workers finish.
+
+        Wait for ``exit_barrier_timeout`` seconds for all agents to finish
+        executing their local workers (either successfully or not). This
+        acts as a safety guard against user scripts that terminate at different
+        times.
+        """
+        log.info(
+            "Local worker group finished (%s). "
+            "Waiting %s seconds for other agents to finish",
+            self._worker_group.state, self._exit_barrier_timeout
+        )
+        start = time.time()
+        try:
+            store_util.barrier(
+                self._store,
+                self._worker_group.group_rank,
+                self._worker_group.group_world_size,
+                key_prefix=_TERMINAL_STATE_SYNC_ID,
+                barrier_timeout=self._exit_barrier_timeout,
+            )
+            log.info(
+                "Done waiting for other agents. Elapsed: %s seconds", time.time() - start
+            )
+        except SignalException as e:
+            log.warning("Got termination signal: %s", e.sigval)
+            raise
+        except Exception:
+            log.exception(
+                "Error waiting on exit barrier. Elapsed: %s seconds",
+                time.time() - start
+            )
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py b/MLPY/Lib/site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5f98867024c1df5965a3e54b3097fb63551dc6c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py
@@ -0,0 +1,339 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import json
+import os
+import signal
+import socket
+from string import Template
+import uuid
+from typing import Any, Dict, Optional, Tuple
+
+import torch.distributed.elastic.timer as timer
+from torch.distributed.elastic import events
+
+from torch.distributed.elastic.agent.server.api import (
+    RunResult,
+    SimpleElasticAgent,
+    WorkerGroup,
+    WorkerSpec,
+    WorkerState,
+)
+from torch.distributed.elastic.events.api import EventMetadataValue
+from torch.distributed.elastic.metrics.api import prof
+from torch.distributed.elastic.multiprocessing import PContext, start_processes, LogsSpecs
+from torch.distributed.elastic.utils import macros
+from torch.distributed.elastic.utils.logging import get_logger
+
+log = get_logger(__name__)
+
+__all__ = [
+    "LocalElasticAgent",
+    "TORCHELASTIC_ENABLE_FILE_TIMER",
+    "TORCHELASTIC_TIMER_FILE",
+]
+
+TORCHELASTIC_ENABLE_FILE_TIMER = "TORCHELASTIC_ENABLE_FILE_TIMER"
+TORCHELASTIC_TIMER_FILE = "TORCHELASTIC_TIMER_FILE"
+
+class LocalElasticAgent(SimpleElasticAgent):
+    """An implementation of :py:class:`torchelastic.agent.server.ElasticAgent` that handles host-local workers.
+
+    This agent is deployed per host and is configured to spawn ``n`` workers.
+    When using GPUs, ``n`` maps to the number of GPUs available on the host.
+
+    The local agent does not communicate with other local agents deployed on
+    other hosts, even though the workers may communicate across hosts. The worker id
+    is interpreted as a local process. The agent starts and stops all worker
+    processes as a single unit.
+
+
+    The worker function and the arguments passed to it must be compatible with
+    python multiprocessing. To pass multiprocessing data structures to the
+    workers, create the data structure in the same multiprocessing context as
+    the specified ``start_method`` and pass it as a function argument.
+
+    The ``exit_barrier_timeout`` specifies the amount of time (in seconds) to wait
+    for other agents to finish. This acts as a safety net to handle cases where
+    workers finish at different times, to prevent agents from viewing workers
+    that finished early as a scale-down event. It is strongly advised that user
+    code ensure workers are terminated in a synchronous manner rather than
+    relying on the ``exit_barrier_timeout``.
+
+    A named-pipe based watchdog can be enabled in ``LocalElasticAgent`` if the
+    environment variable ``TORCHELASTIC_ENABLE_FILE_TIMER`` is set to 1 in the
+    ``LocalElasticAgent`` process.
+    Optionally, another environment variable ``TORCHELASTIC_TIMER_FILE``
+    can be set to a unique file name for the named pipe. If
+    ``TORCHELASTIC_TIMER_FILE`` is not set, ``LocalElasticAgent``
+    will internally create a unique file name, assign it to
+    ``TORCHELASTIC_TIMER_FILE``, and propagate this environment variable to
+    the worker processes so they can connect to the same named pipe that
+    ``LocalElasticAgent`` uses.
+
+    Logs are written to the specified log directory. Each log line will be by default
+    prefixed by ``[${role_name}${local_rank}]:`` (e.g. ``[trainer0]: foobar``).
+    Log prefixes can be customized by passing a `template string
+    <https://docs.python.org/3/library/string.html#template-strings>`_ as the
+    ``log_line_prefix_template`` argument.
+    The following macros (identifiers) are substituted at runtime:
+    ``${role_name}``, ``${local_rank}``, ``${rank}``. For example, to prefix each log line with
+    the global rank instead of the local rank, set ``log_line_prefix_template = "[${rank}]:"``.
+
+
+    Example launching function
+
+    ::
+
+        def trainer(args) -> str:
+            return "do train"
+
+        def main():
+            start_method="spawn"
+            shared_queue= multiprocessing.get_context(start_method).Queue()
+            spec = WorkerSpec(
+                        role="trainer",
+                        local_world_size=nproc_per_process,
+                        entrypoint=trainer,
+                        args=("foobar",),
+                        ...)
+            agent = LocalElasticAgent(spec, logs_specs=DefaultLogsSpecs(), start_method=start_method)
+            results = agent.run()
+
+            if results.is_failed():
+                print("trainer failed")
+            else:
+                print(f"rank 0 return value: {results.return_values[0]}")
+                # prints -> rank 0 return value: do train
+
+    Example launching binary
+
+    ::
+
+        def main():
+            spec = WorkerSpec(
+                        role="trainer",
+                        local_world_size=nproc_per_process,
+                        entrypoint="/usr/local/bin/trainer",
+                        args=("--trainer-args", "foobar"),
+                        ...)
+            agent = LocalElasticAgent(spec, logs_specs=DefaultLogsSpecs())
+            results = agent.run()
+
+            if not results.is_failed():
+                print("binary launches do not have return values")
+
+    """
+
+    def __init__(
+        self,
+        spec: WorkerSpec,
+        logs_specs: LogsSpecs,
+        start_method="spawn",
+        exit_barrier_timeout: float = 300,
+        log_line_prefix_template: Optional[str] = None,
+    ):
+        super().__init__(spec, exit_barrier_timeout)
+        self._start_method = start_method
+        self._pcontext: Optional[PContext] = None
+        self._rdzv_handler = spec.rdzv_handler
+        self._log_line_prefix_template = log_line_prefix_template
+        self._worker_watchdog: Optional[timer.FileTimerServer] = None
+        self._logs_specs = logs_specs
+
+
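+    # NOTE (illustrative): the watchdog below is opt-in via environment variables set on
+    # the agent process, e.g. (hypothetical values):
+    #
+    #   os.environ[TORCHELASTIC_ENABLE_FILE_TIMER] = "1"
+    #   os.environ[TORCHELASTIC_TIMER_FILE] = "/tmp/my_watchdog_pipe"  # optional
+    #
+    # When enabled, a FileTimerServer is started and the pipe path is propagated to every
+    # worker's environment so workers can register timers against it.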
+    def _setup_local_watchdog(self, envs: Dict[int, Dict[str, str]]) -> None:
+        enable_watchdog_env_name = TORCHELASTIC_ENABLE_FILE_TIMER
+        watchdog_enabled = os.getenv(enable_watchdog_env_name)
+        watchdog_file_env_name = TORCHELASTIC_TIMER_FILE
+        watchdog_file_path = os.getenv(watchdog_file_env_name)
+        if watchdog_enabled is not None and str(watchdog_enabled) == "1":
+            if watchdog_file_path is None:
+                watchdog_file_path = "/tmp/watchdog_timer_" + str(uuid.uuid4())
+            log.info("Starting a FileTimerServer with %s ...", watchdog_file_path)
+            self._worker_watchdog = timer.FileTimerServer(
+                file_path=watchdog_file_path,
+                max_interval=0.1,
+                daemon=True,
+                log_event=self._log_watchdog_event)
+            self._worker_watchdog.start()
+            log.info("FileTimerServer started")
+        else:
+            log.info("Environment variable '%s' not found. Do not start FileTimerServer.", enable_watchdog_env_name)
+        # Propagate the watchdog file env to worker processes
+        if watchdog_file_path is not None:
+            for worker_env in envs.values():
+                worker_env[watchdog_file_env_name] = watchdog_file_path
+
+
+    def _get_fq_hostname(self) -> str:
+        return socket.getfqdn(socket.gethostname())
+
+    def _log_watchdog_event(
+        self,
+        name: str,
+        request: Optional[timer.FileTimerRequest],
+    ) -> None:
+        wg = self._worker_group
+        spec = wg.spec
+        md = {
+            "watchdog_event": name
+        }
+        if request is not None:
+            md["worker_pid"] = str(request.worker_pid)
+            md["scope_id"] = request.scope_id
+            md["expiration_time"] = str(request.expiration_time)
+            md["signal"] = str(request.signal)
+        md_str = json.dumps(md)
+        state = "RUNNING"
+        metadata: Dict[str, EventMetadataValue] = {
+            "run_id": spec.rdzv_handler.get_run_id(),
+            "global_rank": None,
+            "group_rank": wg.group_rank,
+            "worker_id": None,
+            "role": spec.role,
+            "hostname": self._get_fq_hostname(),
+            "state": state,
+            "total_run_time": self._total_execution_time,
+            "rdzv_backend": spec.rdzv_handler.get_backend(),
+            "raw_error": None,
+            "metadata": md_str,
+            "agent_restarts": spec.max_restarts - self._remaining_restarts,
+        }
+        # Note: The 'metadata' field of the Event is converted to a TorchelasticStatusLogEntry later.
+        #       The 'name' field of the Event is NOT used in the TorchelasticStatusLogEntry.
+        event = events.Event(
+            name=name, source=events.EventSource.AGENT, metadata=metadata
+        )
+        events.record(event)
+
+    # pyre-fixme[56]: Pyre was not able to infer the type of the decorator
+    #  `torch.distributed.elastic.metrics.prof`.
+    @prof
+    def _stop_workers(self, worker_group: WorkerGroup) -> None:
+        self._shutdown()
+
+    # pyre-fixme[56]: Pyre was not able to infer the type of the decorator
+    #  `torch.distributed.elastic.metrics.prof`.
+    @prof
+    def _start_workers(self, worker_group: WorkerGroup) -> Dict[int, Any]:
+        spec = worker_group.spec
+        store = worker_group.store
+        assert store is not None
+        master_addr, master_port = super()._get_master_addr_port(store)
+        restart_count = spec.max_restarts - self._remaining_restarts
+
+        use_agent_store = spec.rdzv_handler.get_backend() == "static"
+
+        args: Dict[int, Tuple] = {}
+        envs: Dict[int, Dict[str, str]] = {}
+        log_line_prefixes: Optional[Dict[int, str]] = {} if self._log_line_prefix_template else None
+        for worker in worker_group.workers:
+            local_rank = worker.local_rank
+            worker_env = {
+                "LOCAL_RANK": str(local_rank),
+                "RANK": str(worker.global_rank),
+                "GROUP_RANK": str(worker_group.group_rank),
+                "ROLE_RANK": str(worker.role_rank),
+                "ROLE_NAME": spec.role,
+                "LOCAL_WORLD_SIZE": str(spec.local_world_size),
+                "WORLD_SIZE": str(worker.world_size),
+                "GROUP_WORLD_SIZE": str(worker_group.group_world_size),
+                "ROLE_WORLD_SIZE": str(worker.role_world_size),
+                "MASTER_ADDR": master_addr,
+                "MASTER_PORT": str(master_port),
+                "TORCHELASTIC_RESTART_COUNT": str(restart_count),
+                "TORCHELASTIC_MAX_RESTARTS": str(spec.max_restarts),
+                "TORCHELASTIC_RUN_ID": spec.rdzv_handler.get_run_id(),
+                "TORCHELASTIC_USE_AGENT_STORE": str(use_agent_store),
+                "TORCH_NCCL_ASYNC_ERROR_HANDLING": os.getenv(
+                    "TORCH_NCCL_ASYNC_ERROR_HANDLING", str(1)
+                ),
+            }
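+            # Illustrative values (assumed, not from the source): for the second worker on
+            # the first node of a 2-node x 4-proc job this yields LOCAL_RANK="1", RANK="1",
+            # GROUP_RANK="0", ROLE_RANK="1", LOCAL_WORLD_SIZE="4", WORLD_SIZE="8",
+            # GROUP_WORLD_SIZE="2", and ROLE_WORLD_SIZE="8".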
+            if "OMP_NUM_THREADS" in os.environ:
+                worker_env["OMP_NUM_THREADS"] = os.environ["OMP_NUM_THREADS"]
+
+
+            if self._log_line_prefix_template:
+                log_line_prefix = Template(self._log_line_prefix_template).safe_substitute(
+                    role_name=spec.role,
+                    rank=worker.global_rank,
+                    local_rank=local_rank,)
+                log_line_prefixes[local_rank] = log_line_prefix
+
+            envs[local_rank] = worker_env
+            worker_args = list(spec.args)
+            worker_args = macros.substitute(worker_args, str(local_rank))
+            args[local_rank] = tuple(worker_args)
+
+        self._setup_local_watchdog(envs=envs)
+
+        assert spec.entrypoint is not None
+        assert self._logs_specs is not None
+        self._pcontext = start_processes(
+            name=spec.role,
+            entrypoint=spec.entrypoint,
+            args=args,
+            envs=envs,
+            logs_specs=self._logs_specs,
+            log_line_prefixes=log_line_prefixes,
+            start_method=self._start_method,
+        )
+
+        return self._pcontext.pids()
+
+    def _shutdown(self, death_sig: signal.Signals = signal.SIGTERM) -> None:
+        if self._worker_watchdog is not None:
+            self._worker_watchdog.stop()
+            self._worker_watchdog = None
+        if self._pcontext:
+            self._pcontext.close(death_sig)
+        if self._rdzv_handler:
+            self._rdzv_handler.shutdown()
+
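+    # Summary of _monitor_workers below: it returns UNKNOWN if the tracked pids no longer
+    # match the process context, FAILED or SUCCEEDED once the process context reports
+    # completion, and HEALTHY while the workers are still running.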
+    # pyre-fixme[56]: Pyre was not able to infer the type of the decorator
+    #  `torch.distributed.elastic.metrics.prof`.
+    @prof
+    def _monitor_workers(self, worker_group: WorkerGroup) -> RunResult:
+        role = worker_group.spec.role
+        worker_pids = {w.id for w in worker_group.workers}
+        assert self._pcontext is not None
+        pc_pids = set(self._pcontext.pids().values())
+        if worker_pids != pc_pids:
+            log.error(
+                "[%s] worker pids do not match process_context pids."
+                " Expected: %s, actual: %s",
+                role, worker_pids, pc_pids
+            )
+            return RunResult(state=WorkerState.UNKNOWN)
+
+        result = self._pcontext.wait(0)
+        if result:
+            if result.is_failed():
+                # map local rank failure to global rank
+                worker_failures = {}
+                for local_rank, failure in result.failures.items():
+                    worker = worker_group.workers[local_rank]
+                    worker_failures[worker.global_rank] = failure
+                return RunResult(
+                    state=WorkerState.FAILED,
+                    failures=worker_failures,
+                )
+            else:
+                # copy ret_val_queue into a map with a global ranks
+                workers_ret_vals = {}
+                for local_rank, ret_val in result.return_values.items():
+                    worker = worker_group.workers[local_rank]
+                    workers_ret_vals[worker.global_rank] = ret_val
+                return RunResult(
+                    state=WorkerState.SUCCEEDED,
+                    return_values=workers_ret_vals,
+                )
+        else:
+            return RunResult(state=WorkerState.HEALTHY)
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/events/__init__.py b/MLPY/Lib/site-packages/torch/distributed/elastic/events/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b5ff025f8ae3a2a10d3e28ec1b6ef9d5ae4573f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/events/__init__.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Module contains events processing mechanisms that are integrated with the standard python logging.
+
+Example of usage:
+
+::
+
+  from torch.distributed.elastic import events
+  event = events.Event(name="test_event", source=events.EventSource.WORKER, metadata={...})
+  events.record(event, destination="console")
+
+"""
+
+import inspect
+import logging
+import os
+import socket
+import traceback
+from enum import Enum
+from typing import Dict, Optional
+
+from torch.distributed.elastic.events.handlers import get_logging_handler
+
+from .api import (  # noqa: F401
+    Event,
+    EventMetadataValue,
+    EventSource,
+    NodeState,
+    RdzvEvent,
+)
+
+_events_loggers: Dict[str, logging.Logger] = {}
+
+def _get_or_create_logger(destination: str = "null") -> logging.Logger:
+    """
+    Construct a python logger based on the destination type, or return the existing one.
+
+    Available destinations can be found in the ``handlers.py`` file.
+    The constructed logger does not propagate messages to upper-level loggers
+    (e.g. the root logger). This ensures that each event is processed only once.
+
+    Args:
+        destination: The string representation of the event handler.
+            Available handlers found in ``handlers`` module
+    """
+    global _events_loggers
+
+    if destination not in _events_loggers:
+        _events_logger = logging.getLogger(f"torchelastic-events-{destination}")
+        _events_logger.setLevel(os.environ.get("LOGLEVEL", "INFO"))
+        # Do not propagate message to the root logger
+        _events_logger.propagate = False
+
+        logging_handler = get_logging_handler(destination)
+        _events_logger.addHandler(logging_handler)
+
+        # Add the logger to the global dictionary
+        _events_loggers[destination] = _events_logger
+
+    return _events_loggers[destination]
+
+
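+# Illustrative usage (assumption: the "console" destination prints serialized events):
+#
+#   from torch.distributed.elastic.events import Event, EventSource, record
+#   record(Event("my_event", EventSource.WORKER, metadata={"k": "v"}), destination="console")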
+def record(event: Event, destination: str = "null") -> None:
+    _get_or_create_logger(destination).info(event.serialize())
+
+def record_rdzv_event(event: RdzvEvent) -> None:
+    _get_or_create_logger("dynamic_rendezvous").info(event.serialize())
+
+
+def construct_and_record_rdzv_event(
+    run_id: str,
+    message: str,
+    node_state: NodeState,
+    name: str = "",
+    hostname: str = "",
+    pid: Optional[int] = None,
+    master_endpoint: str = "",
+    local_id: Optional[int] = None,
+    rank: Optional[int] = None,
+) -> None:
+    # We don't want to perform an extra computation if not needed.
+    if isinstance(get_logging_handler("dynamic_rendezvous"), logging.NullHandler):
+        return
+
+    # Set up parameters.
+    if not hostname:
+        hostname = socket.getfqdn()
+    if not pid:
+        pid = os.getpid()
+
+    # Determines which file called this function.
+    callstack = inspect.stack()
+    filename = "no_file"
+    if len(callstack) > 1:
+        stack_depth_1 = callstack[1]
+        filename = os.path.basename(stack_depth_1.filename)
+        if not name:
+            name = stack_depth_1.function
+
+    # Delete the callstack variable. If kept, this can mess with python's
+    # garbage collector as we are holding on to stack frame information in
+    # the inspect module.
+    del callstack
+
+    # Set up error trace if this is an exception
+    if node_state == NodeState.FAILED:
+        error_trace = traceback.format_exc()
+    else:
+        error_trace = ""
+
+    # Initialize event object
+    event = RdzvEvent(
+        name=f"{filename}:{name}",
+        run_id=run_id,
+        message=message,
+        hostname=hostname,
+        pid=pid,
+        node_state=node_state,
+        master_endpoint=master_endpoint,
+        rank=rank,
+        local_id=local_id,
+        error_trace=error_trace,
+    )
+
+    # Finally, record the event.
+    record_rdzv_event(event)
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/events/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/events/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a66c7eee23ed21bb25c273f0c5215005e5f91ac0
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/events/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/events/__pycache__/api.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/events/__pycache__/api.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d59f704caf9540c1ac4abecb4fb03e88d9623ccd
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/events/__pycache__/api.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/events/__pycache__/handlers.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/events/__pycache__/handlers.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bf4b06bcf5863d79d808a9d82f6b44ca2494bc95
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/events/__pycache__/handlers.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/events/api.py b/MLPY/Lib/site-packages/torch/distributed/elastic/events/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..37d1d8947e0ecfcae86c512cad94ede4214970ba
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/events/api.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import json
+from dataclasses import asdict, dataclass, field
+from enum import Enum
+from typing import Dict, Union, Optional
+
+__all__ = ['EventSource', 'Event', 'NodeState', 'RdzvEvent']
+
+EventMetadataValue = Union[str, int, float, bool, None]
+
+
+class EventSource(str, Enum):
+    """Known identifiers of the event producers."""
+
+    AGENT = "AGENT"
+    WORKER = "WORKER"
+
+
+@dataclass
+class Event:
+    """
+    Represents a generic event that occurs during torchelastic job execution.
+
+    The event can be any kind of meaningful action.
+
+    Args:
+        name: event name.
+        source: the event producer, e.g. agent or worker
+        timestamp: timestamp in milliseconds when event occurred.
+        metadata: additional data that is associated with the event.
+    """
+
+    name: str
+    source: EventSource
+    timestamp: int = 0
+    metadata: Dict[str, EventMetadataValue] = field(default_factory=dict)
+
+    def __str__(self):
+        return self.serialize()
+
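+    # Illustrative round trip (assumed values): Event.deserialize(e.serialize()) reconstructs
+    # an equal Event for e = Event("test", EventSource.AGENT, metadata={"k": 1}).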
+    @staticmethod
+    def deserialize(data: Union[str, "Event"]) -> "Event":
+        if isinstance(data, Event):
+            return data
+        if isinstance(data, str):
+            data_dict = json.loads(data)
+        data_dict["source"] = EventSource[data_dict["source"]]  # type: ignore[possibly-undefined]
+        return Event(**data_dict)
+
+    def serialize(self) -> str:
+        return json.dumps(asdict(self))
+
+
+class NodeState(str, Enum):
+    """The states that a node can be in rendezvous."""
+
+    INIT = "INIT"
+    RUNNING = "RUNNING"
+    SUCCEEDED = "SUCCEEDED"
+    FAILED = "FAILED"
+
+
+@dataclass
+class RdzvEvent:
+    """
+    Dataclass to represent any rendezvous event.
+
+    Args:
+        name: Event name. (E.g. Current action being performed)
+        run_id: The run id of the rendezvous
+        message: The message describing the event
+        hostname: Hostname of the node
+        pid: The process id of the node
+        node_state: The state of the node (INIT, RUNNING, SUCCEEDED, FAILED)
+        master_endpoint: The master endpoint for the rendezvous store, if known
+        rank: The rank of the node, if known
+        local_id: The local_id of the node, if defined in dynamic_rendezvous.py
+        error_trace: Error stack trace, if this is an error event.
+    """
+
+    name: str
+    run_id: str
+    message: str
+    hostname: str
+    pid: int
+    node_state: NodeState
+    master_endpoint: str = ""
+    rank: Optional[int] = None
+    local_id: Optional[int] = None
+    error_trace: str = ""
+
+    def __str__(self):
+        return self.serialize()
+
+    @staticmethod
+    def deserialize(data: Union[str, "RdzvEvent"]) -> "RdzvEvent":
+        if isinstance(data, RdzvEvent):
+            return data
+        if isinstance(data, str):
+            data_dict = json.loads(data)
+        data_dict["node_state"] = NodeState[data_dict["node_state"]]  # type: ignore[possibly-undefined]
+        return RdzvEvent(**data_dict)
+
+    def serialize(self) -> str:
+        return json.dumps(asdict(self))
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/events/handlers.py b/MLPY/Lib/site-packages/torch/distributed/elastic/events/handlers.py
new file mode 100644
index 0000000000000000000000000000000000000000..51dd142801ba1f3d597d41da7df9121aef006fe7
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/events/handlers.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from typing import Dict
+
+
+_log_handlers: Dict[str, logging.Handler] = {
+    "console": logging.StreamHandler(),
+    "dynamic_rendezvous": logging.NullHandler(),
+    "null": logging.NullHandler(),
+}
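+# NOTE (illustrative assumption, not an upstream API): additional destinations could be
+# registered by adding entries to ``_log_handlers`` before get_logging_handler is called,
+# e.g. _log_handlers["file"] = logging.FileHandler("/tmp/torchelastic_events.log").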
+
+
+def get_logging_handler(destination: str = "null") -> logging.Handler:
+    global _log_handlers
+    return _log_handlers[destination]
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/metrics/__init__.py b/MLPY/Lib/site-packages/torch/distributed/elastic/metrics/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0dc47774f8bc60e40663b5cb6e5f703afc1ad2b8
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/metrics/__init__.py
@@ -0,0 +1,163 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Metrics API.
+
+**Overview**:
+
+The metrics API in torchelastic is used to publish telemetry metrics.
+It is designed to be used by torchelastic's internal modules to
+publish metrics for the end user with the goal of increasing visibility
+and helping with debugging. However, you may use the same API in your
+jobs to publish metrics to the same metrics ``sink``.
+
+A ``metric`` can be thought of as timeseries data
+and is uniquely identified by the string-valued tuple
+``(metric_group, metric_name)``.
+
+torchelastic makes no assumptions about what a ``metric_group`` is
+and what relationship it has with ``metric_name``. It is totally up
+to the user to use these two fields to uniquely identify a metric.
+
+.. note:: The metric group ``torchelastic`` is reserved by torchelastic for
+          platform level metrics that it produces.
+          For instance torchelastic may output the latency (in milliseconds)
+          of a re-rendezvous operation from the agent as
+          ``(torchelastic, agent.rendezvous.duration.ms)``
+
+A sensible way to use metric groups is to map them to a stage or module
+in your job. You may also encode certain high-level properties of
+the job, such as the region or stage (dev vs prod).
+
+**Publish Metrics**:
+
+Using torchelastic's metrics API is similar to using python's logging
+framework. You first have to configure a metrics handler before
+trying to add metric data.
+
+The example below measures the latency for the ``calculate()`` function.
+
+::
+
+  import time
+  import torch.distributed.elastic.metrics as metrics
+
+  # makes all metrics other than the ones from "my_module" go to /dev/null
+  metrics.configure(metrics.NullMetricHandler())
+  metrics.configure(metrics.ConsoleMetricHandler(), "my_module")
+
+  def my_method():
+    start = time.time()
+    calculate()
+    end = time.time()
+    metrics.put_metric("calculate_latency", int(end-start), "my_module")
+
+You may also use the ``torch.distributed.elastic.metrics.prof`` decorator
+to conveniently and succinctly profile functions.
+
+::
+
+  # -- in module examples.foobar --
+
+  import torch.distributed.elastic.metrics as metrics
+
+  metrics.configure(metrics.ConsoleMetricHandler(), "foobar")
+  metrics.configure(metrics.ConsoleMetricHandler(), "Bar")
+
+  @metrics.prof
+  def foo():
+    pass
+
+  class Bar():
+
+    @metrics.prof
+    def baz(self):
+        pass
+
+``@metrics.prof`` will publish the following metrics
+::
+
+  .success - 1 if the function finished successfully
+  .failure - 1 if the function threw an exception
+  .duration.ms - function duration in milliseconds
+
+**Configuring Metrics Handler**:
+
+``torch.distributed.elastic.metrics.MetricHandler`` is responsible for emitting
+the added metric values to a particular destination. Metric groups can be
+configured with different metric handlers.
+
+By default torchelastic emits all metrics to ``/dev/null``.
+With the following configuration, metrics in the
+``torchelastic`` and ``my_app`` metric groups will be printed out to
+the console.
+
+::
+
+  import torch.distributed.elastic.metrics as metrics
+
+  metrics.configure(metrics.ConsoleMetricHandler(), group = "torchelastic")
+  metrics.configure(metrics.ConsoleMetricHandler(), group = "my_app")
+
+**Writing a Custom Metric Handler**:
+
+If you want your metrics to be emitted to a custom location, implement
+the ``torch.distributed.elastic.metrics.MetricHandler`` interface
+and configure your job to use your custom metric handler.
+
+Below is a toy example that prints the metrics to ``stdout``
+
+::
+
+  import torch.distributed.elastic.metrics as metrics
+
+  class StdoutMetricHandler(metrics.MetricHandler):
+     def emit(self, metric_data):
+         ts = metric_data.timestamp
+         group = metric_data.group_name
+         name = metric_data.name
+         value = metric_data.value
+         print(f"[{ts}][{group}]: {name}={value}")
+
+  metrics.configure(StdoutMetricHandler(), group="my_app")
+
+Now all metrics in the group ``my_app`` will be printed to stdout as:
+
+::
+
+  [1574213883.4182858][my_app]: my_metric=<value>
+  [1574213940.5237644][my_app]: my_metric=<value>
+
+"""
+
+from typing import Optional
+
+from .api import (  # noqa: F401
+    ConsoleMetricHandler,
+    MetricData,
+    MetricHandler,
+    MetricsConfig,
+    NullMetricHandler,
+    configure,
+    get_elapsed_time_ms,
+    getStream,
+    prof,
+    profile,
+    publish_metric,
+    put_metric,
+)
+
+
+def initialize_metrics(cfg: Optional[MetricsConfig] = None):
+    pass
+
+
+try:
+    from torch.distributed.elastic.metrics.static_init import *  # type: ignore[import] # noqa: F401 F403
+except ModuleNotFoundError:
+    pass
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/metrics/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/metrics/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..44096ca0ea2e27b0c28ae614341d8ac24304db0d
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/metrics/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/metrics/__pycache__/api.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/metrics/__pycache__/api.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c23139ef26a2d91cfd79842f96fcdcad9ec00422
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/metrics/__pycache__/api.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/metrics/api.py b/MLPY/Lib/site-packages/torch/distributed/elastic/metrics/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..abe2ef22bd53edea9da695dde06259510d569ff8
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/metrics/api.py
@@ -0,0 +1,201 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import abc
+import time
+import warnings
+from collections import namedtuple
+from functools import wraps
+from typing import Dict, Optional
+
+__all__ = ['MetricsConfig', 'MetricHandler', 'ConsoleMetricHandler', 'NullMetricHandler', 'MetricStream',
+           'configure', 'getStream', 'prof', 'profile', 'put_metric', 'publish_metric', 'get_elapsed_time_ms',
+           'MetricData']
+
+MetricData = namedtuple("MetricData", ["timestamp", "group_name", "name", "value"])
+
+
+class MetricsConfig:
+    __slots__ = ["params"]
+
+    def __init__(self, params: Optional[Dict[str, str]] = None):
+        self.params = params
+        if self.params is None:
+            self.params = {}
+
+
+class MetricHandler(abc.ABC):
+    @abc.abstractmethod
+    def emit(self, metric_data: MetricData):
+        pass
+
+
+class ConsoleMetricHandler(MetricHandler):
+    def emit(self, metric_data: MetricData):
+        print(
+            f"[{metric_data.timestamp}][{metric_data.group_name}]: {metric_data.name}={metric_data.value}"
+        )
+
+
+class NullMetricHandler(MetricHandler):
+    def emit(self, metric_data: MetricData):
+        pass
+
+
+class MetricStream:
+    def __init__(self, group_name: str, handler: MetricHandler):
+        self.group_name = group_name
+        self.handler = handler
+
+    def add_value(self, metric_name: str, metric_value: int):
+        self.handler.emit(
+            MetricData(time.time(), self.group_name, metric_name, metric_value)
+        )
+
+
+_metrics_map: Dict[str, MetricHandler] = {}
+_default_metrics_handler: MetricHandler = NullMetricHandler()
+
+
+# pyre-fixme[9]: group has type `str`; used as `None`.
+def configure(handler: MetricHandler, group: Optional[str] = None):
+    if group is None:
+        global _default_metrics_handler
+        # pyre-fixme[9]: _default_metrics_handler has type `NullMetricHandler`; used
+        #  as `MetricHandler`.
+        _default_metrics_handler = handler
+    else:
+        _metrics_map[group] = handler
+
+
+def getStream(group: str):
+    if group in _metrics_map:
+        handler = _metrics_map[group]
+    else:
+        handler = _default_metrics_handler
+    return MetricStream(group, handler)
+
+
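+# Illustrative behavior (assumed names): a method ``Bar.baz`` maps to "Bar.baz", while a
+# module-level function ``foo`` defined in module ``examples.foobar`` maps to "foobar.foo".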
+def _get_metric_name(fn):
+    qualname = fn.__qualname__
+    split = qualname.split(".")
+    if len(split) == 1:
+        module = fn.__module__
+        if module:
+            return module.split(".")[-1] + "." + split[0]
+        else:
+            return split[0]
+    else:
+        return qualname
+
+
+def prof(fn=None, group: str = "torchelastic"):
+    r"""
+    The ``@prof`` decorator publishes ``duration.ms``, ``success``, and ``failure`` metrics for the function that it decorates.
+
+    The metric name defaults to the qualified name (``class_name.def_name``) of the function.
+    If the function does not belong to a class, it uses the leaf module name instead.
+
+    Usage
+
+    ::
+
+     @metrics.prof
+     def x():
+         pass
+
+     @metrics.prof(group="agent")
+     def y():
+         pass
+    """
+
+    def wrap(f):
+        @wraps(f)
+        def wrapper(*args, **kwargs):
+            key = _get_metric_name(f)
+            try:
+                start = time.time()
+                result = f(*args, **kwargs)
+                put_metric(f"{key}.success", 1, group)
+            except Exception:
+                put_metric(f"{key}.failure", 1, group)
+                raise
+            finally:
+                put_metric(f"{key}.duration.ms", get_elapsed_time_ms(start), group)  # type: ignore[possibly-undefined]
+            return result
+
+        return wrapper
+
+    if fn:
+        return wrap(fn)
+    else:
+        return wrap
+
+
+def profile(group=None):
+    """
+    @profile decorator adds latency and success/failure metrics to any given function.
+
+    Usage
+
+    ::
+
+     @metrics.profile("my_metric_group")
+     def some_function():
+    """
+    warnings.warn("Deprecated, use @prof instead", DeprecationWarning)
+
+    def wrap(func):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            try:
+                start_time = time.time()
+                result = func(*args, **kwargs)
+                publish_metric(group, f"{func.__name__}.success", 1)
+            except Exception:
+                publish_metric(group, f"{func.__name__}.failure", 1)
+                raise
+            finally:
+                publish_metric(
+                    group,
+                    f"{func.__name__}.duration.ms",
+                    get_elapsed_time_ms(start_time),  # type: ignore[possibly-undefined]
+                )
+            return result
+
+        return wrapper
+
+    return wrap
+
+
+def put_metric(metric_name: str, metric_value: int, metric_group: str = "torchelastic"):
+    """
+    Publish a metric data point.
+
+    Usage
+
+    ::
+
+     put_metric("metric_name", 1)
+     put_metric("metric_name", 1, "metric_group_name")
+    """
+    getStream(metric_group).add_value(metric_name, metric_value)
+
+
+def publish_metric(metric_group: str, metric_name: str, metric_value: int):
+    warnings.warn(
+        "Deprecated, use put_metric(metric_name, metric_value, metric_group) instead"
+    )
+    metric_stream = getStream(metric_group)
+    metric_stream.add_value(metric_name, metric_value)
+
+
+def get_elapsed_time_ms(start_time_in_seconds: float):
+    """Return the elapsed time in millis from the given start time."""
+    end_time = time.time()
+    return int((end_time - start_time_in_seconds) * 1000)
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/__init__.py b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c7beb6b0ad259bbe3240495a4285fa1a7c6dd19
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/__init__.py
@@ -0,0 +1,235 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Library that launches and manages ``n`` copies of worker subprocesses, specified either by a function or a binary.
+
+For functions, it uses ``torch.multiprocessing`` (and therefore python
+``multiprocessing``) to spawn/fork worker processes. For binaries it uses python
+``subprocess.Popen`` to create worker processes.
+
+
+Usage 1: Launching two trainers as a function
+
+::
+
+ from torch.distributed.elastic.multiprocessing import DefaultLogsSpecs, Std, start_processes
+
+ def trainer(a, b, c):
+     pass # train
+
+
+ # runs two trainers
+ # LOCAL_RANK=0 trainer(1,2,3)
+ # LOCAL_RANK=1 trainer(4,5,6)
+ ctx = start_processes(
+         name="trainer",
+         entrypoint=trainer,
+         args={0: (1,2,3), 1: (4,5,6)},
+         envs={0: {"LOCAL_RANK": 0}, 1: {"LOCAL_RANK": 1}},
+         logs_specs=DefaultLogsSpecs(
+             log_dir="/tmp/foobar",
+             redirects=Std.ALL, # write all worker stdout/stderr to a log file
+             tee={0: Std.ERR}, # tee only local rank 0's stderr to console
+         ),
+       )
+
+ # waits for all copies of trainer to finish
+ ctx.wait()
+
+Usage 2: Launching 2 echo workers as a binary
+
+::
+
+ # same as invoking
+ # echo hello
+ # echo world > stdout.log
+ ctx = start_processes(
+         name="echo",
+         entrypoint="echo",
+         args={0: ("hello",), 1: ("world",)},
+         envs={0: {}, 1: {}},
+         logs_specs=DefaultLogsSpecs(
+             log_dir="/tmp/foobar",
+             redirects={1: Std.OUT},
+         ),
+        )
+
+Just like ``torch.multiprocessing``, the return value of the function
+:func:`start_processes` is a process context (:class:`api.PContext`). If a function
+was launched, a :class:`api.MultiprocessContext` is returned and if a binary
+was launched a :class:`api.SubprocessContext` is returned. Both are specific
+implementations of the parent :class:`api.PContext` class.
+"""
+
+import os
+from typing import Callable, Dict, Optional, Tuple, Union, Set
+
+from torch.distributed.elastic.multiprocessing.api import (  # noqa: F401
+    _validate_full_rank,
+    DefaultLogsSpecs,
+    LogsDest,
+    LogsSpecs,
+    MultiprocessContext,
+    PContext,
+    ProcessFailure,
+    RunProcsResult,
+    SignalException,
+    Std,
+    SubprocessContext,
+    to_map,
+)
+from torch.distributed.elastic.utils.logging import get_logger
+
+__all__ = [
+    "start_processes",
+    "MultiprocessContext",
+    "PContext",
+    "ProcessFailure",
+    "RunProcsResult",
+    "SignalException",
+    "Std",
+    "LogsDest",
+    "LogsSpecs",
+    "DefaultLogsSpecs",
+    "SubprocessContext",
+    "to_map",
+]
+
+log = get_logger(__name__)
+
+
+def start_processes(
+    name: str,
+    entrypoint: Union[Callable, str],
+    args: Dict[int, Tuple],
+    envs: Dict[int, Dict[str, str]],
+    logs_specs: LogsSpecs,
+    log_line_prefixes: Optional[Dict[int, str]] = None,
+    start_method: str = "spawn",
+) -> PContext:
+    """
+    Start ``n`` copies of ``entrypoint`` processes with the provided options.
+
+    ``entrypoint`` is either a ``Callable`` (function) or a ``str`` (binary).
+    The number of copies is determined by the number of entries for ``args`` and
+    ``envs`` arguments, which need to have the same key set.
+
+    ``args`` and ``envs`` parameters are the arguments and environment variables
+    to pass down to the entrypoint mapped by the replica index (local rank).
+    All local ranks must be accounted for.
+    That is, the keyset should be ``{0,1,...,(nprocs-1)}``.
+
+    .. note:: When the ``entrypoint`` is a binary (``str``), ``args`` can only be strings.
+              If any other type is given, then it is cast to a string representation
+              (e.g. ``str(arg1)``). Furthermore, a binary failure will only write
+              an ``error.json`` error file if the main function is annotated with
+              ``torch.distributed.elastic.multiprocessing.errors.record``. For function launches,
+              this is done by default and there is no need to manually annotate
+              with the ``@record`` annotation.
+
+    ``redirects`` and ``tee`` (supplied via ``logs_specs``) are bitmasks specifying which
+    std stream(s) to redirect to a log file in the log directory. Valid mask values are defined in ``Std``.
+    To redirect/tee only certain local ranks, pass ``redirects`` as a map with the key as
+    the local rank to specify the redirect behavior for.
+    Any missing local ranks will default to ``Std.NONE``.
+
+    ``tee`` acts like the unix "tee" command in that it redirects + prints to console.
+    To avoid worker stdout/stderr from printing to console, use the ``redirects`` parameter.
+
+    For each process, the ``log_dir`` will contain:
+
+    #. ``{local_rank}/error.json``: if the process failed, a file with the error info
+    #. ``{local_rank}/stdout.json``: if ``redirect & STDOUT == STDOUT``
+    #. ``{local_rank}/stderr.json``: if ``redirect & STDERR == STDERR``
+
+    .. note:: It is expected that the ``log_dir`` exists, is empty, and is a directory.
+
+    Example:
+    ::
+
+     logs_specs = DefaultLogsSpecs(log_dir="/tmp/test")
+
+     # ok; two copies of foo: foo("bar0"), foo("bar1")
+     start_processes(
+        name="trainer",
+        entrypoint=foo,
+        args={0: ("bar0",), 1: ("bar1",)},
+        envs={0: {}, 1: {}},
+        logs_specs=logs_specs,
+     )
+
+     # invalid; envs missing for local rank 1
+     start_processes(
+        name="trainer",
+        entrypoint=foo,
+        args={0: ("bar0",), 1: ("bar1",)},
+        envs={0: {}},
+        logs_specs=logs_specs,
+     )
+
+     # ok; two copies of /usr/bin/touch: touch file1, touch file2
+     start_processes(
+        name="trainer",
+        entrypoint="/usr/bin/touch",
+        args={0: ("file1",), 1: ("file2",)},
+        envs={0: {}, 1: {}},
+        logs_specs=logs_specs,
+      )
+
+     # caution; arguments cast to string, runs:
+     # echo "1" "2" "3" and echo "[1, 2, 3]"
+     start_processes(
+        name="trainer",
+        entrypoint="/usr/bin/echo",
+        args={0: (1, 2, 3), 1: ([1, 2, 3],)},
+        envs={0: {}, 1: {}},
+        logs_specs=logs_specs,
+      )
+
+    Args:
+        name: a human readable short name that describes what the processes are
+              (used as header when tee'ing stdout/stderr outputs)
+        entrypoint: either a ``Callable`` (function) or ``cmd`` (binary)
+        args: arguments to each replica
+        envs: env vars to each replica
+        logs_specs: defines the log directory, redirects, tee and rank filters
+                    for the worker processes
+        log_line_prefixes: optional per-rank prefixes prepended to tee'd log lines
+        start_method: multiprocessing start method (spawn, fork, forkserver)
+                      ignored for binaries
+
+    """
+
+    nprocs = len(args)
+    _validate_full_rank(args, nprocs, "args")
+    _validate_full_rank(envs, nprocs, "envs")
+
+    context: PContext
+    if isinstance(entrypoint, str):
+        context = SubprocessContext(
+            name=name,
+            entrypoint=entrypoint,
+            args=args,
+            envs=envs,
+            logs_specs=logs_specs,
+            log_line_prefixes=log_line_prefixes,
+        )
+    else:
+        context = MultiprocessContext(
+            name=name,
+            entrypoint=entrypoint,
+            args=args,
+            envs=envs,
+            log_line_prefixes=log_line_prefixes,
+            start_method=start_method,
+            logs_specs=logs_specs,
+        )
+
+    try:
+        context.start()
+        return context
+    except Exception:
+        context.close()
+        raise
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..369c1c91a1162f966f61cde508d0cb778dd07443
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/__pycache__/api.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/__pycache__/api.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3e5912b0f70f85324bb42d495d7c107497a547fc
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/__pycache__/api.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/__pycache__/redirects.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/__pycache__/redirects.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d5d8beee573c6e40fe968bbe11789d1635bac105
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/__pycache__/redirects.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/__pycache__/tail_log.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/__pycache__/tail_log.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7c6df5103b1261e6193e01319799f2087c0ef6f8
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/__pycache__/tail_log.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/api.py b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f51a224ef3db62039a7dc014e656c15ab2bff23
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/api.py
@@ -0,0 +1,873 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import abc
+import logging
+import os
+import re
+import shutil
+import signal
+import subprocess
+import sys
+import tempfile
+import time
+from contextlib import nullcontext
+from dataclasses import dataclass, field
+from enum import IntFlag
+from multiprocessing import synchronize
+from types import FrameType
+from typing import Any, Callable, Dict, Optional, Set, Tuple, Union
+from abc import ABC, abstractmethod
+
+import torch.multiprocessing as mp
+from torch.distributed.elastic.multiprocessing.errors import ProcessFailure, record
+from torch.distributed.elastic.multiprocessing.redirects import (
+    redirect_stderr,
+    redirect_stdout,
+)
+
+from torch.distributed.elastic.multiprocessing.subprocess_handler import SubprocessHandler, get_subprocess_handler
+from torch.distributed.elastic.multiprocessing.tail_log import TailLog
+
+IS_WINDOWS = sys.platform == "win32"
+IS_MACOS = sys.platform == "darwin"
+
+
+log = logging.getLogger(__name__)
+
+__all__ = [
+    "DefaultLogsSpecs",
+    "SignalException",
+    "Std",
+    "to_map",
+    "RunProcsResult",
+    "PContext",
+    "get_std_cm",
+    "MultiprocessContext",
+    "SubprocessContext",
+]
+
+class SignalException(Exception):
+    """
+    Raised inside the torchelastic agent process by the termination handler
+    when the process receives a death signal.
+    """
+
+    def __init__(self, msg: str, sigval: signal.Signals) -> None:
+        super().__init__(msg)
+        self.sigval = sigval
+
+
+def _terminate_process_handler(signum: int, frame: Optional[FrameType]) -> None:
+    """Termination handler that raises exceptions on the main process.
+
+    When the process receives a death signal (SIGTERM, SIGINT), this termination handler is
+    invoked. It raises a ``SignalException`` that should be processed by the user code.
+    Python does not terminate the process after the termination handler finishes, so the
+    exception should not be silently ignored; otherwise the process will never be
+    terminated.
+    """
+    sigval = signal.Signals(signum)
+    raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
+
+
+def _get_kill_signal() -> signal.Signals:
+    """Get the kill signal. SIGKILL for unix, CTRL_C_EVENT for windows."""
+    if IS_WINDOWS:
+        return signal.CTRL_C_EVENT  # type: ignore[attr-defined] # noqa: F821
+    else:
+        return signal.SIGKILL
+
+
+def _get_default_signal() -> signal.Signals:
+    """Get the default termination signal. SIGTERM for unix, CTRL_C_EVENT for windows."""
+    if IS_WINDOWS:
+        return signal.CTRL_C_EVENT  # type: ignore[attr-defined] # noqa: F821
+    else:
+        return signal.SIGTERM
+
+
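+# NOTE (illustrative): for nprocs=2, both ``args`` and ``envs`` passed to start_processes
+# must have exactly the keys {0, 1}; any mismatch raises the RuntimeError below.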
+def _validate_full_rank(d: Dict[int, Any], nprocs: int, what: str):
+    actual_keys = set(d.keys())
+    expected_keys = set(range(nprocs))
+
+    if actual_keys != expected_keys:
+        raise RuntimeError(
+            f"{what}, local rank mapping mismatch,"
+            f" expected: {expected_keys}, actual: {actual_keys}"
+        )
+
+
+_MAPPING_REGEX = r"^(\d:[0123],)*(\d:[0123])$"
+_VALUE_REGEX = r"^[0123]$"
+
+
+class Std(IntFlag):
+    NONE = 0
+    OUT = 1
+    ERR = 2
+    ALL = OUT | ERR
+
+    @classmethod
+    def from_str(cls, vm: str) -> Union["Std", Dict[int, "Std"]]:
+        """
+        Example:
+        ::
+
+         from_str("0") -> Std.NONE
+         from_str("1") -> Std.OUT
+         from_str("0:3,1:0,2:1,3:2") -> {0: Std.ALL, 1: Std.NONE, 2: Std.OUT, 3: Std.ERR}
+
+        Any other input raises an exception
+        """
+
+        def to_std(v: str) -> Std:  # type: ignore[return]
+            s = Std(int(v))
+            if s in Std:
+                return s
+            # return None -> should NEVER reach here since we regex check input
+
+        if re.match(_VALUE_REGEX, vm):  # vm is a number (e.g. 0)
+            return to_std(vm)
+        elif re.match(_MAPPING_REGEX, vm):  # vm is a mapping (e.g. 0:1,1:2)
+            d: Dict[int, Std] = {}
+            for m in vm.split(","):
+                i, v = m.split(":")
+                d[int(i)] = to_std(v)
+            return d
+        else:
+            raise ValueError(
+                f"{vm} does not match: <{_VALUE_REGEX}> or <{_MAPPING_REGEX}>"
+            )
+
+
+def to_map(
+    val_or_map: Union[Std, Dict[int, Std]], local_world_size: int
+) -> Dict[int, Std]:
+    """
+    Certain APIs take redirect settings either as a single value (applied to all
+    local ranks) or as an explicit user-provided mapping. This convenience method
+    normalizes a value or mapping into a mapping.
+
+    Example:
+    ::
+
+     to_map(Std.OUT, local_world_size=2) # returns: {0: Std.OUT, 1: Std.OUT}
+     to_map({1: Std.OUT}, local_world_size=2) # returns: {0: Std.NONE, 1: Std.OUT}
+     to_map({0: Std.OUT, 1: Std.OUT}, local_world_size=2) # returns: {0: Std.OUT, 1: Std.OUT}
+    """
+    if isinstance(val_or_map, Std):
+        return dict.fromkeys(range(local_world_size), val_or_map)
+    else:
+        return {i: val_or_map.get(i, Std.NONE) for i in range(local_world_size)}
+
+
+@dataclass
+class LogsDest:
+    """
+    For each log type, holds mapping of local rank ids to file paths.
+    """
+    stdouts: Dict[int, str] = field(default_factory=dict)
+    stderrs: Dict[int, str] = field(default_factory=dict)
+    tee_stdouts: Dict[int, str] = field(default_factory=dict)
+    tee_stderrs: Dict[int, str] = field(default_factory=dict)
+    error_files: Dict[int, str] = field(default_factory=dict)
+
+
+class LogsSpecs(ABC):
+    """
+    Defines logs processing and redirection for each worker process.
+
+    Args:
+        log_dir:
+            Base directory where logs will be written.
+        redirects:
+            Streams to redirect to files. Pass a single ``Std``
+            enum to redirect for all workers, or a mapping keyed
+            by local_rank to selectively redirect.
+        tee:
+            Streams to duplicate to stdout/stderr.
+            Pass a single ``Std`` enum to duplicate streams for all workers,
+            or a mapping keyed by local_rank to selectively duplicate.
+    """
+
+    def __init__(
+        self,
+        log_dir: Optional[str] = None,
+        redirects: Union[Std, Dict[int, Std]] = Std.NONE,
+        tee: Union[Std, Dict[int, Std]] = Std.NONE,
+        local_ranks_filter: Optional[Set[int]] = None,
+    ) -> None:
+        self._root_log_dir = log_dir
+        self._redirects = redirects
+        self._tee = tee
+        self._local_ranks_filter = local_ranks_filter
+
+    @abstractmethod
+    def reify(self, envs: Dict[int, Dict[str, str]],) -> LogsDest:
+        """
+        Given the environment variables, builds the destination of the log files for each of the local ranks.
+
+        The ``envs`` parameter contains the env variables dict for each of the local ranks, where entries are defined in
+        :func:`~torchelastic.distributed.elastic.agent.server.local_elastic_agent.LocalElasticAgent._start_workers`.
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def root_log_dir(self) -> str:
+        pass
+
+class DefaultLogsSpecs(LogsSpecs):
+    """
+    Default LogsSpecs implementation:
+
+    - `log_dir` will be created if it doesn't exist
+    - Generates nested folders for each attempt and rank.
+    """
+    def __init__(
+        self,
+        log_dir: Optional[str] = None,
+        redirects: Union[Std, Dict[int, Std]] = Std.NONE,
+        tee: Union[Std, Dict[int, Std]] = Std.NONE,
+        local_ranks_filter: Optional[Set[int]] = None,
+    ) -> None:
+        if log_dir != os.devnull:
+            if not log_dir:
+                log_dir = tempfile.mkdtemp(prefix="torchelastic_")
+            elif not os.path.exists(log_dir):
+                os.makedirs(log_dir)
+            else:
+                if os.path.isfile(log_dir):
+                    raise NotADirectoryError(f"log_dir: {log_dir} is a file")
+        super().__init__(log_dir, redirects, tee, local_ranks_filter)
+        # initialized only once
+        self._run_log_dir = None
+
+    @property
+    def root_log_dir(self) -> str:
+        return str(self._root_log_dir)
+
+    def _make_log_dir(self, log_dir: Optional[str], rdzv_run_id: str):
+        base_log_dir = log_dir or tempfile.mkdtemp(prefix="torchelastic_")
+        os.makedirs(base_log_dir, exist_ok=True)
+        dir = tempfile.mkdtemp(prefix=f"{rdzv_run_id}_", dir=base_log_dir)
+        log.info("log directory set to: %s", dir)
+        return dir
+
+    def reify(self, envs: Dict[int, Dict[str, str]],) -> LogsDest:
+        """
+        Uses the following scheme to build log destination paths:
+
+        - `<log_dir>/<rdzv_run_id>/attempt_<restart_count>/<local_rank>/stdout.log`
+        - `<log_dir>/<rdzv_run_id>/attempt_<restart_count>/<local_rank>/stderr.log`
+        - `<log_dir>/<rdzv_run_id>/attempt_<restart_count>/<local_rank>/error.json`
+        """
+        nprocs = len(envs)
+        global_env = {}  # used only to query properties that are not rank-dependent
+        if nprocs > 0:
+            global_env = envs[0]
+        else:
+            log.warning("Empty envs map provided when defining logging destinations.")
+        # Keys are always defined, but values can be missing in unit tests
+        run_id = global_env.get("TORCHELASTIC_RUN_ID", "test_run_id")
+        restart_count = global_env.get("TORCHELASTIC_RESTART_COUNT", "0")
+
+        attempt_log_dir: str = ""
+        if self._root_log_dir != os.devnull:
+            if not self._run_log_dir:
+                self._run_log_dir = self._make_log_dir(self._root_log_dir, run_id)
+
+            attempt_log_dir = os.path.join(self._run_log_dir, f"attempt_{restart_count}")  # type: ignore[call-overload]
+            shutil.rmtree(attempt_log_dir, ignore_errors=True)
+            os.makedirs(attempt_log_dir)
+
+        if self._root_log_dir == os.devnull:
+            attempt_log_dir = os.devnull
+
+        # create subdirs for each local rank in the logs_dir
+        # logs_dir
+        #       |- 0
+        #          |- error.json
+        #          |- stdout.log
+        #          |- stderr.log
+        #       |- ...
+        #       |- (nprocs-1)
+        redirs = to_map(self._redirects, nprocs)
+        ts = to_map(self._tee, nprocs)
+
+        # to tee stdout/stderr we first redirect into a file,
+        # then tail -f stdout.log/stderr.log, so add the tee settings to the redirects
+        for local_rank, tee_std in ts.items():
+            redirect_std = redirs[local_rank]
+            redirs[local_rank] = redirect_std | tee_std
+
+        SYS_STREAM = ""  # special case to indicate to output to console
+        stdouts = dict.fromkeys(range(nprocs), SYS_STREAM)
+        stderrs = dict.fromkeys(range(nprocs), SYS_STREAM)
+        tee_stdouts: Dict[int, str] = {}
+        tee_stderrs: Dict[int, str] = {}
+        error_files = {}
+
+        for local_rank in range(nprocs):
+
+            if attempt_log_dir == os.devnull:
+                tee_stdouts[local_rank] = os.devnull
+                tee_stderrs[local_rank] = os.devnull
+                error_files[local_rank] = os.devnull
+                envs[local_rank]["TORCHELASTIC_ERROR_FILE"] = ""
+            else:
+                clogdir = os.path.join(attempt_log_dir, str(local_rank))
+                os.mkdir(clogdir)
+
+                rd = redirs[local_rank]
+                if (rd & Std.OUT) == Std.OUT:
+                    stdouts[local_rank] = os.path.join(clogdir, "stdout.log")
+                if (rd & Std.ERR) == Std.ERR:
+                    stderrs[local_rank] = os.path.join(clogdir, "stderr.log")
+
+                t = ts[local_rank]
+                if t & Std.OUT == Std.OUT:
+                    tee_stdouts[local_rank] = stdouts[local_rank]
+                if t & Std.ERR == Std.ERR:
+                    tee_stderrs[local_rank] = stderrs[local_rank]
+
+                if self._local_ranks_filter and local_rank not in self._local_ranks_filter:
+                    # If stream is tee'd, only write to file, but don't tail
+                    if local_rank in tee_stdouts:
+                        tee_stdouts.pop(local_rank, None)
+                    if local_rank in tee_stderrs:
+                        tee_stderrs.pop(local_rank, None)
+
+                    # If stream is not redirected, don't print
+                    if stdouts[local_rank] == SYS_STREAM:
+                        stdouts[local_rank] = os.devnull
+                    if stderrs[local_rank] == SYS_STREAM:
+                        stderrs[local_rank] = os.devnull
+
+                error_file = os.path.join(clogdir, "error.json")
+                error_files[local_rank] = error_file
+                log.info("Setting worker%s reply file to: %s", local_rank, error_file)
+                envs[local_rank]["TORCHELASTIC_ERROR_FILE"] = error_file
+
+        return LogsDest(stdouts, stderrs, tee_stdouts, tee_stderrs, error_files)
+
+    def __repr__(self) -> str:
+        return (
+            f"DefaultLogsSpecs(root_log_dir={self._root_log_dir}, redirects={self._redirects}, "
+            f"tee={self._tee}, local_ranks_filter={self._local_ranks_filter})"
+        )
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, DefaultLogsSpecs):
+            return False
+
+        return (
+            self._root_log_dir == other._root_log_dir
+            and self._redirects == other._redirects
+            and self._tee == other._tee
+            and self._local_ranks_filter == other._local_ranks_filter
+        )
+
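+# A minimal usage sketch of DefaultLogsSpecs (illustrative only; the log_dir value and
+# the envs mapping below are placeholders for what the elastic agent normally provides):
+#
+#   specs = DefaultLogsSpecs(log_dir="/tmp/torchelastic_logs", redirects=Std.ALL, tee=Std.OUT)
+#   dest = specs.reify(
+#       {
+#           0: {"TORCHELASTIC_RUN_ID": "run0", "TORCHELASTIC_RESTART_COUNT": "0"},
+#           1: {"TORCHELASTIC_RUN_ID": "run0", "TORCHELASTIC_RESTART_COUNT": "0"},
+#       }
+#   )
+#   # dest.stdouts[0]     -> <log_dir>/run0_<suffix>/attempt_0/0/stdout.log
+#   # dest.error_files[1] -> <log_dir>/run0_<suffix>/attempt_0/1/error.json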
+
+@dataclass
+class RunProcsResult:
+    """
+    Results of a completed run of processes started with ``start_processes()``. Returned by ``PContext``.
+
+    Note the following:
+
+    1. All fields are mapped by local rank
+    2. ``return_values`` - only populated for functions (not the binaries).
+    3. ``stdouts`` - path to stdout.log (empty string if no redirect)
+    4. ``stderrs`` - path to stderr.log (empty string if no redirect)
+
+    """
+
+    return_values: Dict[int, Any] = field(default_factory=dict)
+    failures: Dict[int, ProcessFailure] = field(default_factory=dict)
+    stdouts: Dict[int, str] = field(default_factory=dict)
+    stderrs: Dict[int, str] = field(default_factory=dict)
+
+    def is_failed(self) -> bool:
+        return len(self.failures) > 0
+
+
+class PContext(abc.ABC):
+    """
+    The base class that standardizes operations over a set of processes that are launched via different mechanisms.
+
+    The name ``PContext`` is intentionally chosen to disambiguate it from ``torch.multiprocessing.ProcessContext``.
+
+    .. warning:: stdouts and stderrs should ALWAYS be a superset of
+                 tee_stdouts and tee_stderrs (respectively), because
+                 tee is implemented as a redirect followed by a tail -f
+    """
+
+    def __init__(
+        self,
+        name: str,
+        entrypoint: Union[Callable, str],
+        args: Dict[int, Tuple],
+        envs: Dict[int, Dict[str, str]],
+        logs_specs: LogsSpecs,
+        log_line_prefixes: Optional[Dict[int, str]] = None,
+
+    ):
+        self.name = name
+        # validate that all mappings have the same number of keys and
+        # all local ranks are accounted for
+        nprocs = len(args)
+
+        # TODO log_line_prefixes can be expanded too
+        logs_dest = logs_specs.reify(envs)
+
+        _validate_full_rank(logs_dest.stdouts, nprocs, "stdouts")
+        _validate_full_rank(logs_dest.stderrs, nprocs, "stderrs")
+
+        self.entrypoint = entrypoint
+        self.args = args
+        self.envs = envs
+        self.stdouts = logs_dest.stdouts
+        self.stderrs = logs_dest.stderrs
+        self.error_files = logs_dest.error_files
+        self.nprocs = nprocs
+
+        self._stdout_tail = TailLog(name, logs_dest.tee_stdouts, sys.stdout, log_line_prefixes)
+        self._stderr_tail = TailLog(name, logs_dest.tee_stderrs, sys.stderr, log_line_prefixes)
+
+    def start(self) -> None:
+        """Start processes using parameters defined in the constructor."""
+        signal.signal(signal.SIGTERM, _terminate_process_handler)
+        signal.signal(signal.SIGINT, _terminate_process_handler)
+        if not IS_WINDOWS:
+            signal.signal(signal.SIGHUP, _terminate_process_handler)
+            signal.signal(signal.SIGQUIT, _terminate_process_handler)
+        self._start()
+        self._stdout_tail.start()
+        self._stderr_tail.start()
+
+    @abc.abstractmethod
+    def _start(self) -> None:
+        """Start processes using strategy defined in a particular context."""
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def _poll(self) -> Optional[RunProcsResult]:
+        """
+        Poll the run status of the processes running under this context.
+        This method follows an "all-or-nothing" policy and returns
+        a ``RunProcsResult`` object if either all processes complete
+        successfully or any process fails. Returns ``None`` if
+        all processes are still running.
+        """
+        raise NotImplementedError()
+
+    def wait(self, timeout: float = -1, period: float = 1) -> Optional[RunProcsResult]:
+        """
+        Wait for the specified ``timeout`` seconds, polling every ``period`` seconds
+        for the processes to be done. Returns ``None`` if the processes are still running
+        on timeout expiry. Negative timeout values are interpreted as "wait-forever".
+        A timeout value of zero simply queries the status of the processes (e.g. equivalent
+        to a poll).
+
+        .. note:: ``PContext.start()`` registers SIGTERM and SIGINT signal handlers that raise
+                ``SignalException`` when the signals are received. It is up to the consumer of the code
+                to properly handle the exception. It is important not to swallow the exception, otherwise
+                the process would not terminate. A typical workflow looks like:
+
+        .. code-block:: python
+
+            pc = start_processes(...)
+            try:
+                pc.wait(1)
+                # ... do some other work
+            except SignalException as e:
+                pc.shutdown(e.sigval, timeout=30)
+
+        If SIGTERM or SIGINT occurs, the code above will try to shutdown the child processes by propagating
+        the received signal. If the child processes do not terminate within the timeout, they will be sent SIGKILL.
+        """
+        if timeout == 0:
+            return self._poll()
+
+        if timeout < 0:
+            timeout = sys.maxsize
+
+        expiry = time.time() + timeout
+        while time.time() < expiry:
+            pr = self._poll()
+            if pr:
+                return pr
+            time.sleep(period)
+
+        return None
+
+    @abc.abstractmethod
+    def pids(self) -> Dict[int, int]:
+        """Return pids of processes mapped by their respective local_ranks."""
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def _close(self, death_sig: signal.Signals, timeout: int = 30) -> None:
+        r"""
+        Terminates all processes managed by this context and cleans up any
+        meta resources (e.g. redirect, error_file files).
+        """
+        raise NotImplementedError()
+
+    def close(
+        self, death_sig: Optional[signal.Signals] = None, timeout: int = 30
+    ) -> None:
+        r"""
+        Terminates all processes managed by this context and cleans up any
+        meta resources (e.g. redirect, error_file files).
+
+        Args:
+            death_sig: Death signal to terminate processes.
+            timeout: Time to wait for processes to finish, if process is
+                still alive after this time, it will be terminated via SIGKILL.
+        """
+        if not death_sig:
+            death_sig = _get_default_signal()
+        self._close(death_sig=death_sig, timeout=timeout)
+        if self._stdout_tail:
+            self._stdout_tail.stop()
+        if self._stderr_tail:
+            self._stderr_tail.stop()
+
+
+def get_std_cm(std_rd: str, redirect_fn):
+    if IS_WINDOWS or IS_MACOS or not std_rd:
+        return nullcontext()
+    else:
+        return redirect_fn(std_rd)
+
+
+def _wrap(
+    local_rank: int,
+    fn: Callable,
+    args: Dict[int, Tuple],
+    envs: Dict[int, Dict[str, str]],
+    stdout_redirects: Dict[int, str],  # redirect file for stdout (to console if None)
+    stderr_redirects: Dict[int, str],  # redirect file for stderr (to console if None)
+    ret_vals: Dict[int, mp.SimpleQueue],
+    queue_finished_reading_event: synchronize.Event,
+) -> None:
+    # get the per-rank params up front so we fail fast if no mapping is found
+    args_ = args[local_rank]
+    env_ = envs[local_rank]
+    ret_val_ = ret_vals[local_rank]
+
+    stdout_rd = stdout_redirects[local_rank]
+    stderr_rd = stderr_redirects[local_rank]
+
+    stdout_cm = get_std_cm(stdout_rd, redirect_stdout)
+    stderr_cm = get_std_cm(stderr_rd, redirect_stderr)
+
+    for k, v in env_.items():
+        os.environ[k] = v
+
+    with stdout_cm, stderr_cm:
+        ret = record(fn)(*args_)
+    ret_val_.put(ret)
+    queue_finished_reading_event.wait()
+
+
+class MultiprocessContext(PContext):
+    """``PContext`` holding worker processes invoked as a function."""
+
+    def __init__(
+        self,
+        name: str,
+        entrypoint: Callable,
+        args: Dict[int, Tuple],
+        envs: Dict[int, Dict[str, str]],
+        start_method: str,
+        logs_specs: LogsSpecs,
+        log_line_prefixes: Optional[Dict[int, str]] = None,
+    ):
+        super().__init__(
+            name,
+            entrypoint,
+            args,
+            envs,
+            logs_specs,
+            log_line_prefixes,
+        )
+
+        self.start_method = start_method
+        # each ret_val queue will always contain a single element.
+        self._ret_vals = {
+            local_rank: mp.get_context(self.start_method).SimpleQueue()
+            for local_rank in range(self.nprocs)
+        }
+
+        # see comments in ``join()`` for what this is
+        self._return_values: Dict[int, Any] = {}
+        self._pc: Optional[mp.ProcessContext] = None
+        # Note: the set() method should ONLY be invoked for the use case when all processes finished
+        # successfully. If any process died on event.wait(), calling the set() method will deadlock.
+        self._worker_finished_event = mp.get_context(self.start_method).Event()
+
+    def _start(self):
+        if self._pc:
+            raise ValueError(
+                "The process context already initialized."
+                " Most likely the start method got called twice."
+            )
+        self._pc = mp.start_processes(
+            fn=_wrap,
+            args=(
+                self.entrypoint,
+                self.args,
+                self.envs,
+                self.stdouts,
+                self.stderrs,
+                self._ret_vals,
+                self._worker_finished_event,
+            ),
+            nprocs=self.nprocs,
+            join=False,
+            daemon=False,
+            start_method=self.start_method,
+        )
+
+    def _is_done(self) -> bool:
+        return len(self._return_values) == self.nprocs
+
+    def _poll(self) -> Optional[RunProcsResult]:
+        assert self._pc is not None  # assertion for mypy type checker
+
+        try:
+            # torch.mp.ProcessContext throws an Exception if some/all of the
+            # worker processes failed
+            # timeout < 0 checks worker status and returns immediately
+            # Join will never return success since we use a synchronize.Event to wait
+            # for all processes to finish.
+            self._pc.join(-1)
+
+            # IMPORTANT: we use multiprocessing.Queue to carry worker return values
+            # back to the parent, the worker process will wait before terminating
+            # until all the buffered items are fed by the feeder thread to the underlying
+            # pipe. Hence to prevent deadlocks on large return values,
+            # we opportunistically try queue.get on each join call
+            # See: https://docs.python.org/2/library/multiprocessing.html#all-platforms
+            for local_rank in range(0, self.nprocs):
+                return_queue = self._ret_vals[local_rank]
+                if not return_queue.empty():
+                    # save the return values temporarily into a member var
+                    self._return_values[local_rank] = return_queue.get()
+
+            if self._is_done():
+                # we should ALWAYS have ALL the return values when all the processes are done
+                self._worker_finished_event.set()
+                # Wait until all processes are finished. At this point the workers have
+                # finished executing the user function.
+                self._pc.join()
+                _validate_full_rank(
+                    self._return_values, self.nprocs, "return_value queue"
+                )
+                self.close()
+                return RunProcsResult(
+                    return_values=self._return_values,
+                    stdouts=self.stdouts,
+                    stderrs=self.stderrs,
+                )
+            else:
+                return None
+        except (mp.ProcessRaisedException, mp.ProcessExitedException) as e:
+            failed_local_rank = e.error_index
+
+            # entrypoint for MultiprocessContext will always be a Callable
+            fn_name = self.entrypoint.__qualname__  # type: ignore[union-attr]
+            failed_proc = self._pc.processes[failed_local_rank]
+            error_filepath = self.error_files[failed_local_rank]
+
+            log.exception(
+                "failed (exitcode: %s)"
+                " local_rank: %s (pid: %s)"
+                " of fn: %s (start_method: %s)",
+                failed_proc.exitcode,
+                failed_local_rank, e.pid,
+                fn_name, self.start_method,
+            )
+
+            self.close()
+            return RunProcsResult(
+                failures={
+                    failed_local_rank: ProcessFailure(
+                        local_rank=failed_local_rank,
+                        pid=e.pid,
+                        exitcode=failed_proc.exitcode,
+                        error_file=error_filepath,
+                    )
+                },
+                stdouts=self.stdouts,
+                stderrs=self.stderrs,
+            )
+
+    def pids(self) -> Dict[int, int]:
+        assert self._pc is not None  # assertion for mypy type checking
+        return dict(enumerate(self._pc.pids()))
+
+    def _close(self, death_sig: signal.Signals, timeout: int = 30) -> None:
+        if not self._pc:
+            return
+        for proc in self._pc.processes:
+            if proc.is_alive():
+                log.warning("Closing process %s via signal %s", proc.pid, death_sig.name)
+                try:
+                    os.kill(proc.pid, death_sig)
+                except ProcessLookupError:
+                    # If the process exited because of some reason,
+                    # `ProcessLookupError` will be raised, it is safe to ignore it.
+                    pass
+        end = time.monotonic() + timeout
+        for proc in self._pc.processes:
+            time_to_wait = end - time.monotonic()
+            if time_to_wait <= 0:
+                break
+            proc.join(time_to_wait)
+        for proc in self._pc.processes:
+            if proc.is_alive():
+                log.warning(
+                    "Unable to shutdown process %s via %s, forcefully exiting via %s",
+                    proc.pid, death_sig, _get_kill_signal()
+                )
+                try:
+                    os.kill(proc.pid, _get_kill_signal())
+                except ProcessLookupError:
+                    # If the process exited because of some reason,
+                    # `ProcessLookupError` will be raised, it is safe to ignore it.
+                    pass
+            proc.join()
+
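+# A minimal usage sketch of MultiprocessContext (illustrative only; ``train`` stands in
+# for a user-provided picklable function):
+#
+#   ctx = MultiprocessContext(
+#       name="trainer",
+#       entrypoint=train,
+#       args={0: ("hello",), 1: ("world",)},
+#       envs={0: {"RANK": "0"}, 1: {"RANK": "1"}},
+#       start_method="spawn",
+#       logs_specs=DefaultLogsSpecs(),
+#   )
+#   ctx.start()
+#   result = ctx.wait()  # blocks until done; returns a RunProcsResult
+#   if result and not result.is_failed():
+#       print(result.return_values)  # {0: ..., 1: ...}
+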
+class SubprocessContext(PContext):
+    """``PContext`` holding worker processes invoked as a binary."""
+
+    def __init__(
+        self,
+        name: str,
+        entrypoint: str,
+        args: Dict[int, Tuple],
+        envs: Dict[int, Dict[str, str]],
+        logs_specs: LogsSpecs,
+        log_line_prefixes: Optional[Dict[int, str]] = None,
+
+    ):
+        super().__init__(
+            name,
+            entrypoint,
+            args,
+            envs,
+            logs_specs,
+            log_line_prefixes,
+        )
+
+        # state vector; a local_rank is removed from _running_local_ranks once it finishes
+        self._running_local_ranks: Set[int] = set(range(self.nprocs))
+        self._failures: Dict[int, ProcessFailure] = {}
+        self.subprocess_handlers: Dict[int, SubprocessHandler] = {}
+
+    def _start(self):
+        if self.subprocess_handlers:
+            raise ValueError(
+                "The subprocess handlers already initialized. Most likely the start method got called twice."
+            )
+        self.subprocess_handlers = {
+            local_rank: get_subprocess_handler(
+                entrypoint=self.entrypoint,  # type: ignore[arg-type] # entrypoint is always a str
+                args=self.args[local_rank],
+                env=self.envs[local_rank],
+                stdout=self.stdouts[local_rank],
+                stderr=self.stderrs[local_rank],
+                local_rank_id=local_rank,
+            )
+            for local_rank in range(self.nprocs)
+        }
+
+    def _poll(self) -> Optional[RunProcsResult]:
+        done_local_ranks = set()
+        for local_rank in self._running_local_ranks:
+            handler = self.subprocess_handlers[local_rank]
+            exitcode = handler.proc.poll()
+            if exitcode is not None:
+                done_local_ranks.add(local_rank)
+                if exitcode != 0:  # failed or signaled
+                    self._failures[local_rank] = ProcessFailure(
+                        local_rank=local_rank,
+                        pid=handler.proc.pid,
+                        exitcode=exitcode,
+                        error_file=self.error_files[local_rank],
+                    )
+                # else: --> succeeded; nothing to do
+
+        self._running_local_ranks.difference_update(done_local_ranks)
+
+        # if ALL procs are finished or ANY have failed
+        if not self._running_local_ranks or self._failures:
+            self.close()  # terminate all running procs
+            result = RunProcsResult(
+                failures=self._failures,
+                stdouts=self.stdouts,
+                stderrs=self.stderrs,
+            )
+            if result.is_failed():
+                first_failure = min(result.failures.values(), key=lambda f: f.timestamp)
+                log.error(
+                    "failed (exitcode: %s)"
+                    " local_rank: %s (pid: %s)"
+                    " of binary: %s",
+                    first_failure.exitcode, first_failure.local_rank, first_failure.pid, self.entrypoint
+                )
+            else:
+                # Populate return_values with dummy values. This provides consistency with MultiprocessContext
+                result.return_values = dict.fromkeys(range(self.nprocs))
+
+            return result
+        else:  # there are no failures and procs still running
+            return None
+
+    def pids(self) -> Dict[int, int]:
+        return {
+            local_rank: sh.proc.pid
+            for local_rank, sh in self.subprocess_handlers.items()
+        }
+
+    def _close(self, death_sig: signal.Signals, timeout: int = 30) -> None:
+        if not self.subprocess_handlers:
+            return
+        for handler in self.subprocess_handlers.values():
+            if handler.proc.poll() is None:
+                log.warning(
+                    "Sending process %s closing signal %s", handler.proc.pid, death_sig.name
+                )
+                handler.close(death_sig=death_sig)
+        end = time.monotonic() + timeout
+        for handler in self.subprocess_handlers.values():
+            time_to_wait = end - time.monotonic()
+            if time_to_wait <= 0:
+                break
+            try:
+                handler.proc.wait(time_to_wait)
+            except subprocess.TimeoutExpired:
+                # Ignore the timeout expired exception, since
+                # the child process will be forcefully terminated via SIGKILL
+                pass
+        for handler in self.subprocess_handlers.values():
+            if handler.proc.poll() is None:
+                log.warning(
+                    "Unable to shutdown process %s via %s, forcefully exiting via %s",
+                    handler.proc.pid, death_sig, _get_kill_signal()
+                )
+                handler.close(death_sig=_get_kill_signal())
+                handler.proc.wait()
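+
+
+# A minimal usage sketch of SubprocessContext for binary entrypoints (illustrative only;
+# the entrypoint and argument values are placeholders):
+#
+#   ctx = SubprocessContext(
+#       name="echo",
+#       entrypoint="/bin/echo",
+#       args={0: ("hello",), 1: ("world",)},
+#       envs={0: {}, 1: {}},
+#       logs_specs=DefaultLogsSpecs(),
+#   )
+#   ctx.start()
+#   result = ctx.wait()  # return_values hold dummy None entries for binaries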
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c06f1dcb2b61233b0ccf426f72806a3c124db61
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py
@@ -0,0 +1,375 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Each host in a distributed PyTorch job runs with a single TorchElastic agent,
+and multiple workers (as children processes of the TorchElastic agent).
+Since the workers are user-provided (your PyTorch script/job), TorchElastic
+has a way to propagate errors on the trainers through the agent and up to the
+scheduler, which ultimately informs the end-user about the state of the job
+and applies any retry policies.
+
+TorchElastic categorizes errors into 3 categories:
+
++----------------+----------------+--------------------------------------------------------------+
+| Category       | Sub-Category   |  Description                                                 |
++================+================+==============================================================+
+| User Error     | Input Error    | invalid inputs to TorchElastic APIs (e.g. min > max nodes)   |
+|                +----------------+--------------------------------------------------------------+
+|                | Worker Failure | any failures on the worker child process                     |
++----------------+----------------+--------------------------------------------------------------+
+| Platform Error |      n/a       | failures caused by the agent                                 |
++----------------+----------------+--------------------------------------------------------------+
+| Infra Error    |      n/a       | failures outside the domain of the agent and workers         |
+|                |                | (e.g. host failures)                                         |
++----------------+----------------+--------------------------------------------------------------+
+
+All errors other than "Worker Failure" are either raised canonically from the
+agent process or implicitly or explicitly crash the agent process. So the
+standard language (python) provided exception handling strategies apply.
+
+Worker Failures are special because the exception/failure originates on a different
+process from the agent so the error needs to be propagated inter-process
+(e.g. the agent cannot simply ``try-catch`` an exception raised on the worker process).
+
+TorchElastic agents use :func:`torch.distributed.elastic.multiprocessing.start_processes`
+to launch the workers which has a simple file based inter-process error propagation
+built-in.
+
+Any function or binary entrypoint decorated with :func:`record`
+will write uncaught exceptions (with the trace information) to a file specified by the
+environment variable ``TORCHELASTIC_ERROR_FILE``. The parent process (e.g. agent)
+sets this env var on each child it launches, then aggregates the error files for all
+children, and propagates the one with the **smallest** timestamp (e.g. the **first** error).
+"""
+
+import json
+import os
+import signal
+import socket
+import time
+import warnings
+from dataclasses import dataclass, field
+from datetime import datetime
+from functools import wraps
+from string import Template
+from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar
+
+from torch.distributed.elastic.utils.logging import get_logger
+
+from .error_handler import ErrorHandler  # noqa: F401
+from .handlers import get_error_handler  # noqa: F401
+
+__all__ = ["ProcessFailure", "ChildFailedError", "record", "ErrorHandler", "get_error_handler"]
+
+log = get_logger(__name__)
+
+
+JSON = Dict
+
+_EMPTY_ERROR_DATA = {"message": ""}
+_NOT_AVAILABLE = "<N/A>"
+
+T = TypeVar("T")
+
+
+@dataclass
+class ProcessFailure:
+    """
+    Represents a failed process result. When a worker process fails, it may record the failure root cause into a file.
+
+    Tries to read the failure timestamp from the provided ``error_file``,
+    if the ``error_file`` does not exist, the timestamp is the current
+    timestamp (seconds since epoch).
+
+    The ``message`` field is a concise explanation of the failure. If
+    the error file exists then the message is obtained from the error file.
+    Otherwise one is generated based on the failure signature.
+
+    .. note:: It is assumed that the ``error_file`` is written by
+              ``torch.distributed.elastic.multiprocessing.errors.error_handler.ErrorHandler``.
+              Otherwise the behavior is undefined.
+
+    """
+
+    local_rank: int
+    pid: int
+    exitcode: int
+    error_file: str
+    error_file_data: JSON = field(init=False)
+    message: str = field(init=False)
+    timestamp: int = field(init=False)
+
+    def __post_init__(self):
+        self.error_file_data = _EMPTY_ERROR_DATA
+        if os.path.isfile(self.error_file):
+            try:
+                with open(self.error_file) as fp:
+                    self.error_file_data = json.load(fp)
+                    log.debug(
+                        "User process failed with error data: %s", json.dumps(self.error_file_data, indent=2)
+                    )
+                    self.message, self.timestamp = self._get_error_data(
+                        self.error_file_data
+                    )
+            except Exception:
+                log.exception("Failed to parse reply file: %s", self.error_file)
+                raise
+        else:
+            self._set_no_reply_file()
+
+        # make up an informative message if not already present
+        if not self.message:
+            # signals typically do not generate an error file message
+            if self.exitcode < 0:
+                self.message = (
+                    f"Signal {-self.exitcode} ({self.signal_name()})"
+                    f" received by PID {self.pid}"
+                )
+            else:
+                self.message = "To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html"
+
+    def _get_error_data(self, error_file_data: Dict[str, Any]) -> Tuple[str, int]:
+        message = error_file_data["message"]
+        if isinstance(message, str):
+            timestamp = int(error_file_data.get("timestamp", 0))
+        else:
+            timestamp = int(message["extraInfo"]["timestamp"])
+        return (message, timestamp)
+
+    def _set_no_reply_file(self):
+        self.error_file = _NOT_AVAILABLE
+        self.error_file_data = _EMPTY_ERROR_DATA
+        self.message = ""
+        self.timestamp = int(time.time())
+
+    def signal_name(self) -> str:
+        if self.exitcode < 0:
+            # We don't want to kill the parent process trying to find the signal name.
+            # if the signal doesn't map to a known name, use not available.
+            try:
+                return signal.Signals(-self.exitcode).name
+            except Exception:
+                return _NOT_AVAILABLE
+        else:
+            return _NOT_AVAILABLE
+
+    def timestamp_isoformat(self):
+        """Return timestamp in ISO format (YYYY-MM-DD_HH:MM:SS)."""
+        return datetime.fromtimestamp(self.timestamp).isoformat(sep="_")
+
+
+GlobalRank = int
+
+_FAILURE_FORMAT_TEMPLATE = """[${idx}]:
+  time      : ${time}
+  host      : ${hostname}
+  rank      : ${rank} (local_rank: ${local_rank})
+  exitcode  : ${exitcode} (pid: ${pid})
+  error_file: ${error_file}
+  traceback : ${message}"""
+
+# extra new lines before and after are intentional
+_MSG_FORMAT_TEMPLATE = """
+${boarder}
+${title}
+${section}
+Failures:
+${other_failures}
+${section}
+Root Cause (first observed failure):
+${root_failure}
+${boarder}"""
+
+
+class ChildFailedError(Exception):
+    """
+    Special exception type that can be raised from a function annotated with the
+    ``@record`` decorator to have the child process' (root exception) propagate
+    up the stack as-is (e.g. without being wrapped in the parent's traceback).
+
+    Useful in cases where the parent is a simple nanny process
+    and the child (worker) processes are actually doing meaningful compute.
+    In this case, errors typically occur on the child process as the parent
+    is not doing anything non-trivial, and child errors should be propagated
+    to the scheduler for accurate root cause diagnostics.
+
+    .. note:: The propagation relies on error files rather than exception handling to
+              support both function and binary launches.
+
+    Example:
+    ::
+
+     # process tree on a host (container)
+     0: scheduler-init-process:
+                |- 1: torchelastic_agent:
+                         |- 2: trainer_0 (ok)
+                         |- 3: trainer_1 (fail) -> error.json
+                         |- ...
+                         |- n+2: trainer_n (ok)
+                |- n+3: other processes
+                |- ...
+
+    In the example above, trainer 1's failure (written into error.json) is
+    the root cause and should be reported to the scheduler's init process.
+    The torchelastic agent raises a ``ChildFailedError("trainer", {1: "trainer_1/error.json"})``
+    upon detecting trainer 1's failure which would propagate the contents
+    of trainer 1's error file to the scheduler's init process.
+    """
+
+    def __init__(self, name: str, failures: Dict[GlobalRank, ProcessFailure]):
+        self.name = name
+        self.failures = failures
+        assert (
+            self.failures
+        )  # it does not make sense to create a ChildFailedError with no failures
+        super().__init__(self.format_msg())
+
+    def get_first_failure(self) -> Tuple[GlobalRank, ProcessFailure]:
+        rank = min(self.failures.keys(), key=lambda r: self.failures[r].timestamp)
+        return rank, self.failures[rank]
+
+    def format_msg(self, boarder_delim="=", section_delim="-"):
+        title = f"{self.name} FAILED"
+        root_rank, root_failure = self.get_first_failure()
+
+        root_failure_fmt: str = ""
+        other_failures_fmt: List[str] = []
+        width = len(title)
+        for idx, (rank, failure) in enumerate(self.failures.items()):
+            fmt, w = self._format_failure(idx, rank, failure)
+            width = max(width, w)
+            if rank == root_rank:
+                root_failure_fmt = fmt
+            else:
+                other_failures_fmt.append(fmt)
+
+        # upper boundary on width
+        width = min(width, 60)
+
+        return Template(_MSG_FORMAT_TEMPLATE).substitute(
+            boarder=boarder_delim * width,
+            title=title,
+            section=section_delim * width,
+            root_failure=root_failure_fmt,
+            other_failures="\n".join(other_failures_fmt or ["  <NO_OTHER_FAILURES>"]),
+        )
+
+    def _format_failure(
+        self, idx: int, rank: int, failure: ProcessFailure
+    ) -> Tuple[str, int]:
+
+        # failure.message is either a str (when the failure does not generate a traceback - e.g. signals)
+        # or a dict (json) of the form
+        # {"message": $ERROR_MSG, "extraInfo": {"py_callstack": $TRACEBACK, "timestamp": $TS}}
+        # so the display logic is:
+        # 1. if failure.message is not a dict (it is a str) just show it as is
+        # 2. else try to get the traceback (py_callstack)
+        # 3.      if the traceback is not there, use the message
+        # 4.      if the message is not there, fall back to an empty string
+        msg = failure.message
+        if isinstance(failure.message, dict):
+            msg = (
+                failure.message.get("extraInfo", {})
+                .get("py_callstack", failure.message.get("message", ""))
+                .replace("\n", "\n  ")  # to properly indent the traceback
+            )
+
+        fmt = Template(_FAILURE_FORMAT_TEMPLATE).substitute(
+            idx=idx,
+            time=failure.timestamp_isoformat(),
+            hostname=socket.getfqdn(),
+            rank=rank,
+            local_rank=failure.local_rank,
+            exitcode=failure.exitcode,
+            pid=failure.pid,
+            error_file=failure.error_file,
+            message=msg,
+        )
+        width = 0
+        for line in fmt.split("\n"):
+            width = max(width, len(line))
+        return fmt, width
+
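+# A minimal usage sketch of ProcessFailure/ChildFailedError (illustrative only; the pid,
+# exitcode and error_file values are made up):
+#
+#   failure = ProcessFailure(local_rank=1, pid=997, exitcode=-9, error_file="/does/not/exist.json")
+#   raise ChildFailedError("trainer", {1: failure})
+#
+# Since the exit code is negative and no error file exists, ``failure.message`` becomes
+# "Signal 9 (SIGKILL) received by PID 997" and ``format_msg()`` renders it as the root
+# cause in the banner defined by ``_MSG_FORMAT_TEMPLATE``.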
+
+def record(
+    fn: Callable[..., T], error_handler: Optional[ErrorHandler] = None
+) -> Callable[..., T]:
+    """
+    Syntactic sugar to record errors/exceptions that happened in the decorated
+    function using the provided ``error_handler``.
+
+    Using this decorator is equivalent to:
+
+    ::
+
+     error_handler = get_error_handler()
+     error_handler.initialize()
+     try:
+        foobar()
+     except ChildFailedError as e:
+        _, failure = e.get_first_failure()
+        error_handler.dump_error_file(failure.error_file, failure.exitcode)
+        raise
+     except Exception as e:
+        error_handler.record(e)
+        raise
+
+    .. important:: use this decorator once per process at the top level method,
+                   typically this is the main method.
+
+    Example
+
+    ::
+
+     @record
+     def main():
+         pass
+
+     if __name__=="__main__":
+        main()
+
+    """
+    if not error_handler:
+        error_handler = get_error_handler()
+
+    def wrap(f):
+        @wraps(f)
+        def wrapper(*args, **kwargs):
+            assert error_handler is not None  # assertion for mypy type checker
+            error_handler.initialize()
+            try:
+                return f(*args, **kwargs)
+            except SystemExit as se:
+                # For run_path based entrypoints, SystemExit with code = 0 will never exit.
+                # Handling it here by returning a value:
+                if se.code == 0:
+                    return None
+                else:
+                    raise
+            except ChildFailedError as e:
+                rank, failure = e.get_first_failure()
+                if failure.error_file != _NOT_AVAILABLE:
+                    error_handler.dump_error_file(failure.error_file, failure.exitcode)
+                else:
+                    log.info(
+                        "local_rank %s FAILED with no error file."
+                        " Decorate your entrypoint fn with @record for traceback info."
+                        " See: https://pytorch.org/docs/stable/elastic/errors.html",
+                        rank,
+                    )
+                raise
+            except Exception as e:
+                error_handler.record_exception(e)
+                raise
+
+        return wrapper
+
+    return wrap(fn)
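+
+
+# A minimal usage sketch of error propagation (illustrative only; ``main`` and the env var
+# value are placeholders -- the elastic agent normally sets TORCHELASTIC_ERROR_FILE):
+#
+#   os.environ["TORCHELASTIC_ERROR_FILE"] = "/tmp/worker0_error.json"
+#
+#   @record
+#   def main():
+#       raise RuntimeError("boom")
+#
+#   main()  # the uncaught RuntimeError is written to /tmp/worker0_error.json and re-raised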
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/errors/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/errors/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0b08155fe28c931f8a6bddf27644caf35646a9c4
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/errors/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/errors/__pycache__/error_handler.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/errors/__pycache__/error_handler.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7f61d3650956ab271bc578bdc9daea4700063982
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/errors/__pycache__/error_handler.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/errors/__pycache__/handlers.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/errors/__pycache__/handlers.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2b176a6050b4e27cbda8f56ad21a8f479c0bd40a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/errors/__pycache__/handlers.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/errors/error_handler.py b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/errors/error_handler.py
new file mode 100644
index 0000000000000000000000000000000000000000..9986e7ac7822c579ae5d64d03a9754359303b8a7
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/errors/error_handler.py
@@ -0,0 +1,159 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import faulthandler
+import json
+import logging
+import os
+import time
+import traceback
+import warnings
+from typing import Any, Dict, Optional
+
+__all__ = ['ErrorHandler']
+
+log = logging.getLogger(__name__)
+
+
+class ErrorHandler:
+    """
+    Write the provided exception object along with some other metadata about
+    the error in a structured way in JSON format to an error file specified by the
+    environment variable: ``TORCHELASTIC_ERROR_FILE``. If this environment
+    variable is not set, then simply logs the contents of what would have been
+    written to the error file.
+
+    This handler may be subclassed to customize the handling of the error.
+    Subclasses should override ``initialize()`` and ``record_exception()``.
+    """
+
+    def _get_error_file_path(self) -> Optional[str]:
+        """
+        Return the error file path.
+
+        May return ``None`` to have the structured error be logged only.
+        """
+        return os.environ.get("TORCHELASTIC_ERROR_FILE", None)
+
+    def initialize(self) -> None:
+        """
+        Call prior to running code that we wish to capture errors/exceptions.
+
+        Typically registers signal/fault handlers. Users can override this
+        function to add custom initialization/registrations that aid in
+        propagation/information of errors/signals/exceptions/faults.
+        """
+        try:
+            faulthandler.enable(all_threads=True)
+        except Exception as e:
+            warnings.warn(f"Unable to enable fault handler. {type(e).__name__}: {e}")
+
+    def _write_error_file(self, file_path: str, error_msg: str) -> None:
+        """Write error message to the file."""
+        try:
+            with open(file_path, "w") as fp:
+                fp.write(error_msg)
+        except Exception as e:
+            warnings.warn(f"Unable to write error to file. {type(e).__name__}: {e}")
+
+    def record_exception(self, e: BaseException) -> None:
+        """
+        Write a structured information about the exception into an error file in JSON format.
+
+        If the error file cannot be determined, then logs the content
+        that would have been written to the error file.
+        """
+        file = self._get_error_file_path()
+        if file:
+            data = {
+                "message": {
+                    "message": f"{type(e).__name__}: {e}",
+                    "extraInfo": {
+                        "py_callstack": traceback.format_exc(),
+                        "timestamp": str(int(time.time())),
+                    },
+                }
+            }
+            with open(file, "w") as fp:
+                json.dump(data, fp)
+
+    def override_error_code_in_rootcause_data(
+        self,
+        rootcause_error_file: str,
+        rootcause_error: Dict[str, Any],
+        error_code: int = 0,
+    ):
+        """Modify the rootcause_error read from the file, to correctly set the exit code."""
+        if "message" not in rootcause_error:
+            log.warning(
+                "child error file (%s) does not have field `message`. \n"
+                "cannot override error code: %s",
+                rootcause_error_file, error_code
+            )
+        elif isinstance(rootcause_error["message"], str):
+            log.warning(
+                "child error file (%s) has a new message format. \n"
+                "skipping error code override",
+                rootcause_error_file
+            )
+        else:
+            rootcause_error["message"]["errorCode"] = error_code
+
+    def dump_error_file(self, rootcause_error_file: str, error_code: int = 0):
+        """Dump parent error file from child process's root cause error and error code."""
+        with open(rootcause_error_file) as fp:
+            rootcause_error = json.load(fp)
+            # Override error code since the child process cannot capture the error code if it
+            # is terminated by signals like SIGSEGV.
+            if error_code:
+                self.override_error_code_in_rootcause_data(rootcause_error_file, rootcause_error, error_code)
+            log.debug(
+                "child error file (%s) contents:\n"
+                "%s",
+                rootcause_error_file, json.dumps(rootcause_error, indent=2)
+            )
+
+        my_error_file = self._get_error_file_path()
+        if my_error_file:
+            # Guard against existing error files.
+            # This can happen when the child is created using multiprocessing and the
+            # same env var (TORCHELASTIC_ERROR_FILE) is used on the parent and child to
+            # specify their respective error files. The env vars on the child are set in
+            # the wrapper function, but by default the child inherits the parent's env
+            # vars; so if the child process receives a signal before the wrapper function
+            # kicks in and the signal handler writes to the error file, then the child
+            # will write to the parent's error file. In this case just log the original
+            # error file contents and overwrite the error file.
+            self._rm(my_error_file)
+            self._write_error_file(my_error_file, json.dumps(rootcause_error))
+            log.info("dumped error file to parent's %s", my_error_file)
+        else:
+            log.error(
+                "no error file defined for parent, to copy child error file (%s)", rootcause_error_file
+            )
+
+    def _rm(self, my_error_file):
+        if os.path.isfile(my_error_file):
+            # Log the contents of the original file.
+            with open(my_error_file) as fp:
+                try:
+                    original = json.dumps(json.load(fp), indent=2)
+                    log.warning(
+                        "%s already exists"
+                        " and will be overwritten."
+                        " Original contents:\n%s",
+                        my_error_file, original
+                    )
+                except json.decoder.JSONDecodeError:
+                    log.warning(
+                        "%s already exists"
+                        " and will be overwritten."
+                        " Unable to load original contents:\n",
+                        my_error_file
+                    )
+            os.remove(my_error_file)
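+
+
+# A minimal usage sketch of ErrorHandler (illustrative only; the file path is a placeholder):
+#
+#   os.environ["TORCHELASTIC_ERROR_FILE"] = "/tmp/error.json"
+#   handler = ErrorHandler()
+#   handler.initialize()
+#   try:
+#       raise ValueError("boom")
+#   except ValueError as e:
+#       handler.record_exception(e)  # writes {"message": {...}} JSON to /tmp/error.json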
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/errors/handlers.py b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/errors/handlers.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d4eac0899df723686a433ef75ba0760623594af
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/errors/handlers.py
@@ -0,0 +1,16 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# Multiprocessing error-reporting module
+
+
+from torch.distributed.elastic.multiprocessing.errors.error_handler import ErrorHandler
+
+__all__ = ['get_error_handler']
+
+def get_error_handler():
+    return ErrorHandler()
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/redirects.py b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/redirects.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c3bcb5cdf1f43e10b9b43cab2684298c8dc7a64
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/redirects.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Taken and modified from original source:
+# https://eli.thegreenplace.net/2015/redirecting-all-kinds-of-stdout-in-python/
+import ctypes
+import logging
+import os
+import sys
+from contextlib import contextmanager
+from functools import partial
+
+IS_WINDOWS = sys.platform == "win32"
+IS_MACOS = sys.platform == "darwin"
+
+
+logger = logging.getLogger(__name__)
+
+
+def get_libc():
+    if IS_WINDOWS or IS_MACOS:
+        logger.warning(
+            "NOTE: Redirects are currently not supported in Windows or MacOs."
+        )
+        return None
+    else:
+        return ctypes.CDLL("libc.so.6")
+
+
+libc = get_libc()
+
+
+def _c_std(stream: str):
+    return ctypes.c_void_p.in_dll(libc, stream)
+
+
+def _python_std(stream: str):
+    return {"stdout": sys.stdout, "stderr": sys.stderr}[stream]
+
+
+_VALID_STD = {"stdout", "stderr"}
+
+
+@contextmanager
+def redirect(std: str, to_file: str):
+    """
+    Redirect ``std`` (one of ``"stdout"`` or ``"stderr"``) to a file in the path specified by ``to_file``.
+
+    This method redirects the underlying std file descriptor (not just python's ``sys.stdout|stderr``).
+    See usage for details.
+
+    The directory of ``to_file`` is assumed to exist and the destination file
+    is overwritten if it already exists.
+
+    .. note:: Due to buffering cross source writes are not guaranteed to
+              appear in wall-clock order. For instance in the example below
+              it is possible for the C-outputs to appear before the python
+              outputs in the log file.
+
+    Usage:
+
+    ::
+
+     # syntactic-sugar for redirect("stdout", "/tmp/stdout.log")
+     with redirect_stdout("/tmp/stdout.log"):
+        print("python stdouts are redirected")
+        libc = ctypes.CDLL("libc.so.6")
+        libc.printf(b"c stdouts are also redirected")
+        os.system("echo system stdouts are also redirected")
+
+     print("stdout restored")
+
+    """
+    if std not in _VALID_STD:
+        raise ValueError(
+            f"unknown standard stream <{std}>, must be one of {_VALID_STD}"
+        )
+
+    c_std = _c_std(std)
+    python_std = _python_std(std)
+    std_fd = python_std.fileno()
+
+    def _redirect(dst):
+        libc.fflush(c_std)
+        python_std.flush()
+        os.dup2(dst.fileno(), std_fd)
+
+    with os.fdopen(os.dup(std_fd)) as orig_std, open(to_file, mode="w+b") as dst:
+        _redirect(dst)
+        try:
+            yield
+        finally:
+            _redirect(orig_std)
+
+
+redirect_stdout = partial(redirect, "stdout")
+redirect_stderr = partial(redirect, "stderr")
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/subprocess_handler/__init__.py b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/subprocess_handler/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc1fe591bca2e753727c8365d2ea693d0f61d966
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/subprocess_handler/__init__.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+from torch.distributed.elastic.multiprocessing.subprocess_handler.handlers import (
+    get_subprocess_handler,
+)
+from torch.distributed.elastic.multiprocessing.subprocess_handler.subprocess_handler import (
+    SubprocessHandler,
+)
+
+__all__ = ["SubprocessHandler", "get_subprocess_handler"]
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/subprocess_handler/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/subprocess_handler/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..adbd2dbfcb52f031e36b0522e797c3149b8740f7
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/subprocess_handler/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/subprocess_handler/__pycache__/handlers.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/subprocess_handler/__pycache__/handlers.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..142591654ac2526c627c1e9af888334585ae2fca
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/subprocess_handler/__pycache__/handlers.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/subprocess_handler/__pycache__/subprocess_handler.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/subprocess_handler/__pycache__/subprocess_handler.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5d13a9e15b1482945b3172934085effdaa991e13
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/subprocess_handler/__pycache__/subprocess_handler.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/subprocess_handler/handlers.py b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/subprocess_handler/handlers.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0e372f45b88ebf680d9c9dc6b65739cf2a2ff88
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/subprocess_handler/handlers.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import Dict, Tuple
+
+from torch.distributed.elastic.multiprocessing.subprocess_handler.subprocess_handler import (
+    SubprocessHandler,
+)
+
+__all__ = ["get_subprocess_handler"]
+
+
+def get_subprocess_handler(
+    entrypoint: str,
+    args: Tuple,
+    env: Dict[str, str],
+    stdout: str,
+    stderr: str,
+    local_rank_id: int,
+):
+    return SubprocessHandler(
+        entrypoint=entrypoint,
+        args=args,
+        env=env,
+        stdout=stdout,
+        stderr=stderr,
+        local_rank_id=local_rank_id,
+    )
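``get_subprocess_handler`` is a thin factory over ``SubprocessHandler``; a sketch of calling it directly (the entrypoint, args, and env values below are illustrative, and the empty ``stdout``/``stderr`` strings mean the child inherits the parent's streams)::

    import sys

    from torch.distributed.elastic.multiprocessing.subprocess_handler import (
        get_subprocess_handler,
    )

    handler = get_subprocess_handler(
        entrypoint=sys.executable,
        args=("-c", "print('hello from a worker')"),
        env={"LOCAL_RANK": "0"},
        stdout="",  # falsy: no redirection, inherit the parent's stdout
        stderr="",
        local_rank_id=0,
    )
    handler.proc.wait()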
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/subprocess_handler/subprocess_handler.py b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/subprocess_handler/subprocess_handler.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbed2fc5d8594d8f742a60338aeb5d9647ca952e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/subprocess_handler/subprocess_handler.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import os
+import signal
+import subprocess
+import sys
+
+from typing import Any, Dict, Optional, Tuple
+
+__all__ = ["SubprocessHandler"]
+
+IS_WINDOWS = sys.platform == "win32"
+
+
+def _get_default_signal() -> signal.Signals:
+    """Get the default termination signal. SIGTERM for unix, CTRL_C_EVENT for windows."""
+    if IS_WINDOWS:
+        return signal.CTRL_C_EVENT  # type: ignore[attr-defined] # noqa: F821
+    else:
+        return signal.SIGTERM
+
+
+class SubprocessHandler:
+    """
+    Convenience wrapper around Python's ``subprocess.Popen``. Keeps track of
+    meta-objects associated with the process (e.g. stdout and stderr redirect fds).
+    """
+
+    def __init__(
+        self,
+        entrypoint: str,
+        args: Tuple,
+        env: Dict[str, str],
+        stdout: str,
+        stderr: str,
+        local_rank_id: int,
+    ):
+        self._stdout = open(stdout, "w") if stdout else None
+        self._stderr = open(stderr, "w") if stderr else None
+        # inherit parent environment vars
+        env_vars = os.environ.copy()
+        env_vars.update(env)
+
+        args_str = (entrypoint, *[str(e) for e in args])
+        self.local_rank_id = local_rank_id
+        self.proc: subprocess.Popen = self._popen(args_str, env_vars)
+
+    def _popen(self, args: Tuple, env: Dict[str, str]) -> subprocess.Popen:
+        kwargs: Dict[str, Any] = {}
+        if not IS_WINDOWS:
+            kwargs["start_new_session"] = True
+        return subprocess.Popen(
+            # pyre-fixme[6]: Expected `Union[typing.Sequence[Union[_PathLike[bytes],
+            #  _PathLike[str], bytes, str]], bytes, str]` for 1st param but got
+            #  `Tuple[str, *Tuple[Any, ...]]`.
+            args=args,
+            env=env,
+            stdout=self._stdout,
+            stderr=self._stderr,
+            **kwargs,
+        )
+
+    def close(self, death_sig: Optional[signal.Signals] = None) -> None:
+        if not death_sig:
+            death_sig = _get_default_signal()
+        if IS_WINDOWS:
+            self.proc.send_signal(death_sig)
+        else:
+            os.killpg(self.proc.pid, death_sig)
+        if self._stdout:
+            self._stdout.close()
+        if self._stderr:
+            self._stderr.close()
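On POSIX the child is started in its own session (``start_new_session=True``), so ``close()`` can signal the whole process group via ``os.killpg``. A sketch of terminating a long-running worker with a custom signal (the ``sleep`` entrypoint is just a stand-in)::

    import signal

    from torch.distributed.elastic.multiprocessing.subprocess_handler import SubprocessHandler

    handler = SubprocessHandler(
        entrypoint="sleep",
        args=("600",),
        env={},
        stdout="",  # falsy: inherit the parent's streams
        stderr="",
        local_rank_id=0,
    )
    handler.close(death_sig=signal.SIGINT)  # defaults to SIGTERM when omitted
    handler.proc.wait()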
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/tail_log.py b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/tail_log.py
new file mode 100644
index 0000000000000000000000000000000000000000..d63ba1feceff2d198ced71f2e5ec568455f1184d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/multiprocessing/tail_log.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import time
+from concurrent.futures._base import Future
+from concurrent.futures.thread import ThreadPoolExecutor
+from threading import Event
+from typing import Dict, List, Optional, TextIO
+
+__all__ = ["tail_logfile", "TailLog"]
+
+log = logging.getLogger(__name__)
+
+
+def tail_logfile(
+    header: str, file: str, dst: TextIO, finished: Event, interval_sec: float
+):
+
+    while not os.path.exists(file):
+        if finished.is_set():
+            return
+        time.sleep(interval_sec)
+
+    with open(file, errors="replace") as fp:
+        while True:
+            line = fp.readline()
+
+            if line:
+                dst.write(f"{header}{line}")
+            else:  # reached EOF
+                if finished.is_set():
+                    # log line producer is finished
+                    break
+                else:
+                    # log line producer is still going
+                    # wait for a bit before looping again
+                    time.sleep(interval_sec)
+
+
+class TailLog:
+    """
+    Tail the given log files.
+
+    The log files do not have to exist when the ``start()`` method is called. The tailer will gracefully wait until
+    the log files are created by the producer and will tail the contents of the
+    log files until the ``stop()`` method is called.
+
+    .. warning:: ``TailLog`` will wait indefinitely for the log file to be created!
+
+    Each log file's line will be prefixed with a header of the form: ``[{name}{idx}]:``,
+    where the ``name`` is user-provided and ``idx`` is the index of the log file
+    in the ``log_files`` mapping. ``log_line_prefixes`` can be used to override the
+    header for each log file.
+
+    Usage:
+
+    ::
+
+     log_files = {0: "/tmp/0_stdout.log", 1: "/tmp/1_stdout.log"}
+     tailer = TailLog("trainer", log_files, sys.stdout).start()
+     # actually run the trainers to produce 0_stdout.log and 1_stdout.log
+     run_trainers()
+     tailer.stop()
+
+     # once run_trainers() starts writing the ##_stdout.log files
+     # the tailer will print to sys.stdout:
+     # >>> [trainer0]:log_line1
+     # >>> [trainer1]:log_line1
+     # >>> [trainer0]:log_line2
+     # >>> [trainer0]:log_line3
+     # >>> [trainer1]:log_line2
+
+    .. note:: Due to buffering, log lines between files may not necessarily
+              be printed out in order. You should configure your application's
+              logger to include a proper timestamp in each log line.
+
+    """
+
+    def __init__(
+        self,
+        name: str,
+        log_files: Dict[int, str],
+        dst: TextIO,
+        log_line_prefixes: Optional[Dict[int, str]] = None,
+        interval_sec: float = 0.1,
+    ):
+        n = len(log_files)
+        self._threadpool = None
+        if n > 0:
+            self._threadpool = ThreadPoolExecutor(
+                max_workers=n,
+                thread_name_prefix=f"{self.__class__.__qualname__}_{name}",
+            )
+
+        self._name = name
+        self._dst = dst
+        self._log_files = log_files
+        self._log_line_prefixes = log_line_prefixes
+        self._finished_events: Dict[int, Event] = {
+            local_rank: Event() for local_rank in log_files.keys()
+        }
+        self._futs: List[Future] = []
+        self._interval_sec = interval_sec
+        self._stopped = False
+
+    def start(self) -> "TailLog":
+        if not self._threadpool:
+            return self
+
+        for local_rank, file in self._log_files.items():
+            header = f"[{self._name}{local_rank}]:"
+            if self._log_line_prefixes and local_rank in self._log_line_prefixes:
+                header = self._log_line_prefixes[local_rank]
+            self._futs.append(
+                self._threadpool.submit(
+                    tail_logfile,
+                    header=header,
+                    file=file,
+                    dst=self._dst,
+                    finished=self._finished_events[local_rank],
+                    interval_sec=self._interval_sec,
+                )
+            )
+        return self
+
+    def stop(self) -> None:
+        for finished in self._finished_events.values():
+            finished.set()
+
+        for local_rank, f in enumerate(self._futs):
+            try:
+                f.result()
+            except Exception as e:
+                log.error(
+                    "error in log tailor for %s%s. %s: %s",
+                    self._name, local_rank,
+                    e.__class__.__qualname__, e,
+                )
+
+        if self._threadpool:
+            self._threadpool.shutdown(wait=True)
+
+        self._stopped = True
+
+    def stopped(self) -> bool:
+        return self._stopped
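A sketch tying ``TailLog`` to log files produced by another process (the paths and the ``run_workers`` helper are hypothetical); each forwarded line is prefixed with ``[worker<idx>]:`` unless ``log_line_prefixes`` overrides it::

    import sys

    from torch.distributed.elastic.multiprocessing.tail_log import TailLog

    log_files = {0: "/tmp/worker_0.log", 1: "/tmp/worker_1.log"}
    tailer = TailLog(
        "worker",
        log_files,
        dst=sys.stdout,
        log_line_prefixes={1: "[rank1|host2]:"},  # optional per-rank header override
    ).start()
    try:
        run_workers()  # hypothetical: whatever writes the two log files
    finally:
        tailer.stop()  # signals the tailing threads and waits for them to drain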
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__init__.py b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fcff834a35c0c47a8e0eddc838aa92d93fa6d7cf
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__init__.py
@@ -0,0 +1,150 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+In the context of Torch Distributed Elastic we use the term *rendezvous* to
+refer to a particular functionality that combines a **distributed
+synchronization** primitive with **peer discovery**.
+
+It is used by Torch Distributed Elastic to gather participants of a training
+job (i.e. nodes) such that they all agree on the same list of participants and
+everyone's roles, as well as make a consistent collective decision on when
+training can begin/resume.
+
+Torch Distributed Elastic rendezvous provides the following critical
+functionalities:
+
+**Barrier**:
+
+Nodes performing rendezvous will all block until the rendezvous is considered
+complete - this happens when at least ``min`` total number of nodes have joined
+the rendezvous barrier (for the same job). This also implies the barrier is not
+necessarily of fixed size.
+
+There's an additional small waiting time after reaching ``min`` number of
+nodes - this is used to ensure the rendezvous is not completed "too quickly"
+(which could potentially exclude additional nodes attempting to join at
+approximately the same time).
+
+If ``max`` number of nodes is gathered at the barrier, the rendezvous is
+completed immediately.
+
+There's also an overall timeout which causes the rendezvous to fail if ``min``
+number of nodes is never reached - this is meant to be a simple fail-safe to
+help release partially allocated job resources, in case there's a problem with
+the resource manager, and is meant to be interpreted as non-retryable.
+
+**Exclusivity**:
+
+A simple distributed barrier would not be sufficient, as we also need to ensure
+that only one group of nodes exists at any given time (for a given job). In
+other words, new nodes (i.e. joining late) should not be able to form a parallel
+independent group of workers for the same job.
+
+Torch Distributed Elastic rendezvous ensures that if a group of nodes has
+already completed a rendezvous (and hence might already be training), then
+additional "late" nodes attempting to rendezvous will only announce themselves
+as waiting, and will have to wait until the (previously completed) existing
+rendezvous is destroyed first.
+
+**Consistency**:
+
+When a rendezvous is completed, all its members will agree on the job membership
+and everyone's role in it. This role is represented using an integer, called
+rank, that is between 0 and world size.
+
+Note that ranks are *not stable*, in the sense that the same node can be
+assigned a different rank in the next (re-)rendezvous.
+
+**Fault-tolerance**:
+
+Torch Distributed Elastic rendezvous is designed to tolerate node failures
+during the rendezvous process. Should a process crash (or lose network
+connectivity, etc), between joining the rendezvous and it being completed, then
+a re-rendezvous with remaining healthy nodes will happen automatically.
+
+A node can also fail *after* it has completed (or *has been observed* by other
+nodes to have completed) the rendezvous - this scenario will be handled by the
+Torch Distributed Elastic ``train_loop`` instead (where it will also trigger a
+re-rendezvous).
+
+**Shared key-value store**:
+
+When the rendezvous is completed, a shared key-value store is created and
+returned. This store implements a ``torch.distributed.Store`` API (see
+`distributed communication docs
+<https://pytorch.org/docs/stable/distributed.html>`__).
+
+This store is only shared by the members of the completed rendezvous. It
+is intended to be used by Torch Distributed Elastic to exchange information
+necessary to initialize job control and data-planes.
+
+**Waiting workers and rendezvous closing**:
+
+Torch Distributed Elastic rendezvous handler object provides additional
+functionalities, which are technically not part of the rendezvous process:
+
+1. Querying how many workers arrived late at the barrier, who can participate in
+   the *next* rendezvous.
+
+2. Setting the rendezvous *closed* to signal all nodes not to participate in the
+   next rendezvous.
+
+**DynamicRendezvousHandler**:
+
+Torch Distributed Elastic comes with the :py:class:`.DynamicRendezvousHandler`
+class that implements the rendezvous mechanism described above. It is a backend-
+agnostic type that expects a particular :py:class:`.RendezvousBackend` instance
+to be specified during construction.
+
+Torch distributed users can either implement their own backend type or use one
+of the following implementations that come with PyTorch:
+
+- :py:class:`.C10dRendezvousBackend`: Uses a C10d store (by default
+  ``TCPStore``) as the rendezvous backend. The main advantage of using a C10d
+  store is that it requires no 3rd-party dependency (such as etcd) to establish
+  a rendezvous.
+- :py:class:`.EtcdRendezvousBackend`: Supersedes the legacy
+  :py:class:`.EtcdRendezvousHandler` class. Passing an
+  :py:class:`.EtcdRendezvousBackend` instance to
+  :py:class:`.DynamicRendezvousHandler` is functionally equivalent to
+  instantiating an :py:class:`.EtcdRendezvousHandler`.
+
+  ::
+
+     store = TCPStore("localhost")
+
+     backend = C10dRendezvousBackend(store, "my_run_id")
+
+     rdzv_handler = DynamicRendezvousHandler.from_backend(
+         run_id="my_run_id",
+         store=store,
+         backend=backend,
+         min_nodes=2,
+         max_nodes=4
+     )
+"""
+
+from .api import *  # noqa: F403
+from .registry import _register_default_handlers
+
+
+_register_default_handlers()
+
+
+__all__ = [
+    "RendezvousClosedError",
+    "RendezvousConnectionError",
+    "RendezvousError",
+    "RendezvousGracefulExitError",
+    "RendezvousHandler",
+    "RendezvousHandlerCreator",
+    "RendezvousHandlerRegistry",
+    "RendezvousParameters",
+    "RendezvousStateError",
+    "RendezvousTimeoutError",
+    "rendezvous_handler_registry",
+]
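A sketch of the barrier and waiting-workers flow described above, from the point of view of a single node (``rdzv_handler`` stands for any ``RendezvousHandler``, e.g. one built as in the ``DynamicRendezvousHandler`` example, and ``train`` is a hypothetical training loop)::

    store, rank, world_size = rdzv_handler.next_rendezvous()  # blocks at the barrier
    try:
        train(store, rank, world_size)
        if rdzv_handler.num_nodes_waiting() > 0:
            # late arrivals are waiting; an elastic agent would re-rendezvous here
            ...
    finally:
        rdzv_handler.shutdown()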
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6c38ff9c80f6815749a82f757806c97022148796
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/api.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/api.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4bbbee62d492c49e3d0db4282cc9f1fa6d844912
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/api.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/c10d_rendezvous_backend.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/c10d_rendezvous_backend.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..df2f1e8bcd5dac29b9c0b83ccf6770f7f943a1fc
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/c10d_rendezvous_backend.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/dynamic_rendezvous.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/dynamic_rendezvous.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fe2a5590c7826b730746982691ffde2033874584
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/dynamic_rendezvous.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/etcd_rendezvous.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/etcd_rendezvous.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..797c21813f37b94d6201953f909ab97b56427cb1
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/etcd_rendezvous.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/etcd_rendezvous_backend.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/etcd_rendezvous_backend.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..379e4426ab1842e4ea0e7dcb84982e24b3a12edd
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/etcd_rendezvous_backend.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/etcd_server.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/etcd_server.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d669a13ac5d7e515b3225dd7b0abc0fb126700c9
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/etcd_server.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/etcd_store.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/etcd_store.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8b098351761f5558c848bb16e05bdd0dccf9b7f5
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/etcd_store.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/registry.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/registry.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..73abe27b28848b47847b9e8bc94682a75778b520
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/registry.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/static_tcp_rendezvous.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/static_tcp_rendezvous.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9029d442d2e86d2526d7ce69ba1865c2bea17688
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/static_tcp_rendezvous.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a411d6423b7fb6c252226474312bde21560ebd4e
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/__pycache__/utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/api.py b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbfbae82a851491184916cb1a1fc8398bfd97f0b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/api.py
@@ -0,0 +1,277 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from abc import ABC, abstractmethod
+from typing import Any, Callable, Dict, Optional, Tuple
+
+from torch.distributed import Store
+
+
+class RendezvousError(Exception):
+    """Represents the base type for rendezvous errors."""
+
+
+class RendezvousClosedError(RendezvousError):
+    """Raised when a rendezvous is closed."""
+
+
+class RendezvousTimeoutError(RendezvousError):
+    """Raised when a rendezvous did not complete on time."""
+
+
+class RendezvousConnectionError(RendezvousError):
+    """Raised when the connection to a rendezvous backend has failed."""
+
+
+class RendezvousStateError(RendezvousError):
+    """Raised when the state of a rendezvous is corrupt."""
+
+class RendezvousGracefulExitError(RendezvousError):
+    """Raised when node wasn't not included in rendezvous and gracefully exits.
+
+    Exception is a mechanism to exit the stack, however does not mean a failure.
+    """
+
+class RendezvousHandler(ABC):
+    """Main rendezvous interface.
+
+    Note:
+        Distributed Torch users normally **do not** need to implement their own
+        ``RendezvousHandler``. An implementation based on C10d Store is already
+        provided, and is recommended for most users.
+    """
+
+    @abstractmethod
+    def get_backend(self) -> str:
+        """Return the name of the rendezvous backend."""
+
+    @abstractmethod
+    def next_rendezvous(
+        self,
+    ) -> Tuple[Store, int, int]:
+        """Main entry-point into the rendezvous barrier.
+
+        Blocks until the rendezvous is complete and the current process is
+        included in the formed worker group, or a timeout occurs, or the
+        rendezvous was marked closed.
+
+        Returns:
+            A tuple of :py:class:`torch.distributed.Store`, ``rank``, and
+            ``world size``.
+
+        Raises:
+            RendezvousClosedError:
+                The rendezvous is closed.
+            RendezvousConnectionError:
+                The connection to the rendezvous backend has failed.
+            RendezvousStateError:
+                The rendezvous state is corrupt.
+            RendezvousTimeoutError:
+                The rendezvous did not complete on time.
+        """
+
+    @abstractmethod
+    def is_closed(self) -> bool:
+        """Check whether the rendezvous has been closed.
+
+        A closed rendezvous means all future attempts to re-rendezvous within
+        the same job will fail.
+
+        ``is_closed()`` and :py:meth:`set_closed` have semantics of eventual
+        propagation and should not be used for synchronization. The intention is
+        that if at least one node decides the job is finished, it will close the
+        rendezvous, and other nodes will soon observe this and stop running as
+        well.
+        """
+
+    @abstractmethod
+    def set_closed(self):
+        """Mark the rendezvous as closed."""
+
+    @abstractmethod
+    def num_nodes_waiting(self) -> int:
+        """Return the number of nodes who arrived late at the rendezvous
+        barrier, hence were not included in the current worker group.
+
+        Callers should periodically call this method to check whether new
+        nodes are waiting to join the job and if so admit them by calling
+        :py:meth:`next_rendezvous()` (re-rendezvous).
+        """
+
+    @abstractmethod
+    def get_run_id(self) -> str:
+        """Return the run id of the rendezvous.
+
+        The run id is a user-defined id that uniquely identifies an instance of
+        a distributed application. It typically maps to a job id and is used to
+        allow nodes to join the correct distributed application.
+        """
+
+    @abstractmethod
+    def shutdown(self) -> bool:
+        """Close all resources that were open for the rendezvous.
+
+        Example::
+
+            rdzv_handler = ...
+            try:
+                store, rank, world_size = rdzv_handler.next_rendezvous()
+            finally:
+                rdzv_handler.shutdown()
+        """
+
+
+class RendezvousParameters:
+    """Hold the parameters to construct a :py:class:`RendezvousHandler`.
+
+    Args:
+        backend:
+            The name of the backend to use to handle the rendezvous.
+        endpoint:
+            The endpoint of the rendezvous, usually in the form <hostname>[:<port>].
+        run_id:
+            The id of the rendezvous.
+        min_nodes:
+            The minimum number of nodes to admit to the rendezvous.
+        max_nodes:
+            The maximum number of nodes to admit to the rendezvous.
+        local_addr:
+            The address of the local node.
+        **kwargs:
+            Additional parameters for the specified backend.
+    """
+
+    def __init__(
+        self,
+        backend: str,
+        endpoint: str,
+        run_id: str,
+        min_nodes: int,
+        max_nodes: int,
+        local_addr: Optional[str] = None,
+        **kwargs,
+    ):
+        if not backend:
+            raise ValueError("The rendezvous backend name must be a non-empty string.")
+
+        if min_nodes < 1:
+            raise ValueError(
+                f"The minimum number of rendezvous nodes ({min_nodes}) must be greater than zero."
+            )
+        if max_nodes < min_nodes:
+            raise ValueError(
+                f"The maximum number of rendezvous nodes ({max_nodes}) must be greater than or "
+                f"equal to the minimum number of rendezvous nodes ({min_nodes})."
+            )
+
+        self.backend = backend
+        self.endpoint = endpoint
+        self.run_id = run_id
+        self.min_nodes = min_nodes
+        self.max_nodes = max_nodes
+        self.config = kwargs
+        self.local_addr = local_addr
+
+    def get(self, key: str, default: Any = None) -> Any:
+        """Return the value for ``key`` if ``key`` exists, else ``default``."""
+        return self.config.get(key, default)
+
+    def get_as_bool(self, key: str, default: Optional[bool] = None) -> Optional[bool]:
+        """Return the value for ``key`` as a ``bool``."""
+        value = self.get(key, default)
+        if value is None or isinstance(value, bool):
+            return value
+        if isinstance(value, int):
+            if value == 1:
+                return True
+            if value == 0:
+                return False
+        elif isinstance(value, str):
+            if value.lower() in ["1", "true", "t", "yes", "y"]:
+                return True
+            if value.lower() in ["0", "false", "f", "no", "n"]:
+                return False
+        raise ValueError(
+            f"The rendezvous configuration option '{key}' does not represent a valid boolean value."
+        )
+
+    def get_as_int(self, key: str, default: Optional[int] = None) -> Optional[int]:
+        """Return the value for ``key`` as an ``int``."""
+        value = self.get(key, default)
+        if value is None:
+            return value
+        try:
+            return int(value)
+        except ValueError as e:
+            raise ValueError(
+                f"The rendezvous configuration option '{key}' does not represent a valid integer "
+                "value."
+            ) from e
+
+
+RendezvousHandlerCreator = Callable[[RendezvousParameters], RendezvousHandler]
+
+
+class RendezvousHandlerRegistry:
+    """Represent a registry of :py:class:`RendezvousHandler` backends."""
+
+    _registry: Dict[str, RendezvousHandlerCreator]
+
+    def __init__(self) -> None:
+        self._registry = {}
+
+    def register(self, backend: str, creator: RendezvousHandlerCreator) -> None:
+        """Register a new rendezvous backend.
+
+        Args:
+            backend:
+                The name of the backend.
+            creator:
+                The callback to invoke to construct the
+                :py:class:`RendezvousHandler`.
+        """
+        if not backend:
+            raise ValueError("The rendezvous backend name must be a non-empty string.")
+
+        current_creator: Optional[RendezvousHandlerCreator]
+        try:
+            current_creator = self._registry[backend]
+        except KeyError:
+            current_creator = None
+
+        if current_creator is not None and current_creator != creator:
+            raise ValueError(
+                f"The rendezvous backend '{backend}' cannot be registered with '{creator}' as it "
+                f"is already registered with '{current_creator}'."
+            )
+
+        self._registry[backend] = creator
+
+    def create_handler(self, params: RendezvousParameters) -> RendezvousHandler:
+        """Create a new :py:class:`RendezvousHandler`."""
+        try:
+            creator = self._registry[params.backend]
+        except KeyError as e:
+            raise ValueError(
+                f"The rendezvous backend '{params.backend}' is not registered. Did you forget "
+                f"to call `{self.register.__name__}`?"
+            ) from e
+
+        handler = creator(params)
+
+        # Do a sanity check.
+        if handler.get_backend() != params.backend:
+            raise RuntimeError(
+                f"The rendezvous backend '{handler.get_backend()}' does not match the requested "
+                f"backend '{params.backend}'."
+            )
+
+        return handler
+
+
+# The default global registry instance used by launcher scripts to instantiate
+# rendezvous handlers.
+rendezvous_handler_registry = RendezvousHandlerRegistry()
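A sketch of wiring a custom backend into the default registry (``MyRendezvousHandler`` is a placeholder; a real implementation must return ``"my-backend"`` from ``get_backend()`` to pass the sanity check in ``create_handler``)::

    from torch.distributed.elastic.rendezvous import (
        RendezvousParameters,
        rendezvous_handler_registry,
    )

    def _create_my_handler(params: RendezvousParameters) -> "MyRendezvousHandler":
        return MyRendezvousHandler(params)  # placeholder creator

    rendezvous_handler_registry.register("my-backend", _create_my_handler)

    params = RendezvousParameters(
        backend="my-backend",
        endpoint="localhost:29400",
        run_id="job-42",
        min_nodes=1,
        max_nodes=2,
    )
    handler = rendezvous_handler_registry.create_handler(params)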
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..b547a1ead07275f564c2db71c9b99bf13791f2eb
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py
@@ -0,0 +1,269 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import binascii
+import logging
+import os
+import tempfile
+from base64 import b64decode, b64encode
+from datetime import timedelta
+from typing import Any, Optional, Tuple, cast
+
+from torch.distributed import FileStore, Store, TCPStore
+from torch.distributed.elastic.events import (
+    NodeState,
+    construct_and_record_rdzv_event,
+)
+
+from .api import (
+    RendezvousConnectionError,
+    RendezvousError,
+    RendezvousParameters,
+    RendezvousStateError,
+)
+from .dynamic_rendezvous import RendezvousBackend, Token
+from .utils import _matches_machine_hostname, parse_rendezvous_endpoint
+
+log = logging.getLogger(__name__)
+
+
+class C10dRendezvousBackend(RendezvousBackend):
+    """Represents a C10d-backed rendezvous backend.
+
+    Args:
+        store:
+            The :py:class:`torch.distributed.Store` instance to use to
+            communicate with the C10d store.
+        run_id:
+            The run id of the rendezvous.
+    """
+
+    # See the explanation in the __init__ method.
+    _NULL_SENTINEL = "Y2FuaW1hZGFt"
+
+    _store: Store
+    _key: str
+
+    def __init__(self, store: Store, run_id: str) -> None:
+        if not run_id:
+            raise ValueError("The run id must be a non-empty string.")
+
+        self._store = store
+
+        self._key = "torch.rendezvous." + run_id
+
+        # The read operation of a store blocks the caller until the specified
+        # key becomes available. This behavior makes it tricky to use a store
+        # as a regular key-value dictionary.
+        #
+        # As a workaround we initially set a sentinel value as the rendezvous
+        # state. Whenever this value gets returned we treat it as a None.
+        self._call_store("compare_set", self._key, "", self._NULL_SENTINEL)
+
+    @property
+    def name(self) -> str:
+        """See base class."""
+        return "c10d"
+
+    def get_state(self) -> Optional[Tuple[bytes, Token]]:
+        """See base class."""
+        base64_state: bytes = self._call_store("get", self._key)
+
+        return self._decode_state(base64_state)
+
+    def set_state(
+        self, state: bytes, token: Optional[Token] = None
+    ) -> Optional[Tuple[bytes, Token, bool]]:
+        """See base class."""
+        base64_state_str: str = b64encode(state).decode()
+
+        if token:
+            # Shortcut if we know for sure that the token is not valid.
+            if not isinstance(token, bytes):
+                result = self.get_state()
+                if result is not None:
+                    tmp = *result, False
+                    # Python 3.6 does not support tuple unpacking in return
+                    # statements.
+                    return tmp
+                return None
+
+            token = token.decode()
+        else:
+            token = self._NULL_SENTINEL
+
+        base64_state: bytes = self._call_store("compare_set", self._key, token, base64_state_str)
+
+        state_token_pair = self._decode_state(base64_state)
+        if state_token_pair is None:
+            return None
+
+        new_state, new_token = state_token_pair
+
+        # C10d Store's compare_set method does not offer an easy way to find out
+        # whether our write attempt was successful. As a brute-force solution we
+        # perform a byte-for-byte comparison of our local state and the remote state.
+        return new_state, new_token, new_state == state
+
+    def _call_store(self, store_op: str, *args, **kwargs) -> Any:
+        try:
+            return getattr(self._store, store_op)(*args, **kwargs)
+        except (ValueError, RuntimeError, TimeoutError) as exc:
+            raise RendezvousConnectionError(
+                "The connection to the C10d store has failed. See inner exception for details."
+            ) from exc
+
+    def _decode_state(self, base64_state: bytes) -> Optional[Tuple[bytes, Token]]:
+        if base64_state == self._NULL_SENTINEL.encode():
+            return None
+
+        try:
+            state = b64decode(base64_state)
+        except binascii.Error as exc:
+            raise RendezvousStateError(
+                "The state object is corrupt. See inner exception for details."
+            ) from exc
+
+        return state, base64_state
+
+
+def _create_tcp_store(params: RendezvousParameters) -> TCPStore:
+    host, port = parse_rendezvous_endpoint(params.endpoint, default_port=29400)
+
+    cfg_is_host = params.get_as_bool("is_host")
+    # If the user has explicitly specified whether our process should host
+    # the store, respect it.
+    if cfg_is_host is not None:
+        is_host = cfg_is_host
+    # Otherwise try to determine whether we are the host based on our hostname
+    # and IP address.
+    else:
+        is_host = _matches_machine_hostname(host)
+
+    use_libuv = params.get_as_bool("use_libuv", False)
+
+    # The read timeout, in seconds, for TCP store operations.
+    read_timeout = cast(int, params.get_as_int("read_timeout", 60))
+    if read_timeout <= 0:
+        raise ValueError("The read timeout must be a positive integer.")
+
+    # In specific cases we attempt to instantiate the store twice. For details
+    # see the explanation in the except clause below.
+    for is_server in [is_host, False]:
+        try:
+            store = TCPStore(
+                host,
+                port,
+                is_master=is_server,
+                timeout=timedelta(seconds=read_timeout),
+                use_libuv=use_libuv,
+            )
+
+            if is_server:
+                msg = f"Process {os.getpid()} hosts the TCP store for the C10d rendezvous backend."
+                construct_and_record_rdzv_event(
+                    run_id=params.run_id, message=msg, node_state=NodeState.INIT
+                )
+                log.info(msg)
+
+            break
+        except (ValueError, RuntimeError, TimeoutError) as exc:
+            # If we heuristically inferred the value of is_host as True and our
+            # first attempt to instantiate the TCP store has failed, try it one
+            # more time with is_host set to False. As an edge case there can be
+            # more than one process that is part of the same rendezvous on this
+            # machine and only one of them will eventually host the store.
+
+            if not is_server or cfg_is_host is not None:
+                raise RendezvousConnectionError(
+                    "The connection to the C10d store has failed. See inner exception for details."
+                ) from exc
+
+    return store  # type: ignore[possibly-undefined]
+
+
+def _create_file_store(params: RendezvousParameters) -> FileStore:
+    # If a user specifies an endpoint, we treat it as a path to a file.
+    if params.endpoint:
+        path = params.endpoint
+    else:
+        try:
+            # The temporary file is readable and writable only by the user of
+            # this process.
+            _, path = tempfile.mkstemp()
+        except OSError as exc:
+            raise RendezvousError(
+                "The file creation for C10d store has failed. See inner exception for details."
+            ) from exc
+
+    try:
+        store = FileStore(path)
+    except (ValueError, RuntimeError) as exc:
+        raise RendezvousConnectionError(
+            "The connection to the C10d store has failed. See inner exception for details."
+        ) from exc
+
+    return store
+
+
+def create_backend(params: RendezvousParameters) -> Tuple[C10dRendezvousBackend, Store]:
+    """Create a new :py:class:`C10dRendezvousBackend` from the specified parameters.
+
+    +--------------+-----------------------------------------------------------+
+    | Parameter    | Description                                               |
+    +==============+===========================================================+
+    | store_type   | The type of the C10d store. The currently supported types |
+    |              | are "tcp" and "file" which correspond to                  |
+    |              | :py:class:`torch.distributed.TCPStore` and                |
+    |              | :py:class:`torch.distributed.FileStore`, respectively.    |
+    |              | Defaults to "tcp".                                        |
+    +--------------+-----------------------------------------------------------+
+    | read_timeout | The read timeout, in seconds, for store operations.       |
+    |              | Defaults to 60 seconds.                                   |
+    |              |                                                           |
+    |              | Note this only applies to                                 |
+    |              | :py:class:`torch.distributed.TCPStore`. It is not relevant|
+    |              | to :py:class:`torch.distributed.FileStore` which does not |
+    |              | take in timeout as a parameter.                           |
+    +--------------+-----------------------------------------------------------+
+    | is_host      | A boolean value indicating whether this backend instance  |
+    |              | will host the C10d store. If not specified it will be     |
+    |              | inferred heuristically by matching the hostname or the IP |
+    |              | address of this machine against the specified rendezvous  |
+    |              | endpoint. Defaults to ``None``.                           |
+    |              |                                                           |
+    |              | Note that this configuration option only applies to       |
+    |              | :py:class:`torch.distributed.TCPStore`. In normal         |
+    |              | circumstances you can safely skip it; the only time when  |
+    |              | it is needed is if its value cannot be correctly          |
+    |              | determined (e.g. the rendezvous endpoint has a CNAME as   |
+    |              | the hostname or does not match the FQDN of the machine).  |
+    +--------------+-----------------------------------------------------------+
+    """
+    # As of today we only support TCPStore and FileStore. Other store types do
+    # not have the required functionality (e.g. compare_set) yet.
+    store_type = params.get("store_type", "tcp").strip().lower()
+    store: Store
+
+    try:
+        if store_type == "file":
+            store = _create_file_store(params)
+        elif store_type == "tcp":
+            store = _create_tcp_store(params)
+        else:
+            raise ValueError("Invalid store type given. Currently only supports file and tcp.")
+
+        backend = C10dRendezvousBackend(store, params.run_id)
+
+    except Exception as e:
+        construct_and_record_rdzv_event(
+            message=f"{type(e).__name__}: {str(e)}",
+            run_id=params.run_id,
+            node_state=NodeState.FAILED,
+        )
+        raise
+
+    return backend, store
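A sketch of building the C10d backend from ``RendezvousParameters`` using the configuration keys documented in ``create_backend`` (the endpoint and run id are illustrative)::

    from torch.distributed.elastic.rendezvous.api import RendezvousParameters
    from torch.distributed.elastic.rendezvous.c10d_rendezvous_backend import create_backend

    params = RendezvousParameters(
        backend="c10d",
        endpoint="localhost:29400",
        run_id="job-42",
        min_nodes=1,
        max_nodes=2,
        store_type="tcp",  # or "file"
        read_timeout=60,   # seconds; only relevant for the TCP store
        is_host=True,      # skip the hostname-matching heuristic
    )
    backend, store = create_backend(params)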
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py
new file mode 100644
index 0000000000000000000000000000000000000000..88d649141d190c231364754c34494bf9e2bee47e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py
@@ -0,0 +1,1343 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import inspect
+import logging
+import os
+import pickle
+import socket
+import threading
+import time
+import weakref
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from datetime import datetime, timedelta
+from enum import Enum
+from typing import Any, Callable, cast, Dict, List, Optional, Set, Tuple
+
+from torch.distributed import PrefixStore, Store
+from torch.distributed.elastic.events import construct_and_record_rdzv_event, NodeState
+
+from .api import (
+    RendezvousClosedError,
+    RendezvousError,
+    RendezvousGracefulExitError,
+    RendezvousHandler,
+    RendezvousParameters,
+    RendezvousStateError,
+    RendezvousTimeoutError,
+)
+from .utils import _delay, _PeriodicTimer
+
+__all__ = ['RendezvousBackend', 'RendezvousTimeout', 'RendezvousSettings', 'DynamicRendezvousHandler', 'create_handler']
+
+log = logging.getLogger(__name__)
+
+
+def get_method_name(depth=2):
+    if len(inspect.stack()) > depth:
+        return inspect.stack()[depth].function
+    return "no_method_name"
+
+
+Token = Any
+"""Represent an opaque fencing token used by the rendezvous backend."""
+
+class RendezvousBackend(ABC):
+    """Represent a backend that holds the rendezvous state."""
+
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Get the name of the backend."""
+
+    @abstractmethod
+    def get_state(self) -> Optional[Tuple[bytes, Token]]:
+        """Get the rendezvous state.
+
+        Returns:
+            A tuple of the encoded rendezvous state and its fencing token or
+            ``None`` if no state is found in the backend.
+
+        Raises:
+            RendezvousConnectionError:
+                The connection to the backend has failed.
+            RendezvousStateError:
+                The rendezvous state is corrupt.
+        """
+
+    @abstractmethod
+    def set_state(
+        self, state: bytes, token: Optional[Token] = None
+    ) -> Optional[Tuple[bytes, Token, bool]]:
+        """Set the rendezvous state.
+
+        The new rendezvous state is set conditionally:
+
+          - If the specified ``token`` matches the fencing token stored in the
+            backend, the state will be updated. The new state will be returned
+            to the caller along with its fencing token.
+          - If the specified ``token`` does not match the fencing token stored
+            in the backend, the state won't be updated; instead the existing
+            state along with its fencing token will be returned to the caller.
+          - If the specified ``token`` is ``None``, the new state will be set
+            only if there is no existing state in the backend. Either the new
+            state or the existing state along with its fencing token will be
+            returned to the caller.
+
+        Args:
+            state:
+                The encoded rendezvous state.
+            token:
+                An optional fencing token that was retrieved by a previous call
+                to :py:meth:`get_state` or ``set_state()``.
+
+        Returns:
+            A tuple of the serialized rendezvous state, its fencing token, and
+            a boolean value indicating whether our set attempt succeeded.
+
+        Raises:
+            RendezvousConnectionError:
+                The connection to the backend has failed.
+            RendezvousStateError:
+                The rendezvous state is corrupt.
+        """
+
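The conditional update above behaves like an optimistic compare-and-set on the shared state. A minimal sketch of a single attempt against any ``RendezvousBackend`` (the helper name is made up)::

    def try_replace_state(backend: RendezvousBackend, new_state: bytes) -> bool:
        """Attempt one fenced write; return True only if our write won."""
        current = backend.get_state()
        token = current[1] if current is not None else None
        response = backend.set_state(new_state, token)
        return response is not None and response[2]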
+
+class RendezvousTimeout:
+    """Hold the timeout configuration of a rendezvous.
+
+    Args:
+        join:
+            The time within which the rendezvous is expected to complete.
+        last_call:
+            An additional wait amount before completing the rendezvous once the
+            rendezvous has the minimum number of required participants.
+        close:
+            The time within which the rendezvous is expected to close after a
+            call to :py:meth:`RendezvousHandler.set_closed` or
+            :py:meth:`RendezvousHandler.shutdown`.
+        heartbeat:
+            The time within which a keep-alive heartbeat is expected to
+            complete.
+    """
+
+    _ZERO = timedelta(0)
+
+    _DEFAULT_TIMEOUTS = {
+        "join": timedelta(seconds=600),
+        "last_call": timedelta(seconds=30),
+        "close": timedelta(seconds=30),
+        "heartbeat": timedelta(seconds=5),
+    }
+
+    _join: timedelta
+    _last_call: timedelta
+    _close: timedelta
+    _heartbeat: timedelta
+
+    def __init__(
+        self,
+        join: Optional[timedelta] = None,
+        last_call: Optional[timedelta] = None,
+        close: Optional[timedelta] = None,
+        heartbeat: Optional[timedelta] = None,
+    ) -> None:
+        self._set_timeouts(join=join, last_call=last_call, close=close, heartbeat=heartbeat)
+
+    @property
+    def join(self) -> timedelta:
+        """Get the join timeout."""
+        return self._join
+
+    @property
+    def last_call(self) -> timedelta:
+        """Get the last call timeout."""
+        return self._last_call
+
+    @property
+    def close(self) -> timedelta:
+        """Get the close timeout."""
+        return self._close
+
+    @property
+    def heartbeat(self) -> timedelta:
+        """Get the keep-alive heartbeat timeout."""
+        return self._heartbeat
+
+    def _set_timeouts(self, **timeouts: Optional[timedelta]):
+        for name, timeout in timeouts.items():
+            if timeout is None:
+                timeout = self._DEFAULT_TIMEOUTS[name]
+            if timeout <= self._ZERO:
+                raise ValueError(f"The {name} timeout ({timeout}) must be positive.")
+            setattr(self, "_" + name, timeout)
+
+
+@dataclass(repr=False, eq=False, frozen=True)
+class RendezvousSettings:
+    """Hold the settings of the rendezvous.
+
+    Attributes:
+        run_id:
+            The run id of the rendezvous.
+        min_nodes:
+            The minimum number of nodes to admit to the rendezvous.
+        max_nodes:
+            The maximum number of nodes to admit to the rendezvous.
+        timeout:
+            The timeout configuration of the rendezvous.
+        keep_alive_interval:
+            The amount of time a node waits before sending a heartbeat to keep
+            it alive in the rendezvous.
+        keep_alive_max_attempt:
+            The maximum number of failed heartbeat attempts after which a node
+            is considered dead.
+    """
+
+    run_id: str
+    min_nodes: int
+    max_nodes: int
+    timeout: RendezvousTimeout
+    keep_alive_interval: timedelta
+    keep_alive_max_attempt: int
+
+
+@dataclass(eq=True, order=True, frozen=True)
+class _NodeDesc:
+    """Describe a node in the rendezvous.
+
+    Attributes:
+        addr:
+            The FQDN of the node or user specified local node address.
+        pid:
+            The id of the process in which the rendezvous handler runs.
+        local_id:
+            A process-wide unique id.
+    """
+
+    addr: str
+    pid: int
+    local_id: int
+
+    def __repr__(self) -> str:
+        return f"{self.addr}_{self.pid}_{self.local_id}"
+
+
+class _NodeDescGenerator:
+    """Generate node descriptors.
+
+    A node descriptor is a combination of an FQDN, a process id, and an auto-
+    incremented integer that uniquely identifies a node in the rendezvous.
+    """
+
+    _lock: threading.Lock
+    _local_id: int
+
+    def __init__(self) -> None:
+        self._lock = threading.Lock()
+
+        # An integer that is incremented with each call to generate().
+        self._local_id = 0
+
+    def generate(self, local_addr: Optional[str] = None) -> _NodeDesc:
+        # This method can be called by multiple threads concurrently; therefore,
+        # we must increment the integer atomically.
+        with self._lock:
+            local_id = self._local_id
+
+            self._local_id += 1
+
+        return _NodeDesc(local_addr or socket.getfqdn(), os.getpid(), local_id)
+
+
+class _RendezvousState:
+    """Hold the state of a rendezvous.
+
+    Attributes:
+        round:
+            The current round of the rendezvous.
+        complete:
+            A boolean value indicating whether the current round of the
+            rendezvous is complete.
+        deadline:
+            The time at which the current round of the rendezvous will be
+            considered complete if it is still waiting for nodes to join.
+        closed:
+            A boolean value indicating whether the rendezvous is closed.
+        participants:
+            A dictionary of the participants and their corresponding ranks.
+        wait_list:
+            A set of nodes that are waiting to participate in the next round of
+            the rendezvous.
+        redundancy_list:
+            A set of nodes that are redundant in the current round and can join
+            the next rendezvous without triggering re-rendezvous.
+        last_heartbeats:
+            A dictionary containing each node's last heartbeat time.
+    """
+
+    round: int
+    complete: bool
+    deadline: Optional[datetime]
+    closed: bool
+    participants: Dict[_NodeDesc, int]
+    wait_list: Set[_NodeDesc]
+    redundancy_list: Set[_NodeDesc]
+    last_heartbeats: Dict[_NodeDesc, datetime]
+
+    def __init__(self) -> None:
+        self.round = 0
+        self.complete = False
+        self.deadline = None
+        self.closed = False
+        self.participants = {}
+        self.wait_list = set()
+        self.redundancy_list = set()
+        self.last_heartbeats = {}
+
+
+def _remove_participant_epilogue(state: _RendezvousState, settings: RendezvousSettings) -> None:
+    if state.complete:
+        # If we do not have any participants left, move to the next round.
+        if not state.participants:
+            msg = "No participants left in the rendezvous, marking rendezvous as incomplete"
+            log.debug(msg)
+            state.complete = False
+
+            state.round += 1
+    else:
+        if len(state.participants) < settings.min_nodes:
+            msg = (
+                f"Number of participants {len(state.participants)}) less than"
+                f"min_nodes {settings.min_nodes}, clearning deadline in state"
+            )
+            log.debug(msg)
+            state.deadline = None
+
+
+class _RendezvousStateHolder(ABC):
+    """Hold the shared rendezvous state synced with other nodes."""
+
+    @property
+    @abstractmethod
+    def state(self) -> _RendezvousState:
+        """Get the local state."""
+
+    @abstractmethod
+    def sync(self) -> Optional[bool]:
+        """Read or writes the latest state.
+
+        Returns:
+            A boolean value indicating whether the local state, in case marked
+            as dirty, was successfully synced with other nodes.
+        """
+
+    @abstractmethod
+    def mark_dirty(self) -> None:
+        """Mark the local state as dirty."""
+
+
+class _BackendRendezvousStateHolder(_RendezvousStateHolder):
+    """Hold the rendezvous state synced with other nodes via a backend.
+
+    Args:
+        backend:
+            The rendezvous backend to use.
+        settings:
+            The rendezvous settings.
+        cache_duration:
+            The amount of time, in seconds, to cache the last rendezvous state
+            before requesting it from the backend again.
+    """
+
+    _backend: RendezvousBackend
+    _state: _RendezvousState
+    _settings: RendezvousSettings
+    _cache_duration: int
+    _token: Token
+    _dirty: bool
+    _last_sync_time: float
+    _dead_nodes: List[_NodeDesc]
+
+    def __init__(
+        self,
+        backend: RendezvousBackend,
+        settings: RendezvousSettings,
+        cache_duration: int = 1,
+    ) -> None:
+        self._backend = backend
+        self._state = _RendezvousState()
+        self._settings = settings
+        self._cache_duration = cache_duration
+        self._token = None
+        self._dirty = False
+        self._last_sync_time = -1
+        self._dead_nodes = []
+
+    def _record(self, message: str, node_state: NodeState = NodeState.RUNNING):
+        construct_and_record_rdzv_event(
+            name=f"{self.__class__.__name__}.{get_method_name()}",
+            run_id=self._settings.run_id,
+            message=message,
+            node_state=node_state,
+        )
+
+    @property
+    def state(self) -> _RendezvousState:
+        """See base class."""
+        return self._state
+
+    def sync(self) -> Optional[bool]:
+        """See base class."""
+        state_bits: Optional[bytes] = None
+
+        token = None
+
+        has_set: Optional[bool]
+
+        if self._dirty:
+            has_set = False
+
+            state_bits = pickle.dumps(self._state)
+
+            set_response = self._backend.set_state(state_bits, self._token)
+            if set_response is not None:
+                state_bits, token, has_set = set_response
+        else:
+            has_set = None
+
+            if self._cache_duration > 0:
+                # Avoid overloading the backend if we are asked to retrieve the
+                # state repeatedly. Try to serve the cached state.
+                if self._last_sync_time >= max(time.monotonic() - self._cache_duration, 0):
+                    return None
+
+            get_response = self._backend.get_state()
+            if get_response is not None:
+                state_bits, token = get_response
+
+        if state_bits is not None:
+            try:
+                self._state = pickle.loads(state_bits)
+            except pickle.PickleError as exc:
+                raise RendezvousStateError(
+                    "The rendezvous state is corrupt. See inner exception for details."
+                ) from exc
+        else:
+            self._state = _RendezvousState()
+
+        if has_set and self._dead_nodes and log.isEnabledFor(logging.DEBUG):
+            node_list = ", ".join(f"'{dead_node}'" for dead_node in self._dead_nodes)
+
+            msg = (
+                f"As part of the sync operation the node(s) {node_list} have been removed from the "
+                f"rendezvous '{self._settings.run_id}' since they had no heartbeat."
+            )
+            self._record(message=msg)
+            log.debug(msg)
+
+        self._token = token
+
+        self._dirty = False
+
+        self._last_sync_time = time.monotonic()
+
+        self._sanitize()
+
+        return has_set
+
+    def _sanitize(self) -> None:
+        state = self._state
+
+        expire_time = datetime.utcnow() - (
+            self._settings.keep_alive_interval * self._settings.keep_alive_max_attempt
+        )
+
+        # Filter out the dead nodes.
+        self._dead_nodes = [
+            node
+            for node, last_heartbeat in state.last_heartbeats.items()
+            if last_heartbeat < expire_time
+        ]
+
+        participant_removed = False
+
+        for dead_node in self._dead_nodes:
+            msg = f"Detected dead node '{dead_node}', removing it from the rendezvous"
+            log.debug(msg)
+            del state.last_heartbeats[dead_node]
+
+            try:
+                del state.participants[dead_node]
+
+                participant_removed = True
+            except KeyError:
+                pass
+
+            try:
+                state.wait_list.remove(dead_node)
+            except KeyError:
+                pass
+
+            try:
+                state.redundancy_list.remove(dead_node)
+            except KeyError:
+                pass
+
+        if participant_removed:
+            # Common epilogue shared with the _remove_from_participants()
+            # function of _DistributedRendezvousOpExecutor.
+            _remove_participant_epilogue(state, self._settings)
+
+    def mark_dirty(self) -> None:
+        """See base class.
+
+        If the local rendezvous state is dirty, the next sync call will try to
+        write the changes back to the backend. However, this attempt might fail
+        if another node, which had the same state, also made changes and wrote
+        them before us.
+        """
+        self._dirty = True
+
+
+class _Action(Enum):
+    """Specifies the possible actions based on the state of the rendezvous."""
+
+    KEEP_ALIVE = 1
+    ADD_TO_PARTICIPANTS = 2
+    ADD_TO_WAIT_LIST = 3
+    ADD_TO_REDUNDANCY_LIST = 4
+    REMOVE_FROM_PARTICIPANTS = 5
+    REMOVE_FROM_WAIT_LIST = 6
+    REMOVE_FROM_REDUNDANCY_LIST = 7
+    MARK_RENDEZVOUS_COMPLETE = 8
+    MARK_RENDEZVOUS_CLOSED = 9
+    SYNC = 10
+    ERROR_CLOSED = 11
+    ERROR_TIMEOUT = 12
+    FINISH = 13
+
+
+class _RendezvousContext:
+    """Holds the context of the rendezvous.
+
+    Attributes:
+        node:
+            The node descriptor associated with the current rendezvous handler
+            instance.
+        state:
+            The current state of the rendezvous.
+        settings:
+            The rendezvous settings.
+    """
+
+    node: _NodeDesc
+    state: _RendezvousState
+    settings: RendezvousSettings
+
+    def __init__(
+        self, node: _NodeDesc, state: _RendezvousState, settings: RendezvousSettings
+    ) -> None:
+        self.node = node
+        self.state = state
+        self.settings = settings
+
+
+class _RendezvousOpExecutor(ABC):
+    """Execute rendezvous operations."""
+
+    @abstractmethod
+    def run(
+        self,
+        state_handler: Callable[[_RendezvousContext, float], _Action],
+        deadline: float,
+        update_deadline: Optional[Callable[[timedelta], float]] = None,
+    ) -> None:
+        """Execute a rendezvous operation.
+
+        An operation is run inside a state machine and is expected to transition
+        the rendezvous from one state to another.
+
+        Args:
+            state_handler:
+                A callable that is expected to return the next state transition
+                action based on the current state of the rendezvous.
+            deadline:
+                The time, in seconds, at which the operation will be considered
+                timed-out.
+            update_deadline:
+                Function to generate a new operation deadline if the current
+                node may participate in the next rendezvous.
+        """
+
+
+class _DistributedRendezvousOpExecutor(_RendezvousOpExecutor):
+    """Execute rendezvous operations using a shared state.
+
+    Args:
+        node:
+            The node descriptor associated with the current rendezvous handler
+            instance.
+        state_holder:
+            The ``RendezvousStateHolder`` to use to sync the rendezvous state
+            with other nodes.
+        settings:
+            The rendezvous settings.
+    """
+
+    _node: _NodeDesc
+    _state: _RendezvousState
+    _state_holder: _RendezvousStateHolder
+    _settings: RendezvousSettings
+
+    def __init__(
+        self,
+        node: _NodeDesc,
+        state_holder: _RendezvousStateHolder,
+        settings: RendezvousSettings,
+    ) -> None:
+        self._node = node
+        self._state_holder = state_holder
+        self._settings = settings
+
+    def _record(self, message: str, node_state: NodeState = NodeState.RUNNING) -> None:
+        construct_and_record_rdzv_event(
+            name=f"{self.__class__.__name__}.{get_method_name()}",
+            run_id=self._settings.run_id,
+            message=message,
+            node_state=node_state,
+            hostname=self._node.addr,
+            pid=self._node.pid,
+            local_id=self._node.local_id,
+        )
+
+    def run(
+        self,
+        state_handler: Callable[[_RendezvousContext, float], _Action],
+        deadline: float,
+        update_deadline: Optional[Callable[[timedelta], float]] = None,
+    ) -> None:
+        """See base class."""
+        action = None
+        while action != _Action.FINISH:
+            # Reads or writes the latest rendezvous state shared by all nodes in
+            # the rendezvous. Note that our local changes might get overridden
+            # by another node if that node synced its changes before us.
+            has_set = self._state_holder.sync()
+            if has_set is not None:
+                if has_set:
+                    msg = (
+                        f"The node '{self._node}' has successfully synced its local changes with "
+                        f"other nodes in the rendezvous '{self._settings.run_id}'."
+                    )
+                else:
+                    msg = (
+                        f"The node '{self._node}' has a stale state and failed to sync its local "
+                        f"changes with other nodes in the rendezvous '{self._settings.run_id}'."
+                    )
+
+                self._record(message=msg)
+                log.debug(msg)
+
+            self._state = self._state_holder.state
+
+            ctx = _RendezvousContext(self._node, self._state, self._settings)
+
+            # Determine the next action to take based on the current state of
+            # the rendezvous.
+            action = state_handler(ctx, deadline)
+
+            if action == _Action.FINISH:
+                continue
+
+            if action == _Action.ERROR_CLOSED:
+                raise RendezvousClosedError()
+
+            if action == _Action.ERROR_TIMEOUT:
+                raise RendezvousTimeoutError()
+
+            if action == _Action.SYNC:
+                # Delay the execution by one second to avoid overloading the
+                # backend if we are asked to poll for state changes.
+                _delay(seconds=1)
+            else:
+                if action == _Action.KEEP_ALIVE:
+                    self._keep_alive()
+                elif action == _Action.ADD_TO_PARTICIPANTS:
+                    self._add_to_participants()
+                elif action == _Action.ADD_TO_WAIT_LIST:
+                    self._add_to_wait_list()
+                elif action == _Action.ADD_TO_REDUNDANCY_LIST:
+                    self._add_to_redundancy_list()
+                elif action == _Action.REMOVE_FROM_PARTICIPANTS:
+                    self._remove_from_participants()
+                elif action == _Action.REMOVE_FROM_WAIT_LIST:
+                    self._remove_from_wait_list()
+                elif action == _Action.REMOVE_FROM_REDUNDANCY_LIST:
+                    self._remove_from_redundancy_list()
+                    # Update the deadline since the node may now participate in the rendezvous.
+                    if update_deadline:
+                        deadline = update_deadline(self._settings.timeout.join)
+                elif action == _Action.MARK_RENDEZVOUS_COMPLETE:
+                    self._mark_rendezvous_complete()
+                elif action == _Action.MARK_RENDEZVOUS_CLOSED:
+                    self._mark_rendezvous_closed()
+
+                # Attempt to sync our changes back to other nodes.
+                self._state_holder.mark_dirty()
+
+    def _keep_alive(self) -> None:
+        msg = (
+            f"The node '{self._node}' updated its keep-alive heartbeat time for the rendezvous "
+            f"'{self._settings.run_id}'. Pending sync."
+        )
+        self._record(message=msg)
+        log.debug(msg)
+
+        self._state.last_heartbeats[self._node] = datetime.utcnow()
+
+    def _add_to_participants(self) -> None:
+        msg = (
+            f"The node '{self._node}' added itself to the participants of round "
+            f"{self._state.round} of the rendezvous '{self._settings.run_id}'. Pending sync."
+        )
+        self._record(message=msg)
+        log.debug(msg)
+
+        state = self._state
+
+        try:
+            state.wait_list.remove(self._node)
+        except KeyError:
+            pass
+
+        # The ranks of the participants will be set once the rendezvous is
+        # complete.
+        state.participants[self._node] = 0
+
+        self._keep_alive()
+
+        if len(state.participants) == self._settings.min_nodes:
+            state.deadline = datetime.utcnow() + self._settings.timeout.last_call
+
+        if len(state.participants) == self._settings.max_nodes:
+            self._mark_rendezvous_complete()
+
+    def _add_to_wait_list(self) -> None:
+        msg = (
+            f"The node '{self._node}' added itself to the wait list of round "
+            f"{self._state.round + 1} of the rendezvous '{self._settings.run_id}'. Pending sync."
+        )
+        self._record(message=msg)
+        log.debug(msg)
+
+        if self._node in self._state.redundancy_list:
+            self._state.redundancy_list.remove(self._node)
+        self._state.wait_list.add(self._node)
+
+        self._keep_alive()
+
+    def _add_to_redundancy_list(self) -> None:
+        msg = (
+            f"The node '{self._node}' added itself to the redundancy list of round "
+            f"{self._state.round + 1} of the rendezvous '{self._settings.run_id}'. Pending sync."
+        )
+        self._record(message=msg)
+        log.debug(msg)
+
+        self._state.redundancy_list.add(self._node)
+
+        self._keep_alive()
+
+    def _remove_from_participants(self) -> None:
+        msg = (
+            f"The node '{self._node}' removed itself from the participants of round "
+            f"{self._state.round} of the rendezvous '{self._settings.run_id}'. Pending sync."
+        )
+        self._record(message=msg)
+        log.debug(msg)
+
+        state = self._state
+
+        del state.participants[self._node]
+
+        del state.last_heartbeats[self._node]
+
+        # Common epilogue shared with the _sanitize() method of
+        # _BackendRendezvousStateHolder.
+        _remove_participant_epilogue(state, self._settings)
+
+    def _remove_from_wait_list(self) -> None:
+        msg = (
+            f"The node '{self._node}' removed itself from the wait list of round "
+            f"{self._state.round + 1} of the rendezvous '{self._settings.run_id}'. Pending sync."
+        )
+        self._record(message=msg)
+        log.debug(msg)
+
+        self._state.wait_list.remove(self._node)
+
+        del self._state.last_heartbeats[self._node]
+
+    def _remove_from_redundancy_list(self) -> None:
+        msg = (
+            f"The node '{self._node}' removed itself from the redunant list of round "
+            f"{self._state.round + 1} of the rendezvous '{self._settings.run_id}'. Pending sync."
+        )
+        self._record(message=msg)
+        log.debug(msg)
+
+        self._state.redundancy_list.remove(self._node)
+
+        del self._state.last_heartbeats[self._node]
+
+    def _mark_rendezvous_complete(self) -> None:
+        msg = (
+            f"The node '{self._node}' marked round {self._state.round} of the rendezvous "
+            f"'{self._settings.run_id}' as complete. Pending sync."
+        )
+        self._record(message=msg, node_state=NodeState.SUCCEEDED)
+        log.debug(msg)
+
+        state = self._state
+
+        state.complete = True
+        state.deadline = None
+
+        # Assign the ranks.
+        for rank, node in enumerate(sorted(state.participants)):
+            state.participants[node] = rank
+
+    def _mark_rendezvous_closed(self) -> None:
+        msg = (
+            f"The node '{self._node}' marked the rendezvous '{self._settings.run_id}' as closed. "
+            "Pending sync."
+        )
+        self._record(message=msg, node_state=NodeState.SUCCEEDED)
+        log.debug(msg)
+
+        self._state.closed = True
+
+
+def _should_keep_alive(ctx: _RendezvousContext) -> bool:
+    """Determine whether a keep-alive heartbeat should be sent."""
+    try:
+        last_heartbeat = ctx.state.last_heartbeats[ctx.node]
+    except KeyError:
+        return False
+
+    return last_heartbeat <= datetime.utcnow() - ctx.settings.keep_alive_interval
+
+
+class _RendezvousExitOp:
+    """Represent a rendezvous exit operation."""
+
+    def __call__(self, ctx: _RendezvousContext, deadline: float) -> _Action:
+        if ctx.node in ctx.state.participants:
+            if time.monotonic() > deadline:
+                return _Action.ERROR_TIMEOUT
+            return _Action.REMOVE_FROM_PARTICIPANTS
+        return _Action.FINISH
+
+
+class _RendezvousJoinOp:
+    """Represent a rendezvous join operation."""
+
+    def __call__(self, ctx: _RendezvousContext, deadline: float) -> _Action:
+        state = ctx.state
+
+        # A closed rendezvous means that it no longer accepts new nodes.
+        if state.closed:
+            if ctx.node in state.redundancy_list:
+                msg = f"The rendezvous '{ctx.settings.run_id}' is closed, terminating pending rendezvous."
+                raise RendezvousGracefulExitError(msg)
+            return _Action.ERROR_CLOSED
+
+        if ctx.node in state.redundancy_list:
+            msg = f"The node {ctx.node} is in redunancy list"
+            log.debug(msg)
+            # don't apply the timeout logic here, since we want to allow the node to rejoin
+            if len(state.participants) == ctx.settings.max_nodes:
+                if _should_keep_alive(ctx):
+                    return _Action.KEEP_ALIVE
+                else:
+                    return _Action.SYNC
+            else:
+                # transition to waiting state that will respect timeouts.
+                msg = f"The node {ctx.node} is removed from redunancy list"
+                log.debug(msg)
+                return _Action.REMOVE_FROM_REDUNDANCY_LIST
+
+        is_participant = ctx.node in state.participants
+
+        # If we are part of the rendezvous and it is already complete there is
+        # no further action to take.
+        if state.complete and is_participant:
+            return _Action.FINISH
+
+        now = time.monotonic()
+        if now > deadline:
+            rollback_period = 5  # 5 seconds
+
+            # If we still have time to rollback (a short period on top of the
+            # operation deadline), try to remove ourselves from the rendezvous.
+            # It is okay if we can't, though, as our keep-alive will eventually
+            # expire.
+            if now <= deadline + rollback_period:
+                # If we are part of the rendezvous, it means we couldn't find
+                # enough participants to complete it on time.
+                if is_participant:
+                    return _Action.REMOVE_FROM_PARTICIPANTS
+                # If we are in the wait list, it means we couldn't wait till the
+                # next round of the rendezvous.
+                if ctx.node in state.wait_list:
+                    return _Action.REMOVE_FROM_WAIT_LIST
+            return _Action.ERROR_TIMEOUT
+
+        if state.complete:
+            # If we are here, it means we are not part of the rendezvous. In
+            # case the rendezvous has capacity for additional participants, add
+            # ourselves to the wait list for the next round.
+            if len(state.participants) < ctx.settings.max_nodes:
+                if ctx.node not in state.wait_list:
+                    return _Action.ADD_TO_WAIT_LIST
+            elif len(state.participants) >= ctx.settings.max_nodes:
+                if ctx.node not in state.redundancy_list and ctx.node not in state.wait_list:
+                    return _Action.ADD_TO_REDUNDANCY_LIST
+        elif is_participant:
+            # If the rendezvous has enough participants, including us,
+            # check whether we have passed the rendezvous deadline. If yes,
+            # complete it.
+            if len(state.participants) >= ctx.settings.min_nodes and \
+                    len(state.participants) <= ctx.settings.max_nodes:
+                if cast(datetime, state.deadline) < datetime.utcnow():
+                    msg = (
+                        f"The node '{ctx.node}' marking the rendezvous complete, "
+                        f"quorum established within deadline"
+                    )
+                    log.debug(msg)
+                    return _Action.MARK_RENDEZVOUS_COMPLETE
+                else:
+                    msg = f"The node '{ctx.node}' can't complete rendezvous: deadline reached"
+                    log.debug(msg)
+            else:
+                msg = f"The node '{ctx.node}' can't complete rendezvous: not enough participants"
+                log.debug(msg)
+        else:
+            # The rendezvous is not complete yet and we are not part of it. Try
+            # to join.
+            return _Action.ADD_TO_PARTICIPANTS
+
+        if _should_keep_alive(ctx):
+            return _Action.KEEP_ALIVE
+
+        # At this point either the rendezvous is not complete, but we are part
+        # of it, which means we have to wait for other participants to join; or
+        # the rendezvous is complete, but we are not part of it, which means we
+        # have to wait for the next round.
+        return _Action.SYNC
+
+
+class _RendezvousCloseOp:
+    """Represent a rendezvous close operation."""
+
+    def __call__(self, ctx: _RendezvousContext, deadline: float) -> _Action:
+        if ctx.state.closed:
+            return _Action.FINISH
+        if time.monotonic() > deadline:
+            return _Action.ERROR_TIMEOUT
+        return _Action.MARK_RENDEZVOUS_CLOSED
+
+
+class _RendezvousKeepAliveOp:
+    """Represent a rendezvous keep-alive update operation."""
+
+    def __call__(self, ctx: _RendezvousContext, deadline: float) -> _Action:
+        if _should_keep_alive(ctx):
+            if time.monotonic() > deadline:
+                return _Action.ERROR_TIMEOUT
+            return _Action.KEEP_ALIVE
+        return _Action.FINISH
+
+
+class DynamicRendezvousHandler(RendezvousHandler):
+    """Represent a handler that sets up a rendezvous among a set of nodes."""
+
+    # Static
+    _node_desc_generator = _NodeDescGenerator()
+
+    _this_node: _NodeDesc
+    _settings: RendezvousSettings
+    _backend_name: str
+    _store: Store
+    _state_holder: _RendezvousStateHolder
+    _op_executor: _RendezvousOpExecutor
+    _heartbeat_lock: threading.Lock
+    _keep_alive_timer: Optional[_PeriodicTimer]
+
+    @classmethod
+    def from_backend(
+        cls,
+        run_id: str,
+        store: Store,
+        backend: RendezvousBackend,
+        min_nodes: int,
+        max_nodes: int,
+        local_addr: Optional[str] = None,
+        timeout: Optional[RendezvousTimeout] = None,
+    ):
+        """Create a new :py:class:`DynamicRendezvousHandler`.
+
+        Args:
+            run_id:
+                The run id of the rendezvous.
+            store:
+                The C10d store to return as part of the rendezvous.
+            backend:
+                The backend to use to hold the rendezvous state.
+            min_nodes:
+                The minimum number of nodes to admit to the rendezvous.
+            max_nodes:
+                The maximum number of nodes to admit to the rendezvous.
+            local_addr:
+                The local node address.
+            timeout:
+                The timeout configuration of the rendezvous.
+        """
+        # We associate each handler instance with a unique node descriptor.
+        node = cls._node_desc_generator.generate(local_addr)
+
+        settings = RendezvousSettings(
+            run_id,
+            min_nodes,
+            max_nodes,
+            timeout or RendezvousTimeout(),
+            keep_alive_interval=timedelta(seconds=5),
+            keep_alive_max_attempt=3,
+        )
+
+        state_holder = _BackendRendezvousStateHolder(backend, settings)
+
+        return cls(node, settings, backend.name, store, state_holder)
+
+    def __init__(
+        self,
+        node: _NodeDesc,
+        settings: RendezvousSettings,
+        backend_name: str,
+        store: Store,
+        state_holder: _RendezvousStateHolder,
+    ) -> None:
+        if not settings.run_id:
+            raise ValueError("The run id must be a non-empty string.")
+
+        if settings.min_nodes < 1:
+            raise ValueError(
+                f"The minimum number of nodes ({settings.min_nodes}) must be greater than zero."
+            )
+
+        if settings.max_nodes < settings.min_nodes:
+            raise ValueError(
+                f"The maximum number of nodes ({settings.max_nodes}) must be greater than or equal "
+                f"to the minimum number of nodes ({settings.min_nodes})."
+            )
+
+        self._this_node = node
+
+        self._settings = settings
+
+        self._backend_name = backend_name
+
+        self._store = store
+
+        self._state_holder = state_holder
+
+        self._op_executor = _DistributedRendezvousOpExecutor(
+            self._this_node, self._state_holder, self._settings
+        )
+
+        self._heartbeat_lock = threading.Lock()
+
+        self._keep_alive_timer = None
+
+    def _record(
+        self,
+        message: str,
+        node_state: NodeState = NodeState.RUNNING,
+        rank: Optional[int] = None,
+    ) -> None:
+        construct_and_record_rdzv_event(
+            name=f"{self.__class__.__name__}.{get_method_name()}",
+            run_id=self._settings.run_id,
+            message=message,
+            node_state=node_state,
+            hostname=self._this_node.addr,
+            pid=self._this_node.pid,
+            local_id=self._this_node.local_id,
+            rank=rank,
+        )
+
+    @property
+    def settings(self) -> RendezvousSettings:
+        """Get the settings of the rendezvous."""
+        return self._settings
+
+    def get_backend(self) -> str:
+        """See base class."""
+        return self._backend_name
+
+    def next_rendezvous(self) -> Tuple[Store, int, int]:
+        """See base class."""
+        msg = (
+            f"The node '{self._this_node}' attempts to join the next round of the rendezvous "
+            f"'{self._settings.run_id}'."
+        )
+        self._record(message=msg)
+        log.info(msg)
+
+        try:
+            self._stop_heartbeats()
+
+            # Delay the execution for a small random amount of time if this is our
+            # first run. This will slightly skew the rendezvous attempts across the
+            # nodes and reduce the load on the backend.
+            if self._state_holder.state.round == 0:
+                _delay(seconds=(0, 0.3))
+
+            exit_op = _RendezvousExitOp()
+            join_op = _RendezvousJoinOp()
+
+            deadline = self._get_deadline(self._settings.timeout.join)
+            self._op_executor.run(exit_op, deadline)
+            self._op_executor.run(
+                join_op,
+                deadline,
+                self._get_deadline)
+
+            self._start_heartbeats()
+
+            rank, world_size = self._get_world()
+            store = self._get_store()
+
+        except Exception as e:
+            self._record(
+                message=f"{type(e).__name__}: {str(e)}",
+                node_state=NodeState.FAILED,
+            )
+            raise
+
+        msg = (
+            f"The node '{self._this_node}' has joined round {self._state_holder.state.round} of "
+            f"the rendezvous '{self._settings.run_id}' as rank {rank} in a world of size "
+            f"{world_size}."
+        )
+        self._record(message=msg, rank=rank)
+        log.info(msg)
+
+        return store, rank, world_size
+
+    def is_closed(self) -> bool:
+        """See base class."""
+        try:
+            with self._heartbeat_lock:
+                self._state_holder.sync()
+
+                return self._state_holder.state.closed
+
+        except Exception as e:
+            self._record(
+                message=f"{type(e).__name__}: {str(e)}",
+                node_state=NodeState.FAILED,
+            )
+            raise
+
+    def set_closed(self) -> None:
+        """See base class."""
+        try:
+            with self._heartbeat_lock:
+                self._close()
+        except Exception as e:
+            self._record(
+                message=f"{type(e).__name__}: {str(e)}",
+                node_state=NodeState.FAILED,
+            )
+            raise
+
+    def num_nodes_waiting(self) -> int:
+        """See base class."""
+        try:
+            with self._heartbeat_lock:
+                self._state_holder.sync()
+
+                return len(self._state_holder.state.wait_list)
+
+        except Exception as e:
+            self._record(
+                message=f"{type(e).__name__}: {str(e)}",
+                node_state=NodeState.FAILED,
+            )
+            raise
+
+    def get_run_id(self) -> str:
+        """See base class."""
+        return self._settings.run_id
+
+    def shutdown(self) -> bool:
+        """See base class."""
+        self._stop_heartbeats()
+
+        try:
+            self._close()
+
+            return True
+        except RendezvousError as ex:
+            msg = (
+                f"The node '{self._this_node}' has failed to shutdown the rendezvous "
+                f"'{self._settings.run_id}' due to an error of type {type(ex).__name__}."
+            )
+            self._record(message=msg, node_state=NodeState.FAILED)
+            log.warning(msg)
+
+            return False
+        except Exception as e:
+            self._record(
+                message=f"{type(e).__name__}: {str(e)}",
+                node_state=NodeState.FAILED,
+            )
+            raise
+
+    def _close(self) -> None:
+        op = _RendezvousCloseOp()
+
+        deadline = self._get_deadline(self._settings.timeout.close)
+
+        self._op_executor.run(op, deadline)
+
+        msg = f"The node '{self._this_node}' has closed the rendezvous '{self._settings.run_id}'."
+        self._record(message=msg, node_state=NodeState.SUCCEEDED)
+        log.info(msg)
+
+    @staticmethod
+    def _keep_alive_weak(weak_self) -> None:
+        self = weak_self()
+        if self is not None:
+            self._keep_alive()
+
+    def _keep_alive(self) -> None:
+        self._heartbeat_lock.acquire()
+
+        op = _RendezvousKeepAliveOp()
+
+        deadline = self._get_deadline(self._settings.timeout.heartbeat)
+
+        try:
+            self._op_executor.run(op, deadline)
+
+            msg = (
+                f"The node '{self._this_node}' has sent a keep-alive heartbeat to the rendezvous "
+                f"'{self._settings.run_id}'."
+            )
+            self._record(message=msg)
+            log.debug(msg)
+        except RendezvousError as ex:
+            msg = (
+                f"The node '{self._this_node}' has failed to send a keep-alive heartbeat to the "
+                f"rendezvous '{self._settings.run_id}' due to an error of type {type(ex).__name__}."
+            )
+            self._record(message=msg, node_state=NodeState.FAILED)
+            log.warning(msg)
+        finally:
+            self._heartbeat_lock.release()
+
+    def _start_heartbeats(self) -> None:
+        self._keep_alive_timer = _PeriodicTimer(
+            self._settings.keep_alive_interval, self._keep_alive_weak, weakref.ref(self)
+        )
+
+        self._keep_alive_timer.set_name(f"RendezvousKeepAliveTimer_{self._this_node.local_id}")
+
+        self._keep_alive_timer.start()
+
+    def _stop_heartbeats(self) -> None:
+        if self._keep_alive_timer is None:
+            return
+
+        self._keep_alive_timer.cancel()
+
+    def _get_world(self) -> Tuple[int, int]:
+        state = self._state_holder.state
+
+        return state.participants[self._this_node], len(state.participants)
+
+    def _get_store(self) -> Store:
+        key_prefix = f"torch.rendezvous.{self._settings.run_id}.{self._state_holder.state.round}"
+
+        return PrefixStore(key_prefix, self._store)
+
+    def _get_deadline(self, timeout: timedelta) -> float:
+        return time.monotonic() + timeout.total_seconds()
+
+
+def _get_timeout(params: RendezvousParameters, key: str) -> Optional[timedelta]:
+    timeout = params.get_as_int(key + "_timeout")
+    if timeout is None:
+        return None
+    return timedelta(seconds=timeout)
+
+
+def create_handler(
+    store: Store, backend: RendezvousBackend, params: RendezvousParameters
+) -> DynamicRendezvousHandler:
+    """Create a new :py:class:`DynamicRendezvousHandler` from the specified parameters.
+
+    Args:
+        store:
+            The C10d store to return as part of the rendezvous.
+        backend:
+            The backend to use to hold the rendezvous state.
+
+    +-------------------+------------------------------------------------------+
+    | Parameter         | Description                                          |
+    +===================+======================================================+
+    | join_timeout      | The total time, in seconds, within which the         |
+    |                   | rendezvous is expected to complete. Defaults to 600  |
+    |                   | seconds.                                             |
+    +-------------------+------------------------------------------------------+
+    | last_call_timeout | An additional wait amount, in seconds, before        |
+    |                   | completing the rendezvous once the minimum number of |
+    |                   | nodes has been reached. Defaults to 30 seconds.      |
+    +-------------------+------------------------------------------------------+
+    | close_timeout     | The time, in seconds, within which the rendezvous is |
+    |                   | expected to close after a call to                    |
+    |                   | :py:meth:`RendezvousHandler.set_closed` or           |
+    |                   | :py:meth:`RendezvousHandler.shutdown`. Defaults to   |
+    |                   | 30 seconds.                                          |
+    +-------------------+------------------------------------------------------+
+    """
+    try:
+        timeout = RendezvousTimeout(
+            _get_timeout(params, "join"),
+            _get_timeout(params, "last_call"),
+            _get_timeout(params, "close"),
+        )
+
+        return DynamicRendezvousHandler.from_backend(
+            params.run_id,
+            store,
+            backend,
+            params.min_nodes,
+            params.max_nodes,
+            params.local_addr,
+            timeout,
+        )
+    except Exception as e:
+        construct_and_record_rdzv_event(
+            message=f"{type(e).__name__}: {str(e)}",
+            run_id=params.run_id,
+            node_state=NodeState.FAILED,
+        )
+        raise
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/etcd_rendezvous.py b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/etcd_rendezvous.py
new file mode 100644
index 0000000000000000000000000000000000000000..952d3040c3c7383ed3ce2dd1f7a28ad51eec912f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/etcd_rendezvous.py
@@ -0,0 +1,1045 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import json
+import logging
+import sys
+import threading
+import time
+from typing import Optional
+
+import etcd  # type: ignore[import]
+from torch.distributed.elastic.rendezvous import (
+    RendezvousClosedError,
+    RendezvousError,
+    RendezvousHandler,
+    RendezvousParameters,
+    RendezvousTimeoutError,
+)
+
+from .utils import parse_rendezvous_endpoint
+from .etcd_store import EtcdStore, cas_delay
+
+
+_log_fmt = logging.Formatter("%(levelname)s %(asctime)s %(message)s")
+_log_handler = logging.StreamHandler(sys.stderr)
+_log_handler.setFormatter(_log_fmt)
+
+log = logging.getLogger(__name__)
+log.propagate = False
+log.setLevel(logging.INFO)
+log.addHandler(_log_handler)
+
+
+# Retryable failure exception means that we were too late to make
+# a desired state transition (e.g. because of a race condition),
+# and should now restart from the beginning.
+# A small delay is recommended to avoid spamming Etcd.
+class EtcdRendezvousRetryableFailure(Exception):
+    pass
+
+
+# Similar to retryable failure, but the new state we observed suggests we
+# can re-try immediately, i.e. without a need for "safety delay".
+class EtcdRendezvousRetryImmediately(Exception):
+    pass
+
+
+# Default timeout for the rendezvous.
+_DEFAULT_TIMEOUT: int = 600  # 10 minutes
+
+# Additional waiting time after reaching the minimum number of nodes
+# in case the rendezvous is elastic (min != max).
+_DEFAULT_LAST_CALL_TIMEOUT: int = 30  # 30 seconds
+
+# Various constants used internally in EtcdRendezvous
+CONST_ETCD_SETUP_TTL = 5
+CONST_ETCD_FROZEN_TTL = 10
+CONST_ETCD_JOINABLE_EPHEMERAL_TTL = 10
+
+# Ephemeral node TTL for worker's keep-alive key:
+CONST_WORKER_KEEPALIVE_TTL = 10
+
+# TTL for the ephemeral run_id-specific directory. All rendezvous state data
+# for a specific run_id (job instance) is contained within this directory.
+# Its only role is to clean up rendezvous data from old runs (for the case when
+# the etcd server is persistent); it has no effect on correctness, but should be
+# larger than any timeouts that a worker process is expected to survive:
+CONST_RUNID_SUBROOT_TTL = 7200  # 2 hours
+
+
+class EtcdRendezvousHandler(RendezvousHandler):
+    """
+    Implements a
+    :py:class:`torch.distributed.elastic.rendezvous.RendezvousHandler` interface
+    backed by
+    :py:class:`torch.distributed.elastic.rendezvous.etcd_rendezvous.EtcdRendezvous`.
+    ``EtcdRendezvousHandler`` uses a URL to configure the type of rendezvous to
+    use and to pass implementation specific configurations to the rendezvous
+    module. The basic etcd rendezvous configuration URL looks like the following
+    ::
+
+     etcd://<etcd_address>:<port>/<job_id>?min_workers=<min_workers>&max_workers=<max_workers>  # noqa: W605
+
+     -- example --
+
+     etcd://localhost:2379/1234?min_workers=1&max_workers=3
+
+    The URL above is interpreted as follows:
+
+    1. Use the rendezvous handler that is registered with the ``etcd``
+       scheme
+    2. The ``etcd`` endpoint to use is ``localhost:2379``
+    3. ``job_id == 1234`` is used as the prefix in etcd (this allows one to
+       share a common etcd server for multiple jobs so long as the
+       ``job_ids`` are guaranteed to be unique). Note that the job id can be
+       any string (e.g. does not need to be a number) as long as it is
+       unique.
+    4. ``min_workers=1`` and ``max_workers=3`` specifies a range for
+       membership size - Torch Distributed Elastic starts running the job as
+       long as the cluster size is greater than or equal to ``min_workers``
+       and admits up to ``max_workers`` into the cluster.
+
+    Below is a full list of the parameters that can be passed to etcd
+    rendezvous:
+
+    +--------------------------------------------+--------------------------+
+    | Parameter                                  | Description              |
+    +============================================+==========================+
+    | min_workers                                | minimum number of        |
+    |                                            | workers for the          |
+    |                                            | rendezvous to be valid   |
+    +--------------------------------------------+--------------------------+
+    | max_workers                                | maximum number of        |
+    |                                            | workers to admit         |
+    +--------------------------------------------+--------------------------+
+    | timeout                                    | total timeout within     |
+    |                                            | which next_rendezvous is |
+    |                                            | expected to succeed      |
+    |                                            | (default 600s)           |
+    +--------------------------------------------+--------------------------+
+    | last_call_timeout                          | additional wait amount   |
+    |                                            | (“last call”) after min  |
+    |                                            | number of workers has    |
+    |                                            | been reached (defaults   |
+    |                                            | to 30s)                  |
+    +--------------------------------------------+--------------------------+
+    | etcd_prefix                                | path prefix (from etcd   |
+    |                                            | root), inside which all  |
+    |                                            | etcd nodes will be       |
+    |                                            | created (defaults to     |
+    |                                            | ``/torchelastic/p2p``)   |
+    +--------------------------------------------+--------------------------+
+    """
+
+    def __init__(self, rdzv_impl):
+        self._rdzv_impl = rdzv_impl
+
+    def __del__(self):
+        # TODO: look into using weakref here instead.
+        del self._rdzv_impl
+
+    def get_backend(self) -> str:
+        return "etcd"
+
+    def next_rendezvous(self):
+        rdzv_version, rank, world_size = self._rdzv_impl.rendezvous_barrier()
+
+        log.info("Creating EtcdStore as the c10d::Store implementation")
+        store = self._rdzv_impl.setup_kv_store(rdzv_version)
+
+        return store, rank, world_size
+
+    def is_closed(self):
+        try:
+            _, state = self._rdzv_impl.get_rdzv_state()
+            return state["status"] == "closed"
+        except etcd.EtcdKeyNotFound:
+            # No rendezvous state, so it cannot be closed.
+            return False
+
+    def set_closed(self):
+        self._rdzv_impl.set_closed()
+
+    def num_nodes_waiting(self):
+        try:
+            _, state = self._rdzv_impl.get_rdzv_state()
+            if state["status"] == "final":
+                return state["num_workers_waiting"]
+        except etcd.EtcdKeyNotFound:
+            pass
+        return 0
+
+    def get_run_id(self) -> str:
+        return self._rdzv_impl._run_id
+
+    def shutdown(self) -> bool:
+        try:
+            self.set_closed()
+            return True
+        except BaseException as e:
+            log.warning("Shutdown failed. Error occurred: %s", str(e))
+            return False
+
+
+# TODO: we should probably handle a few additional errors,
+# like EtcdLeaderElectionInProgress and EtcdWatcherCleared. These are
+# only relevant for multi-node Etcd ensemble. A simple retry would work,
+# but is verbose to add everywhere. Consider wrapping the client calls
+# into auto-retry for these errors?
+#
+class EtcdRendezvous:
+    """A rendezvous implementation that uses `etcd `__ as the backend store."""
+
+    def __init__(
+        self,
+        client,
+        prefix,
+        run_id,
+        num_min_workers,
+        num_max_workers,
+        timeout,
+        last_call_timeout,
+    ):
+        self.client = client
+        log.info("Etcd machines: %s", self.client.machines)
+
+        self._prefix = prefix
+        self._run_id = run_id
+        self._num_min_workers = num_min_workers
+        self._num_max_workers = num_max_workers
+        self._timeout = timeout
+        self._last_call_timeout = last_call_timeout
+
+        # For cleaning up TTL refresher threads (for ephemeral keys)
+        self._lease_run_id_stop = None
+        self._lease_this_rank_stop = None
+
+        if not self._prefix.endswith("/"):
+            self._prefix += "/"
+
+        # Set up a permanent prefix dir, if it didn't exist
+        if self._prefix != "/":
+            self.create_path_if_not_exists(self._prefix)
+
+        # Lease a "sub-root" node specific to this job instance (run_id)
+        self.create_path_if_not_exists(self.get_path(""), ttl=CONST_RUNID_SUBROOT_TTL)
+        self._lease_run_id_stop = self.setup_lease_renewal(
+            self.get_path(""), ttl=CONST_RUNID_SUBROOT_TTL
+        )
+
+        # Subdir for all rendezvous work
+        self.create_path_if_not_exists(self.get_path("/rdzv"))
+
+        # Create a rendezvous version counter, if it doesn't exist
+        try:
+            self.client.write(
+                key=self.get_path("/rdzv/version_counter"), value="0", prevExist=False
+            )
+        except etcd.EtcdAlreadyExist:
+            pass
+
+    def __del__(self):
+        # TODO: look into using weakref here instead.
+        if self._lease_run_id_stop is not None:
+            self._lease_run_id_stop.set()
+
+        if self._lease_this_rank_stop is not None:
+            self._lease_this_rank_stop.set()
+
+    def rendezvous_barrier(self):
+        """
+        Main entry point for next rendezvous.
+
+        This method is blocking until rendezvous succeeds or a timeout occurs.
+
+        Returns:
+             ``(rdzv_version, rank, world_size)``
+
+        Raises:
+            RendezvousTimeoutError - timeout waiting for rendezvous
+            RendezvousClosedError - rendezvous is or was closed while waiting
+            RendezvousError - other persistent errors that
+             render the rendezvous non-retryable
+        """
+        self._rendezvous_deadline = time.time() + self._timeout
+        while True:
+            if time.time() > self._rendezvous_deadline:
+                raise RendezvousTimeoutError()
+
+            log.info("Attempting to join next rendezvous")
+            try:
+                # Disown our lease from the previous rendezvous, if it exists
+                if self._lease_this_rank_stop is not None:
+                    self._lease_this_rank_stop.set()
+
+                return self.init_phase()
+
+            except EtcdRendezvousRetryImmediately:
+                # The type of failure suggests we can retry without delay
+                pass
+
+            except EtcdRendezvousRetryableFailure:
+                # In case of retryable failure, wait a small delay
+                # to avoid spamming etcd
+                time.sleep(1)
+
+            except RendezvousTimeoutError:
+                log.info("Rendezvous timeout occurred in EtcdRendezvousHandler")
+                raise
+
+            except RendezvousClosedError:
+                log.info(
+                    "Rendezvous for run_id=%s was observed to be closed", self._run_id
+                )
+                raise
+
+            except RendezvousError:
+                raise
+
+            except Exception as e:
+                # In case of a general exception, wait a small delay
+                # to avoid spamming etcd
+                # FIXME: there are a few things that fall under this like
+                # etcd.EtcdKeyNotFound, etc., which could be handled more explicitly.
+                log.info("Rendezvous attempt failed, will retry. Reason: %s", e)
+                time.sleep(1)
+
+    def init_phase(self):
+        """
+        Initially, the rendezvous state is expected to be one of:
+
+        1. empty (non-existent) - in this case we try to create a new one.
+        2. joinable - we try to join it.
+        3. final - we announce ourselves as waiting, and go into monitoring mode
+
+        Any other state is considered transitional, and will be retried after
+        a short delay.
+
+        Returns:
+            ``(rdzv_version, rank, world_size)``
+
+        Raises:
+            RendezvousClosedError - current rendezvous was/is closed
+            EtcdRendezvousRetryableFailure - observed some intermediate
+             state, which is best handled by retrying later
+        """
+        try:
+            active_version = self.try_create_rendezvous()
+            state = json.loads(active_version.value)
+            log.info("New rendezvous state created: %s", state)
+        except etcd.EtcdAlreadyExist:
+            active_version, state = self.get_rdzv_state()
+            # Note: it is possible for above query to fail (etcd.EtcdKeyNotFound),
+            # but this is ok for us - just means we'll restart from beginning.
+            log.info("Observed existing rendezvous state: %s", state)
+
+        if state["status"] == "closed":
+            raise RendezvousClosedError()
+
+        if state["status"] == "joinable":
+            return self.join_phase(state["version"])
+
+        if state["status"] == "final":
+            self.handle_existing_rendezvous(state["version"])
+            raise EtcdRendezvousRetryImmediately()
+
+        self.try_wait_for_state_change(etcd_index=active_version.etcd_index + 1)
+        raise EtcdRendezvousRetryableFailure()
+
+    def join_phase(self, expected_version):
+        """
+        We observed a rendezvous state in the 'joinable' state; we attempt to join
+        this particular version and then wait for all other peers to join.
+        """
+        # Failure to join will propagate an exception, causing a re-entry.
+        active_version, this_rank = self.join_rendezvous(expected_version)
+        state = json.loads(active_version.value)
+        log.info(
+            "Joined rendezvous version %s as rank %s. Full state: %s",
+            state["version"], this_rank, state
+        )
+
+        # If this worker was first to reach num_min_workers requirement,
+        # and rendezvous is still joinable (therefore it is elastic),
+        # then this worker will be responsible for waiting out the "last call"
+        # timeout and closing (i.e. transitioning to 'frozen') the rendezvous
+        # afterwards.
+        # As a safety against a potential failure of this worker (during the
+        # last call timeout), the rendezvous state is made ephemeral
+        # when num_min_workers is reached.
+
+        if this_rank == self._num_min_workers - 1 and state["status"] == "joinable":
+            log.info("Rank %s is responsible for join last call.", this_rank)
+            last_call_deadline = time.time() + self._last_call_timeout
+            self.handle_join_last_call(expected_version, last_call_deadline)
+            log.info("Rank %s finished join last call.", this_rank)
+
+        # Wait for rendezvous state to be frozen, which means a fixed set of peers
+        log.info("Waiting for remaining peers.")
+        active_version = self.wait_for_peers(expected_version)
+        state = json.loads(active_version.value)
+
+        assert (
+            state["version"] == expected_version
+        ), "Logic error: failed to observe version mismatch"
+
+        return self.confirm_phase(expected_version, this_rank)
+
+    def confirm_phase(self, expected_version, this_rank):
+        """
+        Once the rendezvous state transitions from 'joinable' to 'frozen',
+        we have every participant confirm their membership and set up per-member
+        keep-alive TTL keys, and then wait for all other participants to confirm,
+        which would then successfully conclude this rendezvous.
+        """
+        log.info("All peers arrived. Confirming membership.")
+        self.confirm_membership(expected_version, this_rank)
+
+        log.info("Waiting for confirmations from all peers.")
+        active_version = self.wait_for_final(expected_version)
+        state = json.loads(active_version.value)
+
+        log.info(
+            "Rendezvous version %s is complete. Final state: %s",
+            state["version"], state
+        )
+
+        # Rendezvous version number; our rank in it; world size
+        return state["version"], this_rank, len(state["participants"])
+
+    def handle_existing_rendezvous(self, expected_version):
+        """
+        Handle the case when there's an existing (state 'final') rendezvous already
+        in place, and we have to announce ourselves waiting, and wait until
+        the next rendezvous opportunity.
+        """
+        # If state is 'final' -> increment num_workers_waiting
+        # Then, observe state changes:
+        #   1. if it's no longer final -> bail out and re-try
+        #   2. if keep alives are missing, destroy it and bail out.
+        active_state = self.announce_self_waiting(expected_version)
+        log.info(
+            "Added self to waiting list. Rendezvous full state: %s",
+            active_state.value
+        )
+
+        self.wait_for_rendezvous_to_free(expected_version)
+        log.info("Previously existing rendezvous state changed. Will re-try joining.")
+
+    def try_create_rendezvous(self):
+        """
+        Create new rendezvous state or raise an exception that indicates an unexpected state (e.g. already exists).
+
+        Raises:
+             RendezvousError - on unexpected state
+        """
+        # Initially active_version is ephemeral - this is to handle the
+        # possibility that we might fail to complete the setup transaction,
+        # i.e. the transition "setup" -> "joinable".
+        active_version = self.client.write(
+            key=self.get_path("/rdzv/active_version"),
+            value=json.dumps({"status": "setup"}),
+            prevExist=False,
+            ttl=CONST_ETCD_SETUP_TTL,
+        )
+
+        try:
+            version_counter = self.client.get(self.get_path("/rdzv/version_counter"))
+            version_counter.value = str(int(version_counter.value) + 1)
+            self.client.update(version_counter)
+        except (etcd.EtcdKeyNotFound, etcd.EtcdCompareFailed) as e:
+            raise RendezvousError(
+                "Unexpected state of EtcdRendezvousHandler, worker needs to die."
+            ) from e
+
+        # Any failure below results in declaring a retryable rendezvous failure.
+        # The ephemeral /rdzv/active_version will expire and someone can then
+        # re-try the setup process.
+
+        # Create directory node for participant data
+        self.client.write(
+            key=self.get_path(f"/rdzv/v_{version_counter.value}"),
+            value=None,
+            dir=True,
+            prevExist=False,
+        )
+
+        # Publish rendezvous version and signal it is ready-to-be-joined.
+        # If rendezvous was set closed just before this, a retry will happen,
+        # where the closed condition will be handled.
+        return self.client.test_and_set(
+            key=self.get_path("/rdzv/active_version"),
+            value=json.dumps(
+                {
+                    "status": "joinable",
+                    "version": version_counter.value,
+                    "participants": [],
+                }
+            ),
+            prev_value=active_version.value,
+        )
+
+    def join_rendezvous(self, expected_version):
+        """Helper method for the join phase."""
+        # Use compare-and-swap to add self to rendezvous state:
+        while True:
+            cas_delay()
+            active_version, state = self.get_rdzv_state()
+
+            if state["status"] != "joinable":
+                raise EtcdRendezvousRetryableFailure(
+                    "Rendezvous state became non-joinable before we could join. "
+                    "Must join next one."
+                )
+
+            if state["version"] != expected_version:
+                raise EtcdRendezvousRetryImmediately(
+                    "Rendezvous version changed. Must try join the new one."
+                )
+
+            assert (
+                len(state["participants"]) < self._num_max_workers
+            ), "Logic error: joinable rendezvous should always have space left"
+
+            this_rank = len(state["participants"])
+            state["participants"].append(this_rank)
+
+            # When reaching min workers, or changing state to frozen, we'll set
+            # the active_version node to be ephemeral.
+            set_ttl: Optional[int] = None
+            if len(state["participants"]) == self._num_max_workers:
+                state["status"] = "frozen"
+                state["keep_alives"] = []
+                set_ttl = CONST_ETCD_FROZEN_TTL
+            elif len(state["participants"]) >= self._num_min_workers:
+                set_ttl = CONST_ETCD_JOINABLE_EPHEMERAL_TTL
+
+            try:
+                # Compare-and-swap.
+                active_version = self.client.test_and_set(
+                    key=self.get_path("/rdzv/active_version"),
+                    value=json.dumps(state),
+                    prev_value=active_version.value,
+                    ttl=set_ttl,
+                )
+                # We succeeded joining.
+                return active_version, this_rank
+
+            except etcd.EtcdCompareFailed:
+                log.info("Join rendezvous CAS unsuccessful, retrying")
+
+    def wait_for_peers(self, expected_version):
+        """Helper method for the join phase."""
+        active_version, state = self.get_rdzv_state()
+        while True:
+            if state["status"] == "frozen" and state["version"] == expected_version:
+                # Success, all peers arrived.
+                return active_version
+
+            elif state["status"] == "joinable" and state["version"] == expected_version:
+                # Continue waiting for any interesting events.
+                active_version, state = self.try_wait_for_state_change(
+                    etcd_index=active_version.etcd_index + 1
+                )
+
+            else:
+                # No valid transition possible at this point
+                raise EtcdRendezvousRetryableFailure(
+                    "Rendezvous state transition no longer possible. Must re-enter."
+                )
+
+    def confirm_membership(self, expected_version, this_rank):
+        """Helper method for the confirm phase."""
+        # Compare-and-swap loop
+        while True:
+            cas_delay()
+            active_version, state = self.get_rdzv_state()
+
+            if state["status"] != "frozen":
+                raise EtcdRendezvousRetryImmediately(
+                    "Rendezvous no longer frozen, before we confirmed. "
+                    "Must join next one"
+                )
+            if state["version"] != expected_version:
+                raise EtcdRendezvousRetryImmediately(
+                    "Rendezvous version changed. Must try join the new one."
+                )
+
+            this_lease_key = self.get_path(
+                f"/rdzv/v_{expected_version}/rank_{this_rank}"
+            )
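+            # Publish an ephemeral keep-alive node for this rank. Its lease is
+            # renewed by the thread started via setup_lease_renewal below;
+            # wait_for_rendezvous_to_free treats a missing node as a dead worker.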
+            self.client.set(this_lease_key, value=None, ttl=CONST_WORKER_KEEPALIVE_TTL)
+
+            state["keep_alives"].append(this_lease_key)
+            if len(state["keep_alives"]) == len(state["participants"]):
+                # Everyone confirmed (this rank is last to do so)
+                state["status"] = "final"
+                state["num_workers_waiting"] = 0
+                finalize = True
+            else:
+                finalize = False
+
+            try:
+                # Compare-and-swap. If new state is still frozen, keep it ephemeral.
+                active_version = self.client.test_and_set(
+                    key=self.get_path("/rdzv/active_version"),
+                    value=json.dumps(state),
+                    prev_value=active_version.value,
+                    ttl=None if finalize else CONST_ETCD_FROZEN_TTL,
+                )
+
+                self._lease_this_rank_stop = self.setup_lease_renewal(
+                    this_lease_key, ttl=CONST_WORKER_KEEPALIVE_TTL
+                )
+                return active_version
+
+            except etcd.EtcdCompareFailed:
+                log.info("Confirm membership CAS unsuccessful, retrying")
+
+    def wait_for_final(self, expected_version):
+        """Helper method for the confirm phase."""
+        active_version, state = self.get_rdzv_state()
+        while True:
+            if state["status"] == "final" and state["version"] == expected_version:
+                # Success. This rendezvous is final, and we accept it.
+                return active_version
+
+            elif state["status"] == "frozen" and state["version"] == expected_version:
+                # Continue waiting for any interesting events.
+                active_version, state = self.try_wait_for_state_change(
+                    etcd_index=active_version.etcd_index + 1
+                )
+
+            else:
+                # No valid transition possible at this point
+                raise EtcdRendezvousRetryableFailure(
+                    "Rendezvous state transition no longer possible. Must re-enter."
+                )
+
+    def announce_self_waiting(self, expected_version):
+        """
+        Announce this worker is waiting (via num_workers_waiting counter) to join next
+        rendezvous, but only if state and version match.
+        """
+        while True:
+            cas_delay()
+            active_version, state = self.get_rdzv_state()
+
+            if state["status"] != "final" or state["version"] != expected_version:
+                raise EtcdRendezvousRetryImmediately()
+
+            # Increment counter to signal an additional waiting worker.
+            state["num_workers_waiting"] += 1
+
+            try:
+                active_version = self.client.test_and_set(
+                    key=self.get_path("/rdzv/active_version"),
+                    value=json.dumps(state),
+                    prev_value=active_version.value,
+                )
+                return active_version
+
+            except etcd.EtcdCompareFailed:
+                log.info("Announce self as waiting CAS unsuccessful, retrying")
+
+    def wait_for_rendezvous_to_free(self, expected_version):
+        """
+        When there's an existing valid rendezvous in state 'final', we have to wait until the next opportunity to join.
+
+        Such opportunity may come from:
+
+        1. rendezvous state changed by someone else, in which case we unblock and retry.
+        2. rendezvous becomes invalid because at least one member failed to renew their
+           leased keep_alive node. We detect this, and destroy the rendezvous.
+        """
+        active_version, state = self.get_rdzv_state()
+        while True:
+            if state["status"] != "final" or state["version"] != expected_version:
+                return
+
+            # Check if current rendezvous state is valid, in the sense that all
+            # its members are alive (renewing their lease).
+            # If not, try destroy this rendezvous, so a new one can be created.
+            alive_members = self.client.get(
+                self.get_path(f"/rdzv/v_{expected_version}")
+            )
+            keep_alive_keys = [ch.key for ch in alive_members.children]
+
+            for key in state["keep_alives"]:
+                if key not in keep_alive_keys:
+                    # This participant didn't renew their lease. We'll declare this
+                    # rendezvous version as dead (but only if it hadn't changed)
+                    log.info("Keep-alive key %s is not renewed.", key)
+                    log.info(
+                        "Rendezvous version %s is incomplete. ",
+                        expected_version
+                    )
+                    log.info("Attempting to destroy it.")
+
+                    # Compare-and-delete operation. Throws if compare failed,
+                    # which means rendezvous was already destroyed/re-created/closed,
+                    # and we can try to re-enter the barrier.
+                    self.client.delete(
+                        key=self.get_path("/rdzv/active_version"),
+                        prevValue=active_version.value,
+                    )
+
+                    log.info(
+                        "Destroyed rendezvous version %s successfully.",
+                        expected_version
+                    )
+
+                    # We can return (and retry) immediately
+                    return
+
+            # Existing rendezvous seems valid, no reason to destroy it.
+            # We just have to wait until something changes and re-check.
+            try:
+                overall_timeout = (
+                    max(self._rendezvous_deadline - time.time(), 0.0) + 1.0
+                )
+                self.client.watch(
+                    key=self.get_path("/rdzv"),
+                    index=active_version.etcd_index + 1,
+                    recursive=True,
+                    timeout=overall_timeout,
+                )
+            except (etcd.EtcdEventIndexCleared, etcd.EtcdWatchTimedOut):
+                pass
+
+            if time.time() > self._rendezvous_deadline:
+                raise RendezvousTimeoutError()
+            active_version, state = self.get_rdzv_state()
+
+    def handle_join_last_call(self, expected_version, deadline):
+        """
+        After we reach min number of workers, one particular worker takes on the
+        responsibility of waiting an additional timeout before closing the join window.
+        If the worker responsible for this fails, the rendezvous will be destroyed due
+        to expiring TTL, and the other participants will re-rendezvous.
+
+        Here we expect to see state <joinable, expected_version>.
+        Exit gracefully if either:
+
+        1. state becomes <frozen, expected_version>
+        2. timeout happens (reaching deadline), in which case
+           we try the transition to <frozen, expected_version>
+
+        Exit with exception otherwise.
+        """
+        active_version, state = self.get_rdzv_state()
+        while True:
+            if state["status"] == "frozen" and state["version"] == expected_version:
+                # Worker set became frozen before last-call timeout. This is possible
+                # when num_max_workers is reached before the timeout.
+                return
+
+            if state["status"] != "joinable" or state["version"] != expected_version:
+                raise EtcdRendezvousRetryableFailure(
+                    "Rendezvous state transition no longer possible. Must re-enter."
+                )
+
+            # If timeout occurred, attempt a state transition (joinable -> frozen)
+            if time.time() >= deadline:
+                state["status"] = "frozen"
+                state["keep_alives"] = []
+                try:
+                    active_version = self.client.test_and_set(
+                        key=self.get_path("/rdzv/active_version"),
+                        value=json.dumps(state),
+                        prev_value=active_version.value,
+                        ttl=CONST_ETCD_FROZEN_TTL,
+                    )
+                    # We successfully made this rendezvous frozen.
+                    return
+                except etcd.EtcdCompareFailed:
+                    log.info("Join last-call transition CAS unsuccessful. Will retry")
+                    cas_delay()
+                    active_version, state = self.get_rdzv_state()
+                    continue
+
+            # Timeout did not occur, so we must refresh TTL, and wait for
+            # further changes. Note: we only want TTL to be refreshed if
+            # state is still joinable, hence we use CAS for that here,
+            # even though we don't change any of the data.
+            try:
+                active_version = self.client.test_and_set(
+                    key=self.get_path("/rdzv/active_version"),
+                    value=active_version.value,
+                    prev_value=active_version.value,
+                    ttl=CONST_ETCD_JOINABLE_EPHEMERAL_TTL,
+                )
+
+                # Minimize "oversleeping":
+                timeout = min(
+                    CONST_ETCD_JOINABLE_EPHEMERAL_TTL / 2,
+                    deadline - time.time() + 1.0,  # Oversleeping by 1s is ok.
+                )
+                active_version, state = self.try_wait_for_state_change(
+                    etcd_index=active_version.etcd_index + 1, timeout=timeout
+                )
+            except etcd.EtcdCompareFailed:
+                log.info("Join last-call TTL refresh CAS unsuccessful, will retry")
+                cas_delay()
+                active_version, state = self.get_rdzv_state()
+
+    def set_closed(self):
+        """
+        Mark rendezvous 'closed' for current run_id, which is used to signal other
+        participants to not attempt to perform (re-)rendezvous. This is useful
+        when one of the workers decides the job is complete.
+        """
+        while True:
+            active_version, state = self.get_rdzv_state()
+
+            if state["status"] == "closed":
+                # Already closed by someone else.
+                return
+
+            state["status"] = "closed"
+            try:
+                self.client.test_and_set(
+                    key=self.get_path("/rdzv/active_version"),
+                    value=json.dumps(state),
+                    prev_value=active_version.value,
+                )
+                return
+
+            except etcd.EtcdCompareFailed:
+                log.info("Set closed CAS unsuccessful, retrying")
+                cas_delay()
+
+    def get_rdzv_state(self):
+        active_version = self.client.get(key=self.get_path("/rdzv/active_version"))
+        return active_version, json.loads(active_version.value)
+
+    def try_wait_for_state_change(self, etcd_index, timeout=None):
+        # Don't sleep past the overall deadline (at least more than by 1s)
+        overall_timeout = max(self._rendezvous_deadline - time.time(), 0.0) + 1.0
+        timeout = overall_timeout if timeout is None else min(timeout, overall_timeout)
+
+        try:
+            self.client.watch(
+                self.get_path("/rdzv/active_version"), index=etcd_index, timeout=timeout
+            )
+        except (etcd.EtcdEventIndexCleared, etcd.EtcdWatchTimedOut):
+            pass
+
+        if time.time() > self._rendezvous_deadline:
+            raise RendezvousTimeoutError()
+
+        # Unfortunately, we have to do another fetch in order to get last etcd_index.
+        return self.get_rdzv_state()
+
+    def get_path(self, path):
+        if not path.startswith("/"):
+            path = "/" + path
+
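+        # e.g. get_path("/rdzv/active_version") -> "<prefix>run_<run_id>/rdzv/active_version"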
+        return f"{self._prefix}run_{self._run_id}{path}"
+
+    def create_path_if_not_exists(self, full_path, ttl=None):
+        try:
+            self.client.write(
+                key=full_path, value=None, dir=True, prevExist=False, ttl=ttl
+            )
+        except etcd.EtcdAlreadyExist:
+            pass
+
+    def setup_lease_renewal(self, full_path, ttl):
+        # NOTE: For ephemeral key TTL renewal (~lease) to work correctly,
+        # make sure you don't call any long-blocking methods that do not
+        # release the Python's GIL! An example of this is calling a pybind11
+        # extension function that is blocking / long-running, but is not
+        # doing a scoped release of the GIL.
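+        # The worker below refreshes the key's TTL roughly every ttl/2 seconds and
+        # exits once the key disappears or the returned stop event is set.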
+        def lease_worker(client, path, ttl, stop_event):
+            while True:
+                try:
+                    client.refresh(path, ttl=ttl)
+                except etcd.EtcdKeyNotFound:
+                    break
+                except ConnectionRefusedError:
+                    # This error usually occurs during tests, when the server has already been
+                    # terminated but the Python garbage collector has not yet invoked __del__.
+                    break
+
+                if stop_event.wait(timeout=ttl / 2):
+                    break
+
+        lease_stop_event = threading.Event()
+        lease_thread = threading.Thread(
+            target=lease_worker, args=(self.client, full_path, ttl, lease_stop_event)
+        )
+
+        lease_thread.daemon = True
+        lease_thread.start()
+
+        return lease_stop_event
+
+    def store_extra_data(self, rdzv_version, key, value):
+        node = self.get_path(f"/rdzv/v_{rdzv_version}/extra_data")
+        try:
+            # If this is the first time we are storing anything:
+            extra_data = self.client.write(
+                key=node, value=json.dumps({key: value}), prevExist=False
+            )
+            return
+        except etcd.EtcdAlreadyExist:
+            pass
+
+        # CAS loop, to make sure we don't lose concurrent stores.
+        while True:
+            # We never delete extra_data. Failure here should be fatal, no special handling.
+            extra_data = self.client.get(node)
+
+            new_extra_data_value = json.loads(extra_data.value)
+            new_extra_data_value[key] = value
+
+            try:
+                extra_data = self.client.test_and_set(
+                    key=node,
+                    value=json.dumps(new_extra_data_value),
+                    prev_value=extra_data.value,
+                )
+                return
+            except etcd.EtcdCompareFailed:
+                log.info("Store extra_data CAS unsuccessful, retrying")
+                time.sleep(0.1)
+
+    def load_extra_data(self, rdzv_version, key, timeout=None):
+        # 'extra_data' node itself, and the directory it is located in:
+        node = self.get_path(f"/rdzv/v_{rdzv_version}/extra_data")
+        node_dir = self.get_path(f"/rdzv/v_{rdzv_version}")
+
+        # TODO: implement timeout
+        # https://github.com/pytorch/elastic/issues/12
+        while True:
+            # Combined wait for the node itself, and the key inside it.
+            root = self.client.get(node_dir)
+
+            # Find the extra_data node, if it exists
+            extra_data = [n for n in root.children if n.key == node]
+            assert len(extra_data) <= 1
+
+            # Node for extra_data exists, check the desired key inside it.
+            if len(extra_data) == 1:
+                extra_data_dict = json.loads(extra_data[0].value)
+                if key in extra_data_dict:
+                    return extra_data_dict[key]
+
+            # The 'extra_data' node doesn't exist, or the key isn't published yet.
+            # Wait for interesting events on the extra_data node and retry.
+            try:
+                self.client.watch(node, index=root.etcd_index + 1)
+            except (etcd.EtcdEventIndexCleared, etcd.EtcdWatchTimedOut):
+                pass
+
+    def setup_kv_store(self, rdzv_version):
+        store_path = self.get_path(f"/rdzv/v_{rdzv_version}/kv")
+        self.create_path_if_not_exists(store_path)
+        return EtcdStore(etcd_client=self.client, etcd_store_prefix=store_path)
+
+
+def _create_etcd_client(params: RendezvousParameters) -> etcd.Client:
+    """Create a new ``etcd.Client`` from the specified ``RendezvousParameters``."""
+    hostname, port = parse_rendezvous_endpoint(params.endpoint, 2379)
+
+    # The communication protocol
+    protocol = params.config.get("protocol")
+    if protocol is None:
+        protocol = "http"
+    else:
+        if protocol != "http" and protocol != "https":
+            raise ValueError("The etcd protocol must be HTTP or HTTPS.")
+
+    # The SSL client certificate
+    ssl_cert = params.config.get("cert")
+    if ssl_cert is not None:
+        cert_key = params.config.get("key")
+        if cert_key is not None:
+            # The etcd client expects the certificate key as the second element
+            # of the `cert` tuple.
+            ssl_cert = (ssl_cert, cert_key)
+
+    # The root certificate
+    ca_cert = params.config.get("cacert")
+
+    return etcd.Client(
+        hostname,
+        port,
+        protocol=protocol,
+        cert=ssl_cert,
+        ca_cert=ca_cert,
+        allow_reconnect=True,
+    )
+
+
+# Handler for torch.distributed "static" registration
+def create_rdzv_handler(params: RendezvousParameters) -> RendezvousHandler:
+    """
+    Usage:
+
+    ::
+
+    rdzv_params = RendezvousParameters(
+                        backend="etcd",
+                        endpoint="192.168.0.42:2379",
+                        run_id="123",
+                        min_nodes=4,
+                        max_nodes=8,
+                        timeout=300,
+                        last_call_timeout=30,
+                        etcd_prefix="custom_prefix",
+                        protocol="https",
+                        cacert="/etc/kubernetes/certs/ca.crt",
+                        cert="/etc/kubernetes/certs/client.crt",
+                        key="/etc/kubernetes/certs/client.key")
+    # -- or --
+    rdzv_params = RendezvousParameters(
+                        backend="etcd",
+                        endpoint="192.168.0.42:2379",
+                        run_id="123",
+                        min_nodes=4,
+                        max_nodes=8)
+
+    etcd_rdzv_handler = create_rdzv_handler(rdzv_params)
+
+
+    Where:
+        run_id - unique id for this training job instance,
+        min_nodes - min number of workers expected to join the rendezvous,
+        max_nodes - max number of workers allowed to join the rendezvous,
+                        defaults to min_nodes if not specified.
+        timeout - total timeout within which next_rendezvous is expected to
+                      succeed; a RendezvousTimeoutError is raised otherwise.
+                      Default is 600 (10 minutes).
+        last_call_timeout - additional wait amount ("last call") after
+                            min number of workers has been reached.
+                            Defaults to 30 seconds.
+        etcd_prefix - path prefix (from etcd root), inside which all
+                      etcd nodes will be created.
+                      Default is "/torchelastic/p2p".
+        protocol - http (default) or https to access etcd.
+        cacert - CA cert to access etcd, only makes sense with https.
+        cert - client cert to access etcd, only makes sense with https.
+        key - client key to access etcd, only makes sense with https.
+    """
+    client = _create_etcd_client(params)
+
+    etcd_prefix = params.get("etcd_prefix", "/torchelastic/p2p")
+
+    rdzv = EtcdRendezvous(
+        client=client,
+        prefix=etcd_prefix,
+        run_id=params.run_id,
+        num_min_workers=params.min_nodes,
+        num_max_workers=params.max_nodes,
+        timeout=params.get_as_int("timeout", _DEFAULT_TIMEOUT),
+        last_call_timeout=params.get_as_int("last_call_timeout", _DEFAULT_LAST_CALL_TIMEOUT),
+    )
+    return EtcdRendezvousHandler(rdzv_impl=rdzv)
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/etcd_rendezvous_backend.py b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/etcd_rendezvous_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..40d1501bed8820c12d3c824c812e277e20a20c0b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/etcd_rendezvous_backend.py
@@ -0,0 +1,213 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import binascii
+from base64 import b64decode, b64encode
+from typing import Optional, Tuple, cast
+
+import urllib3.exceptions  # type: ignore[import]
+from etcd import Client as EtcdClient  # type: ignore[import]
+from etcd import (
+    EtcdAlreadyExist,
+    EtcdCompareFailed,
+    EtcdException,
+    EtcdKeyNotFound,
+    EtcdResult,
+)
+from torch.distributed import Store
+
+from .api import RendezvousConnectionError, RendezvousParameters, RendezvousStateError
+from .dynamic_rendezvous import RendezvousBackend, Token
+from .etcd_store import EtcdStore
+from .utils import parse_rendezvous_endpoint
+
+
+class EtcdRendezvousBackend(RendezvousBackend):
+    """Represents an etcd-based rendezvous backend.
+
+    Args:
+        client:
+            The ``etcd.Client`` instance to use to communicate with etcd.
+        run_id:
+            The run id of the rendezvous.
+        key_prefix:
+            The path under which to store the rendezvous state in etcd.
+        ttl:
+            The TTL of the rendezvous state. If not specified, defaults to two hours.
+    """
+
+    _DEFAULT_TTL = 7200  # 2 hours
+
+    _client: EtcdClient
+    _key: str
+    _ttl: int
+
+    def __init__(
+        self,
+        client: EtcdClient,
+        run_id: str,
+        key_prefix: Optional[str] = None,
+        ttl: Optional[int] = None,
+    ) -> None:
+        if not run_id:
+            raise ValueError("The run id must be a non-empty string.")
+
+        self._client = client
+
+        if key_prefix:
+            self._key = key_prefix + "/" + run_id
+        else:
+            self._key = run_id
+
+        if ttl and ttl > 0:
+            self._ttl = ttl
+        else:
+            self._ttl = self._DEFAULT_TTL
+
+    @property
+    def name(self) -> str:
+        """See base class."""
+        return "etcd-v2"
+
+    def get_state(self) -> Optional[Tuple[bytes, Token]]:
+        """See base class."""
+        try:
+            result = self._client.read(self._key)
+        except EtcdKeyNotFound:
+            return None
+        except (EtcdException, urllib3.exceptions.TimeoutError) as exc:
+            raise RendezvousConnectionError(
+                "The connection to etcd has failed. See inner exception for details."
+            ) from exc
+
+        return self._decode_state(result)
+
+    def set_state(
+        self, state: bytes, token: Optional[Token] = None
+    ) -> Optional[Tuple[bytes, Token, bool]]:
+        """See base class."""
+        base64_state = b64encode(state).decode()
+
+        kwargs = {}
+
+        def get_state():
+            result = self.get_state()
+            if result is not None:
+                tmp = *result, False
+                # Python 3.6 does not support tuple unpacking in return
+                # statements.
+                return tmp
+            return None
+
+        if token:
+            try:
+                token = int(token)
+            except ValueError:
+                return get_state()
+
+        if token:
+            kwargs["prevIndex"] = token
+        else:
+            kwargs["prevExist"] = False
+
+        try:
+            result = self._client.write(self._key, base64_state, self._ttl, **kwargs)
+        except (EtcdAlreadyExist, EtcdCompareFailed):
+            result = None
+        except (EtcdException, urllib3.exceptions.TimeoutError) as exc:
+            raise RendezvousConnectionError(
+                "The connection to etcd has failed. See inner exception for details."
+            ) from exc
+
+        if result is None:
+            return get_state()
+
+        tmp = *self._decode_state(result), True
+        return tmp
+
+    def _decode_state(self, result: EtcdResult) -> Tuple[bytes, Token]:
+        base64_state = result.value.encode()
+
+        try:
+            state = b64decode(base64_state)
+        except binascii.Error as exc:
+            raise RendezvousStateError(
+                "The state object is corrupt. See inner exception for details."
+            ) from exc
+
+        return state, result.modifiedIndex
+
+
+def _create_etcd_client(params: RendezvousParameters) -> EtcdClient:
+    host, port = parse_rendezvous_endpoint(params.endpoint, default_port=2379)
+
+    # The timeout
+    read_timeout = cast(int, params.get_as_int("read_timeout", 60))
+    if read_timeout <= 0:
+        raise ValueError("The read timeout must be a positive integer.")
+
+    # The communication protocol
+    protocol = params.get("protocol", "http").strip().lower()
+    if protocol != "http" and protocol != "https":
+        raise ValueError("The protocol must be HTTP or HTTPS.")
+
+    # The SSL client certificate
+    ssl_cert = params.get("ssl_cert")
+    if ssl_cert:
+        ssl_cert_key = params.get("ssl_cert_key")
+        if ssl_cert_key:
+            # The etcd client expects the certificate key as the second element
+            # of the `cert` tuple.
+            ssl_cert = (ssl_cert, ssl_cert_key)
+
+    # The root certificate
+    ca_cert = params.get("ca_cert")
+
+    try:
+        return EtcdClient(
+            host,
+            port,
+            read_timeout=read_timeout,
+            protocol=protocol,
+            cert=ssl_cert,
+            ca_cert=ca_cert,
+            allow_reconnect=True,
+        )
+    except (EtcdException, urllib3.exceptions.TimeoutError) as exc:
+        raise RendezvousConnectionError(
+            "The connection to etcd has failed. See inner exception for details."
+        ) from exc
+
+
+def create_backend(params: RendezvousParameters) -> Tuple[EtcdRendezvousBackend, Store]:
+    """Create a new :py:class:`EtcdRendezvousBackend` from the specified parameters.
+
+    +--------------+-----------------------------------------------------------+
+    | Parameter    | Description                                               |
+    +==============+===========================================================+
+    | read_timeout | The read timeout, in seconds, for etcd operations.        |
+    |              | Defaults to 60 seconds.                                   |
+    +--------------+-----------------------------------------------------------+
+    | protocol     | The protocol to use to communicate with etcd. Valid       |
+    |              | values are "http" and "https". Defaults to "http".        |
+    +--------------+-----------------------------------------------------------+
+    | ssl_cert     | The path to the SSL client certificate to use along with  |
+    |              | HTTPS. Defaults to ``None``.                              |
+    +--------------+-----------------------------------------------------------+
+    | ssl_cert_key | The path to the private key of the SSL client certificate |
+    |              | to use along with HTTPS. Defaults to ``None``.            |
+    +--------------+-----------------------------------------------------------+
+    | ca_cert      | The path to the root SSL authority certificate. Defaults  |
+    |              | to ``None``.                                              |
+    +--------------+-----------------------------------------------------------+
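+
+    Illustrative example (the endpoint and certificate paths are placeholders,
+    mirroring the handler docstring above)::
+
+        params = RendezvousParameters(
+            backend="etcd-v2",
+            endpoint="192.168.0.42:2379",
+            run_id="123",
+            min_nodes=4,
+            max_nodes=8,
+            protocol="https",
+            ssl_cert="/etc/kubernetes/certs/client.crt",
+            ssl_cert_key="/etc/kubernetes/certs/client.key",
+            ca_cert="/etc/kubernetes/certs/ca.crt",
+        )
+        backend, store = create_backend(params)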
+    """
+    client = _create_etcd_client(params)
+
+    backend = EtcdRendezvousBackend(client, params.run_id, key_prefix="/torch/elastic/rendezvous")
+
+    store = EtcdStore(client, "/torch/elastic/store")
+
+    return backend, store
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/etcd_server.py b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/etcd_server.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2cd5afb93a53eff8a0ba8537fe7ccfa8840585b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/etcd_server.py
@@ -0,0 +1,246 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import atexit
+import logging
+import os
+import shlex
+import shutil
+import socket
+import subprocess
+import tempfile
+import time
+from typing import Optional, TextIO, Union
+
+try:
+    import etcd  # type: ignore[import]
+except ModuleNotFoundError:
+    pass
+
+
+log = logging.getLogger(__name__)
+
+
+def find_free_port():
+    """
+    Find a free port and bind a temporary socket to it so that the port can be "reserved" until used.
+
+    .. note:: the returned socket must be closed before using the port,
+              otherwise an ``address already in use`` error will happen.
+              The socket should be held and closed as close to the
+              consumer of the port as possible since otherwise, there
+              is a greater chance of race-condition where a different
+              process may see the port as being free and take it.
+
+    Returns: a socket bound to the reserved free port
+
+    Usage::
+
+    sock = find_free_port()
+    port = sock.getsockname()[1]
+    sock.close()
+    use_port(port)
+    """
+    addrs = socket.getaddrinfo(
+        host="localhost", port=None, family=socket.AF_UNSPEC, type=socket.SOCK_STREAM
+    )
+
+    for addr in addrs:
+        family, type, proto, _, _ = addr
+        try:
+            s = socket.socket(family, type, proto)
+            s.bind(("localhost", 0))
+            s.listen(0)
+            return s
+        except OSError as e:
+            s.close()  # type: ignore[possibly-undefined]
+            print(f"Socket creation attempt failed: {e}")
+    raise RuntimeError("Failed to create a socket")
+
+
+def stop_etcd(subprocess, data_dir: Optional[str] = None):
+    if subprocess and subprocess.poll() is None:
+        log.info("stopping etcd server")
+        subprocess.terminate()
+        subprocess.wait()
+
+    if data_dir:
+        log.info("deleting etcd data dir: %s", data_dir)
+        shutil.rmtree(data_dir, ignore_errors=True)
+
+
+class EtcdServer:
+    """
+    .. note:: tested on etcd server v3.4.3.
+
+    Starts and stops a local standalone etcd server on a random free
+    port. Useful for single node, multi-worker launches or testing,
+    where a sidecar etcd server is more convenient than having to
+    separately set up an etcd server.
+
+    This class registers a termination handler to shut down the etcd
+    subprocess on exit. This termination handler is NOT a substitute for
+    calling the ``stop()`` method.
+
+    The following fallback mechanism is used to find the etcd binary:
+
+    1. Uses env var TORCHELASTIC_ETCD_BINARY_PATH
+    2. Uses ``<this file root>/bin/etcd`` if one exists
+    3. Uses ``etcd`` from ``PATH``
+
+    Usage
+    ::
+
+     server = EtcdServer("/usr/bin/etcd", 2379, "/tmp/default.etcd")
+     server.start()
+     client = server.get_client()
+     # use client
+     server.stop()
+
+    Args:
+        data_dir: optional base data directory for etcd; a temporary directory is created if not specified
+    """
+
+    def __init__(self, data_dir: Optional[str] = None):
+        self._port = -1
+        self._host = "localhost"
+
+        root = os.path.dirname(__file__)
+        default_etcd_bin = os.path.join(root, "bin/etcd")
+        self._etcd_binary_path = os.environ.get(
+            "TORCHELASTIC_ETCD_BINARY_PATH", default_etcd_bin
+        )
+        if not os.path.isfile(self._etcd_binary_path):
+            self._etcd_binary_path = "etcd"
+
+        self._base_data_dir = (
+            data_dir if data_dir else tempfile.mkdtemp(prefix="torchelastic_etcd_data")
+        )
+        self._etcd_cmd = None
+        self._etcd_proc: Optional[subprocess.Popen] = None
+
+    def _get_etcd_server_process(self) -> subprocess.Popen:
+        if not self._etcd_proc:
+            raise RuntimeError(
+                "No etcd server process started. Call etcd_server.start() first"
+            )
+        else:
+            return self._etcd_proc
+
+    def get_port(self) -> int:
+        """Return the port the server is running on."""
+        return self._port
+
+    def get_host(self) -> str:
+        """Return the host the server is running on."""
+        return self._host
+
+    def get_endpoint(self) -> str:
+        """Return the etcd server endpoint (host:port)."""
+        return f"{self._host}:{self._port}"
+
+    def start(
+        self,
+        timeout: int = 60,
+        num_retries: int = 3,
+        stderr: Union[int, TextIO, None] = None,
+    ) -> None:
+        """
+        Start the server and wait for it to be ready. When this function returns, the server is ready to take requests.
+
+        Args:
+            timeout: time (in seconds) to wait for the server to be ready
+                before giving up.
+            num_retries: number of retries to start the server. Each retry
+                will wait for at most ``timeout`` seconds before being considered failed.
+            stderr: the standard error file handle. Valid values are
+                `subprocess.PIPE`, `subprocess.DEVNULL`, an existing file
+                descriptor (a positive integer), an existing file object, and
+                `None`.
+
+        Raises:
+            TimeoutError: if the server is not ready within the specified timeout
+        """
+        curr_retries = 0
+        while True:
+            try:
+                data_dir = os.path.join(self._base_data_dir, str(curr_retries))
+                os.makedirs(data_dir, exist_ok=True)
+                self._start(data_dir, timeout, stderr)
+                # Ensure the etcd subprocess and its data dir are cleaned up on
+                # exit; register only after a successful start.
+                atexit.register(stop_etcd, self._etcd_proc, self._base_data_dir)
+                return
+            except Exception as e:
+                curr_retries += 1
+                stop_etcd(self._etcd_proc)
+                log.warning(
+                    "Failed to start etcd server, got error: %s, retrying", str(e)
+                )
+                if curr_retries >= num_retries:
+                    shutil.rmtree(self._base_data_dir, ignore_errors=True)
+                    raise
+
+    def _start(
+        self, data_dir: str, timeout: int = 60, stderr: Union[int, TextIO, None] = None
+    ) -> None:
+        sock = find_free_port()
+        sock_peer = find_free_port()
+        self._port = sock.getsockname()[1]
+        peer_port = sock_peer.getsockname()[1]
+
+        etcd_cmd = shlex.split(
+            " ".join(
+                [
+                    self._etcd_binary_path,
+                    "--enable-v2",
+                    "--data-dir",
+                    data_dir,
+                    "--listen-client-urls",
+                    f"http://{self._host}:{self._port}",
+                    "--advertise-client-urls",
+                    f"http://{self._host}:{self._port}",
+                    "--listen-peer-urls",
+                    f"http://{self._host}:{peer_port}",
+                ]
+            )
+        )
+
+        log.info("Starting etcd server: [%s]", etcd_cmd)
+
+        sock.close()
+        sock_peer.close()
+        self._etcd_proc = subprocess.Popen(etcd_cmd, close_fds=True, stderr=stderr)
+        self._wait_for_ready(timeout)
+
+    def get_client(self):
+        """Return an etcd client object that can be used to make requests to this server."""
+        return etcd.Client(
+            host=self._host, port=self._port, version_prefix="/v2", read_timeout=10
+        )
+
+    def _wait_for_ready(self, timeout: int = 60) -> None:
+        client = etcd.Client(
+            host=f"{self._host}", port=self._port, version_prefix="/v2", read_timeout=5
+        )
+        max_time = time.time() + timeout
+
+        while time.time() < max_time:
+            if self._get_etcd_server_process().poll() is not None:
+                # etcd server process finished
+                exitcode = self._get_etcd_server_process().returncode
+                raise RuntimeError(
+                    f"Etcd server process exited with the code: {exitcode}"
+                )
+            try:
+                log.info("etcd server ready. version: %s", client.version)
+                return
+            except Exception:
+                time.sleep(1)
+        raise TimeoutError("Timed out waiting for etcd server to be ready!")
+
+    def stop(self) -> None:
+        """Stop the server and cleans up auto generated resources (e.g. data dir)."""
+        log.info("EtcdServer stop method called")
+        stop_etcd(self._etcd_proc, self._base_data_dir)
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/etcd_store.py b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/etcd_store.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d34ac7cfa25a82f527c3b7b4cde45baa3542482
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/etcd_store.py
@@ -0,0 +1,204 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import datetime
+import random
+import time
+from base64 import b64decode, b64encode
+from typing import Optional
+
+import etcd  # type: ignore[import]
+
+# pyre-ignore[21]: Could not find name `Store` in `torch.distributed`.
+from torch.distributed import Store
+
+
+# Delay (sleep) for a small random amount to reduce CAS failures.
+# This does not affect correctness, but will reduce requests to etcd server.
+def cas_delay():
+    time.sleep(random.uniform(0, 0.1))
+
+
+# pyre-fixme[11]: Annotation `Store` is not defined as a type.
+class EtcdStore(Store):
+    """
+    Implement a c10 Store interface by piggybacking on the rendezvous etcd instance.
+
+    This is the store object returned by ``EtcdRendezvous``.
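+
+    Illustrative usage sketch (``client`` is assumed to be an ``etcd.Client``
+    connected to the rendezvous etcd instance)::
+
+        store = EtcdStore(etcd_client=client, etcd_store_prefix="/some/prefix")
+        store.set("key", b"value")
+        data = store.get("key")  # bytes; blocks until published or timeout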
+    """
+
+    def __init__(
+        self,
+        etcd_client,
+        etcd_store_prefix,
+        # Default timeout same as in c10d/Store.hpp
+        timeout: Optional[datetime.timedelta] = None,
+    ):
+        super().__init__()  # required for pybind trampoline.
+
+        self.client = etcd_client
+        self.prefix = etcd_store_prefix
+
+        if timeout is not None:
+            self.set_timeout(timeout)
+
+        if not self.prefix.endswith("/"):
+            self.prefix += "/"
+
+    def set(self, key, value):
+        """
+        Write a key/value pair into ``EtcdStore``.
+
+        Both key and value may be either Python ``str`` or ``bytes``.
+        """
+        self.client.set(key=self.prefix + self._encode(key), value=self._encode(value))
+
+    def get(self, key) -> bytes:
+        """
+        Get a value by key, possibly doing a blocking wait.
+
+        If the key is not immediately present, this does a blocking wait
+        for at most ``timeout`` duration or until the key is published.
+
+        Returns:
+            value ``(bytes)``
+
+        Raises:
+            LookupError - If the key is still not published after the timeout
+        """
+        b64_key = self.prefix + self._encode(key)
+        kvs = self._try_wait_get([b64_key])
+
+        if kvs is None:
+            raise LookupError(f"Key {key} not found in EtcdStore")
+
+        return self._decode(kvs[b64_key])
+
+    def add(self, key, num: int) -> int:
+        """
+        Atomically increment a value by an integer amount.
+
+        The integer is represented as a string using base 10. If key is not present,
+        a default value of ``0`` will be assumed.
+
+        Returns:
+             the new (incremented) value
+
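+        Example (illustrative)::
+
+            store.add("counter", 1)   # key absent: created as "1", returns 1
+            store.add("counter", 41)  # CAS-updated to "42", returns 42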
+        """
+        b64_key = self._encode(key)
+        # c10d Store assumes value is an integer represented as a decimal string
+        try:
+            # Assume default value "0", if this key didn't yet:
+            node = self.client.write(
+                key=self.prefix + b64_key,
+                value=self._encode(str(num)),  # i.e. 0 + num
+                prevExist=False,
+            )
+            return int(self._decode(node.value))
+        except etcd.EtcdAlreadyExist:
+            pass
+
+        while True:
+            # Note: c10d Store does not have a method to delete keys, so we
+            # can be sure it's still there.
+            node = self.client.get(key=self.prefix + b64_key)
+            new_value = self._encode(str(int(self._decode(node.value)) + num))
+            try:
+                node = self.client.test_and_set(
+                    key=node.key, value=new_value, prev_value=node.value
+                )
+                return int(self._decode(node.value))
+            except etcd.EtcdCompareFailed:
+                cas_delay()
+
+    def wait(self, keys, override_timeout: Optional[datetime.timedelta] = None):
+        """
+        Wait until all of the keys are published, or until timeout.
+
+        Raises:
+            LookupError - if timeout occurs
+        """
+        b64_keys = [self.prefix + self._encode(key) for key in keys]
+        kvs = self._try_wait_get(b64_keys, override_timeout)
+        if kvs is None:
+            raise LookupError("Timeout while waiting for keys in EtcdStore")
+        # No return value on success
+
+    def check(self, keys) -> bool:
+        """Check if all of the keys are immediately present (without waiting)."""
+        b64_keys = [self.prefix + self._encode(key) for key in keys]
+        kvs = self._try_wait_get(
+            b64_keys,
+            override_timeout=datetime.timedelta(microseconds=1),  # as if no wait
+        )
+        return kvs is not None
+
+    #
+    # Encode key/value data in base64, so we can store arbitrary binary data
+    # in EtcdStore. Input can be `str` or `bytes`.
+    # In case of `str`, utf-8 encoding is assumed.
+    #
+    def _encode(self, value) -> str:
+        if type(value) == bytes:
+            return b64encode(value).decode()
+        elif type(value) == str:
+            return b64encode(value.encode()).decode()
+        raise ValueError("Value must be of type str or bytes")
+
+    #
+    # Decode a base64 string (of type `str` or `bytes`).
+    # Return type is `bytes`, which is more convenient with the Store interface.
+    #
+    def _decode(self, value) -> bytes:
+        if type(value) == bytes:
+            return b64decode(value)
+        elif type(value) == str:
+            return b64decode(value.encode())
+        raise ValueError("Value must be of type str or bytes")
+
+    #
+    # Get all of the (base64-encoded) etcd keys at once, or wait until all the keys
+    # are published or timeout occurs.
+    # This is a helper method for the public interface methods.
+    #
+    # On success, a dictionary of {etcd key -> etcd value} is returned.
+    # On timeout, None is returned.
+    #
+    def _try_wait_get(self, b64_keys, override_timeout=None):
+        timeout = self.timeout if override_timeout is None else override_timeout  # type: ignore[attr-defined]
+        deadline = time.time() + timeout.total_seconds()
+
+        while True:
+            # Read whole directory (of keys), filter only the ones waited for
+            all_nodes = self.client.get(key=self.prefix)
+            req_nodes = {
+                node.key: node.value for node in all_nodes.children if node.key in b64_keys
+            }
+
+            if len(req_nodes) == len(b64_keys):
+                # All keys are available
+                return req_nodes
+
+            watch_timeout = deadline - time.time()
+            if watch_timeout <= 0:
+                return None
+
+            try:
+                self.client.watch(
+                    key=self.prefix,
+                    recursive=True,
+                    timeout=watch_timeout,
+                    index=all_nodes.etcd_index + 1,
+                )
+            except etcd.EtcdWatchTimedOut:
+                if time.time() >= deadline:
+                    return None
+                else:
+                    continue
+            except etcd.EtcdEventIndexCleared:
+                continue
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/registry.py b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d0cb8e421ef2508e1db799083bcbb36863631bb
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/registry.py
@@ -0,0 +1,66 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .api import RendezvousHandler, RendezvousParameters
+from .api import rendezvous_handler_registry as handler_registry
+from .dynamic_rendezvous import create_handler
+
+__all__ = ['get_rendezvous_handler']
+
+def _create_static_handler(params: RendezvousParameters) -> RendezvousHandler:
+    from . import static_tcp_rendezvous
+
+    return static_tcp_rendezvous.create_rdzv_handler(params)
+
+
+def _create_etcd_handler(params: RendezvousParameters) -> RendezvousHandler:
+    from . import etcd_rendezvous
+
+    return etcd_rendezvous.create_rdzv_handler(params)
+
+
+def _create_etcd_v2_handler(params: RendezvousParameters) -> RendezvousHandler:
+    from .etcd_rendezvous_backend import create_backend
+
+    backend, store = create_backend(params)
+
+    return create_handler(store, backend, params)
+
+
+def _create_c10d_handler(params: RendezvousParameters) -> RendezvousHandler:
+    from .c10d_rendezvous_backend import create_backend
+
+    backend, store = create_backend(params)
+
+    return create_handler(store, backend, params)
+
+
+def _register_default_handlers() -> None:
+    handler_registry.register("etcd", _create_etcd_handler)
+    handler_registry.register("etcd-v2", _create_etcd_v2_handler)
+    handler_registry.register("c10d", _create_c10d_handler)
+    handler_registry.register("static", _create_static_handler)
+
+
+def get_rendezvous_handler(params: RendezvousParameters) -> RendezvousHandler:
+    """
+    Obtain a reference to a :py:class:`RendezvousHandler`.
+
+    Custom rendezvous handlers can be registered by
+
+    ::
+
+      from torch.distributed.elastic.rendezvous import rendezvous_handler_registry
+      from torch.distributed.elastic.rendezvous.registry import get_rendezvous_handler
+
+      def create_my_rdzv(params: RendezvousParameters):
+        return MyCustomRdzv(params)
+
+      rendezvous_handler_registry.register("my_rdzv_backend_name", create_my_rdzv)
+
+      # my_rdzv_params is a RendezvousParameters instance with backend="my_rdzv_backend_name"
+      my_rdzv_handler = get_rendezvous_handler(my_rdzv_params)
+    """
+    return handler_registry.create_handler(params)
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/static_tcp_rendezvous.py b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/static_tcp_rendezvous.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab5f2523512e5b3ad696c4e4f538d69bdcabc34d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/static_tcp_rendezvous.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import datetime
+import logging
+from typing import Tuple, cast, Optional
+
+# pyre-ignore[21]: Could not find name `Store` in `torch.distributed`.
+from torch.distributed import Store, TCPStore, PrefixStore
+from torch.distributed.elastic.rendezvous import RendezvousHandler, RendezvousParameters
+from torch.distributed.elastic.rendezvous.utils import parse_rendezvous_endpoint
+
+log = logging.getLogger(__name__)
+
+_default_timeout_seconds = 600
+
+
+class StaticTCPRendezvous(RendezvousHandler):
+    """
+    Static rendezvous that is a wrapper around the TCPStore.
+
+    Creates TCPStore based on the input parameters with the
+    listener on the agent with group_rank=0
+    """
+
+    def __init__(
+        self,
+        master_addr: str,
+        master_port: int,
+        rank: int,
+        world_size: int,
+        run_id: str,
+        timeout: int,
+    ):
+        self.master_addr = master_addr
+        self.master_port = master_port
+        self.rank = rank
+        self.world_size = world_size
+        self.run_id = run_id
+        self.timeout = datetime.timedelta(seconds=timeout)
+        self._store: Optional[Store] = None
+
+    def get_backend(self) -> str:
+        return "static"
+
+    def next_rendezvous(self) -> Tuple[Store, int, int]:
+        log.info("Creating TCPStore as the c10d::Store implementation")
+        if not self._store:
+            is_master = self.rank == 0
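+            # Only the agent with group_rank 0 hosts the TCPStore listener;
+            # all other ranks connect to it as clients.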
+            self._store = TCPStore(  # type: ignore[call-arg]
+                self.master_addr,
+                self.master_port,
+                self.world_size,
+                is_master,
+                self.timeout,
+                multi_tenant=True,
+            )
+        store = PrefixStore(self.run_id, self._store)
+        return store, self.rank, self.world_size
+
+    def is_closed(self):
+        return False
+
+    def set_closed(self):
+        pass
+
+    def num_nodes_waiting(self):
+        return 0
+
+    def get_run_id(self) -> str:
+        return self.run_id
+
+    def shutdown(self) -> bool:
+        return True
+
+
+def create_rdzv_handler(params: RendezvousParameters) -> RendezvousHandler:
+    if "rank" not in params.config:
+        raise ValueError(
+            "rank is absent in RendezvousParameters."
+            "Try add --node-rank to the cmd request"
+        )
+    endpoint = params.endpoint.strip()
+    if not endpoint:
+        raise ValueError(
+            "endpoint is absent in RendezvousParameters"
+            "Try add --master-port and --master-addr to the cmd request"
+        )
+    master_addr, master_port = parse_rendezvous_endpoint(endpoint, -1)
+    if master_port == -1:
+        raise ValueError(
+            f"Port is absent in endpoint: {endpoint}. Try launching with --master-port"
+        )
+    world_size = params.max_nodes
+    rank = cast(int, params.config.get("rank"))
+    run_id = params.run_id
+    if "timeout" in params.config:
+        timeout = int(params.config["timeout"])
+    else:
+        timeout = _default_timeout_seconds
+    return StaticTCPRendezvous(
+        master_addr, master_port, rank, world_size, run_id, timeout
+    )
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/utils.py b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fde1a3380f3bd693f6add3127dec191a5c58e31
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/rendezvous/utils.py
@@ -0,0 +1,279 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import ipaddress
+import random
+import re
+import socket
+import time
+import weakref
+from datetime import timedelta
+from threading import Event, Thread
+from typing import Any, Callable, Dict, Optional, Tuple, Union
+
+__all__ = ['parse_rendezvous_endpoint']
+
+def _parse_rendezvous_config(config_str: str) -> Dict[str, str]:
+    """Extract key-value pairs from a rendezvous configuration string.
+
+    Args:
+        config_str:
+            A string in format <key1>=<value1>,...,<keyN>=<valueN>.
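+
+            For example (illustrative), "read_timeout=60,protocol=https" parses
+            to {"read_timeout": "60", "protocol": "https"}.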
+    """
+    config: Dict[str, str] = {}
+
+    config_str = config_str.strip()
+    if not config_str:
+        return config
+
+    key_values = config_str.split(",")
+    for kv in key_values:
+        key, *values = kv.split("=", 1)
+
+        key = key.strip()
+        if not key:
+            raise ValueError(
+                "The rendezvous configuration string must be in format "
+                "=,...,=."
+            )
+
+        value: Optional[str]
+        if values:
+            value = values[0].strip()
+        else:
+            value = None
+        if not value:
+            raise ValueError(
+                f"The rendezvous configuration option '{key}' must have a value specified."
+            )
+
+        config[key] = value
+    return config
+
+
+def _try_parse_port(port_str: str) -> Optional[int]:
+    """Try to extract the port number from ``port_str``."""
+    if port_str and re.match(r"^[0-9]{1,5}$", port_str):
+        return int(port_str)
+    return None
+
+
+def parse_rendezvous_endpoint(endpoint: Optional[str], default_port: int) -> Tuple[str, int]:
+    """Extract the hostname and the port number from a rendezvous endpoint.
+
+    Args:
+        endpoint:
+            A string in format <host>[:<port>].
+        default_port:
+            The port number to use if the endpoint does not include one.
+
+    Returns:
+        A tuple of hostname and port number.
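+
+    For example (illustrative), "192.168.0.42:2379" yields ("192.168.0.42", 2379),
+    while an empty endpoint yields ("localhost", default_port).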
+    """
+    if endpoint is not None:
+        endpoint = endpoint.strip()
+
+    if not endpoint:
+        return ("localhost", default_port)
+
+    # An endpoint that starts and ends with brackets represents an IPv6 address.
+    if endpoint[0] == "[" and endpoint[-1] == "]":
+        host, *rest = endpoint, *[]
+    else:
+        host, *rest = endpoint.rsplit(":", 1)
+
+    # Sanitize the IPv6 address.
+    if len(host) > 1 and host[0] == "[" and host[-1] == "]":
+        host = host[1:-1]
+
+    if len(rest) == 1:
+        port = _try_parse_port(rest[0])
+        if port is None or port >= 2 ** 16:
+            raise ValueError(
+                f"The port number of the rendezvous endpoint '{endpoint}' must be an integer "
+                "between 0 and 65536."
+            )
+    else:
+        port = default_port
+
+    if not re.match(r"^[\w\.:-]+$", host):
+        raise ValueError(
+            f"The hostname of the rendezvous endpoint '{endpoint}' must be a dot-separated list of "
+            "labels, an IPv4 address, or an IPv6 address."
+        )
+
+    return host, port
+
+
+def _matches_machine_hostname(host: str) -> bool:
+    """Indicate whether ``host`` matches the hostname of this machine.
+
+    This function compares ``host`` to the hostname as well as to the IP
+    addresses of this machine. Note that it may return a false negative if this
+    machine has CNAME records beyond its FQDN or IP addresses assigned to
+    secondary NICs.
+    """
+    if host == "localhost":
+        return True
+
+    try:
+        addr = ipaddress.ip_address(host)
+    except ValueError:
+        addr = None
+
+    if addr and addr.is_loopback:
+        return True
+
+    try:
+        host_addr_list = socket.getaddrinfo(
+            host, None, proto=socket.IPPROTO_TCP, flags=socket.AI_CANONNAME
+        )
+    except (ValueError, socket.gaierror) as _:
+        host_addr_list = []
+
+    host_ip_list = [
+        host_addr_info[4][0]
+        for host_addr_info in host_addr_list
+    ]
+
+    this_host = socket.gethostname()
+    if host == this_host:
+        return True
+
+    addr_list = socket.getaddrinfo(
+        this_host, None, proto=socket.IPPROTO_TCP, flags=socket.AI_CANONNAME
+    )
+    for addr_info in addr_list:
+        # If we have an FQDN in the addr_info, compare it to `host`.
+        if addr_info[3] and addr_info[3] == host:
+            return True
+
+        # Otherwise if `host` represents an IP address, compare it to our IP
+        # address.
+        if addr and addr_info[4][0] == str(addr):
+            return True
+
+        # If the IP address matches one of the provided host's IP addresses
+        if addr_info[4][0] in host_ip_list:
+            return True
+
+    return False
+
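+# Editor's note: illustrative behavior of the check above; the loopback and
+# own-hostname cases shown here are deterministic, other results depend on
+# this machine's DNS records:
+#
+#     >>> _matches_machine_hostname("localhost")
+#     True
+#     >>> _matches_machine_hostname("127.0.0.1")
+#     True
+#     >>> _matches_machine_hostname(socket.gethostname())
+#     True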
+
+def _delay(seconds: Union[float, Tuple[float, float]]) -> None:
+    """Suspend the current thread for ``seconds``.
+
+    Args:
+        seconds:
+            Either the delay, in seconds, or a tuple of a lower and an upper
+            bound within which a random delay will be picked.
+    """
+    if isinstance(seconds, tuple):
+        seconds = random.uniform(*seconds)
+    # Ignore delay requests that are less than 10 milliseconds.
+    if seconds >= 0.01:
+        time.sleep(seconds)
+
+
+class _PeriodicTimer:
+    """Represent a timer that periodically runs a specified function.
+
+    Args:
+        interval:
+            The interval, in seconds, between each run.
+        function:
+            The function to run.
+    """
+
+    # The state of the timer is held in a separate context object to avoid a
+    # reference cycle between the timer and the background thread.
+    class _Context:
+        interval: float
+        function: Callable[..., None]
+        args: Tuple[Any, ...]
+        kwargs: Dict[str, Any]
+        stop_event: Event
+
+    _name: Optional[str]
+    _thread: Optional[Thread]
+    _finalizer: Optional[weakref.finalize]
+
+    # The context that is shared between the timer and the background thread.
+    _ctx: _Context
+
+    def __init__(
+        self,
+        interval: timedelta,
+        function: Callable[..., None],
+        *args: Any,
+        **kwargs: Any,
+    ) -> None:
+        self._name = None
+
+        self._ctx = self._Context()
+        self._ctx.interval = interval.total_seconds()
+        self._ctx.function = function  # type: ignore[assignment]
+        self._ctx.args = args or ()
+        self._ctx.kwargs = kwargs or {}
+        self._ctx.stop_event = Event()
+
+        self._thread = None
+        self._finalizer = None
+
+    @property
+    def name(self) -> Optional[str]:
+        """Get the name of the timer."""
+        return self._name
+
+    def set_name(self, name: str) -> None:
+        """Set the name of the timer.
+
+        The specified name will be assigned to the background thread and serves
+        for debugging and troubleshooting purposes.
+        """
+        if self._thread:
+            raise RuntimeError("The timer has already started.")
+
+        self._name = name
+
+    def start(self) -> None:
+        """Start the timer."""
+        if self._thread:
+            raise RuntimeError("The timer has already started.")
+
+        self._thread = Thread(
+            target=self._run, name=self._name or "PeriodicTimer", args=(self._ctx,), daemon=True
+        )
+
+        # We avoid using a regular finalizer (a.k.a. __del__) for stopping the
+        # timer as joining a daemon thread during the interpreter shutdown can
+        # cause deadlocks. The weakref.finalize is a superior alternative that
+        # provides a consistent behavior regardless of the GC implementation.
+        self._finalizer = weakref.finalize(
+            self, self._stop_thread, self._thread, self._ctx.stop_event
+        )
+
+        # We do not attempt to stop our background thread during the interpreter
+        # shutdown. At that point we do not even know whether it still exists.
+        self._finalizer.atexit = False
+
+        self._thread.start()
+
+    def cancel(self) -> None:
+        """Stop the timer at the next opportunity."""
+        if self._finalizer:
+            self._finalizer()
+
+    @staticmethod
+    def _run(ctx) -> None:
+        while not ctx.stop_event.wait(ctx.interval):
+            ctx.function(*ctx.args, **ctx.kwargs)
+
+    @staticmethod
+    def _stop_thread(thread, stop_event):
+        stop_event.set()
+
+        thread.join()
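+
+# Editor's note: an illustrative sketch of how _PeriodicTimer is meant to be
+# driven (the keep-alive callback below is hypothetical, not upstream code):
+#
+#     def _send_keep_alive():
+#         ...  # e.g. refresh this node's entry in the rendezvous backend
+#
+#     timer = _PeriodicTimer(timedelta(seconds=5), _send_keep_alive)
+#     timer.set_name("RendezvousKeepAliveTimer")
+#     timer.start()
+#     ...
+#     timer.cancel()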
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/timer/__init__.py b/MLPY/Lib/site-packages/torch/distributed/elastic/timer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b170b8d4444a1a0825a5268dfe278451170843a3
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/timer/__init__.py
@@ -0,0 +1,44 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Expiration timers are set up in the same process as the agent and
+used from your script to deal with stuck workers. When you go into
+a code-block that has the potential to get stuck you can acquire
+an expiration timer, which instructs the timer server to kill the
+process if it does not release the timer by the self-imposed expiration
+deadline.
+
+Usage::
+
+    import torchelastic.timer as timer
+    import torchelastic.agent.server as agent
+
+    def main():
+        start_method = "spawn"
+        message_queue = mp.get_context(start_method).Queue()
+        server = timer.LocalTimerServer(message_queue, max_interval=0.01)
+        server.start() # non-blocking
+
+        spec = WorkerSpec(
+                    fn=trainer_func,
+                    args=(message_queue,),
+                    ...)
+        agent = agent.LocalElasticAgent(spec, start_method)
+        agent.run()
+
+    def trainer_func(message_queue):
+        timer.configure(timer.LocalTimerClient(message_queue))
+        with timer.expires(after=60): # 60 second expiry
+            # do some work
+
+In the example above if ``trainer_func`` takes more than 60 seconds to
+complete, then the worker process is killed and the agent retries the worker group.
+"""
+
+from .api import TimerClient, TimerRequest, TimerServer, configure, expires  # noqa: F401
+from .local_timer import LocalTimerClient, LocalTimerServer  # noqa: F401
+from .file_based_local_timer import FileTimerClient, FileTimerServer, FileTimerRequest  # noqa: F401
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/timer/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/timer/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2a71171a3bb7edce1e3b93e4d06b99a961886de7
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/timer/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/timer/__pycache__/api.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/timer/__pycache__/api.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8815e326a441a17653d9a1c6f1aef2975c2c103c
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/timer/__pycache__/api.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/timer/__pycache__/file_based_local_timer.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/timer/__pycache__/file_based_local_timer.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7e6b9851207520532eb2e8ef30832a5606a025fc
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/timer/__pycache__/file_based_local_timer.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/timer/__pycache__/local_timer.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/timer/__pycache__/local_timer.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1582f48c44d1a904ae4a92f675dc41f5caf5a70a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/timer/__pycache__/local_timer.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/timer/api.py b/MLPY/Lib/site-packages/torch/distributed/elastic/timer/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..50a430ffe0360f173d969feae4d4774dd82ffd26
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/timer/api.py
@@ -0,0 +1,280 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import abc
+import logging
+import threading
+import time
+from contextlib import contextmanager
+from inspect import getframeinfo, stack
+from typing import Any, Dict, List, Optional, Set
+
+__all__ = ['TimerRequest', 'TimerClient', 'RequestQueue', 'TimerServer', 'configure', 'expires']
+
+log = logging.getLogger(__name__)
+
+class TimerRequest:
+    """
+    Data object representing a countdown timer acquisition and release
+    that is used between the ``TimerClient`` and ``TimerServer``.
+    A negative ``expiration_time`` should be interpreted as a "release"
+    request.
+
+    .. note:: the type of ``worker_id`` is implementation specific.
+              It is whatever the TimerServer and TimerClient implementations
+              agree on to uniquely identify a worker.
+    """
+
+    __slots__ = ["worker_id", "scope_id", "expiration_time"]
+
+    def __init__(self, worker_id: Any, scope_id: str, expiration_time: float):
+        self.worker_id = worker_id
+        self.scope_id = scope_id
+        self.expiration_time = expiration_time
+
+    def __eq__(self, other):
+        if isinstance(other, TimerRequest):
+            return (
+                self.worker_id == other.worker_id
+                and self.scope_id == other.scope_id
+                and self.expiration_time == other.expiration_time
+            )
+        return False
+
+
+class TimerClient(abc.ABC):
+    """
+    Client library to acquire and release countdown timers by communicating
+    with the TimerServer.
+    """
+
+    @abc.abstractmethod
+    def acquire(self, scope_id: str, expiration_time: float) -> None:
+        """
+        Acquires a timer for the worker that holds this client object
+        given the scope_id and expiration_time. Typically registers
+        the timer with the TimerServer.
+        """
+        pass
+
+    @abc.abstractmethod
+    def release(self, scope_id: str):
+        """
+        Releases the timer for the ``scope_id`` on the worker this
+        client represents. After this method is
+        called, the countdown timer on the scope is no longer in effect.
+        """
+        pass
+
+
+class RequestQueue(abc.ABC):
+    """
+    Consumer queue holding timer acquisition/release requests
+    """
+
+    @abc.abstractmethod
+    def size(self) -> int:
+        """
+        Returns the size of the queue at the time this method is called.
+        Note that by the time ``get`` is called the size of the queue
+        may have increased. The size of the queue should not decrease
+        until the ``get`` method is called. That is, the following assertion
+        should hold:
+
+        size = q.size()
+        res = q.get(size, timeout=0)
+        assert size == len(res)
+
+        -- or --
+
+        size = q.size()
+        res = q.get(size * 2, timeout=1)
+        assert size <= len(res) <= size * 2
+        """
+        pass
+
+    @abc.abstractmethod
+    def get(self, size: int, timeout: float) -> List[TimerRequest]:
+        """
+        Gets up to ``size`` timer requests in a blocking fashion
+        (blocks for no more than ``timeout`` seconds).
+        """
+        pass
+
+
+class TimerServer(abc.ABC):
+    """
+    Entity that monitors active timers and expires them
+    in a timely fashion. This server is responsible for
+    reaping workers that have expired timers.
+    """
+
+    def __init__(
+        self, request_queue: RequestQueue, max_interval: float, daemon: bool = True
+    ):
+        """
+        :param request_queue: Consumer ``RequestQueue``
+        :param max_interval: max time (in seconds) to wait
+                             for an item in the request_queue
+        :param daemon: whether to run the watchdog thread as a daemon
+        """
+        super().__init__()
+        self._request_queue = request_queue
+        self._max_interval = max_interval
+        self._daemon = daemon
+        self._watchdog_thread: Optional[threading.Thread] = None
+        self._stop_signaled = False
+
+    @abc.abstractmethod
+    def register_timers(self, timer_requests: List[TimerRequest]) -> None:
+        """
+        Processes the incoming timer requests and registers them with the server.
+        The timer request can either be an acquire-timer or release-timer request.
+        Timer requests with a negative expiration_time should be interpreted
+        as a release-timer request.
+        """
+        pass
+
+    @abc.abstractmethod
+    def clear_timers(self, worker_ids: Set[Any]) -> None:
+        """
+        Clears all timers for the given ``worker_ids``.
+        """
+        pass
+
+    @abc.abstractmethod
+    def get_expired_timers(self, deadline: float) -> Dict[str, List[TimerRequest]]:
+        """
+        Returns all expired timers for each worker_id. An expired timer
+        is a timer for which the expiration_time is less than or equal to
+        the provided deadline.
+        """
+        pass
+
+    @abc.abstractmethod
+    def _reap_worker(self, worker_id: Any) -> bool:
+        """
+        Reaps the given worker. Returns True if the worker has been
+        successfully reaped, False otherwise. If any uncaught exception
+        is thrown from this method, the worker is considered reaped
+        and all associated timers will be removed.
+        """
+
+    def _reap_worker_no_throw(self, worker_id: Any) -> bool:
+        """
+        Wraps ``_reap_worker(worker_id)``; if an uncaught exception is
+        thrown, the worker is considered reaped.
+        """
+        try:
+            return self._reap_worker(worker_id)
+        except Exception:
+            log.exception(
+                "Uncaught exception thrown from _reap_worker(), "
+                "check that the implementation correctly catches exceptions",
+            )
+            return True
+
+    def _watchdog_loop(self):
+        while not self._stop_signaled:
+            try:
+                self._run_watchdog()
+            except Exception:
+                log.exception("Error running watchdog")
+
+    def _run_watchdog(self):
+        batch_size = max(1, self._request_queue.size())
+        timer_requests = self._request_queue.get(batch_size, self._max_interval)
+        self.register_timers(timer_requests)
+        now = time.time()
+        reaped_worker_ids = set()
+        for worker_id, expired_timers in self.get_expired_timers(now).items():
+            log.info(
+                "Reaping worker_id=[%s]."
+                " Expired timers: %s",
+                worker_id, self._get_scopes(expired_timers)
+            )
+            if self._reap_worker_no_throw(worker_id):
+                log.info("Successfully reaped worker=[%s]", worker_id)
+                reaped_worker_ids.add(worker_id)
+            else:
+                log.error(
+                    "Error reaping worker=[%s]. Will retry on next watchdog.", worker_id
+                )
+        self.clear_timers(reaped_worker_ids)
+
+    def _get_scopes(self, timer_requests):
+        return [r.scope_id for r in timer_requests]
+
+    def start(self) -> None:
+        log.info(
+            "Starting %s..."
+            " max_interval=%s,"
+            " daemon=%s",
+            type(self).__name__, self._max_interval, self._daemon
+        )
+        self._watchdog_thread = threading.Thread(
+            target=self._watchdog_loop, daemon=self._daemon
+        )
+        log.info("Starting watchdog thread...")
+        self._watchdog_thread.start()
+
+    def stop(self) -> None:
+        log.info("Stopping %s", type(self).__name__)
+        self._stop_signaled = True
+        if self._watchdog_thread:
+            log.info("Stopping watchdog thread...")
+            self._watchdog_thread.join(self._max_interval)
+            self._watchdog_thread = None
+        else:
+            log.info("No watchdog thread running, doing nothing")
+
+
+_timer_client: Optional[TimerClient] = None
+
+
+def configure(timer_client: TimerClient):
+    """
+    Configures a timer client. Must be called before using ``expires``.
+    """
+    global _timer_client
+    _timer_client = timer_client
+    log.info("Timer client configured to: %s", type(_timer_client).__name__)
+
+
+@contextmanager
+def expires(
+    after: float, scope: Optional[str] = None, client: Optional[TimerClient] = None
+):
+    """
+    Acquires a countdown timer that expires in ``after`` seconds from now,
+    unless the code-block that it wraps is finished within the timeframe.
+    When the timer expires, this worker is eligible to be reaped. The
+    exact meaning of "reaped" depends on the client implementation. In
+    most cases, reaping means to terminate the worker process.
+    Note that the worker is NOT guaranteed to be reaped at exactly
+    ``time.now() + after``, but rather the worker is "eligible" for being
+    reaped and the ``TimerServer`` that the client talks to will ultimately
+    make the decision when and how to reap the workers with expired timers.
+
+    Usage::
+
+        torch.distributed.elastic.timer.configure(LocalTimerClient())
+        with expires(after=10):
+            torch.distributed.all_reduce(...)
+    """
+    if client is None:
+        if _timer_client is None:
+            raise RuntimeError("Configure timer client before using countdown timers.")
+        client = _timer_client
+    if scope is None:
+        # grab the caller file + lineno
+        caller = getframeinfo(stack()[1][0])
+        scope = f"{caller.filename}#{caller.lineno}"
+    expiration = time.time() + after
+    client.acquire(scope, expiration)
+    try:
+        yield
+    finally:
+        client.release(scope)
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/timer/file_based_local_timer.py b/MLPY/Lib/site-packages/torch/distributed/elastic/timer/file_based_local_timer.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b299ece24162b3db1a34308eb339ad8a7a9c4c5
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/timer/file_based_local_timer.py
@@ -0,0 +1,333 @@
+# Copyright (c) Meta Platforms, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import io
+import json
+import logging
+import os
+import select
+import signal
+import sys
+import threading
+import time
+from typing import Callable, Dict, List, Optional, Set, Tuple
+
+from torch.distributed.elastic.timer.api import TimerClient, TimerRequest
+
+__all__ = ["FileTimerClient", "FileTimerRequest", "FileTimerServer"]
+
+log = logging.getLogger(__name__)
+
+class FileTimerRequest(TimerRequest):
+    """
+    Data object representing a countdown timer acquisition and release
+    that is used between the ``FileTimerClient`` and ``FileTimerServer``.
+    A negative ``expiration_time`` should be interpreted as a "release"
+    request.
+    ``signal`` is the signal to reap the worker process from the server
+    process.
+    """
+
+    __slots__ = ["version", "worker_pid", "scope_id", "expiration_time", "signal"]
+
+    def __init__(self, worker_pid: int, scope_id: str, expiration_time: float, signal: int = 0) -> None:
+        self.version = 1
+        self.worker_pid = worker_pid
+        self.scope_id = scope_id
+        self.expiration_time = expiration_time
+        self.signal = signal
+
+    def __eq__(self, other) -> bool:
+        if isinstance(other, FileTimerRequest):
+            return (
+                self.version == other.version
+                and self.worker_pid == other.worker_pid
+                and self.scope_id == other.scope_id
+                and self.expiration_time == other.expiration_time
+                and self.signal == other.signal
+            )
+        return False
+
+    def to_json(self) -> str:
+        return json.dumps(
+            {
+                "version": self.version,
+                "pid": self.worker_pid,
+                "scope_id": self.scope_id,
+                "expiration_time": self.expiration_time,
+                "signal": self.signal
+            },
+        )
+
+
+class FileTimerClient(TimerClient):
+    """
+    Client side of ``FileTimerServer``. This client is meant to be used
+    on the same host that the ``FileTimerServer`` is running on and uses
+    pid to uniquely identify a worker.
+    This client uses a named_pipe to send timer requests to the
+    ``FileTimerServer``. This client is a producer while the
+    ``FileTimerServer`` is a consumer. Multiple clients can work with
+    the same ``FileTimerServer``.
+
+    Args:
+
+        file_path: str, the path of a FIFO special file. ``FileTimerServer``
+                        must have created it by calling os.mkfifo().
+
+        signal: signal, the signal to use to kill the process. Using a
+                        negative or zero signal will not kill the process.
+    """
+    def __init__(self, file_path: str, signal=(signal.SIGKILL if sys.platform != "win32" else
+                                               signal.CTRL_C_EVENT)) -> None:  # type: ignore[attr-defined]
+        super().__init__()
+        self._file_path = file_path
+        self.signal = signal
+
+    def _open_non_blocking(self) -> Optional[io.TextIOWrapper]:
+        try:
+            fd = os.open(self._file_path, os.O_WRONLY | os.O_NONBLOCK)
+            return os.fdopen(fd, "wt")
+        except Exception:
+            return None
+
+    def _send_request(self, request: FileTimerRequest) -> None:
+        # The server may have crashed or may not have started yet.
+        # In that case, calling open() in blocking mode would block the client.
+        # To avoid this, open the pipe in non-blocking mode; the open fails
+        # immediately (and a BrokenPipeError is raised below) if the server is not there.
+        file = self._open_non_blocking()
+        if file is None:
+            raise BrokenPipeError("Could not send the FileTimerRequest because FileTimerServer is not available.")
+        with file:
+            json_request = request.to_json()
+            # Writes of no more than select.PIPE_BUF bytes are guaranteed to be atomic.
+            if len(json_request) > select.PIPE_BUF:
+                raise RuntimeError(
+                    f"FileTimerRequest larger than {select.PIPE_BUF} bytes "
+                    f"is not supported: {json_request}"
+                )
+            file.write(json_request + "\n")
+
+    def acquire(self, scope_id: str, expiration_time: float) -> None:
+        self._send_request(
+            request=FileTimerRequest(
+                worker_pid=os.getpid(),
+                scope_id=scope_id,
+                expiration_time=expiration_time,
+                signal=self.signal
+            ),
+        )
+
+    def release(self, scope_id: str) -> None:
+        self._send_request(
+            request=FileTimerRequest(
+                worker_pid=os.getpid(),
+                scope_id=scope_id,
+                expiration_time=-1,
+                signal=0
+            ),
+        )
+
+
+class FileTimerServer:
+    """
+    Server that works with ``FileTimerClient``. Clients are expected to be
+    running on the same host as the process that is running this server.
+    Each host in the job is expected to start its own timer server locally
+    and each server instance manages timers for local workers (running on
+    processes on the same host).
+
+    Args:
+
+        file_path: str, the path of a FIFO special file to be created.
+
+        max_interval: float, max interval in seconds for each watchdog loop.
+
+        daemon: bool, running the watchdog thread in daemon mode or not.
+                      A daemon thread will not prevent the process from stopping.
+        log_event: Callable[[Dict[str, str]], None], an optional callback for
+                logging the events in JSON format.
+    """
+
+    def __init__(
+        self,
+        file_path: str,
+        max_interval: float = 10,
+        daemon: bool = True,
+        log_event: Optional[Callable[[str, Optional[FileTimerRequest]], None]] = None
+    ) -> None:
+        self._file_path = file_path
+        self._max_interval = max_interval
+        self._daemon = daemon
+        self._timers: Dict[Tuple[int, str], FileTimerRequest] = {}
+        self._stop_signaled = False
+        self._watchdog_thread: Optional[threading.Thread] = None
+        if os.path.exists(self._file_path):
+            os.remove(self._file_path)
+        os.mkfifo(self._file_path)
+        # For test only. Count the number of requests received.
+        self._request_count = 0
+        # For test only. Process all requests and stop the server.
+        self._run_once = False
+        self._log_event = log_event if log_event is not None else lambda name, request: None
+
+
+    def start(self) -> None:
+        log.info(
+            "Starting %s..."
+            " max_interval=%s,"
+            " daemon=%s",
+            type(self).__name__, self._max_interval, self._daemon
+        )
+        self._watchdog_thread = threading.Thread(target=self._watchdog_loop, daemon=self._daemon)
+        log.info("Starting watchdog thread...")
+        self._watchdog_thread.start()
+        self._log_event("watchdog started", None)
+
+    def stop(self) -> None:
+        log.info("Stopping %s", type(self).__name__)
+        self._stop_signaled = True
+        if self._watchdog_thread:
+            log.info("Stopping watchdog thread...")
+            self._watchdog_thread.join(self._max_interval)
+            self._watchdog_thread = None
+        else:
+            log.info("No watchdog thread running, doing nothing")
+        if os.path.exists(self._file_path):
+            os.remove(self._file_path)
+        self._log_event("watchdog stopped", None)
+
+    def run_once(self) -> None:
+        self._run_once = True
+        if self._watchdog_thread:
+            log.info("Stopping watchdog thread...")
+            self._watchdog_thread.join()
+            self._watchdog_thread = None
+        else:
+            log.info("No watchdog thread running, doing nothing")
+        if os.path.exists(self._file_path):
+            os.remove(self._file_path)
+
+    def _watchdog_loop(self) -> None:
+        # Opening the pipe in blocking mode blocks the server thread until a
+        # client connects. This is fine for the following reasons:
+        #  1. The no-client case usually does not happen.
+        #  2. The watchdog loop runs in a separate daemon thread, which will
+        #     not prevent the process from stopping.
+        with open(self._file_path) as fd:
+            while not self._stop_signaled:
+                try:
+                    run_once = self._run_once
+                    self._run_watchdog(fd)
+                    if run_once:
+                        break
+                except Exception:
+                    log.exception("Error running watchdog")
+
+    def _run_watchdog(self, fd: io.TextIOWrapper) -> None:
+        timer_requests = self._get_requests(fd, self._max_interval)
+        self.register_timers(timer_requests)
+        now = time.time()
+        reaped_worker_pids = set()
+        for worker_pid, expired_timers in self.get_expired_timers(now).items():
+            log.info("Reaping worker_pid=[%s]. Expired timers: %s", worker_pid, self._get_scopes(expired_timers))
+            reaped_worker_pids.add(worker_pid)
+            # In case we have multiple expired timers, we find the first timer
+            # with a valid signal (>0) in the expiration time order.
+            expired_timers.sort(key=lambda timer: timer.expiration_time)
+            signal = 0
+            expired_timer = None
+            for timer in expired_timers:
+                self._log_event("timer expired", timer)
+                if timer.signal > 0:
+                    signal = timer.signal
+                    expired_timer = timer
+                    break
+            if signal <= 0:
+                log.info("No signal specified with worker=[%s]. Do not reap it.", worker_pid)
+                continue
+            if self._reap_worker(worker_pid, signal):
+                log.info("Successfully reaped worker=[%s] with signal=%s", worker_pid, signal)
+                self._log_event("kill worker process", expired_timer)
+            else:
+                log.error("Error reaping worker=[%s]. Will retry on next watchdog.", worker_pid)
+        self.clear_timers(reaped_worker_pids)
+
+    def _get_scopes(self, timer_requests: List[FileTimerRequest]) -> List[str]:
+        return [r.scope_id for r in timer_requests]
+
+    def _get_requests(self, fd: io.TextIOWrapper, max_interval: float) -> List[FileTimerRequest]:
+        start = time.time()
+        requests = []
+        while not self._stop_signaled or self._run_once:
+            # For a named pipe, readline() blocks while at least one writer has
+            # the pipe open, and returns only after the writer calls flush()
+            # (flush() is called automatically inside close()).
+            # After the last writer closes the pipe, readline() no longer blocks
+            # and returns an empty string at end-of-file.
+            # Since the client always opens the pipe, writes a message, and closes
+            # it immediately, the readline() call below does not block for long.
+            json_request = fd.readline()
+            if len(json_request) == 0:
+                if self._run_once:
+                    break
+                time.sleep(min(max_interval, 1))
+            else:
+                request = json.loads(json_request)
+                pid = request["pid"]
+                scope_id = request["scope_id"]
+                expiration_time = request["expiration_time"]
+                signal = request["signal"]
+                requests.append(
+                    FileTimerRequest(
+                        worker_pid=pid, scope_id=scope_id, expiration_time=expiration_time, signal=signal
+                    )
+                )
+            now = time.time()
+            if now - start > max_interval:
+                break
+        return requests
+
+    def register_timers(self, timer_requests: List[FileTimerRequest]) -> None:
+        for request in timer_requests:
+            pid = request.worker_pid
+            scope_id = request.scope_id
+            expiration_time = request.expiration_time
+            self._request_count += 1
+
+            key = (pid, scope_id)
+            # negative expiration is a proxy for a release call
+            if expiration_time < 0:
+                if key in self._timers:
+                    del self._timers[key]
+            else:
+                self._timers[key] = request
+
+    def clear_timers(self, worker_pids: Set[int]) -> None:
+        for (pid, scope_id) in list(self._timers.keys()):
+            if pid in worker_pids:
+                del self._timers[(pid, scope_id)]
+
+    def get_expired_timers(self, deadline: float) -> Dict[int, List[FileTimerRequest]]:
+        # pid -> [timer_requests...]
+        expired_timers: Dict[int, List[FileTimerRequest]] = {}
+        for request in self._timers.values():
+            if request.expiration_time <= deadline:
+                expired_scopes = expired_timers.setdefault(request.worker_pid, [])
+                expired_scopes.append(request)
+        return expired_timers
+
+    def _reap_worker(self, worker_pid: int, signal: int) -> bool:
+        try:
+            os.kill(worker_pid, signal)
+            return True
+        except ProcessLookupError:
+            log.info("Process with pid=%s does not exist. Skipping", worker_pid)
+            return True
+        except Exception:
+            log.exception("Error terminating pid=%s", worker_pid)
+        return False
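+
+# Editor's note: an illustrative wiring sketch for the file-based timer (the
+# pipe path and scope name are hypothetical; workers may also use the
+# higher-level torch.distributed.elastic.timer.configure/expires API):
+#
+#     server = FileTimerServer("/tmp/elastic_watchdog_pipe", max_interval=0.1)
+#     server.start()  # non-blocking; the watchdog runs in a daemon thread
+#
+#     # in each worker process on the same host:
+#     client = FileTimerClient("/tmp/elastic_watchdog_pipe")
+#     client.acquire("train_step", expiration_time=time.time() + 60)
+#     ...  # work that might hang
+#     client.release("train_step")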
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/timer/local_timer.py b/MLPY/Lib/site-packages/torch/distributed/elastic/timer/local_timer.py
new file mode 100644
index 0000000000000000000000000000000000000000..62af765ec8ace4aa55ecae9b80f3dccfb7fbbf31
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/timer/local_timer.py
@@ -0,0 +1,125 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import logging
+import multiprocessing as mp
+import os
+import signal
+import time
+from queue import Empty
+from typing import Any, Dict, List, Set, Tuple
+
+from .api import RequestQueue, TimerClient, TimerRequest, TimerServer
+
+__all__ = ['LocalTimerClient', 'MultiprocessingRequestQueue', 'LocalTimerServer']
+
+log = logging.getLogger(__name__)
+
+class LocalTimerClient(TimerClient):
+    """
+    Client side of ``LocalTimerServer``. This client is meant to be used
+    on the same host that the ``LocalTimerServer`` is running on and uses
+    pid to uniquely identify a worker. This is particularly useful in situations
+    where one spawns a subprocess (trainer) per GPU on a host with multiple
+    GPU devices.
+    """
+
+    def __init__(self, mp_queue):
+        super().__init__()
+        self._mp_queue = mp_queue
+
+    def acquire(self, scope_id, expiration_time):
+        pid = os.getpid()
+        acquire_request = TimerRequest(pid, scope_id, expiration_time)
+        self._mp_queue.put(acquire_request)
+
+    def release(self, scope_id):
+        pid = os.getpid()
+        release_request = TimerRequest(pid, scope_id, -1)
+        self._mp_queue.put(release_request)
+
+
+class MultiprocessingRequestQueue(RequestQueue):
+    """
+    A ``RequestQueue`` backed by python ``multiprocessing.Queue``
+    """
+
+    def __init__(self, mp_queue: mp.Queue):
+        super().__init__()
+        self._mp_queue = mp_queue
+
+    def size(self) -> int:
+        return self._mp_queue.qsize()
+
+    def get(self, size, timeout: float) -> List[TimerRequest]:
+        requests = []
+        wait = timeout
+        for _ in range(0, size):
+            start = time.time()
+
+            try:
+                r = self._mp_queue.get(block=True, timeout=wait)
+            except Empty:
+                break
+
+            requests.append(r)
+            wait = wait - (time.time() - start)
+            if wait <= 0:
+                break
+
+        return requests
+
+
+class LocalTimerServer(TimerServer):
+    """
+    Server that works with ``LocalTimerClient``. Clients are expected to be
+    subprocesses to the parent process that is running this server. Each host
+    in the job is expected to start its own timer server locally and each
+    server instance manages timers for local workers (running on processes
+    on the same host).
+    """
+
+    def __init__(
+        self, mp_queue: mp.Queue, max_interval: float = 60, daemon: bool = True
+    ):
+        super().__init__(MultiprocessingRequestQueue(mp_queue), max_interval, daemon)
+        self._timers: Dict[Tuple[Any, str], TimerRequest] = {}
+
+    def register_timers(self, timer_requests: List[TimerRequest]) -> None:
+        for request in timer_requests:
+            pid = request.worker_id
+            scope_id = request.scope_id
+            expiration_time = request.expiration_time
+
+            # negative expiration is a proxy for a release call
+            if expiration_time < 0:
+                self._timers.pop((pid, scope_id), None)
+            else:
+                self._timers[(pid, scope_id)] = request
+
+    def clear_timers(self, worker_ids: Set[int]) -> None:
+        for (pid, scope_id) in list(self._timers.keys()):
+            if pid in worker_ids:
+                self._timers.pop((pid, scope_id))
+
+    def get_expired_timers(self, deadline: float) -> Dict[Any, List[TimerRequest]]:
+        # pid -> [timer_requests...]
+        expired_timers: Dict[Any, List[TimerRequest]] = {}
+        for request in self._timers.values():
+            if request.expiration_time <= deadline:
+                expired_scopes = expired_timers.setdefault(request.worker_id, [])
+                expired_scopes.append(request)
+        return expired_timers
+
+    def _reap_worker(self, worker_id: int) -> bool:
+        try:
+            os.kill(worker_id, signal.SIGKILL)
+            return True
+        except ProcessLookupError:
+            log.info("Process with pid=%s does not exist. Skipping", worker_id)
+            return True
+        except Exception:
+            log.exception("Error terminating pid=%s", worker_id)
+        return False
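+
+# Editor's note: an illustrative sketch pairing the server with workers spawned
+# via multiprocessing (start method, interval, and scope name are hypothetical):
+#
+#     mp_queue = mp.get_context("spawn").Queue()
+#     server = LocalTimerServer(mp_queue, max_interval=0.01)
+#     server.start()
+#
+#     # inside each spawned worker process:
+#     client = LocalTimerClient(mp_queue)
+#     client.acquire("train_step", expiration_time=time.time() + 60)
+#     ...  # work that might hang
+#     client.release("train_step")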
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/utils/__init__.py b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fbc76bf70244c273d84c617a96dfc9827f1ae70
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/__init__.py
@@ -0,0 +1,9 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .api import get_env_variable_or_raise, get_socket_with_port, macros  # noqa: F401
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/utils/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fad988cec23a9d1d565b511985cac2f9c076ca6b
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/utils/__pycache__/api.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/__pycache__/api.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8231ff3fa75b555dc5e78d94dfd336a5a0cbf03a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/__pycache__/api.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/utils/__pycache__/distributed.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/__pycache__/distributed.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2ea7d950cc51e3c59a62ca5ed2debaa5e423fbed
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/__pycache__/distributed.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/utils/__pycache__/log_level.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/__pycache__/log_level.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d7d93462f9197319f2deebb7591b949c966f0a2a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/__pycache__/log_level.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/utils/__pycache__/logging.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/__pycache__/logging.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3e56b9ae8b3bff23486e35ceca524d9998ada068
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/__pycache__/logging.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/utils/__pycache__/store.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/__pycache__/store.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..af37705e285c636c5a7e968aa39ee2e7bc5ebcb1
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/__pycache__/store.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/utils/api.py b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8f48da746fa16a6b4ef41ee276a6931696ce0aa
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/api.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import socket
+from string import Template
+from typing import List, Any
+
+
+def get_env_variable_or_raise(env_name: str) -> str:
+    r"""
+    Tries to retrieve environment variable. Raises ``ValueError``
+    if no environment variable found.
+
+    Args:
+        env_name (str): Name of the env variable
+    """
+    value = os.environ.get(env_name, None)
+    if value is None:
+        msg = f"Environment variable {env_name} expected, but not set"
+        raise ValueError(msg)
+    return value
+
+
+def get_socket_with_port() -> socket.socket:
+    addrs = socket.getaddrinfo(
+        host="localhost", port=None, family=socket.AF_UNSPEC, type=socket.SOCK_STREAM
+    )
+    for addr in addrs:
+        family, type, proto, _, _ = addr
+        s = socket.socket(family, type, proto)
+        try:
+            s.bind(("localhost", 0))
+            s.listen(0)
+            return s
+        except OSError as e:
+            s.close()
+    raise RuntimeError("Failed to create a socket")
+
+
+class macros:
+    """
+    Defines simple macros for caffe2.distributed.launch cmd args substitution
+    """
+
+    local_rank = "${local_rank}"
+
+    @staticmethod
+    def substitute(args: List[Any], local_rank: str) -> List[str]:
+        args_sub = []
+        for arg in args:
+            if isinstance(arg, str):
+                sub = Template(arg).safe_substitute(local_rank=local_rank)
+                args_sub.append(sub)
+            else:
+                args_sub.append(arg)
+        return args_sub
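+
+# Editor's note: an illustrative example of macro substitution (the argument
+# list below is hypothetical):
+#
+#     >>> macros.substitute(["--local_rank", macros.local_rank, 42], local_rank="3")
+#     ['--local_rank', '3', 42]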
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/utils/data/__init__.py b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..73fd6cdd4431a77cc1cb7ae49efc92cedebfab2e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/data/__init__.py
@@ -0,0 +1,10 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .cycling_iterator import CyclingIterator  # noqa: F401
+from .elastic_distributed_sampler import ElasticDistributedSampler  # noqa: F401
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/utils/data/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/data/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d1fd34eddc7eb3ca0fe5faa753092ce2bd289814
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/data/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/utils/data/__pycache__/cycling_iterator.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/data/__pycache__/cycling_iterator.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7b9362074c9dadf5af2c84e632b07f24e7029b5d
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/data/__pycache__/cycling_iterator.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/utils/data/__pycache__/elastic_distributed_sampler.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/data/__pycache__/elastic_distributed_sampler.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b88ee6bf8e5a55341bd04dc4c216d54409e429f0
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/data/__pycache__/elastic_distributed_sampler.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/utils/data/cycling_iterator.py b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/data/cycling_iterator.py
new file mode 100644
index 0000000000000000000000000000000000000000..526d629cdec61093708aae90f9a0f7a9af257b1d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/data/cycling_iterator.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+class CyclingIterator:
+    """
+    An iterator decorator that cycles through the
+    underlying iterator "n" times. Useful to "unroll"
+    the dataset across multiple training epochs.
+
+    The generator function is called as ``generator_fn(epoch)``
+    to obtain the underlying iterator, where ``epoch`` is a
+    number less than ``n`` identifying the current cycle.
+
+    For example if ``generator_fn`` always returns ``[1,2,3]``
+    then ``CyclingIterator(n=2, generator_fn)`` will iterate through
+    ``[1,2,3,1,2,3]``
+    """
+
+    def __init__(self, n: int, generator_fn, start_epoch=0):
+        self._n = n
+        self._epoch = start_epoch
+        self._generator_fn = generator_fn
+        self._iter = generator_fn(self._epoch)
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        try:
+            return next(self._iter)
+        except StopIteration as eod:  # eod == end of data
+            if self._epoch < self._n - 1:
+                self._epoch += 1
+                self._iter = self._generator_fn(self._epoch)
+                return self.__next__()
+            else:
+                raise eod
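+
+# Editor's note: an illustrative example (the generator function below is
+# hypothetical):
+#
+#     >>> it = CyclingIterator(n=2, generator_fn=lambda epoch: iter([1, 2, 3]))
+#     >>> list(it)
+#     [1, 2, 3, 1, 2, 3]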
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/utils/data/elastic_distributed_sampler.py b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/data/elastic_distributed_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..d211dfabfbf78f9b4f5b210228c4dded497e472e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/data/elastic_distributed_sampler.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import torch
+from torch.utils.data.distributed import DistributedSampler
+
+
+class ElasticDistributedSampler(DistributedSampler):
+    """
+    Sampler that restricts data loading to a subset of
+    the dataset for elastic training.
+
+    It is especially useful in conjunction with
+    :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each
+    process can pass a DistributedSampler instance as a DataLoader sampler,
+    and load a subset of the original dataset that is exclusive to it.
+
+    .. note::
+        Dataset is assumed to be of constant size.
+
+    Args:
+        dataset: Dataset used for sampling.
+        num_replicas (optional): Number of processes participating in
+            distributed training.
+        rank (optional): Rank of the current process within num_replicas.
+        start_index (optional):  Which index of the dataset to start sampling from
+    """
+
+    def __init__(self, dataset, num_replicas=None, rank=None, start_index=0):
+        super().__init__(dataset=dataset, num_replicas=num_replicas, rank=rank)
+        if start_index >= len(dataset):
+            raise ValueError(
+                f"Start index {start_index} should be less than dataset size {len(dataset)}"
+            )
+
+        self.start_index = start_index
+        self.num_samples = int(
+            math.ceil(float(len(self.dataset) - self.start_index) / self.num_replicas)  # type: ignore[arg-type]
+        )
+        self.total_size = self.num_samples * self.num_replicas
+
+    def __iter__(self):
+        # deterministically shuffle based on epoch
+        g = torch.Generator()
+        g.manual_seed(self.epoch)
+        indices = (
+            torch.randperm(len(self.dataset) - self.start_index, generator=g)  # type: ignore[arg-type]
+            .add(self.start_index)
+            .tolist()
+        )
+
+        # add extra samples to make it evenly divisible
+        indices += indices[: (self.total_size - len(indices))]
+        assert len(indices) == self.total_size
+
+        # subsample
+        indices = indices[self.rank : self.total_size : self.num_replicas]
+        assert len(indices) == self.num_samples
+
+        return iter(indices)
+
+    def __len__(self):
+        return self.num_samples
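+
+# Editor's note: an illustrative sketch of resuming mid-epoch with the sampler
+# above (dataset, batch size, and resume index are hypothetical; assumes the
+# default process group is already initialized):
+#
+#     sampler = ElasticDistributedSampler(train_dataset, start_index=resume_index)
+#     loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, sampler=sampler)
+#     for epoch in range(start_epoch, max_epochs):
+#         sampler.set_epoch(epoch)
+#         for batch in loader:
+#             ...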
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/utils/distributed.py b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/distributed.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc50038ca2ada2a37186e85cdeb959d7139a5a25
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/distributed.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import datetime
+import socket
+from contextlib import closing
+
+import torch.distributed as dist
+from torch.distributed.elastic.utils.logging import get_logger
+
+
+log = get_logger(__name__)
+
+_ADDRESS_IN_USE = "Address already in use"
+_SOCKET_TIMEOUT = "Socket Timeout"
+
+_MEMBER_CHECKIN = "_tcp_store/num_members"
+_LAST_MEMBER_CHECKIN = "_tcp_store/last_member"
+
+
+def create_c10d_store(
+    is_server: bool,
+    server_addr: str,
+    server_port: int = -1,
+    world_size: int = 1,
+    timeout: float = (60 * 10),  # 10 min
+    wait_for_workers: bool = True,
+    retries=3,
+):
+    if server_port == -1 and world_size > 1:
+        raise ValueError(
+            f"server_port must be specified when world_size > 1, got server_port={server_port}, world_size={world_size}"
+        )
+
+    if server_port != -1:
+        log.info("server_port: %s specified, ignoring retries", server_port)
+
+    # only retry when server_port is NOT static
+    attempt = retries if server_port == -1 else 1
+    while True:
+        if server_port != -1:
+            port = server_port
+        else:
+            port = get_free_port()
+
+        log.info(
+            "Creating c10d store on %s:%s\n"
+            "  world_size  : %s\n"
+            "  is_server   : %s\n"
+            "  timeout(sec): %s\n",
+            server_addr, port, world_size, is_server, timeout
+        )
+
+        try:
+            store = dist.TCPStore(
+                host_name=server_addr,
+                port=port,
+                world_size=world_size,
+                is_master=is_server,
+                timeout=datetime.timedelta(seconds=timeout),
+                wait_for_workers=wait_for_workers,
+            )
+            # skips full rank check when we don't have to wait for all workers
+            if wait_for_workers:
+                _check_full_rank(store, world_size)
+            log.info("Successfully created c10d store")
+            return store
+        except RuntimeError as e:
+            # this is brittle, but the underlying exception type is not properly pybinded
+            # so we parse the error msg for now, interestingly this is how torch itself
+            # detects timeouts and port conflicts in their own unittests
+            # see - caffe2/torch/testing/_internal/common_utils.py
+            # TODO properly map the exceptions in pybind (c10d/init.cpp)
+            if str(e) == _ADDRESS_IN_USE:  # this will only happen on the server
+                if attempt < retries:
+                    log.warning(
+                        "port: %s already in use, attempt: [%s/%s]", port, attempt, retries
+                    )
+                    attempt += 1
+                else:
+                    raise RuntimeError(
+                        f"on {server_addr}, port: {port} already in use"
+                    ) from e
+            else:
+                raise
+
+
+def _check_full_rank(store, world_size):
+    idx = store.add(_MEMBER_CHECKIN, 1)
+    if idx == world_size:
+        store.set(_LAST_MEMBER_CHECKIN, "")
+
+    try:
+        store.get(_LAST_MEMBER_CHECKIN)
+    except RuntimeError as e:
+        if str(e) == _SOCKET_TIMEOUT:
+            raise TimeoutError(
+                f"timed out waiting for all {world_size} members to join"
+            ) from e
+        else:
+            raise
+
+
+def get_free_port():
+    sock = get_socket_with_port()
+    with closing(sock):
+        return sock.getsockname()[1]
+
+
+def get_socket_with_port() -> socket.socket:
+    """
+    Returns a socket bound to a free port on localhost. The port is "reserved"
+    for as long as the socket stays open; close the socket before passing the
+    port to the entity that requires it. Usage example
+
+    ::
+
+        sock = get_socket_with_port()
+        with closing(sock):
+            port = sock.getsockname()[1]
+            sock.close()
+            # there is still a race condition: some other process
+            # may grab this port before func() runs
+            func(port)
+    """
+
+    addrs = socket.getaddrinfo(
+        host="localhost", port=None, family=socket.AF_UNSPEC, type=socket.SOCK_STREAM
+    )
+    for addr in addrs:
+        family, type, proto, _, _ = addr
+        s = socket.socket(family, type, proto)
+        try:
+            s.bind(("localhost", 0))
+            s.listen(0)
+            return s
+        except OSError as e:
+            s.close()
+            log.info("Socket creation attempt failed.", exc_info=e)
+    raise RuntimeError("Failed to create a socket")
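+
+# Editor's note: an illustrative sketch of building a shared TCPStore with the
+# helper above (host name, port, and world size are hypothetical):
+#
+#     # on the host that should own the store (typically the rank-0 agent):
+#     store = create_c10d_store(
+#         is_server=True, server_addr="node0", server_port=29500, world_size=4
+#     )
+#
+#     # on every other host:
+#     store = create_c10d_store(
+#         is_server=False, server_addr="node0", server_port=29500, world_size=4
+#     )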
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/utils/log_level.py b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/log_level.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf2d31347aeeb3ebc63af253a3f4db678cfdc0fc
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/log_level.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+def get_log_level() -> str:
+    """
+    Return default log level for pytorch.
+    """
+    return "WARNING"
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/utils/logging.py b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/logging.py
new file mode 100644
index 0000000000000000000000000000000000000000..34355c06ddd69d626723b923ce43918c1e1a3a6c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/logging.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import inspect
+import logging
+import os
+import warnings
+from typing import Optional
+
+from torch.distributed.elastic.utils.log_level import get_log_level
+
+
+def get_logger(name: Optional[str] = None):
+    """
+    Util function to set up a simple logger that writes
+    into stderr. The loglevel is fetched from the LOGLEVEL
+    env. variable or WARNING as default. The function will use the
+    module name of the caller if no name is provided.
+
+    Args:
+        name: Name of the logger. If no name provided, the name will
+              be derived from the call stack.
+    """
+
+    # Derive the name of the caller, if none provided
+    # Use depth=2 since this function takes up one level in the call stack
+    return _setup_logger(name or _derive_module_name(depth=2))
+
+
+def _setup_logger(name: Optional[str] = None):
+    log = logging.getLogger(name)
+    log.setLevel(os.environ.get("LOGLEVEL", get_log_level()))
+    return log
+
+
+def _derive_module_name(depth: int = 1) -> Optional[str]:
+    """
+    Derives the name of the caller module from the stack frames.
+
+    Args:
+        depth: The position of the frame in the stack.
+    """
+    try:
+        stack = inspect.stack()
+        assert depth < len(stack)
+        # FrameInfo is just a named tuple: (frame, filename, lineno, function, code_context, index)
+        frame_info = stack[depth]
+
+        module = inspect.getmodule(frame_info[0])
+        if module:
+            module_name = module.__name__
+        else:
+            # inspect.getmodule(frame_info[0]) does NOT work (returns None) in
+            # binaries built with @mode/opt
+            # return the filename (minus the .py extension) as the module name
+            filename = frame_info[1]
+            module_name = os.path.splitext(os.path.basename(filename))[0]
+        return module_name
+    except Exception as e:
+        warnings.warn(
+            f"Error deriving logger module name, falling back to None. Exception: {e}",
+            RuntimeWarning,
+        )
+        return None
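+
+
+# Illustrative usage sketch (not part of the upstream module): the level can be
+# raised via the environment without code changes.
+#
+#     log = get_logger(__name__)      # or get_logger() to derive the name
+#     log.warning("starting agent")   # visible at the default WARNING level
+#     # Running with LOGLEVEL=INFO makes log.info(...) visible as well.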
diff --git a/MLPY/Lib/site-packages/torch/distributed/elastic/utils/store.py b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/store.py
new file mode 100644
index 0000000000000000000000000000000000000000..953becb9ab53ac939f3ab82361aa9fd76b247abe
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/elastic/utils/store.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from datetime import timedelta
+from typing import List
+
+
+def get_all(store, rank: int, prefix: str, size: int):
+    r"""
+    Given a store and a prefix, the method goes through the array of keys
+    of the following format: ``{prefix}{idx}``, where idx is in a range
+    from 0 to size, and tries to retrieve the data.
+
+    The Rank0 process waits at the end to make sure all other processes
+    finished the procedure before exiting.
+
+    Usage
+
+    ::
+
+     values = get_all(store, 'torchelastic/data', 3)
+     value1 = values[0] # retrieves the data for key torchelastic/data0
+     value2 = values[1] # retrieves the data for key torchelastic/data1
+     value3 = values[2] # retrieves the data for key torchelastic/data2
+
+    """
+    data_arr = []
+    for idx in range(size):
+        data = store.get(f"{prefix}{idx}")
+        data_arr.append(data)
+    store.set(f"{prefix}{rank}.FIN", b"FIN")
+    if rank == 0:
+        # Rank0 runs the TCPStore daemon, as a result it needs to exit last.
+        # Otherwise, the barrier may timeout if rank0 process finished the work
+        # before other processes finished `get_all` method
+        for node_rank in range(size):
+            store.get(f"{prefix}{node_rank}.FIN")
+
+    return data_arr
+
+
+def synchronize(
+    store,
+    data: bytes,
+    rank: int,
+    world_size: int,
+    key_prefix: str,
+    barrier_timeout: float = 300,
+) -> List[bytes]:
+    """
+    Synchronizes ``world_size`` agents between each other using the underlying c10d store.
+    The ``data`` will be available on each of the agents.
+
+    Note: The data on the path is not deleted, as a result there can be stale data if
+        you use the same key_prefix twice.
+    """
+    store.set_timeout(timedelta(seconds=barrier_timeout))
+    store.set(f"{key_prefix}{rank}", data)
+    agent_data = get_all(store, rank, key_prefix, world_size)
+    return agent_data
+
+
+def barrier(
+    store, rank: int, world_size: int, key_prefix: str, barrier_timeout: float = 300
+) -> None:
+    """
+    A global lock between agents.
+
+    Note: Since the data is not removed from the store, the barrier can be used
+        once per unique ``key_prefix``.
+    """
+    data = f"{rank}".encode()
+    synchronize(store, data, rank, world_size, key_prefix, barrier_timeout)
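+
+
+# Illustrative usage sketch (not part of the upstream module), assuming a c10d
+# TCPStore shared by all agents; the host, port, rank, and world_size values
+# below are placeholders:
+#
+#     import torch.distributed as dist
+#     store = dist.TCPStore("localhost", 29500, world_size, is_master=(rank == 0))
+#     payloads = synchronize(store, b"agent-info", rank, world_size, "init/")
+#     barrier(store, rank, world_size, "post-init/")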
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/__init__.py b/MLPY/Lib/site-packages/torch/distributed/fsdp/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ddf24dde2c2e755b3f378a9c73a012e33df6a65
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/fsdp/__init__.py
@@ -0,0 +1,38 @@
+from ._flat_param import FlatParameter as FlatParameter
+from .fully_sharded_data_parallel import (
+    BackwardPrefetch,
+    CPUOffload,
+    FullOptimStateDictConfig,
+    FullStateDictConfig,
+    FullyShardedDataParallel,
+    LocalOptimStateDictConfig,
+    LocalStateDictConfig,
+    MixedPrecision,
+    OptimStateDictConfig,
+    OptimStateKeyType,
+    ShardedOptimStateDictConfig,
+    ShardedStateDictConfig,
+    ShardingStrategy,
+    StateDictConfig,
+    StateDictSettings,
+    StateDictType,
+)
+
+__all__ = [
+    "BackwardPrefetch",
+    "CPUOffload",
+    "FullOptimStateDictConfig",
+    "FullStateDictConfig",
+    "FullyShardedDataParallel",
+    "LocalOptimStateDictConfig",
+    "LocalStateDictConfig",
+    "MixedPrecision",
+    "OptimStateDictConfig",
+    "OptimStateKeyType",
+    "ShardedOptimStateDictConfig",
+    "ShardedStateDictConfig",
+    "ShardingStrategy",
+    "StateDictConfig",
+    "StateDictSettings",
+    "StateDictType",
+]
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..78e890f540e56dfe6f329d04661b07698789d8cb
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_common_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_common_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..904f7ccf46cb7960a44d2fb7b252a49c323acb41
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_common_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_debug_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_debug_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..12755b111043a5055ada85b9e4caff99f6eced97
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_debug_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_dynamo_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_dynamo_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..52a2285cb2e3d98975a026d38c38ebfe4a20a633
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_dynamo_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_exec_order_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_exec_order_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..800e5729c49483336f5a6da43a08e83ea085d12d
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_exec_order_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_flat_param.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_flat_param.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9cf2dc40144a5ab29ae5c83f951f56fd0c27f59a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_flat_param.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_fsdp_extensions.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_fsdp_extensions.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..faa5ca6eabb391bc8222d5854031d064f2d7ab36
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_fsdp_extensions.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_init_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_init_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..da7ca420280d3fa915085e7801af1338ce1f954e
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_init_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_limiter_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_limiter_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5b38a41343069c07bfa95796b13d54cfc6ef701a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_limiter_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_optim_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_optim_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..95c55fc07ca6e2bcf0c2e499590fd1e3e4f5796c
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_optim_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_runtime_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_runtime_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d3410fe1f71b5366e677fd0f3c3ab7c9738d1745
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_runtime_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_shard_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_shard_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d4de6c523b23c0af55dc4c7e81f9a1eee381616e
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_shard_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_state_dict_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_state_dict_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7ed27043c5bfd33f6ea7cc7c5ff62dc9c8724fef
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_state_dict_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_trace_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_trace_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..85287e9841b5d163a3e1b9302c0a480bcf56cdd5
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_trace_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_traversal_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_traversal_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..148d99694baa3f2ae2a596132360c5d1d968477f
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_traversal_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_unshard_param_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_unshard_param_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8349e613e5550baf3ff94b38433d2b60c3bab15e
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_unshard_param_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_wrap_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_wrap_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dd6791a4dbab07cc0f97aa6541b0f8e6ac0119f5
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/_wrap_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/api.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/api.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0d7d60c54b480b31d749401da661cf0b493e4ca8
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/api.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/fully_sharded_data_parallel.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/fully_sharded_data_parallel.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e67f5a949ce95c76ec959a934252e99abefb5093
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/fully_sharded_data_parallel.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/sharded_grad_scaler.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/sharded_grad_scaler.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..db7aa31a4890fa21c83b34bb282a98d3bffe73e0
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/sharded_grad_scaler.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/wrap.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/wrap.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d50d7fee1f3dc17807266b12bca44d138a794340
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/fsdp/__pycache__/wrap.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/_common_utils.py b/MLPY/Lib/site-packages/torch/distributed/fsdp/_common_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8d3a722fd62bd0f7f6d9e394008083947618b90
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/fsdp/_common_utils.py
@@ -0,0 +1,563 @@
+"""
+This file includes private common utilities for FSDP.
+"""
+import logging
+import traceback
+import warnings
+import weakref
+from enum import auto, Enum
+from functools import partial
+from typing import (
+    Any,
+    Callable,
+    cast,
+    Dict,
+    Generator,
+    Iterable,
+    List,
+    no_type_check,
+    Optional,
+    Set,
+    Tuple,
+    Type,
+    TYPE_CHECKING,
+)
+
+import torch
+import torch.distributed as dist
+import torch.distributed.fsdp._flat_param as flat_param_file
+import torch.nn as nn
+from torch.distributed._composable_state import _get_module_state, _State
+from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
+    _CHECKPOINT_PREFIX,
+)
+from torch.distributed.device_mesh import DeviceMesh
+from torch.distributed.fsdp._fsdp_extensions import FSDPExtensions
+from torch.distributed.utils import _apply_to_tensors
+from torch.utils._mode_utils import no_dispatch
+
+from .api import (
+    FullOptimStateDictConfig,
+    FullStateDictConfig,
+    OptimStateDictConfig,
+    ShardingStrategy,
+    StateDictConfig,
+    StateDictType,
+)
+
+if TYPE_CHECKING:
+    from ._flat_param import FlatParamHandle
+
+FSDP_WRAPPED_MODULE = "_fsdp_wrapped_module"
+FSDP_PREFIX = FSDP_WRAPPED_MODULE + "."
+FSDP_FLATTENED = "_fsdp_flattened"
+
+# Save a global mapping from module to its input tensor dtype to be populated
+# during the forward pre-hook and consumed in the forward post-hook when
+# overriding a module's mixed precision
+# NOTE: We currently take the last input tensor's dtype in the case of multiple
+# floating-point input tensors, which may be incorrect. However, since there is
+# not a 1:1 correspondence between input and output tensors, we must use *some*
+# heuristic like this to predict the desired output dtype.
+_MODULE_TO_INP_DTYPE: weakref.WeakKeyDictionary = weakref.WeakKeyDictionary()
+
+
+class _FSDPDeviceHandle:
+    """
+    This is a simple abstraction for FSDP computing devices,
+    which enables custom backends that implement CUDA-like
+    semantics to be integrated with FSDP.
+    """
+
+    def __init__(self, device: torch.device, backend: Any = None):
+        if backend is None:
+            try:
+                self.__backend = getattr(torch, device.type)
+                self.__device = device
+            except AttributeError as exc:
+                raise AttributeError(
+                    f"Device '{device}' does not have a corresponding backend registered as 'torch.{device.type}'."
+                ) from exc
+        else:
+            self.__backend = backend
+
+    @classmethod
+    def from_device(cls, device: torch.device) -> "_FSDPDeviceHandle":
+        """
+        Return an device handle corresponding to the device, and through this handle,
+        operations with the same semantics as CUDA can be performed on the device.
+        Just return torch.cuda if the device is cuda to make attribute-access faster.
+        Custom backend must first register a module with the same name with {device.type} on torch.
+        """
+        if device.type == "cuda":
+            return cast(_FSDPDeviceHandle, torch.cuda)
+        return cls(device)
+
+    def __getattr__(self, __name: str) -> Any:
+        try:
+            return getattr(self.__backend, __name)
+        except AttributeError as exc:
+            raise AttributeError(
+                f"Custom backend '{self.__device.type}' not implement 'torch.{self.__device.type}.{__name}'"
+            ) from exc
+
+
+class _UninitializedDeviceHandle(_FSDPDeviceHandle):
+    def __init__(self):
+        pass
+
+    def __getattribute__(self, __name: str) -> Any:
+        raise RuntimeError("Trying to use an uninitialized device handle.")
+
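+# Illustrative usage sketch (not part of the upstream module) for the device
+# handle abstraction above:
+#
+#     handle = _FSDPDeviceHandle.from_device(torch.device("cuda"))
+#     stream = handle.Stream()                 # resolves to torch.cuda.Stream()
+#     handle.current_stream().wait_stream(stream)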
+
+class _FSDPState(_State):
+    def __init__(self) -> None:
+        # TODO: Move all the attributes to this class to enable typing for
+        # FSDP/fully_shard.
+        self._ignored_modules: Set[nn.Module] = set()
+        self._ignored_params: Set[nn.Parameter] = set()
+        # Buffer names are cleaned (without wrapper prefixes)
+        self._ignored_buffer_names: Set[str] = set()
+        self.process_group: Optional[dist.ProcessGroup] = None
+        self.rank: int = -1
+        self.world_size: int = -1
+        self._device_mesh: Optional[DeviceMesh] = None
+        self.sharding_strategy = ShardingStrategy.FULL_SHARD
+        self._use_orig_params: bool = False
+        self.training_state = TrainingState.IDLE
+        self._unshard_params_ctx: Dict[nn.Module, Generator] = {}
+        self._state_dict_type: StateDictType = StateDictType.FULL_STATE_DICT
+        self._state_dict_config: StateDictConfig = FullStateDictConfig()
+        self._optim_state_dict_config: OptimStateDictConfig = FullOptimStateDictConfig()
+        self._is_root: Optional[bool] = None
+        self._handle: Optional[flat_param_file.FlatParamHandle] = None
+        self._fully_sharded_module_to_handle: Dict[
+            nn.Module, Optional[flat_param_file.FlatParamHandle]
+        ] = {}
+        self.compute_device: Optional[torch.device] = None
+        self._gradient_predivide_factor: int = 0
+        self._gradient_postdivide_factor: int = 0
+        self._comm_hook: Optional[Callable] = None
+        self._comm_hook_state: Optional[Any] = None
+        # Abstract device handle for fsdp compute device. For now,
+        # the compute device must implement cuda semantics used by fsdp
+        self._device_handle: _FSDPDeviceHandle = _UninitializedDeviceHandle()
+        # All following attributes should only be used for root states:
+        # Save these static lists to avoid the repeated tree traversals
+        self._all_fsdp_states: List[_FSDPState] = []
+        self._all_handles: List[flat_param_file.FlatParamHandle] = []
+        self._fsdp_extension: Optional[FSDPExtensions] = None
+
+
+def _get_module_fsdp_state(module: nn.Module) -> Optional[_FSDPState]:
+    state = _get_module_state(module)
+    if state is None or not isinstance(state, _FSDPState):
+        return None
+    return state
+
+
+def _get_module_fsdp_state_if_fully_sharded_module(
+    module: nn.Module,
+) -> Optional[_FSDPState]:
+    state = _get_module_fsdp_state(module)
+    if state is None:
+        return None
+    if state == module:  # FullyShardedDataParallel module case.
+        return state
+    if module in state._fully_sharded_module_to_handle:  # fully_shard case.
+        return state
+    return None
+
+
+class TrainingState(Enum):
+    """
+    An enum that indicates the state of a ``FullyShardedDataParallel`` instance.
+    """
+
+    IDLE = auto()
+    FORWARD_BACKWARD = auto()
+    SUMMON_FULL_PARAMS = auto()
+
+
+class HandleTrainingState(Enum):
+    """
+    An enum that indicates the state of a ``FlatParamHandle``.
+    """
+
+    IDLE = auto()
+    FORWARD = auto()
+    BACKWARD_PRE = auto()
+    BACKWARD_POST = auto()
+    SUMMON_FULL_PARAMS = auto()
+
+
+def _is_composable(state: _FSDPState):
+    # TODO: This is a temporary hack to differentiate between code paths.
+    return not isinstance(state, nn.Module)
+
+
+@no_type_check
+def _module_handle(state: _FSDPState, module: nn.Module) -> Optional["FlatParamHandle"]:
+    """
+    Returns the ``FlatParamHandle`` corresponding to ``module``, i.e. the
+    handle that contains some parameter in ``module``, if one exists.
+    """
+    if _is_composable(state):
+        # A valid FSDP state may have no managed parameters and hence no
+        # handles, meaning no entry in `_fully_sharded_module_to_handle`
+        if state._handle is None:
+            return None
+        assert (
+            module in state._fully_sharded_module_to_handle
+        ), f"Expects a fully sharded module but got {module} on rank {state.rank}"
+        return state._fully_sharded_module_to_handle[module]
+    else:
+        # NOTE: This assumes `module` is a `FullyShardedDataParallel` instance.
+        return module._handle
+
+
+@no_type_check
+def _has_fsdp_params(state: _FSDPState, module: nn.Module) -> bool:
+    """Returns if ``module`` has parameters managed by FSDP."""
+    return _module_handle(state, module) is not None
+
+
+def _get_sharding_strategy(handle):
+    """
+    Returns the sharding strategy of the handle.
+    """
+    return handle._sharding_strategy if handle else None
+
+
+def clean_tensor_name(tensor_name: str) -> str:
+    """
+    Cleans the parameter or buffer name by removing any module wrapper
+    prefixes.
+    """
+    tensor_name = tensor_name.replace(FSDP_PREFIX, "")
+    # TODO: Explicitly replacing the checkpoint wrapper prefix is not ideal as
+    # it couples `CheckpointWrapper` and FSDP and also does not scale for more
+    # module wrappers.
+    tensor_name = tensor_name.replace(_CHECKPOINT_PREFIX, "")
+    return tensor_name
+
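+# Illustrative example (not part of the upstream module): with the wrapper
+# prefixes above,
+#
+#     clean_tensor_name("_fsdp_wrapped_module.layer1._fsdp_wrapped_module.weight")
+#     # -> "layer1.weight"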
+
+def _set_fsdp_flattened(tensor: torch.Tensor) -> None:
+    """
+    Sets an attribute on ``tensor`` to mark it as flattened by FSDP. This is to
+    avoid re-flattening it during nested construction.
+    """
+    setattr(tensor, FSDP_FLATTENED, True)
+
+
+def _is_fsdp_flattened(tensor: torch.Tensor) -> bool:
+    """Returns if ``tensor`` has been marked as flattened by FSDP."""
+    return getattr(tensor, FSDP_FLATTENED, False)
+
+
+def _named_parameters_with_duplicates(
+    module: nn.Module, **kwargs: Any
+) -> List[Tuple[str, nn.Parameter]]:
+    """
+    This API is required as some modules overwrite `named_parameters()` but do not support
+    `remove_duplicate`.
+    """
+    assert (
+        "remove_duplicate" not in kwargs
+    ), "_named_parameters_with_duplicates cannot be used with `remove_duplicate` argument."
+    kwargs["remove_duplicate"] = False
+    try:
+        ret = list(module.named_parameters(**kwargs))
+    except AssertionError:
+        kwargs.pop("remove_duplicate")
+        ret = list(module.named_parameters(**kwargs))
+    return ret
+
+
+def _get_param_to_fqns(
+    model: torch.nn.Module,
+    dedup_shared_params: bool = True,
+) -> Dict[nn.Parameter, List[str]]:
+    """
+    Constructs a mapping from parameter to a list of its "canonical" FQNs. Here,
+    we use canonical to mean the fully-qualified name assigned to the parameter
+    based on its position in the original nn.Module hierarchy before any wrapper
+    or parallelism has been applied to it. This is in contrast to FQNs that may be
+    generated after parallelisms or wrappers have been applied to the model.
+
+    Each normal parameter maps to a singleton list containing its FQN, while each
+    ``FlatParameter`` maps to a list of its original parameter FQNs, which may
+    have length greater than one.  All FQNs are prefixed starting from ``model``.
+
+    In the case where FSDP was applied with ``use_orig_params=True``, there should be no
+    ``FlatParameter`` s registered to the model's modules and this mapping will only
+    contain mappings from ``nn.Parameter`` s to singleton FQN lists.
+
+    It is only in the case where FSDP was applied with ``use_orig_params=False`` where
+    a ``FlatParameter`` will be registered in place of the original parameters and there
+    will be mappings from each ``FlatParameter`` to lists of FQNs corresponding to the
+    original parameters.
+
+    Args:
+        model (torch.nn.Module): Root module (which may or may not be a
+            :class:`FullyShardedDataParallel` instance).
+        dedup_shared_params (bool): For shared parameters, if ``True``, only
+            includes the FQNs corresponding to the first encounter of the
+            shared parameter in the module traversal; if ``False``, then
+            includes the FQNs across all encounters. (Default: ``True``)
+    """
+
+    def module_fn(module, prefix, tree_level, param_to_fqns):
+        for param_name, param in _named_parameters_with_duplicates(
+            module, recurse=False
+        ):
+            local_fqns = (
+                param._fqns
+                if isinstance(param, flat_param_file.FlatParameter)
+                else [param_name]
+            )  # prefixed from `module`
+            global_fqns = [
+                clean_tensor_name(prefix + name) for name in local_fqns
+            ]  # prefixed from the top level `model` (i.e. including `prefix`)
+            is_shared_param = param in param_to_fqns
+            if not is_shared_param:
+                param_to_fqns[param] = global_fqns
+            else:
+                if isinstance(param, flat_param_file.FlatParameter):
+                    # DMP overwrites `named_parameters` and skips (advances to
+                    # the next child module) the wrapped module (e.g.,
+                    # _dmp_wrapped_module and _fsdp_wrapped_module). When a user
+                    # calls `named_children` to traverse the module recursively and
+                    # calls `named_parameters` with `recurse=False`, parameters
+                    # will be traversed more than once.
+                    # This hack is specifically designed for DMP + FSDP. We
+                    # overwrite the FlatParameter traversal result to keep only
+                    # the last one, which happens to be the correct one.
+                    #
+                    # TODO: Remove this hack once DMP + FSDP is not supported.
+                    warnings.warn(
+                        "FlatParameter is being traversed more than once. "
+                        "This case should only happen when using "
+                        "DistributedModelParallel with FullyShardedDataParallel."
+                    )
+                    param_to_fqns[param] = global_fqns
+                elif not dedup_shared_params:
+                    param_to_fqns[param].extend(global_fqns)
+
+    def return_fn(param_to_fqns):
+        return param_to_fqns
+
+    param_to_unflat_param_names: Dict[torch.nn.Parameter, List[str]] = {}
+    return _apply_to_modules(
+        model,
+        module_fn,
+        return_fn,
+        [key for key, _ in _named_parameters_with_duplicates(model)],
+        param_to_unflat_param_names,
+    )
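+
+
+# Illustrative example (not part of the upstream module), for a model wrapped
+# with use_orig_params=False; the parameter/FQN names below are hypothetical:
+#
+#     param_to_fqns = _get_param_to_fqns(model)
+#     # {<FlatParameter>: ["layer1.weight", "layer1.bias", "layer2.weight", ...],
+#     #  <plain nn.Parameter>: ["head.weight"]}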
+
+
+@no_type_check
+def _log_post_backward_hook(
+    state: _FSDPState, handle: "FlatParamHandle", log: logging.Logger
+) -> None:
+    # Under TORCH_DISTRIBUTED_DEBUG=INFO, log the module names this hook fires for.
+    # Below logging of module names this post-bwd hook fires for can help debug certain
+    # cases where hooks don't fire, such as under certain activation checkpoint configs.
+    if state._use_orig_params and handle._debug_level == dist.DebugLevel.INFO:
+        param_fqns = _get_handle_fqns_from_root(state, handle)
+        log.warning("FSDP firing post-backward hooks for parameters %s", param_fqns)
+
+
+@no_type_check
+def _get_handle_fqns_from_root(
+    state: _FSDPState, handle: "FlatParamHandle"
+) -> Optional[List[str]]:
+    if handle is None:
+        return None
+    param_to_fqn = state._exec_order_data.param_to_fqn
+    handle_params = handle.flat_param._params  # only populated for use_orig_params
+    param_fqns = [
+        fqn for fqn_list in [param_to_fqn[p] for p in handle_params] for fqn in fqn_list
+    ]
+    return param_fqns
+
+
+def _apply_to_modules(
+    root_module: torch.nn.Module,
+    module_fn: Callable,
+    return_fn: Callable,
+    filter_fqns: Optional[List[str]] = None,
+    *args,
+    **kwargs,
+):
+    """
+    Performs a pre-order traversal of the modules in the hierarchy rooted at
+    ``root_module``, applying ``module_fn`` at each module and finally
+    returning a value using ``return_fn``. The traversal constructs the full
+    module prefix name (e.g. "module.submodule." just like in model state dict)
+    and makes that available to ``module_fn``.
+
+    ``filter_fqns`` is used because some modules may have their own prefix,
+    similar to ``FullyShardedDataParallel``, and override ``named_parameters()``
+    to remove that prefix.
+    """
+
+    def f(module: torch.nn.Module, prefix: str, tree_level: int, *args, **kwargs):
+        # Call the module function before recursing over children (pre-order)
+        module_fn(module, prefix, tree_level, *args, **kwargs)
+        for submodule_name, submodule in module.named_children():
+            if submodule is None:
+                continue
+            new_prefix = prefix + submodule_name + "."
+            new_tree_level = tree_level + 1
+            if filter_fqns is not None:
+                for fqn in filter_fqns:
+                    if fqn.startswith(new_prefix):
+                        break
+                else:
+                    # DMP's named_parameters() will mess up the traversal with
+                    # `named_children` + `named_parameters(recurse=False)`.
+                    # This hack is necessary to make the traversal work.
+                    # TODO: Remove this hack once DMP + FSDP is not supported.
+                    if (
+                        submodule_name == "_fsdp_wrapped_module"
+                        or submodule_name == "_dmp_wrapped_module"
+                    ):
+                        if (
+                            not torch.distributed._functional_collectives.is_torchdynamo_compiling()
+                        ):
+                            # TODO(voz): Don't graph break on this
+                            warnings.warn(
+                                "An unexpected prefix is detected. This case "
+                                "should only happen when using DMP with FSDP. "
+                                f"prefix = {prefix}, "
+                                f"submodule_name = {submodule_name}"
+                            )
+                        new_prefix = prefix
+                    elif submodule_name == "module":
+                        warnings.warn(
+                            "An unexpected prefix is detected. This case "
+                            "should only happen when DDP wraps the outer "
+                            "modules while FSDP wraps the inner ones. "
+                            f"prefix = {prefix}, "
+                            f"submodule_name = {submodule_name}"
+                        )
+                        new_prefix = prefix
+            f(submodule, new_prefix, new_tree_level, *args, **kwargs)
+
+    f(root_module, "", 0, *args, **kwargs)
+    return return_fn(*args, **kwargs)
+
+
+@no_type_check
+def _assert_in_training_states(
+    state: _FSDPState,
+    training_states: List[TrainingState],
+) -> None:
+    """Asserts that FSDP is in the states ``_training_states``."""
+    # Raise a `ValueError` instead of using `assert` to ensure that these
+    # logical assertions run even if `assert`s are disabled
+    if state.training_state not in training_states:
+        msg = (
+            f"expected to be in states {training_states} but current state is "
+            f"{state.training_state}"
+        )
+        # Print the error on rank 0 in case this is called in the backward pass
+        if state.rank == 0:
+            if isinstance(state, nn.Module):
+                print(f"Asserting FSDP instance is: {state}")
+            print(f"ERROR: {msg}")
+            traceback.print_stack()
+        raise ValueError(msg)
+
+
+def _get_root_modules(modules: Set[nn.Module]) -> Set[nn.Module]:
+    """
+    Returns:
+        Set[nn.Module]: The subset of ``modules`` that are root modules (i.e.
+        parent-less) with respect to the modules in the set itself. In other
+        words, these are the modules in ``modules`` that are not the child of
+        any other module in ``modules``.
+    """
+    root_modules: Set[nn.Module] = set()
+    module_to_submodules = {module: set(module.modules()) for module in modules}
+    for candidate_module in modules:
+        is_root_module = True
+        for module, submodules in module_to_submodules.items():
+            is_child_module = (
+                candidate_module is not module and candidate_module in submodules
+            )
+            if is_child_module:
+                is_root_module = False
+                break
+        if is_root_module:
+            root_modules.add(candidate_module)
+    return root_modules
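+
+
+# Illustrative example (not part of the upstream module): for a model with
+# submodules ``model.encoder``, ``model.encoder.layer0``, and ``model.head``,
+#
+#     _get_root_modules({model.encoder, model.encoder.layer0, model.head})
+#     # -> {model.encoder, model.head}   (layer0 is a child of encoder)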
+
+
+def _override_module_mixed_precision(
+    root: torch.nn.Module,
+    module_classes_to_override: Iterable[Type[nn.Module]],
+    wrap_override_dict: Dict[str, Any] = {"mixed_precision": None},  # noqa: B006
+) -> Set[Type[nn.Module]]:
+    module_classes_to_override = tuple(set(module_classes_to_override))
+    # Return a set of the actually overridden module classes
+    overridden_module_classes: Set[Type[nn.Module]] = set()
+    for mod in root.modules():
+        if isinstance(mod, module_classes_to_override):
+            overridden_module_classes.add(type(mod))
+            mod._wrap_overrides = wrap_override_dict  # type: ignore[assignment]
+            # TODO: We need to run this mixed precision ignored module in fp32,
+            # but ensure subsequent modules, that may possibly be running with
+            # mixed precision, still receive the appropriate precision inputs
+            # without user having to adjust mixed precision config too much.
+            # As a result, we attach pre and post forward hooks to up / down
+            # cast. We should revisit this design.
+
+            def cast_fn(
+                dtype: torch.dtype, module: nn.Module, x: torch.Tensor
+            ) -> torch.Tensor:
+                if not torch.is_floating_point(x) or x.dtype == dtype:
+                    return x
+                _MODULE_TO_INP_DTYPE[module] = x.dtype
+                return x.to(dtype)
+
+            def forward_pre_hook(module, args):
+                return _apply_to_tensors(partial(cast_fn, torch.float32, module), args)
+
+            def forward_post_hook(module, args, output):
+                # NOTE: If the forward did not have any floating-point tensors,
+                # then the dtype will not be set for this module, and we do not
+                # upcast the dtype.
+                if module in _MODULE_TO_INP_DTYPE:
+                    old_dtype = _MODULE_TO_INP_DTYPE[module]
+                    return _apply_to_tensors(
+                        partial(cast_fn, old_dtype, module), output
+                    )
+
+            # We intentionally append both of these hooks so that they run after
+            # all other hooks.
+            mod.register_forward_pre_hook(forward_pre_hook, prepend=False)
+            mod.register_forward_hook(forward_post_hook, prepend=False)
+    return overridden_module_classes
+
+
+def _no_dispatch_record_stream(tensor: torch.Tensor, stream: torch.Stream) -> None:
+    # FIXME record_stream doesn't work with non-cuda tensors
+    if tensor.device.type not in ["cuda", torch._C._get_privateuse1_backend_name()]:
+        return
+
+    if torch.distributed._functional_collectives.is_torchdynamo_compiling():
+        return
+        # from @ezyang:
+        # The no_dispatch was added in https://github.com/pytorch/pytorch/pull/88014 cc @fegin
+        # Looking over the PR, it looks like this is because we don't actually support Stream arguments
+        # in torch dispatch, so it just chokes.
+        # If Dynamo is able to answer "are there any torch dispatch modes" active (it should answer False),
+        # a better version of this would just be to check if there are any modes before disabling dispatch.
+        # TODO(voz): Extend a dynamo util to answer the above, unify the codepaths here.
+        tensor.record_stream(stream)
+    else:
+        with no_dispatch():
+            tensor.record_stream(stream)
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/_debug_utils.py b/MLPY/Lib/site-packages/torch/distributed/fsdp/_debug_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..76463883817fb3854da9b5ae9fda643429410341
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/fsdp/_debug_utils.py
@@ -0,0 +1,155 @@
+import logging
+import time
+from collections import defaultdict
+from contextlib import contextmanager
+from enum import Enum
+from typing import Dict, Iterator, List, Set, Tuple
+
+import torch
+import torch.distributed as dist
+import torch.distributed.fsdp._flat_param as flat_param_file
+from torch.distributed.fsdp._common_utils import (
+    _apply_to_modules,
+    _get_module_fsdp_state,
+    clean_tensor_name,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class SimpleProfiler:
+    class Type(str, Enum):
+        ALL = "all"
+        ALLGATHER = "all_gather"
+        ALLGATHER_OBJ = "all_gather_object"
+        RESHARDING = "resharding"
+        H2D = "H2D"
+        D2H = "D2H"
+
+    results: Dict[str, float] = defaultdict(float)
+    profiling: Set[str] = set()
+
+    @classmethod
+    def reset(cls) -> None:
+        cls.results.clear()
+        cls.profiling.clear()
+
+    @classmethod
+    @contextmanager
+    def profile(cls, profile_type: str) -> Iterator[None]:
+        assert profile_type not in cls.profiling, (
+            f"{profile_type} is already being profiled. "
+            "SimpleProfiler does not support profiling multiple instances at "
+            "the same time. "
+        )
+
+        cls.profiling.add(profile_type)
+        begin = time.monotonic()
+        try:
+            yield
+        finally:
+            end = time.monotonic()
+            cls.results[profile_type] += end - begin
+            cls.profiling.remove(profile_type)
+
+    @classmethod
+    def dump_and_reset(cls, msg: str) -> None:
+        # This cannot be combined with the DETAIL distributed debug level,
+        # as the profiling results would be very inaccurate.
+        if dist.get_rank() == 0 and dist.get_debug_level() == dist.DebugLevel.INFO:
+            logger.warning("%s %s", msg, cls.results)
+        cls.reset()
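+
+    # Illustrative usage sketch (not part of the upstream module):
+    #
+    #     with SimpleProfiler.profile(SimpleProfiler.Type.ALLGATHER):
+    #         dist.all_gather_into_tensor(out, inp)
+    #     SimpleProfiler.dump_and_reset("FSDP comm timings:")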
+
+
+def _get_sharded_module_tree_with_module_name_to_fqns(
+    model: torch.nn.Module,
+) -> Tuple[str, Dict[str, List[str]]]:
+    """
+    This is used for the composable fully_shard() code path. It returns
+      1. sharded module tree info: each line represents a submodule name that contains the
+    submodule's FQN and its submodule class name; if the submodule is sharded by `fully_shard`,
+    the submodule name is suffixed with ' FULLY SHARDED'. Each increased tree
+    level adds 4 spaces before the printed name. The printed sharded module tree info for a toy model
+    looks like this:
+        [CompositeModel] FULLY SHARDED
+            l1[Linear]
+            u1[UnitModule] FULLY SHARDED
+                u1.l1[Linear]
+                u1.seq[Sequential]
+                    u1.seq.0[ReLU]
+                    u1.seq.1[Linear]
+                    u1.seq.2[ReLU]
+                u1.l2[Linear]
+            u2[UnitModule] FULLY SHARDED
+                u2.l1[Linear]
+                u2.seq[Sequential]
+                    u2.seq.0[ReLU]
+                    u2.seq.1[Linear]
+                    u2.seq.2[ReLU]
+                u2.l2[Linear]
+            l2[Linear]
+      2. a dict mapping from the concatenated module FQN and class name to a list of its managed
+    original parameters' FQNs. The dict for the above toy sharded model looks like this:
+            {'[CompositeModel]': ['l1.weight', 'l1.bias', 'l2.weight', 'l2.bias'],
+             'u1[UnitModule]': ['u1.l1.weight', 'u1.l1.bias', 'u1.seq.1.weight', 'u1.seq.1.bias', 'u1.l2.weight', 'u1.l2.bias'],
+             'u2[UnitModule]': ['u2.l1.weight', 'u2.l1.bias', 'u2.seq.1.weight', 'u2.seq.1.bias', 'u2.l2.weight', 'u2.l2.bias']
+            }
+    All FQNs are prefixed starting from ``model``.
+
+    Args:
+        model (torch.nn.Module): Root module (which may or may not be passed to
+                                 composable `fully_shard()`).
+    """
+
+    def module_fn(
+        module, prefix, tree_level, sharded_tree_info, sharded_module_name_to_fqns
+    ):
+        num_spaces = tree_level * 4
+        trimmed_prefix = (
+            prefix[:-1] if (len(prefix) > 0 and prefix[-1] == ".") else prefix
+        )
+        prefixed_module_name = trimmed_prefix + "[" + module.__class__.__name__ + "]"
+        printed_prefixed_module_name = " " * num_spaces + prefixed_module_name
+
+        state = _get_module_fsdp_state(module)
+        if state is None:
+            sharded_tree_info[0] += printed_prefixed_module_name + "\n"
+            return
+
+        handle = state._fully_sharded_module_to_handle.get(module, None)
+
+        if handle:
+            sharded_tree_info[0] += (
+                printed_prefixed_module_name + " FULLY SHARDED" + "\n"
+            )
+        else:
+            sharded_tree_info[0] += printed_prefixed_module_name + "\n"
+
+        if handle:
+            param = handle.flat_param
+            assert isinstance(param, flat_param_file.FlatParameter)
+            global_fqns = [
+                clean_tensor_name(prefix + name) for name in param._fqns
+            ]  # prefixed from the top level `model` (i.e. including `prefix`)
+
+            if prefixed_module_name in sharded_module_name_to_fqns:
+                sharded_module_name_to_fqns[prefixed_module_name].extend(global_fqns)
+            else:
+                sharded_module_name_to_fqns[prefixed_module_name] = global_fqns
+
+    def return_fn(sharded_tree_info, sharded_module_name_to_fqns):
+        return sharded_tree_info[0], sharded_module_name_to_fqns
+
+    # Use List to mutate its value in place while running the recursive functions
+    sharded_tree_info: List[str] = [
+        "",
+    ]
+    sharded_module_name_to_fqns: Dict[str, List[str]] = {}
+    return _apply_to_modules(
+        model,
+        module_fn,
+        return_fn,
+        [key for key, _ in model.named_parameters()],
+        sharded_tree_info,
+        sharded_module_name_to_fqns,
+    )
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/_dynamo_utils.py b/MLPY/Lib/site-packages/torch/distributed/fsdp/_dynamo_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..53629ae60334a0c0661b991188b43a5d3a460f0c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/fsdp/_dynamo_utils.py
@@ -0,0 +1,45 @@
+from typing import Set
+
+import torch.nn as nn
+
+
+def _annotate_modules_for_dynamo(
+    module: nn.Module,
+    ignored_modules: Set[nn.Module],
+    use_orig_params: bool,
+):
+    """
+    Annotates the submodules in ``module`` 's tree, except those in
+    ``ignored_modules``, indicating that the submodules are FSDP-managed and
+    saving the ``use_orig_params`` setting passed to the FSDP constructor.
+    """
+    for submodule in module.modules():
+        if submodule not in ignored_modules:
+            """[note: Dynamo treats FSDP wrapped modules as UnspecializedNNModule]
+
+            Dynamo doesn't get to see this instance (FullyShardedDataParallel) during tracing, since
+            it skips tracing all the torch.distributed.fsdp code.
+                - Why? Running the FSDP code eagerly avoids lots of issues trying to trace complex hooks, and also
+                gets us graph-breaks on FSDP module boundaries which we want anyway for comm ops.
+                - However, we _also_ want dynamo to treat the wrapped module inside FSDP 'unspecially' (*),
+                and we need a way to indicate to dynamo which modules are wrapped by FSDP.
+
+            (*) UnspecializedNNModules in dynamo are traced-through without any assumptions, and with thorough
+            guards.  NNModules otherwise are 'specialized', meaning there is less overhead due to assuming
+            their code is well-behaved.
+
+            One particular issue with specialized NNModules for FSDP is that the
+            views created for orig_params are captured into the compiled graph on the first iteration, and while
+            they are always going to point to the correct flatparameter and give correct results, their order
+            of creation influences the order of backward execution, preventing overlap of comm and computation
+            during backward.  We need to _use_ the new parameter views created on each forward iteration, in
+            order for backward to interleave hooks with compute per layer.  UnspecializedNNModule lets us achieve
+            this by capturing the module code more 'functionally' and passing parameters in as inputs each time.
+            """
+            submodule._is_fsdp_managed_module = True  # type: ignore[assignment]
+
+            # Dynamo only supports FSDP with use_orig_params=True.
+            # This is hacky, but I could not think of another way to add an assertion to dynamo
+            # for this, since Dynamo skips all the FSDP code frames and thus can't inspect the
+            # FSDP module directly
+            submodule._fsdp_use_orig_params = use_orig_params  # type: ignore[assignment]
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/_exec_order_utils.py b/MLPY/Lib/site-packages/torch/distributed/fsdp/_exec_order_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..4770469c06464b77dbcff0e866ce014d143f5088
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/fsdp/_exec_order_utils.py
@@ -0,0 +1,364 @@
+import itertools
+import warnings
+from enum import auto, Enum
+from typing import Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.distributed as dist
+import torch.distributed.fsdp._traversal_utils as traversal_utils
+import torch.nn as nn
+from torch.distributed.fsdp._common_utils import _FSDPState, _get_param_to_fqns
+from torch.distributed.fsdp._flat_param import FlatParamHandle
+
+
+class _ExecOrderWarnStatus(Enum):
+    """Used internally for execution order validation."""
+
+    NONE = auto()  # no deviation yet
+    WARNING = auto()  # deviated this iteration; currently issuing warnings
+    WARNED = auto()  # deviated in a previous iteration
+
+
+class _ExecOrderData:
+    """
+    This contains the data structures to track the execution order. We track
+    the pre-forward order on the *first* iteration for forward prefetching
+    (which thus assumes static graph) and the post-forward order on *every*
+    iteration for backward prefetching (which thus does not assume static
+    graph but may provide an incorrect order).
+    """
+
+    def __init__(
+        self,
+        debug_level: dist.DebugLevel,
+        backward_prefetch_limit: int,
+        forward_prefetch_limit: int,
+    ) -> None:
+        # Tracks the (static) pre-forward order for execution order validation
+        # and forward prefetching
+        self.handles_pre_forward_order: List[FlatParamHandle] = []
+        # Tracks the post-forward order for pre-backward prefetching
+        self.handles_post_forward_order: List[Optional[FlatParamHandle]] = []
+        self._iter = 0
+
+        # Gives the max number of backward/forward prefetched all-gathers by a
+        # single module
+        self._backward_prefetch_limit = backward_prefetch_limit
+        self._forward_prefetch_limit = forward_prefetch_limit
+
+        # Data structures for execution order validation
+        self._checking_order: bool = debug_level == dist.DebugLevel.DETAIL
+        self.process_group: Optional[dist.ProcessGroup] = None
+        self.world_size: Optional[int] = None
+        self.all_handles: List[FlatParamHandle] = []
+        # Names are prefixed from the root module
+        self.param_to_fqn: Dict[nn.Parameter, List[str]] = {}
+        # Current index in the pre-forward execution order
+        self.current_order_index = 0
+        self.warn_status = _ExecOrderWarnStatus.NONE
+
+    def init(
+        self,
+        state: _FSDPState,
+        root_module: nn.Module,
+        process_group: dist.ProcessGroup,
+    ) -> None:
+        """
+        Initializes the data structures needed for checking the forward order.
+        This should be called after a root FSDP instance has been set during
+        lazy initialization.
+        """
+        self.process_group = process_group
+        self.rank = process_group.rank()
+        self.world_size = process_group.size()
+        # Fix an order over the handles, which should be the same across ranks
+        for handle in traversal_utils._get_fsdp_handles(root_module):
+            index = len(self.all_handles)
+            self.all_handles.append(handle)
+            handle._handle_index = index
+        self.param_to_fqn = _get_param_to_fqns(root_module)
+        # TODO (awgu): We can broadcast the metadata of rank 0's `all_handles`
+        # to check that all ranks have the same handles in the same order.
+        # https://github.com/pytorch/pytorch/issues/79620
+
+    @property
+    def is_first_iter(self) -> bool:
+        return self._iter == 0
+
+    def get_handle_to_backward_prefetch(
+        self,
+        current_handle: FlatParamHandle,
+    ) -> Optional[FlatParamHandle]:
+        """
+        Returns a :class:`list` of the handles keys of the handles to backward
+        prefetch given the current handles key. If there are no valid handles
+        keys to prefetch, then this returns an empty :class:`list`.
+        """
+        current_index = current_handle._post_forward_index
+        if current_index is None:
+            return None
+        target_index = current_index - 1
+        target_handle: Optional[FlatParamHandle] = None
+        for _ in range(self._backward_prefetch_limit):
+            if target_index < 0:
+                break
+            target_handle = self.handles_post_forward_order[target_index]
+            target_index -= 1
+        return target_handle
+
+    def get_handle_to_forward_prefetch(
+        self,
+        current_handle: FlatParamHandle,
+    ) -> Optional[FlatParamHandle]:
+        """
+        Returns a :class:`list` of the handles keys of the handles to forward
+        prefetch given the current handles key. If there are no valid handles
+        keys to prefetch, then this returns an empty :class:`list`.
+        """
+        current_index = current_handle._pre_forward_order_index
+        if current_index is None:
+            return None
+        target_index = current_index + 1
+        target_handle: Optional[FlatParamHandle] = None
+        for _ in range(self._forward_prefetch_limit):
+            if target_index >= len(self.handles_pre_forward_order):
+                break
+            target_handle = self.handles_pre_forward_order[target_index]
+            target_index += 1
+        return target_handle
+
+    def record_post_forward(self, handle: Optional[FlatParamHandle]) -> None:
+        """
+        Records ``handle`` in the post-forward order, where ``handle`` is the
+        handle used in the module's forward. If ``handle`` is ``None``, then it
+        is omitted.
+
+        Unlike :meth:`record_pre_forward`, this records the order *every*
+        iteration with the expectation that the recorded order is reset in
+        :meth:`next_iter`.
+        """
+        if not handle:
+            return
+        # Only record the first usage of a handle
+        if handle._post_forward_index:
+            self.handles_post_forward_order.append(handle)
+            return
+        index = len(self.handles_post_forward_order)
+        handle._post_forward_index = index
+        self.handles_post_forward_order.append(handle)
+
+    def record_pre_forward(
+        self, handle: Optional[FlatParamHandle], is_training: bool
+    ) -> None:
+        """
+        Records ``handle`` in the pre-forward order, where ``handle`` is the
+        handle used in the module's forward. If ``handle`` is ``None``, then it
+        is omitted.
+
+        On the first iteration, this checks the execution order across ranks.
+        See :meth:`_check_order` for details.
+        """
+        if not handle:
+            return
+        self._check_order(handle, is_training)
+        # Fix the order after the first iteration and only record the first
+        # usage of a handle
+        if not self.is_first_iter or handle._pre_forward_order_index is not None:
+            return
+        index = len(self.handles_pre_forward_order)
+        handle._pre_forward_order_index = index
+        self.handles_pre_forward_order.append(handle)
+
+    def _check_order(self, handle: FlatParamHandle, is_training: bool) -> None:
+        """
+        Checks the forward execution order as long as ``is_training`` is
+        ``True`` since checking in eval mode is not supported. This only checks
+        if the distributed debug level is DETAIL.
+
+        - On the first iteration, this uses all-gathers to check that all ranks
+        are all-gathering the same handles and hence ``FlatParameter`` s,
+        raising an error if not.
+        - On subsequent iterations, this checks that each rank is locally
+        consistent with its own forward order from the first iteration, issuing
+        a warning if not. This issues a warning on the first deviating
+        iteration and stops warning thereafter.
+        """
+        # Do not check order in eval mode since the post-backward callback does
+        # not run so it cannot be used to mark the end of an iteration
+        if not is_training or not self._checking_order:
+            return
+        if self.is_first_iter:
+            msg_prefix = "Forward order differs across ranks:"
+            optional_local_indices: Tuple[
+                Optional[int], ...
+            ] = self._get_handle_indices(handle)
+            device = handle.device  # guaranteed to be non-CPU
+            num_valid_indices = sum(
+                (index is not None) for index in optional_local_indices
+            )
+            tensor_kwargs: Dict[str, Union[torch.dtype, torch.device]] = {
+                "dtype": torch.int32,
+                "device": device,
+            }
+            world_num_valid_indices = torch.zeros(self.world_size, **tensor_kwargs)  # type: ignore[arg-type, call-overload]
+            local_num_valid_indices = torch.tensor([num_valid_indices], **tensor_kwargs)  # type: ignore[arg-type, call-overload]
+            dist.all_gather_into_tensor(
+                world_num_valid_indices,
+                local_num_valid_indices,
+                group=self.process_group,
+            )
+            # Copy entire tensor from D2H once to avoid per element D2H copies
+            world_num_valid_indices = world_num_valid_indices.cpu()
+            # Check that all ranks plan to all-gather the same number of
+            # parameters
+            # TODO (awgu): Since every module has at most one handle in the
+            # current implementation, this should never raise the error.
+            assert self.world_size is not None  # mypy
+            if not torch.distributed._functional_collectives.is_torchdynamo_compiling():
+                # TODO(voz): Don't graph break on this - dynamo hates the n1 != n2
+                # tensor comparison control flow.
+                # https://github.com/pytorch/pytorch/issues/107055
+                for (r1, n1), (r2, n2) in itertools.combinations(
+                    (
+                        (rank, world_num_valid_indices[rank])
+                        for rank in range(self.world_size)
+                    ),
+                    2,
+                ):
+                    if n1 != n2:
+                        raise RuntimeError(
+                            f"{msg_prefix} rank {r1} is all-gathering {n1} parameters "
+                            f"while rank {r2} is all-gathering {n2} parameters"
+                        )
+            world_indices = torch.zeros(  # type: ignore[call-overload]
+                self.world_size * num_valid_indices, **tensor_kwargs
+            )
+            local_indices = torch.tensor(optional_local_indices, **tensor_kwargs)  # type: ignore[arg-type]
+            dist.all_gather_into_tensor(
+                world_indices, local_indices, group=self.process_group
+            )
+            # Copy entire tensor from D2H once to avoid per element D2H copies
+            world_indices = world_indices.cpu()
+            # Check that all ranks plan to all-gather the same index parameters
+            if not torch.distributed._functional_collectives.is_torchdynamo_compiling():
+                # TODO(voz): Don't graph break on this - dynamo hates the i1 != i2
+                # tensor comparison control flow.
+                # https://github.com/pytorch/pytorch/issues/107055
+                for (r1, i1), (r2, i2) in itertools.combinations(
+                    (
+                        (
+                            rank,
+                            world_indices[
+                                rank
+                                * num_valid_indices : (rank + 1)
+                                * num_valid_indices
+                            ],
+                        )
+                        for rank in range(self.world_size)
+                    ),
+                    2,
+                ):
+                    if i1 != i2:
+                        r1_param_names = self._get_names_from_handle_indices(i1)
+                        r2_param_names = self._get_names_from_handle_indices(i2)
+                        raise RuntimeError(
+                            f"{msg_prefix} rank {r1} is all-gathering parameters "
+                            f"for {r1_param_names} while rank {r2} is all-gathering "
+                            f"parameters for {r2_param_names}"
+                        )
+        else:
+            # Only issue warnings on the first deviating iteration and stop
+            # checking thereafter to avoid flooding the console
+            if self.warn_status == _ExecOrderWarnStatus.WARNED:
+                return
+            msg_prefix = None  # non-`None` means we should warn
+            if self.current_order_index >= len(self.handles_pre_forward_order):
+                # This iteration sees extra all-gather(s) compared to the first
+                msg_prefix = (
+                    "Expected to not all-gather any more parameters in the "
+                    "forward but trying to all-gather parameters for "
+                )
+            else:
+                expected_handle = self.handles_pre_forward_order[
+                    self.current_order_index
+                ]
+                if expected_handle != handle:
+                    expected_param_names = self._get_names_from_handles(expected_handle)
+                    msg_prefix = (
+                        f"Expected to all-gather for {expected_param_names} "
+                        "but trying to all-gather parameters for "
+                    )
+            if msg_prefix is not None:
+                param_names = self._get_names_from_handles(handle)
+                msg_suffix = (
+                    f"{param_names}"
+                    if param_names
+                    else "a newly-added parameter since construction time"
+                )
+                warnings.warn(
+                    "Forward order differs from that of the first iteration "
+                    f"on rank {self.rank}. Collectives are unchecked and may "
+                    f"give incorrect results or hang.\n{msg_prefix}{msg_suffix}"
+                )
+                self.warn_status = _ExecOrderWarnStatus.WARNING
+            self.current_order_index += 1
+
+    def _get_handle_indices(
+        self,
+        handle: FlatParamHandle,
+    ) -> Tuple[Optional[int], ...]:
+        """
+        Returns the handle indices (i.e. indices into ``self.all_handles``)
+        corresponding to ``handle``. An entry in the returned tuple is ``None``
+        if the handle is invalid.
+        """
+        indices: List[Optional[int]] = []
+        if handle:
+            indices.append(handle._handle_index)
+        return tuple(indices)
+
+    def _get_names_from_handle_indices(
+        self,
+        handle_indices: Tuple[int, ...],
+    ) -> List[List[str]]:
+        """
+        Returns a list of FQNs for each handle in ``handle_indices``. If a
+        handle index is invalid, then its FQNs are omitted from the returned
+        list.
+        """
+        fqns: List[List[str]] = []
+        for index in handle_indices:
+            if index is None or index < 0 or index >= len(self.all_handles):
+                continue
+            handle = self.all_handles[index]
+            flat_param = handle.flat_param
+            fqns.append(self.param_to_fqn[flat_param])
+        return fqns
+
+    def _get_names_from_handles(
+        self,
+        handle: FlatParamHandle,
+    ) -> List[List[str]]:
+        """
+        Returns a list of FQNs for ``handle``. If the handle is invalid, then
+        its FQNs are omitted from the returned list.
+        """
+        fqns: List[List[str]] = []
+        if handle:
+            flat_param = handle.flat_param
+            if flat_param in self.param_to_fqn:
+                fqns.append(self.param_to_fqn[flat_param])
+        return fqns
+
+    def next_iter(self):
+        """
+        Advances the internal data structures per iteration. This should be
+        called in the post-backward callback since that marks the true end of
+        an iteration.
+        """
+        self._iter += 1
+        self.handles_post_forward_order.clear()
+        if self._checking_order:
+            self.current_order_index = 0
+            if self.warn_status == _ExecOrderWarnStatus.WARNING:
+                self.warn_status = _ExecOrderWarnStatus.WARNED
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/_flat_param.py b/MLPY/Lib/site-packages/torch/distributed/fsdp/_flat_param.py
new file mode 100644
index 0000000000000000000000000000000000000000..51a4ebbb27127d3fc54b3afab1f1bf28e1a3f367
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/fsdp/_flat_param.py
@@ -0,0 +1,2731 @@
+import contextlib
+import functools
+import logging
+import os
+import warnings
+from enum import auto, Enum
+from itertools import accumulate, chain
+from typing import (
+    Any,
+    Callable,
+    cast,
+    Dict,
+    Generator,
+    Iterator,
+    List,
+    NamedTuple,
+    no_type_check,
+    Optional,
+    Sequence,
+    Set,
+    Tuple,
+    Union,
+)
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+from torch.distributed.fsdp._common_utils import (
+    _FSDPDeviceHandle,
+    _named_parameters_with_duplicates,
+    _no_dispatch_record_stream,
+    _set_fsdp_flattened,
+    HandleTrainingState,
+)
+from torch.distributed.utils import (
+    _alloc_storage,
+    _data_ptr_allocated,
+    _free_storage,
+    _p_assert,
+)
+from torch.nn.parameter import _ParameterMeta  # type: ignore[attr-defined]
+from torch.testing._internal.distributed.fake_pg import FakeProcessGroup
+
+from ._fsdp_extensions import (
+    _ext_post_unflatten_transform,
+    _ext_pre_flatten_transform,
+    FSDPExtensions,
+)
+
+__all__ = [
+    "FlatParameter",
+    "FlatParamHandle",
+    "FlatParamShardMetadata",
+    "ParamInfo",
+    "SharedParamInfo",
+    "HandleShardingStrategy",
+]
+
+log = logging.getLogger(__name__)
+
+
+"""
+[Note: Fully Sharded Module]
+We define the "fully sharded module" to be the original ``nn.Module`` that owns
+a ``FlatParamHandle``. It is the *single* module logically responsible for the
+*single* unshard/reshard pair for the handle's ``FlatParameter`` for a given
+forward or backward pass. The fully sharded module should be passed to the
+``FlatParamHandle`` constructor.
+
+For the wrapper code path:
+- The ``FullyShardedDataParallel`` module wrapping the fully sharded module
+runs the unshard/reshard on behalf of the fully sharded module by overriding
+``nn.Module.forward``.
+- The fully sharded module is exactly the module passed to the
+``FullyShardedDataParallel`` constructor's ``module`` argument.
+
+For the non-wrapper code path:
+- Hooks registered on the fully sharded module run the unshard/reshard.
+- The fully sharded module may either be the direct argument to ``fully_shard``
+or a submodule chosen by the provided wrapping policy.
+"""
+
+# Environment variable toggling whether to use unsafe `setattr()` for view
+# setting in `_use_sharded_views()` and `_use_unsharded_views()`
+# We should use 'safe' by default since it respects method overrides, but for
+# special cases, such as when the CPU overhead is too high or when we want to
+# intentionally bypass checks in the overrides, we may use 'unsafe'.
+_FSDP_USE_UNSAFE_SETATTR = "FSDP_USE_UNSAFE_SETATTR"
+
+# Environment variable toggling whether to check for parameter/gradient
+# writeback in case their storages change after FSDP initialization
+# We should check by default since it prevents silent correctness errors, but
+# since such changes are atypical, we may want to skip the check to save CPU
+# overhead, especially since the check happens in the pre-forward and
+# pre-backward each iteration.
+_FSDP_SKIP_WRITEBACK_CHECK = "FSDP_SKIP_WRITEBACK_CHECK"
+
+# Env var toggling whether when model is in .eval() mode, should we run in fp32
+# or the reduced precision.
+_FSDP_USE_FULL_PREC_IN_EVAL = "FSDP_USE_FULL_PREC_IN_EVAL"
+
+# Some value to set padding in tensors to for debuggability
+_FLAT_PARAM_PADDING_VALUE = 42
+
+# Environment variables for disabling the all-gather and reduce-scatter
+# communication ops for ablation studies. Note that without these communication
+# ops the training won't converge, and you probably need to disable correctness
+# checks in your model.
+_FSDP_USE_FAKE_ALL_GATHER = "FSDP_USE_FAKE_ALL_GATHER"
+_FSDP_USE_FAKE_REDUCE = "FSDP_USE_FAKE_REDUCE"
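+
+# For example (illustrative only; these variables are read at handle
+# construction time, so they must be set before FSDP initialization, e.g. in
+# the launch environment):
+#
+#   FSDP_USE_FAKE_ALL_GATHER=1 FSDP_USE_FAKE_REDUCE=1 torchrun train.py
+#
+# `train.py` here is a placeholder for the user's training script.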
+
+
+# TODO: Define this for now to avoid circular imports. See if we can remove.
+class HandleShardingStrategy(Enum):
+    FULL_SHARD = auto()
+    SHARD_GRAD_OP = auto()
+    NO_SHARD = auto()
+    HYBRID_SHARD = auto()
+    _HYBRID_SHARD_ZERO2 = auto()
+
+
+RESHARD_AFTER_FORWARD_HANDLE_STRATEGIES = (
+    HandleShardingStrategy.FULL_SHARD,
+    HandleShardingStrategy.HYBRID_SHARD,
+)
+NO_RESHARD_AFTER_FORWARD_HANDLE_STRATEGIES = (
+    HandleShardingStrategy.SHARD_GRAD_OP,
+    HandleShardingStrategy._HYBRID_SHARD_ZERO2,
+)
+
+
+class ParamInfo(NamedTuple):
+    """Information for an original parameter."""
+
+    param_name: str  # unprefixed
+    module: nn.Module
+    module_name: str
+
+
+class SharedParamInfo(NamedTuple):
+    """
+    Additional information for a shared parameter.
+
+    For each shared parameter, we designate one module and its parameter
+    variable to be the primary owner, determined as the first one encountered
+    in the parameter walk. These are prefixed with "prim". The primary module
+    and parameter do not have their own :class:`SharedParamInfo` instance.
+    """
+
+    param_name: str  # unprefixed
+    module: nn.Module
+    module_name: str
+    prim_param_name: str  # unprefixed
+    prim_module: nn.Module
+    prim_module_name: str
+
+
+class _ShardParamInfo(NamedTuple):
+    """Shard-related information for an original parameter."""
+
+    in_shard: bool
+    # Use to index into the sharded flat parameter, e.g.
+    # `flat_param[offset_in_shard : offset_in_shard + numel_in_shard]`
+    offset_in_shard: Optional[int]
+    numel_in_shard: Optional[int]
+    # Use to get part of the parameter in the local shard from a flattened
+    # version of the unsharded parameter, e.g.
+    # `param.flatten()[intra_param_start_idx : intra_param_end_idx + 1]`
+    intra_param_start_idx: Optional[int]
+    intra_param_end_idx: Optional[int]  # inclusive
+
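+# Worked example for ``_ShardParamInfo`` (values are made up): if an original
+# parameter occupies indices [4, 9] of the unsharded flat parameter and this
+# rank's shard covers unsharded indices [8, 15], then the parameter's entry
+# would be
+#   _ShardParamInfo(in_shard=True, offset_in_shard=0, numel_in_shard=2,
+#                   intra_param_start_idx=4, intra_param_end_idx=5),
+# i.e. only the parameter's last two elements land in this rank's shard.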
+
+class FlatParamShardMetadata(NamedTuple):
+    """
+    This holds metadata specific to this rank's shard of the flat parameter.
+
+    Attributes:
+        param_names (Tuple[str, ...]): Prefixed parameter names of this rank's
+            shard of the parameters; see :class:`FlatParameter`.
+        param_shapes (Tuple[torch.Size, ...]): Parameter shapes of this rank's
+            shard of the parameters; see :class:`FlatParameter`.
+        param_numels (Tuple[int, ...]): Parameter numels of this rank's shard
+            of the parameters; see :class:`FlatParameter`.
+        param_offsets (Tuple[Tuple[int, int], ...]): [start, end] offsets (in
+            units of numels) giving this rank's part of each flattened
+            original parameter.
+    """
+
+    param_names: Tuple[str, ...]
+    param_shapes: Tuple[torch.Size, ...]
+    param_numels: Tuple[int, ...]
+    param_offsets: Tuple[Tuple[int, int], ...]
+
+
+class _FlatParameterMeta(_ParameterMeta):
+    # Make `isinstance(t, FlatParameter)` return True for custom tensor
+    # instances that have the _is_flat_param flag for BC
+    def __instancecheck__(self, instance):
+        # NB: do NOT test the super implementation
+        return isinstance(instance, torch.Tensor) and getattr(
+            instance, "_is_flat_param", False
+        )
+
+
+class FlatParameter(nn.Parameter, metaclass=_FlatParameterMeta):
+    """
+    This is the flat parameter used by :class:`FullyShardedDataParallel`.
+
+    It is composed of one or more original parameters, which are flattened and
+    concatenated to construct the flat parameter.
+
+    Under the current design, this parameter logically represents both the
+    unsharded and sharded flat parameter, and its data changes storages
+    dynamically.
+        - In the :class:`FullyShardedDataParallel` constructor, the parameter
+        is initialized as unsharded and then sharded in-place.
+        - At runtime, the parameter is lazily (re)-initialized. The sharded
+        parameter data is saved in ``self._local_shard``, and a new ``Tensor``
+        ``self._full_param_padded`` is created, which is the all-gather
+        destination and owns the unsharded parameter storage thereafter. (See
+        :meth:`FlatParamHandle.init_flat_param_attributes`.)
+        - Throughout runtime, the parameter data changes storages as needed,
+        e.g. to the sharded flat parameter, low precision sharded flat
+        parameter, or the unsharded flat parameter.
+
+    NOTE: Since ``use_orig_params=True`` supports intra-``FlatParameter``
+    padding, we have two versions of the per-parameter numels, one that
+    includes the padding (``_numels_with_padding``) and one that does not
+    (``_numels``). The former may have length longer than the other data
+    structures, while the latter has the same length as the number of actual
+    original parameters like the other per-parameter data structures.
+
+    NOTE: This is not a real class; instead, you will always get a Parameter
+    back out if you try to create one of these.  This is similar to the trick
+    we implemented for Parameter to get it to work with subclasses; this
+    is primarily so that FlatParameter supports combination with FakeTensor.
+
+    Attributes:
+        _unpadded_unsharded_size (torch.Size): Unsharded flat parameter's size
+            without right-hand-side padding for divisibility by the world size.
+            For ``use_orig_params=True``, this includes alignment padding.
+        _padded_unsharded_size (torch.Size): Unsharded flat parameter's size
+            with right-hand-side padding for divisibility by the world size.
+            For ``use_orig_params=True``, this includes alignment padding. This
+            is only set for sharded strategies since they require padding for
+            the all-gather.
+        _sharded_size (torch.Size): Sharded flat parameter's size with padding.
+            This is also set for ``NO_SHARD``, in which case it is the same as
+            the unsharded sizes. (We omit "padded" because there is no
+            analogous unpadded one.)
+
+        _num_params (int): Number of original parameters flattened into this
+            flat parameter. This is the length of the per-parameter data
+            structures.
+        _param_infos (Tuple[ParamInfo, ...]): Each parameter's parameter info
+            entry; see :class:`ParamInfo` for details.
+        _shapes (Tuple[torch.Size, ...]): Each parameter's original shape.
+        _fqns (Tuple[str, ...]): Each parameter's fully-qualified name (FQN)
+            prefixed from the ``_fully_sharded_module``. The names are
+            guaranteed to be unique in the subtree rooted at that module.
+        _param_extensions (Tuple[Optional[Any], ...]): Each parameter's
+            extension (i.e. some per-parameter state) used to customize
+            pre-flatten and post-unflatten behavior or ``None``. This is
+            experimental, and users should not depend on its existence in the
+            future.
+        _numels_with_padding (Tuple[int, ...]): Each parameter's numel
+            including entries for the padding. This is used to construct views
+            into the flat parameter via ``torch.split()``. This may have length
+            longer than ``_num_params``.
+        _numels (Tuple[int, ...]): Each parameter's numel excluding entries for
+            padding. This has length equal to ``_num_params``.
+        _shard_param_infos (Tuple[_ShardParamInfo, ...]): Each parameter's
+            shard parameter info; see :class:`_ShardParamInfo` for details.
+        _shared_param_infos (Tuple[SharedParamInfo, ...]): Shared parameter
+            info entries; see :class:`SharedParamInfo` for details.
+        _modules (Set[nn.Module]): Modules that contain some original parameter
+            that is flattened into the flat parameter.
+
+        _shard_numel_padded (int): Numel padded for this rank's sharded flat
+            parameter.
+        _local_shard (Tensor): Sharded flat parameter with padding if using a
+            sharded strategy. If using ``NO_SHARD``, then this is the unpadded
+            unsharded flat parameter, and there is no notion of a sharded flat
+            parameter or padded unsharded flat parameter.
+        _full_param_padded (Tensor): Unsharded flat parameter with padding.
+            This is not defined for ``NO_SHARD``. When using mixed precision
+            for parameters, this has the low precision.
+        _full_prec_full_param_padded (Tensor): Full precision unsharded flat
+            parameter with padding. This is used for unsharding outside of
+            computation when using mixed precision for parameters. This is
+            never defined for ``NO_SHARD``.
+        _post_backward_hook_handle (RemovableHandle):
+            Flat parameter's post-backward hook handle. (Compile only)
+        _post_backward_hook_state (Tuple[AccumulateGrad, RemovableHandle]):
+            Flat parameter's :class:`AccumulateGrad` object and post-backward
+            hook handle. (Eager only)
+        _mp_shard (Tensor): Low precision sharded flat parameter with padding.
+            This is only defined when parameter mixed precision is enabled. For
+            ``NO_SHARD``, this is used for computation.
+        _cpu_grad (Tensor): Sharded gradient with padding stored on CPU.
+            This is only defined when offloading parameters is enabled.
+        _saved_grad_shard (Tensor): Sharded gradient with padding from previous
+            iterations for gradient accumulation without :meth:`no_sync`.
+
+        _params (Optional[List[nn.Parameter]]): If ``use_orig_params=True``,
+            then each original parameter variable; otherwise, ``None``. This
+            does not include any padding tensors.
+        _shared_params (Optional[List[nn.Parameter]]): The original shared
+            parameter variables if ``use_orig_params=True`` and ``None``
+            otherwise.
+        _tensors (Optional[List[Optional[Tensor]]]): This saves the ``Tensor``
+            views created in the forward and tracked by autograd when
+            ``use_orig_params=True`` and is ``None`` otherwise. This is to
+            preserve those ``Tensor`` variables for the backward to ensure that
+            the ``FlatParameter`` 's ``AccumulateGrad`` object does not change,
+            since if it changed, the post-backward hook would not run. This is relevant
+            for cases like reentrant activation checkpointing.
+        _is_grad_none_mask (Optional[List[bool]]): If ``use_orig_params=True``,
+            a mask over the original parameters' gradients indicating if it is
+            logically ``None`` or not; otherwise, ``None``. This does not
+            include entries for padding. This mask is needed because only some
+            of the parameters may have ``None`` gradient, in which case the
+            flat gradient must be non-``None`` and must use zeros to
+            approximate those original ``None`` gradients. This mask informs
+            FSDP to set the original parameter gradients to ``None`` (instead
+            of zeros) as needed.
+    """
+
+    _unpadded_unsharded_size: torch.Size
+    _padded_unsharded_size: torch.Size
+    _sharded_size: torch.Size
+    _num_params: int
+    _param_infos: Tuple[ParamInfo, ...]
+    _shapes: Tuple[torch.Size, ...]
+    _fqns: Tuple[str, ...]
+    _param_extensions: Tuple[Optional[Any], ...]
+    _numels_with_padding: Tuple[int, ...]
+    _numels: Tuple[int, ...]
+    _shard_param_infos: Tuple[_ShardParamInfo, ...]
+    _shared_param_infos: Tuple[SharedParamInfo, ...]
+    _modules: Set[nn.Module]
+    _shard_numel_padded: int
+    _local_shard: Tensor
+    _full_param_padded: Tensor
+    _full_prec_full_param_padded: Tensor
+    # Eager only
+    _post_backward_hook_state: Tuple[Any, Any]
+    # Compile only
+    _post_backward_hook_handle: Any
+    _mp_shard: Tensor
+    _cpu_grad: Tensor
+    _saved_grad_shard: Tensor
+    _params: Optional[List[nn.Parameter]]
+    _shared_params: Optional[List[nn.Parameter]]
+    _tensors: Optional[List[Optional[Tensor]]]
+    _is_grad_none_mask: Optional[List[bool]]
+
+    _is_padding_mask: List[bool]
+
+    def __new__(cls, data=None, requires_grad=True):
+        assert cls is FlatParameter, "subclasses FlatParameter not supported"
+        r = nn.Parameter.__new__(nn.Parameter, data, requires_grad)  # type: ignore[call-arg]
+        r._is_flat_param = True  # type: ignore[attr-defined]
+        return r
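+
+    # Rough behavioral sketch (not executed here): because ``__new__`` returns
+    # a plain ``nn.Parameter`` tagged with ``_is_flat_param`` and the metaclass
+    # overrides ``__instancecheck__``, one would expect
+    #   p = FlatParameter(torch.zeros(4), requires_grad=False)
+    #   type(p) is nn.Parameter       # True
+    #   isinstance(p, FlatParameter)  # True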
+
+    # NB: This is not a regular method, because FlatParameters are not actually
+    # instances of this class (see __new__ above). So this must be called
+    # through the classmethod, passing the instance explicitly, e.g.
+    # ``FlatParameter._init_metadata(flat_param, ...)``.
+    @classmethod
+    def _init_metadata(
+        cls,
+        self,
+        param_infos: List[ParamInfo],
+        numels: List[int],
+        shapes: List[torch.Size],
+        fqns: List[str],
+        shared_param_infos: List[SharedParamInfo],
+        param_extensions: List[Optional[Any]],
+        params: Optional[List[nn.Parameter]],
+        shared_params: Optional[List[nn.Parameter]],
+        is_padding_mask: List[bool],
+    ) -> None:
+        """
+        Initialize attributes holding metadata about the original parameters comprising the flat parameter.
+
+        We expose this method separately from the constructor to keep the
+        constructor only responsible for the flat parameter's tensor data. This
+        method should only be called once per model, while the constructor may
+        be called multiple times, e.g. when reloading from a checkpoint, in
+        which case only the tensor data needs to be passed to the constructor.
+        Since :meth:`load_state_dict` is implemented via :meth:`copy_`, the
+        metadata is correctly assumed to be unchanged.
+
+        Args:
+            See the Attributes in the class docstring.
+        """
+        assert len(param_infos) == len(shapes)
+        assert len(param_infos) == len(fqns)
+        assert len(param_infos) == len(param_extensions)
+        self._num_params = len(param_infos)
+        self._param_infos = param_infos
+        self._shapes = shapes
+        self._fqns = fqns
+        self._param_extensions = param_extensions
+        self._is_padding_mask = is_padding_mask
+
+        numels_without_padding: List[int] = []
+        for numel, is_padding in zip(numels, is_padding_mask):
+            if not is_padding:
+                numels_without_padding.append(numel)
+        self._numels = tuple(numels_without_padding)
+        self._numels_with_padding = tuple(numels)
+        assert len(self._numels) == self._num_params
+
+        self._shared_param_infos = tuple(shared_param_infos)
+        self._modules = {pi.module for pi in self._param_infos}.union(
+            {spi.module for spi in self._shared_param_infos}
+        )
+        assert (params is None) == (shared_params is None)
+        if params is not None:
+            assert shared_params is not None and len(shared_params) == len(
+                shared_param_infos
+            )
+            self._params = []
+            for param, is_padding in zip(params, is_padding_mask):
+                if not is_padding:
+                    self._params.append(param)
+            self._shared_params = shared_params
+            # Mark the original parameters to avoid flattening them into
+            # another `FlatParameter` during recursive construction
+            for param in chain(self._params, self._shared_params):
+                _set_fsdp_flattened(param)
+            self._is_grad_none_mask = [False for _ in range(self._num_params)]
+            self._tensors = [None for _ in range(self._num_params)]
+        else:
+            self._params = None
+            self._shared_params = None
+            self._is_grad_none_mask = None
+            self._tensors = None
+        self._unpadded_unsharded_size = self.size()
+        _set_fsdp_flattened(self)
+        # Tracks whether the `FlatParameter`'s post-backward hook has been
+        # called to modify the behavior of the post-backward callback
+        self._post_backward_called = False
+
+
+class FlatParamHandle:
+    """
+    A handle that manages a flat parameter (:class:`FlatParameter`).
+
+    This includes sharding and view management.
+
+    Args:
+        params (Sequence[nn.Parameter]): The parameters to flatten into the
+            flat parameter.
+        fully_sharded_module (nn.Module): See [Note: Fully Sharded Module].
+        device (torch.device): The compute and communication device, which
+            should be a non-CPU device. We refer to it as the compute device.
+        sharding_strategy (ShardingStrategy): Sharding strategy to apply to
+            this handle's ``FlatParameter``.
+        offload_params (bool): Whether to offload the handle's
+            ``FlatParameter`` to CPU.
+        mp_param_dtype (Optional[torch.dtype]): Parameter mixed precision
+            setting passed to the FSDP constructor.
+        mp_reduce_dtype (Optional[torch.dtype]): Gradient reduction mixed
+            precision setting passed to the FSDP constructor.
+        keep_low_precision_grads (bool): Whether to keep gradients in low
+            precision.
+        use_orig_params (bool): If ``True``, then FSDP preserves the original
+            parameter variables and returns them from ``named_parameters()``
+            (e.g. to support different optimizer hyperparameters within one
+            :class:`FlatParameter`). If ``False``, then FSDP reconstructs the
+            parameters every iteration and returns the :class:`FlatParameter` s
+            from ``named_parameters()``.
+    """
+
+    ##################
+    # INITIALIZATION #
+    ##################
+    def __init__(
+        self,
+        params: Sequence[Union[nn.Parameter, Tensor]],
+        fully_sharded_module: nn.Module,
+        device: torch.device,
+        sharding_strategy: HandleShardingStrategy,
+        offload_params: bool,
+        mp_param_dtype: Optional[torch.dtype],
+        mp_reduce_dtype: Optional[torch.dtype],
+        keep_low_precision_grads: bool,
+        process_group: dist.ProcessGroup,
+        use_orig_params: bool,
+        *,
+        fsdp_extension: Optional[FSDPExtensions] = None,
+    ):
+        super().__init__()
+        params = list(params)
+        if len(params) == 0:
+            raise ValueError(
+                f"Cannot construct a {self.__class__.__name__} with an empty parameter list"
+            )
+        self._init_setattr_fns()
+        self._skip_writeback_check = (
+            os.environ.get(_FSDP_SKIP_WRITEBACK_CHECK, "") == "1"
+        )
+        self._use_full_prec_in_eval = (
+            os.environ.get(_FSDP_USE_FULL_PREC_IN_EVAL, "") == "1"
+        )
+        self._use_fake_all_gather = os.environ.get(_FSDP_USE_FAKE_ALL_GATHER, "") == "1"
+        self._use_fake_reduce = os.environ.get(_FSDP_USE_FAKE_REDUCE, "") == "1"
+        if self._skip_writeback_check:
+            _warn_skip_writeback_check(
+                log,
+                f"Since {_FSDP_SKIP_WRITEBACK_CHECK}=1, FSDP will not check "
+                "for parameter or gradient writeback. Changing parameter or "
+                "gradient storages may lead to silent correctness errors.",
+            )
+        if self._use_fake_all_gather:
+            _warn_use_fake_all_gather(
+                log,
+                f"Since {_FSDP_USE_FAKE_ALL_GATHER}=1, FSDP will not execute "
+                "all-gather ops. Your training will be incorrect, but "
+                "can reveal how much time spent on all-gather ops.",
+            )
+        if self._use_fake_reduce:
+            _warn_use_fake_reduce(
+                log,
+                f"Since {_FSDP_USE_FAKE_REDUCE}=1, FSDP will not execute "
+                "reduce-scatter ops. Your training will be incorrect, but "
+                "can reveal how much time spent on reduce-scatter ops.",
+            )
+        # Only align addresses for `use_orig_params=True` (for now)
+        align_addresses = use_orig_params
+        self._init_get_unflat_views_fn(align_addresses)
+        self.device = device
+        self._device_handle = _FSDPDeviceHandle.from_device(self.device)
+        self.process_group = process_group
+        if self._use_fake_all_gather or self._use_fake_reduce:
+            self._fake_process_group = FakeProcessGroup(
+                rank=process_group.rank(), world_size=process_group.size()
+            )
+        self.rank = process_group.rank()
+        self.world_size = process_group.size()
+        self._sharding_strategy = sharding_strategy
+        self._offload_params = offload_params
+        self._use_orig_params = use_orig_params
+        self._keep_low_precision_grads = keep_low_precision_grads
+        self._training_state = HandleTrainingState.IDLE
+        self._debug_level = dist.get_debug_level()
+        self._fully_sharded_module = fully_sharded_module
+        # For strategies that do not free after forward, we skip using sharded
+        # views after forward since the unsharded data exists. We still switch
+        # `self.flat_param` to point to the sharded flat parameter since what
+        # it points to parameterizes behavior. We use the following attribute
+        # to track which tensor data the parameters are unsharded views into.
+        self._unsharded_flat_param_for_skipped_views: Optional[Tensor] = None
+        # The index in the state's `all_handles`, which must be the
+        # same across ranks for the execution order validation to work
+        self._handle_index: Optional[int] = None
+        # Index in handles_to_pre_forward_order
+        self._pre_forward_order_index: Optional[int] = None
+        # Index in `handles_post_forward_order`
+        self._post_forward_index: Optional[int] = None
+        # Used for guarding against mistargeted forward prefetches
+        self._needs_pre_forward_unshard = False
+        # Used for guarding against mistargeted backward prefetches
+        self._needs_pre_backward_unshard = False
+        # Was the handle prefetched? Set on successful _prefetch_handle and unshard
+        self._prefetched = False
+        # Optimistically assume a valid input `params` and set dtype attributes
+        # before `_init_flat_param()`, which performs the actual validation
+        self._orig_param_dtype = params[0].dtype
+        self._init_param_reduce_dtypes(mp_param_dtype, mp_reduce_dtype)
+        assert self._fwd_bwd_param_dtype is not None  # mypy
+        self._aligned_numel = (
+            _get_aligned_numel(unsharded_dtype=self._fwd_bwd_param_dtype)
+            if align_addresses
+            else 0
+        )
+        self._fsdp_extension = fsdp_extension
+        self._init_flat_param_and_metadata(
+            params, fully_sharded_module, self._aligned_numel, use_orig_params  # type: ignore[arg-type]
+        )
+        self._use_unsharded_views(as_params=False)
+
+    def _init_setattr_fns(self):
+        use_unsafe_setattr = os.environ.get(_FSDP_USE_UNSAFE_SETATTR, "") == "1"
+        self._setattr_tensor: Callable[[nn.Module, str, Tensor], None]
+        self._setattr_param: Callable[[nn.Module, str, nn.Parameter], None]
+        if use_unsafe_setattr:
+            self._setattr_tensor = _unsafe_setattr_tensor
+            self._setattr_param = _unsafe_setattr_param
+        else:
+            self._setattr_tensor = _safe_setattr_tensor_or_param
+            self._setattr_param = _safe_setattr_tensor_or_param
+
+    def _init_get_unflat_views_fn(self, align_addresses: bool):
+        self._get_unflat_views = (
+            self._get_unflat_views_aligned
+            if align_addresses
+            else self._get_unflat_views_unaligned
+        )
+
+    def _init_flat_param_and_metadata(
+        self,
+        params: List[Union[Tensor, nn.Parameter]],
+        module: nn.Module,
+        aligned_numel: int,
+        use_orig_params: bool,
+    ) -> None:
+        """
+        Initialize the ``FlatParameter`` and its metadata.
+
+        NOTE: This should only be called once at construction time, after which
+        the ``FlatParameter`` metadata is assumed to be static.
+
+        NOTE: The elements of ``params`` should only be ``Tensor`` s when
+        composing with ``DTensor`` -based tensor parallelism, in which case the
+        elements may be ``DTensor`` local shards.
+        """
+        if len(params) == 0:
+            raise ValueError("Expects non-empty `params`")
+        if aligned_numel < 0:
+            raise ValueError(
+                f"Expects non-negative `aligned_numel` but got {aligned_numel}"
+            )
+        (
+            dtype,
+            flat_param_requires_grad,
+            device,
+        ) = self._validate_tensors_to_flatten(params)
+        params_set = set(params)
+        # For alignment padding, only `numels`, `params_to_flatten`, and
+        # `is_padding_mask` get entries; the other per-parameter lists do not
+        # include entries for padding.
+        param_infos: List[ParamInfo] = []
+        numels: List[int] = []
+        shapes: List[torch.Size] = []
+        fqns: List[str] = []
+        shared_param_infos: List[SharedParamInfo] = []
+        shared_param_memo: Dict[
+            Union[Tensor, nn.Parameter], Tuple[nn.Module, str, str]
+        ] = {}
+        params_to_flatten: List[Union[Tensor, nn.Parameter]] = []
+        shared_params: List[Union[Tensor, nn.Parameter]] = []
+        param_extensions: List[Any] = []
+        is_padding_mask: List[bool] = []
+        total_numel = total_numel_without_padding = 0
+        for submodule_name, submodule in module.named_modules(remove_duplicate=False):
+            for param_name, param in _named_parameters_with_duplicates(
+                submodule, recurse=False
+            ):
+                if param not in params_set:
+                    continue
+                if param in shared_param_memo:  # shared reference
+                    prim_module, prim_module_name, prim_param_name = shared_param_memo[
+                        param
+                    ]
+                    shared_params.append(param)
+                    shared_param_infos.append(
+                        SharedParamInfo(
+                            param_name,
+                            submodule,
+                            submodule_name,
+                            prim_param_name,
+                            prim_module,
+                            prim_module_name,
+                        )
+                    )
+                else:
+                    if aligned_numel > 0:
+                        numel_to_pad = aligned_numel - (total_numel % aligned_numel)
+                        if numel_to_pad > 0 and numel_to_pad < aligned_numel:
+                            padding_tensor = _construct_padding_tensor(
+                                numel_to_pad, dtype, False, device
+                            )
+                            params_to_flatten.append(padding_tensor)
+                            is_padding_mask.append(True)
+                            numels.append(numel_to_pad)
+                            total_numel += numel_to_pad
+                    transform_t, extension = _ext_pre_flatten_transform(
+                        param,
+                        self._fsdp_extension,
+                    )
+                    param = cast(nn.Parameter, transform_t)
+                    param_extensions.append(extension)
+                    shared_param_memo[param] = (submodule, submodule_name, param_name)
+                    params_to_flatten.append(param)
+                    is_padding_mask.append(False)
+                    param_infos.append(ParamInfo(param_name, submodule, submodule_name))
+                    numels.append(param.numel())
+                    shapes.append(param.shape)
+                    fqn = (
+                        submodule_name + "." + param_name
+                        if submodule_name
+                        else param_name
+                    )
+                    fqns.append(fqn)
+                    total_numel += param.numel()
+                    total_numel_without_padding += param.numel()
+        if len(params_to_flatten) == 0:
+            raise ValueError(
+                f"`params` were not found in `module`'s tree"
+                f"params: {params}\nmodule: {module}"
+            )
+        if (
+            self.rank == 0
+            and aligned_numel > 0
+            and total_numel != total_numel_without_padding
+        ):
+            log.info(
+                "FSDP FlatParameter address alignment created "
+                "%s numel of padding (%s vs. %s)",
+                total_numel - total_numel_without_padding,
+                total_numel,
+                total_numel_without_padding,
+            )
+        if aligned_numel > 0:
+            # Pad to be divisible by world size to avoid a copy for the
+            # post-backward reduce-scatter
+            numel_to_pad = self.world_size - (total_numel % self.world_size)
+            if numel_to_pad > 0 and numel_to_pad < self.world_size:
+                if self.rank == 0:
+                    log.info(
+                        "FSDP FlatParameter world size divisibility created "
+                        "%s numel of padding",
+                        numel_to_pad,
+                    )
+                padding_tensor = _construct_padding_tensor(
+                    numel_to_pad, dtype, False, device
+                )
+                params_to_flatten.append(padding_tensor)
+                is_padding_mask.append(True)
+                numels.append(numel_to_pad)
+                total_numel += numel_to_pad
+        # Pass `aligned_numel=0` since we already included padding tensors
+        self.flat_param: FlatParameter = self.flatten_tensors_into_flat_param(
+            params_to_flatten,
+            aligned_numel=0,
+            requires_grad=flat_param_requires_grad,
+        )
+        FlatParameter._init_metadata(
+            self.flat_param,
+            param_infos,
+            numels,
+            shapes,
+            fqns,
+            shared_param_infos,
+            param_extensions,
+            _convert_to_params(params_to_flatten) if use_orig_params else None,
+            _convert_to_params(shared_params) if use_orig_params else None,
+            is_padding_mask,
+        )
+
+    def _validate_tensors_to_flatten(
+        self, tensors: List[Union[Tensor, nn.Parameter]]
+    ) -> Tuple:
+        """Validate the tensors to flatten and returns any necessary metadata."""
+        dtype: Optional[torch.dtype] = None
+        # Return as the logical OR over each tensor's value
+        flat_param_requires_grad: Optional[bool] = None
+        device: Optional[torch.device] = None
+        # For `use_orig_params=True`, permit non-uniform `requires_grad`
+        for tensor in tensors:
+            if isinstance(tensor, FlatParameter):
+                raise ValueError("Cannot flatten a `FlatParameter`")
+            if dtype is None and not tensor.is_floating_point():
+                raise ValueError("Cannot flatten integer dtype tensors")
+            if dtype is not None and tensor.dtype != dtype:
+                raise ValueError(
+                    f"Must flatten tensors with uniform dtype but got {dtype} "
+                    f"and {tensor.dtype}"
+                )
+            if (
+                not self._use_orig_params
+                and flat_param_requires_grad is not None
+                and tensor.requires_grad != flat_param_requires_grad
+            ):
+                raise ValueError(
+                    "Must flatten tensors with uniform `requires_grad` when "
+                    "`use_orig_params=False`"
+                )
+            if device is not None and tensor.device != device:
+                raise ValueError(
+                    "Must flatten tensors on the same device but got both "
+                    f"{device} and {tensor.device}"
+                )
+            dtype = tensor.dtype
+            flat_param_requires_grad = flat_param_requires_grad or tensor.requires_grad
+            device = tensor.device
+        assert flat_param_requires_grad is not None, "Requires non-empty `tensors` list"
+        return dtype, flat_param_requires_grad, device
+
+    def flatten_tensors(
+        self,
+        tensors: List[Tensor],
+        aligned_numel: int,
+    ) -> Tensor:
+        """
+        Flatten ``tensors`` into a single flat tensor.
+
+        The flattening optionally includes
+        padding if ``aligned_numel`` is greater than 0, where ``aligned_numel``
+        gives the numel required to have address alignment.
+
+        NOTE: The padding alignment algorithm must be kept in sync with
+        :meth:`_init_flat_param_metadata`. We separate the two methods because
+        the initialization happens once, whereas this method may be called
+        multiple times throughout training (e.g. for checkpointing).
+        """
+        if len(tensors) == 0:
+            raise ValueError("Expects non-empty `tensors`")
+        if aligned_numel < 0:
+            raise ValueError(
+                f"Expects non-negative `aligned_numel` but got {aligned_numel}"
+            )
+        dtype, _, device = self._validate_tensors_to_flatten(tensors)
+        flat_tensors: List[Tensor] = []
+        if aligned_numel > 0:
+            total_numel = 0
+            for tensor in tensors:
+                numel_to_pad = aligned_numel - (total_numel % aligned_numel)
+                if numel_to_pad > 0 and numel_to_pad < aligned_numel:
+                    padding_tensor = _construct_padding_tensor(
+                        numel_to_pad, dtype, False, device
+                    )
+                    flat_tensors.append(padding_tensor)
+                    total_numel += numel_to_pad
+                flat_tensors.append(torch.flatten(_detach_if_needed(tensor)))
+                total_numel += tensor.numel()
+            numel_to_pad = self.world_size - (total_numel % self.world_size)
+            if numel_to_pad > 0 and numel_to_pad < self.world_size:
+                padding_tensor = _construct_padding_tensor(
+                    numel_to_pad, dtype, False, device
+                )
+                flat_tensors.append(padding_tensor)
+                total_numel += numel_to_pad
+        else:
+            flat_tensors = [
+                torch.flatten(_detach_if_needed(tensor)) for tensor in tensors
+            ]
+        return torch.cat(flat_tensors, dim=0)
+
+    def flatten_tensors_into_flat_param(
+        self,
+        tensors: List[Tensor],
+        aligned_numel: int,
+        requires_grad: bool,
+    ) -> FlatParameter:
+        flat_param_data = self.flatten_tensors(tensors, aligned_numel)
+        return FlatParameter(flat_param_data, requires_grad=requires_grad)
+
+    def _init_param_reduce_dtypes(
+        self,
+        mp_param_dtype: Optional[torch.dtype],
+        mp_reduce_dtype: Optional[torch.dtype],
+    ) -> None:
+        """
+        Initialize param and reduce dtypes.
+
+        Precondition: ``self.flat_param`` is set. This ensures that this
+        handle's parameters have a single dtype.
+
+        Postcondition: This sets ``self._fwd_bwd_param_dtype`` and
+        ``self._reduce_dtype``. If ``mp_param_dtype`` or ``mp_reduce_dtype``
+        is ``None``, then we assume the original parameter dtype. One special
+        case is if ``mp_param_dtype`` is not ``None`` and ``mp_reduce_dtype``
+        is ``None``, in which case we assume the gradient reduction dtype
+        matches the forward/backward parameter dtype.
+        """
+        # Save whether these dtypes were specified so that we permit the
+        # parameter dtype to change up until the lazy initialization
+        self._low_prec_param_dtype_specified = mp_param_dtype is not None
+        self._low_prec_reduce_dtype_specified = mp_reduce_dtype is not None
+        if (
+            self._low_prec_param_dtype_specified
+            and not self._low_prec_reduce_dtype_specified
+        ):
+            # Special case: infer gradient reduction mixed precision
+            self._fwd_bwd_param_dtype = mp_param_dtype
+            self._reduce_dtype = self._fwd_bwd_param_dtype
+        else:
+            self._fwd_bwd_param_dtype = mp_param_dtype or self._orig_param_dtype
+            self._reduce_dtype = mp_reduce_dtype or self._orig_param_dtype
+        assert self._fwd_bwd_param_dtype is not None
+        assert self._reduce_dtype is not None
+
+    ###################################
+    # SHARD INITIALIZATION & METADATA #
+    ###################################
+    @torch.no_grad()
+    def shard(self):
+        """
+        Shard the handle's ``FlatParameter``.
+
+        This allocates new memory for
+        the sharded flat parameter and frees the unsharded flat parameter's
+        storage.
+
+        Postcondition: ``self.flat_param`` is the sharded flat parameter. Shard
+        metadata attributes are set for all sharding strategies.
+        """
+        flat_param = self.flat_param
+        if not self.uses_sharded_strategy:
+            self._init_shard_metadata(0, 0, flat_param.numel() - 1)
+        else:
+            _p_assert(
+                flat_param.storage_offset() == 0,
+                "The `FlatParameter` is not the sole occupant of its storage",
+            )
+            sharded_flat_param, numel_padded = FlatParamHandle._get_shard(
+                flat_param, self.rank, self.world_size
+            )
+            if not torch.distributed._functional_collectives.is_torchdynamo_compiling():
+                allocated = flat_param._typed_storage()._size() > 0
+                if allocated:
+                    flat_param._typed_storage()._resize_(0)
+            flat_param.set_(sharded_flat_param)  # type: ignore[call-overload]
+            start_idx = sharded_flat_param.numel() * self.rank
+            end_idx = sharded_flat_param.numel() * (self.rank + 1) - 1  # inclusive
+            self._init_shard_metadata(numel_padded, start_idx, end_idx)
+        if self._use_orig_params:
+            self._use_sharded_views()
+
+    def _init_shard_metadata(
+        self,
+        numel_padded: int,
+        unsharded_start_idx: int,
+        unsharded_end_idx: int,
+    ) -> None:
+        """
+        Initialize shard-related metadata for this rank's shard of the flat parameter.
+
+        This includes ``_sharded_size``, ``_shard_param_infos``, and ``_shard_numel_padded``.
+
+        Args:
+            numel_padded (int): Numel padded for this rank's sharded flat
+                parameter.
+            unsharded_start_idx (int): Start index in the unsharded flat
+                parameter assigned to this rank.
+            unsharded_end_idx (int): End index (inclusive) in the unsharded
+                flat parameter assigned to this rank.
+
+        Precondition: ``self.flat_param`` 's data is the sharded flat
+        parameter.
+        """
+        flat_param = self.flat_param
+        flat_param._sharded_size = flat_param.size()  # type: ignore[attr-defined]
+        sharded_flat_param_numel = flat_param.numel()  # includes `numel_padded`
+        _p_assert(
+            unsharded_start_idx >= 0 and unsharded_start_idx <= unsharded_end_idx,
+            f"unsharded_start_idx: {unsharded_start_idx} unsharded_end_idx: {unsharded_end_idx}",
+        )
+        _p_assert(
+            numel_padded <= sharded_flat_param_numel,
+            f"numel_padded: {numel_padded} "
+            f"sharded_flat_param_numel: {sharded_flat_param_numel}",
+        )
+        shard_param_infos = self._get_shard_metadata(
+            unsharded_start_idx, unsharded_end_idx
+        )
+        assert (
+            len(shard_param_infos) == flat_param._num_params
+        ), f"Expects length {flat_param._num_params} but got {len(shard_param_infos)}"
+        flat_param._shard_param_infos = shard_param_infos  # type: ignore[attr-defined]
+        flat_param._shard_numel_padded = numel_padded  # type: ignore[attr-defined]
+
+    def _get_shard_metadata(
+        self,
+        unsharded_start_idx: int,
+        unsharded_end_idx: int,
+    ) -> Tuple[_ShardParamInfo, ...]:
+        """
+        Compute the shard metadata based on ``unsharded_start_idx`` and ``unsharded_end_idx`` (inclusive).
+
+        ``unsharded_start_idx`` and ``unsharded_end_idx`` give the interval of the
+        unsharded flat parameter specifying the shard.
+        """
+        flat_param_offsets = self._get_flat_param_offsets()
+        assert len(flat_param_offsets) == len(
+            self.flat_param._numels_with_padding
+        ), f"Expected {len(self.flat_param._numels_with_padding)} but got {len(flat_param_offsets)}"
+        shard_param_infos: List[_ShardParamInfo] = []
+        sharded_flat_param_numel = unsharded_end_idx - unsharded_start_idx + 1
+        # `unsharded_param_start_idx` and `unsharded_param_end_idx` are indices
+        # into the unsharded flat parameter (inclusive) of the given parameter
+        for i, (
+            (unsharded_param_start_idx, unsharded_param_end_idx),
+            is_padding,
+        ) in enumerate(zip(flat_param_offsets, self.flat_param._is_padding_mask)):
+            if is_padding:
+                continue
+            in_sharded_flat_param = (
+                unsharded_start_idx <= unsharded_param_end_idx
+                and unsharded_end_idx >= unsharded_param_start_idx
+            )
+            if not in_sharded_flat_param:
+                shard_param_info = _ShardParamInfo(False, None, None, None, None)
+            else:
+                if unsharded_start_idx <= unsharded_param_start_idx:
+                    # This branch can only happen once since the rank's
+                    # unsharded start index can only intersect one parameter
+                    intra_param_start_idx = 0
+                    offset_in_shard = unsharded_param_start_idx - unsharded_start_idx
+                else:
+                    intra_param_start_idx = (
+                        unsharded_start_idx - unsharded_param_start_idx
+                    )
+                    offset_in_shard = 0
+                assert (
+                    offset_in_shard >= 0 and offset_in_shard < sharded_flat_param_numel
+                ), (
+                    f"Invalid `offset_in_shard` of {offset_in_shard} for "
+                    f"sharded flat parameter with {sharded_flat_param_numel} numel"
+                )
+                intra_param_end_idx = (
+                    min(unsharded_param_end_idx, unsharded_end_idx)
+                    - unsharded_param_start_idx
+                )
+                numel_in_shard = intra_param_end_idx - intra_param_start_idx + 1
+                shard_param_info = _ShardParamInfo(
+                    True,
+                    offset_in_shard,
+                    numel_in_shard,
+                    intra_param_start_idx,
+                    intra_param_end_idx,
+                )
+            shard_param_infos.append(shard_param_info)
+        return tuple(shard_param_infos)
+
+    @staticmethod
+    def _get_unpadded_shard(
+        tensor: Tensor,
+        rank: int,
+        world_size: int,
+    ) -> Tuple[Tensor, int]:
+        """
+        Return the unpadded shard of ``tensor`` for the given ``rank`` and ``world_size``.
+
+        The returned value is a tuple of the shard of ``tensor`` without any
+        padding and the numel to pad for that shard.
+
+        If ``tensor`` is already flattened or may be viewed in the flattened
+        shape (which is true in the expected usage), then this method does not
+        allocate any new tensor memory.
+        """
+        chunks = torch.flatten(tensor).chunk(world_size)
+        if len(chunks) < (rank + 1):
+            # This rank gets an empty chunk fully padded with zeros since there
+            # are not enough chunks across ranks
+            chunk = chunks[0].new_empty(0)
+        else:
+            chunk = chunks[rank]
+        numel_to_pad = chunks[0].numel() - chunk.numel()
+        assert (
+            numel_to_pad >= 0
+        ), "Chunk's size should be at most the first chunk's size"
+        return chunk, numel_to_pad
+
+    @staticmethod
+    def _get_shard(
+        tensor: Tensor,
+        rank: int,
+        world_size: int,
+    ) -> Tuple[Tensor, int]:
+        """
+        Return the shard of ``tensor`` with padding for the given ``rank`` and ``world_size`` and the numel padded for that shard.
+
+        This method allocates new memory (via :meth:`clone`) since the
+        unsharded ``tensor`` may be deallocated after this method returns.
+        """
+        chunk, numel_to_pad = FlatParamHandle._get_unpadded_shard(
+            tensor, rank, world_size
+        )
+        shard = chunk.clone()
+        if numel_to_pad > 0:
+            shard = F.pad(shard, [0, numel_to_pad])
+        return shard, numel_to_pad
+
+    @staticmethod
+    def _get_sharded_size(tensor: Tensor, rank: int, world_size: int) -> torch.Size:
+        """
+        Return the shape of ``tensor`` after sharding, including padding.
+
+        This requires ``tensor`` to have 1D shape and ensures that the returned
+        shape is 1D.
+        """
+        assert len(tensor.shape) == 1, f"{tensor.shape}"
+        unpadded_sharded_tensor, numel_to_pad = FlatParamHandle._get_unpadded_shard(
+            tensor, rank, world_size
+        )
+        unpadded_sharded_size = unpadded_sharded_tensor.size()
+        assert len(unpadded_sharded_size) == 1, f"{unpadded_sharded_size}"
+        return torch.Size([unpadded_sharded_size[0] + numel_to_pad])
+
+    def _get_flat_param_offsets(self) -> List[Tuple[int, int]]:
+        """
+        Return [start, end] offsets of each original parameter's flattened data in the unsharded flat parameter (without padding).
+
+        NOTE: The returned list includes elements for alignment padding.
+        """
+        cumulative_sum = list(accumulate(self.flat_param._numels_with_padding))
+        starts = [0] + cumulative_sum[:-1]
+        ends = [end - 1 for end in cumulative_sum]  # inclusive
+        param_offsets = list(zip(starts, ends))
+        return param_offsets
+
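+    # Example (a hedged sketch with assumed numels): if
+    # ``flat_param._numels_with_padding == (4, 2, 3)`` (where the 2 might be
+    # alignment padding), then ``accumulate`` yields [4, 6, 9], so
+    # ``starts == [0, 4, 6]``, ``ends == [3, 5, 8]``, and the returned offsets
+    # are ``[(0, 3), (4, 5), (6, 8)]`` (inclusive on both ends).
+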
+    @no_type_check
+    def shard_metadata(
+        self,
+    ) -> FlatParamShardMetadata:
+        """
+        Return the shard-related metadata specific to this rank's shard of the flat parameter.
+
+        NOTE: The returned tuple does not include elements for alignment
+        padding but does account for the padding.
+        """
+        fqns_list = []
+        shapes_list = []
+        numels_list = []
+        shard_param_offsets = []
+        for fqn, shape, numel, shard_param_info in zip(
+            self.flat_param._fqns,
+            self.flat_param._shapes,
+            self.flat_param._numels,
+            self.flat_param._shard_param_infos,
+        ):
+            if not shard_param_info.in_shard:
+                continue
+            fqns_list.append(fqn)
+            shapes_list.append(shape)
+            numels_list.append(numel)
+            shard_param_offsets.append(
+                (
+                    shard_param_info.intra_param_start_idx,
+                    shard_param_info.intra_param_end_idx,
+                )
+            )
+        return FlatParamShardMetadata(
+            tuple(fqns_list),
+            tuple(shapes_list),
+            tuple(numels_list),
+            shard_param_offsets,
+        )
+
+    @no_type_check
+    @torch.no_grad()
+    def init_flat_param_attributes(self) -> None:
+        """
+        Initialize some attributes on the handle's ``FlatParameter``.
+
+        This should be called during lazy initialization since it requires the
+        parameter to be on the compute device if not offloading to CPU, and we
+        want to give users the chance to move the parameter appropriately after
+        the FSDP constructor.
+
+        For each tensor attribute on the ``FlatParameter``, see the unshard and
+        reshard methods in this class for the allocation and free pattern.
+        """
+        flat_param = self.flat_param
+        if flat_param.dtype != self._orig_param_dtype:
+            # Entering this branch means that the user changed the parameter
+            # dtype after FSDP initialization, in which case we may need to
+            # refresh some saved dtype attributes (dtypes specified as a part
+            # of mixed precision take precedence).
+            if not self._low_prec_param_dtype_specified:
+                self._fwd_bwd_param_dtype = flat_param.dtype
+            # For `reduce_dtype`, require `param_dtype` was not specified since
+            # then we infer the `reduce_dtype` from the specified `param_dtype`
+            if (
+                not self._low_prec_reduce_dtype_specified
+                and not self._low_prec_param_dtype_specified
+            ):
+                self._reduce_dtype = flat_param.dtype
+            self._orig_param_dtype = flat_param.dtype
+        cpu_device = torch.device("cpu")
+        if self._offload_params:
+            _p_assert(
+                flat_param.device == cpu_device,
+                f"Expects the `FlatParameter` to be on CPU when parameter CPU "
+                f"offloading is enabled, not {flat_param.device}",
+            )
+        else:
+            self._check_on_compute_device(self.flat_param)
+        flat_param._local_shard = flat_param.data
+        if self._offload_params:
+            # Pin the memory for faster H2D transfer
+            flat_param._local_shard = flat_param._local_shard.pin_memory()
+            # Pre-allocate the sharded gradient on CPU to enable non-blocking
+            # D2H transfer during the backward pass
+            flat_param._cpu_grad = torch.zeros_like(
+                flat_param._local_shard, device=cpu_device
+            ).pin_memory()
+        if self._uses_param_mixed_precision:
+            # For parameter mixed precision, we maintain a low precision
+            # sharded tensor on the compute device to be all-gathered (for
+            # sharded strategies) or directly used (for `NO_SHARD`) for
+            # computation.
+            flat_param._mp_shard = torch.empty_like(
+                flat_param._local_shard,
+                device=self.device,
+                dtype=self._fwd_bwd_param_dtype,
+            )
+            _free_storage(flat_param._mp_shard)
+        if self.uses_sharded_strategy:
+            # We maintain a padded unsharded tensor that serves as the
+            # all-gather destination and owns the original parameter storages.
+            unsharded_param_dtype = (
+                self._fwd_bwd_param_dtype
+                if self._uses_param_mixed_precision
+                else flat_param.dtype
+            )  # use low precision if parameter mixed precision is enabled
+            padded_unsharded_numel = flat_param.numel() * self.world_size
+            flat_param._full_param_padded = torch.empty(
+                padded_unsharded_numel,
+                device=self.device,
+                dtype=unsharded_param_dtype,
+            )
+            flat_param._padded_unsharded_size = flat_param._full_param_padded.size()
+            _free_storage(flat_param._full_param_padded)
+
+            if self._uses_param_mixed_precision:
+                # For parameter mixed precision, we maintain a full precision
+                # padded unsharded tensor for when we force full precision.
+                flat_param._full_prec_full_param_padded = torch.empty(
+                    padded_unsharded_numel,
+                    device=self.device,
+                    dtype=flat_param.dtype,  # full precision
+                )
+                _free_storage(flat_param._full_prec_full_param_padded)
+
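+    # Example (a hedged sketch of the reserve-then-free pattern above; the
+    # private ``_free_storage``/``_alloc_storage`` helpers roughly amount to
+    # resizing a tensor's underlying storage while keeping its metadata):
+    #   >>> buf = torch.empty(8)                        # float32: 32 bytes
+    #   >>> _ = buf.untyped_storage().resize_(0)        # free backing memory
+    #   >>> buf.untyped_storage().size()
+    #   0
+    #   >>> _ = buf.untyped_storage().resize_(buf.numel() * buf.element_size())
+    #   >>> buf.untyped_storage().size()                # re-allocated lazily
+    #   32
+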
+    ###################
+    # UNSHARD/RESHARD #
+    ###################
+    def pre_unshard(self) -> bool:
+        """
+        Return ``False`` if this is a no-op and ``True`` otherwise.
+
+        Postcondition: ``self.flat_param`` 's data is on the device for
+        communication and is what should be all-gathered. This means that it
+        matches the dtype of the expected unsharded parameter.
+        """
+        if (
+            self._training_state == HandleTrainingState.SUMMON_FULL_PARAMS
+            and self._skipped_use_sharded_views
+        ):
+            # Since this path imposes special semantics for the unsharded flat
+            # parameter (e.g. forcing full precision), use sharded views to
+            # reuse the existing logic for that special handling
+            self._use_sharded_views()
+        ret = False
+        if self._use_orig_params and not self._skip_writeback_check:
+            ret = self._writeback_orig_params()
+        if (
+            self.uses_sharded_strategy
+            and not self._offload_params
+            and not self.needs_unshard()
+        ):
+            pass  # no-op
+        elif self._uses_param_mixed_precision and not self._force_full_precision:
+            self._use_low_precision_shard()
+            ret = True
+        elif self._offload_params and self.flat_param.device != self.device:
+            # NOTE: This creates a new tensor distinct from any attributes.
+            self.flat_param_to(self.device, non_blocking=True)
+            ret = True
+        self._check_on_compute_device(self.flat_param)
+        return ret
+
+    def _use_low_precision_shard(self):
+        """Allocate on the compute device and switch to using the low precision sharded flat parameter."""
+        self._check_low_precision_shard()
+        flat_param = self.flat_param
+        _alloc_storage(
+            flat_param._mp_shard, flat_param._local_shard.size()  # type: ignore[attr-defined]
+        )
+        # `copy_()` implicitly casts to the low precision
+        flat_param._mp_shard.copy_(  # type: ignore[attr-defined]
+            flat_param._local_shard.to(  # type: ignore[attr-defined]
+                self.device, non_blocking=True
+            )
+        )
+        # Invariant: `_mp_shard` is always on the compute device.
+        flat_param.data = flat_param._mp_shard  # type: ignore[attr-defined]
+
+    def unshard(self):
+        """
+        Run the unshard logic.
+
+        This includes all-gathering the flat parameter
+        and switching to using the unsharded flat parameter. If the handle does
+        not need unsharding, then this only switches to using the unsharded
+        flat parameter. For ``NO_SHARD``, this is a no-op.
+
+        If FSDP is in :meth:`summon_full_params` and the handle uses parameter
+        mixed precision, then the parameter is forced to full precision.
+        """
+        if not self.needs_unshard():
+            # Even when not needing an unshard, we should switch to using
+            # the unsharded flat parameter
+            unsharded_flat_param = (
+                self._get_padded_unsharded_flat_param()
+                if self.uses_sharded_strategy
+                else self.flat_param
+            )
+            self._use_unsharded_flat_param(unsharded_flat_param)
+            return
+        unsharded_flat_param = self._alloc_padded_unsharded_flat_param()
+        padded_unsharded_flat_param = self._all_gather_flat_param(unsharded_flat_param)
+        self._use_unsharded_flat_param(padded_unsharded_flat_param)
+
+    def needs_unshard(self) -> bool:
+        """Return if the handle's flat parameter needs to be unsharded."""
+        if not self.uses_sharded_strategy:
+            return False
+        unsharded_flat_param = self._get_padded_unsharded_flat_param()
+        already_unsharded = _same_storage_size(
+            unsharded_flat_param, unsharded_flat_param.numel()
+        )
+        return not already_unsharded
+
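+    # Example (a hedged sketch of the storage-size check above;
+    # ``_same_storage_size`` essentially compares a tensor's allocated storage
+    # against a target numel):
+    #   >>> padded = torch.empty(8)
+    #   >>> padded.untyped_storage().size() // padded.element_size() == padded.numel()
+    #   True
+    #   >>> _ = padded.untyped_storage().resize_(0)     # freed after reshard
+    #   >>> padded.untyped_storage().size() // padded.element_size() == padded.numel()
+    #   False
+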
+    def _alloc_padded_unsharded_flat_param(self):
+        """
+        Allocate the *padded* unsharded flat parameter.
+
+        The unpadded unsharded flat parameter is always a view into the padded
+        one. This padded parameter is saved to a different attribute on the
+        ``FlatParameter`` depending on whether we force full precision.
+        """
+        self._check_sharded_strategy()
+        flat_param = self.flat_param
+        unsharded_flat_param = self._get_padded_unsharded_flat_param()
+        self._check_storage_freed(unsharded_flat_param)
+        _alloc_storage(unsharded_flat_param, flat_param._padded_unsharded_size)  # type: ignore[attr-defined]
+        return unsharded_flat_param
+
+    def _get_padded_unsharded_flat_param(self) -> torch.Tensor:
+        """
+        Return a reference to the padded unsharded flat parameter depending on the calling context.
+
+        This should only be called if using a sharded strategy.
+        """
+        self._check_sharded_strategy()
+        flat_param = self.flat_param
+        if self._force_full_precision and self._uses_param_mixed_precision:
+            # When parameter mixed precision is enabled, we use a different
+            # tensor as the all-gather destination to preserve the invariant
+            # that  `_full_param_padded` is in the low precision
+            unsharded_flat_param = flat_param._full_prec_full_param_padded  # type: ignore[attr-defined]
+            _p_assert(
+                unsharded_flat_param.dtype != self._fwd_bwd_param_dtype,
+                f"Expects full precision but got {self._fwd_bwd_param_dtype}",
+            )
+            # For no-reshard-after-forward strategies, `_full_param_padded` may
+            # still be allocated from a previous forward. As we are forcing
+            # full precision here, the full-precision unsharded copy may be
+            # modified, invalidating the existing low-precision unsharded copy,
+            # so we should free it here to ensure a new all-gather for the next
+            # forward/backward computation to persist the modifications.
+            if flat_param._full_param_padded.untyped_storage().size() > 0:
+                _free_storage(flat_param._full_param_padded)
+        else:
+            unsharded_flat_param = flat_param._full_param_padded  # type: ignore[attr-defined]
+        return unsharded_flat_param
+
+    def _all_gather_flat_param(
+        self,
+        padded_unsharded_flat_param: Tensor,
+    ) -> Tensor:
+        """
+        All-gather the handle's flat parameter to the destination ``padded_unsharded_flat_param``.
+
+        The caller should then switch to using the all-gathered tensor.
+        """
+        _p_assert(
+            hasattr(self, "process_group") and hasattr(self, "world_size"),
+            "Expects a process group and world size to have been set via `shard()`",
+        )
+        sharded_flat_param = self.flat_param.data
+        expected_numel = sharded_flat_param.numel() * self.world_size
+        _p_assert(
+            padded_unsharded_flat_param.numel() == expected_numel,
+            f"Expects {expected_numel} numel but got {padded_unsharded_flat_param.numel()}",
+        )
+
+        pg = (
+            self._fake_process_group
+            if self._use_fake_all_gather
+            else self.process_group
+        )
+
+        # HACK this should be handled by C10D
+        if sharded_flat_param.is_cpu:  # type: ignore[attr-defined]
+            tensor_list = list(
+                torch.chunk(padded_unsharded_flat_param, dist.get_world_size(pg))
+            )
+            work = dist.all_gather(tensor_list, sharded_flat_param, group=pg)
+        else:
+            dist.all_gather_into_tensor(
+                padded_unsharded_flat_param,
+                sharded_flat_param,
+                pg,
+            )
+
+        if self._offload_params:
+            # In case of offloading, `flat_param.data` (i.e. sharded param) is
+            # created on the pre-unshard stream. We need to hand it over to the
+            # unshard stream for all-gather
+            _no_dispatch_record_stream(
+                sharded_flat_param,
+                self._device_handle.current_stream(),  # unshard_stream
+            )
+        return padded_unsharded_flat_param
+
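+    # Example (a hedged sketch of the numel invariant asserted above, simulating
+    # the all-gather with ``torch.cat`` instead of a real process group):
+    #   >>> world_size = 4
+    #   >>> sharded = torch.randn(3)                     # this rank's shard
+    #   >>> gathered = torch.cat([sharded] * world_size) # stand-in for all-gather
+    #   >>> gathered.numel() == sharded.numel() * world_size
+    #   True
+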
+    def _use_unsharded_flat_param(
+        self,
+        padded_unsharded_flat_param: torch.Tensor,
+    ) -> None:
+        """
+        Switch to use the *unpadded* unsharded flat parameter.
+
+        This is a view into the *padded* unsharded flat parameter.
+        """
+        unsharded_size = self.flat_param._unpadded_unsharded_size
+        flat_param_part = padded_unsharded_flat_param[: unsharded_size.numel()]
+        # slicing [:] is not visible to autograd because of .data
+        self.flat_param.data = flat_param_part
+        in_forward = self._training_state == HandleTrainingState.FORWARD
+        in_pre_backward = self._training_state == HandleTrainingState.BACKWARD_PRE
+        if self._use_orig_params:
+            if self._skipped_use_sharded_views and in_pre_backward:
+                # This call corresponds to the complementary pre-backward
+                # `_use_unsharded_views()` to the skipped pre-forward
+                # `_use_sharded_views()`, so we should skip this one too.
+                return
+            # We use `Tensor` views in the forward so that they are tracked by
+            # autograd. We use them in the pre-backward as well to support
+            # reentrant activation checkpointing, which needs the views to be
+            # tracked by autograd in the backward pass's recomputed forward.
+            self._use_unsharded_views(
+                as_params=(not in_forward and not in_pre_backward)
+            )
+        elif in_forward:
+            self._use_unsharded_views(as_params=False)
+
+    def post_unshard(self):
+        """
+        Run the post-unshard logic.
+
+        This includes freeing the low precision shard if needed.
+        """
+        if self._uses_param_mixed_precision and self.uses_sharded_strategy:
+            self._free_low_precision_sharded_param()
+        self._check_on_compute_device(self.flat_param)
+
+    def _free_low_precision_sharded_param(self):
+        """Frees the low precision sharded flat parameter."""
+        self._check_low_precision_shard()
+        # `_mp_shard` is allocated in the pre-unshard stream, consumed in the
+        # unshard stream for sharded strategies, and consumed in both the
+        # unshard and default streams for `NO_SHARD`. For sharded strategies,
+        # the current stream here is the unshard stream, and for `NO_SHARD`,
+        # it is the default stream. For `NO_SHARD`, only recording for the
+        # default stream suffices since the default stream waits for the
+        # unshard stream.
+        _no_dispatch_record_stream(
+            self.flat_param._mp_shard, self._device_handle.current_stream()  # type: ignore[attr-defined]
+        )
+        _free_storage(self.flat_param._mp_shard)  # type: ignore[attr-defined]
+
+    @torch.no_grad()
+    def unshard_grad(self):
+        """
+        Unshard the handle's ``FlatParameter``'s gradient.
+
+        If all ranks have a ``None`` gradient, then all of the original
+        parameters' gradients will be ``None`` as well. This method performs an
+        all-reduce and an all-gather. The additional all-reduce is tolerable
+        since this method is not meant to be used on the computation critical
+        path.
+
+        Postcondition: ``_saved_grad_shard`` is defined and contains the value
+        to set ``flat_param.grad`` after gradients are resharded.
+        """
+        if not self.uses_sharded_strategy:
+            self._use_unsharded_grad_views()
+            return
+        flat_param = self.flat_param
+        self._check_unsharded(flat_param)
+
+        # Check if all ranks have a `None` gradient
+        num_grad_none = torch.zeros(1, dtype=torch.int32, device=self.device)
+        num_grad_none[0] = flat_param.grad is None
+        dist.all_reduce(num_grad_none, group=self.process_group)
+        if num_grad_none[0] == self.world_size:
+            flat_param._saved_grad_shard = None  # type: ignore[assignment]
+            self._use_unsharded_grad_views()
+            return
+
+        if flat_param.grad is None:
+            # In the case that only some but not all ranks have a `None`
+            # gradient, we use zeros as a best-effort approximation
+            if self._debug_level == dist.DebugLevel.INFO:
+                warnings.warn(
+                    f"[Rank {self.rank}] Only some but not all ranks have a "
+                    "`None` `FlatParameter` gradient, so FSDP is using zeros to "
+                    "approximate those ranks' sharded gradients being `None`"
+                )
+            flat_param._saved_grad_shard = None  # type: ignore[assignment]
+            sharded_grad = torch.zeros(flat_param._sharded_size, device=self.device)  # type: ignore[attr-defined]
+        else:
+            self._check_sharded(flat_param.grad)
+            flat_param._saved_grad_shard = flat_param.grad  # type: ignore[attr-defined]
+            sharded_grad = flat_param._saved_grad_shard  # type: ignore[attr-defined]
+        padded_unsharded_grad = torch.empty(
+            flat_param._padded_unsharded_size,  # type: ignore[attr-defined]
+            device=self.device,
+            dtype=sharded_grad.dtype,
+        )
+        dist.all_gather_into_tensor(
+            padded_unsharded_grad, sharded_grad, self.process_group
+        )
+        unsharded_size = self.flat_param._unpadded_unsharded_size
+        flat_param.grad = padded_unsharded_grad[: unsharded_size.numel()].view(
+            unsharded_size
+        )
+        self._use_unsharded_grad_views()
+
+    def reshard_grad(self):
+        if self._use_orig_params:
+            self._use_sharded_grad_views()
+        if not self.uses_sharded_strategy:
+            return
+        self.flat_param.grad = self.flat_param._saved_grad_shard  # type: ignore[attr-defined]
+        delattr(self.flat_param, "_saved_grad_shard")
+
+    def prepare_gradient_for_backward(self):
+        """
+        Prepare the gradient for the backward computation.
+
+        This is done by saving and clearing any existing sharded gradient
+        in ``.grad`` to enable computing a new unsharded gradient.
+        """
+        _p_assert(
+            self._training_state
+            in (HandleTrainingState.BACKWARD_PRE, HandleTrainingState.IDLE),
+            "Expects to be in `BACKWARD_PRE` or `IDLE` (if prefetching)",
+        )
+        flat_param = self.flat_param
+        if flat_param.grad is not None and (
+            flat_param.grad.size() != flat_param._unpadded_unsharded_size
+            or flat_param.grad.device != flat_param.device  # grad on CPU
+        ):
+            self._check_on_compute_device(self.flat_param)
+            grad_offloaded = flat_param.grad.device != self.device
+            _p_assert(
+                not grad_offloaded or self._offload_params,
+                f"Expects the sharded gradient to be on {self.device} "
+                f"but got {flat_param.grad.device}",
+            )
+            prev_iter_synced_gradients = (
+                flat_param.grad.size()
+                == flat_param._local_shard.size()  # type: ignore[attr-defined]
+            )
+            if prev_iter_synced_gradients:
+                # TODO (awgu): Gradient accumulation outside `no_sync()`
+                # does not work with CPU offloading. The issue should be
+                # that, in the post-backward hook, we cannot do an addition
+                # between a CPU tensor (the existing sharded gradient) and
+                # a GPU tensor (the new sharded gradient).
+                if not grad_offloaded:
+                    flat_param._saved_grad_shard = flat_param.grad.data  # type: ignore[attr-defined]
+                    sharded_grad = flat_param._saved_grad_shard  # type: ignore[attr-defined]
+                else:
+                    _p_assert(
+                        hasattr(flat_param, "_cpu_grad"),
+                        "`_cpu_grad` should be defined if the gradient is on CPU",
+                    )
+                    sharded_grad = flat_param._cpu_grad  # type: ignore[attr-defined]
+                # If user specified to keep the gradient in low precision, then
+                # the gradient may still be of the low precision dtype if the
+                # user did not set the gradient to `None` after the previous
+                # backward, in which case FSDP should cast back to the full
+                # precision dtype so that FSDP can accumulate in that dtype in
+                # the post-backward hook and assign to `.grad` in that dtype in
+                # the post-backward callback.
+                local_shard_dtype = flat_param._local_shard.dtype  # type: ignore[attr-defined]
+                if (
+                    self._keep_low_precision_grads
+                    and sharded_grad.dtype != local_shard_dtype
+                ):
+                    sharded_grad.data = sharded_grad.to(local_shard_dtype)
+            else:
+                padded_unsharded_size = flat_param._padded_unsharded_size  # type: ignore[attr-defined]
+                _p_assert(
+                    flat_param.grad.size() == padded_unsharded_size,
+                    "Expects `.grad` to be the unsharded gradient in "
+                    f"`no_sync()` with size {padded_unsharded_size} "
+                    f"but got size {flat_param.grad.size()}",
+                )
+            flat_param.grad = None
+
+    def prepare_gradient_for_optim(self):
+        """Prepare the gradient for optimizer computation by moving the sharded gradient to the ``.grad`` attribute."""
+
+        def cast_grad_to_param_dtype_if_needed(flat_param):
+            # TODO (rohan-varma): test for full precision with keep_low_precision_grads
+            if not self._force_full_precision and self._keep_low_precision_grads:
+                _p_assert(flat_param.grad is not None, "Unexpected None grad!")
+                if flat_param.grad.dtype != self._fwd_bwd_param_dtype:
+                    flat_param.grad.data = flat_param.grad.to(self._fwd_bwd_param_dtype)
+                    if self._use_orig_params:
+                        self._use_sharded_grad_views()
+
+        flat_param = self.flat_param
+        # TODO (awgu): We should replace these conditional checks to encode
+        # the logical intention more directly.
+        if hasattr(flat_param, "_cpu_grad"):
+            # NOTE: This branch includes `NO_SHARD`.
+            self._check_sharded(flat_param)
+            self._check_on_cpu(flat_param)
+            flat_param.grad = flat_param._cpu_grad  # type: ignore[attr-defined]
+            cast_grad_to_param_dtype_if_needed(flat_param)
+        elif hasattr(flat_param, "_saved_grad_shard"):
+            self._check_sharded(flat_param)
+            self._check_on_compute_device(flat_param)
+            if flat_param._saved_grad_shard is not None:
+                self._check_on_compute_device(flat_param._saved_grad_shard)  # type: ignore[attr-defined]
+            # If no sharded gradient was computed this iteration, then there is
+            # no need to forward `_saved_grad_shard` to `grad`
+            if flat_param._post_backward_called:  # type: ignore[attr-defined]
+                flat_param.grad = flat_param._saved_grad_shard  # type: ignore[attr-defined]
+                if flat_param.grad is not None:
+                    cast_grad_to_param_dtype_if_needed(flat_param)
+        else:
+            _p_assert(
+                not self.uses_sharded_strategy
+                or not flat_param._post_backward_called,  # type: ignore[attr-defined]
+                "All sharded parameters that received a gradient in the "
+                "post-backward should use `_saved_grad_shard`",
+            )
+        # Delete `_saved_grad_shard` since its existence indicates a previous
+        # gradient to accumulate with in the post-backward hook
+        if hasattr(flat_param, "_saved_grad_shard"):
+            delattr(flat_param, "_saved_grad_shard")
+
+    @contextlib.contextmanager
+    def to_cpu(self):
+        """
+        Move the unpadded unsharded flat parameter to CPU while in the context and move it back to the previous device upon exit.
+
+        For now, this assumes the ``FlatParameter`` is the unpadded unsharded flat parameter
+        since (1) there is no reason to include the padding in the copy and (2)
+        there is no use case for the sharded flat parameter.
+
+        Precondition: ``self.flat_param`` 's data is the unpadded unsharded
+        flat parameter on the compute device, and the handle uses a sharded
+        strategy.
+        Postcondition: Same as the precondition.
+        """
+        self._check_sharded_strategy()
+        _p_assert(
+            self.flat_param.size() == self.flat_param._unpadded_unsharded_size,
+            f"Expects size {self.flat_param._unpadded_unsharded_size} but got {self.flat_param.size()}",
+        )
+        self._check_on_compute_device(self.flat_param)
+        # Check that the unpadded unsharded flat parameter is a view into the
+        # padded unsharded flat parameter as expected
+        # NOTE: This check is not strictly needed for correctness but is a
+        # useful sanity check since the tensor should only be used internally.
+        _p_assert(
+            _same_storage(self.flat_param, self._get_padded_unsharded_flat_param()),
+            "Expects the unpadded parameter to be a view into the padded parameter",
+        )
+        self.flat_param_to(torch.device("cpu"))
+        self._free_unsharded_flat_param()
+        try:
+            yield
+        finally:
+            _p_assert(
+                self.flat_param.size() == self.flat_param._unpadded_unsharded_size,
+                f"Expects size {self.flat_param._unpadded_unsharded_size} but got {self.flat_param.size()}",
+            )
+            padded_unsharded_flat_param = self._alloc_padded_unsharded_flat_param()
+            # Copy from CPU to the compute device
+            padded_unsharded_flat_param[: self.flat_param.numel()].copy_(
+                self.flat_param
+            )
+            self._use_unsharded_flat_param(padded_unsharded_flat_param)
+
+    def reshard(self, free_unsharded_flat_param: bool):
+        """
+        Run the reshard logic.
+
+        This includes freeing the unsharded flat
+        parameter if ``free_unsharded_flat_param`` and switching to using the
+        sharded flat parameter. Note that this also implicitly offloads
+        the sharded flat parameter (if CPU offload is enabled) by pointing
+        it to the ``_local_shard`` attribute which resides on CPU.
+        """
+        # Switch to the sharded `FlatParameter` before freeing to prevent
+        # "use-after-free"-type bugs with external profiling tools, where for
+        # `use_orig_params=True`, the `param` does not point to valid memory
+        # when setting `param.data = ...` in `_use_sharded_views()`.
+        self._use_sharded_flat_param()
+        if free_unsharded_flat_param:
+            self._free_unsharded_flat_param()
+
+    def post_reshard(self):
+        """
+        Run the post-reshard logic.
+
+        This includes freeing any memory that
+        can now be freed given that the ``FlatParameter`` points to the full
+        precision sharded flat parameter.
+
+        Precondition: ``self.flat_param`` 's data points to the full precision
+        sharded flat parameter.
+        """
+        # For `NO_SHARD`, `_mp_shard` is not freed in the post-unshard since it
+        # is also the low precision *unsharded* flat parameter. Hence, we delay
+        # the free until the reshard.
+        if (
+            self._uses_param_mixed_precision
+            and not self.uses_sharded_strategy
+            and not self._force_full_precision  # did not use the low precision shard
+        ):
+            self._free_low_precision_sharded_param()
+
+    def _free_unsharded_flat_param(self):
+        """
+        Free the padded unsharded flat parameter.
+
+        We allow this function to be called even when the storage is not
+        allocated. The tensor to free depends on the calling context since the
+        unshard may have forced full precision, in which case a different
+        tensor is used.
+        """
+        self._check_sharded_strategy()
+        unsharded_flat_param = self._get_padded_unsharded_flat_param()
+        self._check_on_compute_device(unsharded_flat_param)
+        # Do not free the memory until all ops in the current stream finish
+        _no_dispatch_record_stream(
+            unsharded_flat_param, self._device_handle.current_stream()
+        )
+        _free_storage(unsharded_flat_param)
+
+    def _use_sharded_flat_param(self) -> None:
+        """Switches to using the sharded flat parameter."""
+        flat_param = self.flat_param
+        if self._use_orig_params:
+            in_forward = self._training_state == HandleTrainingState.FORWARD
+            skip_use_sharded_views = (
+                torch.is_grad_enabled()
+                and in_forward
+                and self._sharding_strategy
+                in NO_RESHARD_AFTER_FORWARD_HANDLE_STRATEGIES
+            )
+            # Only incur the extra `.data` call if needed
+            if skip_use_sharded_views:
+                unsharded_flat_param = flat_param.data
+        if self._offload_params:
+            device = flat_param._local_shard.device  # type: ignore[attr-defined]
+            _p_assert(
+                device == torch.device("cpu"),
+                f"Expects the local shard to be on CPU but got {device}",
+            )
+        flat_param.data = flat_param._local_shard  # type: ignore[attr-defined]
+        if self._use_orig_params:
+            if skip_use_sharded_views:  # type: ignore[possibly-undefined]
+                self._unsharded_flat_param_for_skipped_views = unsharded_flat_param  # type: ignore[possibly-undefined]
+            else:
+                self._use_sharded_views()
+            # For the post-forward reshard, we may try to use sharded gradient
+            # views (or unsharded gradient views if a gradient was accumulated
+            # in `no_sync()`), but for the post-backward reshard, we delay the
+            # call to after the reduce-scatter.
+            if (
+                in_forward  # type: ignore[possibly-undefined]
+                # Skip using gradient views if skipped using sharded views
+                # since exposing unsharded parameters with sharded gradients
+                # may be confusing to the user
+                and not self._skipped_use_sharded_views
+            ):
+                # TODO: Change `_unpadded_unsharded_size` if we change the
+                # gradient to be computed directly with padding.
+                accumulated_grad_in_no_sync = (
+                    flat_param.grad is not None
+                    and self.uses_sharded_strategy
+                    and flat_param.grad.shape == flat_param._unpadded_unsharded_size
+                )
+                if accumulated_grad_in_no_sync:
+                    self._use_unsharded_grad_views()
+                else:
+                    self._use_sharded_grad_views()
+
+    #########
+    # VIEWS #
+    #########
+    @no_type_check
+    def _get_unflat_views_unaligned(
+        self,
+        tensor: Optional[torch.Tensor] = None,
+    ) -> Iterator[Tensor]:
+        """
+        Return unflattened ``Tensor`` views into ``tensor``.
+
+        If ``tensor`` is ``None``, ``flat_param`` is used. The unflattening is
+        based on ``flat_param`` 's metadata.
+
+        Examples for ``tensor`` include ``flat_param.grad`` or unsharded
+        tensor optimizer state.
+        """
+        flat_param = self.flat_param
+        if tensor is None:
+            tensor = flat_param
+        views = (
+            _ext_post_unflatten_transform(
+                subtensor.view(shape),
+                param_extension,
+                self._fsdp_extension,
+            )
+            for (subtensor, shape, param_extension) in zip(
+                torch.split(tensor, flat_param._numels, dim=0),
+                flat_param._shapes,
+                flat_param._param_extensions,
+            )
+        )
+        return views
+
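+    # Example (a hedged sketch of the split-and-view pattern above, assuming two
+    # original parameters of shapes (2, 3) and (4,) flattened together):
+    #   >>> flat = torch.arange(10.)
+    #   >>> numels, shapes = (6, 4), ((2, 3), (4,))
+    #   >>> views = [t.view(s) for t, s in zip(torch.split(flat, numels, dim=0), shapes)]
+    #   >>> [tuple(v.shape) for v in views]
+    #   [(2, 3), (4,)]
+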
+    @no_type_check
+    def _get_unflat_views_aligned(
+        self,
+        tensor: Optional[Tensor] = None,
+    ) -> List[Tensor]:
+        """
+        Return unflattened ``Tensor`` views into ``tensor`` with handling for padding.
+
+        This method has the same contract as :meth:`_get_unflat_views_unaligned`
+        except it checks for ``None`` placeholders representing padding for
+        alignment, which may incur slightly more CPU overhead.
+        """
+        flat_param = self.flat_param
+        if tensor is None:
+            tensor = flat_param
+        splits: List[Tensor] = torch.split(
+            tensor, flat_param._numels_with_padding, dim=0
+        )
+        idx = 0
+        views: List[Tensor] = []
+        for split, is_padding in zip(splits, flat_param._is_padding_mask):
+            if is_padding:
+                continue
+            views.append(
+                _ext_post_unflatten_transform(
+                    split.view(flat_param._shapes[idx]),
+                    flat_param._param_extensions[idx],
+                    self._fsdp_extension,
+                )
+            )
+            idx += 1
+        return views
+
+    @no_type_check
+    @torch.enable_grad()
+    def _use_unsharded_views(self, as_params: bool) -> None:
+        """
+        Unflatten the unsharded flat parameter by setting the original parameter variables to be views into it.
+
+        Args:
+            as_params (bool): If ``True``, then registers the original
+                parameters as ``nn.Parameter`` s; if ``False``, then registers
+                the original parameters only as ``Tensor`` s. ``False`` should
+                be used during forward/backward computation and when hiding the
+                original parameters from :meth:`nn.Module.named_parameters`.
+
+        Note:
+            When prefetching for the next forward, the current forward may be
+            annotated with ``@torch.no_grad()``. In that case,
+            ``@torch.enable_grad()`` ensures a non-empty ``view.grad_fn``;
+            otherwise, ``_post_backward_hook`` will not get called.
+        """
+        flat_param = self.flat_param
+        self._check_unsharded(flat_param)
+        views = self._get_unflat_views()
+        from torch.distributed._tensor import DTensor
+
+        for i, (view, (param_name, module, _)) in enumerate(
+            zip(views, flat_param._param_infos)
+        ):
+            if self._use_orig_params and as_params:
+                if type(view) is DTensor:
+                    # A `DTensor` `view` is not compatible with assigning
+                    # `param.data = view`, so we cannot preserve the parameter
+                    # variable.
+                    self._setattr_param(
+                        module,
+                        param_name,
+                        nn.Parameter(view, requires_grad=flat_param.requires_grad),
+                    )
+                    continue
+                param = self.flat_param._params[i]
+                self._setattr_param(module, param_name, param)
+                param.data = view
+            elif as_params:
+                self._setattr_param(
+                    module,
+                    param_name,
+                    nn.Parameter(view, requires_grad=flat_param.requires_grad),
+                )
+            else:  # `as_params=False`
+                param_var: Tensor = view
+                if self._use_orig_params:
+                    if self._training_state == HandleTrainingState.FORWARD:
+                        # Save the `Tensor` for the pre-backward
+                        self.flat_param._tensors[i] = view  # save for pre-backward
+                    elif self._training_state == HandleTrainingState.BACKWARD_PRE:
+                        # Use the saved `Tensor` variable from the forward to
+                        # preserve the autograd graph so that the post-backward
+                        # hook fires (e.g. for reentrant AC)
+                        tensor = self.flat_param._tensors[i]
+                        tensor.data = view
+                        param_var = tensor
+                self._setattr_tensor(module, param_name, param_var)
+                if (
+                    self._use_orig_params
+                    and self._training_state == HandleTrainingState.FORWARD
+                ):
+                    module._parameters[param_name] = param_var
+        for i, (
+            param_name,
+            module,
+            _,
+            prim_param_name,
+            prim_module,
+            _,
+        ) in enumerate(self.flat_param._shared_param_infos):
+            prim_param: Union[Tensor, nn.Parameter] = getattr(
+                prim_module, prim_param_name
+            )
+            _p_assert(
+                not as_params or isinstance(prim_param, nn.Parameter),
+                f"as_params={as_params} type(prim_param)={type(prim_param)}",
+            )
+            if self._use_orig_params and as_params:
+                shared_param = self.flat_param._shared_params[i]
+                self._setattr_param(module, param_name, shared_param)
+                shared_param.data = prim_param
+            elif as_params:
+                self._setattr_param(module, param_name, prim_param)
+            else:
+                self._setattr_tensor(module, param_name, prim_param)
+                if (
+                    self._use_orig_params
+                    and self._training_state == HandleTrainingState.FORWARD
+                ):
+                    module._parameters[param_name] = prim_param
+
+    @no_type_check
+    def _use_unsharded_grad_views(self) -> None:
+        """
+        Unflatten the unsharded flat parameter's gradient.
+
+        The original parameter variables' gradients are set to be views into
+        the unsharded flat parameter's gradient.
+        """
+        # Expects the gradient to be in `flat_param.grad`
+        if self.flat_param.grad is None:
+            for param in chain(self.flat_param._params, self.flat_param._shared_params):
+                param.grad = None
+            return
+        self._check_unsharded(self.flat_param.grad)
+        views = self._get_unflat_views(self.flat_param.grad)
+        for i, (view, (param_name, module, _)) in enumerate(
+            zip(views, self.flat_param._param_infos)
+        ):
+            _p_assert(
+                hasattr(module, param_name),
+                f"{self.flat_param._fqns[i]} is missing",
+            )
+            param = getattr(module, param_name)
+            if (
+                param.shape != view.shape
+                or param.dtype != view.dtype
+                or param.device != view.device
+            ):
+                # NOTE: This is a hack using `.data` to side step the check
+                # that parameter/gradient sizes/dtypes/devices match. From
+                # calling `reshard()`, `param` has the sharded size, has the
+                # full precision dtype, and if CPU offloading is enabled, is on
+                # CPU. Thus, one or more of the following cases can hold when
+                # in `no_sync()`, where `view` is the original parameter's
+                # gradient:
+                # 1. `view` can have the unsharded size.
+                # 2. `view` can have the parameter low precision dtype.
+                # 3. `view` can be on GPU.
+                if param.grad is None:
+                    param.grad = torch.empty_like(param)
+                param.grad.data = view
+            else:
+                param.grad = view
+        for i, (
+            param_name,
+            module,
+            module_name,
+            prim_param_name,
+            prim_module,
+            _,
+        ) in enumerate(self.flat_param._shared_param_infos):
+            _p_assert(
+                hasattr(module, param_name),
+                f"{module_name + '.' + param_name if module_name else param_name} is missing",
+            )  # did not save FQN info in `_shared_param_infos`
+            param = getattr(module, param_name)
+            prim_param = getattr(prim_module, prim_param_name)
+            if (
+                param.shape != prim_param.grad.shape
+                or param.dtype != prim_param.grad.dtype
+                or param.device != prim_param.grad.device
+            ):
+                # NOTE: This is the same hack to use `.data` to side step the
+                # size check.
+                if param.grad is None:
+                    param.grad = torch.empty_like(param)
+                param.grad.data = prim_param.grad
+            else:
+                param.grad = prim_param.grad
+
+    @contextlib.contextmanager
+    def unflatten_as_params(self) -> Generator:
+        """
+        Unflatten the original parameters.
+
+        The function assumes that the flat parameter is unsharded. While in the
+        context, it unflattens the original parameters as ``nn.Parameter``
+        views into the flat parameter; after the context, it restores the
+        original parameters as ``Tensor`` views into the flat parameter.
+        """
+        self._use_unsharded_views(as_params=True)
+        try:
+            yield
+        finally:
+            self._use_unsharded_views(as_params=False)
+
+    @no_type_check
+    @torch.no_grad()
+    def _use_sharded_views(self) -> None:
+        """
+        Set the original parameter variables' data to be flattened views into the sharded flat parameter.
+
+        The views are kept as flattened to simplify the case where a parameter
+        is sharded across ranks. Parameters whose data is not present in the
+        sharded flat parameter have their data set to a size-0 empty tensor. We
+        do not delete them, in order to preserve expected behaviors like model
+        printability. Parameters whose data is present must preserve their
+        variables to be passable to an optimizer.
+        """
+        self._unsharded_flat_param_for_skipped_views = None
+        if not self.uses_sharded_strategy:
+            # For `NO_SHARD`, use the *unflattened* unsharded views since we
+            # have the unsharded parameter
+            self._use_unsharded_views(as_params=True)
+            return
+        flat_param = self.flat_param
+        self._check_sharded(flat_param)
+        # Construct once and reuse for all parameters not in the local shard
+        size_0_empty_tensor = torch.empty(
+            0,
+            dtype=self.flat_param.dtype,  # in case `flat_param` changed dtype
+            device=self.flat_param.device,
+            requires_grad=False,
+        )
+        for param, shard_param_info, (param_name, module, _) in zip(
+            flat_param._params, flat_param._shard_param_infos, flat_param._param_infos
+        ):
+            self._setattr_param(module, param_name, param)
+            if not shard_param_info.in_shard:
+                # Allow the original data to be freed via garbage collection
+                param.data = size_0_empty_tensor
+            else:
+                offset = shard_param_info.offset_in_shard
+                numel_in_shard = shard_param_info.numel_in_shard
+                param.data = flat_param[offset : offset + numel_in_shard]
+        assert self.flat_param._shared_params is not None
+        for i, (
+            param,
+            (param_name, module, _, prim_param_name, prim_module, _),
+        ) in enumerate(
+            zip(self.flat_param._shared_params, self.flat_param._shared_param_infos)
+        ):
+            self._setattr_param(module, param_name, param)
+            prim_param = getattr(prim_module, prim_param_name)
+            param.data = prim_param  # could be both empty and non-empty
+        if self._training_state == HandleTrainingState.BACKWARD_POST:
+            # Clear the saved `Tensor`s since they are unneeded now
+            for i in range(len(self.flat_param._tensors)):
+                self.flat_param._tensors[i] = None
+
+    @no_type_check
+    @torch.no_grad()
+    def _use_sharded_grad_views(self) -> None:
+        """
+        Set the original parameter variables' gradients to be flattened views into the sharded flat parameter's gradient.
+
+        This is a no-op if there is no gradient.
+
+        Parameters whose data is not present in the sharded flat parameter and
+        parameters with ``requires_grad=False`` have their gradients set to
+        ``None``. Since the gradient variables do not need to be preserved,
+        this method does not manipulate existing ``Tensor`` data directly and
+        creates new ``Tensor`` variables instead.
+        """
+        flat_param = self.flat_param
+        self._check_sharded(flat_param)
+        grad = self.sharded_grad
+        if grad is None:
+            for param in chain(flat_param._params, flat_param._shared_params):
+                param.grad = None
+            return
+        self._check_sharded(grad)
+        for param, shard_param_info, is_grad_none in zip(
+            flat_param._params,
+            flat_param._shard_param_infos,
+            flat_param._is_grad_none_mask,
+        ):
+            if not shard_param_info.in_shard:
+                param.grad = None
+            else:
+                numel_in_shard = shard_param_info.numel_in_shard
+                if param.requires_grad and not is_grad_none:
+                    offset = shard_param_info.offset_in_shard
+                    if self._keep_low_precision_grads or param.dtype != grad.dtype:
+                        # NOTE: This is a hack using `.data` to side step the
+                        # check that parameter/gradient dtypes match. Here,
+                        # `param` has full precision; `grad` has low precision.
+                        if param.grad is None:
+                            # `.grad` must have the same shape as `param`
+                            param.grad = torch.empty_like(param)
+                        param.grad.data = grad[
+                            offset : offset + numel_in_shard
+                        ].reshape(param.shape)
+                    else:
+                        param.grad = grad[offset : offset + numel_in_shard].reshape(
+                            param.shape
+                        )
+                else:
+                    param.grad = None
+        assert flat_param._shared_params is not None
+        for i, (param, (_, _, _, prim_param_name, prim_module, _)) in enumerate(
+            zip(flat_param._shared_params, flat_param._shared_param_infos)
+        ):
+            in_sharded_flat_param = hasattr(prim_module, prim_param_name)
+            if in_sharded_flat_param and param.requires_grad:
+                prim_param = getattr(prim_module, prim_param_name)
+                param.grad = prim_param.grad  # share the same reference
+            else:
+                param.grad = None
+
+    @no_type_check
+    @torch.no_grad()
+    def _writeback_orig_params(self) -> bool:
+        """
+        Write back any parameters that changed storage to the handle's ``FlatParameter``.
+
+        Iterates over the original parameters and writes back any parameters
+        that changed storages (due to a non-inplace operator) to the handle's
+        ``FlatParameter``. This method preserves the ``FlatParameter`` 's
+        device even if an original parameter's device changes.
+
+        Raises:
+            RuntimeError: If an original parameter or gradient changes storages
+                but no longer has the expected flattened shape.
+
+        Returns:
+            ``True`` if some writeback happened, and ``False`` otherwise.
+        """
+        if (
+            self.uses_sharded_strategy
+            and not self.is_sharded(self.flat_param)
+            and not self._skipped_use_sharded_views
+        ):
+            # For `NO_SHARD`, we may still need to writeback
+            return False
+        flat_param = self.flat_param
+        wroteback = False
+        if self._skipped_use_sharded_views and self.uses_sharded_strategy:
+            # NOTE: We must use the unsharded flat parameter from which the
+            # unsharded views were computed, not the one from the current
+            # calling context (`_get_padded_unsharded_flat_param()`) since that
+            # may be different (e.g. the model changed from train to eval).
+            flat_param_tensor = self._unsharded_flat_param_for_skipped_views
+            _p_assert(
+                _data_ptr_allocated(flat_param_tensor),
+                "If skipped using sharded views, the unsharded flat parameter "
+                "should be allocated",
+            )
+        else:
+            flat_param_tensor = flat_param
+        # NOTE: Since this method is called in the pre-unshard, which is only
+        # called during computation in the pre-forward or pre-backward, the
+        # sharded gradient should be guaranteed to be in `.grad`, not in
+        # `._saved_grad_shard`.
+        flat_param_grad = (
+            flat_param.grad
+            if self.uses_sharded_strategy or not self._offload_params
+            else flat_param._cpu_grad
+        )
+        for i, (
+            param,
+            (in_shard, offset_in_shard, numel_in_shard, _, _),
+            (param_name, module, _),
+        ) in enumerate(
+            zip(
+                flat_param._params,
+                flat_param._shard_param_infos,
+                flat_param._param_infos,
+            )
+        ):
+            if not in_shard:
+                continue
+            if not hasattr(module, param_name):
+                # Do not writeback if original parameters are deregistered
+                # (e.g. during model checkpointing)
+                continue
+
+            # Check for parameter writeback
+            if self._skipped_use_sharded_views:
+                param = flat_param._tensors[i]
+                _p_assert(
+                    param is not None,
+                    f"Expects to have saved tensor for {flat_param._fqns[i]}",
+                )
+            param_changed = getattr(module, param_name) is not param
+            needs_param_writeback = (
+                param_changed  # changed parameter variable itself
+                or not _same_storage(param, flat_param_tensor)
+            )
+            if self._skipped_use_sharded_views and (
+                param_changed or needs_param_writeback
+            ):
+                raise AssertionError(
+                    "FSDP does not support changing the parameters between "
+                    f"forward and backward for {self._sharding_strategy}"
+                )
+            if param_changed:
+                # NOTE: The gradient is not preserved after a parameter change.
+                param = getattr(module, param_name)
+                flat_param._params[i] = param
+            if needs_param_writeback:
+                expected_shape = torch.Size([numel_in_shard])
+                self._writeback_tensor(
+                    param, flat_param, i, expected_shape, offset_in_shard, True
+                )
+                wroteback = True
+
+            # Check for gradient writeback
+            if self._skipped_use_sharded_views:
+                # Skip the writeback check because we do not expose gradients
+                # when we skipped using sharded views
+                continue
+            if param.grad is None and flat_param.grad is not None:
+                expected_shape = torch.Size([numel_in_shard])
+                self._writeback_tensor(
+                    None, flat_param.grad, i, expected_shape, offset_in_shard, False
+                )
+            elif param.grad is not None:
+                # For `NO_SHARD` + CPU offloading, `_cpu_grad` is always in
+                # memory and owns the gradient storage, so it will never
+                # require gradient writeback.
+                if not self.uses_sharded_strategy and self._offload_params:
+                    # Explicitly continue to handle the case of `no_sync()`,
+                    # where `param.grad` is a view into the GPU gradient
+                    # referenced by `flat_param.grad`, while `flat_param_grad`
+                    # is `flat_param._cpu_grad`, which is on CPU
+                    continue
+
+                needs_grad_writeback = flat_param_grad is None or not _same_storage(
+                    param.grad, flat_param_grad
+                )
+                if needs_grad_writeback:
+                    if flat_param_grad is None:
+                        flat_param_grad = torch.zeros_like(flat_param)
+                    expected_shape = torch.Size([numel_in_shard])
+                    self._writeback_tensor(
+                        param.grad,
+                        flat_param_grad,
+                        i,
+                        expected_shape,
+                        offset_in_shard,
+                        False,
+                    )
+                    flat_param.grad = flat_param_grad
+                    flat_param_grad = flat_param.grad
+
+        # TODO: If we want to handle shared parameters, we need to re-generate
+        # the shared parameter data structures in case sharedness changed.
+        for i, (
+            param_name,
+            module,
+            _,
+            prim_param_name,
+            prim_module,
+            _,
+        ) in enumerate(flat_param._shared_param_infos):
+            if getattr(module, param_name) is not getattr(prim_module, prim_param_name):
+                raise NotImplementedError(
+                    "Changing shared parameters is not supported yet"
+                )
+        return wroteback
+
+    def _writeback_tensor(
+        self,
+        src_tensor: Optional[Tensor],
+        dst_tensor: Tensor,
+        tensor_index: int,
+        expected_shape: torch.Size,
+        offset: int,
+        is_param: bool,  # else gradient
+    ) -> None:
+        """
+        Write back ``src_tensor`` to ``dst_tensor`` at offset ``offset``, where ``src_tensor`` should have shape ``expected_shape``.
+
+        ``is_param`` indicates if the tensor is the parameter (if ``True``) or gradient (if
+        ``False``). If ``src_tensor`` is ``None``, then the effect is zeroing
+        instead of copying. ``tensor_index`` gives the index of ``src_tensor``
+        in the metadata structures.
+
+        Raises:
+            RuntimeError: If the ``src_tensor`` does not have the expected
+            shape.
+        """
+        _p_assert(
+            len(expected_shape) == 1,
+            f"Expects a 1D expected shape but got {expected_shape}",
+        )
+        if self._debug_level == dist.DebugLevel.INFO:
+            rank = self.rank if hasattr(self, "rank") else dist.get_rank()
+            src_shape = src_tensor.shape if src_tensor is not None else None
+            src_device = src_tensor.device if src_tensor is not None else None
+            warnings.warn(
+                f"[Rank {rank}] {'Parameter' if is_param else 'Gradient'} needs "
+                f"writeback in {self._training_state}\n"
+                f"expected shape={expected_shape} shape={src_shape} "
+                f"expected device={dst_tensor.device} device={src_device}"
+            )
+        if src_tensor is not None and src_tensor.shape != expected_shape:
+            # NOTE: Gradient shape mismatch is not possible in practice since
+            # the gradient shape is enforced to match that of the parameter and
+            # we already check for parameter shape mismatch.
+            raise RuntimeError(
+                f"Cannot writeback when the {'parameter' if is_param else 'gradient'} "
+                f"shape changes\nExpects {expected_shape} but got {src_tensor.shape}"
+            )
+        if src_tensor is not None:
+            dst_tensor[offset : offset + expected_shape.numel()].copy_(src_tensor)
+        else:
+            dst_tensor[offset : offset + expected_shape.numel()].zero_()
+            assert self.flat_param._is_grad_none_mask is not None
+            self.flat_param._is_grad_none_mask[tensor_index] = True
+
+    def _reset_flat_param_grad_info_if_needed(self):
+        """
+        Reset ``flat_param.grad`` if needed.
+
+        When ``use_orig_params=True``:
+        (1) sets the underlying ``flat_param.grad`` to ``None`` if *all* of the
+        original parameters' ``.grad`` are ``None``, and
+        (2) sets ``flat_param.requires_grad=False`` if *none* of the original
+        parameters require gradient.
+        For (1), this is targeting ``optim.zero_grad(set_to_none=True)``, in
+        which case we want to free the gradients as soon after the
+        ``zero_grad()`` call as possible.
+        """
+        if not self._use_orig_params:
+            return
+        flat_param = self.flat_param
+        assert flat_param._params is not None  # mypy
+        all_grad_none = True
+        requires_grad = False
+        for param in flat_param._params:
+            all_grad_none &= param.grad is None
+            requires_grad |= param.requires_grad
+        if all_grad_none:
+            flat_param.grad = None
+        # As long as one parameter requires gradient, then the flat parameter
+        # must require gradient
+        flat_param.requires_grad = requires_grad
+
+    def _deregister_orig_params(self):
+        for param_info in self.flat_param._param_infos:
+            param_name, module, _ = param_info
+            if hasattr(module, param_name):
+                delattr(module, param_name)
+        for param_name, module, _, _, _, _ in self.flat_param._shared_param_infos:
+            if hasattr(module, param_name):
+                delattr(module, param_name)
+
+    ###########
+    # HELPERS #
+    ###########
+    def flat_param_to(self, *args, **kwargs):
+        """Wrap an in-place call to ``.to()`` for ``self.flat_param``."""
+        self.flat_param.data = self.flat_param.to(*args, **kwargs)
+        if self._use_orig_params:
+            # Refresh the views because their storage may have changed
+            if self.is_sharded(self.flat_param):
+                self._use_sharded_views()
+            else:
+                self._use_unsharded_views(as_params=True)
+
+    def _get_modules(self) -> Set[nn.Module]:
+        """Return a :class:`set` of the modules whose parameters are included in this handle's flat parameter."""
+        return {pi.module for pi in self.flat_param._param_infos}.union(
+            {spi.module for spi in self.flat_param._shared_param_infos}
+        )
+
+    def is_sharded(self, tensor: Tensor) -> bool:
+        """
+        Return whether ``tensor`` is *currently* sharded.
+
+        For ``NO_SHARD``, we choose to have this always return ``False`` for clarity.
+        """
+        if (
+            not hasattr(self.flat_param, "_sharded_size")
+            or not self.uses_sharded_strategy
+        ):
+            # `_sharded_size` is defined iff `handle.shard()` has been called
+            return False
+        sharded_size = self.flat_param._sharded_size  # type: ignore[attr-defined]
+        return tensor.size() == sharded_size
+
+    def param_module_names(self) -> Iterator[Tuple[str, str]]:
+        shared_param_infos = [
+            ParamInfo(param_name, module, module_name)
+            for (
+                param_name,
+                module,
+                module_name,
+                _,
+                _,
+                _,
+            ) in self.flat_param._shared_param_infos
+        ]
+        for param_info in chain(self.flat_param._param_infos, shared_param_infos):
+            param_name, _, module_name = param_info  # type: ignore[misc]
+            yield (param_name, module_name)
+
+    def shared_param_module_names(self) -> Iterator[Tuple[str, str]]:
+        for param_name, _, module_name in [
+            ParamInfo(param_name, module, module_name)
+            for (
+                param_name,
+                module,
+                module_name,
+                _,
+                _,
+                _,
+            ) in self.flat_param._shared_param_infos
+        ]:
+            yield (param_name, module_name)
+
+    @property
+    def _fqns_in_shard(self) -> List[str]:
+        """Return the FQNs of the parameters present in this rank's shard."""
+        fqns_in_shard: List[str] = []
+        for fqn, shard_param_info in zip(
+            self.flat_param._fqns, self.flat_param._shard_param_infos  # type: ignore[attr-defined]
+        ):
+            if shard_param_info.in_shard:
+                fqns_in_shard.append(fqn)
+        return fqns_in_shard
+
+    @property
+    def sharded_grad(self) -> Optional[Tensor]:
+        """Return the handle's sharded gradient."""
+        flat_param = self.flat_param
+        # Priority for non-`None`: `_cpu_grad` > `_saved_grad_shard` > `grad`
+        # - CPU offloading: `_cpu_grad`
+        # - No CPU offloading + sharded strategies: `_saved_grad_shard`
+        # - No CPU offloading + `NO_SHARD`: `grad`
+        grad: Optional[Tensor]
+        if hasattr(flat_param, "_cpu_grad"):
+            grad = flat_param._cpu_grad  # type: ignore[attr-defined]
+        elif hasattr(flat_param, "_saved_grad_shard"):
+            # In the post-backward hook, the sharded gradient is still in
+            # `_saved_grad_shard`.
+            grad = flat_param._saved_grad_shard  # type: ignore[attr-defined]
+        else:
+            # If in IDLE or in FORWARD states, then there may be an
+            # (accumulated) gradient. If accessed in IDLE, then this should
+            # be due to re-registering the original parameters (e.g. in state
+            # dict load).
+            _p_assert(
+                flat_param.grad is None
+                or not self.uses_sharded_strategy
+                or self._training_state
+                in (HandleTrainingState.FORWARD, HandleTrainingState.IDLE),
+                "Sharded strategies should use `_cpu_grad` or `_saved_grad_shard` "
+                "unless in IDLE or FORWARD",
+            )
+            grad = flat_param.grad
+        return grad
+
+    def _reset_is_grad_none(self) -> None:
+        """
+        Reset ``_is_grad_none_mask`` as needed.
+
+        This method should only be
+        called in the post-backward after gradient computation, in which case
+        if a parameter requires gradient, then it will surely receive a
+        gradient and we may reset its mask entry to ``False``.
+        """
+        if not self._use_orig_params:
+            return
+        _p_assert(
+            self._training_state == HandleTrainingState.BACKWARD_POST,
+            "Expects to only be called in the post-backward after gradient computation",
+        )
+        flat_param = self.flat_param
+        assert flat_param._params is not None  # mypy
+        for i, param in enumerate(flat_param._params):  # type: ignore[arg-type]
+            # As long as the parameter requires gradient, it should receive a
+            # meaningful gradient (even if the gradient happens to be zeros)
+            if param.requires_grad:
+                assert flat_param._is_grad_none_mask is not None  # mypy
+                flat_param._is_grad_none_mask[i] = False
+
+    #######################
+    # CHECKS & INVARIANTS #
+    #######################
+    def _check_sharded_strategy(self):
+        _p_assert(self.uses_sharded_strategy, "Expects sharded strategy")
+
+    def _check_on_compute_device(self, tensor: Tensor):
+        _p_assert(
+            tensor.device == self.device,
+            f"Expects tensor to be on the compute device {self.device}, was on {tensor.device}",
+        )
+
+    def _check_on_cpu(self, tensor: Tensor):
+        _p_assert(
+            tensor.device == torch.device("cpu"),
+            f"Expects tensor to be on CPU but got {tensor.device}",
+        )
+
+    @staticmethod
+    def _check_storage_freed(tensor: Tensor):
+        # Compile does not resize during trace
+        if not torch.distributed._functional_collectives.is_torchdynamo_compiling():
+            _p_assert(
+                _same_storage_size(tensor, 0),
+                "Expects storage to be freed but got storage with size > 0",
+            )
+
+    @staticmethod
+    def _check_storage_allocated(tensor: Tensor):
+        _p_assert(_storage_size_allocated(tensor), "Expects storage to be allocated")
+
+    def _check_low_precision_shard(self):
+        _p_assert(
+            self._uses_param_mixed_precision,
+            "Not using low precision for parameters",
+        )
+        _p_assert(
+            getattr(self.flat_param, "_mp_shard", None) is not None,
+            "Expects `_mp_shard` to exist",
+        )
+        device = self.flat_param._mp_shard.device  # type: ignore[attr-defined]
+        _p_assert(
+            device == self.device,
+            f"Expects the low precision shard to be on {self.device} but got {device}",
+        )
+
+    def _check_unsharded(self, tensor: Tensor):
+        msg_prefix = "Expects tensor to be unsharded "
+        _p_assert(tensor is not None, msg_prefix + "but got `None`")
+        unsharded_size = self.flat_param._unpadded_unsharded_size
+        _p_assert(
+            tensor.size() == unsharded_size,
+            msg_prefix + f"with size {unsharded_size} but got {tensor.size()}",
+        )
+
+    def _check_sharded(self, tensor: Tensor):
+        msg_prefix = "Expects tensor to be sharded "
+        _p_assert(tensor is not None, msg_prefix + "but got `None`")
+        sharded_size = self.flat_param._sharded_size  # type: ignore[attr-defined]
+        _p_assert(
+            tensor.size() == sharded_size,
+            msg_prefix + f"with size {sharded_size} but got {tensor.size()}",
+        )
+
+    ##############
+    # PROPERTIES #
+    ##############
+    @property
+    def uses_sharded_strategy(self) -> bool:
+        return self._sharding_strategy != HandleShardingStrategy.NO_SHARD
+
+    @property
+    def _uses_param_mixed_precision(self) -> bool:
+        return self._fwd_bwd_param_dtype != self._orig_param_dtype
+
+    @property
+    def _uses_reduce_mixed_precision(self) -> bool:
+        return self._reduce_dtype != self._orig_param_dtype
+
+    @property
+    def _force_full_precision(self) -> bool:
+        return (
+            self._uses_param_mixed_precision or self._uses_reduce_mixed_precision
+        ) and (
+            self._training_state == HandleTrainingState.SUMMON_FULL_PARAMS
+            or
+            # Also disable mixed precision in model eval mode, if configured
+            (not self._fully_sharded_module.training and self._use_full_prec_in_eval)
+        )
+
+    @property
+    def _skipped_use_sharded_views(self) -> bool:
+        """
+        This property is used for sharding strategies that do not free after forward with ``use_orig_params=True``.
+
+        This returns if this handle is
+        currently in a state where it has skipped using sharded views, in which
+        case it can restore view invariants via ``_use_sharded_views()``.
+        """
+        return self._unsharded_flat_param_for_skipped_views is not None
+
+
+# NOTE: These are hacks to bypass `nn.Module.__setattr__` checks.
+def _unsafe_setattr_param(
+    module: nn.Module, param_name: str, param: nn.Parameter
+) -> None:
+    module._parameters[param_name] = param
+    # This bypasses any overrides in case `module` is an instance of an
+    # `nn.Module` subclass
+    super(nn.Module, module).__setattr__(param_name, param)
+
+
+def _unsafe_setattr_tensor(module: nn.Module, param_name: str, tensor: Tensor) -> None:
+    module._parameters.pop(param_name, None)
+    # This bypasses any overrides in case `module` is an instance of an
+    # `nn.Module` subclass
+    super(nn.Module, module).__setattr__(param_name, tensor)
+
+
+def _safe_setattr_tensor_or_param(
+    module: nn.Module, param_name: str, tensor_or_param: Union[Tensor, nn.Parameter]
+):
+    # Call `delattr()` and `setattr()` to go through `nn.Module` checks
+    if hasattr(module, param_name):
+        delattr(module, param_name)
+    setattr(module, param_name, tensor_or_param)
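+
+
+# Editor's note: illustrative sketch (not part of the upstream file) contrasting the
+# "unsafe" setters above with the normal `nn.Module.__setattr__` path. FSDP swaps
+# plain tensor views in and out of parameter slots on every unshard/reshard, and the
+# normal path rejects assigning a plain `Tensor` over a registered `nn.Parameter`.
+def _setattr_contrast_sketch() -> None:
+    lin = nn.Linear(2, 2)
+    view = torch.empty(2, 2)
+    try:
+        # Normal path: raises TypeError (plain tensors cannot replace parameters)
+        setattr(lin, "weight", view)
+    except TypeError:
+        pass
+    # Unsafe path: deregisters the parameter and sets the attribute directly
+    _unsafe_setattr_tensor(lin, "weight", view)
+    assert not isinstance(lin.weight, nn.Parameter)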
+
+
+def _convert_to_params(
+    tensors: List[Union[torch.Tensor, nn.Parameter]]
+) -> List[nn.Parameter]:
+    return [t if isinstance(t, nn.Parameter) else nn.Parameter(t) for t in tensors]
+
+
+def _detach_if_needed(param_or_tensor: Union[nn.Parameter, Tensor]) -> Tensor:
+    return (
+        param_or_tensor.detach()
+        if isinstance(param_or_tensor, nn.Parameter)
+        else param_or_tensor
+    )
+
+
+def _get_aligned_numel(unsharded_dtype: torch.dtype):
+    # NOTE: This alignment constraint comes from TorchInductor.
+    ALIGNMENT = 16  # bytes
+    unsharded_dtype_size = _get_dtype_size(unsharded_dtype)
+    aligned_numel = ALIGNMENT // unsharded_dtype_size
+    return aligned_numel
+
+
+@functools.lru_cache(8)
+def _get_dtype_size(dtype):
+    return torch.empty((), dtype=dtype).element_size()
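+
+
+# Editor's note: worked example (not part of the upstream file) of the alignment
+# arithmetic above: with a 16-byte target, a float32 flat parameter is padded in
+# multiples of 16 // 4 == 4 elements and a float16 one in multiples of 16 // 2 == 8.
+def _aligned_numel_example_sketch() -> None:
+    assert _get_aligned_numel(torch.float32) == 4  # 4-byte elements
+    assert _get_aligned_numel(torch.float16) == 8  # 2-byte elements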
+
+
+def _construct_padding_tensor(
+    padding_numel: int, dtype: torch.dtype, requires_grad: bool, device: torch.device
+):
+    # NOTE: Set the padding value as a magic number for debuggability. The
+    # value itself should never be used in any user-facing computation.
+    return (
+        torch.ones(
+            (padding_numel,), dtype=dtype, requires_grad=requires_grad, device=device
+        )
+        * _FLAT_PARAM_PADDING_VALUE
+    )
+
+
+# Use `lru_cache(1)` to only log the warning once (assuming the fixed warning
+# message is passed in)
+@functools.lru_cache(1)
+def _warn_skip_writeback_check(log: logging.Logger, warning: str):
+    log.warning(warning)
+
+
+# Use `lru_cache(1)` to only log the warning once
+@functools.lru_cache(1)
+def _warn_use_fake_all_gather(log: logging.Logger, warning: str):
+    log.warning(warning)
+
+
+# Use `lru_cache(1)` to only log the warning once
+@functools.lru_cache(1)
+def _warn_use_fake_reduce(log: logging.Logger, warning: str):
+    log.warning(warning)
+
+
+def _same_storage(a, b):
+    # Params are DTensors in backward
+    # with SHARD_GRAD_OP + TP
+    from torch.distributed._tensor import DTensor
+
+    if isinstance(a, DTensor):
+        a = a._local_tensor
+    if isinstance(b, DTensor):
+        b = b._local_tensor
+    return a.untyped_storage().data_ptr() == b.untyped_storage().data_ptr()
+
+
+def _same_storage_size(a: torch.Tensor, b: int):
+    return a.untyped_storage().size() // a.element_size() == b
+
+
+def _storage_size_allocated(tensor: Tensor):
+    storage_size: int = tensor.untyped_storage().size()
+    return storage_size > 0
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/_fsdp_extensions.py b/MLPY/Lib/site-packages/torch/distributed/fsdp/_fsdp_extensions.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed39cfb4ad0b7891066c8503d41bf3d52820a940
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/fsdp/_fsdp_extensions.py
@@ -0,0 +1,179 @@
+from abc import ABC, abstractmethod
+from typing import Any, List, Optional, Tuple
+
+import torch
+import torch.distributed as dist
+from torch.distributed._shard.sharded_tensor.api import ShardedTensor
+from torch.distributed._shard.sharded_tensor.shard import Shard
+from torch.distributed._tensor import DeviceMesh, DTensor
+from torch.distributed.fsdp._shard_utils import (
+    _all_gather_dtensor,
+    _create_chunk_dtensor,
+    _create_chunk_sharded_tensor,
+)
+
+
+class FSDPExtensions(ABC):
+    """
+    This enables some customizable hooks to enable composability with tensor
+    parallelism. To activate these hooks, use :func:`_set_fsdp_extensions` to
+    set a custom :class:`FSDPExtensions` that implements the hooks.
+    """
+
+    @abstractmethod
+    def pre_flatten_transform(
+        self,
+        tensor: torch.Tensor,
+    ) -> Tuple[torch.Tensor, Optional[Any]]:
+        """E.g. converting ``DistributedTensor`` to local tensor."""
+        ...
+
+    @abstractmethod
+    def post_unflatten_transform(
+        self,
+        tensor: torch.Tensor,
+        param_extension: Any,
+    ) -> torch.Tensor:
+        """E.g. converting local tensor to ``DistributedTensor``."""
+        ...
+
+    @abstractmethod
+    def chunk_tensor(
+        self,
+        tensor: torch.Tensor,
+        rank: int,
+        world_size: int,
+        num_devices_per_node: int,
+        pg: dist.ProcessGroup,
+        device: Optional[torch.device] = None,
+    ) -> torch.Tensor:
+        """Shards a tensor to chunks and returns the local chunk."""
+        ...
+
+    @abstractmethod
+    def chunk_dtensor(
+        self,
+        tensor: torch.Tensor,
+        rank: int,
+        device_mesh: DeviceMesh,
+    ) -> torch.Tensor:
+        """Shards a tensor/DTensor to DTensor and returns the local DTensor."""
+        ...
+
+    @abstractmethod
+    def pre_load_state_dict_transform(
+        self,
+        tensor: torch.Tensor,
+    ) -> Tuple[torch.Tensor, List[Shard]]:
+        """
+        This is to be called before loading a *sharded* model state dict and
+        should return the tensor and list of shards from which to load data.
+        """
+        ...
+
+    @abstractmethod
+    def all_gather_dtensor(
+        self,
+        tensor: DTensor,
+        parent_mesh: Optional[DeviceMesh],
+    ) -> torch.Tensor:
+        """
+        This is to be called before loading a *sharded* DTensor state dict.
+        This gathers the tensor along the FSDP dimension and returns the local
+        tensor of the TP DTensor.
+        """
+        ...
+
+
+_extensions: Optional[FSDPExtensions] = None
+
+
+def _set_fsdp_extensions(flattener: FSDPExtensions) -> None:
+    global _extensions
+    _extensions = flattener
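+
+
+# Editor's note: minimal pass-through sketch (not part of the upstream file) showing
+# how the hooks above could be wired up via `_set_fsdp_extensions`. The bodies defer
+# to the default helpers; a real tensor-parallel integration (see `DTensorExtensions`
+# in `torch.distributed.tensor.parallel.fsdp`) converts between DTensors and local
+# tensors in `pre_flatten_transform` / `post_unflatten_transform`.
+class _PassThroughFSDPExtensions(FSDPExtensions):
+    def pre_flatten_transform(self, tensor):
+        return tensor, None  # no per-parameter extension metadata
+
+    def post_unflatten_transform(self, tensor, param_extension):
+        return tensor
+
+    def chunk_tensor(self, tensor, rank, world_size, num_devices_per_node, pg, device=None):
+        return _create_chunk_sharded_tensor(tensor, rank, world_size, num_devices_per_node, pg)
+
+    def chunk_dtensor(self, tensor, rank, device_mesh):
+        return _create_chunk_dtensor(tensor, rank, device_mesh)
+
+    def pre_load_state_dict_transform(self, tensor):
+        assert type(tensor) is ShardedTensor
+        return tensor, tensor.local_shards()
+
+    def all_gather_dtensor(self, tensor, parent_mesh):
+        return _all_gather_dtensor(tensor, parent_mesh)
+
+
+# Example registration (illustrative only):
+#   _set_fsdp_extensions(_PassThroughFSDPExtensions())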
+
+
+def _ext_pre_flatten_transform(
+    tensor: torch.Tensor,
+    fsdp_extension: Optional[FSDPExtensions] = None,
+) -> Tuple[torch.Tensor, Optional[Any]]:
+    if fsdp_extension is not None:
+        new_tensor, param_extension = fsdp_extension.pre_flatten_transform(tensor)
+        if param_extension is not None:
+            return new_tensor, param_extension
+    return tensor, None
+
+
+def _ext_post_unflatten_transform(
+    tensor: torch.Tensor,
+    param_extension: Any,
+    fsdp_extension: Optional[FSDPExtensions] = None,
+) -> torch.Tensor:
+    if fsdp_extension is not None and param_extension is not None:
+        return fsdp_extension.post_unflatten_transform(tensor, param_extension)
+    return tensor
+
+
+def _ext_chunk_tensor(
+    tensor: torch.Tensor,
+    rank: int,
+    world_size: int,
+    num_devices_per_node: int,
+    pg: dist.ProcessGroup,
+    fsdp_extension: Optional[FSDPExtensions] = None,
+) -> torch.Tensor:
+    chunk_tensor_fn = (
+        fsdp_extension.chunk_tensor
+        if fsdp_extension is not None
+        else _create_chunk_sharded_tensor
+    )
+    return chunk_tensor_fn(
+        tensor,
+        rank,
+        world_size,
+        num_devices_per_node,
+        pg,
+    )
+
+
+def _ext_chunk_dtensor(
+    tensor: torch.Tensor,
+    rank: int,
+    device_mesh: DeviceMesh,
+    fsdp_extension: Optional[FSDPExtensions] = None,
+) -> torch.Tensor:
+    chunk_dtensor_fn = (
+        fsdp_extension.chunk_dtensor
+        if fsdp_extension is not None
+        else _create_chunk_dtensor
+    )
+    return chunk_dtensor_fn(
+        tensor,
+        rank,
+        device_mesh,
+    )
+
+
+def _ext_pre_load_state_dict_transform(
+    tensor: torch.Tensor,
+    fsdp_extension: Optional[FSDPExtensions] = None,
+) -> Tuple[torch.Tensor, List[Shard]]:
+    if fsdp_extension is not None:
+        return fsdp_extension.pre_load_state_dict_transform(tensor)
+
+    assert type(tensor) is ShardedTensor
+    shards = tensor.local_shards()
+    return (tensor, shards)
+
+
+def _ext_all_gather_dtensor(
+    tensor: DTensor,
+    parent_mesh: Optional[DeviceMesh],
+    fsdp_extension: Optional[FSDPExtensions] = None,
+) -> torch.Tensor:
+    all_gather_dtensor_fn = (
+        fsdp_extension.all_gather_dtensor
+        if fsdp_extension is not None
+        else _all_gather_dtensor
+    )
+    return all_gather_dtensor_fn(tensor, parent_mesh)
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/_init_utils.py b/MLPY/Lib/site-packages/torch/distributed/fsdp/_init_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..179336fc13ae8801bbf326a10bfc6b9fe2fb1a00
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/fsdp/_init_utils.py
@@ -0,0 +1,1182 @@
+import collections
+import itertools
+import os
+import warnings
+from typing import (
+    Any,
+    Callable,
+    Deque,
+    Dict,
+    Generator,
+    Iterable,
+    Iterator,
+    List,
+    no_type_check,
+    Optional,
+    Set,
+    Tuple,
+    Union,
+)
+
+import torch
+import torch.distributed as dist
+import torch.distributed.fsdp._exec_order_utils as exec_order_utils
+import torch.distributed.fsdp._traversal_utils as traversal_utils
+import torch.distributed.fsdp.fully_sharded_data_parallel as fsdp_file
+import torch.nn as nn
+from torch.distributed.algorithms._comm_hooks import default_hooks
+from torch.distributed.device_mesh import _mesh_resources, DeviceMesh
+from torch.distributed.distributed_c10d import _get_default_group
+from torch.distributed.fsdp._common_utils import (
+    _FSDPDeviceHandle,
+    _FSDPState,
+    _get_module_fsdp_state,
+    _is_fsdp_flattened,
+    _named_parameters_with_duplicates,
+    clean_tensor_name,
+    TrainingState,
+)
+from torch.distributed.fsdp._flat_param import (
+    _FSDP_USE_FULL_PREC_IN_EVAL,
+    FlatParameter,
+    FlatParamHandle,
+    HandleShardingStrategy,
+)
+from torch.distributed.fsdp._limiter_utils import _FreeEventQueue
+from torch.distributed.fsdp.api import (
+    BackwardPrefetch,
+    CPUOffload,
+    FullOptimStateDictConfig,
+    FullStateDictConfig,
+    MixedPrecision,
+    ShardingStrategy,
+    StateDictConfig,
+    StateDictType,
+)
+from torch.distributed.fsdp.wrap import _Policy
+from torch.distributed.tensor.parallel.fsdp import DTensorExtensions
+from torch.distributed.utils import _sync_params_and_buffers
+
+from torch.utils._python_dispatch import is_traceable_wrapper_subclass
+from torch.utils.hooks import RemovableHandle
+
+_TORCHDISTX_AVAIL = True
+try:
+    from torchdistx import deferred_init, fake  # type: ignore[import]
+except ImportError:
+    _TORCHDISTX_AVAIL = False
+
+PARAM_BROADCAST_BUCKET_SIZE = int(250 * 1024 * 1024)
+FSDP_SYNCED = "_fsdp_synced"
+# Specification of process groups for hybrid sharding strategies.
+HybridShardProcessGroupType = Tuple[dist.ProcessGroup, dist.ProcessGroup]
+# Overall specification of process group.
+ProcessGroupType = Optional[Union[dist.ProcessGroup, HybridShardProcessGroupType]]
+
+
+# TODO (awgu): Refactor this later
+SHARDING_STRATEGY_MAP = {
+    ShardingStrategy.NO_SHARD: HandleShardingStrategy.NO_SHARD,
+    ShardingStrategy.FULL_SHARD: HandleShardingStrategy.FULL_SHARD,
+    ShardingStrategy.SHARD_GRAD_OP: HandleShardingStrategy.SHARD_GRAD_OP,
+    ShardingStrategy.HYBRID_SHARD: HandleShardingStrategy.HYBRID_SHARD,
+    ShardingStrategy._HYBRID_SHARD_ZERO2: HandleShardingStrategy._HYBRID_SHARD_ZERO2,
+}
+HYBRID_SHARDING_STRATEGIES = [
+    ShardingStrategy.HYBRID_SHARD,
+    ShardingStrategy._HYBRID_SHARD_ZERO2,
+]
+NO_RESHARD_AFTER_FORWARD_STRATEGIES = (
+    ShardingStrategy.SHARD_GRAD_OP,
+    ShardingStrategy._HYBRID_SHARD_ZERO2,
+)
+
+
+# NOTE: Since non-self attributes cannot be type annotated, several attributes
+# on `state` are defined first as local variables before being assigned.
+
+
+@no_type_check
+def _init_process_group_state(
+    state: _FSDPState,
+    process_group: ProcessGroupType,
+    sharding_strategy: ShardingStrategy,
+    policy: Optional[_Policy],
+    device_mesh: Optional[DeviceMesh] = None,
+) -> _FSDPState:
+    if process_group is not None and device_mesh is not None:
+        raise ValueError(
+            "Cannot pass both process_group and device_mesh at the "
+            "same time. Please just pass only one of them."
+        )
+    is_hybrid_strategy = sharding_strategy in HYBRID_SHARDING_STRATEGIES
+    if is_hybrid_strategy:
+        if process_group is None and policy is None and device_mesh is None:
+            # Raise an error here, since this is manual wrapping with no process group
+            # passed in, there is no way to ensure all wrapped FSDP instances use the same
+            # process groups.
+            raise ValueError(
+                f"Manual wrapping with {sharding_strategy}",
+                "requires explicit specification of process group or device_mesh.",
+            )
+        else:
+            state = _init_process_group_state_for_hybrid_shard(
+                state, process_group, device_mesh
+            )
+    else:
+        if device_mesh:
+            state._device_mesh = device_mesh
+            state.process_group = device_mesh.get_group(mesh_dim=0)
+        else:
+            state.process_group = (
+                process_group if process_group is not None else _get_default_group()
+            )
+
+    state.rank = state.process_group.rank()
+    state.world_size = state.process_group.size()
+    data_parallel_world_size = state.world_size
+    if is_hybrid_strategy:
+        data_parallel_world_size *= state._inter_node_pg.size()
+    state._gradient_predivide_factor = (
+        default_hooks.DefaultState._get_gradient_predivide_factor(
+            data_parallel_world_size
+        )
+    )
+    state._gradient_postdivide_factor = (
+        data_parallel_world_size / state._gradient_predivide_factor
+    )
+    return state
+
+
+@no_type_check
+def _init_process_group_state_for_hybrid_shard(
+    state: _FSDPState,
+    process_group: ProcessGroupType,
+    device_mesh: DeviceMesh,
+) -> _FSDPState:
+    if device_mesh:
+        if _is_valid_hybrid_shard_device_mesh(device_mesh):
+            state._device_mesh = device_mesh
+            # We currently only allow _inter_node_pg to be the outermost dimension, and the
+            # process_group(intra_node) to be the innermost dimension.
+            state._inter_node_pg = device_mesh.get_group(mesh_dim=0)
+            state.process_group = device_mesh.get_group(mesh_dim=1)
+        else:
+            raise ValueError(
+                "Expected device_mesh to have ndim=2 "
+                f"but got {len(device_mesh.get_group())}"
+            )
+    elif process_group is None:
+        default_group = _get_default_group()
+        intra_node_group, inter_node_group = _init_intra_and_inter_node_groups(
+            default_group, state._device_handle.device_count()
+        )
+        # we shard across intra-node
+        state.process_group = intra_node_group
+        # save _inter_node_pg to allreduce across.
+        state._inter_node_pg = inter_node_group
+    else:
+        # Check type and assign state.process_group and state._inter_node_pg.
+        if _is_valid_hybrid_shard_pg_type(process_group):
+            # Assuming that user passed in as intra node group and inter node group
+            # as documented.
+            state.process_group, state._inter_node_pg = process_group
+        else:
+            raise ValueError(
+                "Expected process_group to be passed in as either None or "
+                f"Tuple[dist.ProcessGroup, dist.ProcessGroup] but got {type(process_group)}"
+            )
+    # Create state for allreduce
+    state._inter_node_state = _get_default_comm_hook_state(
+        process_group=state._inter_node_pg,
+    )
+    return state
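+
+
+# Editor's note: hypothetical usage sketch (not part of the upstream file) of the three
+# accepted process-group specifications handled above for HYBRID_SHARD. The model,
+# the `init_device_mesh` shape, and the group variables are assumptions about the
+# caller's setup, not something this file defines:
+#
+#   # (1) nothing: intra-/inter-node groups are derived from the default group
+#   fsdp_model = FSDP(model, sharding_strategy=ShardingStrategy.HYBRID_SHARD)
+#
+#   # (2) an explicit (intra_node_pg, inter_node_pg) tuple
+#   fsdp_model = FSDP(model, sharding_strategy=ShardingStrategy.HYBRID_SHARD,
+#                     process_group=(intra_node_pg, inter_node_pg))
+#
+#   # (3) a 2D DeviceMesh: dim 0 replicates across nodes, dim 1 shards within a node
+#   mesh = init_device_mesh("cuda", (num_nodes, gpus_per_node))
+#   fsdp_model = FSDP(model, sharding_strategy=ShardingStrategy.HYBRID_SHARD,
+#                     device_mesh=mesh)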
+
+
+@no_type_check
+def _is_valid_hybrid_shard_pg_type(process_group: Any) -> bool:
+    return (
+        isinstance(process_group, tuple)
+        and len(process_group) == 2
+        and all(isinstance(pg, dist.ProcessGroup) for pg in process_group)
+    )
+
+
+@no_type_check
+def _is_valid_hybrid_shard_device_mesh(device_mesh: DeviceMesh) -> bool:
+    return isinstance(device_mesh, DeviceMesh) and device_mesh.ndim == 2
+
+
+@no_type_check
+def _init_intra_node_process_group(num_devices_per_node: int) -> dist.ProcessGroup:
+    """
+    Return a process group across the current node.
+
+    For example, given each row is a distinct node:
+    0 1 2 3 4 5 6 7
+    8 9 10 11 12 13 14 15
+    This API would return an intra-node subgroup across
+    [0, 7] or [8, 15] depending on the process's rank.
+    For example, rank 3 would get [0, 7].
+    """
+    intra_node_subgroup, _ = dist.new_subgroups(num_devices_per_node)
+    return intra_node_subgroup
+
+
+@no_type_check
+def _init_inter_node_process_group(
+    global_process_group: dist.ProcessGroup,
+    num_devices_per_node: int,
+) -> dist.ProcessGroup:
+    """
+    Return an inter-node process group where each contained rank has the same local rank.
+
+    For example, given each row is a distinct node:
+    0 1 2 3 4 5 6 7
+    8 9 10 11 12 13 14 15
+    This API would return inter-node process group {0, 8}, {1, 9}, {2, 10}, and so forth
+    depending on the process's rank. For example, rank 1 would get {1, 9}, rank 5
+    would get {5, 13}.
+    """
+    # the inter-node pg that is returned
+    inter_node_pg = None
+    sharding_backend = dist.get_backend(global_process_group)
+    world_size = dist.get_world_size(global_process_group)
+    # Assuming fully homogeneous setup
+    num_nodes = world_size // num_devices_per_node
+    my_local_rank = dist.get_rank(global_process_group) % num_devices_per_node
+    for local_rank in range(num_devices_per_node):
+        ranks_for_inter_group = [
+            local_rank + (i * num_devices_per_node) for i in range(num_nodes)
+        ]
+        # every rank always needs to call dist.new_group
+        grp = dist.new_group(ranks=ranks_for_inter_group, backend=sharding_backend)
+        if local_rank == my_local_rank:
+            inter_node_pg = grp
+
+    assert (
+        inter_node_pg is not None
+    ), f"{my_local_rank} expected to assign inter-node pg, but did not"
+    return inter_node_pg
+
+
+def _init_intra_and_inter_node_groups(
+    global_process_group: dist.ProcessGroup,
+    num_devices_per_node: int,
+) -> Tuple[dist.ProcessGroup, dist.ProcessGroup]:
+    """
+    Initialize intra and inter-node process groups and return the ones corresponding to this process's rank.
+
+    This function can be used to initialize process groups for ``HYBRID_SHARD`` or
+    ``_HYBRID_SHARD_ZERO2`` in FSDP.
+    This function assumes each node has an equal number of CUDA-enabled devices.
+    Returns:
+        Tuple[dist.ProcessGroup, dist.ProcessGroup]: Intra and inter-node process group.
+    """
+    return (
+        _init_intra_node_process_group(num_devices_per_node),
+        _init_inter_node_process_group(global_process_group, num_devices_per_node),
+    )
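+
+
+# Editor's note: worked example (not part of the upstream file), assuming 2 nodes with
+# 8 GPUs each (global ranks 0-7 on node 0, 8-15 on node 1). For global rank 10:
+#
+#   my_local_rank = 10 % 8                 # == 2
+#   intra-node group -> {8, 9, ..., 15}    # sharding / all-gather within the node
+#   inter-node group -> {2, 10}            # gradient all-reduce across nodes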
+
+
+@no_type_check
+def _init_ignored_module_states(
+    state: _FSDPState,
+    module: nn.Module,
+    ignored_modules: Optional[Iterable[torch.nn.Module]],
+    ignored_states: Union[
+        Optional[Iterable[torch.nn.Parameter]], Optional[Iterable[torch.nn.Module]]
+    ] = None,
+) -> _FSDPState:
+    if ignored_modules is not None and ignored_states is not None:
+        raise ValueError(
+            "Cannot pass both ignored_modules and ignored_states at the "
+            "same time. Please just pass ignored_states."
+        )
+    ignored_parameters = None
+    passed_as_ignored_states = ignored_states is not None
+    if passed_as_ignored_states:
+        ignored_states_list = list(ignored_states)
+        _check_ignored_states(ignored_states_list, True)
+    else:
+        ignored_states_list = []
+        _check_ignored_states(
+            list(ignored_modules) if ignored_modules is not None else [], False
+        )
+    if len(ignored_states_list) > 0:
+        if isinstance(ignored_states_list[0], nn.Parameter):
+            ignored_parameters = ignored_states_list
+        else:
+            ignored_modules = ignored_states_list
+    state._ignored_modules = _get_ignored_modules(module, ignored_modules)
+    state._ignored_params = _get_ignored_params(
+        module,
+        state._ignored_modules,
+        ignored_parameters,
+    )
+    state._ignored_buffer_names = _get_ignored_buffer_names(
+        module,
+        state._ignored_modules,
+    )
+    # TODO: FSDP's contract for buffers is not well-defined. They are
+    # implicitly ignored for most functionality since they are not sharded;
+    # however, FSDP still imposes some semantics on buffers (e.g. buffer mixed
+    # precision). We should formalize this contract and decide if we need to
+    # compute and store `_ignored_buffers`.
+    return state
+
+
+def _check_ignored_states(
+    ignored_states: List[Any], passed_as_ignored_states: bool
+) -> None:
+    """
+    Check that the ignored states are uniformly parameters or uniformly modules.
+
+    We may remove this check in the future if we permit mixing.
+    """
+    if len(ignored_states) == 0:
+        return
+    if passed_as_ignored_states:
+        all_params = all(isinstance(state, nn.Parameter) for state in ignored_states)
+        all_modules = all(isinstance(state, nn.Module) for state in ignored_states)
+        if not all_params and not all_modules:
+            # Sort for consistent ordering for unit test regex matching
+            sorted_types = sorted({type(state) for state in ignored_states}, key=repr)
+            raise ValueError(
+                "ignored_states expects all nn.Parameter or all nn.Module list "
+                f"elements but got types {sorted_types}"
+            )
+    else:
+        if not all(isinstance(state, nn.Module) for state in ignored_states):
+            sorted_types = sorted({type(state) for state in ignored_states}, key=repr)
+            raise ValueError(
+                "ignored_modules expects nn.Module list elements but got "
+                f"types {sorted_types}"
+            )
+
+
+@no_type_check
+def _init_device_handle(
+    state: _FSDPState,
+    module: nn.Module,
+    ignored_params: Set[nn.Parameter],
+    device_id: Optional[Union[int, torch.device]],
+) -> _FSDPState:
+    """
+    Determine device handle used for initializing FSDP.
+
+    If a device is specified by ``device_id``, then this returns the device
+    handle corresponding to that device type. Otherwise, if the module is
+    already on a non-CPU device, then the device type is that non-CPU device
+    type. If the module is on CPU or meta, then the device type is the current
+    CUDA device.
+
+    This method is called once the ignored parameters have been determined, as
+    the device handle may be needed for other initialization.
+    """
+    determined_device = None
+    if device_id is not None:
+        determined_device = (
+            device_id
+            if isinstance(device_id, torch.device)
+            else torch.device(device_id)
+        )
+    if determined_device is None:
+        for param in _get_orig_params(module, ignored_params):
+            if param.device.type in {"cpu", "meta"}:
+                continue
+            if determined_device is None:
+                determined_device = param.device
+            else:
+                if param.device.type != determined_device.type:
+                    raise RuntimeError(
+                        f"FSDP does not support modules with different device types "
+                        f"but got params on {determined_device.type} and {param.device.type}"
+                    )
+        determined_device = determined_device or torch.device(
+            "cuda", torch.cuda.current_device()
+        )
+
+    state._device_handle = _FSDPDeviceHandle.from_device(determined_device)
+    return state
+
+
+@no_type_check
+def _init_buffer_state(
+    state: _FSDPState,
+    module: nn.Module,
+) -> _FSDPState:
+    state._buffer_names = _get_buffer_names(module)
+    # Save a mapping from clean fully-qualified buffer name (starting from
+    # `module`) to its original dtype for restoring that dtype during model
+    # checkpointing when buffer mixed precision is enabled. The names should
+    # be clean since the casting happens in a `summon_full_params()` context.
+    _buffer_name_to_orig_dtype: Dict[str, torch.dtype] = {}
+    for buffer_name, buffer in module.named_buffers():
+        buffer_name = clean_tensor_name(buffer_name)
+        _buffer_name_to_orig_dtype[buffer_name] = buffer.dtype
+    state._buffer_name_to_orig_dtype = _buffer_name_to_orig_dtype
+    return state
+
+
+@no_type_check
+def _init_core_state(
+    state: _FSDPState,
+    sharding_strategy: Optional[ShardingStrategy],
+    mixed_precision: Optional[MixedPrecision],
+    cpu_offload: Optional[CPUOffload],
+    limit_all_gathers: bool,
+    use_orig_params: bool,
+    backward_prefetch_limit: int,
+    forward_prefetch_limit: int,
+) -> _FSDPState:
+    # We clamp the strategy to `NO_SHARD` for world size of 1 since they are
+    # currently functionally equivalent. This may change if/when we integrate
+    # FSDP with MoE.
+    if state.world_size == 1:
+        if sharding_strategy != ShardingStrategy.NO_SHARD:
+            warnings.warn(
+                "FSDP is switching to use `NO_SHARD` instead of "
+                f"{sharding_strategy or ShardingStrategy.FULL_SHARD} since "
+                "the world size is 1."
+            )
+        sharding_strategy = ShardingStrategy.NO_SHARD
+    elif sharding_strategy == ShardingStrategy.NO_SHARD:
+        warnings.warn(
+            "The `NO_SHARD` sharding strategy is deprecated. If having issues, "
+            "please use DistributedDataParallel instead.",
+            # Level 1 is here, level 2 is from `FullyShardedDataParallel`, and
+            # level 3 is from the true caller
+            stacklevel=3,
+        )
+    state.sharding_strategy = sharding_strategy or ShardingStrategy.FULL_SHARD
+    state.mixed_precision = mixed_precision or MixedPrecision()
+    if mixed_precision is not None:
+        torch._C._log_api_usage_once(
+            f"torch.distributed.fsdp.mixed_precision.{str(state.mixed_precision)}"
+        )
+    state._use_full_prec_in_eval = (
+        os.environ.get(_FSDP_USE_FULL_PREC_IN_EVAL, "") == "1"
+    )
+    state.cpu_offload = cpu_offload or CPUOffload()
+    state.limit_all_gathers = limit_all_gathers
+    state._use_orig_params = use_orig_params
+    state.training_state = TrainingState.IDLE
+    state._is_root = None
+    state._free_event_queue = _FreeEventQueue()
+    state._debug_level = dist.get_debug_level()
+    state._exec_order_data = exec_order_utils._ExecOrderData(
+        state._debug_level,
+        backward_prefetch_limit,
+        forward_prefetch_limit,
+    )
+    # Mapping from fully sharded module to the handles it is responsible to
+    # unshard and reshard (see [Note: Fully Sharded Module])
+    _fully_sharded_module_to_handle: Dict[nn.Module, FlatParamHandle] = dict()
+    state._fully_sharded_module_to_handle = _fully_sharded_module_to_handle
+    # Invariant: `state.params` contains exactly the `FlatParameter`s of the
+    # handles in `state._handle`
+    _handle: FlatParamHandle = None
+    state._handle = _handle
+    params: List[FlatParameter] = []
+    state.params = params
+    return state
+
+
+@no_type_check
+def _init_runtime_state(
+    state: _FSDPState,
+) -> _FSDPState:
+    _root_pre_forward_handles: List[RemovableHandle] = []
+    state._root_pre_forward_handles = _root_pre_forward_handles
+    _pre_forward_handles: List[RemovableHandle] = []
+    state._pre_forward_handles = _pre_forward_handles
+    _post_forward_handles: List[RemovableHandle] = []
+    state._post_forward_handles = _post_forward_handles
+    state._sync_gradients = True
+    state._comm_hook = None
+    state._comm_hook_state = None
+    # Used to prevent running the pre-backward hook multiple times
+    return state
+
+
+@no_type_check
+def _init_prefetching_state(
+    state: _FSDPState,
+    backward_prefetch: BackwardPrefetch,
+    forward_prefetch: bool,
+) -> _FSDPState:
+    state.backward_prefetch = backward_prefetch
+    state.forward_prefetch = forward_prefetch
+    # The data structures use tuples of handles to generalize over the case
+    # where a module's forward involves multiple handles.
+    return state
+
+
+@no_type_check
+def _init_extension(state: _FSDPState, device_mesh: DeviceMesh = None) -> _FSDPState:
+    # TODO: we need to add additional check once we support FSDP + PiPPy.
+    # This check is currently sufficient, since we only support FSDP + TP.
+    if device_mesh and _mesh_resources.get_parent_mesh(state._device_mesh) is not None:
+        state._fsdp_extension = DTensorExtensions(state._device_handle)
+    else:
+        # We need to explicitly set _fsdp_extension to None.
+        # Otherwise, we will run into an infinite recursion when getting the attribute.
+        state._fsdp_extension = None
+    return state
+
+
+@no_type_check
+def _init_state_dict_state(state: _FSDPState) -> _FSDPState:
+    state._state_dict_type = StateDictType.FULL_STATE_DICT
+    state_dict_config: StateDictConfig = FullStateDictConfig()
+    state._optim_state_dict_config = FullOptimStateDictConfig()
+    state._state_dict_config = state_dict_config
+    unshard_params_ctx: Dict[nn.Module, Generator] = {}
+    state._unshard_params_ctx = unshard_params_ctx
+
+    return state
+
+
+@no_type_check
+def _init_param_handle_from_module(
+    state: _FSDPState,
+    fully_sharded_module: nn.Module,
+    device_id: Optional[Union[int, torch.device]],
+    param_init_fn: Optional[Callable[[nn.Module], None]],
+    sync_module_states: bool,
+) -> _FSDPState:
+    """Initialize a ``FlatParamHandle`` from a module ``fully_sharded_module``."""
+    _check_single_device_module(fully_sharded_module, state._ignored_params, device_id)
+    device_from_device_id = _get_device_from_device_id(device_id, state.rank)
+    is_meta_module, is_torchdistX_deferred_init = _need_to_materialize_module(
+        fully_sharded_module, state._ignored_params, state._ignored_modules
+    )
+    # Materialize the module if needed
+    if (is_meta_module or is_torchdistX_deferred_init) and param_init_fn is not None:
+        _materialize_with_param_init_fn(
+            fully_sharded_module, param_init_fn, state._ignored_modules
+        )
+    elif is_meta_module:
+        _materialize_meta_module(
+            fully_sharded_module, device_id, state._ignored_modules
+        )
+    elif is_torchdistX_deferred_init:
+        deferred_init.materialize_module(
+            fully_sharded_module,
+            check_fn=lambda submodule: _get_module_fsdp_state(submodule) is None
+            and submodule not in state._ignored_modules,
+        )
+
+    ignored_buffers = {
+        buffer
+        for ignored_module in state._ignored_modules
+        for buffer in ignored_module.buffers()
+    }
+
+    _move_module_to_device(
+        fully_sharded_module,
+        state._ignored_params,
+        ignored_buffers,
+        device_from_device_id,
+    )
+    state.compute_device = _get_compute_device(
+        fully_sharded_module,
+        state._ignored_params,
+        device_from_device_id,
+        state.rank,
+    )
+
+    managed_params = list(_get_orig_params(fully_sharded_module, state._ignored_params))
+    if sync_module_states:
+        _sync_module_params_and_buffers(
+            fully_sharded_module, managed_params, state.process_group
+        )
+        if state.sharding_strategy in HYBRID_SHARDING_STRATEGIES:
+            _sync_module_params_and_buffers(
+                fully_sharded_module, managed_params, state._inter_node_pg
+            )
+    _init_param_handle_from_params(state, managed_params, fully_sharded_module)
+    return state
+
+
+@no_type_check
+def _init_param_handle_from_params(
+    state: _FSDPState,
+    params: List[nn.Parameter],
+    fully_sharded_module: nn.Module,
+):
+    if len(params) == 0:
+        return
+    handle = FlatParamHandle(
+        params,
+        fully_sharded_module,
+        state.compute_device,
+        SHARDING_STRATEGY_MAP[state.sharding_strategy],
+        state.cpu_offload.offload_params,
+        state.mixed_precision.param_dtype,
+        state.mixed_precision.reduce_dtype,
+        state.mixed_precision.keep_low_precision_grads,
+        state.process_group,
+        state._use_orig_params,
+        fsdp_extension=state._fsdp_extension,
+    )
+    handle.shard()
+    assert not state._handle
+    state.params.append(handle.flat_param)
+    state._handle = handle
+    state._fully_sharded_module_to_handle[handle._fully_sharded_module] = handle
+    cpu_device = torch.device("cpu")
+    if state.cpu_offload.offload_params and handle.flat_param.device != cpu_device:
+        handle.flat_param_to(cpu_device)
+
+
+def _get_ignored_modules(
+    root_module: nn.Module,
+    _ignored_modules: Optional[Iterable[torch.nn.Module]],
+) -> Set[nn.Module]:
+    """
+    Check that ``_ignored_modules`` is an iterable of ``nn.Module`` s without any FSDP instances.
+
+    Return the modules contained in their module
+    subtrees as a :class:`set`. Nested FSDP instances are excluded, but their
+    already-computed ignored modules are included.
+
+    ``_ignored_modules`` represents the argument passed by the user to FSDP.
+    """
+    msg_prefix = "`ignored_modules` should be an iterable of `torch.nn.Module`s "
+    try:
+        ignored_root_modules = (
+            set(_ignored_modules) if _ignored_modules is not None else set()
+        )
+    except TypeError as e:
+        raise TypeError(msg_prefix + f"but got {type(_ignored_modules)}") from e
+    for module in ignored_root_modules:
+        if not isinstance(module, torch.nn.Module):
+            raise TypeError(msg_prefix + f"but got an iterable with {type(module)}")
+        if _get_module_fsdp_state(module):
+            # TODO: We may relax this by taking the FSDP instance's wrapped
+            # module to provide more flexibility to the user.
+            raise ValueError("`ignored_modules` should not include FSDP modules")
+    # Treat modules that cannot compose with `fully_shard` as ignored modules,
+    # meaning that their subtrees are ignored
+    for module in root_module.modules():
+        if not traversal_utils._composable(module):
+            ignored_root_modules.add(module)
+    # NOTE: Even if `ignored_root_modules` is empty, do not return early so
+    # that this FSDP instance can get any ignored modules from its children.
+
+    # Include child modules and exclude nested FSDP modules themselves
+    ignored_modules = {
+        child
+        for module in ignored_root_modules
+        for child in module.modules()
+        if not isinstance(child, fsdp_file.FullyShardedDataParallel)
+    }
+    if root_module in ignored_modules:
+        warnings.warn(
+            "Trying to ignore the top-level module passed into the FSDP "
+            "constructor itself will result in all parameters being "
+            f"ignored and is not well-supported: {module}"
+        )
+    # Include nested FSDP modules' ignored modules
+    for submodule in root_module.modules():
+        optional_fsdp_state = _get_module_fsdp_state(submodule)
+        if optional_fsdp_state is not None:
+            assert hasattr(optional_fsdp_state, "_ignored_modules")
+            ignored_modules.update(optional_fsdp_state._ignored_modules)
+    return ignored_modules
+
+
+def _get_ignored_params(
+    root_module: torch.nn.Module,
+    ignored_modules: Set[torch.nn.Module],
+    ignored_parameters: Optional[Iterable[torch.nn.Parameter]] = None,
+) -> Set[torch.nn.Parameter]:
+    """
+    Return the parameters of the modules in ``ignored_modules`` and the parameters in ``ignored_parameters``.
+
+    :class:`FlatParameter` s are excluded from the result.
+    """
+    all_ignored_params: Set[torch.nn.Parameter] = set()
+
+    params_in_ignored_modules = {
+        p for m in ignored_modules for p in m.parameters() if not _is_fsdp_flattened(p)
+    }
+
+    all_ignored_params.update(params_in_ignored_modules)
+
+    if ignored_parameters is not None:
+        params_in_ignored_parameters = {
+            p for p in ignored_parameters if not _is_fsdp_flattened(p)
+        }
+        all_ignored_params.update(params_in_ignored_parameters)
+
+    # Always include nested FSDP modules' ignored parameters
+    for submodule in root_module.modules():
+        optional_fsdp_state = _get_module_fsdp_state(submodule)
+        if optional_fsdp_state is not None:
+            assert hasattr(optional_fsdp_state, "_ignored_params")
+            all_ignored_params.update(optional_fsdp_state._ignored_params)
+
+    return all_ignored_params
+
+
+def _get_ignored_buffer_names(
+    root_module: torch.nn.Module,
+    ignored_modules: Set[torch.nn.Module],
+) -> Set[str]:
+    """Return the cleaned buffer FQNs in ``ignored_modules``."""
+    all_ignored_buffer_names: Set[str] = set()
+
+    buffers_in_ignored_modules = {
+        buffer for m in ignored_modules for buffer in m.buffers()
+    }
+
+    all_ignored_buffer_names.update(
+        {
+            clean_tensor_name(buffer_name)
+            for buffer_name, buffer in root_module.named_buffers()
+            if buffer in buffers_in_ignored_modules
+        }
+    )
+
+    # Always include nested FSDP modules' ignored buffer names
+    for submodule in root_module.modules():
+        optional_fsdp_state = _get_module_fsdp_state(submodule)
+        if optional_fsdp_state is not None:
+            assert hasattr(optional_fsdp_state, "_ignored_buffer_names")
+            all_ignored_buffer_names.update(optional_fsdp_state._ignored_buffer_names)
+
+    return all_ignored_buffer_names
+
+
+def _get_buffer_names(root_module: nn.Module) -> Set[str]:
+    """Return the fully prefixed names of all buffers in the module hierarchy rooted at ``root_module`` as a class:`set`."""
+    return {
+        clean_tensor_name(buffer_name) for buffer_name, _ in root_module.named_buffers()
+    }
+
+
+def _check_single_device_module(
+    module: nn.Module,
+    ignored_params: Set[nn.Parameter],
+    device_id: Optional[Union[int, torch.device]],
+) -> None:
+    """
+    Raise an error if ``module`` has original parameters on multiple devices, ignoring the parameters in ``ignored_params``.
+
+    Thus, after this method, the
+    module must be either fully on the CPU or fully on a non-CPU device.
+    """
+    devices = {param.device for param in _get_orig_params(module, ignored_params)}
+    # We allow module to be partially on CPU and partially on GPU if device_id is not
+    # None, since the device_id arg will result in the CPU portion being moved to
+    # GPU. This is useful in cases where part of the module may be parallelized
+    # by another algorithm and may already be on GPU. We'd like to enforce device_id
+    # to not be None, otherwise we'd flatten parameters in a mixed module which is
+    # not supported.
+    if len(devices) == 2 and torch.device("cpu") in devices:
+        if device_id is None:
+            raise RuntimeError(
+                "To support a module with both CPU and GPU params, "
+                "please pass in device_id argument."
+            )
+    elif len(devices) > 1:
+        raise RuntimeError(
+            f"FSDP only supports single device modules but got params on {devices}"
+        )
+
+
+def _get_device_from_device_id(
+    device_id: Optional[Union[int, torch.device]],
+    rank: int,
+) -> Optional[torch.device]:
+    """
+    Return a ``torch.device`` for the specified ``device_id``.
+
+    Processes ``device_id`` and returns either the corresponding device or
+    ``None`` if ``device_id`` is ``None``.
+    """
+    if device_id is None:
+        return None
+    device = (
+        device_id if isinstance(device_id, torch.device) else torch.device(device_id)
+    )
+    if device == torch.device("cuda"):
+        warnings.warn(
+            f"FSDP got the argument `device_id` {device_id} on rank "
+            f"{rank}, which does not have an explicit index. "
+            f"FSDP will use the current device {torch.cuda.current_device()}. "
+            "If this is incorrect, please explicitly call `torch.cuda.set_device()` "
+            "before FSDP initialization or pass in the explicit device "
+            "index as the `device_id` argument."
+        )
+        device = torch.device("cuda", torch.cuda.current_device())
+    return device
+
+
+def _need_to_materialize_module(
+    module: nn.Module,
+    ignored_params: Set[nn.Parameter],
+    ignored_modules: Set[nn.Module],
+) -> Tuple[bool, bool]:
+    """
+    Return if ``module`` has parameters on meta device and if ``module`` is using torchdistX deferred initialization.
+
+    At most one of the returned bools can
+    be ``True``. If either is ``True``, then ``module`` needs to be
+    materialized.
+    """
+    managed_params = list(_get_orig_params(module, ignored_params))
+    is_meta_module = any(param.is_meta for param in managed_params)
+    # TODO: We need to establish a contract for FSDP and buffers. For now, we
+    # skip checking for meta buffers from ignored modules. We should consider
+    # refactoring the initialization holistically to avoid so many traversals.
+    for submodule in module.modules():
+        if submodule in ignored_modules:
+            continue
+        for buf in submodule.buffers(recurse=False):
+            is_meta_module |= buf.is_meta
+    is_torchdistX_deferred_init = (
+        not is_meta_module
+        and _TORCHDISTX_AVAIL
+        and any(fake.is_fake(param) for param in managed_params)
+    )
+    return is_meta_module, is_torchdistX_deferred_init
+
+
+def _materialize_with_param_init_fn(
+    root_module: nn.Module,
+    param_init_fn: Callable[[nn.Module], None],
+    ignored_modules: Set[nn.Module],
+) -> None:
+    if not callable(param_init_fn):
+        raise ValueError(
+            f"Expected {param_init_fn} to be callable but got {type(param_init_fn)}"
+        )
+    modules_to_materialize = _get_modules_to_materialize(root_module, ignored_modules)
+    for module in modules_to_materialize:
+        param_init_fn(module)
+
+
+def _materialize_meta_module(
+    root_module: nn.Module,
+    device_from_device_id: Optional[torch.device],
+    ignored_modules: Set[nn.Module],
+):
+    # Run default meta device initialization
+    materialization_device = device_from_device_id or torch.device(
+        torch.cuda.current_device()
+    )
+    modules_to_materialize = _get_modules_to_materialize(root_module, ignored_modules)
+    try:
+        # Assume that each module's `reset_parameters()` only initializes its
+        # own parameters and not those of its children
+        with torch.no_grad():
+            for module in modules_to_materialize:
+                # As a contract to the user, only call `reset_parameters()` if
+                # the module has directly managed parameters/buffers
+                module_state_iter = itertools.chain(
+                    module.parameters(recurse=False), module.buffers(recurse=False)
+                )
+                has_module_states = len(list(module_state_iter)) > 0
+                if has_module_states:
+                    module.to_empty(device=materialization_device, recurse=False)
+                    module.reset_parameters()  # type: ignore[operator]
+    except BaseException as e:
+        warnings.warn(
+            "Unable to call `reset_parameters()` for module on meta "
+            f"device with error {str(e)}. Please ensure that your module of"
+            f"type {type(module)} implements a `reset_parameters()` method."  # type: ignore[possibly-undefined]
+        )
+        raise e
+
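+
+# Illustrative sketch of the default materialization path above: a meta-device
+# module gets real (uninitialized) storage via `to_empty()` and is then
+# initialized by its own `reset_parameters()`. CPU is used here so the sketch
+# runs without a GPU; FSDP would normally use the compute device instead.
+def _example_materialize_meta_linear() -> None:
+    import torch
+    import torch.nn as nn
+
+    mod = nn.Linear(4, 4, device="meta")
+    mod.to_empty(device=torch.device("cpu"), recurse=False)
+    with torch.no_grad():
+        mod.reset_parameters()
+    assert not mod.weight.is_meta and mod.weight.device.type == "cpu"
+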
+
+def _get_modules_to_materialize(
+    root_module: nn.Module, ignored_modules: Set[nn.Module]
+) -> List[nn.Module]:
+    # Run BFS to collect the modules to materialize via `reset_parameters()`,
+    # stopping at any module with FSDP already applied or at ignored modules.
+    modules_to_materialize: List[nn.Module] = []
+    queue = collections.deque([root_module])
+    visited_modules: Set[nn.Module] = {root_module}
+    while queue:
+        module = queue.popleft()
+        modules_to_materialize.append(module)
+        for child_module in module.children():
+            if (
+                child_module not in visited_modules
+                and _get_module_fsdp_state(child_module) is None
+                and child_module not in ignored_modules
+            ):
+                visited_modules.add(child_module)
+                queue.append(child_module)
+    return modules_to_materialize
+
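+
+# Illustrative sketch of the BFS above on a toy model: every submodule is
+# collected except ignored modules and anything reachable only through them.
+# The toy model and the ignored set are made up for the example.
+def _example_bfs_collect_modules() -> None:
+    import collections
+    import torch.nn as nn
+
+    toy = nn.Sequential(nn.Linear(4, 4), nn.Sequential(nn.ReLU(), nn.Linear(4, 2)))
+    ignored = {toy[1][0]}  # pretend the ReLU is an ignored module
+    collected = []
+    queue = collections.deque([toy])
+    visited = {toy}
+    while queue:
+        module = queue.popleft()
+        collected.append(module)
+        for child in module.children():
+            if child not in visited and child not in ignored:
+                visited.add(child)
+                queue.append(child)
+    # The ignored ReLU is skipped, but its sibling Linear is still collected.
+    assert toy[1][0] not in collected and toy[1][1] in collected
+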
+
+def _move_module_to_device(
+    module: nn.Module,
+    ignored_params: Set[nn.Parameter],
+    ignored_buffers: Set[torch.Tensor],
+    device_from_device_id: Optional[torch.device],
+) -> None:
+    """
+    Move ``module`` depending on ``device_from_device_id`` and its current device.
+
+    This includes moving ignored modules' parameters.
+
+    - If ``device_from_device_id`` is not ``None``, then this moves
+    ``module`` to the device.
+    - If ``device_from_device_id`` is ``None``, then this does not move
+    ``module`` but warns the user if it is on CPU.
+
+    Precondition: ``_check_single_device_module()``.
+    """
+    cpu_device = torch.device("cpu")
+    if device_from_device_id is not None:
+        # BFS from `module` without traversing any nested FSDP instances to
+        # collect the parameters/buffers that have not yet been managed
+        queue: Deque[nn.Module] = collections.deque()
+        queue.append(module)
+        params: List[nn.Parameter] = []
+        buffers: List[torch.Tensor] = []
+        while queue:
+            curr_module = queue.popleft()
+            # NOTE: We include a check to only move parameters/buffers that are
+            # on CPU device. If they are on a CUDA device different from the
+            # one specified by `device_id`, then this does NOT move them. This
+            # is so that we can raise an error in `_get_compute_device()`.
+            params.extend(
+                param
+                for param in curr_module.parameters(recurse=False)
+                if param.device == cpu_device
+            )
+            buffers.extend(
+                buffer
+                for buffer in curr_module.buffers(recurse=False)
+                if buffer.device == cpu_device
+            )
+            for submodule in curr_module.children():
+                if not isinstance(submodule, fsdp_file.FullyShardedDataParallel):
+                    queue.append(submodule)
+        params_to_move = [p for p in params if p not in ignored_params]
+        bufs_to_move = [p for p in buffers if p not in ignored_buffers]
+        _move_states_to_device(params_to_move, bufs_to_move, device_from_device_id)
+        return
+    param = next(_get_orig_params(module, ignored_params), None)
+    if param is not None and param.device == cpu_device:
+        _warn_cpu_init()
+
+
+def _move_states_to_device(
+    params: List[nn.Parameter],
+    buffers: List[torch.Tensor],
+    device_from_device_id: Optional[torch.device],
+) -> None:
+    """
+    Move states to the specified device.
+
+    Precondition: ``_check_single_device_module()`` and module's parameters and
+    buffers have been materialized if needed.
+    """
+    if len(params) == 0 and len(buffers) == 0:
+        return
+    if len(params) > 0:
+        current_device = params[0].device
+    elif len(buffers) > 0:
+        current_device = buffers[0].device
+    cpu_device = torch.device("cpu")
+    if device_from_device_id is not None:
+        # Move the parameters and buffers like the `.data` code path in
+        # `nn.Module._apply()`, which underlies `nn.Module.to()`
+        for param in params:
+            with torch.no_grad():
+                param.data = param.to(device_from_device_id)
+                if param.grad is not None:
+                    param.grad.data = param.grad.to(device_from_device_id)
+        for buffer in buffers:
+            buffer.data = buffer.to(device_from_device_id)
+    elif current_device == cpu_device:  # type: ignore[possibly-undefined]
+        _warn_cpu_init()
+
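+
+# Illustrative sketch: swapping a parameter's `.data` in place, as done above,
+# keeps the `nn.Parameter` object (and any external references to it) intact.
+# A dtype change is used here so the sketch runs on CPU-only machines; the
+# device move above follows the same pattern.
+def _example_inplace_data_swap() -> None:
+    import torch
+    import torch.nn as nn
+
+    lin = nn.Linear(2, 2)
+    ref = lin.weight  # external reference, e.g. one held by an optimizer
+    with torch.no_grad():
+        lin.weight.data = lin.weight.to(torch.float64)
+    assert lin.weight is ref  # same Parameter object, new storage
+    assert lin.weight.dtype == torch.float64
+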
+
+def _warn_cpu_init():
+    warnings.warn(
+        "The passed-in `module` is on CPU and will thus have FSDP's sharding "
+        "initialization run on CPU, which may be slower than on GPU. We "
+        "recommend passing in the `device_id` argument for FSDP to move "
+        "`module` to GPU for the sharding initialization. `module` must also "
+        "be on GPU device to work with the `sync_module_states=True` flag "
+        "since that requires GPU communication."
+    )
+
+
+def _get_compute_device(
+    module: nn.Module,
+    ignored_params: Set[nn.Parameter],
+    device_from_device_id: Optional[torch.device],
+    rank: int,
+) -> torch.device:
+    """
+    Determine and return this FSDP instance's compute device.
+
+    If a device is specified by ``device_id``, then this returns that device.
+    Otherwise, if the
+    module is already on a non-CPU device, then the compute device is that non-CPU
+    device. If the module is on CPU, then the compute device is the current
+    device.
+
+    Since this method should be called after materializing the module, any
+    non-CPU device should not be meta device. For now, the compute device is
+    always a CUDA GPU device with its explicit index.
+
+    Precondition: ``_check_single_device_module()`` and
+    ``_move_module_to_device()``.
+    """
+    param = next(_get_orig_params(module, ignored_params), None)
+    if param is not None and param.device.type != "cpu":
+        compute_device = param.device  # Determined by model param placement
+    else:
+        if device_from_device_id is not None and device_from_device_id.type != "cuda":
+            compute_device = device_from_device_id  # Determined by custom backend
+        else:
+            compute_device = torch.device("cuda", torch.cuda.current_device())
+    if device_from_device_id is not None and compute_device != device_from_device_id:
+        raise ValueError(
+            f"Inconsistent compute device and `device_id` on rank {rank}: "
+            f"{compute_device} vs {device_from_device_id}"
+        )
+    return compute_device
+
+
+# TODO: See how to deprecate!
+def _sync_module_params_and_buffers(
+    module: nn.Module,
+    params: List[nn.Parameter],
+    process_group: dist.ProcessGroup,
+) -> None:
+    """
+    Synchronize module states (i.e. parameters ``params`` and all not-yet-synced buffers) by broadcasting from rank 0 to all ranks.
+
+    Precondition: ``sync_module_states == True`` and ``self.process_group`` has
+    been set.
+    """
+    module_states: List[torch.Tensor] = []
+    for buffer in module.buffers():
+        # Avoid re-synchronizing buffers in case of nested wrapping
+        if not getattr(buffer, FSDP_SYNCED, False):
+            setattr(buffer, FSDP_SYNCED, True)
+            detached_buffer = buffer.detach()
+            if is_traceable_wrapper_subclass(detached_buffer):
+                # NOTE: Here we assume no nested subclasses, at most one level of subclass
+                # in both model's buffers and params
+                attrs, _ = detached_buffer.__tensor_flatten__()  # type: ignore[attr-defined]
+                inner_buffers = [getattr(detached_buffer, attr) for attr in attrs]
+                module_states.extend(inner_buffers)
+            else:
+                module_states.append(detached_buffer)
+
+    for param in params:
+        detached_param = param.detach()
+        if is_traceable_wrapper_subclass(detached_param):
+            attrs, _ = detached_param.__tensor_flatten__()  # type: ignore[attr-defined]
+            inner_params = [getattr(detached_param, attr) for attr in attrs]
+            module_states.extend(inner_params)
+        else:
+            module_states.append(detached_param)
+
+    _check_module_states_for_sync_module_states(module_states)
+    _sync_params_and_buffers(
+        process_group,
+        module_states,
+        PARAM_BROADCAST_BUCKET_SIZE,
+        src=0,
+    )
+
+
+def _sync_module_states(
+    params: List[nn.Parameter],
+    buffers: List[torch.Tensor],
+    process_group: dist.ProcessGroup,
+) -> None:
+    # Assumes that each call to this method passes in disjoint `params`
+    # and `buffers` across calls, so there is no chance of re-synchronizing
+    params_and_buffers = [param.detach() for param in params] + [
+        buffer.detach() for buffer in buffers
+    ]
+    _check_module_states_for_sync_module_states(params_and_buffers)
+    _sync_params_and_buffers(
+        process_group,
+        params_and_buffers,
+        PARAM_BROADCAST_BUCKET_SIZE,
+        src=0,
+    )
+
+
+def _check_module_states_for_sync_module_states(
+    module_states: List[torch.Tensor],
+) -> None:
+    if module_states and any(
+        tensor.device == torch.device("cpu") for tensor in module_states
+    ):
+        raise ValueError(
+            "The module has CPU parameters or buffers when `sync_module_states=True`, "
+            "which requires them to be on GPU. Please specify the `device_id` argument "
+            "or move the module to GPU before passing it to FSDP."
+        )
+
+
+def _get_orig_params(
+    module: nn.Module,
+    ignored_params: Set[nn.Parameter],
+) -> Iterator[nn.Parameter]:
+    """
+    Return an iterator over the original parameters in ``module``.
+
+    The iterator does not return
+    the parameters in ``ignored_params``, any ``FlatParameter`` s (which may be
+    present due to nested FSDP wrapping), or any original parameters already
+    flattened (only relevant when ``use_orig_params=True``).
+    """
+    param_gen = module.parameters()
+    try:
+        while True:
+            param = next(param_gen)
+            if param not in ignored_params and not _is_fsdp_flattened(param):
+                yield param
+    except StopIteration:
+        pass
+
+
+def _check_orig_params_flattened(
+    fsdp_module,
+    ignored_params: Set[nn.Parameter],
+) -> None:
+    """
+    Check that original parameters in ``fsdp_module`` have been flattened.
+
+    The flattened parameters are made
+    invisible to ``named_parameters()`` for the module hierarchy rooted at
+    ``fsdp_module``. This should be called as a sanity check after flattening
+    the wrapped module's parameters.
+    """
+    for param_name, param in _named_parameters_with_duplicates(fsdp_module):
+        if param not in ignored_params and not _is_fsdp_flattened(param):
+            raise RuntimeError(
+                f"Found an unflattened parameter: {param_name}; "
+                f"{param.size()} {param.__class__}"
+            )
+
+
+def _get_default_comm_hook(sharding_strategy: ShardingStrategy):
+    return (
+        default_hooks.allreduce_hook
+        if sharding_strategy == ShardingStrategy.NO_SHARD
+        else default_hooks.reduce_scatter_hook
+    )
+
+
+def _get_default_comm_hook_state(
+    process_group: dist.ProcessGroup,
+) -> default_hooks.DefaultState:
+    return default_hooks.DefaultState(process_group=process_group)
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/_limiter_utils.py b/MLPY/Lib/site-packages/torch/distributed/fsdp/_limiter_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..01b37ad626fa1b2029179d5f07a43f602ecff953
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/fsdp/_limiter_utils.py
@@ -0,0 +1,33 @@
+import collections
+from typing import Deque, Optional
+
+import torch
+
+
+class _FreeEventQueue:
+    """
+    This tracks all pending frees corresponding to inflight all-gathers. The
+    queueing pattern is iterative enqueues with a single dequeue per iteration
+    once the limit ``_max_num_inflight_all_gathers`` is reached.
+    """
+
+    def __init__(self) -> None:
+        self._queue: Deque[torch.cuda.Event] = collections.deque()
+        self._max_num_inflight_all_gathers = 2  # empirically chosen
+
+    def enqueue(self, free_event: torch.cuda.Event) -> None:
+        """Enqueues a free event."""
+        self._queue.append(free_event)
+
+    def dequeue_if_needed(self) -> Optional[torch.cuda.Event]:
+        """Dequeues a single event if the limit is reached."""
+        if len(self._queue) >= self._max_num_inflight_all_gathers:
+            return self._dequeue()
+        return None
+
+    def _dequeue(self) -> Optional[torch.cuda.Event]:
+        """Dequeues a free event if possible."""
+        if self._queue:
+            event = self._queue.popleft()
+            return event
+        return None
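+
+
+# Illustrative sketch (CPU-only stand-in): the rate-limiting pattern above with
+# plain strings in place of `torch.cuda.Event`. Enqueue one event per
+# all-gather; once the inflight limit is reached, each new enqueue pairs with a
+# dequeue whose event would be synchronized before freeing memory.
+def _example_rate_limit_pattern() -> None:
+    queue = _FreeEventQueue()
+    queue.enqueue("event-0")  # type: ignore[arg-type]
+    assert queue.dequeue_if_needed() is None  # under the limit: nothing to wait on
+    queue.enqueue("event-1")  # type: ignore[arg-type]
+    assert queue.dequeue_if_needed() == "event-0"  # at the limit: oldest returned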
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/_optim_utils.py b/MLPY/Lib/site-packages/torch/distributed/fsdp/_optim_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..f42ed3bfc3fd49945d3be9b48a78227b5edcabe0
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/fsdp/_optim_utils.py
@@ -0,0 +1,2086 @@
+import copy
+import functools
+import logging
+import warnings
+from contextlib import ExitStack
+from dataclasses import dataclass, field
+from typing import (
+    Any,
+    cast,
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    NamedTuple,
+    no_type_check,
+    Optional,
+    Sequence,
+    Set,
+    Tuple,
+    Union,
+)
+
+import torch
+import torch.distributed as dist
+import torch.distributed.fsdp._traversal_utils as traversal_utils
+import torch.nn as nn
+from torch.distributed._shard.sharded_tensor import ShardedTensor
+from torch.distributed._state_dict_utils import _gather_state_dict
+from torch.distributed._tensor import DTensor, Replicate
+from torch.distributed.distributed_c10d import _get_pg_default_device
+from torch.distributed.fsdp._common_utils import (
+    _apply_to_modules,
+    _FSDPState,
+    _get_module_fsdp_state_if_fully_sharded_module,
+    _get_param_to_fqns,
+    _module_handle,
+    _named_parameters_with_duplicates,
+    clean_tensor_name,
+)
+from torch.distributed.fsdp._debug_utils import SimpleProfiler
+from torch.distributed.fsdp._flat_param import FlatParameter, FlatParamHandle
+from torch.distributed.fsdp._fsdp_extensions import (
+    _ext_chunk_dtensor,
+    _ext_chunk_tensor,
+)
+from torch.distributed.fsdp._runtime_utils import (
+    _lazy_init,
+    _reset_flat_param_grad_info_if_needed,
+)
+from torch.distributed.fsdp.api import (
+    ShardingStrategy,
+    StateDictSettings,
+    StateDictType,
+)
+from torch.utils._pytree import tree_map_only
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class FSDPParamInfo:
+    state: _FSDPState
+    handle: FlatParamHandle
+    param_indices: Dict[str, int]
+    param_requires_grad: List[bool]
+
+
+def sorted_items(dictionary: Dict[str, Any]) -> Iterator[Tuple[str, Any]]:
+    keys = sorted(dictionary.keys())
+    for k in keys:
+        yield k, dictionary[k]
+
+
+@dataclass
+class _ConsolidatedOptimState:
+    """
+    This holds the consolidated optimizer state on the target rank. Positive-
+    dimension tensor state is communicated across ranks, while zero-dimension
+    tensor state and non-tensor state is taken directly from the target rank.
+
+    PyTorch version 1.12 moved to using zero-dimension tensors for scalar
+    values, but user-implemented optimizers may still use float (i.e. a
+    non-tensor). Thus, we support both and handle them identically.
+
+    Attributes:
+        tensor_state (Dict[str, torch.Tensor]): Mapping from positive-dimension
+            tensor state name to the unsharded flat tensor representing the
+            state.
+        zero_dim_tensor_state (Dict[str, torch.Tensor]): Mapping from zero-
+            dimension tensor state name to its value.
+        non_tensor_state (Dict[str, Any]): Mapping from non-tensor state
+            name to its value.
+    """
+
+    tensor_state: Dict[str, torch.Tensor] = field(default_factory=dict)
+    zero_dim_tensor_state: Dict[str, torch.Tensor] = field(default_factory=dict)
+    non_tensor_state: Dict[str, Any] = field(default_factory=dict)
+
+
+class _PosDimTensorInfo(NamedTuple):
+    """
+    Metadata for positive-dimension tensors used internally for
+    :meth:`scatter_full_optim_state_dict`.
+
+    Attributes:
+        shape (torch.Size): Sharded tensor shape (which is equal to the
+            unsharded tensor shape if the tensor is optimizer state for a
+            non-FSDP parameter and is hence not sharded).
+        dtype (torch.dtype): Data type of the tensor.
+    """
+
+    shape: torch.Size
+    dtype: torch.dtype
+
+
+class _OptimStateKey(NamedTuple):
+    """
+    This represents an optimizer state key that may be used commonly across
+    ranks. It is based on the unflattened parameter names rather than parameter
+    IDs to make it independent of each rank's own optimizer construction.
+    """
+
+    unflat_param_names: Tuple[str, ...]
+    is_fsdp_managed: bool
+
+
+def _unflatten_optim_state(
+    fsdp_param_info: FSDPParamInfo,
+    flat_param_state: Dict[str, Any],
+    to_save: bool,
+    shard_state: bool,
+    cpu_offload: bool,
+) -> List[Dict[str, Any]]:
+    """
+    Unflattens the optimizer state, consisting of the "state" part and the
+    "param_groups" part. Unflattening the "state" part involves consolidating
+    the state on the target rank and remapping from flattened to unflattened
+    parameter IDs, and the "param_groups" part only involves remapping from
+    flattened to unflattened parameter IDs.
+
+    Args:
+        fsdp_param_info (FSDPParamInfo): The FSDP state, the handle, and a
+            mapping from FQN to original parameter index.
+        flat_param_state (Dict[str, Any]): Entry for the flat parameter in the
+            "state" part of the optimizer state dict.
+        to_save (bool): Whether to save the state on this rank.
+        shard_state (bool): Whether to shard the unflattened tensor state
+            (only valid when ``to_save`` is ``True``).
+        cpu_offload (bool): Whether to move the returned tensor state to CPU.
+
+    Returns:
+        List[Dict[str, Any]]: A :class:`list` holding the entries in the
+        "state" part of the optimizer state dict corresponding to the
+        unflattened parameters comprising the flat parameter if on the target
+        rank or an empty :class:`list` otherwise. The final optimizer state
+        dict will need to map these entries using the proper unflattened
+        parameter IDs.
+    """
+    assert (
+        not shard_state or to_save
+    ), "If ``shard_state`` is True, ``to_save`` has to be True."
+    consolidated_state = _communicate_optim_state(
+        fsdp_param_info,
+        flat_param_state,
+    )
+    if to_save:
+        unflat_param_state = _unflatten_communicated_optim_state(
+            fsdp_param_info,
+            consolidated_state,
+            shard_state,
+        )
+        for optim_state in unflat_param_state:
+            # We cannot use `.items()` below because we would otherwise hit a
+            # concurrent modification error while mutating `optim_state`
+            if cpu_offload:
+                for key in list(optim_state.keys()):
+                    state = optim_state[key]
+                    if not isinstance(state, torch.Tensor):
+                        continue
+                    optim_state[key] = state.cpu()
+        return unflat_param_state
+    else:
+        return []
+
+
+def _is_zero_dim_tensor(x: Any) -> bool:
+    return torch.is_tensor(x) and x.dim() == 0
+
+
+def _communicate_optim_state(
+    fsdp_param_info: FSDPParamInfo,
+    flat_param_state: Dict[str, Any],
+) -> _ConsolidatedOptimState:
+    """
+    Communicates the optimizer state for a flat parameter across ranks. All
+    ranks will hold the entire non-sharded optimizer state on GPU.
+
+    If ``N`` is the number of tensor optimizer states in the optimizer state
+    dict, then the communication complexity is 0 if ``N = 0`` and ``N + 1``
+    otherwise (where the plus 1 comes from all-gathering the padding per rank).
+
+    Args:
+        fsdp_param_info (FSDPParamInfo): The FSDP state, the handle, and a
+            mapping from FQN to original parameter index.
+        flat_param_state (Dict[str, Any]): The entry in the "state" part of the
+            optimizer state dict corresponding to the flat parameter.
+
+    Returns:
+        _ConsolidatedOptimState: Consolidated optimizer state for the target
+        flat parameter.
+    """
+    fsdp_state = fsdp_param_info.state
+    flat_param = fsdp_param_info.handle.flat_param
+    state = _ConsolidatedOptimState()
+    tensor_state, zero_dim_tensor_state, non_tensor_state = (
+        state.tensor_state,
+        state.zero_dim_tensor_state,
+        state.non_tensor_state,
+    )
+
+    for state_name, value in sorted_items(flat_param_state):
+        # Positive-dimension tensor state: communicate across ranks
+        if torch.is_tensor(value) and value.dim() > 0:
+            # If the parameter is not sharded, then neither is the
+            # positive-dimension tensor state, so no need to communicate it --
+            # we take the target rank's value
+            if (
+                fsdp_state.world_size == 1
+                or fsdp_state.sharding_strategy == ShardingStrategy.NO_SHARD
+            ):
+                tensor_state[state_name] = value
+                continue
+            assert (
+                fsdp_state.compute_device is not None
+            ), "compute_device has not been initialized"
+            if value.device.type != fsdp_state.compute_device.type:
+                value = value.to(fsdp_state.compute_device)
+            # Assume that positive-dimension tensor optimizer state
+            # has the same shape as the sharded flat parameter
+            buffer_size = flat_param._full_param_padded.size()  # type: ignore[attr-defined]
+            tensor_buffer = value.new_zeros(*buffer_size)
+            dist.all_gather_into_tensor(
+                tensor_buffer, value, group=fsdp_state.process_group
+            )
+            fsdp_state._device_handle.synchronize()
+            unpadded_numel = cast(
+                nn.Parameter, flat_param._unpadded_unsharded_size
+            ).numel()
+            tensor_state[state_name] = tensor_buffer[:unpadded_numel]
+        # Zero-dimension tensor state and non-tensor state: take this rank's
+        # value directly
+        else:
+            if _is_zero_dim_tensor(value):
+                zero_dim_tensor_state[state_name] = value.detach().clone()
+            else:
+                non_tensor_state[state_name] = value
+    return state
+
+
+def _unflatten_communicated_optim_state(
+    fsdp_param_info: FSDPParamInfo,
+    state: _ConsolidatedOptimState,
+    shard_state: bool,
+) -> List[Dict[str, Any]]:
+    """
+    Unflattens the communicated optimizer state (given by ``tensor_state``,
+    ``non_tensor_state``, and ``zero_dim_tensor_state``) for a single flat
+    parameter. This should only be called on the target rank.
+
+    Args:
+        fsdp_param_info (FSDPParamInfo): The FSDP state, the handle, and a
+            mapping from FQN to original parameter index.
+        state (_ConsolidatedOptimState): Consolidated optimizer state.
+
+    Returns:
+        List[Dict[str, Any]]: A :class:`list` holding the entries in the
+        "state" part of the optimizer state dict corresponding to the
+        unflattened parameters comprising the flat parameter. The final
+        optimizer state dict will need to map these entries using the proper
+        unflattened parameter IDs.
+    """
+    fsdp_state = fsdp_param_info.state
+    handle = fsdp_param_info.handle
+    flat_param = handle.flat_param
+    unflat_param_state: List[Dict[str, Any]] = []
+    flat_param_views: Dict[str, Iterator] = {}
+    num_unflat_params = flat_param._num_params
+    tensor_state, zero_dim_tensor_state, non_tensor_state = (
+        state.tensor_state,
+        state.zero_dim_tensor_state,
+        state.non_tensor_state,
+    )
+
+    for _ in range(num_unflat_params):
+        unflat_state_param = {}
+        # Add positive-dimension tensor state: unflatten with views
+        for state_name, flat_tensor in sorted_items(tensor_state):
+            views_generated = state_name in flat_param_views
+            if not views_generated:
+                views = handle._get_unflat_views(flat_tensor)
+                flat_param_views[state_name] = views
+            else:
+                views = flat_param_views[state_name]
+            optim_state: Union[torch.Tensor, ShardedTensor, DTensor] = next(views)
+            if shard_state:
+                osd_config = fsdp_state._optim_state_dict_config
+                if getattr(osd_config, "_use_dtensor", False):
+                    assert fsdp_state._device_mesh is not None
+                    optim_state = _ext_chunk_dtensor(
+                        optim_state,
+                        fsdp_state.rank,
+                        fsdp_state._device_mesh,
+                        fsdp_state._fsdp_extension,
+                    )
+                else:
+                    assert fsdp_state.process_group is not None
+                    optim_state = _ext_chunk_tensor(
+                        optim_state,
+                        fsdp_state.rank,
+                        fsdp_state.world_size,
+                        fsdp_state._device_handle.device_count(),
+                        fsdp_state.process_group,
+                        fsdp_state._fsdp_extension,
+                    )
+            unflat_state_param[state_name] = optim_state
+
+        # Add zero-dimension tensor state: take the target rank's value
+        for state_name, zero_dim_tensor in sorted_items(zero_dim_tensor_state):
+            unflat_state_param[state_name] = zero_dim_tensor
+        # Add non-tensor state: take the target rank's value
+        for state_name, non_tensor in sorted_items(non_tensor_state):
+            unflat_state_param[state_name] = non_tensor
+        unflat_param_state.append(unflat_state_param)
+    return unflat_param_state
+
+
+def _broadcast_processed_state(
+    fsdp_state: _FSDPState,
+    optim_state: Dict[str, Any],
+    group: Optional[dist.ProcessGroup],
+) -> Dict[str, Any]:
+    objects: List[Any] = [None]
+    if fsdp_state.rank == 0:
+        objects[0] = tree_map_only(
+            torch.Tensor,
+            lambda v: v.cpu() if v.dim() == 0 else _PosDimTensorInfo(v.shape, v.dtype),  # type: ignore[union-attr]
+            optim_state,
+        )
+    dist.broadcast_object_list(objects, src=0, group=group)
+    if fsdp_state.rank == 0:
+        return optim_state
+    else:
+        return objects[0]
+
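+
+# Illustrative sketch of what the `tree_map_only` call above produces on rank
+# 0, shown without any process group. Zero-dim tensors are kept (moved to CPU)
+# while positive-dimension tensors are replaced by lightweight metadata, which
+# is what the non-zero ranks receive via `broadcast_object_list`. A plain
+# (shape, dtype) tuple stands in for `_PosDimTensorInfo` here.
+def _example_process_state_for_broadcast() -> None:
+    state = {"step": torch.tensor(3.0), "exp_avg": torch.zeros(4)}
+    processed = tree_map_only(
+        torch.Tensor,
+        lambda v: v.cpu() if v.dim() == 0 else (tuple(v.shape), v.dtype),
+        state,
+    )
+    assert processed["step"].item() == 3.0
+    assert processed["exp_avg"] == ((4,), torch.float32)
+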
+
+def _broadcast_state(
+    fsdp_state: _FSDPState, state: Any, group: Optional[dist.ProcessGroup]
+) -> Any:
+    if fsdp_state.rank == 0:
+        if not isinstance(state, torch.Tensor) or state.dim() == 0:
+            return state
+        tensor = state.to(fsdp_state.compute_device)
+    else:
+        if isinstance(state, torch.Tensor):
+            assert state.dim() == 0, (
+                "For non-zero ranks, a tensor state should have zero dimension, "
+                "but got the state with shape {state.shape()}."
+            )
+            return state
+        elif not isinstance(state, _PosDimTensorInfo):
+            return state
+        tensor = torch.zeros(
+            state.shape, dtype=state.dtype, device=fsdp_state.compute_device
+        )
+    dist.broadcast(tensor, src=0, group=group)
+    return tensor
+
+
+def _shard_orig_param_state(
+    fsdp_param_info: FSDPParamInfo,
+    fqn: str,
+    optim_state: Dict[str, Any],
+) -> Dict[str, Any]:
+    """
+    Shard the optimizer state for the original parameter with the name ``fqn``.
+    This API should only be used when ``use_orig_params`` is True.
+    """
+    if not optim_state:
+        return {}
+    fsdp_state = fsdp_param_info.state
+    flat_param = fsdp_param_info.handle.flat_param
+    param_idx = fsdp_param_info.param_indices[fqn]
+    shard_param_info = flat_param._shard_param_infos[param_idx]  # type: ignore[attr-defined]
+    optim_state = _gather_state_dict(
+        optim_state, pg=fsdp_state.process_group, device=fsdp_state.compute_device
+    )
+    if not shard_param_info.in_shard:
+        return {}
+    # Flatten and shard the state.
+    new_optim_state: Dict[str, Any] = {}
+    intra_param_start_idx = shard_param_info.intra_param_start_idx
+    intra_param_end_idx = shard_param_info.intra_param_end_idx
+    for state_name, value in optim_state.items():
+        if (
+            torch.is_tensor(value)
+            and value.dim() > 0
+            and fsdp_state.sharding_strategy != ShardingStrategy.NO_SHARD
+        ):
+            value = value.flatten()[intra_param_start_idx : intra_param_end_idx + 1].clone()  # type: ignore[operator]
+        new_optim_state[state_name] = value
+    return new_optim_state
+
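+
+# Illustrative sketch of the intra-parameter slice above: a rank keeps only the
+# inclusive [start, end] range of the flattened state that overlaps its shard.
+# The state tensor and the indices below are made up for the example.
+def _example_intra_param_slice() -> None:
+    exp_avg = torch.arange(12.0).reshape(3, 4)
+    intra_param_start_idx, intra_param_end_idx = 4, 9
+    local = exp_avg.flatten()[intra_param_start_idx : intra_param_end_idx + 1].clone()
+    assert local.tolist() == [4.0, 5.0, 6.0, 7.0, 8.0, 9.0]
+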
+
+def _flatten_optim_state_dict(
+    optim_state_dict: Dict[str, Any],
+    model: nn.Module,
+    use_orig_params: bool = False,
+    optim: Optional[torch.optim.Optimizer] = None,
+    rank0_only: bool = False,
+    group: Optional[dist.ProcessGroup] = None,
+) -> Dict[str, Any]:
+    """
+    Flattens the full optimizer state dict, still keying by unflattened parameter
+    names.
+
+    If ``use_orig_params`` is True, each rank will have all FSDP-managed
+    parameters but some of these parameters may be empty due to the sharding.
+    For a regular optim.Optimizer, states for those empty parameters will
+    not be initialized. So, when aggregating the FQNs across ranks, no assert
+    will be raised on a rank even if it does not have all the states -- it is
+    valid and FSDP knows how to aggregate them. However, FSDP has to ignore
+    parameters that are not managed by FSDP and do not exist on the local
+    rank -- they are managed by another parallelism scheme, and FSDP does not
+    know how to handle/aggregate them.
+
+    Note that ``_flatten_tensor_optim_state`` does not need ``optim`` to
+    flatten/shard the state. However, NamedOptimizer and KeyedOptimizer require
+    all the states even if the corresponding parameters are empty. To this end,
+    ``optim`` will be used to get the initial state of the empty parameters.
+    ``optim`` should only be non-None if the optimizer is a KeyedOptimizer or
+    NamedOptimizer.
+
+    Returns:
+        Dict[str, Any]: The flattened optimizer state dict.
+    """
+    SimpleProfiler.reset()
+
+    unflat_osd = optim_state_dict
+    if "state" not in unflat_osd and not rank0_only:
+        raise ValueError(
+            '`optim_state_dict` must have the key "state" '
+            "to be a valid optimizer state dict"
+        )
+    param_to_fqns = _get_param_to_fqns(model)
+    fqn_to_fsdp_param_info = _get_fqn_to_fsdp_param_info(model)
+    fsdp_state = next(iter(fqn_to_fsdp_param_info.values())).state
+
+    # Broadcast unflat_osd without non-scalar tensor if rank0_only is True.
+    if rank0_only:
+        unflat_osd = _broadcast_processed_state(fsdp_state, unflat_osd, group=group)
+
+    # Construct the "state" part
+    flat_osd_state: Dict[Union[_OptimStateKey, str], Any] = {}
+    unflat_osd_state = unflat_osd["state"]
+    all_state_keys = set(unflat_osd_state.keys())
+
+    for param, fqns in param_to_fqns.items():
+        fqn = fqns[0]
+        if fqn not in unflat_osd_state:
+            continue
+        all_state_keys.difference_update(fqns)
+
+        if rank0_only:
+            for fqn in fqns:
+                if not unflat_osd_state[fqn]:
+                    continue
+                for state_name in unflat_osd_state[fqn].keys():
+                    unflat_osd_state[fqn][state_name] = _broadcast_state(
+                        fsdp_state, unflat_osd_state[fqn][state_name], group=group
+                    )
+            fqn = fqns[0]
+        if fqn in fqn_to_fsdp_param_info:
+            fsdp_param_info = fqn_to_fsdp_param_info[fqn]
+            if use_orig_params:
+                with SimpleProfiler.profile(SimpleProfiler.Type.RESHARDING):
+                    flat_state = _shard_orig_param_state(
+                        fsdp_param_info,
+                        fqn,
+                        unflat_osd_state[fqn],
+                    )
+            else:
+                flat_state = _flatten_optim_state(
+                    fsdp_param_info,
+                    unflat_osd_state,
+                    fqns,
+                )
+            key = _OptimStateKey(tuple(fqns), True)
+            # Only include non-empty states, as expected by
+            # `torch.optim.Optimizer`, unless the optimizer is KeyedOptimizer
+            # or NamedOptimizer.
+            if flat_state:
+                flat_osd_state[key] = flat_state
+            elif use_orig_params:
+                assert (
+                    len(fqns) == 1
+                ), f"use_orig_params is True but there are multiple FQNs, {fqns}."
+                if optim is not None:  # NamedOptimizer or KeyedOptimizer case.
+                    state = optim.state.get(param, None)  # type: ignore[call-overload]
+                    if state is not None:
+                        flat_osd_state[key] = copy.deepcopy(state)
+                    else:
+                        warnings.warn(
+                            f"optim_state[{key}] is not on rank{fsdp_state.rank}."
+                        )
+
+            else:
+                raise RuntimeError(
+                    f"The state of {key} is empty. This should happen when "
+                    "use_orig_params=True."
+                )
+        else:  # do not flatten non-FSDP parameters' states
+            assert len(fqns) == 1
+            key = _OptimStateKey(tuple(fqns), False)
+            flat_osd_state[key] = copy.copy(unflat_osd_state[fqn])
+
+        if rank0_only:
+            for fqn in fqns:
+                if not unflat_osd_state[fqn]:
+                    continue
+                for state_name, param_state in list(unflat_osd_state[fqn].items()):
+                    if fsdp_state.rank > 0:
+                        # Dereference the tensor so that PyTorch can collect the memory.
+                        del unflat_osd_state[fqn][state_name]
+                    else:
+                        # Move the tensor in the original osd back to CPU to make the
+                        # original osd unaffected.
+                        unflat_osd_state[fqn][state_name] = unflat_osd_state[fqn][
+                            state_name
+                        ].cpu()
+
+    # Handle user-defined state, states that are not associated with parameters.
+    for key in all_state_keys:
+        user_state = unflat_osd_state[key]
+        if isinstance(user_state, torch.Tensor) and rank0_only and use_orig_params:
+            user_state = _broadcast_state(fsdp_state, user_state, group=group)
+        flat_osd_state[key] = copy.copy(user_state)
+
+    SimpleProfiler.dump_and_reset("FSDP _flatten_optim_state_dict() profiling: ")
+    # Construct the "param_groups" part -- copy as is since it will be
+    # rekeyed later according to the target rank's optimizer
+    # Only copy param_groups if it exists in unflat_osd
+    if "param_groups" in unflat_osd:
+        flat_osd_param_groups = copy.deepcopy(unflat_osd["param_groups"])
+        return {"state": flat_osd_state, "param_groups": flat_osd_param_groups}
+    else:
+        return {"state": flat_osd_state}
+
+
+def _flatten_optim_state(
+    fsdp_param_info: FSDPParamInfo,
+    unflat_osd_state: Dict[str, Dict[str, Any]],
+    unflat_param_names: List[str],
+) -> Dict[str, Any]:
+    """
+    Flattens the optimizer state in ``full_optim_state_dict`` for a single
+    flat parameter in ``fsdp_param_info`` corresponding to the unflattened
+    parameter names in ``unflat_param_names``.
+
+    Args:
+        fsdp_param_info (FSDPParamInfo): The FSDP state, the handle, and a
+            mapping from FQN to original parameter index.
+        unflat_osd_state (Dict[str, Dict[str, Any]]): The "state" part of the
+            optimizer state dict corresponding to the unflattened parameters.
+        unflat_param_names (List[str]): A :class:`list` of unflattened
+            parameter names corresponding to the flat parameter ``flat_param``.
+
+    Returns:
+        Dict[str, Any]: A :class:`dict` mapping state names to their values for
+        a particular flat parameter. The sharded optimizer state dict's "state"
+        part will map a key to this returned value.
+    """
+    fsdp_state = fsdp_param_info.state
+    handle = fsdp_param_info.handle
+    flat_param = handle.flat_param
+    num_unflat_params = len(unflat_param_names)
+    assert num_unflat_params > 0, (
+        "Expects at least one unflattened parameter corresponding to the "
+        "flat parameter"
+    )
+    unflat_param_shapes = flat_param._shapes
+    num_unflat_param_shapes = len(unflat_param_shapes)
+    assert (
+        num_unflat_params == num_unflat_param_shapes
+    ), f"Expects {num_unflat_params} shapes but got {num_unflat_param_shapes}"
+
+    # Check if these unflattened parameters have any optimizer state
+    has_state = [
+        bool(unflat_param_name in unflat_osd_state)
+        for unflat_param_name in unflat_param_names
+    ]
+    # If none of the unflattened parameters comprising this flat parameter have
+    # any state, then we do not want an entry in the optimizer state dict
+    if not any(has_state):
+        return {}  # no need to flatten any state
+    # There may still be some unflattened parameters with state and some
+    # without
+    unflat_param_states = [
+        _gather_state_dict(
+            unflat_osd_state[unflat_param_name],
+            pg=fsdp_state.process_group,
+            device=fsdp_state.compute_device,
+        )
+        if unflat_param_name in unflat_osd_state
+        else None
+        for unflat_param_name in unflat_param_names
+    ]
+    # Check that the unflattened parameters have the same state names
+    state_names = None
+    for unflat_param_state in unflat_param_states:
+        if unflat_param_state is None:
+            continue
+        if state_names is None:
+            state_names = set(unflat_param_state.keys())
+        else:
+            if state_names != set(unflat_param_state.keys()):
+                raise ValueError(
+                    "Differing optimizer state names for the unflattened "
+                    f"parameters: {unflat_param_names}"
+                )
+    assert state_names is not None
+
+    # Flatten the state
+    flat_state: Dict[str, Any] = {}
+    for state_name in state_names:
+        state_values = [
+            unflat_param_state[state_name] if unflat_param_state is not None else None
+            for unflat_param_state in unflat_param_states
+        ]
+        non_none_state_values = [v for v in state_values if v is not None]
+        # If all ranks have None, this is a None value
+        if not non_none_state_values:
+            flat_state[state_name] = None
+            continue
+        are_pos_dim_tensors = are_zero_dim_tensors = are_non_tensors = True
+        for v in non_none_state_values:
+            are_pos_dim_tensors &= torch.is_tensor(v) and v.dim() > 0
+            are_zero_dim_tensors &= _is_zero_dim_tensor(v)
+            are_non_tensors &= not torch.is_tensor(v)
+        types = {type(v) for v in non_none_state_values}
+        if len(types) != 1 or not (
+            are_pos_dim_tensors or are_zero_dim_tensors or are_non_tensors
+        ):
+            raise ValueError(
+                f"Differing optimizer state types for state {state_name}, "
+                f"values {non_none_state_values}, and unflattened parameter "
+                f"names {unflat_param_names}"
+            )
+        if are_pos_dim_tensors:
+            flat_tensor = _flatten_tensor_optim_state(
+                state_name,
+                state_values,
+                unflat_param_names,
+                unflat_param_shapes,
+                handle,
+            )
+            # Shard the flattened tensor immediately to minimize max memory
+            # usage
+            if (
+                fsdp_state.world_size != 1
+                and fsdp_state.sharding_strategy != ShardingStrategy.NO_SHARD
+            ):
+                sharded_flat_tensor, _ = FlatParamHandle._get_shard(
+                    flat_tensor,
+                    fsdp_state.rank,
+                    fsdp_state.world_size,
+                )
+            else:
+                sharded_flat_tensor = flat_tensor
+            flat_state[state_name] = sharded_flat_tensor
+        elif are_zero_dim_tensors:
+            flat_state[state_name] = _flatten_zero_dim_tensor_optim_state(
+                state_name,
+                state_values,
+                unflat_param_names,
+            )
+        else:
+            assert are_non_tensors
+            flat_state[state_name] = _flatten_non_tensor_optim_state(
+                state_name,
+                state_values,
+                unflat_param_names,
+            )
+
+    return flat_state
+
+
+def _flatten_tensor_optim_state(
+    state_name: str,
+    pos_dim_tensors: List[torch.Tensor],
+    unflat_param_names: List[str],
+    unflat_param_shapes: Sequence[torch.Size],
+    handle: FlatParamHandle,
+) -> torch.Tensor:
+    """
+    Flattens the positive-dimension tensor optimizer state given by the values
+    ``pos_dim_tensors`` for the state ``state_name`` for a single flat parameter
+    from ``handle`` corresponding to the unflattened parameter names
+    ``unflat_param_names`` and unflattened parameter shapes
+    ``unflat_param_shapes``. This flattens each unflattened parameter's tensor
+    state into one tensor.
+
+    NOTE: We use zero tensors for any unflattened parameters without state
+    since some value is required to fill those entries. This assumes that the
+    zero tensor is mathematically equivalent to having no state, which is true
+    for Adam's "exp_avg" and "exp_avg_sq" but may not be true for all
+    optimizers.
+
+    Args:
+        state_name (str): Optimizer state name.
+        pos_dim_tensors (List[torch.Tensor]): Positive-dimension tensor
+            optimizer state values for the unflattened parameters corresponding
+            to the single flat parameter.
+        unflat_param_names (List[str]): A :class:`list` of unflattened
+            parameter names corresponding to the single flat parameter.
+        unflat_param_shapes (List[torch.Size]): Unflattened parameter shapes
+            corresponding to the single flat parameter.
+        handle (FlatParamHandle): The flat parameter's handle.
+
+    Returns:
+        torch.Tensor: A flat tensor containing the optimizer state
+        corresponding to ``state_name`` constructed by concatenating the
+        unflattened parameter tensor states in ``pos_dim_tensors`` (using zero
+        tensors for any unflattened parameters without the state).
+    """
+    flat_param = handle.flat_param
+    non_none_tensors = [t for t in pos_dim_tensors if t is not None]
+    # Check that all are tensors with the same dtype
+    dtypes = {t.dtype for t in non_none_tensors}
+    if len(dtypes) != 1:
+        raise ValueError(
+            "All unflattened parameters comprising a single flat "
+            "parameter must have positive-dimension tensor state with the "
+            f"same dtype but got dtypes {dtypes} for state {state_name} and "
+            f"unflattened parameter names {unflat_param_names}"
+        )
+    dtype = next(iter(dtypes))
+    # Check that each tensor state matches its parameter's shape
+    for tensor, shape in zip(pos_dim_tensors, unflat_param_shapes):
+        if tensor is None and len(shape) == 0:
+            raise ValueError("Flattening a zero-dimension parameter is not supported")
+        elif tensor is not None and tensor.shape != shape:
+            raise ValueError(
+                "Tensor optimizer state does not have same shape as its "
+                f"parameter: {tensor.shape} {shape}"
+            )
+    # Flatten the tensor states: we do not need to add any right-hand-side
+    # padding since the flat optimizer state tensor is sharded via
+    # `_get_shard()`, which pads the shard as needed (just like for the flat
+    # parameter)
+    cpu_device = torch.device("cpu")
+    tensors_to_flatten = [
+        torch.flatten(state_value.to(cpu_device))
+        if state_value is not None
+        else torch.flatten(
+            torch.zeros(
+                size=shape,
+                dtype=dtype,
+                device=cpu_device,
+            )
+        )
+        for state_value, shape in zip(pos_dim_tensors, unflat_param_shapes)
+    ]
+    flat_tensor = handle.flatten_tensors(tensors_to_flatten, handle._aligned_numel)
+    flat_param_shape = flat_param._unpadded_unsharded_size  # type: ignore[attr-defined]
+    assert flat_tensor.shape == flat_param_shape, (
+        f"tensor optim state: {flat_tensor.shape} "
+        f"flat parameter: {flat_param_shape}"
+    )
+    return flat_tensor
+
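+
+# Illustrative sketch of the flattening rule above, without a FlatParamHandle
+# and without alignment padding: per-parameter tensor states are flattened and
+# concatenated, and a parameter with no state contributes zeros of its shape.
+# The shapes and states below are made up for the example.
+def _example_flatten_tensor_state() -> None:
+    shapes = [torch.Size([2, 3]), torch.Size([4])]
+    states = [torch.ones(2, 3), None]  # the second parameter has no state yet
+    pieces = [
+        torch.flatten(state) if state is not None else torch.zeros(shape).flatten()
+        for state, shape in zip(states, shapes)
+    ]
+    flat = torch.cat(pieces)
+    assert flat.shape == torch.Size([10])  # 2*3 + 4 elements, like the flat parameter
+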
+
+def _flatten_zero_dim_tensor_optim_state(
+    state_name: str,
+    zero_dim_tensors: List[torch.Tensor],
+    unflat_param_names: List[str],
+) -> torch.Tensor:
+    """
+    Flattens the zero-dimension tensor optimizer state given by the values
+    ``zero_dim_tensors`` for the state ``state_name`` for a single flat
+    parameter corresponding to the unflattened parameter names
+    ``unflat_param_names`` by enforcing that all tensors are the same and using
+    that common value.
+
+    NOTE: The requirement that the tensors are the same across all unflattened
+    parameters comprising the flat parameter is needed to maintain the
+    invariant that FSDP performs the same computation as its non-sharded
+    equivalent. This means that none of the unflattened parameters can be
+    missing this state since imposing a value may differ from having no value.
+    For example, for Adam's "step", no value means maximum bias correction,
+    while having some positive value means less bias correction.
+
+    Args:
+        state_name (str): Optimizer state name.
+        zero_dim_tensors (List[torch.Tensor]): Zero-dimension optimizer state
+            for the unflattened parameters corresponding to the single
+            flat parameter.
+        unflat_param_names (List[str]): A :class:`list` of unflattened
+            parameter names corresponding to the single flat parameter.
+
+    Returns:
+        torch.Tensor: A zero-dimensional tensor giving the value of the state
+        ``state_name`` for all unflattened parameters corresponding to the
+        names ``unflat_param_names``.
+    """
+    non_none_tensors = [t for t in zero_dim_tensors if t is not None]
+    # Enforce that all have the same value and dtype
+    values_set = {t.item() if t is not None else None for t in zero_dim_tensors}
+    dtypes = {t.dtype if t is not None else None for t in zero_dim_tensors}
+    if (
+        len(non_none_tensors) != len(zero_dim_tensors)
+        or len(values_set) != 1
+        or len(dtypes) != 1
+    ):
+        raise ValueError(
+            "All unflattened parameters comprising a single flat "
+            "parameter must have scalar state with the same value and dtype "
+            f"but got values {values_set} and dtypes {dtypes} for state "
+            f"{state_name} and unflattened parameter names "
+            f"{unflat_param_names}"
+        )
+    value = next(iter(values_set))
+    dtype = next(iter(dtypes))
+    return torch.tensor(value, dtype=dtype, device=torch.device("cpu"))
+
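+
+# Illustrative sketch of the invariant above: the zero-dim "step" states of all
+# unflattened parameters comprising one flat parameter must agree, and the
+# common value is reused for the flat parameter. The tensors are made up.
+def _example_common_zero_dim_state() -> None:
+    steps = [torch.tensor(10.0), torch.tensor(10.0), torch.tensor(10.0)]
+    values = {t.item() for t in steps}
+    dtypes = {t.dtype for t in steps}
+    assert len(values) == 1 and len(dtypes) == 1
+    flat_step = torch.tensor(next(iter(values)), dtype=next(iter(dtypes)))
+    assert flat_step.item() == 10.0 and flat_step.dim() == 0
+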
+
+def _flatten_non_tensor_optim_state(
+    state_name: str,
+    non_tensors: List[Any],
+    unflat_param_names: List[str],
+) -> Any:
+    """
+    Flattens the non-tensor optimizer state given by the values ``non_tensors``
+    for the state ``state_name`` for a single flat parameter corresponding
+    to the unflattened parameter names ``unflat_param_names`` by enforcing that
+    all values are the same and using that common value.
+
+    See the note in :func:`_flatten_zero_dim_tensor_optim_state`.
+
+    Args:
+        state_name (str): Optimizer state name.
+        non_tensors (List[Any]): Non-tensor optimizer state for the unflattened
+            parameters corresponding to the single flat parameter.
+        unflat_param_names (List[str]): A :class:`list` of unflattened
+            parameter names corresponding to the single flat parameter.
+
+    Returns:
+        Any: A non-tensor giving the value of the state ``state_name`` for all
+        unflattened parameters corresponding to the names
+        ``unflat_param_names``.
+    """
+    non_none_non_tensors = [nt for nt in non_tensors if nt is not None]
+    # Enforce that all have the same value (same type already checked)
+    non_tensor_set = set(non_tensors)
+    if len(non_none_non_tensors) != len(non_tensors) or len(non_tensor_set) != 1:
+        raise ValueError(
+            "All unflattened parameters comprising a single flat "
+            "parameter must have scalar state with the same value and dtype "
+            f"but got values {non_tensor_set} for state {state_name} and  "
+            f"unflattened parameter names {unflat_param_names}"
+        )
+    non_tensor = next(iter(non_tensor_set))
+    return non_tensor
+
+
+def _rekey_sharded_optim_state_dict(
+    sharded_osd: Dict[str, Any],
+    model: nn.Module,
+    optim: torch.optim.Optimizer,
+    optim_input: Optional[
+        Union[
+            List[Dict[str, Any]],
+            Iterable[nn.Parameter],
+        ]
+    ],
+    using_optim_input: bool,
+    is_named_optimizer: bool = False,
+) -> Dict[str, Any]:
+    """
+    Rekeys the optimizer state dict from unflattened parameter names to flat
+    parameter IDs according to the calling rank's ``optim``, which may be
+    different across ranks. In particular, the unflattened parameter names are
+    represented as :class:`_OptimStateKey` s.
+    """
+    param_to_fqns = _get_param_to_fqns(model)
+    flat_param_to_fqn = _get_flat_param_to_fqn(model)
+    param_to_param_key: Dict[nn.Parameter, Union[int, str]] = cast(
+        Dict[nn.Parameter, Union[int, str]],
+        (
+            _get_param_to_param_id_from_optim_input(model, optim_input)
+            if using_optim_input
+            else _get_param_to_param_key(
+                optim, model, is_named_optimizer, param_to_fqns, flat_param_to_fqn
+            )
+        ),
+    )
+    # All parameter keys in `param_to_param_key` should be in
+    # `param_to_fqns` -- strict inequality follows when not all parameters are
+    # passed to the optimizer
+    assert len(param_to_param_key) <= len(param_to_fqns)
+
+    unflat_param_names_to_flat_param_key: Dict[
+        Tuple[str, ...], Union[int, str]
+    ] = {}  # for "state"
+    unflat_param_name_to_flat_param_key: Dict[
+        str, Union[int, str]
+    ] = {}  # for "param_groups"
+    for param, unflat_param_names in param_to_fqns.items():
+        if param not in param_to_param_key:
+            # This parameter was not passed to the optimizer
+            continue
+        flat_param_key = param_to_param_key[param]
+        unflat_param_names_to_flat_param_key[tuple(unflat_param_names)] = flat_param_key
+        for unflat_param_name in unflat_param_names:
+            unflat_param_name_to_flat_param_key[unflat_param_name] = flat_param_key
+
+    sharded_osd_state = sharded_osd["state"]
+    rekeyed_osd_state: Dict[Union[str, int], Any] = {}
+    for key, param_state in sharded_osd_state.items():
+        if isinstance(key, str):
+            rekeyed_osd_state[key] = param_state
+            continue
+        flat_param_key = unflat_param_names_to_flat_param_key.get(
+            key.unflat_param_names, key.unflat_param_names
+        )
+        rekeyed_osd_state[flat_param_key] = param_state
+
+    # Only process param_groups if it exists in sharded_osd
+    if "param_groups" in sharded_osd:
+        rekeyed_osd_param_groups: List[Dict[str, Any]] = []
+        for unflat_param_group in sharded_osd["param_groups"]:
+            flat_param_group = copy.deepcopy(unflat_param_group)
+            flat_param_keys = sorted(
+                {
+                    unflat_param_name_to_flat_param_key[unflat_param_name]
+                    for unflat_param_name in unflat_param_group["params"]
+                }
+            )
+            flat_param_group["params"] = flat_param_keys
+            rekeyed_osd_param_groups.append(flat_param_group)
+        return {"state": rekeyed_osd_state, "param_groups": rekeyed_osd_param_groups}
+    else:
+        return {"state": rekeyed_osd_state}
+
+
+def _get_param_id_to_param_from_optim_input(
+    model: nn.Module,
+    optim_input: Optional[
+        Union[
+            List[Dict[str, Any]],
+            Iterable[nn.Parameter],
+        ]
+    ] = None,
+) -> Dict[int, nn.Parameter]:
+    """
+    Constructs a mapping from parameter IDs to parameters. This may be used
+    both for models with ``FlatParameter`` s and without.
+
+    NOTE: This method is only preserved for backward compatibility. The method
+    :meth:`_get_param_key_to_param` is the preferred code path that does not
+    rely on ``optim_input``.
+
+    NOTE: We critically assume that, whether the optimizer input is a list of
+    parameters or a list of parameter groups, :class:`torch.optim.Optimizer`
+    enumerates the parameter IDs in order. In other words, for a parameter list
+    input, the parameter IDs should be in that list order, and for a parameter
+    groups input, the parameter IDs should be in order within each parameter
+    group and in order across parameter groups.
+
+    Args:
+        model (nn.Module): Model whose parameters are passed into the
+            optimizer.
+        optim_input (Optional[Union[List[Dict[str, Any]],
+        Iterable[nn.Parameter]]]): Input passed into the optimizer
+            representing either a :class:`list` of parameter groups or an
+            iterable of parameters; if ``None``, then this method assumes the
+            input was ``model.parameters()``. (Default: ``None``)
+
+    Returns:
+        Dict[int, nn.Parameter]: Mapping from parameter IDs to parameters,
+        where the parameter ID is implicitly the enumeration index.
+    """
+    # Assume the standard case of passing `model.parameters()` to the optimizer
+    # if `optim_input` is not specified
+    if optim_input is None:
+        return dict(enumerate(model.parameters()))
+    try:
+        params = cast(List[nn.Parameter], list(optim_input))
+    except TypeError as e:
+        raise TypeError(
+            "Optimizer input should be an iterable of Tensors or dicts, "
+            f"but got {optim_input}"
+        ) from e
+    if len(params) == 0:
+        raise ValueError("Optimizer input should not be empty")
+
+    # Check if the optimizer input represents tensors or parameter groups
+    all_tensors = True
+    all_dicts = True
+    for param in params:
+        all_tensors &= isinstance(param, torch.Tensor)
+        all_dicts &= isinstance(param, dict)
+    if not all_tensors and not all_dicts:
+        raise TypeError("Optimizer input should be an iterable of Tensors or dicts")
+    if all_tensors:
+        return dict(enumerate(params))
+    assert all_dicts
+    param_id_to_param: List[nn.Parameter] = []
+    for param_group in params:
+        has_params_key = "params" in param_group  # type: ignore[operator]
+        assert has_params_key, (
+            'A parameter group should map "params" to a list of the '
+            "parameters in the group"
+        )
+        # Implicitly map `flat_param_id` (current length of the list) to
+        # `param`
+        param_id_to_param.extend(param_group["params"])  # type: ignore[index]
+    return dict(enumerate(param_id_to_param))
+
+
+def _get_flat_param_to_fqn(model: torch.nn.Module) -> Dict[FlatParameter, str]:
+    """
+    Constructs a mapping from ``FlatParameter`` to a cleaned (devoid of prefixes
+    from wrappers) fully qualified name (FQN). Note that this FQN is "non-canonical"
+    because ``FlatParameter``  s do not come from the original module but are
+    registered only after FSDP has been applied. This function returns the FSDP-given
+    name for the ``FlatParameter`` (usually module._flat_param) as opposed to the
+    canonical FQNs returned for ``FlatParameter`` s in ``_common_utils._get_param_to_fqns(...)``).
+
+    Consequently, this function will only return a non-empty mapping if FSDP was
+    applied with ``use_orig_params=False`` as, otherwise, the original parameters
+    are used within the module and there would be no ``FlatParameter`` s in the module.
+
+    """
+
+    def module_fn(module, prefix, tree_level, flat_param_to_fqn):
+        for param_name, param in _named_parameters_with_duplicates(
+            module, recurse=False
+        ):
+            if not isinstance(param, FlatParameter):
+                continue
+            fqn = clean_tensor_name(prefix + param_name)
+            flat_param_to_fqn[param] = fqn
+
+    def return_fn(flat_param_to_fqn):
+        return flat_param_to_fqn
+
+    flat_param_to_fqn_ret: Dict[FlatParameter, str] = {}
+    return _apply_to_modules(
+        model,
+        module_fn,
+        return_fn,
+        [fqn for fqn, _ in _named_parameters_with_duplicates(model)],
+        flat_param_to_fqn_ret,
+    )
+
+
+def _get_param_key_to_param(
+    optim: torch.optim.Optimizer,
+    model: Optional[nn.Module] = None,
+    is_named_optimizer: bool = False,
+    param_to_fqns: Optional[Dict[nn.Parameter, List[str]]] = None,
+    flat_param_to_fqn: Optional[Dict[FlatParameter, str]] = None,
+) -> Dict[Union[int, str], nn.Parameter]:
+    """
+    Constructs a mapping from parameter keys to parameters. For the regular
+    optimizers, the keys are parameter IDs. For NamedOptimizer, the keys
+    are FQNs. This API may be used both for models with ``FlatParameter`` s and
+    without.
+    """
+    clean_fqn_to_curr_fqn: Dict[str, str] = {}
+    if is_named_optimizer:
+        assert param_to_fqns is not None and flat_param_to_fqn is not None, (
+            "The optimizer is a NamedOptimizer, so `param_to_fqns` and "
+            "`flat_param_to_fqn` must not be None."
+        )
+        assert model is not None
+        for key, _ in _named_parameters_with_duplicates(model):
+            clean_fqn_to_curr_fqn[clean_tensor_name(key)] = key
+
+    param_key_to_param: Dict[Union[str, int], nn.Parameter] = {}
+    pid = 0
+    for param_group in optim.param_groups:
+        if is_named_optimizer:
+            for param in param_group["params"]:
+                assert flat_param_to_fqn is not None
+                if param in flat_param_to_fqn:
+                    # FlatParameter case
+                    key = flat_param_to_fqn[param]
+                else:
+                    assert param_to_fqns is not None
+                    # use_orig_params case
+                    assert len(param_to_fqns[param]) == 1
+                    key = param_to_fqns[param][0]
+                try:
+                    key = clean_fqn_to_curr_fqn[key]
+                except KeyError as e:
+                    raise KeyError(
+                        f"Can't find {key} from {list(clean_fqn_to_curr_fqn.keys())}."
+                    ) from e
+                param_key_to_param[key] = param
+        else:
+            for param in param_group["params"]:
+                param_key_to_param[pid] = param
+                pid += 1
+
+    return param_key_to_param
+
+
+def _get_param_to_param_key(
+    optim: torch.optim.Optimizer,
+    model: Optional[nn.Module] = None,
+    is_named_optimizer: bool = False,
+    param_to_fqns: Optional[Dict[nn.Parameter, List[str]]] = None,
+    flat_param_to_fqn: Optional[Dict[FlatParameter, str]] = None,
+) -> Dict[nn.Parameter, Union[int, str]]:
+    """
+    Constructs the inverse mapping of :func:`_get_param_key_to_param`. This API
+    only supports the case where `optim` is a regular optimizer, not NamedOptimizer.
+    So the parameter keys will be parameter ids.
+    """
+    param_id_to_param = _get_param_key_to_param(
+        optim, model, is_named_optimizer, param_to_fqns, flat_param_to_fqn
+    )
+    return {param: param_id for param_id, param in param_id_to_param.items()}
+
+
+def _get_param_to_param_id_from_optim_input(
+    model: nn.Module,
+    optim_input: Optional[
+        Union[
+            List[Dict[str, Any]],
+            Iterable[nn.Parameter],
+        ]
+    ] = None,
+) -> Dict[nn.Parameter, int]:
+    """Constructs the inverse mapping of :func:`_get_param_id_to_param_from_optim_input`."""
+    param_id_to_param = _get_param_id_to_param_from_optim_input(model, optim_input)
+    return {param: param_id for param_id, param in param_id_to_param.items()}
+
+
+def _check_missing_keys_on_rank(
+    r0_optim_state_keys: List[_OptimStateKey],
+    optim_state_key_to_param_key: Dict[_OptimStateKey, Union[str, int]],
+    param_key_to_param: Dict[Union[str, int], nn.Parameter],
+    group: Optional[dist.ProcessGroup],
+) -> None:
+    # Ensure that all ranks have at least the optimizer states needed by
+    # rank 0's optimizer
+    missing_keys: List[_OptimStateKey] = []
+    for r0_optim_state_key in r0_optim_state_keys:
+        if r0_optim_state_key not in optim_state_key_to_param_key:
+            # A parameter from rank 0's optimizer does not exist for this
+            # rank's optimizer
+            missing_keys.append(r0_optim_state_key)
+            continue
+        param_key = optim_state_key_to_param_key[r0_optim_state_key]
+        if isinstance(param_key, int):
+            assert 0 <= param_key < len(
+                param_key_to_param
+            ), "Check the `param_key_to_param` construction"
+    # We cannot use FSDPState.compute_device as this API is a global view.
+    device = _get_pg_default_device(group)
+    num_missing = torch.tensor([len(missing_keys)], dtype=torch.int32, device=device)
+    dist.all_reduce(num_missing, group=group)
+    if num_missing.item() > 0:
+        obj_list = [None for _ in range(dist.get_world_size(group))]
+        dist.all_gather_object(obj_list, missing_keys, group=group)
+        error_msg = (
+            "FSDP currently requires each rank to have at least the "
+            "optimizer states needed by rank 0's optimizer but some ranks "
+            "are missing some of those states"
+        )
+        for rank, keys in enumerate(obj_list):
+            keys = cast(List[_OptimStateKey], keys)
+            if len(keys) > 0:
+                error_msg += (
+                    f"\nRank {rank} is missing states for the parameters: "
+                    f"{[key.unflat_param_names for key in keys]}"
+                )
+        raise RuntimeError(error_msg)
+
+
+def _map_param_key_to_optim_keys(
+    optim_state_dict: Dict[str, Any],
+    group: Optional[dist.ProcessGroup],
+    param_key_to_param: Dict[Union[int, str], nn.Parameter],
+    param_to_fqns: Dict[nn.Parameter, List[str]],
+    fqn_to_fsdp_param_info: Dict[str, FSDPParamInfo],
+    merge_keys: bool = False,
+) -> Tuple[List[_OptimStateKey], Dict[_OptimStateKey, Union[int, str]]]:
+    """
+    Construct the local mapping between the ``_OptimStateKey`` and parameter keys
+    and all the ``_OptimStateKey`` across ranks. If ``merge_keys`` is False, rank0
+    must contain all the ``_OptimStateKey``, an exception will be raised otherwise.
+    Note that ``merge_keys`` should equal to ``use_orig_params``.
+    """
+    rank = dist.get_rank(group)
+    optim_state_key_to_param_key: Dict[_OptimStateKey, Union[int, str]] = {}  # local
+    all_optim_state_keys: List[_OptimStateKey] = []
+
+    for param_key, param in param_key_to_param.items():
+        # Do not include parameters without state to avoid empty mappings
+        # just like in normal `torch.optim.Optimizer.state_dict()`
+        if param_key not in optim_state_dict["state"]:
+            continue
+        fqns = param_to_fqns[param]
+        is_fsdp_managed = isinstance(param, FlatParameter)
+        if is_fsdp_managed:
+            assert fqns[0] in fqn_to_fsdp_param_info, (
+                fqns[0],
+                list(fqn_to_fsdp_param_info.keys()),
+            )
+        is_fsdp_managed = fqns[0] in fqn_to_fsdp_param_info
+        optim_state_key = _OptimStateKey(
+            unflat_param_names=tuple(fqns),
+            is_fsdp_managed=is_fsdp_managed,
+        )
+        if rank == 0 or merge_keys:
+            all_optim_state_keys.append(optim_state_key)
+        optim_state_key_to_param_key[optim_state_key] = param_key
+
+    if merge_keys:
+        all_keys: List[List[_OptimStateKey]] = [
+            [] for _ in range(dist.get_world_size(group))
+        ]
+        dist.all_gather_object(all_keys, all_optim_state_keys, group=group)
+        merge_all_optim_state_keys = [
+            key for local_keys in all_keys for key in local_keys
+        ]
+        all_optim_state_keys = sorted(set(merge_all_optim_state_keys))
+    else:
+        key_obj_list: List[Optional[List[_OptimStateKey]]] = (
+            [all_optim_state_keys] if rank == 0 else [None]
+        )
+        dist.broadcast_object_list(key_obj_list, src=0, group=group)
+        assert key_obj_list[0] is not None
+        all_optim_state_keys = key_obj_list[0]
+        _check_missing_keys_on_rank(
+            all_optim_state_keys,
+            optim_state_key_to_param_key,
+            param_key_to_param,
+            group,
+        )
+
+    return all_optim_state_keys, optim_state_key_to_param_key
+
+
+def _unflatten_param_groups(
+    state_dict: Dict[str, Any],
+    param_key_to_param: Dict[Union[int, str], nn.Parameter],
+    param_to_fqns: Dict[nn.Parameter, List[str]],
+) -> List[Dict[str, Any]]:
+    param_groups: List[Dict[str, Any]] = []
+    for flat_param_group in state_dict["param_groups"]:
+        unflat_param_group = copy.deepcopy(flat_param_group)
+        param_group_params = [
+            param_key_to_param[flat_param_key]
+            for flat_param_key in flat_param_group["params"]
+        ]
+        nested_unflat_param_names = [
+            param_to_fqns[param] for param in param_group_params
+        ]
+        unflat_param_group["params"] = [
+            unflat_param_name
+            for unflat_param_names in nested_unflat_param_names
+            for unflat_param_name in unflat_param_names
+        ]  # flatten the list of lists
+        param_groups.append(unflat_param_group)
+    return param_groups
+
+
+def _is_named_optimizer(optim_state_dict: Dict[str, Any]) -> bool:
+    """
+    Returns whether the state_dict is from a NamedOptimizer.
+    This function checks that the keys in the state_dict['state'] are strings
+    (which usually are FQNs) versus integers (which usually refer to param_ids
+    from a vanilla torch.optim.Optimizer).
+    """
+    state = optim_state_dict.get("state", None)
+    if not state:
+        # If we cannot find a state, assume it is not NamedOptimizer as
+        # NamedOptimizer has eager initialization.
+        return False
+    try:
+        key = next(iter(state.keys()))
+    except Exception as e:
+        raise Exception(optim_state_dict) from e
+    return isinstance(key, str)
+
+
+@dataclass
+class StateInfo:
+    # The keys of these dictionaries are state names, e.g., `exp_avg`.
+    tensors: Dict[str, _PosDimTensorInfo]
+    scalar_tensors: Dict[str, torch.Tensor]
+    non_tensors: Dict[str, Any]
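+    # For illustration (assuming Adam-like state): optimizer state
+    # {"step": tensor(3.), "exp_avg": <tensor of shape (4, 4)>} is recorded as
+    # StateInfo(tensors={"exp_avg": _PosDimTensorInfo(torch.Size([4, 4]), torch.float32)},
+    #           scalar_tensors={"step": tensor(3.)}, non_tensors={}).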
+
+
+def _allgather_state_info(
+    fsdp_state: _FSDPState,
+    input_states: Dict[str, Any],
+) -> List[Dict[str, StateInfo]]:
+    """
+    Given the ``input_states``, allgather a ``StateInfo`` for each state. The
+    function uses ``all_gather_object`` so that no GPU tensors are sent.
+    """
+
+    processed_state_dict: Dict[str, StateInfo] = {}
+    gathered_state_info: List[Dict[str, StateInfo]] = [
+        {} for _ in range(fsdp_state.world_size)
+    ]
+
+    for fqn, optim_state in input_states.items():
+        # Allgather the scalar tensor state, non-tensor states and tensors metadata.
+        processed_state = StateInfo({}, {}, {})
+        for state_name, value in sorted_items(optim_state):
+            if torch.is_tensor(value):
+                if value.dim() == 0:
+                    # Ensure that `step` is on CPU.
+                    processed_state.scalar_tensors[state_name] = value.cpu()
+                else:
+                    processed_state.tensors[state_name] = _PosDimTensorInfo(
+                        value.shape, value.dtype
+                    )
+            else:
+                processed_state.non_tensors[state_name] = value
+        processed_state_dict[fqn] = processed_state
+    dist.all_gather_object(
+        gathered_state_info,
+        processed_state_dict,
+        group=fsdp_state.process_group,
+    )
+    return gathered_state_info
+
+
+def _convert_all_state_info(
+    fsdp_param_info: FSDPParamInfo,
+    gathered_state_info: List[Dict[str, StateInfo]],
+    input_states: Dict[str, Any],
+    output_states: Dict[str, Dict[str, Any]],
+) -> Tuple[Optional[torch.dtype], Dict[str, List[Optional[torch.Tensor]]]]:
+    """
+    Given the ``gathered_state_info`` and ``input_states``, this API converts
+    the StateInfo into the original state if the state is a scalar tensor or a
+    non-tensor value. For a multi-dimensional tensor, the local state is stored
+    in ``state_buffers`` in the correct order for the later allgather.
+    """
+
+    state_buffers: Dict[str, List[Optional[torch.Tensor]]] = {}
+
+    for fqn, gathered_state in output_states.items():
+        state_info = [s[fqn] for s in gathered_state_info]
+        all_tensor_states = sorted(
+            {n for state in state_info for n in state.tensors.keys()}
+        )
+        empty_ranks: Set[int] = set()
+        dtype: Optional[torch.dtype] = None
+        # First check all the non-scalar states and get the information of
+        # states on each rank.
+        for state_name in all_tensor_states:
+            numels = []
+            _empty_ranks: Set[int] = set()
+            for rank, object_state in enumerate(state_info):
+                numels.append(0)
+                info = object_state.tensors.get(state_name, None)
+                if info is not None:
+                    numels[-1] = info.shape.numel()
+                    if not dtype:
+                        dtype = info.dtype
+                    else:
+                        assert dtype == info.dtype
+                if numels[-1] == 0:
+                    _empty_ranks.add(rank)
+
+            assert not empty_ranks or empty_ranks == _empty_ranks
+            empty_ranks = _empty_ranks
+            if state_name not in state_buffers:
+                state_buffers[state_name] = [
+                    None for _ in fsdp_param_info.param_indices
+                ]
+            local_state = input_states[fqn].get(state_name, None)
+            # N.B. We need to move the state to compute_device. The reason is
+            # not yet clear and we need to figure out why the state may be on a
+            # different device.
+            if local_state is not None:
+                local_state = local_state.to(fsdp_param_info.state.compute_device)
+            state_buffers[state_name][fsdp_param_info.param_indices[fqn]] = local_state
+
+        # Restore the scalar and non-tensor states. If the corresponding
+        # non-scalar states do not exist on the rank, we also skip the scalar
+        # and non-tensor states on that rank.
+        for rank, object_state in enumerate(state_info):
+            if rank in empty_ranks:
+                continue
+            for name, non_tensor_value in object_state.non_tensors.items():
+                curr_non_tensor_value = gathered_state.get(name, None)
+                assert (
+                    curr_non_tensor_value is None
+                    or curr_non_tensor_value == non_tensor_value
+                ), (
+                    f"Rank {rank} has different values for {name}: {non_tensor_value}."
+                    + f" Other ranks: {curr_non_tensor_value}"
+                )
+                gathered_state[name] = non_tensor_value
+
+            for name, scalar_tensor_value in object_state.scalar_tensors.items():
+                curr_scalar_tensor_value = gathered_state.get(name, None)
+                assert curr_scalar_tensor_value is None or torch.equal(
+                    scalar_tensor_value, curr_scalar_tensor_value
+                ), (
+                    f"Rank {rank} has different values for {name}: {scalar_tensor_value}."
+                    + f" Other ranks: {curr_scalar_tensor_value}"
+                )
+                gathered_state[name] = scalar_tensor_value
+
+    return dtype, state_buffers  # type: ignore[possibly-undefined]
+
+
+def _unflatten_orig_param_states(
+    fsdp_param_info: FSDPParamInfo,
+    output_states: Dict[str, Dict[str, Any]],
+    state_name: str,
+    shard_state: bool,
+    to_save: bool,
+    cpu_offload: bool,
+) -> None:
+    """
+    Given a output state dict, ``output_states``, which the keys are FQNs to the
+    original parameters (not FlatParameters nor parmeter ID), and the values
+    are gathered states, unflatten the states to the original dimensions.
+
+    This function performs the unflattening process in-place.
+    """
+    if not to_save:
+        return
+    flat_param = fsdp_param_info.handle.flat_param
+    fsdp_state = fsdp_param_info.state
+    for fqn, gathered_state in output_states.items():
+        value = gathered_state[state_name]
+        param_idx = fsdp_param_info.param_indices[fqn]
+
+        # TODO: This solution is not general and only apply to PTD TP solution.
+        if isinstance(value, DTensor):
+            placement = value.placements[0]
+            # If gathered state is a DTensor and its TP placement is not Replicate(), we need to
+            # gather the tensor on its TP dimension before chunking them into DTensor again.
+            if placement != Replicate():
+                placement_dim = placement.dim  # type: ignore[attr-defined]
+                value_local = value.redistribute(placements=(Replicate(),))
+                reshape_size = list(flat_param._shapes[param_idx])
+                reshape_size[placement_dim] *= value.device_mesh.size(0)
+                reshape_size = torch.Size(reshape_size)
+                value = value.reshape(reshape_size)
+            # If gathered state is a replicate DTensor, we directly reshape it.
+            else:
+                value = value.reshape(flat_param._shapes[param_idx])
+        else:
+            # If gathered state is a tensor, we directly reshape it into unflatten state.
+            value = value.reshape(flat_param._shapes[param_idx])
+
+        if shard_state:
+            osd_config = fsdp_state._optim_state_dict_config
+            if getattr(osd_config, "_use_dtensor", False):
+                assert fsdp_state._device_mesh is not None
+                value = _ext_chunk_dtensor(
+                    value,
+                    fsdp_state.rank,
+                    fsdp_state._device_mesh,
+                    fsdp_state._fsdp_extension,
+                )
+            else:
+                assert fsdp_state.process_group is not None
+                value = _ext_chunk_tensor(
+                    value,
+                    fsdp_state.rank,
+                    fsdp_state.world_size,
+                    fsdp_state._device_handle.device_count(),
+                    fsdp_state.process_group,
+                    fsdp_state._fsdp_extension,
+                )
+        elif not cpu_offload:
+            with SimpleProfiler.profile("clone"):
+                value = value.detach().clone()
+
+        if cpu_offload:
+            with SimpleProfiler.profile(SimpleProfiler.Type.D2H):
+                value = value.cpu()
+        gathered_state[state_name] = value
+
+
+def _allgather_orig_param_states(
+    fsdp_param_info: FSDPParamInfo,
+    gathered_state_info: List[Dict[str, StateInfo]],
+    input_states: Dict[str, Any],
+    shard_state: bool,
+    to_save: bool,
+    cpu_offload: bool,
+) -> Dict[str, Dict[str, Any]]:
+    """
+    Given the ``gathered_state_info`` and ``input_states``, this API allgathers
+    all tensor states and restores non-tensor states from ``gathered_state_info``.
+    """
+    fsdp_state = fsdp_param_info.state
+    if fsdp_state.rank == 0 and dist.get_debug_level() == dist.DebugLevel.DETAIL:
+        logger.warning(
+            "CUDA Memory Summary before calling to _allgather_orig_param_states %s",
+            torch.cuda.memory_summary(),
+        )
+
+    output_states: Dict[str, Dict[str, Any]] = {fqn: {} for fqn in input_states.keys()}
+
+    dtype, state_buffers = _convert_all_state_info(
+        fsdp_param_info, gathered_state_info, input_states, output_states
+    )
+
+    if len(state_buffers) == 0:
+        return output_states
+
+    has_state_params: List[bool] = [
+        fqn in output_states
+        for fqn, idx in fsdp_param_info.param_indices.items()
+    ]
+
+    # Loop through the ``state_buffers`` and construct the flattened, concatenated,
+    # sharded states. The size of the constructed state will be the same as that of
+    # flat_param (also sharded).
+    # Then we perform an allgather_into_tensor to get the full flat_param state.
+    # The full flat_param state is the result of concatenating multiple states in
+    # the order of flat_param._fqns.
+    # The final step is to split the flat_param state into the original param states
+    # and return the result.
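+    # Small numeric sketch (assumed values): with _numels_with_padding = (5, 1, 6),
+    # _is_padding_mask = (False, True, False), and world_size = 2 (a 6-element shard
+    # per rank), rank 0's shard covers offsets [0, 5], i.e. parameter 0 plus the
+    # 1-element padding, so it appends its 5-element local state followed by a
+    # 1-element empty buffer; rank 1's shard covers offsets [6, 11], exactly
+    # parameter 1.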
+    flat_param = fsdp_param_info.handle.flat_param
+    empty_func = functools.partial(
+        torch.empty, dtype=dtype, device=fsdp_state.compute_device
+    )
+    gathered_tensor = empty_func(flat_param._padded_unsharded_size)
+    # Synchronizing can be slow, but it makes debugging easier.
+    torch.cuda.synchronize()
+    for state_name, buffers in state_buffers.items():
+        local_buffers: List[torch.Tensor] = []
+        begin = fsdp_state.rank * flat_param._sharded_size.numel()
+        # End is inclusive.
+        end = begin + flat_param._sharded_size.numel() - 1
+        # param_idx corresponds to the parameter index in the FlatParameter.
+        mem_offset, param_idx = 0, 0
+        for numel, is_padding in zip(
+            flat_param._numels_with_padding, flat_param._is_padding_mask
+        ):
+            frozen_and_no_state = not is_padding and (
+                not fsdp_param_info.param_requires_grad[param_idx]
+                and not has_state_params[param_idx]
+            )
+
+            if is_padding or frozen_and_no_state:
+                # This memory range is padding or the param is frozen and does
+                # not require gradient. For the latter case, we treat it as
+                # padding and add empty values to the local_buffers.
+
+                padding_begin, padding_end = mem_offset, mem_offset + numel - 1
+                if padding_begin <= begin <= padding_end:
+                    # The range is an align padding before the first parameter in
+                    # the shard. The shard includes parts of this align padding.
+                    padding_len = (
+                        padding_end - begin + 1
+                        if end >= padding_end
+                        else end - begin + 1
+                    )
+                elif padding_begin <= end <= padding_end:
+                    # The range is an align padding after the last parameter in
+                    # the shard. The shard includes parts of this align padding.
+                    padding_len = (
+                        end - padding_begin + 1
+                        if begin <= padding_begin
+                        else end - begin + 1
+                    )
+                elif begin < padding_begin <= padding_end < end:
+                    # The range is an align padding that is completely in the
+                    # shard.
+                    padding_len = numel
+                else:
+                    padding_len = 0
+                if padding_len:
+                    local_buffers.append(empty_func(padding_len))
+
+            if not is_padding:
+                # This memory range is a parameter in FlatParameter. So there
+                # should be a corresponding state in the optimizer unless the
+                # parameter is frozen, which we treat as padding above.
+
+                # We need to check if this rank owns the buffer. If this is None:
+                # 1.) the rank does not own any part of the original parameter.
+                #     As a result, there is no corresponding optimizer state on
+                #     the rank as well.
+                # 2.) the parameter is frozen AND there is no optimizer state
+                #     for the parameter. A frozen parameter can still have
+                #     optimizer state if it was not frozen in previous steps.
+                if buffers[param_idx] is not None:
+                    local_buffers.append(cast(torch.Tensor, buffers[param_idx]))
+                param_idx += 1
+
+            mem_offset += numel
+
+        shard_numel_padded = flat_param._sharded_size.numel() - (
+            sum(t.numel() for t in local_buffers)
+        )
+
+        assert flat_param._shard_numel_padded == shard_numel_padded, (
+            "Manually calculated _sharded_numel_padded is incorrect. "
+            f"_shard_numel_padded={flat_param._shard_numel_padded}, "
+            f"shard_numel_padded={shard_numel_padded}, "
+            f"_sharded_size.numel={flat_param._sharded_size.numel()}, "
+            f"_numels_with_padding={flat_param._numels_with_padding}, "
+            f"begin={begin}, end={end},"
+        )
+        if shard_numel_padded > 0:
+            # Add right-handed padding.
+            local_buffers.append(empty_func(shard_numel_padded))
+        local_shard = torch.cat(local_buffers)
+        assert local_shard.numel() * fsdp_state.world_size == gathered_tensor.numel(), (
+            "The size of local shard times the world size should equal to the "
+            "gathered tensor size. The inconsistency may be from a bug of "
+            "FlatParameter's metadata or the reconstruction logic in optimizer "
+            "state dict."
+        )
+        torch.cuda.synchronize()
+        with SimpleProfiler.profile(SimpleProfiler.Type.ALLGATHER):
+            dist.all_gather_into_tensor(
+                gathered_tensor, local_shard, group=fsdp_state.process_group
+            )
+            # Synchronizing can be slow, but it makes debugging easier.
+            torch.cuda.synchronize()
+
+        unpadded_tensor = gathered_tensor[: flat_param._unpadded_unsharded_size.numel()]
+        flat_param_handle = fsdp_param_info.handle
+        orig_states = flat_param_handle._get_unflat_views_aligned(unpadded_tensor)
+        assert len(orig_states) == len(fsdp_param_info.param_indices), (
+            "The number of parameters from FlatParameter is not consistent to "
+            "the number of states used by optimizer state dict reconstruction "
+            "logic."
+        )
+        for fqn, idx in fsdp_param_info.param_indices.items():
+            if fsdp_param_info.param_requires_grad[idx] or fqn in output_states:
+                output_states[fqn][state_name] = orig_states[idx]
+
+        _unflatten_orig_param_states(
+            fsdp_param_info,
+            output_states,
+            state_name,
+            shard_state,
+            to_save,
+            cpu_offload,
+        )
+
+    del gathered_tensor
+    return output_states
+
+
+def _gather_all_orig_param_state(
+    fsdp_param_info: FSDPParamInfo,
+    input_states: Dict[str, Any],
+    shard_state: bool,
+    to_save: bool,
+    cpu_offload: bool,
+) -> Dict[str, Any]:
+    """
+    Given an optimizer state dict, ``input_states``, whose keys are FQNs of the
+    original parameters (not FlatParameters nor parameter IDs), gather all the
+    states and unflatten them to the original dimensions. Note that all the
+    params referred to by ``input_states`` must be managed by FSDP.
+    """
+    fsdp_state = fsdp_param_info.state
+    if (
+        fsdp_state.world_size == 1
+        or fsdp_state.sharding_strategy == ShardingStrategy.NO_SHARD
+    ):
+        return input_states if to_save else {}
+
+    with SimpleProfiler.profile(SimpleProfiler.Type.RESHARDING):
+        with SimpleProfiler.profile(SimpleProfiler.Type.ALLGATHER_OBJ):
+            gathered_state_info = _allgather_state_info(fsdp_state, input_states)
+        output_states = _allgather_orig_param_states(
+            fsdp_param_info,
+            gathered_state_info,
+            input_states,
+            shard_state,
+            to_save,
+            cpu_offload,
+        )
+    if to_save:
+        for key, idx in fsdp_param_info.param_indices.items():
+            if key in output_states:
+                continue
+            if not fsdp_param_info.param_requires_grad[idx]:
+                continue
+
+            raise RuntimeError(
+                f"{key} is not in the output state. "
+                "The FSDPParamInfo has the param keys "
+                f"{sorted(fsdp_param_info.param_indices.keys())} while "
+                "the output_states has the param keys "
+                f"{sorted(output_states.keys())}."
+            )
+        return output_states
+    else:
+        return {}
+
+
+def _convert_state_with_orig_params(
+    all_optim_state_keys: List[_OptimStateKey],
+    optim_state_key_to_param_key: Dict[_OptimStateKey, Union[int, str]],
+    fqn_to_fsdp_param_info: Dict[str, FSDPParamInfo],
+    optim_state_dict: Dict[Union[str, int], Any],
+    to_save: bool,
+    shard_state: bool,
+    cpu_offload: bool = True,
+) -> Dict[str, Any]:
+    fsdp_osd_state: Dict[str, Any] = {}
+    # This variable is used to deduplicate the FSDPParamInfo as one FSDPParamInfo
+    # usually corresponds to multiple parameters. We cannot use FSDPParamInfo
+    # as the key because FSDPParamInfo is not hashable. As a result, we fall back
+    # to `id(FSDPParamInfo)`, which is an integer.
+    all_states: Dict[int, Dict[str, Any]] = {}
+    # Iterate in rank 0's flat parameter ID order to ensure aligned all-gathers
+    # across ranks
+    for optim_state_key in all_optim_state_keys:
+        param_key: Union[str, int, None] = optim_state_key_to_param_key.get(
+            optim_state_key, None
+        )
+
+        if param_key is None and not optim_state_key.is_fsdp_managed:
+            continue
+
+        if optim_state_key.is_fsdp_managed:
+            fqn = optim_state_key.unflat_param_names[0]
+            fsdp_param_info = fqn_to_fsdp_param_info.get(fqn, None)
+            if fsdp_param_info is None:
+                # This can happen if not all FSDP instances have all the
+                # parameters, e.g., with FSDP combined with some MPMD-style
+                # parallelism.
+
+                # TODO: it is unclear if we need to do the same check with
+                # non-FSDP managed keys.
+                continue
+            state = {} if param_key is None else optim_state_dict[param_key]
+            if id(fsdp_param_info) not in all_states:
+                all_states[id(fsdp_param_info)] = {}
+            all_states[id(fsdp_param_info)][fqn] = state
+
+        elif to_save:
+            assert len(optim_state_key.unflat_param_names) == 1
+            unflat_param_name = optim_state_key.unflat_param_names[0]
+            with SimpleProfiler.profile("none_fsdp_managed_copy"):
+                param_key = cast(Union[str, int], param_key)
+                fsdp_osd_state[unflat_param_name] = copy.copy(
+                    optim_state_dict[param_key]
+                )
+                if cpu_offload:
+                    for state_name, value in sorted_items(
+                        fsdp_osd_state[unflat_param_name]
+                    ):
+                        if not torch.is_tensor(value):
+                            continue
+                        fsdp_osd_state[unflat_param_name][state_name] = value.cpu()
+
+    # Instead of gathering the state of each parameter individually, we perform
+    # the gathering all at once to speed up the process.
+    for _all_states in all_states.values():
+        fqn = next(iter(_all_states.keys()))
+        fsdp_param_info = fqn_to_fsdp_param_info[fqn]
+        assert len(fsdp_param_info.param_requires_grad) > 0, (
+            "With use_orig_params, FSDPParamInfo should have requires_grad "
+            "information. However, the length is zero."
+        )
+        for key, idx in fsdp_param_info.param_indices.items():
+            if key in _all_states:
+                continue
+            if not fsdp_param_info.param_requires_grad[idx]:
+                continue
+            raise RuntimeError(
+                f"{key} is not in the optimizer state. "
+                "The FSDPParamInfo has the param keys "
+                f"{sorted(fsdp_param_info.param_indices.keys())} while "
+                "the optimizer has the param keys "
+                f"{sorted(_all_states.keys())}."
+            )
+        fsdp_osd_state.update(
+            _gather_all_orig_param_state(
+                fsdp_param_info,
+                _all_states,
+                shard_state,
+                to_save,
+                cpu_offload,
+            )
+        )
+
+    return fsdp_osd_state
+
+
+def _convert_state_with_flat_params(
+    all_optim_state_keys: List[_OptimStateKey],
+    optim_state_key_to_param_key: Dict[_OptimStateKey, Union[int, str]],
+    fqn_to_fsdp_param_info: Dict[str, FSDPParamInfo],
+    optim_state_dict: Dict[Union[str, int], Any],
+    to_save: bool,
+    shard_state: bool,
+    cpu_offload: bool = True,
+) -> Dict[str, Any]:
+    fsdp_osd_state: Dict[str, Any] = {}
+    # Iterate in rank 0's flat parameter ID order to ensure aligned all-gathers
+    # across ranks
+    for optim_state_key in all_optim_state_keys:
+        param_key: Union[str, int, None] = optim_state_key_to_param_key.get(
+            optim_state_key, None
+        )
+
+        assert param_key is not None, (
+            "If use_orig_params is False, we must be able to find the "
+            f"corresponding param id. {optim_state_key} {param_key}"
+        )
+
+        if optim_state_key.is_fsdp_managed:
+            # If there are multiple unflat_param_names (not use_orig_params),
+            # they share the same FSDPParamInfo. So the first unflat_param_name
+            # is sufficient to fetch the FSDPParamInfo.
+            fqn = optim_state_key.unflat_param_names[0]
+            fsdp_param_info = fqn_to_fsdp_param_info[fqn]
+            unflat_state = _unflatten_optim_state(
+                fsdp_param_info,
+                optim_state_dict[param_key],
+                to_save,
+                shard_state,
+                cpu_offload,
+            )
+            if to_save:
+                assert len(unflat_state) == len(optim_state_key.unflat_param_names)
+                for unflat_param_name, unflat_param_state in zip(
+                    optim_state_key.unflat_param_names,
+                    unflat_state,
+                ):
+                    fsdp_osd_state[unflat_param_name] = unflat_param_state
+        elif to_save:
+            assert len(optim_state_key.unflat_param_names) == 1
+            unflat_param_name = optim_state_key.unflat_param_names[0]
+            fsdp_osd_state[unflat_param_name] = copy.copy(optim_state_dict[param_key])
+            if cpu_offload:
+                for state_name, value in sorted_items(
+                    fsdp_osd_state[unflat_param_name]
+                ):
+                    if not torch.is_tensor(value):
+                        continue
+                    fsdp_osd_state[unflat_param_name][state_name] = value.cpu()
+
+    return fsdp_osd_state
+
+
+@torch.no_grad()
+def _optim_state_dict(
+    model: nn.Module,
+    optim: torch.optim.Optimizer,
+    optim_state_dict: Dict[str, Any],
+    optim_input: Optional[
+        Union[
+            List[Dict[str, Any]],
+            Iterable[nn.Parameter],
+        ]
+    ],
+    rank0_only: bool,
+    shard_state: bool,
+    group: Optional[dist.ProcessGroup],
+    using_optim_input: bool,
+    use_orig_params: bool = False,
+    cpu_offload: bool = True,
+) -> Dict[str, Any]:
+    """
+    Consolidates the optimizer state and returns it as a :class:`dict`
+    following the convention of :meth:`torch.optim.Optimizer.state_dict`,
+    i.e. with keys ``"state"`` and ``"param_groups"``.
+    The flat parameters in ``FSDP`` modules contained in ``model`` are mapped
+    back to their unflattened parameters.
+
+    Parameter keys are not well-defined. For a regular optimizer, the optimizer
+    state_dict contains a mapping from parameter IDs to parameter states.
+    Parameter IDs are determined by the order of parameters in ``optim.param_groups()``
+    across all the groups. This API also allows the user to pass ``optim_input`` for
+    the mapping between parameters and parameter IDs. Using ``optim_input`` is being
+    deprecated.
+
+    If the optimizer is a ``NamedOptimizer``, the optimizer state_dict does not
+    contain parameter IDs mapping but a mapping from parameter FQNs to parameter
+    states. This API finds the mapping from FQNs to parameters if the optimizer
+    is a ``NamedOptimizer``.
+
+    If ``use_orig_params`` is True, each rank will have all FSDP-managed
+    parameters but some of these parameters may be empty due to the sharding.
+    For a regular optim.Optimizer, states for those empty parameters will
+    not be initialized. So, when aggregating the FQNs across ranks, no assert
+    will be raised on a rank even if it does not have all the states -- it is
+    valid and FSDP knows how to aggregate them. However, FSDP must ignore
+    those parameters that are not managed by FSDP and do not exist on
+    the local rank -- those are managed by other parallelisms and FSDP does not
+    know how to handle/aggregate them.
+
+    Args:
+        model (nn.Module): Root module (which may or may not be a
+            :class:`FullyShardedDataParallel` instance) whose parameters
+            were passed into the optimizer ``optim``.
+        optim (torch.optim.Optimizer): Optimizer for ``model`` 's
+            parameters.
+        rank0_only (bool): If ``True``, saves the populated :class:`dict`
+            only on rank 0; if ``False``, saves it on all ranks. (Default:
+            ``True``)
+        shard_state (bool): If ``True``, shard and distribute all
+            non-zero-dimension states.
+
+    Returns:
+        Dict[str, Any]: A :class:`dict` containing the optimizer state for
+        ``model`` 's original unflattened parameters and including keys
+        "state" and "param_groups" following the convention of
+        :meth:`torch.optim.Optimizer.state_dict`. If ``rank0_only=True``,
+        then nonzero ranks return an empty :class:`dict`.
+    """
+    SimpleProfiler.reset()
+    cm = ExitStack()
+    cm.enter_context(SimpleProfiler.profile(SimpleProfiler.Type.ALL))
+    _reset_flat_param_grad_info_if_needed(traversal_utils._get_fsdp_handles(model))
+    to_save = not rank0_only or dist.get_rank(group) == 0 or shard_state
+
+    with SimpleProfiler.profile("preprocessing"):
+        param_to_fqns = _get_param_to_fqns(model)
+        flat_param_to_fqn = _get_flat_param_to_fqn(model)
+        is_named_optimizer = _is_named_optimizer(optim_state_dict)
+
+        param_key_to_param = cast(
+            Dict[Union[int, str], nn.Parameter],
+            (
+                _get_param_id_to_param_from_optim_input(model, optim_input)
+                if using_optim_input
+                else _get_param_key_to_param(
+                    optim, model, is_named_optimizer, param_to_fqns, flat_param_to_fqn
+                )
+            ),
+        )
+        fqn_to_fsdp_param_info = _get_fqn_to_fsdp_param_info(model)
+
+    with SimpleProfiler.profile("preprocessing_with_comm"):
+        (
+            all_optim_state_keys,
+            optim_state_key_to_param_key,
+        ) = _map_param_key_to_optim_keys(
+            optim_state_dict,
+            group,
+            param_key_to_param,
+            param_to_fqns,
+            fqn_to_fsdp_param_info,
+            merge_keys=use_orig_params,
+        )
+
+    with SimpleProfiler.profile("state_converting"):
+        convert_fn = (
+            _convert_state_with_orig_params
+            if use_orig_params
+            else _convert_state_with_flat_params
+        )
+        fsdp_osd_state = convert_fn(
+            all_optim_state_keys,
+            optim_state_key_to_param_key,
+            fqn_to_fsdp_param_info,
+            optim_state_dict["state"],
+            to_save,
+            shard_state,
+            cpu_offload,
+        )
+
+    # At this point, communication is complete and ranks can return early if nothing
+    # will be saved on that rank.
+    if not to_save:
+        return {}
+
+    fsdp_osd: Dict[str, Any] = {"state": fsdp_osd_state}
+
+    flat_param_fqns = set(flat_param_to_fqn.values())
+    for key, value in optim_state_dict["state"].items():
+        if key in fsdp_osd_state:
+            continue
+        if key in flat_param_fqns:
+            continue
+        if key in param_key_to_param:
+            continue
+        # This key is not recognized by FSDP. It may be a user-defined state
+        # or some parameters state that FSDP is unable to map from
+        # ``optim.param_groups``.
+        warnings.warn(
+            f"Found a optim state, {key}, that FSDP cannot process. FSDP "
+            "will directly copy everything to the returned state_dict. In "
+            "most cases, this is a user-defined state that is not "
+            "associated with any particular parameter. Another possible "
+            "case is this state is managed by TorchRec. Otherwise, there may "
+            " be a mismatched assumption of optim_state_dict of this mode."
+        )
+        fsdp_osd_state[key] = value
+
+    if "param_groups" in optim_state_dict:
+        fsdp_osd["param_groups"] = _unflatten_param_groups(
+            optim_state_dict, param_key_to_param, param_to_fqns
+        )
+
+    cm.close()
+    SimpleProfiler.dump_and_reset("FSDP _optim_state_dict() profiling: ")
+
+    return fsdp_osd
+
+
+def _get_fqn_to_fsdp_param_info(model: nn.Module) -> Dict[str, FSDPParamInfo]:
+    """
+    Construct the mapping from a param's fqn to its corresponding ``FSDPParamInfo``
+    if the param is managed by FSDP. Shared parameters, or original parameters that
+    are shared across multiple nn.Modules, are required to belong to one and only
+    one FSDP instance and thus correspond to one ``FlatParameter``. Within the one
+    ``FlatParameter``, ``FlatParameter._fqns`` only stores the first FQN of a shared
+    parameter. Thus, the keys in the mapping are guaranteed to map to unique parameters.
+    """
+
+    def module_fn(module, prefix, tree_level, fqn_to_param_info):
+        fsdp_state = _get_module_fsdp_state_if_fully_sharded_module(module)
+        if fsdp_state is None:
+            return
+        _lazy_init(fsdp_state, module)
+        handle = _module_handle(fsdp_state, module)
+        if not handle:
+            return
+        flat_param = handle.flat_param
+        fsdp_param_info = FSDPParamInfo(fsdp_state, handle, {}, [])
+        # NOTE: `idx` indexes into the data structures *without* padding
+        # elements
+        for idx, local_fqn in enumerate(flat_param._fqns):
+            fqn = clean_tensor_name(prefix + local_fqn)
+            if fqn in fqn_to_param_info:
+                assert fqn_to_param_info[fqn].handle.flat_param is flat_param, fqn
+            fqn_to_param_info[fqn] = fsdp_param_info
+            fsdp_param_info.param_indices[fqn] = idx
+            if flat_param._params is not None:
+                fsdp_param_info.param_requires_grad.append(
+                    flat_param._params[idx].requires_grad
+                )
+
+    def return_fn(fqn_to_param_info):
+        return fqn_to_param_info
+
+    fqn_to_param_info: Dict[str, FSDPParamInfo] = {}
+    # FlatParameter._fqns stores the local fqn, starting from the root of the
+    # FSDP. Using _apply_to_modules() with model (may not be the FSDP root
+    # module) allows us to construct the global fqn.
+    return _apply_to_modules(
+        model,
+        module_fn,
+        return_fn,
+        [fqn for fqn, _ in _named_parameters_with_duplicates(model)],
+        fqn_to_param_info,
+    )
+
+
+@no_type_check
+def _set_optim_use_dtensor(
+    fsdp_state: _FSDPState,
+    state_dict_settings: StateDictSettings,
+) -> None:
+    # If device_mesh is passed in when initializing FSDP, we automatically set the
+    # _use_dtensor flag to True for ShardedOptimStateDictConfig() if state_dict_type
+    # is set to SHARDED_STATE_DICT.
+    if getattr(fsdp_state, "_device_mesh", None):
+        state_dict_type = state_dict_settings.state_dict_type
+        if state_dict_type == StateDictType.LOCAL_STATE_DICT:
+            raise RuntimeError(
+                "Found state_dict_type LOCAL_STATE_DICT.",
+                "DeviceMesh is not compatible with LOCAL_STATE_DICT.",
+                "Please set state_dict_type to SHARDED_STATE_DICT to get DTensor state_dict.",
+            )
+        else:
+            state_dict_settings.optim_state_dict_config._use_dtensor = True
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/_runtime_utils.py b/MLPY/Lib/site-packages/torch/distributed/fsdp/_runtime_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..a570627e460bdb9ae9d5dab2dddf60c63da9e4f8
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/fsdp/_runtime_utils.py
@@ -0,0 +1,1630 @@
+import functools
+import logging
+from enum import auto, Enum
+from typing import Any, Callable, Dict, List, no_type_check, Optional, Set, Tuple
+
+import torch
+import torch.distributed as dist
+import torch.distributed.fsdp._traversal_utils as traversal_utils
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.autograd import Variable
+from torch.autograd.graph import register_multi_grad_hook
+from torch.distributed.algorithms._comm_hooks import LOW_PRECISION_HOOKS
+from torch.distributed.fsdp._common_utils import (
+    _assert_in_training_states,
+    _FSDPState,
+    _get_module_fsdp_state,
+    _is_composable,
+    _log_post_backward_hook,
+    _no_dispatch_record_stream,
+    clean_tensor_name,
+    TrainingState,
+)
+from torch.distributed.fsdp._flat_param import (
+    FlatParameter,
+    FlatParamHandle,
+    HandleShardingStrategy,
+    HandleTrainingState,
+    RESHARD_AFTER_FORWARD_HANDLE_STRATEGIES,
+)
+from torch.distributed.fsdp._init_utils import HYBRID_SHARDING_STRATEGIES
+from torch.distributed.fsdp.api import BackwardPrefetch
+from torch.distributed.utils import (
+    _apply_to_tensors,
+    _cast_forward_inputs,
+    _p_assert,
+    _to_kwargs,
+)
+from torch.utils import _pytree as pytree
+
+log = logging.getLogger(__name__)
+
+# Do not include "process_group" to enable hybrid shard and MoE cases
+HOMOGENEOUS_ATTR_NAMES = (
+    "_use_orig_params",
+    "limit_all_gathers",
+    "_use_full_prec_in_eval",
+)
+
+
+class _PrefetchMode(Enum):
+    BACKWARD = auto()
+    FORWARD = auto()
+
+
+def _get_fsdp_root_states_with_modules(
+    module: nn.Module,
+) -> Tuple[List[_FSDPState], List[nn.Module]]:
+    """
+    Returns a tuple containing:
+    1. A list of the root ``_FSDPState`` instances in the module tree rooted at
+    ``module`` without any duplicates and following the ``module.modules()``
+    traversal order (which is assumed to be depth-first).
+    2. A corresponding list of the root modules owning the states in the first
+    list.
+
+    This is similar to :func:`_get_fsdp_states_with_modules` except that we
+    must call :func:`_is_fsdp_root` to force a lazy initialization to determine
+    the FSDP root in case lazy initialization has not yet happened.
+    """
+    fsdp_root_states: List[_FSDPState] = []
+    fsdp_root_modules: List[nn.Module] = []
+    visited_fsdp_states: Set[_FSDPState] = set()
+    # NOTE: This function assumes that `module.modules()` proceeds top-down.
+    for submodule in module.modules():
+        optional_state = _get_module_fsdp_state(submodule)
+        if (
+            optional_state is not None
+            and optional_state not in visited_fsdp_states
+            and _is_fsdp_root(optional_state, submodule)
+        ):
+            visited_fsdp_states.add(optional_state)
+            fsdp_root_states.append(optional_state)
+            fsdp_root_modules.append(submodule)
+    return fsdp_root_states, fsdp_root_modules
+
+
+def _get_fsdp_root_states(module: nn.Module) -> List[_FSDPState]:
+    """See :func:`_get_fsdp_root_states_with_modules`."""
+    fsdp_root_states, _ = _get_fsdp_root_states_with_modules(module)
+    return fsdp_root_states
+
+
+def _is_fsdp_root(state: _FSDPState, module: nn.Module) -> bool:
+    """
+    Returns if ``state`` corresponds to that of an FSDP root.
+
+    For the wrapper code path, ``state`` and ``module`` should be the same. For
+    the non-wrapper code path, ``state`` should be ``module`` 's state.
+    """
+    # Force a lazy initialization to determine the FSDP root
+    _lazy_init(state, module)
+    assert state._is_root is not None  # mypy
+    return state._is_root
+
+
+@no_type_check
+def _lazy_init(
+    state: _FSDPState,
+    root_module: nn.Module,
+) -> _FSDPState:
+    """
+    Performs initialization lazily, typically right before the first forward
+    pass. The laziness is needed to ensure that the parameter device/dtype and
+    the FSDP hierarchy have been finalized. This method's actual logic only runs on
+    the root FSDP instance, which performs initialization for all non-root FSDP
+    instances to avoid partial initialization.
+
+    For the non-composable code path, ``state`` and ``root_module`` should be
+    the same, namely the FSDP instance itself.
+    """
+    if state._is_root is not None:
+        return  # no-op: already lazily initialized
+    if not state._device_handle.is_available():
+        # Allow the FSDP constructor to run even without CUDA but check this
+        # once we start real execution
+        raise RuntimeError("FSDP does not support CPU only execution")
+    # The following logic is only run on the root FSDP instance since it will
+    # set `_is_root=False` for the non-root instances
+    state._is_root = True
+    _assert_in_training_states(state, [TrainingState.IDLE])
+    _check_flat_params_on_expected_device(state, root_module)
+    state._all_fsdp_states = traversal_utils._get_fsdp_states(root_module)
+    _init_streams(state)
+    buffers, buffer_dtypes = _get_buffers_and_dtypes_for_computation(state, root_module)
+    _cast_buffers_to_dtype_and_device(buffers, buffer_dtypes, state.compute_device)
+    state._exec_order_data.init(state, root_module, state.process_group)
+    _share_state_and_init_handle_attrs(state, root_module)
+    return state
+
+
+def _check_flat_params_on_expected_device(state: _FSDPState, module: nn.Module):
+    """
+    Checks that all ``FlatParameter``s in ``module`` 's tree managed by
+    ``state`` are on the expected device for *lazy initialization*.
+    """
+    cpu_device = torch.device("cpu")
+    for handle in traversal_utils._get_fsdp_handles(module):
+        if (
+            not handle._offload_params
+            and handle.flat_param.device != state.compute_device
+        ):
+            raise RuntimeError(
+                "An FSDP-managed module unexpectedly has parameters on "
+                f"{handle.flat_param.device}. Make sure to move the module to "
+                f"{state.compute_device} before training."
+            )
+        elif handle._offload_params and handle.flat_param.device != cpu_device:
+            raise RuntimeError(
+                "An FSDP-managed module with parameter CPU offloading enabled "
+                f"has parameters on {handle.flat_param.device}. Make sure to "
+                f"not move the module from CPU when offloading parameters."
+            )
+
+
+@no_type_check
+def _share_state_and_init_handle_attrs(
+    root_state: _FSDPState,
+    root_module: nn.Module,
+) -> None:
+    """
+    Shares data structure state from the ``root_state`` to all FSDP states in
+    ``root_module`` 's module tree, and initializes handle attributes. These
+    are done together to require a single loop over the states.
+    """
+    handle = root_state._handle
+    if handle:
+        handle.init_flat_param_attributes()
+    attr_name_to_values: Dict[str, Set[Any]] = {}
+    for attr_name in HOMOGENEOUS_ATTR_NAMES:
+        attr_name_to_values[attr_name] = set()
+    root_state._all_handles = root_state._exec_order_data.all_handles  # share reference
+    # Update _has_optim_in_backward for each handle.
+    for handle in root_state._all_handles:
+        flat_param = handle.flat_param
+        if hasattr(flat_param, "_in_backward_optimizers"):
+            raise RuntimeError(
+                "FSDP optimizer in backward only supported with use_orig_params=True!"
+            )
+        handle._has_optim_in_backward = flat_param._params is not None and any(
+            hasattr(param, "_in_backward_optimizers") for param in flat_param._params
+        )
+        if handle._has_optim_in_backward:
+            torch._C._log_api_usage_once("fsdp.optimizer_in_backward")
+    for fsdp_state in root_state._all_fsdp_states:
+        for attr_name in HOMOGENEOUS_ATTR_NAMES:
+            _p_assert(
+                hasattr(fsdp_state, attr_name),
+                f"FSDP state missing attribute {attr_name}",
+            )
+            attr_name_to_values[attr_name].add(getattr(fsdp_state, attr_name))
+        if fsdp_state is root_state:
+            continue
+        # Relax the assert for non-root FSDP instances in case the nested
+        # initialized module is wrapped again in FSDP later (e.g. after
+        # training to run inference)
+        _p_assert(
+            fsdp_state._is_root is None or not fsdp_state._is_root,
+            "Non-root FSDP instance's `_is_root` should not have been "
+            "set yet or should have been set to `False`",
+        )
+        fsdp_state._is_root = False
+        fsdp_state._unshard_stream = root_state._unshard_stream
+        fsdp_state._post_backward_stream = root_state._post_backward_stream
+        fsdp_state._pre_unshard_stream = root_state._pre_unshard_stream
+        fsdp_state._all_reduce_stream = root_state._all_reduce_stream
+        fsdp_state._default_stream = root_state._default_stream
+        fsdp_state._exec_order_data = root_state._exec_order_data
+        fsdp_state._free_event_queue = root_state._free_event_queue
+        if fsdp_state._fsdp_extension is not None:
+            fsdp_state._fsdp_extension.compute_stream = root_state._default_stream
+        handle = fsdp_state._handle
+        if handle:
+            handle.init_flat_param_attributes()
+    for attr_name, attr_values in attr_name_to_values.items():
+        if len(attr_values) != 1:
+            raise ValueError(
+                f"Expects one homogeneous value for {attr_name} but got {attr_values}"
+            )
+
+
+@no_type_check
+def _init_streams(
+    state: _FSDPState,
+) -> None:
+    """
+    Initializes CUDA streams for overlapping communication, computation, and
+    data transfers. The streams should be shared across FSDP instances.
+    """
+    assert state._is_root
+    assert state._device_handle.is_available()
+    uses_hybrid_sharding = any(
+        fsdp_state.sharding_strategy in HYBRID_SHARDING_STRATEGIES
+        for fsdp_state in state._all_fsdp_states
+    )
+    # Prioritize all-gathers/reduce-scatters over async all-reduce for HSDP and
+    # preserve the default priority of 0 otherwise
+    high_priority = -1 if state.limit_all_gathers and uses_hybrid_sharding else 0
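+    # Editorial note: as a rough illustration (assuming a CUDA backend),
+    # `high_priority == -1` corresponds to something like
+    # `torch.cuda.Stream(priority=-1)`, where more negative values mean higher
+    # priority and 0 is the default priority.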
+    # Default stream for computation
+    state._default_stream = state._device_handle.current_stream()
+    if state._fsdp_extension is not None:
+        # set the compute stream to the FSDP extension
+        state._fsdp_extension.compute_stream = state._default_stream
+
+    # Stream for unshard logic, including allocating the all-gather destination
+    # tensors and the all-gathers themselves
+    state._unshard_stream = state._device_handle.Stream(priority=high_priority)
+    # Stream for overlapping gradient reduction with the backward pass gradient
+    # computation
+    state._post_backward_stream = state._device_handle.Stream(priority=high_priority)
+    # Stream for pre-unshard logic, namely allocations and writes for CPU
+    # offloading (H2D copy) and mixed precision (low precision cast)
+    state._pre_unshard_stream = state._device_handle.Stream(priority=high_priority)
+    # Stream to run HSDP's all-reduce as async (if using HSDP)
+    state._all_reduce_stream = (
+        state._device_handle.Stream() if uses_hybrid_sharding else state._default_stream
+    )
+
+
+@no_type_check
+def _unshard(
+    state: _FSDPState,
+    handle: FlatParamHandle,
+    unshard_stream: torch.Stream,
+    pre_unshard_stream: torch.Stream,
+) -> None:
+    """
+    Unshards ``handle`` 's ``FlatParameter``. If the handle is in
+    :meth:`summon_full_params` and is using mixed precision, then it is
+    forced to full precision.
+
+    Postcondition: handle's ``FlatParameter`` 's data is the padded
+    unsharded flat parameter on the compute device.
+    """
+    if not handle:
+        return
+    with state._device_handle.stream(pre_unshard_stream):
+        ran_pre_unshard = handle.pre_unshard()
+    if ran_pre_unshard:
+        unshard_stream.wait_stream(pre_unshard_stream)
+    if state.limit_all_gathers:
+        event = state._free_event_queue.dequeue_if_needed()
+        if event:
+            with torch.profiler.record_function(
+                "FullyShardedDataParallel.rate_limiter"
+            ):
+                event.synchronize()
+    with state._device_handle.stream(unshard_stream):
+        handle.unshard()
+        handle.post_unshard()
+
+
+@no_type_check
+def _reshard(
+    state: _FSDPState,
+    handle: FlatParamHandle,
+    free_unsharded_flat_param: bool,
+):
+    """
+    Reshards the handle. ``free_unsharded_flat_param`` indicates whether to
+    free the handle's padded unsharded flat parameter.
+    """
+    handle.reshard(free_unsharded_flat_param)
+    if state.limit_all_gathers and free_unsharded_flat_param:
+        if not torch.distributed._functional_collectives.is_torchdynamo_compiling():
+            # We do not run an event queue for freeing under torch.compile at
+            # the moment. But maybe we need to? TODO(voz): Look into this
+            free_event = state._device_handle.Event()
+            free_event.record()
+            state._free_event_queue.enqueue(free_event)
+    handle.post_reshard()
+    # Flat parameter freed or not, we always have to "unshard" the parameter
+    # upon next access to get its shape correct.
+    handle._prefetched = False
+
+
+def _unshard_grads(
+    handle: Optional[FlatParamHandle],
+) -> None:
+    if handle:
+        handle.unshard_grad()
+
+
+def _reshard_grads(
+    handle: Optional[FlatParamHandle],
+) -> None:
+    if handle:
+        handle.reshard_grad()
+
+
+@no_type_check
+def _pre_forward(
+    state: _FSDPState,
+    handle: Optional[FlatParamHandle],
+    unshard_fn: Callable,
+    module: nn.Module,
+    args: Tuple[Any, ...],
+    kwargs: Dict[str, Any],
+) -> Tuple[Tuple[Any, ...], Dict[str, Any]]:
+    """
+    Runs the pre-forward logic. This includes an opportunity to unshard
+    currently sharded parameters such as those for the current forward and
+    registering post-backward hooks for these current parameters. This function
+    also converts forward ``args`` and ``kwargs`` to the given precision.
+
+    Args:
+        handle (Optional[FlatParamHandle]): Handle giving the parameters used
+            in the current forward, if any.
+        unshard_fn (Optional[Callable]): A callable to unshard any currently
+            sharded parameters or ``None`` to not do any unsharding.
+        module (nn.Module): Module whose forward this method runs right before;
+            expected by the hook signature.
+        args (Tuple[Any, ...]): Module forward ``args``.
+        kwargs (Dict[str, Any]): Module forward ``kwargs``.
+    """
+    with torch.profiler.record_function("FullyShardedDataParallel._pre_forward"):
+        # For `fully_shard` + `checkpoint`, skip pre-forward logic in the
+        # recomputed forward
+        if handle and handle._training_state == HandleTrainingState.BACKWARD_PRE:
+            # For both checkpoint implementations, we do not need to re-cast
+            # inputs here since they will be checkpointed in the low precision
+            # either by AC or normally by autograd as long as the AC region is
+            # nested within FSDP
+            return args, kwargs
+        state.training_state = TrainingState.FORWARD_BACKWARD
+        state._exec_order_data.record_pre_forward(handle, module.training)
+        if handle:
+            handle._training_state = HandleTrainingState.FORWARD
+        if unshard_fn is not None:
+            unshard_fn(state, handle)
+        # Register post-backward hooks to reshard the parameters and reduce-scatter
+        # their gradients. They must be re-registered every forward pass in case
+        # the `grad_fn` is mutated.
+        _register_post_backward_hook(state, handle)
+        # We have to reallocate the `_cpu_grad` if the optimizer overlap
+        # set the gradient to `None` in the backward pass.
+        if handle and handle._offload_params and handle.flat_param._cpu_grad is None:
+            handle.flat_param._cpu_grad = torch.zeros_like(
+                handle.flat_param._local_shard, device=torch.device("cpu")
+            ).pin_memory()
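+            # Note (editorial): allocating the CPU gradient in pinned
+            # (page-locked) memory is what lets the later device-to-CPU copy in
+            # `_offload_grad` with `non_blocking=True` actually overlap with
+            # other work.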
+
+        should_cast_forward_inputs = (
+            state._handle and not state._handle._force_full_precision
+        )
+
+        if should_cast_forward_inputs and state.mixed_precision.cast_forward_inputs:
+            # Recursively convert args and kwargs to specified precision.
+            input_dtype: Optional[torch.dtype] = state.mixed_precision.param_dtype
+            args, kwargs = _cast_forward_inputs(input_dtype, *args, **kwargs)
+        _register_post_backward_reshard_only_hook(state, handle, args, kwargs)
+        return args, kwargs
+
+
+@no_type_check
+def _pre_forward_unshard(
+    state: _FSDPState,
+    handle: Optional[FlatParamHandle],
+) -> None:
+    """Unshards parameters in the pre-forward."""
+    if not handle:
+        return
+    # If the handles have been prefetched, then there is no need to call
+    # `_unshard()` again
+    if not handle._prefetched:
+        _unshard(state, handle, state._unshard_stream, state._pre_unshard_stream)
+    handle._needs_pre_forward_unshard = False
+    # Don't wait during trace
+    if not torch.distributed._functional_collectives.is_torchdynamo_compiling():
+        state._device_handle.current_stream().wait_stream(state._unshard_stream)
+    with torch.profiler.record_function(
+        "FullyShardedDataParallel._pre_forward_prefetch"
+    ):
+        _prefetch_handle(state, handle, _PrefetchMode.FORWARD)
+
+
+@no_type_check
+def _post_forward(
+    state: _FSDPState,
+    handle: Optional[FlatParamHandle],
+    reshard_fn: Callable,
+    module: nn.Module,
+    input: Any,
+    output: Any,
+) -> Any:
+    """
+    Runs the post-forward logic. This includes an opportunity to reshard
+    currently unsharded parameters such as those used in the current forward
+    and registering pre-backward hooks on the forward outputs.
+
+    Args:
+        handle (Optional[FlatParamHandle]): Handle giving the parameters used
+            in the current forward, if any.
+        reshard_fn (Optional[Callable]): A callable to reshard any currently
+            unsharded parameters (e.g. from the current forward) or ``None`` to
+            not do any resharding.
+        module (nn.Module): Module whose forward just ran, which should be a
+            fully sharded module (see [Note: Fully Sharded Module]); expected
+            by the hook signature.
+        input (Any): Unused; expected by the hook signature.
+        output (Any): Forward pass output; pre-backward hooks are registered on
+            the tensors that require gradients in this output.
+
+    Postcondition: Each ``FlatParameter`` 's data points to the sharded flat
+    parameter.
+    """
+    with torch.profiler.record_function("FullyShardedDataParallel._post_forward"):
+        # For `fully_shard` + `checkpoint`, skip post-forward logic in the
+        # recomputed forward
+        if handle and handle._training_state == HandleTrainingState.BACKWARD_PRE:
+            return output
+
+        state._exec_order_data.record_post_forward(handle)
+        if reshard_fn is not None:
+            reshard_fn(state, handle)
+        # Register pre-backward hooks to unshard the flat parameters for the
+        # gradient computation (if needed)
+        output = _register_pre_backward_hooks(state, module, output, handle)
+        state.training_state = TrainingState.IDLE
+        if handle:
+            handle._training_state = HandleTrainingState.IDLE
+        return output
+
+
+@no_type_check
+def _post_forward_reshard(
+    state: _FSDPState,
+    handle: FlatParamHandle,
+) -> None:
+    """Reshards parameters in the post-forward."""
+    if not handle:
+        return
+    # Do not free the root's parameters in the post-forward for `FULL_SHARD`
+    # with the intention that they are immediately used for backward
+    # computation (though this may not be true)
+    free_unsharded_flat_param = (
+        not state._is_root
+        and handle._sharding_strategy in RESHARD_AFTER_FORWARD_HANDLE_STRATEGIES
+    )
+    _reshard(state, handle, free_unsharded_flat_param)
+
+
+@no_type_check
+def _root_pre_forward(
+    state: _FSDPState,
+    module: nn.Module,
+    args,
+    kwargs,
+) -> Any:
+    """
+    Runs pre-forward logic specific to the root FSDP instance, which should run
+    before any individual module's pre-forward. This starts with an attempt at
+    lazy initialization (which only runs non-vacuously once). If this is
+    called on a non-root FSDP instance, then it returns early.
+
+    Args:
+        module (nn.Module): Module for which this logic tries to run. It may or
+            may not be the root. If not, then this method does not do anything.
+    """
+    with torch.profiler.record_function("FullyShardedDataParallel._root_pre_forward"):
+        _lazy_init(state, module)
+        _p_assert(state._is_root is not None, "Expects a root FSDP to have been set")
+        if not state._is_root:
+            # Always cast forward inputs in the root of this local FSDP unit for mixed
+            # precision, as this is where mixed precision could be configured.
+            # This is most useful for auto wrapping, which is recommended in the
+            # composable path. For manual wrapping, casting forward inputs at each
+            # local FSDP unit root would add some overhead, so it is not enabled for
+            # the model-wrapper path right now, where manual wrapping is more
+            # broadly used.
+            if _is_composable(state):
+                return _root_cast_forward_input(state, module, args, kwargs)
+            return args, kwargs
+
+        # We cast buffers back to full precision if we're forcing full precision.
+        # Otherwise, we check whether buffers are in full precision and whether we
+        # should cast them back to lower precision, which happens when exiting
+        # eval() mode.
+        handle = state._handle
+        if handle:
+            should_cast_buffers_to_full_prec = handle._force_full_precision
+        else:
+            should_cast_buffers_to_full_prec = True
+
+        if should_cast_buffers_to_full_prec:
+            _cast_buffers_to_dtype_and_device(
+                buffers=dict(module.named_buffers()).values(),
+                buffer_dtypes=list(state._buffer_name_to_orig_dtype.values()),
+                device=state.compute_device,
+            )
+            # This flag is only set when we cast buffers to full precision, to avoid the
+            # CPU overhead that can stem from retrieving all buffers and their types in the
+            # following else branch.
+            state._needs_buffer_dtype_restore_check = True
+        elif getattr(state, "_needs_buffer_dtype_restore_check", False):
+            # Check if buffers are in full precision and we need to cast them
+            # back down.
+            (
+                buffers,
+                buffer_dtypes_for_computation,
+            ) = _get_buffers_and_dtypes_for_computation(state, module)
+            if len(buffers) > 0 and len(buffer_dtypes_for_computation) > 0:
+                if any(
+                    buffer.dtype != buffer_dtype_for_computation
+                    for buffer, buffer_dtype_for_computation in zip(
+                        buffers, buffer_dtypes_for_computation
+                    )
+                ):
+                    # Assume we have to cast everything if there is one mismatch
+                    _cast_buffers_to_dtype_and_device(
+                        buffers, buffer_dtypes_for_computation, state.compute_device
+                    )
+            # We don't have to check this again until we cast buffers to full precision again.
+            state._needs_buffer_dtype_restore_check = False
+
+        if state.forward_prefetch:
+            handles = []
+            for fsdp_state in state._all_fsdp_states:
+                if fsdp_state._handle:
+                    handles.append(fsdp_state._handle)
+            for handle in handles:
+                handle._needs_pre_forward_unshard = True
+                handle._prefetched = False
+        _wait_for_computation_stream(
+            state._device_handle.current_stream(),
+            state._unshard_stream,
+            state._pre_unshard_stream,
+        )
+        _reset_flat_param_grad_info_if_needed(state._all_handles)
+
+        # Prepares the forward inputs by moving them to ``compute_device``
+        # TODO: Do not use the side stream for tensor copies for now; investigate
+        # the perf with/without it.
+        with torch.profiler.record_function("FullyShardedDataParallel._to_kwargs"):
+            args_tuple, kwargs_tuple = _to_kwargs(
+                args, kwargs, state.compute_device, False
+            )
+        args = args_tuple[0]
+        kwargs = kwargs_tuple[0]
+
+        return _root_cast_forward_input(state, module, args, kwargs)
+
+
+@no_type_check
+def _root_cast_forward_input(
+    state: _FSDPState, module: torch.nn.Module, args, kwargs
+) -> Tuple[Any, Any]:
+    if state._handle:
+        force_full_precision = not state._handle._force_full_precision
+    else:
+        force_full_precision = True
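+    # NOTE (editorial): despite its name, `force_full_precision` is `True` here
+    # when full precision is *not* being forced on the handle, i.e. when casting
+    # the root forward inputs to the low-precision `param_dtype` is allowed.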
+
+    should_cast_forward_inputs = (
+        (module.training or not state._use_full_prec_in_eval) and force_full_precision
+    ) and state.mixed_precision.cast_root_forward_inputs
+
+    if should_cast_forward_inputs:
+        input_dtype: Optional[torch.dtype] = state.mixed_precision.param_dtype
+        args, kwargs = _cast_forward_inputs(input_dtype, *args, **kwargs)
+
+    return args, kwargs
+
+
+@no_type_check
+def _pre_backward_hook(
+    state: _FSDPState,
+    module: nn.Module,
+    handle: FlatParamHandle,
+    grad,
+    *unused: Any,
+) -> Any:
+    """
+    Prepares ``_handle`` 's ``FlatParameter`` s for gradient computation.
+
+    Args:
+        module (nn.Module): Fully sharded module (see [Note: Fully Sharded
+            Module]).
+    """
+    # Only run the pre-backward hook once per group of handles involved in the
+    # same module forward computation
+    if (
+        handle
+        and hasattr(handle, "_ran_pre_backward_hook")
+        and handle._ran_pre_backward_hook
+    ):
+        log.debug("%s %s", id(state), "Not Running pre backward! Already Ran!")
+        return grad
+
+    with torch.profiler.record_function("FullyShardedDataParallel._pre_backward_hook"):
+        # Queue the post-backward callback once for the root FSDP instance to
+        # attach it to the outermost backward graph task so that it is called
+        # after all backward calls complete
+        if state._is_root and not state._post_backward_callback_queued:
+            _register_post_backward_final_callback(state, module)
+            _reset_flat_param_grad_info_if_needed(state._all_handles)
+        elif handle:
+            allowed_states = [TrainingState.IDLE]
+            if _is_composable(state):
+                allowed_states.append(TrainingState.FORWARD_BACKWARD)
+            _assert_in_training_states(state, allowed_states)
+        state.training_state = TrainingState.FORWARD_BACKWARD
+        # Queueing the post-backward callback is the only logic that is not
+        # per-handle in the pre-backward hook, so we can return early here if
+        # there are no handles.
+        if not handle:
+            return grad
+        handle._training_state = HandleTrainingState.BACKWARD_PRE
+
+        if handle._needs_pre_backward_unshard:
+            # If the handles have been prefetched, then there is no need to
+            # call `_unshard()` again
+            if not handle._prefetched:
+                _unshard(
+                    state,
+                    handle,
+                    state._unshard_stream,
+                    state._pre_unshard_stream,
+                )
+            # Don't wait during trace
+            if not torch.distributed._functional_collectives.is_torchdynamo_compiling():
+                state._device_handle.current_stream().wait_stream(state._unshard_stream)
+
+        # Set this to `False` to ensure that a mistargeted prefetch does not
+        # actually unshard these handles
+        handle._needs_pre_backward_unshard = False
+        with torch.profiler.record_function(
+            "FullyShardedDataParallel._pre_backward_prefetch"
+        ):
+            _prefetch_handle(state, handle, _PrefetchMode.BACKWARD)
+        handle.prepare_gradient_for_backward()
+        handle._ran_pre_backward_hook = True
+        return grad
+
+
+@no_type_check
+@torch.no_grad()
+def _post_backward_hook(
+    state: _FSDPState,
+    handle: FlatParamHandle,
+    flat_param,
+    *unused: Any,
+):
+    """
+    Reduce-scatters the gradient of ``handle`` 's ``FlatParameter``.
+
+    Precondition: The ``FlatParameter`` 's ``.grad`` attribute contains the
+    unsharded gradient for the local batch.
+
+    Postcondition:
+    - If using ``NO_SHARD``, then the ``.grad`` attribute is the reduced
+    unsharded gradient.
+    - Otherwise, the ``_saved_grad_shard`` attribute is the reduced sharded
+    gradient (accumulating with any existing gradient).
+    """
+    _log_post_backward_hook(state, handle, log)
+    flat_param = handle.flat_param
+    flat_param._post_backward_called = True
+    with torch.autograd.profiler.record_function(
+        "FullyShardedDataParallel._post_backward_hook"
+    ):
+        _assert_in_training_states(state, [TrainingState.FORWARD_BACKWARD])
+        # For multiple applications of reentrant AC across submodules sharing
+        # the same `FlatParameter`, the post-backward hook may run multiple
+        # times in one backward, in which case we permit the state to already
+        # be in `BACKWARD_POST`.
+        _p_assert(
+            handle._training_state
+            in (HandleTrainingState.BACKWARD_PRE, HandleTrainingState.BACKWARD_POST),
+            f"Expects `BACKWARD_PRE` or `BACKWARD_POST` state but got {handle._training_state}",
+        )
+        handle._training_state = HandleTrainingState.BACKWARD_POST
+
+        if flat_param.grad is None:
+            return
+        if flat_param.grad.requires_grad:
+            raise RuntimeError("FSDP does not support gradients of gradients")
+
+        _post_backward_reshard(state, handle)
+        if not state._sync_gradients:
+            if handle._use_orig_params:
+                handle._use_unsharded_grad_views()
+            return
+
+        # Wait for all ops in the current stream (e.g. gradient computation) to
+        # finish before reduce-scattering the gradient
+        if not torch.distributed._functional_collectives.is_torchdynamo_compiling():
+            state._post_backward_stream.wait_stream(
+                state._device_handle.current_stream()
+            )
+
+        with state._device_handle.stream(state._post_backward_stream):
+            autograd_computed_grad = flat_param.grad.data
+            if (
+                not _low_precision_hook_enabled(state)
+                and flat_param.grad.dtype != handle._reduce_dtype
+                # If we are forcing full precision but communicating grads
+                # (i.e. model.eval() + full precision in eval was configured), don't downcast gradient.
+                and not handle._force_full_precision
+            ):
+                flat_param.grad.data = flat_param.grad.to(handle._reduce_dtype)
+            if handle.uses_sharded_strategy:
+                _reduce_grad(state, handle)
+            else:
+                _reduce_grad_no_shard(state, handle)
+            # Since the unsharded gradient is produced in the computation
+            # stream and consumed in the post-backward stream, inform the
+            # caching allocator (before it goes out of scope)
+            _no_dispatch_record_stream(
+                autograd_computed_grad, state._post_backward_stream
+            )
+
+
+def _post_backward_reshard_only_hook(
+    state: _FSDPState,
+    handle: FlatParamHandle,
+    *unused: Any,
+) -> None:
+    with torch.profiler.record_function(
+        "FullyShardedDataParallel._post_backward_hook_reshard_only"
+    ):
+        # `_pre_backward_hook` may not get executed
+        # if forward output does not require grad
+        # overwrite IDLE state for post-backward prefetching
+        state.training_state = TrainingState.FORWARD_BACKWARD
+        handle._training_state = HandleTrainingState.BACKWARD_POST
+        _post_backward_reshard(state, handle)
+
+
+def _post_backward_reshard(
+    state: _FSDPState,
+    handle: FlatParamHandle,
+    *unused: Any,
+) -> None:
+    free_unsharded_flat_param = _should_free_in_backward(state, handle)
+    _reshard(state, handle, free_unsharded_flat_param)
+
+    # TODO: Post-backward prefetching does not support the multiple handles
+    # per module case since the post-backward hook runs per handle, not per
+    # group of handles.
+    with torch.profiler.record_function(
+        "FullyShardedDataParallel._post_backward_prefetch"
+    ):
+        _prefetch_handle(state, handle, _PrefetchMode.BACKWARD)
+
+
+@no_type_check
+def _should_free_in_backward(
+    state: _FSDPState,
+    handle: FlatParamHandle,
+) -> bool:
+    """
+    Returns whether FSDP should free the unsharded flat parameter in the
+    post-backward or not.
+    """
+    if not handle.uses_sharded_strategy:
+        return False
+    # If not syncing gradients, then we do not free for strategies that do not
+    # reshard after forward as a *heuristic* to tradeoff higher memory for
+    # higher throughput.
+    return (
+        state._sync_gradients
+        or handle._sharding_strategy in RESHARD_AFTER_FORWARD_HANDLE_STRATEGIES
+    )
+
+
+@no_type_check
+def _reduce_grad(state: _FSDPState, handle: FlatParamHandle) -> None:
+    """
+    For sharded strategies, this runs gradient reduction, sharded gradient
+    accumulation if needed, and the post-reduction callback.
+    """
+    flat_param = handle.flat_param
+    uses_hybrid_sharded_strategy = handle._sharding_strategy in (
+        HandleShardingStrategy.HYBRID_SHARD,
+        HandleShardingStrategy._HYBRID_SHARD_ZERO2,
+    )
+    # We clear `.grad` to permit multiple backwards. This avoids a race where
+    # the second backward pass's computation runs ahead of the first backward
+    # pass's reduction, which is possible since the reduction is issued in a
+    # separate stream and is asynchronous; such a race would result in reducing
+    # the wrong gradient.
+    unsharded_grad = flat_param.grad.data
+    flat_param.grad = None
+    padded_unsharded_grad, new_sharded_grad = _get_reduce_scatter_tensors(
+        state, unsharded_grad
+    )
+    if state._comm_hook is None:  # default path
+        _div_if_needed(padded_unsharded_grad, state._gradient_predivide_factor)
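+        # Editorial note: the pre- and post-divide factors together divide by
+        # the world size in two steps; splitting the division this way helps
+        # avoid overflow/underflow when gradients are communicated in reduced
+        # precision (e.g. fp16).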
+        pg = (
+            handle._fake_process_group
+            if handle._use_fake_reduce
+            else state.process_group
+        )
+        dist.reduce_scatter_tensor(
+            new_sharded_grad,
+            padded_unsharded_grad,
+            group=pg,
+        )
+        if uses_hybrid_sharded_strategy:
+            # Don't wait during trace
+            if not torch.distributed._functional_collectives.is_torchdynamo_compiling():
+                state._all_reduce_stream.wait_stream(state._post_backward_stream)
+            with state._device_handle.stream(state._all_reduce_stream):
+                # Since the new sharded gradient is produced in the post-
+                # backward stream and consumed in the all-reduce stream,
+                # inform the caching allocator
+                _no_dispatch_record_stream(new_sharded_grad, state._all_reduce_stream)
+                dist.all_reduce(new_sharded_grad, group=state._inter_node_pg)
+                _div_if_needed(new_sharded_grad, state._gradient_postdivide_factor)
+                grad_to_offload = _accumulate_sharded_grad(
+                    state, handle, new_sharded_grad
+                )
+                _post_reduce_grad_callback(state, handle, grad_to_offload)
+                return
+        _div_if_needed(new_sharded_grad, state._gradient_postdivide_factor)
+    else:
+        state._comm_hook(
+            state._comm_hook_state, padded_unsharded_grad, new_sharded_grad
+        )
+        # NOTE: HSDP variants do not support communication hook.
+    grad_to_offload = _accumulate_sharded_grad(state, handle, new_sharded_grad)
+    _post_reduce_grad_callback(state, handle, grad_to_offload)
+
+
+@no_type_check
+def _get_reduce_scatter_tensors(
+    state: _FSDPState, unsharded_grad: torch.Tensor
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Returns the input and output tensors to reduce-scatter, respectively.
+    """
+    chunks = list(unsharded_grad.chunk(state.world_size))
+    numel_to_pad = state.world_size * chunks[0].numel() - unsharded_grad.numel()
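+    # Hypothetical example: with world_size=4 and a 10-element unsharded
+    # gradient, chunks[0].numel() == 3, so numel_to_pad == 4 * 3 - 10 == 2 and
+    # the gradient is padded to 12 elements before the reduce-scatter.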
+    padded_unsharded_grad = (
+        F.pad(unsharded_grad, [0, numel_to_pad]) if numel_to_pad > 0 else unsharded_grad
+    )
+    new_sharded_grad = torch.empty_like(chunks[0])  # padded
+    return padded_unsharded_grad, new_sharded_grad
+
+
+@no_type_check
+def _accumulate_sharded_grad(
+    state: _FSDPState,
+    handle: FlatParamHandle,
+    sharded_grad: torch.Tensor,
+) -> torch.Tensor:
+    """
+    Accumulates the reduce-scattered sharded gradient with any existing sharded
+    gradient if needed, returning the gradient to offload (if CPU offloading is
+    enabled).
+    """
+    flat_param = handle.flat_param
+    _cast_grad_to_param_dtype(state, sharded_grad, flat_param)
+    # Save the sharded gradient in `_saved_grad_shard` to support gradient
+    # accumulation -- for multiple backwards, the gradient reductions may
+    # happen in arbitrary order
+    accumulate_grad = hasattr(flat_param, "_saved_grad_shard")
+    if accumulate_grad:
+        _check_grad_to_accumulate(sharded_grad, flat_param._saved_grad_shard)
+        flat_param._saved_grad_shard += sharded_grad
+    else:
+        flat_param._saved_grad_shard = sharded_grad
+    grad_to_offload = flat_param._saved_grad_shard
+    return grad_to_offload
+
+
+@no_type_check
+def _reduce_grad_no_shard(state: _FSDPState, handle: FlatParamHandle) -> None:
+    """
+    For no-shard, this runs gradient reduction (which implicitly covers any
+    gradient accumulation) and the post-reduction callback.
+    """
+    flat_param = handle.flat_param
+    if state._comm_hook is None:  # default path
+        _div_if_needed(flat_param.grad, state._gradient_predivide_factor)
+        dist.all_reduce(flat_param.grad, group=state.process_group)
+        _div_if_needed(flat_param.grad, state._gradient_postdivide_factor)
+    else:
+        state._comm_hook(state._comm_hook_state, flat_param.grad)
+    # For `NO_SHARD`, we can keep the low precision gradients by simply
+    # omitting the cast altogether
+    if not handle._keep_low_precision_grads:
+        _cast_grad_to_param_dtype(state, flat_param.grad, flat_param)
+    grad_to_offload = flat_param.grad.data
+    _post_reduce_grad_callback(state, handle, grad_to_offload)
+
+
+@no_type_check
+def _post_reduce_grad_callback(
+    state: _FSDPState,
+    handle: FlatParamHandle,
+    # Additional arguments needed for the callback logic
+    grad_to_offload: torch.Tensor,
+):
+    """
+    This callback captures any logic to run after the gradient reduction
+    finishes. Currently, this offloads the gradient to CPU if CPU offloading is
+    enabled and uses sharded gradient views if ``use_orig_params=True``.
+    """
+    _offload_grad(state, handle, grad_to_offload)
+    _post_backward_use_sharded_grad_views(handle)
+
+
+@no_type_check
+def _offload_grad(
+    state: _FSDPState,
+    handle: FlatParamHandle,
+    grad_to_offload: torch.Tensor,
+):
+    if not handle._offload_params:
+        return
+    # Offload the gradient to CPU to ensure parameters and gradients are on the
+    # same device as required by the optimizer
+    # TODO: Investigate why `NO_SHARD` breaks correctness when using
+    # `non_blocking=True` here.
+    # TODO (rohan-varma): When CPU offload and optimizer overlap,
+    # non_blocking=True won't work since the copy may have not finished before
+    # the optimizer step executes on CPU. If we want to use non-blocking=True
+    # here, we'll have to synchronize before using result on CPU.
+    non_blocking = handle.uses_sharded_strategy and not handle._has_optim_in_backward
+    handle.flat_param._cpu_grad.copy_(
+        grad_to_offload.detach(), non_blocking=non_blocking
+    )  # synchronized in the post-backward callback
+    # Since the gradient being offloaded may have been produced in the
+    # computation stream and is being consumed here in the post-backward
+    # stream, inform the caching allocator
+    _no_dispatch_record_stream(grad_to_offload.data, state._post_backward_stream)
+
+
+@no_type_check
+def _post_backward_use_sharded_grad_views(handle: FlatParamHandle):
+    if not handle._use_orig_params:
+        return
+    # Since the handle's `FlatParameter` completed its gradient computation, we
+    # should reset the gradient noneness mask
+    handle._reset_is_grad_none()
+    # Delay using sharded gradient views until after the reduce-scatter instead
+    # of immediately after resharding
+    handle._use_sharded_grad_views()
+    if handle._has_optim_in_backward:
+        handle.prepare_gradient_for_optim()
+        for orig_param in handle.flat_param._params:
+            # Check for `None` gradient to filter parameters not in the rank
+            if orig_param.grad is not None and hasattr(
+                orig_param, "_in_backward_optimizers"
+            ):
+                # TODO (rohan-varma): For CPU offload, this unfortunately
+                # operates on CPU because the parameters and gradients have
+                # already been offloaded. We should run this on GPU after
+                # refactoring.
+                for optim in orig_param._in_backward_optimizers:
+                    optim.step()
+
+                optim.zero_grad(set_to_none=True)
+        handle._reset_flat_param_grad_info_if_needed()
+        if handle._offload_params:
+            handle.flat_param._cpu_grad = None
+
+
+def _div_if_needed(tensor: torch.Tensor, div_factor: float) -> None:
+    if div_factor > 1:
+        tensor.div_(div_factor)
+
+
+@no_type_check
+def _cast_grad_to_param_dtype(
+    state: _FSDPState,
+    sharded_grad: torch.Tensor,
+    param: FlatParameter,
+):
+    """
+    Casts ``sharded_grad`` back to the full parameter dtype so that the
+    optimizer step runs with that dtype. This performs an actual cast if
+    1. parameters were in reduced precision during the forward, in which case
+       gradients would also be in that reduced precision, or
+    2. parameters were not in reduced precision but gradients were in
+       reduced precision for communication.
+    However, if a low precision communication hook is registered, then this
+    dtype cast happens in the hook instead.
+    """
+    _assert_in_training_states(state, [TrainingState.FORWARD_BACKWARD])
+    if not _low_precision_hook_enabled(state) and sharded_grad.dtype != param.dtype:
+        low_prec_grad_data = sharded_grad.data
+        sharded_grad.data = sharded_grad.data.to(dtype=param.dtype)
+        # Since for `NO_SHARD`, the gradient is produced in the computation
+        # stream and consumed here in the post-backward stream, inform the
+        # caching allocator; for the sharded strategies, the gradient is
+        # produced in the post-backward stream, so this `record_stream()`
+        # should be a no-op
+        _no_dispatch_record_stream(
+            low_prec_grad_data, state._device_handle.current_stream()
+        )
+
+
+def _check_grad_to_accumulate(
+    new_sharded_grad: torch.Tensor,
+    accumulated_grad: torch.Tensor,
+) -> None:
+    _p_assert(
+        accumulated_grad.shape == new_sharded_grad.shape,
+        "Shape mismatch when accumulating gradients: "
+        f"existing gradient shape={accumulated_grad.shape} "
+        f"new gradient shape={new_sharded_grad.shape}",
+    )
+    _p_assert(
+        accumulated_grad.device == new_sharded_grad.device,
+        "Device mismatch when accumulating gradients: "
+        f"existing gradient device={accumulated_grad.device} "
+        f"new gradient device={new_sharded_grad.device}",
+    )
+
+
+@no_type_check
+def _low_precision_hook_enabled(state: _FSDPState) -> bool:
+    return state._comm_hook in LOW_PRECISION_HOOKS
+
+
+@no_type_check
+@torch.no_grad()
+def _post_backward_final_callback(
+    state: _FSDPState,
+    module: nn.Module,
+):
+    """
+    This waits for the post-backward to finish and performs some final cleanup.
+    This runs at the end of the entire backward pass and should only be called
+    on the root FSDP instance.
+    """
+    _p_assert(
+        state._is_root,
+        "The post-backward callback should only be called on the root FSDP instance",
+    )
+    root_state = state
+
+    if root_state._sync_gradients:
+        current_stream = state._device_handle.current_stream()
+        # TODO (rohan-varma): this also waits for the overlapped optimizer step to finish
+        # since it currently runs in the post-backward stream. That can be
+        # pushed to the next forward if run in a different stream
+        current_stream.wait_stream(root_state._post_backward_stream)
+        if root_state._all_reduce_stream is not current_stream:  # uses HSDP
+            current_stream.wait_stream(root_state._all_reduce_stream)
+        if root_state.cpu_offload.offload_params:
+            # Wait for non-blocking GPU -> CPU sharded gradient copies from the
+            # post-backward hooks to finish explicitly since CPU gradients do
+            # not automatically synchronize with the GPU
+            state._device_handle.current_stream().synchronize()
+    root_state._exec_order_data.next_iter()
+
+    for fsdp_state in state._all_fsdp_states:
+        _catch_all_reshard(fsdp_state)
+        _finalize_params(fsdp_state)
+        fsdp_state.training_state = TrainingState.IDLE
+        handle = fsdp_state._handle
+        if handle:
+            handle._ran_pre_backward_hook = False
+            handle._needs_pre_backward_unshard = False
+            handle._post_forward_index = None
+            handle._training_state = HandleTrainingState.IDLE
+            handle._prefetched = False
+    # Reset for cases like one forward and multiple backwards
+    root_state._post_backward_callback_queued = False
+
+
+@no_type_check
+def _catch_all_reshard(
+    state: _FSDPState,
+) -> None:
+    """
+    Reshards the parameters that may not have been resharded in the
+    post-backward hook. This can happen when a module's output is used in the
+    forward pass, meaning that its pre-backward hook runs (unsharding the
+    parameter), but the post-backward hook does not run because the output was
+    not used in the loss computation corresponding to this backward pass.
+    """
+    # Wrap with a try-except to provide a more informative traceback if an
+    # error is raised
+    try:
+        if state._handle:
+            # TODO: This already-resharded check is brittle:
+            # https://github.com/pytorch/pytorch/issues/83956
+            already_resharded = (
+                state._handle.flat_param.data_ptr()
+                == state._handle.flat_param._local_shard.data_ptr()
+                # If FSDP skipped using sharded views, then the flat parameter
+                # still points to the sharded data, so we need to reshard to
+                # use sharded views
+                and not state._handle._skipped_use_sharded_views
+            )
+            if already_resharded:
+                return
+            free_unsharded_flat_param = _should_free_in_backward(state, state._handle)
+            _reshard(state, state._handle, free_unsharded_flat_param)
+    except Exception as e:
+        _p_assert(
+            False,
+            f"Got exception in the catch-all reshard for {state}: {str(e)}",
+            raise_assertion_error=False,
+        )
+        raise e
+
+
+@no_type_check
+def _finalize_params(
+    state: _FSDPState,
+) -> None:
+    """Finalizes the parameters before the next iteration."""
+    handle = state._handle
+    if not handle:
+        return
+    flat_param = handle.flat_param
+    if torch.distributed._functional_collectives.is_torchdynamo_compiling():
+        if hasattr(flat_param, "_post_backward_hook_handle"):
+            pbhs_handle = flat_param._post_backward_hook_handle
+            pbhs_handle.remove()
+            del flat_param._post_backward_hook_handle
+    else:
+        if hasattr(flat_param, "_post_backward_hook_state"):
+            post_backward_hook_state_len = len(flat_param._post_backward_hook_state)
+            expected_post_backward_hook_state_len = int(flat_param.requires_grad) + 1
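+            # i.e. 2 for the `(acc_grad, hook_handle)` state registered in
+            # `_register_post_backward_hook` when the flat parameter requires
+            # grad, else 1 for the reshard-only `(hook_handle,)` state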
+            _p_assert(
+                post_backward_hook_state_len == expected_post_backward_hook_state_len,
+                f"Invalid: ``_post_backward_hook_state``: {flat_param._post_backward_hook_state}",
+            )
+            flat_param._post_backward_hook_state[-1].remove()
+            delattr(flat_param, "_post_backward_hook_state")
+    if flat_param.requires_grad:
+        if not state._sync_gradients:
+            # Preserve the gradient accumulation state if not synchronizing
+            # gradients: `.grad` remains the unsharded gradient from prior
+            # `no_sync()` iterations, and `_saved_grad_shard` remains the
+            # sharded gradient from the last synchronized iteration
+            return
+        if not handle._has_optim_in_backward:
+            handle.prepare_gradient_for_optim()
+        _p_assert(
+            hasattr(flat_param, "_post_backward_called"),
+            "Expects `_post_backward_called` to be set on the `FlatParameter`",
+        )
+        flat_param._post_backward_called = False
+
+
+@no_type_check
+def _prefetch_handle(
+    state: _FSDPState,
+    current_handle: Optional[FlatParamHandle],
+    prefetch_mode: _PrefetchMode,
+) -> None:
+    """
+    Prefetches the next handle if needed (without synchronization). An empty
+    ``current_handle`` cannot prefetch.
+    """
+    if not current_handle:
+        return
+    handle = _get_handle_to_prefetch(state, current_handle)
+    if not handle:
+        return
+    # Temporarily emulate the training state while calling `_unshard` to
+    # ensure the correct `as_params` for `_use_unsharded_views()`
+    prev_training_state = handle._training_state
+    if prefetch_mode == _PrefetchMode.BACKWARD:
+        handle._training_state = HandleTrainingState.BACKWARD_PRE
+    elif prefetch_mode == _PrefetchMode.FORWARD:
+        handle._training_state = HandleTrainingState.FORWARD
+    else:
+        raise ValueError(f"Invalid prefetch mode on rank {state.rank}: {prefetch_mode}")
+    # Prefetch the next set of handles without synchronizing to allow
+    # the sync to happen as late as possible to maximize overlap
+    _unshard(state, handle, state._unshard_stream, state._pre_unshard_stream)
+    handle._training_state = prev_training_state
+    handle._prefetched = True
+
+
+@no_type_check
+def _get_handle_to_prefetch(
+    state: _FSDPState,
+    current_handle: FlatParamHandle,
+) -> Optional[FlatParamHandle]:
+    """
+    Returns the handle to prefetch for the next module, or ``None`` if there is
+    nothing to prefetch, where ``current_handle`` represents the current module.
+
+    "Prefetching" refers to running the unshard logic early (without
+    synchronization), and the "next" modules depend on the recorded execution
+    order and the current training state.
+    """
+    training_state = _get_training_state(current_handle)
+    valid_training_states = (
+        HandleTrainingState.BACKWARD_PRE,
+        HandleTrainingState.BACKWARD_POST,
+        HandleTrainingState.FORWARD,
+    )
+    _p_assert(
+        training_state in valid_training_states,
+        f"Prefetching is only supported in {valid_training_states} but "
+        f"currently in {training_state}",
+    )
+    eod = state._exec_order_data
+    target_handle: Optional[FlatParamHandle] = None
+    if (
+        training_state == HandleTrainingState.BACKWARD_PRE
+        and state.backward_prefetch == BackwardPrefetch.BACKWARD_PRE
+    ) or (
+        training_state == HandleTrainingState.BACKWARD_POST
+        and state.backward_prefetch == BackwardPrefetch.BACKWARD_POST
+    ):
+        target_handle_candidate = eod.get_handle_to_backward_prefetch(current_handle)
+        if (
+            target_handle_candidate
+            and target_handle_candidate._needs_pre_backward_unshard
+            and not target_handle_candidate._prefetched
+        ):
+            target_handle = target_handle_candidate
+        else:
+            target_handle = None
+    elif training_state == HandleTrainingState.FORWARD and state.forward_prefetch:
+        target_handle_candidate = eod.get_handle_to_forward_prefetch(current_handle)
+        if (
+            target_handle_candidate
+            and target_handle_candidate._needs_pre_forward_unshard
+            and not target_handle_candidate._prefetched
+        ):
+            target_handle = target_handle_candidate
+        else:
+            target_handle = None
+
+    return target_handle
+
+
+def _get_training_state(
+    handle: FlatParamHandle,
+) -> HandleTrainingState:
+    """Returns the training state of the handles in ``handle``."""
+    _p_assert(handle, "Expects a non-empty handle")
+    return handle._training_state
+
+
+@no_type_check
+def _register_pre_forward_hook(
+    state: _FSDPState,
+    module: nn.Module,
+) -> None:
+    """
+    Registers a pre-forward hook on ``module``.
+    """
+    for forward_handle in state._pre_forward_handles:
+        forward_handle.remove()
+    state._pre_forward_handles.clear()
+    module_param_handle = state._fully_sharded_module_to_handle.get(module, None)
+    hook = functools.partial(
+        _pre_forward, state, module_param_handle, _pre_forward_unshard
+    )
+    state._pre_forward_handles.append(
+        module.register_forward_pre_hook(hook, prepend=True, with_kwargs=True)
+    )
+
+
+@no_type_check
+def _register_post_forward_hook(
+    state: _FSDPState,
+    module: nn.Module,
+) -> None:
+    """
+    Registers a post-forward hook on ``module``. Even if the module has no
+    handles, we should register the hook since it will register the module's
+    pre-backward hook.
+    """
+    for forward_handle in state._post_forward_handles:
+        forward_handle.remove()
+    state._post_forward_handles.clear()
+    module_param_handle = state._fully_sharded_module_to_handle.get(module, None)
+    hook = functools.partial(
+        _post_forward,
+        state,
+        module_param_handle,
+        _post_forward_reshard,
+    )
+    state._post_forward_handles.append(module.register_forward_hook(hook))
+
+
+@no_type_check
+def _register_root_pre_forward_hook(
+    state: _FSDPState,
+    module: nn.Module,
+):
+    """
+    Registers root pre-forward hook on ``module``, which should be the local
+    FSDP root.
+
+    NOTE: For the current composable FSDP design, each application of
+    ``fully_shard()`` to a module indicates that that module is the local
+    FSDP root. We may remove this assumption in the future, in which case we
+    will need to register this root pre-forward hook on any candidate module
+    that may be the local FSDP root.
+    """
+    for forward_handle in state._root_pre_forward_handles:
+        forward_handle.remove()
+    state._root_pre_forward_handles.clear()
+    hook = functools.partial(_root_pre_forward, state)
+    state._root_pre_forward_handles.append(
+        module.register_forward_pre_hook(hook, prepend=True, with_kwargs=True)
+    )
+
+
+@no_type_check
+def _register_pre_backward_hooks(
+    state: _FSDPState,
+    module: nn.Module,
+    outputs: Any,
+    handle: FlatParamHandle,
+) -> Any:
+    """
+    Registers pre-backward hooks on the tensors that require gradients in the
+    forward pass outputs ``outputs``, which were computed using ``handle`` 's
+    ``FlatParameter``.
+
+    Args:
+        module (nn.Module): Fully sharded module (see [Note: Fully Sharded
+            Module]).
+
+    Returns:
+        Forward pass outputs with pre-backward hooks registered to tensors that
+        require gradients.
+    """
+    # If there is no gradient computation, then there is no need for
+    # pre-backward logic
+    if not torch.is_grad_enabled():
+        return outputs
+    if state._is_root:
+        state._post_backward_callback_queued = False  # only defined on the root
+
+    if handle:
+        handle._needs_pre_backward_unshard = False
+        # Since these handles' `FlatParameter`s participated in a forward, we
+        # conservatively assume that they will be used in the backward
+        handle._ran_pre_backward_hook = False
+
+    def _register_hook(t: torch.Tensor) -> torch.Tensor:
+        if t.requires_grad:
+            t.register_hook(
+                functools.partial(_pre_backward_hook, state, module, handle)
+            )
+            if handle:
+                handle._needs_pre_backward_unshard = True
+        return t
+
+    return _apply_to_tensors(_register_hook, outputs)
+
+
+def _register_post_backward_hook(
+    state: _FSDPState,
+    handle: Optional[FlatParamHandle],
+) -> None:
+    """
+    Registers post-backward hooks on the ``FlatParameter`` s'
+    ``AccumulateGrad`` objects to reshard and to reduce-scatter gradients.
+
+    The ``AccumulateGrad`` object represents the last function that finalizes
+    the ``FlatParameter`` 's gradient, so it only runs after its entire
+    gradient computation has finished.
+
+    We register the post-backward hook only once in the *first* forward that a
+    ``FlatParameter`` participates in. This relies on the ``AccumulateGrad``
+    object being preserved through multiple forwards.
+
+    NOTE: We follow this heuristic to prefer the *first* forward to target the
+    parameter mixed precision case, where there are *separate*
+    ``AccumulateGrad`` objects across the different forwards. (Without
+    parameter mixed precision, the ``AccumulateGrad`` objects are the same.) If
+    we instead prefer the *last* forward, then the hook runs early.
+    """
+    # If there is no gradient computation, then there is no need for
+    # post-backward logic
+    if not torch.is_grad_enabled():
+        return
+    if not handle:
+        return
+    flat_param = handle.flat_param
+
+    if torch.distributed._functional_collectives.is_torchdynamo_compiling():
+        already_registered = hasattr(flat_param, "_post_backward_hook_handle")
+        if already_registered or not flat_param.requires_grad:
+            return
+        hook = functools.partial(_post_backward_hook, state, handle)
+        hook_handle = flat_param.register_post_accumulate_grad_hook(hook)
+        flat_param._post_backward_hook_handle = hook_handle  # type: ignore[attr-defined]
+    else:
+        already_registered = hasattr(flat_param, "_post_backward_hook_state")
+        if already_registered or not flat_param.requires_grad:
+            return
+        # Get the `AccumulateGrad` object
+        temp_flat_param = flat_param.expand_as(flat_param)
+        _p_assert(
+            temp_flat_param.grad_fn is not None,
+            "The `grad_fn` is needed to access the `AccumulateGrad` and "
+            "register the post-backward hook",
+        )
+        acc_grad = temp_flat_param.grad_fn.next_functions[0][0]  # type: ignore[union-attr]
+        assert acc_grad is not None
+        hook_handle = acc_grad.register_hook(
+            functools.partial(_post_backward_hook, state, handle)
+        )
+        flat_param._post_backward_hook_state = (acc_grad, hook_handle)  # type: ignore[attr-defined]
+
+
+def _register_post_backward_reshard_only_hook(
+    state: _FSDPState,
+    handle: Optional[FlatParamHandle],
+    args: Tuple[Any, ...],
+    kwargs: Dict[str, Any],
+) -> None:
+    """
+    Registers post-backward hooks to reshard flat parameters that do not
+    require gradient. We register these using multi-post-grad hooks on the
+    input activations to ensure that all gradients that may depend on the
+    parameters have been computed before resharding.
+    """
+    # If there is no gradient computation, then there is no need for
+    # post-backward logic
+    if not torch.is_grad_enabled():
+        return
+    # Construct `inp_tensors` lazily to avoid CPU overhead in typical case
+    # where each flat parameter requires gradient
+    inp_tensors: Optional[List[torch.Tensor]] = None
+    if not handle:
+        return
+    flat_param = handle.flat_param
+
+    if torch.distributed._functional_collectives.is_torchdynamo_compiling():
+        already_registered = hasattr(flat_param, "_post_backward_hook_handle")
+    else:
+        already_registered = hasattr(flat_param, "_post_backward_hook_state")
+
+    if already_registered or flat_param.requires_grad:
+        return
+    if inp_tensors is None:
+        args_flat = pytree.arg_tree_leaves(*args, **kwargs)
+        inp_tensors = [
+            obj for obj in args_flat if torch.is_tensor(obj) and obj.requires_grad
+        ]
+    assert inp_tensors is not None  # mypy
+    hook_handle = register_multi_grad_hook(
+        inp_tensors, functools.partial(_post_backward_reshard_only_hook, state, handle)
+    )
+    if torch.distributed._functional_collectives.is_torchdynamo_compiling():
+        flat_param._post_backward_hook_handle = hook_handle  # type: ignore[attr-defined, assignment]
+    else:
+        flat_param._post_backward_hook_state = (hook_handle,)  # type: ignore[attr-defined, assignment]
+
+
+@no_type_check
+def _register_post_backward_final_callback(
+    state: _FSDPState, module: nn.Module
+) -> None:
+    """
+    Registers the post-backward final callback that runs at the end of the
+    backward pass. This should be called from the root FSDP instance at the
+    beginning of the pre-backward.
+    """
+    _p_assert(
+        state._is_root,
+        "Only the root FSDP instance should register the post-backward callback",
+    )
+    if state._post_backward_callback_queued:
+        return
+    _assert_in_training_states(state, [TrainingState.IDLE])
+    # Trace does not need this callback
+    if not torch.distributed._functional_collectives.is_torchdynamo_compiling():
+        state._post_backward_callback_queued = True
+        Variable._execution_engine.queue_callback(
+            functools.partial(_post_backward_final_callback, state, module)
+        )
+
+
+def _wait_for_computation_stream(
+    computation_stream: torch.Stream,
+    unshard_stream: torch.Stream,
+    pre_unshard_stream: torch.Stream,
+):
+    """
+    Has the unshard and pre-unshard streams wait for the computation stream.
+    For example, this should be called in the FSDP root's pre-forward to
+    respect optimizer step computation.
+    """
+    # Tracing does not need to wait
+    if torch.distributed._functional_collectives.is_torchdynamo_compiling():
+        return
+    unshard_stream.wait_stream(computation_stream)  # type: ignore[attr-defined]
+    # Having the pre-all-gather stream wait for the current stream even if we
+    # do not leverage the pre-all-gather stream is tolerable since this only
+    # runs once per iteration
+    pre_unshard_stream.wait_stream(computation_stream)  # type: ignore[attr-defined]
+
+
+def _reset_flat_param_grad_info_if_needed(
+    handles: List[FlatParamHandle],
+):
+    """
+    Clears the original parameters' gradients if needed. This function's CPU
+    overhead is minimal, so we may call it throughout FSDP methods, using those
+    callsites to free the gradient memory earlier.
+    """
+    if not isinstance(handles, list):
+        handles = [handles]
+    for handle in handles:
+        if handle._use_orig_params:
+            handle._reset_flat_param_grad_info_if_needed()
+
+
+@no_type_check
+def _get_buffers_and_dtypes_for_computation(
+    state: _FSDPState,
+    root_module: nn.Module,
+) -> Tuple[List[torch.Tensor], List[Optional[torch.dtype]]]:
+    """
+    Returns all buffers in the module tree rooted at ``root_module`` and a
+    corresponding list of the buffer dtypes for computation. Each buffer dtype
+    is either ``None`` if buffer mixed precision is not enabled or the buffer
+    low precision dtype otherwise.
+    """
+    _p_assert(state._is_root, "Expects the root to cast buffers")
+    buffers: List[torch.Tensor] = []
+    buffer_dtypes: List[Optional[torch.dtype]] = []
+    visited_buffers: Set[torch.Tensor] = set()
+    # Traverse the FSDP states bottom-up so that we prefer the owning FSDP
+    # instance's mixed precision setting for each buffer
+    fsdp_states, fsdp_modules = traversal_utils._get_fsdp_states_with_modules(
+        root_module
+    )
+    for fsdp_state, fsdp_module in zip(reversed(fsdp_states), reversed(fsdp_modules)):
+        for buffer_name, buffer in fsdp_module.named_buffers():
+            if buffer in visited_buffers:
+                continue
+            visited_buffers.add(buffer)
+            if clean_tensor_name(buffer_name) in fsdp_state._ignored_buffer_names:
+                continue
+            buffers.append(buffer)
+            buffer_dtypes.append(fsdp_state.mixed_precision.buffer_dtype)
+    assert len(buffers) == len(buffer_dtypes), f"{len(buffers)} {len(buffer_dtypes)}"
+    return buffers, buffer_dtypes
+
+
+@no_type_check
+def _get_orig_buffer_dtypes(
+    state: _FSDPState,
+    buffer_names: List[str],
+) -> List[torch.dtype]:
+    """
+    Returns the original buffer dtypes for the given buffer names.
+    """
+    buffer_dtypes: List[torch.dtype] = []
+    for buffer_name in buffer_names:
+        _p_assert(
+            buffer_name in state._buffer_name_to_orig_dtype,
+            f"{buffer_name} is missing from pre-computed dict on rank "
+            f"{state.rank}, which only has keys "
+            f"{state._buffer_name_to_orig_dtype.keys()}",
+        )
+        buffer_dtypes.append(state._buffer_name_to_orig_dtype[buffer_name])
+    return buffer_dtypes
+
+
+def _cast_buffers_to_dtype_and_device(
+    buffers: List[torch.Tensor],
+    buffer_dtypes: List[Optional[torch.dtype]],
+    device: torch.device,
+) -> None:
+    """
+    Casts ``buffers`` to the dtypes given by ``buffer_dtypes`` and moves them
+    to ``device``. If an element in ``buffer_dtypes`` is ``None``, then the
+    corresponding buffer is only moved to ``device``.
+    """
+    _p_assert(
+        buffer_dtypes is None or len(buffers) == len(buffer_dtypes),
+        f"Expects `buffers` and `buffer_dtypes` to have the same length if "
+        f"`buffer_dtypes` is specified but got {len(buffers)} and "
+        f"{len(buffer_dtypes)}",
+    )
+    for buffer, buffer_dtype in zip(buffers, buffer_dtypes):
+        if not torch.is_floating_point(buffer) or buffer_dtype is None:
+            buffer.data = buffer.to(device=device)
+        else:
+            buffer.data = buffer.to(device=device, dtype=buffer_dtype)
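+
+
+# A small sketch of the casting behavior above: floating-point buffers are cast
+# to the given dtype, while a ``None`` dtype (or a non-floating-point buffer)
+# results in a device move only.
+#
+#     >>> bufs = [torch.randn(3), torch.zeros(2, dtype=torch.long)]
+#     >>> _cast_buffers_to_dtype_and_device(bufs, [torch.float16, None], torch.device("cpu"))
+#     >>> bufs[0].dtype, bufs[1].dtype
+#     (torch.float16, torch.int64)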
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/_shard_utils.py b/MLPY/Lib/site-packages/torch/distributed/fsdp/_shard_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..756e65ba7ab3fc98f83299a91b3fb4ae7d596973
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/fsdp/_shard_utils.py
@@ -0,0 +1,127 @@
+import copy
+import itertools
+import math
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+from torch.distributed import distributed_c10d
+from torch.distributed._shard.sharded_tensor import (
+    Shard,
+    ShardedTensor,
+    ShardedTensorMetadata,
+    TensorProperties,
+)
+from torch.distributed._shard.sharding_spec import ShardMetadata
+from torch.distributed._tensor import DeviceMesh, DTensor, Replicate, Shard as DShard
+
+
+def _get_remote_device_str(rank, device_type, num_devices_per_node):
+    if device_type.lower() == "cpu":
+        return f"rank:{rank}/{device_type}"
+    else:
+        return f"rank:{rank}/{device_type}:{rank % num_devices_per_node}"
+
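+# For example (derived directly from the two branches above):
+#
+#     >>> _get_remote_device_str(5, "cuda", 8)
+#     'rank:5/cuda:5'
+#     >>> _get_remote_device_str(5, "cpu", 8)
+#     'rank:5/cpu'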
+
+def _create_chunk_sharded_tensor(
+    tensor: torch.Tensor,
+    rank: int,
+    world_size: int,
+    num_devices_per_node: int,
+    pg: dist.ProcessGroup,
+    device: Optional[torch.device] = None,
+) -> ShardedTensor:
+    """
+    Shard a tensor into chunks along the first dimension. The local rank gets its
+    corresponding chunk as the local shard to create a ShardedTensor.
+    """
+    chunks = tensor.chunk(world_size, dim=0)
+    if len(chunks) > rank:
+        local_shard = chunks[rank].clone()
+        offsets = [0 for _ in tensor.size()]
+        offsets[0] = math.ceil(tensor.size()[0] / world_size) * rank
+        local_shards = [Shard.from_tensor_and_offsets(local_shard, offsets, rank)]
+    else:
+        local_shards = []
+
+    # Create a ShardedTensor without invoking communication.
+    chunk_sizes = [list(chunk.size()) for chunk in chunks]
+    dim0_offsets = [0] + list(
+        itertools.accumulate([chunk_size[0] for chunk_size in chunk_sizes])
+    )[:-1]
+    offsets = [0] * (len(chunk_sizes[0]) - 1)
+    chunk_offsets = [[d0] + offsets for d0 in dim0_offsets]
+    device_type = (
+        distributed_c10d._get_pg_default_device(pg).type
+        if device is None
+        else device.type
+    )
+    placements = [
+        _get_remote_device_str(r, device_type, num_devices_per_node)
+        for r in range(len(chunk_sizes))
+    ]
+    assert len(chunk_sizes) == len(chunk_offsets) == len(placements)
+    shard_metadata = [
+        ShardMetadata(offset, size, placement)
+        for offset, size, placement in zip(chunk_offsets, chunk_sizes, placements)
+    ]
+    sharded_tensor_metadata = ShardedTensorMetadata(
+        shards_metadata=shard_metadata,
+        size=tensor.size(),
+        tensor_properties=TensorProperties(
+            dtype=tensor.dtype,
+            layout=tensor.layout,
+            requires_grad=False,
+            memory_format=torch.contiguous_format,
+            pin_memory=tensor.is_pinned(),
+        ),
+    )
+    return ShardedTensor._init_from_local_shards_and_global_metadata(
+        local_shards, sharded_tensor_metadata=sharded_tensor_metadata, process_group=pg
+    )
+
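+# A usage sketch for `_create_chunk_sharded_tensor`, assuming an
+# already-initialized 2-rank process group; the tensor is arbitrary and
+# `dist.group.WORLD` stands in for the FSDP process group:
+#
+#     >>> full = torch.arange(12, dtype=torch.float32).view(6, 2)
+#     >>> st = _create_chunk_sharded_tensor(
+#     ...     full,
+#     ...     rank=dist.get_rank(),
+#     ...     world_size=dist.get_world_size(),
+#     ...     num_devices_per_node=1,
+#     ...     pg=dist.group.WORLD,
+#     ... )
+#     >>> [s.metadata.shard_offsets for s in st.local_shards()]
+#     [[0, 0]]  # on rank 0; rank 1 would see [[3, 0]]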
+
+def _create_chunk_dtensor(
+    tensor: torch.Tensor,
+    rank: int,
+    device_mesh: DeviceMesh,
+) -> DTensor:
+    """
+    Shard a tensor into chunks along the first dimension. The local rank gets its
+    corresponding chunk as the local tensor to create a DTensor.
+    """
+    # We need to explicitly call .detach() to return a new tensor detached from the current graph.
+    tensor = tensor.clone().detach()
+
+    # FSDP placements: [Shard(0)]
+    # HSDP placements: [Replicate(), Shard(0)]
+    replicate_placements = [Replicate() for _ in range(device_mesh.ndim)]
+    shard_placements = [Replicate() for _ in range(device_mesh.ndim)]
+    shard_placements[-1] = DShard(0)  # type: ignore[call-overload]
+
+    return DTensor.from_local(
+        tensor, device_mesh, replicate_placements, run_check=False
+    ).redistribute(
+        placements=shard_placements,
+    )
+
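+# A usage sketch for `_create_chunk_dtensor`, assuming a 1-D device mesh over an
+# already-initialized process group (HSDP would pass a 2-D mesh instead):
+#
+#     >>> mesh = DeviceMesh("cpu", list(range(dist.get_world_size())))
+#     >>> dt = _create_chunk_dtensor(torch.randn(8, 4), dist.get_rank(), mesh)
+#     >>> dt.placements  # (Shard(dim=0),)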
+
+def _all_gather_dtensor(
+    tensor: DTensor,
+    parent_mesh: Optional[DeviceMesh],
+) -> torch.Tensor:
+    """
+    All gather a DTensor in its sharded dimension and return the local tensor.
+    """
+    assert parent_mesh is None
+
+    placements = list(copy.deepcopy(tensor.placements))
+    # FSDP placements: [Shard(0)] -> [Replicate()]
+    # HSDP placements: [Replicate(), Shard(0)] -> [Replicate(), Replicate()]
+    placements[-1] = Replicate()
+    tensor = tensor.redistribute(
+        device_mesh=tensor.device_mesh,
+        placements=placements,
+    )
+
+    return tensor.to_local()
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/_state_dict_utils.py b/MLPY/Lib/site-packages/torch/distributed/fsdp/_state_dict_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..728f9c8a569a270260077391b3a6946a054c4d6a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/fsdp/_state_dict_utils.py
@@ -0,0 +1,928 @@
+import contextlib
+import logging
+import math
+import warnings
+from typing import (
+    Any,
+    Callable,
+    cast,
+    Dict,
+    Generator,
+    Iterator,
+    List,
+    no_type_check,
+    Tuple,
+)
+
+import torch
+import torch.distributed as dist
+
+import torch.distributed.algorithms._checkpoint.checkpoint_wrapper as checkpoint_wrapper
+
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.distributed._shard.sharded_tensor import (
+    init_from_local_shards,
+    Shard,
+    ShardedTensor,
+)
+from torch.distributed._tensor import DTensor
+from torch.distributed.device_mesh import _mesh_resources
+
+from torch.distributed.fsdp._common_utils import (
+    _FSDPState,
+    _get_module_fsdp_state_if_fully_sharded_module,
+    _has_fsdp_params,
+    _is_composable,
+    _module_handle,
+    clean_tensor_name,
+    FSDP_PREFIX,
+    FSDP_WRAPPED_MODULE,
+)
+from torch.distributed.fsdp._debug_utils import SimpleProfiler
+from torch.distributed.fsdp._runtime_utils import (
+    _cast_buffers_to_dtype_and_device,
+    _get_orig_buffer_dtypes,
+    _lazy_init,
+    _reset_flat_param_grad_info_if_needed,
+)
+from torch.distributed.fsdp.api import (
+    FullStateDictConfig,
+    ShardingStrategy,
+    StateDictType,
+)
+from torch.distributed.utils import _replace_by_prefix
+
+from ._fsdp_extensions import (
+    _ext_all_gather_dtensor,
+    _ext_chunk_dtensor,
+    _ext_chunk_tensor,
+    _ext_post_unflatten_transform,
+    _ext_pre_load_state_dict_transform,
+)
+from ._unshard_param_utils import _unshard_fsdp_state_params, FLAT_PARAM
+
+
+logger = logging.getLogger(__name__)
+
+
+def _should_unshard_params(fsdp_state: _FSDPState) -> bool:
+    if fsdp_state.sharding_strategy == ShardingStrategy.NO_SHARD and (
+        _is_composable(fsdp_state) or fsdp_state._use_orig_params
+    ):
+        return False
+    else:
+        return True
+
+
+def _convert_to_wrapped_module_name(module_name: str) -> str:
+    module_name = module_name.replace(f"{FSDP_PREFIX}", "")
+    module_name = module_name.replace(f"{FSDP_WRAPPED_MODULE}", "")
+    if module_name:
+        module_name = f"{module_name}."
+    # `CheckpointWrapper` adds a prefix that has to be removed as well.
+    module_name = module_name.replace(checkpoint_wrapper._CHECKPOINT_PREFIX, "")
+    return module_name
+
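+# For example (pure string manipulation, independent of any FSDP state):
+#
+#     >>> _convert_to_wrapped_module_name(f"{FSDP_PREFIX}layer1")
+#     'layer1.'
+#     >>> _convert_to_wrapped_module_name(FSDP_WRAPPED_MODULE)
+#     ''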
+
+def _param_name_infos(
+    module: nn.Module, fsdp_state: _FSDPState
+) -> Iterator[Tuple[str, str, str]]:
+    if not _has_fsdp_params(fsdp_state, module):
+        return
+    for param_name, module_name in _module_handle(
+        fsdp_state, module
+    ).param_module_names():
+        module_name = _convert_to_wrapped_module_name(module_name)
+        fqn = f"{module_name}{param_name}"
+        yield fqn, param_name, module_name
+
+
+def _shared_param_name_infos(
+    module: nn.Module, fsdp_state
+) -> Iterator[Tuple[str, str, str]]:
+    for param_name, module_name in _module_handle(
+        fsdp_state, module
+    ).shared_param_module_names():
+        module_name = _convert_to_wrapped_module_name(module_name)
+        fqn = f"{module_name}{param_name}"
+        yield fqn, param_name, module_name
+
+
+@no_type_check
+def _enter_unshard_params_ctx(
+    module: nn.Module,
+    fsdp_state: _FSDPState,
+    writeback: bool = False,
+    rank0_only: bool = False,
+    offload_to_cpu: bool = False,
+    with_grads: bool = False,
+) -> None:
+    """
+    state_dict hooks cannot use the pure context call because the checkpoint flow
+    requires entering the context in the pre-hook but leaving it in the
+    post-hook. This API enters the context of ``_unshard_fsdp_state_params``.
+    """
+    assert module not in fsdp_state._unshard_params_ctx, (
+        "Entering the ``_unshard_fsdp_state_params`` context but _unshard_params_ctx[module] "
+        "is not None."
+    )
+    fsdp_state._unshard_params_ctx[module] = _unshard_fsdp_state_params(
+        module,
+        fsdp_state,
+        writeback=writeback,
+        rank0_only=rank0_only,
+        offload_to_cpu=offload_to_cpu,
+        with_grads=with_grads,
+    )
+    fsdp_state._unshard_params_ctx[module].__enter__()
+
+
+@no_type_check
+def _exit_unshard_params_ctx(module: nn.Module, fsdp_state: _FSDPState) -> None:
+    """A helper function to exit ``_unshard_fsdp_state_params`` context."""
+    fsdp_state._unshard_params_ctx[module].__exit__(None, None, None)
+    fsdp_state._unshard_params_ctx.pop(module)
+
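+# A sketch of the pre-/post-hook pairing these two helpers are designed for,
+# assuming `module` and `fsdp_state` come from an already-constructed FSDP model:
+#
+#     >>> _enter_unshard_params_ctx(module, fsdp_state, writeback=False)
+#     >>> full_sd = module.state_dict()  # parameters are unsharded at this point
+#     >>> _exit_unshard_params_ctx(module, fsdp_state)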
+
+def _common_pre_state_dict_hook(
+    module: nn.Module,
+    fsdp_state: _FSDPState,
+) -> None:
+    """Performs the pre-state_dict tasks shared by all state_dict types."""
+    if fsdp_state._device_handle.is_available():
+        fsdp_state._device_handle.synchronize()
+    # TODO: need to check if this is always correct for composable FSDP.
+    _lazy_init(fsdp_state, module)
+    if fsdp_state._is_root:
+        _reset_flat_param_grad_info_if_needed(fsdp_state._all_handles)
+
+
+def _common_unshard_pre_state_dict_hook(
+    module: nn.Module,
+    fsdp_state: _FSDPState,
+    offload_to_cpu: bool,
+    rank0_only: bool,
+) -> None:
+    """
+    Performs the pre-state_dict tasks shared by all state_dict types that require
+    ``_unshard_fsdp_state_params()``. FULL_STATE_DICT and SHARDED_STATE_DICT use this hook.
+    """
+    # For composable `fully_shard`, it does not need to unshard parameters for `NO_SHARD` cases.
+    if not _should_unshard_params(fsdp_state):
+        return
+    _enter_unshard_params_ctx(
+        module,
+        fsdp_state,
+        writeback=False,
+        offload_to_cpu=offload_to_cpu,
+        rank0_only=rank0_only,
+    )
+
+
+@no_type_check
+def _common_unshard_post_state_dict_hook(
+    module: nn.Module,
+    fsdp_state: _FSDPState,
+    state_dict: Dict[str, Any],
+    prefix: str,
+    param_hook: Callable,
+) -> Dict[str, Any]:
+    """
+    The post-state_dict flow shared by all state_dict types that require
+    ``_unshard_fsdp_state_params()``. FULL_STATE_DICT and SHARDED_STATE_DICT use this
+    hook.
+    """
+    _replace_by_prefix(state_dict, prefix + f"{FSDP_PREFIX}", prefix)
+    # Return early for trivial cases
+    if not state_dict or not _has_fsdp_params(fsdp_state, module):
+        if _should_unshard_params(fsdp_state):
+            _exit_unshard_params_ctx(module, fsdp_state)
+        return state_dict
+
+    # If a rank does not have unsharded parameters (when `rank0_only=True`
+    # and `rank != 0`), then the rank only needs to participate in the
+    # all-gather and does not need to save the state dict. We simply check
+    # `rank0_only` to handle this case.
+    rank0_only = (
+        fsdp_state._state_dict_type == StateDictType.FULL_STATE_DICT
+        and cast(FullStateDictConfig, fsdp_state._state_dict_config).rank0_only
+    )
+    # no_fsdp_return means the state_dict returned by this rank should contain
+    # only non-FSDP controlled parameters and buffers.
+    no_fsdp_return = rank0_only and fsdp_state.rank != 0
+    if no_fsdp_return and not fsdp_state._use_orig_params:
+        for clean_key in fsdp_state._buffer_names:
+            # This is a hack to support activation checkpoint.
+            clean_key = clean_key.replace(
+                f"{checkpoint_wrapper._CHECKPOINT_PREFIX}.", ""
+            )
+            state_dict.pop(f"{prefix}{clean_key}", None)
+        # Non-zero ranks have flat_param key when rank0_only=True, because rank0_only=True is
+        # passed in to unshard context, but nonzero ranks reshard early, causing this flat_param
+        # to appear in state_dict.
+        state_dict.pop(f"{prefix}{FLAT_PARAM}")
+        _exit_unshard_params_ctx(module, fsdp_state)
+        return state_dict
+
+    # Loop only the parameters saved in this instance's wrapped module to
+    # avoid processing buffers.
+    for fqn, param_name, module_name in _param_name_infos(module, fsdp_state):
+        fqn = f"{prefix}{fqn}"
+        if no_fsdp_return:
+            state_dict.pop(fqn)
+            continue
+        assert fqn in state_dict, (
+            f"FSDP assumes {fqn} is in the state_dict but the state_dict only "
+            f"has {state_dict.keys()}. "
+            f"prefix={prefix}, module_name={module_name}, "
+            f"param_name={param_name} rank={fsdp_state.rank}."
+        )
+
+        param_hook(state_dict, prefix, fqn)
+
+    if _should_unshard_params(fsdp_state):
+        _exit_unshard_params_ctx(module, fsdp_state)
+
+    cpu_device = torch.device("cpu")
+    buffer_clean_fqns = []
+    buffers = []
+    for clean_key in fsdp_state._buffer_names:
+        # This is a hack to support activation checkpoint.
+        clean_key = clean_tensor_name(clean_key)
+        fqn = f"{prefix}{clean_key}"
+        if fqn not in state_dict:
+            # A buffer can be registered as non-persistent.
+            continue
+        if no_fsdp_return:
+            state_dict.pop(fqn)
+        else:
+            buffer = state_dict[fqn]
+            if (
+                fsdp_state._state_dict_config.offload_to_cpu
+                and buffer.device != cpu_device
+            ):
+                state_dict[fqn] = buffer.to(cpu_device)
+            # skip upcasting for ignored buffers
+            if clean_key not in fsdp_state._ignored_buffer_names:
+                buffer_clean_fqns.append(clean_key)
+                buffers.append(state_dict[fqn])
+
+    if buffers:
+        mixed_precision_enabled_for_buffers = (
+            fsdp_state._mixed_precision_enabled_for_buffers()
+            if not _is_composable(fsdp_state)
+            else (fsdp_state.mixed_precision.buffer_dtype is not None)
+        )
+        if mixed_precision_enabled_for_buffers:
+            buffer_dtypes = _get_orig_buffer_dtypes(fsdp_state, buffer_clean_fqns)
+            _cast_buffers_to_dtype_and_device(
+                buffers, buffer_dtypes, fsdp_state.compute_device
+            )
+            for buffer, clean_fqn in zip(buffers, buffer_clean_fqns):
+                fqn = f"{prefix}{clean_fqn}"
+                logger.info("FSDP is casting the dtype of %s to %s", fqn, buffer.dtype)
+                state_dict[fqn] = buffer.clone()
+    return state_dict
+
+
+@no_type_check
+def _full_pre_state_dict_hook(
+    fsdp_state: _FSDPState,
+    module: nn.Module,
+    *args,
+    **kwargs,
+) -> None:
+    """
+    Hook that runs before model.state_dict() is called. A pre-state_dict hook is
+    not actually supported by ``nn.Module``. As a result, this API is called
+    from ``_full_post_state_dict_hook()`` to simulate the case. Once pre-state_dict
+    hooks are supported in ``nn.Module``, this hook will be registered as a hook
+    on ``nn.Module``.
+    """
+    if getattr(fsdp_state, "_device_mesh", False):
+        parent_mesh = _mesh_resources.get_parent_mesh(fsdp_state._device_mesh)
+
+    _common_pre_state_dict_hook(module, fsdp_state)
+    _common_unshard_pre_state_dict_hook(
+        module,
+        fsdp_state,
+        offload_to_cpu=fsdp_state._state_dict_config.offload_to_cpu,
+        rank0_only=cast(FullStateDictConfig, fsdp_state._state_dict_config).rank0_only,
+    )
+
+
+@no_type_check
+def _full_post_state_dict_hook(
+    module: nn.Module,
+    fsdp_state: _FSDPState,
+    state_dict: Dict[str, Any],
+    prefix: str,
+) -> Dict[str, Any]:
+    """
+    Hook that runs after model.state_dict() is called, before returning the result
+    to the user. For FSDP, we may have to clone the tensors in state_dict since the
+    params go back to their sharded version after ``_unshard_fsdp_state_params``
+    ends, and we also remove the ``FSDP_WRAPPED_MODULE`` prefix.
+    """
+
+    def param_hook(
+        state_dict: Dict[str, Any],
+        prefix: str,
+        fqn: str,
+    ) -> None:
+        clean_key = fqn
+        clean_prefix = clean_tensor_name(prefix)
+        # Strip the prefix from the key if needed, since buffer and parameter
+        # names do not include the prefix (they are not computed within the
+        # `state_dict` call).
+        if clean_key.startswith(clean_prefix):
+            clean_key = clean_key[len(clean_prefix) :]
+
+        # Clone parameters before exiting the `_unshard_fsdp_state_params()` context.
+        if not getattr(state_dict[fqn], "_has_been_cloned", False):
+            try:
+                state_dict[fqn] = state_dict[fqn].clone().detach()
+                state_dict[fqn]._has_been_cloned = True  # type: ignore[attr-defined]
+            except BaseException as e:
+                warnings.warn(
+                    f"Failed to clone() tensor with name {fqn} on rank {fsdp_state.rank}. "
+                    "This may mean that this state_dict entry could point to invalid "
+                    "memory regions after returning from state_dict() call if this "
+                    "parameter is managed by FSDP. Please check clone "
+                    f"implementation of {fqn}. Error: {str(e)}"
+                )
+
+    return _common_unshard_post_state_dict_hook(
+        module, fsdp_state, state_dict, prefix, param_hook
+    )
+
+
+def _full_pre_load_state_dict_hook(
+    module: nn.Module,
+    fsdp_state: _FSDPState,
+    state_dict: Dict[str, Any],
+    prefix: str,
+) -> None:
+    _lazy_init(fsdp_state, module)
+    if _should_unshard_params(fsdp_state):
+        with SimpleProfiler.profile("_enter_unshard_params_ctx"):
+            _enter_unshard_params_ctx(module, fsdp_state, writeback=True)
+    # Add FSDP_PREFIX only for wrapper-based FSDP.
+    if not _is_composable(fsdp_state):
+        _replace_by_prefix(state_dict, prefix, prefix + f"{FSDP_PREFIX}")
+
+
+def _full_post_load_state_dict_hook(
+    module: nn.Module, fsdp_state: _FSDPState, *args, **kwargs
+) -> None:
+    if _should_unshard_params(fsdp_state):
+        with SimpleProfiler.profile("_exit_unshard_params_ctx"):
+            _exit_unshard_params_ctx(module, fsdp_state)
+
+
+def _local_pre_state_dict_hook(
+    fsdp_state: _FSDPState,
+    module: nn.Module,
+    *args,
+    **kwargs,
+) -> None:
+    """
+    Hook that runs before model.state_dict() is called. Right now, a pre-state_dict
+    hook is not supported by PyTorch core, so this API is called from
+    `_local_post_state_dict_hook()` to simulate the case.
+    """
+    if (
+        _has_fsdp_params(fsdp_state, module)
+        and not _module_handle(fsdp_state, module).uses_sharded_strategy
+    ):
+        raise RuntimeError(
+            "``local_state_dict`` can only be used when parameters are flatten "
+            "and sharded."
+        )
+    _common_pre_state_dict_hook(module, fsdp_state)
+
+
+@no_type_check
+def _local_post_state_dict_hook(
+    module: nn.Module,
+    fsdp_state: _FSDPState,
+    state_dict: Dict[str, Any],
+    prefix: str,
+) -> Dict[str, Any]:
+    """
+    This hook creates a ShardedTensor from the local flat_param and replaces
+    ``state_dict[f"{prefix}{FLAT_PARAM}"]`` with the ShardedTensor. No copy
+    happens; the underlying storage is the same.
+    """
+
+    _replace_by_prefix(state_dict, f"{prefix}{FSDP_PREFIX}", prefix)
+    if not _has_fsdp_params(fsdp_state, module):
+        return state_dict
+
+    # state_dict[f"{prefix}{FLAT_PARAM}"] exists and has the same tensor
+    # value as the flat_param but it is a pure Tensor because
+    # nn.Module.state_dict() will detach the parameter. Therefore, we need
+    # to get flat_param to get the metadata.
+    assert _module_handle(fsdp_state, module), "Should have returned early"
+    flat_param = _module_handle(fsdp_state, module).flat_param
+    # Constructs a ShardedTensor from the flat_param "without" padding.
+    # Removing the padding allows users to change the number of ranks
+    # when loading the local_state_dict.
+    full_numel = flat_param._unpadded_unsharded_size.numel()  # type: ignore[attr-defined]
+    shard_offset = flat_param.numel() * fsdp_state.rank
+    valid_data_size = flat_param.numel() - flat_param._shard_numel_padded
+    if valid_data_size > 0:
+        # If the FlatParameter were returned, FlatParameter._local_shard would
+        # cause a pickling issue (it can be torch.save'd but not torch.load'ed).
+        # Since there is no benefit in state_dict returning the actual FlatParameter
+        # class, a view (which is a plain tensor) of the FlatParameter is returned.
+        flat_param = flat_param[:valid_data_size].view(valid_data_size)
+        local_shards = [
+            Shard.from_tensor_and_offsets(flat_param, [shard_offset], fsdp_state.rank)
+        ]
+    else:
+        local_shards = []
+    sharded_tensor = init_from_local_shards(
+        local_shards, full_numel, process_group=fsdp_state.process_group
+    )  # type: ignore[assignment]
+    # TODO: Add DTensor state_dict support for LOCAL_STATE_DICT.
+    if fsdp_state._state_dict_config.offload_to_cpu:
+        sharded_tensor = sharded_tensor.cpu()
+    state_dict[f"{prefix}{FLAT_PARAM}"] = sharded_tensor
+    return state_dict
+
+
+def _local_post_load_state_dict_hook(
+    module: nn.Module, fsdp_state: _FSDPState, *args, **kwargs
+) -> None:
+    pass
+
+
+def _local_pre_load_state_dict_hook(
+    module: nn.Module,
+    fsdp_state: _FSDPState,
+    state_dict: Dict[str, Any],
+    prefix: str,
+) -> None:
+    """
+    This hook finds the local flat_param for this FSDP module from the
+    state_dict. The flat_param should be a ShardedTensor. This hook converts
+    the ShardedTensor to a tensor. No copy happens unless padding is required.
+    """
+    _lazy_init(fsdp_state, module)
+    _replace_by_prefix(state_dict, prefix, f"{prefix}{FSDP_PREFIX}")
+    fqn = f"{prefix}{FSDP_PREFIX}{FLAT_PARAM}"
+    if fqn not in state_dict:
+        assert not _has_fsdp_params(fsdp_state, module), (
+            "No `FlatParameter` in `state_dict` for this FSDP instance "
+            "but it has parameters"
+        )
+        return
+    load_tensor = state_dict[fqn]
+    assert isinstance(
+        load_tensor, ShardedTensor
+    ), "Tensors in local_state_dict should be ShardedTensor."
+
+    # Convert the ShardedTensor to a Tensor.
+    flat_param = _module_handle(fsdp_state, module).flat_param
+    assert flat_param is not None
+    valid_data_size = flat_param.numel() - flat_param._shard_numel_padded
+    shards = load_tensor.local_shards()
+    if valid_data_size > 0:
+        assert len(shards), "load_local_state_dict assume one shard per ShardedTensor."
+        load_tensor = shards[0].tensor
+
+        # Get the metadata of the flat_param to decide whether to pad the loaded
+        # tensor.
+        if flat_param._shard_numel_padded > 0:
+            assert load_tensor.numel() < flat_param.numel(), (
+                f"Local shard size = {flat_param.numel()} and the tensor in "
+                f"the state_dict is {load_tensor.numel()}."
+            )
+            load_tensor = F.pad(load_tensor, [0, flat_param._shard_numel_padded])
+    else:
+        load_tensor = flat_param
+    # TODO: Add DTensor state_dict support for LOCAL_STATE_DICT.
+    state_dict[fqn] = load_tensor
+
+
+def _sharded_pre_state_dict_hook(
+    fsdp_state: _FSDPState,
+    module: nn.Module,
+    *args,
+    **kwargs,
+) -> None:
+    """
+    Hook that runs before model.state_dict() is called. Check
+    ``_full_pre_load_state_dict_hook`` for details.
+    """
+    if (
+        _has_fsdp_params(fsdp_state, module)
+        and not _module_handle(fsdp_state, module).uses_sharded_strategy
+    ):
+        raise RuntimeError(
+            "``sharded_state_dict`` can only be used when parameters are flatten "
+            "and sharded."
+        )
+    _common_pre_state_dict_hook(module, fsdp_state)
+    # Setting offload_to_cpu here does not work even if offload_to_cpu is True.
+    # We have to create ShardedTensor first then move it to CPU.
+    _common_unshard_pre_state_dict_hook(
+        module,
+        fsdp_state,
+        offload_to_cpu=False,
+        rank0_only=False,
+    )
+
+
+@no_type_check
+def _sharded_post_state_dict_hook(
+    module: nn.Module,
+    fsdp_state: _FSDPState,
+    state_dict: Dict[str, Any],
+    prefix: str,
+) -> Dict[str, Any]:
+    """
+    The hook replaces the unflattened, unsharded parameter in the state_dict
+    with an unflattened, sharded parameter (a ShardedTensor).
+    """
+
+    def param_hook(state_dict: Dict[str, Any], prefix: str, fqn: str):
+        param = state_dict[fqn]
+        if not fsdp_state._state_dict_config._use_dtensor:
+            sharded_tensor = _ext_chunk_tensor(
+                tensor=param,
+                rank=fsdp_state.rank,
+                world_size=fsdp_state.world_size,
+                num_devices_per_node=fsdp_state._device_handle.device_count(),
+                pg=fsdp_state.process_group,
+                fsdp_extension=fsdp_state._fsdp_extension,
+            )
+        else:
+            sharded_tensor = _ext_chunk_dtensor(
+                tensor=param,
+                rank=fsdp_state.rank,
+                device_mesh=fsdp_state._device_mesh,
+                fsdp_extension=fsdp_state._fsdp_extension,
+            )
+        if fsdp_state._state_dict_config.offload_to_cpu:
+            sharded_tensor = sharded_tensor.cpu()
+        state_dict[fqn] = sharded_tensor
+
+    return _common_unshard_post_state_dict_hook(
+        module, fsdp_state, state_dict, prefix, param_hook
+    )
+
+
+@no_type_check
+def _sharded_post_load_state_dict_hook(
+    module: nn.Module, fsdp_state: _FSDPState, *args, **kwargs
+) -> None:
+    if _has_fsdp_params(fsdp_state, module):
+        with SimpleProfiler.profile("_exit_unshard_params_ctx"):
+            _exit_unshard_params_ctx(module, fsdp_state)
+
+
+@no_type_check
+def _sharded_pre_load_state_dict_hook(
+    module: nn.Module,
+    fsdp_state: _FSDPState,
+    state_dict: Dict[str, Any],
+    prefix: str,
+) -> None:
+    """
+    The hook combines the unflattened, sharded parameters (ShardedTensors) into
+    a new FlatParameter and shards the new FlatParameter into the local chunk.
+    """
+    _lazy_init(fsdp_state, module)
+    if not _is_composable(fsdp_state):
+        _replace_by_prefix(state_dict, prefix, prefix + f"{FSDP_PREFIX}")
+    if not _has_fsdp_params(fsdp_state, module):
+        return
+
+    handle = _module_handle(fsdp_state, module)
+    if not handle.uses_sharded_strategy:
+        raise RuntimeError(
+            "load_sharded_state_dict can only be called when parameters "
+            "are flattened and sharded."
+        )
+    fqn_to_param_ext = dict(
+        zip(handle.flat_param._fqns, handle.flat_param._param_extensions)
+    )
+
+    for fqn, _, _ in _param_name_infos(module, fsdp_state):
+        if not _is_composable(fsdp_state):
+            fqn_from_global_root = f"{prefix}{FSDP_PREFIX}{fqn}"
+        else:
+            fqn_from_global_root = f"{prefix}{fqn}"
+        try:
+            param = state_dict.pop(fqn_from_global_root)
+        except KeyError:
+            logger.warning(
+                f"Did not find param with FQN {fqn_from_global_root}, skipping it. "  # noqa: G004
+                "The weight will not be filled if you expect it to be."
+            )
+            continue  # TODO: Improve unittesting for state_dict finetuning
+            # cases: https://github.com/pytorch/pytorch/issues/109134
+
+        if not fsdp_state._state_dict_config._use_dtensor:
+            # All-gather the param (ShardedTensor)
+            param, shards = _ext_pre_load_state_dict_transform(
+                param, fsdp_state._fsdp_extension
+            )
+
+            assert len(shards) < 2, (
+                "Expects 0 or 1 shard per rank "
+                f"but got {len(shards)} shards on rank {fsdp_state.rank}."
+            )
+            param_numel = param.size().numel()
+            dim_0_size = param.size()[0]
+            chunk_size = (
+                math.ceil(dim_0_size / fsdp_state.world_size)
+                * param_numel
+                // dim_0_size
+            )
+            if len(shards) == 1:
+                local_tensor = shards[0].tensor.flatten()
+                with SimpleProfiler.profile(SimpleProfiler.Type.H2D):
+                    local_tensor = local_tensor.to(fsdp_state.compute_device)
+                num_padding = chunk_size - local_tensor.numel()
+                if num_padding > 0:
+                    local_tensor = F.pad(local_tensor, [0, num_padding])
+            else:
+                local_tensor = torch.zeros(
+                    chunk_size, dtype=param.dtype, device=fsdp_state.compute_device
+                )
+            tensor = torch.empty(
+                chunk_size * fsdp_state.world_size,
+                dtype=local_tensor.dtype,
+                device=fsdp_state.compute_device,
+            )
+            with SimpleProfiler.profile(SimpleProfiler.Type.ALLGATHER):
+                dist.all_gather_into_tensor(
+                    tensor, local_tensor, group=fsdp_state.process_group
+                )
+            tensor = tensor.narrow(0, 0, param_numel).reshape(param.size())
+            state_dict[fqn_from_global_root] = tensor
+        else:
+            if param.device != fsdp_state._device_mesh.device_type:
+                param = param.to(fsdp_state._device_mesh.device_type)
+
+            parent_mesh = _mesh_resources.get_parent_mesh(fsdp_state._device_mesh)
+            local_tensor = _ext_all_gather_dtensor(
+                param, parent_mesh, fsdp_state._fsdp_extension
+            )
+
+            if fqn_to_param_ext.get(fqn) is not None:
+                ext = fqn_to_param_ext[fqn]
+                local_tensor = _ext_post_unflatten_transform(
+                    local_tensor, ext, fsdp_state._fsdp_extension
+                )
+            state_dict[fqn_from_global_root] = local_tensor
+
+    with SimpleProfiler.profile("_enter_unshard_params_ctx"):
+        _enter_unshard_params_ctx(module, fsdp_state, writeback=True)
+
+
+@contextlib.contextmanager
+def _replace_with_full_state_dict_type(fsdp_state: _FSDPState) -> Generator:
+    old_state_dict_config = fsdp_state._state_dict_config
+    old_state_dict_type = fsdp_state._state_dict_type
+    fsdp_state._state_dict_config = FullStateDictConfig()
+    fsdp_state._state_dict_type = StateDictType.FULL_STATE_DICT
+    yield
+    fsdp_state._state_dict_config = old_state_dict_config
+    fsdp_state._state_dict_type = old_state_dict_type
+
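+# Usage sketch: the ``NO_SHARD`` branches in the hooks below wrap their dispatch
+# in this context so that the FULL_STATE_DICT implementations run instead, e.g.
+#
+#     >>> with _replace_with_full_state_dict_type(fsdp_state):
+#     ...     sd = _full_post_state_dict_hook(module, fsdp_state, state_dict, prefix)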
+
+@no_type_check
+@torch.no_grad()
+def _post_state_dict_hook(
+    module: nn.Module,
+    state_dict: Dict[str, Any],
+    prefix: str,
+    *args: Any,
+) -> Dict[str, Any]:
+    """
+    _post_state_dict_hook() is called after the state_dict() of this
+    FSDP module is executed. ``fsdp_state._state_dict_type`` is used to decide
+    what postprocessing will be done.
+    """
+    fsdp_state = _get_module_fsdp_state_if_fully_sharded_module(module)
+    if fsdp_state.sharding_strategy == ShardingStrategy.NO_SHARD:
+        context = _replace_with_full_state_dict_type(fsdp_state)
+        warnings.warn(
+            "When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict will"
+            "be returned."
+        )
+    else:
+        context = contextlib.nullcontext()
+
+    with context:
+        _post_state_dict_hook_fn = {
+            StateDictType.FULL_STATE_DICT: _full_post_state_dict_hook,
+            StateDictType.LOCAL_STATE_DICT: _local_post_state_dict_hook,
+            StateDictType.SHARDED_STATE_DICT: _sharded_post_state_dict_hook,
+        }
+        processed_state_dict = _post_state_dict_hook_fn[fsdp_state._state_dict_type](
+            module, fsdp_state, state_dict, prefix
+        )
+
+    if fsdp_state._is_root:
+        logger.info("FSDP finished processing state_dict(), prefix=%s", prefix)
+        for key, tensor in sorted(processed_state_dict.items()):
+            if key.startswith(prefix) and isinstance(tensor, torch.Tensor):
+                local_shape = tensor.shape
+                if isinstance(tensor, ShardedTensor):
+                    local_shape = None
+                    shards = tensor.local_shards()
+                    if shards:
+                        local_shape = shards[0].tensor.shape
+                elif isinstance(tensor, DTensor):
+                    local_shape = tensor.to_local().shape
+                logger.info(
+                    "FQN=%s: type=%s, shape=%s, local_shape=%s, dtype=%s, device=%s",
+                    key,
+                    type(tensor),
+                    tensor.shape,
+                    local_shape,
+                    tensor.dtype,
+                    tensor.device,
+                )
+
+    return processed_state_dict
+
+
+@no_type_check
+@torch.no_grad()
+def _pre_state_dict_hook(
+    module: nn.Module,
+    *args,
+    **kwargs,
+) -> None:
+    """
+    This is called before the core state dict saving logic of ``module``.
+    ``fsdp_state._state_dict_type`` is used to decide what preprocessing will
+    be done.
+    """
+    fsdp_state = _get_module_fsdp_state_if_fully_sharded_module(module)
+    if fsdp_state.sharding_strategy == ShardingStrategy.NO_SHARD:
+        context = _replace_with_full_state_dict_type(fsdp_state)
+        warnings.warn(
+            "When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict will"
+            "be returned."
+        )
+    else:
+        _set_use_dtensor(fsdp_state)
+        context = contextlib.nullcontext()
+
+    with context:
+        _pre_state_dict_hook_fn = {
+            StateDictType.FULL_STATE_DICT: _full_pre_state_dict_hook,
+            StateDictType.LOCAL_STATE_DICT: _local_pre_state_dict_hook,
+            StateDictType.SHARDED_STATE_DICT: _sharded_pre_state_dict_hook,
+        }
+        _pre_state_dict_hook_fn[fsdp_state._state_dict_type](
+            fsdp_state,
+            module,
+            *args,
+            **kwargs,
+        )
+
+
+@no_type_check
+def _set_use_dtensor(fsdp_state: _FSDPState) -> None:
+    # If device_mesh is passed in when initializing FSDP, we automatically set the
+    # _use_dtensor flag to True for ShardedStateDictConfig().
+    if getattr(fsdp_state, "_device_mesh", None):
+        state_dict_type = fsdp_state._state_dict_type
+        if state_dict_type == StateDictType.LOCAL_STATE_DICT:
+            raise RuntimeError(
+                "Found state_dict_type LOCAL_STATE_DICT",
+                "DeviceMesh is not compatible with LOCAL_STATE_DICT.",
+                "Please set state_dict_type to SHARDED_STATE_DICT to get DTensor state_dict.",
+            )
+        else:
+            fsdp_state._state_dict_config._use_dtensor = True
+
+
+@no_type_check
+@torch.no_grad()
+def _pre_load_state_dict_hook(
+    module: nn.Module,
+    state_dict: Dict[str, Any],
+    prefix: str,
+    *args: Any,
+) -> None:
+    """
+    This is called before ``module._load_from_state_dict()``.
+    ``fsdp_state._state_dict_type`` is used to decide what preprocessing will
+    be done.
+    """
+    fsdp_state = _get_module_fsdp_state_if_fully_sharded_module(module)
+    if fsdp_state.sharding_strategy == ShardingStrategy.NO_SHARD:
+        context = _replace_with_full_state_dict_type(fsdp_state)
+        warnings.warn(
+            "When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict will"
+            "be returned."
+        )
+    else:
+        _set_use_dtensor(fsdp_state)
+        context = contextlib.nullcontext()
+
+    _lazy_init(fsdp_state, module)
+    if fsdp_state._is_root:
+        SimpleProfiler.reset()
+
+    with context:
+        _pre_load_state_dict_hook_fn = {
+            StateDictType.FULL_STATE_DICT: _full_pre_load_state_dict_hook,
+            StateDictType.LOCAL_STATE_DICT: _local_pre_load_state_dict_hook,
+            StateDictType.SHARDED_STATE_DICT: _sharded_pre_load_state_dict_hook,
+        }
+        # Code that is common for all state_dict impls
+        if fsdp_state._device_handle.is_available():
+            fsdp_state._device_handle.synchronize()
+        # Dispatch into state_dict specific implementation of pre-hook.
+        _pre_load_state_dict_hook_fn[fsdp_state._state_dict_type](
+            module, fsdp_state, state_dict, prefix
+        )
+
+
+@no_type_check
+@torch.no_grad()
+def _post_load_state_dict_hook(
+    module: nn.Module,
+    incompatible_keys: Tuple[List[str], List[str]],
+    *args: Any,
+) -> None:
+    fsdp_state = _get_module_fsdp_state_if_fully_sharded_module(module)
+    if fsdp_state.sharding_strategy == ShardingStrategy.NO_SHARD:
+        context = _replace_with_full_state_dict_type(fsdp_state)
+        warnings.warn(
+            "When using ``NO_SHARD`` for ``ShardingStrategy``, full_state_dict will"
+            "be returned."
+        )
+    else:
+        context = contextlib.nullcontext()
+
+    with context:
+        _post_load_state_dict_hook_fn = {
+            StateDictType.FULL_STATE_DICT: _full_post_load_state_dict_hook,
+            StateDictType.LOCAL_STATE_DICT: _local_post_load_state_dict_hook,
+            StateDictType.SHARDED_STATE_DICT: _sharded_post_load_state_dict_hook,
+        }
+        # Code that is common for all state_dict impls
+        # Dispatch into state_dict type specific implementation of post-hook for
+        # loading state_dict.
+        _post_load_state_dict_hook_fn[fsdp_state._state_dict_type](module, fsdp_state)
+
+    # When reporting incompatible keys, trim FSDP prefixes.
+    missing_keys = incompatible_keys[0]
+    unexpected_keys = incompatible_keys[1]
+    for i in range(len(missing_keys)):
+        missing_keys[i] = clean_tensor_name(missing_keys[i])
+
+    for i in range(len(unexpected_keys)):
+        unexpected_keys[i] = clean_tensor_name(unexpected_keys[i])
+
+    if fsdp_state._is_root:
+        SimpleProfiler.dump_and_reset("FSDP model load_state_dict profiling: ")
+
+
+def _register_all_state_dict_hooks(state: _FSDPState):
+    """
+    Registers pre-save, post-save, pre-load, and post-load state dict hooks.
+    """
+    for hook_registration_fn_str, hook, hook_registration_fn_kwargs in (
+        ("register_state_dict_pre_hook", _pre_state_dict_hook, {}),
+        ("_register_state_dict_hook", _post_state_dict_hook, {}),
+        (
+            "_register_load_state_dict_pre_hook",
+            _pre_load_state_dict_hook,
+            {"with_module": True},
+        ),
+        ("register_load_state_dict_post_hook", _post_load_state_dict_hook, {}),
+    ):
+        _register_state_dict_hooks_base(
+            state, hook_registration_fn_str, hook, hook_registration_fn_kwargs
+        )
+
+
+@no_type_check
+def _register_state_dict_hooks_base(
+    state: _FSDPState,
+    hook_registration_fn_name: str,
+    hook: Callable,
+    hook_registration_fn_kwargs: Dict[str, Any],
+) -> None:
+    """Registers ``hook`` using ``hook_registration_fn``."""
+    if not _is_composable(state):
+        getattr(state, hook_registration_fn_name)(hook, **hook_registration_fn_kwargs)
+    else:
+        handle = state._handle
+        if handle:
+            getattr(handle._fully_sharded_module, hook_registration_fn_name)(
+                hook, **hook_registration_fn_kwargs
+            )
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/_trace_utils.py b/MLPY/Lib/site-packages/torch/distributed/fsdp/_trace_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d1b1a4ee7b057fc0f00877448e65a47074fa1be
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/fsdp/_trace_utils.py
@@ -0,0 +1,237 @@
+import functools
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from typing import Any, Callable, Dict, List, NamedTuple, Optional, Set, Tuple
+
+import torch
+import torch.nn as nn
+
+
+@dataclass
+class TracingConfig:
+    """
+    This represents a symbolic tracing configuration.
+
+    Args:
+        tracer (torch.fx.Tracer): An instance of :class:`torch.fx.Tracer` to
+            use for symbolic tracing. The default value is the native
+            :class:`torch.fx.Tracer` constructed with default arguments.
+            However, the user may want to pass a different value such as the
+            ``HFTracer`` for models in the HuggingFace Transformers_ library.
+            .. _Transformers: https://huggingface.co/docs/transformers/index
+        concrete_args (Optional[Dict[str, Any]]): Concrete arguments that
+            should not be treated as ``torch.fx.Proxy`` when tracing the
+            module ``forward()``. Passing ``concrete_args`` allows partially
+            specializing the forward, e.g. to remove control flow or data
+            structures. This ``concrete_args`` here is the same argument used
+            in :meth:`~torch.fx.Tracer.trace`.
+    """
+
+    tracer: torch.fx.Tracer = field(default_factory=torch.fx.Tracer)
+    concrete_args: Optional[Dict[str, Any]] = None
+
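+# Construction sketches; ``mask`` is a hypothetical keyword argument of the
+# traced module's ``forward()``:
+#
+#     >>> config = TracingConfig()  # default torch.fx.Tracer, no concrete args
+#     >>> config = TracingConfig(concrete_args={"mask": None})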
+
+class _ParamUsageInfo(NamedTuple):
+    """
+    This is used for ``_ExecutionInfo.module_to_param_usage_infos`` to record
+    execution information. The ``dict`` maps modules to a list of these
+    ``_ParamUsageInfo`` instances, where each instance represents a group of
+    parameters used together.
+
+    Specifically, for each module key in the ``dict``, each instance of this
+    class represents either:
+    (1) the module and some sublist of its ``named_parameters()`` used
+    together in execution (see ``_patched_create_proxy()``), or
+    (2) a submodule and all of ``submodule.named_parameters()`` (see
+    ``_patched_call_module()``).
+
+    Type (1) corresponds to directly using parameters in ops without calling
+    ``forward()``, and type (2) corresponds to calling ``forward()``. The
+    mapped-to lists in the ``dict`` follow the execution order.
+    """
+
+    module: nn.Module
+    named_params: List[Tuple[str, nn.Parameter]]
+
+
+class _ExecutionInfo:
+    """
+    This represents the execution order information from the forward pass.
+
+    Attributes:
+        curr_module (nn.Module): Current module being traced.
+        module_forward_order (List[nn.Module]): The modules in (pre-)forward
+            order, i.e. the order in which their ``forward()`` methods are
+            called. Each call to a module's ``forward()`` corresponds to one
+            element in the list.
+        module_to_param_usage_infos (Dict[nn.Module, List[_ParamUsageInfo]]):
+            Maps a module to a list of module execution infos. See
+            :class:`_ParamUsageInfo` for details.
+        param_forward_order (List[nn.Parameter]): The parameters in forward
+            execution order, where only a parameter's first participation is
+            included.
+        visited_params (Set[nn.Parameter]): The parameters visited so far
+            during the trace. This is only used during tracing for fast
+            membership check. Invariant: The parameters in
+            ``param_forward_order`` are exactly those in ``visited_params``.
+    """
+
+    def __init__(self, root_module: nn.Module) -> None:
+        self.curr_module: nn.Module = root_module
+        self.module_forward_order: List[nn.Module] = [root_module]
+        self.module_to_param_usage_infos: Dict[nn.Module, List[_ParamUsageInfo]] = {
+            root_module: []
+        }
+        self.param_forward_order: List[nn.Parameter] = []
+        self.visited_params: Set[nn.Parameter] = set()
+
+
+class _ExecOrderTracer:
+    def __init__(self) -> None:
+        self.exec_info: Optional[_ExecutionInfo] = None
+
+    @contextmanager
+    def patch_tracer(self, tracer: torch.fx.Tracer, root_module: nn.Module):
+        self.exec_info = _ExecutionInfo(root_module)
+        orig_call_module = tracer.call_module
+        orig_create_proxy = tracer.create_proxy
+        tracer.call_module = functools.partial(
+            self._patched_call_module, orig_call_module, self.exec_info
+        )
+        fqn_to_param = dict(root_module.named_parameters())
+        tracer.create_proxy = functools.partial(
+            self._patched_create_proxy,
+            orig_create_proxy,
+            self.exec_info,
+            fqn_to_param,
+        )
+        try:
+            yield
+        finally:
+            tracer.call_module = orig_call_module
+            tracer.create_proxy = orig_create_proxy
+
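+    # A usage sketch for ``patch_tracer``, assuming ``model`` is the root module
+    # being traced:
+    #
+    #     >>> tracer = torch.fx.Tracer()
+    #     >>> exec_order_tracer = _ExecOrderTracer()
+    #     >>> with exec_order_tracer.patch_tracer(tracer, model):
+    #     ...     graph = tracer.trace(model)
+    #     >>> exec_order_tracer.exec_info.module_forward_order  # modules in order
+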
+    def _patched_call_module(
+        self,
+        call_module: Callable,
+        exec_info: _ExecutionInfo,
+        # Below are the expected arguments to `call_module()`
+        module: nn.Module,
+        forward: Callable,
+        args: Tuple[Any, ...],
+        kwargs: Dict[str, Any],
+    ) -> Any:
+        """
+        Overrides ``call_module`` to save execution information to
+        ``exec_info``. Note that ``call_module`` is called during symbolic
+        tracing for each non-root module.
+
+        Args:
+            call_module (Callable): Original ``call_module`` to override.
+            exec_info (_ExecutionInfo): Used to record execution information.
+            module (nn.Module): Module corresponding to this ``call_module``.
+            forward (Callable): ``forward()`` method of ``module`` to be called
+                for this ``call_module``.
+            args (Tuple[Any, ...]): Positional arguments for ``forward``.
+            kwargs (Dict[str, Any]): Keyword arguments for ``forward``.
+
+        Returns:
+            Same return value as ``call_module``.
+        """
+        exec_info.module_forward_order.append(module)
+        named_params = list(module.named_parameters())
+        curr_module = exec_info.curr_module
+        if named_params:
+            assert (
+                curr_module in exec_info.module_to_param_usage_infos
+            ), "The current module should have already been processed by a patched `call_module`"
+            exec_info.module_to_param_usage_infos[exec_info.curr_module].append(
+                _ParamUsageInfo(module, named_params)
+            )
+        prev_curr_module = curr_module
+        exec_info.curr_module = module
+        exec_info.module_to_param_usage_infos[module] = []
+        output = call_module(module, forward, args, kwargs)
+        exec_info.curr_module = prev_curr_module
+        return output
+
+    def _patched_create_proxy(
+        self,
+        create_proxy: Callable,
+        exec_info: _ExecutionInfo,
+        fqn_to_param: Dict[str, nn.Parameter],
+        # Below are the expected arguments to `create_proxy()`
+        kind: str,
+        target: torch.fx.node.Target,
+        args: Tuple[Any, ...],
+        kwargs: Dict[str, Any],
+        name: Optional[str] = None,
+        type_expr: Optional[Any] = None,
+        proxy_factory_fn: Optional[Callable[[torch.fx.Node], torch.fx.Proxy]] = None,
+    ) -> torch.fx.Proxy:
+        """
+        Overrides ``create_proxy`` to save execution information to
+        ``exec_info``. Note that ``create_proxy`` is called during symbolic
+        tracing for each leaf function/method/module.
+
+        Args:
+            create_proxy (Callable): Original ``create_proxy`` to override.
+            exec_info (_ExecutionInfo): Used to record execution information.
+            fqn_to_param (Dict[str, nn.Parameter]): ``dict`` version of the
+                root module's ``named_parameters()`` with FQN as key and
+                parameter as value.
+            kind (str): Kind of the target method ('call_function',
+                'call_method', 'get_attr', 'call_module', 'placeholder', or
+                'output'). See :class:`torch.fx.Graph` for details. This is
+                passed to ``create_proxy``.
+            target (torch.fx.node.Target): Contains the string name of the
+                function/method/module. This is passed to ``create_proxy``.
+            args (Tuple[Any, ...]): Positional arguments for the function/
+                method/module. This is passed to ``create_proxy``.
+            kwargs (Dict[str, Any]): Keyword arguments for the function/method/
+                module. This is passed to ``create_proxy``
+            name (Optional[str]): An optional string name for the ``Node``
+                created in ``create_proxy``. This is passed to
+                ``create_proxy``.
+            type_expr (Optional[Any]): An optional type annotation representing
+                the Python type that the output of the node has. This is passed
+                to ``create_proxy``.
+            proxy_factory_fn (Callable[[torch.fx.Node], torch.fx.Proxy]):
+                An alternative proxy constructor used in ``create_proxy``. This
+                is passed to ``create_proxy``.
+
+        Returns:
+            torch.fx.Proxy: Created ``Node`` wrapped in a ``Proxy`` object.
+        """
+        proxy = create_proxy(
+            kind, target, args, kwargs, name, type_expr, proxy_factory_fn
+        )
+        curr_module = exec_info.curr_module
+        if kind in ("call_function", "call_method"):
+            if args is not None:
+                named_params: List[Tuple[str, nn.Parameter]] = []
+                for arg in args:
+                    if (
+                        isinstance(arg, torch.fx.Proxy)
+                        and arg.node.target in fqn_to_param
+                    ):
+                        param = fqn_to_param[arg.node.target]
+                        named_params.append((arg.node.target, param))
+                        if param not in exec_info.visited_params:
+                            exec_info.visited_params.add(param)
+                            exec_info.param_forward_order.append(param)
+                if named_params:
+                    exec_info.module_to_param_usage_infos[curr_module].append(
+                        _ParamUsageInfo(curr_module, named_params)
+                    )
+        elif kind == "call_module":
+            named_params = list(curr_module.named_parameters())
+            if named_params:
+                exec_info.module_to_param_usage_infos[curr_module].append(
+                    _ParamUsageInfo(curr_module, named_params)
+                )
+            for _, param in named_params:
+                if param not in exec_info.visited_params:
+                    exec_info.visited_params.add(param)
+                    exec_info.param_forward_order.append(param)
+        return proxy
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/_traversal_utils.py b/MLPY/Lib/site-packages/torch/distributed/fsdp/_traversal_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b5db4fda13d369acf5522aef9a594abeb3db7ff
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/fsdp/_traversal_utils.py
@@ -0,0 +1,113 @@
+"""
+NOTE: This file must be imported like
+``import torch.distributed.fsdp._traversal_utils`` and not like
+``from torch.distributed.fsdp._traversal_utils import ...`` to avoid circular
+imports. For brevity, we may import the file as ``traversal_utils``.
+"""
+
+import collections
+from typing import Deque, List, Set, Tuple
+
+import torch.nn as nn
+from torch.distributed._composable.contract import _get_registry
+from torch.distributed.fsdp._common_utils import _FSDPState, _get_module_fsdp_state
+
+
+"""
+[Note: FSDP State Traversal]
+For the wrapper code path, ``_FSDPState`` is the ``FullyShardedDataParallel``
+module wrapping a fully sharded module, and for the non-wrapper code path,
+``_FSDPState`` is an object that gets embedded on a fully sharded module.
+See [Note: Fully Sharded Module] for the definition.
+
+There are three common traversal idioms: Given a root module,
+- ``_get_fsdp_states()`` returns all ``_FSDPState`` s in the tree.
+- ``_get_fsdp_root_states()`` returns all local root ``_FSDPState`` s in the
+tree (i.e. those with ``_is_root == True``).
+- ``_get_fsdp_handles()`` returns all ``FlatParamHandle`` s in the tree.
+
+All of these methods must take in the root module (i.e. an ``nn.Module``) and
+not a general ``_FSDPState`` because ``_FSDPState`` does not support a graph
+traversal, whereas ``nn.Module`` has ``nn.Module.modules()`` for traversal.
+"""
+
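+# A usage sketch of the idioms above, assuming ``model`` is an FSDP-wrapped (or
+# ``fully_shard``-applied) root module:
+#
+#     >>> import torch.distributed.fsdp._traversal_utils as traversal_utils
+#     >>> states = traversal_utils._get_fsdp_states(model)    # all ``_FSDPState`` s
+#     >>> handles = traversal_utils._get_fsdp_handles(model)  # all ``FlatParamHandle`` s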
+
+def _composable(module: nn.Module) -> bool:
+    """
+    Returns whether ``module`` can compose with ``fully_shard``.
+    """
+    # TODO: Add any other composable APIs that are mutually exclusive.
+    registry = _get_registry(module)
+    if registry is None:
+        return True
+    return "replicate" not in registry
+
+
+# TODO (awgu): We may be able to remove this function if we retire the
+# `use_orig_params=False` code path, since so far we only need the module for
+# `FlatParameter` registration, which is not needed for `use_orig_params=True`.
+def _get_fsdp_states_with_modules(
+    module: nn.Module,
+) -> Tuple[List[_FSDPState], List[nn.Module]]:
+    """
+    Returns a tuple containing:
+    1. A list of the ``_FSDPState`` instances in the module tree rooted at
+    ``module`` without any duplicates and following the ``module.modules()``
+    traversal order (which is assumed to be depth-first).
+    2. A corresponding list of the modules owning the states in the first list.
+
+    For the wrapper code path, both returned lists are the same, each
+    containing all ``FullyShardedDataParallel`` instances. For the composable
+    code path, this returns a list of all composable state instances and a list
+    of the corresponding fully sharded modules. See [Note: Fully Sharded
+    Module].
+
+    NOTE: The traversal does not proceed into any module annotated by an
+    incompatible API (e.g. ``replicate``).
+    """
+    fsdp_states: List[_FSDPState] = []
+    fsdp_modules: List[nn.Module] = []
+    # Track the visited FSDP states since multiple modules may share the same
+    # one and we want to return a de-duplicated list
+    visited_fsdp_states: Set[_FSDPState] = set()
+    # Track the visited modules in case of shared modules, which implies the
+    # module graph is no longer a tree
+    visited_modules: Set[nn.Module] = set()
+
+    # Perform depth-first search from `module` to ensure that we do not
+    # traverse into an incompatible API's subtree (use DFS instead of BFS to
+    # match `.modules()` order)
+    deque: Deque[nn.Module] = collections.deque([module])
+    while deque:
+        submodule = deque.popleft()
+        visited_modules.add(submodule)
+        if not _composable(submodule):
+            continue
+        for child_module in reversed(list(submodule.children())):
+            if child_module not in visited_modules:
+                deque.appendleft(child_module)
+        optional_state = _get_module_fsdp_state(submodule)
+        if optional_state is not None and optional_state not in visited_fsdp_states:
+            visited_fsdp_states.add(optional_state)
+            fsdp_states.append(optional_state)
+            fsdp_modules.append(submodule)
+    return fsdp_states, fsdp_modules
+
+
+def _get_fsdp_states(module: nn.Module) -> List[_FSDPState]:
+    """See :func:`_get_fsdp_states_with_modules`."""
+    fsdp_states, _ = _get_fsdp_states_with_modules(module)
+    return fsdp_states
+
+
+def _get_fsdp_handles(module: nn.Module) -> List:
+    """
+    Returns all ``FlatParamHandle`` s in the module tree rooted at ``module``
+    following the rules in :func:`_get_fsdp_states`.
+    """
+    handles = [
+        fsdp_state._handle
+        for fsdp_state in _get_fsdp_states(module)
+        if fsdp_state._handle is not None
+    ]
+    return handles
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/_unshard_param_utils.py b/MLPY/Lib/site-packages/torch/distributed/fsdp/_unshard_param_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..053a73b3c254c33120d3b9dc8fc12dbaf272752d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/fsdp/_unshard_param_utils.py
@@ -0,0 +1,357 @@
+import contextlib
+import warnings
+from typing import cast, Generator
+
+import torch
+import torch.distributed.fsdp._traversal_utils as traversal_utils
+import torch.nn as nn
+from torch.distributed.fsdp._common_utils import (
+    _FSDPState,
+    _has_fsdp_params,
+    _module_handle,
+    HandleTrainingState,
+    TrainingState,
+)
+from torch.distributed.fsdp._runtime_utils import (
+    _get_fsdp_root_states_with_modules,
+    _lazy_init,
+    _reset_flat_param_grad_info_if_needed,
+    _reshard,
+    _reshard_grads,
+    _unshard,
+    _unshard_grads,
+)
+from torch.distributed.utils import _p_assert
+
+from ._flat_param import FlatParamHandle
+
+FLAT_PARAM = "_flat_param"
+
+
+@torch.no_grad()
+def _writeback_to_local_shard(
+    handle: FlatParamHandle,
+    writeback_grad: bool,
+):
+    """
+    For the handle, writes back this rank's shard of the unsharded
+    flattened parameter to the sharded flattened parameter. If
+    ``writeback_grad=True``, then writes back to the sharded gradient as
+    well.
+
+    Precondition: The handle's ``FlatParameter`` 's data points to the
+    padded unsharded flattened parameter.
+    """
+
+    def _get_shard(flat_param_or_grad: torch.Tensor) -> torch.Tensor:
+        if handle.uses_sharded_strategy:
+            # For sharded strategies, get the *unpadded* shard instead of
+            # the *padded* shard to persist user changes to the padding
+            # (though FSDP does not explicitly support this)
+            shard, _ = FlatParamHandle._get_unpadded_shard(
+                flat_param_or_grad,
+                handle.rank,
+                handle.world_size,
+            )
+            return shard
+        # For `NO_SHARD`, the `flat_param` or its gradient may be modified,
+        # so we write it back directly
+        return flat_param_or_grad
+
+    param_shard = _get_shard(handle.flat_param)
+    handle.flat_param._local_shard[: param_shard.numel()].copy_(param_shard)  # type: ignore[attr-defined]
+    if writeback_grad:
+        existing_grad = handle.sharded_grad
+        if existing_grad is not None:
+            assert handle.flat_param.grad is not None
+            grad_shard = _get_shard(handle.flat_param.grad)
+            existing_grad[: grad_shard.numel()].copy_(grad_shard)
+
+
+def _deregister_flat_param(state: _FSDPState, module: nn.Module) -> None:
+    """
+    De-registers the flattened parameter from the wrapped module, hiding it
+    from ``nn.Module`` methods.
+
+    We do not use ``del`` because we want ``FLAT_PARAM`` to always be an
+    attribute but dynamically change whether it is visible to ``nn.Module``
+    methods.
+    """
+    if _has_fsdp_params(state, module):
+        # TODO: figure out the case for the composable APIs.
+        cast(nn.Module, module.module)._parameters.pop(FLAT_PARAM, None)
+
+
+def _register_flat_param(state: _FSDPState, module: nn.Module) -> None:
+    """
+    Registers the flattened parameter to the wrapped module, making it
+    visible to ``nn.Module`` methods.
+
+    We do not use :meth:`nn.Module.register_parameter` because we want
+    ``FLAT_PARAM`` to always be an attribute but dynamically change whether
+    it is visible to ``nn.Module`` methods.
+    """
+    handle = _module_handle(state, module)
+    if _has_fsdp_params(state, module):
+        # TODO: figure out the case for the composable APIs.
+        cast(nn.Module, module.module)._parameters[FLAT_PARAM] = handle.flat_param
+
+
+@contextlib.contextmanager
+def _unflatten_as_params(state: _FSDPState, module: nn.Module) -> Generator:
+    """
+    Assumes that the flattened parameter is unsharded. When in the context,
+    de-registers the flattened parameter and unflattens the original
+    parameters as ``nn.Parameter`` views into the flattened parameter.
+    After the context, re-registers the flattened parameter and restores
+    the original parameters as ``Tensor`` views into the flattened
+    parameter.
+    """
+    handle = _module_handle(state, module)
+    if not handle:
+        yield
+    else:
+        _deregister_flat_param(state, module)
+        try:
+            with handle.unflatten_as_params():
+                yield
+        finally:
+            if not handle._use_orig_params:
+                _register_flat_param(state, module)
+
+
+def _validate_unshard_params_args(
+    state: _FSDPState,
+    writeback: bool,
+    rank0_only: bool,
+    offload_to_cpu: bool,
+    with_grads: bool,
+) -> None:
+    if with_grads and (offload_to_cpu or not state._use_orig_params):
+        raise NotImplementedError(
+            f"with_grads={with_grads}, "
+            f"use_orig_params={state._use_orig_params}, "
+            f"offload_to_cpu={offload_to_cpu} "
+            f"is not supported yet"
+        )
+    if offload_to_cpu and state._handle and (not state._handle.uses_sharded_strategy):
+        raise NotImplementedError(
+            "offload_to_cpu=True and NO_SHARD is not supported yet"
+        )
+    if writeback and rank0_only:
+        # TODO: Rank 0 can broadcast the `FlatParameter` to allow all ranks to
+        # persist the changes.
+        raise NotImplementedError(
+            "writeback=True and rank0_only=True is not supported yet"
+        )
+    if offload_to_cpu and not rank0_only:
+        warnings.warn(
+            "offload_to_cpu=True and rank0_only=False may result in the"
+            "unsharded parameters being redundantly copied to CPU memory for "
+            "GPUs sharing the same CPU memory, which risks CPU OOM. We "
+            "recommend using offload_to_cpu=True with rank0_only=True."
+        )
+
+
+@contextlib.contextmanager
+def _unshard_fsdp_state_params(
+    module: nn.Module,
+    state: _FSDPState,
+    writeback: bool,
+    rank0_only: bool,
+    offload_to_cpu: bool,
+    with_grads: bool,
+):
+    """
+    This unshards the parameters for a single FSDP state ``state`` that
+    corresponds to ``module``.
+    """
+    _validate_unshard_params_args(
+        state, writeback, rank0_only, offload_to_cpu, with_grads
+    )
+    state._device_handle.synchronize()
+    # If handles are shared by other module(s), the handle may be already unsharded.
+    maybe_handle = _module_handle(state, module)
+    handle = None
+    if (
+        maybe_handle
+        and maybe_handle._training_state != HandleTrainingState.SUMMON_FULL_PARAMS
+    ):
+        handle = maybe_handle
+    if not handle:
+        yield
+        return
+
+    assert (
+        handle._training_state == HandleTrainingState.IDLE
+    ), f"Expects the handle training to be IDLE but got {handle._training_state}"
+
+    handle._training_state = HandleTrainingState.SUMMON_FULL_PARAMS
+
+    _reset_flat_param_grad_info_if_needed(handle)
+    free_unsharded_flat_param = handle.needs_unshard()
+    # No need to call `wait_stream()` since we unshard in the computation
+    # stream directly
+    computation_stream = state._device_handle.current_stream()
+    _unshard(state, handle, computation_stream, computation_stream)
+    if with_grads:
+        _unshard_grads(handle)
+
+    if rank0_only and state.rank != 0:
+        # Free the unsharded flattened parameter early
+        _reshard(state, handle, free_unsharded_flat_param)
+        if with_grads:
+            _reshard_grads(handle)
+        try:
+            yield
+        finally:
+            handle._training_state = HandleTrainingState.IDLE
+    else:
+        # Unflatten the unsharded flattened parameters
+        with contextlib.ExitStack() as stack:
+            # Invariant: rank == 0 or !rank0_only
+            if offload_to_cpu and handle.uses_sharded_strategy:
+                stack.enter_context(handle.to_cpu())
+                # NOTE: Since PyTorch enforces that a parameter and its
+                # gradients need to match metadata (e.g. device), we must
+                # move gradients to CPU *after* we move parameters.
+            # NOTE: This assumes 1 `FlatParameter`
+            if not state._use_orig_params:
+                stack.enter_context(_unflatten_as_params(state, module))
+            try:
+                yield
+            finally:
+                stack.close()
+                if writeback:
+                    _writeback_to_local_shard(handle, with_grads)
+                _reshard(state, handle, free_unsharded_flat_param)
+                if with_grads:
+                    _reshard_grads(handle)
+                handle._training_state = HandleTrainingState.IDLE
+
+
+@contextlib.contextmanager
+def _unshard_params_recurse(
+    module: nn.Module,
+    state: _FSDPState,
+    recurse: bool,
+    writeback: bool,
+    rank0_only: bool,
+    offload_to_cpu: bool,
+    with_grads: bool,
+):
+    """
+    This is a helper for :func:`_unshard_params` that recursively calls
+    :func:`_unshard_fsdp_state_params` on FSDP states if ``recurse=True``.
+    NOTE: This runs lazy initialization.
+    """
+    _validate_unshard_params_args(
+        state, writeback, rank0_only, offload_to_cpu, with_grads
+    )
+    if recurse:
+        with contextlib.ExitStack() as stack:
+            # TODO (awgu): The traversal function does not traverse through
+            # incompatible composable APIs. Verify if this is the desired
+            # behavior for this function.
+            for state, fsdp_module in zip(
+                *traversal_utils._get_fsdp_states_with_modules(module)
+            ):
+                stack.enter_context(
+                    _unshard_params_recurse(
+                        module=fsdp_module,
+                        state=state,
+                        recurse=False,
+                        writeback=writeback,
+                        rank0_only=rank0_only,
+                        offload_to_cpu=offload_to_cpu,
+                        with_grads=with_grads,
+                    )
+                )
+            yield
+        return
+    _lazy_init(state, module)
+    if state.training_state == TrainingState.FORWARD_BACKWARD:
+        raise AssertionError(
+            "Cannot manually unshard parameters during forward/backward"
+        )
+    elif state.training_state == TrainingState.SUMMON_FULL_PARAMS:
+        raise AssertionError(
+            "Cannot manually unshard parameters when already unsharding parameters"
+        )
+    with _unshard_fsdp_state_params(
+        module=module,
+        state=state,
+        writeback=writeback,
+        rank0_only=rank0_only,
+        offload_to_cpu=offload_to_cpu,
+        with_grads=with_grads,
+    ):
+        try:
+            state.training_state = TrainingState.SUMMON_FULL_PARAMS
+            yield
+        finally:
+            state.training_state = TrainingState.IDLE
+
+
+@contextlib.contextmanager
+def _unshard_params(
+    module: nn.Module,
+    recurse: bool,
+    writeback: bool,
+    rank0_only: bool,
+    offload_to_cpu: bool,
+    with_grads: bool,
+):
+    """
+    This unshards FSDP-managed parameters for all modules with FSDP applied in
+    the module tree rooted at ``module``.
+    """
+    root_fsdp_states, root_fsdp_modules = _get_fsdp_root_states_with_modules(module)
+    with contextlib.ExitStack() as stack:
+        for root_fsdp_state, root_fsdp_module in zip(
+            root_fsdp_states, root_fsdp_modules
+        ):
+            stack.enter_context(
+                _unshard_params_recurse(
+                    module=root_fsdp_module,
+                    state=root_fsdp_state,
+                    recurse=recurse,
+                    writeback=writeback,
+                    rank0_only=rank0_only,
+                    offload_to_cpu=offload_to_cpu,
+                    with_grads=with_grads,
+                )
+            )
+        yield
+    return
+
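+# As an informal illustration, the public
+# `FullyShardedDataParallel.summon_full_params` context manager is built on top
+# of `_unshard_params` above; typical caller-side usage looks roughly like
+#
+#     with FullyShardedDataParallel.summon_full_params(model, writeback=False):
+#         # Parameters are unsharded and exposed as the original, unflattened
+#         # parameters inside this context.
+#         total_numel = sum(p.numel() for p in model.parameters())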
+
+def _deregister_orig_params(state: _FSDPState, module: nn.Module) -> None:
+    """
+    Deregisters the original parameters; registers the ``FlatParameter``.
+    """
+    handle = _module_handle(state, module)
+    if not handle:
+        return
+    _p_assert(
+        handle._use_orig_params,
+        f"Inconsistent `_use_orig_params` -- FSDP: {state._use_orig_params} "
+        f"handle: {handle._use_orig_params}",
+    )
+    handle._deregister_orig_params()
+    _register_flat_param(state, module)
+
+
+def _register_orig_params(state: _FSDPState, module: nn.Module) -> None:
+    """
+    Deregisters the ``FlatParameter``; registers the original parameters.
+    """
+    handle = _module_handle(state, module)
+    if not handle:
+        return
+    _deregister_flat_param(state, module)
+    if handle.is_sharded(handle.flat_param):
+        handle._use_sharded_views()
+        handle._use_sharded_grad_views()
+    else:
+        handle._use_unsharded_views(as_params=True)
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/_wrap_utils.py b/MLPY/Lib/site-packages/torch/distributed/fsdp/_wrap_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b5d3452edefabe1e5dd4ec1cd31f27a0f436ffa
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/fsdp/_wrap_utils.py
@@ -0,0 +1,262 @@
+import collections
+import functools
+import inspect
+import warnings
+from functools import partial
+from typing import Any, Callable, Dict, List, Set, Tuple, Type, Union
+
+import torch.nn as nn
+from torch.distributed.fsdp._common_utils import (
+    _get_module_fsdp_state,
+    _override_module_mixed_precision,
+)
+
+from torch.distributed.fsdp.wrap import (
+    _construct_wrap_fn,
+    _or_policy,
+    _Policy,
+    _post_order_apply,
+    _recursive_wrap,
+    _run_mixed_precision_override_policy,
+    _wrap_module_cls_individually,
+)
+
+
+def _auto_wrap(
+    root_module: nn.Module,
+    policy: Union[Callable, _Policy],
+    ignored_modules: Set[nn.Module],
+    ignored_params: Set[nn.Parameter],
+    root_kwargs: Dict[str, Any],
+    fsdp_fn: Callable,  # e.g. `FullyShardedDataParallel` or `fully_shard`
+):
+    """
+    Auto wraps modules in ``root_module`` 's tree according to ``policy``
+    following a post-order traversal.
+
+    Precondition: ``root_kwargs`` should contain all arguments except
+    ``module``. This function accepts the kwargs dict directly since it gets
+    forwarded into the post-order traversal function.
+    """
+    mixed_precision = root_kwargs["mixed_precision"]
+    is_wrapper = inspect.isclass(fsdp_fn)
+    # TODO: We may relax this no-nested-wrapping constraint to support manual
+    # wrapping followed by auto wrapping.
+    _check_nested_wrapping(root_module)
+
+    if isinstance(policy, _Policy):
+        root_kwargs["auto_wrap_policy" if is_wrapper else "policy"] = None
+        target_module_to_kwargs = policy._run_policy(
+            root_module, ignored_modules, root_kwargs
+        )
+        if mixed_precision is not None:
+            target_module_to_kwargs = _run_mixed_precision_override_policy(
+                root_module,
+                mixed_precision._module_classes_to_ignore,
+                ignored_modules,
+                root_kwargs,
+                target_module_to_kwargs,
+            )
+            overridden_module_classes = _override_module_mixed_precision(
+                root_module, mixed_precision._module_classes_to_ignore
+            )
+            _warn_on_overridden_mixed_precision(overridden_module_classes)
+        use_orig_params = root_kwargs.get("use_orig_params", False)
+        _validate_frozen_params(
+            root_module,
+            set(target_module_to_kwargs.keys()),
+            ignored_params,
+            use_orig_params,
+        )
+        wrap_fn = _construct_wrap_fn(root_module, target_module_to_kwargs, fsdp_fn)
+        _post_order_apply(root_module, wrap_fn)
+        return
+
+    recursive_wrap_kwargs = {
+        "module": root_module,
+        "auto_wrap_policy": policy,
+        "wrapper_cls": fsdp_fn,
+        "ignored_modules": ignored_modules,
+        "ignored_params": ignored_params,
+        "only_wrap_children": True,
+    }
+    if mixed_precision is not None:
+        # Wrap modules of the ignored types separately and register forward
+        # hooks to cast to fp32 and back to the original dtype, respectively
+        overridden_module_classes = _override_module_mixed_precision(
+            root_module, mixed_precision._module_classes_to_ignore
+        )
+        policy = functools.partial(
+            _or_policy,
+            policies=[
+                policy,
+                partial(
+                    _wrap_module_cls_individually,
+                    module_classes=mixed_precision._module_classes_to_ignore,
+                ),
+            ],
+        )
+        recursive_wrap_kwargs["auto_wrap_policy"] = policy
+        _warn_on_overridden_mixed_precision(overridden_module_classes)
+    _recursive_wrap(**recursive_wrap_kwargs, **root_kwargs)  # type: ignore[arg-type]
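+
+# As an informal illustration of the caller side, `_auto_wrap` is invoked from
+# the FSDP constructor (or `fully_shard`) when a policy is given, e.g. roughly
+#
+#     from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+#     from torch.distributed.fsdp.wrap import ModuleWrapPolicy
+#     FSDP(model, auto_wrap_policy=ModuleWrapPolicy({MyBlock}))
+#
+# where `MyBlock` stands in for the submodule class whose instances should each
+# become their own FSDP unit.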
+
+
+def _check_nested_wrapping(root_module: nn.Module):
+    for module_name, module in root_module.named_modules():
+        if _get_module_fsdp_state(module) is not None:
+            raise ValueError(
+                "FSDP auto wrapping requires modules to not already have "
+                f"FSDP applied but found {module_name} in\n{root_module}"
+            )
+
+
+def _warn_on_overridden_mixed_precision(
+    overridden_module_classes: Set[Type[nn.Module]],
+):
+    if len(overridden_module_classes) == 0:
+        return
+    warnings.warn(
+        "Both mixed precision and an auto_wrap_policy were specified to FSDP, "
+        f"where the wrapped module has submodules of type:\n{overridden_module_classes}\n"
+        "These modules will be wrapped as separate FSDP instacnes with mixed "
+        "precision disabled."
+    )
+
+
+def _validate_frozen_params(
+    root_module: nn.Module,
+    modules_to_wrap: Set[nn.Module],
+    ignored_params: Set[nn.Parameter],
+    use_orig_params: bool,
+):
+    """
+    This checks that, given ``modules_to_wrap``, each module would manage
+    parameters that are uniformly frozen or non-frozen. This uniformity
+    requirement is strict for ``use_orig_params=False`` (hard error) and highly
+    recommended for ``use_orig_params=True`` (user warning).
+    """
+    post_order_named_modules = _get_post_order_named_modules(root_module)
+    visited_modules: Set[nn.Module] = set()
+    for module_name, module in post_order_named_modules:
+        if module in modules_to_wrap:
+            param_to_fqn = _get_managed_param_to_fqn(
+                module, ignored_params, visited_modules, module_name
+            )
+            frozen_param_fqns: List[str] = []
+            frozen_param_numel = 0
+            nonfrozen_param_fqns: List[str] = []
+            nonfrozen_param_numel = 0
+            for param, fqn in param_to_fqn.items():
+                if param.requires_grad:
+                    nonfrozen_param_fqns.append(fqn)
+                    nonfrozen_param_numel += param.numel()
+                else:
+                    frozen_param_fqns.append(fqn)
+                    frozen_param_numel += param.numel()
+            if len(frozen_param_fqns) > 0 and len(nonfrozen_param_fqns) > 0:
+                msg = f"{module_name} has both parameters with requires_grad=True and False."
+                if use_orig_params:
+                    total_param_numel = frozen_param_numel + nonfrozen_param_numel
+                    msg += (
+                        " We do not recommend wrapping such modules since "
+                        "the gradient memory usage will be higher than expected "
+                        f"({total_param_numel} numel instead of {nonfrozen_param_numel} numel "
+                        "before sharding via reduce-scatter). "
+                    )
+                else:
+                    msg += " FSDP does not support wrapping such modules when use_orig_params=False. "
+                msg += "If possible, wrap the frozen parameters with FSDP separately.\n"
+                msg += (
+                    f"The following parameters have requires_grad=True:\n{nonfrozen_param_fqns}\n"
+                    f"The following parameters have requires_grad=False:\n{frozen_param_fqns}"
+                )
+                if use_orig_params:
+                    warnings.warn(msg)
+                else:
+                    raise ValueError(msg)
+
+
+def _get_post_order_named_modules(
+    root_module: nn.Module,
+) -> List[Tuple[str, nn.Module]]:
+    """
+    This returns the named modules following a post-order traversal, which is a
+    valid reverse topological sort. We achieve this using the reverse of a
+    stack-based DFS order instead of reversing ``root_module.named_modules()``
+    since the former gives the modules in registration order at each level in
+    the module tree (as opposed to the reverse), which allows us to error/warn
+    on the first registered module that violates the condition.
+
+    For example, consider the following module structure:
+        M(
+          S1(),
+          S2(
+            SS1(),
+            SS2(),
+          ),
+          S3(),
+        )
+    The reverse DFS order is [S1, SS1, SS2, S2, S3, M], while the reverse
+    ``named_modules()`` order is [S3, SS2, SS1, S2, S1, M].
+    """
+    visited_modules = {root_module}
+    stack = [("", root_module)]
+    # Append and reverse at the end for linear-time algorithm
+    reverse_post_order_named_modules: List[Tuple[str, nn.Module]] = []
+    while stack:
+        module_name, module = stack.pop()
+        reverse_post_order_named_modules.append((module_name, module))
+        for child_module_name, child_module in module.named_children():
+            if child_module is None:  # only for overrides of `named_children()`
+                continue
+            if child_module not in visited_modules:
+                visited_modules.add(child_module)
+                if module_name != "":
+                    child_module_name = module_name + "." + child_module_name
+                stack.append((child_module_name, child_module))
+    post_order_named_modules = list(reversed(reverse_post_order_named_modules))
+    return post_order_named_modules
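+
+# As an informal illustration (hypothetical modules), the returned order places
+# children before parents while preserving registration order at each level:
+#
+#     m = nn.Module()
+#     m.s1 = nn.Linear(1, 1)
+#     m.s2 = nn.Sequential(nn.Linear(1, 1), nn.Linear(1, 1))
+#     m.s3 = nn.Linear(1, 1)
+#     [name for name, _ in _get_post_order_named_modules(m)]
+#     # -> ['s1', 's2.0', 's2.1', 's2', 's3', '']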
+
+
+def _get_managed_param_to_fqn(
+    module_to_wrap: nn.Module,
+    ignored_params: Set[nn.Parameter],
+    visited_modules: Set[nn.Module],
+    root_prefix: str,
+) -> Dict[nn.Parameter, str]:
+    """
+    This returns a dict that maps managed parameter to its FQN for the given
+    ``module_to_wrap``. The dict's keys are exactly the parameters that would
+    be managed by the module, where this is achieved by calling this function
+    on the modules to wrap in reverse topological order, destructively updating
+    ``visited_modules``, and not traversing into those modules. The FQNs are
+    prefixed from the root (via ``root_prefix``) to be more informative.
+
+    NOTE: This function is meant to be called pre-wrapping and iteratively in
+    reverse topological order to cover the full module tree. This differs from
+    the ``_get_param_to_fqn()`` function meant to be called post-wrapping and
+    on the full module tree in one shot. Given those differences, we do not try
+    to unify the two.
+    """
+    param_to_fqn: Dict[nn.Parameter, str] = {}
+    # Run BFS (or any tree traversal works)
+    queue = collections.deque([(module_to_wrap, root_prefix)])
+    visited_modules.add(module_to_wrap)
+    while queue:
+        module, prefix = queue.popleft()
+        for param_name, param in module.named_parameters(recurse=False):
+            if param not in ignored_params:
+                fqn = param_name if prefix == "" else prefix + "." + param_name
+                param_to_fqn[param] = fqn
+        for child_module_name, child_module in module.named_children():
+            if child_module is None:  # only for overrides of `named_children()`
+                continue
+            if child_module not in visited_modules:
+                visited_modules.add(child_module)
+                child_prefix = (
+                    child_module_name
+                    if prefix == ""
+                    else prefix + "." + child_module_name
+                )
+                queue.append((child_module, child_prefix))
+    return param_to_fqn
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/api.py b/MLPY/Lib/site-packages/torch/distributed/fsdp/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..4551ddf8e62694edc7c7b9d934c1e8beb2a58d63
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/fsdp/api.py
@@ -0,0 +1,410 @@
+"""
+This file includes public APIs for FSDP such as the classes used for the
+constructor arguments.
+"""
+
+from dataclasses import dataclass
+from enum import auto, Enum
+
+from typing import Optional, Sequence, Type
+
+import torch
+from torch.nn.modules.batchnorm import _BatchNorm
+
+__all__ = [
+    "ShardingStrategy",
+    "BackwardPrefetch",
+    "MixedPrecision",
+    "CPUOffload",
+    "StateDictType",
+    "StateDictConfig",
+    "FullStateDictConfig",
+    "LocalStateDictConfig",
+    "ShardedStateDictConfig",
+    "OptimStateDictConfig",
+    "FullOptimStateDictConfig",
+    "LocalOptimStateDictConfig",
+    "ShardedOptimStateDictConfig",
+    "StateDictSettings",
+]
+
+
+class ShardingStrategy(Enum):
+    """
+    This specifies the sharding strategy to be used for distributed training by
+    :class:`FullyShardedDataParallel`.
+
+    - ``FULL_SHARD``: Parameters, gradients, and optimizer states are sharded.
+      For the parameters, this strategy unshards (via all-gather) before the
+      forward, reshards after the forward, unshards before the backward
+      computation, and reshards after the backward computation. For gradients,
+      it synchronizes and shards them (via reduce-scatter) after the backward
+      computation. The sharded optimizer states are updated locally per rank.
+    - ``SHARD_GRAD_OP``: Gradients and optimizer states are sharded during
+      computation, and additionally, parameters are sharded outside
+      computation. For the parameters, this strategy unshards before the
+      forward, does not reshard them after the forward, and only reshards them
+      after the backward computation. The sharded optimizer states are updated
+      locally per rank. Inside ``no_sync()``, the parameters are not resharded
+      after the backward computation.
+    - ``NO_SHARD``: Parameters, gradients, and optimizer states are not sharded
+      but instead replicated across ranks similar to PyTorch's
+      :class:`DistributedDataParallel` API. For gradients, this strategy
+      synchronizes them (via all-reduce) after the backward computation. The
+      unsharded optimizer states are updated locally per rank.
+    - ``HYBRID_SHARD``: Apply ``FULL_SHARD`` within a node, and replicate parameters across
+      nodes. This results in reduced communication volume as expensive all-gathers and
+      reduce-scatters are only done within a node, which can be more performant for
+      medium-sized models.
+    - ``_HYBRID_SHARD_ZERO2``: Apply ``SHARD_GRAD_OP`` within a node, and replicate parameters across
+      nodes. This is like ``HYBRID_SHARD``, except this may provide even higher throughput
+      since the unsharded parameters are not freed after the forward pass, saving the
+      all-gathers in the pre-backward.
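+
+    As an informal illustration (assuming ``model`` is an ``nn.Module`` and the
+    default process group is initialized), the chosen strategy is passed to the
+    FSDP constructor via its ``sharding_strategy`` argument::
+
+        >>> # xdoctest: +SKIP("undefined variables")
+        >>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+        >>> fsdp_model = FSDP(model, sharding_strategy=ShardingStrategy.SHARD_GRAD_OP)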
+    """
+
+    FULL_SHARD = auto()
+    SHARD_GRAD_OP = auto()
+    NO_SHARD = auto()
+    HYBRID_SHARD = auto()
+    _HYBRID_SHARD_ZERO2 = auto()
+
+
+class BackwardPrefetch(Enum):
+    """
+    This configures explicit backward prefetching, which improves throughput by
+    enabling communication and computation overlap in the backward pass at the
+    cost of slightly increased memory usage.
+
+    - ``BACKWARD_PRE``: This enables the most overlap but increases memory
+      usage the most. This prefetches the next set of parameters *before* the
+      current set of parameters' gradient computation. This overlaps the *next
+      all-gather* and the *current gradient computation*, and at the peak, it
+      holds the current set of parameters, next set of parameters, and current
+      set of gradients in memory.
+    - ``BACKWARD_POST``: This enables less overlap but uses less memory. This
+      prefetches the next set of parameters *after* the current
+      set of parameters' gradient computation. This overlaps the *current
+      reduce-scatter* and the *next gradient computation*, and it frees the
+      current set of parameters before allocating memory for the next set of
+      parameters, only holding the next set of parameters and current set of
+      gradients in memory at the peak.
+    - FSDP's ``backward_prefetch`` argument accepts ``None``, which disables
+      the backward prefetching altogether. This has no overlap and does not
+      increase memory usage. In general, we do not recommend this setting since
+      it may degrade throughput significantly.
+
+    For more technical context: For a single process group using NCCL backend,
+    any collectives, even if issued from different streams, contend for the
+    same per-device NCCL stream, which implies that the relative order in which
+    the collectives are issued matters for overlapping. The two backward
+    prefetching values correspond to different issue orders.
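+
+    As an informal illustration (assuming ``model`` is an ``nn.Module``), the
+    chosen value is passed to the FSDP constructor via its ``backward_prefetch``
+    argument::
+
+        >>> # xdoctest: +SKIP("undefined variables")
+        >>> from torch.distributed.fsdp import BackwardPrefetch, FullyShardedDataParallel as FSDP
+        >>> fsdp_model = FSDP(model, backward_prefetch=BackwardPrefetch.BACKWARD_PRE)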
+    """
+
+    # NOTE: For both modes, the ordering that defines "current" and "next" is
+    # not always exact in the current implementation. A mistargeted prefetch
+    # simply means that the parameter memory is allocated earlier than needed,
+    # possibly increasing peak memory usage, but does not affect correctness.
+    BACKWARD_PRE = auto()
+    BACKWARD_POST = auto()
+
+
+@dataclass
+class MixedPrecision:
+    """
+    This configures FSDP-native mixed precision training.
+
+    Attributes:
+        param_dtype (Optional[torch.dtype]): This specifies the dtype for model
+            parameters during forward and backward and thus the dtype for
+            forward and backward computation. Outside forward and backward, the
+            *sharded* parameters are kept in full precision (e.g. for the
+            optimizer step), and for model checkpointing, the parameters are
+            always saved in full precision. (Default: ``None``)
+        reduce_dtype (Optional[torch.dtype]): This specifies the dtype for
+            gradient reduction (i.e. reduce-scatter or all-reduce). If this is
+            ``None`` but ``param_dtype`` is not ``None``, then this takes on
+            the ``param_dtype`` value, still running gradient reduction in low
+            precision. This is permitted to differ from ``param_dtype``, e.g.
+            to force gradient reduction to run in full precision. (Default:
+            ``None``)
+        buffer_dtype (Optional[torch.dtype]): This specifies the dtype for
+            buffers. FSDP does not shard buffers. Rather, FSDP casts them to
+            ``buffer_dtype`` in the first forward pass and keeps them in that
+            dtype thereafter. For model checkpointing, the buffers are saved
+            in full precision except for ``LOCAL_STATE_DICT``. (Default:
+            ``None``)
+        keep_low_precision_grads (bool): If ``False``, then FSDP upcasts
+            gradients to full precision after the backward pass in preparation
+            for the optimizer step. If ``True``, then FSDP keeps the gradients
+            in the dtype used for gradient reduction, which can save memory if
+            using a custom optimizer that supports running in low precision.
+            (Default: ``False``)
+        cast_forward_inputs (bool): If ``True``, then this FSDP module casts
+            its forward args and kwargs to ``param_dtype``. This is to ensure
+            that parameter and input dtypes match for forward computation, as
+            required by many ops. This may need to be set to ``True`` when only
+            applying mixed precision to some but not all FSDP modules, in which
+            case a mixed-precision FSDP submodule needs to recast its inputs.
+            (Default: ``False``)
+        cast_root_forward_inputs (bool): If ``True``, then the root FSDP module
+            casts its forward args and kwargs to ``param_dtype``, overriding
+            the value of ``cast_forward_inputs``. For non-root FSDP modules,
+            this does not do anything. (Default: ``True``)
+        _module_classes_to_ignore: (Sequence[Type[nn.Module]]): This specifies
+            module classes to ignore for mixed precision when using an
+            ``auto_wrap_policy``: Modules of these classes will have FSDP
+            applied to them separately with mixed precision disabled (meaning
+            that the final FSDP construction would deviate from the specified
+            policy). If ``auto_wrap_policy`` is not specified, then this does
+            not do anything. This API is experimental and subject to change.
+            (Default: ``(_BatchNorm,)``)
+
+    .. note:: This API is experimental and subject to change.
+
+    .. note:: Only floating point tensors are cast to their specified dtypes.
+
+    .. note:: In ``summon_full_params``, parameters are forced to full
+        precision, but buffers are not.
+
+    .. note:: Layer norm and batch norm accumulate in ``float32`` even when
+        their inputs are in a low precision like ``float16`` or ``bfloat16``.
+        Disabling FSDP's mixed precision for those norm modules only means that
+        the affine parameters are kept in ``float32``. However, this incurs
+        separate all-gathers and reduce-scatters for those norm modules, which
+        may be inefficient, so if the workload permits, the user should prefer
+        to still apply mixed precision to those modules.
+
+    .. note:: By default, if the user passes a model with any ``_BatchNorm``
+        modules and specifies an ``auto_wrap_policy``, then the batch norm
+        modules will have FSDP applied to them separately with mixed precision
+        disabled. See the ``_module_classes_to_ignore`` argument.
+
+    .. note:: ``MixedPrecision`` has ``cast_root_forward_inputs=True`` and
+        ``cast_forward_inputs=False`` by default. For the root FSDP instance,
+        its ``cast_root_forward_inputs`` takes precedence over its
+        ``cast_forward_inputs``. For non-root FSDP instances, their
+        ``cast_root_forward_inputs`` values are ignored. The default setting is
+        sufficient for the typical case where each FSDP instance has the same
+        ``MixedPrecision`` configuration and only needs to cast inputs to the
+        ``param_dtype`` at the beginning of the model's forward pass.
+
+    .. note:: For nested FSDP instances with different ``MixedPrecision``
+        configurations, we recommend setting individual ``cast_forward_inputs``
+        values to configure casting inputs or not before each instance's
+        forward. In such a case, since the casts happen before each FSDP
+        instance's forward, a parent FSDP instance should have its non-FSDP
+        submodules run before its FSDP submodules to avoid the activation dtype
+        being changed due to a different ``MixedPrecision`` configuration.
+
+        Example::
+
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> model = nn.Sequential(nn.Linear(3, 3), nn.Linear(3, 3))
+            >>> model[1] = FSDP(
+            >>>     model[1],
+            >>>     mixed_precision=MixedPrecision(param_dtype=torch.float16, cast_forward_inputs=True),
+            >>> )
+            >>> model = FSDP(
+            >>>     model,
+            >>>     mixed_precision=MixedPrecision(param_dtype=torch.bfloat16, cast_forward_inputs=True),
+            >>> )
+
+        The above shows a working example. On the other hand, if ``model[1]``
+        were replaced with ``model[0]``, meaning that the submodule using
+        different ``MixedPrecision`` ran its forward first, then ``model[1]``
+        would incorrectly see ``float16`` activations instead of ``bfloat16``
+        ones.
+
+    """
+
+    param_dtype: Optional[torch.dtype] = None
+    reduce_dtype: Optional[torch.dtype] = None
+    buffer_dtype: Optional[torch.dtype] = None
+    keep_low_precision_grads: bool = False
+    cast_forward_inputs: bool = False
+    cast_root_forward_inputs: bool = True
+    _module_classes_to_ignore: Sequence[Type[torch.nn.Module]] = (_BatchNorm,)
+
+
+@dataclass
+class CPUOffload:
+    """
+    This configures CPU offloading.
+
+    Attributes:
+        offload_params (bool): This specifies whether to offload parameters to
+            CPU when not involved in computation. If ``True``, then this
+            offloads gradients to CPU as well, meaning that the optimizer step
+            runs on CPU.
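+
+    As an informal illustration (assuming ``model`` is an ``nn.Module``)::
+
+        >>> # xdoctest: +SKIP("undefined variables")
+        >>> from torch.distributed.fsdp import CPUOffload, FullyShardedDataParallel as FSDP
+        >>> fsdp_model = FSDP(model, cpu_offload=CPUOffload(offload_params=True))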
+    """
+
+    offload_params: bool = False
+
+
+class StateDictType(Enum):
+    """
+    This enum indicates which type of ``state_dict`` the FSDP module is
+    currently processing (returning or loading).
+    The default value is FULL_STATE_DICT to comply with the PyTorch convention.
+
+    .. note::
+        FSDP currently supports three types of ``state_dict``:
+            1. ``state_dict/load_state_dict``: this pair of APIs return and load
+               the non-sharded, unflattened parameters. The semantics are the
+               same as using DDP.
+            2. ``_local_state_dict/_load_local_state_dict``: this pair of APIs return
+               and load local sharded, flattened parameters. The values returned
+               by ``_local_state_dict`` can be directly used by FSDP and are only
+               meaningful to FSDP (because parameters are flattened). Note that
+               these APIs are meant for use via the :func:`state_dict_type`
+               context manager as follows:
+                   >>> # xdoctest: +SKIP("undefined variables")
+                   >>> with fsdp.state_dict_type(StateDictType.LOCAL_STATE_DICT):
+                   ...     state = fsdp.state_dict()  # gets the local state dict
+            3. ``_sharded_state_dict/_load_sharded_state_dict``: this pair of APIs
+               return and load sharded, unflattened parameters. The ``state_dict``
+               return by ``sharded_state_dict`` can be used by all other parallel
+               schemes (resharding may be required).
+    """
+
+    FULL_STATE_DICT = auto()
+    LOCAL_STATE_DICT = auto()
+    SHARDED_STATE_DICT = auto()
+
+
+@dataclass
+class StateDictConfig:
+    """
+    ``StateDictConfig`` is the base class for all ``state_dict`` configuration
+    classes. Users should instantiate a child class (e.g.
+    ``FullStateDictConfig``) in order to configure settings for the
+    corresponding ``state_dict`` type supported by FSDP.
+
+    Attributes:
+        offload_to_cpu (bool): If ``True``, then FSDP offloads the state dict
+            values to CPU, and if ``False``, then FSDP keeps them on GPU.
+            (Default: ``False``)
+    """
+
+    offload_to_cpu: bool = False
+
+
+@dataclass
+class FullStateDictConfig(StateDictConfig):
+    """
+    ``FullStateDictConfig`` is a config class meant to be used with
+    ``StateDictType.FULL_STATE_DICT``. We recommend enabling both
+    ``offload_to_cpu=True`` and ``rank0_only=True`` when saving full state
+    dicts to save GPU memory and CPU memory, respectively. This config class
+    is meant to be used via the :func:`state_dict_type` context manager as
+    follows:
+
+        >>> # xdoctest: +SKIP("undefined variables")
+        >>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+        >>> fsdp = FSDP(model, auto_wrap_policy=...)
+        >>> cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
+        >>> with FSDP.state_dict_type(fsdp, StateDictType.FULL_STATE_DICT, cfg):
+        >>>     state = fsdp.state_dict()
+        >>>     # `state` will be empty on nonzero ranks and contain CPU tensors on rank 0.
+        >>> # To reload checkpoint for inference, finetuning, transfer learning, etc:
+        >>> model = model_fn() # Initialize model in preparation for wrapping with FSDP
+        >>> if dist.get_rank() == 0:
+        >>>     # Load checkpoint only on rank 0 to avoid memory redundancy
+        >>>     state_dict = torch.load("my_checkpoint.pt")
+        >>>     model.load_state_dict(state_dict)
+        >>> # All ranks initialize FSDP module as usual. `sync_module_states` argument
+        >>> # communicates loaded checkpoint states from rank 0 to rest of the world.
+        >>> fsdp = FSDP(model, device_id=torch.cuda.current_device(), auto_wrap_policy=..., sync_module_states=True)
+        >>> # After this point, all ranks have FSDP model with loaded checkpoint.
+
+    Attributes:
+        rank0_only (bool): If ``True``, then only rank 0 saves the full state
+            dict, and nonzero ranks save an empty dict. If ``False``, then all
+            ranks save the full state dict. (Default: ``False``)
+    """
+
+    rank0_only: bool = False
+
+
+@dataclass
+class LocalStateDictConfig(StateDictConfig):
+    pass
+
+
+@dataclass
+class ShardedStateDictConfig(StateDictConfig):
+    """
+    ``ShardedStateDictConfig`` is a config class meant to be used with
+    ``StateDictType.SHARDED_STATE_DICT``.
+
+    Attributes:
+        _use_dtensor (bool): If ``True``, then FSDP saves the state dict values
+            as ``DTensor``, and if ``False``, then FSDP saves them as
+            ``ShardedTensor``. (Default: ``False``)
+
+    .. warning:: ``_use_dtensor`` is a private field of :class:`ShardedStateDictConfig`
+      and it is used by FSDP to determine the type of state dict values. Users should not
+      manually modify ``_use_dtensor``.
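+
+    As an informal illustration (assuming ``model`` is already wrapped with
+    FSDP), this config is typically passed through the :func:`state_dict_type`
+    context manager::
+
+        >>> # xdoctest: +SKIP("undefined variables")
+        >>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, StateDictType
+        >>> cfg = ShardedStateDictConfig()
+        >>> with FSDP.state_dict_type(model, StateDictType.SHARDED_STATE_DICT, cfg):
+        >>>     state = model.state_dict()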
+    """
+
+    _use_dtensor: bool = False
+
+
+@dataclass
+class OptimStateDictConfig:
+    """
+    ``OptimStateDictConfig`` is the base class for all ``optim_state_dict``
+    configuration classes.  Users should instantiate a child class (e.g.
+    ``FullOptimStateDictConfig``) in order to configure settings for the
+    corresponding ``optim_state_dict`` type supported by FSDP.
+
+    Attributes:
+        offload_to_cpu (bool): If ``True``, then FSDP offloads the state dict's
+            tensor values to CPU, and if ``False``, then FSDP keeps them on the
+            original device (which is GPU unless parameter CPU offloading is
+            enabled). (Default: ``True``)
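+
+    As an informal illustration (assuming ``model`` is wrapped with FSDP and
+    ``optim`` is its optimizer), an optimizer-state config is set together with
+    the model-state config, e.g. via :func:`set_state_dict_type`::
+
+        >>> # xdoctest: +SKIP("undefined variables")
+        >>> FSDP.set_state_dict_type(
+        >>>     model,
+        >>>     StateDictType.SHARDED_STATE_DICT,
+        >>>     ShardedStateDictConfig(),
+        >>>     ShardedOptimStateDictConfig(offload_to_cpu=True),
+        >>> )
+        >>> osd = FSDP.optim_state_dict(model, optim)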
+    """
+
+    offload_to_cpu: bool = True
+
+
+@dataclass
+class FullOptimStateDictConfig(OptimStateDictConfig):
+    """
+    Attributes:
+        rank0_only (bool): If ``True``, then only rank 0 saves the full state
+            dict, and nonzero ranks save an empty dict. If ``False``, then all
+            ranks save the full state dict. (Default: ``False``)
+    """
+
+    rank0_only: bool = False
+
+
+@dataclass
+class LocalOptimStateDictConfig(OptimStateDictConfig):
+    offload_to_cpu: bool = False
+
+
+@dataclass
+class ShardedOptimStateDictConfig(OptimStateDictConfig):
+    """
+    ``ShardedOptimStateDictConfig`` is a config class meant to be used with
+    ``StateDictType.SHARDED_STATE_DICT``.
+
+    Attributes:
+        _use_dtensor (bool): If ``True``, then FSDP saves the state dict values
+            as ``DTensor``, and if ``False``, then FSDP saves them as
+            ``ShardedTensor``. (Default: ``False``)
+
+    .. warning:: ``_use_dtensor`` is a private field of :class:`ShardedOptimStateDictConfig`
+      and it is used by FSDP to determine the type of state dict values. Users should not
+      manually modify ``_use_dtensor``.
+    """
+
+    _use_dtensor: bool = False
+
+
+@dataclass
+class StateDictSettings:
+    state_dict_type: StateDictType
+    state_dict_config: StateDictConfig
+    optim_state_dict_config: OptimStateDictConfig
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py b/MLPY/Lib/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3bf141619a9d02e41f1c2f115caa6fc37086260
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py
@@ -0,0 +1,2075 @@
+# mypy: ignore-errors
+
+import contextlib
+import copy
+import functools
+import math
+import traceback
+import warnings
+from contextlib import contextmanager
+from enum import auto, Enum
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Generator,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+    Union,
+)
+
+import torch
+import torch.distributed as dist
+import torch.distributed.fsdp._traversal_utils as traversal_utils
+import torch.nn as nn
+from torch.distributed._tensor import DeviceMesh
+from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
+    _CHECKPOINT_WRAPPED_MODULE,
+    ActivationWrapper,
+)
+from torch.distributed.algorithms._comm_hooks import LOW_PRECISION_HOOKS
+from torch.distributed.fsdp._common_utils import (
+    _FSDPState,
+    _get_param_to_fqns,
+    FSDP_PREFIX,
+    FSDP_WRAPPED_MODULE,
+    TrainingState,
+)
+from torch.distributed.fsdp._dynamo_utils import _annotate_modules_for_dynamo
+from torch.distributed.fsdp._init_utils import (
+    _check_orig_params_flattened,
+    _init_buffer_state,
+    _init_core_state,
+    _init_device_handle,
+    _init_extension,
+    _init_ignored_module_states,
+    _init_param_handle_from_module,
+    _init_prefetching_state,
+    _init_process_group_state,
+    _init_runtime_state,
+    _init_state_dict_state,
+    HYBRID_SHARDING_STRATEGIES,
+    ProcessGroupType,
+)
+from torch.distributed.fsdp._runtime_utils import (
+    _get_fsdp_root_states,
+    _is_fsdp_root,
+    _lazy_init,
+    _post_forward,
+    _post_forward_reshard,
+    _pre_forward,
+    _pre_forward_unshard,
+    _root_pre_forward,
+)
+from torch.distributed.fsdp._wrap_utils import _auto_wrap
+from torch.distributed.fsdp.api import (
+    BackwardPrefetch,
+    CPUOffload,
+    FullOptimStateDictConfig,
+    FullStateDictConfig,
+    LocalOptimStateDictConfig,
+    LocalStateDictConfig,
+    MixedPrecision,
+    OptimStateDictConfig,
+    ShardedOptimStateDictConfig,
+    ShardedStateDictConfig,
+    ShardingStrategy,
+    StateDictConfig,
+    StateDictSettings,
+    StateDictType,
+)
+from torch.distributed.utils import _p_assert
+from ._flat_param import FlatParameter
+
+from ._optim_utils import (
+    _flatten_optim_state_dict,
+    _get_param_id_to_param_from_optim_input,
+    _get_param_key_to_param,
+    _get_param_to_param_id_from_optim_input,
+    _get_param_to_param_key,
+    _optim_state_dict,
+    _rekey_sharded_optim_state_dict,
+    _set_optim_use_dtensor,
+)
+from ._state_dict_utils import _register_all_state_dict_hooks
+from ._unshard_param_utils import (
+    _deregister_orig_params,
+    _register_flat_param,
+    _register_orig_params,
+    _unshard_params,
+    _unshard_params_recurse,
+)
+from .wrap import CustomPolicy, ModuleWrapPolicy
+
+
+__all__ = [
+    "FullyShardedDataParallel",
+    "OptimStateKeyType",
+]
+
+
+FLAT_PARAM = "_flat_param"
+
+
+class OptimStateKeyType(Enum):
+    """Represents the type of key in an optimizer state-dict."""
+
+    PARAM_NAME = auto()
+    PARAM_ID = auto()
+
+
+class FullyShardedDataParallel(nn.Module, _FSDPState):
+    """A wrapper for sharding module parameters across data parallel workers.
+
+    This is inspired by `Xu et al.`_ as well as the ZeRO Stage 3 from DeepSpeed_.
+    FullyShardedDataParallel is commonly shortened to FSDP.
+
+    .. _`Xu et al.`: https://arxiv.org/abs/2004.13336
+    .. _DeepSpeed: https://www.deepspeed.ai/
+
+    For advanced notes please refer to :ref:`fsdp_notes`.
+
+    Example::
+
+        >>> # xdoctest: +SKIP("undefined variables")
+        >>> import torch
+        >>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+        >>> torch.cuda.set_device(device_id)
+        >>> sharded_module = FSDP(my_module)
+        >>> optim = torch.optim.Adam(sharded_module.parameters(), lr=0.0001)
+        >>> x = sharded_module(x, y=3, z=torch.Tensor([1]))
+        >>> loss = x.sum()
+        >>> loss.backward()
+        >>> optim.step()
+
+    .. warning::
+        The optimizer must be initialized *after* the module has been wrapped
+        with FSDP since FSDP will shard and transform the module's parameters
+        in a way that may not preserve the original parameter variables. Thus,
+        the previously initialized optimizer may have stale references to the
+        parameters.
+
+    .. warning::
+        If the destination CUDA device has ID ``dev_id``, either (1)
+        ``module`` should already be placed on that device, (2) the device
+        should be set using ``torch.cuda.set_device(dev_id)``, or (3)
+        ``dev_id`` should be passed into the ``device_id`` constructor
+        argument. This FSDP instance's compute device will be that destination
+        device. For (1) and (3), the FSDP initialization always occurs on GPU.
+        For (2), the FSDP initialization happens on ``module`` 's current
+        device, which may be CPU.
+
+    .. warning::
+        FSDP currently does not support gradient accumulation outside
+        ``no_sync()`` when using CPU offloading. Trying to do so yields
+        incorrect results since FSDP will use the newly-reduced gradient
+        instead of accumulating with any existing gradient.
+
+    .. warning::
+        Changing the original parameter variable names after construction will
+        lead to undefined behavior.
+
+    .. warning::
+        Passing in the ``sync_module_states=True`` flag requires ``module`` to
+        be on GPU or to use the ``device_id`` argument to specify a CUDA device
+        that FSDP will move ``module`` to in the FSDP constructor. This is
+        because ``sync_module_states=True`` requires GPU communication.
+
+    .. warning::
+        As of PyTorch 1.12, FSDP only offers limited support for shared parameters
+        (for example, setting one ``Linear`` layer's weight to another's). In
+        particular, modules that share parameters must be wrapped as part of the
+        same FSDP unit. If enhanced shared parameter support is needed for your
+        use case, please ping https://github.com/pytorch/pytorch/issues/77724
+
+    .. warning::
+        FSDP has some constraints on freezing parameters (i.e. setting
+        ``param.requires_grad=False``). For ``use_orig_params=False``, each
+        FSDP instance must manage parameters that are all frozen or all
+        non-frozen. For ``use_orig_params=True``, FSDP supports mixing frozen
+        and non-frozen, but we recommend not doing so since then the gradient
+        memory usage will be higher than expected (namely, equivalent to not
+        freezing those parameters). This means that ideally, frozen parameters
+        should be isolated into their own ``nn.Module`` s and wrapped
+        separately with FSDP.
+
+    .. note::
+        Attempting to run the forward pass of a submodule that is contained in an
+        FSDP instance is not supported and will result in errors. This is because the
+        submodule's parameters will be sharded, but it itself is not an FSDP instance,
+        so its forward pass will not all-gather the full parameters appropriately.
+        This could potentially happen when attempting to run only the encoder of an
+        encoder-decoder model, and the encoder is not wrapped in its own FSDP instance. To
+        resolve this, please wrap the submodule in its own FSDP unit.
+
+    .. note::
+        FSDP moves input tensors passed to the ``forward`` method to the GPU compute
+        device, so the user does not need to manually move them from CPU.
+
+    .. warning::
+        The user should not modify the parameters between forward and backward
+        without using the :meth:`summon_full_params` context since the
+        modifications may not persist. Moreover, for ``use_orig_params=False``,
+        accessing the original parameters between forward and backward may
+        raise an illegal memory access.
+
+    .. warning::
+        For ``use_orig_params=True``, ``ShardingStrategy.SHARD_GRAD_OP``
+        exposes the unsharded parameters, not the sharded parameters, after
+        forward since it does not free the unsharded ones, unlike
+        ``ShardingStrategy.FULL_SHARD``. One caveat is that, since gradients
+        are always sharded or ``None``, ``ShardingStrategy.SHARD_GRAD_OP`` will
+        not expose the sharded gradients with the unsharded parameters after
+        forward. If you want to inspect the gradients, try
+        :meth:`summon_full_params` with ``with_grads=True``.
+
+    .. warning::
+        FSDP replaces managed modules' parameters with ``torch.Tensor`` views
+        during forward and backward computation for autograd-related reasons.
+        If your module's forward relies on saved references to the parameters
+        instead of reacquiring the references each iteration, then it will not
+        see FSDP's newly created views, and autograd will not work correctly.
+
+    .. note::
+        With ``limit_all_gathers=True``, you may see a gap in the FSDP
+        pre-forward where the CPU thread is not issuing any kernels. This is
+        intentional and shows the rate limiter in effect. Synchronizing the CPU
+        thread in that way prevents over-allocating memory for subsequent
+        all-gathers, and it should not actually delay GPU kernel execution.
+
+    .. note::
+        When using ``sharding_strategy=ShardingStrategy.HYBRID_SHARD`` with the
+        sharding process group being intra-node and the replication process
+        group being inter-node, setting ``NCCL_CROSS_NIC=1`` can help improve
+        the all-reduce times over the replication process group for some
+        cluster setups.
+
+    .. warning::
+        FSDP does not work with double backwards due to how it registers
+        backward hooks.
+
+    Args:
+        module (nn.Module):
+            This is the module to be wrapped with FSDP.
+        process_group (Optional[Union[ProcessGroup, Tuple[ProcessGroup, ProcessGroup]]]):
+            This is the process group over which the model is sharded and thus
+            the one used for FSDP's all-gather and reduce-scatter collective
+            communications. If ``None``, then FSDP uses the default process
+            group. For hybrid sharding strategies such as
+            ``ShardingStrategy.HYBRID_SHARD``, users can pass in a tuple of
+            process groups, representing the groups over which to shard and
+            replicate, respectively. If ``None``, then FSDP constructs process
+            groups for the user to shard intra-node and replicate inter-node.
+            (Default: ``None``)
+        sharding_strategy (Optional[ShardingStrategy]):
+            This configures the sharding strategy, which may trade off memory
+            saving and communication overhead. See :class:`ShardingStrategy`
+            for details. (Default: ``FULL_SHARD``)
+        cpu_offload (Optional[CPUOffload]):
+            This configures CPU offloading. If this is set to ``None``, then
+            no CPU offloading happens. See :class:`CPUOffload` for details.
+            (Default: ``None``)
+        auto_wrap_policy (Optional[Union[Callable[[nn.Module, bool, int], bool], ModuleWrapPolicy, CustomPolicy]]):
+            This specifies a policy to apply FSDP to submodules of ``module``,
+            which is needed for communication and computation overlap and thus
+            affects performance. If ``None``, then FSDP only applies to
+            ``module``, and users should manually apply FSDP to parent modules
+            themselves (proceeding bottom-up). For convenience, this accepts
+            ``ModuleWrapPolicy`` directly, which allows users to specify the
+            module classes to wrap (e.g. the transformer block). Otherwise,
+            this should be a callable that takes in three arguments
+            ``module: nn.Module``, ``recurse: bool``, and
+            ``nonwrapped_numel: int`` and should return a ``bool`` specifying
+            whether the passed-in ``module`` should have FSDP applied if
+            ``recurse=False`` or if the traversal should continue into the
+            module's subtree if ``recurse=True``. Users may add additional
+            arguments to the callable. The ``size_based_auto_wrap_policy`` in
+            ``torch.distributed.fsdp.wrap`` gives an example callable that
+            applies FSDP to a module if the parameters in its subtree exceed
+            100M numel. We recommend printing the model after applying FSDP
+            and adjusting as needed.
+
+            Example::
+
+                >>> def custom_auto_wrap_policy(
+                >>>     module: nn.Module,
+                >>>     recurse: bool,
+                >>>     nonwrapped_numel: int,
+                >>>     # Additional custom arguments
+                >>>     min_num_params: int = int(1e8),
+                >>> ) -> bool:
+                >>>     return nonwrapped_numel >= min_num_params
+                >>> # Configure a custom `min_num_params`
+                >>> my_auto_wrap_policy = functools.partial(custom_auto_wrap_policy, min_num_params=int(1e5))
+
+        backward_prefetch (Optional[BackwardPrefetch]):
+            This configures explicit backward prefetching of all-gathers. If
+            ``None``, then FSDP does not backward prefetch, and there is no
+            communication and computation overlap in the backward pass. See
+            :class:`BackwardPrefetch` for details. (Default: ``BACKWARD_PRE``)
+        mixed_precision (Optional[MixedPrecision]):
+            This configures native mixed precision for FSDP. If this is set to
+            ``None``, then no mixed precision is used. Otherwise, parameter,
+            buffer, and gradient reduction dtypes can be set. See
+            :class:`MixedPrecision` for details. (Default: ``None``)
+        ignored_modules (Optional[Iterable[torch.nn.Module]]): Modules whose
+            own parameters and child modules' parameters and buffers are
+            ignored by this instance. None of the modules directly in
+            ``ignored_modules`` should be :class:`FullyShardedDataParallel`
+            instances, and any child modules that are already-constructed
+            :class:`FullyShardedDataParallel` instances will not be ignored if
+            they are nested under this instance. This argument may be used to
+            avoid sharding specific parameters at module granularity when using an
+            ``auto_wrap_policy`` or if parameters' sharding is not managed by
+            FSDP. (Default: ``None``)
+        param_init_fn (Optional[Callable[[nn.Module], None]]):
+            A ``Callable[[torch.nn.Module], None]`` that
+            specifies how modules that are currently on the meta device should
+            be initialized onto an actual device. As of v1.12, FSDP detects
+            modules with parameters or buffers on meta device via ``is_meta``
+            and either applies ``param_init_fn`` if specified or calls
+            ``nn.Module.reset_parameters()`` otherwise. For both cases, the
+            implementation should *only* initialize the parameters/buffers of
+            the module, not those of its submodules. This is to avoid
+            re-initialization. In addition, FSDP also supports deferred
+            initialization via torchdistX's (https://github.com/pytorch/torchdistX)
+            ``deferred_init()`` API, where the deferred modules are initialized
+            by calling ``param_init_fn`` if specified or torchdistX's default
+            ``materialize_module()`` otherwise. If ``param_init_fn`` is
+            specified, then it is applied to all meta-device modules, meaning
+            that it will likely need to branch on the module type. FSDP calls the
+            initialization function before parameter flattening and sharding.
+
+            Example::
+
+                >>> # xdoctest: +SKIP("undefined variables")
+                >>> module = MyModule(device="meta")
+                >>> def my_init_fn(module: nn.Module):
+                >>>     # E.g. initialize depending on the module type
+                >>>     ...
+                >>> fsdp_model = FSDP(module, param_init_fn=my_init_fn, auto_wrap_policy=size_based_auto_wrap_policy)
+                >>> print(next(fsdp_model.parameters()).device) # current CUDA device
+                >>> # With torchdistX
+                >>> module = deferred_init.deferred_init(MyModule, device="cuda")
+                >>> # Will initialize via deferred_init.materialize_module().
+                >>> fsdp_model = FSDP(module, auto_wrap_policy=size_based_auto_wrap_policy)
+
+        device_id (Optional[Union[int, torch.device]]): An ``int`` or
+            ``torch.device`` giving the CUDA device on which FSDP
+            initialization takes place, including the module initialization
+            if needed and the parameter sharding. This should be specified to
+            improve initialization speed if ``module`` is on CPU. If the
+            default CUDA device was set (e.g. via ``torch.cuda.set_device``),
+            then the user may pass ``torch.cuda.current_device`` to this.
+            (Default: ``None``)
+        sync_module_states (bool): If ``True``, then each FSDP module will
+            broadcast module parameters and buffers from rank 0 to ensure that
+            they are replicated across ranks (adding communication overhead to
+            this constructor). This can help load ``state_dict`` checkpoints
+            via ``load_state_dict`` in a memory efficient way. See
+            :class:`FullStateDictConfig` for an example of this. (Default:
+            ``False``)
+        forward_prefetch (bool): If ``True``, then FSDP *explicitly* prefetches
+            the next forward-pass all-gather before the current forward
+            computation. This is only useful for CPU-bound workloads, in which
+            case issuing the next all-gather earlier may improve overlap. This
+            should only be used for static-graph models since the prefetching
+            follows the first iteration's execution order. (Default: ``False``)
+        limit_all_gathers (bool): If ``True``, then FSDP explicitly
+            synchronizes the CPU thread to ensure GPU memory usage from only
+            *two* consecutive FSDP instances (the current instance running
+            computation and the next instance whose all-gather is prefetched).
+            If ``False``, then FSDP allows the CPU thread to issue all-gathers
+            without any extra synchronization. (Default: ``True``) We often
+            refer to this feature as the "rate limiter". This flag should only
+            be set to ``False`` for specific CPU-bound workloads with low
+            memory pressure, in which case the CPU thread can aggressively
+            issue all kernels without concern for GPU memory usage.
+        use_orig_params (bool): Setting this to ``True`` has FSDP use
+            ``module`` 's original parameters. FSDP exposes those original
+            parameters to the user via :meth:`nn.Module.named_parameters`
+            instead of FSDP's internal :class:`FlatParameter` s. This means
+            that the optimizer step runs on the original parameters, enabling
+            per-original-parameter hyperparameters. FSDP preserves the original
+            parameter variables and manipulates their data between unsharded
+            and sharded forms, where they are always views into the underlying
+            unsharded or sharded :class:`FlatParameter`, respectively. With the
+            current algorithm, the sharded form is always 1D, losing the
+            original tensor structure. An original parameter may have all,
+            some, or none of its data present for a given rank. In the none
+            case, its data appears as a size-0 empty tensor. Users should not
+            author programs relying on what data is present for a given
+            original parameter in its sharded form. ``True`` is required to
+            use ``torch.compile()``. Setting this to ``False`` exposes FSDP's
+            internal :class:`FlatParameter` s to the user via
+            :meth:`nn.Module.named_parameters`. (Default: ``False``)
+        ignored_states (Optional[Iterable[torch.nn.Parameter]], Optional[Iterable[torch.nn.Module]]):
+            Ignored parameters or modules that will not be managed by this FSDP
+            instance, meaning that the parameters are not sharded and their
+            gradients are not reduced across ranks. This argument unifies with
+            the existing ``ignored_modules`` argument, and we may deprecate
+            ``ignored_modules`` soon. For backward compatibility, we keep both
+            ``ignored_states`` and ``ignored_modules``, but FSDP only allows one
+            of them to be specified as not ``None``.
+    """
+
+    def __init__(
+        self,
+        module: nn.Module,
+        process_group: ProcessGroupType = None,
+        sharding_strategy: Optional[ShardingStrategy] = None,
+        cpu_offload: Optional[CPUOffload] = None,
+        auto_wrap_policy: Optional[
+            Union[Callable, ModuleWrapPolicy, CustomPolicy]
+        ] = None,
+        backward_prefetch: Optional[BackwardPrefetch] = BackwardPrefetch.BACKWARD_PRE,
+        mixed_precision: Optional[MixedPrecision] = None,
+        ignored_modules: Optional[Iterable[torch.nn.Module]] = None,
+        param_init_fn: Optional[Callable[[nn.Module], None]] = None,
+        device_id: Optional[Union[int, torch.device]] = None,
+        sync_module_states: bool = False,
+        forward_prefetch: bool = False,
+        limit_all_gathers: bool = True,
+        use_orig_params: bool = False,
+        ignored_states: Union[
+            Optional[Iterable[torch.nn.Parameter]], Optional[Iterable[torch.nn.Module]]
+        ] = None,
+        device_mesh: Optional[DeviceMesh] = None,
+    ):
+        torch._C._log_api_usage_once("torch.distributed.fsdp")
+        super().__init__()
+        _init_ignored_module_states(self, module, ignored_modules, ignored_states)
+        _init_device_handle(self, module, self._ignored_params, device_id)
+
+        # Add module annotations for Dynamo support (see function for details)
+        _annotate_modules_for_dynamo(module, self._ignored_modules, use_orig_params)
+
+        # Initializes self.process_group, along with rank and world size. This will
+        # also set another attribute, _inter_node_pg, to control the process group
+        # over which sharding occurs, if sharding_strategy is {HYBRID_SHARD, _HYBRID_SHARD_ZERO2}.
+        # Note that this is done before auto_wrapping, so that child FSDP modules simply pick up
+        # the same process group state as the root FSDP module.
+        self._device_mesh = device_mesh
+        _init_process_group_state(
+            self,
+            process_group,
+            sharding_strategy,
+            auto_wrap_policy,
+            device_mesh,
+        )
+        if auto_wrap_policy is not None:
+            root_kwargs = {
+                "process_group": process_group,
+                "sharding_strategy": sharding_strategy,
+                "cpu_offload": cpu_offload,
+                "backward_prefetch": backward_prefetch,
+                "mixed_precision": mixed_precision,
+                "param_init_fn": param_init_fn,
+                "device_id": device_id,
+                "sync_module_states": sync_module_states,
+                "forward_prefetch": forward_prefetch,
+                "limit_all_gathers": limit_all_gathers,
+                "use_orig_params": use_orig_params,
+                "ignored_states": self._ignored_params,
+                "device_mesh": device_mesh,
+            }
+            if sharding_strategy in HYBRID_SHARDING_STRATEGIES and device_mesh is None:
+                # Share root process groups with children to maintain
+                # the invariant that all FSDP modules will have the same
+                # process groups.
+                root_kwargs["process_group"] = (self.process_group, self._inter_node_pg)
+
+            _auto_wrap(
+                module,
+                auto_wrap_policy,
+                self._ignored_modules,
+                self._ignored_params,
+                root_kwargs,
+                FullyShardedDataParallel,
+            )
+
+        backward_prefetch_limit = 1
+        forward_prefetch_limit = 1
+        _init_core_state(
+            self,
+            sharding_strategy,
+            mixed_precision,
+            cpu_offload,
+            limit_all_gathers,
+            use_orig_params,
+            backward_prefetch_limit,
+            forward_prefetch_limit,
+        )
+        _init_runtime_state(self)
+        _init_prefetching_state(self, backward_prefetch, forward_prefetch)
+        _init_buffer_state(self, module)
+        # extension needs to be set before `_init_param_handle_from_module()`
+        _init_extension(self, device_mesh)
+        _init_param_handle_from_module(
+            self,
+            module,
+            device_id,
+            param_init_fn,
+            sync_module_states,
+        )
+        self._fsdp_wrapped_module = module
+        if not use_orig_params:
+            _check_orig_params_flattened(self, self._ignored_params)
+            _register_flat_param(self, self)
+
+        # `_state_dict_type` controls the `state_dict()` behavior, which is
+        # implemented using post-save and pre-load hooks
+        _init_state_dict_state(self)
+        _register_all_state_dict_hooks(self)
+
+    @property
+    def module(self) -> nn.Module:
+        """Return the wrapped module."""
+        # FSDP's `.module` must refer to the innermost wrapped module when
+        # composing with other module wrappers in order for state dict to work
+        if isinstance(self._fsdp_wrapped_module, ActivationWrapper):
+            return getattr(self._fsdp_wrapped_module, _CHECKPOINT_WRAPPED_MODULE)
+        return self._fsdp_wrapped_module
+
+    @property
+    def _has_params(self) -> bool:
+        """Returns whether this FSDP instance manages any parameters."""
+        return hasattr(self, "_handle") and self._handle is not None
+
+    @property
+    def _flat_param(self) -> Optional[FlatParameter]:
+        return self._handle.flat_param if self._handle else None
+
+    def __getattr__(self, name: str) -> Any:
+        """Forward missing attributes to the wrapped module."""
+        try:
+            return super().__getattr__(name)  # defer to nn.Module's logic
+        except AttributeError:
+            return getattr(self._fsdp_wrapped_module, name)
+
+    def __getitem__(self, key: int) -> Any:
+        """Forward indexing calls in case the module is an ``nn.Sequential``."""
+        if hasattr(self, FSDP_WRAPPED_MODULE):
+            return self._fsdp_wrapped_module.__getitem__(key)  # type: ignore[operator]
+        return super().__getitem__(key)
+
+    def check_is_root(self) -> bool:
+        """Check if this instance is a root FSDP module."""
+        return _is_fsdp_root(self, self)
+
+    @staticmethod
+    def fsdp_modules(
+        module: nn.Module,
+        root_only: bool = False,
+    ) -> List["FullyShardedDataParallel"]:
+        """Return all nested FSDP instances.
+
+        This possibly includes ``module`` itself and only includes FSDP root modules if ``root_only=True``.
+
+        Args:
+            module (torch.nn.Module): Root module, which may or may not be an
+                ``FSDP`` module.
+            root_only (bool): Whether to return only FSDP root modules.
+                (Default: ``False``)
+
+        Returns:
+            List[FullyShardedDataParallel]: FSDP modules that are nested in
+            the input ``module``.
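+
+        Example (illustrative sketch only; ``model`` is a placeholder for an
+        already-wrapped module)::
+
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> # Collect every nested FSDP instance, then only the roots
+            >>> all_fsdp = FSDP.fsdp_modules(model)
+            >>> root_fsdp = FSDP.fsdp_modules(model, root_only=True)
+            >>> print(len(all_fsdp), len(root_fsdp))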
+        """
+        if root_only:
+            return _get_fsdp_root_states(module)
+        return traversal_utils._get_fsdp_states(module)
+
+    def apply(self, fn: Callable[[nn.Module], None]) -> "FullyShardedDataParallel":
+        r"""Apply ``fn`` recursively to every submodule (as returned by ``.children()``) as well as self.
+
+        Typical use includes initializing the parameters of a model (see also :ref:`nn-init-doc`).
+
+        Compared to ``torch.nn.Module.apply``, this version additionally gathers
+        the full parameters before applying ``fn``. It should not be called from
+        within another ``summon_full_params`` context.
+
+        Args:
+            fn (:class:`Module` -> None): function to be applied to each submodule
+
+        Returns:
+            Module: self
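+
+        Example (illustrative sketch only; ``fsdp_model`` is a placeholder and
+        ``init_weights`` is a hypothetical helper)::
+
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> def init_weights(m: nn.Module):
+            >>>     # Re-initialize only the leaf modules of interest
+            >>>     if isinstance(m, nn.Linear):
+            >>>         nn.init.xavier_uniform_(m.weight)
+            >>> # Full parameters are gathered before init_weights runs
+            >>> fsdp_model.apply(init_weights)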
+        """
+        uninitialized = self._is_root is None
+        self._assert_state(TrainingState.IDLE)
+        # Use `_unshard_params_recurse()` with `recurse=False` instead of
+        # `_unshard_fsdp_state_params()` directly to perform lazy
+        # initialization, which is needed to initialize `FlatParameter`
+        # parameter attributes as required by the unshard logic
+        with _unshard_params_recurse(
+            self,
+            self,
+            recurse=False,
+            writeback=True,
+            rank0_only=False,
+            offload_to_cpu=False,
+            with_grads=False,
+        ):
+            ret = super().apply(fn)
+
+        # Reset lazy init called in `_unshard_params_recurse()` since `apply()`
+        # may have been called on an FSDP instance that is not truly a root, in
+        # which case it will be incorrectly marked as one.
+        if uninitialized and self._is_root:
+            for module in traversal_utils._get_fsdp_states(self):
+                module._reset_lazy_init()
+
+        return ret
+
+    def _mixed_precision_enabled_for_buffers(self) -> bool:
+        """Return whether the user explicitly enabled buffer mixed precision.
+
+        NOTE: Unlike parameters and gradient reduction, buffer mixed precision
+        is applied at the FSDP instance level, not the ``FlatParameter`` level,
+        which may be different for the composable code path.
+        """
+        return self.mixed_precision.buffer_dtype is not None
+
+    def _low_precision_hook_enabled(self) -> bool:
+        """Whether a low precision hook is registered or not."""
+        return self._comm_hook is not None and self._comm_hook in LOW_PRECISION_HOOKS
+
+    def _reset_lazy_init(self) -> None:
+        """Reset instance so :func:`_lazy_init` will run on the next forward."""
+        self._is_root: Optional[bool] = None
+
+    @staticmethod
+    def set_state_dict_type(
+        module: nn.Module,
+        state_dict_type: StateDictType,
+        state_dict_config: Optional[StateDictConfig] = None,
+        optim_state_dict_config: Optional[OptimStateDictConfig] = None,
+    ) -> StateDictSettings:
+        """Set the ``state_dict_type`` of all the descendant FSDP modules of the target module.
+
+        Also takes (optional) configuration for the model's and optimizer's state dict.
+        The target module does not have to be an FSDP module. If the target
+        module is an FSDP module, its ``state_dict_type`` will also be changed.
+
+        .. note:: This API should be called for only the top-level (root)
+            module.
+
+        .. note:: This API enables users to transparently use the conventional
+            ``state_dict`` API to take model checkpoints in cases where the
+            root FSDP module is wrapped by another ``nn.Module``. For example,
+            the following will ensure ``state_dict`` is called on all non-FSDP
+            instances, while dispatching into the ``sharded_state_dict``
+            implementation for FSDP:
+
+        Example::
+
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> model = DDP(FSDP(...))
+            >>> FSDP.set_state_dict_type(
+            >>>     model,
+            >>>     StateDictType.SHARDED_STATE_DICT,
+            >>>     state_dict_config = ShardedStateDictConfig(offload_to_cpu=True),
+            >>>     optim_state_dict_config = OptimStateDictConfig(offload_to_cpu=True),
+            >>> )
+            >>> param_state_dict = model.state_dict()
+            >>> optim_state_dict = FSDP.optim_state_dict(model, optim)
+
+        Args:
+            module (torch.nn.Module): Root module.
+            state_dict_type (StateDictType): the desired ``state_dict_type`` to set.
+            state_dict_config (Optional[StateDictConfig]): the configuration for the
+                target ``state_dict_type``.
+            optim_state_dict_config (Optional[OptimStateDictConfig]): the configuration
+                for the optimizer state dict.
+
+        Returns:
+            A ``StateDictSettings`` that includes the previous state_dict type
+            and configuration for the module.
+        """
+        _state_dict_type_to_config = {
+            StateDictType.FULL_STATE_DICT: FullStateDictConfig,
+            StateDictType.LOCAL_STATE_DICT: LocalStateDictConfig,
+            StateDictType.SHARDED_STATE_DICT: ShardedStateDictConfig,
+        }
+        _optim_state_dict_type_to_config = {
+            StateDictType.FULL_STATE_DICT: FullOptimStateDictConfig,
+            StateDictType.LOCAL_STATE_DICT: LocalOptimStateDictConfig,
+            StateDictType.SHARDED_STATE_DICT: ShardedOptimStateDictConfig,
+        }
+
+        # Use the default config if a state_dict config is not set.
+        state_dict_config_type = _state_dict_type_to_config[state_dict_type]
+        optim_state_dict_config_type = _optim_state_dict_type_to_config[state_dict_type]
+        if state_dict_config is None:
+            state_dict_config = state_dict_config_type()
+        if optim_state_dict_config is None:
+            optim_state_dict_config = optim_state_dict_config_type()
+        if state_dict_config_type != type(state_dict_config):
+            raise RuntimeError(
+                f"Expected state_dict_config of type {state_dict_config_type} "
+                f"but got {type(state_dict_config)}"
+            )
+        if optim_state_dict_config_type != type(optim_state_dict_config):
+            raise RuntimeError(
+                f"Expected optim_state_dict_config of type {optim_state_dict_config_type} "
+                f"but got {type(optim_state_dict_config)}"
+            )
+
+        # Set the state_dict type and configurations.
+        prev_state_dict_type = None
+        prev_state_dict_config = None
+        prev_optim_state_dict_config = None
+        for submodule in traversal_utils._get_fsdp_states(module):
+            if prev_state_dict_type is None:
+                prev_state_dict_type = submodule._state_dict_type
+            else:
+                assert (
+                    prev_state_dict_type == submodule._state_dict_type
+                ), "All FSDP modules should have the same state_dict_type."
+            if prev_state_dict_config is None:
+                prev_state_dict_config = submodule._state_dict_config
+            else:
+                assert isinstance(
+                    submodule._state_dict_config, type(prev_state_dict_config)
+                ), "All FSDP modules must have the same type of state_dict_config."
+            if prev_optim_state_dict_config is None:
+                prev_optim_state_dict_config = submodule._optim_state_dict_config
+            else:
+                assert isinstance(
+                    submodule._optim_state_dict_config,
+                    type(prev_optim_state_dict_config),
+                ), "All FSDP modules must have the same type of optim_state_dict_config."
+
+            submodule._state_dict_type = state_dict_type
+            submodule._state_dict_config = state_dict_config
+            submodule._optim_state_dict_config = optim_state_dict_config
+
+        return StateDictSettings(
+            prev_state_dict_type, prev_state_dict_config, prev_optim_state_dict_config
+        )
+
+    @staticmethod
+    def get_state_dict_type(module: nn.Module) -> StateDictSettings:
+        """Get the state_dict_type and the corresponding configurations for the FSDP modules rooted at ``module``.
+
+        The target module does not have to be an FSDP module.
+
+        Returns:
+            A ``StateDictSettings`` containing the state_dict_type and
+            state_dict / optim_state_dict configs that are currently set.
+
+        Raises:
+            ``AssertionError`` if the ``StateDictSettings`` for different
+            FSDP submodules differ.
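+
+        Example (illustrative sketch only; ``model`` is a placeholder)::
+
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> settings = FSDP.get_state_dict_type(model)
+            >>> print(settings.state_dict_type)
+            >>> print(settings.state_dict_config, settings.optim_state_dict_config)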
+        """
+        state_dict_settings: Optional[StateDictSettings] = None
+        for submodule in FullyShardedDataParallel.fsdp_modules(module):
+            if state_dict_settings is None:
+                state_dict_settings = StateDictSettings(
+                    state_dict_type=submodule._state_dict_type,
+                    state_dict_config=submodule._state_dict_config,
+                    optim_state_dict_config=submodule._optim_state_dict_config,
+                )
+                _set_optim_use_dtensor(submodule, state_dict_settings)
+            else:
+                submodule_settings = StateDictSettings(
+                    submodule._state_dict_type,
+                    submodule._state_dict_config,
+                    submodule._optim_state_dict_config,
+                )
+                assert state_dict_settings == submodule_settings, (
+                    "All FSDP modules must have the same state dict settings."
+                    f"Got {submodule_settings} and {state_dict_settings}."
+                )
+                _set_optim_use_dtensor(submodule, submodule_settings)
+        return state_dict_settings
+
+    @staticmethod
+    @contextlib.contextmanager
+    def state_dict_type(
+        module: nn.Module,
+        state_dict_type: StateDictType,
+        state_dict_config: Optional[StateDictConfig] = None,
+        optim_state_dict_config: Optional[OptimStateDictConfig] = None,
+    ) -> Generator:
+        """Set the ``state_dict_type`` of all the descendant FSDP modules of the target module.
+
+        This context manager has the same function as :meth:`set_state_dict_type`. See the documentation of
+        :meth:`set_state_dict_type` for details.
+
+        Example::
+
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> model = DDP(FSDP(...))
+            >>> with FSDP.state_dict_type(
+            >>>     model,
+            >>>     StateDictType.SHARDED_STATE_DICT,
+            >>> ):
+            >>>     checkpoint = model.state_dict()
+
+        Args:
+            module (torch.nn.Module): Root module.
+            state_dict_type (StateDictType): the desired ``state_dict_type`` to set.
+            state_dict_config (Optional[StateDictConfig]): the model ``state_dict``
+                configuration for the target ``state_dict_type``.
+            optim_state_dict_config (Optional[OptimStateDictConfig]): the optimizer
+               ``state_dict`` configuration for the target ``state_dict_type``.
+        """
+        prev_state_dict_settings = FullyShardedDataParallel.set_state_dict_type(
+            module,
+            state_dict_type,
+            state_dict_config,
+            optim_state_dict_config,
+        )
+        yield
+        FullyShardedDataParallel.set_state_dict_type(
+            module,
+            prev_state_dict_settings.state_dict_type,
+            prev_state_dict_settings.state_dict_config,
+            prev_state_dict_settings.optim_state_dict_config,
+        )
+
+    def forward(self, *args: Any, **kwargs: Any) -> Any:
+        """Run the forward pass for the wrapped module, inserting FSDP-specific pre- and post-forward sharding logic."""
+        handle = self._handle
+        with torch.autograd.profiler.record_function(
+            "FullyShardedDataParallel.forward"
+        ):
+            args, kwargs = _root_pre_forward(self, self, args, kwargs)
+            unused = None
+            args, kwargs = _pre_forward(
+                self,
+                handle,
+                _pre_forward_unshard,
+                self._fsdp_wrapped_module,
+                args,
+                kwargs,
+            )
+            if handle:
+                _p_assert(
+                    handle.flat_param.device == self.compute_device,
+                    "Expected `FlatParameter` to be on the compute device "
+                    f"{self.compute_device} but got {handle.flat_param.device}",
+                )
+            output = self._fsdp_wrapped_module(*args, **kwargs)
+            return _post_forward(
+                self, handle, _post_forward_reshard, self, unused, output
+            )
+
+    @staticmethod
+    @contextlib.contextmanager
+    def summon_full_params(
+        module: nn.Module,
+        recurse: bool = True,
+        writeback: bool = True,
+        rank0_only: bool = False,
+        offload_to_cpu: bool = False,
+        with_grads: bool = False,
+    ) -> Generator:
+        r"""Expose full params for FSDP instances with this context manager.
+
+        Can be useful *after* forward/backward for a model to get
+        the params for additional processing or checking. It can take a non-FSDP
+        module and will summon full params for all contained FSDP modules as
+        well as their children, depending on the ``recurse`` argument.
+
+        .. note:: This can be used on inner FSDPs.
+        .. note:: This can *not* be used within a forward or backward pass. Nor
+            can forward and backward be started from within this context.
+        .. note:: Parameters will revert to their local shards after the context
+            manager exits; the storage behavior is the same as after forward.
+        .. note:: The full parameters can be modified, but only the portion
+            corresponding to the local param shard will persist after the
+            context manager exits (unless ``writeback=False``, in which case
+            changes will be discarded). In the case where FSDP does not shard
+            the parameters, currently only when ``world_size == 1`` or with the
+            ``NO_SHARD`` config, the modification is persisted regardless of ``writeback``.
+        .. note:: This method works on modules which are not FSDP themselves but
+            may contain multiple independent FSDP units. In that case, the given
+            arguments will apply to all contained FSDP units.
+
+        .. warning:: Note that ``rank0_only=True`` in conjunction with
+            ``writeback=True`` is not currently supported and will raise an
+            error. This is because model parameter shapes would be different
+            across ranks within the context, and writing to them can lead to
+            inconsistency across ranks when the context is exited.
+
+        .. warning:: Note that ``offload_to_cpu`` and ``rank0_only=False`` will
+            result in full parameters being redundantly copied to CPU memory for
+            GPUs that reside on the same machine, which may incur the risk of
+            CPU OOM. It is recommended to use ``offload_to_cpu`` with
+            ``rank0_only=True``.
+
+        Args:
+            recurse (bool, Optional): recursively summon all params for nested
+                FSDP instances (default: True).
+            writeback (bool, Optional): if ``False``, modifications to params are
+                discarded after the context manager exits;
+                disabling this can be slightly more efficient (default: True)
+            rank0_only (bool, Optional): if ``True``, full parameters are
+                materialized on only global rank 0. This means that within the
+                context, only rank 0 will have full parameters and the other
+                ranks will have sharded parameters. Note that setting
+                ``rank0_only=True`` with ``writeback=True`` is not supported,
+                as model parameter shapes will be different across ranks
+                within the context, and writing to them can lead to
+                inconsistency across ranks when the context is exited.
+            offload_to_cpu (bool, Optional): If ``True``, full parameters are
+                offloaded to CPU. Note that this offloading currently only
+                occurs if the parameter is sharded (which is not the case only
+                for ``world_size == 1`` or ``NO_SHARD`` config). It is recommended
+                to use ``offload_to_cpu`` with ``rank0_only=True`` to avoid
+                redundant copies of model parameters being offloaded to the same CPU memory.
+            with_grads (bool, Optional): If ``True``, gradients are also
+                unsharded with the parameters. Currently, this is only
+                supported when passing ``use_orig_params=True`` to the FSDP
+                constructor and ``offload_to_cpu=False`` to this method.
+                (Default: ``False``)
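+
+        Example (illustrative sketch only; ``fsdp_model`` is a placeholder)::
+
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> with FSDP.summon_full_params(fsdp_model, writeback=False):
+            >>>     # Unsharded parameters are visible here for inspection;
+            >>>     # changes are discarded because writeback=False
+            >>>     total_numel = sum(p.numel() for p in fsdp_model.parameters())
+            >>> # Parameters revert to their local shards on exit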
+        """
+        with _unshard_params(
+            module, recurse, writeback, rank0_only, offload_to_cpu, with_grads
+        ):
+            yield
+
+    @contextlib.contextmanager
+    def _deregister_orig_params_ctx(self):
+        """Deregister the original parameters and expose the :class:`FlatParameter`.
+
+        If a :class:`FlatParameter` is sharded, then
+        this refreshes the sharded views before exiting. This method should
+        only be called when using the original parameters.
+        """
+        _p_assert(
+            self._use_orig_params,
+            "`_deregister_orig_params_ctx()` should only be called when "
+            "`_use_orig_params=True`",
+        )
+        for fsdp_module in traversal_utils._get_fsdp_states(self):
+            _deregister_orig_params(fsdp_module, fsdp_module)
+        try:
+            yield
+        finally:
+            for fsdp_module in traversal_utils._get_fsdp_states(self):
+                _register_orig_params(fsdp_module, fsdp_module)
+
+    def _apply(self, *args, **kwargs):
+        """Deregister the original parameters and expose the :class:`FlatParameter` s before calling ``_apply()``."""
+        # When using the original parameters: Since (1) the `FlatParameter`s
+        # own the storage and (2) `_apply()` is the subroutine underlying the
+        # most common storage-changing ops like `to()` and `cuda()`, we
+        # override `_apply()` to have the storage change directly performed on
+        # the `FlatParameter`s instead of applying to the original parameters
+        # and then writing back to the `FlatParameter`s.
+        context = (
+            self._deregister_orig_params_ctx()
+            if self._use_orig_params
+            else contextlib.nullcontext()
+        )
+        with context:
+            return super()._apply(*args, **kwargs)
+
+    def named_buffers(
+        self,
+        *args,
+        **kwargs,
+    ) -> Iterator[Tuple[str, torch.Tensor]]:
+        """Return an iterator over module buffers, yielding both the name of the buffer and the buffer itself.
+
+        Intercepts buffer names and removes all occurrences of the FSDP-specific flattened buffer prefix
+        when inside the :meth:`summon_full_params` context manager.
+        """
+        should_clean_name = self.training_state == TrainingState.SUMMON_FULL_PARAMS
+        for buffer_name, buffer in super().named_buffers(*args, **kwargs):
+            if should_clean_name:
+                # Remove any instances of the FSDP-specific prefix; there can
+                # be multiple in the case of nested FSDP modules
+                buffer_name = buffer_name.replace(FSDP_PREFIX, "")
+            yield (buffer_name, buffer)
+
+    def named_parameters(
+        self,
+        *args,
+        **kwargs,
+    ) -> Iterator[Tuple[str, torch.nn.Parameter]]:
+        """Return an iterator over module parameters, yielding both the name of the parameter and the parameter itself.
+
+        Intercepts parameter names and removes all occurrences of the FSDP-specific flattened parameter prefix
+        when inside the :meth:`summon_full_params` context manager.
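+
+        Example (illustrative sketch only; ``fsdp_model`` is a placeholder)::
+
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> with FSDP.summon_full_params(fsdp_model):
+            >>>     # Names are yielded without the FSDP-specific prefix here
+            >>>     clean_names = [name for name, _ in fsdp_model.named_parameters()]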
+        """
+        should_clean_name = self.training_state == TrainingState.SUMMON_FULL_PARAMS
+        for param_name, param in super().named_parameters(*args, **kwargs):
+            if should_clean_name:
+                # Remove any instances of the FSDP-specific prefix; there can
+                # be multiple in the case of nested FSDP modules
+                param_name = param_name.replace(FSDP_PREFIX, "")
+            yield (param_name, param)
+
+    def _assert_state(self, state: Union[TrainingState, List[TrainingState]]) -> None:
+        """Assert we are in the given state."""
+        # Since assert can be turned off and this error checking
+        # is really important, we use explicit error checking
+        # and raise a ValueError if needed.
+        if isinstance(state, TrainingState):
+            state = [state]
+        if self.training_state not in state:
+            msg = (
+                f"expected to be in states {state} but current state "
+                f"is {self.training_state}"
+            )
+            # In case we are failing in the context of autograd hook, asserting
+            # may not generate useful msg. So, let's print it to be sure.
+            if self.rank == 0:
+                print(f"Asserting FSDP instance is: {self}")
+                print(f"ERROR: {msg}")
+                traceback.print_stack()
+            raise ValueError(msg)
+
+    @contextmanager
+    def no_sync(self) -> Generator:
+        """Disable gradient synchronizations across FSDP instances.
+
+        Within this context, gradients will be accumulated in module
+        variables, which will later be synchronized in the first
+        forward-backward pass after exiting the context. This should only be
+        used on the root FSDP instance and will recursively apply to all
+        children FSDP instances.
+
+        .. note:: This likely results in higher memory usage because FSDP will
+            accumulate the full model gradients (instead of gradient shards)
+            until the eventual sync.
+
+        .. note:: When used with CPU offloading, the gradients will not be
+            offloaded to CPU when inside the context manager. Instead, they
+            will only be offloaded right after the eventual sync.
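+
+        Example (gradient accumulation sketch; ``fsdp_model``, ``optim``, and
+        ``batches`` are placeholders)::
+
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> # Accumulate unsynced gradients for all but the last micro-batch
+            >>> with fsdp_model.no_sync():
+            >>>     for batch in batches[:-1]:
+            >>>         fsdp_model(batch).sum().backward()
+            >>> # The backward outside the context triggers gradient synchronization
+            >>> fsdp_model(batches[-1]).sum().backward()
+            >>> optim.step()
+            >>> optim.zero_grad()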
+        """
+        _lazy_init(self, self)
+        if not self._is_root:
+            raise RuntimeError(
+                "`no_sync()` on inner FSDP instances is not supported. Please call `no_sync()` on root FSDP module."
+            )
+        self._assert_state(TrainingState.IDLE)
+        old_flags = []
+        for m in self.modules():
+            if isinstance(m, FullyShardedDataParallel):
+                old_flags.append((m, m._sync_gradients))
+                m._sync_gradients = False
+        try:
+            yield
+        finally:
+            for m, old_flag in old_flags:
+                assert not m._sync_gradients, (
+                    "`_sync_gradients` was incorrectly set to "
+                    "`True` while in the `no_sync()` context manager"
+                )
+                m._sync_gradients = old_flag
+
+    @torch.no_grad()
+    def clip_grad_norm_(
+        self, max_norm: Union[float, int], norm_type: Union[float, int] = 2.0
+    ) -> torch.Tensor:
+        """Clip the gradient norm of all parameters.
+
+        The norm is computed over all parameters' gradients as viewed as a single vector, and the
+        gradients are modified in-place.
+
+        Args:
+            max_norm (float or int): max norm of the gradients
+            norm_type (float or int): type of the used p-norm. Can be ``'inf'``
+                for infinity norm.
+
+        Returns:
+            Total norm of the parameters (viewed as a single vector).
+
+        .. note:: If every FSDP instance uses ``NO_SHARD``, meaning that no
+            gradients are sharded across ranks, then you may directly use
+            :func:`torch.nn.utils.clip_grad_norm_`.
+
+        .. note:: If at least some FSDP instance uses a sharded strategy (i.e.
+            one other than ``NO_SHARD``), then you should use this method
+            instead of :func:`torch.nn.utils.clip_grad_norm_` since this method
+            handles the fact that gradients are sharded across ranks.
+
+        .. note:: The total norm returned will have the "largest" dtype across
+            all parameters/gradients as defined by PyTorch's type promotion
+            semantics. For example, if *all* parameters/gradients use a low
+            precision dtype, then the returned norm's dtype will be that low
+            precision dtype, but if there exists at least one parameter/
+            gradient using FP32, then the returned norm's dtype will be FP32.
+
+        .. warning:: This needs to be called on all ranks since it uses
+            collective communications.
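+
+        Example (illustrative training-loop sketch; ``fsdp_model``, ``optim``,
+        and ``loss`` are placeholders)::
+
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> loss.backward()
+            >>> # Must run on all ranks; sharded gradients are handled internally
+            >>> total_norm = fsdp_model.clip_grad_norm_(max_norm=1.0)
+            >>> optim.step()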
+        """
+        _lazy_init(self, self)
+        if not self._is_root:
+            raise RuntimeError(
+                "`clip_grad_norm_()` should only be called on the root FSDP instance"
+            )
+        self._assert_state(TrainingState.IDLE)
+        # If every FSDP instance uses `NO_SHARD`, then we can directly use
+        # the normal `nn.utils` one targeting local gradients
+        all_no_shard = all(
+            not handle.uses_sharded_strategy for handle in self._all_handles
+        )
+        if all_no_shard:
+            return torch.nn.utils.clip_grad_norm_(
+                self.parameters(), max_norm, norm_type
+            )
+        # Otherwise, there exists some FSDP instance using a sharded strategy,
+        # where sharded and non-sharded parameters must be handled separately
+        max_norm = float(max_norm)
+        norm_type = float(norm_type)
+        sharded_params = set()
+        nonsharded_params = set()  # `NO_SHARD` or not FSDP-managed
+        grads: List[torch.Tensor] = []
+        for handle in self._all_handles:
+            target_set = (
+                sharded_params if handle.uses_sharded_strategy else nonsharded_params
+            )
+            if handle._use_orig_params:
+                for param in handle.flat_param._params:
+                    target_set.add(param)
+                    if param.grad is not None:
+                        grads.append(param.grad)
+            else:
+                target_set.add(handle.flat_param)
+                if handle.flat_param.grad is not None:
+                    grads.append(handle.flat_param.grad)
+        for param in self.parameters():
+            not_fsdp_managed = (
+                param not in sharded_params and param not in nonsharded_params
+            )
+            if not_fsdp_managed:
+                nonsharded_params.add(param)
+                if param.grad is not None:
+                    grads.append(param.grad)
+        # Compute local norms (forced to be in FP32)
+        local_sharded_norm = _get_grad_norm(sharded_params, norm_type).to(
+            self.compute_device
+        )
+        local_nonsharded_norm = _get_grad_norm(nonsharded_params, norm_type).to(
+            self.compute_device
+        )
+        # Reconstruct the total gradient norm depending on the norm type
+        if norm_type == math.inf:
+            total_norm = torch.maximum(local_sharded_norm, local_nonsharded_norm)
+            dist.all_reduce(
+                total_norm, op=torch.distributed.ReduceOp.MAX, group=self.process_group
+            )
+        else:
+            total_norm = local_sharded_norm**norm_type
+            dist.all_reduce(total_norm, group=self.process_group)
+            # All-reducing the local non-sharded norm would count it an extra
+            # world-size-many times
+            total_norm += local_nonsharded_norm**norm_type
+            total_norm = total_norm ** (1.0 / norm_type)
+        if self.cpu_offload.offload_params:
+            total_norm = total_norm.cpu()
+
+        clip_coef = max_norm / (total_norm + 1e-6)
+        # Multiplying by the clamped coefficient is meaningless when it is
+        # equal to 1, but it avoids the host-device sync that would result from
+        # `if clip_coef < 1`
+        clip_coef_clamped = torch.clamp(clip_coef, max=1.0)
+        for grad in grads:
+            grad.mul_(clip_coef_clamped.to(grad.device, grad.dtype))
+        # Use the "largest" dtype by type promotion semantics to use the same
+        # dtype as if we did not force local norm computation to be in FP32
+        if len(grads) == 0:
+            # If this rank has no gradients, then we must default to FP32
+            # unless we use additional communication, which we prefer to avoid
+            # since `clip_grad_norm_()` is called in the training loop
+            warnings.warn(
+                f"Called FSDP.clip_grad_norm_() on rank {self.rank} with no "
+                "gradients -- returning the total norm in the default dtype "
+                f"{total_norm.dtype}"
+            )  # warn since this is generally unexpected
+            return total_norm
+        total_norm_dtype = functools.reduce(
+            torch.promote_types,
+            [grad.dtype for grad in grads],
+        )
+        return total_norm.to(total_norm_dtype)
+
+    @staticmethod
+    def _warn_optim_input(optim_input):
+        if optim_input is not None:
+            warnings.warn(
+                "The `optim_input` argument is deprecated and will be removed after PyTorch 1.13. You may remove it "
+                "from your code without changing its functionality."
+            )
+
+    @staticmethod
+    def _is_using_optim_input(optim_input, optim) -> bool:
+        if optim_input is None and optim is None:
+            # Use the default behavior of `optim_input`
+            return True
+        if optim_input is not None:
+            # Use the `optim_input` code path
+            return True
+        # Use the `optim` code path
+        return False
+
+    @staticmethod
+    def _warn_legacy_optim_state_dict(curr: str, new: str):
+        warnings.warn(
+            f"``FullyShardedDataParallel.{curr}``is being deprecated and is "
+            f"replaced by ``FullyShardedDataParallel.{new}``. "
+            f"``FullyShardedDataParallel.{curr}`` may be removed after PyTorch 2.2."
+        )
+
+    @staticmethod
+    def _optim_state_dict_impl(
+        model: torch.nn.Module,
+        optim: torch.optim.Optimizer,
+        optim_state_dict: Dict[str, Any],
+        optim_input: Optional[
+            Union[
+                List[Dict[str, Any]],
+                Iterable[torch.nn.Parameter],
+            ]
+        ] = None,
+        rank0_only: bool = True,
+        full_state_dict: bool = True,
+        group: Optional[dist.ProcessGroup] = None,
+        cpu_offload: bool = True,
+    ) -> Dict[str, Any]:
+        """Transform the state-dict of an optimizer corresponding to a sharded model.
+
+        This is the internal API that is used by all the optim_state_dict implementations.
+        Given model, optim, the original optim_state_dict, this API removes the
+        FSDP internal information and internal sharding from the optim_state_dict.
+        """
+        if full_state_dict:
+            FullyShardedDataParallel._warn_optim_input(optim_input)
+            using_optim_input = FullyShardedDataParallel._is_using_optim_input(
+                optim_input,
+                optim,
+            )
+        else:
+            using_optim_input = False
+            assert optim_input is None and not rank0_only
+
+        use_orig_params = FullyShardedDataParallel.fsdp_modules(model)[
+            0
+        ]._use_orig_params
+        assert all(
+            use_orig_params == m._use_orig_params
+            for m in FullyShardedDataParallel.fsdp_modules(model)
+        ), "Not all FSDP modules have the same _use_orig_params value"
+
+        return _optim_state_dict(
+            model=model,
+            optim=optim,
+            optim_state_dict=optim_state_dict,
+            optim_input=optim_input,
+            rank0_only=rank0_only,
+            shard_state=not full_state_dict,
+            group=group,
+            using_optim_input=using_optim_input,
+            use_orig_params=use_orig_params,
+            cpu_offload=cpu_offload,
+        )
+
+    @staticmethod
+    def _optim_state_dict_to_load_impl(
+        optim_state_dict: Dict[str, Any],
+        model: torch.nn.Module,
+        optim_input: Optional[
+            Union[
+                List[Dict[str, Any]],
+                Iterable[torch.nn.Parameter],
+            ]
+        ] = None,
+        optim: Optional[torch.optim.Optimizer] = None,
+        full_state_dict: bool = True,
+        rank0_only: bool = False,
+        is_named_optimizer: bool = False,
+        group: Optional[dist.ProcessGroup] = None,
+    ) -> Dict[str, Any]:
+        """
+        Convert an optimizer state-dict so that it can be loaded into the optimizer associated with the FSDP model.
+
+        This is the internal API that is used by all the load optim_state_dict implementations.
+        Given model, optim, and the saved optim_state_dict, this API adds the FSDP
+        internal information and internal sharding to the optim_state_dict.
+        """
+        if full_state_dict:
+            FullyShardedDataParallel._warn_optim_input(optim_input)
+            using_optim_input = FullyShardedDataParallel._is_using_optim_input(
+                optim_input,
+                optim,
+            )
+        else:
+            using_optim_input = False
+            assert optim_input is None and not rank0_only
+
+        use_orig_params = FullyShardedDataParallel.fsdp_modules(model)[
+            0
+        ]._use_orig_params
+        assert all(
+            use_orig_params == m._use_orig_params
+            for m in FullyShardedDataParallel.fsdp_modules(model)
+        ), "Not all FSDP modules have the same _use_orig_params value"
+
+        if rank0_only and dist.get_rank(group) > 0:
+            optim_state_dict = {}
+        sharded_osd = _flatten_optim_state_dict(
+            optim_state_dict,
+            model=model,
+            use_orig_params=use_orig_params,
+            optim=(optim if is_named_optimizer else None),
+            rank0_only=rank0_only,
+            group=group,
+        )
+        return _rekey_sharded_optim_state_dict(
+            sharded_osd,
+            model=model,
+            optim=optim,
+            optim_input=optim_input,
+            using_optim_input=using_optim_input,
+            is_named_optimizer=is_named_optimizer,
+        )
+
+    @staticmethod
+    def full_optim_state_dict(
+        model: torch.nn.Module,
+        optim: torch.optim.Optimizer,
+        optim_input: Optional[
+            Union[
+                List[Dict[str, Any]],
+                Iterable[torch.nn.Parameter],
+            ]
+        ] = None,
+        rank0_only: bool = True,
+        group: Optional[dist.ProcessGroup] = None,
+    ) -> Dict[str, Any]:
+        """Return the full optimizer state-dict.
+
+        Consolidates the full optimizer state on rank 0 and returns it
+        as a :class:`dict` following the convention of
+        :meth:`torch.optim.Optimizer.state_dict`, i.e. with keys ``"state"``
+        and ``"param_groups"``. The flattened parameters in ``FSDP`` modules
+        contained in ``model`` are mapped back to their unflattened parameters.
+
+        .. warning:: This needs to be called on all ranks since it uses
+            collective communications. However, if ``rank0_only=True``, then
+            the state dict is only populated on rank 0, and all other ranks
+            return an empty :class:`dict`.
+
+        .. warning:: Unlike ``torch.optim.Optimizer.state_dict()``, this method
+            uses full parameter names as keys instead of parameter IDs.
+
+        .. note:: Like in :meth:`torch.optim.Optimizer.state_dict`, the tensors
+            contained in the optimizer state dict are not cloned, so there may
+            be aliasing surprises. For best practices, consider saving the
+            returned optimizer state dict immediately, e.g. using
+            ``torch.save()``.
+
+        Args:
+            model (torch.nn.Module): Root module (which may or may not be a
+                :class:`FullyShardedDataParallel` instance) whose parameters
+                were passed into the optimizer ``optim``.
+            optim (torch.optim.Optimizer): Optimizer for ``model`` 's
+                parameters.
+            optim_input (Optional[Union[List[Dict[str, Any]], Iterable[torch.nn.Parameter]]]):
+                Input passed into the optimizer ``optim`` representing either a
+                :class:`list` of parameter groups or an iterable of parameters;
+                if ``None``, then this method assumes the input was
+                ``model.parameters()``. This argument is deprecated, and there
+                is no need to pass it in anymore. (Default: ``None``)
+            rank0_only (bool): If ``True``, saves the populated :class:`dict`
+                only on rank 0; if ``False``, saves it on all ranks. (Default:
+                ``True``)
+            group (dist.ProcessGroup): Model's process group or ``None`` if using
+                the default process group. (Default: ``None``)
+
+        Returns:
+            Dict[str, Any]: A :class:`dict` containing the optimizer state for
+            ``model`` 's original unflattened parameters and including keys
+            "state" and "param_groups" following the convention of
+            :meth:`torch.optim.Optimizer.state_dict`. If ``rank0_only=True``,
+            then nonzero ranks return an empty :class:`dict`.
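+
+        Example (illustrative sketch only; ``model``, ``optim``, and ``PATH``
+        are placeholders)::
+
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> full_osd = FSDP.full_optim_state_dict(model, optim)
+            >>> # With the default rank0_only=True, only rank 0 holds the dict
+            >>> if dist.get_rank() == 0:
+            >>>     torch.save(full_osd, PATH)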
+        """
+        FullyShardedDataParallel._warn_legacy_optim_state_dict(
+            "full_optim_state_dict", "optim_state_dict"
+        )
+        return FullyShardedDataParallel._optim_state_dict_impl(
+            model=model,
+            optim=optim,
+            optim_state_dict=optim.state_dict(),
+            optim_input=optim_input,
+            rank0_only=rank0_only,
+            group=group,
+            full_state_dict=True,
+        )
+
+    @staticmethod
+    def sharded_optim_state_dict(
+        model: torch.nn.Module,
+        optim: torch.optim.Optimizer,
+        group: Optional[dist.ProcessGroup] = None,
+    ) -> Dict[str, Any]:
+        """Return the optimizer state-dict in its sharded form.
+
+        The API is similar to :meth:`full_optim_state_dict` but this API chunks
+        all non-zero-dimension states to :class:`ShardedTensor` to save memory.
+        This API should only be used when the model ``state_dict`` is derived
+        with the context manager ``with state_dict_type(SHARDED_STATE_DICT):``.
+
+        For the detailed usage, refer to :meth:`full_optim_state_dict`.
+
+        .. warning:: The returned state dict contains ``ShardedTensor`` and
+            cannot be directly used by the regular ``optim.load_state_dict``.
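+
+        Example (illustrative sketch only; ``model`` and ``optim`` are
+        placeholders)::
+
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> with FSDP.state_dict_type(model, StateDictType.SHARDED_STATE_DICT):
+            >>>     model_state = model.state_dict()
+            >>>     optim_state = FSDP.sharded_optim_state_dict(model, optim)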
+        """
+        FullyShardedDataParallel._warn_legacy_optim_state_dict(
+            "sharded_optim_state_dict", "optim_state_dict"
+        )
+        return FullyShardedDataParallel._optim_state_dict_impl(
+            model=model,
+            optim=optim,
+            optim_state_dict=optim.state_dict(),
+            optim_input=None,
+            rank0_only=False,
+            full_state_dict=False,
+            group=group,
+        )
+
+    @staticmethod
+    def shard_full_optim_state_dict(
+        full_optim_state_dict: Dict[str, Any],
+        model: torch.nn.Module,
+        optim_input: Optional[
+            Union[
+                List[Dict[str, Any]],
+                Iterable[torch.nn.Parameter],
+            ]
+        ] = None,
+        optim: Optional[torch.optim.Optimizer] = None,
+    ) -> Dict[str, Any]:
+        """Shard a full optimizer state-dict.
+
+        Remaps the state in ``full_optim_state_dict`` to flattened parameters instead of unflattened
+        parameters and restricts to only this rank's part of the optimizer state.
+        The first argument should be the return value of :meth:`full_optim_state_dict`.
+
+        Example::
+
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+            >>> model, optim = ...
+            >>> full_osd = FSDP.full_optim_state_dict(model, optim)
+            >>> torch.save(full_osd, PATH)
+            >>> # Define new model with possibly different world size
+            >>> new_model, new_optim = ...
+            >>> full_osd = torch.load(PATH)
+            >>> sharded_osd = FSDP.shard_full_optim_state_dict(full_osd, new_model)
+            >>> new_optim.load_state_dict(sharded_osd)
+
+        .. note:: Both :meth:`shard_full_optim_state_dict` and
+            :meth:`scatter_full_optim_state_dict` may be used to get the
+            sharded optimizer state dict to load. Assuming that the full
+            optimizer state dict resides in CPU memory, the former requires
+            each rank to have the full dict in CPU memory, where each rank
+            individually shards the dict without any communication, while the
+            latter requires only rank 0 to have the full dict in CPU memory,
+            where rank 0 moves each shard to GPU memory (for NCCL) and
+            communicates it to ranks appropriately. Hence, the former has
+            higher aggregate CPU memory cost, while the latter has higher
+            communication cost.
+
+        Args:
+            full_optim_state_dict (Dict[str, Any]): Optimizer state dict
+                corresponding to the unflattened parameters and holding the
+                full non-sharded optimizer state.
+            model (torch.nn.Module): Root module (which may or may not be a
+                :class:`FullyShardedDataParallel` instance) whose parameters
+                correspond to the optimizer state in ``full_optim_state_dict``.
+            optim_input (Optional[Union[List[Dict[str, Any]], Iterable[torch.nn.Parameter]]]):
+                Input passed into the optimizer representing either a
+                :class:`list` of parameter groups or an iterable of parameters;
+                if ``None``, then this method assumes the input was
+                ``model.parameters()``. This argument is deprecated, and there
+                is no need to pass it in anymore. (Default: ``None``)
+            optim (Optional[torch.optim.Optimizer]): Optimizer that will load
+                the state dict returned by this method. This is the preferred
+                argument to use over ``optim_input``. (Default: ``None``)
+
+        Returns:
+            Dict[str, Any]: The full optimizer state dict now remapped to
+            flattened parameters instead of unflattened parameters and
+            restricted to only include this rank's part of the optimizer state.
+        """
+        FullyShardedDataParallel._warn_legacy_optim_state_dict(
+            "shard_full_optim_state_dict", "optim_state_dict_to_load"
+        )
+        return FullyShardedDataParallel._optim_state_dict_to_load_impl(
+            optim_state_dict=full_optim_state_dict,
+            model=model,
+            optim_input=optim_input,
+            optim=optim,
+            full_state_dict=True,
+            is_named_optimizer=False,
+        )
+
+    @staticmethod
+    def flatten_sharded_optim_state_dict(
+        sharded_optim_state_dict: Dict[str, Any],
+        model: torch.nn.Module,
+        optim: torch.optim.Optimizer,
+    ) -> Dict[str, Any]:
+        """Flatten a sharded optimizer state-dict.
+
+        The API is similar to :meth:`shard_full_optim_state_dict`. The only
+        difference is that the input ``sharded_optim_state_dict`` should be
+        returned from :meth:`sharded_optim_state_dict`. Therefore, there will
+        be all-gather calls on each rank to gather ``ShardedTensor`` s.
+
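+        Example (a minimal illustrative sketch, not part of the original docs;
+        ``model`` and ``optim`` are assumed to be an FSDP-wrapped module and its
+        optimizer constructed elsewhere)::
+
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+            >>> from torch.distributed.fsdp import StateDictType
+            >>> with FSDP.state_dict_type(model, StateDictType.SHARDED_STATE_DICT):
+            >>>     sharded_osd = FSDP.sharded_optim_state_dict(model, optim)
+            >>> # ... save and reload the sharded dict with a distributed checkpointing tool ...
+            >>> flat_osd = FSDP.flatten_sharded_optim_state_dict(sharded_osd, model, optim)
+            >>> optim.load_state_dict(flat_osd)
+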
+        Args:
+            sharded_optim_state_dict (Dict[str, Any]): Optimizer state dict
+                corresponding to the unflattened parameters and holding the
+                sharded optimizer state.
+            model (torch.nn.Module):
+                Refer to :meth:`shard_full_optim_state_dict`.
+            optim (torch.optim.Optimizer): Optimizer for ``model`` 's
+                parameters.
+
+        Returns:
+            Refer to :meth:`shard_full_optim_state_dict`.
+        """
+        FullyShardedDataParallel._warn_legacy_optim_state_dict(
+            "flatten_sharded_optim_state_dict", "optim_state_dict_to_load"
+        )
+        return FullyShardedDataParallel._optim_state_dict_to_load_impl(
+            optim_state_dict=sharded_optim_state_dict,
+            model=model,
+            optim_input=None,
+            optim=optim,
+            full_state_dict=False,
+            is_named_optimizer=False,
+        )
+
+    @staticmethod
+    def scatter_full_optim_state_dict(
+        full_optim_state_dict: Optional[Dict[str, Any]],
+        model: torch.nn.Module,
+        optim_input: Optional[
+            Union[
+                List[Dict[str, Any]],
+                Iterable[torch.nn.Parameter],
+            ]
+        ] = None,
+        optim: Optional[torch.optim.Optimizer] = None,
+        group: Optional[Any] = None,
+    ) -> Dict[str, Any]:
+        """Scatter the full optimizer state dict from rank 0 to all other ranks.
+
+        Returns the sharded optimizer state dict on each rank.
+        The return value is the same as :meth:`shard_full_optim_state_dict`, and on rank
+        0, the first argument should be the return value of
+        :meth:`full_optim_state_dict`.
+
+        Example::
+
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+            >>> model, optim = ...
+            >>> full_osd = FSDP.full_optim_state_dict(model, optim)  # only non-empty on rank 0
+            >>> # Define new model with possibly different world size
+            >>> new_model, new_optim, new_group = ...
+            >>> sharded_osd = FSDP.scatter_full_optim_state_dict(full_osd, new_model, group=new_group)
+            >>> new_optim.load_state_dict(sharded_osd)
+
+        .. note:: Both :meth:`shard_full_optim_state_dict` and
+            :meth:`scatter_full_optim_state_dict` may be used to get the
+            sharded optimizer state dict to load. Assuming that the full
+            optimizer state dict resides in CPU memory, the former requires
+            each rank to have the full dict in CPU memory, where each rank
+            individually shards the dict without any communication, while the
+            latter requires only rank 0 to have the full dict in CPU memory,
+            where rank 0 moves each shard to GPU memory (for NCCL) and
+            communicates it to ranks appropriately. Hence, the former has
+            higher aggregate CPU memory cost, while the latter has higher
+            communication cost.
+
+        Args:
+            full_optim_state_dict (Optional[Dict[str, Any]]): Optimizer state
+                dict corresponding to the unflattened parameters and holding
+                the full non-sharded optimizer state if on rank 0; the argument
+                is ignored on nonzero ranks.
+            model (torch.nn.Module): Root module (which may or may not be a
+                :class:`FullyShardedDataParallel` instance) whose parameters
+                correspond to the optimizer state in ``full_optim_state_dict``.
+            optim_input (Optional[Union[List[Dict[str, Any]], Iterable[torch.nn.Parameter]]]):
+                Input passed into the optimizer representing either a
+                :class:`list` of parameter groups or an iterable of parameters;
+                if ``None``, then this method assumes the input was
+                ``model.parameters()``. This argument is deprecated, and there
+                is no need to pass it in anymore. (Default: ``None``)
+            optim (Optional[torch.optim.Optimizer]): Optimizer that will load
+                the state dict returned by this method. This is the preferred
+                argument to use over ``optim_input``. (Default: ``None``)
+            group (dist.ProcessGroup): Model's process group or ``None`` if
+                using the default process group. (Default: ``None``)
+
+        Returns:
+            Dict[str, Any]: The full optimizer state dict now remapped to
+            flattened parameters instead of unflattened parameters and
+            restricted to only include this rank's part of the optimizer state.
+        """
+        FullyShardedDataParallel._warn_legacy_optim_state_dict(
+            "scatter_full_optim_state_dict", "optim_state_dict_to_load"
+        )
+        return FullyShardedDataParallel._optim_state_dict_to_load_impl(
+            optim_state_dict=full_optim_state_dict,
+            model=model,
+            optim_input=optim_input,
+            optim=optim,
+            full_state_dict=True,
+            rank0_only=True,
+            is_named_optimizer=False,
+            group=group,
+        )
+
+    @staticmethod
+    def rekey_optim_state_dict(
+        optim_state_dict: Dict[str, Any],
+        optim_state_key_type: OptimStateKeyType,
+        model: torch.nn.Module,
+        optim_input: Optional[
+            Union[
+                List[Dict[str, Any]],
+                Iterable[torch.nn.Parameter],
+            ]
+        ] = None,
+        optim: Optional[torch.optim.Optimizer] = None,
+    ) -> Dict[str, Any]:
+        """Re-keys the optimizer state dict ``optim_state_dict`` to use the key type ``optim_state_key_type``.
+
+        This can be used to achieve compatibility between optimizer state dicts from models with FSDP
+        instances and ones without.
+
+        To re-key an FSDP full optimizer state dict (i.e. from
+        :meth:`full_optim_state_dict`) to use parameter IDs and be loadable to
+        a non-wrapped model::
+
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> wrapped_model, wrapped_optim = ...
+            >>> full_osd = FSDP.full_optim_state_dict(wrapped_model, wrapped_optim)
+            >>> nonwrapped_model, nonwrapped_optim = ...
+            >>> rekeyed_osd = FSDP.rekey_optim_state_dict(full_osd, OptimStateKeyType.PARAM_ID, nonwrapped_model)
+            >>> nonwrapped_optim.load_state_dict(rekeyed_osd)
+
+        To re-key a normal optimizer state dict from a non-wrapped model to be
+        loadable to a wrapped model::
+
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> nonwrapped_model, nonwrapped_optim = ...
+            >>> osd = nonwrapped_optim.state_dict()
+            >>> rekeyed_osd = FSDP.rekey_optim_state_dict(osd, OptimStateKeyType.PARAM_NAME, nonwrapped_model)
+            >>> wrapped_model, wrapped_optim = ...
+            >>> sharded_osd = FSDP.shard_full_optim_state_dict(rekeyed_osd, wrapped_model)
+            >>> wrapped_optim.load_state_dict(sharded_osd)
+
+        Returns:
+            Dict[str, Any]: The optimizer state dict re-keyed using the
+            parameter keys specified by ``optim_state_key_type``.
+        """
+        FullyShardedDataParallel._warn_optim_input(optim_input)
+        using_optim_input = FullyShardedDataParallel._is_using_optim_input(
+            optim_input,
+            optim,
+        )
+        assert optim_state_key_type in (
+            OptimStateKeyType.PARAM_NAME,
+            OptimStateKeyType.PARAM_ID,
+        )
+        osd = optim_state_dict  # alias
+        # Validate that the existing parameter keys are uniformly typed
+        uses_param_name_mask = [type(param_key) is str for param_key in osd["state"]]
+        uses_param_id_mask = [type(param_key) is int for param_key in osd["state"]]
+        if (any(uses_param_name_mask) and not all(uses_param_name_mask)) or (
+            any(uses_param_id_mask) and not all(uses_param_id_mask)
+        ):
+            error_msg = f"Invalid parameter keys: {osd['state'].keys()}"
+            raise ValueError(error_msg)
+        # Return directly if the existing key type matches the target key type
+        if (
+            optim_state_key_type == OptimStateKeyType.PARAM_NAME
+            and all(uses_param_name_mask)
+        ) or (
+            optim_state_key_type == OptimStateKeyType.PARAM_ID
+            and all(uses_param_id_mask)
+        ):
+            return osd
+        # Otherwise, actually perform the re-keying
+        new_osd = {}
+        if optim_state_key_type == OptimStateKeyType.PARAM_NAME:  # ID -> name
+            param_id_to_param = (
+                _get_param_id_to_param_from_optim_input(model, optim_input)
+                if using_optim_input
+                else _get_param_key_to_param(optim)
+            )
+            param_to_param_name = _get_param_to_fqn(model)
+            param_id_to_param_name: List[str] = [
+                param_to_param_name[param] for param in param_id_to_param.values()
+            ]
+            new_osd["state"] = {
+                param_id_to_param_name[param_id]: param_state
+                for param_id, param_state in osd["state"].items()
+            }
+            new_osd["param_groups"] = copy.deepcopy(osd["param_groups"])
+            for param_group in new_osd["param_groups"]:
+                param_group["params"] = sorted(
+                    [
+                        param_id_to_param_name[param_id]
+                        for param_id in param_group["params"]
+                    ]
+                )
+            return new_osd
+        elif optim_state_key_type == OptimStateKeyType.PARAM_ID:  # name -> ID
+            param_name_to_param = _get_fqn_to_param(model)
+            param_to_param_id = (
+                _get_param_to_param_id_from_optim_input(model, optim_input)
+                if using_optim_input
+                else _get_param_to_param_key(optim)
+            )
+            # Because not all model parameters may be passed as the optimizer
+            # input, we may need to drop some parameters from this mapping
+            param_name_to_param_id = {
+                param_name: param_to_param_id[param]
+                for param_name, param in param_name_to_param.items()
+                if param in param_to_param_id
+            }
+            new_osd["state"] = {
+                param_name_to_param_id[param_name]: param_state
+                for param_name, param_state in osd["state"].items()
+            }
+            new_osd["param_groups"] = copy.deepcopy(osd["param_groups"])
+            for param_group in new_osd["param_groups"]:
+                param_group["params"] = sorted(
+                    [
+                        param_name_to_param_id[param_name]
+                        for param_name in param_group["params"]
+                    ]
+                )
+            return new_osd
+        return new_osd  # should never reach here
+
+    @staticmethod
+    def optim_state_dict(
+        model: torch.nn.Module,
+        optim: torch.optim.Optimizer,
+        optim_state_dict: Optional[Dict[str, Any]] = None,
+        group: Optional[dist.ProcessGroup] = None,
+    ) -> Dict[str, Any]:
+        """
+        Transform the state-dict of an optimizer corresponding to a sharded model.
+
+        The given state-dict can be transformed to one of three types:
+        1) full optimizer state_dict, 2) sharded optimizer state_dict, 3) local optimizer state_dict.
+
+        For full optimizer state_dict, all states are unflattened and not sharded.
+        Rank0 only and CPU only can be specified via :meth:`state_dict_type` to
+        avoid OOM.
+
+        For sharded optimizer state_dict, all states are unflattened but sharded.
+        CPU only can be specified via :meth:`state_dict_type` to further save
+        memory.
+
+        For the local state_dict, no transformation will be performed, but each
+        state will be converted from ``torch.Tensor`` to ``ShardedTensor`` to
+        represent its sharded nature (this is not supported yet).
+
+        Example::
+
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+            >>> from torch.distributed.fsdp import StateDictType
+            >>> from torch.distributed.fsdp import FullStateDictConfig
+            >>> from torch.distributed.fsdp import FullOptimStateDictConfig
+            >>> # Save a checkpoint
+            >>> model, optim = ...
+            >>> FSDP.set_state_dict_type(
+            >>>     model,
+            >>>     StateDictType.FULL_STATE_DICT,
+            >>>     FullStateDictConfig(rank0_only=False),
+            >>>     FullOptimStateDictConfig(rank0_only=False),
+            >>> )
+            >>> state_dict = model.state_dict()
+            >>> optim_state_dict = FSDP.optim_state_dict(model, optim)
+            >>> save_a_checkpoint(state_dict, optim_state_dict)
+            >>> # Load a checkpoint
+            >>> model, optim = ...
+            >>> state_dict, optim_state_dict = load_a_checkpoint()
+            >>> FSDP.set_state_dict_type(
+            >>>     model,
+            >>>     StateDictType.FULL_STATE_DICT,
+            >>>     FullStateDictConfig(rank0_only=False),
+            >>>     FullOptimStateDictConfig(rank0_only=False),
+            >>> )
+            >>> model.load_state_dict(state_dict)
+            >>> optim_state_dict = FSDP.optim_state_dict_to_load(
+            >>>     model, optim, optim_state_dict
+            >>> )
+            >>> optim.load_state_dict(optim_state_dict)
+
+        Args:
+            model (torch.nn.Module): Root module (which may or may not be a
+                :class:`FullyShardedDataParallel` instance) whose parameters
+                were passed into the optimizer ``optim``.
+            optim (torch.optim.Optimizer): Optimizer for ``model`` 's
+                parameters.
+            optim_state_dict (Dict[str, Any]): the target optimizer state_dict to
+                transform. If the value is None, optim.state_dict() will be used. (
+                Default: ``None``)
+            group (dist.ProcessGroup): Model's process group across which parameters
+                are sharded or ``None`` if using the default process group. (
+                Default: ``None``)
+
+        Returns:
+            Dict[str, Any]: A :class:`dict` containing the optimizer state for
+            ``model``. The sharding of the optimizer state is based on
+            ``state_dict_type``.
+        """
+        state_dict_settings = FullyShardedDataParallel.get_state_dict_type(model)
+        if optim_state_dict is None:
+            optim_state_dict = optim.state_dict()
+        return FullyShardedDataParallel._optim_state_dict_impl(
+            model=model,
+            optim=optim,
+            optim_state_dict=optim_state_dict,
+            optim_input=None,
+            rank0_only=getattr(
+                state_dict_settings.optim_state_dict_config, "rank0_only", False
+            ),
+            full_state_dict=state_dict_settings.state_dict_type
+            == StateDictType.FULL_STATE_DICT,
+            group=group,
+            cpu_offload=getattr(
+                state_dict_settings.optim_state_dict_config, "offload_to_cpu", True
+            ),
+        )
+
+    @staticmethod
+    def optim_state_dict_to_load(
+        model: torch.nn.Module,
+        optim: torch.optim.Optimizer,
+        optim_state_dict: Dict[str, Any],
+        is_named_optimizer: bool = False,
+        load_directly: bool = False,
+        group: Optional[dist.ProcessGroup] = None,
+    ) -> Dict[str, Any]:
+        """
+        Convert an optimizer state-dict so that it can be loaded into the optimizer associated with the FSDP model.
+
+        Given an ``optim_state_dict`` that is transformed through
+        :meth:`optim_state_dict`, it gets converted to the flattened optimizer
+        state_dict that can be loaded to ``optim``, which is the optimizer for
+        ``model``. ``model`` must be sharded by FullyShardedDataParallel.
+
+            >>> # xdoctest: +SKIP("undefined variables")
+            >>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+            >>> from torch.distributed.fsdp import StateDictType
+            >>> from torch.distributed.fsdp import FullStateDictConfig
+            >>> from torch.distributed.fsdp import FullOptimStateDictConfig
+            >>> # Save a checkpoint
+            >>> model, optim = ...
+            >>> FSDP.set_state_dict_type(
+            >>>     model,
+            >>>     StateDictType.FULL_STATE_DICT,
+            >>>     FullStateDictConfig(rank0_only=False),
+            >>>     FullOptimStateDictConfig(rank0_only=False),
+            >>> )
+            >>> state_dict = model.state_dict()
+            >>> original_osd = optim.state_dict()
+            >>> optim_state_dict = FSDP.optim_state_dict(
+            >>>     model,
+            >>>     optim,
+            >>>     optim_state_dict=original_osd
+            >>> )
+            >>> save_a_checkpoint(state_dict, optim_state_dict)
+            >>> # Load a checkpoint
+            >>> model, optim = ...
+            >>> state_dict, optim_state_dict = load_a_checkpoint()
+            >>> FSDP.set_state_dict_type(
+            >>>     model,
+            >>>     StateDictType.FULL_STATE_DICT,
+            >>>     FullStateDictConfig(rank0_only=False),
+            >>>     FullOptimStateDictConfig(rank0_only=False),
+            >>> )
+            >>> model.load_state_dict(state_dict)
+            >>> optim_state_dict = FSDP.optim_state_dict_to_load(
+            >>>     model, optim, optim_state_dict
+            >>> )
+            >>> optim.load_state_dict(optim_state_dict)
+
+        Args:
+            model (torch.nn.Module): Root module (which may or may not be a
+                :class:`FullyShardedDataParallel` instance) whose parameters
+                were passed into the optimizer ``optim``.
+            optim (torch.optim.Optimizer): Optimizer for ``model`` 's
+                parameters.
+            optim_state_dict (Dict[str, Any]): The optimizer states to be loaded.
+            is_named_optimizer (bool): Whether this optimizer is a NamedOptimizer or
+                KeyedOptimizer. Set this to ``True`` only if ``optim`` is TorchRec's
+                KeyedOptimizer or torch.distributed's NamedOptimizer.
+            load_directly (bool): If ``True``, this API also calls
+                ``optim.load_state_dict(result)`` before returning the result.
+                Otherwise, users are responsible for calling ``optim.load_state_dict()``.
+                (Default: ``False``)
+            group (dist.ProcessGroup): Model's process group across which parameters
+                are sharded or ``None`` if using the default process group. (
+                Default: ``None``)
+        """
+        state_dict_settings = FullyShardedDataParallel.get_state_dict_type(model)
+        result = FullyShardedDataParallel._optim_state_dict_to_load_impl(
+            optim_state_dict=optim_state_dict,
+            model=model,
+            optim_input=None,
+            optim=optim,
+            full_state_dict=(
+                state_dict_settings.state_dict_type == StateDictType.FULL_STATE_DICT
+            ),
+            rank0_only=getattr(
+                state_dict_settings.optim_state_dict_config, "rank0_only", False
+            ),
+            is_named_optimizer=is_named_optimizer,
+            group=group,
+        )
+        if load_directly:
+            optim.load_state_dict(result)
+        return result
+
+    def register_comm_hook(self, state: object, hook: callable):
+        """Register a communication hook.
+
+        This is an enhancement that provides a flexible hook to users where they can specify how FSDP aggregates
+        gradients across multiple workers.
+        This hook can be used to implement several algorithms like
+        GossipGrad and gradient compression,
+        which involve different communication strategies for
+        parameter syncs while training with :class:`FullyShardedDataParallel`.
+
+        .. warning::
+            The FSDP communication hook should be registered only once, and it must be
+            registered before running the initial forward pass.
+
+        Args:
+            state (object): Passed to the hook to maintain any state information during the training process.
+                            Examples include error feedback in gradient compression,
+                            peers to communicate with next in GossipGrad, etc.
+                            It is locally stored by each worker
+                            and shared by all the gradient tensors on the worker.
+            hook (Callable): Callable, which has one of the following signatures:
+                            1) ``hook: Callable[torch.Tensor] -> None``:
+                            This function takes in a Python tensor, which represents
+                            the full, flattened, unsharded gradient with respect to all variables
+                            corresponding to the model this FSDP unit is wrapping
+                            (that are not wrapped by other FSDP sub-units).
+                            It then performs all necessary processing and returns ``None``;
+                            2) ``hook: Callable[torch.Tensor, torch.Tensor] -> None``:
+                            This function takes in two Python tensors, the first one represents
+                            the full, flattened, unsharded gradient with respect to all variables
+                            corresponding to the model this FSDP unit is wrapping
+                            (that are not wrapped by other FSDP sub-units). The latter
+                            represents a pre-sized tensor to store a chunk of a sharded gradient after
+                            reduction.
+                            In both cases, the callable performs all necessary processing and returns ``None``.
+                            Callables with signature 1 are expected to handle gradient communication for the ``NO_SHARD`` case.
+                            Callables with signature 2 are expected to handle gradient communication for sharded cases.
+
+        """
+        if not self.check_is_root():
+            raise AssertionError(
+                "register_comm_hook can only be called on a root instance."
+            )
+        for fsdp_state in traversal_utils._get_fsdp_states(self):
+            if fsdp_state.sharding_strategy in HYBRID_SHARDING_STRATEGIES:
+                raise AssertionError(
+                    f"Communication hook is not supported for hybrid strategies: {fsdp_state.sharding_strategy}"
+                )
+            if fsdp_state._comm_hook is not None:
+                raise AssertionError("A communication hook is already registered")
+            if not callable(hook):
+                raise ValueError(
+                    f"The communication hook must be callable but got {hook}"
+                )
+            fsdp_state._comm_hook = hook
+            fsdp_state._comm_hook_state = state
+
+
+def _get_grad_norm(
+    params: Iterable[nn.Parameter],
+    norm_type: float,
+) -> torch.Tensor:
+    """
+    Return the gradient norm of the given parameters ``params``, where the gradients are viewed as a single vector.
+
+    The returned norm is in FP32 even if parameters/gradients are in a low precision. This is because the downstream
+    use of this return value is a reduction across ranks.
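+
+    A minimal sketch of the computation (illustrative only; the tiny linear layer
+    here is a hypothetical stand-in for real FSDP-managed parameters)::
+
+        >>> # xdoctest: +SKIP("illustrative only")
+        >>> lin = nn.Linear(2, 2)
+        >>> lin(torch.ones(1, 2)).sum().backward()
+        >>> _get_grad_norm(lin.parameters(), norm_type=2.0)  # FP32 L2 norm over all grads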
+    """
+    params_with_grad = [param for param in params if param.grad is not None]
+    if len(params_with_grad) == 0:
+        return torch.tensor(0.0)
+    grads = [param.grad for param in params_with_grad]
+    grad_dtypes = {grad.dtype for grad in grads}
+    if len(grad_dtypes) != 1:
+        raise ValueError(
+            f"Requires uniform dtype across all gradients but got {grad_dtypes}"
+        )
+    # Compute the gradient norm in FP32, where we treat the gradients as a
+    # single vector
+    grad_norm = torch.linalg.vector_norm(
+        torch.stack(
+            [
+                torch.linalg.vector_norm(grad.detach(), norm_type, dtype=torch.float32)
+                for grad in grads
+            ],
+        ),
+        norm_type,
+        dtype=torch.float32,
+    )
+    return grad_norm
+
+
+def _get_param_to_fqn(
+    model: torch.nn.Module,
+) -> Dict[torch.nn.Parameter, str]:
+    """
+    Construct a mapping from parameters to their parameter names.
+
+    The ``model`` should not contain any :class:`FullyShardedDataParallel` instances, which
+    means that none of the parameters should be ``FlatParameter`` s. As a
+    result, compared to :meth:`_get_param_to_fqns`, the mapped
+    values may be flattened from singleton :class:`list` s to the contained
+    names themselves.
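+
+    A minimal sketch (illustrative only; a tiny non-FSDP module is assumed)::
+
+        >>> # xdoctest: +SKIP("illustrative only")
+        >>> m = nn.Sequential(nn.Linear(2, 2))
+        >>> mapping = _get_param_to_fqn(m)
+        >>> sorted(mapping.values())
+        ['0.bias', '0.weight']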
+
+    Args:
+        model (torch.nn.Module): Root module, which should not contain any
+            :class:`FullyShardedDataParallel` instances.
+    """
+    param_to_param_names = _get_param_to_fqns(model)
+    for param_names in param_to_param_names.values():
+        assert (
+            len(param_names) > 0
+        ), "`_get_param_to_fqns()` should not construct empty lists"
+        if len(param_names) > 1:
+            raise RuntimeError(
+                "Each parameter should only map to one parameter name but got "
+                f"{len(param_names)}: {param_names}"
+            )
+    param_to_param_name = {
+        param: param_names[0] for param, param_names in param_to_param_names.items()
+    }
+    return param_to_param_name
+
+
+def _get_fqn_to_param(
+    model: torch.nn.Module,
+) -> Dict[str, torch.nn.Parameter]:
+    """Construct the inverse mapping of :meth:`_get_param_to_fqn`."""
+    param_to_param_name = _get_param_to_fqn(model)
+    return dict(zip(param_to_param_name.values(), param_to_param_name.keys()))
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/sharded_grad_scaler.py b/MLPY/Lib/site-packages/torch/distributed/fsdp/sharded_grad_scaler.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f144d57cd361fe650729c7bd022383bcc7dbb3a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/fsdp/sharded_grad_scaler.py
@@ -0,0 +1,388 @@
+import logging
+from collections import abc, defaultdict
+from typing import Any, Dict, Iterable, List, Optional, overload, Sequence, Tuple, Union
+
+import torch
+import torch.distributed as dist
+from torch.amp.grad_scaler import _MultiDeviceReplicator, GradScaler, OptState
+from torch.distributed.distributed_c10d import ProcessGroup
+
+log = logging.getLogger(__name__)
+
+
+def _refresh_per_optimizer_state() -> Dict[str, Any]:
+    return {"stage": OptState.READY, "found_inf_per_device": {}}
+
+
+def _is_supported_device(tensor: torch.Tensor) -> bool:
+    return tensor.is_cuda or tensor.device.type in ("xla", "cpu", "hpu")
+
+
+class _GeneralMultiDeviceReplicator(_MultiDeviceReplicator):
+    """
+    Lazily serves copies of the master tensor to the requested device. This class extends
+    _MultiDeviceReplicator to allow support for "cpu" as a device.
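+
+    A minimal sketch (illustrative only)::
+
+        >>> # xdoctest: +SKIP("illustrative only")
+        >>> rep = _GeneralMultiDeviceReplicator(torch.tensor(2.0))
+        >>> rep.get(torch.device("cpu"))
+        tensor(2.)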
+    """
+
+    def __init__(self, master_tensor: torch.Tensor) -> None:
+        assert _is_supported_device(master_tensor)
+        self.master = master_tensor
+        self._per_device_tensors: Dict[torch.device, torch.Tensor] = {}
+
+
+class ShardedGradScaler(GradScaler):
+    """
+    ShardedGradScaler helps perform gradient scaling in a shard-aware manner. It extends
+    the functionality of GradScaler:
+    * Supports PyTorch DDP and FSDP implementations
+    * Supports CPU-offloaded tensors (as used in fully sharded data parallel [FSDP])
+    * Supports the custom mixed-precision loss dtype (fp16, bf16) that FSDP returns
+    * Syncs inf/nan for scaled gradient tensors on any torch.device (where the tensors
+      are placed) across nodes
+
+    Example::
+
+        # Creates a ShardedGradScaler once at the beginning of training.
+        scaler = ShardedGradScaler()
+
+        for epoch in epochs:
+            for input, target in data:
+                optimizer.zero_grad()
+                output = model(input)
+                loss = loss_fn(output, target)
+
+                # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
+                scaler.scale(loss).backward()
+
+                # scaler.step() first unscales gradients of the optimizer's params.
+                # If gradients don't contain infs/NaNs, optimizer.step() is then called,
+                # otherwise, optimizer.step() is skipped.
+                scaler.step(optimizer)
+
+                # Updates the scale for next iteration.
+                scaler.update()
+
+    See :class:`GradScaler` for explanation of scaling/unscaling and more use cases.
+
+    Args:
+        init_scale (float, optional, default=2.**16):  Initial scale factor.
+        growth_factor (float, optional, default=2.0):  Factor by which the scale is multiplied during
+            :meth:`update` if no inf/NaN gradients occur for ``growth_interval`` consecutive iterations.
+        backoff_factor (float, optional, default=0.5):  Factor by which the scale is multiplied during
+            :meth:`update` if inf/NaN gradients occur in an iteration.
+        growth_interval (int, optional, default=2000):  Number of consecutive iterations without inf/NaN gradients
+            that must occur for the scale to be multiplied by ``growth_factor``.
+        enabled (bool, optional):  If ``False``, disables gradient scaling. :meth:`step` simply
+            invokes the underlying ``optimizer.step()``, and other methods become no-ops.
+            Default: ``True``
+        process_group (ProcessGroup, optional, default=torch.distributed.group.WORLD):
+            process group for sharding
+    """
+
+    def __init__(
+        self,
+        device: str = "cuda",
+        init_scale: float = 2.0**16,
+        backoff_factor: float = 0.5,
+        growth_factor: float = 2.0,
+        growth_interval: int = 2000,
+        enabled: bool = True,
+        process_group: Optional[ProcessGroup] = dist.group.WORLD,
+    ) -> None:
+        super().__init__(
+            device,
+            init_scale=init_scale,
+            backoff_factor=backoff_factor,
+            growth_factor=growth_factor,
+            growth_interval=growth_interval,
+            enabled=enabled,
+        )
+        if self._enabled:
+            self.process_group = process_group
+            self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)
+
+    @overload
+    def scale(self, outputs: torch.Tensor) -> torch.Tensor:
+        ...
+
+    @overload
+    def scale(self, outputs: List[torch.Tensor]) -> List[torch.Tensor]:
+        ...
+
+    @overload
+    def scale(self, outputs: Tuple[torch.Tensor, ...]) -> Tuple[torch.Tensor, ...]:
+        ...
+
+    @overload
+    def scale(self, outputs: Iterable[torch.Tensor]) -> Iterable[torch.Tensor]:
+        ...
+
+    def scale(
+        self, outputs: Union[torch.Tensor, Iterable[torch.Tensor]]
+    ) -> Union[torch.Tensor, Iterable[torch.Tensor]]:
+        if not self._enabled:
+            return outputs
+
+        if isinstance(outputs, torch.Tensor):
+            assert _is_supported_device(outputs)
+            if self._scale is None:
+                self._lazy_init_scale_growth_tracker(outputs.device)
+            assert self._scale is not None
+            scaled_output = outputs * self._scale.to(
+                device=outputs.device, non_blocking=True
+            )
+            # Here we ensure the return dtype is the same as the outputs dtype.
+            # For the FSDP + Mixed Precision use case, the loss output is in the Mixed Precision
+            # format (fp16, bf16) and so the scaled loss should be of the same dtype.
+            return scaled_output.type(outputs.dtype)
+
+        stash: List[_GeneralMultiDeviceReplicator] = []
+
+        def apply_scale(val: Union[torch.Tensor, Iterable[torch.Tensor]]):
+            if isinstance(val, torch.Tensor):
+                assert _is_supported_device(val)
+                if len(stash) == 0:
+                    if self._scale is None:
+                        self._lazy_init_scale_growth_tracker(val.device)
+                    assert self._scale is not None
+                    stash.append(_GeneralMultiDeviceReplicator(self._scale))
+                scaled_val = val * stash[0].get(val.device)
+                # Here we ensure the return dtype is the same as the outputs dtype.
+                # For the FSDP + Mixed Precision use case, the loss output is in the Mixed Precision
+                # format (fp16, bf16) and so the scaled loss should be of the same dtype.
+                return scaled_val.type(val.dtype)
+            if isinstance(val, abc.Iterable):
+                iterator = map(apply_scale, val)
+                if isinstance(val, (list, tuple)):
+                    return type(val)(iterator)
+                return iterator
+            raise ValueError("outputs must be a Tensor or an iterable of Tensors")
+
+        return apply_scale(outputs)
+
+    def _foreach_non_finite_check_and_unscale_cpu_(
+        self,
+        grads: Sequence[torch.Tensor],
+        found_inf: torch.Tensor,
+        inv_scale: torch.Tensor,
+    ) -> None:
+        if len(grads) == 0:
+            return
+        assert inv_scale.numel() == 1, "inv_scale must be a 1-element tensor."
+        assert found_inf.numel() == 1, "found_inf must be a 1-element tensor."
+
+        for grad in grads:
+            if grad.device.type != "cpu":
+                log.error(
+                    "tensor device is %s but was expected to be ``cpu``",
+                    grad.device,
+                )
+                raise ValueError(
+                    "Gradients were found on a non-CPU device when"
+                    " expected to be on CPU."
+                )
+            if (
+                torch.isinf(grad).any().item()
+                or torch.isnan(grad).any().item()
+            ):
+                found_inf.data = torch.tensor([1.0])
+                break
+            else:
+                grad.data *= inv_scale.item()
+
+    def _unscale_grads_(
+        self,
+        optimizer: torch.optim.Optimizer,
+        inv_scale: torch.Tensor,
+        found_inf: torch.Tensor,
+        allow_fp16: bool = True,
+    ) -> Dict[torch.device, torch.Tensor]:
+        per_device_inv_scale = _GeneralMultiDeviceReplicator(inv_scale)
+        per_device_found_inf = _GeneralMultiDeviceReplicator(found_inf)
+
+        # To set up _amp_foreach_non_finite_check_and_unscale_, split grads by device and dtype.
+        # There could be thousands of grads, so we'd like to iterate through them just once.
+        # However, we don't know their devices or dtypes in advance.
+
+        # https://stackoverflow.com/questions/5029934/defaultdict-of-defaultdict
+        # Google says mypy struggles with defaultdicts type annotations.
+        per_device_and_dtype_grads = defaultdict(lambda: defaultdict(list))  # type: ignore[var-annotated]
+        with torch.no_grad():
+            for group in optimizer.param_groups:
+                for param in group["params"]:
+                    if param.grad is None:
+                        continue
+                    if (not allow_fp16) and param.grad.dtype == torch.float16:
+                        raise ValueError("Attempting to unscale FP16 gradients.")
+                    if param.grad.is_sparse:
+                        # is_coalesced() == False means the sparse grad has values with duplicate indices.
+                        # coalesce() deduplicates indices and adds all values that have the same index.
+                        # For scaled fp16 values, there's a good chance coalescing will cause overflow,
+                        # so we should check the coalesced _values().
+                        if param.grad.dtype is torch.float16:
+                            # coalesce is not supported in torch.float16
+                            param_grad_fp32 = param.grad.type(torch.float32).coalesce()
+                            param.grad = param_grad_fp32.type(torch.float16)
+                        to_unscale = param.grad._values()
+                    else:
+                        to_unscale = param.grad
+
+                    per_device_and_dtype_grads[to_unscale.device][
+                        to_unscale.dtype
+                    ].append(to_unscale)
+
+            for device, per_dtype_grads in per_device_and_dtype_grads.items():
+                for grads in per_dtype_grads.values():
+                    if grads[0].device.type == "cpu":
+                        self._foreach_non_finite_check_and_unscale_cpu_(
+                            grads,
+                            per_device_found_inf.get(device),
+                            per_device_inv_scale.get(device),
+                        )
+                    else:
+                        torch._amp_foreach_non_finite_check_and_unscale_(
+                            grads,
+                            per_device_found_inf.get(device),
+                            per_device_inv_scale.get(device),
+                        )
+        # There exist contexts (e.g. w/ `use_orig_params=True`) wherein some
+        # ranks may have no (non-zero sized) parameter shards, necessitating the
+        # initialization of `per_device_found_inf._per_device_tensors` here
+        if not per_device_found_inf._per_device_tensors:
+            assert self._scale is not None
+            per_device_found_inf.get(self._scale.device)
+        return per_device_found_inf._per_device_tensors
+
+    def unscale_(self, optimizer: torch.optim.Optimizer) -> None:
+        if not self._enabled:
+            return
+
+        self._check_scale_growth_tracker("unscale_")
+
+        optimizer_state = self._per_optimizer_states[id(optimizer)]
+
+        if optimizer_state["stage"] is OptState.UNSCALED:
+            raise RuntimeError(
+                "unscale_() has already been called on this optimizer since the last update()."
+            )
+        elif optimizer_state["stage"] is OptState.STEPPED:
+            raise RuntimeError("unscale_() is being called after step().")
+
+        # FP32 division can be imprecise for certain compile options, so we carry out the reciprocal in FP64.
+        assert self._scale is not None
+        inv_scale = self._scale.double().reciprocal().float()
+        found_inf = torch.full(
+            (1,), 0.0, dtype=torch.float32, device=self._scale.device
+        )
+
+        optimizer_state["found_inf_per_device"] = self._unscale_grads_(
+            optimizer, inv_scale, found_inf, True
+        )
+        optimizer_state["stage"] = OptState.UNSCALED
+
+        # Synchronize the detected inf across the ranks
+        optimizer_state = self._per_optimizer_states[id(optimizer)]
+        works = []
+        found_inf_on_cpus = []
+        found_inf_on_cudas = []
+
+        for found_inf in optimizer_state["found_inf_per_device"].values():
+            if self._device == "cuda" and found_inf.device.type == "cpu":
+                found_inf_on_cpus.append(found_inf)
+                found_inf_on_cuda = found_inf.cuda()
+                found_inf_on_cudas.append(found_inf_on_cuda)
+                works.append(
+                    dist.all_reduce(
+                        found_inf_on_cuda, async_op=True, group=self.process_group
+                    )
+                )
+            else:
+                works.append(
+                    dist.all_reduce(found_inf, async_op=True, group=self.process_group)
+                )
+        for work in works:
+            work.wait()
+        if found_inf_on_cpus:
+            torch._foreach_copy_(found_inf_on_cpus, found_inf_on_cudas)
+
+    def _amp_update_scale_cpu_(self, found_inf: torch.Tensor) -> None:
+        """
+        If found_inf is 1.0 (True), then scale is multiplied by backoff_factor and growth_tracker is set to zero.
+        Otherwise, scale is multiplied by the growth factor when the growth interval is reached.
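+
+        For example (an illustrative sketch of the arithmetic, assuming the defaults
+        ``backoff_factor=0.5`` and ``growth_factor=2.0``): with ``scale == 65536.0``,
+        a ``found_inf`` of ``1.0`` backs the scale off to ``32768.0``, whereas
+        ``growth_interval`` consecutive clean steps double it to ``131072.0``.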
+        """
+        assert self._scale is not None and self._growth_tracker is not None
+
+        if found_inf.item() >= 1.0:
+            self._scale *= self._backoff_factor
+            self._growth_tracker.fill_(0)
+        else:
+            successful = self._growth_tracker + 1
+            if successful == self._growth_interval:
+                self._scale *= self._growth_factor
+                self._growth_tracker.fill_(0)
+            else:
+                self._growth_tracker = successful
+
+    def update(self, new_scale: Optional[Union[float, torch.Tensor]] = None) -> None:
+        """
+        Update the scale factor.
+
+        If any optimizer steps were skipped, the scale is multiplied by ``backoff_factor``
+        to reduce it. If ``growth_interval`` unskipped iterations occurred consecutively,
+        the scale is multiplied by ``growth_factor`` to increase it.
+
+        Passing ``new_scale`` sets the new scale value manually. (``new_scale`` is not
+        used directly; it is used to fill GradScaler's internal scale tensor. So if
+        ``new_scale`` was a tensor, later in-place changes to that tensor will not further
+        affect the scale GradScaler uses internally.)
+
+        Args:
+            new_scale (float or :class:`torch.Tensor`, optional, default=None):  New scale factor.
+
+        .. warning::
+            :meth:`update` should only be called at the end of the iteration, after ``scaler.step(optimizer)`` has
+            been invoked for all optimizers used this iteration.
+        """
+
+        if not self._enabled:
+            return
+
+        _scale, _growth_tracker = self._check_scale_growth_tracker("update")  # type: ignore[var-annotated]
+
+        if new_scale is not None:
+            # Accept a new user-defined scale.
+            if isinstance(new_scale, float):
+                self._scale.fill_(new_scale)  # type: ignore[union-attr]
+            else:
+                reason = "new_scale should be a float or a 1-element torch.cuda.FloatTensor or \
+                    torch.FloatTensor with requires_grad=False."
+                assert new_scale.device.type == self._device, reason
+                assert new_scale.numel() == 1, reason
+                assert new_scale.requires_grad is False, reason
+                self._scale.copy_(new_scale)  # type: ignore[union-attr]
+        else:
+            # Consume shared inf/nan data collected from optimizers to update the scale.
+            # If all found_inf tensors are on the same device as self._scale, this operation is asynchronous.
+            found_infs = [
+                found_inf.to(device=_scale.device, non_blocking=True)
+                for state in self._per_optimizer_states.values()
+                for found_inf in state["found_inf_per_device"].values()
+            ]
+
+            assert len(found_infs) > 0, "No inf checks were recorded prior to update."
+
+            found_inf_combined = found_infs[0]
+            if len(found_infs) > 1:
+                for i in range(1, len(found_infs)):
+                    found_inf_combined += found_infs[i]
+
+            if _scale.device.type == "cpu":
+                self._amp_update_scale_cpu_(found_inf_combined)
+            else:
+                torch._amp_update_scale_(
+                    self._scale,  # type: ignore[arg-type]
+                    self._growth_tracker,  # type: ignore[arg-type]
+                    found_inf_combined,
+                    self._growth_factor,  # type: ignore[arg-type]
+                    self._backoff_factor,  # type: ignore[arg-type]
+                    self._growth_interval,  # type: ignore[arg-type]
+                )
+
+        # To prepare for next iteration, clear the data collected from optimizers this iteration.
+        self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)
diff --git a/MLPY/Lib/site-packages/torch/distributed/fsdp/wrap.py b/MLPY/Lib/site-packages/torch/distributed/fsdp/wrap.py
new file mode 100644
index 0000000000000000000000000000000000000000..5122a6dc8431fa1280683e0f73157b39e6c5a123
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/fsdp/wrap.py
@@ -0,0 +1,606 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+import contextlib
+import copy
+from abc import ABC, abstractmethod
+from typing import (
+    Any,
+    Callable,
+    cast,
+    Dict,
+    Generator,
+    Iterable,
+    Optional,
+    Sequence,
+    Set,
+    Tuple,
+    Type,
+    Union,
+)
+
+import torch.nn as nn
+
+__all__ = [
+    "always_wrap_policy",
+    "lambda_auto_wrap_policy",
+    "transformer_auto_wrap_policy",
+    "size_based_auto_wrap_policy",
+    "enable_wrap",
+    "wrap",
+    "CustomPolicy",
+    "ModuleWrapPolicy",
+]
+
+
+# NOTE: We intentionally keep this function simple and isolate the complexity
+# to `fn` to enable using this function generically. We may move this to a
+# non-FSDP-specific folder and/or make it public in the future.
+def _post_order_apply(
+    root_module: nn.Module,
+    fn: Callable[[nn.Module], Optional[nn.Module]],
+):
+    """
+    This applies ``fn`` to every module in the module tree of ``root_module``
+    following a post-order traversal. If ``fn`` returns an :class:`nn.Module`,
+    then this replaces the original module with the newly returned one in the
+    tree. Otherwise, ``fn`` should return ``None``, in which case the module is
+    not changed.
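+
+    A minimal sketch (illustrative only; replaces every ``nn.ReLU`` with ``nn.Identity``)::
+
+        >>> # xdoctest: +SKIP("illustrative only")
+        >>> def swap_relu(module: nn.Module) -> Optional[nn.Module]:
+        >>>     return nn.Identity() if isinstance(module, nn.ReLU) else None
+        >>> model = nn.Sequential(nn.Linear(2, 2), nn.ReLU())
+        >>> _post_order_apply(model, swap_relu)  # model[1] is now nn.Identity()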
+    """
+    # Track visited modules to avoid visiting shared modules multiple times
+    visited_modules: Set[nn.Module] = {root_module}
+
+    def _post_order_apply_inner(
+        module: nn.Module,
+        module_name: str,
+        parent_module: Optional[nn.Module],
+    ):
+        for child_module_name, child_module in module.named_children():
+            if child_module not in visited_modules:
+                visited_modules.add(child_module)
+                _post_order_apply_inner(child_module, child_module_name, module)
+        optional_module = fn(module)
+        if optional_module is not None:
+            assert isinstance(parent_module, nn.Module), (
+                "Non-root modules should have their parent module set but got "
+                f"{parent_module} for {module}"
+            )
+            assert module_name, (
+                "Non-root modules should have their module name set but got "
+                f"an empty module name for {module}"
+            )
+            assert isinstance(
+                optional_module, nn.Module
+            ), f"fn should return None or an nn.Module but got {optional_module}"
+            setattr(parent_module, module_name, optional_module)
+
+    _post_order_apply_inner(root_module, "", None)
+
+
+def _construct_wrap_fn(
+    root_module: nn.Module,
+    target_module_to_kwargs: Dict[nn.Module, Dict[str, Any]],
+    fsdp_fn: Callable,
+) -> Callable[[nn.Module], Optional[nn.Module]]:
+    """
+    This constructs the "wrap" function to pass to :func:`_post_order_apply`
+    based on ``target_module_to_kwargs``, which should be constructed from the
+    wrapping policy.
+    """
+
+    def fn(module: nn.Module) -> Optional[nn.Module]:
+        # Explicitly avoid wrapping the root module since for FSDP, it is
+        # handled by the caller
+        if module in target_module_to_kwargs and module is not root_module:
+            kwargs = target_module_to_kwargs[module]
+            return fsdp_fn(module, **kwargs)
+        return None
+
+    return fn
+
+
+def _run_mixed_precision_override_policy(
+    root_module: nn.Module,
+    module_classes: Iterable[Type[nn.Module]],
+    ignored_modules: Set[nn.Module],
+    root_kwargs: Dict[str, Any],
+    target_module_to_kwargs: Dict[nn.Module, Dict[str, Any]],
+):
+    module_classes_tuple = tuple(set(module_classes))
+    for module in root_module.modules():
+        if module in ignored_modules:
+            continue
+        elif isinstance(module, module_classes_tuple):
+            # This policy overrides any existing policy
+            if module not in target_module_to_kwargs:
+                # Only inherit from the root kwargs if not already specified
+                target_module_to_kwargs[module] = root_kwargs
+            target_module_to_kwargs[module]["mixed_precision"] = None
+    return target_module_to_kwargs
+
+
+def always_wrap_policy(*args, **kwargs) -> bool:
+    """
+    A simple recursive wrap policy that always returns ``True``. This means
+    that every submodule is wrapped by the wrapper class in
+    :func:`_recursive_wrap`.
+    """
+    return True
+
+
+class _Policy(ABC):
+    """
+    This defines an abstract base class that represents a policy for applying
+    a module-level API.
+    """
+
+    @abstractmethod
+    def _run_policy(
+        self,
+        root_module: nn.Module,
+        ignored_modules: Set[nn.Module],
+        root_kwargs: Dict[str, Any],
+    ) -> Dict[nn.Module, Dict[str, Any]]:
+        """
+        This should return a dict ``target_module_to_kwargs`` that maps from
+        each target module to wrap to its kwargs.
+        """
+        ...
+
+
+def _module_wrap_policy(
+    module: nn.Module,
+    recurse: bool,
+    nonwrapped_numel: int,
+    module_classes: Set[Type[nn.Module]],
+) -> bool:
+    """
+    This auto wrap policy wraps every module that is an instance of any type in
+    ``module_classes`` as its own FSDP instance. The root module given by
+    ``module`` is always wrapped as an FSDP instance regardless. Since the
+    wrapping proceeds bottom up, each FSDP instance manages the parameters in
+    its subtree excluding any already managed by a child FSDP instance.
+
+    Args:
+        module (nn.Module): Current module being considered.
+        recurse (bool): If ``False``, then this function must decide whether
+            ``module`` should be wrapped as an FSDP instance or not. If
+            ``True``, then the function is still recursing down the module
+            tree as a part of the DFS.
+        nonwrapped_numel (int): Parameter numel not yet wrapped.
+        module_classes (Set[Type[nn.Module]]): Set of module classes that are
+            wrapped as FSDP instances.
+
+    Returns:
+        ``True`` if ``recurse=True``, and whether ``module`` should be wrapped
+        if ``recurse=False``.
+    """
+    if recurse:
+        return True  # always recurse
+    return isinstance(module, tuple(module_classes))
+
+
+class ModuleWrapPolicy(_Policy):
+    """
+    This policy applies to every module of the specified module classes,
+    passing in the kwargs given to the root.
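+
+    Example (an illustrative sketch; ``TransformerBlock`` and ``model`` are
+    placeholders defined elsewhere)::
+
+        >>> # xdoctest: +SKIP("undefined variables")
+        >>> policy = ModuleWrapPolicy({TransformerBlock})
+        >>> fsdp_model = FSDP(model, auto_wrap_policy=policy)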
+    """
+
+    def __init__(self, module_classes: Iterable[Type[nn.Module]]):
+        module_classes_set = set(module_classes)
+        self._module_classes = module_classes_set
+        self._module_classes_str = str(module_classes_set)
+
+    def _run_policy(
+        self,
+        root_module: nn.Module,
+        ignored_modules: Set[nn.Module],
+        root_kwargs: Dict[str, Any],
+    ) -> Dict[nn.Module, Dict[str, Any]]:
+        module_classes = tuple(self._module_classes)
+        target_module_to_kwargs: Dict[nn.Module, Dict[str, Any]] = {}
+        for module in root_module.modules():
+            if module in ignored_modules:
+                continue
+            elif isinstance(module, module_classes):
+                # Shallow copy to avoid coupling changes across modules
+                target_module_to_kwargs[module] = copy.copy(root_kwargs)
+        return target_module_to_kwargs
+
+    def __call__(self, module, recurse, *args, **kwargs):
+        # nonwrapped_numel is not used.
+        return _module_wrap_policy(
+            module, recurse, nonwrapped_numel=-1, module_classes=self._module_classes
+        )
+
+    def __repr__(self) -> str:
+        return super().__repr__() + f"({self._module_classes_str})"
+
+
+class CustomPolicy(_Policy):
+    """
+    This policy takes in a lambda function that maps a given ``nn.Module`` to
+    either ``False``, ``True``, or a kwarg dictionary.
+    - If the function returns ``False`` or an empty dictionary, then the module
+      does not have the API applied.
+    - If the function returns ``True``, then the module has the API applied
+      with the root's kwargs.
+    - If the function returns a non-empty dictionary, then the module has the
+      API applied, and the dictionary overrides the root's kwargs.
+
+    Example::
+
+        >>> # xdoctest: +SKIP("undefined variables")
+        >>> model = init_transformer_model(...)
+        >>> def lambda_fn(module: nn.Module):
+        >>>     if module is model.lm_head:
+        >>>         return {"sharding_strategy": ShardingStrategy.SHARD_GRAD_OP}
+        >>>     elif isinstance(module, TransformerBlock):
+        >>>         return True
+        >>>     return False
+        >>> policy = CustomPolicy(lambda_fn)
+        >>> fsdp_model = FSDP(model, auto_wrap_policy=policy)
+    """
+
+    def __init__(self, lambda_fn: Callable[[nn.Module], Union[bool, Dict[str, Any]]]):
+        self._lambda_fn = lambda_fn
+
+    def _run_policy(
+        self,
+        root_module: nn.Module,
+        ignored_modules: Set[nn.Module],
+        root_kwargs: Dict[str, Any],
+    ) -> Dict[nn.Module, Dict[str, Any]]:
+        target_module_to_kwargs: Dict[nn.Module, Dict[str, Any]] = {}
+        for module in root_module.modules():
+            if module in ignored_modules:
+                continue
+            res = self._lambda_fn(module)
+            if not isinstance(res, (dict, bool)):
+                raise ValueError(
+                    "The lambda_fn passed to CustomPolicy should return "
+                    f"False/True or a kwarg dict, but it returned {res}"
+                )
+            if not res:
+                continue
+            kwargs = copy.copy(root_kwargs)
+            if isinstance(res, dict):
+                # Override the root kwargs with the ones specified by the
+                # lambda function
+                kwargs.update(res)
+            target_module_to_kwargs[module] = kwargs
+        return target_module_to_kwargs
+
+
+def lambda_auto_wrap_policy(
+    module: nn.Module, recurse: bool, nonwrapped_numel: int, lambda_fn: Callable
+) -> bool:
+    """
+    A convenient auto wrap policy to wrap submodules based on an arbitrary user
+    function. If ``lambda_fn(submodule) == True``, the submodule will be wrapped
+    as a ``wrapper_cls`` unit.
+
+    Return if a module should be wrapped during auto wrapping.
+
+    The first three parameters are required by :func:`_recursive_wrap`.
+
+    Args:
+        module (nn.Module): Current module being considered.
+        recurse (bool): If ``False``, then this function must decide whether
+            ``module`` should be wrapped as an FSDP instance or not. If
+            ``True``, then the function is still recursing down the module
+            tree as a part of the DFS.
+        nonwrapped_numel (int): Parameter numel not yet wrapped.
+
+        lambda_fn (Callable[[nn.Module], bool]): If this returns ``True``, then
+            this module will be wrapped.
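+
+    Example (a minimal sketch; ``model`` and ``FSDP`` are illustrative
+    placeholders, and ``functools.partial`` is only used to bind ``lambda_fn``)::
+
+        >>> # xdoctest: +SKIP("undefined variables")
+        >>> import functools
+        >>> policy = functools.partial(
+        >>>     lambda_auto_wrap_policy,
+        >>>     lambda_fn=lambda m: isinstance(m, nn.Linear),
+        >>> )
+        >>> fsdp_model = FSDP(model, auto_wrap_policy=policy)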
+    """
+    if recurse:
+        return True  # always recurse
+    return lambda_fn(module)
+
+
+def transformer_auto_wrap_policy(
+    module: nn.Module,
+    recurse: bool,
+    nonwrapped_numel: int,
+    transformer_layer_cls: Set[Type[nn.Module]],
+) -> bool:
+    """
+    See :func:`_module_wrap_policy`, where ``transformer_layer_cls`` is the
+    same as ``module_classes``. Note that shared parameters must be wrapped in
+    the same FSDP instance, so this auto wrap policy can help wrap shared
+    embeddings into the same FSDP instance for transformer models.
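+
+    Example (a minimal sketch; ``TransformerBlock``, ``model``, and ``FSDP``
+    are illustrative placeholders)::
+
+        >>> # xdoctest: +SKIP("undefined variables")
+        >>> import functools
+        >>> policy = functools.partial(
+        >>>     transformer_auto_wrap_policy,
+        >>>     transformer_layer_cls={TransformerBlock},
+        >>> )
+        >>> fsdp_model = FSDP(model, auto_wrap_policy=policy)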
+    """
+    return _module_wrap_policy(module, recurse, nonwrapped_numel, transformer_layer_cls)
+
+
+def _wrap_module_cls_individually(
+    module: nn.Module, module_classes: Sequence[type], recurse: bool, *args, **kwargs
+):
+    if recurse:
+        # always recurse
+        return True
+    else:
+        # if not recursing, decide whether we should wrap based on whether the type of module
+        # is in `module_classes`.
+        return isinstance(module, tuple(module_classes))
+
+
+def _or_policy(
+    module: nn.Module,
+    recurse: bool,
+    nonwrapped_numel: int,
+    policies,
+) -> bool:
+    """
+    A policy that wraps ``module`` if any policy in the passed-in iterable
+    ``policies`` returns ``True``.
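+
+    Example (a minimal sketch; ``TransformerBlock`` is an illustrative
+    placeholder, and the two sub-policies are bound with ``functools.partial``)::
+
+        >>> # xdoctest: +SKIP("undefined variables")
+        >>> import functools
+        >>> policy = functools.partial(
+        >>>     _or_policy,
+        >>>     policies=[
+        >>>         functools.partial(
+        >>>             size_based_auto_wrap_policy, min_num_params=int(1e7)
+        >>>         ),
+        >>>         functools.partial(
+        >>>             transformer_auto_wrap_policy,
+        >>>             transformer_layer_cls={TransformerBlock},
+        >>>         ),
+        >>>     ],
+        >>> )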
+    """
+    return any(
+        policy(module=module, recurse=recurse, nonwrapped_numel=nonwrapped_numel)
+        for policy in policies
+    )
+
+
+def size_based_auto_wrap_policy(
+    module: nn.Module,
+    recurse: bool,
+    nonwrapped_numel: int,
+    # Additional custom arguments
+    min_num_params: int = int(1e8),
+    force_leaf_modules: Optional[Set[Type[nn.Module]]] = None,
+    exclude_wrap_modules: Optional[Set[Type[nn.Module]]] = None,
+) -> bool:
+    """
+    A size-based auto wrap policy.
+
+    Args:
+        module (nn.Module): Current module being considered.
+        recurse (bool): If ``False``, then this function must decide whether
+            ``module`` should be wrapped as an FSDP instance or not. If
+            ``True``, then the function is still recursing down the module
+            tree as a part of the DFS.
+        nonwrapped_numel (int): Parameter numel not yet wrapped.
+
+        min_num_params (int): Customizable policy input that controls the size
+            threshold over which a module is ready to be wrapped. This is in
+            units of numel.
+        force_leaf_modules (Set[Type[nn.Module]]): Set of module types to keep
+            as leaves, i.e. their children will never be wrapped.
+        exclude_wrap_modules (Set[Type[nn.Module]]): Set of module types to be
+            excluded from wrapping.
+
+    Returns:
+        Whether ``module`` should be wrapped.
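+
+    Example (a minimal sketch; ``model`` and ``FSDP`` are illustrative
+    placeholders and the threshold below is arbitrary)::
+
+        >>> # xdoctest: +SKIP("undefined variables")
+        >>> import functools
+        >>> policy = functools.partial(
+        >>>     size_based_auto_wrap_policy, min_num_params=int(1e5)
+        >>> )
+        >>> fsdp_model = FSDP(model, auto_wrap_policy=policy)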
+    """
+    force_leaf_modules = (
+        size_based_auto_wrap_policy.FORCE_LEAF_MODULES  # type: ignore[attr-defined]
+        if force_leaf_modules is None
+        else force_leaf_modules
+    )
+    exclude_wrap_modules = (
+        size_based_auto_wrap_policy.EXCLUDE_WRAP_MODULES  # type: ignore[attr-defined]
+        if exclude_wrap_modules is None
+        else exclude_wrap_modules
+    )
+
+    # Keep the argument `min_num_params` for BC for now, but it represents the
+    # minimum non-wrapped *numel* before triggering a wrapping
+    min_nonwrapped_numel = min_num_params
+    is_large = nonwrapped_numel >= min_nonwrapped_numel
+    if recurse:
+        # We should recurse if the module is big enough but not in force_leaf_modules list.
+        return is_large and not isinstance(module, tuple(force_leaf_modules))
+    else:
+        # If we are not recursing, determine if we should wrap.
+        return is_large and not isinstance(module, tuple(exclude_wrap_modules))
+
+
+# Set those defaults to the size_based_auto_wrap_policy function. Make them easy to be imported.
+size_based_auto_wrap_policy.EXCLUDE_WRAP_MODULES = {nn.ModuleList, nn.ModuleDict}  # type: ignore[attr-defined]
+size_based_auto_wrap_policy.FORCE_LEAF_MODULES = {nn.MultiheadAttention}  # type: ignore[attr-defined]
+
+
+@contextlib.contextmanager
+def enable_wrap(
+    *, wrapper_cls: Any, **wrapper_kwargs: Any
+) -> Generator[None, None, None]:
+    """
+    Context manager to wrap modules using a wrapper.
+
+    Useful for when you'd like to apply the same configuration arguments to all
+    child modules that you wrap. A particularly important use case is wrapping
+    large layers so that they get sharded (in-place) during initialization, to
+    avoid running out of system memory. Large layers can indicate that they
+    should be sharded via the ``wrap`` annotation and this context manager can
+    provide the exact configuration for these nested instances.
+
+    Usage::
+
+        with enable_wrap(wrapper_cls=FSDP, **params):
+            # Wraps layer in FSDP by default if within context
+            self.l1 = wrap(torch.nn.Linear(5, 5))
+
+    Args:
+        wrapper_cls:
+            Class that the ``wrap`` annotation will wrap modules with, such as
+            ``FullyShardedDataParallel``.
+        **wrapper_kwargs:
+            Configuration settings that will be passed to all ``wrap``
+            instances inside the context.
+    """
+    kwargs = {
+        "wrapper_cls": wrapper_cls,
+        **wrapper_kwargs,
+    }
+    with _ConfigAutoWrap(**kwargs):
+        yield
+
+
+def wrap(module: nn.Module, **wrap_overrides: Any) -> nn.Module:
+    """
+    Annotate that a module should be wrapped. Annotated modules will only be
+    wrapped if inside of an :func:`enable_wrap` context manager. This allows
+    a module to be initialized both with and without a wrapper without code
+    change.
+
+    The class that this function wraps the passed-in ``nn.Module`` with is the
+    ``wrapper_cls`` argument passed into ``enable_wrap``. Both ``enable_wrap``
+    and ``wrap`` can take kwargs specifying how to construct the
+    ``wrapper_cls`` instance. If the same kwarg is given to both
+    ``enable_wrap`` and ``wrap``, the value passed into ``wrap`` takes
+    priority.
+
+    Usage::
+
+        with enable_wrap(wrapper_cls=FSDP, **fsdp_config):
+            # Wraps layer in FSDP by default if within context
+            self.l1 = wrap(torch.nn.Linear(5, 5))
+
+    Args:
+        module (nn.Module): module to wrap (if in :func:`enable_wrap` context)
+        **wrap_overrides: configuration overrides that will take priority over
+            the values provided by the :func:`enable_wrap` context
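+
+    Override example (a minimal sketch; ``my_cpu_offload`` is a hypothetical
+    per-module setting that takes priority over the context's kwargs)::
+
+        >>> # xdoctest: +SKIP("undefined variables")
+        >>> with enable_wrap(wrapper_cls=FSDP, cpu_offload=None):
+        >>>     # Uses the context's kwargs as-is.
+        >>>     self.l1 = wrap(torch.nn.Linear(5, 5))
+        >>>     # Overrides ``cpu_offload`` for this layer only.
+        >>>     self.l2 = wrap(torch.nn.Linear(5, 5), cpu_offload=my_cpu_offload)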
+    """
+    if _ConfigAutoWrap.in_autowrap_context:
+        assert _ConfigAutoWrap.wrapper_cls is not None
+
+        wrap_overrides = {**_ConfigAutoWrap.kwargs, **wrap_overrides}
+        return _wrap(
+            module,
+            _ConfigAutoWrap.wrapper_cls,
+            **wrap_overrides,
+        )
+    return module
+
+
+def _wrap(module: nn.Module, wrapper_cls: Callable, **kwargs) -> nn.Module:
+    assert wrapper_cls is not None
+    if hasattr(module, "_wrap_overrides"):
+        # If module has a _wrap_overrides attribute, we force overriding the
+        # FSDP config with these attributes for this module. Currently this
+        # is only used to disable mixed precision for BatchNorm when
+        # auto_wrapping.
+        overrides = {**kwargs, **module._wrap_overrides}  # type: ignore[arg-type]
+        return wrapper_cls(module, **overrides)
+
+    return wrapper_cls(module, **kwargs)
+
+
+def _recursive_wrap(
+    module: nn.Module,
+    auto_wrap_policy: Callable,
+    wrapper_cls: Callable,
+    ignored_modules: Set[nn.Module],
+    ignored_params: Set[nn.Parameter],
+    only_wrap_children: bool = False,
+    **kwargs: Any,
+) -> Tuple[nn.Module, int]:
+    """
+    Wraps submodules of ``module`` for which ``auto_wrap_policy`` returns
+    ``True`` with ``wrapper_cls``.
+
+    Args:
+        module (nn.Module): Module to recursively wrap.
+        auto_wrap_policy (Callable): A callable representing a policy that
+            determines which modules to recursively wrap with ``wrapper_cls``.
+        ignored_modules (Set[torch.nn.Module]): Modules to ignore when
+            wrapping.
+        ignored_params (Set[torch.nn.Parameter]): Parameters to ignore when
+            wrapping; these should be the parameters contained in the modules
+            in ``ignored_modules``.
+    Returns:
+        (nn.Module, int):
+            ``module`` after wrapping and the numel recursively wrapped.
+    """
+    assert auto_wrap_policy is not None, "Must specify auto_wrap_policy."
+    assert wrapper_cls is not None, "Must specify wrapper_cls"
+    # Make sure no child is already wrapped.
+    for _, child in module.named_modules():
+        if child in ignored_modules:
+            continue
+        try:
+            assert not isinstance(child, cast(type, wrapper_cls))
+        except TypeError:
+            # wrapper_cls is a function as opposed to a class type, just bypass above check.
+            pass
+
+    # We count all params, assuming none of them are already wrapped.
+    nonwrapped_numel = sum(
+        p.numel() for p in module.parameters() if p not in ignored_params
+    )
+
+    assert auto_wrap_policy is not None
+    if auto_wrap_policy(module=module, recurse=True, nonwrapped_numel=nonwrapped_numel):
+        total_wrapped_numel = 0
+        # Iterate through the children, recursively wrap if necessary
+        for name, child in module.named_children():
+            if child in ignored_modules:
+                continue
+            wrapped_child, num_wrapped_params = _recursive_wrap(
+                module=child,
+                auto_wrap_policy=auto_wrap_policy,
+                wrapper_cls=wrapper_cls,
+                ignored_modules=ignored_modules,
+                ignored_params=ignored_params,
+                **kwargs,
+            )
+            setattr(module, name, wrapped_child)
+            # Keep track of how many parameters have been wrapped
+            total_wrapped_numel += num_wrapped_params
+        # Decide whether the current module itself should also be wrapped,
+        # based on the leftover parameters not yet wrapped by any child.
+        remainder = nonwrapped_numel - total_wrapped_numel
+        if not only_wrap_children and auto_wrap_policy(
+            module=module, recurse=False, nonwrapped_numel=remainder
+        ):
+            # Leaf node or final wrapping of the remainder both happen here.
+            return _wrap(module, wrapper_cls, **kwargs), nonwrapped_numel
+        else:
+            return module, total_wrapped_numel
+    return module, 0
+
+
+class _ConfigAutoWrap:
+    """
+    Helper class to wrap modules based on default config args via a context manager.
+    See :func:`enable_wrap` for more information.
+    """
+
+    in_autowrap_context: bool = False  # Context flag
+    wrapper_cls: Optional[Callable] = None  # The wrapper class
+    kwargs: Dict[str, Any] = {}  # Wrapper's args
+
+    def __init__(self, **kwargs: Dict[str, Any]):
+        self.kwargs = kwargs
+
+    @staticmethod
+    def enable_autowrap_context(kwargs: Any) -> None:
+        if _ConfigAutoWrap.in_autowrap_context:
+            raise NotImplementedError(
+                "You are already within an autowrap context and we currently do not supported nested autowrap."
+            )
+        _ConfigAutoWrap.in_autowrap_context = True
+        # Get and save the wrapper cls for the context.
+        assert (
+            "wrapper_cls" in kwargs.keys()
+        ), "Expected to pass in wrapper_cls arg into _ConfigAutoWrap."
+        _ConfigAutoWrap.wrapper_cls = cast(Callable, kwargs["wrapper_cls"])
+        del kwargs["wrapper_cls"]
+        # Save the rest.
+        _ConfigAutoWrap.kwargs = kwargs
+
+    @staticmethod
+    def disable_autowrap_context() -> None:
+        _ConfigAutoWrap.in_autowrap_context = False
+        _ConfigAutoWrap.wrapper_cls = None
+        _ConfigAutoWrap.kwargs = {}
+
+    def __enter__(self) -> None:
+        self.enable_autowrap_context(self.kwargs)
+
+    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
+        self.disable_autowrap_context()
diff --git a/MLPY/Lib/site-packages/torch/distributed/launch.py b/MLPY/Lib/site-packages/torch/distributed/launch.py
new file mode 100644
index 0000000000000000000000000000000000000000..babf258707b2cd719dae5159de1b5b691c69cac3
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/launch.py
@@ -0,0 +1,198 @@
+r"""
+Module ``torch.distributed.launch``.
+
+``torch.distributed.launch`` is a module that spawns multiple distributed
+training processes on each of the training nodes.
+
+.. warning::
+
+    This module is going to be deprecated in favor of ``torchrun``.
+
+The utility can be used for single-node distributed training, in which one or
+more processes per node will be spawned. The utility can be used for either
+CPU training or GPU training. If the utility is used for GPU training,
+each distributed process will be operating on a single GPU. This can achieve
+well-improved single-node training performance. It can also be used in
+multi-node distributed training, by spawning up multiple processes on each node
+for well-improved multi-node distributed training performance as well.
+This will especially be beneficial for systems with multiple Infiniband
+interfaces that have direct-GPU support, since all of them can be utilized for
+aggregated communication bandwidth.
+
+In both cases of single-node distributed training or multi-node distributed
+training, this utility will launch the given number of processes per node
+(``--nproc-per-node``). If used for GPU training, this number needs to be less
+than or equal to the number of GPUs on the current system (``nproc_per_node``),
+and each process will be operating on a single GPU from *GPU 0 to
+GPU (nproc_per_node - 1)*.
+
+**How to use this module:**
+
+1. Single-Node multi-process distributed training
+
+::
+
+    python -m torch.distributed.launch --nproc-per-node=NUM_GPUS_YOU_HAVE
+               YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other
+               arguments of your training script)
+
+2. Multi-Node multi-process distributed training: (e.g. two nodes)
+
+
+Node 1: *(IP: 192.168.1.1, and has a free port: 1234)*
+
+::
+
+    python -m torch.distributed.launch --nproc-per-node=NUM_GPUS_YOU_HAVE
+               --nnodes=2 --node-rank=0 --master-addr="192.168.1.1"
+               --master-port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3
+               and all other arguments of your training script)
+
+Node 2:
+
+::
+
+    python -m torch.distributed.launch --nproc-per-node=NUM_GPUS_YOU_HAVE
+               --nnodes=2 --node-rank=1 --master-addr="192.168.1.1"
+               --master-port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3
+               and all other arguments of your training script)
+
+3. To look up what optional arguments this module offers:
+
+::
+
+    python -m torch.distributed.launch --help
+
+
+**Important Notices:**
+
+1. This utility and multi-process distributed (single-node or
+multi-node) GPU training currently only achieves the best performance using
+the NCCL distributed backend. Thus NCCL backend is the recommended backend to
+use for GPU training.
+
+2. In your training program, you must parse the command-line argument:
+``--local-rank=LOCAL_PROCESS_RANK``, which will be provided by this module.
+If your training program uses GPUs, you should ensure that your code only
+runs on the GPU device of LOCAL_PROCESS_RANK. This can be done by:
+
+Parsing the local_rank argument
+
+::
+
+    >>> # xdoctest: +SKIP
+    >>> import argparse
+    >>> parser = argparse.ArgumentParser()
+    >>> parser.add_argument("--local-rank", type=int)
+    >>> args = parser.parse_args()
+
+Set your device to local rank using either
+
+::
+
+    >>> torch.cuda.set_device(args.local_rank)  # before your code runs
+
+or
+
+::
+
+    >>> with torch.cuda.device(args.local_rank):
+    >>>    # your code to run
+    >>>    ...
+
+3. In your training program, you are supposed to call the following function
+at the beginning to start the distributed backend. It is strongly recommended
+that ``init_method=env://``. Other init methods (e.g. ``tcp://``) may work,
+but ``env://`` is the one that is officially supported by this module.
+
+::
+
+    >>> torch.distributed.init_process_group(backend='YOUR BACKEND',
+    >>>                                      init_method='env://')
+
+4. In your training program, you can either use regular distributed functions
+or use :func:`torch.nn.parallel.DistributedDataParallel` module. If your
+training program uses GPUs for training and you would like to use
+:func:`torch.nn.parallel.DistributedDataParallel` module,
+here is how to configure it.
+
+::
+
+    >>> model = torch.nn.parallel.DistributedDataParallel(model,
+    >>>                                                   device_ids=[args.local_rank],
+    >>>                                                   output_device=args.local_rank)
+
+Please ensure that ``device_ids`` argument is set to be the only GPU device id
+that your code will be operating on. This is generally the local rank of the
+process. In other words, the ``device_ids`` needs to be ``[args.local_rank]``,
+and ``output_device`` needs to be ``args.local_rank`` in order to use this
+utility
+
+5. Another way to pass ``local_rank`` to the subprocesses is via the environment
+variable ``LOCAL_RANK``. This behavior is enabled when you launch the script with
+``--use-env=True``. You must adjust the subprocess example above to replace
+``args.local_rank`` with ``os.environ['LOCAL_RANK']``; the launcher
+will not pass ``--local-rank`` when you specify this flag.
+
+.. warning::
+
+    ``local_rank`` is NOT globally unique: it is only unique per process
+    on a machine.  Thus, don't use it to decide if you should, e.g.,
+    write to a networked filesystem.  See
+    https://github.com/pytorch/pytorch/issues/12042 for an example of
+    how things can go wrong if you don't do this correctly.
+
+
+
+"""
+
+import logging
+import warnings
+
+from torch.distributed.run import get_args_parser, run
+
+
+logger = logging.getLogger(__name__)
+
+
+def parse_args(args):
+    parser = get_args_parser()
+    parser.add_argument(
+        "--use-env",
+        "--use_env",
+        default=False,
+        action="store_true",
+        help="Use environment variable to pass "
+        "'local rank'. For legacy reasons, the default value is False. "
+        "If set to True, the script will not pass "
+        "--local-rank as argument, and will instead set LOCAL_RANK.",
+    )
+    return parser.parse_args(args)
+
+
+def launch(args):
+    if args.no_python and not args.use_env:
+        raise ValueError(
+            "When using the '--no-python' flag,"
+            " you must also set the '--use-env' flag."
+        )
+    run(args)
+
+
+def main(args=None):
+    warnings.warn(
+        "The module torch.distributed.launch is deprecated\n"
+        "and will be removed in future. Use torchrun.\n"
+        "Note that --use-env is set by default in torchrun.\n"
+        "If your script expects `--local-rank` argument to be set, please\n"
+        "change it to read from `os.environ['LOCAL_RANK']` instead. See \n"
+        "https://pytorch.org/docs/stable/distributed.html#launch-utility for \n"
+        "further instructions\n",
+        FutureWarning,
+    )
+    args = parse_args(args)
+    launch(args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/MLPY/Lib/site-packages/torch/distributed/launcher/__init__.py b/MLPY/Lib/site-packages/torch/distributed/launcher/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e1923710adbdaffe0930da1118e1ec6d60b331c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/launcher/__init__.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env/python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+from torch.distributed.launcher.api import (  # noqa: F401
+    LaunchConfig,
+    elastic_launch,
+    launch_agent,
+)
diff --git a/MLPY/Lib/site-packages/torch/distributed/launcher/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/launcher/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dc3615084d9c0d822523bc81d2eaf0e0747036ad
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/launcher/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/launcher/__pycache__/api.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/launcher/__pycache__/api.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a65d9b855d9f8fbdd2c7bb29feb2c8237de2fd0c
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/launcher/__pycache__/api.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/launcher/api.py b/MLPY/Lib/site-packages/torch/distributed/launcher/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..29b3f6cf03b7564140115fe3331b306050f4ebf2
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/launcher/api.py
@@ -0,0 +1,283 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import sys
+import uuid
+from dataclasses import dataclass, field
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch.distributed.elastic.rendezvous.registry as rdzv_registry
+from torch.distributed.elastic import events, metrics
+from torch.distributed.elastic.agent.server.api import WorkerSpec
+from torch.distributed.elastic.agent.server.local_elastic_agent import LocalElasticAgent
+from torch.distributed.elastic.multiprocessing import DefaultLogsSpecs, LogsSpecs, SignalException
+from torch.distributed.elastic.multiprocessing.errors import ChildFailedError
+from torch.distributed.elastic.rendezvous import RendezvousParameters
+from torch.distributed.elastic.rendezvous.utils import parse_rendezvous_endpoint
+from torch.distributed.elastic.utils.logging import get_logger
+
+__all__ = ['LaunchConfig', 'elastic_launch', 'launch_agent']
+
+logger = get_logger(__name__)
+
+
+@dataclass
+class LaunchConfig:
+    """
+    Creates a rendezvous config.
+
+    Args:
+        min_nodes: Minimum number of nodes that the user function will
+                        be launched on. The elastic agent ensures that the user
+                        function starts only when at least min_nodes nodes have
+                        joined the rendezvous.
+        max_nodes: Maximum number of nodes that the user function
+                        will be launched on.
+        nproc_per_node: On each node the elastic agent will launch
+                            this number of workers that will execute the user
+                            defined function.
+        rdzv_backend: rdzv_backend to use in the rendezvous (zeus-adapter, etcd).
+        rdzv_endpoint: The endpoint of the rdzv sync. storage.
+        rdzv_configs: Key, value pair that specifies rendezvous specific configuration.
+        rdzv_timeout: Legacy argument that specifies timeout for the rendezvous. It is going
+            to be removed in future versions, see the note below. The default timeout is 900 seconds.
+        run_id: The unique run id of the job (if not passed a unique one will be
+                deduced from run environment - flow workflow id in flow - or auto generated).
+        role: User defined role of the worker (defaults to "trainer").
+        max_restarts: The maximum number of restarts that the elastic agent will
+                    conduct on workers before failure.
+        monitor_interval: The interval in seconds at which the elastic agent
+                        monitors the workers.
+        start_method: The method used by the elastic agent to start the
+                    workers (spawn, fork, forkserver).
+        metrics_cfg: configuration to initialize metrics.
+        local_addr: address of the local node if any. If not set, a lookup on the local
+                machine's FQDN will be performed.
+        local_ranks_filter: ranks for which to show logs in console. If not set, show from all.
+    .. note::
+        ``rdzv_timeout`` is a legacy argument that will be removed in the future.
+        Set the timeout via ``rdzv_configs['timeout']`` instead.
+
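+    Example (a minimal sketch; ``trainer_fn`` is an illustrative placeholder and
+    the values below are arbitrary)::
+
+        >>> # xdoctest: +SKIP("illustrative only")
+        >>> config = LaunchConfig(
+        >>>     min_nodes=1,
+        >>>     max_nodes=1,
+        >>>     nproc_per_node=4,
+        >>>     rdzv_backend="c10d",
+        >>>     rdzv_endpoint="localhost:29400",
+        >>> )
+        >>> outputs = elastic_launch(config, trainer_fn)()
+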
+    """
+
+    min_nodes: int
+    max_nodes: int
+    nproc_per_node: int
+    logs_specs: Optional[LogsSpecs] = None
+    run_id: str = ""
+    role: str = "default_role"
+    rdzv_endpoint: str = ""
+    rdzv_backend: str = "etcd"
+    rdzv_configs: Dict[str, Any] = field(default_factory=dict)
+    rdzv_timeout: int = -1
+    max_restarts: int = 3
+    monitor_interval: float = 30
+    start_method: str = "spawn"
+    log_line_prefix_template: Optional[str] = None
+    metrics_cfg: Dict[str, str] = field(default_factory=dict)
+    local_addr: Optional[str] = None
+
+    def __post_init__(self):
+        default_timeout = 900
+        if self.rdzv_timeout != -1:
+            self.rdzv_configs["timeout"] = self.rdzv_timeout
+        elif "timeout" not in self.rdzv_configs:
+            self.rdzv_configs["timeout"] = default_timeout
+
+        # Post-processing to enable refactoring to introduce logs_specs due to non-torchrun API usage
+        if self.logs_specs is None:
+            self.logs_specs = DefaultLogsSpecs()
+
+
+class elastic_launch:
+    """
+    Launches a torchelastic agent on the container that invoked the entrypoint.
+
+        1. Pass the ``entrypoint`` arguments as non-``kwargs`` (i.e. no named
+           parameters). ``entrypoint`` can be a function or a command.
+        2. The return value is a map of each worker's output mapped
+           by their respective global rank.
+
+    Usage
+
+    ::
+
+    def worker_fn(foo):
+        # ...
+
+    def main():
+        # entrypoint is a function.
+        outputs = elastic_launch(LaunchConfig(...), worker_fn)(foo)
+        # return rank 0's output
+        return outputs[0]
+
+        # entrypoint is a command and ``script.py`` is the python module.
+        outputs = elastic_launch(LaunchConfig(...), "script.py")(args)
+        outputs = elastic_launch(LaunchConfig(...), "python")("script.py")
+    """
+
+    def __init__(
+        self,
+        config: LaunchConfig,
+        entrypoint: Union[Callable, str, None],
+    ):
+        self._config = config
+        self._entrypoint = entrypoint
+
+    def __call__(self, *args):
+        return launch_agent(self._config, self._entrypoint, list(args))
+
+
+def _get_entrypoint_name(
+    entrypoint: Union[Callable, str, None], args: List[Any]
+) -> str:
+    """Retrieve entrypoint name with the rule:
+    1. If entrypoint is a function, use ``entrypoint.__name__``.
+    2. If entrypoint is a string, check its value:
+        2.1 if entrypoint equals ``sys.executable`` (like "python"), use the first element from ``args``
+            that does not start with a hyphen (for example, "-u" will be skipped).
+        2.2 otherwise, use the ``entrypoint`` value.
+    3. Otherwise, return an empty string.
+    """
+    if isinstance(entrypoint, Callable):  # type: ignore[arg-type]
+        return entrypoint.__name__  # type: ignore[union-attr]
+    elif isinstance(entrypoint, str):
+        if entrypoint == sys.executable:
+            return next((arg for arg in args if arg[0] != "-"), "")
+        else:
+            return entrypoint
+    else:
+        return ""
+
+
+def _get_addr_and_port(
+    rdzv_parameters: RendezvousParameters,
+) -> Tuple[Optional[str], Optional[int]]:
+    if rdzv_parameters.backend != "static":
+        return (None, None)
+    endpoint = rdzv_parameters.endpoint
+    endpoint = endpoint.strip()
+    if not endpoint:
+        raise ValueError(
+            "Endpoint is missing in endpoint. Try to add --master-addr and --master-port"
+        )
+    master_addr, master_port = parse_rendezvous_endpoint(endpoint, default_port=-1)
+    if master_port == -1:
+        raise ValueError(
+            f"port is missing in endpoint: {endpoint}. Try to specify --master-port"
+        )
+    return (master_addr, master_port)
+
+
+def launch_agent(
+    config: LaunchConfig,
+    entrypoint: Union[Callable, str, None],
+    args: List[Any],
+) -> Dict[int, Any]:
+    if not config.run_id:
+        run_id = str(uuid.uuid4().int)
+        logger.warning("config has no run_id, generated a random run_id: %s", run_id)
+        config.run_id = run_id
+
+    entrypoint_name = _get_entrypoint_name(entrypoint, args)
+
+    logger.info(
+        "Starting elastic_operator with launch configs:\n"
+        "  entrypoint       : %(entrypoint)s\n"
+        "  min_nodes        : %(min_nodes)s\n"
+        "  max_nodes        : %(max_nodes)s\n"
+        "  nproc_per_node   : %(nproc_per_node)s\n"
+        "  run_id           : %(run_id)s\n"
+        "  rdzv_backend     : %(rdzv_backend)s\n"
+        "  rdzv_endpoint    : %(rdzv_endpoint)s\n"
+        "  rdzv_configs     : %(rdzv_configs)s\n"
+        "  max_restarts     : %(max_restarts)s\n"
+        "  monitor_interval : %(monitor_interval)s\n"
+        "  log_dir          : %(log_dir)s\n"
+        "  metrics_cfg      : %(metrics_cfg)s\n",
+        {
+            "entrypoint": entrypoint_name,
+            "min_nodes": config.min_nodes,
+            "max_nodes": config.max_nodes,
+            "nproc_per_node": config.nproc_per_node,
+            "run_id": config.run_id,
+            "rdzv_backend": config.rdzv_backend,
+            "rdzv_endpoint": config.rdzv_endpoint,
+            "rdzv_configs": config.rdzv_configs,
+            "max_restarts": config.max_restarts,
+            "monitor_interval": config.monitor_interval,
+            "log_dir": config.logs_specs.root_log_dir,  # type: ignore[union-attr]
+            "metrics_cfg": config.metrics_cfg
+        }
+    )
+
+    rdzv_parameters = RendezvousParameters(
+        backend=config.rdzv_backend,
+        endpoint=config.rdzv_endpoint,
+        run_id=config.run_id,
+        min_nodes=config.min_nodes,
+        max_nodes=config.max_nodes,
+        local_addr=config.local_addr,
+        **config.rdzv_configs,
+    )
+
+    master_addr, master_port = _get_addr_and_port(rdzv_parameters)
+
+    spec = WorkerSpec(
+        role=config.role,
+        local_world_size=config.nproc_per_node,
+        entrypoint=entrypoint,
+        args=tuple(args),
+        rdzv_handler=rdzv_registry.get_rendezvous_handler(rdzv_parameters),
+        max_restarts=config.max_restarts,
+        monitor_interval=config.monitor_interval,
+        master_addr=master_addr,
+        master_port=master_port,
+        local_addr=config.local_addr,
+    )
+
+    agent = LocalElasticAgent(
+        spec=spec,
+        logs_specs=config.logs_specs,  # type: ignore[arg-type]
+        start_method=config.start_method,
+        log_line_prefix_template=config.log_line_prefix_template,
+    )
+
+    shutdown_rdzv = True
+    try:
+        metrics.initialize_metrics(metrics.MetricsConfig(config.metrics_cfg))
+
+        result = agent.run()
+        # records that agent.run() has succeeded NOT that workers have succeeded
+        events.record(agent.get_event_succeeded())
+
+        if result.is_failed():
+            # ChildFailedError is treated specially by @record
+            # if the error files for the failed children exist
+            # @record will copy the first error (root cause)
+            # to the error file of the launcher process.
+            raise ChildFailedError(
+                name=entrypoint_name,
+                failures=result.failures,
+            )
+
+        return result.return_values
+    except ChildFailedError:
+        raise
+    except SignalException:
+        # when the agent dies with a signal do NOT shutdown the rdzv_handler
+        # since this closes the rendezvous on this rdzv_id permanently and
+        # prevents any additional scaling events
+        shutdown_rdzv = False
+        events.record(agent.get_event_failed())
+        raise
+    except Exception:
+        events.record(agent.get_event_failed())
+        raise
+    finally:
+        if shutdown_rdzv:
+            spec.rdzv_handler.shutdown()
diff --git a/MLPY/Lib/site-packages/torch/distributed/logging_handlers.py b/MLPY/Lib/site-packages/torch/distributed/logging_handlers.py
new file mode 100644
index 0000000000000000000000000000000000000000..a775863e0b06b2f7597cd9cae85d19110271a1f6
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/logging_handlers.py
@@ -0,0 +1,16 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from typing import Dict, List
+
+__all__: List[str] = []
+
+_log_handlers: Dict[str, logging.Handler] = {
+    "default": logging.NullHandler(),
+}
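+
+# A minimal lookup sketch (an assumption for illustration: other
+# torch.distributed utilities fetch a handler from this mapping by destination
+# name, with "default" being the only destination defined here):
+#
+#   handler = _log_handlers["default"]
+#   logging.getLogger("torch.distributed").addHandler(handler)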
diff --git a/MLPY/Lib/site-packages/torch/distributed/nn/__init__.py b/MLPY/Lib/site-packages/torch/distributed/nn/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..625a0c95db7f0abf299697ab017038df21c3e0b2
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/nn/__init__.py
@@ -0,0 +1,4 @@
+import torch
+if torch.distributed.rpc.is_available():
+    from .api.remote_module import RemoteModule
+from .functional import *  # noqa: F403
diff --git a/MLPY/Lib/site-packages/torch/distributed/nn/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/nn/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8bc78b0c91cb3905083c840e9a535d24bcec4650
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/nn/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/nn/__pycache__/functional.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/nn/__pycache__/functional.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f26246000fe624edf5834bb64331beed216fe07e
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/nn/__pycache__/functional.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/nn/api/__init__.py b/MLPY/Lib/site-packages/torch/distributed/nn/api/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/distributed/nn/api/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/nn/api/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c05488093f99cc8008a441c43c3c057ffb1f981f
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/nn/api/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/nn/api/__pycache__/remote_module.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/nn/api/__pycache__/remote_module.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b092a38973d1f3d2cea1d1ffd400b210be8319a0
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/nn/api/__pycache__/remote_module.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/nn/api/remote_module.py b/MLPY/Lib/site-packages/torch/distributed/nn/api/remote_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..92ad29ee898e26650bdfe0494d6559de05fa0a87
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/nn/api/remote_module.py
@@ -0,0 +1,760 @@
+#!/usr/bin/python3
+import collections
+import io
+import sys
+import types
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterator,
+    List,
+    Mapping,
+    Optional,
+    Set,
+    Tuple,
+    Type,
+    TypeVar,
+    Union,
+)
+
+import torch
+import torch.distributed.rpc as rpc
+from torch import Tensor, device, dtype, nn
+from torch.distributed.nn.jit import instantiator
+from torch.distributed import _remote_device
+from torch.distributed.rpc.internal import _internal_rpc_pickler
+from torch.nn import Module
+from torch.nn.parameter import Parameter
+from torch.utils.hooks import RemovableHandle
+
+__all__ = ["RemoteModule"]
+
+_grad_t = Union[Tuple[Tensor, ...], Tensor]
+# See https://mypy.readthedocs.io/en/latest/generics.html#generic-methods-and-generic-self for the use
+# of `T` to annotate `self`. Many methods of `Module` return `self` and we want those return values to be
+# the type of the subclass, not the looser type of `Module`.
+T = TypeVar("T", bound="Module")
+
+_NON_SCRIPTABLE_REMOTE_MODULE_MODULE = (
+    instantiator.instantiate_non_scriptable_remote_module_template()
+)
+
+_REMOTE_MODULE_PICKLED_ATTRIBUTES = (
+    "on",
+    "device",
+    "is_device_map_set",
+    "is_scriptable",
+    "generated_methods",
+    "module_rref",
+)
+
+_SerializedRemoteModule = collections.namedtuple("_SerializedRemoteModule", _REMOTE_MODULE_PICKLED_ATTRIBUTES)  # type: ignore[misc]
+
+# These attributes are mostly from RemoteModule's parent class and are intentionally not pickled.
+# A new attribute of RemoteModule should be either in _REMOTE_MODULE_PICKLED_ATTRIBUTES
+# or _REMOTE_MODULE_ATTRIBUTES_IGNORE_FOR_PICKLING.
+# Otherwise, it will not be pickled.
+_REMOTE_MODULE_ATTRIBUTES_IGNORE_FOR_PICKLING = (
+    "training",
+    "_parameters",
+    "_buffers",
+    "_non_persistent_buffers_set",
+    "_backward_hooks",
+    "_backward_pre_hooks",
+    "_is_full_backward_hook",
+    "_forward_hooks",
+    "_forward_hooks_with_kwargs",
+    "_forward_hooks_always_called",
+    "_forward_pre_hooks",
+    "_forward_pre_hooks_with_kwargs",
+    "_state_dict_hooks",
+    "_state_dict_pre_hooks",
+    "_load_state_dict_pre_hooks",
+    "_load_state_dict_post_hooks",
+    "_state_dict_pre_hooks",
+    "_modules",
+    # The two attributes below are generated methods, not available at pickling time.
+    "forward_async",
+    "forward",
+)
+
+
+# RPC handler.
+def _instantiate_template(module_interface_cls, enable_moving_cpu_tensors_to_cuda):
+    instantiator.instantiate_scriptable_remote_module_template(
+        module_interface_cls, enable_moving_cpu_tensors_to_cuda
+    )
+
+
+def _create_module(module_cls, args, kwargs, device):
+    module = module_cls(*args, **kwargs)
+    if not isinstance(module, nn.Module):
+        raise ValueError(
+            "Expect `module_cls(*args, **kwargs)` returns an instance of , "
+            f"but it returns an instance of {type(module)}."
+        )
+    module.to(device)
+    return module
+
+
+def _create_module_with_interface(
+    module_cls, args, kwargs, device, module_interface_cls
+):
+    module = _create_module(module_cls, args, kwargs, device)
+    if module_interface_cls is not None:
+        module = torch.jit.script(module)
+    return rpc.RRef(module, module_interface_cls)
+
+
+def _param_rrefs(module_rref, recurse) -> List[rpc.RRef[Parameter]]:
+    ret: List[rpc.RRef[Parameter]] = []
+    for param in module_rref.local_value().parameters(recurse):
+        ret.append(rpc.RRef(param))
+    return ret
+
+
+def _raise_not_supported(name: str) -> None:
+    raise ValueError(f"Method ``{name}`` not supported for RemoteModule")
+
+
+class _RemoteModule(nn.Module):
+
+    def __new__(cls, *args, **kwargs):
+        # Use __new__ for logging purposes.
+        torch._C._log_api_usage_once("torch.distributed.nn.api.remote_module")
+        return super().__new__(cls)
+
+    def __init__(
+        self,
+        remote_device: str,
+        module_cls: Type[nn.Module],
+        args: Optional[Tuple] = None,
+        kwargs: Optional[Dict[str, Any]] = None,
+        _module_interface_cls: Any = None,
+    ):
+        """
+        RemoteModule instance can only be created after RPC initialization.
+
+        It creates a user-specified module on a specified remote node.
+        It behaves like a regular ``nn.Module`` except that the ``forward`` method is
+        executed on the remote node.
+        It takes care of autograd recording to ensure the backward pass propagates
+        gradients back to the corresponding remote module.
+        It can be shared across processes using the RPC framework (``torch.distributed.rpc``),
+        without incurring any overheads of copying the actual module,
+        which is equivalent to an :class:`~torch.distributed.rpc.RRef`
+        pointing to the remote module.
+
+        The arguments of ``forward_async`` and ``forward`` are the same as
+        the ``forward`` method of the module returned by the ``module_cls``.
+
+        Apart from ``forward_async`` and ``forward``, no other methods are supported from nn.Module for now.
+
+        Particularly, to create a hybrid model, typically the local modules should be
+        created outside of remote modules, rather than as submodules of any remote module (by calling ``add_module``).
+        Hybrid Example:
+                >>> class HybridModel(nn.Module):
+                >>>     def __init__(self):
+                >>>         nn.Module.__init__(self)
+                >>>         self.remote_embedding = RemoteModule(...)
+                >>>         self.local_linear = nn.Linear(...)
+
+        For example, if ``module_cls`` returns an instance of ``nn.Linear``,
+        that has ``forward`` method signature, ``def forward(input: Tensor) -> Tensor:``,
+        the generated ``RemoteModule`` will have 2 methods in signature of
+        ``def forward(input: Tensor) -> Tensor:`` and
+        ``def forward_async(input: Tensor) -> Future[Tensor]:``.
+
+        .. note::
+            If the remote module is placed on a cuda device,
+            any input CPU tensors will be automatically moved to the same cuda device,
+            and GPU tensors are returned over the wire according to the device map of the remote worker on TensorPipe RPC backend.
+
+        Args:
+            remote_device (str): Device on the destination worker where we'd like to place this module.
+                The device can be a local device or a remote device specified by one of the following remote
+                formats:
+
+                    1. "rank:/" (ex: "rank:0/cuda:0").
+                    2. "/" (ex: "trainer0/cuda:0").
+
+                In addition, the device field can be optional and the default value is "cpu".
+            module_cls (nn.Module): For example,
+                >>> class MyModule(nn.Module):
+                >>>     def forward(input):
+                >>>         return input + 1
+                >>>
+                >>> module_cls = MyModule
+            args (Sequence, optional): args to be passed to ``module_cls``.
+            kwargs (Dict, optional): kwargs to be passed to ``module_cls``.
+            _module_interface_cls (type, optional): The TorchScript interface type for the module
+                to be created. The type object should be decorated by @torch.jit.interface.
+                If not provided, the generated RemoteModule is not torchscript-able.
+                Warning, this is an experimental API and susceptible to frequent changes.
+
+        Returns:
+            A remote module instance which wraps the :class:`~nn.Module` created by the
+            user-provided ``module_cls``, it has a blocking ``forward`` method and an
+            asynchronous ``forward_async`` method that returns a future of the ``forward`` call
+            on the user-provided module on the remote side.
+
+        Example::
+            Run the following code in two different processes:
+
+            >>> # xdoctest: +SKIP("distributed")
+            >>> # On worker 0:
+            >>> import torch
+            >>> import torch.distributed.rpc as rpc
+            >>> from torch import nn, Tensor
+            >>> from torch.distributed.nn.api.remote_module import RemoteModule
+            >>>
+            >>> rpc.init_rpc("worker0", rank=0, world_size=2)
+            >>> remote_linear_module = RemoteModule(
+            >>>     "worker1/cpu", nn.Linear, args=(20, 30),
+            >>> )
+            >>> input = torch.randn(128, 20)
+            >>> ret_fut = remote_linear_module.forward_async(input)
+            >>> ret = ret_fut.wait()
+            >>> rpc.shutdown()
+
+            >>> # On worker 1:
+            >>> import torch
+            >>> import torch.distributed.rpc as rpc
+            >>>
+            >>> rpc.init_rpc("worker1", rank=1, world_size=2)
+            >>> rpc.shutdown()
+        """
+        super().__init__()
+
+        enable_moving_cpu_tensors_to_cuda = self._prepare_init(remote_device)
+
+        # Default arguments preparation.
+        args = args if args is not None else ()
+        kwargs = kwargs if kwargs is not None else {}
+
+        if _module_interface_cls is not None:
+            # Users rely on this field to know if this generated RemoteModule is TorchScript-able.
+            self.is_scriptable = True
+
+            # Instantiate template on remote side.
+            fut = rpc.rpc_async(
+                self.on,
+                _instantiate_template,
+                (_module_interface_cls, enable_moving_cpu_tensors_to_cuda),
+            )
+
+            self._init_template(
+                _module_interface_cls, enable_moving_cpu_tensors_to_cuda
+            )
+
+            # Instantiate template on remote side.
+            fut = rpc.rpc_async(
+                self.on,
+                _instantiate_template,
+                (_module_interface_cls, enable_moving_cpu_tensors_to_cuda),
+            )
+
+            # Create the module on the remote side.
+            fut.wait()  # Ensure remote_module_cls is available on remote side.
+
+            # TODO: We need to change this to rpc.remote, and make it async (see the else branch below).
+            # For that we need to be able to apply _module_interface_cls to the RRef returned by rpc.remote
+            # See https://github.com/pytorch/pytorch/issues/58098 for more context.
+            self.module_rref = rpc.rpc_sync(
+                self.on,
+                _create_module_with_interface,
+                (module_cls, args, kwargs, self.device, _module_interface_cls),
+            )
+        else:
+            self.is_scriptable = False
+            self.generated_methods = (
+                _NON_SCRIPTABLE_REMOTE_MODULE_MODULE._generated_methods
+            )
+            # Create the module on the remote side.
+            self.module_rref = rpc.remote(
+                self.on,
+                _create_module,
+                (module_cls, args, kwargs, self.device),
+            )
+
+        self._install_generated_methods()
+        self._check_attribute_picklability()
+
+    def remote_parameters(self, recurse: bool = True) -> List[rpc.RRef[Parameter]]:
+        """
+        Return a list of :class:`~torch.distributed.rpc.RRef` pointing to the remote module's parameters.
+
+        This can typically be used in conjunction
+        with :class:`~torch.distributed.optim.DistributedOptimizer`.
+
+        Args:
+            recurse (bool): if True, then returns parameters of the remote
+                module and all submodules of the remote module. Otherwise,
+                returns only parameters that are direct members of the
+                remote module.
+
+        Returns:
+            A list of :class:`~torch.distributed.rpc.RRef` (``List[RRef[nn.Parameter]]``)
+            to remote module's parameters.
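+
+        Example (a minimal sketch; assumes RPC is already initialized and
+        ``remote_module`` is an existing ``RemoteModule`` instance)::
+
+            >>> # xdoctest: +SKIP("distributed")
+            >>> from torch.distributed.optim import DistributedOptimizer
+            >>> opt = DistributedOptimizer(
+            >>>     torch.optim.SGD,
+            >>>     remote_module.remote_parameters(),
+            >>>     lr=0.05,
+            >>> )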
+        """
+        return rpc.rpc_sync(self.on, _param_rrefs, args=(self.module_rref, recurse))
+
+    def get_module_rref(self) -> rpc.RRef[nn.Module]:
+        """Return an :class:`~torch.distributed.rpc.RRef` (``RRef[nn.Module]``) pointing to the remote module."""
+        return self.module_rref
+
+    @torch.jit.export
+    def __getstate__(self):
+        raise RuntimeError(
+            "Cannot pickle RemoteModule in python pickler. RemoteModule can only be pickled when using RPC"
+        )
+
+    @torch.jit.export
+    def __setstate__(self, state):
+        raise RuntimeError(
+            "Cannot unpickle RemoteModule in python pickler. RemoteModule can only be unpickled when using RPC"
+        )
+
+    def register_buffer(
+        self, name: str, tensor: Optional[Tensor], persistent: bool = True
+    ) -> None:
+        _raise_not_supported(self.register_buffer.__name__)
+
+    def register_parameter(self, name: str, param: Optional[Parameter]) -> None:
+        _raise_not_supported(self.register_parameter.__name__)
+
+    def add_module(self, name: str, module: Optional[Module]) -> None:
+        _raise_not_supported(self.add_module.__name__)
+
+    def apply(self: T, fn: Callable[[Module], None]) -> T:  # type: ignore[return]
+        _raise_not_supported(self.apply.__name__)
+
+    def cuda(self: T, device: Optional[Union[int, device]] = None) -> T:  # type: ignore[return]
+        _raise_not_supported(self.cuda.__name__)
+
+    def ipu(self: T, device: Optional[Union[int, device]] = None) -> T:  # type: ignore[return]
+        _raise_not_supported(self.ipu.__name__)
+
+    def xpu(self: T, device: Optional[Union[int, device]] = None) -> T:  # type: ignore[return]
+        _raise_not_supported(self.xpu.__name__)
+
+    def cpu(self: T) -> T:  # type: ignore[return]
+        _raise_not_supported(self.cpu.__name__)
+
+    def type(self: T, dst_type: Union[dtype, str]) -> T:  # type: ignore[return]
+        _raise_not_supported(self.type.__name__)
+
+    def float(self: T) -> T:  # type: ignore[return]
+        _raise_not_supported(self.float.__name__)
+
+    def double(self: T) -> T:  # type: ignore[return]
+        _raise_not_supported(self.double.__name__)
+
+    def half(self: T) -> T:  # type: ignore[return]
+        _raise_not_supported(self.half.__name__)
+
+    def bfloat16(self: T) -> T:  # type: ignore[return]
+        _raise_not_supported(self.bfloat16.__name__)
+
+    def to(self, *args, **kwargs) -> T:  # type: ignore[misc, return, type-var]
+        _raise_not_supported(self.to.__name__)
+
+    def register_backward_hook(  # type: ignore[return]
+        self, hook: Callable[[Module, _grad_t, _grad_t], Union[None, _grad_t]]
+    ) -> RemovableHandle:
+        _raise_not_supported(self.register_backward_hook.__name__)
+
+    def register_forward_pre_hook(  # type: ignore[return]
+        self,
+        hook: Union[
+            Callable[[T, Tuple[Any, ...]], Optional[Any]],
+            Callable[[T, Tuple[Any, ...], Dict[str, Any]], Optional[Tuple[Any, Dict[str, Any]]]],
+        ],
+        prepend: bool = False,
+        with_kwargs: bool = False,
+    ) -> RemovableHandle:
+        _raise_not_supported(self.register_forward_pre_hook.__name__)
+
+    def register_forward_hook(  # type: ignore[return, override]
+        self,
+        hook: Union[
+            Callable[[T, Tuple[Any, ...], Any], Optional[Any]],
+            Callable[[T, Tuple[Any, ...], Dict[str, Any], Any], Optional[Any]],
+        ],
+        prepend: bool = False,
+        with_kwargs: bool = False,
+    ) -> RemovableHandle:
+        _raise_not_supported(self.register_forward_hook.__name__)
+
+    def state_dict(self, *args, **kwargs):
+        _raise_not_supported(self.state_dict.__name__)
+
+    def load_state_dict(
+        self,
+        state_dict: Mapping[str, Any],
+        strict: bool = True,
+        assign: bool = False,
+    ):
+        _raise_not_supported(self.load_state_dict.__name__)
+
+    def parameters(self, recurse: bool = True) -> Iterator[Parameter]:
+        raise ValueError(
+            "Method ``parameters`` not supported for RemoteModule. Please use ``remote_parameters`` instead."
+        )
+
+    def named_parameters(  # type: ignore[return]
+        self,
+        prefix: str = "",
+        recurse: bool = True,
+        remove_duplicate: bool = True
+    ) -> Iterator[Tuple[str, Parameter]]:
+        _raise_not_supported(self.named_parameters.__name__)
+
+    def buffers(self, recurse: bool = True) -> Iterator[Tensor]:  # type: ignore[return]
+        _raise_not_supported(self.buffers.__name__)
+
+    def named_buffers(  # type: ignore[return]
+        self,
+        prefix: str = "",
+        recurse: bool = True,
+        remove_duplicate: bool = True
+    ) -> Iterator[Tuple[str, Tensor]]:
+        _raise_not_supported(self.named_buffers.__name__)
+
+    def children(self) -> Iterator[Module]:  # type: ignore[return]
+        _raise_not_supported(self.children.__name__)
+
+    def named_children(self) -> Iterator[Tuple[str, Module]]:  # type: ignore[return]
+        _raise_not_supported(self.named_children.__name__)
+
+    def modules(self) -> Iterator[Module]:  # type: ignore[return]
+        _raise_not_supported(self.modules.__name__)
+
+    def named_modules(
+        self,
+        memo: Optional[Set[Module]] = None,
+        prefix: str = "",
+        remove_duplicate: bool = True,
+    ):
+        _raise_not_supported(self.named_modules.__name__)
+
+    def train(self: T, mode: bool = True) -> T:
+        return self.module_rref.rpc_sync().train()  # type: ignore[operator, union-attr]
+
+    def eval(self: T) -> T:
+        return self.module_rref.rpc_sync().eval()  # type: ignore[operator, union-attr]
+
+    def requires_grad_(self: T, requires_grad: bool = True) -> T:  # type: ignore[return]
+        _raise_not_supported(self.requires_grad_.__name__)
+
+    def zero_grad(self, set_to_none: bool = True) -> None:
+        _raise_not_supported(self.zero_grad.__name__)
+
+    def share_memory(self: T) -> T:  # type: ignore[return]
+        _raise_not_supported(self.share_memory.__name__)
+
+    def extra_repr(self) -> str:  # type: ignore[return]
+        _raise_not_supported(self.extra_repr.__name__)
+
+    def _prepare_init(self, remote_device_str: str) -> bool:
+        """Prepare the initialization and returns whether to enable automatically moving CPU tensors to CUDA devices."""
+        # Sanity check.
+        assert rpc._is_current_rpc_agent_set(), "RemoteModule only works in RPC."
+
+        remote_device = _remote_device(remote_device_str)
+        self.on = remote_device.worker_name() if remote_device.worker_name() is not None else remote_device.rank()
+        self.device = str(remote_device.device())
+        agent = rpc._get_current_rpc_agent()
+        # If the device map of the remote worker is set,
+        # then enable moving any input CPU tensors to the same cuda device.
+        self.is_device_map_set = bool(
+            agent._get_device_map(agent.get_worker_info(self.on))  # type: ignore[arg-type]
+        )
+        # ``enable_moving_cpu_tensors_to_cuda`` is less strict than ``is_device_map_set``:
+        # If ``enable_moving_cpu_tensors_to_cuda`` is true, but the device map is not set,
+        # then any CPU tensors can still be moved to a cuda device to run forward,
+        # but the output must be moved back to CPU before being sent over the wire.
+        enable_moving_cpu_tensors_to_cuda = torch.device(self.device).type == "cuda"
+        return enable_moving_cpu_tensors_to_cuda
+
+    def _init_template(self, module_interface_cls, enable_moving_cpu_tensors_to_cuda):
+        """Instantiate template on local side."""
+        generated_module = instantiator.instantiate_scriptable_remote_module_template(
+            module_interface_cls, enable_moving_cpu_tensors_to_cuda
+        )
+        self.generated_methods = generated_module._generated_methods
+
+    def _check_attribute_picklability(self):
+        """Check if all the attribute has explicitly defined whether to be pickled (i.e., picklability)."""
+        for k in self.__dict__.keys():
+            if (
+                k not in _REMOTE_MODULE_PICKLED_ATTRIBUTES
+                and k not in _REMOTE_MODULE_ATTRIBUTES_IGNORE_FOR_PICKLING
+            ):
+                raise AttributeError(
+                    f"Attribute {k} must be either in ``_REMOTE_MODULE_PICKLED_ATTRIBUTES`` or "
+                    "``_REMOTE_MODULE_ATTRIBUTES_IGNORE_FOR_PICKLING``."
+                )
+
+    def _install_generated_methods(self):
+        for method in self.generated_methods:
+            method_name = method.__name__
+            method = torch.jit.export(method)
+            setattr(self, method_name, types.MethodType(method, self))
+
+    @staticmethod
+    def init_from_module_rref(
+        remote_device: str,
+        module_rref: rpc.RRef[nn.Module],
+        _module_interface_cls: Any = None,
+    ):
+        """
+        Besides the constructor, a RemoteModule instance can also be initialized given a module RRef.
+
+        This alternate initialization method can be particularly useful if we want to create multiple
+        RemoteModule instances that share the same underlying module and reduce memory consumption.
+
+        Moreover, this also provides a workaround for passing script RemoteModule over RPC,
+        which is not supported. The recommended way is as follows:
+
+            1. the sender creates a RemoteModule;
+            2. the sender sends its ``module_rref`` over RPC;
+            3. the receiver calls this method to initialize another RemoteModule using the same ``module_rref``.
+
+        Example::
+            Run the following code in two different processes:
+
+            >>> # xdoctest: +SKIP("distributed")
+            >>> # On worker 0:
+            >>> import torch
+            >>> import torch.distributed.rpc as rpc
+            >>> from torch import nn, Tensor
+            >>> from torch.distributed.nn.api.remote_module import RemoteModule
+            >>>
+            >>> rpc.init_rpc("worker0", rank=0, world_size=2)
+            >>> remote_module = RemoteModule(
+            >>>     "worker1/cpu", nn.Linear, args=(20, 30),
+            >>> )
+            >>>
+            >>> remote_module1 = rpc.rpc_sync(
+            >>>     "worker1/cpu",
+            >>>     RemoteModule.init_from_module_rref,
+            >>>     ("worker1/cpu", remote_module1.get_module_rref()),
+            >>> )
+            >>> rpc.shutdown()
+
+            >>> # On worker 1:
+            >>> import torch
+            >>> import torch.distributed.rpc as rpc
+            >>>
+            >>> rpc.init_rpc("worker1", rank=1, world_size=2)
+            >>> rpc.shutdown()
+
+        Args:
+            remote_device (str): Device on the destination worker where we'd like to place this module.
+                The device can be a local device or a remote device specified by one of the following remote
+                formats:
+
+                    1. "rank:/" (ex: "rank:0/cuda:0").
+                    2. "/" (ex: "trainer0/cuda:0").
+
+                In addition, the device field can be optional and the default value is "cpu".
+            module_rref (RRef[nn.Module]): The module reference shared by both the caller and
+                the created remote module.
+            _module_interface_cls (type, optional): The TorchScript interface type for the module
+                to be created. The type object should be decorated by @torch.jit.interface.
+                If not provided, the generated RemoteModule is not torchscript-able.
+                Warning, this is an experimental API and susceptible to frequent changes.
+
+        Returns:
+            A remote module instance which wraps the :class:`~nn.Module` created by the
+            user-provided ``module_rref``, it has a blocking ``forward`` method and an
+            asynchronous ``forward_async`` method that returns a future of the ``forward`` call
+            on the user-provided module on the remote side.
+        """
+        # NOTE: if a new attribute is added to this class, also need to add it
+        # to ``_REMOTE_MODULE_PICKLED_ATTRIBUTES`` for pickling/unpickling.
+
+        remote_module = object.__new__(RemoteModule)
+
+        enable_moving_cpu_tensors_to_cuda = remote_module._prepare_init(remote_device)
+
+        if _module_interface_cls is not None:
+            # Users rely on this field to know if this generated RemoteModule is TorchScript-able.
+            remote_module.is_scriptable = True
+
+            remote_module._init_template(
+                _module_interface_cls, enable_moving_cpu_tensors_to_cuda
+            )
+        else:
+            remote_module.is_scriptable = False
+            remote_module.generated_methods = (
+                _NON_SCRIPTABLE_REMOTE_MODULE_MODULE._generated_methods
+            )
+        remote_module.module_rref = module_rref
+
+        remote_module._install_generated_methods()
+        remote_module._check_attribute_picklability()
+
+        return remote_module
+
+
+class RemoteModule(_RemoteModule):
+    """
+        A RemoteModule instance can only be created after RPC initialization.
+
+        It creates a user-specified module on a specified remote node.
+        It behaves like a regular ``nn.Module`` except that the ``forward`` method is
+        executed on the remote node.
+        It takes care of autograd recording to ensure the backward pass propagates
+        gradients back to the corresponding remote module.
+
+        It generates two methods ``forward_async`` and ``forward`` based on the
+        signature of the ``forward`` method of ``module_cls``. ``forward_async``
+        runs asynchronously and returns a Future. The arguments of ``forward_async``
+        and ``forward`` are the same as the ``forward`` method of the module
+        returned by the ``module_cls``.
+
+        For example, if ``module_cls`` returns an instance of ``nn.Linear``,
+        that has ``forward`` method signature: ``def forward(input: Tensor) -> Tensor:``,
+        the generated ``RemoteModule`` will have 2 methods with the signatures:
+
+        | ``def forward(input: Tensor) -> Tensor:``
+        | ``def forward_async(input: Tensor) -> Future[Tensor]:``
+
+    Args:
+        remote_device (str): Device on the destination worker where we'd like to place this module.
+            The format should be "<worker_name>/<device>", where the device field can be parsed as torch.device type.
+            E.g., "trainer0/cpu", "trainer0", "ps0/cuda:0".
+            In addition, the device field can be optional and the default value is "cpu".
+        module_cls (nn.Module): Class for the module to be created remotely. For example,
+
+            >>> class MyModule(nn.Module):
+            >>>     def forward(input):
+            >>>         return input + 1
+            >>>
+            >>> module_cls = MyModule
+
+        args (Sequence, optional): args to be passed to ``module_cls``.
+        kwargs (Dict, optional): kwargs to be passed to ``module_cls``.
+
+    Returns:
+        A remote module instance which wraps the :class:`~nn.Module` created by the
+        user-provided ``module_cls``, it has a blocking ``forward`` method and an
+        asynchronous ``forward_async`` method that returns a future of the ``forward`` call
+        on the user-provided module on the remote side.
+
+    Example::
+        Run the following code in two different processes:
+
+        >>> # xdoctest: +SKIP("distributed")
+        >>> # On worker 0:
+        >>> import torch
+        >>> import torch.distributed.rpc as rpc
+        >>> from torch import nn, Tensor
+        >>> from torch.distributed.nn.api.remote_module import RemoteModule
+        >>>
+        >>> rpc.init_rpc("worker0", rank=0, world_size=2)
+        >>> remote_linear_module = RemoteModule(
+        >>>     "worker1/cpu", nn.Linear, args=(20, 30),
+        >>> )
+        >>> input = torch.randn(128, 20)
+        >>> ret_fut = remote_linear_module.forward_async(input)
+        >>> ret = ret_fut.wait()
+        >>> rpc.shutdown()
+
+        >>> # On worker 1:
+        >>> import torch
+        >>> import torch.distributed.rpc as rpc
+        >>>
+        >>> rpc.init_rpc("worker1", rank=1, world_size=2)
+        >>> rpc.shutdown()
+
+        Furthermore, a more practical example that combines RemoteModule with
+        DistributedDataParallel (DDP) can be found in the PyTorch RPC tutorials.
+    """
+
+    def __init__(
+        self,
+        remote_device: str,
+        module_cls: Type[nn.Module],
+        args: Optional[Tuple] = None,
+        kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        super().__init__(remote_device, module_cls, args, kwargs)
+
+
+def _remote_module_receiver(
+    *remote_module_pickled_attrs,
+):
+    """Deserializes a RemoteModule."""
+    serialized_remote_module = _SerializedRemoteModule._make(
+        remote_module_pickled_attrs
+    )
+    m = object.__new__(RemoteModule)
+    m.__dict__.update(serialized_remote_module._asdict())
+
+    # Unpickling the attribute `module_rref` must invoke RRef's `_deserialize()` method.
+    m.module_rref = rpc.PyRRef._deserialize(m.module_rref)
+
+    # Install generated methods when unpickled.
+    for method in m.generated_methods:
+        method_name = method.__name__
+        method = torch.jit.export(method)
+        setattr(m, method_name, types.MethodType(method, m))
+
+    return m
+
+
+def _remote_module_reducer(remote_module):
+    """Serialize a RemoteModule."""
+    pickled_attrs = {}
+    for k, v in remote_module.__dict__.items():
+        # Pickling the attribute `module_rref` must invoke RRef's `_serialize()` method.
+        if k == "module_rref":
+            pickled_attrs[k] = v._serialize()
+        elif k in _REMOTE_MODULE_PICKLED_ATTRIBUTES:
+            pickled_attrs[k] = v
+        # Check if unpickled attributes are all in _REMOTE_MODULE_ATTRIBUTES_IGNORE_FOR_PICKLING.
+        elif k not in _REMOTE_MODULE_ATTRIBUTES_IGNORE_FOR_PICKLING:
+            print(
+                f"The new attribute ``{k}`` of RemoteModule is ignored during RPC pickling. "
+                "To pickle this attribute, please add it to ``_REMOTE_MODULE_PICKLED_ATTRIBUTES``. "
+                "Otherwise, please explicitly add it to ``_REMOTE_MODULE_ATTRIBUTES_IGNORE_FOR_PICKLING``.",
+                file=sys.stderr,
+            )
+
+    return (
+        _remote_module_receiver,
+        tuple(pickled_attrs.values()),
+    )
+
+
+def _recursive_script_module_receiver(
+    recursive_script_module_serialized,
+):
+    """Deserializes a RecursiveScriptModule that does not contain a script RemoteModule."""
+    f = io.BytesIO(recursive_script_module_serialized)
+    m = torch.jit.load(f)
+    return m
+
+
+def _recursive_script_module_reducer(recursive_script_module):
+    """Serialize a RecursiveScriptModule that does not contain a script RemoteModule, and raises an error otherwise."""
+    if hasattr(recursive_script_module._c, "module_rref"):
+        raise RuntimeError(
+            "Passing a script RemoteModule over RPC is not supported. Please create a RemoteModule in the sender, "
+            "send the `module_rref` to the receiver, and create a new instance on the receiver end by passing this `module_rref`."
+        )
+
+    f = io.BytesIO()
+    torch.jit.save(recursive_script_module, f)
+    return (_recursive_script_module_receiver, (f.getvalue(),))
+
+
+_internal_rpc_pickler._register_reducer(RemoteModule, _remote_module_reducer)
+_internal_rpc_pickler._register_reducer(
+    torch.jit.RecursiveScriptModule, _recursive_script_module_reducer
+)
diff --git a/MLPY/Lib/site-packages/torch/distributed/nn/functional.py b/MLPY/Lib/site-packages/torch/distributed/nn/functional.py
new file mode 100644
index 0000000000000000000000000000000000000000..4477bc62583b9a473d9077203362fb6e8e957b77
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/nn/functional.py
@@ -0,0 +1,440 @@
+import torch
+import torch.distributed as dist
+from torch.autograd import Function
+# The two imports below are not always available depending on the
+# USE_DISTRIBUTED compile flag. Make sure they raise import error
+# if we're trying to use them.
+from torch.distributed import group, ReduceOp
+
+def broadcast(tensor, src, group=group.WORLD):
+    """
+    Broadcasts the tensor to the whole group.
+
+    ``tensor`` must have the same number of elements in all processes
+    participating in the collective.
+
+    Arguments:
+        tensor (Tensor): Data to be sent if ``src`` is the rank of current
+            process.
+        src (int): Source rank.
+        group (ProcessGroup, optional): The process group to work on.
+
+    Returns:
+        Tensor: Received tensor from the broadcast op.
+
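+    Example (illustrative sketch; assumes a 2-rank default process group has
+    already been initialized, e.g. via ``torchrun``, and that this module is
+    imported as ``nnF``)::
+
+        >>> # xdoctest: +SKIP("distributed")
+        >>> import torch
+        >>> import torch.distributed as dist
+        >>> import torch.distributed.nn.functional as nnF
+        >>> x = torch.full((2,), float(dist.get_rank() + 1), requires_grad=True)
+        >>> y = nnF.broadcast(x, 0)   # every rank now holds rank 0's data
+        >>> y.sum().backward()        # gradients flow back to ``x`` on rank 0
+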
+    """
+    return _Broadcast.apply(src, group, tensor)
+
+
+def gather(tensor, dst=0, group=group.WORLD):
+    """
+    Gathers a list of tensors in a single process.
+
+    Arguments:
+        tensor (Tensor): Input tensor.
+        dst (int, optional): Destination rank (default is 0).
+        group (ProcessGroup, optional): The process group to work on.
+
+    Returns:
+        tuple[Tensor]: List of appropriately-sized tensors with the gathered data.
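+
+    Example (illustrative sketch; assumes the same 2-rank setup and ``nnF``
+    alias as in :func:`broadcast`)::
+
+        >>> # xdoctest: +SKIP("distributed")
+        >>> x = torch.full((2,), float(dist.get_rank()), requires_grad=True)
+        >>> gathered = nnF.gather(x, dst=0)   # tuple of per-rank tensors; data is meaningful on ``dst``
+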
+    """
+    return _Gather.apply(dst, group, tensor)
+
+
+def scatter(tensors, src=0, group=group.WORLD):
+    """
+    Scatters a list of tensors to all processes in a group.
+
+    Each process will receive exactly one tensor, which is returned as the
+    output of this function.
+
+    Arguments:
+        tensors (list[Tensor]): List of tensors to scatter on the source rank.
+            Non-source ranks must pass a list of same-shaped tensors as well;
+            their values are ignored and only used to size the output.
+        src (int, optional): Source rank (default is 0).
+        group (ProcessGroup, optional): The process group to work on.
+
+    Returns:
+        Tensor: Output tensor from the scatter operation.
+
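+    Example (illustrative sketch; assumes the same 2-rank setup and ``nnF``
+    alias as in :func:`broadcast`; every rank builds a same-shaped list, since
+    the implementation only uses its values on ``src``)::
+
+        >>> # xdoctest: +SKIP("distributed")
+        >>> inputs = [torch.ones(2) * (i + 1) for i in range(dist.get_world_size())]
+        >>> out = nnF.scatter(inputs, src=0)   # rank ``r`` receives ``inputs[r]`` from rank 0
+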
+    """
+    return _Scatter.apply(src, group, *tensors)
+
+
+def reduce(tensor, dst, op=ReduceOp.SUM, group=group.WORLD):
+    """
+    Reduces the tensor data across all machines.
+
+    Only the process with rank ``dst`` is going to receive the final result.
+
+    Arguments:
+        tensor (Tensor): Input of the collective.
+        dst (int): Destination rank.
+        op (optional): One of the values from
+            ``torch.distributed.ReduceOp``
+            enum.  Specifies an operation used for element-wise reductions.
+        group (ProcessGroup, optional): The process group to work on.
+
+    Returns:
+        Tensor: Output of the collective.
+
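+    Example (illustrative sketch; assumes the same 2-rank setup and ``nnF``
+    alias as in :func:`broadcast`)::
+
+        >>> # xdoctest: +SKIP("distributed")
+        >>> x = torch.ones(2, requires_grad=True)
+        >>> y = nnF.reduce(x, dst=0)   # element-wise sum over ranks; only meaningful on rank 0
+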
+    """
+    return _Reduce.apply(dst, op, group, tensor)
+
+
+def reduce_scatter(output, input_list, op=ReduceOp.SUM, group=group.WORLD):
+    """
+    Reduces, then scatters a list of tensors to all processes in a group.
+
+    Arguments:
+        output (Tensor): Output tensor.
+        input_list (list[Tensor]): List of tensors to reduce and scatter.
+        op (optional): One of the values from
+            ``torch.distributed.ReduceOp``
+            enum.  Specifies an operation used for element-wise reductions.
+        group (ProcessGroup, optional): The process group to work on.
+
+    Returns:
+        Tensor: Output of the collective.
+
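+    Example (illustrative sketch; assumes the same 2-rank setup and ``nnF``
+    alias as in :func:`broadcast`)::
+
+        >>> # xdoctest: +SKIP("distributed")
+        >>> world_size = dist.get_world_size()
+        >>> inputs = [torch.ones(2) * (r + 1) for r in range(world_size)]
+        >>> out = nnF.reduce_scatter(torch.zeros(2), inputs)   # rank ``r`` gets the sum over ranks of ``inputs[r]``
+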
+    """
+    return _Reduce_Scatter.apply(op, group, output, *input_list)
+
+
+def all_gather(tensor, group=group.WORLD):
+    """
+    Gathers tensors from the whole group in a list.
+
+    Arguments:
+        tensor (Tensor): Tensor to be broadcast from current process.
+        group (ProcessGroup, optional): The process group to work on.
+
+    Returns:
+        tuple[Tensor]: Output of the collective.
+
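+    Example (illustrative sketch; assumes the same 2-rank setup and ``nnF``
+    alias as in :func:`broadcast`)::
+
+        >>> # xdoctest: +SKIP("distributed")
+        >>> x = torch.full((2,), float(dist.get_rank()), requires_grad=True)
+        >>> ys = nnF.all_gather(x)             # ys[r] holds rank r's tensor on every rank
+        >>> torch.stack(ys).sum().backward()   # gradients flow back to every rank's ``x``
+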
+    """
+    return _AllGather.apply(group, tensor)
+
+def _all_gather_base(output_tensor, input_tensor, group=group.WORLD):
+    """
+    Single tensor all gather. Gathers a single tensor from all ranks, and puts them in a single output tensor.
+
+    Args:
+        output_tensor (Tensor): Output tensor. It should be correctly sized to
+            hold the concatenated output of the collective.
+        input_tensor (Tensor): Tensor to be gathered from the current process.
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used.
+
+    Examples:
+        >>> # All tensors below are of torch.int64 dtype.
+        >>> # We have a single process group with 2 ranks.
+        >>> # xdoctest: +SKIP("distributed")
+        >>> rank = dist.get_rank()
+        >>> output_tensor = torch.zeros(2, dtype=torch.int64)
+        >>> output_tensor
+        tensor([0, 0]) # Rank 0 and 1
+        >>> tensor = torch.arange(1, dtype=torch.int64) + 1 + rank
+        >>> tensor
+        tensor([1]) # Rank 0
+        tensor([2]) # Rank 1
+        >>> output_tensor = _all_gather_base(output_tensor, tensor)
+        >>> output_tensor
+        tensor([1, 2]) # Rank 0
+        tensor([1, 2]) # Rank 1
+
+    .. warning::
+        `_all_gather_base` is experimental and subject to change.
+        It is the caller's responsibility to ensure the output_tensor
+        is correctly sized.
+
+    """
+    return _AllGatherBase.apply(output_tensor, input_tensor, group)
+
+
+def all_to_all(output_tensor_list, input_tensor_list, group=group.WORLD):
+    """
+    Each process scatters a list of input tensors to all processes in the group and returns the gathered list of tensors in the output list.
+
+    Arguments:
+        output_tensor_list (list[Tensor]): List of tensors to gather, one per rank.
+        input_tensor_list (list[Tensor]): List of tensors to scatter, one per rank.
+        group (ProcessGroup, optional): The process group to work on.
+
+    Returns:
+        tuple[Tensor]: Output of the collective.
+
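+    Example (illustrative sketch; assumes the same 2-rank setup and ``nnF``
+    alias as in :func:`broadcast`)::
+
+        >>> # xdoctest: +SKIP("distributed")
+        >>> world_size = dist.get_world_size()
+        >>> inputs = [torch.full((2,), float(dist.get_rank())) for _ in range(world_size)]
+        >>> outputs = [torch.empty(2) for _ in range(world_size)]
+        >>> outputs = nnF.all_to_all(outputs, inputs)   # outputs[r] was sent by rank r
+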
+    """
+    return _AlltoAll.apply(group, output_tensor_list, *input_tensor_list)
+
+
+def all_to_all_single(
+    output,
+    input,
+    output_split_sizes=None,
+    input_split_sizes=None,
+    group=group.WORLD,
+):
+    """
+    Each process splits the input tensor and then scatters the split list to all processes in the group.
+
+    It then concatenates the received tensors from all processes in the group and returns a single output tensor.
+
+    Arguments:
+        output (Tensor): Gathered concatenated output tensor.
+        input (Tensor): Input tensor to scatter.
+        output_split_sizes (list[int], optional): Output split sizes for dim 0.
+            If None or empty, dim 0 of the ``output`` tensor must be divisible
+            by ``world_size``.
+        input_split_sizes (list[int], optional): Input split sizes for dim 0.
+            If None or empty, dim 0 of the ``input`` tensor must be divisible
+            by ``world_size``.
+        group (ProcessGroup, optional): The process group to work on.
+
+    Returns:
+        Tensor: Output of the collective.
+
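+    Example (illustrative sketch; assumes the same 2-rank setup and ``nnF``
+    alias as in :func:`broadcast`)::
+
+        >>> # xdoctest: +SKIP("distributed")
+        >>> world_size = dist.get_world_size()
+        >>> x = torch.arange(world_size, dtype=torch.float32) + world_size * dist.get_rank()
+        >>> out = nnF.all_to_all_single(torch.empty(world_size), x)   # out[r] is the chunk sent by rank r
+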
+    """
+    return _AlltoAllSingle.apply(
+        group, output, output_split_sizes, input_split_sizes, input
+    )
+
+
+def all_reduce(tensor, op=ReduceOp.SUM, group=group.WORLD):
+    """
+    Reduces the tensor data across all machines in such a way that all get the final result.
+
+    After the call the returned tensor is going to be bitwise
+    identical in all processes.
+
+    Arguments:
+        tensor (Tensor): Input of the collective.
+        op (optional): One of the values from
+            ``torch.distributed.ReduceOp``
+            enum.  Specifies an operation used for element-wise reductions.
+        group (ProcessGroup, optional): The process group to work on.
+
+    Returns:
+        Tensor: Output of the collective
+
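+    Example (illustrative sketch; assumes the same 2-rank setup and ``nnF``
+    alias as in :func:`broadcast`)::
+
+        >>> # xdoctest: +SKIP("distributed")
+        >>> x = torch.full((2,), float(dist.get_rank() + 1), requires_grad=True)
+        >>> y = nnF.all_reduce(x)   # every rank now holds the element-wise sum over ranks
+        >>> y.sum().backward()      # the backward pass all-reduces the gradient as well
+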
+    """
+    return _AllReduce.apply(op, group, tensor)
+
+
+class _Broadcast(Function):
+    @staticmethod
+    def forward(ctx, src, group, tensor):
+        ctx.src = src
+        ctx.group = group
+        ctx.rank = dist.get_rank(group=group)
+        # torch.distributed makes all the calls in place
+        # we allocate new tensors to avoid this
+        tensor = tensor.clone()
+        dist.broadcast(tensor, src, group=group)
+        return tensor
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        gx = _Reduce.apply(ctx.src, ReduceOp.SUM, ctx.group, grad_output)
+        if ctx.src != ctx.rank:
+            gx.zero_()
+        return (None, None, gx)
+
+
+class _Gather(Function):
+    @staticmethod
+    def forward(ctx, dst, group, tensor):
+        ctx.dst = dst
+        ctx.group = group
+        # Need to create a list of tensors here to do the
+        # aggregation, get it from the group size
+        # tensor should be correctly sized for the method
+        # gathering
+        tensor_list = [
+            torch.zeros_like(tensor) for i in range(dist.get_world_size(group=group))
+        ]
+
+        tensor = tensor.contiguous()
+        if dist.get_rank(group=group) == dst:
+            dist.gather(tensor, tensor_list, dst, group=group)
+        else:
+            dist.gather(tensor, None, dst, group=group)
+        return tuple(tensor_list)
+
+    @staticmethod
+    def backward(ctx, *grad_outputs):
+        return (None, None) + (_Scatter.apply(ctx.dst, ctx.group, *grad_outputs),)
+
+
+class _Scatter(Function):
+    @staticmethod
+    def forward(ctx, src, group, *tensors):
+        ctx.src = src
+        ctx.group = group
+        assert all(t.size() == tensors[0].size() for t in tensors)
+        output = torch.zeros_like(tensors[0])
+        if dist.get_rank(group=group) == src:
+            dist.scatter(output, list(tensors), src, group=group)
+        else:
+            dist.scatter(output, None, src, group=group)
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return (None, None) + _Gather.apply(ctx.src, ctx.group, grad_output)
+
+
+class _Reduce(Function):
+    @staticmethod
+    def forward(ctx, src, op, group, tensor):
+        ctx.src = src
+        ctx.group = group
+        tensor = tensor.clone()
+        dist.reduce(tensor, src, op=op, group=group)
+        return tensor
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return (None, None, None) + (_Broadcast.apply(ctx.src, ctx.group, grad_output),)
+
+
+class _Reduce_Scatter(Function):
+    @staticmethod
+    def forward(ctx, op, group, tensor, *input_tensor_list):
+        ctx.group = group
+        # Need contiguous tensors for collectives.
+        tensor = tensor.contiguous()
+        input_tensor_list = tuple(t.contiguous() for t in input_tensor_list)
+        dist.reduce_scatter(tensor, list(input_tensor_list), op=op, group=group)
+        return tensor
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return (None, None, None) + _AllGather.apply(ctx.group, grad_output)
+
+
+class _AllGather(Function):
+    @staticmethod
+    def forward(ctx, group, tensor):
+        # Need contiguous tensors for collectives.
+        tensor = tensor.contiguous()
+
+        ctx.group = group
+        out_tensor_list = [
+            torch.empty_like(tensor) for _ in range(dist.get_world_size(group=group))
+        ]
+
+        dist.all_gather(out_tensor_list, tensor, group=group)
+        return tuple(out_tensor_list)
+
+    @staticmethod
+    def backward(ctx, *grad_outputs):
+        if dist.get_backend(group=ctx.group) is dist.Backend.NCCL:
+            rank = dist.get_rank(group=ctx.group)
+            gx = torch.empty_like(grad_outputs[rank])
+            gx = _Reduce_Scatter.apply(ReduceOp.SUM, ctx.group, gx, *grad_outputs)
+        else:
+            # As many backends don't support ReduceScatter, we use AlltoAll followed
+            # by a sum to emulate the ReduceScatter behavior.
+            tensor_list = [torch.empty_like(tensor) for tensor in grad_outputs]
+            gxs = _AlltoAll.apply(ctx.group, tensor_list, *grad_outputs)
+            gx = torch.sum(torch.stack(gxs), dim=0)
+        return (None, gx)
+
+class _AllGatherBase(Function):
+    @staticmethod
+    def forward(ctx, output_tensor, input_tensor, group):
+        ctx.group = group
+        dist._all_gather_base(output_tensor, input_tensor.contiguous(), group=group)
+        return output_tensor
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        if dist.get_backend(group=ctx.group) is dist.Backend.NCCL:
+            world_size = dist.get_world_size(group=ctx.group)
+            out_size = list(grad_output.size())
+            if out_size[0] % world_size != 0:
+                raise RuntimeError(
+                    f'Tensor with dimensions: {out_size} does '
+                    f'not have first dimension divisible by world_size: {world_size}'
+                )
+            out_size[0] = out_size[0] // dist.get_world_size(group=ctx.group)
+            gx = torch.empty(out_size, device=grad_output.device, dtype=grad_output.dtype)
+            dist._reduce_scatter_base(gx, grad_output, ReduceOp.SUM, ctx.group)
+        else:
+            raise RuntimeError("Backend not supported!")
+        return (None, gx, None)
+
+class _AlltoAll(Function):
+    @staticmethod
+    def forward(ctx, group, out_tensor_list, *tensors):
+        ctx.group = group
+        ctx.input_tensor_size_list = [
+            tensors[i].size() for i in range(dist.get_world_size(group=group))
+        ]
+        my_rank = dist.get_rank(group=group)
+        tensors = tuple(t.contiguous() for t in tensors)
+        # Implement it by means of scatter/gather; async send/recv operations have issues.
+        if dist.get_backend(group=group) is dist.Backend.GLOO:
+            for i in range(dist.get_world_size(group=group)):
+                to_send = None
+                if i == my_rank:
+                    to_send = list(tensors)
+                dist.scatter(out_tensor_list[i], to_send, i, group=group)
+        else:
+            dist.all_to_all(
+                out_tensor_list,
+                list(tensors),
+                group=group,
+            )
+        return tuple(out_tensor_list)
+
+    @staticmethod
+    def backward(ctx, *grad_outputs):
+        tensor_list = [
+            torch.empty(size, device=grad_outputs[0].device, dtype=grad_outputs[0].dtype)
+            for size in ctx.input_tensor_size_list
+        ]
+        return (None, None) + _AlltoAll.apply(ctx.group, tensor_list, *grad_outputs)
+
+
+class _AlltoAllSingle(Function):
+    @staticmethod
+    def forward(ctx, group, output, output_split_sizes, input_split_sizes, input):
+        ctx.group = group
+        ctx.input_size = input.size()
+        ctx.output_split_sizes = input_split_sizes
+        ctx.input_split_sizes = output_split_sizes
+        dist.all_to_all_single(
+            output,
+            input,
+            output_split_sizes=output_split_sizes,
+            input_split_sizes=input_split_sizes,
+            group=group,
+        )
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        tensor = torch.empty(ctx.input_size, device=grad_output.device, dtype=grad_output.dtype)
+        return (None, None, None, None) + (
+            _AlltoAllSingle.apply(
+                ctx.group,
+                tensor,
+                ctx.output_split_sizes,
+                ctx.input_split_sizes,
+                grad_output.contiguous(),
+            ),
+        )
+
+
+class _AllReduce(Function):
+    @staticmethod
+    def forward(ctx, op, group, tensor):
+        ctx.group = group
+        ctx.op = op
+        tensor = tensor.clone()
+        dist.all_reduce(tensor, op=op, group=group)
+        return tensor
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        return (None, None) + (_AllReduce.apply(ctx.op, ctx.group, grad_output),)
diff --git a/MLPY/Lib/site-packages/torch/distributed/nn/jit/__init__.py b/MLPY/Lib/site-packages/torch/distributed/nn/jit/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/distributed/nn/jit/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/nn/jit/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c59e68425d9a7692d862a58dcc06b32c3c91e2b6
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/nn/jit/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/nn/jit/__pycache__/instantiator.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/nn/jit/__pycache__/instantiator.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f251304788b835010037a9724d1702a808d2f921
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/nn/jit/__pycache__/instantiator.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/nn/jit/instantiator.py b/MLPY/Lib/site-packages/torch/distributed/nn/jit/instantiator.py
new file mode 100644
index 0000000000000000000000000000000000000000..56121cc2cd57ec38521f31c00881cf826e4ea59d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/nn/jit/instantiator.py
@@ -0,0 +1,153 @@
+#!/usr/bin/python3
+import importlib
+import logging
+import os
+import sys
+import tempfile
+from typing import Optional
+
+import torch
+from torch.distributed.nn.jit.templates.remote_module_template import (
+    get_remote_module_template,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+_FILE_PREFIX = "_remote_module_"
+_TEMP_DIR = tempfile.TemporaryDirectory()
+INSTANTIATED_TEMPLATE_DIR_PATH = _TEMP_DIR.name
+logger.info("Created a temporary directory at %s", INSTANTIATED_TEMPLATE_DIR_PATH)
+sys.path.append(INSTANTIATED_TEMPLATE_DIR_PATH)
+
+
+def get_arg_return_types_from_interface(module_interface):
+    assert getattr(
+        module_interface, "__torch_script_interface__", False
+    ), "Expect a TorchScript class interface decorated by @torch.jit.interface."
+    qualified_name = torch._jit_internal._qualified_name(module_interface)
+    cu = torch.jit._state._python_cu
+    module_interface_c = cu.get_interface(qualified_name)
+    assert (
+        "forward" in module_interface_c.getMethodNames()
+    ), f"Expect forward in interface methods, while it has {module_interface_c.getMethodNames()}"
+    method_schema = module_interface_c.getMethod("forward")
+
+    arg_str_list = []
+    arg_type_str_list = []
+    assert method_schema is not None
+    for argument in method_schema.arguments:
+        arg_str_list.append(argument.name)
+
+        if argument.has_default_value():
+            default_value_str = f" = {argument.default_value}"
+        else:
+            default_value_str = ""
+        arg_type_str = f"{argument.name}: {argument.type}{default_value_str}"
+        arg_type_str_list.append(arg_type_str)
+
+    arg_str_list = arg_str_list[1:]  # Remove "self".
+    args_str = ", ".join(arg_str_list)
+
+    arg_type_str_list = arg_type_str_list[1:]  # Remove "self".
+    arg_types_str = ", ".join(arg_type_str_list)
+
+    assert len(method_schema.returns) == 1
+    argument = method_schema.returns[0]
+    return_type_str = str(argument.type)
+
+    return args_str, arg_types_str, return_type_str
+
+
+def _write(out_path, text):
+    old_text: Optional[str]
+    try:
+        with open(out_path) as f:
+            old_text = f.read()
+    except OSError:
+        old_text = None
+    if old_text != text:
+        with open(out_path, "w") as f:
+            logger.info("Writing %s", out_path)
+            f.write(text)
+    else:
+        logger.info("Skipped writing %s", out_path)
+
+
+def _do_instantiate_remote_module_template(
+    generated_module_name, str_dict, enable_moving_cpu_tensors_to_cuda
+):
+    generated_code_text = get_remote_module_template(
+        enable_moving_cpu_tensors_to_cuda
+    ).format(**str_dict)
+    out_path = os.path.join(
+        INSTANTIATED_TEMPLATE_DIR_PATH, f"{generated_module_name}.py"
+    )
+    _write(out_path, generated_code_text)
+
+    # From importlib doc,
+    # > If you are dynamically importing a module that was created since
+    # the interpreter began execution (e.g., created a Python source file),
+    # you may need to call invalidate_caches() in order for the new module
+    # to be noticed by the import system.
+    importlib.invalidate_caches()
+    generated_module = importlib.import_module(f"{generated_module_name}")
+    return generated_module
+
+
+def instantiate_scriptable_remote_module_template(
+    module_interface_cls, enable_moving_cpu_tensors_to_cuda=True
+):
+    if not getattr(module_interface_cls, "__torch_script_interface__", False):
+        raise ValueError(
+            f"module_interface_cls {module_interface_cls} must be a type object decorated by "
+            "@torch.jit.interface"
+        )
+
+    # Generate the template instance name.
+    module_interface_cls_name = torch._jit_internal._qualified_name(
+        module_interface_cls
+    ).replace(".", "_")
+    generated_module_name = f"{_FILE_PREFIX}{module_interface_cls_name}"
+
+    # Generate type annotation strs.
+    assign_module_interface_cls_str = (
+        f"from {module_interface_cls.__module__} import "
+        f"{module_interface_cls.__name__} as module_interface_cls"
+    )
+    args_str, arg_types_str, return_type_str = get_arg_return_types_from_interface(
+        module_interface_cls
+    )
+    kwargs_str = ""
+    arrow_and_return_type_str = f" -> {return_type_str}"
+    arrow_and_future_return_type_str = f" -> Future[{return_type_str}]"
+
+    str_dict = dict(
+        assign_module_interface_cls=assign_module_interface_cls_str,
+        arg_types=arg_types_str,
+        arrow_and_return_type=arrow_and_return_type_str,
+        arrow_and_future_return_type=arrow_and_future_return_type_str,
+        args=args_str,
+        kwargs=kwargs_str,
+        jit_script_decorator="@torch.jit.script",
+    )
+    return _do_instantiate_remote_module_template(
+        generated_module_name, str_dict, enable_moving_cpu_tensors_to_cuda
+    )
+
+
+def instantiate_non_scriptable_remote_module_template():
+    generated_module_name = f"{_FILE_PREFIX}non_scriptable"
+    str_dict = dict(
+        assign_module_interface_cls="module_interface_cls = None",
+        args="*args",
+        kwargs="**kwargs",
+        arg_types="*args, **kwargs",
+        arrow_and_return_type="",
+        arrow_and_future_return_type="",
+        jit_script_decorator="",
+    )
+    # For a non-scriptable template, always enable moving CPU tensors to a cuda device,
+    # because there is no syntax limitation on the extra handling caused by the script.
+    return _do_instantiate_remote_module_template(generated_module_name, str_dict, True)
diff --git a/MLPY/Lib/site-packages/torch/distributed/nn/jit/templates/__init__.py b/MLPY/Lib/site-packages/torch/distributed/nn/jit/templates/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/distributed/nn/jit/templates/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/nn/jit/templates/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ebabf3ef4943cb2bd926a83e8647d932f8876fa8
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/nn/jit/templates/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/nn/jit/templates/__pycache__/remote_module_template.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/nn/jit/templates/__pycache__/remote_module_template.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bb3162a75a35f3a295f744d4f7bb1a1c8902f8ec
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/nn/jit/templates/__pycache__/remote_module_template.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/nn/jit/templates/remote_module_template.py b/MLPY/Lib/site-packages/torch/distributed/nn/jit/templates/remote_module_template.py
new file mode 100644
index 0000000000000000000000000000000000000000..edc2a431dc3b1297590c47411035a058b5097b2a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/nn/jit/templates/remote_module_template.py
@@ -0,0 +1,107 @@
+#!/usr/bin/python3
+
+
+def get_remote_module_template(enable_moving_cpu_tensors_to_cuda: bool):
+    return _TEMPLATE_PREFIX + (
+        _REMOTE_FORWARD_TEMPLATE_ENABLE_MOVING_CPU_TENSORS_TO_CUDA
+        if enable_moving_cpu_tensors_to_cuda
+        else _REMOTE_FORWARD_TEMPLATE
+    )
+
+
+_TEMPLATE_PREFIX = """from typing import *
+
+import torch
+import torch.distributed.rpc as rpc
+from torch import Tensor
+from torch._jit_internal import Future
+from torch.distributed.rpc import RRef
+from typing import Tuple  # pyre-ignore: unused import
+
+
+{assign_module_interface_cls}
+
+
+def forward_async(self, {arg_types}){arrow_and_future_return_type}:
+    args = (self.module_rref, self.device, self.is_device_map_set, {args})
+    kwargs = {{{kwargs}}}
+    return rpc.rpc_async(
+        self.module_rref.owner(),
+        _remote_forward,
+        args,
+        kwargs,
+    )
+
+
+def forward(self, {arg_types}){arrow_and_return_type}:
+    args = (self.module_rref, self.device, self.is_device_map_set, {args})
+    kwargs = {{{kwargs}}}
+    ret_fut = rpc.rpc_async(
+        self.module_rref.owner(),
+        _remote_forward,
+        args,
+        kwargs,
+    )
+    return ret_fut.wait()
+
+
+_generated_methods = [
+    forward_async,
+    forward,
+]
+
+
+{jit_script_decorator}
+"""
+
+# This template may cause a typing error (a mismatch between ``Tuple[()]`` and ``Tuple[Any]``)
+# even though the code is only used for instantiation and not execution.
+# Therefore, only include the handling for moving CPU tensors to a cuda device when necessary.
+# TODO: Merge these two templates together in the future once TorchScript syntax is improved.
+_REMOTE_FORWARD_TEMPLATE_ENABLE_MOVING_CPU_TENSORS_TO_CUDA = """
+def _remote_forward(
+    module_rref: RRef[module_interface_cls], device: str, is_device_map_set: bool, {arg_types}){arrow_and_return_type}:
+    module = module_rref.local_value()
+    device = torch.device(device)
+
+    if device.type != "cuda":
+        return module.forward({args}, {kwargs})
+
+    # If the module is on a cuda device,
+    # move any CPU tensor in args or kwargs to the same cuda device.
+    # Since torch script does not support generator expression,
+    # have to use concatenation instead of
+    # ``tuple(i.to(device) if isinstance(i, Tensor) else i for i in *args)``.
+    args = ({args},)
+    out_args: Tuple[()] = ()
+    for arg in args:
+        arg = (arg.to(device),) if isinstance(arg, Tensor) else (arg,)
+        out_args = out_args + arg
+
+    kwargs = {{{kwargs}}}
+    for k, v in kwargs.items():
+        if isinstance(v, Tensor):
+            kwargs[k] = kwargs[k].to(device)
+
+    if is_device_map_set:
+        return module.forward(*out_args, {kwargs})
+
+    # If the device map is empty, then only CPU tensors are allowed to send over wire,
+    # so have to move any GPU tensor to CPU in the output.
+    # Since torch script does not support generator expression,
+    # have to use concatenation instead of
+    # ``tuple(i.cpu() if isinstance(i, Tensor) else i for i in module.forward(*out_args, {kwargs}))``.
+    ret: Tuple[()] = ()
+    for i in module.forward(*out_args, {kwargs}):
+        i = (i.cpu(),) if isinstance(i, Tensor) else (i,)
+        ret = ret + i
+    return ret
+"""
+
+_REMOTE_FORWARD_TEMPLATE = """
+def _remote_forward(
+    module_rref: RRef[module_interface_cls], device: str, is_device_map_set: bool, {arg_types}){arrow_and_return_type}:
+    module = module_rref.local_value()
+
+    return module.forward({args}, {kwargs})
+"""
diff --git a/MLPY/Lib/site-packages/torch/distributed/optim/__init__.py b/MLPY/Lib/site-packages/torch/distributed/optim/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c3b37ae54a5d1ebbada508988623a57cf07acd9
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/optim/__init__.py
@@ -0,0 +1,34 @@
+"""
+:mod:`torch.distributed.optim` exposes DistributedOptimizer, which takes a list
+of remote parameters (:class:`~torch.distributed.rpc.RRef`) and runs the
+optimizer locally on the workers where the parameters live.  The distributed
+optimizer can use any of the local optimizer :ref:`optimizer-algorithms` to
+apply the gradients on each worker.
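+
+Example (illustrative sketch; assumes RPC has been initialized, ``param_rrefs``
+is a list of :class:`~torch.distributed.rpc.RRef` objects pointing at remote
+parameters, and ``compute_loss`` is a hypothetical helper that produces a
+scalar loss from them)::
+
+    >>> # xdoctest: +SKIP("distributed")
+    >>> import torch
+    >>> import torch.distributed.autograd as dist_autograd
+    >>> from torch.distributed.optim import DistributedOptimizer
+    >>> dist_optim = DistributedOptimizer(torch.optim.SGD, param_rrefs, lr=0.05)
+    >>> with dist_autograd.context() as context_id:
+    ...     loss = compute_loss(param_rrefs)   # hypothetical helper
+    ...     dist_autograd.backward(context_id, [loss])
+    ...     dist_optim.step(context_id)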
+"""
+import torch
+from torch import optim
+
+from .apply_optimizer_in_backward import (
+    _apply_optimizer_in_backward,
+    _get_in_backward_optimizers,
+)
+from .functional_adadelta import _FunctionalAdadelta
+
+from .functional_adagrad import _FunctionalAdagrad
+from .functional_adam import _FunctionalAdam
+from .functional_adamax import _FunctionalAdamax
+from .functional_adamw import _FunctionalAdamW
+from .functional_rmsprop import _FunctionalRMSprop
+from .functional_rprop import _FunctionalRprop
+from .functional_sgd import _FunctionalSGD
+from .named_optimizer import _NamedOptimizer
+from .utils import as_functional_optim
+
+
+# DistributedOptimizer imports torch.distributed.rpc names, so gate availability
+# based on RPC being available.
+if hasattr(torch._C, "_rpc_init"):
+    from .optimizer import DistributedOptimizer
+
+from .post_localSGD_optimizer import PostLocalSGDOptimizer
+from .zero_redundancy_optimizer import ZeroRedundancyOptimizer
diff --git a/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d91c1d8e65a98e40e17777a5da127cde23c35e21
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/apply_optimizer_in_backward.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/apply_optimizer_in_backward.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0e56fdd2fd0eedaf44997b92d1a7f89b6acd8d07
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/apply_optimizer_in_backward.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/functional_adadelta.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/functional_adadelta.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b8166212d674990bdb830d5caa9030b3b2c3580d
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/functional_adadelta.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/functional_adagrad.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/functional_adagrad.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1eb9417a655b0fa3737fe971e2b61463a9e474da
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/functional_adagrad.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/functional_adam.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/functional_adam.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..df47c6d98856c39a52f5b2fcc181ac2427219219
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/functional_adam.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/functional_adamax.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/functional_adamax.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..19d5c0ed68522a3a4cc34c56d443553c67836e78
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/functional_adamax.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/functional_adamw.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/functional_adamw.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a4e88e7248c84bd4f98d73f8e198982b79d6834e
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/functional_adamw.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/functional_rmsprop.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/functional_rmsprop.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8dd0247f7038b7a0be849907fdb2cf790c049a14
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/functional_rmsprop.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/functional_rprop.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/functional_rprop.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..57a741cdcf26270c457faebc75df605bcd39279f
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/functional_rprop.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/functional_sgd.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/functional_sgd.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5974af622f2623e24929709b688fd16180bc61a2
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/functional_sgd.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/named_optimizer.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/named_optimizer.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1b716e24744624fbf9d2f060edc324be3d816ffb
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/named_optimizer.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/optimizer.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/optimizer.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8e7746d1d773ed61939b94845f1605d86d3b95a4
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/optimizer.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/post_localSGD_optimizer.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/post_localSGD_optimizer.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a637b34cffbe7970b71611bc472c000befa7984a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/post_localSGD_optimizer.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c3be951c7299123fc2dd955a1c83f02acb1e86a4
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/zero_redundancy_optimizer.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/zero_redundancy_optimizer.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6e705ed2527fcdf2099dd71491e8b4fea112ebf9
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/optim/__pycache__/zero_redundancy_optimizer.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/optim/apply_optimizer_in_backward.py b/MLPY/Lib/site-packages/torch/distributed/optim/apply_optimizer_in_backward.py
new file mode 100644
index 0000000000000000000000000000000000000000..182cfc6ddb9ea9d299546e21b249ec571284038c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/optim/apply_optimizer_in_backward.py
@@ -0,0 +1,118 @@
+from typing import Any, Dict, Iterable, List, no_type_check, Type
+
+import torch
+
+__all__: List[str] = []
+
+# WeakTensorKeyDictionary to store relevant meta-data for the Tensor/Parameter
+# without changing its life-time.
+# NOTE: An alternative is to add the meta-data as an attribute to the tensor,
+#       but that would serialize the meta-data if the Tensor is serialized.
+param_to_optim_hook_handle_map = torch.utils.weak.WeakTensorKeyDictionary()
+param_to_acc_grad_map = torch.utils.weak.WeakTensorKeyDictionary()
+
+@no_type_check
+def _apply_optimizer_in_backward(
+    optimizer_class: Type[torch.optim.Optimizer],
+    params: Iterable[torch.nn.Parameter],
+    optimizer_kwargs: Dict[str, Any],
+    register_hook: bool = True,
+) -> None:
+    """
+    Upon ``backward()``, the optimizer specified for each parameter will fire after
+    the gradient has been accumulated into the parameter.
+
+    Note - gradients for these parameters will be set to None after ``backward()``.
+    This means that any other optimizer that was not registered for this parameter
+    via ``_apply_optimizer_in_backward`` will effectively be a no-op for it.
+
+    Args:
+        optimizer_class: (Type[torch.optim.Optimizer]): Optimizer to apply to parameter
+        params: (Iterator[nn.Parameter]): parameters to apply optimizer state to
+        optimizer_kwargs: (Dict[str, Any]): kwargs to pass to optimizer constructor
+        register_hook: (bool): whether to register a hook that runs the optimizer
+            after gradient for this parameter is accumulated. This is the default
+            way that optimizer in backward is implemented, but specific use cases
+            (such as DDP) may wish to override this to implement custom behavior.
+            (Default = True)
+
+    Example::
+        params_generator = model.parameters()
+        param_1 = next(params_generator)
+        remainder_params = list(params_generator)
+
+        _apply_optimizer_in_backward(torch.optim.SGD, [param_1], {"lr": .02})
+        _apply_optimizer_in_backward(torch.optim.Adam, remainder_params, {"lr": .04})
+
+        model(...).sum().backward() # after backward, parameters will already
+        # have their registered optimizer(s) applied.
+
+    """
+    torch._C._log_api_usage_once(
+        "torch.distributed.optim.apply_optimizer_in_backward"
+    )
+
+    @no_type_check
+    def _apply_optimizer_in_backward_to_param(param: torch.nn.Parameter) -> None:
+        # view_as creates a node in autograd graph that allows us access to the
+        # parameter's AccumulateGrad autograd function object. We register a
+        # hook on this object to fire the optimizer when the gradient for
+        # this parameter is ready (has been accumulated into .grad field)
+
+        # Don't create a new acc_grad if we already have one
+        # i.e. for shared parameters or attaching multiple optimizers to a param.
+        if param not in param_to_acc_grad_map:
+            param_to_acc_grad_map[param] = param.view_as(param).grad_fn.next_functions[0][0]
+
+        optimizer = optimizer_class([param], **optimizer_kwargs)
+
+        if not hasattr(param, "_in_backward_optimizers"):
+            param._in_backward_optimizers = []  # type: ignore[attr-defined]
+            # TODO: Remove these attributes once we have a better way of accessing
+            # optimizer classes and kwargs for a parameter.
+            param._optimizer_classes = []  # type: ignore[attr-defined]
+            param._optimizer_kwargs = []  # type: ignore[attr-defined]
+
+        param._in_backward_optimizers.append(optimizer)  # type: ignore[attr-defined]
+        param._optimizer_classes.append(optimizer_class)  # type: ignore[attr-defined]
+        param._optimizer_kwargs.append(optimizer_kwargs)  # type: ignore[attr-defined]
+
+        if not register_hook:
+            return
+
+        def optimizer_hook(*_unused) -> None:
+            for opt in param._in_backward_optimizers:  # type: ignore[attr-defined]
+                opt.step()
+
+            param.grad = None
+
+        handle = param_to_acc_grad_map[param].register_hook(optimizer_hook)  # type: ignore[attr-defined]
+        if param not in param_to_optim_hook_handle_map:
+            param_to_optim_hook_handle_map[param] = []
+        param_to_optim_hook_handle_map[param].append(handle)
+
+    for param in params:
+        _apply_optimizer_in_backward_to_param(param)
+
+
+def _get_in_backward_optimizers(module: torch.nn.Module) -> List[torch.optim.Optimizer]:
+    """
+    Return a list of in-backward optimizers applied to ``module``'s parameters. Note that these
+    optimizers are not intended to directly have their ``step`` or ``zero_grad`` methods called
+    by the user and are intended to be used for things like checkpointing.
+
+    Args:
+        module: (torch.nn.Module): model to retrieve in-backward optimizers for
+
+    Returns:
+        List[torch.optim.Optimizer]: the in-backward optimizers.
+
+    Example::
+        _apply_optimizer_in_backward(torch.optim.SGD, model.parameters(), {'lr': 0.01})
+        optims = _get_in_backward_optimizers(model)
+    """
+    optims: List[torch.optim.Optimizer] = []
+    for param in module.parameters():
+        optims.extend(getattr(param, "_in_backward_optimizers", []))
+
+    return optims
diff --git a/MLPY/Lib/site-packages/torch/distributed/optim/functional_adadelta.py b/MLPY/Lib/site-packages/torch/distributed/optim/functional_adadelta.py
new file mode 100644
index 0000000000000000000000000000000000000000..77f4d83fb60a060452f5f7214c1685392bbff6e7
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/optim/functional_adadelta.py
@@ -0,0 +1,102 @@
+from typing import Dict, List, Optional
+
+import torch
+import torch.optim._functional as F
+
+from torch import Tensor
+
+__all__: List[str] = []
+
+# Define a TorchScript compatible functional Adadelta optimizer
+# that we use in a functional way.
+# Instead of using `param.grad` when updating parameters,
+# we explicitly allow the distributed optimizer to pass gradients to
+# the `step` function. In this way, we can separate the gradients
+# and parameters and allow a multithreaded trainer to update the
+# parameters without data races from accumulating into the same .grad.
+# NOTE: This should only be used by distributed optimizer internals
+# and is not meant to be exposed to the user.
+@torch.jit.script
+class _FunctionalAdadelta:
+    def __init__(
+        self,
+        params: List[Tensor],
+        lr: float = 1.0,
+        rho: float = 0.9,
+        eps: float = 1e-6,
+        weight_decay: float = 0.0,
+        foreach: bool = False,
+        maximize: bool = False,
+        _allow_empty_param_list: bool = False,
+    ):
+        self.defaults = {
+            "lr": lr,
+            "rho": rho,
+            "eps": eps,
+            "weight_decay": weight_decay,
+        }
+        self.foreach = foreach
+        self.maximize = maximize
+
+        if len(params) == 0 and not _allow_empty_param_list:
+            raise ValueError("optimizer got an empty parameter list")
+
+        # NOTE: we only have one param_group and don't allow user to add additional
+        # param group as it's not a common use case.
+        self.param_group = {"params": params}
+
+        self.state = torch.jit.annotate(Dict[torch.Tensor, Dict[str, torch.Tensor]], {})
+
+    def step(self, gradients: List[Optional[Tensor]]):
+        params = self.param_group["params"]
+        params_with_grad = []
+        grads = []
+        square_avgs = []
+        acc_deltas = []
+        lr = self.defaults["lr"]
+        rho = self.defaults["rho"]
+        eps = self.defaults["eps"]
+        weight_decay = self.defaults["weight_decay"]
+
+        if len(params) != len(gradients):
+            raise ValueError(
+                "the gradients passed in does not equal to the size of the parameters!"
+                + f"Params length: {len(params)}. "
+                + f"Gradients length: {len(gradients)}"
+            )
+        has_complex = False
+        for param, gradient in zip(params, gradients):
+            if gradient is not None:
+                has_complex |= torch.is_complex(param)
+                params_with_grad.append(param)
+                grads.append(gradient)
+                # Lazy state initialization
+                if param not in self.state:
+                    self.state[param] = {}
+                    state = self.state[param]
+                    state["step"] = torch.tensor(0.0)
+                    state["square_avg"] = torch.zeros_like(
+                        param, memory_format=torch.preserve_format
+                    )
+                    state["acc_delta"] = torch.zeros_like(
+                        param, memory_format=torch.preserve_format
+                    )
+
+                state = self.state[param]
+                square_avgs.append(state["square_avg"])
+                acc_deltas.append(state["acc_delta"])
+
+        with torch.no_grad():
+            F.adadelta(
+                params_with_grad,
+                grads,
+                square_avgs,
+                acc_deltas,
+                lr=lr,
+                rho=rho,
+                eps=eps,
+                weight_decay=weight_decay,
+                foreach=self.foreach,
+                maximize=self.maximize,
+                has_complex=has_complex
+            )
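
Illustrative sketch of the calling convention (not part of the vendored file; in practice only distributed-optimizer internals construct this class): gradients are computed explicitly and handed to `step`, so `param.grad` is never read:

```
import torch
from torch.distributed.optim.functional_adadelta import _FunctionalAdadelta

params = [torch.randn(3, requires_grad=True), torch.randn(3, requires_grad=True)]
opt = _FunctionalAdadelta(params, lr=1.0)

loss = sum((p * p).sum() for p in params)
grads = list(torch.autograd.grad(loss, params))  # one gradient per parameter
opt.step(grads)  # aligned positionally with the constructor's `params` list
```
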
diff --git a/MLPY/Lib/site-packages/torch/distributed/optim/functional_adagrad.py b/MLPY/Lib/site-packages/torch/distributed/optim/functional_adagrad.py
new file mode 100644
index 0000000000000000000000000000000000000000..280201ae4cf61be11c724d14a064964dc1797f84
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/optim/functional_adagrad.py
@@ -0,0 +1,104 @@
+from typing import Dict, List, Optional
+
+import torch
+import torch.optim._functional as F
+
+from torch import Tensor
+
+__all__: List[str] = []
+
+# Define a TorchScript compatible Functional Adagrad Optimizer
+# that we use in a functional way.
+# Instead of reading `param.grad` when updating parameters,
+# we explicitly let the user pass gradients to the `step` function.
+# This keeps the gradients and parameters separate
+# and allows multithreaded trainers to update the parameters
+# without data races from accumulating into the same `.grad`.
+# NOTE: This should only be used by distributed optimizer internals
+# and is not meant to be exposed to the user.
+@torch.jit.script
+class _FunctionalAdagrad:
+    def __init__(
+        self,
+        params: List[Tensor],
+        lr: float = 1e-2,
+        lr_decay: float = 0.0,
+        weight_decay: float = 0.0,
+        initial_accumulator_value: float = 0.0,
+        warmup_lr_multiplier: float = 1.0,
+        warmup_num_iters: float = 0.0,
+        eps: float = 1e-10,
+        coalesce_grad: bool = True,
+        foreach: bool = False,
+        maximize: bool = False,
+        _allow_empty_param_list: bool = False,
+    ):
+        self.defaults = {
+            "lr": lr,
+            "lr_decay": lr_decay,
+            "eps": eps,
+            "weight_decay": weight_decay,
+            "initial_accumulator_value": initial_accumulator_value,
+            "warmup_lr_multiplier": warmup_lr_multiplier,
+            "warmup_num_iters": warmup_num_iters,
+        }
+        self.coalesce_grad = coalesce_grad
+        self.foreach = foreach
+        self.maximize = maximize
+        self.state = torch.jit.annotate(Dict[torch.Tensor, Dict[str, torch.Tensor]], {})
+
+        if len(params) == 0 and not _allow_empty_param_list:
+            raise ValueError("optimizer got an empty parameter list")
+
+        # NOTE: we only have one param_group and don't allow user to add additional
+        # param group as it's not a common use case.
+        self.param_group = {"params": params}
+
+        # TODO: no union or any types in TorchScript, make step a scalar tensor instead
+        # This is also needed if we want to call share_memory on the step across processes
+        for p in self.param_group["params"]:
+            self.state[p] = {
+                "sum": torch.full_like(p.data, initial_accumulator_value),
+                "step": torch.tensor(0.0),
+            }
+
+    def step(self, gradients: List[Optional[Tensor]]):
+        params = self.param_group["params"]
+        params_with_grad = []
+        grads = []
+        state_sums = []
+        state_steps: List[Tensor] = []
+
+        if len(params) != len(gradients):
+            raise ValueError(
+                "the gradients passed in does not equal to the size of the parameters!"
+                + f"Params length: {len(params)}. "
+                + f"Gradients length: {len(gradients)}"
+            )
+
+        has_sparse_grad, has_complex = False, False
+        for param, gradient in zip(self.param_group["params"], gradients):
+            if gradient is not None:
+                has_sparse_grad |= gradient.is_sparse
+                has_complex |= torch.is_complex(param)
+                params_with_grad.append(param)
+                grads.append(gradient)
+                state = self.state[param]
+                state_sums.append(state["sum"])
+                state_steps.append(state["step"])
+
+        with torch.no_grad():
+            F.adagrad(
+                params,
+                grads,
+                state_sums,
+                state_steps,
+                lr=self.defaults["lr"],
+                weight_decay=self.defaults["weight_decay"],
+                lr_decay=self.defaults["lr_decay"],
+                eps=self.defaults["eps"],
+                has_sparse_grad=has_sparse_grad,
+                foreach=self.foreach,
+                maximize=self.maximize,
+                has_complex=has_complex,
+            )
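
One difference from the other files in this directory, sketched below under the same internal-API caveat: `_FunctionalAdagrad` creates its per-parameter state eagerly in `__init__`, while the others initialize state lazily on the first `step`:

```
import torch
from torch.distributed.optim.functional_adagrad import _FunctionalAdagrad

w = torch.randn(2, 2, requires_grad=True)
b = torch.randn(2, requires_grad=True)
# State ("sum", "step") for both parameters already exists after construction.
opt = _FunctionalAdagrad([w, b], lr=0.1, initial_accumulator_value=0.1)

loss = (w.sum() + b.sum()) ** 2
grads = list(torch.autograd.grad(loss, [w, b]))
opt.step(grads)  # gradients are positionally aligned with the constructor's params
```
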
diff --git a/MLPY/Lib/site-packages/torch/distributed/optim/functional_adam.py b/MLPY/Lib/site-packages/torch/distributed/optim/functional_adam.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b9739bd1c6fb2793ca85c8dc9c9eba55c346ffc
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/optim/functional_adam.py
@@ -0,0 +1,196 @@
+from typing import Dict, List, Optional, Tuple
+
+import torch
+import torch.optim._functional as F
+
+from torch import Tensor
+
+__all__: List[str] = []
+
+# Define a TorchScript compatible Functional Adam Optimizer
+# that we use in a functional way.
+# Instead of reading `param.grad` when updating parameters,
+# we explicitly let the distributed optimizer pass gradients to
+# the `step` function. This keeps the gradients and parameters separate
+# and allows multithreaded trainers to update the parameters
+# without data races from accumulating into the same `.grad`.
+# NOTE: This should only be used by distributed optimizer internals
+# and is not meant to be exposed to the user.
+@torch.jit.script
+class _FunctionalAdam:
+    def __init__(
+        self,
+        params: List[Tensor],
+        lr: float = 1e-3,
+        betas: Tuple[float, float] = (0.9, 0.999),
+        eps: float = 1e-8,
+        weight_decay: float = 0.0,
+        amsgrad: bool = False,
+        maximize: bool = False,
+        foreach: bool = False,
+        fused: bool = False,
+        _allow_empty_param_list: bool = False,
+    ):
+        if not 0.0 <= lr:
+            raise ValueError(f"Invalid learning rate: {lr}")
+        if not 0.0 <= eps:
+            raise ValueError(f"Invalid epsilon value: {eps}")
+        if not 0.0 <= betas[0] < 1.0:
+            raise ValueError(f"Invalid beta parameter at index 0: {betas[0]}")
+        if not 0.0 <= betas[1] < 1.0:
+            raise ValueError(f"Invalid beta parameter at index 1: {betas[1]}")
+        if not 0.0 <= weight_decay:
+            raise ValueError(f"Invalid weight_decay value: {weight_decay}")
+
+        self.defaults = {
+            "lr": lr,
+            "eps": eps,
+            "beta1": betas[0],
+            "beta2": betas[1],
+            "weight_decay": weight_decay,
+        }
+        self.amsgrad = amsgrad
+        self.maximize = maximize
+        self.foreach = foreach
+        self.fused = fused
+        self.state = torch.jit.annotate(Dict[torch.Tensor, Dict[str, torch.Tensor]], {})
+
+        if len(params) == 0 and not _allow_empty_param_list:
+            raise ValueError("optimizer got an empty parameter list")
+
+        # NOTE: we only have one param_group and don't allow user to add additional
+        # param group as it's not a common use case.
+        self.param_group = {"params": params}
+
+    def step_param(self, param: Tensor, grad: Optional[Tensor]):
+        """
+        Similar to step, but operates on a single parameter and optionally a
+        gradient tensor.
+        """
+        params_with_grad = []
+        grads = []
+        exp_avgs = []
+        exp_avg_sqs = []
+        max_exp_avg_sqs = []
+        state_steps: List[Tensor] = []
+        has_complex = torch.is_complex(param)
+        if grad is not None:
+            params_with_grad.append(param)
+            grads.append(grad)
+        if param not in self.state:
+            self.state[param] = {}
+            state = self.state[param]
+            state["step"] = torch.tensor(0.0)
+            state["exp_avg"] = torch.zeros_like(
+                param, memory_format=torch.preserve_format
+            )
+            state["exp_avg_sq"] = torch.zeros_like(
+                param, memory_format=torch.preserve_format
+            )
+            if self.amsgrad:
+                state["max_exp_avg_sq"] = torch.zeros_like(
+                    param, memory_format=torch.preserve_format
+                )
+
+        state = self.state[param]
+        exp_avgs.append(state["exp_avg"])
+        exp_avg_sqs.append(state["exp_avg_sq"])
+
+        if self.amsgrad:
+            max_exp_avg_sqs.append(state["max_exp_avg_sq"])
+
+        state_steps.append(state["step"])
+        with torch.no_grad():
+            F.adam(
+                params_with_grad,
+                grads,
+                exp_avgs,
+                exp_avg_sqs,
+                max_exp_avg_sqs,
+                state_steps,
+                amsgrad=self.amsgrad,
+                has_complex=has_complex,
+                maximize=self.maximize,
+                beta1=self.defaults["beta1"],
+                beta2=self.defaults["beta2"],
+                lr=self.defaults["lr"],
+                weight_decay=self.defaults["weight_decay"],
+                eps=self.defaults["eps"],
+                foreach=self.foreach,
+                fused=self.fused,
+                grad_scale=None,
+                found_inf=None,
+            )
+
+    def step(self, gradients: List[Optional[Tensor]]):
+        params = self.param_group["params"]
+        params_with_grad = []
+        grads = []
+        exp_avgs = []
+        exp_avg_sqs = []
+        max_exp_avg_sqs = []
+        state_steps: List[Tensor] = []
+        has_complex = False
+
+        if len(params) != len(gradients):
+            raise ValueError(
+                "the gradients passed in does not equal to the size of the parameters!"
+                + f"Params length: {len(params)}. "
+                + f"Gradients length: {len(gradients)}"
+            )
+
+        for param, gradient in zip(self.param_group["params"], gradients):
+            if gradient is not None:
+                has_complex |= torch.is_complex(param)
+                params_with_grad.append(param)
+                grads.append(gradient)
+                # Lazy state initialization
+                if param not in self.state:
+                    self.state[param] = {}
+                    state = self.state[param]
+                    state["step"] = torch.tensor(0.0)
+                    # Exponential moving average of gradient values
+                    state["exp_avg"] = torch.zeros_like(
+                        param, memory_format=torch.preserve_format
+                    )
+                    # Exponential moving average of squared gradient values
+                    state["exp_avg_sq"] = torch.zeros_like(
+                        param, memory_format=torch.preserve_format
+                    )
+                    if self.amsgrad:
+                        # Maintains max of all exp. moving avg. of sq. grad. values
+                        state["max_exp_avg_sq"] = torch.zeros_like(
+                            param, memory_format=torch.preserve_format
+                        )
+
+                state = self.state[param]
+
+                exp_avgs.append(state["exp_avg"])
+                exp_avg_sqs.append(state["exp_avg_sq"])
+
+                if self.amsgrad:
+                    max_exp_avg_sqs.append(state["max_exp_avg_sq"])
+
+                state_steps.append(state["step"])
+
+        with torch.no_grad():
+            F.adam(
+                params_with_grad,
+                grads,
+                exp_avgs,
+                exp_avg_sqs,
+                max_exp_avg_sqs,
+                state_steps,
+                amsgrad=self.amsgrad,
+                has_complex=has_complex,
+                maximize=self.maximize,
+                beta1=self.defaults["beta1"],
+                beta2=self.defaults["beta2"],
+                lr=self.defaults["lr"],
+                weight_decay=self.defaults["weight_decay"],
+                eps=self.defaults["eps"],
+                foreach=self.foreach,
+                fused=self.fused,
+                grad_scale=None,
+                found_inf=None,
+            )
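
`step_param` is the per-parameter entry point used when the optimizer is driven one parameter at a time (for example, overlapped with backward); a hypothetical direct use, for illustration only:

```
import torch
from torch.distributed.optim.functional_adam import _FunctionalAdam

params = [torch.randn(4, requires_grad=True) for _ in range(2)]
opt = _FunctionalAdam(params, lr=1e-3)

loss = sum((p * p).sum() for p in params)
grads = torch.autograd.grad(loss, params)
for p, g in zip(params, grads):
    opt.step_param(p, g)  # Adam state for `p` is created lazily on first use
```
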
diff --git a/MLPY/Lib/site-packages/torch/distributed/optim/functional_adamax.py b/MLPY/Lib/site-packages/torch/distributed/optim/functional_adamax.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9e02c4ae16edb177c9770aee42728c60980d0cd
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/optim/functional_adamax.py
@@ -0,0 +1,117 @@
+from typing import Dict, List, Optional, Tuple
+
+import torch
+import torch.optim._functional as F
+
+from torch import Tensor
+
+__all__: List[str] = []
+
+# Define a TorchScript compatible Functional Adamax Optimizer
+# that we use in a functional way.
+# Instead of reading `param.grad` when updating parameters,
+# we explicitly let the distributed optimizer pass gradients to
+# the `step` function. This keeps the gradients and parameters separate
+# and allows multithreaded trainers to update the parameters
+# without data races from accumulating into the same `.grad`.
+# NOTE: This should only be used by distributed optimizer internals
+# and is not meant to be exposed to the user.
+@torch.jit.script
+class _FunctionalAdamax:
+    def __init__(
+        self,
+        params: List[Tensor],
+        lr: float = 1e-3,
+        betas: Tuple[float, float] = (0.9, 0.999),
+        eps: float = 1e-8,
+        weight_decay: float = 0.0,
+        foreach: bool = False,
+        maximize: bool = False,
+        _allow_empty_param_list: bool = False,
+    ):
+        if not 0.0 <= lr:
+            raise ValueError(f"Invalid learning rate: {lr}")
+        if not 0.0 <= eps:
+            raise ValueError(f"Invalid epsilon value: {eps}")
+        if not 0.0 <= betas[0] < 1.0:
+            raise ValueError(f"Invalid beta parameter at index 0: {betas[0]}")
+        if not 0.0 <= betas[1] < 1.0:
+            raise ValueError(f"Invalid beta parameter at index 1: {betas[1]}")
+        if not 0.0 <= weight_decay:
+            raise ValueError(f"Invalid weight_decay value: {weight_decay}")
+
+        self.defaults = {
+            "lr": lr,
+            "eps": eps,
+            "beta1": betas[0],
+            "beta2": betas[1],
+            "weight_decay": weight_decay,
+        }
+        self.foreach = foreach
+        self.maximize = maximize
+        self.state = torch.jit.annotate(Dict[torch.Tensor, Dict[str, torch.Tensor]], {})
+
+        if len(params) == 0 and not _allow_empty_param_list:
+            raise ValueError("optimizer got an empty parameter list")
+
+        # NOTE: we only have one param_group and don't allow user to add additional
+        # param group as it's not a common use case.
+        self.param_group = {"params": params}
+
+    def step(self, gradients: List[Optional[Tensor]]):
+        params = self.param_group["params"]
+        params_with_grad = []
+        grads = []
+        exp_avgs = []
+        exp_infs = []
+        state_steps: List[Tensor] = []
+
+        if len(params) != len(gradients):
+            raise ValueError(
+                "the gradients passed in does not equal to the size of the parameters!"
+                + f"Params length: {len(params)}. "
+                + f"Gradients length: {len(gradients)}"
+            )
+
+        has_complex = False
+        for param, gradient in zip(self.param_group["params"], gradients):
+            if gradient is not None:
+                has_complex |= torch.is_complex(param)
+                params_with_grad.append(param)
+                grads.append(gradient)
+                # Lazy state initialization
+                if param not in self.state:
+                    self.state[param] = {}
+                    state = self.state[param]
+                    state["step"] = torch.tensor(0.0)
+                    # Exponential moving average of gradient values
+                    state["exp_avg"] = torch.zeros_like(
+                        param, memory_format=torch.preserve_format
+                    )
+                    # Exponential moving average of squared gradient values
+                    state["exp_inf"] = torch.zeros_like(
+                        param, memory_format=torch.preserve_format
+                    )
+
+                state = self.state[param]
+
+                exp_avgs.append(state["exp_avg"])
+                exp_infs.append(state["exp_inf"])
+                state_steps.append(state["step"])
+
+        with torch.no_grad():
+            F.adamax(
+                params_with_grad,
+                grads,
+                exp_avgs,
+                exp_infs,
+                state_steps,
+                eps=self.defaults["eps"],
+                beta1=self.defaults["beta1"],
+                beta2=self.defaults["beta2"],
+                lr=self.defaults["lr"],
+                weight_decay=self.defaults["weight_decay"],
+                foreach=self.foreach,
+                maximize=self.maximize,
+                has_complex=has_complex,
+            )
diff --git a/MLPY/Lib/site-packages/torch/distributed/optim/functional_adamw.py b/MLPY/Lib/site-packages/torch/distributed/optim/functional_adamw.py
new file mode 100644
index 0000000000000000000000000000000000000000..58752d34615848bcecb354966f8a955e187ee407
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/optim/functional_adamw.py
@@ -0,0 +1,197 @@
+from typing import Dict, List, Optional, Tuple
+
+import torch
+import torch.optim._functional as F
+
+from torch import Tensor
+
+__all__: List[str] = []
+
+# Define a TorchScript compatible Functional AdamW Optimizer
+# that we use in a functional way.
+# Instead of reading `param.grad` when updating parameters,
+# we explicitly let the distributed optimizer pass gradients to
+# the `step` function. This keeps the gradients and parameters separate
+# and allows multithreaded trainers to update the parameters
+# without data races from accumulating into the same `.grad`.
+# NOTE: This should only be used by distributed optimizer internals
+# and is not meant to be exposed to the user.
+@torch.jit.script
+class _FunctionalAdamW:
+    def __init__(
+        self,
+        params: List[Tensor],
+        lr: float = 1e-3,
+        betas: Tuple[float, float] = (0.9, 0.999),
+        eps: float = 1e-8,
+        weight_decay: float = 1e-2,
+        amsgrad: bool = False,
+        maximize: bool = False,
+        foreach: bool = False,
+        fused: bool = False,
+        _allow_empty_param_list: bool = False,
+    ):
+        if not 0.0 <= lr:
+            raise ValueError(f"Invalid learning rate: {lr}")
+        if not 0.0 <= eps:
+            raise ValueError(f"Invalid epsilon value: {eps}")
+        if not 0.0 <= betas[0] < 1.0:
+            raise ValueError(f"Invalid beta parameter at index 0: {betas[0]}")
+        if not 0.0 <= betas[1] < 1.0:
+            raise ValueError(f"Invalid beta parameter at index 1: {betas[1]}")
+        if not 0.0 <= weight_decay:
+            raise ValueError(f"Invalid weight_decay value: {weight_decay}")
+
+        self.defaults = {
+            "lr": lr,
+            "eps": eps,
+            "beta1": betas[0],
+            "beta2": betas[1],
+            "weight_decay": weight_decay,
+        }
+        self.amsgrad = amsgrad
+        self.maximize = maximize
+        self.foreach = foreach
+        self.fused = fused
+        self.state = torch.jit.annotate(Dict[torch.Tensor, Dict[str, torch.Tensor]], {})
+
+        if len(params) == 0 and not _allow_empty_param_list:
+            raise ValueError("optimizer got an empty parameter list")
+
+        # NOTE: we only have one param_group and don't allow user to add additional
+        # param group as it's not a common use case.
+        self.param_group = {"params": params}
+
+    def step_param(self, param: Tensor, grad: Optional[Tensor]):
+        params_with_grad = []
+        grads = []
+        exp_avgs = []
+        exp_avg_sqs = []
+        max_exp_avg_sqs = []
+        state_steps: List[Tensor] = []
+        has_complex = torch.is_complex(param)
+        if grad is not None:
+            params_with_grad.append(param)
+            grads.append(grad)
+        # Lazy state initialization
+        if param not in self.state:
+            self.state[param] = {}
+            state = self.state[param]
+            state["step"] = torch.tensor(0.0)
+            # Exponential moving average of gradient values
+            state["exp_avg"] = torch.zeros_like(
+                param, memory_format=torch.preserve_format
+            )
+            # Exponential moving average of squared gradient values
+            state["exp_avg_sq"] = torch.zeros_like(
+                param, memory_format=torch.preserve_format
+            )
+            if self.amsgrad:
+                # Maintains max of all exp. moving avg. of sq. grad. values
+                state["max_exp_avg_sq"] = torch.zeros_like(
+                    param, memory_format=torch.preserve_format
+                )
+
+        state = self.state[param]
+
+        exp_avgs.append(state["exp_avg"])
+        exp_avg_sqs.append(state["exp_avg_sq"])
+
+        if self.amsgrad:
+            max_exp_avg_sqs.append(state["max_exp_avg_sq"])
+
+        state_steps.append(state["step"])
+        with torch.no_grad():
+            F.adamw(
+                params_with_grad,
+                grads,
+                exp_avgs,
+                exp_avg_sqs,
+                max_exp_avg_sqs,
+                state_steps,
+                amsgrad=self.amsgrad,
+                maximize=self.maximize,
+                beta1=self.defaults["beta1"],
+                beta2=self.defaults["beta2"],
+                lr=self.defaults["lr"],
+                weight_decay=self.defaults["weight_decay"],
+                eps=self.defaults["eps"],
+                foreach=self.foreach,
+                fused=self.fused,
+                grad_scale=None,
+                found_inf=None,
+                has_complex=has_complex,
+            )
+
+    def step(self, gradients: List[Optional[Tensor]]):
+        params = self.param_group["params"]
+        params_with_grad = []
+        grads = []
+        exp_avgs = []
+        exp_avg_sqs = []
+        max_exp_avg_sqs = []
+        state_steps: List[Tensor] = []
+
+        if len(params) != len(gradients):
+            raise ValueError(
+                "the gradients passed in does not equal to the size of the parameters!"
+                + f"Params length: {len(params)}. "
+                + f"Gradients length: {len(gradients)}"
+            )
+
+        has_complex = False
+        for param, gradient in zip(self.param_group["params"], gradients):
+            if gradient is not None:
+                has_complex |= torch.is_complex(param)
+                params_with_grad.append(param)
+                grads.append(gradient)
+                # Lazy state initialization
+                if param not in self.state:
+                    self.state[param] = {}
+                    state = self.state[param]
+                    state["step"] = torch.tensor(0.0)
+                    # Exponential moving average of gradient values
+                    state["exp_avg"] = torch.zeros_like(
+                        param, memory_format=torch.preserve_format
+                    )
+                    # Exponential moving average of squared gradient values
+                    state["exp_avg_sq"] = torch.zeros_like(
+                        param, memory_format=torch.preserve_format
+                    )
+                    if self.amsgrad:
+                        # Maintains max of all exp. moving avg. of sq. grad. values
+                        state["max_exp_avg_sq"] = torch.zeros_like(
+                            param, memory_format=torch.preserve_format
+                        )
+
+                state = self.state[param]
+
+                exp_avgs.append(state["exp_avg"])
+                exp_avg_sqs.append(state["exp_avg_sq"])
+
+                if self.amsgrad:
+                    max_exp_avg_sqs.append(state["max_exp_avg_sq"])
+
+                state_steps.append(state["step"])
+
+        with torch.no_grad():
+            F.adamw(
+                params_with_grad,
+                grads,
+                exp_avgs,
+                exp_avg_sqs,
+                max_exp_avg_sqs,
+                state_steps,
+                amsgrad=self.amsgrad,
+                maximize=self.maximize,
+                beta1=self.defaults["beta1"],
+                beta2=self.defaults["beta2"],
+                lr=self.defaults["lr"],
+                weight_decay=self.defaults["weight_decay"],
+                eps=self.defaults["eps"],
+                foreach=self.foreach,
+                fused=self.fused,
+                grad_scale=None,
+                found_inf=None,
+                has_complex=has_complex,
+            )
diff --git a/MLPY/Lib/site-packages/torch/distributed/optim/functional_rmsprop.py b/MLPY/Lib/site-packages/torch/distributed/optim/functional_rmsprop.py
new file mode 100644
index 0000000000000000000000000000000000000000..61c4b15fa79b94fe463c1a8ea507e38975b5bfaa
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/optim/functional_rmsprop.py
@@ -0,0 +1,122 @@
+from typing import Dict, List, Optional
+
+import torch
+import torch.optim._functional as F
+
+from torch import Tensor
+
+__all__: List[str] = []
+
+# Define a TorchScript compatible Functional RMSprop Optimizer
+# that we use in a functional way.
+# Instead of reading `param.grad` when updating parameters,
+# we explicitly let the distributed optimizer pass gradients to
+# the `step` function. This keeps the gradients and parameters separate
+# and allows multithreaded trainers to update the parameters
+# without data races from accumulating into the same `.grad`.
+# NOTE: This should only be used by distributed optimizer internals
+# and is not meant to be exposed to the user.
+@torch.jit.script
+class _FunctionalRMSprop:
+    def __init__(
+        self,
+        params: List[Tensor],
+        lr: float = 1e-2,
+        alpha: float = 0.99,
+        eps: float = 1e-8,
+        weight_decay: float = 0.0,
+        momentum: float = 0.0,
+        centered: bool = False,
+        foreach: bool = False,
+        maximize: bool = False,
+        _allow_empty_param_list: bool = False,
+    ):
+        self.defaults = {
+            "lr": lr,
+            "alpha": alpha,
+            "eps": eps,
+            "weight_decay": weight_decay,
+            "momentum": momentum,
+        }
+        self.centered = centered
+        self.foreach = foreach
+        self.maximize = maximize
+
+        if len(params) == 0 and not _allow_empty_param_list:
+            raise ValueError("optimizer got an empty parameter list")
+
+        # NOTE: we only have one param_group and don't allow user to add additional
+        # param group as it's not a common use case.
+        self.param_group = {"params": params}
+
+        self.state = torch.jit.annotate(Dict[torch.Tensor, Dict[str, torch.Tensor]], {})
+
+    def step(self, gradients: List[Optional[Tensor]]):
+        params = self.param_group["params"]
+        params_with_grad = []
+        grads = []
+        square_avgs = []
+        grad_avgs = []
+        momentum_buffer_list = []
+        lr = self.defaults["lr"]
+        alpha = self.defaults["alpha"]
+        eps = self.defaults["eps"]
+        momentum = self.defaults["momentum"]
+        weight_decay = self.defaults["weight_decay"]
+
+        if len(params) != len(gradients):
+            raise ValueError(
+                "the gradients passed in does not equal to the size of the parameters!"
+                + f"Params length: {len(params)}. "
+                + f"Gradients length: {len(gradients)}"
+            )
+
+        has_complex = False
+        for param, gradient in zip(params, gradients):
+            if gradient is not None:
+                has_complex |= torch.is_complex(param)
+                params_with_grad.append(param)
+                grads.append(gradient)
+                # Lazy state initialization
+                if param not in self.state:
+                    self.state[param] = {}
+                    state = self.state[param]
+                    state["step"] = torch.tensor(0.0)
+                    state["square_avg"] = torch.zeros_like(
+                        param, memory_format=torch.preserve_format
+                    )
+                    if momentum > 0:
+                        state["momentum_buffer"] = torch.zeros_like(
+                            param, memory_format=torch.preserve_format
+                        )
+                    if self.centered:
+                        state["grad_avg"] = torch.zeros_like(
+                            param, memory_format=torch.preserve_format
+                        )
+
+                state = self.state[param]
+                square_avgs.append(state["square_avg"])
+                if momentum > 0:
+                    momentum_buffer_list.append(state["momentum_buffer"])
+                if self.centered:
+                    grad_avgs.append(state["grad_avg"])
+
+                state["step"] += 1
+
+        with torch.no_grad():
+            F.rmsprop(
+                params_with_grad,
+                grads,
+                square_avgs,
+                grad_avgs,
+                momentum_buffer_list,
+                lr=lr,
+                alpha=alpha,
+                eps=eps,
+                weight_decay=weight_decay,
+                momentum=momentum,
+                centered=self.centered,
+                foreach=self.foreach,
+                maximize=self.maximize,
+                has_complex=has_complex,
+            )
diff --git a/MLPY/Lib/site-packages/torch/distributed/optim/functional_rprop.py b/MLPY/Lib/site-packages/torch/distributed/optim/functional_rprop.py
new file mode 100644
index 0000000000000000000000000000000000000000..90e4d5fd9b19f1e26f4b1ddf1f6348c9e602da4a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/optim/functional_rprop.py
@@ -0,0 +1,100 @@
+from typing import Dict, List, Optional, Tuple
+
+import torch
+import torch.optim._functional as F
+
+from torch import Tensor
+
+__all__: List[str] = []
+
+# Define a TorchScript compatible Functional Rprop Optimizer
+# that we use in a functional way.
+# Instead of reading `param.grad` when updating parameters,
+# we explicitly let the distributed optimizer pass gradients to
+# the `step` function. This keeps the gradients and parameters separate
+# and allows multithreaded trainers to update the parameters
+# without data races from accumulating into the same `.grad`.
+# NOTE: This should only be used by distributed optimizer internals
+# and is not meant to be exposed to the user.
+@torch.jit.script
+class _FunctionalRprop:
+    def __init__(
+        self,
+        params: List[Tensor],
+        lr: float = 1e-2,
+        etas: Tuple[float, float] = (0.5, 1.2),
+        step_sizes: Tuple[float, float] = (1e-6, 50),
+        foreach: bool = False,
+        maximize: bool = False,
+        _allow_empty_param_list: bool = False,
+    ):
+        self.defaults = {
+            "lr": lr,
+        }
+        self.etas = etas
+        self.step_sizes = step_sizes
+        self.foreach = foreach
+        self.maximize = maximize
+
+        if len(params) == 0 and not _allow_empty_param_list:
+            raise ValueError("optimizer got an empty parameter list")
+
+        # NOTE: we only have one param_group and don't allow user to add additional
+        # param group as it's not a common use case.
+        self.param_group = {"params": params}
+
+        self.state = torch.jit.annotate(Dict[torch.Tensor, Dict[str, torch.Tensor]], {})
+
+    def step(self, gradients: List[Optional[Tensor]]):
+        params = self.param_group["params"]
+        params_with_grad = []
+        grads = []
+        prevs = []
+        step_sizes = []
+        lr = self.defaults["lr"]
+        etaminus, etaplus = self.etas
+        step_size_min, step_size_max = self.step_sizes
+
+        if len(params) != len(gradients):
+            raise ValueError(
+                "the gradients passed in does not equal to the size of the parameters!"
+                + f"Params length: {len(params)}. "
+                + f"Gradients length: {len(gradients)}"
+            )
+
+        has_complex = False
+        for param, gradient in zip(params, gradients):
+            if gradient is not None:
+                has_complex |= torch.is_complex(param)
+                params_with_grad.append(param)
+                grads.append(gradient)
+                # Lazy state initialization
+                if param not in self.state:
+                    self.state[param] = {}
+                    state = self.state[param]
+                    state["step"] = torch.tensor(0.0)
+                    state["prev"] = torch.zeros_like(
+                        param, memory_format=torch.preserve_format
+                    )
+                    state["step_size"] = torch.full_like(gradient, lr)
+
+                state = self.state[param]
+                prevs.append(state["prev"])
+                step_sizes.append(state["step_size"])
+
+                state["step"] += 1
+
+        with torch.no_grad():
+            F.rprop(
+                params_with_grad,
+                grads,
+                prevs,
+                step_sizes,
+                step_size_min=step_size_min,
+                step_size_max=step_size_max,
+                etaminus=etaminus,
+                etaplus=etaplus,
+                foreach=self.foreach,
+                maximize=self.maximize,
+                has_complex=has_complex,
+            )
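
Illustrative sketch (internal API, shown only to make the semantics concrete): Rprop updates with the sign of the gradient and a per-element step size seeded from `lr`, so the gradient's magnitude does not affect the first step:

```
import torch
from torch.distributed.optim.functional_rprop import _FunctionalRprop

p1 = torch.zeros(3, requires_grad=True)
p2 = torch.zeros(3, requires_grad=True)
_FunctionalRprop([p1], lr=0.01).step([torch.full((3,), 1e-3)])
_FunctionalRprop([p2], lr=0.01).step([torch.full((3,), 1e3)])

# Both gradients are positive, so both parameters take the same -0.01 step.
assert torch.allclose(p1, p2)
```
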
diff --git a/MLPY/Lib/site-packages/torch/distributed/optim/functional_sgd.py b/MLPY/Lib/site-packages/torch/distributed/optim/functional_sgd.py
new file mode 100644
index 0000000000000000000000000000000000000000..0cb305c2fb3c21e51d1de970b0e3c8107c3a401e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/optim/functional_sgd.py
@@ -0,0 +1,160 @@
+from typing import Dict, List, Optional
+
+import torch
+import torch.optim._functional as F
+
+from torch import Tensor
+
+__all__: List[str] = []
+
+# Define a TorchScript compatible Functional SGD Optimizer
+# that we use in a functional way.
+# Instead of reading `param.grad` when updating parameters,
+# we explicitly let the distributed optimizer pass gradients to
+# the `step` function. This keeps the gradients and parameters separate
+# and allows multithreaded trainers to update the parameters
+# without data races from accumulating into the same `.grad`.
+# NOTE: This should only be used by distributed optimizer internals
+# and is not meant to be exposed to the user.
+@torch.jit.script
+class _FunctionalSGD:
+    def __init__(
+        self,
+        params: List[Tensor],
+        lr: float = 1e-2,
+        momentum: float = 0.0,
+        dampening: float = 0.0,
+        weight_decay: float = 0.0,
+        nesterov: bool = False,
+        maximize: bool = False,
+        foreach: bool = False,
+        fused: bool = False,
+        _allow_empty_param_list: bool = False,
+    ):
+        self.defaults = {
+            "lr": lr,
+            "momentum": momentum,
+            "dampening": dampening,
+            "weight_decay": weight_decay,
+        }
+        self.nesterov = nesterov
+        self.maximize = maximize
+        self.foreach = foreach
+        self.fused = fused
+        self.state = torch.jit.annotate(Dict[torch.Tensor, Dict[str, torch.Tensor]], {})
+
+        if len(params) == 0 and not _allow_empty_param_list:
+            raise ValueError("optimizer got an empty parameter list")
+
+        # NOTE: we only have one param_group and don't allow user to add additional
+        # param group as it's not a common use case.
+        self.param_group = {"params": params}
+
+    def step_param(self, param: Tensor, grad: Optional[Tensor]):
+        """Similar to self.step, but operates on a single parameter and
+        its gradient.
+        """
+        # TODO: Once step_param interface is robust, refactor step to call
+        # step param on each param.
+        weight_decay = self.defaults["weight_decay"]
+        momentum = self.defaults["momentum"]
+        dampening = self.defaults["dampening"]
+        lr = self.defaults["lr"]
+        params = [param]
+        momentum_buffer_list: List[Optional[Tensor]] = []
+        grads = []
+
+        has_sparse_grad = False
+        if grad is not None:
+            grads.append(grad)
+            if grad.is_sparse:
+                has_sparse_grad = True
+            if param not in self.state:
+                self.state[param] = {}
+            state = self.state[param]
+            if "momentum_buffer" not in state:
+                momentum_buffer_list.append(None)
+            else:
+                momentum_buffer_list.append(state["momentum_buffer"])
+
+        with torch.no_grad():
+            F.sgd(
+                params,
+                grads,
+                momentum_buffer_list,
+                weight_decay=weight_decay,
+                momentum=momentum,
+                lr=lr,
+                dampening=dampening,
+                nesterov=self.nesterov,
+                maximize=self.maximize,
+                has_sparse_grad=has_sparse_grad,
+                foreach=self.foreach,
+                fused=self.fused,
+                grad_scale=None,
+                found_inf=None,
+            )
+        # update momentum_buffer in state
+        state = self.state[param]
+        momentum_buffer = momentum_buffer_list[0]
+        if momentum_buffer is not None:
+            state["momentum_buffer"] = momentum_buffer
+
+    def step(self, gradients: List[Optional[Tensor]]):
+        params = self.param_group["params"]
+        params_with_grad = []
+        grads = []
+        momentum_buffer_list: List[Optional[Tensor]] = []
+        lr = self.defaults["lr"]
+        weight_decay = self.defaults["weight_decay"]
+        momentum = self.defaults["momentum"]
+        dampening = self.defaults["dampening"]
+
+        if len(params) != len(gradients):
+            raise ValueError(
+                "the gradients passed in does not equal to the size of the parameters!"
+                + f"Params length: {len(params)}. "
+                + f"Gradients length: {len(gradients)}"
+            )
+
+        has_sparse_grad = False
+        for param, gradient in zip(params, gradients):
+            if gradient is not None:
+                params_with_grad.append(param)
+                grads.append(gradient)
+                if gradient.is_sparse:
+                    has_sparse_grad = True
+
+                if param not in self.state:
+                    self.state[param] = {}
+
+                state = self.state[param]
+                if "momentum_buffer" not in state:
+                    momentum_buffer_list.append(None)
+                else:
+                    momentum_buffer_list.append(state["momentum_buffer"])
+
+        with torch.no_grad():
+            F.sgd(
+                params_with_grad,
+                grads,
+                momentum_buffer_list,
+                weight_decay=weight_decay,
+                momentum=momentum,
+                lr=lr,
+                dampening=dampening,
+                nesterov=self.nesterov,
+                maximize=self.maximize,
+                has_sparse_grad=has_sparse_grad,
+                foreach=self.foreach,
+                fused=self.fused,
+                grad_scale=None,
+                found_inf=None,
+            )
+
+        # update momentum_buffers in state
+        for i, p in enumerate(params_with_grad):
+            state = self.state[p]
+            momentum_buffer = momentum_buffer_list[i]
+            if momentum_buffer is not None:
+                state["momentum_buffer"] = momentum_buffer
diff --git a/MLPY/Lib/site-packages/torch/distributed/optim/named_optimizer.py b/MLPY/Lib/site-packages/torch/distributed/optim/named_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..87cb734abccb132ee994555d7f5cf7bb9a823c90
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/optim/named_optimizer.py
@@ -0,0 +1,331 @@
+import logging
+import warnings
+
+from copy import deepcopy
+from typing import Any, Callable, Collection, Dict, List, Mapping, Optional, Union, overload
+
+import torch
+import torch.nn as nn
+from torch import optim
+from torch.distributed._shard.sharded_tensor import ShardedTensor
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+
+
+__all__: List[str] = []
+
+logger = logging.getLogger(__name__)
+
+
+class _NamedOptimizer(optim.Optimizer):
+    """
+    ``_NamedOptimizer`` takes a dict of parameters and exposes ``state_dict`` by parameter key.
+
+    We replace the original (integer) keys in the optimizer state with
+    fully qualified name (FQN) strings. Users can initialize the optim just as they
+    would initialize a PyTorch optim; the only difference is that they also need to
+    pass in the FQN of each parameter.
+
+    Args:
+        named_parameters (Mapping[str, Union[torch.Tensor, ShardedTensor]]):
+            Mapping from FQN to parameter.
+        optimizer_class (optim.Optimizer):
+            The class of optimizer to instantiate.
+        param_groups (Collection[Mapping[str, Any]]):
+            `param_groups` to pass to the optimizer if specified.
+            The parameters listed in each group must come from ``named_parameters``.
+            Default: None
+        module (nn.Module): the module whose parameters are updated
+            by the optimizer.
+        args: positional arguments to pass to the optimizer constructor.
+        kwargs: keyword arguments to pass to the optimizer constructor.
+
+    Example::
+        >>> # xdoctest: +SKIP("distributed")
+        >>> from torch import optim
+        >>> from torch.distributed.optim import _NamedOptimizer
+        >>>
+        >>> # Define the named optimizer.
+        >>> m = Model(...)
+        >>> named_optim = _NamedOptimizer(m.named_parameters(), optim.SGD)
+        >>> # Forward pass + backward pass.
+        >>> named_optim.step()
+        >>> ...
+        >>> # Call state_dict for the named optimizer returns a FQN state_dict.
+        >>> named_optim.state_dict()
+
+    Warning: This API is still in development and subject to change.
+
+    TODO: Add tutorial for _NamedOptimizer.
+    TODO: Add documentation in the docstring for the public attributes
+          like self.param_groups and self.named_parameters.
+    """
+
+    def __init__(
+        self,
+        named_parameters: Mapping[str, Union[torch.Tensor, ShardedTensor]],
+        optimizer_class: optim.Optimizer,
+        param_groups: Optional[Collection[Mapping[str, Any]]] = None,
+        module: Optional[nn.Module] = None,
+        *args,
+        **kwargs,
+    ) -> None:
+        torch._C._log_api_usage_once("torch.distributed.optim._NamedOptimizer")
+        self.param_groups: Collection[Mapping[str, Any]] = param_groups  # type: ignore[assignment]
+        self._param_groups_check()
+        self.named_parameters = dict(named_parameters)
+        params_for_optimizer = (
+            self.named_parameters.values() if param_groups is None else param_groups
+        )
+        self._optimizer = optimizer_class(  # type: ignore[operator]
+            params_for_optimizer,
+            *args,
+            **kwargs,
+        )
+        self.module = module
+        if param_groups is None:
+            self.ordered_param_keys = list(self.named_parameters.keys())
+        else:
+            warnings.warn(
+                "Since we pass in param_groups, we will use param_groups to "
+                "initialize the optimizer, not all parameters of the module."
+            )
+            param_to_key = {param: key for key, param in self.named_parameters.items()}  # type: ignore[misc, has-type]
+            ordered_param_keys = []
+            for group in param_groups:
+                for param in group["params"]:
+                    if param not in param_to_key:
+                        raise ValueError(
+                            f"Expect param name {param} found in param group but is missing."
+                        )
+                    ordered_param_keys.append(param_to_key[param])
+            self.ordered_param_keys = ordered_param_keys
+        # Update param_groups from optimizer.
+        self.param_groups = self._optimizer.param_groups
+
+    def _param_groups_check(self):
+        if self.param_groups is not None:
+            for param_group in self.param_groups:
+                assert isinstance(param_group, dict), "param group must be a dict"
+                assert "params" in param_group, "param group must contain key params"
+                params = param_group["params"]
+                if isinstance(params, torch.Tensor):
+                    params = [params]
+                params = list(params)
+                for param in params:
+                    if not isinstance(param, torch.Tensor):
+                        raise TypeError(
+                            "optimizer can only optimize Tensors, "
+                            "but one of the params is " + torch.typename(param)
+                        )
+                param_group["params"] = params
+
+    def state_dict(self) -> Dict[str, Any]:
+        """
+        Return the ``state_dict`` of the optimizer.
+
+        Instead of using numbers to index
+        parameters, we use each parameter's fully qualified name (FQN) as the key.
+        """
+        state_dict = self._optimizer.state_dict()
+        param_groups = state_dict["param_groups"]
+
+        ret_state = {
+            self.ordered_param_keys[st_key]: state_val
+            for st_key, state_val in state_dict["state"].items()
+        }
+
+        ret_groups = []
+        for group in param_groups:
+            param_keys = []
+            for param in group["params"]:
+                param_keys.append(self.ordered_param_keys[param])
+            ret_group = {"params": sorted(param_keys)}
+            for k, v in group.items():
+                if k != "params":
+                    ret_group[k] = deepcopy(v)
+            ret_groups.append(ret_group)
+
+        return self._post_state_dict({"state": ret_state, "param_groups": ret_groups})
+
+    @overload
+    def step(self, closure: None = ...) -> None:
+        ...
+
+    @overload
+    def step(self, closure: Callable[[], float]) -> float:
+        ...
+
+    def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]:
+        """
+        Perform a single optimization step.
+
+        This will call :meth:`torch.optim.Optimizer.step` on the wrapped
+        optimizer.
+        """
+        return self._optimizer.step(closure=closure)
+
+    @property
+    def state(self) -> Mapping[torch.Tensor, Any]:  # type: ignore[override]
+        return self._optimizer.state
+
+    def load_state_dict(self, state_dict: Mapping[str, Any]) -> None:
+        """
+        Define the default behavior for loading a state_dict into ``_NamedOptimizer``.
+
+        Sample Code
+        ```
+            my_model = MyModule()
+            optimizer = _NamedOptimizer(my_model.named_parameters(), Adagrad)
+            ...
+
+            optim_state_dict = optimizer.state_dict()
+            ...
+            ...
+
+            optimizer.load_state_dict(optim_state_dict)
+            ...
+        ```
+        Args:
+            state_dict (Dict[str, Any]) : A ``state_dict`` to load into the optimizer.
+                Note that this state dict update is performed in place.
+
+        .. note:: PyTorch uses lazy initialization for optimizer state,
+            so it is possible that no optim state exists yet when a user calls
+            ``load_state_dict``. ``_NamedOptimizer`` is stricter: ``load_state_dict``
+            may only be called after the state has been initialized.
+            This allows the optim ``state_dict`` being loaded to be validated.
+        """
+        new_state_dict = self._optimizer.state_dict()
+        state_dict = self._pre_load_state_dict(state_dict)
+        state = state_dict["state"]
+        new_state = new_state_dict["state"]
+        if len(new_state) == 0:
+            raise ValueError(
+                "Expects the optim to be initialized before load but found not initialized."
+            )
+
+        for idx, param_key in enumerate(self.ordered_param_keys):
+            # When conditional training is performed, not all parameters are updated in the optim.
+            if param_key not in state.keys():
+                continue
+            if len(state[param_key]) != len(new_state[idx]):
+                raise ValueError(
+                    f"Expects equal length as {len(new_state[idx])} for parameter {param_key} but found: {len(state[param_key])}"
+                )
+            # Iterate through all optimizer states.
+            for state_key, state_val in new_state[idx].items():
+                if state_key not in state[param_key]:
+                    raise ValueError(
+                        f"Expects state {state_key} for parameter {param_key} but not found."
+                    )
+
+                src_state_val = state[param_key][state_key]
+                if isinstance(state_val, ShardedTensor):
+                    assert isinstance(src_state_val, ShardedTensor)
+                    num_shards = len(state_val.local_shards())
+                    num_new_shards = len(src_state_val.local_shards())
+                    if num_shards != num_new_shards:
+                        raise ValueError(
+                            f"Expects equal number of shards as {num_new_shards} but found {num_shards} for {param_key}/{state_key}"
+                        )
+                    for shard, src_shard in zip(
+                        state_val.local_shards(), src_state_val.local_shards()
+                    ):
+                        shard.tensor.detach().copy_(src_shard.tensor)
+                elif isinstance(state_val, torch.Tensor):
+                    assert isinstance(src_state_val, torch.Tensor)
+                    state_val.detach().copy_(src_state_val)
+                else:
+                    new_state[idx][state_key] = deepcopy(src_state_val)
+
+        # Load param_groups of state_dict
+        src_param_groups = state_dict["param_groups"]
+        new_param_groups = new_state_dict["param_groups"]
+
+        src_group_map = {}
+        for group in src_param_groups:
+            param_keys = list(group["params"])
+            src_group_map[_gen_param_group_key(param_keys)] = group
+        new_group_map = {}
+        for new_group in new_param_groups:
+            param_keys = []
+            for param_key in new_group["params"]:
+                param_keys.append(self.ordered_param_keys[param_key])  # type: ignore[call-overload]
+            new_group_map[_gen_param_group_key(param_keys)] = new_group
+        for group_key, new_group in new_group_map.items():
+            # When not all parameters are used in training or receive gradients, not all of them
+            # will appear in a param_group, so we skip such group keys here.
+            if group_key not in src_group_map:
+                continue
+            src_group = src_group_map[group_key]
+            if len(src_group) != len(new_group):
+                raise ValueError(
+                    f"Expects equal param_group size as {len(new_group)} for group {group_key} but found {len(src_group)}."
+                )
+            for k in src_group:
+                if k not in new_group:
+                    raise ValueError(
+                        f"Expects group key {k} to be in group {group_key} in `state_dict` but is missing."
+                    )
+                if k != "params":
+                    new_group[k] = deepcopy(src_group[k])
+
+        self._optimizer.load_state_dict(new_state_dict)
+
+    def add_param_group(self, param_group: Mapping[str, Any]) -> None:
+        """
+        Add a param group to the :class:`_NamedOptimizer`'s `param_groups`.
+
+        Warning: This API is still in development and subject to change.
+        """
+        assert isinstance(param_group, dict), "param group must be a dict"
+
+        params = param_group["params"]
+        if isinstance(params, torch.Tensor):
+            param_group["params"] = [params]
+        else:
+            param_group["params"] = list(params)
+
+        param_to_key = {param: key for key, param in self.named_parameters.items()}  # type: ignore[misc, has-type]
+        for param in param_group["params"]:
+            if param not in param_to_key:
+                raise ValueError("some parameters are not in the module")
+            self.ordered_param_keys.append(param_to_key[param])
+
+        self._optimizer.add_param_group(param_group)
+        # Update param_groups from optimizer.
+        self.param_groups = self._optimizer.param_groups
+
+    def init_state(self) -> None:
+        """
+        Run a dummy optimizer step to initialize the optimizer state, since most optimizers use lazy state initialization.
+
+        This allows doing in-place loading of optimizer state from a checkpoint.
+        """
+        for param in self.named_parameters.values():
+            if param.requires_grad:
+                t = torch.zeros_like(param)
+                param.grad = torch.autograd.Variable(t)
+        # Calling ``step`` will load the initial state for optimizer states.
+        self.step(closure=None)
+
+    def _pre_load_state_dict(self, state_dict) -> Dict[str, Any]:
+        # TODO(chienchin): This API should be FSDP agnostic and should support
+        # general user hooks.
+        if isinstance(self.module, FSDP):
+            return FSDP.optim_state_dict_to_load(
+                self.module, self._optimizer, state_dict, is_named_optimizer=True
+            )
+        return state_dict
+
+    def _post_state_dict(self, state_dict) -> Dict[str, Any]:
+        # TODO(chienchin): This API should be FSDP agnostic and should support
+        # general user hooks.
+        if isinstance(self.module, FSDP):
+            FSDP.optim_state_dict(self.module, self._optimizer, state_dict)
+        return state_dict
+
+
+def _gen_param_group_key(param_keys: List[str]) -> str:
+    """Concatenate all param keys as a unique indentifier for one param group."""
+    return "/".join(sorted(param_keys))
diff --git a/MLPY/Lib/site-packages/torch/distributed/optim/optimizer.py b/MLPY/Lib/site-packages/torch/distributed/optim/optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..de7dc1607fd8ed52078ba3407d468767464f8575
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/optim/optimizer.py
@@ -0,0 +1,254 @@
+import logging
+
+from collections import defaultdict
+from threading import Lock
+from typing import List, Optional
+
+import torch
+import torch.distributed.autograd as dist_autograd
+import torch.distributed.rpc as rpc
+import torch.jit as jit
+import torch.nn as nn
+from torch import Tensor
+from torch.distributed.rpc import RRef
+from .utils import functional_optim_map
+
+__all__ = ["DistributedOptimizer"]
+
+logger = logging.getLogger(__name__)
+
+
+# XXX: we define a _ScriptModuleOptimizer here to explicitly
+# compile the FunctionalOptimizer class into TorchScript
+# This is because ScriptClass instance still lives in
+# python unless you explicitly compile it as an attribute
+# in ScriptModule or pass it to a ScriptFunction
+# _ScriptLocalOptimizerInterface serves as a common
+# interface type for Optimizer ScriptModules.
+#
+# TODO (wanchaol): remove this once we added TorchScript
+# class reference semantics
+@jit.interface
+class _ScriptLocalOptimizerInterface:
+    def step(self, autograd_ctx_id: int) -> None:
+        pass
+
+
+class _ScriptLocalOptimizer(nn.Module):
+    # TorchScript does not support multithread concurrent compiling.
+    # request_callback might invoke concurrent compiling, so we
+    # serialize the compiling with a lock
+    compile_lock = Lock()
+
+    def __init__(self, optim_cls, local_params_rref, *args, **kwargs):
+        super().__init__()
+        self._local_params = [rref.local_value() for rref in local_params_rref]
+        self.optim = optim_cls(self._local_params, *args, **kwargs)
+
+    @jit.export
+    def step(self, autograd_ctx_id: int):
+        all_local_grads = dist_autograd.get_gradients(autograd_ctx_id)
+        # apply functional optimizer step with a list of gradients
+        grads: List[Optional[Tensor]] = [
+            all_local_grads[p] if p in all_local_grads else None
+            for p in self._local_params
+        ]
+
+        self.optim.step(grads)
+
+
+# TODO (wanchaol): remove/merge this with ScriptLocalOptimizer once
+# we have converted all to functional optimizer in distributed.optim
+class _LocalOptimizer:
+    # Ideally we would only need to share a lock for instances of
+    # _LocalOptimizer that deal with the same parameters. We are
+    # making a simplifying assumption here that if there is more
+    # than one instance of _LocalOptimizer per worker, they will
+    # be optimizing the same parameters (e.g. each data parallel
+    # trainer will create its own instance of _LocalOptimizer but
+    # they will all optimize the same parameters on each worker)
+    global_lock = Lock()
+
+    def __init__(self, optim_cls, local_params_rref, *args, **kwargs):
+        self._local_params = [rref.local_value() for rref in local_params_rref]
+        self.optim = optim_cls(self._local_params, *args, **kwargs)
+
+    def step(self, autograd_ctx_id):
+        all_local_grads = dist_autograd.get_gradients(autograd_ctx_id)
+
+        with _LocalOptimizer.global_lock:
+            for param, grad in all_local_grads.items():
+                param.grad = grad
+            self.optim.step()
+
+
+def _new_local_optimizer(optim_cls, local_params_rref, *args, **kwargs):
+    return rpc.RRef(_LocalOptimizer(optim_cls, local_params_rref, *args, **kwargs))
+
+
+def _local_optimizer_step(local_optim_rref, autograd_ctx_id):
+    local_optim = local_optim_rref.local_value()
+    local_optim.step(autograd_ctx_id)
+
+
+# new/step functions combined with _ScriptLocalOptimizer to provide GIL-free optimizer
+def _new_script_local_optimizer(optim_cls, local_params_rref, *args, **kwargs):
+    optim = _ScriptLocalOptimizer(optim_cls, local_params_rref, *args, **kwargs)
+
+    with _ScriptLocalOptimizer.compile_lock:
+        script_optim = jit.script(optim)
+        return rpc.RRef(script_optim, _ScriptLocalOptimizerInterface)
+
+
+@jit.script
+def _script_local_optimizer_step(
+    local_optim_rref: RRef[_ScriptLocalOptimizerInterface], autograd_ctx_id: int
+) -> None:
+    local_optim = local_optim_rref.local_value()
+    local_optim.step(autograd_ctx_id)
+
+
+def _wait_for_all(rpc_futs):
+    # TODO: improve error propagation
+    exception = None
+    results = []
+    for fut in rpc_futs:
+        try:
+            results.append(fut.wait())
+        except Exception as e:
+            results.append(e)
+            exception = e
+    if exception is not None:
+        raise exception
+    return results
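+
+
+# A small sketch of the error handling in ``_wait_for_all`` above: every future
+# is waited on, results are collected in order, and the last exception seen (if
+# any) is re-raised after all futures complete. The helper name
+# ``_demo_wait_for_all`` is illustrative only, using local ``torch.futures``
+# futures rather than RPC futures.
+def _demo_wait_for_all() -> None:
+    fut = torch.futures.Future()
+    fut.set_result(42)
+    assert _wait_for_all([fut]) == [42]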
+
+
+class DistributedOptimizer:
+    """
+    DistributedOptimizer takes remote references to parameters scattered
+    across workers and applies the given optimizer locally for each parameter.
+
+    This class uses :meth:`~torch.distributed.autograd.get_gradients` in order
+    to retrieve the gradients for specific parameters.
+
+    Concurrent calls to
+    :meth:`~torch.distributed.optim.DistributedOptimizer.step`,
+    either from the same or different clients, will
+    be serialized on each worker -- as each worker's optimizer can only work
+    on one set of gradients at a time. However, there is no guarantee that
+    the full forward-backward-optimizer sequence will execute for one client
+    at a time. This means that the gradients being applied may not correspond
+    to the latest forward pass executed on a given worker. Also, there is no
+    guaranteed ordering across workers.
+
+    `DistributedOptimizer` creates the local optimizer with TorchScript enabled
+    by default, so that optimizer updates are not blocked by the Python Global
+    Interpreter Lock (GIL) in the case of multithreaded training (e.g. Distributed
+    Model Parallel). This feature is currently enabled for most optimizers. You
+    can also follow `the recipe`__ in PyTorch tutorials to enable TorchScript support
+    for your own custom optimizers.
+
+    Args:
+        optimizer_class (optim.Optimizer): the class of optimizer to
+            instantiate on each worker.
+        params_rref (list[RRef]): list of RRefs to local or remote parameters
+            to optimize.
+        args: arguments to pass to the optimizer constructor on each worker.
+        kwargs: arguments to pass to the optimizer constructor on each worker.
+
+    Example::
+        >>> # xdoctest: +SKIP("distributed")
+        >>> import torch.distributed.autograd as dist_autograd
+        >>> import torch.distributed.rpc as rpc
+        >>> from torch import optim
+        >>> from torch.distributed.optim import DistributedOptimizer
+        >>>
+        >>> with dist_autograd.context() as context_id:
+        >>>   # Forward pass.
+        >>>   rref1 = rpc.remote("worker1", torch.add, args=(torch.ones(2), 3))
+        >>>   rref2 = rpc.remote("worker1", torch.add, args=(torch.ones(2), 1))
+        >>>   loss = rref1.to_here() + rref2.to_here()
+        >>>
+        >>>   # Backward pass.
+        >>>   dist_autograd.backward(context_id, [loss.sum()])
+        >>>
+        >>>   # Optimizer.
+        >>>   dist_optim = DistributedOptimizer(
+        >>>      optim.SGD,
+        >>>      [rref1, rref2],
+        >>>      lr=0.05,
+        >>>   )
+        >>>   dist_optim.step(context_id)
+
+    __ https://github.com/pytorch/tutorials/pull/1465
+    """
+
+    def __init__(self, optimizer_class, params_rref, *args, **kwargs):
+        torch._C._log_api_usage_once("torch.distributed.optim.DistributedOptimizer")
+        per_worker_params_rref = defaultdict(list)
+        for param in params_rref:
+            per_worker_params_rref[param.owner()].append(param)
+
+        if optimizer_class in functional_optim_map and jit._state._enabled:
+            optim_ctor = functional_optim_map.get(optimizer_class)
+        else:
+            optim_ctor = optimizer_class
+        self.is_functional_optim = optim_ctor != optimizer_class
+
+        if self.is_functional_optim:
+            optimizer_new_func = _new_script_local_optimizer
+        else:
+            logger.warning(
+                "Creating the optimizer %s without TorchScript support, "
+                "this might result in slow computation time in multithreading environment"
+                "(i.e. Distributed Model Parallel training on CPU) due to the Python's "
+                "Global Interpreter Lock (GIL). Please file an issue if you need this "
+                "optimizer in TorchScript. ",
+                optimizer_class
+            )
+            optimizer_new_func = _new_local_optimizer
+
+        remote_optim_futs = []
+        for worker, param_rrefs in per_worker_params_rref.items():
+            remote_optim_rref_fut = rpc.rpc_async(
+                worker,
+                optimizer_new_func,
+                args=(optim_ctor, param_rrefs) + args,
+                kwargs=kwargs,
+            )
+            remote_optim_futs.append(remote_optim_rref_fut)
+
+        self.remote_optimizers = _wait_for_all(remote_optim_futs)
+
+    def step(self, context_id):
+        """
+        Performs a single optimization step.
+
+        This will call :meth:`torch.optim.Optimizer.step` on each worker
+        containing parameters to be optimized, and will block until all workers
+        return. The provided ``context_id`` will be used to retrieve the
+        corresponding :class:`~torch.distributed.autograd.context` that
+        contains the gradients that should be applied to the parameters.
+
+        Args:
+            context_id: the autograd context id for which we should run the
+                optimizer step.
+        """
+        dist_autograd._is_valid_context(context_id)
+
+        optimizer_step_func = (
+            _script_local_optimizer_step
+            if self.is_functional_optim
+            else _local_optimizer_step
+        )
+
+        rpc_futs = []
+        for optimizer in self.remote_optimizers:
+            rpc_futs.append(
+                rpc.rpc_async(
+                    optimizer.owner(),
+                    optimizer_step_func,
+                    args=(optimizer, context_id),
+                )
+            )
+        _wait_for_all(rpc_futs)
diff --git a/MLPY/Lib/site-packages/torch/distributed/optim/post_localSGD_optimizer.py b/MLPY/Lib/site-packages/torch/distributed/optim/post_localSGD_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..85514aade2d84347e84bebb5a44e1eed3db06218
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/optim/post_localSGD_optimizer.py
@@ -0,0 +1,109 @@
+import warnings
+
+import torch
+import torch.distributed.algorithms.model_averaging.averagers as averagers
+
+
+class PostLocalSGDOptimizer(torch.optim.Optimizer):
+    r"""
+    Wraps an arbitrary :class:`torch.optim.Optimizer` and runs `post-local SGD <https://arxiv.org/abs/1808.07217>`_.
+    This optimizer runs the local optimizer at every step.
+    After the warm-up stage, it averages parameters periodically after the local optimizer is applied.
+
+    Args:
+        optim: The local optimizer.
+        averager: A model averager instance to run post-localSGD algorithm.
+
+    Example::
+
+        >>> # xdoctest: +SKIP("undefined variables")
+        >>> import torch
+        >>> import torch.distributed as dist
+        >>> import torch.distributed.algorithms.model_averaging.averagers as averagers
+        >>> import torch.nn as nn
+        >>> from torch.distributed.optim import PostLocalSGDOptimizer
+        >>> from torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook import (
+        >>>   PostLocalSGDState,
+        >>>   post_localSGD_hook,
+        >>> )
+        >>>
+        >>> model = nn.parallel.DistributedDataParallel(
+        >>>    module, device_ids=[rank], output_device=rank
+        >>> )
+        >>>
+        >>> # Register a post-localSGD communication hook.
+        >>> state = PostLocalSGDState(process_group=None, subgroup=None, start_localSGD_iter=100)
+        >>> model.register_comm_hook(state, post_localSGD_hook)
+        >>>
+        >>> # Create a post-localSGD optimizer that wraps a local optimizer.
+        >>> # Note that ``warmup_steps`` used in ``PostLocalSGDOptimizer`` must be the same as
+        >>> # ``start_localSGD_iter`` used in ``PostLocalSGDState``.
+        >>> local_optim = torch.optim.SGD(params=model.parameters(), lr=0.01)
+        >>> opt = PostLocalSGDOptimizer(
+        >>>     optim=local_optim,
+        >>>     averager=averagers.PeriodicModelAverager(period=4, warmup_steps=100)
+        >>> )
+        >>>
+        >>> # In the first 100 steps, DDP runs global gradient averaging at every step.
+        >>> # After 100 steps, DDP runs gradient averaging within each subgroup (intra-node by default),
+        >>> # and post-localSGD optimizer runs global model averaging every 4 steps after applying the local optimizer.
+        >>> for step in range(0, 200):
+        >>>    opt.zero_grad()
+        >>>    loss = loss_fn(output, labels)
+        >>>    loss.backward()
+        >>>    opt.step()
+    """
+
+    def __init__(self, optim: torch.optim.Optimizer, averager: averagers.ModelAverager):
+        self.optim = optim
+        self.param_groups = self.optim.param_groups
+        self.averager = averager
+
+    @property
+    def state(self):
+        return self.optim.state
+
+    def __repr__(self):
+        return self.optim.__repr__()
+
+    def state_dict(self):
+        r"""
+        This is the same as :class:`torch.optim.Optimizer` :meth:`state_dict`,
+        but adds an extra entry recording the model averager's step to the
+        checkpoint so that reloading does not trigger an unnecessary warm-up.
+        """
+        optim_state_dict = self.optim.state_dict()
+        optim_state_dict["step"] = self.averager.step
+        return optim_state_dict
+
+    def load_state_dict(self, state_dict):
+        r"""
+        This is the same as :class:`torch.optim.Optimizer` :meth:`load_state_dict`,
+        but also restores model averager's step value to the one
+        saved in the provided ``state_dict``.
+
+        If there is no ``"step"`` entry in ``state_dict``,
+        it will raise a warning and initialize the model averager's step to 0.
+        """
+        self.optim.load_state_dict(state_dict)
+        if "step" in state_dict:
+            self.averager.step = state_dict["step"]
+        else:
+            warnings.warn(
+                "Loaded state dict does not contain a step counter for an averager. "
+                "Setting step counter to 0."
+            )
+            self.averager.step = 0
+
+    def step(self):
+        r"""
+        Performs a single optimization step (parameter update).
+        """
+        self.optim.step()
+        self.averager.average_parameters(params=self.param_groups)
+
+    def zero_grad(self, set_to_none: bool = True):  # type: ignore[override]
+        self.optim.zero_grad(set_to_none=set_to_none)
+
+    def add_param_group(self, param_group):
+        self.optim.add_param_group(param_group)
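+
+
+# A minimal sketch of the checkpoint behaviour above: ``state_dict`` records the
+# averager's step so that reloading does not restart the warm-up. The stub
+# ``_DemoAverager`` and the helper ``_demo_step_roundtrip`` are illustrative
+# stand-ins, not part of this module's API.
+class _DemoAverager:
+    def __init__(self):
+        self.step = 0
+
+    def average_parameters(self, params):
+        # A real averager would average the parameters across ranks here.
+        pass
+
+
+def _demo_step_roundtrip() -> None:
+    param = torch.nn.Parameter(torch.zeros(1))
+    opt = PostLocalSGDOptimizer(
+        optim=torch.optim.SGD([param], lr=0.1), averager=_DemoAverager()
+    )
+    opt.averager.step = 7
+    saved = opt.state_dict()  # contains an extra {"step": 7} entry
+    opt.averager.step = 0
+    opt.load_state_dict(saved)  # restores the averager's step counter
+    assert opt.averager.step == 7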
diff --git a/MLPY/Lib/site-packages/torch/distributed/optim/utils.py b/MLPY/Lib/site-packages/torch/distributed/optim/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa9507f1a29d2aadc6cc8672048daae9256e73c8
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/optim/utils.py
@@ -0,0 +1,63 @@
+from typing import Type
+
+from torch import optim
+from .functional_adadelta import _FunctionalAdadelta
+from .functional_adagrad import _FunctionalAdagrad
+from .functional_adam import _FunctionalAdam
+from .functional_adamax import _FunctionalAdamax
+from .functional_adamw import _FunctionalAdamW
+from .functional_rmsprop import _FunctionalRMSprop
+from .functional_rprop import _FunctionalRprop
+from .functional_sgd import _FunctionalSGD
+
+# Dict mapping a user-passed optimizer_class to a functional
+# optimizer class, if one is already defined inside the
+# distributed.optim package. This hides the functional
+# optimizer from the user while still providing the same API.
+functional_optim_map = {
+    optim.Adagrad: _FunctionalAdagrad,
+    optim.Adam: _FunctionalAdam,
+    optim.AdamW: _FunctionalAdamW,
+    optim.SGD: _FunctionalSGD,
+    optim.Adadelta: _FunctionalAdadelta,
+    optim.RMSprop: _FunctionalRMSprop,
+    optim.Rprop: _FunctionalRprop,
+    optim.Adamax: _FunctionalAdamax,
+}
+
+
+def register_functional_optim(key, optim):
+    """
+    Interface to insert a new functional optimizer into ``functional_optim_map``.
+    ``fn_optim_key`` and ``fn_optimizer`` are user defined. The optimizer and key
+    need not be of type :class:`torch.optim.Optimizer` (e.g. for custom optimizers).
+    Example::
+        >>> # import the new functional optimizer
+        >>> # xdoctest: +SKIP
+        >>> from xyz import fn_optimizer
+        >>> from torch.distributed.optim.utils import register_functional_optim
+        >>> fn_optim_key = "XYZ_optim"
+        >>> register_functional_optim(fn_optim_key, fn_optimizer)
+    """
+    if key not in functional_optim_map:
+        functional_optim_map[key] = optim
+
+
+def as_functional_optim(optim_cls: Type, *args, **kwargs):
+    try:
+        functional_cls = functional_optim_map[optim_cls]
+    except KeyError as e:
+        raise ValueError(
+            f"Optimizer {optim_cls} does not have a functional " f"counterpart!"
+        ) from e
+
+    return _create_functional_optim(functional_cls, *args, **kwargs)
+
+
+def _create_functional_optim(functional_optim_cls: Type, *args, **kwargs):
+    return functional_optim_cls(
+        [],
+        *args,
+        **kwargs,
+        _allow_empty_param_list=True,
+    )
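+
+
+# A small sketch of how the mapping above is used: ``as_functional_optim`` looks
+# up the functional counterpart of an eager optimizer class and constructs it
+# with an empty parameter list (parameters and gradients are supplied later).
+# The helper name ``_demo_functional_lookup`` is illustrative only.
+def _demo_functional_lookup() -> None:
+    fn_optim = as_functional_optim(optim.SGD, lr=0.01)
+    assert isinstance(fn_optim, functional_optim_map[optim.SGD])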
diff --git a/MLPY/Lib/site-packages/torch/distributed/optim/zero_redundancy_optimizer.py b/MLPY/Lib/site-packages/torch/distributed/optim/zero_redundancy_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f3be9ce6a99bb56045c746b709b96fa38200e77
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/optim/zero_redundancy_optimizer.py
@@ -0,0 +1,1651 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+r"""Zero Redundancy Optimizer."""
+import collections
+import copy
+import enum
+import inspect
+import io
+import logging
+from itertools import chain
+from typing import Any, Callable, Dict, List, Optional, Set, Type, Union
+
+import torch
+import torch.distributed as dist
+from torch.distributed.algorithms.join import Join, Joinable, JoinHook
+from torch.distributed.optim.utils import functional_optim_map
+from torch.optim import Optimizer
+
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["ZeroRedundancyOptimizer"]
+
+
+# Credits:  classy_vision/generic/distributed_util.py
+def _recursive_copy_to_device(
+    value: Any,
+    non_blocking: bool,
+    device: torch.device,
+) -> Any:
+    r"""
+    Recursively searches lists, tuples, dicts and copies tensors to device if possible.
+
+    Non-tensor values are passed as-is in the result.
+
+    .. note::  These are all copies, so if two entries reference the same
+        object, then after this call there will be two different objects
+        referenced on the device.
+    """
+    if isinstance(value, torch.Tensor):
+        return value.to(device, non_blocking=non_blocking)
+
+    if isinstance(value, (list, tuple)):
+        values = [
+            _recursive_copy_to_device(val, non_blocking=non_blocking, device=device)
+            for val in value
+        ]
+        return values if isinstance(value, list) else tuple(values)
+
+    if isinstance(value, collections.abc.Mapping):
+        return {
+            key: _recursive_copy_to_device(
+                val, non_blocking=non_blocking, device=device
+            )
+            for key, val in value.items()
+        }
+
+    return value
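+
+
+# A minimal sketch of the recursive copy above: tensors nested inside lists,
+# tuples, and dicts are moved to the target device, while non-tensor values
+# pass through unchanged. The helper name ``_demo_recursive_copy`` is
+# illustrative only and not part of this module's API.
+def _demo_recursive_copy() -> None:
+    state = {"step": 3, "buffers": [torch.ones(2), (torch.zeros(1), "tag")]}
+    copied = _recursive_copy_to_device(
+        state, non_blocking=False, device=torch.device("cpu")
+    )
+    assert copied["step"] == 3 and copied["buffers"][1][1] == "tag"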
+
+
+def _is_trainable(param: torch.Tensor) -> bool:
+    r"""Return if a parameter is trainable, where trainability is equivalent to requiring a gradient."""
+    return param.requires_grad
+
+
+def _broadcast_object(
+    obj: Any,
+    src_rank: int,
+    group: object = dist.group.WORLD,
+    device: torch.device = torch.device("cpu"),
+) -> Any:
+    r"""
+    Broadcasts an object to the given group.
+
+    It sends the object if called from the source rank and receives
+    the object otherwise.
+
+    Arguments:
+        obj: object to broadcast; only used if called on the source rank.
+        src_rank (int): source rank.
+        group (``ProcessGroup``, optional): group used for the broadcast
+            (default: ``dist.group.WORLD``).
+        device (``torch.device``, optional): device to send from or receive
+            to (default: ``torch.device("cpu")``).
+
+    Returns:
+        The broadcasted object.
+    """
+    if dist.get_rank() == src_rank:
+        # Send the object
+        buffer = io.BytesIO()
+        torch.save(obj, buffer)
+        data = bytearray(buffer.getbuffer())
+        length_tensor = torch.LongTensor([len(data)]).to(device)
+        data_send_tensor = torch.ByteTensor(data).to(device)
+        dist.broadcast(length_tensor, src=src_rank, group=group, async_op=False)
+        dist.broadcast(data_send_tensor, src=src_rank, group=group, async_op=False)
+    else:
+        # Receive the object
+        length_tensor = torch.LongTensor([0]).to(device)
+        dist.broadcast(length_tensor, src=src_rank, group=group, async_op=False)
+        data_recv_tensor = torch.empty(
+            [int(length_tensor.item())], dtype=torch.uint8, device=device
+        )
+        dist.broadcast(data_recv_tensor, src=src_rank, group=group, async_op=False)
+        buffer = io.BytesIO(data_recv_tensor.cpu().numpy())
+        obj = torch.load(buffer, map_location=device)
+    return obj
+
+
+class _ZeROJoinHook(JoinHook):
+    def __init__(self, zero):
+        assert isinstance(zero, ZeroRedundancyOptimizer), (
+            "ZeRO join hook requires passing in a ZeroRedundancyOptimizer "
+            "instance as the state"
+        )
+        self.zero = zero
+        super().__init__()
+
+    def main_hook(self):
+        """
+        Perform an optimizer step.
+
+        This step updates the joined process's shard of
+        the parameters and broadcasts those parameters.
+        """
+        self.zero.step()
+
+
+class _DDPBucketAssignment:
+    r"""
+    Represent a :class:`DistributedDataParallel` bucket assignment.
+
+    This is a (possibly non-strict) subset of the parameters corresponding to
+    a DDP bucket, assigned to a single rank to update.
+
+    Attributes:
+        bucket_index (int): index of the bucket determined by the DDP gradient
+            bucket all-reduce order.
+        parameters (List[torch.Tensor]): model parameters in the bucket
+            assigned to this rank.
+        offset (int): offset into the :class:`GradBucket` 's :meth:`parameters`
+            giving the index of the first element in the passed-in
+            ``parameters``; this equivalently indexes into the
+            :class:`GradBucket` 's :meth:`gradients`.
+        device (torch.device): device on which the parameters are stored.
+        tensor (torch.Tensor): flattened tensor giving the data of the
+            parameter subset assigned to the rank.
+    """
+
+    def __init__(
+        self,
+        bucket_index: int,
+        parameters: List[torch.Tensor],
+        offset: int,
+    ):
+        self.bucket_index = bucket_index
+        self.parameters = parameters
+        self.offset = offset
+        if len(self.parameters) == 0:
+            raise ValueError("Empty bucket assignment")
+        # DDP guarantees all parameters in the bucket have the same device
+        self.device: torch.device = self.parameters[0].device
+        self.tensor: Optional[torch.Tensor] = None
+
+
+class _OverlapStatus(enum.IntEnum):
+    r"""
+    Define possible statuses that :class:`ZeroRedundancyOptimizer` can be in when overlapping with :class:`DistributedDataParallel`.
+
+    Attributes:
+        ``UNINITIALIZED``: The ZeRO instance is effectively uninitialized and
+            is waiting for DDP to finalize its bucketing.
+        ``DDP_HAS_REBUILT_BUCKETS``: DDP has rebuilt its buckets, meaning that
+            its bucketing is finalized. The ZeRO instance can now collect the
+            necessary information about the DDP bucketing.
+        ``INITIALIZED``: The ZeRO instance is fully initialized and can now
+            optimize parameters.
+    """
+
+    UNINITIALIZED = 0
+    DDP_HAS_REBUILT_BUCKETS = 1
+    INITIALIZED = 2
+
+
+class _OverlapInfo:
+    r"""
+    Information needed by :class:`ZeroRedundancyOptimizer` to overlap with :class:`DistributedDataParallel`.
+
+    Arguments:
+        world_size (int): world size of the process group being used.
+
+    Attributes:
+        shard_buckets (bool): if ``True``, then the assignment of each
+            :class:`DistributedDataParallel` bucket is partitioned across
+            possibly multiple :class:`ZeroRedundancyOptimizer` instances (i.e.
+            across possibly multiple ranks) to approximate uniformity following
+            a threshold given by the total parameter size divided by the world
+            size; if ``False``, then each bucket is wholly assigned to a single
+            :class:`ZeroRedundancyOptimizer` instance (i.e. to a single rank);
+            this should be set to the value passed into the hook constructor.
+        status (_OverlapStatus): current status; see :class:`_OverlapStatus`
+            for more information.
+        params_per_bucket (List[List[torch.Tensor]]): ``params_per_bucket[i]``
+            gives the model parameters in the ``i``th bucket.
+        params_per_rank (List[List[torch.Tensor]]): ``params_per_rank[i]``
+            gives the model parameters assigned to the ``i``th rank, where the
+            parameters are grouped by increasing bucket indices.
+        offsets (Dict[int, int]): maps from bucket index to the offset in
+            ``self.params_per_rank[rank]`` giving the index of the first
+            parameter in that bucket, where ``rank`` is this process's own
+            rank; the keys of this :class:`dict` are the bucket indices
+            assigned to this rank.
+        num_bucket_assignments (int): total number of bucket assignments across
+            all ranks; this is equal to the number of
+            :class:`DistributedDataParallel` gradient buckets if
+            ``shard_buckets=False`` and possibly greater otherwise.
+        total_size (int, optional): total size of all buckets (i.e. sum of
+            ``param.numel()`` for all ``param`` across all buckets) if
+            ``shard_buckets=True``; otherwise, ``None``.
+        broadcast_handles (List[Work]): :class:`list` of async work handles for
+            the parameter broadcasts.
+        bucket_index_to_future (Dict[int, torch.futures.Future]):
+            :class:`dict` mapping bucket index to the corresponding all-reduce
+            future.
+        bucket_index_to_bucket (Dict[int, dist.GradBucket]): :class:`dict`
+            mapping bucket index to the corresponding bucket.
+        bucket_indices_seen (List[int]): :class:`list` of the bucket indices
+            seen on this iteration.
+    """
+
+    def __init__(self, world_size) -> None:
+        self.status: _OverlapStatus = _OverlapStatus.UNINITIALIZED
+        self.shard_buckets: bool = False
+
+        # Modified per bucket reconstruction
+        self.params_per_bucket: List[List[torch.Tensor]] = []
+        self.params_per_rank: List[List[torch.Tensor]] = [[] for _ in range(world_size)]
+        self.offsets: Dict[int, int] = {}
+        # Group Ranks
+        self.assigned_ranks_per_bucket: List[Set[int]] = []
+        self.num_bucket_assignments: int = 0
+        self.total_size: Optional[int] = None
+
+        # Modified per iteration
+        self.broadcast_handles: List[Any] = []
+        self.bucket_indices_seen: List[int] = []
+        # Used by `hook_with_zero_step()`
+        self.bucket_index_to_future: Dict[int, torch.futures.Future] = {}
+        self.bucket_index_to_bucket: Dict[int, dist.GradBucket] = {}
+
+    def wait_for_broadcasts(self) -> None:
+        r"""
+        Wait for all parameter broadcasts.
+
+        This function should be called once all broadcasts have been scheduled,
+        meaning ``self.broadcast_handles`` is filled. This clears ``self.broadcast_handles``
+        in preparation for the next iteration.
+        """
+        assert (
+            len(self.broadcast_handles) == self.num_bucket_assignments
+        ), f"Missing at least one broadcast handle on rank {dist.get_rank()}"
+        _ = [x.wait() for x in self.broadcast_handles]
+        self.broadcast_handles.clear()
+
+    def clear_per_iter_info(self) -> None:
+        r"""
+        Clear the data structures that are modified per-iteration.
+
+        This function should be called at the end of an iteration.
+        """
+        self.bucket_indices_seen.clear()
+        self.bucket_index_to_future.clear()
+        self.bucket_index_to_bucket.clear()
+
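+
+# A minimal sketch of the sorted-greedy packing used by
+# ``ZeroRedundancyOptimizer._partition_parameters`` below: parameter sizes are
+# sorted in decreasing order and each one is assigned to the rank with the
+# smallest total so far. The helper name ``_demo_greedy_partition`` is
+# illustrative only and not part of this module's API.
+def _demo_greedy_partition(numels: List[int], world_size: int) -> List[List[int]]:
+    assignments: List[List[int]] = [[] for _ in range(world_size)]
+    totals = [0] * world_size
+    for numel in sorted(numels, reverse=True):
+        # Greedily pick the rank with the least total size assigned so far
+        rank = totals.index(min(totals))
+        assignments[rank].append(numel)
+        totals[rank] += numel
+    return assignments
+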
+
+class ZeroRedundancyOptimizer(Optimizer, Joinable):
+    r"""
+    Wrap an arbitrary :class:`optim.Optimizer` and shard its states across ranks in the group.
+
+    The sharding is done as described by ZeRO_.
+
+    The local optimizer instance in each rank is only
+    responsible for updating approximately ``1 / world_size`` parameters and
+    hence only needs to keep ``1 / world_size`` optimizer states. After
+    parameters are updated locally, each rank will broadcast its parameters to
+    all other peers to keep all model replicas in the same state.
+    ``ZeroRedundancyOptimizer`` can be used in conjunction with
+    :class:`torch.nn.parallel.DistributedDataParallel` to reduce per-rank peak
+    memory consumption.
+
+    ``ZeroRedundancyOptimizer`` uses a sorted-greedy algorithm to pack a number
+    of parameters at each rank. Each parameter belongs to a single rank and is
+    not divided among ranks. The partition is arbitrary and might not match the
+    parameter registration or usage order.
+
+    Arguments:
+        params (``Iterable``): an ``Iterable`` of :class:`torch.Tensor` s
+            or :class:`dict` s giving all parameters, which will be sharded
+            across ranks.
+
+    Keyword Args:
+        optimizer_class (:class:`torch.optim.Optimizer`): the class of the local
+            optimizer.
+        process_group (``ProcessGroup``, optional): ``torch.distributed``
+            ``ProcessGroup`` (default: ``dist.group.WORLD`` initialized by
+            :meth:`torch.distributed.init_process_group`).
+        parameters_as_bucket_view (bool, optional): if ``True``, parameters are
+            packed into buckets to speed up communication, and ``param.data``
+            fields point to bucket views at different offsets; if ``False``,
+            each individual parameter is communicated separately, and each
+            ``params.data`` stays intact (default: ``False``).
+        overlap_with_ddp (bool, optional): if ``True``, :meth:`step` is
+            overlapped with :class:`DistributedDataParallel` 's gradient
+            synchronization; this requires (1) either a functional optimizer
+            for the ``optimizer_class`` argument or one with a functional
+            equivalent and (2) registering a DDP communication hook
+            constructed from one of the functions in ``ddp_zero_hook.py``;
+            parameters are packed into buckets matching those in
+            :class:`DistributedDataParallel`, meaning that the
+            ``parameters_as_bucket_view`` argument is ignored.
+            If ``False``, :meth:`step` runs disjointly after the backward pass
+            (per normal).
+            (default: ``False``)
+        **defaults: any trailing arguments, which are forwarded to the local
+            optimizer.
+
+    Example::
+
+        >>> # xdoctest: +SKIP
+        >>> import torch.nn as nn
+        >>> from torch.distributed.optim import ZeroRedundancyOptimizer
+        >>> from torch.nn.parallel import DistributedDataParallel as DDP
+        >>> model = nn.Sequential(*[nn.Linear(2000, 2000).to(rank) for _ in range(20)])
+        >>> ddp = DDP(model, device_ids=[rank])
+        >>> opt = ZeroRedundancyOptimizer(
+        >>>     ddp.parameters(),
+        >>>     optimizer_class=torch.optim.Adam,
+        >>>     lr=0.01
+        >>> )
+        >>> ddp(inputs).sum().backward()
+        >>> opt.step()
+
+    .. warning::
+        Currently, ``ZeroRedundancyOptimizer`` requires that all of the
+        passed-in parameters are the same dense type.
+
+    .. warning::
+        If you pass ``overlap_with_ddp=True``, be wary of the following: Given
+        the way that overlapping :class:`DistributedDataParallel` with
+        :class:`ZeroRedundancyOptimizer` is currently implemented, the first
+        two or three training iterations do not perform parameter updates in
+        the optimizer step, depending on if ``static_graph=False`` or
+        ``static_graph=True``, respectively. This is because it needs
+        information about the gradient bucketing strategy used by
+        :class:`DistributedDataParallel`, which is not finalized until the
+        second forward pass if ``static_graph=False`` or until the third
+        forward pass if ``static_graph=True``. To adjust for this, one option
+        is to prepend dummy inputs.
+
+    .. warning:: ZeroRedundancyOptimizer is experimental and subject to change.
+
+    .. _ZeRO: https://arxiv.org/abs/1910.02054
+
+    """
+
+    def __init__(
+        self,
+        params,
+        optimizer_class: Type[Optimizer],
+        process_group: Optional[Any] = None,
+        parameters_as_bucket_view: bool = False,
+        overlap_with_ddp: bool = False,
+        **defaults: Any,
+    ):
+        r"""Init."""
+        # Perform type and assumption checks on the input parameters
+        params = self._verify_and_init_params(params)
+        self._verify_same_dense_param_type()
+
+        # NOTE: The parent constructor uses `add_param_group()` which is
+        # partially overloaded in ZeroRedundancyOptimizer, so we use the
+        # `initialized` flag to dissociate the behaviour of `add_param_group()`
+        # between the parent and child.
+        self.initialized = False
+
+        Optimizer.__init__(self, params, defaults)
+        Joinable.__init__(self)
+        # Now, all parameters are held in both `self._all_params` and
+        # `self.param_groups`
+
+        # Internal data structures (`_cache` indicates lazily evaluated)
+        self._param_to_rank_cache: Dict[torch.Tensor, int] = {}
+        self._param_to_index_cache: Dict[torch.Tensor, int] = {}
+        self._partition_parameters_cache: List[List[Dict]] = []
+        self._index_to_param_cache: List[torch.Tensor] = []
+        self._device_to_params_per_rank_cache: Dict[
+            torch.device, List[List[torch.Tensor]]
+        ] = {}
+        self._bucket_assignments_per_rank_cache: List[
+            Dict[int, _DDPBucketAssignment]
+        ] = []
+        self._is_trainable_mask = self._get_is_trainable_mask()
+
+        # Default device for collective communication and buckets
+        self._default_device = self._all_params[0].device
+
+        self.process_group = (
+            process_group if process_group is not None else dist.group.WORLD
+        )
+        self.world_size: int = dist.get_world_size(self.process_group)
+        self.rank: int = dist.get_rank(self.process_group)
+        self.global_rank: int = dist.distributed_c10d.get_global_rank(
+            self.process_group, self.rank
+        )
+
+        self._overlap_with_ddp: bool = overlap_with_ddp
+        self._optim_defaults = defaults
+        self._optim_constructor = self._get_optimizer_constructor(optimizer_class)
+
+        # If `overlap_with_ddp=True`, local optimizer initialization is delayed
+        # to run time after the necessary information has been collected
+        if not overlap_with_ddp:
+            self._init_local_optimizer()
+        else:
+            self._overlap_info: _OverlapInfo = _OverlapInfo(self.world_size)
+            if parameters_as_bucket_view:
+                logger.warning(
+                    "`parameters_as_bucket_view=True` will be ignored since "
+                    "`overlap_with_ddp=True`; instead, a different bucketing "
+                    "strategy will be used"
+                )
+
+        # `self._buckets` is used if `parameters_as_bucket_view=True`, in
+        # which case parameter data is flattened into contiguous bucket tensors
+        self.parameters_as_bucket_view = parameters_as_bucket_view
+        self._buckets: List[List[torch.Tensor]] = []
+        self._build_param_buckets()
+
+        # Optional consolidated optimizer state, only populated if this rank
+        # is the target in `consolidate_state_dict()`
+        self._all_state_dicts: List[Dict[str, Any]] = []
+
+        self.initialized = True
+
+    def _clear_cache(self) -> None:
+        r"""Clear the cached data structures giving partition information."""
+        self._partition_parameters_cache.clear()
+        self._param_to_rank_cache.clear()
+        self._index_to_param_cache.clear()
+        self._param_to_index_cache.clear()
+        self._device_to_params_per_rank_cache.clear()
+        self._bucket_assignments_per_rank_cache.clear()
+
+    def add_param_group(self, param_group: Dict[str, Any]) -> None:
+        r"""
+        Add a parameter group to the :class:`Optimizer` 's ``param_groups``.
+
+        This can be useful when fine tuning a pre-trained network, as frozen
+        layers can be made trainable and added to the :class:`Optimizer` as
+        training progresses.
+
+        Arguments:
+            param_group (dict): specifies the parameters to be optimized and
+                group-specific optimization options.
+
+        .. warning:: This method handles updating the shards on all partitions
+            but needs to be called on all ranks. Calling this on a subset of
+            the ranks will cause the training to hang because communication
+            primitives are called depending on the managed parameters and
+            expect all the ranks to participate on the same set of parameters.
+        """
+        if self.initialized and self._overlap_with_ddp:
+            raise RuntimeError(
+                "ZeroRedundancyOptimizer with `overlap_with_ddp=True` only "
+                "supports a single parameter group"
+            )
+
+        super().add_param_group(param_group)
+        # NOTE: The rest of the method assumes that the call to the parent's
+        # `add_param_group()` appends the new parameter group and preserves
+        # the previous parameter-group ordering
+
+        if self.initialized:
+            # Force a re-partitioning of the parameters
+            self._clear_cache()
+            param_groups = self._partition_parameters()[self.rank]
+            # NOTE: All parameters in the old parameter groups should be
+            # assigned to the same ranks so that the local optimizers do not
+            # need to be reinitialized
+
+            # Add the parameters assigned to this rank from the new parameter
+            # group to the local optimizer, if any
+            if len(param_groups) == len(self.optim.param_groups) + 1:
+                self.optim.add_param_group(param_groups[-1])
+
+            # Update the bucketing strategy accordingly
+            if self.parameters_as_bucket_view:
+                self._build_param_buckets()
+
+    def consolidate_state_dict(self, to: int = 0) -> None:
+        r"""
+        Consolidate a list of ``state_dict`` s (one per rank) on the target rank.
+
+        Arguments:
+            to (int): the rank that receives the optimizer states (default: 0).
+
+        Raises:
+            RuntimeError: if ``overlap_with_ddp=True`` and this method is
+                called before this :class:`ZeroRedundancyOptimizer` instance
+                has been fully initialized, which happens once
+                :class:`DistributedDataParallel` gradient buckets have been
+                rebuilt.
+
+        .. warning:: This needs to be called on all ranks.
+        """
+        self._check_overlap_initialized()
+
+        # Sync the exposed `param_groups` attributes to the local optimizer in
+        # case they have been updated
+        self._sync_param_groups(self.param_groups, self.optim.param_groups)
+
+        # Pull the sharded state from all ranks and store them in rank order
+        empty_messenger = torch.tensor(
+            [0], dtype=torch.uint8, device=self._default_device
+        )
+
+        # NOTE: We wastefully use `broadcast()` (e.g. instead of `gather()`)
+        # due to compatibility issues with NCCL backend; a possible follow-up
+        # is to move all sharded state management to RPC RRef
+        self._all_state_dicts = []
+        for rank in range(self.world_size):
+            global_rank = dist.distributed_c10d.get_global_rank(
+                self.process_group, rank
+            )
+            if self.rank == to:
+                # Consolidate all local `state_dict`s on this rank, storing on
+                # CPU to save GPU memory
+                if rank == self.rank:
+                    # Directly append own optimizer state
+                    self._all_state_dicts.append(
+                        _recursive_copy_to_device(
+                            self.optim.state_dict(),
+                            non_blocking=True,
+                            device=torch.device("cpu"),
+                        )
+                    )
+                else:
+                    # Receive the optimizer state from the source rank
+                    local_state_dict = _broadcast_object(
+                        empty_messenger,
+                        src_rank=global_rank,
+                        group=self.process_group,
+                        device=self._default_device,
+                    )
+                    self._all_state_dicts.append(
+                        _recursive_copy_to_device(
+                            local_state_dict,
+                            non_blocking=True,
+                            device=torch.device("cpu"),
+                        )
+                    )
+            else:
+                if rank == self.rank:
+                    # Send the optimizer state to the target rank
+                    _ = _broadcast_object(
+                        self.optim.state_dict(),
+                        src_rank=self.global_rank,
+                        group=self.process_group,
+                        device=self._default_device,
+                    )
+                elif rank != to:
+                    # Discard the received object; `broadcast()` is used for
+                    # compatibility reasons
+                    _ = _broadcast_object(
+                        empty_messenger,
+                        src_rank=global_rank,
+                        group=self.process_group,
+                        device=self._default_device,
+                    )
+
+    def _verify_params_per_rank(
+        self,
+        params_per_rank: List[List[torch.Tensor]],
+    ) -> None:
+        r"""
+        Verify ``params_per_rank`` for :meth:`_partition_parameters`.
+
+        The verification is done by checking that ``params_per_rank`` has length equal
+        to the world size and that it does not contain any parameters not passed into the
+        :class:`ZeroRedundancyOptimizer` constructor.
+
+        The parameters in ``params_per_rank`` being a strict subset of those
+        passed into the constructor is valid since some parameters may be
+        frozen.
+
+        Raises:
+            ValueError: if ``params_per_rank`` does not have length equal to
+                the world size or if it contains a parameter that was not
+                passed into the :class:`ZeroRedundancyOptimizer` constructor.
+        """
+        if len(params_per_rank) != self.world_size:
+            raise ValueError(
+                "`params_per_rank` must have length equal to the world size"
+            )
+        all_params_set = set(self._all_params)
+        for params in params_per_rank:
+            for param in params:
+                if param not in all_params_set:
+                    raise ValueError(
+                        "Passing a new parameter in `params_per_rank` that "
+                        "was not passed into the ZeroRedundancyOptimizer "
+                        "constructor"
+                    )
+
+    def _partition_param_group(
+        self, param_group: Dict[str, Any], params_per_rank: List[List[torch.Tensor]]
+    ) -> None:
+        r"""
+        Partition the parameter group ``param_group`` according to ``params_per_rank``.
+
+        The partition will modify the ``self._partition_parameters_cache``. This method should
+        only be used as a subroutine for :meth:`_partition_parameters`.
+
+        Arguments:
+            param_group (dict[str, Any]): a parameter group as normally defined
+                in an optimizer state.
+            params_per_rank (list[list[torch.Tensor]]): a :class:`list` of
+                length world size containing :class:`list` s of parameters to
+                assign to each rank.
+        """
+        for rank, params in enumerate(params_per_rank):
+            rank_param_group = copy.copy(param_group)
+            rank_param_group["params"] = params
+            self._partition_parameters_cache[rank].append(rank_param_group)
+
+    def _partition_parameters(
+        self,
+        params_per_rank: Optional[List[List[torch.Tensor]]] = None,
+    ) -> List[List[Dict]]:
+        r"""
+        Partitions parameters across distributed data parallel ranks.
+
+        Arguments:
+            params_per_rank (list[list[torch.Tensor]], optional): a
+                :class:`list` of length world size containing :class:`list` s
+                of parameters to assign to each rank; this provides a way to
+                specify a partition manually.
+                If ``None``, the parameters are partitioned according to an
+                internal algorithm.
+                (default: ``None``)
+
+        Returns:
+            A :class:`list` where each element of the list contains the
+            ``param_groups`` for a rank (which itself is a :class:`list` of
+            :class:`dict`); element 0 corresponds to rank 0, etc.; each rank
+            stores the ``param_groups`` for all ranks for the collective
+            communication in :meth:`step`.
+
+        Raises:
+            ValueError: see :meth:`_verify_params_per_rank`.
+            RuntimeError: if ``params_per_rank`` is not ``None`` and this
+                :class:`ZeroRedundancyOptimizer` instance is using more than
+                one parameter group.
+        """
+        if params_per_rank is None:
+            # Partition the parameters optimizing for uniformity
+            if len(self._partition_parameters_cache) == 0:
+                self._partition_parameters_cache = [[] for _ in range(self.world_size)]
+                sizes = [0] * self.world_size
+                for param_group in self.param_groups:
+                    param_group_params_per_rank: List[List] = [
+                        [] for _ in range(self.world_size)
+                    ]
+                    # Sort the parameters by size (largest first)
+                    params_sorted = sorted(
+                        param_group["params"], key=lambda t: t.numel(), reverse=True
+                    )
+                    for param in params_sorted:
+                        # Greedily add the parameter to rank with smallest size so far
+                        rank = self._get_min_index(sizes)
+                        param_group_params_per_rank[rank].append(param)
+                        sizes[rank] += param.numel()
+                    # Apply the constructed partition of the parameter group
+                    self._partition_param_group(
+                        param_group, param_group_params_per_rank
+                    )
+
+            return self._partition_parameters_cache
+
+        # Partition the parameters according to `params_per_rank`
+        assert len(self._partition_parameters_cache) == 0, (
+            "Specifying `params_per_rank` should only be done when the "
+            "parameters have not been partitioned yet"
+        )
+        if len(self.param_groups) != 1:
+            raise RuntimeError(
+                "Specifying `params_per_rank` only supports a single parameter group"
+            )
+        self._verify_params_per_rank(params_per_rank)
+        self._partition_parameters_cache = [[] for _ in range(self.world_size)]
+
+        # Apply the passed-in partition of the parameter group
+        param_group = self.param_groups[0]
+        self._partition_param_group(param_group, params_per_rank)
+
+        return self._partition_parameters_cache
+
+    @property
+    def _param_to_rank(self) -> Dict[torch.Tensor, int]:
+        r""":class:`dict` mapping parameters to their assigned data parallel rank in the partition."""
+        if len(self._param_to_rank_cache) == 0:
+            for rank, param_groups in enumerate(self._partition_parameters()):
+                for param_group in param_groups:
+                    for param in param_group["params"]:
+                        self._param_to_rank_cache[param] = rank
+        return self._param_to_rank_cache
+
+    @property
+    def _param_to_index(self) -> Dict[torch.Tensor, int]:
+        r"""
+        :class:`dict` mapping parameters to their indices in the global optimizer state.
+
+        NOTE: This assumes that the global optimizer state's indexing (in
+        ``state_dict``) follows a linear ordering over the parameter groups.
+        """
+        if len(self._param_to_index_cache) == 0:
+            self._param_to_index_cache = {
+                p: i
+                for i, p in enumerate(chain(*(g["params"] for g in self.param_groups)))
+            }
+        return self._param_to_index_cache
+
+    @property
+    def _index_to_param(self) -> List[torch.Tensor]:
+        r"""List mapping parameter indices in the global optimizer scheme to the actual params."""
+        if len(self._index_to_param_cache) == 0:
+            self._index_to_param_cache = list(
+                chain(*(g["params"] for g in self.param_groups))
+            )
+        return self._index_to_param_cache
+
+    def _broadcast_params_from_rank(self, rank: int):
+        r"""
+        Broadcast the shard of parameters from a given rank to all other ranks asynchronously.
+
+        Arguments:
+            rank (int): the source rank.
+
+        Returns:
+            A :class:`list` of async work handles for the ``broadcast()`` s
+            performed to synchronize the parameters.
+        """
+        assert not self._overlap_with_ddp, (
+            "`_broadcast_params_from_rank()` should not be used if "
+            "`overlap_with_ddp=True`; instead, the broadcasting should "
+            "happen in the DDP communication hook"
+        )
+        handles = []
+        if self.parameters_as_bucket_view:
+            for dev_i_buckets in self._buckets:
+                bucket = dev_i_buckets[rank]
+                global_rank = dist.distributed_c10d.get_global_rank(
+                    self.process_group, rank
+                )
+                handles.append(
+                    dist.broadcast(
+                        tensor=bucket,
+                        src=global_rank,
+                        group=self.process_group,
+                        async_op=True,
+                    )
+                )
+        else:
+            param_groups = self._partition_parameters()[rank]
+            global_rank = dist.distributed_c10d.get_global_rank(
+                self.process_group, rank
+            )
+            for param_group in param_groups:
+                for param in param_group["params"]:
+                    handles.append(
+                        dist.broadcast(
+                            tensor=param.data,
+                            src=global_rank,
+                            group=self.process_group,
+                            async_op=True,
+                        )
+                    )
+        return handles
+
+    def _sync_params(self):
+        r"""
+        Sync all parameter shards across the ranks.
+
+        This rank sends its shard of the parameters to all other ranks and
+        receives a shard from each other rank. This is done using
+        ``broadcast()``. Parameters are sent bucket-by-bucket if
+        ``parameters_as_bucket_view=True`` and parameter-by-parameter
+        otherwise.
+        """
+        handles = []
+        for rank in range(self.world_size):
+            handles.extend(self._broadcast_params_from_rank(rank))
+        _ = [x.wait() for x in handles]
+
+    @property
+    def _device_to_params_per_rank(
+        self,
+    ) -> Dict[torch.device, List[List[torch.Tensor]]]:
+        r"""
+        Return device parameters assigned per rank.
+
+        :class:`dict` mapping each device to a :class:`list` of the per-rank parameter
+        lists filtered to only include the parameters stored on that device.
+        Each per-rank parameter list gives the parameters assigned to that rank
+        to update.
+
+        This is used for constructing the parameter buckets if
+        ``parameters_as_bucket_view=True``.
+
+        Let ``dev_i`` denote the ``i``th device for this rank. Then:
+        ``dev_0`` maps to a list containing:
+            rank 0's assigned parameters stored on ``dev_0``,
+            rank 1's assigned parameters stored on ``dev_0``,
+            ...
+        ``dev_1`` maps to a list containing:
+            rank 0's assigned parameters stored on ``dev_1``,
+            rank 1's assigned parameters stored on ``dev_1``,
+            ...
+        ...
+        """
+        assert self.parameters_as_bucket_view, (
+            "`_device_to_params_per_rank` should only be used if "
+            "`parameters_as_bucket_view=True`"
+        )
+        if len(self._device_to_params_per_rank_cache) == 0:
+            for rank, param_groups in enumerate(self._partition_parameters()):
+                for param_group in param_groups:
+                    for param in param_group["params"]:
+                        device = param.device
+                        if device not in self._device_to_params_per_rank_cache:
+                            self._device_to_params_per_rank_cache[device] = [
+                                [] for _ in range(self.world_size)
+                            ]
+                        self._device_to_params_per_rank_cache[device][rank].append(
+                            param
+                        )
+        return self._device_to_params_per_rank_cache
+
+    def _get_min_index(
+        self,
+        values: List[int],
+        disallowed_indices: Optional[Set[int]] = None,
+    ) -> int:
+        r"""
+        Return ``values.index(min(values))``, except it only uses one pass.
+
+        It also excludes any indices in ``disallowed_indices`` if provided.
+
+        Arguments:
+            values: (List[int]): :class:`list` of values.
+            disallowed_indices (Optional[Set[int]]): indices that are
+                disallowed from being the returned min index.
+        """
+        min_index = -1
+        min_value = float("inf")
+        for i, value in enumerate(values):
+            if disallowed_indices and i in disallowed_indices:
+                continue
+            if value < min_value:
+                min_value = value
+                min_index = i
+        assert min_index >= 0, "All indices are disallowed"
+        return min_index
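+
+    # Example (illustrative, not part of the upstream source): the index of the
+    # smallest value, skipping any disallowed indices, found in a single pass:
+    #
+    #     self._get_min_index([5, 2, 7])                          # -> 1
+    #     self._get_min_index([5, 2, 7], disallowed_indices={1})  # -> 0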
+
+    def _assign_bucket_subset_to_rank(
+        self,
+        bucket_index: int,
+        bucket_params: List[torch.Tensor],
+        bucket_offset: int,
+        assigned_rank: int,
+        assigned_ranks_per_bucket: List[Set[int]],
+    ) -> None:
+        r"""
+        Assign ``bucket_params`` to the rank with the least size assigned so far and collect relevant information.
+
+        The model parameters given by ``bucket_params`` represent a (possibly non-strict)
+        subset of the parameters corresponding to a :class:`DistributedDataParallel` bucket.
+
+        Arguments:
+            bucket_index (int): index of the :class:`DistributedDataParallel`
+                gradient bucket.
+            bucket_params (List[torch.Tensor]): subset of the parameters
+                corresponding to the bucket to assign.
+            bucket_offset (int): offset giving the index of the first element
+                in ``bucket_params`` in the bucket's full parameter list.
+            assigned_rank (int): group rank to assign to.
+            assigned_ranks_per_bucket (List[Set[int]]): :class:`set` of group ranks
+                assigned to each bucket.
+        """
+        overlap_info = self._overlap_info
+        if len(bucket_params) == 0:
+            raise ValueError("Empty bucket assignment")
+        params_per_rank = overlap_info.params_per_rank
+        offsets = overlap_info.offsets
+
+        self._bucket_assignments_per_rank_cache[assigned_rank][
+            bucket_index
+        ] = _DDPBucketAssignment(bucket_index, bucket_params, bucket_offset)
+        if self.global_rank == assigned_rank:
+            offsets[bucket_index] = len(params_per_rank[assigned_rank])
+        params_per_rank[assigned_rank].extend(bucket_params)
+        assigned_ranks_per_bucket[bucket_index].add(assigned_rank)
+        self._overlap_info.num_bucket_assignments += 1
+
+    @property
+    def _bucket_assignments_per_rank(self) -> List[Dict[int, _DDPBucketAssignment]]:
+        r"""
+        Return DDP bucket parameters assigned per rank.
+
+        :class:`list` of length world size consisting of :class:`dict` s
+        mapping bucket indices to :class:`_DDPBucketAssignment` s for each
+        rank.
+        """
+        assert self._overlap_with_ddp, (
+            "`_bucket_assignments_per_rank` only be used if `overlap_with_ddp=True`"
+        )
+        if len(self._bucket_assignments_per_rank_cache) > 0:
+            return self._bucket_assignments_per_rank_cache
+
+        overlap_info = self._overlap_info
+        assert overlap_info.status == _OverlapStatus.INITIALIZED
+
+        self._bucket_assignments_per_rank_cache = [{} for _ in range(self.world_size)]
+        params_per_bucket = overlap_info.params_per_bucket
+
+        if overlap_info.shard_buckets:
+            # Define the assignment threshold to approximate uniformity
+            assert overlap_info.total_size is not None, "`total_size` was not computed"
+            threshold = overlap_info.total_size / self.world_size  # type: ignore[operator]
+            size_per_rank = [0 for _ in range(self.world_size)]
+
+        num_buckets = len(params_per_bucket)
+        overlap_info.assigned_ranks_per_bucket = [set() for _ in range(num_buckets)]
+        assigned_ranks_per_bucket = overlap_info.assigned_ranks_per_bucket
+        if not overlap_info.shard_buckets:
+            # Assign each DDP bucket entirely to a single rank
+            for bucket_index, bucket_params in enumerate(params_per_bucket):
+                assert len(bucket_params) > 0, "Empty bucket"
+                assigned_rank = self._get_assigned_rank(bucket_index)
+                self._assign_bucket_subset_to_rank(
+                    bucket_index,
+                    bucket_params,
+                    0,
+                    assigned_rank,
+                    assigned_ranks_per_bucket,
+                )
+        else:
+            # Assign each DDP bucket to possibly multiple ranks
+            # Specifically, sort the DDP buckets by increasing size, and for
+            # each bucket, iteratively assign the maximal unassigned subset
+            # with size less than `threshold` to the rank with the least total
+            # size so far -- each such assignment is represented by a
+            # `_DDPBucketAssignment` instance and only contains parameters from
+            # a single DDP bucket
+            params_per_bucket_enum = sorted(
+                enumerate(params_per_bucket), key=lambda x: sum(p.numel() for p in x[1])
+            )
+            for bucket_index, bucket_params in params_per_bucket_enum:
+                assert len(bucket_params) > 0, "Empty bucket"
+                bucket_offset = 0
+                assignment_size = 0
+                for param_index, param in enumerate(bucket_params):
+                    param_numel = param.numel()
+                    if (
+                        assignment_size + param_numel >= threshold
+                        and param_index > bucket_offset
+                    ):
+                        assigned_rank = self._get_min_index(
+                            size_per_rank, assigned_ranks_per_bucket[bucket_index]
+                        )
+                        # Include up to but not including the parameter that
+                        # exceeded the threshold
+                        self._assign_bucket_subset_to_rank(
+                            bucket_index,
+                            bucket_params[bucket_offset:param_index],
+                            bucket_offset,
+                            assigned_rank,
+                            assigned_ranks_per_bucket,
+                        )
+                        size_per_rank[assigned_rank] += assignment_size
+                        bucket_offset = param_index
+                        assignment_size = 0
+                    assignment_size += param_numel
+                # Assign the remainder of the bucket so that no assignment
+                # spans across two buckets
+                assigned_rank = self._get_min_index(
+                    size_per_rank, assigned_ranks_per_bucket[bucket_index]
+                )
+                self._assign_bucket_subset_to_rank(
+                    bucket_index,
+                    bucket_params[bucket_offset:],
+                    bucket_offset,
+                    assigned_rank,
+                    assigned_ranks_per_bucket,
+                )
+                size_per_rank[assigned_rank] += assignment_size
+
+        return self._bucket_assignments_per_rank_cache
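+
+    # Illustrative sketch of the sharded assignment above (not part of the
+    # upstream source): with world size 2, `total_size=20` (so the threshold is
+    # 10), and a single DDP bucket whose parameters have sizes [6, 8], reaching
+    # 6 + 8 >= 10 assigns the size-6 parameter to the least-loaded rank, and
+    # the remaining size-8 parameter is then assigned to the other rank, so no
+    # assignment spans two buckets.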
+
+    def _local_step(
+        self,
+        gradients: Optional[List[Optional[torch.Tensor]]] = None,
+        closure: Optional[Callable[[], float]] = None,
+        **kwargs: Any,
+    ) -> Optional[float]:
+        r"""
+        Perform a single optimizer step without syncing parameters across ranks.
+
+        Arguments:
+            gradients (list[Optional[torch.Tensor]], optional): a :class:`list`
+                of length equal to the number of parameters assigned to this
+                rank containing gradient tensors or ``None`` as its elements;
+                a ``None`` in the :class:`list` indicates that the
+                corresponding parameter should not be updated.
+                If the argument itself is ``None``, then all parameters are
+                updated, and the gradients are assumed to be already populated.
+                (default: ``None``)
+            closure (Callable): a closure that re-evaluates the model and
+                returns the loss; optional for most optimizers and should be
+                ``None`` if ``gradients`` is not ``None``; (default: ``None``)
+        Returns:
+            Optional loss depending on the underlying local optimizer.
+
+        .. warning::
+            The argument ``gradients`` should only be specified (i.e. not
+            ``None``) if ``overlap_with_ddp=True``, in which case
+            :class:`ZeroRedundancyOptimizer` wraps a functional optimizer.
+        """
+        Join.notify_join_context(self)
+        # Check if the model trainability has changed
+        is_trainable_mask = self._get_is_trainable_mask()
+        if is_trainable_mask != self._is_trainable_mask:
+            if self._overlap_with_ddp:
+                raise RuntimeError(
+                    "ZeroRedundancyOptimizer with `overlap_with_ddp=True` "
+                    "does not support changing parameter trainability at run "
+                    "time"
+                )
+            logger.warning(
+                "ZeroRedundancyOptimizer detected that the trainable "
+                "parameters changed; rebuilding the parameter buckets if "
+                "enabled"
+            )
+            self._build_param_buckets()
+            self._is_trainable_mask = is_trainable_mask
+
+        # Sync the exposed `param_groups` attributes to the local optimizer in
+        # case they have been updated
+        self._sync_param_groups(self.param_groups, self.optim.param_groups)
+
+        # Run the optimizer step on this shard only
+        if gradients is None:
+            loss = (
+                self.optim.step(**kwargs)
+                if closure is None
+                else self.optim.step(closure=closure, **kwargs)
+            )
+        else:
+            assert self._overlap_with_ddp, (
+                "Specifying `gradients` should not "
+                "be used when `overlap_with_ddp=False`"
+            )
+            assert closure is None, (
+                "`closure` is not supported when using a local functional optimizer"
+            )
+            loss = self.optim.step(gradients=gradients)
+
+        # Sync any updated attributes in the local optimizer to the exposed
+        # `param_groups`
+        self._sync_param_groups(self.optim.param_groups, self.param_groups)
+
+        return loss
+
+    def step(
+        self,
+        closure: Optional[Callable[[], float]] = None,
+        **kwargs: Any,
+    ) -> Optional[float]:
+        r"""
+        Perform a single optimizer step and sync parameters across all ranks.
+
+        Arguments:
+            closure (Callable): a closure that re-evaluates the model and
+                returns the loss; optional for most optimizers.
+        Returns:
+            Optional loss depending on the underlying local optimizer.
+
+        .. note:: Any extra parameters are passed to the base optimizer as-is.
+        """
+        if self._overlap_with_ddp:
+            logger.warning(
+                "`step()` should not be included in the training loop when "
+                "`overlap_with_ddp=True`"
+            )
+            return None
+
+        # Perform the local optimizer step
+        loss = self._local_step(closure=closure, **kwargs)
+
+        # Sync all of the updated parameter shards across the ranks
+        self._sync_params()
+
+        return loss
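+
+    # Usage sketch for `step()` (illustrative, not part of the upstream
+    # source). Assumes the default process group has already been initialized
+    # via `torch.distributed.init_process_group` and that `model` is an
+    # `nn.Module` replicated on every rank:
+    #
+    #     opt = ZeroRedundancyOptimizer(
+    #         model.parameters(),
+    #         optimizer_class=torch.optim.SGD,
+    #         lr=0.01,
+    #     )
+    #     loss = model(inputs).sum()
+    #     loss.backward()
+    #     opt.step()  # local shard update, then cross-rank parameter sync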
+
+    def join_hook(self, **kwargs):
+        r"""
+        Return the ZeRO join hook.
+
+        It enables training on uneven inputs by
+        shadowing the collective communications in the optimizer step.
+
+        Gradients must be properly set before this hook is called.
+
+        Arguments:
+            kwargs (dict): a :class:`dict` containing any keyword arguments
+                to modify the behavior of the join hook at run time; all
+                :class:`Joinable` instances sharing the same join context
+                manager are forwarded the same value for ``kwargs``.
+
+        This hook does not support any keyword arguments; i.e. ``kwargs`` is
+        unused.
+        """
+        return _ZeROJoinHook(self)
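+
+    # Usage sketch (illustrative, not part of the upstream source): the join
+    # hook is normally consumed through the `Join` context manager together
+    # with a DDP-wrapped model and this optimizer, e.g.
+    #
+    #     from torch.distributed.algorithms.join import Join
+    #
+    #     with Join([ddp_model, zero_optimizer]):
+    #         for inputs in uneven_data_loader:
+    #             ddp_model(inputs).sum().backward()
+    #             zero_optimizer.step()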
+
+    @property
+    def join_device(self) -> torch.device:
+        r"""Return default device."""
+        return self._default_device
+
+    @property
+    def join_process_group(self) -> Any:
+        r"""Return process group."""
+        return self.process_group
+
+    def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
+        r"""
+        Load the state pertaining to the given rank from the input ``state_dict``, updating the local optimizer as needed.
+
+        Arguments:
+            state_dict (dict): optimizer state; should be an object returned
+                from a call to :meth:`state_dict`.
+
+        Raises:
+            RuntimeError: if ``overlap_with_ddp=True`` and this method is
+                called before this :class:`ZeroRedundancyOptimizer` instance
+                has been fully initialized, which happens once
+                :class:`DistributedDataParallel` gradient buckets have been
+                rebuilt.
+        """
+        self._check_overlap_initialized()
+
+        for index, value in state_dict["state"].items():
+            param = self._index_to_param[index]
+            if self._param_to_rank[param] != self.rank:
+                # Clear any state irrelevant to this rank
+                state_dict["state"][index] = None
+            else:
+                # Load the parameter state to the local optimizer
+                self.optim.state[param] = _recursive_copy_to_device(
+                    value, non_blocking=True, device=param.device
+                )
+                # Force zero-dimensional tensors (like Adam "step") on CPU
+                for state_name, state_value in self.optim.state[param].items():
+                    if torch.is_tensor(state_value) and state_value.dim() == 0:
+                        self.optim.state[param][state_name] = state_value.cpu()
+
+        super().load_state_dict(state_dict)
+
+        # Sync the input state with the exposed and local optimizer states
+        self._sync_param_groups(state_dict["param_groups"], self.param_groups)
+        self._sync_param_groups(self.param_groups, self.optim.param_groups)
+
+    def state_dict(self) -> Dict[str, Any]:
+        r"""
+        Return the last global optimizer state known to this rank.
+
+        .. warning::
+            If the state has not been consolidated to this rank, this raises a
+            runtime error, and even if it has, the state may not be up-to-date,
+            depending on when :meth:`consolidate_state_dict` was last called.
+
+        Raises:
+            RuntimeError: if ``overlap_with_ddp=True`` and this method is
+                called before this :class:`ZeroRedundancyOptimizer` instance
+                has been fully initialized, which happens once
+                :class:`DistributedDataParallel` gradient buckets have been
+                rebuilt; or if this method is called without a preceding call
+                to :meth:`consolidate_state_dict`.
+        """
+        self._check_overlap_initialized()
+
+        if len(self._all_state_dicts) == 0:
+            raise RuntimeError(
+                "Optimizer state has not been consolidated on this rank. "
+                f"Please call `consolidate_state_dict(to={self.rank})` on "
+                "all ranks beforehand if you meant to save the global state."
+            )
+
+        # Get the possibly-stale global optimizer state that uses global
+        # parameter indexing
+        state_dict = super().state_dict()
+
+        # Update the global optimizer state with local state information,
+        # factoring in the translation from local to global indexing
+        for rank, local_state_dict in enumerate(self._all_state_dicts):
+            local_param_groups = local_state_dict["param_groups"]
+            global_param_groups = self._partition_parameters()[rank]
+            assert len(local_param_groups) == len(
+                global_param_groups
+            ), "Mismatch between number of local and global parameter groups"
+
+            for local_param_group, global_param_group in zip(
+                local_param_groups, global_param_groups
+            ):
+                # `local_param_group` stores local indices, while
+                # `global_param_group` stores the tensors directly
+                local_param_indices = local_param_group["params"]
+                global_params = global_param_group["params"]
+
+                assert len(local_param_indices) == len(
+                    global_params
+                ), "Mismatch between number of local and global parameters in parameter group"
+                for local_param_index, global_param in zip(
+                    local_param_indices, global_params
+                ):
+                    # Update the global parameter state, if any
+                    if local_param_index in local_state_dict["state"]:
+                        global_param_index = self._param_to_index[global_param]
+                        state_dict["state"][global_param_index] = local_state_dict[
+                            "state"
+                        ][local_param_index]
+
+        # Sort the parameters in the state
+        state_dict["state"] = dict(sorted(state_dict["state"].items()))
+        return state_dict
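+
+    # Checkpointing sketch (illustrative, not part of the upstream source):
+    # the global state must first be consolidated onto the saving rank, e.g.
+    #
+    #     opt.consolidate_state_dict(to=0)   # collective: call on all ranks
+    #     if torch.distributed.get_rank() == 0:
+    #         torch.save(opt.state_dict(), "zero_ckpt.pt")
+    #
+    #     # later, on every rank:
+    #     opt.load_state_dict(torch.load("zero_ckpt.pt"))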
+
+    @staticmethod
+    def _sync_param_groups(
+        src_param_groups: List[Dict[Any, Any]],
+        dst_param_groups: List[Dict[Any, Any]],
+    ) -> None:
+        r"""
+        Sync the attributes from the source parameter groups to the destination parameter groups.
+
+        Example attributes include learning rate or scheduler attributes. The
+        two parameter group lists should have the same length (i.e., the same
+        number of parameter groups).
+
+        Arguments:
+            src_param_groups (list[dict]): parameter groups giving the
+                attribute settings to copy.
+            dst_param_groups (list[dict]): parameter groups giving the
+                attribute settings to set.
+        """
+        assert len(src_param_groups) == len(
+            dst_param_groups
+        ), "Mismatch between number of source and destination parameter groups"
+        for src_param_group, dst_param_group in zip(src_param_groups, dst_param_groups):
+            # Sync all attributes except the parameters
+            for attr in filter(lambda x: x != "params", src_param_group.keys()):
+                dst_param_group[attr] = src_param_group[attr]
+
+    def _build_param_buckets(self) -> None:
+        r"""
+        Build parameter buckets if ``parameters_as_bucket_view=True``.
+
+        For each device that stores this rank's parameters, there is a
+        bucket (represented as a tensor) containing all of the parameters on
+        that device that are assigned to a given rank in the parameter update
+        partition.
+
+        This method is called in the constructor and any time parameter
+        trainability is changed.
+
+        .. warning::
+            The current implementation assumes that all of the parameters in a
+            bucket are of the same dense type when allocating the bucket's
+            tensor.
+
+        .. warning::
+            If the model parameters are stored across more than one device,
+            then the storage partitioning must be the same across all
+            processes in order for parameter synchronization to work.
+        """
+        if not self.parameters_as_bucket_view or self._overlap_with_ddp:
+            return
+
+        # `self._buckets[i][j]` are the parameters stored on device i and
+        # assigned to rank j
+        num_devices = len(self._device_to_params_per_rank)
+        self._buckets = [[] for _ in range(num_devices)]  # type: ignore[assignment]
+
+        for dev_i, (device, params_per_rank) in enumerate(
+            self._device_to_params_per_rank.items()
+        ):
+            for params in params_per_rank:
+                bucket_size = 0
+                dtype = None
+                trainable_params = []
+                for param in params:
+                    if not _is_trainable(param):
+                        # Clone in case the parameter was previously part of
+                        # a bucket, to prevent its data from being destroyed
+                        param.data = param.data.detach().clone()
+                    else:
+                        bucket_size += param.numel()
+                        trainable_params.append(param)
+                    dtype = param.dtype  # assumes all same dtype
+
+                if bucket_size == 0:
+                    # Create a dummy bucket if there are no parameters
+                    bucket = torch.zeros(1, device=device)
+                else:
+                    # Construct the bucket (assuming all dense and same dtype)
+                    bucket = torch.empty(bucket_size, dtype=dtype, device=device)
+                    offset = 0
+                    for param in trainable_params:
+                        offset_next = offset + param.numel()
+                        bucket[offset:offset_next].copy_(param.data.flatten())
+                        param.data = bucket[offset:offset_next].view_as(param.data)
+                        offset = offset_next
+                self._buckets[dev_i].append(bucket)  # type: ignore[arg-type]
+
+    def _build_ddp_param_buckets(self) -> None:
+        r"""
+        Build the DDP buckets with parameters assigned to this rank.
+
+        For each DDP bucket with parameters assigned to this rank, this method flattens the
+        data of those parameters into a single tensor and saves the tensor to
+        the ``tensor`` attribute in the corresponding
+        :class:`_DDPBucketAssignment` instance stored in
+        ``self._bucket_assignments_per_rank``.
+
+        :class:`DistributedDataParallel` guarantees that the parameters
+        corresponding to a gradient bucket have the same device and the same
+        dtype.
+        """
+        for bucket_assignments in self._bucket_assignments_per_rank:
+            for bucket_assignment in bucket_assignments.values():
+                params = bucket_assignment.parameters
+                bucket_size = 0
+                dtype = None
+                for param in params:
+                    assert _is_trainable(param), (
+                        "Model parameter "
+                        "corresponding to a gradient in a DDP bucket should "
+                        "require a gradient"
+                    )
+                    bucket_size += param.numel()
+                    dtype = param.dtype  # assumes all same dtype
+                assert bucket_size > 0, "Empty bucket"
+
+                # Construct the bucket tensor (assuming all dense and same dtype)
+                tensor = torch.empty(
+                    bucket_size, dtype=dtype, device=bucket_assignment.device
+                )
+                offset = 0
+                for param in params:
+                    offset_next = offset + param.numel()
+                    tensor[offset:offset_next].copy_(param.data.flatten())
+                    param.data = tensor[offset:offset_next].view_as(param.data)
+                    offset = offset_next
+                bucket_assignment.tensor = tensor
+
+    def _verify_and_init_params(
+        self,
+        params: Any,
+    ) -> Union[List[torch.Tensor], List[dict]]:
+        r"""
+        Verify the type of ``params`` and initialize ``self._all_params`` as a :class:`list` of all parameters.
+
+        The initialization first makes sure that the provided ``params`` is valid.
+
+        Arguments:
+            params (Any): Candidate parameter list or parameter groups to verify.
+
+        Raises:
+            TypeError: ``params`` has an invalid type.
+            ValueError: ``params`` is empty.
+
+        Returns:
+            The persistent form of ``params`` to be passed into the parent
+            :class:`Optimizer` constructor -- i.e. returns ``params`` as a
+            :class:`list` to ensure that it can be iterated over again.
+        """
+        if isinstance(params, torch.Tensor):
+            raise TypeError(
+                "`params` argument should be an iterable of "
+                f"Tensors, but got {torch.typename(params)}"
+            )
+        try:
+            all_params = list(params)
+        except TypeError as e:
+            raise TypeError(
+                "`params` argument should be an iterable of Tensors"
+                f" or dicts, but got {torch.typename(params)}"
+            ) from e
+        if len(all_params) == 0:
+            raise ValueError("ZeroRedundancyOptimizer got an empty parameter list")
+        all_tensors = True
+        all_dicts = True
+        for param in all_params:
+            all_tensors &= isinstance(param, torch.Tensor)
+            all_dicts &= isinstance(param, dict)
+        if not all_tensors and not all_dicts:
+            raise TypeError(
+                "`params` argument should be an iterable of Tensors or dicts"
+            )
+        # Ensure that `self._all_params` contains a list of all parameters
+        if all_tensors:
+            self._all_params = all_params
+        elif all_dicts:
+            self._all_params = []
+            # `all_params` contains parameter groups (not parameters)
+            for param_group in all_params:
+                if "params" not in param_group:
+                    raise ValueError(
+                        "Each parameter group passed-in via `params` must "
+                        "have a 'params' key mapping to the parameters in "
+                        "the group"
+                    )
+                self._all_params.extend(param_group["params"])
+        return all_params
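+
+    # Illustrative sketch (not part of the upstream source) of the two accepted
+    # forms of `params`, assuming a hypothetical model with `head` and `body`
+    # submodules:
+    #
+    #     # 1. an iterable of tensors
+    #     ZeroRedundancyOptimizer(model.parameters(),
+    #                             optimizer_class=torch.optim.SGD, lr=0.01)
+    #
+    #     # 2. an iterable of parameter-group dicts, each with a "params" key
+    #     ZeroRedundancyOptimizer(
+    #         [{"params": model.head.parameters(), "lr": 0.1},
+    #          {"params": model.body.parameters(), "lr": 0.01}],
+    #         optimizer_class=torch.optim.SGD,
+    #     )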
+
+    def _verify_same_dense_param_type(self) -> None:
+        r"""
+        Verify that all parameters are of the same dense type.
+
+        The method assumes that ``self._all_params`` has been initialized
+        and is non-empty.
+
+        Raises:
+            ValueError: ``params`` contains sparse parameters or parameters
+            of varying dense types.
+
+        NOTE: This method can be removed once support for sparse parameters
+        and varying parameter types is added.
+        """
+        typename = torch.typename(self._all_params[0])
+        if self._all_params[0].is_sparse:
+            raise ValueError(
+                "ZeroRedundancyOptimizer only supports using "
+                "the same dense type for all parameters but got "
+                f"{typename}"
+            )
+        for param in self._all_params[1:]:
+            other_typename = torch.typename(param)
+            if other_typename != typename:
+                raise ValueError(
+                    "ZeroRedundancyOptimizer only supports "
+                    "using the same dense type for all "
+                    f"parameters but got both {typename} and "
+                    f"{other_typename}"
+                )
+
+    def _get_is_trainable_mask(self) -> List[bool]:
+        r"""Return a boolean mask indicating if each parameter is trainable (``requires_grad``) or not."""
+        return list(map(_is_trainable, self._all_params))
+
+    def _init_local_optimizer(self) -> None:
+        r"""
+        Initialize this rank's local optimizer, responsible for its subset of the parameters.
+
+        The local optimizer is saved in ``self.optim``.
+        """
+        assert (
+            self._optim_constructor is not None
+        ), "The local optimizer class has not been set"
+
+        param_groups = self._partition_parameters()[self.rank]
+        # `overlap_with_ddp=True` requires a local functional optimizer
+        if self._overlap_with_ddp:
+            # Functional optimizers only support a single parameter group and
+            # require passing in the parameters as a list
+            assert len(param_groups) == 1, (
+                "Initializing the local "
+                "functional optimizer with more than one parameter group"
+            )
+            params = param_groups[0]["params"]
+            # Try to pass `_allow_empty_param_list=True` to avoid erroring
+            if (
+                "_allow_empty_param_list"
+                in inspect.signature(self._optim_constructor).parameters
+            ):
+                self.optim: Any = self._optim_constructor(
+                    params, **self._optim_defaults, _allow_empty_param_list=True
+                )
+            else:
+                logger.warning(
+                    "%s does not support the argument "
+                    "`_allow_empty_param_list`; ZeroRedundancyOptimizer may "
+                    "error due to an empty parameter list",
+                    self._optim_constructor
+                )
+                self.optim: Any = self._optim_constructor(params, **self._optim_defaults)  # type: ignore[no-redef]
+
+            # Log information about the DDP and ZeRO bucketing
+            if dist.get_debug_level() != dist.DebugLevel.OFF:
+                local_numel = sum(p.numel() for p in params)
+                num_assigned_buckets = len(
+                    self._bucket_assignments_per_rank[self.global_rank]
+                )
+                logger.info(
+                    "rank %s with %s parameters "
+                    "across %s buckets",
+                    self.global_rank, local_numel, num_assigned_buckets
+                )
+                if self.global_rank == 0:
+                    logger.info(
+                        "%s DDP "
+                        "buckets and "
+                        "%s bucket "
+                        "assignments",
+                        len(self._overlap_info.params_per_bucket), self._overlap_info.num_bucket_assignments
+                    )
+        else:
+            # NOTE: Passing `param_groups` into the local optimizer constructor
+            # bypasses the empty parameter list check
+            self.optim: Optimizer = self._optim_constructor(param_groups, **self._optim_defaults)  # type: ignore[no-redef]
+
+        # TODO: Manually add `self.param_groups` if using a functional
+        # optimizer; remove this if/when the functional optimizers support
+        # multiple parameter groups
+        if self._overlap_with_ddp and not hasattr(self.optim, "param_groups"):
+            assert hasattr(self.optim, "param_group"), (
+                "The functional optimizer should set at least one of the "
+                "attributes `param_group` or `param_groups`"
+            )
+            self.optim.param_groups = [self.optim.param_group]  # type: ignore[attr-defined]
+
+        self._sync_param_groups(self.optim.param_groups, self.param_groups)
+
+    def _init_zero_for_overlap(self) -> None:
+        r"""Perform a delayed initialization of the local optimizer and the supporting data structures."""
+        assert self._overlap_with_ddp, (
+            "`_init_zero_for_overlap()` should only be called when "
+            "`overlap_with_ddp=True`"
+        )
+        self._overlap_info.status = _OverlapStatus.INITIALIZED
+        self._clear_cache()
+        self._partition_parameters(self._overlap_info.params_per_rank)
+        self._build_ddp_param_buckets()
+        self._init_local_optimizer()
+
+    def _get_assigned_rank(self, bucket_index: int) -> int:
+        r"""
+        Return the single rank assigned to a :class:`DistributedDataParallel` gradient bucket.
+
+        Arguments:
+            bucket_index (int): index of the :class:`DistributedDataParallel`
+                bucket for which to get the assigned rank.
+        """
+        assert not self._overlap_info.shard_buckets, (
+            "The bucket assignment requires global bucket information and "
+            "will be computed later; there should be no need to use this "
+            "method"
+        )
+        return bucket_index % self.world_size
+
+    def _check_overlap_initialized(self):
+        r"""
+        Check the delayed initialization depending on the value of ``overlap_with_ddp``.
+
+        The delayed initialization has occurred (see
+        :meth:`_init_zero_for_overlap`) if ``overlap_with_ddp=True``, and
+        raises a ``RuntimeError`` if not. This should preface methods that
+        should not be run before that delayed initialization.
+
+        Raises:
+            RuntimeError: if ``overlap_with_ddp=True`` and
+                :meth:`_init_zero_for_overlap` has not been called.
+        """
+        if (
+            self._overlap_with_ddp
+            and self._overlap_info.status != _OverlapStatus.INITIALIZED
+        ):
+            raise RuntimeError(
+                "This method should not be called until this "
+                "ZeroRedundancyOptimizer instance has been fully "
+                "initialized"
+            )
+
+    def _get_optimizer_constructor(self, optimizer_class: Any) -> Any:
+        r"""
+        Return the optimizer constructor, validating and transforming it depending on ``overlap_with_ddp``.
+
+        Returns:
+            - ``optimizer_class`` if ``overlap_with_ddp=False`` and
+                ``optimizer_class`` is not a functional optimizer.
+            - ``optimizer_class`` if ``overlap_with_ddp=True`` and
+                ``optimizer_class`` is already a functional optimizer.
+            - The functional equivalent of ``optimizer_class`` if
+                ``overlap_with_ddp=True`` and ``optimizer_class`` is not
+                already a functional optimizer (assuming the equivalent
+                exists).
+
+        Raises:
+            ValueError:
+
+                - if ``overlap_with_ddp=True`` but ``optimizer_class`` is
+                    neither a functional optimizer nor translatable to a
+                    functional optimizer.
+                - if ``overlap_with_ddp=False`` and ``optimizer_class`` is a
+                    functional optimizer.
+        """
+        functional_optims = functional_optim_map.values()
+        if not self._overlap_with_ddp:
+            if optimizer_class in functional_optims:
+                # Using a functional optimizer is only supported when
+                # `overlap_with_ddp=True`
+                raise ValueError(
+                    f"Passing in a functional optimizer {optimizer_class} "
+                    "when `overlap_with_ddp=False`"
+                )
+            else:
+                return optimizer_class
+        else:
+            if optimizer_class in functional_optims:
+                # Already a functional optimizer
+                return optimizer_class
+            elif optimizer_class in functional_optim_map:
+                # Translate the passed-in optimizer class to its functional
+                # equivalent if `overlap_with_ddp=True`
+                optim_constructor = functional_optim_map[optimizer_class]
+                logger.info(
+                    "Using the functional optimizer %s "
+                    "instead of %s since "
+                    "`overlap_with_ddp=True`",
+                    optim_constructor, optimizer_class
+                )
+                return optim_constructor
+            else:
+                raise ValueError(
+                    "Using `ddp_with_overlap=True` requires using a "
+                    "functional optimizer, but there is no supported functional "
+                    f"optimizer equivalent for {optimizer_class}"
+                )
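+
+    # Illustrative sketch (not part of the upstream source) of the translation
+    # above, assuming `functional_optim_map` maps `torch.optim.SGD` to a
+    # functional counterpart:
+    #
+    #     zero = ZeroRedundancyOptimizer(
+    #         model.parameters(),
+    #         optimizer_class=torch.optim.SGD,
+    #         overlap_with_ddp=True,
+    #         lr=0.01,
+    #     )
+    #     # internally, `_get_optimizer_constructor(torch.optim.SGD)` returns
+    #     # the functional class, which is then used to build the local optimizer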
diff --git a/MLPY/Lib/site-packages/torch/distributed/optim/zero_redundancy_optimizer.pyi b/MLPY/Lib/site-packages/torch/distributed/optim/zero_redundancy_optimizer.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..22c434bb4fddb21d4780aa80c51aacff5bde08ee
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/optim/zero_redundancy_optimizer.pyi
@@ -0,0 +1,83 @@
+import enum
+from typing import Any, Callable, Dict, List, Optional, overload, Set, Type
+
+import torch
+from torch.distributed.algorithms.join import Joinable, JoinHook
+from torch.optim import Optimizer
+
+class _ZeROJoinHook(JoinHook):
+    zero: Any = ...
+    def __init__(self, zero: Any) -> None: ...
+    def main_hook(self) -> None: ...
+
+class _DDPBucketAssignment:
+    bucket_index: int
+    parameters: List[torch.Tensor]
+    offset: int
+    device: torch.device
+    tensor: Optional[torch.Tensor]
+
+class _OverlapStatus(enum.IntEnum):
+    UNINITIALIZED: int = ...
+    DDP_HAS_REBUILT_BUCKETS: int = ...
+    INITIALIZED: int = ...
+
+class _OverlapInfo:
+    status: Any = ...
+    params_per_bucket: Any = ...
+    params_per_rank: Any = ...
+    offsets: Any = ...
+    broadcast_handles: Any = ...
+    bucket_index_to_future: Any = ...
+    bucket_index_to_bucket: Any = ...
+    bucket_indices_seen: Any = ...
+    assigned_ranks_per_bucket: List[Set[int]] = ...
+    total_size: int = ...
+    shard_buckets: bool = ...
+    def __init__(self) -> None: ...
+    def wait_for_broadcasts(self) -> None: ...
+    def clear_per_iter_info(self) -> None: ...
+
+class ZeroRedundancyOptimizer(Optimizer, Joinable):
+    functional_optim_map: Any = ...
+    initialized: bool = ...
+    process_group: Any = ...
+    world_size: int = ...
+    rank: int = ...
+    global_rank: int = ...
+    parameters_as_bucket_view: bool = ...
+    optim: Any = ...
+    _device_to_device_index: Dict[torch.device, int] = ...
+    _overlap_with_ddp: bool = ...
+    _overlap_info: _OverlapInfo = ...
+    _buckets: List[List[torch.Tensor]] = ...
+    _bucket_assignments_per_rank: List[Dict[int, _DDPBucketAssignment]] = ...
+    def __init__(
+        self,
+        params: Any,
+        optimizer_class: Type[Optimizer],
+        process_group: Optional[Any] = ...,
+        parameters_as_bucket_view: bool = ...,
+        overlap_with_ddp: bool = ...,
+        **defaults: Any,
+    ) -> None: ...
+    def add_param_group(self, param_group: Dict[str, Any]) -> None: ...
+    def consolidate_state_dict(self, to: int = ...) -> None: ...
+    @overload
+    def step(self, closure: None = ..., **kwargs: Any) -> None: ...
+    @overload
+    def step(self, closure: Callable[[], float], **kwargs: Any) -> float: ...
+    def load_state_dict(self, state_dict: Dict[str, Any]) -> None: ...
+    def state_dict(self) -> Dict[str, Any]: ...
+    def _local_step(
+        self,
+        gradients: Optional[List[Optional[torch.Tensor]]] = None,
+        closure: Optional[Callable[[], float]] = None,
+        **kwargs: Any,
+    ) -> Optional[float]: ...
+    def _get_assigned_rank(self, bucket_index: int) -> int: ...
+    def _init_zero_for_overlap(self) -> None: ...
+    def join_hook(self, **kwargs): ...
+    @property
+    def join_device(self) -> torch.device: ...
+    @property
+    def join_process_group(self) -> Any: ...
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/__init__.py b/MLPY/Lib/site-packages/torch/distributed/pipeline/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f62634d1e9258f7d0d2f9e27504356ccea03b4d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/pipeline/__init__.py
@@ -0,0 +1,7 @@
+import warnings
+warnings.warn(
+    "torch.distributed.pipeline is deprecated. For up-to-date pipeline parallel "
+    "implementation, please refer to the PiPPy library under the PyTorch "
+    "organization (Pipeline Parallelism for PyTorch): "
+    "https://github.com/pytorch/PiPPy"
+)
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/pipeline/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d890ae88b2f29f8e4a1155fa2c1852efb41ed0bd
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/pipeline/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__init__.py b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e05daf288010e98488a99cccd375a5d5cea3784b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__init__.py
@@ -0,0 +1,12 @@
+# Copyright 2019 Kakao Brain
+#
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+"""A Pipe implementation in PyTorch."""
+from .checkpoint import is_checkpointing, is_recomputing
+from .pipe import Pipe, WithDevice
+from .microbatch import NoChunk
+
+__all__ = ["Pipe", "is_checkpointing", "is_recomputing"]
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9d46aad5319d519cd5c12c285eb4608fb94c4ce4
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/batchnorm.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/batchnorm.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d961a8d85c8cfc57840501c7920daf11be3fd1a0
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/batchnorm.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/checkpoint.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/checkpoint.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a008c682a0204e827e0ae2563da5c085055a0312
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/checkpoint.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/copy.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/copy.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..46e7b4faf68f815f298fbec54fba9399f2afcaf6
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/copy.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/dependency.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/dependency.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bc0a5b7e1335d541182e0ef3445c78818b761df3
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/dependency.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/microbatch.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/microbatch.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b3c0b2e0ec2b6733b885f716a9c44564b4a481ef
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/microbatch.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/phony.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/phony.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eb588cff3a0721eafabcb551517ed11f0efeb10b
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/phony.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/pipe.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/pipe.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9fb761651b69025870faa893ea0e6c7e1d069247
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/pipe.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/pipeline.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/pipeline.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e28957f87e633df05438159435226c7d64edc329
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/pipeline.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/stream.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/stream.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e0164d127975a53cd89be3989d42bf33032bd46e
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/stream.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..64745d401c75b5a02fb8d9c40fe0bb924c9b0772
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/worker.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/worker.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..28e57244590f30929dc1afd8bdb9e1a7726d8cad
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/__pycache__/worker.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/_balance/__init__.py b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/_balance/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3204f5e93573ba4cd23d24aa1a14dc9bca3cf1a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/_balance/__init__.py
@@ -0,0 +1,164 @@
+# Copyright 2019 Kakao Brain
+#
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+"""A helper to roughly balance a sequential module.
+
+Usage::
+
+    import torch
+    from torch.distributed.pipeline.sync import Pipe
+    from torch.distributed.pipeline.sync.balance import balance_by_time
+
+    sample = torch.empty(128, 3, 224, 224)
+    balance = balance_by_time(torch.cuda.device_count(), model, sample)
+
+    pipe = Pipe(model, balance, chunks=8)
+
+"""
+from typing import Any, List, Union, Sequence
+
+import torch
+from torch import Tensor
+import torch.nn as nn
+
+from . import blockpartition
+from .profile import profile_sizes, profile_times
+
+__all__ = ["balance_by_time", "balance_by_size"]
+
+
+Device = Union[torch.device, int, str]
+
+Tensors = Sequence[Tensor]
+TensorOrTensors = Union[Tensor, Tensors]
+
+
+def balance_cost(cost: List[int], partitions: int) -> List[int]:
+    partitioned = blockpartition.solve(cost, partitions)
+    return [len(p) for p in partitioned]
+
+
+def balance_by_time(
+    partitions: int,
+    module: nn.Sequential,
+    sample: Union[List[Any], Tensor],
+    *,
+    timeout: float = 1.0,
+    device: Device = torch.device("cuda"),
+) -> List[int]:
+    """Naive automatic balancing by elapsed time per layer.
+    ::
+
+        sample = torch.empty(128, 3, 224, 224)
+        balance = balance_by_time(torch.cuda.device_count(), model, sample)
+        pipe = Pipe(model, balance, chunks=8)
+
+    Args:
+        partitions (int):
+            intended number of partitions
+        module (torch.nn.Sequential):
+            sequential module to be partitioned
+        sample (torch.Tensor):
+            example input with arbitrary batch size
+
+    Keyword Args:
+        timeout (float):
+            profiling iterates again if the timeout (in seconds) is not exceeded
+            (default: ``1.0``)
+        device ('cpu' or 'cuda' device):
+            CPU or CUDA device where each layer is profiled (default: the
+            current CUDA device)
+
+    Returns:
+        A list of the number of layers in each partition. Use it for the `balance`
+        parameter of :class:`~torchpipe.Pipe`.
+
+    .. note::
+        `module` and `sample` must be placed on the same device.
+
+    """
+    times = profile_times(module, sample, timeout, torch.device(device))
+    return balance_cost(times, partitions)
+
+
+def balance_by_size(
+    partitions: int,
+    module: nn.Sequential,
+    input: Union[List[Any], Tensor],
+    *,
+    chunks: int = 1,
+    param_scale: float = 2.0,
+    device: Device = torch.device("cuda"),
+) -> List[int]:
+    """Naive automatic balancing by CUDA memory usage per layer.
+
+    During training, required memory for parameters depends on which optimizer
+    is used. Optimizers may use buffers for each parameter to track
+    optimization statistics internally, such as momentum buffer in SGD.
+
+    To get a more reliable size-based balance, you should specify `param_scale`
+    with regard to your optimizer. The default `param_scale` is 2 instead of 1
+    due to gradient accumulation, which is necessary for every optimizer.
+
+    Follow this guide to choose correct `param_scale` for typical optimizers:
+
+    =========  =============  =========================================
+    Optimizer  `param_scale`  Internal State
+    =========  =============  =========================================
+    SGD        2--3           (momentum_buffer)
+    Adam       4--5           exp_avg, exp_avg_sq, (max_exp_avg_sq)
+    Adadelta   4              square_avg, acc_delta
+    Adagrad    3              sum
+    RMSprop    3--5           square_avg, (momentum_buffer), (grad_avg)
+    =========  =============  =========================================
+
+    Here's a simple example with the Adam optimizer::
+
+        balance = balance_by_size(
+            torch.cuda.device_count(),
+            model,
+
+            # Same size with mini-batch to train
+            torch.empty(1024, 3, 224, 224),
+
+            # Number of micro-batches to train with Pipe
+            chunks=8,
+
+            # 4 for Adam
+            param_scale=4.0,
+        )
+
+        pipe = Pipe(model, balance, chunks=8)
+        adam = Adam(pipe.parameters())
+
+    Args:
+        partitions (int):
+            intended number of partitions
+        module (torch.nn.Sequential):
+            sequential module to be partitioned
+        input (torch.Tensor):
+            example mini-batch with the same size to train
+
+    Keyword Args:
+        chunks (int):
+            number of micro-batches that will be used to train (default: ``1``)
+        param_scale (float):
+            how many copies of the parameters are allocated for training; this
+            depends on the optimizer. See the guide above. (default: ``2.0``)
+        device ('cuda' device):
+            CUDA device where each layer is profiled (default: the current CUDA
+            device)
+
+    Returns:
+        A list of the number of layers in each partition. Use it for the `balance`
+        parameter of :class:`~torchpipe.Pipe`.
+
+    .. note::
+        `module` and `input` must be placed on the same CUDA device.
+
+    """
+    sizes = profile_sizes(module, input, chunks, param_scale, torch.device(device))
+    return balance_cost(sizes, partitions)
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/_balance/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/_balance/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c07fe92e67695ff44d1ab05cc2a8ba4b0f1852a5
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/_balance/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/_balance/__pycache__/blockpartition.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/_balance/__pycache__/blockpartition.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5e958f676dddbe4ffb89272a9bd7ed62a593bf46
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/_balance/__pycache__/blockpartition.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/_balance/__pycache__/profile.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/_balance/__pycache__/profile.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..77fc197e3dc20a0ad8b33e6e11227d082e116bde
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/_balance/__pycache__/profile.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/_balance/blockpartition.py b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/_balance/blockpartition.py
new file mode 100644
index 0000000000000000000000000000000000000000..c95d42771d5f6696433de9cf85db66738fe71d8a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/_balance/blockpartition.py
@@ -0,0 +1,95 @@
+# Copyright 2019 Kakao Brain
+#
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+"""Implements "Block Partitions of Sequences" by Imre Bárány et al.
+
+Paper: https://arxiv.org/pdf/1308.2452.pdf
+
+"""
+from typing import Iterator, List, Tuple
+
+__all__ = ["solve"]
+
+
+def solve(sequence: List[int], partitions: int = 1) -> List[List[int]]:
+    """Splits a sequence into several partitions to minimize variance for each
+    partition.
+
+    The result might not be optimal. However, it can be computed in only O(kn³) time,
+    where k is the number of partitions and n is the length of the sequence.
+
+    """
+    if partitions < 1:
+        raise ValueError(f"partitions must be a positive integer ({partitions} < 1)")
+
+    n = len(sequence)
+    if n < partitions:
+        raise ValueError(f"sequence is shorter than intended partitions ({n} < {partitions})")
+
+    # Normalize the sequence in [0, 1].
+    minimum = min(sequence)
+    maximum = max(sequence) - minimum
+
+    normal_sequence: List[float]
+    if maximum == 0:
+        normal_sequence = [0 for _ in sequence]
+    else:
+        normal_sequence = [(x - minimum) / maximum for x in sequence]
+
+    splits = [n // partitions * (x + 1) for x in range(partitions - 1)] + [n]
+
+    def block_size(i: int) -> float:
+        start = splits[i - 1] if i > 0 else 0
+        stop = splits[i]
+        return sum(normal_sequence[start:stop])
+
+    def leaderboard() -> Iterator[Tuple[float, int]]:
+        return ((block_size(i), i) for i in range(partitions))
+
+    while True:
+        """
+        (1) Fix p ∈ [k] with M(P) = bp. So Bp is a maximal block of P.
+        """
+        # max_size: M(P)
+        max_size, p = max(leaderboard())
+
+        while True:
+            """
+            (2) If M(P) ≤ m(P) + 1, then stop.
+            """
+            # min_size: m(P)
+            min_size, q = min(leaderboard())
+
+            if max_size <= min_size + 1:
+                return [sequence[i:j] for i, j in zip([0] + splits[:-1], splits)]
+
+            """
+            (3) If M(P) > m(P) + 1, then let m(P) = bq for the q ∈ [k] which is
+            closest to p (ties broken arbitrarily). Thus Bq is a minimal block
+            of P. Let Bh be the block next to Bq between Bp and Bq. (Note that
+            Bh is a non-empty block: if it were, then m(P) = 0 and we should
+            have chosen Bh instead of Bq.)
+            """
+            if p < q:
+                """
+                So either p < q and then h = q−1 and we define P ∗ by moving
+                the last element from Bh = Bq−1 to Bq,
+                """
+                h = q - 1
+                splits[h] -= 1
+            else:
+                """
+                or q < p, and then h = q + 1 and P ∗ is obtained by moving the
+                first element of Bh = Bq+1 to Bq.
+                """
+                h = q + 1
+                splits[q] += 1
+
+            """
+            Set P = P ∗ . If p = h, then go to (1), else go to (2).
+            """
+            if p == h:
+                break
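+
+
+# Example (illustrative, not part of the upstream source):
+#
+#     solve([1, 2, 3, 4, 5, 6], partitions=2)  # -> [[1, 2, 3, 4], [5, 6]]
+#
+# i.e. the sequence is split into contiguous blocks whose sums are roughly
+# balanced after normalization.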
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/_balance/profile.py b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/_balance/profile.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e0a6b7d00b8cd168c657e0d69d202d023842f9a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/_balance/profile.py
@@ -0,0 +1,116 @@
+# Copyright 2019 Kakao Brain
+#
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+"""Per-layer profilers."""
+import copy
+import time
+from typing import Any, Generator, List, Union, Sequence
+
+import torch
+from torch import Tensor
+import torch.nn as nn
+
+from ..microbatch import Batch
+
+__all__: List[str] = []
+
+
+Device = Union[torch.device, int, str]
+
+Tensors = Sequence[Tensor]
+TensorOrTensors = Union[Tensor, Tensors]
+
+
+def layerwise_sandbox(module: nn.Sequential, device: torch.device,) -> Generator[nn.Module, None, None]:
+    """Copies layers for ease to profile. It doesn't modify the given
+    module.
+    """
+    for layer in module:
+        layer_copy = copy.deepcopy(layer)
+        layer_copy.to(device)
+        layer_copy.train()
+        yield layer_copy
+
+
+def detach(batch: Batch) -> None:
+    """Detaches from autograd graph."""
+    for i, x in enumerate(batch):
+        batch[i] = x.detach().requires_grad_(x.requires_grad)
+
+
+def profile_times(module: nn.Sequential, sample: Union[List[Any], Tensor], timeout: float, device: torch.device,) -> List[int]:
+    """Profiles elapsed times per layer."""
+    if any(p.grad is not None for p in module.parameters()):
+        raise ValueError("some parameter already has gradient")
+
+    _batch = Batch(sample)
+    for i, x in enumerate(_batch):
+        _batch[i] = x.detach().to(device).requires_grad_(x.requires_grad)
+
+    time_bufs: List[List[float]] = [[] for _ in module]
+    begun_at = time.time()
+
+    while time.time() - begun_at < timeout:
+        batch = _batch
+
+        for i, layer in enumerate(layerwise_sandbox(module, device)):
+            detach(batch)
+
+            if device.type == "cuda":
+                torch.cuda.synchronize(device)
+            tick = time.time()
+
+            # Forward
+            batch = batch.call(layer)
+
+            # Backward
+            backward_tensors = tuple(y for y in batch if y.requires_grad)
+            if backward_tensors:
+                torch.autograd.backward(backward_tensors, backward_tensors)
+
+            if device.type == "cuda":
+                torch.cuda.synchronize(device)
+            tock = time.time()
+
+            time_bufs[i].append(tock - tick)
+
+    us = 1_000_000
+    return [sum(int(t * us) for t in buf) for buf in time_bufs]
+
+
+def profile_sizes(
+    module: nn.Sequential, input: Union[List[Any], Tensor], chunks: int, param_scale: float, device: torch.device,
+) -> List[int]:
+    """Profiles CUDA memory usage per layer."""
+    if device.type != "cuda":
+        raise ValueError("size profiler supports only CUDA device")
+
+    batch = Batch(input)
+    sizes: List[int] = []
+
+    latent_scale = batch[0].size(0) / chunks
+    for i, x in enumerate(batch):
+        batch[i] = x[:1].detach().to(device).requires_grad_(x.requires_grad)
+
+    for layer in layerwise_sandbox(module, device):
+        detach(batch)
+
+        # Detect memory usage at forward.
+        torch._C._cuda_clearCublasWorkspaces()
+        memory_before = torch.cuda.memory_allocated(device)
+        batch = batch.call(layer)
+        torch._C._cuda_clearCublasWorkspaces()
+        memory_after = torch.cuda.memory_allocated(device)
+        latent_size = memory_after - memory_before
+
+        # Analyze size of parameters.
+        param_size = sum(p._typed_storage()._nbytes() for p in layer.parameters())
+
+        # Combine the sizes of parameters and activations using normalized scales.
+        size = latent_size * latent_scale + param_size * param_scale
+        sizes.append(int(size))
+
+    return sizes
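
A hypothetical sketch of `profile_times` above, run on CPU (a CUDA device works the same way, with stream synchronization added). The import path assumes this vendored layout and the reported numbers are illustrative:

    import torch
    import torch.nn as nn
    from torch.distributed.pipeline.sync._balance.profile import profile_times

    model = nn.Sequential(nn.Linear(64, 128), nn.ReLU(), nn.Linear(128, 10))
    sample = torch.rand(32, 64)

    # Repeat forward/backward passes on deep copies of each layer for ~0.5s
    # and report accumulated microseconds per layer.
    times_us = profile_times(model, sample, timeout=0.5, device=torch.device("cpu"))
    print(times_us)  # e.g. [420, 35, 310] -- larger means slower

Note that `profile_sizes` requires a CUDA device, since it measures allocator statistics via `torch.cuda.memory_allocated`.
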
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/_balance/py.typed b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/_balance/py.typed
new file mode 100644
index 0000000000000000000000000000000000000000..f4830a6416775aae091858a4ac5158ce69f7de29
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/_balance/py.typed
@@ -0,0 +1,6 @@
+# Copyright 2019 Kakao Brain
+#
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/batchnorm.py b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/batchnorm.py
new file mode 100644
index 0000000000000000000000000000000000000000..882ebe8266feaec65765d82fcbc9b362da5be40f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/batchnorm.py
@@ -0,0 +1,159 @@
+# Copyright 2019 Kakao Brain
+#
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+"""Tracks the running statistics per mini-batch instead of micro-batch."""
+from typing import TypeVar, cast
+
+import torch
+from torch import Tensor, nn
+from torch.nn.functional import batch_norm
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from .checkpoint import is_recomputing
+
+__all__ = ["DeferredBatchNorm"]
+
+
+TModule = TypeVar("TModule", bound=nn.Module)
+
+
+class DeferredBatchNorm(_BatchNorm):
+    """A BatchNorm layer tracks multiple micro-batches to update running statistics per mini-batch."""
+
+    sum: Tensor
+    sum_squares: Tensor
+    running_mean: Tensor
+    running_var: Tensor
+    num_batches_tracked: Tensor
+
+    def __init__(
+        self,
+        num_features: int,
+        eps: float = 1e-5,
+        momentum: float = 0.1,
+        affine: bool = True,
+        chunks: int = 1,
+    ) -> None:
+        super().__init__(num_features, eps, momentum, affine, track_running_stats=True)
+
+        self.register_buffer("sum", torch.zeros_like(self.running_mean))
+        self.register_buffer("sum_squares", torch.zeros_like(self.running_var))
+
+        self.counter = 0
+        self.tracked = 0
+        self.chunks = chunks
+
+    def _check_input_dim(self, input: Tensor) -> None:
+        # It's the typical _check_input_dim() implementation in PyTorch.
+        if input.dim() <= 2:
+            raise ValueError("expected at least 3D input (got %dD input)" % input.dim())
+
+    def _track(self, input: Tensor) -> bool:
+        """Tracks statistics of a micro-batch."""
+        # Dimensions except channel. For example, (0, 2, 3) is for BatchNorm2d.
+        dim = [0]
+        dim.extend(range(2, input.dim()))
+
+        with torch.no_grad():
+            self.sum += input.sum(dim)
+            self.sum_squares += (input ** 2).sum(dim)
+
+        size = input.size().numel() // input.size(1)
+        self.counter += size
+        self.tracked += 1
+
+        return self.tracked == self.chunks
+
+    def _commit(self) -> None:
+        """Update the running statistics of a mini-batch."""
+        exponential_average_factor = 0.0
+        self.num_batches_tracked += 1
+        if self.momentum is None:  # use cumulative moving average
+            exponential_average_factor = 1.0 / float(self.num_batches_tracked)
+        else:  # use exponential moving average
+            exponential_average_factor = self.momentum
+
+        mean = self.sum / self.counter
+        var = self.sum_squares / self.counter - mean ** 2
+
+        # Calculate the exponential moving average here.
+        m = exponential_average_factor
+
+        self.running_mean *= 1 - m
+        self.running_mean += mean * m
+
+        self.running_var *= 1 - m
+        self.running_var += var * m
+
+        self.sum.zero_()
+        self.sum_squares.zero_()
+        self.counter = 0
+        self.tracked = 0
+
+    def forward(self, input: Tensor) -> Tensor:
+        if not self.training:
+            # Don't train parameters in evaluation mode.
+            return batch_norm(
+                input,
+                running_mean=self.running_mean,
+                running_var=self.running_var,
+                weight=self.weight,
+                bias=self.bias,
+                training=False,
+                momentum=0.0,
+                eps=self.eps,
+            )
+
+        if not is_recomputing():
+            # Track a micro-batch in training mode,
+            # but not under a recomputation.
+            tracked_enough = self._track(input)
+
+            # Update the running statistics for a mini-batch
+            # if it has tracked enough micro-batches.
+            if tracked_enough:
+                self._commit()
+
+        # Normalize a micro-batch and train the parameters.
+        return batch_norm(
+            input,
+            running_mean=None,
+            running_var=None,
+            weight=self.weight,
+            bias=self.bias,
+            training=True,
+            momentum=0.0,
+            eps=self.eps,
+        )
+
+    @classmethod
+    def convert_deferred_batch_norm(cls, module: TModule, chunks: int = 1) -> TModule:
+        """Converts a :class:`nn.BatchNorm` or underlying :class:`nn.BatchNorm`s into :class:`DeferredBatchNorm`::
+
+            from torchvision.models.resnet import resnet101
+            from torchpipe.batchnorm import DeferredBatchNorm
+            model = resnet101()
+            model = DeferredBatchNorm.convert_deferred_batch_norm(model)
+
+        """
+        if isinstance(module, DeferredBatchNorm) and module.chunks is chunks:
+            return cast(TModule, module)
+
+        module_output: nn.Module = module
+
+        if isinstance(module, _BatchNorm) and module.track_running_stats:
+            module_output = DeferredBatchNorm(module.num_features, module.eps, module.momentum, module.affine, chunks)
+            if module.affine:
+                module_output.register_parameter("weight", module.weight)
+                module_output.register_parameter("bias", module.bias)
+            module_output.register_buffer("running_mean", module.running_mean)
+            module_output.register_buffer("running_var", module.running_var)
+            module_output.register_buffer("num_batches_tracked", module.num_batches_tracked)
+
+        for name, child in module.named_children():
+            module_output.add_module(name, cls.convert_deferred_batch_norm(child, chunks))
+
+        return cast(TModule, module_output)
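
A hypothetical sketch showing that `DeferredBatchNorm` defers the running-statistics update until `chunks` micro-batches have been tracked. The import path assumes this vendored layout:

    import torch
    import torch.nn as nn
    from torch.distributed.pipeline.sync.batchnorm import DeferredBatchNorm

    bn = nn.BatchNorm2d(3)
    dbn = DeferredBatchNorm.convert_deferred_batch_norm(bn, chunks=2)

    dbn(torch.rand(4, 3, 8, 8))       # first micro-batch: tracked, not committed
    print(dbn.num_batches_tracked)    # tensor(0)

    dbn(torch.rand(4, 3, 8, 8))       # second micro-batch completes the mini-batch
    print(dbn.num_batches_tracked)    # tensor(1) -- running stats updated once
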
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/checkpoint.py b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7624392d36a2e5e83ea53b9ed87ad7bdb4380ae
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/checkpoint.py
@@ -0,0 +1,364 @@
+# Copyright 2019 Kakao Brain
+#
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+"""Checkpointing with preceding recomputation.
+
+PyTorch already provides the official checkpointing utilities in
+:mod:`torch.utils.checkpoint`. The official checkpointing combines
+recomputation and recursive backpropagation into one autograd function named
+``CheckpointFunction``. Hence, the recomputation can start only when the
+gradients arrive at the function. In Pipe, the recomputation needs to precede
+the gradient arrival to minimize the GPU idle time.
+
+We solve this problem by introducing separate autograd functions named
+:class:`Recompute` and :class:`Checkpoint`, which represent recomputation and
+recursive backpropagation, respectively. With this pair of functions we can
+manipulate the control flow from the perspective of both the autograd engine
+and CUDA.
+
+Specifically, we place CUDA stream synchronization between :class:`Recompute`
+and :class:`Checkpoint` to delay only :class:`Checkpoint` until the gradient is
+copied entirely.
+
+"""
+from collections import deque
+from contextlib import contextmanager
+import threading
+from typing import (
+    Any,
+    Deque,
+    Generator,
+    List,
+    Optional,
+    Protocol,
+    Union,
+    Sequence,
+    Tuple
+)
+
+import torch
+from torch import Tensor
+import torch.autograd
+
+from .dependency import fork, join
+from .microbatch import Batch
+from .phony import get_phony
+
+__all__ = ["Function", "checkpoint", "Checkpointing", "ThreadLocal", "enable_checkpointing",
+           "enable_recomputing", "is_checkpointing", "is_recomputing", "Context", "save_rng_states",
+           "restore_rng_states", "Checkpoint", "Recompute"]
+
+
+Tensors = Sequence[Tensor]
+TensorOrTensors = Union[Tensor, Tensors]
+
+# Types for shared memory between Checkpoint and Recompute.
+Recomputed = Tuple[TensorOrTensors, Tensors]  # (output, input_leaf)
+RNGStates = Tuple[Tensor, Optional[Tensor]]  # (cpu_rng_state, gpu_rng_state)
+
+
+# Protocol with __call__ instead of Callable can be used as an attribute type.
+# See: https://github.com/python/mypy/issues/708#issuecomment-561735949
+class Function(Protocol):
+    def __call__(self, input: TensorOrTensors) -> TensorOrTensors:
+        ...
+
+
+def checkpoint(function: Function, input):
+    """Make a checkpoint with a simple interface like
+    :func:`torch.utils.checkpoint.checkpoint`. It's only used to test or debug
+    :class:`Checkpoint` and :class:`Recompute` without boilerplate.
+    """
+    batch = Batch(input)
+
+    chk = Checkpointing(function, batch)
+    batch = chk.checkpoint()
+    chk.recompute(batch)
+
+    return batch.values
+
+
+class Checkpointing:
+    """Generates a pair of :class:`Checkpoint` and :class:`Recompute`."""
+
+    def __init__(self, function: Function, batch: Batch) -> None:
+        self.function = function
+        self.batch = batch
+
+        # Shared memory between Checkpoint and Recompute. 1-length deque is
+        # used for mutability and length limitation.
+        self.recomputed: Deque[Recomputed] = deque(maxlen=1)
+        self.rng_states: Deque[RNGStates] = deque(maxlen=1)
+
+    def checkpoint(self) -> Batch:
+        """Return a batch applied by :class:`Checkpoint`."""
+        input_atomic = self.batch.atomic
+        inputs = tuple(self.batch)
+
+        # Use a phony which requires grad to ensure that Checkpoint can be
+        # tracked by the autograd engine even when none of the input tensors
+        # require grad.
+        phony = get_phony(self.batch.get_device(), requires_grad=True)
+
+        output = Checkpoint.apply(phony, self.recomputed, self.rng_states, self.function, input_atomic, *inputs)
+
+        # Gradients are only supported for float Tensors.
+        if isinstance(output, tuple):
+            output = tuple([x.detach() if torch.is_tensor(x) and not x.is_floating_point() else x for x in output])
+
+        return Batch(output)
+
+    def recompute(self, batch: Batch) -> None:
+        """Apply :class:`Recompute` to the batch in place."""
+        input_atomic = self.batch.atomic
+        inputs = tuple(self.batch)
+
+        # Use a tensor in the batch to tie together fork-join
+        tensor_idx = batch.find_tensor_idx()
+        # batch[tensor_idx] always requires grad, because it has been passed
+        # through Checkpoint with a phony that requires grad.
+        batch[tensor_idx], phony = fork(batch[tensor_idx])
+        phony = Recompute.apply(phony, self.recomputed, self.rng_states, self.function, input_atomic, *inputs)
+        batch[tensor_idx] = join(batch[tensor_idx], phony)
+
+
+class ThreadLocal(threading.local):
+    def __init__(self) -> None:
+        self.is_checkpointing = False
+        self.is_recomputing = False
+
+
+thread_local = ThreadLocal()
+
+
+@contextmanager
+def enable_checkpointing() -> Generator[None, None, None]:
+    """Make :func:`is_checkpointing` return :data:`True` within a context."""
+    orig = thread_local.is_checkpointing
+    thread_local.is_checkpointing = True
+    try:
+        yield
+    finally:
+        thread_local.is_checkpointing = orig
+
+
+@contextmanager
+def enable_recomputing() -> Generator[None, None, None]:
+    """Makes :func:`is_recomputing` return :data:`True` within a context."""
+    orig = thread_local.is_recomputing
+    thread_local.is_recomputing = True
+    try:
+        yield
+    finally:
+        thread_local.is_recomputing = orig
+
+
+def is_checkpointing() -> bool:
+    """Whether the current forward propagation is under checkpointing.
+
+    Returns:
+        bool: :data:`True` if it's under checkpointing.
+
+    """
+    return thread_local.is_checkpointing
+
+
+def is_recomputing() -> bool:
+    """Whether the current forward propagation is under checkpoint recomputation.
+
+    Use this to prevent duplicated side-effects at forward
+    propagation::
+
+        class Counter(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.counter = 0
+
+            def forward(self, input):
+                if not is_recomputing():
+                    self.counter += 1
+                return input
+
+    Returns:
+        bool: :data:`True` if it's under checkpoint recomputation.
+
+    .. seealso:: :ref:`Detecting Recomputation`
+
+    """
+    return thread_local.is_recomputing
+
+
+class Context:
+    """The common interface between the :class:`Checkpoint` and :class:`Recompute` context."""
+
+    recomputed: Deque[Recomputed]
+    rng_states: Deque[RNGStates]
+    function: Function
+    input_atomic: bool
+    inputs: Sequence[Any]
+
+    saved_tensors: Tuple[Tensor, ...]
+
+    def save_for_backward(self, *tensors: Tensor) -> None:  # pragma: no cover
+        pass
+
+
+def save_rng_states(device: torch.device, rng_states: Deque[RNGStates],) -> None:
+    """:
+    Capture the current random number generator states.
+
+    meth:`Checkpoint.forward` captures the current PyTorch's random number
+    generator states at CPU and GPU to reuse in :meth:`Recompute.backward`.
+
+    .. seealso:: :ref:`Referential Transparency`
+
+    """
+    cpu_rng_state = torch.get_rng_state()
+
+    gpu_rng_state: Optional[Tensor]
+    if device.type == "cuda":
+        gpu_rng_state = torch.cuda.get_rng_state(device)
+    else:
+        gpu_rng_state = None
+
+    rng_states.append((cpu_rng_state, gpu_rng_state))
+
+
+@contextmanager
+def restore_rng_states(device: torch.device, rng_states: Deque[RNGStates],) -> Generator[None, None, None]:
+    """:
+    Restore the random number generator state.
+
+    meth:`Recompute.backward` restores the random number generator states
+    captured by :func:`save_rng_states` within its context.
+
+    .. seealso:: :ref:`Referential Transparency`
+
+    """
+    cpu_rng_state, gpu_rng_state = rng_states.pop()
+
+    gpu_devices: List[torch.device] = []
+    if device.type == "cuda":
+        gpu_devices.append(device)
+
+    with torch.random.fork_rng(gpu_devices):
+        torch.set_rng_state(cpu_rng_state)
+        if gpu_rng_state is not None:
+            torch.cuda.set_rng_state(gpu_rng_state, device)
+        yield
+
+
+class Checkpoint(torch.autograd.Function):
+    @staticmethod
+    # type: ignore[override]
+    def forward(
+        ctx: Context,
+        phony: Tensor,
+        recomputed: Deque[Recomputed],
+        rng_states: Deque[RNGStates],
+        function: Function,
+        input_atomic: bool,
+        *inputs,
+    ):
+        ctx.recomputed = recomputed
+        ctx.rng_states = rng_states
+
+        save_rng_states(phony.device, ctx.rng_states)
+
+        ctx.function = function
+        ctx.input_atomic = input_atomic
+        if input_atomic:
+            tensors = [inputs[0]]
+        else:
+            tensors = []
+            for input in inputs:
+                if torch.is_tensor(input):
+                    tensors.append(input)
+
+        ctx.save_for_backward(*tensors)
+
+        with torch.no_grad(), enable_checkpointing():
+            if input_atomic:
+                assert len(inputs) == 1
+                output = function(inputs[0])
+            else:
+                output = function(*inputs)
+        return output
+
+    @staticmethod
+    def backward(ctx: Context, *grad_output: Tensor,) -> Tuple[Optional[Tensor], ...]:  # pragma: no cover
+        output, input_leaf = ctx.recomputed.pop()
+
+        if isinstance(output, tuple):
+            outputs = output
+        else:
+            outputs = (output,)
+        if any(torch.is_tensor(y) and y.requires_grad for y in outputs):
+            tensors = tuple([x for x in outputs if torch.is_tensor(x) and x.requires_grad])
+            torch.autograd.backward(tensors, grad_output)
+
+        grad_input: List[Optional[Tensor]] = [None, None, None, None, None]
+        grad_input.extend(x.grad if torch.is_tensor(x) else None for x in input_leaf)
+        return tuple(grad_input)
+
+
+class Recompute(torch.autograd.Function):
+    @staticmethod
+    # type: ignore[override]
+    def forward(
+        ctx: Context,
+        phony: Tensor,
+        recomputed: Deque[Recomputed],
+        rng_states: Deque[RNGStates],
+        function: Function,
+        input_atomic: bool,
+        *inputs,
+    ) -> Tensor:
+        ctx.recomputed = recomputed
+        ctx.rng_states = rng_states
+
+        ctx.function = function
+        ctx.input_atomic = input_atomic
+        ctx.inputs = inputs
+        if input_atomic:
+            tensors = [inputs[0]]
+        else:
+            tensors = []
+            for input in inputs:
+                if torch.is_tensor(input):
+                    tensors.append(input)
+        ctx.save_for_backward(*tensors)
+
+        return phony
+
+    @staticmethod
+    def backward(ctx: Context, *grad_output: Tensor) -> Tuple[None, ...]:  # pragma: no cover
+        inputs = ctx.inputs
+        inputs_leaf = tuple(x.detach().requires_grad_(x.requires_grad) if torch.is_tensor(x) else x for x in inputs)
+
+        # Get the device for the inputs from a tensor
+        device = None
+        for input in inputs:
+            if torch.is_tensor(input):
+                device = input.device
+                break
+
+        if device is None:
+            raise RuntimeError(f'No tensors found in {inputs}')
+
+        with restore_rng_states(device, ctx.rng_states):
+            with torch.enable_grad(), enable_recomputing():
+                if ctx.input_atomic:
+                    assert len(inputs_leaf) == 1
+                    output = ctx.function(inputs_leaf[0])
+                else:
+                    output = ctx.function(*inputs_leaf)
+
+        ctx.recomputed.append((output, inputs_leaf))
+
+        grad_input: List[None] = [None, None, None, None, None]
+        grad_input.extend(None for _ in ctx.inputs)
+        return tuple(grad_input)
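
A hypothetical sketch of the testing helper `checkpoint` defined above: the wrapped function runs once under checkpointing during the forward pass and once more under recomputation during the backward pass. The import path assumes this vendored layout:

    import torch
    from torch.distributed.pipeline.sync.checkpoint import checkpoint, is_recomputing

    def double(x):
        if is_recomputing():
            print("recomputing")      # printed during backward, before grads flow
        return x * 2

    x = torch.ones(3, requires_grad=True)
    y = checkpoint(double, x)

    y.sum().backward()                # triggers Recompute, then Checkpoint
    print(x.grad)                     # tensor([2., 2., 2.])
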
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/copy.py b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/copy.py
new file mode 100644
index 0000000000000000000000000000000000000000..87e124cd42538905dbe056a7300854b776e2df88
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/copy.py
@@ -0,0 +1,108 @@
+# Copyright 2019 Kakao Brain
+#
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+"""Autograd functions for stream-aware CUDA copy.
+
+They are used to overlap copy and computation on the same GPU.
+"""
+from collections import deque
+from typing import Deque, List, Optional, Tuple, Sequence
+
+import torch
+from torch import Tensor
+
+from .stream import AbstractStream, current_stream, get_device, record_stream, use_stream, wait_stream
+
+__all__: List[str] = ["Context", "Copy", "Wait"]
+
+
+Tensors = Sequence[Tensor]
+
+
+# Common interface between :class:`Copy` and :class:`Wait`.
+class Context:
+    prev_stream: AbstractStream
+    next_stream: AbstractStream
+
+
+class Copy(torch.autograd.Function):
+    """Copies tensors on specific streams."""
+
+    @staticmethod
+    # type: ignore[override]
+    def forward(ctx: Context, prev_stream: AbstractStream, next_stream: AbstractStream, *input,) -> Tensors:
+        ctx.prev_stream = prev_stream
+        ctx.next_stream = next_stream
+
+        output = []
+        output_stream = current_stream(get_device(next_stream))
+
+        with use_stream(prev_stream), use_stream(next_stream):
+            for x in input:
+                if torch.is_tensor(x):
+                    y = x.to(get_device(next_stream), non_blocking=True)
+                    output.append(y)
+
+                    # 'prev_stream' is not where 'x' has been allocated.
+                    record_stream(x, prev_stream)
+                    # 'y' has been allocated on 'next_stream'.
+                    # It might be used on the current stream captured as 'output_stream'.
+                    record_stream(y, output_stream)
+                else:
+                    output.append(x)
+
+        return tuple(output)
+
+    @staticmethod
+    def backward(ctx: Context, *grad_output: Tensor,) -> Tuple[Optional[Tensor], ...]:
+        prev_stream = ctx.prev_stream
+        next_stream = ctx.next_stream
+
+        grad_input: Deque[Tensor] = deque(maxlen=len(grad_output))
+        input_stream = current_stream(get_device(prev_stream))
+
+        with use_stream(prev_stream), use_stream(next_stream):
+            for x in reversed(grad_output):
+                y = x.to(get_device(prev_stream), non_blocking=True)
+                grad_input.appendleft(y)
+
+                # 'next_stream' is not where 'x' has been allocated.
+                record_stream(x, next_stream)
+                # 'y' has been allocated on 'prev_stream'.
+                # It might be used on the current stream captured as 'input_stream'.
+                record_stream(y, input_stream)
+
+        grad_streams: Tuple[Optional[Tensor], ...] = (None, None)
+        return grad_streams + tuple(grad_input)
+
+
+class Wait(torch.autograd.Function):
+    """Synchronizes a stream to another stream.
+
+    Place it just before you want to start an operation on the next stream,
+    provided that all operations on the previous stream are done.
+
+    """
+
+    @staticmethod
+    # type: ignore[override]
+    def forward(ctx: Context, prev_stream: AbstractStream, next_stream: AbstractStream, *input) -> Tensors:
+        ctx.prev_stream = prev_stream
+        ctx.next_stream = next_stream
+
+        wait_stream(next_stream, prev_stream)
+
+        return tuple(x.detach() if torch.is_tensor(x) else x for x in input)
+
+    @staticmethod
+    def backward(ctx: Context, *grad_input: Tensor,) -> Tuple[Optional[Tensor], ...]:
+        prev_stream = ctx.prev_stream
+        next_stream = ctx.next_stream
+
+        wait_stream(prev_stream, next_stream)
+
+        grad_streams: Tuple[Optional[Tensor], ...] = (None, None)
+        return grad_streams + grad_input
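
A hypothetical sketch of using `Copy` and `Wait` to move data on a dedicated stream and then hand it to the compute stream. It requires a CUDA device; the stream helpers come from the sibling `stream` module, and the import paths assume this vendored layout:

    import torch
    from torch.distributed.pipeline.sync.copy import Copy, Wait
    from torch.distributed.pipeline.sync.stream import current_stream, new_stream

    cuda = torch.device("cuda:0")
    compute_stream = current_stream(cuda)
    copy_stream = new_stream(cuda)

    x = torch.rand(1024, 1024, pin_memory=True)

    # Copy to the GPU on the dedicated copy stream...
    (y,) = Copy.apply(current_stream(torch.device("cpu")), copy_stream, x)
    # ...and make the compute stream wait for the copy before consuming it.
    (y,) = Wait.apply(copy_stream, compute_stream, y)

    z = y @ y   # issued on the compute stream, after the copy has finished
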
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/dependency.py b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/dependency.py
new file mode 100644
index 0000000000000000000000000000000000000000..de3d57e5e16e69503806f5194bfcb981a133d4d1
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/dependency.py
@@ -0,0 +1,54 @@
+# Copyright 2019 Kakao Brain
+#
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+"""Arbitrary dependency between two autograd lanes."""
+from typing import List, Tuple
+
+import torch
+from torch import Tensor
+
+from .phony import get_phony
+
+__all__: List[str] = ["fork", "Fork", "join", "Join"]
+
+
+def fork(input: Tensor) -> Tuple[Tensor, Tensor]:
+    """Branches out from an autograd lane of the given tensor."""
+    if torch.is_grad_enabled() and input.requires_grad:
+        input, phony = Fork.apply(input)
+    else:
+        phony = get_phony(input.device, requires_grad=False)
+
+    return input, phony
+
+
+class Fork(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx: "Fork", input: Tensor) -> Tuple[Tensor, Tensor]:  # type: ignore[override]
+        phony = get_phony(input.device, requires_grad=False)
+        return input.detach(), phony.detach()
+
+    @staticmethod
+    def backward(ctx: "Fork", grad_input: Tensor, grad_grad: Tensor) -> Tensor:  # type: ignore[override]
+        return grad_input
+
+
+def join(input: Tensor, phony: Tensor) -> Tensor:
+    """Merge two autograd lanes."""
+    if torch.is_grad_enabled() and (input.requires_grad or phony.requires_grad):
+        input = Join.apply(input, phony)
+
+    return input
+
+
+class Join(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx: "Join", input: Tensor, phony: Tensor) -> Tensor:  # type: ignore[override]
+        return input.detach()
+
+    @staticmethod
+    def backward(ctx: "Join", grad_input: Tensor) -> Tuple[Tensor, None]:  # type: ignore[override]
+        return grad_input, None
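
A hypothetical sketch of `fork` and `join` above: the tensor values are unchanged, but the phony adds an edge to the autograd graph, so gradients upstream of the fork point are not computed until the backward pass has reached the join on the other lane. The import path assumes this vendored layout:

    import torch
    from torch.distributed.pipeline.sync.dependency import fork, join

    a = torch.rand(3, requires_grad=True)
    b = torch.rand(3, requires_grad=True)

    x = a * 2
    y = b * 3

    x, phony = fork(x)   # branch a phony off x's lane
    y = join(y, phony)   # merge it into y's lane

    (x.sum() + y.sum()).backward()
    print(a.grad)  # tensor([2., 2., 2.])
    print(b.grad)  # tensor([3., 3., 3.])
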
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/microbatch.py b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/microbatch.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b5646b3b075952f54d6fc85aa3f11892900ec7d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/microbatch.py
@@ -0,0 +1,234 @@
+# Copyright 2019 Kakao Brain
+#
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+"""Manipulation of micro-batches."""
+import typing
+from typing import Any, Callable, List, Union, cast, Sequence
+
+import torch
+from torch import Tensor
+import torch.cuda.comm
+
+__all__: List[str] = ["NoChunk", "Batch", "check", "scatter", "gather"]
+
+
+Tensors = Sequence[Tensor]
+TensorOrTensors = Union[Tensor, Tensors]
+Function = Callable[[TensorOrTensors], Union[List[Any], Tensor]]
+
+
+class NoChunk:
+    """
+    Wrapper for a Tensor in :meth:`Pipe.forward` indicating that the tensor
+    should not be chunked on the batch dimension and instead be replicated
+    as-is across all micro-batches. This is useful for tensors which might
+    not have any 'batch' semantics for the model.
+    """
+    def __init__(self, inp: Tensor):
+        if not torch.is_tensor(inp):
+            raise TypeError(f'NoChunk only supported for tensors, found: {inp}')
+        self._tensor = inp
+
+    @property
+    def tensor(self):
+        return self._tensor
+
+
+class Batch:
+    """
+    An abstraction representing a microbatch in the pipeline.
+    """
+
+    def __init__(self, values: Union[List[Any], Tensor]) -> None:
+        self._values = values
+        self.atomic = torch.is_tensor(values)
+
+        # Verify there is at least one tensor
+        if not self.atomic:
+            if not any(torch.is_tensor(value) for value in self._values):
+                raise TypeError(f'No tensors found in batch: {self._values}')
+
+    @property
+    def tensor(self) -> Tensor:
+        """Retrieves the underlying tensor."""
+        if not self.atomic:
+            raise AttributeError("not atomic batch")
+        return cast(Tensor, self._values)
+
+    @property
+    def values(self):
+        """Retrieves the underlying values for the batch"""
+        return self._values
+
+    def find_tensor_idx(self):
+        """
+        Retrieves the index of the first tensor found.
+        """
+        if self.atomic:
+            return 0
+        for i, value in enumerate(self._values):
+            if torch.is_tensor(value):
+                return i
+
+        raise TypeError("No tensor found!")
+
+    def get_device(self):
+        """
+        Retrieves the device for this microbatch.
+        """
+        if self.atomic:
+            return self._values.device  # type: ignore[union-attr]
+
+        for value in self._values:
+            if torch.is_tensor(value):
+                return value.device
+
+    def call(self, function: Function) -> "Batch":
+        """Calls a function on the microbatch. It also wraps
+        the output with :class:`Batch`.
+        """
+        if self.atomic:
+            return Batch(function(self._values))
+        else:
+            return Batch(function(*self._values))
+
+    def __repr__(self) -> str:
+        return f"Batch[atomic={self.atomic!r}]({self._values!r})"
+
+    def __iter__(self):
+        if self.atomic:
+            yield self._values
+        else:
+            yield from self._values
+
+    def __len__(self) -> int:
+        return 1 if self.atomic else len(self._values)
+
+    def __getitem__(self, index: int):
+        if not self.atomic:
+            return self._values[index]
+
+        if index != 0:
+            raise IndexError("atomic batch allows index 0 only")
+
+        return self._values
+
+    # NOTE(sublee): pyflakes can't detect "overload" instead of "typing.overload".
+    @typing.overload
+    def __setitem__(self, index: int, value: Tensor) -> None:
+        ...
+
+    @typing.overload
+    def __setitem__(self, index: slice, value: Tensors) -> None:
+        ...
+
+    def __setitem__(self, index: Union[int, slice], value) -> None:
+        if isinstance(index, int):
+            self._setitem_by_index(index, value)
+        else:
+            self._setitem_by_slice(index, value)
+
+    def _setitem_by_index(self, index: int, value) -> None:
+        if not self.atomic:
+            i = index
+            self._values = self._values[:i] + (value,) + self._values[i + 1 :]  # type: ignore[operator]
+            return
+
+        if index != 0:
+            raise IndexError("atomic batch allows index 0 only")
+
+        self._values = value
+
+    def _setitem_by_slice(self, index: slice, value) -> None:
+        if not (index.start is index.stop is index.step is None):  # noqa: E714
+            raise NotImplementedError("only slice [:] supported")
+
+        if not self.atomic:
+            self._values = value
+            return
+
+        if len(value) != 1:
+            raise IndexError("atomic batch cannot be replaced with multiple tensors")
+
+        self._values = value[0]
+
+
+def check(first_device, *inputs) -> None:
+    """
+    Checks whether the input contains at least one tensor and that each tensor
+    is on the same device as the first partition.
+
+    Raises:
+        TypeError: the input does not contain at least one tensor
+        ValueError: a tensor is not on the same device as the first partition
+
+    """
+
+    if not any(torch.is_tensor(input) for input in inputs):
+        raise TypeError(f'inputs do not have any tensors: {inputs}')
+    if any(torch.is_tensor(input) and input.device != first_device for input in inputs):
+        raise ValueError('All inputs should be on the same device as the first partition')
+
+
+def scatter(*inputs, chunks: int) -> List[Batch]:
+    """Splits an input mini-batch into multiple micro-batches."""
+    if len(inputs) == 1 and isinstance(inputs[0], Tensor):
+        return [Batch(x) for x in inputs[0].chunk(chunks)]
+
+    batches: List[Any] = [[] for _ in range(chunks)]
+    # Actual number of chunks produced
+    num_chunks = -1
+    for input in inputs:
+        if torch.is_tensor(input):
+            # Chunk only tensors.
+            tensors = input.chunk(chunks)
+
+            # Validate number of chunks equal across all inputs.
+            if num_chunks != -1 and num_chunks != len(tensors):
+                raise RuntimeError(f'Found different number of chunks produced for inputs: {num_chunks} and {len(tensors)}')
+            num_chunks = len(tensors)
+
+            for i, tensor in enumerate(tensors):
+                batches[i].append(tensor)
+        else:
+            # Replicate non-tensors or tensors wrapped with 'NoChunk'.
+            for i in range(chunks):
+                if isinstance(input, NoChunk):
+                    # Extract the tensor out.
+                    batches[i].append(input.tensor)
+                else:
+                    batches[i].append(input)
+
+    # Truncate to actual number of chunks
+    batches = batches[:num_chunks]
+
+    return [Batch(x) for x in batches]
+
+
+def gather(outputs: List[Batch]):
+    """Concatenates output micro-batches into a mini-batch."""
+    output: Any
+
+    if outputs[0].atomic:
+        tensors = tuple(b.tensor for b in outputs)
+        output = torch.cat(tensors)
+    else:
+        output_buf: List[Any] = []
+        for i in range(len(outputs[0])):
+            output_type = type(outputs[0][i])
+            current_outputs = []
+            for batch in outputs:
+                if output_type != type(batch[i]):
+                    raise TypeError(f'Types for microbatch outputs do not match, found: {output_type} and {type(batch[i])}')
+                current_outputs.append(batch[i])
+
+            if torch.is_tensor(outputs[0][i]):
+                output_buf.append(torch.cat(current_outputs))
+            else:
+                output_buf.append(current_outputs)
+
+        output = tuple(output_buf)
+
+    return output
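
A hypothetical sketch of `scatter` and `gather` above, including a tensor wrapped with `NoChunk` that is replicated instead of being split. The import path assumes this vendored layout:

    import torch
    from torch.distributed.pipeline.sync.microbatch import Batch, NoChunk, scatter, gather

    inputs = torch.rand(8, 4)            # mini-batch of 8 samples
    flag = NoChunk(torch.tensor([1]))    # replicated as-is in every micro-batch

    batches = scatter(inputs, flag, chunks=4)
    print(len(batches), len(batches[0])) # 4 micro-batches, 2 values each

    # Pretend each micro-batch went through a pipeline stage.
    outputs = [Batch(batch[0] * 2) for batch in batches]

    merged = gather(outputs)
    print(merged.shape)                  # torch.Size([8, 4])
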
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/phony.py b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/phony.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee19ffb6cc82032b5d61db81820eb8034d13ea84
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/phony.py
@@ -0,0 +1,50 @@
+# Copyright 2019 Kakao Brain
+#
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+"""Provides phony for arbitrary dependency in a autograd graph."""
+from typing import Dict, List, Tuple
+
+import torch
+from torch import Tensor
+
+from .stream import default_stream, use_stream
+
+__all__: List[str] = ["get_phony"]
+
+
+_phonies: Dict[Tuple[torch.device, bool], Tensor] = {}
+
+
+def get_phony(device: torch.device, *, requires_grad: bool) -> Tensor:
+    """Get a phony. Phony is tensor without space.
+
+    It is useful to make arbitrary dependency in a autograd graph because it doesn't require any
+    gradient accumulation.
+
+    .. note::
+
+        Phonies for each device are cached. If an autograd function gets a phony
+        internally, the phony must be detached to be returned. Otherwise, the
+        autograd engine will mutate the cached phony in-place::
+
+            class Phonify(torch.autograd.Function):
+                @staticmethod
+                def forward(ctx, input):
+                    phony = get_phony(input.device, requires_grad=False)
+                    return phony.detach()  # detach() is necessary.
+
+    """
+    key = (device, requires_grad)
+
+    try:
+        phony = _phonies[key]
+    except KeyError:
+        with use_stream(default_stream(device)):
+            phony = torch.empty(0, device=device, requires_grad=requires_grad)
+
+        _phonies[key] = phony
+
+    return phony
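
A hypothetical sketch of the caching behavior of `get_phony`: phonies are zero-element tensors cached per (device, requires_grad) pair. The import path assumes this vendored layout:

    import torch
    from torch.distributed.pipeline.sync.phony import get_phony

    p1 = get_phony(torch.device("cpu"), requires_grad=True)
    p2 = get_phony(torch.device("cpu"), requires_grad=True)

    print(p1.numel(), p1.requires_grad)  # 0 True
    print(p1 is p2)                      # True -- the same cached tensor is reused
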
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/pipe.py b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/pipe.py
new file mode 100644
index 0000000000000000000000000000000000000000..139bc701926601b2f8232b2406f47b7c65afa3f3
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/pipe.py
@@ -0,0 +1,490 @@
+# Copyright 2019 Kakao Brain
+#
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+"""The Pipe interface."""
+from collections import OrderedDict
+from typing import TYPE_CHECKING, Any, Iterable, Iterator, List, Optional, Union, Sequence, Tuple, cast
+
+import torch
+from torch import Tensor, nn
+from torch.distributed.rpc import RRef
+import torch.autograd
+import torch.cuda
+
+from . import microbatch
+from .batchnorm import DeferredBatchNorm
+from .pipeline import Pipeline
+from .skip.layout import inspect_skip_layout
+from .skip.skippable import verify_skippables
+from .stream import AbstractStream, new_stream
+
+__all__ = ["Pipe", "BalanceError", "PipeSequential", "WithDevice"]
+
+
+Device = Union[torch.device, int, str]
+Devices = Union[Iterable[Device], List[Device]]
+
+Tensors = Sequence[Tensor]
+TensorOrTensors = Union[Tensor, Tensors]
+
+if TYPE_CHECKING:
+    # Typechecking: nn.Module is not a Generic
+    Module = nn.Module[TensorOrTensors]  # type: ignore[type-arg]
+    NamedModules = OrderedDict[str, Module]
+else:
+    Module = nn.Module
+    NamedModules = OrderedDict
+
+
+def _recommend_auto_balance(message: str) -> str:
+    """Expands a message with recommendation to :mod:`torchpipe.balance`."""
+    return f"""{message}
+
+If your model is still under development, its optimal balance would change
+frequently. In this case, we highly recommend 'torch.distributed.pipeline.sync.balance' for
+naive automatic balancing:
+
+  from torch.distributed.pipeline.sync import Pipe
+  from torch.distributed.pipeline.sync.balance import balance_by_time
+
+  partitions = torch.cuda.device_count()
+  sample = torch.empty(...)
+  balance = balance_by_time(partitions, model, sample)
+
+  model = Pipe(model, balance, ...)
+"""
+
+
+def _verify_module(module: nn.Sequential) -> None:
+    if not isinstance(module, nn.Sequential):
+        raise TypeError("module must be nn.Sequential to be partitioned")
+
+    named_children = list(module.named_children())
+    if len(named_children) != len(module):
+        raise ValueError("module with duplicate children is not supported")
+
+
+def _verify_splitting(
+    module: nn.Sequential, partitions: List[nn.Sequential], devices: List[torch.device]
+) -> None:
+    num_parameters = len(list(module.parameters()))
+    num_child_parameters = sum(len(list(child.parameters())) for child in module.children())
+    if num_parameters == num_child_parameters:
+        return
+
+    for i in range(len(partitions)):
+        for j in range(i + 1, len(partitions)):
+            parti = partitions[i]
+            partj = partitions[j]
+            if devices[i] == devices[j]:
+                continue
+            for p in parti.parameters():
+                for q in partj.parameters():
+                    if p is q:
+                        raise ValueError("module with duplicate parameters on distinct devices is not supported")
+
+
+class BalanceError(ValueError):
+    pass
+
+
+def _retrieve_device(module: nn.Module) -> torch.device:
+    """Validates all parameters in the Module have the same device and returns
+    the appropriate device.
+
+    Args:
+        An ``nn.Module`` to process.
+
+    Returns:
+        ``torch.Device`` for the entire module.
+
+    Raises:
+        ValueError:
+            If devices for ``nn.Module`` parameters are not all same.
+    """
+
+    device = None
+    for parameter in module.parameters():
+        if device is None:
+            device = parameter.device
+        elif device != parameter.device:
+            raise ValueError(
+                f'nn.Module: {module}, should have all parameters on a single device,'
+                ' please use .to() to place the module on a single device')
+
+    return device if device is not None else torch.device("cpu")
+
+
+class PipeSequential(nn.Sequential):
+    """
+    Pipe variant of ``nn.Sequential`` which supports multiple inputs.
+    """
+
+    def forward(self, *inputs):
+        for module in self:
+            if isinstance(inputs, Tuple):  # type: ignore[arg-type]
+                inputs = module(*inputs)
+            else:
+                # Don't expand single variables (ex: lists/Tensor)
+                inputs = module(inputs)
+        return inputs
+
+
+class WithDevice(nn.Module):
+    """
+    Wraps an ``nn.Module`` which is part of ``nn.Sequential`` passed into :class:`Pipe`
+    that overrides the device for that module. In cases where :class:`Pipe`
+    can't implicitly determine the device for the module and places it on CPU,
+    this wrapper can be used to override the implicit behavior and explicitly
+    specify which device a module should run on.
+
+    The provided module is also moved to the given device via ``.to(device)``
+    by :class:`Pipe`.
+
+    Args:
+        module(:class:`torch.nn.Module`): The module to be wrapped.
+        device(:class:`torch.device`): The device to run the module on.
+
+    Example::
+        >>> # xdoctest: +SKIP("distributed")
+        >>> fc1 = nn.Linear(16, 8).cuda(0)
+        >>> fc2 = nn.Linear(8, 4).cuda(1)
+        >>> dropout = nn.Dropout()
+        >>>
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA1)
+        >>> # Dropout does not have any parameters/buffers, but we want to
+        >>> # run it on cuda:1 to avoid any GPU to CPU transfers.
+        >>> model = nn.Sequential(fc1, fc2, WithDevice(dropout, 'cuda:1'))
+        >>> # xdoctest: +SKIP("Needs RPC framework init")
+        >>> model = Pipe(model, chunks=8)
+    """
+    def __init__(self, module: nn.Module, device: torch.device):
+        super().__init__()
+        self._module = module
+        self._device = torch.device(device)
+
+    def forward(self, *args, **kwargs):
+        return self._module(*args, **kwargs)
+
+    @property
+    def module(self):
+        return self._module
+
+    @property
+    def device(self):
+        return self._device
+
+
+def _assemble_partition(modules: List[nn.Module]):
+    modules_list: List[nn.Module] = []
+    for module in modules:
+        if isinstance(module, nn.Sequential):
+            modules_list.extend(module.children())
+        else:
+            modules_list.append(module)
+    return PipeSequential(*modules_list)
+
+
+def _split_module(modules: nn.Sequential) -> Tuple[List[nn.Sequential], List[torch.device]]:
+    partitions = []
+    devices = []
+
+    current_partition = []
+    current_device = None
+    for name, module in modules.named_children():
+        if isinstance(module, WithDevice):
+            # Process device override and move module to appropriate device.
+            device = module.device
+            module = module.module
+            module.to(device)
+        else:
+            device = _retrieve_device(module)
+        if current_device is not None and (current_device != device or device.type == 'cpu'):
+            partitions.append(_assemble_partition(current_partition))
+            devices.append(current_device)
+            current_partition = []
+        current_device = device
+        current_partition.append(module)
+
+    if current_device is not None:
+        partitions.append(_assemble_partition(current_partition))
+        devices.append(current_device)
+
+    partitions = cast(List[nn.Sequential], nn.ModuleList(partitions))
+
+    return partitions, devices
+
+
+MOVING_DENIED = TypeError("denied to move parameters and buffers, because Pipe should manage device placement")
+
+
+class Pipe(Module):
+    """Wraps an arbitrary :class:`nn.Sequential ` module
+    to train on using synchronous pipeline parallelism. If the module requires
+    lots of memory and doesn't fit on a single GPU, pipeline parallelism is a
+    useful technique to employ for training.
+
+    The implementation is based on the torchgpipe_ paper.
+
+    .. _torchgpipe: https://arxiv.org/abs/2004.09910
+
+    Pipe combines pipeline parallelism with checkpointing to reduce peak
+    memory required to train while minimizing device under-utilization.
+
+    You should place all the modules on the appropriate devices and wrap them
+    into an :class:`nn.Sequential <torch.nn.Sequential>` module defining the
+    desired order of execution. If a module does not contain any
+    parameters/buffers, it is assumed this module should be executed on CPU
+    and appropriate input tensors to the module are moved to CPU before
+    execution. This behavior can be overridden by the :class:`WithDevice`
+    wrapper which can be used to explicitly specify which device a module
+    should run on.
+
+    Args:
+        module (:class:`nn.Sequential <torch.nn.Sequential>`):
+            sequential module to be parallelized using pipelining. Each module
+            in the sequence has to have all of its parameters on a single
+            device. Each module in the sequence has to either be an nn.Module
+            or :class:`nn.Sequential <torch.nn.Sequential>` (to combine multiple
+            sequential modules on a single device)
+        chunks (int):
+            number of micro-batches (default: ``1``)
+        checkpoint (str):
+            when to enable checkpointing, one of ``'always'``,
+            ``'except_last'``, or ``'never'`` (default: ``'except_last'``).
+            ``'never'`` disables checkpointing completely, ``'except_last'``
+            enables checkpointing for all micro-batches except the last one
+            and ``'always'`` enables checkpointing for all micro-batches.
+        deferred_batch_norm (bool):
+            whether to use deferred ``BatchNorm`` moving statistics (default:
+            :data:`False`). If set to :data:`True`, we track statistics across
+            multiple micro-batches to update the running statistics per
+            mini-batch.
+
+    Raises:
+        TypeError:
+            the module is not a :class:`nn.Sequential <torch.nn.Sequential>`.
+        ValueError:
+            invalid arguments
+
+    Example::
+        Pipeline of two FC layers across GPUs 0 and 1.
+
+        >>> # Need to initialize RPC framework first.
+        >>> # xdoctest: +SKIP
+        >>> os.environ['MASTER_ADDR'] = 'localhost'
+        >>> os.environ['MASTER_PORT'] = '29500'
+        >>> torch.distributed.rpc.init_rpc('worker', rank=0, world_size=1)
+        >>>
+        >>> # Build pipe.
+        >>> fc1 = nn.Linear(16, 8).cuda(0)
+        >>> fc2 = nn.Linear(8, 4).cuda(1)
+        >>> model = nn.Sequential(fc1, fc2)
+        >>> model = Pipe(model, chunks=8)
+        >>> input = torch.rand(16, 16).cuda(0)
+        >>> output_rref = model(input)
+
+    .. note::
+        You can wrap a :class:`Pipe` model with
+        :class:`torch.nn.parallel.DistributedDataParallel` only when the
+        checkpoint parameter of :class:`Pipe` is ``'never'``.
+
+    .. note::
+        :class:`Pipe` only supports intra-node pipelining currently, but
+        will be expanded to support inter-node pipelining in the future.
+        The forward function returns an :class:`~torch.distributed.rpc.RRef`
+        to allow for inter-node pipelining in the future, where the output
+        might be on a remote host. For intra-node pipelining you can use
+        :meth:`~torch.distributed.rpc.RRef.local_value` to retrieve the
+        output locally.
+
+    .. warning::
+        :class:`Pipe` is experimental and subject to change.
+    """
+
+    def __init__(
+        self,
+        module: nn.Sequential,
+        chunks: int = 1,
+        checkpoint: str = "except_last",
+        deferred_batch_norm: bool = False,
+    ) -> None:
+        super().__init__()
+
+        # Check if RPC framework is initialized.
+        if not torch.distributed.rpc._is_current_rpc_agent_set():
+            raise RuntimeError(
+                'Please initialize RPC framework for Pipe using '
+                'torch.distributed.rpc.init_rpc')
+
+        chunks = int(chunks)
+        checkpoint = str(checkpoint)
+
+        if chunks <= 0:
+            raise ValueError("number of chunks must be positive integer")
+        if checkpoint not in ["always", "except_last", "never"]:
+            raise ValueError("checkpoint is not one of 'always', 'except_last', or 'never'")
+
+        _verify_module(module)
+
+        # Verify if the underlying skippable modules satisfy integrity. The
+        # integrity can be verified before forward() because it is static.
+        verify_skippables(module)
+
+        self.chunks = chunks
+        self.checkpoint = checkpoint
+
+        if deferred_batch_norm:
+            module = DeferredBatchNorm.convert_deferred_batch_norm(module, chunks)
+
+        self.partitions, self.devices = _split_module(module)
+        _verify_splitting(module, self.partitions, self.devices)
+
+        self._copy_streams: List[List[AbstractStream]] = []
+        self._skip_layout = inspect_skip_layout(self.partitions)
+
+        # Separate CUDA streams for copy.
+        copy_streams = self._ensure_copy_streams()
+
+        # The micro-batch index where the checkpointing stops.
+        checkpoint_stop = {"always": self.chunks, "except_last": self.chunks - 1, "never": 0}[self.checkpoint]
+
+        self.pipeline = Pipeline(self.partitions, self.devices, copy_streams, self._skip_layout, checkpoint_stop)
+
+    def __len__(self) -> int:
+        """Counts the length of the underlying sequential module."""
+        return sum(len(p) for p in self.partitions)
+
+    def __getitem__(self, index: int) -> nn.Module:
+        """Gets a layer in the underlying sequential module."""
+        partitions = self.partitions
+        if index < 0:
+            partitions = partitions[::-1]
+
+        for partition in partitions:
+            try:
+                return partition[index]
+            except IndexError:
+                pass
+
+            shift = len(partition)
+
+            if index < 0:
+                index += shift
+            else:
+                index -= shift
+
+        raise IndexError
+
+    def __iter__(self) -> Iterator[nn.Module]:
+        """Iterates over children of the underlying sequential module."""
+        for partition in self.partitions:
+            yield from partition
+
+    # Pipe should manage the device of each partition.
+    # Deny cuda(), cpu(), and to() with device, by TypeError.
+    def cuda(self, device: Optional[Device] = None) -> "Pipe":
+        raise MOVING_DENIED
+
+    def cpu(self) -> "Pipe":
+        raise MOVING_DENIED
+
+    def to(self, *args: Any, **kwargs: Any) -> "Pipe":
+        # Deny these usages:
+        #
+        # - to(device[, dtype, non_blocking])
+        # - to(tensor[, non_blocking])
+        #
+        # But allow this:
+        #
+        # - to(dtype[, non_blocking])
+        #
+        if "device" in kwargs or "tensor" in kwargs:
+            raise MOVING_DENIED
+
+        if args:
+            if isinstance(args[0], (torch.device, int, str)):
+                raise MOVING_DENIED
+            if torch.is_tensor(args[0]):
+                raise MOVING_DENIED
+
+        return super().to(*args, **kwargs)
+
+    def _ensure_copy_streams(self) -> List[List[AbstractStream]]:
+        """Ensures that :class:`Pipe` caches CUDA streams for copy.
+
+        It's worth caching CUDA streams even though PyTorch already manages a
+        pool of pre-allocated CUDA streams, because doing so may reduce GPU
+        memory fragmentation when the number of micro-batches is small.
+
+        """
+        if not self._copy_streams:
+            for device in self.devices:
+                self._copy_streams.append([new_stream(device) for _ in range(self.chunks)])
+
+        return self._copy_streams
+
+    def forward(self, *inputs) -> RRef:
+        """
+        Processes a single input mini-batch through the pipe and returns an
+        :class:`~torch.distributed.rpc.RRef` pointing to the output.
+        :class:`Pipe` is a fairly transparent module wrapper. It doesn't
+        modify the input and output signature of the underlying module. But
+        there's a type restriction: the input and the output have to contain at
+        least one tensor. This restriction applies at partition boundaries too.
+
+        The sequence of inputs is fed into the first stage of the pipeline as
+        ``*inputs``. As a result, the positional args for this function should
+        match the positional args for the first stage of the pipeline. The same
+        condition applies to the output of one stage of the pipeline, which is
+        the input for the next stage.
+
+        The input tensor is split into multiple micro-batches based on the
+        ``chunks`` parameter used to initialize :class:`Pipe`. The batch size
+        is assumed to be the first dimension of the tensor and if the batch
+        size is less than ``chunks``, the number of micro-batches is equal to
+        the batch size.
+
+        Only tensors are split into multiple micro-batches; non-Tensor inputs
+        are just replicated as-is in each micro-batch. Non-Tensor outputs from
+        the last stage of the pipeline are aggregated as a ``List`` and
+        returned to the user. For example, if you have 2 micro-batches
+        returning the integer 5, the user would receive the consolidated
+        output of ``[5, 5]``.
+
+        All the input tensors need to be on the same device as the first
+        partition of the pipeline.
+
+        If a tensor is wrapped with the :class:`NoChunk` wrapper, the tensor
+        is not split across micro-batches and is replicated as-is similar to
+        non-tensors.
+
+        Args:
+            inputs: input mini-batch
+
+        Returns:
+            :class:`~torch.distributed.rpc.RRef` to the output of the mini-batch
+
+        Raises:
+            TypeError: input doesn't contain at least one tensor
+
+        """
+        first_partition_device = self.devices[0] if len(self.devices) != 0 else torch.device("cpu")
+        microbatch.check(first_partition_device, *inputs)
+
+        if not self.devices:
+            # Empty sequential module is not illegal.
+            return RRef(*inputs)
+
+        # Divide a mini-batch into micro-batches.
+        batches = microbatch.scatter(*inputs, chunks=self.chunks)
+
+        # Run pipeline parallelism.
+        self.pipeline.run(batches)
+
+        # Merge the micro-batches into one mini-batch.
+        output = microbatch.gather(batches)
+        return RRef(output)
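+
+# A minimal usage sketch (illustrative only, not part of the module): it
+# assumes a two-layer model split across two CUDA devices and that the RPC
+# framework required by :class:`Pipe` has already been initialized elsewhere.
+#
+#     fc1 = nn.Linear(16, 8).cuda(0)
+#     fc2 = nn.Linear(8, 4).cuda(1)
+#     model = Pipe(nn.Sequential(fc1, fc2), chunks=4)
+#     output_rref = model(torch.randn(32, 16).cuda(0))
+#     output = output_rref.to_here()  # fetch the gathered mini-batch locally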
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/pipeline.py b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c9dc1c93dab43bd28bb76ca6bb6ec389a046d33
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/pipeline.py
@@ -0,0 +1,255 @@
+# Copyright 2019 Kakao Brain
+#
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+"""The pipeline parallelism of Pipe."""
+from queue import Queue
+from types import TracebackType
+from typing import TYPE_CHECKING, Iterable, List, Optional, Tuple, Type, Union, cast, Sequence
+
+import torch
+from torch import Tensor, nn
+from torch.autograd.profiler import record_function
+
+from .checkpoint import Checkpointing
+from .copy import Copy, Wait
+from .dependency import fork, join
+from .microbatch import Batch
+from .skip.layout import SkipLayout
+from .skip.tracker import SkipTrackerThroughPotals, use_skip_tracker
+from .stream import AbstractStream, current_stream, use_device
+from .worker import Task, create_workers
+
+__all__: List[str] = ["Pipeline"]
+
+
+Tensors = Sequence[Tensor]
+TensorOrTensors = Union[Tensor, Tensors]
+
+ExcInfo = Tuple[Type[BaseException], BaseException, TracebackType]
+
+# Queue is generic only in stubs.
+# https://mypy.readthedocs.io/en/latest/common_issues.html#using-classes-that-are-generic-in-stubs-but-not-at-runtime
+if TYPE_CHECKING:
+    InQueue = Queue[Optional["Task"]]
+    OutQueue = Queue[Tuple[bool, Union[Tuple["Task", Batch], ExcInfo, None]]]
+else:
+    InQueue = Queue
+    OutQueue = Queue
+
+
+def _depend(fork_from: Batch, join_to: Batch) -> None:
+    fork_from_idx = fork_from.find_tensor_idx()
+    join_to_idx = join_to.find_tensor_idx()
+
+    fork_from[fork_from_idx], phony = fork(fork_from[fork_from_idx])
+    join_to[join_to_idx] = join(join_to[join_to_idx], phony)
+
+
+def _copy(batch: Batch, prev_stream: AbstractStream, next_stream: AbstractStream) -> None:
+    batch[:] = Copy.apply(prev_stream, next_stream, *batch)
+    # Gradients are only supported for float Tensors.
+    batch[:] = tuple([x.detach() if torch.is_tensor(x) and not x.is_floating_point() else x for x in batch])
+
+
+def _wait(batch: Batch, prev_stream: AbstractStream, next_stream: AbstractStream) -> None:
+    batch[:] = Wait.apply(prev_stream, next_stream, *batch)
+    # Gradients are only supported for float Tensors.
+    batch[:] = tuple([x.detach() if torch.is_tensor(x) and not x.is_floating_point() else x for x in batch])
+
+
+def _clock_cycles(m: int, n: int) -> Iterable[List[Tuple[int, int]]]:
+    """Generate schedules for each clock cycle."""
+    # m: number of micro-batches
+    # n: number of partitions
+    # i: index of micro-batch
+    # j: index of partition
+    # k: clock number
+    #
+    # k (i,j) (i,j) (i,j)
+    # - ----- ----- -----
+    # 0 (0,0)
+    # 1 (1,0) (0,1)
+    # 2 (2,0) (1,1) (0,2)
+    # 3       (2,1) (1,2)
+    # 4             (2,2)
+    for k in range(m + n - 1):
+        yield [(k - j, j) for j in range(max(1 + k - m, 0), min(1 + k, n))]
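+
+# For example (derived directly from the generator above, and matching the
+# schedule table in the comment):
+#
+#     >>> list(_clock_cycles(3, 3))
+#     [[(0, 0)],
+#      [(1, 0), (0, 1)],
+#      [(2, 0), (1, 1), (0, 2)],
+#      [(2, 1), (1, 2)],
+#      [(2, 2)]]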
+
+
+class Pipeline:
+    """The pipeline parallelism for Pipe."""
+
+    def __init__(
+        self,
+        partitions: List[nn.Sequential],
+        devices: List[torch.device],
+        copy_streams: List[List[AbstractStream]],
+        skip_layout: SkipLayout,
+        checkpoint_stop: int,
+    ) -> None:
+        self.partitions = partitions
+        self.devices = devices
+        self.copy_streams = copy_streams
+        self.skip_layout = skip_layout
+        self.checkpoint_stop = checkpoint_stop
+        (self.in_queues, self.out_queues) = create_workers(devices)
+
+    def run(self, batches: List[Batch]) -> None:
+        """Runs pipeline parallelism.
+
+        It modifies the given batches in place.
+
+        """
+        partitions = self.partitions
+        devices = self.devices
+        skip_layout = self.skip_layout
+
+        m = len(batches)
+        n = len(partitions)
+
+        skip_trackers = [SkipTrackerThroughPotals(skip_layout) for _ in batches]
+
+        for schedule in _clock_cycles(m, n):
+            self.fence(batches, schedule, skip_trackers)
+            self.compute(batches, schedule, skip_trackers)
+
+    def fence(
+        self, batches: List[Batch], schedule: List[Tuple[int, int]], skip_trackers: List[SkipTrackerThroughPotals],
+    ) -> None:
+        """Copy micro-batches after computation for the previous micro-batches."""
+        copy_streams = self.copy_streams
+        skip_layout = self.skip_layout
+
+        for i, j in schedule:
+            # Ensure that batches[i-1] is executed after batches[i] in
+            # backpropagation by an explicit dependency.
+            if i != 0 and j != 0:
+                _depend(batches[i - 1], batches[i])
+
+            next_stream = copy_streams[j][i]
+
+            for prev_j, ns, name in skip_layout.copy_policy(j):
+                prev_stream = copy_streams[prev_j][i]
+                skip_trackers[i].copy(batches[i], prev_stream, next_stream, ns, name)
+
+            if j != 0:
+                prev_stream = copy_streams[j - 1][i]
+                _copy(batches[i], prev_stream, next_stream)
+
+    def compute(
+        self, batches: List[Batch], schedule: List[Tuple[int, int]], skip_trackers: List[SkipTrackerThroughPotals],
+    ) -> None:
+        """Run tasks with synchronization to copy streams."""
+        partitions = self.partitions
+        devices = self.devices
+        copy_streams = self.copy_streams
+        checkpoint_stop = self.checkpoint_stop
+
+        # Disable checkpointing if in eval mode.
+        if not self.partitions[0].training:
+            checkpoint_stop = 0
+
+        n = len(partitions)
+        streams = [current_stream(d) for d in devices]
+        exc_info: Optional[ExcInfo] = None
+
+        # With checkpointing, the autograd graph looks like this diagram:
+        # ┌─────┸──────┐
+        # │    Copy    │
+        # └─────┰──────┘   (fence)
+        # ─ ─ ─ ╂ ─ ─ ─ ─ ─ ─ ─ ─ ─
+        #       ┃          (compute)
+        # ┌─────┸──────┐
+        # │    Wait    │ [1] Synchronize the current stream with the copy stream.
+        # └─────┰──────┘
+        # ┌─────┸──────┐
+        # │ Checkpoint │ [2] Compute a partition within checkpointing.
+        # └─────┰──────┘
+        # ┌─────┸──────┐
+        # │    Wait    │ [3] Synchronize the copy stream with the current stream.
+        # └─────┰──────┘
+        #       ┠ ─ ─ ─ ┐
+        #       ┃ ┌─────┴─────┐
+        #       ┃ │ Recompute │ [4] Schedule the recomputation at backpropagation.
+        #       ┃ └─────┬─────┘
+        #       ┠ ─ ─ ─ ┘
+        #       ┃
+        # ─ ─ ─ ╂ ─ ─ ─ ─ ─ ─ ─ ─ ─
+        # ┌─────┸──────┐   (fence)
+        # │    Copy    │
+        # └─────┰──────┘
+        for i, j in schedule:
+            batch = batches[i]
+            partition = partitions[j]
+
+            # Synchronize with the copied input. ([1] in the diagram)
+            if j != 0:
+                _wait(batch, copy_streams[j][i], streams[j])
+
+            # Determine whether to checkpoint this micro-batch.
+            checkpoint = i < checkpoint_stop
+            if checkpoint:
+
+                def function(
+                    *inputs,
+                    partition: nn.Module = partition,
+                    skip_tracker: SkipTrackerThroughPotals = skip_trackers[i],
+                    chunk_id: int = i,
+                    part_id: int = j,
+                ) -> TensorOrTensors:
+                    with use_skip_tracker(skip_tracker), record_function("chunk%d-part%d" % (chunk_id, part_id)):
+                        return partition(*inputs)
+
+                chk = Checkpointing(function, batch)  # type: ignore[arg-type]
+                task = Task(streams[j], compute=chk.checkpoint, finalize=chk.recompute)
+                del function, chk
+
+            else:
+
+                def compute(
+                    batch: Batch = batch,
+                    partition: nn.Module = partition,
+                    skip_tracker: SkipTrackerThroughPotals = skip_trackers[i],
+                    chunk_id: int = i,
+                    part_id: int = j,
+                ) -> Batch:
+                    with use_skip_tracker(skip_tracker), record_function("chunk%d-part%d" % (chunk_id, part_id)):
+                        return batch.call(partition)
+
+                task = Task(streams[j], compute=compute, finalize=None)
+                del compute
+
+            # Compute tasks in parallel. ([2] in the diagram)
+            self.in_queues[j].put(task)
+
+        for i, j in schedule:
+            ok, payload = self.out_queues[j].get()
+
+            # Hold the first exception.
+            if exc_info is not None:
+                continue
+            elif not ok:
+                exc_info = cast(ExcInfo, payload)
+                continue
+
+            task, batch = cast(Tuple[Task, Batch], payload)
+
+            # The copy stream synchronizes to copy the output. ([3] in the
+            # diagram)
+            if j != n - 1:
+                _wait(batch, streams[j], copy_streams[j][i])
+
+            # Finalize tasks. If checkpointing is enabled, here the
+            # recomputation is scheduled at backpropagation. ([4] in the
+            # diagram)
+            with use_device(devices[j]):
+                task.finalize(batch)
+
+            batches[i] = batch
+
+        # Fail at the first exception.
+        if exc_info is not None:
+            raise exc_info[0].with_traceback(exc_info[1], exc_info[2])
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/py.typed b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/py.typed
new file mode 100644
index 0000000000000000000000000000000000000000..f4830a6416775aae091858a4ac5158ce69f7de29
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/py.typed
@@ -0,0 +1,6 @@
+# Copyright 2019 Kakao Brain
+#
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/__init__.py b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e79f0eaa9f6ecef7c31880f25348eb6f4704ec68
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/__init__.py
@@ -0,0 +1,11 @@
+# Copyright 2019 Kakao Brain
+#
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+"""Supports efficiency with skip connections."""
+from .namespace import Namespace
+from .skippable import pop, skippable, stash, verify_skippables
+
+__all__ = ["skippable", "stash", "pop", "verify_skippables", "Namespace"]
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9f48cff79887d45cec515fbc87b00dc3f7a0cfc9
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/__pycache__/layout.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/__pycache__/layout.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..47bbd3d583cecfd48e0c002c49c4c30a3c763862
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/__pycache__/layout.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/__pycache__/namespace.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/__pycache__/namespace.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9bd9d0a359df1d44821b77e6f6d0df8d56226b41
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/__pycache__/namespace.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/__pycache__/portal.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/__pycache__/portal.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..665915b1e8e8c081a767fea4b683998a8e8ca516
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/__pycache__/portal.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/__pycache__/skippable.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/__pycache__/skippable.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4fbb7c1c435edba8a8ffc66ea9f11ef1934357e5
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/__pycache__/skippable.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/__pycache__/tracker.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/__pycache__/tracker.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6b2d9a3c42f4f4cfc15fccc08a3b280ba37277a6
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/__pycache__/tracker.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/layout.py b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/layout.py
new file mode 100644
index 0000000000000000000000000000000000000000..332108af23a30b0d70c9a4b3cf45b32d14c42375
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/layout.py
@@ -0,0 +1,92 @@
+# Copyright 2019 Kakao Brain
+#
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+"""Static skip connection layout of ``@skippable`` modules."""
+from typing import Dict, Iterable, List, Tuple
+
+from torch import nn
+
+from .namespace import Namespace
+
+__all__: List[str] = []
+
+
+class SkipLayout:
+    """Represents a skip connection layout across partitions."""
+
+    # Skip routes indexed by 'ns, name': {(ns, name): (prev_j, next_j), ...}
+    by_ns_name: Dict[Tuple[Namespace, str], Tuple[int, int]]
+
+    # Skip routes indexed by partition number 'j': [[next_j]: [(prev_j, ns, name), ...], ...]
+    by_partition: List[List[Tuple[int, Namespace, str]]]
+
+    def __init__(self, num_partitions: int, skip_routes: Dict[Tuple[Namespace, str], Tuple[int, int]],) -> None:
+        # The skip routes are already indexed by 'ns, name'.
+        self.by_ns_name = skip_routes
+
+        # Index skip routes by partition number 'j'.
+        self.by_partition = [[] for _ in range(num_partitions)]
+
+        for (ns, name), (prev_j, next_j) in skip_routes.items():
+            self.by_partition[next_j].append((prev_j, ns, name))
+
+        for p in self.by_partition:
+            p.sort()
+
+    def copy_policy(self, next_j: int) -> Iterable[Tuple[int, Namespace, str]]:
+        """Generates skip routes for the given destination partition number.
+        The skip routes are sorted by source partition number in ascending
+        order.
+
+        Yields:
+            Each tuple of (source partition number, namespace, name).
+
+        """
+        for prev_j, ns, name in self.by_partition[next_j]:
+            if prev_j == next_j:
+                # This skip tensor will be popped at the same partition where
+                # it is stashed. In this case, copy is not required.
+                continue
+
+            yield (prev_j, ns, name)
+
+    def requires_copy(self, ns: Namespace, name: str) -> bool:
+        """Whether the given namespace and name requires partition-to-partition
+        copy or not.
+        """
+        prev_j, next_j = self.by_ns_name.get((ns, name), (-1, -1))
+        return prev_j != next_j
+
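+    # A small illustration of the layout bookkeeping (``ns`` is a hypothetical
+    # Namespace instance): with two partitions and a single route stashed at
+    # partition 0 and popped at partition 1,
+    #
+    #     layout = SkipLayout(2, {(ns, "1to3"): (0, 1)})
+    #     layout.requires_copy(ns, "1to3")   # True: the route crosses partitions
+    #     list(layout.copy_policy(1))        # [(0, ns, "1to3")]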
+
+def inspect_skip_layout(partitions: List[nn.Sequential]) -> SkipLayout:
+    """Inspects the skip connection layout in the given partitions."""
+    # NOTE(sublee): Hide circular import inside this subroutine. Circular
+    # import is not ideal but placing this logic near to SkipLayout may
+    # increase cohesion of code.
+    from .skippable import Skippable
+
+    skip_routes: Dict[Tuple[Namespace, str], Tuple[int, int]] = {}
+    stashed_at: Dict[Tuple[Namespace, str], int] = {}
+
+    for j, partition in enumerate(partitions):
+        def inspect_layer(layer):
+            if not isinstance(layer, Skippable):
+                return
+
+            for ns, name in layer.stashable():
+                stashed_at[(ns, name)] = j
+
+            for ns, name in layer.poppable():
+                prev_j = stashed_at.pop((ns, name))
+                skip_routes[(ns, name)] = (prev_j, j)
+
+        if isinstance(partition, nn.Sequential):
+            for layer in partition:
+                inspect_layer(layer)
+        else:
+            inspect_layer(partition)
+
+    return SkipLayout(len(partitions), skip_routes)
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/namespace.py b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/namespace.py
new file mode 100644
index 0000000000000000000000000000000000000000..4fcb2687be37496c932e0c8e83ed811f82ec0ae2
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/namespace.py
@@ -0,0 +1,50 @@
+# Copyright 2019 Kakao Brain
+#
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+"""Provides isolated namespace of skip tensors."""
+import abc
+from functools import total_ordering
+from typing import Any
+import uuid
+
+__all__ = ["Namespace"]
+
+
+@total_ordering
+class Namespace(metaclass=abc.ABCMeta):
+    """Namespace for isolating skip tensors used by :meth:`isolate()
+    `.
+    """
+
+    __slots__ = ("id",)
+
+    def __init__(self) -> None:
+        self.id = uuid.uuid4()
+
+    def __repr__(self) -> str:
+        return f""
+
+    def __hash__(self) -> int:
+        return hash(self.id)
+
+    # Namespaces should support ordering, since SkipLayout will sort tuples
+    # including a namespace. But actual order between namespaces is not
+    # important. That's why they are ordered by version 4 UUID which generates
+    # random numbers.
+    def __lt__(self, other: Any) -> bool:
+        if isinstance(other, Namespace):
+            return self.id < other.id
+        return False
+
+    def __eq__(self, other: object) -> bool:
+        if isinstance(other, Namespace):
+            return self.id == other.id
+        return False
+
+
+# 'None' is the default namespace,
+# which means that 'isinstance(None, Namespace)' is 'True'.
+Namespace.register(type(None))
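+
+# For example, two namespaces compare unequal and still sort deterministically,
+# and ``None`` acts as the default namespace (illustrative comment only):
+#
+#     ns1, ns2 = Namespace(), Namespace()
+#     ns1 == ns2                    # False: distinct version-4 UUIDs
+#     sorted([ns2, ns1])            # ordered by UUID; the order itself is arbitrary
+#     isinstance(None, Namespace)   # True, because of the register() call above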
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/portal.py b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/portal.py
new file mode 100644
index 0000000000000000000000000000000000000000..97481245907908074364ac90b5ff7c918d1c423e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/portal.py
@@ -0,0 +1,231 @@
+# Copyright 2019 Kakao Brain
+#
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+"""Portal keeps a tensor in the pocket plane. The tensor becomes hidden to the
+autograd engine. The shared context of three functions (:class:`PortalBlue`,
+:class:`PortalOrange`, and :class:`PortalCopy`) out of the computation graph is
+one of the most important feature of :mod:`torchpipe.skip`.
+
+The metaphor is inspired by Portal™ from Valve.
+
+"""
+from typing import List, Optional, Tuple
+
+import torch
+from torch import Tensor
+
+from ..copy import Context as CopyContext
+from ..copy import Copy
+from ..phony import get_phony
+from ..stream import AbstractStream, get_device
+
+__all__: List[str] = []
+
+
+class Portal:
+    """A portal for a tensor."""
+
+    def __init__(self, tensor: Optional[Tensor], tensor_life: int) -> None:
+        self.put_tensor(tensor, tensor_life)
+        self.grad: Optional[Tensor] = None
+
+    def blue(self) -> Tensor:
+        """Creates a :class:`PortalBlue` which hides the underlying tensor from
+        the autograd engine.
+
+        Join the returned phony to the main lane of the autograd graph to
+        ensure correct backpropagation::
+
+            PortalBlue --+
+                         |
+            ---------- Join --
+
+        """
+        tensor = self.use_tensor()
+
+        if tensor is None:
+            return get_phony(torch.device("cpu"), requires_grad=False)
+
+        return PortalBlue.apply(self, tensor)
+
+    def orange(self, phony: Tensor) -> Optional[Tensor]:
+        """Creates a :class:`PortalOrange` which retrieves the hidden tensor
+        without losing the ability to backpropagate.
+
+        Give a phony forked from the main lane of an autograd graph::
+
+                +-- PortalOrange --+
+                |                  |
+            -- Fork --------- f(a, b) --
+
+        """
+        self.check_tensor_life()
+
+        if self.tensor is None:
+            return self.use_tensor()
+
+        return PortalOrange.apply(self, phony)
+
+    def copy(self, prev_stream: AbstractStream, next_stream: AbstractStream, phony: Tensor,) -> Tensor:
+        """Copies the hidden tensor by a :class:`PortalCopy`.
+
+        Give a phony and use the returning phony to keep backpropagation::
+
+                +-- PortalCopy --+
+                |                |
+            -- Fork ---------- Join --
+
+        """
+        if self.tensor is None:
+            return get_phony(torch.device("cpu"), requires_grad=False)
+
+        return PortalCopy.apply(self, prev_stream, next_stream, phony)
+
+    def check_tensor_life(self) -> None:
+        if self.tensor_life <= 0:
+            raise RuntimeError("tensor in portal has been removed")
+
+    def put_tensor(self, tensor: Optional[Tensor], tensor_life: int) -> None:
+        """Stores a tensor into this portal."""
+        # [Life of Tensor through Portal]
+        #
+        # The tensor can be retrieved by use_tensor() up to 'tensor_life'
+        # times. When the life becomes 0, the tensor will be deleted for
+        # deallocation in CUDA memory.
+        #
+        # The below events participate in a tensor through a portal.
+        # Note that [x] denotes the events which call use_tensor():
+        #
+        #  1. [x] blue()
+        #  2. [ ]   PortalBlue.forward
+        #  3. [ ] copy()
+        #  4. [ ]   PortalCopy.forward
+        #  5. [ ] orange()
+        #  6. [x]   PortalOrange.forward
+        # - - - - - - - - - - - - - - - - - - - - - - - - - - -
+        #  7. [ ] orange() (recomputed)
+        #  8. [x]   PortalOrange.forward (recomputed)
+        #  9. [ ]   PortalOrange.backward
+        # 10. [ ] PortalCopy.backward
+        # 11. [x] blue() (recomputed)
+        # 12. [ ]   PortalBlue.forward (recomputed)
+        # 13. [ ]   PortalBlue.backward
+        #
+        self.tensor_life = tensor_life
+
+        if tensor_life > 0:
+            self.tensor = tensor
+        else:
+            self.tensor = None
+
+    def use_tensor(self) -> Optional[Tensor]:
+        """Retrieves the underlying tensor and decreases the tensor  life. When
+        the life becomes 0, it the tensor will be removed.
+        """
+        self.check_tensor_life()
+
+        tensor = self.tensor
+
+        self.tensor_life -= 1
+
+        if self.tensor_life <= 0:
+            self.tensor = None
+
+        return tensor
+
+    def put_grad(self, grad: Tensor) -> None:
+        """Stores a gradient into this portal."""
+        self.grad = grad
+
+    def use_grad(self) -> Tensor:
+        """Retrieves and removes the underlying gradient. The gradient is
+        always ephemeral.
+        """
+        if self.grad is None:
+            raise RuntimeError("grad in portal has been removed or never set")
+
+        grad = self.grad
+        self.grad = None
+        return grad
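+
+    # A minimal sketch of the tensor-life bookkeeping above (no autograd
+    # involved; purely illustrative):
+    #
+    #     p = Portal(torch.zeros(1), tensor_life=2)
+    #     p.use_tensor()   # returns the tensor, life 2 -> 1
+    #     p.use_tensor()   # returns the tensor, life 1 -> 0, tensor is dropped
+    #     p.use_tensor()   # RuntimeError: tensor in portal has been removed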
+
+
+# Common interface between :class:`PortalBlue`, :class:`PortalOrange`, and
+# :class:`PortalCopy`.
+class Context(CopyContext):
+    portal: Portal
+
+
+class PortalBlue(torch.autograd.Function):
+    """Hides a tensor from the autograd engine by a :class:`Portal`."""
+
+    @staticmethod
+    # type: ignore[override]
+    def forward(
+        ctx: Context,
+        portal: Portal,
+        # This tensor must be retrieved by portal.use_tensor().
+        tensor: Tensor,
+    ) -> Tensor:
+        ctx.portal = portal
+
+        phony = get_phony(tensor.device, requires_grad=False)
+        return phony.detach()
+
+    @staticmethod
+    # type: ignore[override]
+    def backward(ctx: Context, grad_phony: Tensor,) -> Tuple[None, Tensor]:
+        # The paired PortalOrange should keep the gradient.
+        grad = ctx.portal.use_grad()
+        return None, grad
+
+
+class PortalOrange(torch.autograd.Function):
+    """Retrieves the hidden tensor from a :class:`Portal`."""
+
+    @staticmethod
+    # type: ignore[override]
+    def forward(ctx: Context, portal: Portal, phony: Tensor) -> Tensor:
+        ctx.portal = portal
+
+        tensor = portal.use_tensor()
+        assert tensor is not None
+
+        return tensor.detach()
+
+    @staticmethod
+    def backward(ctx: Context, grad: Tensor) -> Tuple[None, None]:  # type: ignore[override]
+        # The paired PortalBlue will use the gradient.
+        ctx.portal.put_grad(grad)
+        return None, None
+
+
+class PortalCopy(torch.autograd.Function):
+    """Copies the hidden tensor in a :class:`Portal`. It replaces the hidden
+    tensor with the copied one.
+    """
+
+    @staticmethod
+    # type: ignore[override]
+    def forward(
+        ctx: Context, portal: Portal, prev_stream: AbstractStream, next_stream: AbstractStream, phony: Tensor,
+    ) -> Tensor:
+        ctx.portal = portal
+
+        assert portal.tensor is not None
+        (portal.tensor,) = Copy.forward(ctx, prev_stream, next_stream, portal.tensor)
+
+        phony = get_phony(get_device(next_stream), requires_grad=False)
+        return phony.detach()
+
+    @staticmethod
+    # type: ignore[override]
+    def backward(ctx: Context, grad_phony: Tensor,) -> Tuple[None, None, None, None]:
+        portal = ctx.portal
+
+        assert portal.grad is not None
+        _, _, portal.grad = Copy.backward(ctx, portal.grad)
+
+        return None, None, None, None
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/skippable.py b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/skippable.py
new file mode 100644
index 0000000000000000000000000000000000000000..8deaa5bb7b0ea3df2c36ecf38ed964eaac2130d8
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/skippable.py
@@ -0,0 +1,431 @@
+# Copyright 2019 Kakao Brain
+#
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+"""The user interface to define skip connections."""
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    ClassVar,
+    Dict,
+    FrozenSet,
+    Generator,
+    Iterable,
+    List,
+    Optional,
+    Set,
+    Sequence,
+    Tuple,
+    Type,
+    TypeVar,
+    Union,
+    cast,
+)
+
+from torch import Tensor, nn
+
+from ..microbatch import Batch
+from .namespace import Namespace
+from .tracker import current_skip_tracker
+
+__all__ = ["skippable", "stash", "pop", "verify_skippables"]
+
+
+Tensors = Sequence[Tensor]
+TensorOrTensors = Union[Tensor, Tensors]
+
+StashPop = Union["stash", "pop"]
+StashPopGenerator = Generator[StashPop, Optional[Tensor], TensorOrTensors]
+if TYPE_CHECKING:
+    # Typechecking: nn.Module is not a Generic
+    SkippableModule = nn.Module[Union[StashPopGenerator, TensorOrTensors]]  # type: ignore[type-arg]
+else:
+    SkippableModule = nn.Module
+
+T = TypeVar("T", bound="Skippable")
+
+
+class Skippable(nn.Module):
+    """The base class for skippable modules.
+
+    Do not use this class directly. Define a subclass by :func:`skippable`
+    instead.
+
+    """
+
+    module_cls: ClassVar[Type[SkippableModule]]
+    stashable_names: ClassVar[FrozenSet[str]]
+    poppable_names: ClassVar[FrozenSet[str]]
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super().__init__()
+        self.module = self.module_cls(*args, **kwargs)  # type: ignore[call-arg]
+        self.namespaces: Dict[str, Namespace] = {}
+
+    def __repr__(self) -> str:
+        return f"@skippable({self.module})"
+
+    def namespaced(self, name: str) -> Tuple[Namespace, str]:
+        """Prepend namespace for the given skip name."""
+        ns = self.namespaces.get(name)
+        ns = cast(Namespace, ns)
+        return (ns, name)
+
+    def stashable(self) -> Iterable[Tuple[Namespace, str]]:
+        """Iterate over namespaced skip names to be stashed."""
+        for name in self.stashable_names:
+            yield self.namespaced(name)
+
+    def poppable(self) -> Iterable[Tuple[Namespace, str]]:
+        """Iterate over namespaced skip names to be popped."""
+        for name in self.poppable_names:
+            yield self.namespaced(name)
+
+    def isolate(self: T, ns: Namespace, *, only: Optional[Iterable[str]] = None) -> T:
+        r"""Isolate a specified subset or the whole set of skip tensors.
+
+        In a single sequential module, skip tensors with the same
+        name are not allowed unless they are isolated by different namespaces.
+
+        Here's an example using the same name for skip tensors twice. Each pair
+        of ``Layer1`` and ``Layer3`` is isolated with its own namespace ``ns1``
+        and ``ns2``. There is no conflict anymore::
+
+            ns1 = Namespace()
+            ns2 = Namespace()
+
+            model = nn.Sequential(
+                Layer1().isolate(ns1),
+                Layer1().isolate(ns2),
+                Layer2(),
+                Layer3().isolate(ns2),
+                Layer3().isolate(ns1),
+            )
+
+        When the `only` parameter is omitted, all skip tensors are isolated. You
+        can isolate a subset of skip tensors by passing the `only` parameter::
+
+            ns_alice = Namespace()
+            ns_bob = Namespace()
+
+            model = nn.Sequential(
+                ...
+                StashStashPop().isolate(ns_alice, only=['alice']) \
+                               .isolate(ns_bob, only=['bob']),
+                ...
+            )
+
+        Args:
+            ns (Namespace):
+                namespace for isolation
+
+        Keyword Args:
+            only (iterable of strs):
+                names of specific skip tensors to be isolated (omit this option
+                to isolate all skip tensors declared in this module)
+
+        Returns:
+            this module itself
+
+        """
+        names: Iterable[str]
+
+        if only is None:
+            names = self.stashable_names | self.poppable_names
+        else:
+            names = set(only)
+
+        for name in names:
+            self.namespaces[name] = ns
+
+        return self
+
+    def dispatch(
+        self,
+        input,
+        handle_stash: Callable[[str, Optional[Tensor]], None],
+        handle_pop: Callable[[str], Optional[Tensor]],
+    ):
+        """Dispatch :class:`stash` or :class:`pop` commands.
+
+        The commands are generated by the module's ``forward()``.
+        """
+        generator = self.module(input)
+
+        if not isinstance(generator, Generator):
+            # The underlying module returned output without any yield.
+            output = generator
+            return output
+
+        try:
+            op = next(generator)
+
+            while True:
+                if isinstance(op, stash):
+                    handle_stash(op.name, op.tensor)
+                    op = next(generator)
+                    continue
+
+                if isinstance(op, pop):
+                    tensor = handle_pop(op.name)
+                    op = generator.send(tensor)
+                    continue
+
+                raise TypeError(f"{op!r} is not a command from @skippable")
+
+        except StopIteration as stop:
+            output = stop.args[0]
+            return output
+
+    def forward(self, input: Union[List[Any], Tensor]) -> TensorOrTensors:
+        """Perform the forward propagation.
+
+        :class:`stash` or :class:`pop` commands will be handled by portals
+        silently. The portals won't be exposed to users.
+
+        Raises:
+            RuntimeError:
+                illegal 'stash' or 'pop' is found.
+
+        """
+        skip_tracker = current_skip_tracker()
+        stashed_tensors: Dict[str, Optional[Tensor]] = {}
+
+        # Load skip tensors that might be popped.
+        poppable_tensors = {}
+        batch = Batch(input)
+        for ns, name in self.poppable():
+            try:
+                poppable_tensors[name] = skip_tracker.load(batch, ns, name)
+            except KeyError as e:
+                raise RuntimeError(f"'{name}' has not been stashed") from e
+        input = batch.values
+
+        # Handle skip commands.
+        def handle_stash(name: str, tensor: Optional[Tensor]) -> None:
+            if name not in self.stashable_names:
+                raise RuntimeError(f"'{name}' has not been declared as stashable")
+            stashed_tensors[name] = tensor
+
+        def handle_pop(name: str) -> Optional[Tensor]:
+            if name not in self.poppable_names:
+                raise RuntimeError(f"'{name}' has not been declared as poppable")
+            return poppable_tensors.pop(name)
+
+        output = self.dispatch(input, handle_stash, handle_pop)
+
+        # All declared skips must be stashed or popped.
+        not_stashed = self.stashable_names - stashed_tensors.keys()
+        if not_stashed:
+            comma_names = ", ".join(f"'{n}'" for n in not_stashed)
+            raise RuntimeError(f"{comma_names} must be stashed but have not")
+
+        not_popped = poppable_tensors.keys()
+        if not_popped:
+            comma_names = ", ".join(f"'{n}'" for n in not_popped)
+            raise RuntimeError(f"{comma_names} must be popped but have not")
+
+        # Save stashed skip tensors.
+        batch = Batch(output)
+        for ns, name in self.stashable():
+            tensor = stashed_tensors[name]
+            skip_tracker.save(batch, ns, name, tensor)
+        output = batch.values
+
+        return output
+
+
+# TODO(sublee): Move above the Skippable class for better reading flow.
+def skippable(
+    stash: Iterable[str] = (), pop: Iterable[str] = (),
+) -> Callable[[Type[SkippableModule]], Type[Skippable]]:
+    """Define a decorator to create :class:`nn.Module ` with skip connections.
+
+    These decorated modules are called "skippable". This functionality works perfectly
+    fine even when the module is not wrapped by :class:`~torch.distributed.pipeline.sync.Pipe`.
+
+    Each skip tensor is managed by its name. Before manipulating skip tensors,
+    a skippable module must statically declare the names for skip tensors by
+    `stash` and/or `pop` parameters. Skip tensors with pre-declared name can be
+    stashed by ``yield stash(name, tensor)`` or popped by ``tensor = yield
+    pop(name)``.
+
+    Here is an example with three layers. A skip tensor named "1to3" is stashed
+    and popped at the first and last layer, respectively::
+
+        @skippable(stash=['1to3'])
+        class Layer1(nn.Module):
+            def forward(self, input):
+                yield stash('1to3', input)
+                return f1(input)
+
+        class Layer2(nn.Module):
+            def forward(self, input):
+                return f2(input)
+
+        @skippable(pop=['1to3'])
+        class Layer3(nn.Module):
+            def forward(self, input):
+                skip_1to3 = yield pop('1to3')
+                return f3(input) + skip_1to3
+
+        model = nn.Sequential(Layer1(), Layer2(), Layer3())
+
+    One skippable module can stash or pop multiple skip tensors::
+
+        @skippable(stash=['alice', 'bob'], pop=['carol'])
+        class StashStashPop(nn.Module):
+            def forward(self, input):
+                yield stash('alice', f_alice(input))
+                yield stash('bob', f_bob(input))
+                carol = yield pop('carol')
+                return input + carol
+
+    Every skip tensor must be associated with exactly one pair of `stash` and
+    `pop`. :class:`~torch.distributed.pipeline.sync.Pipe` checks this
+    restriction automatically when wrapping a module. You can also check the
+    restriction by :func:`verify_skippables`
+    without :class:`~torch.distributed.pipeline.sync.Pipe`.
+
+    """
+    stashable_names = frozenset(stash)
+    poppable_names = frozenset(pop)
+
+    def extend_skippable(module_cls: Type[SkippableModule]) -> Type[Skippable]:
+        name = module_cls.__name__
+        bases = (Skippable,)
+        attrs = {"module_cls": module_cls, "stashable_names": stashable_names, "poppable_names": poppable_names}
+        return type(name, bases, attrs)
+
+    return extend_skippable
+
+
+class stash:
+    """The command to stash a skip tensor.
+
+    ::
+
+        def forward(self, input):
+            yield stash('name', input)
+            return f(input)
+
+    Args:
+        name (str): name of skip tensor
+        input (torch.Tensor or None): tensor to pass to the skip connection
+
+    """
+
+    __slots__ = ("name", "tensor")
+
+    def __init__(self, name: str, tensor: Optional[Tensor]) -> None:
+        self.name = name
+        self.tensor = tensor
+
+
+class pop:
+    """The command to pop a skip tensor.
+
+    ::
+
+        def forward(self, input):
+            skip = yield pop('name')
+            return f(input) + skip
+
+    Args:
+        name (str): name of skip tensor
+
+    Returns:
+        the skip tensor previously stashed by another layer under the same name
+
+    """
+
+    __slots__ = ("name",)
+
+    def __init__(self, name: str) -> None:
+        self.name = name
+
+
+def verify_skippables(module: nn.Sequential) -> None:
+    """Verify if the underlying skippable modules satisfy integrity.
+
+    Every skip tensor must have only one pair of `stash` and `pop`. If there
+    are one or more unmatched pairs, it will raise :exc:`TypeError` with the
+    detailed messages.
+
+    Here are a few failure cases. :func:`verify_skippables` will report failure
+    for these cases::
+
+        # Layer1 stashes "1to3".
+        # Layer3 pops "1to3".
+
+        nn.Sequential(Layer1(), Layer2())
+        #               └──── ?
+
+        nn.Sequential(Layer2(), Layer3())
+        #                   ? ────┘
+
+        nn.Sequential(Layer1(), Layer2(), Layer3(), Layer3())
+        #               └───────────────────┘       ^^^^^^
+
+        nn.Sequential(Layer1(), Layer1(), Layer2(), Layer3())
+        #             ^^^^^^      └───────────────────┘
+
+    To use the same name for multiple skip tensors, they must be isolated by
+    different namespaces. See :meth:`Skippable.isolate`.
+
+    Raises:
+        TypeError:
+            one or more pairs of `stash` and `pop` are not matched.
+
+    """
+    stashed: Set[Tuple[Namespace, str]] = set()
+    popped: Set[Tuple[Namespace, str]] = set()
+    msgs: List[str] = []
+
+    for layer_name, layer in module.named_children():
+        if not isinstance(layer, Skippable):
+            continue
+
+        for name in layer.stashable_names & layer.poppable_names:
+            msg = f"'{layer_name}' declared '{name}' both as stashable and as poppable"
+            msgs.append(msg)
+
+        for ns, name in layer.stashable():
+            if name in layer.poppable_names:
+                continue
+
+            if (ns, name) in stashed:
+                msg = f"'{layer_name}' redeclared '{name}' as stashable but not isolated by namespace"
+                msgs.append(msg)
+                continue
+
+            stashed.add((ns, name))
+
+        for ns, name in layer.poppable():
+            if name in layer.stashable_names:
+                continue
+
+            if (ns, name) in popped:
+                msg = f"'{layer_name}' redeclared '{name}' as poppable but not isolated by namespace"
+                msgs.append(msg)
+                continue
+
+            if (ns, name) not in stashed:
+                msg = f"'{layer_name}' declared '{name}' as poppable but it was not stashed"
+                msgs.append(msg)
+                continue
+
+            popped.add((ns, name))
+
+    for (_, name) in stashed - popped:
+        msg = f"no module declared '{name}' as poppable but stashed"
+        msgs.append(msg)
+
+    if msgs:
+        raise TypeError(
+            "one or more pairs of stash and pop do not match:\n\n%s" "" % "\n".join("* %s" % x for x in msgs)
+        )
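+
+# A minimal failure sketch (reusing the hypothetical Layer1/Layer2 pair from
+# the :func:`skippable` docstring, where Layer1 stashes '1to3' and nothing in
+# the sequence pops it):
+#
+#     verify_skippables(nn.Sequential(Layer1(), Layer2()))
+#     # TypeError: one or more pairs of stash and pop do not match:
+#     #
+#     # * no module declared '1to3' as poppable but stashed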
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/tracker.py b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/tracker.py
new file mode 100644
index 0000000000000000000000000000000000000000..33cac8b1deaea59110941867fac6250a439e01b3
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/skip/tracker.py
@@ -0,0 +1,180 @@
+# Copyright 2019 Kakao Brain
+#
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+"""Tracks skip tensors on a thread."""
+from contextlib import contextmanager
+import threading
+from typing import Dict, Generator, List, Optional, Tuple
+
+from torch import Tensor
+
+from ..checkpoint import is_checkpointing
+from ..dependency import fork, join
+from ..microbatch import Batch
+from ..stream import AbstractStream
+from .layout import SkipLayout
+from .namespace import Namespace
+from .portal import Portal
+
+__all__: List[str] = []
+
+
+class SkipTracker:
+    """Tracks saved skip tensors.
+
+    It will update the given micro-batch in place. This is because when it
+    manipulates the underlying skip tensors, the current micro-batch also has
+    to be connected with the skip tensors.
+
+    One thread has one skip tracker. Call :func:`current_skip_tracker` to get
+    the skip tracker on the current thread.
+
+    """
+
+    def __init__(self) -> None:
+        self.tensors: Dict[Tuple[Namespace, str], Optional[Tensor]] = {}
+
+    def save(self, batch: Batch, ns: Namespace, name: str, tensor: Optional[Tensor]) -> None:
+        self.tensors[(ns, name)] = tensor
+
+    def load(self, batch: Batch, ns: Namespace, name: str) -> Optional[Tensor]:
+        return self.tensors.pop((ns, name))
+
+    def copy(
+        self, batch: Batch, prev_stream: AbstractStream, next_stream: AbstractStream, ns: Namespace, name: str,
+    ) -> None:
+        raise TypeError("copy is not supported for non-portal skip tensors")
+
+
+class SkipTrackerThroughPotals(SkipTracker):
+    """Tracks saved skip tensors through portals. The skip tensors will be
+    hidden in portals so that the autograd engine does not need to track them.
+
+    This tracker is only used when the training or evaluating module is wrapped
+    with :class:`torchpipe.Pipe`.
+
+    """
+
+    def __init__(self, skip_layout: SkipLayout) -> None:
+        super().__init__()
+        self.skip_layout = skip_layout
+        self.portals: Dict[Tuple[Namespace, str], Portal] = {}
+
+    def save(self, batch: Batch, ns: Namespace, name: str, tensor: Optional[Tensor]) -> None:
+        """Saves the stashed skip tensor in a portal. The portal is then
+        connected to the given micro-batch with :class:`Join`.
+        """
+        if not self.skip_layout.requires_copy(ns, name):
+            super().save(batch, ns, name, tensor)
+            return
+
+        # See [Life of Tensor through Portal] at Portal.put_tensor() to understand the
+        # below tensor_life values. Here are the selected events which retrieve
+        # the tensor in portal:
+        #
+        #  1. [x] blue()
+        #     ...
+        #  6. [x]   PortalOrange.forward
+        #     ...
+        #  8. [x]   PortalOrange.forward (recomputed)
+        #     ...
+        # 11. [x] blue() (recomputed)
+        #
+        if (ns, name) not in self.portals:
+            if is_checkpointing():
+                # Under checkpointing, the tensor used by the first
+                # PortalOrange should be alive in the portal. This tensor will
+                # be used again by the second PortalOrange during the
+                # recomputation.
+                tensor_life = 3  # Delete at [8. PortalOrange.forward (recomputed)]
+            else:
+                tensor_life = 2  # Delete at [6. PortalOrange.forward]
+
+            portal = Portal(tensor, tensor_life)
+            self.portals[(ns, name)] = portal
+
+        else:
+            # Under recomputation, the portal already exists.
+            portal = self.portals[(ns, name)]
+
+            # The existing tensor life already became 0. It should be reset as
+            # 1 to delete the tensor after the second PortalBlue immediately.
+            tensor_life = 1  # Delete at [11. blue() (recomputed)]
+
+            portal.put_tensor(tensor, tensor_life)
+
+        phony = portal.blue()
+        tensor_idx = batch.find_tensor_idx()
+        batch[tensor_idx] = join(batch[tensor_idx], phony)
+
+    def load(self, batch: Batch, ns: Namespace, name: str) -> Optional[Tensor]:
+        """Loads a skip tensor from the corresponding portal to pop. The given
+        micro-batch is connected to the portal with :class:`Fork`.
+        """
+        if not self.skip_layout.requires_copy(ns, name):
+            tensor = super().load(batch, ns, name)
+            return tensor
+
+        portal = self.portals[(ns, name)]
+        tensor_idx = batch.find_tensor_idx()
+        batch[tensor_idx], phony = fork(batch[tensor_idx])
+        tensor = portal.orange(phony)
+        return tensor
+
+    def copy(
+        self, batch: Batch, prev_stream: AbstractStream, next_stream: AbstractStream, ns: Namespace, name: str,
+    ) -> None:
+        """Copies the skip tensor in the corresponding portal. The given
+        micro-batch and the portal will be tied with :class:`Fork` and
+        :class:`Join`.
+        """
+        assert self.skip_layout.requires_copy(ns, name)
+
+        tensor_idx = batch.find_tensor_idx()
+        batch[tensor_idx], phony = fork(batch[tensor_idx])
+
+        portal = self.portals[(ns, name)]
+        phony = portal.copy(prev_stream, next_stream, phony)
+
+        batch[tensor_idx] = join(batch[tensor_idx], phony)
+
+
+class ThreadLocal(threading.local):
+    def __init__(self) -> None:
+        self.skip_tracker: Optional[SkipTracker] = None
+
+
+thread_local = ThreadLocal()
+
+
+@contextmanager
+def use_skip_tracker(skip_tracker: SkipTracker) -> Generator[None, None, None]:
+    """Registers the given skip tracker on the current thread within a
+    context::
+
+        with use_skip_tracker(my_skip_tracker):
+            ...
+
+    """
+    orig = thread_local.skip_tracker
+
+    thread_local.skip_tracker = skip_tracker
+
+    try:
+        yield
+    finally:
+        thread_local.skip_tracker = orig
+
+
+def current_skip_tracker() -> SkipTracker:
+    """Gets the skip tracker on the current thread."""
+    skip_tracker = thread_local.skip_tracker
+
+    if skip_tracker is None:
+        skip_tracker = SkipTracker()
+        thread_local.skip_tracker = skip_tracker
+
+    return skip_tracker
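+
+# For example (illustrative; ``some_skip_layout`` is a hypothetical SkipLayout
+# built elsewhere): outside of Pipe the plain, non-portal tracker is used.
+#
+#     tracker = current_skip_tracker()          # plain SkipTracker
+#     with use_skip_tracker(SkipTrackerThroughPotals(some_skip_layout)):
+#         current_skip_tracker()                # the portal-aware tracker
+#     current_skip_tracker() is tracker         # True: previous tracker restored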
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/stream.py b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/stream.py
new file mode 100644
index 0000000000000000000000000000000000000000..54b97c29211da62cd347cf188661ab1b41e42efd
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/stream.py
@@ -0,0 +1,120 @@
+# Copyright 2019 Kakao Brain
+#
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+"""Utilities for eliminating boilerplate code to handle abstract streams with
+CPU device.
+"""
+from contextlib import contextmanager
+from typing import Generator, List, Union, cast
+
+import torch
+
+__all__: List[str] = ["CPUStreamType", "new_stream", "current_stream", "default_stream",
+                      "use_device", "use_stream", "get_device", "wait_stream", "record_stream",
+                      "is_cuda", "as_cuda"]
+
+
+class CPUStreamType:
+    pass
+
+
+# The placeholder used in place of a stream for the CPU device instead of CUDA.
+CPUStream = CPUStreamType()
+
+# It represents both CUDA streams and the CPU stream.
+AbstractStream = Union[torch.cuda.Stream, CPUStreamType]
+
+
+def new_stream(device: torch.device) -> AbstractStream:
+    """Creates a new stream for either CPU or CUDA device."""
+    if device.type != "cuda":
+        return CPUStream
+    return torch.cuda.Stream(device)
+
+
+def current_stream(device: torch.device) -> AbstractStream:
+    """:func:`torch.cuda.current_stream` for either CPU or CUDA device."""
+    if device.type != "cuda":
+        return CPUStream
+    return torch.cuda.current_stream(device)
+
+
+def default_stream(device: torch.device) -> AbstractStream:
+    """:func:`torch.cuda.default_stream` for either CPU or CUDA device."""
+    if device.type != "cuda":
+        return CPUStream
+    return torch.cuda.default_stream(device)
+
+
+@contextmanager
+def use_device(device: torch.device) -> Generator[None, None, None]:
+    """:func:`torch.cuda.device` for either CPU or CUDA device."""
+    if device.type != "cuda":
+        yield
+        return
+
+    with torch.cuda.device(device):
+        yield
+
+
+@contextmanager
+def use_stream(stream: AbstractStream) -> Generator[None, None, None]:
+    """:func:`torch.cuda.stream` for either CPU or CUDA stream."""
+    if not is_cuda(stream):
+        yield
+        return
+
+    with torch.cuda.stream(as_cuda(stream)):
+        yield
+
+
+def get_device(stream: AbstractStream) -> torch.device:
+    """Gets the device from CPU or CUDA stream."""
+    if is_cuda(stream):
+        return as_cuda(stream).device
+    return torch.device("cpu")
+
+
+def wait_stream(source: AbstractStream, target: AbstractStream) -> None:
+    """:meth:`torch.cuda.Stream.wait_stream` for either CPU or CUDA stream. It
+    makes the source stream wait until the target stream completes its queued work.
+    """
+    if is_cuda(target):
+        if is_cuda(source):
+            # A CUDA stream waits for another CUDA stream.
+            as_cuda(source).wait_stream(as_cuda(target))
+        else:
+            # The CPU waits for a CUDA stream.
+            as_cuda(target).synchronize()
+
+    # If the target is CPU, synchronization is not required.
+
+
+def record_stream(tensor: torch.Tensor, stream: AbstractStream) -> None:
+    """:meth:`torch.Tensor.record_stream` for either CPU or CUDA stream."""
+    if is_cuda(stream):
+        # NOTE(sublee): record_stream() on a shifted view tensor throws
+        # RuntimeError in PyTorch 1.1.0, and does nothing in 1.2.0. To safely
+        # protect the tensor against unexpected reallocation, here we use a
+        # temporal tensor associated with the same storage without shifting as
+        # a workaround.
+        #
+        # Issue: https://github.com/pytorch/pytorch/issues/27366
+        #
+        tensor = tensor.new_empty([0]).set_(tensor._typed_storage())
+
+        # Typechecking: torch.cuda.Stream is incompatible with torch._C.Stream
+        tensor.record_stream(as_cuda(stream))  # type: ignore[arg-type]
+
+
+def is_cuda(stream: AbstractStream) -> bool:
+    """Returns ``True`` if the given stream is a valid CUDA stream."""
+    return stream is not CPUStream
+
+
+def as_cuda(stream: AbstractStream) -> torch.cuda.Stream:
+    """Casts the given stream as :class:`torch.cuda.Stream`."""
+    return cast(torch.cuda.Stream, stream)
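+
+# For example, the CPU placeholder flows through the same helpers as a real
+# CUDA stream (the CUDA branch assumes a CUDA-capable build; illustrative
+# comment only):
+#
+#     s = new_stream(torch.device("cpu"))
+#     is_cuda(s)          # False
+#     get_device(s)       # device(type='cpu')
+#     if torch.cuda.is_available():
+#         c = new_stream(torch.device("cuda", 0))
+#         is_cuda(c)      # True
+#         get_device(c)   # device(type='cuda', index=0)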
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/utils.py b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..c32bad20160168f8994e357176f150dc5aa34012
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/utils.py
@@ -0,0 +1,38 @@
+from torch import nn
+from typing import List, Optional
+
+__all__ = ["partition_model"]
+
+def partition_model(
+        module: nn.Sequential,
+        balance: List[int],
+        devices: Optional[List[int]] = None):
+    """
+    Partitions the model across multiple GPU devices.
+
+    Given an :class:`nn.Sequential <torch.nn.Sequential>` module, partitions
+    the model across multiple GPU devices according to the provided ``balance``
+    and ``devices``.
+
+    Args:
+        module (:class:`nn.Sequential <torch.nn.Sequential>`):
+            Sequential model representing the pipe.
+        balance (List[int]):
+            List indicating the number of layers in each partition.
+        devices (List[int], optional):
+            List indicating the device to use for each partition. Defaults to
+            ``range(len(balance))``
+    """
+    device_idx = 0
+    pipe_idx = 0
+    balanced_pipe = []
+    for num_layers in balance:
+        layers = []
+        for i in range(num_layers):
+            layers.append(module[pipe_idx])
+            pipe_idx += 1
+        device = device_idx if devices is None else devices[device_idx]
+        balanced_pipe.append(nn.Sequential(*layers).to(device))
+        device_idx += 1
+
+    return nn.Sequential(*balanced_pipe)
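+
+# A minimal sketch (hypothetical layer sizes; assumes at least two GPUs since
+# ``devices`` defaults to ``range(len(balance))``):
+#
+#     model = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 4))
+#     partitioned = partition_model(model, balance=[2, 1])
+#     # partition 0: Linear + ReLU on device 0; partition 1: Linear on device 1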
diff --git a/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/worker.py b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/worker.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e9ed8ded9a30a630f537406b9ff9aea50faa88c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/pipeline/sync/worker.py
@@ -0,0 +1,132 @@
+# Copyright 2019 Kakao Brain
+#
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+"""Multithreading in pipeline parallelism."""
+from contextlib import contextmanager
+from queue import Queue
+import sys
+from threading import Thread
+from types import TracebackType
+from typing import TYPE_CHECKING, Callable, Dict, Generator, List, Optional, Tuple, Type, Union, cast
+
+import torch
+
+from .microbatch import Batch
+from .stream import AbstractStream, use_device, use_stream
+
+__all__: List[str] = ["Task", "worker", "create_workers", "spawn_workers"]
+
+
+ExcInfo = Tuple[Type[BaseException], BaseException, TracebackType]
+
+# Queue is generic only in stubs.
+# https://mypy.readthedocs.io/en/latest/common_issues.html#using-classes-that-are-generic-in-stubs-but-not-at-runtime
+if TYPE_CHECKING:
+    InQueue = Queue[Optional["Task"]]
+    OutQueue = Queue[Tuple[bool, Union[Tuple["Task", Batch], ExcInfo, None]]]
+else:
+    InQueue = Queue
+    OutQueue = Queue
+
+
+class Task:
+    """A task represents how to compute a micro-batch on a partition.
+
+    It consists of two parts: :meth:`compute` and :meth:`finalize`.
+    :meth:`compute` should be executed in worker threads concurrently.
+    :meth:`finalize` should be executed after the worker threads have finished
+    executing :meth:`compute`.
+
+    :meth:`compute` can still benefit from worker threads because the user code
+    it runs issues several CUDA API calls, and in PyTorch parallel CUDA API
+    calls are not serialized through the GIL. So more than one CUDA API call
+    can be in flight at the same time.
+
+    """
+
+    def __init__(
+        self, stream: AbstractStream, *, compute: Callable[[], Batch], finalize: Optional[Callable[[Batch], None]],
+    ) -> None:
+        self.stream = stream
+        self._compute = compute
+        self._finalize = finalize
+        self._grad_enabled = torch.is_grad_enabled()
+
+    def compute(self) -> Batch:
+        with use_stream(self.stream), torch.set_grad_enabled(self._grad_enabled):
+            return self._compute()
+
+    def finalize(self, batch: Batch) -> None:
+        if self._finalize is None:
+            return
+        with use_stream(self.stream), torch.set_grad_enabled(self._grad_enabled):
+            self._finalize(batch)
+
+
+def worker(in_queue: InQueue, out_queue: OutQueue, device: torch.device) -> None:
+    """Main loop of a worker thread."""
+    with use_device(device):
+        while True:
+            task = in_queue.get()
+
+            if task is None:
+                break
+
+            try:
+                batch = task.compute()
+            except Exception:
+                exc_info = cast(ExcInfo, sys.exc_info())
+                out_queue.put((False, exc_info))
+                continue
+
+            out_queue.put((True, (task, batch)))
+
+    done = (False, None)
+    out_queue.put(done)
+
+
+def create_workers(devices: List[torch.device],) -> Tuple[List[InQueue], List[OutQueue]]:
+    """Spawns worker threads. A worker thread is bound to a device."""
+    in_queues: List[InQueue] = []
+    out_queues: List[OutQueue] = []
+
+    # Spawn workers.
+    workers: Dict[torch.device, Tuple[InQueue, OutQueue]] = {}
+
+    def normalize_device(device: torch.device) -> torch.device:
+        if device.type == "cuda" and device.index is None:
+            return torch.device("cuda", index=torch.cuda.current_device())
+
+        if device.type == "cpu" and device.index is not None:
+            return torch.device("cpu")
+
+        return device
+
+    for device in devices:
+        device = normalize_device(device)
+
+        try:
+            in_queue, out_queue = workers[device]
+        except KeyError:
+            in_queue = Queue()
+            out_queue = Queue()
+            workers[device] = (in_queue, out_queue)
+
+            t = Thread(target=worker, args=(in_queue, out_queue, device), daemon=True,)
+            t.start()
+
+        in_queues.append(in_queue)
+        out_queues.append(out_queue)
+
+    return (in_queues, out_queues)
+
+@contextmanager
+def spawn_workers(devices: List[torch.device],) -> Generator[Tuple[List[InQueue], List[OutQueue]], None, None]:
+    try:
+        (in_queues, out_queues) = create_workers(devices)
+        yield (in_queues, out_queues)
+    finally:
+        pass
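+
+
+# A minimal usage sketch (not called anywhere): pushes one Task through a CPU
+# worker created by create_workers() and reads the (success, payload) tuple
+# that worker() puts on the out queue. Batch(torch.zeros(1)) assumes the
+# microbatch.Batch constructor accepts a single tensor.
+def _example_run_single_task() -> None:
+    from .stream import CPUStream
+
+    in_queues, out_queues = create_workers([torch.device("cpu")])
+
+    # finalize is optional; compute() builds and returns the micro-batch.
+    task = Task(CPUStream, compute=lambda: Batch(torch.zeros(1)), finalize=None)
+    in_queues[0].put(task)
+
+    ok, payload = out_queues[0].get()
+    if not ok:
+        # payload is the exc_info triple captured in the worker thread.
+        raise payload[1]
+    _task, batch = payload
+
+    in_queues[0].put(None)  # ask the worker thread to exit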
diff --git a/MLPY/Lib/site-packages/torch/distributed/remote_device.py b/MLPY/Lib/site-packages/torch/distributed/remote_device.py
new file mode 100644
index 0000000000000000000000000000000000000000..45bde5aefb28f09db425233500179b97edf537cc
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/remote_device.py
@@ -0,0 +1,128 @@
+from typing import Optional, Union
+
+import torch
+
+
+class _remote_device:
+    """
+    Represents a device on a remote worker.
+
+    Args:
+        remote_device (str or torch.device): Represents a device on a remote worker.
+            The string format should be one of the following:
+
+                1. "<workername>/<device>", where the device field can be parsed as torch.device type.
+                   E.g., "trainer0/cpu", "trainer0", "ps0/cuda:0".
+                   In addition, the device field is optional and defaults to "cpu".
+                2. "rank:<rank>/<device>", where <rank> is the rank of the
+                   process and device can be parsed as torch.device type.
+                   E.g., "rank:0/cpu", "rank:0", "rank:0/cuda:0"
+                3. <workername> and <rank> are optional; formats like "cpu"
+                    and "cuda:1" just represent local devices.
+    """
+
+    def __init__(self, remote_device: Union[str, torch.device]):
+        PARSE_ERROR = (
+            f"Could not parse remote_device: {remote_device}. The valid format is "
+            "'<workername>/<device>' or 'rank:<rank>/<device>' or '<device>'"
+        )
+        self._worker_name = None
+        self._rank = None
+        self._device: Optional[Union[str, int, torch.device]] = None
+
+        if isinstance(remote_device, torch.device):
+            self._device = remote_device
+        elif isinstance(remote_device, str):
+            fields = remote_device.split("/")
+            if len(fields) == 2:
+                self._worker_name, self._device = fields
+            elif len(fields) == 1:
+                # Check if this is a valid device.
+                if _remote_device._is_valid_local_device(fields[0]):
+                    self._device = fields[0]
+                else:
+                    self._worker_name = fields[0]
+                    self._device = "cpu"
+            else:
+                raise ValueError(PARSE_ERROR)
+        else:
+            raise TypeError(f'Invalid type for remote_device: {type(remote_device)}')
+
+        # Do some basic sanity check (no empty string)
+        if self._worker_name is not None and not self._worker_name:
+            raise ValueError(PARSE_ERROR)
+
+        # Validate the device.
+        self._device = torch.device(self._device)
+
+        # Check for rank based format.
+        if self._worker_name is not None:
+            fields = self._worker_name.split(":")
+            if len(fields) == 2:
+                # rank:/device format, extract rank
+                if fields[0] == "rank" and fields[1].isdigit():
+                    self._rank = int(fields[1])  # type: ignore[assignment]
+                    self._worker_name = None
+                else:
+                    raise ValueError(PARSE_ERROR)
+            elif len(fields) > 2:
+                raise ValueError(PARSE_ERROR)
+
+    @staticmethod
+    def _is_valid_local_device(device):
+        # Check for torch.device
+        try:
+            torch.device(device)
+            return True
+        except Exception:
+            return False
+
+    def worker_name(self) -> Optional[str]:
+        """Return the name of remote worker representing the remote device and ``None`` if no worker name is available."""
+        return self._worker_name
+
+    def rank(self) -> Optional[int]:
+        """
+        Returns the rank of remote worker representing the remote device.
+        Returns ``None`` if no rank is available.
+        """
+        return self._rank
+
+    def device(self) -> torch.device:
+        """Return the local device on the remote worker."""
+        return self._device  # type: ignore[return-value]
+
+    def __repr__(self):
+        if self._device is not None:
+            if self._worker_name is not None:
+                return f'{self._worker_name}/{self._device}'
+            elif self._rank is not None:
+                return f'rank:{self._rank}/{self._device}'
+            else:
+                return str(self._device)
+        else:
+            if self._worker_name is not None:
+                return f'{self._worker_name}'
+            elif self._rank is not None:
+                return f'{self._rank}'
+            else:
+                raise RuntimeError('Invalid state!')
+
+    def __eq__(self, other):
+        if not isinstance(other, _remote_device):
+            return False
+
+        if (
+            self._worker_name == other._worker_name
+            and self._device == other._device
+            and self._rank == other._rank
+        ):
+            return True
+
+        return False
+
+
+    def __hash__(self):
+        return hash(self._worker_name) ^ \
+            hash(self._device) ^ \
+            hash(self._rank)
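+
+
+# A minimal parsing sketch (not called anywhere): exercises the three accepted
+# spellings from the class docstring above.
+def _example_remote_device_parsing() -> None:
+    d1 = _remote_device("trainer0/cuda:0")
+    assert d1.worker_name() == "trainer0"
+    assert d1.device() == torch.device("cuda:0")
+
+    d2 = _remote_device("rank:0/cpu")
+    assert d2.rank() == 0
+    assert d2.device() == torch.device("cpu")
+
+    d3 = _remote_device("cpu")  # a bare local device: no worker name, no rank
+    assert d3.worker_name() is None
+    assert d3.device() == torch.device("cpu")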
diff --git a/MLPY/Lib/site-packages/torch/distributed/rendezvous.py b/MLPY/Lib/site-packages/torch/distributed/rendezvous.py
new file mode 100644
index 0000000000000000000000000000000000000000..20b86c0e1896f50b4ff478b2df22042f51f4a429
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/rendezvous.py
@@ -0,0 +1,256 @@
+try:
+    from urllib.parse import urlparse, urlunparse
+except ImportError as e:
+    raise ImportError(
+        "urllib cannot be found, urlparse from python2 is no longer supported."
+    ) from e
+
+import numbers
+import os
+import sys
+from datetime import timedelta
+from typing import Dict, Optional, Callable, Iterator, Tuple
+
+from torch.distributed import FileStore, PrefixStore, Store, TCPStore
+
+from .constants import default_pg_timeout
+
+
+_rendezvous_handlers: Dict[str, Callable[..., Iterator[Tuple[Store, int, int]]]] = {}
+
+
+def register_rendezvous_handler(scheme, handler):
+    """
+    Register a new rendezvous handler.
+
+    Before we can run collective algorithms, participating processes
+    need to find each other and exchange information to be able to
+    communicate. We call this process rendezvous.
+
+    The outcome of the rendezvous process is a triplet containing a
+    shared key/value store, the rank of the process, and the total
+    number of participating processes.
+
+    If none of the bundled rendezvous methods apply to your execution
+    environment you can opt to register your own rendezvous handler.
+    Pick a unique name and use the URL scheme to identify it when
+    calling the `rendezvous()` function.
+
+    Args:
+        scheme (str): URL scheme to identify your rendezvous handler.
+        handler (function): Handler that is invoked when the
+            `rendezvous()` function is called with a URL that uses
+            the corresponding scheme. It must be a generator function
+            that yields the triplet.
+    """
+    global _rendezvous_handlers
+    if scheme in _rendezvous_handlers:
+        raise RuntimeError(
+            f"Rendezvous handler for {scheme}:// already registered"
+        )
+    _rendezvous_handlers[scheme] = handler
+
+
+# Query will have format "rank=0&world_size=1" and is
+# converted into {"rank": 0, "world_size": 1}
+def _query_to_dict(query: str) -> Dict[str, str]:
+    return {pair[0]: pair[1] for pair in (pair.split("=") for pair in filter(None, query.split("&")))}
+
+
+def _rendezvous_helper(url: str, rank: int, world_size_opt: Optional[int], **kwargs):
+    result = urlparse(url)
+    if world_size_opt is None:
+        world_size = -1
+        if result.scheme == "env":
+            rank = int(os.environ.get("RANK", rank))
+            # If the world_size env variable is not present then it is a dynamic group
+            world_size = int(os.environ.get("WORLD_SIZE", world_size))
+    else:
+        world_size = world_size_opt
+    if rank != -1 or world_size != -1 or world_size_opt is None:
+        query_dict = _query_to_dict(result.query)
+        assert (
+            "rank" not in query_dict and "world_size" not in query_dict
+        ), f"The url: {url} has node-specific arguments(rank, world_size) already."
+        if rank != -1:
+            query_dict["rank"] = str(rank)
+        if world_size != -1 or world_size_opt is None:
+            query_dict["world_size"] = str(world_size)
+        result = result._replace(
+            query=f"{'&'.join([f'{k}={v}' for k, v in query_dict.items()])}"
+        )
+        url = urlunparse(result)
+
+    if result.scheme not in _rendezvous_handlers:
+        raise RuntimeError(f"No rendezvous handler for {result.scheme}://")
+    return _rendezvous_handlers[result.scheme](url, **kwargs)
+
+
+def rendezvous(url: str, rank: int = -1, world_size: int = -1, **kwargs):
+    if not isinstance(url, (str, bytes)):
+        raise RuntimeError(f"`url` must be a string. {type(url)}: {url}")
+
+    if not isinstance(rank, numbers.Integral):
+        raise RuntimeError(f"`rank` must be an integer. {rank}")
+
+    if not isinstance(world_size, numbers.Integral):
+        raise RuntimeError(f"`world_size` must be an integer. {world_size}")
+
+    return _rendezvous_helper(url, rank, world_size, **kwargs)
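+
+
+# A minimal usage sketch (not called anywhere): a single-process file://
+# rendezvous. The temporary path is a placeholder; on Windows the URL shape
+# differs slightly, as handled in _file_rendezvous_handler below.
+def _example_file_rendezvous() -> None:
+    import tempfile
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        url = f"file://{os.path.join(tmpdir, 'rendezvous_file')}"
+        store, rank, world_size = next(rendezvous(url, rank=0, world_size=1))
+        assert (rank, world_size) == (0, 1)
+        store.set("ready", "1")           # any c10d Store API call works here
+        assert store.get("ready") == b"1"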
+
+
+def _create_store_from_options(backend_options, rank):
+    store, _, _ = next(_rendezvous_helper(backend_options.init_method, rank, None))
+    return store
+
+
+def _rendezvous_error(msg):
+    return ValueError("Error initializing torch.distributed using " + msg)
+
+
+def _file_rendezvous_handler(url: str, **kwargs):
+    def _error(msg):
+        return _rendezvous_error("file:// rendezvous: " + msg)
+
+    result = urlparse(url)
+    path = result.path
+    if sys.platform == "win32":
+        import urllib.request
+
+        full_path = result.netloc + result.path
+        path = urllib.request.url2pathname(full_path)
+        if path:
+            # Normalizing an empty string produces ".", which is not expected.
+            path = os.path.normpath(path)
+
+    if not path:
+        raise _error("path missing")
+    query_dict = _query_to_dict(result.query)
+    if "rank" not in query_dict:
+        raise _error("rank parameter missing")
+    if "world_size" not in query_dict:
+        raise _error("world size parameter missing")
+
+    rank = int(query_dict["rank"])
+    world_size = int(query_dict["world_size"])
+    store = FileStore(path, world_size)
+    yield (store, rank, world_size)
+
+    # If this configuration is invalidated, there is nothing we can do about it
+    raise RuntimeError("Unable to perform rerendezvous using file:// method")
+
+
+def _torchelastic_use_agent_store() -> bool:
+    return os.environ.get("TORCHELASTIC_USE_AGENT_STORE", None) == str(True)
+
+
+def _create_c10d_store(hostname, port, rank, world_size, timeout, use_libuv=False) -> Store:
+    """
+    Smartly creates a c10d Store object on ``rank`` based on whether we need to re-use the agent store.
+
+    The TCPStore server is assumed to be hosted
+    on ``hostname:port``.
+
+    If ``torchelastic_use_agent_store()`` is ``True``, then it is assumed that
+    the agent leader (node rank 0) hosts the TCPStore server (for which the
+    endpoint is specified by the given ``hostname:port``). Hence
+    ALL ranks will create and return a TCPStore client (e.g. ``start_daemon=False``).
+
+    If ``torchelastic_use_agent_store()`` is ``False``, then rank 0 will host
+    the TCPStore (with multi-tenancy) and it is assumed that rank 0's hostname
+    and port are correctly passed via ``hostname`` and ``port``. All
+    non-zero ranks will create and return a TCPStore client.
+    """
+    # check if port is uint16_t
+    if not 0 <= port < 2**16:
+        raise ValueError(f"port must have value from 0 to 65535 but was {port}.")
+
+    if _torchelastic_use_agent_store():
+        attempt = os.environ["TORCHELASTIC_RESTART_COUNT"]
+        tcp_store = TCPStore(hostname, port, world_size, False, timeout)
+        return PrefixStore(f"/worker/attempt_{attempt}", tcp_store)
+    else:
+        start_daemon = rank == 0
+        return TCPStore(
+            hostname, port, world_size, start_daemon, timeout, multi_tenant=True, use_libuv=use_libuv
+        )
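+
+
+# A minimal usage sketch (not called anywhere): outside of torchelastic, rank 0
+# hosts a multi-tenant TCPStore and every other rank connects as a client. The
+# host and port are placeholders for the real rank-0 endpoint.
+def _example_create_store(rank: int, world_size: int) -> Store:
+    return _create_c10d_store(
+        "localhost", 29500, rank, world_size, timedelta(seconds=30)
+    )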
+
+
+def _tcp_rendezvous_handler(
+    url: str, timeout: timedelta = default_pg_timeout, **kwargs
+):
+    def _error(msg):
+        return _rendezvous_error("tcp:// rendezvous: " + msg)
+
+    result = urlparse(url)
+    if not result.port:
+        raise _error("port number missing")
+    query_dict = _query_to_dict(result.query)
+    if "rank" not in query_dict:
+        raise _error("rank parameter missing")
+    if "world_size" not in query_dict:
+        raise _error("world size parameter missing")
+
+    rank = int(query_dict["rank"])
+    world_size = int(query_dict["world_size"])
+    use_libuv = query_dict.get("use_libuv", "0") == "1"
+    assert result.hostname is not None
+
+    store = _create_c10d_store(result.hostname, result.port, rank, world_size, timeout, use_libuv)
+
+    yield (store, rank, world_size)
+
+    # If this configuration is invalidated, there is nothing we can do about it
+    raise RuntimeError("Unable to perform re-rendezvous using tcp:// method")
+
+
+def _env_rendezvous_handler(
+    url: str, timeout: timedelta = default_pg_timeout, **kwargs
+):
+    def _error(msg):
+        return _rendezvous_error("env:// rendezvous: " + msg)
+
+    def _env_error(var):
+        return _error(f"environment variable {var} expected, but not set")
+
+    def _get_env_or_raise(env_var: str) -> str:
+        env_val = os.environ.get(env_var, None)
+        if not env_val:
+            raise _env_error(env_var)
+        else:
+            return env_val
+
+    result = urlparse(url)
+    query_dict = _query_to_dict(result.query)
+
+    rank: int
+    world_size: int
+    master_port: int
+    master_addr: str
+
+    if "rank" in query_dict:
+        rank = int(query_dict["rank"])
+    else:
+        rank = int(_get_env_or_raise("RANK"))
+
+    if "world_size" in query_dict:
+        world_size = int(query_dict["world_size"])
+    else:
+        world_size = int(_get_env_or_raise("WORLD_SIZE"))
+
+
+    master_addr = _get_env_or_raise("MASTER_ADDR")
+    master_port = int(_get_env_or_raise("MASTER_PORT"))
+    use_libuv = query_dict.get("use_libuv", os.environ.get("USE_LIBUV", "0")) == "1"
+
+    store = _create_c10d_store(master_addr, master_port, rank, world_size, timeout, use_libuv)
+
+    yield (store, rank, world_size)
+
+    # If this configuration is invalidated, there is nothing we can do about it
+    raise RuntimeError("Unable to perform re-rendezvous using env:// method")
+
+
+register_rendezvous_handler("tcp", _tcp_rendezvous_handler)
+register_rendezvous_handler("env", _env_rendezvous_handler)
+register_rendezvous_handler("file", _file_rendezvous_handler)
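+
+
+# A minimal extension sketch (left unregistered): a custom handler for a
+# hypothetical "demo" scheme, following the contract described in
+# register_rendezvous_handler() above. It reuses FileStore and expects rank
+# and world_size in the query string, mirroring the file:// handler.
+def _demo_rendezvous_handler(url: str, **kwargs):
+    result = urlparse(url)
+    query_dict = _query_to_dict(result.query)
+    rank = int(query_dict["rank"])
+    world_size = int(query_dict["world_size"])
+    store = FileStore(result.path, world_size)
+    yield (store, rank, world_size)
+    raise RuntimeError("Unable to perform re-rendezvous using demo:// method")
+
+
+# Opt in explicitly if desired:
+# register_rendezvous_handler("demo", _demo_rendezvous_handler)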
diff --git a/MLPY/Lib/site-packages/torch/distributed/rpc/__init__.py b/MLPY/Lib/site-packages/torch/distributed/rpc/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7af4dfbcb348ce865fead98a154d543ad2c3e3e9
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/rpc/__init__.py
@@ -0,0 +1,249 @@
+from datetime import timedelta
+import logging
+import os
+import threading
+import warnings
+from typing import Generator, Tuple
+from urllib.parse import urlparse
+
+import torch
+import torch.distributed as dist
+
+logger = logging.getLogger(__name__)
+
+
+_init_counter = 0
+_init_counter_lock = threading.Lock()
+
+__all__ = ["is_available"]
+
+def is_available() -> bool:
+    return hasattr(torch._C, "_rpc_init")
+
+
+if is_available() and not torch._C._rpc_init():
+    raise RuntimeError("Failed to initialize torch.distributed.rpc")
+
+
+if is_available():
+    from torch._C._distributed_c10d import Store
+    from torch._C._distributed_rpc import (
+        _disable_jit_rref_pickle,
+        _enable_jit_rref_pickle,
+        _disable_server_process_global_profiler,
+        _enable_server_process_global_profiler,
+        _set_and_start_rpc_agent,
+        _reset_current_rpc_agent,
+        _delete_all_user_and_unforked_owner_rrefs,
+        _destroy_rref_context,
+        _set_profiler_node_id,
+        _is_current_rpc_agent_set,
+        _rref_context_get_debug_info,
+        _cleanup_python_rpc_handler,
+        _invoke_rpc_builtin,
+        _invoke_rpc_python_udf,
+        _invoke_rpc_torchscript,
+        _invoke_remote_builtin,
+        _invoke_remote_python_udf,
+        _invoke_remote_torchscript,
+        _set_rpc_timeout,
+        _get_current_rpc_agent,
+        get_rpc_timeout,
+        enable_gil_profiling,
+        RpcBackendOptions,
+        _TensorPipeRpcBackendOptionsBase,
+        RpcAgent,
+        PyRRef,
+        TensorPipeAgent,
+        RemoteProfilerManager,
+        WorkerInfo,
+        _DEFAULT_INIT_METHOD,
+        _DEFAULT_NUM_WORKER_THREADS,
+        _UNSET_RPC_TIMEOUT,
+        _DEFAULT_RPC_TIMEOUT_SEC,
+    )  # noqa: F401
+
+    from . import api, backend_registry, functions
+    from .api import *  # noqa: F401,F403
+    import numbers
+
+    import torch.distributed.autograd as dist_autograd
+
+    from .backend_registry import BackendType
+    from .options import TensorPipeRpcBackendOptions  # noqa: F401
+    from .server_process_global_profiler import (
+        _server_process_global_profile,
+    )
+
+    rendezvous_iterator: Generator[Tuple[Store, int, int], None, None]
+
+    __all__ += ["init_rpc", "BackendType", "TensorPipeRpcBackendOptions"]
+    __all__ = __all__ + api.__all__ + backend_registry.__all__  # noqa: PLE0605
+
+    def init_rpc(
+        name,
+        backend=None,
+        rank=-1,
+        world_size=None,
+        rpc_backend_options=None,
+    ):
+        r"""
+        Initializes RPC primitives such as the local RPC agent
+        and distributed autograd, which immediately makes the current
+        process ready to send and receive RPCs.
+
+        Args:
+            name (str): a globally unique name of this node. (e.g.,
+                ``Trainer3``, ``ParameterServer2``, ``Master``, ``Worker1``)
+                Name can only contain numbers, letters, underscores, colons,
+                and/or dashes, and must be shorter than 128 characters.
+            backend (BackendType, optional): The type of RPC backend
+                implementation. The only supported value is
+                ``BackendType.TENSORPIPE`` (the default).
+                See :ref:`rpc-backends` for more information.
+            rank (int): a globally unique id/rank of this node.
+            world_size (int): The number of workers in the group.
+            rpc_backend_options (RpcBackendOptions, optional): The options
+                passed to the RpcAgent constructor. It must be an agent-specific
+                subclass of :class:`~torch.distributed.rpc.RpcBackendOptions`
+                and contain agent-specific initialization configurations. By
+                default, for all agents, it sets the default timeout to 60
+                seconds and performs the rendezvous with an underlying process
+                group initialized using ``init_method = "env://"``,
+                meaning that environment variables ``MASTER_ADDR`` and
+                ``MASTER_PORT`` need to be set properly. See
+                :ref:`rpc-backends` for more information and find which options
+                are available.
+        """
+        torch._C._log_api_usage_once("torch.distributed.init_rpc")
+        if backend is not None and not isinstance(
+            backend, backend_registry.BackendType
+        ):
+            raise TypeError("Argument backend must be a member of BackendType")
+
+        if rpc_backend_options is not None and not isinstance(
+            rpc_backend_options, RpcBackendOptions
+        ):
+            raise TypeError(
+                "Argument rpc_backend_options must be an instance of RpcBackendOptions"
+            )
+
+        # Try to detect the backend from the options
+        if backend is None and rpc_backend_options is not None:
+            for candidate_backend in BackendType:
+                if isinstance(
+                    rpc_backend_options,
+                    type(
+                        backend_registry.construct_rpc_backend_options(
+                            candidate_backend
+                        )
+                    ),
+                ):
+                    backend = candidate_backend
+                    break
+            else:
+                raise TypeError(
+                    f"Could not infer backend for options {rpc_backend_options}"
+                )
+            # Ignore type error because mypy doesn't handle dynamically generated type objects (#4865)
+            if backend != BackendType.TENSORPIPE:  # type: ignore[attr-defined]
+                logger.warning(
+                    "RPC was initialized with no explicit backend but with options "  # type: ignore[attr-defined]
+                    "corresponding to %(backend)s, hence that backend will be used "
+                    "instead of the default BackendType.TENSORPIPE. To silence this "
+                    "warning pass `backend=%(backend)s` explicitly.",
+                    {'backend': backend}
+                )
+
+        if backend is None:
+            backend = BackendType.TENSORPIPE  # type: ignore[attr-defined]
+
+        if rpc_backend_options is None:
+            # default construct a set of RPC backend options.
+            rpc_backend_options = backend_registry.construct_rpc_backend_options(
+                backend
+            )
+
+        # Create store, performs rendezvous for static RPC group.
+        if not world_size:
+            # If world_size is not set at construction time and also not set in the
+            # environment variables, the store will be created for the dynamic group setting.
+            store = dist._create_store_from_options(rpc_backend_options, rank)
+        else:
+            # This rendezvous state sometimes is destroyed before all processes
+            # finish handshaking. To avoid that issue, we make it global to
+            # keep it alive.
+            global rendezvous_iterator
+            rendezvous_iterator = dist.rendezvous(
+                rpc_backend_options.init_method, rank=rank, world_size=world_size
+            )
+            store, _, _ = next(rendezvous_iterator)
+        # Use same timeout as RPC.
+        store.set_timeout(timedelta(seconds=rpc_backend_options.rpc_timeout))
+
+        # Use a PrefixStore to distinguish multiple invocations.
+        with _init_counter_lock:
+            global _init_counter
+            store = dist.PrefixStore(str(f"rpc_prefix_{_init_counter}"), store)
+            _init_counter += 1
+
+        # Initialize autograd before RPC since _init_rpc_backend guarantees all
+        # processes sync via the store. If we initialize autograd after RPC,
+        # there could be a race where some nodes might have initialized autograd
+        # and others might not have. As a result, a node calling
+        # torch.distributed.autograd.backward() would run into errors since
+        # other nodes might not have been initialized.
+        dist_autograd._init(rank)
+
+        _set_profiler_node_id(rank)
+        # Initialize RPC.
+        _init_rpc_backend(backend, store, name, rank, world_size, rpc_backend_options)
+
+    def _validate_rpc_args(backend, store, name, rank, world_size, rpc_backend_options):
+        type_mapping = {
+            backend: backend_registry.BackendType,
+            store: dist.Store,
+            name: str,
+            rank: numbers.Integral,
+            # world_size can be None for a dynamic group
+            world_size: (numbers.Integral, type(None)),
+            rpc_backend_options: RpcBackendOptions,
+        }
+        for arg, arg_type in type_mapping.items():
+            if not isinstance(arg, arg_type):  # type: ignore[arg-type]
+                raise RuntimeError(
+                    f"Argument {arg} must be of type {arg_type} but got type {type(arg)}"
+                )
+
+    def _init_rpc_backend(
+        backend=BackendType.TENSORPIPE,  # type: ignore[attr-defined]
+        store=None,
+        name=None,
+        rank=-1,
+        world_size=None,
+        rpc_backend_options=None,
+    ):
+
+        _validate_rpc_args(backend, store, name, rank, world_size, rpc_backend_options)
+
+        if _is_current_rpc_agent_set():
+            raise RuntimeError("RPC is already initialized")
+
+        # Initialize RPC.
+        rpc_agent = backend_registry.init_backend(
+            backend,
+            store=store,
+            name=name,
+            rank=rank,
+            world_size=world_size,
+            rpc_backend_options=rpc_backend_options,
+        )
+
+        api._init_rpc_states(rpc_agent)
+
+    @api._require_initialized
+    def _get_debug_info():
+        info = _rref_context_get_debug_info()
+        info.update(api._get_current_rpc_agent().get_debug_info())
+        info.update(dist_autograd._get_debug_info())
+        return info
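+
+    # A minimal usage sketch (not called anywhere): a single-worker RPC group.
+    # The MASTER_ADDR/MASTER_PORT values are placeholders; a real deployment
+    # sets them per cluster and uses world_size > 1.
+    def _example_single_worker_rpc():
+        os.environ.setdefault("MASTER_ADDR", "localhost")
+        os.environ.setdefault("MASTER_PORT", "29500")
+        init_rpc("worker0", rank=0, world_size=1)
+        # With a single worker, an RPC to ourselves exercises the full path.
+        result = rpc_sync("worker0", torch.add, args=(torch.ones(2), 1))
+        assert bool((result == 2).all())
+        shutdown()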
diff --git a/MLPY/Lib/site-packages/torch/distributed/rpc/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/rpc/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e9b737902c3b858523709a5890947d89e1c9307a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/rpc/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/rpc/__pycache__/_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/rpc/__pycache__/_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c3de56f5bd7849de63dea094b509526d0c58d786
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/rpc/__pycache__/_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/rpc/__pycache__/api.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/rpc/__pycache__/api.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dabaaaa871a7ebb77b5856bba0a54353b03d63d2
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/rpc/__pycache__/api.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/rpc/__pycache__/backend_registry.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/rpc/__pycache__/backend_registry.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..024bf88af18e2083d7956f9425ff1d36339b95fa
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/rpc/__pycache__/backend_registry.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/rpc/__pycache__/constants.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/rpc/__pycache__/constants.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2e4749b6d3390c66736685ca17c71dd4303889cf
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/rpc/__pycache__/constants.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/rpc/__pycache__/functions.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/rpc/__pycache__/functions.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ada019d95c2758157aae69994e443bdad993018f
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/rpc/__pycache__/functions.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/rpc/__pycache__/internal.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/rpc/__pycache__/internal.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..31be07c9137041ea6d49004c7419839309962c1f
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/rpc/__pycache__/internal.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/rpc/__pycache__/options.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/rpc/__pycache__/options.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8fefff5c856f1b4cc82ca26a8dab977b20257558
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/rpc/__pycache__/options.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/rpc/__pycache__/rref_proxy.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/rpc/__pycache__/rref_proxy.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..49b52870da33581e59fec3915be2c8c58fa9706d
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/rpc/__pycache__/rref_proxy.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/rpc/__pycache__/server_process_global_profiler.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/rpc/__pycache__/server_process_global_profiler.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..80e125354f502fa35970dbb34b380548e2d6af9a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/rpc/__pycache__/server_process_global_profiler.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/rpc/_testing/__init__.py b/MLPY/Lib/site-packages/torch/distributed/rpc/_testing/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..311c67fae2eba7a453a8713e2195cdcd9c2a83cf
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/rpc/_testing/__init__.py
@@ -0,0 +1,18 @@
+
+import torch
+
+
+def is_available():
+    return hasattr(torch._C, "_faulty_agent_init")
+
+
+if is_available() and not torch._C._faulty_agent_init():
+    raise RuntimeError("Failed to initialize torch.distributed.rpc._testing")
+
+if is_available():
+    # Registers FAULTY_TENSORPIPE RPC backend.
+    from . import faulty_agent_backend_registry
+    from torch._C._distributed_rpc_testing import (
+        FaultyTensorPipeRpcBackendOptions,
+        FaultyTensorPipeAgent,
+    )
diff --git a/MLPY/Lib/site-packages/torch/distributed/rpc/_testing/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/rpc/_testing/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..575dcb52396833511947ccf7f3c7234889701c42
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/rpc/_testing/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/rpc/_testing/__pycache__/faulty_agent_backend_registry.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/rpc/_testing/__pycache__/faulty_agent_backend_registry.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..db87aaf03444ad81290ce1c13892b22648aed4d7
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/rpc/_testing/__pycache__/faulty_agent_backend_registry.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/rpc/_testing/faulty_agent_backend_registry.py b/MLPY/Lib/site-packages/torch/distributed/rpc/_testing/faulty_agent_backend_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad053948e0d55151e45cb1fbad687641d4930e28
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/rpc/_testing/faulty_agent_backend_registry.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+
+import torch.distributed as dist
+import torch.distributed.rpc as rpc
+
+def _faulty_tensorpipe_construct_rpc_backend_options_handler(
+    rpc_timeout,
+    init_method,
+    num_worker_threads,
+    messages_to_fail,
+    messages_to_delay,
+    num_fail_sends,
+    **kwargs
+):
+    from . import FaultyTensorPipeRpcBackendOptions
+
+    return FaultyTensorPipeRpcBackendOptions(
+        num_worker_threads=num_worker_threads,
+        rpc_timeout=rpc_timeout,
+        init_method=init_method,
+        messages_to_fail=messages_to_fail,
+        messages_to_delay=messages_to_delay,
+        num_fail_sends=num_fail_sends,
+    )
+
+
+def _faulty_tensorpipe_init_backend_handler(
+    store, name, rank, world_size, rpc_backend_options
+):
+    from . import FaultyTensorPipeAgent
+    from . import FaultyTensorPipeRpcBackendOptions
+    from torch.distributed.rpc import api
+
+    if not isinstance(store, dist.Store):
+        raise TypeError(f"`store` must be a c10d::Store. {store}")
+
+    if not isinstance(
+        rpc_backend_options, FaultyTensorPipeRpcBackendOptions
+    ):
+        raise TypeError(
+            f"`rpc_backend_options` must be a `FaultyTensorPipeRpcBackendOptions`. {rpc_backend_options}"
+        )
+
+    agent = FaultyTensorPipeAgent(
+        store,
+        name,
+        rank,
+        world_size,
+        rpc_backend_options,
+        {},  # reverse_device_map
+        [],  # devices
+    )
+    api._init_rpc_states(agent)
+
+    return agent
+
+
+rpc.backend_registry.register_backend(
+    "FAULTY_TENSORPIPE",
+    _faulty_tensorpipe_construct_rpc_backend_options_handler,
+    _faulty_tensorpipe_init_backend_handler,
+)
diff --git a/MLPY/Lib/site-packages/torch/distributed/rpc/_utils.py b/MLPY/Lib/site-packages/torch/distributed/rpc/_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..af5299e19240a15da940406d0a3918aaa9d59cce
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/rpc/_utils.py
@@ -0,0 +1,37 @@
+from contextlib import contextmanager
+from typing import cast
+import logging
+from . import api
+from . import TensorPipeAgent
+
+logger = logging.getLogger(__name__)
+
+@contextmanager
+def _group_membership_management(store, name, is_join):
+    token_key = "RpcGroupManagementToken"
+    join_or_leave = "join" if is_join else "leave"
+    my_token = f"Token_for_{name}_{join_or_leave}"
+    while True:
+        # Retrieve token from store to signal start of rank join/leave critical section
+        returned = store.compare_set(token_key, "", my_token).decode()
+        if returned == my_token:
+            # Yield to the function this context manager wraps
+            yield
+            # Finished, now exit and release token
+            # Update from store to signal end of rank join/leave critical section
+            store.set(token_key, "")
+            # Other will wait for this token to be set before they execute
+            store.set(my_token, "Done")
+            break
+        else:
+            # Store will wait for the token to be released
+            try:
+                store.wait([returned])
+            except RuntimeError:
+                logger.error("Group membership token %s timed out waiting for %s to be released.", my_token, returned)
+                raise
+
+def _update_group_membership(worker_info, my_devices, reverse_device_map, is_join):
+    agent = cast(TensorPipeAgent, api._get_current_rpc_agent())
+    ret = agent._update_group_membership(worker_info, my_devices, reverse_device_map, is_join)
+    return ret
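+
+# A minimal usage sketch (not called anywhere): the token protocol above turned
+# into a critical section around a join. The store endpoint and port are
+# placeholders; in practice the agent's own store is passed in.
+def _example_group_membership_token():
+    from datetime import timedelta
+    import torch.distributed as dist
+
+    # Rank 0 hosts the store in this single-process sketch.
+    store = dist.TCPStore("localhost", 29501, 1, True, timedelta(seconds=30))
+    with _group_membership_management(store, "worker0", is_join=True):
+        # Rank join/leave bookkeeping runs here, one rank at a time.
+        pass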
diff --git a/MLPY/Lib/site-packages/torch/distributed/rpc/api.py b/MLPY/Lib/site-packages/torch/distributed/rpc/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..53c2006a93bd7869f86bbc3cb4bb0169d9c9a1d7
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/rpc/api.py
@@ -0,0 +1,947 @@
+__all__ = ["shutdown", "get_worker_info", "remote", "rpc_sync",
+           "rpc_async", "RRef", "AllGatherStates", "method_factory", "new_method"]
+
+import collections
+import contextlib
+import functools
+import inspect
+import logging
+import threading
+from typing import Dict, Generic, TypeVar, Set, Any, TYPE_CHECKING
+
+import torch
+from torch.futures import Future
+
+from torch._C._distributed_rpc import (
+    PyRRef,
+    RemoteProfilerManager,
+    WorkerInfo,
+    TensorPipeAgent,
+    get_rpc_timeout,
+    _cleanup_python_rpc_handler,
+    _delete_all_user_and_unforked_owner_rrefs,
+    _destroy_rref_context,
+    _get_current_rpc_agent,
+    _invoke_remote_builtin,
+    _invoke_remote_python_udf,
+    _invoke_remote_torchscript,
+    _invoke_rpc_builtin,
+    _invoke_rpc_python_udf,
+    _invoke_rpc_torchscript,
+    _is_current_rpc_agent_set,
+    _reset_current_rpc_agent,
+    _set_and_start_rpc_agent,
+)
+
+from .internal import (
+    PythonUDF,
+    RPCExecMode,
+    _internal_rpc_pickler,
+    _build_rpc_profiling_key,
+)
+
+from .constants import DEFAULT_SHUTDOWN_TIMEOUT, UNSET_RPC_TIMEOUT
+
+from ._utils import _group_membership_management, _update_group_membership
+
+logger = logging.getLogger(__name__)
+
+# NB: Ignoring RRef leaks during shutdown. Without this, applications have to
+# make sure there are no references to any RRef in the application code and
+# that the Python GC has done its job to delete those RRefs. This could result
+# in a bad debugging experience, especially for large applications. Therefore,
+# by default, we are going to ignore RRef leaks during shutdown. This is usually
+# fine as shutdown means applications have done training and no longer care
+# about states.
+#
+# To enable RRef leak checking, set this _ignore_rref_leak to False
+_ignore_rref_leak = True
+_default_pickler = _internal_rpc_pickler
+
+@contextlib.contextmanager
+def _use_rpc_pickler(rpc_pickler):
+    r"""
+    rpc_pickler: (.internal._InternalRPCPickler) Overrides the default RPC pickler
+    """
+    global _default_pickler
+    _default_pickler = rpc_pickler
+    try:
+        yield
+    finally:
+        _default_pickler = _internal_rpc_pickler
+
+
+def _require_initialized(func):
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        if not _is_current_rpc_agent_set():
+            raise RuntimeError(
+                "RPC has not been initialized. Call "
+                "torch.distributed.rpc.init_rpc first."
+            )
+        return func(*args, **kwargs)
+
+    return wrapper
+
+
+class AllGatherStates:
+    def __init__(self):
+        # Each `gathered_objects` is an empty dict at beginning.
+        # The leader worker is elected as the first worker in a sorted worker
+        # name list. Whenever there is a worker entering `_all_gather()`, it
+        # runs `_gather_to_leader()` on the leader to add its own name and
+        # data obj to this dict. The leader also adds its own name to the dict
+        # on calling `_all_gather()`.
+        # Once `set(gathered_objects.keys()) == _ALL_WORKER_NAMES`, the leader
+        # will broadcast the gathered dict to all follower workers and set their
+        # `gathered_objects` field and the `proceed_signal` field.
+        self.gathered_objects = {}
+        # All workers wait on this signal until it receives all gathered
+        # objects.
+        self.proceed_signal = threading.Event()
+
+
+# States used by `def _all_gather()`.
+# `_ALL_WORKER_NAMES` is initialized on initializing RPC layer.
+_ALL_WORKER_NAMES: Set[Any] = set()
+_all_gather_dict_lock = threading.RLock()
+_all_gather_sequence_id: Dict[str, int] = {}
+_all_gather_sequence_id_to_states: collections.defaultdict = collections.defaultdict(AllGatherStates)
+
+
+def _init_rpc_states(agent):
+    worker_infos = agent.get_worker_infos()
+    global _ALL_WORKER_NAMES
+    _ALL_WORKER_NAMES = {worker_info.name for worker_info in worker_infos}
+
+    # NB: backend implementation might have already set the rpc_agent.
+    if not _is_current_rpc_agent_set():
+        _set_and_start_rpc_agent(agent)
+
+
+def _gather_to_leader(sequence_id, worker_name, obj, worker_names=None):
+    with _all_gather_dict_lock:
+        if not worker_names:
+            worker_names = _ALL_WORKER_NAMES
+            assert (
+                worker_name in worker_names
+            ), f"{worker_name} is not expected by leader."
+        states = _all_gather_sequence_id_to_states[sequence_id]
+        assert (
+            worker_name not in states.gathered_objects
+        ), f"{worker_name} reported intent sequence id {sequence_id} twice. "
+        states.gathered_objects[worker_name] = obj
+        if worker_names == set(states.gathered_objects.keys()):
+            states.proceed_signal.set()
+
+
+def _broadcast_to_followers(sequence_id, objects_map):
+    with _all_gather_dict_lock:
+        states = _all_gather_sequence_id_to_states[sequence_id]
+
+    assert (
+        not states.proceed_signal.is_set()
+    ), f"Termination signal sequence id {sequence_id} got set twice."
+    states.gathered_objects = objects_map
+    states.proceed_signal.set()
+
+_thread_local_var = threading.local()
+
+
+@contextlib.contextmanager
+def _wait_all():
+    r"""
+    A context manager that collects all futures returned by ``rpc_async`` and
+    waits on them when the context manager exits, relieving the user of the
+    need to call ``wait`` explicitly.
+
+
+    Example::
+        >>> # xdoctest: +SKIP("distributed")
+        >>> # On worker 0:
+        >>> import torch
+        >>> import torch.distributed.rpc as rpc
+        >>> rpc.init_rpc("worker0", rank=0, world_size=2)
+        >>> with rpc._wait_all():
+        >>>    fut_1 = rpc.rpc_async(dst, torch.add, (torch.ones(2, 2), 1))
+        >>>    fut_2 = rpc.rpc_async(dst, torch.add, (torch.ones(2, 2), 1))
+        >>> #fut_1 and fut_2 are waited on
+    """
+    _thread_local_var.future_list = []
+    try:
+        yield
+    finally:
+        try:
+            torch.futures.wait_all(_thread_local_var.future_list)
+        finally:
+            del _thread_local_var.future_list
+
+
+@_require_initialized
+def _all_gather(obj, worker_names=None, timeout: float = UNSET_RPC_TIMEOUT):
+    r"""
+    This is similar to torch.distributed.all_gather(), but uses RPC. It
+    picks the worker with the smallest name (alphabetical order) as the leader.
+    Then all followers send their data ``obj`` to the leader. After the leader
+    has received everything, it broadcasts the results back to all followers.
+    This function blocks until all workers have received the gathered results.
+    """
+    if not worker_names:
+        assert (
+            _ALL_WORKER_NAMES is not None
+        ), "`_ALL_WORKER_NAMES` is not initialized for `def _all_gather`."
+        worker_names = _ALL_WORKER_NAMES
+    leader_name = min(worker_names)
+
+    self_name = _get_current_rpc_agent().get_worker_info().name
+
+    with _all_gather_dict_lock:
+        concat_names = "".join(sorted(worker_names))
+        sequence_num = _all_gather_sequence_id.get(concat_names, 0)
+        _all_gather_sequence_id[concat_names] = sequence_num + 1
+        sequence_id = concat_names + str(sequence_num)
+
+    is_leader = leader_name == self_name
+
+    if timeout == UNSET_RPC_TIMEOUT:
+        # Timeout is specified by agent for RPC calls
+        rpc_timeout = get_rpc_timeout()
+        # No timeout for signal
+        signal_timeout = None
+    elif timeout == DEFAULT_SHUTDOWN_TIMEOUT:
+        # No timeout for RPC
+        rpc_timeout = timeout
+        # No timeout for signal
+        signal_timeout = None
+    else:
+        # Signal and RPC timeout use the same timeout
+        signal_timeout = rpc_timeout = timeout
+
+    # Phase 1: Followers send their objects to the leader
+    if is_leader:
+        _gather_to_leader(sequence_id, self_name, obj, worker_names)
+    else:
+        rpc_sync(
+            leader_name,
+            _gather_to_leader,
+            args=(sequence_id, self_name, obj, worker_names),
+            timeout=rpc_timeout,
+        )
+
+    with _all_gather_dict_lock:
+        states = _all_gather_sequence_id_to_states[sequence_id]
+
+    # Timeout is either set by function parameter or None (which is indefinite)
+    states.proceed_signal.wait(timeout=signal_timeout)
+
+    # Phase 2: Leader broadcast gathered results to all followers
+    # Leader's signal is the first to be unblocked, after receiving all
+    # followers' data objects.
+    if is_leader:
+        worker_name_to_response_future_dict = {}
+        for follower_name in worker_names - {leader_name}:
+            fut = rpc_async(
+                follower_name,
+                _broadcast_to_followers,
+                args=(sequence_id, states.gathered_objects),
+                timeout=rpc_timeout
+            )
+            worker_name_to_response_future_dict[follower_name] = fut
+
+        errors = []
+        for follower_name, fut in worker_name_to_response_future_dict.items():
+            try:
+                fut.wait()
+            except RuntimeError as ex:
+                errors.append((follower_name, ex))
+
+        if errors:
+            raise RuntimeError(
+                f"Followers {[e[0] for e in errors]} timed out in _all_gather "
+                f"after {rpc_timeout:.2f} seconds. The first exception is {errors[0][1]}"
+            )
+
+    # Clean up for the states using the sequence_id
+    with _all_gather_dict_lock:
+        states = _all_gather_sequence_id_to_states.pop(sequence_id)
+    return states.gathered_objects
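+
+
+# A minimal usage sketch (not called anywhere): every initialized worker calls
+# this with its own payload and gets back the same name -> payload dict once
+# the leader has heard from, and broadcast to, everyone.
+def _example_gather_device_counts():
+    return _all_gather(torch.cuda.device_count())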
+
+
+@_require_initialized
+def _barrier(worker_names):
+    r"""
+    Synchronizes local and remote RPC processes.
+
+    This will block until all local and remote RPC processes specified under worker_names
+    reach this method to wait for all outstanding work to complete.
+
+    Args:
+        worker_names (List[str]): The set of workers to synchronize.
+
+    """
+    try:
+        _all_gather(None, set(worker_names))
+    except RuntimeError as ex:
+        logger.error(
+            "Failed to complete barrier, got error %s", ex
+        )
+
+
+@_require_initialized
+def _wait_all_workers(timeout=DEFAULT_SHUTDOWN_TIMEOUT):
+    r"""
+    Block until all local and remote RPC processes reach this method and wait
+    for all outstanding work to complete. Every RPC process must call this
+    method before exit to perform a graceful shutdown. This should be used to
+    terminate the RPC framework, and there is no guarantee that the RPC
+    framework will work after this method returns.
+    """
+    try:
+        _all_gather(None, timeout=timeout)
+    except RuntimeError as ex:
+        logger.error(
+            "Failed to respond to 'Shutdown Proceed' in time, got error %s", ex
+        )
+        raise ex
+
+
+@_require_initialized
+def shutdown(graceful=True, timeout=DEFAULT_SHUTDOWN_TIMEOUT):
+    r"""
+    Perform a shutdown of the RPC agent, and then destroy the RPC agent. This
+    stops the local agent from accepting outstanding requests, and shuts
+    down the RPC framework by terminating all RPC threads. If ``graceful=True``,
+    this will block until all local and remote RPC processes reach this method
+    and wait for all outstanding work to complete. Otherwise, if
+    ``graceful=False``, this is a local shutdown, and it does not wait for other
+    RPC processes to reach this method.
+
+    .. warning::
+        For :class:`~torch.futures.Future` objects returned by
+        :meth:`~torch.distributed.rpc.rpc_async`, ``future.wait()`` should not
+        be called after ``shutdown()``.
+
+    Args:
+        graceful (bool): Whether to do a graceful shutdown or not. If True,
+                         this will 1) wait until there is no pending system
+                         messages for ``UserRRefs`` and delete them; 2) block
+                         until all local and remote RPC processes have reached
+                         this method and wait for all outstanding work to
+                         complete.
+
+    Example::
+        Make sure that ``MASTER_ADDR`` and ``MASTER_PORT`` are set properly
+        on both workers. Refer to :meth:`~torch.distributed.init_process_group`
+        API for more details. For example,
+
+        export MASTER_ADDR=localhost
+        export MASTER_PORT=5678
+
+        Then run the following code in two different processes:
+
+        >>> # xdoctest: +SKIP
+        >>> # On worker 0:
+        >>> import torch
+        >>> import torch.distributed.rpc as rpc
+        >>> rpc.init_rpc("worker0", rank=0, world_size=2)
+        >>> # do some work
+        >>> result = rpc.rpc_sync("worker1", torch.add, args=(torch.ones(1), 1))
+        >>> # ready to shutdown
+        >>> rpc.shutdown()
+
+        >>> # On worker 1:
+        >>> import torch.distributed.rpc as rpc
+        >>> rpc.init_rpc("worker1", rank=1, world_size=2)
+        >>> # wait for worker 0 to finish work, and then shutdown.
+        >>> rpc.shutdown()
+    """
+    if graceful:
+        try:
+            agent = _get_current_rpc_agent()
+            if not isinstance(agent, TensorPipeAgent) or agent.is_static_group:
+                _wait_all_workers(timeout)
+                _delete_all_user_and_unforked_owner_rrefs()
+                agent.join(shutdown=True, timeout=timeout)
+            else:
+                # This is a dynamic group so we need to grab the token for the operation
+                my_worker_info = agent.get_worker_info()
+                my_name = my_worker_info.name
+                with _group_membership_management(agent.store, my_name, False):
+                    all_worker_infos = agent.get_worker_infos()
+                    for worker in all_worker_infos:
+                        if worker.name != my_name:
+                            rpc_sync(worker.name, _update_group_membership, args=(my_worker_info, [], {}, False))
+                    agent.join(shutdown=True, timeout=timeout)
+        finally:
+            # In case of errors, continue to complete the local shutdown.
+            _finalize_shutdown()
+    else:
+        _finalize_shutdown()
+
+
+def _finalize_shutdown():
+    try:
+        # This raises a `TORCH_CHECK()` exception on RRef leak detected.
+        _destroy_rref_context(_ignore_rref_leak)
+    finally:
+        _get_current_rpc_agent().shutdown()
+        # clean up python rpc handler in shutdown(), see comments in
+        # PythonRpcHandler::cleanup(), call it in python API because the
+        # cleanup() function has python dependency, it assumes python
+        # interpreter exists.
+        # No matter if RRef leak exception is raised, this clean-up code
+        # must run to avoid destruction segfault in Python 3.5.
+        #
+        # future.wait() should not be called after shutdown().
+        # pythonRpcHandler is cleaned up in shutdown(), after
+        # shutdown(), python objects returned from rpc python call can not be
+        # resolved.
+        _cleanup_python_rpc_handler()
+        _reset_current_rpc_agent()
+
+
+@_require_initialized
+def get_worker_info(worker_name=None):
+    r"""
+    Get :class:`~torch.distributed.rpc.WorkerInfo` of a given worker name.
+    Use this :class:`~torch.distributed.rpc.WorkerInfo` to avoid passing an
+    expensive string on every invocation.
+
+    Args:
+        worker_name (str): the string name of a worker. If ``None``, return the
+                           id of the current worker. (default ``None``)
+
+    Returns:
+        :class:`~torch.distributed.rpc.WorkerInfo` instance for the given
+        ``worker_name`` or :class:`~torch.distributed.rpc.WorkerInfo` of the
+        current worker if ``worker_name`` is ``None``.
+    """
+    if worker_name is not None:
+        return _get_current_rpc_agent().get_worker_info(worker_name)
+    else:
+        return _get_current_rpc_agent().get_worker_info()
+
+
+def _to_worker_info(to):
+    if isinstance(to, WorkerInfo):
+        return to
+    elif isinstance(to, (str, int)):
+        return get_worker_info(to)
+    else:
+        raise ValueError(f"Cannot get WorkerInfo from name {to}")
+
+
+def _rref_typeof_on_owner(rref, blocking: bool = True):
+    rref_type = type(rref.local_value())
+    if blocking:
+        return rref_type
+    else:
+        # Wrap result into a completed Future. This is so that if blocking=False
+        # is specified, we return a future regardless of whether this call is on
+        # the user or the owner.
+        future = Future[type]()
+        future.set_result(rref_type)
+        return future
+
+
+def _rref_typeof_on_user(rref, timeout: float = UNSET_RPC_TIMEOUT, blocking: bool = True):
+    fut = rpc_async(
+        rref.owner(),
+        _rref_typeof_on_owner,
+        args=(rref,),
+        timeout=timeout
+    )
+    if blocking:
+        return fut.wait()
+    else:
+        return fut
+
+
+T = TypeVar("T")
+GenericWithOneTypeVar = Generic[T]
+
+
+if TYPE_CHECKING:
+    class RRef(PyRRef[T], Generic[T]):
+        pass
+else:
+    try:
+        # Combine the implementation class and the type class.
+        class RRef(PyRRef, Generic[T]):
+            pass
+    except TypeError:
+        # TypeError: metaclass conflict: the metaclass of a derived class
+        # must be a (non-strict) subclass of the metaclasses of all its bases
+        # Mypy doesn't understand __class__ (mypy bug #4177)
+        class RRefMeta(PyRRef.__class__, GenericWithOneTypeVar.__class__):  # type: ignore[name-defined, misc, valid-type]
+            pass
+
+        # Combine the implementation class and the type class.
+        # Types for classes expecting a certain generic parameter (mypy bug #7791)
+        class RRef(PyRRef, GenericWithOneTypeVar, metaclass=RRefMeta):  # type: ignore[misc, no-redef, valid-type]
+            pass
+
+
+# Install docstrings from `PyRRef` to `RRef`.
+#
+# This is needed because pybind11 generates the parameter
+# `self` as type `rpc.PyRRef`, so a `:inherited-members:`
+# under `.. autoclass:: RRef` does not work.
+# We have to do the following to replace `rpc.PyRRef` with `rpc.RRef`.
+#
+def method_factory(method_name, docstring):
+    def method(self, *args, **kwargs):
+        return getattr(super(RRef, self), method_name)(*args, **kwargs)
+
+    if method.__doc__:
+        method.__doc__ = docstring
+    return method
+
+
+for method_name, method in inspect.getmembers(PyRRef):
+    # Ignore magic methods, except "__str__".
+    if method_name.startswith("_") and method_name != "__str__":
+        continue
+
+    # Get the pybind11-generated docstring.
+    # It looks like:
+    """
+    to_here(self: torch.distributed.rpc.PyRRef, timeout: float=-1.0) -> object
+
+        Blocking call that copies the value of the RRef from the owner
+        to the local node and returns it. If the current node is the
+        owner, returns a reference to the local value.
+    """
+    docstring = getattr(method, "__doc__", None)
+    assert docstring is not None, "RRef user-facing methods should all have docstrings."
+
+    # Do surgery on pybind11 generated docstrings.
+    docstring = docstring.replace("torch.distributed.rpc.PyRRef", "torch.distributed.rpc.RRef")
+
+    # Attach user-facing RRef method with modified docstring.
+    new_method = method_factory(method_name, docstring)
+    setattr(RRef, method_name, new_method)
+
+
+@_require_initialized
+def remote(to, func, args=None, kwargs=None, timeout=UNSET_RPC_TIMEOUT):
+    r"""
+    Make a remote call to run ``func`` on worker ``to`` and return an
+    :class:`~torch.distributed.rpc.RRef` to the result value immediately.
+    Worker ``to`` will be the owner of the returned
+    :class:`~torch.distributed.rpc.RRef`, and the worker calling ``remote`` is
+    a user. The owner manages the global reference count of its
+    :class:`~torch.distributed.rpc.RRef`, and the owner
+    :class:`~torch.distributed.rpc.RRef` is only destructed when globally there
+    are no living references to it.
+
+    Args:
+        to (str or WorkerInfo or int): name/rank/``WorkerInfo`` of the destination worker.
+        func (Callable): a callable function, such as Python callables, builtin
+                         operators (e.g. :meth:`~torch.add`) and annotated
+                         TorchScript functions.
+        args (tuple): the argument tuple for the ``func`` invocation.
+        kwargs (dict): a dictionary of keyword arguments for the ``func``
+                       invocation.
+
+        timeout (float, optional): timeout in seconds for this remote call. If the
+                                   creation of this
+                                   :class:`~torch.distributed.rpc.RRef` on worker
+                                   ``to`` is not successfully processed on this
+                                   worker within this timeout, then the next time
+                                   there is an attempt to use the RRef (such as
+                                   ``to_here()``), a timeout will be raised
+                                   indicating this failure. A value of 0 indicates
+                                   an infinite timeout, i.e. a timeout error will
+                                   never be raised. If not provided, the default
+                                   value set during initialization or with
+                                   ``_set_rpc_timeout`` is used.
+
+    Returns:
+        A user :class:`~torch.distributed.rpc.RRef` instance to the result
+        value. Use the blocking API :meth:`torch.distributed.rpc.RRef.to_here`
+        to retrieve the result value locally.
+
+    .. warning ::
+        The ``remote`` API does not copy storages of argument tensors until
+        sending them over the wire, which could be done by a different thread
+        depending on the RPC backend type. The caller should make sure that the
+        contents of those tensors stay intact until the returned RRef is
+        confirmed by the owner, which can be checked using the
+        :meth:`torch.distributed.rpc.RRef.confirmed_by_owner` API.
+
+    .. warning ::
+        Errors such as timeouts for the ``remote`` API are handled on a
+        best-effort basis: when a call initiated by ``remote`` fails, for
+        example with a timeout, the error is handled and set on the resulting
+        RRef asynchronously. If the RRef has not been used by the application
+        before this handling (such as ``to_here`` or a fork call), then later
+        uses of the ``RRef`` will appropriately raise the error. However, the
+        application may use the ``RRef`` before the error has been handled, in
+        which case the error may not be raised because it has not yet been
+        processed.
+
+    Example::
+
+        Make sure that ``MASTER_ADDR`` and ``MASTER_PORT`` are set properly
+        on both workers. Refer to :meth:`~torch.distributed.init_process_group`
+        API for more details. For example,
+
+        export MASTER_ADDR=localhost
+        export MASTER_PORT=5678
+
+        Then run the following code in two different processes:
+
+        >>> # xdoctest: +SKIP
+        >>> # On worker 0:
+        >>> import torch
+        >>> import torch.distributed.rpc as rpc
+        >>> rpc.init_rpc("worker0", rank=0, world_size=2)
+        >>> rref1 = rpc.remote("worker1", torch.add, args=(torch.ones(2), 3))
+        >>> rref2 = rpc.remote("worker1", torch.add, args=(torch.ones(2), 1))
+        >>> x = rref1.to_here() + rref2.to_here()
+        >>> rpc.shutdown()
+
+        >>> # On worker 1:
+        >>> import torch.distributed.rpc as rpc
+        >>> rpc.init_rpc("worker1", rank=1, world_size=2)
+        >>> rpc.shutdown()
+
+        Below is an example of running a TorchScript function using RPC.
+
+        >>> # On both workers:
+        >>> @torch.jit.script
+        >>> def my_script_add(tensor: torch.Tensor, scalar: int):
+        >>>    return torch.add(tensor, scalar)
+
+        >>> # On worker 0:
+        >>> import torch.distributed.rpc as rpc
+        >>> rpc.init_rpc("worker0", rank=0, world_size=2)
+        >>> rref = rpc.remote("worker1", my_script_add, args=(torch.ones(2), 3))
+        >>> rref.to_here()
+        >>> rpc.shutdown()
+
+        >>> # On worker 1:
+        >>> import torch.distributed.rpc as rpc
+        >>> rpc.init_rpc("worker1", rank=1, world_size=2)
+        >>> rpc.shutdown()
+    """
+    torch._C._log_api_usage_once("torch.distributed.rpc_remote")
+    qualified_name = torch.jit._builtins._find_builtin(func)
+    dst_worker_info = _to_worker_info(to)
+    should_profile = _get_should_profile()
+
+    ctx_manager = _enable_rpc_profiler(should_profile, qualified_name, func, RPCExecMode.REMOTE, dst_worker_info)
+
+    with ctx_manager as rf:
+        args = args if args else ()
+        kwargs = kwargs if kwargs else {}
+
+        is_async_exec = hasattr(func, "_wrapped_async_rpc_function")
+
+        if is_async_exec:
+            wrapped = func._wrapped_async_rpc_function
+            if isinstance(wrapped, torch.jit.ScriptFunction):
+                func = wrapped
+
+        if qualified_name is not None:
+            rref = _invoke_remote_builtin(dst_worker_info, qualified_name, timeout, *args, **kwargs)
+        elif isinstance(func, torch.jit.ScriptFunction):
+            rref = _invoke_remote_torchscript(
+                dst_worker_info.name,
+                torch._jit_internal._qualified_name(func),
+                timeout,
+                is_async_exec,
+                *args,
+                **kwargs,
+            )
+        else:
+            (pickled_python_udf, tensors) = _default_pickler.serialize(
+                PythonUDF(func, args, kwargs)
+            )
+            rref = _invoke_remote_python_udf(
+                dst_worker_info,
+                pickled_python_udf,
+                tensors,
+                timeout,
+                is_async_exec
+            )
+        # attach profiling information
+        if should_profile:
+            assert torch.autograd._profiler_enabled()
+            assert rf is not None
+            fut = rf._call_end_callbacks_on_future(rref._get_future())
+            rref._set_profiling_future(fut)
+
+    return rref
+
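+# A minimal sketch (illustration only, not part of the original module) of the
+# `confirmed_by_owner` guidance in the warning above: keep argument tensors
+# intact until the owner has confirmed the RRef. The worker name, the sleep
+# interval, and `import time` are assumptions.
+#
+# >>> t = torch.ones(2)
+# >>> rref = rpc.remote("worker1", torch.add, args=(t, 1))
+# >>> while not rref.confirmed_by_owner():
+# ...     time.sleep(0.01)  # do not mutate `t` while the copy may be pending
+# >>> t.add_(1)             # safe to modify `t` now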
+
+def _invoke_rpc(to, func, rpc_type, args=None, kwargs=None, rpc_timeout: float = UNSET_RPC_TIMEOUT):
+    if not callable(func):
+        raise TypeError("function should be callable.")
+
+    qualified_name = torch.jit._builtins._find_builtin(func)
+    dst_worker_info = _to_worker_info(to)
+
+    should_profile = _get_should_profile()
+
+    ctx_manager = _enable_rpc_profiler(should_profile, qualified_name, func, rpc_type, dst_worker_info)
+
+    with ctx_manager as rf:
+        args = args if args else ()
+        kwargs = kwargs if kwargs else {}
+
+        is_async_exec = hasattr(func, "_wrapped_async_rpc_function")
+
+        if is_async_exec:
+            wrapped = func._wrapped_async_rpc_function
+            if isinstance(wrapped, torch.jit.ScriptFunction):
+                func = wrapped
+
+        if qualified_name is not None:
+            fut = _invoke_rpc_builtin(
+                dst_worker_info,
+                qualified_name,
+                rpc_timeout,
+                *args,
+                **kwargs
+            )
+        elif isinstance(func, torch.jit.ScriptFunction):
+            fut = _invoke_rpc_torchscript(
+                dst_worker_info.name,
+                torch._jit_internal._qualified_name(func),
+                args,
+                kwargs,
+                rpc_timeout,
+                is_async_exec
+            )
+        else:
+            (pickled_python_udf, tensors) = _default_pickler.serialize(
+                PythonUDF(func, args, kwargs)
+            )
+            fut = _invoke_rpc_python_udf(
+                dst_worker_info,
+                pickled_python_udf,
+                tensors,
+                rpc_timeout,
+                is_async_exec
+            )
+        if should_profile:
+            assert torch.autograd._profiler_enabled()
+            assert rf is not None
+            # Schedule profiling callbacks to run when the future completes.
+            # This returns a future that is completed when the original future
+            # completes and the profiling callbacks have been completed as well,
+            # to guarantee that fut.wait() completes the profiling. This new
+            # future will contain the same value as the original future.
+            fut = rf._call_end_callbacks_on_future(fut)
+    return fut
+
+
+@_require_initialized
+def rpc_sync(to, func, args=None, kwargs=None, timeout: float = UNSET_RPC_TIMEOUT):
+    r"""
+    Make a blocking RPC call to run function ``func`` on worker ``to``. RPC
+    messages are sent and received in parallel to execution of Python code. This
+    method is thread-safe.
+
+    Args:
+        to (str or WorkerInfo or int): name/rank/``WorkerInfo`` of the destination worker.
+        func (Callable): a callable function, such as Python callables, builtin
+                         operators (e.g. :meth:`~torch.add`) and annotated
+                         TorchScript functions.
+        args (tuple): the argument tuple for the ``func`` invocation.
+        kwargs (dict): a dictionary of keyword arguments for the ``func``
+                       invocation.
+        timeout (float, optional): timeout in seconds to use for this RPC. If
+                                   the RPC does not complete in this amount of
+                                   time, an exception indicating it has
+                                   timed out will be raised. A value of 0
+                                   indicates an infinite timeout, i.e. a timeout
+                                   error will never be raised. If not provided,
+                                   the default value set during initialization
+                                   or with ``_set_rpc_timeout`` is used.
+
+    Returns:
+        Returns the result of running ``func`` with ``args`` and ``kwargs``.
+
+    Example::
+        Make sure that ``MASTER_ADDR`` and ``MASTER_PORT`` are set properly
+        on both workers. Refer to :meth:`~torch.distributed.init_process_group`
+        API for more details. For example,
+
+        export MASTER_ADDR=localhost
+        export MASTER_PORT=5678
+
+        Then run the following code in two different processes:
+
+        >>> # xdoctest: +SKIP
+        >>> # On worker 0:
+        >>> import torch
+        >>> import torch.distributed.rpc as rpc
+        >>> rpc.init_rpc("worker0", rank=0, world_size=2)
+        >>> ret = rpc.rpc_sync("worker1", torch.add, args=(torch.ones(2), 3))
+        >>> rpc.shutdown()
+
+        >>> # On worker 1:
+        >>> import torch.distributed.rpc as rpc
+        >>> rpc.init_rpc("worker1", rank=1, world_size=2)
+        >>> rpc.shutdown()
+
+        Below is an example of running a TorchScript function using RPC.
+
+        >>> # On both workers:
+        >>> @torch.jit.script
+        >>> def my_script_add(tensor: torch.Tensor, scalar: int):
+        >>>    return torch.add(tensor, scalar)
+
+        >>> # On worker 0:
+        >>> import torch.distributed.rpc as rpc
+        >>> rpc.init_rpc("worker0", rank=0, world_size=2)
+        >>> ret = rpc.rpc_sync("worker1", my_script_add, args=(torch.ones(2), 3))
+        >>> rpc.shutdown()
+
+        >>> # On worker 1:
+        >>> import torch.distributed.rpc as rpc
+        >>> rpc.init_rpc("worker1", rank=1, world_size=2)
+        >>> rpc.shutdown()
+
+    """
+    torch._C._log_api_usage_once("torch.distributed.rpc_sync")
+    fut = _invoke_rpc(to, func, RPCExecMode.SYNC, args, kwargs, timeout)
+    return fut.wait()
+
+
+@_require_initialized
+def rpc_async(to, func, args=None, kwargs=None, timeout=UNSET_RPC_TIMEOUT):
+    r"""
+    Make a non-blocking RPC call to run function ``func`` on worker ``to``. RPC
+    messages are sent and received in parallel to execution of Python code. This
+    method is thread-safe. This method will immediately return a
+    :class:`~torch.futures.Future` that can be awaited on.
+
+    Args:
+        to (str or WorkerInfo or int): name/rank/``WorkerInfo`` of the destination worker.
+        func (Callable): a callable function, such as Python callables, builtin
+                         operators (e.g. :meth:`~torch.add`) and annotated
+                         TorchScript functions.
+        args (tuple): the argument tuple for the ``func`` invocation.
+        kwargs (dict): a dictionary of keyword arguments for the ``func``
+                       invocation.
+        timeout (float, optional): timeout in seconds to use for this RPC. If
+                                   the RPC does not complete in this amount of
+                                   time, an exception indicating it has
+                                   timed out will be raised. A value of 0
+                                   indicates an infinite timeout, i.e. a timeout
+                                   error will never be raised. If not provided,
+                                   the default value set during initialization
+                                   or with ``_set_rpc_timeout`` is used.
+
+
+    Returns:
+        Returns a :class:`~torch.futures.Future` object that can be waited
+        on. When completed, the return value of ``func`` on ``args`` and
+        ``kwargs`` can be retrieved from the :class:`~torch.futures.Future`
+        object.
+
+    .. warning ::
+        Using GPU tensors as arguments or return values of ``func`` is not
+        supported, because GPU tensors cannot be sent over the wire. Explicitly
+        copy GPU tensors to CPU before using them as arguments or return values
+        of ``func``.
+
+    .. warning ::
+        The ``rpc_async`` API does not copy storages of argument tensors until
+        sending them over the wire, which could be done by a different thread
+        depending on the RPC backend type. The caller should make sure that the
+        contents of those tensors stay intact until the returned
+        :class:`~torch.futures.Future` completes.
+
+    Example::
+        Make sure that ``MASTER_ADDR`` and ``MASTER_PORT`` are set properly
+        on both workers. Refer to :meth:`~torch.distributed.init_process_group`
+        API for more details. For example,
+
+        export MASTER_ADDR=localhost
+        export MASTER_PORT=5678
+
+        Then run the following code in two different processes:
+
+        >>> # xdoctest: +SKIP
+        >>> # On worker 0:
+        >>> import torch
+        >>> import torch.distributed.rpc as rpc
+        >>> rpc.init_rpc("worker0", rank=0, world_size=2)
+        >>> fut1 = rpc.rpc_async("worker1", torch.add, args=(torch.ones(2), 3))
+        >>> fut2 = rpc.rpc_async("worker1", min, args=(1, 2))
+        >>> result = fut1.wait() + fut2.wait()
+        >>> rpc.shutdown()
+
+        >>> # On worker 1:
+        >>> import torch.distributed.rpc as rpc
+        >>> rpc.init_rpc("worker1", rank=1, world_size=2)
+        >>> rpc.shutdown()
+
+        Below is an example of running a TorchScript function using RPC.
+
+        >>> # On both workers:
+        >>> @torch.jit.script
+        >>> def my_script_add(tensor: torch.Tensor, scalar: int):
+        >>>    return torch.add(tensor, scalar)
+
+        >>> # On worker 0:
+        >>> import torch.distributed.rpc as rpc
+        >>> rpc.init_rpc("worker0", rank=0, world_size=2)
+        >>> fut = rpc.rpc_async("worker1", my_script_add, args=(torch.ones(2), 3))
+        >>> ret = fut.wait()
+        >>> rpc.shutdown()
+
+        >>> # On worker 1:
+        >>> import torch.distributed.rpc as rpc
+        >>> rpc.init_rpc("worker1", rank=1, world_size=2)
+        >>> rpc.shutdown()
+    """
+    torch._C._log_api_usage_once("torch.distributed.rpc_async")
+    fut = _invoke_rpc(to, func, RPCExecMode.ASYNC, args, kwargs, timeout)
+    if hasattr(_thread_local_var, "future_list"):
+        _thread_local_var.future_list.append(fut)
+    return fut
+
+
+def _get_should_profile():
+    # The legacy profiler must be enabled; RPC profiling is not supported with
+    # the Kineto profiler.
+    ActiveProfilerType = torch._C._profiler.ActiveProfilerType
+    return (
+        torch.autograd._profiler_enabled() and
+        torch._C._autograd._profiler_type() == ActiveProfilerType.LEGACY  # type: ignore[attr-defined]
+    )
+
+
+def _enable_rpc_profiler(should_profile, qualified_name, func, rpc_type, dst_worker_info):
+    ctx_manager = contextlib.nullcontext()
+
+    if should_profile:
+        # Create appropriate string representation based on type of func
+        # (builtin, script, python)
+        if qualified_name is None:
+            func_name = (
+                torch._jit_internal._qualified_name(func)
+                if isinstance(func, torch.jit.ScriptFunction)
+                else func.__qualname__
+            )
+        else:
+            func_name = qualified_name
+        # Build RPC profiling key.
+        rpc_profiling_key = _build_rpc_profiling_key(
+            rpc_type,
+            func_name,
+            get_worker_info().name,
+            dst_worker_info.name,
+        )
+        RemoteProfilerManager.set_current_profiling_key(rpc_profiling_key)
+        # Mypy doesn't support re-def of a variable not in the same block (#1174)
+        ctx_manager = torch.autograd.profiler.record_function(rpc_profiling_key)  # type: ignore[assignment]
+
+    return ctx_manager
diff --git a/MLPY/Lib/site-packages/torch/distributed/rpc/backend_registry.py b/MLPY/Lib/site-packages/torch/distributed/rpc/backend_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..e28651842199b3e4cc10c8c2665a277f9dbf9da3
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/rpc/backend_registry.py
@@ -0,0 +1,395 @@
+__all__ = ["init_backend", "backend_registered", "construct_rpc_backend_options", "register_backend", "BackendType", "BackendValue"]
+
+import collections
+import enum
+from typing import cast, Dict, List, Set, Tuple
+
+import torch
+import torch.distributed as dist
+from ._utils import _group_membership_management, _update_group_membership
+
+from . import api
+from . import constants as rpc_constants
+
+__all__ = ["backend_registered", "register_backend", "construct_rpc_backend_options", "init_backend",
+           "BackendValue", "BackendType"]
+
+BackendValue = collections.namedtuple(
+    "BackendValue", ["construct_rpc_backend_options_handler", "init_backend_handler"]
+)
+
+
+def _backend_type_repr(self):
+    return "BackendType." + self.name
+
+
+_backend_type_doc = """
+    An enum class of available backends.
+
+    PyTorch ships with a builtin ``BackendType.TENSORPIPE`` backend.
+    Additional ones can be registered using the
+    :func:`~torch.distributed.rpc.backend_registry.register_backend` function.
+"""
+
+# Create an enum type, `BackendType`, with empty members.
+# Can't handle Function Enum API (mypy bug #9079)
+BackendType = enum.Enum(value="BackendType", names=dict())  # type: ignore[misc]
+# Unable to assign a function a method (mypy bug #2427)
+BackendType.__repr__ = _backend_type_repr  # type: ignore[assignment]
+
+if BackendType.__doc__:
+    BackendType.__doc__ = _backend_type_doc
+
+def backend_registered(backend_name):
+    """
+    Checks if backend_name is registered as an RPC backend.
+
+    Args:
+        backend_name (str): string to identify the RPC backend.
+    Returns:
+        True if the backend has been registered with ``register_backend``, else
+        False.
+    """
+    return backend_name in BackendType.__members__.keys()
+
+
+def register_backend(
+    backend_name, construct_rpc_backend_options_handler, init_backend_handler
+):
+    """Registers a new RPC backend.
+
+    Args:
+        backend_name (str): backend string to identify the handler.
+        construct_rpc_backend_options_handler (function):
+            Handler that is invoked when
+            rpc_backend.construct_rpc_backend_options(**dict) is called.
+        init_backend_handler (function): Handler that is invoked when the
+            `_init_rpc_backend()` function is called with a backend. It
+            constructs and returns the RPC agent.
+    """
+    global BackendType
+    if backend_registered(backend_name):
+        raise RuntimeError(f"RPC backend {backend_name}: already registered")
+    # Create a new enum type, `BackendType`, with extended members.
+    existing_enum_dict = {member.name: member.value for member in BackendType}
+    extended_enum_dict = dict(
+        {
+            backend_name: BackendValue(
+                construct_rpc_backend_options_handler=construct_rpc_backend_options_handler,
+                init_backend_handler=init_backend_handler,
+            )
+        },
+        **existing_enum_dict
+    )
+    # Can't handle Function Enum API (mypy bug #9079)
+    BackendType = enum.Enum(value="BackendType", names=extended_enum_dict)  # type: ignore[misc]
+    # Unable to assign a function a method (mypy bug #2427)
+    BackendType.__repr__ = _backend_type_repr  # type: ignore[assignment]
+    if BackendType.__doc__:
+        BackendType.__doc__ = _backend_type_doc
+    return BackendType[backend_name]
+
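+# A hypothetical registration sketch (illustration only, not part of the original
+# module). The handler names and the "MY_BACKEND" key are assumptions; real
+# handlers must return RPC backend options and an RpcAgent, respectively.
+#
+# >>> def my_construct_options_handler(rpc_timeout, init_method, **kwargs):
+# ...     ...  # return an RpcBackendOptions-like object
+# >>> def my_init_backend_handler(store, name, rank, world_size, rpc_backend_options):
+# ...     ...  # construct and return an RpcAgent
+# >>> if not backend_registered("MY_BACKEND"):
+# ...     register_backend("MY_BACKEND",
+# ...                      my_construct_options_handler,
+# ...                      my_init_backend_handler)
+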
+def construct_rpc_backend_options(
+    backend,
+    rpc_timeout=rpc_constants.DEFAULT_RPC_TIMEOUT_SEC,
+    init_method=rpc_constants.DEFAULT_INIT_METHOD,
+    **kwargs
+):
+
+    return backend.value.construct_rpc_backend_options_handler(
+        rpc_timeout, init_method, **kwargs
+    )
+
+def init_backend(backend, *args, **kwargs):
+    return backend.value.init_backend_handler(*args, **kwargs)
+
+def _init_process_group(store, rank, world_size):
+    # Initialize ProcessGroup.
+    process_group_timeout = rpc_constants.DEFAULT_PROCESS_GROUP_TIMEOUT
+
+    # We're using a bunch of private APIs here since `new_group` requires the
+    # default group to be initialized.
+    group = dist.ProcessGroupGloo(store, rank, world_size, process_group_timeout)
+
+    assert group is not None, "Failed to initialize default ProcessGroup."
+
+    if (rank != -1) and (rank != group.rank()):
+        raise RuntimeError(
+            f"rank argument {rank} doesn't match pg rank {group.rank()}"
+        )
+    if (world_size != -1) and (world_size != group.size()):
+        raise RuntimeError(
+            f"world_size argument {world_size} doesn't match pg size {group.size()}"
+        )
+    return group
+
+def _tensorpipe_construct_rpc_backend_options_handler(
+    rpc_timeout,
+    init_method,
+    num_worker_threads=rpc_constants.DEFAULT_NUM_WORKER_THREADS,
+    _transports=None,
+    _channels=None,
+    **kwargs
+):
+    from . import TensorPipeRpcBackendOptions
+
+    return TensorPipeRpcBackendOptions(
+        rpc_timeout=rpc_timeout,
+        init_method=init_method,
+        num_worker_threads=num_worker_threads,
+        _transports=_transports,
+        _channels=_channels,
+    )
+
+
+def _tensorpipe_validate_devices(devices, device_count):
+    return all(
+        d.type == "cpu" or (d.type == "cuda" and 0 <= d.index < device_count)
+        for d in devices
+    )
+
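+# Sketch of the check above (illustration only, not part of the original module):
+# CPU devices always pass; CUDA devices must have an index below the device count.
+#
+# >>> _tensorpipe_validate_devices([torch.device("cpu"), torch.device("cuda:1")], 2)
+# True
+# >>> _tensorpipe_validate_devices([torch.device("cuda:3")], 2)
+# False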
+
+# Detect whether any worker has an invalid device_map configuration, and return
+# the reverse device maps.
+def _tensorpipe_exchange_and_check_all_device_maps(
+    my_name, my_device_count, my_device_maps, my_devices, group
+):
+    gathered: List[Tuple[
+        str, int, Dict[str, Dict[torch.device, torch.device]], List[torch.device]
+    ]] = [("", 0, {}, []) for _ in range(group.size())]
+    dist.all_gather_object(
+        gathered, (my_name, my_device_count, my_device_maps, my_devices), group
+    )
+    all_names = [name for name, _, _, _ in gathered]
+    all_device_counts = {name: count for name, count, _, _ in gathered}
+    all_device_maps = {name: map_ for name, _, map_, _ in gathered}
+    all_devices = {name: devices for name, _, _, devices in gathered}
+
+    _validate_device_maps(all_names, all_device_counts, all_device_maps, all_devices)
+
+    # All checks passed; construct the reverse mapping and get the list of devices handled by this agent.
+    reverse_device_maps = _create_reverse_mapping(my_name, all_names, all_device_maps)
+    my_devices = _create_device_list(my_devices, my_device_maps, reverse_device_maps)
+    return reverse_device_maps, my_devices
+
+def _validate_device_maps(all_names, all_device_counts, all_device_maps, all_devices, is_static_group=True):
+    for node in all_names:
+        devices = all_devices[node]
+        if len(set(devices)) != len(devices):
+            raise ValueError(
+                f"Node {node} has duplicated devices\n"
+                f"devices = {devices}"
+            )
+        if not _tensorpipe_validate_devices(devices, all_device_counts[node]):
+            raise ValueError(
+                f"Node {node} has devices with invalid indices\n"
+                f"devices = {devices}\n"
+                f"device count = {all_device_counts[node]}"
+            )
+
+    for source_node in all_names:
+        # For a dynamic (non-static) group, do not check the target node names since they may not have joined yet.
+        if is_static_group and not set(all_device_maps[source_node].keys()).issubset(all_names):
+            raise ValueError(
+                f"Node {source_node} has invalid target node names in its device maps\n"
+                f"device maps = {all_device_maps[source_node].keys()}\n"
+                f"node names = {all_names}"
+            )
+        for target_node, map_ in all_device_maps[source_node].items():
+            if len(set(map_.values())) != len(map_):
+                raise ValueError(
+                    f"Node {source_node} has duplicated target devices "
+                    f"in its device map for {target_node}\n"
+                    f"device map = {map_}"
+                )
+            if all_devices[source_node]:
+                if not set(map_.keys()).issubset(all_devices[source_node]):
+                    raise ValueError(
+                        f"Node {source_node} has unexpected source devices "
+                        f"in its device map for {target_node}\n"
+                        f"device map = {map_}\n"
+                        f"devices = {all_devices[source_node]}"
+                    )
+            elif not _tensorpipe_validate_devices(
+                map_.keys(), all_device_counts[source_node]
+            ):
+                raise ValueError(
+                    f"Node {source_node} has source devices with invalid indices "
+                    f"in its device map for {target_node}\n"
+                    f"device map = {map_}\n"
+                    f"device count = {all_device_counts[source_node]}"
+                )
+            if all_devices.get(target_node, []):
+                if not set(map_.values()).issubset(all_devices[target_node]):
+                    raise ValueError(
+                        f"Node {source_node} has unexpected target devices "
+                        f"in its device map for {target_node}\n"
+                        f"device map = {map_}\n"
+                        f"devices = {all_devices[target_node]}"
+                    )
+            elif target_node in all_device_counts and not _tensorpipe_validate_devices(
+                map_.values(), all_device_counts[target_node]
+            ):
+                raise ValueError(
+                    f"Node {source_node} has target devices with invalid indices "
+                    f"in its device map for {target_node}\n"
+                    f"device map = {map_}\n"
+                    f"device count = {all_device_counts[target_node]}"
+                )
+
+def _create_device_list(my_devices, my_device_maps, reverse_device_maps):
+    if not my_devices:
+        devices_set: Set[torch.device] = set()
+        for map_ in my_device_maps.values():
+            devices_set.update(map_.keys())
+        for map_ in reverse_device_maps.values():
+            devices_set.update(map_.keys())
+        devices_set.discard(torch.device("cpu"))
+        my_devices = list(devices_set)
+    my_devices = sorted(my_devices, key=lambda d: d.index)
+    return my_devices
+
+def _create_reverse_mapping(my_name, all_names, all_device_maps):
+    reverse_device_maps: Dict[str, Dict[torch.device, torch.device]] = {}
+    for node in all_names:
+        if my_name in all_device_maps[node]:
+            reverse_device_maps[node] = {
+                v: k for k, v in all_device_maps[node][my_name].items()
+            }
+    return reverse_device_maps
+
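+# A minimal sketch (illustration only, not part of the original module): if
+# "worker1" maps its cuda:0 onto this worker's ("worker0") cuda:1, the reverse
+# map lets "worker0" route tensors back along the inverted pairing.
+#
+# >>> maps = {"worker1": {"worker0": {torch.device("cuda:0"): torch.device("cuda:1")}}}
+# >>> _create_reverse_mapping("worker0", ["worker1"], maps)
+# {'worker1': {device(type='cuda', index=1): device(type='cuda', index=0)}}
+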
+def _get_device_infos():
+    from . import TensorPipeAgent
+    agent = cast(TensorPipeAgent, api._get_current_rpc_agent())
+    opts = agent._get_backend_options()
+    device_count = torch.cuda.device_count()
+    if torch.cuda.is_available() and opts.devices:
+        torch.cuda.init()
+    return device_count, opts.device_maps, opts.devices
+
+def _set_devices_and_reverse_device_map(agent):
+    from . import TensorPipeAgent
+    agent = cast(TensorPipeAgent, agent)
+    # Group state is retrieved from local agent
+    # On initialization, tensorpipe agent retrieves information from all existing workers, so group state is valid
+    my_worker_info = agent.get_worker_info()
+    my_name = my_worker_info.name
+    all_worker_infos = agent.get_worker_infos()
+    # One round to get device_maps of all workers and construct reverse device maps
+    all_device_counts, all_device_maps, all_devices, all_names = {}, {}, {}, []
+    for worker_info in all_worker_infos:
+        worker_name = worker_info.name
+        if worker_name != my_name:
+            # TODO: make async?
+            device_count, device_map, devices = api.rpc_sync(worker_name, _get_device_infos)
+        else:
+            opts = agent._get_backend_options()
+            device_count, device_map, devices = torch.cuda.device_count(), opts.device_maps, opts.devices
+        all_device_counts[worker_name] = device_count
+        all_device_maps[worker_name] = device_map
+        all_devices[worker_name] = devices
+        all_names.append(worker_name)
+
+    _validate_device_maps(all_names, all_device_counts, all_device_maps, all_devices, is_static_group=False)
+    reverse_device_maps = _create_reverse_mapping(my_name, all_names, all_device_maps)
+
+    # Perform RPC call to all workers, including itself, to include newly joined worker information and device maps
+    for worker_name in all_names:
+        # Set device list for each worker
+        all_devices[worker_name] = _create_device_list(all_devices[worker_name], all_device_maps[worker_name], reverse_device_maps)
+        api.rpc_sync(worker_name, _update_group_membership,
+                     args=(my_worker_info, all_devices[worker_name], reverse_device_maps, True))
+
+def _tensorpipe_init_backend_handler(store, name, rank, world_size, rpc_backend_options):
+    from . import TensorPipeAgent
+    from . import TensorPipeRpcBackendOptions
+    if not isinstance(store, dist.Store):
+        raise TypeError(f"`store` must be a c10d::Store. {store}")
+
+    if not isinstance(
+        rpc_backend_options, TensorPipeRpcBackendOptions
+    ):
+        raise TypeError(
+            f"`rpc_backend_options` must be a `TensorPipeRpcBackendOptions`. {rpc_backend_options}"
+        )
+
+    device_count = torch.cuda.device_count()
+
+    # If world_size is specified, this is a static group (ranks cannot join or leave).
+    is_static_group = bool(world_size)
+    if is_static_group:
+        # The agent's join method is required to behave like a barrier and perform
+        # collective operations, for which it relies on a process group, instead of
+        # re-implementing this on top of RPCs.
+        group = _init_process_group(store, rank, world_size)
+
+        reverse_device_maps, devices = _tensorpipe_exchange_and_check_all_device_maps(
+            name,
+            device_count,
+            rpc_backend_options.device_maps,
+            rpc_backend_options.devices,
+            group,
+        )
+
+        if torch.cuda.is_available() and devices:
+            # It's necessary to initialize PyTorch CUDA states here (e.g.,
+            # CUDACachingAllocator). If this is missing, we could hit errors like
+            # "allocator not initialized", because other processes might send
+            # CUDA-related RPC request to this process before user code in this
+            # process initializes its PyTorch CUDA states.
+            torch.cuda.init()
+
+        # TODO: add try-except and destroy _agent in all processes if any fails.
+        agent = TensorPipeAgent(
+            store,
+            name,
+            rank,
+            world_size,
+            rpc_backend_options,
+            reverse_device_maps,
+            devices,
+        )
+
+        api._init_rpc_states(agent)
+
+        # Run one dummy round of RPC to initialize channels/transports. Without
+        # this, it's easy to hit timeout in rpc.shutdown() if there is no other RPC
+        # on that process before rpc.shutdown(), as the agent initialization can
+        # take longer than 5s.
+        api._all_gather(None, timeout=rpc_backend_options.rpc_timeout)
+        # Need a barrier here to make sure no peers leave before the rank0 finishes
+        # _all_gather
+        group.barrier().wait()
+
+        return agent
+    # initialization for dynamic rpc (ranks can join and leave)
+    else:
+        with _group_membership_management(store, name, True):
+            # Construct TPAgent with empty reverse_device_map and devices
+            # these properties will be updated after initialization
+            agent = TensorPipeAgent(
+                store,
+                name,
+                rank,
+                world_size,
+                rpc_backend_options,
+                {},
+                [],
+            )
+            api._init_rpc_states(agent)
+
+            try:
+                # Notify all workers in the group that this rank has joined, and set
+                # the devices and reverse_device_map. This is a synchronous operation
+                # that completes once all existing ranks are updated.
+                _set_devices_and_reverse_device_map(agent)
+            except Exception:
+                api.shutdown()
+                raise
+            return agent
+
+register_backend(
+    "TENSORPIPE",
+    _tensorpipe_construct_rpc_backend_options_handler,
+    _tensorpipe_init_backend_handler,
+)
diff --git a/MLPY/Lib/site-packages/torch/distributed/rpc/constants.py b/MLPY/Lib/site-packages/torch/distributed/rpc/constants.py
new file mode 100644
index 0000000000000000000000000000000000000000..3dba916b05002459e67c4499e7fdb450ef3dbb38
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/rpc/constants.py
@@ -0,0 +1,24 @@
+from datetime import timedelta
+from typing import List
+from torch._C._distributed_rpc import (
+    _DEFAULT_INIT_METHOD,
+    _DEFAULT_NUM_WORKER_THREADS,
+    _DEFAULT_RPC_TIMEOUT_SEC,
+    _UNSET_RPC_TIMEOUT,
+)
+
+
+# For any RpcAgent.
+DEFAULT_RPC_TIMEOUT_SEC: float = _DEFAULT_RPC_TIMEOUT_SEC
+DEFAULT_INIT_METHOD: str = _DEFAULT_INIT_METHOD
+DEFAULT_SHUTDOWN_TIMEOUT: float = 0
+
+# For TensorPipeAgent.
+DEFAULT_NUM_WORKER_THREADS: int = _DEFAULT_NUM_WORKER_THREADS
+# Ensure that we don't time out when there are long periods of time without
+# any operations against the underlying ProcessGroup.
+DEFAULT_PROCESS_GROUP_TIMEOUT: timedelta = timedelta(milliseconds=2 ** 31 - 1)
+# Value indicating that timeout is not set for RPC call, and the default should be used.
+UNSET_RPC_TIMEOUT: float = _UNSET_RPC_TIMEOUT
+
+__all__: List[str] = []
diff --git a/MLPY/Lib/site-packages/torch/distributed/rpc/functions.py b/MLPY/Lib/site-packages/torch/distributed/rpc/functions.py
new file mode 100644
index 0000000000000000000000000000000000000000..31d76b5f7e7d3ab72bdaee791c523ca068190431
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/rpc/functions.py
@@ -0,0 +1,166 @@
+import functools
+
+
+def async_execution(fn):
+    r"""
+    A decorator for a function indicating that the return value of the function
+    is guaranteed to be a :class:`~torch.futures.Future` object and this
+    function can run asynchronously on the RPC callee. More specifically, the
+    callee extracts the :class:`~torch.futures.Future` returned by the wrapped
+    function and installs subsequent processing steps as a callback to that
+    :class:`~torch.futures.Future`. The installed callback will read the value
+    from the :class:`~torch.futures.Future` when completed and send the
+    value back as the RPC response. That also means the returned
+    :class:`~torch.futures.Future` only exists on the callee side and is never
+    sent through RPC. This decorator is useful when the wrapped function's
+    (``fn``) execution needs to pause and resume due to, e.g., containing
+    :meth:`~torch.distributed.rpc.rpc_async` or waiting for other signals.
+
+    .. note:: To enable asynchronous execution, applications must pass the
+        function object returned by this decorator to RPC APIs. If RPC detects
+        attributes installed by this decorator, it knows that this function
+        returns a ``Future`` object and will handle that accordingly.
+        However, this does not mean this decorator has to be the outermost one
+        when defining a function. For example, when combined with ``@staticmethod``
+        or ``@classmethod``, ``@rpc.functions.async_execution`` needs to be the
+        inner decorator to allow the target function to be recognized as a static
+        or class function. This target function can still execute asynchronously
+        because, when accessed, the static or class method preserves attributes
+        installed by ``@rpc.functions.async_execution``.
+
+
+    Example::
+        The returned :class:`~torch.futures.Future` object can come from
+        :meth:`~torch.distributed.rpc.rpc_async`,
+        :meth:`~torch.futures.Future.then`, or :class:`~torch.futures.Future`
+        constructor. The example below shows directly using the
+        :class:`~torch.futures.Future` returned by
+        :meth:`~torch.futures.Future.then`.
+
+        >>> from torch.distributed import rpc
+        >>>
+        >>> # omitting setup and shutdown RPC
+        >>>
+        >>> # On all workers
+        >>> @rpc.functions.async_execution
+        >>> def async_add_chained(to, x, y, z):
+        >>>     # This function runs on "worker1" and returns immediately when
+        >>>     # the callback is installed through the `then(cb)` API. In the
+        >>>     # mean time, the `rpc_async` to "worker2" can run concurrently.
+        >>>     # When the return value of that `rpc_async` arrives at
+        >>>     # "worker1", "worker1" will run the lambda function accordingly
+        >>>     # and set the value for the previously returned `Future`, which
+        >>>     # will then trigger RPC to send the result back to "worker0".
+        >>>     return rpc.rpc_async(to, torch.add, args=(x, y)).then(
+        >>>         lambda fut: fut.wait() + z
+        >>>     )
+        >>>
+        >>> # On worker0
+        >>> # xdoctest: +SKIP
+        >>> ret = rpc.rpc_sync(
+        >>>     "worker1",
+        >>>     async_add_chained,
+        >>>     args=("worker2", torch.ones(2), 1, 1)
+        >>> )
+        >>> print(ret)  # prints tensor([3., 3.])
+
+        When combined with TorchScript decorators, this decorator must be the
+        outermost one.
+
+        >>> from torch import Tensor
+        >>> from torch.futures import Future
+        >>> from torch.distributed import rpc
+        >>>
+        >>> # omitting setup and shutdown RPC
+        >>>
+        >>> # On all workers
+        >>> @torch.jit.script
+        >>> def script_add(x: Tensor, y: Tensor) -> Tensor:
+        >>>     return x + y
+        >>>
+        >>> @rpc.functions.async_execution
+        >>> @torch.jit.script
+        >>> def async_add(to: str, x: Tensor, y: Tensor) -> Future[Tensor]:
+        >>>     return rpc.rpc_async(to, script_add, (x, y))
+        >>>
+        >>> # On worker0
+        >>> ret = rpc.rpc_sync(
+        >>>     "worker1",
+        >>>     async_add,
+        >>>     args=("worker2", torch.ones(2), 1)
+        >>> )
+        >>> print(ret)  # prints tensor([2., 2.])
+
+        When combined with a static or class method, this decorator must be the
+        inner one.
+
+        >>> from torch.distributed import rpc
+        >>>
+        >>> # omitting setup and shutdown RPC
+        >>>
+        >>> # On all workers
+        >>> class AsyncExecutionClass:
+        >>>
+        >>>     @staticmethod
+        >>>     @rpc.functions.async_execution
+        >>>     def static_async_add(to, x, y, z):
+        >>>         return rpc.rpc_async(to, torch.add, args=(x, y)).then(
+        >>>             lambda fut: fut.wait() + z
+        >>>         )
+        >>>
+        >>>     @classmethod
+        >>>     @rpc.functions.async_execution
+        >>>     def class_async_add(cls, to, x, y, z):
+        >>>         ret_fut = torch.futures.Future()
+        >>>         rpc.rpc_async(to, torch.add, args=(x, y)).then(
+        >>>             lambda fut: ret_fut.set_result(fut.wait() + z)
+        >>>         )
+        >>>         return ret_fut
+        >>>
+        >>>     @rpc.functions.async_execution
+        >>>     def bound_async_add(self, to, x, y, z):
+        >>>         return rpc.rpc_async(to, torch.add, args=(x, y)).then(
+        >>>             lambda fut: fut.wait() + z
+        >>>         )
+        >>>
+        >>> # On worker0
+        >>> ret = rpc.rpc_sync(
+        >>>     "worker1",
+        >>>     AsyncExecutionClass.static_async_add,
+        >>>     args=("worker2", torch.ones(2), 1, 2)
+        >>> )
+        >>> print(ret)  # prints tensor([4., 4.])
+        >>>
+        >>> ret = rpc.rpc_sync(
+        >>>     "worker1",
+        >>>     AsyncExecutionClass.class_async_add,
+        >>>     args=("worker2", torch.ones(2), 1, 2)
+        >>> )
+        >>> print(ret)  # prints tensor([4., 4.])
+
+        This decorator also works with RRef helpers, i.e.,
+        :meth:`torch.distributed.rpc.RRef.rpc_sync`,
+        :meth:`torch.distributed.rpc.RRef.rpc_async`, and
+        :meth:`torch.distributed.rpc.RRef.remote`.
+
+        >>> from torch.distributed import rpc
+        >>>
+        >>> # reuse the AsyncExecutionClass class above
+        >>> rref = rpc.remote("worker1", AsyncExecutionClass)
+        >>> ret = rref.rpc_sync().static_async_add("worker2", torch.ones(2), 1, 2)
+        >>> print(ret)  # prints tensor([4., 4.])
+        >>>
+        >>> rref = rpc.remote("worker1", AsyncExecutionClass)
+        >>> ret = rref.rpc_async().static_async_add("worker2", torch.ones(2), 1, 2).wait()
+        >>> print(ret)  # prints tensor([4., 4.])
+        >>>
+        >>> rref = rpc.remote("worker1", AsyncExecutionClass)
+        >>> ret = rref.remote().static_async_add("worker2", torch.ones(2), 1, 2).to_here()
+        >>> print(ret)  # prints tensor([4., 4.])
+    """
+    @functools.wraps(fn)
+    def wrapper(*args, **kwargs):
+        return fn(*args, **kwargs)
+    # Can't declare and use attributes of function objects (mypy#2087)
+    wrapper._wrapped_async_rpc_function = fn  # type: ignore[attr-defined]
+    return wrapper
diff --git a/MLPY/Lib/site-packages/torch/distributed/rpc/internal.py b/MLPY/Lib/site-packages/torch/distributed/rpc/internal.py
new file mode 100644
index 0000000000000000000000000000000000000000..9938352d0e6bf9c68f9fa2edb4fd66d662b105e8
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/rpc/internal.py
@@ -0,0 +1,281 @@
+import collections
+import copyreg
+import io
+import pickle
+import sys
+import threading
+import traceback
+from enum import Enum
+
+import torch
+import torch.distributed as dist
+from torch._C._distributed_rpc import _get_current_rpc_agent
+
+__all__ = ["RPCExecMode", "serialize", "deserialize", "PythonUDF", "RemoteException"]
+
+# Thread local tensor tables to store tensors while pickling torch.Tensor
+# objects
+_thread_local_tensor_tables = threading.local()
+_pickler = pickle.Pickler
+_unpickler = pickle.Unpickler
+
+
+class RPCExecMode(Enum):
+    SYNC = "sync"
+    ASYNC = "async"
+    ASYNC_JIT = "async_jit"
+    REMOTE = "remote"
+
+
+class _InternalRPCPickler:
+    r"""
+    This class provides serialize() and deserialize() interfaces to serialize
+    data into a "binary string + tensor table" format.
+    For an RPC Python UDF function and its args, non-tensor data is serialized
+    into a regular binary string, while tensor data is put into thread-local
+    tensor tables. This serialization format is consistent with the one used for
+    builtin operators and args by the JIT pickler, and it makes tensor handling
+    in C++ much easier, e.g. attaching tensors to the distributed autograd graph.
+    """
+
+    def __init__(self):
+        # Ignore type error because dispatch_table is defined in third-party package
+        self._dispatch_table = copyreg.dispatch_table.copy()  # type: ignore[attr-defined]
+        self._dispatch_table[torch.Tensor] = self._tensor_reducer
+        # Used for registering customized picklers.
+        self._class_reducer_dict = {}
+
+    def _register_reducer(self, obj_class, reducer):
+        # For the same class, only register the reducer once.
+        if obj_class not in self._class_reducer_dict:
+            self._class_reducer_dict[obj_class] = reducer
+
+    @classmethod
+    def _tensor_receiver(cls, tensor_index):
+        global _thread_local_tensor_tables
+        return _thread_local_tensor_tables.recv_tables[tensor_index]
+
+    def _tensor_reducer(self, tensor):
+        global _thread_local_tensor_tables
+        _thread_local_tensor_tables.send_tables.append(tensor)
+        tensor_index = len(_thread_local_tensor_tables.send_tables) - 1
+        return (_InternalRPCPickler._tensor_receiver, (tensor_index,))
+
+    @classmethod
+    def _py_rref_receiver(cls, rref_fork_data):
+        return dist.rpc.PyRRef._deserialize(rref_fork_data)
+
+    def _py_rref_reducer(self, py_rref):
+        rref_fork_data = py_rref._serialize()
+        return (_InternalRPCPickler._py_rref_receiver, (rref_fork_data,))
+
+    def _rref_reducer(self, rref):
+        return self._py_rref_reducer(rref)
+
+    @classmethod
+    def _script_module_receiver(cls, script_module_serialized):
+        """
+        Given a serialized representation of a ScriptModule created with torch.jit.save,
+        loads and returns the ScriptModule.
+        """
+        f = io.BytesIO(script_module_serialized)
+        m = torch.jit.load(f)
+        return m
+
+    def _script_module_reducer(self, script_module):
+        """
+        Serializes a ScriptModule.
+        """
+        f = io.BytesIO()
+        torch.jit.save(script_module, f)
+        return (_InternalRPCPickler._script_module_receiver, (f.getvalue(),))
+
+    def serialize(self, obj):
+        r"""
+        Serialize non-tensor data into a binary string and tensor data into a
+        tensor table.
+        """
+        f = io.BytesIO()
+        p = _pickler(f)
+        p.dispatch_table = self._dispatch_table
+
+        # The RPC API can accept user picklers that inherit from _InternalRPCPickler
+        # to serialize RRefs. User picklers may have a different initialization
+        # function from _InternalRPCPickler, but they should all call serialize() and
+        # use _rref_reducer to pickle RRefs in Python. Also, when _internal_rpc_pickler
+        # is imported into rpc/api.py, rpc.RRef is not compiled yet, so the
+        # _InternalRPCPickler constructor is not a good place to access rpc.RRef;
+        # hence RRef's dispatch table entries are installed here.
+        #
+        # The return value of a `rpc.remote(..)` call is type of `rpc.PyRRef`.
+        # The deserialized RRef object on an RPC receiver side is type of `rpc.PyRRef`.
+        # Ignore type error because dispatch_table is defined in third-party package
+        p.dispatch_table[dist.rpc.PyRRef] = self._py_rref_reducer  # type: ignore[index]
+        # An RRef created locally by RRef Python constructor is type of `rpc.RRef`.
+        # Ignore type error because dispatch_table is defined in third-party package
+        p.dispatch_table[dist.rpc.RRef] = self._rref_reducer  # type: ignore[index]
+
+        # Add dispatch pickling for ScriptModule or its subclass.
+        if isinstance(obj, torch.jit.ScriptModule):
+            # Ignore type error because dispatch_table is defined in third-party package
+            p.dispatch_table[obj.__class__] = self._script_module_reducer  # type: ignore[index]
+
+        # Install customized picklers.
+        for class_name in self._class_reducer_dict.keys():
+            p.dispatch_table[class_name] = self._class_reducer_dict[class_name]  # type: ignore[index]
+
+        # save _thread_local_tensor_tables.send_tables if it is in nested call
+        global _thread_local_tensor_tables
+        if hasattr(_thread_local_tensor_tables, "send_tables"):
+            old_send_tables = _thread_local_tensor_tables.send_tables
+        else:
+            old_send_tables = None
+        _thread_local_tensor_tables.send_tables = []
+
+        p.dump(obj)
+
+        # restore _thread_local_tensor_tables.send_tables if return
+        # from nested call, otherwise clean up the table
+        tensors = _thread_local_tensor_tables.send_tables
+        if old_send_tables is not None:
+            _thread_local_tensor_tables.send_tables = old_send_tables
+        else:
+            del _thread_local_tensor_tables.send_tables
+
+        return (f.getvalue(), tensors)
+
+    def deserialize(self, binary_data, tensor_table):
+        r"""
+        Deserialize a binary string + tensor table into the original object.
+        """
+        # save _thread_local_tensor_tables.recv_tables if it is in nested call
+        global _thread_local_tensor_tables
+        if hasattr(_thread_local_tensor_tables, "recv_tables"):
+            old_recv_tables = _thread_local_tensor_tables.recv_tables
+        else:
+            old_recv_tables = None
+        _thread_local_tensor_tables.recv_tables = tensor_table
+
+        try:
+            unpickler = _unpickler(io.BytesIO(binary_data))
+            ret = unpickler.load()
+        except AttributeError as e:
+            # Occurs when function is not found on module/class during
+            # unpickling.
+            except_str = (
+                str(e)
+                + """ Default RPC pickler does not serialize
+            function code. Ensure that UDFs are defined on both caller and
+            callee modules."""
+            )
+            ret = AttributeError(except_str)
+            # Ensure the stack trace gets preserved
+            ret.__cause__ = e
+
+        # restore _thread_local_tensor_tables.recv_tables if return
+        # from nested call, otherwise clean up the table
+        if old_recv_tables is not None:
+            _thread_local_tensor_tables.recv_tables = old_recv_tables
+        else:
+            del _thread_local_tensor_tables.recv_tables
+
+        return ret
+
+
+# Create _internal_rpc_pickler only once to initialize _dispatch_table only once
+_internal_rpc_pickler = _InternalRPCPickler()
+
+
+def serialize(obj):
+    return _internal_rpc_pickler.serialize(obj)
+
+
+def deserialize(binary_data, tensor_table):
+    return _internal_rpc_pickler.deserialize(binary_data, tensor_table)
+
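+# A minimal round-trip sketch (illustration only, not part of the original
+# module): tensors are split out into a separate table and the remaining
+# payload is pickled into bytes, mirroring what the C++ layer expects.
+#
+# >>> payload, tensor_table = serialize(("scale", torch.ones(2)))
+# >>> deserialize(payload, tensor_table)
+# ('scale', tensor([1., 1.]))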
+
+def _run_function(python_udf):
+    r"""
+    This function is exclusively called from C++.
+    See ``torch/csrc/distributed/rpc/python_rpc_handler.cpp``.
+
+    Runs a Python UDF and returns its return value.
+    Wraps any exception in ``RemoteException`` if the function raises.
+    """
+    try:
+        if isinstance(python_udf, AttributeError):
+            raise python_udf
+        result = python_udf.func(*python_udf.args, **python_udf.kwargs)
+    except Exception as e:
+        # except str = exception info + traceback string
+        except_str = (
+            f"On {_get_current_rpc_agent().get_worker_info()}:\n"
+            f"{repr(e)}\n{traceback.format_exc()}"
+        )
+        print(except_str, file=sys.stderr)
+        result = RemoteException(except_str, type(e))
+    return result
+
+
+def _handle_exception(result):
+    if isinstance(result, RemoteException):
+        exception_msg = result.msg.encode("utf-8").decode("unicode_escape")
+        # We wrap exception re-creation here in case some exception classes
+        # cannot be constructed directly from a string.
+        exc = None
+        try:
+            exc = result.exception_type(exception_msg)
+        except BaseException as e:
+            raise RuntimeError(  # noqa: B904
+                f"Failed to create original exception type. Error msg was {str(e)}"
+                f" Original exception on remote side was {exception_msg}"
+            ) from e
+
+        if exc is not None:
+            raise exc
+
+
+def _build_rpc_profiling_key(
+    exec_type, func_name, current_worker_name, dst_worker_name
+):
+    """
+    Builds the key that RPC calls are profiled with using the autograd profiler.
+    This will be the name of the corresponding Event recorded in the profiler.
+
+    Args:
+        exec_type (RPCExecMode): Type of RPC/RRef call
+        func_name (str): Name of function being profiled.
+        current_worker_name (str): Name of current worker.
+        dst_worker_name (str): Name of the destination worker.
+
+    Returns:
+        String representing profiling key
+    """
+    profile_key = f"rpc_{exec_type.value}#{func_name}({current_worker_name} -> {dst_worker_name})"
+    return profile_key
+
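+# Sketch of the resulting key format (illustration only; the worker names and
+# function name are assumptions):
+#
+# >>> _build_rpc_profiling_key(RPCExecMode.SYNC, "torch.add", "worker0", "worker1")
+# 'rpc_sync#torch.add(worker0 -> worker1)'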
+
+def _start_record_function(exec_type, func_name, current_worker_name, dest_worker_name):
+    """
+    This function should be called from RPC/RRef functions to create a
+    RecordFunction object for profiling. This function also runs the before
+    callbacks that start the profiling, though the user is responsible for
+    running the appropriate callbacks when the function to be profiled finishes.
+
+    Args:
+        exec_type (RPCExecMode): Type of RPC/RRef call
+        func_name (str): Name of function being profiled.
+        current_worker_name (str): Name of current worker.
+        dest_worker_name (str): Name of the destination worker.
+
+    Returns:
+        An instance of `torch.autograd._RecordFunction`.
+    """
+    assert torch.autograd._profiler_enabled(), "Autograd profiler should be enabled."
+    profile_key = f"rpc_{exec_type.value}#{str(func_name)}({current_worker_name} -> {dest_worker_name})"
+    rf = torch.autograd._RecordFunction()  # type: ignore[attr-defined]
+    torch.autograd._run_before_callbacks(rf, profile_key)  # type: ignore[attr-defined]
+    return rf
+
+
+PythonUDF = collections.namedtuple("PythonUDF", ["func", "args", "kwargs"])
+RemoteException = collections.namedtuple("RemoteException", ["msg", "exception_type"])
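+
+
+# Round-trip sketch (illustrative; uses only names defined in this module and is
+# never called): the caller serializes a PythonUDF, the callee deserializes it
+# and invokes it, which is what ``_run_function`` does when called from C++.
+def _example_python_udf_roundtrip():
+    udf = PythonUDF(func=max, args=(1, 2), kwargs={})
+    binary_data, tensor_table = serialize(udf)
+    restored = deserialize(binary_data, tensor_table)
+    return restored.func(*restored.args, **restored.kwargs)  # -> 2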
diff --git a/MLPY/Lib/site-packages/torch/distributed/rpc/options.py b/MLPY/Lib/site-packages/torch/distributed/rpc/options.py
new file mode 100644
index 0000000000000000000000000000000000000000..0791fef1bfa6d1757027486293b0b6e4a148f1f5
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/rpc/options.py
@@ -0,0 +1,172 @@
+from typing import Dict, List, Optional, Union
+
+import torch
+from torch._C._distributed_rpc import _TensorPipeRpcBackendOptionsBase
+from . import constants as rpc_contants
+
+
+DeviceType = Union[int, str, torch.device]
+
+__all__ = ["TensorPipeRpcBackendOptions"]
+
+def _to_device(device: DeviceType) -> torch.device:
+    device = torch.device(device)
+    if device.type != "cuda":
+        raise ValueError(
+            "`set_devices` expects a list of CUDA devices, but got "
+            f"device type {device.type}."
+        )
+    return device
+
+
+def _to_device_map(
+    device_map: Dict[DeviceType, DeviceType]
+) -> Dict[torch.device, torch.device]:
+    full_device_map: Dict[torch.device, torch.device] = {}
+    reverse_map: Dict[torch.device, torch.device] = {}
+    for k, v in device_map.items():
+        k, v = torch.device(k), torch.device(v)
+        if v in reverse_map:
+            raise ValueError(
+                "`device_map` only supports 1-to-1 mapping, "
+                f"trying to map {k} and {reverse_map[v]} to {v}"
+            )
+        full_device_map[k] = v
+        reverse_map[v] = k
+    return full_device_map
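+
+# Minimal sketch of the invertibility rule enforced above (hypothetical helper,
+# never called; constructing ``torch.device`` objects does not require CUDA):
+def _example_device_map_invertibility():
+    # Distinct values on both sides: accepted and normalized to torch.device keys.
+    ok = _to_device_map({"cuda:0": "cuda:1", "cuda:1": "cuda:2"})
+    assert ok[torch.device("cuda:0")] == torch.device("cuda:1")
+    # Two local devices mapped to the same remote device: rejected.
+    try:
+        _to_device_map({"cuda:0": "cuda:1", "cuda:2": "cuda:1"})
+    except ValueError:
+        pass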
+
+
+def _to_device_list(devices: List[DeviceType]) -> List[torch.device]:
+    return list(map(_to_device, devices))
+
+
+class TensorPipeRpcBackendOptions(_TensorPipeRpcBackendOptionsBase):
+    r"""
+    The backend options for
+    :class:`~torch.distributed.rpc.TensorPipeAgent`, derived from
+    :class:`~torch.distributed.rpc.RpcBackendOptions`.
+
+    Args:
+        num_worker_threads (int, optional): The number of threads in the
+            thread-pool used by
+            :class:`~torch.distributed.rpc.TensorPipeAgent` to execute
+            requests (default: 16).
+        rpc_timeout (float, optional): The default timeout, in seconds,
+            for RPC requests (default: 60 seconds). If the RPC has not
+            completed in this timeframe, an exception indicating so will
+            be raised. Callers can override this timeout for individual
+            RPCs in :meth:`~torch.distributed.rpc.rpc_sync` and
+            :meth:`~torch.distributed.rpc.rpc_async` if necessary.
+        init_method (str, optional): The URL to initialize the distributed
+            store used for rendezvous. It takes any value accepted for the
+            same argument of :meth:`~torch.distributed.init_process_group`
+            (default: ``env://``).
+        device_maps (Dict[str, Dict], optional): Device placement mappings from
+            this worker to the callee. The key is the callee worker name and the value
+            is a dictionary (``Dict`` of ``int``, ``str``, or ``torch.device``)
+            that maps this worker's devices to the callee worker's devices.
+            (default: ``None``)
+        devices (List[int, str, or ``torch.device``], optional): all local
+            CUDA devices used by the RPC agent. By default, it will be initialized
+            to all local devices from its own ``device_maps`` and corresponding
+            devices from its peers' ``device_maps``. When processing CUDA RPC
+            requests, the agent will properly synchronize CUDA streams for
+            all devices in this ``List``.
+    """
+
+    def __init__(
+        self,
+        *,
+        num_worker_threads: int = rpc_contants.DEFAULT_NUM_WORKER_THREADS,
+        rpc_timeout: float = rpc_contants.DEFAULT_RPC_TIMEOUT_SEC,
+        init_method: str = rpc_contants.DEFAULT_INIT_METHOD,
+        device_maps: Optional[Dict[str, Dict[DeviceType, DeviceType]]] = None,
+        devices: Optional[List[DeviceType]] = None,
+        _transports: Optional[List] = None,
+        _channels: Optional[List] = None,
+    ):
+        full_device_maps = (
+            {}
+            if device_maps is None
+            else {k: _to_device_map(v) for k, v in device_maps.items()}
+        )
+        full_device_list = [] if devices is None else _to_device_list(devices)
+        super().__init__(
+            num_worker_threads,
+            _transports,
+            _channels,
+            rpc_timeout,
+            init_method,
+            full_device_maps,
+            full_device_list,
+        )
+
+    def set_device_map(self, to: str, device_map: Dict[DeviceType, DeviceType]):
+        r"""
+        Set device mapping between each RPC caller and callee pair. This
+        function can be called multiple times to incrementally add
+        device placement configurations.
+
+        Args:
+            to (str): Callee name.
+            device_map (Dict of int, str, or torch.device): Device placement
+                mappings from this worker to the callee. This map must be
+                invertible.
+
+        Example:
+            >>> # xdoctest: +SKIP("distributed")
+            >>> # both workers
+            >>> def add(x, y):
+            >>>     print(x)  # tensor([1., 1.], device='cuda:1')
+            >>>     return x + y, (x + y).to(2)
+            >>>
+            >>> # on worker 0
+            >>> options = TensorPipeRpcBackendOptions(
+            >>>     num_worker_threads=8,
+            >>>     device_maps={"worker1": {0: 1}}
+            >>>     # maps worker0's cuda:0 to worker1's cuda:1
+            >>> )
+            >>> options.set_device_map("worker1", {1: 2})
+            >>> # maps worker0's cuda:1 to worker1's cuda:2
+            >>>
+            >>> rpc.init_rpc(
+            >>>     "worker0",
+            >>>     rank=0,
+            >>>     world_size=2,
+            >>>     backend=rpc.BackendType.TENSORPIPE,
+            >>>     rpc_backend_options=options
+            >>> )
+            >>>
+            >>> x = torch.ones(2)
+            >>> rets = rpc.rpc_sync("worker1", add, args=(x.to(0), 1))
+            >>> # The first argument will be moved to cuda:1 on worker1. When
+            >>> # sending the return value back, it will follow the invert of
+            >>> # the device map, and hence will be moved back to cuda:0 and
+            >>> # cuda:1 on worker0
+            >>> print(rets[0])  # tensor([2., 2.], device='cuda:0')
+            >>> print(rets[1])  # tensor([2., 2.], device='cuda:1')
+        """
+        full_device_map = _to_device_map(device_map)
+        curr_device_maps = super().device_maps
+
+        if to in curr_device_maps:
+            for k, v in full_device_map.items():
+                if k in curr_device_maps[to] and v != curr_device_maps[to][k]:
+                    raise ValueError(
+                        "`set_device_map` only supports 1-to-1 mapping, trying"
+                        f" to map {k} to {v} and {curr_device_maps[to][k]}"
+                    )
+
+        super()._set_device_map(to, full_device_map)
+
+    def set_devices(self, devices: List[DeviceType]):
+        r"""
+        Set local devices used by the TensorPipe RPC agent. When processing
+        CUDA RPC requests, the TensorPipe RPC agent will properly synchronize
+        CUDA streams for all devices in this ``List``.
+
+        Args:
+            devices (List of int, str, or torch.device): local devices used by
+                the TensorPipe RPC agent.
+        """
+        self.devices = _to_device_list(devices)
diff --git a/MLPY/Lib/site-packages/torch/distributed/rpc/rref_proxy.py b/MLPY/Lib/site-packages/torch/distributed/rpc/rref_proxy.py
new file mode 100644
index 0000000000000000000000000000000000000000..7219d74c9b9cdb01a1f480abc1927b66fb000a6f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/rpc/rref_proxy.py
@@ -0,0 +1,74 @@
+from functools import partial
+
+from . import functions
+from . import rpc_async
+
+import torch
+from .constants import UNSET_RPC_TIMEOUT
+from torch.futures import Future
+
+def _local_invoke(rref, func_name, args, kwargs):
+    return getattr(rref.local_value(), func_name)(*args, **kwargs)
+
+@functions.async_execution
+def _local_invoke_async_execution(rref, func_name, args, kwargs):
+    return getattr(rref.local_value(), func_name)(*args, **kwargs)
+
+def _invoke_rpc(rref, rpc_api, func_name, timeout, *args, **kwargs):
+    def _rref_type_cont(rref_fut):
+        rref_type = rref_fut.value()
+
+        _invoke_func = _local_invoke
+        # Bypass ScriptModules when checking for async function attribute.
+        bypass_type = issubclass(rref_type, torch.jit.ScriptModule) or issubclass(
+            rref_type, torch._C.ScriptModule
+        )
+        if not bypass_type:
+            func = getattr(rref_type, func_name)
+            if hasattr(func, "_wrapped_async_rpc_function"):
+                _invoke_func = _local_invoke_async_execution
+
+        return rpc_api(
+            rref.owner(),
+            _invoke_func,
+            args=(rref, func_name, args, kwargs),
+            timeout=timeout
+        )
+
+    rref_fut = rref._get_type(timeout=timeout, blocking=False)
+
+    if rpc_api != rpc_async:
+        rref_fut.wait()
+        return _rref_type_cont(rref_fut)
+    else:
+        # A little explanation on this.
+        # rpc_async returns a Future pointing to the return value of `func_name`, it returns a `Future[T]`
+        # Calling _rref_type_cont from the `then` lambda causes Future wrapping. IOW, `then` returns a `Future[Future[T]]`
+        # To address that, we return a Future that is completed with the result of the async call.
+        result: Future = Future()
+
+        def _wrap_rref_type_cont(fut):
+            try:
+                _rref_type_cont(fut).then(_complete_op)
+            except BaseException as ex:
+                result.set_exception(ex)
+
+        def _complete_op(fut):
+            try:
+                result.set_result(fut.value())
+            except BaseException as ex:
+                result.set_exception(ex)
+
+        rref_fut.then(_wrap_rref_type_cont)
+        return result
+
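+# Nesting sketch (illustrative, standalone; not used by the module): chaining
+# ``then`` on a Future whose callback itself returns a Future produces a
+# Future[Future[T]] rather than unwrapping it, which is why ``_invoke_rpc``
+# completes a fresh ``result`` Future manually above.
+def _example_future_nesting() -> Future:
+    inner: Future = Future()
+    inner.set_result(42)
+    outer = inner.then(lambda fut: inner)  # callback returns a Future
+    assert isinstance(outer.wait(), Future)  # nested, not flattened
+    return outer
+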
+# This class manages proxied RPC API calls for RRefs. It is entirely used from
+# C++ (see python_rpc_handler.cpp).
+class RRefProxy:
+    def __init__(self, rref, rpc_api, timeout=UNSET_RPC_TIMEOUT):
+        self.rref = rref
+        self.rpc_api = rpc_api
+        self.rpc_timeout = timeout
+
+    def __getattr__(self, func_name):
+        return partial(_invoke_rpc, self.rref, self.rpc_api, func_name, self.rpc_timeout)
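+
+
+def _example_rref_proxy_usage():
+    """Minimal usage sketch (not part of the upstream module): assumes RPC has
+    already been initialized and that a peer named "worker1" exists (both are
+    assumptions made for illustration only)."""
+    import torch.distributed.rpc as rpc
+
+    # ``rpc.remote`` returns an RRef owned by "worker1".
+    tensor_rref = rpc.remote("worker1", torch.ones, args=(2, 2))
+    # ``rpc_sync()`` hands back an RRefProxy; the attribute access plus call is
+    # routed through ``_invoke_rpc`` above and runs ``sum()`` on the owner.
+    return tensor_rref.rpc_sync().sum()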
diff --git a/MLPY/Lib/site-packages/torch/distributed/rpc/server_process_global_profiler.py b/MLPY/Lib/site-packages/torch/distributed/rpc/server_process_global_profiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..f061904510c6cd11999372a0c82ff6c2157c0057
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/rpc/server_process_global_profiler.py
@@ -0,0 +1,177 @@
+#!/usr/bin/python3
+
+import itertools
+
+import torch
+from torch.autograd.profiler_legacy import profile
+from typing import List
+
+from . import (
+    _disable_server_process_global_profiler,
+    _enable_server_process_global_profiler,
+)
+
+__all__: List[str] = []
+
+class _server_process_global_profile(profile):
+    """
+    It has the same API as the ``torch.autograd.profiler.profile`` class,
+    except that it enables profiling on all threads running RPC server request callbacks.
+
+    Context manager that manages autograd profiler state and holds a summary of results.
+    Under the hood it just records events of functions being executed in C++ and
+    exposes those events to Python. You can wrap any code into it and it will
+    only report runtime of PyTorch functions.
+    Note: the profiler is thread-local and is automatically propagated into async tasks.
+
+    Args:
+        enabled (bool, optional): Setting this to False makes this context manager a no-op.
+            Default: ``True``.
+
+        use_cuda (bool, optional): Enables timing of CUDA events as well using the cudaEvent API.
+            Adds approximately 4us of overhead to each tensor operation.
+            Default: ``False``
+
+        record_shapes (bool, optional): If shapes recording is set, information
+            about input dimensions will be collected. This allows one to see which
+            dimensions have been used under the hood and further group by them
+            using prof.key_averages(group_by_input_shape=True). Please note that
+            shape recording might skew your profiling data. It is recommended to
+            use separate runs with and without shape recording to validate the timing.
+            Most likely the skew will be negligible for the bottommost events (in the case
+            of nested function calls), but for higher-level functions the total
+            self CPU time might be artificially increased because of the shape
+            collection.
+
+        profile_memory (bool, optional): Whether to report memory usage, default: ``False``
+
+    .. warning::
+        Enabling memory profiling incurs additional profiler overhead
+
+    .. warning::
+        Due to some CUDA multiprocessing limitations (multiprocessing-cuda-note_),
+        one cannot use the profiler with ``use_cuda = True`` to benchmark
+        DataLoaders with ``num_workers > 0``. If you wish to benchmark data loading,
+        please use ``use_cuda = False`` or ``num_workers = 0``.
+
+    Example:
+        >>> # xdoctest: +SKIP
+        >>> # On worker 0:
+        >>> import torch
+        >>> import torch.distributed.rpc as rpc
+        >>> rpc.init_rpc("worker0", rank=0, world_size=2)
+        >>> x, y = torch.tensor(1), torch.tensor(2)
+        >>> outer_profile_rref = rpc.remote(dst_worker_name, rpc._server_process_global_profile)
+        >>> outer_profile_rref.rpc_sync().__enter__()
+        >>> rpc.rpc_sync(dst_worker_name, torch.add, (x, y))
+        >>> inner_profile_rref = rpc.remote(dst_worker_name, rpc._server_process_global_profile)
+        >>> inner_profile_rref.rpc_sync().__enter__()
+        >>> rpc.rpc_sync(dst_worker_name, torch.sub, (x, y))
+        >>> inner_profile_rref.rpc_sync().__exit__(None, None, None)
+        >>> outer_profile_rref.rpc_sync().__exit__(None, None, None)
+        >>> print(inner_profile_rref.rpc_sync().key_averages())
+        ---------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------
+        Name       Self CPU total %  Self CPU total   CPU total %      CPU total        CPU time avg     Number of Calls
+        ---------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------
+        sub        85.06%           76.275us         100.00%          89.667us         89.667us         1
+        empty      14.94%           13.392us         14.94%           13.392us         13.392us         1
+        ---------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------
+        Self CPU time total: 89.667us
+        >>> print(outer_profile_rref.rpc_sync().key_averages())
+        ---------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------
+        Name       Self CPU total %  Self CPU total   CPU total %      CPU total        CPU time avg     Number of Calls
+        ---------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------
+        sub        35.65%           76.275us         41.91%           89.667us         89.667us         1
+        empty      12.67%           27.101us         12.67%           27.101us         13.551us         2
+        add        51.68%           110.550us        58.09%           124.259us        124.259us        1
+        ---------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------
+        Self CPU time total: 213.926us
+        >>> rpc.shutdown()
+
+        >>> # On worker 1:
+        >>> import torch.distributed.rpc as rpc
+        >>> rpc.init_rpc("worker1", rank=1, world_size=2)
+        >>> # wait for worker 0 to finish work, and then shutdown.
+        >>> rpc.shutdown()
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def __enter__(self):
+        """
+        Turn on server-side process-global profiling.
+        This enables thread-local profiler on all RPC threads running server-side request callbacks.
+        """
+        if not self.enabled:
+            return
+
+        if self.entered:  # type: ignore[has-type]
+            raise RuntimeError("autograd profiler traces are not reentrant")
+        self.entered = True
+
+        profiler_kind = (
+            torch.autograd.ProfilerState.CUDA
+            if self.use_cuda
+            else torch.autograd.ProfilerState.CPU
+        )
+        profiler_config = torch.autograd.ProfilerConfig(
+            profiler_kind,
+            self.record_shapes,
+            self.profile_memory,
+            False,
+            False,
+            False,
+            torch.profiler._ExperimentalConfig())
+        _enable_server_process_global_profiler(profiler_config)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """
+        Turn off server-side process-global profiling.
+        Aggregate all profiling events recorded by RPC threads.
+
+        These attributes are assigned on exiting context.
+
+        Attributes:
+            function_events (torch.autograd.profiler.EventList). It's a list with helper
+            methods, such as showing record items in a pretty-printed table,
+            averaging by grouping on keys, and more.
+
+            process_global_function_events (List[torch.autograd.profiler.FunctionEvent]).
+            It's a list of ``FunctionEvent`` elements. Every element is a profiling result
+            of an RPC request handling within the profiling range.
+        """
+        if not self.enabled:
+            return
+
+        process_global_events = _disable_server_process_global_profiler()
+
+        # Every element in this list is a thread profiling result from an RPC request handling.
+        process_global_function_events = []
+        for thread_local_events in process_global_events:
+            # Parse from ``Event``s to ``FunctionEvent``s.
+            thread_local_function_events = torch.autograd.profiler_legacy._parse_legacy_records(
+                thread_local_events
+            )
+            thread_local_function_events.sort(
+                key=lambda function_event: [
+                    function_event.time_range.start,
+                    -(function_event.time_range.end),
+                ]
+            )
+            process_global_function_events.append(thread_local_function_events)
+
+        flattened_function_events = list(
+            itertools.chain.from_iterable(process_global_function_events)
+        )
+        self.function_events = torch.autograd.profiler_util.EventList(
+            flattened_function_events,
+            use_cuda=self.use_cuda,
+            profile_memory=self.profile_memory,
+        )
+        self.function_events._build_tree()
+
+        self.process_global_function_events = process_global_function_events
+
+        return False
diff --git a/MLPY/Lib/site-packages/torch/distributed/run.py b/MLPY/Lib/site-packages/torch/distributed/run.py
new file mode 100644
index 0000000000000000000000000000000000000000..120c9cec42a6117a1173c550bd0b3ee32edec5f5
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/run.py
@@ -0,0 +1,883 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Superset of ``torch.distributed.launch``.
+
+``torchrun`` provides a superset of the functionality as ``torch.distributed.launch``
+with the following additional functionalities:
+
+1. Worker failures are handled gracefully by restarting all workers.
+
+2. Worker ``RANK`` and ``WORLD_SIZE`` are assigned automatically.
+
+3. Number of nodes is allowed to change between minimum and maximum sizes (elasticity).
+
+.. note:: ``torchrun`` is a Python console script for the main module
+          ``torch.distributed.run``, declared in the ``entry_points`` configuration
+          in ``setup.py``. It is equivalent to invoking ``python -m torch.distributed.run``.
+
+
+Transitioning from torch.distributed.launch to torchrun
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+``torchrun`` supports the same arguments as ``torch.distributed.launch`` **except**
+for ``--use-env`` which is now deprecated. To migrate from ``torch.distributed.launch``
+to ``torchrun`` follow these steps:
+
+1.  If your training script already reads ``local_rank`` from the ``LOCAL_RANK`` environment
+    variable, then simply omit the ``--use-env`` flag, e.g.:
+
+    +--------------------------------------------------------------------+--------------------------------------------+
+    |         ``torch.distributed.launch``                               |                ``torchrun``                |
+    +====================================================================+============================================+
+    |                                                                    |                                            |
+    | .. code-block:: shell-session                                      | .. code-block:: shell-session              |
+    |                                                                    |                                            |
+    |    $ python -m torch.distributed.launch --use-env train_script.py  |    $ torchrun train_script.py              |
+    |                                                                    |                                            |
+    +--------------------------------------------------------------------+--------------------------------------------+
+
+2.  If your training script reads local rank from a ``--local-rank`` cmd argument.
+    Change your training script to read from the ``LOCAL_RANK`` environment variable as
+    demonstrated by the following code snippet:
+
+    +-------------------------------------------------------+----------------------------------------------------+
+    |         ``torch.distributed.launch``                  |                    ``torchrun``                    |
+    +=======================================================+====================================================+
+    |                                                       |                                                    |
+    | .. code-block:: python                                | .. code-block:: python                             |
+    |                                                       |                                                    |
+    |                                                       |                                                    |
+    |    import argparse                                    |     import os                                      |
+    |    parser = argparse.ArgumentParser()                 |     local_rank = int(os.environ["LOCAL_RANK"])     |
+    |    parser.add_argument("--local-rank", type=int)      |                                                    |
+    |    args = parser.parse_args()                         |                                                    |
+    |                                                       |                                                    |
+    |    local_rank = args.local_rank                       |                                                    |
+    |                                                       |                                                    |
+    +-------------------------------------------------------+----------------------------------------------------+
+
+The aforementioned changes suffice to migrate from ``torch.distributed.launch`` to ``torchrun``.
+To take advantage of new ``torchrun`` features such as elasticity, fault tolerance, and error reporting,
+please refer to:
+
+* :ref:`elastic_train_script` for more information on authoring training scripts that are ``torchrun`` compliant.
+* the rest of this page for more information on the features of ``torchrun``.
+
+
+Usage
+--------
+
+Single-node multi-worker
+++++++++++++++++++++++++++++++
+
+::
+
+    torchrun
+        --standalone
+        --nnodes=1
+        --nproc-per-node=$NUM_TRAINERS
+        YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...)
+
+Stacked single-node multi-worker
++++++++++++++++++++++++++++++++++++
+
+To run multiple instances (separate jobs) of single-node, multi-worker training on the
+same host, we need to make sure that each instance (job) is
+set up on different ports to avoid port conflicts (or worse, two jobs being merged
+as a single job). To do this, run with ``--rdzv-backend=c10d``
+and specify a different port by setting ``--rdzv-endpoint=localhost:$PORT_k``.
+For ``--nnodes=1``, it's often convenient to let ``torchrun`` pick a free random
+port automatically instead of manually assigning different ports for each run.
+
+::
+
+    torchrun
+        --rdzv-backend=c10d
+        --rdzv-endpoint=localhost:0
+        --nnodes=1
+        --nproc-per-node=$NUM_TRAINERS
+        YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...)
+
+
+Fault tolerant (fixed sized number of workers, no elasticity, tolerates 3 failures)
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+::
+
+    torchrun
+        --nnodes=$NUM_NODES
+        --nproc-per-node=$NUM_TRAINERS
+        --max-restarts=3
+        --rdzv-id=$JOB_ID
+        --rdzv-backend=c10d
+        --rdzv-endpoint=$HOST_NODE_ADDR
+        YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...)
+
+``HOST_NODE_ADDR``, in form <host>[:<port>] (e.g. node1.example.com:29400), specifies the node and
+the port on which the C10d rendezvous backend should be instantiated and hosted. It can be any
+node in your training cluster, but ideally you should pick a node that has a high bandwidth.
+
+.. note::
+   If no port number is specified ``HOST_NODE_ADDR`` defaults to 29400.
+
+Elastic (``min=1``, ``max=4``, tolerates up to 3 membership changes or failures)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+::
+
+    torchrun
+        --nnodes=1:4
+        --nproc-per-node=$NUM_TRAINERS
+        --max-restarts=3
+        --rdzv-id=$JOB_ID
+        --rdzv-backend=c10d
+        --rdzv-endpoint=$HOST_NODE_ADDR
+        YOUR_TRAINING_SCRIPT.py (--arg1 ... train script args...)
+
+``HOST_NODE_ADDR``, in form <host>[:<port>] (e.g. node1.example.com:29400), specifies the node and
+the port on which the C10d rendezvous backend should be instantiated and hosted. It can be any
+node in your training cluster, but ideally you should pick a node that has a high bandwidth.
+
+.. note::
+   If no port number is specified ``HOST_NODE_ADDR`` defaults to 29400.
+
+Note on rendezvous backend
+------------------------------
+
+For multi-node training you need to specify:
+
+1. ``--rdzv-id``: A unique job id (shared by all nodes participating in the job)
+2. ``--rdzv-backend``: An implementation of
+   :py:class:`torch.distributed.elastic.rendezvous.RendezvousHandler`
+3. ``--rdzv-endpoint``: The endpoint where the rendezvous backend is running; usually in form
+   ``host:port``.
+
+Currently ``c10d`` (recommended), ``etcd-v2``, and ``etcd`` (legacy)  rendezvous backends are
+supported out of the box. To use ``etcd-v2`` or ``etcd``, setup an etcd server with the ``v2`` api
+enabled (e.g. ``--enable-v2``).
+
+.. warning::
+   ``etcd-v2`` and ``etcd`` rendezvous use etcd API v2. You MUST enable the v2 API on the etcd
+   server. Our tests use etcd v3.4.3.
+
+.. warning::
+   For etcd-based rendezvous we recommend using ``etcd-v2`` over ``etcd`` which is functionally
+   equivalent, but uses a revised implementation. ``etcd`` is in maintenance mode and will be
+   removed in a future version.
+
+Definitions
+--------------
+
+1. ``Node`` - A physical instance or a container; maps to the unit that the job manager works with.
+
+2. ``Worker`` - A worker in the context of distributed training.
+
+3. ``WorkerGroup`` - The set of workers that execute the same function (e.g. trainers).
+
+4. ``LocalWorkerGroup`` - A subset of the workers in the worker group running on the same node.
+
+5. ``RANK`` - The rank of the worker within a worker group.
+
+6. ``WORLD_SIZE`` - The total number of workers in a worker group.
+
+7. ``LOCAL_RANK`` - The rank of the worker within a local worker group.
+
+8. ``LOCAL_WORLD_SIZE`` - The size of the local worker group.
+
+9. ``rdzv_id`` - A user-defined id that uniquely identifies the worker group for a job. This id is
+   used by each node to join as a member of a particular worker group.
+
+10. ``rdzv_backend`` - The backend of the rendezvous (e.g. ``c10d``). This is typically a strongly
+    consistent key-value store.
+
+11. ``rdzv_endpoint`` - The rendezvous backend endpoint; usually in form ``<host>:<port>``.
+
+A ``Node`` runs ``LOCAL_WORLD_SIZE`` workers which comprise a ``LocalWorkerGroup``. The union of
+all ``LocalWorkerGroups`` in the nodes in the job comprise the ``WorkerGroup``.
+
+Environment Variables
+----------------------
+
+The following environment variables are made available to you in your script:
+
+1. ``LOCAL_RANK`` -  The local rank.
+
+2. ``RANK`` -  The global rank.
+
+3. ``GROUP_RANK`` - The rank of the worker group. A number between 0 and ``max_nnodes``. When
+   running a single worker group per node, this is the rank of the node.
+
+4. ``ROLE_RANK`` -  The rank of the worker across all the workers that have the same role. The role
+   of the worker is specified in the ``WorkerSpec``.
+
+5. ``LOCAL_WORLD_SIZE`` - The local world size (e.g. number of workers running locally); equals to
+   ``--nproc-per-node`` specified on ``torchrun``.
+
+6. ``WORLD_SIZE`` - The world size (total number of workers in the job).
+
+7. ``ROLE_WORLD_SIZE`` - The total number of workers that were launched with the same role specified
+   in ``WorkerSpec``.
+
+8. ``MASTER_ADDR`` - The FQDN of the host that is running worker with rank 0; used to initialize
+   the Torch Distributed backend.
+
+9. ``MASTER_PORT`` - The port on the ``MASTER_ADDR`` that can be used to host the C10d TCP store.
+
+10. ``TORCHELASTIC_RESTART_COUNT`` - The number of worker group restarts so far.
+
+11. ``TORCHELASTIC_MAX_RESTARTS`` - The configured maximum number of restarts.
+
+12. ``TORCHELASTIC_RUN_ID`` - Equal to the rendezvous ``run_id`` (e.g. unique job id).
+
+13. ``PYTHON_EXEC`` - System executable override. If provided, the python user script will
+    use the value of ``PYTHON_EXEC`` as the executable. ``sys.executable`` is used by default.
+
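+For example, a training script can read its topology directly from these variables
+(a minimal sketch):
+
+::
+
+    import os
+
+    local_rank = int(os.environ["LOCAL_RANK"])
+    rank = int(os.environ["RANK"])
+    world_size = int(os.environ["WORLD_SIZE"])
+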
+Deployment
+------------
+
+1. (Not needed for the C10d backend) Start the rendezvous backend server and get the endpoint (to be
+   passed as ``--rdzv-endpoint`` to the launcher script)
+
+2. Single-node multi-worker: Start the launcher on the host to start the agent process which
+   creates and monitors a local worker group.
+
+3. Multi-node multi-worker: Start the launcher with the same arguments on all the nodes
+   participating in training.
+
+When using a job/cluster manager the entry point command to the multi-node job should be this
+launcher.
+
+Failure Modes
+---------------
+
+1. Worker failure: For a training job with ``n`` workers, if ``k<=n`` workers fail all workers
+   are stopped and restarted up to ``max_restarts``.
+
+2. Agent failure: An agent failure results in a local worker group failure. It is up to the job
+   manager to fail the entire job (gang semantics) or attempt to replace the node. Both behaviors
+   are supported by the agent.
+
+3. Node failure: Same as agent failure.
+
+Membership Changes
+--------------------
+
+1. Node departure (scale-down): The agent is notified of the departure, all existing workers are
+   stopped, a new ``WorkerGroup`` is formed, and all workers are started with a new ``RANK`` and
+   ``WORLD_SIZE``.
+
+2. Node arrival (scale-up): The new node is admitted to the job, all existing workers are stopped,
+   a new ``WorkerGroup`` is formed, and all workers are started with a new ``RANK`` and
+   ``WORLD_SIZE``.
+
+Important Notices
+--------------------
+
+1. This utility and multi-process distributed (single-node or
+   multi-node) GPU training currently only achieves the best performance using
+   the NCCL distributed backend. Thus NCCL backend is the recommended backend to
+   use for GPU training.
+
+2. The environment variables necessary to initialize a Torch process group are provided to you by
+   this module; there is no need for you to pass ``RANK`` manually.  To initialize a process group in your
+   training script, simply run:
+
+::
+
+ >>> # xdoctest: +SKIP("stub")
+ >>> import torch.distributed as dist
+ >>> dist.init_process_group(backend="gloo|nccl")
+
+3. In your training program, you can either use regular distributed functions
+   or use :func:`torch.nn.parallel.DistributedDataParallel` module. If your
+   training program uses GPUs for training and you would like to use
+   :func:`torch.nn.parallel.DistributedDataParallel` module,
+   here is how to configure it.
+
+::
+
+    local_rank = int(os.environ["LOCAL_RANK"])
+    model = torch.nn.parallel.DistributedDataParallel(model,
+                                                      device_ids=[local_rank],
+                                                      output_device=local_rank)
+
+Please ensure that the ``device_ids`` argument is set to the only GPU device id
+that your code will be operating on. This is generally the local rank of the
+process. In other words, ``device_ids`` needs to be ``[int(os.environ["LOCAL_RANK"])]``
+and ``output_device`` needs to be ``int(os.environ["LOCAL_RANK"])`` in order to use this
+utility.
+
+
+4. On failures or membership changes ALL surviving workers are killed immediately. Make sure to
+   checkpoint your progress. The frequency of checkpoints should depend on your job's tolerance
+   for lost work.
+
+5. This module only supports homogeneous ``LOCAL_WORLD_SIZE``. That is, it is assumed that all
+   nodes run the same number of local workers (per role).
+
+6. ``RANK`` is NOT stable. Between restarts, the local workers on a node can be assigned a
+   different range of ranks than before. NEVER hard code any assumptions about the stable-ness of
+   ranks or some correlation between ``RANK`` and ``LOCAL_RANK``.
+
+7. When using elasticity (``min_size!=max_size``) DO NOT hard code assumptions about
+   ``WORLD_SIZE`` as the world size can change as nodes are allowed to leave and join.
+
+8. It is recommended for your script to have the following structure:
+
+::
+
+  def main():
+    load_checkpoint(checkpoint_path)
+    initialize()
+    train()
+
+  def train():
+    for batch in iter(dataset):
+      train_step(batch)
+
+      if should_checkpoint:
+        save_checkpoint(checkpoint_path)
+
+9. (Recommended) On worker errors, this tool will summarize the details of the error
+   (e.g. time, rank, host, pid, traceback, etc). On each node, the first error (by timestamp)
+   is heuristically reported as the "Root Cause" error. To get tracebacks as part of this
+   error summary print out, you must decorate your main entrypoint function in your
+   training script as shown in the example below. If not decorated, then the summary
+   will not include the traceback of the exception and will only contain the exitcode.
+   For details on torchelastic error handling see: https://pytorch.org/docs/stable/elastic/errors.html
+
+::
+
+  from torch.distributed.elastic.multiprocessing.errors import record
+
+  @record
+  def main():
+      # do train
+      pass
+
+  if __name__ == "__main__":
+      main()
+
+"""
+import logging
+import os
+import sys
+import uuid
+import importlib.metadata as metadata
+from argparse import REMAINDER, ArgumentParser
+from typing import Callable, List, Tuple, Type, Union, Optional, Set
+
+import torch
+from torch.distributed.argparse_util import check_env, env
+from torch.distributed.elastic.multiprocessing import DefaultLogsSpecs, LogsSpecs, Std
+from torch.distributed.elastic.multiprocessing.errors import record
+from torch.distributed.elastic.rendezvous.utils import _parse_rendezvous_config
+from torch.distributed.elastic.utils import macros
+from torch.distributed.elastic.utils.logging import get_logger
+from torch.distributed.launcher.api import LaunchConfig, elastic_launch
+from torch.utils.backend_registration import _get_custom_mod_func
+
+log = get_logger(__name__)
+
+
+def get_args_parser() -> ArgumentParser:
+    """Parse the command line options."""
+    parser = ArgumentParser(description="Torch Distributed Elastic Training Launcher")
+
+    #
+    # Worker/node size related arguments.
+    #
+
+    parser.add_argument(
+        "--nnodes",
+        action=env,
+        type=str,
+        default="1:1",
+        help="Number of nodes, or the range of nodes in form <minimum_nodes>:<maximum_nodes>.",
+    )
+    parser.add_argument(
+        "--nproc-per-node",
+        "--nproc_per_node",
+        action=env,
+        type=str,
+        default="1",
+        help="Number of workers per node; supported values: [auto, cpu, gpu, int].",
+    )
+
+    #
+    # Rendezvous related arguments
+    #
+
+    parser.add_argument(
+        "--rdzv-backend",
+        "--rdzv_backend",
+        action=env,
+        type=str,
+        default="static",
+        help="Rendezvous backend.",
+    )
+    parser.add_argument(
+        "--rdzv-endpoint",
+        "--rdzv_endpoint",
+        action=env,
+        type=str,
+        default="",
+        help="Rendezvous backend endpoint; usually in form <host>:<port>.",
+    )
+    parser.add_argument(
+        "--rdzv-id",
+        "--rdzv_id",
+        action=env,
+        type=str,
+        default="none",
+        help="User-defined group id.",
+    )
+    parser.add_argument(
+        "--rdzv-conf",
+        "--rdzv_conf",
+        action=env,
+        type=str,
+        default="",
+        help="Additional rendezvous configuration (<key1>=<value1>,<key2>=<value2>,...).",
+    )
+    parser.add_argument(
+        "--standalone",
+        action=check_env,
+        help="Start a local standalone rendezvous backend that is represented by a C10d TCP store "
+        "on a free port. Useful when launching a single-node, multi-worker job. If specified, "
+        "--rdzv-backend, --rdzv-endpoint, --rdzv-id are auto-assigned and any explicitly set values "
+        "are ignored.",
+    )
+
+    #
+    # User-code launch related arguments.
+    #
+
+    parser.add_argument(
+        "--max-restarts",
+        "--max_restarts",
+        action=env,
+        type=int,
+        default=0,
+        help="Maximum number of worker group restarts before failing.",
+    )
+    parser.add_argument(
+        "--monitor-interval",
+        "--monitor_interval",
+        action=env,
+        type=float,
+        default=5,
+        help="Interval, in seconds, to monitor the state of workers.",
+    )
+    parser.add_argument(
+        "--start-method",
+        "--start_method",
+        action=env,
+        type=str,
+        default="spawn",
+        choices=["spawn", "fork", "forkserver"],
+        help="Multiprocessing start method to use when creating workers.",
+    )
+    parser.add_argument(
+        "--role",
+        action=env,
+        type=str,
+        default="default",
+        help="User-defined role for the workers.",
+    )
+    parser.add_argument(
+        "-m",
+        "--module",
+        action=check_env,
+        help="Change each process to interpret the launch script as a Python module, executing "
+        "with the same behavior as 'python -m'.",
+    )
+    parser.add_argument(
+        "--no-python",
+        "--no_python",
+        action=check_env,
+        help="Skip prepending the training script with 'python' - just execute it directly. Useful "
+        "when the script is not a Python script.",
+    )
+
+    parser.add_argument(
+        "--run-path",
+        "--run_path",
+        action=check_env,
+        help="Run the training script with runpy.run_path in the same interpreter."
+        " Script must be provided as an abs path (e.g. /abs/path/script.py)."
+        " Takes precedence over --no-python.",
+    )
+    parser.add_argument(
+        "--log-dir",
+        "--log_dir",
+        action=env,
+        type=str,
+        default=None,
+        help="Base directory to use for log files (e.g. /var/log/torch/elastic). The same "
+        "directory is re-used for multiple runs (a unique job-level sub-directory is created with "
+        "rdzv_id as the prefix).",
+    )
+    parser.add_argument(
+        "-r",
+        "--redirects",
+        action=env,
+        type=str,
+        default="0",
+        help="Redirect std streams into a log file in the log directory (e.g. [-r 3] redirects "
+        "both stdout+stderr for all workers, [-r 0:1,1:2] redirects stdout for local rank 0 and "
+        "stderr for local rank 1).",
+    )
+    parser.add_argument(
+        "-t",
+        "--tee",
+        action=env,
+        type=str,
+        default="0",
+        help="Tee std streams into a log file and also to console (see --redirects for format).",
+    )
+
+    parser.add_argument(
+        "--local-ranks-filter",
+        "--local_ranks_filter",
+        action=env,
+        type=str,
+        default="",
+        help="Only show logs from specified ranks in console (e.g. [--local_ranks_filter=0,1,2] will "
+        "only show logs from rank 0, 1 and 2). This will only apply to stdout and stderr, not to "
+        "log files saved via --redirects or --tee.",
+    )
+
+    #
+    # Backwards compatible parameters with caffe2.distributed.launch.
+    #
+
+    parser.add_argument(
+        "--node-rank",
+        "--node_rank",
+        type=int,
+        action=env,
+        default=0,
+        help="Rank of the node for multi-node distributed training.",
+    )
+    parser.add_argument(
+        "--master-addr",
+        "--master_addr",
+        default="127.0.0.1",
+        type=str,
+        action=env,
+        help="Address of the master node (rank 0) that is only used for static rendezvous. It should "
+        "be either the IP address or the hostname of rank 0. For single node multi-proc training "
+        "the --master-addr can simply be 127.0.0.1; IPv6 should have the pattern "
+        "`[0:0:0:0:0:0:0:1]`.",
+    )
+    parser.add_argument(
+        "--master-port",
+        "--master_port",
+        default=29500,
+        type=int,
+        action=env,
+        help="Port on the master node (rank 0) to be used for communication during distributed "
+        "training. It is only used for static rendezvous.",
+    )
+    parser.add_argument(
+        "--local-addr",
+        "--local_addr",
+        default=None,
+        type=str,
+        action=env,
+        help="Address of the local node. If specified, the given address is used for the connection. "
+        "Otherwise, the local node address is looked up, defaulting to the local "
+        "machine's FQDN.",
+    )
+
+    parser.add_argument(
+        "--logs-specs",
+        "--logs_specs",
+        default=None,
+        type=str,
+        help="torchrun.logs_specs group entrypoint name; the value must be a type of LogsSpecs. "
+        "Can be used to override the default logging behavior.",
+    )
+
+    #
+    # Positional arguments.
+    #
+
+    parser.add_argument(
+        "training_script",
+        type=str,
+        help="Full path to the (single GPU) training program/script to be launched in parallel, "
+        "followed by all the arguments for the training script.",
+    )
+
+    # Rest from the training program.
+    parser.add_argument("training_script_args", nargs=REMAINDER)
+
+    return parser
+
+
+def parse_args(args):
+    parser = get_args_parser()
+    return parser.parse_args(args)
+
+
+def parse_min_max_nnodes(nnodes: str):
+    arr = nnodes.split(":")
+
+    if len(arr) == 1:
+        min_nodes = max_nodes = int(arr[0])
+    elif len(arr) == 2:
+        min_nodes = int(arr[0])
+        max_nodes = int(arr[1])
+    else:
+        raise RuntimeError(f'nnodes={nnodes} is not in "MIN:MAX" format')  # noqa: E231
+
+    return min_nodes, max_nodes
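+
+# Examples of the accepted formats:
+#   parse_min_max_nnodes("2")   -> (2, 2)   # fixed size
+#   parse_min_max_nnodes("1:4") -> (1, 4)   # elastic range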
+
+
+def determine_local_world_size(nproc_per_node: str):
+    try:
+        logging.info("Using nproc_per_node=%s.", nproc_per_node)
+        return int(nproc_per_node)
+    except ValueError as e:
+        if nproc_per_node == "cpu":
+            num_proc = os.cpu_count()
+            device_type = "cpu"
+        elif nproc_per_node == "gpu":
+            if not torch.cuda.is_available():
+                raise ValueError("Cuda is not available.") from e
+            device_type = "gpu"
+            num_proc = torch.cuda.device_count()
+        elif nproc_per_node == torch._C._get_privateuse1_backend_name():
+            if not _get_custom_mod_func("is_available")():
+                raise ValueError(f"{nproc_per_node} is not available.") from e
+            device_type = nproc_per_node
+            num_proc = _get_custom_mod_func("device_count")()
+        elif nproc_per_node == "auto":
+            if torch.cuda.is_available():
+                num_proc = torch.cuda.device_count()
+                device_type = "gpu"
+            elif hasattr(torch, torch._C._get_privateuse1_backend_name()) and \
+                    _get_custom_mod_func("is_available")():
+                num_proc = _get_custom_mod_func("device_count")()
+                device_type = torch._C._get_privateuse1_backend_name()
+            else:
+                num_proc = os.cpu_count()
+                device_type = "cpu"
+        else:
+            raise ValueError(f"Unsupported nproc_per_node value: {nproc_per_node}") from e
+
+        log.info(
+            "Using nproc_per_node=%s,"
+            " setting to %s since the instance "
+            "has %s %s",
+            nproc_per_node, num_proc, os.cpu_count(), device_type
+        )
+        return num_proc
+
+
+def get_rdzv_endpoint(args):
+    if args.rdzv_backend == "static" and not args.rdzv_endpoint:
+        return f"{args.master_addr}:{args.master_port}"  # noqa: E231
+    return args.rdzv_endpoint
+
+
+def get_use_env(args) -> bool:
+    """
+    Retrieve ``use_env`` from the args.
+
+    ``use_env`` is a legacy argument; if ``use_env`` is False, the
+    ``--node-rank`` argument will be transferred to all worker processes.
+    ``use_env`` is only used by the ``torch.distributed.launch`` and will
+    be deprecated in future releases.
+    """
+    if not hasattr(args, "use_env"):
+        return True
+    return args.use_env
+
+
+def _get_logs_specs_class(logs_specs_name: Optional[str]) -> Type[LogsSpecs]:
+    """
+    Attempt to load the `torchrun.logs_specs` entrypoint with the key given by the `logs_specs_name` param.
+    Provides a plugin mechanism for custom implementations of LogsSpecs.
+
+    Returns `DefaultLogsSpecs` when `logs_specs_name` is None.
+    Raises ValueError when an entrypoint for `logs_specs_name` can't be found.
+    """
+    logs_specs_cls = None
+    if logs_specs_name is not None:
+        eps = metadata.entry_points()
+        if hasattr(eps, "select"):  # >= 3.10
+            group = eps.select(group="torchrun.logs_specs")
+            if group.select(name=logs_specs_name):
+                logs_specs_cls = group[logs_specs_name].load()
+
+        elif specs := eps.get("torchrun.logs_specs"):  # < 3.10
+            if entrypoint_list := [ep for ep in specs if ep.name == logs_specs_name]:
+                logs_specs_cls = entrypoint_list[0].load()
+
+        if logs_specs_cls is None:
+            raise ValueError(f"Could not find entrypoint under 'torchrun.logs_specs[{logs_specs_name}]' key")
+
+        logging.info("Using logs_spec '%s' mapped to %s", logs_specs_name, str(logs_specs_cls))
+    else:
+        logs_specs_cls = DefaultLogsSpecs
+
+    return logs_specs_cls
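+
+# Registration sketch (package, module, and class names below are hypothetical):
+# a third-party distribution can expose a custom ``LogsSpecs`` implementation
+# through the "torchrun.logs_specs" entry-point group, e.g. in pyproject.toml:
+#
+#   [project.entry-points."torchrun.logs_specs"]
+#   my_specs = "my_pkg.logging:MyLogsSpecs"
+#
+# and select it at launch time with ``torchrun --logs-specs=my_specs ...``.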
+
+
+def config_from_args(args) -> Tuple[LaunchConfig, Union[Callable, str], List[str]]:
+    # If ``args`` not passed, defaults to ``sys.argv[:1]``
+    min_nodes, max_nodes = parse_min_max_nnodes(args.nnodes)
+    assert 0 < min_nodes <= max_nodes
+    assert args.max_restarts >= 0
+
+    if hasattr(args, "master_addr") and args.rdzv_backend != "static" and not args.rdzv_endpoint:
+        log.warning(
+            "master_addr is only used for static rdzv_backend and when rdzv_endpoint "
+            "is not specified."
+        )
+
+    nproc_per_node = determine_local_world_size(args.nproc_per_node)
+    if "OMP_NUM_THREADS" not in os.environ and nproc_per_node > 1:
+        omp_num_threads = 1
+        log.warning(
+            "\n*****************************************\n"
+            "Setting OMP_NUM_THREADS environment variable for each process to be "
+            "%s by default, to avoid your system being overloaded. "
+            "Please further tune the variable for optimal performance in "
+            "your application as needed. \n"
+            "*****************************************",
+            omp_num_threads
+        )
+        # This env variable will be passed down to the subprocesses
+        os.environ["OMP_NUM_THREADS"] = str(omp_num_threads)
+
+    log_line_prefix_template = os.getenv("TORCHELASTIC_LOG_LINE_PREFIX_TEMPLATE")
+
+    rdzv_configs = _parse_rendezvous_config(args.rdzv_conf)
+
+    if args.rdzv_backend == "static":
+        rdzv_configs["rank"] = args.node_rank
+
+    rdzv_endpoint = get_rdzv_endpoint(args)
+
+    ranks: Optional[Set[int]] = None
+    if args.local_ranks_filter:
+        try:
+            ranks = set(map(int, args.local_ranks_filter.split(",")))
+            assert ranks
+        except Exception as e:
+            raise Exception(
+                "--local_ranks_filter must be a comma-separated list of integers e.g. --local_ranks_filter=0,1,2"
+            ) from e
+
+    logs_specs_cls: Type[LogsSpecs] = _get_logs_specs_class(args.logs_specs)
+    logs_specs = logs_specs_cls(
+        log_dir=args.log_dir,
+        redirects=Std.from_str(args.redirects),
+        tee=Std.from_str(args.tee),
+        local_ranks_filter=ranks,
+    )
+
+    config = LaunchConfig(
+        min_nodes=min_nodes,
+        max_nodes=max_nodes,
+        nproc_per_node=nproc_per_node,
+        run_id=args.rdzv_id,
+        role=args.role,
+        rdzv_endpoint=rdzv_endpoint,
+        rdzv_backend=args.rdzv_backend,
+        rdzv_configs=rdzv_configs,
+        max_restarts=args.max_restarts,
+        monitor_interval=args.monitor_interval,
+        start_method=args.start_method,
+        log_line_prefix_template=log_line_prefix_template,
+        local_addr=args.local_addr,
+        logs_specs=logs_specs,
+    )
+
+    with_python = not args.no_python
+    cmd: Union[Callable, str]
+    cmd_args = []
+    use_env = get_use_env(args)
+    if args.run_path:
+        cmd = run_script_path
+        cmd_args.append(args.training_script)
+    else:
+        if with_python:
+            cmd = os.getenv("PYTHON_EXEC", sys.executable)
+            cmd_args.append("-u")
+            if args.module:
+                cmd_args.append("-m")
+            cmd_args.append(args.training_script)
+        else:
+            if args.module:
+                raise ValueError(
+                    "Don't use both the '--no-python' flag"
+                    " and the '--module' flag at the same time."
+                )
+            cmd = args.training_script
+    if not use_env:
+        cmd_args.append(f"--local-rank={macros.local_rank}")
+    cmd_args.extend(args.training_script_args)
+
+    return config, cmd, cmd_args
+
+
+def run_script_path(training_script: str, *training_script_args: str):
+    """
+    Run the provided `training_script` from within this interpreter.
+
+    Usage: `run_script_path("/abs/path/to/script.py", "--arg1", "val1")`
+    """
+    import runpy
+    import sys
+
+    sys.argv = [training_script] + [*training_script_args]
+    runpy.run_path(sys.argv[0], run_name="__main__")
+
+
+def run(args):
+    if args.standalone:
+        args.rdzv_backend = "c10d"
+        args.rdzv_endpoint = "localhost:0"
+        args.rdzv_id = str(uuid.uuid4())
+        log.info(
+            "\n**************************************\n"
+            "Rendezvous info:\n"
+            "--rdzv-backend=%s "
+            "--rdzv-endpoint=%s "
+            "--rdzv-id=%s\n"
+            "**************************************\n",
+            args.rdzv_backend, args.rdzv_endpoint, args.rdzv_id
+        )
+
+    config, cmd, cmd_args = config_from_args(args)
+    elastic_launch(
+        config=config,
+        entrypoint=cmd,
+    )(*cmd_args)
+
+
+@record
+def main(args=None):
+    args = parse_args(args)
+    run(args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/MLPY/Lib/site-packages/torch/distributed/tensor/__init__.py b/MLPY/Lib/site-packages/torch/distributed/tensor/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/distributed/tensor/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/tensor/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..df247c72ce438d6c965b49ab9e173373ebbb74c2
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/tensor/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/__init__.py b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfa7a940b2ffbfd3ba8ba81862705dd7a046c19f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/__init__.py
@@ -0,0 +1,23 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+from torch.distributed.tensor.parallel.api import parallelize_module
+
+from torch.distributed.tensor.parallel.loss import loss_parallel
+from torch.distributed.tensor.parallel.style import (
+    ColwiseParallel,
+    ParallelStyle,
+    PrepareModuleInput,
+    PrepareModuleOutput,
+    RowwiseParallel,
+    SequenceParallel,
+)
+
+__all__ = [
+    "ColwiseParallel",
+    "ParallelStyle",
+    "PrepareModuleInput",
+    "PrepareModuleOutput",
+    "RowwiseParallel",
+    "SequenceParallel",
+    "parallelize_module",
+    "loss_parallel"
+]
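+
+
+# Usage sketch (illustrative only; requires an initialized process group with one
+# GPU per rank, and the sub-module names "net1"/"net2" are assumptions):
+def _example_parallelize_sketch(model, world_size):
+    from torch.distributed.device_mesh import init_device_mesh
+
+    mesh = init_device_mesh("cuda", (world_size,))
+    # Shard the first linear column-wise and the second row-wise so the
+    # intermediate activation stays sharded across the mesh.
+    return parallelize_module(
+        model,
+        mesh,
+        {"net1": ColwiseParallel(), "net2": RowwiseParallel()},
+    )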
diff --git a/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ecf7f940cefe3a648dd5fab924ac9205761cb1f7
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/__pycache__/_data_parallel_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/__pycache__/_data_parallel_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9ec8b7a19e2ed5c32fc076c4972adc439c115726
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/__pycache__/_data_parallel_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/__pycache__/_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/__pycache__/_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..db34013947773a89cda4c68b24dd4a307f93c954
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/__pycache__/_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/__pycache__/api.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/__pycache__/api.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e22b75090c44c35cbe5056bf33d2d0a36656ec3d
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/__pycache__/api.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/__pycache__/ddp.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/__pycache__/ddp.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5386f02fd77900a786e060c878320a241083a074
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/__pycache__/ddp.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/__pycache__/fsdp.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/__pycache__/fsdp.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6141aac0902acd4447093cd31bd36fd5a5fd44a4
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/__pycache__/fsdp.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/__pycache__/input_reshard.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/__pycache__/input_reshard.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5604920f9059e0c708ffa54f75cc71de2c2a2ea9
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/__pycache__/input_reshard.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/__pycache__/loss.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/__pycache__/loss.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..81b27f4864440fa1851aec646f7de7492c3a06d9
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/__pycache__/loss.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/__pycache__/style.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/__pycache__/style.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c697ad3051067a3628b20d975d20912e0151cb30
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/__pycache__/style.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/_data_parallel_utils.py b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/_data_parallel_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7390282e5c4448c5fd93ad94b20d86875027d48b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/_data_parallel_utils.py
@@ -0,0 +1,51 @@
+from functools import partial
+from typing import no_type_check, Optional, Tuple
+
+import torch
+from torch.distributed._functional_collectives import AsyncCollectiveTensor
+from torch.distributed._tensor import DTensor
+from torch.distributed._tensor.placement_types import DTensorSpec
+
+
+@no_type_check
+def sync_grad_hook(grad, *, device_handle=None, compute_stream=None):
+    if isinstance(grad, AsyncCollectiveTensor):
+        if compute_stream is not None:
+            with device_handle.stream(compute_stream):
+                grad = grad.wait()
+        else:
+            grad = grad.wait()
+
+    return grad
+
+
+def _flatten_tensor(
+    tensor: torch.Tensor,
+) -> Tuple[torch.Tensor, Optional[DTensorSpec]]:
+    if isinstance(tensor, DTensor):
+        tensor._local_tensor.requires_grad_()
+        return tensor._local_tensor, tensor._spec
+    return tensor, None
+
+
+@no_type_check
+def _unflatten_tensor(tensor, spec, *, device_handle=None, compute_stream=None):
+    # unflatten is mainly called every time FSDP all-gathers parameters.
+    result = DTensor.from_local(
+        tensor,
+        spec.mesh,
+        spec.placements,
+        run_check=False,
+        shape=spec.shape,
+        stride=spec.stride,
+    )
+    if tensor.requires_grad:
+        # only register the hook if the tensor requires grad
+        tensor.register_hook(
+            partial(
+                sync_grad_hook,
+                device_handle=device_handle,
+                compute_stream=compute_stream,
+            )
+        )
+    return result
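+
+
+# A minimal sketch (hypothetical usage, not exercised in this module) of the flatten/unflatten
+# round trip these helpers provide for the data-parallel wrappers:
+#
+#     local, spec = _flatten_tensor(dtensor_param)    # DTensor -> plain local tensor
+#     if spec is not None:
+#         restored = _unflatten_tensor(local, spec)   # plain tensor -> DTensor again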
diff --git a/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/_utils.py b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..314b3c0e9768aaafc4f03cb4f08b85bebd998e61
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/_utils.py
@@ -0,0 +1,60 @@
+import warnings
+from typing import Tuple, Union
+
+from torch.distributed._tensor import DeviceMesh
+from torch.distributed._tensor.placement_types import Placement
+from torch.distributed.device_mesh import _mesh_resources
+try:
+    from torch._dynamo.external_utils import is_compiling as is_torchdynamo_compiling
+except Exception:
+    def is_torchdynamo_compiling():  # type: ignore[misc]
+        return False
+
+LayoutsType = Union[Placement, Tuple[Placement, ...]]
+
+
+def _deprecate_warnings(func_name: str, extra_msg: str) -> None:
+    """
+    Emit a deprecation warning for ``func_name``, appending ``extra_msg``.
+
+    The warning is skipped when running under torchdynamo, since ``warnings.warn``
+    is not yet supported there (see the TODO below).
+    """
+    # TODO: Will follow up with dynamo POC to make warnings.warn working with dynamo.
+    if not is_torchdynamo_compiling():
+        warnings.warn(f"{func_name} is deprecated and will be removed soon. {extra_msg}")
+
+
+def _validate_tp_mesh_dim(
+    device_mesh: DeviceMesh,
+) -> None:
+    """
+    Check that the TP mesh dimension is valid.
+
+    Args:
+        device_mesh (:class:`DeviceMesh`):
+            The `device_mesh` where we perform
+            Tensor Parallelism on.
+
+    Raises:
+        ``ValueError`` or ``RuntimeError`` if the mesh is not a valid 1D TP mesh.
+    """
+    if device_mesh.ndim > 1:
+        raise ValueError(f"Tensor Parallel only accepts a 1D DeviceMesh, but found {device_mesh.ndim}D! "
+                         "If you have a 2-D or N-D device_mesh, consider passing in device_mesh[\"tp\"]")
+
+    parent_mesh = _mesh_resources.get_parent_mesh(device_mesh)
+    if parent_mesh:
+        if parent_mesh.ndim != 2:
+            raise RuntimeError(
+                f"Found TP device_mesh has a parent mesh with dims {parent_mesh.ndim}. "
+                "Currently we only support 2D TP composition with DP."
+            )
+
+        tp_mesh_dim = _mesh_resources.get_parent_mesh_dim(device_mesh)
+        if tp_mesh_dim != 1:
+            raise RuntimeError(
+                f"Found TP device_mesh on the {tp_mesh_dim} dimension of its parent mesh. "
+                "Currently we only support intranode TP and TP needs to be the innermost dimension on its parent mesh."
+            )
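+
+
+# Illustrative sketch of what this check accepts (assuming an 8-GPU host and
+# ``init_device_mesh`` from ``torch.distributed.device_mesh``; not executed here):
+#
+#     mesh_2d = init_device_mesh("cuda", (2, 4), mesh_dim_names=("dp", "tp"))
+#     _validate_tp_mesh_dim(mesh_2d["tp"])   # ok: 1D TP mesh, innermost dim of a 2D parent
+#     _validate_tp_mesh_dim(mesh_2d)         # raises: the TP mesh must be 1D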
diff --git a/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/api.py b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..6fa5497a8a7ca2dfec302316dfa33d66d0851afe
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/api.py
@@ -0,0 +1,108 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+from typing import Dict, Union
+
+import torch
+import torch.distributed._tensor.random as random
+import torch.nn as nn
+from torch.distributed._tensor import (
+    DeviceMesh,
+)
+from torch.distributed._tensor.random import (
+    is_rng_supported_mesh,
+    TensorParallelRNGTracker,
+)
+from torch.distributed.tensor.parallel._utils import _validate_tp_mesh_dim
+from torch.distributed.tensor.parallel.style import (
+    ParallelStyle,
+)
+
+
+__all__ = [
+    "parallelize_module",
+]
+
+
+def parallelize_module(  # type: ignore[return]
+    module: nn.Module,
+    device_mesh: DeviceMesh,
+    parallelize_plan: Union[ParallelStyle, Dict[str, ParallelStyle]],
+) -> nn.Module:
+    """
+    Apply Tensor Parallelism in PyTorch by parallelizing modules or sub-modules based on a user-specified plan.
+
+    We parallelize the module or sub_modules based on a parallelize_plan. The parallelize_plan contains
+    :class:`ParallelStyle`, which indicates how the user wants the module or sub_module
+    to be parallelized.
+
+    Users can also specify a different parallel style per module fully qualified name (FQN).
+
+    Note that ``parallelize_module`` only accepts a 1-D :class:`DeviceMesh`. If you have a 2-D or N-D :class:`DeviceMesh`,
+    slice the DeviceMesh to a 1-D sub DeviceMesh first, then pass it to this API (i.e. ``device_mesh[\"tp\"]``).
+
+    Args:
+        module (:class:`nn.Module`):
+            Module to be parallelized.
+        device_mesh (:class:`DeviceMesh`):
+            Object which describes the mesh topology
+            of devices for the DTensor.
+        parallelize_plan (Union[:class:`ParallelStyle`, Dict[str, :class:`ParallelStyle`]]):
+            The plan used to parallelize the module. It can be either a
+            :class:`ParallelStyle` object which contains how
+            we prepare input/output for Tensor Parallelism or it can be a
+            dict of module FQN and its corresponding :class:`ParallelStyle` object.
+    Return:
+        The parallelized :class:`nn.Module` object.
+
+    Example::
+        >>> # xdoctest: +SKIP("distributed")
+        >>> from torch.distributed.tensor.parallel import parallelize_module, ColwiseParallel, RowwiseParallel
+        >>> from torch.distributed.device_mesh import init_device_mesh
+        >>>
+        >>> # Define the module.
+        >>> m = Model(...)
+        >>> tp_mesh = init_device_mesh("cuda", (8,))
+        >>> m = parallelize_module(m, tp_mesh, {"w1": ColwiseParallel(), "w2": RowwiseParallel()})
+        >>>
+
+    .. note:: For complex module architectures like Attention or MLP layers, we recommend composing
+        different ParallelStyles together (i.e. ``ColwiseParallel`` and ``RowwiseParallel``) and passing
+        them as the parallelize_plan to achieve the desired sharding computation.
+    """
+    torch._C._log_api_usage_once("torch.distributed.tensor.parallel.parallelize_module")
+
+    _validate_tp_mesh_dim(device_mesh)
+
+    # instantiate a TP RNG state tracker if it's not there
+    if is_rng_supported_mesh(device_mesh) and not isinstance(
+        random._rng_tracker, TensorParallelRNGTracker
+    ):
+        random._rng_tracker = TensorParallelRNGTracker(device_mesh.device_type)
+        # TODO: we should allow user to pass in the default seed from a config
+        random._rng_tracker._manual_seed(device_mesh, base_seed=1234)
+        # By default we execute random ops in non-tensor-parallel region. If users want
+        # to execute in tensor-parallel region, they can manually set this field to True
+        # after parallelizing the model.
+        random._rng_tracker.distribute_region_enabled = False
+
+    if isinstance(parallelize_plan, ParallelStyle):
+        return parallelize_plan._apply(module, device_mesh)
+    elif isinstance(parallelize_plan, dict):
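+        # For nested FQNs such as "layers.0.attn.wq" (hypothetical name), walk down to the
+        # immediate parent module so the parallelized child can be re-registered under it.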
+        for module_path, parallelize_style in parallelize_plan.items():
+            sub_module = module.get_submodule(module_path)
+            parent_module = module
+            if "." in module_path:
+                parent_module_path = ".".join(module_path.split(".")[:-1])
+                parent_module = module.get_submodule(parent_module_path)
+                module_path = module_path.split(".")[-1]
+            parent_module.register_module(  # type: ignore[call-arg] # pyre-ignore[20]
+                module_path,
+                parallelize_module(  # type: ignore[arg-type]
+                    sub_module, device_mesh, parallelize_style  # type: ignore[arg-type] # pyre-ignore[6]
+                ),
+            )
+        return module
+    else:
+        raise RuntimeError(  # pyre-ignore[7]
+            "Expect Union[ParallelStyle, Dict[str, ParallelStyle]] for"
+            f" parallelize_plan, {type(parallelize_plan)} found!"
+        )
diff --git a/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/ddp.py b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/ddp.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba58ac5096e2a88a95beae4c3e89105f0162eff1
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/ddp.py
@@ -0,0 +1,96 @@
+from typing import Any, List, Tuple
+
+import torch.nn as nn
+from torch.distributed.tensor.parallel._data_parallel_utils import (
+    _flatten_tensor,
+    _unflatten_tensor,
+)
+
+__all__ = []  # type: ignore[var-annotated]
+
+
+def _get_submodule_n_params(module: nn.Module, path: str):
+    """
+    Get the parent submodule and the direct parameter path within that submodule.
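+
+    e.g. ``"net.0.weight"`` (hypothetical name) resolves to ``(module.get_submodule("net.0"), "weight")``.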
+    """
+    if "." in path:
+        path_list = path.split(".")
+        parent_module_path = ".".join(path_list[:-1])
+        module = module.get_submodule(parent_module_path)
+        path = path_list[-1]
+    return module, path
+
+
+def _update_module_param(param_list: List[Tuple[nn.Module, str, nn.Parameter]]):
+    """
+    Update parameters within the module
+    """
+    for item in param_list:
+        parent_module, module_path, t = item
+        assert hasattr(parent_module, module_path)
+        delattr(parent_module, module_path)
+        setattr(parent_module, module_path, t)
+
+
+def _reconstruct_dtensor(module: nn.Module, _input: Any):
+    """
+    Reconstruct DTensor parameters from local tensors.
+    """
+    param_list = []
+    # TODO: add perf optimizations to this iteration
+    for name, t in module.named_parameters():
+        if hasattr(t, "_st_info"):
+            dtensor = _unflatten_tensor(t, t._st_info)
+            param_list.append((*_get_submodule_n_params(module, name), dtensor))
+    _update_module_param(param_list)  # type: ignore[arg-type]
+
+
+def _localize_dtensor(module: nn.Module, *_: Any):
+    """
+    Convert DTensor parameters to local tensors
+    """
+    param_list = []
+    for name, param in module.named_parameters():
+        t, sharding_info = _flatten_tensor(param)
+        if sharding_info is not None:
+            t = nn.Parameter(t)
+            t._st_info = sharding_info  # type: ignore[attr-defined]
+            param_list.append((*_get_submodule_n_params(module, name), t))
+    _update_module_param(param_list)  # type: ignore[arg-type]
+
+
+def _pre_dp_module_transform(module: nn.Module):
+    """
+    Enable the composability between Tensor Parallelism (TP) and Data
+    Parallelism (DP) in PyTorch when using DDP. We need to convert parameters which
+    are DTensors to local tensors before wrapping them with the data parallelism API.
+    We then register two hooks: one to convert local tensors back to DTensors
+    pre-forward, and one to convert DTensors back to local tensors after forward. By
+    integrating this way, we avoid any special handling of DTensor parameters by DDP
+    and get DTensor's gradients propagated back to DP, e.g. the gradient buckets of DDP.
+
+    For now, this API only works with ``DistributedDataParallel``. It will later support
+    other DP methods such as FSDP.
+
+    Args:
+        module (:class:`nn.Module`):
+            Module which has been applied TP on.
+
+    Example::
+        >>> # xdoctest: +SKIP("distributed")
+        >>> from torch.distributed.tensor.parallel import parallelize_module, ColwiseParallel
+        >>> from torch.distributed.device_mesh import init_device_mesh
+        >>> from torch.nn.parallel import DistributedDataParallel as DDP
+        >>> from torch.distributed.tensor.parallel.ddp import _pre_dp_module_transform
+        >>>
+        >>> # Define the module.
+        >>> m = Model(...)
+        >>> tp_mesh = init_device_mesh("cuda", (8,))
+        >>> parallelize_module(m, tp_mesh, {"w1": ColwiseParallel()})
+        >>> _pre_dp_module_transform(m)
+        >>> m = DDP(m)
+        >>>
+    """
+
+    _localize_dtensor(module, None, None)
+    # TODO: To add test cases and ensure that it works for nested modules
+    module.register_forward_pre_hook(_reconstruct_dtensor)
+    module.register_forward_hook(_localize_dtensor)
diff --git a/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/fsdp.py b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/fsdp.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d67798f77c04096c6541d4343c5d8b0dfc97b00
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/fsdp.py
@@ -0,0 +1,391 @@
+import copy
+from typing import Any, cast, List, Optional, Tuple
+
+import torch
+import torch.distributed as dist
+
+import torch.distributed._shard.sharding_spec as shard_spec
+import torch.distributed.distributed_c10d as c10d
+from torch.distributed._shard.sharded_tensor import (
+    Shard,
+    ShardedTensor,
+    ShardedTensorMetadata,
+    TensorProperties,
+)
+
+from torch.distributed._shard.sharding_spec import ShardMetadata
+from torch.distributed._shard.sharding_spec.chunk_sharding_spec import ChunkShardingSpec
+from torch.distributed._tensor import DeviceMesh, DTensor, Replicate, Shard as DShard
+from torch.distributed.device_mesh import _mesh_resources
+
+from torch.distributed.fsdp._common_utils import _set_fsdp_flattened
+from torch.distributed.fsdp._fsdp_extensions import FSDPExtensions
+from torch.distributed.fsdp._shard_utils import _create_chunk_sharded_tensor
+from torch.distributed.remote_device import _remote_device
+from torch.distributed.tensor.parallel._data_parallel_utils import (
+    _flatten_tensor,
+    _unflatten_tensor,
+)
+
+__all__ = ["DTensorExtensions"]
+
+
+def _get_box(tensor: DTensor) -> Tuple[torch.Size, torch.Size]:
+    device_mesh = tensor.device_mesh
+    assert device_mesh.ndim == 1, "Only 1D DeviceMeshes currently handled"
+
+    placement = tensor.placements[0]
+    offsets = [0] * len(tensor.size())
+    num_chunks = device_mesh.size(mesh_dim=0)
+
+    if tensor.placements[0].is_shard():
+        shard_dim = cast(DShard, placement).dim
+        chunk_size = tensor.size(shard_dim) // num_chunks
+        offsets[shard_dim] = chunk_size
+
+    return (torch.Size(offsets), tensor._local_tensor.size())
+
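+# Illustrative example (assuming an (8, 4) tensor placed as Shard(0) over 4 ranks):
+# _get_box returns offsets (2, 0) and local size (2, 4); _get_box_for(t, i) scales the
+# offsets by the rank index i, so rank 3's box starts at (6, 0).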
+
+def _get_box_for(tensor: DTensor, idx: int) -> Tuple[torch.Size, torch.Size]:
+    offsets, size = _get_box(tensor)
+    return (torch.Size([val * idx for val in offsets]), size)
+
+
+def _get_local_box(tensor: DTensor) -> Tuple[torch.Size, torch.Size]:
+    device_mesh = tensor.device_mesh
+    coord = device_mesh.get_coordinate()
+    assert coord is not None
+    return _get_box_for(tensor, coord[0])
+
+
+def _create_shard_md_from_dt(dt: DTensor, current_rank: int) -> ShardMetadata:
+    mesh = dt.device_mesh
+    assert mesh.ndim == 1, "Only 1D DeviceMeshes currently handled"
+
+    offsets, sizes = _get_local_box(dt)
+    return ShardMetadata(
+        shard_offsets=list(offsets),
+        shard_sizes=list(sizes),
+        placement=f"rank:{current_rank}/{dt._local_tensor.device}",
+    )
+
+
+def _create_sharded_tensor_md_from_dt(
+    dt: DTensor, dt_pg: c10d.ProcessGroup
+) -> ShardedTensorMetadata:
+    # This is where it gets tricky: we have to produce a ShardedTensor that has full coverage
+    # and yet has only one valid shard for the current rank.
+
+    shards_md = []
+    my_rank = dist.get_rank(dt_pg)
+    scapegoat_rank = 0 if my_rank > 0 else 1
+
+    if dt.placements[0].is_shard():
+        shard_count = dt_pg.size()
+    else:
+        shard_count = 1
+
+    for i in range(shard_count):
+        offsets, sizes = _get_box_for(dt, i)
+        shards_md.append(
+            ShardMetadata(
+                shard_offsets=list(offsets),
+                shard_sizes=list(sizes),
+                placement=(
+                    f"rank:{scapegoat_rank if i > 0 else my_rank}/{dt._local_tensor.device}"
+                ),
+            )
+        )
+
+    return ShardedTensorMetadata(
+        shards_metadata=shards_md,
+        size=dt.size(),
+        tensor_properties=TensorProperties(
+            dtype=dt.dtype,
+            layout=dt.layout,
+            requires_grad=dt.requires_grad,
+            # ignore memory_format and pin_memory as those are not supported by DT
+        ),
+    )
+
+
+def _get_dt_pg(dt: DTensor) -> c10d.ProcessGroup:
+    mesh = dt.device_mesh
+    assert mesh.ndim == 1, "Only 1D DeviceMeshes currently handled"
+    dim_groups = mesh.get_group()
+    assert isinstance(dim_groups, list)
+    return dim_groups[0]
+
+
+def _rewrite_spec_if_needed(
+    spec: shard_spec.ShardingSpec, tensor: torch.Tensor, rank: int
+) -> shard_spec.ShardingSpec:
+    """
+    Rewrite ``spec`` to match the device of ``tensor``.
+
+    FSDP.sharded_optim_state_dict sneakily ships optimizer state to CPU, so if the original ShardingSpec
+    produces CUDA metadata, ShardedTensor construction fails.
+    """
+    if not isinstance(spec, ChunkShardingSpec):
+        return spec
+
+    # let's see if we need to rewrite any placements
+    rewrite = False
+    for p in spec.placements:
+        p = cast(_remote_device, p)
+        if p.rank() == rank and p.device() != tensor.device:
+            rewrite = True
+            break
+    if rewrite:
+        spec = copy.deepcopy(spec)
+        for i, placement in enumerate(spec.placements):
+            placement = cast(_remote_device, placement)
+            if placement.rank() == rank and placement.device() != tensor.device:
+                spec.placements[i] = _remote_device(f"rank:{rank}/{tensor.device}")
+
+    return spec
+
+
+def _chunk_tensor(
+    tensor: torch.Tensor,
+    rank: int,
+    world_size: int,
+    num_devices_per_node: int,
+    pg: dist.ProcessGroup,
+) -> torch.Tensor:
+    if type(tensor) is ShardedTensor:
+        assert len(tensor.local_shards()) == 1
+
+        inner_param = tensor.local_tensor()
+        inner_st = _create_chunk_sharded_tensor(
+            inner_param,
+            rank,
+            world_size,
+            num_devices_per_node,
+            pg,
+        )
+
+        outer_local_shard = tensor.local_shards()[0]
+        shards: List[Shard] = [
+            Shard(inner_st, copy.deepcopy(outer_local_shard.metadata))
+        ]
+        st_meta = copy.deepcopy(tensor.metadata())
+        st_meta.tensor_properties.requires_grad = False
+
+        st_outer = ShardedTensor._init_from_local_shards_and_global_metadata(
+            shards,
+            sharded_tensor_metadata=st_meta,
+            process_group=tensor._process_group,
+            init_rrefs=False,
+        )
+        return st_outer
+    elif type(tensor) is DTensor:
+        device_mesh = tensor.device_mesh
+        assert device_mesh.ndim == 1, "Only 1D DeviceMeshes currently handled"
+
+        inner_param = tensor._local_tensor
+
+        inner_st = _create_chunk_sharded_tensor(
+            inner_param,
+            rank,
+            world_size,
+            torch.cuda.device_count(),
+            pg,
+        )
+
+        dt_pg = _get_dt_pg(tensor)
+        # We do this differently here: we wrap the inner ST as the single local shard of an outer ST
+        shards = [
+            Shard(inner_st, _create_shard_md_from_dt(tensor, dist.get_rank(dt_pg)))
+        ]
+
+        st_meta = _create_sharded_tensor_md_from_dt(tensor, dt_pg)
+        st_meta.tensor_properties.requires_grad = False
+
+        st_outer = ShardedTensor._init_from_local_shards_and_global_metadata(
+            shards,
+            sharded_tensor_metadata=st_meta,
+            process_group=dt_pg,
+            init_rrefs=False,
+        )
+
+        return st_outer
+    else:
+        return _create_chunk_sharded_tensor(
+            tensor,
+            rank,
+            world_size,
+            num_devices_per_node,
+            pg,
+        )
+
+
+def _chunk_dtensor(
+    tensor: torch.Tensor,
+    rank: int,
+    device_mesh: DeviceMesh,
+) -> DTensor:
+    """
+    Shard a tensor to chunks along the first dimension.
+
+    The local rank gets its corresponding chunk as the local tensor to create a DTensor.
+    """
+    parent_mesh = _mesh_resources.get_parent_mesh(device_mesh)
+    if parent_mesh is None:
+        raise RuntimeError("No parent device_mesh is found for FSDP device_mesh.")
+    if parent_mesh.ndim < 2:
+        raise RuntimeError(
+            f"Found parent device_mesh of ndim={parent_mesh.ndim}, "
+            "but meshes must be at least 2D."
+        )
+
+    # We need to explicitly call .detach() to return a new tensor detached from the current graph.
+    tensor = tensor.clone().detach()
+
+    # When a layer is not involved in TP, the tensor will not be a DTensor.
+    # e.g. when a layer is not specified in the parallelize_plan, TP will have no effect on the layer;
+    # e.g. when you do PairwiseParallel on a 3-layer model, TP will have no effect on the third layer.
+    if isinstance(tensor, torch.Tensor) and not isinstance(tensor, DTensor):
+
+        # For tensors, it is replicated across tp dimension and sharded across FSDP dimension.
+        # TP is the inner dimension and FSDP is the outer dimension.
+        # Therefore, shard placements for tensor is (Shard(0), Replicate()).
+        replicate_placements = [Replicate() for _ in range(parent_mesh.ndim)]
+        shard_placements = [Replicate() for _ in range(parent_mesh.ndim)]
+        shard_placements[0] = DShard(0)  # type: ignore[call-overload]
+
+        return DTensor.from_local(
+            tensor, parent_mesh, replicate_placements
+        ).redistribute(
+            device_mesh=parent_mesh,
+            placements=shard_placements,
+        )
+
+    else:
+        tp_placements = tensor.placements
+        tp_placement = tp_placements[0]
+
+        tensor = tensor.to_local()
+
+        # For DTensors, it is sharded across tp dimension first and then sharded across FSDP dimension.
+        # TP is the inner dimension and FSDP is the outer dimension.
+        # Therefore, shard placements for tensor is (Shard(0), tp_placement).
+        # For higher dimensional meshes, it is replicated across other dimensions. For example, with
+        # HSDP the shard placements for tensor is (Replicate, Shard(0), tp_placement).
+        replicate_placements = [Replicate() for _ in range(parent_mesh.ndim)]
+        replicate_placements[-1] = tp_placement  # type: ignore[call-overload]
+        shard_placements = [Replicate() for i in range(parent_mesh.ndim)]  # type: ignore[misc]
+        shard_placements[-2] = DShard(0)  # type: ignore[call-overload]
+        shard_placements[-1] = tp_placement  # type: ignore[call-overload]
+
+        return DTensor.from_local(
+            tensor, parent_mesh, replicate_placements
+        ).redistribute(
+            device_mesh=parent_mesh,
+            placements=shard_placements,
+        )
+
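+# Illustrative resulting placements (assuming a 2D ("dp", "tp") parent mesh):
+#   plain tensor (not touched by TP)         -> DTensor with placements (Shard(0), Replicate())
+#   TP DTensor, e.g. Shard(-1) on the TP dim -> DTensor with placements (Shard(0), Shard(-1))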
+
+def _pre_load_state_dict(
+    tensor: torch.Tensor,
+) -> Tuple[torch.Tensor, List[Shard]]:
+    shards = cast(ShardedTensor, tensor).local_shards()
+    if len(shards) == 1 and type(shards[0].tensor) is ShardedTensor:
+        inner_tensor = shards[0].tensor
+        shards = inner_tensor.local_shards()  # pyre-ignore[16]
+        tensor = inner_tensor
+
+    return (tensor, shards if len(shards) > 0 else [])
+
+
+def _all_gather_dtensor(
+    tensor: DTensor,
+    parent_mesh: Optional[DeviceMesh],
+) -> torch.Tensor:
+    """All gather a DTensor in its FSDP dimension and return the local tensor."""
+    assert parent_mesh == tensor.device_mesh
+
+    placements = list(copy.deepcopy(tensor.placements))
+    # FSDP + TP: [Shard(0), tp_placement] -> [Replicate(), tp_placement]
+    # HSDP + TP: [Replicate(), Shard(0), tp_placement] -> [Replicate(), Replicate(), tp_placement]
+    for i in range(0, len(placements) - 1):
+        placements[i] = Replicate()
+    tensor = tensor.redistribute(
+        device_mesh=tensor.device_mesh,
+        placements=placements,
+    )
+
+    return tensor.to_local()
+
+
+class DTensorExtensions(FSDPExtensions):
+    """
+    DTensorExtensions is the TensorFlattener extension needed for 2D FSDP + TP.
+
+    This is the implementation for FSDPExtensions defined in
+    https://github.com/pytorch/pytorch/blob/main/torch/distributed/fsdp/_fsdp_extensions.py
+    """
+    def __init__(self, device_handle) -> None:
+        super().__init__()
+        self.compute_stream = None
+        self.device_handle = device_handle
+        # we have to disable dynamo this way (by wrapping the bound method) rather than with the
+        # decorator, as the decorator would trigger a build failure with torch deploy...
+        self.post_unflatten_transform = torch._dynamo.disable(self.post_unflatten_transform)  # type: ignore[method-assign]
+
+    def pre_flatten_transform(
+        self,
+        tensor: torch.Tensor,
+    ) -> Tuple[torch.Tensor, Optional[Any]]:
+        return _flatten_tensor(tensor)
+
+    def post_unflatten_transform(
+        self, tensor: torch.Tensor, param_extension: Any
+    ) -> torch.Tensor:
+        stream = self.compute_stream or self.device_handle.current_stream()
+        with self.device_handle.stream(stream):
+            # At runtime we put the unflatten-tensor call on the compute stream since
+            # the unflattened tensor might be used in fwd/bwd computations where we
+            # need to sync properly.
+            # TODO: this is a short term fix and we should make the get_unflat_views
+            # directly happen in the compute stream.
+            result = _unflatten_tensor(
+                tensor,
+                param_extension,
+                device_handle=self.device_handle,
+                compute_stream=self.compute_stream
+            )
+            _set_fsdp_flattened(result)
+            return result
+
+    def chunk_tensor(
+        self,
+        tensor: torch.Tensor,
+        rank: int,
+        world_size: int,
+        num_devices_per_node: int,
+        pg: dist.ProcessGroup,
+        device: Optional[torch.device] = None,
+    ) -> torch.Tensor:
+        return _chunk_tensor(tensor, rank, world_size, num_devices_per_node, pg)
+
+    def chunk_dtensor(
+        self,
+        tensor: torch.Tensor,
+        rank: int,
+        device_mesh: DeviceMesh,
+    ) -> torch.Tensor:
+        return _chunk_dtensor(tensor, rank, device_mesh)
+
+    def pre_load_state_dict_transform(
+        self,
+        tensor: torch.Tensor,
+    ) -> Tuple[torch.Tensor, List[Shard]]:
+        return _pre_load_state_dict(tensor)
+
+    def all_gather_dtensor(
+        self,
+        tensor: DTensor,
+        parent_mesh: Optional[DeviceMesh],
+    ) -> torch.Tensor:
+        return _all_gather_dtensor(tensor, parent_mesh)
diff --git a/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/input_reshard.py b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/input_reshard.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba61692411591d940c5206f6c2873a7378866a13
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/input_reshard.py
@@ -0,0 +1,102 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+from functools import partial
+from typing import Any, Optional, Tuple
+
+import torch
+from torch.distributed._tensor import DeviceMesh, DTensor, Replicate, Shard
+
+__all__ = [
+    "input_reshard",
+]
+
+
+def input_reshard(
+    module: torch.nn.Module,
+    tp_device_mesh: DeviceMesh,
+    input_reshard_dim: Optional[int] = None,
+) -> torch.nn.Module:
+    """
+    Register hooks on an nn.Module for input resharding, enabling sharding and restoration during backward computation.
+
+    The registered hooks shard the saved input per the given `tp_device_mesh` and
+    `input_reshard_dim` and restore it when recomputing the activations in the
+    backward pass. The reason we can do this is that for Tensor Parallel (TP),
+    the inputs are the same across all TP ranks.
+
+    Args:
+        module (:class:`nn.Module`):
+            Module to be registered with input resharding.
+        tp_device_mesh (:class:`DeviceMesh`):
+            Object which describes the mesh topology
+            of devices for Tensor Parallel.
+        input_reshard_dim (Optional[int]):
+            The dimension of where we perform the sharding
+            of input. If set None, there is no sharding of input.
+            Default: None
+
+    Return:
+        A :class:`nn.Module` object registered with TP input resharding.
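+
+    Example::
+        >>> # xdoctest: +SKIP("distributed")
+        >>> # A minimal sketch, assuming a 1D TP mesh and that inputs saved for backward
+        >>> # should be resharded on dim 0 ("block" is a hypothetical module).
+        >>> from torch.distributed.device_mesh import init_device_mesh
+        >>> tp_mesh = init_device_mesh("cuda", (8,))
+        >>> block = input_reshard(block, tp_mesh, input_reshard_dim=0)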
+    """
+    cx: Optional[torch.autograd.graph.saved_tensors_hooks] = None
+
+    def input_reshard_forward_pre_hook(_: torch.nn.Module, _i: Tuple[Any, ...]) -> None:
+        saved_tensor_hooks = torch.autograd.graph.saved_tensors_hooks(
+            partial(_pack_hook_tp, tp_device_mesh, input_reshard_dim),
+            partial(_unpack_hook_tp, tp_device_mesh, input_reshard_dim),
+        )
+        saved_tensor_hooks.__enter__()
+        nonlocal cx
+        cx = saved_tensor_hooks  # type: ignore[name-defined]
+
+    def input_reshard_backward_hook(_: torch.nn.Module, _i: Tuple[Any, ...], _o: Any) -> Any:
+        nonlocal cx
+        cx.__exit__()  # type: ignore[name-defined, union-attr]
+
+    if input_reshard_dim is None:
+        return module
+    module.register_forward_pre_hook(input_reshard_forward_pre_hook)
+    module.register_forward_hook(input_reshard_backward_hook)
+    return module
+
+
+def _pack_hook_tp(mesh: DeviceMesh, input_reshard_dim: int, x: torch.Tensor) -> Any:  # noqa: D401
+    """Hook function called after FWD to shard input."""
+    if isinstance(x, DTensor) and all(p.is_replicate() for p in x._spec.placements):
+        return x.redistribute(device_mesh=mesh, placements=[Shard(input_reshard_dim)])
+    elif (
+        not isinstance(x, DTensor)
+        and isinstance(x, torch.Tensor)
+        and x.numel() >= mesh.size()
+    ):
+        return (
+            DTensor.from_local(x, device_mesh=mesh)
+            .redistribute(device_mesh=mesh, placements=[Shard(input_reshard_dim)])
+            .to_local()
+        )
+    else:
+        return x
+
+
+def _unpack_hook_tp(mesh: DeviceMesh, input_reshard_dim: int, x: Any) -> torch.Tensor:  # noqa: D401
+    """Hook function called before activation recomputing in BWD to restore input."""
+    if (
+        isinstance(x, DTensor)
+        and len(x._spec.placements) == 1
+        and x._spec.placements[0].is_shard()
+    ):
+        return x.redistribute(device_mesh=mesh, placements=[Replicate()])
+    elif (
+        not isinstance(x, DTensor)
+        and isinstance(x, torch.Tensor)
+        and x.numel() >= mesh.size()
+    ):
+        return (
+            DTensor.from_local(
+                x, device_mesh=mesh, placements=[Shard(input_reshard_dim)]
+            )
+            .redistribute(device_mesh=mesh, placements=[Replicate()])
+            .to_local()
+        )
+    else:
+        return x
diff --git a/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/loss.py b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..44df123f23d48281e9f45cfa82d516ce5a61a80b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/loss.py
@@ -0,0 +1,484 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+import contextlib
+from typing import cast, Dict, Optional, Tuple
+
+import torch
+import torch._prims_common as utils
+import torch.distributed._functional_collectives as funcol
+import torch.distributed.distributed_c10d as c10d
+from torch import Tensor
+from torch.distributed._tensor import DTensor, Replicate, Shard
+from torch.distributed._tensor.ops.embedding_ops import _MaskPartial
+from torch.distributed._tensor.ops.math_ops import (
+    _skip_dim,
+    Reduction,
+    replicate_reduction_dims,
+)
+from torch.distributed._tensor.placement_types import Placement, TensorMeta
+from torch.distributed.device_mesh import DeviceMesh
+
+aten = torch.ops.aten
+
+
+__all__ = ["loss_parallel"]
+
+
+@contextlib.contextmanager
+def loss_parallel():
+    """
+    A context manager that enables loss parallelism, where efficient parallelized loss computation
+    can be performed when the input is sharded on the class dimension. Currently only the cross-entropy
+    loss is supported.
+
+    Within this context manager, one can use :func:`~torch.nn.functional.cross_entropy` or
+    :class:`~torch.nn.CrossEntropyLoss` as usual, with the following assumptions on the input parameters.
+    The corresponding ``backward()`` call, if any, also needs to happen under this context manager.
+
+    Args:
+        input (:class:`DTensor`):
+            Input logits. Assumed to be sharded on the class dimension.
+        target (Union[:class:`torch.Tensor`, :class:`DTensor`]):
+            Must be ground truth class indices (class probabilities currently not supported).
+            Assumed to be replicated across the ``DeviceMesh``.
+        weight (Union[:class:`torch.Tensor`, :class:`DTensor`], optional):
+            If given, assumed to be replicated across the ``DeviceMesh``.
+        label_smoothing:
+            Currently not supported.
+
+    Returns:
+        A replicated :class:`DTensor`.
+
+    Example:
+        A sharded DTensor is manually created here to showcase the usage.
+        In practice, it is usually the output of a TP module.
+
+        >>> # xdoctest: +SKIP("distributed")
+        >>> from torch.distributed.tensor.parallel import loss_parallel
+        >>> from torch.distributed.device_mesh import init_device_mesh
+        >>> ...
+        >>> device_mesh = init_device_mesh("cuda", (8,))
+        >>> input = torch.randn(4, 16, device="cuda", requires_grad=True)
+        >>> dist_input = distribute_tensor(input, device_mesh, placements=[Shard(1)])
+        >>> target = torch.randint(16, (4,), device="cuda")
+        >>> with loss_parallel():
+        >>>     loss = F.cross_entropy(dist_input, target, reduction="mean")
+        >>>     loss.backward()
+        >>> ...
+    """
+    _enable_custom_loss_ops()
+
+    yield
+
+    _disable_custom_loss_ops()
+
+
+# Currently only needs to support one dimensional DeviceMesh; in general return
+# the mesh_dim with placements[mesh_dim].is_shard(dim)
+def _find_all_reduce_mesh_dim(placements: Tuple[Placement, ...], dim: int) -> int:
+    if not len(placements) == 1:
+        raise ValueError(
+            "Currently loss_parallel() only supports input on one-dimensional DeviceMesh."
+        )
+    if not placements[0].is_shard(dim):
+        raise ValueError(
+            f"loss_parallel() should be enabled only when the input tensor is sharded on dimension {dim}."
+        )
+    return 0
+
+
+def _cast_to_dtensor(
+    tensor, placements: Tuple[Placement, ...], mesh: DeviceMesh
+) -> DTensor:
+    if isinstance(tensor, DTensor):
+        if tensor.placements == placements:
+            return tensor
+        else:
+            raise RuntimeError(f"Expected {placements} but got {tensor.placements}.")
+    elif isinstance(tensor, torch.Tensor):
+        return DTensor.from_local(
+            tensor, device_mesh=mesh, placements=placements, run_check=False
+        )
+    else:
+        raise TypeError(f"Unsupported type {type(tensor)}")
+
+
+def _propagate_tensor_meta(
+    op_call: torch._ops.OpOverload,
+    args: Tuple[object, ...],
+    kwargs: Dict[str, object],
+) -> TensorMeta:
+    op_info = DTensor._op_dispatcher.unwrap_to_op_info(op_call, args, kwargs)
+    tensor_meta = DTensor._op_dispatcher.sharding_propagator._propagate_tensor_meta(
+        op_info.schema
+    )
+    if isinstance(tensor_meta, TensorMeta):
+        return tensor_meta
+    elif isinstance(tensor_meta, tuple):
+        return tensor_meta[0]
+    else:
+        raise RuntimeError(f"Unexpected tensor meta type: {type(tensor_meta)}.")
+
+
+# NOTE: The implementation follows torch._decomp.decomposition._log_softmax,
+# with all_reduce manually inserted to perform distributed computation.
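+#
+# With the class dimension sharded, each rank only holds a slice of ``x``; since
+# log_softmax(x) = x - m - log(sum_j exp(x_j - m)) with m = max_j x_j, both the max and the
+# sum of exponentials are all-reduced across the sharded dimension to obtain the global result.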
+def _log_softmax(x, dim, half_to_float, mesh, mesh_dim):
+    x = x.contiguous()
+    if half_to_float:
+        assert x.dtype == torch.half
+    computation_dtype, result_dtype = utils.elementwise_dtypes(
+        x, type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT
+    )
+    x = x.to(computation_dtype)
+    if x.numel() == 0:
+        shifted = x
+    else:
+        x_max = torch.amax(x, dim, keepdim=True)
+        x_max = funcol.all_reduce(
+            x_max, reduceOp=c10d.ReduceOp.MAX.name, group=(mesh, mesh_dim)
+        )
+        shifted = x - x_max
+    shifted_sumexp = torch.sum(torch.exp(shifted), dim, keepdim=True)
+    shifted_sumexp = funcol.all_reduce(
+        shifted_sumexp, reduceOp=c10d.ReduceOp.SUM.name, group=(mesh, mesh_dim)
+    )
+    shifted_logsumexp = torch.log(shifted_sumexp)
+    result = shifted - shifted_logsumexp
+    if not half_to_float:
+        result = result.to(result_dtype)
+    return result
+
+
+def _log_softmax_handler(
+    op_call: torch._ops.OpOverload,
+    args: Tuple[object, ...],
+    kwargs: Dict[str, object],
+) -> object:
+    x = cast(DTensor, args[0])
+    dim = cast(int, args[1])
+    half_to_float = cast(bool, args[2])
+
+    spec = x._spec
+    mesh_dim = _find_all_reduce_mesh_dim(spec.placements, dim)
+
+    output_tensor_meta = _propagate_tensor_meta(op_call, args, kwargs)
+
+    res = _log_softmax(x._local_tensor, dim, half_to_float, spec.mesh, mesh_dim)
+
+    return DTensor(
+        res,
+        spec.mesh,
+        spec.placements,
+        shape=output_tensor_meta.shape,
+        dtype=output_tensor_meta.dtype,
+        requires_grad=res.requires_grad,
+        stride=output_tensor_meta.stride,
+    )
+
+
+# NOTE: As explained below at _nll_loss_and_log_softmax_backward, the
+# _log_softmax_backward_handler does not actually do any computation.
+def _log_softmax_backward_handler(
+    op_call: torch._ops.OpOverload,
+    args: Tuple[object, ...],
+    kwargs: Dict[str, object],
+) -> object:
+    grad_output = cast(DTensor, args[0])
+    input_dtype = cast(torch.dtype, args[3])
+    return grad_output.to(input_dtype)
+
+
+# NOTE: The implementation follows torch._decomp.decomposition._nll_loss_forward,
+# with customized communication inserted to perform distributed computation.
+def _nll_loss_forward(
+    x: Tensor,
+    target: Tensor,
+    weight: Optional[Tensor],
+    local_weight: Optional[Tensor],
+    reduction: int,
+    ignore_index: int,
+    channel_dim_size: int,
+    mesh: DeviceMesh,
+    mesh_dim: int,
+) -> Tuple[Tensor, Tensor]:
+    n_dims = x.dim()
+    channel_dim = 1
+    if n_dims < 2:
+        channel_dim = 0
+
+    def _weight_view(weight: Tensor) -> Tensor:
+        if n_dims > 1:
+            shape = [
+                1,
+            ] * n_dims
+            shape[channel_dim] = weight.shape[0]
+            w = weight.view(shape)
+        else:
+            w = weight
+        return w
+
+    if weight is not None:
+        w = _weight_view(weight)
+        assert local_weight is not None
+        local_w = _weight_view(local_weight)
+        x = x * local_w
+    safe_target = torch.where(target != ignore_index, target, 0)
+    safe_target_ = safe_target.unsqueeze(channel_dim)
+
+    # The following code block is a distributed version of
+    # result = -torch.gather(self, channel_dim, safe_target_).squeeze(channel_dim)
+    partial_placement = _MaskPartial(logical_dim_size=channel_dim_size)
+    safe_target_partial_ = partial_placement._partition_value(
+        safe_target_, mesh, mesh_dim
+    )
+    result_partial = torch.gather(x, channel_dim, safe_target_partial_)
+    # an all_reduce happens here
+    result_reduced = partial_placement._reduce_value(result_partial, mesh, mesh_dim)
+    result = -result_reduced.squeeze(channel_dim)
+
+    result = torch.where(target != ignore_index, result, 0)
+
+    if reduction == Reduction.NONE.value and n_dims > 1:
+        total_weight = x.new_full((), 0.0)
+        return result, total_weight
+
+    if weight is not None:
+        new_shape = list(x.shape)
+        new_shape[channel_dim] = -1
+        w = w.expand(new_shape)
+        wsum = torch.gather(w, channel_dim, safe_target_).squeeze(channel_dim)
+        wsum = torch.where(target != ignore_index, wsum, 0)
+        total_weight = wsum.sum()
+    else:
+        total_weight = (target != ignore_index).sum().to(x)
+
+    # NOTE: this is correct only on 1D DeviceMesh; o/w additional
+    #       all-reduce on result and total_weight is needed
+    if reduction == Reduction.SUM.value:
+        result = result.sum()
+    elif reduction == Reduction.MEAN.value:
+        result = result.sum() / total_weight
+
+    return result, total_weight
+
+
+def _nll_loss_forward_handler(
+    op_call: torch._ops.OpOverload,
+    args: Tuple[object, ...],
+    kwargs: Dict[str, object],
+) -> object:
+    x = cast(DTensor, args[0])
+    target = args[1]
+    weight = args[2]
+    reduction = cast(int, args[3])
+    ignore_index = cast(int, args[4])
+
+    channel_dim = 1 if x.dim() >= 2 else 0
+    channel_dim_size = x.shape[channel_dim]
+    spec = x._spec
+    mesh_dim = _find_all_reduce_mesh_dim(spec.placements, channel_dim)
+
+    # Check user input: if target and weight are not DTensors, convert them to DTensors;
+    # if they are DTensors, check that they have the desired placements.
+    target_placements = _skip_dim(
+        replicate_reduction_dims(spec.placements, [channel_dim]), channel_dim
+    )
+    all_replicate_placements = (Replicate(),) * spec.mesh.ndim
+    target = _cast_to_dtensor(target, target_placements, spec.mesh)
+    local_weight = None
+    if weight is not None:
+        weight = _cast_to_dtensor(weight, all_replicate_placements, spec.mesh)
+        # For local computation, both (replicated) weight and (sharded) local_weight
+        # are needed in _nll_loss_forward(). local_weight is generated here using
+        # DTensor API, without incurring any communication.
+        sharded_placements = [
+            Shard(0) if i == mesh_dim else Replicate() for i in range(spec.mesh.ndim)
+        ]
+        local_weight = weight.redistribute(spec.mesh, sharded_placements)._local_tensor
+        assert local_weight.shape[0] == x._local_tensor.shape[channel_dim]
+
+    if reduction == Reduction.NONE.value:
+        output_placements = target_placements
+    else:
+        output_placements = all_replicate_placements
+
+    # tensor inputs to _propagate_tensor_meta need to be DTensors
+    args = list(args)
+    args[1], args[2] = target, weight
+    output_tensor_meta = _propagate_tensor_meta(op_call, tuple(args), kwargs)
+
+    result, total_weight = _nll_loss_forward(
+        x._local_tensor,
+        target._local_tensor,
+        weight._local_tensor if weight is not None else None,
+        local_weight,
+        reduction,
+        ignore_index,
+        channel_dim_size,
+        spec.mesh,
+        mesh_dim,
+    )
+
+    return (
+        DTensor(
+            result,
+            spec.mesh,
+            output_placements,
+            shape=output_tensor_meta.shape,
+            dtype=output_tensor_meta.dtype,
+            requires_grad=result.requires_grad,
+            stride=output_tensor_meta.stride,
+        ),
+        total_weight,
+    )
+
+
+# NOTE: The backward computation of cross_entropy goes through two steps:
+# backward for nll_loss and then backward for log_softmax. In loss parallel,
+# the two steps are fused into the following function (called by _nll_loss_backward_handler)
+# to avoid communication when target contains class indices not class probabilities.
+# Also note that the _log_softmax_backward_handler does not perform computation.
+# The implementation resembles _nll_loss_backward and _log_softmax_backward_data
+# from torch._decomp.decomposition.
+def _nll_loss_and_log_softmax_backward(
+    grad_output: Tensor,
+    x: Tensor,
+    target: Tensor,
+    weight: Optional[Tensor],
+    reduction: int,
+    ignore_index: int,
+    total_weight: Tensor,
+    channel_dim_size: int,
+    mesh: DeviceMesh,
+    mesh_dim: int,
+) -> Tensor:
+    channel_dim = 0 if x.dim() < 2 else 1
+    if reduction == Reduction.MEAN.value:
+        grad_output = grad_output / total_weight
+
+    target = target.unsqueeze(channel_dim)
+    safe_target = torch.where(target != ignore_index, target, 0)
+    grad_input = torch.zeros_like(x)
+
+    # The following code block is a distributed version of
+    # grad_input = torch.scatter(grad_input, channel_dim, safe_target, -1.0)
+    partial_placement = _MaskPartial(logical_dim_size=channel_dim_size)
+    safe_target = safe_target.squeeze(channel_dim).flatten()
+    masked_safe_target = partial_placement._partition_value(safe_target, mesh, mesh_dim)
+    # only update grad_input to -1 if not masked
+    assert partial_placement.mask_buffer.data is not None
+    grad_update = partial_placement.mask_buffer.data.float() - 1.0
+    arange_1d = torch.arange(
+        masked_safe_target.shape[0], device=masked_safe_target.device
+    )
+    # The first two cases with x.dim() <= 2 are for aten.nll_loss_backward.default;
+    # the last case is for aten.nll_loss2d_backward.default.
+    if x.dim() == 1:
+        grad_input[masked_safe_target] = grad_update
+    elif x.dim() == 2:
+        grad_input[arange_1d, masked_safe_target] = grad_update
+    else:
+        grad_input_t = grad_input.transpose(channel_dim, -1)
+        intermediate_shape = grad_input_t.shape
+        grad_input_2d = grad_input_t.reshape(-1, x.shape[channel_dim])
+        grad_input_2d[arange_1d, masked_safe_target] = grad_update
+        grad_input = grad_input_2d.view(intermediate_shape).transpose(channel_dim, -1)
+
+    if grad_input.dim() > grad_output.dim() > 0:
+        grad_output = grad_output.unsqueeze(channel_dim)
+
+    if weight is not None:
+        new_shape = [1 for _ in range(x.dim())]
+        new_shape[channel_dim] = weight.shape[0]
+        weight = weight.reshape(new_shape)
+        # In order for fused computation to work, the following line is rewritten.
+        # grad_output = grad_output * weight
+        new_shape = list(x.shape)
+        new_shape[channel_dim] = -1
+        w = weight.expand(new_shape)
+        w_target = torch.gather(w, channel_dim, target)
+        grad_output = grad_output * w_target
+
+    grad_output = torch.where(target != ignore_index, grad_output, 0)
+
+    # NOTE: Instead of directly returning the grad_input as grad_output for log_softmax,
+    # here we perform backward computation for log_softmax altogether to avoid the
+    # otherwise extra all_gather communication.
+    # return grad_input * grad_output
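+    # Since x holds log-probabilities, exp(x) recovers the softmax, so the expression below
+    # amounts to (softmax(x) - one_hot(target)) * grad_output, i.e. the fused cross-entropy
+    # gradient (up to the weighting/ignore_index masking applied above).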
+    return (grad_input + torch.exp(x)) * grad_output
+
+
+def _nll_loss_backward_handler(
+    op_call: torch._ops.OpOverload,
+    args: Tuple[object, ...],
+    kwargs: Dict[str, object],
+) -> object:
+    grad_output = cast(DTensor, args[0])
+    x = cast(DTensor, args[1])
+    target = args[2]
+    weight = args[3]
+    reduction = cast(int, args[4])
+    ignore_index = cast(int, args[5])
+    total_weight = cast(Tensor, args[6])
+
+    channel_dim = 1 if x.dim() >= 2 else 0
+    channel_dim_size = x.shape[channel_dim]
+    spec = x._spec
+    mesh_dim = _find_all_reduce_mesh_dim(spec.placements, channel_dim)
+
+    # if target and weight are not DTensors, convert them to DTensors
+    target_placements = _skip_dim(
+        replicate_reduction_dims(spec.placements, [channel_dim]), channel_dim
+    )
+    all_replicate_placements = (Replicate(),) * spec.mesh.ndim
+    target = _cast_to_dtensor(target, target_placements, spec.mesh)
+    if weight is not None:
+        weight = _cast_to_dtensor(weight, all_replicate_placements, spec.mesh)
+
+    # tensor inputs to _propagate_tensor_meta need to be DTensors
+    args = list(args)
+    args[2], args[3] = target, weight
+    args[6] = _cast_to_dtensor(total_weight, all_replicate_placements, spec.mesh)
+    output_tensor_meta = _propagate_tensor_meta(op_call, tuple(args), kwargs)
+
+    result = _nll_loss_and_log_softmax_backward(
+        grad_output._local_tensor,
+        x._local_tensor,
+        target._local_tensor,
+        weight._local_tensor if weight is not None else None,
+        reduction,
+        ignore_index,
+        total_weight,
+        channel_dim_size,
+        spec.mesh,
+        mesh_dim,
+    )
+
+    return DTensor(
+        result,
+        spec.mesh,
+        # the output sharding is the same as input sharding: Shard(channel_dim) on mesh_dim
+        spec.placements,
+        shape=output_tensor_meta.shape,
+        dtype=output_tensor_meta.dtype,
+        requires_grad=result.requires_grad,
+        stride=output_tensor_meta.stride,
+    )
+
+
+customized_loss_ops = {
+    aten._log_softmax.default: _log_softmax_handler,
+    aten._log_softmax_backward_data.default: _log_softmax_backward_handler,
+    aten.nll_loss_forward.default: _nll_loss_forward_handler,
+    aten.nll_loss2d_forward.default: _nll_loss_forward_handler,
+    aten.nll_loss_backward.default: _nll_loss_backward_handler,
+    aten.nll_loss2d_backward.default: _nll_loss_backward_handler,
+}
+
+
+def _enable_custom_loss_ops():
+    DTensor._op_dispatcher._custom_op_handlers.update(customized_loss_ops)
+
+
+def _disable_custom_loss_ops():
+    for custom_op in customized_loss_ops:
+        DTensor._op_dispatcher._custom_op_handlers.pop(custom_op)
diff --git a/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/style.py b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/style.py
new file mode 100644
index 0000000000000000000000000000000000000000..69d892097b77bf25c7a478a7f9f0d22592fb933d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/tensor/parallel/style.py
@@ -0,0 +1,489 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+from abc import ABC, abstractmethod
+from typing import Optional, Union, Tuple
+from functools import partial
+
+import torch
+import torch.nn as nn
+from torch.distributed._tensor import DeviceMesh, DTensor, Placement, Replicate, Shard, distribute_tensor, distribute_module
+
+
+__all__ = [
+    "ParallelStyle",
+    "RowwiseParallel",
+    "SequenceParallel",
+    "ColwiseParallel",
+    "PrepareModuleInput",
+    "PrepareModuleOutput",
+]
+
+
+class ParallelStyle(ABC):
+    """
+    The parallel style contract defines how the module or submodule should be parallelized.
+
+    It only defines the ``_apply`` method for ``parallelize_module`` to use, which allows maximum
+    flexibility for different kinds of style implementations.
+    """
+
+    @abstractmethod
+    def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module:
+        ...
+
+
+class ColwiseParallel(ParallelStyle):
+    """
+    Partition a compatible nn.Module in a column-wise fashion. Currently supports nn.Linear and nn.Embedding.
+    Users can compose it together with RowwiseParallel to achieve the sharding of more complicated modules
+    (e.g. MLP, Attention).
+
+    Keyword Args:
+        input_layouts (Placement, optional):
+            The DTensor layout of input tensor for the nn.Module, this is used to annotate the input tensor to
+            become a DTensor. If not specified, we assume the input tensor to be replicated.
+        output_layouts (Placement, optional):
+            The DTensor layout of the output for the nn.Module, this is used to ensure the output of the nn.Module
+            with the user desired layout. If not specified, the output tensor is sharded on the last dimension.
+        use_local_output (bool, optional):
+            Whether to use local :class:`torch.Tensor` instead of :class:`DTensor` for the module output, default: True.
+    Returns:
+        A :class:`ParallelStyle` object that represents Colwise sharding of the nn.Module.
+
+    Example::
+        >>> # xdoctest: +SKIP(failing)
+        >>> from torch.distributed.tensor.parallel import parallelize_module, ColwiseParallel
+        >>> from torch.distributed.device_mesh import init_device_mesh
+        >>> ...
+        >>> m = Model(...)  # m is a nn.Module that contains a "w1" nn.Linear submodule
+        >>> tp_mesh = init_device_mesh("cuda", (8,))
+        >>>
+        >>> # By default, the input of the "w1" Linear will be converted to Replicated DTensor
+        >>> # and the output of "w1" will return :class:`torch.Tensor` that shards on the last dim.
+        >>>
+        >>> sharded_mod = parallelize_module(m, tp_mesh, {"w1": ColwiseParallel()})
+        >>> ...
+
+    .. note:: By default the ``ColwiseParallel`` output is sharded on the last dimension if ``output_layouts`` is not
+        specified. If there are operators that require a specific tensor shape (e.g. before the paired ``RowwiseParallel``),
+        keep in mind that if the output is sharded, the operator might need to be adjusted to the sharded size.
+    """
+
+    def __init__(
+        self,
+        *,
+        input_layouts: Optional[Placement] = None,
+        output_layouts: Optional[Placement] = None,
+        use_local_output: bool = True
+    ):
+        super().__init__()
+        self.input_layouts = (input_layouts or Replicate(), )
+        self.output_layouts = (output_layouts or Shard(-1), )
+        # colwise linear runtime sharding (desired sharding):
+        # 1. requires replicate input
+        # 2. shard output on last dim
+        self.desired_input_layouts = (Replicate(), )
+        self.use_local_output = use_local_output
+
+    @staticmethod
+    def _prepare_input_fn(input_layouts, desired_input_layouts, mod, inputs, device_mesh):
+        # TODO: figure out dynamo support for instance method and switch this to instance method
+
+        # annotate module input placements/sharding with input_layouts
+        input_tensor = inputs[0]
+        if not isinstance(input_tensor, DTensor):
+            input_tensor = DTensor.from_local(input_tensor, device_mesh, input_layouts, run_check=False)
+
+        # transform the input layouts to the desired layouts of ColwiseParallel
+        if input_layouts != desired_input_layouts:
+            input_tensor = input_tensor.redistribute(placements=desired_input_layouts, async_op=True)
+        return input_tensor
+
+    def _partition_linear_fn(self, name, module, device_mesh):
+        # Colwise shards both weight and bias on dim 0, i.e. Shard(0). Since
+        # nn.Linear computes input @ weight^T + bias, sharding weight on dim 0
+        # shards weight^T (the effective matmul operand) on dim 1, i.e. column-wise.
+        for name, param in module.named_parameters():
+            dist_param = nn.Parameter(
+                distribute_tensor(param, device_mesh, [Shard(0)])
+            )
+            module.register_parameter(name, dist_param)
+
+    def _partition_embedding_fn(self, name, module, device_mesh):
+        # Colwise sharding of embedding.weight is straightforward: Shard(1).
+        for name, param in module.named_parameters():
+            dist_param = nn.Parameter(
+                distribute_tensor(param, device_mesh, [Shard(1)])
+            )
+            module.register_parameter(name, dist_param)
+
+    @staticmethod
+    def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh):
+        # outputs is a DTensor sharded on the last dimension, i.e. Shard(-1)
+        outputs = outputs.redistribute(placements=output_layouts, async_op=True)
+        # back to local tensor
+        return outputs.to_local() if use_local_output else outputs
+
+    def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module:
+        if isinstance(module, nn.Linear):
+            partition_fn = self._partition_linear_fn
+        elif isinstance(module, nn.Embedding):
+            partition_fn = self._partition_embedding_fn
+        else:
+            raise NotImplementedError("ColwiseParallel currently only support nn.Linear and nn.Embedding!")
+
+        return distribute_module(
+            module,
+            device_mesh,
+            partition_fn,
+            partial(self._prepare_input_fn, self.input_layouts, self.desired_input_layouts),
+            partial(self._prepare_output_fn, self.output_layouts, self.use_local_output),
+        )
+
+
+class RowwiseParallel(ParallelStyle):
+    """
+    Partition a compatible nn.Module in a row-wise fashion. Currently supports nn.Linear and nn.Embedding.
+    Users can compose it with ColwiseParallel to achieve the sharding of more complicated modules
+    (e.g. MLP, Attention).
+
+    Keyword Args:
+        input_layouts (Placement, optional):
+            The DTensor layout of the input tensor for the nn.Module; this is used to annotate the input tensor
+            so that it becomes a DTensor. If not specified, the input tensor is assumed to be sharded on the last dimension.
+        output_layouts (Placement, optional):
+            The DTensor layout of the output of the nn.Module; this is used to ensure the output of the nn.Module
+            has the user-desired layout. If not specified, the output tensor is replicated.
+        use_local_output (bool, optional):
+            Whether to use local :class:`torch.Tensor` instead of :class:`DTensor` for the module output, default: True.
+    Returns:
+        A :class:`ParallelStyle` object that represents Rowwise sharding of the nn.Module.
+
+    Example::
+        >>> # xdoctest: +SKIP(failing)
+        >>> from torch.distributed.tensor.parallel import parallelize_module, RowwiseParallel
+        >>> from torch.distributed.device_mesh import init_device_mesh
+        >>> ...
+        >>> m = Model(...)  # m is a nn.Module that contains a "w2" nn.Linear submodule
+        >>> tp_mesh = init_device_mesh("cuda", (8,))
+        >>>
+        >>> # By default, the input of the "w2" Linear will be converted to DTensor that shards on the last dim
+        >>> # and the output of "w2" will return a replicated :class:`torch.Tensor`.
+        >>>
+        >>> sharded_mod = parallelize_module(m, tp_mesh, {"w2": RowwiseParallel()}),
+        >>> ...
+    """
+
+    def __init__(
+        self,
+        *,
+        input_layouts: Optional[Placement] = None,
+        output_layouts: Optional[Placement] = None,
+        use_local_output: bool = True
+    ):
+        super().__init__()
+        self.input_layouts = (input_layouts or Shard(-1), )
+        self.output_layouts = (output_layouts or Replicate(), )
+        self.use_local_output = use_local_output
+
+    @staticmethod
+    def _prepare_input_fn(input_layouts, desired_input_layouts, mod, inputs, device_mesh):
+        input_tensor = inputs[0]
+        if not isinstance(input_tensor, DTensor):
+            input_tensor = DTensor.from_local(input_tensor, device_mesh, input_layouts, run_check=False)
+
+        if input_layouts != desired_input_layouts:
+            input_tensor = input_tensor.redistribute(placements=desired_input_layouts, async_op=True)
+        return input_tensor
+
+    def _partition_linear_fn(self, name, module, device_mesh):
+        # Rowwise shards the weight on dim 1, i.e. Shard(1), and replicates the bias.
+        # Since nn.Linear computes input @ weight^T + bias, sharding weight on dim 1
+        # shards weight^T (the effective matmul operand) on dim 0, i.e. row-wise.
+        module.register_parameter("weight", nn.Parameter(
+            distribute_tensor(module.weight, device_mesh, [Shard(1)])
+        ))
+        if module.bias is not None:
+            module.register_parameter("bias", nn.Parameter(
+                distribute_tensor(module.bias, device_mesh, [Replicate()])
+            ))
+
+    def _partition_embedding_fn(self, name, module, device_mesh):
+        # rowwise shard embedding.weight is Shard(0)
+        for name, param in module.named_parameters():
+            dist_param = nn.Parameter(
+                distribute_tensor(param, device_mesh, [Shard(0)])
+            )
+            module.register_parameter(name, dist_param)
+
+    @staticmethod
+    def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh):
+        # Rowwise sharding produces partial output, depending on output layouts:
+        # 1. to replicate -> allreduce
+        # 2. to shard -> reduce_scatter
+        outputs = outputs.redistribute(placements=output_layouts, async_op=True)
+        # back to local tensor if use_local_output is True
+        return outputs.to_local() if use_local_output else outputs
+
+    def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module:
+        if isinstance(module, nn.Linear):
+            partition_fn = self._partition_linear_fn
+            # rowwise linear runtime sharding requires input tensor shard on last dim
+            self.desired_input_layouts: Tuple[Placement, ...] = (Shard(-1), )
+        elif isinstance(module, nn.Embedding):
+            partition_fn = self._partition_embedding_fn
+            # rowwise embedding runtime sharding requires input tensor replicated
+            self.desired_input_layouts = (Replicate(), )
+        else:
+            raise NotImplementedError("RowwiseParallel currently only support nn.Linear and nn.Embedding!")
+
+        return distribute_module(
+            module,
+            device_mesh,
+            partition_fn,
+            partial(self._prepare_input_fn, self.input_layouts, self.desired_input_layouts),
+            partial(self._prepare_output_fn, self.output_layouts, self.use_local_output),
+        )
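+
+
+# --- Illustrative sketch (hypothetical helper, not part of this module) --------
+# ColwiseParallel and RowwiseParallel are typically paired: the first linear of
+# an MLP is sharded column-wise (output sharded on the last dim) and the second
+# row-wise (input sharded on the last dim, output replicated), so no extra
+# resharding is needed between the two matmuls. The submodule names "w1"/"w2"
+# below are assumptions about the model being parallelized.
+def _example_mlp_tp_plan() -> dict:
+    return {
+        "w1": ColwiseParallel(),  # input replicated -> output Shard(-1)
+        "w2": RowwiseParallel(),  # input Shard(-1) -> output replicated
+    }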
+
+
+class SequenceParallel(ParallelStyle):
+    """
+    SequenceParallel replicates a compatible ``nn.Module``'s parameters and runs the sharded computation with
+    the input sharded on the sequence dimension. This currently supports ``nn.LayerNorm``, ``nn.Dropout``, and the
+    RMSNorm python implementation.
+
+    This style implements the operation described in the paper
+    "Reducing Activation Recomputation in Large Transformer Models".
+
+    Both the input and output of the ``nn.Module`` will be sharded on the sequence dimension.
+
+    Keyword Args:
+        sequence_dim (int, optional):
+            The sequence dimension of the input tensor for the ``nn.Module``, this is used to annotate the input tensor to
+            become a DTensor that is sharded on the sequence dimension, default: 1.
+        use_local_output (bool, optional):
+            Whether to use local :class:`torch.Tensor` instead of :class:`DTensor` for the module output, default: False.
+    Returns:
+        A :class:`ParallelStyle` object that represents Sequence Parallel of the ``nn.Module``.
+
+    Example::
+        >>> # xdoctest: +SKIP(failing)
+        >>> from torch.distributed.tensor.parallel import parallelize_module, SequenceParallel
+        >>> from torch.distributed.device_mesh import init_device_mesh
+        >>> ...
+        >>> m = Model(...)  # m is a nn.Module that contains a "norm" nn.LayerNorm submodule
+        >>> tp_mesh = init_device_mesh("cuda", (8,))
+        >>>
+        >>> # By default, the input of the "norm" will be converted to DTensor that shards on the sequence dim
+        >>> # and the output of "norm" will return a sharded on sequence dimension :class:`DTensor`.
+        >>>
+        >>> sharded_mod = parallelize_module(m, tp_mesh, {"norm": SequenceParallel()}),
+        >>> ...
+
+    .. note:: The SequenceParallel style assumes ones initialization if there are weights in the nn.Module (e.g.
+        ``nn.LayerNorm`` or ``RMSNorm``, which default to ones initialization). If you have custom
+        initializations for the weights of those modules, you need to broadcast the weights before/after parallelizing
+        to ensure that they are replicated.
+    """
+    def __init__(
+        self,
+        *,
+        sequence_dim: int = 1,
+        use_local_output: bool = False
+    ):
+        super().__init__()
+        self.sequence_dim = sequence_dim
+        self.use_local_output = use_local_output
+
+    def _replicate_module_fn(self, name: str, module: nn.Module, device_mesh: DeviceMesh):
+        for p_name, param in module.named_parameters():
+            # Simple replication relying on the fixed ones_ init of LayerNorm/RMSNorm,
+            # which allows us to simply use from_local here.
+            replicated_param = torch.nn.Parameter(
+                DTensor.from_local(param, device_mesh, [Replicate()], run_check=False)
+            )
+            module.register_parameter(p_name, replicated_param)
+
+    @staticmethod
+    def _prepare_input_fn(sequence_dim, mod, inputs, device_mesh):
+        input_tensor = inputs[0]
+        if isinstance(input_tensor, DTensor):
+            return inputs
+        elif isinstance(input_tensor, torch.Tensor):
+            return DTensor.from_local(input_tensor, device_mesh, [Shard(sequence_dim)], run_check=False)
+        else:
+            raise ValueError(f"expecting input of {mod} to be a torch.Tensor or DTensor, but got {input_tensor}")
+
+    @staticmethod
+    def _prepare_output_fn(use_local_output, mod, outputs, device_mesh):
+        return outputs.to_local() if use_local_output else outputs
+
+    def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module:
+        return distribute_module(
+            module,
+            device_mesh,
+            self._replicate_module_fn,
+            partial(self._prepare_input_fn, self.sequence_dim),
+            partial(self._prepare_output_fn, self.use_local_output),
+        )
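+
+
+# --- Illustrative sketch (hypothetical plan, not part of this module) ----------
+# SequenceParallel is usually applied to norm/dropout layers while the
+# surrounding linears reshard to and from the sequence dimension: the colwise
+# input is annotated as ``Shard(1)`` (and then gathered to Replicate), and the
+# rowwise output is requested as ``Shard(1)`` so the following norm sees a
+# sequence-sharded input again. The submodule names below are assumptions about
+# the model being parallelized.
+def _example_sequence_parallel_plan() -> dict:
+    return {
+        "norm": SequenceParallel(),
+        "w1": ColwiseParallel(input_layouts=Shard(1)),
+        "w2": RowwiseParallel(output_layouts=Shard(1)),
+    }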
+
+
+class PrepareModuleInput(ParallelStyle):
+    """
+    Configure the nn.Module's inputs to convert the input tensors of the nn.Module to DTensors at runtime according to
+    ``input_layouts``, and perform layout redistribution according to the ``desired_input_layouts``.
+
+    Keyword Args:
+        input_layouts (Union[Placement, Tuple[Placement]]):
+            The DTensor layouts of the input tensors for the nn.Module; this is used to convert the input tensors to
+            DTensors. If some inputs are not torch.Tensor or do not need to be converted to DTensors, ``None`` needs to be
+            specified as a placeholder.
+        desired_input_layouts (Union[Placement, Tuple[Placement]]):
+            The desired DTensor layouts of the input tensors for the nn.Module; this is used to ensure the inputs of the
+            nn.Module have the desired DTensor layouts. This argument needs to have the same length as ``input_layouts``.
+        use_local_output (bool, optional):
+            Whether to use local :class:`torch.Tensor` instead of :class:`DTensor` for the module inputs, default: False.
+    Returns:
+        A :class:`ParallelStyle` object that prepares the sharding layouts of the nn.Module's inputs.
+
+    Example::
+        >>> # xdoctest: +SKIP(failing)
+        >>> from torch.distributed.tensor.parallel import parallelize_module, PrepareModuleInput
+        >>> from torch.distributed.device_mesh import init_device_mesh
+        >>> ...
+        >>> block = TransformerBlock(...)  # block is a nn.Module that contains an "attn" Attention submodule
+        >>> tp_mesh = init_device_mesh("cuda", (8,))
+        >>>
+        >>> # According to the style specified below, the first input of attn will be annotated to Sharded DTensor
+        >>> # and then redistributed to Replicated DTensor.
+        >>> parallelize_module(
+        >>>     block, # this can be a submodule or module
+        >>>     tp_mesh,
+        >>>     parallelize_plan={
+        >>>         "attn": PrepareModuleInput(
+        >>>             input_layouts=(Shard(0), None, None, ...),
+        >>>             desired_input_layouts=(Replicate(), None, None, ...)
+        >>>         ),
+        >>>     }
+        >>> )
+    """
+
+    def __init__(
+        self,
+        *,
+        input_layouts: Union[Placement, Tuple[Placement]],
+        desired_input_layouts: Union[Placement, Tuple[Placement]],
+        use_local_output: bool = False
+    ):
+        self.input_layouts = (input_layouts,) if isinstance(input_layouts, Placement) else input_layouts
+        self.desired_input_layouts = \
+            (desired_input_layouts,) if isinstance(desired_input_layouts, Placement) else desired_input_layouts
+        self.use_local_output = use_local_output
+        assert len(self.input_layouts) == len(self.desired_input_layouts), \
+            "input_layouts and desired_input_layouts should have same length!"
+
+    def _prepare_input_fn(self, inputs, device_mesh):
+        prepared_inputs = []
+        if not isinstance(inputs, tuple):
+            inputs = (inputs,)
+        if len(inputs) != len(self.input_layouts):
+            raise ValueError("module inputs and input_layouts should have same length!")
+
+        for inp, input_layout, desired_layout in zip(inputs, self.input_layouts, self.desired_input_layouts):
+            if input_layout is not None:
+                if isinstance(inp, DTensor):
+                    # TODO: re-enable the check once we fix the compile path
+                    # assert inp.placements[0] == input_layout
+                    dt_inp = inp
+                else:
+                    dt_inp = DTensor.from_local(inp, device_mesh, (input_layout,), run_check=False)
+                if input_layout != desired_layout:
+                    dt_inp = dt_inp.redistribute(placements=(desired_layout,))
+                prepared_inputs.append(dt_inp.to_local() if self.use_local_output else dt_inp)
+            else:
+                prepared_inputs.append(inp)
+        return tuple(prepared_inputs)
+
+    def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module:
+        module.register_forward_pre_hook(lambda _, inputs: self._prepare_input_fn(inputs, device_mesh))  # type: ignore[misc, call-arg]
+        return module
+
+
+class PrepareModuleOutput(ParallelStyle):
+    """
+    Configure the nn.Module's outputs to convert the output tensors of the nn.Module to DTensors at runtime according to
+    ``output_layouts``, and perform layout redistribution according to the ``desired_output_layouts``.
+
+    Keyword Args:
+        output_layouts (Union[Placement, Tuple[Placement]]):
+            The DTensor layouts of the output tensors for the nn.Module; this is used to convert the output tensors to
+            DTensors if they are :class:`torch.Tensor`. If some outputs are not torch.Tensor or do not need to be converted
+            to DTensors, ``None`` needs to be specified as a placeholder.
+        desired_output_layouts (Union[Placement, Tuple[Placement]]):
+            The desired DTensor layouts of the output tensors for the nn.Module; this is used to ensure the outputs of the
+            nn.Module have the desired DTensor layouts.
+        use_local_output (bool, optional):
+            Whether to use local :class:`torch.Tensor` instead of :class:`DTensor` for the module outputs, default: True.
+    Returns:
+        A ParallelStyle object that prepares the sharding layouts of the nn.Module's outputs.
+
+    Example::
+        >>> # xdoctest: +SKIP(failing)
+        >>> from torch.distributed.tensor.parallel import parallelize_module, PrepareModuleOutput
+        >>> from torch.distributed.device_mesh import init_device_mesh
+        >>> ...
+        >>> block = TransformerBlock(...)  # block is a nn.Module that contains an "attn" Attention submodule
+        >>> tp_mesh = init_device_mesh("cuda", (8,))
+        >>>
+        >>> # According to the style specified below, the output of the TransformerBlock will be converted to Replicated DTensor
+        >>> # and then redistributed to Sharded DTensor.
+        >>> parallelize_module(
+        >>>     block, # this can be a submodule or module
+        >>>     tp_mesh,
+        >>>     parallelize_plan = PrepareModuleOutput(
+        >>>         output_layouts=Replicate(),
+        >>>         desired_output_layouts=Shard(0)
+        >>>     )
+        >>> )
+    """
+    def __init__(
+        self,
+        *,
+        output_layouts: Union[Placement, Tuple[Placement]],
+        desired_output_layouts: Union[Placement, Tuple[Placement]],
+        use_local_output: bool = True
+    ):
+        self.output_layouts = (output_layouts,) if isinstance(output_layouts, Placement) else output_layouts
+        self.desired_output_layouts = \
+            (desired_output_layouts,) if isinstance(desired_output_layouts, Placement) else desired_output_layouts
+        self.use_local_output = use_local_output
+        assert len(self.output_layouts) == len(self.desired_output_layouts), \
+            "output_layouts and desired_output_layouts should have same length!"
+
+    def _prepare_out_fn(self, outputs, device_mesh):
+        prepared_outputs = []
+        if not isinstance(outputs, tuple):
+            outputs = (outputs,)
+        if len(outputs) != len(self.output_layouts):
+            raise ValueError("module outputs and output_layouts should have same length!")
+        for out, out_layout, desired_out_layout in zip(outputs, self.output_layouts, self.desired_output_layouts):
+            if out_layout is not None:
+                if isinstance(out, DTensor):
+                    # TODO: re-enable the check once we fix the compile path
+                    # assert out.placements[0] == out_layout
+                    dt_out = out
+                else:
+                    dt_out = DTensor.from_local(out, device_mesh, (out_layout,), run_check=False)
+
+                if out_layout != desired_out_layout:
+                    dt_out = dt_out.redistribute(placements=(desired_out_layout,))
+                prepared_outputs.append(dt_out.to_local() if self.use_local_output else dt_out)
+            else:
+                prepared_outputs.append(out)
+        if len(prepared_outputs) == 1:
+            return prepared_outputs[0]
+        else:
+            return tuple(prepared_outputs)
+
+    def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module:
+        module.register_forward_hook(lambda _, inputs, outputs: self._prepare_out_fn(outputs, device_mesh))  # type: ignore[misc, call-arg]
+        return module
diff --git a/MLPY/Lib/site-packages/torch/distributed/utils.py b/MLPY/Lib/site-packages/torch/distributed/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..05c67357e9a3a5f7fed4349552fdf4b794de76d1
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributed/utils.py
@@ -0,0 +1,339 @@
+import dataclasses
+import traceback
+from typing import Any, Callable, Container, Dict, List, Optional, OrderedDict, Tuple, TypeVar, overload
+
+import torch
+import torch.distributed as dist
+from torch import nn
+from torch.nn.parallel._functions import _get_stream
+from torch.nn.parallel.scatter_gather import _is_namedtuple
+from torch.nn.utils.rnn import PackedSequence
+
+__all__ = []  # type: ignore[var-annotated]
+
+
+def _pack_kwargs(*args: Any, **kwargs: Any) -> Tuple[Tuple[Any, ...], Tuple[str, ...]]:
+    """
+    Turn argument list into separate key list and value list (unpack_kwargs does the opposite).
+
+    Inspiration: https://github.com/facebookresearch/fairscale/blob/eeb6684/fairscale/internal/containers.py#L70
+    Usage::
+
+        flat_args, kwarg_keys = _pack_kwargs(1, 2, a=3, b=4)
+        assert flat_args == (1, 2, 3, 4)
+        assert kwarg_keys == ("a", "b")
+        args, kwargs = _unpack_kwargs(flat_args, kwarg_keys)
+        assert args == (1, 2)
+        assert kwargs == {"a": 3, "b": 4}
+    Returns:
+        Tuple[Tuple[Any, ...], Tuple[str, ...]]: The first tuple element gives
+        both positional args and kwarg values, where the positional args
+        precede the kwarg values and the kwarg values are ordered consistently with the
+        kwarg keys. The second tuple element gives the kwarg keys.
+        The second tuple element's length is at most the first tuple element's length.
+    """
+    kwarg_keys: List[str] = []
+    flat_args: List[Any] = list(args)
+    for k, v in kwargs.items():
+        kwarg_keys.append(k)
+        flat_args.append(v)
+
+    return tuple(flat_args), tuple(kwarg_keys)
+
+def _cast_forward_inputs(
+    dtype: Optional[torch.dtype],
+    *args: Any,
+    **kwargs: Any,
+) -> Tuple[Any, Any]:
+    """
+    Cast floating point tensors in ``args`` and ``kwargs`` to ``dtype``.
+
+    This respects the existing ``requires_grad`` on the tensors.
+    """
+    if dtype is None:
+        return args, kwargs
+
+    def cast_fn(x: torch.Tensor) -> torch.Tensor:
+        if not torch.is_floating_point(x) or x.dtype == dtype:
+            return x
+        return x.to(dtype)
+
+    return (_apply_to_tensors(cast_fn, args), _apply_to_tensors(cast_fn, kwargs))
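+
+def _example_cast_forward_inputs() -> None:
+    # Illustrative sketch (not part of this module): mixed-precision wrappers
+    # typically cast incoming floating point activations before calling the
+    # wrapped module's forward, leaving non-float arguments untouched.
+    x = torch.randn(2, 2)
+    (cast_x,), kwargs = _cast_forward_inputs(torch.float16, x, flag=True)
+    assert cast_x.dtype == torch.float16
+    assert kwargs == {"flag": True}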
+
+def _unpack_kwargs(flat_args: Tuple[Any, ...], kwarg_keys: Tuple[str, ...]) -> Tuple[Tuple[Any, ...], Dict[str, Any]]:
+    """See _pack_kwargs."""
+    assert len(kwarg_keys) <= len(
+        flat_args
+    ), f"too many keys {len(kwarg_keys)} vs. {len(flat_args)}"
+    if len(kwarg_keys) == 0:
+        return flat_args, {}
+    args = flat_args[: -len(kwarg_keys)]
+    kwargs = dict(zip(kwarg_keys, flat_args[-len(kwarg_keys) :]))
+    return args, kwargs
+
+
+S = TypeVar("S", dict, list, tuple)
+T = TypeVar("T", torch.Tensor, PackedSequence)
+
+
+@overload
+def _recursive_to(inputs: S, target_device: torch.device, use_side_stream_for_tensor_copies: bool) -> List[S]:
+    ...
+
+
+@overload
+def _recursive_to(inputs: T, target_device: torch.device, use_side_stream_for_tensor_copies: bool) -> Tuple[T]:
+    ...
+
+
+def _recursive_to(inputs, target_device, use_side_stream_for_tensor_copies):
+    r"""Recursively moves input to the target_device."""
+
+    def to_map(obj):
+        if isinstance(obj, (torch.Tensor, PackedSequence)):
+            device = obj.data.device if isinstance(obj, PackedSequence) else obj.device
+            if device == target_device:
+                return (obj,)
+            if not use_side_stream_for_tensor_copies:
+                return (obj.to(target_device),)
+            else:
+                # If the device module is not registered with torch, streams are not used for acceleration
+                device_mod = getattr(torch, device.type, None)
+                if device.type == "cpu" or device_mod is None:
+                    return (obj.to(target_device),)
+                # Perform CPU -> target_device copies in a background stream. This code is
+                # motivated from similar logic in torch/nn/parallel/_functions.py
+                stream = _get_stream(target_device)
+                with device_mod.stream(stream):
+                    output = obj.to(target_device)
+                # synchronize with the copy stream
+                with device_mod.device(target_device.index):
+                    current_stream = device_mod.current_stream()
+                    # Sync the current stream with the copy stream
+                    current_stream.wait_stream(stream)
+                    # Ensure tensor memory is not reused until work on
+                    # main stream is complete
+                    if isinstance(obj, PackedSequence):
+                        output.data.record_stream(current_stream)  # type: ignore[arg-type]
+                    else:
+                        assert isinstance(output, torch.Tensor)
+                        output.record_stream(current_stream)  # type: ignore[arg-type]
+                return (output,)
+        if _is_namedtuple(obj):
+            return [type(obj)(*args) for args in zip(*map(to_map, obj))]
+        if isinstance(obj, tuple) and len(obj) > 0:
+            return list(zip(*map(to_map, obj)))
+        if isinstance(obj, list) and len(obj) > 0:
+            return [list(i) for i in zip(*map(to_map, obj))]
+        if isinstance(obj, dict) and len(obj) > 0:
+            return [type(obj)(i) for i in zip(*map(to_map, obj.items()))]
+        return [obj]
+
+    # Avoid reference cycle
+    try:
+        res = to_map(inputs)
+    finally:
+        to_map = None  # type: ignore[assignment]
+    return res
+
+
+def _p_assert(cond: Any, s: str, raise_assertion_error: bool = True) -> None:
+    """Alternate to ``assert`` when in the backward context to print the error message ``s`` since otherwise, it is swallowed."""
+    if not cond:
+        print(s)
+        traceback.print_stack()
+        if raise_assertion_error:
+            raise AssertionError(s)
+
+
+def _alloc_storage(tensor: torch.Tensor, size: torch.Size) -> None:
+    """
+    Allocate storage for ``tensor`` with the given size.
+
+    This is a no-op if the storage is already allocated to the requested size;
+    otherwise the (currently empty) storage is resized to ``size.numel()`` elements.
+    """
+    with torch.no_grad():
+        if (
+            not torch.distributed._functional_collectives.is_torchdynamo_compiling()
+        ):
+            already_allocated = tensor._typed_storage()._size() == size.numel()
+            if not already_allocated:
+                tensor_storage_size = tensor._typed_storage()._size()
+                _p_assert(
+                    tensor_storage_size == 0,
+                    "Tensor storage should have been resized to be 0 but got PLACEHOLDEr",
+                )
+                tensor._typed_storage()._resize_(size.numel())
+
+
+def _free_storage(tensor: torch.Tensor):
+    """
+    Frees the underlying storage of ``tensor``.
+
+    This is a no-op if the storage is already freed; otherwise the tensor must
+    start at storage offset 0 and its storage is resized to zero elements.
+    """
+    with torch.no_grad():
+        if (
+            not torch.distributed._functional_collectives.is_torchdynamo_compiling()
+        ):
+            already_freed = tensor._typed_storage()._size() == 0
+            if not already_freed:
+                _p_assert(
+                    tensor.storage_offset() == 0,
+                    "Freeing a tensor's storage is unsafe when it is not the sole occupant\n"
+                    f"storage offset: {tensor.storage_offset()}\n"
+                    f"storage size: {tensor._typed_storage()._size()}\n"
+                    f"tensor shape: {tensor.shape}",
+                )
+                tensor._typed_storage()._resize_(0)
+
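+def _example_storage_round_trip() -> None:
+    # Illustrative sketch (not part of this module): callers such as sharded
+    # flat-parameter code free a tensor's storage when it is not needed and
+    # re-allocate it before use. Both helpers above are no-ops when the storage
+    # is already in the requested state.
+    t = torch.empty(4)
+    size = t.size()
+    _free_storage(t)
+    assert t._typed_storage()._size() == 0
+    _alloc_storage(t, size)
+    assert t._typed_storage()._size() == size.numel()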
+
+
+Q = TypeVar("Q")
+R = TypeVar("R", dict, list, tuple, set, OrderedDict, PackedSequence, Any)
+
+
+@overload
+def _apply_to_tensors(fn: Callable[[torch.Tensor], Q], container: torch.Tensor) -> Q:
+    ...
+
+
+@overload
+def _apply_to_tensors(fn: Callable[[torch.Tensor], Any], container: R) -> R:
+    ...
+
+
+def _apply_to_tensors(fn, container):
+    """Recursively apply to all tensor in different kinds of container types."""
+
+    def apply(x):
+        if isinstance(x, torch.Tensor):
+            return fn(x)
+        elif hasattr(x, "__dataclass_fields__"):
+            dc = dataclasses.replace(x)
+            for f in dataclasses.fields(dc):
+                name = f.name
+                setattr(dc, name, apply(getattr(dc, name)))
+            return dc
+        elif isinstance(x, OrderedDict):
+            od = x.__class__()
+            for key, value in x.items():
+                od[key] = apply(value)
+            return od
+        elif isinstance(x, PackedSequence):
+            apply(x.data)
+            return x
+        elif isinstance(x, dict):
+            return {key: apply(value) for key, value in x.items()}
+        elif _is_namedtuple(x):
+            res = (apply(el) for el in x)
+            return type(x)(*res)
+        elif isinstance(x, (list, tuple, set)):
+            return type(x)(apply(el) for el in x)
+        else:
+            return x
+
+    return apply(container)
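+
+
+def _example_apply_to_tensors() -> None:
+    # Illustrative sketch (not part of this module): the traversal above keeps
+    # the container structure (dicts, lists, tuples, namedtuples, dataclasses,
+    # ...) intact and transforms only the tensors found inside it.
+    nested = {"a": [torch.zeros(2), 3], "b": (torch.ones(1),)}
+    doubled = _apply_to_tensors(lambda t: t * 2, nested)
+    assert torch.equal(doubled["a"][0], torch.zeros(2))  # 0 * 2 == 0
+    assert doubled["a"][1] == 3  # non-tensor leaves are returned unchanged
+    assert torch.equal(doubled["b"][0], torch.full((1,), 2.0))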
+
+
+def _to_kwargs(
+    inputs: Tuple[Any, ...],
+    kwargs: Optional[Dict[str, Any]],
+    target_device: torch.device,
+    use_side_stream_for_tensor_copies: bool,
+) -> Tuple[Tuple[Any, ...], Tuple[Dict[str, Any], ...]]:
+    moved_inputs = (
+        _recursive_to(inputs, target_device, use_side_stream_for_tensor_copies)
+        if inputs
+        else []
+    )
+    moved_kwargs = (
+        _recursive_to(kwargs, target_device, use_side_stream_for_tensor_copies)
+        if kwargs
+        else []
+    )
+    if len(moved_inputs) < len(moved_kwargs):
+        moved_inputs.extend([() for _ in range(len(moved_kwargs) - len(moved_inputs))])
+    elif len(moved_kwargs) < len(moved_inputs):
+        moved_kwargs.extend([{} for _ in range(len(moved_inputs) - len(moved_kwargs))])
+    return tuple(moved_inputs), tuple(moved_kwargs)
+
+
+def _verify_param_shape_across_processes(
+    process_group: dist.ProcessGroup, tensors: List[torch.Tensor], logger: Optional[dist.Logger] = None
+):
+    return dist._verify_params_across_processes(process_group, tensors, logger)
+
+
+def _sync_module_states(
+    module: nn.Module,
+    process_group: dist.ProcessGroup,
+    broadcast_bucket_size: int,
+    src: int,
+    params_and_buffers_to_ignore: Container[str],
+    broadcast_buffers: bool = True,
+) -> None:
+    """
+    Sync ``module``'s parameters and buffers.
+
+    Syncs ``module``'s parameters and buffers so that all ranks hold the same
+    module state. Note that this API assumes that all
+    parameter shapes are consistent before running the synchronization. This can
+    be checked with ``_verify_param_shape_across_processes``.
+    """
+    module_states: List[torch.Tensor] = []
+    for name, param in module.named_parameters():
+        if name not in params_and_buffers_to_ignore:
+            module_states.append(param.detach())
+
+    if broadcast_buffers:
+        for name, buffer in module.named_buffers():
+            if name not in params_and_buffers_to_ignore:
+                module_states.append(buffer.detach())
+
+    _sync_params_and_buffers(process_group, module_states, broadcast_bucket_size, src)
+
+
+def _sync_params_and_buffers(
+    process_group: dist.ProcessGroup,
+    module_states: List[torch.Tensor],
+    broadcast_bucket_size: int,
+    src: int,
+) -> None:
+    """Synchronize ``module_states`` (list of tensors) across all processes by broadcasting them from rank 0."""
+    if len(module_states) > 0:
+        dist._broadcast_coalesced(
+            process_group, module_states, broadcast_bucket_size, src
+        )
+
+
+def _replace_by_prefix(
+    state_dict: Dict[str, Any],
+    old_prefix: str,
+    new_prefix: str,
+) -> None:
+    """
+    Replace all keys that match a given old_prefix with a new_prefix (in-place).
+
+    Usage::
+
+        state_dict = {"layer.xyz": torch.tensor(1)}
+        replace_by_prefix_(state_dict, "layer.", "module.layer.")
+        assert state_dict == {"module.layer.xyz": torch.tensor(1)}
+    """
+    if old_prefix == new_prefix:
+        raise ValueError("old_prefix and new_prefix must be distinct")
+    for key in list(state_dict.keys()):
+        if not key.startswith(old_prefix):
+            continue
+        new_key = new_prefix + key[len(old_prefix) :]
+        state_dict[new_key] = state_dict[key]
+        del state_dict[key]
+
+
+def _data_ptr_allocated(tensor: torch.Tensor) -> bool:
+    return tensor.untyped_storage().data_ptr() > 0
diff --git a/MLPY/Lib/site-packages/torch/distributions/__init__.py b/MLPY/Lib/site-packages/torch/distributions/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..66c2c9a8f5fd2e01a4c52fe439081cd43529c3c9
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/__init__.py
@@ -0,0 +1,171 @@
+r"""
+The ``distributions`` package contains parameterizable probability distributions
+and sampling functions. This allows the construction of stochastic computation
+graphs and stochastic gradient estimators for optimization. This package
+generally follows the design of the `TensorFlow Distributions`_ package.
+
+.. _`TensorFlow Distributions`:
+    https://arxiv.org/abs/1711.10604
+
+It is not possible to directly backpropagate through random samples. However,
+there are two main methods for creating surrogate functions that can be
+backpropagated through. These are the score function estimator/likelihood ratio
+estimator/REINFORCE and the pathwise derivative estimator. REINFORCE is commonly
+seen as the basis for policy gradient methods in reinforcement learning, and the
+pathwise derivative estimator is commonly seen in the reparameterization trick
+in variational autoencoders. Whilst the score function only requires the value
+of samples :math:`f(x)`, the pathwise derivative requires the derivative
+:math:`f'(x)`. The next sections discuss these two in a reinforcement learning
+example. For more details see
+`Gradient Estimation Using Stochastic Computation Graphs`_ .
+
+.. _`Gradient Estimation Using Stochastic Computation Graphs`:
+     https://arxiv.org/abs/1506.05254
+
+Score function
+^^^^^^^^^^^^^^
+
+When the probability density function is differentiable with respect to its
+parameters, we only need :meth:`~torch.distributions.Distribution.sample` and
+:meth:`~torch.distributions.Distribution.log_prob` to implement REINFORCE:
+
+.. math::
+
+    \Delta\theta  = \alpha r \frac{\partial\log p(a|\pi^\theta(s))}{\partial\theta}
+
+where :math:`\theta` are the parameters, :math:`\alpha` is the learning rate,
+:math:`r` is the reward and :math:`p(a|\pi^\theta(s))` is the probability of
+taking action :math:`a` in state :math:`s` given policy :math:`\pi^\theta`.
+
+In practice we would sample an action from the output of a network, apply this
+action in an environment, and then use ``log_prob`` to construct an equivalent
+loss function. Note that we use a negative because optimizers use gradient
+descent, whilst the rule above assumes gradient ascent. With a categorical
+policy, the code for implementing REINFORCE would be as follows::
+
+    probs = policy_network(state)
+    # Note that this is equivalent to what used to be called multinomial
+    m = Categorical(probs)
+    action = m.sample()
+    next_state, reward = env.step(action)
+    loss = -m.log_prob(action) * reward
+    loss.backward()
+
+Pathwise derivative
+^^^^^^^^^^^^^^^^^^^
+
+The other way to implement these stochastic/policy gradients would be to use the
+reparameterization trick from the
+:meth:`~torch.distributions.Distribution.rsample` method, where the
+parameterized random variable can be constructed via a parameterized
+deterministic function of a parameter-free random variable. The reparameterized
+sample therefore becomes differentiable. The code for implementing the pathwise
+derivative would be as follows::
+
+    params = policy_network(state)
+    m = Normal(*params)
+    # Any distribution with .has_rsample == True could work based on the application
+    action = m.rsample()
+    next_state, reward = env.step(action)  # Assuming that reward is differentiable
+    loss = -reward
+    loss.backward()
+"""
+
+from .bernoulli import Bernoulli
+from .beta import Beta
+from .binomial import Binomial
+from .categorical import Categorical
+from .cauchy import Cauchy
+from .chi2 import Chi2
+from .constraint_registry import biject_to, transform_to
+from .continuous_bernoulli import ContinuousBernoulli
+from .dirichlet import Dirichlet
+from .distribution import Distribution
+from .exp_family import ExponentialFamily
+from .exponential import Exponential
+from .fishersnedecor import FisherSnedecor
+from .gamma import Gamma
+from .geometric import Geometric
+from .gumbel import Gumbel
+from .half_cauchy import HalfCauchy
+from .half_normal import HalfNormal
+from .independent import Independent
+from .inverse_gamma import InverseGamma
+from .kl import _add_kl_info, kl_divergence, register_kl
+from .kumaraswamy import Kumaraswamy
+from .laplace import Laplace
+from .lkj_cholesky import LKJCholesky
+from .log_normal import LogNormal
+from .logistic_normal import LogisticNormal
+from .lowrank_multivariate_normal import LowRankMultivariateNormal
+from .mixture_same_family import MixtureSameFamily
+from .multinomial import Multinomial
+from .multivariate_normal import MultivariateNormal
+from .negative_binomial import NegativeBinomial
+from .normal import Normal
+from .one_hot_categorical import OneHotCategorical, OneHotCategoricalStraightThrough
+from .pareto import Pareto
+from .poisson import Poisson
+from .relaxed_bernoulli import RelaxedBernoulli
+from .relaxed_categorical import RelaxedOneHotCategorical
+from .studentT import StudentT
+from .transformed_distribution import TransformedDistribution
+from .transforms import *  # noqa: F403
+from . import transforms
+from .uniform import Uniform
+from .von_mises import VonMises
+from .weibull import Weibull
+from .wishart import Wishart
+
+_add_kl_info()
+del _add_kl_info
+
+__all__ = [
+    "Bernoulli",
+    "Beta",
+    "Binomial",
+    "Categorical",
+    "Cauchy",
+    "Chi2",
+    "ContinuousBernoulli",
+    "Dirichlet",
+    "Distribution",
+    "Exponential",
+    "ExponentialFamily",
+    "FisherSnedecor",
+    "Gamma",
+    "Geometric",
+    "Gumbel",
+    "HalfCauchy",
+    "HalfNormal",
+    "Independent",
+    "InverseGamma",
+    "Kumaraswamy",
+    "LKJCholesky",
+    "Laplace",
+    "LogNormal",
+    "LogisticNormal",
+    "LowRankMultivariateNormal",
+    "MixtureSameFamily",
+    "Multinomial",
+    "MultivariateNormal",
+    "NegativeBinomial",
+    "Normal",
+    "OneHotCategorical",
+    "OneHotCategoricalStraightThrough",
+    "Pareto",
+    "RelaxedBernoulli",
+    "RelaxedOneHotCategorical",
+    "StudentT",
+    "Poisson",
+    "Uniform",
+    "VonMises",
+    "Weibull",
+    "Wishart",
+    "TransformedDistribution",
+    "biject_to",
+    "kl_divergence",
+    "register_kl",
+    "transform_to",
+]
+__all__.extend(transforms.__all__)
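+
+
+# --- Illustrative sketch (not part of the public API) --------------------------
+# A minimal, self-contained comparison of the two estimators described in the
+# module docstring above, using a Normal distribution whose mean is the learnable
+# parameter and a toy reward of ``-(x ** 2)``.
+def _example_gradient_estimators() -> None:
+    import torch  # local import; this module does not import torch at top level
+
+    # Score function / REINFORCE: the sample itself is non-differentiable, so the
+    # gradient flows through ``log_prob``.
+    mu = torch.ones(1, requires_grad=True)
+    dist = Normal(mu, 1.0)
+    action = dist.sample()
+    reward = -(action ** 2)  # toy reward, maximal at 0
+    (-dist.log_prob(action) * reward).sum().backward()
+    score_grad = mu.grad.clone()
+
+    # Pathwise derivative: ``rsample`` reparameterizes, so the gradient flows
+    # through the sample directly.
+    mu2 = torch.ones(1, requires_grad=True)
+    sample = Normal(mu2, 1.0).rsample()
+    (sample ** 2).sum().backward()  # loss = -reward
+    pathwise_grad = mu2.grad.clone()
+
+    # Both are stochastic estimates of the gradient of E[-reward] w.r.t. the mean.
+    assert score_grad.shape == pathwise_grad.shape == (1,)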
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6622ed0c298136dd98c8cb01f8d5379b8eb9daba
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/bernoulli.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/bernoulli.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3b2d2a0b23659b4f19d8c490e230a9a1635863b4
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/bernoulli.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/beta.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/beta.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..47b88a751c5f12357027de63c8091302fad51f3a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/beta.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/binomial.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/binomial.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3f2f4fd99f62427d8111b172b2a99b913ba3f2a0
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/binomial.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/categorical.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/categorical.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dfbc2c4ba56dd7e210a145501b362439b4c39aa7
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/categorical.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/cauchy.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/cauchy.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d14eb7f6a5ac6b396893fd4e0fb925da37642e2f
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/cauchy.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/chi2.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/chi2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1350f63bf19774ae7a91341c0a7b8079f1181fc9
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/chi2.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/constraint_registry.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/constraint_registry.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a4c80b0f77676e5ab94baf4461493fa8f679f9b5
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/constraint_registry.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/constraints.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/constraints.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d38704cd40b34eed5e51ea81fbac5c563d81aa10
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/constraints.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/continuous_bernoulli.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/continuous_bernoulli.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..517ab1cf0564f2c9d3b1a5a2a147f06c0ba3e497
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/continuous_bernoulli.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/dirichlet.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/dirichlet.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..180eda18fdf137f3bb930c2a3a2a0202bf66a871
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/dirichlet.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/distribution.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/distribution.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..15dcdfa00790b6be374d7239e3c67ee728fc791c
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/distribution.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/exp_family.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/exp_family.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..19ae9da6d5a89b02bf217e22115734a4f0d6517c
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/exp_family.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/exponential.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/exponential.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8aa5a49e7f9baddfbd36662d106551f656257b9e
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/exponential.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/fishersnedecor.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/fishersnedecor.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..19e6073e24f8c8b358739261dd3f9da01e098e90
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/fishersnedecor.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/gamma.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/gamma.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f2f8fc0f1d80ed8bda2720068c839a30c1053eb7
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/gamma.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/geometric.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/geometric.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4b6de6e6205596279bfc2de05973b329164493b9
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/geometric.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/gumbel.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/gumbel.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..48a0fa2bfa297a2764d06b0006efa99c82beefe9
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/gumbel.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/half_cauchy.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/half_cauchy.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..109c7434b55504493e8e7a670e60a8ad3551a692
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/half_cauchy.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/half_normal.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/half_normal.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..314ecfae63882443b70be2e8c64461570d2d772d
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/half_normal.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/independent.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/independent.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f22024469159ee2488092f95c63ea758625e6767
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/independent.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/inverse_gamma.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/inverse_gamma.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d95f72f5fb18cf7d28dc5d184a25fe449089e8ce
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/inverse_gamma.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/kl.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/kl.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..07e82ecd2fb02ed3c094c5dad3d51003242e55ea
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/kl.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/kumaraswamy.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/kumaraswamy.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e82b8194bc64e22c5cdfc070e023af7a0f9f15e2
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/kumaraswamy.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/laplace.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/laplace.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..166f92486106b38859bf3bd5ce86fe114c382f1f
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/laplace.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/lkj_cholesky.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/lkj_cholesky.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c9dc2b48b97d11097ee73cd81e75d49821a4aa9a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/lkj_cholesky.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/log_normal.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/log_normal.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..27f86105c9399b1fa8fb6b8ac563f00565177711
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/log_normal.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/logistic_normal.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/logistic_normal.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..787190086ef056318b3270edc5bae682d2d7744a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/logistic_normal.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/lowrank_multivariate_normal.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/lowrank_multivariate_normal.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8a8085a90479487e7234e9e10a11d35eaf1b1d4e
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/lowrank_multivariate_normal.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/mixture_same_family.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/mixture_same_family.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..65cafc1ec1e5364341cc547596543cdbdcc98927
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/mixture_same_family.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/multinomial.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/multinomial.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..777c98ea0e9aee8bea5e874af18545c9d55c1f21
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/multinomial.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/multivariate_normal.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/multivariate_normal.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b5f9522c63aed983277409063bee0acda40bd9e5
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/multivariate_normal.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/negative_binomial.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/negative_binomial.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..72cf0c0b702a89cc43301a4ecfbad012618c89f0
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/negative_binomial.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/normal.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/normal.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..14f7c5f6b76e72f943bc0699983921a4a3e603f0
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/normal.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/one_hot_categorical.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/one_hot_categorical.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..310fc17d7a1982d4b0b0df43be2635a403a6657f
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/one_hot_categorical.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/pareto.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/pareto.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4c466b9e939f3a9d722317babe78c83c20294dc5
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/pareto.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/poisson.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/poisson.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0f23195fbf4b0d9a806fe53ddeaaaea83bb2f0b7
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/poisson.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/relaxed_bernoulli.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/relaxed_bernoulli.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e5a7366fa7f4c7d8f8c8504f592911ffa3d33f45
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/relaxed_bernoulli.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/relaxed_categorical.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/relaxed_categorical.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2dde6bb4b9357c38c495aff682a48e4fbc0739cd
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/relaxed_categorical.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/studentT.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/studentT.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..15733f72e80c96a95cf1fafa029c318730694b85
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/studentT.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/transformed_distribution.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/transformed_distribution.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..095080ff2ca2ceee3b8f34f1b255c62948d0ac08
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/transformed_distribution.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/transforms.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/transforms.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1f70b4839a42c78776b1293faa4825e25ea70073
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/transforms.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/uniform.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/uniform.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..355c38d3db4c41c4038f99ed66e253e185c475d7
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/uniform.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..784d07c37df2892545e56cf93d5801a572353c14
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/von_mises.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/von_mises.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..17284650552d73f9a6fefeb1f593f8d5e93fd43a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/von_mises.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/weibull.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/weibull.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2ba30a5a45b2efce01083b728876b29fbb2042fd
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/weibull.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/__pycache__/wishart.cpython-39.pyc b/MLPY/Lib/site-packages/torch/distributions/__pycache__/wishart.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..da2fe2969d0b886defab55cf2ce6147da8115d1b
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/distributions/__pycache__/wishart.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/distributions/bernoulli.py b/MLPY/Lib/site-packages/torch/distributions/bernoulli.py
new file mode 100644
index 0000000000000000000000000000000000000000..479c6b9a56bc72a4e0e213783f8ec0738606f276
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/bernoulli.py
@@ -0,0 +1,130 @@
+from numbers import Number
+
+import torch
+from torch import nan
+from torch.distributions import constraints
+from torch.distributions.exp_family import ExponentialFamily
+from torch.distributions.utils import (
+    broadcast_all,
+    lazy_property,
+    logits_to_probs,
+    probs_to_logits,
+)
+from torch.nn.functional import binary_cross_entropy_with_logits
+
+__all__ = ["Bernoulli"]
+
+
+class Bernoulli(ExponentialFamily):
+    r"""
+    Creates a Bernoulli distribution parameterized by :attr:`probs`
+    or :attr:`logits` (but not both).
+
+    Samples are binary (0 or 1). They take the value `1` with probability `p`
+    and `0` with probability `1 - p`.
+
+    Example::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = Bernoulli(torch.tensor([0.3]))
+        >>> m.sample()  # 30% chance 1; 70% chance 0
+        tensor([ 0.])
+
+    Args:
+        probs (Number, Tensor): the probability of sampling `1`
+        logits (Number, Tensor): the log-odds of sampling `1`
+    """
+    arg_constraints = {"probs": constraints.unit_interval, "logits": constraints.real}
+    support = constraints.boolean
+    has_enumerate_support = True
+    _mean_carrier_measure = 0
+
+    def __init__(self, probs=None, logits=None, validate_args=None):
+        if (probs is None) == (logits is None):
+            raise ValueError(
+                "Either `probs` or `logits` must be specified, but not both."
+            )
+        if probs is not None:
+            is_scalar = isinstance(probs, Number)
+            (self.probs,) = broadcast_all(probs)
+        else:
+            is_scalar = isinstance(logits, Number)
+            (self.logits,) = broadcast_all(logits)
+        self._param = self.probs if probs is not None else self.logits
+        if is_scalar:
+            batch_shape = torch.Size()
+        else:
+            batch_shape = self._param.size()
+        super().__init__(batch_shape, validate_args=validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(Bernoulli, _instance)
+        batch_shape = torch.Size(batch_shape)
+        if "probs" in self.__dict__:
+            new.probs = self.probs.expand(batch_shape)
+            new._param = new.probs
+        if "logits" in self.__dict__:
+            new.logits = self.logits.expand(batch_shape)
+            new._param = new.logits
+        super(Bernoulli, new).__init__(batch_shape, validate_args=False)
+        new._validate_args = self._validate_args
+        return new
+
+    def _new(self, *args, **kwargs):
+        return self._param.new(*args, **kwargs)
+
+    @property
+    def mean(self):
+        return self.probs
+
+    @property
+    def mode(self):
+        mode = (self.probs >= 0.5).to(self.probs)
+        mode[self.probs == 0.5] = nan
+        return mode
+
+    @property
+    def variance(self):
+        return self.probs * (1 - self.probs)
+
+    @lazy_property
+    def logits(self):
+        return probs_to_logits(self.probs, is_binary=True)
+
+    @lazy_property
+    def probs(self):
+        return logits_to_probs(self.logits, is_binary=True)
+
+    @property
+    def param_shape(self):
+        return self._param.size()
+
+    def sample(self, sample_shape=torch.Size()):
+        shape = self._extended_shape(sample_shape)
+        with torch.no_grad():
+            return torch.bernoulli(self.probs.expand(shape))
+
+    def log_prob(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        logits, value = broadcast_all(self.logits, value)
+        return -binary_cross_entropy_with_logits(logits, value, reduction="none")
+
+    def entropy(self):
+        return binary_cross_entropy_with_logits(
+            self.logits, self.probs, reduction="none"
+        )
+
+    def enumerate_support(self, expand=True):
+        values = torch.arange(2, dtype=self._param.dtype, device=self._param.device)
+        values = values.view((-1,) + (1,) * len(self._batch_shape))
+        if expand:
+            values = values.expand((-1,) + self._batch_shape)
+        return values
+
+    @property
+    def _natural_params(self):
+        return (torch.logit(self.probs),)
+
+    def _log_normalizer(self, x):
+        return torch.log1p(torch.exp(x))
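A minimal usage sketch (not part of the vendored diff; it assumes the bundled torch package added above is importable) showing that the mutually exclusive `probs` and `logits` parameterizations of `Bernoulli` describe the same distribution, since logits = log(p / (1 - p)):

import torch
from torch.distributions import Bernoulli

p = torch.tensor([0.3])
d_probs = Bernoulli(probs=p)                         # parameterized by probability
d_logits = Bernoulli(logits=torch.log(p / (1 - p)))  # equivalent log-odds parameterization

x = torch.tensor([0.0, 1.0])
print(d_probs.log_prob(x))   # approximately tensor([-0.3567, -1.2040])
print(d_logits.log_prob(x))  # same values up to floating-point error
# Passing both probs and logits (or neither) raises ValueError, per __init__ above.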
diff --git a/MLPY/Lib/site-packages/torch/distributions/beta.py b/MLPY/Lib/site-packages/torch/distributions/beta.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7d8cc383f0e1eba5a83ad803c4503e11cf04b85
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/beta.py
@@ -0,0 +1,107 @@
+from numbers import Number, Real
+
+import torch
+from torch.distributions import constraints
+from torch.distributions.dirichlet import Dirichlet
+from torch.distributions.exp_family import ExponentialFamily
+from torch.distributions.utils import broadcast_all
+
+__all__ = ["Beta"]
+
+
+class Beta(ExponentialFamily):
+    r"""
+    Beta distribution parameterized by :attr:`concentration1` and :attr:`concentration0`.
+
+    Example::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = Beta(torch.tensor([0.5]), torch.tensor([0.5]))
+        >>> m.sample()  # Beta distributed with concentration1 and concentration0
+        tensor([ 0.1046])
+
+    Args:
+        concentration1 (float or Tensor): 1st concentration parameter of the distribution
+            (often referred to as alpha)
+        concentration0 (float or Tensor): 2nd concentration parameter of the distribution
+            (often referred to as beta)
+    """
+    arg_constraints = {
+        "concentration1": constraints.positive,
+        "concentration0": constraints.positive,
+    }
+    support = constraints.unit_interval
+    has_rsample = True
+
+    def __init__(self, concentration1, concentration0, validate_args=None):
+        if isinstance(concentration1, Real) and isinstance(concentration0, Real):
+            concentration1_concentration0 = torch.tensor(
+                [float(concentration1), float(concentration0)]
+            )
+        else:
+            concentration1, concentration0 = broadcast_all(
+                concentration1, concentration0
+            )
+            concentration1_concentration0 = torch.stack(
+                [concentration1, concentration0], -1
+            )
+        self._dirichlet = Dirichlet(
+            concentration1_concentration0, validate_args=validate_args
+        )
+        super().__init__(self._dirichlet._batch_shape, validate_args=validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(Beta, _instance)
+        batch_shape = torch.Size(batch_shape)
+        new._dirichlet = self._dirichlet.expand(batch_shape)
+        super(Beta, new).__init__(batch_shape, validate_args=False)
+        new._validate_args = self._validate_args
+        return new
+
+    @property
+    def mean(self):
+        return self.concentration1 / (self.concentration1 + self.concentration0)
+
+    @property
+    def mode(self):
+        return self._dirichlet.mode[..., 0]
+
+    @property
+    def variance(self):
+        total = self.concentration1 + self.concentration0
+        return self.concentration1 * self.concentration0 / (total.pow(2) * (total + 1))
+
+    def rsample(self, sample_shape=()):
+        return self._dirichlet.rsample(sample_shape).select(-1, 0)
+
+    def log_prob(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        heads_tails = torch.stack([value, 1.0 - value], -1)
+        return self._dirichlet.log_prob(heads_tails)
+
+    def entropy(self):
+        return self._dirichlet.entropy()
+
+    @property
+    def concentration1(self):
+        result = self._dirichlet.concentration[..., 0]
+        if isinstance(result, Number):
+            return torch.tensor([result])
+        else:
+            return result
+
+    @property
+    def concentration0(self):
+        result = self._dirichlet.concentration[..., 1]
+        if isinstance(result, Number):
+            return torch.tensor([result])
+        else:
+            return result
+
+    @property
+    def _natural_params(self):
+        return (self.concentration1, self.concentration0)
+
+    def _log_normalizer(self, x, y):
+        return torch.lgamma(x) + torch.lgamma(y) - torch.lgamma(x + y)
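A short sketch (again assuming the bundled torch is importable) of the closed-form moments that the implementation above derives from its internal Dirichlet:

import torch
from torch.distributions import Beta

m = Beta(torch.tensor([2.0]), torch.tensor([5.0]))   # concentration1=2, concentration0=5
samples = m.sample((100_000,))

print(m.mean)          # 2 / (2 + 5), roughly 0.2857
print(samples.mean())  # empirical mean, close to m.mean
print(m.variance)      # 2 * 5 / ((2 + 5)**2 * (2 + 5 + 1)), roughly 0.0255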
diff --git a/MLPY/Lib/site-packages/torch/distributions/binomial.py b/MLPY/Lib/site-packages/torch/distributions/binomial.py
new file mode 100644
index 0000000000000000000000000000000000000000..847c6779770bd1481798707ac8dd8ed247bd5b02
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/binomial.py
@@ -0,0 +1,165 @@
+import torch
+from torch.distributions import constraints
+from torch.distributions.distribution import Distribution
+from torch.distributions.utils import (
+    broadcast_all,
+    lazy_property,
+    logits_to_probs,
+    probs_to_logits,
+)
+
+__all__ = ["Binomial"]
+
+
+def _clamp_by_zero(x):
+    # works like clamp(x, min=0) but has gradient 0.5 at 0
+    return (x.clamp(min=0) + x - x.clamp(max=0)) / 2
+
+
+class Binomial(Distribution):
+    r"""
+    Creates a Binomial distribution parameterized by :attr:`total_count` and
+    either :attr:`probs` or :attr:`logits` (but not both). :attr:`total_count` must be
+    broadcastable with :attr:`probs`/:attr:`logits`.
+
+    Example::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = Binomial(100, torch.tensor([0., .2, .8, 1.]))
+        >>> m.sample()
+        tensor([   0.,   22.,   71.,  100.])
+
+        >>> m = Binomial(torch.tensor([[5.], [10.]]), torch.tensor([0.5, 0.8]))
+        >>> m.sample()
+        tensor([[ 4.,  5.],
+                [ 7.,  6.]])
+
+    Args:
+        total_count (int or Tensor): number of Bernoulli trials
+        probs (Tensor): Event probabilities
+        logits (Tensor): Event log-odds
+    """
+    arg_constraints = {
+        "total_count": constraints.nonnegative_integer,
+        "probs": constraints.unit_interval,
+        "logits": constraints.real,
+    }
+    has_enumerate_support = True
+
+    def __init__(self, total_count=1, probs=None, logits=None, validate_args=None):
+        if (probs is None) == (logits is None):
+            raise ValueError(
+                "Either `probs` or `logits` must be specified, but not both."
+            )
+        if probs is not None:
+            (
+                self.total_count,
+                self.probs,
+            ) = broadcast_all(total_count, probs)
+            self.total_count = self.total_count.type_as(self.probs)
+        else:
+            (
+                self.total_count,
+                self.logits,
+            ) = broadcast_all(total_count, logits)
+            self.total_count = self.total_count.type_as(self.logits)
+
+        self._param = self.probs if probs is not None else self.logits
+        batch_shape = self._param.size()
+        super().__init__(batch_shape, validate_args=validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(Binomial, _instance)
+        batch_shape = torch.Size(batch_shape)
+        new.total_count = self.total_count.expand(batch_shape)
+        if "probs" in self.__dict__:
+            new.probs = self.probs.expand(batch_shape)
+            new._param = new.probs
+        if "logits" in self.__dict__:
+            new.logits = self.logits.expand(batch_shape)
+            new._param = new.logits
+        super(Binomial, new).__init__(batch_shape, validate_args=False)
+        new._validate_args = self._validate_args
+        return new
+
+    def _new(self, *args, **kwargs):
+        return self._param.new(*args, **kwargs)
+
+    @constraints.dependent_property(is_discrete=True, event_dim=0)
+    def support(self):
+        return constraints.integer_interval(0, self.total_count)
+
+    @property
+    def mean(self):
+        return self.total_count * self.probs
+
+    @property
+    def mode(self):
+        return ((self.total_count + 1) * self.probs).floor().clamp(max=self.total_count)
+
+    @property
+    def variance(self):
+        return self.total_count * self.probs * (1 - self.probs)
+
+    @lazy_property
+    def logits(self):
+        return probs_to_logits(self.probs, is_binary=True)
+
+    @lazy_property
+    def probs(self):
+        return logits_to_probs(self.logits, is_binary=True)
+
+    @property
+    def param_shape(self):
+        return self._param.size()
+
+    def sample(self, sample_shape=torch.Size()):
+        shape = self._extended_shape(sample_shape)
+        with torch.no_grad():
+            return torch.binomial(
+                self.total_count.expand(shape), self.probs.expand(shape)
+            )
+
+    def log_prob(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        log_factorial_n = torch.lgamma(self.total_count + 1)
+        log_factorial_k = torch.lgamma(value + 1)
+        log_factorial_nmk = torch.lgamma(self.total_count - value + 1)
+        # k * log(p) + (n - k) * log(1 - p) = k * (log(p) - log(1 - p)) + n * log(1 - p)
+        #     (case logit < 0)              = k * logit - n * log1p(e^logit)
+        #     (case logit > 0)              = k * logit - n * (log(p) - log(1 - p)) + n * log(p)
+        #                                   = k * logit - n * logit - n * log1p(e^-logit)
+        #     (merge two cases)             = k * logit - n * max(logit, 0) - n * log1p(e^-|logit|)
+        normalize_term = (
+            self.total_count * _clamp_by_zero(self.logits)
+            + self.total_count * torch.log1p(torch.exp(-torch.abs(self.logits)))
+            - log_factorial_n
+        )
+        return (
+            value * self.logits - log_factorial_k - log_factorial_nmk - normalize_term
+        )
+
+    def entropy(self):
+        total_count = int(self.total_count.max())
+        if not self.total_count.min() == total_count:
+            raise NotImplementedError(
+                "Inhomogeneous total count not supported by `entropy`."
+            )
+
+        log_prob = self.log_prob(self.enumerate_support(False))
+        return -(torch.exp(log_prob) * log_prob).sum(0)
+
+    def enumerate_support(self, expand=True):
+        total_count = int(self.total_count.max())
+        if not self.total_count.min() == total_count:
+            raise NotImplementedError(
+                "Inhomogeneous total count not supported by `enumerate_support`."
+            )
+        values = torch.arange(
+            1 + total_count, dtype=self._param.dtype, device=self._param.device
+        )
+        values = values.view((-1,) + (1,) * len(self._batch_shape))
+        if expand:
+            values = values.expand((-1,) + self._batch_shape)
+        return values
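A small sketch (assuming the bundled torch is importable) checking that the logit-based `log_prob` above defines a normalized pmf over `enumerate_support`:

import torch
from torch.distributions import Binomial

m = Binomial(total_count=5, probs=torch.tensor(0.3))
support = m.enumerate_support()   # tensor([0., 1., 2., 3., 4., 5.])
pmf = m.log_prob(support).exp()
print(pmf.sum())                  # ~1.0: the pmf sums to one
print(m.mean, m.variance)         # n*p = 1.5 and n*p*(1-p) = 1.05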
diff --git a/MLPY/Lib/site-packages/torch/distributions/categorical.py b/MLPY/Lib/site-packages/torch/distributions/categorical.py
new file mode 100644
index 0000000000000000000000000000000000000000..1dbe7d6df01a5ce75ba6e5330c82bcaf5250d795
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/categorical.py
@@ -0,0 +1,155 @@
+import torch
+from torch import nan
+from torch.distributions import constraints
+from torch.distributions.distribution import Distribution
+from torch.distributions.utils import lazy_property, logits_to_probs, probs_to_logits
+
+__all__ = ["Categorical"]
+
+
+class Categorical(Distribution):
+    r"""
+    Creates a categorical distribution parameterized by either :attr:`probs` or
+    :attr:`logits` (but not both).
+
+    .. note::
+        It is equivalent to the distribution that :func:`torch.multinomial`
+        samples from.
+
+    Samples are integers from :math:`\{0, \ldots, K-1\}` where `K` is ``probs.size(-1)``.
+
+    If `probs` is 1-dimensional with length `K`, each element is the relative probability
+    of sampling the class at that index.
+
+    If `probs` is N-dimensional, the first N-1 dimensions are treated as a batch of
+    relative probability vectors.
+
+    .. note:: The `probs` argument must be non-negative, finite and have a non-zero sum,
+              and it will be normalized to sum to 1 along the last dimension. :attr:`probs`
+              will return this normalized value.
+              The `logits` argument will be interpreted as unnormalized log probabilities
+              and can therefore be any real number. It will likewise be normalized so that
+              the resulting probabilities sum to 1 along the last dimension. :attr:`logits`
+              will return this normalized value.
+
+    See also: :func:`torch.multinomial`
+
+    Example::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = Categorical(torch.tensor([ 0.25, 0.25, 0.25, 0.25 ]))
+        >>> m.sample()  # equal probability of 0, 1, 2, 3
+        tensor(3)
+
+    Args:
+        probs (Tensor): event probabilities
+        logits (Tensor): event log probabilities (unnormalized)
+    """
+    arg_constraints = {"probs": constraints.simplex, "logits": constraints.real_vector}
+    has_enumerate_support = True
+
+    def __init__(self, probs=None, logits=None, validate_args=None):
+        if (probs is None) == (logits is None):
+            raise ValueError(
+                "Either `probs` or `logits` must be specified, but not both."
+            )
+        if probs is not None:
+            if probs.dim() < 1:
+                raise ValueError("`probs` parameter must be at least one-dimensional.")
+            self.probs = probs / probs.sum(-1, keepdim=True)
+        else:
+            if logits.dim() < 1:
+                raise ValueError("`logits` parameter must be at least one-dimensional.")
+            # Normalize
+            self.logits = logits - logits.logsumexp(dim=-1, keepdim=True)
+        self._param = self.probs if probs is not None else self.logits
+        self._num_events = self._param.size()[-1]
+        batch_shape = (
+            self._param.size()[:-1] if self._param.ndimension() > 1 else torch.Size()
+        )
+        super().__init__(batch_shape, validate_args=validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(Categorical, _instance)
+        batch_shape = torch.Size(batch_shape)
+        param_shape = batch_shape + torch.Size((self._num_events,))
+        if "probs" in self.__dict__:
+            new.probs = self.probs.expand(param_shape)
+            new._param = new.probs
+        if "logits" in self.__dict__:
+            new.logits = self.logits.expand(param_shape)
+            new._param = new.logits
+        new._num_events = self._num_events
+        super(Categorical, new).__init__(batch_shape, validate_args=False)
+        new._validate_args = self._validate_args
+        return new
+
+    def _new(self, *args, **kwargs):
+        return self._param.new(*args, **kwargs)
+
+    @constraints.dependent_property(is_discrete=True, event_dim=0)
+    def support(self):
+        return constraints.integer_interval(0, self._num_events - 1)
+
+    @lazy_property
+    def logits(self):
+        return probs_to_logits(self.probs)
+
+    @lazy_property
+    def probs(self):
+        return logits_to_probs(self.logits)
+
+    @property
+    def param_shape(self):
+        return self._param.size()
+
+    @property
+    def mean(self):
+        return torch.full(
+            self._extended_shape(),
+            nan,
+            dtype=self.probs.dtype,
+            device=self.probs.device,
+        )
+
+    @property
+    def mode(self):
+        return self.probs.argmax(axis=-1)
+
+    @property
+    def variance(self):
+        return torch.full(
+            self._extended_shape(),
+            nan,
+            dtype=self.probs.dtype,
+            device=self.probs.device,
+        )
+
+    def sample(self, sample_shape=torch.Size()):
+        if not isinstance(sample_shape, torch.Size):
+            sample_shape = torch.Size(sample_shape)
+        probs_2d = self.probs.reshape(-1, self._num_events)
+        samples_2d = torch.multinomial(probs_2d, sample_shape.numel(), True).T
+        return samples_2d.reshape(self._extended_shape(sample_shape))
+
+    def log_prob(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        value = value.long().unsqueeze(-1)
+        value, log_pmf = torch.broadcast_tensors(value, self.logits)
+        value = value[..., :1]
+        return log_pmf.gather(-1, value).squeeze(-1)
+
+    def entropy(self):
+        min_real = torch.finfo(self.logits.dtype).min
+        logits = torch.clamp(self.logits, min=min_real)
+        p_log_p = logits * self.probs
+        return -p_log_p.sum(-1)
+
+    def enumerate_support(self, expand=True):
+        num_events = self._num_events
+        values = torch.arange(num_events, dtype=torch.long, device=self._param.device)
+        values = values.view((-1,) + (1,) * len(self._batch_shape))
+        if expand:
+            values = values.expand((-1,) + self._batch_shape)
+        return values
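A brief sketch (assuming the bundled torch is importable) of how `Categorical` normalizes unnormalized weights and handles batched parameters:

import torch
from torch.distributions import Categorical

weights = torch.tensor([[1.0, 1.0, 2.0],
                        [4.0, 1.0, 1.0]])
m = Categorical(probs=weights)           # each row is normalized to sum to 1
print(m.probs)                           # tensor([[0.2500, 0.2500, 0.5000], [0.6667, 0.1667, 0.1667]])
print(m.batch_shape)                     # torch.Size([2]); samples are indices in {0, 1, 2}
print(m.sample((5,)).shape)              # torch.Size([5, 2])
print(m.log_prob(torch.tensor([2, 0])))  # one log-probability per batch row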
diff --git a/MLPY/Lib/site-packages/torch/distributions/cauchy.py b/MLPY/Lib/site-packages/torch/distributions/cauchy.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3ce3016066928a7a030face80733d0b6386b600
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/cauchy.py
@@ -0,0 +1,90 @@
+import math
+from numbers import Number
+
+import torch
+from torch import inf, nan
+from torch.distributions import constraints
+from torch.distributions.distribution import Distribution
+from torch.distributions.utils import broadcast_all
+
+__all__ = ["Cauchy"]
+
+
+class Cauchy(Distribution):
+    r"""
+    Samples from a Cauchy (Lorentz) distribution. The ratio of two independent
+    normally distributed random variables with mean `0` follows a Cauchy
+    distribution.
+
+    Example::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = Cauchy(torch.tensor([0.0]), torch.tensor([1.0]))
+        >>> m.sample()  # sample from a Cauchy distribution with loc=0 and scale=1
+        tensor([ 2.3214])
+
+    Args:
+        loc (float or Tensor): mode or median of the distribution.
+        scale (float or Tensor): half width at half maximum.
+    """
+    arg_constraints = {"loc": constraints.real, "scale": constraints.positive}
+    support = constraints.real
+    has_rsample = True
+
+    def __init__(self, loc, scale, validate_args=None):
+        self.loc, self.scale = broadcast_all(loc, scale)
+        if isinstance(loc, Number) and isinstance(scale, Number):
+            batch_shape = torch.Size()
+        else:
+            batch_shape = self.loc.size()
+        super().__init__(batch_shape, validate_args=validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(Cauchy, _instance)
+        batch_shape = torch.Size(batch_shape)
+        new.loc = self.loc.expand(batch_shape)
+        new.scale = self.scale.expand(batch_shape)
+        super(Cauchy, new).__init__(batch_shape, validate_args=False)
+        new._validate_args = self._validate_args
+        return new
+
+    @property
+    def mean(self):
+        return torch.full(
+            self._extended_shape(), nan, dtype=self.loc.dtype, device=self.loc.device
+        )
+
+    @property
+    def mode(self):
+        return self.loc
+
+    @property
+    def variance(self):
+        return torch.full(
+            self._extended_shape(), inf, dtype=self.loc.dtype, device=self.loc.device
+        )
+
+    def rsample(self, sample_shape=torch.Size()):
+        shape = self._extended_shape(sample_shape)
+        eps = self.loc.new(shape).cauchy_()
+        return self.loc + eps * self.scale
+
+    def log_prob(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        return (
+            -math.log(math.pi)
+            - self.scale.log()
+            - (((value - self.loc) / self.scale) ** 2).log1p()
+        )
+
+    def cdf(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        return torch.atan((value - self.loc) / self.scale) / math.pi + 0.5
+
+    def icdf(self, value):
+        return torch.tan(math.pi * (value - 0.5)) * self.scale + self.loc
+
+    def entropy(self):
+        return math.log(4 * math.pi) + self.scale.log()
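A small sketch (assuming the bundled torch is importable) showing that `icdf` inverts `cdf`, and that the heavy tails leave the mean and variance undefined:

import torch
from torch.distributions import Cauchy

m = Cauchy(loc=torch.tensor([0.0]), scale=torch.tensor([1.0]))
x = torch.tensor([-2.0, 0.0, 3.0])
print(m.icdf(m.cdf(x)))  # recovers x up to floating-point error
print(m.mean)            # tensor([nan]): the Cauchy distribution has no mean
print(m.variance)        # tensor([inf])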
diff --git a/MLPY/Lib/site-packages/torch/distributions/chi2.py b/MLPY/Lib/site-packages/torch/distributions/chi2.py
new file mode 100644
index 0000000000000000000000000000000000000000..8978cd242e9ad43782f1420c5f1b7bc04f6bf0e7
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/chi2.py
@@ -0,0 +1,33 @@
+from torch.distributions import constraints
+from torch.distributions.gamma import Gamma
+
+__all__ = ["Chi2"]
+
+
+class Chi2(Gamma):
+    r"""
+    Creates a Chi-squared distribution parameterized by shape parameter :attr:`df`.
+    This is exactly equivalent to ``Gamma(alpha=0.5*df, beta=0.5)``.
+
+    Example::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = Chi2(torch.tensor([1.0]))
+        >>> m.sample()  # Chi2 distributed with shape df=1
+        tensor([ 0.1046])
+
+    Args:
+        df (float or Tensor): shape parameter of the distribution
+    """
+    arg_constraints = {"df": constraints.positive}
+
+    def __init__(self, df, validate_args=None):
+        super().__init__(0.5 * df, 0.5, validate_args=validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(Chi2, _instance)
+        return super().expand(batch_shape, new)
+
+    @property
+    def df(self):
+        return self.concentration * 2
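A minimal sketch (assuming the bundled torch is importable) of the Gamma equivalence stated in the docstring above:

import torch
from torch.distributions import Chi2, Gamma

m = Chi2(torch.tensor([4.0]))
g = Gamma(torch.tensor([2.0]), torch.tensor([0.5]))  # concentration = 0.5*df, rate = 0.5

x = torch.tensor([1.0, 3.0, 10.0])
print(m.log_prob(x))  # identical to g.log_prob(x)
print(g.log_prob(x))
print(m.df)           # tensor([4.]): recovered as concentration * 2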
diff --git a/MLPY/Lib/site-packages/torch/distributions/constraint_registry.py b/MLPY/Lib/site-packages/torch/distributions/constraint_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..f86b6dead2666bb1426200d55900d6135bdde768
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/constraint_registry.py
@@ -0,0 +1,292 @@
+r"""
+PyTorch provides two global :class:`ConstraintRegistry` objects that link
+:class:`~torch.distributions.constraints.Constraint` objects to
+:class:`~torch.distributions.transforms.Transform` objects. These objects both
+input constraints and return transforms, but they have different guarantees on
+bijectivity.
+
+1. ``biject_to(constraint)`` looks up a bijective
+   :class:`~torch.distributions.transforms.Transform` from ``constraints.real``
+   to the given ``constraint``. The returned transform is guaranteed to have
+   ``.bijective = True`` and should implement ``.log_abs_det_jacobian()``.
+2. ``transform_to(constraint)`` looks up a not-necessarily bijective
+   :class:`~torch.distributions.transforms.Transform` from ``constraints.real``
+   to the given ``constraint``. The returned transform is not guaranteed to
+   implement ``.log_abs_det_jacobian()``.
+
+The ``transform_to()`` registry is useful for performing unconstrained
+optimization on constrained parameters of probability distributions, which are
+indicated by each distribution's ``.arg_constraints`` dict. These transforms often
+overparameterize a space in order to avoid rotation; they are thus more
+suitable for coordinate-wise optimization algorithms like Adam::
+
+    loc = torch.zeros(100, requires_grad=True)
+    unconstrained = torch.zeros(100, requires_grad=True)
+    scale = transform_to(Normal.arg_constraints['scale'])(unconstrained)
+    loss = -Normal(loc, scale).log_prob(data).sum()
+
+The ``biject_to()`` registry is useful for Hamiltonian Monte Carlo, where
+samples from a probability distribution with constrained ``.support`` are
+propagated in an unconstrained space, and algorithms are typically rotation
+invariant.::
+
+    dist = Exponential(rate)
+    unconstrained = torch.zeros(100, requires_grad=True)
+    sample = biject_to(dist.support)(unconstrained)
+    potential_energy = -dist.log_prob(sample).sum()
+
+.. note::
+
+    An example where ``transform_to`` and ``biject_to`` differ is
+    ``constraints.simplex``: ``transform_to(constraints.simplex)`` returns a
+    :class:`~torch.distributions.transforms.SoftmaxTransform` that simply
+    exponentiates and normalizes its inputs; this is a cheap and mostly
+    coordinate-wise operation appropriate for algorithms like SVI. In
+    contrast, ``biject_to(constraints.simplex)`` returns a
+    :class:`~torch.distributions.transforms.StickBreakingTransform` that
+    bijects its input down to a one-fewer-dimensional space; this a more
+    expensive less numerically stable transform but is needed for algorithms
+    like HMC.
+
+The ``biject_to`` and ``transform_to`` objects can be extended by user-defined
+constraints and transforms using their ``.register()`` method either as a
+function on singleton constraints::
+
+    transform_to.register(my_constraint, my_transform)
+
+or as a decorator on parameterized constraints::
+
+    @transform_to.register(MyConstraintClass)
+    def my_factory(constraint):
+        assert isinstance(constraint, MyConstraintClass)
+        return MyTransform(constraint.param1, constraint.param2)
+
+You can create your own registry by creating a new :class:`ConstraintRegistry`
+object.
+"""
+
+import numbers
+
+from torch.distributions import constraints, transforms
+
+__all__ = [
+    "ConstraintRegistry",
+    "biject_to",
+    "transform_to",
+]
+
+
+class ConstraintRegistry:
+    """
+    Registry to link constraints to transforms.
+    """
+
+    def __init__(self):
+        self._registry = {}
+        super().__init__()
+
+    def register(self, constraint, factory=None):
+        """
+        Registers a :class:`~torch.distributions.constraints.Constraint`
+        subclass in this registry. Usage::
+
+            @my_registry.register(MyConstraintClass)
+            def construct_transform(constraint):
+                assert isinstance(constraint, MyConstraint)
+                return MyTransform(constraint.arg_constraints)
+
+        Args:
+            constraint (subclass of :class:`~torch.distributions.constraints.Constraint`):
+                A subclass of :class:`~torch.distributions.constraints.Constraint`, or
+                a singleton object of the desired class.
+            factory (Callable): A callable that inputs a constraint object and returns
+                a :class:`~torch.distributions.transforms.Transform` object.
+        """
+        # Support use as decorator.
+        if factory is None:
+            return lambda factory: self.register(constraint, factory)
+
+        # Support calling on singleton instances.
+        if isinstance(constraint, constraints.Constraint):
+            constraint = type(constraint)
+
+        if not isinstance(constraint, type) or not issubclass(
+            constraint, constraints.Constraint
+        ):
+            raise TypeError(
+                f"Expected constraint to be either a Constraint subclass or instance, but got {constraint}"
+            )
+
+        self._registry[constraint] = factory
+        return factory
+
+    def __call__(self, constraint):
+        """
+        Looks up a transform to constrained space, given a constraint object.
+        Usage::
+
+            constraint = Normal.arg_constraints['scale']
+            scale = transform_to(constraint)(torch.zeros(1))  # constrained
+            u = transform_to(constraint).inv(scale)           # unconstrained
+
+        Args:
+            constraint (:class:`~torch.distributions.constraints.Constraint`):
+                A constraint object.
+
+        Returns:
+            A :class:`~torch.distributions.transforms.Transform` object.
+
+        Raises:
+            `NotImplementedError` if no transform has been registered.
+        """
+        # Look up by Constraint subclass.
+        try:
+            factory = self._registry[type(constraint)]
+        except KeyError:
+            raise NotImplementedError(
+                f"Cannot transform {type(constraint).__name__} constraints"
+            ) from None
+        return factory(constraint)
+
+
+biject_to = ConstraintRegistry()
+transform_to = ConstraintRegistry()
+
+
+################################################################################
+# Registration Table
+################################################################################
+
+
+@biject_to.register(constraints.real)
+@transform_to.register(constraints.real)
+def _transform_to_real(constraint):
+    return transforms.identity_transform
+
+
+@biject_to.register(constraints.independent)
+def _biject_to_independent(constraint):
+    base_transform = biject_to(constraint.base_constraint)
+    return transforms.IndependentTransform(
+        base_transform, constraint.reinterpreted_batch_ndims
+    )
+
+
+@transform_to.register(constraints.independent)
+def _transform_to_independent(constraint):
+    base_transform = transform_to(constraint.base_constraint)
+    return transforms.IndependentTransform(
+        base_transform, constraint.reinterpreted_batch_ndims
+    )
+
+
+@biject_to.register(constraints.positive)
+@biject_to.register(constraints.nonnegative)
+@transform_to.register(constraints.positive)
+@transform_to.register(constraints.nonnegative)
+def _transform_to_positive(constraint):
+    return transforms.ExpTransform()
+
+
+@biject_to.register(constraints.greater_than)
+@biject_to.register(constraints.greater_than_eq)
+@transform_to.register(constraints.greater_than)
+@transform_to.register(constraints.greater_than_eq)
+def _transform_to_greater_than(constraint):
+    return transforms.ComposeTransform(
+        [
+            transforms.ExpTransform(),
+            transforms.AffineTransform(constraint.lower_bound, 1),
+        ]
+    )
+
+
+@biject_to.register(constraints.less_than)
+@transform_to.register(constraints.less_than)
+def _transform_to_less_than(constraint):
+    return transforms.ComposeTransform(
+        [
+            transforms.ExpTransform(),
+            transforms.AffineTransform(constraint.upper_bound, -1),
+        ]
+    )
+
+
+@biject_to.register(constraints.interval)
+@biject_to.register(constraints.half_open_interval)
+@transform_to.register(constraints.interval)
+@transform_to.register(constraints.half_open_interval)
+def _transform_to_interval(constraint):
+    # Handle the special case of the unit interval.
+    lower_is_0 = (
+        isinstance(constraint.lower_bound, numbers.Number)
+        and constraint.lower_bound == 0
+    )
+    upper_is_1 = (
+        isinstance(constraint.upper_bound, numbers.Number)
+        and constraint.upper_bound == 1
+    )
+    if lower_is_0 and upper_is_1:
+        return transforms.SigmoidTransform()
+
+    loc = constraint.lower_bound
+    scale = constraint.upper_bound - constraint.lower_bound
+    return transforms.ComposeTransform(
+        [transforms.SigmoidTransform(), transforms.AffineTransform(loc, scale)]
+    )
+
+
+@biject_to.register(constraints.simplex)
+def _biject_to_simplex(constraint):
+    return transforms.StickBreakingTransform()
+
+
+@transform_to.register(constraints.simplex)
+def _transform_to_simplex(constraint):
+    return transforms.SoftmaxTransform()
+
+
+# TODO define a bijection for LowerCholeskyTransform
+@transform_to.register(constraints.lower_cholesky)
+def _transform_to_lower_cholesky(constraint):
+    return transforms.LowerCholeskyTransform()
+
+
+@transform_to.register(constraints.positive_definite)
+@transform_to.register(constraints.positive_semidefinite)
+def _transform_to_positive_definite(constraint):
+    return transforms.PositiveDefiniteTransform()
+
+
+@biject_to.register(constraints.corr_cholesky)
+@transform_to.register(constraints.corr_cholesky)
+def _transform_to_corr_cholesky(constraint):
+    return transforms.CorrCholeskyTransform()
+
+
+@biject_to.register(constraints.cat)
+def _biject_to_cat(constraint):
+    return transforms.CatTransform(
+        [biject_to(c) for c in constraint.cseq], constraint.dim, constraint.lengths
+    )
+
+
+@transform_to.register(constraints.cat)
+def _transform_to_cat(constraint):
+    return transforms.CatTransform(
+        [transform_to(c) for c in constraint.cseq], constraint.dim, constraint.lengths
+    )
+
+
+@biject_to.register(constraints.stack)
+def _biject_to_stack(constraint):
+    return transforms.StackTransform(
+        [biject_to(c) for c in constraint.cseq], constraint.dim
+    )
+
+
+@transform_to.register(constraints.stack)
+def _transform_to_stack(constraint):
+    return transforms.StackTransform(
+        [transform_to(c) for c in constraint.cseq], constraint.dim
+    )
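A runnable sketch of the unconstrained-optimization pattern described in the module docstring above (assuming the bundled torch is importable; the data tensor is a placeholder):

import torch
from torch.distributions import Normal, transform_to

data = torch.randn(100, 10)                          # placeholder observations
loc = torch.zeros(10, requires_grad=True)
unconstrained = torch.zeros(10, requires_grad=True)  # optimized in an unconstrained space

opt = torch.optim.Adam([loc, unconstrained], lr=0.1)
for _ in range(100):
    opt.zero_grad()
    # transform_to maps the unconstrained tensor into the positive reals,
    # because Normal.arg_constraints['scale'] is constraints.positive.
    scale = transform_to(Normal.arg_constraints["scale"])(unconstrained)
    loss = -Normal(loc, scale).log_prob(data).sum()
    loss.backward()
    opt.step()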
diff --git a/MLPY/Lib/site-packages/torch/distributions/constraints.py b/MLPY/Lib/site-packages/torch/distributions/constraints.py
new file mode 100644
index 0000000000000000000000000000000000000000..5bece7c44c276fe5cf5d47a15cd6d3127a1d7c2a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/constraints.py
@@ -0,0 +1,657 @@
+r"""
+The following constraints are implemented:
+
+- ``constraints.boolean``
+- ``constraints.cat``
+- ``constraints.corr_cholesky``
+- ``constraints.dependent``
+- ``constraints.greater_than(lower_bound)``
+- ``constraints.greater_than_eq(lower_bound)``
+- ``constraints.independent(constraint, reinterpreted_batch_ndims)``
+- ``constraints.integer_interval(lower_bound, upper_bound)``
+- ``constraints.interval(lower_bound, upper_bound)``
+- ``constraints.less_than(upper_bound)``
+- ``constraints.lower_cholesky``
+- ``constraints.lower_triangular``
+- ``constraints.multinomial``
+- ``constraints.nonnegative``
+- ``constraints.nonnegative_integer``
+- ``constraints.one_hot``
+- ``constraints.positive_integer``
+- ``constraints.positive``
+- ``constraints.positive_semidefinite``
+- ``constraints.positive_definite``
+- ``constraints.real_vector``
+- ``constraints.real``
+- ``constraints.simplex``
+- ``constraints.square``
+- ``constraints.stack``
+- ``constraints.symmetric``
+- ``constraints.unit_interval``
+"""
+
+import torch
+
+__all__ = [
+    "Constraint",
+    "boolean",
+    "cat",
+    "corr_cholesky",
+    "dependent",
+    "dependent_property",
+    "greater_than",
+    "greater_than_eq",
+    "independent",
+    "integer_interval",
+    "interval",
+    "half_open_interval",
+    "is_dependent",
+    "less_than",
+    "lower_cholesky",
+    "lower_triangular",
+    "multinomial",
+    "nonnegative",
+    "nonnegative_integer",
+    "one_hot",
+    "positive",
+    "positive_semidefinite",
+    "positive_definite",
+    "positive_integer",
+    "real",
+    "real_vector",
+    "simplex",
+    "square",
+    "stack",
+    "symmetric",
+    "unit_interval",
+]
+
+
+class Constraint:
+    """
+    Abstract base class for constraints.
+
+    A constraint object represents a region over which a variable is valid,
+    e.g. within which a variable can be optimized.
+
+    Attributes:
+        is_discrete (bool): Whether constrained space is discrete.
+            Defaults to False.
+        event_dim (int): Number of rightmost dimensions that together define
+            an event. The :meth:`check` method will remove this many dimensions
+            when computing validity.
+    """
+
+    is_discrete = False  # Default to continuous.
+    event_dim = 0  # Default to univariate.
+
+    def check(self, value):
+        """
+        Returns a boolean tensor of ``sample_shape + batch_shape`` indicating
+        whether each event in value satisfies this constraint.
+        """
+        raise NotImplementedError
+
+    def __repr__(self):
+        return self.__class__.__name__[1:] + "()"
+
+
+class _Dependent(Constraint):
+    """
+    Placeholder for variables whose support depends on other variables.
+    These variables obey no simple coordinate-wise constraints.
+
+    Args:
+        is_discrete (bool): Optional value of ``.is_discrete`` in case this
+            can be computed statically. If not provided, access to the
+            ``.is_discrete`` attribute will raise a NotImplementedError.
+        event_dim (int): Optional value of ``.event_dim`` in case this
+            can be computed statically. If not provided, access to the
+            ``.event_dim`` attribute will raise a NotImplementedError.
+    """
+
+    def __init__(self, *, is_discrete=NotImplemented, event_dim=NotImplemented):
+        self._is_discrete = is_discrete
+        self._event_dim = event_dim
+        super().__init__()
+
+    @property
+    def is_discrete(self):
+        if self._is_discrete is NotImplemented:
+            raise NotImplementedError(".is_discrete cannot be determined statically")
+        return self._is_discrete
+
+    @property
+    def event_dim(self):
+        if self._event_dim is NotImplemented:
+            raise NotImplementedError(".event_dim cannot be determined statically")
+        return self._event_dim
+
+    def __call__(self, *, is_discrete=NotImplemented, event_dim=NotImplemented):
+        """
+        Support for syntax to customize static attributes::
+
+            constraints.dependent(is_discrete=True, event_dim=1)
+        """
+        if is_discrete is NotImplemented:
+            is_discrete = self._is_discrete
+        if event_dim is NotImplemented:
+            event_dim = self._event_dim
+        return _Dependent(is_discrete=is_discrete, event_dim=event_dim)
+
+    def check(self, x):
+        raise ValueError("Cannot determine validity of dependent constraint")
+
+
+def is_dependent(constraint):
+    return isinstance(constraint, _Dependent)
+
+
+class _DependentProperty(property, _Dependent):
+    """
+    Decorator that extends @property to act like a `Dependent` constraint when
+    called on a class and act like a property when called on an object.
+
+    Example::
+
+        class Uniform(Distribution):
+            def __init__(self, low, high):
+                self.low = low
+                self.high = high
+            @constraints.dependent_property(is_discrete=False, event_dim=0)
+            def support(self):
+                return constraints.interval(self.low, self.high)
+
+    Args:
+        fn (Callable): The function to be decorated.
+        is_discrete (bool): Optional value of ``.is_discrete`` in case this
+            can be computed statically. If not provided, access to the
+            ``.is_discrete`` attribute will raise a NotImplementedError.
+        event_dim (int): Optional value of ``.event_dim`` in case this
+            can be computed statically. If not provided, access to the
+            ``.event_dim`` attribute will raise a NotImplementedError.
+    """
+
+    def __init__(
+        self, fn=None, *, is_discrete=NotImplemented, event_dim=NotImplemented
+    ):
+        super().__init__(fn)
+        self._is_discrete = is_discrete
+        self._event_dim = event_dim
+
+    def __call__(self, fn):
+        """
+        Support for syntax to customize static attributes::
+
+            @constraints.dependent_property(is_discrete=True, event_dim=1)
+            def support(self):
+                ...
+        """
+        return _DependentProperty(
+            fn, is_discrete=self._is_discrete, event_dim=self._event_dim
+        )
+
+
+class _IndependentConstraint(Constraint):
+    """
+    Wraps a constraint by aggregating over ``reinterpreted_batch_ndims``-many
+    dims in :meth:`check`, so that an event is valid only if all its
+    independent entries are valid.
+    """
+
+    def __init__(self, base_constraint, reinterpreted_batch_ndims):
+        assert isinstance(base_constraint, Constraint)
+        assert isinstance(reinterpreted_batch_ndims, int)
+        assert reinterpreted_batch_ndims >= 0
+        self.base_constraint = base_constraint
+        self.reinterpreted_batch_ndims = reinterpreted_batch_ndims
+        super().__init__()
+
+    @property
+    def is_discrete(self):
+        return self.base_constraint.is_discrete
+
+    @property
+    def event_dim(self):
+        return self.base_constraint.event_dim + self.reinterpreted_batch_ndims
+
+    def check(self, value):
+        result = self.base_constraint.check(value)
+        if result.dim() < self.reinterpreted_batch_ndims:
+            expected = self.base_constraint.event_dim + self.reinterpreted_batch_ndims
+            raise ValueError(
+                f"Expected value.dim() >= {expected} but got {value.dim()}"
+            )
+        result = result.reshape(
+            result.shape[: result.dim() - self.reinterpreted_batch_ndims] + (-1,)
+        )
+        result = result.all(-1)
+        return result
+
+    def __repr__(self):
+        return f"{self.__class__.__name__[1:]}({repr(self.base_constraint)}, {self.reinterpreted_batch_ndims})"
+
+
+class _Boolean(Constraint):
+    """
+    Constrain to the two values `{0, 1}`.
+    """
+
+    is_discrete = True
+
+    def check(self, value):
+        return (value == 0) | (value == 1)
+
+
+class _OneHot(Constraint):
+    """
+    Constrain to one-hot vectors.
+    """
+
+    is_discrete = True
+    event_dim = 1
+
+    def check(self, value):
+        is_boolean = (value == 0) | (value == 1)
+        is_normalized = value.sum(-1).eq(1)
+        return is_boolean.all(-1) & is_normalized
+
+
+class _IntegerInterval(Constraint):
+    """
+    Constrain to an integer interval `[lower_bound, upper_bound]`.
+    """
+
+    is_discrete = True
+
+    def __init__(self, lower_bound, upper_bound):
+        self.lower_bound = lower_bound
+        self.upper_bound = upper_bound
+        super().__init__()
+
+    def check(self, value):
+        return (
+            (value % 1 == 0) & (self.lower_bound <= value) & (value <= self.upper_bound)
+        )
+
+    def __repr__(self):
+        fmt_string = self.__class__.__name__[1:]
+        fmt_string += (
+            f"(lower_bound={self.lower_bound}, upper_bound={self.upper_bound})"
+        )
+        return fmt_string
+
+
+class _IntegerLessThan(Constraint):
+    """
+    Constrain to an integer interval `(-inf, upper_bound]`.
+    """
+
+    is_discrete = True
+
+    def __init__(self, upper_bound):
+        self.upper_bound = upper_bound
+        super().__init__()
+
+    def check(self, value):
+        return (value % 1 == 0) & (value <= self.upper_bound)
+
+    def __repr__(self):
+        fmt_string = self.__class__.__name__[1:]
+        fmt_string += f"(upper_bound={self.upper_bound})"
+        return fmt_string
+
+
+class _IntegerGreaterThan(Constraint):
+    """
+    Constrain to an integer interval `[lower_bound, inf)`.
+    """
+
+    is_discrete = True
+
+    def __init__(self, lower_bound):
+        self.lower_bound = lower_bound
+        super().__init__()
+
+    def check(self, value):
+        return (value % 1 == 0) & (value >= self.lower_bound)
+
+    def __repr__(self):
+        fmt_string = self.__class__.__name__[1:]
+        fmt_string += f"(lower_bound={self.lower_bound})"
+        return fmt_string
+
+
+class _Real(Constraint):
+    """
+    Trivially constrain to the extended real line `[-inf, inf]`.
+    """
+
+    def check(self, value):
+        return value == value  # False for NaNs.
+
+
+class _GreaterThan(Constraint):
+    """
+    Constrain to a real half line `(lower_bound, inf]`.
+    """
+
+    def __init__(self, lower_bound):
+        self.lower_bound = lower_bound
+        super().__init__()
+
+    def check(self, value):
+        return self.lower_bound < value
+
+    def __repr__(self):
+        fmt_string = self.__class__.__name__[1:]
+        fmt_string += f"(lower_bound={self.lower_bound})"
+        return fmt_string
+
+
+class _GreaterThanEq(Constraint):
+    """
+    Constrain to a real half line `[lower_bound, inf)`.
+    """
+
+    def __init__(self, lower_bound):
+        self.lower_bound = lower_bound
+        super().__init__()
+
+    def check(self, value):
+        return self.lower_bound <= value
+
+    def __repr__(self):
+        fmt_string = self.__class__.__name__[1:]
+        fmt_string += f"(lower_bound={self.lower_bound})"
+        return fmt_string
+
+
+class _LessThan(Constraint):
+    """
+    Constrain to a real half line `[-inf, upper_bound)`.
+    """
+
+    def __init__(self, upper_bound):
+        self.upper_bound = upper_bound
+        super().__init__()
+
+    def check(self, value):
+        return value < self.upper_bound
+
+    def __repr__(self):
+        fmt_string = self.__class__.__name__[1:]
+        fmt_string += f"(upper_bound={self.upper_bound})"
+        return fmt_string
+
+
+class _Interval(Constraint):
+    """
+    Constrain to a real interval `[lower_bound, upper_bound]`.
+    """
+
+    def __init__(self, lower_bound, upper_bound):
+        self.lower_bound = lower_bound
+        self.upper_bound = upper_bound
+        super().__init__()
+
+    def check(self, value):
+        return (self.lower_bound <= value) & (value <= self.upper_bound)
+
+    def __repr__(self):
+        fmt_string = self.__class__.__name__[1:]
+        fmt_string += (
+            f"(lower_bound={self.lower_bound}, upper_bound={self.upper_bound})"
+        )
+        return fmt_string
+
+
+class _HalfOpenInterval(Constraint):
+    """
+    Constrain to a real interval `[lower_bound, upper_bound)`.
+    """
+
+    def __init__(self, lower_bound, upper_bound):
+        self.lower_bound = lower_bound
+        self.upper_bound = upper_bound
+        super().__init__()
+
+    def check(self, value):
+        return (self.lower_bound <= value) & (value < self.upper_bound)
+
+    def __repr__(self):
+        fmt_string = self.__class__.__name__[1:]
+        fmt_string += (
+            f"(lower_bound={self.lower_bound}, upper_bound={self.upper_bound})"
+        )
+        return fmt_string
+
+
+class _Simplex(Constraint):
+    """
+    Constrain to the unit simplex in the innermost (rightmost) dimension.
+    Specifically: `x >= 0` and `x.sum(-1) == 1`.
+    """
+
+    event_dim = 1
+
+    def check(self, value):
+        return torch.all(value >= 0, dim=-1) & ((value.sum(-1) - 1).abs() < 1e-6)
+
+
+class _Multinomial(Constraint):
+    """
+    Constrain to nonnegative integer values summing to at most an upper bound.
+
+    Note that, due to limitations of the Multinomial distribution, this currently
+    checks the weaker condition ``value.sum(-1) <= upper_bound``. In the future
+    this may be strengthened to ``value.sum(-1) == upper_bound``.
+    """
+
+    is_discrete = True
+    event_dim = 1
+
+    def __init__(self, upper_bound):
+        self.upper_bound = upper_bound
+
+    def check(self, x):
+        return (x >= 0).all(dim=-1) & (x.sum(dim=-1) <= self.upper_bound)
+
+
+class _LowerTriangular(Constraint):
+    """
+    Constrain to lower-triangular square matrices.
+    """
+
+    event_dim = 2
+
+    def check(self, value):
+        value_tril = value.tril()
+        return (value_tril == value).view(value.shape[:-2] + (-1,)).min(-1)[0]
+
+
+class _LowerCholesky(Constraint):
+    """
+    Constrain to lower-triangular square matrices with positive diagonals.
+    """
+
+    event_dim = 2
+
+    def check(self, value):
+        value_tril = value.tril()
+        lower_triangular = (
+            (value_tril == value).view(value.shape[:-2] + (-1,)).min(-1)[0]
+        )
+
+        positive_diagonal = (value.diagonal(dim1=-2, dim2=-1) > 0).min(-1)[0]
+        return lower_triangular & positive_diagonal
+
+
+class _CorrCholesky(Constraint):
+    """
+    Constrain to lower-triangular square matrices with positive diagonals and each
+    row vector being of unit length.
+    """
+
+    event_dim = 2
+
+    def check(self, value):
+        tol = (
+            torch.finfo(value.dtype).eps * value.size(-1) * 10
+        )  # 10 is an adjustable fudge factor
+        row_norm = torch.linalg.norm(value.detach(), dim=-1)
+        unit_row_norm = (row_norm - 1.0).abs().le(tol).all(dim=-1)
+        return _LowerCholesky().check(value) & unit_row_norm
+
+
+class _Square(Constraint):
+    """
+    Constrain to square matrices.
+    """
+
+    event_dim = 2
+
+    def check(self, value):
+        return torch.full(
+            size=value.shape[:-2],
+            fill_value=(value.shape[-2] == value.shape[-1]),
+            dtype=torch.bool,
+            device=value.device,
+        )
+
+
+class _Symmetric(_Square):
+    """
+    Constrain to symmetric square matrices.
+    """
+
+    def check(self, value):
+        square_check = super().check(value)
+        if not square_check.all():
+            return square_check
+        return torch.isclose(value, value.mT, atol=1e-6).all(-2).all(-1)
+
+
+class _PositiveSemidefinite(_Symmetric):
+    """
+    Constrain to positive-semidefinite matrices.
+    """
+
+    def check(self, value):
+        sym_check = super().check(value)
+        if not sym_check.all():
+            return sym_check
+        return torch.linalg.eigvalsh(value).ge(0).all(-1)
+
+
+class _PositiveDefinite(_Symmetric):
+    """
+    Constrain to positive-definite matrices.
+    """
+
+    def check(self, value):
+        sym_check = super().check(value)
+        if not sym_check.all():
+            return sym_check
+        return torch.linalg.cholesky_ex(value).info.eq(0)
+
+
+class _Cat(Constraint):
+    """
+    Constraint functor that applies a sequence of constraints
+    `cseq` at the submatrices at dimension `dim`,
+    each of size `lengths[dim]`, in a way compatible with :func:`torch.cat`.
+    """
+
+    def __init__(self, cseq, dim=0, lengths=None):
+        assert all(isinstance(c, Constraint) for c in cseq)
+        self.cseq = list(cseq)
+        if lengths is None:
+            lengths = [1] * len(self.cseq)
+        self.lengths = list(lengths)
+        assert len(self.lengths) == len(self.cseq)
+        self.dim = dim
+        super().__init__()
+
+    @property
+    def is_discrete(self):
+        return any(c.is_discrete for c in self.cseq)
+
+    @property
+    def event_dim(self):
+        return max(c.event_dim for c in self.cseq)
+
+    def check(self, value):
+        assert -value.dim() <= self.dim < value.dim()
+        checks = []
+        start = 0
+        for constr, length in zip(self.cseq, self.lengths):
+            v = value.narrow(self.dim, start, length)
+            checks.append(constr.check(v))
+            start = start + length  # avoid += for jit compat
+        return torch.cat(checks, self.dim)
+
+
+class _Stack(Constraint):
+    """
+    Constraint functor that applies a sequence of constraints
+    `cseq` at the submatrices at dimension `dim`,
+    in a way compatible with :func:`torch.stack`.
+    """
+
+    def __init__(self, cseq, dim=0):
+        assert all(isinstance(c, Constraint) for c in cseq)
+        self.cseq = list(cseq)
+        self.dim = dim
+        super().__init__()
+
+    @property
+    def is_discrete(self):
+        return any(c.is_discrete for c in self.cseq)
+
+    @property
+    def event_dim(self):
+        dim = max(c.event_dim for c in self.cseq)
+        if self.dim + dim < 0:
+            dim += 1
+        return dim
+
+    def check(self, value):
+        assert -value.dim() <= self.dim < value.dim()
+        vs = [value.select(self.dim, i) for i in range(value.size(self.dim))]
+        return torch.stack(
+            [constr.check(v) for v, constr in zip(vs, self.cseq)], self.dim
+        )
+
+
+# Public interface.
+dependent = _Dependent()
+dependent_property = _DependentProperty
+independent = _IndependentConstraint
+boolean = _Boolean()
+one_hot = _OneHot()
+nonnegative_integer = _IntegerGreaterThan(0)
+positive_integer = _IntegerGreaterThan(1)
+integer_interval = _IntegerInterval
+real = _Real()
+real_vector = independent(real, 1)
+positive = _GreaterThan(0.0)
+nonnegative = _GreaterThanEq(0.0)
+greater_than = _GreaterThan
+greater_than_eq = _GreaterThanEq
+less_than = _LessThan
+multinomial = _Multinomial
+unit_interval = _Interval(0.0, 1.0)
+interval = _Interval
+half_open_interval = _HalfOpenInterval
+simplex = _Simplex()
+lower_triangular = _LowerTriangular()
+lower_cholesky = _LowerCholesky()
+corr_cholesky = _CorrCholesky()
+square = _Square()
+symmetric = _Symmetric()
+positive_semidefinite = _PositiveSemidefinite()
+positive_definite = _PositiveDefinite()
+cat = _Cat
+stack = _Stack
diff --git a/MLPY/Lib/site-packages/torch/distributions/continuous_bernoulli.py b/MLPY/Lib/site-packages/torch/distributions/continuous_bernoulli.py
new file mode 100644
index 0000000000000000000000000000000000000000..a867738dbe74085414bc3d3a591dc40531a57173
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/continuous_bernoulli.py
@@ -0,0 +1,235 @@
+import math
+from numbers import Number
+
+import torch
+from torch.distributions import constraints
+from torch.distributions.exp_family import ExponentialFamily
+from torch.distributions.utils import (
+    broadcast_all,
+    clamp_probs,
+    lazy_property,
+    logits_to_probs,
+    probs_to_logits,
+)
+from torch.nn.functional import binary_cross_entropy_with_logits
+
+__all__ = ["ContinuousBernoulli"]
+
+
+class ContinuousBernoulli(ExponentialFamily):
+    r"""
+    Creates a continuous Bernoulli distribution parameterized by :attr:`probs`
+    or :attr:`logits` (but not both).
+
+    The distribution is supported in [0, 1] and parameterized by 'probs' (in
+    (0,1)) or 'logits' (real-valued). Note that, unlike the Bernoulli, 'probs'
+    does not correspond to a probability and 'logits' does not correspond to
+    log-odds, but the same names are used due to the similarity with the
+    Bernoulli. See [1] for more details.
+
+    Example::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = ContinuousBernoulli(torch.tensor([0.3]))
+        >>> m.sample()
+        tensor([ 0.2538])
+
+    Args:
+        probs (Number, Tensor): (0,1) valued parameters
+        logits (Number, Tensor): real valued parameters whose sigmoid matches 'probs'
+
+    [1] The continuous Bernoulli: fixing a pervasive error in variational
+    autoencoders, Loaiza-Ganem G and Cunningham JP, NeurIPS 2019.
+    https://arxiv.org/abs/1907.06845
+    """
+    arg_constraints = {"probs": constraints.unit_interval, "logits": constraints.real}
+    support = constraints.unit_interval
+    _mean_carrier_measure = 0
+    has_rsample = True
+
+    def __init__(
+        self, probs=None, logits=None, lims=(0.499, 0.501), validate_args=None
+    ):
+        if (probs is None) == (logits is None):
+            raise ValueError(
+                "Either `probs` or `logits` must be specified, but not both."
+            )
+        if probs is not None:
+            is_scalar = isinstance(probs, Number)
+            (self.probs,) = broadcast_all(probs)
+            # validate 'probs' here if necessary, as it is later clamped for numerical
+            # stability close to 0 and 1; otherwise the clamped 'probs' would always pass
+            if validate_args is not None:
+                if not self.arg_constraints["probs"].check(self.probs).all():
+                    raise ValueError("The parameter probs has invalid values")
+            self.probs = clamp_probs(self.probs)
+        else:
+            is_scalar = isinstance(logits, Number)
+            (self.logits,) = broadcast_all(logits)
+        self._param = self.probs if probs is not None else self.logits
+        if is_scalar:
+            batch_shape = torch.Size()
+        else:
+            batch_shape = self._param.size()
+        self._lims = lims
+        super().__init__(batch_shape, validate_args=validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(ContinuousBernoulli, _instance)
+        new._lims = self._lims
+        batch_shape = torch.Size(batch_shape)
+        if "probs" in self.__dict__:
+            new.probs = self.probs.expand(batch_shape)
+            new._param = new.probs
+        if "logits" in self.__dict__:
+            new.logits = self.logits.expand(batch_shape)
+            new._param = new.logits
+        super(ContinuousBernoulli, new).__init__(batch_shape, validate_args=False)
+        new._validate_args = self._validate_args
+        return new
+
+    def _new(self, *args, **kwargs):
+        return self._param.new(*args, **kwargs)
+
+    def _outside_unstable_region(self):
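+        # torch.max of the two comparisons acts as an elementwise OR: True wherever
+        # probs lies outside the numerically unstable window around 0.5.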
+        return torch.max(
+            torch.le(self.probs, self._lims[0]), torch.gt(self.probs, self._lims[1])
+        )
+
+    def _cut_probs(self):
+        return torch.where(
+            self._outside_unstable_region(),
+            self.probs,
+            self._lims[0] * torch.ones_like(self.probs),
+        )
+
+    def _cont_bern_log_norm(self):
+        """computes the log normalizing constant as a function of the 'probs' parameter"""
+        cut_probs = self._cut_probs()
+        cut_probs_below_half = torch.where(
+            torch.le(cut_probs, 0.5), cut_probs, torch.zeros_like(cut_probs)
+        )
+        cut_probs_above_half = torch.where(
+            torch.ge(cut_probs, 0.5), cut_probs, torch.ones_like(cut_probs)
+        )
+        log_norm = torch.log(
+            torch.abs(torch.log1p(-cut_probs) - torch.log(cut_probs))
+        ) - torch.where(
+            torch.le(cut_probs, 0.5),
+            torch.log1p(-2.0 * cut_probs_below_half),
+            torch.log(2.0 * cut_probs_above_half - 1.0),
+        )
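+        # Near probs = 0.5 the closed form above is numerically indeterminate, so a
+        # Taylor expansion of the log normalizer around probs = 0.5 is used inside
+        # the unstable window.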
+        x = torch.pow(self.probs - 0.5, 2)
+        taylor = math.log(2.0) + (4.0 / 3.0 + 104.0 / 45.0 * x) * x
+        return torch.where(self._outside_unstable_region(), log_norm, taylor)
+
+    @property
+    def mean(self):
+        cut_probs = self._cut_probs()
+        mus = cut_probs / (2.0 * cut_probs - 1.0) + 1.0 / (
+            torch.log1p(-cut_probs) - torch.log(cut_probs)
+        )
+        x = self.probs - 0.5
+        taylor = 0.5 + (1.0 / 3.0 + 16.0 / 45.0 * torch.pow(x, 2)) * x
+        return torch.where(self._outside_unstable_region(), mus, taylor)
+
+    @property
+    def stddev(self):
+        return torch.sqrt(self.variance)
+
+    @property
+    def variance(self):
+        cut_probs = self._cut_probs()
+        vars = cut_probs * (cut_probs - 1.0) / torch.pow(
+            1.0 - 2.0 * cut_probs, 2
+        ) + 1.0 / torch.pow(torch.log1p(-cut_probs) - torch.log(cut_probs), 2)
+        x = torch.pow(self.probs - 0.5, 2)
+        taylor = 1.0 / 12.0 - (1.0 / 15.0 - 128.0 / 945.0 * x) * x
+        return torch.where(self._outside_unstable_region(), vars, taylor)
+
+    @lazy_property
+    def logits(self):
+        return probs_to_logits(self.probs, is_binary=True)
+
+    @lazy_property
+    def probs(self):
+        return clamp_probs(logits_to_probs(self.logits, is_binary=True))
+
+    @property
+    def param_shape(self):
+        return self._param.size()
+
+    def sample(self, sample_shape=torch.Size()):
+        shape = self._extended_shape(sample_shape)
+        u = torch.rand(shape, dtype=self.probs.dtype, device=self.probs.device)
+        with torch.no_grad():
+            return self.icdf(u)
+
+    def rsample(self, sample_shape=torch.Size()):
+        shape = self._extended_shape(sample_shape)
+        u = torch.rand(shape, dtype=self.probs.dtype, device=self.probs.device)
+        return self.icdf(u)
+
+    def log_prob(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        logits, value = broadcast_all(self.logits, value)
+        return (
+            -binary_cross_entropy_with_logits(logits, value, reduction="none")
+            + self._cont_bern_log_norm()
+        )
+
+    def cdf(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        cut_probs = self._cut_probs()
+        cdfs = (
+            torch.pow(cut_probs, value) * torch.pow(1.0 - cut_probs, 1.0 - value)
+            + cut_probs
+            - 1.0
+        ) / (2.0 * cut_probs - 1.0)
+        unbounded_cdfs = torch.where(self._outside_unstable_region(), cdfs, value)
+        return torch.where(
+            torch.le(value, 0.0),
+            torch.zeros_like(value),
+            torch.where(torch.ge(value, 1.0), torch.ones_like(value), unbounded_cdfs),
+        )
+
+    def icdf(self, value):
+        cut_probs = self._cut_probs()
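+        # Closed-form inverse of the CDF; inside the unstable window around
+        # probs = 0.5 the distribution is close to Uniform(0, 1), so the identity
+        # map is used there instead.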
+        return torch.where(
+            self._outside_unstable_region(),
+            (
+                torch.log1p(-cut_probs + value * (2.0 * cut_probs - 1.0))
+                - torch.log1p(-cut_probs)
+            )
+            / (torch.log(cut_probs) - torch.log1p(-cut_probs)),
+            value,
+        )
+
+    def entropy(self):
+        log_probs0 = torch.log1p(-self.probs)
+        log_probs1 = torch.log(self.probs)
+        return (
+            self.mean * (log_probs0 - log_probs1)
+            - self._cont_bern_log_norm()
+            - log_probs0
+        )
+
+    @property
+    def _natural_params(self):
+        return (self.logits,)
+
+    def _log_normalizer(self, x):
+        """computes the log normalizing constant as a function of the natural parameter"""
+        out_unst_reg = torch.max(
+            torch.le(x, self._lims[0] - 0.5), torch.gt(x, self._lims[1] - 0.5)
+        )
+        cut_nat_params = torch.where(
+            out_unst_reg, x, (self._lims[0] - 0.5) * torch.ones_like(x)
+        )
+        log_norm = torch.log(torch.abs(torch.exp(cut_nat_params) - 1.0)) - torch.log(
+            torch.abs(cut_nat_params)
+        )
+        taylor = 0.5 * x + torch.pow(x, 2) / 24.0 - torch.pow(x, 4) / 2880.0
+        return torch.where(out_unst_reg, log_norm, taylor)
diff --git a/MLPY/Lib/site-packages/torch/distributions/dirichlet.py b/MLPY/Lib/site-packages/torch/distributions/dirichlet.py
new file mode 100644
index 0000000000000000000000000000000000000000..514433b3478bc08ae1168c4e56294fad04afa03f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/dirichlet.py
@@ -0,0 +1,123 @@
+import torch
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+from torch.distributions import constraints
+from torch.distributions.exp_family import ExponentialFamily
+
+__all__ = ["Dirichlet"]
+
+
+# This helper is exposed for testing.
+def _Dirichlet_backward(x, concentration, grad_output):
+    total = concentration.sum(-1, True).expand_as(concentration)
+    grad = torch._dirichlet_grad(x, concentration, total)
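+    # Subtracting (x * grad_output).sum(-1) accounts for the constraint that the
+    # sampled components sum to 1, keeping the gradient in the simplex tangent space.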
+    return grad * (grad_output - (x * grad_output).sum(-1, True))
+
+
+class _Dirichlet(Function):
+    @staticmethod
+    def forward(ctx, concentration):
+        x = torch._sample_dirichlet(concentration)
+        ctx.save_for_backward(x, concentration)
+        return x
+
+    @staticmethod
+    @once_differentiable
+    def backward(ctx, grad_output):
+        x, concentration = ctx.saved_tensors
+        return _Dirichlet_backward(x, concentration, grad_output)
+
+
+class Dirichlet(ExponentialFamily):
+    r"""
+    Creates a Dirichlet distribution parameterized by concentration :attr:`concentration`.
+
+    Example::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = Dirichlet(torch.tensor([0.5, 0.5]))
+        >>> m.sample()  # Dirichlet distributed with concentration [0.5, 0.5]
+        tensor([ 0.1046,  0.8954])
+
+    Args:
+        concentration (Tensor): concentration parameter of the distribution
+            (often referred to as alpha)
+    """
+    arg_constraints = {
+        "concentration": constraints.independent(constraints.positive, 1)
+    }
+    support = constraints.simplex
+    has_rsample = True
+
+    def __init__(self, concentration, validate_args=None):
+        if concentration.dim() < 1:
+            raise ValueError(
+                "`concentration` parameter must be at least one-dimensional."
+            )
+        self.concentration = concentration
+        batch_shape, event_shape = concentration.shape[:-1], concentration.shape[-1:]
+        super().__init__(batch_shape, event_shape, validate_args=validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(Dirichlet, _instance)
+        batch_shape = torch.Size(batch_shape)
+        new.concentration = self.concentration.expand(batch_shape + self.event_shape)
+        super(Dirichlet, new).__init__(
+            batch_shape, self.event_shape, validate_args=False
+        )
+        new._validate_args = self._validate_args
+        return new
+
+    def rsample(self, sample_shape=()):
+        shape = self._extended_shape(sample_shape)
+        concentration = self.concentration.expand(shape)
+        return _Dirichlet.apply(concentration)
+
+    def log_prob(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        return (
+            torch.xlogy(self.concentration - 1.0, value).sum(-1)
+            + torch.lgamma(self.concentration.sum(-1))
+            - torch.lgamma(self.concentration).sum(-1)
+        )
+
+    @property
+    def mean(self):
+        return self.concentration / self.concentration.sum(-1, True)
+
+    @property
+    def mode(self):
+        concentrationm1 = (self.concentration - 1).clamp(min=0.0)
+        mode = concentrationm1 / concentrationm1.sum(-1, True)
+        mask = (self.concentration < 1).all(axis=-1)
+        mode[mask] = torch.nn.functional.one_hot(
+            mode[mask].argmax(axis=-1), concentrationm1.shape[-1]
+        ).to(mode)
+        return mode
+
+    @property
+    def variance(self):
+        con0 = self.concentration.sum(-1, True)
+        return (
+            self.concentration
+            * (con0 - self.concentration)
+            / (con0.pow(2) * (con0 + 1))
+        )
+
+    def entropy(self):
+        k = self.concentration.size(-1)
+        a0 = self.concentration.sum(-1)
+        return (
+            torch.lgamma(self.concentration).sum(-1)
+            - torch.lgamma(a0)
+            - (k - a0) * torch.digamma(a0)
+            - ((self.concentration - 1.0) * torch.digamma(self.concentration)).sum(-1)
+        )
+
+    @property
+    def _natural_params(self):
+        return (self.concentration,)
+
+    def _log_normalizer(self, x):
+        return x.lgamma().sum(-1) - torch.lgamma(x.sum(-1))
diff --git a/MLPY/Lib/site-packages/torch/distributions/distribution.py b/MLPY/Lib/site-packages/torch/distributions/distribution.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee9424a7518f8593cda5957ef6acf51a266e3b5d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/distribution.py
@@ -0,0 +1,336 @@
+import warnings
+from typing import Any, Dict, Optional, Tuple
+
+import torch
+from torch.distributions import constraints
+from torch.distributions.utils import lazy_property
+from torch.types import _size
+
+__all__ = ["Distribution"]
+
+
+class Distribution:
+    r"""
+    Distribution is the abstract base class for probability distributions.
+    """
+
+    has_rsample = False
+    has_enumerate_support = False
+    _validate_args = __debug__
+
+    @staticmethod
+    def set_default_validate_args(value: bool) -> None:
+        """
+        Sets whether validation is enabled or disabled.
+
+        The default behavior mimics Python's ``assert`` statement: validation
+        is on by default, but is disabled if Python is run in optimized mode
+        (via ``python -O``). Validation may be expensive, so you may want to
+        disable it once a model is working.
+
+        Args:
+            value (bool): Whether to enable validation.
+        """
+        if value not in [True, False]:
+            raise ValueError
+        Distribution._validate_args = value
+
+    def __init__(
+        self,
+        batch_shape: torch.Size = torch.Size(),
+        event_shape: torch.Size = torch.Size(),
+        validate_args: Optional[bool] = None,
+    ):
+        self._batch_shape = batch_shape
+        self._event_shape = event_shape
+        if validate_args is not None:
+            self._validate_args = validate_args
+        if self._validate_args:
+            try:
+                arg_constraints = self.arg_constraints
+            except NotImplementedError:
+                arg_constraints = {}
+                warnings.warn(
+                    f"{self.__class__} does not define `arg_constraints`. "
+                    + "Please set `arg_constraints = {}` or initialize the distribution "
+                    + "with `validate_args=False` to turn off validation."
+                )
+            for param, constraint in arg_constraints.items():
+                if constraints.is_dependent(constraint):
+                    continue  # skip constraints that cannot be checked
+                if param not in self.__dict__ and isinstance(
+                    getattr(type(self), param), lazy_property
+                ):
+                    continue  # skip checking lazily-constructed args
+                value = getattr(self, param)
+                valid = constraint.check(value)
+                if not valid.all():
+                    raise ValueError(
+                        f"Expected parameter {param} "
+                        f"({type(value).__name__} of shape {tuple(value.shape)}) "
+                        f"of distribution {repr(self)} "
+                        f"to satisfy the constraint {repr(constraint)}, "
+                        f"but found invalid values:\n{value}"
+                    )
+        super().__init__()
+
+    def expand(self, batch_shape: torch.Size, _instance=None):
+        """
+        Returns a new distribution instance (or populates an existing instance
+        provided by a derived class) with batch dimensions expanded to
+        `batch_shape`. This method calls :class:`~torch.Tensor.expand` on
+        the distribution's parameters. As such, this does not allocate new
+        memory for the expanded distribution instance. Additionally,
+        this does not repeat any args checking or parameter broadcasting done in
+        `__init__` when an instance is first created.
+
+        Args:
+            batch_shape (torch.Size): the desired expanded size.
+            _instance: new instance provided by subclasses that
+                need to override `.expand`.
+
+        Returns:
+            New distribution instance with batch dimensions expanded to
+            `batch_size`.
+        """
+        raise NotImplementedError
+
+    @property
+    def batch_shape(self) -> torch.Size:
+        """
+        Returns the shape over which parameters are batched.
+        """
+        return self._batch_shape
+
+    @property
+    def event_shape(self) -> torch.Size:
+        """
+        Returns the shape of a single sample (without batching).
+        """
+        return self._event_shape
+
+    @property
+    def arg_constraints(self) -> Dict[str, constraints.Constraint]:
+        """
+        Returns a dictionary from argument names to
+        :class:`~torch.distributions.constraints.Constraint` objects that
+        should be satisfied by each argument of this distribution. Args that
+        are not tensors need not appear in this dict.
+        """
+        raise NotImplementedError
+
+    @property
+    def support(self) -> Optional[Any]:
+        """
+        Returns a :class:`~torch.distributions.constraints.Constraint` object
+        representing this distribution's support.
+        """
+        raise NotImplementedError
+
+    @property
+    def mean(self) -> torch.Tensor:
+        """
+        Returns the mean of the distribution.
+        """
+        raise NotImplementedError
+
+    @property
+    def mode(self) -> torch.Tensor:
+        """
+        Returns the mode of the distribution.
+        """
+        raise NotImplementedError(f"{self.__class__} does not implement mode")
+
+    @property
+    def variance(self) -> torch.Tensor:
+        """
+        Returns the variance of the distribution.
+        """
+        raise NotImplementedError
+
+    @property
+    def stddev(self) -> torch.Tensor:
+        """
+        Returns the standard deviation of the distribution.
+        """
+        return self.variance.sqrt()
+
+    def sample(self, sample_shape: torch.Size = torch.Size()) -> torch.Tensor:
+        """
+        Generates a sample_shape shaped sample or sample_shape shaped batch of
+        samples if the distribution parameters are batched.
+        """
+        with torch.no_grad():
+            return self.rsample(sample_shape)
+
+    def rsample(self, sample_shape: torch.Size = torch.Size()) -> torch.Tensor:
+        """
+        Generates a sample_shape shaped reparameterized sample or sample_shape
+        shaped batch of reparameterized samples if the distribution parameters
+        are batched.
+        """
+        raise NotImplementedError
+
+    def sample_n(self, n: int) -> torch.Tensor:
+        """
+        Generates n samples or n batches of samples if the distribution
+        parameters are batched.
+        """
+        warnings.warn(
+            "sample_n will be deprecated. Use .sample((n,)) instead", UserWarning
+        )
+        return self.sample(torch.Size((n,)))
+
+    def log_prob(self, value: torch.Tensor) -> torch.Tensor:
+        """
+        Returns the log of the probability density/mass function evaluated at
+        `value`.
+
+        Args:
+            value (Tensor):
+        """
+        raise NotImplementedError
+
+    def cdf(self, value: torch.Tensor) -> torch.Tensor:
+        """
+        Returns the cumulative density/mass function evaluated at
+        `value`.
+
+        Args:
+            value (Tensor):
+        """
+        raise NotImplementedError
+
+    def icdf(self, value: torch.Tensor) -> torch.Tensor:
+        """
+        Returns the inverse cumulative density/mass function evaluated at
+        `value`.
+
+        Args:
+            value (Tensor):
+        """
+        raise NotImplementedError
+
+    def enumerate_support(self, expand: bool = True) -> torch.Tensor:
+        """
+        Returns tensor containing all values supported by a discrete
+        distribution. The result will enumerate over dimension 0, so the shape
+        of the result will be `(cardinality,) + batch_shape + event_shape`
+        (where `event_shape = ()` for univariate distributions).
+
+        Note that this enumerates over all batched tensors in lock-step
+        `[[0, 0], [1, 1], ...]`. With `expand=False`, enumeration happens
+        along dim 0, but with the remaining batch dimensions being
+        singleton dimensions, `[[0], [1], ...]`.
+
+        To iterate over the full Cartesian product use
+        `itertools.product(m.enumerate_support())`.
+
+        Args:
+            expand (bool): whether to expand the support over the
+                batch dims to match the distribution's `batch_shape`.
+
+        Returns:
+            Tensor iterating over dimension 0.
+        """
+        raise NotImplementedError
+
+    def entropy(self) -> torch.Tensor:
+        """
+        Returns entropy of distribution, batched over batch_shape.
+
+        Returns:
+            Tensor of shape batch_shape.
+        """
+        raise NotImplementedError
+
+    def perplexity(self) -> torch.Tensor:
+        """
+        Returns perplexity of distribution, batched over batch_shape.
+
+        Returns:
+            Tensor of shape batch_shape.
+        """
+        return torch.exp(self.entropy())
+
+    def _extended_shape(self, sample_shape: _size = torch.Size()) -> Tuple[int, ...]:
+        """
+        Returns the size of the sample returned by the distribution, given
+        a `sample_shape`. Note that the batch and event shapes of a distribution
+        instance are fixed at the time of construction. If this is empty, the
+        returned shape is upcast to (1,).
+
+        Args:
+            sample_shape (torch.Size): the size of the sample to be drawn.
+        """
+        if not isinstance(sample_shape, torch.Size):
+            sample_shape = torch.Size(sample_shape)
+        return torch.Size(sample_shape + self._batch_shape + self._event_shape)
+
+    def _validate_sample(self, value: torch.Tensor) -> None:
+        """
+        Argument validation for distribution methods such as `log_prob`,
+        `cdf` and `icdf`. The rightmost dimensions of a value to be
+        scored via these methods must agree with the distribution's batch
+        and event shapes.
+
+        Args:
+            value (Tensor): the tensor whose log probability is to be
+                computed by the `log_prob` method.
+        Raises:
+            ValueError: when the rightmost dimensions of `value` do not match the
+                distribution's batch and event shapes.
+        """
+        if not isinstance(value, torch.Tensor):
+            raise ValueError("The value argument to log_prob must be a Tensor")
+
+        event_dim_start = len(value.size()) - len(self._event_shape)
+        if value.size()[event_dim_start:] != self._event_shape:
+            raise ValueError(
+                f"The right-most size of value must match event_shape: {value.size()} vs {self._event_shape}."
+            )
+
+        actual_shape = value.size()
+        expected_shape = self._batch_shape + self._event_shape
+        for i, j in zip(reversed(actual_shape), reversed(expected_shape)):
+            if i != 1 and j != 1 and i != j:
+                raise ValueError(
+                    f"Value is not broadcastable with batch_shape+event_shape: {actual_shape} vs {expected_shape}."
+                )
+        try:
+            support = self.support
+        except NotImplementedError:
+            warnings.warn(
+                f"{self.__class__} does not define `support` to enable "
+                + "sample validation. Please initialize the distribution with "
+                + "`validate_args=False` to turn off validation."
+            )
+            return
+        assert support is not None
+        valid = support.check(value)
+        if not valid.all():
+            raise ValueError(
+                "Expected value argument "
+                f"({type(value).__name__} of shape {tuple(value.shape)}) "
+                f"to be within the support ({repr(support)}) "
+                f"of the distribution {repr(self)}, "
+                f"but found invalid values:\n{value}"
+            )
+
+    def _get_checked_instance(self, cls, _instance=None):
+        if _instance is None and type(self).__init__ != cls.__init__:
+            raise NotImplementedError(
+                f"Subclass {self.__class__.__name__} of {cls.__name__} that defines a custom __init__ method "
+                "must also define a custom .expand() method."
+            )
+        return self.__new__(type(self)) if _instance is None else _instance
+
+    def __repr__(self) -> str:
+        param_names = [k for k, _ in self.arg_constraints.items() if k in self.__dict__]
+        args_string = ", ".join(
+            [
+                f"{p}: {self.__dict__[p] if self.__dict__[p].numel() == 1 else self.__dict__[p].size()}"
+                for p in param_names
+            ]
+        )
+        return self.__class__.__name__ + "(" + args_string + ")"
diff --git a/MLPY/Lib/site-packages/torch/distributions/exp_family.py b/MLPY/Lib/site-packages/torch/distributions/exp_family.py
new file mode 100644
index 0000000000000000000000000000000000000000..b06585810a02a42ea239cbd2fc110197e655335e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/exp_family.py
@@ -0,0 +1,62 @@
+import torch
+from torch.distributions.distribution import Distribution
+
+__all__ = ["ExponentialFamily"]
+
+
+class ExponentialFamily(Distribution):
+    r"""
+    ExponentialFamily is the abstract base class for probability distributions belonging to an
+    exponential family, whose probability mass/density function has the form defined below
+
+    .. math::
+
+        p_{F}(x; \theta) = \exp(\langle t(x), \theta\rangle - F(\theta) + k(x))
+
+    where :math:`\theta` denotes the natural parameters, :math:`t(x)` denotes the sufficient statistic,
+    :math:`F(\theta)` is the log normalizer function for a given family and :math:`k(x)` is the carrier
+    measure.
+
+    Note:
+        This class is an intermediary between the `Distribution` class and distributions which belong
+        to an exponential family mainly to check the correctness of the `.entropy()` and analytic KL
+        divergence methods. We use this class to compute the entropy and KL divergence using the AD
+        framework and Bregman divergences (courtesy of: Frank Nielsen and Richard Nock, Entropies and
+        Cross-entropies of Exponential Families).
+    """
+
+    @property
+    def _natural_params(self):
+        """
+        Abstract method for natural parameters. Returns a tuple of Tensors based
+        on the distribution
+        """
+        raise NotImplementedError
+
+    def _log_normalizer(self, *natural_params):
+        """
+        Abstract method for log normalizer function. Returns a log normalizer based on
+        the distribution and input
+        """
+        raise NotImplementedError
+
+    @property
+    def _mean_carrier_measure(self):
+        """
+        Abstract method for expected carrier measure, which is required for computing
+        entropy.
+        """
+        raise NotImplementedError
+
+    def entropy(self):
+        """
+        Method to compute the entropy using Bregman divergence of the log normalizer.
+        """
+        result = -self._mean_carrier_measure
+        nparams = [p.detach().requires_grad_() for p in self._natural_params]
+        lg_normal = self._log_normalizer(*nparams)
+        gradients = torch.autograd.grad(lg_normal.sum(), nparams, create_graph=True)
+        result += lg_normal
+        for np, g in zip(nparams, gradients):
+            result -= (np * g).reshape(self._batch_shape + (-1,)).sum(-1)
+        return result
diff --git a/MLPY/Lib/site-packages/torch/distributions/exponential.py b/MLPY/Lib/site-packages/torch/distributions/exponential.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b624d3b88dc160aa8de4c6e60227c9cfbb33934
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/exponential.py
@@ -0,0 +1,84 @@
+from numbers import Number
+
+import torch
+from torch.distributions import constraints
+from torch.distributions.exp_family import ExponentialFamily
+from torch.distributions.utils import broadcast_all
+
+__all__ = ["Exponential"]
+
+
+class Exponential(ExponentialFamily):
+    r"""
+    Creates an Exponential distribution parameterized by :attr:`rate`.
+
+    Example::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = Exponential(torch.tensor([1.0]))
+        >>> m.sample()  # Exponential distributed with rate=1
+        tensor([ 0.1046])
+
+    Args:
+        rate (float or Tensor): rate = 1 / scale of the distribution
+    """
+    arg_constraints = {"rate": constraints.positive}
+    support = constraints.nonnegative
+    has_rsample = True
+    _mean_carrier_measure = 0
+
+    @property
+    def mean(self):
+        return self.rate.reciprocal()
+
+    @property
+    def mode(self):
+        return torch.zeros_like(self.rate)
+
+    @property
+    def stddev(self):
+        return self.rate.reciprocal()
+
+    @property
+    def variance(self):
+        return self.rate.pow(-2)
+
+    def __init__(self, rate, validate_args=None):
+        (self.rate,) = broadcast_all(rate)
+        batch_shape = torch.Size() if isinstance(rate, Number) else self.rate.size()
+        super().__init__(batch_shape, validate_args=validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(Exponential, _instance)
+        batch_shape = torch.Size(batch_shape)
+        new.rate = self.rate.expand(batch_shape)
+        super(Exponential, new).__init__(batch_shape, validate_args=False)
+        new._validate_args = self._validate_args
+        return new
+
+    def rsample(self, sample_shape=torch.Size()):
+        shape = self._extended_shape(sample_shape)
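+        # Draw from a standard Exponential(1) in place, then rescale by 1 / rate.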
+        return self.rate.new(shape).exponential_() / self.rate
+
+    def log_prob(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        return self.rate.log() - self.rate * value
+
+    def cdf(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        return 1 - torch.exp(-self.rate * value)
+
+    def icdf(self, value):
+        return -torch.log1p(-value) / self.rate
+
+    def entropy(self):
+        return 1.0 - torch.log(self.rate)
+
+    @property
+    def _natural_params(self):
+        return (-self.rate,)
+
+    def _log_normalizer(self, x):
+        return -torch.log(-x)
diff --git a/MLPY/Lib/site-packages/torch/distributions/fishersnedecor.py b/MLPY/Lib/site-packages/torch/distributions/fishersnedecor.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9d01bcd5105ad644c8fce4e202c2bc21b5dbcd0
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/fishersnedecor.py
@@ -0,0 +1,98 @@
+from numbers import Number
+
+import torch
+from torch import nan
+from torch.distributions import constraints
+from torch.distributions.distribution import Distribution
+from torch.distributions.gamma import Gamma
+from torch.distributions.utils import broadcast_all
+
+__all__ = ["FisherSnedecor"]
+
+
+class FisherSnedecor(Distribution):
+    r"""
+    Creates a Fisher-Snedecor distribution parameterized by :attr:`df1` and :attr:`df2`.
+
+    Example::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = FisherSnedecor(torch.tensor([1.0]), torch.tensor([2.0]))
+        >>> m.sample()  # Fisher-Snedecor-distributed with df1=1 and df2=2
+        tensor([ 0.2453])
+
+    Args:
+        df1 (float or Tensor): degrees of freedom parameter 1
+        df2 (float or Tensor): degrees of freedom parameter 2
+    """
+    arg_constraints = {"df1": constraints.positive, "df2": constraints.positive}
+    support = constraints.positive
+    has_rsample = True
+
+    def __init__(self, df1, df2, validate_args=None):
+        self.df1, self.df2 = broadcast_all(df1, df2)
+        self._gamma1 = Gamma(self.df1 * 0.5, self.df1)
+        self._gamma2 = Gamma(self.df2 * 0.5, self.df2)
+
+        if isinstance(df1, Number) and isinstance(df2, Number):
+            batch_shape = torch.Size()
+        else:
+            batch_shape = self.df1.size()
+        super().__init__(batch_shape, validate_args=validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(FisherSnedecor, _instance)
+        batch_shape = torch.Size(batch_shape)
+        new.df1 = self.df1.expand(batch_shape)
+        new.df2 = self.df2.expand(batch_shape)
+        new._gamma1 = self._gamma1.expand(batch_shape)
+        new._gamma2 = self._gamma2.expand(batch_shape)
+        super(FisherSnedecor, new).__init__(batch_shape, validate_args=False)
+        new._validate_args = self._validate_args
+        return new
+
+    @property
+    def mean(self):
+        df2 = self.df2.clone(memory_format=torch.contiguous_format)
+        df2[df2 <= 2] = nan
+        return df2 / (df2 - 2)
+
+    @property
+    def mode(self):
+        mode = (self.df1 - 2) / self.df1 * self.df2 / (self.df2 + 2)
+        mode[self.df1 <= 2] = nan
+        return mode
+
+    @property
+    def variance(self):
+        df2 = self.df2.clone(memory_format=torch.contiguous_format)
+        df2[df2 <= 4] = nan
+        return (
+            2
+            * df2.pow(2)
+            * (self.df1 + df2 - 2)
+            / (self.df1 * (df2 - 2).pow(2) * (df2 - 4))
+        )
+
+    def rsample(self, sample_shape=torch.Size(())):
+        shape = self._extended_shape(sample_shape)
+        #   X1 ~ Gamma(df1 / 2, 1 / df1), X2 ~ Gamma(df2 / 2, 1 / df2)
+        #   Y = df2 * df1 * X1 / (df1 * df2 * X2) = X1 / X2 ~ F(df1, df2)
+        X1 = self._gamma1.rsample(sample_shape).view(shape)
+        X2 = self._gamma2.rsample(sample_shape).view(shape)
+        tiny = torch.finfo(X2.dtype).tiny
+        X2.clamp_(min=tiny)
+        Y = X1 / X2
+        Y.clamp_(min=tiny)
+        return Y
+
+    def log_prob(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        ct1 = self.df1 * 0.5
+        ct2 = self.df2 * 0.5
+        ct3 = self.df1 / self.df2
+        t1 = (ct1 + ct2).lgamma() - ct1.lgamma() - ct2.lgamma()
+        t2 = ct1 * ct3.log() + (ct1 - 1) * torch.log(value)
+        t3 = (ct1 + ct2) * torch.log1p(ct3 * value)
+        return t1 + t2 - t3
diff --git a/MLPY/Lib/site-packages/torch/distributions/gamma.py b/MLPY/Lib/site-packages/torch/distributions/gamma.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b39cb12a407803debd0fec717180248ae090320
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/gamma.py
@@ -0,0 +1,108 @@
+from numbers import Number
+
+import torch
+from torch.distributions import constraints
+from torch.distributions.exp_family import ExponentialFamily
+from torch.distributions.utils import broadcast_all
+
+__all__ = ["Gamma"]
+
+
+def _standard_gamma(concentration):
+    return torch._standard_gamma(concentration)
+
+
+class Gamma(ExponentialFamily):
+    r"""
+    Creates a Gamma distribution parameterized by shape :attr:`concentration` and :attr:`rate`.
+
+    Example::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = Gamma(torch.tensor([1.0]), torch.tensor([1.0]))
+        >>> m.sample()  # Gamma distributed with concentration=1 and rate=1
+        tensor([ 0.1046])
+
+    Args:
+        concentration (float or Tensor): shape parameter of the distribution
+            (often referred to as alpha)
+        rate (float or Tensor): rate = 1 / scale of the distribution
+            (often referred to as beta)
+    """
+    arg_constraints = {
+        "concentration": constraints.positive,
+        "rate": constraints.positive,
+    }
+    support = constraints.nonnegative
+    has_rsample = True
+    _mean_carrier_measure = 0
+
+    @property
+    def mean(self):
+        return self.concentration / self.rate
+
+    @property
+    def mode(self):
+        return ((self.concentration - 1) / self.rate).clamp(min=0)
+
+    @property
+    def variance(self):
+        return self.concentration / self.rate.pow(2)
+
+    def __init__(self, concentration, rate, validate_args=None):
+        self.concentration, self.rate = broadcast_all(concentration, rate)
+        if isinstance(concentration, Number) and isinstance(rate, Number):
+            batch_shape = torch.Size()
+        else:
+            batch_shape = self.concentration.size()
+        super().__init__(batch_shape, validate_args=validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(Gamma, _instance)
+        batch_shape = torch.Size(batch_shape)
+        new.concentration = self.concentration.expand(batch_shape)
+        new.rate = self.rate.expand(batch_shape)
+        super(Gamma, new).__init__(batch_shape, validate_args=False)
+        new._validate_args = self._validate_args
+        return new
+
+    def rsample(self, sample_shape=torch.Size()):
+        shape = self._extended_shape(sample_shape)
+        value = _standard_gamma(self.concentration.expand(shape)) / self.rate.expand(
+            shape
+        )
+        value.detach().clamp_(
+            min=torch.finfo(value.dtype).tiny
+        )  # do not record in autograd graph
+        return value
+
+    def log_prob(self, value):
+        value = torch.as_tensor(value, dtype=self.rate.dtype, device=self.rate.device)
+        if self._validate_args:
+            self._validate_sample(value)
+        return (
+            torch.xlogy(self.concentration, self.rate)
+            + torch.xlogy(self.concentration - 1, value)
+            - self.rate * value
+            - torch.lgamma(self.concentration)
+        )
+
+    def entropy(self):
+        return (
+            self.concentration
+            - torch.log(self.rate)
+            + torch.lgamma(self.concentration)
+            + (1.0 - self.concentration) * torch.digamma(self.concentration)
+        )
+
+    @property
+    def _natural_params(self):
+        return (self.concentration - 1, -self.rate)
+
+    def _log_normalizer(self, x, y):
+        return torch.lgamma(x + 1) + (x + 1) * torch.log(-y.reciprocal())
+
+    def cdf(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        return torch.special.gammainc(self.concentration, self.rate * value)
diff --git a/MLPY/Lib/site-packages/torch/distributions/geometric.py b/MLPY/Lib/site-packages/torch/distributions/geometric.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a1322fd72d6144df1e7c6d85728cff672a7ecac
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/geometric.py
@@ -0,0 +1,128 @@
+from numbers import Number
+
+import torch
+from torch.distributions import constraints
+from torch.distributions.distribution import Distribution
+from torch.distributions.utils import (
+    broadcast_all,
+    lazy_property,
+    logits_to_probs,
+    probs_to_logits,
+)
+from torch.nn.functional import binary_cross_entropy_with_logits
+
+__all__ = ["Geometric"]
+
+
+class Geometric(Distribution):
+    r"""
+    Creates a Geometric distribution parameterized by :attr:`probs`,
+    where :attr:`probs` is the probability of success of Bernoulli trials.
+
+    .. math::
+
+        P(X=k) = (1-p)^{k} p, k = 0, 1, ...
+
+    .. note::
+        For :func:`torch.distributions.geometric.Geometric`, the :math:`(k+1)`-th trial is the
+        first success, hence it draws samples in :math:`\{0, 1, \ldots\}`, whereas for
+        :func:`torch.Tensor.geometric_` the :math:`k`-th trial is the first success, hence it draws samples in :math:`\{1, 2, \ldots\}`.
+
+    Example::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = Geometric(torch.tensor([0.3]))
+        >>> m.sample()  # underlying Bernoulli has 30% chance 1; 70% chance 0
+        tensor([ 2.])
+
+    Args:
+        probs (Number, Tensor): the probability of sampling `1`. Must be in range (0, 1]
+        logits (Number, Tensor): the log-odds of sampling `1`.
+    """
+    arg_constraints = {"probs": constraints.unit_interval, "logits": constraints.real}
+    support = constraints.nonnegative_integer
+
+    def __init__(self, probs=None, logits=None, validate_args=None):
+        if (probs is None) == (logits is None):
+            raise ValueError(
+                "Either `probs` or `logits` must be specified, but not both."
+            )
+        if probs is not None:
+            (self.probs,) = broadcast_all(probs)
+        else:
+            (self.logits,) = broadcast_all(logits)
+        probs_or_logits = probs if probs is not None else logits
+        if isinstance(probs_or_logits, Number):
+            batch_shape = torch.Size()
+        else:
+            batch_shape = probs_or_logits.size()
+        super().__init__(batch_shape, validate_args=validate_args)
+        if self._validate_args and probs is not None:
+            # Add an extra check beyond unit_interval
+            value = self.probs
+            valid = value > 0
+            if not valid.all():
+                invalid_value = value.data[~valid]
+                raise ValueError(
+                    "Expected parameter probs "
+                    f"({type(value).__name__} of shape {tuple(value.shape)}) "
+                    f"of distribution {repr(self)} "
+                    f"to be positive but found invalid values:\n{invalid_value}"
+                )
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(Geometric, _instance)
+        batch_shape = torch.Size(batch_shape)
+        if "probs" in self.__dict__:
+            new.probs = self.probs.expand(batch_shape)
+        if "logits" in self.__dict__:
+            new.logits = self.logits.expand(batch_shape)
+        super(Geometric, new).__init__(batch_shape, validate_args=False)
+        new._validate_args = self._validate_args
+        return new
+
+    @property
+    def mean(self):
+        return 1.0 / self.probs - 1.0
+
+    @property
+    def mode(self):
+        return torch.zeros_like(self.probs)
+
+    @property
+    def variance(self):
+        return (1.0 / self.probs - 1.0) / self.probs
+
+    @lazy_property
+    def logits(self):
+        return probs_to_logits(self.probs, is_binary=True)
+
+    @lazy_property
+    def probs(self):
+        return logits_to_probs(self.logits, is_binary=True)
+
+    def sample(self, sample_shape=torch.Size()):
+        shape = self._extended_shape(sample_shape)
+        tiny = torch.finfo(self.probs.dtype).tiny
+        with torch.no_grad():
+            if torch._C._get_tracing_state():
+                # [JIT WORKAROUND] lack of support for .uniform_()
+                u = torch.rand(shape, dtype=self.probs.dtype, device=self.probs.device)
+                u = u.clamp(min=tiny)
+            else:
+                u = self.probs.new(shape).uniform_(tiny, 1)
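+            # Inverse-CDF sampling: floor(log(U) / log(1 - p)) counts the failures
+            # before the first success, i.e. a Geometric(p) sample on {0, 1, ...}.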
+            return (u.log() / (-self.probs).log1p()).floor()
+
+    def log_prob(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        value, probs = broadcast_all(value, self.probs)
+        probs = probs.clone(memory_format=torch.contiguous_format)
+        probs[(probs == 1) & (value == 0)] = 0
+        return value * (-probs).log1p() + self.probs.log()
+
+    def entropy(self):
+        return (
+            binary_cross_entropy_with_logits(self.logits, self.probs, reduction="none")
+            / self.probs
+        )
diff --git a/MLPY/Lib/site-packages/torch/distributions/gumbel.py b/MLPY/Lib/site-packages/torch/distributions/gumbel.py
new file mode 100644
index 0000000000000000000000000000000000000000..303e8d8e5d9980c5ccc6d3a436a6ff5a750cf39c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/gumbel.py
@@ -0,0 +1,81 @@
+import math
+from numbers import Number
+
+import torch
+from torch.distributions import constraints
+from torch.distributions.transformed_distribution import TransformedDistribution
+from torch.distributions.transforms import AffineTransform, ExpTransform
+from torch.distributions.uniform import Uniform
+from torch.distributions.utils import broadcast_all, euler_constant
+
+__all__ = ["Gumbel"]
+
+
+class Gumbel(TransformedDistribution):
+    r"""
+    Samples from a Gumbel Distribution.
+
+    Examples::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = Gumbel(torch.tensor([1.0]), torch.tensor([2.0]))
+        >>> m.sample()  # sample from Gumbel distribution with loc=1, scale=2
+        tensor([ 1.0124])
+
+    Args:
+        loc (float or Tensor): Location parameter of the distribution
+        scale (float or Tensor): Scale parameter of the distribution
+    """
+    arg_constraints = {"loc": constraints.real, "scale": constraints.positive}
+    support = constraints.real
+
+    def __init__(self, loc, scale, validate_args=None):
+        self.loc, self.scale = broadcast_all(loc, scale)
+        finfo = torch.finfo(self.loc.dtype)
+        if isinstance(loc, Number) and isinstance(scale, Number):
+            base_dist = Uniform(finfo.tiny, 1 - finfo.eps, validate_args=validate_args)
+        else:
+            base_dist = Uniform(
+                torch.full_like(self.loc, finfo.tiny),
+                torch.full_like(self.loc, 1 - finfo.eps),
+                validate_args=validate_args,
+            )
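+        # Composed in order, these transforms map a Uniform(0, 1) sample u to
+        # loc - scale * log(-log(u)), the inverse CDF of the Gumbel distribution.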
+        transforms = [
+            ExpTransform().inv,
+            AffineTransform(loc=0, scale=-torch.ones_like(self.scale)),
+            ExpTransform().inv,
+            AffineTransform(loc=loc, scale=-self.scale),
+        ]
+        super().__init__(base_dist, transforms, validate_args=validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(Gumbel, _instance)
+        new.loc = self.loc.expand(batch_shape)
+        new.scale = self.scale.expand(batch_shape)
+        return super().expand(batch_shape, _instance=new)
+
+    # Explicitly defining the log probability function for Gumbel due to precision issues
+    def log_prob(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        y = (self.loc - value) / self.scale
+        return (y - y.exp()) - self.scale.log()
+
+    @property
+    def mean(self):
+        return self.loc + self.scale * euler_constant
+
+    @property
+    def mode(self):
+        return self.loc
+
+    @property
+    def stddev(self):
+        return (math.pi / math.sqrt(6)) * self.scale
+
+    @property
+    def variance(self):
+        return self.stddev.pow(2)
+
+    def entropy(self):
+        return self.scale.log() + (1 + euler_constant)
diff --git a/MLPY/Lib/site-packages/torch/distributions/half_cauchy.py b/MLPY/Lib/site-packages/torch/distributions/half_cauchy.py
new file mode 100644
index 0000000000000000000000000000000000000000..9880ff6e7518c40ecd74745ecedf4f0c5b1d3457
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/half_cauchy.py
@@ -0,0 +1,82 @@
+import math
+
+import torch
+from torch import inf
+from torch.distributions import constraints
+from torch.distributions.cauchy import Cauchy
+from torch.distributions.transformed_distribution import TransformedDistribution
+from torch.distributions.transforms import AbsTransform
+
+__all__ = ["HalfCauchy"]
+
+
+class HalfCauchy(TransformedDistribution):
+    r"""
+    Creates a half-Cauchy distribution parameterized by `scale` where::
+
+        X ~ Cauchy(0, scale)
+        Y = |X| ~ HalfCauchy(scale)
+
+    Example::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = HalfCauchy(torch.tensor([1.0]))
+        >>> m.sample()  # half-cauchy distributed with scale=1
+        tensor([ 2.3214])
+
+    Args:
+        scale (float or Tensor): scale of the full Cauchy distribution
+    """
+    arg_constraints = {"scale": constraints.positive}
+    support = constraints.nonnegative
+    has_rsample = True
+
+    def __init__(self, scale, validate_args=None):
+        base_dist = Cauchy(0, scale, validate_args=False)
+        super().__init__(base_dist, AbsTransform(), validate_args=validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(HalfCauchy, _instance)
+        return super().expand(batch_shape, _instance=new)
+
+    @property
+    def scale(self):
+        return self.base_dist.scale
+
+    @property
+    def mean(self):
+        return torch.full(
+            self._extended_shape(),
+            math.inf,
+            dtype=self.scale.dtype,
+            device=self.scale.device,
+        )
+
+    @property
+    def mode(self):
+        return torch.zeros_like(self.scale)
+
+    @property
+    def variance(self):
+        return self.base_dist.variance
+
+    def log_prob(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        value = torch.as_tensor(
+            value, dtype=self.base_dist.scale.dtype, device=self.base_dist.scale.device
+        )
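+        # Folding X onto |X| doubles the base Cauchy density on [0, inf), hence the
+        # + log(2); values outside the support get -inf log-probability.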
+        log_prob = self.base_dist.log_prob(value) + math.log(2)
+        log_prob = torch.where(value >= 0, log_prob, -inf)
+        return log_prob
+
+    def cdf(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        return 2 * self.base_dist.cdf(value) - 1
+
+    def icdf(self, prob):
+        return self.base_dist.icdf((prob + 1) / 2)
+
+    def entropy(self):
+        return self.base_dist.entropy() - math.log(2)
diff --git a/MLPY/Lib/site-packages/torch/distributions/half_normal.py b/MLPY/Lib/site-packages/torch/distributions/half_normal.py
new file mode 100644
index 0000000000000000000000000000000000000000..60cc45c6633599006e4981437e0903a6ac4df913
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/half_normal.py
@@ -0,0 +1,74 @@
+import math
+
+import torch
+from torch import inf
+from torch.distributions import constraints
+from torch.distributions.normal import Normal
+from torch.distributions.transformed_distribution import TransformedDistribution
+from torch.distributions.transforms import AbsTransform
+
+__all__ = ["HalfNormal"]
+
+
+class HalfNormal(TransformedDistribution):
+    r"""
+    Creates a half-normal distribution parameterized by `scale` where::
+
+        X ~ Normal(0, scale)
+        Y = |X| ~ HalfNormal(scale)
+
+    Example::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = HalfNormal(torch.tensor([1.0]))
+        >>> m.sample()  # half-normal distributed with scale=1
+        tensor([ 0.1046])
+
+    Args:
+        scale (float or Tensor): scale of the full Normal distribution
+    """
+    arg_constraints = {"scale": constraints.positive}
+    support = constraints.nonnegative
+    has_rsample = True
+
+    def __init__(self, scale, validate_args=None):
+        base_dist = Normal(0, scale, validate_args=False)
+        super().__init__(base_dist, AbsTransform(), validate_args=validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(HalfNormal, _instance)
+        return super().expand(batch_shape, _instance=new)
+
+    @property
+    def scale(self):
+        return self.base_dist.scale
+
+    @property
+    def mean(self):
+        return self.scale * math.sqrt(2 / math.pi)
+
+    @property
+    def mode(self):
+        return torch.zeros_like(self.scale)
+
+    @property
+    def variance(self):
+        return self.scale.pow(2) * (1 - 2 / math.pi)
+
+    def log_prob(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        log_prob = self.base_dist.log_prob(value) + math.log(2)
+        log_prob = torch.where(value >= 0, log_prob, -inf)
+        return log_prob
+
+    def cdf(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        return 2 * self.base_dist.cdf(value) - 1
+
+    def icdf(self, prob):
+        return self.base_dist.icdf((prob + 1) / 2)
+
+    def entropy(self):
+        return self.base_dist.entropy() - math.log(2)
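+
+# A quick sanity sketch (illustrative only) for the closed-form mean above,
+# E[Y] = scale * sqrt(2 / pi):
+#
+#     >>> import math, torch
+#     >>> from torch.distributions import HalfNormal
+#     >>> d = HalfNormal(torch.tensor([2.0]))
+#     >>> torch.allclose(d.mean, torch.tensor([2.0 * math.sqrt(2 / math.pi)]))
+#     True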
diff --git a/MLPY/Lib/site-packages/torch/distributions/independent.py b/MLPY/Lib/site-packages/torch/distributions/independent.py
new file mode 100644
index 0000000000000000000000000000000000000000..2510c4724d57ae6c81be7f04019f40f15beef785
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/independent.py
@@ -0,0 +1,125 @@
+from typing import Dict
+
+import torch
+from torch.distributions import constraints
+from torch.distributions.distribution import Distribution
+from torch.distributions.utils import _sum_rightmost
+
+__all__ = ["Independent"]
+
+
+class Independent(Distribution):
+    r"""
+    Reinterprets some of the batch dims of a distribution as event dims.
+
+    This is mainly useful for changing the shape of the result of
+    :meth:`log_prob`. For example to create a diagonal Normal distribution with
+    the same shape as a Multivariate Normal distribution (so they are
+    interchangeable), you can::
+
+        >>> from torch.distributions.multivariate_normal import MultivariateNormal
+        >>> from torch.distributions.normal import Normal
+        >>> loc = torch.zeros(3)
+        >>> scale = torch.ones(3)
+        >>> mvn = MultivariateNormal(loc, scale_tril=torch.diag(scale))
+        >>> [mvn.batch_shape, mvn.event_shape]
+        [torch.Size([]), torch.Size([3])]
+        >>> normal = Normal(loc, scale)
+        >>> [normal.batch_shape, normal.event_shape]
+        [torch.Size([3]), torch.Size([])]
+        >>> diagn = Independent(normal, 1)
+        >>> [diagn.batch_shape, diagn.event_shape]
+        [torch.Size([]), torch.Size([3])]
+
+    Args:
+        base_distribution (torch.distributions.distribution.Distribution): a
+            base distribution
+        reinterpreted_batch_ndims (int): the number of batch dims to
+            reinterpret as event dims
+    """
+    arg_constraints: Dict[str, constraints.Constraint] = {}
+
+    def __init__(
+        self, base_distribution, reinterpreted_batch_ndims, validate_args=None
+    ):
+        if reinterpreted_batch_ndims > len(base_distribution.batch_shape):
+            raise ValueError(
+                "Expected reinterpreted_batch_ndims <= len(base_distribution.batch_shape), "
+                f"actual {reinterpreted_batch_ndims} vs {len(base_distribution.batch_shape)}"
+            )
+        shape = base_distribution.batch_shape + base_distribution.event_shape
+        event_dim = reinterpreted_batch_ndims + len(base_distribution.event_shape)
+        batch_shape = shape[: len(shape) - event_dim]
+        event_shape = shape[len(shape) - event_dim :]
+        self.base_dist = base_distribution
+        self.reinterpreted_batch_ndims = reinterpreted_batch_ndims
+        super().__init__(batch_shape, event_shape, validate_args=validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(Independent, _instance)
+        batch_shape = torch.Size(batch_shape)
+        new.base_dist = self.base_dist.expand(
+            batch_shape + self.event_shape[: self.reinterpreted_batch_ndims]
+        )
+        new.reinterpreted_batch_ndims = self.reinterpreted_batch_ndims
+        super(Independent, new).__init__(
+            batch_shape, self.event_shape, validate_args=False
+        )
+        new._validate_args = self._validate_args
+        return new
+
+    @property
+    def has_rsample(self):
+        return self.base_dist.has_rsample
+
+    @property
+    def has_enumerate_support(self):
+        if self.reinterpreted_batch_ndims > 0:
+            return False
+        return self.base_dist.has_enumerate_support
+
+    @constraints.dependent_property
+    def support(self):
+        result = self.base_dist.support
+        if self.reinterpreted_batch_ndims:
+            result = constraints.independent(result, self.reinterpreted_batch_ndims)
+        return result
+
+    @property
+    def mean(self):
+        return self.base_dist.mean
+
+    @property
+    def mode(self):
+        return self.base_dist.mode
+
+    @property
+    def variance(self):
+        return self.base_dist.variance
+
+    def sample(self, sample_shape=torch.Size()):
+        return self.base_dist.sample(sample_shape)
+
+    def rsample(self, sample_shape=torch.Size()):
+        return self.base_dist.rsample(sample_shape)
+
+    def log_prob(self, value):
+        log_prob = self.base_dist.log_prob(value)
+        return _sum_rightmost(log_prob, self.reinterpreted_batch_ndims)
+
+    def entropy(self):
+        entropy = self.base_dist.entropy()
+        return _sum_rightmost(entropy, self.reinterpreted_batch_ndims)
+
+    def enumerate_support(self, expand=True):
+        if self.reinterpreted_batch_ndims > 0:
+            raise NotImplementedError(
+                "Enumeration over cartesian product is not implemented"
+            )
+        return self.base_dist.enumerate_support(expand=expand)
+
+    def __repr__(self):
+        return (
+            self.__class__.__name__
+            + f"({self.base_dist}, {self.reinterpreted_batch_ndims})"
+        )
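+
+# A minimal sketch (illustrative only) of how reinterpreting batch dims changes
+# the shape of log_prob for the wrapper above:
+#
+#     >>> import torch
+#     >>> from torch.distributions import Independent, Normal
+#     >>> base = Normal(torch.zeros(5, 3), torch.ones(5, 3))
+#     >>> base.log_prob(torch.zeros(5, 3)).shape  # one log-density per element
+#     torch.Size([5, 3])
+#     >>> diag = Independent(base, 1)
+#     >>> diag.log_prob(torch.zeros(5, 3)).shape  # summed over the event dim
+#     torch.Size([5])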
diff --git a/MLPY/Lib/site-packages/torch/distributions/inverse_gamma.py b/MLPY/Lib/site-packages/torch/distributions/inverse_gamma.py
new file mode 100644
index 0000000000000000000000000000000000000000..418460df7100a34d4b0063b2aae92397a63fb673
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/inverse_gamma.py
@@ -0,0 +1,80 @@
+import torch
+from torch.distributions import constraints
+from torch.distributions.gamma import Gamma
+from torch.distributions.transformed_distribution import TransformedDistribution
+from torch.distributions.transforms import PowerTransform
+
+
+__all__ = ["InverseGamma"]
+
+
+class InverseGamma(TransformedDistribution):
+    r"""
+    Creates an inverse gamma distribution parameterized by :attr:`concentration` and :attr:`rate`
+    where::
+
+        X ~ Gamma(concentration, rate)
+        Y = 1 / X ~ InverseGamma(concentration, rate)
+
+    Example::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterinistic")
+        >>> m = InverseGamma(torch.tensor([2.0]), torch.tensor([3.0]))
+        >>> m.sample()
+        tensor([ 1.2953])
+
+    Args:
+        concentration (float or Tensor): shape parameter of the distribution
+            (often referred to as alpha)
+        rate (float or Tensor): rate = 1 / scale of the distribution
+            (often referred to as beta)
+    """
+    arg_constraints = {
+        "concentration": constraints.positive,
+        "rate": constraints.positive,
+    }
+    support = constraints.positive
+    has_rsample = True
+
+    def __init__(self, concentration, rate, validate_args=None):
+        base_dist = Gamma(concentration, rate, validate_args=validate_args)
+        neg_one = -base_dist.rate.new_ones(())
+        super().__init__(
+            base_dist, PowerTransform(neg_one), validate_args=validate_args
+        )
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(InverseGamma, _instance)
+        return super().expand(batch_shape, _instance=new)
+
+    @property
+    def concentration(self):
+        return self.base_dist.concentration
+
+    @property
+    def rate(self):
+        return self.base_dist.rate
+
+    @property
+    def mean(self):
+        result = self.rate / (self.concentration - 1)
+        return torch.where(self.concentration > 1, result, torch.inf)
+
+    @property
+    def mode(self):
+        return self.rate / (self.concentration + 1)
+
+    @property
+    def variance(self):
+        result = self.rate.square() / (
+            (self.concentration - 1).square() * (self.concentration - 2)
+        )
+        return torch.where(self.concentration > 2, result, torch.inf)
+
+    def entropy(self):
+        return (
+            self.concentration
+            + self.rate.log()
+            + self.concentration.lgamma()
+            - (1 + self.concentration) * self.concentration.digamma()
+        )
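+
+# A minimal sketch (illustrative only) of the guarded moments above: the mean
+# is rate / (concentration - 1) only when concentration > 1, and +inf otherwise.
+#
+#     >>> import torch
+#     >>> from torch.distributions import InverseGamma
+#     >>> d = InverseGamma(torch.tensor([0.5, 3.0]), torch.tensor([1.0, 1.0]))
+#     >>> torch.isinf(d.mean[0]).item(), d.mean[1].item()
+#     (True, 0.5)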
diff --git a/MLPY/Lib/site-packages/torch/distributions/kl.py b/MLPY/Lib/site-packages/torch/distributions/kl.py
new file mode 100644
index 0000000000000000000000000000000000000000..3fe67ea6cb56b6030a22369af71c2bc3e737620b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/kl.py
@@ -0,0 +1,971 @@
+import math
+import warnings
+from functools import total_ordering
+from typing import Callable, Dict, Tuple, Type
+
+import torch
+from torch import inf
+
+from .bernoulli import Bernoulli
+from .beta import Beta
+from .binomial import Binomial
+from .categorical import Categorical
+from .cauchy import Cauchy
+from .continuous_bernoulli import ContinuousBernoulli
+from .dirichlet import Dirichlet
+from .distribution import Distribution
+from .exp_family import ExponentialFamily
+from .exponential import Exponential
+from .gamma import Gamma
+from .geometric import Geometric
+from .gumbel import Gumbel
+from .half_normal import HalfNormal
+from .independent import Independent
+from .laplace import Laplace
+from .lowrank_multivariate_normal import (
+    _batch_lowrank_logdet,
+    _batch_lowrank_mahalanobis,
+    LowRankMultivariateNormal,
+)
+from .multivariate_normal import _batch_mahalanobis, MultivariateNormal
+from .normal import Normal
+from .one_hot_categorical import OneHotCategorical
+from .pareto import Pareto
+from .poisson import Poisson
+from .transformed_distribution import TransformedDistribution
+from .uniform import Uniform
+from .utils import _sum_rightmost, euler_constant as _euler_gamma
+
+_KL_REGISTRY: Dict[
+    Tuple[Type, Type], Callable
+] = {}  # Source of truth mapping a few general (type, type) pairs to functions.
+_KL_MEMOIZE: Dict[
+    Tuple[Type, Type], Callable
+] = {}  # Memoized version mapping many specific (type, type) pairs to functions.
+
+__all__ = ["register_kl", "kl_divergence"]
+
+
+def register_kl(type_p, type_q):
+    """
+    Decorator to register a pairwise function with :meth:`kl_divergence`.
+    Usage::
+
+        @register_kl(Normal, Normal)
+        def kl_normal_normal(p, q):
+            # insert implementation here
+
+    Lookup returns the most specific (type,type) match ordered by subclass. If
+    the match is ambiguous, a `RuntimeWarning` is raised. For example to
+    resolve the ambiguous situation::
+
+        @register_kl(BaseP, DerivedQ)
+        def kl_version1(p, q): ...
+        @register_kl(DerivedP, BaseQ)
+        def kl_version2(p, q): ...
+
+    you should register a third most-specific implementation, e.g.::
+
+        register_kl(DerivedP, DerivedQ)(kl_version1)  # Break the tie.
+
+    Args:
+        type_p (type): A subclass of :class:`~torch.distributions.Distribution`.
+        type_q (type): A subclass of :class:`~torch.distributions.Distribution`.
+    """
+    if not isinstance(type_p, type) or not issubclass(type_p, Distribution):
+        raise TypeError(
+            f"Expected type_p to be a Distribution subclass but got {type_p}"
+        )
+    if not isinstance(type_q, type) or not issubclass(type_q, Distribution):
+        raise TypeError(
+            f"Expected type_q to be a Distribution subclass but got {type_q}"
+        )
+
+    def decorator(fun):
+        _KL_REGISTRY[type_p, type_q] = fun
+        _KL_MEMOIZE.clear()  # reset since lookup order may have changed
+        return fun
+
+    return decorator
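+
+# A minimal registration sketch (illustrative only; MyP and MyQ are
+# hypothetical subclasses, not library names). The decorator above stores the
+# function in _KL_REGISTRY and kl_divergence dispatches to it:
+#
+#     >>> import torch
+#     >>> from torch.distributions import Normal, kl_divergence, register_kl
+#     >>> class MyP(Normal): pass
+#     >>> class MyQ(Normal): pass
+#     >>> @register_kl(MyP, MyQ)
+#     ... def _kl_myp_myq(p, q):
+#     ...     return torch.zeros(p.batch_shape)
+#     >>> kl_divergence(MyP(0.0, 1.0), MyQ(0.0, 1.0))
+#     tensor(0.)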
+
+
+@total_ordering
+class _Match:
+    __slots__ = ["types"]
+
+    def __init__(self, *types):
+        self.types = types
+
+    def __eq__(self, other):
+        return self.types == other.types
+
+    def __le__(self, other):
+        for x, y in zip(self.types, other.types):
+            if not issubclass(x, y):
+                return False
+            if x is not y:
+                break
+        return True
+
+
+def _dispatch_kl(type_p, type_q):
+    """
+    Find the most specific approximate match, assuming single inheritance.
+    """
+    matches = [
+        (super_p, super_q)
+        for super_p, super_q in _KL_REGISTRY
+        if issubclass(type_p, super_p) and issubclass(type_q, super_q)
+    ]
+    if not matches:
+        return NotImplemented
+    # Check that the left- and right- lexicographic orders agree.
+    # mypy isn't smart enough to know that _Match implements __lt__
+    # see: https://github.com/python/typing/issues/760#issuecomment-710670503
+    left_p, left_q = min(_Match(*m) for m in matches).types  # type: ignore[type-var]
+    right_q, right_p = min(_Match(*reversed(m)) for m in matches).types  # type: ignore[type-var]
+    left_fun = _KL_REGISTRY[left_p, left_q]
+    right_fun = _KL_REGISTRY[right_p, right_q]
+    if left_fun is not right_fun:
+        warnings.warn(
+            "Ambiguous kl_divergence({}, {}). Please register_kl({}, {})".format(
+                type_p.__name__, type_q.__name__, left_p.__name__, right_q.__name__
+            ),
+            RuntimeWarning,
+        )
+    return left_fun
+
+
+def _infinite_like(tensor):
+    """
+    Helper function for obtaining infinite KL Divergence throughout
+    """
+    return torch.full_like(tensor, inf)
+
+
+def _x_log_x(tensor):
+    """
+    Utility function for calculating x log x
+    """
+    return tensor * tensor.log()
+
+
+def _batch_trace_XXT(bmat):
+    """
+    Utility function for calculating the trace of XX^{T} with X having arbitrary leading batch dimensions
+    """
+    n = bmat.size(-1)
+    m = bmat.size(-2)
+    flat_trace = bmat.reshape(-1, m * n).pow(2).sum(-1)
+    return flat_trace.reshape(bmat.shape[:-2])
+
+
+def kl_divergence(p: Distribution, q: Distribution) -> torch.Tensor:
+    r"""
+    Compute Kullback-Leibler divergence :math:`KL(p \| q)` between two distributions.
+
+    .. math::
+
+        KL(p \| q) = \int p(x) \log\frac {p(x)} {q(x)} \,dx
+
+    Args:
+        p (Distribution): A :class:`~torch.distributions.Distribution` object.
+        q (Distribution): A :class:`~torch.distributions.Distribution` object.
+
+    Returns:
+        Tensor: A batch of KL divergences of shape `batch_shape`.
+
+    Raises:
+        NotImplementedError: If the distribution types have not been registered via
+            :meth:`register_kl`.
+    """
+    try:
+        fun = _KL_MEMOIZE[type(p), type(q)]
+    except KeyError:
+        fun = _dispatch_kl(type(p), type(q))
+        _KL_MEMOIZE[type(p), type(q)] = fun
+    if fun is NotImplemented:
+        raise NotImplementedError(
+            f"No KL(p || q) is implemented for p type {p.__class__.__name__} and q type {q.__class__.__name__}"
+        )
+    return fun(p, q)
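+
+# A minimal usage sketch (illustrative only) of the dispatch above, relying on
+# the Normal/Normal registration defined later in this file:
+#
+#     >>> import torch
+#     >>> from torch.distributions import Normal, kl_divergence
+#     >>> p = Normal(torch.tensor(0.0), torch.tensor(1.0))
+#     >>> q = Normal(torch.tensor(1.0), torch.tensor(2.0))
+#     >>> kl_divergence(p, q)  # 0.5 * (var_ratio + t1 - 1 - log(var_ratio))
+#     tensor(0.4431)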
+
+
+################################################################################
+# KL Divergence Implementations
+################################################################################
+
+# Same distributions
+
+
+@register_kl(Bernoulli, Bernoulli)
+def _kl_bernoulli_bernoulli(p, q):
+    t1 = p.probs * (
+        torch.nn.functional.softplus(-q.logits)
+        - torch.nn.functional.softplus(-p.logits)
+    )
+    t1[q.probs == 0] = inf
+    t1[p.probs == 0] = 0
+    t2 = (1 - p.probs) * (
+        torch.nn.functional.softplus(q.logits) - torch.nn.functional.softplus(p.logits)
+    )
+    t2[q.probs == 1] = inf
+    t2[p.probs == 1] = 0
+    return t1 + t2
+
+
+@register_kl(Beta, Beta)
+def _kl_beta_beta(p, q):
+    sum_params_p = p.concentration1 + p.concentration0
+    sum_params_q = q.concentration1 + q.concentration0
+    t1 = q.concentration1.lgamma() + q.concentration0.lgamma() + (sum_params_p).lgamma()
+    t2 = p.concentration1.lgamma() + p.concentration0.lgamma() + (sum_params_q).lgamma()
+    t3 = (p.concentration1 - q.concentration1) * torch.digamma(p.concentration1)
+    t4 = (p.concentration0 - q.concentration0) * torch.digamma(p.concentration0)
+    t5 = (sum_params_q - sum_params_p) * torch.digamma(sum_params_p)
+    return t1 - t2 + t3 + t4 + t5
+
+
+@register_kl(Binomial, Binomial)
+def _kl_binomial_binomial(p, q):
+    # from https://math.stackexchange.com/questions/2214993/
+    # kullback-leibler-divergence-for-binomial-distributions-p-and-q
+    if (p.total_count < q.total_count).any():
+        raise NotImplementedError(
+            "KL between Binomials where q.total_count > p.total_count is not implemented"
+        )
+    kl = p.total_count * (
+        p.probs * (p.logits - q.logits) + (-p.probs).log1p() - (-q.probs).log1p()
+    )
+    inf_idxs = p.total_count > q.total_count
+    kl[inf_idxs] = _infinite_like(kl[inf_idxs])
+    return kl
+
+
+@register_kl(Categorical, Categorical)
+def _kl_categorical_categorical(p, q):
+    t = p.probs * (p.logits - q.logits)
+    t[(q.probs == 0).expand_as(t)] = inf
+    t[(p.probs == 0).expand_as(t)] = 0
+    return t.sum(-1)
+
+
+@register_kl(ContinuousBernoulli, ContinuousBernoulli)
+def _kl_continuous_bernoulli_continuous_bernoulli(p, q):
+    t1 = p.mean * (p.logits - q.logits)
+    t2 = p._cont_bern_log_norm() + torch.log1p(-p.probs)
+    t3 = -q._cont_bern_log_norm() - torch.log1p(-q.probs)
+    return t1 + t2 + t3
+
+
+@register_kl(Dirichlet, Dirichlet)
+def _kl_dirichlet_dirichlet(p, q):
+    # From http://bariskurt.com/kullback-leibler-divergence-between-two-dirichlet-and-beta-distributions/
+    sum_p_concentration = p.concentration.sum(-1)
+    sum_q_concentration = q.concentration.sum(-1)
+    t1 = sum_p_concentration.lgamma() - sum_q_concentration.lgamma()
+    t2 = (p.concentration.lgamma() - q.concentration.lgamma()).sum(-1)
+    t3 = p.concentration - q.concentration
+    t4 = p.concentration.digamma() - sum_p_concentration.digamma().unsqueeze(-1)
+    return t1 - t2 + (t3 * t4).sum(-1)
+
+
+@register_kl(Exponential, Exponential)
+def _kl_exponential_exponential(p, q):
+    rate_ratio = q.rate / p.rate
+    t1 = -rate_ratio.log()
+    return t1 + rate_ratio - 1
+
+
+@register_kl(ExponentialFamily, ExponentialFamily)
+def _kl_expfamily_expfamily(p, q):
+    if type(p) != type(q):
+        raise NotImplementedError(
+            "The cross KL-divergence between different exponential families "
+            "cannot be computed using Bregman divergences"
+        )
+    p_nparams = [np.detach().requires_grad_() for np in p._natural_params]
+    q_nparams = q._natural_params
+    lg_normal = p._log_normalizer(*p_nparams)
+    gradients = torch.autograd.grad(lg_normal.sum(), p_nparams, create_graph=True)
+    result = q._log_normalizer(*q_nparams) - lg_normal
+    for pnp, qnp, g in zip(p_nparams, q_nparams, gradients):
+        term = (qnp - pnp) * g
+        result -= _sum_rightmost(term, len(q.event_shape))
+    return result
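+
+# For two members of the same exponential family with natural parameters
+# eta_p, eta_q and log-normalizer A(.), the divergence computed above is the
+# Bregman divergence of A:
+#     KL(p || q) = A(eta_q) - A(eta_p) - <eta_q - eta_p, grad A(eta_p)>,
+# with grad A(eta_p) obtained via autograd on the log-normalizer.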
+
+
+@register_kl(Gamma, Gamma)
+def _kl_gamma_gamma(p, q):
+    t1 = q.concentration * (p.rate / q.rate).log()
+    t2 = torch.lgamma(q.concentration) - torch.lgamma(p.concentration)
+    t3 = (p.concentration - q.concentration) * torch.digamma(p.concentration)
+    t4 = (q.rate - p.rate) * (p.concentration / p.rate)
+    return t1 + t2 + t3 + t4
+
+
+@register_kl(Gumbel, Gumbel)
+def _kl_gumbel_gumbel(p, q):
+    ct1 = p.scale / q.scale
+    ct2 = q.loc / q.scale
+    ct3 = p.loc / q.scale
+    t1 = -ct1.log() - ct2 + ct3
+    t2 = ct1 * _euler_gamma
+    t3 = torch.exp(ct2 + (1 + ct1).lgamma() - ct3)
+    return t1 + t2 + t3 - (1 + _euler_gamma)
+
+
+@register_kl(Geometric, Geometric)
+def _kl_geometric_geometric(p, q):
+    return -p.entropy() - torch.log1p(-q.probs) / p.probs - q.logits
+
+
+@register_kl(HalfNormal, HalfNormal)
+def _kl_halfnormal_halfnormal(p, q):
+    return _kl_normal_normal(p.base_dist, q.base_dist)
+
+
+@register_kl(Laplace, Laplace)
+def _kl_laplace_laplace(p, q):
+    # From http://www.mast.queensu.ca/~communications/Papers/gil-msc11.pdf
+    scale_ratio = p.scale / q.scale
+    loc_abs_diff = (p.loc - q.loc).abs()
+    t1 = -scale_ratio.log()
+    t2 = loc_abs_diff / q.scale
+    t3 = scale_ratio * torch.exp(-loc_abs_diff / p.scale)
+    return t1 + t2 + t3 - 1
+
+
+@register_kl(LowRankMultivariateNormal, LowRankMultivariateNormal)
+def _kl_lowrankmultivariatenormal_lowrankmultivariatenormal(p, q):
+    if p.event_shape != q.event_shape:
+        raise ValueError(
+            "KL-divergence between two Low Rank Multivariate Normals with\
+                          different event shapes cannot be computed"
+        )
+
+    term1 = _batch_lowrank_logdet(
+        q._unbroadcasted_cov_factor, q._unbroadcasted_cov_diag, q._capacitance_tril
+    ) - _batch_lowrank_logdet(
+        p._unbroadcasted_cov_factor, p._unbroadcasted_cov_diag, p._capacitance_tril
+    )
+    term3 = _batch_lowrank_mahalanobis(
+        q._unbroadcasted_cov_factor,
+        q._unbroadcasted_cov_diag,
+        q.loc - p.loc,
+        q._capacitance_tril,
+    )
+    # Expands term2 according to
+    # inv(qcov) @ pcov = [inv(qD) - inv(qD) @ qW @ inv(qC) @ qW.T @ inv(qD)] @ (pW @ pW.T + pD)
+    #                  = [inv(qD) - A.T @ A] @ (pD + pW @ pW.T)
+    qWt_qDinv = q._unbroadcasted_cov_factor.mT / q._unbroadcasted_cov_diag.unsqueeze(-2)
+    A = torch.linalg.solve_triangular(q._capacitance_tril, qWt_qDinv, upper=False)
+    term21 = (p._unbroadcasted_cov_diag / q._unbroadcasted_cov_diag).sum(-1)
+    term22 = _batch_trace_XXT(
+        p._unbroadcasted_cov_factor * q._unbroadcasted_cov_diag.rsqrt().unsqueeze(-1)
+    )
+    term23 = _batch_trace_XXT(A * p._unbroadcasted_cov_diag.sqrt().unsqueeze(-2))
+    term24 = _batch_trace_XXT(A.matmul(p._unbroadcasted_cov_factor))
+    term2 = term21 + term22 - term23 - term24
+    return 0.5 * (term1 + term2 + term3 - p.event_shape[0])
+
+
+@register_kl(MultivariateNormal, LowRankMultivariateNormal)
+def _kl_multivariatenormal_lowrankmultivariatenormal(p, q):
+    if p.event_shape != q.event_shape:
+        raise ValueError(
+            "KL-divergence between two (Low Rank) Multivariate Normals with\
+                          different event shapes cannot be computed"
+        )
+
+    term1 = _batch_lowrank_logdet(
+        q._unbroadcasted_cov_factor, q._unbroadcasted_cov_diag, q._capacitance_tril
+    ) - 2 * p._unbroadcasted_scale_tril.diagonal(dim1=-2, dim2=-1).log().sum(-1)
+    term3 = _batch_lowrank_mahalanobis(
+        q._unbroadcasted_cov_factor,
+        q._unbroadcasted_cov_diag,
+        q.loc - p.loc,
+        q._capacitance_tril,
+    )
+    # Expands term2 according to
+    # inv(qcov) @ pcov = [inv(qD) - inv(qD) @ qW @ inv(qC) @ qW.T @ inv(qD)] @ p_tril @ p_tril.T
+    #                  = [inv(qD) - A.T @ A] @ p_tril @ p_tril.T
+    qWt_qDinv = q._unbroadcasted_cov_factor.mT / q._unbroadcasted_cov_diag.unsqueeze(-2)
+    A = torch.linalg.solve_triangular(q._capacitance_tril, qWt_qDinv, upper=False)
+    term21 = _batch_trace_XXT(
+        p._unbroadcasted_scale_tril * q._unbroadcasted_cov_diag.rsqrt().unsqueeze(-1)
+    )
+    term22 = _batch_trace_XXT(A.matmul(p._unbroadcasted_scale_tril))
+    term2 = term21 - term22
+    return 0.5 * (term1 + term2 + term3 - p.event_shape[0])
+
+
+@register_kl(LowRankMultivariateNormal, MultivariateNormal)
+def _kl_lowrankmultivariatenormal_multivariatenormal(p, q):
+    if p.event_shape != q.event_shape:
+        raise ValueError(
+            "KL-divergence between two (Low Rank) Multivariate Normals with\
+                          different event shapes cannot be computed"
+        )
+
+    term1 = 2 * q._unbroadcasted_scale_tril.diagonal(dim1=-2, dim2=-1).log().sum(
+        -1
+    ) - _batch_lowrank_logdet(
+        p._unbroadcasted_cov_factor, p._unbroadcasted_cov_diag, p._capacitance_tril
+    )
+    term3 = _batch_mahalanobis(q._unbroadcasted_scale_tril, (q.loc - p.loc))
+    # Expands term2 according to
+    # inv(qcov) @ pcov = inv(q_tril @ q_tril.T) @ (pW @ pW.T + pD)
+    combined_batch_shape = torch._C._infer_size(
+        q._unbroadcasted_scale_tril.shape[:-2], p._unbroadcasted_cov_factor.shape[:-2]
+    )
+    n = p.event_shape[0]
+    q_scale_tril = q._unbroadcasted_scale_tril.expand(combined_batch_shape + (n, n))
+    p_cov_factor = p._unbroadcasted_cov_factor.expand(
+        combined_batch_shape + (n, p.cov_factor.size(-1))
+    )
+    p_cov_diag = torch.diag_embed(p._unbroadcasted_cov_diag.sqrt()).expand(
+        combined_batch_shape + (n, n)
+    )
+    term21 = _batch_trace_XXT(
+        torch.linalg.solve_triangular(q_scale_tril, p_cov_factor, upper=False)
+    )
+    term22 = _batch_trace_XXT(
+        torch.linalg.solve_triangular(q_scale_tril, p_cov_diag, upper=False)
+    )
+    term2 = term21 + term22
+    return 0.5 * (term1 + term2 + term3 - p.event_shape[0])
+
+
+@register_kl(MultivariateNormal, MultivariateNormal)
+def _kl_multivariatenormal_multivariatenormal(p, q):
+    # From https://en.wikipedia.org/wiki/Multivariate_normal_distribution#Kullback%E2%80%93Leibler_divergence
+    if p.event_shape != q.event_shape:
+        raise ValueError(
+            "KL-divergence between two Multivariate Normals with\
+                          different event shapes cannot be computed"
+        )
+
+    half_term1 = q._unbroadcasted_scale_tril.diagonal(dim1=-2, dim2=-1).log().sum(
+        -1
+    ) - p._unbroadcasted_scale_tril.diagonal(dim1=-2, dim2=-1).log().sum(-1)
+    combined_batch_shape = torch._C._infer_size(
+        q._unbroadcasted_scale_tril.shape[:-2], p._unbroadcasted_scale_tril.shape[:-2]
+    )
+    n = p.event_shape[0]
+    q_scale_tril = q._unbroadcasted_scale_tril.expand(combined_batch_shape + (n, n))
+    p_scale_tril = p._unbroadcasted_scale_tril.expand(combined_batch_shape + (n, n))
+    term2 = _batch_trace_XXT(
+        torch.linalg.solve_triangular(q_scale_tril, p_scale_tril, upper=False)
+    )
+    term3 = _batch_mahalanobis(q._unbroadcasted_scale_tril, (q.loc - p.loc))
+    return half_term1 + 0.5 * (term2 + term3 - n)
+
+
+@register_kl(Normal, Normal)
+def _kl_normal_normal(p, q):
+    var_ratio = (p.scale / q.scale).pow(2)
+    t1 = ((p.loc - q.loc) / q.scale).pow(2)
+    return 0.5 * (var_ratio + t1 - 1 - var_ratio.log())
+
+
+@register_kl(OneHotCategorical, OneHotCategorical)
+def _kl_onehotcategorical_onehotcategorical(p, q):
+    return _kl_categorical_categorical(p._categorical, q._categorical)
+
+
+@register_kl(Pareto, Pareto)
+def _kl_pareto_pareto(p, q):
+    # From http://www.mast.queensu.ca/~communications/Papers/gil-msc11.pdf
+    scale_ratio = p.scale / q.scale
+    alpha_ratio = q.alpha / p.alpha
+    t1 = q.alpha * scale_ratio.log()
+    t2 = -alpha_ratio.log()
+    result = t1 + t2 + alpha_ratio - 1
+    result[p.support.lower_bound < q.support.lower_bound] = inf
+    return result
+
+
+@register_kl(Poisson, Poisson)
+def _kl_poisson_poisson(p, q):
+    return p.rate * (p.rate.log() - q.rate.log()) - (p.rate - q.rate)
+
+
+@register_kl(TransformedDistribution, TransformedDistribution)
+def _kl_transformed_transformed(p, q):
+    if p.transforms != q.transforms:
+        raise NotImplementedError
+    if p.event_shape != q.event_shape:
+        raise NotImplementedError
+    return kl_divergence(p.base_dist, q.base_dist)
+
+
+@register_kl(Uniform, Uniform)
+def _kl_uniform_uniform(p, q):
+    result = ((q.high - q.low) / (p.high - p.low)).log()
+    result[(q.low > p.low) | (q.high < p.high)] = inf
+    return result
+
+
+# Different distributions
+@register_kl(Bernoulli, Poisson)
+def _kl_bernoulli_poisson(p, q):
+    return -p.entropy() - (p.probs * q.rate.log() - q.rate)
+
+
+@register_kl(Beta, ContinuousBernoulli)
+def _kl_beta_continuous_bernoulli(p, q):
+    return (
+        -p.entropy()
+        - p.mean * q.logits
+        - torch.log1p(-q.probs)
+        - q._cont_bern_log_norm()
+    )
+
+
+@register_kl(Beta, Pareto)
+def _kl_beta_infinity(p, q):
+    return _infinite_like(p.concentration1)
+
+
+@register_kl(Beta, Exponential)
+def _kl_beta_exponential(p, q):
+    return (
+        -p.entropy()
+        - q.rate.log()
+        + q.rate * (p.concentration1 / (p.concentration1 + p.concentration0))
+    )
+
+
+@register_kl(Beta, Gamma)
+def _kl_beta_gamma(p, q):
+    t1 = -p.entropy()
+    t2 = q.concentration.lgamma() - q.concentration * q.rate.log()
+    t3 = (q.concentration - 1) * (
+        p.concentration1.digamma() - (p.concentration1 + p.concentration0).digamma()
+    )
+    t4 = q.rate * p.concentration1 / (p.concentration1 + p.concentration0)
+    return t1 + t2 - t3 + t4
+
+
+# TODO: Add Beta-Laplace KL Divergence
+
+
+@register_kl(Beta, Normal)
+def _kl_beta_normal(p, q):
+    E_beta = p.concentration1 / (p.concentration1 + p.concentration0)
+    var_normal = q.scale.pow(2)
+    t1 = -p.entropy()
+    t2 = 0.5 * (var_normal * 2 * math.pi).log()
+    t3 = (
+        E_beta * (1 - E_beta) / (p.concentration1 + p.concentration0 + 1)
+        + E_beta.pow(2)
+    ) * 0.5
+    t4 = q.loc * E_beta
+    t5 = q.loc.pow(2) * 0.5
+    return t1 + t2 + (t3 - t4 + t5) / var_normal
+
+
+@register_kl(Beta, Uniform)
+def _kl_beta_uniform(p, q):
+    result = -p.entropy() + (q.high - q.low).log()
+    result[(q.low > p.support.lower_bound) | (q.high < p.support.upper_bound)] = inf
+    return result
+
+
+# Note that the KL between a ContinuousBernoulli and Beta has no closed form
+
+
+@register_kl(ContinuousBernoulli, Pareto)
+def _kl_continuous_bernoulli_infinity(p, q):
+    return _infinite_like(p.probs)
+
+
+@register_kl(ContinuousBernoulli, Exponential)
+def _kl_continuous_bernoulli_exponential(p, q):
+    return -p.entropy() - torch.log(q.rate) + q.rate * p.mean
+
+
+# Note that the KL between a ContinuousBernoulli and Gamma has no closed form
+# TODO: Add ContinuousBernoulli-Laplace KL Divergence
+
+
+@register_kl(ContinuousBernoulli, Normal)
+def _kl_continuous_bernoulli_normal(p, q):
+    t1 = -p.entropy()
+    t2 = 0.5 * (math.log(2.0 * math.pi) + torch.square(q.loc / q.scale)) + torch.log(
+        q.scale
+    )
+    t3 = (p.variance + torch.square(p.mean) - 2.0 * q.loc * p.mean) / (
+        2.0 * torch.square(q.scale)
+    )
+    return t1 + t2 + t3
+
+
+@register_kl(ContinuousBernoulli, Uniform)
+def _kl_continuous_bernoulli_uniform(p, q):
+    result = -p.entropy() + (q.high - q.low).log()
+    return torch.where(
+        torch.max(
+            torch.ge(q.low, p.support.lower_bound),
+            torch.le(q.high, p.support.upper_bound),
+        ),
+        torch.ones_like(result) * inf,
+        result,
+    )
+
+
+@register_kl(Exponential, Beta)
+@register_kl(Exponential, ContinuousBernoulli)
+@register_kl(Exponential, Pareto)
+@register_kl(Exponential, Uniform)
+def _kl_exponential_infinity(p, q):
+    return _infinite_like(p.rate)
+
+
+@register_kl(Exponential, Gamma)
+def _kl_exponential_gamma(p, q):
+    ratio = q.rate / p.rate
+    t1 = -q.concentration * torch.log(ratio)
+    return (
+        t1
+        + ratio
+        + q.concentration.lgamma()
+        + q.concentration * _euler_gamma
+        - (1 + _euler_gamma)
+    )
+
+
+@register_kl(Exponential, Gumbel)
+def _kl_exponential_gumbel(p, q):
+    scale_rate_prod = p.rate * q.scale
+    loc_scale_ratio = q.loc / q.scale
+    t1 = scale_rate_prod.log() - 1
+    t2 = torch.exp(loc_scale_ratio) * scale_rate_prod / (scale_rate_prod + 1)
+    t3 = scale_rate_prod.reciprocal()
+    return t1 - loc_scale_ratio + t2 + t3
+
+
+# TODO: Add Exponential-Laplace KL Divergence
+
+
+@register_kl(Exponential, Normal)
+def _kl_exponential_normal(p, q):
+    var_normal = q.scale.pow(2)
+    rate_sqr = p.rate.pow(2)
+    t1 = 0.5 * torch.log(rate_sqr * var_normal * 2 * math.pi)
+    t2 = rate_sqr.reciprocal()
+    t3 = q.loc / p.rate
+    t4 = q.loc.pow(2) * 0.5
+    return t1 - 1 + (t2 - t3 + t4) / var_normal
+
+
+@register_kl(Gamma, Beta)
+@register_kl(Gamma, ContinuousBernoulli)
+@register_kl(Gamma, Pareto)
+@register_kl(Gamma, Uniform)
+def _kl_gamma_infinity(p, q):
+    return _infinite_like(p.concentration)
+
+
+@register_kl(Gamma, Exponential)
+def _kl_gamma_exponential(p, q):
+    return -p.entropy() - q.rate.log() + q.rate * p.concentration / p.rate
+
+
+@register_kl(Gamma, Gumbel)
+def _kl_gamma_gumbel(p, q):
+    beta_scale_prod = p.rate * q.scale
+    loc_scale_ratio = q.loc / q.scale
+    t1 = (
+        (p.concentration - 1) * p.concentration.digamma()
+        - p.concentration.lgamma()
+        - p.concentration
+    )
+    t2 = beta_scale_prod.log() + p.concentration / beta_scale_prod
+    t3 = (
+        torch.exp(loc_scale_ratio)
+        * (1 + beta_scale_prod.reciprocal()).pow(-p.concentration)
+        - loc_scale_ratio
+    )
+    return t1 + t2 + t3
+
+
+# TODO: Add Gamma-Laplace KL Divergence
+
+
+@register_kl(Gamma, Normal)
+def _kl_gamma_normal(p, q):
+    var_normal = q.scale.pow(2)
+    beta_sqr = p.rate.pow(2)
+    t1 = (
+        0.5 * torch.log(beta_sqr * var_normal * 2 * math.pi)
+        - p.concentration
+        - p.concentration.lgamma()
+    )
+    t2 = 0.5 * (p.concentration.pow(2) + p.concentration) / beta_sqr
+    t3 = q.loc * p.concentration / p.rate
+    t4 = 0.5 * q.loc.pow(2)
+    return (
+        t1
+        + (p.concentration - 1) * p.concentration.digamma()
+        + (t2 - t3 + t4) / var_normal
+    )
+
+
+@register_kl(Gumbel, Beta)
+@register_kl(Gumbel, ContinuousBernoulli)
+@register_kl(Gumbel, Exponential)
+@register_kl(Gumbel, Gamma)
+@register_kl(Gumbel, Pareto)
+@register_kl(Gumbel, Uniform)
+def _kl_gumbel_infinity(p, q):
+    return _infinite_like(p.loc)
+
+
+# TODO: Add Gumbel-Laplace KL Divergence
+
+
+@register_kl(Gumbel, Normal)
+def _kl_gumbel_normal(p, q):
+    param_ratio = p.scale / q.scale
+    t1 = (param_ratio / math.sqrt(2 * math.pi)).log()
+    t2 = (math.pi * param_ratio * 0.5).pow(2) / 3
+    t3 = ((p.loc + p.scale * _euler_gamma - q.loc) / q.scale).pow(2) * 0.5
+    return -t1 + t2 + t3 - (_euler_gamma + 1)
+
+
+@register_kl(Laplace, Beta)
+@register_kl(Laplace, ContinuousBernoulli)
+@register_kl(Laplace, Exponential)
+@register_kl(Laplace, Gamma)
+@register_kl(Laplace, Pareto)
+@register_kl(Laplace, Uniform)
+def _kl_laplace_infinity(p, q):
+    return _infinite_like(p.loc)
+
+
+@register_kl(Laplace, Normal)
+def _kl_laplace_normal(p, q):
+    var_normal = q.scale.pow(2)
+    scale_sqr_var_ratio = p.scale.pow(2) / var_normal
+    t1 = 0.5 * torch.log(2 * scale_sqr_var_ratio / math.pi)
+    t2 = 0.5 * p.loc.pow(2)
+    t3 = p.loc * q.loc
+    t4 = 0.5 * q.loc.pow(2)
+    return -t1 + scale_sqr_var_ratio + (t2 - t3 + t4) / var_normal - 1
+
+
+@register_kl(Normal, Beta)
+@register_kl(Normal, ContinuousBernoulli)
+@register_kl(Normal, Exponential)
+@register_kl(Normal, Gamma)
+@register_kl(Normal, Pareto)
+@register_kl(Normal, Uniform)
+def _kl_normal_infinity(p, q):
+    return _infinite_like(p.loc)
+
+
+@register_kl(Normal, Gumbel)
+def _kl_normal_gumbel(p, q):
+    mean_scale_ratio = p.loc / q.scale
+    var_scale_sqr_ratio = (p.scale / q.scale).pow(2)
+    loc_scale_ratio = q.loc / q.scale
+    t1 = var_scale_sqr_ratio.log() * 0.5
+    t2 = mean_scale_ratio - loc_scale_ratio
+    t3 = torch.exp(-mean_scale_ratio + 0.5 * var_scale_sqr_ratio + loc_scale_ratio)
+    return -t1 + t2 + t3 - (0.5 * (1 + math.log(2 * math.pi)))
+
+
+@register_kl(Normal, Laplace)
+def _kl_normal_laplace(p, q):
+    loc_diff = p.loc - q.loc
+    scale_ratio = p.scale / q.scale
+    loc_diff_scale_ratio = loc_diff / p.scale
+    t1 = torch.log(scale_ratio)
+    t2 = (
+        math.sqrt(2 / math.pi) * p.scale * torch.exp(-0.5 * loc_diff_scale_ratio.pow(2))
+    )
+    t3 = loc_diff * torch.erf(math.sqrt(0.5) * loc_diff_scale_ratio)
+    return -t1 + (t2 + t3) / q.scale - (0.5 * (1 + math.log(0.5 * math.pi)))
+
+
+@register_kl(Pareto, Beta)
+@register_kl(Pareto, ContinuousBernoulli)
+@register_kl(Pareto, Uniform)
+def _kl_pareto_infinity(p, q):
+    return _infinite_like(p.scale)
+
+
+@register_kl(Pareto, Exponential)
+def _kl_pareto_exponential(p, q):
+    scale_rate_prod = p.scale * q.rate
+    t1 = (p.alpha / scale_rate_prod).log()
+    t2 = p.alpha.reciprocal()
+    t3 = p.alpha * scale_rate_prod / (p.alpha - 1)
+    result = t1 - t2 + t3 - 1
+    result[p.alpha <= 1] = inf
+    return result
+
+
+@register_kl(Pareto, Gamma)
+def _kl_pareto_gamma(p, q):
+    common_term = p.scale.log() + p.alpha.reciprocal()
+    t1 = p.alpha.log() - common_term
+    t2 = q.concentration.lgamma() - q.concentration * q.rate.log()
+    t3 = (1 - q.concentration) * common_term
+    t4 = q.rate * p.alpha * p.scale / (p.alpha - 1)
+    result = t1 + t2 + t3 + t4 - 1
+    result[p.alpha <= 1] = inf
+    return result
+
+
+# TODO: Add Pareto-Laplace KL Divergence
+
+
+@register_kl(Pareto, Normal)
+def _kl_pareto_normal(p, q):
+    var_normal = 2 * q.scale.pow(2)
+    common_term = p.scale / (p.alpha - 1)
+    t1 = (math.sqrt(2 * math.pi) * q.scale * p.alpha / p.scale).log()
+    t2 = p.alpha.reciprocal()
+    t3 = p.alpha * common_term.pow(2) / (p.alpha - 2)
+    t4 = (p.alpha * common_term - q.loc).pow(2)
+    result = t1 - t2 + (t3 + t4) / var_normal - 1
+    result[p.alpha <= 2] = inf
+    return result
+
+
+@register_kl(Poisson, Bernoulli)
+@register_kl(Poisson, Binomial)
+def _kl_poisson_infinity(p, q):
+    return _infinite_like(p.rate)
+
+
+@register_kl(Uniform, Beta)
+def _kl_uniform_beta(p, q):
+    common_term = p.high - p.low
+    t1 = torch.log(common_term)
+    t2 = (
+        (q.concentration1 - 1)
+        * (_x_log_x(p.high) - _x_log_x(p.low) - common_term)
+        / common_term
+    )
+    t3 = (
+        (q.concentration0 - 1)
+        * (_x_log_x(1 - p.high) - _x_log_x(1 - p.low) + common_term)
+        / common_term
+    )
+    t4 = (
+        q.concentration1.lgamma()
+        + q.concentration0.lgamma()
+        - (q.concentration1 + q.concentration0).lgamma()
+    )
+    result = t3 + t4 - t1 - t2
+    result[(p.high > q.support.upper_bound) | (p.low < q.support.lower_bound)] = inf
+    return result
+
+
+@register_kl(Uniform, ContinuousBernoulli)
+def _kl_uniform_continuous_bernoulli(p, q):
+    result = (
+        -p.entropy()
+        - p.mean * q.logits
+        - torch.log1p(-q.probs)
+        - q._cont_bern_log_norm()
+    )
+    return torch.where(
+        torch.max(
+            torch.ge(p.high, q.support.upper_bound),
+            torch.le(p.low, q.support.lower_bound),
+        ),
+        torch.ones_like(result) * inf,
+        result,
+    )
+
+
+@register_kl(Uniform, Exponential)
+def _kl_uniform_exponential(p, q):
+    result = q.rate * (p.high + p.low) / 2 - ((p.high - p.low) * q.rate).log()
+    result[p.low < q.support.lower_bound] = inf
+    return result
+
+
+@register_kl(Uniform, Gamma)
+def _kl_uniform_gamma(p, q):
+    common_term = p.high - p.low
+    t1 = common_term.log()
+    t2 = q.concentration.lgamma() - q.concentration * q.rate.log()
+    t3 = (
+        (1 - q.concentration)
+        * (_x_log_x(p.high) - _x_log_x(p.low) - common_term)
+        / common_term
+    )
+    t4 = q.rate * (p.high + p.low) / 2
+    result = -t1 + t2 + t3 + t4
+    result[p.low < q.support.lower_bound] = inf
+    return result
+
+
+@register_kl(Uniform, Gumbel)
+def _kl_uniform_gumbel(p, q):
+    common_term = q.scale / (p.high - p.low)
+    high_loc_diff = (p.high - q.loc) / q.scale
+    low_loc_diff = (p.low - q.loc) / q.scale
+    t1 = common_term.log() + 0.5 * (high_loc_diff + low_loc_diff)
+    t2 = common_term * (torch.exp(-high_loc_diff) - torch.exp(-low_loc_diff))
+    return t1 - t2
+
+
+# TODO: Uniform-Laplace KL Divergence
+
+
+@register_kl(Uniform, Normal)
+def _kl_uniform_normal(p, q):
+    common_term = p.high - p.low
+    t1 = (math.sqrt(math.pi * 2) * q.scale / common_term).log()
+    t2 = (common_term).pow(2) / 12
+    t3 = ((p.high + p.low - 2 * q.loc) / 2).pow(2)
+    return t1 + 0.5 * (t2 + t3) / q.scale.pow(2)
+
+
+@register_kl(Uniform, Pareto)
+def _kl_uniform_pareto(p, q):
+    support_uniform = p.high - p.low
+    t1 = (q.alpha * q.scale.pow(q.alpha) * (support_uniform)).log()
+    t2 = (_x_log_x(p.high) - _x_log_x(p.low) - support_uniform) / support_uniform
+    result = t2 * (q.alpha + 1) - t1
+    result[p.low < q.support.lower_bound] = inf
+    return result
+
+
+@register_kl(Independent, Independent)
+def _kl_independent_independent(p, q):
+    if p.reinterpreted_batch_ndims != q.reinterpreted_batch_ndims:
+        raise NotImplementedError
+    result = kl_divergence(p.base_dist, q.base_dist)
+    return _sum_rightmost(result, p.reinterpreted_batch_ndims)
+
+
+@register_kl(Cauchy, Cauchy)
+def _kl_cauchy_cauchy(p, q):
+    # From https://arxiv.org/abs/1905.10965
+    t1 = ((p.scale + q.scale).pow(2) + (p.loc - q.loc).pow(2)).log()
+    t2 = (4 * p.scale * q.scale).log()
+    return t1 - t2
+
+
+def _add_kl_info():
+    """Appends a list of implemented KL functions to the doc for kl_divergence."""
+    rows = [
+        "KL divergence is currently implemented for the following distribution pairs:"
+    ]
+    for p, q in sorted(
+        _KL_REGISTRY, key=lambda p_q: (p_q[0].__name__, p_q[1].__name__)
+    ):
+        rows.append(
+            f"* :class:`~torch.distributions.{p.__name__}` and :class:`~torch.distributions.{q.__name__}`"
+        )
+    kl_info = "\n\t".join(rows)
+    if kl_divergence.__doc__:
+        kl_divergence.__doc__ += kl_info  # type: ignore[operator]
diff --git a/MLPY/Lib/site-packages/torch/distributions/kumaraswamy.py b/MLPY/Lib/site-packages/torch/distributions/kumaraswamy.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a07e72b67c9a442ee89a8007b7843802674038a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/kumaraswamy.py
@@ -0,0 +1,97 @@
+import torch
+from torch import nan
+from torch.distributions import constraints
+from torch.distributions.transformed_distribution import TransformedDistribution
+from torch.distributions.transforms import AffineTransform, PowerTransform
+from torch.distributions.uniform import Uniform
+from torch.distributions.utils import broadcast_all, euler_constant
+
+__all__ = ["Kumaraswamy"]
+
+
+def _moments(a, b, n):
+    """
+    Computes the n-th moment of the Kumaraswamy distribution using torch.lgamma
+    """
+    arg1 = 1 + n / a
+    log_value = torch.lgamma(arg1) + torch.lgamma(b) - torch.lgamma(arg1 + b)
+    return b * torch.exp(log_value)
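+
+# The helper above evaluates the closed-form moment
+#     E[X^n] = b * B(1 + n / a, b)
+#            = b * Gamma(1 + n / a) * Gamma(b) / Gamma(1 + n / a + b)
+# in log-space for numerical stability. A quick check (illustrative only):
+# with a = b = 1 the distribution reduces to Uniform(0, 1), whose first
+# moment is 0.5.
+#
+#     >>> import torch
+#     >>> _moments(torch.tensor(1.0), torch.tensor(1.0), 1)
+#     tensor(0.5000)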
+
+
+class Kumaraswamy(TransformedDistribution):
+    r"""
+    Samples from a Kumaraswamy distribution.
+
+    Example::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = Kumaraswamy(torch.tensor([1.0]), torch.tensor([1.0]))
+        >>> m.sample()  # sample from a Kumaraswamy distribution with concentration alpha=1 and beta=1
+        tensor([ 0.1729])
+
+    Args:
+        concentration1 (float or Tensor): 1st concentration parameter of the distribution
+            (often referred to as alpha)
+        concentration0 (float or Tensor): 2nd concentration parameter of the distribution
+            (often referred to as beta)
+    """
+    arg_constraints = {
+        "concentration1": constraints.positive,
+        "concentration0": constraints.positive,
+    }
+    support = constraints.unit_interval
+    has_rsample = True
+
+    def __init__(self, concentration1, concentration0, validate_args=None):
+        self.concentration1, self.concentration0 = broadcast_all(
+            concentration1, concentration0
+        )
+        finfo = torch.finfo(self.concentration0.dtype)
+        base_dist = Uniform(
+            torch.full_like(self.concentration0, 0),
+            torch.full_like(self.concentration0, 1),
+            validate_args=validate_args,
+        )
+        transforms = [
+            PowerTransform(exponent=self.concentration0.reciprocal()),
+            AffineTransform(loc=1.0, scale=-1.0),
+            PowerTransform(exponent=self.concentration1.reciprocal()),
+        ]
+        super().__init__(base_dist, transforms, validate_args=validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(Kumaraswamy, _instance)
+        new.concentration1 = self.concentration1.expand(batch_shape)
+        new.concentration0 = self.concentration0.expand(batch_shape)
+        return super().expand(batch_shape, _instance=new)
+
+    @property
+    def mean(self):
+        return _moments(self.concentration1, self.concentration0, 1)
+
+    @property
+    def mode(self):
+        # Evaluate in log-space for numerical stability.
+        log_mode = (
+            self.concentration0.reciprocal() * (-self.concentration0).log1p()
+            - (-self.concentration0 * self.concentration1).log1p()
+        )
+        log_mode[(self.concentration0 < 1) | (self.concentration1 < 1)] = nan
+        return log_mode.exp()
+
+    @property
+    def variance(self):
+        return _moments(self.concentration1, self.concentration0, 2) - torch.pow(
+            self.mean, 2
+        )
+
+    def entropy(self):
+        t1 = 1 - self.concentration1.reciprocal()
+        t0 = 1 - self.concentration0.reciprocal()
+        H0 = torch.digamma(self.concentration0 + 1) + euler_constant
+        return (
+            t0
+            + t1 * H0
+            - torch.log(self.concentration1)
+            - torch.log(self.concentration0)
+        )
diff --git a/MLPY/Lib/site-packages/torch/distributions/laplace.py b/MLPY/Lib/site-packages/torch/distributions/laplace.py
new file mode 100644
index 0000000000000000000000000000000000000000..64bf2e3937cd2ea705aee512e12f4ce70190a649
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/laplace.py
@@ -0,0 +1,94 @@
+from numbers import Number
+
+import torch
+from torch.distributions import constraints
+from torch.distributions.distribution import Distribution
+from torch.distributions.utils import broadcast_all
+
+__all__ = ["Laplace"]
+
+
+class Laplace(Distribution):
+    r"""
+    Creates a Laplace distribution parameterized by :attr:`loc` and :attr:`scale`.
+
+    Example::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = Laplace(torch.tensor([0.0]), torch.tensor([1.0]))
+        >>> m.sample()  # Laplace distributed with loc=0, scale=1
+        tensor([ 0.1046])
+
+    Args:
+        loc (float or Tensor): mean of the distribution
+        scale (float or Tensor): scale of the distribution
+    """
+    arg_constraints = {"loc": constraints.real, "scale": constraints.positive}
+    support = constraints.real
+    has_rsample = True
+
+    @property
+    def mean(self):
+        return self.loc
+
+    @property
+    def mode(self):
+        return self.loc
+
+    @property
+    def variance(self):
+        return 2 * self.scale.pow(2)
+
+    @property
+    def stddev(self):
+        return (2**0.5) * self.scale
+
+    def __init__(self, loc, scale, validate_args=None):
+        self.loc, self.scale = broadcast_all(loc, scale)
+        if isinstance(loc, Number) and isinstance(scale, Number):
+            batch_shape = torch.Size()
+        else:
+            batch_shape = self.loc.size()
+        super().__init__(batch_shape, validate_args=validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(Laplace, _instance)
+        batch_shape = torch.Size(batch_shape)
+        new.loc = self.loc.expand(batch_shape)
+        new.scale = self.scale.expand(batch_shape)
+        super(Laplace, new).__init__(batch_shape, validate_args=False)
+        new._validate_args = self._validate_args
+        return new
+
+    def rsample(self, sample_shape=torch.Size()):
+        shape = self._extended_shape(sample_shape)
+        finfo = torch.finfo(self.loc.dtype)
+        if torch._C._get_tracing_state():
+            # [JIT WORKAROUND] lack of support for .uniform_()
+            u = torch.rand(shape, dtype=self.loc.dtype, device=self.loc.device) * 2 - 1
+            return self.loc - self.scale * u.sign() * torch.log1p(
+                -u.abs().clamp(min=finfo.tiny)
+            )
+        u = self.loc.new(shape).uniform_(finfo.eps - 1, 1)
+        # TODO: If we ever implement tensor.nextafter, below is what we want ideally.
+        # u = self.loc.new(shape).uniform_(self.loc.nextafter(-.5, 0), .5)
+        return self.loc - self.scale * u.sign() * torch.log1p(-u.abs())
+
+    def log_prob(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        return -torch.log(2 * self.scale) - torch.abs(value - self.loc) / self.scale
+
+    def cdf(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        return 0.5 - 0.5 * (value - self.loc).sign() * torch.expm1(
+            -(value - self.loc).abs() / self.scale
+        )
+
+    def icdf(self, value):
+        term = value - 0.5
+        return self.loc - self.scale * (term).sign() * torch.log1p(-2 * term.abs())
+
+    def entropy(self):
+        return 1 + torch.log(2 * self.scale)
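+
+# A minimal sketch (illustrative only) of the closed forms above for
+# Laplace(loc=0, scale=1): variance = 2 * scale**2 and entropy = 1 + log(2 * scale).
+#
+#     >>> import torch
+#     >>> from torch.distributions import Laplace
+#     >>> d = Laplace(torch.tensor(0.0), torch.tensor(1.0))
+#     >>> d.variance.item(), round(d.entropy().item(), 4)
+#     (2.0, 1.6931)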
diff --git a/MLPY/Lib/site-packages/torch/distributions/lkj_cholesky.py b/MLPY/Lib/site-packages/torch/distributions/lkj_cholesky.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c35ec50cf8f50078decd87a9cb8879703ffce1a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/lkj_cholesky.py
@@ -0,0 +1,142 @@
+"""
+This closely follows the implementation in NumPyro (https://github.com/pyro-ppl/numpyro).
+
+Original copyright notice:
+
+# Copyright: Contributors to the Pyro project.
+# SPDX-License-Identifier: Apache-2.0
+"""
+
+import math
+
+import torch
+from torch.distributions import Beta, constraints
+from torch.distributions.distribution import Distribution
+from torch.distributions.utils import broadcast_all
+
+__all__ = ["LKJCholesky"]
+
+
+class LKJCholesky(Distribution):
+    r"""
+    LKJ distribution for lower Cholesky factor of correlation matrices.
+    The distribution is controlled by ``concentration`` parameter :math:`\eta`
+    to make the probability of the correlation matrix :math:`M` generated from
+    a Cholesky factor proportional to :math:`\det(M)^{\eta - 1}`. Because of that,
+    when ``concentration == 1``, we have a uniform distribution over Cholesky
+    factors of correlation matrices::
+
+        L ~ LKJCholesky(dim, concentration)
+        X = L @ L' ~ LKJCorr(dim, concentration)
+
+    Note that this distribution samples the
+    Cholesky factor of correlation matrices and not the correlation matrices
+    themselves and thereby differs slightly from the derivations in [1] for
+    the `LKJCorr` distribution. For sampling, this uses the Onion method from
+    [1] Section 3.
+
+    Example::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> l = LKJCholesky(3, 0.5)
+        >>> l.sample()  # l @ l.T is a sample of a correlation 3x3 matrix
+        tensor([[ 1.0000,  0.0000,  0.0000],
+                [ 0.3516,  0.9361,  0.0000],
+                [-0.1899,  0.4748,  0.8593]])
+
+    Args:
+        dim (int): dimension of the matrices
+        concentration (float or Tensor): concentration/shape parameter of the
+            distribution (often referred to as eta)
+
+    **References**
+
+    [1] `Generating random correlation matrices based on vines and extended onion method` (2009),
+    Daniel Lewandowski, Dorota Kurowicka, Harry Joe.
+    Journal of Multivariate Analysis. 100. 10.1016/j.jmva.2009.04.008
+    """
+    arg_constraints = {"concentration": constraints.positive}
+    support = constraints.corr_cholesky
+
+    def __init__(self, dim, concentration=1.0, validate_args=None):
+        if dim < 2:
+            raise ValueError(
+                f"Expected dim to be an integer greater than or equal to 2. Found dim={dim}."
+            )
+        self.dim = dim
+        (self.concentration,) = broadcast_all(concentration)
+        batch_shape = self.concentration.size()
+        event_shape = torch.Size((dim, dim))
+        # This is used to draw vectorized samples from the beta distribution in Sec. 3.2 of [1].
+        marginal_conc = self.concentration + 0.5 * (self.dim - 2)
+        offset = torch.arange(
+            self.dim - 1,
+            dtype=self.concentration.dtype,
+            device=self.concentration.device,
+        )
+        offset = torch.cat([offset.new_zeros((1,)), offset])
+        beta_conc1 = offset + 0.5
+        beta_conc0 = marginal_conc.unsqueeze(-1) - 0.5 * offset
+        self._beta = Beta(beta_conc1, beta_conc0)
+        super().__init__(batch_shape, event_shape, validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(LKJCholesky, _instance)
+        batch_shape = torch.Size(batch_shape)
+        new.dim = self.dim
+        new.concentration = self.concentration.expand(batch_shape)
+        new._beta = self._beta.expand(batch_shape + (self.dim,))
+        super(LKJCholesky, new).__init__(
+            batch_shape, self.event_shape, validate_args=False
+        )
+        new._validate_args = self._validate_args
+        return new
+
+    def sample(self, sample_shape=torch.Size()):
+        # This uses the Onion method, but there are a few differences from [1] Sec. 3.2:
+        # - This vectorizes the for loop and also works for heterogeneous eta.
+        # - Same algorithm generalizes to n=1.
+        # - The procedure is simplified since we are sampling the cholesky factor of
+        #   the correlation matrix instead of the correlation matrix itself. As such,
+        #   we only need to generate `w`.
+        y = self._beta.sample(sample_shape).unsqueeze(-1)
+        u_normal = torch.randn(
+            self._extended_shape(sample_shape), dtype=y.dtype, device=y.device
+        ).tril(-1)
+        u_hypersphere = u_normal / u_normal.norm(dim=-1, keepdim=True)
+        # Replace NaNs in first row
+        u_hypersphere[..., 0, :].fill_(0.0)
+        w = torch.sqrt(y) * u_hypersphere
+        # Fill diagonal elements; clamp for numerical stability
+        eps = torch.finfo(w.dtype).tiny
+        diag_elems = torch.clamp(1 - torch.sum(w**2, dim=-1), min=eps).sqrt()
+        w += torch.diag_embed(diag_elems)
+        return w
+
+    def log_prob(self, value):
+        # See: https://mc-stan.org/docs/2_25/functions-reference/cholesky-lkj-correlation-distribution.html
+        # The probability of a correlation matrix is proportional to
+        #   determinant ** (concentration - 1) = prod(L_ii ^ 2(concentration - 1))
+        # Additionally, the Jacobian of the transformation from Cholesky factor to
+        # correlation matrix is:
+        #   prod(L_ii ^ (D - i))
+        # So the probability of a Cholesky factor is proportional to
+        #   prod(L_ii ^ (2 * concentration - 2 + D - i)) = prod(L_ii ^ order_i)
+        # with order_i = 2 * concentration - 2 + D - i
+        if self._validate_args:
+            self._validate_sample(value)
+        diag_elems = value.diagonal(dim1=-1, dim2=-2)[..., 1:]
+        order = torch.arange(2, self.dim + 1, device=self.concentration.device)
+        order = 2 * (self.concentration - 1).unsqueeze(-1) + self.dim - order
+        unnormalized_log_pdf = torch.sum(order * diag_elems.log(), dim=-1)
+        # Compute normalization constant (page 1999 of [1])
+        dm1 = self.dim - 1
+        alpha = self.concentration + 0.5 * dm1
+        denominator = torch.lgamma(alpha) * dm1
+        numerator = torch.mvlgamma(alpha - 0.5, dm1)
+        # pi_constant in [1] is D * (D - 1) / 4 * log(pi)
+        # pi_constant in multigammaln is (D - 1) * (D - 2) / 4 * log(pi)
+        # hence, we need to add a pi_constant = (D - 1) * log(pi) / 2
+        pi_constant = 0.5 * dm1 * math.log(math.pi)
+        normalize_term = pi_constant + numerator - denominator
+        return unnormalized_log_pdf - normalize_term
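+
+# A minimal usage sketch (illustrative only): a draw from this distribution is
+# a lower-triangular Cholesky factor L whose rows have unit norm, so L @ L.mT
+# is a correlation matrix with a unit diagonal.
+#
+#     >>> import torch
+#     >>> from torch.distributions import LKJCholesky
+#     >>> L = LKJCholesky(3, concentration=2.0).sample()
+#     >>> corr = L @ L.mT
+#     >>> torch.allclose(corr.diagonal(dim1=-2, dim2=-1), torch.ones(3))
+#     True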
diff --git a/MLPY/Lib/site-packages/torch/distributions/log_normal.py b/MLPY/Lib/site-packages/torch/distributions/log_normal.py
new file mode 100644
index 0000000000000000000000000000000000000000..536c5c307fdcd75d58608b11203ff00a40e12923
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/log_normal.py
@@ -0,0 +1,62 @@
+from torch.distributions import constraints
+from torch.distributions.normal import Normal
+from torch.distributions.transformed_distribution import TransformedDistribution
+from torch.distributions.transforms import ExpTransform
+
+__all__ = ["LogNormal"]
+
+
+class LogNormal(TransformedDistribution):
+    r"""
+    Creates a log-normal distribution parameterized by
+    :attr:`loc` and :attr:`scale` where::
+
+        X ~ Normal(loc, scale)
+        Y = exp(X) ~ LogNormal(loc, scale)
+
+    Example::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = LogNormal(torch.tensor([0.0]), torch.tensor([1.0]))
+        >>> m.sample()  # log-normal distributed with mean=0 and stddev=1
+        tensor([ 0.1046])
+
+    Args:
+        loc (float or Tensor): mean of log of distribution
+        scale (float or Tensor): standard deviation of log of the distribution
+    """
+    arg_constraints = {"loc": constraints.real, "scale": constraints.positive}
+    support = constraints.positive
+    has_rsample = True
+
+    def __init__(self, loc, scale, validate_args=None):
+        base_dist = Normal(loc, scale, validate_args=validate_args)
+        super().__init__(base_dist, ExpTransform(), validate_args=validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(LogNormal, _instance)
+        return super().expand(batch_shape, _instance=new)
+
+    @property
+    def loc(self):
+        return self.base_dist.loc
+
+    @property
+    def scale(self):
+        return self.base_dist.scale
+
+    @property
+    def mean(self):
+        return (self.loc + self.scale.pow(2) / 2).exp()
+
+    @property
+    def mode(self):
+        return (self.loc - self.scale.square()).exp()
+
+    @property
+    def variance(self):
+        scale_sq = self.scale.pow(2)
+        return scale_sq.expm1() * (2 * self.loc + scale_sq).exp()
+
+    def entropy(self):
+        return self.base_dist.entropy() + self.loc
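# Usage sketch (illustrative, assuming the vendored LogNormal added above): the closed-form
# mean exp(loc + scale**2 / 2) should match a Monte Carlo estimate from samples.
import torch
from torch.distributions import LogNormal

torch.manual_seed(0)
m = LogNormal(torch.tensor(0.0), torch.tensor(0.5))
x = m.sample((100_000,))
print(x.mean(), m.mean)                 # empirical mean vs. exp(loc + scale**2 / 2)
print(m.log_prob(torch.tensor(1.0)))    # log-density at y = 1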
diff --git a/MLPY/Lib/site-packages/torch/distributions/logistic_normal.py b/MLPY/Lib/site-packages/torch/distributions/logistic_normal.py
new file mode 100644
index 0000000000000000000000000000000000000000..d23d0bb7a6f0411f03de33b3f3b7f9c46abd4d79
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/logistic_normal.py
@@ -0,0 +1,54 @@
+from torch.distributions import constraints
+from torch.distributions.normal import Normal
+from torch.distributions.transformed_distribution import TransformedDistribution
+from torch.distributions.transforms import StickBreakingTransform
+
+__all__ = ["LogisticNormal"]
+
+
+class LogisticNormal(TransformedDistribution):
+    r"""
+    Creates a logistic-normal distribution parameterized by :attr:`loc` and :attr:`scale`
+    that define the base `Normal` distribution transformed with the
+    `StickBreakingTransform` such that::
+
+        X ~ LogisticNormal(loc, scale)
+        Y = log(X / (1 - X.cumsum(-1)))[..., :-1] ~ Normal(loc, scale)
+
+    Args:
+        loc (float or Tensor): mean of the base distribution
+        scale (float or Tensor): standard deviation of the base distribution
+
+    Example::
+
+        >>> # logistic-normal distributed with mean=(0, 0, 0) and stddev=(1, 1, 1)
+        >>> # of the base Normal distribution
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = LogisticNormal(torch.tensor([0.0] * 3), torch.tensor([1.0] * 3))
+        >>> m.sample()
+        tensor([ 0.7653,  0.0341,  0.0579,  0.1427])
+
+    """
+    arg_constraints = {"loc": constraints.real, "scale": constraints.positive}
+    support = constraints.simplex
+    has_rsample = True
+
+    def __init__(self, loc, scale, validate_args=None):
+        base_dist = Normal(loc, scale, validate_args=validate_args)
+        if not base_dist.batch_shape:
+            base_dist = base_dist.expand([1])
+        super().__init__(
+            base_dist, StickBreakingTransform(), validate_args=validate_args
+        )
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(LogisticNormal, _instance)
+        return super().expand(batch_shape, _instance=new)
+
+    @property
+    def loc(self):
+        return self.base_dist.base_dist.loc
+
+    @property
+    def scale(self):
+        return self.base_dist.base_dist.scale
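# Usage sketch (illustrative, assuming the vendored LogisticNormal added above): the
# stick-breaking transform maps a base Normal of size k to the simplex with k + 1 coordinates.
import torch
from torch.distributions import LogisticNormal

m = LogisticNormal(torch.zeros(3), torch.ones(3))
y = m.sample((5,))
print(y.shape)        # torch.Size([5, 4]): one extra stick-breaking coordinate
print(y.sum(-1))      # each row sums to 1, i.e. lies on the simplex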
diff --git a/MLPY/Lib/site-packages/torch/distributions/lowrank_multivariate_normal.py b/MLPY/Lib/site-packages/torch/distributions/lowrank_multivariate_normal.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf3fda59c6144089026674d9f6de60be8be5421e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/lowrank_multivariate_normal.py
@@ -0,0 +1,237 @@
+import math
+
+import torch
+from torch.distributions import constraints
+from torch.distributions.distribution import Distribution
+from torch.distributions.multivariate_normal import _batch_mahalanobis, _batch_mv
+from torch.distributions.utils import _standard_normal, lazy_property
+
+__all__ = ["LowRankMultivariateNormal"]
+
+
+def _batch_capacitance_tril(W, D):
+    r"""
+    Computes Cholesky of :math:`I + W.T @ inv(D) @ W` for a batch of matrices :math:`W`
+    and a batch of vectors :math:`D`.
+    """
+    m = W.size(-1)
+    Wt_Dinv = W.mT / D.unsqueeze(-2)
+    K = torch.matmul(Wt_Dinv, W).contiguous()
+    K.view(-1, m * m)[:, :: m + 1] += 1  # add identity matrix to K
+    return torch.linalg.cholesky(K)
+
+
+def _batch_lowrank_logdet(W, D, capacitance_tril):
+    r"""
+    Uses "matrix determinant lemma"::
+        log|W @ W.T + D| = log|C| + log|D|,
+    where :math:`C` is the capacitance matrix :math:`I + W.T @ inv(D) @ W`, to compute
+    the log determinant.
+    """
+    return 2 * capacitance_tril.diagonal(dim1=-2, dim2=-1).log().sum(-1) + D.log().sum(
+        -1
+    )
+
+
+def _batch_lowrank_mahalanobis(W, D, x, capacitance_tril):
+    r"""
+    Uses "Woodbury matrix identity"::
+        inv(W @ W.T + D) = inv(D) - inv(D) @ W @ inv(C) @ W.T @ inv(D),
+    where :math:`C` is the capacitance matrix :math:`I + W.T @ inv(D) @ W`, to compute the squared
+    Mahalanobis distance :math:`x.T @ inv(W @ W.T + D) @ x`.
+    """
+    Wt_Dinv = W.mT / D.unsqueeze(-2)
+    Wt_Dinv_x = _batch_mv(Wt_Dinv, x)
+    mahalanobis_term1 = (x.pow(2) / D).sum(-1)
+    mahalanobis_term2 = _batch_mahalanobis(capacitance_tril, Wt_Dinv_x)
+    return mahalanobis_term1 - mahalanobis_term2
+
+
+class LowRankMultivariateNormal(Distribution):
+    r"""
+    Creates a multivariate normal distribution with covariance matrix having a low-rank form
+    parameterized by :attr:`cov_factor` and :attr:`cov_diag`::
+
+        covariance_matrix = cov_factor @ cov_factor.T + cov_diag
+
+    Example:
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_LAPACK)
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = LowRankMultivariateNormal(torch.zeros(2), torch.tensor([[1.], [0.]]), torch.ones(2))
+        >>> m.sample()  # normally distributed with mean=`[0,0]`, cov_factor=`[[1],[0]]`, cov_diag=`[1,1]`
+        tensor([-0.2102, -0.5429])
+
+    Args:
+        loc (Tensor): mean of the distribution with shape `batch_shape + event_shape`
+        cov_factor (Tensor): factor part of low-rank form of covariance matrix with shape
+            `batch_shape + event_shape + (rank,)`
+        cov_diag (Tensor): diagonal part of low-rank form of covariance matrix with shape
+            `batch_shape + event_shape`
+
+    Note:
+        The computation of the determinant and inverse of the covariance matrix is avoided when
+        `cov_factor.shape[1] << cov_factor.shape[0]` thanks to the `Woodbury matrix identity
+        <https://en.wikipedia.org/wiki/Woodbury_matrix_identity>`_ and the
+        `matrix determinant lemma <https://en.wikipedia.org/wiki/Matrix_determinant_lemma>`_.
+        Thanks to these formulas, we only need to compute the determinant and inverse of
+        the small "capacitance" matrix::
+
+            capacitance = I + cov_factor.T @ inv(cov_diag) @ cov_factor
+    """
+    arg_constraints = {
+        "loc": constraints.real_vector,
+        "cov_factor": constraints.independent(constraints.real, 2),
+        "cov_diag": constraints.independent(constraints.positive, 1),
+    }
+    support = constraints.real_vector
+    has_rsample = True
+
+    def __init__(self, loc, cov_factor, cov_diag, validate_args=None):
+        if loc.dim() < 1:
+            raise ValueError("loc must be at least one-dimensional.")
+        event_shape = loc.shape[-1:]
+        if cov_factor.dim() < 2:
+            raise ValueError(
+                "cov_factor must be at least two-dimensional, "
+                "with optional leading batch dimensions"
+            )
+        if cov_factor.shape[-2:-1] != event_shape:
+            raise ValueError(
+                f"cov_factor must be a batch of matrices with shape {event_shape[0]} x m"
+            )
+        if cov_diag.shape[-1:] != event_shape:
+            raise ValueError(
+                f"cov_diag must be a batch of vectors with shape {event_shape}"
+            )
+
+        loc_ = loc.unsqueeze(-1)
+        cov_diag_ = cov_diag.unsqueeze(-1)
+        try:
+            loc_, self.cov_factor, cov_diag_ = torch.broadcast_tensors(
+                loc_, cov_factor, cov_diag_
+            )
+        except RuntimeError as e:
+            raise ValueError(
+                f"Incompatible batch shapes: loc {loc.shape}, cov_factor {cov_factor.shape}, cov_diag {cov_diag.shape}"
+            ) from e
+        self.loc = loc_[..., 0]
+        self.cov_diag = cov_diag_[..., 0]
+        batch_shape = self.loc.shape[:-1]
+
+        self._unbroadcasted_cov_factor = cov_factor
+        self._unbroadcasted_cov_diag = cov_diag
+        self._capacitance_tril = _batch_capacitance_tril(cov_factor, cov_diag)
+        super().__init__(batch_shape, event_shape, validate_args=validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(LowRankMultivariateNormal, _instance)
+        batch_shape = torch.Size(batch_shape)
+        loc_shape = batch_shape + self.event_shape
+        new.loc = self.loc.expand(loc_shape)
+        new.cov_diag = self.cov_diag.expand(loc_shape)
+        new.cov_factor = self.cov_factor.expand(loc_shape + self.cov_factor.shape[-1:])
+        new._unbroadcasted_cov_factor = self._unbroadcasted_cov_factor
+        new._unbroadcasted_cov_diag = self._unbroadcasted_cov_diag
+        new._capacitance_tril = self._capacitance_tril
+        super(LowRankMultivariateNormal, new).__init__(
+            batch_shape, self.event_shape, validate_args=False
+        )
+        new._validate_args = self._validate_args
+        return new
+
+    @property
+    def mean(self):
+        return self.loc
+
+    @property
+    def mode(self):
+        return self.loc
+
+    @lazy_property
+    def variance(self):
+        return (
+            self._unbroadcasted_cov_factor.pow(2).sum(-1) + self._unbroadcasted_cov_diag
+        ).expand(self._batch_shape + self._event_shape)
+
+    @lazy_property
+    def scale_tril(self):
+        # The following identity is used to increase the numerical stability of the
+        # Cholesky decomposition (see http://www.gaussianprocess.org/gpml/, Section 3.4.3):
+        #     W @ W.T + D = D^(1/2) @ (I + D^(-1/2) @ W @ W.T @ D^(-1/2)) @ D^(1/2)
+        # The matrix "I + D^(-1/2) @ W @ W.T @ D^(-1/2)" has eigenvalues bounded from below by 1,
+        # hence it is well-conditioned and safe to take Cholesky decomposition.
+        n = self._event_shape[0]
+        cov_diag_sqrt_unsqueeze = self._unbroadcasted_cov_diag.sqrt().unsqueeze(-1)
+        Dinvsqrt_W = self._unbroadcasted_cov_factor / cov_diag_sqrt_unsqueeze
+        K = torch.matmul(Dinvsqrt_W, Dinvsqrt_W.mT).contiguous()
+        K.view(-1, n * n)[:, :: n + 1] += 1  # add identity matrix to K
+        scale_tril = cov_diag_sqrt_unsqueeze * torch.linalg.cholesky(K)
+        return scale_tril.expand(
+            self._batch_shape + self._event_shape + self._event_shape
+        )
+
+    @lazy_property
+    def covariance_matrix(self):
+        covariance_matrix = torch.matmul(
+            self._unbroadcasted_cov_factor, self._unbroadcasted_cov_factor.mT
+        ) + torch.diag_embed(self._unbroadcasted_cov_diag)
+        return covariance_matrix.expand(
+            self._batch_shape + self._event_shape + self._event_shape
+        )
+
+    @lazy_property
+    def precision_matrix(self):
+        # We use "Woodbury matrix identity" to take advantage of low rank form::
+        #     inv(W @ W.T + D) = inv(D) - inv(D) @ W @ inv(C) @ W.T @ inv(D)
+        # where :math:`C` is the capacitance matrix.
+        Wt_Dinv = (
+            self._unbroadcasted_cov_factor.mT
+            / self._unbroadcasted_cov_diag.unsqueeze(-2)
+        )
+        A = torch.linalg.solve_triangular(self._capacitance_tril, Wt_Dinv, upper=False)
+        precision_matrix = (
+            torch.diag_embed(self._unbroadcasted_cov_diag.reciprocal()) - A.mT @ A
+        )
+        return precision_matrix.expand(
+            self._batch_shape + self._event_shape + self._event_shape
+        )
+
+    def rsample(self, sample_shape=torch.Size()):
+        shape = self._extended_shape(sample_shape)
+        W_shape = shape[:-1] + self.cov_factor.shape[-1:]
+        eps_W = _standard_normal(W_shape, dtype=self.loc.dtype, device=self.loc.device)
+        eps_D = _standard_normal(shape, dtype=self.loc.dtype, device=self.loc.device)
+        return (
+            self.loc
+            + _batch_mv(self._unbroadcasted_cov_factor, eps_W)
+            + self._unbroadcasted_cov_diag.sqrt() * eps_D
+        )
+
+    def log_prob(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        diff = value - self.loc
+        M = _batch_lowrank_mahalanobis(
+            self._unbroadcasted_cov_factor,
+            self._unbroadcasted_cov_diag,
+            diff,
+            self._capacitance_tril,
+        )
+        log_det = _batch_lowrank_logdet(
+            self._unbroadcasted_cov_factor,
+            self._unbroadcasted_cov_diag,
+            self._capacitance_tril,
+        )
+        return -0.5 * (self._event_shape[0] * math.log(2 * math.pi) + log_det + M)
+
+    def entropy(self):
+        log_det = _batch_lowrank_logdet(
+            self._unbroadcasted_cov_factor,
+            self._unbroadcasted_cov_diag,
+            self._capacitance_tril,
+        )
+        H = 0.5 * (self._event_shape[0] * (1.0 + math.log(2 * math.pi)) + log_det)
+        if len(self._batch_shape) == 0:
+            return H
+        else:
+            return H.expand(self._batch_shape)
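# Usage sketch (illustrative, assuming the vendored LowRankMultivariateNormal and
# MultivariateNormal added in this patch): the low-rank parameterization W @ W.T + diag(D)
# should assign the same density as an explicit full-covariance MultivariateNormal.
import torch
from torch.distributions import LowRankMultivariateNormal, MultivariateNormal

torch.manual_seed(0)
loc = torch.zeros(4)
cov_factor = torch.randn(4, 2)          # rank-2 factor W
cov_diag = torch.rand(4) + 0.5          # strictly positive diagonal D
lowrank = LowRankMultivariateNormal(loc, cov_factor, cov_diag)
full_cov = cov_factor @ cov_factor.T + torch.diag(cov_diag)
full = MultivariateNormal(loc, covariance_matrix=full_cov)
x = lowrank.rsample()
print(torch.allclose(lowrank.log_prob(x), full.log_prob(x), atol=1e-5))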
diff --git a/MLPY/Lib/site-packages/torch/distributions/mixture_same_family.py b/MLPY/Lib/site-packages/torch/distributions/mixture_same_family.py
new file mode 100644
index 0000000000000000000000000000000000000000..59ba3be63cce2446ed9c839294284485fb4713f3
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/mixture_same_family.py
@@ -0,0 +1,214 @@
+from typing import Dict
+
+import torch
+from torch.distributions import Categorical, constraints
+from torch.distributions.distribution import Distribution
+
+__all__ = ["MixtureSameFamily"]
+
+
+class MixtureSameFamily(Distribution):
+    r"""
+    The `MixtureSameFamily` distribution implements a (batch of) mixture
+    distribution where all components are from different parameterizations of
+    the same distribution type. It is parameterized by a `Categorical`
+    "selecting distribution" (over `k` components) and a component
+    distribution, i.e., a `Distribution` with a rightmost batch shape
+    (equal to `[k]`) which indexes each (batch of) component.
+
+    Examples::
+
+        >>> # xdoctest: +SKIP("undefined vars")
+        >>> # Construct Gaussian Mixture Model in 1D consisting of 5 equally
+        >>> # weighted normal distributions
+        >>> mix = D.Categorical(torch.ones(5,))
+        >>> comp = D.Normal(torch.randn(5,), torch.rand(5,))
+        >>> gmm = MixtureSameFamily(mix, comp)
+
+        >>> # Construct Gaussian Mixture Model in 2D consisting of 5 equally
+        >>> # weighted bivariate normal distributions
+        >>> mix = D.Categorical(torch.ones(5,))
+        >>> comp = D.Independent(D.Normal(
+        ...          torch.randn(5,2), torch.rand(5,2)), 1)
+        >>> gmm = MixtureSameFamily(mix, comp)
+
+        >>> # Construct a batch of 3 Gaussian Mixture Models in 2D each
+        >>> # consisting of 5 random weighted bivariate normal distributions
+        >>> mix = D.Categorical(torch.rand(3,5))
+        >>> comp = D.Independent(D.Normal(
+        ...         torch.randn(3,5,2), torch.rand(3,5,2)), 1)
+        >>> gmm = MixtureSameFamily(mix, comp)
+
+    Args:
+        mixture_distribution: `torch.distributions.Categorical`-like
+            instance. Manages the probability of selecting components.
+            The number of categories must match the rightmost batch
+            dimension of the `component_distribution`. Must have either
+            scalar `batch_shape` or `batch_shape` matching
+            `component_distribution.batch_shape[:-1]`
+        component_distribution: `torch.distributions.Distribution`-like
+            instance. Right-most batch dimension indexes component.
+    """
+    arg_constraints: Dict[str, constraints.Constraint] = {}
+    has_rsample = False
+
+    def __init__(
+        self, mixture_distribution, component_distribution, validate_args=None
+    ):
+        self._mixture_distribution = mixture_distribution
+        self._component_distribution = component_distribution
+
+        if not isinstance(self._mixture_distribution, Categorical):
+            raise ValueError(
+                "The Mixture distribution needs to be an "
+                "instance of torch.distributions.Categorical"
+            )
+
+        if not isinstance(self._component_distribution, Distribution):
+            raise ValueError(
+                "The Component distribution needs to be an "
+                "instance of torch.distributions.Distribution"
+            )
+
+        # Check that batch size matches
+        mdbs = self._mixture_distribution.batch_shape
+        cdbs = self._component_distribution.batch_shape[:-1]
+        for size1, size2 in zip(reversed(mdbs), reversed(cdbs)):
+            if size1 != 1 and size2 != 1 and size1 != size2:
+                raise ValueError(
+                    f"`mixture_distribution.batch_shape` ({mdbs}) is not "
+                    "compatible with `component_distribution."
+                    f"batch_shape`({cdbs})"
+                )
+
+        # Check that the number of mixture components matches
+        km = self._mixture_distribution.logits.shape[-1]
+        kc = self._component_distribution.batch_shape[-1]
+        if km is not None and kc is not None and km != kc:
+            raise ValueError(
+                f"`mixture_distribution` component count ({km}) does not"
+                " equal `component_distribution.batch_shape[-1]`"
+                f" ({kc})"
+            )
+        self._num_component = km
+
+        event_shape = self._component_distribution.event_shape
+        self._event_ndims = len(event_shape)
+        super().__init__(
+            batch_shape=cdbs, event_shape=event_shape, validate_args=validate_args
+        )
+
+    def expand(self, batch_shape, _instance=None):
+        batch_shape = torch.Size(batch_shape)
+        batch_shape_comp = batch_shape + (self._num_component,)
+        new = self._get_checked_instance(MixtureSameFamily, _instance)
+        new._component_distribution = self._component_distribution.expand(
+            batch_shape_comp
+        )
+        new._mixture_distribution = self._mixture_distribution.expand(batch_shape)
+        new._num_component = self._num_component
+        new._event_ndims = self._event_ndims
+        event_shape = new._component_distribution.event_shape
+        super(MixtureSameFamily, new).__init__(
+            batch_shape=batch_shape, event_shape=event_shape, validate_args=False
+        )
+        new._validate_args = self._validate_args
+        return new
+
+    @constraints.dependent_property
+    def support(self):
+        # FIXME this may have the wrong shape when support contains batched
+        # parameters
+        return self._component_distribution.support
+
+    @property
+    def mixture_distribution(self):
+        return self._mixture_distribution
+
+    @property
+    def component_distribution(self):
+        return self._component_distribution
+
+    @property
+    def mean(self):
+        probs = self._pad_mixture_dimensions(self.mixture_distribution.probs)
+        return torch.sum(
+            probs * self.component_distribution.mean, dim=-1 - self._event_ndims
+        )  # [B, E]
+
+    @property
+    def variance(self):
+        # Law of total variance: Var(Y) = E[Var(Y|X)] + Var(E[Y|X])
+        probs = self._pad_mixture_dimensions(self.mixture_distribution.probs)
+        mean_cond_var = torch.sum(
+            probs * self.component_distribution.variance, dim=-1 - self._event_ndims
+        )
+        var_cond_mean = torch.sum(
+            probs * (self.component_distribution.mean - self._pad(self.mean)).pow(2.0),
+            dim=-1 - self._event_ndims,
+        )
+        return mean_cond_var + var_cond_mean
+
+    def cdf(self, x):
+        x = self._pad(x)
+        cdf_x = self.component_distribution.cdf(x)
+        mix_prob = self.mixture_distribution.probs
+
+        return torch.sum(cdf_x * mix_prob, dim=-1)
+
+    def log_prob(self, x):
+        if self._validate_args:
+            self._validate_sample(x)
+        x = self._pad(x)
+        log_prob_x = self.component_distribution.log_prob(x)  # [S, B, k]
+        log_mix_prob = torch.log_softmax(
+            self.mixture_distribution.logits, dim=-1
+        )  # [B, k]
+        return torch.logsumexp(log_prob_x + log_mix_prob, dim=-1)  # [S, B]
+
+    def sample(self, sample_shape=torch.Size()):
+        with torch.no_grad():
+            sample_len = len(sample_shape)
+            batch_len = len(self.batch_shape)
+            gather_dim = sample_len + batch_len
+            es = self.event_shape
+
+            # mixture samples [n, B]
+            mix_sample = self.mixture_distribution.sample(sample_shape)
+            mix_shape = mix_sample.shape
+
+            # component samples [n, B, k, E]
+            comp_samples = self.component_distribution.sample(sample_shape)
+
+            # Gather along the k dimension
+            mix_sample_r = mix_sample.reshape(
+                mix_shape + torch.Size([1] * (len(es) + 1))
+            )
+            mix_sample_r = mix_sample_r.repeat(
+                torch.Size([1] * len(mix_shape)) + torch.Size([1]) + es
+            )
+
+            samples = torch.gather(comp_samples, gather_dim, mix_sample_r)
+            return samples.squeeze(gather_dim)
+
+    def _pad(self, x):
+        return x.unsqueeze(-1 - self._event_ndims)
+
+    def _pad_mixture_dimensions(self, x):
+        dist_batch_ndims = len(self.batch_shape)
+        cat_batch_ndims = len(self.mixture_distribution.batch_shape)
+        pad_ndims = 0 if cat_batch_ndims == 1 else dist_batch_ndims - cat_batch_ndims
+        xs = x.shape
+        x = x.reshape(
+            xs[:-1]
+            + torch.Size(pad_ndims * [1])
+            + xs[-1:]
+            + torch.Size(self._event_ndims * [1])
+        )
+        return x
+
+    def __repr__(self):
+        args_string = (
+            f"\n  {self.mixture_distribution},\n  {self.component_distribution}"
+        )
+        return "MixtureSameFamily" + "(" + args_string + ")"
diff --git a/MLPY/Lib/site-packages/torch/distributions/multinomial.py b/MLPY/Lib/site-packages/torch/distributions/multinomial.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b76a0b3424f8b6f4fb1b952f07d18331fa4d0cb
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/multinomial.py
@@ -0,0 +1,135 @@
+import torch
+from torch import inf
+from torch.distributions import Categorical, constraints
+from torch.distributions.binomial import Binomial
+from torch.distributions.distribution import Distribution
+from torch.distributions.utils import broadcast_all
+
+__all__ = ["Multinomial"]
+
+
+class Multinomial(Distribution):
+    r"""
+    Creates a Multinomial distribution parameterized by :attr:`total_count` and
+    either :attr:`probs` or :attr:`logits` (but not both). The innermost dimension of
+    :attr:`probs` indexes over categories. All other dimensions index over batches.
+
+    Note that :attr:`total_count` need not be specified if only :meth:`log_prob` is
+    called (see example below).
+
+    .. note:: The `probs` argument must be non-negative, finite and have a non-zero sum,
+              and it will be normalized to sum to 1 along the last dimension. :attr:`probs`
+              will return this normalized value.
+              The `logits` argument will be interpreted as unnormalized log probabilities
+              and can therefore be any real number. It will likewise be normalized so that
+              the resulting probabilities sum to 1 along the last dimension. :attr:`logits`
+              will return this normalized value.
+
+    -   :meth:`sample` requires a single shared `total_count` for all
+        parameters and samples.
+    -   :meth:`log_prob` allows different `total_count` for each parameter and
+        sample.
+
+    Example::
+
+        >>> # xdoctest: +SKIP("FIXME: found invalid values")
+        >>> m = Multinomial(100, torch.tensor([ 1., 1., 1., 1.]))
+        >>> x = m.sample()  # equal probability of 0, 1, 2, 3
+        tensor([ 21.,  24.,  30.,  25.])
+
+        >>> Multinomial(probs=torch.tensor([1., 1., 1., 1.])).log_prob(x)
+        tensor([-4.1338])
+
+    Args:
+        total_count (int): number of trials
+        probs (Tensor): event probabilities
+        logits (Tensor): event log probabilities (unnormalized)
+    """
+    arg_constraints = {"probs": constraints.simplex, "logits": constraints.real_vector}
+    total_count: int
+
+    @property
+    def mean(self):
+        return self.probs * self.total_count
+
+    @property
+    def variance(self):
+        return self.total_count * self.probs * (1 - self.probs)
+
+    def __init__(self, total_count=1, probs=None, logits=None, validate_args=None):
+        if not isinstance(total_count, int):
+            raise NotImplementedError("inhomogeneous total_count is not supported")
+        self.total_count = total_count
+        self._categorical = Categorical(probs=probs, logits=logits)
+        self._binomial = Binomial(total_count=total_count, probs=self.probs)
+        batch_shape = self._categorical.batch_shape
+        event_shape = self._categorical.param_shape[-1:]
+        super().__init__(batch_shape, event_shape, validate_args=validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(Multinomial, _instance)
+        batch_shape = torch.Size(batch_shape)
+        new.total_count = self.total_count
+        new._categorical = self._categorical.expand(batch_shape)
+        super(Multinomial, new).__init__(
+            batch_shape, self.event_shape, validate_args=False
+        )
+        new._validate_args = self._validate_args
+        return new
+
+    def _new(self, *args, **kwargs):
+        return self._categorical._new(*args, **kwargs)
+
+    @constraints.dependent_property(is_discrete=True, event_dim=1)
+    def support(self):
+        return constraints.multinomial(self.total_count)
+
+    @property
+    def logits(self):
+        return self._categorical.logits
+
+    @property
+    def probs(self):
+        return self._categorical.probs
+
+    @property
+    def param_shape(self):
+        return self._categorical.param_shape
+
+    def sample(self, sample_shape=torch.Size()):
+        sample_shape = torch.Size(sample_shape)
+        samples = self._categorical.sample(
+            torch.Size((self.total_count,)) + sample_shape
+        )
+        # samples.shape is (total_count, sample_shape, batch_shape), need to change it to
+        # (sample_shape, batch_shape, total_count)
+        shifted_idx = list(range(samples.dim()))
+        shifted_idx.append(shifted_idx.pop(0))
+        samples = samples.permute(*shifted_idx)
+        counts = samples.new(self._extended_shape(sample_shape)).zero_()
+        counts.scatter_add_(-1, samples, torch.ones_like(samples))
+        return counts.type_as(self.probs)
+
+    def entropy(self):
+        n = torch.tensor(self.total_count)
+
+        cat_entropy = self._categorical.entropy()
+        term1 = n * cat_entropy - torch.lgamma(n + 1)
+
+        support = self._binomial.enumerate_support(expand=False)[1:]
+        binomial_probs = torch.exp(self._binomial.log_prob(support))
+        weights = torch.lgamma(support + 1)
+        term2 = (binomial_probs * weights).sum([0, -1])
+
+        return term1 + term2
+
+    def log_prob(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        logits, value = broadcast_all(self.logits, value)
+        logits = logits.clone(memory_format=torch.contiguous_format)
+        log_factorial_n = torch.lgamma(value.sum(-1) + 1)
+        log_factorial_xs = torch.lgamma(value + 1).sum(-1)
+        logits[(value == 0) & (logits == -inf)] = 0
+        log_powers = (logits * value).sum(-1)
+        return log_factorial_n - log_factorial_xs + log_powers
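# Usage sketch (illustrative, assuming the vendored Multinomial added above): sampled count
# vectors always sum to total_count, and log_prob scores such count vectors.
import torch
from torch.distributions import Multinomial

m = Multinomial(total_count=100, probs=torch.tensor([0.1, 0.2, 0.3, 0.4]))
counts = m.sample()
print(counts, counts.sum())          # the four counts sum to 100
print(m.log_prob(counts))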
diff --git a/MLPY/Lib/site-packages/torch/distributions/multivariate_normal.py b/MLPY/Lib/site-packages/torch/distributions/multivariate_normal.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ed25dbf3e0edffa6b65d421502210f87b1cdef4
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/multivariate_normal.py
@@ -0,0 +1,262 @@
+import math
+
+import torch
+from torch.distributions import constraints
+from torch.distributions.distribution import Distribution
+from torch.distributions.utils import _standard_normal, lazy_property
+
+__all__ = ["MultivariateNormal"]
+
+
+def _batch_mv(bmat, bvec):
+    r"""
+    Performs a batched matrix-vector product, with compatible but different batch shapes.
+
+    This function takes as input `bmat`, containing :math:`n \times n` matrices, and
+    `bvec`, containing length :math:`n` vectors.
+
+    Both `bmat` and `bvec` may have any number of leading dimensions, which correspond
+    to a batch shape. They are not necessarily assumed to have the same batch shape,
+    just batch shapes that can be broadcast together.
+    """
+    return torch.matmul(bmat, bvec.unsqueeze(-1)).squeeze(-1)
+
+
+def _batch_mahalanobis(bL, bx):
+    r"""
+    Computes the squared Mahalanobis distance :math:`\mathbf{x}^\top\mathbf{M}^{-1}\mathbf{x}`
+    for a factored :math:`\mathbf{M} = \mathbf{L}\mathbf{L}^\top`.
+
+    Accepts batches for both `bL` and `bx`. They are not necessarily assumed to have the same batch
+    shape; the batch shape of `bL` just needs to be broadcastable to the batch shape of `bx`.
+    """
+    n = bx.size(-1)
+    bx_batch_shape = bx.shape[:-1]
+
+    # Assume that bL.shape = (i, 1, n, n), bx.shape = (..., i, j, n),
+    # we are going to make bx have shape (..., 1, j,  i, 1, n) to apply batched tri.solve
+    bx_batch_dims = len(bx_batch_shape)
+    bL_batch_dims = bL.dim() - 2
+    outer_batch_dims = bx_batch_dims - bL_batch_dims
+    old_batch_dims = outer_batch_dims + bL_batch_dims
+    new_batch_dims = outer_batch_dims + 2 * bL_batch_dims
+    # Reshape bx with the shape (..., 1, i, j, 1, n)
+    bx_new_shape = bx.shape[:outer_batch_dims]
+    for sL, sx in zip(bL.shape[:-2], bx.shape[outer_batch_dims:-1]):
+        bx_new_shape += (sx // sL, sL)
+    bx_new_shape += (n,)
+    bx = bx.reshape(bx_new_shape)
+    # Permute bx to make it have shape (..., 1, j, i, 1, n)
+    permute_dims = (
+        list(range(outer_batch_dims))
+        + list(range(outer_batch_dims, new_batch_dims, 2))
+        + list(range(outer_batch_dims + 1, new_batch_dims, 2))
+        + [new_batch_dims]
+    )
+    bx = bx.permute(permute_dims)
+
+    flat_L = bL.reshape(-1, n, n)  # shape = b x n x n
+    flat_x = bx.reshape(-1, flat_L.size(0), n)  # shape = c x b x n
+    flat_x_swap = flat_x.permute(1, 2, 0)  # shape = b x n x c
+    M_swap = (
+        torch.linalg.solve_triangular(flat_L, flat_x_swap, upper=False).pow(2).sum(-2)
+    )  # shape = b x c
+    M = M_swap.t()  # shape = c x b
+
+    # Now we revert the above reshape and permute operators.
+    permuted_M = M.reshape(bx.shape[:-1])  # shape = (..., 1, j, i, 1)
+    permute_inv_dims = list(range(outer_batch_dims))
+    for i in range(bL_batch_dims):
+        permute_inv_dims += [outer_batch_dims + i, old_batch_dims + i]
+    reshaped_M = permuted_M.permute(permute_inv_dims)  # shape = (..., 1, i, j, 1)
+    return reshaped_M.reshape(bx_batch_shape)
+
+
+def _precision_to_scale_tril(P):
+    # Ref: https://nbviewer.jupyter.org/gist/fehiepsi/5ef8e09e61604f10607380467eb82006#Precision-to-scale_tril
+    Lf = torch.linalg.cholesky(torch.flip(P, (-2, -1)))
+    L_inv = torch.transpose(torch.flip(Lf, (-2, -1)), -2, -1)
+    Id = torch.eye(P.shape[-1], dtype=P.dtype, device=P.device)
+    L = torch.linalg.solve_triangular(L_inv, Id, upper=False)
+    return L
+
+
+class MultivariateNormal(Distribution):
+    r"""
+    Creates a multivariate normal (also called Gaussian) distribution
+    parameterized by a mean vector and a covariance matrix.
+
+    The multivariate normal distribution can be parameterized either
+    in terms of a positive definite covariance matrix :math:`\mathbf{\Sigma}`
+    or a positive definite precision matrix :math:`\mathbf{\Sigma}^{-1}`
+    or a lower-triangular matrix :math:`\mathbf{L}` with positive-valued
+    diagonal entries, such that
+    :math:`\mathbf{\Sigma} = \mathbf{L}\mathbf{L}^\top`. This triangular matrix
+    can be obtained via e.g. Cholesky decomposition of the covariance.
+
+    Example:
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_LAPACK)
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = MultivariateNormal(torch.zeros(2), torch.eye(2))
+        >>> m.sample()  # normally distributed with mean=`[0,0]` and covariance_matrix=`I`
+        tensor([-0.2102, -0.5429])
+
+    Args:
+        loc (Tensor): mean of the distribution
+        covariance_matrix (Tensor): positive-definite covariance matrix
+        precision_matrix (Tensor): positive-definite precision matrix
+        scale_tril (Tensor): lower-triangular factor of covariance, with positive-valued diagonal
+
+    Note:
+        Only one of :attr:`covariance_matrix` or :attr:`precision_matrix` or
+        :attr:`scale_tril` can be specified.
+
+        Using :attr:`scale_tril` will be more efficient: all computations internally
+        are based on :attr:`scale_tril`. If :attr:`covariance_matrix` or
+        :attr:`precision_matrix` is passed instead, it is only used to compute
+        the corresponding lower triangular matrices using a Cholesky decomposition.
+    """
+    arg_constraints = {
+        "loc": constraints.real_vector,
+        "covariance_matrix": constraints.positive_definite,
+        "precision_matrix": constraints.positive_definite,
+        "scale_tril": constraints.lower_cholesky,
+    }
+    support = constraints.real_vector
+    has_rsample = True
+
+    def __init__(
+        self,
+        loc,
+        covariance_matrix=None,
+        precision_matrix=None,
+        scale_tril=None,
+        validate_args=None,
+    ):
+        if loc.dim() < 1:
+            raise ValueError("loc must be at least one-dimensional.")
+        if (covariance_matrix is not None) + (scale_tril is not None) + (
+            precision_matrix is not None
+        ) != 1:
+            raise ValueError(
+                "Exactly one of covariance_matrix or precision_matrix or scale_tril may be specified."
+            )
+
+        if scale_tril is not None:
+            if scale_tril.dim() < 2:
+                raise ValueError(
+                    "scale_tril matrix must be at least two-dimensional, "
+                    "with optional leading batch dimensions"
+                )
+            batch_shape = torch.broadcast_shapes(scale_tril.shape[:-2], loc.shape[:-1])
+            self.scale_tril = scale_tril.expand(batch_shape + (-1, -1))
+        elif covariance_matrix is not None:
+            if covariance_matrix.dim() < 2:
+                raise ValueError(
+                    "covariance_matrix must be at least two-dimensional, "
+                    "with optional leading batch dimensions"
+                )
+            batch_shape = torch.broadcast_shapes(
+                covariance_matrix.shape[:-2], loc.shape[:-1]
+            )
+            self.covariance_matrix = covariance_matrix.expand(batch_shape + (-1, -1))
+        else:
+            if precision_matrix.dim() < 2:
+                raise ValueError(
+                    "precision_matrix must be at least two-dimensional, "
+                    "with optional leading batch dimensions"
+                )
+            batch_shape = torch.broadcast_shapes(
+                precision_matrix.shape[:-2], loc.shape[:-1]
+            )
+            self.precision_matrix = precision_matrix.expand(batch_shape + (-1, -1))
+        self.loc = loc.expand(batch_shape + (-1,))
+
+        event_shape = self.loc.shape[-1:]
+        super().__init__(batch_shape, event_shape, validate_args=validate_args)
+
+        if scale_tril is not None:
+            self._unbroadcasted_scale_tril = scale_tril
+        elif covariance_matrix is not None:
+            self._unbroadcasted_scale_tril = torch.linalg.cholesky(covariance_matrix)
+        else:  # precision_matrix is not None
+            self._unbroadcasted_scale_tril = _precision_to_scale_tril(precision_matrix)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(MultivariateNormal, _instance)
+        batch_shape = torch.Size(batch_shape)
+        loc_shape = batch_shape + self.event_shape
+        cov_shape = batch_shape + self.event_shape + self.event_shape
+        new.loc = self.loc.expand(loc_shape)
+        new._unbroadcasted_scale_tril = self._unbroadcasted_scale_tril
+        if "covariance_matrix" in self.__dict__:
+            new.covariance_matrix = self.covariance_matrix.expand(cov_shape)
+        if "scale_tril" in self.__dict__:
+            new.scale_tril = self.scale_tril.expand(cov_shape)
+        if "precision_matrix" in self.__dict__:
+            new.precision_matrix = self.precision_matrix.expand(cov_shape)
+        super(MultivariateNormal, new).__init__(
+            batch_shape, self.event_shape, validate_args=False
+        )
+        new._validate_args = self._validate_args
+        return new
+
+    @lazy_property
+    def scale_tril(self):
+        return self._unbroadcasted_scale_tril.expand(
+            self._batch_shape + self._event_shape + self._event_shape
+        )
+
+    @lazy_property
+    def covariance_matrix(self):
+        return torch.matmul(
+            self._unbroadcasted_scale_tril, self._unbroadcasted_scale_tril.mT
+        ).expand(self._batch_shape + self._event_shape + self._event_shape)
+
+    @lazy_property
+    def precision_matrix(self):
+        return torch.cholesky_inverse(self._unbroadcasted_scale_tril).expand(
+            self._batch_shape + self._event_shape + self._event_shape
+        )
+
+    @property
+    def mean(self):
+        return self.loc
+
+    @property
+    def mode(self):
+        return self.loc
+
+    @property
+    def variance(self):
+        return (
+            self._unbroadcasted_scale_tril.pow(2)
+            .sum(-1)
+            .expand(self._batch_shape + self._event_shape)
+        )
+
+    def rsample(self, sample_shape=torch.Size()):
+        shape = self._extended_shape(sample_shape)
+        eps = _standard_normal(shape, dtype=self.loc.dtype, device=self.loc.device)
+        return self.loc + _batch_mv(self._unbroadcasted_scale_tril, eps)
+
+    def log_prob(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        diff = value - self.loc
+        M = _batch_mahalanobis(self._unbroadcasted_scale_tril, diff)
+        half_log_det = (
+            self._unbroadcasted_scale_tril.diagonal(dim1=-2, dim2=-1).log().sum(-1)
+        )
+        return -0.5 * (self._event_shape[0] * math.log(2 * math.pi) + M) - half_log_det
+
+    def entropy(self):
+        half_log_det = (
+            self._unbroadcasted_scale_tril.diagonal(dim1=-2, dim2=-1).log().sum(-1)
+        )
+        H = 0.5 * self._event_shape[0] * (1.0 + math.log(2 * math.pi)) + half_log_det
+        if len(self._batch_shape) == 0:
+            return H
+        else:
+            return H.expand(self._batch_shape)
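# Usage sketch (illustrative, assuming the vendored MultivariateNormal added above): per the
# note in the docstring, passing scale_tril directly skips the internal Cholesky factorization.
import torch
from torch.distributions import MultivariateNormal

cov = torch.tensor([[2.0, 0.5, 0.0],
                    [0.5, 1.0, 0.3],
                    [0.0, 0.3, 1.5]])
mvn = MultivariateNormal(torch.zeros(3), scale_tril=torch.linalg.cholesky(cov))
x = mvn.rsample((2,))
print(mvn.log_prob(x))
print(torch.allclose(mvn.covariance_matrix, cov, atol=1e-6))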
diff --git a/MLPY/Lib/site-packages/torch/distributions/negative_binomial.py b/MLPY/Lib/site-packages/torch/distributions/negative_binomial.py
new file mode 100644
index 0000000000000000000000000000000000000000..b55754a0a0fe17c1d50444a76f9314e8a052f0d7
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/negative_binomial.py
@@ -0,0 +1,133 @@
+import torch
+import torch.nn.functional as F
+from torch.distributions import constraints
+from torch.distributions.distribution import Distribution
+from torch.distributions.utils import (
+    broadcast_all,
+    lazy_property,
+    logits_to_probs,
+    probs_to_logits,
+)
+
+__all__ = ["NegativeBinomial"]
+
+
+class NegativeBinomial(Distribution):
+    r"""
+    Creates a Negative Binomial distribution, i.e. distribution
+    of the number of successful independent and identical Bernoulli trials
+    before :attr:`total_count` failures are achieved. The probability
+    of success of each Bernoulli trial is :attr:`probs`.
+
+    Args:
+        total_count (float or Tensor): non-negative number of failed Bernoulli
+            trials at which sampling stops; the distribution is still valid for
+            real-valued counts
+        probs (Tensor): Event probabilities of success in the half-open interval [0, 1)
+        logits (Tensor): Event log-odds for probabilities of success
+    """
+    arg_constraints = {
+        "total_count": constraints.greater_than_eq(0),
+        "probs": constraints.half_open_interval(0.0, 1.0),
+        "logits": constraints.real,
+    }
+    support = constraints.nonnegative_integer
+
+    def __init__(self, total_count, probs=None, logits=None, validate_args=None):
+        if (probs is None) == (logits is None):
+            raise ValueError(
+                "Either `probs` or `logits` must be specified, but not both."
+            )
+        if probs is not None:
+            (
+                self.total_count,
+                self.probs,
+            ) = broadcast_all(total_count, probs)
+            self.total_count = self.total_count.type_as(self.probs)
+        else:
+            (
+                self.total_count,
+                self.logits,
+            ) = broadcast_all(total_count, logits)
+            self.total_count = self.total_count.type_as(self.logits)
+
+        self._param = self.probs if probs is not None else self.logits
+        batch_shape = self._param.size()
+        super().__init__(batch_shape, validate_args=validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(NegativeBinomial, _instance)
+        batch_shape = torch.Size(batch_shape)
+        new.total_count = self.total_count.expand(batch_shape)
+        if "probs" in self.__dict__:
+            new.probs = self.probs.expand(batch_shape)
+            new._param = new.probs
+        if "logits" in self.__dict__:
+            new.logits = self.logits.expand(batch_shape)
+            new._param = new.logits
+        super(NegativeBinomial, new).__init__(batch_shape, validate_args=False)
+        new._validate_args = self._validate_args
+        return new
+
+    def _new(self, *args, **kwargs):
+        return self._param.new(*args, **kwargs)
+
+    @property
+    def mean(self):
+        return self.total_count * torch.exp(self.logits)
+
+    @property
+    def mode(self):
+        return ((self.total_count - 1) * self.logits.exp()).floor().clamp(min=0.0)
+
+    @property
+    def variance(self):
+        return self.mean / torch.sigmoid(-self.logits)
+
+    @lazy_property
+    def logits(self):
+        return probs_to_logits(self.probs, is_binary=True)
+
+    @lazy_property
+    def probs(self):
+        return logits_to_probs(self.logits, is_binary=True)
+
+    @property
+    def param_shape(self):
+        return self._param.size()
+
+    @lazy_property
+    def _gamma(self):
+        # Note we avoid validating because self.total_count can be zero.
+        return torch.distributions.Gamma(
+            concentration=self.total_count,
+            rate=torch.exp(-self.logits),
+            validate_args=False,
+        )
+
+    def sample(self, sample_shape=torch.Size()):
+        with torch.no_grad():
+            rate = self._gamma.sample(sample_shape=sample_shape)
+            return torch.poisson(rate)
+
+    def log_prob(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+
+        log_unnormalized_prob = self.total_count * F.logsigmoid(
+            -self.logits
+        ) + value * F.logsigmoid(self.logits)
+
+        log_normalization = (
+            -torch.lgamma(self.total_count + value)
+            + torch.lgamma(1.0 + value)
+            + torch.lgamma(self.total_count)
+        )
+        # The case self.total_count == 0 and value == 0 has probability 1 but
+        # lgamma(0) is infinite. Handle this case separately using a function
+        # that does not modify tensors in place to allow Jit compilation.
+        log_normalization = log_normalization.masked_fill(
+            self.total_count + value == 0.0, 0.0
+        )
+
+        return log_unnormalized_prob - log_normalization
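# Usage sketch (illustrative, assuming the vendored NegativeBinomial added above): empirical
# moments against the closed-form mean r * p / (1 - p) and variance mean / (1 - p) used in
# the properties above.
import torch
from torch.distributions import NegativeBinomial

torch.manual_seed(0)
nb = NegativeBinomial(total_count=torch.tensor(10.0), probs=torch.tensor(0.25))
x = nb.sample((100_000,))
print(x.mean(), nb.mean)
print(x.var(), nb.variance)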
diff --git a/MLPY/Lib/site-packages/torch/distributions/normal.py b/MLPY/Lib/site-packages/torch/distributions/normal.py
new file mode 100644
index 0000000000000000000000000000000000000000..11079ebb0618f783d5969d5121fbf494524ae9d1
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/normal.py
@@ -0,0 +1,109 @@
+import math
+from numbers import Number, Real
+
+import torch
+from torch.distributions import constraints
+from torch.distributions.exp_family import ExponentialFamily
+from torch.distributions.utils import _standard_normal, broadcast_all
+
+__all__ = ["Normal"]
+
+
+class Normal(ExponentialFamily):
+    r"""
+    Creates a normal (also called Gaussian) distribution parameterized by
+    :attr:`loc` and :attr:`scale`.
+
+    Example::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = Normal(torch.tensor([0.0]), torch.tensor([1.0]))
+        >>> m.sample()  # normally distributed with loc=0 and scale=1
+        tensor([ 0.1046])
+
+    Args:
+        loc (float or Tensor): mean of the distribution (often referred to as mu)
+        scale (float or Tensor): standard deviation of the distribution
+            (often referred to as sigma)
+    """
+    arg_constraints = {"loc": constraints.real, "scale": constraints.positive}
+    support = constraints.real
+    has_rsample = True
+    _mean_carrier_measure = 0
+
+    @property
+    def mean(self):
+        return self.loc
+
+    @property
+    def mode(self):
+        return self.loc
+
+    @property
+    def stddev(self):
+        return self.scale
+
+    @property
+    def variance(self):
+        return self.stddev.pow(2)
+
+    def __init__(self, loc, scale, validate_args=None):
+        self.loc, self.scale = broadcast_all(loc, scale)
+        if isinstance(loc, Number) and isinstance(scale, Number):
+            batch_shape = torch.Size()
+        else:
+            batch_shape = self.loc.size()
+        super().__init__(batch_shape, validate_args=validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(Normal, _instance)
+        batch_shape = torch.Size(batch_shape)
+        new.loc = self.loc.expand(batch_shape)
+        new.scale = self.scale.expand(batch_shape)
+        super(Normal, new).__init__(batch_shape, validate_args=False)
+        new._validate_args = self._validate_args
+        return new
+
+    def sample(self, sample_shape=torch.Size()):
+        shape = self._extended_shape(sample_shape)
+        with torch.no_grad():
+            return torch.normal(self.loc.expand(shape), self.scale.expand(shape))
+
+    def rsample(self, sample_shape=torch.Size()):
+        shape = self._extended_shape(sample_shape)
+        eps = _standard_normal(shape, dtype=self.loc.dtype, device=self.loc.device)
+        return self.loc + eps * self.scale
+
+    def log_prob(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        # compute the variance
+        var = self.scale**2
+        log_scale = (
+            math.log(self.scale) if isinstance(self.scale, Real) else self.scale.log()
+        )
+        return (
+            -((value - self.loc) ** 2) / (2 * var)
+            - log_scale
+            - math.log(math.sqrt(2 * math.pi))
+        )
+
+    def cdf(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        return 0.5 * (
+            1 + torch.erf((value - self.loc) * self.scale.reciprocal() / math.sqrt(2))
+        )
+
+    def icdf(self, value):
+        return self.loc + self.scale * torch.erfinv(2 * value - 1) * math.sqrt(2)
+
+    def entropy(self):
+        return 0.5 + 0.5 * math.log(2 * math.pi) + torch.log(self.scale)
+
+    @property
+    def _natural_params(self):
+        return (self.loc / self.scale.pow(2), -0.5 * self.scale.pow(2).reciprocal())
+
+    def _log_normalizer(self, x, y):
+        return -0.25 * x.pow(2) / y + 0.5 * torch.log(-math.pi / y)
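# Usage sketch (illustrative, assuming the vendored Normal added above): rsample() is the
# reparameterized path loc + eps * scale, so gradients flow back to loc and scale.
import torch
from torch.distributions import Normal

loc = torch.tensor(0.0, requires_grad=True)
scale = torch.tensor(1.0, requires_grad=True)
x = Normal(loc, scale).rsample((1000,))
(x ** 2).mean().backward()
print(loc.grad, scale.grad)          # both populated thanks to reparameterization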
diff --git a/MLPY/Lib/site-packages/torch/distributions/one_hot_categorical.py b/MLPY/Lib/site-packages/torch/distributions/one_hot_categorical.py
new file mode 100644
index 0000000000000000000000000000000000000000..74f354f2ca1a4458d5a273d82cc0481669fc52ad
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/one_hot_categorical.py
@@ -0,0 +1,129 @@
+import torch
+from torch.distributions import constraints
+from torch.distributions.categorical import Categorical
+from torch.distributions.distribution import Distribution
+
+__all__ = ["OneHotCategorical", "OneHotCategoricalStraightThrough"]
+
+
+class OneHotCategorical(Distribution):
+    r"""
+    Creates a one-hot categorical distribution parameterized by :attr:`probs` or
+    :attr:`logits`.
+
+    Samples are one-hot coded vectors of size ``probs.size(-1)``.
+
+    .. note:: The `probs` argument must be non-negative, finite and have a non-zero sum,
+              and it will be normalized to sum to 1 along the last dimension. :attr:`probs`
+              will return this normalized value.
+              The `logits` argument will be interpreted as unnormalized log probabilities
+              and can therefore be any real number. It will likewise be normalized so that
+              the resulting probabilities sum to 1 along the last dimension. :attr:`logits`
+              will return this normalized value.
+
+    See also: :func:`torch.distributions.Categorical` for specifications of
+    :attr:`probs` and :attr:`logits`.
+
+    Example::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = OneHotCategorical(torch.tensor([ 0.25, 0.25, 0.25, 0.25 ]))
+        >>> m.sample()  # equal probability of 0, 1, 2, 3
+        tensor([ 0.,  0.,  0.,  1.])
+
+    Args:
+        probs (Tensor): event probabilities
+        logits (Tensor): event log probabilities (unnormalized)
+    """
+    arg_constraints = {"probs": constraints.simplex, "logits": constraints.real_vector}
+    support = constraints.one_hot
+    has_enumerate_support = True
+
+    def __init__(self, probs=None, logits=None, validate_args=None):
+        self._categorical = Categorical(probs, logits)
+        batch_shape = self._categorical.batch_shape
+        event_shape = self._categorical.param_shape[-1:]
+        super().__init__(batch_shape, event_shape, validate_args=validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(OneHotCategorical, _instance)
+        batch_shape = torch.Size(batch_shape)
+        new._categorical = self._categorical.expand(batch_shape)
+        super(OneHotCategorical, new).__init__(
+            batch_shape, self.event_shape, validate_args=False
+        )
+        new._validate_args = self._validate_args
+        return new
+
+    def _new(self, *args, **kwargs):
+        return self._categorical._new(*args, **kwargs)
+
+    @property
+    def _param(self):
+        return self._categorical._param
+
+    @property
+    def probs(self):
+        return self._categorical.probs
+
+    @property
+    def logits(self):
+        return self._categorical.logits
+
+    @property
+    def mean(self):
+        return self._categorical.probs
+
+    @property
+    def mode(self):
+        probs = self._categorical.probs
+        mode = probs.argmax(axis=-1)
+        return torch.nn.functional.one_hot(mode, num_classes=probs.shape[-1]).to(probs)
+
+    @property
+    def variance(self):
+        return self._categorical.probs * (1 - self._categorical.probs)
+
+    @property
+    def param_shape(self):
+        return self._categorical.param_shape
+
+    def sample(self, sample_shape=torch.Size()):
+        sample_shape = torch.Size(sample_shape)
+        probs = self._categorical.probs
+        num_events = self._categorical._num_events
+        indices = self._categorical.sample(sample_shape)
+        return torch.nn.functional.one_hot(indices, num_events).to(probs)
+
+    def log_prob(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        indices = value.max(-1)[1]
+        return self._categorical.log_prob(indices)
+
+    def entropy(self):
+        return self._categorical.entropy()
+
+    def enumerate_support(self, expand=True):
+        n = self.event_shape[0]
+        values = torch.eye(n, dtype=self._param.dtype, device=self._param.device)
+        values = values.view((n,) + (1,) * len(self.batch_shape) + (n,))
+        if expand:
+            values = values.expand((n,) + self.batch_shape + (n,))
+        return values
+
+
+class OneHotCategoricalStraightThrough(OneHotCategorical):
+    r"""
+    Creates a reparameterizable :class:`OneHotCategorical` distribution based on the straight-
+    through gradient estimator from [1].
+
+    [1] Estimating or Propagating Gradients Through Stochastic Neurons for Conditional Computation
+    (Bengio et al, 2013)
+    """
+    has_rsample = True
+
+    def rsample(self, sample_shape=torch.Size()):
+        samples = self.sample(sample_shape)
+        probs = self._categorical.probs  # cached via @lazy_property
+        return samples + (probs - probs.detach())
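# Usage sketch (illustrative, assuming the vendored OneHotCategoricalStraightThrough added
# above): the forward pass yields a hard one-hot sample while gradients are routed through
# the probabilities via the probs - probs.detach() term.
import torch
from torch.distributions import OneHotCategoricalStraightThrough

logits = torch.randn(4, requires_grad=True)
z = OneHotCategoricalStraightThrough(logits=logits).rsample()
(z * torch.arange(4.0)).sum().backward()
print(z)             # a hard one-hot vector
print(logits.grad)   # non-zero: gradients flow through the relaxed probs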
diff --git a/MLPY/Lib/site-packages/torch/distributions/pareto.py b/MLPY/Lib/site-packages/torch/distributions/pareto.py
new file mode 100644
index 0000000000000000000000000000000000000000..3297b47488ecd2e5143b105d461fd695a83add7f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/pareto.py
@@ -0,0 +1,60 @@
+from torch.distributions import constraints
+from torch.distributions.exponential import Exponential
+from torch.distributions.transformed_distribution import TransformedDistribution
+from torch.distributions.transforms import AffineTransform, ExpTransform
+from torch.distributions.utils import broadcast_all
+
+__all__ = ["Pareto"]
+
+
+class Pareto(TransformedDistribution):
+    r"""
+    Samples from a Pareto Type 1 distribution.
+
+    Example::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = Pareto(torch.tensor([1.0]), torch.tensor([1.0]))
+        >>> m.sample()  # sample from a Pareto distribution with scale=1 and alpha=1
+        tensor([ 1.5623])
+
+    Args:
+        scale (float or Tensor): Scale parameter of the distribution
+        alpha (float or Tensor): Shape parameter of the distribution
+    """
+    arg_constraints = {"alpha": constraints.positive, "scale": constraints.positive}
+
+    def __init__(self, scale, alpha, validate_args=None):
+        self.scale, self.alpha = broadcast_all(scale, alpha)
+        base_dist = Exponential(self.alpha, validate_args=validate_args)
+        transforms = [ExpTransform(), AffineTransform(loc=0, scale=self.scale)]
+        super().__init__(base_dist, transforms, validate_args=validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(Pareto, _instance)
+        new.scale = self.scale.expand(batch_shape)
+        new.alpha = self.alpha.expand(batch_shape)
+        return super().expand(batch_shape, _instance=new)
+
+    @property
+    def mean(self):
+        # mean is inf for alpha <= 1
+        a = self.alpha.clamp(min=1)
+        return a * self.scale / (a - 1)
+
+    @property
+    def mode(self):
+        return self.scale
+
+    @property
+    def variance(self):
+        # var is inf for alpha <= 2
+        a = self.alpha.clamp(min=2)
+        return self.scale.pow(2) * a / ((a - 1).pow(2) * (a - 2))
+
+    @constraints.dependent_property(is_discrete=False, event_dim=0)
+    def support(self):
+        return constraints.greater_than_eq(self.scale)
+
+    def entropy(self):
+        return (self.scale / self.alpha).log() + (1 + self.alpha.reciprocal())
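Since `Pareto` is constructed as `scale * exp(Exponential(alpha))`, its `log_prob` should agree with the closed-form Pareto Type 1 density. A small illustrative check (the hand-written formula below is standard, not taken from this file):

```python
import torch
from torch.distributions import Pareto

scale, alpha = torch.tensor(2.0), torch.tensor(3.0)
p = Pareto(scale, alpha)

# Closed-form log-density for x >= scale:
#   log p(x) = log(alpha) + alpha * log(scale) - (alpha + 1) * log(x)
x = torch.tensor(2.5)
manual = alpha.log() + alpha * scale.log() - (alpha + 1) * x.log()
print(torch.allclose(p.log_prob(x), manual))  # expected: True
```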
diff --git a/MLPY/Lib/site-packages/torch/distributions/poisson.py b/MLPY/Lib/site-packages/torch/distributions/poisson.py
new file mode 100644
index 0000000000000000000000000000000000000000..fac9ee0aa691c16956063d4b3dfd7e6f1e670d8e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/poisson.py
@@ -0,0 +1,77 @@
+from numbers import Number
+
+import torch
+from torch.distributions import constraints
+from torch.distributions.exp_family import ExponentialFamily
+from torch.distributions.utils import broadcast_all
+
+__all__ = ["Poisson"]
+
+
+class Poisson(ExponentialFamily):
+    r"""
+    Creates a Poisson distribution parameterized by :attr:`rate`, the rate parameter.
+
+    Samples are nonnegative integers, with a pmf given by
+
+    .. math::
+      \mathrm{rate}^k \frac{e^{-\mathrm{rate}}}{k!}
+
+    Example::
+
+        >>> # xdoctest: +SKIP("poisson_cpu not implemented for 'Long'")
+        >>> m = Poisson(torch.tensor([4]))
+        >>> m.sample()
+        tensor([ 3.])
+
+    Args:
+        rate (Number, Tensor): the rate parameter
+    """
+    arg_constraints = {"rate": constraints.nonnegative}
+    support = constraints.nonnegative_integer
+
+    @property
+    def mean(self):
+        return self.rate
+
+    @property
+    def mode(self):
+        return self.rate.floor()
+
+    @property
+    def variance(self):
+        return self.rate
+
+    def __init__(self, rate, validate_args=None):
+        (self.rate,) = broadcast_all(rate)
+        if isinstance(rate, Number):
+            batch_shape = torch.Size()
+        else:
+            batch_shape = self.rate.size()
+        super().__init__(batch_shape, validate_args=validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(Poisson, _instance)
+        batch_shape = torch.Size(batch_shape)
+        new.rate = self.rate.expand(batch_shape)
+        super(Poisson, new).__init__(batch_shape, validate_args=False)
+        new._validate_args = self._validate_args
+        return new
+
+    def sample(self, sample_shape=torch.Size()):
+        shape = self._extended_shape(sample_shape)
+        with torch.no_grad():
+            return torch.poisson(self.rate.expand(shape))
+
+    def log_prob(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        rate, value = broadcast_all(self.rate, value)
+        return value.xlogy(rate) - rate - (value + 1).lgamma()
+
+    @property
+    def _natural_params(self):
+        return (torch.log(self.rate),)
+
+    def _log_normalizer(self, x):
+        return torch.exp(x)
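The `log_prob` above is just the log of the pmf shown in the docstring, written with `xlogy` and `lgamma`. A quick hedged check against the hand-written formula:

```python
import torch
from torch.distributions import Poisson

rate = torch.tensor(4.0)
d = Poisson(rate)

# log pmf from the docstring: k * log(rate) - rate - log(k!)
k = torch.tensor(3.0)
manual = k * rate.log() - rate - torch.lgamma(k + 1)
print(torch.allclose(d.log_prob(k), manual))  # expected: True
```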
diff --git a/MLPY/Lib/site-packages/torch/distributions/relaxed_bernoulli.py b/MLPY/Lib/site-packages/torch/distributions/relaxed_bernoulli.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f56aad243aaf7f0c456e4e61679d1df3d53563e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/relaxed_bernoulli.py
@@ -0,0 +1,149 @@
+from numbers import Number
+
+import torch
+from torch.distributions import constraints
+from torch.distributions.distribution import Distribution
+from torch.distributions.transformed_distribution import TransformedDistribution
+from torch.distributions.transforms import SigmoidTransform
+from torch.distributions.utils import (
+    broadcast_all,
+    clamp_probs,
+    lazy_property,
+    logits_to_probs,
+    probs_to_logits,
+)
+
+__all__ = ["LogitRelaxedBernoulli", "RelaxedBernoulli"]
+
+
+class LogitRelaxedBernoulli(Distribution):
+    r"""
+    Creates a LogitRelaxedBernoulli distribution parameterized by :attr:`probs`
+    or :attr:`logits` (but not both), which is the logit of a RelaxedBernoulli
+    distribution.
+
+    Samples are logits of values in (0, 1). See [1] for more details.
+
+    Args:
+        temperature (Tensor): relaxation temperature
+        probs (Number, Tensor): the probability of sampling `1`
+        logits (Number, Tensor): the log-odds of sampling `1`
+
+    [1] The Concrete Distribution: A Continuous Relaxation of Discrete Random
+    Variables (Maddison et al, 2017)
+
+    [2] Categorical Reparametrization with Gumbel-Softmax
+    (Jang et al, 2017)
+    """
+    arg_constraints = {"probs": constraints.unit_interval, "logits": constraints.real}
+    support = constraints.real
+
+    def __init__(self, temperature, probs=None, logits=None, validate_args=None):
+        self.temperature = temperature
+        if (probs is None) == (logits is None):
+            raise ValueError(
+                "Either `probs` or `logits` must be specified, but not both."
+            )
+        if probs is not None:
+            is_scalar = isinstance(probs, Number)
+            (self.probs,) = broadcast_all(probs)
+        else:
+            is_scalar = isinstance(logits, Number)
+            (self.logits,) = broadcast_all(logits)
+        self._param = self.probs if probs is not None else self.logits
+        if is_scalar:
+            batch_shape = torch.Size()
+        else:
+            batch_shape = self._param.size()
+        super().__init__(batch_shape, validate_args=validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(LogitRelaxedBernoulli, _instance)
+        batch_shape = torch.Size(batch_shape)
+        new.temperature = self.temperature
+        if "probs" in self.__dict__:
+            new.probs = self.probs.expand(batch_shape)
+            new._param = new.probs
+        if "logits" in self.__dict__:
+            new.logits = self.logits.expand(batch_shape)
+            new._param = new.logits
+        super(LogitRelaxedBernoulli, new).__init__(batch_shape, validate_args=False)
+        new._validate_args = self._validate_args
+        return new
+
+    def _new(self, *args, **kwargs):
+        return self._param.new(*args, **kwargs)
+
+    @lazy_property
+    def logits(self):
+        return probs_to_logits(self.probs, is_binary=True)
+
+    @lazy_property
+    def probs(self):
+        return logits_to_probs(self.logits, is_binary=True)
+
+    @property
+    def param_shape(self):
+        return self._param.size()
+
+    def rsample(self, sample_shape=torch.Size()):
+        shape = self._extended_shape(sample_shape)
+        probs = clamp_probs(self.probs.expand(shape))
+        uniforms = clamp_probs(
+            torch.rand(shape, dtype=probs.dtype, device=probs.device)
+        )
+        return (
+            uniforms.log() - (-uniforms).log1p() + probs.log() - (-probs).log1p()
+        ) / self.temperature
+
+    def log_prob(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        logits, value = broadcast_all(self.logits, value)
+        diff = logits - value.mul(self.temperature)
+        return self.temperature.log() + diff - 2 * diff.exp().log1p()
+
+
+class RelaxedBernoulli(TransformedDistribution):
+    r"""
+    Creates a RelaxedBernoulli distribution, parametrized by
+    :attr:`temperature`, and either :attr:`probs` or :attr:`logits`
+    (but not both). This is a relaxed version of the `Bernoulli` distribution,
+    so its values are in (0, 1) and its samples are reparametrizable.
+
+    Example::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = RelaxedBernoulli(torch.tensor([2.2]),
+        ...                      torch.tensor([0.1, 0.2, 0.3, 0.99]))
+        >>> m.sample()
+        tensor([ 0.2951,  0.3442,  0.8918,  0.9021])
+
+    Args:
+        temperature (Tensor): relaxation temperature
+        probs (Number, Tensor): the probability of sampling `1`
+        logits (Number, Tensor): the log-odds of sampling `1`
+    """
+    arg_constraints = {"probs": constraints.unit_interval, "logits": constraints.real}
+    support = constraints.unit_interval
+    has_rsample = True
+
+    def __init__(self, temperature, probs=None, logits=None, validate_args=None):
+        base_dist = LogitRelaxedBernoulli(temperature, probs, logits)
+        super().__init__(base_dist, SigmoidTransform(), validate_args=validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(RelaxedBernoulli, _instance)
+        return super().expand(batch_shape, _instance=new)
+
+    @property
+    def temperature(self):
+        return self.base_dist.temperature
+
+    @property
+    def logits(self):
+        return self.base_dist.logits
+
+    @property
+    def probs(self):
+        return self.base_dist.probs
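A short illustrative sketch (with assumed example values) of how the temperature trades off between smooth, differentiable samples and near-binary ones, and of `rsample` being reparameterized:

```python
import torch
from torch.distributions import RelaxedBernoulli

logits = torch.zeros(4, requires_grad=True)
for temp in (5.0, 0.1):
    d = RelaxedBernoulli(torch.tensor(temp), logits=logits)
    print(temp, d.rsample())        # low temperature pushes samples toward {0, 1}

sample = RelaxedBernoulli(torch.tensor(0.5), logits=logits).rsample()
sample.sum().backward()
print(logits.grad)                  # gradients reach the logits through rsample
```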
diff --git a/MLPY/Lib/site-packages/torch/distributions/relaxed_categorical.py b/MLPY/Lib/site-packages/torch/distributions/relaxed_categorical.py
new file mode 100644
index 0000000000000000000000000000000000000000..5cac7b9c285a51538d9c7219584285564dfc807e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/relaxed_categorical.py
@@ -0,0 +1,139 @@
+import torch
+from torch.distributions import constraints
+from torch.distributions.categorical import Categorical
+from torch.distributions.distribution import Distribution
+from torch.distributions.transformed_distribution import TransformedDistribution
+from torch.distributions.transforms import ExpTransform
+from torch.distributions.utils import broadcast_all, clamp_probs
+
+__all__ = ["ExpRelaxedCategorical", "RelaxedOneHotCategorical"]
+
+
+class ExpRelaxedCategorical(Distribution):
+    r"""
+    Creates an ExpRelaxedCategorical distribution parameterized by
+    :attr:`temperature`, and either :attr:`probs` or :attr:`logits` (but not both).
+    Returns the log of a point in the simplex. Based on the interface to
+    :class:`OneHotCategorical`.
+
+    Implementation based on [1].
+
+    See also: :func:`torch.distributions.OneHotCategorical`
+
+    Args:
+        temperature (Tensor): relaxation temperature
+        probs (Tensor): event probabilities
+        logits (Tensor): unnormalized log probability for each event
+
+    [1] The Concrete Distribution: A Continuous Relaxation of Discrete Random Variables
+    (Maddison et al, 2017)
+
+    [2] Categorical Reparametrization with Gumbel-Softmax
+    (Jang et al, 2017)
+    """
+    arg_constraints = {"probs": constraints.simplex, "logits": constraints.real_vector}
+    support = (
+        constraints.real_vector
+    )  # The true support is actually a submanifold of this.
+    has_rsample = True
+
+    def __init__(self, temperature, probs=None, logits=None, validate_args=None):
+        self._categorical = Categorical(probs, logits)
+        self.temperature = temperature
+        batch_shape = self._categorical.batch_shape
+        event_shape = self._categorical.param_shape[-1:]
+        super().__init__(batch_shape, event_shape, validate_args=validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(ExpRelaxedCategorical, _instance)
+        batch_shape = torch.Size(batch_shape)
+        new.temperature = self.temperature
+        new._categorical = self._categorical.expand(batch_shape)
+        super(ExpRelaxedCategorical, new).__init__(
+            batch_shape, self.event_shape, validate_args=False
+        )
+        new._validate_args = self._validate_args
+        return new
+
+    def _new(self, *args, **kwargs):
+        return self._categorical._new(*args, **kwargs)
+
+    @property
+    def param_shape(self):
+        return self._categorical.param_shape
+
+    @property
+    def logits(self):
+        return self._categorical.logits
+
+    @property
+    def probs(self):
+        return self._categorical.probs
+
+    def rsample(self, sample_shape=torch.Size()):
+        shape = self._extended_shape(sample_shape)
+        uniforms = clamp_probs(
+            torch.rand(shape, dtype=self.logits.dtype, device=self.logits.device)
+        )
+        gumbels = -((-(uniforms.log())).log())
+        scores = (self.logits + gumbels) / self.temperature
+        return scores - scores.logsumexp(dim=-1, keepdim=True)
+
+    def log_prob(self, value):
+        K = self._categorical._num_events
+        if self._validate_args:
+            self._validate_sample(value)
+        logits, value = broadcast_all(self.logits, value)
+        log_scale = torch.full_like(
+            self.temperature, float(K)
+        ).lgamma() - self.temperature.log().mul(-(K - 1))
+        score = logits - value.mul(self.temperature)
+        score = (score - score.logsumexp(dim=-1, keepdim=True)).sum(-1)
+        return score + log_scale
+
+
+class RelaxedOneHotCategorical(TransformedDistribution):
+    r"""
+    Creates a RelaxedOneHotCategorical distribution parametrized by
+    :attr:`temperature`, and either :attr:`probs` or :attr:`logits`.
+    This is a relaxed version of the :class:`OneHotCategorical` distribution, so
+    its samples are on the simplex and are reparametrizable.
+
+    Example::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = RelaxedOneHotCategorical(torch.tensor([2.2]),
+        ...                              torch.tensor([0.1, 0.2, 0.3, 0.4]))
+        >>> m.sample()
+        tensor([ 0.1294,  0.2324,  0.3859,  0.2523])
+
+    Args:
+        temperature (Tensor): relaxation temperature
+        probs (Tensor): event probabilities
+        logits (Tensor): unnormalized log probability for each event
+    """
+    arg_constraints = {"probs": constraints.simplex, "logits": constraints.real_vector}
+    support = constraints.simplex
+    has_rsample = True
+
+    def __init__(self, temperature, probs=None, logits=None, validate_args=None):
+        base_dist = ExpRelaxedCategorical(
+            temperature, probs, logits, validate_args=validate_args
+        )
+        super().__init__(base_dist, ExpTransform(), validate_args=validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(RelaxedOneHotCategorical, _instance)
+        return super().expand(batch_shape, _instance=new)
+
+    @property
+    def temperature(self):
+        return self.base_dist.temperature
+
+    @property
+    def logits(self):
+        return self.base_dist.logits
+
+    @property
+    def probs(self):
+        return self.base_dist.probs
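A companion sketch for the relaxed categorical (Gumbel-softmax) case, with assumed example values: samples always lie on the simplex, and they approach one-hot vectors as the temperature shrinks:

```python
import torch
from torch.distributions import RelaxedOneHotCategorical

logits = torch.log(torch.tensor([0.1, 0.2, 0.7]))
for temp in (2.0, 0.05):
    d = RelaxedOneHotCategorical(torch.tensor(temp), logits=logits)
    s = d.rsample()
    print(temp, s, s.sum())         # sums to 1; nearly one-hot for small temp
```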
diff --git a/MLPY/Lib/site-packages/torch/distributions/studentT.py b/MLPY/Lib/site-packages/torch/distributions/studentT.py
new file mode 100644
index 0000000000000000000000000000000000000000..7881b5e50088d071a0a778c7229198b66f2c00b5
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/studentT.py
@@ -0,0 +1,116 @@
+import math
+
+import torch
+from torch import inf, nan
+from torch.distributions import Chi2, constraints
+from torch.distributions.distribution import Distribution
+from torch.distributions.utils import _standard_normal, broadcast_all
+
+__all__ = ["StudentT"]
+
+
+class StudentT(Distribution):
+    r"""
+    Creates a Student's t-distribution parameterized by degrees of
+    freedom :attr:`df`, mean :attr:`loc` and scale :attr:`scale`.
+
+    Example::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = StudentT(torch.tensor([2.0]))
+        >>> m.sample()  # Student's t-distributed with degrees of freedom=2
+        tensor([ 0.1046])
+
+    Args:
+        df (float or Tensor): degrees of freedom
+        loc (float or Tensor): mean of the distribution
+        scale (float or Tensor): scale of the distribution
+    """
+    arg_constraints = {
+        "df": constraints.positive,
+        "loc": constraints.real,
+        "scale": constraints.positive,
+    }
+    support = constraints.real
+    has_rsample = True
+
+    @property
+    def mean(self):
+        m = self.loc.clone(memory_format=torch.contiguous_format)
+        m[self.df <= 1] = nan
+        return m
+
+    @property
+    def mode(self):
+        return self.loc
+
+    @property
+    def variance(self):
+        m = self.df.clone(memory_format=torch.contiguous_format)
+        m[self.df > 2] = (
+            self.scale[self.df > 2].pow(2)
+            * self.df[self.df > 2]
+            / (self.df[self.df > 2] - 2)
+        )
+        m[(self.df <= 2) & (self.df > 1)] = inf
+        m[self.df <= 1] = nan
+        return m
+
+    def __init__(self, df, loc=0.0, scale=1.0, validate_args=None):
+        self.df, self.loc, self.scale = broadcast_all(df, loc, scale)
+        self._chi2 = Chi2(self.df)
+        batch_shape = self.df.size()
+        super().__init__(batch_shape, validate_args=validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(StudentT, _instance)
+        batch_shape = torch.Size(batch_shape)
+        new.df = self.df.expand(batch_shape)
+        new.loc = self.loc.expand(batch_shape)
+        new.scale = self.scale.expand(batch_shape)
+        new._chi2 = self._chi2.expand(batch_shape)
+        super(StudentT, new).__init__(batch_shape, validate_args=False)
+        new._validate_args = self._validate_args
+        return new
+
+    def rsample(self, sample_shape=torch.Size()):
+        # NOTE: This does not agree with scipy implementation as much as other distributions.
+        # (see https://github.com/fritzo/notebooks/blob/master/debug-student-t.ipynb). Using DoubleTensor
+        # parameters seems to help.
+
+        #   X ~ Normal(0, 1)
+        #   Z ~ Chi2(df)
+        #   Y = X / sqrt(Z / df) ~ StudentT(df)
+        shape = self._extended_shape(sample_shape)
+        X = _standard_normal(shape, dtype=self.df.dtype, device=self.df.device)
+        Z = self._chi2.rsample(sample_shape)
+        Y = X * torch.rsqrt(Z / self.df)
+        return self.loc + self.scale * Y
+
+    def log_prob(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        y = (value - self.loc) / self.scale
+        Z = (
+            self.scale.log()
+            + 0.5 * self.df.log()
+            + 0.5 * math.log(math.pi)
+            + torch.lgamma(0.5 * self.df)
+            - torch.lgamma(0.5 * (self.df + 1.0))
+        )
+        return -0.5 * (self.df + 1.0) * torch.log1p(y**2.0 / self.df) - Z
+
+    def entropy(self):
+        lbeta = (
+            torch.lgamma(0.5 * self.df)
+            + math.lgamma(0.5)
+            - torch.lgamma(0.5 * (self.df + 1))
+        )
+        return (
+            self.scale.log()
+            + 0.5
+            * (self.df + 1)
+            * (torch.digamma(0.5 * (self.df + 1)) - torch.digamma(0.5 * self.df))
+            + 0.5 * self.df.log()
+            + lbeta
+        )
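The `rsample` above follows the recipe in its comment (`X ~ Normal(0, 1)`, `Z ~ Chi2(df)`, `Y = X / sqrt(Z / df)`). A hedged sanity check that this recipe reproduces the documented variance `df / (df - 2)` for `df > 2`:

```python
import torch
from torch.distributions import Chi2, Normal, StudentT

df = torch.tensor([5.0])
t = StudentT(df)

X = Normal(0.0, 1.0).sample((100_000, 1))
Z = Chi2(df).sample((100_000,))          # shape (100_000, 1)
Y = X * torch.rsqrt(Z / df)

print(Y.var(), t.variance)               # both approximately df / (df - 2) = 5 / 3
```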
diff --git a/MLPY/Lib/site-packages/torch/distributions/transformed_distribution.py b/MLPY/Lib/site-packages/torch/distributions/transformed_distribution.py
new file mode 100644
index 0000000000000000000000000000000000000000..6165c8cf39087325ae84c42d612f836c7daba2a3
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/transformed_distribution.py
@@ -0,0 +1,215 @@
+from typing import Dict
+
+import torch
+from torch.distributions import constraints
+from torch.distributions.distribution import Distribution
+from torch.distributions.independent import Independent
+from torch.distributions.transforms import ComposeTransform, Transform
+from torch.distributions.utils import _sum_rightmost
+
+__all__ = ["TransformedDistribution"]
+
+
+class TransformedDistribution(Distribution):
+    r"""
+    Extension of the Distribution class, which applies a sequence of Transforms
+    to a base distribution.  Let f be the composition of transforms applied::
+
+        X ~ BaseDistribution
+        Y = f(X) ~ TransformedDistribution(BaseDistribution, f)
+        log p(Y) = log p(X) + log |det (dX/dY)|
+
+    Note that the ``.event_shape`` of a :class:`TransformedDistribution` is the
+    maximum shape of its base distribution and its transforms, since transforms
+    can introduce correlations among events.
+
+    An example for the usage of :class:`TransformedDistribution` would be::
+
+        # Building a Logistic Distribution
+        # X ~ Uniform(0, 1)
+        # f = a + b * logit(X)
+        # Y ~ f(X) ~ Logistic(a, b)
+        base_distribution = Uniform(0, 1)
+        transforms = [SigmoidTransform().inv, AffineTransform(loc=a, scale=b)]
+        logistic = TransformedDistribution(base_distribution, transforms)
+
+    For more examples, please look at the implementations of
+    :class:`~torch.distributions.gumbel.Gumbel`,
+    :class:`~torch.distributions.half_cauchy.HalfCauchy`,
+    :class:`~torch.distributions.half_normal.HalfNormal`,
+    :class:`~torch.distributions.log_normal.LogNormal`,
+    :class:`~torch.distributions.pareto.Pareto`,
+    :class:`~torch.distributions.weibull.Weibull`,
+    :class:`~torch.distributions.relaxed_bernoulli.RelaxedBernoulli` and
+    :class:`~torch.distributions.relaxed_categorical.RelaxedOneHotCategorical`
+    """
+    arg_constraints: Dict[str, constraints.Constraint] = {}
+
+    def __init__(self, base_distribution, transforms, validate_args=None):
+        if isinstance(transforms, Transform):
+            self.transforms = [
+                transforms,
+            ]
+        elif isinstance(transforms, list):
+            if not all(isinstance(t, Transform) for t in transforms):
+                raise ValueError(
+                    "transforms must be a Transform or a list of Transforms"
+                )
+            self.transforms = transforms
+        else:
+            raise ValueError(
+                f"transforms must be a Transform or list, but was {transforms}"
+            )
+
+        # Reshape base_distribution according to transforms.
+        base_shape = base_distribution.batch_shape + base_distribution.event_shape
+        base_event_dim = len(base_distribution.event_shape)
+        transform = ComposeTransform(self.transforms)
+        if len(base_shape) < transform.domain.event_dim:
+            raise ValueError(
+                "base_distribution needs to have shape with size at least {}, but got {}.".format(
+                    transform.domain.event_dim, base_shape
+                )
+            )
+        forward_shape = transform.forward_shape(base_shape)
+        expanded_base_shape = transform.inverse_shape(forward_shape)
+        if base_shape != expanded_base_shape:
+            base_batch_shape = expanded_base_shape[
+                : len(expanded_base_shape) - base_event_dim
+            ]
+            base_distribution = base_distribution.expand(base_batch_shape)
+        reinterpreted_batch_ndims = transform.domain.event_dim - base_event_dim
+        if reinterpreted_batch_ndims > 0:
+            base_distribution = Independent(
+                base_distribution, reinterpreted_batch_ndims
+            )
+        self.base_dist = base_distribution
+
+        # Compute shapes.
+        transform_change_in_event_dim = (
+            transform.codomain.event_dim - transform.domain.event_dim
+        )
+        event_dim = max(
+            transform.codomain.event_dim,  # the transform is coupled
+            base_event_dim + transform_change_in_event_dim,  # the base dist is coupled
+        )
+        assert len(forward_shape) >= event_dim
+        cut = len(forward_shape) - event_dim
+        batch_shape = forward_shape[:cut]
+        event_shape = forward_shape[cut:]
+        super().__init__(batch_shape, event_shape, validate_args=validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(TransformedDistribution, _instance)
+        batch_shape = torch.Size(batch_shape)
+        shape = batch_shape + self.event_shape
+        for t in reversed(self.transforms):
+            shape = t.inverse_shape(shape)
+        base_batch_shape = shape[: len(shape) - len(self.base_dist.event_shape)]
+        new.base_dist = self.base_dist.expand(base_batch_shape)
+        new.transforms = self.transforms
+        super(TransformedDistribution, new).__init__(
+            batch_shape, self.event_shape, validate_args=False
+        )
+        new._validate_args = self._validate_args
+        return new
+
+    @constraints.dependent_property(is_discrete=False)
+    def support(self):
+        if not self.transforms:
+            return self.base_dist.support
+        support = self.transforms[-1].codomain
+        if len(self.event_shape) > support.event_dim:
+            support = constraints.independent(
+                support, len(self.event_shape) - support.event_dim
+            )
+        return support
+
+    @property
+    def has_rsample(self):
+        return self.base_dist.has_rsample
+
+    def sample(self, sample_shape=torch.Size()):
+        """
+        Generates a sample_shape shaped sample or sample_shape shaped batch of
+        samples if the distribution parameters are batched. Samples first from
+        base distribution and applies `transform()` for every transform in the
+        list.
+        """
+        with torch.no_grad():
+            x = self.base_dist.sample(sample_shape)
+            for transform in self.transforms:
+                x = transform(x)
+            return x
+
+    def rsample(self, sample_shape=torch.Size()):
+        """
+        Generates a sample_shape shaped reparameterized sample or sample_shape
+        shaped batch of reparameterized samples if the distribution parameters
+        are batched. Samples first from base distribution and applies
+        `transform()` for every transform in the list.
+        """
+        x = self.base_dist.rsample(sample_shape)
+        for transform in self.transforms:
+            x = transform(x)
+        return x
+
+    def log_prob(self, value):
+        """
+        Scores the sample by inverting the transform(s) and computing the score
+        using the score of the base distribution and the log abs det jacobian.
+        """
+        if self._validate_args:
+            self._validate_sample(value)
+        event_dim = len(self.event_shape)
+        log_prob = 0.0
+        y = value
+        for transform in reversed(self.transforms):
+            x = transform.inv(y)
+            event_dim += transform.domain.event_dim - transform.codomain.event_dim
+            log_prob = log_prob - _sum_rightmost(
+                transform.log_abs_det_jacobian(x, y),
+                event_dim - transform.domain.event_dim,
+            )
+            y = x
+
+        log_prob = log_prob + _sum_rightmost(
+            self.base_dist.log_prob(y), event_dim - len(self.base_dist.event_shape)
+        )
+        return log_prob
+
+    def _monotonize_cdf(self, value):
+        """
+        This conditionally flips ``value -> 1-value`` to ensure :meth:`cdf` is
+        monotone increasing.
+        """
+        sign = 1
+        for transform in self.transforms:
+            sign = sign * transform.sign
+        if isinstance(sign, int) and sign == 1:
+            return value
+        return sign * (value - 0.5) + 0.5
+
+    def cdf(self, value):
+        """
+        Computes the cumulative distribution function by inverting the
+        transform(s) and evaluating the cdf of the base distribution.
+        """
+        for transform in self.transforms[::-1]:
+            value = transform.inv(value)
+        if self._validate_args:
+            self.base_dist._validate_sample(value)
+        value = self.base_dist.cdf(value)
+        value = self._monotonize_cdf(value)
+        return value
+
+    def icdf(self, value):
+        """
+        Computes the inverse cumulative distribution function by applying the
+        inverse cdf of the base distribution and then the transform(s).
+        """
+        value = self._monotonize_cdf(value)
+        value = self.base_dist.icdf(value)
+        for transform in self.transforms:
+            value = transform(value)
+        return value
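The Logistic construction in the class docstring can be exercised end to end. A hedged sketch comparing its `log_prob` with the closed-form logistic log-density (the formula below is standard, not taken from this file):

```python
import torch
import torch.nn.functional as F
from torch.distributions import TransformedDistribution, Uniform
from torch.distributions.transforms import AffineTransform, SigmoidTransform

a, b = 1.0, 2.0
logistic = TransformedDistribution(
    Uniform(0.0, 1.0),
    [SigmoidTransform().inv, AffineTransform(loc=a, scale=b)],
)

x = torch.tensor(0.3)
z = (x - a) / b
# Closed-form Logistic(a, b) log-density: -z - 2 * softplus(-z) - log(b)
manual = -z - 2 * F.softplus(-z) - torch.log(torch.tensor(b))
print(torch.allclose(logistic.log_prob(x), manual))  # expected: True
```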
diff --git a/MLPY/Lib/site-packages/torch/distributions/transforms.py b/MLPY/Lib/site-packages/torch/distributions/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..14e44f88014fb9810701e9e31b0d63abf15a039f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/transforms.py
@@ -0,0 +1,1245 @@
+import functools
+import math
+import numbers
+import operator
+import weakref
+from typing import List
+
+import torch
+import torch.nn.functional as F
+from torch.distributions import constraints
+from torch.distributions.utils import (
+    _sum_rightmost,
+    broadcast_all,
+    lazy_property,
+    tril_matrix_to_vec,
+    vec_to_tril_matrix,
+)
+from torch.nn.functional import pad, softplus
+
+__all__ = [
+    "AbsTransform",
+    "AffineTransform",
+    "CatTransform",
+    "ComposeTransform",
+    "CorrCholeskyTransform",
+    "CumulativeDistributionTransform",
+    "ExpTransform",
+    "IndependentTransform",
+    "LowerCholeskyTransform",
+    "PositiveDefiniteTransform",
+    "PowerTransform",
+    "ReshapeTransform",
+    "SigmoidTransform",
+    "SoftplusTransform",
+    "TanhTransform",
+    "SoftmaxTransform",
+    "StackTransform",
+    "StickBreakingTransform",
+    "Transform",
+    "identity_transform",
+]
+
+
+class Transform:
+    """
+    Abstract class for invertible transformations with computable log
+    det jacobians. They are primarily used in
+    :class:`torch.distributions.TransformedDistribution`.
+
+    Caching is useful for transforms whose inverses are either expensive or
+    numerically unstable. Note that care must be taken with memoized values
+    since the autograd graph may be reversed. For example while the following
+    works with or without caching::
+
+        y = t(x)
+        t.log_abs_det_jacobian(x, y).backward()  # x will receive gradients.
+
+    However the following will error when caching due to dependency reversal::
+
+        y = t(x)
+        z = t.inv(y)
+        grad(z.sum(), [y])  # error because z is x
+
+    Derived classes should implement one or both of :meth:`_call` or
+    :meth:`_inverse`. Derived classes that set `bijective=True` should also
+    implement :meth:`log_abs_det_jacobian`.
+
+    Args:
+        cache_size (int): Size of cache. If zero, no caching is done. If one,
+            the latest single value is cached. Only 0 and 1 are supported.
+
+    Attributes:
+        domain (:class:`~torch.distributions.constraints.Constraint`):
+            The constraint representing valid inputs to this transform.
+        codomain (:class:`~torch.distributions.constraints.Constraint`):
+            The constraint representing valid outputs to this transform
+            which are inputs to the inverse transform.
+        bijective (bool): Whether this transform is bijective. A transform
+            ``t`` is bijective iff ``t.inv(t(x)) == x`` and
+            ``t(t.inv(y)) == y`` for every ``x`` in the domain and ``y`` in
+            the codomain. Transforms that are not bijective should at least
+            maintain the weaker pseudoinverse properties
+            ``t(t.inv(t(x))) == t(x)`` and ``t.inv(t(t.inv(y))) == t.inv(y)``.
+        sign (int or Tensor): For bijective univariate transforms, this
+            should be +1 or -1 depending on whether transform is monotone
+            increasing or decreasing.
+    """
+
+    bijective = False
+    domain: constraints.Constraint
+    codomain: constraints.Constraint
+
+    def __init__(self, cache_size=0):
+        self._cache_size = cache_size
+        self._inv = None
+        if cache_size == 0:
+            pass  # default behavior
+        elif cache_size == 1:
+            self._cached_x_y = None, None
+        else:
+            raise ValueError("cache_size must be 0 or 1")
+        super().__init__()
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["_inv"] = None
+        return state
+
+    @property
+    def event_dim(self):
+        if self.domain.event_dim == self.codomain.event_dim:
+            return self.domain.event_dim
+        raise ValueError("Please use either .domain.event_dim or .codomain.event_dim")
+
+    @property
+    def inv(self):
+        """
+        Returns the inverse :class:`Transform` of this transform.
+        This should satisfy ``t.inv.inv is t``.
+        """
+        inv = None
+        if self._inv is not None:
+            inv = self._inv()
+        if inv is None:
+            inv = _InverseTransform(self)
+            self._inv = weakref.ref(inv)
+        return inv
+
+    @property
+    def sign(self):
+        """
+        Returns the sign of the determinant of the Jacobian, if applicable.
+        In general this only makes sense for bijective transforms.
+        """
+        raise NotImplementedError
+
+    def with_cache(self, cache_size=1):
+        if self._cache_size == cache_size:
+            return self
+        if type(self).__init__ is Transform.__init__:
+            return type(self)(cache_size=cache_size)
+        raise NotImplementedError(f"{type(self)}.with_cache is not implemented")
+
+    def __eq__(self, other):
+        return self is other
+
+    def __ne__(self, other):
+        # Necessary for Python2
+        return not self.__eq__(other)
+
+    def __call__(self, x):
+        """
+        Computes the transform `x => y`.
+        """
+        if self._cache_size == 0:
+            return self._call(x)
+        x_old, y_old = self._cached_x_y
+        if x is x_old:
+            return y_old
+        y = self._call(x)
+        self._cached_x_y = x, y
+        return y
+
+    def _inv_call(self, y):
+        """
+        Inverts the transform `y => x`.
+        """
+        if self._cache_size == 0:
+            return self._inverse(y)
+        x_old, y_old = self._cached_x_y
+        if y is y_old:
+            return x_old
+        x = self._inverse(y)
+        self._cached_x_y = x, y
+        return x
+
+    def _call(self, x):
+        """
+        Abstract method to compute forward transformation.
+        """
+        raise NotImplementedError
+
+    def _inverse(self, y):
+        """
+        Abstract method to compute inverse transformation.
+        """
+        raise NotImplementedError
+
+    def log_abs_det_jacobian(self, x, y):
+        """
+        Computes the log det jacobian `log |dy/dx|` given input and output.
+        """
+        raise NotImplementedError
+
+    def __repr__(self):
+        return self.__class__.__name__ + "()"
+
+    def forward_shape(self, shape):
+        """
+        Infers the shape of the forward computation, given the input shape.
+        Defaults to preserving shape.
+        """
+        return shape
+
+    def inverse_shape(self, shape):
+        """
+        Infers the shapes of the inverse computation, given the output shape.
+        Defaults to preserving shape.
+        """
+        return shape
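Per the docstring above, a custom transform implements `_call`, `_inverse`, and (when bijective) `log_abs_det_jacobian`. A minimal hedged sketch with a hypothetical `DoubleTransform` (not part of this file):

```python
import math

import torch
from torch.distributions import constraints
from torch.distributions.transforms import Transform


class DoubleTransform(Transform):
    """Hypothetical example: y = 2 * x."""

    domain = constraints.real
    codomain = constraints.real
    bijective = True
    sign = +1

    def __eq__(self, other):
        return isinstance(other, DoubleTransform)

    def _call(self, x):
        return 2 * x

    def _inverse(self, y):
        return y / 2

    def log_abs_det_jacobian(self, x, y):
        # |dy/dx| = 2 everywhere, so the log-Jacobian is a constant.
        return torch.full_like(x, math.log(2.0))


t = DoubleTransform()
x = torch.tensor(3.0)
print(t(x), t.inv(t(x)), t.log_abs_det_jacobian(x, t(x)))
```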
+
+
+class _InverseTransform(Transform):
+    """
+    Inverts a single :class:`Transform`.
+    This class is private; please instead use the ``Transform.inv`` property.
+    """
+
+    def __init__(self, transform: Transform):
+        super().__init__(cache_size=transform._cache_size)
+        self._inv: Transform = transform
+
+    @constraints.dependent_property(is_discrete=False)
+    def domain(self):
+        assert self._inv is not None
+        return self._inv.codomain
+
+    @constraints.dependent_property(is_discrete=False)
+    def codomain(self):
+        assert self._inv is not None
+        return self._inv.domain
+
+    @property
+    def bijective(self):
+        assert self._inv is not None
+        return self._inv.bijective
+
+    @property
+    def sign(self):
+        assert self._inv is not None
+        return self._inv.sign
+
+    @property
+    def inv(self):
+        return self._inv
+
+    def with_cache(self, cache_size=1):
+        assert self._inv is not None
+        return self.inv.with_cache(cache_size).inv
+
+    def __eq__(self, other):
+        if not isinstance(other, _InverseTransform):
+            return False
+        assert self._inv is not None
+        return self._inv == other._inv
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}({repr(self._inv)})"
+
+    def __call__(self, x):
+        assert self._inv is not None
+        return self._inv._inv_call(x)
+
+    def log_abs_det_jacobian(self, x, y):
+        assert self._inv is not None
+        return -self._inv.log_abs_det_jacobian(y, x)
+
+    def forward_shape(self, shape):
+        return self._inv.inverse_shape(shape)
+
+    def inverse_shape(self, shape):
+        return self._inv.forward_shape(shape)
+
+
+class ComposeTransform(Transform):
+    """
+    Composes multiple transforms in a chain.
+    The transforms being composed are responsible for caching.
+
+    Args:
+        parts (list of :class:`Transform`): A list of transforms to compose.
+        cache_size (int): Size of cache. If zero, no caching is done. If one,
+            the latest single value is cached. Only 0 and 1 are supported.
+    """
+
+    def __init__(self, parts: List[Transform], cache_size=0):
+        if cache_size:
+            parts = [part.with_cache(cache_size) for part in parts]
+        super().__init__(cache_size=cache_size)
+        self.parts = parts
+
+    def __eq__(self, other):
+        if not isinstance(other, ComposeTransform):
+            return False
+        return self.parts == other.parts
+
+    @constraints.dependent_property(is_discrete=False)
+    def domain(self):
+        if not self.parts:
+            return constraints.real
+        domain = self.parts[0].domain
+        # Adjust event_dim to be maximum among all parts.
+        event_dim = self.parts[-1].codomain.event_dim
+        for part in reversed(self.parts):
+            event_dim += part.domain.event_dim - part.codomain.event_dim
+            event_dim = max(event_dim, part.domain.event_dim)
+        assert event_dim >= domain.event_dim
+        if event_dim > domain.event_dim:
+            domain = constraints.independent(domain, event_dim - domain.event_dim)
+        return domain
+
+    @constraints.dependent_property(is_discrete=False)
+    def codomain(self):
+        if not self.parts:
+            return constraints.real
+        codomain = self.parts[-1].codomain
+        # Adjust event_dim to be maximum among all parts.
+        event_dim = self.parts[0].domain.event_dim
+        for part in self.parts:
+            event_dim += part.codomain.event_dim - part.domain.event_dim
+            event_dim = max(event_dim, part.codomain.event_dim)
+        assert event_dim >= codomain.event_dim
+        if event_dim > codomain.event_dim:
+            codomain = constraints.independent(codomain, event_dim - codomain.event_dim)
+        return codomain
+
+    @lazy_property
+    def bijective(self):
+        return all(p.bijective for p in self.parts)
+
+    @lazy_property
+    def sign(self):
+        sign = 1
+        for p in self.parts:
+            sign = sign * p.sign
+        return sign
+
+    @property
+    def inv(self):
+        inv = None
+        if self._inv is not None:
+            inv = self._inv()
+        if inv is None:
+            inv = ComposeTransform([p.inv for p in reversed(self.parts)])
+            self._inv = weakref.ref(inv)
+            inv._inv = weakref.ref(self)
+        return inv
+
+    def with_cache(self, cache_size=1):
+        if self._cache_size == cache_size:
+            return self
+        return ComposeTransform(self.parts, cache_size=cache_size)
+
+    def __call__(self, x):
+        for part in self.parts:
+            x = part(x)
+        return x
+
+    def log_abs_det_jacobian(self, x, y):
+        if not self.parts:
+            return torch.zeros_like(x)
+
+        # Compute intermediates. This will be free if parts[:-1] are all cached.
+        xs = [x]
+        for part in self.parts[:-1]:
+            xs.append(part(xs[-1]))
+        xs.append(y)
+
+        terms = []
+        event_dim = self.domain.event_dim
+        for part, x, y in zip(self.parts, xs[:-1], xs[1:]):
+            terms.append(
+                _sum_rightmost(
+                    part.log_abs_det_jacobian(x, y), event_dim - part.domain.event_dim
+                )
+            )
+            event_dim += part.codomain.event_dim - part.domain.event_dim
+        return functools.reduce(operator.add, terms)
+
+    def forward_shape(self, shape):
+        for part in self.parts:
+            shape = part.forward_shape(shape)
+        return shape
+
+    def inverse_shape(self, shape):
+        for part in reversed(self.parts):
+            shape = part.inverse_shape(shape)
+        return shape
+
+    def __repr__(self):
+        fmt_string = self.__class__.__name__ + "(\n    "
+        fmt_string += ",\n    ".join([p.__repr__() for p in self.parts])
+        fmt_string += "\n)"
+        return fmt_string
+
+
+identity_transform = ComposeTransform([])
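A hedged sketch of composition: the composed log-Jacobian is the sum of the parts' log-Jacobians evaluated at the intermediate values, and `identity_transform` is just the empty composition:

```python
import torch
from torch.distributions.transforms import (
    AffineTransform,
    ComposeTransform,
    ExpTransform,
    identity_transform,
)

t = ComposeTransform([ExpTransform(), AffineTransform(loc=1.0, scale=3.0)])
x = torch.tensor(0.5)
y = t(x)                                           # 1 + 3 * exp(0.5)

# log|d exp/dx| = x and log|d affine/dz| = log(3), summed over the chain
expected = x + torch.log(torch.tensor(3.0))
print(torch.allclose(t.log_abs_det_jacobian(x, y), expected))  # expected: True
print(identity_transform(x))                       # the empty composition returns x unchanged
```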
+
+
+class IndependentTransform(Transform):
+    """
+    Wrapper around another transform to treat
+    ``reinterpreted_batch_ndims``-many extra of the right most dimensions as
+    dependent. This has no effect on the forward or backward transforms, but
+    does sum out ``reinterpreted_batch_ndims``-many of the rightmost dimensions
+    in :meth:`log_abs_det_jacobian`.
+
+    Args:
+        base_transform (:class:`Transform`): A base transform.
+        reinterpreted_batch_ndims (int): The number of extra rightmost
+            dimensions to treat as dependent.
+    """
+
+    def __init__(self, base_transform, reinterpreted_batch_ndims, cache_size=0):
+        super().__init__(cache_size=cache_size)
+        self.base_transform = base_transform.with_cache(cache_size)
+        self.reinterpreted_batch_ndims = reinterpreted_batch_ndims
+
+    def with_cache(self, cache_size=1):
+        if self._cache_size == cache_size:
+            return self
+        return IndependentTransform(
+            self.base_transform, self.reinterpreted_batch_ndims, cache_size=cache_size
+        )
+
+    @constraints.dependent_property(is_discrete=False)
+    def domain(self):
+        return constraints.independent(
+            self.base_transform.domain, self.reinterpreted_batch_ndims
+        )
+
+    @constraints.dependent_property(is_discrete=False)
+    def codomain(self):
+        return constraints.independent(
+            self.base_transform.codomain, self.reinterpreted_batch_ndims
+        )
+
+    @property
+    def bijective(self):
+        return self.base_transform.bijective
+
+    @property
+    def sign(self):
+        return self.base_transform.sign
+
+    def _call(self, x):
+        if x.dim() < self.domain.event_dim:
+            raise ValueError("Too few dimensions on input")
+        return self.base_transform(x)
+
+    def _inverse(self, y):
+        if y.dim() < self.codomain.event_dim:
+            raise ValueError("Too few dimensions on input")
+        return self.base_transform.inv(y)
+
+    def log_abs_det_jacobian(self, x, y):
+        result = self.base_transform.log_abs_det_jacobian(x, y)
+        result = _sum_rightmost(result, self.reinterpreted_batch_ndims)
+        return result
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}({repr(self.base_transform)}, {self.reinterpreted_batch_ndims})"
+
+    def forward_shape(self, shape):
+        return self.base_transform.forward_shape(shape)
+
+    def inverse_shape(self, shape):
+        return self.base_transform.inverse_shape(shape)
+
+
+class ReshapeTransform(Transform):
+    """
+    Unit Jacobian transform to reshape the rightmost part of a tensor.
+
+    Note that ``in_shape`` and ``out_shape`` must have the same number of
+    elements, just as for :meth:`torch.Tensor.reshape`.
+
+    Arguments:
+        in_shape (torch.Size): The input event shape.
+        out_shape (torch.Size): The output event shape.
+    """
+
+    bijective = True
+
+    def __init__(self, in_shape, out_shape, cache_size=0):
+        self.in_shape = torch.Size(in_shape)
+        self.out_shape = torch.Size(out_shape)
+        if self.in_shape.numel() != self.out_shape.numel():
+            raise ValueError("in_shape, out_shape have different numbers of elements")
+        super().__init__(cache_size=cache_size)
+
+    @constraints.dependent_property
+    def domain(self):
+        return constraints.independent(constraints.real, len(self.in_shape))
+
+    @constraints.dependent_property
+    def codomain(self):
+        return constraints.independent(constraints.real, len(self.out_shape))
+
+    def with_cache(self, cache_size=1):
+        if self._cache_size == cache_size:
+            return self
+        return ReshapeTransform(self.in_shape, self.out_shape, cache_size=cache_size)
+
+    def _call(self, x):
+        batch_shape = x.shape[: x.dim() - len(self.in_shape)]
+        return x.reshape(batch_shape + self.out_shape)
+
+    def _inverse(self, y):
+        batch_shape = y.shape[: y.dim() - len(self.out_shape)]
+        return y.reshape(batch_shape + self.in_shape)
+
+    def log_abs_det_jacobian(self, x, y):
+        batch_shape = x.shape[: x.dim() - len(self.in_shape)]
+        return x.new_zeros(batch_shape)
+
+    def forward_shape(self, shape):
+        if len(shape) < len(self.in_shape):
+            raise ValueError("Too few dimensions on input")
+        cut = len(shape) - len(self.in_shape)
+        if shape[cut:] != self.in_shape:
+            raise ValueError(
+                f"Shape mismatch: expected {self.in_shape} but got {shape[cut:]}"
+            )
+        return shape[:cut] + self.out_shape
+
+    def inverse_shape(self, shape):
+        if len(shape) < len(self.out_shape):
+            raise ValueError("Too few dimensions on input")
+        cut = len(shape) - len(self.out_shape)
+        if shape[cut:] != self.out_shape:
+            raise ValueError(
+                f"Shape mismatch: expected {self.out_shape} but got {shape[cut:]}"
+            )
+        return shape[:cut] + self.in_shape
+
+
+class ExpTransform(Transform):
+    r"""
+    Transform via the mapping :math:`y = \exp(x)`.
+    """
+    domain = constraints.real
+    codomain = constraints.positive
+    bijective = True
+    sign = +1
+
+    def __eq__(self, other):
+        return isinstance(other, ExpTransform)
+
+    def _call(self, x):
+        return x.exp()
+
+    def _inverse(self, y):
+        return y.log()
+
+    def log_abs_det_jacobian(self, x, y):
+        return x
+
+
+class PowerTransform(Transform):
+    r"""
+    Transform via the mapping :math:`y = x^{\text{exponent}}`.
+    """
+    domain = constraints.positive
+    codomain = constraints.positive
+    bijective = True
+
+    def __init__(self, exponent, cache_size=0):
+        super().__init__(cache_size=cache_size)
+        (self.exponent,) = broadcast_all(exponent)
+
+    def with_cache(self, cache_size=1):
+        if self._cache_size == cache_size:
+            return self
+        return PowerTransform(self.exponent, cache_size=cache_size)
+
+    @lazy_property
+    def sign(self):
+        return self.exponent.sign()
+
+    def __eq__(self, other):
+        if not isinstance(other, PowerTransform):
+            return False
+        return self.exponent.eq(other.exponent).all().item()
+
+    def _call(self, x):
+        return x.pow(self.exponent)
+
+    def _inverse(self, y):
+        return y.pow(1 / self.exponent)
+
+    def log_abs_det_jacobian(self, x, y):
+        return (self.exponent * y / x).abs().log()
+
+    def forward_shape(self, shape):
+        return torch.broadcast_shapes(shape, getattr(self.exponent, "shape", ()))
+
+    def inverse_shape(self, shape):
+        return torch.broadcast_shapes(shape, getattr(self.exponent, "shape", ()))
+
+
+def _clipped_sigmoid(x):
+    finfo = torch.finfo(x.dtype)
+    return torch.clamp(torch.sigmoid(x), min=finfo.tiny, max=1.0 - finfo.eps)
+
+
+class SigmoidTransform(Transform):
+    r"""
+    Transform via the mapping :math:`y = \frac{1}{1 + \exp(-x)}` and :math:`x = \text{logit}(y)`.
+    """
+    domain = constraints.real
+    codomain = constraints.unit_interval
+    bijective = True
+    sign = +1
+
+    def __eq__(self, other):
+        return isinstance(other, SigmoidTransform)
+
+    def _call(self, x):
+        return _clipped_sigmoid(x)
+
+    def _inverse(self, y):
+        finfo = torch.finfo(y.dtype)
+        y = y.clamp(min=finfo.tiny, max=1.0 - finfo.eps)
+        return y.log() - (-y).log1p()
+
+    def log_abs_det_jacobian(self, x, y):
+        return -F.softplus(-x) - F.softplus(x)
+
+
+class SoftplusTransform(Transform):
+    r"""
+    Transform via the mapping :math:`\text{Softplus}(x) = \log(1 + \exp(x))`.
+    The implementation reverts to the linear function when :math:`x > 20`.
+    """
+    domain = constraints.real
+    codomain = constraints.positive
+    bijective = True
+    sign = +1
+
+    def __eq__(self, other):
+        return isinstance(other, SoftplusTransform)
+
+    def _call(self, x):
+        return softplus(x)
+
+    def _inverse(self, y):
+        return (-y).expm1().neg().log() + y
+
+    def log_abs_det_jacobian(self, x, y):
+        return -softplus(-x)
+
+
+class TanhTransform(Transform):
+    r"""
+    Transform via the mapping :math:`y = \tanh(x)`.
+
+    It is equivalent to
+    ```
+    ComposeTransform([AffineTransform(0., 2.), SigmoidTransform(), AffineTransform(-1., 2.)])
+    ```
+    However, the composed form might not be numerically stable, so it is
+    recommended to use `TanhTransform` instead.
+
+    Note that one should use `cache_size=1` to avoid `NaN`/`Inf` values when
+    inverting samples near the boundary.
+
+    """
+    domain = constraints.real
+    codomain = constraints.interval(-1.0, 1.0)
+    bijective = True
+    sign = +1
+
+    def __eq__(self, other):
+        return isinstance(other, TanhTransform)
+
+    def _call(self, x):
+        return x.tanh()
+
+    def _inverse(self, y):
+        # We do not clamp to the boundary here as it may degrade the performance of certain algorithms.
+        # one should use `cache_size=1` instead
+        return torch.atanh(y)
+
+    def log_abs_det_jacobian(self, x, y):
+        # We use a formula that is more numerically stable, see details in the following link
+        # https://github.com/tensorflow/probability/blob/master/tensorflow_probability/python/bijectors/tanh.py#L69-L80
+        return 2.0 * (math.log(2.0) - x - softplus(-2.0 * x))
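A hedged numerical check of the equivalence claimed in the docstring: `TanhTransform` matches the composed affine/sigmoid/affine form for both values and log-Jacobians (within float tolerance), while being the numerically safer choice:

```python
import torch
from torch.distributions.transforms import (
    AffineTransform,
    ComposeTransform,
    SigmoidTransform,
    TanhTransform,
)

tanh = TanhTransform()
composed = ComposeTransform(
    [AffineTransform(0.0, 2.0), SigmoidTransform(), AffineTransform(-1.0, 2.0)]
)

x = torch.linspace(-3.0, 3.0, 7)
print(torch.allclose(tanh(x), composed(x), atol=1e-6))
print(torch.allclose(
    tanh.log_abs_det_jacobian(x, tanh(x)),
    composed.log_abs_det_jacobian(x, composed(x)),
    atol=1e-5,
))
```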
+
+
+class AbsTransform(Transform):
+    r"""
+    Transform via the mapping :math:`y = |x|`.
+    """
+    domain = constraints.real
+    codomain = constraints.positive
+
+    def __eq__(self, other):
+        return isinstance(other, AbsTransform)
+
+    def _call(self, x):
+        return x.abs()
+
+    def _inverse(self, y):
+        return y
+
+
+class AffineTransform(Transform):
+    r"""
+    Transform via the pointwise affine mapping :math:`y = \text{loc} + \text{scale} \times x`.
+
+    Args:
+        loc (Tensor or float): Location parameter.
+        scale (Tensor or float): Scale parameter.
+        event_dim (int): Optional size of `event_shape`. This should be zero
+            for univariate random variables, 1 for distributions over vectors,
+            2 for distributions over matrices, etc.
+    """
+    bijective = True
+
+    def __init__(self, loc, scale, event_dim=0, cache_size=0):
+        super().__init__(cache_size=cache_size)
+        self.loc = loc
+        self.scale = scale
+        self._event_dim = event_dim
+
+    @property
+    def event_dim(self):
+        return self._event_dim
+
+    @constraints.dependent_property(is_discrete=False)
+    def domain(self):
+        if self.event_dim == 0:
+            return constraints.real
+        return constraints.independent(constraints.real, self.event_dim)
+
+    @constraints.dependent_property(is_discrete=False)
+    def codomain(self):
+        if self.event_dim == 0:
+            return constraints.real
+        return constraints.independent(constraints.real, self.event_dim)
+
+    def with_cache(self, cache_size=1):
+        if self._cache_size == cache_size:
+            return self
+        return AffineTransform(
+            self.loc, self.scale, self.event_dim, cache_size=cache_size
+        )
+
+    def __eq__(self, other):
+        if not isinstance(other, AffineTransform):
+            return False
+
+        if isinstance(self.loc, numbers.Number) and isinstance(
+            other.loc, numbers.Number
+        ):
+            if self.loc != other.loc:
+                return False
+        else:
+            if not (self.loc == other.loc).all().item():
+                return False
+
+        if isinstance(self.scale, numbers.Number) and isinstance(
+            other.scale, numbers.Number
+        ):
+            if self.scale != other.scale:
+                return False
+        else:
+            if not (self.scale == other.scale).all().item():
+                return False
+
+        return True
+
+    @property
+    def sign(self):
+        if isinstance(self.scale, numbers.Real):
+            return 1 if float(self.scale) > 0 else -1 if float(self.scale) < 0 else 0
+        return self.scale.sign()
+
+    def _call(self, x):
+        return self.loc + self.scale * x
+
+    def _inverse(self, y):
+        return (y - self.loc) / self.scale
+
+    def log_abs_det_jacobian(self, x, y):
+        shape = x.shape
+        scale = self.scale
+        if isinstance(scale, numbers.Real):
+            result = torch.full_like(x, math.log(abs(scale)))
+        else:
+            result = torch.abs(scale).log()
+        if self.event_dim:
+            result_size = result.size()[: -self.event_dim] + (-1,)
+            result = result.view(result_size).sum(-1)
+            shape = shape[: -self.event_dim]
+        return result.expand(shape)
+
+    def forward_shape(self, shape):
+        return torch.broadcast_shapes(
+            shape, getattr(self.loc, "shape", ()), getattr(self.scale, "shape", ())
+        )
+
+    def inverse_shape(self, shape):
+        return torch.broadcast_shapes(
+            shape, getattr(self.loc, "shape", ()), getattr(self.scale, "shape", ())
+        )
+
+
+class CorrCholeskyTransform(Transform):
+    r"""
+    Transforms an unconstrained real vector :math:`x` with length :math:`D*(D-1)/2` into the
+    Cholesky factor of a D-dimensional correlation matrix. This Cholesky factor is a lower
+    triangular matrix with positive diagonals and unit Euclidean norm for each row.
+    The transform is processed as follows:
+
+        1. First we convert x into a lower triangular matrix in row order.
+        2. For each row :math:`X_i` of the lower triangular part, we apply a *signed* version of
+           class :class:`StickBreakingTransform` to transform :math:`X_i` into a
+           unit Euclidean length vector using the following steps:
+           - Scales into the interval :math:`(-1, 1)` domain: :math:`r_i = \tanh(X_i)`.
+           - Transforms into an unsigned domain: :math:`z_i = r_i^2`.
+           - Applies :math:`s_i = StickBreakingTransform(z_i)`.
+           - Transforms back into signed domain: :math:`y_i = sign(r_i) * \sqrt{s_i}`.
+    """
+    domain = constraints.real_vector
+    codomain = constraints.corr_cholesky
+    bijective = True
+
+    def _call(self, x):
+        x = torch.tanh(x)
+        eps = torch.finfo(x.dtype).eps
+        x = x.clamp(min=-1 + eps, max=1 - eps)
+        r = vec_to_tril_matrix(x, diag=-1)
+        # apply stick-breaking on the squared values
+        # Note that y = sign(r) * sqrt(z * z1m_cumprod)
+        #             = (sign(r) * sqrt(z)) * sqrt(z1m_cumprod) = r * sqrt(z1m_cumprod)
+        z = r**2
+        z1m_cumprod_sqrt = (1 - z).sqrt().cumprod(-1)
+        # Diagonal elements must be 1.
+        r = r + torch.eye(r.shape[-1], dtype=r.dtype, device=r.device)
+        y = r * pad(z1m_cumprod_sqrt[..., :-1], [1, 0], value=1)
+        return y
+
+    def _inverse(self, y):
+        # inverse stick-breaking
+        # See: https://mc-stan.org/docs/2_18/reference-manual/cholesky-factors-of-correlation-matrices-1.html
+        y_cumsum = 1 - torch.cumsum(y * y, dim=-1)
+        y_cumsum_shifted = pad(y_cumsum[..., :-1], [1, 0], value=1)
+        y_vec = tril_matrix_to_vec(y, diag=-1)
+        y_cumsum_vec = tril_matrix_to_vec(y_cumsum_shifted, diag=-1)
+        t = y_vec / (y_cumsum_vec).sqrt()
+        # inverse of tanh
+        x = (t.log1p() - t.neg().log1p()) / 2
+        return x
+
+    def log_abs_det_jacobian(self, x, y, intermediates=None):
+        # Because domain and codomain are two spaces with different dimensions, determinant of
+        # Jacobian is not well-defined. We return `log_abs_det_jacobian` of `x` and the
+        # flattened lower triangular part of `y`.
+
+        # See: https://mc-stan.org/docs/2_18/reference-manual/cholesky-factors-of-correlation-matrices-1.html
+        y1m_cumsum = 1 - (y * y).cumsum(dim=-1)
+        # by taking diagonal=-2, we don't need to shift z_cumprod to the right
+        # also works for 2 x 2 matrix
+        y1m_cumsum_tril = tril_matrix_to_vec(y1m_cumsum, diag=-2)
+        stick_breaking_logdet = 0.5 * (y1m_cumsum_tril).log().sum(-1)
+        tanh_logdet = -2 * (x + softplus(-2 * x) - math.log(2.0)).sum(dim=-1)
+        return stick_breaking_logdet + tanh_logdet
+
+    def forward_shape(self, shape):
+        # Reshape from (..., N) to (..., D, D).
+        if len(shape) < 1:
+            raise ValueError("Too few dimensions on input")
+        N = shape[-1]
+        D = round((0.25 + 2 * N) ** 0.5 + 0.5)
+        if D * (D - 1) // 2 != N:
+            raise ValueError("Input is not a flattend lower-diagonal number")
+        return shape[:-1] + (D, D)
+
+    def inverse_shape(self, shape):
+        # Reshape from (..., D, D) to (..., N).
+        if len(shape) < 2:
+            raise ValueError("Too few dimensions on input")
+        if shape[-2] != shape[-1]:
+            raise ValueError("Input is not square")
+        D = shape[-1]
+        N = D * (D - 1) // 2
+        return shape[:-2] + (N,)
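+
+# A minimal round-trip sketch (illustrative only, relying solely on the transform defined
+# above): for D = 3 the input is a real vector of length D*(D-1)/2 = 3 and the output is a
+# 3 x 3 lower-triangular factor whose rows have unit Euclidean norm.
+#
+#     >>> t = CorrCholeskyTransform()
+#     >>> x = torch.randn(3)
+#     >>> L = t(x)
+#     >>> L.shape
+#     torch.Size([3, 3])
+#     >>> bool(torch.allclose(L.norm(dim=-1), torch.ones(3), atol=1e-4))
+#     True
+#     >>> bool(torch.allclose(t.inv(L), x, atol=1e-3))
+#     True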
+
+
+class SoftmaxTransform(Transform):
+    r"""
+    Transform from unconstrained space to the simplex via :math:`y = \exp(x)` then
+    normalizing.
+
+    This is not bijective and cannot be used for HMC. However this acts mostly
+    coordinate-wise (except for the final normalization), and thus is
+    appropriate for coordinate-wise optimization algorithms.
+    """
+    domain = constraints.real_vector
+    codomain = constraints.simplex
+
+    def __eq__(self, other):
+        return isinstance(other, SoftmaxTransform)
+
+    def _call(self, x):
+        logprobs = x
+        probs = (logprobs - logprobs.max(-1, True)[0]).exp()
+        return probs / probs.sum(-1, True)
+
+    def _inverse(self, y):
+        probs = y
+        return probs.log()
+
+    def forward_shape(self, shape):
+        if len(shape) < 1:
+            raise ValueError("Too few dimensions on input")
+        return shape
+
+    def inverse_shape(self, shape):
+        if len(shape) < 1:
+            raise ValueError("Too few dimensions on input")
+        return shape
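+
+# A minimal sketch of the forward pass (illustrative only): the output lies on the simplex.
+# Because the transform is not bijective, the inverse recovers the input only up to an
+# additive constant along the last dimension.
+#
+#     >>> t = SoftmaxTransform()
+#     >>> y = t(torch.randn(4))
+#     >>> bool(torch.isclose(y.sum(-1), torch.tensor(1.0)))
+#     True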
+
+
+class StickBreakingTransform(Transform):
+    """
+    Transform from unconstrained space to the simplex of one additional
+    dimension via a stick-breaking process.
+
+    This transform arises as an iterated sigmoid transform in a stick-breaking
+    construction of the `Dirichlet` distribution: the first logit is
+    transformed via sigmoid to the first probability and the probability of
+    everything else, and then the process recurses.
+
+    This is bijective and appropriate for use in HMC; however it mixes
+    coordinates together and is less appropriate for optimization.
+    """
+
+    domain = constraints.real_vector
+    codomain = constraints.simplex
+    bijective = True
+
+    def __eq__(self, other):
+        return isinstance(other, StickBreakingTransform)
+
+    def _call(self, x):
+        offset = x.shape[-1] + 1 - x.new_ones(x.shape[-1]).cumsum(-1)
+        z = _clipped_sigmoid(x - offset.log())
+        z_cumprod = (1 - z).cumprod(-1)
+        y = pad(z, [0, 1], value=1) * pad(z_cumprod, [1, 0], value=1)
+        return y
+
+    def _inverse(self, y):
+        y_crop = y[..., :-1]
+        offset = y.shape[-1] - y.new_ones(y_crop.shape[-1]).cumsum(-1)
+        sf = 1 - y_crop.cumsum(-1)
+        # we clamp to make sure that sf is positive which sometimes does not
+        # happen when y[-1] ~ 0 or y[:-1].sum() ~ 1
+        sf = torch.clamp(sf, min=torch.finfo(y.dtype).tiny)
+        x = y_crop.log() - sf.log() + offset.log()
+        return x
+
+    def log_abs_det_jacobian(self, x, y):
+        offset = x.shape[-1] + 1 - x.new_ones(x.shape[-1]).cumsum(-1)
+        x = x - offset.log()
+        # use the identity 1 - sigmoid(x) = exp(-x) * sigmoid(x)
+        detJ = (-x + F.logsigmoid(x) + y[..., :-1].log()).sum(-1)
+        return detJ
+
+    def forward_shape(self, shape):
+        if len(shape) < 1:
+            raise ValueError("Too few dimensions on input")
+        return shape[:-1] + (shape[-1] + 1,)
+
+    def inverse_shape(self, shape):
+        if len(shape) < 1:
+            raise ValueError("Too few dimensions on input")
+        return shape[:-1] + (shape[-1] - 1,)
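+
+# A minimal round-trip sketch (illustrative only): a length-K real vector maps to a point
+# on the (K+1)-dimensional simplex, and the inverse recovers the input to numerical precision.
+#
+#     >>> t = StickBreakingTransform()
+#     >>> x = torch.randn(4)
+#     >>> y = t(x)
+#     >>> y.shape, bool(torch.isclose(y.sum(), torch.tensor(1.0)))
+#     (torch.Size([5]), True)
+#     >>> bool(torch.allclose(t.inv(y), x, atol=1e-3))
+#     True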
+
+
+class LowerCholeskyTransform(Transform):
+    """
+    Transform from unconstrained matrices to lower-triangular matrices with
+    nonnegative diagonal entries.
+
+    This is useful for parameterizing positive definite matrices in terms of
+    their Cholesky factorization.
+    """
+
+    domain = constraints.independent(constraints.real, 2)
+    codomain = constraints.lower_cholesky
+
+    def __eq__(self, other):
+        return isinstance(other, LowerCholeskyTransform)
+
+    def _call(self, x):
+        return x.tril(-1) + x.diagonal(dim1=-2, dim2=-1).exp().diag_embed()
+
+    def _inverse(self, y):
+        return y.tril(-1) + y.diagonal(dim1=-2, dim2=-1).log().diag_embed()
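+
+# A minimal sketch (illustrative only): the output is lower triangular with a strictly
+# positive diagonal, because the diagonal of the input is exponentiated.
+#
+#     >>> t = LowerCholeskyTransform()
+#     >>> L = t(torch.randn(3, 3))
+#     >>> bool((L.triu(1) == 0).all()) and bool((L.diagonal(dim1=-2, dim2=-1) > 0).all())
+#     True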
+
+
+class PositiveDefiniteTransform(Transform):
+    """
+    Transform from unconstrained matrices to positive-definite matrices.
+    """
+
+    domain = constraints.independent(constraints.real, 2)
+    codomain = constraints.positive_definite  # type: ignore[assignment]
+
+    def __eq__(self, other):
+        return isinstance(other, PositiveDefiniteTransform)
+
+    def _call(self, x):
+        x = LowerCholeskyTransform()(x)
+        return x @ x.mT
+
+    def _inverse(self, y):
+        y = torch.linalg.cholesky(y)
+        return LowerCholeskyTransform().inv(y)
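+
+# A minimal sketch (illustrative only): any square matrix maps to a symmetric
+# positive-definite matrix, verified here via a successful Cholesky factorization.
+#
+#     >>> t = PositiveDefiniteTransform()
+#     >>> A = t(torch.randn(3, 3))
+#     >>> bool(torch.allclose(A, A.mT)) and bool(torch.linalg.cholesky_ex(A).info.eq(0))
+#     True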
+
+
+class CatTransform(Transform):
+    """
+    Transform functor that applies a sequence of transforms `tseq`
+    component-wise to each submatrix at `dim`, of length `lengths[dim]`,
+    in a way compatible with :func:`torch.cat`.
+
+    Example::
+
+       x0 = torch.cat([torch.range(1, 10), torch.range(1, 10)], dim=0)
+       x = torch.cat([x0, x0], dim=0)
+       t0 = CatTransform([ExpTransform(), identity_transform], dim=0, lengths=[10, 10])
+       t = CatTransform([t0, t0], dim=0, lengths=[20, 20])
+       y = t(x)
+    """
+
+    transforms: List[Transform]
+
+    def __init__(self, tseq, dim=0, lengths=None, cache_size=0):
+        assert all(isinstance(t, Transform) for t in tseq)
+        if cache_size:
+            tseq = [t.with_cache(cache_size) for t in tseq]
+        super().__init__(cache_size=cache_size)
+        self.transforms = list(tseq)
+        if lengths is None:
+            lengths = [1] * len(self.transforms)
+        self.lengths = list(lengths)
+        assert len(self.lengths) == len(self.transforms)
+        self.dim = dim
+
+    @lazy_property
+    def event_dim(self):
+        return max(t.event_dim for t in self.transforms)
+
+    @lazy_property
+    def length(self):
+        return sum(self.lengths)
+
+    def with_cache(self, cache_size=1):
+        if self._cache_size == cache_size:
+            return self
+        return CatTransform(self.transforms, self.dim, self.lengths, cache_size)
+
+    def _call(self, x):
+        assert -x.dim() <= self.dim < x.dim()
+        assert x.size(self.dim) == self.length
+        yslices = []
+        start = 0
+        for trans, length in zip(self.transforms, self.lengths):
+            xslice = x.narrow(self.dim, start, length)
+            yslices.append(trans(xslice))
+            start = start + length  # avoid += for jit compat
+        return torch.cat(yslices, dim=self.dim)
+
+    def _inverse(self, y):
+        assert -y.dim() <= self.dim < y.dim()
+        assert y.size(self.dim) == self.length
+        xslices = []
+        start = 0
+        for trans, length in zip(self.transforms, self.lengths):
+            yslice = y.narrow(self.dim, start, length)
+            xslices.append(trans.inv(yslice))
+            start = start + length  # avoid += for jit compat
+        return torch.cat(xslices, dim=self.dim)
+
+    def log_abs_det_jacobian(self, x, y):
+        assert -x.dim() <= self.dim < x.dim()
+        assert x.size(self.dim) == self.length
+        assert -y.dim() <= self.dim < y.dim()
+        assert y.size(self.dim) == self.length
+        logdetjacs = []
+        start = 0
+        for trans, length in zip(self.transforms, self.lengths):
+            xslice = x.narrow(self.dim, start, length)
+            yslice = y.narrow(self.dim, start, length)
+            logdetjac = trans.log_abs_det_jacobian(xslice, yslice)
+            if trans.event_dim < self.event_dim:
+                logdetjac = _sum_rightmost(logdetjac, self.event_dim - trans.event_dim)
+            logdetjacs.append(logdetjac)
+            start = start + length  # avoid += for jit compat
+        # Decide whether to concatenate or sum.
+        dim = self.dim
+        if dim >= 0:
+            dim = dim - x.dim()
+        dim = dim + self.event_dim
+        if dim < 0:
+            return torch.cat(logdetjacs, dim=dim)
+        else:
+            return sum(logdetjacs)
+
+    @property
+    def bijective(self):
+        return all(t.bijective for t in self.transforms)
+
+    @constraints.dependent_property
+    def domain(self):
+        return constraints.cat(
+            [t.domain for t in self.transforms], self.dim, self.lengths
+        )
+
+    @constraints.dependent_property
+    def codomain(self):
+        return constraints.cat(
+            [t.codomain for t in self.transforms], self.dim, self.lengths
+        )
+
+
+class StackTransform(Transform):
+    """
+    Transform functor that applies a sequence of transforms `tseq`
+    component-wise to each submatrix at `dim`
+    in a way compatible with :func:`torch.stack`.
+
+    Example::
+
+       x = torch.stack([torch.range(1, 10), torch.range(1, 10)], dim=1)
+       t = StackTransform([ExpTransform(), identity_transform], dim=1)
+       y = t(x)
+    """
+
+    transforms: List[Transform]
+
+    def __init__(self, tseq, dim=0, cache_size=0):
+        assert all(isinstance(t, Transform) for t in tseq)
+        if cache_size:
+            tseq = [t.with_cache(cache_size) for t in tseq]
+        super().__init__(cache_size=cache_size)
+        self.transforms = list(tseq)
+        self.dim = dim
+
+    def with_cache(self, cache_size=1):
+        if self._cache_size == cache_size:
+            return self
+        return StackTransform(self.transforms, self.dim, cache_size)
+
+    def _slice(self, z):
+        return [z.select(self.dim, i) for i in range(z.size(self.dim))]
+
+    def _call(self, x):
+        assert -x.dim() <= self.dim < x.dim()
+        assert x.size(self.dim) == len(self.transforms)
+        yslices = []
+        for xslice, trans in zip(self._slice(x), self.transforms):
+            yslices.append(trans(xslice))
+        return torch.stack(yslices, dim=self.dim)
+
+    def _inverse(self, y):
+        assert -y.dim() <= self.dim < y.dim()
+        assert y.size(self.dim) == len(self.transforms)
+        xslices = []
+        for yslice, trans in zip(self._slice(y), self.transforms):
+            xslices.append(trans.inv(yslice))
+        return torch.stack(xslices, dim=self.dim)
+
+    def log_abs_det_jacobian(self, x, y):
+        assert -x.dim() <= self.dim < x.dim()
+        assert x.size(self.dim) == len(self.transforms)
+        assert -y.dim() <= self.dim < y.dim()
+        assert y.size(self.dim) == len(self.transforms)
+        logdetjacs = []
+        yslices = self._slice(y)
+        xslices = self._slice(x)
+        for xslice, yslice, trans in zip(xslices, yslices, self.transforms):
+            logdetjacs.append(trans.log_abs_det_jacobian(xslice, yslice))
+        return torch.stack(logdetjacs, dim=self.dim)
+
+    @property
+    def bijective(self):
+        return all(t.bijective for t in self.transforms)
+
+    @constraints.dependent_property
+    def domain(self):
+        return constraints.stack([t.domain for t in self.transforms], self.dim)
+
+    @constraints.dependent_property
+    def codomain(self):
+        return constraints.stack([t.codomain for t in self.transforms], self.dim)
+
+
+class CumulativeDistributionTransform(Transform):
+    """
+    Transform via the cumulative distribution function of a probability distribution.
+
+    Args:
+        distribution (Distribution): Distribution whose cumulative distribution function to use for
+            the transformation.
+
+    Example::
+
+        # Construct a Gaussian copula from a multivariate normal.
+        base_dist = MultivariateNormal(
+            loc=torch.zeros(2),
+            scale_tril=LKJCholesky(2).sample(),
+        )
+        transform = CumulativeDistributionTransform(Normal(0, 1))
+        copula = TransformedDistribution(base_dist, [transform])
+    """
+
+    bijective = True
+    codomain = constraints.unit_interval
+    sign = +1
+
+    def __init__(self, distribution, cache_size=0):
+        super().__init__(cache_size=cache_size)
+        self.distribution = distribution
+
+    @property
+    def domain(self):
+        return self.distribution.support
+
+    def _call(self, x):
+        return self.distribution.cdf(x)
+
+    def _inverse(self, y):
+        return self.distribution.icdf(y)
+
+    def log_abs_det_jacobian(self, x, y):
+        return self.distribution.log_prob(x)
+
+    def with_cache(self, cache_size=1):
+        if self._cache_size == cache_size:
+            return self
+        return CumulativeDistributionTransform(self.distribution, cache_size=cache_size)
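+
+# A minimal sketch (illustrative only, assuming ``from torch.distributions import Normal``):
+# the forward pass is the CDF, the inverse is the quantile function, and the log-determinant
+# is the log density of the wrapped distribution.
+#
+#     >>> t = CumulativeDistributionTransform(Normal(0.0, 1.0))
+#     >>> t(torch.tensor(0.0))          # CDF at the mean
+#     tensor(0.5000)
+#     >>> t.inv(torch.tensor(0.5))      # quantile function
+#     tensor(0.)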
diff --git a/MLPY/Lib/site-packages/torch/distributions/uniform.py b/MLPY/Lib/site-packages/torch/distributions/uniform.py
new file mode 100644
index 0000000000000000000000000000000000000000..20e3e726db7e71f237f0e9533fef85a04255d697
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/uniform.py
@@ -0,0 +1,99 @@
+from numbers import Number
+
+import torch
+from torch import nan
+from torch.distributions import constraints
+from torch.distributions.distribution import Distribution
+from torch.distributions.utils import broadcast_all
+
+__all__ = ["Uniform"]
+
+
+class Uniform(Distribution):
+    r"""
+    Generates uniformly distributed random samples from the half-open interval
+    ``[low, high)``.
+
+    Example::
+
+        >>> m = Uniform(torch.tensor([0.0]), torch.tensor([5.0]))
+        >>> m.sample()  # uniformly distributed in the range [0.0, 5.0)
+        >>> # xdoctest: +SKIP
+        tensor([ 2.3418])
+
+    Args:
+        low (float or Tensor): lower range (inclusive).
+        high (float or Tensor): upper range (exclusive).
+    """
+    # TODO allow (loc,scale) parameterization to allow independent constraints.
+    arg_constraints = {
+        "low": constraints.dependent(is_discrete=False, event_dim=0),
+        "high": constraints.dependent(is_discrete=False, event_dim=0),
+    }
+    has_rsample = True
+
+    @property
+    def mean(self):
+        return (self.high + self.low) / 2
+
+    @property
+    def mode(self):
+        return nan * self.high
+
+    @property
+    def stddev(self):
+        return (self.high - self.low) / 12**0.5
+
+    @property
+    def variance(self):
+        return (self.high - self.low).pow(2) / 12
+
+    def __init__(self, low, high, validate_args=None):
+        self.low, self.high = broadcast_all(low, high)
+
+        if isinstance(low, Number) and isinstance(high, Number):
+            batch_shape = torch.Size()
+        else:
+            batch_shape = self.low.size()
+        super().__init__(batch_shape, validate_args=validate_args)
+
+        if self._validate_args and not torch.lt(self.low, self.high).all():
+            raise ValueError("Uniform is not defined when low>= high")
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(Uniform, _instance)
+        batch_shape = torch.Size(batch_shape)
+        new.low = self.low.expand(batch_shape)
+        new.high = self.high.expand(batch_shape)
+        super(Uniform, new).__init__(batch_shape, validate_args=False)
+        new._validate_args = self._validate_args
+        return new
+
+    @constraints.dependent_property(is_discrete=False, event_dim=0)
+    def support(self):
+        return constraints.interval(self.low, self.high)
+
+    def rsample(self, sample_shape=torch.Size()):
+        shape = self._extended_shape(sample_shape)
+        rand = torch.rand(shape, dtype=self.low.dtype, device=self.low.device)
+        return self.low + rand * (self.high - self.low)
+
+    def log_prob(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        lb = self.low.le(value).type_as(self.low)
+        ub = self.high.gt(value).type_as(self.low)
+        return torch.log(lb.mul(ub)) - torch.log(self.high - self.low)
+
+    def cdf(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        result = (value - self.low) / (self.high - self.low)
+        return result.clamp(min=0, max=1)
+
+    def icdf(self, value):
+        result = value * (self.high - self.low) + self.low
+        return result
+
+    def entropy(self):
+        return torch.log(self.high - self.low)
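+
+# A quick sanity check (illustrative only): for ``Uniform(0, 2)`` the density is 1/2 on the
+# support, the CDF is linear, and the entropy is ``log(high - low)``.
+#
+#     >>> u = Uniform(torch.tensor(0.0), torch.tensor(2.0))
+#     >>> u.log_prob(torch.tensor(1.0))     # log(1 / 2)
+#     tensor(-0.6931)
+#     >>> u.cdf(torch.tensor(0.5))
+#     tensor(0.2500)
+#     >>> u.entropy()
+#     tensor(0.6931)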
diff --git a/MLPY/Lib/site-packages/torch/distributions/utils.py b/MLPY/Lib/site-packages/torch/distributions/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..146ae2adc7e4f917ffb65b82549712f3e7745565
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/utils.py
@@ -0,0 +1,177 @@
+from functools import update_wrapper
+from numbers import Number
+from typing import Any, Dict
+
+import torch
+import torch.nn.functional as F
+from torch.overrides import is_tensor_like
+
+euler_constant = 0.57721566490153286060  # Euler Mascheroni Constant
+
+__all__ = [
+    "broadcast_all",
+    "logits_to_probs",
+    "clamp_probs",
+    "probs_to_logits",
+    "lazy_property",
+    "tril_matrix_to_vec",
+    "vec_to_tril_matrix",
+]
+
+
+def broadcast_all(*values):
+    r"""
+    Given a list of values (possibly containing numbers), returns a list where each
+    value is broadcasted based on the following rules:
+      - `torch.*Tensor` instances are broadcasted as per :ref:`_broadcasting-semantics`.
+      - numbers.Number instances (scalars) are upcast to tensors having
+        the same size and type as the first tensor passed to `values`.  If all the
+        values are scalars, then they are upcasted to scalar Tensors.
+
+    Args:
+        values (list of `numbers.Number`, `torch.*Tensor` or objects implementing __torch_function__)
+
+    Raises:
+        ValueError: if any of the values is not a `numbers.Number` instance,
+            a `torch.*Tensor` instance, or an instance implementing __torch_function__
+    """
+    if not all(is_tensor_like(v) or isinstance(v, Number) for v in values):
+        raise ValueError(
+            "Input arguments must all be instances of numbers.Number, "
+            "torch.Tensor or objects implementing __torch_function__."
+        )
+    if not all(is_tensor_like(v) for v in values):
+        options: Dict[str, Any] = dict(dtype=torch.get_default_dtype())
+        for value in values:
+            if isinstance(value, torch.Tensor):
+                options = dict(dtype=value.dtype, device=value.device)
+                break
+        new_values = [
+            v if is_tensor_like(v) else torch.tensor(v, **options) for v in values
+        ]
+        return torch.broadcast_tensors(*new_values)
+    return torch.broadcast_tensors(*values)
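+
+# A minimal sketch (illustrative only): Python scalars are promoted to tensors that share
+# the dtype/device of the first tensor argument, then everything is broadcast together.
+#
+#     >>> a, b = broadcast_all(torch.zeros(2, 3), 1.0)
+#     >>> a.shape, b.shape
+#     (torch.Size([2, 3]), torch.Size([2, 3]))
+#     >>> b.dtype == a.dtype
+#     True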
+
+
+def _standard_normal(shape, dtype, device):
+    if torch._C._get_tracing_state():
+        # [JIT WORKAROUND] lack of support for .normal_()
+        return torch.normal(
+            torch.zeros(shape, dtype=dtype, device=device),
+            torch.ones(shape, dtype=dtype, device=device),
+        )
+    return torch.empty(shape, dtype=dtype, device=device).normal_()
+
+
+def _sum_rightmost(value, dim):
+    r"""
+    Sum out ``dim`` many rightmost dimensions of a given tensor.
+
+    Args:
+        value (Tensor): A tensor of ``.dim()`` at least ``dim``.
+        dim (int): The number of rightmost dims to sum out.
+    """
+    if dim == 0:
+        return value
+    required_shape = value.shape[:-dim] + (-1,)
+    return value.reshape(required_shape).sum(-1)
+
+
+def logits_to_probs(logits, is_binary=False):
+    r"""
+    Converts a tensor of logits into probabilities. Note that for the
+    binary case, each value denotes log odds, whereas for the
+    multi-dimensional case, the values along the last dimension denote
+    the log probabilities (possibly unnormalized) of the events.
+    """
+    if is_binary:
+        return torch.sigmoid(logits)
+    return F.softmax(logits, dim=-1)
+
+
+def clamp_probs(probs):
+    eps = torch.finfo(probs.dtype).eps
+    return probs.clamp(min=eps, max=1 - eps)
+
+
+def probs_to_logits(probs, is_binary=False):
+    r"""
+    Converts a tensor of probabilities into logits. For the binary case,
+    this denotes the probability of occurrence of the event indexed by `1`.
+    For the multi-dimensional case, the values along the last dimension
+    denote the probabilities of occurrence of each of the events.
+    """
+    ps_clamped = clamp_probs(probs)
+    if is_binary:
+        return torch.log(ps_clamped) - torch.log1p(-ps_clamped)
+    return torch.log(ps_clamped)
+
+
+class lazy_property:
+    r"""
+    Used as a decorator for lazy loading of class attributes. This uses a
+    non-data descriptor that calls the wrapped method to compute the property on
+    first call; thereafter the computed value replaces the wrapped method as an
+    instance attribute.
+    """
+
+    def __init__(self, wrapped):
+        self.wrapped = wrapped
+        update_wrapper(self, wrapped)
+
+    def __get__(self, instance, obj_type=None):
+        if instance is None:
+            return _lazy_property_and_property(self.wrapped)
+        with torch.enable_grad():
+            value = self.wrapped(instance)
+        setattr(instance, self.wrapped.__name__, value)
+        return value
+
+
+class _lazy_property_and_property(lazy_property, property):
+    """We want lazy properties to look like multiple things.
+
+    * property when Sphinx autodoc looks
+    * lazy_property when Distribution validate_args looks
+    """
+
+    def __init__(self, wrapped):
+        property.__init__(self, wrapped)
+
+
+def tril_matrix_to_vec(mat: torch.Tensor, diag: int = 0) -> torch.Tensor:
+    r"""
+    Convert a `D x D` matrix or a batch of matrices into a (batched) vector
+    which consists of the lower triangular elements of the matrix in row order.
+    """
+    n = mat.shape[-1]
+    if not torch._C._get_tracing_state() and (diag < -n or diag >= n):
+        raise ValueError(f"diag ({diag}) provided is outside [{-n}, {n-1}].")
+    arange = torch.arange(n, device=mat.device)
+    tril_mask = arange < arange.view(-1, 1) + (diag + 1)
+    vec = mat[..., tril_mask]
+    return vec
+
+
+def vec_to_tril_matrix(vec: torch.Tensor, diag: int = 0) -> torch.Tensor:
+    r"""
+    Convert a vector or a batch of vectors into a batched `D x D`
+    lower triangular matrix containing elements from the vector in row order.
+    """
+    # +ve root of D**2 + (1+2*diag)*D - |diag| * (diag+1) - 2*vec.shape[-1] = 0
+    n = (
+        -(1 + 2 * diag)
+        + ((1 + 2 * diag) ** 2 + 8 * vec.shape[-1] + 4 * abs(diag) * (diag + 1)) ** 0.5
+    ) / 2
+    eps = torch.finfo(vec.dtype).eps
+    if not torch._C._get_tracing_state() and (round(n) - n > eps):
+        raise ValueError(
+            f"The size of last dimension is {vec.shape[-1]} which cannot be expressed as "
+            + "the lower triangular part of a square D x D matrix."
+        )
+    n = round(n.item()) if isinstance(n, torch.Tensor) else round(n)
+    mat = vec.new_zeros(vec.shape[:-1] + torch.Size((n, n)))
+    arange = torch.arange(n, device=vec.device)
+    tril_mask = arange < arange.view(-1, 1) + (diag + 1)
+    mat[..., tril_mask] = vec
+    return mat
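+
+# A minimal round-trip sketch (illustrative only): with ``diag=-1`` only the strictly lower
+# triangle is packed into the vector, and the matrix helper rebuilds a square matrix of the
+# inferred size.
+#
+#     >>> m = torch.arange(9.).reshape(3, 3)
+#     >>> v = tril_matrix_to_vec(m, diag=-1)
+#     >>> v
+#     tensor([3., 6., 7.])
+#     >>> vec_to_tril_matrix(v, diag=-1).shape
+#     torch.Size([3, 3])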
diff --git a/MLPY/Lib/site-packages/torch/distributions/von_mises.py b/MLPY/Lib/site-packages/torch/distributions/von_mises.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6fdef81b67cbcb4edd60d353d99d57bb7a09e3a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/von_mises.py
@@ -0,0 +1,209 @@
+import math
+
+import torch
+import torch.jit
+from torch.distributions import constraints
+from torch.distributions.distribution import Distribution
+from torch.distributions.utils import broadcast_all, lazy_property
+
+__all__ = ["VonMises"]
+
+
+def _eval_poly(y, coef):
+    coef = list(coef)
+    result = coef.pop()
+    while coef:
+        result = coef.pop() + y * result
+    return result
+
+
+_I0_COEF_SMALL = [
+    1.0,
+    3.5156229,
+    3.0899424,
+    1.2067492,
+    0.2659732,
+    0.360768e-1,
+    0.45813e-2,
+]
+_I0_COEF_LARGE = [
+    0.39894228,
+    0.1328592e-1,
+    0.225319e-2,
+    -0.157565e-2,
+    0.916281e-2,
+    -0.2057706e-1,
+    0.2635537e-1,
+    -0.1647633e-1,
+    0.392377e-2,
+]
+_I1_COEF_SMALL = [
+    0.5,
+    0.87890594,
+    0.51498869,
+    0.15084934,
+    0.2658733e-1,
+    0.301532e-2,
+    0.32411e-3,
+]
+_I1_COEF_LARGE = [
+    0.39894228,
+    -0.3988024e-1,
+    -0.362018e-2,
+    0.163801e-2,
+    -0.1031555e-1,
+    0.2282967e-1,
+    -0.2895312e-1,
+    0.1787654e-1,
+    -0.420059e-2,
+]
+
+_COEF_SMALL = [_I0_COEF_SMALL, _I1_COEF_SMALL]
+_COEF_LARGE = [_I0_COEF_LARGE, _I1_COEF_LARGE]
+
+
+def _log_modified_bessel_fn(x, order=0):
+    """
+    Returns ``log(I_order(x))`` for ``x > 0``,
+    where `order` is either 0 or 1.
+    """
+    assert order == 0 or order == 1
+
+    # compute small solution
+    y = x / 3.75
+    y = y * y
+    small = _eval_poly(y, _COEF_SMALL[order])
+    if order == 1:
+        small = x.abs() * small
+    small = small.log()
+
+    # compute large solution
+    y = 3.75 / x
+    large = x - 0.5 * x.log() + _eval_poly(y, _COEF_LARGE[order]).log()
+
+    result = torch.where(x < 3.75, small, large)
+    return result
+
+
+@torch.jit.script_if_tracing
+def _rejection_sample(loc, concentration, proposal_r, x):
+    done = torch.zeros(x.shape, dtype=torch.bool, device=loc.device)
+    while not done.all():
+        u = torch.rand((3,) + x.shape, dtype=loc.dtype, device=loc.device)
+        u1, u2, u3 = u.unbind()
+        z = torch.cos(math.pi * u1)
+        f = (1 + proposal_r * z) / (proposal_r + z)
+        c = concentration * (proposal_r - f)
+        accept = ((c * (2 - c) - u2) > 0) | ((c / u2).log() + 1 - c >= 0)
+        if accept.any():
+            x = torch.where(accept, (u3 - 0.5).sign() * f.acos(), x)
+            done = done | accept
+    return (x + math.pi + loc) % (2 * math.pi) - math.pi
+
+
+class VonMises(Distribution):
+    """
+    A circular von Mises distribution.
+
+    This implementation uses polar coordinates. The ``loc`` and ``value`` args
+    can be any real number (to facilitate unconstrained optimization), but are
+    interpreted as angles modulo 2 pi.
+
+    Example::
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = VonMises(torch.tensor([1.0]), torch.tensor([1.0]))
+        >>> m.sample()  # von Mises distributed with loc=1 and concentration=1
+        tensor([1.9777])
+
+    :param torch.Tensor loc: an angle in radians.
+    :param torch.Tensor concentration: concentration parameter
+    """
+
+    arg_constraints = {"loc": constraints.real, "concentration": constraints.positive}
+    support = constraints.real
+    has_rsample = False
+
+    def __init__(self, loc, concentration, validate_args=None):
+        self.loc, self.concentration = broadcast_all(loc, concentration)
+        batch_shape = self.loc.shape
+        event_shape = torch.Size()
+        super().__init__(batch_shape, event_shape, validate_args)
+
+    def log_prob(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        log_prob = self.concentration * torch.cos(value - self.loc)
+        log_prob = (
+            log_prob
+            - math.log(2 * math.pi)
+            - _log_modified_bessel_fn(self.concentration, order=0)
+        )
+        return log_prob
+
+    @lazy_property
+    def _loc(self):
+        return self.loc.to(torch.double)
+
+    @lazy_property
+    def _concentration(self):
+        return self.concentration.to(torch.double)
+
+    @lazy_property
+    def _proposal_r(self):
+        kappa = self._concentration
+        tau = 1 + (1 + 4 * kappa**2).sqrt()
+        rho = (tau - (2 * tau).sqrt()) / (2 * kappa)
+        _proposal_r = (1 + rho**2) / (2 * rho)
+        # second order Taylor expansion around 0 for small kappa
+        _proposal_r_taylor = 1 / kappa + kappa
+        return torch.where(kappa < 1e-5, _proposal_r_taylor, _proposal_r)
+
+    @torch.no_grad()
+    def sample(self, sample_shape=torch.Size()):
+        """
+        The sampling algorithm for the von Mises distribution is based on the
+        following paper: D.J. Best and N.I. Fisher, "Efficient simulation of the
+        von Mises distribution." Applied Statistics (1979): 152-157.
+
+        Sampling is always done in double precision internally to avoid a hang
+        in _rejection_sample() for small values of the concentration, which
+        starts to happen for single precision around 1e-4 (see issue #88443).
+        """
+        shape = self._extended_shape(sample_shape)
+        x = torch.empty(shape, dtype=self._loc.dtype, device=self.loc.device)
+        return _rejection_sample(
+            self._loc, self._concentration, self._proposal_r, x
+        ).to(self.loc.dtype)
+
+    def expand(self, batch_shape):
+        try:
+            return super().expand(batch_shape)
+        except NotImplementedError:
+            validate_args = self.__dict__.get("_validate_args")
+            loc = self.loc.expand(batch_shape)
+            concentration = self.concentration.expand(batch_shape)
+            return type(self)(loc, concentration, validate_args=validate_args)
+
+    @property
+    def mean(self):
+        """
+        The provided mean is the circular one.
+        """
+        return self.loc
+
+    @property
+    def mode(self):
+        return self.loc
+
+    @lazy_property
+    def variance(self):
+        """
+        The provided variance is the circular one.
+        """
+        return (
+            1
+            - (
+                _log_modified_bessel_fn(self.concentration, order=1)
+                - _log_modified_bessel_fn(self.concentration, order=0)
+            ).exp()
+        )
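+
+# A quick sanity check (illustrative only): samples are angles wrapped to ``[-pi, pi)`` and
+# the sample shape is ``sample_shape + batch_shape``.
+#
+#     >>> m = VonMises(torch.tensor([0.0]), torch.tensor([4.0]))
+#     >>> s = m.sample((1000,))
+#     >>> s.shape
+#     torch.Size([1000, 1])
+#     >>> bool(((s >= -math.pi) & (s < math.pi)).all())
+#     True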
diff --git a/MLPY/Lib/site-packages/torch/distributions/weibull.py b/MLPY/Lib/site-packages/torch/distributions/weibull.py
new file mode 100644
index 0000000000000000000000000000000000000000..3277175e74c9723d256e4e3c9ee3141aa994cf81
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/weibull.py
@@ -0,0 +1,83 @@
+import torch
+from torch.distributions import constraints
+from torch.distributions.exponential import Exponential
+from torch.distributions.gumbel import euler_constant
+from torch.distributions.transformed_distribution import TransformedDistribution
+from torch.distributions.transforms import AffineTransform, PowerTransform
+from torch.distributions.utils import broadcast_all
+
+__all__ = ["Weibull"]
+
+
+class Weibull(TransformedDistribution):
+    r"""
+    Samples from a two-parameter Weibull distribution.
+
+    Example:
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> m = Weibull(torch.tensor([1.0]), torch.tensor([1.0]))
+        >>> m.sample()  # sample from a Weibull distribution with scale=1, concentration=1
+        tensor([ 0.4784])
+
+    Args:
+        scale (float or Tensor): Scale parameter of distribution (lambda).
+        concentration (float or Tensor): Concentration parameter of distribution (k/shape).
+    """
+    arg_constraints = {
+        "scale": constraints.positive,
+        "concentration": constraints.positive,
+    }
+    support = constraints.positive
+
+    def __init__(self, scale, concentration, validate_args=None):
+        self.scale, self.concentration = broadcast_all(scale, concentration)
+        self.concentration_reciprocal = self.concentration.reciprocal()
+        base_dist = Exponential(
+            torch.ones_like(self.scale), validate_args=validate_args
+        )
+        transforms = [
+            PowerTransform(exponent=self.concentration_reciprocal),
+            AffineTransform(loc=0, scale=self.scale),
+        ]
+        super().__init__(base_dist, transforms, validate_args=validate_args)
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(Weibull, _instance)
+        new.scale = self.scale.expand(batch_shape)
+        new.concentration = self.concentration.expand(batch_shape)
+        new.concentration_reciprocal = new.concentration.reciprocal()
+        base_dist = self.base_dist.expand(batch_shape)
+        transforms = [
+            PowerTransform(exponent=new.concentration_reciprocal),
+            AffineTransform(loc=0, scale=new.scale),
+        ]
+        super(Weibull, new).__init__(base_dist, transforms, validate_args=False)
+        new._validate_args = self._validate_args
+        return new
+
+    @property
+    def mean(self):
+        return self.scale * torch.exp(torch.lgamma(1 + self.concentration_reciprocal))
+
+    @property
+    def mode(self):
+        return (
+            self.scale
+            * ((self.concentration - 1) / self.concentration)
+            ** self.concentration.reciprocal()
+        )
+
+    @property
+    def variance(self):
+        return self.scale.pow(2) * (
+            torch.exp(torch.lgamma(1 + 2 * self.concentration_reciprocal))
+            - torch.exp(2 * torch.lgamma(1 + self.concentration_reciprocal))
+        )
+
+    def entropy(self):
+        return (
+            euler_constant * (1 - self.concentration_reciprocal)
+            + torch.log(self.scale * self.concentration_reciprocal)
+            + 1
+        )
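+
+# A quick sanity check (illustrative only): with ``concentration=1`` the Weibull reduces to
+# an exponential distribution, so the mean is ``scale`` and the variance is ``scale**2``.
+#
+#     >>> w = Weibull(torch.tensor(2.0), torch.tensor(1.0))
+#     >>> bool(torch.isclose(w.mean, torch.tensor(2.0)))
+#     True
+#     >>> bool(torch.isclose(w.variance, torch.tensor(4.0)))
+#     True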
diff --git a/MLPY/Lib/site-packages/torch/distributions/wishart.py b/MLPY/Lib/site-packages/torch/distributions/wishart.py
new file mode 100644
index 0000000000000000000000000000000000000000..cec018b5caa33871c17479c726e10eff5244fc38
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/distributions/wishart.py
@@ -0,0 +1,335 @@
+import math
+import warnings
+from numbers import Number
+from typing import Optional, Union
+
+import torch
+from torch import nan
+from torch.distributions import constraints
+from torch.distributions.exp_family import ExponentialFamily
+from torch.distributions.multivariate_normal import _precision_to_scale_tril
+from torch.distributions.utils import lazy_property
+
+
+__all__ = ["Wishart"]
+
+_log_2 = math.log(2)
+
+
+def _mvdigamma(x: torch.Tensor, p: int) -> torch.Tensor:
+    assert x.gt((p - 1) / 2).all(), "Wrong domain for multivariate digamma function."
+    return torch.digamma(
+        x.unsqueeze(-1)
+        - torch.arange(p, dtype=x.dtype, device=x.device).div(2).expand(x.shape + (-1,))
+    ).sum(-1)
+
+
+def _clamp_above_eps(x: torch.Tensor) -> torch.Tensor:
+    # We assume positive input for this function
+    return x.clamp(min=torch.finfo(x.dtype).eps)
+
+
+class Wishart(ExponentialFamily):
+    r"""
+    Creates a Wishart distribution parameterized by a symmetric positive definite matrix :math:`\Sigma`,
+    or its Cholesky decomposition :math:`\mathbf{\Sigma} = \mathbf{L}\mathbf{L}^\top`
+
+    Example:
+        >>> # xdoctest: +SKIP("FIXME: scale_tril must be at least two-dimensional")
+        >>> m = Wishart(torch.Tensor([2]), covariance_matrix=torch.eye(2))
+        >>> m.sample()  # Wishart distributed with mean=`df * I` and
+        >>>             # variance(x_ij)=`df` for i != j and variance(x_ij)=`2 * df` for i == j
+
+    Args:
+        df (float or Tensor): real-valued parameter larger than the (dimension of Square matrix) - 1
+        covariance_matrix (Tensor): positive-definite covariance matrix
+        precision_matrix (Tensor): positive-definite precision matrix
+        scale_tril (Tensor): lower-triangular factor of covariance, with positive-valued diagonal
+    Note:
+        Only one of :attr:`covariance_matrix` or :attr:`precision_matrix` or
+        :attr:`scale_tril` can be specified.
+        Using :attr:`scale_tril` will be more efficient: all computations internally
+        are based on :attr:`scale_tril`. If :attr:`covariance_matrix` or
+        :attr:`precision_matrix` is passed instead, it is only used to compute
+        the corresponding lower triangular matrices using a Cholesky decomposition.
+        :class:`torch.distributions.LKJCholesky` is a restricted Wishart distribution [1].
+
+    **References**
+
+    [1] Wang, Z., Wu, Y. and Chu, H., 2018. `On equivalence of the LKJ distribution and the restricted Wishart distribution`.
+    [2] Sawyer, S., 2007. `Wishart Distributions and Inverse-Wishart Sampling`.
+    [3] Anderson, T. W., 2003. `An Introduction to Multivariate Statistical Analysis (3rd ed.)`.
+    [4] Odell, P. L. & Feiveson, A. H., 1966. `A Numerical Procedure to Generate a Sample Covariance Matrix`. JASA, 61(313):199-203.
+    [5] Ku, Y.-C. & Bloomfield, P., 2010. `Generating Random Wishart Matrices with Fractional Degrees of Freedom in OX`.
+    """
+    arg_constraints = {
+        "covariance_matrix": constraints.positive_definite,
+        "precision_matrix": constraints.positive_definite,
+        "scale_tril": constraints.lower_cholesky,
+        "df": constraints.greater_than(0),
+    }
+    support = constraints.positive_definite
+    has_rsample = True
+    _mean_carrier_measure = 0
+
+    def __init__(
+        self,
+        df: Union[torch.Tensor, Number],
+        covariance_matrix: Optional[torch.Tensor] = None,
+        precision_matrix: Optional[torch.Tensor] = None,
+        scale_tril: Optional[torch.Tensor] = None,
+        validate_args=None,
+    ):
+        assert (covariance_matrix is not None) + (scale_tril is not None) + (
+            precision_matrix is not None
+        ) == 1, "Exactly one of covariance_matrix or precision_matrix or scale_tril may be specified."
+
+        param = next(
+            p
+            for p in (covariance_matrix, precision_matrix, scale_tril)
+            if p is not None
+        )
+
+        if param.dim() < 2:
+            raise ValueError(
+                "scale_tril must be at least two-dimensional, with optional leading batch dimensions"
+            )
+
+        if isinstance(df, Number):
+            batch_shape = torch.Size(param.shape[:-2])
+            self.df = torch.tensor(df, dtype=param.dtype, device=param.device)
+        else:
+            batch_shape = torch.broadcast_shapes(param.shape[:-2], df.shape)
+            self.df = df.expand(batch_shape)
+        event_shape = param.shape[-2:]
+
+        if self.df.le(event_shape[-1] - 1).any():
+            raise ValueError(
+                f"Value of df={df} expected to be greater than ndim - 1 = {event_shape[-1]-1}."
+            )
+
+        if scale_tril is not None:
+            self.scale_tril = param.expand(batch_shape + (-1, -1))
+        elif covariance_matrix is not None:
+            self.covariance_matrix = param.expand(batch_shape + (-1, -1))
+        elif precision_matrix is not None:
+            self.precision_matrix = param.expand(batch_shape + (-1, -1))
+
+        self.arg_constraints["df"] = constraints.greater_than(event_shape[-1] - 1)
+        if self.df.lt(event_shape[-1]).any():
+            warnings.warn(
+                "Low df values detected. Singular samples are highly likely to occur for ndim - 1 < df < ndim."
+            )
+
+        super().__init__(batch_shape, event_shape, validate_args=validate_args)
+        self._batch_dims = [-(x + 1) for x in range(len(self._batch_shape))]
+
+        if scale_tril is not None:
+            self._unbroadcasted_scale_tril = scale_tril
+        elif covariance_matrix is not None:
+            self._unbroadcasted_scale_tril = torch.linalg.cholesky(covariance_matrix)
+        else:  # precision_matrix is not None
+            self._unbroadcasted_scale_tril = _precision_to_scale_tril(precision_matrix)
+
+        # Chi2 distribution is needed for Bartlett decomposition sampling
+        self._dist_chi2 = torch.distributions.chi2.Chi2(
+            df=(
+                self.df.unsqueeze(-1)
+                - torch.arange(
+                    self._event_shape[-1],
+                    dtype=self._unbroadcasted_scale_tril.dtype,
+                    device=self._unbroadcasted_scale_tril.device,
+                ).expand(batch_shape + (-1,))
+            )
+        )
+
+    def expand(self, batch_shape, _instance=None):
+        new = self._get_checked_instance(Wishart, _instance)
+        batch_shape = torch.Size(batch_shape)
+        cov_shape = batch_shape + self.event_shape
+        new._unbroadcasted_scale_tril = self._unbroadcasted_scale_tril.expand(cov_shape)
+        new.df = self.df.expand(batch_shape)
+
+        new._batch_dims = [-(x + 1) for x in range(len(batch_shape))]
+
+        if "covariance_matrix" in self.__dict__:
+            new.covariance_matrix = self.covariance_matrix.expand(cov_shape)
+        if "scale_tril" in self.__dict__:
+            new.scale_tril = self.scale_tril.expand(cov_shape)
+        if "precision_matrix" in self.__dict__:
+            new.precision_matrix = self.precision_matrix.expand(cov_shape)
+
+        # Chi2 distribution is needed for Bartlett decomposition sampling
+        new._dist_chi2 = torch.distributions.chi2.Chi2(
+            df=(
+                new.df.unsqueeze(-1)
+                - torch.arange(
+                    self.event_shape[-1],
+                    dtype=new._unbroadcasted_scale_tril.dtype,
+                    device=new._unbroadcasted_scale_tril.device,
+                ).expand(batch_shape + (-1,))
+            )
+        )
+
+        super(Wishart, new).__init__(batch_shape, self.event_shape, validate_args=False)
+        new._validate_args = self._validate_args
+        return new
+
+    @lazy_property
+    def scale_tril(self):
+        return self._unbroadcasted_scale_tril.expand(
+            self._batch_shape + self._event_shape
+        )
+
+    @lazy_property
+    def covariance_matrix(self):
+        return (
+            self._unbroadcasted_scale_tril
+            @ self._unbroadcasted_scale_tril.transpose(-2, -1)
+        ).expand(self._batch_shape + self._event_shape)
+
+    @lazy_property
+    def precision_matrix(self):
+        identity = torch.eye(
+            self._event_shape[-1],
+            device=self._unbroadcasted_scale_tril.device,
+            dtype=self._unbroadcasted_scale_tril.dtype,
+        )
+        return torch.cholesky_solve(identity, self._unbroadcasted_scale_tril).expand(
+            self._batch_shape + self._event_shape
+        )
+
+    @property
+    def mean(self):
+        return self.df.view(self._batch_shape + (1, 1)) * self.covariance_matrix
+
+    @property
+    def mode(self):
+        factor = self.df - self.covariance_matrix.shape[-1] - 1
+        factor[factor <= 0] = nan
+        return factor.view(self._batch_shape + (1, 1)) * self.covariance_matrix
+
+    @property
+    def variance(self):
+        V = self.covariance_matrix  # has shape (batch_shape x event_shape)
+        diag_V = V.diagonal(dim1=-2, dim2=-1)
+        return self.df.view(self._batch_shape + (1, 1)) * (
+            V.pow(2) + torch.einsum("...i,...j->...ij", diag_V, diag_V)
+        )
+
+    def _bartlett_sampling(self, sample_shape=torch.Size()):
+        p = self._event_shape[-1]  # has singleton shape
+
+        # Implemented Sampling using Bartlett decomposition
+        noise = _clamp_above_eps(
+            self._dist_chi2.rsample(sample_shape).sqrt()
+        ).diag_embed(dim1=-2, dim2=-1)
+
+        i, j = torch.tril_indices(p, p, offset=-1)
+        noise[..., i, j] = torch.randn(
+            torch.Size(sample_shape) + self._batch_shape + (int(p * (p - 1) / 2),),
+            dtype=noise.dtype,
+            device=noise.device,
+        )
+        chol = self._unbroadcasted_scale_tril @ noise
+        return chol @ chol.transpose(-2, -1)
+
+    def rsample(self, sample_shape=torch.Size(), max_try_correction=None):
+        r"""
+        .. warning::
+            In some cases, the sampling algorithm based on the Bartlett decomposition may return singular matrix samples.
+            Several attempts to correct singular samples are made by default, but singular samples may still be
+            returned. Singular samples may yield `-inf` values in `.log_prob()`.
+            In those cases, the user should validate the samples and either fix the value of `df`
+            or adjust the `max_try_correction` argument of `.rsample` accordingly.
+        """
+
+        if max_try_correction is None:
+            max_try_correction = 3 if torch._C._get_tracing_state() else 10
+
+        sample_shape = torch.Size(sample_shape)
+        sample = self._bartlett_sampling(sample_shape)
+
+        # The part below is a temporary workaround to improve numerical stability and should be removed in the future
+        is_singular = ~self.support.check(sample)
+        if self._batch_shape:
+            is_singular = is_singular.amax(self._batch_dims)
+
+        if torch._C._get_tracing_state():
+            # Less optimized version for JIT
+            for _ in range(max_try_correction):
+                sample_new = self._bartlett_sampling(sample_shape)
+                sample = torch.where(is_singular, sample_new, sample)
+
+                is_singular = ~self.support.check(sample)
+                if self._batch_shape:
+                    is_singular = is_singular.amax(self._batch_dims)
+
+        else:
+            # More optimized version with data-dependent control flow.
+            if is_singular.any():
+                warnings.warn("Singular sample detected.")
+
+                for _ in range(max_try_correction):
+                    sample_new = self._bartlett_sampling(is_singular[is_singular].shape)
+                    sample[is_singular] = sample_new
+
+                    is_singular_new = ~self.support.check(sample_new)
+                    if self._batch_shape:
+                        is_singular_new = is_singular_new.amax(self._batch_dims)
+                    is_singular[is_singular.clone()] = is_singular_new
+
+                    if not is_singular.any():
+                        break
+
+        return sample
+
+    def log_prob(self, value):
+        if self._validate_args:
+            self._validate_sample(value)
+        nu = self.df  # has shape (batch_shape)
+        p = self._event_shape[-1]  # has singleton shape
+        return (
+            -nu
+            * (
+                p * _log_2 / 2
+                + self._unbroadcasted_scale_tril.diagonal(dim1=-2, dim2=-1)
+                .log()
+                .sum(-1)
+            )
+            - torch.mvlgamma(nu / 2, p=p)
+            + (nu - p - 1) / 2 * torch.linalg.slogdet(value).logabsdet
+            - torch.cholesky_solve(value, self._unbroadcasted_scale_tril)
+            .diagonal(dim1=-2, dim2=-1)
+            .sum(dim=-1)
+            / 2
+        )
+
+    def entropy(self):
+        nu = self.df  # has shape (batch_shape)
+        p = self._event_shape[-1]  # has singleton shape
+        V = self.covariance_matrix  # has shape (batch_shape x event_shape)
+        return (
+            (p + 1)
+            * (
+                p * _log_2 / 2
+                + self._unbroadcasted_scale_tril.diagonal(dim1=-2, dim2=-1)
+                .log()
+                .sum(-1)
+            )
+            + torch.mvlgamma(nu / 2, p=p)
+            - (nu - p - 1) / 2 * _mvdigamma(nu / 2, p=p)
+            + nu * p / 2
+        )
+
+    @property
+    def _natural_params(self):
+        nu = self.df  # has shape (batch_shape)
+        p = self._event_shape[-1]  # has singleton shape
+        return -self.precision_matrix / 2, (nu - p - 1) / 2
+
+    def _log_normalizer(self, x, y):
+        p = self._event_shape[-1]
+        return (y + (p + 1) / 2) * (
+            -torch.linalg.slogdet(-2 * x).logabsdet + _log_2 * p
+        ) + torch.mvlgamma(y + (p + 1) / 2, p=p)
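+
+# A minimal sketch (illustrative only): with an identity covariance the mean is ``df * I``
+# and samples are symmetric positive-definite ``p x p`` matrices.
+#
+#     >>> w = Wishart(df=torch.tensor(4.0), covariance_matrix=torch.eye(2))
+#     >>> bool(torch.allclose(w.mean, 4 * torch.eye(2)))
+#     True
+#     >>> w.sample().shape
+#     torch.Size([2, 2])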
diff --git a/MLPY/Lib/site-packages/torch/export/__init__.py b/MLPY/Lib/site-packages/torch/export/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b0c27c4d227fd2f8eb1821ac2a595678dc54fd7
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/export/__init__.py
@@ -0,0 +1,344 @@
+import builtins
+import copy
+import dataclasses
+import inspect
+import io
+import os
+import sys
+import typing
+import warnings
+from enum import auto, Enum
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+    Type,
+    TYPE_CHECKING,
+    Union,
+)
+
+import torch
+import torch.utils._pytree as pytree
+from torch.fx._compatibility import compatibility
+
+from torch.fx.passes.infra.pass_base import PassResult
+from torch.fx.passes.infra.pass_manager import PassManager
+
+from torch.utils._pytree import (
+    FlattenFunc,
+    FromDumpableContextFn,
+    ToDumpableContextFn,
+    UnflattenFunc,
+)
+
+if TYPE_CHECKING:
+    # Import the following modules during type checking to enable code intelligence features,
+    # Do not import unconditionally, as they import sympy and importing sympy is very slow
+    from torch.fx.experimental.symbolic_shapes import StrictMinMaxConstraint
+
+
+__all__ = [
+    "Constraint",
+    "Dim",
+    "ExportBackwardSignature",
+    "ExportGraphSignature",
+    "ExportedProgram",
+    "ModuleCallEntry",
+    "ModuleCallSignature",
+    "dims",
+    "dynamic_dim",
+    "export",
+    "load",
+    "register_dataclass",
+    "save",
+    "unflatten",
+    "FlatArgsAdapter",
+    "UnflattenedModule",
+]
+
+
+from .dynamic_shapes import Constraint, Dim, dims, dynamic_dim
+from .exported_program import ExportedProgram, ModuleCallEntry, ModuleCallSignature
+from .graph_signature import ExportBackwardSignature, ExportGraphSignature
+from .unflatten import FlatArgsAdapter, unflatten, UnflattenedModule
+
+
+PassType = Callable[[torch.fx.GraphModule], Optional[PassResult]]
+
+
+def export(
+    mod: torch.nn.Module,
+    args: Tuple[Any, ...],
+    kwargs: Optional[Dict[str, Any]] = None,
+    *,
+    dynamic_shapes: Optional[Union[Dict[str, Any], Tuple[Any], List[Any]]] = None,
+    strict: bool = True,
+    preserve_module_call_signature: Tuple[str, ...] = (),
+) -> ExportedProgram:
+    """
+    :func:`export` takes an arbitrary Python callable (an nn.Module, a function or
+    a method) along with example inputs, and produces a traced graph representing
+    only the Tensor computation of the function in an Ahead-of-Time (AOT) fashion,
+    which can subsequently be executed with different inputs or serialized.  The
+    traced graph (1) produces normalized operators in the functional ATen operator set
+    (as well as any user-specified custom operators), (2) has eliminated all Python control
+    flow and data structures (with certain exceptions), and (3) records the set of
+    shape constraints needed to show that this normalization and control-flow elimination
+    is sound for future inputs.
+
+    **Soundness Guarantee**
+
+    While tracing, :func:`export()` takes note of shape-related assumptions
+    made by the user program and the underlying PyTorch operator kernels.
+    The output :class:`ExportedProgram` is considered valid only when these
+    assumptions hold true.
+
+    Tracing makes assumptions on the shapes (not values) of input tensors.
+    Such assumptions must be validated at graph capture time for :func:`export`
+    to succeed. Specifically:
+
+    - Assumptions on static shapes of input tensors are automatically validated without additional effort.
+    - Assumptions on dynamic shape of input tensors require explicit specification
+      by using the :func:`Dim` API to construct dynamic dimensions and by associating
+      them with example inputs through the ``dynamic_shapes`` argument.
+
+    If any assumption can not be validated, a fatal error will be raised. When that happens,
+    the error message will include suggested fixes to the specification that are needed
+    to validate the assumptions. For example :func:`export` might suggest the
+    following fix to the definition of a dynamic dimension ``dim0_x``, say appearing in the
+    shape associated with input ``x``, that was previously defined as ``Dim("dim0_x")``::
+
+        dim = Dim("dim0_x", max=5)
+
+    This example means the generated code requires dimension 0 of input ``x`` to be less
+    than or equal to 5 to be valid. You can inspect the suggested fixes to dynamic dimension
+    definitions and then copy them verbatim into your code without needing to change the
+    ``dynamic_shapes`` argument to your :func:`export` call.
+
+    Args:
+        mod: We will trace the forward method of this module.
+
+        args: Example positional inputs.
+
+        kwargs: Optional example keyword inputs.
+
+        dynamic_shapes:
+         An optional argument where the type should either be:
+         1) a dict from argument names of ``f`` to their dynamic shape specifications,
+         2) a tuple that specifies dynamic shape specifications for each input in original order.
+         If you are specifying dynamism on keyword args, you will need to pass them in the order that
+         is defined in the original function signature.
+
+         The dynamic shape of a tensor argument can be specified as either
+         (1) a dict from dynamic dimension indices to :func:`Dim` types, where it is
+         not required to include static dimension indices in this dict, but when they are,
+         they should be mapped to None; or (2) a tuple / list of :func:`Dim` types or None,
+         where the :func:`Dim` types correspond to dynamic dimensions, and static dimensions
+         are denoted by None. Arguments that are dicts or tuples / lists of tensors are
+         recursively specified by using mappings or sequences of contained specifications.
+
+        strict: When enabled (default), the export function will trace the program through
+         TorchDynamo which will ensure the soundness of the resulting graph. Otherwise, the
+         exported program will not validate the implicit assumptions baked into the graph and
+         may cause behavior divergence between the original model and the exported one. This is
+         useful when users need to work around bugs in the tracer, or simply want to incrementally
+         enable safety in their models. Note that this does not change the resulting IR spec,
+         and the model will be serialized in the same way regardless of what value
+         is passed here.
+         WARNING: This option is experimental and use this at your own risk.
+
+    Returns:
+        An :class:`ExportedProgram` containing the traced callable.
+
+    **Acceptable input/output types**
+
+    Acceptable types of inputs (for ``args`` and ``kwargs``) and outputs include:
+
+    - Primitive types, i.e. ``torch.Tensor``, ``int``, ``float``, ``bool`` and ``str``.
+    - Dataclasses, but they must be registered by calling :func:`register_dataclass` first.
+    - (Nested) Data structures comprising of ``dict``, ``list``, ``tuple``, ``namedtuple`` and
+      ``OrderedDict`` containing all above types.
+
+    """
+    from ._trace import _export
+
+    if not isinstance(mod, torch.nn.Module):
+        raise ValueError(
+            f"Expected `mod` to be an instance of `torch.nn.Module`, got {type(mod)}."
+        )
+
+    return _export(
+        mod,
+        args,
+        kwargs,
+        dynamic_shapes,
+        strict=strict,
+        preserve_module_call_signature=preserve_module_call_signature,
+    )
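+
+# A minimal usage sketch (illustrative only; the module ``M`` below is hypothetical, and only
+# the ``export`` and ``Dim`` names defined/imported in this file are assumed): dimension 0 of
+# ``x`` is marked dynamic, so the exported program accepts other batch sizes.
+#
+#     >>> class M(torch.nn.Module):
+#     ...     def forward(self, x):
+#     ...         return x.sin() + 1
+#     >>> batch = Dim("batch")
+#     >>> ep = export(M(), (torch.randn(4, 3),), dynamic_shapes={"x": {0: batch}})
+#     >>> ep.module()(torch.randn(7, 3)).shape   # dimension 0 stays dynamic
+#     torch.Size([7, 3])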
+
+
+def save(
+    ep: ExportedProgram,
+    f: Union[str, os.PathLike, io.BytesIO],
+    *,
+    extra_files: Optional[Dict[str, Any]] = None,
+    opset_version: Optional[Dict[str, int]] = None,
+) -> None:
+    """
+
+    .. warning::
+        Under active development, saved files may not be usable in newer versions
+        of PyTorch.
+
+    Saves an :class:`ExportedProgram` to a file-like object. It can then be
+    loaded using the Python API :func:`torch.export.load`.
+
+    Args:
+        ep (ExportedProgram): The exported program to save.
+
+        f (Union[str, os.PathLike, io.BytesIO]): A file-like object (has to
+         implement write and flush) or a string containing a file name.
+
+        extra_files (Optional[Dict[str, Any]]): Map from filename to contents
+         which will be stored as part of f.
+
+        opset_version (Optional[Dict[str, int]]): A map of opset names
+         to the version of this opset
+
+
+    Example::
+
+        import torch
+        import io
+
+        class MyModule(torch.nn.Module):
+            def forward(self, x):
+                return x + 10
+
+        ep = torch.export.export(MyModule(), (torch.randn(5),))
+
+        # Save to file
+        torch.export.save(ep, 'exported_program.pt2')
+
+        # Save to io.BytesIO buffer
+        buffer = io.BytesIO()
+        torch.export.save(ep, buffer)
+
+        # Save with extra files
+        extra_files = {'foo.txt': b'bar'.decode('utf-8')}
+        torch.export.save(ep, 'exported_program.pt2', extra_files=extra_files)
+
+    """
+    from torch._export import save
+
+    if not isinstance(ep, ExportedProgram):
+        raise TypeError(
+            f"The 'ep' parameter must be an instance of 'ExportedProgram', got '{type(ep).__name__}' instead."
+        )
+
+    save(ep, f, extra_files=extra_files, opset_version=opset_version)
+
+
+def load(
+    f: Union[str, os.PathLike, io.BytesIO],
+    *,
+    extra_files: Optional[Dict[str, Any]] = None,
+    expected_opset_version: Optional[Dict[str, int]] = None,
+) -> ExportedProgram:
+    """
+
+    .. warning::
+        Under active development, saved files may not be usable in newer versions
+        of PyTorch.
+
+    Loads an :class:`ExportedProgram` previously saved with
+    :func:`torch.export.save`.
+
+    Args:
+        f (Union[str, os.PathLike, io.BytesIO]): A file-like object or a
+         string containing a file name.
+
+        extra_files (Optional[Dict[str, Any]]): The extra filenames given in
+         this map would be loaded and their content would be stored in the
+         provided map.
+
+        expected_opset_version (Optional[Dict[str, int]]): A map of opset names
+         to expected opset versions
+
+    Returns:
+        An :class:`ExportedProgram` object
+
+    Example::
+
+        import torch
+        import io
+
+        # Load ExportedProgram from file
+        ep = torch.export.load('exported_program.pt2')
+
+        # Load ExportedProgram from io.BytesIO object
+        with open('exported_program.pt2', 'rb') as f:
+            buffer = io.BytesIO(f.read())
+        buffer.seek(0)
+        ep = torch.export.load(buffer)
+
+        # Load with extra files.
+        extra_files = {'foo.txt': ''}  # values will be replaced with data
+        ep = torch.export.load('exported_program.pt2', extra_files=extra_files)
+        print(extra_files['foo.txt'])
+        print(ep(torch.randn(5)))
+    """
+    from torch._export import load
+
+    return load(
+        f, extra_files=extra_files, expected_opset_version=expected_opset_version
+    )
+
+
+def register_dataclass(
+    cls: Type[Any],
+    *,
+    serialized_type_name: Optional[str] = None,
+) -> None:
+    """
+    Registers a dataclass as a valid input/output type for :func:`torch.export.export`.
+
+    Args:
+        cls: the dataclass type to register
+        serialized_type_name: The serialized name for the dataclass. This is
+            required if you want to serialize the pytree TreeSpec containing this
+            dataclass.
+
+    Example::
+
+        from dataclasses import dataclass
+
+        import torch
+
+        @dataclass
+        class InputDataClass:
+            feature: torch.Tensor
+            bias: int
+
+        @dataclass
+        class OutputDataClass:
+            res: torch.Tensor
+
+        torch.export.register_dataclass(InputDataClass)
+        torch.export.register_dataclass(OutputDataClass)
+
+        def fn(o: InputDataClass) -> OutputDataClass:
+            res = o.feature + o.bias
+            return OutputDataClass(res=res)
+
+        ep = torch.export.export(fn, (InputDataClass(torch.ones(2, 2), 1), ))
+        print(ep)
+
+    """
+
+    from torch._export.utils import register_dataclass_as_pytree_node
+
+    return register_dataclass_as_pytree_node(
+        cls, serialized_type_name=serialized_type_name
+    )
diff --git a/MLPY/Lib/site-packages/torch/export/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/export/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..86ec921f4a4e68c078e0682a1ee15f9b405d0a15
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/export/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/export/__pycache__/_remove_auto_functionalized_pass.cpython-39.pyc b/MLPY/Lib/site-packages/torch/export/__pycache__/_remove_auto_functionalized_pass.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e34f8969990041ae882e2c5e31b81267b3eca530
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/export/__pycache__/_remove_auto_functionalized_pass.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/export/__pycache__/_remove_effect_tokens_pass.cpython-39.pyc b/MLPY/Lib/site-packages/torch/export/__pycache__/_remove_effect_tokens_pass.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1506966b7daf57bc81aac28628d404a04a6fb03f
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/export/__pycache__/_remove_effect_tokens_pass.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/export/__pycache__/_safeguard.cpython-39.pyc b/MLPY/Lib/site-packages/torch/export/__pycache__/_safeguard.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cfe58e97e4df8e2286f57d56842aafd8ed3280cd
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/export/__pycache__/_safeguard.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/export/__pycache__/_trace.cpython-39.pyc b/MLPY/Lib/site-packages/torch/export/__pycache__/_trace.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..06dbfa8b23d7976c56aea236a49e60cf4a577b15
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/export/__pycache__/_trace.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/export/__pycache__/_tree_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/export/__pycache__/_tree_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d920098d4fd007bfd375d58e2cf88e1b347820a3
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/export/__pycache__/_tree_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/export/__pycache__/_unlift.cpython-39.pyc b/MLPY/Lib/site-packages/torch/export/__pycache__/_unlift.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f25a81e3c1b3da5ada919dd125c3de800acfd73f
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/export/__pycache__/_unlift.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/export/__pycache__/custom_obj.cpython-39.pyc b/MLPY/Lib/site-packages/torch/export/__pycache__/custom_obj.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0d146d777bf1ba4b929bedfad9aad28d2e818870
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/export/__pycache__/custom_obj.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/export/__pycache__/dynamic_shapes.cpython-39.pyc b/MLPY/Lib/site-packages/torch/export/__pycache__/dynamic_shapes.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4ac1b09a0455f865c7f202d119b98b8039427708
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/export/__pycache__/dynamic_shapes.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/export/__pycache__/exported_program.cpython-39.pyc b/MLPY/Lib/site-packages/torch/export/__pycache__/exported_program.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4848acf731f8e1c1af743dfafc6b67c755533881
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/export/__pycache__/exported_program.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/export/__pycache__/graph_signature.cpython-39.pyc b/MLPY/Lib/site-packages/torch/export/__pycache__/graph_signature.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..85e8b5a2828803d24d1fc5baf1205e6ca6df8e8b
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/export/__pycache__/graph_signature.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/export/__pycache__/unflatten.cpython-39.pyc b/MLPY/Lib/site-packages/torch/export/__pycache__/unflatten.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eace8613c9a58a3f7bf3674d119543c4c8203c27
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/export/__pycache__/unflatten.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/export/_remove_auto_functionalized_pass.py b/MLPY/Lib/site-packages/torch/export/_remove_auto_functionalized_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..e63ed74589c2d98e2befc003e74ff983c42663ea
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/export/_remove_auto_functionalized_pass.py
@@ -0,0 +1,93 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import operator
+from typing import List
+
+import torch
+from torch._higher_order_ops.auto_functionalize import (
+    auto_functionalized,
+    get_mutable_arg_names,
+)
+from torch.export import ExportedProgram
+
+
+def unsafe_remove_auto_functionalized_pass(
+    ep: ExportedProgram,
+) -> ExportedProgram:
+    """
+    This pass removes instances of the higher order op 'auto_functionalized',
+    and modifies the calling EP in place to have the original mutator op.
+    This pass doesn't perform safety checks to make sure that this in-place mutation is safe.
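+
+    Illustrative usage (a sketch; ``ep`` is assumed to be an
+    :class:`ExportedProgram` whose graph contains ``auto_functionalized`` nodes)::
+
+        ep = unsafe_remove_auto_functionalized_pass(ep)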
+    """
+    auto_functionalize_nodes: List[torch.fx.Node] = []
+    for module in ep.graph_module.modules():
+        if not isinstance(module, torch.fx.GraphModule):
+            continue
+        for node in ep.graph.nodes:
+            if node.op == "call_function" and node.target is auto_functionalized:
+                auto_functionalize_nodes.append(node)
+
+    # Update every use of the HOP
+    for node in reversed(auto_functionalize_nodes):
+        func = node.args[0]
+        original_kwargs = node.kwargs
+        assert isinstance(func, torch._ops.OpOverload)
+
+        with ep.graph.inserting_before(node):
+            # This makes the call_function refer to every arg as a kwarg; this is unusual but should be fine.
+            new_node = ep.graph.call_function(func, kwargs=node.kwargs)
+        for k, v in node.meta.items():
+            new_node.meta[k] = v
+
+        # Replace auto_functionalize(func, args) with just func(args)
+        node.replace_all_uses_with(new_node)
+
+        mutable_args_names = get_mutable_arg_names(new_node.target)
+        output_specs = ep.graph_signature.output_specs
+
+        # update the users of the auto_func node (the getitem nodes)
+        for user in list(new_node.users.keys()):
+            assert user.target == operator.getitem
+            # getitem corresponding to a mutated input, just replace all uses with the original input
+            if user.args[1] >= len(func._schema.returns):
+                assert user.args[1] <= len(func._schema.returns) + len(
+                    mutable_args_names
+                )
+
+                # If the result of getitem was used in an output node, update the output spec with the correct name
+                adjusted_index = user.args[1] - len(func._schema.returns)
+                original_arg = original_kwargs[mutable_args_names[adjusted_index]]
+                for spec in output_specs:
+                    if spec.arg.name == user.name:
+                        spec.arg.name = original_arg.name  # pyre-ignore
+                        break
+
+                # This is a little fragile/implementation dependent, but the order of the mutable args is the same as the order
+                # of the getitem calls following the HOP.
+                user.replace_all_uses_with(
+                    original_kwargs[mutable_args_names[adjusted_index]]
+                )
+
+        if len(func._schema.returns) == 1:
+            # If the function has 1 return then it will just directly return the
+            # result -- we don't need a getitem. So we can replace all the
+            # getitem(auto_functionalized, 0) with just the node itself.
+            for user in list(new_node.users.keys()):
+                if user.args[1] == 0:
+                    user.replace_all_uses_with(new_node)
+
+                    # Same case as above, update the output spec if getitem result used in an output node
+                    for spec in output_specs:
+                        if spec.arg.name == user.name:
+                            spec.arg.name = new_node.name
+                            break
+
+        new_node.meta["val"] = node.meta["val"][: len(func._schema.returns)]
+        ep.graph.erase_node(node)
+
+    ep.graph.eliminate_dead_code()
+    return ep
diff --git a/MLPY/Lib/site-packages/torch/export/_remove_effect_tokens_pass.py b/MLPY/Lib/site-packages/torch/export/_remove_effect_tokens_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..911381067123c3d0e34343421e4c47f1c13ff412
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/export/_remove_effect_tokens_pass.py
@@ -0,0 +1,126 @@
+import operator
+from typing import List
+
+import torch
+from torch._higher_order_ops.effects import with_effects
+from .exported_program import ExportedProgram
+from .graph_signature import (
+    InputKind,
+    InputSpec,
+    OutputKind,
+    OutputSpec,
+    TensorArgument,
+)
+
+
+def _remove_effect_tokens(ep: ExportedProgram) -> ExportedProgram:
+    """
+    Removes the tokens from the exported program, including:
+    - Removes the input and output tokens
+    - Replaces with_effects(token, func, args) with just func(args)
+
+    This function does an inplace modification on the given ExportedProgram.
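+
+    Illustrative sketch of the rewrite (``op``, ``x`` and the token names are
+    placeholders)::
+
+        # before:  new_token, out = with_effects(token, op, x)
+        # after:   out = op(x)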
+    """
+    num_tokens: int = 0
+    input_token_names: List[str] = []
+    new_input_specs: List[InputSpec] = []
+    for inp in ep.graph_signature.input_specs:
+        if inp.kind == InputKind.TOKEN:
+            num_tokens += 1
+            assert isinstance(inp.arg, TensorArgument)
+            input_token_names.append(inp.arg.name)
+        else:
+            new_input_specs.append(inp)
+
+    num_out_tokens: int = 0
+    new_output_specs: List[OutputSpec] = []
+    output_token_names: List[str] = []
+    for out in ep.graph_signature.output_specs:
+        if out.kind == OutputKind.TOKEN:
+            num_out_tokens += 1
+            output_token_names.append(out.arg.name)
+        else:
+            new_output_specs.append(out)
+
+    assert num_tokens == num_out_tokens
+
+    output_node = None
+    with_effect_nodes: List[torch.fx.Node] = []
+    for node in ep.graph.nodes:
+        if node.op == "output":
+            output_node = node
+            break
+
+        if not (node.op == "call_function" and node.target is with_effects):
+            continue
+
+        with_effect_nodes.append(node)
+
+    # Remove tokens from outputs
+    assert output_node is not None
+    output_args = output_node.args[0]
+    assert len(output_args) >= num_tokens
+    out_token_nodes = output_args[:num_tokens]
+    output_node.args = (tuple(output_args[num_tokens:]),)
+    for out_token in out_token_nodes:
+        assert out_token.name in output_token_names
+        ep.graph.erase_node(out_token)
+
+    # Replace with_effects(token, func, args) with just func(args)
+    for node in reversed(with_effect_nodes):
+        func = node.args[1]
+        assert isinstance(func, torch._ops.OpOverload)
+
+        with ep.graph.inserting_before(node):
+            new_node = ep.graph.call_function(func, node.args[2:])
+        for k, v in node.meta.items():
+            new_node.meta[k] = v
+
+        node.replace_all_uses_with(new_node)
+
+        # Update user getitem nodes
+        for user in list(new_node.users.keys()):
+            assert user.target == operator.getitem
+            # getitem(with_effects, 0) == token
+            if user.args[1] == 0:
+                ep.graph.erase_node(user)
+
+        if len(func._schema.returns) == 1:
+            # If the function has 1 return then it will just directly return the
+            # result -- we don't need a getitem. So we can replace all the
+            # getitem(with_effects, 1) with just the node itself.
+            for user in list(new_node.users.keys()):
+                assert user.args[1] == 1
+                user.replace_all_uses_with(new_node)
+
+            new_node.meta["val"] = node.meta["val"][1]
+        elif len(func._schema.returns) > 1:
+            # If the function has more than 1 return, then since we got rid of
+            # the 1st return value (the token), we need to shift all the other
+            # getitem indices down by 1.
+            for user in list(new_node.users.keys()):
+                assert user.args[1] >= 1
+                user.args = (user.args[0], user.args[1] - 1)
+
+            new_node.meta["val"] = node.meta["val"][1:]
+        else:
+            assert len(func._schema.returns) == 0
+            assert len(new_node.users) == 0
+            new_node.meta["val"] = None
+
+        ep.graph.erase_node(node)
+
+    # Remove tokens from inputs
+    placeholders = [node for node in ep.graph.nodes if node.op == "placeholder"]
+    assert len(placeholders) >= num_tokens
+    inp_token_nodes = placeholders[:num_tokens]
+    for inp_token in inp_token_nodes:
+        assert inp_token.name in input_token_names
+        ep.graph.erase_node(inp_token)
+
+    # Update graph signature
+    ep.graph_signature.input_specs = new_input_specs
+    ep.graph_signature.output_specs = new_output_specs
+
+    ep.graph.eliminate_dead_code()
+    return ep
diff --git a/MLPY/Lib/site-packages/torch/export/_safeguard.py b/MLPY/Lib/site-packages/torch/export/_safeguard.py
new file mode 100644
index 0000000000000000000000000000000000000000..e96e595e3d04df98636dbdaf3e5849e1b54dd216
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/export/_safeguard.py
@@ -0,0 +1,42 @@
+import torch
+from torch.fx.experimental.proxy_tensor import ProxyTorchDispatchMode
+from torch.overrides import TorchFunctionMode
+
+
+class AutogradStateOpsFailSafeguard(TorchFunctionMode):
+    """
+    Detect grad state ops while exporting the graph and fail the process by
+    raising an error, to avoid unexpected behavior. Such grad mode ops include:
+    `torch.no_grad`
+    `torch.enable_grad`
+    `torch.set_grad_enabled`
+
+    Export with predispatch mode is exempted.
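+
+    Illustrative usage (a sketch; in practice this mode is entered internally
+    by the export pipeline rather than by user code)::
+
+        with AutogradStateOpsFailSafeguard():
+            ...  # tracing happens here; flipping the global grad state raises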
+    """
+
+    def __torch_function__(self, func, types, args=(), kwargs=None):
+        kwargs = kwargs or {}
+        unsupported_grad_mode_ops = [
+            torch._C._set_grad_enabled,
+        ]
+        # The check is only enabled while tracing, which is confirmed by an active
+        # PROXY torch dispatch mode. This allows autograd ops outside of tracing.
+        current_state = torch._C.is_grad_enabled()
+        if func in unsupported_grad_mode_ops:
+            assert len(args) == 1
+            changed_state = args[0]
+            mode = torch._C._get_dispatch_mode(torch._C._TorchDispatchModeKey.PROXY)
+            # We only error outside of pre_dispatch mode; it's allowed to use
+            # autograd ops in pre_dispatch mode, e.g. `torch.no_grad`.
+            if (
+                mode
+                and isinstance(mode, ProxyTorchDispatchMode)
+                and not mode.pre_dispatch
+                and changed_state != current_state
+            ):
+                raise RuntimeError(
+                    f"Encountered autograd state manager op {func} trying to change global autograd state "
+                    "while exporting. This is unsafe because we don't capture this op in torch.export "
+                    "today, hence we can't reflect the user intention soundly."
+                )
+        return func(*args, **kwargs)
diff --git a/MLPY/Lib/site-packages/torch/export/_trace.py b/MLPY/Lib/site-packages/torch/export/_trace.py
new file mode 100644
index 0000000000000000000000000000000000000000..87b7bc48868c5091a132367e44ce5187afc8713e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/export/_trace.py
@@ -0,0 +1,1060 @@
+import dataclasses
+import functools
+import inspect
+import logging
+import re
+import time
+import warnings
+from contextlib import contextmanager, nullcontext
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
+
+import torch
+import torch._dynamo
+import torch.fx
+
+import torch.utils._pytree as pytree
+from torch._dynamo.exc import UserError, UserErrorType
+from torch._export.non_strict_utils import (
+    make_constraints,
+    make_fake_inputs,
+    make_fake_params_buffers,
+)
+from torch._export.passes.add_runtime_assertions_for_constraints_pass import (
+    _AddRuntimeAssertionsForInlineConstraintsPass,
+)
+from torch._export.passes.collect_tracepoints_pass import CollectTracepointsPass
+from torch._export.passes.lift_constants_pass import (
+    ConstantAttrMap,
+    lift_constants_pass,
+    rewrite_script_object_meta,
+)
+from torch._export.wrappers import _wrap_submodules
+from torch._functorch.aot_autograd import aot_export_module
+from torch._guards import detect_fake_mode
+from torch._subclasses.fake_tensor import FakeTensor, FakeTensorMode
+from torch._utils_internal import log_export_usage
+from torch.export.exported_program import OutputKind
+from torch.fx.experimental.symbolic_shapes import (
+    ConstraintViolationError,
+    free_unbacked_symbols,
+    GuardOnDataDependentSymNode,
+    ShapeEnv,
+)
+from torch.fx.graph import _PyTreeCodeGen, _PyTreeInfo
+from torch.utils._sympy.value_ranges import ValueRangeError
+
+from ._safeguard import AutogradStateOpsFailSafeguard
+
+from .dynamic_shapes import _process_constraints, Constraint
+from .exported_program import (
+    _disable_prexisiting_fake_mode,
+    ExportedProgram,
+    InputKind,
+    ModuleCallEntry,
+    ModuleCallSignature,
+)
+from .graph_signature import (
+    _sig_to_specs,
+    ArgumentSpec,
+    ConstantArgument,
+    CustomObjArgument,
+    ExportGraphSignature,
+    SymIntArgument,
+    TensorArgument,
+)
+
+
+log = logging.getLogger(__name__)
+
+
+@dataclasses.dataclass
+class ExportDynamoConfig:
+    """
+    Manage Export-specific configurations of Dynamo.
+    """
+
+    allow_rnn: bool = True
+    reorderable_logging_functions: Set[Callable] = dataclasses.field(
+        default_factory=set
+    )
+
+
+DEFAULT_EXPORT_DYNAMO_CONFIG = ExportDynamoConfig()
+DEFAULT_EXPORT_DYNAMO_CONFIG.reorderable_logging_functions = {
+    logging.critical,
+    logging.debug,
+    logging.error,
+    logging.exception,
+    logging.info,
+    logging.log,
+    logging.warning,
+    print,
+    warnings.warn,
+}
+
+
+@contextmanager
+def _ignore_backend_decomps():
+    orig_mkldnn_flag = torch.backends.mkldnn.set_flags(False)
+    orig_nnpack_flag = torch.backends.nnpack.set_flags(False)
+    try:
+        yield
+    finally:
+        torch.backends.mkldnn.set_flags(*orig_mkldnn_flag)
+        torch.backends.nnpack.set_flags(*orig_nnpack_flag)
+
+
+def _convert_input_to_fake(gm, args, kwargs):
+    params_buffers = _get_params_buffers(gm)
+    fake_inps: List[torch.Tensor] = []
+    for node in gm.graph.nodes:
+        if node.op == "placeholder" and "val" in node.meta:
+            fake_val = node.meta["val"]
+            if fake_val is not None and isinstance(fake_val, torch.Tensor):
+                fake_inps.append(fake_val)
+
+    if detected_fake_mode := detect_fake_mode(fake_inps):
+        fake_mode = detected_fake_mode
+    else:
+        fake_mode = FakeTensorMode(shape_env=ShapeEnv())
+
+    if len(args) == 0 and len(kwargs) == 0:
+        return (), {}, params_buffers, fake_mode
+
+    count = 0
+
+    def convert_to_fake(x):
+        nonlocal count
+        val = fake_inps[count]
+        count += 1
+        return val
+
+    fake_args = pytree.tree_map_only(torch.Tensor, convert_to_fake, args)
+    # TODO properly use the cached fake tensor
+    fake_kwargs = pytree.tree_map_only(torch.Tensor, fake_mode.from_tensor, kwargs)
+    fake_params_buffers = pytree.tree_map_only(
+        torch.Tensor,
+        functools.partial(fake_mode.from_tensor, static_shapes=True),
+        params_buffers,
+    )
+    return fake_args, fake_kwargs, fake_params_buffers, fake_mode
+
+
+def _replace_param_buffer_names(param_buffer_table, sig):
+    for spec in sig.input_specs:
+        if spec.kind in (
+            InputKind.PARAMETER,
+            InputKind.BUFFER,
+        ):
+            spec.target = param_buffer_table[spec.target]
+    for spec in sig.output_specs:
+        if spec.kind in (
+            OutputKind.BUFFER_MUTATION,
+            OutputKind.GRADIENT_TO_PARAMETER,
+        ):
+            spec.target = param_buffer_table[spec.target]
+
+
+def _convert_to_positional_args(orig_arg_names, args, kwargs):
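+    # For example (illustrative): orig_arg_names = ["x", "y", "z"], args = (1,),
+    # kwargs = {"y": 2, "z": 3}  ->  returns (1, 2, 3).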
+    assert len(orig_arg_names) == len(args) + len(kwargs), (
+        f"Total number of arg names is expected to be {len(orig_arg_names)} "
+        f"but got {len(args)} positional args, {len(kwargs)} kwargs."
+    )
+    reordered_kwargs = [kwargs[kw_name] for kw_name in orig_arg_names[len(args) :]]
+    return (
+        *args,
+        *reordered_kwargs,
+    )
+
+
+def _normalize_nn_module_stack(gm_torch_level, root_cls):
+    # Append a root module to every nn_module_stack.
+    root = "L['self']"
+    root_key = re.sub(r"[^a-zA-Z0-9]", "_", root)
+    for gm in gm_torch_level.modules():
+        if not isinstance(gm, torch.fx.GraphModule):
+            continue
+        for node in gm.graph.nodes:
+            if node.op in ["placeholder", "output"]:
+                continue
+            add_root = True
+            if nn_module_stack := node.meta.get("nn_module_stack", {}):
+                path, ty = next(iter(nn_module_stack.values()))
+                # After deserializing the class `ty` might not exist anymore so
+                # it could be a string
+                if inspect.isclass(ty) and issubclass(ty, torch.nn.Module):
+                    # TODO Figure out why sometimes we have root sometimes we don't.
+                    if path == root and ty is root_cls:
+                        add_root = False
+                else:
+                    assert isinstance(ty, str)
+            if add_root:
+
+                def normalize_path(path):
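+                    # Illustrative: "L['self'].layers[0].conv" -> "layers.0.conv";
+                    # the dummy Path object records attribute/index accesses in order.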
+                    try:
+                        parts = []
+
+                        class Path:
+                            def __getattr__(self, name):
+                                parts.append(name)
+                                return self
+
+                            def __getitem__(self, idx):
+                                parts.append(str(idx))
+                                return self
+
+                        eval(path, {"L": {"self": Path()}})
+                        return ".".join(parts)
+                    except Exception:  # TODO(zhxchen17) Remove this.
+                        return path
+
+                nn_module_stack = {root_key: (root, root_cls), **nn_module_stack}
+                node.meta["nn_module_stack"] = {
+                    key: (normalize_path(path), ty)
+                    for key, (path, ty) in nn_module_stack.items()
+                }
+
+
+def _get_param_buffer_mapping(
+    original_module: torch.nn.Module,
+    traced_module: torch.nn.Module,
+) -> Dict[str, str]:
+    """
+    Returns a mapping of parameter/buffer names from the new module to the
+    original model. This is to help with restoring the FQN for parameter/buffers
+    of a traced module to what the original module contains.
+    """
+
+    param_lookup: Dict[int, List[str]] = {}
+    buffer_lookup: Dict[int, List[str]] = {}
+    for name, param in original_module.named_parameters(remove_duplicate=False):
+        param_lookup.setdefault(id(param), []).append(name)
+    for name, buffer in original_module.named_buffers(remove_duplicate=False):
+        buffer_lookup.setdefault(id(buffer), []).append(name)
+
+    param_buffer_table: Dict[str, str] = {}
+    for dynamo_name, dynamo_param in traced_module.named_parameters(
+        remove_duplicate=False
+    ):
+        assert dynamo_name not in param_buffer_table
+        if id(dynamo_param) in param_lookup:
+            param_buffer_table[dynamo_name] = param_lookup[id(dynamo_param)].pop()
+
+    for dynamo_name, dynamo_buffer in traced_module.named_buffers(
+        remove_duplicate=False
+    ):
+        assert dynamo_name not in param_buffer_table
+        if id(dynamo_buffer) in buffer_lookup:
+            param_buffer_table[dynamo_name] = buffer_lookup[id(dynamo_buffer)].pop()
+
+    return param_buffer_table
+
+
+def _remap_constants(
+    orig_constant_attrs: ConstantAttrMap,
+    graph_signature: ExportGraphSignature,
+    constants: Dict[str, Union[torch.Tensor, torch.ScriptObject]],
+) -> None:
+    """Rewrite the graph signature and constants table to use the FQN from the original module."""
+    remap_table: Dict[str, str] = {}
+    for name, value in constants.items():
+        if value in orig_constant_attrs:
+            remap_table[name] = orig_constant_attrs[value]
+
+    for spec in graph_signature.input_specs:
+        if spec.kind in (
+            InputKind.CONSTANT_TENSOR,
+            InputKind.CUSTOM_OBJ,
+        ):
+            orig_target = spec.target
+            assert orig_target is not None
+            spec.target = remap_table.get(orig_target, orig_target)
+
+            constant = constants[orig_target]
+            del constants[orig_target]
+            constants[spec.target] = constant
+
+
+def _restore_state_dict(
+    original_module: torch.nn.Module, traced_module: torch.fx.GraphModule
+) -> None:
+    """
+    Restores the state dict of the traced module to that of the original module.
+    """
+    param_buffer_table = _get_param_buffer_mapping(original_module, traced_module)
+    # Since the graph module is flattened (no module hierarchy), we
+    # need to normalize the module by replacing "." with "_". If we
+    # don't, it will try to save the weight to a submodule which no
+    # longer exists.
+    for name, fqn in param_buffer_table.items():
+        param_buffer_table[name] = fqn.replace(".", "_")
+
+    # Replace state dict attr names with the fqn
+    for name, fqn in param_buffer_table.items():
+        if not hasattr(traced_module, name):
+            continue
+
+        attr = getattr(traced_module, name)
+        if isinstance(attr, torch.Tensor) and not isinstance(attr, torch.nn.Parameter):
+            traced_module.register_buffer(fqn, attr)
+        else:
+            setattr(traced_module, fqn, attr)
+        delattr(traced_module, name)
+
+    # Replace graph getattr nodes with the correct name
+    for node in traced_module.graph.nodes:
+        if node.op == "get_attr":
+            attr_name = node.target
+            if attr_name in param_buffer_table:
+                node.target = param_buffer_table[attr_name]
+
+    traced_module.recompile()
+
+
+def _export_to_torch_ir(
+    f: Callable,
+    args: Tuple[Any, ...],
+    kwargs: Optional[Dict[str, Any]] = None,
+    constraints: Optional[List[Constraint]] = None,
+    *,
+    preserve_module_call_signature: Tuple[str, ...] = (),
+    disable_constraint_solver: bool = False,
+    restore_fqn: bool = True,
+    _log_export_usage: bool = True,
+) -> torch.fx.GraphModule:
+    """
+    Traces either an nn.Module's forward function or just a callable with PyTorch
+    operations inside and produces a torch.fx.GraphModule in torch IR.
+    """
+
+    if _log_export_usage:
+        log_export_usage(event="export.private_api", flags={"_export_to_torch_ir"})
+
+    kwargs = kwargs or {}
+
+    if not isinstance(args, tuple):
+        raise UserError(
+            UserErrorType.INVALID_INPUT,
+            f"Expecting `args` to be a tuple of example positional inputs, got {type(args)}",
+        )
+
+    with torch._dynamo.config.patch(dataclasses.asdict(DEFAULT_EXPORT_DYNAMO_CONFIG)):
+        try:
+            module_call_specs: Dict[str, Dict[str, pytree.TreeSpec]] = {}
+            with _wrap_submodules(
+                f, preserve_module_call_signature, module_call_specs
+            ), _ignore_backend_decomps():
+                gm_torch_level, _ = torch._dynamo.export(
+                    f,
+                    constraints=constraints,  # type: ignore[arg-type]
+                    assume_static_by_default=True,
+                    tracing_mode="symbolic",
+                    disable_constraint_solver=disable_constraint_solver,
+                    _log_export_usage=_log_export_usage,
+                )(
+                    *args,
+                    **kwargs,
+                )
+        except (ConstraintViolationError, ValueRangeError) as e:
+            raise UserError(UserErrorType.CONSTRAINT_VIOLATION, str(e))  # noqa: TRY200
+        except GuardOnDataDependentSymNode as e:
+            raise UserError(  # noqa: TRY200
+                UserErrorType.ANTI_PATTERN,
+                f"Consider annotating your code using torch._constrain_as_*(). {str(e)}",
+                case_name="constrain_as_size_example",
+            )
+
+    gm_torch_level.meta["module_call_specs"] = module_call_specs
+
+    if isinstance(f, torch.nn.Module) and restore_fqn:
+        _restore_state_dict(f, gm_torch_level)
+
+    return gm_torch_level
+
+
+def _gather_constant_attrs(m: torch.nn.Module) -> ConstantAttrMap:
+    """Search the module hierarchy, gathering up all tensor and ScriptObject constants.
+
+    Returns a dictionary mapping hash(value) to the name of the constant. We
+    have to abuse `hash` here unfortunately, see: [ScriptObject hash].
+    """
+    constants = ConstantAttrMap()
+    buffers_parameters = set(m.buffers())
+    buffers_parameters.update(m.parameters())
+
+    def inner(m: torch.nn.Module, prefix_atoms: List[str], constants):
+        for k, v in m.__dict__.items():
+            if isinstance(v, (torch.Tensor, torch.ScriptObject)):
+                if v in buffers_parameters:
+                    # filter out buffers and parameters, leaving only constants
+                    continue
+
+                fqn = ".".join(prefix_atoms + [k])
+                if v in constants:
+                    raise ValueError(
+                        f"Duplicate reference to constant attribute found: '{constants[v]}' and '{fqn}'."
+                    )
+
+                constants[v] = fqn
+        for k, v in m.named_children():
+            inner(v, prefix_atoms + [k], constants)
+
+    inner(m, [], constants)
+    return constants
+
+
+def _export_non_strict(
+    mod: torch.nn.Module,
+    fake_args,
+    fake_kwargs,
+    fake_params_buffers,
+    constant_attrs: ConstantAttrMap,
+    *,
+    transform=lambda x: x,  # TODO(zhxchen17) Revisit if this is needed later.
+    pre_dispatch=False,
+):
+    # [NOTE] If the user is exporting under training mode, we want to detect if there is any
+    # state change in the autograd global state and error. If the user is exporting under inference
+    # mode, we don't care.
+    is_grad_enabled = torch._C.is_grad_enabled()
+    grad_safe_guard = (
+        AutogradStateOpsFailSafeguard() if is_grad_enabled else nullcontext()
+    )
+
+    @contextmanager
+    def _compiling_state_context():
+        old_value = torch.compiler._is_compiling_flag
+        try:
+            torch.compiler._is_compiling_flag = True
+            yield
+        finally:
+            torch.compiler._is_compiling_flag = old_value
+
+    # This _reparametrize_module makes sure inputs and module.params/buffers have the same fake_mode,
+    # otherwise aot_export_module will error out because it sees a mix of fake_modes.
+    # And we want aot_export_module to use the fake_tensor mode in dynamo to keep the pipeline easy to reason about.
+    with torch.nn.utils.stateless._reparametrize_module(
+        mod, fake_params_buffers
+    ), grad_safe_guard, _ignore_backend_decomps(), _compiling_state_context():  # type: ignore[attr-defined]
+        gm, graph_signature = transform(aot_export_module)(
+            mod,
+            fake_args,
+            trace_joint=False,
+            pre_dispatch=pre_dispatch,
+            kwargs=fake_kwargs,
+        )
+    # TODO unfortunately preserving graph-level metadata is not
+    # working well with aot_export. So we manually copy it.
+    # (The node-level meta is addressed above.)
+    if isinstance(mod, torch.fx.GraphModule) and hasattr(mod, "meta"):
+        gm.meta.update(mod.meta)
+
+    if pre_dispatch:
+        from torch._export.passes.replace_set_grad_with_hop_pass import (
+            replace_set_grad_with_hop_pass,
+        )
+
+        gm = replace_set_grad_with_hop_pass(gm)
+
+    # NOTE: aot_export adds symint metadata for placeholders with int values;
+    # since these become specialized, we replace such metadata with the original values
+    flat_args = pytree.tree_leaves((fake_args, fake_kwargs))
+    index = 0
+    total_non_user_inputs = (
+        len(graph_signature.parameters)
+        + len(graph_signature.buffers)
+        + len(graph_signature.input_tokens)
+    )
+    for node in gm.graph.nodes:
+        if node.op == "placeholder":
+            if index >= total_non_user_inputs:
+                user_arg = flat_args[index - total_non_user_inputs]
+                if not isinstance(user_arg, torch.Tensor):
+                    node.meta["val"] = user_arg
+            index += 1
+
+    is_joint = graph_signature.backward_signature is not None
+
+    def make_argument_spec(node) -> ArgumentSpec:
+        if isinstance(node, (int, bool, float, type(None))):
+            # For const outputs we just directly return this
+            return ConstantArgument(value=node)
+
+        assert (
+            "val" in node.meta
+        ), f"{node} is not a constant or a node with a 'val' metadata field"
+        val = node.meta["val"]
+        if isinstance(val, FakeTensor):
+            return TensorArgument(name=node.name)
+        elif isinstance(val, torch.SymInt):
+            return SymIntArgument(name=node.name)
+        elif isinstance(val, torch.ScriptObject):
+            return CustomObjArgument(
+                name=node.name, class_fqn=val._type().qualified_name()  # type: ignore[attr-defined]
+            )
+        else:
+            # TODO: this branch is likely wrong, all permissible ConstantArgument type
+            # should have been handled already
+            return ConstantArgument(value=val)
+
+    input_specs, output_specs = _sig_to_specs(
+        user_inputs=set(graph_signature.user_inputs),
+        inputs_to_parameters=graph_signature.inputs_to_parameters,  # type: ignore[arg-type]
+        inputs_to_buffers=graph_signature.inputs_to_buffers,  # type: ignore[arg-type]
+        user_outputs=set(graph_signature.user_outputs),  # type: ignore[arg-type]
+        buffer_mutations=graph_signature.buffers_to_mutate,  # type: ignore[arg-type]
+        user_input_mutations=graph_signature.user_inputs_to_mutate,  # type: ignore[arg-type]
+        grad_params=graph_signature.backward_signature.gradients_to_parameters if is_joint else {},  # type: ignore[arg-type, union-attr]
+        grad_user_inputs=graph_signature.backward_signature.gradients_to_user_inputs if is_joint else {},  # type: ignore[arg-type, union-attr]
+        loss_output=graph_signature.backward_signature.loss_output if is_joint else None,  # type: ignore[arg-type, union-attr]
+        inputs=[
+            make_argument_spec(node)
+            for node in gm.graph.nodes
+            if node.op == "placeholder"
+        ],
+        outputs=[
+            make_argument_spec(node)
+            for node in pytree.tree_leaves(next(iter(reversed(gm.graph.nodes))).args)
+        ],
+        input_tokens=graph_signature.input_tokens,
+        output_tokens=graph_signature.output_tokens,
+    )
+    export_graph_signature = ExportGraphSignature(
+        input_specs=input_specs, output_specs=output_specs
+    )
+
+    constants = rewrite_script_object_meta(gm)
+    constants.update(lift_constants_pass(gm, export_graph_signature, constant_attrs))
+
+    @dataclasses.dataclass
+    class _ExportedProgramNonStrict:
+        gm: torch.fx.GraphModule
+        sig: ExportGraphSignature
+        constants: Dict[str, Union[torch.Tensor, torch._C.ScriptObject]]
+
+    return _ExportedProgramNonStrict(
+        gm,
+        export_graph_signature,
+        constants,
+    )
+
+
+def _get_params_buffers(mod: torch.nn.Module) -> Dict[str, torch.Tensor]:
+    params_buffers: Dict[str, torch.Tensor] = {}
+    for name, param in mod.named_parameters(remove_duplicate=False):
+        params_buffers[name] = param
+
+    for name, buffer in mod.named_buffers(remove_duplicate=False):
+        params_buffers[name] = buffer
+    return params_buffers
+
+
+def _rewrite_dynamo_tensor_constants(
+    orig_mod_buffers: Set[torch.Tensor],
+    traced_mod_buffers: Dict[str, torch.Tensor],
+    graph_signature: ExportGraphSignature,
+    constants: Dict[str, Union[torch.Tensor, torch.ScriptObject]],
+):
+    """Dynamo erroneously marks tensor attributes on modules as a buffers.
+
+    Rewrite them to be tensor constants.
+    """
+    for spec in graph_signature.input_specs:
+        if spec.kind == InputKind.BUFFER:
+            assert spec.target is not None
+            value = traced_mod_buffers[spec.target]
+            if value not in orig_mod_buffers:
+                # This was a tensor constant erroneously marked as a buffer.
+                # Convert it into a constant in the graph signature, and add its
+                # value to the constants table.
+                spec.kind = InputKind.CONSTANT_TENSOR
+                constants[spec.target] = value
+
+
+def _rewrite_non_persistent_buffers(
+    orig_mod: torch.nn.Module,
+    graph_signature: ExportGraphSignature,
+    constants: Dict[str, Union[torch.Tensor, torch.ScriptObject]],
+):
+    """Dynamo erroneously drops the persistent flag on buffers.
+
+    Rewrite non-persistent buffers to reflect the original module.
+    """
+    state_dict = orig_mod.state_dict()
+    for spec in graph_signature.input_specs:
+        if spec.kind == InputKind.BUFFER:
+            assert spec.target is not None
+            if spec.target not in state_dict:
+                assert spec.target not in constants
+                spec.persistent = False
+                constants[spec.target] = orig_mod.get_buffer(spec.target)
+
+
+def get_ep_stats(ep: ExportedProgram) -> Dict[str, Any]:
+    op_count = 0
+    op_set = set()
+    for m in ep.graph_module.modules():
+        if not isinstance(m, torch.fx.GraphModule):
+            continue
+        for node in m.graph.nodes:
+            if node.op != "call_function":
+                continue
+            op_count += 1
+            assert hasattr(node.target, "__module__")
+            assert hasattr(node.target, "__name__")
+            op_set.add(f"{node.target.__module__}.{node.target.__name__}")
+    return {"op_count": op_count, "op_set": op_set}
+
+
+_EXPORT_FLAGS: Optional[Set[str]] = None
+
+
+def _log_export_wrapper(fn):
+    @functools.wraps(fn)
+    def wrapper(*args, **kwargs):
+        global _EXPORT_FLAGS
+        try:
+            start = time.time()
+            ep = fn(*args, **kwargs)
+            end = time.time()
+            log_export_usage(
+                event="export.time",
+                metrics=end - start,
+                flags=_EXPORT_FLAGS,
+                **get_ep_stats(ep),
+            )
+        except Exception as e:
+            t = type(e)
+            error_type = t.__module__ + "." + t.__qualname__
+            log_export_usage(
+                event="export.error",
+                type=error_type,
+                message=str(e),
+                flags=_EXPORT_FLAGS,
+            )
+            raise e
+        finally:
+            _EXPORT_FLAGS = None
+
+        return ep
+
+    return wrapper
+
+
+@_log_export_wrapper
+@_disable_prexisiting_fake_mode
+def _export(
+    mod: torch.nn.Module,
+    args: Tuple[Any, ...],
+    kwargs: Optional[Dict[str, Any]] = None,
+    dynamic_shapes: Optional[Union[Dict[str, Any], Tuple[Any], List[Any]]] = None,
+    *,
+    strict: bool = True,
+    preserve_module_call_signature: Tuple[str, ...] = (),
+    pre_dispatch: bool = False,
+) -> ExportedProgram:
+    """
+    Traces either an nn.Module's forward function or just a callable with PyTorch
+    operations inside and produces an ExportedProgram.
+
+    Args:
+        mod: the `nn.Module` to trace.
+
+        args: example positional inputs.
+
+        kwargs: optional example keyword inputs.
+
+        dynamic_shapes:
+         An optional argument where the type should either be:
+         1) a dict from argument names of ``mod``'s forward function to their dynamic shape specifications,
+         2) a tuple that specifies dynamic shape specifications for each input in original order.
+         If you are specifying dynamism on keyword args, you will need to pass them in the order that
+         is defined in the original function signature.
+
+         The dynamic shape of a tensor argument can be specified as either
+         (1) a dict from dynamic dimension indices to :func:`Dim` types, where static
+         dimension indices do not need to be included but, if they are, they should be
+         mapped to None; or (2) a tuple / list of :func:`Dim` types or None, where the
+         :func:`Dim` types correspond to dynamic dimensions and static dimensions are
+         denoted by None. Arguments that are dicts or tuples / lists of tensors are
+         recursively specified by using mappings or sequences of contained specifications.
+
+        preserve_module_call_signature: A list of submodule paths for which the original
+            calling conventions are preserved as metadata.
+
+    Returns:
+        An ExportedProgram containing the traced method.
+    """
+    from .dynamic_shapes import _process_dynamic_shapes
+
+    global _EXPORT_FLAGS
+    flags = set()
+    flags.add("strict" if strict else "non_strict")
+    flags.add("pre_dispatch" if pre_dispatch else "aot_dispatch")
+    log_export_usage(event="export.enter", flags=flags)
+    _EXPORT_FLAGS = flags
+
+    constraints = _process_dynamic_shapes(mod, args, kwargs, dynamic_shapes) or []
+
+    kwargs = kwargs or {}
+
+    constant_attrs = _gather_constant_attrs(mod)
+
+    flat_args, orig_in_spec = pytree.tree_flatten((args, kwargs))
+
+    if not strict:
+        out_spec = None
+
+        module_call_specs: Dict[str, Dict[str, pytree.TreeSpec]] = {}
+
+        def strip_root(x):
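+            # e.g. (illustrative): strip_root("_export_root.linear") -> "linear"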
+            if isinstance(x, str) and x.startswith("_export_root"):
+                stripped = x[len("_export_root") :]
+                return stripped[1:] if stripped.startswith(".") else stripped
+            return x
+
+        def fixup_key(x):
+            return "L__self__" + strip_root(x)
+
+        def _tuplify_outputs(aot_export):
+            def _aot_export_non_strict(mod, args, kwargs=None, **flags):
+                kwargs = kwargs or {}
+
+                class Wrapper(torch.nn.Module):
+                    def __init__(self, mod):
+                        super().__init__()
+                        self._export_root = mod
+
+                    def forward(self, *args, **kwargs):
+                        nonlocal out_spec
+                        if isinstance(self._export_root, torch.fx.GraphModule):
+                            with torch.fx.traceback.preserve_node_meta():
+                                tree_out = torch.fx.Interpreter(self._export_root).run(
+                                    *args, **kwargs
+                                )
+                        else:
+                            tree_out = self._export_root(*args, **kwargs)
+                        flat_outs, out_spec = pytree.tree_flatten(tree_out)
+                        return tuple(flat_outs)
+
+                wrapped_mod = Wrapper(mod)
+                # Patch export_root to the signatures so that wrapper module correctly populates the
+                # in/out spec
+                new_preserved_call_signatures = [
+                    "_export_root." + i for i in preserve_module_call_signature
+                ]
+                with _wrap_submodules(
+                    wrapped_mod, new_preserved_call_signatures, module_call_specs
+                ):
+                    gm, sig = aot_export(wrapped_mod, args, kwargs=kwargs, **flags)
+
+                sig.parameters = pytree.tree_map(strip_root, sig.parameters)
+                sig.buffers = pytree.tree_map(strip_root, sig.buffers)
+                sig.inputs_to_buffers = pytree.tree_map(
+                    strip_root, sig.inputs_to_buffers
+                )
+                sig.inputs_to_parameters = pytree.tree_map(
+                    strip_root, sig.inputs_to_parameters
+                )
+                sig.buffers_to_mutate = pytree.tree_map(
+                    strip_root, sig.buffers_to_mutate
+                )
+                for node in gm.graph.nodes:
+                    if "nn_module_stack" in node.meta:
+                        nn_module_stack = node.meta["nn_module_stack"]
+                        node.meta["nn_module_stack"] = {
+                            fixup_key(key): val
+                            for key, val in pytree.tree_map(
+                                strip_root, nn_module_stack
+                            ).items()
+                        }
+
+                return gm, sig
+
+            return _aot_export_non_strict
+
+        (
+            fake_mode,
+            fake_args,
+            fake_kwargs,
+            equalities_inputs,
+            original_signature,
+        ) = make_fake_inputs(mod, args, kwargs, constraints)
+
+        fake_params_buffers = make_fake_params_buffers(
+            fake_mode, _get_params_buffers(mod)
+        )
+        with fake_mode:
+            ep_non_strict = _export_non_strict(
+                mod,
+                fake_args,
+                fake_kwargs,
+                fake_params_buffers,
+                constant_attrs,
+                pre_dispatch=pre_dispatch,
+                transform=_tuplify_outputs,
+            )
+        try:
+            range_constraints = make_constraints(
+                fake_mode,
+                equalities_inputs,
+                original_signature,
+                ep_non_strict.gm,
+            )
+        except (ConstraintViolationError, ValueRangeError) as e:
+            raise UserError(UserErrorType.CONSTRAINT_VIOLATION, str(e))  # noqa: TRY200
+
+        assert out_spec is not None
+
+        gm = ep_non_strict.gm
+
+        module_call_signatures = {
+            strip_root(fqn): ModuleCallSignature(inputs=[], outputs=[], **specs)
+            for fqn, specs in module_call_specs.items()
+        }
+
+        if len(preserve_module_call_signature) > 0:
+            for node in gm.graph.nodes:
+                if node.target == torch.ops.higher_order._export_tracepoint:
+                    if "path" in node.kwargs:
+                        path = strip_root(node.kwargs["path"])
+                        with gm.graph.inserting_before(node):
+                            new_node = gm.graph.create_node(
+                                "call_function",
+                                torch.ops.higher_order._export_tracepoint,
+                                args=node.args,
+                                kwargs={
+                                    "path": path,
+                                    "kind": node.kwargs["kind"],
+                                },
+                            )
+                            node.replace_all_uses_with(new_node)
+                            gm.graph.erase_node(node)
+
+            res = CollectTracepointsPass(module_call_signatures, ep_non_strict.sig)(gm)
+            assert res is not None
+            gm = res.graph_module
+
+        _rewrite_non_persistent_buffers(mod, ep_non_strict.sig, ep_non_strict.constants)
+        return ExportedProgram(
+            root=gm,
+            graph=gm.graph,
+            graph_signature=ep_non_strict.sig,
+            state_dict=mod.state_dict(keep_vars=True),
+            range_constraints=range_constraints,
+            module_call_graph=[
+                ModuleCallEntry(
+                    "",
+                    ModuleCallSignature(
+                        inputs=[], outputs=[], in_spec=orig_in_spec, out_spec=out_spec
+                    ),
+                )
+            ]
+            + [
+                ModuleCallEntry(fqn, sig) for fqn, sig in module_call_signatures.items()
+            ],
+            example_inputs=(args, kwargs),
+            constants=ep_non_strict.constants,
+        )
+
+    gm_torch_level = _export_to_torch_ir(
+        mod,
+        args,
+        kwargs,
+        constraints,
+        preserve_module_call_signature=preserve_module_call_signature,
+        restore_fqn=False,  # don't need to restore because we will do it later
+        _log_export_usage=False,
+    )
+
+    # We detect the fake_mode by looking at gm_torch_level's placeholders, this is the fake_mode created in dynamo.
+    (
+        fake_args,
+        fake_kwargs,
+        fake_params_buffers,
+        dynamo_fake_mode,
+    ) = _convert_input_to_fake(gm_torch_level, args, kwargs)
+
+    # First, we want to pass through the graph to try populating
+    # val field for getattr if there is anything missing.
+    # This can happen when quantization adds extra params and forgets
+    # to update "val"
+    for node in gm_torch_level.graph.nodes:
+        if node.op == "get_attr" and "val" not in node.meta:
+            attr = getattr(gm_torch_level, node.target)
+            # Checks if it is not a HigherOrderOp branch or a module
+            if not isinstance(attr, torch.nn.Module):
+                assert (
+                    dynamo_fake_mode is not None
+                ), "Cannot find dynamo_fake_mode. This could be due to the exported graph module have no placeholders."
+                node.meta["val"] = dynamo_fake_mode.from_tensor(
+                    attr, static_shapes=True
+                )
+
+    # When aot_export lifts the params, we lose the nn_module_stack
+    # and source_fn from the param nodes as they are treated as fresh inputs
+    # Therefore, we manually extract them before calling into aot_export
+    params_buffers_to_node_meta = {}
+    for node in gm_torch_level.graph.nodes:
+        target = node.target
+        meta = node.meta
+        if node.op == "call_module":
+            submodule = getattr(gm_torch_level, target)
+            if isinstance(submodule, torch.nn.Module):
+                for name, _ in submodule.named_parameters(
+                    recurse=True, remove_duplicate=False
+                ):
+                    params_buffers_to_node_meta[target + "." + name] = meta
+
+                for name, _ in submodule.named_buffers(
+                    recurse=True, remove_duplicate=False
+                ):
+                    params_buffers_to_node_meta[target + "." + name] = meta
+
+        if node.op == "get_attr":
+            submodule = getattr(gm_torch_level, target)
+            if not isinstance(submodule, torch.fx.GraphModule):
+                params_buffers_to_node_meta[target] = meta
+
+        # If the call_function uses param as input, we also need to update params' meta
+        # with this call_function node's meta.
+        # This is basically the same flow as torch.fx.traceback.preserve_meta()
+        if node.op == "call_function" and not isinstance(
+            node.target, torch._ops.HigherOrderOperator
+        ):
+            for arg in node._input_nodes:
+                if arg.op == "get_attr":
+                    for entry in torch.fx.proxy._COPY_META_FIELDS:
+                        if entry in meta:
+                            params_buffers_to_node_meta[arg.target][entry] = meta[entry]
+
+    # Fix the graph output signature to be tuple if scalar
+    out_spec = orig_out_spec = gm_torch_level._out_spec
+    assert out_spec is not None
+    # aot_export expect the return type to always be a tuple.
+    if out_spec.type not in (list, tuple):
+        out_spec = pytree.TreeSpec(tuple, None, [out_spec])
+
+    orig_arg_names = gm_torch_level.graph._codegen.pytree_info.orig_args  # type: ignore[attr-defined]
+
+    gm_torch_level.graph._codegen = _PyTreeCodeGen(
+        _PyTreeInfo(
+            orig_arg_names,
+            gm_torch_level._in_spec,
+            out_spec,
+        )
+    )
+    gm_torch_level.recompile()
+
+    _normalize_nn_module_stack(gm_torch_level, type(mod))
+
+    # NOTE: graph module expects only positional args
+    ep_non_strict = _export_non_strict(
+        gm_torch_level,
+        _convert_to_positional_args(orig_arg_names, fake_args, fake_kwargs),
+        {},
+        fake_params_buffers,
+        constant_attrs,
+        pre_dispatch=pre_dispatch,
+    )
+
+    gm = ep_non_strict.gm
+    export_graph_signature = ep_non_strict.sig
+    constants = ep_non_strict.constants
+
+    # After aot_export, set the param/buffer metadata back into placeholders
+    # Technically, users can still construct this data from param names
+    # without relying on this metadata
+    for node in gm.graph.nodes:
+        if node.op == "placeholder":
+            if node.target in export_graph_signature.inputs_to_parameters:
+                param_name = export_graph_signature.inputs_to_parameters[node.target]
+                if param_name in params_buffers_to_node_meta:
+                    for k, v in params_buffers_to_node_meta[param_name].items():
+                        node.meta[k] = v
+            if node.target in export_graph_signature.inputs_to_buffers:
+                buffer_name = export_graph_signature.inputs_to_buffers[node.target]
+                if buffer_name in params_buffers_to_node_meta:
+                    for k, v in params_buffers_to_node_meta[buffer_name].items():
+                        node.meta[k] = v
+
+    # The unbacked symint symbols are updated in aot_export
+    # so we serialize them here instead of inside dynamo
+
+    gm.meta["inline_constraints"] = {
+        k: v
+        for k, v in dynamo_fake_mode.shape_env.var_to_range.items()
+        if free_unbacked_symbols(k)
+    }
+
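+    # Number of inputs lifted by aot_export (params/buffers/constants): the
+    # index of the first USER_INPUT spec, or all inputs if there is no user input.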
+    num_lifted = next(
+        (
+            i
+            for i, s in enumerate(export_graph_signature.input_specs)
+            if s.kind == InputKind.USER_INPUT
+        ),
+        len(export_graph_signature.input_specs),
+    )
+    range_constraints = _process_constraints(
+        dynamo_fake_mode,
+        gm,
+        num_lifted,
+        flat_args,
+    )
+
+    # Do some cleanups on the graph module to restore the state dict to the
+    # expected form. Each of these steps should probably get fixed upstream.
+    # 1. Remove tensor constants that were added as buffers.
+    _rewrite_dynamo_tensor_constants(
+        orig_mod_buffers=set(mod.buffers()),
+        traced_mod_buffers=dict(gm_torch_level.named_buffers()),
+        graph_signature=ep_non_strict.sig,
+        constants=ep_non_strict.constants,
+    )
+    # 2. Restore FQN of param/buffers
+    param_buffer_table: Dict[str, str] = _get_param_buffer_mapping(mod, gm_torch_level)
+    _replace_param_buffer_names(param_buffer_table, export_graph_signature)
+
+    # 3. Remove non-persistent buffers from the graph signature
+    _rewrite_non_persistent_buffers(mod, ep_non_strict.sig, ep_non_strict.constants)
+
+    # 4. Rewrite constants to have the same FQN as the original module.
+    _remap_constants(constant_attrs, export_graph_signature, constants)
+
+    module_call_signatures = {
+        fqn: ModuleCallSignature(inputs=[], outputs=[], **specs)
+        for fqn, specs in gm_torch_level.meta["module_call_specs"].items()
+    }
+
+    if len(preserve_module_call_signature) > 0:
+        res = CollectTracepointsPass(module_call_signatures, export_graph_signature)(gm)
+        assert res is not None
+        gm = res.graph_module
+
+    assert orig_out_spec is not None
+    exported_program = ExportedProgram(
+        root=gm,
+        graph=gm.graph,
+        graph_signature=export_graph_signature,
+        state_dict=mod.state_dict(keep_vars=True),
+        range_constraints=range_constraints,
+        module_call_graph=[
+            ModuleCallEntry(
+                "",
+                ModuleCallSignature(
+                    inputs=[], outputs=[], in_spec=orig_in_spec, out_spec=orig_out_spec
+                ),
+            )
+        ]
+        + [ModuleCallEntry(fqn, sig) for fqn, sig in module_call_signatures.items()],
+        example_inputs=(args, kwargs),
+        constants=constants,
+    )
+    log.debug("Exported program from AOTAutograd:\n%s", exported_program)
+
+    if len(range_constraints) > 0:
+        exported_program = exported_program._transform_do_not_use(
+            _AddRuntimeAssertionsForInlineConstraintsPass(range_constraints)
+        )
+
+    return exported_program
diff --git a/MLPY/Lib/site-packages/torch/export/_tree_utils.py b/MLPY/Lib/site-packages/torch/export/_tree_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..827793ae94febdfd10a869f48720b8f41eea2a40
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/export/_tree_utils.py
@@ -0,0 +1,64 @@
+from typing import Any, Callable, Dict, Optional
+
+from torch.utils._pytree import Context, TreeSpec
+
+
+def reorder_kwargs(user_kwargs: Dict[str, Any], spec: TreeSpec) -> Dict[str, Any]:
+    """Reorder user-provided kwargs to match the order in `spec`. `spec` is
+    expected to be the in_spec of an exported program, i.e. the spec that
+    results from flattening `(args, kwargs)`.
+
+    We need this to provide consistent input ordering, so that users can
+    pass in foo(a=a, b=b) OR foo(b=b, a=a) and receive the same result.
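+
+    Example (an illustrative sketch; ``ep`` is assumed to be an exported program)::
+
+        in_spec = ep.call_spec.in_spec              # spec of flattened (args, kwargs)
+        user_kwargs = {"b": b, "a": a}              # arbitrary key order
+        ordered = reorder_kwargs(user_kwargs, in_spec)
+        # keys of ``ordered`` now follow the order recorded in the spec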
+    """
+    # Make sure that the spec is actually shaped like (args, kwargs)
+    assert spec.type is tuple
+    assert spec.num_children == 2
+    kwargs_spec = spec.children_specs[1]
+    assert kwargs_spec.type is dict
+
+    if set(user_kwargs) != set(kwargs_spec.context):
+        raise ValueError(
+            f"kwarg key mismatch: "
+            f"Got {list(user_kwargs)} but expected {kwargs_spec.context}"
+        )
+
+    reordered_kwargs = {}
+    for kw in kwargs_spec.context:
+        reordered_kwargs[kw] = user_kwargs[kw]
+
+    return reordered_kwargs
+
+
+def is_equivalent(
+    spec1: TreeSpec,
+    spec2: TreeSpec,
+    equivalence_fn: Callable[[Optional[type], Context, Optional[type], Context], bool],
+) -> bool:
+    """Customizable equivalence check for two TreeSpecs.
+
+    Arguments:
+        spec1: The first TreeSpec to compare
+        spec2: The second TreeSpec to compare
+        equivalence_fn: A function to determine the equivalence of two
+            TreeSpecs by examining their types and contexts. It will be called like:
+
+                equivalence_fn(spec1.type, spec1.context, spec2.type, spec2.context)
+
+            This function will be applied recursively to all children.
+
+    Returns:
+        True if the two TreeSpecs are equivalent, False otherwise.
+    """
+    if not equivalence_fn(spec1.type, spec1.context, spec2.type, spec2.context):
+        return False
+
+    # Recurse on children
+    if len(spec1.children_specs) != len(spec2.children_specs):
+        return False
+
+    for child_spec1, child_spec2 in zip(spec1.children_specs, spec2.children_specs):
+        if not is_equivalent(child_spec1, child_spec2, equivalence_fn):
+            return False
+
+    return True
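+
+
+# A minimal usage sketch (illustrative): compare two TreeSpecs structurally,
+# requiring matching node types but ignoring their context payloads:
+#
+#     same_structure = is_equivalent(
+#         spec_a, spec_b, lambda t1, c1, t2, c2: t1 is t2
+#     )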
diff --git a/MLPY/Lib/site-packages/torch/export/_unlift.py b/MLPY/Lib/site-packages/torch/export/_unlift.py
new file mode 100644
index 0000000000000000000000000000000000000000..0171c94ddc7d0372732c98aace680e6f8d565946
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/export/_unlift.py
@@ -0,0 +1,314 @@
+import copy
+from itertools import chain
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+import torch.utils._pytree as pytree
+from torch._export.utils import _check_input_constraints_for_graph
+from torch.export.unflatten import _assign_attr, _AttrKind
+from torch.fx.graph import _PyTreeCodeGen, _PyTreeInfo
+from ._remove_effect_tokens_pass import _remove_effect_tokens
+
+from .exported_program import (
+    ExportedProgram,
+    ExportGraphSignature,
+    InputKind,
+    OutputKind,
+)
+
+
+@torch._dynamo.disable
+def _check_input_constraints_pre_hook(self, *args, **kwargs):
+    flat_args_with_path, received_spec = pytree.tree_flatten_with_path(args)
+
+    if received_spec != self._in_spec:
+        raise ValueError(  # noqa: TRY200
+            "Trying to flatten user inputs with exported input tree spec: \n"
+            f"{self._in_spec}\n"
+            "but actually got inputs with tree spec of: \n"
+            f"{received_spec}"
+        )
+
+    return _check_input_constraints_for_graph(
+        [node for node in self.graph.nodes if node.op == "placeholder"],
+        flat_args_with_path,
+        self.range_constraints,
+    )
+
+
+def _unlift_inputs_as_getattr(
+    gm: torch.fx.GraphModule,
+    lifted_inputs: List[Optional[str]],
+) -> Tuple[Dict[str, torch.fx.Node], Dict[str, torch.fx.Node]]:
+    """
+    Unlift inputs referring to params/buffers/constants as getattr nodes in the
+    graph
+    """
+    unlifted_name_to_node = {}
+    input_name_to_node = {}
+
+    placeholder_nodes = [node for node in gm.graph.nodes if node.op == "placeholder"]
+    assert len(lifted_inputs) == len(placeholder_nodes)
+    for input_node, lifted_node in zip(placeholder_nodes, lifted_inputs):
+        if lifted_node is None:
+            input_name_to_node[input_node.name] = input_node
+
+        else:
+            with gm.graph.inserting_after(input_node):
+                getattr_node = gm.graph.get_attr(lifted_node)
+                input_node.replace_all_uses_with(getattr_node)
+                metadata = input_node.meta
+                gm.graph.erase_node(input_node)
+                getattr_node.meta = metadata
+                unlifted_name_to_node[lifted_node] = getattr_node
+
+    return unlifted_name_to_node, input_name_to_node
+
+
+def _insert_copy_for_mutations(
+    gm: torch.fx.GraphModule,
+    mutated_outputs: List[Optional[str]],
+    unlifted_name_to_node: Dict[str, torch.fx.Node],
+    input_name_to_node: Dict[str, torch.fx.Node],
+) -> None:
+    """
+    Find all the buffers and inputs that were mutated and insert copy_
+    operators to reflect mutations.
+    """
+    output_node = None
+    for node in gm.graph.nodes:
+        if node.op == "output":
+            output_node = node
+            break
+    assert output_node is not None
+    outputs = pytree.tree_flatten(output_node.args)[0]
+    assert len(outputs) == len(mutated_outputs)
+
+    user_output_nodes = []
+    for return_node, mutated_node_name in zip(outputs, mutated_outputs):
+        if mutated_node_name is None:
+            user_output_nodes.append(return_node)
+            continue
+
+        if mutated_node_name in unlifted_name_to_node:
+            mutated_node = unlifted_name_to_node[mutated_node_name]
+        elif mutated_node_name in input_name_to_node:
+            mutated_node = input_name_to_node[mutated_node_name]
+        else:
+            raise RuntimeError(
+                f"Could not find {mutated_node_name} in either buffer or input nodes"
+            )
+
+        with gm.graph.inserting_before(output_node):
+            _ = gm.graph.call_function(
+                torch.ops.aten.copy_.default, (mutated_node, return_node)
+            )
+
+    with gm.graph.inserting_before(output_node):
+        # Only return user outputs
+        new_output = gm.graph.output(tuple(user_output_nodes))
+        output_node.replace_all_uses_with(new_output)
+        gm.graph.erase_node(output_node)
+
+
+def _get_codegen(
+    in_spec: pytree.TreeSpec,
+    out_spec: Optional[pytree.TreeSpec],
+) -> _PyTreeCodeGen:
+    """
+    Create the codegen for the graph module based on the in/out specs
+    """
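+    # Example (illustrative): an in_spec flattened from ((a, b), {"c": c})
+    # yields names ["arg_0", "arg_1", "c"]; any other spec shape falls back to
+    # purely positional names ["arg_0", ..., "arg_{n-1}"].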
+    if (
+        in_spec.type == tuple
+        and in_spec.num_children == 2
+        and in_spec.children_specs[0].type == tuple
+        and in_spec.children_specs[1].type == dict
+    ):
+        # if in_spec contains the args (tuple) and kwargs (dict)
+        names = [f"arg_{i}" for i in range(in_spec.children_specs[0].num_children)]
+        # add kwarg names
+        names.extend(in_spec.children_specs[1].context)
+    else:
+        names = [f"arg_{i}" for i in range(in_spec.num_children)]
+
+    return _PyTreeCodeGen(
+        _PyTreeInfo(
+            names,
+            in_spec,
+            out_spec,
+        )
+    )
+
+
+def _unlift(
+    gm: torch.fx.GraphModule,
+    lifted_inputs: List[Optional[str]],
+    mutated_outputs: List[Optional[str]],
+    in_spec: pytree.TreeSpec,
+    out_spec: Optional[pytree.TreeSpec],
+    state_dict: Dict[str, Any],
+    constants: Dict[str, Any],
+):
+    """
+    Args:
+        lifted_inputs: A list matching the graph module's input nodes. For
+        an input node that is referring to a lifted parameter/buffer, this
+        list will contain the fqn of the corresponding attribute. Otherwise, this
+        list will contain None. This is used to unlift the lifted parameters as
+        get_attr nodes.
+
+        mutated_outputs: A list matching the graph module's output nodes. For
+        an output node that is referring to a mutated buffer or user input, this
+        list will contain the name of the corresponding buffer or user input
+        that needs to be mutated. Otherwise, this list will contain None. This
+        is used to re-insert an inplace copy_ operator to copy the mutated
+        values back to the original node.
+    """
+    unlifted_name_to_node, input_name_to_node = _unlift_inputs_as_getattr(
+        gm, lifted_inputs
+    )
+    _insert_copy_for_mutations(
+        gm, mutated_outputs, unlifted_name_to_node, input_name_to_node
+    )
+    gm.graph._codegen = _get_codegen(in_spec, out_spec)
+    gm.graph.lint()
+    gm.graph.eliminate_dead_code()
+    gm.recompile()
+    return gm
+
+
+def _register_attrs_to_new_gm(
+    new_gm: torch.fx.GraphModule,
+    graph_signature: ExportGraphSignature,
+    state_dict: Dict[str, Any],
+    constants: Dict[str, Any],
+) -> None:
+    non_persistent_buffers = set(graph_signature.non_persistent_buffers)
+    for name in graph_signature.buffers:
+        if name in non_persistent_buffers:
+            persistent = False
+            value = constants[name]
+        else:
+            persistent = True
+            value = state_dict[name]
+        _assign_attr(
+            value, new_gm, name, attr_kind=_AttrKind.BUFFER, persistent=persistent
+        )
+    for name in graph_signature.parameters:
+        value = state_dict[name]
+        _assign_attr(
+            value,
+            new_gm,
+            name,
+            attr_kind=_AttrKind.PARAMETER,
+        )
+
+    for name in chain(
+        graph_signature.lifted_custom_objs, graph_signature.lifted_tensor_constants
+    ):
+        value = constants[name]
+        _assign_attr(
+            value,
+            new_gm,
+            name,
+            attr_kind=_AttrKind.CONSTANT,
+        )
+
+
+class _StatefulGraphModuleFactory(type):
+    """
+    Metaclass that ensures a private constructor for _StatefulGraphModule
+    """
+
+    def __call__(cls, *args, **kwargs):
+        raise TypeError(
+            f"{cls.__module__}.{cls.__qualname__} has no public constructor. "
+        )
+
+    def _create(cls, root, graph, range_constraints=None):
+        return super().__call__(
+            root,
+            graph,
+            range_constraints=range_constraints,
+        )
+
+
+class _StatefulGraphModule(torch.fx.GraphModule, metaclass=_StatefulGraphModuleFactory):
+    def __init__(self, root, graph, range_constraints=None):
+        super().__init__(root, graph)
+        # Need to fix up non-persistent buffers.
+        self.range_constraints = range_constraints or []
+
+
+def _create_stateful_graph_module(
+    plain_graph_module: torch.fx.GraphModule,
+    range_constraints,
+    # TODO(suo): this should not be optional, but it is for now because we
+    # still have capture_pre_autograd_graph
+    graph_signature: Optional[ExportGraphSignature] = None,
+):
+    stateful_gm = _StatefulGraphModule._create(
+        plain_graph_module,
+        plain_graph_module.graph,
+        range_constraints=range_constraints,
+    )
+    stateful_gm.register_forward_pre_hook(
+        _check_input_constraints_pre_hook, with_kwargs=True
+    )
+
+    if graph_signature is None:
+        return stateful_gm
+    # Fix up non-persistent buffers. torch.fx does not distinguish between
+    # persistent and non-persistent buffers, so we must restore that distinction
+    # here.
+    for buffer in graph_signature.non_persistent_buffers:
+        _assign_attr(
+            plain_graph_module.get_buffer(buffer),
+            stateful_gm,
+            buffer,
+            attr_kind=_AttrKind.BUFFER,
+            persistent=False,
+        )
+
+    return stateful_gm
+
+
+def _unlift_exported_program_lifted_states(ep: ExportedProgram) -> torch.nn.Module:
+    ep = _remove_effect_tokens(ep)
+    new_gm = torch.fx.GraphModule(ep.graph_module, copy.deepcopy(ep.graph))
+    _register_attrs_to_new_gm(new_gm, ep.graph_signature, ep.state_dict, ep.constants)
+
+    lifted_inputs: List[Optional[str]] = [
+        in_spec.target
+        if in_spec.kind
+        in (
+            InputKind.BUFFER,
+            InputKind.CONSTANT_TENSOR,
+            InputKind.PARAMETER,
+            InputKind.CUSTOM_OBJ,
+        )
+        else None
+        for in_spec in ep.graph_signature.input_specs
+    ]
+
+    mutated_outputs: List[Optional[str]] = [
+        out_spec.target
+        if out_spec.kind in (OutputKind.BUFFER_MUTATION, OutputKind.USER_INPUT_MUTATION)
+        else None
+        for out_spec in ep.graph_signature.output_specs
+    ]
+
+    new_gm = _unlift(
+        new_gm,
+        lifted_inputs,
+        mutated_outputs,
+        ep.call_spec.in_spec,
+        ep.call_spec.out_spec,
+        ep.state_dict,
+        ep.constants,
+    )
+    unlift_gm = _create_stateful_graph_module(
+        new_gm, ep.range_constraints, ep.graph_signature
+    )
+    unlift_gm.meta.update(ep.graph_module.meta)
+    return unlift_gm
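+
+
+# Illustrative note: this helper is what typically backs ``ExportedProgram.module()``.
+# A hypothetical flow looks like:
+#
+#     unlifted = _unlift_exported_program_lifted_states(ep)
+#     out = unlifted(*example_args)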
diff --git a/MLPY/Lib/site-packages/torch/export/custom_obj.py b/MLPY/Lib/site-packages/torch/export/custom_obj.py
new file mode 100644
index 0000000000000000000000000000000000000000..12b04215c31fb79af34511606600a856bc5ba6a8
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/export/custom_obj.py
@@ -0,0 +1,16 @@
+from dataclasses import dataclass
+
+
+__all__ = ["ScriptObjectMeta"]
+
+
+@dataclass
+class ScriptObjectMeta:
+    """
+    Metadata which is stored on nodes representing ScriptObjects.
+    """
+
+    # Key into constants table to retrieve the real ScriptObject.
+    constant_name: str
+
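+    # Fully qualified class name of the ScriptObject's type.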
+    class_fqn: str
diff --git a/MLPY/Lib/site-packages/torch/export/dynamic_shapes.py b/MLPY/Lib/site-packages/torch/export/dynamic_shapes.py
new file mode 100644
index 0000000000000000000000000000000000000000..a34447908ca3a836598c3e237e24da789cc7900d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/export/dynamic_shapes.py
@@ -0,0 +1,876 @@
+import builtins
+import dataclasses
+import inspect
+import math
+import sys
+import weakref
+from collections import defaultdict
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TYPE_CHECKING, Union
+
+import torch
+from torch._subclasses.fake_tensor import FakeTensor
+from torch.utils._pytree import SUPPORTED_NODES
+
+from .exported_program import ExportedProgram
+
+if TYPE_CHECKING:
+    from sympy import Symbol
+
+    from torch._guards import Source
+
+    from ..fx.experimental.symbolic_shapes import ShapeEnv, StrictMinMaxConstraint
+
+__all__ = ["Constraint", "Dim", "dims", "dynamic_dim"]
+
+
+class _Dim(type):
+    """
+    Metaclass for :func:`Dim` types.
+    """
+
+    @staticmethod
+    def readable(name, min_, max_):
+        if min_ == 2:
+            min_ = None
+        if max_ == sys.maxsize - 1:
+            max_ = None
+        if min_ is None and max_ is None:
+            return f"Dim('{name}')"
+        if min_ is None:
+            return f"Dim('{name}', max={max_})"
+        if max_ is None:
+            return f"Dim('{name}', min={min_})"
+        return f"Dim('{name}', min={min_}, max={max_})"
+
+    def __add__(cls, other):
+        # e.g., dim + 1
+        if type(other) is not int:
+            raise NotImplementedError(
+                f"Attempted to add {other} to {cls.__name__}, where an integer was expected. "
+                "(Only increasing linear operations with integer coefficients are supported.)"
+            )
+        return cls._derive(lambda x: x + other)
+
+    def __radd__(cls, other):
+        return cls + other
+
+    def __sub__(cls, other):
+        # e.g., dim - 1
+        if type(other) is not int:
+            raise NotImplementedError(
+                f"Attempted to subtract {other} from {cls.__name__}, where an integer was expected. "
+                "(Only increasing linear operations with integer coefficients are supported.)"
+            )
+        return cls._derive(lambda x: x - other)
+
+    def __rsub__(cls, other):
+        raise NotImplementedError(
+            f"Attempted to negate {cls.__name__}. "
+            "(Only increasing linear operations with integer coefficients are supported.)"
+        )
+
+    def __mul__(cls, other):
+        # e.g., dim * 2
+        if type(other) is not int or other <= 0:
+            raise NotImplementedError(
+                f"Attempted to multiply {other} with {cls.__name__}, where a positive integer was expected. "
+                "(Only increasing linear operations with integer coefficients are supported.)"
+            )
+        return cls._derive(lambda x: x * other)
+
+    def __rmul__(cls, other):
+        return cls * other
+
+    def _derived_name(cls, fn):
+        from sympy import sympify
+
+        return str(fn(sympify(cls.__name__)))
+
+    def _derive(cls, fn):
+        return _DerivedDim(cls._derived_name(fn), (int,), {"root": cls, "fn": fn})
+
+
+class _DerivedDim(_Dim):
+    """
+    Metaclass for derived :func:`Dim` types.
+
+    Currently we only support increasing linear expressions with integer coefficients.
+    In other words, a derived Dim can always be written in the form Ax + B, where
+    x is a regular Dim (i.e., non-derived Dim), A and B are integers, and A is positive.
+    (In particular, the latter ensures that x < y => Ax + B < Ay + B.)
+    These restrictions on the form of derived Dims make the metatheory simpler: e.g.,
+    it simplifies computing ranges for derived Dims, solving for underlying regular Dims,
+    deciding equalities between derived Dims, and so on.
+
+    The function lambda x: Ax + B is expressed by `fn`, where x is a normal Dim, `root`.
+    The range of a derived Dim is computed by mapping `fn` over the range of its `root`.
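+
+    For example (an illustrative sketch)::
+
+        dx = Dim("dx", min=4, max=256)
+        dy = 2 * dx + 1   # derived Dim: root is ``dx``, fn maps x to 2*x + 1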
+    """
+
+    @property
+    def min(self):
+        # assume that self.fn is an increasing function
+        # TODO(avik): use sympy value range analysis instead?
+        from sympy import Integer
+
+        _min_symint = self.fn(Integer(self.root.min))  # type: ignore[attr-defined]
+        assert _min_symint >= 2, (
+            f"Expected derived min value of {self.__name__} to be >= 2. "
+            f"Please specify an appropriate min value for {self.root.__name__} "  # type: ignore[attr-defined]
+            f"(currently {self.root.min})."  # type: ignore[attr-defined]
+        )
+        return int(_min_symint)
+
+    @property
+    def max(self):
+        # assume that self.fn is an increasing function
+        # TODO(avik): use sympy value range analysis instead?
+        from sympy import Integer
+
+        _max_symint = self.fn(Integer(self.root.max))  # type: ignore[attr-defined]
+        assert _max_symint <= sys.maxsize - 1, (
+            f"Expected derived max value of {self.__name__} to be <= {sys.maxsize - 1}. "
+            f"Please specify an appropriate max value for {self.root.__name__} "  # type: ignore[attr-defined]
+            f"(currently {self.root.max})."  # type: ignore[attr-defined]
+        )
+        return int(_max_symint)
+
+    def _derive(self, fn):
+        # We support nesting, e.g., 2*dim + 1.
+        # This is implemented by composing operations on the same root.
+        # As a consequence, roots are always regular Dims (i.e., not derived Dims).
+        return _DerivedDim(
+            self._derived_name(fn),
+            (int,),
+            {"root": self.root, "fn": lambda x: fn(self.fn(x))},  # type: ignore[attr-defined]
+        )
+
+
+def Dim(name: str, *, min: Optional[int] = None, max: Optional[int] = None):
+    """
+    :func:`Dim` constructs a type analogous to a named symbolic integer with a range.
+    It can be used to describe multiple possible values of a dynamic tensor dimension.
+    Note that different dynamic dimensions of the same tensor, or of different tensors,
+    can be described by the same type.
+
+    Args:
+        name (str): Human-readable name for debugging.
+        min (Optional[int]): Minimum possible value of given symbol (inclusive)
+        max (Optional[int]): Maximum possible value of given symbol (inclusive)
+
+    Returns:
+        A type that can be used in dynamic shape specifications for tensors.
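+
+    Example (an illustrative sketch, assuming ``mod.forward`` takes a single tensor ``x``)::
+
+        batch = Dim("batch", max=1024)
+        ep = torch.export.export(mod, (x,), dynamic_shapes={"x": {0: batch}})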
+    """
+    _min = 2 if min is None else builtins.max(min, 2)
+    _max = sys.maxsize - 1 if max is None else builtins.min(max, sys.maxsize - 1)
+    assert _max > _min, f"Cannot create Dim with inconsistent min={min}, max={max}"
+    dim = _Dim(name, (int,), {"min": _min, "max": _max})
+    dim.__module__ = getattr(
+        inspect.getmodule(inspect.stack()[1][0]), "__name__", "__main__"
+    )
+    return dim
+
+
+def dims(*names: str, min: Optional[int] = None, max: Optional[int] = None):
+    """
+    Util to create multiple :func:`Dim` types.
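+
+    Example (illustrative)::
+
+        H, W = dims("H", "W", max=4096)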
+    """
+    return tuple(Dim(name, min=min, max=max) for name in names)
+
+
+@dataclasses.dataclass
+class _ConstraintTarget:
+    """
+    This represents input tensor dimensions.  Don't create this
+    class directly; instead, use :func:`dynamic_dim`.
+    """
+
+    w_tensor: Any  # weakref to torch.Tensor
+    # TODO: We don't need t_id; we can get it off of w_tensor
+    t_id: int
+    dim: int
+
+
+class _ConstraintFactory(type):
+    """
+    Metaclass that ensures a private constructor for :class:`_Constraint`
+    """
+
+    def __call__(cls, *args, **kwargs):
+        raise TypeError(
+            f"{cls.__module__}.{cls.__qualname__} has no public constructor. "
+            f"Please use torch.export.dynamic_dim() to create one"
+        )
+
+    def _create(
+        cls, w_tensor, t_id, dim, constraint_range, shared=None, debug_name=None
+    ):
+        return super().__call__(
+            w_tensor, t_id, dim, constraint_range, shared, debug_name
+        )
+
+
+def _create_constraint(
+    w_tensor, t_id, dim, constraint_range, shared=None, debug_name=None
+):
+    return _Constraint._create(
+        w_tensor, t_id, dim, constraint_range, shared, debug_name
+    )
+
+
+@dataclasses.dataclass
+class _Constraint(_ConstraintTarget, metaclass=_ConstraintFactory):
+    """
+
+    .. warning::
+        Do not construct :class:`_Constraint` directly, use :func:`dynamic_dim` instead.
+
+    This represents constraints on input tensor dimensions, e.g., requiring
+    them to be fully polymorphic or within some range.
+
+    """
+
+    # NOTE(avik): In the future, this could be Union[StrictMinMaxConstraint, ]
+    constraint_range: "StrictMinMaxConstraint"
+    # Represent that `constraint_range` is shared with another _ConstraintTarget, which
+    # typically arises because of a specified equality with another dynamic dimension.
+    shared: Optional[_ConstraintTarget] = None
+    debug_name: Optional[str] = None
+
+    def _clone_with_range(self, lower=2, upper=math.inf):
+        # Import sympy locally
+        from torch.fx.experimental.symbolic_shapes import StrictMinMaxConstraint
+        from torch.utils._sympy.value_ranges import ValueRanges
+
+        constraint_range = StrictMinMaxConstraint(
+            vr=self.constraint_range.vr & ValueRanges(lower=lower, upper=upper),
+            warn_only=False,
+        )
+        return _create_constraint(
+            self.w_tensor,
+            self.t_id,
+            self.dim,
+            constraint_range,
+            self.shared,
+            self.debug_name,
+        )
+
+    def __ge__(self, lower):
+        return self._clone_with_range(lower=lower)
+
+    def __gt__(self, lower):
+        return self._clone_with_range(lower=lower + 1)
+
+    def __le__(self, upper):
+        return self._clone_with_range(upper=upper)
+
+    def __lt__(self, upper):
+        return self._clone_with_range(upper=upper - 1)
+
+    def __bool__(self):
+        # NOTE(avik): We do not support compound expressions like a <= x <= b.
+        # This is because Python implicitly desugars them into bool(a <= x) and bool(x <= b),
+        # and moreover, enforces that any overload of __bool__ must return True or False.
+        # FWIW, sympy also raises TypeError in this case.
+        raise TypeError(
+            "Cannot determine truth value of _Constraint. "
+            "If you are trying to combine _Constraint's with logical connectives, "
+            "you can specify them separately instead."
+        )
+
+    @property
+    def serializable_spec(self):
+        # We need a serialization compatible format of the constraint so that it
+        # can be saved in the graph module without breaking the module serialization.
+        # The saved constraints will be used directly for the post-exporting pass
+        # that converts constraints to runtime assertion. The saved constraints
+        # will not be saved in the serialized module.
+        # TODO: A better way is needed. Currently we use 't_id' to map the constraint,
+        # which is not reliable
+        return {
+            "t_id": self.t_id,
+            "dim": self.dim,
+            "min": self.constraint_range.vr.lower,
+            "max": self.constraint_range.vr.upper,
+        }
+
+    def __eq__(self, other):
+        if not isinstance(other, _Constraint):
+            raise TypeError(
+                "A dynamic dim can be specified equal only to another dynamic dim. "
+                f"Equality with {type(other)} is not supported."
+            )
+
+        # import sympy locally
+        from torch.fx.experimental.symbolic_shapes import StrictMinMaxConstraint
+
+        constraint_range = StrictMinMaxConstraint(
+            vr=self.constraint_range.vr & other.constraint_range.vr,
+            warn_only=False,
+        )
+        if self.debug_name is None:
+            debug_name = other.debug_name
+        else:
+            assert other.debug_name is None or self.debug_name == other.debug_name
+            debug_name = self.debug_name
+        return _create_constraint(
+            self.w_tensor,
+            self.t_id,
+            self.dim,
+            constraint_range,
+            shared=_ConstraintTarget(other.w_tensor, other.t_id, other.dim),
+            debug_name=debug_name,
+        )
+
+
+@dataclasses.dataclass
+class _PhantomRoot:
+    """
+    This represents the root of a derived Dim where the root does not directly
+    specify the shape of any input dimension, but the derived Dim does.
+
+    e.g., the input shapes 2*dim and dim + 1 are related via a "phantom" dim.
+
+    The fields `name`, `constraint_range`, and `val` carried by a phantom root
+    help create a symbol for it. Any derived dims with this phantom root are
+    backed by expressions over this symbol.
+    """
+
+    name: str
+    constraint_range: "StrictMinMaxConstraint"
+    val: int
+
+
+@dataclasses.dataclass
+class _DerivedConstraint(_ConstraintTarget):
+    """
+    This represents a derived Dim, whose root is either a regular constraint target
+    (which directly specifies the shape of some input dimension) or a phantom root
+    (which does so indirectly).
+    """
+
+    # NOTE: This is not currently a subclass of _Constraint because we do not support
+    # `shared` for derived `Dim`s. Indeed, sharing is a necessary concept only for
+    # legacy constraints based on `dynamic_dim`: equality can be expressed simply by
+    # reusing the same (derived or normal) `Dim`.
+    root: Union[_ConstraintTarget, _PhantomRoot]
+    fn: Callable
+    constraint_range: "StrictMinMaxConstraint"
+    debug_name: Optional[str] = None
+
+    @property
+    def shared(self):
+        # Some code paths expect a union of _Constraint and _DerivedConstraint.
+        # Thus we expose a `shared` field that is always None.
+        # TODO(avik): clean this up
+        return None
+
+    @property
+    def serializable_spec(self):
+        # same as _Constraint.serializable_spec
+        return {
+            "t_id": self.t_id,
+            "dim": self.dim,
+            "min": self.constraint_range.vr.lower,
+            "max": self.constraint_range.vr.upper,
+        }
+
+
+Constraint = Union[_Constraint, _DerivedConstraint]
+
+
+def dynamic_dim(t: torch.Tensor, index: int, debug_name: Optional[str] = None):
+    """
+    .. warning::
+        (This feature is DEPRECATED. See :func:`Dim` instead.)
+
+    :func:`dynamic_dim` constructs a :class:`_Constraint` object that describes the dynamism of
+    a dimension ``index`` of tensor ``t``. :class:`_Constraint` objects should be passed to
+    ``constraints`` argument of :func:`export`.
+
+    Args:
+        t (torch.Tensor): Example input tensor that has dynamic dimension size(s)
+        index (int): Index of dynamic dimension
+
+    Returns:
+        A :class:`_Constraint` object that describes shape dynamism. It can be passed to :func:`export` so
+        that :func:`export` does not assume a static size for the specified tensor, i.e. keeping it dynamic
+        as a symbolic size rather than specializing according to the size of the example tracing input.
+
+    Specifically, :func:`dynamic_dim` can be used to express the following types of dynamism.
+
+    - Size of a dimension is dynamic and unbounded::
+
+        t0 = torch.rand(2, 3)
+        t1 = torch.rand(3, 4)
+
+        # First dimension of t0 can have a dynamic size rather than always being static size 2
+        constraints = [dynamic_dim(t0, 0)]
+        ep = export(fn, (t0, t1), constraints=constraints)
+
+    - Size of a dimension is dynamic with a lower bound::
+
+        t0 = torch.rand(10, 3)
+        t1 = torch.rand(3, 4)
+
+        # First dimension of t0 can have a dynamic size with a lower bound of 5 (inclusive)
+        # Second dimension of t1 can have a dynamic size with a lower bound of 2 (exclusive)
+        constraints = [
+            dynamic_dim(t0, 0) >= 5,
+            dynamic_dim(t1, 1) > 2,
+        ]
+        ep = export(fn, (t0, t1), constraints=constraints)
+
+    - Size of a dimension is dynamic with an upper bound::
+
+        t0 = torch.rand(10, 3)
+        t1 = torch.rand(3, 4)
+
+        # First dimension of t0 can have a dynamic size with an upper bound of 16 (inclusive)
+        # Second dimension of t1 can have a dynamic size with an upper bound of 8 (exclusive)
+        constraints = [
+            dynamic_dim(t0, 0) <= 16,
+            dynamic_dim(t1, 1) < 8,
+        ]
+        ep = export(fn, (t0, t1), constraints=constraints)
+
+    - Size of a dimension is dynamic and it is always equal to size of another dynamic dimension::
+
+        t0 = torch.rand(10, 3)
+        t1 = torch.rand(3, 4)
+
+        # Sizes of the second dimension of t0 and the first dimension of t1 are always equal
+        constraints = [
+            dynamic_dim(t0, 1) == dynamic_dim(t1, 0),
+        ]
+        ep = export(fn, (t0, t1), constraints=constraints)
+
+    - Mix and match all types above as long as they do not express conflicting requirements
+
+    """
+    from torch._dynamo.exc import UserError, UserErrorType
+
+    if not isinstance(t, torch.Tensor):
+        raise UserError(
+            UserErrorType.DYNAMIC_DIM,
+            f"Expected tensor as input to dynamic_dim but got {type(t)}",
+        )
+
+    if t.dim() < 1:
+        raise UserError(
+            UserErrorType.DYNAMIC_DIM, "Cannot mark 0-dimension tensors to be dynamic"
+        )
+
+    if index >= t.dim():
+        raise UserError(
+            UserErrorType.DYNAMIC_DIM,
+            f"Expected the dimension passed to dynamic_dim to be in the range [0:{t.dim()-1}]"
+            f" but got {index}, which is out of bounds for the given tensor.",
+        )
+
+    # Import sympy locally
+    import sympy
+
+    from torch.fx.experimental.symbolic_shapes import StrictMinMaxConstraint
+    from torch.utils._sympy.value_ranges import ValueRanges
+
+    return _create_constraint(
+        weakref.ref(t),
+        id(t),
+        index,
+        StrictMinMaxConstraint(
+            vr=ValueRanges(lower=2, upper=sympy.oo), warn_only=False
+        ),
+        debug_name=debug_name,
+    )
+
+
+def _process_equalities(
+    constraint: Constraint,
+    get_sources: Callable[[int, int], List["Source"]],
+    shape_env: "ShapeEnv",
+    source_pairs: List[Tuple["Source", "Source"]],
+    derived_equalities: List[Tuple["Source", Union["Source", "Symbol"], Callable]],
+    phantom_symbols: Dict[str, "Symbol"],
+):
+    """
+    Updates `source_pairs`, `derived_equalities`, and `phantom_symbols` (which become
+    fields of `EqualityConstraint`) based on a given input `constraint`.
+    """
+
+    source, *other_sources = get_sources(constraint.t_id, constraint.dim)
+    # When t.size()[dim] maps to src0, src1, ..., srcN, we add
+    # constraints that make src0 "equal" to src1, ..., srcN.
+    source_pairs.extend((source, other_source) for other_source in other_sources)
+    if not isinstance(constraint, _DerivedConstraint):
+        if constraint.shared is not None:
+            # Moreover, when t.size()[dim] is specified equal to t'.size()[dim']
+            # and t'.size()[dim'] maps to src1', ..., srcN', we add
+            # constraints that also make src0 "equal" to src1', ..., srcN'.
+            other_sources = get_sources(constraint.shared.t_id, constraint.shared.dim)
+            source_pairs.extend(
+                (source, other_source) for other_source in other_sources
+            )
+    else:
+        # branch based on the root of the _DerivedConstraint
+        if not isinstance(constraint.root, _PhantomRoot):
+            # either root points to an input source
+            root = get_sources(constraint.root.t_id, constraint.root.dim)[0]  # type: ignore[assignment]
+        else:
+            # or root points to a phantom symbol
+            if constraint.root.name in phantom_symbols:
+                root = phantom_symbols[constraint.root.name]  # type: ignore[assignment]
+            else:
+                # create a phantom symbol in the shape env based on the _PhantomRoot
+                root = shape_env.create_symbol(
+                    val=constraint.root.val,
+                    source=torch._dynamo.source.ConstantSource(constraint.root.name),
+                    dynamic_dim=torch.fx.experimental.symbolic_shapes.DimDynamic.DYNAMIC,
+                    constraint_dim=constraint.root.constraint_range,
+                )
+                phantom_symbols[constraint.root.name] = root  # type: ignore[assignment]
+
+        fn = constraint.fn
+        # A derived equality (source, root, fn) informally corresponds to source = fn(root).
+        # Here source describes an input and root might describe another input or a phantom symbol.
+        derived_equalities.append((source, root, fn))
+
+
+def _process_dynamic_shapes(
+    f: Callable,
+    args: Tuple[Any, ...],
+    kwargs: Optional[Dict[str, Any]] = None,
+    dynamic_shapes: Optional[Union[Dict[str, Any], Tuple[Any], List[Any]]] = None,
+) -> Optional[List[Constraint]]:
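+    # Converts the user-facing ``dynamic_shapes`` specification (built from
+    # :func:`Dim` objects and/or None entries) into a list of internal
+    # ``Constraint`` objects. Illustrative example: for ``def forward(self, x)``
+    # with ``x`` of shape (8, 3) and ``dynamic_shapes={"x": {0: Dim("batch")}}``,
+    # the result is a single constraint on dimension 0 of ``x``.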
+    from collections import defaultdict
+    from collections.abc import Mapping, Sequence
+
+    from torch._dynamo.exc import UserError, UserErrorType
+
+    if dynamic_shapes is None or len(dynamic_shapes) == 0:
+        return None
+
+    kwargs = kwargs if kwargs is not None else {}
+
+    def tree_zip(combined_args, dynamic_shapes):
+        if isinstance(combined_args, (tuple, list)):
+            if not isinstance(dynamic_shapes, Sequence):
+                raise UserError(
+                    UserErrorType.INVALID_INPUT,
+                    f"Expected dynamic_shapes of a {type(combined_args)} to be a Sequence, "
+                    f"got {dynamic_shapes} instead",
+                )
+            if len(combined_args) != len(dynamic_shapes):
+                raise UserError(
+                    UserErrorType.INVALID_INPUT,
+                    f"Expected {dynamic_shapes} to have {len(combined_args)} items",
+                )
+            for i, shape in enumerate(dynamic_shapes):
+                yield from tree_zip(combined_args[i], shape)
+        elif isinstance(combined_args, dict):
+            if not isinstance(dynamic_shapes, Mapping):
+                raise UserError(
+                    UserErrorType.INVALID_INPUT,
+                    f"Expected dynamic_shapes of a {type(combined_args)} to be a Mapping, "
+                    f"got {dynamic_shapes} instead",
+                )
+            if len(combined_args) != len(dynamic_shapes):
+                raise UserError(
+                    UserErrorType.INVALID_INPUT,
+                    f"Expected {dynamic_shapes} to have {len(combined_args)} items",
+                )
+            for k, shape in dynamic_shapes.items():
+                yield from tree_zip(combined_args[k], shape)
+        elif type(combined_args) in SUPPORTED_NODES:
+            if not isinstance(dynamic_shapes, Sequence):
+                raise UserError(
+                    UserErrorType.INVALID_INPUT,
+                    f"Expected dynamic_shapes of a user-registered class (e.g., "
+                    f"{type(combined_args)}) to be a Sequence that matches the "
+                    f"flattened structure, but got {dynamic_shapes} instead",
+                )
+            yield from tree_zip(
+                SUPPORTED_NODES[type(combined_args)].flatten_fn(combined_args)[0],
+                dynamic_shapes,
+            )
+        elif isinstance(combined_args, torch.Tensor):
+            yield (combined_args, dynamic_shapes)
+        else:
+            if dynamic_shapes is not None:
+                raise UserError(
+                    UserErrorType.INVALID_INPUT,
+                    f"Expected dynamic_shapes of a {type(combined_args)} to be None, "
+                    f"got {dynamic_shapes} instead",
+                )
+
+    # map of Dim names representing input shape dimensions to constraints on them
+    symbols: Dict[str, List[Constraint]] = defaultdict(list)
+    # track roots that do not directly represent input shape dimensions
+    phantom_roots: Dict[str, _PhantomRoot] = {}
+    derived_constraints_with_phantom_root: List[_DerivedConstraint] = []
+
+    def to_constraint(dim, tensor, i):
+        import sympy
+
+        from torch.fx.experimental.symbolic_shapes import StrictMinMaxConstraint
+        from torch.utils._sympy.solve import try_solve
+        from torch.utils._sympy.value_ranges import ValueRanges
+
+        def root_value():
+            # given tensor.shape[i] is the value of dim = fn(root),
+            # find the value of root
+            symbol = sympy.Symbol(dim.root.__name__, integer=True)
+            expr = dim.fn(symbol)
+            solution = try_solve(sympy.Eq(expr, tensor.shape[i]), symbol)
+            if solution is not None:
+                return int(solution[1])  # type: ignore[call-overload]
+            else:
+                raise UserError(  # noqa: TRY200
+                    UserErrorType.CONSTRAINT_VIOLATION,
+                    f"Expected shape[{i}] = {tensor.shape[i]} of input Tensor to be "
+                    f"of the form {expr}, where {symbol} is an integer",
+                )
+
+        if isinstance(dim, _DerivedDim):
+            # generate a _DerivedConstraint where the root is:
+            # - either a _ConstraintTarget (if dim.root directly describes an input shape)
+            # - or a _PhantomRoot (otherwise)
+            dim_root = dim.root  # type: ignore[attr-defined]
+            if dim_root.__name__ in symbols:
+                # root represents an input shape dimension
+                root_constraint = symbols[dim_root.__name__][0]
+                root = _ConstraintTarget(
+                    root_constraint.w_tensor,
+                    root_constraint.t_id,
+                    root_constraint.dim,
+                )
+            elif dim_root.__name__ not in phantom_roots:
+                # create a phantom root
+                root = _PhantomRoot(  # type: ignore[assignment]
+                    name=dim_root.__name__,
+                    constraint_range=StrictMinMaxConstraint(
+                        vr=ValueRanges(lower=dim_root.min, upper=dim_root.max),
+                        warn_only=False,
+                    ),
+                    val=root_value(),
+                )
+                phantom_roots[dim_root.__name__] = root  # type: ignore[assignment]
+            else:
+                root = phantom_roots[dim_root.__name__]  # type: ignore[assignment]
+            constraint = _DerivedConstraint(
+                weakref.ref(tensor),
+                id(tensor),
+                i,
+                root,
+                dim.fn,  # type: ignore[attr-defined]
+                StrictMinMaxConstraint(
+                    vr=ValueRanges(lower=dim.min, upper=dim.max),
+                    warn_only=False,
+                ),
+                debug_name=dim.__name__,
+            )
+            if isinstance(root, _PhantomRoot):
+                # NOTE(avik): since we have not processed all inputs yet, we may replace this
+                # with a root that does represent an input shape dimension later (see below)
+                derived_constraints_with_phantom_root.append(constraint)
+        else:
+            constraint = dynamic_dim(tensor, i, debug_name=dim.__name__)
+            if dim.min != 2:
+                constraint = constraint >= dim.min
+            if dim.max != sys.maxsize - 1:
+                constraint = constraint <= dim.max
+        return constraint
+
+    bounds: Dict[str, Tuple[int, int]] = {}
+
+    def check_same_bounds(dim):
+        if dim.__name__ in symbols:
+            min_, max_ = bounds[dim.__name__]
+            if dim.min != min_ or dim.max != max_:
+                this_ = _Dim.readable(dim.__name__, min_, max_)
+                that_ = _Dim.readable(dim.__name__, dim.min, dim.max)
+                raise UserError(
+                    UserErrorType.INVALID_INPUT,
+                    f"Found different definitions {this_} and {that_} "
+                    f"for the same symbolic dimension {dim}!",
+                )
+
+        else:
+            bounds[dim.__name__] = (dim.min, dim.max)
+
+    def update_symbols(tensor, shape):
+        if isinstance(shape, dict):
+            for i, dim in shape.items():
+                if isinstance(dim, _Dim):
+                    check_same_bounds(dim)
+                    constraint = to_constraint(dim, tensor, i)
+                    symbols[dim.__name__].append(constraint)
+                else:
+                    if dim is not None:
+                        raise UserError(
+                            UserErrorType.INVALID_INPUT,
+                            f"Unexpected item #{i} ({dim}) in dynamic_shape {shape} of Tensor, "
+                            "try None instead",
+                        )
+        elif isinstance(shape, (tuple, list)):
+            for i, dim in enumerate(shape):
+                if isinstance(dim, _Dim):
+                    check_same_bounds(dim)
+                    constraint = to_constraint(dim, tensor, i)
+                    symbols[dim.__name__].append(constraint)
+                else:
+                    if dim is not None:
+                        raise UserError(
+                            UserErrorType.INVALID_INPUT,
+                            f"Unexpected item #{i} ({dim}) in dynamic_shape {shape} of Tensor, "
+                            "try None instead",
+                        )
+        else:
+            if shape is not None:
+                raise UserError(
+                    UserErrorType.INVALID_INPUT,
+                    f"Unexpected dynamic_shape {shape} of Tensor, try None instead",
+                )
+
+    import inspect
+
+    if isinstance(f, ExportedProgram):
+        f = f.module()
+    signature = (
+        inspect.signature(f.forward)
+        if isinstance(f, torch.nn.Module)
+        else inspect.signature(f)
+    )
+    combined_args = signature.bind(*args, **kwargs).arguments
+
+    # This means the user didn't specify dynamic shapes with argument names.
+    combined_args = combined_args if isinstance(dynamic_shapes, Mapping) else list(combined_args.values())  # type: ignore[assignment]
+    for tensor, shape in tree_zip(combined_args, dynamic_shapes):
+        update_symbols(tensor, shape)
+
+    constraints = []
+    for derived_constraint_with_phantom_root in derived_constraints_with_phantom_root:
+        phantom_root_name = derived_constraint_with_phantom_root.root.name  # type: ignore[union-attr]
+        if phantom_root_name in symbols:
+            # We found an input shape dimension corresponding to this name, so we
+            # do not need a phantom symbol for it after all.
+            # NOTE(avik): Overall we want to maintain the invariant that roots that
+            # are phantom symbols are really "phantom," i.e., they cannot be represented
+            # by any input source. This is important when we are deciding derived equalities,
+            # since we can focus our attention exclusively on input sources: deciding
+            # derived equalities involving phantom symbols are, in comparison, trivial.
+            derived_constraint_with_phantom_root.root = symbols[phantom_root_name][0]
+
+    for dynamic_dims in symbols.values():
+        if all(
+            isinstance(dynamic_dim, _DerivedConstraint) for dynamic_dim in dynamic_dims
+        ):
+            constraints.extend(dynamic_dims)
+        else:
+            primary, *others = dynamic_dims
+            if others:
+                for other in others:
+                    constraints.append(primary == other)  # type: ignore[arg-type]
+            else:
+                constraints.append(primary)
+
+    return constraints  # type: ignore[return-value]
+
+
+def _process_constraints(
+    fake_mode,
+    graph_module: torch.fx.GraphModule,
+    num_lifted_params_buffers: int,
+    example_inputs: List[torch.Tensor],
+) -> Dict:
+    """
+    Process the constraints stored in the graph module to return something more readable.
+
+    Args:
+        fake_mode: The fake tensor mode used during tracing; its shape_env is
+            consulted for the ranges of free symbols
+
+        graph_module (torch.fx.GraphModule): GraphModule returned from
+            dynamo.export, which contains the "input_shape_constraints" and
+            "inline_constraints" metadata
+
+        num_lifted_params_buffers: Number of leading placeholder nodes that
+            correspond to lifted parameters/buffers rather than example inputs
+
+        example_inputs: Flattened list of example inputs used to export the graph module
+
+    Returns:
+        range_constraints (Dict[sympy.Symbol, ValueRanges]): Mapping of
+            symbols (from SymInts) appearing in the fake tensors in
+            node.meta["val"] to their range constraints, which are a tuple
+            containing (lower, upper) constraints.
+    """
+    from torch._export.passes.add_runtime_assertions_for_constraints_pass import (
+        InputDim,
+    )
+
+    # Import sympy locally
+    from torch.fx.experimental.symbolic_shapes import SymInt
+    from torch.utils._sympy.value_ranges import ValueRanges
+
+    input_shape_constraints = graph_module.meta.get("input_shape_constraints", [])
+    inline_constraints = graph_module.meta.get("inline_constraints", [])
+
+    # Create dict mapping tensor_id to node names
+    tensor_id_to_nodes: Dict[int, List[str]] = defaultdict(list)
+    # Create dict mapping placeholder node names to their nodes
+    placeholder_nodes: Dict[str, torch.fx.Node] = {}
+    for i, node in enumerate(graph_module.graph.nodes):
+        if node.op != "placeholder":
+            # All placeholder nodes should be together at the beginning of the
+            # graph
+            break
+        if i >= num_lifted_params_buffers:
+            example_input = example_inputs[i - num_lifted_params_buffers]
+            tensor_id_to_nodes[id(example_input)].append(node.name)
+            placeholder_nodes[node.name] = node
+
+    # Create dict mapping (node name, dim) to a list of range (lower, upper)
+    # constraints
+    multi_range_constraints: Dict[InputDim, List[ValueRanges]] = defaultdict(list)
+    for constraint in input_shape_constraints:
+        for node in tensor_id_to_nodes[constraint["t_id"]]:
+            node_dim = InputDim(node, constraint["dim"])
+
+            # Accumulate range constraints
+            multi_range_constraints[node_dim].append(
+                ValueRanges(constraint["min"], constraint["max"])
+            )
+
+    # Create dict mapping symbol to a singular range (lower, upper)
+    range_constraints: Dict[Any, ValueRanges] = {}
+
+    # Add inline constraints to range_constraints
+    range_constraints = {
+        symbol: inline_constraints[symbol] for symbol in inline_constraints
+    }
+
+    free_symbols: Set["Symbol"] = set()
+    # Add input range constraints to range_constraints
+    for input_dim, multi_range_constraint in multi_range_constraints.items():  # type: ignore[assignment]
+        # Simplify the range constraints into a single range constraint
+        # Ex. ranges [2, 10] and [3, 11] would get merged to [3, 10]
+        min_vals = [rc.lower for rc in multi_range_constraint]
+        max_vals = [rc.upper for rc in multi_range_constraint]
+        min_val = max(min_vals)  # type: ignore[type-var]
+        max_val = min(max_vals)  # type: ignore[type-var]
+        assert min_val <= max_val  # type: ignore[operator]
+
+        # Add input node range constraints
+        val = placeholder_nodes[input_dim.input_name].meta["val"]
+        assert isinstance(val, FakeTensor)
+        symint = val.shape[input_dim.dim]
+        assert isinstance(
+            symint, SymInt
+        ), f"Expected SymInt but got {symint}: {type(symint)}"
+        symbol = symint.node.expr
+        range_constraints[symbol] = ValueRanges(min_val, max_val)
+        free_symbols.update(symbol.free_symbols)
+
+    for symbol in free_symbols:
+        if symbol not in range_constraints:
+            # Placeholders can have symbolic shapes that are derived expressions.
+            # The above code will record direct range constraints for them
+            # so that we can do runtime assertions. In addition, for serde checks
+            # we want to record range constraints for their root symbols.
+            range_constraints[symbol] = fake_mode.shape_env.var_to_range[symbol]
+
+    return range_constraints
diff --git a/MLPY/Lib/site-packages/torch/export/exported_program.py b/MLPY/Lib/site-packages/torch/export/exported_program.py
new file mode 100644
index 0000000000000000000000000000000000000000..0093133ea91ca8129796b91bcb77d4987c3d1c6b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/export/exported_program.py
@@ -0,0 +1,745 @@
+import copy
+import dataclasses
+import functools
+import types
+import warnings
+from collections import namedtuple
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+    Type,
+    TYPE_CHECKING,
+    Union,
+)
+
+from torch.fx.immutable_collections import immutable_dict, immutable_list
+
+if TYPE_CHECKING:
+    # Import the following modules during type checking to enable code intelligence features,
+    # such as auto-completion in tools like pylance, even when these modules are not explicitly
+    # imported in user code.
+
+    import sympy
+
+    from torch.utils._sympy.value_ranges import ValueRanges
+
+import torch
+import torch.utils._pytree as pytree
+from torch.export._tree_utils import is_equivalent, reorder_kwargs
+from torch.fx._compatibility import compatibility
+from torch.fx.experimental.proxy_tensor import maybe_disable_fake_tensor_mode
+
+from torch.fx.passes.infra.pass_base import PassResult
+from torch.fx.passes.infra.pass_manager import PassManager
+
+from .graph_signature import (  # noqa: F401
+    _sig_to_specs,
+    ArgumentSpec,
+    ConstantArgument,
+    CustomObjArgument,
+    ExportGraphSignature,
+    InputKind,
+    InputSpec,
+    OutputKind,
+    OutputSpec,
+    SymIntArgument,
+    TensorArgument,
+)
+
+
+__all__ = [
+    "ExportedProgram",
+    "ModuleCallEntry",
+    "ModuleCallSignature",
+]
+
+
+PassType = Callable[[torch.fx.GraphModule], Optional[PassResult]]
+
+
+@dataclasses.dataclass
+class ModuleCallSignature:
+    inputs: List[ArgumentSpec]
+    outputs: List[ArgumentSpec]
+    in_spec: pytree.TreeSpec
+    out_spec: pytree.TreeSpec
+
+
+@dataclasses.dataclass
+class ModuleCallEntry:
+    fqn: str
+    signature: Optional[ModuleCallSignature] = None
+
+
+def _disable_prexisiting_fake_mode(fn):
+    @functools.wraps(fn)
+    def wrapper(*args, **kwargs):
+        with maybe_disable_fake_tensor_mode():
+            return fn(*args, **kwargs)
+
+    return wrapper
+
+
+def _fx_collection_equivalence_fn(
+    spec1_type: Optional[type],
+    spec1_context: pytree.Context,
+    spec2_type: Optional[type],
+    spec2_context: pytree.Context,
+) -> bool:
+    """Treat containers and their immutable variants as the same type. Otherwise
+    compare as normal.
+    """
+    if spec1_type is None or spec2_type is None:
+        return spec1_type is spec2_type and spec1_context == spec2_context
+
+    if issubclass(spec1_type, (dict, immutable_dict)) and issubclass(
+        spec2_type, (dict, immutable_dict)
+    ):
+        return spec1_context == spec2_context
+
+    if issubclass(spec1_type, (list, immutable_list)) and issubclass(
+        spec2_type, (list, immutable_list)
+    ):
+        return spec1_context == spec2_context
+
+    return spec1_type is spec2_type and spec1_context == spec2_context
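+
+# Rough illustration (not part of the upstream code): fx graphs store containers
+# as immutable_dict/immutable_list, so a spec captured at export time should
+# still compare equal to one built from plain builtins. A hypothetical check:
+#
+#     _, spec_plain = pytree.tree_flatten({"x": 1})
+#     _, spec_fx = pytree.tree_flatten(immutable_dict({"x": 1}))
+#     assert is_equivalent(spec_plain, spec_fx, _fx_collection_equivalence_fn)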
+
+
+class ExportedProgram:
+    """
+    Package of a program from :func:`export`. It contains
+    an :class:`torch.fx.Graph` that represents Tensor computation, a state_dict containing
+    tensor values of all lifted parameters and buffers, and various metadata.
+
+    You can call an ExportedProgram like the original callable traced by
+    :func:`export` with the same calling convention.
+
+    To perform transformations on the graph, use the ``.module()`` method to access
+    an :class:`torch.fx.GraphModule`. You can then use
+    `FX transformations <https://pytorch.org/docs/stable/fx.html#writing-transformations>`_
+    to rewrite the graph. Afterwards, you can simply use :func:`export`
+    again to construct a correct ExportedProgram.
+    """
+
+    def __init__(
+        self,
+        root: Union[torch.nn.Module, Dict[str, Any]],
+        graph: torch.fx.Graph,
+        graph_signature: ExportGraphSignature,
+        state_dict: Dict[str, Union[torch.Tensor, torch.nn.Parameter]],
+        range_constraints: "Dict[sympy.Symbol, Any]",
+        module_call_graph: List[ModuleCallEntry],
+        example_inputs: Optional[Tuple[Tuple[Any, ...], Dict[str, Any]]] = None,
+        verifier: Optional[Type[Any]] = None,  # TODO Change typing hint to Verifier.
+        tensor_constants: Optional[
+            Dict[str, torch.Tensor]
+        ] = None,  # TODO: deprecate this
+        constants: Optional[
+            Dict[str, Union[torch.Tensor, torch._C.ScriptObject]]
+        ] = None,
+    ):
+        # Remove codegen related things from the graph. It should just be a flat graph.
+        graph._codegen = torch.fx.graph.CodeGen()
+        self._graph_module = _create_graph_module_for_export(root, graph)
+        if isinstance(root, torch.fx.GraphModule):
+            self._graph_module.meta.update(root.meta)
+
+        self._graph_signature: ExportGraphSignature = graph_signature
+        self._state_dict: Dict[str, Any] = state_dict
+        self._range_constraints: "Dict[sympy.Symbol, ValueRanges]" = range_constraints
+        assert module_call_graph is not None
+        self._module_call_graph: List[ModuleCallEntry] = module_call_graph
+        self._example_inputs = example_inputs
+
+        self._constants = tensor_constants or constants or {}
+        assert self._constants is not None
+
+        from torch._export.verifier import Verifier
+
+        if verifier is None:
+            verifier = Verifier
+        assert issubclass(verifier, Verifier)
+        self._verifier = verifier
+        # Validate should be always the last step of the constructor.
+        self.verifier().check(self)
+
+    @property
+    @compatibility(is_backward_compatible=False)
+    def graph_module(self):
+        return self._graph_module
+
+    @property
+    @compatibility(is_backward_compatible=False)
+    def graph(self):
+        return self.graph_module.graph
+
+    @property
+    @compatibility(is_backward_compatible=False)
+    def graph_signature(self):
+        return self._graph_signature
+
+    @property
+    @compatibility(is_backward_compatible=False)
+    def state_dict(self):
+        return self._state_dict
+
+    @compatibility(is_backward_compatible=False)
+    def parameters(self) -> Iterator[torch.nn.Parameter]:
+        """
+        Returns an iterator over the original module's parameters.
+        """
+        for _, param in self.named_parameters():
+            yield param
+
+    @compatibility(is_backward_compatible=False)
+    def named_parameters(self) -> Iterator[Tuple[str, torch.nn.Parameter]]:
+        """
+        Returns an iterator over the original module's parameters, yielding
+        both the name of the parameter and the parameter itself.
+        """
+        for param_name in self.graph_signature.parameters:
+            yield param_name, self.state_dict[param_name]
+
+    @compatibility(is_backward_compatible=False)
+    def buffers(self) -> Iterator[torch.Tensor]:
+        """
+        Returns an iterator over the original module's buffers.
+        """
+        for _, buf in self.named_buffers():
+            yield buf
+
+    @compatibility(is_backward_compatible=False)
+    def named_buffers(self) -> Iterator[Tuple[str, torch.Tensor]]:
+        """
+        Returns an iterator over the original module's buffers, yielding
+        both the name of the buffer and the buffer itself.
+        """
+        non_persistent_buffers = set(self.graph_signature.non_persistent_buffers)
+        for buffer_name in self.graph_signature.buffers:
+            if buffer_name in non_persistent_buffers:
+                yield buffer_name, self.constants[buffer_name]
+            else:
+                yield buffer_name, self.state_dict[buffer_name]
+
+    @property
+    @compatibility(is_backward_compatible=False)
+    def range_constraints(self):
+        return self._range_constraints
+
+    @property
+    @compatibility(is_backward_compatible=False)
+    def module_call_graph(self):
+        return self._module_call_graph
+
+    @property
+    @compatibility(is_backward_compatible=False)
+    def example_inputs(self):
+        return self._example_inputs
+
+    @property
+    @compatibility(is_backward_compatible=False)
+    def call_spec(self):
+        CallSpec = namedtuple("CallSpec", ["in_spec", "out_spec"])
+
+        if len(self.module_call_graph) == 0:
+            return CallSpec(in_spec=None, out_spec=None)
+        assert self.module_call_graph[0].fqn == ""
+        return CallSpec(
+            in_spec=self.module_call_graph[0].signature.in_spec,
+            out_spec=self.module_call_graph[0].signature.out_spec,
+        )
+
+    @property
+    @compatibility(is_backward_compatible=False)
+    def verifier(self) -> Any:
+        return self._verifier
+
+    @property
+    @compatibility(is_backward_compatible=False)
+    def dialect(self) -> str:
+        return self._verifier.dialect
+
+    @property
+    @compatibility(is_backward_compatible=False)
+    def tensor_constants(self):
+        return self._constants
+
+    @property
+    @compatibility(is_backward_compatible=False)
+    def constants(self):
+        return self._constants
+
+    def _get_flat_args_with_check(self, args, kwargs):
+        """Flatten args, kwargs using pytree, then, check specs.
+
+        Args:
+            args: List[Any] original args passed to __call__
+            kwargs: Dict[str, Any] original kwargs passed to __call
+
+        Returns:
+            A tuple of (flat_args, received_spec)
+            flat_args is flattend args / kwargs
+            received_spec is the pytree spec produced while flattening the
+            tuple (args, kwargs)
+        """
+        in_spec = self.call_spec.in_spec
+        if in_spec is not None:
+            kwargs = reorder_kwargs(kwargs, in_spec)
+        flat_args_with_path, received_spec = pytree.tree_flatten_with_path(
+            (args, kwargs)
+        )  # type: ignore[possibly-undefined]
+        self._check_input_constraints(flat_args_with_path)
+        flat_args = tuple(x[1] for x in flat_args_with_path)
+        return flat_args, received_spec
+
+    def _graph_module_flat_inputs(self, args: Any, kwargs: Any) -> Any:
+        """Transform args, kwargs of __call__ to args for graph_module.
+
+        self.graph_module takes entries from the state dict as additional inputs.
+        The invariant, for ep: ExportedProgram, is
+        ep(args, kwargs) ==
+          ep.postprocess(ep.graph_module(ep.graph_module_flat_inputs(args, kwargs)))
+        """
+
+        in_spec = self.call_spec.in_spec
+        flat_args, received_spec = self._get_flat_args_with_check(args, kwargs)
+        if in_spec is not None and not is_equivalent(
+            received_spec, in_spec, _fx_collection_equivalence_fn
+        ):
+            raise ValueError(
+                "Trying to flatten user inputs with exported input tree spec: \n"
+                f"{in_spec}\n"
+                "but actually got inputs with tree spec of: \n"
+                f"{received_spec}"
+            )
+
+        additional_inputs = []
+        for input_ in self.graph_signature.input_specs:
+            if input_.kind == InputKind.USER_INPUT:
+                continue
+            elif input_.kind in (
+                InputKind.PARAMETER,
+                InputKind.BUFFER,
+            ):
+                if input_.persistent is False:
+                    # This is a non-persistent buffer, grab it from our
+                    # constants instead of the state dict.
+                    additional_inputs.append(self.constants[input_.target])
+                else:
+                    additional_inputs.append(self.state_dict[input_.target])
+            elif input_.kind in (
+                InputKind.CONSTANT_TENSOR,
+                InputKind.CUSTOM_OBJ,
+            ):
+                additional_inputs.append(self.constants[input_.target])
+        additional_inputs = tuple(additional_inputs)
+
+        # NOTE: calling convention is first params, then buffers, then args as user supplied them.
+        # See: torch/_functorch/aot_autograd.py#L1034
+        return additional_inputs + flat_args
+
+    def __call__(self, *args: Any, **kwargs: Any) -> Any:
+        raise RuntimeError(
+            "Unable to call ExportedProgram directly. "
+            "You should use `exported_program.module()` instead."
+        )
+
+    def _postprocess_graph_module_outputs(self, res, orig_args, orig_kwargs):
+        """Process potential mutations to the input.
+
+        Because self.graph_module is functional, mutations have to be written
+        back after execution of graph_module.
+        """
+        import torch._export.error as error
+
+        flat_args, _ = self._get_flat_args_with_check(orig_args, orig_kwargs)
+        if self.call_spec.out_spec is not None:
+            buffer_mutation = self.graph_signature.buffers_to_mutate
+            user_input_mutation = self.graph_signature.user_inputs_to_mutate
+            num_mutated = len(buffer_mutation) + len(user_input_mutation)
+            mutated_values = res[:num_mutated]
+
+            # Exclude dependency token from final result.
+            assertion_dep_token = self.graph_signature.assertion_dep_token
+            if assertion_dep_token is not None:
+                assertion_dep_token_index = next(iter(assertion_dep_token.keys()))
+                res = res[:assertion_dep_token_index]
+
+            res = res[num_mutated:]
+            try:
+                res = pytree.tree_unflatten(res, self.call_spec.out_spec)
+            except Exception:
+                _, received_spec = pytree.tree_flatten(res)
+                raise error.InternalError(  # noqa: TRY200
+                    "Trying to flatten user outputs with exported output tree spec: \n"
+                    f"{self.call_spec.out_spec}\n"
+                    "but actually got outputs with tree spec of: \n"
+                    f"{received_spec}"
+                )
+            finally:
+                user_inputs = [
+                    spec
+                    for spec in self.graph_signature.input_specs
+                    if spec.kind == InputKind.USER_INPUT
+                ]
+                for i, value in enumerate(mutated_values):
+                    output_spec = self.graph_signature.output_specs[i]
+                    if output_spec.kind == OutputKind.BUFFER_MUTATION:
+                        assert output_spec.target is not None
+                        self.state_dict[output_spec.target] = value
+                    elif output_spec.kind == OutputKind.USER_INPUT_MUTATION:
+                        assert output_spec.target is not None
+                        index = next(
+                            i
+                            for i, spec in enumerate(user_inputs)
+                            if spec.arg.name == output_spec.target
+                        )
+                        flat_args[index].copy_(value)
+                    else:
+                        raise AssertionError(f"Unexpected kind: {output_spec.kind}")
+        return res
+
+    def __str__(self) -> str:
+        graph_module = self.graph_module.print_readable(print_output=False).replace(
+            "\n", "\n    "
+        )
+        string = (
+            "ExportedProgram:\n"
+            f"    {graph_module}\n"
+            f"Graph signature: {self.graph_signature}\n"
+            f"Range constraints: {self.range_constraints}\n"
+        )
+        return string
+
+    def module(self) -> torch.nn.Module:
+        """
+        Returns a self-contained GraphModule with all the parameters/buffers inlined.
+        """
+        from ._unlift import _unlift_exported_program_lifted_states
+
+        module = _unlift_exported_program_lifted_states(self)
+
+        def _train(self, mode: bool = True):
+            raise NotImplementedError("Calling train() is not supported yet.")
+
+        def _eval(self, mode: bool = True):
+            raise NotImplementedError("Calling eval() is not supported yet.")
+
+        module.train = types.MethodType(_train, module)  # type: ignore[method-assign]
+        module.eval = types.MethodType(_eval, module)  # type: ignore[method-assign]
+        return module
+
+    @_disable_prexisiting_fake_mode
+    def run_decompositions(
+        self, decomp_table: Optional[Dict[torch._ops.OperatorBase, Callable]] = None
+    ) -> "ExportedProgram":
+        """
+        Run a set of decompositions on the exported program and return a new
+        exported program. By default we will run the Core ATen decompositions to
+        get operators in the
+        `Core ATen Operator Set <https://pytorch.org/docs/stable/torch.compiler_ir.html>`_.
+
+        For now, we do not decompose joint graphs.
+        """
+        from torch._decomp import core_aten_decompositions
+        from torch._export.passes.add_runtime_assertions_for_constraints_pass import (
+            _AddRuntimeAssertionsForInlineConstraintsPass,
+        )
+        from torch._export.passes.lift_constants_pass import (
+            ConstantAttrMap,
+            lift_constants_pass,
+        )
+        from torch._export.passes.replace_sym_size_ops_pass import (
+            _replace_sym_size_ops_pass,
+        )
+        from torch._functorch.aot_autograd import aot_export_module
+
+        def _get_placeholders(gm):
+            placeholders = []
+            for node in gm.graph.nodes:
+                if node.op != "placeholder":
+                    break
+                placeholders.append(node)
+            return placeholders
+
+        decomp_table = decomp_table or core_aten_decompositions()
+
+        old_placeholders = _get_placeholders(self.graph_module)
+        fake_args = [node.meta["val"] for node in old_placeholders]
+
+        buffers_to_remove = [name for name, _ in self.graph_module.named_buffers()]
+        for name in buffers_to_remove:
+            delattr(self.graph_module, name)
+        # TODO(zhxhchen17) Return the new graph_signature directly.
+        gm, graph_signature = aot_export_module(
+            self.graph_module, fake_args, decompositions=decomp_table, trace_joint=False
+        )
+
+        # Update the signatures with the new placeholder names in case they
+        # changed when calling aot_export
+        def update_arg(old_arg, new_ph):
+            if isinstance(old_arg, ConstantArgument):
+                return old_arg
+            elif isinstance(old_arg, TensorArgument):
+                return TensorArgument(name=new_ph.name)
+            elif isinstance(old_arg, SymIntArgument):
+                return SymIntArgument(name=new_ph.name)
+            raise RuntimeError(f"Type of old_arg not supported: {type(old_arg)}")
+
+        new_placeholders = _get_placeholders(gm)
+        new_outputs = list(gm.graph.nodes)[-1].args[0]
+
+        # To match the output target with correct input for input mutations
+        # need to find the old to new placeholder map
+        old_new_placeholder_map = {
+            spec.arg.name: new_placeholders[i].name
+            for i, spec in enumerate(self.graph_signature.input_specs)
+            if not isinstance(spec.arg, ConstantArgument)
+        }
+
+        input_specs = [
+            InputSpec(
+                spec.kind,
+                update_arg(spec.arg, new_placeholders[i]),
+                spec.target,
+                spec.persistent,
+            )
+            for i, spec in enumerate(self.graph_signature.input_specs)
+        ]
+        output_specs = [
+            OutputSpec(
+                spec.kind,
+                update_arg(spec.arg, new_outputs[i]),
+                old_new_placeholder_map.get(spec.target, spec.target),
+            )
+            for i, spec in enumerate(self.graph_signature.output_specs)
+        ]
+
+        assert len(new_placeholders) == len(old_placeholders)
+
+        new_graph_signature = ExportGraphSignature(
+            input_specs=input_specs, output_specs=output_specs
+        )
+        # NOTE: aot_export adds symint metadata for placeholders with int
+        # values; since these become specialized, we replace such metadata with
+        # the original values.
+        # Also, set the param/buffer metadata back to the placeholders.
+        for old_node, new_node in zip(old_placeholders, new_placeholders):
+            if not isinstance(old_node.meta["val"], torch.Tensor):
+                new_node.meta["val"] = old_node.meta["val"]
+
+            if (
+                new_node.target in new_graph_signature.inputs_to_parameters
+                or new_node.target in new_graph_signature.inputs_to_buffers
+            ):
+                for k, v in old_node.meta.items():
+                    new_node.meta[k] = v
+
+        # TODO unfortunately preserving graph-level metadata is not
+        # working well with aot_export. So we manually copy it.
+        # (The node-level meta is addressed above.)
+        gm.meta.update(self.graph_module.meta)
+
+        new_range_constraints = _get_updated_range_constraints(gm)
+
+        constants = lift_constants_pass(gm, new_graph_signature, ConstantAttrMap())
+        for k, v in constants.items():
+            assert k not in self.constants
+            self.constants[k] = v
+
+        _replace_sym_size_ops_pass(gm)
+        exported_program = ExportedProgram(
+            root=gm,
+            graph=gm.graph,
+            graph_signature=new_graph_signature,
+            state_dict=self.state_dict,
+            range_constraints=new_range_constraints,
+            module_call_graph=copy.deepcopy(self.module_call_graph),
+            example_inputs=self.example_inputs,
+            verifier=self.verifier,
+            constants=self.constants,
+        )
+
+        if len(new_range_constraints) > 0:
+            exported_program = exported_program._transform_do_not_use(
+                _AddRuntimeAssertionsForInlineConstraintsPass(new_range_constraints)
+            )
+
+        return exported_program
+
+    def _transform_do_not_use(self, *passes: PassType) -> "ExportedProgram":
+        pm = PassManager(list(passes))
+        res = pm(self.graph_module)
+        transformed_gm = res.graph_module if res is not None else self.graph_module
+        assert transformed_gm is not None
+
+        if transformed_gm is self.graph_module and not res.modified:
+            return self
+
+        # TODO(zhxchen17) Remove this.
+        def _get_updated_graph_signature(
+            old_signature: ExportGraphSignature,
+            new_gm: torch.fx.GraphModule,
+        ) -> ExportGraphSignature:
+            """
+            Update the graph signature's input/output specs with the new node names.
+            """
+            new_input_specs = []
+            for i, node in enumerate(new_gm.graph.nodes):
+                if node.op != "placeholder":
+                    break
+
+                assert i < len(
+                    old_signature.input_specs
+                ), "Number of inputs changed after transformation"
+                old_input_spec = old_signature.input_specs[i]
+                arg = (
+                    old_input_spec.arg
+                    if isinstance(
+                        old_input_spec.arg, (ConstantArgument, CustomObjArgument)
+                    )
+                    else type(old_input_spec.arg)(node.name)
+                )
+                new_input_specs.append(
+                    InputSpec(
+                        old_input_spec.kind,
+                        arg,
+                        old_input_spec.target,
+                        old_input_spec.persistent,
+                    )
+                )
+
+            output_node = list(new_gm.graph.nodes)[-1]
+            assert output_node.op == "output"
+
+            new_output_specs = []
+            for i, node in enumerate(output_node.args[0]):
+                assert i < len(
+                    old_signature.output_specs
+                ), "Number of outputs changed after transformation"
+                old_output_spec = old_signature.output_specs[i]
+                arg = (
+                    old_output_spec.arg
+                    if isinstance(
+                        old_output_spec.arg, (ConstantArgument, CustomObjArgument)
+                    )
+                    else type(old_output_spec.arg)(node.name)
+                )
+                new_output_specs.append(
+                    OutputSpec(old_output_spec.kind, arg, old_output_spec.target)
+                )
+
+            new_signature = ExportGraphSignature(
+                input_specs=new_input_specs, output_specs=new_output_specs
+            )
+            return new_signature
+
+        transformed_ep = ExportedProgram(
+            root=transformed_gm,
+            graph=transformed_gm.graph,
+            graph_signature=_get_updated_graph_signature(
+                self.graph_signature, transformed_gm
+            ),
+            state_dict=self.state_dict,
+            range_constraints=_get_updated_range_constraints(transformed_gm),
+            module_call_graph=copy.deepcopy(self._module_call_graph),
+            example_inputs=self.example_inputs,
+            verifier=self.verifier,
+            constants=self.constants,
+        )
+        transformed_ep.graph_module.meta.update(self.graph_module.meta)
+        transformed_ep.graph_module.meta.update(res.graph_module.meta)
+        return transformed_ep
+
+    def _check_input_constraints(self, flat_args_with_path):
+        from torch._export.utils import _check_input_constraints_for_graph
+
+        placeholders = [p for p in self.graph.nodes if p.op == "placeholder"]
+        input_placeholders = [
+            p
+            for p, s in zip(placeholders, self.graph_signature.input_specs)
+            if s.kind == InputKind.USER_INPUT
+        ]
+        _check_input_constraints_for_graph(
+            input_placeholders, flat_args_with_path, self.range_constraints
+        )
+
+    def _validate(self):
+        self.verifier().check(self)
+
+    # TODO(zhxchen17) Formalize this.
+    def _update(
+        self, graph_module, graph_signature, state_dict=None
+    ) -> "ExportedProgram":
+        return ExportedProgram(
+            root=graph_module,
+            graph=graph_module.graph,
+            graph_signature=graph_signature,
+            state_dict=state_dict or self.state_dict,
+            range_constraints=copy.deepcopy(self.range_constraints),
+            module_call_graph=copy.deepcopy(self._module_call_graph),
+            example_inputs=self.example_inputs,
+            verifier=self.verifier,
+            tensor_constants=self.tensor_constants,
+        )
+
+
+def _get_updated_range_constraints(
+    gm: torch.fx.GraphModule,
+) -> "Dict[sympy.Symbol, Any]":
+    def get_shape_env(gm):
+        vals = [
+            node.meta["val"]
+            for node in gm.graph.nodes
+            if node.meta.get("val", None) is not None
+        ]
+        from torch._guards import detect_fake_mode
+
+        fake_mode = detect_fake_mode(vals)
+        if fake_mode is not None:
+            return fake_mode.shape_env
+        for v in vals:
+            if isinstance(v, torch.SymInt):
+                return v.node.shape_env
+
+    shape_env = get_shape_env(gm)
+    if shape_env is None:
+        return {}
+    range_constraints = {
+        k: v
+        for k, v in shape_env.var_to_range.items()
+        if k not in shape_env.replacements
+    }
+    # Only when we have an unbacked symint, and it's used as constructor inputs,
+    # runtime_var_to_range will make a difference compared to var_to_range.
+    # e.g. [2, oo) -> [0, oo)
+    for k, v in shape_env.var_to_range.items():
+        if k not in shape_env.replacements:
+            range_constraints[k] = v
+    return range_constraints
+
+
+def _create_graph_module_for_export(root, graph):
+    try:
+        gm = torch.fx.GraphModule(root, graph)
+    except SyntaxError:
+        # If custom objects stored in memory are being used in the graph,
+        # the generated python code will result in a syntax error on the custom
+        # object, since it is unable to parse the in-memory object. However
+        # we can still run the graph eagerly through torch.fx.Interpreter,
+        # so we will bypass this error.
+        warnings.warn(
+            "Unable to execute the generated python source code from "
+            "the graph. The graph module will no longer be directly callable, "
+            "but you can still run the ExportedProgram, and if needed, you can "
+            "run the graph module eagerly using torch.fx.Interpreter."
+        )
+        gm = torch.fx.GraphModule(root, torch.fx.Graph())
+        gm._graph = graph
+
+    return gm
diff --git a/MLPY/Lib/site-packages/torch/export/graph_signature.py b/MLPY/Lib/site-packages/torch/export/graph_signature.py
new file mode 100644
index 0000000000000000000000000000000000000000..57919aae7a1cca6166dedad764c7bbadbc66ca2c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/export/graph_signature.py
@@ -0,0 +1,504 @@
+import dataclasses
+from enum import auto, Enum
+from typing import Collection, Dict, List, Mapping, Optional, Set, Tuple, Union
+
+
+__all__ = [
+    "ConstantArgument",
+    "CustomObjArgument",
+    "ExportBackwardSignature",
+    "ExportGraphSignature",
+    "InputKind",
+    "InputSpec",
+    "OutputKind",
+    "OutputSpec",
+    "SymIntArgument",
+    "TensorArgument",
+]
+
+
+@dataclasses.dataclass
+class TensorArgument:
+    name: str
+
+
+@dataclasses.dataclass
+class SymIntArgument:
+    name: str
+
+
+@dataclasses.dataclass
+class CustomObjArgument:
+    name: str
+    class_fqn: str
+
+
+@dataclasses.dataclass
+class ConstantArgument:
+    value: Union[int, float, bool, None]
+
+
+ArgumentSpec = Union[
+    TensorArgument, SymIntArgument, ConstantArgument, CustomObjArgument
+]
+
+
+class InputKind(Enum):
+    USER_INPUT = auto()
+    PARAMETER = auto()
+    BUFFER = auto()
+    CONSTANT_TENSOR = auto()
+    CUSTOM_OBJ = auto()
+    TOKEN = auto()
+
+
+@dataclasses.dataclass
+class InputSpec:
+    kind: InputKind
+    arg: ArgumentSpec
+    target: Optional[str]
+    persistent: Optional[bool] = None
+
+    def __post_init__(self):
+        if self.kind == InputKind.BUFFER:
+            assert (
+                self.persistent is not None
+            ), "Failed to specify persistent flag on BUFFER."
+        assert isinstance(
+            self.arg,
+            (TensorArgument, SymIntArgument, ConstantArgument, CustomObjArgument),
+        ), f"got {type(self.arg)}"
+
+
+class OutputKind(Enum):
+    USER_OUTPUT = auto()
+    LOSS_OUTPUT = auto()
+    BUFFER_MUTATION = auto()
+    GRADIENT_TO_PARAMETER = auto()
+    GRADIENT_TO_USER_INPUT = auto()
+    USER_INPUT_MUTATION = auto()
+    TOKEN = auto()
+
+
+@dataclasses.dataclass
+class OutputSpec:
+    kind: OutputKind
+    arg: ArgumentSpec
+    target: Optional[str]
+
+    def __post_init__(self):
+        assert isinstance(self.arg, (TensorArgument, SymIntArgument, ConstantArgument))
+
+
+def _sig_to_specs(
+    *,
+    user_inputs: Set[str],
+    inputs_to_parameters: Mapping[str, str],
+    inputs_to_buffers: Mapping[str, str],
+    user_outputs: Set[str],
+    buffer_mutations: Mapping[str, str],
+    user_input_mutations: Mapping[str, str],
+    grad_params: Mapping[str, str],
+    grad_user_inputs: Mapping[str, str],
+    loss_output: Optional[str],
+    inputs: List[ArgumentSpec],
+    outputs: List[ArgumentSpec],
+    input_tokens: List[str],
+    output_tokens: List[str],
+) -> Tuple[List[InputSpec], List[OutputSpec]]:
+    def to_input_spec(inp: ArgumentSpec) -> InputSpec:
+        if not isinstance(inp, TensorArgument):
+            return InputSpec(kind=InputKind.USER_INPUT, arg=inp, target=None)
+        name = inp.name
+        if name in user_inputs:
+            return InputSpec(kind=InputKind.USER_INPUT, arg=inp, target=None)
+        elif name in inputs_to_parameters:
+            return InputSpec(
+                kind=InputKind.PARAMETER,
+                arg=inp,
+                target=inputs_to_parameters[name],
+            )
+        elif name in inputs_to_buffers:
+            return InputSpec(
+                kind=InputKind.BUFFER,
+                arg=inp,
+                target=inputs_to_buffers[name],
+                # Mark as True for now; we will fix this up to distinguish
+                # persistent from non-persistent later in tracing.
+                # See: rewrite_non_persistent_buffers()
+                # TODO(suo): this is horrible.
+                persistent=True,
+            )
+        elif name in input_tokens:
+            return InputSpec(kind=InputKind.TOKEN, arg=inp, target=None)
+        else:
+            raise AssertionError(f"Unknown tensor input kind: {name}")
+
+    def to_output_spec(idx: int, o: ArgumentSpec) -> OutputSpec:
+        if not isinstance(o, TensorArgument):
+            return OutputSpec(kind=OutputKind.USER_OUTPUT, arg=o, target=None)
+        name = o.name
+        if idx < len(buffer_mutations) + len(user_input_mutations) + len(output_tokens):
+            if name in buffer_mutations:
+                return OutputSpec(
+                    kind=OutputKind.BUFFER_MUTATION,
+                    arg=o,
+                    target=buffer_mutations[name],
+                )
+            elif name in user_input_mutations:
+                return OutputSpec(
+                    kind=OutputKind.USER_INPUT_MUTATION,
+                    arg=o,
+                    target=user_input_mutations[name],
+                )
+            elif name in output_tokens:
+                return OutputSpec(kind=OutputKind.TOKEN, arg=o, target=None)
+            else:
+                raise AssertionError(f"Unknown tensor mutation kind: {name}")
+        else:
+            if name in user_outputs:
+                return OutputSpec(kind=OutputKind.USER_OUTPUT, arg=o, target=None)
+
+            elif name in grad_params:
+                return OutputSpec(
+                    kind=OutputKind.GRADIENT_TO_PARAMETER,
+                    arg=o,
+                    target=grad_params[name],
+                )
+            elif name in grad_user_inputs:
+                return OutputSpec(
+                    kind=OutputKind.GRADIENT_TO_USER_INPUT,
+                    arg=o,
+                    target=grad_user_inputs[name],
+                )
+            elif name == loss_output:
+                return OutputSpec(kind=OutputKind.LOSS_OUTPUT, arg=o, target=None)
+
+            else:
+                raise AssertionError(f"Unknown tensor output kind: {name}")
+
+    input_specs = [to_input_spec(inp) for inp in inputs]
+    output_specs = [to_output_spec(idx, o) for idx, o in enumerate(outputs)]
+    return input_specs, output_specs
+
+
+@dataclasses.dataclass
+class ExportBackwardSignature:
+    gradients_to_parameters: Dict[str, str]
+    gradients_to_user_inputs: Dict[str, str]
+    loss_output: str
+
+
+@dataclasses.dataclass
+class ExportGraphSignature:
+    """
+    :class:`ExportGraphSignature` models the input/output signature of Export Graph,
+    which is an fx.Graph with stronger invariant guarantees.
+
+    Export Graph is functional and does not access "states" like parameters
+    or buffers within the graph via ``getattr`` nodes. Instead, :func:`export`
+    guarantees that parameters, buffers, and constant tensors are lifted out of
+    the graph as inputs.  Similarly, any mutations to buffers are not included
+    in the graph either; instead, the updated values of mutated buffers are
+    modeled as additional outputs of Export Graph.
+
+    The ordering of all inputs and outputs is::
+
+        Inputs = [*parameters_buffers_constant_tensors, *flattened_user_inputs]
+        Outputs = [*mutated_inputs, *flattened_user_outputs]
+
+    e.g. if the following module is exported::
+
+        class CustomModule(nn.Module):
+            def __init__(self):
+                super(CustomModule, self).__init__()
+
+                # Define a parameter
+                self.my_parameter = nn.Parameter(torch.tensor(2.0))
+
+                # Define two buffers
+                self.register_buffer('my_buffer1', torch.tensor(3.0))
+                self.register_buffer('my_buffer2', torch.tensor(4.0))
+
+            def forward(self, x1, x2):
+                # Use the parameter, buffers, and both inputs in the forward method
+                output = (x1 + self.my_parameter) * self.my_buffer1 + x2 * self.my_buffer2
+
+                # Mutate one of the buffers (e.g., increment it by 1)
+                self.my_buffer2.add_(1.0) # In-place addition
+
+                return output
+
+    Resulting Graph would be::
+
+        graph():
+            %arg0_1 := placeholder[target=arg0_1]
+            %arg1_1 := placeholder[target=arg1_1]
+            %arg2_1 := placeholder[target=arg2_1]
+            %arg3_1 := placeholder[target=arg3_1]
+            %arg4_1 := placeholder[target=arg4_1]
+            %add_tensor := call_function[target=torch.ops.aten.add.Tensor](args = (%arg3_1, %arg0_1), kwargs = {})
+            %mul_tensor := call_function[target=torch.ops.aten.mul.Tensor](args = (%add_tensor, %arg1_1), kwargs = {})
+            %mul_tensor_1 := call_function[target=torch.ops.aten.mul.Tensor](args = (%arg4_1, %arg2_1), kwargs = {})
+            %add_tensor_1 := call_function[target=torch.ops.aten.add.Tensor](args = (%mul_tensor, %mul_tensor_1), kwargs = {})
+            %add_tensor_2 := call_function[target=torch.ops.aten.add.Tensor](args = (%arg2_1, 1.0), kwargs = {})
+            return (add_tensor_2, add_tensor_1)
+
+    Resulting ExportGraphSignature would be::
+
+        ExportGraphSignature(
+            input_specs=[
+                InputSpec(kind=<InputKind.PARAMETER: 2>, arg=TensorArgument(name='arg0_1'), target='my_parameter'),
+                InputSpec(kind=<InputKind.BUFFER: 3>, arg=TensorArgument(name='arg1_1'), target='my_buffer1'),
+                InputSpec(kind=<InputKind.BUFFER: 3>, arg=TensorArgument(name='arg2_1'), target='my_buffer2'),
+                InputSpec(kind=<InputKind.USER_INPUT: 1>, arg=TensorArgument(name='arg3_1'), target=None),
+                InputSpec(kind=<InputKind.USER_INPUT: 1>, arg=TensorArgument(name='arg4_1'), target=None)
+            ],
+            output_specs=[
+                OutputSpec(kind=<OutputKind.BUFFER_MUTATION: 3>, arg=TensorArgument(name='add_2'), target='my_buffer2'),
+                OutputSpec(kind=<OutputKind.USER_OUTPUT: 1>, arg=TensorArgument(name='add_1'), target=None)
+            ]
+        )
+    """
+
+    input_specs: List[InputSpec]
+    output_specs: List[OutputSpec]
+
+    # A list of parameters uniquely identified by mangled fully qualified name
+    @property
+    def parameters(self) -> Collection[str]:
+        # TODO Make this tuple.
+        return [
+            s.target
+            for s in self.input_specs
+            if s.kind == InputKind.PARAMETER
+            if isinstance(s.target, str)
+        ]
+
+    # A list of buffers uniquely identified by mangled fully qualified name
+    @property
+    def buffers(self) -> Collection[str]:
+        # TODO Make this tuple.
+        return [
+            s.target
+            for s in self.input_specs
+            if s.kind == InputKind.BUFFER
+            if isinstance(s.target, str)
+        ]
+
+    @property
+    def non_persistent_buffers(self) -> Collection[str]:
+        return [
+            s.target
+            for s in self.input_specs
+            if s.kind == InputKind.BUFFER
+            if s.persistent is False
+            if isinstance(s.target, str)
+        ]
+
+    # A list of lifted constant tensors
+    @property
+    def lifted_tensor_constants(self) -> Collection[str]:
+        # TODO Make this tuple.
+        return [
+            s.target
+            for s in self.input_specs
+            if s.kind == InputKind.CONSTANT_TENSOR
+            if isinstance(s.target, str)
+        ]
+
+    @property
+    def lifted_custom_objs(self) -> Collection[str]:
+        # TODO Make this tuple.
+        return [
+            s.target
+            for s in self.input_specs
+            if s.kind == InputKind.CUSTOM_OBJ
+            if isinstance(s.target, str)
+        ]
+
+    # Graph node names of pytree-flattened inputs of original program
+    @property
+    def user_inputs(self) -> Collection[Union[int, float, bool, None, str]]:
+        user_inputs: List[Union[int, float, bool, None, str]] = []
+        for s in self.input_specs:
+            if s.kind != InputKind.USER_INPUT:
+                continue
+
+            if isinstance(s.arg, (TensorArgument, SymIntArgument, CustomObjArgument)):
+                user_inputs.append(s.arg.name)
+            elif isinstance(s.arg, ConstantArgument):
+                user_inputs.append(s.arg.value)
+            else:
+                raise RuntimeError(f"{s.arg} is not a valid user inputs")
+        return tuple(user_inputs)
+
+    # Graph node names of pytree-flattened outputs of original program
+    @property
+    def user_outputs(self) -> Collection[Union[int, float, bool, None, str]]:
+        user_outputs: List[Union[int, float, bool, None, str]] = []
+        for s in self.output_specs:
+            if s.kind != OutputKind.USER_OUTPUT:
+                continue
+
+            if isinstance(s.arg, (TensorArgument, SymIntArgument)):
+                user_outputs.append(s.arg.name)
+            elif isinstance(s.arg, ConstantArgument):
+                user_outputs.append(s.arg.value)
+            else:
+                raise RuntimeError(f"{s.arg} is not a valid user output")
+        return tuple(user_outputs)
+
+    # A dictionary mapping graph input node names to parameters. If a graph input
+    # name is found in this dictionary, it is guaranteed to be a lifted parameter.
+    @property
+    def inputs_to_parameters(self) -> Mapping[str, str]:
+        return {
+            s.arg.name: s.target
+            for s in self.input_specs
+            if s.kind == InputKind.PARAMETER
+            and isinstance(s.arg, TensorArgument)
+            and isinstance(s.target, str)
+        }
+
+    # A dictionary mapping graph input node names to buffers. If a graph input
+    # name is found in this dictionary, it is guaranteed to be a lifted buffer.
+    @property
+    def inputs_to_buffers(self) -> Mapping[str, str]:
+        return {
+            s.arg.name: s.target  # type: ignore[union-attr, misc]
+            for s in self.input_specs
+            if s.kind == InputKind.BUFFER
+            and isinstance(s.arg, TensorArgument)
+            and isinstance(s.target, str)
+        }
+
+    # A dictionary mapping graph output node names to buffers that are mutated in the
+    # original program. Buffers that are not mutated will not be found in this dictionary.
+    @property
+    def buffers_to_mutate(self) -> Mapping[str, str]:
+        return {
+            s.arg.name: s.target
+            for s in self.output_specs
+            if s.kind == OutputKind.BUFFER_MUTATION
+            and isinstance(s.arg, TensorArgument)
+            and isinstance(s.target, str)
+        }
+
+    @property
+    def user_inputs_to_mutate(self) -> Mapping[str, str]:
+        return {
+            s.arg.name: s.target
+            for s in self.output_specs
+            if s.kind == OutputKind.USER_INPUT_MUTATION
+            and isinstance(s.arg, TensorArgument)
+            and isinstance(s.target, str)
+        }
+
+    # A dictionary mapping graph input node names to lifted tensor constants.
+    @property
+    def inputs_to_lifted_tensor_constants(self) -> Mapping[str, str]:
+        return {
+            s.arg.name: s.target
+            for s in self.input_specs
+            if s.kind == InputKind.CONSTANT_TENSOR
+            and isinstance(s.arg, TensorArgument)
+            and isinstance(s.target, str)
+        }
+
+    @property
+    def inputs_to_lifted_custom_objs(self) -> Mapping[str, str]:
+        return {
+            s.arg.name: s.target
+            for s in self.input_specs
+            if s.kind == InputKind.CUSTOM_OBJ
+            and isinstance(s.arg, CustomObjArgument)
+            and isinstance(s.target, str)
+        }
+
+    @property
+    def backward_signature(self) -> Optional[ExportBackwardSignature]:
+        loss_output = None
+        gradients_to_parameters: Dict[str, str] = {}
+        gradients_to_user_inputs: Dict[str, str] = {}
+        for spec in self.output_specs:
+            if spec.kind == OutputKind.LOSS_OUTPUT:
+                assert loss_output is None
+                assert isinstance(spec.arg, TensorArgument)
+                loss_output = spec.arg.name
+            elif spec.kind == OutputKind.GRADIENT_TO_PARAMETER:
+                assert isinstance(spec.target, str)
+                assert isinstance(spec.arg, TensorArgument)
+                gradients_to_parameters[spec.arg.name] = spec.target
+            elif spec.kind == OutputKind.GRADIENT_TO_USER_INPUT:
+                assert isinstance(spec.target, str)
+                assert isinstance(spec.arg, TensorArgument)
+                gradients_to_user_inputs[spec.arg.name] = spec.target
+
+        if loss_output is None:
+            return None
+
+        return ExportBackwardSignature(
+            loss_output=loss_output,
+            gradients_to_parameters=gradients_to_parameters,
+            gradients_to_user_inputs=gradients_to_user_inputs,
+        )
+
+    # Map from assertion dependency token index to assertion dep token output
+    # name in output. The shape of output after aot_autograd will be like:
+    # (updated_inputs, user_outputs, dep_token).
+    @property
+    def assertion_dep_token(self) -> Optional[Mapping[int, str]]:
+        return None
+
+    @property
+    def input_tokens(self) -> List[str]:
+        input_tokens = []
+        for s in self.input_specs:
+            if s.kind == InputKind.TOKEN:
+                assert isinstance(s.arg, TensorArgument)
+                input_tokens.append(s.arg.name)
+        return input_tokens
+
+    @property
+    def output_tokens(self) -> List[str]:
+        output_tokens = []
+        for s in self.output_specs:
+            if s.kind == OutputKind.TOKEN:
+                assert isinstance(s.arg, TensorArgument)
+                output_tokens.append(s.arg.name)
+        return output_tokens
+
+    def __post_init__(self) -> None:
+        assertion_dep_token = self.assertion_dep_token
+        if assertion_dep_token is None:
+            return
+        assert len(assertion_dep_token) == 1
+        assertion_dep_token_index = next(iter(assertion_dep_token.keys()))
+        assert (
+            len(self.user_outputs) + len(self.buffers_to_mutate)
+            == assertion_dep_token_index
+        )
+
+    def replace_all_uses(self, old: str, new: str):
+        """
+        Replace all uses of the old name with the new name in the signature.
+        """
+        assert isinstance(old, str)
+        assert isinstance(new, str)
+        arg_types = (TensorArgument, SymIntArgument, CustomObjArgument)
+        for o in self.output_specs:
+            if isinstance(o.arg, arg_types):
+                if o.arg.name == old:
+                    o.arg.name = new
+        for i in self.input_specs:
+            if isinstance(i.arg, arg_types):
+                if i.arg.name == old:
+                    i.arg.name = new
+
+    def get_replace_hook(self):
+        def _(old, new, user):
+            if user.op in ("output", "input"):
+                self.replace_all_uses(old.name, new)
+
+        return _
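+
+# Illustrative note (not upstream code): when a pass renames graph nodes, the
+# signature can be kept in sync explicitly, e.g. for a hypothetical rename:
+#
+#     old_name = placeholder_node.name
+#     placeholder_node.name = "x"
+#     ep.graph_signature.replace_all_uses(old_name, placeholder_node.name)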
diff --git a/MLPY/Lib/site-packages/torch/export/unflatten.py b/MLPY/Lib/site-packages/torch/export/unflatten.py
new file mode 100644
index 0000000000000000000000000000000000000000..90d46ca1792359ff6a5487f5eb8e91e4b4a6dbb5
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/export/unflatten.py
@@ -0,0 +1,860 @@
+import abc
+import copy
+import operator
+from copy import deepcopy
+from enum import Enum
+from itertools import chain
+from typing import Any, cast, Dict, List, Optional, Union
+
+import torch
+import torch.fx._pytree as fx_pytree
+import torch.utils._pytree as pytree
+from torch.export._tree_utils import reorder_kwargs
+from torch.export.exported_program import (
+    ConstantArgument,
+    ExportedProgram,
+    ModuleCallSignature,
+    SymIntArgument,
+    TensorArgument,
+)
+from torch.fx._symbolic_trace import is_fx_tracing
+from torch.utils._pytree import GetAttrKey, SequenceKey
+
+__all__ = ["InterpreterModule", "UnflattenedModule", "unflatten", "FlatArgsAdapter"]
+
+
+class _AttrKind(Enum):
+    PARAMETER = "parameter"
+    BUFFER = "buffer"
+    CONSTANT = "constant"
+
+
+# Assign attribute 'from_obj' to the qualified name 'target' on 'to_module'.
+# This installs empty Modules where none exist yet if they are subpaths of 'target'.
+def _assign_attr(
+    from_obj: Union[torch.Tensor, torch.ScriptObject],
+    to_module: torch.nn.Module,
+    target: str,
+    attr_kind: _AttrKind,
+    persistent: bool = True,
+):
+    *prefix, field = target.split(".")
+    for item in prefix:
+        t = getattr(to_module, item, None)
+
+        if t is None:
+            t = torch.nn.Module()
+            setattr(to_module, item, t)
+        to_module = t
+
+    if attr_kind == _AttrKind.PARAMETER:
+        assert isinstance(from_obj, torch.nn.Parameter)
+        to_module.register_parameter(field, from_obj)
+    elif attr_kind == _AttrKind.BUFFER:
+        assert isinstance(from_obj, torch.Tensor)
+        to_module.register_buffer(field, from_obj, persistent=persistent)
+    elif attr_kind == _AttrKind.CONSTANT:
+        assert isinstance(from_obj, (torch.Tensor, torch.ScriptObject))
+        setattr(to_module, field, from_obj)
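+
+# Rough illustration (hypothetical names): assigning to a dotted path creates
+# intermediate empty Modules as needed, e.g.
+#
+#     root = torch.nn.Module()
+#     _assign_attr(torch.zeros(3), root, "block.sub.stat",
+#                  attr_kind=_AttrKind.BUFFER, persistent=False)
+#     assert isinstance(root.block.sub, torch.nn.Module)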
+
+
+class InterpreterModule(torch.nn.Module):
+    """A module that uses torch.fx.Interpreter to execute instead of the usual
+    codegen that GraphModule uses. This provides better stack trace information
+    and makes it easier to debug execution.
+    """
+
+    def __init__(
+        self,
+        graph: torch.fx.Graph,
+    ):
+        super().__init__()
+        self.graph = graph
+        self.graph.owning_module = self
+
+    def forward(self, *args, **kwargs):
+        assert self.graph_module is not None, "Didn't finalize this InterpreterModule"
+        if torch.compiler.is_dynamo_compiling():
+            # Dynamo cannot trace through torch.fx.Interpreter, so fall back to
+            # GraphModule codegen in this instance.
+            return self.graph_module(*args, **kwargs)
+        else:
+            if kwargs:
+                # Handle **kwargs. FX only natively supports positional
+                # arguments (through placeholders). So in order to pass in
+                # kwargs, we must correspond the names of the placeholders with
+                # the keys in the kwarg dict.
+                arg_list = list(args)
+                kwarg_names = self.arg_names[len(arg_list) :]
+                for kwarg_name in kwarg_names:
+                    if kwarg_name in kwargs:
+                        arg_list.append(kwargs[kwarg_name])
+
+                # Assert that the kwargs passed in exactly match the positional
+                # arguments specified by the GraphModule. This should be
+                # guaranteed by the unflattening process.
+                assert len(kwarg_names) == len(kwargs)
+                assert len(arg_list) == len(self.arg_names)
+                args = tuple(arg_list)
+
+            return torch.fx.Interpreter(self, graph=self.graph).run(
+                *args, enable_io_processing=False
+            )
+
+    def finalize(self):
+        # We need to "finalize" because GraphModule populates its own state_dict
+        # based on the get_attrs observed in the graph. So we need to fully
+        # construct the graph and call _sink_params before generating this
+        # GraphModule.
+
+        # need to set `graph_module` directly on the dict to avoid it getting
+        # registered as a submodule.
+        self.__dict__["graph_module"] = torch.fx.GraphModule(self, self.graph)
+        self.graph.lint()
+
+        # Cache arg names for kwarg handling (see forward())
+        self.arg_names = []
+        for node in self.graph.nodes:
+            if node.op == "placeholder":
+                self.arg_names.append(node.target)
+
+
+class FlatArgsAdapter(abc.ABC):
+    """
+    Adapts input arguments with ``input_spec`` to align with ``target_spec``.
+    """
+
+    @abc.abstractmethod
+    def adapt(
+        self,
+        target_spec: pytree.TreeSpec,
+        input_spec: pytree.TreeSpec,
+        input_args: List[Any],
+    ) -> List[Any]:
+        """NOTE: This adapter may mutate given ``input_args_with_path``."""
+        ...
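+
+    # Illustrative subclass sketch (hypothetical, not shipped with the library):
+    # an adapter that simply drops trailing extra leaves so the number of args
+    # matches the exported module's TreeSpec.
+    #
+    #     class DropExtraArgsAdapter(FlatArgsAdapter):
+    #         def adapt(self, target_spec, input_spec, input_args):
+    #             return input_args[: target_spec.num_leaves]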
+
+
+class UnflattenedModule(torch.nn.Module):
+    def __init__(
+        self,
+        export_module: ExportedProgram,
+        flat_args_adapter: Optional[FlatArgsAdapter] = None,
+    ):
+        super().__init__()
+        if export_module.graph_signature.backward_signature is not None:
+            raise ValueError("Unflattening on JointExportModule NYI")
+
+        export_graph = deepcopy(export_module.graph)
+        self.graph_signature = deepcopy(export_module.graph_signature)
+        self.graph = torch.fx.Graph()
+        self.module_call_graph = deepcopy(export_module.module_call_graph)
+        self.flat_args_adapter = flat_args_adapter
+        # Flag to indicate whether args have been adapted.
+        self.adapted = False
+
+        _inplace_buffer_mutations(export_graph, self.graph_signature)
+        _outline_submodules(export_graph, self)
+
+        self.range_constraints = export_module.range_constraints
+        self.equality_constraints: List = []
+
+        state_dict = export_module.state_dict
+        for name in self.graph_signature.parameters:
+            cloned = torch.nn.Parameter(state_dict[name].clone())
+            _assign_attr(
+                cloned,
+                self,
+                name,
+                attr_kind=_AttrKind.PARAMETER,
+            )
+
+        non_persistent_buffers = set(self.graph_signature.non_persistent_buffers)
+        for name in self.graph_signature.buffers:
+            if name in non_persistent_buffers:
+                persistent = False
+                cloned = export_module.constants[name].clone()
+            else:
+                persistent = True
+                cloned = state_dict[name].clone()
+
+            _assign_attr(
+                cloned,
+                self,
+                name,
+                attr_kind=_AttrKind.BUFFER,
+                persistent=persistent,
+            )
+
+        for fqn in chain(
+            self.graph_signature.lifted_tensor_constants,
+            self.graph_signature.lifted_custom_objs,
+        ):
+            constant = export_module.constants[fqn]
+            if isinstance(constant, torch.Tensor):
+                constant = constant.clone()
+            _assign_attr(
+                constant,
+                self,
+                fqn,
+                attr_kind=_AttrKind.CONSTANT,
+            )
+
+        inputs_to_state: Dict[str, str] = {
+            **self.graph_signature.inputs_to_parameters,
+            **self.graph_signature.inputs_to_buffers,
+            **self.graph_signature.inputs_to_lifted_tensor_constants,
+            **self.graph_signature.inputs_to_lifted_custom_objs,
+        }
+
+        _sink_params(self, inputs_to_state, [])
+        # Check that all input nodes have been processed.
+        for module in self.modules():
+            if not isinstance(module, torch.fx.GraphModule):
+                continue
+            for node in module.graph.nodes:
+                if node.op != "placeholder":
+                    continue
+                assert node.name not in inputs_to_state
+
+        # Cache so we don't have to compute this every time.
+        # NOTE: this needs to be kept in sync with the placeholders in
+        # self.graph, but currently we have no way to guarantee that.
+        self.input_placeholders = [
+            node for node in self.graph.nodes if node.op == "placeholder"
+        ]
+        self.check_input_constraints = True
+        assert self.module_call_graph[0].fqn == ""
+
+    def forward(self, *args, **kwargs):
+        signature = self.module_call_graph[0].signature
+
+        reordered_kwargs = reorder_kwargs(kwargs, signature.in_spec)
+
+        flat_args_with_path, in_spec = pytree.tree_flatten_with_path(
+            (args, reordered_kwargs)
+        )
+        flat_args = [x[1] for x in flat_args_with_path]
+        if is_fx_tracing():
+            return_val = torch.fx.Interpreter(self, graph=self.graph).run(
+                *flat_args, enable_io_processing=False
+            )
+            # For a scalar return value, fx.Graph wraps it in a tuple
+            if isinstance(return_val, tuple) and len(return_val) == 1:
+                return return_val[0]
+            return return_val
+
+        if in_spec != signature.in_spec:
+            if not self.adapted:
+                print(
+                    "Input treespec does not match with exported module's: \n"
+                    f"Input treespec: {in_spec}. ",
+                    f"Exported module treespec: {signature.in_spec}",
+                )
+            if self.flat_args_adapter is None:
+                raise TypeError(
+                    "There is no flat args adapter sepcified. "
+                    "Are you sure you are calling this with the right arguments? "
+                )
+            else:
+                if not self.adapted:
+                    print("Adapting flat arg to match exported module's treespec")
+                flat_args = self.flat_args_adapter.adapt(
+                    target_spec=signature.in_spec,
+                    input_spec=in_spec,
+                    input_args=flat_args,
+                )
+                self.adapted = True
+                if len(flat_args) != signature.in_spec.num_leaves:
+                    raise TypeError(
+                        f"Flat args adaption failed, number of args mismatch "
+                        f"Adatped: {len(flat_args)} \n"
+                        f"Exported module: {signature.in_spec.num_leaves}"
+                    )
+
+        if self.check_input_constraints:
+            # Import here to avoid an unfortunate circular dependency.
+            # TODO(suo): untangle this.
+            from torch._export.utils import _check_input_constraints_for_graph
+
+            if self.adapted is True:
+                # TODO(suo): The FlatArgsAdapter returns a list of flat args,
+                # which we don't have keypaths for. For now, just create a dummy
+                # keypath to associate with the arg.
+                new_flat_args_with_path = [  # type: ignore[var-annotated]
+                    ((SequenceKey(idx=0), GetAttrKey(name="")), arg)
+                    for arg in flat_args
+                ]
+            else:
+                new_flat_args_with_path = flat_args_with_path  # type: ignore[assignment]
+
+            _check_input_constraints_for_graph(
+                self.input_placeholders, new_flat_args_with_path, self.range_constraints
+            )
+        tree_out = torch.fx.Interpreter(self, graph=self.graph).run(
+            *flat_args, enable_io_processing=False
+        )
+        return pytree.tree_unflatten(tree_out, signature.out_spec)
+
+
+def unflatten(
+    module: ExportedProgram, flat_args_adapter: Optional[FlatArgsAdapter] = None
+) -> UnflattenedModule:
+    """Unflatten an ExportedProgram, producing a module with the same module
+    hierarchy as the original eager module. This can be useful if you are trying
+    to use :mod:`torch.export` with another system that expects a module
+    hierarchy instead of the flat graph that :mod:`torch.export` usually produces.
+
+    .. note:: The args/kwargs of unflattened modules will not necessarily match
+        the eager module, so doing a module swap (e.g. :code:`self.submod =
+        new_mod`) will not necessarily work. If you need to swap a module out, you
+        need to set the :code:`preserve_module_call_signature` parameter of
+        :func:`torch.export.export`.
+
+    Args:
+        module (ExportedProgram): The ExportedProgram to unflatten.
+        flat_args_adapter (Optional[FlatArgsAdapter]): Adapt flat args if the input TreeSpec does not match the exported module's.
+
+    Returns:
+        An instance of :class:`UnflattenedModule`, which has the same module
+        hierarchy as the original eager module pre-export.
+    """
+    return UnflattenedModule(module, flat_args_adapter)
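+
+# A minimal usage sketch, not part of the upstream file: `torch.export.export`
+# and calling the resulting UnflattenedModule are real APIs; the module `M`,
+# its submodule name `sub`, and the input shape are illustrative assumptions.
+#
+#     ep = torch.export.export(M(), (torch.randn(2, 4),))
+#     unflat = unflatten(ep)
+#     out = unflat(torch.randn(2, 4))   # same calling convention as M()
+#     print(unflat.sub)                 # submodule hierarchy is restored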
+
+
+def _inplace_buffer_mutations(graph: torch.fx.Graph, graph_signature) -> None:
+    """Transform buffer mutations from their functionalized form into a copy_
+    node in the graph.
+
+    Functionalization represents buffer mutation by passing the buffer as an input and output. So for example, the eager code:
+        def forward(self, x):
+            self.buffer += x
+            return x * x
+
+    Will become a graph that looks like:
+        def forward(self, buffer, x):
+            mutated_buffer = aten.add(buffer, x)
+            mul = aten.mul(x, x)
+            return (mutated_buffer, mul)
+
+    We want to rewrite this in place into something that looks like the original eager code:
+        def forward(self, buffer, x):
+            mutated_buffer = aten.add(buffer, x)
+            buffer.copy_(mutated_buffer)
+            mul = aten.mul(x, x)
+            return (mul,)
+    """
+    output_node = next(iter(reversed(graph.nodes)))
+    assert output_node.op == "output" and len(output_node.args) == 1
+    return_args = output_node.args[0]
+
+    mutation_node_to_buffer = graph_signature.buffers_to_mutate
+    mutations = return_args[: len(mutation_node_to_buffer)]
+    buffers_to_inputs = {v: k for k, v in graph_signature.inputs_to_buffers.items()}
+    input_name_to_node = {
+        node.name: node for node in graph.nodes if node.op == "placeholder"
+    }
+
+    for mutation in mutations:
+        buffer_name = mutation_node_to_buffer[mutation.name]
+        input_name = buffers_to_inputs[buffer_name]
+        input_node = input_name_to_node[input_name]
+
+        with graph.inserting_after(mutation):
+            new_node = graph.create_node(
+                "call_function", torch.ops.aten.copy_, (input_node, mutation)
+            )
+            for k, v in mutation.meta.items():
+                new_node.meta[k] = v
+        # Replace all uses of the previously functional mutation with our copy_ output.
+        mutation.replace_all_uses_with(new_node, lambda x: x is not new_node)
+
+    # Remove the mutated buffer from the graph outputs, since we don't need to
+    # thread it through anymore. We don't need to handle the inputs, which will
+    # be handled by _sink_params.
+    user_outputs = tuple(
+        return_args[len(mutation_node_to_buffer) :],
+    )
+    output_node.args = (user_outputs,)
+
+
+def _is_prefix(candidate, target):
+    """Check whether `candidate` is a prefix of `target`."""
+    return len(candidate) < len(target) and target[: len(candidate)] == candidate
+
+
+def _compute_accessor(parent_fqn: str, child_fqn: str) -> str:
+    if parent_fqn == "":
+        # Handle the root module correctly.
+        return child_fqn
+
+    parent_split = parent_fqn.split(".")
+    child_split = child_fqn.split(".")
+
+    assert (
+        child_split[: len(parent_split)] == parent_split
+    ), f"Child module '{child_fqn}' is not a descendant of parent module '{parent_fqn}'"
+    return ".".join(child_split[len(parent_split) :])
+
+
+def _verify_graph_equivalence(x: torch.nn.Module, y: torch.nn.Module):
+    def graph_dump(graph: torch.fx.Graph) -> str:
+        ret = []
+        nodes_idx: Dict[int, int] = {}
+
+        def arg_dump(arg) -> str:
+            if isinstance(arg, torch.fx.Node):
+                return "%" + str(nodes_idx[id(arg)])
+            return str(arg)
+
+        for i, node in enumerate(graph.nodes):
+            args_dump = [str(arg) for arg in pytree.tree_map(arg_dump, node.args)]
+            args_dump += [
+                f"{key}={value}"
+                for key, value in pytree.tree_map(arg_dump, node.kwargs).items()
+            ]
+            target = node.target if node.op == "call_function" else ""
+            ret.append(f"{i}: {node.op}[{target}]({', '.join(args_dump)})")
+            nodes_idx[id(node)] = i
+        return "\n".join(ret)
+
+    assert graph_dump(x.graph) == graph_dump(y.graph)
+
+
+def _add_spec(gm: torch.nn.Module, spec) -> str:
+    i = 0
+    while hasattr(gm, f"_spec_{i}"):
+        i += 1
+    name = f"_spec_{i}"
+    setattr(gm, name, spec)
+    return name
+
+
+def _generate_flatten(gm: torch.nn.Module, node, spec) -> torch.fx.Node:
+    name = _add_spec(gm, spec)
+    spec_node = gm.graph.get_attr(name)
+    return gm.graph.call_function(fx_pytree.tree_flatten_spec, (node, spec_node))
+
+
+def _generate_unflatten(gm: torch.nn.Module, nodes, spec) -> torch.fx.Node:
+    name = _add_spec(gm, spec)
+    spec_node = gm.graph.get_attr(name)
+    return gm.graph.call_function(pytree.tree_unflatten, (nodes, spec_node))
+
+
+def _add_submodule(mod: torch.nn.Module, target: str, module_to_add: torch.nn.Module):
+    *prefix, field = target.split(".")
+
+    for item in prefix:
+        submod = getattr(mod, item, None)
+
+        if submod is None:
+            submod = torch.nn.Module()
+            setattr(mod, item, submod)
+
+        if not isinstance(submod, torch.nn.Module):
+            return False
+
+        mod = submod
+
+    mod.add_module(field, module_to_add)
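+
+# Illustrative only (not in the upstream file): _add_submodule(root, "a.b.leaf", leaf)
+# creates empty torch.nn.Module containers for "a" and "a.b" if they do not yet
+# exist, then registers `leaf` as root.a.b.leaf via add_module.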
+
+
+class _ModuleFrame:
+    def __init__(
+        self,
+        flat_graph,
+        nodes,
+        seen_nodes,
+        seen_modules,
+        parent,
+        module_stack,
+        module_id,
+        module_call_graph: Dict[str, ModuleCallSignature],
+        module: Optional[torch.nn.Module] = None,
+    ):
+        self.flat_graph = flat_graph
+        self.nodes = nodes
+        self.seen_nodes = seen_nodes
+        self.seen_modules = seen_modules
+        self.parent = parent
+        self.module_stack = module_stack
+        self.module_id = module_id
+
+        self.module_call_graph = module_call_graph
+        self.verbose = False
+
+        self.fqn = self.module_stack[-1]
+        if module is not None:
+            self.module = module
+        else:
+            self.module = InterpreterModule(torch.fx.Graph())
+        if self.module_id in self.seen_modules:
+            self.cached_graph_module = self.seen_modules[self.module_id]
+        else:
+            self.cached_graph_module = None
+            self.seen_modules[self.module_id] = self.module
+
+        self.graph = self.module.graph
+
+        # Mapping of nodes in the flat graph to nodes in this graph.
+        self.node_map: Dict[torch.fx.Node, torch.fx.Node] = {}
+        self.node_to_placeholder = {}
+
+        self.parent_call_module: Optional[torch.fx.Node] = None
+        if parent is not None:
+            accessor = _compute_accessor(parent.fqn, self.fqn)
+            _add_submodule(
+                parent.module,
+                accessor,
+                self.module
+                if self.cached_graph_module is None
+                else self.cached_graph_module,
+            )
+            self.parent_call_module = parent.graph.call_module(accessor)
+
+        signature = module_call_graph.get(self.fqn)
+        if signature is not None and self.parent is not None:
+            assert signature.in_spec.num_children == 2
+            args_spec = signature.in_spec.children_specs[0]
+            kwargs_spec = signature.in_spec.children_specs[1]
+            assert args_spec.context is None
+            assert kwargs_spec.context is not None
+
+            with self.graph.inserting_after(None):
+                arg_nodes = []
+                for idx in range(args_spec.num_children):
+                    arg_nodes.append(self.graph.placeholder(f"_positional_arg_{idx}"))
+                kwarg_nodes = {}
+                for name in kwargs_spec.context:
+                    kwarg_nodes[name] = self.graph.placeholder(name)
+                flat_args = _generate_flatten(
+                    self.module,
+                    (tuple(arg_nodes), kwarg_nodes),
+                    signature.in_spec,
+                )
+                for idx, arg in enumerate(signature.inputs):
+                    flat_arg_node = self.graph.create_node(
+                        op="call_function",
+                        target=operator.getitem,
+                        args=(flat_args, idx),
+                        name=arg.name
+                        if not isinstance(arg, ConstantArgument)
+                        else f"_constant_{idx}",
+                    )
+                    if isinstance(arg, ConstantArgument):
+                        continue
+                    flat_arg_node.meta = copy.copy(self.seen_nodes[arg.name].meta)
+                    self.node_to_placeholder[self.seen_nodes[arg.name]] = flat_arg_node
+
+            with self.parent.graph.inserting_before(self.parent_call_module):
+                input_nodes: List[Optional[torch.fx.Node]] = []
+                for input in signature.inputs:
+                    if isinstance(input, ConstantArgument) and input.value is None:
+                        input_nodes.append(None)
+                    else:
+                        assert isinstance(input, (TensorArgument, SymIntArgument))
+                        input_nodes.append(
+                            self.parent.remap_input(self.seen_nodes[input.name])
+                        )
+
+                inputs_node = _generate_unflatten(
+                    self.parent.module,
+                    input_nodes,
+                    signature.in_spec,
+                )
+
+                args_node = self.parent.graph.call_function(
+                    operator.getitem, (inputs_node, 0)
+                )
+                kwargs_node = self.parent.graph.call_function(
+                    operator.getitem, (inputs_node, 1)
+                )
+                arg_nodes = [
+                    self.parent.graph.call_function(operator.getitem, (args_node, i))
+                    for i in range(args_spec.num_children)
+                ]
+                kwarg_nodes = {
+                    k: self.parent.graph.call_function(
+                        operator.getitem, (kwargs_node, k)
+                    )
+                    for k in kwargs_spec.context
+                }
+            assert self.parent_call_module is not None
+            self.parent_call_module.args = tuple(arg_nodes)
+            self.parent_call_module.kwargs = kwarg_nodes
+
+    def add_placeholder(self, x):
+        assert x.graph is self.flat_graph
+        # x is not in the subgraph; create a new placeholder for it in the subgraph.
+        with self.graph.inserting_before(None):
+            placeholder_node = self.graph.placeholder(x.name, type_expr=x.type)
+        # copy all meta fields, even if some fields might be irrelevant for
+        # the placeholder node
+        placeholder_node.meta = copy.copy(x.meta)
+        self.node_to_placeholder[x] = placeholder_node
+
+    def remap_input(self, x):
+        assert x.graph is self.flat_graph
+        if x in self.node_map:
+            return self.node_map[x]
+        if x not in self.node_to_placeholder:
+            self.add_placeholder(x)
+            if self.parent_call_module is not None:
+                # Important to *prepend* the output to match how we are
+                # inserting placeholder nodes.
+                self.parent_call_module.insert_arg(0, self.parent.remap_input(x))
+        return self.node_to_placeholder[x]
+
+    def finalize_outputs(self):
+        orig_outputs = []
+
+        signature = self.module_call_graph.get(self.fqn)
+        if signature is not None and self.parent is not None:
+            for output in signature.outputs:
+                if isinstance(output, (TensorArgument, SymIntArgument)):
+                    orig_outputs.append(self.seen_nodes[output.name])
+                else:
+                    raise RuntimeError(
+                        f"Unsupported data type for output node: {output}"
+                    )
+
+            tree_out_node = _generate_unflatten(
+                self.module,
+                tuple(
+                    self.node_map[self.seen_nodes[output.name]]
+                    for output in orig_outputs
+                ),
+                signature.out_spec,
+            )
+            parent_out: Optional[torch.fx.Node] = _generate_flatten(
+                self.parent.module, self.parent_call_module, signature.out_spec
+            )
+            graph_outputs: Union[torch.fx.Node, List[torch.fx.Node]] = tree_out_node
+        else:
+            graph_outputs = []
+            # Iterate through nodes we have copied into self.graph.
+            for orig_node in self.node_map.keys():
+                for user_node in orig_node.users:
+                    if user_node.name not in self.seen_nodes:
+                        # external user node, need to expose as an output
+                        orig_outputs.append(orig_node)
+                        graph_outputs.append(self.node_map[orig_node])
+                        break
+
+            parent_out = self.parent_call_module
+            if len(graph_outputs) == 1:
+                graph_outputs = graph_outputs[0]
+
+        assert isinstance(graph_outputs, (list, torch.fx.Node))
+
+        self.graph.output(graph_outputs)
+
+        # Rewrite outputs in parent module
+        if parent_out is None:
+            return
+
+        parent_out.meta["val"] = (
+            graph_outputs.meta.get("val")
+            if isinstance(graph_outputs, torch.fx.Node)
+            else [o.meta.get("val") for o in graph_outputs]
+        )
+
+        if len(orig_outputs) == 1 and signature is None:
+            self.parent.node_map[orig_outputs[0]] = parent_out
+        else:
+            for i, orig_output in enumerate(orig_outputs):
+                # Use Proxy to record getitem access.
+                proxy_out = torch.fx.Proxy(parent_out)[i].node  # type: ignore[index]
+                proxy_out.meta["val"] = orig_output.meta.get("val")
+                self.parent.node_map[orig_output] = proxy_out
+
+        if self.cached_graph_module is not None:
+            _verify_graph_equivalence(self.cached_graph_module, self.module)
+
+    def copy_node(self, node):
+        self.print("copying", node.format_node())
+        self.node_map[node] = self.graph.node_copy(node, self.remap_input)
+        self.seen_nodes[node.name] = node
+
+    def run_outer(self):
+        i = 0
+        for node in self.flat_graph.nodes:
+            self.print(i, node.meta.get("nn_module_stack"), node.format_node())
+            i += 1
+
+        # Copy all graph inputs
+        node_idx: int = 0
+        node = self.nodes[node_idx]
+        while node.op == "placeholder":
+            self.copy_node(node)
+            node_idx += 1
+            node = self.nodes[node_idx]
+
+        self.run_from(node_idx)
+
+        # Copy graph outputs
+        for node in self.flat_graph.nodes:
+            if node.op == "output":
+                self.copy_node(node)
+
+    def print(self, *args, **kwargs):
+        if self.verbose:
+            print(*args, **kwargs)
+
+    def run_from(self, node_idx):
+        module_idx = 0
+        # Walk through the graph, building up a new graph with the right submodules
+        while node_idx < len(self.nodes):
+            node = self.nodes[node_idx]
+            assert node.op != "placeholder"
+
+            self.print()
+            self.print("STEP", node_idx, node.format_node())
+            self.print(self.module_stack)
+            if node.op == "output":
+                if len(self.module_stack) == 1:
+                    # We want the output node of the original graph to be handled
+                    # specially by the outermost stack frame (in run_outer). So
+                    # skip finalization here.
+                    return node_idx
+
+                # We've reached the end of the graph. Wrap up all the existing stack frames.
+                self.finalize_outputs()
+                return node_idx
+
+            node_module_stack = (
+                [path for path, ty in node.meta["nn_module_stack"].values()]
+                if "nn_module_stack" in node.meta
+                else self.module_stack
+            )
+            if node_module_stack[: len(self.module_stack)] != self.module_stack:
+                # This means that the current module is done executing and the
+                # current node is the beginning of a new module.
+                #
+                # In this case, we should finalize this module and return without
+                # incrementing the node counter.
+                self.finalize_outputs()
+                self.print("outlining", self.fqn)
+                self.print(self.graph)
+                return node_idx
+
+            assert node_module_stack is not None
+
+            if _is_prefix(self.module_stack, node_module_stack):
+                # This means that the current node represents the execution of a new
+                # module.
+                next_module = node_module_stack[len(self.module_stack)]
+                self.print("Creating new stack frame for", next_module)
+                # Run a nested version of module outliner from the current node
+                # counter. Once it is complete, continue from that point.
+                node_idx = _ModuleFrame(
+                    self.flat_graph,
+                    self.nodes,
+                    self.seen_nodes,
+                    self.seen_modules,
+                    self,
+                    self.module_stack + [next_module],
+                    list(node.meta["nn_module_stack"].keys())[len(self.module_stack)],
+                    self.module_call_graph,
+                ).run_from(node_idx)
+                module_idx += 1
+                continue
+
+            # The only remaining possibility is that we are in the right stack
+            # frame. Copy the node into this frame's graph and increment the node counter.
+            assert node_module_stack == self.module_stack
+            self.copy_node(node)
+            node_idx += 1
+
+
+def _outline_submodules(orig_graph: torch.fx.Graph, root_module: UnflattenedModule):
+    seen_nodes: Dict[str, torch.fx.Node] = {}
+    seen_modules: Dict[int, torch.nn.Module] = {}
+    _ModuleFrame(
+        orig_graph,
+        tuple(orig_graph.nodes),
+        seen_nodes,
+        seen_modules,
+        None,
+        [""],
+        "",
+        {
+            entry.fqn: entry.signature
+            for entry in root_module.module_call_graph
+            if entry.signature
+        },
+        module=root_module,
+    ).run_outer()
+
+
+def _sink_params(
+    module: torch.nn.Module,
+    inputs_to_state: Dict[str, str],
+    scope: List[str],
+):
+    """Sink params, buffers, and constants from graph inputs into get_attr nodes.
+
+    Exported modules are purely functional, so they pass their parameters and
+    buffers in as inputs to the graph.
+
+    To replicate eager's semantics, we need to get them from the module state
+    via get_attr instead.
+
+    module: GraphModule, potentially containing nested submodules.
+    inputs_to_state: mapping graph input names to the corresponding key in the state_dict.
+    scope: tracks where we are in the module hierarchy, so that we can emit the
+        right `getattr(self, "foo.bar")` calls, etc.
+    """
+    # We need to use _modules here instead of named_children(), because we
+    # explicitly want duplicate modules to show up in the traversal.
+    for name, submodule in module._modules.items():
+        _sink_params(cast(torch.nn.Module, submodule), inputs_to_state, scope + [name])
+
+    if not hasattr(module, "graph"):
+        # Not all modules have graphs defined; empty modules with no operations (like ParameterList) do not.
+        return
+
+    graph = module.graph
+    inputs = list(filter(lambda n: n.op == "placeholder", graph.nodes))
+    the_last_input = inputs[-1]
+
+    # Also remove the state inputs from the arguments of call_module nodes.
+    call_module_nodes = filter(lambda n: n.op == "call_module", graph.nodes)
+    for node in call_module_nodes:
+        node.args = tuple(filter(lambda n: n.name not in inputs_to_state, node.args))
+
+    for node in inputs:
+        if node.name not in inputs_to_state:
+            continue
+
+        if len(node.users) > 0:
+            state_name = inputs_to_state[node.name].split(".")
+            # If there's a mismatch between the scope name and the state name, then there must be
+            # multiple scopes pointing to the same state name, meaning some modules are shared.
+            # In such a case, we can simply skip updating the current node, because a later
+            # iteration will take care of this input node when the unique match between scope
+            # and state name occurs. To make sure this always happens, we enforce the invariant
+            # that no placeholder node in the unflattened graph appears in the inputs_to_state
+            # dict, which means all the extra input nodes have been handled.
+            if state_name[: len(scope)] != scope:
+                continue
+            attr_path = state_name[len(scope) :]
+            state_attr = _recursive_getattr(module, attr_path)
+            assert isinstance(state_attr, (torch.Tensor, torch.ScriptObject))
+
+            # Make sure the newly created get_attr node is placed after the last placeholder node
+            with graph.inserting_after(the_last_input):
+                new_node = graph.create_node("get_attr", ".".join(attr_path))
+
+            node.replace_all_uses_with(new_node, propagate_meta=True)
+        graph.erase_node(node)
+    if isinstance(module, InterpreterModule):
+        module.finalize()
+
+
+def _recursive_getattr(obj, attr_path):
+    for attr in attr_path:
+        obj = getattr(obj, attr)
+
+    return obj
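+
+# Illustrative only (not in the upstream file):
+#     _recursive_getattr(model, ["layer1", "weight"])
+# is equivalent to getattr(getattr(model, "layer1"), "weight"), i.e. model.layer1.weight.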
diff --git a/MLPY/Lib/site-packages/torch/fft/__init__.py b/MLPY/Lib/site-packages/torch/fft/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6fd07fb38dfcd97bac41c7dce6286c06698ef981
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fft/__init__.py
@@ -0,0 +1,1360 @@
+import sys
+
+import torch
+from torch._C import _add_docstr, _fft  # type: ignore[attr-defined]
+from torch._torch_docs import factory_common_args, common_args
+
+__all__ = ['fft', 'ifft', 'fft2', 'ifft2', 'fftn', 'ifftn',
+           'rfft', 'irfft', 'rfft2', 'irfft2', 'rfftn', 'irfftn',
+           'hfft', 'ihfft', 'fftfreq', 'rfftfreq', 'fftshift', 'ifftshift',
+           'Tensor']
+
+Tensor = torch.Tensor
+
+# Note: This not only adds the docstrings for the spectral ops, but also
+# connects the torch.fft Python namespace to the torch._C._fft builtins.
+
+fft = _add_docstr(_fft.fft_fft, r"""
+fft(input, n=None, dim=-1, norm=None, *, out=None) -> Tensor
+
+Computes the one dimensional discrete Fourier transform of :attr:`input`.
+
+Note:
+    The Fourier domain representation of any real signal satisfies the
+    Hermitian property: `X[i] = conj(X[-i])`. This function always returns both
+    the positive and negative frequency terms even though, for real inputs, the
+    negative frequencies are redundant. :func:`~torch.fft.rfft` returns the
+    more compact one-sided representation where only the positive frequencies
+    are returned.
+
+Note:
+    Supports torch.half and torch.chalf on CUDA with GPU Architecture SM53 or greater.
+    However, it only supports power-of-two signal lengths in every transformed dimension.
+
+Args:
+    input (Tensor): the input tensor
+    n (int, optional): Signal length. If given, the input will either be zero-padded
+        or trimmed to this length before computing the FFT.
+    dim (int, optional): The dimension along which to take the one dimensional FFT.
+    norm (str, optional): Normalization mode. For the forward transform
+        (:func:`~torch.fft.fft`), these correspond to:
+
+        * ``"forward"`` - normalize by ``1/n``
+        * ``"backward"`` - no normalization
+        * ``"ortho"`` - normalize by ``1/sqrt(n)`` (making the FFT orthonormal)
+
+        Calling the backward transform (:func:`~torch.fft.ifft`) with the same
+        normalization mode will apply an overall normalization of ``1/n`` between
+        the two transforms. This is required to make :func:`~torch.fft.ifft`
+        the exact inverse.
+
+        Default is ``"backward"`` (no normalization).
+
+Keyword args:
+    {out}
+
+Example:
+
+    >>> t = torch.arange(4)
+    >>> t
+    tensor([0, 1, 2, 3])
+    >>> torch.fft.fft(t)
+    tensor([ 6.+0.j, -2.+2.j, -2.+0.j, -2.-2.j])
+
+    >>> t = torch.tensor([0.+1.j, 2.+3.j, 4.+5.j, 6.+7.j])
+    >>> torch.fft.fft(t)
+    tensor([12.+16.j, -8.+0.j, -4.-4.j,  0.-8.j])
+""".format(**common_args))
+
+ifft = _add_docstr(_fft.fft_ifft, r"""
+ifft(input, n=None, dim=-1, norm=None, *, out=None) -> Tensor
+
+Computes the one dimensional inverse discrete Fourier transform of :attr:`input`.
+
+Note:
+    Supports torch.half and torch.chalf on CUDA with GPU Architecture SM53 or greater.
+    However, it only supports power-of-two signal lengths in every transformed dimension.
+
+Args:
+    input (Tensor): the input tensor
+    n (int, optional): Signal length. If given, the input will either be zero-padded
+        or trimmed to this length before computing the IFFT.
+    dim (int, optional): The dimension along which to take the one dimensional IFFT.
+    norm (str, optional): Normalization mode. For the backward transform
+        (:func:`~torch.fft.ifft`), these correspond to:
+
+        * ``"forward"`` - no normalization
+        * ``"backward"`` - normalize by ``1/n``
+        * ``"ortho"`` - normalize by ``1/sqrt(n)`` (making the IFFT orthonormal)
+
+        Calling the forward transform (:func:`~torch.fft.fft`) with the same
+        normalization mode will apply an overall normalization of ``1/n`` between
+        the two transforms. This is required to make :func:`~torch.fft.ifft`
+        the exact inverse.
+
+        Default is ``"backward"`` (normalize by ``1/n``).
+
+Keyword args:
+    {out}
+
+Example:
+
+    >>> t = torch.tensor([ 6.+0.j, -2.+2.j, -2.+0.j, -2.-2.j])
+    >>> torch.fft.ifft(t)
+    tensor([0.+0.j, 1.+0.j, 2.+0.j, 3.+0.j])
+""".format(**common_args))
+
+fft2 = _add_docstr(_fft.fft_fft2, r"""
+fft2(input, s=None, dim=(-2, -1), norm=None, *, out=None) -> Tensor
+
+Computes the 2 dimensional discrete Fourier transform of :attr:`input`.
+Equivalent to :func:`~torch.fft.fftn` but FFTs only the last two dimensions by default.
+
+Note:
+    The Fourier domain representation of any real signal satisfies the
+    Hermitian property: ``X[i, j] = conj(X[-i, -j])``. This
+    function always returns all positive and negative frequency terms even
+    though, for real inputs, half of these values are redundant.
+    :func:`~torch.fft.rfft2` returns the more compact one-sided representation
+    where only the positive frequencies of the last dimension are returned.
+
+Note:
+    Supports torch.half and torch.chalf on CUDA with GPU Architecture SM53 or greater.
+    However, it only supports power-of-two signal lengths in every transformed dimension.
+
+Args:
+    input (Tensor): the input tensor
+    s (Tuple[int], optional): Signal size in the transformed dimensions.
+        If given, each dimension ``dim[i]`` will either be zero-padded or
+        trimmed to the length ``s[i]`` before computing the FFT.
+        If a length ``-1`` is specified, no padding is done in that dimension.
+        Default: ``s = [input.size(d) for d in dim]``
+    dim (Tuple[int], optional): Dimensions to be transformed.
+        Default: last two dimensions.
+    norm (str, optional): Normalization mode. For the forward transform
+        (:func:`~torch.fft.fft2`), these correspond to:
+
+        * ``"forward"`` - normalize by ``1/n``
+        * ``"backward"`` - no normalization
+        * ``"ortho"`` - normalize by ``1/sqrt(n)`` (making the FFT orthonormal)
+
+        Where ``n = prod(s)`` is the logical FFT size.
+        Calling the backward transform (:func:`~torch.fft.ifft2`) with the same
+        normalization mode will apply an overall normalization of ``1/n``
+        between the two transforms. This is required to make
+        :func:`~torch.fft.ifft2` the exact inverse.
+
+        Default is ``"backward"`` (no normalization).
+
+Keyword args:
+    {out}
+
+Example:
+
+    >>> x = torch.rand(10, 10, dtype=torch.complex64)
+    >>> fft2 = torch.fft.fft2(x)
+
+    The discrete Fourier transform is separable, so :func:`~torch.fft.fft2`
+    here is equivalent to two one-dimensional :func:`~torch.fft.fft` calls:
+
+    >>> two_ffts = torch.fft.fft(torch.fft.fft(x, dim=0), dim=1)
+    >>> torch.testing.assert_close(fft2, two_ffts, check_stride=False)
+
+""".format(**common_args))
+
+ifft2 = _add_docstr(_fft.fft_ifft2, r"""
+ifft2(input, s=None, dim=(-2, -1), norm=None, *, out=None) -> Tensor
+
+Computes the 2 dimensional inverse discrete Fourier transform of :attr:`input`.
+Equivalent to :func:`~torch.fft.ifftn` but IFFTs only the last two dimensions by default.
+
+Note:
+    Supports torch.half and torch.chalf on CUDA with GPU Architecture SM53 or greater.
+    However, it only supports power-of-two signal lengths in every transformed dimension.
+
+Args:
+    input (Tensor): the input tensor
+    s (Tuple[int], optional): Signal size in the transformed dimensions.
+        If given, each dimension ``dim[i]`` will either be zero-padded or
+        trimmed to the length ``s[i]`` before computing the IFFT.
+        If a length ``-1`` is specified, no padding is done in that dimension.
+        Default: ``s = [input.size(d) for d in dim]``
+    dim (Tuple[int], optional): Dimensions to be transformed.
+        Default: last two dimensions.
+    norm (str, optional): Normalization mode. For the backward transform
+        (:func:`~torch.fft.ifft2`), these correspond to:
+
+        * ``"forward"`` - no normalization
+        * ``"backward"`` - normalize by ``1/n``
+        * ``"ortho"`` - normalize by ``1/sqrt(n)`` (making the IFFT orthonormal)
+
+        Where ``n = prod(s)`` is the logical IFFT size.
+        Calling the forward transform (:func:`~torch.fft.fft2`) with the same
+        normalization mode will apply an overall normalization of ``1/n`` between
+        the two transforms. This is required to make :func:`~torch.fft.ifft2`
+        the exact inverse.
+
+        Default is ``"backward"`` (normalize by ``1/n``).
+
+Keyword args:
+    {out}
+
+Example:
+
+    >>> x = torch.rand(10, 10, dtype=torch.complex64)
+    >>> ifft2 = torch.fft.ifft2(x)
+
+    The discrete Fourier transform is separable, so :func:`~torch.fft.ifft2`
+    here is equivalent to two one-dimensional :func:`~torch.fft.ifft` calls:
+
+    >>> two_iffts = torch.fft.ifft(torch.fft.ifft(x, dim=0), dim=1)
+    >>> torch.testing.assert_close(ifft2, two_iffts, check_stride=False)
+
+""".format(**common_args))
+
+fftn = _add_docstr(_fft.fft_fftn, r"""
+fftn(input, s=None, dim=None, norm=None, *, out=None) -> Tensor
+
+Computes the N dimensional discrete Fourier transform of :attr:`input`.
+
+Note:
+    The Fourier domain representation of any real signal satisfies the
+    Hermitian property: ``X[i_1, ..., i_n] = conj(X[-i_1, ..., -i_n])``. This
+    function always returns all positive and negative frequency terms even
+    though, for real inputs, half of these values are redundant.
+    :func:`~torch.fft.rfftn` returns the more compact one-sided representation
+    where only the positive frequencies of the last dimension are returned.
+
+Note:
+    Supports torch.half and torch.chalf on CUDA with GPU Architecture SM53 or greater.
+    However, it only supports power-of-two signal lengths in every transformed dimension.
+
+Args:
+    input (Tensor): the input tensor
+    s (Tuple[int], optional): Signal size in the transformed dimensions.
+        If given, each dimension ``dim[i]`` will either be zero-padded or
+        trimmed to the length ``s[i]`` before computing the FFT.
+        If a length ``-1`` is specified, no padding is done in that dimension.
+        Default: ``s = [input.size(d) for d in dim]``
+    dim (Tuple[int], optional): Dimensions to be transformed.
+        Default: all dimensions, or the last ``len(s)`` dimensions if :attr:`s` is given.
+    norm (str, optional): Normalization mode. For the forward transform
+        (:func:`~torch.fft.fftn`), these correspond to:
+
+        * ``"forward"`` - normalize by ``1/n``
+        * ``"backward"`` - no normalization
+        * ``"ortho"`` - normalize by ``1/sqrt(n)`` (making the FFT orthonormal)
+
+        Where ``n = prod(s)`` is the logical FFT size.
+        Calling the backward transform (:func:`~torch.fft.ifftn`) with the same
+        normalization mode will apply an overall normalization of ``1/n``
+        between the two transforms. This is required to make
+        :func:`~torch.fft.ifftn` the exact inverse.
+
+        Default is ``"backward"`` (no normalization).
+
+Keyword args:
+    {out}
+
+Example:
+
+    >>> x = torch.rand(10, 10, dtype=torch.complex64)
+    >>> fftn = torch.fft.fftn(x)
+
+    The discrete Fourier transform is separable, so :func:`~torch.fft.fftn`
+    here is equivalent to two one-dimensional :func:`~torch.fft.fft` calls:
+
+    >>> two_ffts = torch.fft.fft(torch.fft.fft(x, dim=0), dim=1)
+    >>> torch.testing.assert_close(fftn, two_ffts, check_stride=False)
+
+""".format(**common_args))
+
+ifftn = _add_docstr(_fft.fft_ifftn, r"""
+ifftn(input, s=None, dim=None, norm=None, *, out=None) -> Tensor
+
+Computes the N dimensional inverse discrete Fourier transform of :attr:`input`.
+
+Note:
+    Supports torch.half and torch.chalf on CUDA with GPU Architecture SM53 or greater.
+    However, it only supports power-of-two signal lengths in every transformed dimension.
+
+Args:
+    input (Tensor): the input tensor
+    s (Tuple[int], optional): Signal size in the transformed dimensions.
+        If given, each dimension ``dim[i]`` will either be zero-padded or
+        trimmed to the length ``s[i]`` before computing the IFFT.
+        If a length ``-1`` is specified, no padding is done in that dimension.
+        Default: ``s = [input.size(d) for d in dim]``
+    dim (Tuple[int], optional): Dimensions to be transformed.
+        Default: all dimensions, or the last ``len(s)`` dimensions if :attr:`s` is given.
+    norm (str, optional): Normalization mode. For the backward transform
+        (:func:`~torch.fft.ifftn`), these correspond to:
+
+        * ``"forward"`` - no normalization
+        * ``"backward"`` - normalize by ``1/n``
+        * ``"ortho"`` - normalize by ``1/sqrt(n)`` (making the IFFT orthonormal)
+
+        Where ``n = prod(s)`` is the logical IFFT size.
+        Calling the forward transform (:func:`~torch.fft.fftn`) with the same
+        normalization mode will apply an overall normalization of ``1/n`` between
+        the two transforms. This is required to make :func:`~torch.fft.ifftn`
+        the exact inverse.
+
+        Default is ``"backward"`` (normalize by ``1/n``).
+
+Keyword args:
+    {out}
+
+Example:
+
+    >>> x = torch.rand(10, 10, dtype=torch.complex64)
+    >>> ifftn = torch.fft.ifftn(x)
+
+    The discrete Fourier transform is separable, so :func:`~torch.fft.ifftn`
+    here is equivalent to two one-dimensional :func:`~torch.fft.ifft` calls:
+
+    >>> two_iffts = torch.fft.ifft(torch.fft.ifft(x, dim=0), dim=1)
+    >>> torch.testing.assert_close(ifftn, two_iffts, check_stride=False)
+
+""".format(**common_args))
+
+rfft = _add_docstr(_fft.fft_rfft, r"""
+rfft(input, n=None, dim=-1, norm=None, *, out=None) -> Tensor
+
+Computes the one dimensional Fourier transform of real-valued :attr:`input`.
+
+The FFT of a real signal is Hermitian-symmetric, ``X[i] = conj(X[-i])``, so
+the output contains only the positive frequencies below the Nyquist frequency.
+To compute the full output, use :func:`~torch.fft.fft`.
+
+Note:
+    Supports torch.half on CUDA with GPU Architecture SM53 or greater.
+    However, it only supports power-of-two signal lengths in every transformed dimension.
+
+Args:
+    input (Tensor): the real input tensor
+    n (int, optional): Signal length. If given, the input will either be zero-padded
+        or trimmed to this length before computing the real FFT.
+    dim (int, optional): The dimension along which to take the one dimensional real FFT.
+    norm (str, optional): Normalization mode. For the forward transform
+        (:func:`~torch.fft.rfft`), these correspond to:
+
+        * ``"forward"`` - normalize by ``1/n``
+        * ``"backward"`` - no normalization
+        * ``"ortho"`` - normalize by ``1/sqrt(n)`` (making the FFT orthonormal)
+
+        Calling the backward transform (:func:`~torch.fft.irfft`) with the same
+        normalization mode will apply an overall normalization of ``1/n`` between
+        the two transforms. This is required to make :func:`~torch.fft.irfft`
+        the exact inverse.
+
+        Default is ``"backward"`` (no normalization).
+
+Keyword args:
+    {out}
+
+Example:
+
+    >>> t = torch.arange(4)
+    >>> t
+    tensor([0, 1, 2, 3])
+    >>> torch.fft.rfft(t)
+    tensor([ 6.+0.j, -2.+2.j, -2.+0.j])
+
+    Compare against the full output from :func:`~torch.fft.fft`:
+
+    >>> torch.fft.fft(t)
+    tensor([ 6.+0.j, -2.+2.j, -2.+0.j, -2.-2.j])
+
+    Notice that the symmetric element ``T[-1] == T[1].conj()`` is omitted.
+    At the Nyquist frequency ``T[-2] == T[2]`` is its own symmetric pair,
+    and therefore must always be real-valued.
+""".format(**common_args))
+
+irfft = _add_docstr(_fft.fft_irfft, r"""
+irfft(input, n=None, dim=-1, norm=None, *, out=None) -> Tensor
+
+Computes the inverse of :func:`~torch.fft.rfft`.
+
+:attr:`input` is interpreted as a one-sided Hermitian signal in the Fourier
+domain, as produced by :func:`~torch.fft.rfft`. By the Hermitian property, the
+output will be real-valued.
+
+Note:
+    Some input frequencies must be real-valued to satisfy the Hermitian
+    property. In these cases the imaginary component will be ignored.
+    For example, any imaginary component in the zero-frequency term cannot
+    be represented in a real output and so will always be ignored.
+
+Note:
+    The correct interpretation of the Hermitian input depends on the length of
+    the original data, as given by :attr:`n`. This is because each input shape
+    could correspond to either an odd or even length signal. By default, the
+    signal is assumed to be even length and odd signals will not round-trip
+    properly. So, it is recommended to always pass the signal length :attr:`n`.
+
+Note:
+    Supports torch.half and torch.chalf on CUDA with GPU Architecture SM53 or greater.
+    However, it only supports power-of-two signal lengths in every transformed dimension.
+    With default arguments, the size of the transformed dimension should be (2^n + 1), since the
+    argument `n` defaults to the even output size 2 * (transformed_dim_size - 1).
+
+Args:
+    input (Tensor): the input tensor representing a half-Hermitian signal
+    n (int, optional): Output signal length. This determines the length of the
+        output signal. If given, the input will either be zero-padded or trimmed to this
+        length before computing the real IFFT.
+        Defaults to even output: ``n=2*(input.size(dim) - 1)``.
+    dim (int, optional): The dimension along which to take the one dimensional real IFFT.
+    norm (str, optional): Normalization mode. For the backward transform
+        (:func:`~torch.fft.irfft`), these correspond to:
+
+        * ``"forward"`` - no normalization
+        * ``"backward"`` - normalize by ``1/n``
+        * ``"ortho"`` - normalize by ``1/sqrt(n)`` (making the real IFFT orthonormal)
+
+        Calling the forward transform (:func:`~torch.fft.rfft`) with the same
+        normalization mode will apply an overall normalization of ``1/n`` between
+        the two transforms. This is required to make :func:`~torch.fft.irfft`
+        the exact inverse.
+
+        Default is ``"backward"`` (normalize by ``1/n``).
+
+Keyword args:
+    {out}
+
+Example:
+
+    >>> t = torch.linspace(0, 1, 5)
+    >>> t
+    tensor([0.0000, 0.2500, 0.5000, 0.7500, 1.0000])
+    >>> T = torch.fft.rfft(t)
+    >>> T
+    tensor([ 2.5000+0.0000j, -0.6250+0.8602j, -0.6250+0.2031j])
+
+    Without specifying the output length to :func:`~torch.fft.irfft`, the output
+    will not round-trip properly because the input is odd-length:
+
+    >>> torch.fft.irfft(T)
+    tensor([0.1562, 0.3511, 0.7812, 1.2114])
+
+    So, it is recommended to always pass the signal length :attr:`n`:
+
+    >>> roundtrip = torch.fft.irfft(T, t.numel())
+    >>> torch.testing.assert_close(roundtrip, t, check_stride=False)
+
+""".format(**common_args))
+
+rfft2 = _add_docstr(_fft.fft_rfft2, r"""
+rfft2(input, s=None, dim=(-2, -1), norm=None, *, out=None) -> Tensor
+
+Computes the 2-dimensional discrete Fourier transform of real :attr:`input`.
+Equivalent to :func:`~torch.fft.rfftn` but FFTs only the last two dimensions by default.
+
+The FFT of a real signal is Hermitian-symmetric, ``X[i, j] = conj(X[-i, -j])``,
+so the full :func:`~torch.fft.fft2` output contains redundant information.
+:func:`~torch.fft.rfft2` instead omits the negative frequencies in the last
+dimension.
+
+Note:
+    Supports torch.half on CUDA with GPU Architecture SM53 or greater.
+    However, it only supports power-of-two signal lengths in every transformed dimension.
+
+Args:
+    input (Tensor): the input tensor
+    s (Tuple[int], optional): Signal size in the transformed dimensions.
+        If given, each dimension ``dim[i]`` will either be zero-padded or
+        trimmed to the length ``s[i]`` before computing the real FFT.
+        If a length ``-1`` is specified, no padding is done in that dimension.
+        Default: ``s = [input.size(d) for d in dim]``
+    dim (Tuple[int], optional): Dimensions to be transformed.
+        Default: last two dimensions.
+    norm (str, optional): Normalization mode. For the forward transform
+        (:func:`~torch.fft.rfft2`), these correspond to:
+
+        * ``"forward"`` - normalize by ``1/n``
+        * ``"backward"`` - no normalization
+        * ``"ortho"`` - normalize by ``1/sqrt(n)`` (making the real FFT orthonormal)
+
+        Where ``n = prod(s)`` is the logical FFT size.
+        Calling the backward transform (:func:`~torch.fft.irfft2`) with the same
+        normalization mode will apply an overall normalization of ``1/n`` between
+        the two transforms. This is required to make :func:`~torch.fft.irfft2`
+        the exact inverse.
+
+        Default is ``"backward"`` (no normalization).
+
+Keyword args:
+    {out}
+
+Example:
+
+    >>> t = torch.rand(10, 10)
+    >>> rfft2 = torch.fft.rfft2(t)
+    >>> rfft2.size()
+    torch.Size([10, 6])
+
+    Compared against the full output from :func:`~torch.fft.fft2`, we have all
+    elements up to the Nyquist frequency.
+
+    >>> fft2 = torch.fft.fft2(t)
+    >>> torch.testing.assert_close(fft2[..., :6], rfft2, check_stride=False)
+
+    The discrete Fourier transform is separable, so :func:`~torch.fft.rfft2`
+    here is equivalent to a combination of :func:`~torch.fft.fft` and
+    :func:`~torch.fft.rfft`:
+
+    >>> two_ffts = torch.fft.fft(torch.fft.rfft(t, dim=1), dim=0)
+    >>> torch.testing.assert_close(rfft2, two_ffts, check_stride=False)
+
+""".format(**common_args))
+
+irfft2 = _add_docstr(_fft.fft_irfft2, r"""
+irfft2(input, s=None, dim=(-2, -1), norm=None, *, out=None) -> Tensor
+
+Computes the inverse of :func:`~torch.fft.rfft2`.
+Equivalent to :func:`~torch.fft.irfftn` but IFFTs only the last two dimensions by default.
+
+:attr:`input` is interpreted as a one-sided Hermitian signal in the Fourier
+domain, as produced by :func:`~torch.fft.rfft2`. By the Hermitian property, the
+output will be real-valued.
+
+Note:
+    Some input frequencies must be real-valued to satisfy the Hermitian
+    property. In these cases the imaginary component will be ignored.
+    For example, any imaginary component in the zero-frequency term cannot
+    be represented in a real output and so will always be ignored.
+
+Note:
+    The correct interpretation of the Hermitian input depends on the length of
+    the original data, as given by :attr:`s`. This is because each input shape
+    could correspond to either an odd or even length signal. By default, the
+    signal is assumed to be even length and odd signals will not round-trip
+    properly. So, it is recommended to always pass the signal shape :attr:`s`.
+
+Note:
+    Supports torch.half and torch.chalf on CUDA with GPU Architecture SM53 or greater.
+    However, it only supports power-of-two signal lengths in every transformed dimension.
+    With default arguments, the size of the last dimension should be (2^n + 1), since the
+    argument `s` defaults to the even output size 2 * (last_dim_size - 1).
+
+Args:
+    input (Tensor): the input tensor
+    s (Tuple[int], optional): Signal size in the transformed dimensions.
+        If given, each dimension ``dim[i]`` will either be zero-padded or
+        trimmed to the length ``s[i]`` before computing the real FFT.
+        If a length ``-1`` is specified, no padding is done in that dimension.
+        Defaults to even output in the last dimension:
+        ``s[-1] = 2*(input.size(dim[-1]) - 1)``.
+    dim (Tuple[int], optional): Dimensions to be transformed.
+        The last dimension must be the half-Hermitian compressed dimension.
+        Default: last two dimensions.
+    norm (str, optional): Normalization mode. For the backward transform
+        (:func:`~torch.fft.irfft2`), these correspond to:
+
+        * ``"forward"`` - no normalization
+        * ``"backward"`` - normalize by ``1/n``
+        * ``"ortho"`` - normalize by ``1/sqrt(n)`` (making the real IFFT orthonormal)
+
+        Where ``n = prod(s)`` is the logical IFFT size.
+        Calling the forward transform (:func:`~torch.fft.rfft2`) with the same
+        normalization mode will apply an overall normalization of ``1/n`` between
+        the two transforms. This is required to make :func:`~torch.fft.irfft2`
+        the exact inverse.
+
+        Default is ``"backward"`` (normalize by ``1/n``).
+
+Keyword args:
+    {out}
+
+Example:
+
+    >>> t = torch.rand(10, 9)
+    >>> T = torch.fft.rfft2(t)
+
+    Without specifying the output length to :func:`~torch.fft.irfft2`, the output
+    will not round-trip properly because the input is odd-length in the last
+    dimension:
+
+    >>> torch.fft.irfft2(T).size()
+    torch.Size([10, 8])
+
+    So, it is recommended to always pass the signal shape :attr:`s`.
+
+    >>> roundtrip = torch.fft.irfft2(T, t.size())
+    >>> roundtrip.size()
+    torch.Size([10, 9])
+    >>> torch.testing.assert_close(roundtrip, t, check_stride=False)
+
+""".format(**common_args))
+
+rfftn = _add_docstr(_fft.fft_rfftn, r"""
+rfftn(input, s=None, dim=None, norm=None, *, out=None) -> Tensor
+
+Computes the N-dimensional discrete Fourier transform of real :attr:`input`.
+
+The FFT of a real signal is Hermitian-symmetric,
+``X[i_1, ..., i_n] = conj(X[-i_1, ..., -i_n])`` so the full
+:func:`~torch.fft.fftn` output contains redundant information.
+:func:`~torch.fft.rfftn` instead omits the negative frequencies in the
+last dimension.
+
+Note:
+    Supports torch.half on CUDA with GPU Architecture SM53 or greater.
+    However, it only supports power-of-two signal lengths in every transformed dimension.
+
+Args:
+    input (Tensor): the input tensor
+    s (Tuple[int], optional): Signal size in the transformed dimensions.
+        If given, each dimension ``dim[i]`` will either be zero-padded or
+        trimmed to the length ``s[i]`` before computing the real FFT.
+        If a length ``-1`` is specified, no padding is done in that dimension.
+        Default: ``s = [input.size(d) for d in dim]``
+    dim (Tuple[int], optional): Dimensions to be transformed.
+        Default: all dimensions, or the last ``len(s)`` dimensions if :attr:`s` is given.
+    norm (str, optional): Normalization mode. For the forward transform
+        (:func:`~torch.fft.rfftn`), these correspond to:
+
+        * ``"forward"`` - normalize by ``1/n``
+        * ``"backward"`` - no normalization
+        * ``"ortho"`` - normalize by ``1/sqrt(n)`` (making the real FFT orthonormal)
+
+        Where ``n = prod(s)`` is the logical FFT size.
+        Calling the backward transform (:func:`~torch.fft.irfftn`) with the same
+        normalization mode will apply an overall normalization of ``1/n`` between
+        the two transforms. This is required to make :func:`~torch.fft.irfftn`
+        the exact inverse.
+
+        Default is ``"backward"`` (no normalization).
+
+Keyword args:
+    {out}
+
+Example:
+
+    >>> t = torch.rand(10, 10)
+    >>> rfftn = torch.fft.rfftn(t)
+    >>> rfftn.size()
+    torch.Size([10, 6])
+
+    Compared against the full output from :func:`~torch.fft.fftn`, we have all
+    elements up to the Nyquist frequency.
+
+    >>> fftn = torch.fft.fftn(t)
+    >>> torch.testing.assert_close(fftn[..., :6], rfftn, check_stride=False)
+
+    The discrete Fourier transform is separable, so :func:`~torch.fft.rfftn`
+    here is equivalent to a combination of :func:`~torch.fft.fft` and
+    :func:`~torch.fft.rfft`:
+
+    >>> two_ffts = torch.fft.fft(torch.fft.rfft(t, dim=1), dim=0)
+    >>> torch.testing.assert_close(rfftn, two_ffts, check_stride=False)
+
+""".format(**common_args))
+
+irfftn = _add_docstr(_fft.fft_irfftn, r"""
+irfftn(input, s=None, dim=None, norm=None, *, out=None) -> Tensor
+
+Computes the inverse of :func:`~torch.fft.rfftn`.
+
+:attr:`input` is interpreted as a one-sided Hermitian signal in the Fourier
+domain, as produced by :func:`~torch.fft.rfftn`. By the Hermitian property, the
+output will be real-valued.
+
+Note:
+    Some input frequencies must be real-valued to satisfy the Hermitian
+    property. In these cases the imaginary component will be ignored.
+    For example, any imaginary component in the zero-frequency term cannot
+    be represented in a real output and so will always be ignored.
+
+Note:
+    The correct interpretation of the Hermitian input depends on the length of
+    the original data, as given by :attr:`s`. This is because each input shape
+    could correspond to either an odd or even length signal. By default, the
+    signal is assumed to be even length and odd signals will not round-trip
+    properly. So, it is recommended to always pass the signal shape :attr:`s`.
+
+Note:
+    Supports torch.half and torch.chalf on CUDA with GPU Architecture SM53 or greater.
+    However, it only supports power-of-two signal lengths in every transformed dimension.
+    With default arguments, the size of the last dimension should be (2^n + 1), since the
+    argument `s` defaults to the even output size 2 * (last_dim_size - 1).
+
+Args:
+    input (Tensor): the input tensor
+    s (Tuple[int], optional): Signal size in the transformed dimensions.
+        If given, each dimension ``dim[i]`` will either be zero-padded or
+        trimmed to the length ``s[i]`` before computing the real FFT.
+        If a length ``-1`` is specified, no padding is done in that dimension.
+        Defaults to even output in the last dimension:
+        ``s[-1] = 2*(input.size(dim[-1]) - 1)``.
+    dim (Tuple[int], optional): Dimensions to be transformed.
+        The last dimension must be the half-Hermitian compressed dimension.
+        Default: all dimensions, or the last ``len(s)`` dimensions if :attr:`s` is given.
+    norm (str, optional): Normalization mode. For the backward transform
+        (:func:`~torch.fft.irfftn`), these correspond to:
+
+        * ``"forward"`` - no normalization
+        * ``"backward"`` - normalize by ``1/n``
+        * ``"ortho"`` - normalize by ``1/sqrt(n)`` (making the real IFFT orthonormal)
+
+        Where ``n = prod(s)`` is the logical IFFT size.
+        Calling the forward transform (:func:`~torch.fft.rfftn`) with the same
+        normalization mode will apply an overall normalization of ``1/n`` between
+        the two transforms. This is required to make :func:`~torch.fft.irfftn`
+        the exact inverse.
+
+        Default is ``"backward"`` (normalize by ``1/n``).
+
+Keyword args:
+    {out}
+
+Example:
+
+    >>> t = torch.rand(10, 9)
+    >>> T = torch.fft.rfftn(t)
+
+    Without specifying the output length to :func:`~torch.fft.irfftn`, the output
+    will not round-trip properly because the input is odd-length in the last
+    dimension:
+
+    >>> torch.fft.irfftn(T).size()
+    torch.Size([10, 8])
+
+    So, it is recommended to always pass the signal shape :attr:`s`.
+
+    >>> roundtrip = torch.fft.irfftn(T, t.size())
+    >>> roundtrip.size()
+    torch.Size([10, 9])
+    >>> torch.testing.assert_close(roundtrip, t, check_stride=False)
+
+""".format(**common_args))
+
+hfft = _add_docstr(_fft.fft_hfft, r"""
+hfft(input, n=None, dim=-1, norm=None, *, out=None) -> Tensor
+
+Computes the one dimensional discrete Fourier transform of a Hermitian
+symmetric :attr:`input` signal.
+
+Note:
+
+    :func:`~torch.fft.hfft`/:func:`~torch.fft.ihfft` are analogous to
+    :func:`~torch.fft.rfft`/:func:`~torch.fft.irfft`. The real FFT expects
+    a real signal in the time-domain and gives a Hermitian symmetry in the
+    frequency-domain. The Hermitian FFT is the opposite; Hermitian symmetric in
+    the time-domain and real-valued in the frequency-domain. For this reason,
+    special care needs to be taken with the length argument :attr:`n`, in the
+    same way as with :func:`~torch.fft.irfft`.
+
+Note:
+    Because the signal is Hermitian in the time-domain, the result will be
+    real in the frequency domain. Note that some input frequencies must be
+    real-valued to satisfy the Hermitian property. In these cases the imaginary
+    component will be ignored. For example, any imaginary component in
+    ``input[0]`` would result in one or more complex frequency terms which
+    cannot be represented in a real output and so will always be ignored.
+
+Note:
+    The correct interpretation of the Hermitian input depends on the length of
+    the original data, as given by :attr:`n`. This is because each input shape
+    could correspond to either an odd or even length signal. By default, the
+    signal is assumed to be even length and odd signals will not round-trip
+    properly. So, it is recommended to always pass the signal length :attr:`n`.
+
+Note:
+    Supports torch.half and torch.chalf on CUDA with GPU Architecture SM53 or greater.
+    However, it only supports power-of-two signal lengths in every transformed dimension.
+    With default arguments, the size of the transformed dimension should be (2^n + 1), since the
+    argument `n` defaults to the even output size 2 * (transformed_dim_size - 1).
+
+Args:
+    input (Tensor): the input tensor representing a half-Hermitian signal
+    n (int, optional): Output signal length. This determines the length of the
+        real output. If given, the input will either be zero-padded or trimmed to this
+        length before computing the Hermitian FFT.
+        Defaults to even output: ``n=2*(input.size(dim) - 1)``.
+    dim (int, optional): The dimension along which to take the one dimensional Hermitian FFT.
+    norm (str, optional): Normalization mode. For the forward transform
+        (:func:`~torch.fft.hfft`), these correspond to:
+
+        * ``"forward"`` - normalize by ``1/n``
+        * ``"backward"`` - no normalization
+        * ``"ortho"`` - normalize by ``1/sqrt(n)`` (making the Hermitian FFT orthonormal)
+
+        Calling the backward transform (:func:`~torch.fft.ihfft`) with the same
+        normalization mode will apply an overall normalization of ``1/n`` between
+        the two transforms. This is required to make :func:`~torch.fft.ihfft`
+        the exact inverse.
+
+        Default is ``"backward"`` (no normalization).
+
+Keyword args:
+    {out}
+
+Example:
+
+    Taking a real-valued frequency signal and bringing it into the time domain
+    gives Hermitian symmetric output:
+
+    >>> t = torch.linspace(0, 1, 5)
+    >>> t
+    tensor([0.0000, 0.2500, 0.5000, 0.7500, 1.0000])
+    >>> T = torch.fft.ifft(t)
+    >>> T
+    tensor([ 0.5000-0.0000j, -0.1250-0.1720j, -0.1250-0.0406j, -0.1250+0.0406j,
+            -0.1250+0.1720j])
+
+    Note that ``T[1] == T[-1].conj()`` and ``T[2] == T[-2].conj()``, so the
+    negative frequency terms are redundant. We can thus compute the forward
+    transform without considering negative frequencies:
+
+    >>> torch.fft.hfft(T[:3], n=5)
+    tensor([0.0000, 0.2500, 0.5000, 0.7500, 1.0000])
+
+    Like with :func:`~torch.fft.irfft`, the output defaults to an even length, so
+    without passing :attr:`n` the odd-length signal does not round-trip:
+
+    >>> torch.fft.hfft(T[:3])
+    tensor([0.1250, 0.2809, 0.6250, 0.9691])
+""".format(**common_args))
+
+ihfft = _add_docstr(_fft.fft_ihfft, r"""
+ihfft(input, n=None, dim=-1, norm=None, *, out=None) -> Tensor
+
+Computes the inverse of :func:`~torch.fft.hfft`.
+
+:attr:`input` must be a real-valued signal, interpreted in the Fourier domain.
+The IFFT of a real signal is Hermitian-symmetric, ``X[i] = conj(X[-i])``.
+:func:`~torch.fft.ihfft` represents this in the one-sided form where only the
+positive frequencies below the Nyquist frequency are included. To compute the
+full output, use :func:`~torch.fft.ifft`.
+
+Note:
+    Supports torch.half on CUDA with GPU Architecture SM53 or greater.
+    However it only supports powers of 2 signal length in every transformed dimension.
+
+Args:
+    input (Tensor): the real input tensor
+    n (int, optional): Signal length. If given, the input will either be zero-padded
+        or trimmed to this length before computing the Hermitian IFFT.
+    dim (int, optional): The dimension along which to take the one dimensional Hermitian IFFT.
+    norm (str, optional): Normalization mode. For the backward transform
+        (:func:`~torch.fft.ihfft`), these correspond to:
+
+        * ``"forward"`` - no normalization
+        * ``"backward"`` - normalize by ``1/n``
+        * ``"ortho"`` - normalize by ``1/sqrt(n)`` (making the IFFT orthonormal)
+
+        Calling the forward transform (:func:`~torch.fft.hfft`) with the same
+        normalization mode will apply an overall normalization of ``1/n`` between
+        the two transforms. This is required to make :func:`~torch.fft.ihfft`
+        the exact inverse.
+
+        Default is ``"backward"`` (normalize by ``1/n``).
+
+Keyword args:
+    {out}
+
+Example:
+
+    >>> t = torch.arange(5)
+    >>> t
+    tensor([0, 1, 2, 3, 4])
+    >>> torch.fft.ihfft(t)
+    tensor([ 2.0000-0.0000j, -0.5000-0.6882j, -0.5000-0.1625j])
+
+    Compare against the full output from :func:`~torch.fft.ifft`:
+
+    >>> torch.fft.ifft(t)
+    tensor([ 2.0000-0.0000j, -0.5000-0.6882j, -0.5000-0.1625j, -0.5000+0.1625j,
+            -0.5000+0.6882j])
+""".format(**common_args))
+
+hfft2 = _add_docstr(_fft.fft_hfft2, r"""
+hfft2(input, s=None, dim=(-2, -1), norm=None, *, out=None) -> Tensor
+
+Computes the 2-dimensional discrete Fourier transform of a Hermitian symmetric
+:attr:`input` signal. Equivalent to :func:`~torch.fft.hfftn` but only
+transforms the last two dimensions by default.
+
+:attr:`input` is interpreted as a one-sided Hermitian signal in the time
+domain. By the Hermitian property, the Fourier transform will be real-valued.
+
+Note:
+    Supports torch.half and torch.chalf on CUDA with GPU Architecture SM53 or greater.
+    However it only supports powers of 2 signal length in every transformed dimension.
+    With default arguments, the size of the last dimension should be (2^n + 1), since the
+    argument `s` defaults to an even output size of 2 * (last_dim_size - 1).
+
+Args:
+    input (Tensor): the input tensor
+    s (Tuple[int], optional): Signal size in the transformed dimensions.
+        If given, each dimension ``dim[i]`` will either be zero-padded or
+        trimmed to the length ``s[i]`` before computing the Hermitian FFT.
+        If a length ``-1`` is specified, no padding is done in that dimension.
+        Defaults to even output in the last dimension:
+        ``s[-1] = 2*(input.size(dim[-1]) - 1)``.
+    dim (Tuple[int], optional): Dimensions to be transformed.
+        The last dimension must be the half-Hermitian compressed dimension.
+        Default: last two dimensions.
+    norm (str, optional): Normalization mode. For the forward transform
+        (:func:`~torch.fft.hfft2`), these correspond to:
+
+        * ``"forward"`` - normalize by ``1/n``
+        * ``"backward"`` - no normalization
+        * ``"ortho"`` - normalize by ``1/sqrt(n)`` (making the Hermitian FFT orthonormal)
+
+        Where ``n = prod(s)`` is the logical FFT size.
+        Calling the backward transform (:func:`~torch.fft.ihfft2`) with the same
+        normalization mode will apply an overall normalization of ``1/n`` between
+        the two transforms. This is required to make :func:`~torch.fft.ihfft2`
+        the exact inverse.
+
+        Default is ``"backward"`` (no normalization).
+
+Keyword args:
+    {out}
+
+Example:
+
+    Starting from a real frequency-space signal, we can generate a
+    Hermitian-symmetric time-domain signal:
+
+    >>> T = torch.rand(10, 9)
+    >>> t = torch.fft.ihfft2(T)
+
+    Without specifying the output length to :func:`~torch.fft.hfft2`, the
+    output will not round-trip properly because the input is odd-length in the
+    last dimension:
+
+    >>> torch.fft.hfft2(t).size()
+    torch.Size([10, 8])
+
+    So, it is recommended to always pass the signal shape :attr:`s`.
+
+    >>> roundtrip = torch.fft.hfft2(t, T.size())
+    >>> roundtrip.size()
+    torch.Size([10, 9])
+    >>> torch.allclose(roundtrip, T)
+    True
+
+""".format(**common_args))
+
+ihfft2 = _add_docstr(_fft.fft_ihfft2, r"""
+ihfft2(input, s=None, dim=(-2, -1), norm=None, *, out=None) -> Tensor
+
+Computes the 2-dimensional inverse discrete Fourier transform of real
+:attr:`input`. Equivalent to :func:`~torch.fft.ihfftn` but transforms only the
+two last dimensions by default.
+
+Note:
+    Supports torch.half on CUDA with GPU Architecture SM53 or greater.
+    However it only supports powers of 2 signal length in every transformed dimension.
+
+Args:
+    input (Tensor): the input tensor
+    s (Tuple[int], optional): Signal size in the transformed dimensions.
+        If given, each dimension ``dim[i]`` will either be zero-padded or
+        trimmed to the length ``s[i]`` before computing the Hermitian IFFT.
+        If a length ``-1`` is specified, no padding is done in that dimension.
+        Default: ``s = [input.size(d) for d in dim]``
+    dim (Tuple[int], optional): Dimensions to be transformed.
+        Default: last two dimensions.
+    norm (str, optional): Normalization mode. For the backward transform
+        (:func:`~torch.fft.ihfft2`), these correspond to:
+
+        * ``"forward"`` - no normalization
+        * ``"backward"`` - normalize by ``1/n``
+        * ``"ortho"`` - normalize by ``1/sqrt(n)`` (making the Hermitian IFFT orthonormal)
+
+        Where ``n = prod(s)`` is the logical IFFT size.
+        Calling the forward transform (:func:`~torch.fft.hfft2`) with the same
+        normalization mode will apply an overall normalization of ``1/n`` between
+        the two transforms. This is required to make :func:`~torch.fft.ihfft2`
+        the exact inverse.
+
+        Default is ``"backward"`` (normalize by ``1/n``).
+
+Keyword args:
+    {out}
+
+Example:
+
+    >>> T = torch.rand(10, 10)
+    >>> t = torch.fft.ihfft2(T)
+    >>> t.size()
+    torch.Size([10, 6])
+
+    Compared against the full output from :func:`~torch.fft.ifft2`, the
+    Hermitian time-space signal takes up only half the space.
+
+    >>> fftn = torch.fft.ifft2(T)
+    >>> torch.allclose(fftn[..., :6], t)
+    True
+
+    The discrete Fourier transform is separable, so :func:`~torch.fft.ihfft2`
+    here is equivalent to a combination of :func:`~torch.fft.ifft` and
+    :func:`~torch.fft.ihfft`:
+
+    >>> two_ffts = torch.fft.ifft(torch.fft.ihfft(T, dim=1), dim=0)
+    >>> torch.allclose(t, two_ffts)
+    True
+
+""".format(**common_args))
+
+hfftn = _add_docstr(_fft.fft_hfftn, r"""
+hfftn(input, s=None, dim=None, norm=None, *, out=None) -> Tensor
+
+Computes the n-dimensional discrete Fourier transform of a Hermitian symmetric
+:attr:`input` signal.
+
+:attr:`input` is interpreted as a one-sided Hermitian signal in the time
+domain. By the Hermitian property, the Fourier transform will be real-valued.
+
+Note:
+    :func:`~torch.fft.hfftn`/:func:`~torch.fft.ihfftn` are analogous to
+    :func:`~torch.fft.rfftn`/:func:`~torch.fft.irfftn`. The real FFT expects
+    a real signal in the time-domain and gives Hermitian symmetry in the
+    frequency-domain. The Hermitian FFT is the opposite; Hermitian symmetric in
+    the time-domain and real-valued in the frequency-domain. For this reason,
+    special care needs to be taken with the shape argument :attr:`s`, in the
+    same way as with :func:`~torch.fft.irfftn`.
+
+Note:
+    Some input frequencies must be real-valued to satisfy the Hermitian
+    property. In these cases the imaginary component will be ignored.
+    For example, any imaginary component in the zero-frequency term cannot
+    be represented in a real output and so will always be ignored.
+
+Note:
+    The correct interpretation of the Hermitian input depends on the length of
+    the original data, as given by :attr:`s`. This is because each input shape
+    could correspond to either an odd or even length signal. By default, the
+    signal is assumed to be even length and odd signals will not round-trip
+    properly. It is recommended to always pass the signal shape :attr:`s`.
+
+Note:
+    Supports torch.half and torch.chalf on CUDA with GPU Architecture SM53 or greater.
+    However it only supports powers of 2 signal length in every transformed dimension.
+    With default arguments, the size of the last dimension should be (2^n + 1), since the
+    argument `s` defaults to an even output size of 2 * (last_dim_size - 1).
+
+Args:
+    input (Tensor): the input tensor
+    s (Tuple[int], optional): Signal size in the transformed dimensions.
+        If given, each dimension ``dim[i]`` will either be zero-padded or
+        trimmed to the length ``s[i]`` before computing the real FFT.
+        If a length ``-1`` is specified, no padding is done in that dimension.
+        Defaults to even output in the last dimension:
+        ``s[-1] = 2*(input.size(dim[-1]) - 1)``.
+    dim (Tuple[int], optional): Dimensions to be transformed.
+        The last dimension must be the half-Hermitian compressed dimension.
+        Default: all dimensions, or the last ``len(s)`` dimensions if :attr:`s` is given.
+    norm (str, optional): Normalization mode. For the forward transform
+        (:func:`~torch.fft.hfftn`), these correspond to:
+
+        * ``"forward"`` - normalize by ``1/n``
+        * ``"backward"`` - no normalization
+        * ``"ortho"`` - normalize by ``1/sqrt(n)`` (making the Hermitian FFT orthonormal)
+
+        Where ``n = prod(s)`` is the logical FFT size.
+        Calling the backward transform (:func:`~torch.fft.ihfftn`) with the same
+        normalization mode will apply an overall normalization of ``1/n`` between
+        the two transforms. This is required to make :func:`~torch.fft.ihfftn`
+        the exact inverse.
+
+        Default is ``"backward"`` (no normalization).
+
+Keyword args:
+    {out}
+
+Example:
+
+    Starting from a real frequency-space signal, we can generate a
+    Hermitian-symmetric time-domain signal:
+
+    >>> T = torch.rand(10, 9)
+    >>> t = torch.fft.ihfftn(T)
+
+    Without specifying the output length to :func:`~torch.fft.hfftn`, the
+    output will not round-trip properly because the input is odd-length in the
+    last dimension:
+
+    >>> torch.fft.hfftn(t).size()
+    torch.Size([10, 8])
+
+    So, it is recommended to always pass the signal shape :attr:`s`.
+
+    >>> roundtrip = torch.fft.hfftn(t, T.size())
+    >>> roundtrip.size()
+    torch.Size([10, 9])
+    >>> torch.allclose(roundtrip, T)
+    True
+
+""".format(**common_args))
+
+ihfftn = _add_docstr(_fft.fft_ihfftn, r"""
+ihfftn(input, s=None, dim=None, norm=None, *, out=None) -> Tensor
+
+Computes the N-dimensional inverse discrete Fourier transform of real :attr:`input`.
+
+:attr:`input` must be a real-valued signal, interpreted in the Fourier domain.
+The n-dimensional IFFT of a real signal is Hermitian-symmetric,
+``X[i, j, ...] = conj(X[-i, -j, ...])``. :func:`~torch.fft.ihfftn` represents
+this in the one-sided form where only the positive frequencies below the
+Nyquist frequency are included in the last signal dimension. To compute the
+full output, use :func:`~torch.fft.ifftn`.
+
+Note:
+    Supports torch.half on CUDA with GPU Architecture SM53 or greater.
+    However it only supports powers of 2 signal length in every transformed dimension.
+
+Args:
+    input (Tensor): the input tensor
+    s (Tuple[int], optional): Signal size in the transformed dimensions.
+        If given, each dimension ``dim[i]`` will either be zero-padded or
+        trimmed to the length ``s[i]`` before computing the Hermitian IFFT.
+        If a length ``-1`` is specified, no padding is done in that dimension.
+        Default: ``s = [input.size(d) for d in dim]``
+    dim (Tuple[int], optional): Dimensions to be transformed.
+        Default: all dimensions, or the last ``len(s)`` dimensions if :attr:`s` is given.
+    norm (str, optional): Normalization mode. For the backward transform
+        (:func:`~torch.fft.ihfftn`), these correspond to:
+
+        * ``"forward"`` - no normalization
+        * ``"backward"`` - normalize by ``1/n``
+        * ``"ortho"`` - normalize by ``1/sqrt(n)`` (making the Hermitian IFFT orthonormal)
+
+        Where ``n = prod(s)`` is the logical IFFT size.
+        Calling the forward transform (:func:`~torch.fft.hfftn`) with the same
+        normalization mode will apply an overall normalization of ``1/n`` between
+        the two transforms. This is required to make :func:`~torch.fft.ihfftn`
+        the exact inverse.
+
+        Default is ``"backward"`` (normalize by ``1/n``).
+
+Keyword args:
+    {out}
+
+Example:
+
+    >>> T = torch.rand(10, 10)
+    >>> ihfftn = torch.fft.ihfftn(T)
+    >>> ihfftn.size()
+    torch.Size([10, 6])
+
+    Compared against the full output from :func:`~torch.fft.ifftn`, we have all
+    elements up to the Nyquist frequency.
+
+    >>> ifftn = torch.fft.ifftn(T)
+    >>> torch.allclose(ifftn[..., :6], ihfftn)
+    True
+
+    The discrete Fourier transform is separable, so :func:`~torch.fft.ihfftn`
+    here is equivalent to a combination of :func:`~torch.fft.ihfft` and
+    :func:`~torch.fft.ifft`:
+
+    >>> two_iffts = torch.fft.ifft(torch.fft.ihfft(T, dim=1), dim=0)
+    >>> torch.allclose(ihfftn, two_iffts)
+    True
+
+""".format(**common_args))
+
+fftfreq = _add_docstr(_fft.fft_fftfreq, r"""
+fftfreq(n, d=1.0, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+
+Computes the discrete Fourier Transform sample frequencies for a signal of size :attr:`n`.
+
+Note:
+    By convention, :func:`~torch.fft.fft` returns positive frequency terms
+    first, followed by the negative frequencies in reverse order, so that
+    ``f[-i]`` for all :math:`0 < i \leq n/2` in Python gives the negative
+    frequency terms. For an FFT of length :attr:`n` and with inputs spaced in
+    length unit :attr:`d`, the frequencies are::
+
+        f = [0, 1, ..., (n - 1) // 2, -(n // 2), ..., -1] / (d * n)
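+
+    For instance, a quick illustrative check of this formula for ``n=4, d=0.5``:
+
+    >>> n, d = 4, 0.5
+    >>> manual = torch.tensor([0., 1., -2., -1.]) / (d * n)
+    >>> torch.allclose(torch.fft.fftfreq(n, d), manual)
+    True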
+
+Note:
+    For even lengths, the Nyquist frequency at ``f[n/2]`` can be thought of as
+    either negative or positive. :func:`~torch.fft.fftfreq` follows NumPy's
+    convention of taking it to be negative.
+
+Args:
+    n (int): the FFT length
+    d (float, optional): The sampling length scale.
+        The spacing between individual samples of the FFT input.
+        The default assumes unit spacing; dividing the result by the actual
+        spacing gives the result in physical frequency units.
+
+Keyword Args:
+    {out}
+    {dtype}
+    {layout}
+    {device}
+    {requires_grad}
+
+Example:
+
+    >>> torch.fft.fftfreq(5)
+    tensor([ 0.0000,  0.2000,  0.4000, -0.4000, -0.2000])
+
+    For even input, we can see the Nyquist frequency at ``f[2]`` is given as
+    negative:
+
+    >>> torch.fft.fftfreq(4)
+    tensor([ 0.0000,  0.2500, -0.5000, -0.2500])
+
+""".format(**factory_common_args))
+
+rfftfreq = _add_docstr(_fft.fft_rfftfreq, r"""
+rfftfreq(n, d=1.0, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor
+
+Computes the sample frequencies for :func:`~torch.fft.rfft` with a signal of size :attr:`n`.
+
+Note:
+    :func:`~torch.fft.rfft` returns Hermitian one-sided output, so only the
+    positive frequency terms are returned. For a real FFT of length :attr:`n`
+    and with inputs spaced in length unit :attr:`d`, the frequencies are::
+
+        f = torch.arange(n // 2 + 1) / (d * n)
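+
+    For instance, a quick illustrative check for ``n=4``:
+
+    >>> torch.allclose(torch.fft.rfftfreq(4), torch.arange(4 // 2 + 1) / 4)
+    True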
+
+Note:
+    For even lengths, the Nyquist frequency at ``f[n/2]`` can be thought of as
+    either negative or positive. Unlike :func:`~torch.fft.fftfreq`,
+    :func:`~torch.fft.rfftfreq` always returns it as positive.
+
+Args:
+    n (int): the real FFT length
+    d (float, optional): The sampling length scale.
+        The spacing between individual samples of the FFT input.
+        The default assumes unit spacing; dividing the result by the actual
+        spacing gives the result in physical frequency units.
+
+Keyword Args:
+    {out}
+    {dtype}
+    {layout}
+    {device}
+    {requires_grad}
+
+Example:
+
+    >>> torch.fft.rfftfreq(5)
+    tensor([0.0000, 0.2000, 0.4000])
+
+    >>> torch.fft.rfftfreq(4)
+    tensor([0.0000, 0.2500, 0.5000])
+
+    Compared to the output from :func:`~torch.fft.fftfreq`, we see that the
+    Nyquist frequency at ``f[2]`` has changed sign:
+
+    >>> torch.fft.fftfreq(4)
+    tensor([ 0.0000,  0.2500, -0.5000, -0.2500])
+
+""".format(**factory_common_args))
+
+fftshift = _add_docstr(_fft.fft_fftshift, r"""
+fftshift(input, dim=None) -> Tensor
+
+Reorders n-dimensional FFT data, as provided by :func:`~torch.fft.fftn`, to have
+negative frequency terms first.
+
+This performs a periodic shift of n-dimensional data such that the origin
+``(0, ..., 0)`` is moved to the center of the tensor. Specifically, to
+``input.shape[dim] // 2`` in each selected dimension.
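+
+Equivalently, for a 1-D tensor this is just a ``torch.roll`` by ``input.shape[0] // 2``,
+as in this small illustrative check:
+
+    >>> x = torch.arange(6.)
+    >>> torch.equal(torch.fft.fftshift(x), torch.roll(x, 3))
+    True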
+
+Note:
+    By convention, the FFT returns positive frequency terms first, followed by
+    the negative frequencies in reverse order, so that ``f[-i]`` for all
+    :math:`0 < i \leq n/2` in Python gives the negative frequency terms.
+    :func:`~torch.fft.fftshift` rearranges all frequencies into ascending order
+    from negative to positive with the zero-frequency term in the center.
+
+Note:
+    For even lengths, the Nyquist frequency at ``f[n/2]`` can be thought of as
+    either negative or positive. :func:`~torch.fft.fftshift` always puts the
+    Nyquist term at the 0-index. This is the same convention used by
+    :func:`~torch.fft.fftfreq`.
+
+Args:
+    input (Tensor): the tensor in FFT order
+    dim (int, Tuple[int], optional): The dimensions to rearrange.
+        Only dimensions specified here will be rearranged, any other dimensions
+        will be left in their original order.
+        Default: All dimensions of :attr:`input`.
+
+Example:
+
+    >>> f = torch.fft.fftfreq(4)
+    >>> f
+    tensor([ 0.0000,  0.2500, -0.5000, -0.2500])
+
+    >>> torch.fft.fftshift(f)
+    tensor([-0.5000, -0.2500,  0.0000,  0.2500])
+
+    Also notice that the Nyquist frequency term at ``f[2]`` was moved to the
+    beginning of the tensor.
+
+    This also works for multi-dimensional transforms:
+
+    >>> x = torch.fft.fftfreq(5, d=1/5) + 0.1 * torch.fft.fftfreq(5, d=1/5).unsqueeze(1)
+    >>> x
+    tensor([[ 0.0000,  1.0000,  2.0000, -2.0000, -1.0000],
+            [ 0.1000,  1.1000,  2.1000, -1.9000, -0.9000],
+            [ 0.2000,  1.2000,  2.2000, -1.8000, -0.8000],
+            [-0.2000,  0.8000,  1.8000, -2.2000, -1.2000],
+            [-0.1000,  0.9000,  1.9000, -2.1000, -1.1000]])
+
+    >>> torch.fft.fftshift(x)
+    tensor([[-2.2000, -1.2000, -0.2000,  0.8000,  1.8000],
+            [-2.1000, -1.1000, -0.1000,  0.9000,  1.9000],
+            [-2.0000, -1.0000,  0.0000,  1.0000,  2.0000],
+            [-1.9000, -0.9000,  0.1000,  1.1000,  2.1000],
+            [-1.8000, -0.8000,  0.2000,  1.2000,  2.2000]])
+
+    :func:`~torch.fft.fftshift` can also be useful for spatial data. If our
+    data is defined on a centered grid (``[-(N//2), (N-1)//2]``) then we can
+    use the standard FFT defined on an uncentered grid (``[0, N)``) by first
+    applying an :func:`~torch.fft.ifftshift`.
+
+    >>> x_centered = torch.arange(-5, 5)
+    >>> x_uncentered = torch.fft.ifftshift(x_centered)
+    >>> fft_uncentered = torch.fft.fft(x_uncentered)
+
+    Similarly, we can convert the frequency domain components to centered
+    convention by applying :func:`~torch.fft.fftshift`.
+
+    >>> fft_centered = torch.fft.fftshift(fft_uncentered)
+
+    The inverse transform, from centered Fourier space back to centered spatial
+    data, can be performed by applying the inverse shifts in reverse order:
+
+    >>> x_centered_2 = torch.fft.fftshift(torch.fft.ifft(torch.fft.ifftshift(fft_centered)))
+    >>> torch.testing.assert_close(x_centered.to(torch.complex64), x_centered_2, check_stride=False)
+
+
+""")
+
+ifftshift = _add_docstr(_fft.fft_ifftshift, r"""
+ifftshift(input, dim=None) -> Tensor
+
+Inverse of :func:`~torch.fft.fftshift`.
+
+Args:
+    input (Tensor): the tensor in FFT order
+    dim (int, Tuple[int], optional): The dimensions to rearrange.
+        Only dimensions specified here will be rearranged, any other dimensions
+        will be left in their original order.
+        Default: All dimensions of :attr:`input`.
+
+Example:
+
+    >>> f = torch.fft.fftfreq(5)
+    >>> f
+    tensor([ 0.0000,  0.2000,  0.4000, -0.4000, -0.2000])
+
+    A round-trip through :func:`~torch.fft.fftshift` and
+    :func:`~torch.fft.ifftshift` gives the same result:
+
+    >>> shifted = torch.fft.fftshift(f)
+    >>> torch.fft.ifftshift(shifted)
+    tensor([ 0.0000,  0.2000,  0.4000, -0.4000, -0.2000])
+
+""")
diff --git a/MLPY/Lib/site-packages/torch/fft/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fft/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..366ebc07852ba0cdee9ffb10c48c89a9e1581991
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fft/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/func/__init__.py b/MLPY/Lib/site-packages/torch/func/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc09ec09c286cf71ccf5dab0df16f535d673910a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/func/__init__.py
@@ -0,0 +1,13 @@
+from torch._functorch.eager_transforms import (
+    vjp,
+    jvp,
+    jacrev,
+    jacfwd,
+    hessian,
+    functionalize,
+    linearize
+)
+from torch._functorch.apis import grad, grad_and_value
+from torch._functorch.functional_call import functional_call, stack_module_state
+from torch._functorch.batch_norm_replacement import replace_all_batch_norm_modules_
+from torch._functorch.apis import vmap
diff --git a/MLPY/Lib/site-packages/torch/func/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/func/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bd25816d50d626323cf2b37b645e2731b29311ba
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/func/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/functional.py b/MLPY/Lib/site-packages/torch/functional.py
new file mode 100644
index 0000000000000000000000000000000000000000..90a08b93b8e8bf00dea9b2ef5248534073a2220c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/functional.py
@@ -0,0 +1,1983 @@
+from typing import (
+    List, Tuple, Optional, Union, Any, Sequence, TYPE_CHECKING
+)
+import operator
+import itertools
+
+import torch
+from torch._C import _add_docstr
+import torch.nn.functional as F
+from ._lowrank import svd_lowrank, pca_lowrank
+from .overrides import (
+    has_torch_function, has_torch_function_unary, has_torch_function_variadic,
+    handle_torch_function)
+from ._jit_internal import boolean_dispatch
+from ._jit_internal import _overload as overload
+
+Tensor = torch.Tensor
+from torch import _VF
+
+__all__ = [
+    'atleast_1d',
+    'atleast_2d',
+    'atleast_3d',
+    'align_tensors',
+    'broadcast_shapes',
+    'broadcast_tensors',
+    'cartesian_prod',
+    'block_diag',
+    'cdist',
+    'chain_matmul',
+    'einsum',
+    'istft',
+    'lu',
+    'norm',
+    'meshgrid',
+    'pca_lowrank',
+    'split',
+    'stft',
+    'svd_lowrank',
+    'tensordot',
+    'unique',
+    'unique_consecutive',
+    'unravel_index',
+]
+
+
+def broadcast_tensors(*tensors):
+    r"""broadcast_tensors(*tensors) -> List of Tensors
+
+    Broadcasts the given tensors according to :ref:`broadcasting-semantics`.
+
+    Args:
+        *tensors: any number of tensors of the same type
+
+    .. warning::
+
+        More than one element of a broadcasted tensor may refer to a single
+        memory location. As a result, in-place operations (especially ones that
+        are vectorized) may result in incorrect behavior. If you need to write
+        to the tensors, please clone them first.
+
+    Example::
+
+        >>> x = torch.arange(3).view(1, 3)
+        >>> y = torch.arange(2).view(2, 1)
+        >>> a, b = torch.broadcast_tensors(x, y)
+        >>> a.size()
+        torch.Size([2, 3])
+        >>> a
+        tensor([[0, 1, 2],
+                [0, 1, 2]])
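+
+        >>> # An illustrative check: ``a`` is an expanded view of ``x`` (note the
+        >>> # zero stride in dim 0), which is why cloning is recommended before
+        >>> # any in-place write:
+        >>> a.stride()
+        (0, 1)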
+    """
+    # This wrapper exists to support variadic args.
+    if has_torch_function(tensors):
+        return handle_torch_function(broadcast_tensors, tensors, *tensors)
+    return _VF.broadcast_tensors(tensors)  # type: ignore[attr-defined]
+
+
+def broadcast_shapes(*shapes):
+    r"""broadcast_shapes(*shapes) -> Size
+
+    Similar to :func:`broadcast_tensors` but for shapes.
+
+    This is equivalent to
+    ``torch.broadcast_tensors(*map(torch.empty, shapes))[0].shape``
+    but avoids the need to create intermediate tensors. This is useful for
+    broadcasting tensors of common batch shape but different rightmost shape,
+    e.g. to broadcast mean vectors with covariance matrices.
+
+    Example::
+
+        >>> torch.broadcast_shapes((2,), (3, 1), (1, 1, 1))
+        torch.Size([1, 3, 2])
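+
+        >>> # An illustrative check of the equivalence stated above:
+        >>> torch.broadcast_tensors(*map(torch.empty, [(2,), (3, 1), (1, 1, 1)]))[0].shape
+        torch.Size([1, 3, 2])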
+
+    Args:
+        \*shapes (torch.Size): Shapes of tensors.
+
+    Returns:
+        shape (torch.Size): A shape compatible with all input shapes.
+
+    Raises:
+        RuntimeError: If shapes are incompatible.
+    """
+    # This wrapper exists to support variadic args.
+    # TODO Move this to C++ once the jit has better support for torch.Size.
+    if not torch.jit.is_tracing():
+        max_len = 0
+        for shape in shapes:
+            if isinstance(shape, (int, torch.SymInt)):
+                if max_len < 1:
+                    max_len = 1
+            elif isinstance(shape, (tuple, list)):
+                s = len(shape)
+                if max_len < s:
+                    max_len = s
+        result = [1] * max_len
+
+        from torch.fx.experimental.symbolic_shapes import guard_size_oblivious
+
+        for shape in shapes:
+            if isinstance(shape, (int, torch.SymInt)):
+                shape = (shape,)
+            if isinstance(shape, (tuple, list)):
+                for i in range(-1, -1 - len(shape), -1):
+                    if shape[i] < 0:
+                        raise RuntimeError(f"Trying to create tensor with negative dimension ({shape[i]}): ({shape[i]})")
+                    # NB: result is initialized to 1 so this is effectively an
+                    # equals one test
+                    if guard_size_oblivious(shape[i] == 1) or guard_size_oblivious(shape[i] == result[i]):
+                        continue
+                    if result[i] != 1:
+                        raise RuntimeError("Shape mismatch: objects cannot be broadcast to a single shape")
+                    result[i] = shape[i]
+            else:
+                raise RuntimeError("Input shapes should be of type ints, a tuple of ints, or a list of ints, got ", shape)
+        return torch.Size(result)
+    else:
+        # with implementation above, torch.jit.trace hardcodes the sizes which makes subsequent replays fail
+        with torch.no_grad():
+            scalar = torch.zeros((), device="cpu")
+            tensors = [scalar.expand(shape) for shape in shapes]
+            tensors = broadcast_tensors(*tensors)
+            return tensors[0].shape
+
+
+def split(
+    tensor: Tensor, split_size_or_sections: Union[int, List[int]], dim: int = 0
+) -> Tuple[Tensor, ...]:
+    r"""Splits the tensor into chunks. Each chunk is a view of the original tensor.
+
+    If :attr:`split_size_or_sections` is an integer type, then :attr:`tensor` will
+    be split into equally sized chunks (if possible). Last chunk will be smaller if
+    the tensor size along the given dimension :attr:`dim` is not divisible by
+    :attr:`split_size`.
+
+    If :attr:`split_size_or_sections` is a list, then :attr:`tensor` will be split
+    into ``len(split_size_or_sections)`` chunks with sizes in :attr:`dim` according
+    to :attr:`split_size_or_sections`.
+
+    Args:
+        tensor (Tensor): tensor to split.
+        split_size_or_sections (int) or (list(int)): size of a single chunk or
+            list of sizes for each chunk
+        dim (int): dimension along which to split the tensor.
+
+    Example::
+
+        >>> a = torch.arange(10).reshape(5, 2)
+        >>> a
+        tensor([[0, 1],
+                [2, 3],
+                [4, 5],
+                [6, 7],
+                [8, 9]])
+        >>> torch.split(a, 2)
+        (tensor([[0, 1],
+                 [2, 3]]),
+         tensor([[4, 5],
+                 [6, 7]]),
+         tensor([[8, 9]]))
+        >>> torch.split(a, [1, 4])
+        (tensor([[0, 1]]),
+         tensor([[2, 3],
+                 [4, 5],
+                 [6, 7],
+                 [8, 9]]))
+    """
+    if has_torch_function_unary(tensor):
+        return handle_torch_function(
+            split, (tensor,), tensor, split_size_or_sections, dim=dim)
+    # Overwriting reason:
+    # This dispatches to two ATen functions depending on the type of
+    # split_size_or_sections. The branching code is in _tensor.py, which we
+    # call here.
+    return tensor.split(split_size_or_sections, dim)
+
+
+def einsum(*args: Any) -> Tensor:
+    r"""einsum(equation, *operands) -> Tensor
+
+    Sums the product of the elements of the input :attr:`operands` along dimensions specified using a notation
+    based on the Einstein summation convention.
+
+    Einsum allows computing many common multi-dimensional linear algebraic array operations by representing them
+    in a short-hand format based on the Einstein summation convention, given by :attr:`equation`. The details of
+    this format are described below, but the general idea is to label every dimension of the input :attr:`operands`
+    with some subscript and define which subscripts are part of the output. The output is then computed by summing
+    the product of the elements of the :attr:`operands` along the dimensions whose subscripts are not part of the
+    output. For example, matrix multiplication can be computed using einsum as `torch.einsum("ij,jk->ik", A, B)`.
+    Here, j is the summation subscript and i and k the output subscripts (see section below for more details on why).
+
+    Equation:
+
+        The :attr:`equation` string specifies the subscripts (letters in `[a-zA-Z]`) for each dimension of
+        the input :attr:`operands` in the same order as the dimensions, separating subscripts for each operand by a
+        comma (','), e.g. `'ij,jk'` specify subscripts for two 2D operands. The dimensions labeled with the same subscript
+        must be broadcastable, that is, their size must either match or be `1`. The exception is if a subscript is
+        repeated for the same input operand, in which case the dimensions labeled with this subscript for this operand
+        must match in size and the operand will be replaced by its diagonal along these dimensions. The subscripts that
+        appear exactly once in the :attr:`equation` will be part of the output, sorted in increasing alphabetical order.
+        The output is computed by multiplying the input :attr:`operands` element-wise, with their dimensions aligned based
+        on the subscripts, and then summing out the dimensions whose subscripts are not part of the output.
+
+        Optionally, the output subscripts can be explicitly defined by adding an arrow ('->') at the end of the equation
+        followed by the subscripts for the output. For instance, the following equation computes the transpose of a
+        matrix multiplication: 'ij,jk->ki'. The output subscripts must appear at least once for some input operand and
+        at most once for the output.
+
+        Ellipsis ('...') can be used in place of subscripts to broadcast the dimensions covered by the ellipsis.
+        Each input operand may contain at most one ellipsis which will cover the dimensions not covered by subscripts,
+        e.g. for an input operand with 5 dimensions, the ellipsis in the equation `'ab...c'` covers the third and fourth
+        dimensions. The ellipsis does not need to cover the same number of dimensions across the :attr:`operands` but the
+        'shape' of the ellipsis (the size of the dimensions covered by them) must broadcast together. If the output is not
+        explicitly defined with the arrow ('->') notation, the ellipsis will come first in the output (left-most dimensions),
+        before the subscript labels that appear exactly once for the input operands. e.g. the following equation implements
+        batch matrix multiplication `'...ij,...jk'`.
+
+        A few final notes: the equation may contain whitespaces between the different elements (subscripts, ellipsis,
+        arrow and comma) but something like `'. . .'` is not valid. An empty string `''` is valid for scalar operands.
+
+    .. note::
+
+        ``torch.einsum`` handles ellipsis ('...') differently from NumPy in that it allows dimensions
+        covered by the ellipsis to be summed over, that is, ellipsis are not required to be part of the output.
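+
+        For example, a small illustrative check where the dimensions covered by the
+        ellipsis are summed out because it is omitted from the output:
+
+        >>> x = torch.randn(2, 3, 4)
+        >>> torch.einsum('...i->i', x).shape
+        torch.Size([4])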
+
+    .. note::
+
+        This function uses opt_einsum (https://optimized-einsum.readthedocs.io/en/stable/) to speed up computation or to
+        consume less memory by optimizing contraction order. This optimization occurs when there are at least three
+        inputs, since the order does not matter otherwise. Note that finding _the_ optimal path is an NP-hard problem,
+        thus, opt_einsum relies on different heuristics to achieve near-optimal results. If opt_einsum is not available,
+        the default order is to contract from left to right.
+
+        To bypass this default behavior, add the following line to disable the usage of opt_einsum and skip path
+        calculation: `torch.backends.opt_einsum.enabled = False`
+
+        To specify which strategy you'd like for opt_einsum to compute the contraction path, add the following line:
+        `torch.backends.opt_einsum.strategy = 'auto'`. The default strategy is 'auto', and we also support 'greedy' and
+        'optimal'. Disclaimer that the runtime of 'optimal' is factorial in the number of inputs! See more details in
+        the opt_einsum documentation (https://optimized-einsum.readthedocs.io/en/stable/path_finding.html).
+
+    .. note::
+
+        As of PyTorch 1.10 :func:`torch.einsum` also supports the sublist format (see examples below). In this format,
+        subscripts for each operand are specified by sublists, list of integers in the range [0, 52). These sublists
+        follow their operands, and an extra sublist can appear at the end of the input to specify the output's
+        subscripts, e.g. `torch.einsum(op1, sublist1, op2, sublist2, ..., [sublist_out])`. Python's `Ellipsis` object
+        may be provided in a sublist to enable broadcasting as described in the Equation section above.
+
+    Args:
+        equation (str): The subscripts for the Einstein summation.
+        operands (List[Tensor]): The tensors to compute the Einstein summation of.
+
+    Examples::
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> # trace
+        >>> torch.einsum('ii', torch.randn(4, 4))
+        tensor(-1.2104)
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> # diagonal
+        >>> torch.einsum('ii->i', torch.randn(4, 4))
+        tensor([-0.1034,  0.7952, -0.2433,  0.4545])
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> # outer product
+        >>> x = torch.randn(5)
+        >>> y = torch.randn(4)
+        >>> torch.einsum('i,j->ij', x, y)
+        tensor([[ 0.1156, -0.2897, -0.3918,  0.4963],
+                [-0.3744,  0.9381,  1.2685, -1.6070],
+                [ 0.7208, -1.8058, -2.4419,  3.0936],
+                [ 0.1713, -0.4291, -0.5802,  0.7350],
+                [ 0.5704, -1.4290, -1.9323,  2.4480]])
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> # batch matrix multiplication
+        >>> As = torch.randn(3, 2, 5)
+        >>> Bs = torch.randn(3, 5, 4)
+        >>> torch.einsum('bij,bjk->bik', As, Bs)
+        tensor([[[-1.0564, -1.5904,  3.2023,  3.1271],
+                [-1.6706, -0.8097, -0.8025, -2.1183]],
+
+                [[ 4.2239,  0.3107, -0.5756, -0.2354],
+                [-1.4558, -0.3460,  1.5087, -0.8530]],
+
+                [[ 2.8153,  1.8787, -4.3839, -1.2112],
+                [ 0.3728, -2.1131,  0.0921,  0.8305]]])
+
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> # with sublist format and ellipsis
+        >>> torch.einsum(As, [..., 0, 1], Bs, [..., 1, 2], [..., 0, 2])
+        tensor([[[-1.0564, -1.5904,  3.2023,  3.1271],
+                [-1.6706, -0.8097, -0.8025, -2.1183]],
+
+                [[ 4.2239,  0.3107, -0.5756, -0.2354],
+                [-1.4558, -0.3460,  1.5087, -0.8530]],
+
+                [[ 2.8153,  1.8787, -4.3839, -1.2112],
+                [ 0.3728, -2.1131,  0.0921,  0.8305]]])
+
+        >>> # batch permute
+        >>> A = torch.randn(2, 3, 4, 5)
+        >>> torch.einsum('...ij->...ji', A).shape
+        torch.Size([2, 3, 5, 4])
+
+        >>> # equivalent to torch.nn.functional.bilinear
+        >>> A = torch.randn(3, 5, 4)
+        >>> l = torch.randn(2, 5)
+        >>> r = torch.randn(2, 4)
+        >>> torch.einsum('bn,anm,bm->ba', l, A, r)
+        tensor([[-0.3430, -5.2405,  0.4494],
+                [ 0.3311,  5.5201, -3.0356]])
+    """
+    import torch.backends.opt_einsum as opt_einsum
+    # This wrapper exists to support variadic args.
+    if len(args) < 2:
+        raise ValueError('einsum(): must specify the equation string and at least one operand, '
+                         'or at least one operand and its subscripts list')
+
+    equation = None
+    operands = None
+
+    if isinstance(args[0], torch.Tensor):
+        # Convert the subscript list format which is an interleaving of operand and its subscripts
+        # list with an optional output subscripts list at the end (see documentation for more details on this)
+        # to the equation string format by creating the equation string from the subscripts list and grouping the
+        # input operands into a tensorlist (List[Tensor]).
+        def parse_subscript(n: int) -> str:
+            if n == Ellipsis:
+                return '...'
+            if n >= 0 and n < 26:
+                return chr(ord('A') + n)
+            if n >= 26 and n < 52:
+                return chr(ord('a') + n - 26)
+            raise ValueError('einsum(): subscript in subscript list is not within the valid range [0, 52)')
+
+        # Parse subscripts for input operands
+        equation = ','.join(''.join(parse_subscript(s) for s in l) for l in args[1::2])
+
+        # Parse optional output subscripts (provided when the number of arguments is odd)
+        if len(args) % 2 == 1:
+            equation += '->' + ''.join(parse_subscript(s) for s in args[-1])
+            operands = args[:-1:2]
+        else:
+            operands = args[::2]
+    else:
+        equation = args[0]
+        operands = args[1:]
+
+    if has_torch_function(operands):
+        return handle_torch_function(einsum, operands, equation, *operands)
+
+    if len(operands) == 1 and isinstance(operands[0], (list, tuple)):
+        # the old interface of passing the operands as one list argument
+        _operands = operands[0]
+    # recurse in case operands contains a value that has a torch function
+        # in the original implementation this line is omitted
+        return einsum(equation, *_operands)
+
+    if len(operands) <= 2 or not opt_einsum.enabled:
+        # the path for contracting 0 or 1 time(s) is already optimized
+        # or the user has disabled using opt_einsum
+        return _VF.einsum(equation, operands)  # type: ignore[attr-defined]
+
+    path = None
+    if opt_einsum.is_available():
+        _opt_einsum = opt_einsum.get_opt_einsum()
+        tupled_path = _opt_einsum.contract_path(equation, *operands, optimize=opt_einsum.strategy)[0]
+        # flatten path for dispatching to C++
+        path = [item for pair in tupled_path for item in pair]
+    return _VF.einsum(equation, operands, path=path)  # type: ignore[attr-defined]
+
+
+# This wrapper exists to support variadic args.
+if TYPE_CHECKING:
+    # The JIT doesn't understand Union, so only add type annotation for mypy
+    def meshgrid(*tensors: Union[Tensor, List[Tensor]],
+                 indexing: Optional[str] = None) -> Tuple[Tensor, ...]:
+        return _meshgrid(*tensors, indexing=indexing)
+else:
+    def meshgrid(*tensors, indexing: Optional[str] = None) -> Tuple[Tensor, ...]:
+        r"""Creates grids of coordinates specified by the 1D inputs in :attr:`tensors`.
+
+        This is helpful when you want to visualize data over some
+        range of inputs. See below for a plotting example.
+
+        Given :math:`N` 1D tensors :math:`T_0 \ldots T_{N-1}` as
+        inputs with corresponding sizes :math:`S_0 \ldots S_{N-1}`,
+        this creates :math:`N` N-dimensional tensors :math:`G_0 \ldots
+        G_{N-1}`, each with shape :math:`(S_0, ..., S_{N-1})` where
+        the output :math:`G_i` is constructed by expanding :math:`T_i`
+        to the result shape.
+
+        .. note::
+            0D inputs are treated equivalently to 1D inputs of a
+            single element.
+
+        .. warning::
+            `torch.meshgrid(*tensors)` currently has the same behavior
+            as calling `numpy.meshgrid(*arrays, indexing='ij')`.
+
+            In the future `torch.meshgrid` will transition to
+            `indexing='xy'` as the default.
+
+            https://github.com/pytorch/pytorch/issues/50276 tracks
+            this issue with the goal of migrating to NumPy's behavior.
+
+        .. seealso::
+
+            :func:`torch.cartesian_prod` has the same effect but it
+            collects the data in a tensor of vectors.
+
+        Args:
+            tensors (list of Tensor): list of scalars or 1 dimensional tensors. Scalars will be
+                treated as tensors of size :math:`(1,)` automatically
+
+            indexing: (str, optional): the indexing mode, either "xy"
+                or "ij", defaults to "ij". See warning for future changes.
+
+                If "xy" is selected, the first dimension corresponds
+                to the cardinality of the second input and the second
+                dimension corresponds to the cardinality of the first
+                input.
+
+                If "ij" is selected, the dimensions are in the same
+                order as the cardinality of the inputs.
+
+        Returns:
+            seq (sequence of Tensors): If the input has :math:`N`
+            tensors of size :math:`S_0 \ldots S_{N-1}`, then the
+            output will also have :math:`N` tensors, where each tensor
+            is of shape :math:`(S_0, ..., S_{N-1})`.
+
+        Example::
+
+            >>> x = torch.tensor([1, 2, 3])
+            >>> y = torch.tensor([4, 5, 6])
+
+            Observe the element-wise pairings across the grid, (1, 4),
+            (1, 5), ..., (3, 6). This is the same thing as the
+            cartesian product.
+            >>> grid_x, grid_y = torch.meshgrid(x, y, indexing='ij')
+            >>> grid_x
+            tensor([[1, 1, 1],
+                    [2, 2, 2],
+                    [3, 3, 3]])
+            >>> grid_y
+            tensor([[4, 5, 6],
+                    [4, 5, 6],
+                    [4, 5, 6]])
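+
+            >>> # An illustrative check of the ``indexing`` argument described above:
+            >>> # the 'xy' grids are the transposes of the 'ij' grids.
+            >>> xx, yy = torch.meshgrid(x, y, indexing='xy')
+            >>> torch.equal(xx, grid_x.T) and torch.equal(yy, grid_y.T)
+            True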
+
+            This correspondence can be seen when these grids are
+            stacked properly.
+            >>> torch.equal(torch.cat(tuple(torch.dstack([grid_x, grid_y]))),
+            ...             torch.cartesian_prod(x, y))
+            True
+
+            `torch.meshgrid` is commonly used to produce a grid for
+            plotting.
+            >>> # xdoctest: +REQUIRES(module:matplotlib)
+            >>> # xdoctest: +REQUIRES(env:DOCTEST_SHOW)
+            >>> import matplotlib.pyplot as plt
+            >>> xs = torch.linspace(-5, 5, steps=100)
+            >>> ys = torch.linspace(-5, 5, steps=100)
+            >>> x, y = torch.meshgrid(xs, ys, indexing='xy')
+            >>> z = torch.sin(torch.sqrt(x * x + y * y))
+            >>> ax = plt.axes(projection='3d')
+            >>> ax.plot_surface(x.numpy(), y.numpy(), z.numpy())
+            >>> plt.show()
+
+        .. image:: ../_static/img/meshgrid.png
+            :width: 512
+
+        """
+        return _meshgrid(*tensors, indexing=indexing)
+
+
+def _meshgrid(*tensors, indexing: Optional[str]):
+    if has_torch_function(tensors):
+        return handle_torch_function(meshgrid, tensors, *tensors, indexing=indexing)
+    if len(tensors) == 1 and isinstance(tensors[0], (list, tuple)):
+        # the old interface of passing the operands as one list argument
+        tensors = tensors[0]  # type: ignore[assignment]
+
+    # Continue allowing call of old method that takes no indexing
+    # kwarg for forward compatibility reasons.
+    #
+    # Remove this two weeks after landing.
+    kwargs = {} if indexing is None else {'indexing': indexing}
+    return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
+
+
+def stft(input: Tensor, n_fft: int, hop_length: Optional[int] = None,
+         win_length: Optional[int] = None, window: Optional[Tensor] = None,
+         center: bool = True, pad_mode: str = 'reflect', normalized: bool = False,
+         onesided: Optional[bool] = None,
+         return_complex: Optional[bool] = None) -> Tensor:
+    r"""Short-time Fourier transform (STFT).
+
+    .. warning::
+        From version 1.8.0, :attr:`return_complex` must always be given
+        explicitly for real inputs and `return_complex=False` has been
+        deprecated. Strongly prefer `return_complex=True` as in a future
+        pytorch release, this function will only return complex tensors.
+
+        Note that :func:`torch.view_as_real` can be used to recover a real
+        tensor with an extra last dimension for real and imaginary components.
+
+    .. warning::
+        From version 2.1, a warning will be provided if a :attr:`window` is
+        not specified. In a future release, this attribute will be required.
+        Not providing a window currently defaults to using a rectangular window,
+        which may result in undesirable artifacts. Consider using tapered windows,
+        such as :func:`torch.hann_window`.
+
+    The STFT computes the Fourier transform of short overlapping windows of the
+    input. This gives the frequency components of the signal as they change over
+    time. The interface of this function is modeled after (but *not* a drop-in
+    replacement for) the librosa_ stft function.
+
+    .. _librosa: https://librosa.org/doc/latest/generated/librosa.stft.html
+
+    Ignoring the optional batch dimension, this method computes the following
+    expression:
+
+    .. math::
+        X[\omega, m] = \sum_{k = 0}^{\text{win\_length-1}}%
+                            \text{window}[k]\ \text{input}[m \times \text{hop\_length} + k]\ %
+                            \exp\left(- j \frac{2 \pi \cdot \omega k}{\text{n\_fft}}\right),
+
+    where :math:`m` is the index of the sliding window, and :math:`\omega` is
+    the frequency :math:`0 \leq \omega < \text{n\_fft}` for ``onesided=False``,
+    or :math:`0 \leq \omega < \lfloor \text{n\_fft} / 2 \rfloor + 1` for ``onesided=True``.
+
+    * :attr:`input` must be either a 1-D time sequence or a 2-D batch of time
+      sequences.
+
+    * If :attr:`hop_length` is ``None`` (default), it is treated as equal to
+      ``floor(n_fft / 4)``.
+
+    * If :attr:`win_length` is ``None`` (default), it is treated as equal to
+      :attr:`n_fft`.
+
+    * :attr:`window` can be a 1-D tensor of size :attr:`win_length`, e.g., from
+      :meth:`torch.hann_window`. If :attr:`window` is ``None`` (default), it is
+      treated as if having :math:`1` everywhere in the window. If
+      :math:`\text{win\_length} < \text{n\_fft}`, :attr:`window` will be padded on
+      both sides to length :attr:`n_fft` before being applied.
+
+    * If :attr:`center` is ``True`` (default), :attr:`input` will be padded on
+      both sides so that the :math:`t`-th frame is centered at time
+      :math:`t \times \text{hop\_length}`. Otherwise, the :math:`t`-th frame
+      begins at time  :math:`t \times \text{hop\_length}`.
+
+    * :attr:`pad_mode` determines the padding method used on :attr:`input` when
+      :attr:`center` is ``True``. See :meth:`torch.nn.functional.pad` for
+      all available options. Default is ``"reflect"``.
+
+    * If :attr:`onesided` is ``True`` (default for real input), only values for
+      :math:`\omega` in :math:`\left[0, 1, 2, \dots, \left\lfloor
+      \frac{\text{n\_fft}}{2} \right\rfloor + 1\right]` are returned because
+      the real-to-complex Fourier transform satisfies the conjugate symmetry,
+      i.e., :math:`X[m, \omega] = X[m, \text{n\_fft} - \omega]^*`.
+      Note if the input or window tensors are complex, then :attr:`onesided`
+      output is not possible.
+
+    * If :attr:`normalized` is ``True`` (default is ``False``), the function
+      returns the normalized STFT results, i.e., multiplied by :math:`(\text{frame\_length})^{-0.5}`.
+
+    * If :attr:`return_complex` is ``True`` (default if input is complex), the
+      return is a ``input.dim() + 1`` dimensional complex tensor. If ``False``,
+      the output is a ``input.dim() + 2`` dimensional real tensor where the last
+      dimension represents the real and imaginary components.
+
+    Returns either a complex tensor of size :math:`(* \times N \times T)` if
+    :attr:`return_complex` is true, or a real tensor of size :math:`(* \times N
+    \times T \times 2)`. Where :math:`*` is the optional batch size of
+    :attr:`input`, :math:`N` is the number of frequencies where STFT is applied
+    and :math:`T` is the total number of frames used.
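+
+    For example, a minimal illustrative sketch of these shapes (sizes chosen
+    arbitrarily): with ``center=True`` there are ``1 + L // hop_length`` frames
+    and, for one-sided output, ``n_fft // 2 + 1`` frequencies:
+
+    >>> x = torch.randn(1000)
+    >>> S = torch.stft(x, n_fft=400, hop_length=100,
+    ...                window=torch.hann_window(400), return_complex=True)
+    >>> S.shape
+    torch.Size([201, 11])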
+
+    .. warning::
+      This function changed signature at version 0.4.1. Calling with the
+      previous signature may cause error or return incorrect result.
+
+    Args:
+        input (Tensor): the input tensor of shape `(B?, L)` where `B?` is an optional
+            batch dimension
+        n_fft (int): size of Fourier transform
+        hop_length (int, optional): the distance between neighboring sliding window
+            frames. Default: ``None`` (treated as equal to ``floor(n_fft / 4)``)
+        win_length (int, optional): the size of window frame and STFT filter.
+            Default: ``None``  (treated as equal to :attr:`n_fft`)
+        window (Tensor, optional): the optional window function.
+            Shape must be 1d and `<= n_fft`
+            Default: ``None`` (treated as window of all :math:`1` s)
+        center (bool, optional): whether to pad :attr:`input` on both sides so
+            that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`.
+            Default: ``True``
+        pad_mode (str, optional): controls the padding method used when
+            :attr:`center` is ``True``. Default: ``"reflect"``
+        normalized (bool, optional): controls whether to return the normalized STFT results
+             Default: ``False``
+        onesided (bool, optional): controls whether to return half of results to
+            avoid redundancy for real inputs.
+            Default: ``True`` for real :attr:`input` and :attr:`window`, ``False`` otherwise.
+        return_complex (bool, optional): whether to return a complex tensor, or
+            a real tensor with an extra last dimension for the real and
+            imaginary components.
+
+            .. versionchanged:: 2.0
+               ``return_complex`` is now a required argument for real inputs,
+               as the default is being transitioned to ``True``.
+
+            .. deprecated:: 2.0
+               ``return_complex=False`` is deprecated, instead use ``return_complex=True``
+               Note that calling :func:`torch.view_as_real` on the output will
+               recover the deprecated output format.
+
+    Returns:
+        Tensor: A tensor containing the STFT result with shape `(B?, N, T, C?)` where
+           - `B?` is an optional batch dimension from the input.
+           - `N` is the number of frequency samples, `(n_fft // 2) + 1` for
+             `onesided=True`, or otherwise `n_fft`.
+           - `T` is the number of frames, `1 + L // hop_length`
+             for `center=True`, or `1 + (L - n_fft) // hop_length` otherwise.
+           - `C?` is an optional length-2 dimension of real and imaginary
+             components, present when `return_complex=False`.
+
+    """
+    if has_torch_function_unary(input):
+        return handle_torch_function(
+            stft, (input,), input, n_fft, hop_length=hop_length, win_length=win_length,
+            window=window, center=center, pad_mode=pad_mode, normalized=normalized,
+            onesided=onesided, return_complex=return_complex)
+    # NOTE: Do not edit. This code will be removed once the forward-compatibility
+    #       period is over for PR #73432
+    if center:
+        signal_dim = input.dim()
+        extended_shape = [1] * (3 - signal_dim) + list(input.size())
+        pad = int(n_fft // 2)
+        input = F.pad(input.view(extended_shape), [pad, pad], pad_mode)
+        input = input.view(input.shape[-signal_dim:])
+    return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore[attr-defined]
+                    normalized, onesided, return_complex)
+
+
+istft = _add_docstr(
+    torch.istft,
+    "istft(input, n_fft, hop_length=None, win_length=None, window=None, center=True, "
+    "normalized=False, onesided=None, length=None, return_complex=False) -> Tensor:\n"
+    r"""
+Inverse short time Fourier Transform. This is expected to be the inverse of :func:`~torch.stft`.
+
+.. warning::
+    From version 2.1, a warning will be provided if a :attr:`window` is
+    not specified. In a future release, this attribute will be required.
+    Please provide the same window used in the stft call.
+
+It has the same parameters (plus an additional optional parameter, :attr:`length`) and should return the
+least squares estimate of the original signal. The algorithm will check for the NOLA condition
+(nonzero overlap-add).
+
+An important consideration is that the parameters :attr:`window` and :attr:`center` must be chosen so that the envelope
+created by the summation of all the windows is never zero at any point in time. Specifically,
+:math:`\sum_{t=-\infty}^{\infty} |w|^2[n-t\times hop\_length] \cancel{=} 0`.
+
+Since :func:`~torch.stft` discards elements at the end of the signal if they do not fit in a frame,
+``istft`` may return a shorter signal than the original signal (this can occur if :attr:`center` is ``False``,
+since the signal isn't padded). If :attr:`length` is given in the arguments and is longer than expected,
+``istft`` will pad zeros to the end of the returned signal.
+
+If :attr:`center` is ``True``, then there will be padding, e.g. ``'constant'``, ``'reflect'``, etc.
+Left padding can be trimmed off exactly because it can be calculated, but right padding cannot be
+calculated without additional information.
+
+Example: Suppose the last window is:
+``[17, 18, 0, 0, 0]`` vs ``[18, 0, 0, 0, 0]``
+
+The :attr:`n_fft`, :attr:`hop_length`, :attr:`win_length` are all the same, which prevents the calculation
+of the right padding. These additional values could be zeros or a reflection of the signal, so providing
+:attr:`length` could be useful. If :attr:`length` is ``None``, then padding will be aggressively removed
+(some loss of signal).
+
+[1] D. W. Griffin and J. S. Lim, "Signal estimation from modified short-time Fourier transform,"
+IEEE Trans. ASSP, vol.32, no.2, pp.236-243, Apr. 1984.
+
+Args:
+    input (Tensor): The input tensor. Expected to be in the format of the :func:`~torch.stft`
+        output. That is, a complex tensor of shape `(B?, N, T)` where
+
+        - `B?` is an optional batch dimension
+        - `N` is the number of frequency samples, `(n_fft // 2) + 1`
+          for onesided input, or otherwise `n_fft`.
+        - `T` is the number of frames, `1 + length // hop_length` for centered stft,
+          or `1 + (length - n_fft) // hop_length` otherwise.
+
+        .. versionchanged:: 2.0
+            Real datatype inputs are no longer supported. Input must now have a
+            complex datatype, as returned by ``stft(..., return_complex=True)``.
+    n_fft (int): Size of Fourier transform
+    hop_length (Optional[int]): The distance between neighboring sliding window frames.
+        (Default: ``n_fft // 4``)
+    win_length (Optional[int]): The size of window frame and STFT filter. (Default: ``n_fft``)
+    window (Optional[torch.Tensor]): The optional window function.
+        Shape must be 1d and `<= n_fft`
+        (Default: ``torch.ones(win_length)``)
+    center (bool): Whether :attr:`input` was padded on both sides so that the :math:`t`-th frame is
+        centered at time :math:`t \times \text{hop\_length}`.
+        (Default: ``True``)
+    normalized (bool): Whether the STFT was normalized. (Default: ``False``)
+    onesided (Optional[bool]): Whether the STFT was onesided.
+        (Default: ``True`` if `n_fft != fft_size` in the input size)
+    length (Optional[int]): The amount to trim the signal by (i.e. the
+        original signal length). Defaults to `(T - 1) * hop_length` for
+        centered stft, or `n_fft + (T - 1) * hop_length` otherwise, where `T`
+        is the number of input frames.
+    return_complex (Optional[bool]):
+        Whether the output should be complex, or if the input should be
+        assumed to derive from a real signal and window.
+        Note that this is incompatible with ``onesided=True``.
+        (Default: ``False``)
+
+Returns:
+    Tensor: Least squares estimation of the original signal of shape `(B?, length)` where
+        `B?` is an optional batch dimension from the input tensor.
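+
+Example (a minimal round-trip sketch; the window and sizes below are arbitrary choices)::
+
+    >>> signal = torch.randn(1000)
+    >>> window = torch.hann_window(400)
+    >>> spec = torch.stft(signal, n_fft=400, window=window, return_complex=True)
+    >>> torch.istft(spec, n_fft=400, window=window, length=signal.shape[-1]).shape
+    torch.Size([1000])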
+""")
+
+
+if TYPE_CHECKING:
+    # These _impl functions return a variable number of tensors as output with
+    # __torch_function__; tuple unpacking is done already rather than being
+    # done by the caller of the _impl function
+    _unique_impl_out = Any
+else:
+    _unique_impl_out = Tuple[Tensor, Tensor, Tensor]
+
+
+def _unique_impl(input: Tensor, sorted: bool = True,
+                 return_inverse: bool = False, return_counts: bool = False,
+                 dim: Optional[int] = None) -> _unique_impl_out:
+    r"""unique(input, sorted=True, return_inverse=False, return_counts=False, dim=None) -> Tuple[Tensor, Tensor, Tensor]
+
+    Returns the unique elements of the input tensor.
+
+    .. note:: This function is different from :func:`torch.unique_consecutive` in the sense that
+        this function also eliminates non-consecutive duplicate values.
+
+    .. note:: Currently, in both the CUDA and the CPU implementations,
+        `torch.unique` always sorts the tensor at the beginning regardless of the `sorted` argument.
+        Sorting could be slow, so if your input tensor is already sorted, it is recommended to use
+        :func:`torch.unique_consecutive`, which avoids the sorting.
+
+    Args:
+        input (Tensor): the input tensor
+        sorted (bool): Whether to sort the unique elements in ascending order
+            before returning as output.
+        return_inverse (bool): Whether to also return the indices for where
+            elements in the original input ended up in the returned unique list.
+        return_counts (bool): Whether to also return the counts for each unique
+            element.
+        dim (int, optional): the dimension to operate upon. If ``None``, the
+            unique of the flattened input is returned. Otherwise, each of the
+            tensors indexed by the given dimension is treated as one of the
+            elements to apply the unique operation upon. See examples for more
+            details. Default: ``None``
+
+    Returns:
+        (Tensor, Tensor (optional), Tensor (optional)): A tensor or a tuple of tensors containing
+
+            - **output** (*Tensor*): the output list of unique scalar elements.
+            - **inverse_indices** (*Tensor*): (optional) if
+              :attr:`return_inverse` is True, there will be an additional
+              returned tensor (same shape as input) representing the indices
+              for where elements in the original input map to in the output;
+              otherwise, this function will only return a single tensor.
+            - **counts** (*Tensor*): (optional) if
+              :attr:`return_counts` is True, there will be an additional
+              returned tensor (same shape as output or output.size(dim),
+              if dim was specified) representing the number of occurrences
+              for each unique value or tensor.
+
+    Example::
+
+        >>> output = torch.unique(torch.tensor([1, 3, 2, 3], dtype=torch.long))
+        >>> output
+        tensor([1, 2, 3])
+
+        >>> output, inverse_indices = torch.unique(
+        ...     torch.tensor([1, 3, 2, 3], dtype=torch.long), sorted=True, return_inverse=True)
+        >>> output
+        tensor([1, 2, 3])
+        >>> inverse_indices
+        tensor([0, 2, 1, 2])
+
+        >>> output, inverse_indices = torch.unique(
+        ...     torch.tensor([[1, 3], [2, 3]], dtype=torch.long), sorted=True, return_inverse=True)
+        >>> output
+        tensor([1, 2, 3])
+        >>> inverse_indices
+        tensor([[0, 2],
+                [1, 2]])
+
+        >>> a = torch.tensor([
+        ...     [
+        ...         [1, 1, 0, 0],
+        ...         [1, 1, 0, 0],
+        ...         [0, 0, 1, 1],
+        ...     ],
+        ...     [
+        ...         [0, 0, 1, 1],
+        ...         [0, 0, 1, 1],
+        ...         [1, 1, 1, 1],
+        ...     ],
+        ...     [
+        ...         [1, 1, 0, 0],
+        ...         [1, 1, 0, 0],
+        ...         [0, 0, 1, 1],
+        ...     ],
+        ... ])
+
+        >>> # If we call `torch.unique(a, dim=0)`, each of the tensors `a[idx, :, :]`
+        >>> # will be compared. We can see that `a[0, :, :]` and `a[2, :, :]` match
+        >>> # each other, so one of them will be removed.
+        >>> (a[0, :, :] == a[2, :, :]).all()
+        tensor(True)
+        >>> a_unique_dim0 = torch.unique(a, dim=0)
+        >>> a_unique_dim0
+        tensor([[[0, 0, 1, 1],
+                 [0, 0, 1, 1],
+                 [1, 1, 1, 1]],
+                [[1, 1, 0, 0],
+                 [1, 1, 0, 0],
+                 [0, 0, 1, 1]]])
+
+        >>> # Notice which sub-tensors from `a` match with the sub-tensors from
+        >>> # `a_unique_dim0`:
+        >>> (a_unique_dim0[0, :, :] == a[1, :, :]).all()
+        tensor(True)
+        >>> (a_unique_dim0[1, :, :] == a[0, :, :]).all()
+        tensor(True)
+
+        >>> # For `torch.unique(a, dim=1)`, each of the tensors `a[:, idx, :]` are
+        >>> # compared. `a[:, 0, :]` and `a[:, 1, :]` match each other, so one of
+        >>> # them will be removed.
+        >>> (a[:, 0, :] == a[:, 1, :]).all()
+        tensor(True)
+        >>> torch.unique(a, dim=1)
+        tensor([[[0, 0, 1, 1],
+                 [1, 1, 0, 0]],
+                [[1, 1, 1, 1],
+                 [0, 0, 1, 1]],
+                [[0, 0, 1, 1],
+                 [1, 1, 0, 0]]])
+
+        >>> # For `torch.unique(a, dim=2)`, the tensors `a[:, :, idx]` are compared.
+        >>> # `a[:, :, 0]` and `a[:, :, 1]` match each other. Also, `a[:, :, 2]` and
+        >>> # `a[:, :, 3]` match each other as well. So in this case, two of the
+        >>> # sub-tensors will be removed.
+        >>> (a[:, :, 0] == a[:, :, 1]).all()
+        tensor(True)
+        >>> (a[:, :, 2] == a[:, :, 3]).all()
+        tensor(True)
+        >>> torch.unique(a, dim=2)
+        tensor([[[0, 1],
+                 [0, 1],
+                 [1, 0]],
+                [[1, 0],
+                 [1, 0],
+                 [1, 1]],
+                [[0, 1],
+                 [0, 1],
+                 [1, 0]]])
+    """
+    if has_torch_function_unary(input):
+        return handle_torch_function(
+            unique, (input,), input, sorted=sorted, return_inverse=return_inverse,
+            return_counts=return_counts, dim=dim)
+
+    if dim is not None:
+        output, inverse_indices, counts = _VF.unique_dim(
+            input,
+            dim,
+            sorted=sorted,
+            return_inverse=return_inverse,
+            return_counts=return_counts,
+        )
+    else:
+        output, inverse_indices, counts = torch._unique2(
+            input,
+            sorted=sorted,
+            return_inverse=return_inverse,
+            return_counts=return_counts,
+        )
+    return output, inverse_indices, counts
+
+
+def _unique_consecutive_impl(input: Tensor, return_inverse: bool = False,
+                             return_counts: bool = False,
+                             dim: Optional[int] = None) -> _unique_impl_out:
+    r"""Eliminates all but the first element from every consecutive group of equivalent elements.
+
+    .. note:: This function is different from :func:`torch.unique` in the sense that this function
+        only eliminates consecutive duplicate values. Its semantics are similar to those of `std::unique`
+        in C++.
+
+    Args:
+        input (Tensor): the input tensor
+        return_inverse (bool): Whether to also return the indices for where
+            elements in the original input ended up in the returned unique list.
+        return_counts (bool): Whether to also return the counts for each unique
+            element.
+        dim (int, optional): the dimension to apply unique over. If ``None``, the unique of the
+            flattened input is returned. Default: ``None``
+
+    Returns:
+        (Tensor, Tensor (optional), Tensor (optional)): A tensor or a tuple of tensors containing
+
+            - **output** (*Tensor*): the output list of unique scalar elements.
+            - **inverse_indices** (*Tensor*): (optional) if
+              :attr:`return_inverse` is True, there will be an additional
+              returned tensor (same shape as input) representing the indices
+              for where elements in the original input map to in the output;
+              otherwise, this function will only return a single tensor.
+            - **counts** (*Tensor*): (optional) if
+              :attr:`return_counts` is True, there will be an additional
+              returned tensor (same shape as output or output.size(dim),
+              if dim was specified) representing the number of occurrences
+              for each unique value or tensor.
+
+    Example::
+
+        >>> x = torch.tensor([1, 1, 2, 2, 3, 1, 1, 2])
+        >>> output = torch.unique_consecutive(x)
+        >>> output
+        tensor([1, 2, 3, 1, 2])
+
+        >>> output, inverse_indices = torch.unique_consecutive(x, return_inverse=True)
+        >>> output
+        tensor([1, 2, 3, 1, 2])
+        >>> inverse_indices
+        tensor([0, 0, 1, 1, 2, 3, 3, 4])
+
+        >>> output, counts = torch.unique_consecutive(x, return_counts=True)
+        >>> output
+        tensor([1, 2, 3, 1, 2])
+        >>> counts
+        tensor([2, 2, 1, 2, 1])
+    """
+    if has_torch_function_unary(input):
+        return handle_torch_function(
+            unique_consecutive, (input,), input, return_inverse=return_inverse,
+            return_counts=return_counts, dim=dim)
+    output, inverse_indices, counts = _VF.unique_consecutive(  # type: ignore[attr-defined]
+        input, return_inverse=return_inverse, return_counts=return_counts, dim=dim)
+    return output, inverse_indices, counts
+
+
+def _return_counts(input, sorted=True, return_inverse=False, return_counts=False, dim=None):
+    # type: (Tensor, bool, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor]
+
+    if has_torch_function_unary(input):
+        return _unique_impl(input, sorted, return_inverse, return_counts, dim)
+
+    output, _, counts = _unique_impl(input, sorted, return_inverse, return_counts, dim)
+    return output, counts
+
+
+def _return_output(input, sorted=True, return_inverse=False, return_counts=False, dim=None):
+    # type: (Tensor, bool, bool, bool, Optional[int]) -> Tensor
+
+    if has_torch_function_unary(input):
+        return _unique_impl(input, sorted, return_inverse, return_counts, dim)
+
+    output, _, _ = _unique_impl(input, sorted, return_inverse, return_counts, dim)
+    return output
+
+
+def _return_inverse(input, sorted=True, return_inverse=False, return_counts=False, dim=None):
+    # type: (Tensor, bool, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor]
+
+    if has_torch_function_unary(input):
+        return _unique_impl(input, sorted, return_inverse, return_counts, dim)
+
+    output, inverse_indices, _ = _unique_impl(input, sorted, return_inverse, return_counts, dim)
+    return output, inverse_indices
+
+
+_return_inverse_false = boolean_dispatch(
+    arg_name='return_counts',
+    arg_index=3,
+    default=False,
+    if_true=_return_counts,
+    if_false=_return_output,
+    module_name=__name__,
+    func_name='unique')
+
+_return_inverse_true = boolean_dispatch(
+    arg_name='return_counts',
+    arg_index=3,
+    default=False,
+    if_true=_unique_impl,
+    if_false=_return_inverse,
+    module_name=__name__,
+    func_name='unique')
+
+# The return type of unique depends on `return_inverse` and `return_counts`, so in order to
+# resolve the output type in TorchScript we need to statically know the value of both parameters.
+
+unique = boolean_dispatch(
+    arg_name='return_inverse',
+    arg_index=2,
+    default=False,
+    if_true=_return_inverse_true,
+    if_false=_return_inverse_false,
+    module_name=__name__,
+    func_name='unique')
+unique.__doc__ = _unique_impl.__doc__
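+
+# For reference, the statically-resolved dispatch above means:
+#   unique(t)                                            -> _return_output   (Tensor)
+#   unique(t, return_inverse=True)                       -> _return_inverse  (Tensor, Tensor)
+#   unique(t, return_counts=True)                        -> _return_counts   (Tensor, Tensor)
+#   unique(t, return_inverse=True, return_counts=True)   -> _unique_impl     (Tensor, Tensor, Tensor)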
+
+
+def _consecutive_return_counts(input, return_inverse=False, return_counts=False, dim=None):
+    # type: (Tensor, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor]
+
+    if has_torch_function_unary(input):
+        return _unique_consecutive_impl(input, return_inverse, return_counts, dim)
+
+    output, _, counts = _unique_consecutive_impl(input, return_inverse, return_counts, dim)
+    return output, counts
+
+
+def _consecutive_return_output(input, return_inverse=False, return_counts=False, dim=None):
+    # type: (Tensor, bool, bool, Optional[int]) -> Tensor
+
+    if has_torch_function_unary(input):
+        return _unique_consecutive_impl(input, return_inverse, return_counts, dim)
+
+    output, _, _ = _unique_consecutive_impl(input, return_inverse, return_counts, dim)
+    return output
+
+
+def _consecutive_return_inverse(input, return_inverse=False, return_counts=False, dim=None):
+    # type: (Tensor, bool, bool, Optional[int]) -> Tuple[Tensor, Tensor]
+
+    if has_torch_function_unary(input):
+        return _unique_consecutive_impl(input, return_inverse, return_counts, dim)
+
+    output, inverse_indices, _ = _unique_consecutive_impl(input, return_inverse, return_counts, dim)
+    return output, inverse_indices
+
+
+_consecutive_return_inverse_false = boolean_dispatch(
+    arg_name='return_counts',
+    arg_index=1,
+    default=False,
+    if_true=_consecutive_return_counts,
+    if_false=_consecutive_return_output,
+    module_name=__name__,
+    func_name='unique_consecutive')
+
+_consecutive_return_inverse_true = boolean_dispatch(
+    arg_name='return_counts',
+    arg_index=1,
+    default=False,
+    if_true=_unique_consecutive_impl,
+    if_false=_consecutive_return_inverse,
+    module_name=__name__,
+    func_name='unique_consecutive')
+
+# The return type of unique_consecutive depends on `return_inverse` and `return_counts`, so in order to
+# resolve the output type in TorchScript we need to statically know the value of both parameters.
+
+unique_consecutive = boolean_dispatch(
+    arg_name='return_inverse',
+    arg_index=2,
+    default=False,
+    if_true=_consecutive_return_inverse_true,
+    if_false=_consecutive_return_inverse_false,
+    module_name=__name__,
+    func_name='unique_consecutive')
+unique_consecutive.__doc__ = _unique_consecutive_impl.__doc__
+
+if TYPE_CHECKING:
+    pass
+    # There's no good way to use this type annotation without breaking JIT
+    # overloads. So leave untyped for mypy for now.
+else:
+    @overload
+    def tensordot(a, b, dims: int = 2, out: Optional[torch.Tensor] = None):
+        pass
+
+    @overload  # noqa: F811
+    def tensordot(a, b, dims: Tuple[List[int], List[int]], out: Optional[torch.Tensor] = None):  # noqa: F811
+        pass
+
+    @overload  # noqa: F811
+    def tensordot(a, b, dims: List[List[int]], out: Optional[torch.Tensor] = None):  # noqa: F811
+        pass
+
+    @overload  # noqa: F811
+    def tensordot(a, b, dims: torch.Tensor, out: Optional[torch.Tensor] = None):  # noqa: F811
+        pass
+
+
+def tensordot(a, b, dims=2, out: Optional[torch.Tensor] = None):  # noqa: F811
+    r"""Returns a contraction of a and b over multiple dimensions.
+
+    :attr:`tensordot` implements a generalized matrix product.
+
+    Args:
+      a (Tensor): Left tensor to contract
+      b (Tensor): Right tensor to contract
+      dims (int or Tuple[List[int], List[int]] or List[List[int]] containing two lists or Tensor): number of dimensions to
+         contract or explicit lists of dimensions for :attr:`a` and
+         :attr:`b` respectively
+
+    When called with a non-negative integer argument :attr:`dims` = :math:`d`, and
+    the number of dimensions of :attr:`a` and :attr:`b` is :math:`m` and :math:`n`,
+    respectively, :func:`~torch.tensordot` computes
+
+    .. math::
+        r_{i_0,...,i_{m-d}, i_d,...,i_n}
+          = \sum_{k_0,...,k_{d-1}} a_{i_0,...,i_{m-d},k_0,...,k_{d-1}} \times b_{k_0,...,k_{d-1}, i_d,...,i_n}.
+
+    When called with :attr:`dims` of the list form, the given dimensions will be contracted
+    in place of the last :math:`d` of :attr:`a` and the first :math:`d` of :math:`b`. The sizes
+    in these dimensions must match, but :func:`~torch.tensordot` will deal with broadcasted
+    dimensions.
+
+    Examples::
+
+        >>> a = torch.arange(60.).reshape(3, 4, 5)
+        >>> b = torch.arange(24.).reshape(4, 3, 2)
+        >>> torch.tensordot(a, b, dims=([1, 0], [0, 1]))
+        tensor([[4400., 4730.],
+                [4532., 4874.],
+                [4664., 5018.],
+                [4796., 5162.],
+                [4928., 5306.]])
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
+        >>> a = torch.randn(3, 4, 5, device='cuda')
+        >>> b = torch.randn(4, 5, 6, device='cuda')
+        >>> torch.tensordot(a, b, dims=2).cpu()
+        tensor([[ 8.3504, -2.5436,  6.2922,  2.7556, -1.0732,  3.2741],
+                [ 3.3161,  0.0704,  5.0187, -0.4079, -4.3126,  4.8744],
+                [ 0.8223,  3.9445,  3.2168, -0.2400,  3.4117,  1.7780]])
+
+        >>> a = torch.randn(3, 5, 4, 6)
+        >>> b = torch.randn(6, 4, 5, 3)
+        >>> torch.tensordot(a, b, dims=([2, 1, 3], [1, 2, 0]))
+        tensor([[  7.7193,  -2.4867, -10.3204],
+                [  1.5513, -14.4737,  -6.5113],
+                [ -0.2850,   4.2573,  -3.5997]])
+    """
+    if has_torch_function_variadic(a, b):
+        return handle_torch_function(tensordot, (a, b), a, b, dims=dims, out=out)
+
+    if not isinstance(dims, (tuple, list, torch.Tensor, int, torch.SymInt)):
+        raise RuntimeError("tensordot expects dims to be int or "
+                           + "Tuple[List[int], List[int]] or "
+                           + "List[List[int]] containing two lists, but got "
+                           + f"dims={dims}")
+
+    dims_a: List[int] = []
+    dims_b: List[int] = []
+
+    if isinstance(dims, (tuple, list)):
+        dims_a, dims_b = dims
+
+    if isinstance(dims, torch.Tensor):
+        num_elements = dims.numel()
+        if num_elements > 1:
+            assert dims.size()[0] == 2
+            dims_a = torch.jit.annotate(List[int], dims[0].tolist())
+            dims_b = torch.jit.annotate(List[int], dims[1].tolist())
+        else:
+            dims_val = int(dims.item())
+            if dims_val < 0:
+                raise RuntimeError(f"tensordot expects dims >= 0, but got dims={dims}")
+            dims_a = list(range(-dims_val, 0))
+            dims_b = list(range(dims_val))
+
+    if isinstance(dims, (int, torch.SymInt)):
+        if dims < 0:
+            raise RuntimeError(f"tensordot expects dims >= 0, but got dims={dims}")
+        if dims > min(a.dim(), b.dim()):
+            raise RuntimeError(f"tensordot expects dims < ndim_a or ndim_b, but got dims={dims}")
+        dims_a = list(range(-dims, 0))
+        dims_b = list(range(dims))
+
+    if out is None:
+        return _VF.tensordot(a, b, dims_a, dims_b)  # type: ignore[attr-defined]
+    else:
+        return _VF.tensordot(a, b, dims_a, dims_b, out=out)  # type: ignore[attr-defined]
+
+
+def cartesian_prod(*tensors: Tensor) -> Tensor:
+    """Do cartesian product of the given sequence of tensors. The behavior is similar to
+    python's `itertools.product`.
+
+    Args:
+        *tensors: any number of 1 dimensional tensors.
+
+    Returns:
+        Tensor: A tensor equivalent to converting all the input tensors into lists,
+        doing `itertools.product` on these lists, and finally converting the resulting list
+        into a tensor.
+
+    Example::
+
+        >>> import itertools
+        >>> a = [1, 2, 3]
+        >>> b = [4, 5]
+        >>> list(itertools.product(a, b))
+        [(1, 4), (1, 5), (2, 4), (2, 5), (3, 4), (3, 5)]
+        >>> tensor_a = torch.tensor(a)
+        >>> tensor_b = torch.tensor(b)
+        >>> torch.cartesian_prod(tensor_a, tensor_b)
+        tensor([[1, 4],
+                [1, 5],
+                [2, 4],
+                [2, 5],
+                [3, 4],
+                [3, 5]])
+    """
+    # This wrapper exists to support variadic args.
+    if has_torch_function(tensors):
+        return handle_torch_function(cartesian_prod, tensors, *tensors)
+    return _VF.cartesian_prod(tensors)  # type: ignore[attr-defined]
+
+
+def block_diag(*tensors):
+    """Create a block diagonal matrix from provided tensors.
+
+    Args:
+        *tensors: One or more tensors with 0, 1, or 2 dimensions.
+
+    Returns:
+        Tensor: A 2 dimensional tensor with all the input tensors arranged in
+        order such that their upper left and lower right corners are
+        diagonally adjacent. All other elements are set to 0.
+
+    Example::
+
+        >>> import torch
+        >>> A = torch.tensor([[0, 1], [1, 0]])
+        >>> B = torch.tensor([[3, 4, 5], [6, 7, 8]])
+        >>> C = torch.tensor(7)
+        >>> D = torch.tensor([1, 2, 3])
+        >>> E = torch.tensor([[4], [5], [6]])
+        >>> torch.block_diag(A, B, C, D, E)
+        tensor([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
+                [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                [0, 0, 3, 4, 5, 0, 0, 0, 0, 0],
+                [0, 0, 6, 7, 8, 0, 0, 0, 0, 0],
+                [0, 0, 0, 0, 0, 7, 0, 0, 0, 0],
+                [0, 0, 0, 0, 0, 0, 1, 2, 3, 0],
+                [0, 0, 0, 0, 0, 0, 0, 0, 0, 4],
+                [0, 0, 0, 0, 0, 0, 0, 0, 0, 5],
+                [0, 0, 0, 0, 0, 0, 0, 0, 0, 6]])
+    """
+    # This wrapper exists to support variadic args.
+    if has_torch_function(tensors):
+        return handle_torch_function(block_diag, tensors, *tensors)
+    return torch._C._VariableFunctions.block_diag(tensors)  # type: ignore[attr-defined]
+
+
+def cdist(x1, x2, p=2., compute_mode='use_mm_for_euclid_dist_if_necessary'):
+    # type: (Tensor, Tensor, float, str) -> (Tensor)
+    r"""Computes batched the p-norm distance between each pair of the two collections of row vectors.
+
+    Args:
+        x1 (Tensor): input tensor of shape :math:`B \times P \times M`.
+        x2 (Tensor): input tensor of shape :math:`B \times R \times M`.
+        p: p value for the p-norm distance to calculate between each vector pair
+            :math:`\in [0, \infty]`.
+        compute_mode:
+            'use_mm_for_euclid_dist_if_necessary' - will use the matrix multiplication approach to calculate
+            the Euclidean distance (p = 2) if P > 25 or R > 25
+            'use_mm_for_euclid_dist' - will always use the matrix multiplication approach to calculate
+            the Euclidean distance (p = 2)
+            'donot_use_mm_for_euclid_dist' - will never use the matrix multiplication approach to calculate
+            the Euclidean distance (p = 2)
+            Default: 'use_mm_for_euclid_dist_if_necessary'.
+
+    If x1 has shape :math:`B \times P \times M` and x2 has shape :math:`B \times R \times M` then the
+    output will have shape :math:`B \times P \times R`.
+
+    This function is equivalent to `scipy.spatial.distance.cdist(input, 'minkowski', p=p)`
+    if :math:`p \in (0, \infty)`. When :math:`p = 0` it is equivalent to
+    `scipy.spatial.distance.cdist(input, 'hamming') * M`. When :math:`p = \infty`, the closest
+    scipy function is `scipy.spatial.distance.cdist(xn, lambda x, y: np.abs(x - y).max())`.
+
+    Example:
+
+        >>> a = torch.tensor([[0.9041,  0.0196], [-0.3108, -2.4423], [-0.4821,  1.059]])
+        >>> a
+        tensor([[ 0.9041,  0.0196],
+                [-0.3108, -2.4423],
+                [-0.4821,  1.0590]])
+        >>> b = torch.tensor([[-2.1763, -0.4713], [-0.6986,  1.3702]])
+        >>> b
+        tensor([[-2.1763, -0.4713],
+                [-0.6986,  1.3702]])
+        >>> torch.cdist(a, b, p=2)
+        tensor([[3.1193, 2.0959],
+                [2.7138, 3.8322],
+                [2.2830, 0.3791]])
+    """
+    if has_torch_function_variadic(x1, x2):
+        return handle_torch_function(
+            cdist, (x1, x2), x1, x2, p=p, compute_mode=compute_mode)
+    if compute_mode == 'use_mm_for_euclid_dist_if_necessary':
+        return _VF.cdist(x1, x2, p, None)  # type: ignore[attr-defined]
+    elif compute_mode == 'use_mm_for_euclid_dist':
+        return _VF.cdist(x1, x2, p, 1)  # type: ignore[attr-defined]
+    elif compute_mode == 'donot_use_mm_for_euclid_dist':
+        return _VF.cdist(x1, x2, p, 2)  # type: ignore[attr-defined]
+    else:
+        raise ValueError(f"{compute_mode} is not a valid value for compute_mode")
+
+
+def atleast_1d(*tensors):
+    r"""
+    Returns a 1-dimensional view of each input tensor with zero dimensions.
+    Input tensors with one or more dimensions are returned as-is.
+
+    Args:
+        input (Tensor or list of Tensors)
+
+    Returns:
+        output (Tensor or tuple of Tensors)
+
+    Example::
+
+        >>> x = torch.arange(2)
+        >>> x
+        tensor([0, 1])
+        >>> torch.atleast_1d(x)
+        tensor([0, 1])
+        >>> x = torch.tensor(1.)
+        >>> x
+        tensor(1.)
+        >>> torch.atleast_1d(x)
+        tensor([1.])
+        >>> x = torch.tensor(0.5)
+        >>> y = torch.tensor(1.)
+        >>> torch.atleast_1d((x, y))
+        (tensor([0.5000]), tensor([1.]))
+    """
+    # This wrapper exists to support variadic args.
+    if has_torch_function(tensors):
+        return handle_torch_function(atleast_1d, tensors, *tensors)
+    if len(tensors) == 1:
+        tensors = tensors[0]
+    return _VF.atleast_1d(tensors)  # type: ignore[attr-defined]
+
+
+def atleast_2d(*tensors):
+    r"""
+    Returns a 2-dimensional view of each input tensor with zero dimensions.
+    Input tensors with two or more dimensions are returned as-is.
+
+    Args:
+        input (Tensor or list of Tensors)
+
+    Returns:
+        output (Tensor or tuple of Tensors)
+
+    Example::
+
+        >>> x = torch.tensor(1.)
+        >>> x
+        tensor(1.)
+        >>> torch.atleast_2d(x)
+        tensor([[1.]])
+        >>> x = torch.arange(4).view(2, 2)
+        >>> x
+        tensor([[0, 1],
+                [2, 3]])
+        >>> torch.atleast_2d(x)
+        tensor([[0, 1],
+                [2, 3]])
+        >>> x = torch.tensor(0.5)
+        >>> y = torch.tensor(1.)
+        >>> torch.atleast_2d((x, y))
+        (tensor([[0.5000]]), tensor([[1.]]))
+    """
+    # This wrapper exists to support variadic args.
+    if has_torch_function(tensors):
+        return handle_torch_function(atleast_2d, tensors, *tensors)
+    if len(tensors) == 1:
+        tensors = tensors[0]
+    return _VF.atleast_2d(tensors)  # type: ignore[attr-defined]
+
+
+def atleast_3d(*tensors):
+    r"""
+    Returns a 3-dimensional view of each input tensor with zero dimensions.
+    Input tensors with three or more dimensions are returned as-is.
+
+    Args:
+        input (Tensor or list of Tensors)
+
+    Returns:
+        output (Tensor or tuple of Tensors)
+
+    Example:
+
+        >>> x = torch.tensor(0.5)
+        >>> x
+        tensor(0.5000)
+        >>> torch.atleast_3d(x)
+        tensor([[[0.5000]]])
+        >>> y = torch.arange(4).view(2, 2)
+        >>> y
+        tensor([[0, 1],
+                [2, 3]])
+        >>> torch.atleast_3d(y)
+        tensor([[[0],
+                 [1]],
+                
+                [[2],
+                 [3]]])
+        >>> x = torch.tensor(1).view(1, 1, 1)
+        >>> x
+        tensor([[[1]]])
+        >>> torch.atleast_3d(x)
+        tensor([[[1]]])
+        >>> x = torch.tensor(0.5)
+        >>> y = torch.tensor(1.)
+        >>> torch.atleast_3d((x, y))
+        (tensor([[[0.5000]]]), tensor([[[1.]]]))
+    """
+    # This wrapper exists to support variadic args.
+    if has_torch_function(tensors):
+        return handle_torch_function(atleast_3d, tensors, *tensors)
+    if len(tensors) == 1:
+        tensors = tensors[0]
+    return _VF.atleast_3d(tensors)  # type: ignore[attr-defined]
+
+
+if TYPE_CHECKING:
+    pass
+    # There's no good way to use this type annotation; cannot rename norm() to
+    # _norm_impl() in a way that doesn't break JIT overloads. So leave untyped
+    # for mypy for now.
+    #    def norm(input: Tensor,
+    #             p: Optional[Union[str, Number]] = "fro",
+    #             dim: Optional[Union[int, List[int]]] = None,
+    #             keepdim: bool = False,
+    #             out: Optional[Tensor] = None,
+    #             dtype: _dtype = None) -> Tensor:
+    #        return _norm_impl(input, p, dim, keepdim, out, dtype)
+else:
+    # TODO: type dim as BroadcastingList when
+    # https://github.com/pytorch/pytorch/issues/33782 is fixed
+    @overload
+    def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):
+        # type: (Tensor, str, Optional[List[int]], bool, Optional[Tensor], Optional[int]) -> Tensor
+        pass
+
+    @overload  # noqa: F811
+    def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):  # noqa: F811
+        # type: (Tensor, Optional[number], Optional[List[int]], bool, Optional[Tensor], Optional[int]) -> Tensor
+        pass
+
+    @overload  # noqa: F811
+    def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):  # noqa: F811
+        # type: (Tensor, Optional[number], Optional[int], bool, Optional[Tensor], Optional[int]) -> Tensor
+        pass
+
+    @overload  # noqa: F811
+    def norm(input, p="fro", dim=None, keepdim=False, out=None, dtype=None):  # noqa: F811
+        # type: (Tensor, str, Optional[int], bool, Optional[Tensor], Optional[int]) -> Tensor
+        pass
+
+
+def norm(input, p: Optional[Union[float, str]] = "fro", dim=None, keepdim=False, out=None, dtype=None):  # noqa: F811
+    r"""Returns the matrix norm or vector norm of a given tensor.
+
+    .. warning::
+
+        torch.norm is deprecated and may be removed in a future PyTorch release.
+        Its documentation and behavior may be incorrect, and it is no longer
+        actively maintained.
+
+        Use :func:`torch.linalg.vector_norm` when computing vector norms and
+        :func:`torch.linalg.matrix_norm` when computing matrix norms.
+        For a function with behavior similar to this one's, see :func:`torch.linalg.norm`.
+        Note, however, that the signature for these functions is slightly different from the
+        signature of ``torch.norm``.
+
+    Args:
+        input (Tensor): The input tensor. Its data type must be either a floating
+            point or complex type. For complex inputs, the norm is calculated using the
+            absolute value of each element. If the input is complex and neither
+            :attr:`dtype` nor :attr:`out` is specified, the result's data type will
+            be the corresponding floating point type (e.g. float if :attr:`input` is
+            complexfloat).
+
+        p (int, float, inf, -inf, 'fro', 'nuc', optional): the order of norm. Default: ``'fro'``
+            The following norms can be calculated:
+
+            ======  ==============  ==========================
+            ord     matrix norm     vector norm
+            ======  ==============  ==========================
+            'fro'   Frobenius norm  --
+            'nuc'   nuclear norm    --
+            Number  --              sum(abs(x)**ord)**(1./ord)
+            ======  ==============  ==========================
+
+            The vector norm can be calculated across any number of dimensions.
+            The corresponding dimensions of :attr:`input` are flattened into
+            one dimension, and the norm is calculated on the flattened
+            dimension.
+
+            Frobenius norm produces the same result as ``p=2`` in all cases
+            except when :attr:`dim` is a list of three or more dims, in which
+            case Frobenius norm throws an error.
+
+            Nuclear norm can only be calculated across exactly two dimensions.
+
+        dim (int, tuple of ints, list of ints, optional):
+            Specifies which dimension or dimensions of :attr:`input` to
+            calculate the norm across. If :attr:`dim` is ``None``, the norm will
+            be calculated across all dimensions of :attr:`input`. If the norm
+            type indicated by :attr:`p` does not support the specified number of
+            dimensions, an error will occur.
+        keepdim (bool, optional): whether the output tensors have :attr:`dim`
+            retained or not. Ignored if :attr:`dim` = ``None`` and
+            :attr:`out` = ``None``. Default: ``False``
+        out (Tensor, optional): the output tensor. Ignored if
+            :attr:`dim` = ``None`` and :attr:`out` = ``None``.
+        dtype (:class:`torch.dtype`, optional): the desired data type of
+            returned tensor. If specified, the input tensor is cast to
+            :attr:`dtype` while performing the operation. Default: None.
+
+    .. note::
+        Even though ``p='fro'`` supports any number of dimensions, the true
+        mathematical definition of Frobenius norm only applies to tensors with
+        exactly two dimensions. :func:`torch.linalg.matrix_norm` with ``ord='fro'``
+        aligns with the mathematical definition, since it can only be applied across
+        exactly two dimensions.
+
+    Example::
+
+        >>> import torch
+        >>> a = torch.arange(9, dtype= torch.float) - 4
+        >>> b = a.reshape((3, 3))
+        >>> torch.norm(a)
+        tensor(7.7460)
+        >>> torch.norm(b)
+        tensor(7.7460)
+        >>> torch.norm(a, float('inf'))
+        tensor(4.)
+        >>> torch.norm(b, float('inf'))
+        tensor(4.)
+        >>> c = torch.tensor([[ 1, 2, 3], [-1, 1, 4]] , dtype=torch.float)
+        >>> torch.norm(c, dim=0)
+        tensor([1.4142, 2.2361, 5.0000])
+        >>> torch.norm(c, dim=1)
+        tensor([3.7417, 4.2426])
+        >>> torch.norm(c, p=1, dim=1)
+        tensor([6., 6.])
+        >>> d = torch.arange(8, dtype=torch.float).reshape(2, 2, 2)
+        >>> torch.norm(d, dim=(1, 2))
+        tensor([ 3.7417, 11.2250])
+        >>> torch.norm(d[0, :, :]), torch.norm(d[1, :, :])
+        (tensor(3.7417), tensor(11.2250))
+    """
+
+    if has_torch_function_unary(input):
+        return handle_torch_function(
+            norm, (input,), input, p=p, dim=dim, keepdim=keepdim, out=out, dtype=dtype)
+
+    # NB. All the repeated code and weird python is to please TorchScript.
+    #     For a more compact implementation see the relevant function in `_refs/__init__.py`
+
+    # We don't do this for MPS or sparse tensors
+    if input.layout == torch.strided and input.device.type in \
+            ("cpu", "cuda", "meta", torch.utils.backend_registration._privateuse1_backend_name):
+        if dim is not None:
+            if isinstance(dim, (int, torch.SymInt)):
+                _dim = [dim]
+            else:
+                _dim = dim
+        else:
+            _dim = None  # type: ignore[assignment]
+
+        if isinstance(p, str):
+            if p == "fro" and (dim is None or isinstance(dim, (int, torch.SymInt)) or len(dim) <= 2):
+                if out is None:
+                    return torch.linalg.vector_norm(input, 2, _dim, keepdim, dtype=dtype)
+                else:
+                    return torch.linalg.vector_norm(input, 2, _dim, keepdim, dtype=dtype, out=out)
+
+            # Here we either call the nuclear norm, or we call matrix_norm with some arguments
+            # that will throw an error
+            if _dim is None:
+                _dim = list(range(input.ndim))
+            if out is None:
+                return torch.linalg.matrix_norm(input, p, _dim, keepdim, dtype=dtype)
+            else:
+                return torch.linalg.matrix_norm(input, p, _dim, keepdim, dtype=dtype, out=out)
+        else:
+            # NB. p should be Union[str, number], not Optional!
+            _p = 2.0 if p is None else p
+            if out is None:
+                return torch.linalg.vector_norm(input, _p, _dim, keepdim, dtype=dtype)
+            else:
+                return torch.linalg.vector_norm(input, _p, _dim, keepdim, dtype=dtype, out=out)
+
+    ndim = input.dim()
+
+    # catch default case
+    if dim is None and out is None and dtype is None and p is not None:
+        if isinstance(p, str):
+            if p == "fro":
+                return _VF.frobenius_norm(input, dim=(), keepdim=keepdim)
+        if not isinstance(p, str):
+            _dim = [i for i in range(ndim)]  # noqa: C416 TODO: rewrite as list(range(m))
+            return _VF.norm(input, p, dim=_dim, keepdim=keepdim)  # type: ignore[attr-defined]
+
+    # TODO: when https://github.com/pytorch/pytorch/issues/33782 is fixed
+    # remove the overloads where dim is an int and replace with BroadcastingList1
+    # and remove next four lines, replace _dim with dim
+    if dim is not None:
+        if isinstance(dim, (int, torch.SymInt)):
+            _dim = [dim]
+        else:
+            _dim = dim
+    else:
+        _dim = None  # type: ignore[assignment]
+
+    if isinstance(p, str):
+        if p == "fro":
+            if dtype is not None:
+                raise ValueError("dtype argument is not supported in frobenius norm")
+
+            if _dim is None:
+                _dim = list(range(ndim))
+            if out is None:
+                return _VF.frobenius_norm(input, _dim, keepdim=keepdim)  # type: ignore[arg-type]
+            else:
+                return _VF.frobenius_norm(input, _dim, keepdim=keepdim, out=out)  # type: ignore[arg-type]
+        elif p == "nuc":
+            if dtype is not None:
+                raise ValueError("dtype argument is not supported in nuclear norm")
+            if _dim is None:
+                if out is None:
+                    return _VF.nuclear_norm(input, keepdim=keepdim)  # type: ignore[arg-type]
+                else:
+                    return _VF.nuclear_norm(input, keepdim=keepdim, out=out)  # type: ignore[arg-type]
+            else:
+                if out is None:
+                    return _VF.nuclear_norm(input, _dim, keepdim=keepdim)  # type: ignore[arg-type]
+                else:
+                    return _VF.nuclear_norm(input, _dim, keepdim=keepdim, out=out)  # type: ignore[arg-type]
+        raise RuntimeError(f"only valid string values are 'fro' and 'nuc', found {p}")
+    else:
+        if _dim is None:
+            _dim = list(range(ndim))
+
+        if out is None:
+            if dtype is None:
+                return _VF.norm(input, p, _dim, keepdim=keepdim)  # type: ignore[attr-defined]
+            else:
+                return _VF.norm(input, p, _dim, keepdim=keepdim, dtype=dtype)  # type: ignore[attr-defined]
+        else:
+            if dtype is None:
+                return _VF.norm(input, p, _dim, keepdim=keepdim, out=out)  # type: ignore[attr-defined]
+            else:
+                return _VF.norm(input, p, _dim, keepdim=keepdim, dtype=dtype, out=out)  # type: ignore[attr-defined]
+
+
+
+def unravel_index(indices: Tensor, shape: Union[int, Sequence[int], torch.Size]) -> Tuple[Tensor, ...]:
+    r"""Converts a tensor of flat indices into a tuple of coordinate tensors that
+    index into an arbitrary tensor of the specified shape.
+
+    Args:
+        indices (Tensor): An integer tensor containing indices into the
+            flattened version of an arbitrary tensor of shape :attr:`shape`.
+            All elements must be in the range ``[0, prod(shape) - 1]``.
+
+        shape (int, sequence of ints, or torch.Size): The shape of the arbitrary
+            tensor. All elements must be non-negative.
+
+    Returns:
+        tuple of Tensors: Each ``i``-th tensor in the output corresponds with
+        dimension ``i`` of :attr:`shape`. Each tensor has the same shape as
+        ``indices`` and contains one index into dimension ``i`` for each of the
+        flat indices given by ``indices``.
+
+    Example::
+
+        >>> import torch
+        >>> torch.unravel_index(torch.tensor(4), (3, 2))
+        (tensor(2),
+         tensor(0))
+
+        >>> torch.unravel_index(torch.tensor([4, 1]), (3, 2))
+        (tensor([2, 0]),
+         tensor([0, 1]))
+
+        >>> torch.unravel_index(torch.tensor([0, 1, 2, 3, 4, 5]), (3, 2))
+        (tensor([0, 0, 1, 1, 2, 2]),
+         tensor([0, 1, 0, 1, 0, 1]))
+
+        >>> torch.unravel_index(torch.tensor([1234, 5678]), (10, 10, 10, 10))
+        (tensor([1, 5]),
+         tensor([2, 6]),
+         tensor([3, 7]),
+         tensor([4, 8]))
+
+        >>> torch.unravel_index(torch.tensor([[1234], [5678]]), (10, 10, 10, 10))
+        (tensor([[1], [5]]),
+         tensor([[2], [6]]),
+         tensor([[3], [7]]),
+         tensor([[4], [8]]))
+
+        >>> torch.unravel_index(torch.tensor([[1234], [5678]]), (100, 100))
+        (tensor([[12], [56]]),
+         tensor([[34], [78]]))
+    """
+    if has_torch_function_unary(indices):
+        return handle_torch_function(
+            unravel_index, (indices,), indices, shape=shape)
+    res_tensor = _unravel_index(indices, shape)
+    return res_tensor.unbind(-1)
+
+def _unravel_index(indices: Tensor, shape: Union[int, Sequence[int]]) -> Tensor:
+    torch._check_type(
+        not indices.is_complex() and not indices.is_floating_point() and not indices.dtype == torch.bool,
+        lambda: f"expected 'indices' to be integer dtype, but got {indices.dtype}")
+
+    torch._check_type(
+        isinstance(shape, (int, torch.SymInt, Sequence)),
+        lambda: f"expected 'shape' to be int or sequence of ints, but got {type(shape)}")
+
+    if isinstance(shape, (int, torch.SymInt)):
+        shape = torch.Size([shape])
+    else:
+        for dim in shape:
+            torch._check_type(
+                isinstance(dim, (int, torch.SymInt)),
+                lambda: f"expected 'shape' sequence to only contain ints, but got {type(dim)}")
+        shape = torch.Size(shape)
+
+    torch._check_value(
+        all(dim >= 0 for dim in shape),
+        lambda: f"'shape' cannot have negative values, but got {tuple(shape)}")
+
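+    # Row-major "strides" of `shape`: coefs[i] == prod(shape[i+1:]), built as a
+    # reversed cumulative product so that (index // coefs[i]) % shape[i]
+    # recovers the coordinate along dimension i.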
+    coefs = list(reversed(list(itertools.accumulate(reversed(shape[1:] + torch.Size([1])), func=operator.mul))))
+    return indices.unsqueeze(-1).floor_divide(
+        torch.tensor(coefs, device=indices.device, dtype=torch.int64)
+    ) % torch.tensor(shape, device=indices.device, dtype=torch.int64)
+
+def chain_matmul(*matrices, out=None):
+    r"""Returns the matrix product of the :math:`N` 2-D tensors. This product is efficiently computed
+    using the matrix chain order algorithm which selects the order that incurs the lowest cost in terms
+    of arithmetic operations (`[CLRS]`_). Note that since this is a function to compute the product, :math:`N`
+    needs to be greater than or equal to 2; if equal to 2 then a trivial matrix-matrix product is returned.
+    If :math:`N` is 1, then this is a no-op - the original matrix is returned as is.
+
+    .. warning::
+
+        :func:`torch.chain_matmul` is deprecated and will be removed in a future PyTorch release.
+        Use :func:`torch.linalg.multi_dot` instead, which accepts a list of two or more tensors
+        rather than multiple arguments.
+
+    Args:
+        matrices (Tensors...): a sequence of 2 or more 2-D tensors whose product is to be determined.
+        out (Tensor, optional): the output tensor. Ignored if :attr:`out` = ``None``.
+
+    Returns:
+        Tensor: if the :math:`i^{th}` tensor was of dimensions :math:`p_{i} \times p_{i + 1}`, then the product
+        would be of dimensions :math:`p_{1} \times p_{N + 1}`.
+
+    Example::
+
+        >>> # xdoctest: +SKIP
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> a = torch.randn(3, 4)
+        >>> b = torch.randn(4, 5)
+        >>> c = torch.randn(5, 6)
+        >>> d = torch.randn(6, 7)
+        >>> # will raise a deprecation warning
+        >>> torch.chain_matmul(a, b, c, d)
+        tensor([[ -2.3375,  -3.9790,  -4.1119,  -6.6577,   9.5609, -11.5095,  -3.2614],
+                [ 21.4038,   3.3378,  -8.4982,  -5.2457, -10.2561,  -2.4684,   2.7163],
+                [ -0.9647,  -5.8917,  -2.3213,  -5.2284,  12.8615, -12.2816,  -2.5095]])
+
+    .. _`[CLRS]`: https://mitpress.mit.edu/books/introduction-algorithms-third-edition
+    """
+    # This wrapper exists to support variadic args.
+    if has_torch_function(matrices):
+        return handle_torch_function(chain_matmul, matrices, *matrices)
+
+    if out is None:
+        return _VF.chain_matmul(matrices)  # type: ignore[attr-defined]
+    else:
+        return _VF.chain_matmul(matrices, out=out)  # type: ignore[attr-defined]
+
+
+def _lu_impl(A, pivot=True, get_infos=False, out=None):
+    # type: (Tensor, bool, bool, Any) -> Tuple[Tensor, Tensor, Tensor]
+    r"""Computes the LU factorization of a matrix or batches of matrices
+    :attr:`A`. Returns a tuple containing the LU factorization and
+    pivots of :attr:`A`.  Pivoting is done if :attr:`pivot` is set to
+    ``True``.
+
+    .. warning::
+
+        :func:`torch.lu` is deprecated in favor of :func:`torch.linalg.lu_factor`
+        and :func:`torch.linalg.lu_factor_ex`. :func:`torch.lu` will be removed in a
+        future PyTorch release.
+        ``LU, pivots, info = torch.lu(A, compute_pivots)`` should be replaced with
+
+        .. code:: python
+
+            LU, pivots = torch.linalg.lu_factor(A, compute_pivots)
+
+        ``LU, pivots, info = torch.lu(A, compute_pivots, get_infos=True)`` should be replaced with
+
+        .. code:: python
+
+            LU, pivots, info = torch.linalg.lu_factor_ex(A, compute_pivots)
+
+    .. note::
+        * The returned permutation matrix for every matrix in the batch is
+          represented by a 1-indexed vector of size ``min(A.shape[-2], A.shape[-1])``.
+          ``pivots[i] == j`` represents that in the ``i``-th step of the algorithm,
+          the ``i``-th row was permuted with the ``j-1``-th row.
+        * LU factorization with :attr:`pivot` = ``False`` is not available
+          for CPU, and attempting to do so will throw an error. However,
+          LU factorization with :attr:`pivot` = ``False`` is available for
+          CUDA.
+        * This function does not check if the factorization was successful
+          or not if :attr:`get_infos` is ``True`` since the status of the
+          factorization is present in the third element of the return tuple.
+        * In the case of batches of square matrices with size less or equal
+          to 32 on a CUDA device, the LU factorization is repeated for
+          singular matrices due to the bug in the MAGMA library
+          (see magma issue 13).
+        * ``L``, ``U``, and ``P`` can be derived using :func:`torch.lu_unpack`.
+
+    .. warning::
+        The gradients of this function will only be finite when :attr:`A` is full rank.
+        This is because the LU decomposition is only differentiable at full rank matrices.
+        Furthermore, if :attr:`A` is close to not being full rank,
+        the gradient will be numerically unstable as it depends on the computation of :math:`L^{-1}` and :math:`U^{-1}`.
+
+    Args:
+        A (Tensor): the tensor to factor of size :math:`(*, m, n)`
+        pivot (bool, optional): controls whether pivoting is done. Default: ``True``
+        get_infos (bool, optional): if set to ``True``, returns an info IntTensor.
+                                    Default: ``False``
+        out (tuple, optional): optional output tuple. If :attr:`get_infos` is ``True``,
+                               then the elements in the tuple are Tensor, IntTensor,
+                               and IntTensor. If :attr:`get_infos` is ``False``, then the
+                               elements in the tuple are Tensor, IntTensor. Default: ``None``
+
+    Returns:
+        (Tensor, IntTensor, IntTensor (optional)): A tuple of tensors containing
+
+            - **factorization** (*Tensor*): the factorization of size :math:`(*, m, n)`
+
+            - **pivots** (*IntTensor*): the pivots of size :math:`(*, \text{min}(m, n))`.
+              ``pivots`` stores all the intermediate transpositions of rows.
+              The final permutation ``perm`` could be reconstructed by
+              applying ``swap(perm[i], perm[pivots[i] - 1])`` for ``i = 0, ..., pivots.size(-1) - 1``,
+              where ``perm`` is initially the identity permutation of :math:`m` elements
+              (essentially this is what :func:`torch.lu_unpack` is doing).
+
+            - **infos** (*IntTensor*, *optional*): if :attr:`get_infos` is ``True``, this is a tensor of
+              size :math:`(*)` where non-zero values indicate whether factorization for the matrix or
+              each minibatch has succeeded or failed
+
+    Example::
+
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_LAPACK)
+        >>> # xdoctest: +IGNORE_WANT("non-deterministic")
+        >>> A = torch.randn(2, 3, 3)
+        >>> A_LU, pivots = torch.lu(A)
+        >>> A_LU
+        tensor([[[ 1.3506,  2.5558, -0.0816],
+                 [ 0.1684,  1.1551,  0.1940],
+                 [ 0.1193,  0.6189, -0.5497]],
+
+                [[ 0.4526,  1.2526, -0.3285],
+                 [-0.7988,  0.7175, -0.9701],
+                 [ 0.2634, -0.9255, -0.3459]]])
+        >>> pivots
+        tensor([[ 3,  3,  3],
+                [ 3,  3,  3]], dtype=torch.int32)
+        >>> A_LU, pivots, info = torch.lu(A, get_infos=True)
+        >>> if info.nonzero().size(0) == 0:
+        ...     print('LU factorization succeeded for all samples!')
+        LU factorization succeeded for all samples!
+    """
+    # If get_infos is True, then we don't need to check for errors and vice versa
+    return torch._lu_with_info(A, pivot=pivot, check_errors=(not get_infos))
+
+if TYPE_CHECKING:
+    _ListOrSeq = Sequence[Tensor]
+else:
+    _ListOrSeq = List[Tensor]
+
+
+def _check_list_size(out_len: int, get_infos: bool, out: _ListOrSeq) -> None:
+    get_infos_int = 1 if get_infos else 0
+    if out_len - get_infos_int != 2:
+        raise TypeError(f"expected tuple of {2 + int(get_infos)} elements but got {out_len}")
+    if not isinstance(out, (tuple, list)):
+        raise TypeError(f"argument 'out' must be tuple of Tensors, not {type(out).__name__}")
+
+
+def _lu_with_infos(A, pivot=True, get_infos=False, out=None):
+    # type: (Tensor, bool, bool, Optional[Tuple[Tensor, Tensor, Tensor]]) -> Tuple[Tensor, Tensor, Tensor]
+    if has_torch_function_unary(A):
+        return handle_torch_function(
+            lu, (A,), A, pivot=pivot, get_infos=get_infos, out=out)
+    result = _lu_impl(A, pivot, get_infos, out)
+    if out is not None:
+        _check_list_size(len(out), get_infos, out)
+        for i in range(len(out)):
+            out[i].resize_as_(result[i]).copy_(result[i])
+        return out
+    else:
+        return result  # A_LU, pivots, infos
+
+
+def _lu_no_infos(A, pivot=True, get_infos=False, out=None):
+    # type: (Tensor, bool, bool, Optional[Tuple[Tensor, Tensor]]) -> Tuple[Tensor, Tensor]
+    # need to check for torch_function here so that we exit early if a
+    # __torch_function__ override handles the call
+    if has_torch_function_unary(A):
+        return handle_torch_function(
+            lu, (A,), A, pivot=pivot, get_infos=get_infos, out=out)
+    result = _lu_impl(A, pivot, get_infos, out)
+    if out is not None:
+        _check_list_size(len(out), get_infos, out)
+        for i in range(len(out)):
+            out[i].resize_as_(result[i]).copy_(result[i])
+        return out
+    else:
+        return result[0], result[1]  # A_LU, pivots
+
+# The return type of lu depends on `get_infos`, so in order to resolve the output type
+# of lu in TorchScript we need to statically know the value of `get_infos`
+lu = boolean_dispatch(
+    arg_name='get_infos',
+    arg_index=2,
+    default=False,
+    if_true=_lu_with_infos,
+    if_false=_lu_no_infos,
+    module_name=__name__,
+    func_name='lu')
+lu.__doc__ = _lu_impl.__doc__
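+
+# For reference: lu(A) resolves to _lu_no_infos and returns (LU, pivots), while
+# lu(A, get_infos=True) resolves to _lu_with_infos and returns (LU, pivots, infos).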
+
+
+def align_tensors(*tensors):
+    raise RuntimeError('`align_tensors` not yet implemented.')
diff --git a/MLPY/Lib/site-packages/torch/futures/__init__.py b/MLPY/Lib/site-packages/torch/futures/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a98a32e3da8cbebfa6489c1da74b559b4daab126
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/futures/__init__.py
@@ -0,0 +1,318 @@
+from __future__ import annotations
+
+from typing import cast, Callable, Generic, List, Optional, Type, TypeVar, Union
+
+import torch
+
+__all__ = ['Future', 'collect_all', 'wait_all']
+
+T = TypeVar("T")
+S = TypeVar("S")
+
+
+class _PyFutureMeta(type(torch._C.Future), type(Generic)):  # type: ignore[misc, no-redef]
+    pass
+
+
+class Future(torch._C.Future, Generic[T], metaclass=_PyFutureMeta):
+    r"""
+    Wrapper around a ``torch._C.Future`` which encapsulates an asynchronous
+    execution of a callable, e.g. :meth:`~torch.distributed.rpc.rpc_async`. It
+    also exposes a set of APIs to add callback functions and set results.
+
+    .. warning:: GPU support is a beta feature, subject to changes.
+    """
+
+    def __init__(self, *, devices: Optional[List[Union[int, str, torch.device]]] = None):
+        r"""
+        Create an empty unset ``Future``. If the future is intended to hold
+        values containing CUDA tensors, (a superset of) their CUDA devices must
+        be specified at construction. (This is only supported if
+        ``torch.cuda.is_available()`` returns ``True``). This is needed to
+        ensure proper CUDA stream synchronization. The child futures, returned
+        by the ``then`` method, will inherit these devices.
+
+        Args:
+            devices(``List[Union[int, str, torch.device]]``, optional): the set
+                of devices on which tensors contained in this future's value are
+                allowed to reside and on which callbacks are allowed to operate.
+        """
+        if devices is None:
+            devices = []
+        super().__init__([torch.device(d) for d in devices])
+
+    def done(self) -> bool:
+        r"""
+        Return ``True`` if this ``Future`` is done. A ``Future`` is done if it
+        has a result or an exception.
+
+        If the value contains tensors that reside on GPUs, ``Future.done()``
+        will return ``True`` even if the asynchronous kernels that are
+        populating those tensors haven't yet completed running on the device,
+        because at that stage the result is already usable, provided one
+        performs the appropriate synchronizations (see :meth:`wait`).
+        """
+        return super().done()
+
+    def wait(self) -> T:
+        r"""
+        Block until the value of this ``Future`` is ready.
+
+        If the value contains tensors that reside on GPUs, then an additional
+        synchronization is performed with the kernels (executing on the device)
+        which may be asynchronously populating those tensors. Such sync is
+        non-blocking, which means that ``wait()`` will insert the necessary
+        instructions in the current streams to ensure that further operations
+        enqueued on those streams will be properly scheduled after the async
+        kernels but, once that is done, ``wait()`` will return, even if those
+        kernels are still running. No further synchronization is required when
+        accessing and using the values, as long as one doesn't change streams.
+
+        Returns:
+            The value held by this ``Future``. If the function (callback or RPC)
+            creating the value has thrown an error, this ``wait`` method will
+            also throw an error.
+        """
+        return super().wait()
+
+    def value(self) -> T:
+        r"""
+        Obtain the value of an already-completed future.
+
+        This method should only be called after a call to :meth:`wait` has
+        completed, or inside a callback function passed to :meth:`then`. In
+        other cases this ``Future`` may not yet hold a value and calling
+        ``value()`` could fail.
+
+        If the value contains tensors that reside on GPUs, then this method will
+        *not* perform any additional synchronization. This should be done
+        beforehand, separately, through a call to :meth:`wait` (except within
+        callbacks, for which it's already being taken care of by :meth:`then`).
+
+        Returns:
+            The value held by this ``Future``. If the function (callback or RPC)
+            creating the value has thrown an error, this ``value()`` method will
+            also throw an error.
+        """
+        return super().value()
+
+    def then(self, callback: Callable[[Future[T]], S]) -> Future[S]:
+        r"""
+        Append the given callback function to this ``Future``, which will be run
+        when the ``Future`` is completed.  Multiple callbacks can be added to
+        the same ``Future``, but the order in which they will be executed cannot
+        be guaranteed (to enforce a certain order consider chaining:
+        ``fut.then(cb1).then(cb2)``). The callback must take one argument, which
+        is the reference to this ``Future``. The callback function can use the
+        :meth:`value` method to get the value. Note that if this ``Future`` is
+        already completed, the given callback will be run immediately inline.
+
+        If the ``Future``'s value contains tensors that reside on GPUs, the
+        callback might be invoked while the async kernels that are populating
+        those tensors haven't yet finished executing on the device. However, the
+        callback will be invoked with some dedicated streams set as current
+        (fetched from a global pool) which will be synchronized with those
+        kernels. Hence any operation performed by the callback on these tensors
+        will be scheduled on the device after the kernels complete. In other
+        words, as long as the callback doesn't switch streams, it can safely
+        manipulate the result without any additional synchronization. This is
+        similar to the non-blocking behavior of :meth:`wait`.
+
+        Similarly, if the callback returns a value that contains tensors that
+        reside on a GPU, it can do so even if the kernels that are producing
+        these tensors are still running on the device, as long as the callback
+        didn't change streams during its execution. If one wants to change
+        streams, one must be careful to re-synchronize them with the original
+        streams, that is, those that were current when the callback was invoked.
+
+        Args:
+            callback(``Callable``): a ``Callable`` that takes this ``Future`` as
+                                    the only argument.
+
+        Returns:
+            A new ``Future`` object that holds the return value of the
+            ``callback`` and will be marked as completed when the given
+            ``callback`` finishes.
+
+        .. note:: Note that if the callback function throws, either
+            through the original future being completed with an exception and
+            calling ``fut.wait()``, or through other code in the callback, the
+            future returned by ``then`` will be marked appropriately with the
+            encountered error. However, if this callback later completes
+            additional futures, those futures are not marked as completed with
+            an error and the user is responsible for handling completion/waiting
+            on those futures independently.
+
+        Example::
+            >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_FUTURES)
+            >>> def callback(fut):
+            ...     print(f"RPC return value is {fut.wait()}.")
+            >>> fut = torch.futures.Future()
+            >>> # The inserted callback will print the return value when
+            >>> # receiving the response from "worker1"
+            >>> cb_fut = fut.then(callback)
+            >>> chain_cb_fut = cb_fut.then(
+            ...     lambda x : print(f"Chained cb done. {x.wait()}")
+            ... )
+            >>> fut.set_result(5)
+            RPC return value is 5.
+            Chained cb done. None
+        """
+        return cast(Future[S], super().then(callback))
+
+    def add_done_callback(self, callback: Callable[[Future[T]], None]) -> None:
+        r"""
+        Append the given callback function to this ``Future``, which will be run
+        when the ``Future`` is completed.  Multiple callbacks can be added to
+        the same ``Future``, but the order in which they will be executed cannot
+        be guaranteed. The callback must take one argument, which is the
+        reference to this ``Future``. The callback function can use the
+        :meth:`value` method to get the value. Note that if this ``Future`` is
+        already completed, the given callback will be run inline.
+
+        We recommend that you use the :meth:`then` method as it provides a way
+        to synchronize after your callback has completed. ``add_done_callback``
+        can be cheaper if your callback does not return anything. But both
+        :meth:`then` and ``add_done_callback`` use the same callback
+        registration API under the hood.
+
+        With respect to GPU tensors, this method behaves in the same way as
+        :meth:`then`.
+
+        Args:
+            callback(``Callable``): a ``Callable`` that takes in one argument,
+                which is the reference to this ``Future``.
+
+        .. note:: Note that if the callback function throws, either
+            through the original future being completed with an exception and
+            calling ``fut.wait()``, or through other code in the callback,
+            error handling must be carefully taken care of. For example, if
+            this callback later completes additional futures, those futures are
+            not marked as completed with an error and the user is responsible
+            for handling completion/waiting on those futures independently.
+
+        Example::
+            >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_FUTURES)
+            >>> def callback(fut):
+            ...     print("This will run after the future has finished.")
+            ...     print(fut.wait())
+            >>> fut = torch.futures.Future()
+            >>> fut.add_done_callback(callback)
+            >>> fut.set_result(5)
+            This will run after the future has finished.
+            5
+        """
+        super().add_done_callback(callback)
+
+    def set_result(self, result: T) -> None:
+        r"""
+        Set the result for this ``Future``, which will mark this ``Future`` as
+        completed and trigger all attached callbacks. Note that a ``Future``
+        cannot be marked completed twice.
+
+        If the result contains tensors that reside on GPUs, this method can be
+        called even if the asynchronous kernels that are populating those
+        tensors haven't yet completed running on the device, provided that the
+        streams on which those kernels were enqueued are set as the current ones
+        when this method is called. Put simply, it's safe to call this method
+        immediately after launching those kernels, without any additional
+        synchronization, as long as one doesn't change streams in between. This
+        method will record events on all the relevant current streams and will
+        use them to ensure proper scheduling for all the consumers of this
+        ``Future``.
+
+        Args:
+            result (object): the result object of this ``Future``.
+
+        Example::
+            >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_FUTURES)
+            >>> import threading
+            >>> import time
+            >>> def slow_set_future(fut, value):
+            ...     time.sleep(0.5)
+            ...     fut.set_result(value)
+            >>> fut = torch.futures.Future()
+            >>> t = threading.Thread(
+            ...     target=slow_set_future,
+            ...     args=(fut, torch.ones(2) * 3)
+            ... )
+            >>> t.start()
+            >>> print(fut.wait())
+            tensor([3., 3.])
+            >>> t.join()
+        """
+        super().set_result(result)
+
+    def set_exception(self, result: T) -> None:
+        r"""
+        Set an exception for this ``Future``, which will mark this ``Future`` as
+        completed with an error and trigger all attached callbacks. Note that
+        when calling wait()/value() on this ``Future``, the exception set here
+        will be raised inline.
+
+        Args:
+            result (BaseException): the exception for this ``Future``.
+
+        Example::
+            >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_FUTURES)
+            >>> fut = torch.futures.Future()
+            >>> fut.set_exception(ValueError("foo"))
+            >>> fut.wait()
+            Traceback (most recent call last):
+            ...
+            ValueError: foo
+        """
+        assert isinstance(result, Exception), f"{result} is of type {type(result)}, not an Exception."
+
+        def raise_error(fut_result):
+            raise fut_result
+
+        super()._set_unwrap_func(raise_error)
+        self.set_result(result)  # type: ignore[arg-type]
+
+
+def collect_all(futures: List[Future]) -> Future[List[Future]]:
+    r"""
+    Collects the provided :class:`~torch.futures.Future` objects into a single
+    combined :class:`~torch.futures.Future` that is completed when all of the
+    sub-futures are completed.
+
+    Args:
+        futures (list): a list of :class:`~torch.futures.Future` objects.
+
+    Returns:
+        Returns a :class:`~torch.futures.Future` object to a list of the passed
+        in Futures.
+
+    Example::
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_FUTURES)
+        >>> fut0 = torch.futures.Future()
+        >>> fut1 = torch.futures.Future()
+        >>> fut = torch.futures.collect_all([fut0, fut1])
+        >>> fut0.set_result(0)
+        >>> fut1.set_result(1)
+        >>> fut_list = fut.wait()
+        >>> print(f"fut0 result = {fut_list[0].wait()}")
+        fut0 result = 0
+        >>> print(f"fut1 result = {fut_list[1].wait()}")
+        fut1 result = 1
+    """
+    return cast(Future[List[Future]], torch._C._collect_all(cast(List[torch._C.Future], futures)))
+
+
+def wait_all(futures: List[Future]) -> List:
+    r"""
+    Waits for all provided futures to be complete, and returns
+    the list of completed values. If any of the futures encounters an error,
+    the method will exit early and report the error, without waiting for the
+    other futures to complete.
+
+    Args:
+        futures (list): a list of :class:`~torch.futures.Future` objects.
+
+    Returns:
+        A list of the completed :class:`~torch.futures.Future` results. This
+        method will throw an error if ``wait`` on any
+        :class:`~torch.futures.Future` throws.
+    """
+    return [fut.wait() for fut in torch._C._collect_all(cast(List[torch._C.Future], futures)).wait()]
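+
+
+# A minimal sketch of ``wait_all`` semantics (the futures are assumed to be
+# completed elsewhere, e.g. by another thread or an RPC; values are illustrative):
+#
+#     fut0, fut1 = torch.futures.Future(), torch.futures.Future()
+#     fut0.set_result(1)
+#     fut1.set_result(2)
+#     assert torch.futures.wait_all([fut0, fut1]) == [1, 2]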
diff --git a/MLPY/Lib/site-packages/torch/futures/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/futures/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b62fa75e53d60a3e12f9f0d8f83d1b053ba2781a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/futures/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/__init__.py b/MLPY/Lib/site-packages/torch/fx/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9096227068557199f9c17ba0ee4c4b9bef985502
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/__init__.py
@@ -0,0 +1,89 @@
+r'''
+FX is a toolkit for developers to use to transform ``nn.Module``
+instances. FX consists of three main components: a **symbolic tracer,**
+an **intermediate representation**, and **Python code generation**. A
+demonstration of these components in action:
+
+::
+
+    import torch
+    # Simple module for demonstration
+    class MyModule(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.param = torch.nn.Parameter(torch.rand(3, 4))
+            self.linear = torch.nn.Linear(4, 5)
+
+        def forward(self, x):
+            return self.linear(x + self.param).clamp(min=0.0, max=1.0)
+
+    module = MyModule()
+
+    from torch.fx import symbolic_trace
+    # Symbolic tracing frontend - captures the semantics of the module
+    symbolic_traced : torch.fx.GraphModule = symbolic_trace(module)
+
+    # High-level intermediate representation (IR) - Graph representation
+    print(symbolic_traced.graph)
+    """
+    graph():
+        %x : [num_users=1] = placeholder[target=x]
+        %param : [num_users=1] = get_attr[target=param]
+        %add : [num_users=1] = call_function[target=operator.add](args = (%x, %param), kwargs = {})
+        %linear : [num_users=1] = call_module[target=linear](args = (%add,), kwargs = {})
+        %clamp : [num_users=1] = call_method[target=clamp](args = (%linear,), kwargs = {min: 0.0, max: 1.0})
+        return clamp
+    """
+
+    # Code generation - valid Python code
+    print(symbolic_traced.code)
+    """
+    def forward(self, x):
+        param = self.param
+        add = x + param;  x = param = None
+        linear = self.linear(add);  add = None
+        clamp = linear.clamp(min = 0.0, max = 1.0);  linear = None
+        return clamp
+    """
+
+The **symbolic tracer** performs "symbolic execution" of the Python
+code. It feeds fake values, called Proxies, through the code. Operations
+on these Proxies are recorded. More information about symbolic tracing
+can be found in the :func:`symbolic_trace` and :class:`Tracer`
+documentation.
+
+The **intermediate representation** is the container for the operations
+that were recorded during symbolic tracing. It consists of a list of
+Nodes that represent function inputs, callsites (to functions, methods,
+or :class:`torch.nn.Module` instances), and return values. More information
+about the IR can be found in the documentation for :class:`Graph`. The
+IR is the format on which transformations are applied.
+
+**Python code generation** is what makes FX a Python-to-Python (or
+Module-to-Module) transformation toolkit. For each Graph IR, we can
+create valid Python code matching the Graph's semantics. This
+functionality is wrapped up in :class:`GraphModule`, which is a
+:class:`torch.nn.Module` instance that holds a :class:`Graph` as well as a
+``forward`` method generated from the Graph.
+
+Taken together, this pipeline of components (symbolic tracing ->
+intermediate representation -> transforms -> Python code generation)
+constitutes the Python-to-Python transformation pipeline of FX. In
+addition, these components can be used separately. For example,
+symbolic tracing can be used in isolation to capture a form of
+the code for analysis (and not transformation) purposes. Code
+generation can be used for programmatically generating models, for
+example from a config file. There are many uses for FX!
+
+Several example transformations can be found in the ``examples`` repository.
+'''
+
+from .graph_module import GraphModule
+from ._symbolic_trace import symbolic_trace, Tracer, wrap, PH, ProxyableClassMeta
+from .graph import Graph, CodeGen
+from .node import Node, map_arg, has_side_effect
+from .proxy import Proxy
+from .interpreter import Interpreter as Interpreter, Transformer as Transformer
+from .subgraph_rewriter import replace_pattern
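+
+# A minimal sketch of the trace -> transform -> code generation pipeline that the
+# module docstring describes (the add-to-mul swap is an arbitrary illustrative
+# rewrite, not an FX API):
+#
+#     import torch
+#
+#     def swap_add_for_mul(m: torch.nn.Module) -> GraphModule:
+#         gm = symbolic_trace(m)
+#         for node in gm.graph.nodes:
+#             if node.op == "call_function" and node.target is torch.add:
+#                 node.target = torch.mul
+#         gm.graph.lint()
+#         gm.recompile()
+#         return gm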
diff --git a/MLPY/Lib/site-packages/torch/fx/__init__.pyi b/MLPY/Lib/site-packages/torch/fx/__init__.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..cd49d6e4a7685a71bcec8e67060c30addcc92bc4
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/__init__.pyi
@@ -0,0 +1,11 @@
+from ._symbolic_trace import (
+    symbolic_trace as symbolic_trace,
+    Tracer as Tracer,
+    wrap as wrap,
+)
+from .graph import Graph as Graph
+from .graph_module import GraphModule as GraphModule
+from .interpreter import Interpreter as Interpreter, Transformer as Transformer
+from .node import has_side_effect as has_side_effect, map_arg as map_arg, Node as Node
+from .proxy import Proxy as Proxy
+from .subgraph_rewriter import replace_pattern as replace_pattern
diff --git a/MLPY/Lib/site-packages/torch/fx/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..49d940e2232f97efd65d55b6d59192d8e1023b80
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/__pycache__/_compatibility.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/__pycache__/_compatibility.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e3b3cd3839ba7da50e9129a06d95f9d5c8193ac3
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/__pycache__/_compatibility.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/__pycache__/_lazy_graph_module.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/__pycache__/_lazy_graph_module.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..114bb63f35686b613da45aa816110d0c8ebb5659
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/__pycache__/_lazy_graph_module.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/__pycache__/_pytree.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/__pycache__/_pytree.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ad4c5c949940de3682468746270c66de1d003d75
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/__pycache__/_pytree.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/__pycache__/_symbolic_trace.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/__pycache__/_symbolic_trace.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1142b16df1c5a1f26d898b3c333b33b35c7a2079
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/__pycache__/_symbolic_trace.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/__pycache__/annotate.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/__pycache__/annotate.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3740f8f884cda76c3f47cf743cdb4a7448b406d5
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/__pycache__/annotate.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/__pycache__/config.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/__pycache__/config.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a8549dbd7a2ea60b9576ddbd0abf86dab0c04269
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/__pycache__/config.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/__pycache__/graph.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/__pycache__/graph.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0701a607a36730db64d8adf33a102249033a6cab
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/__pycache__/graph.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/__pycache__/graph_module.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/__pycache__/graph_module.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..de89122380d284cd8e310c5b73e19ae90e22e0bc
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/__pycache__/graph_module.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/__pycache__/immutable_collections.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/__pycache__/immutable_collections.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..60b54a852dd35247f9c08775943ede3db6951bb3
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/__pycache__/immutable_collections.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/__pycache__/interpreter.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/__pycache__/interpreter.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9892a39be0090ac82b7c7a06c5921baea4449a80
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/__pycache__/interpreter.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/__pycache__/node.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/__pycache__/node.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4302a97a559db61c67a0d5646c12b739def8fbf2
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/__pycache__/node.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/__pycache__/operator_schemas.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/__pycache__/operator_schemas.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..42ac3abe7902d5994c817e024fa11ffe64873ba7
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/__pycache__/operator_schemas.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/__pycache__/proxy.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/__pycache__/proxy.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ae595d6f20ff2a6d490a16c502d7280a771141fc
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/__pycache__/proxy.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/__pycache__/subgraph_rewriter.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/__pycache__/subgraph_rewriter.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..86be2a47283d3a1fe2e361feeb6ad798e1755945
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/__pycache__/subgraph_rewriter.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/__pycache__/tensor_type.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/__pycache__/tensor_type.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..883fc307bb589978227137d488e4e5628622a727
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/__pycache__/tensor_type.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/__pycache__/traceback.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/__pycache__/traceback.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6f79497b9c1c2ff2189bac4be20d158d6d30c554
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/__pycache__/traceback.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/_compatibility.py b/MLPY/Lib/site-packages/torch/fx/_compatibility.py
new file mode 100644
index 0000000000000000000000000000000000000000..24b3da3cbe981d01bb4cd9777320185a367e4c13
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/_compatibility.py
@@ -0,0 +1,34 @@
+from typing import Any, Dict
+import textwrap
+
+_BACK_COMPAT_OBJECTS : Dict[Any, None] = {}
+_MARKED_WITH_COMPATIBILITY : Dict[Any, None] = {}
+
+def compatibility(is_backward_compatible : bool):
+    if is_backward_compatible:
+
+        def mark_back_compat(fn):
+            docstring = textwrap.dedent(getattr(fn, '__doc__', None) or '')
+            docstring += """
+.. note::
+    Backwards-compatibility for this API is guaranteed.
+"""
+            fn.__doc__ = docstring
+            _BACK_COMPAT_OBJECTS.setdefault(fn)
+            _MARKED_WITH_COMPATIBILITY.setdefault(fn)
+            return fn
+
+        return mark_back_compat
+    else:
+
+        def mark_not_back_compat(fn):
+            docstring = textwrap.dedent(getattr(fn, '__doc__', None) or '')
+            docstring += """
+.. warning::
+    This API is experimental and is *NOT* backward-compatible.
+"""
+            fn.__doc__ = docstring
+            _MARKED_WITH_COMPATIBILITY.setdefault(fn)
+            return fn
+
+        return mark_not_back_compat
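+
+
+# A minimal sketch of how the decorator is used (``my_pass`` is a hypothetical
+# example): it appends a backward-compatibility note to the docstring and
+# registers the object in the module-level dicts above.
+#
+#     @compatibility(is_backward_compatible=True)
+#     def my_pass(graph):
+#         """Run a hypothetical pass over an FX graph."""
+#         return graph
+#
+#     assert my_pass in _BACK_COMPAT_OBJECTS
+#     assert "Backwards-compatibility for this API is guaranteed." in my_pass.__doc__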
diff --git a/MLPY/Lib/site-packages/torch/fx/_lazy_graph_module.py b/MLPY/Lib/site-packages/torch/fx/_lazy_graph_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..4bcaf61263d40a4646a7c0b1a92dd6104610e9d5
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/_lazy_graph_module.py
@@ -0,0 +1,182 @@
+from contextlib import contextmanager
+
+from torch.fx import GraphModule
+from torch.fx.graph_module import (
+    _format_import_block,
+    reduce_graph_module,
+    reduce_package_graph_module,
+)
+from torch.package import PackageExporter, sys_importer
+from ._compatibility import compatibility
+
+_use_lazy_graph_module_flag = False
+_force_skip_lazy_graph_module_flag = False
+
+
+@compatibility(is_backward_compatible=False)
+@contextmanager
+def _force_skip_lazy_graph_module():
+    """
+    Skip using the lazy graph module regardless of the setting of _use_lazy_graph_module.
+    Use this to skip _LazyGraphModule when testing the inductor torchscript-related backend.
+
+    Calling torch.jit.script on a _LazyGraphModule results in the following error:
+        https://gist.github.com/shunting314/5143654c8084aed84ecd19b818258a69
+    """
+    try:
+        global _force_skip_lazy_graph_module_flag
+        prior = _force_skip_lazy_graph_module_flag
+        _force_skip_lazy_graph_module_flag = True
+        yield
+    finally:
+        _force_skip_lazy_graph_module_flag = prior
+
+
+@compatibility(is_backward_compatible=False)
+@contextmanager
+def _use_lazy_graph_module(should_use: bool):
+    try:
+        global _use_lazy_graph_module_flag
+        prior = _use_lazy_graph_module_flag
+        _use_lazy_graph_module_flag = (
+            should_use and not _force_skip_lazy_graph_module_flag
+        )
+        yield
+    finally:
+        _use_lazy_graph_module_flag = prior
+
+
+@compatibility(is_backward_compatible=False)
+def _get_graph_module_cls():
+    return _LazyGraphModule if _use_lazy_graph_module_flag else GraphModule
+
+
+def _make_graph_module(*args, graph_module_cls=None, **kwargs):
+    if graph_module_cls is None:
+        graph_module_cls = _get_graph_module_cls()
+
+    return graph_module_cls(*args, **kwargs)
+
+
+@compatibility(is_backward_compatible=False)
+class _LazyGraphModule(GraphModule):
+    """
+    The main difference between _LazyGraphModule and GraphModule is how recompile happens.
+    GraphModule will do a 'recompile' call to generate python code and the forward method when it's
+    constructed. Later on, if the graph gets updated, the recompile method can be called again to refresh
+    the saved python code and forward method.
+
+    However, in some cases, especially in inductor, the recompilation can be a waste since we never
+    check the python code for the graph module or call its forward method. A few more concrete
+    examples regarding pattern matching fx passes in inductor:
+    1. some passes will update the graph to be compiled and then call recompile on the GraphModule.
+    2. some passes will trace a small pattern function to search for it in the graph being compiled and
+       replace the match with the traced graph of a replacement function. The pattern graph and
+       replacement graph are quite small, but there are a large number of them. Doing GraphModule.recompile
+       for them in GraphModule.__init__ is also a waste of time.
+
+    However, simply skipping the GraphModule.recompile call in these scenarios is also dangerous.
+    People may want to check the python code or call the GraphModule's forward method for debugging purposes.
+
+    The way _LazyGraphModule solves this is by overriding the recompile method to just mark the
+    need for recompilation without doing the actual recompilation. Later on, if people really
+    access the compiled python code or call the GraphModule's forward method, we do the real
+    recompilation.
+    """
+
+    @classmethod
+    def from_graphmodule(cls, gm: GraphModule):
+        if isinstance(gm, _LazyGraphModule):
+            return gm
+        else:
+            return _LazyGraphModule(gm, gm.graph)
+
+    @staticmethod
+    def force_recompile(gm):
+        """
+        Sometimes we need to force a recompile as a workaround:
+        - we want to do the real recompilation before symbolic_trace to avoid the error:
+            https://gist.github.com/shunting314/75549c2e82ae07ac1139c94a3583d259
+        """
+        if isinstance(gm, _LazyGraphModule):
+            gm.real_recompile()
+
+    def real_recompile(self):
+        if self._needs_recompile():
+            self._real_recompile()
+
+    @classmethod
+    def _needs_recompile(cls):
+        return cls.forward is cls._lazy_forward
+
+    def _lazy_forward(self, *args, **kwargs):
+        # Call self.real_recompile() rather than self._real_recompile() here.
+        # The _lazy_forward method may be saved and called repeatedly.
+        # Calling self.real_recompile can make sure we skip recompilation if
+        # we have already done so.
+        self.real_recompile()
+        assert not self._needs_recompile()
+
+        # call `__call__` rather than 'forward' since recompilation may
+        # install a wrapper for `__call__` to provide a customized error
+        # message.
+        return self(*args, **kwargs)
+
+    forward = _lazy_forward
+
+    # TODO: we should handle __reduce_deploy__ the same way as __reduce_package__,
+    # or __reduce__, by calling _real_recompile. But I haven't found a good way
+    # to test __reduce_deploy__. Also it's very unlikely that LazyGraphModule
+    # will be used in torch::deploy. So it's skipped for now.
+
+    def __reduce_package__(self, exporter: PackageExporter):
+        """
+        Follow GraphModule.__reduce__ but call 'self._real_recompile' rather
+        than 'self.recompile' since for a _LazyGraphModule, self.recompile just
+        marks the need for recompilation and does not return the PythonCode object.
+        """
+        python_code = self._real_recompile()
+        dict_without_graph = self.__dict__.copy()
+        dict_without_graph["_graphmodule_cls_name"] = self.__class__.__name__
+        del dict_without_graph["_graph"]
+
+        generated_module_name = f"fx-generated._{exporter.get_unique_id()}"
+        import_block = _format_import_block(python_code.globals, exporter.importer)
+        module_code = import_block + self.code
+        exporter.save_source_string(generated_module_name, module_code)
+        return (
+            reduce_package_graph_module,
+            (dict_without_graph, generated_module_name),
+        )
+
+    def __reduce__(self):
+        """
+        Follow GraphModule.__reduce__ but call 'self._real_recompile' rather
+        than 'self.recompile' since for a _LazyGraphModule, self.recompile just
+        marks the need for recompilation and does not return the PythonCode object.
+        """
+        python_code = self._real_recompile()
+        dict_without_graph = self.__dict__.copy()
+        import_block = _format_import_block(python_code.globals, sys_importer)
+        del dict_without_graph["_graph"]
+        return (reduce_graph_module, (dict_without_graph, import_block))
+
+    def _real_recompile(self):
+        return super().recompile()
+
+    @classmethod
+    def recompile(cls):
+        cls.forward = cls._lazy_forward
+
+    @property
+    def code(self) -> str:
+        self.real_recompile()
+        return super().code
+
+    def __str__(self) -> str:
+        """
+        str(GraphModule) will access the _code attribute. Make sure recompile
+        happens so _code attribute is available.
+        """
+        self.real_recompile()
+        return super().__str__()
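+
+
+# A minimal sketch of the lazy-recompile behavior described in the class
+# docstring (``M`` is a hypothetical module):
+#
+#     import torch
+#
+#     class M(torch.nn.Module):
+#         def forward(self, x):
+#             return x + 1
+#
+#     lazy_gm = _LazyGraphModule.from_graphmodule(torch.fx.symbolic_trace(M()))
+#     # recompile() on a _LazyGraphModule only marks the need for recompilation;
+#     # the real recompile happens lazily on first access:
+#     print(lazy_gm.code)
+#
+# Using ``_use_lazy_graph_module(True)`` as a context manager makes
+# ``_make_graph_module`` construct ``_LazyGraphModule`` instances instead of
+# plain ``GraphModule``s.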
diff --git a/MLPY/Lib/site-packages/torch/fx/_pytree.py b/MLPY/Lib/site-packages/torch/fx/_pytree.py
new file mode 100644
index 0000000000000000000000000000000000000000..510be5f33516bf7a2bd90f1f92ef18bc540a519b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/_pytree.py
@@ -0,0 +1,102 @@
+from collections import namedtuple
+from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple, Type
+
+import torch.return_types
+
+from torch.utils._pytree import PyTree, TreeSpec
+
+FlattenFuncSpec = Callable[[PyTree, TreeSpec], List]
+FlattenFuncExactMatchSpec = Callable[[PyTree, TreeSpec], bool]
+
+SUPPORTED_NODES: Dict[Type[Any], FlattenFuncSpec] = {}
+SUPPORTED_NODES_EXACT_MATCH: Dict[Type[Any], Optional[FlattenFuncExactMatchSpec]] = {}
+
+
+def register_pytree_flatten_spec(
+    cls: Type[Any],
+    flatten_fn_spec: FlattenFuncSpec,
+    flatten_fn_exact_match_spec: Optional[FlattenFuncExactMatchSpec] = None,
+) -> None:
+    SUPPORTED_NODES[cls] = flatten_fn_spec
+    SUPPORTED_NODES_EXACT_MATCH[cls] = flatten_fn_exact_match_spec
+
+
+def tree_flatten_spec(
+    pytree: PyTree,
+    spec: TreeSpec,
+    exact_structural_match=False,
+) -> List[Any]:
+    if spec.is_leaf():
+        return [pytree]
+    if spec.type not in SUPPORTED_NODES:
+        raise RuntimeError(
+            f"{type(pytree)} does not have a flatten_fn_spec associated with it. Please register one with "
+            "torch.fx._pytree.register_pytree_flatten_spec.  If you have serialized your model, make "
+            "sure that any custom pytrees have been registered before loading it.",
+        )
+    flatten_fn_spec = SUPPORTED_NODES[spec.type]
+    child_pytrees = flatten_fn_spec(pytree, spec)
+    if exact_structural_match:
+        flatten_fn_exact_match_spec = SUPPORTED_NODES_EXACT_MATCH[spec.type]
+        if flatten_fn_exact_match_spec and not flatten_fn_exact_match_spec(
+            pytree,
+            spec,
+        ):
+            raise RuntimeError(f"Cannot flatten pytree {pytree}, given spec: {spec}")
+    result = []
+    for child, child_spec in zip(child_pytrees, spec.children_specs):
+        flat = tree_flatten_spec(child, child_spec, exact_structural_match)
+        result += flat
+    return result
+
+
+def _dict_flatten_spec(d: Dict[Any, Any], spec: TreeSpec) -> List[Any]:
+    return [d[k] for k in spec.context]
+
+
+def _list_flatten_spec(d: List[Any], spec: TreeSpec) -> List[Any]:
+    return [d[i] for i in range(spec.num_children)]
+
+
+def _tuple_flatten_spec(d: Tuple[Any], spec: TreeSpec) -> List[Any]:
+    return [d[i] for i in range(spec.num_children)]
+
+
+def _namedtuple_flatten_spec(d: NamedTuple, spec: TreeSpec) -> List[Any]:
+    return [d[i] for i in range(spec.num_children)]
+
+
+def _dict_flatten_spec_exact_match(d: Dict[Any, Any], spec: TreeSpec) -> bool:
+    return len(d) == spec.num_children
+
+
+def _list_flatten_spec_exact_match(d: List[Any], spec: TreeSpec) -> bool:
+    return len(d) == spec.num_children
+
+
+def _tuple_flatten_spec_exact_match(d: Tuple[Any], spec: TreeSpec) -> bool:
+    return len(d) == spec.num_children
+
+
+def _namedtuple_flatten_spec_exact_match(d: NamedTuple, spec: TreeSpec) -> bool:
+    return len(d) == spec.num_children
+
+
+register_pytree_flatten_spec(dict, _dict_flatten_spec, _dict_flatten_spec_exact_match)
+register_pytree_flatten_spec(list, _list_flatten_spec, _list_flatten_spec_exact_match)
+register_pytree_flatten_spec(
+    tuple,
+    _tuple_flatten_spec,
+    _tuple_flatten_spec_exact_match,
+)
+for return_type in torch.return_types.all_return_types:
+    register_pytree_flatten_spec(
+        return_type,
+        _tuple_flatten_spec,
+        _tuple_flatten_spec_exact_match,
+    )
+register_pytree_flatten_spec(
+    namedtuple,  # type: ignore[arg-type]
+    _namedtuple_flatten_spec,
+    _namedtuple_flatten_spec_exact_match,
+)
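+
+
+# A minimal sketch of ``tree_flatten_spec`` (values are illustrative): the spec
+# is obtained from a reference pytree via ``torch.utils._pytree.tree_flatten``,
+# then a second pytree with the same structure is flattened in spec order.
+#
+#     from torch.utils._pytree import tree_flatten
+#
+#     _, spec = tree_flatten({"a": 0, "b": 0})
+#     assert tree_flatten_spec({"b": 2, "a": 1}, spec) == [1, 2]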
diff --git a/MLPY/Lib/site-packages/torch/fx/_symbolic_trace.py b/MLPY/Lib/site-packages/torch/fx/_symbolic_trace.py
new file mode 100644
index 0000000000000000000000000000000000000000..e08453e846a2355a9a6a1053aeec777829cd177d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/_symbolic_trace.py
@@ -0,0 +1,1202 @@
+import builtins
+import copy
+import functools
+import inspect
+import math
+import os
+import warnings
+import collections
+from itertools import chain
+from types import CodeType, FunctionType, ModuleType
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    List,
+    NamedTuple,
+    Optional,
+    Set,
+    Tuple,
+    Type,
+    Union,
+)
+
+import torch
+import torch.utils._pytree as pytree
+from torch._C import ScriptObject  # type: ignore[attr-defined]
+
+from ._compatibility import compatibility
+from .graph import _PyTreeCodeGen, _PyTreeInfo, Graph
+from .graph_module import GraphModule
+from ._lazy_graph_module import _make_graph_module
+from .node import Argument, base_types, map_aggregate
+from .proxy import ParameterProxy, Proxy, TracerBase, Scope, ScopeContextManager
+
+HAS_VARSTUFF = inspect.CO_VARARGS | inspect.CO_VARKEYWORDS
+
+# These need to run in global scope to handle nested calls correctly
+_orig_module_call: Callable = torch.nn.Module.__call__
+_orig_module_getattr: Callable = torch.nn.Module.__getattr__
+
+_proxyable_classes: Dict[Type, None] = {}
+
+_is_fx_tracing_flag = False
+
+
+def is_fx_tracing():
+    return _is_fx_tracing_flag
+
+@compatibility(is_backward_compatible=True)
+class ProxyableClassMeta(type):
+    """
+    ProxyableClassMeta allows you to make construction of a given Python class
+    symbolically traceable. For example::
+
+        import torch
+        import torch.fx
+
+        class TensorPair(metaclass=torch.fx.ProxyableClassMeta):
+            def __init__(self, left, right):
+                self.left, self.right = left, right
+
+            def add(self, other):
+                l = self.left + other.left
+                r = self.right + other.right
+                return TensorPair(l, r)
+
+            def mul(self, other):
+                l = self.left * other.left
+                r = self.right * other.right
+                return TensorPair(l, r)
+
+        def use_tensor_pair_ctor(x : TensorPair, y : torch.Tensor):
+            s = x.add(TensorPair(y, y))
+            return s.mul(x)
+
+        x = TensorPair(torch.randn(5, 3), torch.randn(5, 3))
+        y = torch.randn(5, 3)
+        ref_out = use_tensor_pair_ctor(x, y)
+
+        traced = torch.fx.symbolic_trace(use_tensor_pair_ctor)
+        print(traced.code)
+        '''
+        def forward(self, x : __main___TensorPair, y : torch.Tensor):
+            tensor_pair = __main___TensorPair(y, y);  y = None
+            add = x.add(tensor_pair);  tensor_pair = None
+            mul = add.mul(x);  add = x = None
+            return mul
+        '''
+
+    From this example, we can see that construction of a class (``TensorPair``)
+    defined with ``ProxyableClassMeta`` as metaclass can be recorded in symbolic
+    tracing.
+    """
+
+    def __init__(cls, name, bases, attrs):
+        _proxyable_classes.setdefault(cls)
+        super().__init__(name, bases, attrs)
+
+    def __call__(cls, *args, **kwargs):
+        instance = cls.__new__(cls)  # type: ignore[call-overload]
+
+        if not is_fx_tracing():
+            cls.__init__(instance, *args, **kwargs)  # type: ignore[misc]
+            return instance
+
+        found_proxies = []
+
+        def check_proxy(a):
+            if isinstance(a, Proxy):
+                found_proxies.append(a)
+
+        map_aggregate(args, check_proxy)
+        map_aggregate(kwargs, check_proxy)
+
+        if len(found_proxies) != 0:
+            tracer = found_proxies[0].tracer
+            return tracer.create_proxy("call_function", cls, args, kwargs)
+        else:
+            cls.__init__(instance, *args, **kwargs)  # type: ignore[misc]
+            return instance
+
+
+def _patch_function(fn: FunctionType, nargs: int) -> FunctionType:
+    co = fn.__code__
+    co_flags = co.co_flags & ~HAS_VARSTUFF
+    co_args: tuple
+    if hasattr(co, "co_qualname"):
+        # Python-3.11+ code signature
+        co_args = (
+            nargs,
+            0,
+            0,
+            co.co_nlocals,
+            co.co_stacksize,
+            co_flags,
+            co.co_code,
+            co.co_consts,
+            co.co_names,
+            co.co_varnames,
+            co.co_filename,
+            co.co_name,
+            co.co_qualname,  # type: ignore[attr-defined]
+            co.co_firstlineno,
+            co.co_lnotab,
+            co.co_exceptiontable,  # type: ignore[attr-defined]
+            co.co_freevars,
+            co.co_cellvars,
+        )
+    elif hasattr(co, "co_posonlyargcount"):
+        co_args = (
+            nargs,
+            0,
+            0,
+            co.co_nlocals,
+            co.co_stacksize,
+            co_flags,
+            co.co_code,
+            co.co_consts,
+            co.co_names,
+            co.co_varnames,
+            co.co_filename,
+            co.co_name,
+            co.co_firstlineno,
+            co.co_lnotab,
+            co.co_freevars,
+            co.co_cellvars,
+        )
+    else:
+        co_args = (
+            nargs,
+            0,
+            co.co_nlocals,
+            co.co_stacksize,
+            co_flags,
+            co.co_code,
+            co.co_consts,
+            co.co_names,
+            co.co_varnames,
+            co.co_filename,
+            co.co_name,
+            co.co_firstlineno,
+            co.co_lnotab,
+            co.co_freevars,
+            co.co_cellvars,
+        )
+    new_code = CodeType(*co_args)  # type: ignore[arg-type]
+    return FunctionType(
+        new_code, fn.__globals__, fn.__name__, fn.__defaults__, fn.__closure__
+    )
+
+    # we need to insert placeholder nodes for *args and **kwargs
+    # we can't call this function normally, otherwise it would try to unpack them
+    # instead, let's make python think that args and kwargs are normal variables
+
+
+@compatibility(is_backward_compatible=False)
+class PHBase:
+    """
+    Object representing an input placeholder to `concrete_args`
+    """
+
+    def __repr__(self):
+        return "PH"
+
+
+PH = PHBase()
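+
+# A minimal sketch of how ``PH`` is used with ``concrete_args`` (``f`` and its
+# dict keys are hypothetical examples): leaves set to ``PH`` remain symbolic
+# placeholders while the surrounding container structure is specialized away.
+#
+#     def f(d):
+#         return d["a"] + d["b"]
+#
+#     traced = torch.fx.symbolic_trace(f, concrete_args={"d": {"a": PH, "b": PH}})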
+
+
+@compatibility(is_backward_compatible=False)
+class PHWithMeta(PHBase):
+    """
+    Object representing an input placeholder to `concrete_args`
+    """
+    def __init__(self, ph_key: Optional[str] = None):
+        super().__init__()
+
+        # Provide a key for the user to identify the placeholder node during analysis
+        self.ph_key = ph_key
+
+
+def _transfer_attrs(fr, to):
+    for attr_name in dir(fr):
+        attr_val = getattr(fr, attr_name)
+        if (
+            not callable(attr_val)
+            and not attr_name.startswith("__")
+            and not hasattr(to, attr_name)
+        ):
+            setattr(to, attr_name, attr_val)
+
+
+@compatibility(is_backward_compatible=True)
+class Tracer(TracerBase):
+    # Reference: https://github.com/pytorch/pytorch/issues/54354
+    # The first line of this docstring overrides the one Sphinx generates for the
+    # documentation. We need it so that Sphinx doesn't leak `math`s path from the
+    # build environment (e.g. the repr of the `math` module, which embeds a
+    # local filesystem path).
+
+    """Tracer(autowrap_modules=(math,), autowrap_functions=())
+
+    ``Tracer`` is the class that implements the symbolic tracing functionality
+    of ``torch.fx.symbolic_trace``. A call to ``symbolic_trace(m)`` is
+    equivalent to ``Tracer().trace(m)``.
+    """
+
+    @compatibility(is_backward_compatible=True)
+    def __init__(
+        self,
+        autowrap_modules: Tuple[ModuleType] = (math,),
+        autowrap_functions: Tuple[Callable, ...] = (),
+        param_shapes_constant: bool = False,
+    ) -> None:
+        # This method's signature is overridden by the first line of this class'
+        # docstring. If this method's signature is modified, the signature that
+        # overrides it also should be modified accordingly.
+
+        """
+        Construct a Tracer object.
+
+        Args:
+
+            autowrap_modules (Tuple[ModuleType]): defaults to `(math, )`,
+                Python modules whose functions should be wrapped automatically
+                without needing to use fx.wrap(). Backward-compatibility for
+                this parameter is guaranteed.
+
+            autowrap_functions (Tuple[Callable, ...]): defaults to `()`,
+                Python functions that should be wrapped automatically without
+                needing to use fx.wrap(). Backward compatibility for this
+                parameter is guaranteed.
+
+            param_shapes_constant (bool): When this flag is set, calls to shape,
+                size and a few other shape-like attributes of a module's parameter
+                will be evaluated directly, rather than returning a new Proxy value
+                for an attribute access. Backward compatibility for this parameter
+                is guaranteed.
+        """
+
+        super().__init__()
+
+        # Functions we will eagerly wrap when we see them while tracing
+        # this captures both `math.sqrt()` and `from math import sqrt` automatically
+        self._autowrap_function_ids: Set[int] = {
+            id(value)
+            for name, value in chain(*[m.__dict__.items() for m in autowrap_modules])
+            if not name.startswith("_") and callable(value)
+        }
+        self._autowrap_function_ids.update({id(f) for f in autowrap_functions})
+
+        # Python modules to apply autowrap to at the start, in addition to
+        # modules we see while tracing
+        self._autowrap_search: List[ModuleType] = list(autowrap_modules)
+        self.param_shapes_constant = param_shapes_constant
+
+        self.submodule_paths: Optional[Dict[torch.nn.Module, str]] = None
+        self.root_module_name: str = ""
+        # Maps the containing module's name to the operator name
+        self.scope = Scope("", None)
+        # Records the module call stack
+        self.module_stack = collections.OrderedDict()
+        # Mapping of node name to module scope
+        self.node_name_to_scope: Dict[str, Tuple[str, type]] = {}
+
+    @compatibility(is_backward_compatible=True)
+    def create_arg(self, a: Any) -> "Argument":
+        """
+        A method to specify the behavior of tracing when preparing values to
+        be used as arguments to nodes in the ``Graph``.
+
+        By default, the behavior includes:
+
+        #. Iterate through collection types (e.g. tuple, list, dict) and recursively
+           call ``create_arg`` on the elements.
+        #. Given a Proxy object, return a reference to the underlying IR ``Node``
+        #. Given a non-Proxy Tensor object, emit IR for various cases:
+
+            * For a Parameter, emit a ``get_attr`` node referring to that Parameter
+            * For a non-Parameter Tensor, store the Tensor away in a special
+              attribute and emit a ``get_attr`` node referring to that attribute.
+
+        This method can be overridden to support more types.
+
+        Args:
+
+            a (Any): The value to be emitted as an ``Argument`` in the ``Graph``.
+
+
+        Returns:
+
+            The value ``a`` converted into the appropriate ``Argument``
+        """
+        # The base tracer is used to construct Graphs when there is no associated
+        # module hierarchy, so it can never create parameter references.
+        # The default tracer adds the ability to refer to parameters when
+        # tracing modules.
+        if isinstance(a, torch.nn.Parameter):
+            for n, p in self.root.named_parameters():
+                if a is p:
+                    return self.create_node("get_attr", n, (), {})
+            raise NameError("parameter is not a member of this module")
+        elif isinstance(a, torch.Tensor):
+            for n_, p_ in self.root.named_buffers():
+                if a is p_:
+                    return self.create_node("get_attr", n_, (), {})
+        elif isinstance(a, torch.nn.Module):
+            for n_, p_ in self.root.named_modules():
+                if a is p_:
+                    return self.create_node("get_attr", n_, (), {})
+        # For NamedTuple instances that appear literally as args, we emit
+        # a node to construct the NamedTuple and use that Node as the argument.
+        if isinstance(a, tuple) and hasattr(a, "_fields"):
+            args = tuple(self.create_arg(elem) for elem in a)
+            return self.create_node("call_function", a.__class__, args, {})
+
+        # Tensors do not have a reliable string repr() from which they can be
+        # constructed (and we probably don't want to rely on that, either), so
+        # for any constant Tensor values we encounter, first search for if they
+        # are an attribute of some module in the module hierarchy. If so, emit
+        # a get_attr to retrieve that tensor. Otherwise, we'll store away the
+        # tensor value into a special attribute on the Module s.t. we can
+        # retrieve it with a get_attr.
+        if isinstance(a, (torch.Tensor, ScriptObject)):
+            qualname: Optional[str] = self.tensor_attrs.get(a)
+
+            # Tensor was not found in the Module hierarchy, stow it away in a
+            # special attribute and set the qualname to refer to that
+            if not qualname:
+                i = 0
+                while True:
+                    qualname = f"_tensor_constant{i}"
+                    if not hasattr(self.root, qualname):
+                        break
+                    i += 1
+                self.tensor_attrs[a] = qualname
+                setattr(self.root, qualname, a)
+
+            return self.create_node("get_attr", qualname, (), {})
+
+        if type(a) in _proxyable_classes:
+            # This is an instance of a proxyable class for which we did not
+            # witness its construction. Intern this as a constant attribute
+
+            # TODO: binary search
+            i = 0
+            while True:
+                qualname = f"_{a.__class__.__name__}_constant_{i}"
+                if not hasattr(self.root, qualname):
+                    break
+                i += 1
+            setattr(self.root, qualname, a)
+
+            return self.create_node("get_attr", qualname, (), {})
+
+        return super().create_arg(a)
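+
+    # A minimal sketch of extending ``create_arg`` for an extra type in a
+    # subclass (``Point`` is a hypothetical container with ``x``/``y`` fields):
+    #
+    #     class MyTracer(Tracer):
+    #         def create_arg(self, a):
+    #             if isinstance(a, Point):
+    #                 args = (self.create_arg(a.x), self.create_arg(a.y))
+    #                 return self.create_node("call_function", Point, args, {})
+    #             return super().create_arg(a)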
+
+    @compatibility(is_backward_compatible=True)
+    def is_leaf_module(self, m: torch.nn.Module, module_qualified_name: str) -> bool:
+        """
+        A method to specify whether a given ``nn.Module`` is a "leaf" module.
+
+        Leaf modules are the atomic units that appear in
+        the IR, referenced by ``call_module`` calls. By default,
+        Modules in the PyTorch standard library namespace (torch.nn)
+        are leaf modules. All other modules are traced through and
+        their constituent ops are recorded, unless specified otherwise
+        via this parameter.
+
+        Args:
+
+            m (Module): The module being queried about
+            module_qualified_name (str): The qualified path of this module from the root. For example,
+                if you have a module hierarchy where submodule ``foo`` contains
+                submodule ``bar``, which contains submodule ``baz``, that module will
+                appear with the qualified name ``foo.bar.baz`` here.
+        """
+        return (
+            (m.__module__.startswith("torch.nn") or m.__module__.startswith("torch.ao.nn"))
+            and not isinstance(m, torch.nn.Sequential)
+        )
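+
+    # A minimal sketch of customizing leaf behavior by subclassing (the
+    # ``MySpecialBlock`` name is a hypothetical example):
+    #
+    #     class MyTracer(Tracer):
+    #         def is_leaf_module(self, m, module_qualified_name):
+    #             # keep MySpecialBlock opaque in the IR instead of tracing into it
+    #             return (super().is_leaf_module(m, module_qualified_name)
+    #                     or isinstance(m, MySpecialBlock))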
+
+    @compatibility(is_backward_compatible=True)
+    def path_of_module(self, mod: torch.nn.Module) -> str:
+        """
+        Helper method to find the qualified name of ``mod`` in the Module hierarchy
+        of ``root``. For example, if ``root`` has a submodule named ``foo``, which has
+        a submodule named ``bar``, passing ``bar`` into this function will return
+        the string "foo.bar".
+
+        Args:
+
+            mod (str): The ``Module`` to retrieve the qualified name for.
+        """
+        # Prefer the O(1) algorithm
+        if self.submodule_paths:
+            path = self.submodule_paths.get(mod)
+            if path is None:
+                raise NameError("module is not installed as a submodule")
+            assert isinstance(path, str)
+            return path
+        # O(N^2) fallback in the case that we didn't store the submodule
+        # paths.
+        else:
+            for n, p in self.root.named_modules():
+                if mod is p:
+                    return n
+            raise NameError("module is not installed as a submodule")
+
+    @compatibility(is_backward_compatible=True)
+    def call_module(
+        self,
+        m: torch.nn.Module,
+        forward: Callable[..., Any],
+        args: Tuple[Any, ...],
+        kwargs: Dict[str, Any],
+    ) -> Any:
+        """
+        Method that specifies the behavior of this ``Tracer`` when it encounters
+        a call to an ``nn.Module`` instance.
+
+        By default, the behavior is to check if the called module is a leaf module
+        via ``is_leaf_module``. If it is, emit a ``call_module`` node referring to
+        ``m`` in the ``Graph``. Otherwise, call the ``Module`` normally, tracing through
+        the operations in its ``forward`` function.
+
+        This method can be overridden to--for example--create nested traced
+        GraphModules, or any other behavior you would want while tracing across
+        ``Module`` boundaries.
+
+        Args:
+
+            m (Module): The module for which a call is being emitted
+            forward (Callable): The forward() method of the ``Module`` to be invoked
+            args (Tuple): args of the module callsite
+            kwargs (Dict): kwargs of the module callsite
+
+        Return:
+
+            The return value from the Module call. In the case that a ``call_module``
+            node was emitted, this is a ``Proxy`` value. Otherwise, it is whatever
+            value was returned from the ``Module`` invocation.
+        """
+        module_qualified_name = self.path_of_module(m)
+        with ScopeContextManager(self.scope, Scope(module_qualified_name, type(m))) as _scope:
+            # module_stack is an ordered dict so writing then deleting the
+            # entry is equivalent to push/pop on a list
+            self.module_stack[_scope.module_path] = (module_qualified_name, _scope.module_type)
+            if not self.is_leaf_module(m, module_qualified_name):
+                ret_val = forward(*args, **kwargs)
+            else:
+                ret_val = self.create_proxy("call_module", module_qualified_name, args, kwargs)
+            key, _ = self.module_stack.popitem(last=True)
+            assert key == _scope.module_path, f" Unexpected key {key}"
+
+        return ret_val
+
+    @compatibility(is_backward_compatible=False)
+    def getattr(self, attr: str, attr_val: Any, parameter_proxy_cache: Dict[str, Any]):
+        """
+        Method that specifies the behavior of this ``Tracer`` when getattr is
+        called on an attribute of an ``nn.Module`` instance during tracing.
+
+        By default, the behavior is to return a proxy value for the attribute. It
+        also stores the proxy value in the ``parameter_proxy_cache``, so that future
+        calls will reuse the proxy rather than creating a new one.
+
+        This method can be overridden to, for example, not return proxies when
+        querying parameters.
+
+        Args:
+
+            attr (str): The name of the attribute being queried
+            attr_val (Any): The value of the attribute
+            parameter_proxy_cache (Dict[str, Any]): A cache of attr names to proxies
+
+        Return:
+
+            The return value from the getattr call.
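+
+        A minimal sketch (``MyTracer`` is a hypothetical subclass) of the override
+        mentioned above, returning parameters as concrete values instead of proxies::
+
+            class MyTracer(torch.fx.Tracer):
+                def getattr(self, attr, attr_val, parameter_proxy_cache):
+                    if isinstance(attr_val, torch.nn.Parameter):
+                        return attr_val
+                    return super().getattr(attr, attr_val, parameter_proxy_cache)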
+        """
+        def maybe_get_proxy_for_attr(
+            attr_val, collection_to_search, parameter_proxy_cache
+        ):
+            for n, p in collection_to_search:
+                if attr_val is p:
+                    if n not in parameter_proxy_cache:
+                        kwargs = {}
+                        if (
+                            "proxy_factory_fn"
+                            in inspect.signature(self.create_proxy).parameters
+                        ):
+                            kwargs["proxy_factory_fn"] = (
+                                None
+                                if not self.param_shapes_constant
+                                else lambda node: ParameterProxy(
+                                    self, node, n, attr_val
+                                )
+                            )
+                        val_proxy = self.create_proxy("get_attr", n, (), {}, **kwargs)  # type: ignore[arg-type]
+                        parameter_proxy_cache[n] = val_proxy
+                    return parameter_proxy_cache[n]
+            return None
+
+        if isinstance(attr_val, torch.nn.Parameter):
+            maybe_parameter_proxy = maybe_get_proxy_for_attr(
+                attr_val, self.root.named_parameters(), parameter_proxy_cache
+            )
+            if maybe_parameter_proxy is not None:
+                return maybe_parameter_proxy
+
+        if self.proxy_buffer_attributes and isinstance(attr_val, torch.Tensor):
+            maybe_buffer_proxy = maybe_get_proxy_for_attr(
+                attr_val, self.root.named_buffers(), parameter_proxy_cache
+            )
+            if maybe_buffer_proxy is not None:
+                return maybe_buffer_proxy
+
+        return attr_val
+
+    # This method will be refactored
+    @compatibility(is_backward_compatible=False)
+    def create_args_for_root(self, root_fn, is_module, concrete_args=None):
+        """
+        Create ``placeholder`` nodes corresponding to the signature of the ``root``
+        Module. This method introspects root's signature and emits those
+        nodes accordingly, also supporting ``*args`` and ``**kwargs``.
+        """
+        # In some cases, a function or method has been decorated with a wrapper
+        # defined via ``functools.wraps``. In this case, the outer code object
+        # will likely not contain the actual parameters we care about, so unwrap
+        # the function to get to the innermost callable.
+        fn_for_analysis = inspect.unwrap(root_fn)
+        co = fn_for_analysis.__code__
+        total_args = co.co_argcount + co.co_kwonlyargcount
+        orig_args = list(co.co_varnames)
+        names_iter = iter(co.co_varnames)
+        args: List[Any] = []
+        skip_arg_idx = 0
+        if is_module:
+            if total_args == 0:
+                raise RuntimeError(
+                    "``self`` argument cannot be part of *args expansion!"
+                )
+            skip_arg_idx = 1
+            next(names_iter)  # skip self
+            args.append(self.root)
+
+        sig = inspect.signature(fn_for_analysis)
+
+        # This covers the very specific case where we are passing in flat
+        # concrete_args as a tuple, but our traced fn takes (*args, **kwargs).
+        # In this case, just take the concrete_args and pass them through.
+        name_idx = 0
+        if isinstance(concrete_args, tuple) and \
+                len(concrete_args) > 0 and \
+                (co.co_flags & HAS_VARSTUFF) and \
+                total_args == 1:
+            for concrete_arg in concrete_args:
+                out = self.create_proxy("placeholder", f"input_{name_idx}", (), {})
+                if isinstance(concrete_arg, PHBase):
+                    if concrete_arg != PH:
+                        # Transfer attrs in the case where you're using a placeholder other
+                        # than the singleton PH (PH has no attributes to transfer).
+                        # Proxies were created out of the placeholders.
+                        # Transfer any metadata (put on the placeholders in the form of
+                        # attributes set by the user) from the placeholder to the
+                        # underlying nodes (the proxy is unwrapped by the user, but
+                        # the metadata should hold).
+                        _transfer_attrs(fr=concrete_arg, to=out.node)
+                args.append(out)
+                name_idx += 1
+            return root_fn, args
+
+        arg_names = [next(names_iter) for idx in range(skip_arg_idx, total_args)]
+        if isinstance(concrete_args, tuple):
+            if len(arg_names) != len(concrete_args):
+                raise RuntimeError(
+                    f"Tracing expected {len(arg_names)} arguments but got {len(concrete_args)} concrete arguments"
+                )
+            concrete_args = dict(zip(arg_names, concrete_args))
+
+        def proxy_placeholder(name):
+            return self._proxy_placeholder(name, concrete_args, sig, fn_for_analysis)
+
+        args.extend(proxy_placeholder(names) for names in arg_names)
+
+        if co.co_kwonlyargcount > 0 or co.co_flags & HAS_VARSTUFF:
+            # TODO: type annotations for *args and **kwargs
+            if co.co_flags & inspect.CO_VARARGS:
+                args.append(proxy_placeholder("*" + next(names_iter)))
+            if co.co_flags & inspect.CO_VARKEYWORDS:
+                args.append(proxy_placeholder("**" + next(names_iter)))
+            root_fn = _patch_function(root_fn, len(args))
+
+        flat_args, in_spec = pytree.tree_flatten(tuple(args))
+        if not all(child.is_leaf() for child in in_spec.children_specs):
+            # In the case that we have pytree-flattened inputs in
+            # `concrete_args`, generate a flattening wrapper around the
+            # original root function and return that.
+            self.graph._codegen = _PyTreeCodeGen(
+                _PyTreeInfo(orig_args[:total_args], in_spec, None)
+            )
+
+            def flatten_fn(*args):
+                tree_args = pytree.tree_unflatten(list(args), in_spec)
+                tree_out = root_fn(*tree_args)
+                out_args, out_spec = pytree.tree_flatten(tree_out)
+                assert isinstance(self.graph._codegen, _PyTreeCodeGen)
+                self.graph._codegen.pytree_info = (
+                    self.graph._codegen.pytree_info._replace(out_spec=out_spec)
+                )
+                return out_args
+
+            return flatten_fn, flat_args
+        return root_fn, args
+
+    @compatibility(is_backward_compatible=True)
+    def trace(
+        self,
+        root: Union[torch.nn.Module, Callable[..., Any]],
+        concrete_args: Optional[Dict[str, Any]] = None,
+    ) -> Graph:
+        """
+        Trace ``root`` and return the corresponding FX ``Graph`` representation. ``root``
+        can either be an ``nn.Module`` instance or a Python callable.
+
+        Note that after this call, ``self.root`` may be different from the ``root`` passed
+        in here. For example, when a free function is passed to ``trace()``, we will
+        create an ``nn.Module`` instance to use as the root and add embedded constants
+        to it.
+
+
+        Args:
+
+            root (Union[Module, Callable]): Either a ``Module`` or a function to be
+                traced through. Backwards-compatibility for this parameter is
+                guaranteed.
+            concrete_args (Optional[Dict[str, any]]): Concrete arguments that should
+                not be treated as Proxies. This parameter is experimental and
+                its backwards-compatibility is *NOT* guaranteed.
+
+        Returns:
+
+            A ``Graph`` representing the semantics of the passed-in ``root``.
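+
+        A minimal usage sketch (``MyModule`` is a hypothetical ``nn.Module``)::
+
+            tracer = Tracer()
+            graph = tracer.trace(MyModule())
+            gm = torch.fx.GraphModule(tracer.root, graph)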
+        """
+        global _is_fx_tracing_flag
+        old_is_fx_tracing_flag = _is_fx_tracing_flag
+        _is_fx_tracing_flag = True
+        try:
+            if isinstance(root, torch.nn.Module):
+
+                # do real recompilation for _LazyGraphModule before retracing since the trace
+                # method can not trace the _lazy_forward method. Got error:
+                #   https://gist.github.com/shunting314/75549c2e82ae07ac1139c94a3583d259
+                # without this.
+                from torch.fx._lazy_graph_module import _LazyGraphModule
+                _LazyGraphModule.force_recompile(root)
+
+                self.root = root
+
+                assert hasattr(
+                    type(root), self.traced_func_name
+                ), f"traced_func_name={self.traced_func_name} doesn't exist in {type(root).__name__}"
+
+                fn = getattr(type(root), self.traced_func_name)
+                self.root_module_name = root._get_name()
+                self.submodule_paths = {mod: name for name, mod in root.named_modules()}
+            else:
+                self.root = torch.nn.Module()
+                fn = root
+
+            tracer_cls: Optional[Type[Tracer]] = getattr(self, "__class__", None)
+            self.graph = Graph(tracer_cls=tracer_cls)
+            if hasattr(fn, '__code__'):
+                code = fn.__code__
+                self.graph._co_fields = {
+                    'co_name': code.co_name,
+                    'co_filename': code.co_filename,
+                    'co_firstlineno': code.co_firstlineno,
+                }
+
+            # When we encounter a Tensor value that's not a parameter, we look if it
+            # is some other attribute on the model. Construct a dict mapping Tensor
+            # values to the qualified name here for efficiency. This is used downstream
+            # in create_arg
+            self.tensor_attrs: Dict[Union[torch.Tensor, ScriptObject], str] = {}
+
+            def collect_tensor_attrs(m: torch.nn.Module, prefix_atoms: List[str]):
+                for k, v in m.__dict__.items():
+                    if isinstance(v, (torch.Tensor, ScriptObject)):
+                        self.tensor_attrs[v] = ".".join(prefix_atoms + [k])
+                for k, v in m.named_children():
+                    collect_tensor_attrs(v, prefix_atoms + [k])
+
+            collect_tensor_attrs(self.root, [])
+
+            assert isinstance(fn, FunctionType)
+
+            fn_globals = fn.__globals__  # run before it gets patched
+            fn, args = self.create_args_for_root(
+                fn, isinstance(root, torch.nn.Module), concrete_args
+            )
+
+            parameter_proxy_cache: Dict[
+                str, Proxy
+            ] = {}  # Reduce number of get_attr calls
+
+            # Method dispatch on parameters is not recorded unless it's directly used.
+            # Thus, we need to insert a proxy when __getattr__ requests a parameter.
+            @functools.wraps(_orig_module_getattr)
+            def module_getattr_wrapper(mod, attr):
+                attr_val = _orig_module_getattr(mod, attr)
+                return self.getattr(attr, attr_val, parameter_proxy_cache)
+
+            @functools.wraps(_orig_module_call)
+            def module_call_wrapper(mod, *args, **kwargs):
+                def forward(*args, **kwargs):
+                    return _orig_module_call(mod, *args, **kwargs)
+
+                _autowrap_check(
+                    patcher,
+                    getattr(getattr(mod, "forward", mod), "__globals__", {}),
+                    self._autowrap_function_ids,
+                )
+                return self.call_module(mod, forward, args, kwargs)
+
+            with _Patcher() as patcher:
+                # allow duplicate patches to support the case of nested calls
+                patcher.patch_method(
+                    torch.nn.Module,
+                    "__getattr__",
+                    module_getattr_wrapper,
+                    deduplicate=False,
+                )
+                patcher.patch_method(
+                    torch.nn.Module, "__call__", module_call_wrapper, deduplicate=False
+                )
+                _patch_wrapped_functions(patcher)
+                _autowrap_check(patcher, fn_globals, self._autowrap_function_ids)
+                for module in self._autowrap_search:
+                    _autowrap_check(
+                        patcher, module.__dict__, self._autowrap_function_ids
+                    )
+                self.create_node(
+                    "output",
+                    "output",
+                    (self.create_arg(fn(*args)),),
+                    {},
+                    type_expr=fn.__annotations__.get("return", None),
+                )
+
+            self.submodule_paths = None
+        finally:
+            _is_fx_tracing_flag = old_is_fx_tracing_flag
+        return self.graph
+
+    def __deepcopy__(self, memo):
+        # _autowrap_search contains modules, which cannot be deepcopied.
+        new_tracer = Tracer.__new__(Tracer)
+
+        for k, v in self.__dict__.items():
+            if k in {'_autowrap_search'}:
+                new_obj = copy.copy(v)
+            else:
+                new_obj = copy.deepcopy(v, memo)
+
+            new_tracer.__dict__[k] = new_obj
+
+        return new_tracer
+
+    def _proxy_placeholder(self, name, concrete_args, sig, fn_for_analysis):
+        if concrete_args is not None and name in concrete_args:
+            cnt = 0
+
+            def replace_ph(x):
+                nonlocal cnt
+                cnt += 1
+                param = sig.parameters[name]
+                default = (
+                    ()
+                    if param.default is inspect.Parameter.empty
+                    else (param.default,)
+                )
+                out = self.create_proxy(
+                    "placeholder", f"{name}_{str(cnt)}", default, {}
+                )
+                if isinstance(x, PHBase):
+                    if x != PH:
+                        # Transfer attrs in the case where you're using a placeholder other
+                        # than the singleton PH (PH has no attributes to transfer).
+                        # Proxies were created out of the placeholders.
+                        # Transfer any metadata (put on the placeholders in the form of
+                        # attributes set by the user) from the placeholder to the
+                        # underlying nodes (the proxy is unwrapped by the user, but
+                        # the metadata should hold).
+                        _transfer_attrs(fr=x, to=out.node)
+
+                    return out
+                # Union[int, bool] == bool in Python <= 3.6
+                if (
+                    type(x) == bool
+                    or type(x) in base_types
+                    and type(x) != torch.Tensor
+                ):
+                    torch._assert(
+                        out == x,
+                        f"{name} has been specialized to have value {x} but got another value",
+                    )
+                elif x is None:
+                    args = (
+                        out,
+                        f"{name} has been specialized to have value None but got another value",
+                    )
+                    self.create_proxy("call_function", _assert_is_none, args, {})
+                else:
+                    warnings.warn(
+                        f"Was not able to add assertion to guarantee correct input {name} to "
+                        f"specialized function. It is up to the user to make sure that your inputs match the "
+                        f"inputs you specialized the function with."
+                    )
+
+                return x
+
+            return pytree.tree_map(replace_ph, concrete_args[name])
+        if name[0] == "*":
+            default = ()
+        else:
+            param = sig.parameters[name]
+            default = () if param.default is inspect.Parameter.empty else (param.default,)  # type: ignore[assignment]
+        return self.create_proxy(
+            "placeholder",
+            name,
+            default,
+            {},
+            type_expr=fn_for_analysis.__annotations__.get(name, None)
+        )
+
+
+# Dictionary of (id(globals dict), function name) => globals_dict to patch for
+# the purposes of the wrap() API.
+# We key by the globals dict id and function name to ensure we're wrapping a given
+# function only once.
+_wrapped_fns_to_patch: Dict[Tuple[int, str], dict] = {}
+
+# List of methods on classes to wrap (class type, function name)
+# this currently only works for Tensor.* methods that aren't traced properly
+_wrapped_methods_to_patch: List[Tuple[type, str]] = []
+
+if os.environ.get("FX_PATCH_GETITEM") == "1":
+    # This change is needed to trace models like PositionalEmbedding from BERT:
+    # https://github.com/pytorch/benchmark/blob/master/torchbenchmark/models/BERT_pytorch/bert_pytorch/model/embedding/position.py
+    # but causes issues in quantization documented here:
+    # https://github.com/pytorch/pytorch/issues/50710
+    # once that is fixed we can make this the default behavior.
+    _wrapped_methods_to_patch.append((torch.Tensor, "__getitem__"))
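+
+# Illustrative note: the environment variable must be set before this module is
+# imported for the patch above to take effect, e.g. (the script name is hypothetical):
+#
+#     FX_PATCH_GETITEM=1 python my_script.py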
+
+
+def _find_proxy(*objects_to_search):
+    """
+    Recursively search a data structure for a Proxy() and return it,
+    return None if not found.
+    """
+    proxy = None
+
+    def find_proxy(x):
+        nonlocal proxy
+        if isinstance(x, Proxy):
+            proxy = x
+
+    map_aggregate(objects_to_search, find_proxy)
+    return proxy
+
+
+def _create_wrapped_func(orig_fn):
+    @functools.wraps(orig_fn)
+    def wrapped(*args, **kwargs):
+        """
+        Given a closed-over ``orig_fn`` to invoke, search the args and kwargs for
+        a Proxy object. If there is one, emit a ``call_function`` node to preserve the
+        call to this leaf function directly. Otherwise, just return the results of
+        this function call, as this function is not being traced.
+        """
+        proxy = _find_proxy(args, kwargs)
+        if proxy is not None:
+            return_proxy = proxy.tracer.create_proxy(
+                "call_function", orig_fn, args, kwargs
+            )
+            return_proxy.node.meta["is_wrapped"] = True
+            return return_proxy
+        return orig_fn(*args, **kwargs)
+
+    return wrapped
+
+
+def _create_wrapped_method(cls, name):
+    orig_fn = getattr(cls, name)
+
+    @functools.wraps(orig_fn)
+    def wrapped(*args, **kwargs):
+        """
+        Search the args and kwargs for a Proxy object. If there is one,
+        emit a ``call_method`` node to preserve the call to this method
+        directly. Otherwise, just return the results of this function
+        call, as this function is not being traced.
+        """
+        proxy = _find_proxy(args, kwargs)
+        if proxy is not None:
+            return proxy.tracer.create_proxy("call_method", name, args, kwargs)
+        return orig_fn(*args, **kwargs)
+
+    return wrapped
+
+
+class _PatchedFn(NamedTuple):
+    frame_dict: Any
+    fn_name: str
+    orig_fn: Any
+
+    def revert(self):
+        raise NotImplementedError()
+
+
+class _PatchedFnSetItem(_PatchedFn):
+    def revert(self):
+        self.frame_dict[self.fn_name] = self.orig_fn
+
+
+class _PatchedFnDel(_PatchedFn):
+    def revert(self):
+        del self.frame_dict[self.fn_name]
+
+
+class _PatchedFnSetAttr(_PatchedFn):
+    def revert(self):
+        setattr(self.frame_dict, self.fn_name, self.orig_fn)
+
+
+class _Patcher:
+    def __init__(self):
+        super().__init__()
+        self.patches_made: List[_PatchedFn] = []
+        self.visited: Set[int] = set()
+
+    def patch(
+        self,
+        frame_dict: Dict[str, Any],
+        name: str,
+        new_fn: Callable,
+        deduplicate: bool = True,
+    ):
+        """
+        Replace frame_dict[name] with new_fn until we exit the context manager.
+        """
+        new_fn.__fx_already_patched = deduplicate  # type: ignore[attr-defined]
+        if name not in frame_dict and hasattr(builtins, name):
+            self.patches_made.append(_PatchedFnDel(frame_dict, name, None))
+        elif getattr(frame_dict[name], "__fx_already_patched", False):
+            return  # already patched, no need to do it again
+        else:
+            self.patches_made.append(
+                _PatchedFnSetItem(frame_dict, name, frame_dict[name])
+            )
+        frame_dict[name] = new_fn
+
+    def patch_method(
+        self, cls: type, name: str, new_fn: Callable, deduplicate: bool = True
+    ):
+        """
+        Replace cls.name with new_fn until we exit the context manager.
+        """
+        new_fn.__fx_already_patched = deduplicate  # type: ignore[attr-defined]
+        orig_fn = getattr(cls, name)
+        if getattr(orig_fn, "__fx_already_patched", False):
+            return  # already patched, no need to do it again
+        self.patches_made.append(_PatchedFnSetAttr(cls, name, orig_fn))
+        setattr(cls, name, new_fn)
+
+    def visit_once(self, thing: Any):
+        """Return True on the first call to with thing, otherwise false"""
+        idx = id(thing)
+        if idx in self.visited:
+            return False
+        self.visited.add(idx)
+        return True
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """
+        Undo all the changes made via self.patch() and self.patch_method()
+        """
+        while self.patches_made:
+            # unpatch in reverse order to handle duplicates correctly
+            self.patches_made.pop().revert()
+        self.visited.clear()
+
+
+def _patch_wrapped_functions(patcher: _Patcher):
+    """
+    Go through ``_wrapped_fns_to_patch`` and, for each frame object, wrap
+    the listed global functions in the ``_create_wrapped_func`` wrapper.
+    """
+    for (_, name), frame_dict in _wrapped_fns_to_patch.copy().items():
+        if name not in frame_dict and hasattr(builtins, name):
+            orig_fn = getattr(builtins, name)
+        else:
+            orig_fn = frame_dict[name]
+        patcher.patch(frame_dict, name, _create_wrapped_func(orig_fn))
+
+    for cls, name in _wrapped_methods_to_patch:
+        patcher.patch_method(cls, name, _create_wrapped_method(cls, name))
+
+
+def _autowrap_check(
+    patcher: _Patcher, frame_dict: Dict[str, Any], function_ids: Set[int]
+):
+    """
+    Some methods, like `math.sqrt`, are common enough that we want to automatically wrap them as we see them.
+    This method searches a scope for them and patches them if found.
+    """
+    if patcher.visit_once(frame_dict):
+        for name, value in frame_dict.items():
+            if (
+                not name.startswith("_")
+                and callable(value)
+                and id(value) in function_ids
+            ):
+                patcher.patch(frame_dict, name, _create_wrapped_func(value))
+
+
+@compatibility(is_backward_compatible=True)
+def wrap(fn_or_name: Union[str, Callable]):
+    """
+    This function can be called at module-level scope to register fn_or_name as a "leaf function".
+    A "leaf function" will be preserved as a CallFunction node in the FX trace instead of being
+    traced through::
+
+        # foo/bar/baz.py
+        def my_custom_function(x, y):
+            return x * x + y * y
+
+        torch.fx.wrap('my_custom_function')
+
+        def fn_to_be_traced(x, y):
+            # When symbolic tracing, the below call to my_custom_function will be inserted into
+            # the graph rather than tracing it.
+            return my_custom_function(x, y)
+
+    This function can also equivalently be used as a decorator::
+
+        # foo/bar/baz.py
+        @torch.fx.wrap
+        def my_custom_function(x, y):
+            return x * x + y * y
+
+    A wrapped function can be thought of as a "leaf function", analogous to the concept of
+    "leaf modules"; that is, it is a function that is left as a call in the FX trace
+    rather than traced through.
+
+    Args:
+
+        fn_or_name (Union[str, Callable]): The function or name of the global function to insert into the
+            graph when it's called
+    """
+    if not callable(fn_or_name) and not isinstance(fn_or_name, str):
+        raise RuntimeError(
+            "Unsupported type for global function! Must be either a callable or "
+            "string name"
+        )
+
+    if callable(fn_or_name):
+        assert not isinstance(fn_or_name, str)  # to make mypy happy
+        fn_name = fn_or_name.__name__
+    else:
+        assert isinstance(
+            fn_or_name, str
+        ), "fn_or_name must be a global function or string name"
+        fn_name = fn_or_name
+
+    currentframe = inspect.currentframe()
+    assert currentframe is not None
+    f = currentframe.f_back
+    assert f is not None
+    if f.f_code.co_name != "<module>":
+        raise NotImplementedError("wrap must be called at the top level of a module")
+
+    # consider implementing a Callable version of this via _autowrap_function_ids / _autowrap_search
+    # semantics would be slightly different, but it would add support for `from x import wrapped_function`
+    _wrapped_fns_to_patch[(id(f.f_globals), fn_name)] = f.f_globals
+    return fn_or_name
+
+
+@compatibility(is_backward_compatible=True)
+def symbolic_trace(
+    root: Union[torch.nn.Module, Callable[..., Any]],
+    concrete_args: Optional[Dict[str, Any]] = None,
+) -> GraphModule:
+    """
+    Symbolic tracing API
+
+    Given an ``nn.Module`` or function instance ``root``, this function will return a ``GraphModule``
+    constructed by recording operations seen while tracing through ``root``.
+
+    ``concrete_args`` allows you to partially specialize your function, for example to remove control flow or to flatten away data structures.
+
+    For example::
+
+        def f(a, b):
+            if b == True:
+                return a
+            else:
+                return a*2
+
+    FX typically cannot trace through this due to the presence of control
+    flow. However, we can use `concrete_args` to specialize on the value of
+    `b` to trace through this::
+
+        f = fx.symbolic_trace(f, concrete_args={'b': False})
+        assert f(3, False)  == 6
+
+    Note that although you can still pass in different values of `b`, they will be ignored.
+
+    We can also use `concrete_args` to eliminate data-structure handling from
+    our function. This will use pytrees to flatten your input. To avoid
+    overspecializing, pass in `fx.PH` for values that shouldn't be
+    specialized. For example::
+
+        def f(x):
+            out = 0
+            for v in x.values():
+                out += v
+            return out
+        f = fx.symbolic_trace(f, concrete_args={'x': {'a': fx.PH, 'b': fx.PH, 'c': fx.PH}})
+        assert f({'a': 1, 'b': 2, 'c': 4}) == 7
+
+
+    Args:
+        root (Union[torch.nn.Module, Callable]): Module or function to be traced and converted
+            into a Graph representation.
+        concrete_args (Optional[Dict[str, any]]): Inputs to be partially specialized
+
+    Returns:
+        GraphModule: a Module created from the recorded operations from ``root``.
+    """
+    tracer = Tracer()
+    graph = tracer.trace(root, concrete_args)
+    name = (
+        root.__class__.__name__ if isinstance(root, torch.nn.Module) else root.__name__
+    )
+    return _make_graph_module(tracer.root, graph, name)
+
+
+@wrap
+def _assert_is_none(value, msg):
+    assert value is None, msg
diff --git a/MLPY/Lib/site-packages/torch/fx/annotate.py b/MLPY/Lib/site-packages/torch/fx/annotate.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a771009b04405fe381674138a7762efae0b6de2
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/annotate.py
@@ -0,0 +1,21 @@
+from torch.fx.proxy import Proxy
+from ._compatibility import compatibility
+
+@compatibility(is_backward_compatible=False)
+def annotate(val, type):
+    # val could be either a regular value (not tracing)
+    # or fx.Proxy (tracing)
+    if isinstance(val, Proxy):
+        if val.node.type:
+            raise RuntimeError(f"Tried to annotate a value that already had a type on it!"
+                               f" Existing type is {val.node.type} "
+                               f"and new type is {type}. "
+                               f"This could happen if you tried to annotate a function parameter "
+                               f"value (in which case you should use the type slot "
+                               f"on the function signature) or you called "
+                               f"annotate on the same value twice")
+        else:
+            val.node.type = type
+        return val
+    else:
+        return val
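+
+# A minimal usage sketch (``MyType`` and the surrounding module are assumed):
+#
+#     def forward(self, x):
+#         y = annotate(x + 1, MyType)  # attaches MyType to the FX node when tracing
+#         return y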
diff --git a/MLPY/Lib/site-packages/torch/fx/config.py b/MLPY/Lib/site-packages/torch/fx/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..2539e748df4aa8016359bd1b068baa7653fcf686
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/config.py
@@ -0,0 +1,6 @@
+# Whether to disable showing progress on compilation passes
+# We need a separate config here; otherwise we would get a circular import if the dynamo config were imported
+disable_progress = True
+
+# If True, this also shows the node names in each pass; this is great for small models but quite noisy for larger ones
+verbose_progress = False
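+
+# Illustrative usage sketch (attribute names as defined above; set from user code):
+#
+#     import torch.fx.config as fx_config
+#     fx_config.disable_progress = False  # show progress on compilation passes
+#     fx_config.verbose_progress = True   # also show node names in each pass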
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/__init__.py b/MLPY/Lib/site-packages/torch/fx/experimental/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f79cd452ce43b99533bc6993c4f95bbee4a45fd4
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/_backward_state.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/_backward_state.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c329fa6b150bddb2908217850f4a3e632704705a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/_backward_state.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/_config.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/_config.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5e81713a0958c5f7958694dbf1e5aaca8371fa6b
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/_config.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/_sym_dispatch_mode.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/_sym_dispatch_mode.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..02fac73ef922ed8b2bb9c8196435f7e5501d20c5
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/_sym_dispatch_mode.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/accelerator_partitioner.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/accelerator_partitioner.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fbc0f6880c6417efb1927642afc5b37e53ce8a1a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/accelerator_partitioner.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/const_fold.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/const_fold.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b79bddcfb7291c15d55a7fe324620ea78a6bd425
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/const_fold.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/debug.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/debug.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..50773f27abbd25a65f761da1d4ce5527d291d201
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/debug.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/graph_gradual_typechecker.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/graph_gradual_typechecker.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..efd0b203678004b0756579c451c3428a1e951996
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/graph_gradual_typechecker.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/merge_matmul.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/merge_matmul.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0c30e6c21f3f2cf89e139073c0dac7ea5ed38e11
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/merge_matmul.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/meta_tracer.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/meta_tracer.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8a2f4ddde21e82fe407adba5c04ceb70a6204ba2
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/meta_tracer.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/normalize.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/normalize.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4d8eb1260f1009de0aff27edd3e0cac472c4a5ed
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/normalize.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/optimization.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/optimization.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f61bad3a01dcaf019277ac4d777ceb6d9caba446
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/optimization.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/partitioner_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/partitioner_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f4df3e4f26c5de1d75431ebc5d66dc5d28daffa4
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/partitioner_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/proxy_tensor.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/proxy_tensor.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..421e5360d50ee065c1575f6663db5bb9aed3d8d3
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/proxy_tensor.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/recording.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/recording.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fafd7f16f9c25921802d1a4527f572a5e5d89b04
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/recording.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/refinement_types.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/refinement_types.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..52f5b718bb8b812f28681548c0af6c31e1b75f1f
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/refinement_types.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/rewriter.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/rewriter.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..11911e3aa45de838b54db3002d64ad3025d3d414
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/rewriter.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/schema_type_annotation.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/schema_type_annotation.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5b42322c237613e5ed5b34bc9dfa75c5965d58c5
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/schema_type_annotation.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/sym_node.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/sym_node.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e82518b7083ef477e269fc512cb2ff05edf75dc4
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/sym_node.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/symbolic_shapes.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/symbolic_shapes.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6640d3a5a322305ca55a81ca6167cb5127f1fc28
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/symbolic_shapes.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/unify_refinements.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/unify_refinements.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3789b225b37c556046ed5eb9a6c9116d0c0354de
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/unify_refinements.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/validator.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/validator.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7a24a581a4404fe63a2b61a72e5f5e6a98783c20
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/__pycache__/validator.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/_backward_state.py b/MLPY/Lib/site-packages/torch/fx/experimental/_backward_state.py
new file mode 100644
index 0000000000000000000000000000000000000000..cdc9705413e9c29714d6e165c4b2ab3b34796124
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/_backward_state.py
@@ -0,0 +1,27 @@
+import torch.fx
+
+
+class BackwardState:
+    """
+    BackwardState is used to pass Python hooks from the forwards pass
+    into the backwards pass in Dynamo+Compiled Autograd.
+
+    It is created by TorchDynamo and has special handling there.
+    Dynamo will pass an empty BackwardState to the forwards, then populate
+    members on it (via setattr) only after the forwards graph is finished.
+    Later on, in CompiledAutograd we will inline and add the needed guards
+    on the BackwardState.
+
+    BackwardState is identified and has special handling in AOTAutograd.
+    During AOTAutograd:
+        1) BackwardState is an input to the forwards graph
+        2) It must only be used in the backwards
+        3) It will be empty in the forwards
+        4) In the forwards we add a wrapper to save it
+        5) In the backwards it becomes an input
+        6) There can only be one per graph
+
+    BackwardState requires CompiledAutograd.
+    """
+
+    proxy: torch.fx.Proxy
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/_config.py b/MLPY/Lib/site-packages/torch/fx/experimental/_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2f7e9c4cb5b41002f7e1d560ddc001032083a8c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/_config.py
@@ -0,0 +1,76 @@
+import os
+import sys
+
+from typing import Optional
+
+# [@compile_ignored: debug] Uses Z3 to validate the guard optimization transformations.
+translation_validation = (
+    os.environ.get("TORCHDYNAMO_TRANSLATION_VALIDATION", "0") == "1"
+)
+# Timeout (in milliseconds) for z3 finding a solution.
+# [@compile_ignored: debug]
+translation_validation_timeout = int(
+    os.environ.get("TORCHDYNAMO_TRANSLATION_VALIDATION_TIMEOUT", "600000")
+)
+# Disables bisection for translation validation.
+#
+# Translation validation bisection is enabled by default, if translation validation
+# is also enabled. This should help finding guard simplification issues. However,
+# since validation uses Z3 for bisecting, it might take a lot of time.
+#
+# Set this configuration option so as to avoid bisecting.
+# [@compile_ignored: debug]
+translation_validation_no_bisect = (
+    os.environ.get("TORCHDYNAMO_TRANSLATION_NO_BISECT", "0") == "1"
+)
+# Checks whether replaying ShapeEnv events on a freshly constructed one yields
+# a ShapeEnv with the same state. This should be used only in testing.
+check_shape_env_recorded_events = False
+
+# TODO: Perhaps consider allowing unions for the configs below (so you can hit
+# multiple reps at the same time)
+
+# Give extended debug information if the string representation of a guard
+# matches this.  For example, set this to "Ne(s0, 10)" and whenever we issue
+# this guard, we will generate full Python and C++ backtrace
+# [@compile_ignored: debug]
+extended_debug_guard_added = os.environ.get(
+    "TORCHDYNAMO_EXTENDED_DEBUG_GUARD_ADDED", None
+)
+
+# Give extended debug information when a particular symbol is allocated.  For
+# example, set this to "u2" and whenever we create this symbol, we will
+# generate full Python and C++ backtrace
+# [@compile_ignored: debug]
+extended_debug_create_symbol = os.environ.get(
+    "TORCHDYNAMO_EXTENDED_DEBUG_CREATE_SYMBOL", None
+)
+
+# Give extended debug information (C++ backtrace) for all extended debug
+# settings as well as errors.  The C++ backtrace is slow and very spammy so we
+# don't include it by default even when you're requesting extended debug.
+# [@compile_ignored: debug]
+extended_debug_cpp = os.environ.get("TORCHDYNAMO_EXTENDED_DEBUG_CPP", "") != ""
+
+# [@compile_ignored: debug] Show a warning for every specialization
+print_specializations = False
+
+# wraps (un)equalities with 'Not' class after recording the correct expression
+# in the FX graph. This should incorrectly construct the divisible and replacement
+# lists, and incorrectly issue guards.
+inject_EVALUATE_EXPR_flip_equality_TESTING_ONLY = False
+
+# [@compile_ignored: debug] Validate that ShapeEnv's version key is updated correctly
+validate_shape_env_version_key = False
+
+# If we produce more than this many guards on a symbol, force the symbol to
+# get specialized and bail out if this many guards mention this particular
+# symbol.  This may be slightly more aggressive than the true number of guards
+# issued (as we test if we've hit the limit on-the-fly, whereas we may
+# do further simplifications at final guard issuance time that make guards
+# irrelevant.)
+symbol_guard_limit_before_specialize: Optional[int] = None
+
+from torch.utils._config_module import install_config_module
+
+install_config_module(sys.modules[__name__])
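+
+# Illustrative example: most of the knobs above are read from the environment, so they
+# are typically set before the process starts, e.g. (the script name is hypothetical):
+#
+#     TORCHDYNAMO_TRANSLATION_VALIDATION=1 TORCHDYNAMO_EXTENDED_DEBUG_CPP=1 python train.py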
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/_sym_dispatch_mode.py b/MLPY/Lib/site-packages/torch/fx/experimental/_sym_dispatch_mode.py
new file mode 100644
index 0000000000000000000000000000000000000000..7922d0dbeb40584fd55ee7688a012ce31380a148
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/_sym_dispatch_mode.py
@@ -0,0 +1,58 @@
+from typing import List, Optional, Type
+
+__all__ = ["SymDispatchMode", "handle_sym_dispatch", "sym_function_mode"]
+
+SYM_FUNCTION_MODE: Optional["SymDispatchMode"] = None
+
+
+# SymDispatchMode gets invoked whenever an operation is processed on
+# a PySymInt.  When this occurs, you get called at __sym_dispatch__
+# with the operation in question.  This is symmetric to TorchDispatchMode
+# but with some caveats:
+#
+#   - In TorchDispatchMode, you get the same arguments as what a user
+#     invoked your API with; e.g., if you call torch.ops.aten.foo(a, b),
+#     you get (a, b) as args to your call.  In SymDispatchMode, if
+#     you call a + b (where a and b are SymInts), you will get
+#     (a.node, b.node) as your args (these are PySymInts)
+#
+#   - SymInt/PySymInt don't have FX proxy support (unlike, e.g., Tensor).
+#     So you have to manually call Tracer/create_node to write into
+#     the graph.  See ProxySymDispatchMode for an example
+#
+class SymDispatchMode:
+    def __sym_dispatch__(self, func, types, args, kwargs):
+        raise NotImplementedError()
+
+    def __enter__(self):
+        global SYM_FUNCTION_MODE
+        old = SYM_FUNCTION_MODE
+        if hasattr(self, "inner"):
+            raise RuntimeError(
+                f"{self} has already been used as a mode. Please use a fresh version"
+            )
+        else:
+            self.inner = old
+        SYM_FUNCTION_MODE = self
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        global SYM_FUNCTION_MODE
+        SYM_FUNCTION_MODE = self.inner
+
+
+def handle_sym_dispatch(func, args, kwargs):
+    global SYM_FUNCTION_MODE
+    mode = sym_function_mode()
+    assert mode
+    SYM_FUNCTION_MODE = mode.inner
+    try:
+        # TODO: properly compute types
+        types: List[Type] = []
+        return mode.__sym_dispatch__(func, types, args, kwargs)
+    finally:
+        SYM_FUNCTION_MODE = mode
+
+
+def sym_function_mode():
+    return SYM_FUNCTION_MODE
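+
+# A minimal sketch (illustrative only) of a mode that logs each dispatched operation
+# and then delegates to the underlying function:
+#
+#     class LoggingSymDispatchMode(SymDispatchMode):
+#         def __sym_dispatch__(self, func, types, args, kwargs):
+#             print("sym dispatch:", func)
+#             return func(*args, **kwargs)
+#
+#     with LoggingSymDispatchMode():
+#         ...  # symbolic-shape operations performed here are reported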
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/accelerator_partitioner.py b/MLPY/Lib/site-packages/torch/fx/experimental/accelerator_partitioner.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d086c05db1e31b979e53024d51c3024327414a9
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/accelerator_partitioner.py
@@ -0,0 +1,1078 @@
+import operator
+from collections import deque
+from typing import Dict, List, Set, NamedTuple, Tuple, Deque
+
+import torch
+from torch.fx.passes.graph_manipulation import get_size_of_all_nodes
+from torch.fx.experimental.partitioner_utils import (
+    Partition,
+    Device,
+    PartitionerConfig,
+    get_partition_to_latency_mapping,
+    get_latency_of_partitioned_graph,
+    NodeLatency,
+    get_extra_size_of,
+    PartitionMode,
+)
+from torch.fx.graph_module import GraphModule
+from torch.fx.node import Node, map_arg
+from torch.fx.passes.split_module import split_module
+
+
+class DAGNode:
+    """DAGNode class maintains useful information for a partition (submodule),
+    and its input submodules and output submodules.
+    """
+
+    def __init__(
+        self,
+        submodule_node: Node,
+        input_nodes: List[Node],
+        output_nodes: List[Node],
+        logical_device_ids: List[int],
+        size_bytes: int,
+    ) -> None:
+        self.submodule_node: Node = submodule_node
+        self.input_nodes: List[Node] = input_nodes
+        self.output_nodes: List[Node] = output_nodes
+        self.logical_device_ids: List[int] = logical_device_ids
+        self.size_bytes = size_bytes
+
+    def __str__(self) -> str:
+        return str(self.submodule_node)
+
+
+class DAG:
+    """DAG class contains all the DAG nodes"""
+
+    def __init__(self) -> None:
+        self.nodes: List[DAGNode] = []
+
+    def create_node(
+        self,
+        submodule_node: Node,
+        input_nodes: List[Node],
+        output_nodes: List[Node],
+        logical_devices: List[int],
+        size_bytes: int,
+    ) -> None:
+        node = DAGNode(
+            submodule_node, input_nodes, output_nodes, logical_devices, size_bytes
+        )
+        self.nodes.append(node)
+
+
+class PartitionResult(NamedTuple):
+    """NameTuple used for returning DAG and a new fx module"""
+
+    dag: DAG
+    module_with_submodules: GraphModule
+
+
+"""Followings are some helper functions for partition manipulation"""
+
+
+def reset_partition_device(partitions):
+    for partition in partitions:
+        partition.logical_device_ids = []
+
+
+def combine_two_partitions(
+    partition_0: Partition, partition_1: Partition, partitions: List[Partition]
+) -> None:
+    """Given a list of partitions and its two partitions,
+    combine these two partitions into a new one appending to the partitions
+    and remove the previous two partitions from the list of partitions
+    """
+    partition = Partition(len(partitions))
+    partition.nodes = partition_0.nodes.union(partition_1.nodes)
+    partition.recalculate_mem_size()
+    partitions.append(partition)
+    partitions.remove(partition_0)
+    partitions.remove(partition_1)
+    reorganize_partitions(partitions)
+    return
+
+
+def set_parents_and_children(partitions: List[Partition]) -> None:
+    """Given a list of partitions, mark parents and children for each partition"""
+    # Go through all nodes in a partition.
+    # If a node's user is in other partition,
+    # then the other partition is this partition's children.
+    # This partition is the other partition's parent
+    for partition in partitions:
+        partition.children = set()
+        partition.parents = set()
+    for partition in partitions:
+        for node in partition.nodes:
+            # For each node in the current partition, find its users
+            users = node.users
+            for n in users:
+                # Find which partition the user node belongs to.
+                # Note that if the node itself also belongs to that partition,
+                # that partition is not a child of the current partition
+                for p in partitions:
+                    if p != partition and n in p.nodes and node not in p.nodes:
+                        partition.children.add(p)
+                        p.parents.add(partition)
+    return
+
+
+def reorganize_partitions(partitions: List[Partition]) -> None:
+    """Given a list of partitions, reorganize partition id,
+    its parents and its children for each partition
+    """
+    # Rearrange partition ids
+    for i, partition in enumerate(partitions):
+        partition.partition_id = i
+    set_parents_and_children(partitions)
+    return
+
+
+def get_bfs_level_partition(partitions: List[Partition]) -> None:
+    """Given a list of partitions,
+    mark the bfs level for each partition
+    """
+    current_level: Set[Partition] = set()
+    visited: Set[Partition] = set()
+    for partition in partitions:
+        # If a partition has no parent, it should be in root level
+        if len(partition.parents) == 0:
+            current_level.add(partition)
+    next_level: Set[Partition] = set()
+    level = 0
+    # bfs
+    while current_level:
+        partition = current_level.pop()
+        partition.bfs_level = level
+        visited.add(partition)
+        children = partition.children
+        for child in children:
+            if child not in next_level:
+                next_level.add(child)
+        if not current_level:
+            current_level = next_level.copy()
+            next_level = set()
+            level += 1
+    return
+
+
+def get_node_to_partition_mapping(partitions: List[Partition]) -> Dict[Node, int]:
+    """Given a list of partitions,return node to partition mapping"""
+    node_to_partition: Dict[Node, int] = {}
+    for partition in partitions:
+        for node in partition.nodes:
+            node_to_partition[node] = partition.partition_id
+    return node_to_partition
+
+
+def get_logical_id_to_device(devices: List[Device]) -> Dict[int, Device]:
+    """Get a mapping from device logical ID to Device object."""
+    logical_id_to_device: Dict[int, Device] = {}
+    for d in devices:
+        logical_id_to_device[d.logical_id] = d
+    return logical_id_to_device
+
+
+def get_device_partition_stats(
+    partitions: List[Partition], devices: List[Device]
+) -> Tuple[Dict[Device, List[Partition]], Dict[Device, int], List[Partition]]:
+    """Given a list of partitions and a list of devices, returns:
+    1. A mapping from device to partitions on it;
+    2. A mapping from device to its remaining memory size;
+    3. A list of partitions that do not have a device.
+    """
+    # logical id to device
+    logical_id_to_device = get_logical_id_to_device(devices)
+    # Track partitions on device
+    device_to_partitions: Dict[Device, List[Partition]] = {}
+    # Track device's left mem size
+    device_to_left_mem_bytes: Dict[Device, int] = {}
+    for d in devices:
+        device_to_partitions[d] = []
+        device_to_left_mem_bytes[d] = d.available_mem_bytes
+
+    # Deal with the partitions that already have a device
+    # and also collect all partitions without a device (no_device_partitions)
+    no_device_partitions = []
+    for partition in partitions:
+        if partition.logical_device_ids != []:
+            for logical_id in partition.logical_device_ids:
+                device = logical_id_to_device[logical_id]
+                device_to_partitions[device].append(partition)
+                device_to_left_mem_bytes[device] -= partition.used_mem_bytes
+        else:
+            no_device_partitions.append(partition)
+
+    return (
+        device_to_partitions,
+        device_to_left_mem_bytes,
+        no_device_partitions,
+    )
+
+
+def get_device_to_partitions_mapping(
+    partitions: List[Partition], devices: List[Device]
+):
+    """Given a list of partitions and a list of devices,
+    map each partition into a device.
+    """
+
+    def calculate_extra_mem_bytes_needed_for(
+        partition: Partition, partitions: List[Partition]
+    ):
+        all_nodes: Set[Node] = set()
+        for p in partitions:
+            all_nodes = all_nodes.union(p.nodes)
+        if len(all_nodes) == 0:
+            return partition.used_mem_bytes
+        all_nodes = all_nodes.union(partition.nodes)
+        extra_size_needed = 0
+        for node in partition.nodes:
+            extra_size_needed += get_extra_size_of(node, all_nodes)
+        return extra_size_needed
+
+    def find_device_for(partition: Partition):
+        """Given a partition, find a logical device for the partition
+        The algorithm is to put the partition on the device
+        that has just enough mem left for that partition.
+        device_to_left_mem_bytes is a dictionary between device and its left mem size
+        sorted by its left mem size
+        """
+        for d in device_to_left_mem_bytes:
+            extra_size_needed = calculate_extra_mem_bytes_needed_for(
+                partition, device_to_partitions[d]
+            )
+            if extra_size_needed < device_to_left_mem_bytes[d]:
+                device_to_partitions[d].append(partition)
+                partition.logical_device_ids.append(d.logical_id)
+                device_to_left_mem_bytes[d] -= extra_size_needed
+                return True
+        return False
+
+    (
+        device_to_partitions,
+        device_to_left_mem_bytes,
+        no_device_partitions,
+    ) = get_device_partition_stats(partitions, devices)
+
+    # Find devices for all the partitions without a device
+    found_device = True
+    for partition in no_device_partitions:
+        device_to_left_mem_bytes = dict(sorted(device_to_left_mem_bytes.items(), key=lambda item: item[1]))
+        found_device = find_device_for(partition)
+        if not found_device:
+            break
+    return found_device
+
+
+def check_dependency(partition):
+    """Given a partition,check if there is a circular dependency on
+    this partition using bfs
+    """
+    visited: Set[Partition] = {partition}
+    queue: Deque[Partition] = deque([partition])
+    while queue:
+        p = queue.popleft()
+        for child in p.children:
+            if child == partition:
+                return True
+            else:
+                if child not in visited:
+                    visited.add(child)
+                    queue.append(child)
+    return False
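+
+# A minimal illustrative sketch (not part of the upstream module): the same BFS
+# cycle check on a plain adjacency mapping with hypothetical integer ids.
+# check_dependency above performs the equivalent walk over Partition.children.
+#
+#   from collections import deque
+#
+#   def has_cycle_back_to(start: int, children: Dict[int, List[int]]) -> bool:
+#       visited, queue = {start}, deque([start])
+#       while queue:
+#           for child in children.get(queue.popleft(), []):
+#               if child == start:
+#                   return True
+#               if child not in visited:
+#                   visited.add(child)
+#                   queue.append(child)
+#       return False
+#
+#   # has_cycle_back_to(0, {0: [1], 1: [2], 2: [0]}) -> True
+#   # has_cycle_back_to(0, {0: [1], 1: [2], 2: []})  -> False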
+
+
+class Partitioner:
+    """A fx module may not fit into one device.
+    Partitioner class helps partition one fx module into submodules (partitions),
+    so that the submodules can be executed crossing different accelerators.
+    The main function of this class is self.partition_graph.
+    It partitions the fx module based on the scheme specified in partition_config
+    A DAG structure is returned
+    along with a new fx module with submodule nodes.
+    """
+
+    def __init__(self) -> None:
+        self.partitions: List[Partition] = []
+        self.node_to_partition: Dict[Node, int] = {}
+        self.devices: List[Device] = []
+
+    def partition_graph(
+        self,
+        fx_module: GraphModule,
+        torch_module: torch.nn.Module,
+        partitioner_config: PartitionerConfig,
+    ) -> PartitionResult:
+        """Given the fx module, torch module and partitioner_config,
+        find the partitions, do the partitions,
+        and then return a DAG and a new fx module with submodule nodes (partitions)
+        """
+        self.graph_module = fx_module
+        self.torch_module = torch_module
+        self.devices = partitioner_config.devices
+        if len(self.devices) == 0:
+            raise RuntimeError("No devices")
+        # Tag the size in bytes to all nodes in the graph_module.
+        get_size_of_all_nodes(self.graph_module)
+        # Check if there are op nodes in the fx module
+        nodes = self.graph_module.graph.nodes
+        if all(node.op in {"placeholder", "get_attr", "output"} for node in nodes):
+            raise RuntimeError("No Partition since no operations in the module")
+        # Calculate total size of the fx module
+        total_size_of_graph = 0
+        for node in nodes:
+            if node.op == "output":
+                break
+            total_size_of_graph += node.size_bytes.total_size
+        # Find the device with the max mem size
+        device_with_max_mem = max(self.devices, key=lambda d: d.available_mem_bytes)
+        # AOT based partition
+        if partitioner_config.mode == PartitionMode.aot_based:
+            self.aot_based_partition(
+                partitioner_config.node_to_partition_mapping,
+                partitioner_config.partition_to_logical_device_mapping,
+            )
+        # Single partition if the whole module can be fit into one device
+        elif total_size_of_graph <= device_with_max_mem.available_mem_bytes:
+            self.find_single_partition(
+                total_size_of_graph, logical_device_id=device_with_max_mem.logical_id
+            )
+        elif total_size_of_graph > sum([d.available_mem_bytes for d in self.devices]):
+            raise RuntimeError("Devices have no enough memory for the module")
+        else:
+            # Sparse nn based partition
+            if partitioner_config.mode == PartitionMode.sparse_nn:
+                available_mem_bytes = self.devices[0].available_mem_bytes
+                if not all(
+                    device.available_mem_bytes == available_mem_bytes
+                    for device in self.devices
+                ):
+                    raise RuntimeError("All devices must have same memory size!")
+                # sparse_nn_partition only support same memory size
+                # TODO: add different size support for sparse_nn_partition
+                self.sparse_nn_partition(available_mem_bytes)
+            # Cost aware partition
+            elif partitioner_config.mode == PartitionMode.cost_aware:
+                self.cost_aware_partition(
+                    partitioner_config.transfer_rate_bytes_per_sec,
+                    partitioner_config.node_to_latency_mapping,
+                )
+            # KL based partition
+            elif partitioner_config.mode == PartitionMode.kl_based:
+                self.kl_based_partition(
+                    partitioner_config.transfer_rate_bytes_per_sec,
+                    partitioner_config.node_to_latency_mapping,
+                )
+            else:
+                self.size_based_partition()
+
+        # Saturate host if possible.
+        if partitioner_config.saturate_host:
+            self.saturate_host()
+
+        # Partition the graph module based on the partition assignment.
+        module_with_submodules = self.do_partition()
+
+        # The DAG contains DAGNodes with info of each partition's input nodes, output nodes
+        # and how partitions are connected.
+        dag = self.dump_dag(module_with_submodules)
+        ret = PartitionResult(dag, module_with_submodules)
+        return ret
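+
+    # A minimal usage sketch in comments (the traced module and device sizes are
+    # hypothetical; result attribute names are assumed from PartitionResult above):
+    #
+    #   traced = torch.fx.symbolic_trace(my_module)   # my_module: some torch.nn.Module
+    #   devices = [Device("dev_0", 125_000_000, 0),   # (name, available_mem_bytes, logical_id)
+    #              Device("dev_1", 125_000_000, 1)]
+    #   config = PartitionerConfig(devices)           # default mode falls through to size based partition
+    #   result = Partitioner().partition_graph(traced, my_module, config)
+    #   dag, new_module = result.dag, result.module_with_submodules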
+
+    def find_single_partition(
+        self, total_size_of_graph, logical_device_id: int = 0
+    ) -> None:
+        """Fit the whole fx module into one device"""
+        partition_0 = self.create_partition()
+        for node in self.graph_module.graph.nodes:
+            if node.op == "output":
+                # Skip the output node, but there can
+                # be nodes after the output in certain cases.
+                continue
+            partition_0.nodes.add(node)
+        partition_0.used_mem_bytes = total_size_of_graph
+        partition_0.logical_device_ids = [logical_device_id]
+        # Get the node to partition mapping
+        self.node_to_partition = get_node_to_partition_mapping(self.partitions)
+        return
+
+    def size_based_partition(self) -> None:
+        """This method is to partition the fx module based on memory size.
+        It uses greedy approach. The result may not be the best.
+        The basic idea is:
+        Step 1:
+        Find a device which has enough memory to fit the current node, create a empty partition
+        with the size of that device.
+        Then keep adding the following nodes into the partition until the partition is full.
+        Step 2:
+        Repeat Step 1 until no device left
+        Step 3:
+        If some nodes are left, create a partition for each left node (single node partition).
+        and then try to map those partitions into logical devices with enough mem left.
+        """
+
+        def find_device_based_on_size(node) -> Device:
+            """Given a node, this function is to find a logical device
+            that could fit the node.
+            """
+            mem_size_needed = get_extra_size_of(node, set())
+            device = Device("", -1, -1)
+            for d in self.devices:
+                if (
+                    d not in occupied_devices
+                    and d.available_mem_bytes >= mem_size_needed
+                ):
+                    device = d
+                    break
+            if device.available_mem_bytes < 0:
+                raise RuntimeError(str(node) + " is too large to fit into any device")
+            occupied_devices.append(device)
+            return device
+
+        # Track partition and its left mem size
+        partition_to_left_mem_bytes: Dict[Partition, int] = {}
+        # Track all the devices that have been used
+        occupied_devices: List[Device] = []
+        partition = self.create_partition()
+        for node in self.graph_module.graph.nodes:
+            if node.op in {"call_module", "call_method", "call_function"}:
+                # Check if there are devices left
+                if len(self.partitions) <= len(self.devices):
+                    total_size_of_input_nodes = get_extra_size_of(node, partition.nodes)
+                    # Check if the current partition is the very first partition
+                    if partition.used_mem_bytes == 0:
+                        # Find a device that can fit the first node
+                        device = find_device_based_on_size(node)
+                        occupied_devices.append(device)
+                        # Track the partition's remaining memory size
+                        partition_to_left_mem_bytes[
+                            partition
+                        ] = device.available_mem_bytes
+                        # Assign the device's logical id to the current partition
+                        partition.logical_device_ids.append(device.logical_id)
+                    else:
+                        # The current partition is not the first partition
+                        # Check if the current node can fit into current partition
+                        if (
+                            partition_to_left_mem_bytes[partition]
+                            < total_size_of_input_nodes
+                        ):
+                            # Check if no device is left
+                            if len(self.partitions) == len(self.devices):
+                                # No device is left
+                                # Put the previous partitions into a list (non_single_node_partitions)
+                                non_single_node_partitions = self.partitions[:]
+                                # Create the first single node partition for the current node
+                                self.create_single_node_partition(node)
+                                continue
+                            # Some devices are still left
+                            # Create a new partition with a mem size that is enough for the current node
+                            device = find_device_based_on_size(node)
+                            partition = self.create_partition()
+                            total_size_of_input_nodes = get_extra_size_of(
+                                node, partition.nodes
+                            )
+                            partition_to_left_mem_bytes[
+                                partition
+                            ] = device.available_mem_bytes
+                            partition.logical_device_ids.append(device.logical_id)
+                    partition.add_node(node)
+                    partition_to_left_mem_bytes[partition] -= total_size_of_input_nodes
+                # Create single node partitions if no device is left
+                else:
+                    self.create_single_node_partition(node)
+        reorganize_partitions(self.partitions)
+        # Get the node to partition mapping
+        self.node_to_partition = get_node_to_partition_mapping(self.partitions)
+        # Mapping all partitions into device
+        found_partition_to_device_mapping = get_device_to_partitions_mapping(
+            self.partitions, self.devices
+        )
+        if not found_partition_to_device_mapping:
+            raise RuntimeError("Cannot Get a Valid Partition to Logical Device Mapping")
+        return
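+
+    # An illustrative walkthrough of the greedy scheme above, with hypothetical
+    # numbers: given devices of 100 and 60 bytes and op nodes of sizes 40, 50, 30
+    # and 20, the first partition lands on the 100-byte device and absorbs 40 + 50;
+    # the 30-byte node no longer fits, so a second partition is opened on the
+    # 60-byte device for 30 + 20. Any nodes left once all devices are used become
+    # single node partitions and are mapped afterwards by
+    # get_device_to_partitions_mapping.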
+
+    def saturate_host(self) -> None:
+        """Saturate host by assigning replicates to unused devices with enough memory.
+        It uses a greedy approach to find a next available set of devices to place all split
+        partitions: For each used device, it searches for an idle device with minimal memory
+        size that can hold all the partition located on that device; If the search is successful
+        for all used devices, it then assigns the new devices' logical ID to the corresponding
+        partition.
+        """
+        (
+            device_to_partitions,
+            device_to_left_mem_bytes,
+            no_device_partitions,
+        ) = get_device_partition_stats(self.partitions, self.devices)
+
+        assert (
+            len(no_device_partitions) == 0
+        ), f"Expect no_device_partitions has 0 device, but get {len(no_device_partitions)}"
+
+        # Devices that hold partitions
+        used_devices = [d for d in self.devices if len(device_to_partitions[d]) > 0]
+        # Track replicates of the assigned devices
+        replicated_device_to_used_device: Dict[Device, Device] = {}
+
+        while len(used_devices) * 2 + len(replicated_device_to_used_device) <= len(
+            self.devices
+        ):
+            # Success flag for this round
+            success = True
+            # Devices that have not been assigned
+            idle_devices = [
+                d
+                for d in self.devices
+                if d not in used_devices and d not in replicated_device_to_used_device
+            ]
+            # Temporary mapping from replicated device to original device
+            temp_replicate_mapping = {}
+
+            # Find a new device to replicate all partitions on a used device
+            for used_device in used_devices:
+                # Idle devices that have enough memory
+                available_devices = [
+                    d
+                    for d in idle_devices
+                    if d.available_mem_bytes
+                    >= used_device.available_mem_bytes
+                    - device_to_left_mem_bytes[used_device]
+                ]
+                if len(available_devices) == 0:
+                    success = False
+                    break
+                new_device = min(available_devices, key=lambda d: d.available_mem_bytes)
+                idle_devices.remove(new_device)
+                temp_replicate_mapping[new_device] = used_device
+
+            if not success:
+                break
+            replicated_device_to_used_device.update(temp_replicate_mapping)
+
+        # Update logical device IDs assigned to the partitions
+        for (
+            replicate_device,
+            original_device,
+        ) in replicated_device_to_used_device.items():
+            logical_id = replicate_device.logical_id
+            for partition in device_to_partitions[original_device]:
+                partition.logical_device_ids.append(logical_id)
+        for p in self.partitions:
+            print(p.logical_device_ids)
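+
+    # Illustrative arithmetic for the replication loop above (assumed counts): with
+    # 2 used devices out of 6 in total, the condition 2 * 2 + 0 <= 6 admits a first
+    # replication round; after it, 2 * 2 + 2 <= 6 still holds, so each used device
+    # can be replicated once more before the loop stops.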
+
+    def do_partition(self) -> GraphModule:
+        """Return a new fx module with submodule nodes (partitions)."""
+        module_with_submodules = split_module(
+            self.graph_module,
+            self.torch_module,
+            lambda node: self.node_to_partition[node],
+        )
+        return module_with_submodules
+
+    def dump_dag(self, module_with_submodules: GraphModule) -> DAG:
+        """Return the dag structure and the new fx module with submodules."""
+        dag = DAG()
+        for node in module_with_submodules.graph.nodes:
+            if node.op == "output":
+                break
+            if node.op in {"placeholder", "get_attr"}:
+                continue
+            if node.target == operator.__getitem__:
+                continue
+            input_nodes: Dict[Node, None] = {}
+            map_arg(node.args, input_nodes.setdefault)
+            map_arg(node.kwargs, input_nodes.setdefault)
+            # When a node has two or more output nodes,
+            # it outputs its result to 'getitem' nodes.
+            # Those 'getitem' nodes are the output node for this node.
+            # Otherwise, the output node is this node itself.
+            if len(node.users) > 1:
+                output_nodes = list(node.users)
+            else:
+                output_nodes = [node]
+            partition_id = int(node.name.rsplit("_", 1)[-1])
+            device_ids = self.partitions[partition_id].logical_device_ids
+            size_bytes = self.partitions[partition_id].used_mem_bytes
+            dag.create_node(
+                node, list(input_nodes), output_nodes, device_ids, size_bytes
+            )
+        return dag
+
+    def create_partition(self) -> Partition:
+        """Create a partition and append it to self.partitions."""
+        partition_id = len(self.partitions)
+        partition = Partition(partition_id)
+        self.partitions.append(partition)
+        return partition
+
+    def create_single_node_partition(self, node):
+        """Create a partition for a single node"""
+        partition = self.create_partition()
+        partition.add_node(node)
+        return
+
+    def sparse_nn_partition(self, available_mem_bytes: int) -> None:
+        """This method partition a sparse nn module.
+        It is size based partition but different from size_based_partition,
+        it only works when all the devices have same memory size (available_mem_bytes).
+        In the future, devices with different mem sizes will be supported like size_based_partition.
+        It first traverse all the nodes and do the partitions based on the same memory size.
+        If the current partition has no enough memory left for a new op node
+        (call_module, call_method, call_function), a new partition is created.
+        When crossing the boundary between non-embedding nodes and embedding nodes,
+        a new partition is created regardlessly.
+        For example, if the current node is a non-embedding node but the next node is an
+        embedding node, a new partition is created for the next node.
+        After the partition, the partitions are combined as much as possible.
+        The rule is that a non-embedding partition only
+        combines with another non-embedding one.
+        So as the embedding partitions.
+        """
+
+        def combine_partitions_based_on_size(
+            partitions: List[Partition], available_mem_bytes: int
+        ) -> None:
+            """Combining small partitions together to keep as less partitions as possible.
+            Here is an example of the algorithm to do this:
+            Assume some partitions, we first sort them based on partition used memory size.
+            [(partition_4, 1), (partition_3, 1), (partition_2, 2), (partition_1, 7), (partition_0, 9)]
+            The available memory is 10.
+            step 1: self.find_partition_to_combine_based_on_size()
+            First, mark bfs level for each partition
+            Second, look the smallest partition, partition_4: 10 - 1 = 9
+            It means any partition has a used memory equal or less than 9 could combine this partition
+            We go from the largest and selection partition_0.
+            Check the bfs level for two partitions, if the level difference is less than 2,
+            it can be combined.
+            step 2: repeat step 1 until no partitions can be combined
+            """
+            find_combination = True
+            while find_combination:
+                # Sort partitions based on memory size
+                sorted_partitions = sorted(partitions, key=lambda p: p.used_mem_bytes)
+                # Mark bfs level
+                get_bfs_level_partition(self.partitions)
+                find_combination, partitions = find_partition_to_combine_based_on_size(
+                    sorted_partitions, available_mem_bytes, partitions
+                )
+            return
+
+        def calculate_mem_bytes_needed(p1, p2):
+            """Given two partitions, calculate how many mem bytes
+            are needed if two partitions are combined
+            """
+            nodes = p1.nodes.union(p2.nodes)
+            mem_bytes_needed = 0
+            for node in nodes:
+                mem_bytes_needed += get_extra_size_of(node, nodes)
+            return mem_bytes_needed
+
+        def find_partition_to_combine_based_on_size(
+            sorted_partitions: List[Partition],
+            available_mem_bytes: int,
+            partitions: List[Partition],
+        ) -> Tuple[bool, List[Partition]]:
+            """step 1 in combine_partition_based_on_size()"""
+            find_combination = False
+            smallest_partition = sorted_partitions.pop(0)
+            for p in sorted_partitions[::-1]:
+                if abs(smallest_partition.bfs_level - p.bfs_level) <= 1:
+                    # Calculate how many bytes needed if combined
+                    mem_bytes_needed = calculate_mem_bytes_needed(p, smallest_partition)
+                    if mem_bytes_needed <= available_mem_bytes:
+                        combine_two_partitions(p, smallest_partition, self.partitions)
+                        partitions.remove(smallest_partition)
+                        partitions.remove(p)
+                        partitions.append(self.partitions[-1])
+                        find_combination = True
+                        break
+            return find_combination, partitions
+
+        def reset_partition_in_sparse_nn(partition, new_partition=True):
+            """If crossing the boundary between non-embedding nodes and
+            embedding nodes, create a new partition
+            """
+            if in_embedding_region:
+                embedding_partitions.append(partition)
+            else:
+                non_embedding_partitions.append(partition)
+            if new_partition:
+                partition = self.create_partition()
+                partition.left_mem_bytes = available_mem_bytes
+                return partition
+            return None
+
+        def is_embedding_node(node: Node) -> bool:
+            """Check if a node is an embedding node"""
+            if node.op == "call_module":
+                submodule = self.graph_module
+                for atom in str(node.target).split("."):
+                    if not hasattr(submodule, atom):
+                        raise RuntimeError(
+                            f"Module {submodule} has no attribute {atom}"
+                        )
+                    submodule = getattr(submodule, atom)
+                    if "Embedding" in str(submodule):
+                        return True
+            return False
+
+        # Track embedding partitions and non-embedding partitions separately
+        embedding_partitions: List[Partition] = []
+        non_embedding_partitions: List[Partition] = []
+        # A Flag to check the boundary
+        in_embedding_region: bool = False
+        partition = self.create_partition()
+        for node in self.graph_module.graph.nodes:
+            if node.op in {"call_module", "call_method", "call_function"}:
+                # Check if crossing the boundary between embedding nodes and non embedding nodes
+                if is_embedding_node(node) != in_embedding_region:
+                    # Crossing the boundary
+                    # Check if the current partition is an empty partition
+                    if partition.used_mem_bytes != 0:
+                        # The current partition isn't an empty partition. Create a new one.
+                        partition = reset_partition_in_sparse_nn(partition)
+                    in_embedding_region = not in_embedding_region
+                total_size_of_input_nodes = get_extra_size_of(node, partition.nodes)
+                if (
+                    total_size_of_input_nodes + partition.used_mem_bytes
+                    > available_mem_bytes
+                ):
+                    partition = reset_partition_in_sparse_nn(partition)
+                    total_size_of_input_nodes = get_extra_size_of(node, partition.nodes)
+                    if total_size_of_input_nodes > available_mem_bytes:
+                        raise RuntimeError(
+                            str(node.target) + " is too large to fit into a device"
+                        )
+                partition.add_node(node)
+        reset_partition_in_sparse_nn(partition, new_partition=False)
+        # Set parents and children for partitions
+        set_parents_and_children(self.partitions)
+        # Combining non-embedding partitions
+        combine_partitions_based_on_size(non_embedding_partitions, available_mem_bytes)
+        # Combining embedding partitions
+        combine_partitions_based_on_size(embedding_partitions, available_mem_bytes)
+        total_size_of_non_embedding_partitions = 0
+        for partition in non_embedding_partitions:
+            total_size_of_non_embedding_partitions += partition.used_mem_bytes
+        # Check if devices are enough for all partitions
+        if len(embedding_partitions) > len(self.devices):
+            msg = (
+                "Need "
+                + str(len(embedding_partitions))
+                + " devices, but only "
+                + str(len(self.devices))
+                + " provided"
+            )
+            raise RuntimeError(msg)
+        occupied_devices = []
+        for i, partition in enumerate(embedding_partitions):
+            # Check if all non-embedding partitions can fit into embedding partition devices
+            if (
+                total_size_of_non_embedding_partitions + partition.used_mem_bytes
+                > available_mem_bytes
+            ):
+                raise RuntimeError(
+                    "partition_"
+                    + str(partition.partition_id)
+                    + "(embedding partition) and non embedding partitions can not fit into one device"
+                )
+            else:
+                # Add logical device to the partition
+                partition.logical_device_ids = [self.devices[i].logical_id]
+                occupied_devices.append(self.devices[i].logical_id)
+        # Add logical devices to the non_embedding_partitions
+        for partition in non_embedding_partitions:
+            partition.logical_device_ids = occupied_devices
+        # Get the node to partition mapping
+        self.node_to_partition = get_node_to_partition_mapping(self.partitions)
+        return
+
+    def cost_aware_partition(
+        self,
+        transfer_rate_bytes_per_sec: float,
+        node_to_latency_mapping: Dict[Node, NodeLatency],
+    ) -> None:
+        """This method is to partition the fx module based on the cost.
+        The cost is the total latency of running the whole fx module.
+        In partitioner_utils.py, the cost model is built.
+        The cost aware partition algorithm is:
+        #1. At every beginning, each node is a partition.
+            Then we map all the partitions to the devices
+            and calculate the cost
+        #2. Then try to pre-combine any two of the partitions if the two
+            partitions can be combined.
+            (the bfs level is less than 2 or two partitions are connected and
+            can find partition to device mapping)
+            See if any partition pair could reduce the current cost.
+            Choose the pair that shows the minimum cost and then combine them
+        #3. Repeat #2 until the cost cannot be reduced.
+        """
+
+        def try_combining_partitions(p0_index, p1_index, partitions) -> float:
+            """Given two partitions and a list of partitions, combine these two partitions
+            and see what is the cost of the modified partition list
+            """
+            p0 = partitions[p0_index]
+            p1 = partitions[p1_index]
+            """If two partitions' bfs level are less than 2 or two partitions are connected to each other,
+               then they can be combined
+            """
+            if (
+                (abs(p0.bfs_level - p1.bfs_level) <= 1)
+                or (p0 in p1.parents)
+                or p0 in (p1.children)
+            ):
+                combine_two_partitions(p0, p1, partitions)
+                # Check if a circular dependency exists after combining
+                if check_dependency(partitions[-1]):
+                    return float("inf")
+                # Check if the modified partition list can be mapped to devices after combination
+                reset_partition_device(partitions)
+                found_device = get_device_to_partitions_mapping(
+                    partitions, self.devices
+                )
+                if not found_device:
+                    return float("inf")
+                # Calculate the new cost
+                partition_to_latency_mapping = get_partition_to_latency_mapping(
+                    partitions, node_to_latency_mapping
+                )
+                cost = get_latency_of_partitioned_graph(
+                    partitions,
+                    partition_to_latency_mapping,
+                    transfer_rate_bytes_per_sec,
+                )
+                return cost
+            # If the two partitions cannot be combined, the cost is inf
+            return float("inf")
+
+        def search_combination(
+            transfer_rate_bytes_per_sec, node_to_latency_mapping
+        ) -> bool:
+            """Given transfer rate between partitions and each node's latency,
+            find two partitions to combine so the cost of the partitions can
+            be reduced.
+            The algorithm is :
+            1. Go through all the partition pairs and see
+            if any pair of partitions can be combined.
+            2. Calculate the cost after the combination.
+            3. Select the minimum cost and combine its corresponding partition pair.
+            """
+            partition_to_latency_mapping = get_partition_to_latency_mapping(
+                self.partitions, node_to_latency_mapping
+            )
+            cost = get_latency_of_partitioned_graph(
+                self.partitions,
+                partition_to_latency_mapping,
+                transfer_rate_bytes_per_sec,
+            )
+            if len(self.partitions) == 1:
+                return False
+            partition_pair: List[int] = []
+            for i in range(len(self.partitions) - 1):
+                for j in range(i + 1, len(self.partitions)):
+                    # Try to combine the partition pair
+                    # and see the new cost after combination
+                    new_cost = try_combining_partitions(i, j, self.partitions[:])
+                    if new_cost <= cost:
+                        partition_pair = [i, j]
+                        cost = new_cost
+                    reorganize_partitions(self.partitions)
+            # If a partition pair is found, combine them
+            if len(partition_pair) != 0:
+                p0 = self.partitions[partition_pair[0]]
+                p1 = self.partitions[partition_pair[1]]
+                combine_two_partitions(p0, p1, self.partitions)
+            get_bfs_level_partition(self.partitions)
+            reset_partition_device(self.partitions)
+            get_device_to_partitions_mapping(self.partitions, self.devices)
+            return len(partition_pair) != 0
+
+        for node in self.graph_module.graph.nodes:
+            if node.op not in {"placeholder", "get_attr", "output"}:
+                self.create_single_node_partition(node)
+        # Set up parent partitions and children partitions for each partition
+        set_parents_and_children(self.partitions)
+        # Get bfs level for each partition
+        get_bfs_level_partition(self.partitions)
+        find_combination = True
+        while find_combination:
+            # Search for the pair of partitions that gives the minimum new cost,
+            # then combine them
+            find_combination = search_combination(
+                transfer_rate_bytes_per_sec, node_to_latency_mapping
+            )
+        # Make sure all partitions are set up correctly
+        reorganize_partitions(self.partitions)
+        # Set up node to partition mapping
+        self.node_to_partition = get_node_to_partition_mapping(self.partitions)
+        return
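+
+    # A minimal sketch of building the inputs for the cost aware mode, in comments
+    # (field names follow how partitioner_config is read above; the latency values
+    # and transfer rate are placeholders, not measurements):
+    #
+    #   node_to_latency = {
+    #       n: NodeLatency(float(n.size_bytes.total_size), 1.0)
+    #       for n in traced.graph.nodes
+    #       if n.op not in {"placeholder", "get_attr", "output"}
+    #   }   # assumes get_size_of_all_nodes(traced) has tagged node sizes
+    #   config = PartitionerConfig(
+    #       devices,
+    #       mode=PartitionMode.cost_aware,
+    #       transfer_rate_bytes_per_sec=2.0e9,
+    #       node_to_latency_mapping=node_to_latency,
+    #   )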
+
+    def kl_based_partition(
+        self,
+        transfer_rate_bytes_per_sec: float,
+        node_to_latency_mapping: Dict[Node, NodeLatency],
+    ) -> None:
+        """This function is a cost aware partition based
+        on Kernighan-Lin algorithm.
+        First, the graph is partitioned using size_based_partition.
+        Then, each node is swapped with any other node in a different
+        partition, and at the same time, the cost is estimated after
+        the swapping.
+        For example, we have nodes n0, n1, n2, n3 and n4.
+        Using size_based_partition, n0 and n1 are in Partition p0.
+        n2, n3 and n4 in Partition p1. The current cost is estimated.
+        We first tried using n0 to swap with n2 from the other partition.
+        Then we see that swapping n0 and n2 shows a lower cost
+        than the current cost and it is the minimum among other pairs like
+        (n0, None)(This means moving n0 to Partition without swapping other nodes),
+        (n0, n3) and (n0, n4). We swap n0 and n2 and set the new cost
+        as the current cost.
+        Then We repeat this process for all the other nodes until all swapping pairs
+        are tried.
+        """
+
+        def swap_nodes(n0, n1, p0, p1):
+            # Either n0 or n1 could be None
+            # That means we simply move the node
+            # to another partition
+            if n0 is not None:
+                p0.remove_node(n0)
+                p1.add_node(n0)
+            if n1 is not None:
+                p0.add_node(n1)
+                p1.remove_node(n1)
+
+        def try_swap_nodes(
+            n0, n1, p0, p1, node_to_latency_mapping, transfer_rate_per_sec
+        ):
+            cost = float("inf")
+            swap_nodes(n0, n1, p0, p1)
+            # Reorganize partitions after swapping
+            reorganize_partitions(self.partitions)
+            # Check if there is a circular dependency after swapping
+            if (not check_dependency(p0)) and (not check_dependency(p1)):
+                reset_partition_device(self.partitions)
+                partition_to_latency_mapping = get_partition_to_latency_mapping(
+                    self.partitions, node_to_latency_mapping
+                )
+                # Check if all partitions can be mapped to logical devices after swapping
+                found_device = get_device_to_partitions_mapping(
+                    self.partitions, self.devices
+                )
+                if not found_device:
+                    cost = float("inf")
+                else:
+                    cost = get_latency_of_partitioned_graph(
+                        self.partitions,
+                        partition_to_latency_mapping,
+                        transfer_rate_bytes_per_sec,
+                    )
+            # Swap back and reset all partitions back to original
+            swap_nodes(n1, n0, p0, p1)
+            reorganize_partitions(self.partitions)
+            reset_partition_device(self.partitions)
+            get_device_to_partitions_mapping(self.partitions, self.devices)
+            return cost
+
+        def swap_node_to_partition(
+            node, p0, p1, node_to_latency_mapping, transfer_rate_per_sec
+        ):
+            """This function helps to swap one node from partition p0
+            with all the nodes in another partition p1
+            """
+            p1_nodes = list(p1.nodes) + [None]
+            min_cost = float("inf")
+            node_pair: List[Node] = []
+            for n1 in p1_nodes:
+                # Ignore the node if it is not an op node
+                if n1 is not None and n1.op in {"placeholder", "get_attr"}:
+                    continue
+                # Try swapping node in p0 with n1 in p1
+                cost = try_swap_nodes(
+                    node, n1, p0, p1, node_to_latency_mapping, transfer_rate_per_sec
+                )
+                if cost < min_cost:
+                    node_pair = [node, n1]
+                    min_cost = cost
+            return min_cost, node_pair
+
+        # First use size_based_partition
+        self.size_based_partition()
+        partition_to_latency_mapping = get_partition_to_latency_mapping(
+            self.partitions, node_to_latency_mapping
+        )
+        # Calculate the cost of the partitions
+        cost = get_latency_of_partitioned_graph(
+            self.partitions, partition_to_latency_mapping, transfer_rate_bytes_per_sec
+        )
+        # Keep track of the node pair that gives the best cost
+        node_pair: List[Node] = []
+        # Keep track of the partition pair for that node pair
+        partition_pair: List[Partition] = []
+        # Collect all the op nodes from the graph
+        op_nodes = []
+        for n in self.graph_module.graph.nodes:
+            if n.op not in {"placeholder", "get_attr", "output"}:
+                op_nodes.append(n)
+        for node in op_nodes:
+            # Find which partition the current node belongs to
+            p0_index = self.node_to_partition[node]
+            p0 = self.partitions[p0_index]
+            # Go through all the other partitions to swap
+            # with other nodes from those partitions
+            for p1_index, _ in enumerate(self.partitions):
+                if p0_index != p1_index:
+                    p1 = self.partitions[p1_index]
+                    new_cost, new_node_pair = swap_node_to_partition(
+                        node,
+                        p0,
+                        p1,
+                        node_to_latency_mapping,
+                        transfer_rate_bytes_per_sec,
+                    )
+                    # Update the cost
+                    # Track the swapped node pair and their partitions
+                    if new_cost < cost:
+                        cost = new_cost
+                        node_pair = new_node_pair
+                        partition_pair = [p0, p1]
+            # Do the swapping after trying all the nodes from a partition
+            if len(node_pair) != 0:
+                swap_nodes(
+                    node_pair[0], node_pair[1], partition_pair[0], partition_pair[1]
+                )
+                reorganize_partitions(self.partitions)
+                get_device_to_partitions_mapping(self.partitions, self.devices)
+        reorganize_partitions(self.partitions)
+        # Mapping the device to the partition
+        get_device_to_partitions_mapping(self.partitions, self.devices)
+        return
+
+    def aot_based_partition(
+        self, node_to_partition_mapping, partition_to_logical_device_mapping
+    ):
+        """This function helps to rebuild the partitions given the nodes and its
+        corresponding partition id
+        """
+        partition_id_to_partition_mapping: Dict[int, Partition] = {}
+        self.node_to_partition = node_to_partition_mapping
+        for node in self.node_to_partition:
+            partition_id = self.node_to_partition[node]
+            # If the requested partition has not been created, create the partition
+            if partition_id not in partition_id_to_partition_mapping:
+                partition = Partition(partition_id)
+                self.partitions.append(partition)
+                partition_id_to_partition_mapping[partition_id] = partition
+                partition.logical_device_ids = partition_to_logical_device_mapping[
+                    partition_id
+                ]
+            else:
+                partition = partition_id_to_partition_mapping[
+                    self.node_to_partition[node]
+                ]
+            # Add the current node into the partition
+            partition.add_node(node)
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/const_fold.py b/MLPY/Lib/site-packages/torch/fx/experimental/const_fold.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1e4a002b6f831f579cc647142567815e7bcdfb1
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/const_fold.py
@@ -0,0 +1,289 @@
+import re
+from typing import Callable, Dict, Optional, Set, Union
+
+import torch.fx
+from torch.fx.node import map_arg
+from torch.fx.passes.split_module import split_module
+
+
+__all__ = ['FoldedGraphModule', 'get_unique_attr_name_in_module', 'split_const_subgraphs']
+
+class FoldedGraphModule(torch.fx.GraphModule):
+    """
+    FoldedGraphModule is a GraphModule which also contains another
+    `const_subgraph_module` representing a subgraph which has all const attr
+    inputs and which can be run once before running the main standard
+    `graph`. The folded results are stored on the module under the attribute named by
+    `fx_const_folded_attrs_name`, which the main `graph` then reads via get_attr instead
+    of recomputing the constant subgraph.
+    """
+
+    def __init__(
+        self,
+        root: torch.nn.Module,
+        graph: torch.fx.Graph,
+        const_subgraph: Optional[torch.fx.Graph] = None,
+        fx_const_folded_attrs_name: Optional[str] = None,
+        device_for_folded_attrs: str = "cuda",
+    ):
+        super().__init__(root, graph)
+        self.const_subgraph_module = (
+            None
+            if const_subgraph is None
+            else torch.fx.GraphModule(root, const_subgraph)
+        )
+        self.has_folding_been_run = False
+        self.fx_const_folded_attrs_name = fx_const_folded_attrs_name
+        self.device_for_folded_attrs = device_for_folded_attrs
+
+    def __call__(self, *args, **kwargs):
+        if not self.has_folding_been_run:
+            self.run_folding()
+        return super().__call__(*args, **kwargs)
+
+    def run_folding(self):
+        # If there's no const subgraph module or attr output names to use, return
+        # early as there is no const folding to perform.
+        if (
+            self.const_subgraph_module is None
+            or self.fx_const_folded_attrs_name is None
+        ):
+            return
+
+        assert not self.has_folding_been_run
+        self.has_folding_been_run = True
+
+        # Actually run const folding subgraph. Note that single attr const fold
+        # subgraphs output a single Tensor while multiple outputs are returned as
+        # Tuple[Tensor,].
+        folded_attrs = self.const_subgraph_module()
+
+        def _create_param(i):
+            return torch.nn.Parameter(
+                i
+                if not isinstance(i, int)
+                else torch.Tensor([i]).to(device=self.device_for_folded_attrs),
+                requires_grad=i.requires_grad if isinstance(i, torch.Tensor) else False,
+            )
+
+        params = (
+            torch.nn.ParameterList([_create_param(i) for i in folded_attrs])
+            if isinstance(folded_attrs, tuple)
+            else _create_param(folded_attrs)
+        )
+        setattr(self, self.fx_const_folded_attrs_name, params)
+
+
+def _inline_module(gm: torch.fx.GraphModule, inline_mod_name: str):
+    """
+    Given `gm` and some graph module which is called with target name `inline_mod_name`,
+    this helper will inline all of the nodes from that called graph module into `gm`.
+    """
+    # Fetch the inner graph module that we want to inline inside `gm`.
+    inline_mod = dict(gm.named_modules())[inline_mod_name]
+    assert isinstance(inline_mod, torch.fx.GraphModule)
+    call_mod_node_to_replace = None
+    for node in gm.graph.nodes:
+        if node.op == "call_module" and node.target == inline_mod_name:
+            call_mod_node_to_replace = node
+            break
+    assert call_mod_node_to_replace is not None
+
+    # Now actually do the swap. Note that we have to keep track of new nodes that are
+    # copied into `gm` -- we do this via replacement_mapping.
+    call_mod_args = call_mod_node_to_replace.args
+    replacement_mapping: Dict[torch.fx.Node, torch.fx.Node] = {}
+    ph_count = 0
+
+    def replacement_fn(node):
+        new_node = replacement_mapping[node]
+        new_node.meta = node.meta.copy()
+        return new_node
+
+    for inline_node in inline_mod.graph.nodes:
+        if inline_node.op == "placeholder":
+            replacement_mapping[inline_node] = call_mod_args[ph_count]
+            ph_count += 1
+            continue
+
+        if inline_node.op == "output":
+            outputs = inline_node.args[0]
+            output_replacements = map_arg(outputs, replacement_fn)
+            call_mod_node_to_replace.replace_all_uses_with(output_replacements)
+            continue
+
+        with gm.graph.inserting_before(call_mod_node_to_replace):
+            new_node = gm.graph.node_copy(inline_node, replacement_fn)
+        replacement_mapping[inline_node] = new_node
+
+    gm.graph.eliminate_dead_code()
+
+
+def get_unique_attr_name_in_module(mod_traced: torch.fx.GraphModule, name: str) -> str:
+    """
+    Make sure the name is unique (in a module) and can represent an attr.
+    """
+    # Delete all characters that are illegal in a Python identifier.
+    name = re.sub("[^0-9a-zA-Z_]+", "_", name)
+    if name[0].isdigit():
+        name = f"_{name}"
+    # Now make sure it is in fact unique to the module by incrementing suffix value.
+    while hasattr(mod_traced, name):
+        match = re.match(r"(.*)_(\d+)$", name)
+        if match is None:
+            name = name + "_1"
+        else:
+            base, num = match.group(1, 2)
+            name = f"{base}_{int(num) + 1}"
+
+    return name
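+
+# For example, assuming a module that already has attributes "w" and "w_1":
+#   get_unique_attr_name_in_module(mod, "w")     -> "w_2"
+#   get_unique_attr_name_in_module(mod, "a.b/c") -> "a_b_c"  (illegal chars replaced)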
+
+
+def split_const_subgraphs(
+    module: Union[torch.nn.Module, torch.fx.GraphModule],
+    skip_folding_node_fn: Optional[Callable[[torch.fx.Node], bool]] = None,
+    device_for_folded_attrs: str = "cpu",
+) -> FoldedGraphModule:
+    """
+    Looks through `module` for any nodes that have all constant attribute inputs
+    and separates them out into their own constant subgraph, and returns a
+    FoldedGraphModule which runs that constant subgraph on the first run to set
+    attributes on the module prior to running the non-constant portion of the
+    graph.
+    """
+    if not isinstance(module, torch.fx.GraphModule):
+        mod_traced = torch.fx.symbolic_trace(module)
+    else:
+        mod_traced = module
+
+    # Build up a list of const_nodes, defined as nodes that are themselves
+    # get_attrs, or have all get_attr or other constant node inputs.
+    const_nodes: Set[torch.fx.Node] = set()
+    found_const_folding = False
+    for node in mod_traced.graph.nodes:
+        # Skip over placeholders/outputs because they can't be const folded and
+        # we don't want to add tags to them.
+        if node.op in {"placeholder", "output"}:
+            continue
+
+        # If the node itself is constant, or all of its inputs are constant,
+        # then tag it as constant.
+        if node.op != "get_attr" and not set(node.all_input_nodes).issubset(
+            const_nodes
+        ):
+            continue
+
+        # If provided skip folding function says to skip, then skip.
+        if skip_folding_node_fn and skip_folding_node_fn(node):
+            continue
+
+        # Skip folding side-effectful functions
+        if node.is_impure():
+            continue
+
+        # Must be a constant foldable node at this point.
+        const_nodes.add(node)
+        if node.op != "get_attr":
+            found_const_folding = True
+
+    # If we did not find any const folding then return early without a const fold subgraph.
+    if not found_const_folding:
+        return FoldedGraphModule(mod_traced, mod_traced.graph)
+
+    # Partition the module into two: submod_0 for constant folding subgraph, and
+    # submod_1 for the rest.
+    def mod_partition(node: torch.fx.Node):
+        return 0 if node in const_nodes else 1
+
+    split = split_module(mod_traced, module, mod_partition)
+
+    const_gm, non_const_gm = split.submod_0, split.submod_1
+    const_mod_name, non_const_mod_name = "submod_0", "submod_1"
+
+    # The module that a call_module node refers to gets copied to submodules during split.
+    # The path to the module also gets inlined, i.e. mod.a.b -> mod_a_b. Here we need to
+    # attach inlined modules to `split` as it's the owning module now.
+    for node in non_const_gm.graph.nodes:
+        if node.op == "call_module":
+            setattr(split, node.target, getattr(non_const_gm, node.target))
+    for node in const_gm.graph.nodes:
+        if node.op == "call_module":
+            setattr(split, node.target, getattr(const_gm, node.target))
+
+    # split_module currently does not use get_attrs for attrs. Instead it passes
+    # them in as args from the parent module, which used get_attrs. Here we set
+    # them as get_attrs inside const_gm, allowing for running folding without
+    # somehow a priori knowing the attrs that should be passed as args. We can
+    # unconditionally do this for all placeholders because we know all
+    # placeholders to const_gm must be constants accessible via get_attr.
+    call_const_gm_args = None
+    for node in split.graph.nodes:
+        if node.op == "call_module":
+            if node.target == const_mod_name:
+                call_const_gm_args = node.args
+                break
+    assert call_const_gm_args is not None
+
+    # Here we do the actual replacement of placeholders to get_attrs. Note that here we
+    # set the const_gm.graph into a new root_const_gm with split as the root module,
+    # because we are fetching attributes directly from the root module, instead of
+    # fetching them from const_gm. Example: The const_gm must have some format like:
+    # graph():
+    #    %inp : [num_users=1] = placeholder[target=const_inp]
+    #    %add : [num_users=1] = call_function[target=operator.add](args = (%inp, %inp), kwargs = {})
+    #    return add
+    # We replace that with the following, which does not have any placeholders:
+    # graph():
+    #    %inp_1 : [num_users=1] = get_attr[target=const_inp]
+    #    %add : [num_users=1] = call_function[target=operator.add](args = (%inp_1, %inp_1), kwargs = {})
+    #    return add
+    root_const_gm = torch.fx.GraphModule(split, const_gm.graph)
+    for node in root_const_gm.graph.nodes:
+        if node.op == "output":
+            multiple_outputs = isinstance(node.args[0], tuple)
+            continue
+        if node.op != "placeholder":
+            continue
+        in_node = next(n for n in call_const_gm_args if n.name == node.target)
+        assert in_node.op == "get_attr"
+        with root_const_gm.graph.inserting_before(node):
+            new_node = root_const_gm.graph.get_attr(in_node.target)
+        new_node.meta = node.meta.copy()
+        node.replace_all_uses_with(new_node)
+        root_const_gm.graph.erase_node(node)
+    assert "multiple_outputs" in locals()
+
+    # Now find the call to const_gm inside split, and replace it with a getattr to the
+    # folded tensor(s) that result from constant folding. Note that we don't need to
+    # worry about whether this is one or more tensors because the original graph
+    # correctly uses getitem to extract individual tensors if there are multiple folded.
+    fx_const_folded_attrs_name = get_unique_attr_name_in_module(
+        split, "_FX_CONST_FOLDED_ATTRS"
+    )
+    setattr(
+        split,
+        fx_const_folded_attrs_name,
+        torch.nn.ParameterList() if multiple_outputs else torch.nn.Parameter(),  # type: ignore[possibly-undefined]
+    )
+    for node in split.graph.nodes:
+        if node.op == "call_module" and node.target == const_mod_name:
+            with node.graph.inserting_before(node):
+                folded_attrs = node.graph.get_attr(fx_const_folded_attrs_name)
+            folded_attrs.meta = node.meta.copy()
+            node.replace_all_uses_with(folded_attrs)
+            break
+
+    split.graph.eliminate_dead_code()
+
+    # Finally, inline the non-constant submod into the split submod. This is so that the
+    # original caller who may have passed in a graph module will get back out a graph
+    # module whose graph is traced to the same granularity.
+    _inline_module(split, non_const_mod_name)
+
+    return FoldedGraphModule(
+        split,
+        split.graph,
+        root_const_gm.graph,
+        fx_const_folded_attrs_name,
+        device_for_folded_attrs,
+    )
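+
+# A minimal usage sketch (the module below is a hypothetical example, not part of
+# this file): the constant subexpression is folded once, on the first call.
+#
+#   class AddConst(torch.nn.Module):
+#       def __init__(self):
+#           super().__init__()
+#           self.w = torch.nn.Parameter(torch.randn(4))
+#
+#       def forward(self, x):
+#           return x + (self.w + 1.0)   # (self.w + 1.0) has only constant inputs
+#
+#   folded = split_const_subgraphs(AddConst())
+#   out = folded(torch.randn(4))        # first call runs the const subgraph once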
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/debug.py b/MLPY/Lib/site-packages/torch/fx/experimental/debug.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c89590a704535a6d2dccd404b873a20dbccb169
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/debug.py
@@ -0,0 +1,31 @@
+import torch.fx as fx
+
+def set_trace(gm: fx.GraphModule) -> fx.GraphModule:
+    """
+    Sets a breakpoint in `gm`'s generated python code. It drops into pdb when
+    `gm` gets run.
+
+    Args:
+        gm: graph module to insert the breakpoint into. It is then recompiled for
+            the change to take effect.
+
+    Returns:
+        the `gm` with breakpoint inserted.
+    """
+    def insert_pdb(body):
+        return ["import pdb; pdb.set_trace()\n", *body]
+
+    with gm.graph.on_generate_code(
+        make_transformer=lambda cur_transform: (
+            # new code transformer to register
+            lambda body: (
+                insert_pdb(
+                    cur_transform(body) if cur_transform
+                    else body
+                )
+            )
+        )
+    ):
+        gm.recompile()
+
+    return gm
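+
+# A minimal usage sketch (the traced module and input are hypothetical):
+#
+#   gm = fx.symbolic_trace(my_module)
+#   gm = set_trace(gm)
+#   gm(example_input)   # execution drops into pdb before the graph body runs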
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/graph_gradual_typechecker.py b/MLPY/Lib/site-packages/torch/fx/experimental/graph_gradual_typechecker.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c85b879a070fb4e37c5d6b55e12ab7bae2c5b9c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/graph_gradual_typechecker.py
@@ -0,0 +1,914 @@
+from functools import reduce
+import torch
+import operator
+from torch.fx.tensor_type import Dyn, is_consistent, TensorType, is_more_precise
+from typing import Callable, Dict
+from torch.fx.node import Target, Node
+from torch.nn.modules.batchnorm import BatchNorm2d
+from torch.nn.modules.conv import Conv2d
+from torch.fx.experimental.refinement_types import Equality
+import itertools
+
+from torch.fx.experimental.unification import Var  # type: ignore[attr-defined]
+
+import sympy
+
+_INFERENCE_RULES: Dict[Target, Callable] = {}
+_REFINEMENT_RULES: Dict[Target, Callable] = {}
+_RULES: Dict[Target, Callable] = {}
+
+
+def expand_to_tensor_dim(t, n):
+    """
+    Expand a type to the desired tensor dimension, if possible;
+    raise an error otherwise.
+    - t is the given type
+    - n is the number of dimensions to expand to
+    """
+    if t == Dyn:
+        dims = [Dyn] * n
+        return TensorType(tuple(dims))
+    elif isinstance(t, TensorType):
+        if len(t.__args__) != n:
+            raise TypeError(f'Cannot extend tensor. Tensor {t} has rank {len(t.__args__)}. It should have rank {n}')
+        return t
+    else:
+        raise TypeError(f'Cannot match the type {t}')
+
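+# For example (Dyn and TensorType come from torch.fx.tensor_type):
+#   expand_to_tensor_dim(Dyn, 3)                    -> TensorType((Dyn, Dyn, Dyn))
+#   expand_to_tensor_dim(TensorType((1, 2, 3)), 3)  -> TensorType((1, 2, 3))
+#   expand_to_tensor_dim(TensorType((1, 2)), 3)     -> raises TypeError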
+
+def broadcast_types(t1, t2):
+    """
+    Applies broadcasting to both given types such that they
+    become consistent with each other, and returns the two new
+    resulting types.
+    """
+
+    # if either type is Dyn, do nothing since the types are already consistent
+    if t1 == Dyn or t2 == Dyn or isinstance(t1, Var) or isinstance(t2, Var):
+        return t1, t2
+
+    if isinstance(t1, TensorType) and isinstance(t2, TensorType):
+        s1 = len(t1.__args__)
+        s2 = len(t2.__args__)
+
+        new_t1 = list(t1.__args__)
+        new_t2 = list(t2.__args__)
+
+        # We make the types the same length which is the first requirement
+        # for consistency
+        if s1 > s2:
+            for i in range(s1 - s2):
+                new_t2.insert(0, 1)
+
+        elif s2 > s1:
+            for i in range(s2 - s1):
+                new_t1.insert(0, 1)
+
+        # we replace occurrences of "1" with each tensor with
+        # the corresponding type from the other tensor
+        for i, (x, y) in enumerate(zip(new_t1, new_t2)):
+            if x == 1:
+                new_t1[i] = y
+            elif y == 1:
+                new_t2[i] = x
+
+        # at this point our tensors should be consistent
+        # and we can apply the element-wise operation and find the right dimension
+        # for the output of the operation
+        (t1, t2) = TensorType(tuple(new_t1)), TensorType(tuple(new_t2))
+        return (t1, t2)
+    else:
+        raise TypeError(f'Cannot broadcast types {t1} and {t2}')
+
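+# For example, broadcasting pads the shorter type with leading 1s and then fills
+# in 1-sized dimensions from the other operand:
+#   broadcast_types(TensorType((1, 2, 3)), TensorType((3,)))
+#       -> (TensorType((1, 2, 3)), TensorType((1, 2, 3)))
+#   broadcast_types(TensorType((2, 1)), TensorType((1, 4)))
+#       -> (TensorType((2, 4)), TensorType((2, 4)))
+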
+def register_inference_rule(call_target):
+    def register(fn):
+        if call_target in _INFERENCE_RULES:
+            raise RuntimeError(f'Inference rule already registered for {call_target}!')
+        _INFERENCE_RULES[call_target] = fn
+        return fn
+    return register
+
+def register_refinement_rule(call_target):
+    def register(fn):
+        if call_target in _REFINEMENT_RULES:
+            raise RuntimeError(f'Refinement rule already registered for {call_target}!')
+        _REFINEMENT_RULES[call_target] = fn
+        return fn
+    return register
+
+def register_algebraic_expressions_inference_rule(call_target):
+    def register(fn):
+        if call_target in _RULES:
+            raise RuntimeError(f'Rule already registered for {call_target}!')
+        _RULES[call_target] = fn
+        return fn
+    return register
+
+@register_inference_rule(torch.add)
+@register_inference_rule(operator.add)
+def add_inference_rule(n: Node):
+    """
+    Apply the addition inference rule. This includes:
+    - scalar addition
+    - broadcasting semantics
+
+    Note that we always return the least precise type between
+    the operands (after applying broadcasting) to be the final type of the operation
+
+    Note that we do not modify the operand types themselves after applying broadcasting
+    to them. We only use them to calculate the final type
+    """
+    assert isinstance(n.args[0], Node)
+    assert isinstance(n.args[1], Node)
+    t1 = n.args[0].type
+    t2 = n.args[1].type
+
+    # handle scalar addition
+    if t1 == int and isinstance(t2, TensorType):
+        n.type = t2
+        return n.type
+
+    # handle scalar addition
+    elif t2 == int and isinstance(t1, TensorType):
+        n.type = t1
+        return n.type
+
+    # we bring the new types to the point where
+    # we can check for consistency;
+    # any inconsistency would not have been caused
+    # by broadcasting at this point
+    (new_t1, new_t2) = broadcast_types(t1, t2)
+
+    if new_t1 != t1 or new_t2 != t2:
+        n.meta['broadcast'] = True
+        n.meta[str(n.args[0])] = new_t1
+        n.meta[str(n.args[1])] = new_t2
+
+    else:
+        n.meta['broadcast'] = False
+
+    new_t1 = t1 if not n.meta['broadcast'] else new_t1
+    new_t2 = t2 if not n.meta['broadcast'] else new_t2
+
+    # we check for consistency between the new types
+    if is_consistent(new_t1, new_t2):
+        # we return the less precise type because
+        # broadcasting may have happened
+        # for operands with shape [1,2,Dyn] and [1,2,1]
+        # we have to assign the node [1,2,Dyn]
+        if is_more_precise(new_t1, new_t2):
+            n.type = new_t2
+        else:
+            n.type = new_t1
+        return n.type
+    else:
+        raise TypeError(f'Cannot add arguments {n.args[0]} ({ n.args[0].type}) and {n.args[1]} ({ n.args[1].type}) in node {n}.'
+                        f' Types should match ')
+
+@register_inference_rule(getattr)
+def get_attr_inference_rule(n: Node, traced):
+    """
+    The current getattr rule only handles the shape attribute.
+    It can be extended to other attributes.
+    The most representative type we have is "Dyn", but the system
+    can be extended with more types, such as a type to represent shapes
+    """
+    attr_node = n.args[0]
+    attr_name = n.args[1]
+
+    if attr_name == "shape":
+        n.type = Dyn
+    else:
+        raise TypeError("Not yet implemented")
+
+    # TODO. We leave it like this till we add a type to represent tensor sizes
+    return n.type
+
+@register_inference_rule(torch.transpose)
+def transpose_inference_rule(n: Node):
+    """
+    We check that dimensions for the transpose operations
+    are within range of the tensor type of the node
+    """
+    if n.target == torch.transpose:
+        assert isinstance(n.args[0], Node)
+        t = n.args[0].type
+
+        assert isinstance(n.args[1], int)
+        assert isinstance(n.args[2], int)
+        dim1, dim2 = n.args[1], n.args[2]
+
+        if t == Dyn:
+            n.type = Dyn
+            return n.type
+
+        elif isinstance(t, TensorType):
+            if 0 <= dim1 < len(t.__args__) and 0 <= dim2 < len(t.__args__):
+                new_type = list(t.__args__)
+                new_type[dim1], new_type[dim2] = new_type[dim2], new_type[dim1]
+                final = TensorType(new_type)
+                n.type = get_greatest_upper_bound(n.type, final)
+                return n.type
+            else:
+                raise TypeError(f'Cannot transpose {dim1} and {dim2} in type {t} for node {n}')
+        else:
+            raise TypeError(f'Cannot transpose {dim1} and {dim2} in type {t} for node {n}')
+
+
+@register_inference_rule(torch.reshape)
+def reshape_inference_rule(n: Node):
+    """
+    Without dynamism, the rule checks that the
+    product of the elements of the argument tensor
+    type is equal to the product of the elements
+    of the required shape. We gradualize this rule
+    by adding a case to handle fully dynamic input
+    as well as input where some of the tensor dimensions
+    are unknown. In this case we check for divisibility
+    """
+    assert isinstance(n.args[0], Node)
+    t1 = n.args[0].type
+
+    assert isinstance(n.args[1], list)
+    t2 = n.args[1]
+    t2_type = TensorType([Dyn if elem == -1 else elem for elem in t2])
+
+    # if we do not know the original tensor dimension,
+    # we return the required dimension
+    if t1 == Dyn:
+        n.type = t2_type
+        return t2_type
+
+    # if any of the dimensions are unknown,
+    # we check for divisibility
+    elif isinstance(t1, TensorType):
+        assert isinstance(t1, TensorType)
+        a = [e if e != Dyn else 1 for e in t1.__args__]
+        p1 = reduce(operator.mul, a)
+        p2 = reduce(operator.mul, t2)
+        if p1 % p2 == 0 or p2 % p1 == 0:
+            n.type = t2_type
+            return t2_type
+        else:
+            raise TypeError(f'Cannot reshape in node {n} from {t1} to {t2_type}')
+    else:
+        raise TypeError(f'Cannot reshape in node {n} from {t1} to {t2_type}')
+
+@register_inference_rule(BatchNorm2d)
+def bn2d_inference_rule(n: Node, module_instance):
+    """
+    Given a BatchNorm2D instance and a node check the following conditions:
+    - the input type can be expanded to a size 4 tensor: t =  (x_1, x_2, x_3, x_4)
+    - the current node type can be expanded to a size 4 tensor: t' =  (x_1', x_2', x_3', x_4')
+    - t is consistent with t'
+    - x_2 is consistent with the module's num_features
+    - x_2' is consistent with the module's num_features
+    output type: the more precise type of t and t'
+    """
+    assert isinstance(n.args[0], Node)
+    n.args[0].type = expand_to_tensor_dim(n.args[0].type, 4)
+    arg_type = n.args[0].type
+    n.type = expand_to_tensor_dim(n.type, 4)
+
+    # we check the conditions on the incoming argument
+    # and any existing annotation
+    # we also check for consistency between both annotations
+    if is_consistent(arg_type.__args__[1], module_instance.num_features) and \
+            is_consistent(n.type.__args__[1], module_instance.num_features) and \
+            is_consistent(arg_type, n.type):
+
+        # we choose the more precise type
+        # to be the node type
+        # so if an incoming argument has more type information
+        # we set this node's type to be the argument type
+        n.type = get_greatest_upper_bound(arg_type, n.type)
+        return n.type
+    else:
+        raise TypeError(f'Cannot apply {module_instance} with input type {arg_type} and existing type {n.type} on {n}')
+
+
+def calculate_out_dimension(d_in, module_instance, index):
+    """
+    For calculating h_out and w_out according to the Conv2d documentation
+    """
+    padding = (module_instance.padding, module_instance.padding) \
+        if isinstance(module_instance.padding, int) else module_instance.padding
+    kernel_size = (module_instance.kernel_size, module_instance.kernel_size) \
+        if isinstance(module_instance.kernel_size, int) else module_instance.kernel_size
+    stride = (module_instance.stride, module_instance.stride) \
+        if isinstance(module_instance.stride, int) else module_instance.stride
+    dilation = (module_instance.dilation, module_instance.dilation) \
+        if isinstance(module_instance.dilation, int) else module_instance.dilation
+
+    DIMENSION_TYPES = (int, sympy.Symbol)
+
+    if d_in == Dyn:
+        return Dyn
+
+    elif isinstance(d_in, DIMENSION_TYPES):
+        n = d_in + 2 * padding[index] - \
+            dilation[index] * \
+            (kernel_size[index] - 1) - 1
+
+        return (n // stride[0]) + 1
+
+    else:
+        raise TypeError(f'{d_in} in {module_instance} must be a number or Dyn. Received {type(d_in)}')
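+
+# Worked example (illustrative, not exhaustive): for a Conv2d with kernel_size=3,
+# stride=1, padding=1, dilation=1 and an input height of 32,
+#   n = 32 + 2*1 - 1*(3 - 1) - 1 = 31, so the output height is 31 // 1 + 1 = 32.
+# A Dyn input dimension stays Dyn.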
+
+
+def get_greatest_upper_bound(type1, type2):
+    """
+    Get the most precise type that's consistent with the given types
+    """
+    if type1 == Dyn:
+        return type2
+    elif type2 == Dyn:
+        return type1
+    elif isinstance(type1, TensorType) and isinstance(type2, TensorType):
+        if not is_consistent(type1, type2):
+            raise TypeError(f'Inconsistent types {type1}, {type2}')
+        gub = [t1 if is_more_precise(t1, t2) else t2 for (t1, t2) in zip(type1.__args__, type2.__args__)]
+        return TensorType(tuple(gub))
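+
+# Illustrative example:
+#   get_greatest_upper_bound(TensorType((1, 2, Dyn)), TensorType((1, Dyn, 3)))
+#   -> TensorType((1, 2, 3))
+# Dyn is the least precise element, so the concrete dimension wins whenever the
+# two types are consistent; inconsistent types raise a TypeError.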
+
+
+@register_inference_rule(Conv2d)
+def conv2d_inference_rule(n: Node, module_instance):
+    """
+    Given a Conv2D instance and a node check the following conditions:
+    - the input type can be expanded to a size 4 tensor: t =  (x_1, x_2, H, W)
+    - the current node type can be expanded to a size 4 tensor: t' =  (x_1', x_2', x_3', x_4')
+    - x_2 is consistent with the module's in_channels
+    - let o = (x_1, out_channels, H_out, W_out)
+    then the output is the greatest upper bound of o and the existing node type t'.
+    """
+    assert isinstance(n.args[0], Node)
+    n.args[0].type = expand_to_tensor_dim(n.args[0].type, 4)
+    arg_type = n.args[0].type
+    curr_node_type = expand_to_tensor_dim(n.type, 4)
+
+    if is_consistent(arg_type.__args__[1], module_instance.in_channels):
+        w_in = arg_type.__args__[3]
+        h_in = arg_type.__args__[2]
+        h_out = calculate_out_dimension(h_in, module_instance, 0)
+        w_out = calculate_out_dimension(w_in, module_instance, 1)
+        new_type = TensorType((arg_type.__args__[0], module_instance.out_channels, h_out, w_out))
+        gub = get_greatest_upper_bound(new_type, curr_node_type)
+        n.type = gub
+        return n.type
+    else:
+        raise TypeError(f'Cannot apply {module_instance} with input type { arg_type} and existing type {n.type} on {n}')
+
+
+@register_inference_rule(torch.nn.ReLU)
+def relu_inference_rule(n: Node, module_instance):
+    """
+    Input and output shapes should be equal.
+    """
+    assert isinstance(n.args[0], Node)
+
+    if n.args[0].type == Dyn and isinstance(n.type, TensorType):
+        n.args[0].type = expand_to_tensor_dim(n.args[0].type, len(n.type.__args__))
+
+    if isinstance(n.args[0].type, TensorType):
+        n.type = get_greatest_upper_bound(n.args[0].type, n.type)
+    return n.type
+
+
+def maxpool2d_check(typ, module_instance):
+    """
+    Applies the maxpool2d shape information to the input;
+    this affects the last two dimensions
+    """
+    new_type_list = list(typ.__args__)
+    if len(new_type_list) == 4 or len(new_type_list) == 3:
+        w_in = new_type_list[-1]
+        h_in = new_type_list[-2]
+
+        h_out = calculate_out_dimension(h_in, module_instance, 0)
+        w_out = calculate_out_dimension(w_in, module_instance, 1)
+
+        new_type_list[-1] = w_out
+        new_type_list[-2] = h_out
+        return TensorType(tuple(new_type_list))
+
+    else:
+        raise TypeError(f'Wrong size {typ} for {module_instance}')
+
+
+@register_inference_rule(torch.nn.MaxPool2d)
+def maxpool2d_inference_rule(n: Node, module_instance):
+    """
+    Given a MaxPool2D instance and a node check the following conditions:
+    - Input size matches size 3 or 4
+    - Current node type is consistent with the output type we will calculate
+    - Input size matches output size and the last two dimensions of the output
+      are w_out and h_out. The remaining dimensions are the same as the input
+    - Our final result is the greatest upper bound of the output we calculate
+      and the current node type.
+    """
+    assert isinstance(n.args[0], Node)
+
+    if n.args[0].type == Dyn and isinstance(n.type, TensorType):
+        n.args[0].type = expand_to_tensor_dim(n.args[0].type, len(n.type.__args__))
+    if isinstance(n.args[0].type, TensorType):
+        output = maxpool2d_check(n.args[0].type, module_instance)
+        n.type = get_greatest_upper_bound(output, n.type)
+    return n.type
+
+
+
+def linear_check(tensor_type, module_instance):
+    """
+    Checks that an input tensor type satisfies the conditions for linear operation
+    and returns the output type based on in and out features given by module_instance
+    """
+    if len(tensor_type.__args__) >= 2:
+        if is_consistent(module_instance.in_features, tensor_type.__args__[-1]):
+            new_type_args = list(tensor_type.__args__)
+            new_type_args[-1] = module_instance.out_features
+            return TensorType(tuple(new_type_args))
+        else:
+            raise TypeError(f'Inconsistent {module_instance.in_features} and {tensor_type.__args__[-1]} in {module_instance}')
+    else:
+        raise TypeError(f'Type {tensor_type} must have rank 2 or more.')
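+
+# Illustrative example (assuming module_instance = torch.nn.Linear(10, 5)):
+#   linear_check(TensorType((2, 10)), module_instance) -> TensorType((2, 5))
+# Only the last dimension is rewritten; an inconsistency with in_features raises a TypeError.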
+
+
+@register_inference_rule(torch.nn.Linear)
+def linear_inference_rule(n: Node, module_instance):
+    """
+    Applies the shape information to the input then gets the greatest upper bound
+    of the resulting type and the existing type
+    """
+    assert isinstance(n.args[0], Node)
+    if n.args[0].type == Dyn and isinstance(n.type, TensorType):
+        n.args[0].type = expand_to_tensor_dim(n.args[0].type, len(n.type.__args__))
+    if isinstance(n.args[0].type, TensorType):
+        output_type = linear_check(n.args[0].type, module_instance)
+        n.type = get_greatest_upper_bound(output_type, n.type)
+    return n.type
+
+
+def adaptiveavgpool2d_check(tensor_type, module_instance):
+    output_size = module_instance.output_size
+    if isinstance(output_size, int):
+        output_size = [output_size, output_size]
+    elif isinstance(output_size, tuple):
+        output_size = list(output_size)
+        if output_size[0] is None:
+            output_size[0] = output_size[1]
+        if output_size[1] is None:
+            output_size[1] = output_size[0]
+
+    new_type_list = list(tensor_type.__args__)
+
+    if len(tensor_type.__args__) == 4 or len(tensor_type.__args__) == 3:
+        new_type_list[-1] = output_size[1]
+        new_type_list[-2] = output_size[0]
+
+        return TensorType(tuple(new_type_list))
+
+    else:
+        raise TypeError(f'Tensor ranks must be 3 or 4. Got {tensor_type}')
+
+@register_inference_rule(torch.nn.AdaptiveAvgPool2d)
+def adaptiveavgpool2d_inference_rule(n: Node, module_instance):
+    """
+    The input and output sizes should be the same except for the last
+    two dimensions, which are given by the module's output_size and represent height and width
+    """
+    assert isinstance(n.args[0], Node)
+    if n.args[0].type == Dyn and isinstance(n.type, TensorType):
+        n.args[0].type = expand_to_tensor_dim(n.args[0].type, len(n.type.__args__))
+    if isinstance(n.args[0].type, TensorType):
+        output_type = adaptiveavgpool2d_check(n.args[0].type, module_instance)
+        n.type = get_greatest_upper_bound(n.type, output_type)
+    return n.type
+
+def flatten_check(tensor_type, start_dim, end_dim):
+    l = len(tensor_type.__args__)
+
+    start_dim = l if start_dim == -1 else abs(start_dim)
+    end_dim = l + end_dim + 1 if end_dim < 0 else end_dim + 1
+
+    if 0 <= start_dim <= (l - 1) and 0 <= end_dim <= l and start_dim < end_dim:
+        my_args = list(tensor_type.__args__)
+        lhs = my_args[0:start_dim]
+        rhs = my_args[end_dim:]
+        mid = my_args[start_dim:end_dim]
+        if Dyn in mid:
+            mid = [Dyn]
+        else:
+            mid = [reduce(operator.mul, my_args[start_dim:end_dim])]
+        new_type_list = lhs + mid + rhs
+        return TensorType(tuple(new_type_list))
+    else:
+        raise TypeError(f'Incompatible dimensions {start_dim}, {end_dim - 1} in type {tensor_type}')
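+
+# Illustrative example:
+#   flatten_check(TensorType((2, 3, 4, 5)), start_dim=1, end_dim=-1) -> TensorType((2, 60))
+# Dimensions in the flattened range are multiplied together; if any of them is Dyn,
+# the flattened dimension becomes Dyn.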
+
+@register_inference_rule(torch.flatten)
+def flatten_inference_rule(n: Node):
+    """
+    Applies the flatten shape information to the input then gets the
+    greatest upper bound of the resulting type and the existing type
+    """
+    assert isinstance(n.args[0], Node)
+
+    # set the default start and end dims
+    start_dim = 1
+    end_dim = -1
+
+    if len(n.args) > 1:
+        assert isinstance(n.args[1], int)
+        start_dim = n.args[1]
+
+    if len(n.args) > 2:
+        assert isinstance(n.args[2], int)
+        end_dim = n.args[2]
+
+    if n.args[0].type == Dyn and isinstance(n.type, TensorType):
+        n.args[0].type = expand_to_tensor_dim(n.args[0].type, len(n.type.__args__))
+
+    if isinstance(n.args[0].type, TensorType):
+        output_type = flatten_check(n.args[0].type, start_dim, end_dim)
+        n.type = get_greatest_upper_bound(output_type , n.type)
+
+    return n.type
+
+class GraphTypeChecker:
+    def __init__(self, env, traced):
+        self.env = env
+        self.traced = traced
+
+    def type_check(self):
+        """
+        A gradual type checker for graphs
+        Effect: every node's field type will be
+        populated with a type after type-checking is done
+        """
+        graph = self.traced.graph
+
+        # type check every node with gradual type rules
+        # if any node does not type check return false
+        for n in graph.nodes:
+            self.type_check_node(n)
+        return True
+
+    def type_check_node(self, n: Node):
+        """
+        Type check a given fx node.
+        Current operations:
+        - Reshape
+        - Transpose
+        - Add
+        - Relu
+        - conv2d
+        - batchnorm2d
+        - flatten
+        - maxpool2d
+        - adaptiveavgpool2d
+        - linear
+        """
+        if n.type is None:
+            n.type = Dyn
+
+        if n.op == 'placeholder':
+            return n.type
+
+        elif n.op == 'get_attr':
+            t = get_parameter(self.traced, n.target)  # type: ignore[arg-type]
+            if isinstance(t.data, torch.Tensor):
+                n.type = TensorType(t.data.shape)
+            return n.type
+
+        elif n.op == 'call_function':
+            if n.target == getattr:
+                assert getattr in _INFERENCE_RULES
+                return _INFERENCE_RULES[n.target](n, self.traced)
+
+            elif n.target in _INFERENCE_RULES:
+                return _INFERENCE_RULES[n.target](n)
+            else:
+                raise RuntimeError(f'No inference rule registered for target {n.target}!')
+
+        elif n.op == 'call_module':
+            module_instance = self.traced.get_submodule(n.target)
+            if type(module_instance) in _INFERENCE_RULES:
+                return _INFERENCE_RULES[type(module_instance)](n, module_instance)
+            else:
+                raise RuntimeError(f'No inference rule registered for class {type(module_instance)}!')
+
+        elif n.op == 'output':
+            def get_node_type(a):
+                return a.type
+            n.type = torch.fx.node.map_arg(n.args[0], get_node_type)
+            return n.type
+
+        else:
+            raise NotImplementedError(f"Method {n.op} not yet implemented")
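+
+# Usage sketch (illustrative; `MyModule` is a placeholder for a user-defined nn.Module
+# whose placeholder nodes have been annotated with TensorType/Dyn types):
+#   traced = torch.fx.symbolic_trace(MyModule())
+#   tc = GraphTypeChecker({}, traced)
+#   tc.type_check()   # populates every node's `type` field or raises TypeError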
+
+
+@register_refinement_rule(Conv2d)
+def conv_refinement_rule(n: Node):
+    """
+    The equality constraints are between the first dimension of
+    the input and output
+    """
+    res = []
+    assert isinstance(n.args[0], Node)
+    arg_type = n.args[0].type
+    if isinstance(arg_type, TensorType) and isinstance(n.type, TensorType):
+        res = [Equality(arg_type.__args__[0], n.type.__args__[0])]
+        return res
+
+
+@register_refinement_rule(torch.nn.Linear)
+def linear_refinement_rule(n: Node):
+    """
+    The equality constraints are between the first dimension of
+    the input and output
+    """
+    res = []
+    assert isinstance(n.args[0], Node)
+    arg_type = n.args[0].type
+    if isinstance(arg_type, TensorType) and isinstance(n.type, TensorType):
+        res = [Equality(arg_type.__args__[0], n.type.__args__[0])]
+    return res
+
+@register_refinement_rule(BatchNorm2d)
+@register_refinement_rule(torch.nn.ReLU)
+def all_eq(n: Node):
+    """
+    For operations where the input shape is equal to the output shape
+    """
+    res = []
+    assert isinstance(n.args[0], Node)
+    arg_type = n.args[0].type
+    if isinstance(arg_type, TensorType) and isinstance(n.type, TensorType):
+        args1 = arg_type.__args__
+        args2 = n.type.__args__
+        res = [Equality(args1[i], args2[i]) for i in range(len(args1))]
+    return res
+
+
+@register_refinement_rule(torch.nn.AdaptiveAvgPool2d)
+@register_refinement_rule(torch.nn.MaxPool2d)
+def first_two_eq(n: Node):
+    """
+    For operations where the first two dimensions of the input and output shape
+    are equal
+    """
+    res = []
+    assert isinstance(n.args[0], Node)
+    arg_type = n.args[0].type
+    if isinstance(arg_type, TensorType) and isinstance(n.type, TensorType):
+        args1 = arg_type.__args__
+        args2 = n.type.__args__
+        res = [Equality(args1[0], args2[0]), Equality(args1[1], args2[1])]
+    return res
+
+
+@register_refinement_rule(torch.add)
+@register_refinement_rule(operator.add)
+def element_wise_eq(n: Node):
+    """
+    For element-wise operations; handles broadcasting.
+    Note that after applying broadcasting to the arguments
+    we can determine that certain dimensions have not been broadcast
+    when they are symbolically equal.
+
+    In this case, we can establish equality between those dimensions and the
+    corresponding output dimensions.
+
+    Note that it takes two iterations for this result: one iteration to establish
+    equality between certain dimensions of the operands (requiring the whole solver,
+    including unification) and another iteration to establish equality between the operands
+    and the resulting type, requiring another round of constraint generation and unification.
+    """
+    res = []
+    if isinstance(n.args[0], Node) and isinstance(n.args[1], Node):
+        arg_type1 = n.args[0].type
+        arg_type2 = n.args[1].type
+        if isinstance(arg_type1, TensorType) and isinstance(arg_type2, TensorType) and isinstance(n.type, TensorType):
+            args1, args2 = broadcast_types(arg_type1, arg_type2)
+            # by this point, we know that args1 and args2 are the same size.
+            a1 = args1.__args__
+            a2 = args2.__args__
+            a3 = n.type.__args__
+
+            # we would be here in the second iteration where we establish equality
+            # between operand type dimensions and the resulting type dimensions
+            r = []
+            for x, y, z in zip(a1, a2, a3):
+                if x == y:
+                    r.append(Equality(x, z))
+            res = r
+    return res
+
+
+@register_refinement_rule(torch.flatten)
+def flatten_refinement_rule(n: Node):
+    """
+    Generates equality constraints between the dimensions of the input and output
+    that will not be involved in the flatten operation
+    """
+    assert isinstance(n.args[0], Node)
+
+    eq_const = []
+
+    start_dim = 1
+    end_dim = -1
+
+    if len(n.args) > 1:
+        assert isinstance(n.args[1], int)
+        start_dim = n.args[1]
+
+    if len(n.args) > 2:
+        assert isinstance(n.args[2], int)
+        end_dim = n.args[2]
+
+    if isinstance(n.type, TensorType) and isinstance(n.args[0].type, TensorType):
+        l = len(n.type.__args__)
+        arg_type = n.args[0].type
+        start_dim = l if start_dim == -1 else start_dim
+        end_dim = l + end_dim + 1 if end_dim < 0 else end_dim + 1
+
+        for t1, t2 in zip(n.type.__args__[0:start_dim], arg_type.__args__[0:start_dim]):
+            eq_const.append(Equality(t1, t2))
+
+        for t1, t2 in zip(n.type.__args__[end_dim:], arg_type.__args__[end_dim:]):
+            eq_const.append(Equality(t1, t2))
+    return eq_const
+
+
+@register_algebraic_expressions_inference_rule(Conv2d)
+def conv_rule(n: Node, module_instance):
+    """
+    Represents the output in terms of an algebraic expression w.r.t.
+    the input when possible
+    """
+    assert isinstance(n.args[0], Node)
+    arg_type = n.args[0].type
+    if isinstance(arg_type, TensorType) and isinstance(n.type, TensorType):
+        w_in = arg_type.__args__[3]
+        h_in = arg_type.__args__[2]
+        h_out = calculate_out_dimension(h_in, module_instance, 0)
+        w_out = calculate_out_dimension(w_in, module_instance, 1)
+        new_type = TensorType((n.type.__args__[0], n.type.__args__[1], h_out, w_out))
+        n.type = new_type
+        return new_type
+
+class Refine:
+    """
+    Symbolic shape inference.
+    Generates constraints over type variables.
+    Currently all constraints are equality constraints.
+    """
+    def __init__(self, traced):
+        self.constraints = []
+        self.traced = traced
+        self.symbol_iter = itertools.count(start=0, step=1)
+
+    def refine(self):
+        """
+        Generates constraints for
+        every node in the graph based on
+        the operation.
+        """
+        graph = self.traced.graph
+        for n in graph.nodes:
+            self.refine_node(n)
+        return True
+
+    def symbolic_relations(self):
+        """
+        Infers algebraic relations
+        """
+        graph = self.traced.graph
+        for n in graph.nodes:
+            self.infer_symbolic_relations(n)
+        return True
+
+    def replace_dyn_with_fresh_var(self, typ):
+        """
+        Replace all unknown types with fresh type variables.
+        """
+        if typ == Dyn:
+            new_symbol = Var(next(self.symbol_iter))
+            return new_symbol
+        elif isinstance(typ, TensorType):
+            new_args = [self.replace_dyn_with_fresh_var(a) for a in typ.__args__]
+            return TensorType(tuple(new_args))
+        elif isinstance(typ, list):
+            return [self.replace_dyn_with_fresh_var(t) for t in typ]
+        elif isinstance(typ, tuple):
+            return tuple(self.replace_dyn_with_fresh_var(t) for t in typ)
+        else:
+            return typ
+
+
+    def convert_to_sympy_symbols(self, typ):
+        """
+        Replace all type variables with sympy symbols.
+        """
+        if isinstance(typ, Var):
+            return sympy.symbols(str(typ))
+        elif isinstance(typ, TensorType):
+            new_args = [self.convert_to_sympy_symbols(a) for a in typ.__args__]
+            return TensorType(tuple(new_args))
+        elif isinstance(typ, list):
+            return [self.convert_to_sympy_symbols(t) for t in typ]
+        elif isinstance(typ, tuple):
+            return tuple(self.convert_to_sympy_symbols(t) for t in typ)
+        else:
+            return typ
+
+    def refine_node(self, n: Node):
+        """
+        Returns a list of equality constraints for
+        call_module and call_function nodes.
+        Models the relation between input and output dimensions
+        using constraints in case they are both tensors.
+        All operations used in resnet50 are defined.
+        """
+        if n.type is None:
+            n.type = Dyn
+
+        n.type = self.replace_dyn_with_fresh_var(n.type)
+
+        if n.op == 'call_function':
+            if n.target in _REFINEMENT_RULES:
+                self.constraints += _REFINEMENT_RULES[n.target](n)
+            else:
+                pass
+
+        if n.op == 'call_module':
+            module_instance = self.traced.get_submodule(n.target)
+            if type(module_instance) in _REFINEMENT_RULES:
+                self.constraints += _REFINEMENT_RULES[type(module_instance)](n)
+            else:
+                pass
+
+        if n.op == 'output':
+            def get_node_type(a):
+                return a.type
+            n.type = torch.fx.node.map_arg(n.args[0], get_node_type)
+            return n.type
+
+        else:
+            pass
+
+    def infer_symbolic_relations(self, n: Node):
+        n.type = self.convert_to_sympy_symbols(n.type)
+        if n.op == 'call_function':
+            if n.target in _RULES:
+                return _RULES[n.target](n)
+            else:
+                pass
+
+        if n.op == 'call_module':
+            module_instance = self.traced.get_submodule(n.target)
+            if type(module_instance) in _RULES:
+                return _RULES[type(module_instance)](n, module_instance)
+            else:
+                pass
+
+        if n.op == 'output':
+            def get_node_type(a):
+                return a.type
+            n.type = torch.fx.node.map_arg(n.args[0], get_node_type)
+            return n.type
+
+        else:
+            pass
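+
+# Usage sketch (illustrative; assumes `traced` is a symbolically traced GraphModule
+# that has already been type-checked):
+#   r = Refine(traced)
+#   r.refine()                # fills r.constraints with Equality constraints
+#   r.symbolic_relations()    # optionally rewrites node types as sympy expressions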
+
+def get_parameter(traced, target: str):
+    """
+    Returns the parameter given by ``target`` if it exists,
+    otherwise throws an error.
+
+    See the docstring for ``get_submodule`` for a more detailed
+    explanation of this method's functionality as well as how to
+    correctly specify ``target``.
+
+    Args:
+        target: The fully-qualified string name of the Parameter
+            to look for. (See ``get_submodule`` for how to specify a
+            fully-qualified string.)
+
+    Returns:
+        torch.nn.Parameter: The Parameter referenced by ``target``
+
+    Raises:
+        AttributeError: If the target string references an invalid
+            path or resolves to something that is not an
+            ``nn.Parameter``
+    """
+    module_path, _, param_name = target.rpartition(".")
+
+    mod: torch.nn.Module = traced.get_submodule(module_path)
+
+    if not hasattr(mod, param_name):
+        raise AttributeError(mod._get_name() + " has no attribute `" + param_name + "`")
+
+    param: torch.nn.Parameter = getattr(mod, param_name)
+
+    return param
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/merge_matmul.py b/MLPY/Lib/site-packages/torch/fx/experimental/merge_matmul.py
new file mode 100644
index 0000000000000000000000000000000000000000..a14d1a9d8ab0fc86283b1fc1deacce6cd1ef2f17
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/merge_matmul.py
@@ -0,0 +1,171 @@
+import torch
+
+from torch.fx.node import Node
+from torch.fx._symbolic_trace import symbolic_trace
+from torch.fx.passes.tools_common import legalize_graph
+import itertools
+import operator
+
+from typing import Dict, List, Tuple
+
+
+def split_result_tensors(
+    result: torch.Tensor, inputs: List[torch.Tensor]
+) -> Tuple[torch.Tensor, ...]:
+    """
+    A free function for use in the merge_matmul graph transformation below that
+    splits the output from a merged matmul into the individual results for each
+    input tensor.
+
+    Arguments:
+        result: The merged matmul result tensor.
+        inputs: The list of inputs that were merged into one for the matmul.
+
+    Returns:
+        List of matmul results for each input tensor.
+    """
+    # When fx tracer is running, x.shape[0] will be torch.fx.Attribute but we
+    # need an int even when tracing
+    if isinstance(result, torch.fx.Proxy):
+        splits = [0] * len(inputs)
+    else:
+        splits = [x.shape[0] for x in inputs]
+
+    return torch.split(result, splits)
+
+
+def may_depend_on(a: Node, b: Node, search_depth: int = 6):
+    """
+    Determine if one node depends on another in a torch.fx.Graph.
+
+    Arguments:
+        a: The node that may have a dependency on b.
+        b: The node that a may have a dependency on.
+        search_depth: In the case of an indirect dependency, this function
+                        searches up to this many nodes away in search of a
+                        data dependency. If none is found, the function
+                        makes the conservative assumption that there is a
+                        dependency.
+
+    Returns:
+        True if a may depend on b, False if it definitely does not.
+    """
+    # Equivalence is defined as dependence.
+    if a == b:
+        return True
+
+    # If a has no inputs, it cannot depend on b.
+    if len(a.all_input_nodes) == 0:
+        return False
+
+    # If the search depth has been exhausted and no conclusion has been
+    # reached, assume that there is a data dependency.
+    if search_depth == 0:
+        return True
+
+    # Recursively check all inputs of a.
+    for inp in a.all_input_nodes:
+        if may_depend_on(inp, b, search_depth - 1):
+            return True
+
+    return False
+
+
+def are_nodes_independent(nodes: List[Node]):
+    """
+    Check if all of the given nodes are pairwise-data independent.
+
+    Arguments:
+        nodes: The nodes to check for data dependencies.
+
+    Returns:
+        True if no pair of nodes has a data dependency, False otherwise.
+    """
+    # For each pair in nodes:
+    for i, j in itertools.combinations(nodes, 2):
+        if may_depend_on(i, j) or may_depend_on(j, i):
+            return False
+
+    return True
+
+
+def merge_matmul(in_mod: torch.nn.Module):
+    """
+    A graph transformation that merges matrix multiplication operations that share the same right-hand
+    side operand into one large matrix multiplication.
+               ____      _________        _________
+      ----    |    |    |         |     M|  A * C  |
+    M| A  |  T| B  | * K|    C    | =    |---------|
+      ---- ,  |    |    |         |     T|  B * C  |
+       K       ----      ---------        ---------
+                K            R                R
+    """
+    gm = symbolic_trace(in_mod)
+
+    rhs_users: Dict[Node, List[Node]] = {}
+    lhs_users: Dict[Node, List[Node]] = {}
+
+    # Populate rhs_users and lhs_users - maps from LHS/RHS matrix multiply operands to
+    # the matmul of which they are the LHS/RHS.
+    for node in gm.graph.nodes:
+        if node.op != "call_function" or node.target is not torch.matmul:
+            continue
+
+        lhs, rhs = node.args
+
+        # TODO: Properly handle aliasing caused by get_attr. For now,
+        # use the attribute name as the operand if the node is a
+        # get_attr.
+        lhs = lhs.target if lhs.op == "get_attr" else lhs
+        rhs = rhs.target if rhs.op == "get_attr" else rhs
+
+        lhs_users.setdefault(lhs, []).append(node)
+        rhs_users.setdefault(rhs, []).append(node)
+
+    for rhs, mms in rhs_users.items():
+        # There must be at least two matmuls for a merge to make sense.
+        if len(mms) < 2:
+            continue
+
+        # All matmuls must not depend on each other directly or indirectly
+        # in order for the merge to be possible.
+        if not are_nodes_independent(mms):
+            continue
+
+        lhs_vals = [mm.args[0] for mm in mms]
+
+        # Merge the matmul.
+        # Collect a list of LHS operands and the single RHS operand.
+        lhs = [gm.graph.get_attr(l) if isinstance(l, str) else l for l in lhs_vals]
+        rhs = gm.graph.get_attr(rhs) if isinstance(rhs, str) else rhs
+
+        # Concatenate all the LHS operands.
+        merge_mm_cat = gm.graph.call_function(torch.cat, (lhs,), {})
+
+        # Multiply the concatenated LHS operands with the one RHS. This will produce
+        # the same results as all the individual matmuls involving rhs in the original graph,
+        # but they will all be concatenated together.
+        merge_mm = gm.graph.call_function(torch.matmul, (merge_mm_cat, rhs,), {})
+
+        # Split the result of the merged matmul using the shapes of the LHS operands
+        # to ascertain how large each chunk should be.
+        merge_mm_split = gm.graph.call_function(
+            split_result_tensors, (merge_mm, lhs), {}
+        )
+        merge_mm_res = [
+            gm.graph.call_function(operator.getitem, (merge_mm_split, out), {})
+            for out in range(len(lhs))
+        ]
+
+        # Replace all uses of the original, unmerged matmuls with the equivalent split chunk from the merged matmul.
+        for old, new in zip(mms, merge_mm_res):
+            old.replace_all_uses_with(new)
+            gm.graph.erase_node(old)
+
+        # All of the new nodes created above were inserted at the end, so we need to sort
+        # the nodes topologically to make sure all definitions precede uses.
+        legalize_graph(gm)
+
+    gm.recompile()
+    gm.graph.lint()
+    return gm
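+
+# Usage sketch (illustrative; `TwoMatmuls` is a hypothetical module computing
+# torch.matmul(a, w) and torch.matmul(b, w) with a shared right-hand side w):
+#   merged = merge_matmul(TwoMatmuls())
+# `merged` produces the same outputs via a single concatenated matmul followed by a split.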
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/meta_tracer.py b/MLPY/Lib/site-packages/torch/fx/experimental/meta_tracer.py
new file mode 100644
index 0000000000000000000000000000000000000000..143c96be65de2d0931ef26b93c6d12b3f8efe6d8
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/meta_tracer.py
@@ -0,0 +1,268 @@
+import torch
+import torch.fx
+import warnings
+import functools
+import builtins
+
+from typing import Any, Callable, Dict, Optional, Union
+
+def embedding_override(self, input):
+    return torch.empty(*input.shape, self.weight.shape[-1], device='meta')
+
+
+def nn_layernorm_override(self, input):
+    return input
+
+
+def torch_relu_override(x):
+    return x
+
+
+def torch_nn_relu_override(self, x):
+    return x
+
+
+def functional_relu_override(x, inplace=False):
+    assert not inplace, "don't support in-place functional.relu for metatensor analysis"
+    return x
+
+
+def torch_where_override(condition, x, y):
+    # torch.where returns the broadcasted tensor of condition, x, and y,
+    # so hack it by using addition
+    return condition.to(device='meta') + x.to(device='meta') + y.to(device='meta')
+
+
+def torch_abs_override(input, *, out=None):
+    assert out is None, "Don't support in-place abs for MetaTensor analysis"
+    return input
+
+manual_meta_overrides : Dict[Callable, Callable] = {
+    torch.nn.Embedding: embedding_override,
+    torch.nn.LayerNorm: nn_layernorm_override,
+    torch.relu: torch_relu_override,
+    torch.nn.functional.relu: functional_relu_override,
+    torch.nn.ReLU: torch_nn_relu_override,
+    torch.where: torch_where_override,
+    torch.abs: torch_abs_override,
+}
+
+def gen_constructor_wrapper(target):
+    @functools.wraps(target)
+    def wrapper(*args, **kwargs):
+        proxy = None
+
+        def check_has_proxy(v):
+            if isinstance(v, torch.fx.Proxy):
+                nonlocal proxy
+                proxy = v
+        torch.fx.node.map_aggregate(args, check_has_proxy)
+        torch.fx.node.map_aggregate(kwargs, check_has_proxy)
+
+        if proxy is not None:
+            return proxy.tracer.create_proxy('call_function', target, args, kwargs)
+        else:
+            return target(*args, **kwargs)
+    return wrapper, target
+
+class MetaProxy(torch.fx.Proxy):
+    def install_tensor_meta(self, tensor_meta):
+        self._tensor_meta = tensor_meta
+
+    def size(self, dim=None):
+        if hasattr(self, '_tensor_meta') and self._tensor_meta is not None:
+            return self._tensor_meta.size(*[dim] if dim else [])
+        return self.tracer.create_proxy('call_method', 'size', (self, dim) if dim else (self,), {})
+
+    def dim(self):
+        if hasattr(self, '_tensor_meta') and self._tensor_meta is not None:
+            return self._tensor_meta.dim()
+        return self.tracer.create_proxy('call_method', 'dim', (self,), {})
+
+    @property
+    def shape(self):
+        if hasattr(self, '_tensor_meta') and self._tensor_meta is not None:
+            return self._tensor_meta.shape
+        return self.tracer.create_proxy('call_function', builtins.getattr, (self, 'shape'), {})
+
+    @property
+    def dtype(self):
+        if hasattr(self, '_tensor_meta') and self._tensor_meta is not None:
+            return self._tensor_meta.dtype
+        return self.tracer.create_proxy('call_function', builtins.getattr, (self, 'dtype'), {})
+
+    @property
+    def device(self):
+        # Hack so we can track when devices are used. During meta-tensor propagation,
+        # replace these values with a constant 'meta'
+        return MetaDeviceAttribute(self, 'device')
+
+    def __getattr__(self, k):
+        if k == '_tensor_meta':
+            return self.__getattribute__(k)
+        # note: not added to the graph yet, if this is a method call
+        # we peephole optimize to the method invocation
+        return MetaAttribute(self, k)
+
+class MetaAttribute(MetaProxy):
+    def __init__(self, root, attr: str):
+
+        self.root = root
+        self.attr = attr
+        self.tracer = root.tracer
+        self._node = None
+
+    @property
+    def node(self):
+        # the node for attributes is added lazily, since most will just be method calls
+        # which do not rely on the getitem call
+        if self._node is None:
+            self._node = self.tracer.create_proxy('call_function', getattr, (self.root, self.attr), {}).node
+        return self._node
+
+    def __call__(self, *args, **kwargs):
+        return self.tracer.create_proxy('call_method', self.attr, (self.root,) + args, kwargs)
+
+class MetaDeviceAttribute(MetaAttribute):
+    pass
+
+def proxys_to_metas(v):
+    if isinstance(v, MetaDeviceAttribute):
+        return 'meta'
+    if isinstance(v, torch.fx.Proxy):
+        assert isinstance(v, MetaProxy), f'Expected MetaProxy but got {type(v)}'
+        assert hasattr(v, '_tensor_meta'), 'MetaProxy does not have an associated meta'
+        return v._tensor_meta
+    return v
+
+class MetaTracer(torch.fx.Tracer):
+    allow_insert_stateless_mods : bool = True
+
+    _TORCH_METHODS_TO_PATCH = ['arange', 'zeros', 'ones', 'full_like', 'eye']
+
+    def create_proxy(self, kind, target, args, kwargs, name=None, type_expr=None, proxy_factory_fn=None):
+        rv = super().create_proxy(kind, target, args, kwargs, name, type_expr, proxy_factory_fn)
+
+        if kind == 'placeholder' and target in self.meta_args:
+            rv.install_tensor_meta(self.meta_args[target])
+            return rv
+
+        if target in self.orig_fns:
+            # NOTE: tensor constructors in PyTorch define the `device` argument as
+            # *keyword-only*. That is why this works. If you add methods to
+            # _TORCH_METHODS_TO_PATCH that do not define `device` as keyword-only,
+            # this will break and you will likely see issues where we cannot infer
+            # the size of the output.
+            if 'device' in kwargs:
+                kwargs['device'] = 'meta'
+
+        try:
+            args_metas = torch.fx.node.map_aggregate(args, proxys_to_metas)
+            kwargs_metas = torch.fx.node.map_aggregate(kwargs, proxys_to_metas)
+
+            if kind == 'call_function':
+                meta_target = manual_meta_overrides.get(target, target)
+                meta_out = meta_target(*args_metas, **kwargs_metas)
+            elif kind == 'call_method':
+                meta_out = getattr(args_metas[0], target)(*args_metas[1:], **kwargs_metas)
+            elif kind == 'call_module':
+                assert hasattr(self, 'orig_forward')
+                self._disable_module_getattr = True
+                try:
+                    mod = self.root.get_submodule(target)
+                    mod_type = type(mod)
+                    if mod_type in manual_meta_overrides:
+                        meta_out = manual_meta_overrides[mod_type](mod, *args_metas, **kwargs_metas)
+                    else:
+                        meta_out = self.orig_forward(*args_metas, **kwargs_metas)
+                finally:
+                    self._disable_module_getattr = False
+            elif kind == 'get_attr':
+                self._disable_module_getattr = True
+                try:
+                    attr_itr = self.root
+                    atoms = target.split('.')
+                    for atom in atoms:
+                        attr_itr = getattr(attr_itr, atom)
+                    assert isinstance(attr_itr, torch.Tensor)
+                    meta_out = attr_itr.to(device='meta')
+                finally:
+                    self._disable_module_getattr = False
+            else:
+                return rv
+
+            # TODO
+            assert isinstance(rv, torch.fx.Proxy), "Don't support composite output yet"
+            rv.install_tensor_meta(meta_out)
+        except Exception as e:
+            warnings.warn(f'Could not compute metadata for {kind} target {target}: {e}')
+
+        return rv
+
+    def getattr(self, attr, attr_val, parameter_proxy_cache):
+        if getattr(self, '_disable_module_getattr', False):
+            return attr_val
+        else:
+            return super().getattr(attr, attr_val, parameter_proxy_cache)
+
+    def call_module(self, m, forward, args, kwargs):
+        self.orig_forward = forward
+        return super().call_module(m, forward, args, kwargs)
+
+    def _insert_module_as_submodule(self, mod: torch.nn.Module) -> str:
+        """
+        Helper method which tries to insert a module that was not declared as submodule.
+        """
+        idx = 0
+        mod_name = mod.__class__.__name__.lower()
+        path = f"{mod_name}_{idx}"
+        while hasattr(self.root, path):
+            path = f"{mod_name}_{idx}"
+            idx += 1
+
+        self.root.add_module(path, mod)
+        return path
+
+    def path_of_module(self, mod: torch.nn.Module) -> str:
+        try:
+            return super().path_of_module(mod)
+        except NameError as e:
+            if self.allow_insert_stateless_mods and len(list(mod.parameters())) == 0 and len(list(mod.buffers())) == 0:
+                path = self._insert_module_as_submodule(mod)
+                self.prev_module = path
+                return path
+            raise
+
+    def proxy(self, node):
+        return MetaProxy(node, self)
+
+    def trace(self, root, meta_args : Dict[str, torch.Tensor], concrete_args=None):
+        assert isinstance(meta_args, dict)
+        self.meta_args = meta_args
+
+        self.patched_torch_methods = {
+            target: gen_constructor_wrapper(getattr(torch, target)) for target in self._TORCH_METHODS_TO_PATCH
+        }
+        self.orig_fns = set()
+
+        for name, (wrapper, orig) in self.patched_torch_methods.items():
+            setattr(torch, name, wrapper)
+            self.orig_fns.add(orig)
+
+        try:
+            graph = super().trace(root, concrete_args)
+            graph._tracer_extras = {'meta_args': meta_args}
+            return graph
+        finally:
+            for name, (_, orig) in self.patched_torch_methods.items():
+                setattr(torch, name, orig)
+
+
+def symbolic_trace(root : Union[torch.nn.Module, Callable[..., Any]],
+                   meta_args : Optional[Dict[str, torch.Tensor]] = None,
+                   concrete_args: Optional[Dict[str, Any]] = None) -> torch.fx.GraphModule:
+    tracer = MetaTracer()
+    graph = tracer.trace(root, meta_args, concrete_args)
+    name = root.__class__.__name__ if isinstance(root, torch.nn.Module) else root.__name__
+    gm = torch.fx.GraphModule(tracer.root, graph, name)
+    return gm
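+
+# Usage sketch (illustrative; `MyModule` is a placeholder for a module taking one
+# tensor argument named `x`):
+#   gm = symbolic_trace(MyModule(), meta_args={'x': torch.empty(2, 3, device='meta')})
+# Shape and dtype information is propagated through meta tensors during tracing, so
+# data-dependent accesses such as `x.shape` resolve to concrete values.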
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/__init__.py b/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5ecd695974a387abdb7fcb3618ddd14e6b82b0ef
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/__pycache__/constraint.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/__pycache__/constraint.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c4fc77613d9b6045f052969042946aa8eb3f37fc
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/__pycache__/constraint.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/__pycache__/constraint_generator.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/__pycache__/constraint_generator.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5ef26ebb6e7c993de92b1ee7f91f47c12bc01021
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/__pycache__/constraint_generator.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/__pycache__/constraint_transformation.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/__pycache__/constraint_transformation.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f35becf9565832439eca13c5188318830d7c1363
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/__pycache__/constraint_transformation.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/__pycache__/operation.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/__pycache__/operation.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..13418d3a9018626d3394e019432391c09c10cfac
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/__pycache__/operation.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/__pycache__/transform_to_z3.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/__pycache__/transform_to_z3.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..efa521704e2d028ad3625871d8b5dce1d692b871
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/__pycache__/transform_to_z3.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/__pycache__/util.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/__pycache__/util.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6ac5b598e7b80738631efed7315124e4267edce2
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/__pycache__/util.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/__pycache__/z3_types.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/__pycache__/z3_types.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c82ea5aebef10983a91ff568ebc68e40ca02d280
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/__pycache__/z3_types.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/constraint.py b/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/constraint.py
new file mode 100644
index 0000000000000000000000000000000000000000..b187f07878befdc1ef027fa35bbf83f3d542040a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/constraint.py
@@ -0,0 +1,557 @@
+from torch.fx.experimental.migrate_gradual_types.operation import op_add, op_sub, op_mul, op_div, \
+    op_mod, op_gt, op_lt, op_neq, op_eq
+from torch.fx.tensor_type import TensorType, Dyn
+
+
+class Constraint:
+    pass
+
+
+class Conj(Constraint):
+    def __init__(self, conjuncts):
+        """
+        :param conjuncts: Conjunction of constraints
+        """
+        self.conjucts = conjuncts
+
+    def __eq__(self, other):
+        if isinstance(other, Conj):
+            return self.conjucts == other.conjucts
+        else:
+            return False
+
+    def __repr__(self):
+        return f'And({self.conjucts})'
+
+
+class Disj(Constraint):
+    def __init__(self, disjuncts):
+        """
+        :param disjuncts: Disjunction of constraints
+        """
+        self.disjuncts = disjuncts
+
+    def __eq__(self, other):
+        if isinstance(other, Disj):
+            return self.disjuncts == other.disjuncts
+        else:
+            return False
+
+    def __repr__(self):
+        return f'Or({self.disjuncts})'
+
+
+class Prod(Constraint):
+    def __init__(self, products):
+        """
+        :param products: lists of dimensions to multiply
+        """
+        self.products = products
+
+    def __eq__(self, other):
+        if isinstance(other, Prod):
+            return self.products == other.products
+        else:
+            return False
+
+    def __repr__(self):
+        return f'Product({self.products})'
+
+
+class T(Constraint):
+    """
+    True
+    """
+    def __init__(self):
+        pass
+
+    def __eq__(self, other):
+        return isinstance(other, T)
+
+    def __repr__(self):
+        return 'True'
+
+class F(Constraint):
+    """
+    False
+    """
+    def __init__(self):
+        pass
+
+    def __eq__(self, other):
+        return isinstance(other, F)
+
+    def __repr__(self):
+        return 'False'
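+
+# Illustrative example: constraints compose structurally and render readably:
+#   repr(Conj([T(), Disj([F(), T()])]))  ->  'And([True, Or([False, True])])'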
+
+
+class BinaryConstraint(Constraint):
+    """
+    Represents all binary operations
+    """
+    def __init__(self, lhs, rhs, op):
+        """
+        :param lhs: lhs of the constraint
+        :param rhs: rhs of the constraint
+        :param op: string representing the operation
+        """
+        self.lhs = lhs
+        self.rhs = rhs
+        self.op = op
+
+    def __eq__(self, other):
+        if isinstance(other, BinaryConstraint):
+            return self.lhs == other.lhs and self.rhs == other.rhs and self.op == other.op
+        else:
+            return False
+
+    def __repr__(self):
+        return f'({self.lhs} {self.op} {self.rhs})'
+
+
+class BinConstraintT(BinaryConstraint):
+    """
+    Binary constraints about tensors
+    """
+    def __init__(self, lhs, rhs, op):
+        assert (isinstance(lhs, (TVar, TensorType, int)) or lhs == Dyn) and \
+               (isinstance(rhs, (TVar, TensorType, int)) or rhs == Dyn)
+        super().__init__(lhs, rhs, op)
+
+    def __eq__(self, other):
+        return super().__eq__(other)
+
+
+class BinConstraintD(BinaryConstraint):
+    """
+    Binary constraints about dimensions
+    """
+    def __init__(self, lhs, rhs, op):
+        assert is_algebraic_expression(lhs) or is_dim(lhs) or is_bool_expr(lhs)
+        assert is_algebraic_expression(rhs) or is_dim(rhs) or is_bool_expr(rhs)
+
+        super().__init__(lhs, rhs, op)
+
+    def __eq__(self, other):
+        return super().__eq__(other)
+
+
+
+class TGreatestUpperBound(Constraint):
+    """
+    Greatest Upper bound for tensors with dynamic type
+    """
+    def __init__(self, res, rhs1, rhs2):
+        """
+        :param res: tensor variable that stores the result of the operation
+        :param rhs1: tensor or tensor variable
+        :param rhs2: tensor or tensor variable
+        """
+        self.res = res
+        self.rhs1 = rhs1
+        self.rhs2 = rhs2
+
+    def __repr__(self):
+        return f'{self.res} = {self.rhs1}⊔*{self.rhs2}'
+
+    def __eq__(self, other):
+        if isinstance(other, TGreatestUpperBound):
+            return self.res == other.res and self.rhs1 == other.rhs1 and self.rhs2 == other.rhs2
+        else:
+            return False
+
+
+class DGreatestUpperBound(Constraint):
+    """
+    Greatest Upper bound for dimensions
+    """
+    def __init__(self, res, rhs1, rhs2):
+        """
+        :param res: Dimension variable to store the result
+        :param rhs1: dimension variable 1
+        :param rhs2: dimension variable 2
+        """
+        assert is_dim(res)
+        assert is_dim(rhs1)
+        assert is_dim(rhs2)
+
+        self.res = res
+        self.rhs1 = rhs1
+        self.rhs2 = rhs2
+
+    def __repr__(self):
+        return f'{self.res} = {self.rhs1}⊔{self.rhs2}'
+
+    def __eq__(self, other):
+        if isinstance(other, DGreatestUpperBound):
+            return self.res == other.res and self.rhs1 == other.rhs1 and self.rhs2 == other.rhs2
+        else:
+            return False
+
+
+class CanReshape(Constraint):
+    """
+    can_reshape constraint
+    """
+    def __init__(self, src, target):
+        """
+        :param src: tensor variable
+        :param target: tensor
+        """
+        self.src = src
+        self.target = target
+
+    def __repr__(self):
+        return f'can-reshape({self.src}, {self.target})'
+
+    def __eq__(self, other):
+        if isinstance(other, CanReshape):
+            return self.src == other.src and self.target == other.target
+        else:
+            return False
+
+
+class IndexSelect(Constraint):
+
+    def __init__(self, tensor_size, input_var, dim_replace, index, output):
+        """
+        Args:
+            input_var: input to index_select
+            tensor_size: tensor size we are considering
+            dim_replace: the dimension of the output at "index"
+            index: location of the dimension to replace in the input
+            output: variable to store the result
+        """
+        assert isinstance(input_var, TVar)
+        assert isinstance(output, TVar)
+        assert isinstance(dim_replace, DVar) or dim_replace == Dyn
+        assert isinstance(index, int)
+
+        self.input_var = input_var
+        self.tensor_size = tensor_size
+        self.dim_replace = dim_replace
+        self.index = index
+        self.output = output
+
+    def __repr__(self):
+
+        return f' {self.output} = ' \
+               f'IndexSelect({self.input_var}, ' \
+               f'tensor_size: {self.tensor_size}, ' \
+               f'{self.dim_replace}, ' \
+               f'{self.index})'
+
+    def __eq__(self, other):
+        if isinstance(other, IndexSelect):
+            return self.tensor_size == other.tensor_size and \
+                self.dim_replace == other.dim_replace and \
+                self.index == other.index and \
+                self.output == other.output and \
+                self.input_var == other.input_var
+        else:
+            return False
+
+
+class Transpose(Constraint):
+
+    def __init__(self, tensor_size, input_var, index1, index2, output):
+        """
+        Args:
+            tensor_size: current tensor size
+            input_var: variable to hold input
+            index1: dimension 1
+            index2: dimension 2
+            output: output that stores result
+        """
+        assert isinstance(input_var, TVar)
+        assert isinstance(output, TVar)
+        assert isinstance(index1, int)
+        assert isinstance(index2, int)
+
+        self.input_var = input_var
+        self.tensor_size = tensor_size
+        self.index1 = index1
+        self.index2 = index2
+        self.output = output
+
+    def __repr__(self):
+
+        return f' {self.output} = ' \
+               f'Transpose({self.input_var}, ' \
+               f'tensor_size: {self.tensor_size}, ' \
+               f'{self.index1}, ' \
+               f'{self.index2})'
+
+    def __eq__(self, other):
+        if isinstance(other, Transpose):
+            return self.tensor_size == other.tensor_size and \
+                self.index1 == other.index1 and \
+                self.index2 == other.index2 and \
+                self.output == other.output and \
+                self.input_var == other.input_var
+        else:
+            return False
+
+
+class GetItem(Constraint):
+
+    def __init__(self, tensor_size, index, res, input_var):
+        """
+        Constraint for getting an item given a tensor size
+        :param tensor_size: actual number representing the rank
+        :param index: actual number representing the index
+        :param res: dimension variable to carry the item we get
+        :param input_var: a tensor variable from which we will get item
+        """
+        assert isinstance(res, DVar)
+
+        self.res = res
+        self.tensor_size = tensor_size
+        self.index = index
+        self.input_var = input_var
+
+    def __repr__(self):
+        return f' {self.res} = GetItem({self.input_var}, tensor_size: {self.tensor_size}, {self.index})'
+
+    def __eq__(self, other):
+        if isinstance(other, GetItem):
+            return self.res == other.res and \
+                self.tensor_size == other.tensor_size and \
+                self.index == other.index and \
+                self.input_var == other.input_var
+        else:
+            return False
+
+class GetItemTensor(Constraint):
+
+    def __init__(self, tensor_size, index_tuple, res, input_var):
+        """
+        Constraint for getting an item given a tensor size.
+        When the index argument is a tuple, the result is
+        expected to be a tensor rather than a dimension.
+        :param tensor_size: actual number representing the rank
+        :param index_tuple: tuple for indexing
+        :param res: tensor variable to carry the item we get
+        :param input_var: a tensor variable from which we will get item
+        """
+        assert isinstance(res, TVar)
+
+        self.res = res
+        self.tensor_size = tensor_size
+        self.index_tuple = index_tuple
+        self.input_var = input_var
+
+    def __repr__(self):
+        return f' {self.res} = GetItemT({self.input_var}, tensor_size: {self.tensor_size}, {self.index_tuple})'
+
+    def __eq__(self, other):
+        if isinstance(other, GetItemTensor):
+            return self.res == other.res and \
+                self.tensor_size == other.tensor_size and \
+                self.index_tuple == other.index_tuple and \
+                self.input_var == other.input_var
+        else:
+            return False
+
+class CalcConv(Constraint):
+
+    def __init__(self, conv_result, input_var, c_out, kernel, padding, stride, dilation, matching_constraint_vars):
+        """
+        :param conv_result: the convolution result
+        :param input_var: input to convolution
+        :param c_out: output channel type
+        :param kernel: kernel tuple
+        """
+        self.conv_result = conv_result
+        self.input_var = input_var
+        self.c_out = c_out
+        self.kernel = kernel
+        self.padding = padding
+        self.stride = stride
+        self.dilation = dilation
+        self.matching_constraint = matching_constraint_vars
+
+    def __repr__(self):
+        return f'{self.conv_result} =' \
+               f' calc-conv({self.input_var},' \
+               f' {self.c_out}, {self.kernel}, ' \
+               f'{self.padding}, {self.stride},' \
+               f' {self.dilation})'
+
+    def __eq__(self, other):
+        if isinstance(other, CalcConv):
+            return self.conv_result == other.conv_result and self.input_var == other.input_var and \
+                self.c_out == other.c_out and self.kernel == other.kernel and self.padding == other.padding \
+                and self.stride == other.stride and self.dilation == other.dilation \
+                and self.matching_constraint == other.matching_constraint
+        else:
+            return False
+
+
+class CalcMaxPool(Constraint):
+
+    def __init__(self, maxpool_result, input_var, kernel, padding, stride, dilation, matching_constraint_vars):
+        """
+        :param maxpool_result: the result of maxpool
+        :param input_var: input to maxpool
+        :param kernel: kernel tuple
+        """
+        self.maxpool_result = maxpool_result
+        self.input_var = input_var
+        self.kernel = kernel
+        self.padding = padding
+        self.stride = stride
+        self.dilation = dilation
+        self.matching_constraint = matching_constraint_vars
+
+    def __repr__(self):
+        return f'{self.maxpool_result} =' \
+               f' calc-maxpool({self.input_var},' \
+               f'  {self.kernel}, ' \
+               f'{self.padding}, {self.stride},' \
+               f' {self.dilation})'
+
+    def __eq__(self, other):
+        if isinstance(other, CalcMaxPool):
+            return self.maxpool_result == other.maxpool_result and self.input_var == other.input_var \
+                and self.kernel == other.kernel and self.padding == other.padding \
+                and self.stride == other.stride and self.dilation == other.dilation \
+                and self.matching_constraint == other.matching_constraint
+        else:
+            return False
+
+
+class ApplyBroadcasting(Constraint):
+    def __init__(self, res1, res2, input1, input2):
+        """
+        :param res1: resulting tensor 1
+        :param res2: resulting tensor 2
+        :param input1: tensor variable 1
+        :param input2: tensor variable 2
+        """
+        self.res1 = res1
+        self.res2 = res2
+        self.input1 = input1
+        self.input2 = input2
+
+    def __eq__(self, other):
+        if isinstance(other, ApplyBroadcasting):
+            return self.res1 == other.res1 \
+                and self.res2 == other.res2 \
+                and self.input1 == other.input1 \
+                and self.input2 == other.input2
+        else:
+            return False
+
+    def __repr__(self):
+        return f'{self.res1}, {self.res2} = apply-broadcasting({self.input1}, {self.input2})'
+
+
+class CalcProduct(Constraint):
+    """
+    Given correct dimensions, calculate the product for flatten accounting for Dyn
+    """
+    def __init__(self, start, end, flattened, dims_to_flatten):
+        """
+        :param start: start index
+        :param end: end index
+        :param flattened: variable to store the product
+        :param dims_to_flatten: the dimensions that we will flatten
+        """
+        assert isinstance(dims_to_flatten, list)
+        assert isinstance(flattened, TVar)
+        assert isinstance(start, int)
+        assert isinstance(end, int)
+
+        self.start = start
+        self.end = end
+        self.dims_to_flatten = dims_to_flatten
+        self.flattened = flattened
+
+    def __eq__(self, other):
+        if isinstance(other, CalcProduct):
+            return self.start == other.start and self.end == other.end and \
+                self.dims_to_flatten == other.dims_to_flatten and self.flattened == other.flattened
+
+        else:
+            return False
+
+    def __repr__(self):
+        return f'{self.flattened} = CalcProduct({self.start}, {self.end}, {self.dims_to_flatten})'
+
+
+class TVar:
+    """
+    Tensor variable with no tensor constructor
+    """
+    def __init__(self, tvar):
+        """
+        :param tvar: tensor variable
+        """
+        self.tvar = tvar
+
+    def __repr__(self):
+        return f'TV({self.tvar})'
+
+    def __eq__(self, other):
+        if isinstance(other, TVar):
+            return self.tvar == other.tvar
+        else:
+            return False
+
+
+class DVar:
+    """
+    Dimension variable
+    """
+    def __init__(self, c):
+        """
+        :param c: character or number
+        """
+        self.c = c
+
+    def __repr__(self):
+        return f'DV({self.c})'
+
+    def __eq__(self, other):
+        if isinstance(other, DVar):
+            return self.c == other.c
+        else:
+            return False
+
+
+class BVar:
+    """
+    Boolean variable
+    """
+    def __init__(self, c):
+        """
+        :param c: character or number
+        """
+        self.c = c
+
+    def __repr__(self):
+        return f'BV({self.c})'
+
+    def __eq__(self, other):
+        if isinstance(other, BVar):
+            return self.c == other.c
+        else:
+            return False
+
+
+def is_algebraic_expression(constraint):
+    if isinstance(constraint, BinConstraintD):
+        return constraint.op in [op_add, op_sub, op_div, op_mul, op_mod]
+    else:
+        return isinstance(constraint, Prod)
+
+
+def is_bool_expr(constraint):
+    if isinstance(constraint, BinConstraintD):
+        return constraint.op in [op_gt, op_lt, op_neq, op_eq]
+    else:
+        return isinstance(constraint, (BVar, Conj, Disj))
+
+def is_dim(d):
+    return isinstance(d, (DVar, int)) or d == Dyn
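+
+
+# Hedged usage sketch for the predicates above (illustrative helper, never called by
+# the solver): is_dim accepts dimension variables, ints and Dyn, while algebraic and
+# boolean expressions are recognized by the operation stored on a BinConstraintD.
+def _example_predicates():
+    d = DVar('d')
+    assert is_dim(d) and is_dim(3) and is_dim(Dyn)
+    assert is_algebraic_expression(BinConstraintD(d, 2, op_mul))
+    assert is_bool_expr(BinConstraintD(d, 2, op_lt))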
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/constraint_generator.py b/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/constraint_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a25bf62c824b97eb23852526a57baff91e5e517
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/constraint_generator.py
@@ -0,0 +1,1279 @@
+import torch
+import operator
+import warnings
+from typing import Callable, Dict, Iterable
+
+from torch.fx._symbolic_trace import _assert_is_none
+from torch.fx.experimental.migrate_gradual_types.constraint import ApplyBroadcasting, CalcProduct, \
+    Disj, TGreatestUpperBound, CalcMaxPool, CalcConv, Conj, BinConstraintT, CanReshape, BinConstraintD, GetItem, T, F, \
+    TVar, DVar, GetItemTensor, IndexSelect, Transpose, DGreatestUpperBound
+from torch.fx.experimental.migrate_gradual_types.operation import \
+    op_eq, op_matching, op_consistency, op_leq, op_precision, op_gt, op_div, op_sub, op_neq, op_lt, op_add, op_mul
+from torch.fx.node import Target, Node
+from torch.fx.experimental.migrate_gradual_types.util import gen_tensor_dims, gen_nat_constraints, gen_dvar, gen_tvar, \
+    gen_bvar
+
+from torch.fx.tensor_type import Dyn, TensorType
+from torch.nn.modules.conv import Conv2d
+from torch.nn.modules.batchnorm import BatchNorm2d
+
+_INFERENCE_RULES: Dict[Target, Callable] = {}
+
+MAX_TENSOR_RANK = 4
+
+def register_inference_rule(call_target):
+    def register(fn):
+        if call_target in _INFERENCE_RULES:
+            raise RuntimeError(f'Inference rule already registered for {call_target}!')
+        _INFERENCE_RULES[call_target] = fn
+        return fn
+    return register
+
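+
+# Hedged sketch of how a rule is registered (the target `my_op` below is hypothetical;
+# the real rules for torch/operator targets follow). A rule receives the node, the
+# symbol table, the constraints accumulated so far and the fresh-variable counter, and
+# returns a list of constraints together with the updated counter:
+#
+#   @register_inference_rule(my_op)
+#   def my_op_inference_rule(n: Node, symbols, constraints, counter):
+#       output, counter = gen_tvar(counter)
+#       symbols[n] = output
+#       return [BinConstraintT(symbols[n.args[0]], output, op_eq)], counter
+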
+
+def generate_flatten_constraints(start_dim, end_dim, input, flattened, n, counter):
+    d, counter = gen_tensor_dims(n, counter)
+    c1 = BinConstraintT(input, TensorType(d), op_eq)
+    start_dim = n if start_dim == -1 else abs(start_dim)
+    end_dim = n + end_dim + 1 if end_dim < 0 else end_dim + 1
+    c2 = CalcProduct(start_dim, end_dim, flattened, d)
+    nat_constraints = gen_nat_constraints(d)
+    return Conj([c1, c2, *nat_constraints]), counter
+
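+# Worked example for the index normalization above (hedged, illustrative): for a
+# rank-4 input with start_dim=1 and end_dim=-1, start_dim stays abs(1) = 1 and
+# end_dim becomes 4 + (-1) + 1 = 4, so CalcProduct is asked for the product of the
+# generated dims d between indices 1 and 4.
+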
+
+@register_inference_rule(getattr)
+def get_attr_inference_rule(n: Node, symbols, constraints, counter):
+    """
+    If the attribute is "device" then the tensor shape is preserved
+    """
+    assert isinstance(n.args[0], Node)
+    assert isinstance(n.args[1], str)
+    output, counter = gen_tvar(counter)
+    symbols[n] = output
+
+    input = symbols[n.args[0]]
+    attr = n.args[1]
+
+    if attr == 'device':
+        return [BinConstraintT(input, output, op_eq)], counter
+    else:
+        raise NotImplementedError('Not yet implemented')
+
+@register_inference_rule(torch.bmm)
+def bmm_inference_rule(n: Node, symbols, constraints, counter):
+    """
+    Constraints that match the input to a size 3 tensor
+    and switch the dimensions according to the rules
+    of batch multiplication
+    """
+    assert isinstance(n.args[0], Node)
+    assert isinstance(n.args[1], Node)
+
+    bmm_output, counter = gen_tvar(counter)
+    symbols[n] = bmm_output
+
+    bmm_input1 = symbols[n.args[0]]
+    bmm_input2 = symbols[n.args[1]]
+
+    dims_input1, counter = gen_tensor_dims(3, counter)
+    dims_input2, counter = gen_tensor_dims(3, counter)
+
+    inputs_dyn = Conj([BinConstraintT(bmm_input1, Dyn, op_eq),
+                       BinConstraintT(bmm_input2, Dyn, op_eq),
+                       BinConstraintT(bmm_output, Dyn, op_eq)])
+
+    input1_dyn = Conj([BinConstraintT(bmm_input1, Dyn, op_eq),
+                       BinConstraintT(bmm_input2, TensorType(dims_input2), op_eq),
+                       BinConstraintT(bmm_output, TensorType([dims_input2[0], Dyn, dims_input2[2]]), op_eq)])
+
+    input2_dyn = Conj([BinConstraintT(bmm_input2, Dyn, op_eq),
+                       BinConstraintT(bmm_input1, TensorType(dims_input1), op_eq),
+                       BinConstraintT(bmm_output, TensorType([dims_input1[0], dims_input1[1], Dyn]), op_eq)])
+
+    consistency_constraints = [BinConstraintD(dims_input1[0], dims_input2[0], op_consistency)]
+
+    batch_size, counter = gen_dvar(counter)
+
+    inputs_are_tensors = Conj([BinConstraintT(bmm_input1, TensorType(dims_input1), op_eq),
+                               BinConstraintT(bmm_input2, TensorType(dims_input2), op_eq),
+                               BinConstraintT(bmm_output, TensorType([batch_size, dims_input1[1], dims_input2[2]]), op_eq),
+                               *consistency_constraints, DGreatestUpperBound(batch_size, dims_input1[0], dims_input2[0])])
+
+    return [Disj([inputs_dyn, input1_dyn, input2_dyn, inputs_are_tensors])], counter
+
+
+@register_inference_rule("index_select")
+def index_select_inference_rule(n: Node, symbols, constraints, counter):
+    """
+    We constrain the second argument to a vector or Dyn.
+    The output replaces the input with the shape of the vector
+    at the position given by the index (first argument)
+    """
+    assert isinstance(n.args[0], Node)
+    assert isinstance(n.args[1], int)
+    assert isinstance(n.args[2], Node)
+
+
+
+    index_select, counter = gen_tvar(counter)
+    symbols[n] = index_select
+
+    dims, counter = gen_tensor_dims(1, counter)
+
+    # equality constraint
+    is_size_1 = BinConstraintT(symbols[n.args[2]], TensorType(dims), op_eq)
+    is_dyn = BinConstraintT(symbols[n.args[2]], Dyn, op_eq)
+
+    c2 = Conj([is_size_1, Disj([IndexSelect(i + 1, symbols[n.args[0]], dims[0], n.args[1], index_select)
+                                for i in range(MAX_TENSOR_RANK)])])
+    c3 = Conj([is_dyn, Disj([IndexSelect(i + 1, symbols[n.args[0]], Dyn, n.args[1], index_select)
+                             for i in range(MAX_TENSOR_RANK)])])
+
+    return [Disj([c2, c3])], counter
+
+
+@register_inference_rule("expand")
+def expand_inference_rule(n: Node, symbols, constraints, counter):
+    """
+    We generate the same constraints as we do for tensor addition, but we constrain
+    the rank of this expression to be equal to len(n.args[1:]) so that only
+    those cases get considered for the output
+    """
+    assert isinstance(n.args[0], Node)
+
+    # define the output for expand
+    expand, counter = gen_tvar(counter)
+    symbols[n] = expand
+
+    # since we do not have two nodes here, we will construct an argument variable
+    e1 = symbols[n.args[0]]
+    e2, counter = gen_tvar(counter)
+
+    e2_nat_constraints = []
+    for arg in n.args[1:]:
+        assert isinstance(arg, (Node, int))
+        if isinstance(arg, Node):
+            assert isinstance(symbols[arg], DVar)
+            e2_nat_constraints.append(BinConstraintD(0, symbols[arg], op_leq))
+
+    e2_constraint = BinConstraintT(e2, TensorType([arg if isinstance(arg, int) else symbols[arg] for arg in n.args[1:]]), op_eq)
+
+    constraints, counter = gen_broadcasting_constraints(e1, e2, symbols, counter, expand)
+
+    # constraint the output size
+    dims, counter = gen_tensor_dims(len(n.args[1:]), counter)
+    nat_constraints = gen_nat_constraints(dims)
+    c = [BinConstraintT(expand, TensorType(dims), op_eq), *nat_constraints, e2_constraint, *e2_nat_constraints]
+    constraints += c
+
+    return constraints, counter
+
+
+@register_inference_rule(torch.nn.functional.gelu)
+@register_inference_rule(torch.nn.functional.dropout)
+@register_inference_rule(torch.nn.functional.softmax)
+@register_inference_rule("detach")
+@register_inference_rule("to")
+@register_inference_rule("int")
+@register_inference_rule("long")
+@register_inference_rule("contiguous")
+@register_inference_rule(torch.ones)
+@register_inference_rule(torch.zeros)
+def equality_inference_rule(n: Node, symbols, constraints, counter):
+    """
+    We generate the constraint: input = output
+    """
+    output, counter = gen_tvar(counter)
+    symbols[n] = output
+
+    if isinstance(n.args[0], Node):
+        input = symbols[n.args[0]]
+        if isinstance(input, TVar):
+            return [BinConstraintT(input, output, op_eq)], counter
+
+        # then we have dimension variables
+        else:
+            for arg in n.args:
+                assert isinstance(symbols[arg], DVar)
+        my_size = [symbols[arg] for arg in n.args]
+        return [BinConstraintT(output, TensorType(my_size), op_eq)], counter
+
+    elif isinstance(n.args[0], tuple):
+        # then the tuple is the size
+        assert len(n.args[0]) <= 4
+        my_size = [symbols[arg] for arg in n.args[0]]
+        return [BinConstraintT(output, TensorType(my_size), op_eq)], counter
+    else:
+        raise NotImplementedError('Method not yet implemented')
+
+
+@register_inference_rule("transpose")
+def transpose_inference_rule(n: Node, symbols, constraints, counter):
+    """
+    Can be considered as a sequence of two index selects, so we generate constraints accordingly
+    """
+    assert isinstance(n.args[0], Node)
+    assert isinstance(n.args[1], int)
+    assert isinstance(n.args[2], int)
+
+    output, counter = gen_tvar(counter)
+    symbols[n] = output
+
+    from_arg = symbols[n.args[0]]
+    assert isinstance(from_arg, TVar)
+
+    # input and output are dyn
+    is_dyn = Conj([BinConstraintT(from_arg, Dyn, op_eq), BinConstraintT(output, Dyn, op_eq)])
+
+    # or input is a tensor and we actually do the replacement
+    c3 = Disj([Transpose(i + 1, from_arg, n.args[1], n.args[2], output) for i in range(MAX_TENSOR_RANK)])
+
+    return [Disj([is_dyn, c3])], counter
+
+
+@register_inference_rule("type_as")
+def type_inference_rule(n: Node, symbols, constraints, counter):
+    """
+    We generate the constraint: input = output
+    """
+    assert isinstance(n.args[0], Node)
+    assert isinstance(n.args[1], Node)
+
+    output, counter = gen_tvar(counter)
+    symbols[n] = output
+
+    from_arg = symbols[n.args[0]]
+    to_arg = symbols[n.args[1]]
+
+    assert isinstance(from_arg, TVar)
+    assert isinstance(to_arg, TVar)
+
+    return [BinConstraintT(from_arg, to_arg, op_consistency),
+            BinConstraintT(output, to_arg, op_eq)], counter
+
+@register_inference_rule("masked_fill_")
+def masked_fill_inference_rule(n: Node, symbols, constraints, counter):
+    """
+    Similar to addition. For now we implement the constraints when
+    the argument is a boolean tensor. There is also a case for when
+    it is a condition. We will leave this out for now.
+    """
+
+    assert isinstance(n.args[0], Node)
+    assert isinstance(n.args[1], Node)
+
+    # We will retrieve the type variables from the symbol table
+    # and confirm they are tensor variables
+
+    e1 = symbols[n.args[0]]
+    e2 = symbols[n.args[1]]
+
+    if isinstance(e1, TVar) and isinstance(e2, TVar):
+        masked_fill_tensor, counter = gen_tvar(counter)
+        symbols[n] = masked_fill_tensor
+        return gen_broadcasting_constraints(e1, e2, symbols, counter, masked_fill_tensor)
+    else:
+        raise NotImplementedError('Not yet implemented')
+
+
+@register_inference_rule(torch.nn.functional.embedding)
+def embedding_inference_rule_functional(n: Node, symbols, constraints, counter):
+    assert isinstance(n.args[0], Node)
+
+    embedding_dim_weights = symbols[n.args[1]]
+
+    # will treat this as a static shape. So we will not use matching.
+    weight_dims, counter = gen_tensor_dims(2, counter)
+    equality_constraint = BinConstraintT(embedding_dim_weights, TensorType(weight_dims), op_eq)
+    embedding_dim = weight_dims[1]
+    constraints, counter = gen_embedding_rules(n, symbols, embedding_dim, counter)
+    return [equality_constraint] + constraints, counter
+
+
+@register_inference_rule(torch.nn.modules.sparse.Embedding)
+def embedding_inference_rule(n: Node, module_instance, symbols, constraints, counter):
+    """
+    The output shape differs from the input shape in the last dimension
+    """
+    assert isinstance(n.args[0], Node)
+    return gen_embedding_rules(n, symbols, module_instance.embedding_dim, counter)
+
+
+def gen_embedding_rules(n: Node, symbols, embedding_dim, counter):
+
+    embedding_output, counter = gen_tvar(counter)
+    symbols[n] = embedding_output
+    embedding_input = symbols[n.args[0]]
+
+    input_dyn = BinConstraintT(embedding_input, Dyn, op_eq)
+    output_dyn = BinConstraintT(embedding_output, Dyn, op_eq)
+
+    c1 = Conj([input_dyn, output_dyn])
+    c2 = []
+
+    for i in range(1, MAX_TENSOR_RANK):
+        new_dims, counter = gen_tensor_dims(i, counter)
+        nat_constraints = gen_nat_constraints(new_dims)
+
+        # we consider all tensor sizes and append embedding_dim to the end of the output dimension in all cases
+        c_tensor_i = Conj([BinConstraintT(embedding_input, TensorType(new_dims), op_eq),
+                           BinConstraintT(embedding_output, TensorType(new_dims + [embedding_dim]), op_eq)] +
+                          nat_constraints)
+        c2.append(c_tensor_i)
+
+    return [Disj([c1, Disj(c2)])], counter
+
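+# Hedged reading of gen_embedding_rules (illustrative): if the input is typed
+# TensorType([b, s]), the disjunction above forces the embedding output to
+# TensorType([b, s, embedding_dim]); if the input is Dyn, the output is Dyn as well.
+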
+
+@register_inference_rule(torch.tensor)
+def tensor_inference_rule(n: Node, symbols, constraints, counter):
+    """
+    If the tensor is a scalar, we will skip it since we
+    do not support scalars yet. We will add support in the future
+    if it's needed. For our examples so far, scalars are not needed.
+    """
+    return [], counter
+
+
+@register_inference_rule("reshape")
+@register_inference_rule("view")
+def view_inference_rule(n: Node, symbols, constraints, counter):
+    """
+    Similar to reshape but with an extra condition on the strides
+    """
+    assert isinstance(n.args[0], Node)
+
+    # generate the new variable
+    my_view, counter = gen_tvar(counter)
+    symbols[n] = my_view
+
+
+    src_var = symbols[n.args[0]]
+    t2 = [symbols[elem] if isinstance(elem, Node) else elem for elem in n.args[1:]]  # target shape
+    t2_type = []
+    num_constraints = []
+
+    for t in t2:
+        if t == -1:
+            var, counter = gen_dvar(counter)
+            t2_type.append(var)
+            num_constraints.append(BinConstraintD(var, Dyn, op_neq))
+
+        else:
+            num_constraints.append(BinConstraintD(t, Dyn, op_neq))
+            t2_type.append(t)
+
+    t2_type = TensorType(t2_type)  # type: ignore[assignment]
+
+    c1 = BinConstraintT(my_view, t2_type, op_eq)
+    c2 = CanReshape(src_var, t2_type)
+
+    # TODO: add the extra check mentioned here:
+    # https://pytorch.org/docs/stable/generated/torch.Tensor.view.html#torch.Tensor.view
+
+    return [c1, c2] + num_constraints, counter  # type: ignore[operator]
+
+
+@register_inference_rule("size")
+def size_inference_rule(n: Node, symbols, constraints, counter):
+    """
+    The constraint is just lhs = rhs.
+    Ex: size = input_ids.size()
+    """
+
+
+    if len(n.args) == 1:
+        # generate the new variable
+        size, counter = gen_tvar(counter)
+        symbols[n] = size
+        input = symbols[n.args[0]]
+        c = BinConstraintT(input, size, op_eq)
+        return [c], counter
+
+    elif len(n.args) == 2:
+        # TODO: review this rule; should input = dyn; output = dyn be included here?
+        if isinstance(n.args[1], int):
+            # generate the new variable
+            size_index, counter = gen_dvar(counter)
+            symbols[n] = size_index
+            input = symbols[n.args[0]]
+            c2 = [GetItem(i + 1, n.args[1], size_index, input) for i in range(MAX_TENSOR_RANK)]
+            c3 = BinConstraintD(0, size_index, op_leq)
+
+            input_dyn = BinConstraintT(input, Dyn, op_eq)
+            output_dyn = BinConstraintD(size_index, Dyn, op_eq)
+            c1 = Conj([input_dyn, output_dyn])
+
+            return [Disj([c1, Conj([Disj(c2), c3])])], counter
+
+        else:
+            raise NotImplementedError
+
+    else:
+        raise NotImplementedError
+
+
+def range_check(i, n):
+    """
+    Checks if an index i is within range of a size n list
+    Args:
+        i: index
+        n: list size
+
+    Returns: Boolean
+    """
+    if i >= 0:
+        return T() if i < n else F()
+    else:
+        return T() if i >= -n else F()
+
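+# Hedged examples for range_check (illustrative): range_check(2, 4) -> T() and
+# range_check(5, 4) -> F(); with the usual negative-index convention,
+# range_check(-1, 4) -> T() while range_check(-5, 4) -> F().
+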
+
+@register_inference_rule(torch.cumsum)
+def cumsum_inference_rule(n: Node, symbols, constraints, counter):
+    """
+    Input and output shapes should be equal
+    We should verify that the index is valid
+    """
+    assert isinstance(n.args[0], Node)
+    arg_1 = n.args[1] if len(n.args) > 1 else n.kwargs["dim"]
+    assert isinstance(arg_1, int)
+
+    output, counter = gen_tvar(counter)
+    symbols[n] = output
+    input = symbols[n.args[0]]
+
+    input_dyn = BinConstraintT(input, Dyn, op_eq)
+    output_dyn = BinConstraintT(output, Dyn, op_eq)
+    c1 = Conj([input_dyn, output_dyn])
+    c2 = []
+    for i in range(1, MAX_TENSOR_RANK + 1):
+        new_dims, counter = gen_tensor_dims(i, counter)
+
+        nat_constraints = gen_nat_constraints(new_dims)
+
+        c_tensor_i = Conj([BinConstraintT(input, TensorType(new_dims), op_eq),
+                           BinConstraintT(output, TensorType(new_dims), op_eq)] +
+                          [range_check(arg_1, i)] + nat_constraints)
+
+        c2.append(c_tensor_i)
+    dyn_or_tensor = Disj([c1, Disj(c2)])
+    return [dyn_or_tensor], counter
+
+
+@register_inference_rule(_assert_is_none)
+def assert_inference_rule(n: Node, symbols, constraints, counter):
+    assert len(n.users) == 0
+    return [], counter
+
+
+@register_inference_rule(operator.getitem)
+def getitem_inference_rule(n: Node, symbols, constraints, counter):
+    assert isinstance(n.args[0], Node)
+
+    # dimension output case
+    if isinstance(n.args[1], int):
+        # create and store the new dimension variable
+        get_item_output, counter = gen_dvar(counter)
+        symbols[n] = get_item_output
+
+        # retrieve arg variables
+        get_item_arg = symbols[n.args[0]]
+        assert isinstance(get_item_arg, TVar)
+
+
+        # if the input is dynamic, we accept any index and return
+        # a dynamic dimension as output
+        input_dyn = BinConstraintT(get_item_arg, Dyn, op_eq)
+        output_dyn = BinConstraintD(get_item_output, Dyn, op_eq)
+        c1 = Conj([input_dyn, output_dyn])
+
+        # if the input is a tensor,
+        # generate a getItem constraint which will be expanded based on the
+        # tensor dimension.
+
+        c2 = [GetItem(i + 1, n.args[1], get_item_output, get_item_arg) for i in range(MAX_TENSOR_RANK)]
+
+
+        # since the output is a dimension, we make sure it's a natural number
+        # added as a conjunction to the disjunction of c2
+        c3 = BinConstraintD(0, get_item_output, op_leq)
+        return [Disj([c1, Conj([Disj(c2), c3])])], counter
+
+    # tensor output case
+    elif isinstance(n.args[1], tuple):
+        # create and store the new tensor variable
+        get_item_output, counter = gen_tvar(counter)
+        symbols[n] = get_item_output
+
+        # retrieve arg variables
+        if n.args[0] in symbols:
+            get_item_arg = symbols[n.args[0]]
+            assert isinstance(get_item_arg, TVar)
+
+            input_dyn = BinConstraintT(get_item_arg, Dyn, op_eq)
+            output_dyn = BinConstraintT(get_item_output, Dyn, op_eq)  # type: ignore[assignment]
+            c1 = Conj([input_dyn, output_dyn])
+
+            c2 = [GetItemTensor(i + 1, n.args[1], get_item_output, get_item_arg)  # type: ignore[misc]
+                  for i in range(MAX_TENSOR_RANK)]
+        else:
+            # TODO: we should figure out why there is a key-error here.
+            return [], counter
+
+        return [Disj([c1, *c2])], counter
+
+    else:
+        raise RuntimeError('Method not yet implemented')
+
+
+@register_inference_rule(operator.gt)
+def gt_inference_rule(n: Node, symbols, constraints, counter):
+    assert isinstance(n.args[0], (Node, int))
+    assert isinstance(n.args[1], (Node, int))
+
+    # We make sure this node will not be used again. We do not
+    # generate a constraint about that node. Only about the operands.
+
+    e1 = symbols[n.args[0]] if isinstance(n.args[0], Node) else n.args[0]
+    e2 = symbols[n.args[1]] if isinstance(n.args[1], Node) else n.args[1]
+
+    if isinstance(n.args[0], Node) and isinstance(n.args[1], Node):
+        if isinstance(e1, TVar) and isinstance(e2, TVar):
+            gt_tensor, counter = gen_tvar(counter)
+            symbols[n] = gt_tensor
+            return gen_broadcasting_constraints(e1, e2, symbols, counter, gt_tensor)
+
+        elif isinstance(e1, DVar) and isinstance(e2, DVar):
+            # This is meant to be used for flow analysis only
+            gt_constraint = BinConstraintD(e1, e2, op_gt)
+
+            my_gt, counter = gen_bvar(counter)
+            equality_constraint = BinConstraintD(my_gt, gt_constraint, op_eq)
+            return [equality_constraint], counter
+
+        else:
+            raise RuntimeError('Sort Mismatch')
+
+    elif isinstance(n.args[0], Node) and not isinstance(n.args[1], Node):
+        if isinstance(e1, DVar):
+            # This is meant to be used for flow analysis only
+            gt_constraint = BinConstraintD(e1, e2, op_gt)
+
+            my_gt, counter = gen_bvar(counter)
+            equality_constraint = BinConstraintD(my_gt, gt_constraint, op_eq)
+            return [equality_constraint], counter
+
+        elif isinstance(e1, TVar) and isinstance(e2, int):
+            # then we made the wrong assumption about the argument being a tensor
+            # so we should fix the assumption
+            warnings.warn(f'Made the wrong assumption for node {n}. Correctness not guaranteed.')
+
+            new_e1, counter = gen_dvar(counter)
+            symbols[n.args[0]] = new_e1
+
+            gt_constraint = BinConstraintD(new_e1, e2, op_gt)
+
+            my_gt, counter = gen_bvar(counter)
+            equality_constraint = BinConstraintD(my_gt, gt_constraint, op_eq)
+            return [equality_constraint], counter
+
+        else:
+            raise NotImplementedError('Method not yet implemented')
+
+    else:
+        raise NotImplementedError('Method not yet implemented')
+
+
+@register_inference_rule(operator.eq)
+def eq_inference_rule(n: Node, symbols, constraints, counter):
+    assert isinstance(n.args[0], (Node, int))
+    assert isinstance(n.args[1], (Node, int))
+
+    e1 = symbols[n.args[0]] if isinstance(n.args[0], Node) else n.args[0]
+    e2 = symbols[n.args[1]] if isinstance(n.args[1], Node) else n.args[1]
+
+    if isinstance(n.args[0], Node) and isinstance(n.args[1], Node):
+        if isinstance(e1, TVar) and isinstance(e2, TVar):
+            eq_tensor, counter = gen_tvar(counter)
+            symbols[n] = eq_tensor
+            return gen_broadcasting_constraints(e1, e2, symbols, counter, eq_tensor)
+
+        elif isinstance(e1, DVar) and isinstance(e2, DVar):
+            # This is meant to be used for flow analysis only
+            eq_constraint = BinConstraintD(e1, e2, op_eq)
+
+            my_eq, counter = gen_bvar(counter)
+            equality_constraint = BinConstraintD(my_eq, eq_constraint, op_eq)
+            return [equality_constraint], counter
+
+        else:
+            raise RuntimeError('Sort Mismatch')
+
+    elif isinstance(n.args[0], Node) and not isinstance(n.args[1], Node):
+        if isinstance(e1, DVar):
+            # This is meant to be used for flow analysis only
+            eq_constraint = BinConstraintD(e1, e2, op_eq)
+
+            my_eq, counter = gen_bvar(counter)
+            equality_constraint = BinConstraintD(my_eq, eq_constraint, op_eq)
+            return [equality_constraint], counter
+        else:
+            raise NotImplementedError('Method not yet implemented')
+    else:
+        raise NotImplementedError('Method not yet implemented')
+
+@register_inference_rule(operator.ne)
+def neq_inference_rule(n: Node, symbols, constraints, counter):
+    """
+    Translates to inconsistent in gradual types.
+    To prove inequality, we should prove that
+    tensors are either different sizes or
+    disagree on at least one dimension
+
+    This is a WIP (it works when the condition
+    is false; we are working on making this operation work
+    when the condition is true as well)
+    """
+    assert isinstance(n.args[0], Node)
+    assert isinstance(n.args[1], tuple)
+
+    # implementing for size 3 and 4
+    if len(n.args[1]) == 3:
+
+        assert isinstance(n.args[1][0], (Node, int))
+        assert isinstance(n.args[1][1], (Node, int))
+        assert isinstance(n.args[1][2], (Node, int))
+
+        lhs = symbols[n.args[0]]
+
+        b, counter = gen_tensor_dims(4, counter)
+        input_is_size3 = BinConstraintT(lhs, TensorType([b[0], b[1], b[2]]), op_eq)
+
+        d1 = n.args[1][0] if isinstance(n.args[1][0], int) else symbols[n.args[1][0]]
+        d2 = n.args[1][1] if isinstance(n.args[1][1], int) else symbols[n.args[1][1]]
+        d3 = n.args[1][2] if isinstance(n.args[1][2], int) else symbols[n.args[1][2]]
+
+        # dimensions not equal
+        neq_1 = BinConstraintD(d1, b[0], op_neq)
+        neq_2 = BinConstraintD(d2, b[1], op_neq)
+        neq_3 = BinConstraintD(d3, b[2], op_neq)
+
+        # dimensions inconsistent
+        dims_inconsistent1 = Conj([BinConstraintD(d1, Dyn, op_neq), BinConstraintD(b[0], Dyn, op_neq), neq_1])
+        dims_inconsistent2 = Conj([BinConstraintD(d2, Dyn, op_neq), BinConstraintD(b[1], Dyn, op_neq), neq_2])
+        dims_inconsistent3 = Conj([BinConstraintD(d3, Dyn, op_neq), BinConstraintD(b[2], Dyn, op_neq), neq_3])
+
+        dims_inconsistent = Disj([dims_inconsistent1, dims_inconsistent2, dims_inconsistent3])
+
+        # we are covering size 3 and 4 only for now
+        ne_constraint = Conj([input_is_size3, dims_inconsistent])
+
+        my_ne, counter = gen_bvar(counter)
+        equality_constraint = BinConstraintD(my_ne, ne_constraint, op_eq)
+
+    elif len(n.args[1]) == 4:
+
+        assert isinstance(n.args[1][0], (Node, int))
+        assert isinstance(n.args[1][1], (Node, int))
+        assert isinstance(n.args[1][2], (Node, int))
+        assert isinstance(n.args[1][3], (Node, int))
+
+        lhs = symbols[n.args[0]]
+
+        b1, counter = gen_dvar(counter)
+        b2, counter = gen_dvar(counter)
+        b3, counter = gen_dvar(counter)
+        b4, counter = gen_dvar(counter)
+
+        input_is_size4 = BinConstraintT(lhs, TensorType([b1, b2, b3, b4]), op_eq)
+
+        d1 = n.args[1][0] if isinstance(n.args[1][0], int) else symbols[n.args[1][0]]
+        d2 = n.args[1][1] if isinstance(n.args[1][1], int) else symbols[n.args[1][1]]
+        d3 = n.args[1][2] if isinstance(n.args[1][2], int) else symbols[n.args[1][2]]
+        d4 = n.args[1][3] if isinstance(n.args[1][3], int) else symbols[n.args[1][3]]
+
+        # dimensions not equal
+        neq_1 = BinConstraintD(d1, b1, op_neq)
+        neq_2 = BinConstraintD(d2, b2, op_neq)
+        neq_3 = BinConstraintD(d3, b3, op_neq)
+        neq_4 = BinConstraintD(d4, b4, op_neq)
+
+        # dimensions inconsistent
+        dims_inconsistent1 = Conj([BinConstraintD(d1, Dyn, op_neq), BinConstraintD(b1, Dyn, op_neq), neq_1])
+        dims_inconsistent2 = Conj([BinConstraintD(d2, Dyn, op_neq), BinConstraintD(b2, Dyn, op_neq), neq_2])
+        dims_inconsistent3 = Conj([BinConstraintD(d3, Dyn, op_neq), BinConstraintD(b3, Dyn, op_neq), neq_3])
+        dims_inconsistent4 = Conj([BinConstraintD(d4, Dyn, op_neq), BinConstraintD(b4, Dyn, op_neq), neq_4])
+
+        dims_inconsistent = Disj([dims_inconsistent1, dims_inconsistent2, dims_inconsistent3, dims_inconsistent4])
+
+        ne_constraint = Conj([input_is_size4, dims_inconsistent])
+
+        my_ne, counter = gen_bvar(counter)
+
+        equality_constraint = BinConstraintD(my_ne, ne_constraint, op_eq)
+
+    else:
+        raise NotImplementedError('Method not yet implemented')
+
+    return [equality_constraint], counter
+
+
+@register_inference_rule(operator.lt)
+def lt_inference_rule(n: Node, symbols, constraints, counter):
+    assert isinstance(n.args[0], (Node, int))
+    assert isinstance(n.args[1], (Node, int))
+
+    # We make sure this node will not be used again. We do not
+    # generate a constraint about that node. Only about the operands.
+
+    e1 = symbols[n.args[0]] if isinstance(n.args[0], Node) else n.args[0]
+    e2 = symbols[n.args[1]] if isinstance(n.args[1], Node) else n.args[1]
+
+    if isinstance(n.args[0], Node) and isinstance(n.args[1], Node):
+        if isinstance(e1, TVar) and isinstance(e2, TVar):
+            lt_tensor, counter = gen_tvar(counter)
+            symbols[n] = lt_tensor
+            return gen_broadcasting_constraints(e1, e2, symbols, counter, lt_tensor)
+
+        elif isinstance(e1, DVar) and isinstance(e2, DVar):
+            # This is meant to be used for flow analysis only
+            lt_constraint = BinConstraintD(e1, e2, op_lt)
+
+            my_lt, counter = gen_bvar(counter)
+            equality_constraint = BinConstraintD(my_lt, lt_constraint, op_eq)
+            return [equality_constraint], counter
+
+        else:
+            raise RuntimeError('Sort Mismatch')
+
+    elif isinstance(n.args[0], Node) and not isinstance(n.args[1], Node):
+        if isinstance(e1, DVar):
+            # This is meant to be used for flow analysis only
+            lt_constraint = BinConstraintD(e1, e2, op_lt)
+
+            my_lt, counter = gen_bvar(counter)
+            equality_constraint = BinConstraintD(my_lt, lt_constraint, op_eq)
+            return [equality_constraint], counter
+        else:
+            raise NotImplementedError('Method not yet implemented')
+
+    else:
+        raise NotImplementedError('Method not yet implemented')
+
+
+@register_inference_rule(torch.full)
+def full_inference_rule(n: Node, symbols, constraints, counter):
+    full, counter = gen_tvar(counter)
+    symbols[n] = full
+    res = []
+
+    assert isinstance(n.args[0], Iterable)
+    for arg in n.args[0]:
+        dim = arg if isinstance(arg, int) else symbols[arg]
+        res.append(dim)
+    c = BinConstraintT(full, TensorType(list(res)), op_eq)  # type: ignore[arg-type]
+    return [c], counter
+
+
+# TODO normalize index
+@register_inference_rule(torch.arange)
+def arange_inference_rule(n: Node, symbols, constraints, counter):
+    start = 0
+    step = 1
+
+    if len(n.args) == 1:
+        end = symbols[n.args[0]]
+    else:
+        raise NotImplementedError('Not yet implemented')
+
+    # int((end - start) / step)
+    d1, counter = gen_dvar(counter)
+    size_constraint = BinConstraintD(d1, BinConstraintD(BinConstraintD(end, start, op_sub), step, op_div), op_eq)
+    arange, counter = gen_tvar(counter)
+    symbols[n] = arange
+
+    # case 1: one of the parameters is Dyn, so the resulting size is Dyn
+    c1 = Disj([BinConstraintD(end, Dyn, op_eq),
+               BinConstraintD(start, Dyn, op_eq),
+               BinConstraintD(step, Dyn, op_eq)])
+    c2 = BinConstraintD(d1, Dyn, op_eq)
+    both_dyn = Conj([c1, c2])
+
+    c11 = Conj([BinConstraintD(end, Dyn, op_neq),
+                BinConstraintD(start, Dyn, op_neq),
+                BinConstraintD(step, Dyn, op_neq)])
+    c22 = BinConstraintD(d1, Dyn, op_neq)
+    both_numbers = Conj([c11, c22, size_constraint])
+
+    return [BinConstraintT(arange, TensorType([d1]), op_eq), Disj([both_dyn, both_numbers])], counter
+
+def gen_broadcasting_constraints(e1, e2, symbols, counter, output_var):
+    # additional vars that don't correspond to expressions
+    e11, counter = gen_tvar(counter)
+    e22, counter = gen_tvar(counter)
+
+    # generate constraints
+    c1 = TGreatestUpperBound(output_var, e11, e22)
+    c2 = ApplyBroadcasting(e11, e22, e1, e2)
+    c3 = BinConstraintT(e11, e22, op_consistency)
+    return [c1, c2, c3], counter
+
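+# Hedged reading of the helper above (illustrative): for operand variables e1, e2 and
+# an output variable output_var, it introduces fresh tensor variables e11, e22 and
+# asserts
+#   output_var = e11 ⊔* e22           (greatest upper bound of the broadcast operands)
+#   e11, e22 = apply-broadcasting(e1, e2)
+#   e11 ~ e22                         (consistency)
+# so callers such as the add/mul rules below only supply e1, e2 and the output variable.
+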
+
+@register_inference_rule(operator.mul)
+@register_inference_rule(torch.ne)
+@register_inference_rule("ne")
+@register_inference_rule(torch.add)
+@register_inference_rule(operator.add)
+def broadcasting_inference_rule(n: Node, symbols, constraints, counter):
+
+    op_code = None
+    if n.target == operator.add or n.target == torch.add:
+        op_code = op_add
+    elif n.target == operator.mul:
+        op_code = op_mul
+
+    if isinstance(n.args[0], Node) and isinstance(n.args[1], Node):
+        if isinstance(symbols[n.args[0]], TVar) and isinstance(symbols[n.args[1]], TVar):
+            my_output, counter = gen_tvar(counter)
+            symbols[n] = my_output
+            e1 = symbols[n.args[0]]
+            e2 = symbols[n.args[1]]
+
+            return gen_broadcasting_constraints(e1, e2, symbols, counter, my_output)
+        else:
+            raise NotImplementedError('Method not yet implemented')
+
+    elif isinstance(n.args[0], Node) and isinstance(n.args[1], (int, float)):
+        if isinstance(symbols[n.args[0]], TVar):
+            my_output, counter = gen_tvar(counter)
+            symbols[n] = my_output
+            e1 = symbols[n.args[0]]
+            return [BinConstraintT(my_output, e1, op_eq)], counter
+        elif isinstance(symbols[n.args[0]], DVar):
+            my_output, counter = gen_dvar(counter)
+            symbols[n] = my_output
+            e1 = symbols[n.args[0]]
+
+            # we will propagate the runtime value here since this is regular addition
+            c = Conj([BinConstraintD(my_output, BinConstraintD(e1, n.args[1], op_code), op_eq),
+                      BinConstraintD(0, my_output, op_leq)])
+            return [c], counter
+
+    elif isinstance(n.args[1], Node) and isinstance(n.args[0], (int, float)):
+        if isinstance(symbols[n.args[1]], TVar):
+            my_output, counter = gen_tvar(counter)
+            symbols[n] = my_output
+            e2 = symbols[n.args[1]]
+            return [BinConstraintT(my_output, e2, op_eq)], counter
+        elif isinstance(symbols[n.args[1]], DVar):
+            my_output, counter = gen_dvar(counter)
+            symbols[n] = my_output
+            e2 = symbols[n.args[1]]
+
+            # we will propagate the runtime value here since this is regular addition
+            c = Conj([BinConstraintD(my_output, BinConstraintD(e2, n.args[0], op_code), op_eq),
+                      BinConstraintD(0, my_output, op_leq)])
+            return [c], counter
+
+        else:
+            raise NotImplementedError('Method not yet implemented')
+
+    else:
+        # TODO generate add constraints for scalar addition
+        raise NotImplementedError('Addition not yet implemented')
+
+
+@register_inference_rule(torch.flatten)
+def flatten_inference_rule(n: Node, symbols, constraints, counter):
+    assert isinstance(n.args[0], Node)
+
+    # generate the new variable
+    flattened, counter = gen_tvar(counter)
+    symbols[n] = flattened
+
+    input = symbols[n.args[0]]
+
+    # set the default start and end dims
+    start_dim = 1
+    end_dim = -1
+
+    if len(n.args) > 1:
+        assert isinstance(n.args[1], int)
+        start_dim = n.args[1]
+
+    if len(n.args) > 2:
+        assert isinstance(n.args[2], int)
+        end_dim = n.args[2]
+
+    c1 = BinConstraintT(input, Dyn, op_eq)
+    c2 = BinConstraintT(flattened, Dyn, op_eq)
+    both_dyn = Conj([c1, c2])
+
+    const = []
+    for i in range(1, MAX_TENSOR_RANK + 1):
+        c, counter = generate_flatten_constraints(start_dim, end_dim, input, flattened, i, counter)
+        const.append(c)
+
+    return [Disj([both_dyn, *const])], counter
+
+
+@register_inference_rule(torch.nn.functional.layer_norm)
+def layer_norm_functional(n: Node, symbols, constraints, counter):
+    """
+    Input and output shapes should be equal and the input
+    should be consistent with the normalized shape n.args[1]
+    """
+    assert isinstance(n.args[0], Node)
+    return gen_layer_norm_constraints(n, n.args[1], symbols, counter)
+
+
+@register_inference_rule(torch.nn.LayerNorm)
+def layer_norm_inference_rule(n: Node, module_instance, symbols, constraints, counter):
+    """
+    Input and output shapes should be equal.
+    Input should be consistent with the normalized_shape
+    """
+    assert isinstance(n.args[0], Node)
+    return gen_layer_norm_constraints(n, module_instance.normalized_shape, symbols, counter)
+
+
+def gen_layer_norm_constraints(n: Node, normalized_shape, symbols, counter):
+    output, counter = gen_tvar(counter)
+    symbols[n] = output
+    input = symbols[n.args[0]]
+
+    input_dyn = BinConstraintT(input, Dyn, op_eq)
+    output_dyn = BinConstraintT(output, Dyn, op_eq)
+
+    c1 = Conj([input_dyn, output_dyn])
+
+    c2 = []
+    for i in range(1, MAX_TENSOR_RANK + 1):
+        new_dims_rhs, counter = gen_tensor_dims(i, counter)
+        nat_constraints = gen_nat_constraints(new_dims_rhs)
+
+        c_tensor_i = Conj([BinConstraintT(input, TensorType(new_dims_rhs), op_eq),
+                           BinConstraintT(output, TensorType(new_dims_rhs), op_eq)] +
+                          add_layer_norm_constraints(new_dims_rhs, list(normalized_shape)) +
+                          nat_constraints)
+        c2.append(c_tensor_i)
+    return [Disj([c1, Disj(c2)])], counter
+
+@register_inference_rule(torch.nn.Dropout)
+@register_inference_rule(torch.nn.ReLU)
+def relu_inference_rule(n: Node, module_instance, symbols, constraints, counter):
+    """
+    Input and output shapes should be equal.
+    """
+    assert isinstance(n.args[0], Node)
+    output, counter = gen_tvar(counter)
+    symbols[n] = output
+    input = symbols[n.args[0]]
+    assert isinstance(input, TVar)
+    return [BinConstraintT(input, output, op_eq)], counter
+
+
+@register_inference_rule(torch.nn.Linear)
+def linear_inference_rule(n: Node, module_instance, symbols, constraints, counter):
+    """
+    Input and output sizes should be the same except for the last dimension
+    If the input is Dyn, then so should the output
+    """
+    assert isinstance(n.args[0], Node)
+    return linear_constraints(n, module_instance.in_features, module_instance.out_features, symbols, counter)
+
+
+@register_inference_rule("dim")  # type: ignore[attr-defined]
+def torch_dim_inference_rule(n: Node, symbols, constraints, counter):
+    assert isinstance(n.args[0], Node)
+    my_dim, counter = gen_dvar(counter)
+    symbols[n] = my_dim
+    input = symbols[n.args[0]]
+
+    input_dyn = BinConstraintT(input, Dyn, op_eq)
+    output_dyn = BinConstraintD(my_dim, Dyn, op_eq)
+
+    c1 = []
+
+    for i in range(1, MAX_TENSOR_RANK + 1):
+        new_dims_rhs_1, counter = gen_tensor_dims(i, counter)
+
+        c_tensor_i = Conj([BinConstraintT(input, TensorType(new_dims_rhs_1), op_eq),
+                           BinConstraintD(my_dim, i, op_eq)])
+        c1.append(c_tensor_i)
+
+    return [Disj([Conj([input_dyn, output_dyn]), Disj(c1)])], counter
+
+
+@register_inference_rule(torch._C._nn.linear)  # type: ignore[attr-defined]
+def torch_linear_inference_rule(n: Node, symbols, constraints, counter):
+    assert isinstance(n.args[0], Node)
+    weight_dims, counter = gen_tensor_dims(2, counter)
+    equality_constraint = BinConstraintT(symbols[n.args[1]], TensorType(weight_dims), op_eq)
+    constraints, counter = linear_constraints(n, weight_dims[1], weight_dims[0], symbols, counter)
+    return [equality_constraint] + constraints, counter
+
+
+def linear_constraints(n: Node, in_features, out_features, symbols, counter):
+    linear_output, counter = gen_tvar(counter)
+    symbols[n] = linear_output
+    linear_input = symbols[n.args[0]]
+
+    input_dyn = BinConstraintT(linear_input, Dyn, op_eq)
+    output_dyn = BinConstraintT(linear_output, Dyn, op_eq)
+
+    c1 = Conj([input_dyn, output_dyn])
+
+    c2 = []
+    for i in range(1, MAX_TENSOR_RANK + 1):
+        new_dims_rhs_1, counter = gen_tensor_dims(i, counter)
+        new_dims_rhs_2, counter = gen_tensor_dims(i, counter)
+
+        nat_constraints = gen_nat_constraints(new_dims_rhs_1 + new_dims_rhs_2)
+
+        c_tensor_i = Conj([BinConstraintT(linear_input, TensorType(new_dims_rhs_1), op_eq),
+                           BinConstraintT(linear_output, TensorType(new_dims_rhs_2), op_eq)] +
+                          add_linear_constraints(new_dims_rhs_1, new_dims_rhs_2, in_features, out_features) +
+                          nat_constraints)
+        c2.append(c_tensor_i)
+    return [Disj([c1, Disj(c2)])], counter
+
+def add_layer_norm_constraints(input_dim, normalized_dim):
+    """
+    The constraints say that the type has the form: [*, 1024, 1024]
+    while normalized_dim has the form [1024, 1024]
+    Args:
+        input_dim: Input shape of layer norm
+        normalized_dim: normalized_dim parameter of the module instance
+
+    """
+
+    # in this case we return false since there's a pattern mismatch
+    if len(normalized_dim) > len(input_dim):
+        return [F()]
+
+    else:
+        constraints = []
+        for i, n in zip(reversed(input_dim), reversed(normalized_dim)):
+            constraints.append(BinConstraintD(i, n, op_consistency))
+        return constraints
+
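+# Hedged example (illustrative): input_dim = [b, 1024, 1024] with
+# normalized_dim = [1024, 1024] yields the trailing-dimension consistency
+# constraints 1024 ~ 1024 and 1024 ~ 1024; a normalized_dim longer than
+# input_dim is rejected with F().
+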
+
+def add_linear_constraints(dims1, dims2, in_features, out_features):
+    assert len(dims1) == len(dims2)
+    constraints = []
+    for i in range(len(dims1)):
+        if i == len(dims1) - 1:
+            constraints.append(BinConstraintD(dims1[i], in_features, op_consistency))
+            constraints.append(BinConstraintD(dims2[i], out_features, op_eq))
+        else:
+            constraints.append(BinConstraintD(dims1[i], dims2[i], op_eq))
+
+    return constraints
+
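+# Hedged example (illustrative): dims1 = [d1, d2], dims2 = [e1, e2],
+# in_features = 10 and out_features = 20 produce the constraints
+#   d1 = e1,  d2 ~ 10 (consistency with in_features),  e2 = 20,
+# i.e. all but the last dimension are preserved and the last becomes out_features.
+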
+
+@register_inference_rule(torch.reshape)
+def reshape_inference_rule(n: Node, symbols, constraints, counter):
+    assert isinstance(n.args[0], Node)
+
+    # generate the new variable
+    my_reshape, counter = gen_tvar(counter)
+    symbols[n] = my_reshape
+
+    src_var = symbols[n.args[0]]
+    t2 = n.args[1]
+    t2_type = TensorType([Dyn if elem == -1 else elem for elem in t2])  # type: ignore[union-attr]
+    c1 = BinConstraintT(my_reshape, t2_type, op_eq)  # type: ignore[union-attr]
+    c2 = CanReshape(src_var, t2_type)
+
+    return [c1, c2], counter
+
+
+@register_inference_rule(BatchNorm2d)
+def batchnorm_inference_rule(n: Node, module_instance, symbols, constraints, counter):
+    assert isinstance(n.args[0], Node)
+
+    # generate the new variable
+    batchnorm_output, counter = gen_tvar(counter)
+    symbols[n] = batchnorm_output
+    batchnorm_input = symbols[n.args[0]]
+
+    # dim vars
+    d1, counter = gen_dvar(counter)
+    d2, counter = gen_dvar(counter)
+    d3, counter = gen_dvar(counter)
+    d4, counter = gen_dvar(counter)
+
+    nat_constraints = gen_nat_constraints([d1, d2, d3, d4])
+
+    c1 = BinConstraintT(batchnorm_input, TensorType([d1, d2, d3, d4]), op_matching)
+    c2 = BinConstraintT(batchnorm_input, batchnorm_output, op_eq)
+    return [c1, c2, *nat_constraints], counter
+
+
+@register_inference_rule(torch.nn.AdaptiveAvgPool2d)
+def adaptive_inference_rule(n: Node, module_instance, symbols, constraints, counter):
+    assert isinstance(n.args[0], Node)
+
+    avg_pool, counter = gen_tvar(counter)
+
+    symbols[n] = avg_pool
+    input_var = symbols[n.args[0]]
+
+    # dim vars
+    d1, counter = gen_dvar(counter)
+    d2, counter = gen_dvar(counter)
+    d3, counter = gen_dvar(counter)
+    d4, counter = gen_dvar(counter)
+    nat_constraints = gen_nat_constraints([d1, d2, d3, d4])
+    c1 = BinConstraintT(input_var, TensorType([d1, d2, d3, d4]), op_matching)
+    c2 = BinConstraintT(avg_pool, TensorType([d1, d2, module_instance.output_size[0], module_instance.output_size[1]]), op_eq)
+
+    return [c1, c2, *nat_constraints], counter
+
+
+@register_inference_rule(Conv2d)
+def conv2d_inference_rule(n: Node, module_instance, symbols, constraints, counter):
+    assert isinstance(n.args[0], Node)
+
+    my_conv, counter = gen_tvar(counter)
+    symbols[n] = my_conv
+    input_var = symbols[n.args[0]]
+
+    # dim vars
+    [d1, d2, d3, d4], counter = gen_tensor_dims(MAX_TENSOR_RANK, counter)
+
+    # c1 = Matching(input_var, TensorType([d1, d2, d3, d4]))
+    c1 = BinConstraintT(input_var, TensorType([d1, d2, d3, d4]), op_matching)
+
+    # c2 = DConsistency(module_instance.in_channels, d2)
+    c2 = BinConstraintD(module_instance.in_channels, d2, op_consistency)
+
+    c3 = CalcConv(my_conv, input_var,
+                  module_instance.out_channels,
+                  module_instance.kernel_size,
+                  module_instance.padding,
+                  module_instance.stride,
+                  module_instance.dilation, [d1, d2, d3, d4])
+
+    nat_constraints = gen_nat_constraints([d1, d2, d3, d4])
+
+    return [c1, c2, c3, *nat_constraints], counter
+
+
+@register_inference_rule(torch.nn.MaxPool2d)
+def maxpool_inference_rule(n: Node, module_instance, symbols, constraints, counter):
+    assert isinstance(n.args[0], Node)
+    maxpool, counter = gen_tvar(counter)
+    symbols[n] = maxpool
+    input_var = symbols[n.args[0]]
+
+    # dim vars
+    [d1, d2, d3, d4], counter = gen_tensor_dims(MAX_TENSOR_RANK, counter)
+
+    c1 = BinConstraintT(input_var, TensorType([d1, d2, d3, d4]), op_matching)
+
+    c2 = CalcMaxPool(maxpool, input_var, module_instance.kernel_size, module_instance.padding,
+                     module_instance.stride, module_instance.dilation, [d1, d2, d3, d4])
+
+    nat_constraints = gen_nat_constraints([d1, d2, d3, d4])
+
+    return [c1, c2, *nat_constraints], counter
+
+
+class ConstraintGenerator:
+    def __init__(self, traced, graph=None):
+        self.traced = traced  # traced or tracer.root
+        self.traced_params = dict(self.traced.named_parameters())
+        self.constraints = []
+        self.symbol_dict = {}
+        self.graph = traced.graph if hasattr(traced, 'graph') else graph
+
+
+    def generate_constraints(self, counter=0):
+        """
+        Iterate through every node and generate constraints
+        Effect: self.constraints will be populated with the final constraints
+        """
+        graph = self.graph
+
+        all_constraints = []
+
+        for n in graph.nodes:
+            (constraints, counter) = self.generate_constraints_node(n, counter)
+            all_constraints += constraints
+
+        return Conj(all_constraints), counter
+
+    def generate_constraints_node(self, n: Node, counter):
+        """
+        Generate constraints for the given node.
+        Currently supported operations include:
+        - Reshape
+        - Add
+        - conv2d
+        """
+
+        if n.op == 'placeholder':
+            x, counter = gen_tvar(counter)
+            self.symbol_dict[n] = x
+
+            my_type = n.type
+
+            if n.type != Dyn and (not isinstance(n.type, TensorType)):
+                if n.type == torch.nn.parameter.Parameter:
+                    # since we have a parameter, the shape must be static
+                    assert 'example_value' in n.meta
+                    my_type = TensorType(n.meta['example_value'].size())
+                else:
+                    my_type = Dyn
+
+            c1 = BinConstraintT(my_type, x, op_precision)
+            c2 = BinConstraintT(x, MAX_TENSOR_RANK, op_leq)
+            return [c1, c2], counter
+
+        elif n.op == 'call_function':
+            if n.target in _INFERENCE_RULES:
+                return _INFERENCE_RULES[n.target](n, self.symbol_dict, self.constraints, counter)
+            else:
+                raise RuntimeError(f'No inference rule registered for target {n.target}!')
+
+        elif n.op == 'call_module':
+
+            module_instance = self.traced.get_submodule(n.target)
+            if type(module_instance) in _INFERENCE_RULES:
+                return _INFERENCE_RULES[type(module_instance)](n,
+                                                               module_instance,
+                                                               self.symbol_dict,
+                                                               self.constraints, counter)
+            else:
+                raise RuntimeError(f'No inference rule registered for class {type(module_instance)}!')
+
+        elif n.op == 'call_method':
+            if n.target in _INFERENCE_RULES:
+                return _INFERENCE_RULES[n.target](n, self.symbol_dict, self.constraints, counter)
+            else:
+                raise RuntimeError(f'No inference rule registered for target {n.target}!')
+
+        elif n.op == 'get_attr':
+            t = self.traced_params.get(n.target, None)
+
+            if isinstance(t, torch.Tensor):
+                if len(t.shape) > 0:
+                    res = list(t.shape)
+                    attr_type = TensorType(res)
+                    output, counter = gen_tvar(counter)
+                    self.symbol_dict[n] = output
+                    return [BinConstraintT(output, attr_type, op_eq)], counter
+                else:
+                    # scalar parameter (rank 0): no shape constraints to generate
+                    return [], counter
+            else:
+                return [], counter
+
+        elif n.op == 'output':
+            return [], counter
+
+        else:
+            raise NotImplementedError(f"Method {n.op} not yet implemented")
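+
+# A minimal sketch of how this generator is typically driven (illustrative only;
+# MyModule is a hypothetical fx-traceable nn.Module):
+#
+#   from torch.fx import symbolic_trace
+#   traced = symbolic_trace(MyModule())
+#   generator = ConstraintGenerator(traced)
+#   constraints, counter = generator.generate_constraints(counter=0)
+#   # 'constraints' is a Conj over the constraints generated for every node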
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/constraint_transformation.py b/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/constraint_transformation.py
new file mode 100644
index 0000000000000000000000000000000000000000..614b12426c599bfa950c97a0cc095fb3ddb81afe
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/constraint_transformation.py
@@ -0,0 +1,1040 @@
+# mypy: ignore-errors
+import copy
+import itertools
+from torch.fx.experimental.migrate_gradual_types.constraint_generator import BinConstraintT, MAX_TENSOR_RANK
+from torch.fx.experimental.migrate_gradual_types.constraint import T, BinConstraintD, Conj, Constraint, DVar, TVar, \
+    Transpose
+from torch.fx.experimental.migrate_gradual_types.constraint import Disj, TGreatestUpperBound
+from torch.fx.experimental.migrate_gradual_types.constraint import DGreatestUpperBound
+from torch.fx.experimental.migrate_gradual_types.constraint import CalcConv, CalcMaxPool
+from torch.fx.experimental.migrate_gradual_types.constraint import CalcProduct, CanReshape
+from torch.fx.experimental.migrate_gradual_types.constraint import ApplyBroadcasting, Prod, F, GetItem, GetItemTensor, IndexSelect
+from torch.fx.experimental.migrate_gradual_types.operation import op_eq, op_precision, op_leq, op_matching
+from torch.fx.experimental.migrate_gradual_types.operation import op_consistency, op_neq
+from torch.fx.experimental.migrate_gradual_types.operation import op_mul, op_add, op_sub, op_div, op_mod
+from torch.fx.experimental.migrate_gradual_types.util import gen_tensor_dims, gen_nat_constraints, gen_dvar
+from torch.fx.tensor_type import TensorType, Dyn
+from typing import Callable, Dict, List
+
+_TRANSFORMATION_RULES: Dict[Constraint, Callable] = {}
+
+
+def register_transformation_rule(call_target):
+    def register(fn):
+        if call_target in _TRANSFORMATION_RULES:
+            raise RuntimeError(f'Transformation rule already registered for {call_target}!')
+        _TRANSFORMATION_RULES[call_target] = fn
+        return fn
+    return register
+
+
+def valid_index(index, dims):
+    """
+    Given a list of dimensions, checks if an index is valid in the list
+    """
+    try:
+        dims[index]
+        return T()
+    except IndexError:
+        return F()
+
+
+@register_transformation_rule(Transpose)
+def transform_transpose(constraint, counter):
+    """
+    Similar to a sequence of two index-selects
+    """
+    dims, counter = gen_tensor_dims(constraint.tensor_size, counter)
+    is_valid_index1 = valid_index(constraint.index1, dims)
+    is_valid_index2 = valid_index(constraint.index2, dims)
+    new_dims = copy.deepcopy(dims)
+    nat_constraints = gen_nat_constraints(dims)
+
+    if is_valid_index1 == T() and is_valid_index2 == T():
+        new_dims[constraint.index1] = dims[constraint.index2]
+        new_dims[constraint.index2] = dims[constraint.index1]
+
+    transformed_constraint = Conj([BinConstraintT(constraint.input_var, TensorType(dims), op_eq),
+                                   *nat_constraints,
+                                   is_valid_index1, is_valid_index2,
+                                   BinConstraintT(constraint.output, TensorType(new_dims), op_eq)])
+    return transformed_constraint, counter
+
+
+@register_transformation_rule(IndexSelect)
+def transform_index_select(constraint, counter):
+    """
+    The constraints consider the given tensor size, check whether the index is valid,
+    and if so generate a constraint for replacing the input dimension
+    with the required dimension
+    """
+    dims, counter = gen_tensor_dims(constraint.tensor_size, counter)
+    is_valid_index = valid_index(constraint.index, dims)
+    nat_constraints = gen_nat_constraints(dims)
+
+    # if the index is valid then replace the input dimension with the new dimension;
+    # otherwise the dimension is left unchanged and the clause will contain False
+    new_dims = copy.deepcopy(dims)
+    if is_valid_index == T():
+        new_dims[constraint.index] = constraint.dim_replace
+
+    transformed_constraint = Conj([BinConstraintT(constraint.input_var, TensorType(dims), op_eq),
+                                   *nat_constraints,
+                                   is_valid_index,
+                                   BinConstraintT(constraint.output, TensorType(new_dims), op_eq)])
+
+    # print(constraints)
+    return transformed_constraint, counter
+
+
+@register_transformation_rule(GetItem)
+def transform_get_item(constraint, counter):
+    """
+    Generate an equality of the form:
+    t = [a1, ..., an]
+    then generate constraints that check whether the given index is valid
+    for this particular tensor size.
+    If the index is valid, generate a constraint to get the item.
+    Note that the Dyn input case was already handled in the previous
+    step.
+    Args:
+        constraint: GetItem which assumes we are getting an item from a tensor (not Dyn)
+        counter: variable tracking
+    Returns: simplified constraints for GetItem
+
+    """
+    dims, counter = gen_tensor_dims(constraint.tensor_size, counter)
+    nat_constraints = gen_nat_constraints(dims)
+
+
+    is_valid_index = valid_index(constraint.index, dims)
+
+    all_constraints = [BinConstraintT(constraint.input_var, TensorType(dims), op_eq),
+                       *nat_constraints,
+                       is_valid_index]
+
+    # if the index is valid, we generate a constraint for getting an item
+    # otherwise this clause will have been UNSAT due to the wrong index
+    if is_valid_index == T():
+        all_constraints.append(BinConstraintD(constraint.res, dims[constraint.index], op_eq))
+
+    return Conj(all_constraints), counter
+
+def valid_index_tensor(index, dims):
+    """
+    If the number of slice instances exceeds the number of dimensions,
+    this is a type error, so we return False
+    """
+    slice_count = 0
+    for s in index:
+        if isinstance(s, slice):
+            slice_count += 1
+    if slice_count > len(dims):
+        return F()
+    else:
+        return T()
+
+@register_transformation_rule(GetItemTensor)
+def transform_get_item_tensor(constraint, counter):
+    """
+    When the index is a tuple, then the output will be a tensor
+    TODO: we have to check if this is the case for all HF models
+
+    The cases we are covering here are a tuple with one of:
+     - slice with default argument
+     - None
+
+     None appends 1 to the input tensor dimensions
+     so each occurrence of 'None' increases the rank by 1
+
+     slice with default arguments does not change the rank
+    """
+    assert isinstance(constraint.index_tuple, tuple)
+
+
+    # generate a result tensor of the expected size
+    dims, counter = gen_tensor_dims(constraint.tensor_size, counter)
+    nat_constraints = gen_nat_constraints(dims)
+
+    # generate a place-holder list of the right rank
+    # where "slice" does not contribute to the rank and "None" does
+    none_c = constraint.index_tuple.count(None)
+    resulting_tensor_dims = (none_c + len(dims)) * [None]
+
+    for i in range(len(constraint.index_tuple)):
+
+        # append 1 to the right location of the resulting tensor
+        if constraint.index_tuple[i] is None:
+            resulting_tensor_dims[i] = 1
+
+        elif constraint.index_tuple[i] == slice(None, None, None):
+            pass
+
+        else:
+            raise NotImplementedError('Method not yet implemented')
+
+    # append the remaining dimensions to the right location
+    dim_index = 0
+    for i in range(len(resulting_tensor_dims)):
+        if resulting_tensor_dims[i] is None:
+            resulting_tensor_dims[i] = dims[dim_index]
+            dim_index += 1
+
+    # check if the index is valid
+    is_valid_index = valid_index_tensor(constraint.index_tuple, dims)
+
+    # check if the resulting tensor is within bounds
+    if len(resulting_tensor_dims) > 4:
+        return F(), counter
+
+    else:
+        constraints = [BinConstraintT(constraint.input_var, TensorType(dims), op_eq),
+                       BinConstraintT(constraint.res, TensorType(resulting_tensor_dims), op_eq),
+                       *nat_constraints,
+                       is_valid_index]
+        return Conj(constraints), counter
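+
+# A worked example of the layout computed above (illustrative only): indexing a
+# rank-2 input with the tuple (None, slice(None), None) gives none_c == 2, so the
+# result has rank 4 and its dimensions end up as
+#
+#   [1, dims[0], 1, dims[1]]
+#
+# i.e. every None inserts a dimension of size 1 and every default slice passes an
+# input dimension through unchanged.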
+
+
+@register_transformation_rule(BinConstraintT)
+def generate_binconstraint_t(constraint, counter):
+    """
+    Transform binary constraints for tensors
+    """
+
+    # precision constraints
+    if constraint.op == op_precision:
+        if constraint.lhs == Dyn:
+            return T(), counter
+        elif isinstance(constraint.lhs, TensorType):
+            is_fully_static = all(d != Dyn for d in constraint.lhs.__args__)
+            if is_fully_static:
+                return BinConstraintT(constraint.lhs, constraint.rhs, op_eq), counter
+            else:
+                new_dims = []
+
+                for _ in range(len(constraint.lhs.__args__)):
+                    dim, counter = gen_dvar(counter)
+                    new_dims.append(dim)
+
+                new_dim_constraints = [BinConstraintD(old_dim, new_dim, op_precision) for
+                                       new_dim, old_dim in zip(new_dims, constraint.lhs.__args__)] + \
+                                      [BinConstraintT(constraint.rhs, TensorType(new_dims), op_eq)] + \
+                                      [BinConstraintD(1, new_dim, op_leq) for
+                                       new_dim in new_dims]
+                return Conj(new_dim_constraints), counter
+
+    # matching
+    elif constraint.op == op_matching:
+        assert isinstance(constraint.rhs, TensorType)
+        d1 = constraint.rhs.__args__[0]
+        d2 = constraint.rhs.__args__[1]
+        d3 = constraint.rhs.__args__[2]
+        d4 = constraint.rhs.__args__[3]
+
+        conj = [BinConstraintT(constraint.lhs, Dyn, op_eq),
+                BinConstraintD(d1, Dyn, op_eq),
+                BinConstraintD(d2, Dyn, op_eq),
+                BinConstraintD(d3, Dyn, op_eq),
+                BinConstraintD(d4, Dyn, op_eq)]
+        return Disj([Conj(conj),
+                     BinConstraintT(constraint.lhs, TensorType([d1, d2, d3, d4]), op_eq)]), counter
+
+    elif constraint.op == op_consistency:
+        c_dyn = Disj([BinConstraintT(constraint.lhs, Dyn, op_eq), BinConstraintT(constraint.rhs, Dyn, op_eq)])
+        [c_tensor_1, c_tensor_2, c_tensor_3, c_tensor_4], counter = gen_consistency_constraints(constraint, counter)
+
+        return Disj([c_dyn, c_tensor_1, c_tensor_2, c_tensor_3, c_tensor_4]), counter
+
+    elif constraint.op == op_leq:
+        assert isinstance(constraint.rhs, int)
+        disj = [BinConstraintT(constraint.lhs, Dyn, op_eq)]
+        for i in range(1, constraint.rhs + 1):
+            dims = []
+            for j in range(1, i + 1):
+                dim_var, counter = gen_dvar(counter)
+                dims.append(dim_var)
+            disj.append(BinConstraintT(constraint.lhs, TensorType(dims), op_eq))
+        return Disj(disj), counter
+    else:
+        return constraint, counter
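+
+# A small sketch of the precision case above (illustrative only): for a constraint
+# of the form  TensorType([2, Dyn]) ⊑ x  the left-hand side is not fully static, so
+# two fresh dimension variables n1, n2 are generated and the result is roughly
+#
+#   Conj([BinConstraintD(2, n1, op_precision),
+#         BinConstraintD(Dyn, n2, op_precision),
+#         BinConstraintT(x, TensorType([n1, n2]), op_eq),
+#         BinConstraintD(1, n1, op_leq),
+#         BinConstraintD(1, n2, op_leq)])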
+
+
+@register_transformation_rule(BinConstraintD)
+def generate_binconstraint_d(constraint, counter):
+    """
+    Transform binary constraints for dimensions
+    """
+    if constraint.op == op_precision:
+        if isinstance(constraint.lhs, int):
+            return BinConstraintD(constraint.lhs, constraint.rhs, op_eq), counter
+        elif constraint.lhs == Dyn:
+            return T(), counter
+
+    elif constraint.op == op_consistency:
+        return Disj([BinConstraintD(constraint.lhs, constraint.rhs, op_eq),
+                     BinConstraintD(constraint.rhs, Dyn, op_eq), BinConstraintD(constraint.lhs, Dyn, op_eq)]), counter
+
+    else:
+        return constraint, counter
+
+
+@register_transformation_rule(Conj)
+def generate_conj(constraint, counter):
+    """
+    Transform conjunctions
+    """
+    new = []
+    for c in constraint.conjucts:
+        new_c, counter = transform_constraint(c, counter)
+        new.append(new_c)
+    return Conj(new), counter
+
+
+@register_transformation_rule(Disj)
+def generate_disj(constraint, counter):
+    """
+    Transform disjunctions
+    """
+    new = []
+    for c in constraint.disjuncts:
+        new_c, counter = transform_constraint(c, counter)
+        new.append(new_c)
+    return Disj(new), counter
+
+
+@register_transformation_rule(TGreatestUpperBound)
+def generate_gub(constraint, counter):
+    """
+    Transform greatest upper bound for tensors. Results in equality and Greatest Upper Bound
+    on dimensions
+    """
+    c1 = Conj([Disj([BinConstraintT(constraint.rhs1, Dyn, op_eq),
+                     BinConstraintT(constraint.rhs2, Dyn, op_eq)]), BinConstraintT(constraint.res, Dyn, op_eq)])
+
+    [c2, c3, c4, c5], counter = gen_greatest_upper_bound(constraint, counter)
+
+    return Disj([c1, c2, c3, c4, c5]), counter
+
+
+@register_transformation_rule(DGreatestUpperBound)
+def generate_d_gub(constraint, counter):
+    """
+    Transform greatest upper bound for dimensions into equality constraints
+    """
+    c1 = Conj([BinConstraintD(constraint.rhs1, Dyn, op_eq), BinConstraintD(constraint.res, constraint.rhs2, op_eq)])
+    c2 = Conj([BinConstraintD(constraint.rhs2, Dyn, op_eq), BinConstraintD(constraint.res, constraint.rhs1, op_eq)])
+    c3 = Conj([BinConstraintD(constraint.rhs2, constraint.rhs1, op_eq), BinConstraintD(constraint.res, constraint.rhs1, op_eq)])
+    return Disj([c1, c2, c3]), counter
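+
+# An illustrative reading of the three disjuncts above, as the usual gradual
+# greatest-upper-bound table for a pair of dimensions:
+#
+#   gub(Dyn, d) = d    (c1)
+#   gub(d, Dyn) = d    (c2)
+#   gub(d, d)   = d    (c3, when both sides agree)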
+
+
+@register_transformation_rule(CalcConv)
+def generate_calc_conv(constraint, counter):
+    """
+    Transform convolution constraints
+    """
+    d, counter = gen_tensor_dims(4, counter)
+    conv_result = TensorType([d[0], d[1], d[2], d[3]])
+
+    # the convolution result is a tensor of size 4
+    c1 = BinConstraintT(constraint.conv_result, conv_result, op_eq)
+
+    # the second dimension of the output is equal to the output channels
+    c2 = Conj([BinConstraintD(d[1], constraint.c_out, op_eq), BinConstraintD(d[1], Dyn, op_neq)])
+
+    # the input corresponds to the output in the first dimension of the convolution
+    c3 = BinConstraintD(constraint.matching_constraint[0], d[0], op_eq)
+
+    c4, c5 = calc_last_two_dims(constraint, d)
+
+    leq_constraints = Conj([BinConstraintD(0, d[0], op_leq),
+                            BinConstraintD(0, d[1], op_leq),
+                            BinConstraintD(0, d[2], op_leq),
+                            BinConstraintD(0, d[3], op_leq)])
+
+    return Conj([c1, c2, c3, c4, c5, leq_constraints]), counter
+
+
+@register_transformation_rule(CalcMaxPool)
+def generate_calc_maxpool(constraint, counter):
+    """
+    Transform maxpool constraints
+    """
+    d, counter = gen_tensor_dims(4, counter)
+    maxpool_result = TensorType([d[0], d[1], d[2], d[3]])
+
+    # the maxpool result is a tensor of size 4
+    c1 = BinConstraintT(constraint.maxpool_result, maxpool_result, op_eq)
+
+    # the input corresponds to the output in the first and second dimension of maxpool
+    c2 = BinConstraintD(constraint.matching_constraint[1], d[1], op_eq)
+    c3 = BinConstraintD(constraint.matching_constraint[0], d[0], op_eq)
+    c4, c5 = calc_last_two_dims(constraint, d)
+
+    leq_constraints = Conj([BinConstraintD(0, d[0], op_leq),
+                            BinConstraintD(0, d[1], op_leq),
+                            BinConstraintD(0, d[2], op_leq),
+                            BinConstraintD(0, d[3], op_leq)])
+
+    return Conj([c1, c2, c3, c4, c5, leq_constraints]), counter
+
+
+@register_transformation_rule(CalcProduct)
+def generate_calc_product(constraint, counter):
+    """
+    Transform flatten constraints
+    """
+    start = constraint.start
+    end = constraint.end
+    dims = constraint.dims_to_flatten
+    flattened = constraint.flattened
+    n = len(constraint.dims_to_flatten)
+
+    # evaluated eagerly here, since start and end are concrete integers
+    boundary_check = (0 <= start < end <= n)
+
+    c_boundary = T() if boundary_check else F()
+
+    lhs = dims[0:start]
+    rhs = dims[end:]
+    mid = dims[start:end]
+
+    all_possibilities = generate_all_int_dyn_dim_possibilities(mid)
+
+    all_constraints = []
+
+    for p in all_possibilities:
+        p = list(p)
+        # this tells us there is a dynamic variable
+        contains_dyn = not all(constraint.op == op_neq for constraint in p)
+        if contains_dyn:
+            mid_var = [Dyn]
+            total_constraints = lhs + mid_var + rhs
+            if len(total_constraints) > 4:
+                all_constraints.append(F())
+            else:
+                all_constraints.append(Conj([BinConstraintT(flattened, TensorType(lhs + mid_var + rhs), op_eq)] + p))
+        else:
+            new_var, counter = gen_dvar(counter)
+            mid_eq_prod = Conj([BinConstraintD(new_var, Prod(mid), op_eq), BinConstraintD(new_var, Dyn, op_neq)])
+            mid_var = [new_var]
+            total_constraints = lhs + mid_var + rhs
+            if len(total_constraints) > 4:
+                all_constraints.append(F())
+            else:
+                all_constraints.append(Conj([BinConstraintT(flattened, TensorType(lhs + mid_var + rhs), op_eq), mid_eq_prod] + p))
+
+    return Conj([Disj(all_constraints), c_boundary]), counter
+
+
+@register_transformation_rule(CanReshape)
+def generate_reshape(constraint, counter):
+    """
+    Transform reshape constraints
+    """
+    d, counter = gen_tensor_dims(4, counter)
+
+    d1 = d[0]
+    d2 = d[1]
+    d3 = d[2]
+    d4 = d[3]
+
+    target = constraint.target.__args__
+
+    is_fully_static = all(d != Dyn for d in target)
+
+    # dynamic tensor
+    c1_dyn = BinConstraintT(constraint.src, Dyn, op_eq)
+    c2_tensor1 = BinConstraintT(constraint.src, TensorType([d1]), op_eq)
+    c2_tensor2 = BinConstraintT(constraint.src, TensorType([d1, d2]), op_eq)
+    c2_tensor3 = BinConstraintT(constraint.src, TensorType([d1, d2, d3]), op_eq)
+    c2_tensor4 = BinConstraintT(constraint.src, TensorType([d1, d2, d3, d4]), op_eq)
+
+    d1_eq_dyn = BinConstraintD(d1, Dyn, op_eq)
+    d1_neq_dyn = BinConstraintD(d1, Dyn, op_neq)
+
+    d2_eq_dyn = BinConstraintD(d2, Dyn, op_eq)
+    d2_neq_dyn = BinConstraintD(d2, Dyn, op_neq)
+
+    d3_eq_dyn = BinConstraintD(d3, Dyn, op_eq)
+    d3_neq_dyn = BinConstraintD(d3, Dyn, op_neq)
+
+    d4_eq_dyn = BinConstraintD(d4, Dyn, op_eq)
+    d4_neq_dyn = BinConstraintD(d4, Dyn, op_neq)
+
+    nat_d1 = BinConstraintD(0, d1, op_leq)
+    nat_d2 = BinConstraintD(0, d2, op_leq)
+    nat_d3 = BinConstraintD(0, d3, op_leq)
+    nat_d4 = BinConstraintD(0, d4, op_leq)
+
+    if is_fully_static:
+        # size 1 tensor
+        c3_tensor1 = Disj([d1_eq_dyn,
+                           (Conj([d1_neq_dyn,
+                                  BinConstraintD(d1, Prod(target), op_eq)]))])
+        all_tensor_1 = Conj([c2_tensor1, c3_tensor1])
+
+        # size 2 tensor
+        all_tensor_2 = Conj([c2_tensor2, gen_all_reshape_possibilities([d1, d2], target)])
+
+        # size 3 tensor
+        all_tensor_3 = Conj([c2_tensor3, gen_all_reshape_possibilities([d1, d2, d3], target)])
+
+        # size 4 tensor
+        all_tensor_4 = Conj([c2_tensor4, gen_all_reshape_possibilities([d1, d2, d3, d4], target)])
+
+        return Conj([Disj([c1_dyn, all_tensor_1, all_tensor_2, all_tensor_3, all_tensor_4]),
+                     nat_d1, nat_d2, nat_d3, nat_d4]), counter
+
+    # then there must be exactly one occurrence of dyn
+    else:
+        new_target = []
+
+        for n in target:
+            if n != Dyn:
+                new_target.append(n)
+
+        # tensor 1
+        c3_tensor1 = Disj([d1_eq_dyn,
+                           (Conj([d1_neq_dyn,
+                                  is_dim_div_by_target(new_target, d1)]))])
+        all_tensor_1 = Conj([c2_tensor1, c3_tensor1])
+
+        # tensor 2
+        c21 = Disj([d1_eq_dyn, d2_eq_dyn])
+        c22 = Conj([d1_neq_dyn, d2_neq_dyn, is_dim_div_by_target(new_target, Prod([d1, d2]))])
+        all_tensor_2 = Conj([c2_tensor2, Disj([c21, c22])])
+
+        # tensor 3
+        c31 = Disj([d1_eq_dyn, d2_eq_dyn, d3_eq_dyn])
+        c32 = Conj([d1_neq_dyn, d2_neq_dyn, d3_neq_dyn, is_dim_div_by_target(new_target, Prod([d1, d2, d3]))])
+        all_tensor_3 = Conj([c2_tensor3, Disj([c31, c32])])
+
+        # tensor 4
+        c41 = Disj([d1_eq_dyn, d2_eq_dyn, d3_eq_dyn, d4_eq_dyn])
+        c42 = Conj([d1_neq_dyn, d2_neq_dyn, d3_neq_dyn, d4_neq_dyn, is_dim_div_by_target(new_target, Prod([d1, d2, d3, d4]))])
+        all_tensor_4 = Conj([c2_tensor4, Disj([c41, c42])])
+
+        return Conj([Disj([c1_dyn, all_tensor_1, all_tensor_2, all_tensor_3, all_tensor_4]),
+                     nat_d1, nat_d2, nat_d3, nat_d4]), counter
+
+
+@register_transformation_rule(ApplyBroadcasting)
+def generate_broadcasting(constraint, counter):
+    """
+    Transform broadcasting constraints
+    """
+    e11, e12 = constraint.res1, constraint.res2
+    e1, e2 = constraint.input1, constraint.input2
+
+    e1_dyn = BinConstraintT(e1, Dyn, op_eq)
+    e2_dyn = BinConstraintT(e2, Dyn, op_eq)
+
+    # Introduce dimensions
+    e1_equal_e11 = BinConstraintT(e1, e11, op_eq)
+    e2_equal_e12 = BinConstraintT(e2, e12, op_eq)
+
+    # dyn possibility
+    e1_dyn_constraint = Conj([e1_dyn, e1_equal_e11, e2_equal_e12])
+    e2_dyn_constraint = Conj([e2_dyn, e1_equal_e11, e2_equal_e12])
+
+    # tensor possibility
+    # generate dimensions to create tensors of size 1
+    final_tensor_1_constraint, _, _, nat_dims_1, counter = \
+        gen_broadcasting_constraints(e1, e2, e11, e12, 1, counter)
+
+    # generate dimensions to create tensors of size 2
+    final_tensor_2_constraint_no_padding, final_tensor_2_constraint_padding_arg1, \
+        final_tensor_2_constraint_padding_arg2, nat_dims_2, counter = \
+        gen_broadcasting_constraints(e1, e2, e11, e12, 2, counter)
+
+    # generate dimensions to create tensors of size 3
+    final_tensor_3_constraint_no_padding, final_tensor_3_constraint_padding_arg1, \
+        final_tensor_3_constraint_padding_arg2, nat_dims_3, counter = \
+        gen_broadcasting_constraints(e1, e2, e11, e12, 3, counter)
+
+    # generate dimensions to create tensors of size 4
+    final_tensor_4_constraint_no_padding, final_tensor_4_constraint_padding_arg1, \
+        final_tensor_4_constraint_padding_arg2, nat_dims_4, counter = \
+        gen_broadcasting_constraints(e1, e2, e11, e12, 4, counter)
+
+    final_result = Disj([
+        e1_dyn_constraint,
+        e2_dyn_constraint,
+        final_tensor_1_constraint,
+        final_tensor_2_constraint_no_padding,
+        final_tensor_2_constraint_padding_arg1,
+        final_tensor_2_constraint_padding_arg2,
+        final_tensor_3_constraint_no_padding,
+        final_tensor_3_constraint_padding_arg1,
+        final_tensor_3_constraint_padding_arg2,
+        final_tensor_4_constraint_no_padding,
+        final_tensor_4_constraint_padding_arg1,
+        final_tensor_4_constraint_padding_arg2
+    ])
+
+    return Conj([final_result, *nat_dims_1, *nat_dims_2, *nat_dims_3, *nat_dims_4]), counter
+
+
+def transform_constraint(constraint: Constraint, counter: int):
+    """
+    Transforms a constraint into a simpler constraint.
+    Ex: precision and consistency are transformed to equality
+    Args:
+        constraint: constraint to be transformed
+        counter: for variable tracking
+
+    Returns: Constraint
+
+    """
+    if type(constraint) in _TRANSFORMATION_RULES:
+        return _TRANSFORMATION_RULES[type(constraint)](constraint, counter)
+
+    else:
+        return constraint, counter
+
+
+def calc_last_two_dims(constraint, d: List[DVar]):
+    """
+    Generates constraints for the last two dimensions of a convolution or a maxpool output
+    Args:
+        constraint: CalcConv or CalcMaxPool
+        d: The list of output dimensions
+
+    Returns: Constraints for calculating the last two dimensions of the output
+
+    """
+
+    assert isinstance(constraint, (CalcConv, CalcMaxPool))
+
+    b3 = constraint.matching_constraint[2]
+    b4 = constraint.matching_constraint[3]
+
+    b3_dyn = Conj([BinConstraintD(d[2], Dyn, op_eq), BinConstraintD(b3, Dyn, op_eq)])
+    b4_dyn = Conj([BinConstraintD(d[3], Dyn, op_eq), BinConstraintD(b4, Dyn, op_eq)])
+
+    d3_not_dyn = Conj([BinConstraintD(d[2], Dyn, op_neq), BinConstraintD(b3, Dyn, op_neq)])
+    d4_not_dyn = Conj([BinConstraintD(d[3], Dyn, op_neq), BinConstraintD(b4, Dyn, op_neq)])
+
+    # transform parameters into tuples in case they are not already
+    padding = (constraint.padding, constraint.padding) \
+        if isinstance(constraint.padding, int) else constraint.padding
+    kernel = (constraint.kernel, constraint.kernel) \
+        if isinstance(constraint.kernel, int) else constraint.kernel
+    stride = (constraint.stride, constraint.stride) \
+        if isinstance(constraint.stride, int) else constraint.stride
+    dilation = (constraint.dilation, constraint.dilation) \
+        if isinstance(constraint.dilation, int) else constraint.dilation
+
+    f1 = BinConstraintD(b3, BinConstraintD(2, padding[0], op_mul), op_add)
+    f2 = BinConstraintD(dilation[0], BinConstraintD(kernel[0], 1, op_sub), op_mul)
+    f3 = BinConstraintD(BinConstraintD(BinConstraintD(f1, f2, op_sub), 1, op_sub), stride[0], op_div)
+    f4 = BinConstraintD(f3, 1, op_add)
+
+    c4 = Disj([b3_dyn, Conj([d3_not_dyn, BinConstraintD(d[2], f4, op_eq)])])
+
+    f11 = BinConstraintD(b4, BinConstraintD(2, padding[1], op_mul), op_add)
+    f22 = BinConstraintD(dilation[1], BinConstraintD(kernel[1], 1, op_sub), op_mul)
+    f33 = BinConstraintD(BinConstraintD(BinConstraintD(f11, f22, op_sub), 1, op_sub), stride[1], op_div)
+    f44 = BinConstraintD(f33, 1, op_add)
+
+    c5 = Disj([b4_dyn, Conj([d4_not_dyn, BinConstraintD(d[3], f44, op_eq)])])
+
+    return c4, c5
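+
+# The nested BinConstraintD terms above encode the standard output-size arithmetic
+# (shown here for the third dimension; f11..f44 handle the fourth symmetrically):
+#
+#   d[2] = (b3 + 2 * padding[0] - dilation[0] * (kernel[0] - 1) - 1) / stride[0] + 1
+#
+# which is the familiar Conv2d / MaxPool2d shape formula, with the division carried
+# out over solver integers.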
+
+
+def generate_all_int_dyn_dim_possibilities(my_list: List[DVar]):
+    """
+    Generate all possibilities of being equal or not equal to dyn for my_list
+    Args:
+        my_list: List of tensor dimensions
+
+    Returns: A list of a list of constraints. Each list of constraints corresponds to
+    one possibility about the values of the dimension variables
+    """
+    # generate all possibilities of being equal or not equal to dyn for my_list
+    eq_possibilities = [BinConstraintD(my_list[i], Dyn, op_eq) for i in range(len(my_list))]
+    neq_possibilities = [BinConstraintD(my_list[i], Dyn, op_neq) for i in range(len(my_list))]
+    d_possibilities = []
+
+    for i in zip(eq_possibilities, neq_possibilities):
+        d_possibilities.append(list(i))
+    all_possibilities = list(itertools.product(*d_possibilities))
+    return all_possibilities
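+
+# For example, for my_list = [d1, d2] the cross product above yields four cases
+# (written informally):
+#
+#   (d1 = Dyn,  d2 = Dyn)
+#   (d1 = Dyn,  d2 != Dyn)
+#   (d1 != Dyn, d2 = Dyn)
+#   (d1 != Dyn, d2 != Dyn)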
+
+
+def is_target_div_by_dim(target: List[int], dim: List[DVar]):
+    """
+    Generate constraints to check if the product of the target dimensions is divisible by the input dimension
+    Args:
+        target: Target dimensions
+        dim: Input dimensions
+
+    Returns: Constraints to check divisibility
+
+    """
+    return BinConstraintD(BinConstraintD(Prod(target), dim, op_mod), 0, op_eq)
+
+
+def is_dim_div_by_target(target: List[int], dim: List[DVar]):
+    """
+    Generate constraints to check if the input dimension is divisible by the product of the target dimensions
+    Args:
+        target: Target dimensions
+        dim:  Input dimensions
+
+    Returns: Constraints to check divisibility
+
+    """
+    return BinConstraintD(BinConstraintD(dim, Prod(target), op_mod), 0, op_eq)
+
+
+def gen_all_reshape_possibilities(list_of_dims, target):
+    """
+    Consider all possibilities for what the input dimensions could be (a number or dynamic),
+    then generate the appropriate constraints using multiplication or mod depending on the possibility.
+    The possibilities we consider here are the cross product of each input dimension being
+    equal or not equal to Dyn. The target is fixed because at most one of its dimensions
+    can be Dyn, and we handle those cases separately.
+
+    Args:
+        list_of_dims: The input list of dimensions
+        target: The tensor we want to reshape to
+
+    Returns: A disjunction of transformed reshape constraints
+
+    """
+    all_possibilities = generate_all_int_dyn_dim_possibilities(list_of_dims)
+
+    all_constraints = []
+
+    for p in all_possibilities:
+        to_multiply = []
+
+        p = list(p)
+
+        for constraint in p:
+            assert isinstance(constraint, BinConstraintD)
+            if constraint.op == op_neq:
+                to_multiply.append(constraint.lhs)
+
+        if not to_multiply:
+            all_constraints.append(Conj(p))
+
+        elif len(to_multiply) < len(list_of_dims):
+            all_constraints.append(Conj(p + [is_target_div_by_dim(target, Prod(to_multiply))]))
+        else:
+            all_constraints.append(Conj(p + [BinConstraintD(Prod(list_of_dims),
+                                                            Prod(target), op_eq)]))
+
+    return Disj(all_constraints)
+
+
+def broadcast_dim(tensor_input1, tensor_input2, res1, res2, index, padding=False):
+    """
+    Apply broadcasting to the 'index' dimension of tensor_input1.
+    Args:
+        tensor_input1: should represent [d1, ..., d_index, ...] where d_index = 1
+        tensor_input2: represents the second input
+        res1: broadcasted result 1
+        res2: broadcasted result 2
+        index: the index to broadcast
+        padding: If padding was used, then tensor_input1[index] does not exist
+
+    Returns: Constraints broadcasting dimension 'index' of tensor_input1 against tensor_input2
+
+    """
+    if tensor_input1[index] is None:
+        assert padding
+
+
+    if not padding:
+        # then the inputs are the same length so they all have dimensions at "index"
+        return Conj([BinConstraintD(tensor_input1[index], 1, op_eq),
+                     BinConstraintD(res1[index], res2[index], op_eq),
+                     BinConstraintD(res2[index], tensor_input2[index], op_eq)])
+
+    else:
+        # we don't set the input dimension to 1, since it doesn't exist.
+        return Conj([BinConstraintD(res1[index], res2[index], op_eq),
+                     BinConstraintD(res2[index], tensor_input2[index], op_eq)])
+
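+# An illustrative reading of the no-padding case above: if tensor_input1 has size 1
+# at 'index', both broadcasted results take tensor_input2's size there; e.g.
+# broadcasting [1, 3] against [5, 3] at index 0 forces res1[0] = res2[0] = 5.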
+
+def apply_padding(e1_var: TVar,
+                  e11: BinConstraintT,
+                  e2: BinConstraintT,
+                  e12: BinConstraintT,
+                  d2: List[DVar],
+                  d11: List[DVar],
+                  d12: List[DVar],
+                  counter: int):
+    """
+    We are considering the possibility where one input has fewer dimensions than
+    another input, so we apply padding to the broadcasted results
+
+    Args:
+        e1_var: Variable representing the first input where padding will be applied
+        e11: constraint of the form e11 = TensorType[d1, ..., dn]
+        e2:  constraint of the form e2 = TensorType[d1, ..., dn]
+        e12: constraint of the form e12 = TensorType[d1, ..., dn]
+        d2: Tensor variables for the second input
+        d11: Tensor variables for the broadcasted first input
+        d12: Tensor variables for the broadcasted second input
+        counter: variable tracking
+
+    Returns: A new constraint whose goal is to apply padding to the broadcasted result
+
+    """
+
+    res = []
+
+    # pad the shorter input with None so we can pass it to the broadcasting helper function
+    for i in range(1, len(d2)):
+
+        d1, counter = gen_tensor_dims(i, counter)
+
+        nat_constraints = gen_nat_constraints(d1 + d2 + d11 + d12)
+
+        e1 = BinConstraintT(e1_var, TensorType(d1), op_eq)
+
+        simulate_padding = [None] * (len(d2) - i)
+
+        assert len(simulate_padding + d1) == len(d2)
+
+        broadcast_padding = []
+
+        # for every padding size, we also consider broadcasting
+        for j in range(len(d2) - i):
+            broadcast_padding.append(broadcast_dim(simulate_padding, d2, d11, d12, j, True))
+
+        # we consider the possibilities for broadcasting for every dimension. Since we already
+        # padded d1, we do not consider it while broadcasting
+        all_broadcasting_possibilities = generate_all_broadcasting_possibilities_no_padding(d1,
+                                                                                            d2[(len(d2) - i):],
+                                                                                            d11[(len(d2) - i):],
+                                                                                            d12[(len(d2) - i):])
+        # combine all constraints into a conjunction
+        c = Conj([e1, e11, e2, e12,
+                  *broadcast_padding,
+                  all_broadcasting_possibilities,
+                  *nat_constraints
+                  ])
+        res.append(c)
+
+    return Disj(res), counter
+
+
+def no_broadcast_dim_with_index(d1: List[DVar],
+                                d2: List[DVar],
+                                d3: List[DVar],
+                                d4: List[DVar],
+                                i: int):
+    """
+    Args:
+        d1: input 1
+        d2: input 2
+        d3: simulated broadcasting for input 1
+        d4: simulated broadcasting for input 2
+        i: the index of the dimension being checked
+
+    Returns: Constraints for when no broadcasting occurs
+    """
+    return Conj([
+        Disj([
+            Conj([BinConstraintD(d1[i], 1, op_eq),
+                  BinConstraintD(d2[i], 1, op_eq)]),
+
+            Conj([BinConstraintD(d1[i], 1, op_neq),
+                  BinConstraintD(d2[i], 1, op_neq)])]),
+
+        BinConstraintD(d1[i], d3[i], op_eq),
+        BinConstraintD(d2[i], d4[i], op_eq)])
+
+
+def gen_lists_of_dims(num_tensors: int, dim_size: int, counter: int):
+    """
+    Generate lists of DVar to represent tensor dimensions
+    Args:
+        num_tensors: the required number of tensors
+        dim_size: the number of dimensions for each tensor
+        counter: variable tracking
+
+    Returns: A list of a list of tensor dimensions
+
+    """
+    res = []
+
+    for _ in range(num_tensors):
+        dims, counter = gen_tensor_dims(dim_size, counter)
+        res.append(dims)
+
+    return res, counter
+
+
+def create_equality_constraints_for_broadcasting(e1: TVar,
+                                                 e2: TVar,
+                                                 e11: TVar,
+                                                 e12: TVar,
+                                                 d1: List[DVar],
+                                                 d2: List[DVar],
+                                                 d11: List[DVar],
+                                                 d12: List[DVar]):
+    """
+    Create equality constraints for when no broadcasting occurs
+    Args:
+        e1: Input 1
+        e2: Input 2
+        e11: Broadcasted input 1
+        e12: Broadcasted input 2
+        d1: Variables that store dimensions for e1
+        d2: Variables that store dimensions for e2
+        d11: Variables that store dimensions for e11
+        d12: Variables that store dimensions for e12
+
+    Returns: Four equality constraints
+
+    """
+
+    e1_tensor = BinConstraintT(e1, TensorType(d1), op_eq)
+    e11_tensor = BinConstraintT(e11, TensorType(d11), op_eq)
+    e2_tensor = BinConstraintT(e2, TensorType(d2), op_eq)
+    e12_tensor = BinConstraintT(e12, TensorType(d12), op_eq)
+    return [e1_tensor, e11_tensor, e2_tensor, e12_tensor]
+
+
+def gen_consistency_constraints(constraint: Constraint, counter: int):
+    """
+    Args:
+        constraint: Consistency constraint on tensors
+        counter: for variable tracking
+
+    Returns: Equality and consistency constraints on dimensions
+
+    """
+
+    all_constraints = []
+
+    for i in range(1, MAX_TENSOR_RANK + 1):
+        new_dims_rhs_1, counter = gen_tensor_dims(i, counter)
+        new_dims_rhs_2, counter = gen_tensor_dims(i, counter)
+
+        nat_constraints = gen_nat_constraints(new_dims_rhs_1 + new_dims_rhs_2)
+
+        c_tensor_i = Conj([BinConstraintT(constraint.lhs, TensorType(new_dims_rhs_1), op_eq),
+                           BinConstraintT(constraint.rhs, TensorType(new_dims_rhs_2), op_eq)] +
+                          [BinConstraintD(d1, d2, op_consistency) for
+                           d1, d2 in zip(new_dims_rhs_1, new_dims_rhs_2)] + nat_constraints)
+
+        all_constraints.append(c_tensor_i)
+
+    return all_constraints, counter
+
+
+def gen_greatest_upper_bound(constraint: TGreatestUpperBound, counter: int):
+    """
+    Args:
+        constraint: Greatest upper bound on tensors
+        counter: variable tracking
+
+    Returns: A set of equality constraints and DGreatestUpperBound constraints
+
+    """
+
+    all_constraints = []
+
+    for i in range(1, MAX_TENSOR_RANK + 1):
+        c = []
+        dims1, counter = gen_tensor_dims(i, counter)
+        c1tensor = TensorType(dims1)
+
+        dims2, counter = gen_tensor_dims(i, counter)
+        c2tensor = TensorType(dims2)
+
+        dims3, counter = gen_tensor_dims(i, counter)
+        c3tensor = TensorType(dims3)
+
+        c += [BinConstraintT(constraint.rhs1, c1tensor, op_eq),
+              BinConstraintT(constraint.rhs2, c2tensor, op_eq),
+              BinConstraintT(constraint.res, c3tensor, op_eq)] + \
+            gen_nat_constraints(dims1 + dims2 + dims3)
+
+        assert len(c3tensor.__args__) == len(c1tensor.__args__) == len(c2tensor.__args__)
+        for i in range(len(c3tensor.__args__)):
+            c.append(DGreatestUpperBound(c3tensor.__args__[i],
+                                         c1tensor.__args__[i],
+                                         c2tensor.__args__[i]))
+
+        all_constraints.append(Conj(c))
+    return all_constraints, counter
+
+
+def generate_all_broadcasting_possibilities_no_padding(d1: List[DVar], d2: List[DVar], d11: List[DVar], d12: List[DVar]):
+    """
+    Generate broadcasting constraints assuming no padding. Broadcasting can happen at any dimension.
+    We look at all combinations for all dimensions in d1 and d2
+    Args:
+        d1: input1 dimensions
+        d2: input2 dimensions
+        d11: broadcasted input1 dimensions
+        d12: broadcasted input2 dimensions
+
+    Returns: broadcasting constraints relating the input dimensions to the broadcasted dimensions
+
+    """
+
+    size = len(d1)
+
+    res2 = []
+
+    for i in range(size):
+        t1 = broadcast_dim(d1, d2, d11, d12, i)
+        t2 = broadcast_dim(d2, d1, d12, d11, i)
+        t3 = no_broadcast_dim_with_index(d1, d2, d11, d12, i)
+
+        res2.append(Disj([t1, t2, t3]))
+
+    return Conj(res2)
+
+
+def gen_broadcasting_constraints(e1: TVar, e2: TVar, e11: TVar, e12: TVar, i: int, counter: int):
+    """
+    Simulates broadcasting on e1 and e2 and returns the results
+    respectively in e11 and e12. Because of gradual types,
+    e1 and e2 may not be equal. Similarly, e11 and e12 may not
+    be equal. e11 and e12 should be guaranteed to be consistent
+    as they represent the shapes of the tensors to be added after
+    broadcasting.
+    Args:
+        e1: TVar representing the type of input 1
+        e2: TVar representing the type of input 2
+        e11: TVar representing the broadcasted input 1
+        e12: TVar representing the broadcasted input 2
+        i: The rank of the resulting type of addition
+        counter: for variable tracking
+
+    Returns: Simplified broadcasting constraints
+
+    """
+    dims, counter = gen_lists_of_dims(4, i, counter)
+    [d1, d2, d3, d4] = dims
+    nat_dims_i = gen_nat_constraints(list(itertools.chain.from_iterable(dims)))
+
+    initialize_tensors_constraints = create_equality_constraints_for_broadcasting(e1, e2, e11, e12,
+                                                                                  d1, d2, d3, d4)
+
+    [e1_tensor, e11_tensor, e2_tensor, e12_tensor] = initialize_tensors_constraints
+
+    # without padding, broadcast all possibilities for tensors of size i
+    final_tensor_constraint_no_padding = Conj([*initialize_tensors_constraints,
+                                               generate_all_broadcasting_possibilities_no_padding(d1, d2, d3, d4)])
+
+    # with padding, broadcast all possibilities for tensors of size i
+    final_tensor_constraint_padding_arg1, counter = \
+        apply_padding(e1, e11_tensor, e2_tensor, e12_tensor, d2, d3, d4, counter)
+
+    final_tensor_constraint_padding_arg2, counter = \
+        apply_padding(e2, e12_tensor, e1_tensor, e11_tensor, d1, d4, d3, counter)
+
+    return final_tensor_constraint_no_padding, \
+        final_tensor_constraint_padding_arg1, \
+        final_tensor_constraint_padding_arg2, nat_dims_i, counter
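+
+
+# A minimal sketch of how the rules in this file are applied (illustrative only;
+# t1 and t2 stand for tensor variables produced by gen_tvar):
+#
+#   c = BinConstraintT(t1, t2, op_consistency)
+#   simplified, counter = transform_constraint(c, counter)
+#   # 'simplified' is now a Disj over the Dyn case and the rank-1..4 cases, and can
+#   # be transformed again until a fixed point is reached.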
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/operation.py b/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/operation.py
new file mode 100644
index 0000000000000000000000000000000000000000..01b86d3ced1b0c0e056349c384a3c0bce10de823
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/operation.py
@@ -0,0 +1,14 @@
+op_add = '+'
+op_sub = '-'
+op_mul = '*'
+op_div = '/'
+op_eq = '='
+op_neq = '!='
+op_imp = '=>'
+op_matching = '⊳'
+op_consistency = '~'
+op_precision = '⊑'
+op_leq = '≤'
+op_lt = '<'
+op_gt = '>'
+op_mod = '%'
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/transform_to_z3.py b/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/transform_to_z3.py
new file mode 100644
index 0000000000000000000000000000000000000000..9376ef7eb7ceb7d63bcb2b22af0857b24a6958a0
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/transform_to_z3.py
@@ -0,0 +1,348 @@
+from torch.fx.experimental.migrate_gradual_types.constraint import Conj, Disj, T, F, BinConstraintT, BVar, is_bool_expr
+from torch.fx.experimental.migrate_gradual_types.constraint import BinConstraintD, TVar, DVar
+from torch.fx.experimental.migrate_gradual_types.constraint import Prod, is_algebraic_expression, is_dim
+from torch.fx.experimental.migrate_gradual_types.constraint_generator import ConstraintGenerator
+from torch.fx.experimental.migrate_gradual_types.constraint_transformation import transform_constraint
+from torch.fx.experimental.migrate_gradual_types.operation import op_add, op_eq, op_neq, op_gt, op_lt
+from torch.fx.experimental.migrate_gradual_types.operation import op_leq, op_sub, op_div, op_mul, op_mod
+from torch.fx.tensor_type import TensorType, Dyn
+
+try:
+    import z3  # type: ignore[import]
+    from torch.fx.experimental.migrate_gradual_types.z3_types import tensor_type, z3_dyn, D
+    HAS_Z3 = True
+
+    def transform_to_z3(constraint, counter, dimension_dict):
+        if isinstance(constraint, Conj):
+            conjuncts = []
+            for c in constraint.conjucts:
+                new_c, counter = transform_to_z3(c, counter, dimension_dict)
+                conjuncts.append(new_c)
+            return z3.And(conjuncts), counter
+
+        elif isinstance(constraint, Disj):
+            disjuncts = []
+            for c in constraint.disjuncts:
+                new_c, counter = transform_to_z3(c, counter, dimension_dict)
+                disjuncts.append(new_c)
+            return z3.Or(disjuncts), counter
+
+        elif isinstance(constraint, T):
+            return True, counter
+
+        elif isinstance(constraint, F):
+            return False, counter
+
+        elif isinstance(constraint, BinConstraintT):
+            if constraint.op == op_eq:
+                lhs, counter = transform_var(constraint.lhs, counter, dimension_dict)
+                rhs, counter = transform_var(constraint.rhs, counter, dimension_dict)
+                return (lhs == rhs), counter
+
+            else:
+                raise NotImplementedError('Method not yet implemented')
+
+        elif isinstance(constraint, BinConstraintD):
+            if constraint.op == op_eq:
+
+                if isinstance(constraint.lhs, BVar) and is_bool_expr(constraint.rhs):
+                    transformed_rhs, counter = transform_to_z3(constraint.rhs, counter, dimension_dict)
+                    transformed_lhs = z3.Bool(constraint.lhs.c)
+                    return transformed_lhs == transformed_rhs, counter
+
+                elif is_dim(constraint.lhs) and is_dim(constraint.rhs):
+                    # with dimension transformations we consider the encoding
+                    lhs, counter = transform_dimension(constraint.lhs, counter, dimension_dict)
+                    rhs, counter = transform_dimension(constraint.rhs, counter, dimension_dict)
+                    return lhs == rhs, counter
+
+                else:
+                    # then we have an algebraic expression which means that we disregard the
+                    # first element of the encoding
+                    lhs, counter = transform_algebraic_expression(constraint.lhs, counter, dimension_dict)
+                    rhs, counter = transform_algebraic_expression(constraint.rhs, counter, dimension_dict)
+                    return lhs == rhs, counter
+
+            # The assumption here is that the LHS and RHS must be dimensions
+            elif constraint.op == op_neq:
+                assert is_dim(constraint.lhs)
+                assert is_dim(constraint.rhs)
+                lhs, counter = transform_dimension(constraint.lhs, counter, dimension_dict)
+                rhs, counter = transform_dimension(constraint.rhs, counter, dimension_dict)
+                if constraint.rhs == Dyn or constraint.lhs == Dyn:
+                    if constraint.rhs == Dyn:
+                        return lhs.arg(0) == 1, counter
+                    elif constraint.lhs == Dyn:
+                        return rhs.arg(0) == 1, counter
+
+                # if one of the instances is a number
+                elif isinstance(constraint.lhs, int) or isinstance(constraint.rhs, int):
+                    if isinstance(constraint.lhs, int):
+                        return z3.Or([rhs.arg(0) == 0, z3.And([rhs.arg(0) == 1, lhs.arg(1) != rhs.arg(1)])]), counter
+
+                    elif isinstance(constraint.rhs, int):
+                        return z3.Or([lhs.arg(0) == 0, z3.And([lhs.arg(0) == 1, lhs.arg(1) != rhs.arg(1)])]), counter
+
+                else:
+                    return z3.Or([z3.And([lhs.arg(0) == 0, rhs.arg(0) != 0]),
+                                  z3.And([lhs.arg(0) != 0, rhs.arg(0) == 0]),
+                                  z3.And([lhs.arg(0) != 0, rhs.arg(0) != 0, lhs.arg(1) != rhs.arg(1)])]), counter
+
+
+            elif constraint.op == op_leq:
+                # if the dimensions are not dyn, this will come into effect
+                # there would have been another constraint specifying if a given dimension
+                # is dyn or not
+                assert is_dim(constraint.lhs) and is_dim(constraint.rhs)
+                lhs, counter = transform_algebraic_expression(constraint.lhs, counter, dimension_dict)
+                rhs, counter = transform_algebraic_expression(constraint.rhs, counter, dimension_dict)
+                return lhs <= rhs, counter
+
+            elif constraint.op == op_gt:
+                assert is_dim(constraint.lhs) and is_dim(constraint.rhs)
+                lhs, counter = transform_algebraic_expression(constraint.lhs, counter, dimension_dict)
+                rhs, counter = transform_algebraic_expression(constraint.rhs, counter, dimension_dict)
+                return lhs > rhs, counter
+
+            elif constraint.op == op_lt:
+                assert is_dim(constraint.lhs) and is_dim(constraint.rhs)
+                lhs, counter = transform_algebraic_expression(constraint.lhs, counter, dimension_dict)
+                rhs, counter = transform_algebraic_expression(constraint.rhs, counter, dimension_dict)
+                return lhs < rhs, counter
+
+            else:
+                raise NotImplementedError('operation not yet implemented')
+
+        else:
+            raise NotImplementedError('Operation not yet implemented')
+
+
+    def transform_var(tensor, counter, dimension_dict):
+        """
+        Transforms tensor variables to a format understood by z3
+        Args:
+            tensor: Tensor variable or a tensor type potentially with variable dimensions
+        Returns: Transformed variable to a z3 format
+
+        """
+        if isinstance(tensor, TensorType):
+            res = []
+            for t in tensor.__args__:
+                transformed, counter = transform_dimension(t, counter, dimension_dict)
+                res.append(transformed)
+
+            assert len(res) <= 4
+            if len(tensor.__args__) == 1:
+                return tensor_type.tensor1(res[0]), counter
+            elif len(tensor.__args__) == 2:
+                return tensor_type.tensor2(res[0], res[1]), counter
+            elif len(tensor.__args__) == 3:
+                return tensor_type.tensor3(res[0], res[1], res[2]), counter
+            elif len(tensor.__args__) == 4:
+                return tensor_type.tensor4(res[0], res[1], res[2], res[3]), counter
+
+        elif tensor == Dyn:
+            return z3_dyn, counter
+
+        elif isinstance(tensor, TVar):
+            return z3.Const(tensor.tvar, tensor_type), counter
+
+    def transform_dimension(dimension, counter, dimension_dict):
+        """
+        Takes a dimension variable or a number and transforms it to a tuple
+        according to our scheme
+        Args:
+            dimension: The dimension to be transformed
+            counter: variable tracking
+
+        Returns: the encoded dimension tuple and the current counter
+
+        """
+        if dimension == Dyn:
+            counter += 1
+            return D(0, z3.Int(counter)), counter
+        elif isinstance(dimension, int):
+            return D(1, dimension), counter
+        elif isinstance(dimension, DVar):
+            if dimension.c in dimension_dict:
+                return D(z3.Int(dimension_dict[dimension.c]), z3.Int(dimension.c)), counter
+            else:
+                counter += 1
+                dimension_dict[dimension.c] = counter
+                return D(z3.Int(counter), z3.Int(dimension.c)), counter
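+
+    # The encoding used above, spelled out (an illustrative summary):
+    #
+    #   Dyn         ->  D(0, <fresh z3 Int>)                   first component 0 marks "dynamic"
+    #   3           ->  D(1, 3)                                first component 1 marks "static"
+    #   DVar('d1')  ->  D(z3.Int(<flag index>), z3.Int('d1'))  the flag is itself a solver variable
+    #
+    # so later rules can branch on arg(0) to ask whether a dimension is Dyn.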
+
+
+    def transform_algebraic_expression(expr, counter, dimension_dict):
+        """
+        Transforms an algebraic expression to z3 format
+        Args:
+            expr: Either a dimension variable or an algebraic expression
+
+        Returns: the transformed expression
+
+        """
+        assert is_algebraic_expression(expr) or is_dim(expr)
+
+        if is_dim(expr):
+            transformed, counter = transform_dimension(expr, counter, dimension_dict)
+            return transformed.arg(1), counter
+
+        elif isinstance(expr, Prod):
+
+            dims = []
+            for dim in expr.products:
+                assert is_dim(dim)
+                d, counter = transform_dimension(dim, counter, dimension_dict)
+                dims.append(d.arg(1))
+            return z3.Product(dims), counter
+
+        elif is_algebraic_expression(expr):
+
+            lhs, counter = transform_algebraic_expression(expr.lhs, counter, dimension_dict)
+            rhs, counter = transform_algebraic_expression(expr.rhs, counter, dimension_dict)
+
+            if expr.op == op_sub:
+                c = lhs - rhs
+
+            elif expr.op == op_add:
+                c = lhs + rhs
+
+            elif expr.op == op_div:
+                c = lhs / rhs
+
+            elif expr.op == op_mul:
+                c = lhs * rhs
+
+            elif expr.op == op_mod:
+                c = lhs % rhs
+
+            else:
+                raise NotImplementedError('operation not yet implemented')
+
+            return c, counter
+
+        else:
+            raise RuntimeError
+
+
+    def transform_all_constraints(traced, counter=0):
+        """
+        Given a trace, generates constraints and transforms them to z3 format
+
+        """
+        dimension_dict = {}  # type: ignore[var-annotated]
+
+        generator = ConstraintGenerator(traced)
+        new_constraints, counter = generator.generate_constraints(counter)
+
+        # print(new_constraints.conjucts[0])
+        # print(*new_constraints.conjucts, sep='\n')
+
+        # transform precision, matching, consistency till obtaining a fixed point
+        new_constraints, counter = iterate_till_fixed_point(new_constraints, counter)
+        # print(new_constraints)
+        # print(new_constraints.conjucts)
+        # new_constraints.conjucts = new_constraints.conjucts[:-1]
+        # print(*new_constraints.conjucts, sep='\n')
+
+        transformed, counter = transform_to_z3(new_constraints, counter, dimension_dict)
+        # print(transformed)
+        return transformed
+
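+    # Illustrative usage sketch (not part of the original file); `traced` stands
+    # for a symbolically traced module accepted by ConstraintGenerator:
+    #
+    #   constraints = transform_all_constraints(traced)
+    #   s = z3.Solver()
+    #   s.add(constraints)
+    #   s.check()   # z3.sat if some assignment of shapes satisfies the traced IR
+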
+    def iterate_till_fixed_point(constraints, counter):
+        """
+        Transform constraints till reaching a fixed point
+        """
+        old_c = None
+        while old_c != constraints:
+            old_c = constraints
+            constraints, counter = transform_constraint(constraints, counter)
+        return constraints, counter
+
+    def transform_all_constraints_trace_time(tracer_root, graph, node, counter=0):
+        """
+        Takes a node and a graph and generates two sets of constraints.
+        One set contains the node's constraints and the other set
+        contains the negation of the node's constraints.
+        Args:
+            tracer_root: the root for getting the module instances
+            graph: the graph so far in the tracing process
+            node: node that represents a conditional
+            counter: variable tracking
+
+        Returns: Two sets of constraints. One with a conjunction with
+        the conditional constraint and the other with a conjunction with
+        its negation.
+
+        """
+        dimension_dict = {}  # type: ignore[var-annotated]
+
+        generator = ConstraintGenerator(tracer_root, graph)
+        new_constraints, counter = generator.generate_constraints(counter)
+
+        condition_constraint = new_constraints.conjucts[-1]
+
+        # we know the constraint is a conjunction where the last constraint is about the conditional
+        # so remove the last constraint
+        new_constraints.conjucts = new_constraints.conjucts[:-1]
+
+        # transform precision, matching, consistency till obtaining a fixed point
+        new_constraints, counter = iterate_till_fixed_point(new_constraints, counter)
+
+
+        # since the function returns a list of one element, we get the first element
+        # we are only interested in the RHS in this case because the LHS just stores
+        # the result
+
+        # we make sure the constraint is of the form:
+        # c = b where b is a boolean expression
+        # and we consider b (constraint.rhs) for transformation
+        assert isinstance(condition_constraint.lhs, BVar)
+        assert is_bool_expr(condition_constraint.rhs)
+        condition_constraint_rhs = condition_constraint.rhs
+
+        # transform the condition constraint
+        condition_constraint_rhs, counter = iterate_till_fixed_point(condition_constraint_rhs, counter)
+
+        transformed, counter = transform_to_z3(new_constraints, counter, dimension_dict)
+
+        transformed_condition_constraint, counter = transform_to_z3(condition_constraint_rhs, counter, dimension_dict)
+
+        negation_transformed_condition_constraint = z3.Not(transformed_condition_constraint)
+
+        return z3.And([transformed, transformed_condition_constraint]), \
+            z3.And([transformed, negation_transformed_condition_constraint])
+
+
+    def evaluate_conditional_with_constraints(tracer_root, graph, node, counter=0, user_constraints=None):
+        """
+        Given an IR and a node representing a conditional, evaluate the conditional
+        and its negation
+        Args:
+            tracer_root: Tracer root for module instances
+            graph: the graph traced so far
+            node: The node to be evaluated
+            counter: variable tracking
+            user_constraints: additional constraints provided by the user, if any
+
+        Returns: the results of checking the condition and its negation together with
+        the rest of the constraints
+
+        """
+
+        transformed_positive, transformed_negative = \
+            transform_all_constraints_trace_time(tracer_root, graph, node, counter)
+
+        s = z3.Solver()
+        s.add(transformed_positive)
+        if user_constraints is not None:
+            s.add(user_constraints)
+        condition = s.check()
+
+        s = z3.Solver()
+        s.add(transformed_negative)
+        if user_constraints is not None:
+            s.add(user_constraints)
+        negation = s.check()
+        return condition, negation
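+
+    # Illustrative sketch (not part of the original file): both return values are
+    # z3 check results, so a conditional is statically decidable when exactly one
+    # branch is satisfiable, e.g.:
+    #
+    #   cond, neg = evaluate_conditional_with_constraints(tracer_root, graph, node)
+    #   if cond == z3.sat and neg == z3.unsat:
+    #       pass  # the condition always holds under the generated constraints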
+
+except ImportError:
+    HAS_Z3 = False
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/util.py b/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..384d4a6c56bac39e7f341198bb881f5ecb52db62
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/util.py
@@ -0,0 +1,52 @@
+from torch.fx.experimental.migrate_gradual_types.constraint import TVar, DVar, BinConstraintD, \
+    BVar
+from torch.fx.experimental.migrate_gradual_types.operation import op_leq
+
+
+def gen_tvar(curr):
+    """
+    Generate a tensor variable
+    :param curr: The current counter
+    :return: a tensor variable and the updated counter
+    """
+    curr += 1
+    return TVar(curr), curr
+
+
+def gen_dvar(curr):
+    """
+    Generate a dimension variable
+    :param curr: the current counter
+    :return: a dimension variable and an updated counter
+    """
+    curr += 1
+    return DVar(curr), curr
+
+def gen_bvar(curr):
+    """
+    Generate a boolean variable
+    :param curr: the current counter
+    :return: a boolean variable and an updated counter
+    """
+    curr += 1
+    return BVar(curr), curr
+
+def gen_tensor_dims(n, curr):
+    """
+    Generate a list of tensor dimensions
+    :param n:  the number of dimensions
+    :param curr: the current counter
+    :return: a list of dimension variables and an updated counter
+    """
+    dims = []
+    for _ in range(n):
+        dvar, curr = gen_dvar(curr)
+        dims.append(dvar)
+    return dims, curr
+
+
+def gen_nat_constraints(list_of_dims):
+    """
+    Generate natural number constraints for dimensions
+    """
+    return [BinConstraintD(0, d, op_leq) for d in list_of_dims]
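+
+
+# Illustrative usage sketch (not part of the original file): generating three
+# fresh dimension variables and their natural-number constraints.
+#
+#   dims, counter = gen_tensor_dims(3, 0)        # [DVar(1), DVar(2), DVar(3)], counter == 3
+#   nat_constraints = gen_nat_constraints(dims)  # [BinConstraintD(0, d, op_leq) for each d]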
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/z3_types.py b/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/z3_types.py
new file mode 100644
index 0000000000000000000000000000000000000000..2bc3f6798e8a0e0107c4dcf8ce0377567089bad4
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/migrate_gradual_types/z3_types.py
@@ -0,0 +1,29 @@
+try:
+    import z3  # type: ignore[import]
+    HAS_Z3 = True
+    # dynamic type
+    dyn = z3.DeclareSort('Dyn')
+    dyn_type = z3.Const('dyn', dyn)
+
+    # dimension
+    dim = z3.Datatype('dim')
+    dim.declare('dim', ('0', z3.IntSort()), ('1', z3.IntSort()))
+    dim = dim.create()
+
+    # tensors
+    tensor_type = z3.Datatype('TensorType')
+    tensor_type.declare('Dyn', ('dyn', dyn))
+    tensor_type.declare('tensor1', ('0', dim))
+    tensor_type.declare('tensor2', ('0', dim), ('1', dim))
+    tensor_type.declare('tensor3', ('0', dim), ('1', dim), ('2', dim))
+    tensor_type.declare('tensor4', ('0', dim), ('1', dim), ('2', dim), ('3', dim))
+    tensor_type = tensor_type.create()
+
+    # create dimension
+    D = dim.dim
+
+    z3_dyn = tensor_type.Dyn(dyn_type)
+
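+    # Illustrative sketch (not part of the original file): with the datatypes above,
+    # a fully known 2-D tensor of shape (3, 4) and one with an unknown second
+    # dimension would be encoded as:
+    #
+    #   tensor_type.tensor2(D(1, 3), D(1, 4))
+    #   tensor_type.tensor2(D(1, 3), D(0, z3.Int(1)))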
+
+except ImportError:
+    HAS_Z3 = False
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/normalize.py b/MLPY/Lib/site-packages/torch/fx/experimental/normalize.py
new file mode 100644
index 0000000000000000000000000000000000000000..4642f54c5f8a4e32053270a27b133091ca653be9
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/normalize.py
@@ -0,0 +1,162 @@
+import operator
+from typing import Any, Callable, Dict, Tuple, Optional
+
+import torch
+import torch.fx
+import torch.fx as fx
+from torch.fx import Transformer, Proxy
+from torch.fx.node import Argument, Target, Node, map_aggregate
+from torch.fx.operator_schemas import (
+    normalize_module,
+    normalize_function,
+    create_type_hint,
+)
+
+from .schema_type_annotation import AnnotateTypesWithSchema
+
+
+class NormalizeArgs(Transformer):
+    """
+    Normalize arguments to Python targets. This means that
+    `args/kwargs` will be matched up to the module/functional's
+    signature and rewritten to exclusively kwargs in positional order
+    if `normalize_to_only_use_kwargs` is true. Also populates default
+    values. Does not support positional-only parameters or varargs
+    parameters (*args, **kwargs).
+
+    If the nodes have 'type' metadata, it will use it to disambiguate
+    overloads. Otherwise, it will throw an error.
+
+    Example usage:
+        m = torchvision.models.resnet18()
+        traced = torch.fx.symbolic_trace(m)
+        traced = NormalizeArgs(traced).transform()
+    """
+
+    def __init__(
+        self, module: torch.fx.GraphModule, normalize_to_only_use_kwargs: bool = True
+    ):
+        super().__init__(module)
+        self.node_map: Dict[Proxy, Node] = {}
+        self.normalize_to_only_use_kwargs = normalize_to_only_use_kwargs
+
+    def run_node(self, n: Node) -> Any:
+        args, kwargs = self.fetch_args_kwargs_from_env(n)
+
+        def get_type(arg):
+            if isinstance(arg, fx.Node):
+                return n.meta["type"] if "type" in n.meta else None
+            return type(arg)
+
+        arg_types = map_aggregate(n.args, get_type)
+        assert isinstance(arg_types, tuple)
+        arg_types = tuple([create_type_hint(i) for i in arg_types])
+        kwarg_types = {k: get_type(v) for k, v in kwargs.items()}
+        if n.op == "call_function":
+            out = self.call_function(n.target, args, kwargs, arg_types, kwarg_types)
+        else:
+            out = super().run_node(n)
+        if n.op != "output":
+            self.node_map[out] = n
+            out.node.meta = n.meta
+            out.node.type = n.type
+        return out
+
+    def call_function(
+        self,
+        target: Target,
+        args: Tuple[Argument, ...],
+        kwargs: Dict[str, Any],
+        arg_types: Optional[Tuple[Any, ...]] = None,
+        kwarg_types: Optional[Dict[str, Any]] = None,
+    ):
+        assert callable(target)
+        new_args_and_kwargs = normalize_function(
+            target,
+            args,  # type: ignore[arg-type]
+            kwargs,
+            arg_types,  # type: ignore[arg-type]
+            kwarg_types,
+            self.normalize_to_only_use_kwargs,
+        )
+        if new_args_and_kwargs:
+            new_args, new_kwargs = new_args_and_kwargs
+            return self.tracer.create_proxy(
+                "call_function", target, new_args, new_kwargs
+            )
+        else:
+            return super().call_function(target, args, kwargs)
+
+    def call_module(
+        self, target: Target, args: Tuple[Argument, ...], kwargs: Dict[str, Any]
+    ):
+        assert isinstance(target, str)
+        new_args_and_kwargs = normalize_module(
+            self.module,
+            target,
+            args,  # type: ignore[arg-type]
+            kwargs,
+            self.normalize_to_only_use_kwargs,
+        )
+        if new_args_and_kwargs:
+            new_args, new_kwargs = new_args_and_kwargs
+            return super().call_module(target, new_args, new_kwargs)
+        else:
+            return super().call_module(target, args, kwargs)
+
+
+class NormalizeOperators(AnnotateTypesWithSchema):
+    """
+    Normalize callsites that are different ways of "spelling" the same
+    invocation into a single, canonical call. Currently supports:
+
+    1. Normalize `torch` ops that have `operator` equivalents (e.g. torch.add)
+       to the corresponding `operator` calls (e.g. operator.add) that the Tensor
+       magic methods ultimately map to, when it is possible to statically reason
+       about the call
+
+    Example usage:
+
+        m = torchvision.models.resnet18()
+
+        traced = torch.fx.symbolic_trace(m)
+
+        traced = NormalizeOperators(traced).transform()
+    """
+
+    binary_magic_method_remap: Dict[
+        Callable[[Any, Any], Any], Callable[[Any, Any], Any]
+    ] = {
+        torch.add: operator.add,
+        torch.mul: operator.mul,
+        torch.sub: operator.sub,
+        torch.div: operator.truediv,
+        torch.floor_divide: operator.floordiv,
+        torch.remainder: operator.mod,
+        torch.eq: operator.eq,
+        torch.ne: operator.ne,
+        torch.lt: operator.lt,
+        torch.le: operator.le,
+        torch.gt: operator.gt,
+        torch.ge: operator.ge,
+    }
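+
+    # Illustrative sketch (not part of the original file): with the remap above,
+    # a call_function node targeting torch.add(x, y) is re-emitted as
+    # operator.add(x, y), which dispatches to Tensor.__add__, e.g.:
+    #
+    #   traced = NormalizeOperators(torch.fx.symbolic_trace(m)).transform()
+    #   # nodes that previously targeted torch.add now target operator.add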
+
+    def call_function(
+        self, target: Target, args: Tuple[Argument, ...], kwargs: Dict[str, Any]
+    ):
+        # Normalize operators according to the magic methods implemented on tensors here:
+        # https://github.com/pytorch/pytorch/blob/28c5d90b679c6b38bf4183ec99f16d933c2f1bcd/tools/autograd/templates/python_variable_methods.cpp#L1137 # noqa: B950
+
+        assert callable(target)
+
+        if target in self.binary_magic_method_remap:
+            if len(args) != 2:
+                return super().call_function(target, args, kwargs)
+            lhs, rhs = args
+
+            return super().call_function(
+                target=self.binary_magic_method_remap[target],
+                args=(lhs, rhs),
+                kwargs={},
+            )
+
+        return super().call_function(target, args, kwargs)
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/optimization.py b/MLPY/Lib/site-packages/torch/fx/experimental/optimization.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3dc401cc39b0dc586701ff2741343d33f5df7ea
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/optimization.py
@@ -0,0 +1,408 @@
+import torch.fx as fx
+from torch.fx.node import Argument, Target
+from torch.nn.utils.fusion import fuse_conv_bn_eval
+from typing import Type, Dict, Any, Tuple, Iterable, Optional, List, cast
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.fx.passes.shape_prop import ShapeProp
+import copy
+from collections import defaultdict
+import torch.utils.mkldnn as th_mkldnn
+import operator
+import time
+import logging
+from enum import Enum
+
+def _parent_name(target : str) -> Tuple[str, str]:
+    """
+    Splits a qualname into parent path and last atom.
+    For example, `foo.bar.baz` -> (`foo.bar`, `baz`)
+    """
+    *parent, name = target.rsplit('.', 1)
+    return parent[0] if parent else '', name
+
+# Works for length 2 patterns with 2 modules
+def matches_module_pattern(pattern: Iterable[Type], node: fx.Node, modules: Dict[str, Any]):
+    if len(node.args) == 0:
+        return False
+    nodes: Tuple[Any, fx.Node] = (node.args[0], node)
+    for expected_type, current_node in zip(pattern, nodes):
+        if not isinstance(current_node, fx.Node):
+            return False
+        if current_node.op != 'call_module':
+            return False
+        if not isinstance(current_node.target, str):
+            return False
+        if current_node.target not in modules:
+            return False
+        if type(modules[current_node.target]) is not expected_type:
+            return False
+    return True
+
+
+def replace_node_module(node: fx.Node, modules: Dict[str, Any], new_module: torch.nn.Module):
+    assert isinstance(node.target, str)
+    parent_name, name = _parent_name(node.target)
+    modules[node.target] = new_module
+    setattr(modules[parent_name], name, new_module)
+
+def fuse(model: torch.nn.Module, inplace=False, no_trace=False) -> torch.nn.Module:
+    """
+    Fuses convolution/BN layers for inference purposes. Will deepcopy your
+    model by default, but can modify the model inplace as well.
+    """
+    patterns = [(nn.Conv1d, nn.BatchNorm1d),
+                (nn.Conv2d, nn.BatchNorm2d),
+                (nn.Conv3d, nn.BatchNorm3d)]
+    if not inplace:
+        model = copy.deepcopy(model)
+    if not no_trace or not isinstance(model, torch.fx.GraphModule):
+        fx_model = fx.symbolic_trace(model)
+    else:
+        fx_model = model
+    modules = dict(fx_model.named_modules())
+    new_graph = copy.deepcopy(fx_model.graph)
+
+    for pattern in patterns:
+        for node in new_graph.nodes:
+            if matches_module_pattern(pattern, node, modules):
+                if len(node.args[0].users) > 1:  # Output of conv is used by other nodes
+                    continue
+                conv = modules[node.args[0].target]
+                bn = modules[node.target]
+                if not bn.track_running_stats:
+                    continue
+                fused_conv = fuse_conv_bn_eval(conv, bn)
+                replace_node_module(node.args[0], modules, fused_conv)
+                node.replace_all_uses_with(node.args[0])
+                new_graph.erase_node(node)
+    return fx.GraphModule(fx_model, new_graph)
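+
+# Illustrative usage sketch (not part of the original file); conv/BN folding is
+# meaningful for inference, so the model is assumed to be in eval mode and
+# `inp` is a hypothetical example input:
+#
+#   model = torchvision.models.resnet18().eval()
+#   fused = fuse(model)
+#   torch.testing.assert_close(fused(inp), model(inp))  # same outputs up to tolerance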
+
+def remove_dropout(model: nn.Module) -> nn.Module:
+    """
+    Removes all dropout layers from the module.
+    """
+    fx_model = fx.symbolic_trace(model)
+
+    class DropoutRemover(torch.fx.Transformer):
+        def call_module(self, target : Target, args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
+            if isinstance(self.submodules[target], nn.Dropout):
+                assert len(args) == 1
+                return args[0]
+            else:
+                return super().call_module(target, args, kwargs)
+    return DropoutRemover(fx_model).transform()
+
+def extract_subgraph(orig_module: nn.Module, nodes: List[fx.Node], inputs: List[fx.Node], outputs: List[fx.Node]):
+    """
+    Given lists of nodes from an existing graph that represent a subgraph, returns a submodule that executes that subgraph.
+    """
+    new_graph = fx.Graph()
+    env: Dict[fx.Node, fx.Node] = {}
+    for input in inputs:
+        new_node = new_graph.placeholder(input.name)
+        env[input] = new_node
+    for node in nodes:
+        new_node = new_graph.node_copy(node, lambda x: env[x])
+        env[node] = new_node
+    new_graph.output([env[output] for output in outputs])
+    new_graph.lint()
+    return fx.GraphModule(orig_module, new_graph)
+
+mkldnn_supported = [
+    nn.Conv2d, nn.Linear, nn.BatchNorm2d, nn.ReLU, nn.MaxPool2d, nn.AvgPool2d, nn.AdaptiveAvgPool2d,
+    torch.relu, torch.transpose, torch.sigmoid,
+    F.relu, F.avg_pool2d, F.adaptive_avg_pool2d
+]
+# These are operators that may not be convertible into MKLDNN ops (e.g. the
+# args are scalar values). Thus, we only include them in the subgraph if their
+# arguments are already in MKLDNN.
+# TODO: Determine whether this can be removed after type inference.
+mkldnn_supported_unknown = [operator.add, operator.mul]
+mkldnn_map = {
+    nn.Conv2d: th_mkldnn.MkldnnConv2d,
+    nn.Linear: th_mkldnn.MkldnnLinear,
+    nn.BatchNorm2d: lambda a, _: th_mkldnn.MkldnnBatchNorm(a)
+}
+
+
+def modules_to_mkldnn(nodes: List[fx.Node], modules: Dict[str, nn.Module]):
+    """
+    For each node, if it's a module that can be preconverted into MKLDNN,
+    then we do so and create a mapping to allow us to convert from the MKLDNN
+    version of the module to the original.
+    """
+    old_modules: Dict[nn.Module, nn.Module] = {}
+    for node in nodes:
+        if node.op == 'call_module':
+            assert isinstance(node.target, str)
+            cur_module = modules[node.target]
+            if type(cur_module) in mkldnn_map:
+                new_module = mkldnn_map[type(cur_module)](cur_module, torch.float)
+                assert isinstance(new_module, nn.Module)
+                old_modules[new_module] = copy.deepcopy(cur_module)
+                replace_node_module(node, modules, new_module)
+    return old_modules
+
+def reset_modules(nodes: List[fx.Node], modules: Dict[str, nn.Module], old_modules: Dict[nn.Module, nn.Module]):
+    """
+    Maps each module that's been changed with `modules_to_mkldnn` back to its
+    original.
+    """
+    for node in nodes:
+        if node.op == 'call_module':
+            assert (isinstance(node.target, str))
+            cur_module = modules[node.target]
+            if cur_module in old_modules:
+                replace_node_module(node, modules, old_modules[cur_module])
+
+class MklSubgraph:
+    def __init__(self, fx_graph: fx.Graph):
+        self.fx_graph = fx_graph
+        self.nodes: List[fx.Node] = []
+        self.start_nodes: List[fx.Node] = []
+        self.end_nodes: List[fx.Node] = []
+
+def gen_mkl_autotuner(example_inputs, iters=10, warmup=1):
+    """
+    This generates a heuristic that can be passed into `optimize_for_inference` that
+    determines whether a subgraph should be run in MKL by running it with the example_inputs.
+
+    Example usage:
+        heuristic = gen_mkl_autotuner(example_inputs, iters=10)
+        fast_model = optimization.optimize_for_inference(model, heuristic)
+    """
+    fx_model = None
+    old_modules = None
+
+    def use_mkl_heuristic(graph: MklSubgraph) -> bool:
+        nonlocal fx_model, old_modules
+        input_nodes = graph.start_nodes
+        if fx_model is None:
+            fx_model = graph.fx_graph.owning_module
+            old_modules = graph.fx_graph.old_modules  # type: ignore[attr-defined]
+            ShapeProp(fx_model).propagate(example_inputs)
+        sample_inputs = [torch.randn(node.shape) for node in input_nodes]  # type: ignore[attr-defined]
+        output_args = cast(List[fx.Node], [node.args[0] for node in graph.end_nodes])
+        submodule = extract_subgraph(fx_model, graph.nodes, input_nodes, output_args)
+
+        def benchmark(f):
+            for _ in range(warmup):
+                f()
+            begin = time.time()
+            for _ in range(iters):
+                out = f()
+            return time.time() - begin
+
+        mkl_time = benchmark(lambda: [i.to_dense() for i in submodule(*[i.to_mkldnn() for i in sample_inputs])])
+
+        reset_modules(submodule.graph.nodes, dict(submodule.named_modules()), old_modules)
+        no_mkl_time = benchmark(lambda: submodule(*sample_inputs))
+        return mkl_time < no_mkl_time
+    return use_mkl_heuristic
+
+def use_mkl_length(graph: MklSubgraph) -> bool:
+    """
+    This is a heuristic that can be passed into `optimize_for_inference` that
+    determines whether a subgraph should be run in MKL by checking if there
+    are more than 2 nodes in it
+    """
+    return len(graph.nodes) > 2
+
+class UnionFind:
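+    """Disjoint-set union (union-find) over node indices: `find` applies path
+    compression and `join` unions by size. Used below to merge the "colors" of
+    MKLDNN subgraphs that turn out to be connected.
+    """
+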
+    def __init__(self, n):
+        self.parent: List[Optional[int]] = [None] * n
+        self.size: List[int] = [0] * n
+
+    def make_set(self, v: int):
+        self.parent[v] = v
+        self.size[v] = 1
+
+    def find(self, v: int) -> int:
+        par = self.parent[v]
+        if v == par:
+            return v
+        assert par is not None
+        self.parent[v] = self.find(par)
+        return cast(int, self.parent[v])
+
+    def join(self, a: int, b: int):
+        a, b = self.find(a), self.find(b)
+        if a == b:
+            return a
+        if self.size[a] < self.size[b]:
+            a, b = b, a
+        self.parent[b] = a
+        self.size[a] += self.size[b]
+
+def optimize_for_inference(
+    model: torch.nn.Module,
+    pass_config: Optional[Dict[str, Any]] = None,
+    tracer: Type[fx.Tracer] = fx.Tracer
+) -> torch.nn.Module:
+    """
+    Performs a set of optimization passes to optimize a model for the
+    purposes of inference. Specifically, the passes that are run are:
+    1. Conv/BN fusion
+    2. Dropout removal
+    3. MKL layout optimizations
+
+    The third optimization takes a function `use_mkl_heuristic` that's used
+    to determine whether a subgraph should be explicitly run in MKL layout.
+
+    Note: As FX does not currently handle aliasing, this pass currently
+    assumes nothing aliases. If that isn't true, use at your own risk.
+    """
+    default_pass_config = {
+        "conv_bn_fuse": True,
+        "remove_dropout": True,
+        "mkldnn_layout_optimize": {'heuristic': use_mkl_length},
+    }
+    if pass_config is None:
+        pass_config = {}
+    default_pass_config.update(pass_config)
+
+    if default_pass_config["conv_bn_fuse"]:
+        model = fuse(model)
+    if default_pass_config["remove_dropout"]:
+        model = remove_dropout(model)
+    if default_pass_config["mkldnn_layout_optimize"] is False:
+        return model
+    if not isinstance(default_pass_config["mkldnn_layout_optimize"], dict):
+        raise RuntimeError("mkldnn_layout_optimize config is not a dict")
+    if "heuristic" not in default_pass_config["mkldnn_layout_optimize"]:
+        raise RuntimeError("Heuristic not found in mkldnn_layout_optimize config")
+    use_mkl_heuristic = default_pass_config["mkldnn_layout_optimize"]["heuristic"]
+
+    cur_tracer = tracer()
+    fx_graph = cur_tracer.trace(copy.deepcopy(model))
+    fx_model = fx.GraphModule(cur_tracer.root, fx_graph)
+    modules: Dict[str, nn.Module] = dict(model.named_modules())
+
+    class MklSupport(Enum):
+        NO = 1
+        YES = 2
+        UNKNOWN = 3
+
+    # Inserts to_mkldnn and to_dense around every node we want to be a MKLDNN node.
+    # If the op is in `mkldnn_supported` then we always treat it as a MKLDNN node.
+    # However, if it's in `mkldnn_supported_unknown`, then we only treat it as
+    # a MKLDNN node if its inputs are MKLDNN nodes.
+    for node in list(fx_graph.nodes):
+        supports_mkldnn = MklSupport.NO
+        if node.op == 'call_module':
+            cur_module = modules[node.target]
+            if type(cur_module) in mkldnn_supported:
+                supports_mkldnn = MklSupport.YES
+                sample_parameter = next(cur_module.parameters(), None)
+                if sample_parameter is not None:
+                    assert sample_parameter.dtype == torch.float, "this pass is only for torch.float modules"
+                    assert sample_parameter.device == torch.device('cpu'), "this pass is only for CPU modules"
+        elif node.op == 'call_function':
+            if node.target in mkldnn_supported:
+                supports_mkldnn = MklSupport.YES
+            elif node.target in mkldnn_supported_unknown:
+                supports_mkldnn = MklSupport.UNKNOWN
+
+        if supports_mkldnn != MklSupport.NO:
+            if supports_mkldnn == MklSupport.UNKNOWN:
+                if not any(arg.target == 'to_dense' for arg in node.args):
+                    continue
+            with fx_graph.inserting_before(node):
+                mkldnn_args = fx.map_arg(node.args, lambda n: fx_graph.call_method('to_mkldnn', (n, )))
+
+            node.args = cast(Tuple[fx.node.Argument], mkldnn_args)
+
+            with fx_graph.inserting_after(node):
+                dense_x = fx_graph.create_node('call_method', 'to_dense', (node,))
+                node.replace_all_uses_with(dense_x)
+                dense_x.args = (node,)
+
+    # Does pre-conversion of all modules into MKLDNN (when possible)
+    old_modules = modules_to_mkldnn(list(fx_graph.nodes), modules)
+    fx_graph.old_modules = old_modules  # type: ignore[attr-defined]
+
+    # optimizes all a -> to_dense -> to_mkldnn -> b patterns into a -> b
+    for node in fx_graph.nodes:
+        if node.op == 'call_method' and node.target == 'to_dense':
+            prv_node = node.args[0]
+            users = list(node.users)
+            for user in users:
+                if user.op == 'call_method' and user.target == 'to_mkldnn':
+                    user.replace_all_uses_with(prv_node)
+                    fx_graph.erase_node(user)
+            if len(node.users) == 0:
+                fx_graph.erase_node(node)
+
+
+    num_nodes = len(fx_graph.nodes)
+    uf = UnionFind(num_nodes)
+
+    def get_color(n):
+        if hasattr(n, 'color'):  # Current node is part of a MKL subgraph
+            return uf.find(n.color)
+        if hasattr(n, 'start_color'):  # Current node is input to MKL subgraph
+            return uf.find(n.start_color)
+        return None
+
+
+    # This code is to find each MKLDNN subgraph. Each MKLDNN subgraph consists
+    # of input nodes (which are only `to_mkldnn` calls), output nodes
+    # (`to_dense` calls), and intermediate nodes, which are run entirely on
+    # MKLDNN layout tensors.
+    #
+    # Specifically, this code does a flood fill on a directed acyclic graph
+    # (DAG), starting from each possible "start node" (i.e: `to_mkldnn` nodes).
+    # If every node only had one input, this would be sufficient. However, in
+    # the case that a node has multiple inputs coming from different start
+    # nodes (i.e. colors), we need to join these 2 colors into 1. That's done
+    # using a Disjoint Set Union.
+    for cur_idx, node in enumerate(fx_graph.nodes):
+        if node.op == 'call_method' and node.target == 'to_mkldnn':
+            node.start_color = cur_idx
+            uf.make_set(cur_idx)
+        elif node.op == 'call_method' and node.target == 'to_dense':
+            assert get_color(node.args[0]) is not None
+            node.end_color = get_color(node.args[0])
+        else:
+            cur_colors = [get_color(i) for i in node.all_input_nodes if isinstance(i, fx.Node) if get_color(i) is not None]
+
+            if len(cur_colors) == 0:
+                continue
+            assert not any(i is None for i in cur_colors)
+            cur_colors = sorted(cur_colors)
+            node.color = cur_colors[0]
+            for other_color in cur_colors[1:]:
+                uf.join(cur_colors[0], other_color)
+
+
+    mkldnn_graphs: Dict[int, MklSubgraph] = defaultdict(lambda: MklSubgraph(fx_graph))
+    for node in fx_graph.nodes:
+        if hasattr(node, 'color'):
+            mkldnn_graphs[uf.find(node.color)].nodes.append(node)
+        if hasattr(node, 'start_color'):
+            mkldnn_graphs[uf.find(node.start_color)].start_nodes.append(node)
+        if hasattr(node, 'end_color'):
+            mkldnn_graphs[uf.find(node.end_color)].end_nodes.append(node)
+
+
+    # Now that we have all the subgraphs, we need to decide which MKLDNN
+    # subgraphs we actually want to keep in MKLDNN.
+    for graph in mkldnn_graphs.values():
+        if not use_mkl_heuristic(graph):
+            for node in graph.start_nodes + graph.end_nodes:
+                prv = node.args[0]
+                node.replace_all_uses_with(prv)
+                fx_graph.erase_node(node)
+            reset_modules(graph.nodes, modules, old_modules)
+
+    mkldnn_conversions = 0
+    for node in fx_graph.nodes:
+        if node.target == 'to_mkldnn' or node.target == 'to_dense':
+            mkldnn_conversions += 1
+
+    logging.getLogger(__name__).info(f"mkldnn conversions: {mkldnn_conversions}")
+    fx_graph.lint()
+    result = fx.GraphModule(model, fx_graph)
+    return result
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/partitioner_utils.py b/MLPY/Lib/site-packages/torch/fx/experimental/partitioner_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a520a0ff9845c0c0ab0c934ac9ec4487b9a19c9
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/partitioner_utils.py
@@ -0,0 +1,317 @@
+from enum import Enum
+from typing import NamedTuple, Dict, List, Set
+
+from torch.fx.node import Node, map_arg
+
+
+class Partition:
+    """Partition class contains all the information about an individual partition.
+    It also provides the necessary methods for manipulating the partition.
+    """
+
+    def __init__(self, partition_id: int) -> None:
+        self.nodes: Set[Node] = set()
+        self.partition_id = partition_id
+        self.parents: Set[Partition] = set()
+        self.children: Set[Partition] = set()
+        self.bfs_level: int = -1
+        self.used_mem_bytes: int = 0
+        self.logical_device_ids: List[int] = []
+
+    def __str__(self):
+        return str(self.partition_id)
+
+    def recalculate_mem_size(self):
+        self.used_mem_bytes = 0
+        for node in self.nodes:
+            self.used_mem_bytes += get_extra_size_of(node, self.nodes)
+
+    def add_node(self, node):
+        input_nodes: Dict[Node, None] = {}
+        map_arg(node.args, input_nodes.setdefault)
+        map_arg(node.kwargs, input_nodes.setdefault)
+        # Add the current node's input nodes if they are placeholders or constants
+        for n in input_nodes:
+            if n.op in {"placeholder", "get_attr"}:
+                self.nodes.add(n)
+        self.nodes.add(node)
+        self.recalculate_mem_size()
+
+    def remove_node(self, node):
+        # Remove a node only if the node is in the partition
+        if node in self.nodes:
+            self.nodes.remove(node)
+            # Collect the node's input nodes
+            input_nodes: Dict[Node, None] = {}
+            map_arg(node.args, input_nodes.setdefault)
+            map_arg(node.kwargs, input_nodes.setdefault)
+            # If an input node is a placeholder or get_attr,
+            # and it is not used by any other node in this partition,
+            # then remove this input node
+            for input_node in input_nodes:
+                if all(
+                    n not in self.nodes for n in input_node.users
+                ) and input_node.op in {"placeholder", "get_attr"}:
+                    self.nodes.remove(input_node)
+            self.recalculate_mem_size()
+
+
+class Device(NamedTuple):
+    name: str
+    available_mem_bytes: int
+    logical_id: int
+
+
+class NodeLatency(NamedTuple):
+    # Latency due to the memory bandwidth
+    mem_latency_sec: float
+    # Latency due to the computation
+    computer_latency_sec: float
+
+
+class PartitionLatency(NamedTuple):
+    # Sum of all nodes' memory latency on the critical path
+    mem_latency_sec: float
+    # Sum of all nodes' compute latency on the critical path
+    computer_latency_sec: float
+    # Latency of the critical path
+    overall_latency_sec: float
+
+
+class PartitionMode(Enum):
+    size_based = 0
+    sparse_nn = 1
+    cost_aware = 2
+    kl_based = 3
+    aot_based = 4
+
+
+class PartitionerConfig(NamedTuple):
+    devices: List[Device]
+    mode: PartitionMode = PartitionMode.size_based
+    transfer_rate_bytes_per_sec: float = 0.0
+    node_to_latency_mapping: Dict[Node, NodeLatency] = {}
+    node_to_partition_mapping: Dict[Node, int] = {}
+    partition_to_logical_device_mapping: Dict[int, List[int]] = {}
+    # Saturate host by replicating partitions to the remaining idle devices.
+    saturate_host: bool = False
+
+
+def get_extra_size_of(node: Node, nodes: Set[Node]) -> int:
+    """Given a node and a set of nodes,
+    this function returns the extra size needed
+    if this node is included in this set.
+    """
+    # Find all its input nodes
+    input_nodes: Dict[Node, None] = {}
+    map_arg(node.args, input_nodes.setdefault)
+    map_arg(node.kwargs, input_nodes.setdefault)
+    # Calculate total size of related nodes
+    total_size_of_input_nodes = 0
+    for n in input_nodes:
+        # Only count input nodes that are not already in this set
+        if n not in nodes:
+            size_bytes = getattr(n, "size_bytes", None)
+            if size_bytes:
+                total_size_of_input_nodes += size_bytes.output_size
+            else:
+                raise RuntimeError("node has no size_bytes attr")
+    # Don't forget the op node itself
+    size_bytes = getattr(node, "size_bytes", None)
+    if size_bytes:
+        total_size_of_input_nodes += size_bytes.total_size
+    else:
+        raise RuntimeError("node has no size_bytes attr")
+    return total_size_of_input_nodes
+
+
+def get_latency_of_one_partition(
+    partition: Partition, node_to_latency_mapping: Dict[Node, NodeLatency]
+) -> PartitionLatency:
+    """Given a partition and its nodes' latency, return a PartitionLatency for this partition"""
+
+    def get_top_nodes(partition: Partition) -> List[Node]:
+        """Given a partition, return a list of nodes on the top bfs level"""
+        top_nodes: List[Node] = []
+        for node in partition.nodes:
+            # Skip placeholder and get_attr nodes
+            if node.op in {"placeholder", "get_attr"}:
+                continue
+            input_nodes: Dict[Node, None] = {}
+            map_arg(node.args, input_nodes.setdefault)
+            map_arg(node.kwargs, input_nodes.setdefault)
+            # If a node has no input nodes in this partition,
+            # or its input nodes in this partition are only placeholders and get_attrs,
+            # then this node is on the top bfs level in this partition
+            if not any(
+                n in partition.nodes and n.op not in {"placeholder", "get_attr"}
+                    for n in input_nodes
+            ):
+                top_nodes.append(node)
+        return top_nodes
+
+    def dfs_helper(node: Node, partition_latency) -> PartitionLatency:
+        """Given a top node of a partition, this function returns
+        the latency of the critical path in the partition
+        """
+        node_latency = node_to_latency_mapping[node]
+        # Calculate the current overall latency of the partition
+        overall_latency_sec = partition_latency.overall_latency_sec + max(
+            node_latency.computer_latency_sec, node_latency.mem_latency_sec
+        )
+        # Update the mem latency of this path
+        mem_latency_sec = (
+            partition_latency.mem_latency_sec + node_latency.mem_latency_sec
+        )
+        # Update the compute latency of this path
+        computer_latency_sec = (
+            partition_latency.computer_latency_sec + node_latency.computer_latency_sec
+        )
+        # Get all users of this node that are in this partition
+        users = set(node.users).intersection(partition.nodes)
+        if users:
+            max_latency = PartitionLatency(
+                mem_latency_sec=0.0, computer_latency_sec=0.0, overall_latency_sec=0.0
+            )
+            for n in users:
+                # Get new partition latency recursively
+                new_partition_latency = dfs_helper(
+                    n,
+                    PartitionLatency(
+                        mem_latency_sec, computer_latency_sec, overall_latency_sec
+                    ),
+                )
+                if (
+                    new_partition_latency.overall_latency_sec
+                    > max_latency.overall_latency_sec
+                ):
+                    max_latency = new_partition_latency
+            return max_latency
+        # If there is no user, the node is at bottom of the partition
+        return PartitionLatency(
+            mem_latency_sec, computer_latency_sec, overall_latency_sec
+        )
+
+    # Main part starts
+    # Get all top level nodes of this partition
+    top_nodes = get_top_nodes(partition)
+    critical_path_latency = PartitionLatency(
+        mem_latency_sec=0.0, computer_latency_sec=0.0, overall_latency_sec=0.0
+    )
+    # Go through all top nodes and find the largest latency (critical path latency)
+    for node in top_nodes:
+        partition_latency = dfs_helper(
+            node,
+            PartitionLatency(
+                mem_latency_sec=0.0, computer_latency_sec=0.0, overall_latency_sec=0.0
+            ),
+        )
+        if (
+            partition_latency.overall_latency_sec
+            > critical_path_latency.overall_latency_sec
+        ):
+            critical_path_latency = partition_latency
+    return critical_path_latency
+
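+# Illustrative sketch (not part of the original file): for a two-node partition
+# a -> b with
+#   node_to_latency_mapping = {a: NodeLatency(2.0, 5.0), b: NodeLatency(4.0, 1.0)}
+# the function above returns PartitionLatency(mem_latency_sec=6.0,
+# computer_latency_sec=6.0, overall_latency_sec=max(5, 2) + max(1, 4) == 9.0).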
+
+def get_partition_to_latency_mapping(
+    partitions: List[Partition], node_to_latency_mapping: Dict[Node, NodeLatency]
+) -> Dict[Partition, PartitionLatency]:
+    """Given all the partitions and node_to_latency_mapping dictionary,
+    return a mapping dictionary of each partition to its overall latency
+    """
+    partition_to_latency_mapping: Dict[Partition, PartitionLatency] = {}
+    # Go through each partition and get its latency
+    for partition in partitions:
+        partition_latency = get_latency_of_one_partition(
+            partition, node_to_latency_mapping
+        )
+        partition_to_latency_mapping[partition] = partition_latency
+    return partition_to_latency_mapping
+
+
+def get_comm_latency_between(
+    parent_partition: Partition,
+    child_partition: Partition,
+    transfer_rate_bytes_per_sec: float,
+):
+    """Given two partitions (parent and child),
+    calculate the communication latency between the two.
+    """
+    # If two partitions are on the same device, the comm latency is 0.
+    if (
+        parent_partition.logical_device_ids != []
+        and child_partition.logical_device_ids != []
+        and parent_partition.logical_device_ids == child_partition.logical_device_ids
+    ):
+        return 0.0
+    # Keep track of the communication size between parent and child
+    comm_size = 0
+    # Keep track of all the nodes that have already been counted
+    visited_nodes = set()
+    # Go through all nodes in the child partition
+    # If a node has input nodes from the parent partition,
+    # the output size of those input nodes will be counted
+    # and added to comm_size
+    for node in child_partition.nodes:
+        input_nodes: Dict[Node, None] = {}
+        map_arg(node.args, input_nodes.setdefault)
+        map_arg(node.kwargs, input_nodes.setdefault)
+        for n in input_nodes:
+            if n in parent_partition.nodes and n not in visited_nodes:
+                size_bytes = getattr(n, "size_bytes", None)
+                if size_bytes is not None:
+                    comm_size += size_bytes.output_size
+                visited_nodes.add(n)
+    return comm_size / transfer_rate_bytes_per_sec
+
+
+def get_latency_of_partitioned_graph(
+    partitions: List[Partition],
+    partition_to_latency_mapping: Dict[Partition, PartitionLatency],
+    transfer_rate_bytes_per_sec: float,
+):
+    """Given all partitions in a graph, find the critical path among all partitions
+    and return its latency as the latency of the whole graph
+    """
+
+    def dfs_helper(partition: Partition, latency_so_far_sec: float) -> float:
+        """This function helps to recursively get the latency of a path of partitions"""
+        # Update latency by adding current partition's latency
+        latency_so_far_sec += partition_to_latency_mapping[
+            partition
+        ].overall_latency_sec
+        children = partition.children
+        if partition.children:
+            max_latency_sec = 0.0
+            for child in partition.children:
+                # Calculate latency between
+                comm_latency_sec = get_comm_latency_between(
+                    partition, child, transfer_rate_bytes_per_sec
+                )
+                new_latency_sec = dfs_helper(
+                    child, latency_so_far_sec + comm_latency_sec
+                )
+                if new_latency_sec > max_latency_sec:
+                    max_latency_sec = new_latency_sec
+            return max_latency_sec
+        return latency_so_far_sec
+
+    def get_top_partitions(partitions: List[Partition]) -> List[Partition]:
+        """This function is to return all the partitions without parents
+        as the starting points of all the paths
+        """
+        top_partitions = []
+        for partition in partitions:
+            # If a partition has no parents, then it is a top partition
+            if len(partition.parents) == 0:
+                top_partitions.append(partition)
+        return top_partitions
+
+    top_partitions = get_top_partitions(partitions)
+    critical_path_latency_sec = 0.0
+    for partition in top_partitions:
+        latency_sec = dfs_helper(partition, 0.0)
+        if latency_sec > critical_path_latency_sec:
+            critical_path_latency_sec = latency_sec
+    return critical_path_latency_sec
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/proxy_tensor.py b/MLPY/Lib/site-packages/torch/fx/experimental/proxy_tensor.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b8bd1500088c0f2e19cfefa63fd40eab41b693e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/proxy_tensor.py
@@ -0,0 +1,1122 @@
+# mypy: ignore-errors
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import contextlib
+import functools
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+import torch
+import torch.utils._pytree as pytree
+from torch.fx import Tracer, GraphModule
+from torch.fx.graph_module import _assign_attr
+from weakref import WeakKeyDictionary
+from collections import defaultdict
+from torch._subclasses.fake_tensor import FakeTensor, FakeTensorMode, unset_fake_temporarily, is_fake
+from torch._dispatch.python import enable_python_dispatcher, enable_pre_dispatch
+import torch.fx as fx
+from torch.fx.node import _side_effectful_need_to_be_preserved_pre_dispatch
+from torch.fx.passes.shape_prop import _extract_tensor_metadata
+from contextlib import contextmanager, nullcontext
+import inspect
+from dataclasses import dataclass
+import weakref
+import operator
+from torch.utils._stats import count
+import logging
+
+from torch.overrides import TorchFunctionMode
+
+from torch.utils._python_dispatch import (
+    TorchDispatchMode,
+    _disable_infra_mode,
+    _push_mode,
+    _unset_infra_mode,
+)
+
+from ._backward_state import BackwardState
+from .sym_node import SymNode
+from ._sym_dispatch_mode import SymDispatchMode
+from torch.fx import Proxy
+import torch.fx.traceback as fx_traceback
+from torch import SymInt, SymFloat, SymBool
+from torch.utils.weak import WeakTensorKeyDictionary, WeakIdKeyDictionary, _WeakHashRef
+
+__all__ = ["PythonKeyTracer", "dispatch_trace", "make_fx", "DecompositionInterpreter", "py_sym_types", "get_innermost_proxy_mode"]
+
+aten = torch.ops.aten
+prim = torch.ops.prim
+
+log = logging.getLogger(__name__)
+not_implemented_log = torch._logging.getArtifactLogger(__name__, "not_implemented")
+
+CURRENT_DECOMPOSITION_TABLE: Dict[torch._ops.OperatorBase, Callable] = {}
+
+CONSTANT_NUMEL_LIMIT = 1
+
+# We currently convert all SymInt to proxies before we use them.
+# This could plausibly be handled at the Dynamo level.
+pytree.register_pytree_node(
+    torch.Size,
+    lambda xs: (list(xs), None),
+    lambda xs, _: tuple(xs),
+    flatten_with_keys_fn=lambda xs: (
+        [(pytree.SequenceKey(i), x) for i, x in enumerate(xs)],
+        None,
+    ),
+)
+def fake_signature(fn, nargs):
+    """FX gets confused by varargs, de-confuse it"""
+    argnames = ",".join(f"arg{i}" for i in range(nargs))
+    return eval(f"lambda {argnames}: fn({argnames})", {"fn": fn})
+
+@contextmanager
+def decompose(decomposition_table):
+    global CURRENT_DECOMPOSITION_TABLE
+    old_decomposition_table = CURRENT_DECOMPOSITION_TABLE
+    CURRENT_DECOMPOSITION_TABLE = decomposition_table
+    try:
+        yield CURRENT_DECOMPOSITION_TABLE
+    finally:
+        CURRENT_DECOMPOSITION_TABLE = old_decomposition_table
+
+# ensure we cannot collide with other properties
+proxy_slot = object()
+no_default = object()
+
+py_sym_types = (SymInt, SymFloat, SymBool)
+
+def is_sym_node(node):
+    assert hasattr(node, 'meta'), "All nodes traced with proxy_tensor should have meta"
+    return "val" in node.meta and isinstance(node.meta['val'], py_sym_types)
+
+def set_proxy_slot(obj, tracer, proxy):
+    if isinstance(obj, torch.Tensor):
+        # We DO want to clobber proxies whenever we run an inplace operation
+        # on a tensor, and it affects the metadata on the proxy.
+        tracer.tensor_tracker[obj] = proxy
+    elif isinstance(obj, torch.ScriptObject):
+        # We DO want to clobber proxies, with a similar rationale as for tensors.
+        tracer.script_object_tracker[obj] = proxy
+    else:
+        # NB: Never clobber pre-existing proxy.  Although the proxies
+        # are in principle equivalent, when we do graph partitioning
+        # we need there not to be spurious dependencies on tangent inputs.
+        # This works because primals get their SymInts set first, and
+        # THEN later we allocate tangent inputs.  Make sure if a SymInt
+        # is derivable from a primal that we use that.
+        assert isinstance(obj, py_sym_types), type(obj)
+        if obj not in tracer.symnode_tracker:
+            tracer.symnode_tracker[obj] = proxy
+
+def has_proxy_slot(obj, tracer):
+    assert isinstance(obj, (torch.Tensor, SymNode)), type(obj)
+    return get_proxy_slot(obj, tracer, False, lambda _: True)
+
+# the default argument is what to return if the slot is not set.
+# the transform argument is handy if you need to extract a subfield from
+# the successfully looked up result (but NOT the default.)
+def get_proxy_slot(obj, tracer, default=no_default, transform=lambda x: x):
+    if isinstance(obj, torch.Tensor):
+        tracker = tracer.tensor_tracker
+    elif isinstance(obj, torch.ScriptObject):
+        tracker = tracer.script_object_tracker
+    else:
+        assert isinstance(obj, py_sym_types), type(obj)
+        tracker = tracer.symnode_tracker
+
+    if obj not in tracker:
+        if default is no_default:
+            raise RuntimeError(f"{obj} is not tracked with proxy for {tracer}")
+        return default
+    return transform(tracker[obj])
+
+def snapshot_fake(val):
+    return val.detach()
+
+def extract_val(val):
+    if is_fake(val):
+        return snapshot_fake(val)
+    elif isinstance(val, py_sym_types):
+        return val
+    elif isinstance(val, torch.ScriptObject):
+        return val
+    elif isinstance(val, BackwardState):
+        return val
+    elif isinstance(val, (list, tuple)):
+        return val.__class__([extract_val(x) for x in val])
+    elif isinstance(val, torch.Tensor):
+        if not val.is_sparse:
+            # NB: Kinda hacky, but we should try to get val as the metadata
+            # everywhere
+            # TODO: This doesn't properly track storages.  A more robust
+            # approach would be to maintain a per-trace FakeTensorMode and
+            # from_real_tensor to create fake values (don't forget to
+            # snapshot_fake)
+            fake_tensor_mode = FakeTensorMode(allow_fallback_kernels=True)
+            with fake_tensor_mode:
+                return torch.empty_strided(val.shape, val.stride(), device=val.device, dtype=val.dtype)
+        else:
+            return None
+    elif isinstance(val, (int, float, bool)):
+        return val
+
+# What invariants do we have for the 'val' set on the FX node?  It has accurate
+# metadata... but only for metadata that exists "below" all other subsystems
+# (most notably autograd, but also vmap, functorch transforms, etc).  This means
+# you can get the dtype, shape, stride, storage, but you CANNOT get requires_grad,
+# grad_fn, _base (_base actually may be set due to recursive call to
+# ADInplaceOrView, but you shouldn't rely on it.)
+def set_meta(proxy, val):
+    proxy.node.meta['val'] = extract_val(val)
+    # Best effort tensor_meta setting; prefer using val!
+    if is_fake(val):
+        proxy.node.meta['tensor_meta'] = _extract_tensor_metadata(val)
+    elif isinstance(val, torch.Tensor) and not val.is_sparse:
+        proxy.node.meta['tensor_meta'] = _extract_tensor_metadata(val)
+    return proxy
+
+def thunkify(f, *args, **kwargs):
+    """
+    Delays computation of f until the returned thunk is called
+    Also caches the result
+    """
+    return functools.lru_cache(1)(functools.partial(f, *args, **kwargs))
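+
+# Illustrative sketch (not part of the original file): thunkify turns an eager
+# call into a lazily evaluated, memoized thunk; `expensive_fn` and `arg` are
+# hypothetical names.
+#
+#   thunk = thunkify(expensive_fn, arg)
+#   thunk()   # runs expensive_fn(arg) once and caches the result
+#   thunk()   # returns the cached result without recomputing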
+
+def track_tensor(tensor, proxy, *, constant, tracer):
+    def try_set_proxy_slot(outer_s, proxy_callable, *args):
+        assert callable(proxy_callable)
+        if isinstance(outer_s, SymInt):
+            set_proxy_slot(outer_s, tracer, thunkify(proxy_callable, outer_s, *args))
+    # The basic idea is that we need to associate each tensor/SymInt
+    # with a Proxy.  How do we setup this association?  We just store
+    # the proxy on the proxy slot of the object, keyed on the tracer
+    # (so that if we have multiple tracers at the same time, they
+    # don't clobber each other.)
+    for i, s in enumerate(tensor.shape):
+        try_set_proxy_slot(s, lambda x, i: set_meta(torch.ops.aten.sym_size.int(proxy, i), x), i)
+
+    for i, s in enumerate(tensor.stride()):
+        try_set_proxy_slot(s, lambda x, i: set_meta(torch.ops.aten.sym_stride.int(proxy, i), x), i)
+
+    try_set_proxy_slot(tensor.numel(), lambda x: set_meta(torch.ops.aten.sym_numel.default(proxy), x))
+    try_set_proxy_slot(tensor.storage_offset(), lambda x: set_meta(torch.ops.aten.sym_storage_offset.default(proxy), x))
+    set_proxy_slot(tensor, tracer, _ProxyTensor(proxy, constant))
+
+def track_tensor_tree(inner_res, proxy_res, *, constant, tracer):
+    def wrap_with_proxy(e, proxy, constant):
+        if isinstance(e, torch.Tensor):
+            track_tensor(e, proxy, tracer=tracer, constant=constant)
+            set_meta(proxy, e)
+        elif isinstance(e, py_sym_types):
+            # NB: eagerly set meta here, so that the numbering is in order
+            set_meta(proxy, e)
+            set_proxy_slot(e, tracer, lambda: proxy)
+        elif isinstance(e, torch.ScriptObject):
+            set_proxy_slot(e, tracer, proxy)
+            set_meta(proxy, e)
+        elif isinstance(e, (tuple, list)):
+            if isinstance(proxy, fx.Proxy):
+                set_meta(proxy, e)
+
+            # example use case: allreduce_ returns ([tensor], work)
+            for idx, ee in enumerate(e):
+                wrap_with_proxy(ee, proxy[idx], get_constant(idx))
+        elif isinstance(e, dict):
+            # In theory we could support const-prop when proxy-tensor-tracing
+            # operators that returns dicts of tensors, but we have no use case
+            # for it today (since the only op we currently trace that can
+            # return a dict is triton_kernel_wrapper_functional/mutation,
+            # which does not participate in const-prop)
+            assert constant is None
+
+            if isinstance(proxy, fx.Proxy):
+                set_meta(proxy, e)
+
+            # example use case: triton_kernel_wrapper takes arguments as kwargs
+            for key, val in e.items():
+                wrap_with_proxy(val, proxy[key], None)
+        elif isinstance(e, BackwardState):
+            set_meta(proxy, e)
+            e.proxy = proxy
+        else:
+            # intentionally pass on primitives
+            pass
+
+
+    def get_constant(idx):
+        if constant is None:
+            return None
+        else:
+            return constant[idx]
+
+    wrap_with_proxy(inner_res, proxy_res, constant)
+
+    return inner_res
+
+
+def maybe_disable_fake_tensor_mode():
+    # TODO: figure out if this API generally makes sense and bake it into the
+    # library
+    return unset_fake_temporarily()
+
+
+@dataclass
+class _ProxyTensor:
+    proxy: Proxy
+    constant: Optional[torch.Tensor]
+
+
+def fetch_sym_proxy(tracer):
+    def inner(e):
+        n = e.node
+        if n.constant is not None:
+            return n.constant
+        if e.node.expr.is_number:
+            if isinstance(e, SymBool):
+                return bool(e.node.expr)
+            elif isinstance(e, SymInt):
+                return int(e.node.expr)
+            return float(e.node.expr)
+        else:
+            # NB: we REQUIRE all symints to be tracked
+            return get_proxy_slot(e, tracer)()
+    return inner
+
+
+def fetch_object_proxy(tracer):
+    return lambda t: get_proxy_slot(t, tracer, t)
+
+HANDLED_TYPES = (torch.Tensor, torch.nn.Parameter, FakeTensor)
+
+def proxy_call(proxy_mode, func, pre_dispatch, args, kwargs):
+    unrecognized_types = []
+
+    def can_handle_tensor(x):
+        r = type(x) in HANDLED_TYPES or has_proxy_slot(x, proxy_mode.tracer)
+        if proxy_mode._allow_fake_constant:
+            r = r or type(x) in (torch._subclasses.FakeTensor,)
+        if not r:
+            unrecognized_types.append(type(x))
+        return r
+
+    # If there are any tensor subclasses, we need to handle those tensor subclasses first
+    # TODO: we could use types to test this
+    if not pytree.tree_all_only(torch.Tensor, can_handle_tensor, (args, kwargs)):
+        not_implemented_log.debug("ProxyTensorMode tensors without proxy had unrecognized subclasses: %s", unrecognized_types)
+        return NotImplemented
+
+    r = maybe_handle_decomp(proxy_mode, func, args, kwargs)
+    if r is not NotImplemented:
+        return r
+
+    # For pre-autograd tracing, we do not want to run CompositeImplicit decomps.
+    if not pre_dispatch and func not in [
+        torch.ops.aten.size.default, torch.ops.aten.stride.default, torch.ops.aten.storage_offset.default
+    ]:
+        with proxy_mode:
+            r = func.decompose(*args, **kwargs)
+            if r is not NotImplemented:
+                return r
+
+    tracer = proxy_mode.tracer
+    f_args, f_kwargs = pytree.tree_map_only((torch.Tensor, torch.ScriptObject), fetch_object_proxy(tracer), (args, kwargs))
+
+    # If there are SymInts, we also should not consider this constant.
+    # However, fake tensor handling of SymInts is sufficiently broken that
+    # I couldn't write a test for this case
+    all_constant = (
+        pytree.tree_all_only(_ProxyTensor, lambda t: t.constant is not None, (f_args, f_kwargs))
+        # TODO: maybe constant SymInts should also be allowed?  Not sure if
+        # this can happen
+        and pytree.tree_all_only((SymInt, SymFloat, SymBool), lambda _: False, (args, kwargs))
+    )
+
+    if torch.Tag.data_dependent_output in func.tags:
+        # Check if all of the Tensor inputs are constants
+        if all_constant:
+            const_args, const_kwargs = pytree.tree_map_only(
+                _ProxyTensor, lambda t: t.constant, (f_args, f_kwargs)
+            )
+            with maybe_disable_fake_tensor_mode():
+                return func(*const_args, **const_kwargs)
+        # If any of the Tensor inputs are "real" (not FakeTensor), we may
+        # incorrectly burn in constants by allowing this access.  Raise
+        # an error in this case
+        if proxy_mode._error_on_data_dependent_ops and pytree.tree_all_only(torch.Tensor, lambda t: not is_fake(t), (args, kwargs)):
+            raise RuntimeError(
+                f"It appears that you're trying to get value out of a tracing tensor with {func} - erroring out! "
+                "It's likely that this is caused by data-dependent control flow or similar.  "
+                "It may be possible to trace this with dynamic shapes; try setting tracing_mode='symbolic' "
+                "in your make_fx call."
+            )
+    proxy_args, proxy_kwargs = pytree.tree_map_only(
+        (SymInt, SymFloat, SymBool),
+        fetch_sym_proxy(proxy_mode.tracer),
+        pytree.tree_map_only(_ProxyTensor, lambda e: e.proxy, (f_args, f_kwargs))
+    )
+
+    # When we trace through a torch.tensor invocation, you never actually
+    # see a torch.ops.aten.tensor call. Instead, the way this function is
+    # implemented internally is that we allocate a plain tensor (this is
+    # *guaranteed* to be a plain tensor, we disable all modes when doing
+    # so), and then call at::lift_fresh on it (to give modes a chance to do
+    # their stuff).  Furthermore, the tensor argument to lift_fresh is guaranteed
+    # to be freshly allocated, so we want lift_fresh to be a no-op (directly
+    # returning the input argument).
+    #
+    # Here is the basic problem: when we trace this sequence of executions
+    # into an FX graph, what happens to this call sequence?  Traditionally,
+    # tensor constants get interned as buffers on the FX GraphModule.  But
+    # this is dangerous.  Consider:
+    #
+    #       x = torch.tensor(1)
+    #       x.add_(2)
+    #
+    # Naively, this traces into:
+    #
+    #       t = self._tensor_constant0  # initialized to torch.tensor(1)
+    #       x = torch.ops.aten.lift_fresh(t)
+    #       x.add_(2)
+    #
+    # If lift_fresh returns t directly, the subsequent add_ call will
+    # modify the tensor constant. Really, the problem is that we've violated
+    # the invariant that the argument to lift_fresh is fresh.  So we
+    # preserve the invariant by replacing lift_fresh with lift_fresh_copy:
+    #
+    #       t = self._tensor_constant0  # initialized to torch.tensor(1)
+    #       x = torch.ops.aten.lift_fresh_copy(t)
+    #       x.add_(2)
+    #
+    # This is what the overload modification does.
+    if func is torch.ops.aten.lift_fresh.default:
+        func = torch.ops.aten.lift_fresh_copy.default
+
+
+    proxy_out = proxy_mode.tracer.create_proxy('call_function', func, proxy_args, proxy_kwargs,
+                                               name=proxy_mode.tracer.graph._target_to_str(func.overloadpacket.__name__))
+
+    # This makes DCE marginally less likely to DCE in-place operations.
+    # It is not strictly necessary.
+    # Kind of a hacky way to test whether an op is in-place or not
+    if func.overloadpacket.__name__[-1] == "_" and func.overloadpacket.__name__[0] != "_":
+        if isinstance(args[0], List):
+            # e.g., c10d::allreduce_ returns a list of tensors as the first element
+            # in the output.
+            for i, a in enumerate(args[0]):
+                a.proxy = proxy_out[0][i]
+        else:
+            args[0].proxy = proxy_out
+
+    out = func(*args, **kwargs)
+
+    # In some circumstances, we will be tracing in a situation where a tensor
+    # is *statically* known to be a constant (currently, this only happens if
+    # you run torch.tensor; deterministic factory functions like torch.arange
+    # don't get this treatment).  When the tensor in question is small, it's
+    # helpful to do constant propagation in case we call item() (in which
+    # case we can return the constant value that is known, rather than raise
+    # an error).  The logic here tests whether constant propagation is possible
+    # (because all of the inputs are constant).  If so, we disable fake tensor
+    # mode (if it is on) and do true compute on the constant.
+    #
+    # It's worth highlighting that we're making a policy decision here.
+    # There is a potential that the tensor is actually quite large, and we
+    # don't actually want to run the compute.  The tensor being quite large
+    # is one of the reasons why factory functions don't get this treatment
+    # (since they can be quite large; if a parameter is initialized to a
+    # constant value it will be!)  Similarly, there is also a potential
+    # to run an operator that blows up the size of a small tensor; we don't
+    # protect against this case, but we could force, e.g., only single
+    # element constant computation by testing the numel of the result before
+    # propagating const-ness.  Similarly, we don't require the constant to
+    # live on CPU, but we could.
+    any_constant = pytree.tree_any_only(_ProxyTensor, lambda t: t.constant is not None, (f_args, f_kwargs))
+
+    constant = None
+
+    # If this is a lift, the input tensor is guaranteed to be a
+    # constant, so we keep a copy of the original argument around so
+    # we can query it if we're asked to item() it at some later point
+    if func is torch.ops.aten.lift_fresh_copy.default and out.numel() <= CONSTANT_NUMEL_LIMIT:
+        with maybe_disable_fake_tensor_mode():
+            constant = args[0].clone()
+    elif (
+        torch.Tag.nondeterministic_seeded not in func.tags
+        and all_constant
+        and any_constant
+        and pytree.tree_all_only(torch.Tensor, lambda t: t.numel() <= CONSTANT_NUMEL_LIMIT, out)
+    ):
+        # NB: do NOT include factories as constants
+        with maybe_disable_fake_tensor_mode():
+            const_args, const_kwargs = pytree.tree_map_only(
+                _ProxyTensor, lambda t: t.constant, (f_args, f_kwargs)
+            )
+            constant = func(*const_args, **const_kwargs)
+    else:
+        constant = None
+
+    track_tensor_tree(out, proxy_out, constant=constant, tracer=tracer)
+    return out
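+
+# Illustrative sketch (editor's note, not part of the upstream file): the
+# constant-propagation path above is what makes a data-dependent call such as
+# item() on a small torch.tensor(...) constant succeed under tracing.  A
+# minimal example, assuming make_fx from this module:
+#
+#     def f(x):
+#         return x + torch.tensor(2).item()
+#
+#     gm = make_fx(f)(torch.randn(3))   # item() yields the known constant 2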
+
+class _SymNodeDict:
+    """
+    Wrapper around a dictionary that will hash SymInts with their nodes
+    """
+    def __init__(self):
+        self.sym_node_dict = {}
+
+    def __setitem__(self, key: py_sym_types, value: Any):
+        self.sym_node_dict[key.node] = value
+
+    def __getitem__(self, key: py_sym_types):
+        return self.sym_node_dict[key.node]
+
+    def __contains__(self, key: py_sym_types):
+        return key.node in self.sym_node_dict
+
+    def get(self, key: py_sym_types, default: Any = None):
+        return self.sym_node_dict.get(key.node, default)
+
+class PythonKeyTracer(Tracer):
+    def __init__(self):
+        super().__init__(autowrap_modules=())
+        self.tensor_tracker = WeakTensorKeyDictionary()
+        self.symnode_tracker = _SymNodeDict()  # type: ignore[var-annotated]
+        self.script_object_tracker = WeakIdKeyDictionary(dict=None, ref_type=_WeakHashRef)
+
+    # In general, we don't want to make modules leaves. In principle, users of
+    # this tracer might want to override this in order to turn a couple specific
+    # modules into leaves in the traced graph.
+    def call_module(
+            self, m: torch.nn.Module, forward: Callable[..., Any], args: Tuple[Any, ...], kwargs: Dict[str, Any]
+    ) -> Any:
+        return forward(*args, **kwargs)
+
+    # We don't want to turn getattr calls into proxies. So we just return the actual value.
+    def getattr(self, attr, attr_val, parameter_proxy_cache):
+        return attr_val
+
+    def create_arg(self, a: Any):
+        if isinstance(a, torch.nn.Parameter):
+            for n, p in self.root.named_parameters():
+                if a is p:
+                    return self.create_node('get_attr', n, (), {})
+            qualname: Optional[str] = None
+
+            if not qualname:
+                i = 0
+                while True:
+                    qualname = f'_param_constant{i}'
+                    if not hasattr(self.root, qualname):
+                        break
+                    i += 1
+                setattr(self.root, qualname, a)
+
+            return self.create_node('get_attr', qualname, (), {})
+        elif isinstance(a, (SymInt, SymFloat, SymBool)):
+            assert a.node.constant is not None
+            return a.node.constant
+        return super().create_arg(a)
+
+    def unwrap_proxy(self, e):
+        if isinstance(e, torch.Tensor):
+            return get_proxy_slot(e, self, e, lambda e: e.proxy)
+        elif isinstance(e, (torch.SymInt, torch.SymFloat, torch.SymBool)):
+            return get_proxy_slot(e, self, e, lambda e: e())
+        elif isinstance(e, torch.ScriptObject):
+            return get_proxy_slot(e, self, e)
+        else:
+            return e
+
+
+@torch._disable_dynamo
+def dispatch_trace(
+        root: Union[torch.nn.Module, Callable],
+        tracer: Tracer,
+        concrete_args: Optional[Tuple[Any, ...]] = None,
+) -> GraphModule:
+    graph = tracer.trace(root, concrete_args)
+    from torch._inductor.fx_passes.dedupe_symint_uses import dedupe_symints
+    dedupe_symints(graph)
+    name = root.__class__.__name__ if isinstance(root, torch.nn.Module) else root.__name__
+    return fx._lazy_graph_module._make_graph_module(tracer.root, graph, name)
+
+
+def wrap_key(f, tensors, tracer, pre_dispatch: bool):
+    flat_tensors, tensors_spec = pytree.tree_flatten(tensors)
+
+    @functools.wraps(f)
+    def wrapped(*proxies):
+        flat_proxies, proxies_spec = pytree.tree_flatten(proxies)
+        assert len(flat_proxies) == len(flat_tensors)
+        with disable_proxy_modes_tracing() as m:
+            assert isinstance(m, ProxyTorchDispatchMode)
+            track_tensor_tree(flat_tensors, flat_proxies, constant=None, tracer=tracer)
+
+        out = f(*tensors)
+        out = pytree.tree_map_only(
+            torch.Tensor,
+            lambda t: get_proxy_slot(t, tracer, t, lambda x: x.proxy),
+            out
+        )
+        out = pytree.tree_map_only(
+            (SymInt, SymFloat, SymBool),
+            lambda t: get_proxy_slot(t, tracer)(),
+            out
+        )
+        return out
+
+    return wrapped
+
+ORIGINAL_ATEN = None
+@contextmanager
+def set_original_aten_op(func):
+    global ORIGINAL_ATEN
+    if ORIGINAL_ATEN is None and fx_traceback.has_preserved_node_meta():
+        ORIGINAL_ATEN = func
+        fx_traceback.current_meta['original_aten'] = func
+        try:
+            yield
+        finally:
+            ORIGINAL_ATEN = None
+            fx_traceback.current_meta['original_aten'] = None
+    else:
+        yield
+
+
+
+# This mode is **only** used for pre_dispatch tracing.
+# In particular, we need to make sure that autograd/autocast API's
+# that do not desugar into dispatcher operators stay in the graph.
+class PreDispatchTorchFunctionMode(TorchFunctionMode):
+
+    def __init__(self, tracer):
+        self.tracer = tracer
+
+    def __torch_function__(self, func, types, args=(), kwargs=None):
+        kwargs = kwargs or {}
+        if func in _side_effectful_need_to_be_preserved_pre_dispatch:
+            # This is for passing the export verifier, which needs to verify meta['val'].
+            # TODO(tmanlaibaatar): we should systematically couple this with the export verifier,
+            # instead of hardcoding it here.
+            node = self.tracer.create_node("call_function", func, args, {})
+            if func is torch._C._set_grad_enabled:
+                node.meta['val'] = None
+            return node
+            # Don't actually run the function! We just want to trace the calls
+            # into a graph. We don't actually want to change global autograd state.
+        return func(*args, **kwargs)
+
+
+class ProxyTorchDispatchMode(TorchDispatchMode):
+    def __init__(self, tracer, tracing_mode, pre_dispatch=False, _allow_fake_constant=False, _error_on_data_dependent_ops=True):
+        dk = torch._C.DispatchKey.PreDispatch if pre_dispatch else None
+        super().__init__(dk)
+        self.tracer = tracer
+        self.tracing_mode = tracing_mode
+        self.enable_tracing = True
+        self.pre_dispatch = pre_dispatch
+        self._allow_fake_constant = _allow_fake_constant
+        self._error_on_data_dependent_ops = _error_on_data_dependent_ops
+        self.sym_mode = ProxySymDispatchMode(tracer)
+        self.trace_state = {}
+        self._managers = []
+        # Indicates to our torch_dispatch dispatching infra that
+        # this is an "infra" mode with lower dispatching precedence.
+        self._mode_key = torch._C._TorchDispatchModeKey.PROXY
+        # Every time we enter a mode, we maintain a stack telling us what the previous
+        # ProxyTorchDispatchMode state was (if there was any).
+        # This lets us properly reset the state on exit.
+        self.enter_stack: List[Optional[ProxyTorchDispatchMode]] = []
+
+    @count
+    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
+        with self.sym_mode.enable(False), set_original_aten_op(func):
+            return self.inner_torch_dispatch(func, types, args, kwargs)
+
+    def __enter__(self):
+        # sym mode first, then us...
+        m = self.sym_mode.enable(True)
+        self._managers.append(m)
+        m.__enter__()
+        # Stash and store the previous proxy mode (there may or may not be one)
+        maybe_prev_proxy_mode = _unset_infra_mode(torch._C._TorchDispatchModeKey.PROXY)
+        self.enter_stack.append(maybe_prev_proxy_mode)
+        return super().__enter__()
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        m = self._managers.pop()
+        # ...exit us first, then sym mode
+        b = super().__exit__(exc_type, exc_value, traceback)
+
+        # Re-enable the previous proxy mode, if there was one.
+        mb_previous_proxy_mode = self.enter_stack.pop()
+        if mb_previous_proxy_mode is not None:
+            _push_mode(mb_previous_proxy_mode)
+
+        if not b:
+            return m.__exit__(exc_type, exc_value, traceback)
+        else:
+            return m.__exit__(None, None, None)
+
+
+    def inner_torch_dispatch(self, func, types, args=(), kwargs=None):
+        if not self.enable_tracing:
+            return func(*args, **kwargs)
+
+        if func in [prim.device.default]:
+            return func(*args, **kwargs)
+
+        return proxy_call(self, func, self.pre_dispatch, args, kwargs)
+
+
+class ProxySymDispatchMode(SymDispatchMode):
+    def __init__(self, tracer):
+        super().__init__()
+        self.tracer = tracer
+        # When false, we don't trace operations.  If you do this, you MUST
+        # call track_tensor/track_tensor_tree on all results of the operation
+        # to ensure we can adequately track the results
+        self.enable_tracing = True
+
+    @contextmanager
+    def enable(self, b):
+        old = self.enable_tracing
+        self.enable_tracing = b
+        try:
+            yield
+        finally:
+            self.enable_tracing = old
+
+    def _compute_proxy(self, func, args, out: Union[SymInt, SymFloat, SymBool]):
+        n_args = tuple(
+            get_proxy_slot(a, self.tracer)().node if isinstance(a, py_sym_types) else a
+            for a in args
+        )
+
+        # func doesn't have a __torch_function__ that Proxy can interpose, so
+        # we gotta do it manually
+        n_out = self.tracer.create_node("call_function", func, n_args, {})
+        p_out = fx.Proxy(n_out, self.tracer)
+        set_meta(p_out, out)
+        return p_out
+
+    def __sym_dispatch__(self, func, types, args, kwargs):
+        if not self.enable_tracing:
+            return func(*args, **kwargs)
+
+        # Peephole optimize multiply by one
+        # NB: be careful not to trigger guards here!
+        if func == operator.mul:
+            if isinstance(args[1], int) and args[1] == 1:
+                return args[0]
+            elif isinstance(args[0], int) and args[0] == 1:
+                return args[1]
+
+        # For speed, we assume there are no nested data structures
+        # (otherwise we could use tree_map)
+        # We also assume there are no keyword arguments.
+        assert not kwargs
+        out = func(*args, **kwargs)
+
+        # If func returned a constant, we don't need to trace; we have
+        # determined that the result is constant (no matter if the inputs
+        # were symbolic) and it is no longer necessary to trace the
+        # computation.  This could occur if func triggered some guards.
+        if isinstance(out, py_sym_types):
+            # Delays tracing out the proxies on this op until we actually need it
+            p_out_thunk = thunkify(self._compute_proxy, func=func, args=args, out=out)
+            set_proxy_slot(out, self.tracer, p_out_thunk)
+
+        return out
+
+
+# TODO: I'm not sure what the point of this class is; you can just
+# make_fx through a regular Interpreter
+class DecompositionInterpreter(torch.fx.Interpreter):
+    def __init__(self, module: torch.fx.GraphModule, new_graph: torch.fx.Graph, decomposition_table=None, **kwargs):
+        super().__init__(module, **kwargs)
+        self.new_graph = new_graph
+        self.tracer = torch.fx.proxy.GraphAppendingTracer(self.new_graph)
+        # Blegh
+        self.tracer.tensor_tracker = WeakTensorKeyDictionary()  # type: ignore[attr-defined]
+        self.tracer.symnode_tracker = weakref.WeakKeyDictionary()  # type: ignore[attr-defined]
+        self.decomposition_table = decomposition_table
+        if self.decomposition_table is None:
+            self.decomposition_table = {}
+        self.mode = ProxyTorchDispatchMode(self.tracer, tracing_mode="real")
+
+    def placeholder(self, target, args, kwargs):
+        out = super().placeholder(target, args, kwargs)
+        proxy = torch.fx.Proxy(self.new_graph.placeholder(target), self.tracer)
+        track_tensor_tree(out, proxy, constant=None, tracer=self.tracer)
+        # TODO handle case where the first character of target is '*'
+        return out
+
+    def get_attr(self, target, args, kwargs):
+        out = super().get_attr(target, args, kwargs)
+        proxy = torch.fx.Proxy(self.new_graph.get_attr(target), self.tracer)
+        track_tensor_tree(out, proxy, constant=None, tracer=self.tracer)
+        return out
+
+    # call_function, call_method, call_module get traced automatically by the outer mode.
+
+    def output(self, target, args, kwargs):
+        out = super().output(target, args, kwargs)
+
+        def unwrap(e):
+            return get_proxy_slot(e, self.tracer, e, lambda x: x.proxy.node)
+        self.new_graph.output(pytree.tree_map(unwrap, out))
+        return out
+
+    def run(self, *args, **kwargs):
+        # We should enter the mode at least once so that we can restore it later
+        # See: https://github.com/pytorch/pytorch/pull/82549#discussion_r934782025
+        with decompose(self.decomposition_table), self.mode:
+            return super().run(*args, **kwargs)
+
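+
+# Illustrative sketch (editor's note, not part of the upstream file): a typical
+# use of DecompositionInterpreter is to re-trace an existing GraphModule into a
+# fresh graph while applying a decomposition table.  Here `traced_gm`,
+# `decomp_table` and `example_inputs` are assumed to already exist:
+#
+#     new_graph = torch.fx.Graph()
+#     DecompositionInterpreter(traced_gm, new_graph,
+#                              decomposition_table=decomp_table).run(*example_inputs)
+#     decomposed_gm = torch.fx.GraphModule(traced_gm, new_graph)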
+
+def wrapper_and_args_for_make_fx(func, args, kwargs):
+    # make_fx doesn't support kwargs, so we need to do this flattening
+    # and then unflatten the args before calling func
+    flat_args, spec = pytree.tree_flatten((args, kwargs))
+
+    def wrapped(flat_args):
+        fn_args, fn_kwargs = pytree.tree_unflatten(flat_args, spec)
+        return func(*fn_args, **fn_kwargs)
+    return wrapped, flat_args
+
+@contextmanager
+def disable_autocast_cache():
+    old_value = torch.is_autocast_cache_enabled()
+    torch.set_autocast_cache_enabled(False)
+    try:
+        yield
+    finally:
+        torch.set_autocast_cache_enabled(old_value)
+
+
+class _ModuleStackTracer(PythonKeyTracer):
+    r"""Customized version of PythonKeyTracer that retains module stack
+    information in node.meta["nn_module_stack"].
+
+    FX symbolic trace actually does this already, but it relies on `self.root`
+    being the actual module being traced. Since make_fx traces a lambda of our
+    creation, things don't work properly.
+
+    So for this version we hold onto a reference to the original module
+    (scope_root) and use that to match the path. Also when we see,
+            A
+           / \
+          B   C
+           \ /
+            D
+    we want to record the path as A.B.D by recording only one path.
+    See Note [Preserving the nn module stack metadata during export non-strict mode]  # noqa: W605
+    """
+
+    def __init__(self, scope_root):
+        super().__init__()
+        self.scope_root = scope_root
+        self.proxy_paths = WeakKeyDictionary()
+        self.proxy_modules = WeakKeyDictionary()
+        self.counter = 0
+
+        self.module_id_cache = defaultdict(list)
+        for name, mod in self.scope_root.named_modules(remove_duplicate=False):
+            self.module_id_cache[id(mod)].append(name)
+
+        self_ = self
+
+        class AttrProxy:
+            def __init__(self, base, path):
+                self.__class__ = type(
+                    base.__class__.__name__,
+                    (self.__class__, base.__class__),
+                    {},
+                )
+                self.__dict__ = base.__dict__
+                self.__class__.__module__ = base.__class__.__module__
+                self.__class__.__qualname__ = base.__class__.__qualname__
+                self_.proxy_paths[self] = path
+                self_.proxy_modules[self] = base
+
+            def __getattr__(self, name):
+                assert isinstance(self, torch.nn.Module)
+                attr_val = super().__getattr__(name)
+                if isinstance(attr_val, AttrProxy):
+                    attr_val = self_.proxy_modules[attr_val]
+                elif not isinstance(attr_val, torch.nn.Module):
+                    return attr_val
+                return AttrProxy(attr_val, self_.proxy_paths[self] + "." + name)
+
+            @property
+            def _modules(self):
+                assert "_modules" in self.__dict__
+                submodules = self.__dict__["_modules"]
+                assert isinstance(submodules, dict)
+                return {
+                    key: AttrProxy(value, self_.proxy_paths[self] + "." + str(key))
+                    for key, value in submodules.items()
+                }
+
+        self.proxy_type = AttrProxy
+
+    def path_of_module(self, mod: torch.nn.Module) -> str:
+        """
+        Use tracked access path during tracing instead of the default BFS behavior.
+        Still use all the possible module paths to verify the result.
+        """
+        if mod is self.scope_root:
+            return ""
+
+        if isinstance(mod, self.proxy_type):
+            return self.proxy_paths[mod]
+
+        return Tracer.path_of_module(self, mod)
+
+    def getattr(self, attr, attr_val, parameter_proxy_cache):
+        if not isinstance(attr_val, torch.nn.Module) or isinstance(attr_val, torch.fx.GraphModule):
+            return super().getattr(attr, attr_val, parameter_proxy_cache)
+        if isinstance(attr_val, self.proxy_type):
+            return attr_val
+        return self.proxy_type(attr_val, attr)
+
+    def trace(self, root, concrete_args):
+        res = super().trace(root, concrete_args)
+        # Since we are making AttrProxy mimic the original
+        # submodule, when someone registers a module directly
+        # to the tracer while tracing, the proxy object gets registered
+        # first. So we need to replace the proxy modules with the real ones.
+        # This can happen during HOO tracing.
+        proxy_module_names_to_be_replaced = []
+        for name, module in self.root.named_modules():
+            if module in self.proxy_modules:
+                proxy_module_names_to_be_replaced.append((name, module))
+
+        def _delete_proxy_attr(obj, target):
+            # Copied from fx/graph_module.py
+            # Customized it for proxy type
+            atoms = target.split(".")
+            path, target_submod = atoms[:-1], atoms[-1]
+            assert isinstance(obj, torch.nn.Module)
+            mod = obj
+
+            # Get the parent module
+            for item in path:
+
+                if not hasattr(mod, item):
+                    return False
+
+                mod = getattr(mod, item)
+
+                if not isinstance(mod, (self.proxy_type, torch.nn.Module)):
+                    return False
+
+            if not hasattr(mod, target_submod):
+                return False
+
+            # At least the leaf module should be proxy type.
+            if not isinstance(getattr(mod, target_submod), self.proxy_type):
+                return False
+
+            delattr(mod, target_submod)
+            return True
+
+        for (proxy_module_name, proxy_module) in proxy_module_names_to_be_replaced:
+            _delete_proxy_attr(self.root, proxy_module_name)
+            actual_module = self.proxy_modules[proxy_module]
+            _assign_attr(actual_module, self.root, proxy_module_name)
+
+        return res
+
+
+    def call_module(self, m, forward, args, kwargs):
+        """PythonKeyTracer overrides call_module to avoid the scope handling,
+        but we actually want it.
+        """
+        from torch._dynamo import OptimizedModule
+        # FIXME (tmanlaibaatar)
+        # When we call torch.compile inside HOO, we will end up
+        # invoking a module that is not registered on the root. For
+        # now, we just inline them. But once we start supporting
+        # mark_strict in export, we do need to properly handle this.
+        # Right now, it doesn't matter because current non-strict
+        # use cases don't need to work with HOO.
+        if isinstance(m, (OptimizedModule, GraphModule)):
+            return forward(*args, **kwargs)
+        return Tracer.call_module(self, m, forward, args, kwargs)
+
+
+    def is_leaf_module(self, m, module_qualified_name):
+        return False
+
+
+def make_fx(f,
+            decomposition_table=None,
+            tracing_mode="real",
+            _allow_non_fake_inputs=False,
+            *,
+            pre_dispatch=False,
+            record_module_stack=False,
+            _allow_fake_constant=False,
+            _error_on_data_dependent_ops=True):
+    assert tracing_mode in ["real", "fake", "symbolic"]
+
+    if decomposition_table is None:
+        decomposition_table = {}
+
+    if torch.ops.aten.sym_numel.default not in decomposition_table:
+        decomposition_table = {
+            **decomposition_table,
+            torch.ops.aten.sym_numel.default: torch._decomp.decompositions.sym_numel
+        }
+
+    @functools.wraps(f)
+    def wrapped(*args):
+        # Avoid importing sympy at a module level
+        from .symbolic_shapes import ShapeEnv
+
+        phs = pytree.tree_map(lambda _: fx.PH, args)  # type: ignore[attr-defined]
+
+        if hasattr(f, "_orig_mod") and record_module_stack:
+            scope_root = f._orig_mod
+            fx_tracer = _ModuleStackTracer(scope_root)
+        else:
+            fx_tracer = PythonKeyTracer()
+        fake_tensor_mode: Any = nullcontext()
+        if tracing_mode == "real":
+            fake_tensor_mode = nullcontext()
+        elif tracing_mode == "fake":
+            import torch._dynamo
+            fake_tensor_mode = torch._dynamo.utils.detect_fake_mode(args)
+            if fake_tensor_mode is None:
+                fake_tensor_mode = FakeTensorMode(
+                    allow_fallback_kernels=True,
+                    allow_non_fake_inputs=_allow_non_fake_inputs,
+                    shape_env=ShapeEnv(),
+                    static_shapes=True,
+                )
+        elif tracing_mode == "symbolic":
+            import torch._dynamo
+            fake_tensor_mode = torch._dynamo.utils.detect_fake_mode(args)
+            if fake_tensor_mode is None:
+                shape_env = ShapeEnv()
+                fake_tensor_mode = FakeTensorMode(
+                    allow_fallback_kernels=False,
+                    allow_non_fake_inputs=_allow_non_fake_inputs,
+                    shape_env=shape_env)
+            else:
+                shape_env = fake_tensor_mode.shape_env
+                assert shape_env is not None, "shape_env should be set if tracing with 'symbolic'"
+
+        else:
+            raise AssertionError(f"Unexpected tracing type: {tracing_mode}")
+
+        python_dispatcher_mode: Any = nullcontext()
+        pre_dispatch_mode: Any = nullcontext()
+        # pre-autograd tracing uses per-dispatch-key modes,
+        # which requires the python dispatcher
+        if tracing_mode == "symbolic" or pre_dispatch:
+            python_dispatcher_mode = enable_python_dispatcher()
+        if pre_dispatch:
+            pre_dispatch_mode = enable_pre_dispatch()
+
+        proxy_function_mode: Any = nullcontext()
+        if pre_dispatch:
+            proxy_function_mode = PreDispatchTorchFunctionMode(fx_tracer)
+
+        proxy_mode = ProxyTorchDispatchMode(fx_tracer,
+                                            tracing_mode,
+                                            pre_dispatch=pre_dispatch,
+                                            _allow_fake_constant=_allow_fake_constant,
+                                            _error_on_data_dependent_ops=_error_on_data_dependent_ops)
+
+        arg_count = 0
+
+        def wrap_fake(x):
+            nonlocal arg_count
+            # TODO: it would be nice to line these up with the names
+            # FX will choose for the placeholders, but we don't
+            # actually know what the names will be at this point yet
+            # NB: the Source here is actually meaningless
+            from torch._dynamo.source import ConstantSource
+            source = ConstantSource(f"input{arg_count}")
+            if isinstance(x, torch.Tensor):
+                arg_count += 1
+                return fake_tensor_mode.from_tensor(x, source=source)  # type: ignore[attr-defined]
+            # NB: don't match on bools
+            elif type(x) is int and tracing_mode == "symbolic":
+                return shape_env.create_symintnode(shape_env.create_symbol(x, source, positive=None), hint=x, source=source)
+
+            return x
+
+        sym_mode = proxy_mode.sym_mode
+
+        wrap_fn_map = {
+            "real": lambda x: x,
+            "fake": wrap_fake,
+            "symbolic": wrap_fake,
+        }
+        args = pytree.tree_map(wrap_fn_map[tracing_mode], args)
+
+        if not hasattr(inspect.unwrap(f), '__code__') or inspect.unwrap(f).__code__.co_flags & inspect.CO_VARARGS:
+            # FX doesn't support varargs, so we gotta fake up a wrapper
+            # TODO: Would be nice to fix this at the source...
+            func = fake_signature(f, len(phs))
+        else:
+            func = f
+
+        # We disable the autocast cache as the autocast cache causes type conversions on parameters to
+        # check a cache, which introduces untracked tensors into the graph
+        #
+        # We also disable tracing by any other tensor proxy-based tracers except the current. The
+        # purpose of `make_fx` is to produce graphmodules as a side effect; its internal execution is
+        # thus irrelevant to any external functional trace.
+        with decompose(decomposition_table), fake_tensor_mode, python_dispatcher_mode, pre_dispatch_mode, proxy_function_mode, \
+             sym_mode, proxy_mode, disable_autocast_cache():
+            t = dispatch_trace(wrap_key(func, args, fx_tracer, pre_dispatch), tracer=fx_tracer, concrete_args=tuple(phs))
+
+        # TODO: kind of a bad way to do it, should maybe figure out a better way
+        if tracing_mode == "symbolic":
+            t.shape_env = shape_env  # type: ignore[assignment]
+        return t
+
+    return wrapped
+
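+# Illustrative sketch (editor's note, not part of the upstream file): make_fx
+# returns a wrapper that, when called with example inputs, yields a GraphModule
+# of ATen operations.  With tracing_mode="symbolic", shapes are traced through
+# a ShapeEnv rather than baked in as constants:
+#
+#     def f(x):
+#         return torch.sin(x) * 2
+#
+#     gm = make_fx(f, tracing_mode="symbolic")(torch.randn(4))
+#     gm.print_readable()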
+
+def get_torch_dispatch_modes():
+    return torch.utils._python_dispatch._get_current_dispatch_mode_stack()
+
+
+def get_innermost_proxy_mode():
+    return torch._C._get_dispatch_mode(torch._C._TorchDispatchModeKey.PROXY)
+
+
+@contextlib.contextmanager
+def disable_proxy_modes_tracing():
+    return _disable_infra_mode(torch._C._TorchDispatchModeKey.PROXY)
+
+
+def maybe_handle_decomp(proxy_mode, op, args, kwargs):
+    if op in CURRENT_DECOMPOSITION_TABLE:
+        with proxy_mode:
+            return CURRENT_DECOMPOSITION_TABLE[op](*args, **kwargs)
+    return NotImplemented
+
+
+def get_isolated_graphmodule(func, args, kwargs, tracing_mode="real"):
+    """A helper function used to get the GraphModule for the given func.
+
+    It's expected to be used in the ProxyTensor tracing context.
+    It detaches the args and kwargs from the current tracer so that the trace of
+    the current graph module can be created without any side-effects.
+    """
+    wrapped, all_args = wrapper_and_args_for_make_fx(func, args, kwargs)
+
+    with disable_proxy_modes_tracing():
+        gm = make_fx(wrapped, tracing_mode=tracing_mode)(all_args)
+    return gm
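+
+# Illustrative sketch (editor's note, not part of the upstream file): e.g.
+#
+#     gm = get_isolated_graphmodule(torch.add, (torch.randn(2), torch.randn(2)), {})
+#
+# returns a GraphModule for torch.add traced independently of any enclosing
+# proxy-tensor trace.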
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/recording.py b/MLPY/Lib/site-packages/torch/fx/experimental/recording.py
new file mode 100644
index 0000000000000000000000000000000000000000..755394fffbbb41d636265227c95161b94d823cc4
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/recording.py
@@ -0,0 +1,458 @@
+import functools
+import itertools
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.utils._pytree as pytree
+
+
+__all__ = [
+    "ShapeEnvEvent",
+    "record_shapeenv_event",
+    "replay_shape_env_events",
+    "FakeTensorMeta",
+    "shape_env_check_state_equal",
+    "NotEqualError",
+]
+
+# [Note: Recording ShapeEnv Events]
+# =================================
+#
+# What is a ShapeEnv event?
+# -------------------------
+# We consider a ShapeEnv event every function call (ShapeEnv method or
+# independent function) that modifies the state of the ShapeEnv instance.
+# Such calls are recorded alongside their positional and keyword arguments,
+# so that it may be replayed over a different ShapeEnv instance.
+#
+# See [Note: ShapeEnv State Equality] for what is considered the state
+# of a ShapeEnv instance.
+#
+# What is it for?
+# ---------------
+# ShapeEnv events recording is used for reconstructing the ShapeEnv in an
+# arbitrary state in time.
+#
+# Being able to arbitrarily replay events like this is useful, mainly for
+# translation validation bisection, i.e. if a ValidationException has been
+# raised, finding the earliest point in time where the translation validation
+# fails.
+#
+# Besides that, it also allows us to inspect the given instance and,
+# for example, check the guards that would actually be issued at that point.
+#
+# What kind of arguments can be stored in an event?
+# -------------------------------------------------
+# There's no specific rule for what cannot be used as an argument.
+# That said, pay special attention to the following cases:
+#
+#   1. Tensor inputs: there are some tests that check whether the inputs
+#      were garbage collected after execution. These will fail if there's
+#      an event that is holding a reference to those inputs.
+#
+#   2. ShapeEnv arguments: if there is an argument of ShapeEnv type, that
+#      will be automatically replaced by the new given ShapeEnv instance.
+#
+#   3. SymTypes arguments: they also hold references to ShapeEnv. So,
+#      whenever we see them, we create a new instance, replacing the
+#      ShapeEnv reference.
+#
+#   4. FX nodes: specifically, FX nodes from the FX graph for symbolic
+#      shapes. That argument must be replaced when replaying the event at
+#      ShapeEnvEvent.run, since it has to reference a node from the given
+#      instance, and not from the recorded instance.
+
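+# Illustrative sketch (editor's note, not part of the upstream file): with event
+# recording enabled, the mutations of a ShapeEnv can be replayed onto a fresh
+# instance.  Here `env` is assumed to be a ShapeEnv whose `events` list holds
+# the recorded ShapeEnvEvent objects (the first one being the constructor call):
+#
+#     from torch.fx.experimental.recording import replay_shape_env_events
+#
+#     new_env = replay_shape_env_events(env.events)
+#     # new_env now reproduces the recorded state of `env`.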
+
+# Event class for reconstructing ShapeEnv at an arbitrary point in time.
+#
+# Represents a method call that mutates ShapeEnv in a way that affects the
+# issued guards, when ShapeEnv.produce_guards is called.
+@dataclass
+class ShapeEnvEvent:
+    # ShapeEnv method.
+    f: Callable
+
+    # Arguments and keyword arguments called with.
+    args: Optional[List[Any]] = None
+    kwargs: Optional[Dict[str, Any]] = None
+
+    # List of tracked_fakes at the time the method was called.
+    tracked_fakes: Optional[List[Any]] = None
+
+    # Name of the captured event.
+    # Used for special handling of particular methods.
+    name: Optional[str] = None
+
+    # Replay itself, but using shape_env as self.
+    def run(self, shape_env=None) -> Any:
+        from torch.fx.experimental.symbolic_shapes import (
+            is_symbolic,
+            ShapeEnv,
+            SymTypes,
+        )
+
+        # Special handling for the constructor event.
+        if self.f is ShapeEnv:
+            assert shape_env is None and self.args is None and self.kwargs is not None
+            return ShapeEnv(**self.kwargs)
+
+        assert shape_env is not None
+        args = list(self.args or list())
+        kwargs = dict(self.kwargs or dict())
+
+        # Replace any argument of type ShapeEnv by the given one.
+        args, kwargs = pytree.tree_map_only(
+            ShapeEnv, lambda _: shape_env, (args, kwargs)
+        )
+
+        # Replace any argument of type SymTypes by a new instance,
+        # replacing its ShapeEnv reference.
+        args, kwargs = pytree.tree_map_only(
+            lambda x: isinstance(x, SymTypes) and is_symbolic(x),
+            lambda a: type(a)(a.node.with_shape_env(shape_env)),
+            (args, kwargs),
+        )
+
+        # Converts FX nodes using the mapping argument.
+        def maybe_convert_node(x: Any) -> Any:
+            if not isinstance(x, torch.fx.Node):
+                # Don't do anything to x if it's not an FX node.
+                return x
+
+            # If, at some point, we created an FX node, it means that translation validation is on.
+            # It also means we are building an FX graph for symbolic shapes at shape_env.graph, and
+            # we are tracking node names at shape_env.name_to_node.
+            assert hasattr(shape_env, "name_to_node")
+            name_to_node = shape_env.name_to_node  # type: ignore[attr-defined]
+            assert x.name in name_to_node
+            return name_to_node[x.name]
+
+        # Replaces the value of a specific argument with the result of fn.
+        def replacearg(index: int, key: str, fn: Callable):
+            if index < len(args):
+                args[index] = fn(args[index])
+            if key in kwargs:
+                kwargs[key] = fn(kwargs[key])
+
+        if self.is_create_fx_call_function():
+            # ShapeEnv.create_fx_call_function:
+            # "args" parameter is a tuple of FX nodes from the FX graph of the old ShapeEnv.
+            # They must be replaced, since a "call_function" FX node with this tuple as argument
+            # will be added to the FX graph of the new shape_env.
+            replacearg(
+                index=2,
+                key="args",
+                fn=lambda args: tuple(maybe_convert_node(a) for a in args),
+            )
+        if self.is_evaluate_expr() or self.is_defer_runtime_assert():
+            # ShapeEnv.evaluate_expr and ShapeEnv.defer_runtime_assert:
+            # "fx_node" parameter is an (optional) FX node that represents the evaluate expression.
+            # They must be replaced, since it will be part of a "call_function" FX node for
+            # torch._assert, which will be added to the FX graph of the new shape_env.
+            replacearg(index=3, key="fx_node", fn=maybe_convert_node)
+
+        # Actually call the method with the converted arguments.
+        return self.f(*args, **kwargs)
+
+    def __str__(self) -> str:
+        name = self.name if self.name is not None else self.f.__name__
+        return f"event: {name} ({self.args}, {self.kwargs})"
+
+    def is_create_fx_call_function(self) -> bool:
+        return self.name == "_create_fx_call_function"
+
+    def is_evaluate_expr(self) -> bool:
+        return self.name == "evaluate_expr"
+
+    def is_defer_runtime_assert(self) -> bool:
+        return self.name == "defer_runtime_assert"
+
+
+# Extracts a ShapeEnv instance from args and kwargs.
+# Specifically, it looks for:
+#   1. ShapeEnv arguments
+#   2. SymInt, SymFloat, or SymBool arguments
+# If we find more than one object of any of the above types, we
+# also check that the ShapeEnv instance is the same for all of them.
+def _extract_shape_env_and_assert_equal(args, kwargs):
+    from torch.fx.experimental.symbolic_shapes import is_symbolic, ShapeEnv, SymTypes
+
+    def assert_equal(old: Optional[ShapeEnv], new: ShapeEnv) -> ShapeEnv:
+        if old is not None:
+            assert old is new, "call with different ShapeEnv"
+        return new
+
+    shape_env = None
+    for val in itertools.chain(args, kwargs.values()):
+        if isinstance(val, ShapeEnv):
+            shape_env = assert_equal(shape_env, val)
+        if isinstance(val, SymTypes) and is_symbolic(val):
+            shape_env = assert_equal(shape_env, val.node.shape_env)
+
+    return shape_env
+
+
+# Decorator for recording the given function as a replayable event.
+#
+# This decorator should be used on every function that mutates the state of
+# ShapeEnv in some way that affects the resulting issued guards (i.e. when
+# ShapeEnv.produce_guards is called).
+#
+# save_tracked_fakes: saves a snapshot of the TrackedFake list.
+# This is used when calling ShapeEnv.produce_guards at arbitrary points in time.
+#
+# When to save the list of TrackedFake?
+# =====================================
+# We should save the list of TrackedFake whenever the translation validation
+# bisection may actually stop and call the produce_guards method right after
+# the recorded function was replayed. In other words, since the bisection
+# bisects through torch._assert calls, we should save the list in all methods
+# that add a torch._assert call to the symbolic shapes FX graph.
+#
+# At the moment, there are 2 methods that save the list:
+#   - ShapeEnv.evaluate_expr
+#   - ShapeEnv.defer_runtime_assert
+def record_shapeenv_event(*, save_tracked_fakes: bool = False) -> Callable:
+    def decorator(fn: Callable) -> Callable:
+        assert callable(fn)
+        name = fn.__name__
+
+        @functools.wraps(fn)
+        def wrapper(*args, **kwargs):
+            from torch.fx.experimental.symbolic_shapes import ShapeEnv
+
+            if isinstance(args[0], ShapeEnv) and args[0].is_recording:  # type: ignore[has-type]
+                # If ShapeEnv is already recording an event, call the wrapped
+                # function directly.
+                #
+                # NB: here, we skip the check of whether all ShapeEnv instances
+                # are equal, in favor of a faster dispatch.
+                return fn(*args, **kwargs)
+
+            # Retrieve an instance of ShapeEnv.
+            # Assumption: the collection of args and kwargs may not reference
+            # different ShapeEnv instances.
+            self = _extract_shape_env_and_assert_equal(args, kwargs)
+
+            # If we are calling this function without any ShapeEnv instance
+            # alive in its arguments, we don't record; we just call the original.
+            if self is None:
+                return fn(*args, **kwargs)
+
+            # Otherwise, start recording and call the function.
+            with self._recording():
+                # Take a snapshot of the current tracked_fakes.
+                tracked_fakes = (
+                    self._snapshot_tracked_fakes() if save_tracked_fakes else None
+                )
+                # Record the event for 'fn'.
+                event = ShapeEnvEvent(
+                    fn, list(args), kwargs, tracked_fakes, name=fn.__name__
+                )
+                self.events.append(event)
+                # Play the event on this ShapeEnv.
+                return event.run(self)
+
+        return wrapper
+
+    return decorator
+
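+# Illustrative sketch (editor's note, not part of the upstream file): inside
+# symbolic_shapes.py the decorator is applied to state-mutating ShapeEnv
+# methods; the method below is abbreviated and only the decorator placement
+# is the point:
+#
+#     class ShapeEnv:
+#         @record_shapeenv_event(save_tracked_fakes=True)
+#         def evaluate_expr(self, expr, fx_node=None):
+#             ...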
+
+# Replays the ShapeEnvEvents list.
+# It assumes the first event is the constructor call.
+#
+# FX nodes stored in the events are remapped onto the newly created ShapeEnv's graph inside ShapeEnvEvent.run.
+def replay_shape_env_events(events):
+    from torch.fx.experimental.symbolic_shapes import ShapeEnv
+
+    constructor_event = events[0]
+    assert constructor_event.f == ShapeEnv
+
+    # Constructs the new ShapeEnv.
+    shape_env = constructor_event.run()
+
+    for event in events[1:]:
+        try:
+            # Actually replays each event.
+            # FX nodes referenced by the event are re-resolved on every run, since
+            # the node list might change after each event is replayed.
+            event.run(shape_env)
+        except Exception as e:
+            raise RuntimeError(f"failed when running event: {event}") from e
+
+    return shape_env
+
+
+# FakeTensor metadata.
+# This is to be used in place of FakeTensor placeholders when calling
+# ShapeEnv.produce_guards.
+@dataclass
+class FakeTensorMeta:
+    tensor_size: Tuple[Union[int, torch.SymInt], ...]
+    tensor_stride: Tuple[Union[int, torch.SymInt], ...]
+    tensor_storage_offset: Union[int, torch.SymInt]
+    is_nested: bool
+
+    def size(self) -> Tuple[Union[int, torch.SymInt], ...]:
+        return self.tensor_size
+
+    def stride(self) -> Tuple[Union[int, torch.SymInt], ...]:
+        return self.tensor_stride
+
+    def storage_offset(self) -> Union[int, torch.SymInt]:
+        return self.tensor_storage_offset
+
+    def dim(self) -> int:
+        return len(self.tensor_size)
+
+    @staticmethod
+    def from_fake(fake) -> "FakeTensorMeta":
+        return FakeTensorMeta(
+            fake.size(), fake.stride(), fake.storage_offset(), fake.is_nested
+        )
+
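+# Illustrative sketch (editor's note, not part of the upstream file): given a
+# FakeTensor `fake`, a lightweight stand-in exposing the same size/stride
+# accessors can be built as
+#
+#     meta = FakeTensorMeta.from_fake(fake)
+#     meta.size(), meta.stride(), meta.storage_offset(), meta.dim()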
+
+# [Note: ShapeEnv State Equality]
+# ===============================
+#
+# What is considered ShapeEnv state?
+# ----------------------------------
+# We consider the state of a ShapeEnv instance to be everything that
+# is not in the inlined tuple inside the remove_nonstate_variables function.
+# That is: the fields within ShapeEnv that modify the flow of execution
+# of the program.
+#
+# So, for example: the replacements field might influence how an
+# expression is simplified. That, in turn, may result in a guard being
+# statically known (i.e. not added).
+#
+# On the other hand, var_to_stack only changes what is printed
+# on the screen, i.e. it is used only for debugging purposes. Therefore, we
+# should not consider it when comparing states.
+#
+# What to do on NotEqualError?
+# ----------------------------
+# Here are a few possible causes for getting a NotEqualError raised:
+#
+#   1. New field that does not belong in the ShapeEnv state.
+#      For example: log field of type ShapeEnvLoggerAdapter. Different
+#      ShapeEnv instances will always have different ShapeEnvLoggerAdapter
+#      instances, i.e. equality comparison would fail.
+#      Solution: add it to the inlined tuple inside remove_nonstate_variables
+#      function inside check_equal method.
+#
+#   2. New field that is not directly comparable across instances.
+#      For example: guards field of type List[ShapeGuard]. More specifically,
+#      the ShapeGuard type holds an expression and stack information
+#      for debugging purposes. When replaying the event on a new ShapeEnv
+#      instance, the stack would be different, which would trigger this error.
+#      Solution: add a special case to the map_value function inside
+#      the check_equal method.
+#
+#   3. Mutation of ShapeEnv in some unrecorded function.
+#      If a mutation of the state of ShapeEnv happens inside a function
+#      that is not recorded (and no caller in the stack is recorded),
+#      then the replayed ShapeEnv won't catch that.
+#      Solution: decorate the function with record_shapeenv_event.
+
+
+# Checks whether the state of two ShapeEnv are equal w.r.t. the guards
+# returned by ShapeEnv.produce_guards.
+def shape_env_check_state_equal(env1, env2, non_state_variable_names, map_value):
+    # Collect and remove variables that don't necessarily represent the state
+    # of a ShapeEnv. Note: we copy the dictionary so that we don't modify the
+    # instance itself.
+    env1_vars = vars(env1).copy()
+    env2_vars = vars(env2).copy()
+
+    for v in non_state_variable_names:
+        if v in env1_vars:
+            env1_vars.pop(v)
+        if v in env2_vars:
+            env2_vars.pop(v)
+
+    # Function for transforming the mismatched values into string.
+    # Needed, since the order of dict and set entries might not be the same every time.
+    def value_to_str(value: Any) -> str:
+        if isinstance(value, dict):
+            return (
+                "{"
+                + ", ".join(f"{k}: {value[k]}" for k in sorted(value.keys(), key=str))
+                + "}"
+            )
+        if isinstance(value, set):
+            return "{" + ", ".join(f"{v}" for v in sorted(value)) + "}"
+        return str(value)
+
+    # Compares env1_vars with env2_vars.
+    # Here, we allow the value of each field to be mapped, so that we appropriately
+    # compare the two values.
+    def compare_vars(
+        map_value: Callable[[str, Any], Any]
+    ) -> List[Tuple[str, str, str]]:
+        env1_set, env2_set = set(env1_vars), set(env2_vars)
+
+        # First, compare the set of keys in each vars dictionary.
+        if env1_set != env2_set:
+            raise NotEqualError(
+                "field set mismatch:",
+                [
+                    (
+                        "found unique fields:",
+                        str(sorted(env1_set - env2_set)),
+                        str(sorted(env2_set - env1_set)),
+                    ),
+                ],
+            )
+
+        # Then, sort the keys, and compare the mapped values of each key.
+        sorted_keys = list(env1_set)
+        sorted_keys.sort()
+
+        mapped_dict = [
+            (k, map_value(k, env1_vars[k]), map_value(k, env2_vars[k]))
+            for k in sorted_keys
+        ]
+
+        # Return a list of tuples representing the fields that did not match
+        # alongside their respective mapped values.
+        return [
+            (f"{k}: values don't match.", value_to_str(val1), value_to_str(val2))
+            for k, val1, val2 in mapped_dict
+            if val1 != val2
+        ]
+
+    # Accumulate the mismatching fields.
+    errors = compare_vars(map_value)
+
+    if len(errors) > 0:
+        raise NotEqualError("field values don't match:", errors)
+
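+# Illustrative sketch (editor's note, not part of the upstream file): the caller
+# supplies the names of non-state fields to ignore and a per-field mapping
+# function, for example
+#
+#     shape_env_check_state_equal(
+#         env1, env2,
+#         non_state_variable_names=("log", "var_to_stack"),
+#         map_value=lambda field, value: value,
+#     )
+#
+# which raises NotEqualError if any field values do not match.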
+
+class NotEqualError(Exception):
+    def __init__(
+        self,
+        msg: str,
+        mismatched: List[Tuple[str, str, str]],
+    ) -> None:
+        details = "\n".join(
+            [
+                "\n".join(
+                    [
+                        f"==> {inner_msg}",
+                        f"  >  Left: {str1}",
+                        f"  > Right: {str2}",
+                    ]
+                )
+                for inner_msg, str1, str2 in mismatched
+            ]
+        )
+
+        super().__init__(
+            f"""\
+ShapeEnv not equal: {msg}
+
+{details}
+"""
+        )
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/refinement_types.py b/MLPY/Lib/site-packages/torch/fx/experimental/refinement_types.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ea1b75a7221a08b4abc8e0b2d421dc92e44867c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/refinement_types.py
@@ -0,0 +1,16 @@
+class Equality:
+    def __init__(self, lhs, rhs):
+        self.lhs = lhs
+        self.rhs = rhs
+
+    def __str__(self):
+        return f'{self.lhs} = {self.rhs}'
+
+    def __repr__(self):
+        return f'{self.lhs} = {self.rhs}'
+
+    def __eq__(self, other):
+        if isinstance(other, Equality):
+            return self.lhs == other.lhs and self.rhs == other.rhs
+        else:
+            return False
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/rewriter.py b/MLPY/Lib/site-packages/torch/fx/experimental/rewriter.py
new file mode 100644
index 0000000000000000000000000000000000000000..7472c9cde89d2eb873bcc2bab4c18a9be0fee216
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/rewriter.py
@@ -0,0 +1,121 @@
+import ast
+import inspect
+import textwrap
+import copy
+import functools
+from types import FunctionType
+from typing import cast, Union, Callable, Dict, Optional, Any
+from torch.fx._symbolic_trace import Tracer
+from torch.fx.graph import Graph
+from torch._sources import normalize_source_lines
+import torch
+
+class AST_Rewriter(ast.NodeTransformer):
+    """
+    Take a FunctionType object representing a `forward` method, then
+    perform an AST rewrite to swap out nodes that are not symbolically
+    traceable with a callsite to the FX alternative.
+
+    To support swapping out an AST node, define a new `visit` method on
+    that node. For more details, see:
+    https://docs.python.org/3/library/ast.html#ast.NodeTransformer
+    """
+
+    def rewrite(self, fn: FunctionType):
+
+        # Normalize the source lines
+        sourcelines, _ = inspect.getsourcelines(fn)
+        sourcelines = normalize_source_lines(sourcelines)
+        source = ''.join(sourcelines)
+        normalized_str = textwrap.dedent(source)
+
+        # Rewrite the original AST
+        source_ast = ast.parse(normalized_str)
+        dest_ast = ast.fix_missing_locations(self.visit(source_ast))
+
+        # Pull out the compiled function from the newly-created Module
+        code = compile(dest_ast, "", "exec")
+        globals_dict = copy.copy(fn.__globals__)
+        keys_before = set(globals_dict.keys())
+        exec(code, globals_dict)
+        new_keys = list(set(globals_dict.keys()) - keys_before)
+        assert len(new_keys) == 1
+        fn_compiled = globals_dict[new_keys[0]]
+
+        # return the compiled function with the original globals
+        def change_func_globals(f, globals):
+            """Based on https://stackoverflow.com/a/13503277/2988730 (@unutbu)"""
+            # __globals__ is a private member of the function class
+            # so we have to copy the function f and all of its members, except f.__globals__
+            g = FunctionType(
+                f.__code__,
+                globals,
+                name=f.__name__,
+                argdefs=f.__defaults__,
+                closure=f.__closure__,
+            )
+            g = functools.update_wrapper(g, f)
+            g.__kwdefaults__ = copy.copy(f.__kwdefaults__)
+            return g
+        # Return the correct FunctionType object
+        return change_func_globals(fn_compiled, globals=fn.__globals__)
+
+    def visit_Assert(self, node):
+        """
+        Swap out the Assert node (Python's `assert`) with a callsite to the
+        symbolically-traceable torch._assert function
+        """
+        # Create the Call node
+        n = ast.parse('torch._assert()', mode='eval')
+        assert isinstance(n, ast.Expression)
+        call_node = n.body
+        assert isinstance(call_node, ast.Call)
+        msg = node.msg if node.msg else ast.Constant(value="", kind=None)
+        call_node.args = [node.test, msg]
+
+        # Ensure that the new node conforms to the Python AST grammar
+        expr_wrapper = ast.Expr(value=call_node)
+
+        # Return the new Call node to signify that we want to use it as
+        # a replacement for the original _assert node
+        return ast.copy_location(expr_wrapper, node)
+
+    def visit_AnnAssign(self, node):
+        """
+        Swap out Python's AnnAssign with an Assign node where the annotation function is called.
+        Example:
+            Original:
+                y: Tensor_Type(1,2,3, Dyn) = f2(x)
+            Output:
+                y = annotate(f2(x),Tensor_Type((1,2,3,Dyn)))
+        """
+        return ast.Assign(targets=[node.target], value=ast.Call(
+            func=ast.Name(id='annotate', ctx=ast.Load()),
+            args=[node.value, node.annotation], keywords=[]))
+
+
+class RewritingTracer(Tracer):
+    def trace(self, root: Union[torch.nn.Module, Callable], concrete_args: Optional[Dict[str, Any]] = None) -> Graph:
+        return super().trace(_rewrite(root), concrete_args)
+
+
+def _rewrite(fn: Union[torch.nn.Module, Callable]) -> Union[torch.nn.Module, Callable]:
+    if isinstance(fn, torch.nn.Module):
+        # Rewrite this module's `forward` as well as the `forward`s of
+        # all of this module's recursive descendants. Return the new,
+        # rewritten module hierarchy.
+        def rewrite_module(m : torch.nn.Module):
+            class RewrittenModule(torch.nn.Module):
+                def __init__(self, orig):
+                    super().__init__()
+                    for k, v in orig.__dict__.items():
+                        if isinstance(v, torch.nn.Module):
+                            self.__dict__[k] = copy.copy(rewrite_module(v))
+                        else:
+                            self.__dict__[k] = copy.copy(v)
+            RewrittenModule.forward = AST_Rewriter().rewrite(cast(FunctionType, m.forward))
+            return RewrittenModule(m)
+        return rewrite_module(fn)
+    else:
+        # Rewrite this single free function
+        return AST_Rewriter().rewrite(cast(FunctionType, fn))
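+
+
+# Illustrative usage (a sketch; `MyModule` is a placeholder, not a real class):
+#
+#     graph = RewritingTracer().trace(MyModule())
+#     gm = torch.fx.GraphModule(MyModule(), graph)
+#
+# This traces `MyModule.forward` with plain `assert` statements rewritten into
+# `torch._assert` calls so that they are recorded in the FX graph.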
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/schema_type_annotation.py b/MLPY/Lib/site-packages/torch/fx/experimental/schema_type_annotation.py
new file mode 100644
index 0000000000000000000000000000000000000000..fdb979715acb9de42b586c926080ddc812df403d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/schema_type_annotation.py
@@ -0,0 +1,111 @@
+import torch
+import torch.fx
+import inspect
+from typing import Any, Dict, Optional, Tuple
+from torch.fx.node import Argument, Target
+from torch._jit_internal import boolean_dispatched
+from torch.fx.operator_schemas import _torchscript_type_to_python_type
+
+from torch.fx import Transformer
+
+class AnnotateTypesWithSchema(Transformer):
+    """
+    Use Python function signatures to annotate types for `Nodes` within an FX graph.
+    This pulls out Python function signatures for:
+
+        1. Standard `torch.nn` Module calls
+        2. `torch.nn.functional` calls
+        3. Attribute fetches via `get_attr`
+
+    Example usage:
+
+        m = torchvision.models.resnet18()
+
+        traced = torch.fx.symbolic_trace(m)
+
+        traced = AnnotateTypesWithSchema(traced).transform()
+
+    """
+    def __init__(self, module : torch.nn.Module, annotate_functionals : bool = True,
+                 annotate_modules : bool = True, annotate_get_attrs : bool = True):
+        super().__init__(module)
+        self.annotate_functionals = annotate_functionals
+        self.annotate_modules = annotate_modules
+        self.annotate_get_attrs = annotate_get_attrs
+
+    def call_function(self, target : Target, args : Tuple[Argument, ...], kwargs : Dict[str, Any]):
+        python_ret_type = None
+        if self.annotate_functionals and target.__module__ == 'torch.nn.functional':
+            target_for_analysis = target
+            if target in boolean_dispatched:
+                # HACK: `boolean_dispatch` as used in `torch.nn.functional` makes it so that we have
+                # a 2-way dispatch based on a boolean value. Here we check that the `true` and `false`
+                # branches of the dispatch have exactly the same signature. If they do, use the `true`
+                # branch signature for analysis. Otherwise, leave this un-normalized
+                assert not isinstance(target, str)
+                dispatched = boolean_dispatched[target]
+                if_true, if_false = dispatched['if_true'], dispatched['if_false']
+                # TODO: can we emit the union of these? What are the implications on TorchScript
+                # compilation?
+                if inspect.signature(if_true).return_annotation != inspect.signature(if_false).return_annotation:
+                    return super().call_function(target, args, kwargs)
+                target_for_analysis = if_true
+
+            python_ret_type = self._extract_python_return_type(target_for_analysis)
+
+        return_proxy = super().call_function(target, args, kwargs)
+        return_proxy.node.type = return_proxy.node.type if return_proxy.node.type else python_ret_type
+        return return_proxy
+
+    def call_module(self, target : Target, args : Tuple[Argument, ...], kwargs : Dict[str, Any]):
+        python_ret_type = None
+        assert isinstance(target, str)
+        submod = self.fetch_attr(target)
+        if self.annotate_modules and hasattr(submod.__class__, '__name__'):
+            classname = submod.__class__.__name__
+            if getattr(torch.nn, classname, None) == submod.__class__:
+                python_ret_type = self._extract_python_return_type(submod.forward)
+        return_proxy = super().call_module(target, args, kwargs)
+        return_proxy.node.type = return_proxy.node.type if return_proxy.node.type else python_ret_type
+        return return_proxy
+
+    def get_attr(self, target : torch.fx.node.Target, args : Tuple[Argument, ...], kwargs : Dict[str, Any]):
+        attr_proxy = super().get_attr(target, args, kwargs)
+
+        if self.annotate_get_attrs:
+            module_itr = self.module
+            assert isinstance(target, str)
+            atoms = target.split('.')
+            for i, atom in enumerate(atoms):
+                if not hasattr(module_itr, atom):
+                    raise RuntimeError(f'Node referenced nonexistent target {".".join(atoms[:i])}!')
+                module_itr = getattr(module_itr, atom)
+
+            maybe_inferred_ts_type = torch._C._jit_try_infer_type(module_itr)
+            if maybe_inferred_ts_type.success():
+                python_type = _torchscript_type_to_python_type(maybe_inferred_ts_type.type())
+                attr_proxy.node.type = python_type if not attr_proxy.node.type else attr_proxy.node.type
+
+        return attr_proxy
+
+    def _extract_python_return_type(self, target : Target) -> Optional[Any]:
+        """
+        Given a Python call target, try to extract the Python return annotation
+        if it is available, otherwise return None
+
+        Args:
+
+            target (Callable): Python callable to get return annotation for
+
+        Returns:
+
+            Optional[Any]: Return annotation from the `target`, or None if it was
+                not available.
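+
+        Example (illustrative sketch; ``f`` and ``g`` are placeholders)::
+
+            def f(x) -> torch.Tensor: ...
+            def g(x): ...
+
+            self._extract_python_return_type(f)  # torch.Tensor
+            self._extract_python_return_type(g)  # None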
+        """
+        assert callable(target)
+        try:
+            sig = inspect.signature(target)
+        except (ValueError, TypeError):
+            return None
+
+        return sig.return_annotation if sig.return_annotation is not inspect.Signature.empty else None
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/sym_node.py b/MLPY/Lib/site-packages/torch/fx/experimental/sym_node.py
new file mode 100644
index 0000000000000000000000000000000000000000..e558dd22dce50d3bf11b60a589fe18dab3ca3bc6
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/sym_node.py
@@ -0,0 +1,1330 @@
+"""
+This file does three things:
+- Contains the definition of SymNode
+- Installs all the magic methods into SymBool, SymFloat, SymInt at import time
+- Does not depend on sympy at import time
+
+As this file is imported from within torch/__init__.py we do not want it to depend on SymPy
+to avoid having to load SymPy at import time, as doing so is *very* slow.
+"""
+
+import builtins
+import itertools
+import logging
+import math
+import operator
+import sys
+from functools import lru_cache, update_wrapper
+from typing import Optional, Type, TYPE_CHECKING, Union
+
+import torch
+
+# NB: The sym_* functions are used via getattr() and must be imported here.
+from torch import (  # noqa: F401
+    sym_float,
+    sym_ite,
+    sym_max,
+    sym_min,
+    sym_not,
+    SymBool,
+    SymFloat,
+    SymInt,
+)
+
+from torch.fx.experimental._sym_dispatch_mode import (
+    handle_sym_dispatch,
+    sym_function_mode,
+)
+
+if TYPE_CHECKING:
+    from torch.fx.experimental.symbolic_shapes import ShapeEnv
+
+log = logging.getLogger(__name__)
+sym_node_log = torch._logging.getArtifactLogger(__name__, "sym_node")
+
+
+__all__ = ["SymNode", "method_to_operator", "magic_methods"]
+
+
+SymTypes = (SymInt, SymFloat, SymBool)
+
+
+def _to_symtype(t):
+    if t is bool:
+        return SymBool
+    if t is int:
+        return SymInt
+    if t is float:
+        return SymFloat
+    return t
+
+
+# TODO: An incomplete list
+# 1. Set variables to be equal when we do equality
+# 2. Specialize on 0/1 when we do subtraction
+class SymNode:
+    """
+    This is a type erased SymInt/SymFloat which we use to do actual operations.
+    End users don't touch this.  Magic methods are NOT defined on this object.
+    """
+
+    def __init__(
+        self,
+        expr,
+        shape_env,
+        pytype,
+        hint: Optional[Union[int, float, bool]],
+        constant=None,
+        fx_node=None,
+    ):
+        self._expr = expr
+        self.shape_env = shape_env
+        self.pytype = pytype
+        # What's the difference between hint and constant?
+        #
+        # - A constant is known to be invariant across invocations of the model;
+        #   it will always be this value.  We only really know this when we
+        #   encounter an honest-to-goodness literal (when wrapping it into
+        #   a SymNode, we set constant.)  Most of the time, constant is None
+        #
+        # - A hint is a *particular* value from the particular run we are
+        #   tracing, but it may vary the next time around.  It's useful to
+        #   keep this around, as if we need a concrete value from a SymNode,
+        #   we will return the hint and guard on the expression that produced
+        #   it giving the same hint next time around.  The hint is not
+        #   guaranteed to be set either: if you have an unbacked SymNode,
+        #   there won't be any hint; it was the result of some tensor-dependent
+        #   computation, but we don't know what it actually is because we
+        #   haven't actually run the tensor computation.
+        #
+        # If _hint is None, we will query maybe_evaluate_static(compute_hint=True)
+        # in hopes that we've learned enough about the unbacked symints to
+        # discharge the hint; otherwise, you're likely to just error out.
+        #
+        # (A previous version of this system had some optimizations to only
+        # recompute when it was possible we had learned enough about the
+        # unbacked symint that a hint was now possible, but as we added more
+        # potential refinements to unbacked symints this got harder to keep
+        # in sync, so we've deleted it for now.)
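+        #
+        # Illustrative example (values are made up): tracing `x.size(0)` for a
+        # dynamic dimension on a tensor that happened to have 8 rows gives a
+        # SymNode with hint == 8 and constant None, whereas wrapping the
+        # literal 2 via `wrap_int(2)` gives a SymNode whose hint and constant
+        # are both 2.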
+        if hint is not None:
+            assert type(hint) is pytype or type(hint) is _to_symtype(pytype), (
+                "Cannot create SymNode of type "
+                f"{pytype} with incompatible hint of type {type(hint)}"
+            )
+        self._hint = hint
+        self.constant: Optional[Union[int, float, bool]] = constant
+
+        # Record the FX node of the current node if we are doing translation
+        # validation. They will be used for building the input assertions for
+        # the translation validation problem.
+        self.fx_node = (
+            fx_node if self.shape_env._translation_validation_enabled else None
+        )
+
+    def with_shape_env(self, shape_env: "ShapeEnv") -> "SymNode":
+        return SymNode(
+            self._expr, shape_env, self.pytype, self._hint, self.constant, self.fx_node
+        )
+
+    @property
+    def expr(self):
+        return self.shape_env.replace(self._expr)
+
+    # Recompute the hint and see if we've got it now
+    # Precondition: self._hint is None
+    def _update_hint(self):
+        r = self.shape_env._maybe_evaluate_static(self.expr, compute_hint=True)
+        if r is not None:
+            self._hint = self.pytype(r) if not isinstance(r, SymTypes) else r
+
+    @property
+    def hint(self):
+        if self._hint is None:
+            self._update_hint()
+        return self._hint
+
+    def has_hint(self):
+        if self._hint is None:
+            self._update_hint()
+        return self._hint is not None
+
+    def require_hint(self, fallback=None):
+        if self._hint is None:
+            self._update_hint()
+        if self._hint is None:
+            if fallback is not None:
+                return fallback
+            # NB: we expect this to raise
+            return self.shape_env.size_hint(self.expr)
+        return self._hint
+
+    def maybe_as_int(self):
+        if self.expr.is_number:
+            return int(self.expr)
+        else:
+            return None
+
+    def is_int(self):
+        return self.pytype is int
+
+    def is_float(self):
+        return self.pytype is float
+
+    def is_bool(self):
+        return self.pytype is bool
+
+    def is_nested_int(self):
+        # Unbacked SymInts cannot be nested int today
+        return (
+            self._hint is not None
+            and isinstance(self._hint, SymInt)
+            and self._hint.node.is_nested_int()
+        )
+
+    def wrap_int(self, num):
+        assert type(num) is int
+        import sympy
+
+        return SymNode(
+            sympy.Integer(num), self.shape_env, int, num, constant=num, fx_node=num
+        )
+
+    def wrap_float(self, num):
+        assert type(num) is float
+        import sympy
+
+        return SymNode(
+            sympy.Float(num), self.shape_env, float, num, constant=num, fx_node=num
+        )
+
+    def wrap_bool(self, num):
+        assert type(num) is bool
+        import sympy
+
+        return SymNode(
+            sympy.true if num else sympy.false,
+            self.shape_env,
+            bool,
+            num,
+            constant=num,
+            fx_node=num,
+        )
+
+    def clone(self):
+        return self
+
+    def str(self):
+        return f"{self.expr}"
+
+    def __str__(self):
+        return self.str()
+
+    def __repr__(self):
+        return self.str()
+
+    # These methods call the metaprogrammed methods, they're hand written
+    # here so we get good stack traces
+    def abs(self) -> "SymNode":
+        return self._abs()  # type: ignore[attr-defined]
+
+    def pos(self) -> "SymNode":
+        return self._pos()  # type: ignore[attr-defined]
+
+    def round(self, ndigits=None) -> "SymNode":
+        return self._round(ndigits)  # type: ignore[attr-defined]
+
+    def add(self, other) -> "SymNode":
+        return self._add(other)  # type: ignore[attr-defined]
+
+    def sub(self, other) -> "SymNode":
+        return self._sub(other)  # type: ignore[attr-defined]
+
+    def mul(self, other) -> "SymNode":
+        return self._mul(other)  # type: ignore[attr-defined]
+
+    def mod(self, other) -> "SymNode":
+        return self._mod(other)  # type: ignore[attr-defined]
+
+    def pow(self, other) -> "SymNode":
+        return self._pow(other)  # type: ignore[attr-defined]
+
+    def and_(self, other) -> "SymNode":
+        return self._and_(other)  # type: ignore[attr-defined]
+
+    def or_(self, other) -> "SymNode":
+        return self._or_(other)  # type: ignore[attr-defined]
+
+    def truediv(self, other) -> "SymNode":
+        return self._truediv(other)  # type: ignore[attr-defined]
+
+    def floordiv(self, other) -> "SymNode":
+        return self._floordiv(other)  # type: ignore[attr-defined]
+
+    def lshift(self, other) -> "SymNode":
+        return self._lshift(other)  # type: ignore[attr-defined]
+
+    def rshift(self, other) -> "SymNode":
+        return self._rshift(other)  # type: ignore[attr-defined]
+
+    def sym_not(self) -> "SymNode":  # noqa: F811
+        return self._sym_not()  # type: ignore[attr-defined]
+
+    def eq(self, other) -> "SymNode":
+        return self._eq(other)  # type: ignore[attr-defined]
+
+    def ne(self, other) -> "SymNode":
+        return self._ne(other)  # type: ignore[attr-defined]
+
+    def gt(self, other) -> "SymNode":
+        return self._gt(other)  # type: ignore[attr-defined]
+
+    def lt(self, other) -> "SymNode":
+        return self._lt(other)  # type: ignore[attr-defined]
+
+    def le(self, other) -> "SymNode":
+        return self._le(other)  # type: ignore[attr-defined]
+
+    def ge(self, other) -> "SymNode":
+        return self._ge(other)  # type: ignore[attr-defined]
+
+    def floor(self) -> "SymNode":
+        return self._floor()  # type: ignore[attr-defined]
+
+    def is_integer(self) -> "SymNode":
+        return self._is_integer()  # type: ignore[attr-defined]
+
+    def sym_float(self) -> "SymNode":  # noqa: F811
+        return self._sym_float()  # type: ignore[attr-defined]
+
+    def sym_int(self) -> "SymNode":
+        return self._sym_int()  # type: ignore[attr-defined]
+
+    def ceil(self) -> "SymNode":
+        return self._ceil()  # type: ignore[attr-defined]
+
+    def neg(self) -> "SymNode":
+        return self._neg()  # type: ignore[attr-defined]
+
+    def sym_min(self, other) -> "SymNode":  # noqa: F811
+        return self._sym_min(other)  # type: ignore[attr-defined]
+
+    def sym_max(self, other) -> "SymNode":  # noqa: F811
+        return self._sym_max(other)  # type: ignore[attr-defined]
+
+    def sym_ite(self, then_val, else_val) -> "SymNode":
+        return self._sym_ite(then_val, else_val)  # type: ignore[attr-defined]
+
+    def is_contiguous(self, sizes, strides) -> "SymNode":
+        return self._is_contiguous(sizes, strides)  # type: ignore[attr-defined]
+
+    def is_channels_last_contiguous_2d(self, sizes, strides) -> "SymNode":
+        return self._is_channels_last_contiguous_2d(sizes, strides)  # type: ignore[attr-defined]
+
+    def is_channels_last_contiguous_3d(self, sizes, strides) -> "SymNode":
+        return self._is_channels_last_contiguous_3d(sizes, strides)  # type: ignore[attr-defined]
+
+    def is_channels_last_strides_2d(self, sizes, strides) -> "SymNode":
+        return self._is_channels_last_strides_2d(sizes, strides)  # type: ignore[attr-defined]
+
+    def is_channels_last_strides_3d(self, sizes, strides) -> "SymNode":
+        return self._is_channels_last_strides_3d(sizes, strides)  # type: ignore[attr-defined]
+
+    def is_non_overlapping_and_dense_indicator(self, sizes, strides) -> "SymNode":
+        return self._is_non_overlapping_and_dense_indicator(sizes, strides)  # type: ignore[attr-defined]
+
+    # Make C++ happy
+    def sym_or(self, other):
+        return self.or_(other)
+
+    def sym_and(self, other):
+        return self.and_(other)
+
+    def is_non_overlapping_and_dense(self, sizes, strides):
+        return self.is_non_overlapping_and_dense_indicator(sizes, strides).eq(to_node(self, 1))  # type: ignore[attr-defined]
+
+    def int_(self):
+        return self.guard_int("", 0)  # NB: uses Python backtrace
+
+    # You can manually trigger a guard with this function
+    def guard_int(self, file, line):
+        # TODO: use the file/line for some useful diagnostic on why a
+        # guard occurred
+        r = self.shape_env.evaluate_expr(self.expr, self.hint, fx_node=self.fx_node)
+        try:
+            return int(r)
+        except Exception:
+            log.warning("Failed to convert to int: %s", r)
+            raise
+
+    def guard_float(self, file, line):
+        # TODO: use the file/line for some useful diagnostic on why a
+        # guard occurred
+        r = self.shape_env.evaluate_expr(
+            self.expr, self.hint, fx_node=self.fx_node, expect_rational=False
+        )
+        try:
+            return float(r)
+        except Exception:
+            log.warning("Failed to convert to float: %s", r)
+            raise
+
+    def guard_bool(self, file, line):
+        # TODO: use the file/line for some useful diagnostic on why a
+        # guard occurred
+        r = self.shape_env.evaluate_expr(self.expr, self.hint, fx_node=self.fx_node)
+        try:
+            return bool(r)
+        except Exception:
+            log.warning("Failed to convert to bool: %s", r)
+            raise
+
+    def expect_true(self, file, line):
+        from torch.fx.experimental.symbolic_shapes import free_unbacked_symbols
+
+        if self.has_hint() and not free_unbacked_symbols(self.expr):
+            # OK to generate guards
+            return self.guard_bool(file, line)
+        # Generate a deferred runtime assert (this might actually end up doing
+        # a regular guard if we can!)
+        # TODO: file/line here is very important, because the assert has been
+        # deferred so you can't backtrace easily
+        return self.shape_env.defer_runtime_assert(
+            self.expr, f"{file}:{line}", fx_node=self.fx_node
+        )
+
+    def expect_size(self, file, line):
+        from torch.fx.experimental.symbolic_shapes import _advise_is_size
+
+        b = self.ge(self.wrap_int(0))
+        # Generate a deferred runtime assert
+        r = b.expect_true(file, line)
+        # Refine compile time range, but only if it's unbacked.
+        # If you refine range for hinted variables, you can end up making
+        # improper deductions since compile time reasoning may be
+        # incompatible with runtime reasoning.
+        if r and not self.has_hint():
+            _advise_is_size(SymInt(self))
+        return r
+
+    def guard_size_oblivious(self, file, line):
+        """
+        Like guard_bool, but if we encounter unbacked symbols, if those symbols
+        are size-like, we will treat them as >= 2 for the purposes of the analysis.
+
+        This CHANGES the runtime semantics, but all size-oblivious sites have been
+        audited to ensure that the runtime semantics don't change in a material way.
+        Acceptable runtime semantic changes are, e.g., squeeze() no longer dropping
+        an unbacked size-one dimension, or a tensor reporting as non-contiguous when
+        it would only have been reported contiguous by virtue of being empty.
+        """
+        # TODO: use the file/line for some useful diagnostic on why a
+        # guard occurred
+        r = self.shape_env.evaluate_expr(
+            self.expr, self.hint, fx_node=self.fx_node, size_oblivious=True
+        )
+        try:
+            return bool(r)
+        except Exception:
+            log.warning("Failed to convert to bool: %s", r)
+            raise
+
+    def bool_(self):
+        return self.guard_bool("", 0)
+
+    def is_symbolic(self):
+        return True
+
+    def nested_int(self):
+        return None
+
+    def is_constant(self):
+        return False
+
+
+# TODO: this probably needs the sizes-strides eval functions
+METHOD_TO_OPERATOR = {
+    "pos": operator.pos,
+    "abs": operator.abs,
+    "add": operator.add,
+    "and": operator.and_,
+    "ceil": math.ceil,
+    "eq": operator.eq,
+    "floor": math.floor,
+    "floordiv": operator.floordiv,
+    "ge": operator.ge,
+    "gt": operator.gt,
+    "is_integer": lambda x: x.is_integer(),
+    "le": operator.le,
+    "lshift": operator.lshift,
+    "lt": operator.lt,
+    "mod": operator.mod,
+    "mul": operator.mul,
+    "ne": operator.ne,
+    "neg": operator.neg,
+    "or": operator.or_,
+    "pow": operator.pow,
+    "round": builtins.round,
+    "rshift": operator.rshift,
+    "sub": operator.sub,
+    "sym_float": sym_float,
+    "sym_ite": sym_ite,
+    "sym_max": sym_max,
+    "sym_min": sym_min,
+    "sym_not": sym_not,
+    "truediv": operator.truediv,
+}
+
+unary_magic_methods = {
+    "abs",
+    "sym_float",
+    "ceil",
+    "floor",
+    "neg",
+    "sym_not",
+    "pos",
+}
+
+
+# Adding math ops: sqrt, cos, sin, ...
+def _get_sym_node_fn(name):
+    def fn(self):
+        return getattr(self, f"_sym_{name}")()
+
+    return fn
+
+
+math_op_names = (
+    "sqrt",
+    "cos",
+    "cosh",
+    "sin",
+    "sinh",
+    "tan",
+    "tanh",
+    "asin",
+    "acos",
+    "atan",
+)
+for name in math_op_names:
+    sym_name = f"sym_{name}"
+    priv_sym_name = f"_{sym_name}"
+    setattr(SymNode, sym_name, _get_sym_node_fn(name))
+    METHOD_TO_OPERATOR[sym_name] = getattr(torch, priv_sym_name)
+    unary_magic_methods.add(sym_name)
+    __all__.append(sym_name)
+
+
+# Unary methods that are not magic methods
+unary_nonmagic_methods = {
+    "is_integer",
+}
+
+unary_methods = unary_magic_methods | unary_nonmagic_methods
+
+# Most methods are only registered on SymInt and SymFloat
+# Some methods are only registered on SymBool
+only_bool_magic_methods = {"and", "or", "sym_not", "sym_ite"}
+# Methods that implicitly convert SymBool into SymInt
+bool_becomes_int_magic_methods = {"add", "sub", "mul"}
+# Methods that are also on SymBool, in addition to on SymInt and SymFloat
+also_bool_magic_methods = {"eq"}
+bool_magic_methods = only_bool_magic_methods | also_bool_magic_methods
+
+# Methods that are only for float
+only_float_magic_methods = {"is_integer"}
+
+
+magic_methods_on_operator_with_trailing_underscore = {"and", "or"}
+
+
+always_float_magic_methods = {"truediv", "sym_float", "pow"}
+
+for name in math_op_names:
+    sym_name = f"sym_{name}"
+    always_float_magic_methods.add(sym_name)
+
+
+always_int_magic_methods = {"ceil", "floor"}
+always_bool_magic_methods = {
+    "eq",
+    "ne",
+    "gt",
+    "lt",
+    "le",
+    "ge",
+    "and",
+    "or",
+    "sym_not",
+    "is_non_overlapping_and_dense",
+    "is_integer",
+}
+
+# Methods that have a `__foo__` as well as `__rfoo__`
+
+
+def _sympy_truediv(a, b):
+    from torch.utils._sympy.functions import TrueDiv
+
+    return TrueDiv(a, b)
+
+
+def _sympy_floordiv(a, b):
+    from torch.utils._sympy.functions import FloorDiv
+
+    return FloorDiv(a, b)
+
+
+def _sympy_mod(a, b):
+    from torch.utils._sympy.functions import Mod
+
+    return Mod(a, b)
+
+
+def _sympy_pow(a, b):
+    from torch.utils._sympy.functions import Pow
+
+    return Pow(a, b)
+
+
+def _sympy_and(a, b):
+    import sympy
+
+    return sympy.And(a, b)
+
+
+def _sympy_or(a, b):
+    import sympy
+
+    return sympy.Or(a, b)
+
+
+def _sympy_lshift(a, b):
+    from torch.utils._sympy.functions import LShift
+
+    return LShift(a, b)
+
+
+def _sympy_rshift(a, b):
+    from torch.utils._sympy.functions import RShift
+
+    return RShift(a, b)
+
+
+reflectable_magic_methods = {
+    "add": operator.add,
+    "sub": operator.sub,
+    "mul": operator.mul,
+    "mod": _sympy_mod,
+    "pow": _sympy_pow,
+    "and": _sympy_and,
+    "or": _sympy_or,
+    "truediv": _sympy_truediv,
+    "floordiv": _sympy_floordiv,
+    "lshift": _sympy_lshift,
+    "rshift": _sympy_rshift,
+}
+
+
+def _floor_ceil_helper(a, fn):
+    import sympy
+
+    if isinstance(a, sympy.Mul):
+        aa = a.args
+        if len(aa) == 2 and isinstance(aa[0], sympy.Float) and aa[1].is_integer:
+            coef = sympy.Integer(aa[0])
+            if aa[0] == coef:  # structural equality test
+                return coef * aa[1]
+    if (
+        isinstance(a, sympy.Float)
+        and a == sympy.Integer(a)
+        or isinstance(a, sympy.Integer)
+    ):
+        return sympy.Integer(a)
+    return fn(a)
+
+
+def _sympy_floor(a):
+    import sympy
+
+    return _floor_ceil_helper(a, sympy.floor)
+
+
+def _sympy_ceil(a):
+    import sympy
+
+    return _floor_ceil_helper(a, sympy.ceiling)
+
+
+def _sympy_eq(a, b):
+    import sympy
+
+    return sympy.Eq(a, b)
+
+
+def _sympy_ne(a, b):
+    import sympy
+
+    return sympy.Ne(a, b)
+
+
+def _sympy_gt(a, b):
+    import sympy
+
+    return sympy.Gt(a, b)
+
+
+def _sympy_lt(a, b):
+    import sympy
+
+    return sympy.Lt(a, b)
+
+
+def _sympy_le(a, b):
+    import sympy
+
+    return sympy.Le(a, b)
+
+
+def _sympy_ge(a, b):
+    import sympy
+
+    return sympy.Ge(a, b)
+
+
+def _sympy_min(a, b):
+    import sympy
+
+    return sympy.Min(a, b)
+
+
+def _sympy_max(a, b):
+    import sympy
+
+    return sympy.Max(a, b)
+
+
+def _sympy_ite(a, t, f):
+    import sympy
+
+    return sympy.Piecewise((t, a), (f, True))
+
+
+current_module = sys.modules[__name__]
+
+
+def _get_sym_math_fn(name):
+    def fn(a):
+        import sympy
+
+        return getattr(sympy, name)(a)
+
+    return fn
+
+
+for name in math_op_names:
+    priv_sympy_name = f"_sympy_{name}"
+    fn = _get_sym_math_fn(name)
+    fn.__qualname__ = fn.__name__ = priv_sympy_name
+    setattr(current_module, priv_sympy_name, fn)
+
+del fn, name, priv_sympy_name  # type: ignore[possibly-undefined]
+
+
+def _sympy_abs(a):
+    import sympy
+
+    return sympy.Abs(a)
+
+
+def _sympy_round(number, ndigits=None):
+    from torch.utils._sympy.functions import Round, RoundDecimal
+
+    if ndigits is None:
+        return Round(number)
+    else:
+        return RoundDecimal(number, ndigits)
+
+
+def _sympy_sym_float(a):
+    # Cannot use sympy.Float(a) here, because it expects python literals.
+    # Multiply by 1.0 to cast to float. This is needed when the input
+    # is a SymInt, which carries the assumption that it is an integer, and
+    # SymPy would otherwise assume that the return value cannot be a float.
+    return a * 1.0
+
+
+def _sympy_is_integer(a):
+    import sympy
+
+    return sympy.Eq(sympy.floor(a), a)
+
+
+magic_methods = {
+    **reflectable_magic_methods,
+    "sym_not": operator.invert,
+    "pos": operator.pos,
+    "eq": _sympy_eq,
+    "ne": _sympy_ne,
+    "gt": _sympy_gt,
+    "lt": _sympy_lt,
+    "le": _sympy_le,
+    "ge": _sympy_ge,
+    "floor": _sympy_floor,
+    "sym_float": _sympy_sym_float,
+    "ceil": _sympy_ceil,
+    "neg": operator.neg,
+    "sym_min": _sympy_min,
+    "sym_max": _sympy_max,
+    "sym_ite": _sympy_ite,
+    "abs": _sympy_abs,
+    "round": _sympy_round,
+    "is_integer": _sympy_is_integer,
+}
+
+
+for name in math_op_names:
+    sym_name = f"sym_{name}"
+    magic_methods[sym_name] = getattr(current_module, f"_sympy_{name}")
+
+del name, sym_name, math_op_names, current_module  # type: ignore[possibly-undefined]
+
+
+def sympy_is_contiguous(sizes, strides):
+    dim = len(sizes)
+    return sympy_is_contiguous_generic(sizes, strides, list(range(dim - 1, -1, -1)))
+
+
+def sympy_is_contiguous_generic(sizes, strides, dim_order):
+    import sympy
+
+    dim = len(sizes)
+
+    if len(dim_order) != dim:
+        return sympy.false
+
+    is_contiguous = sympy.true
+    z = sympy.Integer(1)
+    # Contiguous if the strides make sense (or the dim is size 1)
+    for d in dim_order:
+        is_contiguous &= sympy.Eq(sizes[d], sympy.Integer(1)) | sympy.Eq(strides[d], z)
+        z *= sizes[d]
+    # OR if any size is zero
+    for d in range(dim):
+        is_contiguous |= sympy.Eq(sizes[d], sympy.Integer(0))
+    return is_contiguous
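+
+# Worked example with plain integers, for intuition: for sizes [2, 3, 4],
+# strides [12, 4, 1] and dim_order [2, 1, 0], each stride equals the running
+# product z (1, 4, 12), so the expression simplifies to sympy.true; with
+# strides [12, 4, 2] the innermost check fails and, since no size is zero,
+# the result is sympy.false.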
+
+
+# NB: There is a TODO in C++ to allow omitting the batch dim.  If that
+# happens you will need to refactor this
+
+
+def sympy_is_channels_last_contiguous_2d(sizes, strides):
+    return sympy_is_contiguous_generic(sizes, strides, [1, 3, 2, 0])
+
+
+def sympy_is_channels_last_contiguous_3d(sizes, strides):
+    return sympy_is_contiguous_generic(sizes, strides, [1, 4, 3, 2, 0])
+
+
+def sympy_is_channels_last_strides_generic(sizes, strides, dim_order):
+    import sympy
+
+    dim = len(sizes)
+
+    if dim != len(dim_order):
+        return sympy.false
+
+    m = sympy.Integer(0)
+    r = sympy.true
+
+    # special case for trivial C dimension. default to NCHW
+    r &= sympy.Ne(strides[1], 0)
+
+    for d in dim_order:
+        r &= sympy.Ne(sizes[d], 0) & (strides[d] >= m)
+        # Fallback to NCHW as default layout for ambiguous cases
+        # This is the flaw of implicit memory_format from strides.
+        # N111 tensor with identical strides for size 1 dimension;
+        # Two cases could lead us here:
+        # a. N111 contiguous Tensor ([N,1,1,1]@[1,1,1,1])
+        # b. N11W contiguous Tensor sliced on the W-dimension.
+        # ([N,1,1,1]@[W,W,W,W])
+        if d == 0:
+            r &= sympy.Ne(m, strides[1])
+        # This is necessary to:
+        # 1. distinguish the memory_format of N1H1;
+        #     [H, 1, 1, 1] channels_last stride
+        #     [H, H, 1, 1] contiguous stride
+        # 2. permutation of 1C1W:
+        #     [1, C, 1, H]@[HC, H, H, 1] transpose(1, 3)
+        #     [1, H, 1, C]@[HC, 1, H, H] shouldn't be identified as
+        #     channels_last
+        m = strides[d] * sympy.Max(sizes[d], 1)
+
+    return r
+
+
+def sympy_is_channels_last_strides_2d(sizes, strides):
+    return sympy_is_channels_last_strides_generic(sizes, strides, [1, 3, 2, 0])
+
+
+def sympy_is_channels_last_strides_3d(sizes, strides):
+    return sympy_is_channels_last_strides_generic(sizes, strides, [1, 4, 3, 2, 0])
+
+
+def _sympy_is_non_overlapping_and_dense_indicator(sizes, strides):
+    from torch.utils._sympy.functions import IsNonOverlappingAndDenseIndicator
+
+    return IsNonOverlappingAndDenseIndicator(*sizes, *strides)
+
+
+sizes_strides_methods = {
+    # TODO: These could also be done with indicators, maybe it is better
+    # for reasoning to do it that way
+    "is_contiguous": sympy_is_contiguous,
+    "is_channels_last_contiguous_2d": sympy_is_channels_last_contiguous_2d,
+    "is_channels_last_contiguous_3d": sympy_is_channels_last_contiguous_3d,
+    "is_channels_last_strides_2d": sympy_is_channels_last_strides_2d,
+    "is_channels_last_strides_3d": sympy_is_channels_last_strides_3d,
+    "is_non_overlapping_and_dense_indicator": _sympy_is_non_overlapping_and_dense_indicator,
+}
+
+alternate_impl_if_hinted_methods = {
+    "sym_min": builtins.min,
+    "sym_max": builtins.max,
+}
+
+
+def to_node(self, num):
+    if isinstance(num, SymTypes):
+        return num.node
+    elif type(num) is bool:
+        return self.wrap_bool(num)
+    elif type(num) is int:
+        return self.wrap_int(num)
+    elif type(num) is float:
+        return self.wrap_float(num)
+    else:
+        # NotImplemented is important so that Python tries the
+        # other magic method
+        return NotImplemented
+
+
+def wrap_node(x):
+    # TODO: let C++ also take advantage of this
+    if isinstance(x, SymNode) and x.constant is not None:
+        return x.constant
+    if x.is_int():
+        return SymInt(x)
+    elif x.is_float():
+        return SymFloat(x)
+    elif x.is_bool():
+        return SymBool(x)
+    else:
+        raise AssertionError(f"unrecognized return type {x}")
+
+
+def method_to_operator(method):
+    return METHOD_TO_OPERATOR[method]
+
+
+def _make_node_magic(method, func):
+    func = lru_cache(256)(func)
+
+    if method in magic_methods_on_operator_with_trailing_underscore:
+        method_attr = f"{method}_"
+    else:
+        method_attr = method
+
+    def binary_magic_impl(self, other):
+        from torch.fx.experimental.symbolic_shapes import safe_expand
+
+        op = method_to_operator(method)
+
+        out_hint = None
+        if self.hint is not None and other.hint is not None:
+            out_hint = op(self.hint, other.hint)
+
+        alternate_impl = alternate_impl_if_hinted_methods.get(method)
+        if alternate_impl and out_hint is not None:
+            return to_node(self, alternate_impl(wrap_node(self), wrap_node(other)))
+
+        if sym_function_mode():
+            return to_node(
+                self, handle_sym_dispatch(op, (wrap_node(self), wrap_node(other)), {})
+            )
+        assert isinstance(other, SymNode)
+        # TODO: consider constant prop here
+        try:
+            out = func(self.expr, other.expr)
+        except Exception:
+            log.warning("failed to eval %s(%s, %s)", method, self.expr, other.expr)
+            raise
+        out = safe_expand(out)
+        sym_node_log.debug("%s %s %s -> %s", func, self.expr, other.expr, out)
+        pytype: Type
+        # This is not strictly correct. In Python, a**b may return complex when
+        # a < 0 and b is a float: (-1)**2.1. Same for sympy.sqrt(-3.14). This
+        # returns a float while both arguments are ints: 2**(-1). Also, max and
+        # min do not type promote. To avoid having data-dependent control flow
+        # here, we just set the type to float if one of the args is a float. In
+        # case of a type mismatch, we assume that it will be detected during
+        # evaluation.
+        if method in always_float_magic_methods:
+            pytype = float
+        elif method in always_bool_magic_methods:
+            pytype = bool
+        elif self.pytype is float or other.pytype is float:
+            pytype = float
+        else:
+            pytype = self.pytype
+
+        if (
+            pytype is not None
+            and out_hint is not None
+            and not isinstance(out_hint, SymTypes)
+        ):
+            out_hint = pytype(out_hint)
+
+        # Create a FX node that corresponds to the operation being applied to
+        # this node.
+        fx_node, _ = self.shape_env._create_fx_call_function(
+            op, (self.fx_node, other.fx_node)
+        )
+        return SymNode(out, self.shape_env, pytype, out_hint, fx_node=fx_node)
+
+    def unary_magic_impl(self):
+        from torch.fx.experimental.symbolic_shapes import safe_expand
+
+        op = method_to_operator(method)
+        if sym_function_mode():
+            return to_node(self, handle_sym_dispatch(op, (wrap_node(self),), {}))
+        # TODO: consider constant prop here
+        expr = self.expr
+        if method == "floor" or method == "ceiling":
+            expr = self.shape_env._simplify_floor_div(expr)
+
+        try:
+            out = func(expr)
+        except Exception:
+            log.warning("failed to eval %s(%s)", method, expr)
+            raise
+        sym_node_log.debug("%s %s -> %s", func, expr, out)
+        out_hint = None
+        if self.hint is not None:
+            out_hint = op(self.hint)
+        out = safe_expand(out)
+        pytype: Type
+        if method in always_int_magic_methods:
+            pytype = int
+        elif method in always_bool_magic_methods:
+            pytype = bool
+        elif method in always_float_magic_methods:
+            pytype = float
+        else:
+            pytype = self.pytype
+
+        fx_node, _ = self.shape_env._create_fx_call_function(op, (self.fx_node,))
+        return SymNode(out, self.shape_env, pytype, out_hint, fx_node=fx_node)
+
+    if method in unary_methods:
+        setattr(SymNode, f"_{method_attr}", unary_magic_impl)
+    elif method == "sym_ite":
+
+        def sym_ite_impl(pred_node, then_node, else_node):
+            from torch.fx.experimental.symbolic_shapes import safe_expand
+
+            out_hint = then_node.hint if pred_node.hint else else_node.hint
+            if sym_function_mode():
+                return to_node(
+                    pred_node,
+                    handle_sym_dispatch(
+                        sym_ite,
+                        (
+                            wrap_node(pred_node),
+                            wrap_node(then_node),
+                            wrap_node(else_node),
+                        ),
+                        {},
+                    ),
+                )
+
+            try:
+                out = func(pred_node.expr, then_node.expr, else_node.expr)
+            except Exception:
+                log.warning(
+                    "failed to eval %s(%s, %s, %s)",
+                    method,
+                    pred_node.expr,
+                    then_node.expr,
+                    else_node.expr,
+                )
+                raise
+
+            out = safe_expand(out)
+            fx_node, _ = pred_node.shape_env._create_fx_call_function(
+                sym_ite, (pred_node.fx_node, then_node.fx_node, else_node.fx_node)
+            )
+            return SymNode(
+                out, pred_node.shape_env, then_node.pytype, out_hint, fx_node=fx_node
+            )
+
+        setattr(SymNode, f"_{method_attr}", sym_ite_impl)
+    elif method == "round":
+
+        def round_impl(self, ndigits=None):
+            from torch.fx.experimental.symbolic_shapes import safe_expand
+
+            op = builtins.round
+            if sym_function_mode():
+                return to_node(
+                    self, handle_sym_dispatch(op, (wrap_node(self), ndigits), {})
+                )
+
+            expr = self.expr
+            try:
+                out = func(expr, ndigits)
+            except Exception:
+                log.warning("failed to eval %s(%s, ndigits=%s)", method, expr, ndigits)
+                raise
+            out = safe_expand(out)
+
+            pytype = int if ndigits is None else self.pytype
+
+            out_hint = None
+            if self.hint is not None:
+                out_hint = op(self.hint, ndigits)
+
+            # Internally, None is used as a sentinel to indicate that something is not a node on an FX graph. At the
+            # same time, there is no way to wrap a plain None into an FX node. Thus, there is no way to pass None here
+            # without triggering asserts that check whether we are mixing FX nodes with untracked arguments. The
+            # hack below works because all round functions down the line take ndigits=None as the default in their
+            # signatures.
+            # TODO: Remove the args construction below if a different sentinel is used by FX.
+            args = [self.fx_node]
+            if ndigits is not None:
+                args.append(ndigits)
+            fx_node, _ = self.shape_env._create_fx_call_function(op, tuple(args))
+            return SymNode(out, self.shape_env, pytype, out_hint, fx_node=fx_node)
+
+        setattr(SymNode, f"_{method_attr}", round_impl)
+    else:
+        setattr(SymNode, f"_{method_attr}", binary_magic_impl)
+
+
+def _make_node_sizes_strides(method, func):
+    # NB: don't LRU cache, lots of arguments
+
+    def sizes_strides_impl(self, sizes, strides):
+        op = getattr(sys.modules[__name__], method)
+        if sym_function_mode():
+            return to_node(
+                self,
+                handle_sym_dispatch(
+                    op,
+                    ([wrap_node(s) for s in sizes], [wrap_node(s) for s in strides]),
+                    {},
+                ),
+            )
+        size_exprs = [s.expr for s in sizes]
+        stride_exprs = [s.expr for s in strides]
+        try:
+            out = func(size_exprs, stride_exprs)
+        except Exception:
+            log.warning("failed to eval %s(%s, %s)", method, size_exprs, stride_exprs)
+            raise
+        # bool is never expandable
+
+        size_hints = []
+        out_hint = None
+        for s in sizes:
+            if s.hint is None:
+                break
+            size_hints.append(s.hint)
+        else:
+            stride_hints = []
+            for s in strides:
+                if s.hint is None:
+                    break
+                stride_hints.append(s.hint)
+            else:
+                out_hint = op(size_hints, stride_hints)
+
+        # NB: This is the indicator function, not the actual bool!
+        pytype: Type
+        if method.endswith("_indicator"):
+            pytype = int
+        else:
+            pytype = bool
+        return SymNode(out, self.shape_env, pytype, out_hint)
+
+    setattr(SymNode, f"_{method}", sizes_strides_impl)
+
+    # TODO: This is technically hotpath, but in the ideal end state
+    # guards on this will resolve at a higher level so you never
+    # spend time in this code
+    def sizes_strides_user(sizes, strides):
+        import sympy
+
+        from torch.fx.experimental.symbolic_shapes import (
+            eval_is_non_overlapping_and_dense,
+        )
+
+        for a in itertools.chain(sizes, strides):
+            if isinstance(a, SymInt):
+                return wrap_node(
+                    getattr(a.node, method)(
+                        [to_node(a.node, b) for b in sizes],
+                        [to_node(a.node, b) for b in strides],
+                    )
+                )
+        if method == "is_non_overlapping_and_dense_indicator":
+            return eval_is_non_overlapping_and_dense(sizes, strides)
+        else:
+            # TODO: this is an awful implementation
+            return bool(
+                func(
+                    [sympy.sympify(a) for a in sizes],
+                    [sympy.sympify(a) for a in strides],
+                )
+            )
+
+    # Skip for is_non_overlapping_and_dense_indicator
+    if not hasattr(sys.modules[__name__], method):
+        setattr(sys.modules[__name__], method, sizes_strides_user)
+
+
+for method, func in magic_methods.items():
+    _make_node_magic(method, func)
+
+for method, func in sizes_strides_methods.items():
+    _make_node_sizes_strides(method, func)
+
+
+def _make_user_magic(method, user_type):
+    # User magic takes care of wrapping the other operand into a node,
+    # so that our internal logic can assume everything is nodes
+
+    if method in magic_methods_on_operator_with_trailing_underscore:
+        method_attr = f"sym_{method}"
+    else:
+        method_attr = method
+
+    def get_constant(x: Union[SymInt, int, SymFloat, float, SymBool, bool]):
+        if isinstance(x, (int, float, bool)):
+            return x
+        if isinstance(x, SymBool):
+            return x.node.guard_bool("", 0)
+        raise AssertionError("expect to be called with constant SymBools")
+
+    def is_constant(x):
+        if isinstance(x, (int, float, bool)):
+            return True
+        if isinstance(x, (SymInt, SymFloat, SymBool)):
+            return x.node.is_constant()
+        return False
+
+    if method in bool_becomes_int_magic_methods:
+
+        def promote(x):
+            """Implements True+True=2, which works in python but not sympy"""
+            if isinstance(x, SymBool):
+                return SymInt(x.node.wrap_int(int(x)))
+            return x
+
+    else:
+
+        def promote(x):
+            return x
+
+    # Before and after performing the operation, check if any operands are constant.
+    # If so, extract out the constant values first. If `self` itself is a
+    # constant, then "redispatch" by calling back into the operator. Sometimes
+    # this means that operations involving SymBool return plain bools.
+    # Alternatively, we could also rewrap into constant Symbool (i.e. by
+    # implementing wrap_bool in ConstantSymNodeImpl), but we're not doing that
+    # today for no particular reason.
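+    #
+    # For example (illustrative): an `eq` between two constant SymBools falls
+    # through to plain `operator.eq` and returns a real Python bool rather
+    # than being wrapped back into a SymBool.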
+    def unary_magic_impl(self):
+        self = promote(self)
+        if is_constant(self):
+            return (method_to_operator(method))(get_constant(self))
+        return wrap_node(getattr(self.node, method_attr)())
+
+    def binary_magic_impl(self, other):
+        sym_node_log.debug("MAGIC %s %s %s", method, self, other)
+        self = promote(self)
+        other = promote(other)
+        if is_constant(self):
+            return (method_to_operator(method))(get_constant(self), other)
+        if is_constant(other):
+            other = get_constant(other)
+        other_node = to_node(self.node, other)
+        if other_node is NotImplemented:
+            return NotImplemented
+        ret = wrap_node(getattr(self.node, method_attr)(other_node))
+        return get_constant(ret) if is_constant(ret) else ret
+
+    def rbinary_magic_impl(self, other):
+        self = promote(self)
+        other = promote(other)
+        if is_constant(self):
+            return (method_to_operator(method))(get_constant(self), other)
+        if is_constant(other):
+            other = get_constant(other)
+        other_node = to_node(self.node, other)
+        if other_node is NotImplemented:
+            return NotImplemented
+        ret = wrap_node(getattr(other_node, method_attr)(self.node))
+        return get_constant(ret) if is_constant(ret) else ret
+
+    if method in unary_magic_methods:
+        setattr(user_type, f"__{method}__", unary_magic_impl)
+    elif method in unary_nonmagic_methods:
+        orig = getattr(user_type, method)
+        setattr(user_type, method, update_wrapper(unary_magic_impl, orig))
+    elif method == "sym_ite":
+
+        def sym_ite_magic_impl(pred, then_val, else_val):
+            pred_node = pred.node
+            then_node = to_node(pred_node, then_val)
+            else_node = to_node(pred_node, else_val)
+            if then_node is NotImplemented or else_node is NotImplemented:
+                return NotImplemented
+            assert (
+                isinstance(then_node, SymNode)
+                and isinstance(else_node, SymNode)
+                and then_node.pytype == else_node.pytype
+            )
+            ret = wrap_node(getattr(pred.node, method_attr)(then_node, else_node))
+            return get_constant(ret) if ret.node.is_constant() else ret
+
+        setattr(user_type, f"__{method}__", sym_ite_magic_impl)
+    elif method == "round":
+
+        def round_magic_impl(self, ndigits=None):
+            if is_constant(self):
+                return builtins.round(get_constant(self), ndigits)
+
+            return wrap_node(getattr(self.node, method)(ndigits))
+
+        setattr(user_type, f"__{method}__", round_magic_impl)
+    else:
+        setattr(user_type, f"__{method}__", binary_magic_impl)
+        if method in reflectable_magic_methods:
+            setattr(user_type, f"__r{method}__", rbinary_magic_impl)
+
+
+for method, func in magic_methods.items():  # type: ignore[assignment]
+    if method in only_bool_magic_methods:
+        _make_user_magic(method, SymBool)
+        continue
+    if method in only_float_magic_methods:
+        _make_user_magic(method, SymFloat)
+        continue
+    if method in also_bool_magic_methods or method in bool_becomes_int_magic_methods:
+        _make_user_magic(method, SymBool)
+    _make_user_magic(method, SymInt)
+    _make_user_magic(method, SymFloat)
+
+del method
+del func
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/symbolic_shapes.py b/MLPY/Lib/site-packages/torch/fx/experimental/symbolic_shapes.py
new file mode 100644
index 0000000000000000000000000000000000000000..5317287e37b1c16952d5d62461f4f954a8d85bda
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/symbolic_shapes.py
@@ -0,0 +1,4362 @@
+# mypy: ignore-errors
+
+"""
+``torch.fx.experimental.symbolic_shapes`` provides interfaces for interacting with
+our symbolic shapes reasoning system that is used heavily in torch.compile.  Although
+this is not generally considered public API, when writing framework code in PyTorch
+as well as extensions to PyTorch (e.g., in custom operator implementations), you may
+need to make use of these APIs to setup dynamic shapes support appropriately.
+"""
+
+import builtins
+import collections
+import functools
+import inspect
+import itertools
+import logging
+import math
+import operator
+import re
+import sys
+import threading
+import traceback
+from collections import defaultdict
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from enum import Enum
+from functools import lru_cache
+from typing import (
+    Any,
+    cast,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Sequence,
+    Set,
+    Tuple,
+    Type,
+    Union,
+    TYPE_CHECKING
+)
+from typing_extensions import TypeAlias
+
+import torch
+import torch.fx
+import torch.fx.traceback as fx_traceback
+from torch.fx.experimental import _config as config
+
+from torch.fx.experimental.recording import (
+    FakeTensorMeta,
+    ShapeEnvEvent,
+    record_shapeenv_event,
+    replay_shape_env_events,
+    shape_env_check_state_equal
+)
+from torch.fx.experimental.sym_node import SymNode, SymTypes
+
+# NB: The sym_* functions are used via getattr() and must be imported here.
+from torch import SymBool, SymFloat, SymInt
+from torch._guards import ShapeGuard, Source, TracingContext
+from torch.utils._python_dispatch import is_traceable_wrapper_subclass
+from torch.utils._sympy.functions import FloorDiv, Mod, IsNonOverlappingAndDenseIndicator
+from torch.utils._sympy.solve import try_solve
+from torch.utils._sympy.value_ranges import bound_sympy, SymPyValueRangeAnalysis, ValueRanges, ValueRangeError
+from torch.utils._sympy.singleton_int import SingletonInt
+from torch.utils._traceback import format_frame, CapturedTraceback
+from torch._utils_internal import signpost_event
+from torch._subclasses.meta_utils import is_sparse_any
+
+from torch._logging import LazyString
+
+if TYPE_CHECKING:
+    from torch._dynamo.source import TensorPropertySource
+
+InputList = List
+DimList = List
+
+log = logging.getLogger(__name__)
+
+class GuardOnDataDependentSymNode(RuntimeError):
+    pass
+
+import sympy
+from sympy.printing.str import StrPrinter
+from sympy.printing.precedence import precedence, PRECEDENCE
+
+aten = torch._ops.ops.aten  # type: ignore[has-type]
+
+__all__ = [
+    "has_symbolic_sizes_strides", "create_contiguous", "ShapeEnv", "is_concrete_int",
+    "guard_int", "guard_float", "guard_scalar", "canonicalize_bool_expr",
+    "hint_int", "SYMPY_INTERP", "free_symbols", "is_symbol_binding_fx_node",
+    "is_concrete_bool", "is_nested_int", "SHAPEENV_EVENT_KEY", "CURRENT_NODE_KEY",
+    "has_free_symbols", "sym_eq", "SymbolicContext", "StatelessSymbolicContext",
+    "StatefulSymbolicContext", "SubclassSymbolicContext", "statically_known_true",
+    "guard_size_oblivious",
+]
+
+# FX node metadata keys for symbolic shape FX graph.
+SHAPEENV_EVENT_KEY = "shapeenv_event"
+CURRENT_NODE_KEY = "current_node"
+
+# These are modules that contain generic code for interacting with ShapeEnv
+# which are unlikely to identify a particular interesting guard statement
+@lru_cache(None)
+def uninteresting_files() -> Set[str]:
+    import torch._inductor.sizevars
+    import torch._library.abstract_impl
+    import torch._subclasses.meta_utils
+    import torch._subclasses.fake_tensor
+    mods = [
+        sys.modules[__name__],
+        torch.fx.experimental.recording,
+        torch.fx.experimental.sym_node,
+        torch.fx.interpreter,
+        torch,
+        torch._inductor.sizevars,
+        torch._library.abstract_impl,
+        torch._subclasses.meta_utils,
+        torch._subclasses.fake_tensor,
+    ]
+    return {inspect.getfile(m) for m in mods}
+
+# We don't bother with the metaclass as all of the dispatching logic happens
+# entirely from Python
+#
+# Didn't bother with ancestors for now, unlikely to have multiple modes for
+# symints right now
+
+class ConstraintViolationError(RuntimeError):
+    pass
+
+def has_symbolic_sizes_strides(elem) -> bool:
+    return elem._has_symbolic_sizes_strides
+
+Int = Union[torch.SymInt, int]
+
+def create_contiguous(shape: Sequence[Int]) -> List[Int]:
+    strides: List[Int] = [1]
+    for dim in reversed(shape[1:]):
+        strides.append(dim * strides[-1])
+    return list(reversed(strides))
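+
+# For example, create_contiguous([2, 3, 4]) returns [12, 4, 1], the strides of
+# a freshly allocated contiguous tensor of that shape.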
+
+def hint_int(a: Union[torch.SymInt, int], fallback: Optional[int] = None) -> int:
+    """
+    Retrieve the hint for an int (based on the underlying real values as observed
+    at runtime).  If no hint is available (e.g., because of data dependent shapes),
+    use the fallback if it is not None; otherwise, raise an error.
+    """
+    if isinstance(a, torch.SymInt):
+        return a.node.require_hint(fallback)
+    assert type(a) is int, a
+    return a
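+
+# For example, hint_int(5) returns 5; for a SymInt traced from a tensor that
+# currently has 8 rows, it returns 8, and for an unbacked SymInt with no hint
+# it returns `fallback` (or raises if no fallback is given).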
+
+Scalar = Union[torch.SymInt, torch.SymFloat, torch.SymBool, int, float, bool]
+
+def has_hint(a: Scalar) -> bool:
+    if isinstance(a, SymTypes):
+        return a.node.has_hint()
+    return True
+
+def is_concrete_int(a: Union[int, SymInt]) -> bool:
+    r""" Utility to check if underlying object
+    in SymInt is concrete value. Also returns
+    true if integer is passed in.
+
+    Args:
+        a (SymInt or int): Object to test if it int
+    """
+    assert isinstance(a, (SymInt, int))
+
+    if isinstance(a, int):
+        return True
+
+    if isinstance(a.node.expr, sympy.core.numbers.Integer):
+        return True
+
+    return False
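+
+# For example, is_concrete_int(5) returns True, as does a SymInt whose
+# expression has already been resolved to a literal integer; a SymInt created
+# for a genuinely dynamic dimension returns False.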
+
+# In obscure Meta only situations, sympy.logic.boolalg doesn't exist at runtime.
+# So make sure only type checker evaluates this alias.
+# Xref: https://www.internalfb.com/diff/D53324783
+SympyBoolean: TypeAlias = "sympy.logic.boolalg.Boolean"
+
+def guard_size_oblivious(expr: Union[torch.SymBool, bool]) -> bool:
+    """
+    Perform a guard on a symbolic boolean expression in a size oblivious way.
+    This is typically used when a non-oblivious test would result in a guard
+    on a data dependent value whose value we don't know at compile time.
+    When a guard is tested this way, we may diverge in behavior from how regular
+    PyTorch semantics would treat it.  For more information, see
+    https://github.com/pytorch/pytorch/pull/118579
+    """
+    if isinstance(expr, torch.SymBool):
+        return expr.node.guard_size_oblivious("", 0)
+    else:
+        assert isinstance(expr, bool)
+        return expr
+
+def canonicalize_bool_expr(expr: SympyBoolean) -> SympyBoolean:
+    r""" Canonicalize a boolean expression by transforming it into a lt / le
+    inequality and moving all the non-constant terms to the rhs.
+    We canonicalize And / Ors / Not via cnf and then canonicalize their subexpr
+    recursively
+    nb. sympy.Rel.canonical is not good enough https://github.com/sympy/sympy/issues/25924
+
+    Args:
+        expr (sympy.Expr): Expression to canonicalize
+    """
+    # Canonicalise an inequality by transforming it into a lt / le
+    # inequality and moving all the constant terms to the rhs
+    # We canonicalise And / Ors / Not via cnf
+    # nb. Relational.canonical in sympy is broken
+    # https://github.com/sympy/sympy/issues/25924
+
+    if not isinstance(expr, (sympy.Rel, sympy.And, sympy.Or, sympy.Not, sympy.Eq, sympy.Ne)):
+        return expr
+
+    if isinstance(expr, (sympy.And, sympy.Or, sympy.Not)):
+        expr = sympy.logic.boolalg.to_cnf(expr)
+    return _canonicalize_bool_expr_impl(expr)
+
+def _canonicalize_bool_expr_impl(expr: SympyBoolean) -> SympyBoolean:
+    """
+    After canonicalization, we are guaranteed to have eliminated Ge/Gt relations
+    (rewriting them to Le/Lt, respectively).
+    """
+    if isinstance(expr, (sympy.And, sympy.Or)):
+        return type(expr)(*map(canonicalize_bool_expr, expr.args))
+
+    opposite = {sympy.Gt: sympy.Lt, sympy.Ge: sympy.Le}
+    if isinstance(expr, tuple(opposite.keys())):
+        lhs = expr.rhs - expr.lhs
+        t = opposite[type(expr)]
+    else:
+        assert isinstance(expr, (sympy.Lt, sympy.Le, sympy.Eq, sympy.Ne))
+        lhs = expr.lhs - expr.rhs
+        t = type(expr)
+    rhs = 0
+    if isinstance(lhs, sympy.Add):
+        cts = []
+        variables = []
+        for term in lhs.args:
+            if term.is_number:
+                cts.append(term)
+            else:
+                variables.append(term)
+        lhs = sympy.Add(*variables)
+        rhs = -sympy.Add(*cts)
+    return t(lhs, rhs)
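+
+# Illustrative sketch (added commentary; s0 and s1 are hypothetical sympy symbols):
+#   canonicalize_bool_expr(s0 + 3 > s1)  ->  -s0 + s1 < 3
+# i.e. the relation is flipped to Lt and the constant term is isolated on the rhs.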
+
+def is_concrete_bool(a: Union[bool, SymBool]) -> bool:
+    r""" Utility to check if underlying object
+    in SymBool is concrete value. Also returns
+    true if integer is passed in.
+    Args:
+        a (SymBool or bool): Object to test if it bool
+    """
+    assert isinstance(a, (SymBool, bool))
+
+    if isinstance(a, bool):
+        return True
+
+    if isinstance(a.node.expr, (sympy.logic.boolalg.BooleanTrue, sympy.logic.boolalg.BooleanFalse)):
+        return True
+
+    return False
+
+def is_nested_int(s):
+    return isinstance(s, torch.SymInt) and s.node.is_nested_int()
+
+def _iterate_exprs(val: Union[SymInt, torch.Tensor]) -> Iterable[sympy.Basic]:
+    if isinstance(val, SymTypes):
+        # This allowance applies to the jagged layout NestedTensor case, as
+        # nested ints are not symbolic
+        if is_symbolic(val):
+            yield val.node.expr
+    elif isinstance(val, sympy.Basic):
+        yield val
+    elif isinstance(val, (int, float, bool)):
+        pass
+    elif is_sparse_any(val):
+        yield from _iterate_exprs(val.size())
+    elif isinstance(val, torch.Tensor):
+        yield from _iterate_exprs(val.size())
+        yield from _iterate_exprs(val.stride())
+        yield from _iterate_exprs(val.storage_offset())
+    elif isinstance(val, (tuple, list)):
+        for s in val:
+            yield from _iterate_exprs(s)
+    elif val is None:
+        pass
+    else:
+        raise AssertionError(f"cannot extract sympy expressions from {val} {type(val)}")
+
+def free_symbols(val: Union[SymInt, torch.Tensor]) -> Set[sympy.Symbol]:
+    if val is None:
+        return set()
+    itr = _iterate_exprs(val)
+    # we need at least 1 to call union, so we hand code the identity
+    try:
+        first_expr = next(itr)
+    except StopIteration:
+        return set()
+
+    return first_expr.free_symbols.union(*(e.free_symbols for e in itr))
+
+def has_free_symbols(val: Union[SymInt, torch.Tensor]) -> bool:
+    """Faster version of bool(free_symbols(val))"""
+    return not all(e.is_number for e in _iterate_exprs(val))
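+
+# Illustrative note (added commentary): for a fake tensor whose sizes are
+# (s0, s1), free_symbols(t) is {s0, s1} (plus any symbols appearing in its
+# strides and storage offset) and has_free_symbols(t) is True; for plain
+# Python ints, both report no free symbols.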
+
+# Like free_symbols, but filtered to only report unbacked symbols
+def free_unbacked_symbols(x):
+    # NB: keep synced with is_unbacked_symint
+    return {s for s in free_symbols(x) if s.name.startswith(("u", "f"))}
+
+# WARNING: Don't use this on Dynamo produced graphs, they don't have meta
+# setup!
+def is_symbol_binding_fx_node(node) -> Optional[sympy.Symbol]:
+    if (
+        node.op == "placeholder" and
+        "val" in node.meta and
+        isinstance(node.meta["val"], torch.SymInt) and
+        isinstance(node.meta["val"].node.expr, sympy.Symbol)
+    ):
+        return node.meta["val"].node.expr
+    return None
+
+def find_symbol_binding_fx_nodes(graph):
+    return {
+        node.meta["val"].node.expr: node
+        for node in graph.nodes
+        if is_symbol_binding_fx_node(node)
+    }
+
+def definitely_true(a):
+    """
+    Returns True only if we can tell that a is True, possibly introducing
+    a guard in the process.  If a depends on some unbacked SymInt, we may
+    return False even though there may exist a possible value of the SymInt
+    that would cause the expression to return True.
+
+    When is it appropriate to use definitely_true?  First, if you can use
+    a higher level combinator like parallel_or/parallel_and, prefer using
+    those instead; they are definitely safe (modulo short-circuiting).
+    Second, it can be used if the program would behave equivalently if
+    definitely_true always returned False (parallel_or/parallel_and are
+    examples of this pattern, modulo short-circuiting).  Finally, it can even
+    be OK if the program wouldn't behave equivalently, so long as the
+    change is semantics preserving.  It can be semantics preserving if
+    the program errors in more cases than it did previously (but otherwise
+    behaves identically), or if it changes some quantity in a way that
+    doesn't matter (e.g., strides often fall in this bucket.)
+    """
+    if isinstance(a, SymBool):
+        if a.node.has_hint():
+            return guard_bool(a)
+        else:
+            return False
+    return bool(a)
+
+def definitely_false(a):
+    """
+    Returns True only if we can tell that a is False, possibly introducing
+    a guard in the process.  If a depends on some unbacked SymInt, we may
+    return False even though there may exist a possible value of the SymInt
+    that would cause the expression a to be False.  See definitely_true
+    for more usage guidance.
+    """
+    if isinstance(a, SymBool):
+        if a.node.has_hint():
+            return not guard_bool(a)
+        else:
+            return False
+    return not bool(a)
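+
+# Illustrative note (added commentary): for a backed SymBool (one with a runtime
+# hint), definitely_true guards on the expression and returns its value; for an
+# unbacked SymBool, both definitely_true and definitely_false return False.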
+
+def statically_known_true(x: Union[bool, SymBool]) -> bool:
+    """Returns True if x can be simplified to a constant and is true.
+
+    .. note::
+        This function doesn't introduce new guards, so the expression may end
+        up evaluating to true at runtime even if this function returns False.
+
+    Args:
+        x (bool, SymBool): The expression to try statically evaluating
+
+    """
+    if isinstance(x, SymBool):
+        expr = x.node.expr
+        shape_env = x.node.shape_env
+        try:
+            simplified = shape_env._maybe_evaluate_static(expr)
+            if simplified is not None:
+                return bool(simplified)
+        except Exception:
+            log.debug("Could not simplify %s", expr)
+        return False
+    assert isinstance(x, bool)
+    return x
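+
+# Illustrative note (added commentary; s0 is a hypothetical size symbol):
+#   statically_known_true(s0 >= 0)  # may be True if the recorded range proves it
+#   statically_known_true(s0 == 4)  # typically False; no guard is added to decide it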
+
+
+def parallel_or(*args):
+    """
+    Evaluate the logical OR of several arguments, avoiding guarding on
+    unbacked SymInts if another argument is definitely True.
+    """
+    if any(statically_known_true(a) for a in args):
+        return True
+    if any(definitely_true(a) for a in args):
+        return True
+    return any(args)
+
+def parallel_and(*args):
+    """
+    Evaluate the logical AND of several arguments, avoiding guarding on
+    unbacked SymInts if another argument is definitely False.
+    """
+    if any(statically_known_true(torch.sym_not(a)) for a in args):
+        return False
+    if any(definitely_false(a) for a in args):
+        return False
+    return all(args)
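+
+# Illustrative note (added commentary; u0 is a hypothetical unbacked SymInt):
+# parallel_or(True, u0 == 0) returns True without guarding on u0, and
+# parallel_and(False, u0 == 0) returns False without guarding on u0.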
+
+def sym_eq(x, y):
+    """
+    Like ==, but when run on list/tuple, it will recursively test equality
+    and use sym_and to join the results together, without guarding.
+    """
+    if (isinstance(x, tuple) and isinstance(y, tuple)) or (isinstance(x, list) and isinstance(y, list)):
+        if len(x) != len(y):
+            return False
+        return functools.reduce(operator.and_, map(sym_eq, x, y), True)
+    elif isinstance(x, (int, torch.SymInt)) and isinstance(y, (int, torch.SymInt)):
+        return x == y
+    else:
+        raise AssertionError(f"unexpected sym_eq between {type(x)} {type(y)}")
+
+def guard_scalar(a):
+    if isinstance(a, (SymBool, bool)):
+        return guard_bool(a)
+    elif isinstance(a, (SymInt, int)):
+        return guard_int(a)
+    elif isinstance(a, (SymFloat, float)):
+        return guard_float(a)
+    else:
+        raise AssertionError(f"unrecognized scalar {a}")
+
+
+@record_shapeenv_event()
+def _constrain_symbol_range(shape_env, s: sympy.Symbol, compiler_min: int, compiler_max: int):
+    upd_vr = ValueRanges(compiler_min, compiler_max)
+    old_vr = shape_env.var_to_range.get(s, ValueRanges.unknown())
+    new_vr = shape_env.var_to_range[s] = old_vr & upd_vr
+    if new_vr != old_vr:
+        log.info("_constrain_symbol_range %s [%s, %s]", s, new_vr.lower, new_vr.upper)
+
+
+def _advise_is_size(a):
+    """
+    Don't use this directly; use torch._check_is_size instead.
+
+    This is a softer version of _constrain_range_for_size (with min=0,
+    max=Inf).  Instead of forcibly constraining a variable (and erroring if we
+    failed to constrain it), it will simply advise us that a size is
+    constrained in some way.  We will always defer a runtime assert for this
+    constraint if we cannot prove it at compile-time, but we only
+    *sometimes* learn useful extra information at compile-time with this
+    information.  This is in contrast to constrain_range_for_size, where if
+    you don't call that on a fresh unbacked symint, chances are we will choke.
+
+    TODO: Make Dynamo handle this appropriately if this is seen in Dynamo-ed
+    code.  Right now this is only really used in code with AOTAutograd trace
+    through, so it is not a big problem that this isn't supported, but in
+    principle all of this code should be Dynamo'able too.
+
+    TODO: I didn't support min/max because I didn't have a use case where this
+    actually helped.  In principle we can support it, it just makes the
+    implementation below more complicated.
+    """
+
+    # This must always succeed, because the sole allowed caller _check_is_size
+    # was responsible for expect_true'ing this
+    assert a >= 0
+
+    # NB: it's important not to constrain range for size for *hinted* SymInts,
+    # because it is not only unsound, it will immediately trip our asserts
+    # that hints have to be consistent with static analysis!  If you somehow
+    # have an unbounded SymInt that later constrains to 1, this will be
+    # inconsistent with the range
+    if (
+        isinstance(a, SymInt)
+        and isinstance(a.node, SymNode)
+        and not a.node.has_hint()
+        and isinstance(a.node.expr, sympy.Symbol)
+    ):
+        _constrain_range_for_size(a)
+
+@record_shapeenv_event()
+def _constrain_range_for_size(a, min: Optional[int] = None, max: Optional[int] = None):
+    """
+    This function is NOT INTENDED to be used by itself.
+    """
+
+    if isinstance(a, (SymFloat, SymBool)):
+        raise ValueError("Constraining SymFloat/SymBool is nyi")
+
+    assert isinstance(a, SymInt), "can only constrain range for SymInt"
+    assert isinstance(a.node.expr, sympy.Symbol), "constraining non-Symbols NYI"
+
+    if min is None:
+        min = 0
+    if max is None:
+        max = sympy.oo
+
+    if max < min:
+        raise ValueError(
+            "Maximum value to constrain_as_size can't be less than the specified min value, "
+            "received min={min} and max={max}"
+        )
+
+    _constrain_symbol_range(
+        a.node.shape_env,
+        a.node.expr,
+        compiler_min=min,
+        compiler_max=max,
+    )
+    a.node.shape_env.size_like.add(a.node.expr)
+
+
+# inclusive both ways
+@record_shapeenv_event()
+def constrain_range(a, *, min: Optional[int], max: Optional[int] = None):
+    """
+    Applies a constraint that the passed in SymInt must lie between min-max
+    inclusive-inclusive, WITHOUT introducing a guard on the SymInt (meaning
+    that it can be used on unbacked SymInts).  If min/max are None, we assume
+    that the dimension is unbounded in that direction.  Repeated application
+    of constrain_range intersects the ranges.  This is a fairly low level API
+    that doesn't have a lot of safety guarantees (TODO: provide higher level
+    APIs).
+
+    Currently, we use this API in the following circumstance: when we allocate
+    an unbacked SymInt, denoting an integer quantity which is data dependent,
+    we ordinarily do not know anything about what values it may take.  This
+    means that any sort of guard on it will immediately fail.  However, in
+    many cases, we know something about the unbacked SymInt: for example, we
+    know that nonzero(x).size(0) must be >= 0.  We use constrain_range to
+    narrow the possible range, declaring that negative symbols are impossible.
+    This permits us to definitely answer True to queries like 'nnz >= 0', even if
+    we don't know what the actual (hinted) value of 'nnz' is.  In fact, we
+    actually use constrain_range to unsoundly discharge common guards: for an
+    unbacked SymInt produced by nonzero, we will also assume that it is not
+    equal to 0/1 (even though these are perfectly possible values at runtime),
+    because we generally expect graphs that are valid for N=2 to also be valid
+    for N=1.
+    """
+    if min is None:
+        min = -sympy.oo
+    if max is None:
+        max = sympy.oo
+
+    if max < min:
+        raise ValueError(
+            "Maximum value to constrain_as_size can't be less than the specified min value, "
+            "received min={min} and max={max}"
+        )
+
+    if isinstance(a, int):
+        if not (min <= a <= max):
+            raise ValueError(f"Invalid value {a} for range [{min}:{max}]")
+        return
+
+    if isinstance(a.node.expr, sympy.Integer):
+        if not (min <= int(a.node.expr) <= max):
+            raise ValueRangeError(f"Invalid value {int(a.node.expr)} for range [{min}:{max}]")
+        return
+    assert isinstance(a.node.expr, sympy.Symbol), "constraining non-Symbols NYI"
+
+    # TODO: Shouldn't we install a guard if the symbol is backed?  Or is the
+    # semantics that this is an "unchecked" assert (but is this actually
+    # something useful?  Might be better to restrict only for unbacked
+    # SymInt).
+    _constrain_symbol_range(
+        a.node.shape_env,
+        a.node.expr,
+        compiler_min=min,
+        compiler_max=max,
+    )
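+
+# Illustrative usage sketch (added commentary; u0 is a hypothetical unbacked
+# SymInt, e.g. nonzero(x).size(0)):
+#   constrain_range(u0, min=0)          # declare u0 is non-negative
+#   constrain_range(u0, min=2, max=8)   # further applications intersect the range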
+
+
+@record_shapeenv_event()
+def constrain_unify(a, b):
+    """
+    Given two SymInts, constrain them so that they must be equal.  NB:
+    this will not work with SymInts that represent nontrivial expressions
+    (yet!)
+    """
+    # TODO: this does not install a deferred runtime assert yet
+
+    # TODO: Maybe dedupe this with _maybe_guard_rel?
+    if not isinstance(a, SymInt):
+        if not isinstance(b, SymInt):
+            assert a == b
+        else:
+            assert isinstance(b.node.expr, sympy.Symbol), "constraining non-Symbols NYI"
+            shape_env = b.node.shape_env
+            shape_env.replacements[b.node.expr] = sympy.Integer(a)
+    else:
+        # TODO: Actually, we can support this as long as one of them is a symbol.
+        # NB: We can't actually do "unification" as our operators are not
+        # injective
+        assert isinstance(a.node.expr, sympy.Symbol), "constraining non-Symbols NYI"
+        shape_env = a.node.shape_env
+        if not isinstance(b, SymInt):
+            shape_env.replacements[a.node.expr] = sympy.Integer(b)
+        else:
+            assert a.node.shape_env is b.node.shape_env
+            assert isinstance(b.node.expr, sympy.Symbol), "constraining non-Symbols NYI"
+            new_var = shape_env._find(a.node.expr)
+            shape_env.replacements[b.node.expr] = new_var
+
+# Assume that a boolean is true for the purposes of subsequent symbolic
+# reasoning.  This will keep track of corresponding runtime checks to verify
+# that the result is upheld: either as a regular guard, or as a special set
+# of asserts which are triggered when an unbacked SymInt is allocated.
+#
+# DO NOT use this function for these cases:
+#
+#  - This is inappropriate for "branching" conditions (where both
+#    true and false result in valid programs).  We will always assume
+#    the condition evaluates true, and so it will never be possible
+#    to trace the false condition when you use it.  For true branching
+#    on unbacked SymInts, you must use torch.cond; if you incorrectly
+#    use expect_true in this case, you will make the false branch
+#    unreachable (as we will simply assume that only the true branch
+#    is ever exercised).
+#
+#  - This is inappropriate for situations where you know some other system
+#    invariant guarantees that this property holds, since you don't
+#    really need to insert a runtime check in that case.  Use something
+#    like constrain_range in that case.
+#
+# This API has a hitch.  To avoid having to reimplement error reporting
+# capabilities, this function CAN return False.  The invariant is that
+# the surrounding code must raise an error when this function returns
+# False.  This is quite low level, so we recommend using other functions
+# like check() which enforce this in a more intuitive way.
+#
+# By the way, this name is a nod to the __builtin_expect macro,
+# which is used similarly (but unlike __builtin_expect, you MUST fail
+# in the unlikely branch.)  (I think expect is a good name; in recent
+# versions of C++, this is replaced with [[likely]], which is weaker
+# and not accurate for this function!)
+def expect_true(a, skip: int = 0):
+    if isinstance(a, SymBool):
+        # TODO: check perf implications of this
+        frame = inspect.currentframe()
+        for _ in range(skip + 1):  # always run this loop at least once
+            frame = frame.f_back
+        return a.node.expect_true(frame.f_code.co_filename, frame.f_lineno)
+    assert type(a) is bool, a
+    return a
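+
+# Illustrative usage sketch (added commentary): per the contract above, the
+# caller must raise when expect_true returns False, e.g.
+#   if not expect_true(i >= 0):
+#       raise RuntimeError("expected i to be non-negative")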
+
+def guard_bool(a):
+    if isinstance(a, SymBool):
+        return a.node.guard_bool("", 0)  # NB: uses Python backtrace
+    assert type(a) is bool, a
+    return a
+
+def guard_int(a):
+    if isinstance(a, SymInt):
+        return a.node.guard_int("", 0)  # NB: uses Python backtrace
+    assert type(a) is int, a
+    return a
+
+def guard_float(a):
+    if isinstance(a, SymFloat):
+        return a.node.guard_float("", 0)  # NB: uses Python backtrace
+    assert isinstance(a, float), a
+    return a
+
+# Given a GraphModule, return all the FakeTensors for all the placeholders
+def fx_placeholder_vals(gm):
+    return [n.meta['val'] for n in gm.graph.nodes if n.op == "placeholder"]
+
+def fx_placeholder_targets(gm):
+    return [n.target for n in gm.graph.nodes if n.op == "placeholder"]
+
+# Given a GraphModule and arguments to run it with, evaluate that the guards
+# for its associated ShapeEnv are satisfied by the passed arguments.  This
+# WILL check for duck sizing.
+def eval_guards(gm, *args, ignore_static=True):
+    return gm.shape_env.evaluate_guards_for_args(fx_placeholder_vals(gm), args, ignore_static=ignore_static)
+
+def bind_symbols(gm, *args):
+    return gm.shape_env.bind_symbols(fx_placeholder_vals(gm), args)
+
+def _assert_bound_is_rational(expr: sympy.Expr, bound: ValueRanges):
+    """
+    We assert that the bounds are either Boolean, or not finite, or can be computed
+    in exact prevision via rational arithmetic.
+    The only exception to this is the rare case when the user calls `sqrt(s0)`
+    sqrt is turned into sympy.Pow so we just match for that (it matches more things, but still)
+    """
+    assert bound.lower.is_rational or bound.lower.is_Boolean or not bound.lower.is_finite or expr.has(sympy.Pow), (bound, expr)
+    assert bound.upper.is_rational or bound.upper.is_Boolean or not bound.upper.is_finite or expr.has(sympy.Pow), (bound, expr)
+
+class DimDynamic(Enum):
+    """
+    Controls how to perform symbol allocation for a dimension.  It is always
+    sound to default this to DYNAMIC, but the policies DUCK and STATIC can
+    result in better trace-time and compile-time performance, as they reduce
+    the number of allocated symbols and generally make your graph more static.
+
+    NB: If we notice you've applied a constraint to the dimension, we will
+    force it to DYNAMIC for simplicity.
+
+    DimDynamic is controlled by a variety of higher level UX features.
+    Currently:
+
+    - In eager mode, the default policy is DUCK.
+        - The default is changed to STATIC with assume_static_by_default.
+        - An individual dim is marked DYNAMIC if you mark_dynamic_dim.
+    - In export mode, the default policy is STATIC.
+        - An individual dim is marked DYNAMIC if you mention it as dynamic_dim
+          in the constraints kwarg.
+    """
+    # Treat the dimension symbolically
+    DYNAMIC = 0
+    # Treat the dimension symbolically, but if its hint matches another
+    # dynamic dimension, unify the two symbols ("duck sizing")
+    DUCK = 1
+    # Treat the dimension statically based on its hint
+    STATIC = 2
+
+
+# NB: These constraints affect both clients and backends: given some
+# constraint C, the client must pass inputs that satisfy the constraint,
+# while a backend must not introduce guards BEYOND this constraint.
+# For clarity, we document the implications on both sides for both the client
+# and the backend.
+#
+# NB: These constraints are on a *single* dimension.  In principle, we could
+# also have multi-dimension constraints, but our guess is that this is not
+# actually useful and so we are not supporting it right now.
+#
+# NB: Strict constraints are typically only suitable for export, as in eager
+# a backend like inductor may validly introduce extra, discretionary guards
+# to improve performance of code.  A StrictMinMaxConstraint would be brittle
+# under future optimizations performed by inductor; we don't guarantee
+# eager code with StrictMinMaxConstraint will keep working in the future!
+
+@dataclass(frozen=True)
+class Constraint:
+    warn_only: bool
+
+@dataclass(frozen=True)
+class StrictMinMaxConstraint(Constraint):
+    """
+    For clients: the size at this dimension must be within 'vr' (which
+    specifies a lower and upper bound, inclusive-inclusive) AND it
+    must be non-negative and should not be 0 or 1 (but see NB below).
+
+    For backends: there must not be any guards on this dimension which
+    are not implied by the given lower and upper bound.  Regardless of
+    the lower bound, the backend can assume the size is non-negative
+    and that it is not 0 or 1.
+
+    An unbounded StrictMinMaxConstraint can be thought of as a strict version
+    of "RelaxedUnspecConstraint".
+
+    NB: Export will often unsoundly assume that a graph works for 0/1, even
+    though at trace time we assumed size is not 0 or 1.  The idea is that
+    if we produce a graph that works for a range of values, it will be OK
+    for N=0/1 too.
+    """
+    vr: ValueRanges
+
+    def render(self, source: Source):
+        """Format the constrain equation"""
+        # TODO: better printing for -oo and oo
+        return f"{self.vr.lower} <= {source.name()} <= {self.vr.upper}"
+
+@dataclass(frozen=True)
+class RelaxedUnspecConstraint(Constraint):
+    """
+    For clients: no explicit constraint; constraint is whatever is implicitly
+    inferred by guards from tracing.
+
+    For backends: there must exist at least TWO possible values for the
+    size at this dimension which satisfy the guards for this dimension.
+
+    In other words, this constraint helps us distinguish between "we don't
+    care if this dimension specializes or not" versus "this dimension must be
+    unspecialized."  However, this constraint doesn't say very much about what
+    specialization is permitted; for example, if we guard on a size being
+    even, this would still be acceptable under an unspec constraint.  This
+    makes RelaxedUnspecConstraint useful for eager mode, where your backend compiler
+    may add constraints to otherwise dynamic dimensions; we can't assert that
+    there are NO guards as this is brittle because compilers should be able to
+    add extra constraints.  If you want to assert that there are no guards,
+    use StrictMinMaxConstraint with an unbounded ValueRanges.
+    """
+    def render(self, source: Source):
+        return f"RelaxedUnspecConstraint({source.name()})"
+
+# NB: None here indicates the client constraint is whatever is implicitly
+# inferred by guards from tracing, and that a backend can add whatever guards
+# it wants (including fully specializing the value).
+DimConstraint = Union[StrictMinMaxConstraint, RelaxedUnspecConstraint, None]
+
+@dataclass(frozen=True)
+class EqualityConstraint(Constraint):
+    """
+    Represent and decide various kinds of equality constraints between input sources.
+
+    A "source pair" is a pair of input sources for dynamic dimensions that
+    are specified equal. We represent `source_pairs` in a union-find forest
+    so that we can efficiently check whether two such sources are transitively equal.
+
+    A "derived equality" relates an input source to an expression over a root.
+    The root can be another input source, corresponding to some dynamic dimension,
+    or a phantom symbol that does not directly represent any dynamic dimension. We
+    represent `derived_equalities` involving input sources in a transitively-closed map
+    so that we can efficiently check whether an input source is transitively equal to
+    a given expression over another input source.
+    (NOTE: In contrast, it is easy to decide whether an input source is transitively equal
+    to a given expression over a phantom symbol; such expressions are already in canonical
+    form and so the problem reduces to symbolic expression equality.)
+    """
+    source_pairs: List[Tuple[Source, Source]]
+    derived_equalities: List[Tuple[Source, Union[Source, sympy.Symbol], Callable[[sympy.Expr], sympy.Expr]]]
+    phantom_symbols: List[sympy.Symbol]
+
+    def __post_init__(self):
+        """Pre-processing to answer queries `is_equal` and `is_derived` below.
+
+        Example: Suppose we are given:
+          source_pairs [a = b, b = c]
+          derived_equalities [d = c + 1, e = d - 1]
+        We first construct a union find with source_pairs:
+          _parents = {a: a, b: a, c: a}
+        Then we compute canonical symbolic expressions, recursively applying derived_equalities
+        until we bottom out:
+          _defs = {d: c + 1, e: (c + 1) - 1 aka c}
+        """
+
+        # self._parents is a map from input sources to input sources where, conceptually,
+        # these are directed edges in a union-find forest
+        _parents: Dict[Source, Source] = {}
+        object.__setattr__(self, "_parents", _parents)
+        # self._defs is a map from input sources to "canonical" symbolic expressions,
+        # i.e., unary expressions with symbols that correspond to regular Dims (i.e.,
+        # not derived Dims)
+        _defs: Dict[Source, sympy.Expr] = {}
+        object.__setattr__(self, "_defs", _defs)
+
+        for source1, source2 in self.source_pairs:
+            # preprocess into a union-find forest
+            self._union(self._find(source1), self._find(source2))
+        for source, root, fn in self.derived_equalities:
+            # preprocess into a transitively-closed map
+            # NOTE(avik): we reuse the union-find forest for canonicalizing input sources
+            if isinstance(root, sympy.Symbol):
+                self._defs[self._find(source)] = fn(root)
+            else:
+                self._defs[self._find(source)] = fn(self._rewrite(root))
+
+    def _find(self, source):
+        # chase edges to find the root of this equivalence class
+        if source in self._parents:
+            return self._find(self._parents[source])
+        else:
+            return source
+
+    def _union(self, root1, root2):
+        # merge two equivalence classes by adding an edge from one root to the other
+        if root1 != root2:
+            self._parents[root1] = root2
+
+    def _rewrite(self, src):
+        # always represent the given source by the root of its equivalence class
+        src = self._find(src)
+        if src in self._defs:
+            # simply look up the definition if it exists
+            # NOTE(avik): This works because definitions are always transitively-closed;
+            # otherwise we would have to do recursive rewriting.
+            return self._defs[src]
+        else:
+            # otherwise, create a symbol representing the source
+            return sympy.Symbol(src.name())
+
+    def is_equal(self, source1, source2):
+        return (
+            # check whether source1 and source2 have the same root
+            self._find(source1) == self._find(source2) or
+            # check whether source1 is derived equal to source2
+            self.is_derived(source1, source2, lambda x: x)
+        )
+
+    def is_derived(self, src, symbol_src, fn):
+        # check whether both src and symbol_src have the same definition
+        return self._rewrite(src) == fn(self._rewrite(symbol_src))
+
+
+def _assert_symbol_context(symbolic_context):
+    assert isinstance(symbolic_context, SymbolicContext), "Invalid symbolic_context object"
+    assert type(symbolic_context) is not SymbolicContext, "Illegal usage of symbolic_context ABC"
+
+
+@dataclass(frozen=True)
+class SymbolicContext:
+    """
+    Data structure specifying how we should create symbols in
+    ``create_symbolic_sizes_strides_storage_offset``; e.g., should
+    they be static or dynamic.
+
+    This is an abstract base class because we are probably going to add
+    another version of this that says "use exactly these SymInts, don't
+    allocate fresh symbols."
+    """
+    pass
+
+
+@dataclass(frozen=True)
+class StatelessSymbolicContext(SymbolicContext):
+    """
+    Create symbols in ``create_symbolic_sizes_strides_storage_offset`` via
+    a symbolic_context determination as given by ``DimDynamic`` and ``DimConstraint``.
+    This will cause fresh symbols to be allocated.
+    """
+    dynamic_sizes: DimList[DimDynamic]
+    constraint_sizes: DimList[DimConstraint] = None
+    # If the tensor is a view, this should be populated for the base. It contains
+    # information on how to allocate symbols when recursively fakeifying the base
+    # during view fake-ification.
+    view_base_context: Optional[SymbolicContext] = None
+    # TODO: add storage offset and stride symbolic_context
+
+    def __post_init__(self):
+        if self.constraint_sizes is None:
+            object.__setattr__(self, 'constraint_sizes', [None] * len(self.dynamic_sizes))
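+
+# Illustrative usage sketch (added commentary, an assumed call pattern):
+#   ctx = StatelessSymbolicContext(
+#       dynamic_sizes=[DimDynamic.DYNAMIC, DimDynamic.STATIC],
+#   )
+#   # constraint_sizes defaults to [None, None] via __post_init__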
+
+
+# note [Tensor Fakification and Symbol Caching]
+#
+# As of the time of this note, dynamo creates a fresh fake tensor mode for backends.
+# The reason we do this is because there are certain classes of operations, namely,
+# metadata mutations, that change tensor size, stride, etc. This means that the fake tensor
+# state at the end of a dynamo trace is different than the fake tensor state at the beginning
+# of a trace. Backends like aot_autograd need a fresh fake tensor to correctly track metadata mutation,
+# view relationships, etc.
+#
+# As we create a new fake mode, we also lose the memoization that comes with it. Rather than
+# transfer the memoization cache, we instead transfer the shape env. However, with this
+# comes nuance - as dynamo is selective in how it makes symbolic shapes. Due to strategies in
+# automatic dynamic and constraints, the policy for which dims are dynamic is nuanced and varies across
+# recompilations.
+#
+# In order to preserve the symbolic decisions made during dynamo tensor fakification, we pass
+# a StatefulSymbolicContext at creation time. This object is tracked, per tensor, on the TracingContext.
+# The lifecycle of this object should match the lifecycle of the original dynamo tracked tensor, and it is
+# safe to reuse this object as many times as necessary to create a fake tensor. Fake tensors
+# created with new fake modes should produce the same exact symbols as the original, providing the same shape_env
+# is used.
+# TODO(voz): Shape env validation
+@dataclass(frozen=True)
+class StatefulSymbolicContext(StatelessSymbolicContext):
+    """
+    Create symbols in ``create_symbolic_sizes_strides_storage_offset`` via
+    a symbolic_context determination as given by a cache of Source:Symbol. A cache hit
+    will reuse a stored symbol, and a cache miss will write to this cache.
+
+    This behaves like StatelessSymbolicContext, except the cache supersedes the
+    other values - dynamic_sizes and constraint_sizes will not be read on a
+    cache hit.
+
+    It is the cache owner's responsibility to maintain the lifecycle of the cache
+    w/r/t different shape_envs, clearing, etc.
+    """
+    tensor_source: Source = None
+    # Why is this keyed on int first?
+    # That integer is actually the id of the shape_env. This cache short-circuits symbol
+    # creation, and we must store it per shape env. Now, the tracing invariant is a single
+    # shape env per tracing context, and every new frame gets a new shape_env. So where would we have
+    # multiple shape envs? The answer lies in recording. When we are replaying, replay_shape_env_events
+    # is invoked, and creates a new shape_env. Replaying events against this new shape_env will
+    # cause it to fail with unknown symbols, as the symbols cached here will skip creation, and never
+    # get recorded in var_to_val, etc.
+    # TODO(voz): consider a weakref to the shape_env here
+    shape_env_to_source_to_symbol_cache: Dict[int, Dict["TensorPropertySource", "sympy.Expr"]] = None
+
+    def __post_init__(self):
+        # The None default is annoying, but required because of dataclass limitations
+        assert self.tensor_source is not None
+        if not self.shape_env_to_source_to_symbol_cache:
+            object.__setattr__(self, 'shape_env_to_source_to_symbol_cache', {})
+
+
+@dataclass(frozen=True)
+class SubclassSymbolicContext(StatefulSymbolicContext):
+    """
+    The correct symbolic context for a given inner tensor of a traceable tensor subclass
+    may differ from that of the outer symbolic context. This structure allows for this
+    flexibility, with inner symbolic contexts mapped via attr -> symbolic context.
+    """
+    inner_contexts: Dict[str, SymbolicContext] = None
+
+    def __post_init__(self):
+        super().__post_init__()
+        if self.inner_contexts is None:
+            object.__setattr__(self, 'inner_contexts', {})
+
+
+def is_symbolic(val: Union[int, SymInt, float, SymFloat, bool, SymBool]) -> bool:
+    if isinstance(val, (int, float, bool)):
+        return False
+    return val.node.is_symbolic()
+
+IndicatorTypes = (IsNonOverlappingAndDenseIndicator,)
+
+@lru_cache(256)
+def safe_expand(r):
+    if hasattr(r, 'expand'):
+        try:
+            return sympy.expand(r)
+        except RecursionError:
+            log.warning("RecursionError in sympy.expand(%s)", r)
+            return r
+    else:
+        return r
+
+def error():
+    raise AssertionError("shouldn't be hit")
+
+
+# TODO: Deduplicate this with torch/_prims_common/__init__.py
+def eval_is_non_overlapping_and_dense(sizes, strides):
+    return int(guard_bool(_eval_is_non_overlapping_and_dense(sizes, strides)))
+
+def _eval_is_non_overlapping_and_dense(sizes, strides):
+    dim = len(sizes)
+
+    # Short-circuits for tensors of rank one, which are
+    # non-overlapping and "dense" if their stride is one
+    # or it is a 0/1 element tensor
+    if dim == 1:
+        return strides[0] == 1 or sizes[0] < 2
+
+    # Checks that there exists a permutation of the strides s.t. the tensor would be contiguous
+    # Sorts (length, stride) pairs by stride
+    lengths_and_strides = sorted(
+        zip(sizes, strides), key=operator.itemgetter(1)
+    )
+
+    # Unlike the C++ code, we don't move the 0/1 size dimensions to the
+    # end.  So we have to keep going for this code.
+    expected_stride = 1
+    for length, stride in lengths_and_strides:
+
+        if length == 1:
+            continue
+
+        if stride != expected_stride:
+            return False
+
+        expected_stride *= length
+
+    return True
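+
+# Illustrative note (added commentary):
+#   _eval_is_non_overlapping_and_dense([2, 3], [3, 1]) -> True   # contiguous
+#   _eval_is_non_overlapping_and_dense([2, 3], [2, 1]) -> False  # overlapping layout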
+
+
+def cast_symbool_to_symint_guardless(symbool: torch.SymBool) -> torch.SymInt:
+    int_sym = sympy.Piecewise((1, symbool.node.expr), (0, True))
+    return symbool.node.shape_env.create_symintnode(int_sym, hint=int(symbool.node.require_hint()))
+
+SYMPY_INTERP = {
+    'Abs': operator.abs,
+    'Eq': operator.eq,
+    'Ne': operator.ne,
+    'Gt': operator.gt,
+    'Lt': operator.lt,
+    'Le': operator.le,
+    'Ge': operator.ge,
+    'Min': min,
+    'Max': max,
+    'Mod': operator.mod,
+    'FloorDiv': operator.floordiv,
+    'TrueDiv': operator.truediv,
+    'IsNonOverlappingAndDenseIndicator': eval_is_non_overlapping_and_dense,
+    'floor': math.floor,
+    'ceiling': math.ceil,
+    'cast_symbool_to_symint_guardless': cast_symbool_to_symint_guardless,
+    'Round': builtins.round,
+    'RoundDecimal': builtins.round,
+}
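+
+# Illustrative note (added commentary): SYMPY_INTERP maps sympy function names
+# to Python callables used when interpreting guard/assert expressions, e.g.
+#   SYMPY_INTERP['FloorDiv'](7, 2) == 3 and SYMPY_INTERP['Max'](2, 5) == 5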
+
+
+def _lru_cache(fn, maxsize=None):
+    """
+    Wrapper around lru_cache that clears when new info about shapes has been
+    updated.
+
+    Use lru_cache if the output is always the same, regardless of the
+    constraints we know now (i.e. evaluate_expr).
+
+    Use _lru_cache otherwise.
+
+    Also note that this depends on _update_version_counter being called on the
+    shape environment whenever the constraints are updated, otherwise the cache
+    will not be cleared.
+    """
+    fn_cache = lru_cache(maxsize)(fn)
+    prior_version = 0
+
+    if config.validate_shape_env_version_key:
+        prior_key = None
+
+        @functools.wraps(fn)
+        def wrapper(self, *args, **kwargs):
+            nonlocal prior_version, prior_key
+            if prior_key is None:
+                prior_key = self._get_key()
+
+            if prior_version != self._version_counter:
+                fn_cache.cache_clear()
+                prior_version = self._version_counter
+                prior_key = self._get_key()
+            else:
+                assert prior_key == self._get_key(), \
+                    "ShapeEnv cache key changed without version being updated!"
+
+            return fn_cache(self, *args, **kwargs)
+
+    else:
+
+        @functools.wraps(fn)
+        def wrapper(self, *args, **kwargs):
+            nonlocal prior_version
+            if prior_version != self._version_counter:
+                fn_cache.cache_clear()
+                prior_version = self._version_counter
+
+            return fn_cache(self, *args, **kwargs)
+
+    wrapper.cache_clear = fn_cache.cache_clear
+    wrapper.cache_info = fn_cache.cache_info  # type: ignore[attr-defined]
+    return wrapper
+
+
+# This is pretty similar to ShapeGuard but it also comes with a message,
+# and is exclusively used for things that MUST be true (unlike guards,
+# which can evaluate False, in which case you just choose not to use
+# a particular specialization)
+@dataclass(frozen=True)
+class RuntimeAssert:
+    expr: sympy.Expr
+    msg: str = field(repr=False)
+    stack: str = field(repr=False)
+
+
+class ShapeGuardPrinter(StrPrinter):
+    def __init__(
+        self,
+        symbol_to_source,
+        source_ref,
+        var_to_sources,
+    ):
+        super().__init__()
+        self.symbol_to_source = symbol_to_source
+        self.source_ref = source_ref
+        self.var_to_sources = var_to_sources
+
+    def _print_Not(self, expr):
+        return 'not %s' % (self.parenthesize(expr.args[0], PRECEDENCE["Not"]))
+
+    def _print_And(self, expr):
+        return self.stringify(expr.args, " and ", PRECEDENCE["And"])
+
+    def _print_Or(self, expr):
+        return self.stringify(expr.args, " or ", PRECEDENCE["Or"])
+
+    def _print_Symbol(self, expr) -> str:
+        assert isinstance(expr, sympy.Symbol), str(type(expr))
+
+        def repr_symbol_to_source():
+            return repr({
+                symbol: [s.name() for s in sources]
+                for symbol, sources in self.symbol_to_source.items()
+            })
+
+        assert self.symbol_to_source.get(expr), (
+            f"{expr} (could be from {[s.name() for s in self.var_to_sources[expr]]}) "
+            f"not in {repr_symbol_to_source()}.  If this assert is failing, it could be "
+            "due to the issue described in https://github.com/pytorch/pytorch/pull/90665"
+        )
+        return self.source_ref(self.symbol_to_source[expr][0])
+
+
+class LoggingShapeGuardPrinter(ShapeGuardPrinter):
+    def __init__(self, var_to_sources):
+        super().__init__(var_to_sources, lambda n: n.name(), var_to_sources)
+
+
+class DynamicDimConstraintPrinter(StrPrinter):
+    """
+    Printer for dynamic dim constraints.
+    - Instead of t.size()[d] it prints dynamic_dim(t, d)
+    - Instead of Eq(_, _), Mod(_, _), etc. it prints _ == _, _ % _, etc.
+
+    We use this to suggest code for specifying dynamic dim constraints.
+    """
+    def __init__(self, symbol_to_source, source_name_to_debug_name):
+        super().__init__()
+        self.symbol_to_source = symbol_to_source
+        self.source_name_to_debug_name = source_name_to_debug_name
+
+    def print_source(self, source) -> str:
+        if self.source_name_to_debug_name:
+            return source.name()
+        return f"dynamic_dim({source.base.name()}, {source.idx})"
+
+    def _print_Symbol(self, expr) -> str:
+        assert isinstance(expr, sympy.Symbol), str(type(expr))
+        assert self.symbol_to_source.get(expr), (
+            f"Unknown symbol {expr} created by constraints solver"
+        )
+        return self.print_source(self.symbol_to_source[expr][0])
+
+    def _print_Relational(self, expr):
+        return '{} {} {}'.format(
+            self.parenthesize(expr.lhs, precedence(expr)),
+            expr.rel_op,
+            self.parenthesize(expr.rhs, precedence(expr))
+        )
+
+
+class DimConstraints:
+    """
+    Custom solver for a system of constraints on symbolic dimensions.
+    Solutions are "static" values or simplified "dynamic" constraints.
+    """
+
+    def __init__(self, symbol_to_source, var_to_val, marked_dynamic, source_name_to_debug_name):
+        # We try to solve systems of inequalities with 1 free variable.
+        self._univariate_inequalities: Dict[sympy.Symbol, Set[sympy.Expr]] = defaultdict(set)
+        # Among them, we prioritize solving for a free variable that has equalities.
+        # NOTE: _symbols_with_equalities is always a subset of _univariate_inequalities.keys()
+        # and removing a symbol from the former => removing it from the latter.
+        self._symbols_with_equalities: Set[sympy.Symbol] = set()
+        # A solution of a free variable with equalities becomes a substitution.
+        # We use these substitutions to simplify other constraints.
+        # NOTE: removing a symbol from _symbols_with_equalities => adding it to _substitutions.
+        self._substitutions: Dict[sympy.Symbol, sympy.Integer] = {}
+
+        # In general, constraints may have // and % operations.
+        # Of course, // can be expressed in terms of / and %.
+        # Our inequality solver can handle / but not %. So we need to transform them away.
+        # We do so by using the values of variables as hints to evaluate %.
+        # For soundness we record additional congruence guards and solve them separately.
+        self._var_to_val: Dict[sympy.Symbol, sympy.Integer] = var_to_val
+        self._congruences: Dict[sympy.Symbol, Set[sympy.Expr]] = defaultdict(set)
+
+        # We do not try to (directly) solve inequalities with > 1 free variables.
+        # NOTE: free variables in these inequalities cannot also be in _substitutions.
+        self._multivariate_inequalities: Set[sympy.Expr] = set()
+
+        # We park external equalities between free variables here.
+        self._symbolic_equivalences: List[Tuple[Source, sympy.Expr]] = []
+
+        # Solutions come in two forms:
+        # - (static) specializations
+        # - (dynamic) inequalities / congruences
+        self._static_results: Set[str] = set()
+        self._dynamic_results: Set[str] = set()
+
+        # printer for solutions
+        self._dcp = DynamicDimConstraintPrinter(symbol_to_source, source_name_to_debug_name)
+
+        # inconsistencies found on substituting with concrete values / static solutions
+        self._inconsistencies: List[str] = []
+
+        # symbols that are marked dynamic
+        self._marked_dynamic = marked_dynamic
+
+    def rewrite_with_congruences(self, s, expr):
+        """
+        Eliminate expressions of the form b // d and b % d while adding congruences of the form b % d == k.
+        This leaves rational operators (in particular of the form b / d) that our inequality solver can handle.
+        We solve the added congruences separately (using our congruence solver, see below).
+        """
+        def mod_handler(*args):
+            # Suppose that we have an expression of the form b % d with free variable s.
+            # Using the value of s as a "hint," we can evaluate b % d to a value k.
+            # Then we can rewrite b % d to k while adding the guard b % d == k.
+
+            # NOTE(avik): This abstraction is provably sound but, in general, incomplete. It is complete IFF
+            # the original expression always evaluates to a constant value (i.e., it does not vary with s).
+            # In other words,
+            # - solutions of s with the rewritten expression are guaranteed to also be solutions of s with
+            #   the original expression;
+            # - while it may be possible to find solutions of s with the original expression that are not
+            #   solutions with the rewritten expression, in that case the original expression cannot evaluate
+            #   to the same value for all solutions of s.
+            #
+            # Should we be worried about this incompleteness? No, because of the following reasons:
+            # 1. It unblocks dramatic simplification that would not be otherwise possible with current tech
+            #    (i.e., "don't let perfect be the enemy of the good").
+            # 2. We already have a tradition of using hints to add guards in the compiler for making progress.
+            # 3. We have not yet seen a counterexample arise in practice! In particular, any congruence guards
+            #    we generate (or simplify to) seem to be of the form b % d == k where k is a constant.
+            #
+            # Here's a theoretical counterexample: 3*s % (s + 1) == s - 2, that is satisfied by all s >= 2.
+            # With any hint (say) s = k, we'd rewrite this to: 3*s % (s + 1) == k - 2. But, substituting, we
+            # would then get k - 2 == s - 2, and thus s = k as the (only, constant) solution!
+            base, divisor = args
+            base, divisor = self.rewrite_with_congruences(s, base), self.rewrite_with_congruences(s, divisor)
+            mod_reduced = base.subs(self._var_to_val) % divisor.subs(self._var_to_val)
+            congruence = (base - mod_reduced) % divisor
+            if congruence != 0:
+                self._congruences[s].add(congruence)
+            return mod_reduced
+
+        def floor_div_handler(*args):
+            # Suppose that we have an expression of the form b // d with free variable s.
+            # Using the value of s, we can evaluate b % d to a value k.
+            # Then we can rewrite b // d to (b - k) / d, while adding the guard b % d == k.
+
+            # NOTE(avik): This is exactly equivalent to rewriting b // d as (b - (b % d)) / d
+            # and eliminating b % d as above.
+            base, divisor = args
+            base, divisor = self.rewrite_with_congruences(s, base), self.rewrite_with_congruences(s, divisor)
+            mod_reduced = base.subs(self._var_to_val) % divisor.subs(self._var_to_val)
+            congruence = (base - mod_reduced) % divisor
+            if congruence != 0:
+                self._congruences[s].add(congruence)
+            return (base - mod_reduced) / divisor
+
+        if expr.has(Mod):
+            expr = expr.replace(Mod, mod_handler)
+        if expr.has(FloorDiv):
+            expr = expr.replace(FloorDiv, floor_div_handler)
+        return expr
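+
+    # Illustrative sketch (added commentary; s is a hypothetical free symbol with
+    # hint value 6): rewriting s // 2 produces s/2 and records the congruence
+    # Mod(s, 2) == 0, which is then handled by the congruence solver.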
+
+    def add(self, expr) -> bool:
+        """Add an expression to the set of constraints.
+
+        Return whether the expression is a trivial constraint (i.e., an obvious tautology).
+        """
+        if expr == sympy.true:
+            return True
+        orig_expr = expr
+        orig_reduced = orig_expr.subs(self._var_to_val)
+        # TODO(avik): https://github.com/pytorch/pytorch/issues/101093
+        # It is possible that `expr` will fail the consistency check because of
+        # precision errors. Specifically, on substituting its free symbols with
+        # their concrete values, we might end up comparing floats. Until we have
+        # a fix for this issue, we delay raising such failures. See solve().
+        if orig_reduced == sympy.false:
+            self._inconsistencies.append(f"{orig_expr} is inconsistent!")
+        if isinstance(expr, sympy.Ne):
+            # we're not going to do anything useful with these, so drop them
+            return False
+        free_symbols = expr.free_symbols
+        assert free_symbols, f"Did not expect constraint with no free variables: {expr}"
+        if len(free_symbols) > 1:
+            # multivariate: record and move on
+            self._multivariate_inequalities.add(expr)
+        else:
+            # univariate: can solve these immediately
+            s = next(iter(free_symbols))
+            # eliminate // and % (see documentation of `rewrite_with_congruences` above)
+            old_n_congruences = len(self._congruences[s])
+            expr = self.rewrite_with_congruences(s, expr)
+            new_n_congruences = len(self._congruences[s])
+            if expr == sympy.true:
+                return old_n_congruences == new_n_congruences
+            reduced = expr.subs(self._var_to_val)
+            if reduced == sympy.false:
+                self._inconsistencies.append(
+                    f"{expr}, obtained by rewriting {orig_expr} with congruences, "
+                    "is inconsistent!"
+                )
+            if isinstance(expr, sympy.Eq):
+                # special status for symbols that have equalities (see `solve` below)
+                self._symbols_with_equalities.add(s)
+            self._univariate_inequalities[s].add(expr)
+        return False
+
+    def add_equality(self, source, expr):
+        """Add an equality constraint"""
+        if expr.is_number:
+            # specialization, right here
+            self._static_results.add(f"{source.name()} == {expr}")
+        else:
+            # these will resolve to either specializations or dynamic equality constraints
+            self._symbolic_equivalences.append((source, expr))
+
+    def _reduce_congruences(self):
+        reduced_congruences = {}
+        for s, congruences in self._congruences.items():
+            remainder_modulus_pairs = []
+            congruences_to_check = set()
+            for congruence in congruences:
+                base, divisor = congruence.args
+                # We are given a congruence of the form base % divisor == 0 with a free variable s. So:
+                # - we transform this into an equation of the form base = divisor * tmp;
+                # - we solve this equation for s to get a linear solution with free variable tmp.
+                tmp = sympy.Symbol("tmp", integer=True)
+                symbol, solution = sympy.solve_linear(base - divisor * tmp, symbols=[s])
+                # See https://docs.sympy.org/latest/modules/solvers/solvers.html#sympy.solvers.solvers.solve_linear
+                # for how to interpret the results.
+                if s == symbol:
+                    # This means the solution is of the form s = modulus*tmp + remainder.
+                    modulus, remainder = sympy.polys.polytools.div(solution, tmp)
+                    if isinstance(modulus, sympy.Integer) and isinstance(remainder, sympy.Integer):
+                        # Make sure 0 <= remainder <= modulus.
+                        remainder = remainder % modulus
+                        remainder_modulus_pairs.append((remainder, modulus))
+                        continue
+                # This means that we did not get a unique solution to the equation.
+                # No problem, we will check it.
+                congruences_to_check.add(congruence)
+            # Finally we solve for a congruence s such that s = r_i mod m_i for each (r_i, m_i).
+            # The solution will be a congruence of the form s = r mod m.
+            # NOTE(avik): Since the given m_i may not be pairwise coprime, we can't just use CRT.
+            if remainder_modulus_pairs:
+                remainder, modulus = sympy.ntheory.modular.solve_congruence(*remainder_modulus_pairs)
+                reduced_congruences[s] = {(s - remainder) % modulus}
+                substitution = {s: modulus * sympy.Symbol("tmp", integer=True) + remainder}
+                reduced_congruences[s].update(
+                    congruence for congruence in congruences_to_check
+                    if not sympy.checksol(congruence, substitution)
+                )
+            else:
+                reduced_congruences[s] = congruences_to_check
+
+        return reduced_congruences
+
+    def _raise_inconsistencies(self):
+        if self._inconsistencies:
+            msg = "\n".join(self._inconsistencies)
+            self._inconsistencies.clear()
+            raise ValueError(f"The following inconsistencies were found:\n{msg}")
+
+    def _force_specialization(self, s):
+        val = self._var_to_val[s]
+        self._static_results.add(f"{self._dcp.symbol_to_source[s][0].name()} == {val}")
+        self._substitutions[s] = val
+
+    def _specialize_divisor_symbols(self):
+        for expr in self._multivariate_inequalities:
+            for atom in expr.atoms(FloorDiv, Mod):
+                _, divisor = atom.args
+                for s in divisor.free_symbols:
+                    self._force_specialization(s)
+
+        multivariate_inequalities = self._multivariate_inequalities
+        self._multivariate_inequalities = set()
+        for expr in multivariate_inequalities:
+            self.add(expr.subs(self._substitutions))
+        self._raise_inconsistencies()
+        self._univariate_inequalities = {
+            s: exprs
+            for s, exprs in self._univariate_inequalities.items()
+            if s not in self._substitutions
+        }
+        self._congruences = {
+            s: congruences
+            for s, congruences in self._congruences.items()
+            if s not in self._substitutions
+        }
+
+    def solve(self, disable_congruences=True, disable_equivalences=True):
+        """Solve the system of constraint equations to find simplified constraints
+        """
+        self._raise_inconsistencies()
+        # as long as there are symbols with equalities, solve for them
+        # NOTE(avik): this is guaranteed to terminate (#iterations <= #symbols)
+        while self._symbols_with_equalities:
+            s = self._symbols_with_equalities.pop()
+            exprs = self._univariate_inequalities.pop(s)
+            solution = sympy.solvers.inequalities.reduce_inequalities(exprs, s)
+            if isinstance(solution, sympy.And):
+                solution = next((arg for arg in solution.args if isinstance(arg, sympy.Eq)), solution)
+            assert isinstance(solution, sympy.Eq), f"Expected an equality constraint for {s}, got {solution}"
+            symbol, val = solution.args
+            assert symbol == s, f"Expected a constraint on {s} instead of on {symbol}"
+            # because this is univariate, the solution is a specialization
+            self._static_results.add(f"{self._dcp.symbol_to_source[s][0].name()} == {val}")
+            # add this as a substitution to simplify other constraints
+            self._substitutions[s] = val
+
+            # simplify multivariate inequalities: some of them will now become univariate!
+            multivariate_inequalities = self._multivariate_inequalities
+            self._multivariate_inequalities = set()
+            for expr in multivariate_inequalities:
+                self.add(expr.subs(s, self._substitutions[s]))
+            self._raise_inconsistencies()
+
+        self._specialize_divisor_symbols()
+
+        # solve linear congruences
+        # NOTE(avik): We do not need to solve them for symbols that have already been specialized.
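+        # Illustration (added, not part of the original logic): for a supported congruence such
+        # as (s + 1) % 3 == 0, base = s + 1 and divisor = 3, so try_solve(Eq(s + 1, 3*tmp), s)
+        # solves to s = 3*tmp - 1, which is emitted below as the dynamic constraint s == 3*tmp - 1,
+        # where tmp is a fresh ConstantSource-backed symbol named after the debug name of s.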
+        reduced_congruences = self._reduce_congruences()
+        for s, congruences in reduced_congruences.items():
+            for congruence in congruences:
+                # any congruence that cannot be checked becomes a dynamic constraint as well
+                if s not in self._substitutions or not sympy.checksol(congruence, {s: self._substitutions[s]}):
+                    if self._is_supported_congruence(congruence):
+                        base, divisor = congruence.args
+                        tmp_name = f"_{self._dcp.source_name_to_debug_name[self._dcp.symbol_to_source[s][0].name()]}"
+                        tmp = sympy.Symbol(tmp_name, integer=True)
+                        from torch._dynamo.source import ConstantSource
+                        self._dcp.symbol_to_source[tmp] = [ConstantSource(tmp_name)]
+                        r = try_solve(sympy.Eq(base, divisor * tmp), s)
+                        self._dynamic_results.add(self._dcp.doprint(sympy.Eq(s, r[1])))
+                    elif disable_congruences:
+                        self._force_specialization(s)
+                        self._univariate_inequalities.pop(s, None)
+
+        # remaining symbols have only pure inequalities (no equalities)
+        for s, exprs in self._univariate_inequalities.items():
+            try:
+                solution = sympy.solvers.inequalities.reduce_inequalities(exprs, s)
+                # because this is univariate, the solution is a dynamic (range) constraint
+                if isinstance(solution, sympy.Or):
+                    solution = next(iter(arg for arg in solution.args if arg.subs(self._var_to_val)))
+                if isinstance(solution, sympy.And):
+                    for arg in solution.args:
+                        self._dynamic_results.add(self._dcp.doprint(arg))
+                else:
+                    self._dynamic_results.add(self._dcp.doprint(solution))
+            except (NotImplementedError, AssertionError) as e:
+                log.warning("Failed to reduce inequalities: %s", e)
+                for expr in exprs:
+                    self._dynamic_results.add(self._dcp.doprint(expr))
+
+        # simplify symbolic equivalences: some of them will now become specializations!
+        symbolic_equivalences = self._symbolic_equivalences
+        self._symbolic_equivalences = []
+        for source, expr in symbolic_equivalences:
+            if disable_equivalences and not self._is_supported_equivalence(expr):
+                for s in expr.free_symbols:
+                    self._force_specialization(s)
+                    sexpr = self._dcp._print_Symbol(s)
+                    self._dynamic_results = {r for r in self._dynamic_results if sexpr not in r}
+            self.add_equality(source, expr.subs(self._substitutions))
+
+        # remaining symbolic equivalences become dynamic equality constraints
+        for source, expr in self._symbolic_equivalences:
+            self._dynamic_results.add(f"{self._dcp.print_source(source)} == {self._dcp.doprint(expr)}")
+
+    @classmethod
+    def _is_supported_equivalence(cls, expr):
+        # Currently supported Dim ops are linear expressions with integer coefficients.
+        # So check that expr only contains +, *, ints, and a single occurrence of a symbol.
+        # (See also documentation of dynamic_shapes._DerivedDim.)
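+        # Illustrative examples: 2*dim + 1 and 3*dim are supported; dim + other_dim and dim*dim
+        # are not, since they involve more than one symbol or a non-linear term.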
+        if isinstance(expr, (sympy.Add, sympy.Mul)):
+            lhs, rhs = expr.args
+            return (
+                (cls._is_supported_equivalence(lhs) and isinstance(rhs, sympy.Integer)) or
+                (isinstance(lhs, sympy.Integer) and cls._is_supported_equivalence(rhs))
+            )
+        return isinstance(expr, sympy.Symbol)
+
+    @classmethod
+    def _is_supported_congruence(cls, congruence):
+        base, divisor = congruence.args
+        # Congruences that can be currently expressed with supported Dim ops are
+        # of the form (x + a) % b == 0, where x is a Dim and a and b are constants.
+        # This allows us to derive x as b*y - a for some Dim y.
+        # (See also documentation of dynamic_shapes._DerivedDim.)
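+        # Illustrative examples: (dim + 3) % 4 == 0 is supported (giving dim = 4*y - 3 for some
+        # Dim y), whereas (2*dim) % 3 == 0 is not, since its base is not of the form x + a.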
+        if isinstance(base, sympy.Add):
+            lhs, rhs = base.args
+            cond = (
+                (isinstance(lhs, sympy.Symbol) and isinstance(rhs, sympy.Integer)) or
+                (isinstance(lhs, sympy.Integer) and isinstance(rhs, sympy.Symbol))
+            )
+        else:
+            cond = isinstance(base, sympy.Symbol)
+        cond = cond and isinstance(divisor, sympy.Integer)
+        return cond
+
+    def forced_specializations(self):
+        """Returns a dictionary of the names of symbols to their specialized value
+        """
+        def debug_name(src):
+            name = src.name()
+            if self._dcp.source_name_to_debug_name:
+                return f"{self._dcp.source_name_to_debug_name[name]} = {name}"
+            else:
+                return name
+
+        return {
+            debug_name(self._dcp.symbol_to_source[s][0]): val
+            for s, val in self._substitutions.items()
+            if s in self._marked_dynamic
+        }
+
+    def remove_redundant_dynamic_results(self):
+        """Remove constraints of the form 2 <= dynamic_dim(...) as 2 is the default
+        lower bound.
+        """
+        candidates_for_removal = []
+        dynamic_results = set()
+        for dc in self._dynamic_results:
+            # Instead of 2 <= dynamic_dim(...) simply suggest dynamic_dim(...).
+            # There is no change in behavior since 2 is the default lower bound.
+            dc_ = re.sub(r"2 <= dynamic_dim(.+)", r"dynamic_dim\1", dc)
+            if dc != dc_:
+                candidates_for_removal.append(dc_)
+            else:
+                dynamic_results.add(dc_)
+        for dc in candidates_for_removal:
+            # remove dynamic_dim(t, 0) as a constraint when dynamic_dim(t, 0) also
+            # appears as part of another constraint
+            found = False
+            for other_dc in dynamic_results:
+                if dc in other_dc:
+                    found = True
+            if not found:
+                dynamic_results.add(dc)
+        self._dynamic_results = dynamic_results
+
+    def prettify_results(
+        self,
+        original_signature: inspect.Signature,
+        constraint_violation_error=None,
+        forced_specializations=None,
+    ):
+        """Format a message for constraint violation erros"""
+        if self._dcp.source_name_to_debug_name:
+            def transform(s):
+                for k, v in self._dcp.source_name_to_debug_name.items():
+                    s = s.replace(k, v)
+                return s
+
+            results = defaultdict(dict)
+
+            def flip(op):
+                if op == "<=":
+                    return ">="
+                if op == ">=":
+                    return "<="
+                if op == "<":
+                    return ">"
+                if op == ">":
+                    return "<"
+                assert op == "=="
+                return op
+
+            def relation_with_digit(expr, op, digit):
+                if op == "<=":
+                    results[expr]["max"] = digit
+                elif op == "<":
+                    results[expr]["max"] = digit - 1
+                elif op == ">=":
+                    results[expr]["min"] = digit
+                elif op == ">":
+                    results[expr]["min"] = digit + 1
+                else:
+                    assert op == "=="
+                    results[expr]["eq"] = digit
+
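+            # Illustration (not in the original code): a transformed constraint "dim <= 128"
+            # records results["dim"]["max"] = 128, while "2 < dim" is flipped to "dim > 2"
+            # and records results["dim"]["min"] = 3.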
+            for s in self._static_results.union(self._dynamic_results):
+                t = transform(s)
+                if t == s:
+                    continue
+                left, op, right = re.split(r"( == | <= | >= | < | > )", t)
+                op = op.strip()
+                if op == "==" and left == right:
+                    continue
+                if right.isdigit():
+                    relation_with_digit(left, op, int(right))
+                elif left.isdigit():
+                    relation_with_digit(right, flip(op), int(left))
+                else:
+                    assert op == "=="
+                    results[left]["eq"] = sympy.sympify(right)
+
+            buf = ""
+            debug_names = set()
+            if forced_specializations:
+                debug_names.update(k.split(" = ")[0] for k in forced_specializations.keys())
+                buf += (
+                    f"Specializations unexpectedly required ({', '.join(debug_names)})! "
+                    "For more information, run with TORCH_LOGS=\"+dynamic\".\n"
+                )
+                for s, val in forced_specializations.items():
+                    buf += f"  - {s} must be specialized to {val} because the guards generated for it are too complex.\n"
+
+            dims = []
+            others = []
+            match = None
+            if constraint_violation_error:
+                match = re.search(r"Constraints violated \((.*)\)", constraint_violation_error.args[0])
+            if match is not None:
+                debug_names.update(match.expand(r'\1').split(', '))
+
+            for k, c in sorted(results.items()):
+                # if k not in debug_names:
+                #     continue
+                if "eq" in c:
+                    other = c["eq"]
+                    if isinstance(other, int):
+                        others.append(f"{k} = None  # {other}")
+                    elif self._is_supported_equivalence(other):
+                        s = next(iter(other.free_symbols))
+                        if s not in results:
+                            modulus, remainder = sympy.polys.polytools.div(other, s)
+                            c_min = c.get("min", 2)
+                            min_ = math.ceil((c_min - remainder) / modulus)
+                            c_max = c.get("max", sys.maxsize - 1)
+                            max_ = math.floor((c_max - remainder) / modulus)
+                            dims.append(f"{s} = Dim('{s}', min={min_}, max={max_})  # {c_min} <= {other} <= {c_max}")
+                        others.append(f"{k} = {other}")
+                else:
+                    min_ = c.get("min", None)
+                    if min_ == 2:
+                        min_ = None
+                    max_ = c.get("max", None)
+                    if min_ is not None and max_ is not None:
+                        dims.append(f"{k} = Dim('{k}', min={min_}, max={max_})")
+                    elif min_ is not None:
+                        dims.append(f"{k} = Dim('{k}', min={min_})")
+                    elif max_ is not None:
+                        dims.append(f"{k} = Dim('{k}', max={max_})")
+                    else:
+                        dims.append(f"{k} = Dim('{k}')")
+
+            buf += "\nSuggested fixes:\n  "
+            buf += "\n  ".join(dims + others)
+
+            return buf
+
+        # Note: Model inputs are wrapped as LocalSource in dynamo.
+        # LocalSource.name() wraps the name with L[""]. We use regular
+        # expression to do the replacement to avoid traversing up
+        # the source hierarchy manually.
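+        # Example (illustrative): "L['x'].size()[0] <= 8" is rewritten by the helper below to
+        # ("x", "x.size()[0] <= 8").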
+        def extract_and_rewrite_local(dc):
+            match = re.search(r"L\['(.+?)'\]", dc)
+            if match is None:
+                return
+            arg = match.expand(r'\1')
+            dc = re.sub(r"L\['(.+?)'\]", r'\1', dc)
+            return arg, dc
+
+        def group(results, args_index):
+            groups = defaultdict(list)
+            for dc in results:
+                local = extract_and_rewrite_local(dc)
+                if local is None:
+                    # This can happen, e.g., with `assume_constant_result`.
+                    # In that case, we drop the constraint.
+                    # TODO(avik) Maybe we should generate an assertion here?
+                    continue
+                arg, dc = local
+                if arg in args_index:
+                    groups[args_index[arg]].append(dc)
+                else:
+                    # This can happen, e.g., with decorators that change the signature.
+                    # In that case, we drop the constraint. Seems hard to do better. :/
+                    # TODO(avik) Maybe warn that `arg` is not in `signature`?
+                    continue
+            sorted_groups = []
+            for idx, dcs in sorted(groups.items()):
+                _, arg = idx
+                sorted_groups.append((arg, sorted(dcs)))
+            return sorted_groups
+
+        signature = original_signature.replace(return_annotation=inspect.Signature.empty)
+        args_index = {}
+        for i, arg in enumerate(signature.parameters.keys()):
+            args_index[arg] = (i, arg)
+
+        def print_results(grouped, indent, result_fn):
+            nonlocal buf
+
+            space = False
+            for arg, results in grouped:
+                if space:
+                    buf += "\n"
+                else:
+                    space = True
+                buf += f"\n{indent}# {arg}:"
+                for result in results:
+                    buf += f"\n{indent}{result_fn(result)}"
+
+        buf = ""
+        if forced_specializations:
+            buf += (
+                "Some dynamic dimensions need to be specialized because "
+                "the constraints inferred for them are too complex to specify.\n"
+            )
+            for s, val in forced_specializations.items():
+                buf += f"  - {s}, which was marked dynamic, must be specialized to {val}.\n"
+        indent = 4 * " "
+        if self._static_results:
+            grouped_static_results = group(self._static_results, args_index)
+            buf += "\nThe following dimensions have been specialized and CANNOT be dynamic."
+            buf += f"\n```\ndef specializations{str(signature)}:"
+            print_results(
+                grouped_static_results,
+                indent,
+                lambda result: f"assert {result}",
+            )
+            buf += "\n```\n"
+        if self._dynamic_results:
+            grouped_dynamic_results = group(self._dynamic_results, args_index)
+            buf += "\nThe following dimensions CAN be dynamic."
+            buf += "\nPlease use the following code to specify the constraints they must satisfy:"
+            buf += f"\n```\ndef specify_constraints{str(signature)}:"
+            buf += f"\n{indent}return ["
+            print_results(
+                grouped_dynamic_results,
+                indent * 2,
+                lambda result: f"{result},",
+            )
+            buf += f"\n{indent}]\n```\n"
+        return buf
+
+
+TLS = threading.local()
+
+
+class ShapeEnv:
+    # This is a wrapper over the actual __init__ function.
+    #
+    # Where to add a new constructor parameter to ShapeEnv?
+    # =====================================================
+    # This __init__ function should be used only for parameters related to event recording.
+    # These are parameters that we don't wish to pass down the road to new ShapeEnv instances
+    # created from replaying events.
+    #
+    # If you wish to add a parameter to the constructor of ShapeEnv, unrelated to event
+    # recording, do so in the _init function.
+    def __init__(
+        self, *,
+        should_record_events: Optional[bool] = None,
+        tracked_fakes: Optional[List[Any]] = None,
+        **kwargs
+    ) -> None:
+        self._init(**kwargs)
+
+        # Disable event recording when replaying.
+        kwargs["should_record_events"] = False
+
+        from torch.fx.experimental.validator import translation_validation_enabled
+        self._translation_validation_enabled = translation_validation_enabled()
+
+        # If not specified, enable event recording if both:
+        #   - Translation validation is on
+        #   - Translation validation bisection is not disabled
+        self.should_record_events = (
+            should_record_events
+            if should_record_events is not None
+            else (
+                self._translation_validation_enabled
+                and not config.translation_validation_no_bisect
+            )
+        )
+
+        # Enable event recording check if both:
+        #   - It should record events
+        #   - The recording check is enabled
+        self.check_recorded_events = (
+            self.should_record_events and config.check_shape_env_recorded_events
+        )
+
+        # This will make sure we only record the top-level function call.
+        self.is_recording = not self.should_record_events
+        # Keep track of the list of tracked fakes.
+        self.tracked_fakes = tracked_fakes
+        # List of events for reconstructing ShapeEnv at arbitrary points in time.
+        self.events: List[ShapeEnvEvent] = (
+            [ShapeEnvEvent(ShapeEnv, kwargs=kwargs)] if self.should_record_events else []
+        )
+
+    # Pro-tip: if you add a new field to ShapeEnv, this affects some accept
+    # tests.  Accept their output with:
+    #
+    #   EXPECTTEST_ACCEPT=1 python test/dynamo/test_dynamic_shapes.py -k test_shape_env_equal
+    #
+    def _init(
+        self, *,
+        allow_scalar_outputs=True,
+        allow_dynamic_output_shape_ops=True,
+        # NB: These are legacy configuration options that help us make good choices
+        # when the constraint/dynamic dims are not explicitly passed to us.
+        # Ideally we will fix all call sites to be explicit and not have
+        # implicit choices, but this apparently was pretty involved.
+        assume_static_by_default=False,
+        # Note - On 0/1 specialization
+        #
+        # The following options affect decisions we make about eager
+        # specialization.  Disabling them will increase trace time (as we do
+        # more symbolic reasoning) and can also harm the quality of generated
+        # code (because inductor may not be able to specialize for bounds
+        # being equal--although if we later respecialize because of a guard,
+        # your code may be just as good as it was before.)
+        #
+        # When True, eagerly specialize input sizes which have 0/1.
+        specialize_zero_one=True,
+        # When True, assume input sizes which have the same size are
+        # symbolically equal.
+        duck_shape=True,
+        # For debugging
+        co_fields=None,
+        # XXX Add any new settings that could affect FakeTensor evaluation
+        # to: torch._subclasses.fake_tensor._ShapeEnvSettings
+    ):
+        # Not directly used by ShapeEnv; indirectly used by FakeTensor
+        self.allow_scalar_outputs = allow_scalar_outputs
+        self.allow_dynamic_output_shape_ops = allow_dynamic_output_shape_ops
+        self.guards: List[ShapeGuard] = []
+        # Maps symbolic ints to their original concrete values
+        # Currently populated from tensors
+        self.var_to_val: Dict[sympy.Symbol, sympy.Integer] = {}
+        # Maps symbolic ints to their min/max range.  These ranges
+        # are conservative: the int MUST fall in the range, but the
+        # range may contain ints which may not actually appear in
+        # practice
+        self.var_to_range: Dict[sympy.Symbol, ValueRanges] = {}
+        self.source_name_to_debug_name: Dict[str, str] = {}
+        self.var_to_sources: Dict[sympy.Symbol, List[Source]] = {}
+        self.var_to_stack: Dict[sympy.Symbol, CapturedTraceback] = {}
+        # Maps from sympy ints to expressions representing them
+        # Populated from equality guards (i.e. a.shape[0] == b.shape[0])
+        self.replacements: Dict[sympy.Symbol, sympy.Expr] = {}
+        # Set holds a % b expressions that evaluate to 0.
+        self.divisible: Set[sympy.Expr] = set()
+        # Set that holds "size-like" symbols.  When we perform
+        # "size-oblivious" tests, these can be assumed to be >= 2.
+        self.size_like: Set[sympy.Symbol] = set()
+        # Duck-shaping says that if two input tensors have the same size,
+        # they get assigned the same symbolic variable
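+        # (Illustration: with duck shaping, a tensor of shape (4, 4) may be assigned the sizes
+        # (s0, s0) rather than two distinct symbols, since both dimensions have the same value.)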
+        self.val_to_var: Dict[int, sympy.Expr] = {}
+        if specialize_zero_one:
+            self.val_to_var = {0: sympy.Integer(0), 1: sympy.Integer(1)}
+        self.unbacked_symfloat_counter = itertools.count()
+        self.unbacked_symint_counter = itertools.count()
+        # Similar to guards, but these MUST evaluate to true and can
+        # only be evaluated at runtime midway through (i.e., they always
+        # involve unbacked symints)
+        #
+        # For efficiency reasons, we index in the following way.  Suppose you have
+        # a runtime assert i0 + i1 <= s1.  We pick the most recently allocated
+        # symbol in the source expression and add the assert to the list for
+        # that symbol e.g., {i1: [i0 + i1 <= s1]}.
+        #
+        # We access the runtime asserts in two situations:
+        #
+        #   - When we are guarding on an expression, we will attempt to
+        #     statically evaluate it, in case the unbacked SymInts can
+        #     simplify away.  If we have a runtime assert, we may be able
+        #     to discharge the guard entirely.  We only need to attempt
+        #     runtime asserts that mention freevars of the expression in
+        #     question.
+        #
+        #   - When we are performing codegen (in Inductor for eager, or
+        #     when finalizing the export FX graph), we need to know what
+        #     extra runtime asserts to insert.  Whenever an unbacked
+        #     SymInt comes into scope, all runtime asserts involving it
+        #     become eligible for insertion (so long as all of their other
+        #     free unbacked symbols are also in scope).  We technically
+        #     can handle any choice of key by kicking inexpressible asserts
+        #     to the next unbacked symbol to wait on, but if we choose the
+        #     latest key, an assert will only show up at the moment when
+        #     we can actually codegen it.
+        self.deferred_runtime_asserts: Dict[sympy.Symbol, List[RuntimeAssert]] = {}
+        # This exists so we can efficiently invalidate the cache (it's used as
+        # part of the cache key); otherwise we'd have to iterate through
+        # deferred_runtime_asserts to compute its length
+        self.num_deferred_runtime_asserts = 0
+        self.assume_static_by_default = assume_static_by_default
+        self.specialize_zero_one = specialize_zero_one
+        self.duck_shape = duck_shape
+        self.log = log
+        self.log.debug("create_env")
+        self.frozen = False
+        self.dim_constraints: Optional[DimConstraints] = None
+        self.counter = collections.Counter()
+        # Mapping from sympy.Symbol to the number of guards which mention this
+        # symbol
+        self.symbol_guard_counter = collections.Counter()
+        # A selection of important fields on co_field; solely used for
+        # signpost_event
+        self.co_fields = co_fields if co_fields else {}
+
+        # Version counter used to invalidate cached values
+        self._prev_cache_key = self._get_key()
+        self._version_counter = 0
+
+        # Cache for FX nodes.
+    # Maps an already built node to a tuple of:
+        #   1. node's target
+        #   2. list of arguments
+        # This drastically reduces the size of the FX graph, avoiding
+        # duplicated nodes.
+        self.fx_node_cache: Dict[Tuple[Callable, Tuple[Any, ...]], torch.fx.Node] = {}
+        self.source_to_symbol: Dict[str, sympy.Symbol] = {}
+
+        from torch.fx.experimental.validator import translation_validation_enabled
+        self._translation_validation_enabled = translation_validation_enabled()
+
+        if self._translation_validation_enabled:
+            from torch.fx.experimental.validator import TranslationValidator
+
+            self.validator = TranslationValidator()
+            self.graph = torch.fx.Graph()
+            # Create an output graph and start inserting before that.
+            # This is needed when 'deepcopy'-ing this object.
+            self.graph.inserting_before(self.graph.output(None))
+
+            # Mapping of each node name to the node itself.
+            #
+            # This is useful for matching an FX node from a recorded ShapeEnv.graph
+            # to the FX node of the ShapeEnv we are running the event on.
+            #
+            # Whenever you add a node to self.graph, you must add a mapping to this
+            # variable. Otherwise, the built FX graph on the replayed ShapeEnv will
+            # not be valid.
+            self.name_to_node: Dict[str, torch.fx.Node] = {}
+
+    def check_equal(self, other: "ShapeEnv") -> None:
+        """Compare another ShapeEnv for equivalence
+        """
+        # ShapeEnv fields that are not relevant for the outcome of
+        # ShapeEnv.produce_guards call:
+        #   - Debugging variables
+        #   - Translation validation related variables
+        #   - Events recording related variables
+        non_state_variable_names = (
+            "counter",
+            "log",
+            "var_to_stack",
+            "fx_node_cache",
+            "graph",
+            "validator",
+            "check_recorded_events",
+            "should_record_events",
+            "is_recording",
+            "tracked_fakes",
+            "events",
+            "source_name_to_debug_name",
+            "_prev_cache_key",
+            "_version_counter",
+        )
+
+        # Mapping of the value of each to-be-compared field into the values that
+        # should actually be compared.
+        #
+        # You should modify this if, for example, a field holds both state and
+        # debugging information, e.g. ShapeGuard holds the actual guard (sympy.Expr)
+        # and the stack when it was added to the set of guards. In order to compare
+        # it, we throw away the stack information.
+        def map_value(key: str, value: Any) -> Any:
+            if key in ("unbacked_symfloat_counter", "unbacked_symint_counter"):
+                from copy import copy
+
+                # For itertools.count(), we compare the next integer returned
+                # by the count iterators. Note that we need to copy the iterator
+                # first. Otherwise we are mutating the object.
+                return next(copy(value))
+            elif key == "guards":
+                # Transform the list of ShapeGuard into a list of expressions.
+                return [g.expr for g in value]
+            elif key == "deferred_runtime_asserts":
+                # Transform the list of RuntimeAsserts into a list of expressions.
+                return {s: [ra.expr for ra in ras] for s, ras in value.items()}
+            elif key == "name_to_node":
+                # Just compare that the sets of keys are the same.
+                return set(value.keys())
+            elif key == "symbol_guard_counter":
+                # Skip this for comparisons
+                return None
+            return value
+
+        shape_env_check_state_equal(self, other, non_state_variable_names, map_value)
+
+    def _snapshot_tracked_fakes(self) -> Optional[List[Any]]:
+        if self.tracked_fakes is None:
+            return None
+
+        from torch._dynamo.variables.builder import TrackedFake
+
+        def maybe_transform_fake(fake: TrackedFake):
+            inner_fake = fake.fake \
+                if isinstance(fake.fake, torch.SymInt) \
+                else FakeTensorMeta.from_fake(fake.fake)
+            # Even though TrackedFake accepts either a Union[SymInt, FakeTensor], here we give it a
+            # FakeTensorMeta for two reasons:
+            #   1. this is all the information we need when recording ShapeEnvEvents.
+            #   2. it works even if each TrackedFake changes its metadata.
+            return TrackedFake(inner_fake, fake.source, fake.symbolic_context)  # type: ignore[arg-type]
+
+        return [maybe_transform_fake(fake) for fake in self.tracked_fakes]
+
+    def _last_event_index(self) -> int:
+        return len(self.events) - 1
+
+    @contextmanager
+    def _recording(self):
+        self.is_recording = True
+        try:
+            yield
+        finally:
+            self.is_recording = False
+
+    @record_shapeenv_event()
+    def freeze(self):
+        """Freeze this ShapeEnv to stop accumulating guards
+
+        A frozen ShapeEnv will ignore any further guards generated on it,
+        only emitting a warning; this may lead to accuracy problems.
+        """
+        self.frozen = True
+
+    def _create_symbol_for_source(self, source: Source) -> Optional[sympy.Symbol]:
+        if not self._translation_validation_enabled:
+            return None
+        srcname = source.name()
+        if srcname not in self.source_to_symbol:
+            self.source_to_symbol[srcname] = sympy.Symbol(srcname, integer=True)
+        return self.source_to_symbol[srcname]
+
+    def _add_z3var(self, symbol: sympy.Symbol, type: Type) -> None:
+        if self._translation_validation_enabled:
+            self.validator.add_var(symbol, type)
+
+    def _add_target_expr(self, expr) -> None:
+        if self._translation_validation_enabled:
+            self.validator.add_target_expr(expr)
+
+    def _add_assertion(self, expr) -> None:
+        if self._translation_validation_enabled:
+            self.validator.add_assertion(expr)
+
+    def _check_translation_validate(self) -> None:
+        if self._translation_validation_enabled:
+            self.validator.validate()
+
+    @record_shapeenv_event()
+    def _create_fx_call_function(
+            self,
+            op: Callable,
+            args: Tuple,
+    ) -> Tuple[Optional[torch.fx.Node], bool]:
+        # Cache this tuple in order to avoid duplicated nodes.
+        node_key = (op, args)
+        # Flags whether the returned node was cached or not.
+        fresh = False
+
+        if self._translation_validation_enabled and node_key not in self.fx_node_cache:
+            from torch.fx.experimental.validator import z3op
+
+            # Presence of None in the arguments implies that we should ignore this operation.
+            if any(a is None for a in args):
+                # We check if we are not mixing SymNode that should not be ignored
+                # (fx_node is not None) with those that should (fx_node is None).
+                assert all(not isinstance(a, torch.fx.Node) for a in args)
+                return None, fresh
+
+            fresh = True
+            lifted_op = z3op(op, self.validator)
+
+            # If translation validation is enabled, all arguments must have their
+            # own FX node.
+            assert all(a is not None for a in args), f"missing arg in FX graph ({op.__name__}): {args}"
+            node = self.fx_node_cache[node_key] = self.graph.call_function(lifted_op, args)
+            self.name_to_node[node.name] = node
+
+        return self.fx_node_cache.get(node_key, None), fresh
+
+    def _create_fx_placeholder_and_z3var(
+            self,
+            symbol: sympy.Symbol,
+            type: Type,
+    ) -> Optional[torch.fx.Node]:
+        if not self._translation_validation_enabled:
+            return None
+
+        node_key = (self.graph.placeholder, (symbol,))
+
+        # Check whether we have already added this symbol.
+        # If so, skip the placeholder creation, as creating
+        # it again would generate invalid Python code.
+        if node_key not in self.fx_node_cache:
+            # Add a Z3 variable according to 'type'.
+            self._add_z3var(symbol, type)
+            # Create the FX placeholder out of a mangled name.
+            mangled_name = re.sub(r'[^a-zA-Z0-9]', '_', re.sub(r'[()]', '', symbol.name))
+            node = self.fx_node_cache[node_key] = self.graph.placeholder(mangled_name)
+            self.name_to_node[node.name] = node
+            # Attach the 'symbol' to the placeholder so that we can retrieve
+            # the Z3 variable later.
+            node.meta["symbol"] = symbol
+
+        return self.fx_node_cache[node_key]
+
+    def _remove_fx_node(self, node: Optional[torch.fx.Node]) -> None:
+        if self._translation_validation_enabled and node is not None:
+            self.name_to_node.pop(node.name)
+            self.graph.erase_node(node)
+
+    def _add_fx_node_metadata(self, node: torch.fx.Node) -> None:
+        from torch._dynamo.utils import get_current_node
+
+        if self.should_record_events:
+            node.meta[SHAPEENV_EVENT_KEY] = self._last_event_index()
+            node.meta[CURRENT_NODE_KEY] = get_current_node()
+
+    def _suppress_guards_tls(self):
+        return getattr(TLS, "suppress_guards", False)
+
+    @record_shapeenv_event()
+    def _suppress_guards_enter(self):
+        TLS.suppress_guards = True
+
+    @record_shapeenv_event()
+    def _suppress_guards_exit(self):
+        TLS.suppress_guards = False
+
+    @contextmanager
+    def suppress_guards(self):
+        """Context manager to ignore all guards generated inside"""
+        self._suppress_guards_enter()
+        try:
+            yield
+        finally:
+            self._suppress_guards_exit()
+
+    def _get_key(self):
+        """
+        Defines the current "state" of the guards we've accumulated in this ShapeEnv.
+        Determines when we need to invalidate our cache
+        """
+        return (len(self.replacements), len(self.divisible), self.num_deferred_runtime_asserts)
+
+    def _update_version_counter(self):
+        # The shape environment is queried orders of magnitude more often than
+        # it is changed, so we summarise the cache key into a linearly
+        # increasing version counter which is cheaper to check in _lru_cache
+
+        # Only update version counter if the state actually changed
+        cur_key = self._get_key()
+        if self._prev_cache_key != cur_key:
+            self._prev_cache_key = cur_key
+            self._version_counter += 1
+
+    def _produce_dyn_sizes(self,
+                           ex_size: Sequence[int],
+                           source: Source,
+                           symbolic_context: SymbolicContext
+                           ) -> List[sympy.Expr]:
+        return self._produce_dyn_sizes_from_int_tuple(tuple(ex_size), source, symbolic_context)
+
+    def _produce_dyn_sizes_from_int_tuple(self,
+                                          tensor_size: Tuple[int],
+                                          source: Source,
+                                          symbolic_context: SymbolicContext,
+                                          ) -> List[sympy.Expr]:
+        assert all(not is_symbolic(val) for val in tensor_size), f"Expect size to be a plain tuple of ints but got {tensor_size}"
+        from torch._dynamo.source import TensorPropertySource, TensorProperty
+        _assert_symbol_context(symbolic_context)
+        dynamic_dims = symbolic_context.dynamic_sizes
+        constraint_dims = symbolic_context.constraint_sizes
+        size = []
+        for i, val in enumerate(tensor_size):
+            size.append(self.create_symbol(
+                val,
+                TensorPropertySource(source, TensorProperty.SIZE, i),
+                dynamic_dims[i],
+                constraint_dims[i],
+                symbolic_context=symbolic_context
+            ))
+        return size
+
+    def create_symbolic_sizes_strides_storage_offset(
+        self,
+        ex: torch.Tensor,
+        source: Source,
+        *,
+        symbolic_context: Optional[SymbolicContext] = None,
+    ):
+        """
+        Returns a list of symbolic sizes and strides for the given tensor.
+        We try our best to express stride in terms of the sizes, so as to not
+        introduce new symbolic variables.
+        """
+
+        # Dynamo may want to wrap FakeTensors that already have SymInt sizes, e.g. in make_fx(opt_f(), tracing_mode="symbolic").
+        # We create symbols in shape_env using the backed hints behind SymInt.
+
+        # Case 1: when SymInt is backed, dynamo can proceed with FakeTensors that have concrete shape.
+        # produce_guards will trigger specializations on the outer stuff
+
+        # Case 2: when the SymInt is unbacked, we will throw a data-dependent error in require_hint().
+        #
+        # It's probably good for now but it's important to note that this approach has implications for
+        # the original shape_env when checking guards in different order.
+
+        # Example:
+        # ---------
+        # Consider a function "opt_f" as shown below:
+
+        # @torch.compile()
+        # def opt_f(x: bool, y: Tensor):
+        #   if x == True:
+        #     return y + torch.randn([4])
+        #   else:
+        #     return y
+        # Depending on the sequence of calls, we might install two different sets of guards:
+
+        # 1. opt_f(False, y):
+        #    - "x == False" (always works for any size y)
+
+        # 2. opt_f(True, y):
+        #    - Triggers recompilation and results in guards like:
+        #      - "x == True and y.size(0) == 4"
+        #      - (or "y.size(0) == 4 and x == True")
+
+        # The order of checking the guards matters. In this specific example:
+        # If the True-branch guard check precedes the False-branch one, and within the True branch the
+        # y.size(0) check precedes the x == True check, we may end up with an unnecessary shape specialization for y.
+        def maybe_specialize_sym_int_with_hint(maybe_sym) -> int:
+            assert isinstance(maybe_sym, (int, torch.SymInt))
+            if is_symbolic(maybe_sym):
+                assert maybe_sym.node.shape_env is not self, \
+                    "expect the symbol is created from an shape env other than current one."
+                return maybe_sym.node.require_hint()
+            return maybe_sym
+
+        ex_size = tuple(maybe_specialize_sym_int_with_hint(sz) for sz in ex.size())
+        ex_stride = tuple(maybe_specialize_sym_int_with_hint(sd) for sd in ex.stride())
+        ex_storage_offset = maybe_specialize_sym_int_with_hint(ex.storage_offset())
+
+        return self._create_symbolic_sizes_strides_storage_offset(
+            ex_size,
+            ex_stride,
+            ex_storage_offset,
+            [_is_dim_dynamic(ex, i) for i in range(ex.dim())],
+            source,
+            symbolic_context=symbolic_context,
+        )
+
+    @record_shapeenv_event()
+    def _create_symbolic_sizes_strides_storage_offset(
+        self,
+        ex_size: Sequence[int],
+        ex_stride: Sequence[int],
+        ex_storage_offset: int,
+        is_dim_dynamic: Sequence[bool],
+        source: Source,
+        *,
+        symbolic_context: Optional[SymbolicContext] = None,
+    ):
+        dim = len(ex_size)
+
+        # Reimplement the legacy behavior
+        if symbolic_context is None:
+            constraint_dims = [None] * dim
+            dynamic_dims = []
+            for i in range(dim):
+                # NB: This is encapsulation breaking!  Legacy behavior was
+                # bad.
+                if is_dim_dynamic[i]:
+                    r = DimDynamic.DYNAMIC
+                elif self.assume_static_by_default:
+                    r = DimDynamic.STATIC
+                else:
+                    r = DimDynamic.DUCK
+                dynamic_dims.append(r)
+            dynamic_dims = [DimDynamic.DUCK] * dim
+            # symbolic_context is None - set one
+            symbolic_context = StatelessSymbolicContext(dynamic_sizes=dynamic_dims, constraint_sizes=constraint_dims)
+        # We got a StatelessSymbolicContext
+        _assert_symbol_context(symbolic_context)
+        constraint_dims = symbolic_context.constraint_sizes
+        dynamic_dims = symbolic_context.dynamic_sizes
+
+        # TODO: make this configurable from outside symbolic_context; we made a
+        # decision here that if all sizes are static, we are going to
+        # specialize all of the inner strides/offset too. We don't have to
+        # do this, and arguably we should ALWAYS allow for dynamic offset,
+        # this is cheap.
+        # TODO: This should be DYNAMIC, using DUCK for BC
+        dynamic_strides_offset = DimDynamic.STATIC if all(r == DimDynamic.STATIC for r in dynamic_dims) else DimDynamic.DUCK
+
+        assert len(dynamic_dims) == dim, f"{len(dynamic_dims)} != {dim}"
+        assert len(constraint_dims) == dim
+
+        from torch._dynamo.source import TensorPropertySource, TensorProperty
+        size: List[sympy.Expr] = self._produce_dyn_sizes_from_int_tuple(ex_size, source, symbolic_context)
+        stride: List[Optional[sympy.Expr]] = [None] * len(size)
+        for i, val in enumerate(ex_stride):
+            if val in (0, 1):
+                stride[i] = sympy.Integer(val)
+        while any(x is None for x in stride):
+            candidates = {
+                ex_size[i] * ex_stride[i]: size[i] * stride[i]
+                for i in range(len(size))
+                if stride[i] is not None and ex_stride[i] >= 0
+            }
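+            # Illustrative note (added): for a contiguous tensor of size (2, 3) with strides
+            # (3, 1), stride[1] is seeded with 1 above, candidates becomes {3: size[1]}, and
+            # the loop below resolves stride[0] = size[1] instead of allocating a new symbol.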
+
+            # iterate over unbound strides in sorted order
+            def _nested_int_aware_sort(tup):
+                return (
+                    # Order nested ints by their coefficients.
+                    # 1 here to order nested ints after non-nested-ints.
+                    (1, tup[0].node.nested_int_coeff(), tup[1]) if is_nested_int(tup[0])
+                    else (0, *tup)
+                )
+            val_list = sorted(
+                [(ex_stride[i], i) for i in range(len(stride)) if stride[i] is None],
+                key=_nested_int_aware_sort,
+            )
+            for _, i in val_list:
+                if stride[i] is None and ex_stride[i] in candidates:
+                    stride[i] = candidates[ex_stride[i]]
+                    candidates[ex_size[i] * ex_stride[i]] = size[i] * stride[i]
+
+            if any(x is None for x in stride):
+                # bind the smallest unbound stride to a new variable
+                val, i = min(
+                    [
+                        (ex_stride[i], i)
+                        for i in range(len(stride))
+                        if stride[i] is None
+                    ], key=_nested_int_aware_sort
+                )
+                stride[i] = self.create_symbol(
+                    val,
+                    TensorPropertySource(source, TensorProperty.STRIDE, i),
+                    dynamic_dim=dynamic_strides_offset,
+                    constraint_dim=None,
+                    symbolic_context=symbolic_context,
+                )
+        assert all(x is not None for x in stride)
+
+        sym_sizes = [
+            self.create_symintnode(
+                sym,
+                hint=hint,
+                source=TensorPropertySource(source, TensorProperty.SIZE, i),
+            )
+            for i, (sym, hint) in enumerate(zip(size, ex_size))
+        ]
+        sym_stride = []
+        for i, stride_expr in enumerate(stride):
+            # NB: Don't duck size the stride; instead use the expression
+            # we computed
+            assert stride_expr is not None
+            sym_stride.append(self.create_symintnode(
+                stride_expr, hint=ex_stride[i], source=TensorPropertySource(source, TensorProperty.STRIDE, i)))
+        sym_storage_offset = self.create_symintnode(
+            self.create_symbol(
+                ex_storage_offset,
+                TensorPropertySource(source, TensorProperty.STORAGE_OFFSET),
+                dynamic_dim=dynamic_strides_offset,
+                constraint_dim=None,
+                symbolic_context=symbolic_context
+            ),
+            hint=ex_storage_offset,
+            source=TensorPropertySource(source, TensorProperty.STORAGE_OFFSET))
+        return tuple(sym_sizes), tuple(sym_stride), sym_storage_offset
+
+    @record_shapeenv_event()
+    def create_symintnode(
+            self,
+            sym: "sympy.Expr",
+            *,
+            hint: Optional[int],
+            source: Optional[Source] = None,
+    ):
+        """Create a SymInt value from a symbolic expression
+
+        If you know what the current hint value of the SymInt to be created
+        is, pass it into hint.  Otherwise, pass None and we will make our best
+        guess
+
+        """
+        source_name = source.name() if source else None
+
+        if self._translation_validation_enabled and source is not None:
+            # Create a new symbol for this source.
+            symbol = self._create_symbol_for_source(source)
+            assert symbol is not None
+
+            # Create a new FX placeholder and Z3 variable for 'symbol'.
+            fx_node = self._create_fx_placeholder_and_z3var(symbol, int)
+
+            # Add an equality assertion for the newly created symbol and 'sym'.
+            self._add_assertion(sympy.Eq(symbol, sym))
+        else:
+            fx_node = None
+
+        if isinstance(sym, sympy.Integer):
+            if hint is not None:
+                assert int(sym) == hint
+            out = int(sym)
+        else:
+            out = SymInt(SymNode(sym, self, int, hint, fx_node=fx_node))
+        return out
+
+    @record_shapeenv_event()
+    def create_unspecified_symint_and_symbol(self, value, source, dynamic_dim):
+        """Create a SymInt wrapping a new unspecified symbol"""
+        return self.create_symintnode(
+            self.create_unspecified_symbol(
+                value,
+                source=source,
+                dynamic_dim=dynamic_dim,
+            ),
+            hint=value,
+            source=source,
+        )
+
+    def create_symboolnode(self, sym: "sympy.Expr"):
+        """Create a SymBool object from a sympy boolean expression"""
+        # This function is only being used in serialization, so we do not track it
+        # for validation.
+        return SymBool(SymNode(sym, self, bool, None))
+
+    def _log_create_unbacked_symbol(self, prefix: str, symbol, vr: ValueRanges):
+        is_debug = config.extended_debug_create_symbol is not None and str(symbol) in config.extended_debug_create_symbol.split(',')
+        fsummary, maybe_user_loc, maybe_extra_debug = self._get_stack_summary(is_debug)
+        log.info(
+            "%s %s [%s, %s]%s (%s)%s",
+            prefix, symbol, vr.lower, vr.upper, maybe_user_loc, format_frame(fsummary), maybe_extra_debug, stack_info=is_debug
+        )
+
+    @record_shapeenv_event()
+    def create_unbacked_symfloat(self):
+        """Create a symbolic float without a hint value
+        """
+        symbol: sympy.Symbol = sympy.Symbol(f"f{next(self.unbacked_symfloat_counter)}")
+        self.counter["create_unbacked_symbol"] += 1
+        self.var_to_stack[symbol] = CapturedTraceback.extract(skip=1)
+        vr = self.var_to_range[symbol] = ValueRanges.unknown()
+
+        # Create a new FX placeholder and Z3 variable for 'symbol'.
+        fx_node = self._create_fx_placeholder_and_z3var(symbol, float)
+
+        self._log_create_unbacked_symbol("create_unbacked_symfloat", symbol, vr)
+
+        return SymFloat(SymNode(symbol, self, float, None, fx_node=fx_node))
+
+    @record_shapeenv_event()
+    def create_unbacked_symint(self):
+        """Create a symbolic integer without a hint value
+        """
+        symbol: sympy.Symbol = sympy.Symbol(f"u{next(self.unbacked_symint_counter)}", integer=True)
+        self.counter["create_unbacked_symbol"] += 1
+        self.var_to_stack[symbol] = CapturedTraceback.extract(skip=1)
+        vr = self.var_to_range[symbol] = self._default_unspecified_value_range()
+
+        # Create a new FX placeholder and Z3 variable for 'symbol'.
+        fx_node = self._create_fx_placeholder_and_z3var(symbol, int)
+
+        self._log_create_unbacked_symbol("create_unbacked_symint", symbol, vr)
+
+        return SymInt(SymNode(symbol, self, int, None, fx_node=fx_node))
+
+    def is_unbacked_symint(self, symbol: sympy.Symbol) -> bool:
+        """Check if a sympy symbol matches the naming convention for unbacked symbols
+        """
+        # NB: keep synced with free_unbacked_symbols
+        return str(symbol).startswith("u")
+
+    @record_shapeenv_event()
+    def create_unbacked_symbool(self):
+        """Create a symbolic boolean without a hint value
+        """
+        symbol: sympy.Symbol = sympy.Symbol(f"u{next(self.unbacked_symint_counter)}", integer=True)
+        self.counter["create_unbacked_symbol"] += 1
+        self.var_to_stack[symbol] = CapturedTraceback.extract(skip=1)
+        vr = self.var_to_range[symbol] = ValueRanges(0, 1)
+
+        # Create a new FX placeholder and Z3 variable for 'symbol'.
+        fx_node = self._create_fx_placeholder_and_z3var(symbol, bool)
+
+        self._log_create_unbacked_symbol("create_unbacked_symbool", symbol, vr)
+
+        return SymBool(SymNode(sympy.Eq(symbol, 1), self, bool, None, fx_node=fx_node))
+
+    @record_shapeenv_event()
+    def create_unspecified_symbol(
+        self,
+        val: Union[int, SymInt],
+        source: Source,
+        dynamic_dim: DimDynamic = DimDynamic.DUCK,
+        constraint_dim: DimConstraint = None,  # NB: includes None
+    ) -> "sympy.Expr":
+        """Create a symbol with an unspecified value
+
+        Compared to standard symbols we do not assume the value is positive,
+        nor do we specialize on zero or one values.
+        """
+        # 'positive' is None for unspecified symbols, since we can't
+        # assume that it will be either positive or negative.
+
+        # We don't want to specialize zero/one values for unspecified symbols,
+        # so that we always get a new symbol regardless of val.
+        return self.create_symbol(
+            val,
+            source,
+            dynamic_dim,
+            constraint_dim,
+            positive=None,
+            do_not_specialize_zero_one=True,
+            symbolic_context=None)
+
+    @record_shapeenv_event()
+    def create_symbol(
+        self,
+        val: int,
+        source: Source,
+        dynamic_dim: DimDynamic = DimDynamic.DUCK,
+        constraint_dim: DimConstraint = None,  # NB: includes None
+        positive: Optional[bool] = True,
+        do_not_specialize_zero_one: bool = False,
+        symbolic_context=None,
+    ) -> "sympy.Expr":
+        """Create a new symbol which is tracked by this ShapeEnv
+        """
+        # see note [Tensor Fakification and Symbol Caching]
+        source_name = source.name()
+        if (isinstance(symbolic_context, StatefulSymbolicContext)
+                and id(self) not in symbolic_context.shape_env_to_source_to_symbol_cache):
+            symbolic_context.shape_env_to_source_to_symbol_cache[id(self)] = {}
+
+        if (isinstance(symbolic_context, StatefulSymbolicContext)
+                and source_name
+                and (source_name in symbolic_context.shape_env_to_source_to_symbol_cache[id(self)])):
+            return symbolic_context.shape_env_to_source_to_symbol_cache[id(self)][source_name]
+
+        if do_not_specialize_zero_one:
+            specialize_zero_one = False
+        else:
+            specialize_zero_one = self.specialize_zero_one
+
+        assert isinstance(source, Source), f"{type(source)} {source}"
+        assert not (positive and val < 0), f"positive set for negative value: {val}"
+        # It's always sound to allocate a symbol as DYNAMIC.  If the user
+        # constrained the symbol, force it to DYNAMIC, because our
+        # constraint code will do weird stuff if, e.g., it's duck shaped
+        if constraint_dim is not None:
+            dynamic_dim = DimDynamic.DYNAMIC
+
+        if dynamic_dim is DimDynamic.STATIC:
+            out = sympy.Integer(val)
+            if isinstance(symbolic_context, StatefulSymbolicContext) and source_name:
+                symbolic_context.shape_env_to_source_to_symbol_cache[id(self)][source_name] = out
+            return out
+
+        elif dynamic_dim is DimDynamic.DUCK:
+            # duck_shape can be used to globally turn off duck shaping, even
+            # if it was requested
+            duck = self.duck_shape
+        elif dynamic_dim is DimDynamic.DYNAMIC:
+            duck = False
+        else:
+            raise AssertionError(f"unhandled dynamic_dim {dynamic_dim}")
+
+        if val in (0, 1) and specialize_zero_one:
+            r = self.val_to_var[val]
+        elif not duck or val not in self.val_to_var:
+            # If we're not duck shaping, we always create a new symbol
+            # Even if we're duck shaping, if we haven't seen this particular
+            # value before, we also create a new symbol
+            sympy_expr = sympy.Symbol(f"s{len(self.var_to_val)}", positive=positive, integer=True)
+            # We always associate vars to vals
+            if isinstance(val, int):
+                self.var_to_val[sympy_expr] = sympy.Integer(val)
+            else:
+                # Only used for jagged layout nested tensors
+                self.var_to_val[sympy_expr] = SingletonInt(val.node.nested_int(), coeff=val.node.nested_int_coeff())
+
+            # Do the appending later, because we always want to populate this
+            self.var_to_sources[sympy_expr] = []
+            # Create a Z3 variable for the new symbol.
+            self._add_z3var(sympy_expr, int)
+
+            if duck:
+                # Make sure to reuse this symbol for subsequent duck shaping
+                self.val_to_var[val] = sympy_expr
+
+            if isinstance(val, int):
+                if positive:
+                    # Add assertions for the newly created symbols
+                    self._add_assertion(sympy_expr > 1)
+
+                    # Apply default range, which assumes not zero-one
+                    self.var_to_range[sympy_expr] = self._default_value_range()
+                else:
+                    self.var_to_range[sympy_expr] = self._default_unspecified_value_range()
+
+                # Small performance optimization: if we have a min-max constraint,
+                # we can proactively narrow to that range
+                if isinstance(constraint_dim, StrictMinMaxConstraint):
+                    assert not duck
+                    self.var_to_range[sympy_expr] &= constraint_dim.vr
+
+                vr = self.var_to_range[sympy_expr]
+
+                if val not in vr:
+                    raise ConstraintViolationError(f"{val} not in range [{vr.lower}, {vr.upper}]")
+
+                range_str = f"[{vr.lower}, {vr.upper}]"
+            else:
+                # Skip var_range logic for SingletonInt
+                # Only used for jagged layout nested tensors
+                range_str = ""
+
+            r = sympy_expr
+
+            is_debug = (
+                config.extended_debug_create_symbol is not None and
+                str(sympy_expr) in config.extended_debug_create_symbol.split(',')
+            )
+            fsummary, maybe_user_loc, maybe_extra_debug = self._get_stack_summary(is_debug)
+            self.log.info(
+                "create_symbol %s = %s for %s %s%s (%s)%s",
+                sympy_expr, val, source.name(), range_str,
+                maybe_user_loc, format_frame(fsummary), maybe_extra_debug, stack_info=is_debug
+            )
+
+            self.counter["create_symbol"] += 1
+        else:
+            # This implements duck-shaping: input sizes that match are assigned
+            # the same symint
+            r = self.val_to_var[val]
+            self.log.debug("create_symbol %s duck sized %s", r, source.name())
+
+        if isinstance(r, sympy.Symbol):
+            r_sources = self.var_to_sources[r]
+            r_sources.append(source)
+            if not source.is_ephemeral() and r_sources[0].is_ephemeral():
+                # prefer non-ephemeral source first since it may be guarded on later
+                r_sources[0], r_sources[-1] = r_sources[-1], r_sources[0]
+
+            # This ensures we get zeros in symbol_guard_counter, which makes
+            # some queries simpler (since we will accumulate mass on 0 this
+            # way)
+            self.symbol_guard_counter[r] = 0
+
+        if isinstance(symbolic_context, StatefulSymbolicContext) and source_name:
+            symbolic_context.shape_env_to_source_to_symbol_cache[id(self)][source_name] = r
+        return r
+
+    def _debug_name(self, source):
+        src_name = source.name()
+        return self.source_name_to_debug_name.get(src_name, src_name)
+
+    def _render_range_for_constraint_violation(self, source, c):
+        if isinstance(c, StrictMinMaxConstraint):
+            lower, upper = c.vr.lower, c.vr.upper
+            default = self._default_value_range()
+            if lower <= default.lower:
+                lower = None
+            if upper >= default.upper:
+                upper = None
+            c_render = f"{self._debug_name(source)} = {source.name()} in the specified range"
+            if lower is not None and upper is not None:
+                c_render += f" {lower} <= {self._debug_name(source)} <= {upper}"
+            elif lower is None and upper is not None:
+                c_render += f" {self._debug_name(source)} <= {upper}"
+            elif lower is not None and upper is None:
+                c_render += f" {lower} <= {self._debug_name(source)}"
+            return c_render
+        return c.render(source)
+
+    def produce_guards(
+        self,
+        placeholders,
+        sources,
+        source_ref=lambda n: n.name(),
+        *,
+        input_contexts: Optional[DimList[SymbolicContext]] = None,
+        # Encodes user-specified input shape equations of the form s = s' and s = fn(s').
+        # (See docs on EqualityConstraint for details of the encoding.)
+        equalities_inputs: Optional[EqualityConstraint] = None,
+        _simplified=False,
+        # Indicates if we should produce guards for known static values.
+        ignore_static=True,
+    ) -> List[str]:
+        """
+        Generates a list of guard strings which, when evaluated in a context that
+        defines tensors for all the sources, return True or False depending
+        on whether the corresponding guards hold.  Primarily used by Dynamo,
+        but this is also helpful for manual testing of guards (see
+        evaluate_guards_for_args)
+
+        For convenience in testing, a source is allowed to be a str,
+        in which case we will assume it is a LocalSource
+
+        _simplified lets you omit duck sizing, equality, and 0/1 guards.
+        This is useful for testing when you don't care about the boilerplate
+        guards, and it may be helpful for user output too (be careful though;
+        some equality guards are nontrivial!  It would be nice to get simplified
+        output to print them too).  It's private because it's not
+        intended for normal use
+        """
+        self.log.info("produce_guards")
+
+        # Check if we get to the same ShapeEnv state by replaying the recorded events.
+        # This will create a new ShapeEnv instance, and call all recorded function
+        # calls on this new instance. Finally, it will check whether this new instance
+        # has equal state.
+        #
+        # It's important that we do it at the beginning of this function, since it modifies
+        # self.dim_constraints through its execution. Changes that happen in this method
+        # aren't interesting, since this is the function call we wish to reproduce at the
+        # end. If we wish to simply reproduce ShapeEnv instances even after this call,
+        # this method should also be recorded.
+        if self.check_recorded_events:
+            shape_env = replay_shape_env_events(self.events)
+            self.check_equal(shape_env)
+
+        assert len(placeholders) == len(sources), f"len({placeholders}) != len({sources})"
+        Tensorlike = (torch.Tensor, FakeTensorMeta)
+
+        def _create_no_constraints_context(t):
+            return StatelessSymbolicContext(
+                # Ignored; only the constraints part is relevant below.
+                dynamic_sizes=[DimDynamic.DYNAMIC] * t.dim(),
+                constraint_sizes=[None] * t.dim()
+            )
+
+        # Expand optional inputs, or verify invariants are upheld
+        if input_contexts is None:
+            input_contexts = [
+                _create_no_constraints_context(t) if isinstance(t, Tensorlike)
+                else None for t in placeholders
+            ]
+        else:
+            assert len(input_contexts) == len(placeholders)
+            for i, (t, context) in enumerate(zip(placeholders, input_contexts)):
+                if isinstance(t, Tensorlike):
+                    if context is None:
+                        input_contexts[i] = _create_no_constraints_context(t)
+                else:
+                    assert isinstance(t, (SymInt, int))
+                    assert not isinstance(context, list)
+
+        # It took a lot of sweat to figure out the algorithm here.  Let's
+        # explain how it works.
+        #
+        # The ShapeEnv lifecycle looks something like this:
+        #
+        # - For each input, you either generate a fresh Sympy symbol (s0) to
+        #   represent its value (a binding site), or you reuse some
+        #   preexisting symbol or expression, skipping the symbol allocation
+        #   (e.g., duck sizing to a preexisting symbol, or expressing a
+        #   stride as a multiplication of a separate stride and size.)
+        #   Naively, you might expect to bind a fresh Sympy symbol for
+        #   every input, but this is fairly wasteful as most of these
+        #   symbols immediately simplify away, and if you don't eagerly
+        #   specialize, e.g., 0/1 symbols, you end up with very complicated
+        #   expressions that are not optimizable in practice.
+        #
+        # - You perform some compute on these symbols, occasionally
+        #   introducing guards on boolean expressions on these symbols.
+        #   In particular, whenever we guard on equality (_maybe_guard_rel),
+        #   we can simplify shapes; e.g., when s0 == s1 * 2, we can now
+        #   replace all occurrences of s0 with s1 * 2.  Sometimes, a
+        #   boolean expression evaluation doesn't introduce a guard, as
+        #   the guard is already entailed by the simplifications we have
+        #   applied.
+        #
+        # - In the end, you have a bunch of replacements (saying how to
+        #   simplify shapes) and a bunch of guards (all the equality guards
+        #   are trivial, because they're covered by the replacements).
+        #
+        # From the ShapeEnv, we must generate a Python expression that, when
+        # evaluated on a set of inputs, tells us whether or not these boolean
+        # expressions would have evaluated in the same way.  However,
+        # we cannot easily compute this, as we elide recording boolean
+        # expressions when we think they are vacuously true.  Thus, we seek
+        # an approximation: we must generate an expression that, if true, would have
+        # produced an "equivalent" ShapeEnv, which would answer guard
+        # expressions in the same way.
+        #
+        # Our notion of equivalence is a bit subtle.  For example, consider
+        # the ShapeEnv created from an input of size (5, 4) versus (4, 4)
+        # (no other guards.)  Duck sizing would generate (s0, s1) in the first
+        # case but (s0, s0) in the second.  We do NOT assume that size
+        # variables are disjoint; so in fact a graph that assumes the input
+        # could be (s0, s1) subsumes (s0, s0) (setting s0 == s1), but not
+        # vice versa.  However, consider an analogous case (1,) versus (2,).
+        # Duck sizing generates (1,) and (s0,); the (s0,) graph does NOT
+        # subsume the (1,) graph because we assume that any size variable
+        # is NOT 0/1 (and make simplifications according to this; e.g., if
+        # we queried s0 == 0, we would immediately return False without
+        # returning a guard.)
+        #
+        # So, it is perhaps easier to flip things on their head: the guard
+        # expressions we generate here say what simplifications are valid,
+        # and what are not.  Below, we explain each of the guard expressions
+        # we generate.
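+        # Hedged illustration (added commentary, hypothetical source names):
+        # continuing the (5, 4) vs (4, 4) example above, the (4, 4) trace duck
+        # sizes to (s0, s0), so produce_guards emits roughly a guard of the form
+        #
+        #     L['x'].size()[1] == L['x'].size()[0]
+        #
+        # together with range guards such as 2 <= L['x'].size()[0].  A later
+        # (5, 4) input fails that guard and triggers recompilation, while any
+        # (n, n) input with n >= 2 passes.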
+
+        # TODO: Make this more efficient by binding all the size/stride/offsets
+        # to locals before performing tests on them.
+
+        from torch._dynamo.source import TensorPropertySource, TensorProperty, NegateSource
+
+        # Actual codegen must be delayed as we don't necessarily know what
+        # the symbol mapping is
+        input_guards = []
+
+        symbol_to_source = collections.defaultdict(list)
+        symbol_to_constraints = collections.defaultdict(set)
+        constraint_violations : List[Tuple[bool, Callable[[], str]]] = []
+
+        def record_constraint_violation(warn_only, debug_name, msg, hint=None):
+            constraint_violations.append(
+                (warn_only, debug_name, lambda: f"{msg}{hint()}" if hint else msg)
+            )
+
+        def is_dim(src):
+            return isinstance(src, TensorPropertySource) and src.prop is TensorProperty.SIZE
+
+        if equalities_inputs:
+            source_index = {}
+            for i, src in enumerate(sources):
+                source_index[src.name()] = i
+
+            def get_expression(tensor_dim_src):
+                fake = placeholders[source_index[tensor_dim_src.base.name()]]
+                symint = fake.shape[tensor_dim_src.idx]
+                if isinstance(symint, torch.SymInt):
+                    return symint.node.expr
+                else:
+                    assert type(symint) is int, f"Expected int, got {type(symint)}"
+                    return symint
+
+            for src1, src2 in equalities_inputs.source_pairs:
+                expr1, expr2 = get_expression(src1), get_expression(src2)
+                # Check whether given input shape values satisfy a specified equation s = s'.
+                # - Raise when the equation was violated by the given input shape values.
+                # - Otherwise issue a guard to constrain them.
+                concrete_val = self.evaluate_expr(sympy.Eq(expr1, expr2))
+                if not concrete_val:
+                    raise ConstraintViolationError(
+                        f"{src1.name()} = {expr1.subs(self.var_to_val)}"
+                        " is not equal to "
+                        f"{src2.name()} = {expr2.subs(self.var_to_val)}"
+                    )
+
+            for src, root, fn in equalities_inputs.derived_equalities:
+                expr1 = get_expression(src)
+                # recall that root is either a phantom symbol or an input source
+                expr2, debug_name = (
+                    (root, self.var_to_sources[root][0].name()) if isinstance(root, sympy.Symbol)
+                    else (get_expression(root), self._debug_name(root))
+                )
+                expr2_ = fn(expr2)
+                # Check whether given input shape values satisfy a specified equation s = fn(s').
+                # - Raise when the equation was violated by the given input shape values.
+                # - Otherwise issue a guard to constrain them.
+                concrete_val = self.evaluate_expr(sympy.Eq(expr1, expr2_))
+                if not concrete_val:
+                    raise ConstraintViolationError(
+                        f"Expected input {src.name()} to be equal to "
+                        f"{fn(sympy.Symbol(debug_name))}, "
+                        f"where {debug_name} = {expr2.subs(self.var_to_val)}, "
+                        f"but got {expr1.subs(self.var_to_val)}"
+                    )
+
+            for phantom_symbol in equalities_inputs.phantom_symbols:
+                # we created additional phantom symbols that are not input shape dimensions
+                symbol_to_source[phantom_symbol].extend(self.var_to_sources[phantom_symbol])
+
+        # How do we know what the value of s0 is?  Fresh variables can only be
+        # bound by inputs, so there MUST be some other input which binds the
+        # variable.  If there is no such input, this is an error in our
+        # system.  We record where all symbols come from, to help you diagnose
+        # why those symbols didn't occur.
+        #
+        # In fact, generally speaking it is only possible for the "outermost"
+        # user of a ShapeEnv to evaluate the guards, because some inputs may
+        # not be available to inner levels.  For example, Dynamo can guard on
+        # tensors that never actually become graph arguments (they are
+        # pruned).  In this case, only Dynamo knows about these arguments.
+        def track_symint(source, val, constraint=None):
+            log.debug("track_symint %s %s %s", LazyString(source.name), val, constraint)
+            assert not isinstance(val, SymInt) or is_symbolic(val)
+
+            if isinstance(val, SymInt) and val.node.maybe_as_int() is not None:
+                val = val.node.maybe_as_int()
+
+            if isinstance(val, SymInt):
+                s = val.node.expr
+                if isinstance(s, sympy.Symbol):
+                    symbol_to_source[s].append(source)
+                    if constraint is not None:
+                        symbol_to_constraints[s].add(constraint)
+                elif isinstance(-s, sympy.Symbol):
+                    symbol_to_source[-s].append(NegateSource(source))
+                else:
+                    constraint_violated = False
+                    if isinstance(constraint, StrictMinMaxConstraint):
+                        # try inferring the ranges of the expr s
+                        sym_vrs = {x: self.var_to_range.get(x, None) for x in s.free_symbols}
+                        if all(vr is not None for vr in sym_vrs.values()):
+                            expr_vr = bound_sympy(s, sym_vrs)
+                            if expr_vr != constraint.vr:
+                                # the expr and constraint ranges don't match
+                                constraint_violated = True
+                        else:
+                            # some of the free symbols in s don't have ranges
+                            constraint_violated = True
+                    elif isinstance(constraint, RelaxedUnspecConstraint):
+                        if s.is_number:
+                            i = int(s)
+                            # Don't complain about 0/1 specialization, we
+                            # expect to have to compile in this case anyway
+                            if i not in (0, 1):
+                                constraint_violated = True
+                    if constraint_violated:
+                        def hint(s):
+                            sexpr = ShapeGuardPrinter(symbol_to_source, source_ref, self.var_to_sources).doprint(s)
+                            return f"{sexpr}."
+
+                        var_with_range = self._render_range_for_constraint_violation(source, constraint)
+                        msg = (
+                            f"Not all values of {var_with_range} are valid because "
+                            f"{self._debug_name(source)} was inferred to be equal to "
+                        )
+                        record_constraint_violation(
+                            constraint.warn_only,
+                            self._debug_name(source),
+                            msg,
+                            hint=functools.partial(hint, s),
+                        )
+
+                input_guards.append((source, s))
+            else:
+                s = sympy.Integer(val)
+                input_guards.append((source, s))
+                constraint_violated = False
+                if isinstance(constraint, StrictMinMaxConstraint):
+                    constraint_violated = True
+                elif isinstance(constraint, RelaxedUnspecConstraint):
+                    # Don't complain about 0/1 specialization, we
+                    # expect to have to compile in this case anyway
+                    if val not in (0, 1):
+                        constraint_violated = True
+                if constraint_violated:
+                    var_with_range = self._render_range_for_constraint_violation(source, constraint)
+                    msg = (
+                        f"Not all values of {var_with_range} are valid because "
+                        f"{self._debug_name(source)} was inferred to be a constant ({val})."
+                    )
+                    record_constraint_violation(constraint.warn_only, self._debug_name(source), msg)
+
+        for t, source, context in zip(placeholders, sources, input_contexts):
+            if isinstance(source, str):
+                from torch._dynamo.source import LocalSource
+                source = LocalSource(source)
+            assert isinstance(source, Source)
+            if t is None:
+                continue
+            if isinstance(t, (SymInt, int)):
+                track_symint(source, t)
+                continue
+            assert isinstance(t, Tensorlike)
+            if is_traceable_wrapper_subclass(t):
+                from torch._dynamo.source import AttrSource
+
+                assert isinstance(context, SubclassSymbolicContext)
+
+                # For subclasses, we need to track symints on BOTH the outer
+                # and inner tensors.
+                sources_tensors_constraints = [
+                    (source, t, context.constraint_sizes)
+                ]
+                attrs, _ = t.__tensor_flatten__()
+                for attr in attrs:
+                    inner_t = getattr(t, attr)
+                    inner_context = context.inner_contexts[attr]
+                    sources_tensors_constraints.append((
+                        AttrSource(source, attr),
+                        inner_t,
+                        inner_context.constraint_sizes
+                    ))
+            else:
+                sources_tensors_constraints = [(source, t, context.constraint_sizes)]
+
+            for src, curr_t, constraint in sources_tensors_constraints:
+                if is_sparse_any(curr_t):
+                    for i, ss in enumerate(curr_t.size()):
+                        property_source = TensorPropertySource(src, TensorProperty.SIZE, i)
+                        track_symint(property_source, ss, constraint[i])
+                else:
+                    for i, ss in enumerate(curr_t.size()):
+                        property_source = TensorPropertySource(src, TensorProperty.SIZE, i)
+                        track_symint(property_source, ss, constraint[i])
+                    for i, ss in enumerate(curr_t.stride()):
+                        track_symint(TensorPropertySource(src, TensorProperty.STRIDE, i), ss)
+                    track_symint(TensorPropertySource(src, TensorProperty.STORAGE_OFFSET), curr_t.storage_offset())
+
+        # 1. Every input must equal the final simplified symbolic expression
+        #    stored on the placeholder.  Given a placeholder (s0*2, s1),
+        #    if we have an input (2, 3), we must show s0*2 == 2 and s1 == 3.
+        #    This does a lot of work: it covers duck sizing and equality guards.
+        exprs = []
+        self.dim_constraints = DimConstraints(
+            symbol_to_source,
+            self.var_to_val,
+            set(symbol_to_constraints.keys()),
+            self.source_name_to_debug_name,
+        )
+
+        if not _simplified:
+            for source, expr in input_guards:
+                if self._translation_validation_enabled:
+                    # Ignore sources that were not turned into SymInts.
+                    srcname = source.name()
+                    if srcname in self.source_to_symbol:
+                        self._add_target_expr(sympy.Eq(self.source_to_symbol[srcname], expr))
+
+                # Small optimization
+                if (
+                    isinstance(expr, sympy.Symbol) and
+                    symbol_to_source.get(expr) and
+                    source == symbol_to_source[expr][0]
+                ):
+                    continue
+
+                # This logic excludes static values found on tensors from guarding, because
+                # dynamo's check_tensor_fn does that (see guards.cpp).
+                # However, for non-tensor sources, we still need to guard here.
+                if ignore_static and isinstance(source, TensorPropertySource):
+                    if expr.is_number:
+                        self.log.debug("Skipping guard %s", f"{source_ref(source)} == {expr}")
+                        continue
+
+                if is_dim(source):
+                    self.dim_constraints.add_equality(source, expr)
+
+                sexpr = ShapeGuardPrinter(symbol_to_source, source_ref, self.var_to_sources).doprint(expr)
+                exprs.append(f"{source_ref(source)} == {sexpr}")
+                if (
+                    isinstance(source, TensorPropertySource)
+                    and source.prop is TensorProperty.SIZE
+                    and equalities_inputs
+                    and len(expr.free_symbols) == 1
+                ):
+                    symbol = next(iter(expr.free_symbols))
+                    if (
+                        isinstance(expr, sympy.Symbol) and
+                        expr in symbol_to_constraints and
+                        not equalities_inputs.is_equal(source, symbol_to_source[expr][0])
+                    ):
+                        msg = (
+                            f"The values of {self._debug_name(source)} = {source.name()} and "
+                            f"{self._debug_name(symbol_to_source[expr][0])} = {symbol_to_source[expr][0].name()} "
+                            "must always be equal."
+                        )
+                        record_constraint_violation(equalities_inputs.warn_only, self._debug_name(source), msg)
+
+                    if (
+                        not isinstance(expr, sympy.Symbol) and
+                        symbol in symbol_to_constraints and
+                        not equalities_inputs.is_derived(source, symbol_to_source[symbol][0], lambda x: expr.subs(symbol, x))
+                    ):
+                        src = symbol_to_source[symbol][0]
+                        msg = (
+                            f"The values of {self._debug_name(source)} = {source.name()} must always be related to "
+                            f"the values of {self._debug_name(src)} = {src.name()} by "
+                            f"{self._debug_name(source)} = {expr.subs(symbol, sympy.sympify(self._debug_name(src)))}."
+                        )
+                        record_constraint_violation(equalities_inputs.warn_only, self._debug_name(source), msg)
+
+                # NB: Not necessary to report constraint violations here:
+                # constraints are guaranteed to be on symbols (we've already
+                # caught constants and non-atomic expressions), so we only
+                # have relational constraints, but we don't support those
+                # at the moment
+
+        # 2. Every guard must evaluate to True (but remember many guards
+        #    like s0 == s1*2 become trivial due to simplification)
+        issued = set()
+
+        def issue_guard(guard: ShapeGuard) -> None:
+            expr = self.simplify(guard.expr)
+
+            # Avoid re-issuing the same guard.
+            if expr in issued:
+                return
+
+            issued.add(expr)
+
+            try:
+                is_trivial = False
+                if any(is_dim(source) for s in expr.free_symbols for source in symbol_to_source[s]):
+                    is_trivial = self.dim_constraints.add(expr)
+                guard_expr = ShapeGuardPrinter(symbol_to_source, source_ref, self.var_to_sources).doprint(expr)
+                exprs.append(guard_expr)
+                self._add_target_expr(expr)
+                # A non-relational constraint on a single sizevar can violate
+                # a constraint
+                if not is_trivial and len(expr.free_symbols) == 1:
+                    symbol = next(iter(expr.free_symbols))
+                    source = symbol_to_source[symbol][0]
+                    constraints = symbol_to_constraints[symbol]
+                    for c in constraints:
+                        if isinstance(c, StrictMinMaxConstraint):
+                            var_with_range = self._render_range_for_constraint_violation(source, c)
+                            msg = (
+                                f"Not all values of {var_with_range} "
+                                f"satisfy the generated guard {guard_expr}."
+                            )
+                            record_constraint_violation(c.warn_only, self._debug_name(source), msg)
+                        elif isinstance(c, RelaxedUnspecConstraint):
+                            # This is fine; we allow guards here as long as they
+                            # didn't constrain the symbol to a single value (we don't
+                            # actually know this; it depends on our
+                            # ValueRanges reasoning capability)
+                            pass
+                        else:
+                            raise AssertionError(f"unrecognized constraint {c}")
+            except Exception:
+                self.log.warning("Failing guard allocated at: \n%s", ''.join(guard.stack.format()))
+                raise
+
+        # First, issue all the non-trivial guards.
+        for guard in self.guards:
+            if self._maybe_evaluate_static(guard.expr) is not None:
+                continue
+            issue_guard(guard)
+
+        # 3. Every symbol must be within its value range (this handles 0/1
+        # specialization too).
+        for symbol, sources in symbol_to_source.items():
+            r = self.var_to_range.get(symbol)
+            if r is None:
+                continue
+
+            assert sources
+            assert symbol.is_integer
+            bounds = []
+            if r.lower != -sympy.oo:
+                if any(is_dim(source) for source in sources):
+                    self.dim_constraints.add(sympy.Ge(symbol, r.lower))
+                # Only print lower bound in simplified mode if it is not the
+                # default
+                if not _simplified or r.lower != self._default_value_range().lower:
+                    bounds.append(str(r.lower))
+            bounds.append(source_ref(sources[0]))
+            # NB: This looks like an off-by-one error but it's not: the
+            # upper bound may be sys.maxsize - 1 because we intentionally
+            # exclude sys.maxsize from our bounds to deal with direct
+            # == INT_MAX guards, but it's still dumb to actually test it.
+            # Note that you can be off by a pretty large constant and it
+            # won't matter because sizes in practice will be nowhere near
+            # the 64-bit limit.
+            if r.upper != sympy.oo and r.upper < sys.maxsize - 1:
+                if any(is_dim(source) for source in sources):
+                    self.dim_constraints.add(sympy.Le(symbol, r.upper))
+                # nontrivial upper bound is always interesting
+                bounds.append(str(r.upper))
+            if len(bounds) > 1:
+                exprs.append(" <= ".join(bounds))
+
+                # Check constraints
+                constraints = symbol_to_constraints[symbol]
+                for c in constraints:
+                    if isinstance(c, StrictMinMaxConstraint):
+                        # NB: By default, we have a restrictive range
+                        # 2 <= s0 <= sys.maxsize - 1.  But export users generally
+                        # expect to be able to specify nice ranges like [0, oo]
+                        if not (c.vr & self._default_value_range()).issubset(r):
+                            source = sources[0]
+
+                            expr = sympy.And(sympy.Le(r.lower, symbol), sympy.Le(symbol, r.upper))
+                            guard_expr = ShapeGuardPrinter(symbol_to_source, source_ref, self.var_to_sources).doprint(expr)
+                            var_with_range = self._render_range_for_constraint_violation(source, c)
+                            msg = (
+                                f"Not all values of {var_with_range} satisfy the generated guard {guard_expr}"
+                            )
+                            record_constraint_violation(
+                                c.warn_only,
+                                self._debug_name(source),
+                                msg,
+                            )
+
+        if constraint_violations:
+            warn_msgs = []
+            error_msgs = []
+            debug_names = set()
+            for warn_only, debug_name, msg in constraint_violations:
+                if warn_only:
+                    msg = f"  {len(warn_msgs) + 1}. {msg()}"
+                    warn_msgs.append(msg)
+                else:
+                    msg = f"  - {msg()}"
+                    error_msgs.append(msg)
+                    debug_names.add(debug_name)
+            if len(error_msgs) > 0:
+                debug_names = ', '.join(debug_names)
+                err = '\n'.join(error_msgs)
+                raise ConstraintViolationError(
+                    f"Constraints violated ({debug_names})! "
+                    "For more information, run with TORCH_LOGS=\"+dynamic\".\n"
+                    f"{err}"
+                )
+            elif len(warn_msgs) > 0:
+                log.debug("%s Warning only constraints violated", len(warn_msgs))
+
+        signpost_event(
+            "dynamic",
+            "produce_guards",
+            {
+                **self.co_fields,
+                **self.counter,
+                "num_guards": len(exprs),
+                "free_symbols": sum(1 for v in symbol_to_source.values() if v),
+                # The keys are meaningless from an aggregate perspective, so
+                # don't include them.  Biggest first.
+                "symbol_guard_counts": sorted(self.symbol_guard_counter.values(), reverse=True),
+            },
+        )
+
+        if self._translation_validation_enabled:
+            from torch.fx.experimental.validator import PopulateValidator
+
+            # Add all deferred runtime assertions; these are not technically
+            # handled by produce_guards but we need to put them in the target
+            # set
+            for ras in self.deferred_runtime_asserts.values():
+                for ra in ras:
+                    self._add_target_expr(ra.expr)
+
+            # Add value range bound guards for all symbols with no trivial bounds.
+            # Reason: '_maybe_evaluate_static' may eliminate guards based on the
+            # refined value ranges.
+            for sym, vr in self.var_to_range.items():
+                if vr.lower != -sympy.oo:
+                    self._add_target_expr(sympy.Le(vr.lower, sym))
+                if vr.upper != sympy.oo:
+                    self._add_target_expr(sympy.Le(sym, vr.upper))
+
+            # Before validating, populate the input of the validator with the
+            # built FX graph.
+            with fx_traceback.preserve_node_meta():
+                PopulateValidator(self.graph, self.validator).run()
+
+        self._check_translation_validate()
+        return exprs
+
+    def produce_guards_expression(self, placeholders, ignore_static=True):
+        """
+        Expected to be used with evaluate_guards_expression(). Produces the guards
+        for the given placeholders and returns a string expression to be evaluated
+        by evaluate_guards_expression given concrete values for the placeholders.
+        """
+        from torch._dynamo.source import LocalSource
+        arg_names = [f"t{i}" for i in range(len(placeholders))]
+        guards = self.produce_guards(placeholders, [LocalSource(a) for a in arg_names], ignore_static=ignore_static)
+        if guards:
+            return " and ".join(guards)
+        return None
+
+    def evaluate_guards_expression(self, code, args):
+        """
+        Expected to be used with produce_guards_expression(). Evaluates an expression
+        generated by produce_guards_expression for the given concrete args.
+        """
+        arg_names = [f"t{i}" for i in range(len(args))]
+        return eval(code, SYMPY_INTERP, {"L": dict(zip(arg_names, args))})
+
+    def evaluate_guards_for_args(self, placeholders, args, *, ignore_static=True):
+        """Generate guards for a graph's placeholder values and evaluate the guards with args
+        """
+        code = self.produce_guards_expression(placeholders, ignore_static=ignore_static)
+        if code:
+            return self.evaluate_guards_expression(code, args)
+        return True
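+    # Usage sketch (added commentary; fake_x / real_x are hypothetical values,
+    # where fake_x is a fake tensor with symbolic sizes and real_x a concrete
+    # tensor):
+    #
+    #     code = shape_env.produce_guards_expression([fake_x])
+    #     # code looks roughly like "L['t0'].size()[0] == L['t0'].size()[1] and ..."
+    #     ok = shape_env.evaluate_guards_expression(code, [real_x])
+    #     # or, equivalently, in one step:
+    #     ok = shape_env.evaluate_guards_for_args([fake_x], [real_x])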
+
+    def bind_symbols(self, placeholders, args):
+        """
+        Given a paired list of placeholders (fake tensors with
+        symbolic sizes) and concrete arguments (regular tensors
+        with real sizes), returns a dictionary mapping each
+        symbol to its real value.  So for example, if you
+        have a placeholder with size (s0, s1), binding
+        (2, 4) to it will give you {s0: 2, s1: 4}.  This is
+        not guaranteed to bind ALL symbols in the ShapeEnv;
+        we can't bind a symbol if it doesn't occur in any placeholder,
+        and symbols that already have replacements won't get bindings.
+
+        This is a little duplicative with evaluate_guards but
+        it's different enough that it seemed cleanest to make
+        another copy.  This assumes the guards are already checked,
+        though if it's cheap we'll check for shenanigans
+        """
+        bindings: Dict[sympy.Symbol, int] = {}
+
+        def bind_symint(arg, val):
+            if isinstance(val, SymInt):
+                s = val.node.expr
+
+                if isinstance(s, sympy.Symbol):
+                    if s in bindings:
+                        assert bindings[s] == arg, f"{bindings[s]} != {arg}"
+                    else:
+                        bindings[s] = arg
+                elif isinstance(-s, sympy.Symbol):
+                    if -s in bindings:
+                        assert bindings[-s] == -arg, f"{bindings[-s]} != {-arg}"
+                    else:
+                        bindings[-s] = -arg
+
+        for t, arg in zip(placeholders, args):
+            if t is None:
+                continue
+            if isinstance(t, SymInt):
+                bind_symint(arg, t)
+                continue
+            assert isinstance(t, torch.Tensor)
+            for i, s in enumerate(t.size()):
+                bind_symint(arg.size(i), s)
+            for i, s in enumerate(t.stride()):
+                bind_symint(arg.stride(i), s)
+            bind_symint(arg.storage_offset(), t.storage_offset())
+
+        return bindings
+
+    def get_nontrivial_guards(self):
+        """Returns a list of guard expressions that aren't statically known (i.e. not trivial)"""
+        return [self.simplify(guard.expr) for guard in self.guards if self._maybe_evaluate_static(guard.expr) is None]
+
+    def format_guards(self, verbose=False):
+        """Format this shape env's guard expressions with optional traceback info if verbose"""
+        def format_tb(tb):
+            if not verbose:
+                return ""
+            return f"\n   Guarded at:\n{''.join('   ' + l for l in tb.format())}"
+
+        return '\n'.join(f" - {guard.expr}{format_tb(guard.stack)}" for guard in self.guards)
+
+    def bound_sympy(self, expr: sympy.Expr, size_oblivious: bool = False) -> ValueRanges:
+        """Given a sympy expression, computes a ValueRanges bound for what values it can be"""
+        var_to_range = {x: self.var_to_range.get(x, None) for x in expr.free_symbols}
+        if size_oblivious:
+            # Clamp values of size-like variables
+            for x in self.size_like & var_to_range.keys():
+                if var_to_range[x] is not None:
+                    var_to_range[x] &= ValueRanges(2, sympy.oo)
+        return bound_sympy(expr, var_to_range)
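+    # Small worked example (added commentary, hypothetical values): if s0's
+    # recorded range is [2, 9], bound_sympy(2*s0 + 1) yields roughly ValueRanges(5, 19).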
+
+    @_lru_cache
+    def _maybe_evaluate_static(
+        self, expr: "sympy.Expr", *, unbacked_only: bool = False, compute_hint: bool = False,
+        expect_rational=True, size_oblivious: bool = False
+    ) -> "Optional[sympy.Expr]":
+        """
+        Tries to evaluate expr without introducing guards
+
+        If unbacked_only == True, then we only do substitutions on
+        unbacked SymInts (leaving regular hinted integers alone).  This could
+        result in an expression that still contains backed SymInts, which you
+        could then potentially guard on.
+
+        Use compute_hint == True if you are trying to compute a non-binding
+        hint for the particular hint values of backed SymInts, e.g., if
+        s0 happens to be 3 this run, compute_hint will substitute s0 with 3.
+        """
+        expr = self.simplify(expr)
+
+        if compute_hint:
+            expr = expr.xreplace(self.var_to_val)
+
+        expr = canonicalize_bool_expr(expr)
+
+        symbols = list(expr.free_symbols)
+
+        # Apply known runtime asserts
+        for s in symbols:
+            # Unbacked symints only
+            if s in self.var_to_val:
+                continue
+
+            subst = {}
+
+            def add_expr(expr):
+                # Expr and negation
+                subst[canonicalize_bool_expr(expr)] = sympy.true
+                subst[canonicalize_bool_expr(sympy.Not(expr))] = sympy.false
+                if isinstance(expr, sympy.Rel):
+                    # multiplying by -1 changes the direction of the inequality
+                    dual = type(expr)(-expr.rhs, -expr.lhs)
+                    subst[canonicalize_bool_expr(dual)] = sympy.true
+                    subst[canonicalize_bool_expr(sympy.Not(dual))] = sympy.false
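+                    # Added example: for s0 < 5 this also records the equivalent
+                    # -5 < -s0 (and its negation), so either spelling of the
+                    # inequality is recognized during substitution.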
+
+            for e in itertools.chain(self.guards, self.deferred_runtime_asserts.get(s, ())):
+                e = e.expr
+                if compute_hint:
+                    e = canonicalize_bool_expr(e.xreplace(self.var_to_val))
+                add_expr(e)
+                # Other relational expressions this expression implies
+                if isinstance(e, sympy.Eq):
+                    add_expr(sympy.Le(e.lhs, e.rhs))
+                    add_expr(sympy.Ge(e.lhs, e.rhs))
+                elif isinstance(e, sympy.Lt):
+                    add_expr(sympy.Le(e.lhs, e.rhs))
+                    add_expr(sympy.Ne(e.lhs, e.rhs))
+
+            # NB: this helps us deal with And/Or connectives
+            expr = expr.subs(subst)
+
+        # Simplify making use of value range lower bound
+        new_shape_env = {}
+        new_range_env = {}
+        for idx, k in enumerate(symbols):
+            if isinstance(self.var_to_val.get(k, None), SingletonInt):
+                # Skip var_to_range logic for SingletonInt which is only used
+                # for jagged layout NestedTensors today
+                continue
+            vr = self.var_to_range[k]
+            if size_oblivious and k in self.size_like:
+                lower = max(2, vr.lower)
+            else:
+                lower = vr.lower
+            # Don't do anything if we don't have a nontrivial lower bound
+            # Also don't do anything if we asked only to simplify unbacked
+            # SymInt
+            if (
+                lower < (-sys.maxsize - 1) // 2 or
+                (unbacked_only and k in self.var_to_val)
+            ):
+                new_range_env[k] = vr
+                continue
+            # Positive means >= 1
+            # Positive - 1 means >= 0
+            # Positive + lower - 1 means >= lower
+            # The new symbol 's' is "too low", so when we substitute it in
+            # we have to increase it by offset (and conversely, the new
+            # variables have to have their value range bounds adjusted as
+            # well)
+            s = sympy.Symbol(f"shape_{idx}", positive=True, integer=True)
+            offset = lower - 1
+            new_shape_env[k] = s + offset
+            new_range_env[s] = SymPyValueRangeAnalysis.add(vr, -offset)
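+            # Added example (hypothetical values): if k's range is [5, 10], then
+            # offset = 4, k is replaced by shape_i + 4, and shape_i gets the
+            # shifted range [1, 6].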
+
+        def replace(expr, repl):
+            return expr.xreplace(repl)
+
+        try:
+            new_expr = replace(expr, new_shape_env)
+        except RecursionError:
+            log.warning("RecursionError in sympy.xreplace(%s, %s)", expr, new_shape_env)
+            self.counter["sympy_recursion_error"] += 1
+            return None
+
+        floor_div_replace = {}
+        for atom in new_expr.atoms(FloorDiv):
+            floor_div_replace[atom] = sympy.floor(atom.args[0] / atom.args[1])
+        new_expr = safe_expand(new_expr.xreplace(floor_div_replace))
+        # TODO: when unbacked_only, can sometimes early return even when there
+        # are still free symbols
+        if new_expr.is_number:
+            return new_expr
+
+        # Check if the range can solve it statically
+        out = bound_sympy(new_expr, new_range_env)
+        if expect_rational:
+            _assert_bound_is_rational(new_expr, out)
+            if out.is_singleton():
+                return out.lower
+
+        return new_expr if unbacked_only else None
+
+    @_lru_cache
+    def replace(self, expr: "sympy.Expr") -> "sympy.Expr":
+        """Apply symbol replacements to any symbols in the given expression
+        """
+        replacements = {s: self._find(cast(sympy.Symbol, s)) for s in expr.free_symbols}
+        return safe_expand(expr.xreplace(replacements))
+
+    @_lru_cache
+    def _update_divisible(self):
+        new_divisible = set()
+        for k in self.divisible:
+            res = self.replace(k)
+            if not res.is_number:
+                new_divisible.add(k)
+
+        self.divisible = new_divisible
+        self._update_version_counter()
+
+    @_lru_cache
+    def simplify(self, expr: "sympy.Expr") -> "sympy.Expr":
+        """Use known constraints and replacements to simplify the given expr
+        """
+        expr = self.replace(expr)
+        # TODO it would seem that this pass is not necessary given the
+        # below replacement of // with /, but for nested FloorDivs
+        # the non-recursive replacement doesn't work, and
+        # recursive makes it hard to look up divisibility,
+        # because existing divisibility info has FloorDiv in it, not /.
+        # For now, just do a separate pass to catch the common nested case.
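+        # Added example: the nested case handled below is a // (a // c), which
+        # simplifies to c when both c and a // c are known to divide a
+        # (e.g. 12 // (12 // 3) == 3).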
+        if expr.has(FloorDiv):
+            self._update_divisible()
+            div_replacements = {}
+            for atom in expr.atoms(FloorDiv):
+                base, divisor = atom.args
+                if isinstance(divisor, FloorDiv):
+                    base1, divisor1 = divisor.args
+                    if self.replace(Mod(base, divisor)) in self.divisible and \
+                            base == base1 and self.replace(Mod(base1, divisor1)) in self.divisible:
+                        div_replacements[atom] = divisor1
+            expr = expr.xreplace(div_replacements)
+            expr = safe_expand(expr)
+        if expr.has(FloorDiv):
+            div_replacements = {}
+            pows = expr.atoms(sympy.Pow)
+            rationals = expr.atoms(sympy.Rational).difference(expr.atoms(sympy.Integer))
+            for fd in expr.atoms(FloorDiv):
+                base, divisor = fd.args
+                if self.replace(Mod(base, divisor)) in self.divisible:
+                    div_replacements[fd] = base / divisor
+            new_expr = expr.xreplace(div_replacements)
+            new_expr = safe_expand(new_expr)
+            new_pows = new_expr.atoms(sympy.Pow)
+            new_rationals = new_expr.atoms(sympy.Rational).difference(new_expr.atoms(sympy.Integer))
+            # divisions simplified away
+            if new_pows.issubset(pows) and new_rationals.issubset(rationals):
+                expr = new_expr
+        return expr
+
+    @lru_cache(256)
+    def size_hint(self, expr: "sympy.Expr", *, allow_none=False):
+        """
+        Gets a size hint for a given expression from the underlying shapes we had.
+        Does not introduce a guard, so only use this when you can guarantee that
+        your code is still valid for arbitrary shapes (such as optimization decisions)
+        """
+        result_expr = safe_expand(expr).xreplace(self.var_to_val)
+        if not result_expr.is_number:
+
+            from torch.utils._sympy.singleton_int import SingletonInt
+
+            if isinstance(result_expr, SingletonInt):
+                return None
+            r = self._maybe_evaluate_static(result_expr, compute_hint=True)
+            if r is not None:
+                return r
+            if allow_none:
+                return None
+            raise self._make_data_dependent_error(result_expr, expr)
+        return result_expr
+
+    # NB: keep in sync with size_hint
+    @lru_cache(256)
+    def has_hint(self, expr: "sympy.Expr"):
+        result_expr = safe_expand(expr).xreplace(self.var_to_val)
+        return result_expr.is_number or self._maybe_evaluate_static(result_expr) is not None
+
+    def _make_data_dependent_error(self, expr, unhinted_expr, *, size_oblivious_result: Optional[bool] = None):
+        # TODO: in a Dynamo context, having user code, and having the
+        # name of the local, will be much better
+        size_like_symbols = []
+        for s in expr.free_symbols:
+            stacktrace = ''.join(self.var_to_stack[s].format())
+            self.log.debug("Data dependent variable '%s' allocated at:\n%s", s, stacktrace)
+            if s in self.size_like:
+                size_like_symbols.append(s)
+        size_oblivious_result_msg = ""
+        if size_oblivious_result is not None:
+            size_oblivious_result_msg = (
+                f"ATTENTION: guard_size_oblivious would fix the error, evaluating expression to {size_oblivious_result}.\n"
+                "Maybe you need to add guard_size_oblivious to framework code, see doc below for more guidance.\n\n"
+            )
+        fsummary, maybe_user_loc, maybe_extra_debug = self._get_stack_summary(True)
+        return GuardOnDataDependentSymNode(
+            f"Could not guard on data-dependent expression {expr} (unhinted: {unhinted_expr}).  "
+            f"(Size-like symbols: {', '.join(map(str, size_like_symbols)) or 'none'})\n\n"
+            f"{size_oblivious_result_msg}"
+            "Potential framework code culprit (scroll up for full backtrace):\n"
+            f"{''.join(traceback.StackSummary.from_list([fsummary]).format())}\n"
+            "For more information, run with TORCH_LOGS=\"dynamic\"\n"
+            "For extended logs when we create symbols, also add "
+            f"TORCHDYNAMO_EXTENDED_DEBUG_CREATE_SYMBOL=\"{','.join(map(str, expr.free_symbols))}\"\n"
+            "If you suspect the guard was triggered from C++, add TORCHDYNAMO_EXTENDED_DEBUG_CPP=1\n"
+            "For more debugging help, see "
+            "https://docs.google.com/document/d/1HSuTTVvYH1pTew89Rtpeu84Ht3nQEFTYhAX3Ypa_xJs/edit?usp=sharing\n" +
+            maybe_extra_debug
+            # TODO: Help text about how to use our runtime tests to fix this
+            # problem
+        )
+
+    def _set_replacement(self, a: "sympy.Symbol", tgt: "sympy.Expr", msg: str) -> None:
+        """
+        Adds or updates a replacement for a symbol.
+        Use this instead of `self.replacements[a] = tgt`.
+        """
+
+        # Precondition: a == tgt
+        assert isinstance(a, sympy.Symbol)
+
+        # Handles nested tensor symbolic variables which don't have
+        # var_to_range bounds
+        tgt_bound = None
+        if a in self.var_to_range:
+            src_bound = self.var_to_range[a]
+
+            # If you have x in [2, maxint], then 2*x in [4, 2*maxint].
+            # But we don't really care that the max bound says we can
+            # go beyond the maximum integer size, because we aren't
+            # using bigints anyway.  Arguably, ValueRanges should know
+            # to do this truncation automatically (to avoid doing
+            # bigint compute in range analysis), but right now it doesn't
+            # so we need to get rid of some unnecessary precision.
+            int_range = ValueRanges(-sys.maxsize - 1, sys.maxsize - 1)
+
+            def issubset(x, y):
+                return (x & int_range).issubset(y & int_range)
+
+            # First, refine the value range of a based on the computed value range
+            # of tgt.  This is always OK to do, even if we decide not to do the
+            # substitution in the end.  This might be a no-op, if a already has
+            # a tighter bound
+            tgt_bound = self.bound_sympy(tgt)
+            self.var_to_range[a] = src_bound & tgt_bound
+
+            # Next, check if we can update the range of free symbols in tgt
+            # based on the range in a. But only do it if:
+            #  - the source bound non-trivially improves over what we get out of
+            #    the existing bounds.
+            #  - the replacement is univariate and we can invert the tgt expression
+            if not issubset(tgt_bound, src_bound) and len(tgt.free_symbols) == 1:
+                b = next(iter(tgt.free_symbols))
+                # Try to invert the equality
+                r = try_solve(sympy.Eq(a, tgt), b, floordiv_inequality=False)
+                if r is not None:
+                    b_bound = self.bound_sympy(r[1])
+                    self.var_to_range[b] = b_bound & self.var_to_range[b]
+                    tgt_bound = self.bound_sympy(tgt)
+                    assert issubset(tgt_bound, src_bound)
+
+            # TODO: Should we propagate size-like-ness?
+            #
+            # Pros: if u0 is size-like, intuitively u0 == u1 should cause u1
+            # to become size-like.
+            #
+            # Cons: if u0 is size-like, what about u0 - 1 == u1?  You CAN'T
+            # propagate in this case, because what if u0 == 0, then u1 is negative
+            # and clearly isn't a size.  So, at minimum, any f(x) whose value
+            # range isn't [0, inf] given x in [0, inf] cannot propagate
+            # size-like-ness.  But there are many situations where you could
+            # imagine u1 is going to be size-like and actually you just didn't
+            # have a refined enough value range on u0.  Since even innocuous
+            # looking arithmetic operations can destroy size-like-ness, it's
+            # best to not propagate it at all and force the user to annotate it
+            # as necessary.
+            #
+            # Compromise: we preserve size-like-ness only for exact equality
+            # and nothing else.
+            if a in self.size_like and isinstance(tgt, sympy.Symbol):
+                self.size_like.add(tgt)
+            elif isinstance(tgt, sympy.Symbol) and tgt in self.size_like:
+                self.size_like.add(a)
+
+            # Now, decide if we will do the substitution.
+            #
+            #  - If the source has a non-trivial range, only substitute if
+            #    we preserve this range.  Note that we may have propagated
+            #    the src_range to free variables in tgt when tgt is univariate
+            #    and we could find an inverse, which helps us achieve this.
+            #    This ensures we never "forget" about user defined ranges,
+            #    even if they end up being defined on composite formulas
+            #    like s0 + s1.
+            #
+            #  - If the variable is unbacked, only substitute if the substitution
+            #    would preserve the bounds also under size-like-ness conditions.
+
+            if not issubset(tgt_bound, src_bound):
+                self.log.debug("skipped set_replacement %s = %s (%s) [%s not subset of %s]", a, tgt, msg, tgt_bound, src_bound)
+                return
+            elif a in self.size_like:
+                tgt_bound_so = self.bound_sympy(tgt, size_oblivious=True)
+                # This is morally equivalent to self.bound_sympy(a, size_oblivious=True)
+                # but handles substitutions like u0 == 0
+                src_bound_so = self.var_to_range[a]
+                if src_bound_so.upper >= 2:
+                    src_bound_so &= ValueRanges(2, sympy.oo)
+                if not issubset(tgt_bound_so, src_bound_so):
+                    self.log.debug("skipped set_replacement %s = %s (%s) "
+                                   "[%s not subset of %s (size-oblivious conditions)]", a, tgt, msg, tgt_bound_so, src_bound_so)
+                    return
+
+        if config.print_specializations and isinstance(tgt, (sympy.Integer, sympy.Float)):
+            # specializing to a constant, which is likely unexpected
+
+            # NOTE(avik): It is possible that we try logging the same specialization multiple times, e.g.,
+            # when adding a to self.replacements, and again when simplifying an expression containing a.
+            # Thus to avoid duplication, checking whether a is in self.replacements isn't enough; if it is,
+            # it must not already map to `tgt`. Fortunately this check is cheap because `tgt` is a constant.
+            if a not in self.replacements or tgt != self.replacements[a]:
+                self.log.warning("Specializing %s to %s", self.var_to_sources[a][0].name(), tgt)
+                self.log.debug("SPECIALIZATION", stack_info=True)
+        log.info("set_replacement %s = %s (%s) %s", a, tgt, msg, tgt_bound)
+        self.replacements[a] = tgt
+        self._update_version_counter()
+
+        # When specializing 'a == tgt', the equality should also be conveyed to
+        # Z3, in case an expression uses 'a'.
+        self._add_target_expr(sympy.Eq(a, tgt))
+
+    def _add_divisible(self, expr: "sympy.Expr"):
+        self.divisible.add(expr)
+        self._update_version_counter()
+
+    @_lru_cache
+    @record_shapeenv_event()
+    def _find(self, a: "sympy.Symbol") -> "sympy.Expr":
+        """
+        Implements a DSU-like (union-find) algorithm to find the expression that currently represents the symbol a.
+        Also handles transitive non-identity replacements, e.g.:
+
+        a: b + c
+        c: d
+        """
+        if a not in self.replacements:
+            return a
+        res = self.replacements[a]
+        cur_replace = {s: self._find(s) for s in res.free_symbols}
+        self._set_replacement(a, self.replacements[a].xreplace(cur_replace), "find")
+        return self.replacements[a]
+
+    @lru_cache(256)
+    def _maybe_guard_rel(self, expr: "sympy.Rel") -> None:
+        """
+        The relational expression is assumed (guarded) to be true.  Use this
+        information to simplify shapes (e.g. a == b or a % 5 == 0).
+        """
+        assert isinstance(expr, sympy.Rel)
+
+        # A good example of what goes wrong if you don't do this is
+        # python test/functorch/test_aotdispatch.py -k
+        # test_aot_autograd_symbolic_module_exhaustive_nn_LazyConv3d_cpu_float32
+        if isinstance(expr, sympy.Ne):
+            return
+
+        free = list(expr.free_symbols)
+
+        assert len(free) > 0, f"The expression should not be static by this point: {expr}"
+        # Bail out on really gnarly expressions so we don't blow up
+        if len(free) > 5:
+            return
+
+        # Prioritize unbacked symints for solving by ordering them last.
+        # Prefer to simplify out lexicographically higher symbols (i.e. simplify out s4 over s3).
+        #   (NB: this unfortunately isn't strictly equivalent to simplifying out newer symbols)
+        # Prefer to simplify out symbols with ephemeral sources.
+        def _smart_symbol_sort(x):
+            has_only_ephemeral_sources = (
+                x in self.var_to_sources and all(s.is_ephemeral() for s in self.var_to_sources[x])
+            )
+            size = self.size_hint(x, allow_none=True) or sys.maxsize
+            name = x.name
+            # 1 puts ephemeral sourced symbols first when sorting in reverse
+            return (1 if has_only_ephemeral_sources else 0, size, name)
+
+        free = sorted(free, key=_smart_symbol_sort, reverse=True)  # type: ignore[attr-defined]
+        lhs = expr.lhs
+        rhs = expr.rhs
+
+        self._refine_ranges(expr)
+
+        # The rest of this stuff is for equality only
+        if not isinstance(expr, sympy.Eq):
+            return
+
+        if not expr.has(Mod):
+            try:
+                floor_div_atoms = lhs.atoms(FloorDiv).union(rhs.atoms(FloorDiv))
+                if len(floor_div_atoms) > 0 and any(a.divisor != 1 for a in floor_div_atoms):
+                    raise NotImplementedError
+                # short-circuit when no solving is needed
+                if isinstance(lhs, sympy.Symbol) and free_unbacked_symbols(lhs):
+                    self._set_replacement(lhs, self._find(rhs), "trivial_lhs")
+                elif isinstance(rhs, sympy.Symbol) and free_unbacked_symbols(rhs):
+                    self._set_replacement(rhs, self._find(lhs), "trivial_rhs")
+                else:
+                    r = try_solve(expr, free[0], floordiv_inequality=False)
+                    if r is not None and all(t.is_integer for t in sympy.preorder_traversal(r[1])):
+                        new_var = self._find(r[1])
+                        ok = False
+                        if self.is_unbacked_symint(free[0]):
+                            # If you have i0 + i1 + i2 = s0, don't substitute i2 =
+                            # s0 - i0 - i1.  Arguably this should be OK but the
+                            # runtime assert machinery is very delicate right now
+                            # so this causes things to fail e.g.,
+                            # test_split_unbacked_sizes
+                            ok = len(free_unbacked_symbols(new_var)) <= 1
+                            msg = "solve_unbacked"
+                        else:
+                            # Never substitute backed with unbacked
+                            ok = len(free_unbacked_symbols(new_var)) == 0
+                            msg = "solve_backed"
+                        if ok:
+                            self._set_replacement(cast(sympy.Symbol, free[0]), new_var, msg)
+            except NotImplementedError:
+                pass
+        if expr.has(Mod):
+            mod_expr = next(iter(expr.atoms(Mod)))
+            try:
+                r = try_solve(expr, mod_expr, floordiv_inequality=False)
+                if r is not None and r[1] == 0:
+                    self._add_divisible(mod_expr)
+                    # This is a little bit of extra logic to make things like
+                    # torch.empty(i0, q).view(c, -1, q) work out
+                    p, q = mod_expr.args
+                    if isinstance(q, sympy.Number) and isinstance(p, sympy.Mul) and len(p.args) == 2:
+                        c, i0 = p.args
+                        # Given Mod(c * i0, q) == 0
+                        if (
+                            isinstance(c, sympy.Number) and
+                            isinstance(i0, sympy.Symbol) and
+                            self.is_unbacked_symint(i0)
+                        ):
+                            # We know q divides c * i0, which means (q / gcd(q, c))
+                            # divides i0, so we can rewrite i0 as (q / gcd(q, c)) * i1
+                            d = q / sympy.gcd(q, c)
+                            i1 = self.create_unbacked_symint().node.expr
+                            # Propagate the value ranges.  It doesn't really
+                            # matter if we use truediv or floordiv, because we
+                            # have established divisibility.
+                            self.var_to_range[i1] = SymPyValueRangeAnalysis.truediv(
+                                self.var_to_range[i0], ValueRanges.wrap(d)
+                            )
+                            # Propagate size-like-ness
+                            if i0 in self.size_like:
+                                self.size_like.add(i1)
+                            self._set_replacement(i0, d * i1, "divisibility")
+
+            except NotImplementedError:
+                pass
+        return
+
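A small standalone check of the arithmetic behind the "divisibility" replacement above: from Mod(c * i0, q) == 0 the code substitutes i0 = (q / gcd(q, c)) * i1 for a fresh symbol i1 (here a plain sympy symbol standing in for `create_unbacked_symint()`):

```python
import sympy

c, q = sympy.Integer(6), sympy.Integer(4)
i1 = sympy.Symbol("i1", integer=True, nonnegative=True)

d = q / sympy.gcd(q, c)   # 4 / gcd(4, 6) == 2
replacement = d * i1      # i0 := 2*i1

# With this substitution, Mod(c * i0, q) vanishes for any integer value of i1:
for k in range(5):
    assert sympy.Mod(c * replacement, q).subs(i1, k) == 0
```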
+    # See: Note - On 0/1 specialization
+    # NB: sys.maxsize is NOT allowed for sizes, because we use MAX_INT
+    # as a sentinel sometimes.  Your sizevar isn't going to be
+    # anywhere near the max 64-bit integer anyway.
+    def _default_value_range(self) -> ValueRanges:
+        lower = 2 if self.specialize_zero_one else 0
+        return ValueRanges(lower, sys.maxsize - 1)
+
+    def _default_unspecified_value_range(self) -> ValueRanges:
+        return ValueRanges(-sys.maxsize - 1, sys.maxsize)
+
+    @_lru_cache
+    def _simplify_floor_div(self, expr):
+        floor_divs = tuple(expr.atoms(FloorDiv))
+        # we expect floor_divs to be exact,
+        # and thus add the guards for the exact floordivs,
+        # even if tracing doesn't require them otherwise
+        for fd in reversed(floor_divs):
+            base, divisor = fd.args
+            mod_expr = Mod(base, divisor)
+            eq_expr = sympy.Eq(mod_expr, 0)
+            # add necessary mod guards
+            self.evaluate_expr(eq_expr)
+        return self.simplify(expr)
+
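To see why guarding `Mod(base, divisor) == 0` makes the floor divisions exact, here is a standalone sympy sketch using `sympy.floor` as a stand-in for torch's `FloorDiv`:

```python
import sympy

a = sympy.Symbol("a", integer=True, positive=True)
k = sympy.Symbol("k", integer=True, positive=True)

exact = sympy.floor(a / 8)      # stand-in for FloorDiv(a, 8)
# Once Mod(a, 8) == 0 is guarded we may write a = 8*k, and the floor disappears:
assert exact.subs(a, 8 * k) == k
```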
+    # We're about to add a guard/runtime assert, check if the ShapeEnv is frozen
+    # and if so issue a warning
+    def _check_frozen(self, expr, concrete_val):
+        if self.frozen:
+            self.counter["ignored_backward_guard"] += 1
+            signpost_event(
+                "dynamic",
+                "evaluate_expr_frozen",
+                {
+                    **self.co_fields,
+                    "ignored_guard": f"{expr} == {concrete_val}",
+                    # no version = original state (this signpost is expected)
+                    # version 2 = dynamic backwards is eagerly compiled
+                    "version": 2,
+                },
+            )
+            log.warning("Ignored guard %s == %s, this could result in accuracy problems", expr, concrete_val)
+
+
+    def _get_stack_summary(self, is_debug: bool = False):
+        fsummary = None
+        frame = inspect.currentframe()
+        try:
+            while frame is not None:
+                if frame.f_code.co_filename not in uninteresting_files():
+                    fsummary = traceback.FrameSummary(
+                        frame.f_code.co_filename,
+                        frame.f_lineno,
+                        frame.f_code.co_name,
+                    )
+                    break
+                frame = frame.f_back
+        finally:
+            del frame
+
+        # NB: this stack is truncated, but it's fine because the main
+        # stack_info will give you the rest of the info you need
+        maybe_user_loc = ""
+        user_tb = TracingContext.extract_stack()
+        if user_tb:
+            maybe_user_loc = " at " + format_frame(user_tb[-1])
+
+        maybe_extra_debug = ""
+        if is_debug and user_tb:
+            maybe_extra_debug = (
+                '\nUser Stack (most recent call last):\n' +
+                '  (snipped, see stack below for prefix)\n' +
+                ''.join(traceback.format_list(user_tb))
+            )
+        if is_debug and config.extended_debug_cpp:
+            cpp_stack = CapturedTraceback.extract(cpp=True)
+            maybe_extra_debug += "\nC++ stack trace:\n" + ''.join(cpp_stack.format())
+
+        return fsummary, maybe_user_loc, maybe_extra_debug
+
+    def _log_guard(self, prefix: str, g, forcing_spec: bool):
+        if self.log.isEnabledFor(logging.INFO):
+            str_g = str(g)
+            is_debug = config.extended_debug_guard_added is not None and str_g == config.extended_debug_guard_added
+            fsummary, maybe_user_loc, maybe_extra_debug = self._get_stack_summary(is_debug)
+            self.log.info(
+                "%s %s [guard added]%s (%s)%s",
+                prefix if not forcing_spec else f"{prefix} (forcing_spec)",
+                str_g,
+                maybe_user_loc,
+                format_frame(fsummary),
+                maybe_extra_debug,
+                stack_info=is_debug,
+            )
+
+    @lru_cache(256)
+    @record_shapeenv_event(save_tracked_fakes=True)
+    def evaluate_expr(self, orig_expr: "sympy.Expr", hint=None, fx_node=None,
+                      expect_rational=True, size_oblivious: bool = False, *, forcing_spec: bool = False):
+        """
+        Given an expression, evaluates it, adding guards if necessary
+        """
+
+        # TODO: split conjunctions and evaluate them separately
+
+        @lru_cache(None)
+        def compute_concrete_val():
+            if hint is None:
+                return self.size_hint(orig_expr)
+            else:
+                return sympy.sympify(hint)
+
+        # Check if:
+        #   1. 'translation_validation' is set
+        #   2. the corresponding 'fx_node' is not 'None'
+        #   3. the guard should not be suppressed
+        #   4. the evaluation is not size-oblivious
+        #
+        # If all of the above hold, we create an FX node representing the
+        # actual expression to be guarded.
+        node = None
+        fresh = False
+        if (
+                self._translation_validation_enabled
+                and fx_node is not None
+                and not self._suppress_guards_tls()
+                and not size_oblivious
+        ):
+            concrete_val = compute_concrete_val()
+            if concrete_val is sympy.true:
+                node, fresh = self._create_fx_call_function(torch._assert, (fx_node,))
+            elif concrete_val is sympy.false:
+                neg, _ = self._create_fx_call_function(operator.not_, (fx_node,))
+                node, fresh = self._create_fx_call_function(torch._assert, (neg,))
+            else:
+                eql, _ = self._create_fx_call_function(operator.eq, (fx_node, concrete_val))
+                node, fresh = self._create_fx_call_function(torch._assert, (eql,))
+
+            assert node is not None
+            # If this is a fresh node, we have to remember the event index that
+            # corresponds to this assertion node.
+            # Reason: so that, given an assertion node, we can replay the ShapeEnv
+            # events until the point where this assertion node was freshly created.
+            if fresh:
+                self._add_fx_node_metadata(node)
+
+        # After creating the FX node corresponding to orig_expr, we must make sure that
+        # no error will be raised until the end of this function.
+        #
+        # Reason: the translation validation may become invalid otherwise.
+        #
+        # If an error is raised before the end of this function, we remove the FX node
+        # inserted, and re-raise the error.
+        guard = None
+        tb = None
+
+        try:
+            if orig_expr.is_number:
+                self.log.debug("eval %s [trivial]", orig_expr)
+                # NB: don't test float as there may be precision issues
+                if isinstance(hint, (int, bool)):
+                    assert orig_expr == hint, f"{orig_expr} != {hint}"
+                return orig_expr
+
+            expr = orig_expr
+
+            static_expr = self._maybe_evaluate_static(expr,
+                                                      expect_rational=expect_rational,
+                                                      size_oblivious=size_oblivious)
+            if static_expr is not None:
+                self.log.debug("eval %s == %s [statically known]", orig_expr, static_expr)
+                # NB: don't test float as there may be precision issues
+                if isinstance(hint, (int, bool)):
+                    assert static_expr == hint, f"{static_expr} != {hint}"
+                return static_expr
+
+            if not (expr.free_symbols <= self.var_to_val.keys()):
+                # TODO: dedupe this with _maybe_evaluate_static
+                # Attempt to eliminate the unbacked SymInt
+                new_expr = self._maybe_evaluate_static(expr, unbacked_only=True)
+                if not (new_expr.free_symbols <= self.var_to_val.keys()):
+                    size_oblivious_result = None
+                    if not size_oblivious:
+                        size_oblivious_result = self._maybe_evaluate_static(
+                            expr,
+                            expect_rational=expect_rational,
+                            size_oblivious=True
+                        )
+
+                    raise self._make_data_dependent_error(
+                        expr.xreplace(self.var_to_val),
+                        expr,
+                        size_oblivious_result=size_oblivious_result
+                    )
+                expr = new_expr
+
+            concrete_val = compute_concrete_val()
+            self._check_frozen(expr, concrete_val)
+
+            if (
+                    config.inject_EVALUATE_EXPR_flip_equality_TESTING_ONLY
+                    and isinstance(hint, bool)
+                    and isinstance(expr, (sympy.Eq, sympy.Ne))
+            ):
+                expr = sympy.Not(expr)
+
+            # Turn this into a boolean expression; we no longer need to consult
+            # concrete_val after this point.
+            suppress_maybe_guard_rel = False
+            if concrete_val is sympy.true:
+                g = expr
+            elif concrete_val is sympy.false:
+                g = sympy.Not(expr)
+            else:
+                # WARNING: we cannot actually do simplifications on guards
+                # on floating point values, because Sympy generally does not
+                # think expressions on integers can ever be equal to floating
+                # point (e.g., sympy.Eq(s0/6, 0.5) evaluates to False).  Without
+                # very clear algebraic laws that hold for floating point, such
+                # simplifications are error prone anyway, so be sure not to
+                # maybe_guard_rel in those cases.
+                if not isinstance(concrete_val, sympy.Integer):
+                    suppress_maybe_guard_rel = True
+                g = sympy.Eq(expr, concrete_val)  # type: ignore[arg-type]
+
+            if isinstance(g, sympy.Rel):
+                # TODO: If we successfully eliminate a symbol via equality, it
+                # is not actually necessary to save a guard for the equality,
+                # as we will implicitly generate a guard when we match that
+                # input against the symbol.  Probably the easiest way to
+                # implement this is to have maybe_guard_rel return a bool
+                # saying if it "subsumed" the guard (and therefore the guard
+                # is no longer necessary)
+                self._maybe_guard_rel(g)
+
+            if not self._suppress_guards_tls():
+                stack = CapturedTraceback.extract(skip=1)
+                guard = ShapeGuard(g, stack)
+                # TODO: deal with duplicate guards somehow
+                self.guards.append(guard)
+        except Exception:
+            if fresh:
+                self._remove_fx_node(node)
+            raise
+        else:
+            if not self._suppress_guards_tls():
+                assert guard is not None
+
+                self._log_guard("eval", g, forcing_spec=forcing_spec)
+
+                for s in g.free_symbols:
+                    self.symbol_guard_counter[s] += 1
+                    # Forcing_spec to avoid infinite recursion
+                    if (
+                        not forcing_spec and
+                        config.symbol_guard_limit_before_specialize is not None and
+                        self.symbol_guard_counter[s] > config.symbol_guard_limit_before_specialize
+                    ):
+                        # Force specialization
+                        self.log.info(
+                            "symbol_guard_limit_before_specialize=%s exceeded on %s",
+                            config.symbol_guard_limit_before_specialize,
+                            s
+                        )
+                        self.evaluate_expr(s, forcing_spec=True)
+            else:
+                self.log.debug("eval %s [guard suppressed]", g)
+
+        return concrete_val
+
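The branch above that folds `expr` and the observed `concrete_val` into a single boolean guard `g` can be sketched in isolation (an illustrative helper, not the ShapeEnv API):

```python
import sympy

def as_boolean_guard(expr, concrete_val):
    if concrete_val is sympy.true:
        return expr
    if concrete_val is sympy.false:
        return sympy.Not(expr)
    # Non-boolean evaluation: guard that the expression equals the observed value.
    return sympy.Eq(expr, concrete_val)

s0 = sympy.Symbol("s0", integer=True, positive=True)
print(as_boolean_guard(sympy.Eq(s0, 3), sympy.true))   # Eq(s0, 3)
print(as_boolean_guard(sympy.Eq(s0, 3), sympy.false))  # Ne(s0, 3)
print(as_boolean_guard(s0 + 1, sympy.Integer(4)))      # Eq(s0 + 1, 4)
```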
+    def cleanup(self):
+        """
+        Break reference cycles.
+
+        This destroys the stacks. If you really want to keep them, we
+        just need some way to break references on code objects.
+        """
+        for g in self.guards:
+            g.stack.cleanup()
+        for s in self.var_to_stack.values():
+            s.cleanup()
+        for ras in self.deferred_runtime_asserts.values():
+            for ra in ras:
+                ra.stack.cleanup()
+
+    @record_shapeenv_event(save_tracked_fakes=True)
+    def defer_runtime_assert(self, orig_expr: "sympy.Expr", msg, fx_node=None):
+        """Create an assert that is checked at runtime
+
+        Args:
+            orig_expr (sympy.Expr): Boolean expression to assert is true
+            msg (str): Message to display on assertion failure
+            fx_node (Optional, torch.fx.Node): node in ``self.graph`` corresponding
+                to the expression, if applicable
+
+        """
+        expr = orig_expr
+
+        # TODO: split conjunctions and evaluate them separately
+
+        static_expr = self._maybe_evaluate_static(expr)
+        if static_expr is not None:
+            self.log.debug("runtime_assert %s == %s [statically known]", orig_expr, static_expr)
+            return static_expr
+
+        # Attempt to eliminate the unbacked SymInt
+        new_expr = self._maybe_evaluate_static(expr, unbacked_only=True)
+        if new_expr.free_symbols <= self.var_to_val.keys():
+            # Do a normal guard
+            return self.evaluate_expr(new_expr, fx_node=fx_node)
+        # NB: Don't use new_expr as expr; it could contain gunk like shape0
+        # which we don't want to guard on
+
+        # OK, we're definitely doing a runtime assert now
+        if (
+            self._translation_validation_enabled
+            and fx_node is not None
+            and not self._suppress_guards_tls()
+        ):
+            node, fresh = self._create_fx_call_function(torch._assert, (fx_node,))
+            assert node is not None
+            if fresh:
+                self._add_fx_node_metadata(node)
+
+        self._check_frozen(expr, sympy.true)
+
+        # eliminate symbols on equality tests / refine ranges
+        if isinstance(expr, sympy.Rel):
+            self._maybe_guard_rel(expr)
+
+        if not self._suppress_guards_tls():
+            # canonicalise to remove equations that are trivially equal
+            orig_expr = expr
+            expr = canonicalize_bool_expr(expr)
+            stack = CapturedTraceback.extract(skip=1)
+            ra = RuntimeAssert(expr, msg, stack)
+            # TODO: Do this in a way that is less janky than int(s.name[1:])
+            cands = sorted([s for s in expr.free_symbols if s.name.startswith("u")], key=lambda s: int(s.name[1:]))
+            self.deferred_runtime_asserts.setdefault(cands[-1], []).append(ra)
+            self.num_deferred_runtime_asserts += 1
+            self._update_version_counter()
+            self._log_guard("runtime_assert", orig_expr, forcing_spec=False)
+        else:
+            self.log.debug("runtime_assert %s [guard suppressed]", expr)
+
+        return True
+
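A small sketch of how each deferred assert is keyed: among the unbacked symbols ("u0", "u1", ...) appearing in the expression, the highest-numbered one is chosen, so the assert is only emitted once that symbol exists:

```python
import sympy

expr = sympy.Lt(sympy.Symbol("u0") + sympy.Symbol("u3"), sympy.Symbol("s2"))
cands = sorted(
    (s for s in expr.free_symbols if s.name.startswith("u")),
    key=lambda s: int(s.name[1:]),
)
print(cands[-1])  # u3 -> deferred_runtime_asserts[u3].append(ra)
```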
+    # Refines the ranges of the variables present in 'expr'.
+    #
+    # This function tries to refine the range of each variable inside
+    # 'expr' by reasoning about it, specifically when 'expr' is a
+    # 'sympy.Relational' operation.
+    #
+    # It mainly does 3 things:
+    #   1. Tries to isolate a variable on the left-hand side
+    #   2. Computes the value range of the right-hand side
+    #   3. Updates the value range of the variable, if the new range is tighter
+    def _refine_ranges(self, expr: sympy.Expr) -> None:
+        expr = self.simplify(expr)
+
+        for symbol in expr.free_symbols:
+            assert isinstance(symbol, sympy.Symbol)
+
+            if isinstance(self.var_to_val.get(symbol, None), SingletonInt):
+                # Skip var_to_range logic for SingletonInt which is only used
+                # for jagged layout NestedTensors today
+                continue
+
+            r = try_solve(expr, symbol)
+
+            if r is None or not (symbol.is_integer and r[1].is_integer):
+                # Range refinement only supports integer symbols for now.
+                # There are lots of SymPy bugs when it comes to comparing
+                # reals and integers, so we skip that for now.
+                continue
+
+            r_expr, rhs = r
+            vr = self.var_to_range[symbol]
+            lower, upper = vr.lower, vr.upper
+
+            rhs_vr = bound_sympy(rhs, self.var_to_range)
+            _assert_bound_is_rational(rhs, rhs_vr)
+
+            # Let's suppose that we have a preexisting range for x [0, 100].
+            # Now, we issue a guard x > y, where the range for y is [50, 150].
+            # Then, lower = 0, rhs_vr.lower = 50 and therefore refinement can happen,
+            # refining x to [51, 100], since x must be greater than y, but the lowest
+            # y could be is 50.
+            #
+            # sympy.Eq may update both lower and upper bounds.
+            # sympy.G{t,e} may update the lower bound, only.
+            # sympy.L{t,e} may update the upper bound, only.
+            if lower < rhs_vr.lower and isinstance(r_expr, (sympy.Eq, sympy.Ge, sympy.Gt)):
+                # Strictly greater relations allow us to refine a bit more, since
+                # x > y implies that the lower bound for x is: y + 1.
+                lower = rhs_vr.lower + int(isinstance(r_expr, sympy.Gt))
+            if upper > rhs_vr.upper and isinstance(r_expr, (sympy.Eq, sympy.Le, sympy.Lt)):
+                upper = rhs_vr.upper - int(isinstance(r_expr, sympy.Lt))
+
+            # Do nothing if the new value range is no better than what we already have.
+            if vr == ValueRanges(lower, upper):
+                continue
+
+            # Updates the range and the guards corresponding to each bound of the symbol.
+            self.var_to_range[symbol] = ValueRanges(lower, upper)
+            # Clears the cache, since this update can change the result.
+            self._maybe_evaluate_static.cache_clear()
+
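The refinement arithmetic described in the comment above, checked in isolation with a hypothetical helper (integer symbols assumed):

```python
def refine_lower_for_gt(x_range, y_range):
    # x > y: x's lower bound becomes y's lower bound + 1 (strict, integer case).
    lower, upper = x_range
    y_lower, _ = y_range
    if lower < y_lower:
        lower = y_lower + 1
    return (lower, upper)

# x in [0, 100], guard x > y with y in [50, 150]  =>  x refined to [51, 100]
print(refine_lower_for_gt((0, 100), (50, 150)))  # (51, 100)
```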
+def _is_int(expr):
+    return isinstance(expr, SymInt) and expr.node.expr.is_number
+
+# WARNING: This is legacy, DO NOT USE
+def _is_dim_dynamic(t, d):
+    return hasattr(t, "_dynamo_dynamic_indices") and d in t._dynamo_dynamic_indices
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/unification/__init__.py b/MLPY/Lib/site-packages/torch/fx/experimental/unification/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..78e36933d6bf060e76a60df991002ac37ef52440
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/unification/__init__.py
@@ -0,0 +1,4 @@
+# mypy: disable-error-code=attr-defined
+from .core import unify, reify  # noqa: F403
+from .more import unifiable  # noqa: F403
+from .variable import var, isvar, vars, variables, Var  # noqa: F403
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/unification/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/unification/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3a7e074ec4d8ee3801985889f4a60e608f4d42e1
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/unification/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/unification/__pycache__/core.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/unification/__pycache__/core.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..65d0ef03b065aafedcc7c76c61b1355a015bb9d6
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/unification/__pycache__/core.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/unification/__pycache__/dispatch.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/unification/__pycache__/dispatch.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a2beff5703af8b4f703958fba5db2c662f4380b4
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/unification/__pycache__/dispatch.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/unification/__pycache__/match.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/unification/__pycache__/match.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e892fbbd0e2d94a4faab12fc195ccb28fb939a53
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/unification/__pycache__/match.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/unification/__pycache__/more.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/unification/__pycache__/more.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..43f41e3b7a86313f551ee92c6c5b958833bd4390
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/unification/__pycache__/more.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/unification/__pycache__/unification_tools.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/unification/__pycache__/unification_tools.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1da9101f537cb0f6cf0fdcb428cfe3ba99ae9e10
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/unification/__pycache__/unification_tools.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/unification/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/unification/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..79887ebff41997b16520f5ac7ea617dd08d21e40
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/unification/__pycache__/utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/unification/__pycache__/variable.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/unification/__pycache__/variable.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ac168095063f023aa84379ff3c3f361402694acc
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/unification/__pycache__/variable.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/unification/core.py b/MLPY/Lib/site-packages/torch/fx/experimental/unification/core.py
new file mode 100644
index 0000000000000000000000000000000000000000..5594c534bb8014f47564ec2eac4488f0887d849a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/unification/core.py
@@ -0,0 +1,118 @@
+from collections.abc import Iterator  # type: ignore[import]
+from functools import partial
+
+from .unification_tools import assoc  # type: ignore[import]
+from .utils import transitive_get as walk
+from .variable import isvar
+from .dispatch import dispatch
+
+__all__ = ["reify", "unify"]
+
+###############
+# Reification #
+###############
+
+@dispatch(Iterator, dict)
+def _reify(t, s):
+    return map(partial(reify, s=s), t)
+    # return (reify(arg, s) for arg in t)
+_reify
+
+@dispatch(tuple, dict)  # type: ignore[no-redef]
+def _reify(t, s):
+    return tuple(reify(iter(t), s))
+_reify
+
+@dispatch(list, dict)  # type: ignore[no-redef]
+def _reify(t, s):
+    return list(reify(iter(t), s))
+_reify
+
+@dispatch(dict, dict)  # type: ignore[no-redef]
+def _reify(d, s):
+    return {k: reify(v, s) for k, v in d.items()}
+_reify
+
+@dispatch(object, dict)  # type: ignore[no-redef]
+def _reify(o, s):
+    return o  # catch all, just return the object
+
+def reify(e, s):
+    """ Replace variables of expression with substitution
+    >>> # xdoctest: +SKIP
+    >>> x, y = var(), var()
+    >>> e = (1, x, (3, y))
+    >>> s = {x: 2, y: 4}
+    >>> reify(e, s)
+    (1, 2, (3, 4))
+    >>> e = {1: x, 3: (y, 5)}
+    >>> reify(e, s)
+    {1: 2, 3: (4, 5)}
+    """
+    if isvar(e):
+        return reify(s[e], s) if e in s else e
+    return _reify(e, s)
+
+###############
+# Unification #
+###############
+
+seq = tuple, list, Iterator
+
+@dispatch(seq, seq, dict)
+def _unify(u, v, s):
+    if len(u) != len(v):
+        return False
+    for uu, vv in zip(u, v):  # avoiding recursion
+        s = unify(uu, vv, s)
+        if s is False:
+            return False
+    return s
+#
+# @dispatch((set, frozenset), (set, frozenset), dict)
+# def _unify(u, v, s):
+#     i = u & v
+#     u = u - i
+#     v = v - i
+#     return _unify(sorted(u), sorted(v), s)
+#
+#
+# @dispatch(dict, dict, dict)
+# def _unify(u, v, s):
+#     if len(u) != len(v):
+#         return False
+#     for key, uval in iteritems(u):
+#         if key not in v:
+#             return False
+#         s = unify(uval, v[key], s)
+#         if s is False:
+#             return False
+#     return s
+#
+#
+# @dispatch(object, object, dict)
+# def _unify(u, v, s):
+#     return False  # catch all
+
+
+@dispatch(object, object, dict)
+def unify(u, v, s):  # no check at the moment
+    """ Find substitution so that u == v while satisfying s
+    >>> x = var('x')
+    >>> unify((1, x), (1, 2), {})
+    {~x: 2}
+    """
+    u = walk(u, s)
+    v = walk(v, s)
+    if u == v:
+        return s
+    if isvar(u):
+        return assoc(s, u, v)
+    if isvar(v):
+        return assoc(s, v, u)
+    return _unify(u, v, s)
+unify
+
+@dispatch(object, object)  # type: ignore[no-redef]
+def unify(u, v):
+    return unify(u, v, {})
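A hedged usage sketch of the `reify`/`unify` pair defined in this file, assuming the vendored package imports cleanly from this checked-in path:

```python
from torch.fx.experimental.unification import unify, reify, var

x, y = var("x"), var("y")
s = unify((1, x, (3, y)), (1, 2, (3, 4)), {})
print(s)                     # {~x: 2, ~y: 4}
print(reify((x, y, x), s))   # (2, 4, 2)
```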
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/unification/dispatch.py b/MLPY/Lib/site-packages/torch/fx/experimental/unification/dispatch.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9d6d2d7efde128dd7fa9f78f414df757b3d87a5
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/unification/dispatch.py
@@ -0,0 +1,6 @@
+from functools import partial
+from .multipledispatch import dispatch  # type: ignore[import]
+
+namespace = {}  # type: ignore[var-annotated]
+
+dispatch = partial(dispatch, namespace=namespace)
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/unification/match.py b/MLPY/Lib/site-packages/torch/fx/experimental/unification/match.py
new file mode 100644
index 0000000000000000000000000000000000000000..56c04e0134e231396daa6d5f36563365a4409752
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/unification/match.py
@@ -0,0 +1,121 @@
+from .core import unify, reify  # type: ignore[attr-defined]
+from .variable import isvar
+from .utils import _toposort, freeze
+from .unification_tools import groupby, first  # type: ignore[import]
+
+
+class Dispatcher:
+    def __init__(self, name):
+        self.name = name
+        self.funcs = {}
+        self.ordering = []
+
+    def add(self, signature, func):
+        self.funcs[freeze(signature)] = func
+        self.ordering = ordering(self.funcs)
+
+    def __call__(self, *args, **kwargs):
+        func, s = self.resolve(args)
+        return func(*args, **kwargs)
+
+    def resolve(self, args):
+        n = len(args)
+        for signature in self.ordering:
+            if len(signature) != n:
+                continue
+            s = unify(freeze(args), signature)
+            if s is not False:
+                result = self.funcs[signature]
+                return result, s
+        raise NotImplementedError("No match found. \nKnown matches: "
+                                  + str(self.ordering) + "\nInput: " + str(args))
+
+    def register(self, *signature):
+        def _(func):
+            self.add(signature, func)
+            return self
+        return _
+
+
+class VarDispatcher(Dispatcher):
+    """ A dispatcher that calls functions with variable names
+    >>> # xdoctest: +SKIP
+    >>> d = VarDispatcher('d')
+    >>> x = var('x')
+    >>> @d.register('inc', x)
+    ... def f(x):
+    ...     return x + 1
+    >>> @d.register('double', x)
+    ... def f(x):
+    ...     return x * 2
+    >>> d('inc', 10)
+    11
+    >>> d('double', 10)
+    20
+    """
+    def __call__(self, *args, **kwargs):
+        func, s = self.resolve(args)
+        d = {k.token: v for k, v in s.items()}
+        return func(**d)
+
+
+global_namespace = {}  # type: ignore[var-annotated]
+
+
+def match(*signature, **kwargs):
+    namespace = kwargs.get('namespace', global_namespace)
+    dispatcher = kwargs.get('Dispatcher', Dispatcher)
+
+    def _(func):
+        name = func.__name__
+
+        if name not in namespace:
+            namespace[name] = dispatcher(name)
+        d = namespace[name]
+
+        d.add(signature, func)
+
+        return d
+    return _
+
+
+def supercedes(a, b):
+    """ ``a`` is a more specific match than ``b`` """
+    if isvar(b) and not isvar(a):
+        return True
+    s = unify(a, b)
+    if s is False:
+        return False
+    s = {k: v for k, v in s.items() if not isvar(k) or not isvar(v)}
+    if reify(a, s) == a:
+        return True
+    if reify(b, s) == b:
+        return False
+
+
+# Taken from multipledispatch
+def edge(a, b, tie_breaker=hash):
+    """ A should be checked before B
+    Tie broken by tie_breaker, defaults to ``hash``
+    """
+    if supercedes(a, b):
+        if supercedes(b, a):
+            return tie_breaker(a) > tie_breaker(b)
+        else:
+            return True
+    return False
+
+
+# Taken from multipledispatch
+def ordering(signatures):
+    """ A sane ordering of signatures to check, first to last
+    Topological sort of edges as given by ``edge`` and ``supercedes``
+    """
+    signatures = list(map(tuple, signatures))
+    edges = [(a, b) for a in signatures for b in signatures if edge(a, b)]
+    edges = groupby(first, edges)
+    for s in signatures:
+        if s not in edges:
+            edges[s] = []
+    edges = {k: [b for a, b in v] for k, v in edges.items()}  # type: ignore[attr-defined, assignment]
+    return _toposort(edges)
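The `ordering` function above topologically sorts signatures so that more specific ones are tried first. A self-contained sketch of the same idea; `more_specific` and `toposort` are illustrative stand-ins, not this module's `supercedes`/`_toposort`:

```python
from collections import defaultdict, deque

def more_specific(a, b):
    # For plain type tuples: a goes before b if every slot of a subclasses b's slot.
    return len(a) == len(b) and a != b and all(map(issubclass, a, b))

def toposort(nodes, edges):
    indeg = {n: 0 for n in nodes}
    succ = defaultdict(list)
    for a, b in edges:
        succ[a].append(b)
        indeg[b] += 1
    queue = deque(n for n in nodes if indeg[n] == 0)
    out = []
    while queue:
        n = queue.popleft()
        out.append(n)
        for m in succ[n]:
            indeg[m] -= 1
            if indeg[m] == 0:
                queue.append(m)
    return out

sigs = [(object, object), (int, object), (int, int)]
edges = [(a, b) for a in sigs for b in sigs if more_specific(a, b)]
print(toposort(sigs, edges))  # [(int, int), (int, object), (object, object)]
```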
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/unification/more.py b/MLPY/Lib/site-packages/torch/fx/experimental/unification/more.py
new file mode 100644
index 0000000000000000000000000000000000000000..86d13b155b6de946802bd1459ecf7a2f2783c909
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/unification/more.py
@@ -0,0 +1,117 @@
+from .core import unify, reify  # type: ignore[attr-defined]
+from .dispatch import dispatch
+
+
+def unifiable(cls):
+    """ Register standard unify and reify operations on class
+    This uses the type and __dict__ or __slots__ attributes to define the
+    nature of the term
+    See Also:
+    >>> # xdoctest: +SKIP
+    >>> class A(object):
+    ...     def __init__(self, a, b):
+    ...         self.a = a
+    ...         self.b = b
+    >>> unifiable(A)
+    <class 'unification.more.A'>
+    >>> x = var('x')
+    >>> a = A(1, 2)
+    >>> b = A(1, x)
+    >>> unify(a, b, {})
+    {~x: 2}
+    """
+    _unify.add((cls, cls, dict), unify_object)
+    _reify.add((cls, dict), reify_object)
+
+    return cls
+
+
+#########
+# Reify #
+#########
+
+
+def reify_object(o, s):
+    """ Reify a Python object with a substitution
+    >>> # xdoctest: +SKIP
+    >>> class Foo(object):
+    ...     def __init__(self, a, b):
+    ...         self.a = a
+    ...         self.b = b
+    ...     def __str__(self):
+    ...         return "Foo(%s, %s)"%(str(self.a), str(self.b))
+    >>> x = var('x')
+    >>> f = Foo(1, x)
+    >>> print(f)
+    Foo(1, ~x)
+    >>> print(reify_object(f, {x: 2}))
+    Foo(1, 2)
+    """
+    if hasattr(o, '__slots__'):
+        return _reify_object_slots(o, s)
+    else:
+        return _reify_object_dict(o, s)
+
+
+def _reify_object_dict(o, s):
+    obj = object.__new__(type(o))
+    d = reify(o.__dict__, s)
+    if d == o.__dict__:
+        return o
+    obj.__dict__.update(d)
+    return obj
+
+
+def _reify_object_slots(o, s):
+    attrs = [getattr(o, attr) for attr in o.__slots__]
+    new_attrs = reify(attrs, s)
+    if attrs == new_attrs:
+        return o
+    else:
+        newobj = object.__new__(type(o))
+        for slot, attr in zip(o.__slots__, new_attrs):
+            setattr(newobj, slot, attr)
+        return newobj
+
+
+@dispatch(slice, dict)
+def _reify(o, s):
+    """ Reify a Python ``slice`` object """
+    return slice(*reify((o.start, o.stop, o.step), s))
+
+
+#########
+# Unify #
+#########
+
+
+def unify_object(u, v, s):
+    """ Unify two Python objects
+    Unifies their type and ``__dict__`` attributes
+    >>> # xdoctest: +SKIP
+    >>> class Foo(object):
+    ...     def __init__(self, a, b):
+    ...         self.a = a
+    ...         self.b = b
+    ...     def __str__(self):
+    ...         return "Foo(%s, %s)"%(str(self.a), str(self.b))
+    >>> x = var('x')
+    >>> f = Foo(1, x)
+    >>> g = Foo(1, 2)
+    >>> unify_object(f, g, {})
+    {~x: 2}
+    """
+    if type(u) != type(v):
+        return False
+    if hasattr(u, '__slots__'):
+        return unify([getattr(u, slot) for slot in u.__slots__],
+                     [getattr(v, slot) for slot in v.__slots__],
+                     s)
+    else:
+        return unify(u.__dict__, v.__dict__, s)
+
+
+@dispatch(slice, slice, dict)
+def _unify(u, v, s):
+    """ Unify a Python ``slice`` object """
+    return unify((u.start, u.stop, u.step), (v.start, v.stop, v.step), s)
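A hedged usage sketch for the `slice` handlers registered above; importing the vendored package runs these `@dispatch` registrations, assuming it imports cleanly at this path:

```python
from torch.fx.experimental.unification import unify, reify, var

x = var("x")
print(unify(slice(0, x, None), slice(0, 10, None), {}))  # expected: {~x: 10}
print(reify(slice(0, x, None), {x: 10}))                 # slice(0, 10, None)
```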
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/__init__.py b/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..527b66d546e372ed1bfd1f1f5ec9ab3772436979
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/__init__.py
@@ -0,0 +1,3 @@
+from .core import dispatch
+from .dispatcher import (Dispatcher, halt_ordering, restart_ordering,
+                         MDNotImplementedError)
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1199d4f22232280ad60ca9c6881242fd2f8acb88
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/__pycache__/conflict.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/__pycache__/conflict.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0c587ef5c96b54a49ea509eb0f1607bba6dc75b0
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/__pycache__/conflict.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/__pycache__/core.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/__pycache__/core.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bd351e35bd62ad0cf613ecfd3dc7d0062ae62575
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/__pycache__/core.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/__pycache__/dispatcher.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/__pycache__/dispatcher.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3a6561dea6279d24b774c5d239c90e73362bdc12
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/__pycache__/dispatcher.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/__pycache__/utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3037e8169aa2b76041bd502c1c939168418bae29
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/__pycache__/utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/__pycache__/variadic.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/__pycache__/variadic.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..de6dbb990a26986b5e25e66c014f7846315f7b86
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/__pycache__/variadic.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/conflict.py b/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/conflict.py
new file mode 100644
index 0000000000000000000000000000000000000000..021f4fdf5c9b5fa715d3933e2b00aaddcfd73d66
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/conflict.py
@@ -0,0 +1,119 @@
+from .utils import _toposort, groupby
+from .variadic import isvariadic
+
+__all__ = ["AmbiguityWarning", "supercedes", "consistent", "ambiguous", "ambiguities", "super_signature",
+           "edge", "ordering"]
+
+class AmbiguityWarning(Warning):
+    pass
+
+
+def supercedes(a, b):
+    """ A is consistent and strictly more specific than B """
+    if len(a) < len(b):
+        # only case is if a is empty and b is variadic
+        return not a and len(b) == 1 and isvariadic(b[-1])
+    elif len(a) == len(b):
+        return all(map(issubclass, a, b))
+    else:
+        # len(a) > len(b)
+        p1 = 0
+        p2 = 0
+        while p1 < len(a) and p2 < len(b):
+            cur_a = a[p1]
+            cur_b = b[p2]
+            if not (isvariadic(cur_a) or isvariadic(cur_b)):
+                if not issubclass(cur_a, cur_b):
+                    return False
+                p1 += 1
+                p2 += 1
+            elif isvariadic(cur_a):
+                assert p1 == len(a) - 1
+                return p2 == len(b) - 1 and issubclass(cur_a, cur_b)
+            elif isvariadic(cur_b):
+                assert p2 == len(b) - 1
+                if not issubclass(cur_a, cur_b):
+                    return False
+                p1 += 1
+        return p2 == len(b) - 1 and p1 == len(a)
+
+
+def consistent(a, b):
+    """ It is possible for an argument list to satisfy both A and B """
+
+    # Need to check for empty args
+    if not a:
+        return not b or isvariadic(b[0])
+    if not b:
+        return not a or isvariadic(a[0])
+
+    # Non-empty args check for mutual subclasses
+    if len(a) == len(b):
+        return all(issubclass(aa, bb) or issubclass(bb, aa)
+                   for aa, bb in zip(a, b))
+    else:
+        p1 = 0
+        p2 = 0
+        while p1 < len(a) and p2 < len(b):
+            cur_a = a[p1]
+            cur_b = b[p2]
+            if not issubclass(cur_b, cur_a) and not issubclass(cur_a, cur_b):
+                return False
+            if not (isvariadic(cur_a) or isvariadic(cur_b)):
+                p1 += 1
+                p2 += 1
+            elif isvariadic(cur_a):
+                p2 += 1
+            elif isvariadic(cur_b):
+                p1 += 1
+        # We only need to check for variadic ends
+        # Variadic types are guaranteed to be the last element
+        return (isvariadic(cur_a) and p2 == len(b) or  # type: ignore[possibly-undefined]
+                isvariadic(cur_b) and p1 == len(a))  # type: ignore[possibly-undefined]
+
+
+def ambiguous(a, b):
+    """ A is consistent with B but neither is strictly more specific """
+    return consistent(a, b) and not (supercedes(a, b) or supercedes(b, a))
+
+
+def ambiguities(signatures):
+    """ All signature pairs such that A is ambiguous with B """
+    signatures = list(map(tuple, signatures))
+    return {(a, b) for a in signatures for b in signatures
+            if hash(a) < hash(b)
+            and ambiguous(a, b)
+            and not any(supercedes(c, a) and supercedes(c, b)
+            for c in signatures)}
+
+
+def super_signature(signatures):
+    """ A signature that would break ambiguities """
+    n = len(signatures[0])
+    assert all(len(s) == n for s in signatures)
+
+    return [max((type.mro(sig[i]) for sig in signatures), key=len)[0]
+            for i in range(n)]
+
+
+def edge(a, b, tie_breaker=hash):
+    """ A should be checked before B
+    Tie broken by tie_breaker, defaults to ``hash``
+    """
+    # A is checked before B if A supercedes B and either B does not supercede A,
+    # or B does and the tie_breaker prefers A.
+    return supercedes(a, b) and (not supercedes(b, a) or tie_breaker(a) > tie_breaker(b))
+
+
+def ordering(signatures):
+    """ A sane ordering of signatures to check, first to last
+    Topological sort of edges as given by ``edge`` and ``supercedes``
+    """
+    signatures = list(map(tuple, signatures))
+    edges = [(a, b) for a in signatures for b in signatures if edge(a, b)]
+    edges = groupby(lambda x: x[0], edges)
+    for s in signatures:
+        if s not in edges:
+            edges[s] = []
+    edges = {k: [b for a, b in v] for k, v in edges.items()}  # type: ignore[assignment, attr-defined]
+    return _toposort(edges)
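A quick illustration of these helpers on the classic ambiguous pair `(int, object)` vs `(object, int)`, assuming the vendored module imports at the checked-in path:

```python
from torch.fx.experimental.unification.multipledispatch.conflict import (
    ambiguous, consistent, super_signature, supercedes,
)

print(supercedes((int,), (object,)))                     # True: int is strictly more specific
print(consistent((int, object), (object, int)))          # True: e.g. (int, int) satisfies both
print(ambiguous((int, object), (object, int)))           # True: neither supercedes the other
print(super_signature([(int, object), (object, int)]))   # [<class 'int'>, <class 'int'>]: (int, int) breaks the tie
```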
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/core.py b/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/core.py
new file mode 100644
index 0000000000000000000000000000000000000000..e767aa4beab222d23e0ded688f620f2483e8c555
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/core.py
@@ -0,0 +1,83 @@
+import inspect
+import sys
+
+from .dispatcher import Dispatcher, MethodDispatcher
+
+global_namespace = {}  # type: ignore[var-annotated]
+
+__all__ = ["dispatch", "ismethod"]
+
+def dispatch(*types, **kwargs):
+    """ Dispatch function on the types of the inputs
+    Supports dispatch on all non-keyword arguments.
+    Collects implementations based on the function name.  Ignores namespaces.
+    If ambiguous type signatures occur a warning is raised when the function is
+    defined suggesting the additional method to break the ambiguity.
+
+    Example:
+        >>> # xdoctest: +SKIP
+        >>> @dispatch(int)
+        ... def f(x):
+        ...     return x + 1
+        >>> @dispatch(float)
+        ... def f(x):
+        ...     return x - 1
+        >>> # xdoctest: +SKIP
+        >>> f(3)
+        4
+        >>> f(3.0)
+        2.0
+        >>> # Specify an isolated namespace with the namespace keyword argument
+        >>> my_namespace = {}
+        >>> @dispatch(int, namespace=my_namespace)
+        ... def foo(x):
+        ...     return x + 1
+        >>> # Dispatch on instance methods within classes
+        >>> class MyClass(object):
+        ...     @dispatch(list)
+        ...     def __init__(self, data):
+        ...         self.data = data
+        ...     @dispatch(int)
+        ...     def __init__(self, datum):
+        ...         self.data = [datum]
+        >>> MyClass([1, 2, 3]).data
+        [1, 2, 3]
+        >>> MyClass(3).data
+        [3]
+    """
+    namespace = kwargs.get('namespace', global_namespace)
+
+    types = tuple(types)
+
+    def _df(func):
+        name = func.__name__
+
+        if ismethod(func):
+            dispatcher = inspect.currentframe().f_back.f_locals.get(  # type: ignore[union-attr]
+                name,  # type: ignore[union-attr]
+                MethodDispatcher(name),
+            )
+        else:
+            if name not in namespace:
+                namespace[name] = Dispatcher(name)
+            dispatcher = namespace[name]
+
+        dispatcher.add(types, func)
+        return dispatcher
+    return _df
+
+
+def ismethod(func):
+    """ Is func a method?
+    Note that this has to work as the method is defined but before the class is
+    defined.  At this stage methods look like functions.
+    """
+    if hasattr(inspect, "signature"):
+        signature = inspect.signature(func)
+        return signature.parameters.get('self', None) is not None
+    else:
+        if sys.version_info.major < 3:
+            spec = inspect.getargspec(func)  # type: ignore[attr-defined]
+        else:
+            spec = inspect.getfullargspec(func)  # type: ignore[union-attr, assignment]
+        return spec and spec.args and spec.args[0] == 'self'
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/dispatcher.py b/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/dispatcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..ddb4a1d5ce86a9c4d59aade063b906a1b237318d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/dispatcher.py
@@ -0,0 +1,430 @@
+from warnings import warn
+import inspect
+from .conflict import ordering, ambiguities, super_signature, AmbiguityWarning
+from .utils import expand_tuples
+from .variadic import Variadic, isvariadic
+import itertools as itl
+
+__all__ = ["MDNotImplementedError", "ambiguity_warn", "halt_ordering", "restart_ordering", "variadic_signature_matches_iter",
+           "variadic_signature_matches", "Dispatcher", "source", "MethodDispatcher", "str_signature", "warning_text"]
+
+class MDNotImplementedError(NotImplementedError):
+    """ A NotImplementedError for multiple dispatch """
+
+
+def ambiguity_warn(dispatcher, ambiguities):
+    """ Raise warning when ambiguity is detected
+    Parameters
+    ----------
+    dispatcher : Dispatcher
+        The dispatcher on which the ambiguity was detected
+    ambiguities : set
+        Set of type signature pairs that are ambiguous within this dispatcher
+    See Also:
+        Dispatcher.add
+        warning_text
+    """
+    warn(warning_text(dispatcher.name, ambiguities), AmbiguityWarning)
+
+
+def halt_ordering():
+    """Deprecated interface to temporarily disable ordering.
+    """
+    warn(
+        'halt_ordering is deprecated, you can safely remove this call.',
+        DeprecationWarning,
+    )
+
+
+def restart_ordering(on_ambiguity=ambiguity_warn):
+    """Deprecated interface to temporarily resume ordering.
+    """
+    warn(
+        'restart_ordering is deprecated, if you would like to eagerly order '
+        'the dispatchers, you should call the ``reorder()`` method on each'
+        ' dispatcher.',
+        DeprecationWarning,
+    )
+
+
+def variadic_signature_matches_iter(types, full_signature):
+    """Check if a set of input types matches a variadic signature.
+    Notes
+    -----
+    The algorithm is as follows:
+    Initialize the current signature to the first in the sequence
+    For each type in `types`:
+        If the current signature is variadic
+            If the type matches the signature
+                yield True
+            Else
+                Try to get the next signature
+                If no signatures are left we can't possibly have a match
+                    so yield False
+        Else
+            yield True if the type matches the current signature
+            Get the next signature
+    """
+    sigiter = iter(full_signature)
+    sig = next(sigiter)
+    for typ in types:
+        matches = issubclass(typ, sig)
+        yield matches
+        if not isvariadic(sig):
+            # we're not matching a variadic argument, so move to the next
+            # element in the signature
+            sig = next(sigiter)
+    else:
+        try:
+            sig = next(sigiter)
+        except StopIteration:
+            assert isvariadic(sig)
+            yield True
+        else:
+            # We have signature items left over, so all of our arguments
+            # haven't matched
+            yield False
+
+
+def variadic_signature_matches(types, full_signature):
+    # No arguments always matches a variadic signature
+    assert full_signature
+    return all(variadic_signature_matches_iter(types, full_signature))
+
+
+class Dispatcher:
+    """ Dispatch methods based on type signature
+    Use ``dispatch`` to add implementations
+    Examples
+    --------
+    >>> # xdoctest: +SKIP("bad import name")
+    >>> from multipledispatch import dispatch
+    >>> @dispatch(int)
+    ... def f(x):
+    ...     return x + 1
+    >>> @dispatch(float)
+    ... def f(x):
+    ...     return x - 1
+    >>> f(3)
+    4
+    >>> f(3.0)
+    2.0
+    """
+    __slots__ = '__name__', 'name', 'funcs', '_ordering', '_cache', 'doc'
+
+    def __init__(self, name, doc=None):
+        self.name = self.__name__ = name
+        self.funcs = {}
+        self.doc = doc
+
+        self._cache = {}
+
+    def register(self, *types, **kwargs):
+        """ register dispatcher with new implementation
+        >>> # xdoctest: +SKIP
+        >>> f = Dispatcher('f')
+        >>> @f.register(int)
+        ... def inc(x):
+        ...     return x + 1
+        >>> @f.register(float)
+        ... def dec(x):
+        ...     return x - 1
+        >>> @f.register(list)
+        ... @f.register(tuple)
+        ... def reverse(x):
+        ...     return x[::-1]
+        >>> f(1)
+        2
+        >>> f(1.0)
+        0.0
+        >>> f([1, 2, 3])
+        [3, 2, 1]
+        """
+        def _df(func):
+            self.add(types, func, **kwargs)   # type: ignore[call-arg]
+            return func
+        return _df
+
+    @classmethod
+    def get_func_params(cls, func):
+        if hasattr(inspect, "signature"):
+            sig = inspect.signature(func)
+            return sig.parameters.values()
+
+    @classmethod
+    def get_func_annotations(cls, func):
+        """ get annotations of function positional parameters
+        """
+        params = cls.get_func_params(func)
+        if params:
+            Parameter = inspect.Parameter
+
+            params = (param for param in params
+                      if param.kind in
+                      (Parameter.POSITIONAL_ONLY,
+                       Parameter.POSITIONAL_OR_KEYWORD))
+
+            annotations = tuple(
+                param.annotation
+                for param in params)
+
+            if all(ann is not Parameter.empty for ann in annotations):
+                return annotations
+
+    def add(self, signature, func):
+        """ Add new types/method pair to dispatcher
+        >>> # xdoctest: +SKIP
+        >>> D = Dispatcher('add')
+        >>> D.add((int, int), lambda x, y: x + y)
+        >>> D.add((float, float), lambda x, y: x + y)
+        >>> D(1, 2)
+        3
+        >>> D(1, 2.0)
+        Traceback (most recent call last):
+        ...
+        NotImplementedError: Could not find signature for add: <int, float>
+        >>> # When ``add`` detects an ambiguity it calls the ``on_ambiguity`` callback
+        >>> # with the dispatcher itself, and a set of ambiguous type signature pairs
+        >>> # as inputs.  See ``ambiguity_warn`` for an example.
+        """
+        # Handle annotations
+        if not signature:
+            annotations = self.get_func_annotations(func)
+            if annotations:
+                signature = annotations
+
+        # Handle union types
+        if any(isinstance(typ, tuple) for typ in signature):
+            for typs in expand_tuples(signature):
+                self.add(typs, func)
+            return
+
+        new_signature = []
+
+        for index, typ in enumerate(signature, start=1):
+            if not isinstance(typ, (type, list)):
+                str_sig = ', '.join(c.__name__ if isinstance(c, type)
+                                    else str(c) for c in signature)
+                raise TypeError(f"Tried to dispatch on non-type: {typ}\n"
+                                f"In signature: <{str_sig}>\n"
+                                f"In function: {self.name}")
+
+            # handle variadic signatures
+            if isinstance(typ, list):
+                if index != len(signature):
+                    raise TypeError(
+                        'Variadic signature must be the last element'
+                    )
+
+                if len(typ) != 1:
+                    raise TypeError(
+                        'Variadic signature must contain exactly one element. '
+                        'To use a variadic union type place the desired types '
+                        'inside of a tuple, e.g., [(int, str)]'
+                    )
+                new_signature.append(Variadic[typ[0]])
+            else:
+                new_signature.append(typ)
+
+        self.funcs[tuple(new_signature)] = func
+        self._cache.clear()
+
+        try:
+            del self._ordering
+        except AttributeError:
+            pass
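+
+    # Hedged usage sketch (hypothetical names): a trailing list marks a
+    # variadic tail, e.g.
+    #
+    #     D = Dispatcher('sum_all')
+    #     D.add((int, [int]), lambda first, *rest: first + sum(rest))
+    #
+    # which is stored internally under the signature (int, Variadic[int]).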
+
+    @property
+    def ordering(self):
+        try:
+            return self._ordering
+        except AttributeError:
+            return self.reorder()
+
+    def reorder(self, on_ambiguity=ambiguity_warn):
+        self._ordering = od = ordering(self.funcs)
+        amb = ambiguities(self.funcs)
+        if amb:
+            on_ambiguity(self, amb)
+        return od
+
+    def __call__(self, *args, **kwargs):
+        types = tuple([type(arg) for arg in args])
+        try:
+            func = self._cache[types]
+        except KeyError as e:
+            func = self.dispatch(*types)
+            if not func:
+                raise NotImplementedError(
+                    f'Could not find signature for {self.name}: <{str_signature(types)}>') from e
+            self._cache[types] = func
+        try:
+            return func(*args, **kwargs)
+
+        except MDNotImplementedError as e:
+            funcs = self.dispatch_iter(*types)
+            next(funcs)  # skip the first match, which already raised MDNotImplementedError
+            for func in funcs:
+                try:
+                    return func(*args, **kwargs)
+                except MDNotImplementedError:
+                    pass
+
+            raise NotImplementedError(
+                "Matching functions for "
+                f"{self.name}: <{str_signature(types)}> found, but none completed successfully",) from e
+
+    def __str__(self):
+        return f""
+    __repr__ = __str__
+
+    def dispatch(self, *types):
+        """Determine appropriate implementation for this type signature
+        This method is internal.  Users should call this object as a function.
+        Implementation resolution occurs within the ``__call__`` method.
+        >>> # xdoctest: +SKIP
+        >>> from multipledispatch import dispatch
+        >>> @dispatch(int)
+        ... def inc(x):
+        ...     return x + 1
+        >>> implementation = inc.dispatch(int)
+        >>> implementation(3)
+        4
+        >>> print(inc.dispatch(float))
+        None
+        See Also:
+          ``multipledispatch.conflict`` - module to determine resolution order
+        """
+
+        if types in self.funcs:
+            return self.funcs[types]
+
+        try:
+            return next(self.dispatch_iter(*types))
+        except StopIteration:
+            return None
+
+    def dispatch_iter(self, *types):
+
+        n = len(types)
+        for signature in self.ordering:
+            if len(signature) == n and all(map(issubclass, types, signature)):
+                result = self.funcs[signature]
+                yield result
+            elif len(signature) and isvariadic(signature[-1]):
+                if variadic_signature_matches(types, signature):
+                    result = self.funcs[signature]
+                    yield result
+
+    def resolve(self, types):
+        """ Determine appropriate implementation for this type signature
+        .. deprecated:: 0.4.4
+            Use ``dispatch(*types)`` instead
+        """
+        warn("resolve() is deprecated, use dispatch(*types)",
+             DeprecationWarning)
+
+        return self.dispatch(*types)
+
+    def __getstate__(self):
+        return {'name': self.name,
+                'funcs': self.funcs}
+
+    def __setstate__(self, d):
+        self.name = d['name']
+        self.funcs = d['funcs']
+        self._ordering = ordering(self.funcs)
+        self._cache = {}
+
+    @property
+    def __doc__(self):
+        docs = [f"Multiply dispatched method: {self.name}"]
+
+        if self.doc:
+            docs.append(self.doc)
+
+        other = []
+        for sig in self.ordering[::-1]:
+            func = self.funcs[sig]
+            if func.__doc__:
+                s = f'Inputs: <{str_signature(sig)}>\n'
+                s += '-' * len(s) + '\n'
+                s += func.__doc__.strip()
+                docs.append(s)
+            else:
+                other.append(str_signature(sig))
+
+        if other:
+            docs.append('Other signatures:\n    ' + '\n    '.join(other))
+
+        return '\n\n'.join(docs)
+
+    def _help(self, *args):
+        return self.dispatch(*map(type, args)).__doc__
+
+    def help(self, *args, **kwargs):
+        """ Print docstring for the function corresponding to inputs """
+        print(self._help(*args))
+
+    def _source(self, *args):
+        func = self.dispatch(*map(type, args))
+        if not func:
+            raise TypeError("No function found")
+        return source(func)
+
+    def source(self, *args, **kwargs):
+        """ Print source code for the function corresponding to inputs """
+        print(self._source(*args))
+
+
+def source(func):
+    s = f'File: {inspect.getsourcefile(func)}\n\n'
+    s = s + inspect.getsource(func)
+    return s
+
+
+class MethodDispatcher(Dispatcher):
+    """ Dispatch methods based on type signature
+    See Also:
+        Dispatcher
+    """
+    __slots__ = ('obj', 'cls')
+
+    @classmethod
+    def get_func_params(cls, func):
+        if hasattr(inspect, "signature"):
+            sig = inspect.signature(func)
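+            # Skip the first parameter (``self``); dispatch only on the rest.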
+            return itl.islice(sig.parameters.values(), 1, None)
+
+    def __get__(self, instance, owner):
+        self.obj = instance
+        self.cls = owner
+        return self
+
+    def __call__(self, *args, **kwargs):
+        types = tuple([type(arg) for arg in args])
+        func = self.dispatch(*types)
+        if not func:
+            raise NotImplementedError(f'Could not find signature for {self.name}: <{str_signature(types)}>')
+        return func(self.obj, *args, **kwargs)
+
+
+def str_signature(sig):
+    """ String representation of type signature
+    >>> str_signature((int, float))
+    'int, float'
+    """
+    return ', '.join(cls.__name__ for cls in sig)
+
+
+def warning_text(name, amb):
+    """ The text for ambiguity warnings """
+    text = f"\nAmbiguities exist in dispatched function {name}\n\n"
+    text += "The following signatures may result in ambiguous behavior:\n"
+    for pair in amb:
+        text += "\t" + \
+                ', '.join('[' + str_signature(s) + ']' for s in pair) + "\n"
+    text += "\n\nConsider making the following additions:\n\n"
+    text += '\n\n'.join(['@dispatch(' + str_signature(super_signature(s))
+                         + f')\ndef {name}(...)' for s in amb])
+    return text
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/utils.py b/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..fcd060d95eeefe3648e0de143271bbac0629b71e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/utils.py
@@ -0,0 +1,125 @@
+from collections import OrderedDict
+
+__all__ = ["raises", "expand_tuples", "reverse_dict", "groupby", "typename"]
+
+def raises(err, lamda):
+    try:
+        lamda()
+        return False
+    except err:
+        return True
+
+
+def expand_tuples(L):
+    """
+    >>> expand_tuples([1, (2, 3)])
+    [(1, 2), (1, 3)]
+    >>> expand_tuples([1, 2])
+    [(1, 2)]
+    """
+    if not L:
+        return [()]
+    elif not isinstance(L[0], tuple):
+        rest = expand_tuples(L[1:])
+        return [(L[0],) + t for t in rest]
+    else:
+        rest = expand_tuples(L[1:])
+        return [(item,) + t for t in rest for item in L[0]]
+
+
+# Taken from theano/theano/gof/sched.py
+# Avoids licensing issues because this was written by Matthew Rocklin
+def _toposort(edges):
+    """ Topological sort algorithm by Kahn [1] - O(nodes + vertices)
+    inputs:
+        edges - a dict of the form {a: {b, c}} where b and c depend on a
+    outputs:
+        L - an ordered list of nodes that satisfy the dependencies of edges
+    >>> _toposort({1: (2, 3), 2: (3, )})
+    [1, 2, 3]
+    >>> # Closely follows the wikipedia page [2]
+    >>> # [1] Kahn, Arthur B. (1962), "Topological sorting of large networks",
+    >>> # Communications of the ACM
+    >>> # [2] http://en.wikipedia.org/wiki/Toposort#Algorithms
+    """
+    incoming_edges = reverse_dict(edges)
+    incoming_edges = OrderedDict((k, set(val))
+                                 for k, val in incoming_edges.items())
+    S = OrderedDict.fromkeys(v for v in edges if v not in incoming_edges)
+    L = []
+
+    while S:
+        n, _ = S.popitem()
+        L.append(n)
+        for m in edges.get(n, ()):
+            assert n in incoming_edges[m]
+            incoming_edges[m].remove(n)
+            if not incoming_edges[m]:
+                S[m] = None
+    if any(incoming_edges.get(v, None) for v in edges):
+        raise ValueError("Input has cycles")
+    return L
+
+
+def reverse_dict(d):
+    """Reverses direction of dependence dict
+    >>> d = {'a': (1, 2), 'b': (2, 3), 'c':()}
+    >>> reverse_dict(d)  # doctest: +SKIP
+    {1: ('a',), 2: ('a', 'b'), 3: ('b',)}
+    :note: dict ordering is not guaranteed to be deterministic. Since we
+        iterate over the input dict, the output of this function depends
+        on the dict order, so its output order should be considered
+        non-deterministic.
+    """
+    result = OrderedDict()  # type: ignore[var-annotated]
+    for key in d:
+        for val in d[key]:
+            result[val] = result.get(val, tuple()) + (key, )
+    return result
+
+
+# Taken from toolz
+# Avoids licensing issues because this version was authored by Matthew Rocklin
+def groupby(func, seq):
+    """ Group a collection by a key function
+    >>> names = ['Alice', 'Bob', 'Charlie', 'Dan', 'Edith', 'Frank']
+    >>> groupby(len, names)  # doctest: +SKIP
+    {3: ['Bob', 'Dan'], 5: ['Alice', 'Edith', 'Frank'], 7: ['Charlie']}
+    >>> iseven = lambda x: x % 2 == 0
+    >>> groupby(iseven, [1, 2, 3, 4, 5, 6, 7, 8])  # doctest: +SKIP
+    {False: [1, 3, 5, 7], True: [2, 4, 6, 8]}
+    See Also:
+        ``countby``
+    """
+
+    d = OrderedDict()  # type: ignore[var-annotated]
+    for item in seq:
+        key = func(item)
+        if key not in d:
+            d[key] = list()
+        d[key].append(item)
+    return d
+
+
+def typename(type):
+    """Get the name of `type`.
+    Parameters
+    ----------
+    type : Union[Type, Tuple[Type]]
+    Returns
+    -------
+    str
+        The name of `type` or a tuple of the names of the types in `type`.
+    Examples
+    --------
+    >>> typename(int)
+    'int'
+    >>> typename((int, float))
+    '(int, float)'
+    """
+    try:
+        return type.__name__
+    except AttributeError:
+        if len(type) == 1:
+            return typename(*type)
+        return f"({', '.join(map(typename, type))})"
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/variadic.py b/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/variadic.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab008d6fcad276ef66076eb4a426bf3e390688cf
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/unification/multipledispatch/variadic.py
@@ -0,0 +1,91 @@
+from .utils import typename
+
+__all__ = ["VariadicSignatureType", "isvariadic", "VariadicSignatureMeta", "Variadic"]
+
+class VariadicSignatureType(type):
+    # checking if subclass is a subclass of self
+    def __subclasscheck__(cls, subclass):
+        other_type = (subclass.variadic_type if isvariadic(subclass)
+                      else (subclass,))
+        return subclass is cls or all(
+            issubclass(other, cls.variadic_type) for other in other_type  # type: ignore[attr-defined]
+        )
+
+    def __eq__(cls, other):
+        """
+        Return True if other has the same variadic type
+        Parameters
+        ----------
+        other : object (type)
+            The object (type) to check
+        Returns
+        -------
+        bool
+            Whether or not `other` is equal to `self`
+        """
+        return (isvariadic(other) and
+                set(cls.variadic_type) == set(other.variadic_type))  # type: ignore[attr-defined]
+
+    def __hash__(cls):
+        return hash((type(cls), frozenset(cls.variadic_type)))  # type: ignore[attr-defined]
+
+
+def isvariadic(obj):
+    """Check whether the type `obj` is variadic.
+    Parameters
+    ----------
+    obj : type
+        The type to check
+    Returns
+    -------
+    bool
+        Whether or not `obj` is variadic
+    Examples
+    --------
+    >>> # xdoctest: +SKIP
+    >>> isvariadic(int)
+    False
+    >>> isvariadic(Variadic[int])
+    True
+    """
+    return isinstance(obj, VariadicSignatureType)
+
+
+class VariadicSignatureMeta(type):
+    """A metaclass that overrides ``__getitem__`` on the class. This is used to
+    generate a new type for Variadic signatures. See the Variadic class for
+    examples of how this behaves.
+    """
+    def __getitem__(cls, variadic_type):
+        if not (isinstance(variadic_type, (type, tuple)) or type(variadic_type)):
+            raise ValueError("Variadic types must be type or tuple of types"
+                             " (Variadic[int] or Variadic[(int, float)]")
+
+        if not isinstance(variadic_type, tuple):
+            variadic_type = variadic_type,
+        return VariadicSignatureType(
+            f'Variadic[{typename(variadic_type)}]',
+            (),
+            dict(variadic_type=variadic_type, __slots__=())
+        )
+
+
+class Variadic(metaclass=VariadicSignatureMeta):
+    """A class whose getitem method can be used to generate a new type
+    representing a specific variadic signature.
+    Examples
+    --------
+    >>> # xdoctest: +SKIP
+    >>> Variadic[int]  # any number of int arguments
+    <class 'multipledispatch.variadic.Variadic[int]'>
+    >>> Variadic[(int, str)]  # any number of one of int or str arguments
+    <class 'multipledispatch.variadic.Variadic[(int, str)]'>
+    >>> issubclass(int, Variadic[int])
+    True
+    >>> issubclass(int, Variadic[(int, str)])
+    True
+    >>> issubclass(str, Variadic[(int, str)])
+    True
+    >>> issubclass(float, Variadic[(int, str)])
+    False
+    """
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/unification/unification_tools.py b/MLPY/Lib/site-packages/torch/fx/experimental/unification/unification_tools.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe0c3fafa32ed4f3f49dec01e0b25d02c009291a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/unification/unification_tools.py
@@ -0,0 +1,395 @@
+import collections
+import operator
+from functools import reduce
+from collections.abc import Mapping
+
+__all__ = ('merge', 'merge_with', 'valmap', 'keymap', 'itemmap',
+           'valfilter', 'keyfilter', 'itemfilter',
+           'assoc', 'dissoc', 'assoc_in', 'update_in', 'get_in')
+
+
+def _get_factory(f, kwargs):
+    factory = kwargs.pop('factory', dict)
+    if kwargs:
+        raise TypeError(f"{f.__name__}() got an unexpected keyword argument '{kwargs.popitem()[0]}'")
+    return factory
+
+
+def merge(*dicts, **kwargs):
+    """ Merge a collection of dictionaries
+
+    >>> merge({1: 'one'}, {2: 'two'})
+    {1: 'one', 2: 'two'}
+
+    Later dictionaries have precedence
+
+    >>> merge({1: 2, 3: 4}, {3: 3, 4: 4})
+    {1: 2, 3: 3, 4: 4}
+
+    See Also:
+        merge_with
+    """
+    if len(dicts) == 1 and not isinstance(dicts[0], Mapping):
+        dicts = dicts[0]
+    factory = _get_factory(merge, kwargs)
+
+    rv = factory()
+    for d in dicts:
+        rv.update(d)
+    return rv
+
+
+def merge_with(func, *dicts, **kwargs):
+    """ Merge dictionaries and apply function to combined values
+
+    A key may occur in more than one dict, and all values mapped from the key
+    will be passed to the function as a list, such as func([val1, val2, ...]).
+
+    >>> merge_with(sum, {1: 1, 2: 2}, {1: 10, 2: 20})
+    {1: 11, 2: 22}
+
+    >>> merge_with(first, {1: 1, 2: 2}, {2: 20, 3: 30})  # doctest: +SKIP
+    {1: 1, 2: 2, 3: 30}
+
+    See Also:
+        merge
+    """
+    if len(dicts) == 1 and not isinstance(dicts[0], Mapping):
+        dicts = dicts[0]
+    factory = _get_factory(merge_with, kwargs)
+
+    result = factory()
+    for d in dicts:
+        for k, v in d.items():
+            if k not in result:
+                result[k] = [v]
+            else:
+                result[k].append(v)
+    return valmap(func, result, factory)
+
+
+def valmap(func, d, factory=dict):
+    """ Apply function to values of dictionary
+
+    >>> bills = {"Alice": [20, 15, 30], "Bob": [10, 35]}
+    >>> valmap(sum, bills)  # doctest: +SKIP
+    {'Alice': 65, 'Bob': 45}
+
+    See Also:
+        keymap
+        itemmap
+    """
+    rv = factory()
+    rv.update(zip(d.keys(), map(func, d.values())))
+    return rv
+
+
+def keymap(func, d, factory=dict):
+    """ Apply function to keys of dictionary
+
+    >>> bills = {"Alice": [20, 15, 30], "Bob": [10, 35]}
+    >>> keymap(str.lower, bills)  # doctest: +SKIP
+    {'alice': [20, 15, 30], 'bob': [10, 35]}
+
+    See Also:
+        valmap
+        itemmap
+    """
+    rv = factory()
+    rv.update(zip(map(func, d.keys()), d.values()))
+    return rv
+
+
+def itemmap(func, d, factory=dict):
+    """ Apply function to items of dictionary
+
+    >>> accountids = {"Alice": 10, "Bob": 20}
+    >>> itemmap(reversed, accountids)  # doctest: +SKIP
+    {10: "Alice", 20: "Bob"}
+
+    See Also:
+        keymap
+        valmap
+    """
+    rv = factory()
+    rv.update(map(func, d.items()))
+    return rv
+
+
+def valfilter(predicate, d, factory=dict):
+    """ Filter items in dictionary by value
+
+    >>> iseven = lambda x: x % 2 == 0
+    >>> d = {1: 2, 2: 3, 3: 4, 4: 5}
+    >>> valfilter(iseven, d)
+    {1: 2, 3: 4}
+
+    See Also:
+        keyfilter
+        itemfilter
+        valmap
+    """
+    rv = factory()
+    for k, v in d.items():
+        if predicate(v):
+            rv[k] = v
+    return rv
+
+
+def keyfilter(predicate, d, factory=dict):
+    """ Filter items in dictionary by key
+
+    >>> iseven = lambda x: x % 2 == 0
+    >>> d = {1: 2, 2: 3, 3: 4, 4: 5}
+    >>> keyfilter(iseven, d)
+    {2: 3, 4: 5}
+
+    See Also:
+        valfilter
+        itemfilter
+        keymap
+    """
+    rv = factory()
+    for k, v in d.items():
+        if predicate(k):
+            rv[k] = v
+    return rv
+
+
+def itemfilter(predicate, d, factory=dict):
+    """ Filter items in dictionary by item
+
+    >>> def isvalid(item):
+    ...     k, v = item
+    ...     return k % 2 == 0 and v < 4
+
+    >>> d = {1: 2, 2: 3, 3: 4, 4: 5}
+    >>> itemfilter(isvalid, d)
+    {2: 3}
+
+    See Also:
+        keyfilter
+        valfilter
+        itemmap
+    """
+    rv = factory()
+    for item in d.items():
+        if predicate(item):
+            k, v = item
+            rv[k] = v
+    return rv
+
+
+def assoc(d, key, value, factory=dict):
+    """ Return a new dict with new key value pair
+
+    New dict has d[key] set to value. Does not modify the initial dictionary.
+
+    >>> assoc({'x': 1}, 'x', 2)
+    {'x': 2}
+    >>> assoc({'x': 1}, 'y', 3)   # doctest: +SKIP
+    {'x': 1, 'y': 3}
+    """
+    d2 = factory()
+    d2.update(d)
+    d2[key] = value
+    return d2
+
+
+def dissoc(d, *keys, **kwargs):
+    """ Return a new dict with the given key(s) removed.
+
+    New dict has d[key] deleted for each supplied key.
+    Does not modify the initial dictionary.
+
+    >>> dissoc({'x': 1, 'y': 2}, 'y')
+    {'x': 1}
+    >>> dissoc({'x': 1, 'y': 2}, 'y', 'x')
+    {}
+    >>> dissoc({'x': 1}, 'y') # Ignores missing keys
+    {'x': 1}
+    """
+    factory = _get_factory(dissoc, kwargs)
+    d2 = factory()
+
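+    # Heuristic: when relatively few keys are removed, copy the whole dict and
+    # delete from the copy; otherwise rebuild from the surviving keys.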
+    if len(keys) < len(d) * .6:
+        d2.update(d)
+        for key in keys:
+            if key in d2:
+                del d2[key]
+    else:
+        remaining = set(d)
+        remaining.difference_update(keys)
+        for k in remaining:
+            d2[k] = d[k]
+    return d2
+
+
+def assoc_in(d, keys, value, factory=dict):
+    """ Return a new dict with new, potentially nested, key value pair
+
+    >>> purchase = {'name': 'Alice',
+    ...             'order': {'items': ['Apple', 'Orange'],
+    ...                       'costs': [0.50, 1.25]},
+    ...             'credit card': '5555-1234-1234-1234'}
+    >>> assoc_in(purchase, ['order', 'costs'], [0.25, 1.00]) # doctest: +SKIP
+    {'credit card': '5555-1234-1234-1234',
+     'name': 'Alice',
+     'order': {'costs': [0.25, 1.00], 'items': ['Apple', 'Orange']}}
+    """
+    return update_in(d, keys, lambda x: value, value, factory)
+
+
+def update_in(d, keys, func, default=None, factory=dict):
+    """ Update value in a (potentially) nested dictionary
+
+    inputs:
+    d - dictionary on which to operate
+    keys - list or tuple giving the location of the value to be changed in d
+    func - function to operate on that value
+
+    If keys == [k0,..,kX] and d[k0]..[kX] == v, update_in returns a copy of the
+    original dictionary with v replaced by func(v), but does not mutate the
+    original dictionary.
+
+    If k0 is not a key in d, update_in creates nested dictionaries to the depth
+    specified by the keys, with the innermost value set to func(default).
+
+    >>> inc = lambda x: x + 1
+    >>> update_in({'a': 0}, ['a'], inc)
+    {'a': 1}
+
+    >>> transaction = {'name': 'Alice',
+    ...                'purchase': {'items': ['Apple', 'Orange'],
+    ...                             'costs': [0.50, 1.25]},
+    ...                'credit card': '5555-1234-1234-1234'}
+    >>> update_in(transaction, ['purchase', 'costs'], sum) # doctest: +SKIP
+    {'credit card': '5555-1234-1234-1234',
+     'name': 'Alice',
+     'purchase': {'costs': 1.75, 'items': ['Apple', 'Orange']}}
+
+    >>> # updating a value when k0 is not in d
+    >>> update_in({}, [1, 2, 3], str, default="bar")
+    {1: {2: {3: 'bar'}}}
+    >>> update_in({1: 'foo'}, [2, 3, 4], inc, 0)
+    {1: 'foo', 2: {3: {4: 1}}}
+    """
+    ks = iter(keys)
+    k = next(ks)
+
+    rv = inner = factory()
+    rv.update(d)
+
+    for key in ks:
+        if k in d:
+            d = d[k]
+            dtemp = factory()
+            dtemp.update(d)
+        else:
+            d = dtemp = factory()
+
+        inner[k] = inner = dtemp
+        k = key
+
+    if k in d:
+        inner[k] = func(d[k])
+    else:
+        inner[k] = func(default)
+    return rv
+
+
+def get_in(keys, coll, default=None, no_default=False):
+    """ Returns coll[i0][i1]...[iX] where [i0, i1, ..., iX]==keys.
+
+    If coll[i0][i1]...[iX] cannot be found, returns ``default``, unless
+    ``no_default`` is specified, in which case it raises KeyError or IndexError.
+
+    ``get_in`` is a generalization of ``operator.getitem`` for nested data
+    structures such as dictionaries and lists.
+
+    >>> transaction = {'name': 'Alice',
+    ...                'purchase': {'items': ['Apple', 'Orange'],
+    ...                             'costs': [0.50, 1.25]},
+    ...                'credit card': '5555-1234-1234-1234'}
+    >>> get_in(['purchase', 'items', 0], transaction)
+    'Apple'
+    >>> get_in(['name'], transaction)
+    'Alice'
+    >>> get_in(['purchase', 'total'], transaction)
+    >>> get_in(['purchase', 'items', 'apple'], transaction)
+    >>> get_in(['purchase', 'items', 10], transaction)
+    >>> get_in(['purchase', 'total'], transaction, 0)
+    0
+    >>> get_in(['y'], {}, no_default=True)
+    Traceback (most recent call last):
+        ...
+    KeyError: 'y'
+
+    See Also:
+        itertoolz.get
+        operator.getitem
+    """
+    try:
+        return reduce(operator.getitem, keys, coll)
+    except (KeyError, IndexError, TypeError):
+        if no_default:
+            raise
+        return default
+
+
+def getter(index):
+    if isinstance(index, list):
+        if len(index) == 1:
+            index = index[0]
+            return lambda x: (x[index],)
+        elif index:
+            return operator.itemgetter(*index)
+        else:
+            return lambda x: ()
+    else:
+        return operator.itemgetter(index)
+
+
+def groupby(key, seq):
+    """ Group a collection by a key function
+
+    >>> names = ['Alice', 'Bob', 'Charlie', 'Dan', 'Edith', 'Frank']
+    >>> groupby(len, names)  # doctest: +SKIP
+    {3: ['Bob', 'Dan'], 5: ['Alice', 'Edith', 'Frank'], 7: ['Charlie']}
+
+    >>> iseven = lambda x: x % 2 == 0
+    >>> groupby(iseven, [1, 2, 3, 4, 5, 6, 7, 8])  # doctest: +SKIP
+    {False: [1, 3, 5, 7], True: [2, 4, 6, 8]}
+
+    Non-callable keys imply grouping on a member.
+
+    >>> groupby('gender', [{'name': 'Alice', 'gender': 'F'},
+    ...                    {'name': 'Bob', 'gender': 'M'},
+    ...                    {'name': 'Charlie', 'gender': 'M'}]) # doctest:+SKIP
+    {'F': [{'gender': 'F', 'name': 'Alice'}],
+     'M': [{'gender': 'M', 'name': 'Bob'},
+           {'gender': 'M', 'name': 'Charlie'}]}
+
+    Not to be confused with ``itertools.groupby``
+
+    See Also:
+        countby
+    """
+    if not callable(key):
+        key = getter(key)
+    d = collections.defaultdict(lambda: [].append)  # type: ignore[var-annotated]
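+    # Each value is the bound ``append`` of a fresh list; storing the bound
+    # method avoids an attribute lookup per item, and the underlying lists are
+    # recovered below via ``__self__``.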
+    for item in seq:
+        d[key(item)](item)
+    rv = {}
+    for k, v in d.items():
+        rv[k] = v.__self__  # type: ignore[var-annotated, attr-defined]
+    return rv
+
+
+def first(seq):
+    """ The first element in a sequence
+
+    >>> first('ABC')
+    'A'
+    """
+    return next(iter(seq))
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/unification/utils.py b/MLPY/Lib/site-packages/torch/fx/experimental/unification/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..4887c8f715489c8ce3ecb0616c24b1975f792048
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/unification/utils.py
@@ -0,0 +1,105 @@
+__all__ = ["hashable", "transitive_get", "raises", "reverse_dict", "xfail", "freeze"]
+def hashable(x):
+    try:
+        hash(x)
+        return True
+    except TypeError:
+        return False
+
+
+def transitive_get(key, d):
+    """ Transitive dict.get
+    >>> d = {1: 2, 2: 3, 3: 4}
+    >>> d.get(1)
+    2
+    >>> transitive_get(1, d)
+    4
+    """
+    while hashable(key) and key in d:
+        key = d[key]
+    return key
+
+
+def raises(err, lamda):
+    try:
+        lamda()
+        return False
+    except err:
+        return True
+
+
+# Taken from theano/theano/gof/sched.py
+# Avoids licensing issues because this was written by Matthew Rocklin
+def _toposort(edges):
+    """ Topological sort algorithm by Kahn [1] - O(nodes + vertices)
+    inputs:
+        edges - a dict of the form {a: {b, c}} where b and c depend on a
+    outputs:
+        L - an ordered list of nodes that satisfy the dependencies of edges
+    >>> # xdoctest: +SKIP
+    >>> _toposort({1: (2, 3), 2: (3, )})
+    [1, 2, 3]
+    Closely follows the wikipedia page [2]
+    [1] Kahn, Arthur B. (1962), "Topological sorting of large networks",
+    Communications of the ACM
+    [2] http://en.wikipedia.org/wiki/Toposort#Algorithms
+    """
+    incoming_edges = reverse_dict(edges)
+    incoming_edges = {k: set(val) for k, val in incoming_edges.items()}
+    S = ({v for v in edges if v not in incoming_edges})
+    L = []
+
+    while S:
+        n = S.pop()
+        L.append(n)
+        for m in edges.get(n, ()):
+            assert n in incoming_edges[m]
+            incoming_edges[m].remove(n)
+            if not incoming_edges[m]:
+                S.add(m)
+    if any(incoming_edges.get(v, None) for v in edges):
+        raise ValueError("Input has cycles")
+    return L
+
+
+def reverse_dict(d):
+    """Reverses direction of dependence dict
+    >>> d = {'a': (1, 2), 'b': (2, 3), 'c':()}
+    >>> reverse_dict(d)  # doctest: +SKIP
+    {1: ('a',), 2: ('a', 'b'), 3: ('b',)}
+    :note: dict ordering is not guaranteed to be deterministic. Since we
+        iterate over the input dict, the output of this function depends
+        on the dict order, so its output order should be considered
+        non-deterministic.
+    """
+    result = {}  # type: ignore[var-annotated]
+    for key in d:
+        for val in d[key]:
+            result[val] = result.get(val, tuple()) + (key, )
+    return result
+
+
+def xfail(func):
+    try:
+        func()
+        raise Exception("XFailed test passed")  # pragma:nocover
+    except Exception:
+        pass
+
+
+def freeze(d):
+    """ Freeze container to hashable form
+    >>> freeze(1)
+    1
+    >>> freeze([1, 2])
+    (1, 2)
+    >>> freeze({1: 2}) # doctest: +SKIP
+    frozenset([(1, 2)])
+    """
+    if isinstance(d, dict):
+        return frozenset(map(freeze, d.items()))
+    if isinstance(d, set):
+        return frozenset(map(freeze, d))
+    if isinstance(d, (tuple, list)):
+        return tuple(map(freeze, d))
+    return d
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/unification/variable.py b/MLPY/Lib/site-packages/torch/fx/experimental/unification/variable.py
new file mode 100644
index 0000000000000000000000000000000000000000..778d2e1cbbdbbd6d9e6c96127f15a349243bd915
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/unification/variable.py
@@ -0,0 +1,85 @@
+from contextlib import contextmanager
+from .utils import hashable
+from .dispatch import dispatch
+
+_global_logic_variables = set()  # type: ignore[var-annotated]
+_glv = _global_logic_variables
+
+
+class Var:
+    """ Logic Variable """
+
+    _id = 1
+
+    def __new__(cls, *token):
+        if len(token) == 0:
+            token = f"_{Var._id}"  # type: ignore[assignment]
+            Var._id += 1
+        elif len(token) == 1:
+            token = token[0]
+
+        obj = object.__new__(cls)
+        obj.token = token  # type: ignore[attr-defined]
+        return obj
+
+    def __str__(self):
+        return "~" + str(self.token)  # type: ignore[attr-defined]
+    __repr__ = __str__
+
+    def __eq__(self, other):
+        return type(self) == type(other) and self.token == other.token  # type: ignore[attr-defined]
+
+    def __hash__(self):
+        return hash((type(self), self.token))  # type: ignore[attr-defined]
+
+
+def var():
+    return lambda *args: Var(*args)
+
+
+def vars():
+    return lambda n: [var() for i in range(n)]
+
+
+@dispatch(Var)
+def isvar(v):
+    return True
+
+isvar
+
+
+@dispatch(object)  # type: ignore[no-redef]
+def isvar(o):
+    return not not _glv and hashable(o) and o in _glv
+
+
+@contextmanager
+def variables(*variables):
+    """
+    Context manager for logic variables
+
+    Example:
+        >>> # xdoctest: +SKIP("undefined vars")
+        >>> from __future__ import with_statement
+        >>> with variables(1):
+        ...     print(isvar(1))
+        True
+        >>> print(isvar(1))
+        False
+        >>> # Normal approach
+        >>> from unification import unify
+        >>> x = var('x')
+        >>> unify(x, 1)
+        {~x: 1}
+        >>> # Context Manager approach
+        >>> with variables('x'):
+        ...     print(unify('x', 1))
+        {'x': 1}
+    """
+    old_global_logic_variables = _global_logic_variables.copy()
+    _global_logic_variables.update(set(variables))
+    try:
+        yield
+    finally:
+        _global_logic_variables.clear()
+        _global_logic_variables.update(old_global_logic_variables)
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/unify_refinements.py b/MLPY/Lib/site-packages/torch/fx/experimental/unify_refinements.py
new file mode 100644
index 0000000000000000000000000000000000000000..b30cadfd04c3cdd40023a360419af0587d09fdd8
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/unify_refinements.py
@@ -0,0 +1,120 @@
+from torch.fx.experimental.graph_gradual_typechecker import Refine
+from torch.fx.tensor_type import TensorType
+from torch.fx.experimental.unification import Var, unify  # type: ignore[attr-defined]
+
+
+def infer_symbolic_types_single_pass(traced):
+    """
+    Calls our symbolic inferencer once.
+    """
+    r = Refine(traced)
+    r.refine()
+    mgu = unify_eq(r.constraints)
+    substitute_all_types(traced.graph, mgu)
+
+def infer_symbolic_types(traced):
+    """
+    Calls our symbolic inferencer twice.
+    This is useful when one pass is not enough
+    to infer all the information, such as in the
+    case of broadcasting.
+    """
+    r = Refine(traced)
+    r.refine()
+    mgu = unify_eq(r.constraints)
+    substitute_all_types(traced.graph, mgu)
+
+    r = Refine(traced)
+    r.refine()
+    mgu = unify_eq(r.constraints)
+    substitute_all_types(traced.graph, mgu)
+
+    r.symbolic_relations()
+
+def convert_eq(list_of_eq):
+    """
+    Convert equality constraints into the format
+    expected by the unification library.
+    """
+    lhs = []
+    rhs = []
+    for eq in list_of_eq:
+        lhs.append(eq.lhs)
+        rhs.append(eq.rhs)
+    return tuple(lhs), tuple(rhs)
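+    # Hedged sketch: for constraints with ``.lhs``/``.rhs`` such as
+    # [a == b, c == d], this returns ((a, c), (b, d)) for ``unify`` to consume.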
+
+
+def unify_eq(list_of_eq):
+    """
+    Apply unification to a set of
+    equality constraints
+    """
+    lhs, rhs = convert_eq(list_of_eq)
+    return unify(lhs, rhs)
+
+
+def substitute_solution_one_type(mapping, t):
+    """
+    Apply the most general unifier to a type
+    """
+    if isinstance(t, Var):
+        if t in mapping.keys():
+            return mapping[t]
+        else:
+            return t
+
+    elif isinstance(t, TensorType):
+        new_type = []
+        for typ in t.__args__:
+            if typ in mapping.keys():
+                new_type.append(mapping[typ])
+            else:
+                new_type.append(typ)
+        return TensorType(tuple(new_type))
+
+    elif isinstance(t, list):
+        new_type = []
+        for typ in t:
+            new_type.append(substitute_solution_one_type(mapping, typ))
+        return new_type
+
+    elif isinstance(t, tuple):
+        new_type = []
+        for typ in t:
+            new_type.append(substitute_solution_one_type(mapping, typ))
+        return tuple(new_type)
+
+    else:
+        return t
+
+
+def substitute_all_types(graph, mapping):
+    """
+    Apply the most general unifier to all types in a graph
+    until a fixed point is reached. If the input and output
+    graphs are the same, we have converged.
+    """
+    flag = True
+    while flag:
+        flag = False
+        for k in mapping:
+            old_mapping_val = mapping[k]
+            if mapping[k] in mapping.keys():
+                new_key = mapping[k]
+                mapping[k] = mapping[new_key]
+            if old_mapping_val != mapping[k]:
+                flag = True
+
+    for n in graph.nodes:
+        n.type = substitute_solution_one_type(mapping, n.type)
+
+def check_for_type_equality(g1, g2):
+    """
+    An equality check to be used for fixed points.
+    We do not use graph equality but rather type
+    equality.
+    """
+    for n, m in zip(g1.nodes, g2.nodes):
+        if n.type != m.type:
+            return False
+    return True
diff --git a/MLPY/Lib/site-packages/torch/fx/experimental/validator.py b/MLPY/Lib/site-packages/torch/fx/experimental/validator.py
new file mode 100644
index 0000000000000000000000000000000000000000..a732c06f202bd3664305793d76650978e0cea4e1
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/experimental/validator.py
@@ -0,0 +1,766 @@
+import functools
+import logging
+import math
+import operator
+import sympy
+import builtins
+
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
+
+import torch
+import torch.fx
+import torch.fx.traceback as fx_traceback
+
+from torch._dynamo.exc import TorchDynamoException
+from torch.fx.node import Argument, Target
+from torch.utils._sympy.interp import sympy_interp
+
+log = logging.getLogger(__name__)
+
+try:
+    import z3  # type: ignore[import]
+
+    # Translation Validation for Dynamo guards
+    # ========================================
+    #
+    # Checks whether optimizations applied to the collected guards are
+    # valid. In other words, whether the guard function we actually run
+    # does not have false positives (unsound).
+    #
+    # In order to do so, we build the guards using 2 different information
+    # attached to each 'SymNode':
+    #   1. SymPy expressions
+    #   2. FX nodes
+    #
+    # SymPy expressions have implicit optimizations baked within itself,
+    # which may have a few bugs. On the other hand, we build the FX graph
+    # manually, with no optimizations enabled. This gives us access to
+    # the "ground truth".
+    #
+    # We then convert into Z3 expressions both the SymPy expressions
+    # (see [Note: SympyToZ3]) that reach 'ShapeEnv.produce_guards' function
+    # and the FX nodes (see [Note: PopulateValidator]) that go through
+    # 'ShapeEnv.evaluate_expr' function. Finally, we run the validation.
+    # (see [Note: TranslationValidator])
+
+    # Better Z3 to string implementation (for a small fraction of Z3).
+    #
+    # Here are the things we clean before showing the Z3 expression:
+    #   - Rename a few ops (e.g. "Distinct" ==> "!=")
+    #
+    #   - Ignore ToInt and ToReal operations:
+    #     usually they don't really matter
+    #
+    #   - Transform (ToInt (/ ...)) into (idiv ...):
+    #     this is the pattern for floor division
+    #
+    #   - Collect a chain of the same operations into one
+    def z3str(e: z3.ExprRef) -> str:
+        assert z3.is_expr(e), f"unsupported expression type: {e}"
+
+        def get_args_str(e: z3.ExprRef) -> List[str]:
+            return [z3str(e.arg(i)) for i in range(e.num_args())]
+
+        # First, we simplify the given expression.
+        # This is done using rewriting rules, so shouldn't take long.
+        e = z3.simplify(e)
+
+
+        # Only support function applications.
+        # Even Z3 "variables" are, in fact, function applications.
+        if not z3.is_app(e):
+            raise ValueError(f"can't print Z3 expression: {e}")
+
+        if z3.is_int_value(e) or z3.is_rational_value(e):
+            return e.as_string()  # type: ignore[attr-defined]
+
+        decl = e.decl()
+        kind = decl.kind()
+        op = str(decl)
+        args = get_args_str(e)
+
+        if kind == z3.Z3_OP_POWER:
+            op = "pow"
+
+        elif kind in (z3.Z3_OP_ADD, z3.Z3_OP_MUL):
+            # Collect the arguments of chains of ADD and MUL.
+            # This is safe, since they are associative.
+
+            def collect_str_args(e):
+                if not (z3.is_app(e) and e.decl().kind() == kind):
+                    return [z3str(e)]
+                else:
+                    return [
+                        x
+                        for i in range(e.num_args())
+                        for x in collect_str_args(e.arg(i))
+                    ]
+
+            args = collect_str_args(e)
+
+        elif kind == z3.Z3_OP_NOT:
+            # Revert some conversions that z3.simplify applies:
+            #   - a != b ==> (Not (== a b)) ==> (!= a b)
+            #   - a < b ==> (Not (<= b a)) ==> (> b a)
+            #   - a > b ==> (Not (<= a b)) ==> (> a b)
+
+            assert e.num_args() == 1
+            arg = e.arg(0)
+
+            assert z3.is_app(arg)
+            argkind = arg.decl().kind()
+
+            logic_inverse = {
+                z3.Z3_OP_EQ: "!=",
+                z3.Z3_OP_LE: ">",
+                z3.Z3_OP_GE: "<",
+            }
+
+            if argkind in logic_inverse:
+                op = logic_inverse[argkind]
+                args = get_args_str(arg)
+
+        elif kind in (z3.Z3_OP_TO_INT, z3.Z3_OP_TO_REAL):
+            assert e.num_args() == 1
+            argstr = z3str(e.arg(0))
+
+            # Check if it's the floor division pattern.
+            if argstr.startswith("(/"):
+                return "(idiv" + argstr[2:]
+
+            # Otherwise, just ignore it.
+            return argstr
+
+        elif kind == z3.Z3_OP_UNINTERPRETED:
+            assert e.num_args() == 0
+            return str(decl)
+
+        string = op + " " + " ".join(args)
+        return f"({string.rstrip()})"
+
+    # Implementation of Python semantics as Z3 expressions.
+    #
+    # Z3's Real-Int theory has operators whose semantics differ from those of
+    # Python. Therefore, in order to get it right, we need to implement
+    # the (Python) semantics we are relying on in Z3.
+    @dataclass
+    class _Z3Ops:
+        # Validator used for adding assertions as needed.
+        # e.g. div(a, b) requires b != 0.
+        validator: "TranslationValidator"
+
+        # The 2 functions below are used for conditionally casting between
+        # integer and reals.
+        #
+        # Returns a real expression from 'x'.
+        @staticmethod
+        def to_real(x: z3.ArithRef) -> z3.ArithRef:
+            return x if x.is_real() else z3.ToReal(x)
+
+        # Returns an integer expression from 'x'.
+        @staticmethod
+        def to_int(x: z3.ArithRef) -> z3.ArithRef:
+            return x if x.is_int() else z3.ToInt(x)
+
+        # Implements Python division semantics.
+        def div(self, numerator: z3.ArithRef, denominator: z3.ArithRef) -> z3.ArithRef:
+            self.validator.add_assertion(denominator != 0)  # type: ignore[arg-type]
+            return _Z3Ops.to_real(numerator) / _Z3Ops.to_real(denominator)
+
+        def floor(self, number: z3.ArithRef) -> z3.ArithRef:
+            # Z3 ToInt function rounds a real number towards negative infinity.
+            return _Z3Ops.to_int(number)
+
+        # Python semantics for 'FloorDiv' states that before applying the floor
+        # function, the operands are converted to their common type.
+        def floordiv(self, numerator: z3.ArithRef, denominator: z3.ArithRef) -> z3.ArithRef:
+            cast_result_to_real = numerator.is_real() or denominator.is_real()
+            result = _Z3Ops.to_int(self.div(numerator, denominator))
+            # Since the 'result' is already an integer, we just have to check
+            # whether we should cast it to real.
+            return _Z3Ops.to_real(result) if cast_result_to_real else result
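+            # Worked example (hedged): Python's -7 // 2 is -4; this encoding
+            # agrees because ToInt rounds -3.5 toward negative infinity.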
+
+        def ceil(self, number: z3.ArithRef) -> z3.ArithRef:
+            return z3.If(
+                self.floor(number) < number,
+                self.floor(number + 1),
+                number
+            )  # type: ignore[return-value]
+
+        def max(self, a: z3.ArithRef, b: z3.ArithRef) -> z3.ArithRef:
+            return z3.If(a > b, a, b)  # type: ignore[return-value]
+
+        def min(self, a: z3.ArithRef, b: z3.ArithRef) -> z3.ArithRef:
+            return z3.If(a < b, a, b)  # type: ignore[return-value]
+
+        # Python semantics for 'Mod' is defined as: p % q = p - floordiv(p, q) * q
+        # It should work with both integer and reals.
+        def mod(self, p: z3.ArithRef, q: z3.ArithRef) -> z3.ArithRef:
+            return p - self.floordiv(p, q) * q
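+            # Worked example (hedged): Python's (-7) % 2 is 1; the identity
+            # above gives -7 - (-4) * 2 = 1, matching Python semantics.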
+
+        def pow(self, base: z3.ArithRef, exp: z3.ArithRef) -> z3.ArithRef:
+            # Z3 can't handle complex numbers very well.
+            self.validator.add_assertion(z3.Or(base != 0, exp > 0))  # type: ignore[arg-type]
+            return base ** exp
+
+        def sqrt(self, number: z3.ArithRef) -> z3.ArithRef:
+            # Square-root:
+            # 1. Only work with reals
+            number = _Z3Ops.to_real(number)
+            # 2. The number should be positive or zero.
+            #    Otherwise, Z3 returns 'unknown'.
+            self.validator.add_assertion(number >= 0)
+            return number ** 0.5
+
+        def abs(self, number: z3.ArithRef) -> z3.ArithRef:
+            return z3.Abs(number)
+
+        def round(self, number: z3.ArithRef, ndigits: Optional[z3.ArithRef] = None) -> z3.ArithRef:
+            if ndigits is not None:
+                raise ValueError("round(..., ndigits=) is currently not supported by shape validations.")
+
+            # Python's builtin 'round' implements the 'round half to even' strategy
+            # See https://en.wikipedia.org/wiki/Rounding#Rounding_half_to_even
+            # z3 has an equivalent z3.fpRoundToIntegral(z3.RoundNearestTiesToEven(), ...), but this only applies to
+            # floating point numbers, which is different from real numbers that we are dealing with here.
+            # Instead, we implement 'round half to even' in terms of 'round half up' (floor(x + 0.5)) and
+            # 'round half down' (ceil(x - 0.5)).
+            # Assuming 'round half up' is the default case, we need to correct ..., -3.5, -1.5, 0.5, 2.5, 4.5, ...
+            # to round down, i.e. use the 'round half down' strategy
+            return z3.If(
+                self.mod(number, z3.IntVal(2)) == 0.5,
+                self.ceil(number - 0.5),
+                self.floor(number + 0.5),
+            )
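+            # Worked example (hedged): Python's round(2.5) == 2 and
+            # round(3.5) == 4; the branch above applies ceil(x - 0.5) exactly
+            # when x mod 2 == 0.5, reproducing "round half to even".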
+
+    # Lifts a callable to be used in Z3.
+    #
+    # This function replaces the given 'op' by a function that:
+    #
+    #   1. Lifts the arguments into Z3 (i.e. make them inhabitants of Z3)
+    #
+    #   2. Calls an operation that corresponds to 'op', but works with Z3
+    #      inhabitants (left as is if it works as is)
+    def z3op(op: Callable, validator: "TranslationValidator") -> Callable:
+        # Operations that take booleans as their arguments.
+        # This is needed because the arguments of some FX nodes were
+        # literal integers instead of booleans. So, whenever this flag
+        # is set, we also convert ints to booleans.
+        boolean_ops = {operator.not_, operator.and_, operator.or_}
+        as_bool = op in boolean_ops
+
+        # Lifts the function into 'z3.ExprRef' domain.
+        def lift(func):
+            def wrap(a) -> z3.ExprRef:
+                if isinstance(a, (z3.ArithRef, z3.BoolRef)):
+                    return a
+                # Convert it into a Z3 value, if it is some of the supported
+                # types below.
+                if isinstance(a, bool) or (as_bool and isinstance(a, int)):
+                    return z3.BoolVal(bool(a))
+                if isinstance(a, (int, sympy.Integer)):
+                    return z3.IntVal(int(a))
+                if isinstance(a, (float, sympy.Float)):
+                    return z3.RealVal(float(a))
+                raise ValueError(f"can't lift type: {type(a)}")
+
+            @functools.wraps(func)
+            def wrapper(*args):
+                # Lifts the arguments into a list of Z3 inhabitants.
+                wrapped_args = (wrap(a) for a in args)
+                # Run the function on the Z3 expressions.
+                return func(*wrapped_args)
+
+            return wrapper
+
+        ops = _Z3Ops(validator)
+        replacement_map = {
+            # Operator module.
+            operator.not_: lift(z3.Not),
+            operator.and_: lift(z3.And),
+            operator.or_: lift(z3.Or),
+            operator.floordiv: lift(ops.floordiv),
+            operator.truediv: lift(ops.div),
+            operator.mod: lift(ops.mod),
+            operator.abs: lift(ops.abs),
+            builtins.round: lift(ops.round),
+
+            # Math module.
+            math.ceil: lift(ops.ceil),
+            math.floor: lift(ops.floor),
+
+            # Torch module.
+            torch.sym_float: lift(ops.to_real),
+            torch.sym_max: lift(ops.max),
+            torch.sym_min: lift(ops.min),
+            torch.sym_ite: lift(lambda b, t, f: t if b else f),
+            torch._sym_sqrt: lift(ops.sqrt),  # type: ignore[attr-defined]
+            # Not lifted because we only use this function as a
+            # marker for adding the expression as validator input.
+            torch._assert: torch._assert,
+        }
+        return replacement_map[op] if op in replacement_map else lift(op)
+
+    # Processes an FX graph, populating the given validator.
+    #
+    # [Note: PopulateValidator]
+    # This class walks through each node in the FX graph, translating
+    # them into the Z3 world.
+    #
+    # Then, whenever it finds an 'torch._assert' call_function operation,
+    # it adds the Z3 expression corresponding to the argument as validator
+    # input.
+    class PopulateValidator(torch.fx.Interpreter):
+        def __init__(self, graph: torch.fx.Graph, validator: "TranslationValidator"):
+            # Reference to the translation validator.
+            self.validator = validator
+
+            # Build the graph module and call `Interpreter` constructor.
+            module = torch.fx.GraphModule(root={}, graph=graph)
+            super().__init__(module, garbage_collect_values=True)
+
+        def placeholder(self, target: Target, args: Tuple[Argument, ...], kwargs: Dict[str, Any]) -> Any:
+            symbol = fx_traceback.get_current_meta()["symbol"]
+            return self.validator.z3var(symbol)
+
+        def call_function(self, target: Target, args: Tuple[Argument, ...], kwargs: Dict[str, Any]) -> Any:
+            if target != torch._assert:
+                # Actually runs the node target function (which is already
+                # lifted) with its arguments.
+                return super().call_function(target, args, kwargs)
+            # Adds the Z3 expression corresponding to the first argument
+            # as a validator input.
+            assert len(args) == 1, f"expected 1 argument on assertion. Got: {len(args)} "
+            self.validator.add_source_expr(args[0])  # type: ignore[arg-type]
+
+    # Translates SymPy expressions into Z3 expressions.
+    #
+    # [Note: SympyToZ3]
+    # At the time of the translation, all free variables present in the
+    # SymPy expression being translated must be already mapped to a Z3
+    # integer variable.
+    class SympyToZ3:
+        OPERATOR_HANDLES = {"add", "mul", "eq", "ne", "lt", "gt", "le", "ge"}
+
+        def __init__(
+                self,
+                validator: "TranslationValidator",
+        ) -> None:
+            self._validator = validator
+            self._ops = _Z3Ops(self._validator)
+
+        def constant(self, value: Any, dtype: torch.dtype) -> z3.ExprRef:
+            if dtype is torch.int64:
+                return z3.IntVal(int(value))
+            if dtype is torch.double:
+                return z3.RealVal(float(value))
+            if dtype is torch.bool:
+                return z3.BoolVal(bool(value))
+            raise ValueError(f"unsupported dtype (SympyToZ3): {dtype}")
+
+        def truediv(self, numerator: z3.ArithRef, denominator: z3.ArithRef) -> z3.ArithRef:
+            return self._ops.div(numerator, denominator)
+
+        def floordiv(self, numerator: z3.ArithRef, denominator: z3.ArithRef) -> z3.ArithRef:
+            return self._ops.floordiv(numerator, denominator)
+
+        def div(self, numerator: z3.ArithRef, denominator: z3.ArithRef) -> z3.ArithRef:
+            return self._ops.floordiv(numerator, denominator)
+
+        def pow(self, base: z3.ArithRef, exp: z3.ArithRef) -> z3.ArithRef:
+            return self._ops.pow(base, exp)
+
+        def mod(self, p: z3.ArithRef, q: z3.ArithRef) -> z3.ArithRef:
+            return self._ops.mod(p, q)
+
+        def round(self, number: z3.ArithRef, ndigits: Optional[z3.ArithRef] = None) -> z3.ArithRef:
+            return self._ops.round(number, ndigits)
+
+        def __getattr__(self, name: str) -> Any:
+            REPLACEMENT = {
+                "and_": z3.And,
+                "or_": z3.Or,
+                "not_": z3.Not,
+                "floor": self._ops.floor,
+                "ceil": self._ops.ceil,
+                "minimum": self._ops.min,
+                "maximum": self._ops.max,
+            }
+
+            if name in REPLACEMENT:
+                return REPLACEMENT[name]
+            if name in self.OPERATOR_HANDLES:
+                return getattr(operator, name)
+            raise AttributeError(f"unhandled operator: {name}")
+
+        def run(self, expr: sympy.Basic) -> z3.ExprRef:
+            return sympy_interp(self, self._validator.symbols, expr)  # type: ignore[arg-type]
+
+    # Dynamo guards translation validator.
+    #
+    # [Note: TranslationValidator]
+    # Verifies whether the guards issued by 'ShapeEnv.produce_guards' are sound.
+    # That is: whether those (target) guards only yield TRUE whenever the original,
+    # unoptimized, (source) guards yield TRUE.
+    #
+    # More concretely, given 'source' and 'target' guard expressions, we wish to
+    # check whether the following expression is satisfiable:
+    #
+    # Not(And(source)) AND And(target)
+    #
+    # i.e. whether there is an assignment of the free variables where the opposite
+    # happens: target is TRUE, but source is FALSE.
+    class TranslationValidator:
+        def __init__(self) -> None:
+            log.debug("new instance")
+
+            # Mapping of SymPy symbols to Z3 variables.
+            self.symbols: Dict[sympy.Symbol, z3.ExprRef] = {}
+
+            # Set of source Z3 expressions.
+            # They represent the generated guards without any kind of
+            # simplification or transformation.
+            self._source_exprs: Set[z3.BoolRef] = set()
+
+            # Set of target Z3 expressions.
+            # They represent the actual checked guards at runtime. They might
+            # be simplified or transformed versions of the source guards.
+            self._target_exprs: Set[z3.BoolRef] = set()
+
+            # Set of Z3 expressions representing assertions over both the
+            # source and target expressions.
+            self._assertions: Set[z3.BoolRef] = set()
+
+        # Retrieves the corresponding Z3 variable.
+        def z3var(self, symbol: sympy.Symbol) -> z3.ExprRef:
+            assert symbol in self.symbols, f"Z3 variable not found for: {symbol}"
+            return self.symbols[symbol]
+
+        # Create a variable in Z3 of 'type' for 'symbol', if it doesn't already exist.
+        def add_var(self, symbol: sympy.Symbol, type: Type) -> z3.ExprRef:
+            if symbol in self.symbols:
+                return self.symbols[symbol]
+
+            log.debug("new variable: %s (%s)", symbol.name, type.__name__)
+
+            if type is int:
+                var = z3.Int(symbol.name)
+
+                # If 'symbol' is positive (SymPy assumption), we have to
+                # convey it to Z3 as well.
+                if symbol.is_positive:  # type: ignore[attr-defined]
+                    self._target_exprs.add(var > 0)
+            elif type is float:
+                var = z3.Real(symbol.name)
+            elif type is bool:
+                var = z3.Bool(symbol.name)
+            else:
+                raise RuntimeError(f"unsupported type for Z3 variable: {type}")
+
+            self.symbols[symbol] = var
+            return var
+
+        # Checks whether all symbols were already added.
+        def _check_freesymbols(self, e: sympy.Basic) -> None:
+            for s in e.free_symbols:
+                assert isinstance(s, sympy.Symbol)
+                # Call 'z3var' just to check whether there's already a
+                # Z3 variable corresponding to 's'.
+                self.z3var(s)
+
+
+        def to_z3_boolean_expr(self, e: sympy.Basic) -> z3.BoolRef:
+            z3expr = SympyToZ3(self).run(e)
+            assert isinstance(z3expr, z3.BoolRef), f"expected boolean expression. Got: {z3expr}"
+            return z3expr
+
+        def add_source_expr(self, e: z3.BoolRef) -> None:
+            if e not in self._source_exprs:
+                log.debug("add source guard: %s", z3str(e))
+            self._source_exprs.add(e)
+
+        def add_target_expr(self, e: sympy.Expr) -> None:
+            self._check_freesymbols(e)
+            z3expr = self.to_z3_boolean_expr(e)
+            if e not in self._target_exprs:
+                log.debug("add target guard: %s", z3str(z3expr))
+            self._target_exprs.add(z3expr)
+
+        def add_assertion(self, e: Union[z3.BoolRef, sympy.Basic]) -> None:
+            if isinstance(e, sympy.Basic):
+                self._check_freesymbols(e)
+                ref = self.to_z3_boolean_expr(e)
+            else:
+                ref = e
+            assert isinstance(ref, z3.BoolRef)
+            if ref not in self._assertions:
+                log.debug("add assertion: %s", z3str(ref))
+            self._assertions.add(ref)
+
+        def validate(self) -> None:
+            from torch._dynamo.utils import dynamo_timed
+
+            if len(self._source_exprs) == 0 or len(self._target_exprs) == 0:
+                # If there are no source/target expressions, there's nothing we really
+                # wish to prove. So, we just return.
+                return None
+
+            # Here, we use "QF_NRA" logic for the solver:
+            #   "Quantifier-free Non-linear Real Arithmetic".
+            #
+            # Most of the guards expressions have:
+            #   1. arithmetic between integer and reals
+            #   2. no quantifiers
+            #   3. potentially non-linear.
+            #
+            # Although there's also "QF_NIRA" (mixed integer-real arithmetic),
+            # "QF_NRA" seems to work better on 'dynamo/test_dynamic_shapes.py'.
+            solver = z3.SolverFor("QF_NRA")
+            # Set a timeout for finding a solution.
+            solver.set(timeout=translation_validation_timeout())
+
+            # Add all the assertions to the solver.
+            for assertion in self._assertions:
+                solver.add(assertion)
+
+            # "Is there any case where it's TRUE for the target expressions,
+            #  but FALSE for the source expressions?"
+            solver.add(z3.Not(z3.And(*self._source_exprs)))
+            solver.add(*self._target_exprs)
+
+            log.debug("translation validation: start")
+            r = dynamo_timed()(solver.check)()
+            if r == z3.sat:
+                # Target expressions are unsound.
+                # Log the found model and the source expressions that failed.
+                model = solver.model()
+                raise ValidationException(
+                    model, self._assertions, self._target_exprs,
+                    failed_source_exprs=[
+                        inp for inp in self._source_exprs if not model.evaluate(inp)
+                    ]
+                )
+            else:
+                if r == z3.unknown:
+                    # Could not find a solution. It didn't fail, but it also
+                    # didn't succeed. Canceling the validation execution (keyboard
+                    # interrupt) also gets to this branch.
+                    log.warning("translation validation: could not validate: got z3.unknown")
+                else:
+                    # Target expressions are sound.
+                    assert r == z3.unsat
+                    log.debug("translation validation: success")
+
+except ImportError:
+    _HAS_Z3 = False
+
+    __all__ = [
+        "translation_validation_enabled", "translation_validation_timeout",
+        "ValidationException", "BisectValidationException",
+    ]
+
+else:
+    _HAS_Z3 = True
+
+    __all__ = [
+        "z3str", "z3op", "PopulateValidator", "SympyToZ3", "TranslationValidator",
+        "translation_validation_enabled", "translation_validation_timeout",
+        "ValidationException", "BisectValidationException",
+    ]
+
+from torch.fx.experimental import _config as config
+
+def translation_validation_enabled() -> bool:
+    # Check every time this function is called, in case the Dynamo
+    # option is set but Z3 is not installed.
+    _assert_z3_installed_if_tv_set()
+    return _HAS_Z3 and config.translation_validation
+
+
+def translation_validation_timeout() -> int:
+    return config.translation_validation_timeout
+
+
+def _assert_z3_installed_if_tv_set():
+    assert _HAS_Z3 or not config.translation_validation, (
+        "translation validation requires Z3 package. Please, either install "
+        "z3-solver or disable translation validation."
+    )
+
+
+class ValidationException(TorchDynamoException):
+    def __init__(self, model, assertions, target_exprs, failed_source_exprs):
+        assert _HAS_Z3
+
+        def symbolstr(sym) -> str:
+            return f"{sym}: {model[sym]}"
+
+        def joinlines(xs) -> str:
+            return "\n".join(f"  ==> {x}" for x in xs)
+
+        model_str = joinlines(sorted(map(symbolstr, model)))
+        assertions_str = joinlines(sorted(map(z3str, assertions)))
+        target_exprs_str = joinlines(sorted(map(z3str, target_exprs)))
+        failed_source_exprs_str = joinlines(sorted(map(z3str, failed_source_exprs)))
+
+        self.msg = "translation validation failed."
+        self.details = f"""\
+Model:
+{model_str}
+
+Assertions:
+{assertions_str}
+
+Target Expressions:
+{target_exprs_str}
+
+Failed Source Expressions:
+{failed_source_exprs_str}"""
+
+    def __str__(self):
+        return f"{self.msg}\n\n{self.details}"
+
+
+class BisectValidationException(TorchDynamoException):
+    def __init__(self, validation_exc, expr, failed_action, traced_node):
+        self.msg = f"translation validation failed when {failed_action}: {expr}"
+        self.details = f"""\
+Failure occurred while running node:
+    {traced_node.format_node()}
+
+{validation_exc.details}"""
+
+    def __str__(self):
+        return f"{self.msg}\n\n{self.details}"
+
+# Checks when this module is loaded.
+_assert_z3_installed_if_tv_set()
+
+# Translation validation bisection.
+#
+# Bisect into the torch._assert nodes recorded in the shape_env FX graph, and raise
+# the earliest ValidationException.
+#
+# As guards are added by ShapeEnv.evaluate_expr calls, simplification errors may
+# occur silently. This function tries to pin down exactly the point at which
+# things went wrong from a validation perspective.
+def bisect(shape_env):
+    from torch.fx.experimental.symbolic_shapes import ShapeEnv, SHAPEENV_EVENT_KEY, CURRENT_NODE_KEY
+    from torch.fx.experimental.recording import FakeTensorMeta, ShapeEnvEvent, replay_shape_env_events
+
+    events = shape_env.events
+
+    # Retrieves the ShapeEnvEvent associated with node.
+    def get_node_event(node: torch.fx.Node) -> ShapeEnvEvent:
+        assert SHAPEENV_EVENT_KEY in node.meta
+        return events[node.meta[SHAPEENV_EVENT_KEY]]
+
+    # Creates a new instance of 'fake', updating every symbolic value's ShapeEnv
+    # reference to the one given as an argument.
+    #
+    # This is needed so as not to simplify a symbolic expression using a ShapeEnv
+    # "from the future", where it may have a different set of replacements.
+    def new_with_shape_env(shape_env: ShapeEnv, fake) -> Any:
+        if isinstance(fake, int):
+            return fake
+        if isinstance(fake, torch.SymInt):
+            return torch.SymInt(fake.node.with_shape_env(shape_env))
+        assert isinstance(fake, FakeTensorMeta)
+        return FakeTensorMeta(
+            tuple(new_with_shape_env(shape_env, s) for s in fake.size()),
+            tuple(new_with_shape_env(shape_env, s) for s in fake.stride()),
+            new_with_shape_env(shape_env, fake.storage_offset()),
+            fake.is_nested,
+        )
+
+    # Checks whether the given shape_env fails when produce_guards is called.
+    def check_shapeenv_fails(shape_env: ShapeEnv, tracked_fakes: Optional[List[Any]]) -> Optional[ValidationException]:
+        assert tracked_fakes is not None
+        try:
+            # This produce_guards call is a best-effort replication, since we
+            # don't populate EqualityConstraint list. Reason: we would also have
+            # to save OutputGraph.tracked_fakes_id_to_source.
+            shape_env.produce_guards(
+                [new_with_shape_env(shape_env, a.fake) for a in tracked_fakes],
+                [a.source for a in tracked_fakes],
+                input_contexts=[a.symbolic_context for a in tracked_fakes],
+            )
+            return None
+        except ValidationException as e:
+            return e
+
+    # Checks whether the ShapeEnv reconstructed by replaying the events until
+    # node is created fails when produce_guards is called.
+    def check_node_fails(node: torch.fx.Node) -> Optional[ValidationException]:
+        number = node.meta[SHAPEENV_EVENT_KEY]
+        # Reconstruct the ShapeEnv by replaying the events up to (and including) index 'number'.
+        shape_env = replay_shape_env_events(events[:number + 1])
+        shape_env.graph.lint()
+        return check_shapeenv_fails(shape_env, events[number].tracked_fakes)
+
+    last_exception = check_shapeenv_fails(shape_env, shape_env._snapshot_tracked_fakes())
+
+    if not last_exception:
+        # We don't actually fail due to a produce_guards call.
+        # Stop and don't bisect.
+        log.info("translation validation succeeded: no errors found.")
+        return
+
+    if not shape_env.should_record_events or config.translation_validation_no_bisect:
+        # Bisection is off.
+        # Return the last ValidationException we got.
+        raise last_exception
+
+    # Cache the raised exception (if any) at each bisection point.
+    exception = {}
+
+    # Bisection happens on the assertion nodes of the recorded FX graph for
+    # dynamic shapes.
+    assert_nodes = [node for node in shape_env.graph.nodes if node.target == torch._assert]
+
+    # Preparing the indices for binary search.
+    left, mid, right = 0, 0, len(assert_nodes) - 1
+
+    while left < right:
+        mid = (left + right) // 2
+
+        node = assert_nodes[mid]
+        log.debug("bisecting at %s: %s", mid, get_node_event(node))
+
+        # Check whether the new shape_env raises a ValidationException or not.
+        exception[mid] = check_node_fails(node)
+
+        if exception[mid]:
+            right = mid
+        else:
+            left = mid + 1
+
+    assert left in exception and isinstance(exception[left], ValidationException)
+
+    node = assert_nodes[left]
+    event = get_node_event(node)
+
+    if event.is_evaluate_expr():
+        failed_action = "evaluating"
+    else:
+        assert event.is_defer_runtime_assert(), f"unexpected event type: {event}"
+        failed_action = "adding runtime assert"
+
+    args = event.args
+    assert args is not None
+    assert len(args) >= 2, (
+        f"bisecting expects {event.name} to have at least 2 positional arguments. "
+        f"Got: {len(args)}"
+    )
+    assert isinstance(args[1], sympy.Basic), (
+        f"bisecting expects {event.name} to have a SymPy expression as its second argument. "
+        f"Got: {type(args[1])}"
+    )
+
+    raise BisectValidationException(
+        exception[left],
+        expr=args[1],
+        failed_action=failed_action,
+        traced_node=node.meta[CURRENT_NODE_KEY],
+    )
diff --git a/MLPY/Lib/site-packages/torch/fx/graph.py b/MLPY/Lib/site-packages/torch/fx/graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab867dc732d02c5bc4b6e85d88b6866d611e31f1
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/graph.py
@@ -0,0 +1,1653 @@
+from collections import defaultdict
+from .node import Node, Argument, Target, map_arg, _type_repr, _get_qualified_name
+import torch.utils._pytree as pytree
+from . import _pytree as fx_pytree
+from ._compatibility import compatibility
+
+import contextlib
+from typing import TYPE_CHECKING, Callable, Any, List, Dict, NamedTuple, Optional, Tuple, Set, FrozenSet, Type
+from dataclasses import dataclass
+from contextlib import contextmanager
+import copy
+import enum
+import torch
+import keyword
+import re
+import builtins
+import math
+import warnings
+import inspect
+
+__all__ = ["PythonCode", "CodeGen", "Graph"]
+
+if TYPE_CHECKING:
+    from .graph_module import GraphModule  # noqa: F401
+    from ._symbolic_trace import Tracer   # noqa: F401
+
+
+# Mapping of builtins to their `typing` equivalent.
+_origin_type_map = {
+    list: List,
+    dict: Dict,
+    set: Set,
+    frozenset: FrozenSet,
+    tuple: Tuple,
+}
+
+
+# Signature for functions that transform the body (`list[str]`) of the
+# generated code.
+TransformCodeFunc = Callable[[List[str]], List[str]]
+
+
+class _CustomBuiltin(NamedTuple):
+    """Additional objs that we add to every graph's globals.
+
+    The repr() for some standard library objects is not valid Python code without
+    an import. For common objects of this sort, we bundle them in the globals of
+    every FX graph.
+    """
+    # How to import this object from the standard library.
+    import_str: str
+    # The actual object, produced from that import string.
+    obj: Any
+
+_custom_builtins: Dict[str, _CustomBuiltin] = {}
+
+
+def _register_custom_builtin(name: str, import_str: str, obj: Any):
+    _custom_builtins[name] = _CustomBuiltin(import_str, obj)
+
+
+_register_custom_builtin('inf', 'from math import inf', math.inf)
+_register_custom_builtin('nan', 'from math import nan', math.nan)
+_register_custom_builtin('NoneType', 'NoneType = type(None)', type(None))
+_register_custom_builtin('torch', 'import torch', torch)
+_register_custom_builtin('device', 'from torch import device', torch.device)
+_register_custom_builtin('fx_pytree', 'import torch.fx._pytree as fx_pytree', fx_pytree)
+_register_custom_builtin('pytree', 'import torch.utils._pytree as pytree', pytree)
+
+
+def _is_magic(x: str) -> bool:
+    return x.startswith('__') and x.endswith('__')
+
+
+def _snake_case(s: str) -> str:
+    """
+    Transforms the given string ``s`` to a Python-style variable name
+
+    Examples:
+        ``mod.snake_case`` -> ``mod.snake_case``
+        ``mod.pascalCase`` -> ``mod.pascal_case``
+        ``mod.ALL_CAPS`` -> ``mod.all_caps``
+    """
+    chars = []
+    prev_lower = False
+    for c in s:
+        if prev_lower and c.isupper():
+            chars.append('_')
+        chars.append(c.lower())
+        prev_lower = c.islower()
+    return ''.join(chars)
+
+
+def _is_from_torch(obj: Any) -> bool:
+    module_name = getattr(obj, '__module__', None)
+    if module_name is not None:
+        base_module = module_name.partition('.')[0]
+        return (
+            base_module == 'torch' and
+            not module_name.startswith("torch._dynamo.") and
+            not module_name.startswith("torch._inductor.")
+        )
+
+    name = getattr(obj, '__name__', None)
+    # Exclude `torch` itself: `torch.torch.torch.torch` also resolves to the torch
+    # module, so the attribute check below would otherwise match it.
+    if name is not None and name != 'torch':
+        for guess in [torch, torch.nn.functional]:
+            if getattr(guess, name, None) is obj:
+                return True
+
+    return False
+
+
+class _Namespace:
+    """A context for associating names uniquely with objects.
+
+    The following invariants are enforced:
+    - Each object gets a single name.
+    - Each name is unique within a given namespace.
+    - Names generated do not shadow builtins, unless the object is indeed that builtin.
+    """
+    def __init__(self):
+        self._obj_to_name: Dict[Any, str] = {}
+        self._unassociated_names = set()
+        self._used_names: Set[str] = set()
+        self._base_count: Dict[str, int] = defaultdict(int)
+
+        self._illegal_char_regex = re.compile('[^0-9a-zA-Z_]+')
+        self._name_suffix_regex = re.compile(r"(.*)_(\d+)$")
+
+    def create_name(self, candidate: str, obj: Optional[Any]) -> str:
+        """Create a unique name.
+
+        Arguments:
+            candidate: used as the basis for the unique name, relevant to the user.
+            obj: If not None, an object that will be associated with the unique name.
+        """
+        if obj is not None and obj in self._obj_to_name:
+            return self._obj_to_name[obj]
+
+        # delete all characters that are illegal in a Python identifier
+        candidate = self._illegal_char_regex.sub('_', candidate)
+
+        if not candidate:
+            candidate = '_unnamed'
+
+        if candidate[0].isdigit():
+            candidate = f'_{candidate}'
+
+        match = self._name_suffix_regex.match(candidate)
+        if match is None:
+            base = candidate
+            num = None
+        else:
+            base, num_str = match.group(1, 2)
+            num = int(num_str)
+
+        candidate = base if num is None else f'{base}_{num}'
+        if not num:
+            num = self._base_count[base]
+
+        while candidate in self._used_names or self._is_illegal_name(candidate, obj):
+            num += 1
+            candidate = f'{base}_{num}'
+
+        self._used_names.add(candidate)
+        self._base_count[base] = num
+        if obj is None:
+            self._unassociated_names.add(candidate)
+        else:
+            self._obj_to_name[obj] = candidate
+        return candidate
+
+    def associate_name_with_obj(self, name: str, obj: Any):
+        """Associate a unique name with an object.
+
+        Neither `name` nor `obj` should be associated already.
+        """
+        assert obj not in self._obj_to_name
+        assert name in self._unassociated_names
+        self._obj_to_name[obj] = name
+        self._unassociated_names.remove(name)
+
+    def _is_illegal_name(self, name: str, obj: Any) -> bool:
+        # 1. keywords are never allowed as names.
+        if name in keyword.kwlist:
+            return True
+
+        # 2. Can't shadow a builtin name, unless you *are* that builtin.
+        if name in builtins.__dict__:
+            return obj is not builtins.__dict__[name]
+
+        # 3. Can't shadow our custom builtins either
+        if name in _custom_builtins:
+            return obj is not _custom_builtins[name].obj
+
+        return False
+
+    def _rename_object(self, obj: Any, name: str):
+        assert obj in self._obj_to_name
+        self._obj_to_name[obj] = name
+        self._used_names.add(name)
+
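+# As a rough illustration of _Namespace behaviour (hypothetical values):
+#
+#     ns = _Namespace()
+#     ns.create_name("add", None)   # -> "add"
+#     ns.create_name("add", None)   # -> "add_1"  (deduplicated)
+#     ns.create_name("for", None)   # -> "for_1"  (keywords are never used as names)
+#     ns.create_name("1x", None)    # -> "_1x"    (identifiers cannot start with a digit)
+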
+dtype_abbrs = {
+    torch.bfloat16: 'bf16',
+    torch.float64: 'f64',
+    torch.float32: 'f32',
+    torch.float16: 'f16',
+    torch.float8_e4m3fn: 'f8e4m3fn',
+    torch.float8_e5m2: 'f8e5m2',
+    torch.float8_e4m3fnuz: 'f8e4m3fnuz',
+    torch.float8_e5m2fnuz: 'f8e5m2fnuz',
+    torch.complex32: 'c32',
+    torch.complex64: 'c64',
+    torch.complex128: 'c128',
+    torch.int8: 'i8',
+    torch.int16: 'i16',
+    torch.int32: 'i32',
+    torch.int64: 'i64',
+    torch.bool: 'b8',
+    torch.uint8: 'u8',
+    torch.uint32: 'u32',
+    torch.uint64: 'u64',
+}
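+
+# These abbreviations are used for the verbose per-node annotations emitted by
+# CodeGen below, e.g. a float32 tensor of shape (3, 4) is annotated as "f32[3, 4]".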
+
+@compatibility(is_backward_compatible=True)
+@dataclass
+class PythonCode:
+    """
+    Represents all the information necessary to exec or save a graph as Python code.
+    """
+    # Python source code for the forward function definition.
+    src: str
+    # Values in global scope during execution of `src`.
+    globals: Dict[str, Any]
+    # Optional mapping from the forward function's line number to
+    # node index.
+    _lineno_map: Optional[Dict[int, Optional[int]]]
+
+
+def _format_target(base: str, target: str) -> str:
+    elems = target.split('.')
+    r = base
+    for e in elems:
+        if not e.isidentifier():
+            r = f'getattr({r}, "{e}")'
+        else:
+            r = f'{r}.{e}'
+    return r
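+
+# For example (illustrative only): _format_target('self', 'linear.weight') gives
+# 'self.linear.weight', while a path component that is not a valid identifier falls
+# back to getattr, e.g. _format_target('self', 'layers.0.weight') gives
+# 'getattr(self.layers, "0").weight'.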
+
+class _InsertPoint:
+    def __init__(self, graph, new_insert):
+        self.graph = graph
+        self.orig_insert, graph._insert = graph._insert, new_insert
+
+    def __enter__(self):
+        pass
+
+    def __exit__(self, type, value, tb):
+        self.graph._insert = self.orig_insert
+
+class _node_list:
+    def __init__(self, graph: 'Graph', direction: str = '_next'):
+        assert direction in ['_next', '_prev']
+        self.graph = graph
+        self.direction = direction
+
+    def __len__(self):
+        return self.graph._len
+
+    def __iter__(self):
+        root = self.graph._root
+        if self.direction == "_next":
+            cur = root._next
+            while cur is not root:
+                if not cur._erased:
+                    yield cur
+                cur = cur._next
+        else:
+            assert self.direction == "_prev"
+            cur = root._prev
+            while cur is not root:
+                if not cur._erased:
+                    yield cur
+                cur = cur._prev
+
+    def __reversed__(self):
+        return _node_list(self.graph, '_next' if self.direction == '_prev' else '_prev')
+
+class _PyTreeInfo(NamedTuple):
+    """
+    Contains extra info stored when we're using Pytrees
+    """
+    orig_args: List[str]
+    in_spec: pytree.TreeSpec
+    out_spec: Optional[pytree.TreeSpec]
+
+@dataclass(frozen=True)
+class _ParsedStackTrace:
+    """
+    Represents the top-most frame of a parsed stack trace
+    """
+    file: str
+    lineno: str
+    name: str
+    code: str
+
+# get File:lineno code from stack_trace
+def _parse_stack_trace(stack_trace: str):
+    if stack_trace is None:
+        return None
+    pattern = re.compile(r"^File \"(.+)\", line (\d+), in (.+)$")
+    lines = stack_trace.strip().split('\n')
+    # stacktrace should have innermost frame last, so we
+    # iterate backwards to find the first line that starts
+    # with 'File '
+    summary_str = ""
+    for idx in range(len(lines) - 2, -1, -1):
+        line = lines[idx].strip()
+        matches = pattern.match(line)
+        if matches:
+            file = matches.group(1)
+            lineno = matches.group(2)
+            name = matches.group(3)
+            # next line should be the code
+            code = lines[idx + 1].strip()
+            return _ParsedStackTrace(file, lineno, name, code)
+    return None
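+
+# For instance, given a (hypothetical) stack trace whose last two lines are
+#   File "example.py", line 10, in forward
+#     return x + 1
+# _parse_stack_trace returns
+# _ParsedStackTrace(file="example.py", lineno="10", name="forward", code="return x + 1").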
+
+@compatibility(is_backward_compatible=False)
+class CodeGen:
+    def __init__(self):
+        self._body_transformer: Optional[TransformCodeFunc] = None
+        self._func_name: str = "forward"
+
+    def gen_fn_def(self, free_vars: List[str], maybe_return_annotation: str) -> str:
+        """
+        Given the free variables and a return annotation, generates the beginning of the FX function.
+        By default, `gen_fn_def(['a', 'b'], '') == 'def {self._func_name}(a, b):'`
+        """
+        # If the original function didn't have self as its first argument, we
+        # would have added it.
+        if len(free_vars) == 0 or free_vars[0] != 'self':
+            free_vars.insert(0, 'self')
+        return f"def {self._func_name}({', '.join(free_vars)}){maybe_return_annotation}:"
+
+    def generate_output(self, output_args: Argument) -> str:
+        """
+        Given the output arguments, generates the return statement of the FX function.
+        Note: The returned statement should not be indented.
+        """
+        return f'return {repr(output_args)}'
+
+    def process_inputs(self, *args: Any) -> Any:
+        """
+        Transforms the inputs so that the graph can take them as arguments, as
+        non-default codegen may result in the inputs to the function being
+        different from the inputs to the graph.
+
+        If the graph were directly runnable, this invariant should hold:
+        `f.graph.process_outputs(f.graph(*f.graph.process_inputs(*inputs))) == f(*inputs)`
+        """
+        return args
+
+    def process_outputs(self, outputs: Any) -> Any:
+        """
+        Transforms the outputs of the graph so that they match what the codegen's function returns.
+
+        See ``process_inputs`` for more details.
+        """
+        return outputs
+
+    def additional_globals(self) -> List[Tuple[str, Any]]:
+        """
+        If your codegen uses extra global values, add tuples of (identifier, reference to the value) here.
+        For example, return [('List', typing.List)] if you need ``List`` in the global context.
+        """
+        return []
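+
+    # A minimal sketch of a custom codegen (hypothetical subclass that only changes
+    # the generated function name):
+    #
+    #     class RenamedCodeGen(CodeGen):
+    #         def __init__(self):
+    #             super().__init__()
+    #             self._func_name = "my_forward"
+    #
+    #     graph._codegen = RenamedCodeGen()  # then recompile the owning GraphModule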
+
+    def _gen_python_code(
+        self, nodes, root_module: str, namespace: _Namespace, *, verbose: bool = False,
+    ) -> PythonCode:
+        free_vars: List[str] = []
+        body: List[str] = []
+        globals_: Dict[str, Any] = {}
+        wrapped_fns: Dict[str, None] = {}
+
+        # Wrap string in list to pass by reference
+        maybe_return_annotation : List[str] = ['']
+
+        def add_global(name_hint: str, obj: Any):
+            """Add an obj to be tracked as a global.
+
+            We call this for names that reference objects external to the
+            Graph, like functions or types.
+
+            Returns: the global name that should be used to reference 'obj' in generated source.
+            """
+            if _is_from_torch(obj) and obj != torch.device:  # to support registering torch.device
+                # HACK: workaround for how torch custom ops are registered. We
+                # can't import them like normal modules so they must retain their
+                # fully qualified name.
+                return _get_qualified_name(obj)
+
+            # normalize the name hint to get a proper identifier
+            global_name = namespace.create_name(name_hint, obj)
+
+            if global_name in globals_:
+                assert globals_[global_name] is obj
+                return global_name
+            globals_[global_name] = obj
+            return global_name
+
+        # Pre-fill the globals table with registered builtins.
+        for name, (_, obj) in _custom_builtins.items():
+            add_global(name, obj)
+
+        def type_repr(o : Any):
+            if o == ():
+                # Empty tuple is used for empty tuple type annotation Tuple[()]
+                return '()'
+
+            typename = _type_repr(o)
+
+            if hasattr(o, '__origin__'):
+                # This is a generic type, e.g. typing.List[torch.Tensor]
+                origin_type = _origin_type_map.get(o.__origin__, o.__origin__)
+                origin_typename = add_global(_type_repr(origin_type), origin_type)
+
+                if hasattr(o, '__args__'):
+                    # Assign global names for each of the inner type variables.
+                    args = [type_repr(arg) for arg in o.__args__]
+
+                    if len(args) == 0:
+                        # Bare type, such as `typing.Tuple` with no subscript
+                        # This code path is used in Python < 3.9
+                        return origin_typename
+
+                    return f'{origin_typename}[{",".join(args)}]'
+                else:
+                    # Bare type, such as `typing.Tuple` with no subscript
+                    # This code path is used in Python 3.9+
+                    return origin_typename
+
+            # Common case: this is a regular module name like 'foo.bar.baz'
+            return add_global(typename, o)
+
+        def _get_repr(arg: Any) -> str:
+            # Handle NamedTuples (if it has `_fields`) via add_global.
+            if isinstance(arg, tuple) and hasattr(arg, '_fields'):
+                qualified_name = _get_qualified_name(type(arg))
+                global_name = add_global(qualified_name, type(arg))
+                return f"{global_name}{repr(tuple(arg))}"
+            elif isinstance(arg, torch._ops.OpOverload):
+                qualified_name = _get_qualified_name(arg)
+                global_name = add_global(qualified_name, arg)
+                return f"{global_name}"
+            elif isinstance(arg, enum.Enum):
+                cls = arg.__class__
+                clsname = add_global(cls.__name__, cls)
+                return f"{clsname}.{arg.name}"
+            return repr(arg)
+
+        def _format_args(args: Tuple[Argument, ...], kwargs: Dict[str, Argument]) -> str:
+            args_s = ', '.join(_get_repr(a) for a in args)
+            kwargs_s = ', '.join(f'{k} = {_get_repr(v)}' for k, v in kwargs.items())
+            if args_s and kwargs_s:
+                return f'{args_s}, {kwargs_s}'
+            return args_s or kwargs_s
+
+        # Run through reverse nodes and record the first instance of a use
+        # of a given node. This represents the *last* use of the node in the
+        # execution order of the program, which we will use to free unused
+        # values
+        node_to_last_use : Dict[Node, Node] = {}
+        user_to_last_uses : Dict[Node, List[Node]] = {}
+
+        def register_last_uses(n : Node, user : Node):
+            if n not in node_to_last_use:
+                node_to_last_use[n] = user
+                user_to_last_uses.setdefault(user, []).append(n)
+
+        for node in reversed(nodes):
+            map_arg(node.args, lambda n: register_last_uses(n, node))
+            map_arg(node.kwargs, lambda n: register_last_uses(n, node))
+
+        def delete_unused_values(user : Node):
+            """
+            Delete values after their last use. This ensures that values that are
+            not used in the remainder of the code are freed and the memory usage
+            of the code is optimal.
+            """
+            if user.op == 'placeholder':
+                return
+            if user.op == 'output':
+                body.append('\n')
+                return
+            nodes_to_delete = user_to_last_uses.get(user, [])
+            if len(nodes_to_delete):
+                to_delete_str = ' = '.join([repr(n) for n in nodes_to_delete] + ['None'])
+                body.append(f';  {to_delete_str}\n')
+            else:
+                body.append('\n')
+
+        prev_stacktrace = None
+
+        def append_stacktrace_summary(node : Node):
+            """
+            Append a summary of the stacktrace to the generated code. This is
+            useful for debugging.
+            """
+            nonlocal prev_stacktrace
+
+            if node.op not in {'placeholder', 'output'}:
+                if node.stack_trace:
+                    if node.stack_trace != prev_stacktrace:
+                        prev_stacktrace = node.stack_trace
+                        summary_str = ""
+
+                        parsed_stack_trace = _parse_stack_trace(node.stack_trace)
+
+                        if parsed_stack_trace is not None:
+                            lineno = parsed_stack_trace.lineno
+                            code = parsed_stack_trace.code
+                            name = parsed_stack_trace.name
+                            summary_str = f'File: {parsed_stack_trace.file}:{lineno} in {name}, code: {code}'
+
+                        body.append(f'\n# {summary_str}\n')
+                elif prev_stacktrace != "":
+                    prev_stacktrace = ""
+                    body.append('\n# No stacktrace found for following nodes\n')
+
+        def stringify_shape(shape : torch.Size) -> str:
+            return f"[{', '.join(str(x) for x in shape)}]"
+
+        def emit_node(node : Node):
+            maybe_type_annotation = '' if node.type is None else f' : {type_repr(node.type)}'
+
+            if verbose:
+                # override annotation with more detailed information
+                from torch._subclasses.fake_tensor import FakeTensor
+                from torch.fx.experimental.proxy_tensor import py_sym_types
+                from torch.fx.passes.shape_prop import TensorMetadata
+
+                meta_val = node.meta.get('val', node.meta.get('tensor_meta', None))
+
+                # use string as annotation, to make it valid python code
+                if isinstance(meta_val, FakeTensor):
+                    maybe_type_annotation = f': "{dtype_abbrs[meta_val.dtype]}{stringify_shape(meta_val.shape)}"'
+                elif isinstance(meta_val, py_sym_types):
+                    maybe_type_annotation = f': "Sym({meta_val})"'
+                elif isinstance(meta_val, TensorMetadata):
+                    maybe_type_annotation = f': "{dtype_abbrs[meta_val.dtype]}{stringify_shape(meta_val.shape)}"'
+
+            if node.op == 'placeholder':
+                assert isinstance(node.target, str)
+                maybe_default_arg = '' if not node.args else f' = {_get_repr(node.args[0])}'
+                free_vars.append(f'{node.target}{maybe_type_annotation}{maybe_default_arg}')
+                raw_name = node.target.replace('*', '')
+                if raw_name != repr(node):
+                    body.append(f'{repr(node)} = {raw_name}\n')
+                return
+            elif node.op == 'call_method':
+                assert isinstance(node.target, str)
+                body.append(
+                    f'{repr(node)}{maybe_type_annotation} = {_format_target(_get_repr(node.args[0]), node.target)}'
+                    f'({_format_args(node.args[1:], node.kwargs)})')
+                return
+            elif node.op == 'call_function':
+                assert callable(node.target)
+                # pretty print operators
+                if getattr(node.target, "__module__", "") == '_operator' and node.target.__name__ in magic_methods:
+                    assert isinstance(node.args, tuple)
+                    body.append(f'{repr(node)}{maybe_type_annotation} = '
+                                f'{magic_methods[node.target.__name__].format(*(_get_repr(a) for a in node.args))}')
+                    return
+
+                # pretty print inplace operators; required for jit.script to work properly
+                # not currently supported in normal FX graphs, but generated by torchdynamo
+                if getattr(node.target, "__module__", "") == '_operator' and node.target.__name__ in inplace_methods:
+                    body.append(f'{inplace_methods[node.target.__name__].format(*(_get_repr(a) for a in node.args))};  '
+                                f'{repr(node)}{maybe_type_annotation} = {_get_repr(node.args[0])}')
+                    return
+
+                qualified_name = _get_qualified_name(node.target)
+                global_name = add_global(qualified_name, node.target)
+                # Special case for getattr: node.args could have 2 or 3 arguments.
+                # 2 arguments: plain attribute access; 3 arguments: fall through to a
+                # getattr call with a default value.
+                if global_name == 'getattr' and \
+                   isinstance(node.args, tuple) and \
+                   isinstance(node.args[1], str) and \
+                   node.args[1].isidentifier() and \
+                   len(node.args) == 2:
+                    body.append(f'{repr(node)}{maybe_type_annotation} = {_format_target(_get_repr(node.args[0]), node.args[1])}')
+                    return
+                body.append(f'{repr(node)}{maybe_type_annotation} = {global_name}({_format_args(node.args, node.kwargs)})')
+                if node.meta.get('is_wrapped', False):
+                    wrapped_fns.setdefault(global_name)
+                return
+            elif node.op == 'call_module':
+                assert isinstance(node.target, str)
+                body.append(f'{repr(node)}{maybe_type_annotation} = '
+                            f'{_format_target(root_module, node.target)}({_format_args(node.args, node.kwargs)})')
+                return
+            elif node.op == 'get_attr':
+                assert isinstance(node.target, str)
+                body.append(f'{repr(node)}{maybe_type_annotation} = {_format_target(root_module, node.target)}')
+                return
+            elif node.op == 'output':
+                if node.type is not None:
+                    maybe_return_annotation[0] = f" -> {type_repr(node.type)}"
+                body.append(self.generate_output(node.args[0]))
+                return
+            raise NotImplementedError(f'node: {node.op} {node.target}')
+
+        for i, node in enumerate(nodes):
+            # NOTE: emit_node does not emit a string with newline. It depends
+            # on delete_unused_values to append one
+            if verbose:
+                append_stacktrace_summary(node)
+            # emit a counter comment to keep track of
+            # node index, which will be deleted later
+            # after going through _body_transformer
+            body.append(f"# COUNTER: {i}\n")
+            emit_node(node)
+            delete_unused_values(node)
+
+        if len(body) == 0:
+            # If the Graph has no non-placeholder nodes, no lines for the body
+            # have been emitted. To continue to have valid Python code, emit a
+            # single pass statement
+            body.append('pass\n')
+
+        if len(wrapped_fns) > 0:
+            wrap_name = add_global('wrap', torch.fx.wrap)
+            wrap_stmts = '\n'.join([f'{wrap_name}("{name}")' for name in wrapped_fns])
+        else:
+            wrap_stmts = ''
+
+        if self._body_transformer:
+            body = self._body_transformer(body)
+
+        for name, value in self.additional_globals():
+            add_global(name, value)
+
+        prologue = self.gen_fn_def(free_vars, maybe_return_annotation[0])
+
+        # remove counter and generate lineno to node index mapping
+        lineno_map: Dict[int, Optional[int]] = {}
+        prologue_len = prologue.count('\n') + 1
+        new_lines: List[str] = []
+        cur_idx = None
+        for line in ''.join(body).split('\n'):
+            counter = re.search(r"# COUNTER: (\d+)", line)
+            if counter and counter.group(1) is not None:
+                cur_idx = int(counter.group(1))
+            else:
+                lineno_map[len(new_lines) + prologue_len] = cur_idx
+                new_lines.append(line)
+
+        code = "\n".join(new_lines).lstrip('\n')
+        code = '\n'.join('    ' + line for line in code.split('\n'))
+
+        fn_code = f"""
+{wrap_stmts}
+
+{prologue}
+{code}"""
+        return PythonCode(fn_code, globals_, _lineno_map=lineno_map)
+
+
+# Ideally, we'd like to refactor all of the pytree logic into this codegen
+# class. Unfortunately, there are 3 areas we currently need extra logic in FX.
+# 1. In the initial symbolic trace, the pytree logic is tied up with `concrete_args`.
+# 2. In the FX graph, we need to access 2 attributes - in_spec and out_spec.
+#    Since we can't access .graph within the FX forward, we need to copy the attribute to the module.
+# 3. We currently can't register the pytree imports with `add_global` - not sure why.
+class _PyTreeCodeGen(CodeGen):
+    def __init__(self, pytree_info: _PyTreeInfo):
+        super().__init__()
+        self.pytree_info: _PyTreeInfo = pytree_info
+
+    def process_inputs(self, *inputs: Any) -> Any:
+        flat_args = pytree.arg_tree_leaves(*inputs)
+        return flat_args
+
+    def process_outputs(self, out: Any) -> Any:
+        if self.pytree_info is None or self.pytree_info.out_spec is None:
+            return out
+        if not isinstance(out, (list, tuple)):
+            out = [out]
+        assert self.pytree_info.out_spec is not None
+        return pytree.tree_unflatten(out, self.pytree_info.out_spec)
+
+    def gen_fn_def(self, free_vars, maybe_return_annotation):
+        # Given a user function/model:
+        #   myargs = [myargs0, myargs1]
+        #   mykwargs = {'mykwargs0': ..., 'mykwargs1': ...}
+        #   def forward(self, mypos, *myargs, mykey=None, **mykwargs):
+        #
+        # The generated code flattens all keywords into positional arguments for `forward()`
+        #   e.g. forward(self, mypos, myargs0, myargs1, mykey, mykwargs0, mykwargs1):
+        #
+        # Within `forward`, `tree_flatten_spec` still parses args and kwargs separately
+        #   e.g. tree_flatten_spec(([mypos, myargs0, myargs1],
+        #                           {'mykey':mykey, 'mykwargs0':mykwargs0, 'mykwargs1':mykwargs1}),
+        #                          self._in_spec)
+        #
+        # If the user function/model does not have keywords, the dict is suppressed from tree_flatten_spec
+        #   e.g. tree_flatten_spec([mypos, myargs0, myargs1], self._in_spec)
+        if self.pytree_info is None:
+            return super().gen_fn_def(free_vars, maybe_return_annotation)
+
+        fn_args = self.pytree_info.orig_args
+        has_orig_self = (fn_args[0] == 'self') if len(fn_args) > 0 else False
+        if has_orig_self:
+            free_vars.insert(0, 'self')
+        fn_definition = super().gen_fn_def(fn_args[:], maybe_return_annotation)
+
+        if len(free_vars) > 0:  # pytree has placeholders in it
+            # when kwargs is present, in_spec is tuple(args, kwargs)
+            has_args_kwargs_tuple = self.pytree_info.in_spec.type == tuple and \
+                self.pytree_info.in_spec.num_children == 2 and \
+                self.pytree_info.in_spec.children_specs[0].type == tuple and \
+                self.pytree_info.in_spec.children_specs[1].type == dict
+            fn_kwargs = '{}'
+            fn_signature = f"[{', '.join(fn_args)}], self._in_spec"
+            if has_args_kwargs_tuple:
+                count_args = self.pytree_info.in_spec.children_specs[0].num_children
+                fn_args = self.pytree_info.orig_args[:count_args]
+                fn_kwargs = '{' + ', '.join(f"'{k}':{v}" for k, v in zip(
+                                  self.pytree_info.in_spec.children_specs[1].context,
+                                  self.pytree_info.orig_args[count_args:])) + '}'
+                fn_signature = f"([{', '.join(fn_args)}], {fn_kwargs}), self._in_spec"
+
+            # in Python, `var1: annotation1, var2: annotation2 = function_call()` is invalid.
+            # we need to split it to two lines:
+            # one for annotation: `var1: annotation1; var2: annotation2;` (note the semicolon)
+            # one for code: `var1, var2, = function_call()`
+            without_annotation = [x.split(":")[0] for x in free_vars]
+            has_annotation = [x + "; " for x in free_vars if ":" in x]
+            if len(has_annotation) > 0:
+                fn_definition += "\n    " + "".join(has_annotation) + "\n"
+            fn_definition += f"""
+    {', '.join(without_annotation)}, = fx_pytree.tree_flatten_spec({fn_signature})"""
+        return fn_definition
+
+    def generate_output(self, output_args):
+        if self.pytree_info and self.pytree_info.out_spec:
+            return f'return pytree.tree_unflatten({repr(output_args)}, self._out_spec)'
+        else:
+            return super().generate_output(output_args)
+
+@compatibility(is_backward_compatible=True)
+class Graph:
+    """
+    ``Graph`` is the main data structure used in the FX Intermediate Representation.
+    It consists of a series of ``Node`` s, each representing callsites (or other
+    syntactic constructs). The list of ``Node`` s, taken together, constitute a
+    valid Python function.
+
+    For example, the following code
+
+    .. code-block:: python
+
+        import torch
+        import torch.fx
+
+        class MyModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.param = torch.nn.Parameter(torch.rand(3, 4))
+                self.linear = torch.nn.Linear(4, 5)
+
+            def forward(self, x):
+                return torch.topk(torch.sum(self.linear(x + self.linear.weight).relu(), dim=-1), 3)
+
+        m = MyModule()
+        gm = torch.fx.symbolic_trace(m)
+
+    Will produce the following Graph::
+
+        print(gm.graph)
+
+    .. code-block:: text
+
+        graph(x):
+            %linear_weight : [num_users=1] = self.linear.weight
+            %add_1 : [num_users=1] = call_function[target=operator.add](args = (%x, %linear_weight), kwargs = {})
+            %linear_1 : [num_users=1] = call_module[target=linear](args = (%add_1,), kwargs = {})
+            %relu_1 : [num_users=1] = call_method[target=relu](args = (%linear_1,), kwargs = {})
+            %sum_1 : [num_users=1] = call_function[target=torch.sum](args = (%relu_1,), kwargs = {dim: -1})
+            %topk_1 : [num_users=1] = call_function[target=torch.topk](args = (%sum_1, 3), kwargs = {})
+            return topk_1
+
+    For the semantics of operations represented in the ``Graph``, please see :class:`Node`.
+    """
+
+    @compatibility(is_backward_compatible=True)
+    def __init__(self, owning_module: Optional["GraphModule"] = None, tracer_cls: Optional[Type["Tracer"]] = None,
+                 tracer_extras: Optional[Dict[str, Any]] = None):
+        """
+        Construct an empty Graph.
+        """
+        self._root : Node = Node(self, '', 'root', '', (), {})
+        self._used_names : Dict[str, int] = {}  # base name -> number
+        self._insert = self._root.prepend
+        self._len = 0
+        self._graph_namespace = _Namespace()
+        self._owning_module = owning_module
+        self._tracer_cls = tracer_cls
+        self._tracer_extras = tracer_extras
+        self._codegen = CodeGen()
+        self._co_fields : Dict[str, Any] = {}
+
+    @property
+    def owning_module(self):
+        return self._owning_module
+
+    @owning_module.setter
+    def owning_module(self, mod: Optional["GraphModule"]):
+        self._owning_module = mod
+
+    @property
+    def nodes(self) -> _node_list:
+        """
+        Get the list of Nodes that constitute this Graph.
+
+        Note that this ``Node`` list representation is a doubly-linked list. Mutations
+        during iteration (e.g. delete a Node, add a Node) are safe.
+
+        Returns:
+
+            A doubly-linked list of Nodes. Note that ``reversed`` can be called on
+            this list to switch iteration order.
+        """
+        return _node_list(self)
+
+    @compatibility(is_backward_compatible=True)
+    def graph_copy(self, g : 'Graph', val_map : Dict[Node, Node], return_output_node=False) -> 'Optional[Argument]':
+        """
+        Copy all nodes from a given graph into ``self``.
+
+        Args:
+
+            g (Graph): The source graph from which to copy Nodes.
+
+            val_map (Dict[Node, Node]): a dictionary that will be populated with a mapping
+                from nodes in ``g`` to nodes in ``self``. Note that ``val_map`` can be passed
+                in with values in it already to override copying of certain values.
+
+        Returns:
+
+            The value in ``self`` that is now equivalent to the output value in ``g``,
+            if ``g`` had an ``output`` node. ``None`` otherwise.
+        """
+        for node in g.nodes:
+            if node in val_map:
+                continue
+            if node.op == 'output':
+                rv = map_arg(node.args[0], lambda n: val_map[n])
+                return rv if not return_output_node else (rv, node)
+            val_map[node] = self.node_copy(node, lambda n : val_map[n])
+        return None
+
+    def __deepcopy__(self, memo=None) -> 'Graph':
+        """
+        Explicitly implement __deepcopy__ to prevent excessive recursion depth
+        from the default implementation. This uses graph_copy to copy the nodes
+        in an iterative way, rather than recursively. It also populates the
+        memoization table to prevent unnecessary copies (e.g. references to
+        nodes or other parts of the Graph from a custom GraphModule implementation).
+        """
+        memo = memo if memo else {}
+        g = Graph(tracer_cls=self._tracer_cls)
+        output_vals = g.graph_copy(self, val_map=memo, return_output_node=True)
+        g._codegen = copy.deepcopy(self._codegen)
+        assert isinstance(output_vals, tuple)
+        output_val, old_output_node = output_vals
+        new_output_node = g.output(output_val, type_expr=getattr(old_output_node, 'type', None))
+        new_output_node.meta = copy.copy(old_output_node.meta)
+        return g
+
+    @compatibility(is_backward_compatible=True)
+    def create_node(self, op: str, target: 'Target',
+                    args: Optional[Tuple['Argument', ...]] = None,
+                    kwargs: Optional[Dict[str, 'Argument']] = None,
+                    name: Optional[str] = None,
+                    type_expr: Optional[Any] = None) -> Node:
+        """
+        Create a ``Node`` and add it to the ``Graph`` at the current insert-point.
+        Note that the current insert-point can be set via :meth:`Graph.inserting_before`
+        and :meth:`Graph.inserting_after`.
+
+        Args:
+            op (str): the opcode for this Node. One of 'call_function', 'call_method', 'get_attr',
+                'call_module', 'placeholder', or 'output'. The semantics of these opcodes are
+                described in the ``Graph`` docstring.
+
+            args (Optional[Tuple[Argument, ...]]): a tuple of arguments to this node.
+
+            kwargs (Optional[Dict[str, Argument]]): the kwargs of this Node
+
+            name (Optional[str]): an optional string name for the ``Node``.
+                This will influence the name of the value assigned to in the
+                Python generated code.
+
+            type_expr (Optional[Any]): an optional type annotation representing the
+                Python type the output of this node will have.
+
+        Returns:
+
+            The newly-created and inserted node.
+        """
+        assert op in ('call_function', 'call_method', 'get_attr', 'call_module', 'placeholder', 'output')
+        args = () if args is None else args
+        kwargs = {} if kwargs is None else kwargs
+        assert isinstance(args, tuple), "args must be a tuple"
+        assert isinstance(kwargs, dict), "kwargs must be a dict"
+
+        candidate = name if name is not None else self._target_to_str(target)
+        name = self._graph_namespace.create_name(candidate, None)
+        n = Node(self, name, op, target, args, kwargs, type_expr)
+
+        self._graph_namespace.associate_name_with_obj(name, n)
+
+        self._insert(n)
+        self._len += 1
+        return n
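+
+    # For example (an illustrative sketch, assuming `n` is an existing Node in this
+    # graph that produces a Tensor):
+    #
+    #     with graph.inserting_after(n):
+    #         relu_node = graph.create_node('call_function', torch.relu, args=(n,))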
+
+    @compatibility(is_backward_compatible=False)
+    def process_inputs(self, *args):
+        """
+        Processes args so that they can be passed to the FX graph.
+        """
+        return self._codegen.process_inputs(*args)
+
+    @compatibility(is_backward_compatible=False)
+    def process_outputs(self, out):
+        return self._codegen.process_outputs(out)
+
+
+    @compatibility(is_backward_compatible=True)
+    def erase_node(self, to_erase : Node) -> None:
+        """
+        Erases a ``Node`` from the ``Graph``. Throws an exception if
+        there are still users of that node in the ``Graph``.
+
+        Args:
+
+            to_erase (Node): The ``Node`` to erase from the ``Graph``.
+        """
+        if len(to_erase.users) > 0:
+            raise RuntimeError(f'Tried to erase Node {to_erase} but it still had {len(to_erase.users)} '
+                               f'users in the graph: {to_erase.users}!')
+        if to_erase.graph != self:
+            raise RuntimeError(f"Attempting to remove {to_erase} from wrong graph!")
+        if to_erase._erased:
+            warnings.warn(f"erase_node({to_erase}) on an already erased node")
+            return
+
+        to_erase._remove_from_list()
+        to_erase._erased = True  # iterators may retain handles to erased nodes
+        self._len -= 1
+
+        # Null out this Node's argument nodes so that the Nodes referred to
+        # can update their ``users`` accordingly
+        new_args = map_arg(to_erase.args, lambda n: None)
+        assert isinstance(new_args, tuple)
+        to_erase.args = new_args
+        new_kwargs = map_arg(to_erase.kwargs, lambda n: None)
+        assert isinstance(new_kwargs, dict)
+        to_erase.kwargs = new_kwargs
+
+    @compatibility(is_backward_compatible=True)
+    def inserting_before(self, n: Optional[Node] = None):
+        """Set the point at which create_node and companion methods will insert into the graph.
+        When used within a 'with' statement, this will temporarily set the insert point and
+        then restore it when the with statement exits::
+
+            with g.inserting_before(n):
+                ... # inserting before node n
+            ... # insert point restored to what it was previously
+            g.inserting_before(n) #  set the insert point permanently
+
+        Args:
+
+            n (Optional[Node]): The node before which to insert. If None this will insert before
+                the beginning of the entire graph.
+
+        Returns:
+            A resource manager that will restore the insert point on ``__exit__``.
+        """
+        if n is None:
+            return self.inserting_after(self._root)
+        assert n.graph == self, "Node to insert before is not in graph."
+        return _InsertPoint(self, n.prepend)
+
+    @compatibility(is_backward_compatible=True)
+    def inserting_after(self, n: Optional[Node] = None):
+        """Set the point at which create_node and companion methods will insert into the graph.
+        When used within a 'with' statement, this will temporarily set the insert point and
+        then restore it when the with statement exits::
+
+            with g.inserting_after(n):
+                ... # inserting after node n
+            ... # insert point restored to what it was previously
+            g.inserting_after(n) #  set the insert point permanently
+
+        Args:
+
+            n (Optional[Node]): The node after which to insert. If None this will insert after
+                the beginning of the entire graph.
+
+        Returns:
+            A resource manager that will restore the insert point on ``__exit__``.
+        """
+        if n is None:
+            return self.inserting_before(self._root)
+        assert n.graph == self, "Node to insert after is not in graph."
+        return _InsertPoint(self, n.append)
+
+    @compatibility(is_backward_compatible=True)
+    def placeholder(self, name: str, type_expr: Optional[Any] = None,
+                    default_value : Any = inspect.Signature.empty) -> Node:
+        """
+        Insert a ``placeholder`` node into the Graph. A ``placeholder`` represents
+        a function input.
+
+        Args:
+
+            name (str): A name for the input value. This corresponds to the name
+                of the positional argument to the function this ``Graph`` represents.
+
+            type_expr (Optional[Any]): an optional type annotation representing the
+                Python type the output of this node will have. This is needed in some
+                cases for proper code generation (e.g. when the function is used
+                subsequently in TorchScript compilation).
+
+            default_value (Any): The default value this function argument should take
+                on. NOTE: ``None`` is a valid default value; to specify that the
+                parameter has *no* default value, pass ``inspect.Signature.empty``
+                for this argument.
+
+        .. note::
+            The same insertion point and type expression rules apply for this method
+            as ``Graph.create_node``.
+        """
+        args = () if default_value is inspect.Signature.empty else (default_value,)
+        return self.create_node('placeholder', name, args=args, type_expr=type_expr)
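+
+    # e.g. (illustrative): graph.placeholder('x', type_expr=torch.Tensor) adds an input
+    # named `x`; since no default_value is given, the generated signature has no default
+    # for `x`.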
+
+    @compatibility(is_backward_compatible=True)
+    def get_attr(self, qualified_name: str, type_expr: Optional[Any] = None) -> Node:
+        """
+        Insert a ``get_attr`` node into the Graph. A ``get_attr`` ``Node`` represents the
+        fetch of an attribute from the ``Module`` hierarchy.
+
+        Args:
+
+            qualified_name (str): the fully-qualified name of the attribute to be retrieved.
+                For example, if the traced Module has a submodule named ``foo``, which has a
+                submodule named ``bar``, which has an attribute named ``baz``, the qualified
+                name ``foo.bar.baz`` should be passed as ``qualified_name``.
+
+            type_expr (Optional[Any]): an optional type annotation representing the
+                Python type the output of this node will have.
+
+
+        Returns:
+
+            The newly-created and inserted ``get_attr`` node.
+
+        .. note::
+            The same insertion point and type expression rules apply for this method
+            as ``Graph.create_node``.
+        """
+        def _get_attr_reference_exists(mod: torch.nn.Module, qualified_name: str) -> bool:
+            module_path, _, name = qualified_name.rpartition(".")
+
+            try:
+                submod: torch.nn.Module = mod.get_submodule(module_path)
+            except AttributeError:
+                warnings.warn(f"Failed to fetch module {module_path}!")
+                return False
+
+            if not hasattr(submod, name):
+                return False
+
+            res = getattr(submod, name)
+
+            if (not isinstance(res, torch.nn.Module)
+                    and not isinstance(res, torch.nn.Parameter)
+                    and name not in submod._buffers):
+                return False
+
+            return True
+
+        if (self.owning_module and
+                not _get_attr_reference_exists(self.owning_module, qualified_name)):
+            warnings.warn("Attempted to insert a get_attr Node with no "
+                          "underlying reference in the owning "
+                          "GraphModule! Call "
+                          "GraphModule.add_submodule to add the "
+                          "necessary submodule, "
+                          "GraphModule.add_parameter to add the "
+                          "necessary Parameter, or "
+                          "nn.Module.register_buffer to add the "
+                          "necessary buffer", stacklevel=2)
+        return self.create_node('get_attr', qualified_name, type_expr=type_expr)
+
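+    # A minimal usage sketch (assumptions: ``g`` is owned by a GraphModule whose
+    # root has a submodule ``linear`` with a ``weight`` Parameter; the names are
+    # illustrative only):
+    #
+    #     w = g.get_attr('linear.weight')   # fetches self.linear.weight at runtime
+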
+    @compatibility(is_backward_compatible=True)
+    def call_module(self,
+                    module_name: str,
+                    args: Optional[Tuple['Argument', ...]] = None,
+                    kwargs: Optional[Dict[str, 'Argument']] = None,
+                    type_expr: Optional[Any] = None) -> Node:
+        """
+        Insert a ``call_module`` ``Node`` into the ``Graph``. A ``call_module`` node
+        represents a call to the forward() function of a ``Module`` in the ``Module``
+        hierarchy.
+
+        Args:
+
+            module_name (str): The qualified name of the ``Module`` in the ``Module``
+                hierarchy to be called. For example, if the traced ``Module`` has a
+                submodule named ``foo``, which has a submodule named ``bar``, the
+                qualified name ``foo.bar`` should be passed as ``module_name`` to
+                call that module.
+
+            args (Optional[Tuple[Argument, ...]]): The positional arguments to be passed
+                to the called method. Note that this should *not* include a ``self`` argument.
+
+            kwargs (Optional[Dict[str, Argument]]): The keyword arguments to be passed
+                to the called method
+
+            type_expr (Optional[Any]): an optional type annotation representing the
+                Python type the output of this node will have.
+
+        Returns:
+
+            The newly-created and inserted ``call_module`` node.
+
+        .. note::
+            The same insertion point and type expression rules apply for this method
+            as :meth:`Graph.create_node`.
+        """
+        if (self.owning_module and
+                self.owning_module.get_submodule(module_name) is None):
+            warnings.warn("Attempted to insert a call_module Node with "
+                          "no underlying reference in the owning "
+                          "GraphModule! Call "
+                          "GraphModule.add_submodule to add the "
+                          "necessary submodule")
+        return self.create_node('call_module', module_name, args, kwargs, type_expr=type_expr)
+
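+    # A minimal usage sketch (assumptions: the owning module has a submodule
+    # ``linear`` and ``x`` is an existing Node in ``g``):
+    #
+    #     out = g.call_module('linear', args=(x,))   # calls self.linear(x)
+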
+    @compatibility(is_backward_compatible=True)
+    def call_method(self,
+                    method_name: str,
+                    args: Optional[Tuple['Argument', ...]] = None,
+                    kwargs: Optional[Dict[str, 'Argument']] = None,
+                    type_expr: Optional[Any] = None) -> Node:
+        """
+        Insert a ``call_method`` ``Node`` into the ``Graph``. A ``call_method`` node
+        represents a call to a given method on the 0th element of ``args``.
+
+        Args:
+
+            method_name (str): The name of the method to apply to the self argument.
+                For example, if args[0] is a ``Node`` representing a ``Tensor``,
+                then to call ``relu()`` on that ``Tensor``, pass ``relu`` to ``method_name``.
+
+            args (Optional[Tuple[Argument, ...]]): The positional arguments to be passed
+                to the called method. Note that this *should* include a ``self`` argument.
+
+            kwargs (Optional[Dict[str, Argument]]): The keyword arguments to be passed
+                to the called method
+
+            type_expr (Optional[Any]): an optional type annotation representing the
+                Python type the output of this node will have.
+
+        Returns:
+
+            The newly created and inserted ``call_method`` node.
+
+        .. note::
+            The same insertion point and type expression rules apply for this method
+            as :meth:`Graph.create_node`.
+        """
+        return self.create_node('call_method', method_name, args, kwargs, type_expr=type_expr)
+
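+    # A minimal usage sketch (assumption: ``x`` is an existing Node producing a
+    # Tensor). Note that ``x`` itself serves as the ``self`` argument:
+    #
+    #     y = g.call_method('relu', args=(x,))   # equivalent to x.relu()
+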
+    @compatibility(is_backward_compatible=True)
+    def call_function(self,
+                      the_function: Callable[..., Any],
+                      args: Optional[Tuple['Argument', ...]] = None,
+                      kwargs: Optional[Dict[str, 'Argument']] = None,
+                      type_expr: Optional[Any] = None) -> Node:
+        """
+        Insert a ``call_function`` ``Node`` into the ``Graph``. A ``call_function`` node
+        represents a call to a Python callable, specified by ``the_function``.
+
+        Args:
+
+            the_function (Callable[..., Any]): The function to be called. Can be any PyTorch
+                operator, Python function, or member of the ``builtins`` or ``operator``
+                namespaces.
+
+            args (Optional[Tuple[Argument, ...]]): The positional arguments to be passed
+                to the called function.
+
+            kwargs (Optional[Dict[str, Argument]]): The keyword arguments to be passed
+                to the called function
+
+            type_expr (Optional[Any]): an optional type annotation representing the
+                Python type the output of this node will have.
+
+        Returns:
+
+            The newly created and inserted ``call_function`` node.
+
+        .. note::
+            The same insertion point and type expression rules apply for this method
+            as :meth:`Graph.create_node`.
+        """
+        return self.create_node('call_function', the_function, args, kwargs, type_expr=type_expr)
+
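+    # A minimal usage sketch (assumptions: ``x`` and ``y`` are existing Nodes and
+    # the standard library ``operator`` module is in scope):
+    #
+    #     z = g.call_function(operator.add, args=(x, y))   # z = x + y
+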
+    @compatibility(is_backward_compatible=True)
+    def node_copy(self, node: Node, arg_transform: Callable[[Node], 'Argument'] = lambda x: x) -> Node:
+        """
+        Copy a node from one graph into another. ``arg_transform`` needs to transform arguments from
+        the graph of node to the graph of self. Example::
+
+            # Copying all the nodes in `g` into `new_graph`
+            g : torch.fx.Graph = ...
+            new_graph = torch.fx.Graph()
+            value_remap = {}
+            for node in g.nodes:
+                value_remap[node] = new_graph.node_copy(node, lambda n : value_remap[n])
+
+        Args:
+
+            node (Node): The node to copy into ``self``.
+
+            arg_transform (Callable[[Node], Argument]): A function that transforms
+                ``Node`` arguments in node's ``args`` and ``kwargs`` into the
+                equivalent argument in ``self``. In the simplest case, this should
+                retrieve a value out of a table mapping Nodes in the original
+                graph to ``self``.
+        """
+        args = map_arg(node.args, arg_transform)
+        kwargs = map_arg(node.kwargs, arg_transform)
+        assert isinstance(args, tuple)
+        assert isinstance(kwargs, dict)
+        result_node = self.create_node(node.op, node.target, args, kwargs, node.name, node.type)
+        result_node.meta = copy.copy(node.meta)
+        return result_node
+
+    @compatibility(is_backward_compatible=True)
+    def output(self, result: 'Argument', type_expr: Optional[Any] = None):
+        """
+        Insert an ``output`` ``Node`` into the ``Graph``. An ``output`` node represents
+        a ``return`` statement in Python code. ``result`` is the value that should
+        be returned.
+
+        Args:
+
+            result (Argument): The value to be returned.
+
+            type_expr (Optional[Any]): an optional type annotation representing the
+                Python type the output of this node will have.
+
+        .. note::
+
+            The same insertion point and type expression rules apply for this method
+            as ``Graph.create_node``.
+        """
+        return self.create_node(op='output', target='output', args=(result,), type_expr=type_expr)
+
+    def _target_to_str(self, target : Target) -> str:
+        if callable(target):
+            op = target.__name__
+        else:
+            assert isinstance(target, str)
+            op = target
+            if _is_magic(op):
+                op = op[2:-2]
+        op = _snake_case(op)
+        return op
+
+    @compatibility(is_backward_compatible=True)
+    def python_code(self, root_module: str, *, verbose: bool = False) -> PythonCode:
+        """
+        Turn this ``Graph`` into valid Python code.
+
+        Args:
+
+            root_module (str): The name of the root module on which to look-up
+                qualified name targets. This is usually 'self'.
+
+        Returns:
+
+            A PythonCode object, consisting of two fields:
+                src: the Python source code representing the object
+                globals: a dictionary of global names in `src` -> the objects that they reference.
+        """
+        # NOTE: [Graph Namespaces]
+        #
+        # There are two types of symbols in generated Python source code:
+        # locals and globals.
+        #   Locals are locally defined by the output of a node in the Graph.
+        #   Globals are references to external objects, like functions or types.
+        #
+        # When generating Python code, we need to make sure to name things
+        # appropriately. In particular:
+        # - All names should be unique, to avoid weird shadowing bugs.
+        # - These names need to be consistent, e.g. an object should always be
+        #   referenced by the same name.
+        #
+        # To do this, we create a new namespace just for this source. All names
+        # that get printed must come from this namespace.
+        #
+        # Why can't we re-use node.name? Because it was generated within the
+        # namespace `self._graph_namespace`. In order to provide uniqueness
+        # over both locals (node.name) *and* globals, we create a completely
+        # new namespace to put all identifiers in.
+        namespace = _Namespace()
+
+        # Override Node's repr to generate a valid name within our namespace.
+        # Since repr() is designed to produce a valid Python expression, it
+        # makes sense to re-use it. This way, it's easy to print something like
+        # Tuple[Node, Node] by simply calling repr() on it. Node's __repr__ is
+        # implemented cooperatively to allow this.
+        def node_repr(n: Node):
+            return namespace.create_name(n.name, n)
+
+        @contextmanager
+        def override_node_repr(graph: Graph):
+            orig_repr_fns = {}
+            for node in graph.nodes:
+                orig_repr_fns[node] = node._repr_fn
+                node._repr_fn = node_repr
+            try:
+                yield None
+            finally:
+                # restore the original repr functions
+                for node in graph.nodes:
+                    node._repr_fn = orig_repr_fns[node]
+
+        with override_node_repr(self):
+            return self._python_code(root_module, namespace, verbose=verbose)
+
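+    # A minimal usage sketch (assumption: ``g`` is an already-populated Graph):
+    #
+    #     code = g.python_code(root_module='self')
+    #     print(code.src)       # generated Python source for forward()
+    #     print(code.globals)   # external objects referenced by that source
+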
+    def _python_code(self, root_module: str, namespace: _Namespace, *, verbose: bool = False) -> PythonCode:
+        return self._codegen._gen_python_code(self.nodes, root_module, namespace, verbose=verbose)
+
+
+    def __str__(self) -> str:
+        """
+        Return a human-readable (not machine-readable) string representation
+        of this Graph
+        """
+        placeholder_names : List[str] = []
+        # This is a one-element array just so ``format_node`` can modify the closed
+        # over value
+        maybe_return_typename : List[str] = ['']
+
+        node_strs = [node.format_node(placeholder_names) for node in self.nodes]
+        param_str = ', '.join(placeholder_names)
+        s = f'graph({param_str}){maybe_return_typename[0]}:'
+        for node_str in node_strs:
+            if node_str:
+                s += '\n    ' + node_str
+        return s
+
+    @compatibility(is_backward_compatible=True)
+    def print_tabular(self):
+        """
+        Prints the intermediate representation of the graph in tabular
+        format. Note that this API requires the ``tabulate`` module to be
+        installed.
+        """
+        try:
+            from tabulate import tabulate
+        except ImportError:
+            print("`print_tabular` relies on the library `tabulate`, "
+                  "which could not be found on this machine. Run `pip "
+                  "install tabulate` to install the library.")
+            raise
+
+        node_specs = [[n.op, n.name, n.target, n.args, n.kwargs]
+                      for n in self.nodes]
+        print(tabulate(node_specs,
+              headers=['opcode', 'name', 'target', 'args', 'kwargs']))
+
+    @compatibility(is_backward_compatible=True)
+    def lint(self):
+        """
+        Runs various checks on this Graph to make sure it is well-formed. In
+        particular:
+        - Checks Nodes have correct ownership (owned by this graph)
+        - Checks Nodes appear in topological order
+        - If this Graph has an owning GraphModule, checks that targets
+        exist in that GraphModule
+        """
+
+        # Check topo order
+        def check_arg(arg : Node, n : Optional[Node] = None) -> None:
+            context_str = f' of Node \'{n}\' ' if n else ' '
+            if arg.graph is not self:
+                raise RuntimeError(f'Argument \'{arg}\'{context_str}does not belong to this Graph, '
+                                   f'but was used as an argument! If you are copying nodes from another graph, make '
+                                   f'sure to use ``arg_transform`` on node_copy() to remap values\n{self}')
+            if arg not in seen_values:
+                raise RuntimeError(f'Argument \'{arg}\'{context_str}was used before it has been '
+                                   f'defined! Please check that Nodes in the graph are topologically ordered\n{self}')
+
+        seen_names : Set[str] = set()
+        seen_values : Set[Node] = set()
+        for node in self.nodes:
+            if node.op not in ['placeholder', 'call_method', 'call_module', 'call_function', 'get_attr', 'output']:
+                raise RuntimeError(f'Node {node} had unknown opcode {node.op}!')
+            if node.graph is not self:
+                raise RuntimeError(f'Node \'{node}\' does not belong to this Graph!')
+            map_arg(node.args, lambda arg: check_arg(arg, node))
+            map_arg(node.kwargs, lambda arg: check_arg(arg, node))
+            seen_values.add(node)
+
+            if node.name in seen_names:
+                raise RuntimeError(f'Node redefined name {node.name}!')
+            seen_names.add(node.name)
+
+        # Check targets are legit
+        if self.owning_module:
+            for node in self.nodes:
+                if node.op == 'call_function':
+                    if not callable(node.target):
+                        raise ValueError(f'Node {node} target {node.target} has type {torch.typename(node.target)} but '
+                                         'a Callable is expected')
+                else:
+                    if not isinstance(node.target, str):
+                        raise ValueError(f'Node {node} target {node.target} has type {torch.typename(node.target)} but '
+                                         'a str is expected')
+                if node.op in ['get_attr', 'call_module']:
+                    target_atoms = node.target.split('.')
+                    m_itr = self.owning_module
+                    for i, atom in enumerate(target_atoms):
+                        new_m_itr = getattr(m_itr, atom, None)
+                        seen_qualname = '.'.join(target_atoms[:i])
+                        if new_m_itr is None:
+                            raise RuntimeError(f'Node {node} target {node.target} references nonexistent attribute '
+                                               f'{atom} of {seen_qualname}')
+                        if (node.op == "call_module"
+                                and not isinstance(new_m_itr, torch.nn.Module)):
+                            raise RuntimeError(f'Node {node} target {node.target} {atom} of {seen_qualname} does '
+                                               'not reference an nn.Module')
+                        elif (node.op == "get_attr"
+                              and not isinstance(new_m_itr, torch.nn.Module)
+                              and not isinstance(new_m_itr, torch.nn.Parameter)
+                              and atom not in m_itr._buffers):
+                            warnings.warn(f'Node {node} target {node.target} {atom} of {seen_qualname} does '
+                                          'not reference an nn.Module, nn.Parameter, or buffer, which is '
+                                          'what \'get_attr\' Nodes typically target')
+                        else:
+                            m_itr = new_m_itr
+
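+    # A minimal usage sketch (assumption: ``g`` is a Graph that was just edited,
+    # typically while owned by a GraphModule):
+    #
+    #     g.lint()   # raises if nodes are mis-owned, out of order, or mis-targeted
+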
+    @compatibility(is_backward_compatible=True)
+    def eliminate_dead_code(self):
+        """
+        Remove all dead code from the graph, based on each node's number of
+        users, and whether the nodes have any side effects. The graph must be
+        topologically sorted before calling.
+
+        Returns:
+          bool: Whether the graph was changed as a result of the pass.
+
+        Example:
+
+        Before dead code is eliminated, `a` from `a = x + 1` below has no users
+        and thus can be eliminated from the graph without having an effect.
+
+        .. code-block:: python
+
+            def forward(self, x):
+                a = x + 1
+                return x + self.attr_1
+
+        After dead code is eliminated, `a = x + 1` has been removed, and the rest
+        of `forward` remains.
+
+        .. code-block:: python
+
+            def forward(self, x):
+                return x + self.attr_1
+
+        .. warning::
+
+            Dead code elimination has some heuristics to avoid removing
+            side-effectful nodes (see Node.is_impure) but in general coverage
+            is very bad, so you should assume that this method is not sound
+            to call unless you know that your FX graph consists entirely
+            of functional operations.
+        """
+        # Lint the graph first to make sure it's topologically sorted; otherwise
+        # DCE below will not behave as expected.
+        self.lint()
+
+        # Reverse iterate so that when we remove a node, any nodes used as an
+        # input to that node have an updated user count that no longer reflects
+        # the removed node.
+        changed = False
+        for node in reversed(self.nodes):
+            if not node.is_impure() and len(node.users) == 0:
+                self.erase_node(node)
+                changed = True
+
+        return changed
+
+    @compatibility(is_backward_compatible=False)
+    def set_codegen(self, codegen: CodeGen):
+        self._codegen = codegen
+
+    @compatibility(is_backward_compatible=False)
+    def on_generate_code(
+        self,
+        make_transformer: Callable[[Optional[TransformCodeFunc]], TransformCodeFunc]
+    ):
+        """Register a transformer function when python code is generated
+
+        Args:
+            make_transformer (Callable[[Optional[TransformCodeFunc]], TransformCodeFunc]):
+                a function that returns a code transformer to be registered.
+                This function is called by `on_generate_code` to obtain the
+                code transformer.
+
+                This function is also given as its input the currently
+                registered code transformer (or None if nothing is registered),
+                in case it is not desirable to overwrite it. This is useful to
+                chain code transformers together.
+
+        Returns:
+            a context manager that, when used in a ``with`` statement, automatically
+            restores the previously registered code transformer.
+
+        Example:
+
+        .. code-block:: python
+
+
+            gm: fx.GraphModule = ...
+
+            # This is a code transformer we want to register. This code
+            # transformer prepends a pdb import and trace statement at the very
+            # beginning of the generated torch.fx code to allow for manual
+            # debugging with the PDB library.
+            def insert_pdb(body):
+                return ["import pdb; pdb.set_trace()\\n", *body]
+
+            # Registers `insert_pdb`, and overwrites the current registered
+            # code transformer (given by `_` to the lambda):
+            gm.graph.on_generate_code(
+                lambda _: insert_pdb
+            )
+
+            # Or alternatively, registers a code transformer which first
+            # runs `body` through existing registered transformer, then
+            # through `insert_pdb`:
+            gm.graph.on_generate_code(
+                lambda current_trans: (
+                    lambda body: insert_pdb(
+                        current_trans(body) if current_trans
+                        else body
+                    )
+                )
+            )
+
+            gm.recompile()
+            gm(*inputs)  # drops into pdb
+
+
+        This function can also be used as a context manager, with the benefit of
+        automatically restoring the previously registered code transformer:
+
+        .. code-block:: python
+
+            # ... continue from previous example
+
+            with gm.graph.on_generate_code(lambda _: insert_pdb):
+                # do more stuff with `gm`...
+                gm.recompile()
+                gm(*inputs)  # drops into pdb
+
+            # now the previous code transformer is restored (but `gm`'s code with pdb
+            # remains - that means you can run `gm` with pdb here too, until you
+            # run the next `recompile()`).
+        """
+        on_gen_code_old = self._codegen._body_transformer
+        self._codegen._body_transformer = make_transformer(on_gen_code_old)
+
+        @contextlib.contextmanager
+        def on_generate_code_context_manager():
+            try:
+                yield
+            finally:
+                self._codegen._body_transformer = on_gen_code_old
+
+        return on_generate_code_context_manager()
+
+
+reflectable_magic_methods = {
+    'add': '{} + {}',
+    'sub': '{} - {}',
+    'mul': '{} * {}',
+    'floordiv': '{} // {}',
+    'truediv': '{} / {}',
+    'div': '{} / {}',
+    'mod': '{} % {}',
+    'pow': '{} ** {}',
+    'lshift': '{} << {}',
+    'rshift': '{} >> {}',
+    'and_': '{} & {}',
+    'or_': '{} | {}',
+    'xor': '{} ^ {}',
+    'getitem': '{}[{}]',
+    'matmul': '{} @ {}',
+}
+
+magic_methods = dict({
+    'eq': '{} == {}',
+    'ne': '{} != {}',
+    'lt': '{} < {}',
+    'gt': '{} > {}',
+    'le': '{} <= {}',
+    'ge': '{} >= {}',
+    'pos': '+{}',
+    'neg': '-{}',
+    'invert': '~{}'}, **reflectable_magic_methods)
+
+inplace_methods = {
+    'iadd': '{} += {}',
+    'iand': '{} &= {}',
+    'ifloordiv': '{} //= {}',
+    'ilshift': '{} <<= {}',
+    'imod': '{} %= {}',
+    'imul': '{} *= {}',
+    'imatmul': '{} @= {}',
+    'ior': '{} |= {}',
+    'ipow': '{} **= {}',
+    'irshift': '{} >>= {}',
+    'isub': '{} -= {}',
+    'itruediv': '{} /= {}',
+    'ixor': '{} ^= {}',
+    'setitem': '{}[{}] = {}',
+}
diff --git a/MLPY/Lib/site-packages/torch/fx/graph_module.py b/MLPY/Lib/site-packages/torch/fx/graph_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c1d8357ad0212bc5261a540cf3beeb61f545e9f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/graph_module.py
@@ -0,0 +1,884 @@
+import contextlib
+import copy
+import itertools
+import linecache
+import os
+import sys
+import traceback
+import warnings
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Optional, Set, Type, Union
+
+import torch
+import torch.nn as nn
+import torch.overrides
+from torch.nn.modules.module import _addindent
+from torch.package import Importer, PackageExporter, PackageImporter, sys_importer
+
+from ._compatibility import compatibility
+from .graph import _custom_builtins, _is_from_torch, _PyTreeCodeGen, Graph, PythonCode
+
+__all__ = [
+    "reduce_graph_module",
+    "reduce_package_graph_module",
+    "reduce_deploy_graph_module",
+    "GraphModule",
+]
+
+_USER_PRESERVED_ATTRIBUTES_KEY = "_user_preserved_attributes"
+
+# Normal exec loses the source code, however we can work with
+# the linecache module to recover it.
+# Using _exec_with_source will add it to our local cache
+# and then tools like TorchScript will be able to get source info.
+class _EvalCacheLoader:
+    def __init__(self):
+        self.eval_cache = {}
+        self.next_id = 0
+
+    def cache(self, src: str, globals: Dict[str, Any], co_fields=None):
+        """Store the source in a private cache, and add a lazy entry in linecache
+        that allows the source to be retrieved by 'filename'.
+
+        Args:
+            src (str): The module source to cache
+            globals (dict): The module globals
+
+        Returns:
+            str: The cache key (and dummy filename) generated for src.
+        """
+
+        key = self._get_key()
+        if co_fields:
+            key += f" from {co_fields['co_filename']}:{co_fields['co_firstlineno']} in {co_fields['co_name']}"
+        self.eval_cache[key] = src
+
+        # Don't mutate globals so that this loader is only used
+        # to populate linecache, and doesn't interact with other modules
+        # that might check `__loader__`
+        globals_copy = globals.copy()
+        globals_copy["__file__"] = key
+        globals_copy["__name__"] = key
+        globals_copy["__loader__"] = self
+        linecache.lazycache(key, globals_copy)
+
+        return key
+
+    # Part of the loader protocol (PEP 302)
+    # linecache will use this method when trying to find source code
+    def get_source(self, module_name) -> Optional[str]:
+        if module_name in self.eval_cache:
+            return self.eval_cache[module_name]
+        return None
+
+    def _get_key(self):
+        key = f".{self.next_id}"
+        self.next_id += 1
+        return key
+
+
+_loader = _EvalCacheLoader()
+
+
+def _exec_with_source(src: str, globals: Dict[str, Any], co_fields=None):
+    key = _loader.cache(src, globals, co_fields)
+    exec(compile(src, key, "exec"), globals)
+
+
+def _forward_from_src(src: str, globals: Dict[str, Any], co_fields=None):
+    return _method_from_src(
+        method_name="forward", src=src, globals=globals, co_fields=co_fields
+    )
+
+
+def _method_from_src(
+    method_name: str, src: str, globals: Dict[str, Any], co_fields=None
+) -> Callable:
+    # avoid mutating the passed in dict
+    globals_copy = globals.copy()
+    _exec_with_source(src, globals_copy, co_fields)
+    fn = globals_copy[method_name]
+    del globals_copy[method_name]
+    return fn
+
+
+def _format_import_statement(name: str, obj: Any, importer: Importer) -> str:
+    if name in _custom_builtins:
+        return _custom_builtins[name].import_str
+    if _is_from_torch(name):
+        return "import torch"
+    module_name, attr_name = importer.get_name(obj)
+    return f"from {module_name} import {attr_name} as {name}"
+
+
+def _format_import_block(globals: Dict[str, Any], importer: Importer):
+    import_strs: Set[str] = set()
+    for name, obj in globals.items():
+        import_strs.add(_format_import_statement(name, obj, importer))
+    # Sort the imports so we have a stable import block that allows us to
+    # hash the graph module and get a consistent key for use in a cache.
+    return "\n".join(sorted(import_strs))
+
+
+@compatibility(is_backward_compatible=True)
+def reduce_graph_module(body: Dict[Any, Any], import_block: str) -> torch.nn.Module:
+    # BC: attribute name was changed from `code` to `_code` to facilitate
+    # making `code` into a property and adding a docstring to it
+    fn_src = body.get("_code") or body["code"]
+    forward = _forward_from_src(import_block + fn_src, {})
+    return _deserialize_graph_module(forward, body)
+
+
+@compatibility(is_backward_compatible=True)
+def reduce_package_graph_module(
+    importer: PackageImporter, body: Dict[Any, Any], generated_module_name: str
+) -> torch.nn.Module:
+    forward = importer.import_module(generated_module_name).forward
+    return _deserialize_graph_module(forward, body)
+
+
+@compatibility(is_backward_compatible=True)
+def reduce_deploy_graph_module(
+    importer: PackageImporter, body: Dict[Any, Any], import_block: str
+) -> torch.nn.Module:
+    ns = {}
+    ns["__builtins__"] = importer.patched_builtins
+    fn_src = body.get("_code")
+    assert fn_src is not None
+    forward = _forward_from_src(import_block + fn_src, ns)
+    return _deserialize_graph_module(forward, body)
+
+
+# We create a dummy class here because symbolic_trace pulls the forward()
+# function off of the class, rather than the instance. This class is used
+# in _deserialize_graph_module() below.
+class _CodeOnlyModule(torch.nn.Module):
+    def __init__(self, body):
+        super().__init__()
+        self.__dict__ = body
+
+
+def _deserialize_graph_module(forward, body: Dict[Any, Any], graph_module_cls=None) -> torch.nn.Module:
+    """
+    Deserialize a GraphModule given the dictionary of the original module,
+    using the code to reconstruct the graph. We delete the actual graph before
+    saving the dictionary so that changes to the in-memory graph format do not
+    get serialized.
+    """
+
+    # Try to retrieve the forward source in a backward-compatible way
+    _CodeOnlyModule.forward = forward
+
+    tracer_cls = body.get("_tracer_cls")
+    if tracer_cls is None:
+        from ._symbolic_trace import Tracer
+
+        tracer_cls = Tracer
+
+    graphmodule_cls_name = body.get("_graphmodule_cls_name", "GraphModule")
+
+    # This is a workaround for a mypy linter issue related to
+    # passing base class as an argument - https://github.com/python/mypy/issues/5865.
+    cls_tracer: Any = tracer_cls
+
+    class KeepModules(cls_tracer):
+        # we shouldn't trace into any of the submodules,
+        # because they were not traced in the original GraphModule
+        def is_leaf_module(self, _: torch.nn.Module, __: str) -> bool:
+            return True
+
+    com = _CodeOnlyModule(body)
+
+    tracer_extras = body.get("_tracer_extras", {})
+    graph = KeepModules().trace(com, **tracer_extras)
+
+    # Manually set Tracer class on the reconstructed Graph, to avoid
+    # referencing the private local subclass KeepModules.
+    graph._tracer_cls = tracer_cls
+    from ._lazy_graph_module import _make_graph_module
+    gm = _make_graph_module(com, graph, class_name=graphmodule_cls_name, graph_module_cls=graph_module_cls)
+
+    # The GraphModule constructor only retains attributes referenced by the graph.
+    # In this case, our goal is to return a GraphModule as close to identical
+    # as possible to the one put into the package. If any additional attributes
+    # were present in body, we should keep them.
+    for k, v in body.items():
+        if not hasattr(gm, k):
+            setattr(gm, k, v)
+    return gm
+
+
+# copy an attribute value with qualified name 'target' from 'from_module' to 'to_module'
+# This installs empty Modules where none exist yet if they are subpaths of target
+def _copy_attr(from_module: torch.nn.Module, to_module: torch.nn.Module, target: str):
+    *prefix, field = target.split(".")
+    for item in prefix:
+        f = getattr(from_module, item)
+        t = getattr(to_module, item, None)
+        if f is t:
+            # we have already installed one of its parents
+            # (e.g. target = root.linear.weight, but we have already installed root.linear)
+            # once we install a parent, we no longer need to copy the children
+            # since all the needed properties will already be present
+            return
+
+        if t is None:
+            t = torch.nn.Module()
+            setattr(to_module, item, t)
+        from_module, to_module = f, t
+
+    orig = getattr(from_module, field)
+    # If it is a tensor and not a parameter attribute of a module, it should be a named buffer.
+    # So, we register it as a named buffer in the target module.
+    if isinstance(orig, torch.Tensor) and not isinstance(orig, torch.nn.Parameter):
+        to_module.register_buffer(field, orig)
+    else:
+        setattr(to_module, field, orig)
+
+
+# Assign attribute 'from_obj' to the qualified name 'target' on 'to_module'
+# This installs empty Modules where none exist yet if they are subpaths of target
+def _assign_attr(from_obj: Any, to_module: torch.nn.Module, target: str):
+    *prefix, field = target.split(".")
+    for item in prefix:
+        t = getattr(to_module, item, None)
+
+        if t is None:
+            t = torch.nn.Module()
+            setattr(to_module, item, t)
+        to_module = t
+
+    # If it is a tensor and not a parameter attribute of a module, it should be a named buffer.
+    # So, we register it as a named buffer in the target module.
+    if isinstance(from_obj, torch.Tensor) and not isinstance(
+        from_obj, torch.nn.Parameter
+    ):
+        to_module.register_buffer(field, from_obj)
+    else:
+        setattr(to_module, field, from_obj)
+
+
+class _WrappedCall:
+    def __init__(self, cls, cls_call):
+        self.cls = cls
+        self.cls_call = cls_call
+
+    # Previously, if an error occurred when valid
+    # symbolically-traced code was run with an invalid input, the
+    # user would see the source of the error as coming from
+    # `File "`, where N is some number. We use
+    # this function to generate a more informative error message. We
+    # return the traceback itself, a message explaining that the
+    # error occurred in a traced Module's generated forward
+    # function, and five lines of context surrounding the faulty
+    # line
+    @staticmethod
+    def _generate_error_message(frame_summary: traceback.FrameSummary) -> str:
+        # auxiliary variables (for readability)
+        err_lineno = frame_summary.lineno
+        assert err_lineno is not None
+        line = frame_summary.line
+        assert line is not None
+        err_line_len = len(line)
+        all_src_lines = linecache.getlines(frame_summary.filename)
+
+        # constituent substrings of the error message
+        tb_repr = traceback.format_exc()
+        custom_msg = (
+            "Call using an FX-traced Module, "
+            f"line {err_lineno} of the traced Module's "
+            "generated forward function:"
+        )
+        before_err = "".join(all_src_lines[err_lineno - 2 : err_lineno])
+        marker = "~" * err_line_len + "~~~ <--- HERE"
+        err_and_after_err = "\n".join(all_src_lines[err_lineno : err_lineno + 2])
+
+        # joined message
+        return "\n".join([tb_repr, custom_msg, before_err, marker, err_and_after_err])
+
+    def __call__(self, obj, *args, **kwargs):
+        try:
+            if self.cls_call is not None:
+                return self.cls_call(obj, *args, **kwargs)
+            else:
+                return super(self.cls, obj).__call__(*args, **kwargs)  # type: ignore[misc]
+        except Exception as e:
+            assert e.__traceback__
+            topmost_framesummary: traceback.FrameSummary = (
+                traceback.StackSummary.extract(traceback.walk_tb(e.__traceback__))[-1]
+            )  # type: ignore[arg-type]
+            if "eval_with_key" in topmost_framesummary.filename:
+                print(
+                    _WrappedCall._generate_error_message(topmost_framesummary),
+                    file=sys.stderr,
+                )
+                raise e.with_traceback(None)  # noqa: TRY200
+            else:
+                raise e
+
+@compatibility(is_backward_compatible=True)
+class GraphModule(torch.nn.Module):
+    """
+    GraphModule is an nn.Module generated from an fx.Graph. GraphModule has a
+    ``graph`` attribute, as well as ``code`` and ``forward`` attributes generated
+    from that ``graph``.
+
+    .. warning::
+
+        When ``graph`` is reassigned, ``code`` and ``forward`` will be automatically
+        regenerated. However, if you edit the contents of the ``graph`` without reassigning
+        the ``graph`` attribute itself, you must call ``recompile()`` to update the generated
+        code.
+    """
+
+    def __new__(cls: "Type[GraphModule]", *args, **kwargs):
+        # each instance of a graph module needs its own forward method
+        # so create a new singleton class for each instance.
+        # it is a subclass of the user-defined class, the only difference
+        # is an extra layer to install the forward method
+
+        # address issue described at https://github.com/pytorch/pytorch/issues/63883
+        # in other words, traverse class hierarchy to fix the redundant class definition problem
+        for t in cls.__mro__:
+            c = t.__qualname__.split(".")[-1]
+            if c != "GraphModuleImpl":
+                cls = t
+                break
+
+        class GraphModuleImpl(cls):  # type: ignore[misc, valid-type]
+            pass
+
+        return super().__new__(GraphModuleImpl)
+
+    @compatibility(is_backward_compatible=True)
+    def __init__(
+        self,
+        root: Union[torch.nn.Module, Dict[str, Any]],
+        graph: Graph,
+        class_name: str = "GraphModule",
+    ):
+        """
+        Construct a GraphModule.
+
+        Args:
+
+            root (Union[torch.nn.Module, Dict[str, Any]]):
+                ``root`` can either be an nn.Module instance or a Dict mapping strings to any attribute type.
+                In the case that ``root`` is a Module, any references to Module-based objects (via qualified
+                name) in the Graph's Nodes' ``target`` field will be copied over from the respective place
+                within ``root``'s Module hierarchy into the GraphModule's module hierarchy.
+                In the case that ``root`` is a dict, the qualified name found in a Node's ``target`` will be
+                looked up directly in the dict's keys. The object mapped to by the Dict will be copied
+                over into the appropriate place within the GraphModule's module hierarchy.
+
+            graph (Graph): ``graph`` contains the nodes this GraphModule should use for code generation
+
+            class_name (str): ``class_name`` denotes the name of this GraphModule for debugging purposes. If it's unset, all
+                error messages will report as originating from ``GraphModule``. It may be helpful to set this
+                to ``root``'s original name or a name that makes sense within the context of your transform.
+        """
+        super().__init__()
+        self.__class__.__name__ = class_name
+        if isinstance(root, torch.nn.Module):
+            if hasattr(root, "training"):
+                self.training = root.training
+
+            # When we pickle/unpickle graph module, we don't want to drop any module or attributes.
+            if isinstance(root, _CodeOnlyModule):
+                for k, _ in root.named_children():
+                    _copy_attr(root, self, k)
+
+                for k, _ in root.named_buffers():
+                    _copy_attr(root, self, k)
+
+                for k, _ in root.named_parameters():
+                    _copy_attr(root, self, k)
+
+            for node in graph.nodes:
+                if node.op in ["get_attr", "call_module"]:
+                    assert isinstance(node.target, str)
+                    _copy_attr(root, self, node.target)
+        elif isinstance(root, dict):
+            targets_to_copy = []
+            for node in graph.nodes:
+                if node.op in ["get_attr", "call_module"]:
+                    assert isinstance(node.target, str)
+                    if node.target not in root:
+                        raise RuntimeError(
+                            "Node "
+                            + str(node)
+                            + " referenced target "
+                            + node.target
+                            + " but that target was not provided in ``root``!"
+                        )
+                    targets_to_copy.append(node.target)
+            # Sort targets in ascending order of the # of atoms.
+            # This will ensure that less deeply nested attributes are assigned
+            # before more deeply nested attributes. For example, foo.bar
+            # will be assigned before foo.bar.baz. Otherwise, we might assign
+            # the user-provided ``foo.bar`` and wipe out the previously-assigned
+            # ``foo.bar.baz``
+            targets_to_copy.sort(key=lambda t: t.count("."))
+            for target_to_copy in targets_to_copy:
+                _assign_attr(root[target_to_copy], self, target_to_copy)
+        else:
+            raise RuntimeError("Unsupported type " + str(root) + " passed for root!")
+
+        self.graph = graph
+
+        # Store the Tracer class responsible for creating a Graph separately as part of the
+        # GraphModule state, except when the Tracer is defined in a local namespace.
+        # Locally defined Tracers are not pickleable. This is needed because torch.package will
+        # serialize a GraphModule without retaining the Graph, and needs to use the correct Tracer
+        # to re-create the Graph during deserialization.
+        self._tracer_cls = None
+        if (
+            self.graph._tracer_cls
+            and "" not in self.graph._tracer_cls.__qualname__
+        ):
+            self._tracer_cls = self.graph._tracer_cls
+
+        self._tracer_extras = {}
+        if self.graph._tracer_extras:
+            self._tracer_extras = self.graph._tracer_extras
+
+        # Dictionary to store metadata
+        self.meta: Dict[str, Any] = {}
+        self._replace_hook = None
+
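+    # A minimal construction sketch (assumptions: ``MyModule`` is a hypothetical
+    # nn.Module and the graph comes from ``torch.fx.symbolic_trace``):
+    #
+    #     traced = torch.fx.symbolic_trace(MyModule())   # already a GraphModule
+    #     gm = GraphModule(MyModule(), traced.graph)     # rebuild from root + graph
+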
+    # TorchScript breaks trying to compile the graph setter because of the
+    # continued string literal. Issue here: https://github.com/pytorch/pytorch/issues/44842
+    #
+    # Shouldn't be an issue since these methods shouldn't be used in TorchScript anyway
+    __jit_unused_properties__ = ["graph"]
+
+    @property
+    def graph(self) -> Graph:
+        """
+        Return the ``Graph`` underlying this ``GraphModule``
+        """
+        return self._graph
+
+    @graph.setter
+    def graph(self, g: Graph) -> None:
+        """
+        Set the underlying ``Graph`` for this ``GraphModule``. This will internally
+        recompile the ``GraphModule`` so that the generated ``forward()`` function
+        corresponds to ``g``
+        """
+        assert isinstance(g, Graph), f"Expected a Graph instance, but got {type(g)}"
+        self._graph = g
+        g.owning_module = self
+        self.recompile()
+
+    @compatibility(is_backward_compatible=False)
+    def to_folder(self, folder: Union[str, os.PathLike], module_name: str = "FxModule"):
+        """Dumps out module to ``folder`` with ``module_name`` so that it can be
+        imported with ``from <folder> import <module_name>``
+
+        Args:
+
+            folder (Union[str, os.PathLike]): The folder to write the code out to
+
+            module_name (str): Top-level name to use for the ``Module`` while
+                writing out the code
+        """
+        folder = Path(folder)
+        Path(folder).mkdir(exist_ok=True)
+        torch.save(self.state_dict(), folder / "state_dict.pt")
+        tab = " " * 4
+        custom_builtins = "\n".join([v.import_str for v in _custom_builtins.values()])
+        model_str = f"""
+import torch
+{custom_builtins}
+
+from torch.nn import *
+class {module_name}(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+"""
+
+        def _gen_model_repr(module_name: str, module: torch.nn.Module) -> Optional[str]:
+            safe_reprs = [
+                nn.Linear,
+                nn.Conv1d,
+                nn.Conv2d,
+                nn.Conv3d,
+                nn.BatchNorm1d,
+                nn.BatchNorm2d,
+                nn.BatchNorm3d,
+            ]
+            if type(module) in safe_reprs:
+                return f"{module.__repr__()}"
+            else:
+                return None
+
+        blobified_modules = []
+        for module_name, module in self.named_children():
+            module_str = _gen_model_repr(module_name, module)
+            if module_str is None:
+                module_file = folder / f"{module_name}.pt"
+                torch.save(module, module_file)
+                blobified_modules.append(module_name)
+                module_repr = module.__repr__().replace("\r", " ").replace("\n", " ")
+                module_str = f"torch.load(r'{module_file}') # {module_repr}"
+            model_str += f"{tab*2}self.{module_name} = {module_str}\n"
+
+        for buffer_name, buffer in self._buffers.items():
+            if buffer is None:
+                continue
+            model_str += f"{tab*2}self.register_buffer('{buffer_name}', torch.empty({list(buffer.shape)}, dtype={buffer.dtype}))\n"
+
+        for param_name, param in self._parameters.items():
+            if param is None:
+                continue
+            model_str += f"{tab*2}self.{param_name} = torch.nn.Parameter(torch.empty({list(param.shape)}, dtype={param.dtype}))\n"
+
+        model_str += (
+            f"{tab*2}self.load_state_dict(torch.load(r'{folder}/state_dict.pt'))\n"
+        )
+        model_str += f"{_addindent(self.code, 4)}\n"
+
+        module_file = folder / "module.py"
+        module_file.write_text(model_str)
+
+        init_file = folder / "__init__.py"
+        init_file.write_text("from .module import *")
+
+        if len(blobified_modules) > 0:
+            warnings.warn(
+                "Was not able to save the following children modules as reprs -"
+                f"saved as pickled files instead: {blobified_modules}"
+            )
+
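+    # A minimal usage sketch (assumptions: ``gm`` is a GraphModule and the folder
+    # and module names are illustrative):
+    #
+    #     gm.to_folder('exported_model', module_name='ExportedModel')
+    #     # afterwards: from exported_model import ExportedModel
+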
+    @compatibility(is_backward_compatible=True)
+    def add_submodule(self, target: str, m: torch.nn.Module) -> bool:
+        """
+        Adds the given submodule to ``self``.
+
+        This installs empty Modules where none exist yet if they are
+        subpaths of ``target``.
+
+        Args:
+            target: The fully-qualified string name of the new submodule
+                (See example in ``nn.Module.get_submodule`` for how to
+                specify a fully-qualified string.)
+            m: The submodule itself; the actual object we want to
+                install in the current Module
+
+        Return:
+            bool: Whether or not the submodule could be inserted. For
+                this method to return True, each object in the chain
+                denoted by ``target`` must either a) not exist yet,
+                or b) reference an ``nn.Module`` (not a parameter or
+                other attribute)
+        """
+        *prefix, field = target.split(".")
+        mod: torch.nn.Module = self
+
+        for item in prefix:
+
+            submod = getattr(mod, item, None)
+
+            if submod is None:
+                submod = torch.nn.Module()
+                setattr(mod, item, submod)
+
+            if not isinstance(submod, torch.nn.Module):
+                return False
+
+            mod = submod
+
+        mod.add_module(field, m)
+        return True
+
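+    # A minimal usage sketch (assumptions: ``gm`` is a GraphModule and the target
+    # path ``backbone.head`` is illustrative):
+    #
+    #     ok = gm.add_submodule('backbone.head', torch.nn.Linear(16, 4))
+    #     # ``ok`` is False if some intermediate attribute is not an nn.Module
+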
+    @compatibility(is_backward_compatible=True)
+    def delete_submodule(self, target: str) -> bool:
+        """
+        Deletes the given submodule from ``self``.
+
+        The module will not be deleted if ``target`` is not a valid
+        target.
+
+        Args:
+            target: The fully-qualified string name of the new submodule
+                (See example in ``nn.Module.get_submodule`` for how to
+                specify a fully-qualified string.)
+
+        Returns:
+            bool: Whether or not the target string referenced a
+                submodule we want to delete. A return value of ``False``
+                means that the ``target`` was not a valid reference to
+                a submodule.
+        """
+        atoms = target.split(".")
+        path, target_submod = atoms[:-1], atoms[-1]
+        mod: torch.nn.Module = self
+
+        # Get the parent module
+        for item in path:
+
+            if not hasattr(mod, item):
+                return False
+
+            mod = getattr(mod, item)
+
+            if not isinstance(mod, torch.nn.Module):
+                return False
+
+        if not hasattr(mod, target_submod):
+            return False
+
+        if not isinstance(getattr(mod, target_submod), torch.nn.Module):
+            return False
+
+        delattr(mod, target_submod)
+        return True
+
+    @compatibility(is_backward_compatible=True)
+    def delete_all_unused_submodules(self) -> None:
+        """
+        Deletes all unused submodules from ``self``.
+
+        A Module is considered "used" if any one of the following is
+        true:
+        1. It has children that are used
+        2. Its forward is called directly via a ``call_module`` node
+        3. It has a non-Module attribute that is used from a
+        ``get_attr`` node
+
+        This method can be called to clean up an ``nn.Module`` without
+        manually calling ``delete_submodule`` on each unused submodule.
+        """
+        used: List[str] = []
+
+        for node in self.graph.nodes:
+
+            if node.op == "call_module" or node.op == "get_attr":
+
+                # A list of strings representing the different parts
+                # of the path. For example, `foo.bar.baz` gives us
+                # ["foo", "bar", "baz"]
+                fullpath = node.target.split(".")
+
+                # If we're looking at multiple parts of a path, join
+                # them with a dot. Otherwise, return that single
+                # element without doing anything to it.
+                def join_fn(x: str, y: str) -> str:
+                    return ".".join([x, y] if y else [x])
+
+                # Progressively collect all the names of intermediate
+                # modules. For example, if we have the target
+                # `foo.bar.baz`, we'll add `foo`, `foo.bar`, and
+                # `foo.bar.baz` to the list.
+                used.extend(itertools.accumulate(fullpath, join_fn))
+
+                # For a `call_module` node, also register all recursive submodules
+                # as used
+                if node.op == "call_module":
+                    try:
+                        submod = self.get_submodule(node.target)
+
+                        for submod_name, _ in submod.named_modules():
+                            if submod_name != "":
+                                used.append(".".join([node.target, submod_name]))
+                    except AttributeError:
+                        # Node referenced nonexistent submodule, don't need to
+                        # worry about GCing anything
+                        pass
+
+        to_delete = [name for name, _ in self.named_modules() if name not in used]
+
+        for name in to_delete:
+            self.delete_submodule(name)
+
+    @property
+    def code(self) -> str:
+        """
+        Return the Python code generated from the ``Graph`` underlying this
+        ``GraphModule``.
+        """
+        if not hasattr(self, "_code"):
+            raise RuntimeError(
+                "Code has not been generated! Please report a bug to PyTorch"
+            )
+        return self._code
+
+    @compatibility(is_backward_compatible=True)
+    def recompile(self) -> PythonCode:
+        """
+        Recompile this GraphModule from its ``graph`` attribute. This should be
+        called after editing the contained ``graph``, otherwise the generated
+        code of this ``GraphModule`` will be out of date.
+        """
+        if isinstance(self._graph._codegen, _PyTreeCodeGen):
+            self._in_spec = self._graph._codegen.pytree_info.in_spec
+            self._out_spec = self._graph._codegen.pytree_info.out_spec
+        python_code = self._graph.python_code(root_module="self")
+        self._code = python_code.src
+        self._lineno_map = python_code._lineno_map
+
+        cls = type(self)
+        co_fields = self._graph._co_fields if hasattr(self._graph, "_co_fields") else {}
+        cls.forward = _forward_from_src(self._code, python_code.globals, co_fields)
+
+        # Determine whether this class explicitly defines a __call__ implementation
+        # to wrap. If it does, save it in order to have wrapped_call invoke it.
+        # If it does not, wrapped_call can use a dynamic call to super() instead.
+        # In most cases, super().__call__ should be torch.nn.Module.__call__.
+        # We do not want to hold a reference to Module.__call__ here; doing so will
+        # bypass patching of torch.nn.Module.__call__ done while symbolic tracing.
+        cls_call = cls.__call__ if "__call__" in vars(cls) else None
+
+        if "_wrapped_call" not in vars(cls):
+            cls._wrapped_call = _WrappedCall(cls, cls_call)  # type: ignore[attr-defined]
+
+        def call_wrapped(self, *args, **kwargs):
+            return self._wrapped_call(self, *args, **kwargs)
+
+        cls.__call__ = call_wrapped  # type: ignore[method-assign]
+
+        return python_code
+
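+    # A minimal usage sketch (assumption: ``gm`` is a GraphModule whose graph was
+    # edited in place, e.g. by erasing or rewiring nodes):
+    #
+    #     gm.graph.lint()
+    #     gm.recompile()   # regenerate gm.code and gm.forward from gm.graph
+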
+    # Passing Tracer as argument allows subclasses extending fx.GraphModule
+    # define their own Tracer (extending fx.Tracer).
+    def __reduce_deploy__(self, importer: Importer):
+        dict_without_graph = self.__dict__.copy()
+        dict_without_graph["_graphmodule_cls_name"] = self.__class__.__name__
+        del dict_without_graph["_graph"]
+
+        python_code = self.recompile()
+        import_block = _format_import_block(python_code.globals, importer)
+        return (reduce_deploy_graph_module, (dict_without_graph, import_block))
+
+    def __reduce_package__(self, exporter: PackageExporter):
+        dict_without_graph = self.__dict__.copy()
+        dict_without_graph["_graphmodule_cls_name"] = self.__class__.__name__
+        del dict_without_graph["_graph"]
+
+        generated_module_name = f"fx-generated._{exporter.get_unique_id()}"
+        python_code = self.recompile()
+        import_block = _format_import_block(python_code.globals, exporter.importer)
+        module_code = import_block + self.code
+        exporter.save_source_string(generated_module_name, module_code)
+        return (
+            reduce_package_graph_module,
+            (dict_without_graph, generated_module_name),
+        )
+
+    def __reduce__(self):
+        """
+        Serialization of GraphModule. We serialize only the generated code, not
+        the underlying ``Graph``. This is because ``Graph`` does not have on-disk
+        backward-compatibility guarantees, whereas Python source code does.
+        On the deserialization side, we symbolically trace through the generated
+        code to regenerate the underlying ``Graph``
+        """
+        dict_without_graph = self.__dict__.copy()
+
+        python_code = self.recompile()
+        import_block = _format_import_block(python_code.globals, sys_importer)
+        del dict_without_graph["_graph"]
+        return (reduce_graph_module, (dict_without_graph, import_block))
+
+    def _deepcopy_init(self):
+        return GraphModule.__init__
+
+    # because __reduce__ is defined for serialization,
+    # we need to define deepcopy otherwise it will call __reduce__
+    # and cause symbolic tracing to occur every time we try to copy the object
+    def __deepcopy__(self, memo):
+        res = type(self).__new__(type(self))
+        memo[id(self)] = res
+        fake_mod = _CodeOnlyModule(copy.deepcopy(self.__dict__, memo))
+        self._deepcopy_init()(res, fake_mod, fake_mod.__dict__["_graph"])
+        # hooks are lost during `GraphModule.__init__`, so we need to copy them
+        # over explicitly. Right now we only copy state_dict-related hooks to
+        # reduce BC-related issues; forward/backward-related hooks can be
+        # copied in the future as well if needed.
+        extra_preserved_attrs = [
+            "_state_dict_hooks",
+            "_load_state_dict_pre_hooks",
+            "_load_state_dict_post_hooks",
+            "_replace_hook",
+        ]
+        for attr in extra_preserved_attrs:
+            if attr in self.__dict__:
+                setattr(res, attr, copy.deepcopy(self.__dict__[attr], memo))
+        res.meta = copy.deepcopy(getattr(self, "meta", {}), memo)
+        if _USER_PRESERVED_ATTRIBUTES_KEY in res.meta:
+            for attr_name, attr in res.meta[_USER_PRESERVED_ATTRIBUTES_KEY].items():
+                setattr(res, attr_name, attr)
+        return res
+
+    def __copy__(self):
+        from ._lazy_graph_module import _make_graph_module
+        res = _make_graph_module(self, self.graph)
+        res.meta = getattr(self, "meta", {})
+        return res
+
+    @compatibility(is_backward_compatible=False)
+    def print_readable(self, print_output=True):
+        """
+        Return the Python code generated for the current GraphModule and its child GraphModules.
+        """
+        verbose_python_code = self._graph.python_code(root_module="self", verbose=True)
+        module_code = verbose_python_code.src
+        module_code = module_code.lstrip("\n")
+        module_code = f"class {self._get_name()}(torch.nn.Module):\n" + module_code
+        module_code = _addindent(module_code, 4)
+
+        submodule_code_list = [""]
+        for submodule in self.children():
+            if isinstance(submodule, GraphModule):
+                submodule_code_list.append(submodule.print_readable(print_output=False))
+        submodule_code = "\n".join(submodule_code_list)
+        submodule_code = _addindent(submodule_code, 4)
+
+        output = module_code + submodule_code
+        if print_output:
+            print(module_code + submodule_code)
+        return output
+
+    def __str__(self) -> str:
+        orig_str = super().__str__()
+        print_readable_reminder = (
+            "# To see more debug info, please use `graph_module.print_readable()`"
+        )
+        return "\n".join([orig_str, self._code, print_readable_reminder])
+
+    def _replicate_for_data_parallel(self):
+        new_gm = self.__copy__()
+        new_gm._is_replica = True
+        return new_gm
+
+    @contextlib.contextmanager
+    def _set_replace_hook(self, f):
+        """
+        Takes a callable that will be invoked every time a node is replaced
+        with a new node or a node's name changes. The callable receives three
+        arguments: the old node being changed, the name of the new node, and
+        the user node that consumes the old node being replaced.
+        """
+        assert callable(f), "Replace hook must be a callable."
+        prev, self._replace_hook = self._replace_hook, f
+        try:
+            yield
+        finally:
+            self._replace_hook = prev
+
+
+# workarounds for issues in __torch_function__
+
+# WAR for __torch_function__ not handling tensor lists,
+# fix is in https://github.com/pytorch/pytorch/pull/34725
+# orig_cat = torch.cat
+# def patched_cat(*args, **kwargs):
+#     tensors = args[0]
+#     for t in tensors:
+#         if isinstance(t, Proxy):
+#             return t.__torch_function__(patched_cat, (), args, kwargs)
+#     return orig_cat(*args, **kwargs)
+# patched_cat.__module__ = 'torch'
+# patched_cat.__name__ = 'cat'
+# torch.cat = patched_cat
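+
+
+# Illustrative sketch (not part of the upstream torch.fx API): it exercises the
+# serialization and copying paths defined above. Pickling goes through
+# ``__reduce__`` (only the generated code is serialized, and the ``Graph`` is
+# regenerated on load), while ``copy.deepcopy`` goes through ``__deepcopy__``
+# and avoids re-tracing the user code. The helper name
+# ``_example_graph_module_roundtrip`` is hypothetical.
+def _example_graph_module_roundtrip():
+    import copy
+    import pickle
+
+    import torch
+    import torch.fx
+
+    class M(torch.nn.Module):
+        def forward(self, x):
+            return torch.relu(x) + 1
+
+    gm = torch.fx.symbolic_trace(M())
+    # Round-trip through pickle: only the generated Python code is stored.
+    gm_pickled = pickle.loads(pickle.dumps(gm))
+    # Deep copy: re-initializes from a code-only module instead of re-tracing.
+    gm_copied = copy.deepcopy(gm)
+    x = torch.randn(2, 3)
+    assert torch.equal(gm(x), gm_pickled(x))
+    assert torch.equal(gm(x), gm_copied(x))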
diff --git a/MLPY/Lib/site-packages/torch/fx/immutable_collections.py b/MLPY/Lib/site-packages/torch/fx/immutable_collections.py
new file mode 100644
index 0000000000000000000000000000000000000000..4fb87bf1bc80901568e75efe13b7d119709b49f8
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/immutable_collections.py
@@ -0,0 +1,112 @@
+from typing import Any, Dict, Iterable, List, Tuple
+
+from torch.utils._pytree import (
+    _dict_flatten,
+    _dict_flatten_with_keys,
+    _dict_unflatten,
+    _list_flatten,
+    _list_flatten_with_keys,
+    _list_unflatten,
+    Context,
+    register_pytree_node,
+)
+
+from ._compatibility import compatibility
+
+
+__all__ = ["immutable_list", "immutable_dict"]
+
+_help_mutation = """\
+If you are attempting to modify the kwargs or args of a torch.fx.Node object,
+instead create a new copy of it and assign the copy to the node:
+    new_args = ... # copy and mutate args
+    node.args = new_args
+"""
+
+
+def _no_mutation(self, *args, **kwargs):
+    raise NotImplementedError(
+        f"'{type(self).__name__}' object does not support mutation. {_help_mutation}",
+    )
+
+
+def _create_immutable_container(base, mutable_functions):
+    container = type("immutable_" + base.__name__, (base,), {})
+    for attr in mutable_functions:
+        setattr(container, attr, _no_mutation)
+    return container
+
+
+immutable_list = _create_immutable_container(
+    list,
+    [
+        "__delitem__",
+        "__iadd__",
+        "__imul__",
+        "__setitem__",
+        "append",
+        "clear",
+        "extend",
+        "insert",
+        "pop",
+        "remove",
+    ],
+)
+immutable_list.__reduce__ = lambda self: (immutable_list, (tuple(iter(self)),))
+immutable_list.__hash__ = lambda self: hash(tuple(self))
+
+compatibility(is_backward_compatible=True)(immutable_list)
+
+immutable_dict = _create_immutable_container(
+    dict,
+    [
+        "__delitem__",
+        "__setitem__",
+        "clear",
+        "pop",
+        "popitem",
+        "update",
+    ],
+)
+immutable_dict.__reduce__ = lambda self: (immutable_dict, (iter(self.items()),))
+immutable_dict.__hash__ = lambda self: hash(tuple(self.items()))
+compatibility(is_backward_compatible=True)(immutable_dict)
+
+
+# Register immutable collections for PyTree operations
+def _immutable_dict_flatten(d: Dict[Any, Any]) -> Tuple[List[Any], Context]:
+    return _dict_flatten(d)
+
+
+def _immutable_dict_unflatten(
+    values: Iterable[Any],
+    context: Context,
+) -> Dict[Any, Any]:
+    return immutable_dict(_dict_unflatten(values, context))
+
+
+def _immutable_list_flatten(d: List[Any]) -> Tuple[List[Any], Context]:
+    return _list_flatten(d)
+
+
+def _immutable_list_unflatten(
+    values: Iterable[Any],
+    context: Context,
+) -> List[Any]:
+    return immutable_list(_list_unflatten(values, context))
+
+
+register_pytree_node(
+    immutable_dict,
+    _immutable_dict_flatten,
+    _immutable_dict_unflatten,
+    serialized_type_name="torch.fx.immutable_collections.immutable_dict",
+    flatten_with_keys_fn=_dict_flatten_with_keys,
+)
+register_pytree_node(
+    immutable_list,
+    _immutable_list_flatten,
+    _immutable_list_unflatten,
+    serialized_type_name="torch.fx.immutable_collections.immutable_list",
+    flatten_with_keys_fn=_list_flatten_with_keys,
+)
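+
+
+# Illustrative sketch (not part of the upstream torch.fx API): demonstrates the
+# behavior wired up above -- mutating the immutable containers raises, and the
+# pytree registration round-trips flatten/unflatten back to the immutable
+# types. The helper name ``_example_immutable_containers`` is hypothetical.
+def _example_immutable_containers():
+    import torch.utils._pytree as pytree
+
+    il = immutable_list([1, 2, 3])
+    try:
+        il.append(4)
+    except NotImplementedError:
+        pass  # mutation is rejected, with a hint to rebuild node.args instead
+
+    d = immutable_dict({"a": 1, "b": 2})
+    leaves, spec = pytree.tree_flatten(d)
+    rebuilt = pytree.tree_unflatten(leaves, spec)
+    assert type(rebuilt) is immutable_dict
+    assert rebuilt == {"a": 1, "b": 2}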
diff --git a/MLPY/Lib/site-packages/torch/fx/interpreter.py b/MLPY/Lib/site-packages/torch/fx/interpreter.py
new file mode 100644
index 0000000000000000000000000000000000000000..267c394acf406c65699a317e15d1c9914c77bdfd
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/interpreter.py
@@ -0,0 +1,512 @@
+from .graph_module import GraphModule
+from ._lazy_graph_module import _make_graph_module
+from .graph import Graph
+from .node import Argument, Node, Target, map_arg, map_aggregate
+from .proxy import Proxy
+from ._symbolic_trace import Tracer
+from ._compatibility import compatibility
+from . import config
+import torch.fx.traceback as fx_traceback
+import torch
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+import inspect
+from contextlib import contextmanager
+from torch.hub import tqdm
+
+__all__ = ['Interpreter', 'Transformer']
+
+@compatibility(is_backward_compatible=True)
+class Interpreter:
+    """
+    An Interpreter executes an FX graph Node-by-Node. This pattern
+    can be useful for many things, including writing code
+    transformations as well as analysis passes.
+
+    Methods in the Interpreter class can be overridden to customize
+    the behavior of execution. The map of overrideable methods
+    in terms of call hierarchy::
+
+        run()
+            +-- run_node
+                +-- placeholder()
+                +-- get_attr()
+                +-- call_function()
+                +-- call_method()
+                +-- call_module()
+                +-- output()
+
+    Example:
+
+        Suppose we want to swap all instances of ``torch.neg`` with
+        ``torch.sigmoid`` and vice versa (including their ``Tensor``
+        method equivalents). We could subclass Interpreter like so::
+
+            class NegSigmSwapInterpreter(Interpreter):
+                def call_function(self, target : Target,
+                                  args : Tuple, kwargs : Dict) -> Any:
+                    if target == torch.sigmoid:
+                        return torch.neg(*args, **kwargs)
+                    return super().call_function(target, args, kwargs)
+
+                def call_method(self, target : Target,
+                                args : Tuple, kwargs : Dict) -> Any:
+                    if target == 'neg':
+                        call_self, *args_tail = args
+                        return call_self.sigmoid(*args_tail, **kwargs)
+                    return super().call_method(target, args, kwargs)
+
+            def fn(x):
+                return torch.sigmoid(x).neg()
+
+            gm = torch.fx.symbolic_trace(fn)
+            input = torch.randn(3, 4)
+            result = NegSigmSwapInterpreter(gm).run(input)
+            torch.testing.assert_close(result, torch.neg(input).sigmoid())
+
+    Args:
+        module (torch.nn.Module): The module to be executed
+        garbage_collect_values (bool): Whether to delete values after their last
+            use within the Module's execution. This ensures optimal memory usage during
+            execution. This can be disabled to, for example, examine all of the intermediate
+            values in the execution by looking at the ``Interpreter.env`` attribute.
+        graph (Optional[Graph]): If passed, the interpreter will execute this
+            graph instead of `module.graph`, using the provided `module`
+            argument to satisfy any requests for state.
+    """
+    @compatibility(is_backward_compatible=True)
+    def __init__(self, module: torch.nn.Module, garbage_collect_values: bool = True, graph: Optional[Graph] = None):
+        self.module = module
+        self.submodules = dict(self.module.named_modules())
+        if graph is not None:
+            self.graph = graph
+        else:
+            self.graph = self.module.graph
+        self.env : Dict[Node, Any] = {}
+        self.name = "Interpreter"
+        self.garbage_collect_values = garbage_collect_values
+        self.extra_traceback = True
+
+        if self.garbage_collect_values:
+            # Run through reverse nodes and record the first instance of a use
+            # of a given node. This represents the *last* use of the node in the
+            # execution order of the program, which we will use to free unused
+            # values
+            node_to_last_use : Dict[Node, Node] = {}
+            self.user_to_last_uses : Dict[Node, List[Node]] = {}
+
+            def register_last_uses(n : Node, user : Node):
+                if n not in node_to_last_use:
+                    node_to_last_use[n] = user
+                    self.user_to_last_uses.setdefault(user, []).append(n)
+
+            for node in reversed(self.graph.nodes):
+                map_arg(node.args, lambda n: register_last_uses(n, node))
+                map_arg(node.kwargs, lambda n: register_last_uses(n, node))
+
+    @compatibility(is_backward_compatible=True)
+    def run(self, *args, initial_env : Optional[Dict[Node, Any]] = None, enable_io_processing : bool = True) -> Any:
+        """
+        Run `module` via interpretation and return the result.
+
+        Args:
+            *args: The arguments to the Module to run, in positional order
+            initial_env (Optional[Dict[Node, Any]]): An optional starting environment for execution.
+                This is a dict mapping `Node` to any value. This can be used, for example, to
+                pre-populate results for certain `Nodes` so as to do only partial evaluation within
+                the interpreter.
+            enable_io_processing (bool): If true, we process the inputs and outputs with graph's process_inputs and
+                process_outputs function first before using them.
+
+        Returns:
+            Any: The value returned from executing the Module
+        """
+        self.env = initial_env if initial_env is not None else {}
+
+        # Positional function args are consumed left-to-right by
+        # `placeholder` nodes. Use an iterator to keep track of
+        # position and extract those values.
+        if enable_io_processing:
+            args = self.graph.process_inputs(*args)
+        self.args_iter : Iterator[Any] = iter(args)
+        pbar = tqdm(total=len(self.graph.nodes),
+                    desc=f"{self.name}: {str(list(self.graph.nodes)) if config.verbose_progress else ''}",
+                    initial=0, position=0, leave=True, disable=config.disable_progress, delay=0)
+
+        for node in self.graph.nodes:
+            pbar.update(1)
+            if node in self.env:
+                # Short circuit if we have this value. This could
+                # be used, for example, for partial evaluation
+                # where the caller has pre-populated `env` with
+                # values for a subset of the program.
+                continue
+
+            try:
+                self.env[node] = self.run_node(node)
+            except Exception as e:
+                if self.extra_traceback:
+                    msg = f"While executing {node.format_node()}"
+                    msg = f'{e.args[0]}\n\n{msg}' if e.args else str(msg)
+                    msg += f"\nOriginal traceback:\n{node.stack_trace}"
+                    e.args = (msg,) + e.args[1:]
+                    if isinstance(e, KeyError):
+                        raise RuntimeError(*e.args) from e
+                raise
+
+            if self.garbage_collect_values:
+                for to_delete in self.user_to_last_uses.get(node, []):
+                    del self.env[to_delete]
+
+            if node.op == 'output':
+                output_val = self.env[node]
+                return self.graph.process_outputs(output_val) if enable_io_processing else output_val
+
+    @compatibility(is_backward_compatible=True)
+    def boxed_run(self, args_list):
+        """
+        Run `module` via interpretation and return the result.  This uses the "boxed"
+        calling convention, where you pass a list of arguments, which will be cleared
+        by the interpreter.  This ensures that input tensors are promptly deallocated.
+        """
+        args_iter = iter(args_list)
+        env = {}
+        for n in self.graph.nodes:
+            if n.op == "placeholder":
+                env[n] = next(args_iter)
+        args_list.clear()
+        return self.run(initial_env=env)
+
+    @contextmanager
+    def _set_current_node(self, node):
+        with fx_traceback.set_current_meta(node):
+            yield
+
+    @compatibility(is_backward_compatible=True)
+    def run_node(self, n : Node) -> Any:
+        """
+        Run a specific node ``n`` and return the result.
+        Calls into placeholder, get_attr, call_function,
+        call_method, call_module, or output depending
+        on ``node.op``
+
+        Args:
+            n (Node): The Node to execute
+
+        Returns:
+            Any: The result of executing ``n``
+        """
+        with self._set_current_node(n):
+            args, kwargs = self.fetch_args_kwargs_from_env(n)
+            assert isinstance(args, tuple)
+            assert isinstance(kwargs, dict)
+            return getattr(self, n.op)(n.target, args, kwargs)
+
+    # Main Node running APIs
+    @compatibility(is_backward_compatible=True)
+    def placeholder(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
+        """
+        Execute a ``placeholder`` node. Note that this is stateful:
+        ``Interpreter`` maintains an internal iterator over
+        arguments passed to ``run`` and this method returns
+        next() on that iterator.
+
+        Args:
+            target (Target): The call target for this node. See
+                the ``Node`` documentation for
+                details on semantics
+            args (Tuple): Tuple of positional args for this invocation
+            kwargs (Dict): Dict of keyword arguments for this invocation
+
+        Returns:
+            Any: The argument value that was retrieved.
+        """
+        assert isinstance(target, str)
+        if target.startswith('*'):
+            # For a starred parameter e.g. `*args`, retrieve all
+            # remaining values from the args list.
+            return list(self.args_iter)
+        else:
+            try:
+                return next(self.args_iter)
+            except StopIteration as si:
+                if len(args) > 0:
+                    return args[0]
+                else:
+                    raise RuntimeError(f'Expected positional argument for parameter {target}, but one was not passed in!') from si
+
+    @compatibility(is_backward_compatible=True)
+    def get_attr(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
+        """
+        Execute a ``get_attr`` node. Will retrieve an attribute
+        value from the ``Module`` hierarchy of ``self.module``.
+
+        Args:
+            target (Target): The call target for this node. See
+                the ``Node`` documentation for
+                details on semantics
+            args (Tuple): Tuple of positional args for this invocation
+            kwargs (Dict): Dict of keyword arguments for this invocation
+
+        Return:
+            Any: The value of the attribute that was retrieved
+        """
+        assert isinstance(target, str)
+        return self.fetch_attr(target)
+
+    @compatibility(is_backward_compatible=True)
+    def call_function(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
+        """
+        Execute a ``call_function`` node and return the result.
+
+        Args:
+            target (Target): The call target for this node. See
+                the ``Node`` documentation for
+                details on semantics
+            args (Tuple): Tuple of positional args for this invocation
+            kwargs (Dict): Dict of keyword arguments for this invocation
+
+        Return:
+            Any: The value returned by the function invocation
+        """
+        assert not isinstance(target, str)
+
+        # Execute the function and return the result
+        return target(*args, **kwargs)
+
+    @compatibility(is_backward_compatible=True)
+    def call_method(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
+        """
+        Execute a ``call_method`` node and return the result.
+
+        Args:
+            target (Target): The call target for this node. See
+                the ``Node`` documentation for
+                details on semantics
+            args (Tuple): Tuple of positional args for this invocation
+            kwargs (Dict): Dict of keyword arguments for this invocation
+
+        Return:
+            Any: The value returned by the method invocation
+        """
+        # args[0] is the `self` object for this method call
+        self_obj, *args_tail = args
+
+        # Execute the method and return the result
+        assert isinstance(target, str)
+        return getattr(self_obj, target)(*args_tail, **kwargs)
+
+    @compatibility(is_backward_compatible=True)
+    def call_module(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
+        """
+        Execute a ``call_module`` node and return the result.
+
+        Args:
+            target (Target): The call target for this node. See
+                the ``Node`` documentation for
+                details on semantics
+            args (Tuple): Tuple of positional args for this invocation
+            kwargs (Dict): Dict of keyword arguments for this invocation
+
+        Return:
+            Any: The value returned by the module invocation
+        """
+        # Retrieve executed args and kwargs values from the environment
+
+        # Execute the method and return the result
+        assert isinstance(target, str)
+        submod = self.fetch_attr(target)
+
+        return submod(*args, **kwargs)
+
+    @compatibility(is_backward_compatible=True)
+    def output(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
+        """
+        Execute an ``output`` node. This really just retrieves
+        the value referenced by the ``output`` node and returns it.
+
+        Args:
+            target (Target): The call target for this node. See
+                the ``Node`` documentation for
+                details on semantics
+            args (Tuple): Tuple of positional args for this invocation
+            kwargs (Dict): Dict of keyword arguments for this invocation
+
+        Return:
+            Any: The return value referenced by the output node
+        """
+        return args[0]
+
+    # Helper methods
+    @compatibility(is_backward_compatible=True)
+    def fetch_attr(self, target : str):
+        """
+        Fetch an attribute from the ``Module`` hierarchy of ``self.module``.
+
+        Args:
+            target (str): The fully-qualified name of the attribute to fetch
+
+        Return:
+            Any: The value of the attribute.
+        """
+        target_atoms = target.split('.')
+        attr_itr = self.module
+        for i, atom in enumerate(target_atoms):
+            if not hasattr(attr_itr, atom):
+                raise RuntimeError(f"Node referenced nonexistent target {'.'.join(target_atoms[:i])}")
+            attr_itr = getattr(attr_itr, atom)
+        return attr_itr
+
+    @compatibility(is_backward_compatible=True)
+    def fetch_args_kwargs_from_env(self, n : Node) -> Tuple[Tuple, Dict]:
+        """
+        Fetch the concrete values of ``args`` and ``kwargs`` of node ``n``
+        from the current execution environment.
+
+        Args:
+            n (Node): The node for which ``args`` and ``kwargs`` should be fetched.
+
+        Return:
+            Tuple[Tuple, Dict]: ``args`` and ``kwargs`` with concrete values for ``n``.
+        """
+        args = self.map_nodes_to_values(n.args, n)
+        assert isinstance(args, tuple)
+        kwargs = self.map_nodes_to_values(n.kwargs, n)
+        assert isinstance(kwargs, dict)
+        return args, kwargs
+
+    @compatibility(is_backward_compatible=True)
+    def map_nodes_to_values(self, args : Argument, n : Node) -> Argument:
+        """
+        Recursively descend through ``args`` and look up the concrete value
+        for each ``Node`` in the current execution environment.
+
+        Args:
+            args (Argument): Data structure within which to look up concrete values
+
+            n (Node): Node to which ``args`` belongs. This is only used for error reporting.
+        """
+        def load_arg(n_arg : Node) -> Any:
+            if n_arg not in self.env:
+                raise RuntimeError(f'Node {n} referenced nonexistent value {n_arg}! Run Graph.lint() '
+                                   f'to diagnose such issues')
+            return self.env[n_arg]
+        return map_arg(args, load_arg)
+
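+
+
+# Illustrative sketch (not part of the upstream torch.fx API): shows the two
+# entry points documented above -- ``run`` with ordinary positional arguments,
+# and ``boxed_run``, whose boxed calling convention takes ownership of the
+# argument list and clears it so inputs can be freed promptly. The helper name
+# ``_example_interpreter_usage`` is hypothetical.
+def _example_interpreter_usage():
+    import torch
+    import torch.fx
+
+    def f(x):
+        return torch.relu(x) + 1
+
+    gm = torch.fx.symbolic_trace(f)
+    interp = Interpreter(gm)
+
+    x = torch.randn(4)
+    assert torch.equal(interp.run(x), f(x))
+
+    # boxed_run empties the list it is given once placeholders are bound.
+    boxed_args = [x]
+    out = interp.boxed_run(boxed_args)
+    assert boxed_args == []
+    assert torch.equal(out, f(x))
+
+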
+@compatibility(is_backward_compatible=True)
+class Transformer(Interpreter):
+    """
+    ``Transformer`` is a special type of interpreter that produces a
+    new ``Module``. It exposes a ``transform()`` method that returns
+    the transformed ``Module``. ``Transformer`` does not require
+    arguments to run, as ``Interpreter`` does. ``Transformer`` works
+    entirely symbolically.
+
+    Example:
+
+        Suppose we want to swap all instances of ``torch.neg`` with
+        ``torch.sigmoid`` and vice versa (including their ``Tensor``
+        method equivalents). We could subclass ``Transformer`` like so::
+
+            class NegSigmSwapXformer(Transformer):
+                def call_function(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
+                    if target == torch.sigmoid:
+                        return torch.neg(*args, **kwargs)
+                    return super().call_function(target, args, kwargs)
+
+                def call_method(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
+                    if target == 'neg':
+                        call_self, *args_tail = args
+                        return call_self.sigmoid(*args_tail, **kwargs)
+                    return super().call_method(target, args, kwargs)
+
+            def fn(x):
+                return torch.sigmoid(x).neg()
+
+            gm = torch.fx.symbolic_trace(fn)
+
+            transformed : torch.nn.Module = NegSigmSwapXformer(gm).transform()
+            input = torch.randn(3, 4)
+            torch.testing.assert_close(transformed(input), torch.neg(input).sigmoid())
+
+    Args:
+        module (GraphModule): The ``Module`` to be transformed.
+    """
+
+    @compatibility(is_backward_compatible=True)
+    def __init__(self, module):
+        super().__init__(module)
+        self.new_graph = Graph()
+        self.new_graph.set_codegen(module.graph._codegen)
+
+        class TransformerTracer(Tracer):
+            def __init__(self, graph: Graph):
+                super().__init__()
+                self.graph = graph
+                self.tensor_attrs: Dict[torch.Tensor, str] = {}  # type: ignore[assignment]
+
+            def is_leaf_module(self, _, __) -> bool:
+                return True
+
+        self.tracer = TransformerTracer(self.new_graph)
+        self.tracer.root = module
+
+    @compatibility(is_backward_compatible=True)
+    def placeholder(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Proxy:
+        """
+        Execute a ``placeholder`` node. In ``Transformer``, this is
+        overridden to insert a new ``placeholder`` into the output
+        graph.
+
+        Args:
+            target (Target): The call target for this node. See
+                the ``Node`` documentation for
+                details on semantics
+            args (Tuple): Tuple of positional args for this invocation
+            kwargs (Dict): Dict of keyword arguments for this invocation
+        """
+        assert isinstance(target, str)
+        default_value = next(iter(args)) if args else inspect.Signature.empty
+        return Proxy(self.new_graph.placeholder(target, default_value=default_value), self.tracer)
+
+    @compatibility(is_backward_compatible=True)
+    def get_attr(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Proxy:
+        """
+        Execute a ``get_attr`` node. In ``Transformer``, this is
+        overridden to insert a new ``get_attr`` node into the output
+        graph.
+
+        Args:
+            target (Target): The call target for this node. See
+                the ``Node`` documentation for
+                details on semantics
+            args (Tuple): Tuple of positional args for this invocation
+            kwargs (Dict): Dict of keyword arguments for this invocation
+        """
+        assert isinstance(target, str)
+        return self.tracer.create_proxy("get_attr", target, args, kwargs)
+
+    @compatibility(is_backward_compatible=True)
+    def call_module(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
+        # Override so that the leaf module policy from `self.tracer` is respected.
+        assert isinstance(target, str)
+        submod = self.fetch_attr(target)
+        return self.tracer.call_module(submod, submod.forward, args, kwargs)
+
+    @compatibility(is_backward_compatible=True)
+    def call_function(self, target : 'Target', args : Tuple[Argument, ...], kwargs : Dict[str, Any]) -> Any:
+        # Override so that functions that were wrapped are still wrapped.
+        return self.tracer.create_proxy('call_function', target, args, kwargs)
+
+    @compatibility(is_backward_compatible=True)
+    def transform(self) -> GraphModule:
+        """
+        Transform ``self.module`` and return the transformed
+        ``GraphModule``.
+        """
+        with fx_traceback.preserve_node_meta():
+            result = super().run(enable_io_processing=False)
+        if result is not None:
+            def strip_proxy(a : Union[Argument, Proxy]) -> Any:
+                return a.node if isinstance(a, Proxy) else a
+            self.new_graph.output(map_aggregate(result, strip_proxy))
+        return _make_graph_module(self.module, self.new_graph)
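+
+
+# Illustrative sketch (not part of the upstream torch.fx API): with no
+# overridden handlers, ``Transformer`` re-emits every node symbolically and
+# ``transform()`` returns a new, behaviorally equivalent ``GraphModule``. The
+# helper name ``_example_identity_transform`` is hypothetical.
+def _example_identity_transform():
+    import torch
+    import torch.fx
+
+    def f(x):
+        return torch.sigmoid(x).neg()
+
+    gm = torch.fx.symbolic_trace(f)
+    new_gm = Transformer(gm).transform()
+    x = torch.randn(3, 4)
+    torch.testing.assert_close(new_gm(x), gm(x))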
diff --git a/MLPY/Lib/site-packages/torch/fx/node.py b/MLPY/Lib/site-packages/torch/fx/node.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7130b87fd03f051d531f57f0d2b276af61bd234
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/node.py
@@ -0,0 +1,726 @@
+# mypy: ignore-errors
+
+# Nodes represent a definition of a value in our graph of operators.
+from typing import TYPE_CHECKING, Union, Callable, Any, Tuple, List, Optional, Dict, Set
+from ._compatibility import compatibility
+from .immutable_collections import immutable_dict, immutable_list
+import torch
+import builtins
+import types
+import inspect
+import warnings
+from torch.fx.operator_schemas import normalize_function, normalize_module, ArgsKwargsPair
+from .._ops import ops as _ops
+
+if TYPE_CHECKING:
+    from .graph import Graph
+
+__all__ = ['Node', 'map_arg', 'map_aggregate', "has_side_effect"]
+
+BaseArgumentTypes = Union[str, int, float, bool, complex, torch.dtype,
+                          torch.Tensor, torch.device, torch.memory_format, torch.layout, torch._ops.OpOverload]
+base_types = BaseArgumentTypes.__args__  # type: ignore[attr-defined]
+
+Target = Union[Callable[..., Any], str]
+
+Argument = Optional[Union[
+    Tuple[Any, ...],  # actually Argument, but mypy can't represent recursive types
+    List[Any],  # actually Argument
+    Dict[str, Any],  # actually Argument
+    slice,  # Slice[Argument, Argument, Argument], but slice is not a templated type in typing
+    range,
+    'Node',
+    BaseArgumentTypes
+]]
+
+_side_effectful_need_to_be_preserved_pre_dispatch: Set[Callable] = {
+    torch._C._set_grad_enabled,
+    torch.amp._enter_autocast,
+    torch.amp._exit_autocast,
+}
+
+# TODO: Either refactor this into 2 functions 1 dce for functional graphs and 1 dce for all graphs,
+# or add logic to correctly mark all inplace ops as side effectful.
+_side_effectful_functions: Set[Callable] = {
+    torch._assert,
+    torch._assert_async,
+    _ops.aten._assert_async.msg,
+    _ops.aten._assert_scalar.default,
+    _ops.aten.copy_.default,
+    _ops.aten.index_put_.default,
+    _ops.aten.sym_constrain_range.default,
+    _ops.aten.sym_constrain_range_for_size.default,
+    _ops.profiler._record_function_enter,
+    _ops.profiler._record_function_enter_new,
+    _ops.profiler._record_function_exit,
+    _ops.inductor.accumulate_grad_.default,
+    _ops.inductor.resize_storage_bytes_.default,
+} | _side_effectful_need_to_be_preserved_pre_dispatch
+
+
+@compatibility(is_backward_compatible=False)
+def has_side_effect(fn: Callable) -> None:
+    _side_effectful_functions.add(fn)
+    return fn
+
+
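+# Illustrative sketch (not part of the upstream torch.fx API): registering a
+# callable via ``has_side_effect`` marks ``call_function`` nodes targeting it
+# as impure, so dead-code elimination keeps those calls even when their result
+# is unused. ``_log_value`` and the helper name are hypothetical.
+def _example_has_side_effect():
+    def _log_value(x):
+        print("fx saw value:", x)
+
+    has_side_effect(_log_value)
+    assert _log_value in _side_effectful_functions
+
+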
+# this is fixed on master, WAR for 1.5
+def _find_module_of_method(orig_method: Callable[..., Any]) -> str:
+    name = orig_method.__name__
+    module = orig_method.__module__
+    if module is not None:
+        return module
+    for guess in [torch, torch.nn.functional]:
+        if getattr(guess, name, None) is orig_method:
+            return guess.__name__
+    raise RuntimeError(f'cannot find module for {orig_method}')
+
+# Borrowed from CPython typing module
+# https://github.com/python/cpython/blob/f90dc36c15d7fee0efaf6d39e97be0bdf2683e93/Lib/typing.py#L156
+def _type_repr(obj):
+    """Return the repr() of an object, special-casing types (internal helper).
+    If obj is a type, we return a shorter version than the default
+    type.__repr__, based on the module and qualified name, which is
+    typically enough to uniquely identify a type.  For everything
+    else, we fall back on repr(obj).
+    """
+    if isinstance(obj, type):
+        if obj.__module__ == 'builtins':
+            return obj.__qualname__
+        return f'{obj.__module__}.{obj.__qualname__}'
+    if obj is ...:
+        return '...'
+    if isinstance(obj, types.FunctionType):
+        return obj.__name__
+    return repr(obj)
+
+def _get_qualified_name(func: Callable[..., Any]) -> str:
+    # things like getattr just appear in builtins
+    if getattr(builtins, func.__name__, None) is func:
+        return func.__name__
+    # torch.Tensor.{fn}
+    if (isinstance(func, (types.MethodDescriptorType, types.WrapperDescriptorType))
+       and func is getattr(torch.Tensor, func.__name__, None)):
+        return f"torch.Tensor.{func.__name__}"
+    name = func.__name__
+    if name == "<lambda>":
+        # For lambdas, try to get their defining name in the module
+        try:
+            name = inspect.getsource(func).split("=")[0].strip()
+        except Exception as e:
+            raise RuntimeError("Unable to represent lambda") from e
+    module = _find_module_of_method(func)
+    module = module.replace('torch._ops', 'torch.ops')  # WAR for bug in how torch.ops assigns module
+    # Fixup segment_reduce mismatch
+    if module == "torch" and name == "segment_reduce":
+        name = "_" + name
+    return f'{module}.{name}'
+
+def _format_arg(arg, max_list_len=float('inf')) -> str:
+    if hasattr(arg, '_custom_fx_repr_fn'):
+        return arg._custom_fx_repr_fn()
+    elif isinstance(arg, list):
+        items = ', '.join(_format_arg(a) for idx, a in enumerate(arg) if idx < max_list_len)
+        maybe_len = '' if len(arg) < max_list_len + 1 else f', ...[total_len={len(arg)}]'
+        return f'[{items}{maybe_len}]'
+    elif isinstance(arg, tuple):
+        items = ', '.join(_format_arg(a) for idx, a in enumerate(arg) if idx < max_list_len)
+        maybe_len = '' if len(arg) < max_list_len + 1 else f', ...[total_len={len(arg)}]'
+        maybe_comma = ',' if len(arg) == 1 else ''
+        return f'({items}{maybe_comma}{maybe_len})'
+    elif isinstance(arg, dict):
+        items_str = ', '.join(f'{k}: {_format_arg(v)}' for k, v in arg.items())
+        return f'{{{items_str}}}'
+
+    if isinstance(arg, Node):
+        return '%' + str(arg)
+    else:
+        return str(arg)
+
+@compatibility(is_backward_compatible=True)
+class Node:
+    """
+    ``Node`` is the data structure that represents individual operations within
+    a ``Graph``. For the most part, Nodes represent callsites to various entities,
+    such as operators, methods, and Modules (some exceptions include nodes that
+    specify function inputs and outputs). Each ``Node`` has a function specified
+    by its ``op`` property. The ``Node`` semantics for each value of ``op`` are as follows:
+
+    - ``placeholder`` represents a function input. The ``name`` attribute specifies the name this value will take on.
+      ``target`` is similarly the name of the argument. ``args`` holds either: 1) nothing, or 2) a single argument
+      denoting the default parameter of the function input. ``kwargs`` is don't-care. Placeholders correspond to
+      the function parameters (e.g. ``x``) in the graph printout.
+    - ``get_attr`` retrieves a parameter from the module hierarchy. ``name`` is similarly the name the result of the
+      fetch is assigned to. ``target`` is the fully-qualified name of the parameter's position in the module hierarchy.
+      ``args`` and ``kwargs`` are don't-care
+    - ``call_function`` applies a free function to some values. ``name`` is similarly the name of the value to assign
+      to. ``target`` is the function to be applied. ``args`` and ``kwargs`` represent the arguments to the function,
+      following the Python calling convention
+    - ``call_module`` applies a module in the module hierarchy's ``forward()`` method to given arguments. ``name`` is
+      as previous. ``target`` is the fully-qualified name of the module in the module hierarchy to call.
+      ``args`` and ``kwargs`` represent the arguments to invoke the module on, *excluding the self argument*.
+    - ``call_method`` calls a method on a value. ``name`` is as previous. ``target`` is the string name of the method
+      to apply to the ``self`` argument. ``args`` and ``kwargs`` represent the arguments to invoke the method on,
+      *including the self argument*
+    - ``output`` contains the output of the traced function in its ``args[0]`` attribute. This corresponds to the "return" statement
+      in the Graph printout.
+    """
+
+    @compatibility(is_backward_compatible=True)
+    def __init__(self, graph: 'Graph', name: str, op: str, target: 'Target',
+                 args: Tuple['Argument', ...], kwargs: Dict[str, 'Argument'],
+                 return_type : Optional[Any] = None) -> None:
+        """
+        Instantiate an instance of ``Node``. Note: most often, you want to use the
+        Graph APIs, i.e. ``Graph.call_module``, ``Graph.call_method``, etc. rather
+        than instantiating a ``Node`` directly.
+
+        Args:
+            graph (Graph): The ``Graph`` to which this ``Node`` should belong.
+
+            name (str): The name to which the output of this ``Node`` should be assigned
+
+            op (str): The opcode for this ``Node``. Can be one of 'placeholder',
+                'call_method', 'call_module', 'call_function', 'get_attr',
+                'output'
+
+            target ('Target'): The target this op should call. See the broader
+                ``Node`` docstring for more details.
+
+            args (Tuple['Argument']): The args to be passed to ``target``
+
+            kwargs (Dict[str, 'Argument']): The kwargs to be passed to ``target``
+
+            return_type (Optional[Any]): The python type expression representing the
+                type of the output of this node. This field can be used for
+                annotation of values in the generated code or for other types
+                of analyses.
+        """
+        self.graph = graph
+        self.name = name  # unique name of value being created
+        assert op in ['placeholder', 'call_method', 'call_module', 'call_function', 'get_attr', 'output', 'root']
+        self.op = op  # the kind of operation = placeholder|call_method|call_module|call_function|get_attr
+        if op == 'call_function':
+            if not callable(target):
+                raise ValueError(f'Node [graph = {graph}, name = \'{name}\'] target {target} has type {torch.typename(target)} '
+                                 'but a Callable is expected')
+        else:
+            if not isinstance(target, str):
+                raise ValueError(f'Node [graph = {graph}, name = \'{name}\'] target {target} has type {torch.typename(target)} '
+                                 'but a str is expected')
+        self.target = target  # for method/module/function, the name of the method/module/function/attr
+        # being invoked, e.g add, layer1, or torch.add
+
+        # All `Node`-valued inputs. Key is the Node, value is don't-care.
+        # The public API for this is `all_input_nodes`, this private attribute
+        # should not be accessed directly.
+        self._input_nodes : Dict[Node, None] = {}
+        self.__update_args_kwargs(map_arg(args, lambda x: x), map_arg(kwargs, lambda x: x))  # type: ignore[arg-type]
+
+        # All of the nodes that use the value produced by this Node
+        # Note one user may correspond to several uses, e.g. the node for ``x + x``
+        # would appear once here, but represents two uses.
+        #
+        # Is a dict to act as an "ordered set". Keys are significant, values are don't-care
+        self.users : Dict[Node, None] = {}
+        # Type expression representing the output value of this node.
+        # This should contain the same class of Type objects that would appear
+        # as type annotations for function inputs/outputs.
+        #
+        # For placeholder nodes, this value will be used to type-annotate the
+        # generated function parameters.
+        # For the return node, this value will be used to type-annotate the
+        # generated function return type. (Note this is a special case. ``return``
+        # does not produce a value, it's more of a notation. Thus, this value
+        # describes the type of args[0] in the ``return`` node.)
+        self.type : Optional[Any] = return_type
+        self._prev = self
+        self._next = self
+        self._erased = False
+
+        # If set, use this fn to print this node
+        self._repr_fn : Optional[Callable[[Node], str]] = None
+
+        # Dictionary to store metadata passes need to do their
+        # transformations. This metadata is preserved across node copies
+        self.meta : Dict[str, Any] = {}
+
+    @property
+    def next(self) -> 'Node':
+        """
+        Returns the next ``Node`` in the linked list of Nodes.
+
+        Returns:
+
+            The next ``Node`` in the linked list of Nodes.
+        """
+        return self._next
+
+    @property
+    def prev(self) -> 'Node':
+        """
+        Returns the previous ``Node`` in the linked list of Nodes.
+
+        Returns:
+
+            The previous ``Node`` in the linked list of Nodes.
+        """
+        return self._prev
+
+    @compatibility(is_backward_compatible=True)
+    def prepend(self, x: 'Node') -> None:
+        """
+        Insert x before this node in the list of nodes in the graph. Example::
+
+            Before: p -> self
+                    bx -> x -> ax
+            After:  p -> x -> self
+                    bx -> ax
+
+        Args:
+            x (Node): The node to put before this node. Must be a member of the same graph.
+        """
+        assert self.graph == x.graph, "Attempting to move a Node into a different Graph"
+        if self == x:
+            warnings.warn("Trying to prepend a node to itself. This behavior has no effect on the graph.")
+            return
+        x._remove_from_list()
+        p = self._prev
+        p._next, x._prev = x, p
+        x._next, self._prev = self, x
+
+    @compatibility(is_backward_compatible=True)
+    def append(self, x: 'Node') -> None:
+        """
+        Insert ``x`` after this node in the list of nodes in the graph.
+        Equivalent to ``self.next.prepend(x)``
+
+        Args:
+            x (Node): The node to put after this node. Must be a member of the same graph.
+        """
+        self._next.prepend(x)
+
+    def _remove_from_list(self):
+        p, n = self._prev, self._next
+        p._next, n._prev = n, p
+
+    @property
+    def args(self) -> Tuple[Argument, ...]:
+        """
+        The tuple of arguments to this ``Node``. The interpretation of arguments
+        depends on the node's opcode. See the :class:`Node` docstring for more
+        information.
+
+        Assignment to this property is allowed. All accounting of uses and users
+        is updated automatically on assignment.
+        """
+        return self._args
+
+    @args.setter
+    def args(self, a : Tuple[Argument, ...]):
+        """
+        Set the tuple of arguments to this Node. The interpretation of arguments
+        depends on the node's opcode. See the ``fx.Graph`` docstring for more
+        information.
+        """
+        # DO NOT CALL `__update_args_kwargs` directly. The correct way to
+        # set `args` is via direct assignment, i.e. `node.args = new_args`
+        self.__update_args_kwargs(map_arg(a, lambda x: x), self._kwargs)  # type: ignore[arg-type]
+
+    @property
+    def kwargs(self) -> Dict[str, Argument]:
+        """
+        The dict of keyword arguments to this ``Node``. The interpretation of arguments
+        depends on the node's opcode. See the :class:`Node` docstring for more
+        information.
+
+        Assignment to this property is allowed. All accounting of uses and users
+        is updated automatically on assignment.
+        """
+        return self._kwargs
+
+    @kwargs.setter
+    def kwargs(self, k : Dict[str, Argument]):
+        """
+        Set the dict of kwargs to this Node. The interpretation of arguments
+        depends on the node's opcode. See the ``fx.Graph`` docstring for more
+        information.
+        """
+        # DO NOT CALL `__update_args_kwargs` directly. The correct way to
+        # set `args` is via direct assignment, i.e. `node.kwargs = new_kwargs`
+        self.__update_args_kwargs(self._args, map_arg(k, lambda x: x))  # type: ignore[arg-type]
+
+    @property
+    def all_input_nodes(self) -> List['Node']:
+        """
+        Return all Nodes that are inputs to this Node. This is equivalent to
+        iterating over ``args`` and ``kwargs`` and only collecting the values that
+        are Nodes.
+
+        Returns:
+
+            List of ``Nodes`` that appear in the ``args`` and ``kwargs`` of this
+            ``Node``, in that order.
+        """
+        return list(self._input_nodes.keys())
+
+    @compatibility(is_backward_compatible=True)
+    def update_arg(self, idx : int, arg : Argument) -> None:
+        """
+        Update an existing positional argument to contain the new value
+        ``arg``. After calling, ``self.args[idx] == arg``.
+
+        Args:
+
+            idx (int): The index into ``self.args`` of the element to update
+            arg (Argument): The new argument value to write into ``args``
+        """
+        args = list(self.args)
+        args[idx] = arg
+        self.args = tuple(args)
+
+    @compatibility(is_backward_compatible=True)
+    def insert_arg(self, idx : int, arg : Argument) -> None:
+        """
+        Insert a positional argument into the argument list at the given index.
+
+        Args:
+
+            idx (int): The index in ``self.args`` before which the new argument will be inserted.
+            arg (Argument): The new argument value to insert into ``args``
+        """
+        assert 0 <= idx <= len(self.args), "insert_arg index must be between 0 and len(self.args)"
+        args_left = self.args[:idx]
+        args_right = self.args[idx:]
+
+        self._args = args_left + (arg,) + args_right
+
+        _new_input_nodes = {}
+        map_arg(arg, _new_input_nodes.setdefault)
+
+        for new_use in _new_input_nodes.keys():
+            if new_use not in self._input_nodes:
+                self._input_nodes.setdefault(new_use)
+                new_use.users.setdefault(self)
+
+    @compatibility(is_backward_compatible=True)
+    def update_kwarg(self, key : str, arg : Argument) -> None:
+        """
+        Update an existing keyword argument to contain the new value
+        ``arg``. After calling, ``self.kwargs[key] == arg``.
+
+        Args:
+
+            key (str): The key in ``self.kwargs`` of the element to update
+            arg (Argument): The new argument value to write into ``kwargs``
+        """
+        kwargs = dict(self.kwargs)
+        kwargs[key] = arg
+        self.kwargs = kwargs
+
+    @property
+    def stack_trace(self) -> Optional[str]:
+        """
+        Return the Python stack trace that was recorded during tracing, if any.
+        When traced with fx.Tracer, this property is usually populated by
+        `Tracer.create_proxy`. To record stack traces during tracing for debug purposes,
+        set `record_stack_traces = True` on the `Tracer` instance.
+        When traced with dynamo, this property will be populated by default by
+        `OutputGraph.create_proxy`.
+
+        stack_trace would have the innermost frame at the end of the string.
+        """
+        return self.meta.get("stack_trace", None)
+
+    @stack_trace.setter
+    def stack_trace(self, trace : Optional[str]):
+        self.meta["stack_trace"] = trace
+
+    def __update_args_kwargs(self, new_args : Tuple['Argument', ...], new_kwargs : Dict[str, 'Argument']):
+        """
+        This API is internal. Do *not* call it directly.
+        """
+        self._args = new_args
+        self._kwargs = new_kwargs
+
+        for old_use in self._input_nodes.keys():
+            old_use.users.pop(self)
+
+        self._input_nodes = {}
+        map_arg(self._args, self._input_nodes.setdefault)
+        map_arg(self._kwargs, self._input_nodes.setdefault)
+
+        for new_use in self._input_nodes.keys():
+            new_use.users.setdefault(self)
+
+    def __repr__(self) -> str:
+        if self._repr_fn:
+            return self._repr_fn(self)
+        return self.name
+
+    def _pretty_print_target(self, target):
+        """
+        Make target printouts more user-friendly.
+        1) builtins will be printed as `builtins.xyz`
+        2) operators will be printed as `operator.xyz`
+        3) other callables will be printed with qualified name, e.g. torch.add
+        """
+        if isinstance(target, str):
+            return target
+        if hasattr(target, '__module__'):
+            if not hasattr(target, '__name__'):
+                # Just to be defensive, if we don't have `__name__`, get the
+                # qualname. Not sure if this happens for any members of `operator`
+                # or `builtins`. This fallback path is not as good, since e.g.
+                # things in `operator` have `_operator` as their __module__.
+                return _get_qualified_name(target)
+            if target.__module__ == 'builtins':
+                return f'builtins.{target.__name__}'
+            elif target.__module__ == '_operator':
+                return f'operator.{target.__name__}'
+        return _get_qualified_name(target)
+
+    @compatibility(is_backward_compatible=True)
+    def format_node(self,
+                    placeholder_names: Optional[List[str]] = None,
+                    maybe_return_typename: Optional[List[str]] = None) -> Optional[str]:
+        """
+        Return a descriptive string representation of ``self``.
+
+        This method can be used with no arguments as a debugging
+        utility.
+
+        This function is also used internally in the ``__str__`` method
+        of ``Graph``. Together, the strings in ``placeholder_names``
+        and ``maybe_return_typename`` make up the signature of the
+        autogenerated ``forward`` function in this Graph's surrounding
+        GraphModule. ``placeholder_names`` and ``maybe_return_typename``
+        should not be used otherwise.
+
+        Args:
+            placeholder_names: A list that will store formatted strings
+                representing the placeholders in the generated
+                ``forward`` function. Internal use only.
+            maybe_return_typename: A single-element list that will store
+                a formatted string representing the output of the
+                generated ``forward`` function. Internal use only.
+
+        Returns:
+            str: If 1) we're using ``format_node`` as an internal helper
+                in the ``__str__`` method of ``Graph``, and 2) ``self``
+                is a placeholder Node, return ``None``. Otherwise,
+                return a descriptive string representation of the
+                current Node.
+        """
+        if self.op == 'placeholder':
+            assert isinstance(self.target, str)
+            arg_str = self.target
+            arg_str += f': {_type_repr(self.type)}' if self.type else ''
+            if placeholder_names:
+                placeholder_names.append(arg_str)
+                return None
+            maybe_typename = f'{_type_repr(self.type)} ' if self.type else ''
+            default_val = '(default=' + str(self.args[0]) + ')' if self.args else ''
+            return f'%{self.name} : {maybe_typename}[num_users={len(self.users)}] = {self.op}[target={self.target}]{default_val}'
+        elif self.op == 'get_attr':
+            maybe_typename = f'{_type_repr(self.type)} ' if self.type is not None else ''
+            return f'%{self.name} : {maybe_typename}[num_users={len(self.users)}] = ' \
+                   f'{self.op}[target={self._pretty_print_target(self.target)}]'
+        elif self.op == 'output':
+            if self.type and maybe_return_typename:
+                maybe_return_typename[0] = f' -> {_type_repr(self.type)}'
+            return f'return {self.args[0]}'
+        else:
+            maybe_typename = f'{_type_repr(self.type)} ' if self.type is not None else ''
+            return f'%{self.name} : {maybe_typename}[num_users={len(self.users)}] = ' \
+                   f'{self.op}[target={self._pretty_print_target(self.target)}](' \
+                   f'args = {_format_arg(self.args)}, kwargs = {_format_arg(self.kwargs)})'
+
+    @compatibility(is_backward_compatible=True)
+    def replace_all_uses_with(self,
+                              replace_with : 'Node',
+                              delete_user_cb: Callable[['Node'], bool] = lambda user: True,
+                              *,
+                              propagate_meta=False
+                              ) -> List['Node']:
+        """
+        Replace all uses of ``self`` in the Graph with the Node ``replace_with``.
+
+        Args:
+
+            replace_with (Node): The node to replace all uses of ``self`` with.
+            delete_user_cb (Callable): Callback that is called to determine
+              whether a given user of the self node should be removed.
+            propagate_meta (bool): Whether or not to copy all properties
+              on the .meta field of the original node onto the replacement node.
+              For safety, this is only valid to do if the replacement node
+              doesn't already have an existing .meta field.
+
+        Returns:
+
+            The list of Nodes on which this change was made.
+        """
+        if propagate_meta:
+            assert len(replace_with.meta) == 0, \
+                'Called node.replace_all_uses_with(replace_with, propagate_meta=True), ' \
+                'but replace_with already has .meta keys'
+            for k, v in self.meta.items():
+                replace_with.meta[k] = v
+        to_process = list(self.users)
+        skipped = []
+        m = self.graph.owning_module
+        for use_node in to_process:
+            if not delete_user_cb(use_node):
+                skipped.append(use_node)
+                continue
+
+            def maybe_replace_node(n : Node) -> Node:
+                if n == self:
+                    return replace_with
+                else:
+                    return n
+
+            if getattr(m, "_replace_hook", None):
+                m._replace_hook(old=self, new=replace_with.name, user=use_node)
+
+            new_args = map_arg(use_node.args, maybe_replace_node)
+            new_kwargs = map_arg(use_node.kwargs, maybe_replace_node)
+            assert isinstance(new_args, tuple)
+            assert isinstance(new_kwargs, dict)
+            use_node.__update_args_kwargs(new_args, new_kwargs)
+
+        assert len(self.users) - len(skipped) == 0
+        return [n for n in to_process if n not in skipped]
+
+    @compatibility(is_backward_compatible=False)
+    def is_impure(self):
+        """
+        Returns whether this op is impure, i.e. if its op is a placeholder or
+        output, or if a call_function or call_module which is impure.
+
+        Returns:
+
+            bool: If the op is impure or not.
+        """
+        if self.op in {"placeholder", "output"}:
+            return True
+
+        # Check if an impure function.
+        if self.op == "call_function":
+            return self.target in _side_effectful_functions
+
+        # Check if an impure module.
+        if self.op == "call_module":
+            assert (
+                self.graph.owning_module is not None
+            ), "self.graph.owning_module not set for purity check"
+            target_mod = self.graph.owning_module.get_submodule(self.target)
+            assert (
+                target_mod is not None
+            ), f"Did not find expected submodule target {self.target}"
+            return getattr(target_mod, "_is_impure", False)
+
+        return False
+
+    @compatibility(is_backward_compatible=False)
+    def normalized_arguments(
+            self, root : torch.nn.Module, arg_types : Optional[Tuple[Any]] = None,
+            kwarg_types : Optional[Dict[str, Any]] = None,
+            normalize_to_only_use_kwargs : bool = False) -> Optional[ArgsKwargsPair]:
+        """
+        Returns normalized arguments to Python targets. This means that
+        `args/kwargs` will be matched up to the module/functional's
+        signature and return exclusively kwargs in positional order
+        if `normalize_to_only_use_kwargs` is true.
+        Also populates default values. Does not support positional-only
+        parameters or varargs parameters.
+
+        Supports module calls.
+
+        May require `arg_types` and `kwarg_types` in order to disambiguate overloads.
+
+        Args:
+            root (torch.nn.Module): Module upon which to resolve module targets.
+            arg_types (Optional[Tuple[Any]]): Tuple of arg types for the args
+            kwarg_types (Optional[Dict[str, Any]]): Dict of arg types for the kwargs
+            normalize_to_only_use_kwargs (bool): Whether to normalize to only use kwargs.
+
+        Returns:
+
+            Returns NamedTuple ArgsKwargsPair, or `None` if not successful.
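+
+        Example (an illustrative sketch, assuming ``node`` is a ``call_function``
+        Node for ``torch.add`` inside a traced module ``gm``; the argument types
+        are passed to disambiguate overloads)::
+
+            >>> # xdoctest: +SKIP
+            >>> pair = node.normalized_arguments(gm, arg_types=(torch.Tensor, torch.Tensor))
+            >>> pair  # an ArgsKwargsPair of (args, kwargs), or None on failure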
+        """
+        if self.op == 'call_function':
+            assert callable(self.target)
+            return normalize_function(self.target, self.args, self.kwargs, arg_types, kwarg_types, normalize_to_only_use_kwargs)  # type: ignore[arg-type]
+        elif self.op == 'call_module':
+            assert isinstance(self.target, str)
+            return normalize_module(root, self.target, self.args, self.kwargs, normalize_to_only_use_kwargs)  # type: ignore[arg-type]
+
+        return None
+
+    @compatibility(is_backward_compatible=True)
+    def replace_input_with(self, old_input: 'Node', new_input: 'Node'):
+        """
+        Loop through input nodes of ``self``, and replace all instances of
+        ``old_input`` with ``new_input``.
+
+        Args:
+
+            old_input (Node): The old input node to be replaced.
+            new_input (Node): The new input node to replace ``old_input``.
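+
+        Example (an illustrative sketch, assuming ``node`` currently consumes
+        ``old`` as one of its inputs and ``new`` is another Node in the same graph)::
+
+            >>> # xdoctest: +SKIP
+            >>> node.replace_input_with(old, new)
+            >>> assert old not in node.all_input_nodes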
+        """
+        def maybe_replace_node(n : Node) -> Node:
+            return new_input if n == old_input else n
+
+        m = self.graph.owning_module
+        if getattr(m, "_replace_hook", None):
+            m._replace_hook(old=old_input, new=new_input.name, user=self)
+
+        new_args = map_arg(self.args, maybe_replace_node)
+        new_kwargs = map_arg(self.kwargs, maybe_replace_node)
+        assert isinstance(new_args, tuple)
+        assert isinstance(new_kwargs, dict)
+        self.__update_args_kwargs(new_args, new_kwargs)
+
+    def _rename(self, candidate: str):
+        if candidate == self.name:
+            return
+        name = self.graph._graph_namespace.create_name(candidate, None)
+        self.name = name
+        self.graph._graph_namespace._rename_object(self, name)
+
+    def __setattr__(self, name: str, value: Any) -> None:
+        if name == 'name' and hasattr(self, "name"):
+            m = self.graph.owning_module
+            if getattr(m, "_replace_hook", None):
+                assert isinstance(value, str)
+                for user in self.users:
+                    m._replace_hook(old=self, new=value, user=user)
+        object.__setattr__(self, name, value)
+
+
+@compatibility(is_backward_compatible=True)
+def map_arg(a: Argument, fn: Callable[[Node], Argument]) -> Argument:
+    """
+    Apply fn to each Node appearing in a. a may be a list, tuple, slice, or dict with string keys.
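+
+    Example (an illustrative sketch, assuming ``n`` is a Node from a traced graph
+    and ``env`` maps each old Node to its replacement Node)::
+
+        >>> # xdoctest: +SKIP
+        >>> new_args = map_arg(n.args, lambda node: env[node])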
+    """
+    assert callable(fn), "torch.fx.map_arg(a, fn): fn must be a callable"
+    return map_aggregate(a, lambda x: fn(x) if isinstance(x, Node) else x)
+
+@compatibility(is_backward_compatible=True)
+def map_aggregate(a: Argument, fn: Callable[[Argument], Argument]) -> Argument:
+    """
+    Apply fn to each leaf value appearing in a. a may be a list, tuple, slice, or dict with string keys.
+    """
+    if isinstance(a, tuple):
+        t = tuple(map_aggregate(elem, fn) for elem in a)
+        # Support NamedTuple (if it has `_fields`) by repacking into original type.
+        return t if not hasattr(a, '_fields') else type(a)(*t)
+    elif isinstance(a, list):
+        return immutable_list(map_aggregate(elem, fn) for elem in a)
+    elif isinstance(a, dict):
+        return immutable_dict((k, map_aggregate(v, fn)) for k, v in a.items())
+    elif isinstance(a, slice):
+        return slice(map_aggregate(a.start, fn), map_aggregate(a.stop, fn), map_aggregate(a.step, fn))
+    else:
+        return fn(a)
diff --git a/MLPY/Lib/site-packages/torch/fx/operator_schemas.py b/MLPY/Lib/site-packages/torch/fx/operator_schemas.py
new file mode 100644
index 0000000000000000000000000000000000000000..c781fc533e67445a2ecd3ebd5627ed28eb45d2c4
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/operator_schemas.py
@@ -0,0 +1,441 @@
+import torch
+import inspect
+import numbers
+import types
+import typing
+import enum
+import warnings
+from typing import Any, Callable, Dict, List, Optional, Tuple, NamedTuple, cast, TYPE_CHECKING
+from torch._jit_internal import boolean_dispatched
+from ._compatibility import compatibility
+from torch._ops import OpOverloadPacket, OpOverload
+
+if TYPE_CHECKING:
+    from .node import Argument
+
+__all__ = ["ArgsKwargsPair", "check_for_mutable_operation", "get_signature_for_torch_op", "create_type_hint",
+           "type_matches", "normalize_function", "normalize_module"]
+
+@compatibility(is_backward_compatible=False)
+class ArgsKwargsPair(NamedTuple):
+    """
+    Simple named tuple for wrapping args/kwargs pairs.
+    """
+    args: Tuple[Any, ...]
+    kwargs: Dict[str, Any]
+
+_manual_overrides : Dict[Callable, List[inspect.Signature]] = {}
+
+def _nonzero_schemas():
+    signatures = []
+
+    def nonzero(self):
+        pass
+    signatures.append(inspect.signature(nonzero))
+
+    def nonzero(self, *, as_tuple : bool):  # type: ignore[no-redef]
+        pass
+    signatures.append(inspect.signature(nonzero))
+
+    return signatures
+
+_manual_overrides[torch.nonzero] = _nonzero_schemas()
+
+class _FakeGlobalNamespace:
+    def __getattr__(self, name):
+        if name == 'torch':
+            return torch
+        raise RuntimeError('Expected a torch namespace lookup')
+
+_type_eval_globals = {'Tensor' : torch.Tensor, 'Device' : torch.device, 'Layout' : torch.layout,
+                      'number' : numbers.Number, 'Future' : torch.jit.Future,
+                      'AnyEnumType' : enum.Enum, 'QScheme' : torch.qscheme,
+                      '__torch__': _FakeGlobalNamespace(), 'NoneType': type(None),
+                      'Storage': torch.UntypedStorage,
+                      't': typing.TypeVar('t')}
+for k in dir(typing):
+    _type_eval_globals[k] = getattr(typing, k)
+
+def _torchscript_type_to_python_type(ts_type : 'torch._C.JitType') -> Any:
+    """
+    Convert a TorchScript type to a Python type (including subtypes) via
+    eval'ing the annotation_str. _type_eval_globals sets up expressions
+    like "List" and "Future" to map to actual types (typing.List and jit.Future)
+    """
+    return eval(ts_type.annotation_str, _type_eval_globals)
+
+def _torchscript_schema_to_signature_impl(ts_schema : torch._C.FunctionSchema) -> inspect.Signature:
+    from inspect import Parameter
+    parameters : List[Parameter] = []
+    for arg in ts_schema.arguments:
+        arg_type = _torchscript_type_to_python_type(arg.type)
+        default = arg.default_value if arg.has_default_value() else Parameter.empty
+        # TODO: Figure out if this is safe. It seems like when generating the type signatures for
+        # PythonArgParser, we emit signatures with `input` instead of `self` as the first tensor
+        # argument name. Downstream, if someone converts that positional argument to a keyword
+        # argument, the name mismatch will break things, so here we're going to normalize the
+        # name to "input"
+        name = arg.name if arg.name != 'self' else 'input'
+        kind = Parameter.KEYWORD_ONLY if arg.kwarg_only else Parameter.POSITIONAL_OR_KEYWORD
+        # "from" is a keyword therefore it must be a POSITIONAL_ONLY argument
+        if name == "from":
+            assert kind == Parameter.POSITIONAL_OR_KEYWORD
+            # The ParameterKind type is an internal implementation detail of the
+            # inspect package, which makes it hard to annotate here
+            kind = Parameter.POSITIONAL_ONLY  # type: ignore[assignment]
+            # This turns all previous arguments into positional-only arguments
+            for idx, p in enumerate(parameters):
+                assert p.kind == Parameter.POSITIONAL_OR_KEYWORD
+                parameters[idx] = Parameter(name=p.name, kind=Parameter.POSITIONAL_ONLY, default=p.default, annotation=p.annotation)
+        parameters.append(Parameter(name=name, kind=kind, default=default, annotation=arg_type))
+    return_types = [_torchscript_type_to_python_type(ret.type) for ret in ts_schema.returns]
+    if len(return_types) == 0:
+        return_type = None
+    elif len(return_types) == 1:
+        return_type = return_types[0]
+    else:
+        return_type = tuple(return_types)
+
+    return inspect.Signature(parameters, return_annotation=return_type)
+
+_SCHEMA_TO_SIGNATURE_CACHE : Dict[Tuple[str, str], inspect.Signature] = {}
+
+def _torchscript_schema_to_signature(ts_schema : torch._C.FunctionSchema) -> inspect.Signature:
+    # Cached as it's called in the hot path of FakeTensor dispatch
+    cache_key = ts_schema.name, ts_schema.overload_name
+    cache_val = _SCHEMA_TO_SIGNATURE_CACHE.get(cache_key)
+    if cache_val is not None:
+        return cache_val
+
+    res = _torchscript_schema_to_signature_impl(ts_schema)
+    _SCHEMA_TO_SIGNATURE_CACHE[cache_key] = res
+    return res
+
+@compatibility(is_backward_compatible=False)
+def check_for_mutable_operation(target : Callable, args : Tuple['Argument', ...], kwargs : Dict[str, 'Argument']):
+    signatures, schemas = get_signature_for_torch_op(target, return_schemas=True)
+
+    if signatures and schemas:
+        matched_schemas = []
+
+        # Iterate through all of the schemas and record every one that the given
+        # args/kwargs successfully bind to. The mutability check below is only
+        # performed when exactly one schema matches unambiguously.
+        for candidate_signature, schema in zip(signatures, schemas):
+            try:
+                candidate_signature.bind(*args, **kwargs)
+                matched_schemas.append((candidate_signature, schema))
+            except TypeError as e:
+                continue
+
+        def throw_if_mutable(schema):
+            if schema.is_mutable:
+                raise RuntimeError(f'Tried to trace mutable operation {schema}. FX only supports functional '
+                                   f'code, so operations that mutate operands in-place (e.g. via `out` arguments) '
+                                   f'are not supported')
+
+        if len(matched_schemas) == 0:
+            # Did not match any schema. Cannot check for mutation
+            pass
+        elif len(matched_schemas) == 1:
+            # Matched exactly one schema, unambiguous
+            _, schema_to_check = matched_schemas[0]
+            throw_if_mutable(schema_to_check)
+        else:
+            # Ambiguous schema match. Since mutability checking is best effort,
+            # do nothing.
+            pass
+
+@compatibility(is_backward_compatible=False)
+def get_signature_for_torch_op(op : Callable, return_schemas : bool = False):
+    """
+    Given an operator on the `torch` namespace, return a list of `inspect.Signature`
+    objects corresponding to the overloads of that op. May return `None` if a signature
+    could not be retrieved.
+
+    Args:
+        op (Callable): An operator on the `torch` namespace to look up a signature for
+
+    Returns:
+        Optional[List[inspect.Signature]]: A list of signatures for the overloads of this
+            operator, or None if the operator signatures could not be retrieved. If
+            return_schemas=True, returns a tuple containing the optional Python signatures
+            and the optional TorchScript Function signature
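+
+    Example (an illustrative sketch)::
+
+        >>> # xdoctest: +SKIP
+        >>> signatures = get_signature_for_torch_op(torch.add)
+        >>> len(signatures)  # one inspect.Signature per overload of torch.add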
+    """
+    if isinstance(op, OpOverload):
+        schemas = [op._schema]
+    elif isinstance(op, OpOverloadPacket):
+        schemas = [getattr(op, overload)._schema for overload in op.overloads()]
+    else:
+        override = _manual_overrides.get(op)
+        if override:
+            return (override, None) if return_schemas else None
+
+        aten_fn = torch.jit._builtins._find_builtin(op)
+
+        if aten_fn is None:
+            return (None, None) if return_schemas else None
+        schemas = torch._C._jit_get_schemas_for_operator(aten_fn)
+
+    signatures = [_torchscript_schema_to_signature(schema) for schema in schemas]
+    return (signatures, schemas) if return_schemas else signatures
+
+@compatibility(is_backward_compatible=False)
+def create_type_hint(x):
+    try:
+        if isinstance(x, (list, tuple)):
+            # todo(chilli): Figure out the right way for mypy to handle this
+            if isinstance(x, list):
+                def ret_type(x):
+                    return List[x]  # type: ignore[valid-type]
+            else:
+                def ret_type(x):
+                    return Tuple[x, ...]
+            if len(x) == 0:
+                return ret_type(Any)
+            base_type = x[0]
+            for t in x:
+                if issubclass(t, base_type):
+                    continue
+                elif issubclass(base_type, t):
+                    base_type = t
+                else:
+                    return ret_type(Any)
+            return ret_type(base_type)
+    except Exception as e:
+        # We tried to create a type hint for list but failed.
+        warnings.warn(f"We were not able to successfully create type hint from the type {x}")
+    return x
+
+@compatibility(is_backward_compatible=False)
+def type_matches(signature_type : Any, argument_type : Any):
+    sig_origin_type = getattr(signature_type, '__origin__', signature_type)
+
+    if signature_type is argument_type:
+        return True
+
+    # Union types in signature. Given type needs to match one of the
+    # contained types in the Union
+    if sig_origin_type is typing.Union and signature_type != argument_type:
+        sig_contained = signature_type.__args__
+        return any(type_matches(c, argument_type) for c in sig_contained)
+
+    if signature_type is List[int] and argument_type is int:
+        # int can be promoted to List[int]
+        return True
+
+    if getattr(signature_type, '__origin__', None) in {list, List}:
+        sig_el_type = signature_type.__args__[0]
+        if not inspect.isclass(sig_el_type):
+            warnings.warn(
+                f"Does not support nested parametric types, got {signature_type}. Please file a bug.")
+            return False
+        if getattr(argument_type, '__origin__', None) in {list, List}:
+            return issubclass(argument_type.__args__[0], sig_el_type)
+
+        def is_homogeneous_tuple(t):
+            if getattr(t, "__origin__", None) not in {tuple, Tuple}:
+                return False
+            contained = t.__args__
+            if t.__args__ == ((),):  # Tuple[()].__args__ == ((),) for some reason
+                return True
+            return all((c is Ellipsis) or issubclass(c, sig_el_type) for c in contained)
+
+        # Tuple[T] is accepted for List[T] parameters
+        return is_homogeneous_tuple(argument_type)
+
+    # Dtype is an int in schemas
+    if signature_type is int and argument_type is torch.dtype:
+        return True
+
+    if signature_type is numbers.Number and argument_type in {int, float}:
+        return True
+    if inspect.isclass(argument_type) and inspect.isclass(signature_type):
+        return issubclass(argument_type, signature_type)
+
+    return False
+
+@compatibility(is_backward_compatible=False)
+def normalize_function(
+        target: Callable, args: Tuple[Any], kwargs : Optional[Dict[str, Any]] = None, arg_types : Optional[Tuple[Any]] = None,
+        kwarg_types : Optional[Dict[str, Any]] = None,
+        normalize_to_only_use_kwargs : bool = False) -> Optional[ArgsKwargsPair]:
+    """
+    Returns normalized arguments to PyTorch functions. This means that
+    `args/kwargs` will be matched up to the functional's
+    signature, and will be returned exclusively as kwargs in positional order if
+    `normalize_to_only_use_kwargs` is True.
+    Also populates default values. Does not support positional-only
+    parameters or varargs parameters (*args, **kwargs). Does not support modules.
+
+    May require `arg_types` and `kwarg_types` in order to disambiguate overloads.
+
+    Args:
+        target (Callable): Function that we are normalizing
+        args (Tuple[Any]): Tuple of args to the function
+        kwargs (Optional[Dict[str, Any]]): Dict of kwargs to the function
+        arg_types (Optional[Tuple[Any]]): Tuple of arg types for the args
+        kwarg_types (Optional[Dict[str, Any]]): Dict of arg types for the kwargs
+        normalize_to_only_use_kwargs (bool): Whether to normalize to only use kwargs.
+
+    Returns:
+
+        Returns normalized_args_and_kwargs, or `None` if not successful.
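+
+    Example (an illustrative sketch, assuming ``t1`` and ``t2`` are tensors; the
+    argument types are passed to disambiguate the overloads of ``torch.add``)::
+
+        >>> # xdoctest: +SKIP
+        >>> pair = normalize_function(torch.add, (t1, t2),
+        ...                           arg_types=(torch.Tensor, torch.Tensor),
+        ...                           normalize_to_only_use_kwargs=True)
+        >>> pair.kwargs  # expected to contain 'input', 'other' and the default 'alpha'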
+    """
+    if kwargs is None:
+        kwargs = {}
+    new_args_and_kwargs = None
+    if not isinstance(target, types.BuiltinFunctionType) and not (
+        isinstance(target, (OpOverloadPacket, OpOverload))
+    ):
+        target_for_analysis = target
+        if target in boolean_dispatched:
+            # HACK: `boolean_dispatch` as used in `torch.nn.functional` makes it so that we have
+            # a 2-way dispatch based on a boolean value. Here we check that the `true` and `false`
+            # branches of the dispatch have exactly the same signature. If they do, use the `true`
+            # branch signature for analysis. Otherwise, leave this un-normalized
+            assert not isinstance(target, str)
+            dispatched = boolean_dispatched[target]
+            if_true, if_false = dispatched['if_true'], dispatched['if_false']
+            if inspect.signature(if_true).parameters != inspect.signature(if_false).parameters:
+                return None
+            target_for_analysis = if_true
+
+        assert callable(target_for_analysis)
+        sig = inspect.signature(inspect.unwrap(target_for_analysis))
+        new_args_and_kwargs = _args_kwargs_to_normalized_args_kwargs(sig, args, kwargs, normalize_to_only_use_kwargs)
+    else:
+        assert callable(target)
+        torch_op_schemas = get_signature_for_torch_op(target)
+        matched_schemas = []
+        if torch_op_schemas:
+            # Iterate through all of the schemas until we find one that matches.
+            # If one matches, populate `new_args_and_kwargs` with the new args/kwargs
+            # values. If none matches, `new_args_and_kwargs` will be None
+            for candidate_signature in torch_op_schemas:
+                try:
+                    candidate_signature.bind(*args, **kwargs)
+                    matched_schemas.append(candidate_signature)
+                except TypeError as e:
+                    continue
+
+            if len(matched_schemas) == 0:
+                # Did not match any schema. Cannot normalize
+                pass
+            elif len(matched_schemas) == 1:
+                # Matched exactly one schema, unambiguous
+                new_args_and_kwargs = _args_kwargs_to_normalized_args_kwargs(matched_schemas[0], args, kwargs,
+                                                                             normalize_to_only_use_kwargs)
+            else:
+                if arg_types is not None or kwarg_types is not None:
+                    arg_types = arg_types if arg_types else cast(Tuple[Any], ())
+                    kwarg_types = kwarg_types if kwarg_types else {}
+                    for candidate_signature in torch_op_schemas:
+                        sig_matches = True
+                        try:
+                            bound_types = candidate_signature.bind(*arg_types, **kwarg_types)
+                            for arg_name, arg_type in bound_types.arguments.items():
+                                param = candidate_signature.parameters[arg_name]
+                                sig_matches = sig_matches and type_matches(param.annotation, arg_type)
+                        except TypeError as e:
+                            sig_matches = False
+                        if sig_matches:
+                            new_args_and_kwargs = _args_kwargs_to_normalized_args_kwargs(candidate_signature, args, kwargs,
+                                                                                         normalize_to_only_use_kwargs)
+                            break
+                else:
+                    # Matched more than one schema. In this situation, the caller must provide the types of
+                    # the arguments of the overload they expect.
+                    schema_printouts = '\n'.join(str(schema) for schema in matched_schemas)
+                    raise RuntimeError(f'Tried to normalize arguments to {torch.typename(target)} but '
+                                       f'the schema match was ambiguous! Please provide argument types to '
+                                       f'the normalize_arguments() call. Available schemas:\n{schema_printouts}')
+
+    return new_args_and_kwargs
+
+@compatibility(is_backward_compatible=False)
+def normalize_module(
+        root: torch.nn.Module, target: str, args: Tuple[Any], kwargs : Optional[Dict[str, Any]] = None,
+        normalize_to_only_use_kwargs : bool = False) -> Optional[ArgsKwargsPair]:
+    """
+    Returns normalized arguments to PyTorch modules. This means that
+    `args/kwargs` will be matched up to the signature of the module's
+    `forward()` method, and will be returned exclusively as kwargs in
+    positional order if `normalize_to_only_use_kwargs` is True.
+    Also populates default values. Does not support positional-only
+    parameters or varargs parameters (*args, **kwargs).
+
+    Args:
+        root (nn.Module): root module upon which we query modules
+        target (str): Qualified name of the submodule whose call we are normalizing
+        args (Tuple[Any]): Tuple of args to the function
+        kwargs (Optional[Dict[str, Any]]): Dict of kwargs to the function
+        normalize_to_only_use_kwargs (bool): Whether to normalize to only use kwargs.
+
+    Returns:
+
+        Returns normalized_args_and_kwargs, or `None` if not successful.
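+
+    Example (an illustrative sketch, assuming ``traced`` is a GraphModule whose
+    original module has a ``torch.nn.Linear`` submodule named ``linear`` and ``x``
+    is a suitable input tensor)::
+
+        >>> # xdoctest: +SKIP
+        >>> pair = normalize_module(traced, 'linear', (x,),
+        ...                         normalize_to_only_use_kwargs=True)
+        >>> pair.kwargs  # expected to be {'input': x}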
+    """
+    try:
+        submod = root.get_submodule(target)
+    except AttributeError as e:
+        raise RuntimeError(f"Tried to normalize node with target {target} but root did not "
+                           f"have that target!") from e
+    if hasattr(submod.__class__, '__name__'):
+        classname = submod.__class__.__name__
+        if getattr(torch.nn, classname, None) == submod.__class__:
+            sig = inspect.signature(inspect.unwrap(submod.forward))
+            if kwargs is None:
+                kwargs = {}
+            new_args_and_kwargs = _args_kwargs_to_normalized_args_kwargs(sig, args, kwargs,
+                                                                         normalize_to_only_use_kwargs)
+            return new_args_and_kwargs
+    return None
+
+def _args_kwargs_to_normalized_args_kwargs(sig : inspect.Signature, args : Tuple[Any, ...],
+                                           kwargs : Dict[str, Any],
+                                           normalize_to_only_use_kwargs : bool) -> Optional[ArgsKwargsPair]:
+    """
+    Given a call target, args, and kwargs, return the arguments normalized into
+    an ArgsKwargsPair, or None if the type signature is not supported by
+    this normalization.
+
+    Args:
+
+        sig (inspect.Signature): Signature object for the target
+        args (Tuple): Arguments that appear at the callsite for `target`
+        kwargs (Dict): Keyword arguments that appear at the callsite for `target`
+        normalize_to_only_use_kwargs (bool): Whether to normalize to only use kwargs.
+
+    Returns:
+
+        Optional[ArgsKwargsPair]: Normalized args and kwargs for `target`, or `None` if
+            this target is not supported.
+    """
+
+    # Don't currently support positional-only
+    # or varargs (*args, **kwargs) signatures
+    supported_parameter_types = {
+        inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.KEYWORD_ONLY}
+    if any(p.kind not in supported_parameter_types for p in sig.parameters.values()):
+        # Add an exception for one signature, which is common for random/uniform, i.e.:
+        # Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None
+        # `from` is a Python keyword, so functions with that signature should have
+        # positional-only args, but at the same time they could be dispatched as kwargs
+        if list(sig.parameters.keys()) != ['input', 'from', 'to', 'generator']:
+            return None
+
+    bound_args = sig.bind(*args, **kwargs)
+    bound_args.apply_defaults()
+
+    new_kwargs : Dict[str, Any] = {}
+    new_args : List[Any] = []
+    for i, param in enumerate(sig.parameters):
+        if not normalize_to_only_use_kwargs and i < len(args):
+            new_args.append(bound_args.arguments[param])
+        else:
+            new_kwargs[param] = bound_args.arguments[param]
+
+    return ArgsKwargsPair(tuple(new_args), new_kwargs)
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/__init__.py b/MLPY/Lib/site-packages/torch/fx/passes/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..36ba1c8585df0ef2821ab0c8b31d170ce2bb4d59
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/passes/__init__.py
@@ -0,0 +1,11 @@
+from . import graph_drawer
+from . import graph_manipulation
+from . import net_min_base
+from . import operator_support
+from . import param_fetch
+from . import reinplace
+from . import shape_prop
+from . import split_module
+from . import split_utils
+from . import splitter_base
+from . import tools_common
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a197d5f9245b23dce87e52f516888bb273532aa0
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/annotate_getitem_nodes.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/annotate_getitem_nodes.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0033dce8bbe76577842405651580ba0c16979340
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/annotate_getitem_nodes.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/fake_tensor_prop.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/fake_tensor_prop.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..598dfb125f4000d62ca3f52c404c65ba35262d2c
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/fake_tensor_prop.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/graph_drawer.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/graph_drawer.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9748353172c084bbc68d03b863a0a9c258c52f79
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/graph_drawer.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/graph_manipulation.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/graph_manipulation.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..02e9c23a8d246c551fcca65666cc6f211ee9d37d
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/graph_manipulation.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/net_min_base.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/net_min_base.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..50d150eb796e2779d91a78ad8bda83bed5c269ef
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/net_min_base.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/operator_support.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/operator_support.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8b8ef7232f6b6ce8e635143f5e615bfd909e3e86
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/operator_support.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/param_fetch.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/param_fetch.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2e2a985dbac76e73c69fdd6d089385a16d19f154
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/param_fetch.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/pass_manager.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/pass_manager.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..26de3f26cc6ad0626e466a091aacd1b5199bb353
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/pass_manager.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/reinplace.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/reinplace.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5cdfa4d687e3aead08cca0b271136cc715dd294f
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/reinplace.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/shape_prop.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/shape_prop.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..df9d7595ffc76c60bc3f839abf053993f353128c
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/shape_prop.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/split_module.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/split_module.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f3b72cfc2d06d6dbd5edbbc4918bed3a5ad12c64
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/split_module.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/split_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/split_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e063011b0c824b0702d865e71d4b3bde665c8ec8
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/split_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/splitter_base.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/splitter_base.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..767688822ff587bdae95b90e5ffac479b6debf6d
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/splitter_base.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/tools_common.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/tools_common.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3db16cfef9ae20a5633983b629b13d7e8f6e7db9
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/passes/__pycache__/tools_common.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/annotate_getitem_nodes.py b/MLPY/Lib/site-packages/torch/fx/passes/annotate_getitem_nodes.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a0fc6476c4a645d9211e5f27267858265b067de
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/passes/annotate_getitem_nodes.py
@@ -0,0 +1,44 @@
+import operator
+
+import torch
+
+
+def annotate_getitem_nodes(graph: torch.fx.Graph) -> None:
+    """
+    Annotate the type of getitem nodes, inferred from the type of the sequence node.
+    If the sequence node is not annotated with a type, do nothing.
+    Currently supports getitem nodes whose sequence node is a Tuple, List, or NamedTuple.
+
+    This is helpful since annotations on local names within a function are lost during FX transforms.
+    Adding back known type annotations for getitem nodes improves jit scriptability.
+
+    Args:
+        graph (Graph): The graph to be annotated
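+
+    Example (an illustrative sketch, assuming ``gm`` is a ``torch.fx.GraphModule``
+    whose graph contains ``operator.getitem`` nodes with annotated sequence nodes)::
+
+        >>> # xdoctest: +SKIP
+        >>> annotate_getitem_nodes(gm.graph)
+        >>> # getitem nodes now carry the inferred element type in their .type field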
+    """
+    for node in graph.nodes:
+        if node.target == operator.getitem:
+            sequence_node, index_node = node.args
+            if not sequence_node.type:
+                continue
+            # container types
+            if hasattr(sequence_node.type, "_name"):
+                parameterized_types = sequence_node.type.__args__
+                if sequence_node.type._name == "Tuple":
+                    if len(parameterized_types) == 2 and isinstance(
+                        parameterized_types[1], type(...)
+                    ):
+                        node.type = parameterized_types[0]
+                    else:
+                        assert len(parameterized_types) > index_node
+                        node_type = parameterized_types[index_node]
+                        node.type = node_type
+                elif sequence_node.type._name == "List":
+                    assert len(parameterized_types) == 1
+                    node.type = parameterized_types[0]
+            # NamedTuple type
+            elif hasattr(sequence_node.type, "__annotations__"):
+                if sequence_node.type == torch.Tensor:
+                    continue
+                sequence_node_field_types = sequence_node.type.__annotations__
+                field_name = sequence_node.type._fields[index_node]
+                node.type = sequence_node_field_types[field_name]
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/backends/__init__.py b/MLPY/Lib/site-packages/torch/fx/passes/backends/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/backends/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/passes/backends/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dc670b27419e4a024a6558bb88b3d4819467244c
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/passes/backends/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/backends/__pycache__/cudagraphs.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/passes/backends/__pycache__/cudagraphs.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..42dd1cb9e3edbd0f3470c4e2942a47affaa35539
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/passes/backends/__pycache__/cudagraphs.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/backends/cudagraphs.py b/MLPY/Lib/site-packages/torch/fx/passes/backends/cudagraphs.py
new file mode 100644
index 0000000000000000000000000000000000000000..40f261e41fc72cc6a1e04e7ea76e23c9da278e6b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/passes/backends/cudagraphs.py
@@ -0,0 +1,56 @@
+import torch
+from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner
+from torch.fx.passes.operator_support import OperatorSupport
+from torch.fx.passes.tools_common import CALLABLE_NODE_OPS
+from torch.fx.passes.fake_tensor_prop import FakeTensorProp
+from torch.utils import _pytree as pytree
+
+import operator
+
+class CudaGraphsSupport(OperatorSupport):
+    # TODO: why is submodules passed here
+    def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
+        if node.op not in CALLABLE_NODE_OPS:
+            return False
+
+        if node.target in [torch.ops.aten.embedding_dense_backward.default]:
+            return False
+
+        if node.target in [operator.getitem]:
+            return True
+
+        found_not_cuda = False
+
+        def meta_fk(meta):
+            return meta["val"] if "val" in meta else meta["fake_result"]
+
+        def find_not_cuda(t):
+            nonlocal found_not_cuda
+            if isinstance(t, torch.Tensor) and t.device.type != 'cuda':
+                found_not_cuda = True
+
+        for n in node.all_input_nodes:
+            pytree.tree_map_(find_not_cuda, meta_fk(n.meta))
+
+        pytree.tree_map_(find_not_cuda, meta_fk(node.meta))
+
+        # NB: factory function is accounted for because the result would be
+        # cpu or cuda
+
+        return not found_not_cuda
+
+def partition_cudagraphs(gm, inputs):
+    """
+    Partition an FX graph into sub-GraphModules that can be validly run under
+    CUDA graphs.  For a subgraph to be runnable under CUDA graphs, all of the
+    operations must involve CUDA tensors only.
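+
+    Example (an illustrative sketch, assuming ``gm`` is an FX GraphModule of the
+    model and ``inputs`` is a list of example input tensors)::
+
+        >>> # xdoctest: +SKIP
+        >>> fused = partition_cudagraphs(gm, inputs)
+        >>> # CUDA-only operations are now grouped into fused submodules of `fused`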
+    """
+
+    FakeTensorProp(gm).propagate(*inputs)
+    supported_ops = CudaGraphsSupport()
+    # TODO: single node partition may be wrong due to the pessimization
+    # from copying in and out the data.  Check in benchmarks, perhaps
+    partitioner = CapabilityBasedPartitioner(gm, supported_ops, allows_single_node_partition=True)
+    partitions = partitioner.propose_partitions()
+    fused_graph = partitioner.fuse_partitions(partitions)
+    return fused_graph
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/dialect/__init__.py b/MLPY/Lib/site-packages/torch/fx/passes/dialect/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/dialect/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/passes/dialect/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f8a3db5fa0f83c4785bdc671d8e550e61742978a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/passes/dialect/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/dialect/common/__init__.py b/MLPY/Lib/site-packages/torch/fx/passes/dialect/common/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/dialect/common/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/passes/dialect/common/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9d5a0c4074f0162ac79c57ed810104f4a0892566
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/passes/dialect/common/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/dialect/common/__pycache__/cse_pass.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/passes/dialect/common/__pycache__/cse_pass.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..52449e2652d43ccf801bf424fcc701b7d25e8221
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/passes/dialect/common/__pycache__/cse_pass.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/dialect/common/cse_pass.py b/MLPY/Lib/site-packages/torch/fx/passes/dialect/common/cse_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cbfb54f2b116d6c4b63a33f80f5dfadf3af20a5
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/passes/dialect/common/cse_pass.py
@@ -0,0 +1,112 @@
+from typing import Dict, Tuple, Any
+
+import torch
+from torch.fx.passes.infra.pass_base import PassBase, PassResult
+from torch.utils._pytree import tree_flatten
+
+from torch.fx import GraphModule, Graph
+from torch.fx import Node
+
+aten = torch.ops.aten
+
+
+# stateful ops are banned from CSE
+rand_ops = {aten.dropout, aten._fused_dropout, aten._standard_gamma, aten.bernoulli, aten.multinomial, aten.native_dropout, aten.normal, aten.poisson, aten.binomial, aten.rrelu, aten.rand_like, aten.rand, aten.randint, aten.randn, aten.randperm}  # noqa: E501,B950
+
+inplace_ops = {aten.add_, aten.sub_, aten.mul_, aten.div_, aten.pow_, aten.lerp_, aten.relu_, aten.sigmoid_, aten.tanh_}  # noqa: E501
+
+
+@torch.fx._compatibility.compatibility(is_backward_compatible=False)
+def get_CSE_banned_ops():
+    return rand_ops.union(inplace_ops)
+
+
+@torch.fx._compatibility.compatibility(is_backward_compatible=False)
+class CSEPass(PassBase):
+
+    def __init__(self, banned_ops=None):
+        """
+        This version of the CSE pass aims to be dialect agnostic; it is implemented purely based on the connectivity between fx.Node instances.
+
+        For functional dialects, users only need to specify the random ops in the ban list.
+
+        Warning: the CSE pass cannot be safely applied to an FX graph in non-functional dialects.
+        If your dialect contains stateful operators, please customize the banned_ops.
+
+        """
+        if banned_ops is None:
+            banned_ops = set()
+        self.banned_ops = banned_ops
+        super().__init__()
+
+    def call(self, graph_module: GraphModule) -> PassResult:
+        """
+        Return a new copy of torch.fx.GraphModule with CSE applied to the input graph
+
+        Example usage:
+
+        from torch.fx.experimental.proxy_tensor import make_fx
+        def f(a):
+            b = a * a
+            c = a * a
+            return b+c
+
+        p = CSEPass()
+        traced_graph = make_fx(f)(torch.tensor(1))
+        print(traced_graph)
+        result = p(traced_graph)
+        print(result.graph_module)
+        """
+        def get_aten_target(node):
+            if hasattr(node.target, 'overloadpacket'):
+                return node.target.overloadpacket
+            return node.target
+
+        modified = False
+        new_graph = Graph()
+        env: Dict[Node, Node] = {}  # map from node in the old graph to node in the new graph
+        hash_env: Dict[Tuple[torch._ops.OpOverload, int], Node] = {}  # map from hash to a node in the new graph
+        token_map: Dict[Tuple[torch._ops.OpOverload, int], Dict[str, Any]] = {}  # map from hash to token
+        for n in graph_module.graph.nodes:
+            # The placeholder, output, and get_attr nodes are copied to the new graph without change;
+            # nodes whose target is in banned_ops (e.g. random operations) are also never CSE'd away
+            if n.op == 'placeholder' or n.op == 'output' or n.op == 'get_attr' or get_aten_target(n) in self.banned_ops:
+                new_node = new_graph.node_copy(n, lambda x: env[x])
+                env[n] = new_node
+            else:  # n.op == 'call_function', should never see n.op == 'call_module' or 'call_method'
+                # substitute args and kwargs members to their mapping in env if exists
+                # specs can be used to reconstruct nested list/dictionaries
+                def substitute(arg_list):
+                    arg_list, spec = tree_flatten(arg_list)
+                    for i in range(len(arg_list)):
+                        v = arg_list[i]
+                        if isinstance(v, Node) and v in env:
+                            arg_list[i] = env[v]
+                    return tuple(arg_list), spec
+                args, args_spec = substitute(n.args)
+                kwargs, kwargs_spec = substitute(n.kwargs)
+
+                # each token corresponds to a unique node
+                # nodes with the same token can be substituted
+                token = {"target": n.target, "args": args, "args_spec": args_spec,
+                         "kwargs": kwargs, "kwargs_spec": kwargs_spec}
+
+                # hash substituted args to a number, do not hash specs because specs are not hashable
+                hash_arg = hash((args, kwargs))
+                hash_val = (n.target, hash_arg)
+
+                # check if a node has a substitute and can be eliminated
+                hash_val_in_hash_env = hash_val in hash_env
+                if hash_val_in_hash_env and token_map[hash_val] == token:
+                    modified = True  # substitution happens and the graph is modified
+                    env[n] = hash_env[hash_val]
+                    continue
+
+                new_node = new_graph.node_copy(n, lambda x: env[x])
+                env[n] = new_node
+                if not hash_val_in_hash_env:
+                    hash_env[hash_val] = new_node
+                    token_map[hash_val] = token
+
+        csed_gm = GraphModule(graph_module, new_graph)
+        return PassResult(csed_gm, modified)
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/fake_tensor_prop.py b/MLPY/Lib/site-packages/torch/fx/passes/fake_tensor_prop.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a12caf9e9b6c45f0c8255de8d971c517203cfab
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/passes/fake_tensor_prop.py
@@ -0,0 +1,73 @@
+from typing import Optional
+
+import torch.fx
+from torch.fx import Node
+from torch.fx._compatibility import compatibility
+from torch._subclasses.fake_tensor import FakeTensorMode, FakeTensor
+from torch.fx.experimental.proxy_tensor import py_sym_types, snapshot_fake
+from torch.fx.node import map_aggregate
+
+__all__ = ['FakeTensorProp']
+
+@compatibility(is_backward_compatible=False)
+class FakeTensorProp(torch.fx.Interpreter):
+    """
+    Execute an FX graph Node-by-Node and record a fake tensor representing
+    the metadata for the node.  Unlike ShapeProp, (1) this propagation
+    is cheap--it does the propagation with meta tensors which do not actually
+    store data, and (2) the fake tensors have much more fine-grained information,
+    e.g., they have accurate alias information that can be consulted by looking
+    at the storages.
+
+    Args:
+         module (GraphModule): The module to be executed
+         mode (Optional[FakeTensorMode]): The dispatch mode used to execute computation indicated by each FX Node.
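+
+    Example (an illustrative sketch, assuming ``gm`` is a traced ``GraphModule``
+    and ``x`` is a representative input tensor)::
+
+        >>> # xdoctest: +SKIP
+        >>> FakeTensorProp(gm).propagate(x)
+        >>> # each node in gm.graph now records a fake value in node.meta['val']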
+    """
+    def __init__(self, module: torch.fx.GraphModule, mode: Optional[FakeTensorMode] = None):
+        super().__init__(module)
+        if mode is None:
+            mode = FakeTensorMode()
+        self._mode = mode
+
+    def run_node(self, n: Node):
+        import sympy
+        from torch.fx.experimental.symbolic_shapes import free_unbacked_symbols
+
+        result = super().run_node(n)
+        sym = None
+        if (
+            'val' in n.meta and
+            isinstance(v := n.meta['val'], torch.SymInt) and
+            isinstance(v.node.expr, sympy.Symbol) and free_unbacked_symbols(v)
+        ):
+            sym = v
+
+        def extract_val(obj):
+            if isinstance(obj, FakeTensor):
+                return snapshot_fake(obj)
+            elif isinstance(obj, torch.Tensor):
+                # TODO: How is it possible that we get a non fake tensor?  We
+                # should be running under the mode...
+                return snapshot_fake(self._mode.from_tensor(obj, static_shapes=True))
+            elif isinstance(obj, py_sym_types):
+                return obj
+            else:
+                return None
+
+        meta = map_aggregate(result, extract_val)
+        if meta is not None:
+            n.meta['val'] = meta
+            if sym is not None:
+                torch._check(meta == v)
+        return result
+
+    def propagate(self, *args):
+        fake_args = [
+            self._mode.from_tensor(a) if isinstance(a, torch.Tensor) else a
+            for a in args
+        ]
+        return self.propagate_dont_convert_inputs(*fake_args)
+
+    def propagate_dont_convert_inputs(self, *args):
+        with self._mode:
+            return super().run(*args)
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/graph_drawer.py b/MLPY/Lib/site-packages/torch/fx/passes/graph_drawer.py
new file mode 100644
index 0000000000000000000000000000000000000000..afe821f8dd7189c22b0f673f6dee79b06b85185f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/passes/graph_drawer.py
@@ -0,0 +1,421 @@
+
+import hashlib
+import torch
+import torch.fx
+from typing import Any, Dict, Optional, TYPE_CHECKING
+from torch.fx.node import _get_qualified_name, _format_arg
+from torch.fx.graph import _parse_stack_trace
+from torch.fx.passes.shape_prop import TensorMetadata
+from torch.fx._compatibility import compatibility
+from itertools import chain
+
+__all__ = ['FxGraphDrawer']
+try:
+    import pydot
+    HAS_PYDOT = True
+except ImportError:
+    HAS_PYDOT = False
+
+_COLOR_MAP = {
+    "placeholder": '"AliceBlue"',
+    "call_module": "LemonChiffon1",
+    "get_param": "Yellow2",
+    "get_attr": "LightGrey",
+    "output": "PowderBlue",
+}
+
+_HASH_COLOR_MAP = [
+    "CadetBlue1",
+    "Coral",
+    "DarkOliveGreen1",
+    "DarkSeaGreen1",
+    "GhostWhite",
+    "Khaki1",
+    "LavenderBlush1",
+    "LightSkyBlue",
+    "MistyRose1",
+    "MistyRose2",
+    "PaleTurquoise2",
+    "PeachPuff1",
+    "Salmon",
+    "Thistle1",
+    "Thistle3",
+    "Wheat1",
+]
+
+_WEIGHT_TEMPLATE = {
+    "fillcolor": "Salmon",
+    "style": '"filled,rounded"',
+    "fontcolor": "#000000",
+}
+
+if HAS_PYDOT:
+    @compatibility(is_backward_compatible=False)
+    class FxGraphDrawer:
+        """
+        Visualize a torch.fx.Graph with graphviz
+        Basic usage:
+            g = FxGraphDrawer(symbolic_traced, "resnet18")
+            g.get_dot_graph().write_svg("a.svg")
+        """
+
+        def __init__(
+            self,
+            graph_module: torch.fx.GraphModule,
+            name: str,
+            ignore_getattr: bool = False,
+            ignore_parameters_and_buffers: bool = False,
+            skip_node_names_in_args: bool = True,
+            parse_stack_trace: bool = False,
+            dot_graph_shape: Optional[str] = None,
+        ):
+            self._name = name
+            self.dot_graph_shape = (
+                dot_graph_shape if dot_graph_shape is not None else "record"
+            )
+            _WEIGHT_TEMPLATE["shape"] = self.dot_graph_shape
+
+            self._dot_graphs = {
+                name: self._to_dot(
+                    graph_module, name, ignore_getattr, ignore_parameters_and_buffers, skip_node_names_in_args, parse_stack_trace
+                )
+            }
+
+            for node in graph_module.graph.nodes:
+                if node.op != "call_module":
+                    continue
+
+                leaf_node = self._get_leaf_node(graph_module, node)
+
+                if not isinstance(leaf_node, torch.fx.GraphModule):
+                    continue
+
+
+                self._dot_graphs[f"{name}_{node.target}"] = self._to_dot(
+                    leaf_node,
+                    f"{name}_{node.target}",
+                    ignore_getattr,
+                    ignore_parameters_and_buffers,
+                    skip_node_names_in_args,
+                    parse_stack_trace,
+                )
+
+        def get_dot_graph(self, submod_name=None) -> pydot.Dot:
+            """
+            Visualize a torch.fx.Graph with graphviz
+            Example:
+                >>> # xdoctest: +REQUIRES(module:pydot)
+                >>> # define module
+                >>> class MyModule(torch.nn.Module):
+                >>>     def __init__(self):
+                >>>         super().__init__()
+                >>>         self.linear = torch.nn.Linear(4, 5)
+                >>>     def forward(self, x):
+                >>>         return self.linear(x).clamp(min=0.0, max=1.0)
+                >>> module = MyModule()
+                >>> # trace the module
+                >>> symbolic_traced = torch.fx.symbolic_trace(module)
+                >>> # setup output file
+                >>> import ubelt as ub
+                >>> dpath = ub.Path.appdir('torch/tests/FxGraphDrawer').ensuredir()
+                >>> fpath = dpath / 'linear.svg'
+                >>> # draw the graph
+                >>> g = FxGraphDrawer(symbolic_traced, "linear")
+                >>> g.get_dot_graph().write_svg(fpath)
+            """
+            if submod_name is None:
+                return self.get_main_dot_graph()
+            else:
+                return self.get_submod_dot_graph(submod_name)
+
+        def get_main_dot_graph(self) -> pydot.Dot:
+            return self._dot_graphs[self._name]
+
+        def get_submod_dot_graph(self, submod_name) -> pydot.Dot:
+            return self._dot_graphs[f"{self._name}_{submod_name}"]
+
+        def get_all_dot_graphs(self) -> Dict[str, pydot.Dot]:
+            return self._dot_graphs
+
+        def _get_node_style(self, node: torch.fx.Node) -> Dict[str, str]:
+
+            template = {
+                "shape": self.dot_graph_shape,
+                "fillcolor": "#CAFFE3",
+                "style": '"filled,rounded"',
+                "fontcolor": "#000000",
+            }
+            if node.op in _COLOR_MAP:
+                template["fillcolor"] = _COLOR_MAP[node.op]
+            else:
+                # Use a random color for each node; based on its name so it's stable.
+                target_name = node._pretty_print_target(node.target)
+                target_hash = int(hashlib.md5(target_name.encode()).hexdigest()[:8], 16)
+                template["fillcolor"] = _HASH_COLOR_MAP[target_hash % len(_HASH_COLOR_MAP)]
+            return template
+
+        def _get_leaf_node(
+            self, module: torch.nn.Module, node: torch.fx.Node
+        ) -> torch.nn.Module:
+            py_obj = module
+            assert isinstance(node.target, str)
+            atoms = node.target.split(".")
+            for atom in atoms:
+                if not hasattr(py_obj, atom):
+                    raise RuntimeError(
+                        str(py_obj) + " does not have attribute " + atom + "!"
+                    )
+                py_obj = getattr(py_obj, atom)
+            return py_obj
+
+        def _typename(self, target: Any) -> str:
+            if isinstance(target, torch.nn.Module):
+                ret = torch.typename(target)
+            elif isinstance(target, str):
+                ret = target
+            else:
+                ret = _get_qualified_name(target)
+
+            # Escape "{" and "}" to prevent dot files like:
+            # https://gist.github.com/SungMinCho/1a017aab662c75d805c5954d62c5aabc
+            # which triggers `Error: bad label format (...)` from dot
+            return ret.replace("{", r"\{").replace("}", r"\}")
+
+        # Shorten the file path to avoid drawing long boxes,
+        # e.g. the full path '/home/weif/pytorch/test.py'
+        # becomes the short path 'pytorch/test.py'
+        def _shorten_file_name(
+            self,
+            full_file_name: str,
+            truncate_to_last_n: int = 2,
+        ):
+            splits = full_file_name.split('/')
+            if len(splits) >= truncate_to_last_n:
+                return '/'.join(splits[-truncate_to_last_n:])
+            return full_file_name
+
+
+        def _get_node_label(
+            self,
+            module: torch.fx.GraphModule,
+            node: torch.fx.Node,
+            skip_node_names_in_args: bool,
+            parse_stack_trace: bool,
+        ) -> str:
+            def _get_str_for_args_kwargs(arg):
+                if isinstance(arg, tuple):
+                    prefix, suffix = r"|args=(\l", r",\n)\l"
+                    arg_strs_list = [_format_arg(a, max_list_len=8) for a in arg]
+                elif isinstance(arg, dict):
+                    prefix, suffix = r"|kwargs={\l", r",\n}\l"
+                    arg_strs_list = [
+                        f"{k}: {_format_arg(v, max_list_len=8)}"
+                        for k, v in arg.items()
+                    ]
+                else:  # Fall back to nothing in unexpected case.
+                    return ""
+
+                # Strip out node names if requested.
+                if skip_node_names_in_args:
+                    arg_strs_list = [a for a in arg_strs_list if "%" not in a]
+                if len(arg_strs_list) == 0:
+                    return ""
+                arg_strs = prefix + r",\n".join(arg_strs_list) + suffix
+                if len(arg_strs_list) == 1:
+                    arg_strs = arg_strs.replace(r"\l", "").replace(r"\n", "")
+                return arg_strs.replace("{", r"\{").replace("}", r"\}")
+
+
+            label = "{" + f"name=%{node.name}|op_code={node.op}\n"
+
+            if node.op == "call_module":
+                leaf_module = self._get_leaf_node(module, node)
+                label += r"\n" + self._typename(leaf_module) + r"\n|"
+                extra = ""
+                if hasattr(leaf_module, "__constants__"):
+                    extra = r"\n".join(
+                        [f"{c}: {getattr(leaf_module, c)}" for c in leaf_module.__constants__]  # type: ignore[union-attr]
+                    )
+                label += extra + r"\n"
+            else:
+                label += f"|target={self._typename(node.target)}" + r"\n"
+                if len(node.args) > 0:
+                    label += _get_str_for_args_kwargs(node.args)
+                if len(node.kwargs) > 0:
+                    label += _get_str_for_args_kwargs(node.kwargs)
+                label += f"|num_users={len(node.users)}" + r"\n"
+
+            tensor_meta = node.meta.get('tensor_meta')
+            label += self._tensor_meta_to_label(tensor_meta)
+
+            # for original fx graph
+            # print buf=buf0, n_origin=6
+            buf_meta = node.meta.get('buf_meta', None)
+            if buf_meta is not None:
+                label += f"|buf={buf_meta.name}" + r"\n"
+                label += f"|n_origin={buf_meta.n_origin}" + r"\n"
+
+            # for original fx graph
+            # print file:lineno code
+            if parse_stack_trace and node.stack_trace is not None:
+                parsed_stack_trace = _parse_stack_trace(node.stack_trace)
+                fname = self._shorten_file_name(parsed_stack_trace.file)
+                label += f"|file={fname}:{parsed_stack_trace.lineno} {parsed_stack_trace.code}" + r"\n"
+
+
+            return label + "}"
+
+        def _tensor_meta_to_label(self, tm) -> str:
+            if tm is None:
+                return ""
+            elif isinstance(tm, TensorMetadata):
+                return self._stringify_tensor_meta(tm)
+            elif isinstance(tm, list):
+                result = ""
+                for item in tm:
+                    result += self._tensor_meta_to_label(item)
+                return result
+            elif isinstance(tm, dict):
+                result = ""
+                for v in tm.values():
+                    result += self._tensor_meta_to_label(v)
+                return result
+            elif isinstance(tm, tuple):
+                result = ""
+                for item in tm:
+                    result += self._tensor_meta_to_label(item)
+                return result
+            else:
+                raise RuntimeError(f"Unsupported tensor meta type {type(tm)}")
+
+        def _stringify_tensor_meta(self, tm: TensorMetadata) -> str:
+            result = ""
+            if not hasattr(tm, "dtype"):
+                print("tm", tm)
+            result += "|" + "dtype" + "=" + str(tm.dtype) + r"\n"
+            result += "|" + "shape" + "=" + str(tuple(tm.shape)) + r"\n"
+            result += "|" + "requires_grad" + "=" + str(tm.requires_grad) + r"\n"
+            result += "|" + "stride" + "=" + str(tm.stride) + r"\n"
+            if tm.is_quantized:
+                assert tm.qparams is not None
+                assert "qscheme" in tm.qparams
+                qscheme = tm.qparams["qscheme"]
+                if qscheme in {
+                        torch.per_tensor_affine,
+                        torch.per_tensor_symmetric,
+                }:
+                    result += "|" + "q_scale" + "=" + str(tm.qparams["scale"]) + r"\n"
+                    result += "|" + "q_zero_point" + "=" + str(tm.qparams["zero_point"]) + r"\n"
+                elif qscheme in {
+                        torch.per_channel_affine,
+                        torch.per_channel_symmetric,
+                        torch.per_channel_affine_float_qparams,
+                }:
+                    result += "|" + "q_per_channel_scale" + "=" + str(tm.qparams["scale"]) + r"\n"
+                    result += "|" + "q_per_channel_zero_point" + "=" + str(tm.qparams["zero_point"]) + r"\n"
+                    result += "|" + "q_per_channel_axis" + "=" + str(tm.qparams["axis"]) + r"\n"
+                else:
+                    raise RuntimeError(f"Unsupported qscheme: {qscheme}")
+                result += "|" + "qscheme" + "=" + str(tm.qparams["qscheme"]) + r"\n"
+            return result
+
+        def _get_tensor_label(self, t: torch.Tensor) -> str:
+            return str(t.dtype) + str(list(t.shape)) + r"\n"
+
+        # when parse_stack_trace=True
+        # print file:lineno code
+        def _to_dot(
+            self,
+            graph_module: torch.fx.GraphModule,
+            name: str,
+            ignore_getattr: bool,
+            ignore_parameters_and_buffers: bool,
+            skip_node_names_in_args: bool,
+            parse_stack_trace: bool,
+        ) -> pydot.Dot:
+            """
+            Actual interface to visualize a fx.Graph. Note that it takes in the GraphModule instead of the Graph.
+            If ignore_parameters_and_buffers is True, the parameters and buffers
+            created with the module will not be added as nodes and edges.
+            """
+
+            # "TB" means top-to-bottom rank direction in layout
+            dot_graph = pydot.Dot(name, rankdir="TB")
+
+
+            buf_name_to_subgraph = {}
+
+            for node in graph_module.graph.nodes:
+                if ignore_getattr and node.op == "get_attr":
+                    continue
+
+                style = self._get_node_style(node)
+                dot_node = pydot.Node(
+                    node.name, label=self._get_node_label(graph_module, node, skip_node_names_in_args, parse_stack_trace), **style
+                )
+
+                current_graph = dot_graph
+
+                buf_meta = node.meta.get('buf_meta', None)
+                if buf_meta is not None and buf_meta.n_origin > 1:
+                    buf_name = buf_meta.name
+                    if buf_name not in buf_name_to_subgraph:
+                        buf_name_to_subgraph[buf_name] = pydot.Cluster(buf_name, label=buf_name)
+                    current_graph = buf_name_to_subgraph.get(buf_name)
+
+                current_graph.add_node(dot_node)
+
+                def get_module_params_or_buffers():
+                    for pname, ptensor in chain(
+                        leaf_module.named_parameters(), leaf_module.named_buffers()
+                    ):
+                        pname1 = node.name + "." + pname
+                        label1 = (
+                            pname1 + "|op_code=get_" + "parameter"
+                            if isinstance(ptensor, torch.nn.Parameter)
+                            else "buffer" + r"\l"
+                        )
+                        dot_w_node = pydot.Node(
+                            pname1,
+                            label="{" + label1 + self._get_tensor_label(ptensor) + "}",
+                            **_WEIGHT_TEMPLATE,
+                        )
+                        dot_graph.add_node(dot_w_node)
+                        dot_graph.add_edge(pydot.Edge(pname1, node.name))
+
+                if node.op == "call_module":
+                    leaf_module = self._get_leaf_node(graph_module, node)
+
+                    if not ignore_parameters_and_buffers and not isinstance(leaf_module, torch.fx.GraphModule):
+                        get_module_params_or_buffers()
+
+            for subgraph in buf_name_to_subgraph.values():
+                subgraph.set('color', 'royalblue')
+                subgraph.set('penwidth', '2')
+                dot_graph.add_subgraph(subgraph)
+
+            for node in graph_module.graph.nodes:
+                if ignore_getattr and node.op == "get_attr":
+                    continue
+
+                for user in node.users:
+                    dot_graph.add_edge(pydot.Edge(node.name, user.name))
+
+            return dot_graph
+
+else:
+    if not TYPE_CHECKING:
+        @compatibility(is_backward_compatible=False)
+        class FxGraphDrawer:
+            def __init__(
+                self,
+                graph_module: torch.fx.GraphModule,
+                name: str,
+                ignore_getattr: bool = False,
+                ignore_parameters_and_buffers: bool = False,
+                skip_node_names_in_args: bool = True,
+                parse_stack_trace: bool = False,
+                dot_graph_shape: Optional[str] = None,
+            ):
+                raise RuntimeError('FxGraphDrawer requires the pydot package to be installed. Please install '
+                                   'pydot through your favorite Python package manager.')
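+
+
+# ---------------------------------------------------------------------------
+# Editor's note: a minimal, hedged usage sketch, not part of the upstream
+# module. It assumes pydot is installed and that FxGraphDrawer exposes the
+# `get_dot_graph()` accessor defined earlier in this file; `_TinyNet` is a
+# hypothetical module used purely for illustration and nothing here runs at
+# import time.
+def _example_draw_fx_graph() -> str:
+    import torch
+    import torch.fx
+
+    class _TinyNet(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.linear = torch.nn.Linear(4, 2)
+
+        def forward(self, x):
+            return torch.relu(self.linear(x))
+
+    gm = torch.fx.symbolic_trace(_TinyNet())
+    drawer = FxGraphDrawer(gm, "tiny_net")
+    # Return the DOT source text; callers could also render it, e.g. with
+    # pydot's create_svg().
+    return drawer.get_dot_graph().to_string()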
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/graph_manipulation.py b/MLPY/Lib/site-packages/torch/fx/passes/graph_manipulation.py
new file mode 100644
index 0000000000000000000000000000000000000000..7bf16efebf644937fdef7ce7dd8f504a210da134
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/passes/graph_manipulation.py
@@ -0,0 +1,110 @@
+from typing import Any, Dict, List, NamedTuple, Optional
+
+import torch
+from torch.fx._compatibility import compatibility
+from torch.fx.graph import Graph
+from torch.fx.graph_module import GraphModule
+from torch.fx.node import (
+    map_arg,
+    Node,
+    Target,
+)
+from torch.fx.passes.shape_prop import ShapeProp
+
+__all__ = ['replace_target_nodes_with', 'size_bytes', 'get_size_of_all_nodes', 'get_tensor_meta',
+           'get_size_of_node']
+
+@compatibility(is_backward_compatible=False)
+def replace_target_nodes_with(
+    fx_module: GraphModule,
+    old_op: str,
+    old_target: Target,
+    new_op: str,
+    new_target: Target,
+):
+    """Modifies all nodes in fx_module.graph.nodes which match the specified op code and target,
+    and updates them to match the new op code and target"""
+    new_graph = Graph()
+    val_map: Dict[Node, Node] = {}
+    for node in fx_module.graph.nodes:
+        if node.op == old_op and node.target == old_target:
+            args = map_arg(node.args, lambda n: val_map[n])
+            kwargs = map_arg(node.kwargs, lambda n: val_map[n])
+            assert isinstance(args, tuple)
+            assert isinstance(kwargs, dict)
+            val_map[node] = new_graph.create_node(
+                new_op, new_target, args, kwargs, node.name
+            )
+        else:
+            val_map[node] = new_graph.node_copy(node, lambda n: val_map[n])
+    fx_module.graph = new_graph
+
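+
+# Editor's illustrative sketch (hedged, not part of the upstream API): shows
+# how replace_target_nodes_with might be used to rewrite every call_function
+# node targeting torch.add into one targeting torch.mul. `_AddNet` is a
+# hypothetical toy module.
+def _example_replace_add_with_mul() -> GraphModule:
+    import torch.fx
+
+    class _AddNet(torch.nn.Module):
+        def forward(self, x, y):
+            return torch.add(x, y)
+
+    gm = torch.fx.symbolic_trace(_AddNet())
+    # Rewrite add -> mul; the helper above installs the rewritten graph on gm.
+    replace_target_nodes_with(gm, "call_function", torch.add, "call_function", torch.mul)
+    gm.recompile()
+    return gm
+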
+
+@compatibility(is_backward_compatible=False)
+class size_bytes(NamedTuple):
+    output_size: int
+    total_size: int
+
+
+@compatibility(is_backward_compatible=False)
+def get_size_of_all_nodes(
+    fx_module: GraphModule, args: Optional[List[torch.Tensor]] = None
+) -> None:
+    """Given an FX graph module, update each node with its total size (weights + bias + output)
+    and its output size (output only). For a non-module node, the total size is just the output size.
+    The sizes are stored on each node as ``node.size_bytes``; nothing is returned."""
+    if args is not None:
+        # Mark shape and dtype for each node (node.shape and node.dtype)
+        ShapeProp(fx_module).propagate(*args)
+    # Calculate the total size of the whole fx graph
+    total_size_of_graph = 0.0
+    for node in fx_module.graph.nodes:
+        if node.op == "output":
+            break
+        node.size_bytes = get_size_of_node(fx_module, node)
+    return
+
+
+@compatibility(is_backward_compatible=False)
+def get_tensor_meta(node: Node) -> Any:
+    tensor_meta = node.meta.get("tensor_meta")
+
+    if not tensor_meta:
+        raise RuntimeError(
+            f"Node {node} has no tensor metadata associated with it! "
+            f"Check that shape propagation has run."
+        )
+
+    return tensor_meta
+
+
+@compatibility(is_backward_compatible=False)
+def get_size_of_node(fx_module: GraphModule, node: Node) -> size_bytes:
+    """Given a node with node.dtype and node.shape, return its total size and its output size.
+    total_size = weights + bias + output_size
+    """
+    # Total num of elements
+    total_num_of_elems = 0
+    # For a module, consider all parameters
+    if node.op == "call_module":
+        submodule_dict = dict(fx_module.named_modules())
+        submodule = submodule_dict[node.target]
+        parameters = submodule.named_parameters()
+    # named_parameters() yields (name, parameter) pairs
+        for name, p in parameters:
+            total_num_of_elems += p.numel()
+    # Don't forget the output size
+    # node.shape is the shape of this node's output
+    tensor_meta = get_tensor_meta(node)
+    output_elem = tensor_meta.shape.numel()
+    total_num_of_elems += output_elem
+    # Assume for now if it's quantized then it's qint8 or quint8
+    if tensor_meta.is_quantized:
+        size_per_elem_bytes = torch._empty_affine_quantized(
+            [], dtype=tensor_meta.dtype
+        ).element_size()
+    else:
+        size_per_elem_bytes = torch.tensor([], dtype=tensor_meta.dtype).element_size()
+    total_size = size_per_elem_bytes * total_num_of_elems
+    output_size = size_per_elem_bytes * output_elem
+    return size_bytes(output_size, total_size)
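+
+
+# Editor's illustrative sketch (hedged, not part of the upstream API): shows
+# how get_size_of_all_nodes might be used. `_LinearNet` is a hypothetical toy
+# module and the sample input shape is arbitrary.
+def _example_node_sizes() -> Dict[str, size_bytes]:
+    import torch.fx
+
+    class _LinearNet(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.fc = torch.nn.Linear(3, 3)
+
+        def forward(self, x):
+            return self.fc(x)
+
+    gm = torch.fx.symbolic_trace(_LinearNet())
+    # Runs ShapeProp and attaches `size_bytes` to every non-output node.
+    get_size_of_all_nodes(gm, [torch.randn(2, 3)])
+    return {n.name: n.size_bytes for n in gm.graph.nodes if n.op != "output"}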
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/infra/__init__.py b/MLPY/Lib/site-packages/torch/fx/passes/infra/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6398778292d887c23e3c69c4eb1f75fd9c516d2e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/passes/infra/__init__.py
@@ -0,0 +1,2 @@
+
+from . import pass_manager
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/infra/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/passes/infra/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f51957750e851408b2f6bd0ccdcff2da6c430f4b
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/passes/infra/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/infra/__pycache__/partitioner.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/passes/infra/__pycache__/partitioner.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b7aa121543c091945f67c8a2c1fb67440147c966
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/passes/infra/__pycache__/partitioner.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/infra/__pycache__/pass_base.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/passes/infra/__pycache__/pass_base.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3012f7811617112f42077f55b5b0d4c0a4932616
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/passes/infra/__pycache__/pass_base.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/infra/__pycache__/pass_manager.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/passes/infra/__pycache__/pass_manager.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..37bb9842130a3f019142b1ce74fa8913aa84398f
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/passes/infra/__pycache__/pass_manager.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/infra/partitioner.py b/MLPY/Lib/site-packages/torch/fx/passes/infra/partitioner.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc0093618f5833874b529bbea16c45138c400491
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/passes/infra/partitioner.py
@@ -0,0 +1,329 @@
+from torch.fx.passes.utils.fuser_utils import fuse_by_partitions
+import collections
+import itertools
+import logging
+
+from copy import copy
+from typing import Dict, Iterable, List, Optional, Sequence, Set
+
+from torch.fx.graph_module import GraphModule
+from torch.fx.node import Node, _get_qualified_name
+from torch.fx.passes.operator_support import OperatorSupportBase
+
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.WARNING)
+
+class Partition:
+    def __init__(self, id: Optional[int] = None, nodes: Optional[Iterable[Node]] = None):
+        self.id = id
+        self.nodes: Set[Node] = set(nodes) if nodes is not None else set()
+
+    def __repr__(self) -> str:
+        return str(self.nodes)
+
+    def add_node(self, node: Node):
+        self.nodes.add(node)
+
+    def remove_node(self, node: Node):
+        self.nodes.remove(node)
+
+    def size(self):
+        return len(self.nodes)
+
+class _DependencyViewer:
+    def __init__(self, graph_module: GraphModule):
+        self.upstreams = collections.defaultdict(set)
+        self.downstreams = collections.defaultdict(set)
+
+        for node in graph_module.graph.nodes:
+            for input_node in node.all_input_nodes:
+                # add input_node and input_node's upstream dependency
+                self.upstreams[node].add(input_node)
+                self.upstreams[node].update(self.upstreams[input_node])
+
+        for node in reversed(graph_module.graph.nodes):
+            for output_node in node.users:
+                # add output_node and output_node's downstream dependency
+                self.downstreams[node].add(output_node)
+                self.downstreams[node].update(self.downstreams[output_node])
+
+    def downstreams_of(self, node: Node) -> Set[Node]:
+        return self.downstreams[node]
+
+    def upstreams_of(self, node: Node) -> Set[Node]:
+        return self.upstreams[node]
+
+class CapabilityBasedPartitioner:
+
+    def __init__(self,
+                 graph_module: GraphModule,
+                 operator_support: OperatorSupportBase,
+                 allows_single_node_partition: bool = False,
+                 non_compute_ops: Optional[Sequence[str]] = None,
+                 allowed_single_node_partition_ops: Optional[Sequence[str]] = None,
+                 ) -> None:
+        self.graph_module = graph_module
+        self.operator_support = operator_support
+        self.allows_single_node_partition = allows_single_node_partition
+        self.non_compute_ops = non_compute_ops if non_compute_ops is not None else []
+        self.allowed_single_node_partition_ops = (
+            allowed_single_node_partition_ops
+            if allowed_single_node_partition_ops is not None
+            else []
+        )
+        self.dependency_viewer = _DependencyViewer(graph_module)
+
+    def __is_node_supported(self, node: Node) -> bool:
+        return (
+            self.operator_support.is_node_supported(dict(self.graph_module.named_modules()), node)
+        )
+
+    def propose_partitions(self) -> List[Partition]:
+        # partition_map is a mapping from partition id to a set of partition id's.
+        # The value set contains all the partition ids that can be reached by doing a
+        # DFS starting from the partition id in the key.
+        partition_map : Dict[int, Set] = collections.defaultdict(set)
+
+        # assumption: nodes in the candidate list are sorted in topological order
+        assignment: Dict[Node, int] = {}   # mapping from node to partition_id
+        partitions_by_id: Dict[int, Partition] = {}  # mapping from partition_id to partition
+        new_partition_id = itertools.count()
+
+        # Try to merge partition other_id into partition self_id.
+        # The merge only happens if the resulting graph contains no cyclic dependency.
+        # Returns `True` when the merge happens, `False` otherwise.
+        def maybe_merge_partition(self_id: int, other_id: int):
+            # merged_nodes is the union of the nodes in the two partitions to be merged
+            merged_nodes = copy(partitions_by_id[self_id].nodes)
+            merged_nodes.update(partitions_by_id[other_id].nodes)
+
+            def dfs_iter_find_cycle(all_user_nodes: List[Node]):
+                for user_node in all_user_nodes:
+                    visited_partition_ids = set()
+
+                    for path_node in self.dependency_viewer.downstreams_of(user_node):
+                        # If any of the nodes in the dfs path of this node are in the merged_nodes
+                        # list then there is a cycle in the graph.
+                        if path_node in merged_nodes:
+                            return True
+
+                        # If any of the nodes in the dfs path of this node are in the assignment
+                        # map then we have to make sure that the partitions that these nodes belong
+                        # to do not form a cycle with the current partitions being merged. This means
+                        # iterating through all the nodes in all the partitions that are traversed in
+                        # the dfs path and checking if they are in the merged_nodes list.
+                        if path_node in assignment:
+                            partition_id = assignment[path_node]
+                            # If the partition id has already been visited then we know that it doesn't
+                            # form a cycle with the current partitions being merged.
+                            if partition_id in visited_partition_ids:
+                                continue
+                            p_map = partition_map[partition_id]
+                            if self_id in p_map or other_id in p_map:
+                                return True
+
+                            visited_partition_ids.add(partition_id)
+
+                return False
+
+            # check if merge would create cyclic dependency.
+            all_user_nodes = []
+            for node in merged_nodes:
+                for user_node in node.users:
+                    if user_node not in merged_nodes:
+                        all_user_nodes.append(user_node)
+
+            if dfs_iter_find_cycle(all_user_nodes):
+                # Return False, indicating a cyclic dependency was found and the
+                # merge is aborted.
+                return False
+
+            # no cyclic dependency found, move forward with the merge
+            # updating partition nodes
+            partitions_by_id[self_id].nodes = merged_nodes
+            # updating assignment map
+            for node in partitions_by_id[other_id].nodes:
+                assignment[node] = self_id
+            # delete other partition
+            del partitions_by_id[other_id]
+
+            partition_map[self_id] = partition_map[self_id].union(partition_map[other_id])
+            del partition_map[other_id]
+
+            return True
+
+        def merge_single_node(node: Node, id: Optional[int]):
+            def _update_partition_map(node: Node, id: int):
+                # Iterate through all the downstream nodes of this node and update the partition map
+                # to indicate that there is a path from the partition id of this node to the target
+                # partition id.
+                downstream_nodes = self.dependency_viewer.downstreams_of(node)
+                for curr_node in downstream_nodes:
+                    target_id = assignment.get(curr_node, None)
+                    if target_id is not None:
+                        partition_map[id].add(target_id)
+
+                # Iterate through all the upstream nodes of this node and update the partition map
+                # to indicate that there is a path from the partition id of the upstream node to the
+                # current node's partition id.
+                upstream_nodes = self.dependency_viewer.upstreams_of(node)
+                for curr_node in upstream_nodes:
+                    source_id = assignment.get(curr_node, None)
+                    if source_id is not None:
+                        partition_map[source_id].add(id)
+
+            if node in assignment:
+                partitions_by_id[assignment[node]].remove_node(node)
+
+            if id is None:
+                assignment.pop(node)
+            elif id not in partitions_by_id:
+                assignment[node] = id
+                partitions_by_id[id] = Partition(id=id, nodes=[node])
+                _update_partition_map(node, id)
+            else:
+                assignment[node] = id
+                partitions_by_id[id].add_node(node)
+                _update_partition_map(node, id)
+
+        logger.debug("Proposing partitions...")
+
+        for node in reversed(self.graph_module.graph.nodes):
+            # Use a Dict as an ordered set to ensure a deterministic partitioning result; the values don't matter.
+            merge_candidates: Dict[int, None] = {}
+
+            # Note that a limited form of horizontal fusion is enabled:
+            #   when `node` is not supported, the code below still attempts to fuse the consumers of `node`.
+            #
+            # There is no knob to disable horizontal fusion yet; it can be short-cut
+            # by adding an `else` block here to skip horizontal fusion.
+            if self.__is_node_supported(node) and node not in assignment:
+                partition_id = next(new_partition_id)
+                merge_single_node(node, partition_id)
+                merge_candidates[partition_id] = None
+
+            # merge all possible partitions
+            for node in assignment:
+                merge_candidates[assignment[node]] = None
+
+            merge_candidates_list = list(merge_candidates.keys())
+            if len(merge_candidates_list) > 1:
+                self_id = merge_candidates_list[0]
+                for other_id in merge_candidates_list[1:]:
+                    # note: merge partition `other_id` into partition `self_id` if
+                    # it doesn't create cyclic dependency in the graph, otherwise,
+                    # this is a no-op
+                    maybe_merge_partition(self_id, other_id)
+
+        # post processing to re-assign "getitem" nodes into upstream partition
+        logger.debug("Reassigning getitem nodes to its producer node's partition...")
+        nodes_reassignment: Dict[Node, int] = {}
+        for node in self.graph_module.graph.nodes:
+            is_tuple_output = True
+            for user in node.users:
+                if user.op != "call_function" or \
+                   _get_qualified_name(user.target) != "_operator.getitem":     # type: ignore[arg-type]
+                    is_tuple_output = False
+                    break
+
+            # node has tuple outputs; re-assign all following getitem nodes into node's partition
+            if is_tuple_output:
+                id = assignment.get(node, None)     # type: ignore[arg-type]
+                for user in node.users:
+                    if assignment.get(user, None) != id:    # type: ignore[arg-type]
+                        nodes_reassignment[user] = id  # type: ignore[assignment]
+        for node, id in nodes_reassignment.items():
+            merge_single_node(node, id)
+
+        # filter out single node partitions
+        if not self.allows_single_node_partition:
+            logger.debug("Filtering out single node partitions...")
+            default_non_compute_ops = {"torch.ops.aten.view", "_operator.getitem"}
+            non_compute_ops = default_non_compute_ops.union(set(self.non_compute_ops))
+            partitions_to_remove: List[int] = []
+            for id, partition in partitions_by_id.items():
+                compute_node_count = 0
+                for node in partition.nodes:
+                    if node.op == "call_function":
+                        assert callable(node.target)
+                        if _get_qualified_name(node.target) not in non_compute_ops:
+                            compute_node_count += 1
+                        if _get_qualified_name(node.target) in self.allowed_single_node_partition_ops:
+                            compute_node_count += 1
+                if compute_node_count <= 1:
+                    partitions_to_remove.append(id)
+            for id in partitions_to_remove:
+                del partitions_by_id[id]
+
+        logger.debug("Partitions proposed:")
+        for id, partition in partitions_by_id.items():
+            logger.debug("partition #%s: %s", id, [node.name for node in partition.nodes])
+
+        return list(partitions_by_id.values())
+
+    def fuse_partitions(self, partitions: List[Partition]) -> GraphModule:
+        logger.debug("Fusing partitions...")
+        # fuse_by_partitions expects partitions in List[List[Node]]: [ [node0, node1], [node2, node3] ]
+        return fuse_by_partitions(self.graph_module, [list(partition.nodes) for partition in partitions])
+
+    # Remove non-compute ops that sit at the boundary of a partition.
+    def remove_bookend_non_compute_ops(self, partitions: List[Partition]):
+        non_compute_ops = set(self.non_compute_ops)
+
+        def is_non_compute_node(node: Node):
+            return node.op == "call_function" and \
+                _get_qualified_name(node.target) in non_compute_ops  # type: ignore[arg-type]
+
+        # cache transparent nodes
+        transparent_input_nodes: Dict[Node, bool] = {}
+        transparent_output_nodes: Dict[Node, bool] = {}
+
+        def is_transparent_input_node(node: Node, partition: Set[Node], removed_nodes: Set[Node]):
+            if node.op == "placeholder" or (node not in partition) or (node in removed_nodes):
+                return True
+            if node in transparent_input_nodes:
+                return transparent_input_nodes[node]
+            if is_non_compute_node(node):
+                for input_n in node.all_input_nodes:
+                    if not is_transparent_input_node(input_n, partition, removed_nodes):
+                        transparent_input_nodes[node] = False
+                        return False
+                transparent_input_nodes[node] = True
+                return True
+            transparent_input_nodes[node] = False
+            return False
+
+        def is_transparent_output_node(node: Node, partition: Set[Node], removed_nodes: Set[Node]):
+            if node.op == "placeholder" or (node not in partition) or (node in removed_nodes):
+                return True
+            if node in transparent_output_nodes:
+                return transparent_output_nodes[node]
+            if is_non_compute_node(node):
+                for output_n in node.users:
+                    if not is_transparent_output_node(output_n, partition, removed_nodes):
+                        transparent_output_nodes[node] = False
+                        return False
+                transparent_output_nodes[node] = True
+                return True
+            transparent_output_nodes[node] = False
+            return False
+
+        for partition in partitions:
+            # Note it's ok to use `set` here, since we only query whether a node
+            # has been removed; we never iterate over the nodes inside the set.
+            remove_node: Set[Node] = set()
+            for node in partition.nodes:
+                if is_non_compute_node(node) and \
+                    (is_transparent_input_node(node, partition.nodes, remove_node) or
+                     is_transparent_output_node(node, partition.nodes, remove_node)):
+                    remove_node.add(node)
+
+            if len(remove_node) != 0:
+                partition.nodes = partition.nodes - remove_node
+
+    def partition_and_fuse(self) -> GraphModule:
+        partitions = self.propose_partitions()
+        fused_gm = self.fuse_partitions(partitions)
+        return fused_gm
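+
+
+# Editor's illustrative sketch (hedged, not part of the upstream API): a
+# minimal end-to-end use of CapabilityBasedPartitioner. `_AddMulSupport` and
+# `_Net` are hypothetical; only add/mul call_function nodes are treated as
+# supported, so they are fused into one submodule while relu stays outside.
+def _example_partition_and_fuse() -> GraphModule:
+    import operator
+    import torch
+    import torch.fx
+    from torch.fx.passes.operator_support import OperatorSupportBase
+
+    class _AddMulSupport(OperatorSupportBase):
+        def is_node_supported(self, submodules, node: Node) -> bool:
+            return node.op == "call_function" and node.target in (
+                torch.add, torch.mul, operator.add, operator.mul,
+            )
+
+    class _Net(torch.nn.Module):
+        def forward(self, x, y):
+            return torch.relu(torch.add(x, y) * y)
+
+    gm = torch.fx.symbolic_trace(_Net())
+    partitioner = CapabilityBasedPartitioner(gm, _AddMulSupport())
+    partitions = partitioner.propose_partitions()
+    return partitioner.fuse_partitions(partitions)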
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/infra/pass_base.py b/MLPY/Lib/site-packages/torch/fx/passes/infra/pass_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb027a90e0a4541006a255d245c343daac645abc
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/passes/infra/pass_base.py
@@ -0,0 +1,75 @@
+import abc
+from collections import namedtuple
+from typing import Optional
+
+from torch.fx.graph_module import GraphModule
+from torch.fx._compatibility import compatibility
+
+
+__all__ = ['PassResult', 'PassBase']
+
+@compatibility(is_backward_compatible=False)
+class PassResult(namedtuple("PassResult", ["graph_module", "modified"])):
+    """
+    Result of a pass:
+        graph_module: The modified graph module
+        modified: A flag for if the pass has modified the graph module
+    """
+    def __new__(cls, graph_module, modified):
+        return super().__new__(cls, graph_module, modified)
+
+@compatibility(is_backward_compatible=False)
+class PassBase(abc.ABC):
+    """
+    Base interface for implementing passes.
+
+    It is required to implement the `call` function so that instances of a
+    Pass can be passed directly to the PassManager and invoked as functions.
+
+    An instance of a class implementing this interface can be added directly
+    to the PassManager's `passes` attribute.
+    """
+
+    def __call__(self, graph_module: GraphModule) -> Optional[PassResult]:
+        """
+        Runs the precondition check, the pass itself, and the postcondition check.
+        """
+
+        self.requires(graph_module)
+        res = self.call(graph_module)
+        self.ensures(graph_module)
+        return res
+
+    @abc.abstractmethod
+    def call(self, graph_module: GraphModule) -> Optional[PassResult]:
+        """
+        The pass that is run through the given graph module. To implement a
+        pass, it is required to implement this function.
+
+        Args:
+            graph_module: The graph module we will run a pass on
+        """
+        pass
+
+    def requires(self, graph_module: GraphModule) -> None:  # noqa: B027
+        """
+        This function will be called before the pass is run and will check that
+        the given graph module contains the preconditions needed to run the
+        pass. It is not required to implement this function.
+
+        Args:
+            graph_module: The graph module we will run checks on
+        """
+        pass
+
+    def ensures(self, graph_module: GraphModule) -> None:  # noqa: B027
+        """
+        This function will be called after the pass is run and will check that
+        the given graph module contains the postconditions needed to run the
+        pass. It is not required to implement this function.
+
+        Args:
+            graph_module: The graph module we will run checks on
+        """
+        pass
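+
+
+# Editor's illustrative sketch (hedged, not part of the upstream API): a
+# minimal PassBase subclass. It runs FX's built-in dead-code elimination and
+# reports whether anything changed; the precondition simply lints the graph.
+class _ExampleDeadCodeEliminationPass(PassBase):
+    def call(self, graph_module: GraphModule) -> Optional[PassResult]:
+        # eliminate_dead_code() returns True when at least one node was removed.
+        changed = graph_module.graph.eliminate_dead_code()
+        if changed:
+            graph_module.recompile()
+        return PassResult(graph_module, changed)
+
+    def requires(self, graph_module: GraphModule) -> None:
+        # Precondition sketch: the incoming graph should pass FX's linter.
+        graph_module.graph.lint()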
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/infra/pass_manager.py b/MLPY/Lib/site-packages/torch/fx/passes/infra/pass_manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3d4580af2f4ba06b44b6c9d6297eb130fab4147
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/passes/infra/pass_manager.py
@@ -0,0 +1,303 @@
+import inspect
+import logging
+from queue import Queue
+from functools import wraps
+from typing import Callable, Dict, List
+
+import torch.nn as nn
+from torch.fx.graph_module import GraphModule
+from torch.fx._compatibility import compatibility
+from torch.fx.passes.infra.pass_base import PassResult
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.WARNING)
+
+__all__ = ['pass_result_wrapper', 'this_before_that_pass_constraint', 'PassManager']
+
+@compatibility(is_backward_compatible=False)
+def pass_result_wrapper(fn: Callable) -> Callable:
+    """
+    Wrapper for passes which currently do not return a PassResult.
+    This wrapper makes them return a PassResult containing the modified object
+    and True for the "modified" flag.
+
+    Args:
+        fn (Callable[Module, Any])
+
+    Returns:
+        wrapped_fn (Callable[Module, PassResult])
+    """
+    if fn is None:
+        return None
+
+    @wraps(fn)
+    def wrapped_fn(gm):
+        res = fn(gm)
+        if res is None:
+            return PassResult(gm, True)
+        if isinstance(res, PassResult):
+            return res
+        elif isinstance(res, nn.Module):
+            return PassResult(res, True)
+
+    if not inspect.isfunction(fn):
+        wrapped_fn.__name__ = type(fn).__name__
+
+    return wrapped_fn
+
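+# Editor's illustrative sketch (hedged): wrapping a legacy pass that mutates
+# the module in place and returns nothing. `_legacy_dce_pass` is hypothetical.
+def _example_wrap_legacy_pass() -> Callable:
+    def _legacy_dce_pass(gm: GraphModule) -> None:
+        gm.graph.eliminate_dead_code()
+        gm.recompile()
+
+    # The wrapper turns the None return value into PassResult(gm, True).
+    return pass_result_wrapper(_legacy_dce_pass)
+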
+def _validate_pass_schedule_constraint(
+    constraint: Callable[[Callable, Callable], bool], passes: List[Callable]
+) -> None:
+    for i, a in enumerate(passes):
+        for j, b in enumerate(passes[i + 1 :]):
+            if constraint(a, b):
+                continue
+            raise RuntimeError(
+                f"pass schedule constraint violated. Expected {a} before {b}"
+                f" but found {a} at index {i} and {b} at index {i + j + 1} in pass"
+                f" list."
+            )
+
+def _topological_sort_passes(
+    passes: List[Callable], constraints: List[Callable]
+) -> List[Callable]:
+    """
+    Args
+        passes: Passes that we are ordering
+        constraints: Constraints applied on these passes
+
+    Returns
+        A topologically sorted list of callables. Raises a RuntimeError if the
+        constraints imply a circular dependency.
+    """
+    if len(constraints) == 0:
+        return passes
+
+    # Construct a graph mapping nodes to a list of their users
+    graph: Dict[Callable, List[Callable]] = {p : [] for p in passes}
+    indegree_map: Dict[Callable, int] = dict.fromkeys(passes, 0)
+    candidates: Queue = Queue()
+    for a in passes:
+        for b in passes:
+            if a == b:
+                continue
+
+            for constraint in constraints:
+                if not constraint(a, b):
+                    graph[b].append(a)
+                    indegree_map[a] += 1
+
+        if indegree_map[a] == 0:
+            candidates.put(a)
+
+    visited: Dict[Callable, bool] = dict.fromkeys(passes, False)
+    sorted_passes: List[Callable] = []
+
+    while not candidates.empty():
+        p = candidates.get()
+        sorted_passes.append(p)
+        visited[p] = True
+
+        for n in graph[p]:
+            if not visited[n]:
+                indegree_map[n] -= 1
+                if indegree_map[n] == 0:
+                    candidates.put(n)
+
+    # Check if there are unvisited nodes (aka cycles in the graph)
+    cycle_passes = list(filter(lambda p: indegree_map[p] != 0, indegree_map.keys()))
+    if len(cycle_passes) != 0:
+        error = f"Circular dependency detected within the following passes: {cycle_passes}"
+        raise RuntimeError(error)
+
+    return sorted_passes
+
+@compatibility(is_backward_compatible=False)
+def this_before_that_pass_constraint(this: Callable, that: Callable) -> Callable:
+    """
+    Defines a partial order ('depends on' function) where `this` must occur
+    before `that`.
+
+    For example, the following pass list and constraint list would be invalid.
+    ```
+    passes = [pass_b, pass_a]
+
+    constraints = [
+        this_before_that_pass_constraint(pass_a, pass_b)
+    ]
+    ```
+
+    Args:
+        this (Callable): pass which should occur first
+        that (Callable): pass which should occur later
+
+    Returns:
+        depends_on (Callable[[Callable, Callable], bool])
+    """
+
+    def depends_on(a: Callable, b: Callable):
+        if a == that and b == this:
+            return False
+        return True
+
+    return depends_on
+
+
+@compatibility(is_backward_compatible=False)
+class PassManager:
+    """
+    Construct a PassManager.
+
+    Collects passes and constraints. This defines the pass schedule, manages
+    pass constraints and pass execution.
+
+    Args:
+        passes (Optional[List[Callable]]): List of passes. A pass is a
+            callable which modifies an object and returns a PassResult
+        constraints (Optional[List[Callable]]): List of constraints. A
+            constraint is a callable which takes two passes (A, B) and returns
+            True if A depends on B and False otherwise. See the implementation of
+            `this_before_that_pass_constraint` for an example.
+        steps (int): Max number of times we run the passes (default = 1).
+        run_checks_after_each_pass (bool): Whether to run checks and linting
+            after each pass
+        suppress_check_failures (bool): Whether to suppress, rather than raise,
+            errors when checks fail
+    """
+
+    passes: List[Callable[[nn.Module], PassResult]]
+    constraints: List[Callable[[Callable, Callable], bool]]
+    _validated: bool = False
+    steps: int = 1
+
+    def __init__(
+        self,
+        passes=None,
+        constraints=None,
+        steps=None,
+        run_checks_after_each_pass: bool = False,
+        suppress_check_failures: bool = False,
+    ):
+        self.passes = passes or []
+        self.constraints = constraints or []
+        if steps:
+            self.steps = steps
+
+        self.run_checks_after_each_pass = run_checks_after_each_pass
+        self.suppress_check_failures = suppress_check_failures
+
+    def add_pass(self, _pass: Callable):
+        """
+        Adds a pass into the current list of passes.
+        """
+        self.passes.append(_pass)
+        self._validated = False
+
+    def add_constraint(self, constraint: Callable):
+        """
+        Adds a constraint into the current list of constraints.
+        """
+        self.constraints.append(constraint)
+        self._validated = False
+
+    def validate_constraints(self):
+        """
+        Validates the current pass schedule defined by `self.passes` against
+        all constraints in `self.constraints`.
+        """
+        if self._validated:
+            return
+        for constraint in self.constraints:
+            _validate_pass_schedule_constraint(constraint, self.passes)
+        self._validated = True
+
+    def solve_constraints(self):
+        """
+        Finds a valid traversal order based on the given constraints and orders
+        the passes based on this order.
+
+        If a circular dependency exists between the constraints, an error is
+        raised, since no valid ordering of the passes exists.
+        """
+        self.passes = _topological_sort_passes(self.passes, self.constraints)
+        self._validated = True
+
+    def add_checks(self, check: Callable) -> None:
+        """
+        Adds a function which runs various checks on a given graph module.
+        This function is run before and after each pass if the
+        `run_checks_after_each_pass` flag is enabled.
+        """
+        sig = inspect.signature(check)
+
+        if len(list(sig.parameters.values())) != 1:
+            raise TypeError("PassManager check function should only take in one variable, a module")
+
+        setattr(self, "check", check)  # noqa: B010
+
+    def check(self, module: nn.Module) -> None:
+        pass
+
+    def __call__(self, module: nn.Module) -> PassResult:
+        """
+        Runs a list of passes in the order based on `self.passes` on the given
+        graph module. Each time a pass is run, checks and linting will be run on
+        the graph module if `run_checks_after_each_pass` is set.
+
+        If the module is a graph module, we will run the list of passes until
+        the graph stops changing, or until `steps` number of times.
+        """
+        # Order the passes based on the constraints
+        if not self._validated:
+            self.solve_constraints()
+
+        # Check graph invariants
+        self.check(module)
+
+        # Run the set of passes `steps` number of times or until the graph stops
+        # changing
+        overall_modified = False
+        for _ in range(self.steps):
+            modified = False
+
+            # Run the set of passes on the graph module
+            for i, fn in enumerate(self.passes):
+                fn_name = fn.__name__ if inspect.isfunction(fn) else type(fn).__name__
+                logger.debug("Running pass '%s'", fn_name)
+
+                try:
+                    res = fn(module)
+
+                    if not isinstance(res, PassResult) and not hasattr(
+                        res, "graph_module"
+                    ):
+                        raise TypeError(
+                            f"The result of the pass {fn_name} should be type PassResult. "
+                            + "Please wrap it with pass_result_wrapper()."
+                        )
+                    module = res.graph_module
+                    modified = modified or res.modified
+
+                    if isinstance(module, GraphModule):
+                        logger.debug("Graph after pass '%s': %s", fn_name, module.graph)
+                        module.recompile()
+
+                    # Check graph invariants
+                    if self.run_checks_after_each_pass:
+                        self.check(module)
+
+                except Exception as e:
+                    prev_pass_names = [
+                        p.__name__ if inspect.isfunction(p) else type(p).__name__
+                        for p in self.passes[:i]
+                    ]
+                    msg = f"An error occurred when running the '{fn_name}' pass after the following passes: {prev_pass_names}"
+                    raise Exception(msg) from e
+
+            # If the graph no longer changes, then we can stop running these passes
+            overall_modified = overall_modified or modified
+            if not modified:
+                break
+
+        return PassResult(module, overall_modified)
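+
+
+# Editor's illustrative sketch (hedged, not part of the upstream API): builds
+# a PassManager over two toy passes with an ordering constraint and runs it on
+# a graph module. `_noop_pass` and `_dce_pass` are hypothetical.
+def _example_run_pass_manager(graph_module: GraphModule) -> PassResult:
+    def _noop_pass(module):
+        # A pass that changes nothing but still reports a PassResult.
+        return PassResult(module, False)
+
+    def _dce_pass(module):
+        changed = module.graph.eliminate_dead_code()
+        module.recompile()
+        return PassResult(module, changed)
+
+    pm = PassManager(
+        passes=[_dce_pass, _noop_pass],
+        # Require that the no-op pass runs before dead-code elimination; the
+        # manager topologically sorts the passes to satisfy this.
+        constraints=[this_before_that_pass_constraint(_noop_pass, _dce_pass)],
+        steps=2,
+    )
+    return pm(graph_module)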
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/net_min_base.py b/MLPY/Lib/site-packages/torch/fx/passes/net_min_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2d6c1fb0d88a072480a3b2f74032d3bd4869999
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/passes/net_min_base.py
@@ -0,0 +1,731 @@
+import logging
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+import torch
+import torch.fx
+
+from torch.fx._compatibility import compatibility
+from torch.fx.node import map_arg
+
+from .shape_prop import ShapeProp
+from .split_utils import split_by_tags
+from .tools_common import (
+    CALLABLE_NODE_OPS,
+    FxNetAccFusionsFinder,
+    Names,
+    NodeList,
+    NodeSet,
+    TensorOrTensors,
+    Tensors,
+)
+
+__all__ = [
+    "FxNetMinimizerBadModuleError",
+    "FxNetMinimizerRunFuncError",
+    "FxNetMinimizerResultMismatchError",
+]
+
+_LOGGER = logging.getLogger(__name__)
+
+
+@compatibility(is_backward_compatible=False)
+class FxNetMinimizerBadModuleError(Exception):
+    """
+    Raised if we fail to split out a minimize module.
+    """
+
+    pass
+
+
+@compatibility(is_backward_compatible=False)
+class FxNetMinimizerRunFuncError(Exception):
+    """
+    Raised if an error occurs in the run_a or run_b functions.
+    """
+
+    pass
+
+
+@compatibility(is_backward_compatible=False)
+class FxNetMinimizerResultMismatchError(Exception):
+    """
+    Raised if the compare function reports that the results mismatch.
+    """
+
+    pass
+
+
+@dataclass
+class _MinimizerSettingBase:
+    """
+    Args:
+    `accumulate_error`: Instead of using run_a's inputs for both submodules being
+    compared, use each submodule's previous outputs as inputs, so that errors
+    accumulate across submodules.
+
+    `traverse_method`: "sequential", "binary" or "accumulate".
+    Determines how the nodes in the FX module are traversed.
+
+    `find_all`: Minimizer will go through the entire model and return all problematic nodes.
+
+    `return_intermediate`: If True, when the model is run via the `run_nodes()`
+    function, the intermediate results of all the ops are returned as output.
+    """
+
+    accumulate_error: bool = False
+    traverse_method: str = "sequential"
+    find_all: bool = False
+    return_intermediate: bool = False
+
+    def __str__(self):
+        settings_str = "FX Minimizer Settings:\n"
+
+        for k, v in vars(self).items():
+            settings_str += f"\t{k}: {v}\n"
+
+        return settings_str
+
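+
+# Editor's illustrative sketch (hedged): configuring minimizer settings for a
+# concrete minimizer (e.g. a subclass of the _MinimizerBase class defined
+# below). The particular values are arbitrary and purely for illustration.
+def _example_minimizer_settings() -> _MinimizerSettingBase:
+    return _MinimizerSettingBase(
+        traverse_method="binary",  # binary-search traversal instead of sequential
+        find_all=True,             # collect every culprit node, not just the first
+    )
+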
+
+class _MinimizerBase:
+    """
+    This class is used to automatically find problematic nodes in a model. It takes an FX
+    GraphModule and generates submodules while traversing the graph. Two functions,
+    `run_a` and `run_b`, are then used to run the same submodule, and a function `compare_fn`
+    is used to compare the results.
+
+    Currently we provide two ways to traverse the graph and generate submodules:
+        1. Sequential traversal: traverse the graph node by node and generate
+           one submodule per single node.
+        2. Binary search: do a binary-search-style traversal on the graph.
+
+    For internal users, a guide can be found here: https://fb.quip.com/HDtuAgiKGfkP.
+    """
+
+    def __init__(
+        self,
+        module: torch.fx.GraphModule,
+        sample_input: Tensors,
+        compare_fn: Callable[
+            [TensorOrTensors, TensorOrTensors, Names], Tuple[float, bool]
+        ],
+        settings: _MinimizerSettingBase,
+        module_exporter: Optional[
+            Callable[
+                [List[torch.Tensor], torch.fx.GraphModule, str],
+                None
+            ]
+        ] = None,
+    ):
+        assert isinstance(module, torch.fx.GraphModule)
+
+        self.module = module
+        self.sample_input = sample_input
+        self.compare_fn = compare_fn
+        self.module_exporter = module_exporter
+        self.settings = settings
+
+        # Stores outputs of run_a function
+        self.a_outputs: Dict[str, Any] = {}
+
+        # Stores outputs of run_b function
+        self.b_outputs: Dict[str, Any] = {}
+
+        # Stores the results of compare_fn
+        self.results: Dict[Any, Any] = {}
+
+        # Stores the report for the runs
+        self.reports: List[List[str]] = []
+
+        # Current iteration
+        self.iteration: int = 0
+
+        callable_nodes = {
+            node for node in self.module.graph.nodes if node.op in CALLABLE_NODE_OPS
+        }
+        ShapeProp(self.module).propagate(*self.sample_input)
+        self.fusions = FxNetAccFusionsFinder(self.module, callable_nodes)()
+
+        # Check that the number of inputs in sample_input matches the number of placeholders
+        placeholders = [
+            node.name for node in self.module.graph.nodes if node.op == "placeholder"
+        ]
+        assert len(placeholders) == len(self.sample_input)
+
+        # Store sample_input
+        for i, name in enumerate(placeholders):
+            self.a_outputs[name] = sample_input[i]
+            self.b_outputs[name] = sample_input[i]
+
+    def run_a(self, mod: torch.fx.GraphModule, inputs: Tensors) -> TensorOrTensors:
+        """
+        Run `mod` with `inputs` and generate output. The output will be compared with
+        output of run_b().
+        """
+        raise RuntimeError("run_a() is not implemented.")
+
+    def run_b(self, mod: torch.fx.GraphModule, inputs: Tensors) -> TensorOrTensors:
+        """
+        Run `mod` with `inputs` and generate output. The output will be compared with
+        output of run_a().
+        """
+        raise RuntimeError("run_b() is not implemented.")
+
+    def _store_outputs(
+        self,
+        a_result: TensorOrTensors,
+        b_result: TensorOrTensors,
+        submodule: torch.fx.GraphModule,
+    ):
+        """
+        Store the outputs of self.run_a() and self.run_b() into self.a_outputs and
+        self.b_outputs, so that we can use them when executing subsequent nodes that
+        take those outputs as inputs.
+
+        Args:
+            a_result: Output of self.run_a(). Could be a tensor or tensors.
+            b_result: Output of self.run_b(). Could be a tensor or tensors.
+            submodule: The module that generates a_result and b_result.
+        """
+        output_node = next(
+            node for node in submodule.graph.nodes if node.op == "output"
+        )
+
+        # Only one output
+        if isinstance(output_node.args[0], torch.fx.Node):
+            self.a_outputs[output_node.args[0].name] = a_result
+            self.b_outputs[output_node.args[0].name] = b_result
+        # Multiple outputs
+        else:
+            for i, arg in enumerate(output_node.args[0]):
+                self.a_outputs[arg.name] = a_result[i]
+                self.b_outputs[arg.name] = b_result[i]
+
+    def _get_submod_inputs(
+        self, main_module: torch.fx.GraphModule, submod_path: str
+    ) -> Tuple[Tensors, Tensors]:
+        """
+        Try to get submodule inputs from stored outputs. If they are not found,
+        capture the inputs with a forward pre-hook on the submodule.
+
+        If accumulate_error is False, use a_input for run_a() and run_b()
+        otherwise use a_input for run_a and b_input for run_b.
+
+        Args:
+            main_module: Top-level fx module.
+            submod_path: Path to the submodule we want to run and compare results.
+
+        Returns:
+            a_input: List of tensor(s) that will be used by run_a() as submodule inputs.
+            b_input: List of tensor(s) that will be used by run_b() as submodule inputs.
+        """
+        a_input = []
+        b_input = []
+        submodule = getattr(main_module, submod_path)
+        placeholders = [
+            node.name for node in submodule.graph.nodes if node.op == "placeholder"
+        ]
+
+        # If all placeholders can be found in the stored outputs, use the stored
+        # outputs as inputs. Otherwise, capture the inputs with a forward
+        # pre-hook on the submodule.
+        if set(placeholders) <= self.a_outputs.keys():
+            for name in placeholders:
+                a_input.append(self.a_outputs[name])
+                b_input.append(self.b_outputs[name])
+        else:
+            if self.settings.accumulate_error:
+                print(f"Can't find previous stored outputs named {placeholders}!")
+
+            def get_inputs(self: torch.nn.Module, inputs: Any):
+                nonlocal a_input
+                a_input = inputs
+
+            # Use a forward pre-hook to capture the inputs to the submodule
+            handle = submodule.register_forward_pre_hook(get_inputs)
+            main_module(*self.sample_input)
+            handle.remove()
+
+            b_input = a_input
+
+        if not self.settings.accumulate_error:
+            return a_input, a_input
+
+        return a_input, b_input
+
+    def _tag_nodes(self, selected_nodes: NodeSet):
+        """
+        Tag selected nodes with the tag "minimize". Nodes with the same tag will
+        be split into the same submodule afterwards.
+
+        Args:
+            selected_nodes: Nodes that we want to minimize. We will tag those nodes
+                with "minimize", all preceding nodes with "main_0" and all following
+                nodes with "main_1".
+        """
+        for node in self.module.graph.nodes:
+            if node.op not in CALLABLE_NODE_OPS:
+                continue
+
+            if node in selected_nodes:
+                node.tag = "minimize"
+            elif any(
+                n.tag in {"minimize", "main_1"}
+                for n in node.all_input_nodes
+                if n.op in CALLABLE_NODE_OPS
+            ):
+                node.tag = "main_1"
+            else:
+                node.tag = "main_0"
+
+    def _build_submodule(self, nodes: NodeSet) -> Tuple[torch.fx.GraphModule, str]:
+        """
+        Split self.module so that one submodule consists of `nodes` and only `nodes`.
+
+        Args:
+            nodes: Nodes that we want to include in the minimize submodule.
+
+        Returns:
+            split_module (torch.fx.GraphModule): the module after split.
+            submodule_name (str): the name of the submodule that consists of `nodes`.
+        """
+        # Color provided nodes
+        self._tag_nodes(nodes)
+
+        # Split module based on coloring
+        split_module = split_by_tags(self.module, ["main_0", "minimize", "main_1"])
+
+        # Find submodule containing colored nodes
+        submodule_name: str = ""
+        for child_name, _ in split_module.named_children():
+            # Skip submodules we're not interested in at the moment
+            if "minimize" not in child_name:
+                continue
+
+            if submodule_name == "":
+                submodule_name = child_name
+            else:
+                raise FxNetMinimizerBadModuleError(
+                    f"Expected only one minimize submodule with nodes {nodes}"
+                )
+
+        if submodule_name == "":
+            raise FxNetMinimizerBadModuleError(
+                f"Minimize submodule was not found with nodes {nodes}"
+            )
+
+        return split_module, submodule_name
+
+    def _run_and_compare(
+        self, split_module: torch.fx.GraphModule, submod_name: str, output_names: Names
+    ):
+        """
+        Run the submodule in `split_module` that has name `submod_name`
+        using `self.run_a` and `self.run_b` and compare their results.
+
+        Args:
+            split_module: Main module that contains the minimize submodule.
+            submod_name: Name of the minimize submodule.
+            output_names: Names of the nodes we want to output. If None, we
+                will use the original output.
+        """
+        submodule = getattr(split_module, submod_name)
+        a_input, b_input = self._get_submod_inputs(split_module, submod_name)
+
+        if len(self.reports) == 0:
+            self.reports.append([])
+            self.iteration = 1
+
+        report = self.reports[self.iteration - 1]
+        report.append("Run and compare ...")
+
+        if output_names:
+            output_nodes: NodeList = []
+            for node in submodule.graph.nodes:
+                if node.op == "output":
+                    submodule.graph.erase_node(node)
+
+                if node.name in output_names:
+                    output_nodes.append(node)
+
+            submodule.graph.output(
+                output_nodes[0] if len(output_nodes) == 1 else tuple(output_nodes)
+            )
+            submodule.graph.lint()
+            submodule.recompile()
+
+        # Use name of args in output node as key to store comparison result
+        for node in submodule.graph.nodes:
+            if node.op == "output":
+                result_key = map_arg(node.args, lambda x: x.name)
+
+        try:
+            a_result = self.run_a(submodule, a_input)
+            b_result = self.run_b(submodule, b_input)
+            self._store_outputs(a_result, b_result, submodule)
+        except Exception as e:
+            report.append(f"Exception raised when running {submod_name}: {e}")
+            raise FxNetMinimizerRunFuncError(  # noqa: TRY200
+                f"Exception raised when running {submod_name}: {e}"
+            )
+
+        # Compare results
+        names: Names = output_names
+        if output_names is None:
+            names = [str(v) for v in result_key]  # type: ignore[possibly-undefined]
+
+        numeric_result, bool_result = self.compare_fn(a_result, b_result, names)
+
+        self.results[result_key] = numeric_result  # type: ignore[possibly-undefined]
+        report.append(f"Numerical accuracy = {numeric_result}")
+        if not bool_result:
+            report.append(f"Result mismatch for {result_key}")
+            if self.module_exporter:
+                self.module_exporter(
+                    List[torch.Tensor](a_input), submodule, str(result_key[0]) + "_cpu",
+                )
+                self.module_exporter(
+                    List[torch.Tensor](b_input), submodule, str(result_key[0]) + "_acc",
+                )
+            raise FxNetMinimizerResultMismatchError(f"Result mismatch for {result_key}")
+
+    def _binary_search_impl(
+        self, all_nodes: NodeList, start_idx: int, end_idx: int
+    ) -> NodeSet:
+        """
+        Recursive binary search implementation.
+        """
+        nodes: NodeList = all_nodes[start_idx:end_idx]
+
+        report: List[str] = []
+        self.reports.append(report)
+        self.iteration += 1
+        report.append(f"Binary search iteration {self.iteration}.")
+        report.append(
+            f"From node index {start_idx} to {end_idx-1}. "
+            f"Size of the interested node list is {len(nodes)}"
+        )
+
+        cur_nodes: NodeSet = set(nodes)
+
+        for node in nodes:
+            if node in self.fusions:
+                cur_nodes.update(self.fusions[node])
+
+        try:
+            split_module, submod_name = self._build_submodule(cur_nodes)
+            self._run_and_compare(split_module, submod_name, [])
+        except (FxNetMinimizerRunFuncError, FxNetMinimizerResultMismatchError):
+
+            if len(nodes) == 1:
+                report.append(
+                    f"This is the last node in the sub-module. "
+                    f"Search in the current branch is successful with culprit = {cur_nodes}."
+                )
+                self.print_report(report)
+                return cur_nodes
+
+            report.append(
+                "Proceed to split and lower the halves of the current "
+                "sub-module individually."
+            )
+            self.print_report(report)
+
+            mid = len(nodes) // 2
+            culprits = self._binary_search_impl(all_nodes, start_idx, start_idx + mid)
+
+            if len(culprits) != 0 and not self.settings.find_all:
+                return culprits
+
+            culprits = self._binary_search_impl(all_nodes, start_idx + mid, end_idx)
+
+            if len(culprits) == 0:
+                report.append(
+                    f"Further split and lowering found no errors. "
+                    f"Unable to minimize the submodule with list of nodes: {nodes}"
+                )
+                self.print_report(report)
+
+            return culprits
+        else:
+            report.append("No discrepancy found.")
+            self.print_report(report)
+            return set()
+
+    def _binary_traverse(self, nodes: NodeList) -> NodeSet:
+        """
+        Binary search on `nodes` for culprit.
+        """
+        return self._binary_search_impl(nodes, 0, len(nodes))
+
+    def _sequential_traverse(self, nodes: NodeList) -> NodeSet:
+        """
+        Traverse `nodes` one by one and determine if any of them is a culprit.
+        """
+        culprits: NodeSet = set()
+
+        for node in nodes:
+            report: List[str] = []
+            self.reports.append(report)
+            self.iteration += 1
+            report.append(f"Sequential traverse iteration {self.iteration}.")
+            report.append(f"Visit node: {node.name}")
+
+            _LOGGER.info("Visit node: %s", node.name)
+            cur_nodes: NodeSet = {node}
+
+            if node in self.fusions:
+                cur_nodes = self.fusions[node]
+
+            try:
+                split_module, submod_name = self._build_submodule(cur_nodes)
+                self._run_and_compare(split_module, submod_name, [node.name])
+                self.print_report(report)
+            except (FxNetMinimizerResultMismatchError):
+                culprits.add(node)
+                report.append(f"Found culprit from numeric error: {node}")
+                self.print_report(report)
+                if not self.settings.find_all:
+                    return culprits
+            except (FxNetMinimizerRunFuncError):
+                culprits.update(cur_nodes)
+                report.append(f"Found culprit from run error: {node}")
+                self.print_report(report)
+                if not self.settings.find_all:
+                    return culprits
+
+        return culprits
+
+    def _defined_traverse(self, nodes: NodeList) -> NodeSet:
+        """
+        Run the user-defined `nodes` and determine whether they contain a culprit.
+        """
+        culprits: NodeSet = set()
+
+        first_node_name = nodes[0].name
+        output_node_name = nodes[-1].name
+        report = [f"Defined graph from {first_node_name} to {output_node_name}"]
+        cur_nodes: NodeSet = set(nodes)
+        try:
+            split_module, submod_name = self._build_submodule(cur_nodes)
+            self._run_and_compare(split_module, submod_name, [output_node_name])
+            self.print_report(report)
+        except (FxNetMinimizerResultMismatchError, FxNetMinimizerRunFuncError):
+            report.append(f"Found culprit {cur_nodes}")
+            self.print_report(report)
+            return culprits
+
+        return culprits
+
+    def _accumulate_traverse(self, nodes: NodeList) -> NodeSet:
+        culprits: NodeSet = set()
+        nodes_to_run: NodeSet = set()
+
+        # find_all is not supported for accumulate traversal because all the
+        # ops run on NNPI. So we return after the first op that raises error.
+        if self.settings.find_all:
+            print("'Find All' mode is not supported in accumulate traversal.")
+            return culprits
+
+        for node in nodes:
+            report: List[str] = []
+            self.reports.append(report)
+            self.iteration += 1
+            report.append(f"Accumulate traverse iteration {self.iteration}.")
+
+            nodes_to_run.add(node)
+
+            node_name = node.name
+            if node_name is not None and isinstance(node_name, tuple):
+                node_name = node_name[0]
+            assert node_name is not None and isinstance(
+                node_name, str
+            ), f"minimize: node_name: {node_name}"
+
+            report.append(f"Add node: {node_name}")
+
+            try:
+                split_module, submod_name = self._build_submodule(nodes_to_run)
+                self._run_and_compare(split_module, submod_name, [node_name])
+                self.print_report(report)
+            except (FxNetMinimizerResultMismatchError, FxNetMinimizerRunFuncError):
+                culprits.add(node)
+                report.append(f"Found culprit {node}")
+                self.print_report(report)
+                return culprits
+
+        return culprits
+
+    def _skip_traverse_impl(self, all_nodes: NodeList, start_idx: int, end_idx: int) -> NodeSet:
+        """
+        Skip certain nodes in graph based on settings
+        """
+        culprits: NodeSet = set()
+        nodes: NodeList = all_nodes[start_idx:end_idx]
+
+        report: List[str] = []
+        self.reports.append(report)
+        self.iteration += 1
+        report.append(f" Nodes block {self.iteration}.")
+        report.append(
+            f"From node index {start_idx} to {end_idx-1}. "
+            f"Size of the interested node list is {len(nodes)}"
+        )
+
+        cur_nodes: NodeSet = set(nodes)
+
+        for node in nodes:
+            if node in self.fusions:
+                cur_nodes.update(self.fusions[node])
+
+        try:
+            split_module, submod_name = self._build_submodule(cur_nodes)
+            self._run_and_compare(split_module, submod_name, [])
+        except (FxNetMinimizerResultMismatchError):
+            culprits.update(cur_nodes)
+            report.append(f"Found culprit from numeric error: {cur_nodes}")
+            self.print_report(report)
+            return culprits
+        except (FxNetMinimizerRunFuncError):
+            culprits.update(cur_nodes)
+            report.append(f"Found culprit from run error: {node}")
+            self.print_report(report)
+            return culprits
+        else:
+            report.append("No discrepancy found.")
+            self.print_report(report)
+            return set()
+
+
+    def _skip_traverse(self, all_nodes: NodeList, skip_nodes: List) -> NodeSet:
+        """
+        Skip certain nodes in graph based on settings
+        """
+        start_idx = 0
+        num_nodes = len(all_nodes)
+        idx = 0
+        culprits = set()
+        while idx < num_nodes:
+            node = all_nodes[idx]
+            if (node.name in skip_nodes):  # skip the node
+                if idx > start_idx:
+                    culprits = self._skip_traverse_impl(all_nodes, start_idx, idx)
+                start_idx = idx + 1
+            elif idx == num_nodes - 1 and start_idx <= idx:  # last node
+                culprits = self._skip_traverse_impl(all_nodes, start_idx, idx + 1)
+            idx += 1
+
+        return culprits
+
+
+
+    def _collect_nodes(self, start: Optional[str], end: Optional[str]) -> NodeList:
+        """
+        Collect nodes in the model between the nodes named `start` and `end`.
+        These two nodes are also included.
+        """
+        nodes: NodeList = []
+        add_node = start is None
+
+        for node in self.module.graph.nodes:
+            if node.op not in CALLABLE_NODE_OPS:
+                continue
+
+            if node.name == start:
+                add_node = True
+
+            if add_node:
+                nodes.append(node)
+
+            if node.name == end:
+                break
+
+        return nodes
+
+    def run_nodes(self, start: Optional[str] = None, end: Optional[str] = None):
+        """
+        Run part of the model from `start` node to `end` node. If `start` is None
+        then we start from the beginning of the model. If `end` is None then we
+        stop at the end of the model.
+
+        Args:
+            start: The name of the node which is the first node of the submodule
+                we want to run. If set to None, then we'll start with the first
+                node of the model.
+            end: The name of the node which is the last node of the submodule we
+                want to run. If set to None, we'll end with the last node of the
+                model.
+        """
+        nodes = self._collect_nodes(start, end)
+        cur_nodes = set(nodes)
+
+        for node in nodes:
+            if node in self.fusions:
+                cur_nodes.update(self.fusions[node])
+
+        output_names = []
+        if self.settings.return_intermediate:
+            output_names = [node.name for node in nodes]
+
+        try:
+            split_module, submod_name = self._build_submodule(cur_nodes)
+            self._run_and_compare(split_module, submod_name, output_names)
+        except (
+            FxNetMinimizerRunFuncError,
+            FxNetMinimizerResultMismatchError,
+        ) as e:
+            print(e)
+
+    def print_report(self, report: List[str]):
+        for i in range(len(report)):
+            if i > 0:
+                print(" . " + report[i])
+            else:
+                print(report[i])
+
+    def print_reports(self):
+        for report in self.reports:
+            self.print_report(report)
+
+    def minimize(
+        self, start: Optional[str] = None, end: Optional[str] = None, skip_nodes: Optional[List] = None,
+    ) -> NodeSet:
+        """
+        Minimize the model from the node named `start` to the node named `end`, based
+        on self.settings. Find culprits that cause FxNetMinimizerRunFuncError or
+        FxNetMinimizerResultMismatchError errors.
+
+        Args:
+            start: The name of the node where we want to start minimizing. If set
+                to None, then we'll start with the first node of the model.
+            end: The name of the node where we want to terminate minimizing. If
+                set to None, we'll end with the last node of the model.
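+            skip_nodes: Only used when traverse_method is "skip". Names of nodes
+                whose positions split the graph into the blocks that get checked.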
+
+        Returns:
+            nodes: The set of nodes that cause FxNetMinimizerRunFuncError or
+                FxNetMinimizerResultMismatchError errors during minimizing.
+        """
+
+        print(self.settings)
+        print(self.module.graph)
+
+        nodes = self._collect_nodes(start, end)
+
+        if self.settings.traverse_method == "sequential":
+            return self._sequential_traverse(nodes)
+
+        if self.settings.traverse_method == "binary":
+            return self._binary_traverse(nodes)
+
+        if self.settings.traverse_method == "accumulate":
+            return self._accumulate_traverse(nodes)
+
+        if self.settings.traverse_method == "skip":
+            if (skip_nodes is None):
+                raise RuntimeError("'skip_nodes' can't be None when 'traverse_method' is 'skip'.")
+            return self._skip_traverse(nodes, skip_nodes)
+
+        if self.settings.traverse_method == "defined":
+            return self._defined_traverse(nodes)
+
+        raise RuntimeError(f"Unknown traverse method {self.settings.traverse_method}!")
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/operator_support.py b/MLPY/Lib/site-packages/torch/fx/passes/operator_support.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac5df5ec226c7c3fead84ebb7e8f22886ef027bc
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/passes/operator_support.py
@@ -0,0 +1,217 @@
+import abc
+import typing as t
+
+import torch
+import torch.fx
+from torch.fx._compatibility import compatibility
+from .shape_prop import TensorMetadata
+from .tools_common import get_node_target, CALLABLE_NODE_OPS
+
+
+__all__ = ['OperatorSupportBase', 'OperatorSupport', 'create_op_support', 'chain', 'OpSupports', 'any_chain']
+
+# fx.Node.target typename, as returned by `get_node_target()`
+TargetTypeName = str
+
+# Arguments' dtypes for a given node, see `OperatorSupport`
+SupportedArgumentDTypes = t.Optional[
+    t.Tuple[
+        t.Sequence[t.Sequence[torch.dtype]],
+        t.Dict[str, t.Sequence[torch.dtype]],
+    ]
+]
+
+SupportDict = t.Mapping[TargetTypeName, SupportedArgumentDTypes]
+
+
+@compatibility(is_backward_compatible=False)
+class OperatorSupportBase(abc.ABC):
+    """Interface for determining if a fx.Node is supported by a backend"""
+    @abc.abstractmethod
+    def is_node_supported(
+        self, submodules: t.Mapping[str, torch.nn.Module], node: torch.fx.Node
+    ) -> bool:
+        raise NotImplementedError()
+
+
+@compatibility(is_backward_compatible=False)
+class OperatorSupport(OperatorSupportBase):
+    """
+    `_support_dict` maps node.target typename to supported inputs dtypes.
+
+    node.target typename is retrieved using helper function `get_node_target()`
+
+    If the supported input dtypes entry is None, any dtype is supported; otherwise
+    we expect a tuple like (([dtypes], ...), {"name": [dtypes], ...}).
+
+    The first tuple ([dtypes], ...) indicates what dtypes are supported for
+    inputs in node.args, and the second dict {"name": [dtypes], ...} indicates
+    what dtypes are supported for inputs in node.kwargs.
+
+    For inputs in args, put None where we don't want to check the dtype,
+    e.g. (None, [torch.float]) means we don't care about the dtype of the
+    first input in args. Inputs in kwargs that are not listed will not be
+    checked.
+    """
+
+    _support_dict: SupportDict
+
+    def __init__(
+        self,
+        support_dict: t.Optional[SupportDict] = None
+    ):
+        self._support_dict = support_dict or {}
+
+    def is_node_supported(
+        self, submodules: t.Mapping[str, torch.nn.Module], node: torch.fx.Node
+    ) -> bool:
+        """
+        Args:
+            `submodules`: mapping from module name to the module. This can be
+                          retrieved by calling model.named_modules().
+
+            `node`: a Fx node that we want to determine whether it's supported.
+
+        Returns:
+            `is_supported`: whether the arg `node` is supported.
+        """
+        if node.op not in CALLABLE_NODE_OPS:
+            return True
+
+        target = get_node_target(submodules, node)
+
+        # Target not found in _support_dict meaning that we don't support this op at all
+        if target not in self._support_dict:
+            return False
+
+        # The rule for target is None meaning that we accept any dtype
+        if self._support_dict[target] is None:
+            return True
+
+        args_dtypes, kwargs_dtypes = self._support_dict[target]  # type: ignore[misc]
+
+        # Check args dtypes
+        for i, dtypes in enumerate(args_dtypes):
+            if len(node.args) <= i:
+                break
+
+            # None indicates we don't care about the dtype of args[i]
+            if dtypes is None:
+                continue
+
+            # If arg is not a node then we don't check it
+            if not isinstance(node.args[i], torch.fx.Node):
+                continue
+
+            arg_dtype = _get_arg_dtype(node.args[i])  # type: ignore[arg-type]
+            if arg_dtype not in dtypes:
+                return False
+
+        # Check kwargs dtypes
+        for k, dtypes in kwargs_dtypes.items():
+            if k not in node.kwargs:
+                continue
+
+            # If arg is not a node then we don't check it
+            if not isinstance(node.kwargs[k], torch.fx.Node):
+                continue
+
+            kwarg_dtype = _get_arg_dtype(node.kwargs[k])  # type: ignore[arg-type]
+            if kwarg_dtype not in dtypes:
+                return False
+
+        return True
+
+
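+# A minimal sketch of a support dict; the target names and dtype constraints below
+# are illustrative only:
+#
+#     support_dict = {
+#         "torch.relu": None,                                    # any dtype accepted
+#         "torch.add": (([torch.float16, torch.float32],), {}),  # constrain args[0] only
+#     }
+#     op_support = OperatorSupport(support_dict)
+#     # op_support.is_node_supported(dict(gm.named_modules()), node)
+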
+# ======================================================================
+# Functional interfaces and utils for defining basic operator support logic
+# and composing them into more complex ones
+# ======================================================================
+
+IsNodeSupported = t.Callable[[t.Mapping[str, torch.nn.Module], torch.fx.Node], bool]
+
+
+@compatibility(is_backward_compatible=False)
+def create_op_support(is_node_supported: IsNodeSupported) -> OperatorSupportBase:
+    """Wraps a `IsNodeSupported` function into an `OperatorSupportBase` instance
+
+    `IsNodeSupported` has the same call signature as
+    `OperatorSupportBase.is_node_supported`
+    """
+    class FunctionalOperatorSupport(OperatorSupportBase):
+        def is_node_supported(
+                self, submodules: t.Mapping[str, torch.nn.Module], node: torch.fx.Node
+        ) -> bool:
+            return is_node_supported(submodules, node)
+    return FunctionalOperatorSupport()
+
+
+@compatibility(is_backward_compatible=False)
+def chain(*op_support: OperatorSupportBase) -> OperatorSupportBase:
+    """Combines a sequence of `OperatorSupportBase` instances to form a single `OperatorSupportBase`
+    instance by evaluating each input `OperatorSupportBase` instance, and returns False if
+    any of it reports False.
+    """
+    def _chain(submods, node) -> bool:
+        return all(
+            x.is_node_supported(submods, node)
+            for x in op_support
+        )
+    return create_op_support(_chain)
+
+
+@compatibility(is_backward_compatible=False)
+def any_chain(*op_support: OperatorSupportBase) -> OperatorSupportBase:
+    """Combines a sequence of `OperatorSupportBase` instances to form a single `OperatorSupportBase`
+    instance by evaluating each input `OperatorSupportBase` instance, and returns True if
+    any of them reports True.
+    """
+    def _any_chain(submods, node) -> bool:
+        return any(
+            x.is_node_supported(submods, node)
+            for x in op_support
+        )
+    return create_op_support(_any_chain)
+
+
+@compatibility(is_backward_compatible=False)
+class OpSupports:
+    """A set of atomic `OperatorSupportBase` instances that can be combined together
+    to form more complex operator support logic.
+    """
+    @classmethod
+    def decline_if_input_dtype(cls, dtype: torch.dtype) -> OperatorSupportBase:
+        """Report a node as non-supported, if any of its arguments is of dtype"""
+
+        def _decline_if_input_dtype(
+            submodules: t.Mapping[str, torch.nn.Module],
+            node: torch.fx.Node,
+        ) -> bool:
+            for arg in node.all_input_nodes:
+                arg_dtype = _get_arg_dtype(arg)
+                if arg_dtype == dtype:
+                    return False
+            return True
+        return create_op_support(_decline_if_input_dtype)
+
+    @classmethod
+    def decline_if_node_in_names(cls, disallow_set: t.Set[str]) -> OperatorSupportBase:
+        """
+        If a node has a name that is in the disallow set, report it as non-supported.
+        """
+        def _decline_if_node_in_names(
+            submodules: t.Mapping[str, torch.nn.Module],
+            node: torch.fx.Node,
+        ) -> bool:
+            if node.name in disallow_set:
+                return False
+            else:
+                return True
+        return create_op_support(_decline_if_node_in_names)
+
+
+def _get_arg_dtype(arg: torch.fx.Node) -> t.Any:
+    assert isinstance(arg, torch.fx.Node)
+    tensor_meta = arg.meta.get("tensor_meta")  # type: ignore[union-attr]
+    dtype = tensor_meta.dtype if isinstance(tensor_meta, TensorMetadata) else arg.meta["type"]
+    return dtype
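+
+
+# A small sketch of composing support rules; `support_dict` here is an illustrative
+# placeholder (see `OperatorSupport` above):
+#
+#     no_fp16_inputs = OpSupports.decline_if_input_dtype(torch.float16)
+#     backend_support = chain(OperatorSupport(support_dict), no_fp16_inputs)
+#     # `any_chain(...)` would instead accept a node if *any* rule accepts it.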
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/param_fetch.py b/MLPY/Lib/site-packages/torch/fx/passes/param_fetch.py
new file mode 100644
index 0000000000000000000000000000000000000000..22ec7305d8191862d31257c6d29ecf3863873e19
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/passes/param_fetch.py
@@ -0,0 +1,66 @@
+from torch.fx.graph_module import GraphModule
+from typing import Any, Callable, Dict, List, Tuple, Type
+import torch
+import torch.nn as nn
+
+from torch.fx._compatibility import compatibility
+
+__all__ = ['default_matching', 'extract_attrs_for_lowering', 'lift_lowering_attrs_to_nodes']
+
+# A matching method maps the attribute name of the current version to the attribute name of `target_version`.
+@compatibility(is_backward_compatible=False)
+def default_matching(name: str, target_version: int) -> str:
+    """Default matching method
+    """
+    return name
+
+# This dict maps the nn.Module class name to the attribute name list that we want to fetch for lowering.
+# The first integer in the tuple is the version number of the nn.Module class when we create the parameter list.
+# If there's a version mismatch then it means the parameter names in the book might be mismatched with nn.Module.
+module_fetch_book: Dict[Type, Tuple[int, List[str], Callable[[str, int], str]]] = {
+    torch.nn.modules.linear.Linear: (1, ["weight", "bias"], default_matching),
+    torch.nn.modules.conv.Conv2d: (
+        1, ["weight", "bias", "kernel_size", "stride", "padding", "dilation", "groups", "padding_mode"], default_matching
+    ),
+    torch.nn.modules.batchnorm.BatchNorm2d: (2, ["weight", "bias", "running_mean", "running_var", "eps"], default_matching),
+    torch.nn.modules.pooling.AdaptiveAvgPool2d: (1, [], default_matching),
+    torch.nn.modules.pooling.MaxPool2d: (
+        1, ["kernel_size", "stride", "padding", "dilation", "return_indices", "ceil_mode"], default_matching
+    ),
+    torch.nn.modules.activation.ReLU: (1, ["inplace"], default_matching),
+}
+
+@compatibility(is_backward_compatible=False)
+def extract_attrs_for_lowering(mod: nn.Module) -> Dict[str, Any]:
+    """If `mod` is in `module_fetch_book`, fetch the mod's attributes that in the `module_fetch_book`
+    after checking module's version is compatible with the `module_fetch_book`.
+    """
+    attrs_for_lowering: Dict[str, Any] = {}
+    attrs_for_lowering["name"] = torch.typename(mod)
+
+    if type(mod) in module_fetch_book:
+        version, param_to_fetch, matching_method = module_fetch_book[type(mod)]
+        if version < mod._version:
+            raise RuntimeError(f"Fetcher version {version} try to fetch {torch.typename(mod)} version {mod._version}, "
+                               "please upgrade the module_fetch_book, open an issue and @842974287 "
+                               "or report a bug to AIACC team directly.")
+        for attr in param_to_fetch:
+            attrs_for_lowering[attr] = getattr(mod, matching_method(attr, mod._version))
+    else:
+        raise RuntimeError(f"{torch.typename(mod)} is not in the module_fetch_book yet, "
+                           "please add it to the module_fetch_book, open an issue and @842974287 "
+                           "or report a bug to AIACC team directly.")
+    return attrs_for_lowering
+
+@compatibility(is_backward_compatible=False)
+def lift_lowering_attrs_to_nodes(fx_module: GraphModule) -> None:
+    """Recursively traverse all `fx_module` nodes and fetch the module's attributes if the node is a leaf module.
+    """
+    submodules = dict(fx_module.named_modules())
+
+    for node in fx_module.graph.nodes:
+        if node.op == "call_module":
+            if isinstance(submodules[node.target], GraphModule):
+                lift_lowering_attrs_to_nodes(submodules[node.target])
+            else:
+                node.attrs_for_lowering = extract_attrs_for_lowering(submodules[node.target])
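+
+
+# A minimal usage sketch; `my_model` is an illustrative placeholder:
+#
+#     gm = torch.fx.symbolic_trace(my_model)
+#     lift_lowering_attrs_to_nodes(gm)
+#     # Each leaf `call_module` node now carries `node.attrs_for_lowering`, e.g.
+#     # {"name": "torch.nn.modules.linear.Linear", "weight": ..., "bias": ...}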
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/pass_manager.py b/MLPY/Lib/site-packages/torch/fx/passes/pass_manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..cdf93182d37fbef674f79580c4702fecdf640a26
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/passes/pass_manager.py
@@ -0,0 +1,257 @@
+from functools import wraps
+from inspect import unwrap
+from typing import Callable, List, Optional
+import logging
+
+logger = logging.getLogger(__name__)
+
+__all__ = [
+    "PassManager",
+    "inplace_wrapper",
+    "log_hook",
+    "loop_pass",
+    "this_before_that_pass_constraint",
+    "these_before_those_pass_constraint",
+]
+
+# For callables which modify an object in place and return something other than
+# the object on which they act
+def inplace_wrapper(fn: Callable) -> Callable:
+    """
+    Convenience wrapper for passes which modify an object inplace. This
+    wrapper makes them return the modified object instead.
+
+    Args:
+        fn (Callable[Object, Any])
+
+    Returns:
+        wrapped_fn (Callable[Object, Object])
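+
+    For example (`my_pass` below is a toy stand-in that mutates a dict and
+    returns a bool):
+
+    ```
+    def my_pass(d: Dict) -> bool:
+        d['foo'] = 'bar'
+        return True
+
+    pm = PassManager(passes=[inplace_wrapper(my_pass)])
+    # pm(d) returns `d` itself instead of the bool returned by my_pass
+    ```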
+    """
+
+    @wraps(fn)
+    def wrapped_fn(gm):
+        val = fn(gm)
+        return gm
+
+    return wrapped_fn
+
+def log_hook(fn: Callable, level=logging.INFO) -> Callable:
+    """
+    Logs callable output.
+
+    This is useful for logging output of passes. Note inplace_wrapper replaces
+    the pass output with the modified object. If we want to log the original
+    output, apply this wrapper before inplace_wrapper.
+
+
+    ```
+    def my_pass(d: Dict) -> bool:
+        changed = False
+        if 'foo' in d:
+            d['foo'] = 'bar'
+            changed = True
+        return changed
+
+    pm = PassManager(
+        passes=[
+            inplace_wrapper(log_hook(my_pass))
+        ]
+    )
+    ```
+
+    Args:
+        fn (Callable[Type1, Type2])
+        level: logging level (e.g. logging.INFO)
+
+    Returns:
+        wrapped_fn (Callable[Type1, Type2])
+    """
+    @wraps(fn)
+    def wrapped_fn(gm):
+        val = fn(gm)
+        logger.log(level, "Ran pass %s\t Return value: %s", fn, val)
+        return val
+
+    return wrapped_fn
+
+
+
+def loop_pass(base_pass: Callable, n_iter: Optional[int] = None, predicate: Optional[Callable] = None):
+    """
+    Convenience wrapper for passes which need to be applied multiple times.
+
+    Exactly one of `n_iter` or `predicate` must be specified.
+
+    Args:
+        base_pass (Callable[Object, Object]): pass to be applied in loop
+        n_iter (int, optional): number of times to loop pass
+        predicate (Callable[Object, bool], optional):
+
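+    Example (`count_pass` below is a toy stand-in operating on a dict):
+
+    ```
+    def count_pass(d: Dict) -> Dict:
+        d['count'] = d.get('count', 0) + 1
+        return d
+
+    run_three_times = loop_pass(count_pass, n_iter=3)
+    run_until_ten = loop_pass(count_pass, predicate=lambda d: d.get('count', 0) < 10)
+    ```
+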
+    """
+    assert (n_iter is not None) ^ (
+        predicate is not None
+    ), "Exactly one of `n_iter`or `predicate` must be specified."
+
+    @wraps(base_pass)
+    def new_pass(source):
+        output = source
+        if n_iter is not None and n_iter > 0:
+            for _ in range(n_iter):
+                output = base_pass(output)
+        elif predicate is not None:
+            while predicate(output):
+                output = base_pass(output)
+        else:
+            raise RuntimeError(
+                f"loop_pass must be given positive int n_iter (given "
+                f"{n_iter}) xor predicate (given {predicate})"
+            )
+        return output
+
+    return new_pass
+
+
+# Pass Schedule Constraints:
+#
+# Implemented as 'depends on' operators. A constraint is satisfied iff a list
+# has a valid partial ordering according to this comparison operator.
+def _validate_pass_schedule_constraint(
+    constraint: Callable[[Callable, Callable], bool], passes: List[Callable]
+):
+    for i, a in enumerate(passes):
+        for j, b in enumerate(passes[i + 1 :], start=i + 1):
+            if constraint(a, b):
+                continue
+            raise RuntimeError(
+                f"pass schedule constraint violated. Expected {a} before {b}"
+                f" but found {a} at index {i} and {b} at index{j} in pass"
+                f" list."
+            )
+
+
+def this_before_that_pass_constraint(this: Callable, that: Callable):
+    """
+    Defines a partial order ('depends on' function) where `this` must occur
+    before `that`.
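+
+    Example (`pass_a` and `pass_b` are illustrative passes):
+
+    ```
+    passes = [pass_a, pass_b]
+    constraints = [this_before_that_pass_constraint(pass_a, pass_b)]
+    pm = PassManager(passes, constraints)
+    pm.validate()  # would raise if pass_b were scheduled before pass_a
+    ```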
+    """
+
+    def depends_on(a: Callable, b: Callable):
+        if a == that and b == this:
+            return False
+        return True
+
+    return depends_on
+
+
+def these_before_those_pass_constraint(these: Callable, those: Callable):
+    """
+    Defines a partial order ('depends on' function) where `these` must occur
+    before `those`, where the inputs are 'unwrapped' before comparison.
+
+    For example, the following pass list and constraint list would be invalid.
+    ```
+    passes = [
+        loop_pass(pass_b, 3),
+        loop_pass(pass_a, 5),
+    ]
+
+    constraints = [
+        these_before_those_pass_constraint(pass_a, pass_b)
+    ]
+    ```
+
+    Args:
+        these (Callable): pass which should occur first
+        those (Callable): pass which should occur later
+
+    Returns:
+        depends_on (Callable[[Object, Object], bool])
+    """
+
+    def depends_on(a: Callable, b: Callable):
+        if unwrap(a) == those and unwrap(b) == these:
+            return False
+        return True
+
+    return depends_on
+
+
+class PassManager:
+    """
+    Construct a PassManager.
+
+    Collects passes and constraints. This defines the pass schedule, manages
+    pass constraints and pass execution.
+
+    Args:
+        passes (Optional[List[Callable]]): list of passes. A pass is a
+            callable which modifies an object and returns the modified object.
+        constraints (Optional[List[Callable]]): list of constraints. A
+            constraint is a callable which takes two passes (A, B) and returns
+            True if A depends on B and False otherwise. See implementation of
+            `this_before_that_pass_constraint` for example.
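+
+    Example (the passes and `graph_module` below are illustrative; each pass takes
+    and returns the object being transformed):
+
+    ```
+    pm = PassManager(passes=[pass_a, pass_b])
+    pm.add_constraint(this_before_that_pass_constraint(pass_a, pass_b))
+    transformed = pm(graph_module)
+    ```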
+    """
+
+    passes: List[Callable]
+    constraints: List[Callable]
+    _validated: bool = False
+
+    def __init__(
+        self,
+        passes=None,
+        constraints=None,
+    ):
+        self.passes = passes or []
+        self.constraints = constraints or []
+
+    @classmethod
+    def build_from_passlist(cls, passes):
+        pm = PassManager(passes)
+        # TODO(alexbeloi): add constraint management/validation
+        return pm
+
+    def add_pass(self, _pass: Callable):
+        self.passes.append(_pass)
+        self._validated = False
+
+    def add_constraint(self, constraint):
+        self.constraints.append(constraint)
+        self._validated = False
+
+    def remove_pass(self, _passes: List[str]):
+        if _passes is None:
+            return
+        passes_left = []
+        for ps in self.passes:
+            if ps.__name__ not in _passes:
+                passes_left.append(ps)
+        self.passes = passes_left
+        self._validated = False
+
+    def replace_pass(self, _target, _replacement):
+        passes_left = []
+        for ps in self.passes:
+            if ps.__name__ == _target.__name__:
+                passes_left.append(_replacement)
+            else:
+                passes_left.append(ps)
+        self.passes = passes_left
+        self._validated = False
+
+    def validate(self):
+        """
+        Validates that current pass schedule defined by `self.passes` is valid
+        according to all constraints in `self.constraints`
+        """
+        if self._validated:
+            return
+        for constraint in self.constraints:
+            _validate_pass_schedule_constraint(constraint, self.passes)
+        self._validated = True
+
+    def __call__(self, source):
+        self.validate()
+        out = source
+        for _pass in self.passes:
+            out = _pass(out)
+        return out
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/reinplace.py b/MLPY/Lib/site-packages/torch/fx/passes/reinplace.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f8bfe4eb2f77023755781a9e67cf99e25fe8117
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/passes/reinplace.py
@@ -0,0 +1,675 @@
+import torch
+from torch.fx import Node
+from torch.fx._compatibility import compatibility
+from torch._subclasses.fake_tensor import FakeTensorMode, FakeTensor
+from torch.utils._pytree import tree_map_only
+from torch.utils import _pytree as pytree
+from torch.multiprocessing.reductions import StorageWeakRef
+
+import _operator
+from enum import Enum
+import itertools
+from typing import Set, Dict
+from collections import defaultdict
+
+__all__ = ['reinplace']
+
+class _ViewType(Enum):
+    NonView = 0
+    SingleOutputView = 1
+    MultiOutputView = 2
+
+def _is_view_op(tgt):
+    if tgt is not None and isinstance(tgt, torch._ops.OpOverload):
+        schema = tgt._schema
+        if len(schema.arguments) > 0:
+            first_arg = schema.arguments[0]
+            # check if op is a view
+            return first_arg.alias_info is not None and not first_arg.alias_info.is_write
+
+def _get_view_type(tgt) -> _ViewType:
+    if tgt is not None and isinstance(tgt, torch._ops.OpOverload):
+        schema = tgt._schema
+        if len(schema.arguments) > 0:
+            first_arg = schema.arguments[0]
+            # check if op is a view
+            if first_arg.alias_info is not None and not first_arg.alias_info.is_write:
+                # check if op is a multi-output view
+                if '*' in first_arg.alias_info.after_set:
+                    return _ViewType.MultiOutputView
+                else:
+                    return _ViewType.SingleOutputView
+    return _ViewType.NonView
+
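+# For example (based on the alias annotations in the ATen schemas):
+#   aten.diagonal.default -> _ViewType.SingleOutputView
+#   aten.split.Tensor     -> _ViewType.MultiOutputView
+#   aten.add.Tensor       -> _ViewType.NonView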
+
+# Stores a bunch of metadata related to functionalization on each node.
+# Relevant metadata:
+# n.meta['fake_result']: FakeTensor (same type as the output of the node, but with FakeTensors instead of Tensors)
+#   The fake tensor output from running the current node
+# n.meta['view_of']: Node
+#   If the current node n is a view of some base tensor, the 'view_of' field tells us which
+#   view node was used to generate the current node (a view tensor).
+#   This information actually makes `fake_result` redundant, but we can use `fake_result`
+#   to sanity check that our aliasing information is correct.
+@compatibility(is_backward_compatible=False)
+class _FunctionalizationMetadataProp(torch.fx.Interpreter):
+
+    def run_node(self, node: Node):
+        self.node_counter += 1
+        result = super().run_node(node)
+        node.meta['fake_result'] = result
+        node.meta['node_idx'] = self.node_counter
+
+        # (1) Update metadata with the list of nodes that are used by this node
+        # copy_() doesn't read from its first argument; it writes to it, overwriting previous data.
+        # We don't want to treat it as "being used as an input".
+        node_args = node.args
+        if node.target is torch.ops.aten.copy_.default:
+            node_args = node_args[1:]
+
+        # (2) Update metadata to track aliasing information about view tensor nodes.
+        if node.op == 'call_function':
+            view_type = _get_view_type(node.target)
+            if view_type == _ViewType.SingleOutputView:
+                assert isinstance(node.args[0], Node)
+                node.meta['view_of'] = node.args[0]
+            elif view_type == _ViewType.MultiOutputView:
+                self.multi_output_view_nodes[node] = node.args[0]
+
+            # Check if we returned a multi-output view,
+            # and we're now grabbing the individual views from the output.
+            #
+            # For multi-output views, we want to map each output view to the base,
+            # but this mapping involves two separate nodes in FX IR.
+            # e.g. "a, b = x_1.split(...)" becomes:
+            #    %split_tensor : [num_users=2] = call_function[target=torch.ops.aten.split.Tensor](args = (%x_1, 2), kwargs = {})
+            #    %getitem : [num_users=1] = call_function[target=operator.getitem](args = (%split_tensor, 0), kwargs = {})
+            #    %getitem_1 : [num_users=1] = call_function[target=operator.getitem](args = (%split_tensor, 1), kwargs = {})
+            # And we'd like to set:
+            #    getitem1.meta['view_of'] = x_1
+            elif node.target is _operator.getitem:
+                list_arg = node.args[0]
+                maybe_base_of_view = self.multi_output_view_nodes.get(list_arg, None)
+                if maybe_base_of_view is not None:
+                    # Note: we could also track indexing info here for multi-output views.
+                    # I don't think this metadata is strictly needed for de-functionalization.
+                    assert isinstance(maybe_base_of_view, Node)
+                    node.meta['view_of'] = maybe_base_of_view
+
+        if 'view_of' in node.meta:
+            # We're linking the current node with its first argument as views.
+            # Assert here that this is actually the case, and their storages are the same.
+            assert isinstance(node.meta['fake_result'], FakeTensor)
+            assert isinstance(node.meta['view_of'].meta['fake_result'], FakeTensor)
+            view_storage = StorageWeakRef(node.meta['fake_result']._typed_storage())
+            base_storage = StorageWeakRef(node.meta['view_of'].meta['fake_result']._typed_storage())
+            assert view_storage == base_storage
+        return result
+
+
+
+    def propagate(self, *args):
+        self.multi_output_view_nodes = {}
+        self.node_counter = -1
+
+        with FakeTensorMode() as mode:
+            fake_args = [mode.from_tensor(a) for a in args]
+            return super().run(*fake_args)
+
+def _schemas_match(functional_schema, inplace_schema):
+    names_match = inplace_schema.name.endswith("_") and inplace_schema.name[:-1] == functional_schema.name
+    arg_types_match = len(functional_schema.arguments) == len(inplace_schema.arguments) and all(
+        a1.type == a2.type for a1, a2 in zip(functional_schema.arguments, inplace_schema.arguments))
+    # for the inplace op, its first argument should be mutable
+    assert inplace_schema.arguments[0].alias_info is not None and inplace_schema.arguments[0].alias_info.is_write
+    # and its remaining arguments shouldn't be.
+    assert all(a.alias_info is None for a in inplace_schema.arguments[1:])
+    return names_match and arg_types_match
+
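+# For example, `aten.add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1)` matches
+# `aten.add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1)`: the names differ
+# only by the trailing underscore and the argument types line up pairwise.
+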
+# TODO: this should be beefed up to be able to properly re-inplace with:
+# - mutating ops (e.g. _fused_moving_avg_obs_fq_helper)
+# - out= ops (e.g. angle -> angle.out)
+# TODO: we should also figure this info out using torchgen.
+def _maybe_get_inplace_op(op):
+    # __module__ seems broken; it returns torch._ops.aten which doesn't exist
+    if not isinstance(op, torch._ops.OpOverload):
+        return None
+    # Some view ops have inplace variants (as_strided_, etc),
+    # but we do NOT want the reinplacing pass to directly add these into the program.
+    # (they'll require extra special handling, and aren't really useful for perf anyway)
+    if _is_view_op(op):
+        return None
+    op_namespace = op.__module__.split(".")[-1]
+    op_base_name = op.overloadpacket.__name__
+    maybe_namespace_module = getattr(torch.ops, op_namespace)
+    maybe_inplace_op = None if maybe_namespace_module is None else getattr(maybe_namespace_module, f'{op_base_name}_', None)
+    if maybe_inplace_op is None:
+        return None
+
+    inplace_overloads = [
+        getattr(maybe_inplace_op, overload_name) for overload_name in maybe_inplace_op.overloads()
+    ]
+    inplace_overloads_with_matching_schemas = [
+        f
+        for f in inplace_overloads
+        if _schemas_match(op._schema, f._schema)
+    ]
+    # Just because foo() and foo_() are both existing operators,
+    # They aren't guaranteed to have compatible schemas.
+    # For example, pow.Scalar(Scalar self, Tensor exponent) has no valid inplace variant,
+    # Even though several overloads of pow_ exist.
+    if len(inplace_overloads_with_matching_schemas) == 0:
+        return None
+    assert len(inplace_overloads_with_matching_schemas) == 1
+    inplace_op = inplace_overloads_with_matching_schemas[0]
+    return inplace_op
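+
+# For example, _maybe_get_inplace_op(torch.ops.aten.add.Tensor) is expected to return
+# torch.ops.aten.add_.Tensor, while view ops such as torch.ops.aten.transpose.int
+# return None.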
+
+_VIEW_INVERSE_MAP = {
+    torch.ops.aten.diagonal_scatter.default: torch.ops.aten.diagonal.default,
+    torch.ops.aten.select_scatter.default: torch.ops.aten.select.int,
+    torch.ops.aten.slice_scatter.default: torch.ops.aten.slice.Tensor,
+    torch.ops.aten.as_strided_scatter.default: torch.ops.aten.as_strided.default,
+}
+
+# This function, given a set of (aliased) tensor nodes,
+# returns any nodes in the graph that *use* any of the aliases and occur *after* op_index
+# in the node ordering.
+def _get_all_later_node_usages(tensor_aliases: Set[Node], op_index: int):
+    def _add_if_tensor(x, set_):
+        if isinstance(x, FakeTensor):
+            set_.add(StorageWeakRef(x._typed_storage()))
+
+    nodes_used_after = set()
+    for t in tensor_aliases:
+        # get all nodes that use the current alias
+        usage_nodes = t.users
+        for n in usage_nodes:
+            # We only care about usages after the current node
+            if 'node_idx' not in n.meta or n.meta['node_idx'] <= op_index:
+                continue
+            # We also don't care about intermediate view ops.
+            # They only matter if their output is then used elsewhere
+            # (either in an out-of-place op, or as an output to the function).
+            if n in tensor_aliases:
+                if isinstance(n.target, torch._ops.OpOverload) or n.target == _operator.getitem:
+                    continue
+            nodes_used_after.add(n)
+    return nodes_used_after
+
+# Given an op that we're trying to re-inplace, "b = foo(a)",
+# And given a {view}_scatter op that shows up later in the graph, "y = {view}_scatter(base, x, args...)"
+# Then re-inplacing `foo()` would allow us to remove the `{view}_scatter` op entirely, IF
+# there are any aliases in the alias_set(a) that satisfy:
+# (1) The base of "alias", "alias_base", has the same size/stride/offset metadata as "base"
+# (2) The output of running {view}(alias, args...) gives you the same size/stride/offset metadata
+#     as "alias"
+def _get_view_inverse_node_usages(later_node_usages: Set[Node], self_aliases: Set[Node]) -> Set[Node]:
+    def matching_view_metadata(a, b):
+        return a.size() == b.size() and \
+            a.stride() == b.stride() and \
+            a.storage_offset() == b.storage_offset()
+
+    view_inverse_nodes = set()
+    # Go through them in node order, so we can see chains of view_scatter ops.
+    for n in sorted(later_node_usages, key=lambda x: x.meta['node_idx']):
+        if n.target not in _VIEW_INVERSE_MAP:
+            continue
+        base = n.args[0]
+        mutated_view = n.args[1]
+        assert isinstance(base, Node)
+        assert isinstance(base.meta['fake_result'], FakeTensor)
+        assert isinstance(mutated_view, Node)
+        assert isinstance(mutated_view.meta['fake_result'], FakeTensor)
+        # Check that this view_inverse op actually corresponds to taking the inverse
+        # of one of our existing self_alias nodes.
+        original_view = _VIEW_INVERSE_MAP[n.target]
+        for self_alias in self_aliases:
+            # We're looking for some alias of the self arg, "alias",
+            # that was created from some op `alias = foo(base, args...)`
+            # such that the current _scatter op "inverts" that foo call.
+            # We can check that by running the original op again, and checking that the strides match.
+            if 'view_of' not in self_alias.meta:
+                continue
+            self_alias_base = self_alias.meta['view_of']
+            try:
+                # Here we're trying to re-use the args from the view_scatter call inside of the corresponding
+                # view op, which might throw. This just indicates that the view_scatter op isn't a valid inverse
+                # of the current alias we're looking at.
+                view_replay_metadata = original_view(self_alias_base.meta['fake_result'], *n.args[2:], **n.kwargs)
+                expected_metadata = self_alias.meta['fake_result']
+                # If the alias and its base both have matching metadata, then this view_scatter op is valid to re-inplace.
+                if matching_view_metadata(self_alias_base.meta['fake_result'], base.meta['fake_result']) and \
+                        matching_view_metadata(view_replay_metadata, expected_metadata):
+                    view_inverse_nodes.add(n)
+            except Exception:
+                continue
+
+    return view_inverse_nodes
+
+
+@compatibility(is_backward_compatible=True)
+def reinplace(gm, *sample_args):
+    """
+    Given an fx.GraphModule, modifies it to perform "reinplacing",
+    mutating the nodes of the graph.
+    We look for out-of-place op call sites like `b = a.add(...)`,
+    and convert them to be inplace (`b = a.add_(...)`),
+    as long as the input to the current operator ("a") isn't re-used
+    anywhere later in the graph.
+
+    This pass currently expects to operate on a **functional, ATen** graph.
+    This can be obtained by running `make_fx(functionalize(f))`.
+
+    Sample inputs are needed to determine aliasing relationships of the inputs.
+    In general, we can't reinplace node `b = a.add(...)` if "a" aliases any of the
+    inputs to the program.
+
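+    A typical invocation looks like the following; `f` is an illustrative function,
+    with `make_fx` from torch.fx.experimental.proxy_tensor and `functionalize` from
+    torch.func:
+
+        def f(x):
+            return torch.add(x, x)
+
+        gm = make_fx(functionalize(f))(torch.ones(4))
+        reinplace(gm, torch.ones(4))
+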
+    Given a node "b = foo(a, args...) the algorithm for re-inplacing is as follows:
+
+    (1) Perform some initial checks on the metadata of "a" and "args..."
+        that can disqualify them from being reinplaced.
+
+      (1a) Check that the self argument we're attempting to reinplace
+           has acceptable dtype/size metadata to reinplace with.
+
+           For example, if we have:
+             a = torch.ones(1)
+             b = torch.ones(10)
+             out = torch.add(a, b)
+           We can't turn that into
+             a.add_(b)
+           Because that would require resizing "a".
+
+           Similarly, we can't convert torch.ge(a, b) into a.ge_(b),
+           because that would require changing a's dtype (from e.g. float32 to bool).
+           Note that in this specific example, we could technically do better..
+
+           If we see the pattern:
+             a_1 = a.ge(b)
+             a_2 = aten._to_copy(a_1, a.dtype)
+           Then this should be valid to completely re-inplace
+           (this is exactly what functionalization will emit when it sees a.ge_(b)).
+
+           This optimization is only really important for user programs
+           that directly use inplace comparison ops though.
+
+           We also cannot re-inplace on tensors that have overlapping memory,
+           e.g. torch.ones(1).expand(4, 4).add_(1)
+
+      (1b) Check if "a" is an alias of any of the program inputs.
+
+          If it is, skip and move to the next node.
+          Inplace'ing an op that would cause it to mutate a program input is not sound,
+          because that would be a side effect visible to the user.
+
+          NOTE: there's a future optimization that we should make:
+          if "a" is a (alias of a)  program input, but later in the program
+          there is a node that looks like "a.copy_(...)",
+          Then re-inplacing is ok to do - we are temporarily re-using a's buffer,
+          which will later be overwritten by the copy_() call.
+
+          This will be an important optimization to have for programs that mutate
+          their inputs. It currently isn't implemented though.
+
+      (1c) Check if "a" and "args..." alias
+
+          For example, re-inplacing to create code like the below
+          isn't guaranteed to be sound:
+
+            aten.mul_(a, a)
+
+    (2) Check that "a" and all of its outstanding aliases are not used anywhere
+        later in the graph. If this is the case, then it's safe to re-inplace
+        to "b = foo_(a)".
+
+        There are a few caveats to this, explained in more detail below:
+        (a) If "a" is used later as an argument to a view op, that is okay.
+            It's only a problem if "a" (or that view) is later passed
+            into a normal operator, or if it is returned as the program output.
+        (b) If "a" is a repeat argument in `foo()`, then don't reinplace.
+            Most ATen kernels don't make any guarantees that this is sound,
+            e.g. if you do aten.mul_(a, a).
+            So we'll just ban re-inplacing in this case.
+        (c) If "a" is used as an input into a view "inverse" / "scatter"
+            operator, it is potentially fine to re-inplace
+            (and remove that scatter operator from the graph).
+            See below for a more detailed example.
+
+        NOTE: there is an optimization in this step that is crucial
+        to fully recovering performance from functionalization.
+
+        Given this program:
+        def f(x):
+            a = torch.ops.aten.add(x, x)
+            b = torch.ops.aten.diagonal(a)
+            torch.ops.aten.fill_(b, 0)
+            return a
+
+        Functionalization will emit the following:
+        def f(x):
+            a = torch.ops.aten.add(x, x)
+            b = torch.ops.aten.diagonal(a, 0, 1)
+            b_updated = torch.ops.aten.fill(b, 0)
+            a_updated = torch.ops.aten.diagonal_scatter(a, b_updated, 0, 1)
+            return a_updated
+
+        Ordinarily, we would not be able to reinplace the fill,
+        because "b" aliases with "a" which is used by the diagonal_scatter call.
+
+        "re-inplacing" is on the hook for figuring out that it is ok to
+        completely remove the expensive diagonal_scatter call, if we re-inplace the add().
+
+        So, for every `alias in alias_set(a)`, instead of checking
+        that "alias" is not used anywhere later in the graph,
+        we check that
+            EITHER:
+          (a) alias is not used anywhere later in the graph
+            OR:
+          (b) alias is used exactly once later on in the graph,
+              in the following op:
+
+                out = foo_scatter(alias, x, args...)
+
+              where the following must hold:
+                (i) "foo_scatter" is the "inverse" operator for foo.
+                    This only applies to "foo" ops that are view operators,
+                    which view into a subset of the original tensor's memory.
+                    In practice, there are ~4 operators where this applies:
+                      diagonal -> diagonal_scatter
+                      slice -> slice_scatter
+                      select -> select_scatter
+                      as_strided -> as_strided_scatter
+                (ii) "args..." are the same between the foo() and foo_scatter() calls.
+
+    (3) Perform the actual re-inplacing on foo!
+
+      (3b) is the common case, but special care is needed for {view}_scatter (3a)
+
+      (3a) {view}_scatter ops.
+
+        Consider this program:
+          a = torch.zeros(2, 2)
+          b = torch.ones(2)
+          a[0] = b
+
+        Post functionalization, that will look like:
+          a = torch.zeros(2, 2)
+          b = torch.ones(2)
+          a_updated = torch.select_scatter(a, b, 0, 0)
+
+        In this case though, there is no "functional" op to re-inplace!
+        Instead, we'd like to directly remove the select_scatter call.
+        We already know from (3) that this is valid,
+        because "a" has no later usages in the graph.
+
+        We perform the re-inplacing on the {view}_scatter op like so
+        Before:
+          a_updated = torch.select_scatter(a, b, args...)
+        After:
+          a_slice = a.select(args...)
+          a_slice.copy_(b)
+
+      (3b) Otherwise, replace the functional op with its inplace variant.
+        Before:
+          b = foo(a, args...)
+        After:
+          a.foo_(args...)
+
+    (4) Finally, after converting either:
+          Before:
+            b = foo(a)
+          After:
+            foo_(a)
+        or
+          Before:
+            b = {slice}_scatter(a, mutated_slice, args...)
+          After:
+            slice = {slice}(a, args...)
+            slice.copy_(mutated_slice)
+
+        We now need to find all later nodes that use "b" as an argument
+        and update them to take in "a" instead.
+
+        Note that for the majority of inplace ops, this isn't actually necessary
+        (because most inplace ops return "self" as their output).
+        This isn't generally true for all mutable ops though, which is why
+        we need to actually replace all of the arguments.
+
+        We also need to update our metadata of Dict[StorageWeakRef, Set[Node]],
+        That maps a given tensor storage to the set of all nodes that take in that storage
+        as an input.
+        Specifically, re-inplacing `b = foo(a)` causes "a" and "b"'s sets to get fused
+        together.
+
+    (5) Any "view_inverse/scatter" nodes that were identified as "it's ok to ignore them"
+        during step (3) get manually deleted from the graph.
+        Their outputs are no longer used, so technically standard DCE would be able
+        to do this, but we can no longer run FX's DCE pass now that we have mutable
+        ops in the graph.
+    """
+    _FunctionalizationMetadataProp(gm).propagate(*sample_args)
+
+    # Useful debug printing
+    # def _print(x):
+    #     if isinstance(x, FakeTensor):
+    #         print(f'fake_result: {StorageWeakRef(x._typed_storage()).cdata}')
+
+    # for n in gm.graph.nodes:
+    #     print(n.format_node())
+    #     if hasattr(n, 'meta'):
+    #         print(f'node_idx: {n.meta["node_idx"]}')
+    #         if 'fake_result' in n.meta:
+    #             tree_map(_print, n.meta['fake_result'])
+    #         if 'view_of' in n.meta:
+    #             print(f'view_of: {str(n.meta["view_of"])}')
+    #     print()
+
+    # We need to know which nodes correspond to inputs (or their aliases)
+    # so we know not to re-inplace them.
+    # NOTE: later, we'll need to add an optimization for fully recovering performance
+    # on programs that mutate inputs.
+    input_storages = {
+        StorageWeakRef(
+            node.meta['fake_result']._typed_storage()
+        ) for node in gm.graph.nodes if node.op == 'placeholder'}
+
+
+    # We also need to know for a given node, what are all of its aliasing nodes.
+    storage_to_nodes: Dict[StorageWeakRef, Set[Node]] = defaultdict(set)
+    for n in gm.graph.nodes:
+        if 'fake_result' in n.meta:
+            # Tree-mapping because some ops can return lists of tensors.
+            def _add_to_map(x):
+                if isinstance(x, FakeTensor):
+                    storage_to_nodes[StorageWeakRef(x._typed_storage())].add(n)
+            pytree.tree_map_(_add_to_map, n.meta['fake_result'])
+
+    # inplace-ify functional ops, subject to the constraints written below.
+    all_later_view_inverse_nodes_to_delete = set()
+    for idx, node in enumerate(gm.graph.nodes):
+        if node.op == 'call_function':
+
+            # Today, the re-inplace pass directly acts on:
+            # - functional ops with an inplace variant
+            # - {view}_scatter ops that can be potentially removed from the graph.
+            # Both of these ops take in tensor first args, so filtering on this condition
+            # makes the later code simpler.
+            # We should revisit this at some point though, particularly when we also want
+            # the reinplacer to be able to handle out= and mutable operators
+            # and tensorlist first args (like `_foreach_` ops).
+            if not isinstance(node.target, torch._ops.OpOverload):
+                continue
+            if len(node.target._schema.arguments) < 1:
+                continue
+            if type(node.target._schema.arguments[0].type) != torch.TensorType:
+                continue
+
+            # Step 1a: Check that the self argument we're attempting to reinplace
+            # has the same size/stride as the output.
+            # For example, we shouldn't try to reinplace torch.add(scalar_tensor, larger_tensor)
+            # As it would require resizing scalar_tensor.
+            # (We could potentially swizzle this into larger_tensor.add_(scalar_tensor),
+            # this is probably an optimization to revisit later).
+            self_arg = node.args[0]
+            self_flattened = pytree.tree_leaves(self_arg.meta['fake_result'])
+            node_flattened = pytree.tree_leaves(node.meta['fake_result'])
+            self_has_wrong_metadata = False
+            if len(self_flattened) == len(node_flattened):
+                for self_meta, node_meta in zip(self_flattened, node_flattened):
+                    if self_meta.numel() != node_meta.numel():
+                        self_has_wrong_metadata = True
+                    if self_meta.dtype != node_meta.dtype:
+                        self_has_wrong_metadata = True
+                    # We also cannot re-inplace on tensors that have internal memory overlap.
+                    # e.g. torch.ones(1).expand(4, 4).add_(1)
+                    if torch._debug_has_internal_overlap(self_meta) == 1:
+                        self_has_wrong_metadata = True
+            # Here, we (optimistically) assume that a.resize(b) is valid to re-inplace,
+            # since users should never really be calling the functional "torch.ops.aten.resize"
+            # op directly in their programs.
+            if self_has_wrong_metadata and node.target != torch.ops.aten.resize.default:
+                continue
+
+            # Step 1b: ensure that the op we're trying to re-inplace isn't a program input
+            self_arg_name = self_arg.name
+            self_arg_storage = StorageWeakRef(self_arg.meta['fake_result']._typed_storage())
+            if self_arg_storage in input_storages:
+                # TODO: later, add the optimization for handling `copy_()` calls in the graph.
+                continue
+            if len([x for x in node.args if x is self_arg]) > 1:
+                # Step 1c:
+                # Calling stuff like aten.mul_(a, a) isn't guaranteed to be sound,
+                # so we prevent re-inplacing in this case.
+                continue
+
+            self_arg_storage = StorageWeakRef(self_arg.meta['fake_result']._typed_storage())
+            self_aliases = storage_to_nodes[self_arg_storage]
+
+            # First, we find all later usages of any of the aliases of self_arg.
+            later_node_usages = _get_all_later_node_usages(self_aliases, node.meta['node_idx'])
+            # Then, we check if any of those later usages are actually view_scatter ops
+            # that are safe to fully remove.
+            later_view_inverse_node_usages = _get_view_inverse_node_usages(later_node_usages, self_aliases)
+
+            # Step 2: Check to see if the input to the op is re-used later in the graph.
+            # If not (same goes for its aliases), then this op is safe to re-inplace.
+            # This is a slightly roundabout way to check that there are no later usages of the current self argument.
+            # (later_view_inverse_node_usages corresponds to "view_scatter" nodes that we are allowed to delete)
+            can_reinplace = len(later_node_usages - later_view_inverse_node_usages) == 0
+            if not can_reinplace:
+                continue
+
+            # Step 3a: Special handling for when we see *_scatter operators.
+            # When we see an operator like `b = torch.slice_scatter(a, ...)`,
+            # instead of trying to "inplace" it into a.slice_scatter_(...),
+            # we would prefer to remove it from the graph entirely,
+            # and instead copy_() the slice directly into the larger tensor.
+            # See the description of the algorithm for a full example.
+            if node.target in _VIEW_INVERSE_MAP and node not in all_later_view_inverse_nodes_to_delete:
+                view_op = _VIEW_INVERSE_MAP[node.target]
+                # Before:
+                #   base_updated = torch.ops.aten.slice_scatter.default(base, mutated_slice, args...)
+                # After:
+                #   slice = torch.ops.aten.slice.default(base, args...)
+                #   slice.copy_(mutated_slice)
+                with gm.graph.inserting_before(node):
+                    mutated_slice_node = node.args[1]
+                    remaining_slice_args = node.args[2:]
+                    slice_node = gm.graph.create_node(
+                        'call_function', view_op, (self_arg,) + tuple(remaining_slice_args), node.kwargs)
+                    copy_node = gm.graph.create_node(
+                        'call_function', torch.ops.aten.copy_.default, (slice_node, mutated_slice_node,), {})
+                # Add the slice_scatter node to our "nodes to delete" list.
+                all_later_view_inverse_nodes_to_delete.add(node)
+
+
+            else:
+                # Step 3b: Check to see if this operator has an inplace variant.
+                maybe_inplace_op = _maybe_get_inplace_op(node.target)
+                if maybe_inplace_op is None:
+                    continue
+                # And if so, replace it with its inplace variant.
+                node.target = maybe_inplace_op
+
+            # At this point, 'storage_to_nodes' will be stale.
+            # Now that we're inplacing `b = foo(a)`, we need to effectively
+            # union together the dict values for b and a's storage.
+            # Hmm... morally I think we also want to keep the `fake_result` metadata
+            # up to date here, but I'm not sure how easy it is to do.
+            # Maybe it's fine to wait until the end of the pass to update it.
+            curr_node_storage = StorageWeakRef(node.meta['fake_result']._typed_storage())
+            storage_to_nodes[self_arg_storage].update(storage_to_nodes[curr_node_storage])
+            storage_to_nodes[curr_node_storage].update(storage_to_nodes[self_arg_storage])
+
+            # Need to remember the view_scatter nodes we found so we can remove them later.
+            all_later_view_inverse_nodes_to_delete.update(later_view_inverse_node_usages)
+
+            # Step 4:
+            # Now that we've replaced b = a.foo() with a.foo_(),
+            # We need to replace any later usages of "b" with "a"
+            for old in itertools.chain([node], later_view_inverse_node_usages):
+                new = old.args[0]
+                nodes_to_update = [n for n in old.users if n.meta['node_idx'] > node.meta['node_idx']]
+                for node_to_update in nodes_to_update:
+                    new_args = []
+                    args = node_to_update.args
+
+                    def replace_arg(a):
+                        if a == old:
+                            return new
+                        return a
+
+                    # First, replace usages of "b" with "a"
+                    node_to_update.args = tree_map_only(Node, replace_arg, node_to_update.args)
+                    node_to_update.kwargs = tree_map_only(Node, replace_arg, node_to_update.kwargs)
+
+                    # Second, update our storage_to_nodes data structure.
+                    old_flattened_res = pytree.tree_leaves(old.meta['fake_result'])
+                    node_flattened_res = pytree.tree_leaves(node_to_update.meta['fake_result'])
+
+                    old_res_storage = {
+                        StorageWeakRef(
+                            x._typed_storage()
+                        ) for x in old_flattened_res if isinstance(x, FakeTensor)}
+                    node_res_storage = {
+                        StorageWeakRef(
+                            x._typed_storage()
+                        ) for x in node_flattened_res if isinstance(x, FakeTensor)}
+
+                    # This will happen if we're updating a view op, e.g. replacing
+                    #     x = view(old)
+                    # with
+                    #     x = view(new)
+                    # When that happens, we need to make sure to keep our
+                    # storage mapping up to date.
+                    #
+                    # We're checking for len(...) == 1 here because all view ops are guaranteed to return either a single tensor,
+                    # or multiple tensors that all share the same storage.
+                    # We can't just check equality because we might encounter FX nodes that return zero tensor outputs.
+                    if len(old_res_storage) == 1 and len(node_res_storage) == 1 and old_res_storage == node_res_storage:
+                        new_flattened_res = pytree.tree_leaves(new.meta['fake_result'])
+                        new_res_storage = {
+                            StorageWeakRef(
+                                x._typed_storage()
+                            ) for x in new_flattened_res if isinstance(x, FakeTensor)}
+                        assert len(new_res_storage) == 1
+                        (old_ref,) = old_res_storage
+                        (new_ref,) = new_res_storage
+                        (node_ref,) = node_res_storage
+                        # Technically, "old_ref" and all its aliases will remain
+                        # in our mapping.
+                        # That should be fine though, since we deleted "old"
+                        # from the graph at this point.
+                        storage_to_nodes[node_ref].update(storage_to_nodes[new_ref])
+                        storage_to_nodes[new_ref].update(storage_to_nodes[node_ref])
+
+    # Step 5: delete any _scatter nodes that we de-functionalized
+    # Need to take care not to delete any of these nodes until after *all* modifications
+    # to the graph are finished.
+    for to_delete in all_later_view_inverse_nodes_to_delete:
+        gm.graph.erase_node(to_delete)
+
+
+    gm.recompile()
+    return gm
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/shape_prop.py b/MLPY/Lib/site-packages/torch/fx/passes/shape_prop.py
new file mode 100644
index 0000000000000000000000000000000000000000..63a665340c7b159cf6d50a4d3be888c6df40e1f8
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/passes/shape_prop.py
@@ -0,0 +1,195 @@
+# mypy: ignore-errors
+
+import torch
+import torch.fx
+import traceback
+
+from torch._dispatch.python import enable_python_dispatcher
+from torch.fx.node import Node, map_aggregate
+from typing import Any, Tuple, NamedTuple, Optional, Dict
+from torch.fx._compatibility import compatibility
+from torch._guards import detect_fake_mode
+
+__all__ = ['TensorMetadata', 'ShapeProp']
+
+@compatibility(is_backward_compatible=True)
+class TensorMetadata(NamedTuple):
+    # TensorMetadata is a structure containing pertinent information
+    # about a tensor within a PyTorch program.
+
+    # General Tensor metadata
+    shape : torch.Size
+    dtype : torch.dtype
+    requires_grad : bool
+    stride : Tuple[int, ...]
+    memory_format : Optional[torch.memory_format]
+
+    # Quantization metadata
+    is_quantized : bool
+    qparams: Dict[str, Any]
+
+def _extract_tensor_metadata(result : torch.Tensor, include_contiguity=True) -> TensorMetadata:
+    """
+    Extract a TensorMetadata NamedTuple describing `result`.
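+
+    A minimal illustrative sketch (the exact values depend on the input tensor;
+    shown here for a small contiguous float32 tensor):
+
+        t = torch.randn(2, 3)
+        meta = _extract_tensor_metadata(t)
+        # meta.shape == torch.Size([2, 3]), meta.dtype == torch.float32
+        # meta.stride == (3, 1), meta.memory_format == torch.contiguous_format
+        # meta.is_quantized is False and meta.qparams == {}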
+    """
+    shape = result.shape
+    dtype = result.dtype
+    requires_grad = result.requires_grad
+    stride = result.stride()
+
+    memory_format = None
+
+    if include_contiguity:
+        memory_formats = {
+            torch.contiguous_format,
+            torch.channels_last,
+            torch.channels_last_3d,
+        }
+        for query_format in memory_formats:
+            if result.is_contiguous(memory_format=query_format):
+                memory_format = query_format
+                break
+
+    is_quantized = result.is_quantized
+    qparams: Dict[str, Any] = {}
+    if is_quantized:
+        qscheme = result.qscheme()
+        qparams["qscheme"] = qscheme
+        if qscheme in {torch.per_tensor_affine, torch.per_tensor_symmetric}:
+            qparams["scale"] = result.q_scale()  # type: ignore[assignment]
+            qparams["zero_point"] = result.q_zero_point()  # type: ignore[assignment]
+        elif qscheme in {torch.per_channel_affine, torch.per_channel_affine_float_qparams, torch.per_channel_symmetric}:
+            # In this branch, scale and zero_point are expected to be tensors,
+            # we store the values as immutable_list in TensorMetadata for
+            # easier serialization downstream
+            qparams["scale"] = result.q_per_channel_scales().tolist()  # type: ignore[assignment]
+            qparams["zero_point"] = result.q_per_channel_zero_points().tolist()  # type: ignore[assignment]
+            qparams["axis"] = result.q_per_channel_axis()  # type: ignore[assignment]
+
+    return TensorMetadata(
+        shape, dtype, requires_grad, stride, memory_format, is_quantized, qparams)
+
+@compatibility(is_backward_compatible=True)
+class ShapeProp(torch.fx.Interpreter):
+    """
+    Execute an FX graph Node-by-Node and
+    record the shape and type of the result
+    into the corresponding node.
+
+    Example:
+         In this example, we record the shape
+         and data type of a module given
+         an example input ``torch.randn(50, D_in)``.
+         We print the name, shape and dtype of each node.
+
+        class TwoLayerNet(torch.nn.Module):
+            def __init__(self, D_in, H, D_out):
+                super().__init__()
+                self.linear1 = torch.nn.Linear(D_in, H)
+                self.linear2 = torch.nn.Linear(H, D_out)
+            def forward(self, x):
+                h_relu = self.linear1(x).clamp(min=0)
+                y_pred = self.linear2(h_relu)
+                return y_pred
+        N, D_in, H, D_out = 64, 1000, 100, 10
+        x = torch.randn(N, D_in)
+        y = torch.randn(N, D_out)
+        model = TwoLayerNet(D_in, H, D_out)
+        gm = torch.fx.symbolic_trace(model)
+        sample_input = torch.randn(50, D_in)
+        ShapeProp(gm).propagate(sample_input)
+
+        for node in gm.graph.nodes:
+            print(node.name, node.meta['tensor_meta'].dtype,
+                node.meta['tensor_meta'].shape)
+
+        The output of this code is:
+
+        x torch.float32 torch.Size([50, 1000])
+        linear1 torch.float32 torch.Size([50, 100])
+        clamp_1 torch.float32 torch.Size([50, 100])
+        linear2 torch.float32 torch.Size([50, 10])
+        output torch.float32 torch.Size([50, 10])
+
+    Args:
+         module (GraphModule): The module to be executed
+         fake_mode (FakeTensorMode): A fake mode for copying the gm
+
+    """
+    def __init__(self, gm, fake_mode=None):
+        super().__init__(gm)
+        if fake_mode is None:
+            fake_mode = detect_fake_mode()
+        if fake_mode is not None:
+            from torch._dynamo.utils import deepcopy_to_fake_tensor
+            # Note:
+            # We need fake execution because the inputs are fake; however, we cannot fakify the module
+            # itself, because we need to write tensor_meta onto the nodes of the real module. So we run
+            # a fake copy of the module to produce a result (in run_node below), extract the tensor
+            # metadata from it, and then keep going.
+            #
+            # If we fakified the module itself, we would write to the wrong nodes, and downstream fusion
+            # would be missing the tensor_meta.
+            #
+            # See torch/_inductor/overrides.py for where this is called upstream of fusion.
+            self.fake_module = deepcopy_to_fake_tensor(self.module, fake_mode)
+            self.fake_mode = fake_mode
+        else:
+            self.fake_module = None
+            self.fake_mode = None
+
+        self.real_module = self.module
+
+    def run_node(self, n : Node) -> Any:
+        try:
+            if self.fake_module is not None:
+                # Hacky swap. Alternatively, we could do this with overriding
+                # call_module and get_attr.
+                self.module = self.fake_module
+            try:
+                if self.fake_mode is not None:
+                    with self.fake_mode, enable_python_dispatcher():
+                        result = super().run_node(n)
+                else:
+                    result = super().run_node(n)
+            finally:
+                self.module = self.real_module
+        except Exception as e:
+            traceback.print_exc()
+            raise RuntimeError(
+                f"ShapeProp error for: node={n.format_node()} with "
+                f"meta={n.meta}"
+            ) from e
+
+        found_tensor = False
+
+        def extract_tensor_meta(obj):
+            if isinstance(obj, torch.Tensor):
+                nonlocal found_tensor
+                found_tensor = True
+                return _extract_tensor_metadata(obj)
+            else:
+                return obj
+
+        meta = map_aggregate(result, extract_tensor_meta)
+        if found_tensor:
+            n.meta['tensor_meta'] = meta
+
+        n.meta['type'] = type(result)
+        return result
+
+    def propagate(self, *args):
+        """
+        Run `module` via interpretation, recording the shape and type of
+        each node, and return the result.
+
+        Args:
+            *args (Tensor): the sample input.
+
+        Returns:
+            Any: The value returned from executing the Module
+        """
+        if self.fake_mode is not None:
+            fake_args = [self.fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t for t in args]
+        else:
+            fake_args = args
+        return super().run(*fake_args)
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/split_module.py b/MLPY/Lib/site-packages/torch/fx/passes/split_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..b36022dabfdfc7fe34f07f5e98cabb8c84d0bb04
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/passes/split_module.py
@@ -0,0 +1,514 @@
+import inspect
+from typing import Any, Callable, Dict, List, Optional, Set, TYPE_CHECKING
+from collections import OrderedDict
+import logging
+
+import torch
+from torch.fx._compatibility import compatibility
+from torch.fx.graph_module import GraphModule
+from torch.fx.node import Node
+
+if TYPE_CHECKING:
+    import sympy  # noqa: F401
+
+__all__ = ["Partition", "split_module"]
+_LOGGER = logging.getLogger(__name__)
+
+@compatibility(is_backward_compatible=True)
+class Partition:
+    def __init__(self, name: str):
+        self.name: str = name
+        self.submod_name = f"submod_{name}"
+        self.node_names: List[str] = []
+        self.inputs: Dict[str, None] = {}
+        self.outputs: Dict[str, None] = {}
+        self.dependencies: Dict[str, None] = {}
+        self.dependents: Dict[str, None] = {}
+        self.graph: torch.fx.graph.Graph = torch.fx.graph.Graph()
+        self.environment: Dict[Node, Node] = {}
+        self.targets: Dict[str, Any] = {}
+
+    def __repr__(self) -> str:
+        return (
+            f"name: {self.name},\n"
+            f" nodes: {self.node_names},\n"
+            f" inputs: {self.inputs},\n"
+            f" outputs: {self.outputs},\n"
+            f" partitions depended on: {self.dependencies},\n"
+            f" partition dependents: {self.dependents}"
+        )
+
+
+# Creates subgraphs out of main graph
+@compatibility(is_backward_compatible=True)
+def split_module(
+    m: GraphModule,
+    root_m: torch.nn.Module,
+    split_callback: Callable[[Node], int],
+    qualname_map: Optional[Dict[str, str]] = None,
+    keep_original_order: Optional[bool] = False,
+    keep_original_node_name: Optional[bool] = False,
+):
+    """
+    Creates subgraphs out of main graph
+
+    Args:
+        m (GraphModule): Graph module to split
+        root_m (torch.nn.Module): root nn module. Not currently used. Included
+            because the root nn module is usually transformed via
+            torch.fx._symbolic_trace.symbolic_trace (see example below)
+        split_callback (Callable[[Node], int]): Callable function
+            that maps a given Node instance to a numeric partition identifier.
+            split_module will use this function as the policy for which operations
+            appear in which partitions in the output Module.
+        qualname_map: Optional[Dict[str, str]]: optional output parameter that returns a
+            mapping from new target names in the module after split to old target
+            names in the original module.
+        keep_original_order: Optional[bool]: keep the original order of the GraphModule
+            or use the topological order of the newly constructed GraphModule
+
+
+    Returns:
+        GraphModule: the module after split.
+
+    Example:
+
+        This is a sample setup:
+
+            import torch
+            from torch.fx.symbolic_trace import symbolic_trace
+            from torch.fx.graph_module import GraphModule
+            from torch.fx.node import Node
+            from torch.fx.passes.split_module import split_module
+
+            class MyModule(torch.nn.Module):
+                def __init__(self):
+                    super().__init__()
+                    self.param = torch.nn.Parameter(torch.rand(3, 4))
+                    self.linear = torch.nn.Linear(4, 5)
+
+                def forward(self, x, y):
+                    z = self.linear(x + self.param).clamp(min=0.0, max=1.0)
+                    w = self.linear(y).clamp(min=0.0, max=1.0)
+                    return z + w
+
+            # symbolically trace model
+            my_module = MyModule()
+            my_module_traced = symbolic_trace(my_module)
+
+            # random mod partitioning
+            partition_counter = 0
+            NPARTITIONS = 3
+
+            def mod_partition(node: Node):
+                global partition_counter
+                partition = partition_counter % NPARTITIONS
+                partition_counter = (partition_counter + 1) % NPARTITIONS
+                return partition
+
+            # split module into a module with submodules
+            module_with_submodules = split_module(
+                my_module_traced, my_module, mod_partition
+            )
+
+        The output looks like this; the original graph is broken into partitions:
+
+            > print(module_with_submodules)
+            GraphModule(
+                (submod_0): GraphModule(
+                    (linear): Linear(in_features=4, out_features=5, bias=True)
+                )
+                (submod_1): GraphModule(
+                    (linear): Linear(in_features=4, out_features=5, bias=True)
+                )
+                (submod_2): GraphModule()
+            )
+
+            def forward(self, x, y):
+                param = self.param
+                submod_0 = self.submod_0(x, param, y);  x = param = y = None
+                getitem = submod_0[0]
+                getitem_1 = submod_0[1];  submod_0 = None
+                submod_1 = self.submod_1(getitem, getitem_1);  getitem = getitem_1 = None
+                getitem_2 = submod_1[0]
+                getitem_3 = submod_1[1];  submod_1 = None
+                submod_2 = self.submod_2(getitem_2, getitem_3);  getitem_2 = getitem_3 = None
+                return submod_2
+
+        The output of the split module is the same as the output of the input traced module.
+        This is an example within a test setting:
+
+            > orig_out = my_module_traced(x, y)
+            > submodules_out = module_with_submodules(x, y)
+            > self.assertEqual(orig_out, submodules_out)
+            True
+    """
+
+    def construct_graph(
+        node: Node,
+        base_mod_env: Dict[str, Node],
+        base_mod_attrs: Dict[str, torch.fx.graph_module.GraphModule],
+    ):
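+        # Copies a placeholder or get_attr node from the original module into the
+        # base (top-level) graph being constructed, recording the new node in
+        # base_mod_env and, for get_attr, the resolved attribute value in
+        # base_mod_attrs. Nodes with any other op are left untouched.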
+        if node.op == "placeholder":
+            default_value = (
+                node.args[0] if len(node.args) > 0 else inspect.Signature.empty
+            )
+            if keep_original_node_name:
+                args = () if default_value is inspect.Signature.empty else (default_value,)
+                base_mod_env[node.name] = base_mod_graph.create_node('placeholder', node.name, args=args, type_expr=node.type)
+            else:
+                base_mod_env[node.name] = base_mod_graph.placeholder(
+                    node.target, type_expr=node.type, default_value=default_value
+                )
+            base_mod_env[node.name].meta = node.meta.copy()
+        elif node.op == "get_attr":
+            base_mod_env[node.name] = base_mod_graph.get_attr(node.target)
+            base_mod_env[node.name].meta = node.meta.copy()
+            attr_val = m
+            for atom in node.target.split("."):  # type: ignore[union-attr]
+                if not hasattr(attr_val, atom):
+                    raise AttributeError(f"Node target {node.target} not found!")
+                attr_val = getattr(attr_val, atom)
+            base_mod_attrs[node.target] = attr_val  # type: ignore[index]
+        return base_mod_env, base_mod_attrs
+
+    partitions: Dict[str, Partition] = {}
+    orig_nodes: Dict[str, Node] = {}
+    symbol_to_node: Dict["sympy.Symbol", Node] = {}
+
+    def record_cross_partition_use(
+        def_node: Node, use_node: Optional[Node]
+    ):  # noqa: B950
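+        # When a value defined in one partition is used in a different partition
+        # (or by the top-level output, in which case use_node is None), record it
+        # as an output of the defining partition and an input of the using
+        # partition, and record the dependency edge between the two partitions.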
+        from torch.fx.experimental.symbolic_shapes import free_symbols
+
+        defined = getattr(def_node, "_fx_partition", None)
+        used = getattr(use_node, "_fx_partition", None)
+        if defined != used:
+            if defined is not None:
+                def_partition = partitions[defined]
+                def_partition.outputs.setdefault(def_node.name)
+                if used is not None:
+                    def_partition.dependents.setdefault(used)
+
+            if used is not None:
+                use_partition = partitions[used]
+                use_partition.inputs.setdefault(def_node.name)
+                if (def_val := def_node.meta.get("example_value")) is not None:
+                    for s in sorted(free_symbols(def_val), key=str):
+                        use_partition.inputs.setdefault(symbol_to_node[s].name)
+                if defined is not None:
+                    use_partition.dependencies.setdefault(defined)
+
+    def instantiate_node_partition_mapping(node):
+        partition_name = str(split_callback(node))
+
+        # add node to partitions
+        partition = partitions.get(partition_name)
+        if partition is None:
+            partitions[partition_name] = partition = Partition(partition_name)
+
+        partition.node_names.append(node.name)
+        node._fx_partition = partition_name
+
+    # Global State Nodes are nodes which, through their global state effects,
+    # "taint" all downstream nodes while they are active.
+    GLOBAL_STATE_NODES = [
+        torch.amp._enter_autocast,
+        torch.amp._exit_autocast,
+        torch._C._set_grad_enabled
+    ]
+
+    # For grad regions:
+    # ------------------------
+    # 1. first region: we do nothing
+    # 2. subsequent regions: we insert the set_grad at the beginning
+    grad_regions: OrderedDict[Node, Set[int]] = OrderedDict()
+
+    # For autocast regions:
+    # ------------------------
+    # 1. first region: we will only insert the _exit at the end
+    # 2. intermediate regions: we will insert both the
+    #    _enter at the beginning and _exit at the end
+    # 3. last region: we will only insert _enter at the beginning
+    # We will do so in the order in which the autocasts were instantiated.
+    autocast_regions: OrderedDict[Node, Set[int]] = OrderedDict()
+    autocast_exits: Dict[Node, Optional[Node]] = {}
+
+    active_grad = None
+    active_autocasts = set()
+
+    import sympy  # noqa: F811
+
+    for node in m.graph.nodes:
+        if node.op in ["placeholder", "get_attr", "output"]:
+            if (
+                node.op == "placeholder" and
+                (val := node.meta.get("example_value")) is not None and
+                isinstance(val, torch.SymInt) and
+                isinstance(val.node.expr, sympy.Symbol)
+            ):
+                symbol_to_node[val.node.expr] = node
+            continue
+
+        instantiate_node_partition_mapping(node)
+
+        if node.op == "call_function" and node.target in GLOBAL_STATE_NODES:
+            if node.target == torch._C._set_grad_enabled:
+                assert len(node.args) == 1
+                assert isinstance(node.args[0], bool)
+                active_grad = node
+                grad_regions[active_grad] = set({split_callback(node)})
+            elif node.target == torch.amp._enter_autocast:
+                # Should all be python constants
+                assert all(not isinstance(arg, Node) for arg in node.args)
+                active_autocasts.add(node)
+                autocast_regions[node] = set({split_callback(node)})
+                autocast_exits[node] = None
+            elif node.target == torch.amp._exit_autocast:
+                assert len(node.args) == 1
+                autocast_regions[node.args[0]].add(split_callback(node))
+                active_autocasts.remove(node.args[0])
+                autocast_exits[node.args[0]] = node
+
+        if active_grad is not None:
+            grad_regions[active_grad].add(split_callback(node))
+
+        for a in active_autocasts:
+            autocast_regions[a].add(split_callback(node))
+
+    assert all(v is not None for v in autocast_exits.values()), "autocast must exit"
+
+    autocast_regions = {k: sorted(v) for k, v in autocast_regions.items()}
+    grad_regions = {k: sorted(v) for k, v in grad_regions.items()}
+
+    if _LOGGER.isEnabledFor(logging.DEBUG):
+        _LOGGER.debug("autocast_regions: %s", autocast_regions)
+        _LOGGER.debug("grad_regions: %s", grad_regions)
+
+    assert_monotonically_increasing = bool(autocast_regions) or bool(grad_regions)
+
+    # split nodes into partitions
+    highest_partition = -1
+    for node in m.graph.nodes:
+        orig_nodes[node.name] = node
+
+        # TODO currently placeholders/parameters aren't put into random partitions,
+        # rather they're added to the graphs where they are used down below
+        if node.op in ["placeholder", "get_attr"]:
+            continue
+        if node.op == "output":
+            torch.fx.graph.map_arg(
+                node.args[0], lambda n: record_cross_partition_use(n, None)
+            )
+            continue
+
+        if assert_monotonically_increasing:
+            pid = split_callback(node)
+            assert highest_partition <= pid, \
+                ("autocast or set_grad_enabled require monotonically increasing partitions:"
+                 f"highest: {highest_partition}, this node's: {pid}")
+            highest_partition = pid
+
+        # do not capture cross-partition dependencies for global state nodes as they will be
+        # self-contained - their setup and unwind will be isolated to each partition submodule.
+        if node.target not in GLOBAL_STATE_NODES:
+            torch.fx.graph.map_arg(
+                node.args, lambda def_node: record_cross_partition_use(def_node, node)
+            )
+            torch.fx.graph.map_arg(
+                node.kwargs, lambda def_node: record_cross_partition_use(def_node, node)
+            )  # noqa: B950
+
+    original_partition_order = list(partitions.keys())
+    # find partitions with no dependencies
+    root_partitions: List[str] = []
+    for partition_name, partition in partitions.items():
+        if not len(partition.dependencies):
+            root_partitions.append(partition_name)
+
+    # check partitions for circular dependencies and create topological partition ordering
+    sorted_partitions: List[str] = []
+    while root_partitions:
+        root_partition = root_partitions.pop()
+        sorted_partitions.append(root_partition)
+        for dependent in partitions[root_partition].dependents:
+            partitions[dependent].dependencies.pop(root_partition)
+            if not partitions[dependent].dependencies:
+                root_partitions.append(dependent)
+    if len(sorted_partitions) != len(partitions):
+        raise RuntimeError("cycle exists between partitions!")
+
+    # Enter prelude
+    for regions_mapping in [autocast_regions, grad_regions]:
+        for node, regions in regions_mapping.items():
+            assert len(regions) > 0
+            partitions[str(regions[0])].environment[node] = node
+            for r in regions[1:]:
+                partition = partitions[str(r)]
+                new_node = partition.graph.create_node(
+                    op=node.op,
+                    target=node.target,
+                    args=tuple(arg for arg in node.args),
+                    kwargs={},
+                    type_expr=node.type,
+                )
+                new_node.meta = node.meta.copy()  # is it really a good idea to copy this?
+                partition.environment[node] = new_node
+
+    # add placeholders to partition inputs
+    for partition_name in sorted_partitions:
+        partition = partitions[partition_name]
+        for inp in partition.inputs:
+            placeholder = partition.graph.placeholder(
+                inp,
+                type_expr=orig_nodes[inp].type,
+            )
+            placeholder.meta = orig_nodes[inp].meta.copy()
+            partition.environment[orig_nodes[inp]] = placeholder
+
+    # Transform nodes and collect targets for partition's submodule
+    for node in m.graph.nodes:
+        if hasattr(node, "_fx_partition"):
+            partition = partitions[node._fx_partition]
+
+            # swap out old graph nodes in kw/args with references to new nodes in this submodule
+            environment = partition.environment
+            gathered_args = torch.fx.graph.map_arg(node.args, lambda n: environment[n])
+            gathered_kwargs = torch.fx.graph.map_arg(
+                node.kwargs, lambda n: environment[n]
+            )
+
+            if node.op not in ["call_module", "get_attr"]:
+                target = node.target
+            else:
+                target_atoms = node.target.split(".")
+                target_attr = m
+                for atom in target_atoms:
+                    if not hasattr(target_attr, atom):
+                        raise AttributeError(f"Operator target {node.target} not found!")
+                    target_attr = getattr(target_attr, atom)
+                # target = target_atoms[-1]
+                target = "_".join(target_atoms)
+                partition.targets[target] = target_attr
+                # Fill in the passed-in mapping from new qualname to old qualname
+                if qualname_map is not None:
+                    # When creating the split module later, the submodules will have
+                    # path prefix matching the corresponding partition's submod_name
+                    qualname = f"{partition.submod_name}.{target}"
+                    qualname_map[qualname] = node.target
+
+            assert isinstance(gathered_args, tuple)
+            assert isinstance(gathered_kwargs, dict)
+            name = node.name if keep_original_node_name else None
+            new_node = partition.graph.create_node(
+                op=node.op,
+                target=target,
+                args=gathered_args,
+                kwargs=gathered_kwargs,
+                type_expr=node.type,
+                name=name,
+            )
+            new_node.meta = node.meta.copy()
+            partition.environment[node] = new_node
+
+    # Exit epilogue
+    for regions_mapping in [autocast_regions]:
+        for node in reversed(regions_mapping):
+            regions = regions_mapping[node]
+            assert len(regions) > 0
+            for r in regions[:-1]:
+                partition = partitions[str(r)]
+                exit_node = autocast_exits[node]
+                assert exit_node is not None, "Missing exit node"
+                new_node = partition.graph.create_node(
+                    op=exit_node.op,
+                    target=exit_node.target,
+                    args=(partition.environment[node],),
+                    kwargs={},
+                    type_expr=exit_node.type,
+                )
+                new_node.meta = exit_node.meta.copy()  # is it really a good idea to copy this?
+
+    # original module environment dict mapping node names to nodes
+    orig_mod_env: Dict[str, Node] = {}
+    # Set up values to construct base module
+    base_mod_env: Dict[str, Node] = {}
+    base_mod_graph: torch.fx.graph.Graph = torch.fx.graph.Graph()
+    base_mod_attrs: Dict[str, torch.fx.graph_module.GraphModule] = {}
+    if not keep_original_order:
+        for node in m.graph.nodes:
+            base_mod_env, base_mod_attrs = construct_graph(
+                node, base_mod_env, base_mod_attrs
+            )
+
+    else:
+        # Go through the graph to construct the mapping dict
+        for node in m.graph.nodes:
+            orig_mod_env[node.name] = node
+
+    # Do some things iterating over the partitions in topological order again:
+    # 1) Finish off submodule Graphs by setting corresponding outputs
+    # 2) Construct GraphModules for each submodule
+    # 3) Construct the base graph by emitting calls to those submodules in
+    #    topological order or original order specified by keep_original_order
+
+    construct_order_partitions = (
+        sorted_partitions if not keep_original_order else original_partition_order
+    )
+
+    already_constructed_attr_nodes = set()
+    for partition_name in construct_order_partitions:
+        partition = partitions[partition_name]
+
+        # Set correct output values
+        output_vals = tuple(
+            partition.environment[orig_nodes[name]] for name in partition.outputs
+        )
+
+        # skip output node generation if there are no output values
+        num_output_vals = len(output_vals)
+        if num_output_vals == 1:
+            partition.graph.output(output_vals[0])
+        elif num_output_vals > 1:
+            partition.graph.output(output_vals)
+
+        if keep_original_order:
+            # first get the attr nodes required by this partition
+            orig_mod_attr_nodes: List[Node] = [
+                orig_mod_env[key] for key in partition.inputs
+            ]
+            # Construct GraphModule for this partition
+            for node in orig_mod_attr_nodes:  # type: ignore[attr-defined]
+                if node in already_constructed_attr_nodes:
+                    continue
+                base_mod_env, base_mod_attrs = construct_graph(
+                    node, base_mod_env, base_mod_attrs
+                )
+                already_constructed_attr_nodes.add(node)
+
+        base_mod_attrs[partition.submod_name] = torch.fx.graph_module.GraphModule(
+            partition.targets, partition.graph
+        )  # noqa: B950
+
+        # Emit call in base graph to this submodule
+        output_val = base_mod_graph.call_module(
+            partition.submod_name,
+            tuple(base_mod_env[name] for name in partition.inputs),
+        )
+
+        num_outputs = len(partition.outputs)
+        if num_outputs > 1:
+            # Unpack multiple return values from submodule
+            output_val_proxy = torch.fx.proxy.Proxy(output_val)
+            for i, output_name in enumerate(partition.outputs):
+                base_mod_env[output_name] = output_val_proxy[i].node  # type: ignore[index]
+        elif num_outputs == 1:
+            base_mod_env[next(iter(partition.outputs))] = output_val
+
+    for node in m.graph.nodes:
+        if node.op == "output":
+            base_mod_graph.output(
+                torch.fx.graph.map_arg(node.args[0], lambda n: base_mod_env[n.name])
+            )  # noqa: B950
+
+    return torch.fx.graph_module.GraphModule(base_mod_attrs, base_mod_graph)
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/split_utils.py b/MLPY/Lib/site-packages/torch/fx/passes/split_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..f682a42dbb91b6b8b88f7b3f9e854724b718e30a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/passes/split_utils.py
@@ -0,0 +1,302 @@
+import copy
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional, Tuple, Type, Union
+
+import torch.fx
+from torch.fx._compatibility import compatibility
+from torch.fx.graph import map_arg
+from torch.fx.passes.utils import HolderModule, lift_subgraph_as_module
+
+from .tools_common import NodeList
+
+__all__ = ["getattr_recursive", "setattr_recursive", "Component", "split_by_tags"]
+
+
+@compatibility(is_backward_compatible=False)
+def getattr_recursive(obj, name):
+    for layer in name.split("."):
+        if hasattr(obj, layer):
+            obj = getattr(obj, layer)
+        else:
+            return None
+    return obj
+
+
+@compatibility(is_backward_compatible=False)
+def setattr_recursive(obj, attr, value):
+    if "." not in attr:
+        setattr(obj, attr, value)
+    else:
+        layer = attr.split(".")
+        setattr_recursive(getattr(obj, layer[0]), ".".join(layer[1:]), value)
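+
+# Illustrative usage sketch (assumes a hypothetical model with a nested submodule
+# reachable at the dotted path "encoder.linear"):
+#
+#   layer = getattr_recursive(model, "encoder.linear")  # model.encoder.linear, or None if missing
+#   setattr_recursive(model, "encoder.linear", torch.nn.Identity())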
+
+
+@compatibility(is_backward_compatible=False)
+@dataclass
+class Component:
+    """
+    A component serves as a container for a subgraph we want to create afterwards.
+    """
+
+    graph: torch.fx.Graph
+    order: int
+    name: str
+
+    # Stores the placeholder nodes in `graph`.
+    input_placeholders: List = field(default_factory=list)
+
+    # Stores the nodes in the original graph that are placeholders in `graph`.
+    orig_inputs: List = field(default_factory=list)
+
+    # Stores the nodes in the original graph that are outputs in `graph`.
+    orig_outputs: List = field(default_factory=list)
+
+    # Mapping from get_attr node in original graph to get_attr node in `graph`.
+    getattr_maps: Dict[torch.fx.Node, torch.fx.Node] = field(default_factory=dict)
+    constructor_args: List[str] = field(default_factory=list)
+    gm: Optional[torch.fx.GraphModule] = None
+
+
+@compatibility(is_backward_compatible=False)
+def split_by_tags(
+    gm: torch.fx.GraphModule,
+    tags: List[str],
+    return_fqn_mapping: bool = False,
+    return_tuple: bool = False,
+    GraphModuleCls: Type[torch.fx.GraphModule] = torch.fx.GraphModule,
+) -> Union[torch.fx.GraphModule, Tuple[torch.fx.GraphModule, Dict[str, str]]]:
+    """
+    Splits a GraphModule using tags on its graph nodes. We honor the order of
+    tags. For example, we have tags = ["a", "b", "c"], the function will create
+    the initial submodules in the order of "a", "b", "c".
+
+    To set a tag:
+    gm.graph.nodes[idx].tag = "mytag"
+
+    This will result in all nodes with the same tag being extracted and placed in their
+    own submodule. For placeholder, output and get_attr nodes, the tag is ignored. Placeholder
+    and output nodes are created when needed, while get_attr nodes get copied to the submodules
+    where they are used.
+
+    Given the following module def:
+
+    class SimpleModule(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.linear1 = torch.nn.Linear(...)
+            self.linear2 = torch.nn.Linear(...)
+            self.linear3 = torch.nn.Linear(...)
+
+        def forward(self, in1, in2):
+            r1 = self.linear1(in1)
+            r2 = self.linear2(in2)
+            r3 = torch.cat([r1, r2])
+            return self.linear3(r3)
+
+    Marking the node corresponding to in1 with the tag sc.REQUEST_ONLY.lower() results in the following split:
+
+    ro:
+    def forward(self, in1):
+        self = self.root
+        linear1 = self.linear1(in1)
+        return linear1
+
+    main:
+    def forward(self, in2, linear1):
+        self = self.root
+        linear2 = self.linear2(in2)
+        cat_1 = torch.cat([linear1, linear2])
+        linear3 = self.linear3(cat_1)
+        return linear3
+
+    The resulting base (top-level) module after split:
+    def forward(self, in1, in2):
+        self = self.root
+        ro_0 = self.ro_0(in1)
+        main_1 = self.main_1(in2, ro_0)
+        return main_1
+
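+    A minimal call sketch, assuming `gm` is the symbolically traced SimpleModule above
+    (every call_module/call_function/call_method node must carry a tag before calling
+    split_by_tags; using the single tag "main" here is illustrative and puts everything
+    into one submodule):
+
+        for node in gm.graph.nodes:
+            if node.op in ("call_module", "call_function", "call_method"):
+                node.tag = "main"
+        split_gm = split_by_tags(gm, ["main"])
+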
+    Returns:
+        split_gm: torch fx graph after split
+        orig_to_split_fqn_mapping: a map between the original fqn and the fqn
+            after split for call_module and get_attr.
+    """
+
+    def flatten(x: torch.fx.node.Argument) -> NodeList:
+        """
+        Stores nodes in x to a list and returns the list.
+        """
+        r: NodeList = []
+        map_arg(x, r.append)
+        return r
+
+    # Mapping from node in original module to node in created submodule.
+    node_remapping: Dict[torch.fx.Node, torch.fx.Node] = {}
+
+    # Mapping from node in original module or created submodules to
+    # corresponding component.
+    node_to_component: Dict[torch.fx.Node, Component] = {}
+
+    # Mapping from tag to the corresponding component.
+    tag_to_component: Dict[str, Component] = {}
+
+    # Stores all components.
+    all_components: List[Component] = []
+
+    # Stores nodes that will be used in main graph.
+    used_in_main: Dict[torch.fx.Node, None] = {}
+
+    # Main graph after split.
+    main_g = torch.fx.Graph()
+
+    # Mapping from node in original module to node in main graph after split.
+    main_remapping: Dict[torch.fx.Node, torch.fx.Node] = {}
+
+    # Output node of original module.
+    output_node: Optional[torch.fx.Node] = None
+
+    # Create a component for each tag; we don't expect to create other components afterwards.
+    for tag in tags:
+        comp = Component(torch.fx.Graph(), len(all_components), f"{tag}")
+        all_components.append(comp)
+        tag_to_component[tag] = comp
+
+    # Traverse the nodes in original graph and take care of them.
+    for node in gm.graph.nodes:
+        if node.op == "output":
+            if output_node is not None:
+                raise RuntimeError("Multiple output nodes in graph!")
+            output_node = node
+            continue
+
+        # Placeholders in the original graph get copied to main graph.
+        if node.op == "placeholder":
+            main_remapping[node] = main_g.placeholder(node.name, type_expr=node.type)
+            main_remapping[node].meta = copy.copy(node.meta)
+            continue
+
+        # Get_attr nodes are ignored because we are not tagging them.
+        # Instead, we copy them directly to the submodules that use them afterwards.
+        if node.op == "get_attr":
+            continue
+
+        # Now we process callable nodes, which are nodes with op of call_module,
+        # call_function or call_method. Every callable node should be tagged.
+        assert hasattr(node, "tag")
+
+        upstream_components = [
+            node_to_component[x]
+            for x in flatten(node.args) + flatten(node.kwargs)
+            if x.op not in {"placeholder", "get_attr"}
+        ]
+
+        comp = tag_to_component[node.tag]
+        node_to_component[node] = comp
+
+        # Max order of upstream components.
+        mx = max((c.order for c in upstream_components), default=0)
+
+        # Expect the component for `node` to have an order no lower than that of its upstream components.
+        assert comp.order >= mx
+
+        # Map an input of `node` to the corresponding node in the component's graph.
+        def remap_func(x):
+            # If input is a get_attr node, copy it to current component's graph.
+            # Returns the get_attr node in current component's graph.
+            if x.op == "get_attr":
+                if x not in comp.getattr_maps:
+                    comp.getattr_maps[x] = comp.graph.get_attr(
+                        x.target, type_expr=x.type
+                    )
+                return comp.getattr_maps[x]
+
+            # If input is not a placeholder, it should have been put into a component
+            # already. If it's the current component then we return the corresponding
+            # node in the component.
+            if x.op != "placeholder" and node_to_component[x] == comp:
+                return node_remapping[x]
+
+            # If input is a placeholder or it's in another component, we want to make it
+            # a placeholder in the current component's graph.
+            if x not in comp.orig_inputs:
+                comp.orig_inputs.append(x)
+                placeholder = comp.graph.placeholder(x.name, type_expr=x.type)
+                placeholder.meta = copy.copy(x.meta)
+                comp.input_placeholders.append(placeholder)
+                used_in_main[x] = None
+
+            return comp.input_placeholders[comp.orig_inputs.index(x)]
+
+        n = comp.graph.node_copy(node, remap_func)
+        n.tag = node.tag  # type: ignore[attr-defined]
+        node_remapping[node] = n
+        node_to_component[n] = comp
+
+    if output_node is None:
+        raise RuntimeError("Graph had no output node!")
+
+    for x in flatten(output_node.args[0]):
+        if x.op == "get_attr":
+            # We don't need components mapping for nodes of type "get_attr"
+            # that are consumed by the output. Only need to make sure we create
+            # corresponding counterparts in the resulting graph.
+            main_remapping[x] = main_g.get_attr(x.name, type_expr=x.type)
+        else:
+            # All component results consumed by the output node should be
+            # marked as "used in main".
+            used_in_main[x] = None
+
+    # If a node is used in main graph then we mark it as an output in the component
+    # it belongs to.
+    for n in used_in_main:
+        if n.op != "placeholder":
+            node_to_component[n].orig_outputs.append(n)
+
+    # Now we create a graphmodule for each component.
+    orig_to_split_fqn_mapping: Dict[str, str] = {}
+    for comp in all_components:
+        outs = tuple(map(node_remapping.__getitem__, comp.orig_outputs))
+
+        if return_tuple:
+            comp.graph.output(outs)
+        else:
+            # Take care of the args of the FX output node. If there's a single
+            # output then the output node's args look like (output_single); if
+            # there are multiple outputs then they look like
+            # ((output_0, output_1, ...)).
+            comp.graph.output(outs[0] if len(outs) == 1 else outs)
+
+        comp.gm, comp_orig_to_split_fqn_mapping = lift_subgraph_as_module(
+            gm, subgraph=comp.graph, comp_name=comp.name
+        )
+        orig_to_split_fqn_mapping.update(comp_orig_to_split_fqn_mapping)
+
+        # Create a call_module node in main graph.
+        main_node = main_g.call_module(
+            comp.name,
+            args=tuple(map(main_remapping.__getitem__, comp.orig_inputs)),
+            kwargs=None,
+        )
+
+        if len(outs) == 1 and not return_tuple:
+            main_remapping[comp.orig_outputs[0]] = main_node
+        else:
+            for i, o in enumerate(comp.orig_outputs):
+                # Use Proxy to record getitem access.
+                main_remapping[o] = torch.fx.Proxy(main_node)[i].node  # type: ignore[index]
+
+    main_g.output(map_arg(output_node.args[0], main_remapping.__getitem__))
+    main_root = HolderModule({comp.name: comp.gm for comp in all_components})
+    main_g._codegen = gm.graph._codegen
+
+    # If the output nodes consumes get_attr directly in the original graph,
+    # then we need to make sure get_attr is copied to the new graph.
+    for x in flatten(output_node.args[0]):
+        if x.op == "get_attr":
+            setattr(main_root, x.name, getattr_recursive(gm, x.target))  # type: ignore[arg-type]
+
+    result_gm = GraphModuleCls(main_root, main_g)
+    if return_fqn_mapping:
+        return result_gm, orig_to_split_fqn_mapping
+
+    return result_gm
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/splitter_base.py b/MLPY/Lib/site-packages/torch/fx/passes/splitter_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2e4d93f99b56e1ba161a3b5a17d395f6a6c8e18
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/passes/splitter_base.py
@@ -0,0 +1,871 @@
+import argparse
+import copy
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import NamedTuple, Sequence, Iterable, Any, List, Dict, Optional, Tuple
+import logging
+
+import torch
+from torch.fx.passes.graph_manipulation import get_size_of_node
+from torch.fx.node import map_arg
+from torch.fx._compatibility import compatibility
+
+from .operator_support import (
+    get_node_target,
+    OperatorSupportBase,
+)
+from .graph_drawer import FxGraphDrawer
+from .shape_prop import ShapeProp
+from .split_utils import split_by_tags
+from .tools_common import (
+    FxNetAccFusionsFinder,
+    CALLABLE_NODE_OPS,
+    Tensors,
+    NodeList,
+    NodeSet,
+    is_node_output_tensor,
+)
+
+
+__all__ = ['FxNetAccNodesFinder', 'FxNetSplitterInternalError', 'Subgraph', 'SplitResult', 'generate_inputs_for_submodules']
+_LOGGER = logging.getLogger(__name__)
+
+DEFAULT_MIN_ACC_MODULE_SIZE = 1
+DEFAULT_SKIP_FUSION = False
+DEFAULT_ALLOW_NON_TENSOR = False
+
+class _SplitterSettingBase:
+    def __init__(
+        self,
+        min_acc_module_size=DEFAULT_MIN_ACC_MODULE_SIZE,
+        skip_fusion=DEFAULT_SKIP_FUSION,
+        allow_non_tensor=DEFAULT_ALLOW_NON_TENSOR
+    ):
+        parser = argparse.ArgumentParser()
+        parser.add_argument(
+            "--min-acc-module-size",
+            "--min_acc_module_size",
+            required=False,
+            type=int,
+            help="Minimum size limit of an accelerator subgraph.",
+        )
+        parser.add_argument(
+            "--skip-fusion",
+            "--skip_fusion",
+            default=False,
+            action="store_true",
+            help="If true then no fusion groups. Fusion group is used to "
+            "enforce no non-tensor data flow between submodules. If we don't "
+            "have this constrain, setting this to false is recommended as it "
+            "can reduce overhead.",
+        )
+        parser.add_argument(
+            "--allow-non-tensor",
+            "--allow_non_tensor",
+            default=False,
+            action="store_true",
+            help="For some backends non-tensor data flow between cpu and them "
+            "are not allowed. Therefore, if a node supported by accelerator but "
+            "it has non-tensor inputs or outputs to a cpu node we would want to "
+            "consider it as a cpu node during splitting. However, for some backends "
+            "we might not care about non-tensor data flow and we can set this option "
+            "to true to disable the functionality that prevent non-tensor data flow.",
+        )
+        args, unknown = parser.parse_known_args()
+
+        self.min_acc_module_size: int = args.min_acc_module_size if args.min_acc_module_size else min_acc_module_size
+        self.skip_fusion: bool = args.skip_fusion if args.skip_fusion else skip_fusion
+        self.allow_non_tensor: bool = args.allow_non_tensor if args.allow_non_tensor else allow_non_tensor
+
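+# Illustrative sketch (not part of the upstream file): constructing the settings
+# object programmatically. Any recognized command-line flags passed to the host
+# process (e.g. --min-acc-module-size) take precedence over these keyword
+# arguments, because the constructor calls parse_known_args().
+def _example_splitter_settings():  # pragma: no cover
+    settings = _SplitterSettingBase(min_acc_module_size=2, skip_fusion=True)
+    return settings.min_acc_module_size, settings.skip_fusion, settings.allow_non_tensor
+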
+
+@compatibility(is_backward_compatible=False)
+class FxNetAccNodesFinder:
+    """
+    Finds a set of nodes that can be supported on ACC, excluding nodes that have non-tensor
+    input/output to cpu nodes to prevent non-tensor data flow between backends and cpu.
+
+    I.e. if we have a chain:
+
+    ACC_NODE_1 -> ACC_NODE_2 -> ACC_NODE_3 -> CPU_NODE_1
+
+    where every ACC node produces non-tensor output, then they all should be treated as CPU nodes.
+
+    This behavior can be turned off by passing allow_non_tensor=True.
+    """
+
+    def __init__(
+        self,
+        module: torch.fx.GraphModule,
+        operator_support: OperatorSupportBase,
+        allow_non_tensor: bool,
+    ):
+        self.module = module
+        self.operator_support = operator_support
+        self.allow_non_tensor = allow_non_tensor
+
+    def reduce_acc_nodes_non_tensor_input_helper(
+        self, cpu_worklist: NodeList
+    ):
+        """
+        Transitively excludes nodes from ACC supported set.
+        For every node in the worklist:
+        - removes its downstream ACC nodes from ACC supported set,
+        - if any downstream ACC node produces non-tensor output,
+          then it gets added into the worklist.
+        """
+        while cpu_worklist:
+            node = cpu_worklist.pop(0)
+
+            for user in node.users:
+                if user in self.acc_nodes:
+                    self.acc_nodes.remove(user)
+                    if not is_node_output_tensor(user):
+                        cpu_worklist.append(user)
+
+    def reduce_acc_nodes_non_tensor_input(self):
+        """
+        Excludes nodes from ACC supported set that have direct
+        upstream CPU nodes that produce non-tensor outputs.
+        """
+        non_tensor_cpu_nodes: NodeList = []
+
+        for node in self.module.graph.nodes:
+            if node.op not in CALLABLE_NODE_OPS:
+                continue
+            if node in self.acc_nodes:
+                continue
+            if is_node_output_tensor(node):
+                continue
+            non_tensor_cpu_nodes.append(node)
+
+        self.reduce_acc_nodes_non_tensor_input_helper(non_tensor_cpu_nodes)
+
+    def reduce_acc_nodes_non_tensor_output(self):
+        """
+        Excludes nodes from ACC supported set that produce non-tensor
+        outputs and have downstream CPU nodes.
+        """
+        while True:
+            new_cpu_nodes: NodeList = []
+
+            for acc_node in self.acc_nodes:
+                if is_node_output_tensor(acc_node):
+                    continue
+                for user in acc_node.users:
+                    if user not in self.acc_nodes:
+                        new_cpu_nodes.append(acc_node)
+                        break
+
+            if not new_cpu_nodes:
+                break
+
+            for new_cpu_node in new_cpu_nodes:
+                self.acc_nodes.remove(new_cpu_node)
+
+            self.reduce_acc_nodes_non_tensor_input_helper(new_cpu_nodes)
+
+    def __call__(self) -> NodeSet:
+        submodules = dict(self.module.named_modules())
+        self.acc_nodes = {
+            n
+            for n in self.module.graph.nodes
+            if n.op in CALLABLE_NODE_OPS
+            and self.operator_support.is_node_supported(submodules, n)
+        }
+
+        if not self.allow_non_tensor:
+            self.reduce_acc_nodes_non_tensor_input()
+            self.reduce_acc_nodes_non_tensor_output()
+
+        return self.acc_nodes
+
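+# Illustrative sketch (not part of the upstream file): one way the finder above
+# might be invoked. `gm` would be a shape-propagated GraphModule and `op_support`
+# any OperatorSupportBase implementation; both are placeholders here.
+def _example_find_acc_nodes(gm: torch.fx.GraphModule, op_support: OperatorSupportBase) -> NodeSet:  # pragma: no cover
+    finder = FxNetAccNodesFinder(gm, op_support, allow_non_tensor=False)
+    return finder()
+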
+@compatibility(is_backward_compatible=False)
+class FxNetSplitterInternalError(Exception):
+    pass
+
+@compatibility(is_backward_compatible=False)
+@dataclass
+class Subgraph:
+    is_acc: bool
+    nodes: NodeList
+
+
+@compatibility(is_backward_compatible=False)
+class SplitResult(NamedTuple):
+    """
+    Stores the results of the splitter.
+
+    Attributes:
+        split_module: root module after splitting.
+        submodule_inputs: a dict that maps submodule name to its inputs.
+        non_acc_submodule_prefix: the prefix for non-acc submodules. For
+            acc submodules the prefix is always "_run_on_acc_".
+    """
+
+    split_module: torch.fx.GraphModule
+    submodule_inputs: Dict[str, Any]
+    non_acc_submodule_prefix: str
+
+
+@compatibility(is_backward_compatible=False)
+def generate_inputs_for_submodules(
+    model: torch.nn.Module,
+    inputs: Sequence[Any],
+    target_submodules: Iterable[str],
+    deepcopy: bool = False,
+) -> Dict[str, Any]:
+    """
+    Generate inputs for the targeted submodules in the given model. Note that if two submodule names refer to the same
+    object, this function doesn't work.
+
+    Args:
+        model: root model.
+        inputs: inputs to the root model.
+        target_submodules: submodules that we want to generate inputs for.
+
+    Returns:
+        A dict that maps from submodule name to its inputs.
+    """
+
+    handles = []
+    results = {}
+    submodule_to_names = {mod: name for name, mod in model.named_modules()}
+
+    def pre_forward(module, module_inputs):
+        results[submodule_to_names[module]] = copy.deepcopy(module_inputs) if deepcopy else module_inputs
+
+    for name, mod in model.named_modules():
+        if name in target_submodules:
+            handles.append(mod.register_forward_pre_hook(pre_forward))
+
+    def clean_up_handles():
+        for h in handles:
+            h.remove()
+
+    try:
+        with torch.no_grad():
+            model(*inputs)
+    except Exception as e:
+        clean_up_handles()
+        raise e
+
+    clean_up_handles()
+    return results
+
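+# Illustrative sketch (not part of the upstream file): capturing the inputs seen
+# by a named submodule during one forward pass. The submodule name used here is
+# hypothetical.
+def _example_capture_submodule_inputs(model: torch.nn.Module, sample_inputs: Sequence[Any]) -> Dict[str, Any]:  # pragma: no cover
+    # Returns e.g. {"_run_on_acc_0": (tensor_0, ...)} for the requested names.
+    return generate_inputs_for_submodules(model, sample_inputs, ["_run_on_acc_0"], deepcopy=True)
+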
+
+class _SplitterBase:
+    """
+    Splits a GraphModule into sub-GraphModules for execution on CPU or the accelerator.
+    Output is a GraphModule with supported and unsupported operators grouped into as few sub-GraphModules as possible.
+    Assumes that only "call_module", "call_function" and "call_method" from FX IR can potentially be executed on the accelerator.
+
+    Given the following graph:
+          ==> b ==>
+        //         \\
+       a             d
+        \\         //
+          ==> c ==>
+
+    class SimpleModule(torch.nn.Module):
+        def forward(self, a):
+            b = torch.sin(a)
+            c = torch.cos(a)
+            d = b + c
+            return d
+
+    and providing "operator_support" that indicates that 'b' and 'c' can be executed on the accelerator,
+    we will get the following split result:
+
+    main:
+    def forward(self, a):
+        run_on_acc_0_0 = self._run_on_acc_0_0(a)
+        getitem = run_on_acc_0_0[0]
+        getitem_1 = run_on_acc_0_0[1]
+        run_on_cpu_1_1 = self._run_on_cpu_1_1(getitem, getitem_1)
+        return run_on_cpu_1_1
+
+    _run_on_acc_0_0:
+    def forward(self, a):
+        sin_1 = torch.sin(a)
+        cos_1 = torch.cos(a)
+        return (sin_1, cos_1)
+
+    _run_on_cpu_1_1:
+    def forward(self, sin_1, cos_1):
+        add_1 = sin_1 + cos_1
+        return add_1
+    """
+
+    # PCIe bandwidth for the backend, default to 100 GB/s
+    PCIe_BW = 100 * 2 ** 30
+
+    def __init__(
+        self,
+        module: torch.fx.GraphModule,
+        sample_input: Sequence[Any],
+        operator_support: OperatorSupportBase,
+        settings: _SplitterSettingBase,
+        non_acc_submodule_name: str = "_run_on_cpu_",
+    ):
+        """
+        Preprocesses graph before splitting:
+        - finds nodes supported by ACC,
+        - finds fusion groups for ACC nodes having non-tensor IO,
+        - builds a graph of direct dependencies,
+        - builds a map of fused nodes to their fusions.
+        As a result we get self.acc_nodes, self.deps and self.fusions.
+        """
+        assert isinstance(module, torch.fx.GraphModule)
+
+        self.module = module
+        ShapeProp(self.module).propagate(*sample_input)
+
+        self.settings = settings
+        self.operator_support = operator_support
+        self.sample_input = sample_input
+        self.acc_nodes = FxNetAccNodesFinder(self.module, self.operator_support, self.settings.allow_non_tensor)()
+
+        if self.settings.skip_fusion:
+            self.fusions = {}
+        else:
+            self.fusions = FxNetAccFusionsFinder(module, self.acc_nodes)()
+
+        # Modify deps to add more deps for fused nodes
+        self.deps = self.find_deps()
+        self.update_deps_for_fusions()
+
+        self.non_acc_submodule_name = non_acc_submodule_name
+        self._node_submodule_map: Dict[str, str] = {}
+
+    # ===============================================================
+    # Helpers for ctor and initial state
+    # ===============================================================
+
+    def get_node_submodule_map(self) -> Dict[str, str]:
+        """ Returns a map from node name to submodule name, e.g.
+            node: main_module_impl_impl_over_arch_unary_multiple_embedding
+              _pooling_embedding_pooling_sparse_entity_equivalence_key
+              _proxy_embedding_bag
+            maps to submodule name of: _run_on_acc_1
+        """
+        return self._node_submodule_map
+
+    def find_deps(self) -> Dict[torch.fx.Node, NodeSet]:
+        """
+        Builds a graph of node dependencies. Leaf nodes don't have any
+        dependencies and the "output" node doesn't have nodes depending on it.
+
+        Resulting graph has only direct dependencies, i.e. there are no
+        transitive dependencies.
+        """
+        deps: Dict[torch.fx.Node, NodeSet] = defaultdict(set)
+        for node in self.module.graph.nodes:
+            if node.op not in CALLABLE_NODE_OPS:
+                continue
+
+            for user in node.users:
+                if user.op != "output":
+                    deps[user].add(node)
+        return deps
+
+    def update_deps_for_fusions(self):
+        """
+        Updates graph of dependencies so that:
+        - nodes from the same fusion depend on the same set of outer nodes,
+        - outer nodes depending on a fusion depend on all nodes in that fusion.
+        """
+        for node in self.fusions:
+            fusion = self.fusions[node]
+            for fused_neighbor in fusion:
+                self.deps[node].update(self.deps[fused_neighbor] - fusion)
+
+                for user in fused_neighbor.users:
+                    if user not in fusion:
+                        self.deps[user].add(node)
+
+    # ===============================================================
+    # Helpers for preview
+    # ===============================================================
+
+    def _lower_model_to_backend(
+        self, mod: torch.fx.GraphModule, inputs: Tensors
+    ) -> torch.nn.Module:
+        """
+        Lower the model to a backend.
+        """
+
+        return mod
+
+    def _find_culprit(
+        self, mod: torch.fx.GraphModule, inputs: Tensors
+    ) -> str:
+        """
+        When an error occurs during lowering or running the lowered mod, we use this
+        function to find the culprits in `mod` that cause the error.
+        """
+
+        return "Unable to find a culprit because _find_culprit() function is not implemented."
+
+    def _draw_graph_based_on_node_support(
+        self, mod: torch.fx.GraphModule, supported_nodes: NodeList
+    ):
+        color_map = {
+            "default": "AliceBlue",
+            "supported": "chartreuse1",
+            "unsupported": "crimson",
+        }
+
+        class CustomDrawer(FxGraphDrawer):
+            def _get_node_style(self, node):
+                template = super()._get_node_style(node)
+                if node in supported_nodes:
+                    template["fillcolor"] = color_map["supported"]
+                elif node.op in CALLABLE_NODE_OPS:
+                    template["fillcolor"] = color_map["unsupported"]
+                else:
+                    template["fillcolor"] = color_map["default"]
+
+                return template
+
+        drawer = CustomDrawer(mod, "node_support", ignore_getattr=True)
+        dot_graph = drawer.get_main_dot_graph()
+        dot_graph.write_raw("node_support.dot")
+
+    def node_support_preview(self, dump_graph: bool = False):
+        submodules = dict(self.module.named_modules())
+
+        supported_nodes: NodeList = []
+        supported_node_types = defaultdict(set)
+        unsupported_node_types = defaultdict(set)
+
+        def get_dtype(arg):
+            tensor_meta = arg.meta.get("tensor_meta")
+            return getattr(tensor_meta, "dtype", None)
+
+        for node in self.module.graph.nodes:
+            if node.op not in CALLABLE_NODE_OPS:
+                continue
+
+            target = get_node_target(submodules, node)
+
+            # Store dtype of arg in node.args. If arg doesn't have dtype, i.e. not a tensor, we'll store None.
+            arg_dtypes = [
+                get_dtype(arg) if isinstance(arg, torch.fx.Node) else None
+                for arg in node.args
+            ]
+
+            # Find the index just past the last non-None element; it is 0 if all elements are None.
+            last_index = len(arg_dtypes) - next(
+                (
+                    i
+                    for i, dtype in enumerate(reversed(arg_dtypes))
+                    if dtype is not None
+                ),
+                len(arg_dtypes),
+            )
+
+            # Strip None elements at the end.
+            arg_dtypes_tuple = tuple(arg_dtypes[:last_index])
+            kwarg_dtypes_tuple = tuple(
+                (k, get_dtype(arg))
+                for k, arg in node.kwargs.items()
+                if isinstance(arg, torch.fx.Node)
+            )
+
+            if self.operator_support.is_node_supported(submodules, node):
+                supported_nodes.append(node)
+                supported_node_types[target].add((arg_dtypes_tuple, kwarg_dtypes_tuple))
+            else:
+                unsupported_node_types[target].add((arg_dtypes_tuple, kwarg_dtypes_tuple))
+
+        if dump_graph:
+            self._draw_graph_based_on_node_support(self.module, supported_nodes)
+
+        reports = "\nSupported node types in the model:\n"
+        for t, dtypes in supported_node_types.items():
+            for arg_dtypes_tuple, kwarg_dtypes_tuple in dtypes:
+                reports += f"{t}: ({arg_dtypes_tuple}, {dict(kwarg_dtypes_tuple)})\n"
+
+        reports += "\nUnsupported node types in the model:\n"
+        for t, dtypes in unsupported_node_types.items():
+            for arg_dtypes_tuple, kwarg_dtypes_tuple in dtypes:
+                reports += f"{t}: ({arg_dtypes_tuple}, {dict(kwarg_dtypes_tuple)})\n"
+
+        print(reports)
+
+        # Return reports for testing purpose
+        return reports
+
+    def split_preview(self, dump_graph: bool = False):
+        reports = ""
+        subgraphs = self.put_nodes_into_subgraphs()
+        acc_subgraphs_num = len([g for g in subgraphs if g.is_acc])
+        cpu_subgraphs_num = len(subgraphs) - acc_subgraphs_num
+        reports += f"Before removing small acc subgraphs, total {len(subgraphs)} subgraphs are created:"
+        reports += f" {acc_subgraphs_num} acc subgraphs and {cpu_subgraphs_num} cpu subgraphs.\n"
+
+        subgraphs = self.remove_small_acc_subgraphs(subgraphs)
+        acc_subgraphs_num = len([g for g in subgraphs if g.is_acc])
+        cpu_subgraphs_num = len(subgraphs) - acc_subgraphs_num
+        reports += f"After removing small acc subgraphs, total {len(subgraphs)} subgraphs are created:"
+        reports += f" {acc_subgraphs_num} acc subgraphs and {cpu_subgraphs_num} cpu subgraphs.\n"
+
+        for i, subgraph in enumerate(subgraphs):
+            reports += f"_run_on_acc_{i}: " if subgraph.is_acc else f"{self.non_acc_submodule_name}{i}: "
+            reports += f"{len(subgraph.nodes)} node(s)\n"
+
+        self.tag(subgraphs)
+        split_mod = self.split(remove_tag=True)
+        split_mod.eval()
+
+        if dump_graph:
+            drawer = FxGraphDrawer(
+                split_mod, "preview", ignore_getattr=True
+            )
+            dot_graphs = drawer.get_all_dot_graphs()
+            for name, dot_graph in dot_graphs.items():
+                dot_graph.write_raw(f"{name}.dot")
+
+        max_qps: float = self.PCIe_BW
+        bottleneck_module = ""
+
+        for node in split_mod.graph.nodes:
+            if node.op == "call_module" and "acc" in node.target:
+                reports += f"\nProcessing acc submodule {node.target}\n"
+
+                submod = getattr(split_mod, node.target)
+
+                def get_submod_inputs(main_mod, submod, example_inputs):
+                    sub_inputs = None
+
+                    def get_inputs(self, inputs):
+                        nonlocal sub_inputs
+                        sub_inputs = inputs
+
+                    handle = submod.register_forward_pre_hook(get_inputs)
+                    main_mod(*example_inputs)
+                    handle.remove()
+                    return sub_inputs
+
+                submod_inputs = get_submod_inputs(
+                    split_mod, submod, self.sample_input
+                )
+                ShapeProp(submod).propagate(*submod_inputs)
+
+                total_input_bytes = 0
+                total_output_bytes = 0
+
+                reports += "Checking inputs...\n"
+                for n in submod.graph.nodes:
+                    if n.op == "placeholder":
+                        if not is_node_output_tensor(n):
+                            reports += f"Input {n.name} is not a tensor, this might cause problems during lowering!\n"
+                        else:
+                            total_input_bytes += get_size_of_node(submod, n)[0]
+                    if n.op == "output":
+                        output_node = n
+
+                reports += "Checking outputs...\n"
+
+                def get_bytes(node: torch.fx.Node):
+                    nonlocal total_output_bytes
+                    nonlocal reports
+                    if not is_node_output_tensor(node):
+                        reports += f"Output {node.name} is not a tensor, this might cause problems during lowering!\n"
+                    else:
+                        total_output_bytes += get_size_of_node(submod, node)[0]
+
+                map_arg(output_node.args, get_bytes)  # type: ignore[possibly-undefined]
+                qps = self.PCIe_BW / max(total_input_bytes, total_output_bytes)
+                reports += f"Total input size in bytes is {total_input_bytes}, total output size in bytes is {total_output_bytes},"
+                reports += f" theoretical max qps (bounds by PCIe bandwidth) for this submodule is {qps}.\n"
+
+                if qps < max_qps:
+                    max_qps = qps
+                    bottleneck_module = node.target
+
+                try:
+                    lowered_submod = self._lower_model_to_backend(submod, submod_inputs)
+                except RuntimeError:
+                    reports += "Run into an error during lowering!\n"
+                    reports += self._find_culprit(submod, submod_inputs)
+                    continue
+
+                try:
+                    lowered_submod(*submod_inputs)
+                except RuntimeError:
+                    reports += "Run into an error during inference!\n"
+                    reports += self._find_culprit(submod, submod_inputs)
+                else:
+                    reports += "Lowering and running succeed!\n"
+
+        reports += f"\nTheoretical max qps (bounds by PCIe bandwidth) for this model is {max_qps},"
+        reports += f" bottleneck is submodule {bottleneck_module}."
+        print(reports)
+
+        # return the reports for testing purposes
+        return reports
+
+    # ===============================================================
+    # Helpers for extend_acc_subgraph() method
+    # ===============================================================
+
+    def find_reverse_deps(
+        self, tag_id: Optional[int] = None
+    ) -> Dict[torch.fx.Node, NodeSet]:
+        """
+        Builds reversed node dependencies (node -> its users). If tag_id is specified,
+        we ignore users that are in a later subgraph, i.e. users with a greater tag_id.
+        """
+        result: Dict[torch.fx.Node, NodeSet] = defaultdict(set)
+
+        for node in self.module.graph.nodes:
+            if node.op not in CALLABLE_NODE_OPS:
+                continue
+
+            for user in node.users:
+                if user.op not in CALLABLE_NODE_OPS:
+                    continue
+
+                if tag_id is None or (int(user.tag.split("_")[-1]) < tag_id):
+                    result[node].add(user)
+
+        return result
+
+    def update_reverse_deps_for_fusions(
+        self, deps: Dict[torch.fx.Node, NodeSet]
+    ):
+        processed_node = set()
+
+        for node, fusion in self.fusions.items():
+            if node in processed_node:
+                continue
+
+            new_dep = set()
+
+            # Create a new dependency set which include all the
+            # dependencies of the nodes in the fusion group
+            for n in fusion:
+                new_dep.update(deps[n])
+
+            # Exclude nodes in the fusion
+            new_dep.difference_update(fusion)
+
+            # Update dependency
+            for n in fusion:
+                deps[n] = new_dep
+
+                for arg in n.all_input_nodes:
+                    if arg not in fusion:
+                        deps[arg].update(fusion)
+
+                processed_node.add(n)
+
+    def find_parent_nodes_of_subgraph(self, tag: str) -> NodeSet:
+        """
+        Finds parent nodes of the `tag` subgraph.
+
+        Traverses the inputs of nodes in the subgraph; if an input doesn't belong to the subgraph
+        and is not a placeholder, we consider it a parent node of the subgraph.
+        """
+        parent_nodes = set()
+
+        for node in self.module.graph.nodes:
+            if node.op in CALLABLE_NODE_OPS and node.tag == tag:
+                for arg in node.all_input_nodes:
+                    if arg.op in CALLABLE_NODE_OPS and arg.tag != tag:
+                        parent_nodes.add(arg)
+
+        return parent_nodes
+
+    def extend_acc_subgraph(self, tag: str):
+        """
+        Extend the acc subgraph identified by `tag`, walking in the reverse topological direction.
+        """
+        # Dict that maps a node to its users, ignoring users that
+        # are in a subgraph with a greater tag
+        deps = self.find_reverse_deps(tag_id=int(tag.split("_")[-1]))
+        self.update_reverse_deps_for_fusions(deps)
+
+        # Parent nodes of the subgraph
+        parent_nodes = self.find_parent_nodes_of_subgraph(tag)
+
+        visited_nodes: NodeSet = set()
+
+        while parent_nodes:
+            node = None
+
+            # Find an acc node that depends only on visited nodes
+            for n in parent_nodes:
+                if deps[n] <= visited_nodes and n in self.acc_nodes:
+                    node = n
+                    break
+
+            if node is None:
+                break
+
+            # Put the node into `tag` subgraph
+            node.tag = tag  # type: ignore[attr-defined]
+            parent_nodes.remove(node)
+            visited_nodes.add(node)
+
+            # If node is in a fusion group, add all fusion buddies to parent nodes
+            if node in self.fusions:
+                for fusion_node in self.fusions[node]:
+                    if fusion_node not in visited_nodes:
+                        parent_nodes.add(fusion_node)
+
+            # Add inputs of the node to parent nodes
+            for arg in node.all_input_nodes:
+                if arg.op in CALLABLE_NODE_OPS and arg not in visited_nodes:
+                    parent_nodes.add(arg)
+
+    # ===============================================================
+    # Helpers for split() method
+    # ===============================================================
+
+    def starter_nodes(self) -> Tuple[NodeSet, NodeSet]:
+        """
+        Finds nodes that consume module inputs or get_attr nodes.
+        """
+        starter_cpu_nodes: NodeSet = set()
+        starter_acc_nodes: NodeSet = set()
+        for node in self.module.graph.nodes:
+            if node.op not in {"placeholder", "get_attr"}:
+                continue
+            for user in node.users:
+                if user in self.acc_nodes:
+                    starter_acc_nodes.add(user)
+                else:
+                    starter_cpu_nodes.add(user)
+        return starter_cpu_nodes, starter_acc_nodes
+
+    def put_nodes_into_subgraphs(self) -> List[Subgraph]:
+        # We start graph traversal from leaf nodes
+        current_cpu_nodes, current_acc_nodes = self.starter_nodes()
+        visited_nodes: NodeSet = set()
+
+        # Determine which subgraph to start from based on which subgraph has
+        # a node with zero dependencies
+        acc_subgraph: bool = not any(len(self.deps[n]) == 0 for n in current_cpu_nodes)
+
+        current_subgraph_nodes: NodeList = []
+
+        # Result accumulator
+        subgraphs: List[Subgraph] = []
+        while current_cpu_nodes or current_acc_nodes:
+            # Find the first node that should belong to the current subgraph and has all dependencies resolved
+            current_nodes = current_acc_nodes if acc_subgraph else current_cpu_nodes
+            node = next(
+                (n for n in current_nodes if self.deps[n] <= visited_nodes),
+                None,
+            )
+
+            # If nothing was found, then it's time to flip the mode and start a new subgraph
+            if node is None:
+                if not current_subgraph_nodes:
+                    raise FxNetSplitterInternalError("Subgraph can't be empty")
+
+                subgraphs.append(
+                    Subgraph(is_acc=acc_subgraph, nodes=current_subgraph_nodes)
+                )
+                acc_subgraph = not acc_subgraph
+                current_subgraph_nodes = []
+                continue
+
+            current_nodes.remove(node)
+            visited_nodes.add(node)
+            current_subgraph_nodes.append(node)
+
+            # Add fusion buddies
+            if node in self.fusions:
+                if node in self.acc_nodes:
+                    current_acc_nodes.update(self.fusions[node] - visited_nodes)
+                else:
+                    current_cpu_nodes.update(self.fusions[node] - visited_nodes)
+
+            # Put depending nodes into the queue
+            for user in node.users:
+                if user.op not in CALLABLE_NODE_OPS:
+                    continue
+
+                # Add downstream nodes
+                if user in self.acc_nodes:
+                    current_acc_nodes.add(user)
+                else:
+                    current_cpu_nodes.add(user)
+
+        # Check if the last subgraph was not created
+        if current_subgraph_nodes:
+            subgraphs.append(
+                Subgraph(is_acc=acc_subgraph, nodes=current_subgraph_nodes)
+            )
+
+        if not subgraphs:
+            raise FxNetSplitterInternalError("Couldn't create subgraphs")
+
+        return subgraphs
+
+    def remove_small_acc_subgraphs(self, subgraphs: List[Subgraph]) -> List[Subgraph]:
+        """
+        This pass finds acc subgraphs with fewer nodes than the specified minimum size and merges
+        them with adjacent CPU subgraphs.
+        """
+        result: List[Subgraph] = []
+        for subgraph in subgraphs:
+            if subgraph.is_acc:
+                if len(subgraph.nodes) >= self.settings.min_acc_module_size:
+                    result.append(subgraph)
+                else:
+                    print(
+                        "Eliminating acc subgraph because it's smaller than the threshold: "
+                        f"{len(subgraph.nodes)} < {self.settings.min_acc_module_size}"
+                    )
+                    if result:
+                        result[-1].nodes.extend(subgraph.nodes)
+                    else:
+                        subgraph.is_acc = False
+                        result.append(subgraph)
+            else:
+                if result and not result[-1].is_acc:
+                    result[-1].nodes.extend(subgraph.nodes)
+                else:
+                    result.append(subgraph)
+        return result
+
+    def tag(self, subgraphs: List[Subgraph]):
+        self.tags: List[str] = []
+        for subgraph in subgraphs:
+            tag = f"_run_on_acc_{len(self.tags)}" if subgraph.is_acc else f"{self.non_acc_submodule_name}{len(self.tags)}"
+            self.tags.append(tag)
+            for node in subgraph.nodes:
+                if hasattr(node, "tag"):
+                    raise FxNetSplitterInternalError(f"Node {node} was already tagged")
+
+                node.tag = tag  # type: ignore[attr-defined]
+                self._node_submodule_map[node.name] = tag
+
+    def split(self, remove_tag: bool = False) -> torch.fx.GraphModule:
+        split_module = split_by_tags(self.module, self.tags)
+        if remove_tag:
+            for node in self.module.graph.nodes:
+                if hasattr(node, "tag"):
+                    del node.tag
+        return split_module
+
+    def __call__(self) -> torch.fx.GraphModule:
+        subgraphs = self.put_nodes_into_subgraphs()
+        subgraphs = self.remove_small_acc_subgraphs(subgraphs)
+        acc_subgraphs_count = len([s for s in subgraphs if s.is_acc])
+        non_acc_subgraphs_count = len(subgraphs) - acc_subgraphs_count
+        print(f"Got {acc_subgraphs_count} acc subgraphs and {non_acc_subgraphs_count} non-acc subgraphs")
+        self.tag(subgraphs)
+        return self.split()
+
+    def generate_split_results(self) -> SplitResult:
+        split_module = self()
+        submodule_names = []
+        for name, mod in split_module.named_children():
+            submodule_names.append(name)
+        submodule_inputs = generate_inputs_for_submodules(split_module, self.sample_input, submodule_names)
+        return SplitResult(split_module, submodule_inputs, self.non_acc_submodule_name)
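+
+
+# Illustrative sketch (not part of the upstream file): the typical end-to-end flow
+# for a splitter built on _SplitterBase. The operator-support object and sample
+# inputs are placeholders supplied by the caller.
+def _example_split_workflow(
+    module: torch.fx.GraphModule,
+    sample_input: Sequence[Any],
+    operator_support: OperatorSupportBase,
+) -> SplitResult:  # pragma: no cover
+    splitter = _SplitterBase(module, sample_input, operator_support, _SplitterSettingBase())
+    splitter.node_support_preview()           # print which node types are supported
+    return splitter.generate_split_results()  # split and collect per-submodule inputs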
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/tests/__init__.py b/MLPY/Lib/site-packages/torch/fx/passes/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/tests/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/passes/tests/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..061466ab8991a904303af8b28067103c4ba47bc4
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/passes/tests/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/tests/__pycache__/test_pass_manager.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/passes/tests/__pycache__/test_pass_manager.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3c537637b705f543b820a842e795913231650492
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/passes/tests/__pycache__/test_pass_manager.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/tests/test_pass_manager.py b/MLPY/Lib/site-packages/torch/fx/passes/tests/test_pass_manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..0bcbbcbdd76710e0015e85361635296ffb0bd876
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/passes/tests/test_pass_manager.py
@@ -0,0 +1,58 @@
+import unittest
+
+from ..pass_manager import (
+    inplace_wrapper,
+    PassManager,
+    these_before_those_pass_constraint,
+    this_before_that_pass_constraint,
+)
+
+
+class TestPassManager(unittest.TestCase):
+    def test_pass_manager_builder(self) -> None:
+        passes = [lambda x: 2 * x for _ in range(10)]
+        pm = PassManager(passes)
+        pm.validate()
+
+    def test_this_before_that_pass_constraint(self) -> None:
+        passes = [lambda x: 2 * x for _ in range(10)]
+        pm = PassManager(passes)
+
+        # add unfulfillable constraint
+        pm.add_constraint(this_before_that_pass_constraint(passes[-1], passes[0]))
+
+        self.assertRaises(RuntimeError, pm.validate)
+
+    def test_these_before_those_pass_constraint(self) -> None:
+        passes = [lambda x: 2 * x for _ in range(10)]
+        constraint = these_before_those_pass_constraint(passes[-1], passes[0])
+        pm = PassManager(
+            [inplace_wrapper(p) for p in passes]
+        )
+
+        # add unfulfillable constraint
+        pm.add_constraint(constraint)
+
+        self.assertRaises(RuntimeError, pm.validate)
+
+    def test_two_pass_managers(self) -> None:
+        """Make sure we can construct the PassManager twice and not share any
+        state between them"""
+
+        passes = [lambda x: 2 * x for _ in range(3)]
+        constraint = these_before_those_pass_constraint(passes[0], passes[1])
+        pm1 = PassManager()
+        for p in passes:
+            pm1.add_pass(p)
+        pm1.add_constraint(constraint)
+        output1 = pm1(1)
+        self.assertEqual(output1, 2 ** 3)
+
+        passes = [lambda x: 3 * x for _ in range(3)]
+        constraint = these_before_those_pass_constraint(passes[0], passes[1])
+        pm2 = PassManager()
+        for p in passes:
+            pm2.add_pass(p)
+        pm2.add_constraint(constraint)
+        output2 = pm2(1)
+        self.assertEqual(output2, 3 ** 3)
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/tools_common.py b/MLPY/Lib/site-packages/torch/fx/passes/tools_common.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ecc8daf3db75132c8d101ca34c27207737094eb
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/passes/tools_common.py
@@ -0,0 +1,273 @@
+from typing import List, Tuple, Union, Dict, Any, Set, Mapping, Optional
+import collections
+from dataclasses import dataclass
+
+import torch
+import torch.fx
+from torch.fx.node import _get_qualified_name
+from torch.fx._compatibility import compatibility
+
+__all__ = ['get_acc_ops_name', 'get_node_target', 'is_node_output_tensor', 'FxNetAccFusionsFinder', 'legalize_graph']
+
+Tensors = Union[Tuple[torch.Tensor], List[torch.Tensor]]
+TensorOrTensors = Union[torch.Tensor, Tensors]
+NodeList = List[torch.fx.Node]
+NodeSet = Set[torch.fx.Node]
+Names = List[str]
+CALLABLE_NODE_OPS = {"call_module", "call_function", "call_method"}
+
+
+@compatibility(is_backward_compatible=False)
+def get_acc_ops_name(k):
+    if isinstance(k, str):
+        return k
+    elif k.__module__ and "acc_ops" in k.__module__:
+        return f"acc_ops.{k.__name__}"
+    else:
+        module = k.__module__.replace('torch._ops', 'torch.ops')  # WAR for bug in how torch.ops assigns module
+        return f"{module if module else ''}.{k.__name__}"
+
+
+@compatibility(is_backward_compatible=False)
+def get_node_target(submodules: Mapping[str, torch.nn.Module], node: torch.fx.Node) -> str:
+    """
+    Given a `node` returns its target typename.
+
+    For "call_method" node, return node.target which is the name of that method being called.
+    This could potential lead to conflict but should be okay because normally it's on a tensor.
+
+    For "call_function" node, return typename of node.target.
+
+    For "call_module" node, return typename of the module that node.target point to.
+
+    If seeing "_VariableFunctionsClass" in the target name string, it will be replaced by
+    "torch". e.g. _VariableFunctionsClass.relu would become torch.relu.
+    """
+
+    assert node.op in CALLABLE_NODE_OPS, (
+        "Expect op types of " + ", ".join(CALLABLE_NODE_OPS) + f", but found {node.op}"
+    )
+
+    if node.op == "call_module":
+        assert isinstance(node.target, str)
+        submod = submodules[node.target]
+        submod_type = getattr(submod, "_base_class_origin", type(submod))
+        return get_acc_ops_name(submod_type)
+    elif node.op == "call_function":
+        target: Any = node.target
+        return (
+            f"acc_ops.{target.__name__}"
+            if target.__module__ is not None and "acc_ops" in target.__module__
+            else _get_qualified_name(target)
+        )
+    else:
+        assert isinstance(node.target, str)
+        return node.target
+
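+# Illustrative sketch (not part of the upstream file): what get_node_target
+# typically yields for each callable op kind in a traced module. The module here
+# is a placeholder.
+def _example_node_targets(gm: torch.fx.GraphModule) -> List[str]:  # pragma: no cover
+    submodules = dict(gm.named_modules())
+    # e.g. "torch.relu" for a call_function node, "torch.nn.modules.linear.Linear"
+    # for a call_module node, and the raw method name (e.g. "size") for call_method.
+    return [
+        get_node_target(submodules, n)
+        for n in gm.graph.nodes
+        if n.op in CALLABLE_NODE_OPS
+    ]
+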
+@compatibility(is_backward_compatible=False)
+def is_node_output_tensor(node: torch.fx.Node) -> bool:
+    """Checks if the node output produces a Tensor or not.
+
+    NOTE: This requires `ShapeProp` to be run on the containing fx graph before
+    calling this function. This is because it works by checking the `type`
+    metadata on the node, which is produced by `ShapeProp`.
+    """
+    type_ = node.meta.get("type", None)
+    return type_ is not None and issubclass(type_, torch.Tensor)
+
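+# Illustrative sketch (not part of the upstream file): is_node_output_tensor only
+# gives meaningful answers after shape propagation has populated node.meta["type"].
+# The GraphModule and example input are placeholders.
+def _example_tensor_output_nodes(gm: torch.fx.GraphModule, example_input: torch.Tensor) -> NodeList:  # pragma: no cover
+    from torch.fx.passes.shape_prop import ShapeProp  # local import; not needed at module level here
+    ShapeProp(gm).propagate(example_input)
+    return [n for n in gm.graph.nodes if is_node_output_tensor(n)]
+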
+@compatibility(is_backward_compatible=False)
+class FxNetAccFusionsFinder:
+    """
+    Finds groups of connected ACC nodes that pass non-tensor data between each other.
+    Such groups are called fusion groups.
+    """
+
+    def __init__(self, module: torch.fx.GraphModule, acc_nodes: NodeSet):
+        self.module = module
+        self.nodes = list(module.graph.nodes)
+        self.acc_nodes = acc_nodes
+
+    @dataclass
+    class FusionGroup:
+        # The smallest index among the nodes in the fusion group, after topologically sorting all the nodes in the model.
+        top_node_idx: int
+
+        # Nodes in this fusion group.
+        nodes: NodeSet
+
+        # Inputs to this fusion group.
+        inputs: NodeSet
+
+        # Nodes in the fusion group that haven't been processed yet.
+        nodes_need_process: NodeSet
+
+        def add_node(self, node):
+            """
+            Add a node to fusion group.
+            """
+            if node in self.nodes:
+                return
+
+            self.nodes_need_process.add(node)
+            self.nodes.add(node)
+            self.inputs.discard(node)
+            self.inputs.update(
+                {
+                    n
+                    for n in node.all_input_nodes
+                    if n.op in CALLABLE_NODE_OPS and n not in self.nodes
+                }
+            )
+
+    def recursive_add_node(
+        self,
+        fusion_group: "FxNetAccFusionsFinder.FusionGroup",
+        inputs: Union[NodeSet, NodeList],
+        visited: Optional[NodeSet] = None,
+    ):
+        """
+        Start from the inputs and go in reverse topological order. If any upstream node
+        is in the fusion group, add all the nodes on this path to the fusion group.
+        """
+        for arg in inputs:
+            # skip the node if already seen
+            if visited is not None:
+                if arg in visited:
+                    continue
+                visited.add(arg)
+
+            # Skip placeholder and get_attr because they won't be in the fusion group.
+            if arg.op not in CALLABLE_NODE_OPS:
+                continue
+
+            # If the node has smaller idx, it's already an upstream node of the fusion
+            # group. We don't need to check it anymore.
+            if self.nodes.index(arg) < fusion_group.top_node_idx:
+                continue
+
+            # If the node is in the fusion group, return True.
+            if arg in fusion_group.nodes:
+                return True
+
+            # Check the upstream nodes of the node, if any of them is in the fusion group
+            # we'll add this node to fusion group and return True.
+            if self.recursive_add_node(fusion_group, arg.all_input_nodes, visited):
+                fusion_group.add_node(arg)
+                return True
+
+        return False
+
+    def __call__(self) -> Dict[torch.fx.Node, NodeSet]:
+        result: Dict[torch.fx.Node, NodeSet] = {}
+        acc_nodes = list(self.acc_nodes)
+
+        for node in acc_nodes:
+            if node in result:
+                continue
+            if node.op not in CALLABLE_NODE_OPS:
+                continue
+            if "tensor_meta" in node.meta:
+                continue
+            if node not in self.acc_nodes:
+                continue
+
+            fusion_group: FxNetAccFusionsFinder.FusionGroup = self.FusionGroup(
+                top_node_idx=self.nodes.index(node),
+                nodes={node},
+                inputs=set(node.all_input_nodes),
+                nodes_need_process={node},
+            )
+            while fusion_group.nodes_need_process:
+                node = fusion_group.nodes_need_process.pop()
+                self.recursive_add_node(
+                    fusion_group,
+                    fusion_group.inputs,
+                    visited=set(),
+                )
+
+                # Optionally add downstream nodes
+                if "tensor_meta" not in node.meta:
+                    for user in node.users:
+                        if user.op not in CALLABLE_NODE_OPS:
+                            continue
+                        if user in fusion_group.nodes:
+                            continue
+
+                        fusion_group.add_node(user)
+                        self.recursive_add_node(
+                            fusion_group,
+                            fusion_group.inputs,
+                            visited=set(),
+                        )
+
+                # Add some upstream nodes
+                for arg in node.all_input_nodes:
+                    if arg.op not in CALLABLE_NODE_OPS:
+                        continue
+                    if "tensor_meta" in arg.meta:
+                        continue
+                    if arg in fusion_group.nodes:
+                        continue
+
+                    fusion_group.add_node(arg)
+                    fusion_group.top_node_idx = min(
+                        fusion_group.top_node_idx, self.nodes.index(arg)
+                    )
+                    self.recursive_add_node(
+                        fusion_group,
+                        fusion_group.inputs,
+                        visited=set(),
+                    )
+
+            if not (set(fusion_group.nodes) <= self.acc_nodes):
+                self.acc_nodes -= fusion_group.nodes
+            else:
+                for n in fusion_group.nodes:
+                    result[n] = fusion_group.nodes
+
+        return result
+
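+# Illustrative sketch (not part of the upstream file): finding fusion groups for a
+# set of acc nodes. `acc_nodes` would usually come from FxNetAccNodesFinder in
+# splitter_base.py; here it is a placeholder.
+def _example_find_fusions(gm: torch.fx.GraphModule, acc_nodes: NodeSet) -> Dict[torch.fx.Node, NodeSet]:  # pragma: no cover
+    # Each key is a node that belongs to some fusion group; its value is the whole group.
+    return FxNetAccFusionsFinder(gm, acc_nodes)()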
+
+@compatibility(is_backward_compatible=False)
+def legalize_graph(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
+    """
+    Replace the graph of the given GraphModule with one that contains the same nodes as the
+    original, but in topologically sorted order.
+
+    This is used, for example, by the merge_matmul transformation, which disturbs the topologically
+    sorted order of its input GraphModule, so that this order is restored before further transformation.
+
+    Arguments:
+        gm: The graph module to topologically sort. It is modified in-place.
+
+    Returns:
+        The graph module, sorted in place.
+    """
+    indeg = dict.fromkeys(gm.graph.nodes, 0)
+    new_graph = torch.fx.Graph()
+    # Track how many unfulfilled dependencies each node has
+    for node in gm.graph.nodes:
+        for user in node.users:
+            indeg[user] += 1
+    queue: collections.deque = collections.deque()
+    # Add all nodes with no dependencies to the queue
+    for node in gm.graph.nodes:
+        if indeg[node] == 0:
+            queue.append(node)
+    env: Dict[torch.fx.Node, torch.fx.Node] = {}
+    # Pop nodes from the queue, and add nodes that have had all their
+    # dependencies fulfilled
+    while len(queue) > 0:
+        cur = queue.popleft()
+        env[cur] = new_graph.node_copy(cur, lambda x: env[x])
+        for user in cur.users:
+            indeg[user] -= 1
+            if indeg[user] == 0:
+                queue.append(user)
+    # If the new graph's size is not as large as the old one, then there must be
+    # a cycle (i.e. some node's dependencies were not satisfied.)
+    if len(new_graph.nodes) < len(gm.graph.nodes):
+        raise RuntimeError(f"Input graph has cycles, unable to add {[node for node in indeg if indeg[node] != 0]}")
+    new_graph._codegen = gm.graph._codegen
+    gm.graph = new_graph
+    return gm
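+
+
+# Illustrative sketch (not part of the upstream file): legalize_graph is handy after
+# passes that insert nodes out of topological order, e.g. manual graph surgery.
+def _example_legalize(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:  # pragma: no cover
+    gm = legalize_graph(gm)  # rebuilds gm.graph with nodes in topological order
+    gm.recompile()           # regenerate the generated forward() for the new order
+    return gm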
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/utils/__init__.py b/MLPY/Lib/site-packages/torch/fx/passes/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5bbbabf92272a3979ec9197c5ed3cd38352d2472
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/passes/utils/__init__.py
@@ -0,0 +1 @@
+from .common import lift_subgraph_as_module, HolderModule, compare_graphs
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/utils/__pycache__/__init__.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/passes/utils/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..71804d0bf6960ed9a8f85c819fe679be9719dd3a
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/passes/utils/__pycache__/__init__.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/utils/__pycache__/common.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/passes/utils/__pycache__/common.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9e47d0d25b587619c669383b34021b5223aa2683
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/passes/utils/__pycache__/common.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/utils/__pycache__/fuser_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/passes/utils/__pycache__/fuser_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f869bffbc69c0930d72cf8d69ecba8127de88c62
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/passes/utils/__pycache__/fuser_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/utils/__pycache__/matcher_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/passes/utils/__pycache__/matcher_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fb62a5d57b31fd079d6e8f68c77a7c6c818ae966
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/passes/utils/__pycache__/matcher_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/utils/__pycache__/matcher_with_name_node_map_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/passes/utils/__pycache__/matcher_with_name_node_map_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..749025d795688e58a9b58c6d882d9d48e560cec2
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/passes/utils/__pycache__/matcher_with_name_node_map_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/utils/__pycache__/source_matcher_utils.cpython-39.pyc b/MLPY/Lib/site-packages/torch/fx/passes/utils/__pycache__/source_matcher_utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..22e6407d92eb285b07967d18393e0431e626679d
Binary files /dev/null and b/MLPY/Lib/site-packages/torch/fx/passes/utils/__pycache__/source_matcher_utils.cpython-39.pyc differ
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/utils/common.py b/MLPY/Lib/site-packages/torch/fx/passes/utils/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec4a04920d5b17d997a93588154103f11247a562
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/passes/utils/common.py
@@ -0,0 +1,95 @@
+from typing import Dict, Tuple
+
+from torch.fx._compatibility import compatibility
+from torch.fx.graph import Graph
+
+from torch.fx.graph_module import GraphModule
+from torch.fx.passes.utils.matcher_utils import SubgraphMatcher
+from torch.nn import Module
+
+
+__all__ = ["HolderModule", "lift_subgraph_as_module", "compare_graphs"]
+
+
+@compatibility(is_backward_compatible=False)
+class HolderModule(Module):
+    """
+    HolderModule is used to copy all the attributes from the original module to the submodules
+    that use those attributes.
+    """
+
+    def __init__(self, d):
+        super().__init__()
+        for k, v in d.items():
+            self.add_module(k, v)
+
+
+@compatibility(is_backward_compatible=False)
+def lift_subgraph_as_module(
+    gm: GraphModule,
+    subgraph: Graph,
+    comp_name: str = "",
+    class_name: str = "GraphModule",
+) -> Tuple[GraphModule, Dict[str, str]]:
+    """
+    Create a GraphModule for subgraph, which copies the necessary attributes from the original parent graph_module.
+
+    Args:
+        gm (GraphModule): parent graph module
+
+        subgraph (Graph): a valid subgraph that contains copied nodes from the parent graph
+
+        comp_name (str): name for the new component
+
+        class_name (str): name for the submodule
+
+    """
+
+    # Loop through all module calls (call_module) and param fetches (get_attr)
+    # in this component, creating HolderModules as necessary to match the path.
+    # e.g. if the original module has a get_attr node that fetches "conv.weight",
+    # we create a HolderModule as root -> add a HolderModule named "conv" ->
+    # make "weight" an attribute of the "conv" HolderModule that points to conv.weight in
+    # the original module.
+    submodule = HolderModule({})
+    orig_to_split_fqn_mapping: Dict[str, str] = {}
+    for n in subgraph.nodes:
+        if n.op not in ("call_module", "get_attr"):
+            continue
+
+        target = n.target
+        assert isinstance(target, str)
+        target_name_parts = target.split(".")
+        curr = submodule
+        orig_gm = gm
+
+        for name in target_name_parts[:-1]:
+            if not hasattr(curr, name):
+                curr.add_module(name, HolderModule({}))
+
+            curr = getattr(curr, name)
+            orig_gm = getattr(orig_gm, name)
+
+        leaf_node_name = target_name_parts[-1]
+        leaf_node = getattr(orig_gm, leaf_node_name)
+
+        orig_to_split_fqn_mapping[target] = f"{comp_name}.{target}"
+        # Relies on custom __setattr__ magic.
+        setattr(curr, leaf_node_name, leaf_node)
+
+    return GraphModule(submodule, subgraph, class_name), orig_to_split_fqn_mapping
+
+
+@compatibility(is_backward_compatible=False)
+def compare_graphs(left: Graph, right: Graph) -> bool:
+    """
+    Return True if two graphs are identical, i.e. they
+        - have the same number of outputs in the same order
+        - have the same number of inputs in the same order
+        - have the same set of nodes, and identical connectivity
+    """
+
+    matcher = SubgraphMatcher(left, match_output=True, match_placeholder=True)
+    matches = matcher.match(right)
+
+    return len(matches) > 0
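+
+
+# Illustrative sketch (not part of the upstream file): comparing the graphs of two
+# independently traced copies of the same module. The modules are placeholders
+# supplied by the caller.
+def _example_compare_traced(module_a: Module, module_b: Module) -> bool:  # pragma: no cover
+    import torch.fx
+    graph_a = torch.fx.symbolic_trace(module_a).graph
+    graph_b = torch.fx.symbolic_trace(module_b).graph
+    return compare_graphs(graph_a, graph_b)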
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/utils/fuser_utils.py b/MLPY/Lib/site-packages/torch/fx/passes/utils/fuser_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..54bf8364c4d4633d5a054870337bb0bba05d04db
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/passes/utils/fuser_utils.py
@@ -0,0 +1,233 @@
+import copy
+from queue import SimpleQueue
+from typing import List, Dict, Tuple
+
+import torch.fx
+from torch.fx.graph_module import GraphModule
+from torch.fx.graph import Graph
+from torch.fx.node import Node
+from torch.fx.passes.tools_common import NodeList, NodeSet, legalize_graph
+from torch.fx.passes.utils import lift_subgraph_as_module
+from torch.fx._compatibility import compatibility
+
+@compatibility(is_backward_compatible=False)
+def topo_sort(nodes: NodeList) -> NodeList:
+    # sort nodes according to the topological order
+    indegree_map = dict.fromkeys(nodes, 0)
+    candidates: SimpleQueue = SimpleQueue()
+
+    for node in nodes:
+        for n in node.all_input_nodes:
+            if n in indegree_map:
+                indegree_map[node] += 1
+        if indegree_map[node] == 0:
+            candidates.put(node)
+
+    sorted_nodes: NodeList = list()
+    while not candidates.empty():
+        node = candidates.get()
+        sorted_nodes.append(node)
+
+        for n in node.users:
+            if n in indegree_map:
+                indegree_map[n] -= 1
+                if indegree_map[n] == 0:
+                    candidates.put(n)
+
+    assert len(nodes) == len(sorted_nodes), "topologically sorted nodes don't have the same length as the input nodes"
+
+    return sorted_nodes
+
+
+@compatibility(is_backward_compatible=False)
+def validate_partition(partition: NodeList) -> bool:
+    # verify the partition doesn't form a dependency cycle in the original graph
+    # returns True for a valid partition, False for an invalid one
+
+    partition_set = set(partition)
+
+    outputs: NodeList = list()
+    for node in partition_set:
+        for user_node in node.users:
+            if user_node not in partition_set:
+                # external user node, need to expose as an output
+                outputs.append(user_node)
+
+    # Perform BFS on the partition outputs.
+    # If it reaches a node within the partition, then it found a cycle.
+    # This function takes the ownership of `root_nodes` and may modify it.
+    def bfs_find_cycle(root_nodes: NodeList) -> bool:
+        # Set used to exclude nodes that have already been visited.
+        # If a node has been visited, that node and all its children have
+        # been checked for cycles.
+        visited: NodeSet = set()
+
+        # Start with `root_nodes` and traverse through (toward child nodes)
+        # their connected sub-graph. Nodes in `visited` won't be added
+        # to `queue` again.
+        queue: NodeList = root_nodes
+        while queue:
+            current = queue.pop()
+            visited.add(current)
+            if current in partition_set:
+                # Started from partition's `output` nodes, and reached
+                # another node in partition. Cycle!
+                return True
+            for user_node in current.users:
+                if user_node in visited:
+                    continue
+                queue.append(user_node)
+        # `root_nodes` don't cause cycle.
+        return False
+
+    # Use all output nodes as roots to traverse
+    # the graph to check cycles.
+    if bfs_find_cycle(outputs):
+        return False
+
+    return True
+
+
+@compatibility(is_backward_compatible=False)
+def fuse_as_graphmodule(gm: GraphModule,
+                        nodes: NodeList,
+                        module_name: str) -> Tuple[GraphModule, Tuple[Node, ...], Tuple[Node, ...]]:
+
+    """
+    Fuse nodes in graph_module into a GraphModule.
+
+    Args:
+        gm (GraphModule): target graph_module
+
+        nodes (List[Node]): list of nodes in `gm` to fuse, where the nodes must be topologically sorted
+
+        module_name: class name for the fused GraphModule
+
+    Returns:
+        fused_gm (GraphModule): fused graph module, whose nodes are copies of `nodes` in `gm`
+
+        original_inputs (Tuple[Node, ...]): input nodes to `nodes` in original `gm`
+
+        original_outputs (Tuple[Node, ...]): nodes of `nodes` in the original `gm` whose outputs are consumed outside of `nodes`
+
+    """
+
+    # assumption: nodes are already sorted in topo order
+
+    for node in nodes:
+        assert node.graph.owning_module is gm, f"{node} doesn't belong to passed in graph module {gm._get_name()}"
+        assert not node._erased, f"{node} has been removed from owning graph"
+        assert node in gm.graph.nodes, f"{node} is not found in graph module {gm._get_name()}"
+
+    # validates partition doesn't introduce dependency circles in the graph
+    assert validate_partition(nodes), "Invalid partition, found dependency cycles"
+
+    subgraph = Graph()
+
+    node_to_placeholder: Dict[Node, Node] = {}  # mapping of nodes from old graph to placeholder in new graph
+    node_map: Dict[Node, Node] = {}       # mapping of nodes from old graph to new graph
+
+    # handles inputs through graph.node_copy's arg_transform functions
+    def remap_inputs(x):
+        if x.op == "get_attr":
+            # TODO: do we really need to copy the get_attr node into the graph?
+            # do something here
+            pass
+
+        if x in nodes:
+            # x is inside subgraph, return the copied node
+            # the node should have been copied already, as we are copying the graph in topological order
+            return node_map[x]
+
+        if x not in node_to_placeholder:
+            # x is not in subgraph, create a new placeholder for subgraph
+            placeholder_node = subgraph.placeholder(x.name, type_expr=x.type)
+            # copy all meta fields, even if some fields might be irrelevant for the placeholder node
+            placeholder_node.meta = copy.copy(x.meta)
+            node_to_placeholder[x] = placeholder_node
+
+        return node_to_placeholder[x]
+
+    # copy nodes in topological order
+    for node in nodes:
+        new_node = subgraph.node_copy(node, remap_inputs)
+        node_map[node] = new_node
+
+    # handles outputs
+    output_mapping: Dict[Node, Node] = {}  # mapping from old output to new outputs
+
+    for node in nodes:
+        for user_node in node.users:
+            if user_node not in nodes:
+                # external user node, need to expose as an output
+                output_mapping[node] = node_map[node]
+
+    # outs contain nodes in the new subgraph
+    outs = tuple(output_mapping.values())
+
+    # Take care of the args of the FX output node. If there's a single
+    # output, the output node's args look like (output_single,); if there
+    # are multiple outputs, they look like ((output_0, output_1, ...),).
+    subgraph.output(outs[0] if len(outs) == 1 else outs)
+
+    # lint to ensure correctness
+    subgraph.lint()
+    fused_gm: GraphModule
+    fused_gm, _ = lift_subgraph_as_module(gm, subgraph, comp_name="", class_name=module_name)
+
+    # sub_gm's input nodes in the original module
+    original_inputs: Tuple[Node, ...] = tuple(node_to_placeholder.keys())
+
+    # sub_gm's output nodes in the original module
+    original_outputs: Tuple[Node, ...] = tuple(output_mapping.keys())
+
+    return fused_gm, original_inputs, original_outputs
+
+
+@compatibility(is_backward_compatible=False)
+def insert_subgm(gm: GraphModule, sub_gm: GraphModule, orig_inputs: Tuple[Node, ...], orig_outputs: Tuple[Node, ...]):
+    # add sub_gm into gm
+    submodule_name = sub_gm.__class__.__name__
+    gm.add_submodule(submodule_name, sub_gm)
+
+    # Create a call_module node in main graph.
+    module_node = gm.graph.call_module(
+        submodule_name,
+        args=orig_inputs,
+        kwargs=None)
+
+    if len(orig_outputs) == 1:
+        # main_remapping[comp.orig_outputs[0]] = module_node
+        orig_outputs[0].replace_all_uses_with(module_node, propagate_meta=True)
+    else:
+        for i, orig_output in enumerate(orig_outputs):
+            # Use Proxy to record getitem access.
+            proxy_out = torch.fx.Proxy(module_node)[i].node  # type: ignore[index]
+            orig_output.replace_all_uses_with(proxy_out, propagate_meta=True)
+    return gm
+
+@compatibility(is_backward_compatible=False)
+def erase_nodes(gm: GraphModule, nodes: NodeList):
+
+    # erase original nodes in reverse topological order
+    for node in reversed(nodes):
+        gm.graph.erase_node(node)
+
+
+@compatibility(is_backward_compatible=False)
+def fuse_by_partitions(gm: GraphModule, partitions: List[NodeList]) -> GraphModule:
+    for partition_id, nodes in enumerate(partitions):
+        sorted_nodes = topo_sort(nodes)
+
+        submodule_name = "fused_" + str(partition_id)
+        sub_gm, orig_inputs, orig_outputs = fuse_as_graphmodule(gm, sorted_nodes, submodule_name)
+
+        insert_subgm(gm, sub_gm, orig_inputs, orig_outputs)
+
+        erase_nodes(gm, sorted_nodes)
+
+    # topological sort original gm with newly created sub_gm
+    legalize_graph(gm)
+
+    return gm
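+
+
+# Illustrative sketch (not part of the upstream module): fusing a hand-picked
+# group of nodes into a single submodule, assuming FX's default node naming
+# ("add", "relu"). After the call, the main graph holds a call_module node
+# targeting the generated "fused_0" submodule.
+def _example_fuse_by_partitions() -> GraphModule:
+    import torch
+    from torch.fx import symbolic_trace
+
+    class Toy(torch.nn.Module):
+        def forward(self, x):
+            return torch.relu(x + 1) * 2
+
+    gm = symbolic_trace(Toy())
+    nodes = {n.name: n for n in gm.graph.nodes}
+    fuse_by_partitions(gm, [[nodes["add"], nodes["relu"]]])
+    gm.recompile()
+    print(gm.code)  # the add/relu pair is now hidden behind the fused submodule
+    return gm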
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/utils/matcher_utils.py b/MLPY/Lib/site-packages/torch/fx/passes/utils/matcher_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b1f9e84998ff0bf01517ff09bc7bccccdbfd634
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/passes/utils/matcher_utils.py
@@ -0,0 +1,400 @@
+from dataclasses import dataclass, field
+from collections import defaultdict
+import copy
+import torch
+from torch.fx import (
+    Node,
+    Graph,
+)
+from torch.fx._compatibility import compatibility
+from typing import Dict, List, Set, Any, Union, Tuple
+import logging
+import os
+
+__all__ = ['SubgraphMatcher', 'InternalMatch']
+
+# Set `PYTORCH_MATCHER_LOGLEVEL=INFO` to see debug logs
+def _init_logger():
+    logger = logging.getLogger(__name__)
+
+    level = os.environ.get('PYTORCH_MATCHER_LOGLEVEL', 'WARNING').upper()
+    logger.setLevel(level)
+    console = logging.StreamHandler()
+    formatter = logging.Formatter("%(filename)s > %(message)s")
+    console.setFormatter(formatter)
+    console.setLevel(level)
+    # add the handlers to the logger
+    logger.addHandler(console)
+    logger.propagate = False
+    return logger
+
+logger = _init_logger()
+
+@compatibility(is_backward_compatible=False)
+@dataclass
+class InternalMatch:
+    # Nodes from which the match was found
+    anchors: List[Node]
+    # Maps nodes in the pattern subgraph to nodes in the larger graph
+    nodes_map: Dict[Node, Node] = field(default_factory=dict)
+
+    # nodes in the target graph that matched the pattern's placeholder nodes
+    placeholder_nodes: List[Node] = field(default_factory=list)
+
+    # nodes in matched subgraph returned by output
+    returning_nodes: List[Node] = field(default_factory=list)
+
+    # map from a string name to a node in the target graph
+    # only available if the matcher is `SubgraphMatcherWithNameNodesMap`
+    name_node_map: Dict[str, Node] = field(default_factory=dict)
+
+    def __copy__(self):
+        return InternalMatch(anchors=self.anchors, nodes_map=self.nodes_map.copy(),
+                             placeholder_nodes=self.placeholder_nodes.copy(),
+                             returning_nodes=self.returning_nodes.copy())
+
+@compatibility(is_backward_compatible=False)
+class SubgraphMatcher:
+    def __init__(self, pattern: Graph,
+                 match_output: bool = False,
+                 match_placeholder: bool = False,
+                 remove_overlapping_matches: bool = True,
+                 ignore_literals: bool = False) -> None:
+        """
+        Args:
+            pattern: the targeted matching pattern, represented in fx.Graph.
+            match_output: If True, output node in the pattern graph will be treated as a part of the targeted pattern.
+                If False, output node is ignored during match.
+            match_placeholder: If True, placeholder node in the pattern graph will be treated as a part of
+                the targeted pattern. If False, placeholder nodes will be used as wildcards.
+            remove_overlapping_matches: If True, in the case of overlapping matches, only the first match
+                will be returned.
+            ignore_literals: If True, will not check if literals are equal and
+                will instead treat them as wildcards.
+        """
+
+        self.pattern = pattern
+        self.match_output = match_output
+        self.match_placeholder = match_placeholder
+        self.remove_overlapping_matches = remove_overlapping_matches
+        self.ignore_literals = ignore_literals
+
+        if len(pattern.nodes) == 0:
+            raise ValueError("SubgraphMatcher cannot be initialized with an empty pattern")
+
+        for node in pattern.nodes:
+            if node.op != "output":
+                assert len(node.users) > 0, \
+                       "SubgraphMatcher cannot be initialized with a pattern that has dead code"
+
+        # TODO: assert pattern is a connected graph
+
+        self.pattern_placeholder_nodes = [n for n in pattern.nodes if n.op == "placeholder"]
+        output_node = next(iter(reversed(pattern.nodes)))
+        # nodes returned by outputs
+        self.pattern_returning_nodes: List[Node] = output_node.all_input_nodes
+
+        self.pattern_anchors: List[Node] = []
+        if match_output:
+            self.pattern_anchors = [output_node]
+        else:
+            # If a node has output_node as the ONLY user, then this node is a graph sink,
+            # and should be matched against as an anchor
+            self.pattern_anchors = [n for n in output_node.all_input_nodes if len(n.users) == 1]
+
+    def _match_attributes(self, pn: Node, gn: Node) -> bool:
+        # Attribute matching is complicated. Right now we only support matching constant tensors.
+        assert isinstance(pn.target, str), f"pn.target {pn.target} must be a string."
+        assert isinstance(gn.target, str), f"gn.target {gn.target} must be a string."
+
+        # TODO(tmanlaibaatar) should probably make this actual API
+        def _getattr(model: torch.fx.GraphModule, attr_name: str):
+            *prefix, field = attr_name.split(".")
+            t = model
+            for item in prefix:
+                t = getattr(t, item, None)  # type: ignore[assignment]
+                assert t is not None
+
+            return getattr(t, field)
+
+        pn_value = _getattr(pn.graph.owning_module, pn.target)
+        gn_value = _getattr(gn.graph.owning_module, gn.target)
+
+        if type(pn_value) != type(gn_value):
+            return False
+
+        # Don't require exact match on tensor values.
+        if isinstance(pn_value, torch.Tensor):
+            return isinstance(gn_value, torch.Tensor)
+        else:
+            raise RuntimeError(f"Unsupported type {pn_value} when matching attributes")
+        return False
+
+    def _nodes_are_equal(self, pn: Node, gn: Node) -> bool:
+        # if exact match for placeholder is not required, then use placeholder as a wildcard
+        if not self.match_placeholder and pn.op == "placeholder":
+            return True
+
+        if pn.op == gn.op:
+            if pn.op == "placeholder" or pn.op == "output":
+                return True
+            elif pn.op == "get_attr":
+                return self._match_attributes(pn, gn)
+            return pn.target == gn.target
+        return False
+
+    def _is_contained(self, nodes_map: Dict[Node, Node]) -> bool:
+        # `lookup` represents all the nodes in `original_graph`
+        # that are part of `pattern`
+
+        # Placeholders can be used by other nodes in the graphs
+        lookup: Dict[Node, Node] = {gn : pn for pn, gn in nodes_map.items() if pn.op != "placeholder"}
+
+        for gn, pn in lookup.items():
+            # nodes returned by output are allowed to be used in other areas of the graph
+            if pn in self.pattern_returning_nodes:
+                continue
+
+            for user in gn.users:
+                # If this node has users that were not in `lookup`, then it must leak out of the
+                # pattern subgraph
+                if user not in lookup:
+                    return False
+        return True
+
+    def _remove_overlapping_matches(self, matches: List[InternalMatch]) -> List[InternalMatch]:
+        non_overlapping_matches: List[InternalMatch] = list()
+        nodes_matched: Set[Node] = set()
+
+        for match in matches:
+            found_overlap = False
+            for pn, gn in match.nodes_map.items():
+                if pn.op not in {"placeholder", "output"} and gn in nodes_matched:
+                    found_overlap = True
+                    break
+
+            if not found_overlap:
+                non_overlapping_matches.append(match)
+                for pn, gn in match.nodes_map.items():
+                    if pn.op not in {"placeholder", "output"}:
+                        nodes_matched.add(gn)
+        return non_overlapping_matches
+
+    def _match_literals(self, pn: Any, gn: Any, match: InternalMatch) -> bool:
+        assert not (isinstance(pn, Node) and isinstance(gn, Node)), "pn and gn cannot both be Node"
+
+        if isinstance(pn, Node) and not isinstance(gn, Node):
+            if pn.op == "placeholder":
+                # Check if we've already matched these nodes in the current
+                # traversal
+                if pn in match.nodes_map:
+                    return match.nodes_map[pn] == gn
+
+                match.nodes_map[pn] = gn
+                return True
+            else:
+                return False
+        elif not isinstance(pn, Node) and isinstance(gn, Node):
+            return False
+        else:
+            return type(gn) == type(pn) and gn == pn
+
+    def _match_nodes(self, pn: Node, gn: Node, match: InternalMatch) -> bool:
+        logger.info("  matching %s to %s", pn, gn)
+
+        assert isinstance(pn, Node) and isinstance(gn, Node), f"pn and gn must be Node, pn: {pn}, gn: {gn}"
+
+        # Check if we've already matched these nodes in the current
+        # traversal
+        if pn in match.nodes_map:
+            return match.nodes_map[pn] == gn
+
+        # TODO: use a more efficient way to check if gn is matched before: two-way dict
+        if gn in match.nodes_map.values():
+            return False
+
+        if not self._nodes_are_equal(pn, gn):
+            return False
+
+        # Optimistically mark `pn` as a match for `gn`, and save a local copy of match
+        saved_match = copy.copy(match)
+        match.nodes_map[pn] = gn
+
+        # Placeholder is a wildcard and can be matched with any python object
+        # (including list/tuple)
+        if pn.op == "placeholder":
+            return True
+
+        # Recursively traverse upwards to check if `pn` is a true
+        # match for `gn`
+        match_found = True
+
+        def _match_args(args1: Union[List, Tuple], args2: Union[List, Tuple]) -> bool:
+            if len(args1) != len(args2):
+                return False
+
+            for a1, a2 in zip(args1, args2):
+                if isinstance(a1, Node) and isinstance(a2, Node):
+                    matched = self._match_nodes(a1, a2, match)
+                elif isinstance(a1, (list, tuple)) and isinstance(a2, (list, tuple)):
+                    matched = _match_args(a1, a2)
+                else:
+                    matched = self._match_literals(a1, a2, match) or self.ignore_literals
+
+                if not matched:
+                    return False
+
+            return True
+
+        # Flatten all args/kwargs into 1 list of args
+        pn_args, gn_args = None, None
+        if (
+            (len(pn.args) != len(gn.args) or list(pn.kwargs.keys()) != list(gn.kwargs.keys())) and
+            pn.op == "call_function" and
+            isinstance(pn.target, torch._ops.OpOverload)
+        ):
+            args_schema = pn.target._schema.arguments
+
+            def get_all_arguments(orig_args, orig_kwargs):
+                all_args = []
+                for i, schema in enumerate(args_schema):
+                    if schema.name in orig_kwargs:
+                        all_args.append(orig_kwargs[schema.name])
+                    elif not schema.kwarg_only and i < len(orig_args):
+                        all_args.append(orig_args[i])
+                    else:
+                        all_args.append(schema.default_value)
+                return all_args
+
+            pn_args = get_all_arguments(pn.args, pn.kwargs)
+            gn_args = get_all_arguments(gn.args, gn.kwargs)
+
+        elif len(pn.args) == len(gn.args) and list(pn.kwargs.keys()) == list(gn.kwargs.keys()):
+            pn_args = list(pn.args)
+            gn_args = list(gn.args)
+            pn_args.extend(list(pn.kwargs.values()))
+            gn_args.extend(list(gn.kwargs.values()))
+        else:
+            match_found = False
+
+        match_found = (
+            match_found and
+            pn_args is not None and
+            gn_args is not None and
+            _match_args(pn_args, gn_args)
+        )
+
+        if not match_found:
+            # revert to saved_match before matching with current node
+            match = copy.copy(saved_match)
+            return False
+
+        return True
+
+    def match(self, graph: Graph) -> List[InternalMatch]:
+        """
+        Returns:
+            The matched subgraphs.
+            The returned subgraphs are fully self-contained, meaning the nodes (except placeholders
+            and nodes returned by output) can only be consumed by nodes within the matched subgraph.
+
+        Subgraph pattern matcher is implemented with the backtracking style in the following steps:
+
+        1. We first identify all the anchor nodes in the pattern graph. The anchor nodes
+        are the "sinks" (nodes with no user other than the output node) of the pattern graph.
+        One pattern graph could have multiple anchors if it has multiple return values.
+
+        2. In the target graph, we identify the potential candidate nodes that can be matched
+        with each anchor. These anchor-candidate pairs are the starting points for
+        pairwise per-node matching.
+
+        3. For each anchor-candidate pair, we simultaneously traverse backwards (DFS) in both
+        the pattern and target graphs. Every pattern node along the traversal path is compared
+        against the corresponding target node. If any comparison fails, the match for this
+        anchor-candidate pair fails. A match is found when the DFS finishes traversing the graph.
+        See `self._match_nodes` for more details.
+
+        4. In the case of multiple anchors, every anchor will need to find a match using step 3.
+        In addition, the matches found between anchors need to have a common intersection node
+        in order for the match to be valid. This is implemented with backtracking. See `backtracking`
+        for more details.
+
+        Notice: graph traversal must be done in reverse order because a tensor can have multiple
+        consumers but only a single producer. Only with reverse-order traversal can we jointly
+        walk the pattern and target graphs along a deterministic path.
+
+        Warning: In theory, this backtracking algorithm has **exponential** worst-case time
+        complexity. In practice, however, it's unlikely to blow up.
+
+        """
+        from torch.fx.passes.utils.fuser_utils import validate_partition
+
+        # find candidate nodes to match with pattern anchors
+        match_candidates: Dict[Node, List[Node]] = defaultdict(list)
+        for pattern_anchor in self.pattern_anchors:
+            for node in graph.nodes:
+                if self._nodes_are_equal(pattern_anchor, node):
+                    match_candidates[pattern_anchor].append(node)
+        match_candidates_list = list(match_candidates.items())
+
+        logger.info("Initial match_candidates_list: %s\n", match_candidates_list)
+
+        matches: List[InternalMatch] = []
+
+        def backtracking(anchor_index, match):
+            if anchor_index == len(match_candidates_list):
+                match.placeholder_nodes = [match.nodes_map[pn] for pn in self.pattern_placeholder_nodes]
+                match.returning_nodes = [match.nodes_map[pn] for pn in self.pattern_returning_nodes]
+                matches.append(match)
+
+                logger.info("Found a match: %s\n", match)
+                return
+
+            pattern_anchor, candidate_nodes = match_candidates_list[anchor_index]
+            saved_match = copy.copy(match)
+
+            for node in candidate_nodes:
+                logger.info("Trying to match anchor %s to %s", pattern_anchor, node)
+
+                match_found = self._match_nodes(pattern_anchor, node, match)
+                if match_found:
+                    # match next anchor
+                    backtracking(anchor_index + 1, match)
+                else:
+                    logger.info("Failed to match anchor %s to %s\n", pattern_anchor, node)
+
+                # revert to saved_match before matching with current anchor
+                match = copy.copy(saved_match)
+
+        match = InternalMatch(anchors=self.pattern_anchors)
+        if match_candidates_list:
+            backtracking(0, match)
+
+        # filter out the matches where the subgraph is not fully_contained
+        before = len(matches)
+        matches = [match for match in matches if self._is_contained(match.nodes_map)]
+        after = len(matches)
+        if before != after:
+            logger.info("Filtered out %s matches because they are not fully contained", before - after)
+
+        # filter out the matches that form a cycle if the subgraph is fused
+        valid_matches = []
+        for match in matches:
+            matched_compute_nodes = \
+                [gn for pn, gn in match.nodes_map.items() if pn.op not in {"placeholder", "output"}]
+            if validate_partition(matched_compute_nodes):
+                valid_matches.append(match)
+        if len(valid_matches) != len(matches):
+            logger.info("Filtered out %s matches because the matched subgraph would form a cycle if fused",
+                        len(matches) - len(valid_matches))
+        matches = valid_matches
+
+        if self.remove_overlapping_matches:
+            before = len(valid_matches)
+            matches = self._remove_overlapping_matches(valid_matches)
+            after = len(matches)
+            if before != after:
+                logger.info("Filtered out %s matches because matched subgraphs are overlapping", before - after)
+
+        logger.info("Matches returned: %s", matches)
+
+        return matches
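+
+
+# Illustrative sketch (not part of the upstream module): matching an
+# add-followed-by-relu pattern inside a larger traced graph, assuming
+# torch.fx.symbolic_trace for both the pattern and the target.
+def _example_subgraph_matcher() -> List[InternalMatch]:
+    from torch.fx import symbolic_trace
+
+    def pattern(x):
+        return torch.relu(x + 1)
+
+    def target(x):
+        y = torch.relu(x + 1)
+        return y * 2
+
+    matcher = SubgraphMatcher(symbolic_trace(pattern).graph)
+    matches = matcher.match(symbolic_trace(target).graph)
+    # expected: one InternalMatch whose nodes_map sends pattern nodes to target nodes
+    return matches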
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/utils/matcher_with_name_node_map_utils.py b/MLPY/Lib/site-packages/torch/fx/passes/utils/matcher_with_name_node_map_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e3b59aed14e1af47f7116f44ace6b8c09668d33
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/passes/utils/matcher_with_name_node_map_utils.py
@@ -0,0 +1,113 @@
+from typing import Dict, List, Tuple
+
+from torch.fx import Graph, GraphModule, Node
+
+from torch.fx._compatibility import compatibility
+from .matcher_utils import InternalMatch, SubgraphMatcher
+
+__all__ = ["SubgraphMatcherWithNameNodeMap"]
+
+
+def _split_to_graph_and_name_node_map(
+    gm: GraphModule,
+) -> Tuple[GraphModule, Dict[str, Node]]:
+    from torch.fx.graph import _PyTreeInfo
+    from torch.utils._pytree import tree_flatten, tree_unflatten
+
+    name_node_map = {}
+    for n in gm.graph.nodes:
+        if n.op == "output":
+            assert gm._out_spec is not None
+            output = tree_unflatten(n.args[0], gm._out_spec)
+            assert isinstance(
+                output, tuple
+            ), "Expecting the pattern graph to return a tuple"
+            assert (
+                len(output) >= 2
+            ), "Expecting the pattern graph to have at least two outputs"
+            *out, name_node_map = output
+            flattened, out_spec = tree_flatten(out)
+            assert isinstance(
+                name_node_map, Dict
+            ), "Expecting the input graph to have a dict output as the last element"
+            n.args = (flattened,)
+            orig_pytree_info = gm._graph._codegen.pytree_info
+            gm._graph._codegen.pytree_info = _PyTreeInfo(
+                orig_pytree_info.orig_args, orig_pytree_info.in_spec, out_spec
+            )
+    gm.recompile()
+    return gm, name_node_map
+
+
+@compatibility(is_backward_compatible=False)
+class SubgraphMatcherWithNameNodeMap(SubgraphMatcher):
+    """Extends SubgraphMatcher to support querying the matched subgraph nodes through node name,
+    this requires pattern to have specific format (returning and additional dictionary at the output,
+    that has node name as key, and the node in the pattern graph as value, see Example for more details)
+
+    The difference from SubgraphMatcher is that it takes a `pattern_gm` GraphModule as input during
+    initialization, since we need to modify the graph (which requires recompiling the GraphModule).
+
+    Example::
+        def pattern(x, weight):
+            conv = F.conv2d(x, weight)
+            relu = F.relu(conv)
+            return relu, {"conv": conv, "relu": relu}
+
+        def target_graph(x, weight):
+            conv = F.conv2d(x, weight)
+            relu = F.relu(conv)
+            relu *= 2
+            return relu
+
+        pattern_gm = capture_pre_autograd_graph(pattern, example_inputs)
+        target_gm = capture_pre_autograd_graph(target_graph, example_inputs)
+        matcher = SubgraphMatcherWithNameNodeMap(pattern_gm)
+        matches = matcher.match(target_gm)
+        for match in matches:
+            match.name_node_map["conv"].meta["annotation"] = ...
+
+    """
+
+    def __init__(
+        self,
+        pattern_gm: GraphModule,
+        match_output: bool = False,
+        match_placeholder: bool = False,
+        remove_overlapping_matches: bool = True,
+        ignore_literals: bool = False,
+    ) -> None:
+        pattern_gm, name_node_map = _split_to_graph_and_name_node_map(pattern_gm)
+        self.name_node_map = name_node_map
+        super().__init__(
+            pattern_gm.graph,
+            match_output,
+            match_placeholder,
+            remove_overlapping_matches,
+            ignore_literals,
+        )
+
+    def match(self, graph: Graph) -> List[InternalMatch]:
+        """The returned InternalMatch will have name_node_map populated with a map
+        from node name (str) to the target node, e.g.
+        {"conv": target_conv_ndoe, "relu": target_relu_node}
+
+        This requires the pattern graph to return an additional
+        output mapping node names to nodes, e.g. instead of:
+        ```
+        def pattern(...):
+            ...
+            return relu
+        ```
+        we should do:
+        ```
+        def pattern(...):
+            ...
+            return relu, {"conv": conv, "relu": relu}
+        ```
+        """
+        internal_matches = super().match(graph)
+        for internal_match in internal_matches:
+            for k, n in self.name_node_map.items():
+                internal_match.name_node_map[k] = internal_match.nodes_map[n]
+        return internal_matches
diff --git a/MLPY/Lib/site-packages/torch/fx/passes/utils/source_matcher_utils.py b/MLPY/Lib/site-packages/torch/fx/passes/utils/source_matcher_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..aea6cb4191cdc4d04b1e6dc941ac35d2b8490dc2
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/passes/utils/source_matcher_utils.py
@@ -0,0 +1,144 @@
+from dataclasses import dataclass, field
+from torch.fx.graph import Graph
+from torch.fx.node import Node
+from torch.fx._compatibility import compatibility
+from typing import Dict, List, Any, Type, Optional, Callable
+import logging
+import os
+
+
+__all__ = ['get_source_partitions', 'check_subgraphs_connected', 'SourcePartition']
+
+# Set `PYTORCH_MATCHER_LOGLEVEL=INFO` to see debug logs
+def _init_logger():
+    logger = logging.getLogger(__name__)
+
+    level = os.environ.get('PYTORCH_MATCHER_LOGLEVEL', 'WARNING').upper()
+    logger.setLevel(level)
+    console = logging.StreamHandler()
+    formatter = logging.Formatter("%(filename)s > %(message)s")
+    console.setFormatter(formatter)
+    console.setLevel(level)
+    # add the handlers to the logger
+    logger.addHandler(console)
+    logger.propagate = False
+    return logger
+
+logger = _init_logger()
+
+
+@compatibility(is_backward_compatible=False)
+@dataclass
+class SourcePartition:
+    # Nodes in a particular partition
+    nodes: List[Node]
+
+    # The source these nodes decomposed from
+    source: Any
+
+    # Nodes in the graph that are needed as inputs to the partition
+    input_nodes: List[Node] = field(default_factory=list)
+
+    # Nodes in the partition that are being used by nodes outside of the
+    # partition
+    output_nodes: List[Node] = field(default_factory=list)
+
+    # Parameters that are being used
+    params: List[Node] = field(default_factory=list)
+
+
+@compatibility(is_backward_compatible=False)
+def get_source_partitions(
+    graph: Graph,
+    wanted_sources: List[Any],
+    filter_fn: Optional[Callable[[Node], bool]] = None,
+) -> Dict[Any, List[SourcePartition]]:
+    """
+    Args:
+        graph: The graph we want to partition
+        wanted_sources: List of sources of nodes that were decomposed from this
+            source. This can be a function (ex. torch.nn.functional.linear) or a
+            leaf module type (ex. torch.nn.Linear).
+
+    Returns:
+        Dictionary mapping sources that were given to a list of SourcePartitions
+        that correspond to the list of nodes that were decomposed from the given
+        source.
+    """
+    modules: Dict[Type, Dict[str, List[Node]]] = {}
+
+    for node in graph.nodes:
+        # The "source_fn_stack" metadata should contain a tuple of a unique name
+        # for the source and either the source function (if the node was
+        # decomposed from a function) or the module type (if the node was
+        # decomposed from a leaf module).
+
+        if (source_fn_st := node.meta.get("source_fn_stack", None)) is None:
+            continue
+
+        source_fn = source_fn_st[-1]
+        if source_fn[1] not in wanted_sources:
+            continue
+
+        diff_modules = modules.setdefault(source_fn[1], {})
+        partition = diff_modules.setdefault(source_fn[0], [])
+        partition.append(node)
+
+    def make_partition(nodes: List[Node], module_type: Type) -> SourcePartition:
+        input_nodes = set()
+        output_nodes = set()
+        params = set()
+        for node in nodes:
+            for arg in node.args:
+                if isinstance(arg, Node) and arg not in nodes:
+                    input_nodes.add(arg)
+
+            if node.op == "get_attr":
+                params.add(node)
+
+            for user in node.users.keys():
+                if user not in nodes:
+                    output_nodes.add(node)
+
+        return SourcePartition(
+            nodes,
+            module_type,
+            list(input_nodes),
+            list(output_nodes),
+            list(params),  # type: ignore[arg-type]
+        )
+
+    ret: Dict[Type[Any], List[SourcePartition]] = {}
+
+    if filter_fn:
+        # apply filter_fn to every node of each partition and drop the partitions
+        # that don't satisfy the filter condition
+        filtered_modules = {}
+        for tp, name_to_partition in modules.items():
+            filtered_name_to_partition = {
+                name: partition
+                for name, partition in name_to_partition.items()
+                if all(map(filter_fn, partition))
+            }
+            filtered_modules[tp] = filtered_name_to_partition
+        modules = filtered_modules
+
+    for k, v in modules.items():
+        ret[k] = [make_partition(partition, k) for partition in v.values()]
+
+    return ret
+
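+
+# Illustrative sketch (not part of the upstream module): grouping the nodes a
+# torch.nn.Linear layer decomposed into. The "source_fn_stack" metadata this
+# relies on is populated by export-style tracing (torch.export.export is
+# assumed here), not by plain symbolic_trace; if the metadata is absent the
+# result is simply an empty dict.
+def _example_get_source_partitions() -> Dict[Any, List[SourcePartition]]:
+    import torch
+
+    class Toy(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.linear = torch.nn.Linear(4, 4)
+
+        def forward(self, x):
+            return torch.relu(self.linear(x))
+
+    ep = torch.export.export(Toy(), (torch.randn(2, 4),))
+    return get_source_partitions(
+        ep.graph_module.graph,
+        [torch.nn.Linear, torch.nn.functional.linear],
+    )
+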
+
+@compatibility(is_backward_compatible=False)
+def check_subgraphs_connected(subgraph1: SourcePartition, subgraph2: SourcePartition) -> bool:
+    """
+    Given two subgraphs A and B (in the form of a list of nodes), checks if
+    A has nodes connecting to at least one node in B -- aka there exists a node
+    in B that uses a node in A (not the other way around).
+    """
+
+    for node in reversed(subgraph1.nodes):
+        for user in node.users.keys():
+            if user in subgraph2.nodes:
+                return True
+    return False
diff --git a/MLPY/Lib/site-packages/torch/fx/proxy.py b/MLPY/Lib/site-packages/torch/fx/proxy.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ec706d1318f429181547894f24cf60c033545b6
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/proxy.py
@@ -0,0 +1,565 @@
+# mypy: ignore-errors
+
+import enum
+import dis
+import copy
+import sys
+import torch
+import inspect
+import operator
+import traceback
+import collections
+
+from dataclasses import is_dataclass, fields
+
+
+from .graph import magic_methods, reflectable_magic_methods, Graph
+from typing import Tuple, Dict, OrderedDict, Optional, Any, Iterator, Callable
+from .node import Target, Node, Argument, base_types, map_aggregate
+from ._compatibility import compatibility
+from .operator_schemas import check_for_mutable_operation
+import torch.fx.traceback as fx_traceback
+
+__all__ = ['TracerBase', 'GraphAppendingTracer', 'TraceError',
+           'Proxy', 'Attribute', 'ParameterProxy', 'Scope',
+           'ScopeContextManager']
+
+
+@compatibility(is_backward_compatible=False)
+class Scope:
+    """ Scope object that records the module path and the module type
+    of a module. Scope is used to track the information of the module
+    that contains a Node in a Graph of GraphModule. For example::
+
+        class Sub(torch.nn.Module):
+            def forward(self, x):
+                # This will be a call_method Node in GraphModule,
+                # scope for this would be (module_path="sub", module_type=Sub)
+                return x.transpose(1, 2)
+
+        class M(torch.nn.Module):
+            def __init__(self):
+                self.sub = Sub()
+
+            def forward(self, x):
+                # This will be a call_method Node as well,
+                # scope for this would be (module_path="", None)
+                x = x.transpose(1, 2)
+                x = self.sub(x)
+                return x
+
+    """
+
+    def __init__(self, module_path: str, module_type: Any):
+        super().__init__()
+        self.module_path = module_path
+        self.module_type = module_type
+
+
+@compatibility(is_backward_compatible=False)
+class ScopeContextManager:
+    """ A context manager to track the Scope of Node during symbolic tracing.
+    When entering a forward function of a Module, we'll update the scope information of
+    the current module, and when we exit, we'll restore the previous scope information.
+    """
+
+    def __init__(
+        self,
+        scope: Scope,
+        current_scope: Scope,
+    ):
+        super().__init__()
+        # Keep a copy of prev scope to restore on exit
+        self._prev_scope = copy.copy(scope)
+        # Update scope to current scope
+        scope.module_path = current_scope.module_path
+        scope.module_type = current_scope.module_type
+        # Save a reference so we can restore it
+        self._scope = scope
+
+    def __enter__(self):
+        return self._scope
+
+    def __exit__(self, *args):
+        self._scope.module_path = self._prev_scope.module_path
+        self._scope.module_type = self._prev_scope.module_type
+        return
+
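+
+# Illustrative sketch (not part of the upstream module): how a tracer updates
+# and restores the current Scope while descending into a submodule's forward().
+def _example_scope_context_manager():
+    current = Scope("", None)                        # tracer-wide scope
+    with ScopeContextManager(current, Scope("sub", torch.nn.Linear)):
+        assert current.module_path == "sub"          # updated inside the submodule
+        assert current.module_type is torch.nn.Linear
+    assert current.module_path == ""                 # restored on exit
+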
+
+_COPY_META_FIELDS = ["nn_module_stack", "source_fn_stack", "original_aten", "recompute", "from_node", "quantization_tag"]
+
+
+@compatibility(is_backward_compatible=True)
+class TracerBase:
+    graph: Graph
+    record_stack_traces : bool = False
+    # Feature flag for mutable schema checking
+    # Enabled by default in 1.12
+    check_mutable_operations : bool = False
+    # Feature flag for assert tracing
+    trace_asserts : bool = False
+    # Feature flag for proxying accesses to buffer values
+    proxy_buffer_attributes : bool = False
+
+    # Name of the function to be traced. It will only be used when
+    # ``root`` is an instance of ``nn.Module``
+    traced_func_name: str = "forward"
+
+    # The current Scope (module path and module type) of the module being traced
+    scope : Scope
+
+    # Records the module call stack
+    module_stack: OrderedDict[str, Tuple[str, Any]]
+
+    # Mapping of node name to module scope
+    node_name_to_scope: Dict[str, Tuple[str, type]]
+
+    @compatibility(is_backward_compatible=True)
+    def create_node(self, kind : str, target : Target,
+                    args : Tuple[Argument, ...], kwargs : Dict[str, Argument], name : Optional[str] = None,
+                    type_expr : Optional[Any] = None) -> Node:
+        """
+        Inserts a graph node given target, args, kwargs, and name.
+
+        This method can be overridden to do extra checking, validation, or
+        modification of values used in node creation. For example, one might
+        want to disallow in-place operations from being recorded.
+        """
+        if kind == 'call_function' and self.check_mutable_operations:
+            check_for_mutable_operation(target, args, kwargs)
+
+        node = self.graph.create_node(kind, target, args, kwargs, name, type_expr)
+        # TODO node_name_to_scope will be deprecated in favor of
+        # node.meta['nn_module_stack']
+        self.node_name_to_scope[node.name] = (
+            self.scope.module_path,
+            self.scope.module_type,
+        )
+        # Optionally set stack trace on the created Node for debugging purposes
+        if fx_traceback.has_preserved_node_meta():
+            current_meta: Dict[str, Any] = fx_traceback.get_current_meta()
+
+            stack_trace = current_meta.get("stack_trace")
+            if stack_trace:
+                node.stack_trace = stack_trace
+            # Explicitly set the stack_trace, nn_module_stack and source_fn on the node.meta
+            # If other meta fields are needed, they can be added here
+            for field in _COPY_META_FIELDS:
+                if field in current_meta:
+                    node.meta[field] = copy.copy(current_meta[field])
+
+            # Here we decrement to account for the sequence_nr having
+            # just been incremented while tracing this lowered aten op.
+            new_seq_nr = torch.autograd._get_sequence_nr() - 1
+            # The sequence_nr increments every time a new autograd Node
+            # is created. During the FWD pass we store the sequence_nr
+            # corresponding to the last autograd Node created on this fx
+            # node's meta.  A single aten op can create multiple autograd
+            # nodes as is the case with in-place foreach ops. During the
+            # BWD pass we retrieve the sequence_nr stored on the current
+            # executing autograd Node. See NOTE [ Sequence Number ].
+            if current_meta.get("in_grad_fn", 0) > 0:
+                new_seq_nr = current_meta["grad_fn_seq_nr"][-1]
+            node.meta["seq_nr"] = new_seq_nr
+
+        elif self.module_stack:
+            node.meta['nn_module_stack'] = copy.copy(self.module_stack)
+        return node
+
+    @compatibility(is_backward_compatible=True)
+    def proxy(self, node: Node) -> 'Proxy':
+        return Proxy(node, self)
+
+    @compatibility(is_backward_compatible=True)
+    def create_proxy(self, kind: str, target: Target, args: Tuple[Any, ...], kwargs: Dict[str, Any],
+                     name: Optional[str] = None, type_expr : Optional[Any] = None,
+                     proxy_factory_fn: Callable[[Node], 'Proxy'] = None):
+        '''
+        Create a Node from the given arguments, then return the Node
+        wrapped in a Proxy object.
+
+        If kind = 'placeholder', then we're creating a Node that
+        represents the parameter of a function. If we need to encode
+        a default parameter, we use the ``args`` tuple. ``args`` is
+        otherwise empty for ``placeholder`` Nodes.
+        '''
+
+        args_ = self.create_arg(args)
+        kwargs_ = self.create_arg(kwargs)
+        assert isinstance(args_, tuple)
+        assert isinstance(kwargs_, dict)
+
+        node = self.create_node(kind, target, args_, kwargs_, name, type_expr)
+
+        if not proxy_factory_fn:
+            proxy = self.proxy(node)
+        else:
+            proxy = proxy_factory_fn(node)
+
+        if self.record_stack_traces and not proxy.node.stack_trace:
+            user_frame = self._find_user_frame()
+            if user_frame:
+                summary = traceback.extract_stack(user_frame)
+                tb_lines = summary.format()
+                # stack_trace would have innermost frame at the bottom
+                proxy.node.stack_trace = ''.join(tb_lines)
+
+        return proxy
+
+    def _find_user_frame(self):
+        """
+        Find the Python stack frame executing the user code during
+        symbolic tracing.
+        """
+        # We have to do a little dance here. Basically, walk up the callstack and
+        # record the first frame not in the pytorch source. This is the frame executing
+        # the user code during tracing.
+        frame = inspect.currentframe()
+
+        pt_files = ['torch/fx/proxy.py',
+                    'torch/fx/_symbolic_trace.py',
+                    'torch/fx/experimental/proxy_tensor.py',
+                    'torch/_ops.py',
+                    'torch/_tensor.py',
+                    'torch/utils/_python_dispatch.py',
+                    'torch/_prims_common/wrappers.py',
+                    'torch/_refs/__init__.py',
+                    'torch/_refs/nn/functional/__init__.py',
+                    'torch/utils/_stats.py',
+                    ]
+        while frame:
+            frame = frame.f_back
+            if frame and all(not frame.f_code.co_filename.endswith(file) for file in pt_files):
+                break
+
+        if not frame:
+            return None
+
+        return frame
+
+    @compatibility(is_backward_compatible=True)
+    def create_arg(self, a: Any) -> Argument:
+        """
+        A method that lowers the objects seen as arguments during symbolic evaluation
+        into Argument types that can be stored in IR.
+
+        Can be overridden to support more trace-specific types.
+        """
+        if not isinstance(a, Proxy) and hasattr(a, '__fx_create_arg__'):
+            return a.__fx_create_arg__(self)
+        # aggregates
+        elif isinstance(a, tuple) and hasattr(a, '_fields'):
+            # NamedTuple constructors don't seem to like getting a generator
+            # expression as an argument to their constructor, so build this
+            # intermediate tuple and unpack it into the NamedTuple constructor
+            args = tuple(self.create_arg(elem) for elem in a)
+            return type(a)(*args)  # type: ignore[arg-type]
+        elif isinstance(a, (tuple, list)):
+            return type(a)(self.create_arg(elem) for elem in a)
+        elif isinstance(a, dict):
+            r = {}
+            for k, v in a.items():
+                # Check for invalid dict keys. We do not want a Proxy to appear
+                # anywhere within the key. Since keys can be collection types,
+                # we iterate through the key with map_aggregate
+                k = self.create_arg(k)
+
+                def no_node(arg):
+                    if isinstance(arg, Node):
+                        raise RuntimeError("Keys for dictionaries used as an argument cannot contain a "
+                                           f"Node. Got key: {k}")
+                map_aggregate(k, no_node)
+
+                r[k] = self.create_arg(v)
+            return r
+        elif isinstance(a, slice):
+            return slice(self.create_arg(a.start), self.create_arg(a.stop), self.create_arg(a.step))
+
+        elif isinstance(a, range):
+            return range(self.create_arg(a.start), self.create_arg(a.stop), self.create_arg(a.step))
+
+        elif isinstance(a, torch._ops.OpOverload):
+            return a
+
+        if isinstance(a, Proxy):
+            # base case: we unwrap the Proxy object
+            return a.node
+
+        if is_dataclass(a):
+            kwargs = {field.name: self.create_arg(getattr(a, field.name)) for field in fields(a)}
+            return self.create_node("call_function", a.__class__, (), kwargs)
+
+        elif isinstance(a, (*base_types, enum.Enum)) or a is None or a is ...:
+            return a
+        raise NotImplementedError(f"argument of type: {type(a)}")
+
+    @compatibility(is_backward_compatible=True)
+    def to_bool(self, obj: 'Proxy') -> bool:
+        """Called when a proxy object is being converted to a boolean, such as
+        when used in control flow.  Normally we don't know what to do because
+        we don't know the value of the proxy, but a custom tracer can attach more
+        information to the graph node using create_node and can choose to return a value.
+        """
+        raise TraceError('symbolically traced variables cannot be used as inputs to control flow')
+
+    @compatibility(is_backward_compatible=True)
+    def iter(self, obj: 'Proxy') -> Iterator:
+        """Called when a proxy object is being iterated over, such as
+        when used in control flow.  Normally we don't know what to do because
+        we don't know the value of the proxy, but a custom tracer can attach more
+        information to the graph node using create_node and can choose to return an iterator.
+        """
+        raise TraceError('Proxy object cannot be iterated. This can be '
+                         'attempted when the Proxy is used in a loop or'
+                         ' as a *args or **kwargs function argument. '
+                         'See the torch.fx docs on pytorch.org for a '
+                         'more detailed explanation of what types of '
+                         'control flow can be traced, and check out the'
+                         ' Proxy docstring for help troubleshooting '
+                         'Proxy iteration errors')
+
+    @compatibility(is_backward_compatible=True)
+    def keys(self, obj: 'Proxy') -> Any:
+        """Called when a proxy object is has the keys() method called.
+        This is what happens when ** is called on a proxy. This should return an
+        iterator it ** is suppose to work in your custom tracer.
+        """
+        return Attribute(obj, 'keys')()
+
+
+# used in Proxy object when just appending to the graph while not tracing.
+@compatibility(is_backward_compatible=True)
+class GraphAppendingTracer(TracerBase):
+    def __init__(self, graph: Graph):
+        super().__init__()
+        self.graph = graph
+        self.scope = Scope("", None)
+        self.module_stack = collections.OrderedDict()
+        self.node_name_to_scope = {}
+
+@compatibility(is_backward_compatible=False)
+def assert_fn(x):
+    assert x
+
+@compatibility(is_backward_compatible=True)
+class TraceError(ValueError):
+    pass
+
+@compatibility(is_backward_compatible=True)
+class Proxy:
+    """
+    ``Proxy`` objects are ``Node`` wrappers that flow through the
+    program during symbolic tracing and record all the operations
+    (``torch`` function calls, method calls, operators) that they touch
+    into the growing FX Graph.
+
+    If you're doing graph transforms, you can wrap your own ``Proxy``
+    method around a raw ``Node`` so that you can use the overloaded
+    operators to add additional things to a ``Graph``.
+
+    ``Proxy`` objects cannot be iterated. In other words, the symbolic
+    tracer will throw an error if a ``Proxy`` is used in a loop or as
+    an ``*args``/``**kwargs`` function argument.
+
+    There are two main ways around this:
+    1. Factor out the untraceable logic into a top-level function and
+    use ``fx.wrap`` on it.
+    2. If the control flow is static (i.e. the loop trip count is
+    based on some hyperparameter), the code can be kept in its original
+    position and refactored into something like::
+
+        for i in range(self.some_hyperparameter):
+            indexed_item = proxied_value[i]
+
+    For a more detailed description into the Proxy internals, check out
+    the "Proxy" section in `torch/fx/OVERVIEW.md`
+    """
+
+    @compatibility(is_backward_compatible=True)
+    def __init__(self, node: Node, tracer: 'Optional[TracerBase]' = None):
+        if tracer is None:
+            # This allows you to create a Proxy object around a raw Node
+            tracer = GraphAppendingTracer(node.graph)
+        self.tracer = tracer
+        self.node = node
+
+    def __repr__(self) -> str:
+        return f'Proxy({self.node.name})'
+
+    def __getattr__(self, k) -> 'Attribute':
+        # note: not added to the graph yet, if this is a method call
+        # we peephole optimize to the method invocation
+        return Attribute(self, k)
+
+    def __call__(self, *args, **kwargs) -> 'Proxy':
+        return self.tracer.create_proxy('call_method', '__call__', (self,) + args, kwargs)
+
+    def __iter__(self) -> Iterator['Proxy']:
+        frame = inspect.currentframe()
+        assert frame is not None
+        calling_frame = frame.f_back
+        assert calling_frame is not None
+        inst_list = list(dis.get_instructions(calling_frame.f_code))
+        if sys.version_info >= (3, 11):
+            from bisect import bisect_left
+            inst_idx = bisect_left(inst_list, calling_frame.f_lasti, key=lambda x: x.offset)
+        else:
+            inst_idx = calling_frame.f_lasti // 2
+        inst = inst_list[inst_idx]
+        if inst.opname == 'UNPACK_SEQUENCE':
+            return (self[i] for i in range(inst.argval))  # type: ignore[index]
+
+        return self.tracer.iter(self)
+
+    def __abs__(self):
+        return self.tracer.create_proxy('call_function', operator.abs, (self,), {})
+
+    def __bool__(self) -> bool:
+        if self.tracer.trace_asserts:
+            # check if this boolean is used in an assertion, bytecode pattern for assertions
+            # is pretty stable for Python 3.7--3.9
+            frame = inspect.currentframe()
+            assert frame is not None
+            calling_frame = frame.f_back
+            assert calling_frame is not None
+            insts = list(dis.get_instructions(calling_frame.f_code))
+            if sys.version_info >= (3, 11):
+                from bisect import bisect_left
+                cur = bisect_left(insts, calling_frame.f_lasti, key=lambda x: x.offset)
+            else:
+                cur = calling_frame.f_lasti // 2
+            inst = insts[cur]
+
+            if inst.opname == 'POP_JUMP_IF_TRUE':
+                first = insts[cur + 1]
+                assert inst.arg is not None
+                last = insts[inst.arg // 2 - 1]
+                starts_with_assert = (first.opname == 'LOAD_GLOBAL' and first.argval == 'AssertionError'
+                                      or first.opname == 'LOAD_ASSERTION_ERROR')
+                if starts_with_assert and last.opname == 'RAISE_VARARGS':
+                    self.tracer.create_proxy('call_function', assert_fn, (self,), {})
+                    return True
+
+        return self.tracer.to_bool(self)
+
+    @compatibility(is_backward_compatible=True)
+    def keys(self):
+        return self.tracer.keys(self)
+
+    def __len__(self):
+        raise RuntimeError("'len' is not supported in symbolic tracing by default. If you want "
+                           "this call to be recorded, please call torch.fx.wrap('len') at "
+                           "module scope")
+
+    @classmethod
+    def __torch_function__(cls, orig_method, types, args=None, kwargs=None):
+        args = args if args else ()
+        kwargs = kwargs if kwargs else {}
+
+        tracers : Dict[Any, None] = {}
+
+        def find_tracer(a):
+            if isinstance(a, cls):
+                tracers[a.tracer] = None
+        torch.fx.node.map_aggregate(args, find_tracer)
+        torch.fx.node.map_aggregate(kwargs, find_tracer)
+
+        if len(tracers) > 1:
+            raise RuntimeError(f'Found multiple different tracers {list(tracers.keys())} while '
+                               f'trying to trace operations {orig_method}')
+        tracer = next(iter(tracers.keys()))
+
+        if isinstance(orig_method, torch._C.ScriptMethod):
+            args = (orig_method.owner,) + args
+            return tracer.create_proxy('call_method', orig_method.name, args, kwargs)
+        if torch.overrides.is_tensor_method_or_property(orig_method):
+            return tracer.create_proxy('call_method', orig_method.__name__, args, kwargs)
+        else:
+            if isinstance(orig_method, torch._ops.HigherOrderOperator):
+                # TODO: Define how to symbolically trace HigherOrderOperators
+                raise RuntimeError("Unable to symbolically trace HigherOrderOperators")
+            return tracer.create_proxy('call_function', orig_method, args, kwargs,
+                                       name=tracer.graph._target_to_str(orig_method.__name__))
+
+
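+# Illustrative sketch (not part of the upstream module): wrapping raw Nodes in
+# Proxy objects to append operations to an existing Graph without running a
+# full symbolic trace (a GraphAppendingTracer is created implicitly).
+def _example_proxy_graph_appending():
+    from torch.fx import Graph, GraphModule
+
+    g = Graph()
+    x = Proxy(g.placeholder("x"))
+    y = x + 1            # recorded as call_function(operator.add)
+    z = y.relu()         # recorded as call_method("relu")
+    g.output(z.node)
+    gm = GraphModule(torch.nn.Module(), g)
+    return gm(torch.ones(2))   # expected: tensor([2., 2.])
+
+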
+@compatibility(is_backward_compatible=True)
+class Attribute(Proxy):
+    @compatibility(is_backward_compatible=True)
+    def __init__(self, root: Proxy, attr: str):
+        self.root = root
+        self.attr = attr
+        self.tracer = root.tracer
+        self._node: Optional[Node] = None
+
+    @property
+    def node(self):
+        # the node for attributes is added lazily, since most will just be method calls,
+        # which do not rely on a separate getattr node being added to the graph
+        if self._node is None:
+            self._node = self.tracer.create_proxy('call_function', getattr, (self.root, self.attr), {}).node
+        return self._node
+
+    def __call__(self, *args, **kwargs):
+        return self.tracer.create_proxy('call_method', self.attr, (self.root,) + args, kwargs)
+
+
+@compatibility(is_backward_compatible=False)
+class ParameterProxy(Proxy):
+    """
+    A special proxy which lets "shape", "size", "dim", and a few other
+    attribute accesses pass through to the underlying module parameter object,
+    so that conditional tests on these attributes will not throw an exception during tracing.
+    """
+    def __init__(self, tracer: TracerBase, node: Node, name, param):
+        super().__init__(node, tracer)
+        assert isinstance(param, torch.nn.Parameter)
+        self.param = param
+        self.name = name
+
+    def __repr__(self) -> str:
+        return f'ParameterProxy({self.name})'
+
+    @property
+    def shape(self):
+        return self.param.shape
+
+    def size(self):
+        return self.param.size()
+
+    def dim(self):
+        return self.param.dim()
+
+    @property
+    def ndim(self):
+        return self.param.ndim
+
+    def numel(self):
+        return self.param.numel()
+
+    def nelement(self):
+        return self.param.nelement()
+
+
+for method in magic_methods:
+    def _scope(method):
+        def impl(*args, **kwargs):
+            tracer = args[0].tracer
+            target = getattr(operator, method)
+            return tracer.create_proxy('call_function', target, args, kwargs)
+        impl.__name__ = method
+        as_magic = f'__{method.strip("_")}__'
+        setattr(Proxy, as_magic, impl)
+    _scope(method)
+
+def _define_reflectable(orig_method_name):
+    method_name = f'__r{orig_method_name.strip("_")}__'
+
+    def impl(self, rhs):
+        target = getattr(operator, orig_method_name)
+        return self.tracer.create_proxy('call_function', target, (rhs, self), {})
+    impl.__name__ = method_name
+    impl.__qualname__ = method_name
+    setattr(Proxy, method_name, impl)
+
+for orig_method_name in reflectable_magic_methods:
+    _define_reflectable(orig_method_name)
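+
+
+# Illustrative sketch (not part of the upstream module): reflected magic
+# methods let a literal appear on the left-hand side of an expression that
+# involves a Proxy.
+def _example_reflected_op() -> Graph:
+    g = Graph()
+    x = Proxy(g.placeholder("x"))
+    y = 10 - x           # dispatches to Proxy.__rsub__, recording operator.sub(10, x)
+    g.output(y.node)
+    return g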
diff --git a/MLPY/Lib/site-packages/torch/fx/subgraph_rewriter.py b/MLPY/Lib/site-packages/torch/fx/subgraph_rewriter.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b7364794cf863acdc4935f519d869250ef63d3a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/subgraph_rewriter.py
@@ -0,0 +1,349 @@
+from .graph_module import GraphModule
+from .graph import Graph
+from .node import Node
+from ._symbolic_trace import symbolic_trace
+from ._compatibility import compatibility
+
+import copy
+from dataclasses import dataclass
+from typing import Any, Callable, Dict, List, NamedTuple, Optional, Set, Union, TYPE_CHECKING
+import torch
+
+if TYPE_CHECKING:
+    from .passes.utils.matcher_with_name_node_map_utils import InternalMatch
+
+__all__ = ['Match', 'replace_pattern', 'replace_pattern_with_filters', "ReplacedPatterns"]
+
+@compatibility(is_backward_compatible=True)
+class Match(NamedTuple):
+    # Node from which the match was found
+    anchor: Node
+    # Maps nodes in the pattern subgraph to nodes in the larger graph
+    nodes_map: Dict[Node, Node]
+
+@compatibility(is_backward_compatible=False)
+@dataclass
+class ReplacedPatterns:
+    # Node from which the match was found
+    anchor: Node
+    # Maps nodes in the pattern subgraph to nodes in the larger graph
+    nodes_map: Dict[Node, Node]
+    # List of nodes that were added into the graph
+    replacements: List[Node]
+
+def _replace_attributes(gm: GraphModule, replacement: torch.nn.Module) -> None:
+    gm.delete_all_unused_submodules()
+
+    if isinstance(replacement, GraphModule):
+        replacement.graph.lint()
+
+    def try_get_attr(gm: torch.nn.Module, target: str) -> Optional[Any]:
+        module_path, _, attr_name = target.rpartition(".")
+        try:
+            mod: torch.nn.Module = gm.get_submodule(module_path)
+        except AttributeError:
+            return None
+        attr = getattr(mod, attr_name, None)
+        return attr
+
+    for node in gm.graph.nodes:
+        if node.op == "call_module" or node.op == "get_attr":
+
+            gm_attr = try_get_attr(gm, node.target)
+            replacement_attr = try_get_attr(replacement, node.target)
+
+            # CASE 1: This target already exists as an attribute in our
+            # result GraphModule. Whether or not it exists in
+            # `replacement`, the existing submodule takes precedence.
+            if gm_attr is not None:
+                continue
+
+            # CASE 2: The target exists as an attribute in `replacement`
+            # only, so we need to copy it over.
+            elif replacement_attr is not None:
+                new_attr = copy.deepcopy(replacement_attr)
+                if isinstance(replacement_attr, torch.nn.Module):
+                    gm.add_submodule(node.target, new_attr)
+                else:
+                    setattr(gm, node.target, new_attr)
+
+            # CASE 3: The target doesn't exist as an attribute in `gm`
+            # or `replacement`
+            else:
+                raise RuntimeError("Attempted to create a \"", node.op,
+                                   "\" node during subgraph rewriting "
+                                   f"with target {node.target}, but "
+                                   "the referenced attribute does not "
+                                   "exist in the replacement GraphModule")
+
+    gm.graph.lint()
+
+
+@compatibility(is_backward_compatible=True)
+def replace_pattern(
+    gm: GraphModule,
+    pattern: Union[Callable, GraphModule],
+    replacement: Union[Callable, GraphModule]
+) -> List[Match]:
+    """
+    Matches all possible non-overlapping sets of operators and their
+    data dependencies (``pattern``) in the Graph of a GraphModule
+    (``gm``), then replaces each of these matched subgraphs with another
+    subgraph (``replacement``).
+
+    Args:
+        ``gm``: The GraphModule that wraps the Graph to operate on
+        ``pattern``: The subgraph to match in ``gm`` for replacement
+        ``replacement``: The subgraph to replace ``pattern`` with
+
+    Returns:
+        List[Match]: A list of ``Match`` objects representing the places
+        in the original graph that ``pattern`` was matched to. The list
+        is empty if there are no matches. ``Match`` is defined as:
+
+        .. code-block:: python
+
+            class Match(NamedTuple):
+                # Node from which the match was found
+                anchor: Node
+                # Maps nodes in the pattern subgraph to nodes in the larger graph
+                nodes_map: Dict[Node, Node]
+
+    Examples:
+
+    .. code-block:: python
+
+        import torch
+        from torch.fx import symbolic_trace, subgraph_rewriter
+
+        class M(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x, w1, w2):
+                m1 = torch.cat([w1, w2]).sum()
+                m2 = torch.cat([w1, w2]).sum()
+                return x + torch.max(m1) + torch.max(m2)
+
+        def pattern(w1, w2):
+            return torch.cat([w1, w2]).sum()
+
+        def replacement(w1, w2):
+            return torch.stack([w1, w2])
+
+        traced_module = symbolic_trace(M())
+
+        subgraph_rewriter.replace_pattern(traced_module, pattern, replacement)
+
+    The above code will first match ``pattern`` in the ``forward``
+    method of ``traced_module``. Pattern-matching is done based on
+    use-def relationships, not node names. For example, if you had
+    ``p = torch.cat([a, b])`` in ``pattern``, you could match
+    ``m = torch.cat([a, b])`` in the original ``forward`` function,
+    despite the variable names being different (``p`` vs ``m``).
+
+    The ``return`` statement in ``pattern`` is matched based on its
+    value only; it may or may not match to the ``return`` statement in
+    the larger graph. In other words, the pattern doesn't have to extend
+    to the end of the larger graph.
+
+    When the pattern is matched, it will be removed from the larger
+    function and replaced by ``replacement``. If there are multiple
+    matches for ``pattern`` in the larger function, each non-overlapping
+    match will be replaced. In the case of a match overlap, the first
+    found match in the set of overlapping matches will be replaced.
+    ("First" here being defined as the first in a topological ordering
+    of the Nodes' use-def relationships. In most cases, the first Node
+    is the parameter that appears directly after ``self``, while the
+    last Node is whatever the function returns.)
+
+    One important thing to note is that the parameters of the
+    ``pattern`` Callable must be used in the Callable itself,
+    and the parameters of the ``replacement`` Callable must match
+    the pattern. The first rule is why, in the above code block, the
+    ``forward`` function has parameters ``x, w1, w2``, but the
+    ``pattern`` function only has parameters ``w1, w2``. ``pattern``
+    doesn't use ``x``, so it shouldn't specify ``x`` as a parameter.
+    As an example of the second rule, consider replacing
+
+    .. code-block:: python
+
+        def pattern(x, y):
+            return torch.neg(x) + torch.relu(y)
+
+    with
+
+    .. code-block:: python
+
+        def replacement(x, y):
+            return torch.relu(x)
+
+    In this case, ``replacement`` needs the same number of parameters
+    as ``pattern`` (both ``x`` and ``y``), even though the parameter
+    ``y`` isn't used in ``replacement``.
+
+    After calling ``subgraph_rewriter.replace_pattern``, the generated
+    Python code looks like this:
+
+    .. code-block:: python
+
+        def forward(self, x, w1, w2):
+            stack_1 = torch.stack([w1, w2])
+            sum_1 = stack_1.sum()
+            stack_2 = torch.stack([w1, w2])
+            sum_2 = stack_2.sum()
+            max_1 = torch.max(sum_1)
+            add_1 = x + max_1
+            max_2 = torch.max(sum_2)
+            add_2 = add_1 + max_2
+            return add_2
+    """
+    match_and_replacements = _replace_pattern(gm, pattern, replacement)
+    return [Match(anchor=m.anchor, nodes_map=m.nodes_map) for m in match_and_replacements]
+
+
+# Experimental API, not backward compatible
+@compatibility(is_backward_compatible=False)
+def replace_pattern_with_filters(
+    gm: GraphModule,
+    pattern: Union[Callable, Graph, GraphModule],
+    replacement: Union[Callable, Graph, GraphModule],
+    match_filters: Optional[List[Callable[["InternalMatch", Graph, Graph], bool]]] = None,
+    ignore_literals: bool = False,
+) -> List[ReplacedPatterns]:
+    """
+    See replace_pattern for documentation. This function is an overload with an additional ``match_filters`` argument.
+
+    Args:
+        ``match_filters``: A list of functions that take in
+            (match: InternalMatch, original_graph: Graph, pattern_graph: Graph) and return a boolean indicating
+            whether the match satisfies the condition.
+            See matcher_utils.py for definition of InternalMatch.
+    """
+
+    return _replace_pattern(gm, pattern, replacement, match_filters, ignore_literals)
+
+
+def _replace_pattern(
+    gm: GraphModule,
+    pattern: Union[Callable, Graph, GraphModule],
+    replacement: Union[Callable, Graph, GraphModule],
+    match_filters: Optional[List[Callable[["InternalMatch", Graph, Graph], bool]]] = None,
+    ignore_literals: bool = False,
+) -> List[ReplacedPatterns]:
+
+    from torch.fx.passes.utils.matcher_utils import SubgraphMatcher, InternalMatch
+
+    if match_filters is None:
+        match_filters = []
+
+    # Get the graphs for `gm`, `pattern`, `replacement`
+    original_graph: Graph = gm.graph
+
+    if isinstance(pattern, GraphModule):
+        pattern_graph = pattern.graph
+    elif isinstance(pattern, Graph):
+        pattern_graph = pattern
+    else:
+        pattern_graph = symbolic_trace(pattern).graph
+
+    if isinstance(replacement, GraphModule):
+        replacement_graph = replacement.graph
+    elif isinstance(replacement, Graph):
+        replacement_graph = replacement
+    else:
+        replacement_graph = symbolic_trace(replacement).graph
+
+    matcher = SubgraphMatcher(pattern_graph, match_output=False, match_placeholder=False,
+                              remove_overlapping_matches=True, ignore_literals=ignore_literals)
+    _matches: List[InternalMatch] = matcher.match(original_graph)
+
+    # Filter out matches that don't match the filter
+    _matches = [
+        m for m in _matches
+        if all(match_filter(m, original_graph, pattern_graph)
+               for match_filter in match_filters)
+    ]
+
+    replacement_placeholders = [n for n in replacement_graph.nodes if n.op == "placeholder"]
+
+    # As we progressively replace nodes, we'll need to keep track of how the match results should change
+    match_changed_node: Dict[Node, Node] = {}
+
+    match_and_replacements = []
+    for match in _matches:
+
+        # Build the mapping between the replacement graph's inputs and the
+        # producer nodes of the matched inputs in the original graph
+
+        # Initialize `val_map` with mappings from placeholder nodes in
+        # `replacement` to their corresponding node in `original_graph`
+        assert len(match.placeholder_nodes) == len(replacement_placeholders)
+        val_map: Dict[Node, Node] = {}
+        for rn, gn in zip(replacement_placeholders, match.placeholder_nodes):
+            if isinstance(gn, Node):
+                val_map[rn] = match_changed_node.get(gn, gn)
+                if gn != val_map[rn]:
+                    # Update match.placeholder_nodes and match.nodes_map with the node that replaced gn
+                    gn_ind = match.placeholder_nodes.index(gn)
+                    match.placeholder_nodes[gn_ind] = match_changed_node[gn]
+                    map_key = list(match.nodes_map.keys())[list(match.nodes_map.values()).index(gn)]
+                    match.nodes_map[map_key] = match_changed_node[gn]
+            else:
+                val_map[rn] = gn
+
+        # Copy the replacement graph over
+        user_nodes: Set[Node] = set()
+        for n in match.returning_nodes:
+            for user in n.users:
+                user_nodes.add(user)
+        assert user_nodes, "The returning_nodes should have at least one user node"
+
+        if len(user_nodes) == 1:
+            first_user_node = next(iter(user_nodes))
+        else:
+            # If there are multiple user nodes, we need to find the first user node
+            # in the current execution order of the `original_graph`
+            for n in original_graph.nodes:
+                if n in user_nodes:
+                    first_user_node = n
+                    break
+
+        with original_graph.inserting_before(first_user_node):  # type: ignore[possibly-undefined]
+            copied_returning_nodes = original_graph.graph_copy(replacement_graph, val_map)
+
+        if isinstance(copied_returning_nodes, Node):
+            copied_returning_nodes = (copied_returning_nodes, )
+
+        # Get a list of nodes that have been replaced into the graph
+        replacement_nodes: List[Node] = [v for v in val_map.values() if v not in match.placeholder_nodes]
+
+        # Hook the output Node of the replacement subgraph in to the
+        # original Graph at the correct location
+        assert len(match.returning_nodes) == len(copied_returning_nodes)
+        for gn, copied_node in zip(match.returning_nodes, copied_returning_nodes):
+            gn.replace_all_uses_with(copied_node)
+            match_changed_node[gn] = copied_node
+        # Remove the original nodes
+        for node in reversed(pattern_graph.nodes):
+            if node.op != "placeholder" and node.op != "output":
+                gn = match.nodes_map[node]
+                gm.graph.erase_node(gn)
+
+        match_and_replacements.append(
+            ReplacedPatterns(
+                anchor=match.anchors[0],
+                nodes_map=match.nodes_map,
+                replacements=replacement_nodes
+            )
+        )
+
+    # Update the passed-in GraphModule to reflect the new state of
+    # `original_graph`
+    gm.recompile()
+
+    # If `replacement` was an nn.Module, we'll need to make sure that
+    # all the submodules have been copied over correctly
+    if isinstance(replacement, torch.nn.Module):
+        _replace_attributes(gm, replacement)
+
+    return match_and_replacements
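`replace_pattern_with_filters` above is documented only by reference to `replace_pattern`; here is a hedged usage sketch under the signature shown in this file. The module, pattern, replacement, and filter below are illustrative, not taken from the diff:

```python
import torch
from torch.fx import symbolic_trace, subgraph_rewriter

class M(torch.nn.Module):
    def forward(self, x, y):
        return torch.neg(x) + torch.relu(y)

def pattern(a, b):
    return torch.neg(a) + torch.relu(b)

def replacement(a, b):
    return torch.relu(a) + torch.neg(b)

def small_pattern_only(match, original_graph, pattern_graph):
    # A match filter receives (InternalMatch, original Graph, pattern Graph)
    # and returns True to keep the match; this one accepts small patterns.
    return len(pattern_graph.nodes) < 10

traced = symbolic_trace(M())
replaced = subgraph_rewriter.replace_pattern_with_filters(
    traced, pattern, replacement, match_filters=[small_pattern_only]
)
# `replaced` is a list of ReplacedPatterns (anchor, nodes_map, replacements)
```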
diff --git a/MLPY/Lib/site-packages/torch/fx/tensor_type.py b/MLPY/Lib/site-packages/torch/fx/tensor_type.py
new file mode 100644
index 0000000000000000000000000000000000000000..eec0105c846d2a9249c9a930738db2ec8b8b2aab
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/tensor_type.py
@@ -0,0 +1,104 @@
+from torch.fx.experimental.unification import Var  # type: ignore[attr-defined]
+
+from ._compatibility import compatibility
+
+
+@compatibility(is_backward_compatible=False)
+class TensorType:
+    """
+    TensorType defines a type for tensors, which consists of a list of dimensions.
+    Example:
+        class M(torch.nn.Module):
+            def forward(self, x:TensorType((1,2,3, Dyn)), y:TensorType((1,2,3, Dyn))):
+                return torch.add(x, y)
+    """
+
+    def __init__(self, dim):
+        self.__origin__ = TensorType
+        self.__args__ = dim
+
+    def __repr__(self):
+        return f'TensorType[{self.__args__}]'
+
+    def __eq__(self, other):
+        if isinstance(other, self.__class__):
+            return list(self.__args__) == list(other.__args__)
+        else:
+            return False
+
+    @staticmethod
+    def __class_getitem__(*args):
+        if len(args) == 1 and isinstance(args[0], tuple):
+            args = args[0]
+        return TensorType(tuple(args))
+
+
+class _DynType:
+    """
+    _DynType defines a type which stands for the absence of type information.
+    """
+    def __init__(self):
+        self.__name__ = '_DynType'
+
+    def __eq__(self, other):
+        return isinstance(other, self.__class__)
+
+    def __str__(self):
+        return "Dyn"
+
+    def __repr__(self):
+        return "Dyn"
+
+
+Dyn = _DynType()
+
+@compatibility(is_backward_compatible=False)
+def is_consistent(t1, t2):
+    """
+    A binary relation denoted by ~ that determines if t1 is consistent with t2.
+    The relation is reflexive and symmetric, but not transitive.
+    Returns True if t1 and t2 are consistent and False otherwise.
+    Example:
+        Dyn ~ TensorType((1,2,3))
+        int ~ Dyn
+        int ~ int
+        TensorType((1,Dyn,3)) ~ TensorType((1,2,3))
+    """
+
+    if t1 == t2:
+        return True
+
+    if t1 == Dyn or t2 == Dyn or isinstance(t1, Var) or isinstance(t2, Var):
+        return True
+
+    if isinstance(t1, TensorType) and isinstance(t2, TensorType):
+        return len(t1.__args__) == len(t2.__args__) and \
+            all(is_consistent(elem1, elem2) for elem1, elem2 in zip(t1.__args__, t2.__args__))
+    else:
+        return False
+
+
+@compatibility(is_backward_compatible=False)
+def is_more_precise(t1, t2):
+    """
+    A binary relation denoted by <= that determines if t1 is more precise than t2.
+    The relation is reflexive and transitive.
+    Returns True if t1 is more precise than t2 and False otherwise.
+    Example:
+        TensorType((1,2,3)) <= Dyn
+        int <= Dyn
+        int <= int
+        TensorType((1,2,3)) <= TensorType((1,Dyn,3))
+    """
+    if t1 == t2:
+        return True
+
+    if isinstance(t2, _DynType):
+        return True
+
+    if isinstance(t1, TensorType) and isinstance(t2, TensorType):
+        return len(t1.__args__) == len(t2.__args__) and \
+            all(is_more_precise(elem1, elem2) for elem1, elem2 in zip(t1.__args__, t2.__args__))
+
+    else:
+        return False
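A small, hedged self-check of the relations defined in this file, using only the classes and functions declared above (imported from `torch.fx.tensor_type`):

```python
from torch.fx.tensor_type import Dyn, TensorType, is_consistent, is_more_precise

t = TensorType((1, 2, 3))      # explicit constructor
u = TensorType[1, Dyn, 3]      # __class_getitem__ builds the same kind of object

# Consistency (~): Dyn is consistent with anything; ranks must agree.
assert is_consistent(t, u)
assert is_consistent(Dyn, t)
assert not is_consistent(t, TensorType((1, 2)))

# Precision (<=): every type is more precise than Dyn, and a fixed dimension
# is more precise than Dyn in the same position.
assert is_more_precise(t, Dyn)
assert is_more_precise(t, u)        # (1, 2, 3) <= (1, Dyn, 3)
assert not is_more_precise(u, t)
```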
diff --git a/MLPY/Lib/site-packages/torch/fx/traceback.py b/MLPY/Lib/site-packages/torch/fx/traceback.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6ea48ca1107523575a36ff822b10b81373e2046
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/fx/traceback.py
@@ -0,0 +1,99 @@
+import traceback
+from contextlib import contextmanager
+from typing import List, Any, Dict
+from ._compatibility import compatibility
+
+__all__ = ['preserve_node_meta', 'has_preserved_node_meta',
+           'set_stack_trace', 'set_grad_fn_seq_nr', 'reset_grad_fn_seq_nr',
+           'format_stack', 'set_current_meta', 'get_current_meta']
+
+current_meta: Dict[str, Any] = {}
+should_preserve_node_meta = False
+
+
+@compatibility(is_backward_compatible=False)
+@contextmanager
+def preserve_node_meta():
+    global should_preserve_node_meta
+
+    saved_should_preserve_node_meta = should_preserve_node_meta
+    try:
+        should_preserve_node_meta = True
+        yield
+    finally:
+        should_preserve_node_meta = saved_should_preserve_node_meta
+
+
+@compatibility(is_backward_compatible=False)
+def set_stack_trace(stack : List[str]):
+    global current_meta
+
+    if should_preserve_node_meta and stack:
+        current_meta["stack_trace"] = "".join(stack)
+
+
+@compatibility(is_backward_compatible=False)
+def set_grad_fn_seq_nr(seq_nr):
+    global current_meta
+
+    if should_preserve_node_meta:
+        # The seq_nr is captured by eager mode in the grad_fn during forward
+        current_meta["grad_fn_seq_nr"] = current_meta.get("grad_fn_seq_nr", []) + [seq_nr]
+        current_meta["in_grad_fn"] = current_meta.get("in_grad_fn", 0) + 1
+
+
+@compatibility(is_backward_compatible=False)
+def reset_grad_fn_seq_nr():
+    # NB: reset state properly, this would be helpful towards supporting
+    #     reentrant autograd if we actually wanted to do that.
+    global current_meta
+    if should_preserve_node_meta:
+        current_level = current_meta.get("in_grad_fn", 0)
+        assert current_level > 0
+        if current_level == 1:
+            del current_meta["in_grad_fn"]
+            del current_meta["grad_fn_seq_nr"]
+        else:
+            current_meta["in_grad_fn"] = current_level - 1
+            current_meta["grad_fn_seq_nr"].pop()
+
+
+@compatibility(is_backward_compatible=False)
+def format_stack() -> List[str]:
+    if should_preserve_node_meta:
+        return [current_meta.get("stack_trace", "")]
+    else:
+        # fallback to traceback.format_stack()
+        return traceback.format_list(traceback.extract_stack()[:-1])
+
+
+@compatibility(is_backward_compatible=False)
+def has_preserved_node_meta() -> bool:
+    return should_preserve_node_meta
+
+
+@compatibility(is_backward_compatible=False)
+@contextmanager
+def set_current_meta(node):
+    global current_meta
+    if should_preserve_node_meta and node.meta:
+        saved_meta = current_meta
+        try:
+            current_meta = node.meta.copy()
+
+            # Append (node.name, node.target) onto "from_node" for provenance tracking
+            if "from_node" not in current_meta:
+                current_meta["from_node"] = [(node.name, node.target)]
+            elif current_meta["from_node"][-1][0] != node.name:
+                current_meta["from_node"].append((node.name, node.target))
+
+            yield
+        finally:
+            current_meta = saved_meta
+    else:
+        yield
+
+
+@compatibility(is_backward_compatible=False)
+def get_current_meta() -> Dict[str, Any]:
+    return current_meta
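The helpers above are meant to be used together: a graph transform enters `preserve_node_meta()` once, then wraps each node it re-creates in `set_current_meta(node)` so that `get_current_meta()` and `format_stack()` report the preserved provenance instead of the live Python stack. A hedged sketch (`node` stands for any `torch.fx.Node` with a populated `meta` dict; the function name is illustrative):

```python
import torch.fx
import torch.fx.traceback as fx_traceback

def recreate_with_provenance(node: torch.fx.Node) -> None:
    with fx_traceback.preserve_node_meta():
        assert fx_traceback.has_preserved_node_meta()
        with fx_traceback.set_current_meta(node):
            # Node-creation code placed here can read the preserved metadata,
            # e.g. to copy stack_trace / from_node onto newly created nodes.
            meta = fx_traceback.get_current_meta()
            stack = fx_traceback.format_stack()  # preserved stack_trace, if any
            print(meta.get("from_node"), stack[0][:80])
```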
diff --git a/MLPY/Lib/site-packages/torch/hub.py b/MLPY/Lib/site-packages/torch/hub.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5e3a9e55397b59799b4eeb3846c1ceb480eab4c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/hub.py
@@ -0,0 +1,764 @@
+import contextlib
+import errno
+import hashlib
+import json
+import os
+import re
+import shutil
+import sys
+import tempfile
+import torch
+import uuid
+import warnings
+import zipfile
+from pathlib import Path
+from typing import Dict, Optional, Any
+from urllib.error import HTTPError, URLError
+from urllib.request import urlopen, Request
+from urllib.parse import urlparse  # noqa: F401
+from torch.serialization import MAP_LOCATION
+
+class _Faketqdm:  # type: ignore[no-redef]
+
+    def __init__(self, total=None, disable=False,
+                 unit=None, *args, **kwargs):
+        self.total = total
+        self.disable = disable
+        self.n = 0
+        # Ignore all extra *args and **kwargs unless you want to reinvent tqdm
+
+    def update(self, n):
+        if self.disable:
+            return
+
+        self.n += n
+        if self.total is None:
+            sys.stderr.write(f"\r{self.n:.1f} bytes")
+        else:
+            sys.stderr.write(f"\r{100 * self.n / float(self.total):.1f}%")
+        sys.stderr.flush()
+
+    # Don't bother implementing; use real tqdm if you want
+    def set_description(self, *args, **kwargs):
+        pass
+
+    def write(self, s):
+        sys.stderr.write(f"{s}\n")
+
+    def close(self):
+        self.disable = True
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self.disable:
+            return
+
+        sys.stderr.write('\n')
+
+try:
+    from tqdm import tqdm  # If tqdm is installed use it, otherwise use the fake wrapper
+except ImportError:
+    tqdm = _Faketqdm
+
+__all__ = [
+    'download_url_to_file',
+    'get_dir',
+    'help',
+    'list',
+    'load',
+    'load_state_dict_from_url',
+    'set_dir',
+]
+
+# matches bfd8deac from resnet18-bfd8deac.pth
+HASH_REGEX = re.compile(r'-([a-f0-9]*)\.')
+
+_TRUSTED_REPO_OWNERS = ("facebookresearch", "facebookincubator", "pytorch", "fairinternal")
+ENV_GITHUB_TOKEN = 'GITHUB_TOKEN'
+ENV_TORCH_HOME = 'TORCH_HOME'
+ENV_XDG_CACHE_HOME = 'XDG_CACHE_HOME'
+DEFAULT_CACHE_DIR = '~/.cache'
+VAR_DEPENDENCY = 'dependencies'
+MODULE_HUBCONF = 'hubconf.py'
+READ_DATA_CHUNK = 128 * 1024
+_hub_dir: Optional[str] = None
+
+
+@contextlib.contextmanager
+def _add_to_sys_path(path):
+    sys.path.insert(0, path)
+    try:
+        yield
+    finally:
+        sys.path.remove(path)
+
+
+# Copied from tools/shared/module_loader to be included in torch package
+def _import_module(name, path):
+    import importlib.util
+    from importlib.abc import Loader
+    spec = importlib.util.spec_from_file_location(name, path)
+    assert spec is not None
+    module = importlib.util.module_from_spec(spec)
+    assert isinstance(spec.loader, Loader)
+    spec.loader.exec_module(module)
+    return module
+
+
+def _remove_if_exists(path):
+    if os.path.exists(path):
+        if os.path.isfile(path):
+            os.remove(path)
+        else:
+            shutil.rmtree(path)
+
+
+def _git_archive_link(repo_owner, repo_name, ref):
+    # See https://docs.github.com/en/rest/reference/repos#download-a-repository-archive-zip
+    return f"https://github.com/{repo_owner}/{repo_name}/zipball/{ref}"
+
+
+def _load_attr_from_module(module, func_name):
+    # Check if callable is defined in the module
+    if func_name not in dir(module):
+        return None
+    return getattr(module, func_name)
+
+
+def _get_torch_home():
+    torch_home = os.path.expanduser(
+        os.getenv(ENV_TORCH_HOME,
+                  os.path.join(os.getenv(ENV_XDG_CACHE_HOME,
+                                         DEFAULT_CACHE_DIR), 'torch')))
+    return torch_home
+
+
+def _parse_repo_info(github):
+    if ':' in github:
+        repo_info, ref = github.split(':')
+    else:
+        repo_info, ref = github, None
+    repo_owner, repo_name = repo_info.split('/')
+
+    if ref is None:
+        # The ref wasn't specified by the user, so we need to figure out the
+        # default branch: main or master. Our assumption is that if main exists
+        # then it's the default branch, otherwise it's master.
+        try:
+            with urlopen(f"https://github.com/{repo_owner}/{repo_name}/tree/main/"):
+                ref = 'main'
+        except HTTPError as e:
+            if e.code == 404:
+                ref = 'master'
+            else:
+                raise
+        except URLError as e:
+            # No internet connection, need to check for cache as last resort
+            for possible_ref in ("main", "master"):
+                if os.path.exists(f"{get_dir()}/{repo_owner}_{repo_name}_{possible_ref}"):
+                    ref = possible_ref
+                    break
+            if ref is None:
+                raise RuntimeError(
+                    "It looks like there is no internet connection and the "
+                    f"repo could not be found in the cache ({get_dir()})"
+                ) from e
+    return repo_owner, repo_name, ref
+
+
+def _read_url(url):
+    with urlopen(url) as r:
+        return r.read().decode(r.headers.get_content_charset('utf-8'))
+
+
+def _validate_not_a_forked_repo(repo_owner, repo_name, ref):
+    # Use urlopen to avoid depending on local git.
+    headers = {'Accept': 'application/vnd.github.v3+json'}
+    token = os.environ.get(ENV_GITHUB_TOKEN)
+    if token is not None:
+        headers['Authorization'] = f'token {token}'
+    for url_prefix in (
+            f'https://api.github.com/repos/{repo_owner}/{repo_name}/branches',
+            f'https://api.github.com/repos/{repo_owner}/{repo_name}/tags'):
+        page = 0
+        while True:
+            page += 1
+            url = f'{url_prefix}?per_page=100&page={page}'
+            response = json.loads(_read_url(Request(url, headers=headers)))
+            # Empty response means no more data to process
+            if not response:
+                break
+            for br in response:
+                if br['name'] == ref or br['commit']['sha'].startswith(ref):
+                    return
+
+    raise ValueError(f'Cannot find {ref} in https://github.com/{repo_owner}/{repo_name}. '
+                     'If it\'s a commit from a forked repo, please call hub.load() with forked repo directly.')
+
+
+def _get_cache_or_reload(github, force_reload, trust_repo, calling_fn, verbose=True, skip_validation=False):
+    # Setup hub_dir to save downloaded files
+    hub_dir = get_dir()
+    os.makedirs(hub_dir, exist_ok=True)
+    # Parse github repo information
+    repo_owner, repo_name, ref = _parse_repo_info(github)
+    # GitHub allows branch names containing a slash '/', which causes
+    # confusion with paths on both Linux and Windows. Backslashes are not
+    # allowed in GitHub branch names, so there is no need to worry about them.
+    normalized_br = ref.replace('/', '_')
+    # GitHub renames the folder repo-v1.x.x to repo-1.x.x.
+    # We don't know the folder name until we download the zip file and
+    # inspect it, so to check whether a cached repo exists we need to
+    # normalize the folder names.
+    owner_name_branch = '_'.join([repo_owner, repo_name, normalized_br])
+    repo_dir = os.path.join(hub_dir, owner_name_branch)
+    # Check that the repo is in the trusted list
+    _check_repo_is_trusted(repo_owner, repo_name, owner_name_branch, trust_repo=trust_repo, calling_fn=calling_fn)
+
+    use_cache = (not force_reload) and os.path.exists(repo_dir)
+
+    if use_cache:
+        if verbose:
+            sys.stderr.write(f'Using cache found in {repo_dir}\n')
+    else:
+        # Validate the tag/branch is from the original repo instead of a forked repo
+        if not skip_validation:
+            _validate_not_a_forked_repo(repo_owner, repo_name, ref)
+
+        cached_file = os.path.join(hub_dir, normalized_br + '.zip')
+        _remove_if_exists(cached_file)
+
+        try:
+            url = _git_archive_link(repo_owner, repo_name, ref)
+            sys.stderr.write(f'Downloading: \"{url}\" to {cached_file}\n')
+            download_url_to_file(url, cached_file, progress=False)
+        except HTTPError as err:
+            if err.code == 300:
+                # Getting a 300 Multiple Choices error likely means that the ref is both a tag and a branch
+                # in the repo. This can be disambiguated by explicitly using refs/heads/ or refs/tags
+                # See https://git-scm.com/book/en/v2/Git-Internals-Git-References
+                # Here, we do the same as git: we throw a warning, and assume the user wanted the branch
+                warnings.warn(
+                    f"The ref {ref} is ambiguous. Perhaps it is both a tag and a branch in the repo? "
+                    "Torchhub will now assume that it's a branch. "
+                    "You can disambiguate tags and branches by explicitly passing refs/heads/branch_name or "
+                    "refs/tags/tag_name as the ref. That might require using skip_validation=True."
+                )
+                disambiguated_branch_ref = f"refs/heads/{ref}"
+                url = _git_archive_link(repo_owner, repo_name, ref=disambiguated_branch_ref)
+                download_url_to_file(url, cached_file, progress=False)
+            else:
+                raise
+
+        with zipfile.ZipFile(cached_file) as cached_zipfile:
+            extracted_repo_name = cached_zipfile.infolist()[0].filename
+            extracted_repo = os.path.join(hub_dir, extracted_repo_name)
+            _remove_if_exists(extracted_repo)
+            # Unzip the code and rename the base folder
+            cached_zipfile.extractall(hub_dir)
+
+        _remove_if_exists(cached_file)
+        _remove_if_exists(repo_dir)
+        shutil.move(extracted_repo, repo_dir)  # rename the repo
+
+    return repo_dir
+
+
+def _check_repo_is_trusted(repo_owner, repo_name, owner_name_branch, trust_repo, calling_fn="load"):
+    hub_dir = get_dir()
+    filepath = os.path.join(hub_dir, "trusted_list")
+
+    if not os.path.exists(filepath):
+        Path(filepath).touch()
+    with open(filepath) as file:
+        trusted_repos = tuple(line.strip() for line in file)
+
+    # To minimize friction of introducing the new trust_repo mechanism, we consider that
+    # if a repo was already downloaded by torchhub, then it is already trusted (even if it's not in the allowlist)
+    trusted_repos_legacy = next(os.walk(hub_dir))[1]
+
+    owner_name = '_'.join([repo_owner, repo_name])
+    is_trusted = (
+        owner_name in trusted_repos
+        or owner_name_branch in trusted_repos_legacy
+        or repo_owner in _TRUSTED_REPO_OWNERS
+    )
+
+    # TODO: Remove `None` option in 2.0 and change the default to "check"
+    if trust_repo is None:
+        if not is_trusted:
+            warnings.warn(
+                "You are about to download and run code from an untrusted repository. In a future release, this won't "
+                "be allowed. To add the repository to your trusted list, change the command to {calling_fn}(..., "
+                "trust_repo=False) and a command prompt will appear asking for an explicit confirmation of trust, "
+                f"or {calling_fn}(..., trust_repo=True), which will assume that the prompt is to be answered with "
+                f"'yes'. You can also use {calling_fn}(..., trust_repo='check') which will only prompt for "
+                f"confirmation if the repo is not already trusted. This will eventually be the default behaviour")
+        return
+
+    if (trust_repo is False) or (trust_repo == "check" and not is_trusted):
+        response = input(
+            f"The repository {owner_name} does not belong to the list of trusted repositories and as such cannot be downloaded. "
+            "Do you trust this repository and wish to add it to the trusted list of repositories (y/N)?")
+        if response.lower() in ("y", "yes"):
+            if is_trusted:
+                print("The repository is already trusted.")
+        elif response.lower() in ("n", "no", ""):
+            raise Exception("Untrusted repository.")
+        else:
+            raise ValueError(f"Unrecognized response {response}.")
+
+    # At this point we're sure that the user trusts the repo (or wants to trust it)
+    if not is_trusted:
+        with open(filepath, "a") as file:
+            file.write(owner_name + "\n")
+
+
+def _check_module_exists(name):
+    import importlib.util
+    return importlib.util.find_spec(name) is not None
+
+
+def _check_dependencies(m):
+    dependencies = _load_attr_from_module(m, VAR_DEPENDENCY)
+
+    if dependencies is not None:
+        missing_deps = [pkg for pkg in dependencies if not _check_module_exists(pkg)]
+        if len(missing_deps):
+            raise RuntimeError(f"Missing dependencies: {', '.join(missing_deps)}")
+
+
+def _load_entry_from_hubconf(m, model):
+    if not isinstance(model, str):
+        raise ValueError('Invalid input: model should be a string of the entrypoint function name')
+
+    # Note that if a missing dependency is imported at the top level of hubconf, it will
+    # throw before this function. It's a chicken-and-egg situation: we have to load
+    # hubconf to know what the dependencies are, but importing hubconf requires the
+    # missing package. This is fine; Python will raise a proper error message for users.
+    _check_dependencies(m)
+
+    func = _load_attr_from_module(m, model)
+
+    if func is None or not callable(func):
+        raise RuntimeError(f'Cannot find callable {model} in hubconf')
+
+    return func
+
+
+def get_dir():
+    r"""
+    Get the Torch Hub cache directory used for storing downloaded models & weights.
+
+    If :func:`~torch.hub.set_dir` is not called, default path is ``$TORCH_HOME/hub`` where
+    environment variable ``$TORCH_HOME`` defaults to ``$XDG_CACHE_HOME/torch``.
+    ``$XDG_CACHE_HOME`` follows the X Desktop Group (XDG) specification for the
+    Linux filesystem layout, with a default value of ``~/.cache`` if the
+    environment variable is not set.
+    """
+    # Issue warning to move data if old env is set
+    if os.getenv('TORCH_HUB'):
+        warnings.warn('TORCH_HUB is deprecated, please use env TORCH_HOME instead')
+
+    if _hub_dir is not None:
+        return _hub_dir
+    return os.path.join(_get_torch_home(), 'hub')
+
+
+def set_dir(d):
+    r"""
+    Optionally set the Torch Hub directory used to save downloaded models & weights.
+
+    Args:
+        d (str): path to a local folder to save downloaded models & weights.
+    """
+    global _hub_dir
+    _hub_dir = os.path.expanduser(d)
+
+
+def list(github, force_reload=False, skip_validation=False, trust_repo=None, verbose=True):
+    r"""
+    List all callable entrypoints available in the repo specified by ``github``.
+
+    Args:
+        github (str): a string with format "repo_owner/repo_name[:ref]" with an optional
+            ref (tag or branch). If ``ref`` is not specified, the default branch is assumed to be ``main`` if
+            it exists, and otherwise ``master``.
+            Example: 'pytorch/vision:0.10'
+        force_reload (bool, optional): whether to discard the existing cache and force a fresh download.
+            Default is ``False``.
+        skip_validation (bool, optional): if ``False``, torchhub will check that the branch or commit
+            specified by the ``github`` argument properly belongs to the repo owner. This will make
+            requests to the GitHub API; you can specify a non-default GitHub token by setting the
+            ``GITHUB_TOKEN`` environment variable. Default is ``False``.
+        trust_repo (bool, str or None): ``"check"``, ``True``, ``False`` or ``None``.
+            This parameter was introduced in v1.12 and helps ensure that users
+            only run code from repos that they trust.
+
+            - If ``False``, a prompt will ask the user whether the repo should
+              be trusted.
+            - If ``True``, the repo will be added to the trusted list and loaded
+              without requiring explicit confirmation.
+            - If ``"check"``, the repo will be checked against the list of
+              trusted repos in the cache. If it is not present in that list, the
+              behaviour will fall back onto the ``trust_repo=False`` option.
+            - If ``None``: this will raise a warning, inviting the user to set
+              ``trust_repo`` to either ``False``, ``True`` or ``"check"``. This
+              is only present for backward compatibility and will be removed in
+              v2.0.
+
+            Default is ``None`` and will eventually change to ``"check"`` in v2.0.
+        verbose (bool, optional): If ``False``, mute messages about hitting
+            local caches. Note that the message about first download cannot be
+            muted. Default is ``True``.
+
+    Returns:
+        list: The available callable entrypoints
+
+    Example:
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_HUB)
+        >>> entrypoints = torch.hub.list('pytorch/vision', force_reload=True)
+    """
+    repo_dir = _get_cache_or_reload(github, force_reload, trust_repo, "list", verbose=verbose,
+                                    skip_validation=skip_validation)
+
+    with _add_to_sys_path(repo_dir):
+        hubconf_path = os.path.join(repo_dir, MODULE_HUBCONF)
+        hub_module = _import_module(MODULE_HUBCONF, hubconf_path)
+
+    # We treat functions that start with '_' as internal helper functions and exclude them
+    entrypoints = [f for f in dir(hub_module) if callable(getattr(hub_module, f)) and not f.startswith('_')]
+
+    return entrypoints
+
+
+def help(github, model, force_reload=False, skip_validation=False, trust_repo=None):
+    r"""
+    Show the docstring of entrypoint ``model``.
+
+    Args:
+        github (str): a string with format ``repo_owner/repo_name[:ref]`` with an optional
+            ref (a tag or a branch). If ``ref`` is not specified, the default branch is assumed
+            to be ``main`` if it exists, and otherwise ``master``.
+            Example: 'pytorch/vision:0.10'
+        model (str): a string of entrypoint name defined in repo's ``hubconf.py``
+        force_reload (bool, optional): whether to discard the existing cache and force a fresh download.
+            Default is ``False``.
+        skip_validation (bool, optional): if ``False``, torchhub will check that the ref
+            specified by the ``github`` argument properly belongs to the repo owner. This will make
+            requests to the GitHub API; you can specify a non-default GitHub token by setting the
+            ``GITHUB_TOKEN`` environment variable. Default is ``False``.
+        trust_repo (bool, str or None): ``"check"``, ``True``, ``False`` or ``None``.
+            This parameter was introduced in v1.12 and helps ensure that users
+            only run code from repos that they trust.
+
+            - If ``False``, a prompt will ask the user whether the repo should
+              be trusted.
+            - If ``True``, the repo will be added to the trusted list and loaded
+              without requiring explicit confirmation.
+            - If ``"check"``, the repo will be checked against the list of
+              trusted repos in the cache. If it is not present in that list, the
+              behaviour will fall back onto the ``trust_repo=False`` option.
+            - If ``None``: this will raise a warning, inviting the user to set
+              ``trust_repo`` to either ``False``, ``True`` or ``"check"``. This
+              is only present for backward compatibility and will be removed in
+              v2.0.
+
+            Default is ``None`` and will eventually change to ``"check"`` in v2.0.
+    Example:
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_HUB)
+        >>> print(torch.hub.help('pytorch/vision', 'resnet18', force_reload=True))
+    """
+    repo_dir = _get_cache_or_reload(github, force_reload, trust_repo, "help", verbose=True,
+                                    skip_validation=skip_validation)
+
+    with _add_to_sys_path(repo_dir):
+        hubconf_path = os.path.join(repo_dir, MODULE_HUBCONF)
+        hub_module = _import_module(MODULE_HUBCONF, hubconf_path)
+
+    entry = _load_entry_from_hubconf(hub_module, model)
+
+    return entry.__doc__
+
+
+def load(repo_or_dir, model, *args, source='github', trust_repo=None, force_reload=False, verbose=True,
+         skip_validation=False,
+         **kwargs):
+    r"""
+    Load a model from a github repo or a local directory.
+
+    Note: Loading a model is the typical use case, but this can also be used
+    for loading other objects such as tokenizers, loss functions, etc.
+
+    If ``source`` is 'github', ``repo_or_dir`` is expected to be
+    of the form ``repo_owner/repo_name[:ref]`` with an optional
+    ref (a tag or a branch).
+
+    If ``source`` is 'local', ``repo_or_dir`` is expected to be a
+    path to a local directory.
+
+    Args:
+        repo_or_dir (str): If ``source`` is 'github',
+            this should correspond to a github repo with format ``repo_owner/repo_name[:ref]`` with
+            an optional ref (tag or branch), for example 'pytorch/vision:0.10'. If ``ref`` is not specified,
+            the default branch is assumed to be ``main`` if it exists, and otherwise ``master``.
+            If ``source`` is 'local'  then it should be a path to a local directory.
+        model (str): the name of a callable (entrypoint) defined in the
+            repo/dir's ``hubconf.py``.
+        *args (optional): the corresponding args for callable ``model``.
+        source (str, optional): 'github' or 'local'. Specifies how
+            ``repo_or_dir`` is to be interpreted. Default is 'github'.
+        trust_repo (bool, str or None): ``"check"``, ``True``, ``False`` or ``None``.
+            This parameter was introduced in v1.12 and helps ensure that users
+            only run code from repos that they trust.
+
+            - If ``False``, a prompt will ask the user whether the repo should
+              be trusted.
+            - If ``True``, the repo will be added to the trusted list and loaded
+              without requiring explicit confirmation.
+            - If ``"check"``, the repo will be checked against the list of
+              trusted repos in the cache. If it is not present in that list, the
+              behaviour will fall back onto the ``trust_repo=False`` option.
+            - If ``None``: this will raise a warning, inviting the user to set
+              ``trust_repo`` to either ``False``, ``True`` or ``"check"``. This
+              is only present for backward compatibility and will be removed in
+              v2.0.
+
+            Default is ``None`` and will eventually change to ``"check"`` in v2.0.
+        force_reload (bool, optional): whether to force a fresh download of
+            the github repo unconditionally. Does not have any effect if
+            ``source = 'local'``. Default is ``False``.
+        verbose (bool, optional): If ``False``, mute messages about hitting
+            local caches. Note that the message about first download cannot be
+            muted. Does not have any effect if ``source = 'local'``.
+            Default is ``True``.
+        skip_validation (bool, optional): if ``False``, torchhub will check that the branch or commit
+            specified by the ``github`` argument properly belongs to the repo owner. This will make
+            requests to the GitHub API; you can specify a non-default GitHub token by setting the
+            ``GITHUB_TOKEN`` environment variable. Default is ``False``.
+        **kwargs (optional): the corresponding kwargs for callable ``model``.
+
+    Returns:
+        The output of the ``model`` callable when called with the given
+        ``*args`` and ``**kwargs``.
+
+    Example:
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_HUB)
+        >>> # from a github repo
+        >>> repo = 'pytorch/vision'
+        >>> model = torch.hub.load(repo, 'resnet50', weights='ResNet50_Weights.IMAGENET1K_V1')
+        >>> # from a local directory
+        >>> path = '/some/local/path/pytorch/vision'
+        >>> # xdoctest: +SKIP
+        >>> model = torch.hub.load(path, 'resnet50', weights='ResNet50_Weights.DEFAULT')
+    """
+    source = source.lower()
+
+    if source not in ('github', 'local'):
+        raise ValueError(
+            f'Unknown source: "{source}". Allowed values: "github" | "local".')
+
+    if source == 'github':
+        repo_or_dir = _get_cache_or_reload(repo_or_dir, force_reload, trust_repo, "load",
+                                           verbose=verbose, skip_validation=skip_validation)
+
+    model = _load_local(repo_or_dir, model, *args, **kwargs)
+    return model
+
+
+def _load_local(hubconf_dir, model, *args, **kwargs):
+    r"""
+    Load a model from a local directory with a ``hubconf.py``.
+
+    Args:
+        hubconf_dir (str): path to a local directory that contains a
+            ``hubconf.py``.
+        model (str): name of an entrypoint defined in the directory's
+            ``hubconf.py``.
+        *args (optional): the corresponding args for callable ``model``.
+        **kwargs (optional): the corresponding kwargs for callable ``model``.
+
+    Returns:
+        a single model with corresponding pretrained weights.
+
+    Example:
+        >>> # xdoctest: +SKIP("stub local path")
+        >>> path = '/some/local/path/pytorch/vision'
+        >>> model = _load_local(path, 'resnet50', weights='ResNet50_Weights.IMAGENET1K_V1')
+    """
+    with _add_to_sys_path(hubconf_dir):
+        hubconf_path = os.path.join(hubconf_dir, MODULE_HUBCONF)
+        hub_module = _import_module(MODULE_HUBCONF, hubconf_path)
+
+        entry = _load_entry_from_hubconf(hub_module, model)
+        model = entry(*args, **kwargs)
+
+    return model
+
+
+def download_url_to_file(url: str, dst: str, hash_prefix: Optional[str] = None,
+                         progress: bool = True) -> None:
+    r"""Download object at the given URL to a local path.
+
+    Args:
+        url (str): URL of the object to download
+        dst (str): Full path where object will be saved, e.g. ``/tmp/temporary_file``
+        hash_prefix (str, optional): If not None, the SHA256 downloaded file should start with ``hash_prefix``.
+            Default: None
+        progress (bool, optional): whether or not to display a progress bar to stderr
+            Default: True
+
+    Example:
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_HUB)
+        >>> # xdoctest: +REQUIRES(POSIX)
+        >>> torch.hub.download_url_to_file('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth', '/tmp/temporary_file')
+
+    """
+    file_size = None
+    req = Request(url, headers={"User-Agent": "torch.hub"})
+    u = urlopen(req)
+    meta = u.info()
+    if hasattr(meta, 'getheaders'):
+        content_length = meta.getheaders("Content-Length")
+    else:
+        content_length = meta.get_all("Content-Length")
+    if content_length is not None and len(content_length) > 0:
+        file_size = int(content_length[0])
+
+    # We deliberately save it in a temp file and move it after
+    # download is complete. This prevents a local working checkpoint
+    # being overridden by a broken download.
+    # We deliberately do not use NamedTemporaryFile to avoid restrictive
+    # file permissions being applied to the downloaded file.
+    dst = os.path.expanduser(dst)
+    for seq in range(tempfile.TMP_MAX):
+        tmp_dst = dst + '.' + uuid.uuid4().hex + '.partial'
+        try:
+            f = open(tmp_dst, 'w+b')
+        except FileExistsError:
+            continue
+        break
+    else:
+        raise FileExistsError(errno.EEXIST, 'No usable temporary file name found')
+
+    try:
+        if hash_prefix is not None:
+            sha256 = hashlib.sha256()
+        with tqdm(total=file_size, disable=not progress,
+                  unit='B', unit_scale=True, unit_divisor=1024) as pbar:
+            while True:
+                buffer = u.read(READ_DATA_CHUNK)
+                if len(buffer) == 0:
+                    break
+                f.write(buffer)  # type: ignore[possibly-undefined]
+                if hash_prefix is not None:
+                    sha256.update(buffer)  # type: ignore[possibly-undefined]
+                pbar.update(len(buffer))
+
+        f.close()
+        if hash_prefix is not None:
+            digest = sha256.hexdigest()  # type: ignore[possibly-undefined]
+            if digest[:len(hash_prefix)] != hash_prefix:
+                raise RuntimeError(f'invalid hash value (expected "{hash_prefix}", got "{digest}")')
+        shutil.move(f.name, dst)
+    finally:
+        f.close()
+        if os.path.exists(f.name):
+            os.remove(f.name)
+
+
+# Hub used to automatically extract zip files that were manually compressed by users.
+# The legacy zip format expects only one file, produced by torch.save() < 1.6, inside the zip.
+# We should remove this support since the zipfile format is now the default for torch.save().
+def _is_legacy_zip_format(filename: str) -> bool:
+    if zipfile.is_zipfile(filename):
+        infolist = zipfile.ZipFile(filename).infolist()
+        return len(infolist) == 1 and not infolist[0].is_dir()
+    return False
+
+
+def _legacy_zip_load(filename: str, model_dir: str, map_location: MAP_LOCATION, weights_only: bool) -> Dict[str, Any]:
+    warnings.warn('Falling back to the old format < 1.6. This support will be '
+                  'deprecated in favor of default zipfile format introduced in 1.6. '
+                  'Please redo torch.save() to save it in the new zipfile format.')
+    # Note: extractall() defaults to overwrite file if exists. No need to clean up beforehand.
+    #       We deliberately don't handle tarfile here since our legacy serialization format was in tar.
+    #       E.g. resnet18-5c106cde.pth which is widely used.
+    with zipfile.ZipFile(filename) as f:
+        members = f.infolist()
+        if len(members) != 1:
+            raise RuntimeError('Only one file(not dir) is allowed in the zipfile')
+        f.extractall(model_dir)
+        extracted_name = members[0].filename
+        extracted_file = os.path.join(model_dir, extracted_name)
+    return torch.load(extracted_file, map_location=map_location, weights_only=weights_only)
+
+
+def load_state_dict_from_url(
+    url: str,
+    model_dir: Optional[str] = None,
+    map_location: MAP_LOCATION = None,
+    progress: bool = True,
+    check_hash: bool = False,
+    file_name: Optional[str] = None,
+    weights_only: bool = False,
+) -> Dict[str, Any]:
+    r"""Loads the Torch serialized object at the given URL.
+
+    If downloaded file is a zip file, it will be automatically
+    decompressed.
+
+    If the object is already present in `model_dir`, it's deserialized and
+    returned.
+    The default value of ``model_dir`` is ``<hub_dir>/checkpoints`` where
+    ``hub_dir`` is the directory returned by :func:`~torch.hub.get_dir`.
+
+    Args:
+        url (str): URL of the object to download
+        model_dir (str, optional): directory in which to save the object
+        map_location (optional): a function or a dict specifying how to remap storage locations (see torch.load)
+        progress (bool, optional): whether or not to display a progress bar to stderr.
+            Default: True
+        check_hash(bool, optional): If True, the filename part of the URL should follow the naming convention
+            ``filename-<sha256>.ext`` where ``<sha256>`` is the first eight or more
+            digits of the SHA256 hash of the contents of the file. The hash is used to
+            ensure unique names and to verify the contents of the file.
+            Default: False
+        file_name (str, optional): name for the downloaded file. Filename from ``url`` will be used if not set.
+        weights_only(bool, optional): If True, only weights will be loaded and no complex pickled objects.
+            Recommended for untrusted sources. See :func:`~torch.load` for more details.
+
+    Example:
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_HUB)
+        >>> state_dict = torch.hub.load_state_dict_from_url('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth')
+
+    """
+    # Issue warning to move data if old env is set
+    if os.getenv('TORCH_MODEL_ZOO'):
+        warnings.warn('TORCH_MODEL_ZOO is deprecated, please use env TORCH_HOME instead')
+
+    if model_dir is None:
+        hub_dir = get_dir()
+        model_dir = os.path.join(hub_dir, 'checkpoints')
+
+    os.makedirs(model_dir, exist_ok=True)
+
+    parts = urlparse(url)
+    filename = os.path.basename(parts.path)
+    if file_name is not None:
+        filename = file_name
+    cached_file = os.path.join(model_dir, filename)
+    if not os.path.exists(cached_file):
+        sys.stderr.write(f'Downloading: "{url}" to {cached_file}\n')
+        hash_prefix = None
+        if check_hash:
+            r = HASH_REGEX.search(filename)  # r is Optional[Match[str]]
+            hash_prefix = r.group(1) if r else None
+        download_url_to_file(url, cached_file, hash_prefix, progress=progress)
+
+    if _is_legacy_zip_format(cached_file):
+        return _legacy_zip_load(cached_file, model_dir, map_location, weights_only)
+    return torch.load(cached_file, map_location=map_location, weights_only=weights_only)
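The docstrings above already carry per-function examples; what they do not show is how ``set_dir``/``get_dir`` and ``trust_repo`` combine in a typical session. A hedged sketch (the cache path is arbitrary, the calls require network access, and ``pytorch/vision`` / ``resnet18`` are simply the repo and entrypoint used in the docstrings above):

```python
import torch

# Redirect the hub cache before anything is downloaded.
torch.hub.set_dir("/tmp/my_torch_hub")      # hypothetical cache location
print(torch.hub.get_dir())                  # -> /tmp/my_torch_hub

# trust_repo=True adds the repo to the trusted list and skips the prompt.
entrypoints = torch.hub.list("pytorch/vision", trust_repo=True)
model = torch.hub.load("pytorch/vision", "resnet18", trust_repo=True)
```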
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ATen.h b/MLPY/Lib/site-packages/torch/include/ATen/ATen.h
new file mode 100644
index 0000000000000000000000000000000000000000..60a33d74a04a0a1ae07d1e000b8c73bc3ca3cda4
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/ATen.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#if !defined(_MSC_VER) && __cplusplus < 201703L
+#error C++17 or later compatible compiler is required to use ATen.
+#endif
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+// TODO: try to remove this
+// There is some back story, see https://github.com/pytorch/pytorch/issues/48684
+#include 
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/AccumulateType.h b/MLPY/Lib/site-packages/torch/include/ATen/AccumulateType.h
new file mode 100644
index 0000000000000000000000000000000000000000..d7a26b07c647477e71f807d7c00f16ae8a4ab3b0
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/AccumulateType.h
@@ -0,0 +1,153 @@
+#pragma once
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+// Defines the accumulation type for a scalar type.
+// Example:
+//   using accscalar_t = acc_type<scalar_t, /*is_cuda=*/true>;
+//
+// Accumulation types are an important concept in numeric computing
+// because you frequently want to perform intermediate computations
+// at a higher precision than the input and output precision, to avoid
+// compounding internal rounding errors.  Accumulation is the most
+// well-known intermediate computation (it is of great importance for
+// sum reduction and matrix multiply, for example), but in PyTorch
+// acc_type ends up getting used for all sorts of other intermediate
+// computations, so it perhaps would be more accurately (ahem) called an
+// "accurate" type.  acc_type is especially important for reduced
+// precision operations like float16 and bfloat16, where relatively
+// benign looking inputs can easily end up overflowing/underflowing.
+//
+// acc_type is parametrized by whether or not you are running on CUDA,
+// because on CUDA double precision operations are expensive
+// and so by default, we don't actually want to use double as an
+// acc_type on CUDA.  A lot of things are typed out below, but
+// basically, the table is generated by a few rules:
+//
+//  If bool:
+//      Use 'bool' as acc_type.
+//  If floating point:
+//      If CUDA, use 'float' as acc_type (unless scalar_t is double),
+//      otherwise (CPU) use 'double'
+//  If integral:
+//      Use 'int64_t' as acc_type
+//
+// You're not forced to use this template; if you happen to know
+// something specific about your use case, you can specify your own
+// desired behavior.  This template, however, will give you a reasonable
+// default that will work for all dtypes supported in PyTorch.
+
+#if defined(__CUDACC__)
+#include 
+#include 
+#elif defined(__HIPCC__)
+#include 
+#include 
+#endif
+
+namespace at {
+
+template <typename T, c10::DeviceType D>
+struct AccumulateTypeDevice {};
+
+template <typename T, bool>
+struct AccumulateType {};
+
+template <typename T>
+struct AccumulateType<T, false> {
+  using type = typename AccumulateTypeDevice<T, c10::DeviceType::CPU>::type;
+};
+
+template <typename T>
+struct AccumulateType<T, true> {
+  using type = typename AccumulateTypeDevice<T, c10::DeviceType::CUDA>::type;
+};
+
+template <typename T, c10::DeviceType device>
+using acc_type_device = typename AccumulateTypeDevice<T, device>::type;
+
+template <typename T, bool is_cuda>
+using acc_type = typename AccumulateType<T, is_cuda>::type;
+
+#define ACC_TYPE(t, acc_t, device_type)         \
+  template <>                                   \
+  struct AccumulateTypeDevice<t, device_type> { \
+    using type = acc_t;                         \
+  };
+#define MPS_ACC_TYPE(t, acc_t) ACC_TYPE(t, acc_t, c10::DeviceType::MPS)
+#define CUDA_ACC_TYPE(t, acc_t) ACC_TYPE(t, acc_t, c10::DeviceType::CUDA)
+#define CPU_ACC_TYPE(t, acc_t) ACC_TYPE(t, acc_t, c10::DeviceType::CPU)
+
+MPS_ACC_TYPE(BFloat16, float);
+MPS_ACC_TYPE(Half, float);
+MPS_ACC_TYPE(Float8_e5m2, float);
+MPS_ACC_TYPE(Float8_e4m3fn, float);
+MPS_ACC_TYPE(Float8_e5m2fnuz, float);
+MPS_ACC_TYPE(Float8_e4m3fnuz, float);
+MPS_ACC_TYPE(float, float);
+MPS_ACC_TYPE(double, float);
+MPS_ACC_TYPE(int8_t, int64_t);
+MPS_ACC_TYPE(uint8_t, int64_t);
+MPS_ACC_TYPE(char, int64_t);
+MPS_ACC_TYPE(int16_t, int64_t);
+MPS_ACC_TYPE(int32_t, int64_t);
+MPS_ACC_TYPE(int64_t, int64_t);
+MPS_ACC_TYPE(bool, bool);
+MPS_ACC_TYPE(c10::complex, c10::complex);
+MPS_ACC_TYPE(c10::complex, c10::complex);
+MPS_ACC_TYPE(c10::complex, c10::complex);
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+CUDA_ACC_TYPE(half, float);
+#endif
+CUDA_ACC_TYPE(BFloat16, float);
+CUDA_ACC_TYPE(Half, float);
+CUDA_ACC_TYPE(Float8_e5m2, float);
+CUDA_ACC_TYPE(Float8_e4m3fn, float);
+CUDA_ACC_TYPE(Float8_e5m2fnuz, float);
+CUDA_ACC_TYPE(Float8_e4m3fnuz, float);
+CUDA_ACC_TYPE(float, float);
+CUDA_ACC_TYPE(double, double);
+CUDA_ACC_TYPE(int8_t, int64_t);
+CUDA_ACC_TYPE(uint8_t, int64_t);
+CUDA_ACC_TYPE(char, int64_t);
+CUDA_ACC_TYPE(int16_t, int64_t);
+CUDA_ACC_TYPE(int32_t, int64_t);
+CUDA_ACC_TYPE(int64_t, int64_t);
+CUDA_ACC_TYPE(bool, bool);
+CUDA_ACC_TYPE(c10::complex, c10::complex);
+CUDA_ACC_TYPE(c10::complex, c10::complex);
+CUDA_ACC_TYPE(c10::complex, c10::complex);
+
+CPU_ACC_TYPE(BFloat16, float);
+CPU_ACC_TYPE(Half, float);
+CPU_ACC_TYPE(Float8_e5m2, float);
+CPU_ACC_TYPE(Float8_e4m3fn, float);
+CPU_ACC_TYPE(Float8_e5m2fnuz, float);
+CPU_ACC_TYPE(Float8_e4m3fnuz, float);
+CPU_ACC_TYPE(float, double);
+CPU_ACC_TYPE(double, double);
+CPU_ACC_TYPE(int8_t, int64_t);
+CPU_ACC_TYPE(uint8_t, int64_t);
+CPU_ACC_TYPE(char, int64_t);
+CPU_ACC_TYPE(int16_t, int64_t);
+CPU_ACC_TYPE(int32_t, int64_t);
+CPU_ACC_TYPE(int64_t, int64_t);
+CPU_ACC_TYPE(bool, bool);
+CPU_ACC_TYPE(c10::complex, c10::complex);
+CPU_ACC_TYPE(c10::complex, c10::complex);
+CPU_ACC_TYPE(c10::complex, c10::complex);
+
+TORCH_API c10::ScalarType toAccumulateType(
+    c10::ScalarType type,
+    c10::DeviceType device);
+TORCH_API c10::ScalarType toAccumulateType(c10::ScalarType type, bool is_cuda);
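+
+// For example, per the tables above (illustrative, not exhaustive):
+//   toAccumulateType(at::kHalf,  c10::DeviceType::CPU)  == at::kFloat
+//   toAccumulateType(at::kFloat, c10::DeviceType::CPU)  == at::kDouble
+//   toAccumulateType(at::kFloat, c10::DeviceType::CUDA) == at::kFloat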
+
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ArrayRef.h b/MLPY/Lib/site-packages/torch/include/ATen/ArrayRef.h
new file mode 100644
index 0000000000000000000000000000000000000000..8c1febe4654361afa6b90cd38898b90cf8a8d17f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/ArrayRef.h
@@ -0,0 +1,2 @@
+#pragma once
+#include 
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/Backend.h b/MLPY/Lib/site-packages/torch/include/ATen/Backend.h
new file mode 100644
index 0000000000000000000000000000000000000000..34b3b191549d2be6218da30bc2acab3baa215888
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/Backend.h
@@ -0,0 +1,2 @@
+#pragma once
+#include 
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/Backtrace.h b/MLPY/Lib/site-packages/torch/include/ATen/Backtrace.h
new file mode 100644
index 0000000000000000000000000000000000000000..2d6eba46720207605fd2b6640ce48c9ae0bffd20
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/Backtrace.h
@@ -0,0 +1,2 @@
+#pragma once
+#include 
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/CPUApplyUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/CPUApplyUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..3135125e8d348b2b363617be3cc4a703fe814443
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/CPUApplyUtils.h
@@ -0,0 +1,343 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+
+/*
+ * The basic strategy for apply is as follows:
+ *
+ * 1. Starting with the outermost index, loop until we reach a dimension where
+ * the data is no longer contiguous, i.e. the stride at that dimension is not
+ * equal to the size of the tensor defined by the outer dimensions. Let's call
+ * this outer (contiguous) tensor A. Note that if the Tensor is contiguous, then
+ * A is equal to the entire Tensor. Let's call the inner tensor B.
+ *
+ * 2. We loop through the indices in B, starting at its outermost dimension. For
+ * example, if B is a 2x2 matrix, then we do:
+ *
+ * B[0][0]
+ * B[0][1]
+ * B[1][0]
+ * B[1][1]
+ *
+ * We set the offset into the underlying storage as (storageOffset + stride_B *
+ * index_B), i.e. basically we compute the offset into the storage as we would
+ * normally for a Tensor. But because we are guaranteed the subsequent data is
+ * contiguous in memory, we can simply loop for sizeof(A) iterations and perform
+ * the operation, without having to follow the order described by the strides of
+ * A.
+ *
+ * 3. As an optimization, we merge dimensions of A that are contiguous in
+ * memory. For example, if A is a 3x3x3x3 tensor narrowed from a 3x3x4x3 tensor,
+ * then the first two dimensions can be merged for the purposes of APPLY,
+ * reducing the number of nested loops.
+ */
+
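+/*
+ * Illustrative example (not part of the original header): a 10x2 view narrowed
+ * from a contiguous 10x4 tensor has sizes {10, 2} and strides {4, 1}. Only the
+ * innermost dimension is contiguous, so conceptually the elements are visited
+ * as 10 strided chunks (storage offsets 0, 4, 8, ...) of 2 contiguous values
+ * each, rather than by following every dimension's stride for every element.
+ */
+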
+inline Tensor sort_strides(Tensor& tensor_) {
+  IntArrayRef strides = tensor_.strides();
+  std::vector indices;
+  indices.reserve(tensor_.ndimension());
+  for (const auto i : c10::irange(tensor_.ndimension())) {
+    indices.push_back(i);
+  }
+  std::sort(indices.begin(), indices.end(), [&strides](int64_t i1, int64_t i2) {
+    return strides[i1] > strides[i2];
+  });
+  Tensor tensor = tensor_.permute(indices);
+  return tensor;
+}
+
+template 
+struct strided_tensor_iter_fixed {
+ public:
+  T* data_ = NULL;
+  int64_t dim_ = 0;
+
+  int64_t counter_[N] = {0};
+  int64_t sizes_[N] = {0};
+  int64_t strides_[N] = {0};
+
+  strided_tensor_iter_fixed(strided_tensor_iter_fixed const&) = delete;
+  void operator=(strided_tensor_iter_fixed const& x) = delete;
+  strided_tensor_iter_fixed(strided_tensor_iter_fixed&&) = default;
+  strided_tensor_iter_fixed(
+      Tensor& tensor,
+      C10_UNUSED bool sort_strides = false)
+      : data_(tensor.data_ptr()) {
+    std::memset(counter_, 0, sizeof(int64_t) * N);
+    if (tensor.dim() > 0) {
+      std::memcpy(
+          sizes_, tensor.sizes().data(), tensor.dim() * sizeof(int64_t));
+      std::memcpy(
+          strides_, tensor.strides().data(), tensor.dim() * sizeof(int64_t));
+    }
+    dim_ = std::get<1>(collapse_dims(sizes_, strides_, tensor.ndimension()));
+  }
+};
+
+template 
+struct strided_tensor_iter {
+ private:
+ public:
+  T* data_ = NULL;
+  int64_t dim_;
+
+  std::vector counter_;
+  std::vector sizes_;
+  std::vector strides_;
+
+  strided_tensor_iter(strided_tensor_iter const&) = delete;
+  void operator=(strided_tensor_iter const& x) = delete;
+  strided_tensor_iter(strided_tensor_iter&&) = default;
+  strided_tensor_iter(Tensor& tensor)
+      : data_(tensor.data_ptr()),
+        dim_(tensor.ndimension()),
+        counter_(dim_, 0),
+        sizes_(tensor.sizes().vec()),
+        strides_(tensor.strides().vec()) {
+    dim_ = std::get<1>(collapse_dims(sizes_.data(), strides_.data(), dim_));
+  }
+};
+
+inline bool _all_equal_numel(at::ArrayRef tensors) {
+  if (tensors.empty())
+    return true;
+  int64_t all_numel = tensors[0].numel();
+  for (const auto i : c10::irange(1, tensors.size())) {
+    if (tensors[i].numel() != all_numel)
+      return false;
+  }
+  return true;
+}
+
+inline std::string _all_equal_numel_error(at::ArrayRef tensors) {
+  std::ostringstream oss;
+  oss << "inconsistent tensor size, expected ";
+  for (size_t i = 0; i < tensors.size() - 1; i++) {
+    oss << tensors[i].sizes() << ", ";
+  }
+  oss << "and " << tensors[tensors.size() - 1].sizes()
+      << " to have the same number of elements, but got ";
+  for (size_t i = 0; i < tensors.size() - 1; i++) {
+    oss << tensors[i].numel() << ", ";
+  }
+  oss << "and " << tensors[tensors.size() - 1].numel()
+      << " elements respectively";
+  return oss.str();
+}
+
+inline bool _apply_preamble(ArrayRef tensors) {
+  checkDeviceType("CPU_tensor_apply", tensors, kCPU);
+  checkLayout("CPU_tensor_apply", tensors, kStrided);
+  if (!_all_equal_numel(tensors))
+    AT_ERROR(_all_equal_numel_error(tensors));
+  // An empty tensor has no elements
+  for (auto& t : tensors)
+    if (t.numel() == 0)
+      return false;
+  return true;
+}
+
+inline int64_t _max_dim_tensors(ArrayRef tensors) {
+  int64_t dim = 0;
+  for (auto& t : tensors)
+    dim = std::max(dim, t.ndimension());
+  return dim;
+}
+
+inline void iterate(int64_t /*size*/){};
+
+template 
+inline void iterate(int64_t size, Arg& iter, Args&... iter_tail) {
+  iter.counter_[iter.dim_ - 1] += size;
+  iter.data_ = iter.data_ + size * iter.strides_[iter.dim_ - 1];
+  iterate(size, iter_tail...);
+}
+
+inline bool iterate_continue() {
+  return true;
+};
+
+template 
+inline bool iterate_continue(Arg& iter, Args&... iter_tail) {
+  return iter.counter_[iter.dim_ - 1] < iter.sizes_[iter.dim_ - 1] &&
+      iterate_continue(iter_tail...);
+}
+
+inline int64_t max_iterate_size() {
+  return std::numeric_limits::max();
+};
+
+template 
+inline int64_t max_iterate_size(Arg& iter, Args&... iter_tail) {
+  return std::min(
+      (iter.sizes_[iter.dim_ - 1] - iter.counter_[iter.dim_ - 1]),
+      max_iterate_size(iter_tail...));
+}
+
+inline void iterate_overflow(){};
+
+template 
+inline void iterate_overflow(Arg& iter, Args&... iter_tail) {
+  if (iter.counter_[iter.dim_ - 1] == iter.sizes_[iter.dim_ - 1]) {
+    for (int64_t i = iter.dim_ - 1; i > 0; i--) {
+      if (iter.counter_[i] == iter.sizes_[i]) {
+        iter.counter_[i] = 0;
+        iter.counter_[i - 1]++;
+        iter.data_ = iter.data_ - (iter.sizes_[i] * iter.strides_[i]) +
+            iter.strides_[i - 1];
+      }
+    }
+  }
+  iterate_overflow(iter_tail...);
+}
+
+inline void forward(int64_t /*offset*/){};
+
+template 
+inline void forward(int64_t offset, Arg& iter, Args&... iter_tail) {
+  int64_t multi = offset;
+  for (int64_t i = iter.dim_ - 1; i >= 0; i--) {
+    int64_t inc = multi % iter.sizes_[i];
+    multi = multi / iter.sizes_[i];
+    iter.data_ = iter.data_ + inc * iter.strides_[i];
+    iter.counter_[i] += inc;
+  }
+  forward(offset, iter_tail...);
+}
+
+inline int64_t max_dim() {
+  return 0;
+}
+
+template 
+inline int64_t max_dim(Arg& iter, Args&... iter_tail) {
+  return std::max(iter.dim_, max_dim(iter_tail...));
+}
+
+inline void apply_op(){};
+
+template 
+inline void apply_op(
+    int64_t numel,
+    int64_t offset,
+    const Op& op,
+    Args... iters) {
+  // For 0-dim tensors
+  if (numel == 1 && max_dim(iters...) == 0) {
+    op(*iters.data_...);
+    return;
+  }
+  if (offset > 0)
+    forward(offset, iters...);
+  // Splitting this into chunks helps the compiler create faster assembly
+  for (int64_t i = 0; i < numel;) {
+    for (; iterate_continue(iters...) && i < numel;) {
+      op(*iters.data_...);
+      iterate(1, iters...);
+      i++;
+    }
+    iterate_overflow(iters...);
+  }
+}
+
+/*
+  Apply a pointwise operator to a sequence of tensors.
+
+  The calling convention for op is a function/functor that takes one reference
+  of type scalar per given tensor (the iterators are dereferenced before op is
+  called). For example, to compute a = b * c, op would be of the form:
+  [](scalar& a_val, const scalar& b_val, const scalar& c_val) {
+    a_val = b_val * c_val; };
+*/
+
+template 
+inline void CPU_tensor_apply2(Tensor tensor1, Tensor tensor2, const Op op) {
+  if (!_apply_preamble({tensor1, tensor2}))
+    return;
+  if (_max_dim_tensors({tensor1, tensor2}) <= 8) {
+    apply_op(
+        tensor1.numel(),
+        0,
+        op,
+        strided_tensor_iter_fixed(tensor1),
+        strided_tensor_iter_fixed(tensor2));
+  } else {
+    apply_op(
+        tensor1.numel(),
+        0,
+        op,
+        strided_tensor_iter(tensor1),
+        strided_tensor_iter(tensor2));
+  }
+}
+
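+// Illustrative sketch (hypothetical helper, not part of the original header):
+// using CPU_tensor_apply2 with the calling convention described above to
+// compute a += b pointwise on float tensors.
+inline void example_apply2_add_(Tensor& a, Tensor& b) {
+  CPU_tensor_apply2<float, float>(
+      a, b, [](float& a_val, const float& b_val) { a_val += b_val; });
+}
+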
+template 
+inline void CPU_tensor_apply3(
+    Tensor tensor1,
+    Tensor tensor2,
+    Tensor tensor3,
+    const Op op) {
+  if (!_apply_preamble({tensor1, tensor2, tensor3}))
+    return;
+  if (_max_dim_tensors({tensor1, tensor2, tensor3}) <= 8) {
+    apply_op(
+        tensor1.numel(),
+        0,
+        op,
+        strided_tensor_iter_fixed(tensor1),
+        strided_tensor_iter_fixed(tensor2),
+        strided_tensor_iter_fixed(tensor3));
+  } else {
+    apply_op(
+        tensor1.numel(),
+        0,
+        op,
+        strided_tensor_iter(tensor1),
+        strided_tensor_iter(tensor2),
+        strided_tensor_iter(tensor3));
+  }
+}
+
+template <
+    typename scalar1,
+    typename scalar2,
+    typename scalar3,
+    typename scalar4,
+    typename Op>
+inline void CPU_tensor_apply4(
+    Tensor tensor1,
+    Tensor tensor2,
+    Tensor tensor3,
+    Tensor tensor4,
+    const Op op) {
+  if (!_apply_preamble({tensor1, tensor2, tensor3, tensor4}))
+    return;
+  if (_max_dim_tensors({tensor1, tensor2, tensor3, tensor4}) <= 8) {
+    apply_op(
+        tensor1.numel(),
+        0,
+        op,
+        strided_tensor_iter_fixed(tensor1),
+        strided_tensor_iter_fixed(tensor2),
+        strided_tensor_iter_fixed(tensor3),
+        strided_tensor_iter_fixed(tensor4));
+  } else {
+    apply_op(
+        tensor1.numel(),
+        0,
+        op,
+        strided_tensor_iter(tensor1),
+        strided_tensor_iter(tensor2),
+        strided_tensor_iter(tensor3),
+        strided_tensor_iter(tensor4));
+  }
+}
+
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/CPUFixedAllocator.h b/MLPY/Lib/site-packages/torch/include/ATen/CPUFixedAllocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..a4d75a9245fc877e844dc1db699d68c286fe6a5c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/CPUFixedAllocator.h
@@ -0,0 +1,33 @@
+#pragma once
+
+#include 
+#include 
+
+// This file creates a fake allocator that just throws exceptions if
+// it is actually used.
+
+// state passed to the allocator is the std::function called
+// when the blob is released by ATen
+
+namespace at {
+
+static void* cpu_fixed_malloc(void*, ptrdiff_t) {
+  AT_ERROR("attempting to resize a tensor view of an external blob");
+}
+
+static void* cpu_fixed_realloc(void*, void*, ptrdiff_t) {
+  AT_ERROR("attempting to resize a tensor view of an external blob");
+}
+
+static void cpu_fixed_free(void* state, void* allocation) {
+  auto on_release = static_cast*>(state);
+  (*on_release)(allocation);
+  delete on_release;
+}
+
+static Allocator CPU_fixed_allocator = {
+    cpu_fixed_malloc,
+    cpu_fixed_realloc,
+    cpu_fixed_free};
+
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/CPUFunctions.h b/MLPY/Lib/site-packages/torch/include/ATen/CPUFunctions.h
new file mode 100644
index 0000000000000000000000000000000000000000..fb55baadc951e7a56b5e6c3b832e1868cb64684d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/CPUFunctions.h
@@ -0,0 +1,29 @@
+#include 
+
+// TODO Undo all logic introduced for Note [Avoiding Include Cycles In Static Dispatch]
+// Code introduced to avoid cyclic dependency in static dispatch is no longer
+// needed as static dispatch logic is moved from TensorBody.h, which caused cycles in the first place,
+// to Operators.cpp for supporting multiple backends with multiple kernels.
+//
+// Note [Avoiding Include Cycles In Static Dispatch]
+// In order to avoid #include cycles in the static dispatch build, we've carefully split out
+// the static function definition files into {DispatchKey}Functions.h and {DispatchKey}Functions_inl.h.
+//
+// Without this split, the include cycle looks like TensorBody.h -> CPUFunctions.h -> TensorBody.h.
+// - TensorBody.h #includes CPUFunctions.h in the static dispatch build, because the tensor methods
+//   all need to call into the fastpath C++ API defined in CPUFunctions.h. The methods are also all
+//   directly inlined into TensorBody.h.
+// - CPUFunctions.h #includes TensorBody.h because it contains function declarations for the entire C++ API,
+//   which include functions that have defaultable optional arguments.
+//   That requires knowing the full Tensor class definition.
+//
+// We break the cycle by doing the following:
+// - Split out CPUFunctions.h into two files: CPUFunctions.h and CPUFunctions_inl.h
+// - CPUFunctions.h is a dummy file that just includes the Tensor class and includes CPUFunctions_inl.h,
+// - CPUFunctions_inl.h includes everything else
+// - (only in the static dispatch build) TensorBody.h makes sure to finish defining the Tensor class,
+//   and then it includes CPUFunctions_inl.h.
+// - All other files that want the cpu fastpath functions can include CPUFunctions.h directly.
+// - This also means that in the static dispatch build, CPUFunctions.h only needs to
+//   #include TensorBody.h, and it will automatically bring in CPUFunctions_inl.h.
+#include 
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/CPUFunctions_inl.h b/MLPY/Lib/site-packages/torch/include/ATen/CPUFunctions_inl.h
new file mode 100644
index 0000000000000000000000000000000000000000..78548339cb9c38e5891d1a1606a8eaa48bc0e5df
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/CPUFunctions_inl.h
@@ -0,0 +1,576 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunctions_inl.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include 
+#include 
+#include 
+
+#if defined(AT_PER_OPERATOR_HEADERS) && defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS)
+#error This change adds a dependency on all pytorch operators, meaning the     \
+  file will need to be re-compiled every time an operator is changed or added. \
+  Consider including a specific operator from                                  \
+  .                   \
+  See NOTE [TORCH_ASSERT_ONLY_METHOD_OPERATORS].
+#endif
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+
+
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/CPUGeneratorImpl.h b/MLPY/Lib/site-packages/torch/include/ATen/CPUGeneratorImpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..34dc2c57b29e05e6efa46a6951001f626d7d9046
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/CPUGeneratorImpl.h
@@ -0,0 +1,49 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+
+struct TORCH_API CPUGeneratorImpl : public c10::GeneratorImpl {
+  // Constructors
+  CPUGeneratorImpl(uint64_t seed_in = default_rng_seed_val);
+  ~CPUGeneratorImpl() override = default;
+
+  // CPUGeneratorImpl methods
+  std::shared_ptr clone() const;
+  void set_current_seed(uint64_t seed) override;
+  void set_offset(uint64_t offset) override;
+  uint64_t get_offset() const override;
+  uint64_t current_seed() const override;
+  uint64_t seed() override;
+  void set_state(const c10::TensorImpl& new_state) override;
+  c10::intrusive_ptr get_state() const override;
+  static c10::DeviceType device_type();
+  uint32_t random();
+  uint64_t random64();
+  c10::optional next_float_normal_sample();
+  c10::optional next_double_normal_sample();
+  void set_next_float_normal_sample(c10::optional randn);
+  void set_next_double_normal_sample(c10::optional randn);
+  at::mt19937 engine();
+  void set_engine(at::mt19937 engine);
+
+ private:
+  CPUGeneratorImpl* clone_impl() const override;
+  at::mt19937 engine_;
+  c10::optional next_float_normal_sample_;
+  c10::optional next_double_normal_sample_;
+};
+
+namespace detail {
+
+TORCH_API const Generator& getDefaultCPUGenerator();
+TORCH_API Generator
+createCPUGenerator(uint64_t seed_val = default_rng_seed_val);
+
+} // namespace detail
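+
+// Illustrative usage sketch (hedged: check_generator and Generator::mutex()
+// are recalled from ATen's Generator API and are not declared in this file):
+//
+//   at::Generator gen = at::detail::createCPUGenerator(/*seed_val=*/42);
+//   std::lock_guard<std::mutex> lock(gen.mutex());
+//   auto* impl = at::check_generator<at::CPUGeneratorImpl>(gen);
+//   uint64_t bits = impl->random64(); // raw bits from the mt19937 engine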
+
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/CUDAFunctions.h b/MLPY/Lib/site-packages/torch/include/ATen/CUDAFunctions.h
new file mode 100644
index 0000000000000000000000000000000000000000..2fae8914e35a412df8a615d3f403c4aee5ba758e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/CUDAFunctions.h
@@ -0,0 +1,29 @@
+#include 
+
+// TODO Undo all logic introduced for Note [Avoiding Include Cycles In Static Dispatch]
+// Code introduced to avoid cyclic dependency in static dispatch is no longer
+// needed as static dispatch logic is moved from TensorBody.h, which caused cycles in the first place,
+// to Operators.cpp for supporting multiple backends with multiple kernels.
+//
+// Note [Avoiding Include Cycles In Static Dispatch]
+// In order to avoid #include cycles in the static dispatch build, we've carefully split out
+// the static function definition files into {DispatchKey}Functions.h and {DispatchKey}Functions_inl.h.
+//
+// Without this split, the include cycle looks like TensorBody.h -> CPUFunctions.h -> TensorBody.h.
+// - TensorBody.h #includes CPUFunctions.h in the static dispatch build, because the tensor methods
+//   all need to call into the fastpath C++ API defined in CPUFunctions.h. The methods are also all
+//   directly inlined into TensorBody.h.
+// - CPUFunctions.h #includes TensorBody.h because it contains function declarations for the entire C++ API,
+//   which include functions that have defaultable optional arguments.
+//   That requires knowing the full Tensor class definition.
+//
+// We break the cycle by doing the following:
+// - Split out CPUFunctions.h into two files: CPUFunctions.h and CPUFunctions_inl.h
+// - CPUFunctions.h is a dummy file that just includes the Tensor class and includes CPUFunctions_inl.h,
+// - CPUFunctions_inl.h includes everything else
+// - (only in the static dispatch build) TensorBody.h makes sure to finish defining the Tensor class,
+//   and then it includes CPUFunctions_inl.h.
+// - All other files that want the cpu fastpath functions can include CPUFunctions.h directly.
+// - This also means that in the static dispatch build, CPUFunctions.h only needs to
+//   #include TensorBody.h, and it will automatically bring in CPUFunctions_inl.h.
+#include 
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/CUDAFunctions_inl.h b/MLPY/Lib/site-packages/torch/include/ATen/CUDAFunctions_inl.h
new file mode 100644
index 0000000000000000000000000000000000000000..a805f1717f26c93472b0c6603d638d6176652dc8
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/CUDAFunctions_inl.h
@@ -0,0 +1,614 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunctions_inl.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include 
+#include 
+#include 
+
+#if defined(AT_PER_OPERATOR_HEADERS) && defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS)
+#error This change adds a dependency on all pytorch operators, meaning the     \
+  file will need to be re-compiled every time an operator is changed or added. \
+  Consider including a specific operator from                                  \
+  .                   \
+  See NOTE [TORCH_ASSERT_ONLY_METHOD_OPERATORS].
+#endif
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+
+
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/CachedTensorUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/CachedTensorUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..573ac8e18c2548bde3f97e1489be102d9bd89f3d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/CachedTensorUtils.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#include 
+
+namespace at::caching {
+
+// Some systems (just cudagraphs currently) will persist a static tensor output
+// whose TensorImpl does not change across iterations. For these tensors, caching
+// dtype conversions is invalid. Additionally, there will be an extra reference
+// count to these cached tensors that would prevent buffer inplacing and other
+// checks on tensor uniqueness. If we are not using these systems, the enabled
+// flag will be false and we will avoid the hash lookup.
+
+TORCH_API bool is_cached_tensor(const at::Tensor& t);
+TORCH_API void add_cached_tensor(const at::Tensor& t);
+TORCH_API void remove_cached_tensor(const at::Tensor& t);
+TORCH_API void set_cached_tensors_enabled(bool enable);
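+
+// Illustrative call pattern (hypothetical call sites, not part of this header):
+//
+//   at::caching::set_cached_tensors_enabled(true); // e.g. while cudagraphs is active
+//   at::caching::add_cached_tensor(static_output);
+//   ...
+//   if (!at::caching::is_cached_tensor(t)) { /* caching dtype conversions is OK */ }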
+
+// For gradient buffer stealing we will adjust the use count of tensors
+// which are persisted by cudagraphs, just as we need to adjust reference
+// count of tensors with hooks.
+TORCH_API size_t adjusted_use_count(const at::Tensor& t);
+
+} // namespace at::caching
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/CollapseDims.h b/MLPY/Lib/site-packages/torch/include/ATen/CollapseDims.h
new file mode 100644
index 0000000000000000000000000000000000000000..b7ca0d9db788470049ff8ce48a433217ffeb5cc3
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/CollapseDims.h
@@ -0,0 +1,94 @@
+#include 
+#include 
+
+namespace at {
+
+/*
+[collapse dims] Updates sizes and strides to reflect a "collapse" of
+the info, possibly excluding the optional excludeDim. A "collapsed" version
+of the info is the fewest dims that order the tensor's elements in the same
+way as the original info. If excludeDim is specified, the collapse is the
+fewest dims that order the tensor's elements as the original and preserve the
+excluded dimension, unless the tensor collapses to a point.
+
+This function returns a pair of values.
+
+1) The (new) index of the preserved dimension if excludeDim is
+specified. 0 if the tensor is collapsed to a point. -1
+otherwise.
+
+2) The new number of dimensions.
+*/
+template 
+inline std::pair collapse_dims(
+    T* sizes,
+    T* strides,
+    int64_t dims,
+    const int excludeDim = -1) {
+  TORCH_CHECK(
+      excludeDim >= -1 && excludeDim < dims,
+      "expected excluded dim between -1 and dims - 1");
+
+  int64_t stopDim = (excludeDim == -1) ? dims : excludeDim;
+  int64_t newIndex = -1;
+  int64_t oldIndex = 0;
+  int64_t remappedExcludedDim = -1;
+
+  while (oldIndex < dims) {
+    // Finds a dimension to collapse into
+    for (; oldIndex < stopDim; ++oldIndex) {
+      if (sizes[oldIndex] == 1) {
+        continue;
+      }
+
+      ++newIndex;
+      sizes[newIndex] = sizes[oldIndex];
+      strides[newIndex] = strides[oldIndex];
+      ++oldIndex;
+      break;
+    }
+
+    // Collapses dims
+    for (; oldIndex < stopDim; ++oldIndex) {
+      if (sizes[oldIndex] == 1) {
+        continue;
+      }
+
+      if (strides[newIndex] == sizes[oldIndex] * strides[oldIndex]) {
+        sizes[newIndex] *= sizes[oldIndex];
+        strides[newIndex] = strides[oldIndex];
+      } else {
+        ++newIndex;
+        sizes[newIndex] = sizes[oldIndex];
+        strides[newIndex] = strides[oldIndex];
+      }
+    }
+
+    // Handles excludeDim being set (oldIndex == excludeDim)
+    if (oldIndex != dims) {
+      // Preserves excluded dimension
+      ++newIndex;
+      sizes[newIndex] = sizes[oldIndex];
+      strides[newIndex] = strides[oldIndex];
+      remappedExcludedDim = newIndex;
+
+      // Restarts iteration after excludeDim
+      ++oldIndex;
+      stopDim = dims;
+    }
+  }
+
+  // Handles special case of all dims size 1
+  if (newIndex == -1 || (newIndex == 0 && sizes[0] == 1)) {
+    dims = 1;
+    sizes[0] = 1;
+    strides[0] = 1;
+
+    return std::pair(0, 1);
+  }
+
+  dims = newIndex + 1;
+  return std::pair(remappedExcludedDim, dims);
+}
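+
+// Worked example (illustrative sketch, not part of the original header): a
+// 3x3x3x3 view narrowed from a contiguous 3x3x4x3 tensor has strides
+// {36, 12, 3, 1}. Dimensions 0-1 and 2-3 can merge (36 == 3 * 12 and
+// 3 == 3 * 1), but 1-2 cannot because of the narrow, so the arrays are
+// rewritten in place to sizes {9, 9}, strides {12, 1}.
+inline std::pair<int64_t, int64_t> collapse_dims_example() {
+  int64_t sizes[4] = {3, 3, 3, 3};
+  int64_t strides[4] = {36, 12, 3, 1};
+  // Returns {-1, 2}: no excluded dim was requested and 2 dims remain.
+  return collapse_dims(sizes, strides, 4);
+}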
+
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/CompositeExplicitAutogradFunctions.h b/MLPY/Lib/site-packages/torch/include/ATen/CompositeExplicitAutogradFunctions.h
new file mode 100644
index 0000000000000000000000000000000000000000..c4564ba4f32f3a84b638058a84edda3bb49230b4
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/CompositeExplicitAutogradFunctions.h
@@ -0,0 +1,29 @@
+#include 
+
+// TODO Undo all logic introduced for Note [Avoiding Include Cycles In Static Dispatch]
+// Code introduced to avoid cyclic dependency in static dispatch is no longer
+// needed as static dispatch logic is moved from TensorBody.h, which caused cycles in the first place,
+// to Operators.cpp for supporting multiple backends with multiple kernels.
+//
+// Note [Avoiding Include Cycles In Static Dispatch]
+// In order to avoid #include cycles in the static dispatch build, we've carefully split out
+// the static function definition files into {DispatchKey}Functions.h and {DispatchKey}Functions_inl.h.
+//
+// Without this split, the include cycle looks like TensorBody.h -> CPUFunctions.h -> TensorBody.h.
+// - TensorBody.h #includes CPUFunctions.h in the static dispatch build, because the tensor methods
+//   all need to call into the fastpath C++ API defined in CPUFunctions.h. The methods are also all
+//   directly inlined into TensorBody.h.
+// - CPUFunctions.h #includes TensorBody.h because it contains function declarations for the entire C++ API,
+//   which include functions that have defaultable optional arguments.
+//   That requires knowing the full Tensor class definition.
+//
+// We break the cycle by doing the following:
+// - Split out CPUFunctions.h into two files: CPUFunctions.h and CPUFunctions_inl.h
+// - CPUFunctions.h is a dummy file that just includes the Tensor class and includes CPUFunctions_inl.h,
+// - CPUFunctions_inl.h includes everything else
+// - (only in the static dispatch build) TensorBody.h makes sure to finish defining the Tensor class,
+//   and then it includes CPUFunctions_inl.h.
+// - All other files that want the cpu fastpath functions can include CPUFunctions.h directly.
+// - This also means that in the static dispatch build, CPUFunctions.h only needs to
+//   #include TensorBody.h, and it will automatically bring in CPUFunctions_inl.h.
+#include 
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/CompositeExplicitAutogradFunctions_inl.h b/MLPY/Lib/site-packages/torch/include/ATen/CompositeExplicitAutogradFunctions_inl.h
new file mode 100644
index 0000000000000000000000000000000000000000..a1f2556cedee558c7ada96911a13e77ce1d6107d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/CompositeExplicitAutogradFunctions_inl.h
@@ -0,0 +1,542 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunctions_inl.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include 
+#include 
+#include 
+
+#if defined(AT_PER_OPERATOR_HEADERS) && defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS)
+#error This change adds a dependency on all pytorch operators, meaning the     \
+  file will need to be re-compiled every time an operator is changed or added. \
+  Consider including a specific operator from                                  \
+  .                   \
+  See NOTE [TORCH_ASSERT_ONLY_METHOD_OPERATORS].
+#endif
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+
+
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/CompositeExplicitAutogradNonFunctionalFunctions.h b/MLPY/Lib/site-packages/torch/include/ATen/CompositeExplicitAutogradNonFunctionalFunctions.h
new file mode 100644
index 0000000000000000000000000000000000000000..6f96cd9c9d7ba2a487c5b4943b2e50fa2e2d2b99
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/CompositeExplicitAutogradNonFunctionalFunctions.h
@@ -0,0 +1,29 @@
+#include 
+
+// TODO Undo all logic introduced for Note [Avoiding Include Cycles In Static Dispatch]
+// Code introduced to avoid cyclic dependency in static dispatch is no longer
+// needed as static dispatch logic is moved from TensorBody.h, which caused cycles in the first place,
+// to Operators.cpp for supporting multiple backends with multiple kernels.
+//
+// Note [Avoiding Include Cycles In Static Dispatch]
+// In order to avoid #include cycles in the static dispatch build, we've carefully split out
+// the static function definition files into {DispatchKey}Functions.h and {DispatchKey}Functions_inl.h.
+//
+// Without this split, the include cycle looks like TensorBody.h -> CPUFunctions.h -> TensorBody.h.
+// - TensorBody.h #includes CPUFunctions.h in the static dispatch build, because the tensor methods
+//   all need to call into the fastpath C++ API defined in CPUFunctions.h. The methods are also all
+//   directly inlined into TensorBody.h.
+// - CPUFunctions.h #includes TensorBody.h because it contains function declarations for the entire C++ API,
+//   which include functions that have defaultable optional arguments.
+//   That requires knowing the full Tensor class definition.
+//
+// We break the cycle by doing the following:
+// - Split out CPUFunctions.h into two files: CPUFunctions.h and CPUFunctions_inl.h
+// - CPUFunctions.h is a dummy file that just includes the Tensor class and includes CPUFunctions_inl.h,
+// - CPUFunctions_inl.h includes everything else
+// - (only in the static dispatch build) TensorBody.h makes sure to finish defining the Tensor class,
+//   and then it includes CPUFunctions_inl.h.
+// - All other files that want the cpu fastpath functions can include CPUFunctions.h directly.
+// - This also means that in the static dispatch build, CPUFunctions.h only needs to
+//   #include TensorBody.h, and it will automatically bring in CPUFunctions_inl.h.
+#include 
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/CompositeExplicitAutogradNonFunctionalFunctions_inl.h b/MLPY/Lib/site-packages/torch/include/ATen/CompositeExplicitAutogradNonFunctionalFunctions_inl.h
new file mode 100644
index 0000000000000000000000000000000000000000..22915229c825ab0cf1aa8488a3bdb67931b96601
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/CompositeExplicitAutogradNonFunctionalFunctions_inl.h
@@ -0,0 +1,323 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunctions_inl.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include 
+#include 
+#include 
+
+#if defined(AT_PER_OPERATOR_HEADERS) && defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS)
+#error This change adds a dependency on all pytorch operators, meaning the     \
+  file will need to be re-compiled every time an operator is changed or added. \
+  Consider including a specific operator from                                  \
+  .                   \
+  See NOTE [TORCH_ASSERT_ONLY_METHOD_OPERATORS].
+#endif
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+
+
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/CompositeImplicitAutogradFunctions.h b/MLPY/Lib/site-packages/torch/include/ATen/CompositeImplicitAutogradFunctions.h
new file mode 100644
index 0000000000000000000000000000000000000000..91de7d33c69904c252bc999926c1073464bb3e8f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/CompositeImplicitAutogradFunctions.h
@@ -0,0 +1,29 @@
+#include 
+
+// TODO Undo all logic introduced for Note [Avoiding Include Cycles In Static Dispatch]
+// Code introduced to avoid cyclic dependency in static dispatch is no longer
+// needed as static dispatch logic is moved from TensorBody.h, which caused cycles in the first place,
+// to Operators.cpp for supporting multiple backends with multiple kernels.
+//
+// Note [Avoiding Include Cycles In Static Dispatch]
+// In order to avoid #include cycles in the static dispatch build, we've carefully split out
+// the static function definition files into {DispatchKey}Functions.h and {DispatchKey}Functions_inl.h.
+//
+// Without this split, the include cycle looks like TensorBody.h -> CPUFunctions.h -> TensorBody.h.
+// - TensorBody.h #includes CPUFunctions.h in the static dispatch build, because the tensor methods
+//   all need to call into the fastpath C++ API defined in CPUFunctions.h. The methods are also all
+//   directly inlined into TensorBody.h.
+// - CPUFunctions.h #includes TensorBody.h because it contains function declarations for the entire C++ API,
+//   which include functions that have defaultable optional arguments.
+//   That requires knowing the full Tensor class definition.
+//
+// We break the cycle by doing the following:
+// - Split out CPUFunctions.h into two files: CPUFunctions.h and CPUFunctions_inl.h
+// - CPUFunctions.h is a dummy file that just includes the Tensor class and includes CPUFunctions_inl.h,
+// - CPUFunctions_inl.h includes everything else
+// - (only in the static dispatch build) TensorBody.h makes sure to finish defining the Tensor class,
+//   and then it includes CPUFunctions_inl.h.
+// - All other files that want the cpu fastpath functions can include CPUFunctions.h directly.
+// - This also means that in the static dispatch build, CPUFunctions.h only needs to
+//   #include TensorBody.h, and it will automatically bring in CPUFunctions_inl.h.
+#include 
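Editor's note: the note above describes the {DispatchKey}Functions.h / {DispatchKey}Functions_inl.h split that breaks the TensorBody.h include cycle. Below is a minimal sketch of the same idea with hypothetical file and type names (Widget.h, FooFunctions.h, FooFunctions_inl.h are illustrative, not ATen headers):

    // Widget.h -- plays the role of TensorBody.h: fully defines the class,
    // then pulls in the inline API at the very end.
    #pragma once
    struct Widget {
      int value = 0;
    };
    #include "FooFunctions_inl.h"  // safe: Widget is complete at this point

    // FooFunctions_inl.h -- plays the role of CPUFunctions_inl.h: the API that
    // needs the full Widget definition (e.g. for defaulted arguments). It does
    // NOT include Widget.h, which is what breaks the cycle.
    #pragma once
    inline int foo(const Widget& w, int scale = 2) { return w.value * scale; }

    // FooFunctions.h -- plays the role of CPUFunctions.h: a thin header that
    // other translation units include directly.
    #pragma once
    #include "Widget.h"  // transitively brings in FooFunctions_inl.h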
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/CompositeImplicitAutogradFunctions_inl.h b/MLPY/Lib/site-packages/torch/include/ATen/CompositeImplicitAutogradFunctions_inl.h
new file mode 100644
index 0000000000000000000000000000000000000000..7fed37f3e886e0692495f4b3d1a8dacabc0380e4
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/CompositeImplicitAutogradFunctions_inl.h
@@ -0,0 +1,500 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunctions_inl.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include 
+#include 
+#include 
+
+#if defined(AT_PER_OPERATOR_HEADERS) && defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS)
+#error This change adds a dependency on all pytorch operators, meaning the     \
+  file will need to be re-compiled every time an operator is changed or added. \
+  Consider including a specific operator from                                  \
+  .                   \
+  See NOTE [TORCH_ASSERT_ONLY_METHOD_OPERATORS].
+#endif
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+
+
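Editor's note: the #error guard near the top of this header fires when a translation unit that defines TORCH_ASSERT_ONLY_METHOD_OPERATORS (opting into fine-grained, per-operator includes) also pulls in this umbrella header, which depends on every operator. A hedged sketch of the two include styles in a consumer .cpp; the per-operator path ATen/ops/add.h is assumed here for illustration:

    // consumer.cpp -- sketch only.

    // Fine-grained route: only the operators actually used are included, so the
    // file is recompiled only when those operators change.
    #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
    #include <ATen/core/Tensor.h>
    #include <ATen/ops/add.h>   // assumed per-operator header layout

    at::Tensor add_one(const at::Tensor& t) {
      return at::add(t, 1);
    }

    // Umbrella route (omit TORCH_ASSERT_ONLY_METHOD_OPERATORS): a single
    //   #include <ATen/Functions.h>
    // brings in the whole C++ API, at the cost of recompiling whenever any
    // operator is added or changed -- exactly what the #error above warns about.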
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/CompositeImplicitAutogradNestedTensorFunctions.h b/MLPY/Lib/site-packages/torch/include/ATen/CompositeImplicitAutogradNestedTensorFunctions.h
new file mode 100644
index 0000000000000000000000000000000000000000..8b065d6c742bf58aa05ef33b03c16f7f61bddbe4
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/CompositeImplicitAutogradNestedTensorFunctions.h
@@ -0,0 +1,29 @@
+#include 
+
+// TODO Undo all logic introduced for Note [Avoiding Include Cycles In Static Dispatch]
+// Code introduced to avoid cyclic dependency in static dispatch is no longer
+// needed as static dispatch logic is moved from TensorBody.h, which caused cycles in the first place,
+// to Operators.cpp for supporting multiple backends with multiple kernels.
+//
+// Note [Avoiding Include Cycles In Static Dispatch]
+// In order to avoid #include cycles in the static dispatch build, we've carefully split out
+// the static function definition files into {DispatchKey}Functions.h and {DispatchKey}Functions_inl.h.
+//
+// Without this split, the include cycle looks like TensorBody.h -> CPUFunctions.h -> TensorBody.h.
+// - TensorBody.h #includes CPUFunctions.h in the static dispatch build, because the tensor methods
+//   all need to call into the fastpath C++ API defined in CPUFunctions.h. The methods are also all
+//   directly inlined into TensorBody.h.
+// - CPUFunctions.h #includes TensorBody.h because it contains function declarations for the entire C++ API,
+//   which include functions that have defaultable optional arguments.
+//   That requires knowing the full Tensor class definition.
+//
+// We break the cycle by doing the following:
+// - Split out CPUFunctions.h into two files: CPUFunctions.h and CPUFunctions_inl.h
+// - CPUFunctions.h is a dummy file that just includes the Tensor class and includes CPUFunctions_inl.h,
+// - CPUFunctions_inl.h includes everything else
+// - (only in the static dispatch build) TensorBody.h makes sure to finish defining the Tensor class,
+//   and then it includes CPUFunctions_inl.h.
+// - All other files that want the cpu fastpath functions can include CPUFunctions.h directly.
+// - This also means that in the static dispatch build, CPUFunctions.h only needs to
+//   #include TensorBody.h, and it will automatically bring in CPUFunctions_inl.h.
+#include 
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/CompositeImplicitAutogradNestedTensorFunctions_inl.h b/MLPY/Lib/site-packages/torch/include/ATen/CompositeImplicitAutogradNestedTensorFunctions_inl.h
new file mode 100644
index 0000000000000000000000000000000000000000..d5b7c77fbca654b74b428193f1da16b348f6d325
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/CompositeImplicitAutogradNestedTensorFunctions_inl.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunctions_inl.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include 
+#include 
+#include 
+
+#if defined(AT_PER_OPERATOR_HEADERS) && defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS)
+#error This change adds a dependency on all pytorch operators, meaning the     \
+  file will need to be re-compiled every time an operator is changed or added. \
+  Consider including a specific operator from                                  \
+  .                   \
+  See NOTE [TORCH_ASSERT_ONLY_METHOD_OPERATORS].
+#endif
+
+#include 
+#include 
+#include 
+#include 
+
+
+
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/Config.h b/MLPY/Lib/site-packages/torch/include/ATen/Config.h
new file mode 100644
index 0000000000000000000000000000000000000000..7c22566d9c7b87ac9d41156b1d7ba71f7f6fefda
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/Config.h
@@ -0,0 +1,22 @@
+#pragma once
+
+// Test these using #if AT_MKL_ENABLED(), not #ifdef, so that it's
+// obvious if you forgot to include Config.h
+//    c.f. https://stackoverflow.com/questions/33759787/generating-an-error-if-checked-boolean-macro-is-not-defined
+//
+// DO NOT put the macros for CUDA libraries in this file; they belong in cuda/CUDAConfig.h
+
+#define AT_MKLDNN_ENABLED() 1
+#define AT_MKLDNN_ACL_ENABLED() 0
+#define AT_MKL_ENABLED() 1
+#define AT_MKL_SEQUENTIAL() 0
+#define AT_POCKETFFT_ENABLED() 0
+#define AT_NNPACK_ENABLED() 0
+#define CAFFE2_STATIC_LINK_CUDA() 0
+#define AT_BUILD_WITH_BLAS() 1
+#define AT_BUILD_WITH_LAPACK() 1
+#define AT_PARALLEL_OPENMP 1
+#define AT_PARALLEL_NATIVE 0
+#define AT_PARALLEL_NATIVE_TBB 0
+#define AT_BLAS_F2C() 0
+#define AT_BLAS_USE_CBLAS_DOT() 0
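Editor's note: the comment at the top of Config.h asks consumers to test these switches with the function-like form, e.g. #if AT_MKL_ENABLED(), rather than #ifdef. A short sketch of why that convention catches a missing include:

    #include <ATen/Config.h>

    void fft_dispatch() {
    #if AT_MKL_ENABLED()
      // MKL-backed implementation.
      //
      // If Config.h had not been included, the unknown identifier AT_MKL_ENABLED
      // would be replaced by 0 inside #if, and "0 ()" is not a valid expression,
      // so the build fails loudly. A plain #ifdef AT_MKL_ENABLED would instead
      // silently fall into the #else branch.
    #else
      // Generic fallback implementation.
    #endif
    }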
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/Context.h b/MLPY/Lib/site-packages/torch/include/ATen/Context.h
new file mode 100644
index 0000000000000000000000000000000000000000..b8e2d98216334f124c52ba2765e69be140f00196
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/Context.h
@@ -0,0 +1,560 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+namespace at {
+
+class Tensor;
+
+enum class TORCH_API Float32MatmulPrecision { HIGHEST, HIGH, MEDIUM };
+
+class TORCH_API Context {
+ public:
+  Context();
+
+  const Generator& defaultGenerator(Device device) {
+    c10::DeviceType device_type = device.type();
+    initCUDAIfNeeded(device_type);
+    initHIPIfNeeded(device_type);
+    if (device_type == at::kCPU) {
+      return at::detail::getDefaultCPUGenerator();
+    } else if (device_type == at::kCUDA) {
+      return at::detail::getCUDAHooks().getDefaultCUDAGenerator(device.index());
+    } else if (device_type == at::kMPS) {
+      return at::detail::getMPSHooks().getDefaultMPSGenerator();
+    } else if (device_type == at::kXPU) {
+      return at::detail::getXPUHooks().getDefaultXPUGenerator(device.index());
+    } else if (device_type == at::kIPU) {
+      return at::detail::getIPUHooks().getDefaultIPUGenerator(device.index());
+    } else if (device_type == at::kPrivateUse1) {
+      return at::GetPrivateUse1HooksInterface()->getDefaultGenerator(
+          device.index());
+    } else {
+      AT_ERROR(c10::DeviceTypeName(device_type), " device type not enabled.");
+    }
+  }
+  const AcceleratorHooksInterface& getAcceleratorHooksInterface(
+      c10::optional opt_device_type = c10::nullopt) {
+    c10::DeviceType device_type = opt_device_type.has_value()
+        ? opt_device_type.value()
+        : at::getAccelerator(true).value();
+    if (device_type == at::kCUDA) {
+      return at::detail::getCUDAHooks();
+    } else if (device_type == at::kMPS) {
+      return at::detail::getMPSHooks();
+    } else if (device_type == at::kPrivateUse1) {
+      return at::detail::getPrivateUse1Hooks();
+    } else {
+      AT_ERROR(
+          c10::DeviceTypeName(device_type), " device type not an accelerator.");
+    }
+  }
+  Device getDeviceFromPtr(void* data, c10::DeviceType device_type) {
+    initCUDAIfNeeded(device_type);
+    initHIPIfNeeded(device_type);
+    initXPUIfNeeded(device_type);
+    if (device_type == at::kCPU) {
+      return c10::DeviceType::CPU;
+    } else if (device_type == at::kCUDA) {
+      return at::detail::getCUDAHooks().getDeviceFromPtr(data);
+    } else if (device_type == at::kXPU) {
+      return at::detail::getXPUHooks().getDeviceFromPtr(data);
+    } else if (device_type == at::kPrivateUse1) {
+      return at::GetPrivateUse1HooksInterface()->getDeviceFromPtr(data);
+    } else {
+      AT_ERROR(c10::DeviceTypeName(device_type), " device type not enabled.");
+    }
+  }
+  static bool isPinnedPtr(const void* data) {
+    return detail::getCUDAHooks().isPinnedPtr(data);
+  }
+  static bool hasOpenMP();
+  static bool hasMKL();
+  static bool hasLAPACK();
+  static bool hasMKLDNN();
+  static bool hasMAGMA() {
+    return detail::getCUDAHooks().hasMAGMA();
+  }
+  static bool hasCUDA() {
+    return detail::getCUDAHooks().hasCUDA();
+  }
+  static bool hasMTIA() {
+    return detail::getMTIAHooks().hasMTIA();
+  }
+  static bool hasCUDART() {
+    return detail::getCUDAHooks().hasCUDART();
+  }
+  static long versionCUDART() {
+    return detail::getCUDAHooks().versionCUDART();
+  }
+  static bool hasCuDNN() {
+    return detail::getCUDAHooks().hasCuDNN();
+  }
+  static long versionCuDNN() {
+    return detail::getCUDAHooks().versionCuDNN();
+  }
+  static bool hasCuSOLVER() {
+    return detail::getCUDAHooks().hasCuSOLVER();
+  }
+  static bool hasHIP() {
+    return detail::getHIPHooks().hasHIP();
+  }
+  static bool hasMPS() {
+    return detail::getMPSHooks().hasMPS();
+  }
+  static bool hasIPU() {
+    return c10::impl::hasDeviceGuardImpl(c10::DeviceType::IPU);
+  }
+  static bool hasXLA() {
+    return c10::impl::hasDeviceGuardImpl(c10::DeviceType::XLA);
+  }
+  static bool hasXPU() {
+    return detail::getXPUHooks().hasXPU();
+  }
+  static bool hasLazy() {
+    return c10::impl::hasDeviceGuardImpl(c10::DeviceType::Lazy);
+  }
+  static bool hasORT() {
+    return c10::impl::hasDeviceGuardImpl(c10::DeviceType::ORT);
+  }
+  // Defined in the header so that getNonVariableType can inline the
+  // call_once check; getNonVariableType is called fairly frequently.
+  void lazyInitCUDA() {
+    c10::call_once(thc_init, [&] { detail::getCUDAHooks().initCUDA(); });
+  }
+  void lazyInitHIP() {
+    c10::call_once(thh_init, [&] { detail::getHIPHooks().initHIP(); });
+  }
+  void lazyInitXPU() {
+    c10::call_once(thx_init, [&] { detail::getXPUHooks().initXPU(); });
+  }
+  void lazyInitPrivateUse1() {
+    c10::call_once(thp_init, [&] {
+      if (isPrivateUse1HooksRegistered()) {
+        at::GetPrivateUse1HooksInterface()->initPrivateUse1();
+      }
+    });
+  }
+  static const at::cuda::NVRTC& getNVRTC() {
+    return detail::getCUDAHooks().nvrtc();
+  }
+
+  static bool setFlushDenormal(bool on);
+
+  // NB: This method *purely* reflects whether or not a user requested
+  // that CuDNN be enabled; it doesn't actually say anything about
+  // whether or not CuDNN is actually usable.  Use cudnn_is_acceptable
+  // to test this instead.
+  bool userEnabledCuDNN() const;
+  void setUserEnabledCuDNN(bool e);
+  bool userEnabledMkldnn() const;
+  void setUserEnabledMkldnn(bool e);
+  bool benchmarkCuDNN() const;
+  void setBenchmarkCuDNN(bool);
+  int benchmarkLimitCuDNN() const;
+  void setBenchmarkLimitCuDNN(int);
+  bool deterministicCuDNN() const;
+  void setDeterministicCuDNN(bool);
+  bool userEnabledNNPACK() const;
+  void setUserEnabledNNPACK(bool e);
+
+  // Note [Disabling Fused SDP Kernels]
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // Flash and Memory Efficient SDP kernels are enabled by default.
+  // However, they can be disabled by setting
+  // at::globalContext().setUserEnabledFlashSDP(false) flag.
+  // This is useful for debugging purposes. For example, if you want to
+  // compare the performance of the flash SDP kernels with the unfused
+  // kernel, you can disable the flash SDP kernels. By disabling
+  // the math SDP kernel, you can force your code to use flash kernels.
+  // The math SDP kernel can be disabled by setting
+  // at::globalContext().setUserEnabledMathSDP(false) flag.
+  void setSDPUseFlash(bool);
+  bool userEnabledFlashSDP() const;
+
+  void setSDPUseMemEfficient(bool);
+  bool userEnabledMemEfficientSDP() const;
+
+  void setSDPUseMath(bool);
+  bool userEnabledMathSDP() const;
+
+  void setSDPUseCuDNN(bool);
+  bool userEnabledCuDNNSDP() const;
+
+  at::LinalgBackend linalgPreferredBackend() const;
+  void setLinalgPreferredBackend(at::LinalgBackend);
+
+  // Note [Enabling Deterministic Operations]
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // Operations in PyTorch that normally act nondeterministically, but have an
+  // alternate deterministic implementation, should satisfy the following
+  // requirements:
+  //
+  // * Include this comment: "See Note [Enabling Deterministic Operations]"
+  //
+  // * Check the value of `at::globalContext().deterministicAlgorithms()` to
+  // toggle
+  //   between nondeterministic and deterministic implementations.
+  //
+  // * Have an entry in the list of PyTorch operations that toggle between
+  // nondeterministic
+  //   and deterministic implementations, in the docstring of
+  //   `use_deterministic_algorithms()` in torch/__init__.py
+  //
+  // `example_func()` below shows an example of toggling between
+  // nondeterministic and deterministic implementations:
+  //
+  //    void example_func() {
+  //      // See Note [Enabling Deterministic Operations]
+  //      if (at::globalContext().deterministicAlgorithms()) {
+  //        example_func_deterministic();
+  //      } else {
+  //        example_func_nondeterministic();
+  //      }
+  //    }
+
+  bool deterministicAlgorithms() const;
+  bool deterministicAlgorithmsWarnOnly() const;
+  void setDeterministicAlgorithms(bool, bool);
+  bool deterministicFillUninitializedMemory() const;
+  void setDeterministicFillUninitializedMemory(bool);
+
+  // Note [Writing Nondeterministic Operations]
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // Operations in PyTorch that act nondeterministically and do not have an
+  // alternate deterministic implementation should satisfy the following
+  // requirements:
+  //
+  // * Include this comment: "See Note [Writing Nondeterministic Operations]"
+  //
+  // * Include a comment explaining why the operation is nondeterministic.
+  //
+  // * Throw an error when `Context::deterministicAlgorithms()` is true. Most
+  //   of the time, this should be accomplished by calling
+  //   `at::globalContext().alertNotDeterministic()`.  However, if the
+  //   nondeterministic behavior is caused by the CuBLAS workspace
+  //   configuration in CUDA >= 10.2,
+  //   `at::globalContext().alertCuBLASConfigNotDeterministic()` should be
+  //   called instead (in this case, a comment explaining why the operation is
+  //   nondeterministic is not necessary). See below for details on these
+  //   methods.
+  //
+  // * Have an entry in the list of nondeterministic PyTorch operations in the
+  //   docstring of `use_deterministic_algorithms()` in torch/__init__.py
+  //
+  // * Have a test function in `test/test_torch.py` whose name begins with
+  //   `test_nondeterministic_alert_`. Alternatively, if CuBLAS workspace
+  //   configuration is the reason for nondeterminism, the operation should be
+  //   included in the `test_cublas_config_nondeterministic_alert` test. Any new
+  //   tests should ideally follow a pattern similar to the existing ones.
+  //
+  // `example_func()` below shows an example of the comments and error-throwing
+  // code for a nondeterministic operation:
+  //
+  //    void example_func() {
+  //      // See Note [Writing Nondeterministic Operations]
+  //      // Nondeterministic because 
+  //      at::globalContext().alertNondeterministic("example_func");
+  //      ...
+  //    }
+
+  // Throws an error if `Context::deterministicAlgorithms()` is true
+  static void alertNotDeterministic(c10::string_view const& caller);
+
+  // Throws an error if `Context::deterministicAlgorithms()` is true, CUDA
+  // >= 10.2, and CUBLAS_WORKSPACE_CONFIG is not set to either ":16:8" or
+  // ":4096:8". For more details:
+  // https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility
+  void alertCuBLASConfigNotDeterministic() const;
+
+  void setFloat32MatmulPrecision(const std::string& s);
+  bool allowTF32CuDNN() const;
+  void setAllowTF32CuDNN(bool);
+  bool allowTF32CuBLAS() const;
+  void setAllowTF32CuBLAS(bool);
+  Float32MatmulPrecision float32MatmulPrecision() const;
+  void setFloat32MatmulPrecision(Float32MatmulPrecision p);
+  bool allowFP16ReductionCuBLAS() const;
+  void setAllowFP16ReductionCuBLAS(bool);
+  bool allowBF16ReductionCuBLAS() const;
+  void setAllowBF16ReductionCuBLAS(bool);
+  at::QEngine qEngine() const;
+  void setQEngine(at::QEngine e);
+  static const std::vector& supportedQEngines();
+  static bool isXNNPACKAvailable();
+  void setCheckSparseTensorInvariants(bool e);
+  bool checkSparseTensorInvariants() const;
+  // This method is used to release the original weight after pre-packing.
+  // It should be called once before loading/running the model.
+  // NB: By default it is set to true for mobile builds.
+  void setReleaseWeightsWhenPrepacking(bool e);
+  bool releaseWeightsWhenPrepacking() const;
+
+  void setDisplayVmapFallbackWarnings(bool enabled);
+  bool areVmapFallbackWarningsEnabled() const;
+
+  void setDefaultMobileCPUAllocator();
+  void unsetDefaultMobileCPUAllocator();
+  bool allowFP16ReductionCPU() const;
+  void setAllowFP16ReductionCPU(bool);
+
+ private:
+  void initCUDAIfNeeded(c10::DeviceType p) {
+    if (p == c10::DeviceType::CUDA) {
+      lazyInitCUDA();
+    }
+  }
+  void initHIPIfNeeded(c10::DeviceType p) {
+    if (p == c10::DeviceType::HIP) {
+      lazyInitHIP();
+    }
+  }
+  void initXPUIfNeeded(c10::DeviceType p) {
+    if (p == c10::DeviceType::XPU) {
+      lazyInitXPU();
+    }
+  }
+  static bool checkCuBLASConfigDeterministic();
+  c10::once_flag thc_init;
+  c10::once_flag thh_init;
+  c10::once_flag thx_init;
+  c10::once_flag thp_init;
+  bool enabled_cudnn = true;
+  bool deterministic_cudnn = false;
+  bool _deterministic_algorithms = false;
+  bool _deterministic_algorithms_warn_only = false;
+  bool _deterministic_fill_uninitialized_memory = true;
+  bool enabled_flashSDP = true;
+  bool enabled_mem_efficientSDP = true;
+  bool enabled_mathSDP = true;
+  bool enabled_cudnnSDP = false;
+#ifdef USE_ROCM
+  bool benchmark_cudnn = true;
+#else
+  bool benchmark_cudnn = false;
+#endif
+  Float32MatmulPrecision float32_matmul_precision =
+      c10::utils::check_env("TORCH_ALLOW_TF32_CUBLAS_OVERRIDE") == true
+      ? at::Float32MatmulPrecision::HIGH
+      : at::Float32MatmulPrecision::HIGHEST;
+  int benchmark_limit_cudnn = 10;
+  bool allow_tf32_cudnn = true;
+  bool allow_fp16_reduction_cublas = true;
+  bool allow_bf16_reduction_cublas = true;
+  bool enabled_mkldnn = true;
+  bool enabled_nnpack = true;
+  at::LinalgBackend linalg_preferred_backend =
+      c10::utils::check_env("TORCH_LINALG_PREFER_CUSOLVER") == true
+      ? at::LinalgBackend::Cusolver
+      : at::LinalgBackend::Default;
+#ifdef C10_MOBILE
+  bool release_original_weights = true;
+#else
+  bool release_original_weights = false;
+#endif
+  bool display_vmap_fallback_warnings_ = false;
+  c10::optional quantized_engine = c10::nullopt;
+  bool enable_sparse_tensor_invariant_checks = false;
+  bool allow_fp16_reduction_cpu = false;
+
+  Allocator* prev_allocator_ptr_{nullptr};
+};
+
+TORCH_API Context& globalContext();
+
+static inline void init() {
+  globalContext();
+}
+
+TORCH_API Allocator* getCPUAllocator();
+
+static inline DeprecatedTypeProperties& getDeprecatedTypeProperties(
+    Backend p,
+    ScalarType s) {
+  return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
+      p, s);
+}
+
+static inline DeprecatedTypeProperties& CPU(ScalarType s) {
+  return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
+      Backend::CPU, s);
+}
+
+static inline DeprecatedTypeProperties& CUDA(ScalarType s) {
+  return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
+      Backend::CUDA, s);
+}
+
+static inline DeprecatedTypeProperties& HIP(ScalarType s) {
+  return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
+      Backend::HIP, s);
+}
+
+static inline DeprecatedTypeProperties& MPS(ScalarType s) {
+  return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
+      Backend::MPS, s);
+}
+
+static inline bool hasCUDA() {
+  return globalContext().hasCUDA();
+}
+
+static inline bool hasMTIA() {
+  return globalContext().hasMTIA();
+}
+
+static inline bool hasHIP() {
+  return globalContext().hasHIP();
+}
+
+static inline bool hasIPU() {
+  return globalContext().hasIPU();
+}
+
+static inline bool hasXLA() {
+  return globalContext().hasXLA();
+}
+
+static inline bool hasMPS() {
+  return globalContext().hasMPS();
+}
+
+static inline bool hasORT() {
+  return globalContext().hasORT();
+}
+
+static inline bool hasXPU() {
+  return globalContext().hasXPU();
+}
+
+// Despite its name, this function returns the number of *CUDA* GPUs.
+static inline size_t getNumGPUs() {
+  // WARNING: DO NOT ADD LOGIC TO HANDLE OTHER DEVICE TYPES TO THIS
+  // FUNCTION.  If you are interested in interrogating the number of
+  // devices for a specific device type, add that function to the
+  // relevant library (e.g., similar to at::cuda::device_count())
+  if (hasCUDA() && hasHIP()) {
+    throw std::runtime_error(
+        "Enabling both CUDA and HIP in ATen is not supported, as HIP masquerades "
+        "to be CUDA (e.g., when you say CUDA, on a HIP build of ATen, this actually "
+        "means HIP.  Rebuild PyTorch with one or the other disabled.");
+  } else if (hasCUDA()) {
+    return detail::getCUDAHooks().getNumGPUs();
+  } else if (hasHIP()) {
+    return detail::getHIPHooks().getNumGPUs();
+  } else {
+    return 0;
+  }
+}
+
+static inline bool hasOpenMP() {
+  return globalContext().hasOpenMP();
+}
+
+static inline bool hasMKL() {
+  return globalContext().hasMKL();
+}
+
+static inline bool hasLAPACK() {
+  return globalContext().hasLAPACK();
+}
+
+static inline bool hasMAGMA() {
+  return globalContext().hasMAGMA();
+}
+
+static inline bool hasMKLDNN() {
+  return globalContext().hasMKLDNN();
+}
+
+static inline void manual_seed(uint64_t seed) {
+  auto gen = globalContext().defaultGenerator(c10::DeviceType::CPU);
+  {
+    // See Note [Acquire lock when using random generators]
+    std::lock_guard lock(gen.mutex());
+    gen.set_current_seed(seed);
+  }
+  // NB: Sometimes we build with CUDA, but we don't have any GPUs
+  // available. In that case, we must not seed CUDA; it will fail!
+  const auto cuda_num_gpus = detail::getCUDAHooks().getNumGPUs();
+  if (hasCUDA() && cuda_num_gpus > 0) {
+    for (const auto i : c10::irange(cuda_num_gpus)) {
+      auto cuda_gen = globalContext().defaultGenerator(
+          Device(at::kCUDA, static_cast(i)));
+      {
+        // See Note [Acquire lock when using random generators]
+        std::lock_guard lock(cuda_gen.mutex());
+        cuda_gen.set_current_seed(seed);
+      }
+    }
+  }
+
+  const auto xpu_num_gpus = detail::getXPUHooks().getNumGPUs();
+  if (hasXPU() && xpu_num_gpus) {
+    for (const auto i : c10::irange(xpu_num_gpus)) {
+      auto xpu_gen = globalContext().defaultGenerator(
+          Device(at::kXPU, static_cast(i)));
+      {
+        // See Note [Acquire lock when using random generators]
+        std::lock_guard lock(xpu_gen.mutex());
+        xpu_gen.set_current_seed(seed);
+      }
+    }
+  }
+
+  if (hasMPS()) {
+    auto mps_gen = globalContext().defaultGenerator(c10::DeviceType::MPS);
+    // See Note [Acquire lock when using random generators]
+    std::lock_guard lock(mps_gen.mutex());
+    mps_gen.set_current_seed(seed);
+  }
+}
+
+// When the global flag `allow_tf32` is set to true, cuBLAS handles are
+// automatically configured to use math mode CUBLAS_TF32_TENSOR_OP_MATH.
+// For some operators, such as addmv, TF32 offers no performance improvement
+// but causes precision loss. To help this case, this class implements
+// a RAII guard that can be used to quickly disable TF32 within its scope.
+//
+// Usage:
+//     NoTF32Guard disable_tf32;
+struct TORCH_API NoTF32Guard {
+  NoTF32Guard();
+  ~NoTF32Guard();
+  static bool should_disable_tf32();
+
+ private:
+  bool changed = false;
+};
+
+struct TORCH_API ROCmBackwardPassGuard {
+  ROCmBackwardPassGuard();
+  ~ROCmBackwardPassGuard();
+  static bool is_backward_pass();
+};
+
+} // namespace at
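Editor's note: a hedged sketch of how the toggles documented above are typically driven from C++ through at::globalContext(); the specific values are arbitrary examples, not recommendations:

    #include <ATen/Context.h>

    void configure_runtime() {
      at::Context& ctx = at::globalContext();

      // See Note [Enabling Deterministic Operations]: error (not merely warn)
      // when an op has no deterministic implementation.
      ctx.setDeterministicAlgorithms(/*deterministic=*/true, /*warn_only=*/false);

      // See Note [Disabling Fused SDP Kernels]: force the math fallback by
      // turning the fused kernels off.
      ctx.setSDPUseFlash(false);
      ctx.setSDPUseMemEfficient(false);
      ctx.setSDPUseMath(true);

      // Seed the default generator of every available backend in one call.
      at::manual_seed(42);
    }

    void precision_sensitive_section() {
      at::NoTF32Guard disable_tf32;  // TF32 stays off until this guard is destroyed
      // ... matmuls that must not use TF32 ...
    }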
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/DLConvertor.h b/MLPY/Lib/site-packages/torch/include/ATen/DLConvertor.h
new file mode 100644
index 0000000000000000000000000000000000000000..70254bc97a4f61b531bc8491d0ba7b93d253fc63
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/DLConvertor.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+// this convertor will:
+// 1) take a Tensor object and wrap it in the DLPack tensor
+// 2) take a dlpack tensor and convert it to the ATen Tensor
+
+namespace at {
+
+TORCH_API ScalarType toScalarType(const DLDataType& dtype);
+TORCH_API DLManagedTensor* toDLPack(const Tensor& src);
+TORCH_API Tensor fromDLPack(DLManagedTensor* src);
+C10_DEPRECATED_MESSAGE("Please migrate to a non-const variant")
+inline Tensor fromDLPack(const DLManagedTensor* src) {
+  return fromDLPack(const_cast(src));
+}
+TORCH_API Tensor
+fromDLPack(DLManagedTensor* src, std::function deleter);
+TORCH_API DLDataType getDLDataType(const Tensor& t);
+TORCH_API DLDevice getDLContext(const Tensor& tensor, const int64_t& device_id);
+
+} // namespace at
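Editor's note: a hedged round-trip sketch for the converter declared above. The ownership comment reflects the usual DLPack convention (fromDLPack consumes the DLManagedTensor) and is stated as an assumption rather than a guarantee:

    #include <ATen/ATen.h>
    #include <ATen/DLConvertor.h>

    void dlpack_round_trip() {
      at::Tensor src = at::rand({2, 3});

      // Wrap the ATen tensor in a DLPack capsule; the capsule keeps src's
      // storage alive through its deleter.
      DLManagedTensor* managed = at::toDLPack(src);

      // Hand the capsule back to ATen. fromDLPack takes ownership of `managed`,
      // so it must not be deleted or reused afterwards.
      at::Tensor roundtripped = at::fromDLPack(managed);

      TORCH_CHECK(roundtripped.sizes() == src.sizes());
    }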
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/Device.h b/MLPY/Lib/site-packages/torch/include/ATen/Device.h
new file mode 100644
index 0000000000000000000000000000000000000000..77626cce2465850485e137b148845ee38b9ebb4d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/Device.h
@@ -0,0 +1,2 @@
+#pragma once
+#include 
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/DeviceAccelerator.h b/MLPY/Lib/site-packages/torch/include/ATen/DeviceAccelerator.h
new file mode 100644
index 0000000000000000000000000000000000000000..ea564ed66b2e755e441c0338cf30290c90af96fa
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/DeviceAccelerator.h
@@ -0,0 +1,27 @@
+#pragma once
+
+#include 
+#include 
+
+#include 
+#include 
+
+// This file defines the top level Accelerator concept for PyTorch.
+// A device is an accelerator per the definition here if:
+// - It is mutually exclusive with all other accelerators
+// - It performs asynchronous compute via a Stream/Event system
+// - It provides a set of common APIs as defined by AcceleratorHooksInterface
+//
+// As of today, accelerator devices are (in no particular order):
+// CUDA, MTIA, PrivateUse1
+// We want to add once all the proper APIs are supported and tested:
+// HIP, MPS, XPU
+
+namespace at {
+
+// Ensures that only one accelerator is available (at
+// compile time if possible) and return it.
+// When checked is true, the returned optional always has a value.
+TORCH_API std::optional getAccelerator(bool checked = false);
+
+} // namespace at
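Editor's note: a small sketch of querying the active accelerator with the API above; with checked=false the optional may be empty (e.g. a CPU-only build), while checked=true is documented to always return a value:

    #include <ATen/DeviceAccelerator.h>
    #include <iostream>
    #include <optional>

    void report_accelerator() {
      std::optional<c10::DeviceType> acc = at::getAccelerator(/*checked=*/false);
      if (acc.has_value()) {
        std::cout << "accelerator: " << c10::DeviceTypeName(*acc) << '\n';
      } else {
        std::cout << "no accelerator available in this build/runtime\n";
      }
    }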
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/DeviceGuard.h b/MLPY/Lib/site-packages/torch/include/ATen/DeviceGuard.h
new file mode 100644
index 0000000000000000000000000000000000000000..0cd52c27cd0b984f96097b8efed095a0e8a84016
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/DeviceGuard.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include  // TensorList whyyyyy
+
+namespace at {
+
+// Are you here because you're wondering why DeviceGuard(tensor) no
+// longer works?  For code organization reasons, we have temporarily(?)
+// removed this constructor from DeviceGuard.  The new way to
+// spell it is:
+//
+//    OptionalDeviceGuard guard(device_of(tensor));
+
+/// Return the Device of a Tensor, if the Tensor is defined.
+inline c10::optional device_of(const Tensor& t) {
+  if (t.defined()) {
+    return c10::make_optional(t.device());
+  } else {
+    return c10::nullopt;
+  }
+}
+
+inline c10::optional device_of(const c10::optional& t) {
+  return t.has_value() ? device_of(t.value()) : c10::nullopt;
+}
+
+/// Return the Device of a TensorList, if the list is non-empty and
+/// the first Tensor is defined.  (This function implicitly assumes
+/// that all tensors in the list have the same device.)
+inline c10::optional device_of(ITensorListRef t) {
+  if (!t.empty()) {
+    return device_of(t.front());
+  } else {
+    return c10::nullopt;
+  }
+}
+
+} // namespace at
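Editor's note: the comment at the top of this header points to OptionalDeviceGuard as the replacement for the removed DeviceGuard(tensor) constructor; a minimal sketch (the c10/core/DeviceGuard.h include is an assumption about where c10::OptionalDeviceGuard lives):

    #include <ATen/DeviceGuard.h>
    #include <c10/core/DeviceGuard.h>  // assumed location of c10::OptionalDeviceGuard

    void run_on_tensors_device(const at::Tensor& t) {
      // device_of(t) is nullopt for an undefined tensor, making the guard a no-op.
      c10::OptionalDeviceGuard guard(at::device_of(t));
      // ... work launched here is directed at t's device ...
    }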
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/DimVector.h b/MLPY/Lib/site-packages/torch/include/ATen/DimVector.h
new file mode 100644
index 0000000000000000000000000000000000000000..0a854a378782824f756ff054d39965d259054351
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/DimVector.h
@@ -0,0 +1,2 @@
+#pragma once
+#include 
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/Dimname.h b/MLPY/Lib/site-packages/torch/include/ATen/Dimname.h
new file mode 100644
index 0000000000000000000000000000000000000000..9a93a8e38f8f25d42131a320ecf54a55c59bb481
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/Dimname.h
@@ -0,0 +1 @@
+#include 
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/Dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/Dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..d08a04b45e4244b15e845eecac8dc365112ee3f0
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/Dispatch.h
@@ -0,0 +1,808 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#ifdef __CUDACC__
+#include  // For CUDA_VERSION
+#endif
+
+#ifdef TEMPLATE_SELECTIVE_BUILD
+#include 
+#else
+namespace at {
+/**
+ * The method should_include_kernel_dtype() returns true/false
+ * based on whether the switching code for a specific dtype should be
+ * included based on build time constants generated from tracing model
+ * execution. This method will be implemented via code-generation and
+ * included in this file when code-gen is ready.
+ */
+inline constexpr bool should_include_kernel_dtype(
+    const char* /*kernel_tag_str*/,
+    at::ScalarType /*scalar_type*/
+) {
+  return true;
+}
+} // namespace at
+#endif
+
+/**
+ * In the Facebook internal build (using BUCK), this macro is enabled by
+ * passing in -c pt.enable_record_kernel_dtype=1 when building the tracer
+ * binary.
+ */
+#if defined ENABLE_RECORD_KERNEL_FUNCTION_DTYPE
+namespace at {
+namespace detail {
+TORCH_API void record_kernel_function_dtype(std::string name);
+}
+} // namespace at
+
+#define RECORD_KERNEL_FUNCTION_DTYPE(NAME, enum_type) \
+  at::detail::record_kernel_function_dtype(           \
+      std::string(NAME) + "$" + toString(enum_type));
+#else
+#define RECORD_KERNEL_FUNCTION_DTYPE(NAME, enum_type)
+#endif
+
+#define AT_PRIVATE_CHECK_SELECTIVE_BUILD(enum_type)   \
+  do {                                                \
+    if constexpr (!at::should_include_kernel_dtype(   \
+                      at_dispatch_name, enum_type)) { \
+      AT_ERROR(                                       \
+          "dtype '",                                  \
+          toString(enum_type),                        \
+          "' not selected for kernel tag ",           \
+          at_dispatch_name);                          \
+    }                                                 \
+  } while (0)
+
+#define AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, HINT, ...)           \
+  case enum_type: {                                                     \
+    AT_PRIVATE_CHECK_SELECTIVE_BUILD(enum_type);                        \
+    using HINT C10_UNUSED = c10::impl::ScalarTypeToCPPTypeT; \
+    return __VA_ARGS__();                                               \
+  }
+
+#define AT_DISPATCH_CASE(enum_type, ...) \
+  AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, scalar_t, __VA_ARGS__)
+
+#define AT_DISPATCH_CASE_QINT(enum_type, scalar_type, ...)            \
+  case enum_type: {                                                   \
+    AT_PRIVATE_CHECK_SELECTIVE_BUILD(enum_type);                      \
+    using scalar_t = scalar_type;                                     \
+    using underlying_t C10_UNUSED = typename scalar_t::underlying;    \
+    const auto& SCALAR_TYPE C10_UNUSED = enum_type;                   \
+    const auto& UNDERLYING_TYPE C10_UNUSED = toUnderlying(enum_type); \
+    return __VA_ARGS__();                                             \
+  }
+
+#define AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE(                           \
+    enum_type, scalar_type, bitwidth, qmin, qmax, ...)                \
+  case enum_type: {                                                   \
+    AT_PRIVATE_CHECK_SELECTIVE_BUILD(enum_type);                      \
+    using scalar_t = scalar_type;                                     \
+    using underlying_t C10_UNUSED = typename scalar_t::underlying;    \
+    const auto& SCALAR_TYPE C10_UNUSED = enum_type;                   \
+    const auto& UNDERLYING_TYPE C10_UNUSED = toUnderlying(enum_type); \
+    C10_UNUSED int bit_width = bitwidth;                              \
+    C10_UNUSED int64_t quant_min = qmin;                              \
+    C10_UNUSED int64_t quant_max = qmax;                              \
+    return __VA_ARGS__();                                             \
+  }
+
+namespace detail {
+
+inline at::ScalarType scalar_type(at::ScalarType s) {
+  return s;
+}
+
+C10_DEPRECATED_MESSAGE(
+    "passing at::DeprecatedTypeProperties to an AT_DISPATCH macro is deprecated, "
+    "pass an at::ScalarType instead")
+inline at::ScalarType scalar_type(const at::DeprecatedTypeProperties& t) {
+  return t.scalarType();
+}
+
+C10_DEPRECATED_MESSAGE(
+    "AT_DISPATCH_ALL_TYPES_AND_HALF is deprecated, "
+    "use AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, ...) instead")
+inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF() {}
+
+C10_DEPRECATED_MESSAGE(
+    "AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX is deprecated, "
+    "use AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(at::ScalarType::Half, ...) "
+    "instead")
+inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {}
+
+} // namespace detail
+
+// The AT_DISPATCH_* family of macros provides the ability to
+// conveniently generate specializations of a kernel over all of the
+// dtypes we care about in PyTorch.  We call it "dispatch" because
+// we are "dispatching" to the correct, dtype-specific kernel.
+//
+// A standard usage looks like:
+//
+//      AT_DISPATCH_ALL_TYPES(self.scalar_type(), "op_name", [&] {
+//          // Your code here, with 'scalar_t' now defined to
+//          // be the dtype in question
+//      });
+//
+// There are many variations of this macro, so it's important to
+// understand exactly /which/ dtypes you want to get instantiated, as
+// well as what the "default" set is.
+//
+// The default set of dtypes that are instantiated (e.g., by
+// AT_DISPATCH_ALL_TYPES) are floating point types (float, double),
+// and integral types (int32_t, int64_t, int16_t, int8_t, uint8_t),
+// but NOT booleans (bool), half-precision floats (Half) or
+// complex number (c10::complex, c10::complex).
+// This "cut" is somewhat historical (the default types are the
+// ones that TH historically supported), but it also reflects the
+// fact that the non-default types are "poorly" behaved (booleans
+// are NOT integers mod 2, half precision operations ~essentially
+// don't exist on CPU, complex numbers are an experimental application).
+//
+// Here are the questions you should generally ask to decide which
+// dispatch you want:
+//
+// 1. Is this an integral or floating point specific operation?
+//    (If so, you'll want one of the FLOATING or INTEGRAL macros.)
+//
+// 2. Should half be supported?  (If you're on CPU, the answer is almost
+//    definitely no.  If you do want support, use one of the AND_HALF
+//    macros)
+//
+// Much rarer situations:
+//
+// 3. Should bool be supported?  (You often have to write your kernel
+//    differently if arithmetic operations are involved.)  If so,
+//    Use AT_DISPATCH_ALL_TYPES_AND along with ScalarType::Bool
+//
+// 4. Should complex be supported?  The answer is almost always no,
+//    unless you are working on "generic" code that should work on
+//    all dtypes.
+//
+// Parameters:
+// -----------
+//
+// 1. The NAME argument is a "tag" that is used to trace and then
+//    conditionally compile fragments of the case statements such
+//    that the kernel functions are specialized only for the dtypes
+//    that are needed. The NAME parameter *must* be a build time
+//    const char* (can't be std::string, etc...)
+//
+// Please ensure that the NAME is unique for every implementation
+// or you run the risk of over-including code for the kernel
+// functions. There is no risk of missing out on any code, so
+// it's mostly a risk of a Type-2 error, and not a Type-1 error.
+//
+// Switch-like syntax:
+// -------------------
+// There is also a switch-case like syntax which is useful if a kernel
+// needs to be specialized for particular scalar types
+//
+//      AT_DISPATCH_SWITCH(self.scalar_type(), "op_name",
+//          AT_DISPATCH_CASE_INTEGRAL_TYPES([&] {
+//            op_integral(iter);
+//          })
+//          AT_DISPATCH_CASE_FLOATING_TYPES([&] {
+//            op_floating(iter);
+//          })
+//          AT_DISPATCH_CASE(kBool, [&] {
+//            op_bool(iter);
+//          })
+//      );
+//
+// For each AT_DISPATCH_FOO macro, there is a corresponding
+// AT_DISPATCH_CASE_FOO macro which can be used inside of an
+// AT_DISPATCH_SWITCH block.
+
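Editor's note: the usage snippets in the comment above are schematic; below is a hedged, self-contained sketch of a small CPU helper dispatched over the floating-point dtypes (the function name and in-place scaling are illustrative, not an ATen operator, and the tensor is assumed contiguous):

    #include <ATen/ATen.h>
    #include <ATen/Dispatch.h>

    // Scales a contiguous floating-point CPU tensor in place; the lambda is
    // instantiated once per dtype with scalar_t bound to float / double.
    void scale_(at::Tensor& self, double factor) {
      AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "scale_", [&] {
        scalar_t* data = self.data_ptr<scalar_t>();
        const auto s = static_cast<scalar_t>(factor);
        for (int64_t i = 0; i < self.numel(); ++i) {
          data[i] *= s;
        }
      });
    }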
+// NB: the the_type variable is not used, but we have kept it for
+// backwards compatibility.  It's probably not used by anyone though;
+// but we're just being safe (and it doesn't hurt.)  Note we must
+// use it to shut up warnings about unused store.
+
+#define AT_DISPATCH_SWITCH(TYPE, NAME, ...)                                 \
+  [&] {                                                                     \
+    const auto& the_type = TYPE;                                            \
+    constexpr const char* at_dispatch_name = NAME;                          \
+    /* don't use TYPE again in case it is an expensive or side-effect op */ \
+    at::ScalarType _st = ::detail::scalar_type(the_type);                   \
+    RECORD_KERNEL_FUNCTION_DTYPE(at_dispatch_name, _st);                    \
+    switch (_st) {                                                          \
+      __VA_ARGS__                                                           \
+      default:                                                              \
+        AT_ERROR(                                                           \
+            '"',                                                            \
+            at_dispatch_name,                                               \
+            "\" not implemented for '",                                     \
+            toString(_st),                                                  \
+            "'");                                                           \
+    }                                                                       \
+  }()
+
+#define AT_DISPATCH_CASE_FLOATING_TYPES(...)            \
+  AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \
+  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)
+
+#define AT_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
+
+#define AT_DISPATCH_CASE_FLOATING_TYPES_AND_HALF(...)   \
+  AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \
+  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)  \
+  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)
+
+#define AT_DISPATCH_FLOATING_TYPES_AND_HALF(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                        \
+      TYPE, NAME, AT_DISPATCH_CASE_FLOATING_TYPES_AND_HALF(__VA_ARGS__))
+
+#define AT_DISPATCH_CASE_REDUCED_FLOATING_TYPES(...)  \
+  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
+  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
+
+#define AT_DISPATCH_REDUCED_FLOATING_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                       \
+      TYPE, NAME, AT_DISPATCH_CASE_REDUCED_FLOATING_TYPES(__VA_ARGS__))
+
+#define AT_DISPATCH_CASE_FLOATING_TYPES_AND(SCALARTYPE, ...) \
+  AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)               \
+  AT_DISPATCH_CASE(SCALARTYPE, __VA_ARGS__)
+
+#define AT_DISPATCH_FLOATING_TYPES_AND(SCALARTYPE, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                               \
+      TYPE,                                                         \
+      NAME,                                                         \
+      AT_DISPATCH_CASE_FLOATING_TYPES_AND(SCALARTYPE, __VA_ARGS__))
+
+#define AT_DISPATCH_CASE_FLOATING_TYPES_AND2(SCALARTYPE1, SCALARTYPE2, ...) \
+  AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)                              \
+  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)                                \
+  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)
+
+#define AT_DISPATCH_FLOATING_TYPES_AND2(       \
+    SCALARTYPE1, SCALARTYPE2, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                          \
+      TYPE,                                    \
+      NAME,                                    \
+      AT_DISPATCH_CASE_FLOATING_TYPES_AND2(    \
+          SCALARTYPE1, SCALARTYPE2, __VA_ARGS__))
+
+#define AT_DISPATCH_CASE_FLOATING_TYPES_AND3(   \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, ...) \
+  AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)  \
+  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)    \
+  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)    \
+  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)
+
+#define AT_DISPATCH_FLOATING_TYPES_AND3(                    \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                       \
+      TYPE,                                                 \
+      NAME,                                                 \
+      AT_DISPATCH_CASE_FLOATING_TYPES_AND3(                 \
+          SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, __VA_ARGS__))
+
+#define AT_DISPATCH_CASE_FLOATING_TYPES_AND4(                \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, ...) \
+  AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)               \
+  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)                 \
+  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)                 \
+  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)                 \
+  AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__)
+
+#define AT_DISPATCH_FLOATING_TYPES_AND4(                                 \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                                    \
+      TYPE,                                                              \
+      NAME,                                                              \
+      AT_DISPATCH_CASE_FLOATING_TYPES_AND4(                              \
+          SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, __VA_ARGS__))
+
+#define AT_DISPATCH_CASE_COMPLEX_TYPES(...)                    \
+  AT_DISPATCH_CASE(at::ScalarType::ComplexDouble, __VA_ARGS__) \
+  AT_DISPATCH_CASE(at::ScalarType::ComplexFloat, __VA_ARGS__)
+
+#define AT_DISPATCH_COMPLEX_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_COMPLEX_TYPES(__VA_ARGS__))
+
+#define AT_DISPATCH_CASE_COMPLEX_TYPES_AND(SCALARTYPE, ...) \
+  AT_DISPATCH_CASE_COMPLEX_TYPES(__VA_ARGS__)               \
+  AT_DISPATCH_CASE(SCALARTYPE, __VA_ARGS__)
+
+#define AT_DISPATCH_COMPLEX_TYPES_AND(SCALARTYPE, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                              \
+      TYPE, NAME, AT_DISPATCH_CASE_COMPLEX_TYPES_AND(SCALARTYPE, __VA_ARGS__))
+
+#define AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES(...) \
+  AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)           \
+  AT_DISPATCH_CASE_COMPLEX_TYPES(__VA_ARGS__)
+
+#define AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                           \
+      TYPE, NAME, AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES(__VA_ARGS__))
+
+#define AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND1(SCALARTYPE, ...) \
+  AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES(__VA_ARGS__)                \
+  AT_DISPATCH_CASE(SCALARTYPE, __VA_ARGS__)
+
+#define AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(    \
+    SCALARTYPE, TYPE, NAME, ...)                        \
+  AT_DISPATCH_SWITCH(                                   \
+      TYPE,                                             \
+      NAME,                                             \
+      AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND1( \
+          SCALARTYPE, __VA_ARGS__))
+
+#define AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND2(  \
+    SCALARTYPE1, SCALARTYPE2, ...)                         \
+  AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES(__VA_ARGS__) \
+  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)               \
+  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)
+
+#define AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(    \
+    SCALARTYPE1, SCALARTYPE2, TYPE, NAME, ...)          \
+  AT_DISPATCH_SWITCH(                                   \
+      TYPE,                                             \
+      NAME,                                             \
+      AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND2( \
+          SCALARTYPE1, SCALARTYPE2, __VA_ARGS__))
+
+#define AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND3(  \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, ...)            \
+  AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES(__VA_ARGS__) \
+  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)               \
+  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)               \
+  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)
+
+#define AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND3(        \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                       \
+      TYPE,                                                 \
+      NAME,                                                 \
+      AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND3(     \
+          SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, __VA_ARGS__))
+
+#define AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND4(    \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, ...) \
+  AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES(__VA_ARGS__)   \
+  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)                 \
+  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)                 \
+  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)                 \
+  AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__)
+
+#define AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND4(                     \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                                    \
+      TYPE,                                                              \
+      NAME,                                                              \
+      AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND4(                  \
+          SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, __VA_ARGS__))
+
+#define AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND5(                 \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, ...) \
+  AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES(__VA_ARGS__)                \
+  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)                              \
+  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)                              \
+  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)                              \
+  AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__)                              \
+  AT_DISPATCH_CASE(SCALARTYPE5, __VA_ARGS__)
+
+#define AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND5(    \
+    SCALARTYPE1,                                        \
+    SCALARTYPE2,                                        \
+    SCALARTYPE3,                                        \
+    SCALARTYPE4,                                        \
+    SCALARTYPE5,                                        \
+    TYPE,                                               \
+    NAME,                                               \
+    ...)                                                \
+  AT_DISPATCH_SWITCH(                                   \
+      TYPE,                                             \
+      NAME,                                             \
+      AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND5( \
+          SCALARTYPE1,                                  \
+          SCALARTYPE2,                                  \
+          SCALARTYPE3,                                  \
+          SCALARTYPE4,                                  \
+          SCALARTYPE5,                                  \
+          __VA_ARGS__))
+
+#define AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND6(  \
+    SCALARTYPE1,                                           \
+    SCALARTYPE2,                                           \
+    SCALARTYPE3,                                           \
+    SCALARTYPE4,                                           \
+    SCALARTYPE5,                                           \
+    SCALARTYPE6,                                           \
+    ...)                                                   \
+  AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES(__VA_ARGS__) \
+  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)               \
+  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)               \
+  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)               \
+  AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__)               \
+  AT_DISPATCH_CASE(SCALARTYPE5, __VA_ARGS__)               \
+  AT_DISPATCH_CASE(SCALARTYPE6, __VA_ARGS__)
+
+#define AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND6(    \
+    SCALARTYPE1,                                        \
+    SCALARTYPE2,                                        \
+    SCALARTYPE3,                                        \
+    SCALARTYPE4,                                        \
+    SCALARTYPE5,                                        \
+    SCALARTYPE6,                                        \
+    TYPE,                                               \
+    NAME,                                               \
+    ...)                                                \
+  AT_DISPATCH_SWITCH(                                   \
+      TYPE,                                             \
+      NAME,                                             \
+      AT_DISPATCH_CASE_FLOATING_AND_COMPLEX_TYPES_AND6( \
+          SCALARTYPE1,                                  \
+          SCALARTYPE2,                                  \
+          SCALARTYPE3,                                  \
+          SCALARTYPE4,                                  \
+          SCALARTYPE5,                                  \
+          SCALARTYPE6,                                  \
+          __VA_ARGS__))
+
+#define AT_DISPATCH_CASE_INTEGRAL_TYPES(...)          \
+  AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) \
+  AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \
+  AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__)  \
+  AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__) \
+  AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__)
+
+#define AT_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__))
+
+#define AT_DISPATCH_CASE_INTEGRAL_TYPES_AND(SCALARTYPE, ...) \
+  AT_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__)               \
+  AT_DISPATCH_CASE(SCALARTYPE, __VA_ARGS__)
+
+#define AT_DISPATCH_INTEGRAL_TYPES_AND(SCALARTYPE, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                               \
+      TYPE,                                                         \
+      NAME,                                                         \
+      AT_DISPATCH_CASE_INTEGRAL_TYPES_AND(SCALARTYPE, __VA_ARGS__))
+
+#define AT_DISPATCH_CASE_ALL_TYPES(...)        \
+  AT_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__) \
+  AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)
+
+#define AT_DISPATCH_ALL_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_ALL_TYPES(__VA_ARGS__))
+
+#define AT_DISPATCH_CASE_QINT_TYPES(...)                      \
+  AT_DISPATCH_CASE_QINT(at::kQInt8, at::qint8, __VA_ARGS__)   \
+  AT_DISPATCH_CASE_QINT(at::kQUInt8, at::quint8, __VA_ARGS__) \
+  AT_DISPATCH_CASE_QINT(at::kQInt32, at::qint32, __VA_ARGS__)
+
+#define AT_DISPATCH_QINT_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_QINT_TYPES(__VA_ARGS__))
+
+#define AT_DISPATCH_CASE_QINT_TYPES_AND(SCALARTYPE, ...) \
+  AT_DISPATCH_CASE_QINT_TYPES(__VA_ARGS__)               \
+  AT_DISPATCH_CASE(SCALARTYPE, __VA_ARGS__)
+
+#define AT_DISPATCH_QINT_TYPES_AND(SCALARTYPE, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                           \
+      TYPE, NAME, AT_DISPATCH_CASE_QINT_TYPES_AND(SCALARTYPE, __VA_ARGS__))
+
+#define AT_DISPATCH_CASE_QINT_BYTE_TYPES(...)               \
+  AT_DISPATCH_CASE_QINT(at::kQInt8, at::qint8, __VA_ARGS__) \
+  AT_DISPATCH_CASE_QINT(at::kQUInt8, at::quint8, __VA_ARGS__)
+
+#define AT_DISPATCH_QINT_BYTE_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_QINT_BYTE_TYPES(__VA_ARGS__))
+
+#define AT_DISPATCH_CASE_QINT_AND_SUB_BYTE_TYPES(...)                     \
+  AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE(                                     \
+      at::kQInt8, at::qint8, CHAR_BIT, SCHAR_MIN, SCHAR_MAX, __VA_ARGS__) \
+  AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE(                                     \
+      at::kQUInt8, at::quint8, CHAR_BIT, 0, UCHAR_MAX, __VA_ARGS__)       \
+  AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE(                                     \
+      at::kQInt32,                                                        \
+      at::qint32,                                                         \
+      CHAR_BIT * sizeof(int),                                             \
+      INT_MIN,                                                            \
+      INT_MAX,                                                            \
+      __VA_ARGS__)                                                        \
+  AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE(                                     \
+      at::kQUInt4x2, at::quint4x2, 4, 0, 15, __VA_ARGS__)                 \
+  AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE(                                     \
+      at::kQUInt2x4, at::quint2x4, 2, 0, 3, __VA_ARGS__)
+
+#define AT_DISPATCH_QINT_AND_SUB_BYTE_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                        \
+      TYPE, NAME, AT_DISPATCH_CASE_QINT_AND_SUB_BYTE_TYPES(__VA_ARGS__))
+
+#define AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX(...) \
+  AT_DISPATCH_CASE_ALL_TYPES(__VA_ARGS__)           \
+  AT_DISPATCH_CASE_COMPLEX_TYPES(__VA_ARGS__)
+
+#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                      \
+      TYPE, NAME, AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX(__VA_ARGS__))
+
+#define AT_DISPATCH_CASE_ALL_TYPES_AND(SCALARTYPE, ...) \
+  AT_DISPATCH_CASE_ALL_TYPES(__VA_ARGS__)               \
+  AT_DISPATCH_CASE(SCALARTYPE, __VA_ARGS__)
+
+#define AT_DISPATCH_ALL_TYPES_AND(SCALARTYPE, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                          \
+      TYPE, NAME, AT_DISPATCH_CASE_ALL_TYPES_AND(SCALARTYPE, __VA_ARGS__))
+
+#define AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND(SCALARTYPE, ...) \
+  AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX(__VA_ARGS__)               \
+  AT_DISPATCH_CASE(SCALARTYPE, __VA_ARGS__)
+
+#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(SCALARTYPE, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                                      \
+      TYPE,                                                                \
+      NAME,                                                                \
+      AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND(SCALARTYPE, __VA_ARGS__))
+
+#define AT_DISPATCH_CASE_ALL_TYPES_AND2(SCALARTYPE1, SCALARTYPE2, ...) \
+  AT_DISPATCH_CASE_ALL_TYPES(__VA_ARGS__)                              \
+  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)                           \
+  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)
+
+#define AT_DISPATCH_ALL_TYPES_AND2(SCALARTYPE1, SCALARTYPE2, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                                         \
+      TYPE,                                                                   \
+      NAME,                                                                   \
+      AT_DISPATCH_CASE_ALL_TYPES_AND2(SCALARTYPE1, SCALARTYPE2, __VA_ARGS__))
+
+#define AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND2(  \
+    SCALARTYPE1, SCALARTYPE2, ...)                    \
+  AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX(__VA_ARGS__) \
+  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)          \
+  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)
+
+#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(    \
+    SCALARTYPE1, SCALARTYPE2, TYPE, NAME, ...)     \
+  AT_DISPATCH_SWITCH(                              \
+      TYPE,                                        \
+      NAME,                                        \
+      AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND2( \
+          SCALARTYPE1, SCALARTYPE2, __VA_ARGS__))
+
+#define AT_DISPATCH_CASE_ALL_TYPES_AND3(        \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, ...) \
+  AT_DISPATCH_CASE_ALL_TYPES(__VA_ARGS__)       \
+  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)    \
+  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)    \
+  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)
+
+#define AT_DISPATCH_ALL_TYPES_AND3(                         \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                       \
+      TYPE,                                                 \
+      NAME,                                                 \
+      AT_DISPATCH_CASE_ALL_TYPES_AND3(                      \
+          SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, __VA_ARGS__))
+
+#define AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND3(  \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, ...)       \
+  AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX(__VA_ARGS__) \
+  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)          \
+  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)          \
+  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)
+
+#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(             \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                       \
+      TYPE,                                                 \
+      NAME,                                                 \
+      AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND3(          \
+          SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, __VA_ARGS__))
+
+#define AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND4(         \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, ...) \
+  AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX(__VA_ARGS__)        \
+  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)                 \
+  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)                 \
+  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)                 \
+  AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__)
+
+#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(                          \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                                    \
+      TYPE,                                                              \
+      NAME,                                                              \
+      AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND4(                       \
+          SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, __VA_ARGS__))
+
+#define AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND5(                      \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, ...) \
+  AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX(__VA_ARGS__)                     \
+  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)                              \
+  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)                              \
+  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)                              \
+  AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__)                              \
+  AT_DISPATCH_CASE(SCALARTYPE5, __VA_ARGS__)
+
+#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND5(    \
+    SCALARTYPE1,                                   \
+    SCALARTYPE2,                                   \
+    SCALARTYPE3,                                   \
+    SCALARTYPE4,                                   \
+    SCALARTYPE5,                                   \
+    TYPE,                                          \
+    NAME,                                          \
+    ...)                                           \
+  AT_DISPATCH_SWITCH(                              \
+      TYPE,                                        \
+      NAME,                                        \
+      AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND5( \
+          SCALARTYPE1,                             \
+          SCALARTYPE2,                             \
+          SCALARTYPE3,                             \
+          SCALARTYPE4,                             \
+          SCALARTYPE5,                             \
+          __VA_ARGS__))
+
+#define AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND6(  \
+    SCALARTYPE1,                                      \
+    SCALARTYPE2,                                      \
+    SCALARTYPE3,                                      \
+    SCALARTYPE4,                                      \
+    SCALARTYPE5,                                      \
+    SCALARTYPE6,                                      \
+    ...)                                              \
+  AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX(__VA_ARGS__) \
+  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)          \
+  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)          \
+  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)          \
+  AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__)          \
+  AT_DISPATCH_CASE(SCALARTYPE5, __VA_ARGS__)          \
+  AT_DISPATCH_CASE(SCALARTYPE6, __VA_ARGS__)
+
+#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND6(    \
+    SCALARTYPE1,                                   \
+    SCALARTYPE2,                                   \
+    SCALARTYPE3,                                   \
+    SCALARTYPE4,                                   \
+    SCALARTYPE5,                                   \
+    SCALARTYPE6,                                   \
+    TYPE,                                          \
+    NAME,                                          \
+    ...)                                           \
+  AT_DISPATCH_SWITCH(                              \
+      TYPE,                                        \
+      NAME,                                        \
+      AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND6( \
+          SCALARTYPE1,                             \
+          SCALARTYPE2,                             \
+          SCALARTYPE3,                             \
+          SCALARTYPE4,                             \
+          SCALARTYPE5,                             \
+          SCALARTYPE6,                             \
+          __VA_ARGS__))
+
+#define AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND7(  \
+    SCALARTYPE1,                                      \
+    SCALARTYPE2,                                      \
+    SCALARTYPE3,                                      \
+    SCALARTYPE4,                                      \
+    SCALARTYPE5,                                      \
+    SCALARTYPE6,                                      \
+    SCALARTYPE7,                                      \
+    ...)                                              \
+  AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX(__VA_ARGS__) \
+  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)          \
+  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)          \
+  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)          \
+  AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__)          \
+  AT_DISPATCH_CASE(SCALARTYPE5, __VA_ARGS__)          \
+  AT_DISPATCH_CASE(SCALARTYPE6, __VA_ARGS__)          \
+  AT_DISPATCH_CASE(SCALARTYPE7, __VA_ARGS__)
+
+#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND7(    \
+    SCALARTYPE1,                                   \
+    SCALARTYPE2,                                   \
+    SCALARTYPE3,                                   \
+    SCALARTYPE4,                                   \
+    SCALARTYPE5,                                   \
+    SCALARTYPE6,                                   \
+    SCALARTYPE7,                                   \
+    TYPE,                                          \
+    NAME,                                          \
+    ...)                                           \
+  AT_DISPATCH_SWITCH(                              \
+      TYPE,                                        \
+      NAME,                                        \
+      AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND7( \
+          SCALARTYPE1,                             \
+          SCALARTYPE2,                             \
+          SCALARTYPE3,                             \
+          SCALARTYPE4,                             \
+          SCALARTYPE5,                             \
+          SCALARTYPE6,                             \
+          SCALARTYPE7,                             \
+          __VA_ARGS__))
+
+#define AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND8(  \
+    SCALARTYPE1,                                      \
+    SCALARTYPE2,                                      \
+    SCALARTYPE3,                                      \
+    SCALARTYPE4,                                      \
+    SCALARTYPE5,                                      \
+    SCALARTYPE6,                                      \
+    SCALARTYPE7,                                      \
+    SCALARTYPE8,                                      \
+    ...)                                              \
+  AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX(__VA_ARGS__) \
+  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)          \
+  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)          \
+  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)          \
+  AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__)          \
+  AT_DISPATCH_CASE(SCALARTYPE5, __VA_ARGS__)          \
+  AT_DISPATCH_CASE(SCALARTYPE6, __VA_ARGS__)          \
+  AT_DISPATCH_CASE(SCALARTYPE7, __VA_ARGS__)          \
+  AT_DISPATCH_CASE(SCALARTYPE8, __VA_ARGS__)
+
+#define AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND8(    \
+    SCALARTYPE1,                                   \
+    SCALARTYPE2,                                   \
+    SCALARTYPE3,                                   \
+    SCALARTYPE4,                                   \
+    SCALARTYPE5,                                   \
+    SCALARTYPE6,                                   \
+    SCALARTYPE7,                                   \
+    SCALARTYPE8,                                   \
+    TYPE,                                          \
+    NAME,                                          \
+    ...)                                           \
+  AT_DISPATCH_SWITCH(                              \
+      TYPE,                                        \
+      NAME,                                        \
+      AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND8( \
+          SCALARTYPE1,                             \
+          SCALARTYPE2,                             \
+          SCALARTYPE3,                             \
+          SCALARTYPE4,                             \
+          SCALARTYPE5,                             \
+          SCALARTYPE6,                             \
+          SCALARTYPE7,                             \
+          SCALARTYPE8,                             \
+          __VA_ARGS__))
+
+#define AT_DISPATCH_CASE_BIT_TYPES(...)                  \
+  AT_DISPATCH_CASE(at::ScalarType::Bits1x8, __VA_ARGS__) \
+  AT_DISPATCH_CASE(at::ScalarType::Bits2x4, __VA_ARGS__) \
+  AT_DISPATCH_CASE(at::ScalarType::Bits4x2, __VA_ARGS__) \
+  AT_DISPATCH_CASE(at::ScalarType::Bits8, __VA_ARGS__)   \
+  AT_DISPATCH_CASE(at::ScalarType::Bits16, __VA_ARGS__)
+
+#define AT_DISPATCH_BIT_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_BIT_TYPES(__VA_ARGS__))
+
+#define AT_DISPATCH_INDEX_TYPES(TYPE, NAME, ...)     \
+  AT_DISPATCH_SWITCH(                                \
+      TYPE,                                          \
+      NAME,                                          \
+      AT_PRIVATE_CASE_TYPE_USING_HINT(               \
+          at::ScalarType::Int, index_t, __VA_ARGS__) \
+          AT_PRIVATE_CASE_TYPE_USING_HINT(           \
+              at::ScalarType::Long, index_t, __VA_ARGS__))
+
+// ----------------------------------------------------------------------------
+// DEPRECATED MACROS, DON'T USE THESE
+// ----------------------------------------------------------------------------
+
+#define AT_DISPATCH_ALL_TYPES_AND_HALF(TYPE, NAME, ...) \
+  detail::deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF();  \
+  AT_DISPATCH_SWITCH(                                   \
+      TYPE,                                             \
+      NAME,                                             \
+      AT_DISPATCH_CASE_ALL_TYPES_AND(at::ScalarType::Half, __VA_ARGS__))
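For orientation, here is a minimal sketch of how the dispatch macros above are typically consumed by a CPU kernel. It is an illustrative example only, not part of the vendored header: the function name my_clamp_min and its debug string are hypothetical.

#include <ATen/Dispatch.h>
#include <ATen/core/Tensor.h>

// Hypothetical kernel: clamp every element of `self` to at least `bound`.
void my_clamp_min(at::Tensor& out, const at::Tensor& self, double bound) {
  AT_DISPATCH_FLOATING_TYPES_AND2(
      at::ScalarType::Half,
      at::ScalarType::BFloat16,
      self.scalar_type(),
      "my_clamp_min",
      [&] {
        // Inside the lambda, scalar_t is bound to the concrete C++ type
        // (float, double, at::Half or at::BFloat16) selected at runtime.
        const scalar_t* src = self.data_ptr<scalar_t>();
        scalar_t* dst = out.data_ptr<scalar_t>();
        const scalar_t lo = static_cast<scalar_t>(bound);
        for (int64_t i = 0; i < self.numel(); ++i) {
          dst[i] = src[i] < lo ? lo : src[i];
        }
      });
}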
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/Dispatch_v2.h b/MLPY/Lib/site-packages/torch/include/ATen/Dispatch_v2.h
new file mode 100644
index 0000000000000000000000000000000000000000..8960e2f4d96eb68632ecae60fdc94515963e984a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/Dispatch_v2.h
@@ -0,0 +1,186 @@
+#include <ATen/Dispatch.h>
+
+// This is a new implementation of the AT_DISPATCH macro family from
+// ATen/Dispatch.h
+//
+// The intended usage is:
+//
+//  ScalarType scalar_type;
+//
+//  AT_DISPATCH_V2(
+//    scalar_type,
+//    "debug string",
+//    AT_WRAP([&] {
+//      ... code to specialize with scalar_t ...
+//    }),
+//    kHalf,
+//    AT_EXPAND(AT_ALL_TYPES),
+//    ... as many types arguments as needed ...
+//  )
+//
+// For example, given an old style:
+//
+//  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(
+//    kComplexHalf,
+//    kHalf,
+//    self.scalar_type(),
+//    "_local_scalar_dense_cpu",
+//    [&] {
+//      scalar_t value = *self.data_ptr<scalar_t>();
+//      r = Scalar(value);
+//    }
+//  )
+//
+// You now write:
+//
+//  AT_DISPATCH_V2(
+//    self.scalar_type(),
+//    "_local_scalar_dense_cpu",
+//    AT_WRAP([&] {
+//      scalar_t value = *self.data_ptr<scalar_t>();
+//      r = Scalar(value);
+//    }),
+//    AT_EXPAND(AT_ALL_TYPES),
+//    AT_EXPAND(AT_COMPLEX_TYPES),
+//    kComplexHalf,
+//    kHalf,
+//  )
+//
+// Notably, it sports the following improvements:
+//
+//  - It is not necessary to specify the arity (e.g.,
+//    AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND{2,3,4,...})
+//    when using the macro
+//
+//  - It is not necessary to specify each dtype individually; if
+//    there is a set of related dtypes and you want to dispatch
+//    over all of them, you can simply say, e.g., AT_EXPAND(AT_INTEGRAL_TYPES)
+//    in your argument list.
+//
+// However, you must remember to wrap the payload body in AT_WRAP, or commas
+// inside your lambda will be improperly handled.  Furthermore, if you add more
+// entries to ScalarType than can be supported by this macro, it will fail
+// with an obscure error (due to attempting to concatenate AT_AP with
+// something that is not a number).
+//
+// The implementation strategy is to use the count arguments trick
+// (e.g., as described in https://stackoverflow.com/a/2124385/23845)
+// to discover how many dtypes have been passed, and then dispatch to a
+// hand-written macro for each arity that applies as many DISPATCH_CASE as
+// necessary.  The hand-written macros can be regenerated for other arities
+// with the script below.
+//
+// There is some delicacy in the implementation in controlling when
+// macro expansion occurs, mediated with AT_EXPAND and AT_GUARD.  I mostly
+// relied on GPT4 to help me get it right.
+
+// Public API macros
+
+// See documentation above
+#define AT_DISPATCH_V2(TYPE, NAME, BODY, ...) \
+  AT_DISPATCH_SWITCH(TYPE, NAME, AT_AP_VAR(AT_WRAP(BODY), TYPE, __VA_ARGS__))
+
+// This macro lets you pass an arbitrary expression that may contain internal
+// commas to another macro without having the commas causing the expression
+// to be interpreted as being multiple arguments
+#define AT_WRAP(...) __VA_ARGS__
+
+#define AT_FLOAT8_TYPES                                          \
+  c10::kFloat8_e5m2, c10::kFloat8_e5m2fnuz, c10::kFloat8_e4m3fn, \
+      c10::kFloat8_e4m3fnuz
+
+#define AT_INTEGRAL_TYPES \
+  c10::kByte, c10::kChar, c10::kInt, c10::kLong, c10::kShort
+#define AT_FLOATING_TYPES c10::kDouble, c10::kFloat
+#define AT_BAREBONES_UNSIGNED_TYPES c10::kUInt16, c10::kUInt32, c10::kUInt64
+#define AT_INTEGRAL_TYPES_V2 \
+  AT_EXPAND(AT_INTEGRAL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)
+#define AT_COMPLEX_TYPES c10::kComplexDouble, c10::kComplexFloat
+#define AT_QINT_TYPES c10::kQInt8, c10::kQUInt8, c10::kQInt32
+// NB: not *actually* all types
+#define AT_ALL_TYPES AT_EXPAND(AT_INTEGRAL_TYPES), AT_EXPAND(AT_FLOATING_TYPES)
+#define AT_ALL_TYPES_AND_COMPLEX \
+  AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_COMPLEX_TYPES)
+
+// Helper macros
+
+#define AT_AP_VAR(N, T, ...) \
+  AT_EXPAND(AT_CONCAT(AT_AP, AT_NUM_ARGS(__VA_ARGS__))(AT_WRAP(N), __VA_ARGS__))
+#define AT_CONCAT(a, b) AT_CONCAT_AUX(a, b)
+#define AT_CONCAT_AUX(a, b) a##b
+#define AT_EXPAND(X) X
+
+// Ensure we never have too many scalar types for the expansion here to
+// support.  To bump this, you must regenerate the macros below.
+static_assert(static_cast<int>(c10::ScalarType::NumOptions) < 45);
+
+// Python code to regenerate the generated code below:
+#if 0
+
+num_args = 45
+
+nums = ', '.join(str(i) for i in reversed(range(num_args+1)))
+args = ', '.join(f'_{i}' for i in range(1, num_args+1))
+
+print(f'#define AT_NUM_ARGS(...) AT_EXPAND(AT_NUM_ARGS_AUX(__VA_ARGS__, {nums}))')
+print(f'#define AT_NUM_ARGS_AUX({args}, N, ...) N')
+
+for i in range(1, num_args+1):
+    args = ', '.join(f'_{i}' for i in range(1, i+1))
+    cases = ' '.join([f'AT_DISPATCH_CASE(_{j}, N)' for j in range(1, i+1)])
+    print(f'#define AT_AP{i}(N, {args}) {cases}')
+
+#endif
+
+// Begin generated code
+// clang-format off
+
+#define AT_NUM_ARGS(...) AT_EXPAND(AT_NUM_ARGS_AUX(__VA_ARGS__, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0))
+#define AT_NUM_ARGS_AUX(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, N, ...) N
+#define AT_AP1(N, _1) AT_DISPATCH_CASE(_1, N)
+#define AT_AP2(N, _1, _2) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N)
+#define AT_AP3(N, _1, _2, _3) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N)
+#define AT_AP4(N, _1, _2, _3, _4) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N)
+#define AT_AP5(N, _1, _2, _3, _4, _5) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N)
+#define AT_AP6(N, _1, _2, _3, _4, _5, _6) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N)
+#define AT_AP7(N, _1, _2, _3, _4, _5, _6, _7) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N)
+#define AT_AP8(N, _1, _2, _3, _4, _5, _6, _7, _8) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N)
+#define AT_AP9(N, _1, _2, _3, _4, _5, _6, _7, _8, _9) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N)
+#define AT_AP10(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N)
+#define AT_AP11(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N)
+#define AT_AP12(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N)
+#define AT_AP13(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N)
+#define AT_AP14(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N)
+#define AT_AP15(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N)
+#define AT_AP16(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N)
+#define AT_AP17(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N)
+#define AT_AP18(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N)
+#define AT_AP19(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N)
+#define AT_AP20(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N)
+#define AT_AP21(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N)
+#define AT_AP22(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N)
+#define AT_AP23(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N)
+#define AT_AP24(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N)
+#define AT_AP25(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N)
+#define AT_AP26(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N)
+#define AT_AP27(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N)
+#define AT_AP28(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N)
+#define AT_AP29(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N)
+#define AT_AP30(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N)
+#define AT_AP31(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N)
+#define AT_AP32(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N)
+#define AT_AP33(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N)
+#define AT_AP34(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N)
+#define AT_AP35(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N)
+#define AT_AP36(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N)
+#define AT_AP37(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N)
+#define AT_AP38(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N)
+#define AT_AP39(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N)
+#define AT_AP40(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N)
+#define AT_AP41(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N)
+#define AT_AP42(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N)
+#define AT_AP43(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N)
+#define AT_AP44(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N) AT_DISPATCH_CASE(_44, N)
+#define AT_AP45(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N) AT_DISPATCH_CASE(_44, N) AT_DISPATCH_CASE(_45, N)
+// End generated code
+// clang-format on
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/DynamicLibrary.h b/MLPY/Lib/site-packages/torch/include/ATen/DynamicLibrary.h
new file mode 100644
index 0000000000000000000000000000000000000000..5e8a2b6d4c10efe0804210e7ae7fc45549a9d166
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/DynamicLibrary.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+namespace c10 {
+
+class DynamicLibraryError : public Error {
+  using Error::Error;
+};
+
+} // namespace c10
+
+namespace at {
+
+struct DynamicLibrary {
+  AT_DISALLOW_COPY_AND_ASSIGN(DynamicLibrary);
+
+  TORCH_API DynamicLibrary(
+      const char* name,
+      const char* alt_name = nullptr,
+      bool leak_handle = false);
+
+  TORCH_API void* sym(const char* name);
+
+  TORCH_API ~DynamicLibrary();
+
+ private:
+  bool leak_handle;
+  void* handle = nullptr;
+};
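+
+// Illustrative usage sketch (the library and symbol names are made up; sym()
+// returns a raw pointer that the caller casts to the expected type):
+//
+//   at::DynamicLibrary lib("libfoo.so");
+//   using add_one_fn = int (*)(int);
+//   auto add_one = reinterpret_cast<add_one_fn>(lib.sym("foo_add_one"));
+//   int y = add_one(41);  // handle is released when `lib` is destroyed,
+//                         // unless leak_handle was set to true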
+
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/EmptyTensor.h b/MLPY/Lib/site-packages/torch/include/ATen/EmptyTensor.h
new file mode 100644
index 0000000000000000000000000000000000000000..2a380a34b965e347a15c6cfc4a6d25aa9a62e773
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/EmptyTensor.h
@@ -0,0 +1,160 @@
+#pragma once
+#include 
+
+namespace at::detail {
+
+inline void check_size_nonnegative(ArrayRef<int64_t> size) {
+  for (const auto& x : size) {
+    TORCH_CHECK(
+        x >= 0,
+        "Trying to create tensor with negative dimension ",
+        x,
+        ": ",
+        size);
+  }
+}
+
+inline void check_size_nonnegative(ArrayRef<c10::SymInt> size) {
+  for (const auto& x : size) {
+    TORCH_CHECK(
+        x.expect_size(__FILE__, __LINE__),
+        "Trying to create tensor with negative dimension ",
+        x,
+        ": ",
+        size);
+  }
+}
+
+TORCH_API size_t computeStorageNbytesContiguous(
+    IntArrayRef sizes,
+    size_t itemsize,
+    size_t storage_offset = 0);
+TORCH_API SymInt computeStorageNbytesContiguous(
+    SymIntArrayRef sizes,
+    const SymInt& itemsize,
+    const SymInt& storage_offset = 0);
+TORCH_API size_t computeStorageNbytes(
+    IntArrayRef sizes,
+    IntArrayRef strides,
+    size_t itemsize,
+    size_t storage_offset = 0);
+TORCH_API SymInt computeStorageNbytes(
+    SymIntArrayRef sizes,
+    SymIntArrayRef strides,
+    const SymInt& itemsize,
+    const SymInt& storage_offset = 0);
+
+TORCH_API TensorBase empty_generic(
+    IntArrayRef size,
+    c10::Allocator* allocator,
+    c10::DispatchKeySet ks,
+    ScalarType scalar_type,
+    c10::optional memory_format_opt);
+
+TORCH_API TensorBase empty_strided_generic(
+    IntArrayRef size,
+    IntArrayRef stride,
+    c10::Allocator* allocator,
+    c10::DispatchKeySet ks,
+    ScalarType scalar_type);
+
+TORCH_API TensorBase empty_strided_symint_generic(
+    SymIntArrayRef size,
+    SymIntArrayRef stride,
+    c10::Allocator* allocator,
+    c10::DispatchKeySet ks,
+    ScalarType scalar_type);
+
+TORCH_API TensorBase empty_cpu(
+    IntArrayRef size,
+    ScalarType dtype,
+    bool pin_memory = false,
+    c10::optional memory_format_opt = c10::nullopt);
+
+TORCH_API TensorBase empty_cpu(
+    IntArrayRef size,
+    c10::optional dtype_opt,
+    c10::optional layout_opt,
+    c10::optional device_opt,
+    c10::optional pin_memory_opt,
+    c10::optional memory_format_opt);
+
+TORCH_API TensorBase empty_cpu(IntArrayRef size, const TensorOptions& options);
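+
+// Illustrative call (internal factory, not the public at::empty API; the
+// shape and dtype below are just examples):
+//
+//   at::TensorBase t = at::detail::empty_cpu({2, 3}, at::kFloat);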
+
+TORCH_API TensorBase empty_strided_cpu(
+    IntArrayRef size,
+    IntArrayRef stride,
+    ScalarType dtype,
+    bool pin_memory = false);
+
+TORCH_API TensorBase empty_strided_cpu(
+    IntArrayRef size,
+    IntArrayRef stride,
+    c10::optional dtype_opt,
+    c10::optional layout_opt,
+    c10::optional device_opt,
+    c10::optional pin_memory_opt);
+
+TORCH_API TensorBase empty_strided_cpu(
+    IntArrayRef size,
+    IntArrayRef stride,
+    const TensorOptions& options);
+
+TORCH_API TensorBase empty_meta(
+    IntArrayRef size,
+    ScalarType dtype,
+    c10::optional memory_format_opt = c10::nullopt);
+
+TORCH_API TensorBase empty_meta(
+    IntArrayRef size,
+    c10::optional dtype_opt,
+    c10::optional layout_opt,
+    c10::optional device_opt,
+    c10::optional pin_memory_opt,
+    c10::optional memory_format_opt);
+
+TORCH_API TensorBase empty_symint_meta(
+    SymIntArrayRef size,
+    c10::optional dtype_opt,
+    c10::optional layout_opt,
+    c10::optional device_opt,
+    c10::optional pin_memory_opt,
+    c10::optional memory_format_opt);
+
+TORCH_API TensorBase empty_meta(IntArrayRef size, const TensorOptions& options);
+
+TORCH_API TensorBase
+empty_strided_meta(IntArrayRef size, IntArrayRef stride, ScalarType dtype);
+
+TORCH_API TensorBase empty_strided_meta(
+    IntArrayRef size,
+    IntArrayRef stride,
+    c10::optional dtype_opt,
+    c10::optional layout_opt,
+    c10::optional device_opt,
+    c10::optional pin_memory_opt);
+
+TORCH_API TensorBase empty_strided_meta(
+    IntArrayRef size,
+    IntArrayRef stride,
+    const TensorOptions& options);
+
+TORCH_API TensorBase empty_strided_symint_meta(
+    SymIntArrayRef size,
+    SymIntArrayRef stride,
+    ScalarType dtype);
+
+TORCH_API TensorBase empty_strided_symint_meta(
+    SymIntArrayRef size,
+    SymIntArrayRef stride,
+    c10::optional dtype_opt,
+    c10::optional layout_opt,
+    c10::optional device_opt,
+    c10::optional pin_memory_opt);
+
+TORCH_API TensorBase empty_strided_symint_meta(
+    SymIntArrayRef size,
+    SymIntArrayRef stride,
+    const TensorOptions& options);
+
+} // namespace at::detail
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ExpandBase.h b/MLPY/Lib/site-packages/torch/include/ATen/ExpandBase.h
new file mode 100644
index 0000000000000000000000000000000000000000..d59a2714455873cf776242bd04157130911c8b28
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/ExpandBase.h
@@ -0,0 +1,30 @@
+#include 
+
+// Broadcasting utilities for working with TensorBase
+namespace at {
+namespace internal {
+TORCH_API TensorBase expand_slow_path(const TensorBase& self, IntArrayRef size);
+} // namespace internal
+
+inline c10::MaybeOwned expand_size(
+    const TensorBase& self,
+    IntArrayRef size) {
+  if (size.equals(self.sizes())) {
+    return c10::MaybeOwned::borrowed(self);
+  }
+  return c10::MaybeOwned::owned(
+      at::internal::expand_slow_path(self, size));
+}
+c10::MaybeOwned expand_size(TensorBase&& self, IntArrayRef size) =
+    delete;
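+
+// Illustrative use of expand_size (the tensor `t` and target sizes are
+// hypothetical): when `t` already has sizes {2, 3} the result borrows it,
+// otherwise it owns the expanded view. The deleted rvalue overload above
+// keeps callers from passing a temporary whose expansion would dangle.
+//
+//   c10::MaybeOwned<TensorBase> e = at::expand_size(t, {2, 3});
+//   const TensorBase& v = *e;  // valid only while `t` is alive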
+
+inline c10::MaybeOwned expand_inplace(
+    const TensorBase& tensor,
+    const TensorBase& to_expand) {
+  return expand_size(to_expand, tensor.sizes());
+}
+c10::MaybeOwned expand_inplace(
+    const TensorBase& tensor,
+    TensorBase&& to_expand) = delete;
+
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ExpandUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/ExpandUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..04710d2796a14d16ead11b0b4d3fae7925f9ab2d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/ExpandUtils.h
@@ -0,0 +1,527 @@
+#pragma once
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include 
+#else
+#include 
+#include 
+#endif
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+
+TORCH_API std::vector infer_size(IntArrayRef a, IntArrayRef b);
+TORCH_API std::vector infer_size_symint(
+    SymIntArrayRef a,
+    SymIntArrayRef b);
+TORCH_API DimVector infer_size_dimvector(IntArrayRef a, IntArrayRef b);
+TORCH_API SymDimVector
+infer_size_symdimvector(SymIntArrayRef a, SymIntArrayRef b);
+
+// Named type instead of a pair/tuple so that we can be sure to
+// construct the vectors in place and get NRVO.
+template 
+struct InferExpandGeometryResult {
+  Container sizes;
+  Container strides;
+  explicit InferExpandGeometryResult(size_t ndim)
+      : sizes(ndim), strides(ndim) {}
+  explicit InferExpandGeometryResult(IntArrayRef sizes_, size_t ndim)
+      : sizes(sizes_.begin(), sizes_.end()), strides(ndim) {}
+};
+
+TORCH_API std::tuple, std::vector>
+inferExpandGeometry(
+    IntArrayRef tensor_sizes,
+    IntArrayRef tensor_strides,
+    IntArrayRef sizes);
+
+TORCH_API InferExpandGeometryResult inferExpandGeometry_dimvector(
+    IntArrayRef tensor_sizes,
+    IntArrayRef tensor_strides,
+    IntArrayRef sizes);
+
+TORCH_API std::vector infer_dense_strides(
+    IntArrayRef tensor_sizes,
+    IntArrayRef tensor_strides);
+
+// True if input shapes are expandable
+// NOTE: infer_size does a similar check; please keep them in sync if a change
+// is needed
+inline bool are_expandable(IntArrayRef shape1, IntArrayRef shape2) {
+  size_t ndim1 = shape1.size();
+  size_t ndim2 = shape2.size();
+  size_t ndim = ndim1 < ndim2 ? ndim1 : ndim2;
+
+  for (int64_t i = static_cast<int64_t>(ndim) - 1; i >= 0; --i) {
+    if (shape1[--ndim1] == shape2[--ndim2] || shape1[ndim1] == 1 ||
+        shape2[ndim2] == 1) {
+      continue;
+    }
+    return false;
+  }
+  return true;
+}
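+
+// For example (sizes compared from the trailing dimension, as in broadcasting):
+//   are_expandable({3, 1, 4}, {2, 4})  -> true,  the shapes broadcast to {3, 2, 4}
+//   are_expandable({3, 2, 4}, {3, 4})  -> false, 2 vs 3 and neither is 1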
+
+// avoid copy-construction of Tensor by using a reference_wrapper.
+inline void check_defined(
+    std::initializer_list> tensors,
+    const char* api_name) {
+  for (auto& t : tensors) {
+    if (!t.get().defined()) {
+      AT_ERROR(api_name, "(...) called with an undefined Tensor");
+    }
+  }
+}
+
+// NOTE [ ExpandUtils Borrowing ]
+//
+// Functions in ExpandUtils return `c10::MaybeOwned` because
+// expansion may not actually be needed, in which case we can improve
+// efficiency by returning
+// `c10::MaybeOwned::borrowed(to_expand)`. However, this means
+// that you need to be careful: the returned `c10::MaybeOwned`
+// must not outlive the original `Tensor` object that `to_expand`
+// referred to! The deleted rvalue reference overloads of these
+// functions help with this by preventing trivial use of a temporary
+// resulting from a function call, but it is still possible to make a
+// mistake.
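+//
+// A minimal sketch of the intended pattern (tensor names are hypothetical):
+//
+//   at::Tensor a = ..., b = ...;            // both must stay alive below
+//   auto [ea, eb] = at::expand_outplace(a, b, "add");
+//   at::Tensor out = ea->add(*eb);          // MaybeOwned dereferences like a pointer
+//
+// Passing a temporary (e.g. the result of `b.clone()` not bound to a name)
+// is rejected at compile time by the deleted rvalue-reference overloads.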
+
+inline c10::MaybeOwned expand_inplace(
+    const Tensor& tensor,
+    const Tensor& to_expand) {
+  if (tensor.sym_sizes().equals(to_expand.sym_sizes())) {
+    return c10::MaybeOwned::borrowed(to_expand);
+  }
+  return c10::MaybeOwned::owned(
+      to_expand.expand_symint(tensor.sym_sizes()));
+}
+
+inline c10::MaybeOwned expand_inplace(
+    const Tensor& tensor,
+    Tensor&& to_expand) = delete;
+
+inline c10::MaybeOwned expand_inplace(
+    const Tensor& tensor,
+    const Tensor& to_expand,
+    const char* api_name) {
+  check_defined({tensor, to_expand}, api_name);
+  return expand_inplace(tensor, to_expand);
+}
+
+inline c10::MaybeOwned expand_inplace(
+    const Tensor& tensor,
+    Tensor&& to_expand,
+    const char* api_name) = delete;
+
+inline std::tuple, c10::MaybeOwned>
+expand_inplace(
+    const Tensor& tensor,
+    const Tensor& to_expand1,
+    const Tensor& to_expand2) {
+  if (tensor.sizes().equals(to_expand1.sizes()) &&
+      tensor.sizes().equals((to_expand2.sizes()))) {
+    return std::make_tuple(
+        c10::MaybeOwned::borrowed(to_expand1),
+        c10::MaybeOwned::borrowed(to_expand2));
+  }
+
+  return std::make_tuple(
+      c10::MaybeOwned::owned(to_expand1.expand(tensor.sizes())),
+      c10::MaybeOwned::owned(to_expand2.expand(tensor.sizes())));
+}
+
+inline std::tuple, c10::MaybeOwned>
+expand_inplace(
+    const Tensor& tensor,
+    Tensor&& to_expand1,
+    const Tensor& to_expand2) = delete;
+inline std::tuple, c10::MaybeOwned>
+expand_inplace(
+    const Tensor& tensor,
+    const Tensor& to_expand1,
+    Tensor&& to_expand2) = delete;
+inline std::tuple, c10::MaybeOwned>
+expand_inplace(const Tensor& tensor, Tensor&& to_expand1, Tensor&& to_expand2) =
+    delete;
+
+inline std::tuple, c10::MaybeOwned>
+expand_inplace(
+    const Tensor& tensor,
+    const Tensor& to_expand1,
+    const Tensor& to_expand2,
+    const char* api_name) {
+  check_defined({tensor, to_expand1, to_expand2}, api_name);
+  return expand_inplace(tensor, to_expand1, to_expand2);
+}
+
+inline std::tuple, c10::MaybeOwned>
+expand_inplace(
+    const Tensor& tensor,
+    Tensor&& to_expand1,
+    const Tensor& to_expand2,
+    const char* api_name) = delete;
+inline std::tuple, c10::MaybeOwned>
+expand_inplace(
+    const Tensor& tensor,
+    const Tensor& to_expand1,
+    Tensor&& to_expand2,
+    const char* api_name) = delete;
+inline std::tuple, c10::MaybeOwned>
+expand_inplace(
+    const Tensor& tensor,
+    Tensor&& to_expand1,
+    Tensor&& to_expand2,
+    const char* api_name) = delete;
+
+// See NOTE [ ExpandUtils Borrowing ] above for `MaybeOwned` explanation.
+inline std::tuple, c10::MaybeOwned>
+expand_outplace(const Tensor& to_expand1, const Tensor& to_expand2) {
+  auto s1 = to_expand1.sym_sizes();
+  auto s2 = to_expand2.sym_sizes();
+  if (s1.equals(s2)) {
+    return std::make_tuple(
+        c10::MaybeOwned::borrowed(to_expand1),
+        c10::MaybeOwned::borrowed(to_expand2));
+  }
+
+  auto expanded_size = infer_size_symdimvector(s1, s2);
+  return std::make_tuple(
+      c10::MaybeOwned::owned(to_expand1.expand_symint(expanded_size)),
+      c10::MaybeOwned::owned(to_expand2.expand_symint(expanded_size)));
+}
+
+inline std::tuple, c10::MaybeOwned>
+expand_outplace(Tensor&& to_expand1, const Tensor& to_expand2) = delete;
+inline std::tuple, c10::MaybeOwned>
+expand_outplace(const Tensor& to_expand1, Tensor&& to_expand2) = delete;
+inline std::tuple, c10::MaybeOwned>
+expand_outplace(Tensor&& to_expand1, Tensor&& to_expand2) = delete;
+
+inline std::tuple, c10::MaybeOwned>
+expand_outplace(
+    const Tensor& to_expand1,
+    const Tensor& to_expand2,
+    const char* api_name) {
+  check_defined({to_expand1, to_expand2}, api_name);
+  return expand_outplace(to_expand1, to_expand2);
+}
+
+inline std::tuple, c10::MaybeOwned>
+expand_outplace(
+    Tensor&& to_expand1,
+    const Tensor& to_expand2,
+    const char* api_name) = delete;
+inline std::tuple, c10::MaybeOwned>
+expand_outplace(
+    const Tensor& to_expand1,
+    Tensor&& to_expand2,
+    const char* api_name) = delete;
+inline std::tuple, c10::MaybeOwned>
+expand_outplace(
+    Tensor&& to_expand1,
+    Tensor&& to_expand2,
+    const char* api_name) = delete;
+
+inline std::tuple<
+    c10::MaybeOwned,
+    c10::MaybeOwned,
+    c10::MaybeOwned>
+expand_outplace(
+    const Tensor& to_expand1,
+    const Tensor& to_expand2,
+    const Tensor& to_expand3) {
+  if (to_expand1.sizes().equals(to_expand2.sizes()) &&
+      to_expand1.sizes().equals(to_expand3.sizes())) {
+    return std::make_tuple(
+        c10::MaybeOwned::borrowed(to_expand1),
+        c10::MaybeOwned::borrowed(to_expand2),
+        c10::MaybeOwned::borrowed(to_expand3));
+  }
+
+  auto expanded_size12 =
+      infer_size_dimvector(to_expand1.sizes(), to_expand2.sizes());
+  auto expanded_size =
+      infer_size_dimvector(expanded_size12, to_expand3.sizes());
+  return std::make_tuple(
+      c10::MaybeOwned::owned(to_expand1.expand(expanded_size)),
+      c10::MaybeOwned::owned(to_expand2.expand(expanded_size)),
+      c10::MaybeOwned::owned(to_expand3.expand(expanded_size)));
+}
+
+inline std::tuple<
+    c10::MaybeOwned,
+    c10::MaybeOwned,
+    c10::MaybeOwned>
+expand_outplace(
+    Tensor&& to_expand1,
+    const Tensor& to_expand2,
+    const Tensor& to_expand3) = delete;
+inline std::tuple<
+    c10::MaybeOwned,
+    c10::MaybeOwned,
+    c10::MaybeOwned>
+expand_outplace(
+    const Tensor& to_expand1,
+    Tensor&& to_expand2,
+    const Tensor& to_expand3) = delete;
+inline std::tuple<
+    c10::MaybeOwned,
+    c10::MaybeOwned,
+    c10::MaybeOwned>
+expand_outplace(
+    Tensor&& to_expand1,
+    Tensor&& to_expand2,
+    const Tensor& to_expand3) = delete;
+inline std::tuple<
+    c10::MaybeOwned,
+    c10::MaybeOwned,
+    c10::MaybeOwned>
+expand_outplace(
+    const Tensor& to_expand1,
+    const Tensor& to_expand2,
+    Tensor&& to_expand3) = delete;
+inline std::tuple<
+    c10::MaybeOwned,
+    c10::MaybeOwned,
+    c10::MaybeOwned>
+expand_outplace(
+    Tensor&& to_expand1,
+    const Tensor& to_expand2,
+    Tensor&& to_expand3) = delete;
+inline std::tuple<
+    c10::MaybeOwned,
+    c10::MaybeOwned,
+    c10::MaybeOwned>
+expand_outplace(
+    const Tensor& to_expand1,
+    Tensor&& to_expand2,
+    Tensor&& to_expand3) = delete;
+inline std::tuple<
+    c10::MaybeOwned,
+    c10::MaybeOwned,
+    c10::MaybeOwned>
+expand_outplace(Tensor&& to_expand1, Tensor&& to_expand2, Tensor&& to_expand3) =
+    delete;
+
+inline std::tuple<
+    c10::MaybeOwned,
+    c10::MaybeOwned,
+    c10::MaybeOwned>
+expand_outplace(
+    const Tensor& to_expand1,
+    const Tensor& to_expand2,
+    const Tensor& to_expand3,
+    const char* api_name) {
+  check_defined({to_expand1, to_expand2, to_expand3}, api_name);
+  return expand_outplace(to_expand1, to_expand2, to_expand3);
+}
+
+inline std::tuple<
+    c10::MaybeOwned,
+    c10::MaybeOwned,
+    c10::MaybeOwned>
+expand_outplace(
+    Tensor&& to_expand1,
+    const Tensor& to_expand2,
+    const Tensor& to_expand3,
+    const char* api_name) = delete;
+inline std::tuple<
+    c10::MaybeOwned,
+    c10::MaybeOwned,
+    c10::MaybeOwned>
+expand_outplace(
+    const Tensor& to_expand1,
+    Tensor&& to_expand2,
+    const Tensor& to_expand3,
+    const char* api_name) = delete;
+inline std::tuple<
+    c10::MaybeOwned,
+    c10::MaybeOwned,
+    c10::MaybeOwned>
+expand_outplace(
+    Tensor&& to_expand1,
+    Tensor&& to_expand2,
+    const Tensor& to_expand3,
+    const char* api_name) = delete;
+inline std::tuple<
+    c10::MaybeOwned,
+    c10::MaybeOwned,
+    c10::MaybeOwned>
+expand_outplace(
+    const Tensor& to_expand1,
+    const Tensor& to_expand2,
+    Tensor&& to_expand3,
+    const char* api_name) = delete;
+inline std::tuple<
+    c10::MaybeOwned,
+    c10::MaybeOwned,
+    c10::MaybeOwned>
+expand_outplace(
+    Tensor&& to_expand1,
+    const Tensor& to_expand2,
+    Tensor&& to_expand3,
+    const char* api_name) = delete;
+inline std::tuple<
+    c10::MaybeOwned,
+    c10::MaybeOwned,
+    c10::MaybeOwned>
+expand_outplace(
+    const Tensor& to_expand1,
+    Tensor&& to_expand2,
+    Tensor&& to_expand3,
+    const char* api_name) = delete;
+inline std::tuple<
+    c10::MaybeOwned,
+    c10::MaybeOwned,
+    c10::MaybeOwned>
+expand_outplace(
+    Tensor&& to_expand1,
+    Tensor&& to_expand2,
+    Tensor&& to_expand3,
+    const char* api_name) = delete;
+
+inline c10::MaybeOwned expand_size(
+    const Tensor& to_expand,
+    IntArrayRef sizes) {
+  if (to_expand.sizes().equals(sizes)) {
+    return c10::MaybeOwned::borrowed(to_expand);
+  }
+
+  return c10::MaybeOwned::owned(to_expand.expand(sizes));
+}
+
+inline c10::MaybeOwned expand_size(
+    Tensor&& to_expand,
+    IntArrayRef sizes) = delete;
+
+inline c10::MaybeOwned expand_size(
+    const Tensor& to_expand,
+    IntArrayRef sizes,
+    const char* api_name) {
+  check_defined({to_expand}, api_name);
+  return expand_size(to_expand, sizes);
+}
+
+inline c10::MaybeOwned expand_size(
+    Tensor&& to_expand,
+    IntArrayRef sizes,
+    const char* api_name) = delete;
+
+inline std::vector expand_outplace(TensorList to_expand) {
+  // expands a list of Tensors; ignores undefined (null) tensors
+  bool first = true;
+  DimVector sizes;
+  for (const auto i : c10::irange(to_expand.size())) {
+    if (!to_expand[i].defined()) {
+      continue;
+    } else if (first) {
+      sizes = to_expand[i].sizes();
+      first = false;
+    } else {
+      sizes = infer_size_dimvector(sizes, to_expand[i].sizes());
+    }
+  }
+
+  std::vector result(to_expand.size());
+  for (const auto i : c10::irange(to_expand.size())) {
+    if (!to_expand[i].defined()) {
+      continue;
+    } else if (to_expand[i].sizes().equals(sizes)) {
+      result[i] = to_expand[i];
+    } else {
+      result[i] = to_expand[i].expand(sizes);
+    }
+  }
+  return result;
+}
+
+template 
+inline Tensor _sum_to(
+    Tensor tensor,
+    const c10::ArrayRef shape,
+    bool always_return_non_view = false) {
+  if (shape.size() == 0) {
+    return tensor.sum();
+  }
+
+  auto sizes = at::symint::sizes(tensor);
+  c10::SmallVector reduce_dims;
+  const int64_t leading_dims = sizes.size() - shape.size();
+  for (const auto i : c10::irange(leading_dims)) {
+    reduce_dims.push_back(i);
+  }
+  for (int64_t i = leading_dims; i < static_cast<int64_t>(sizes.size()); ++i) {
+    if (shape[i - leading_dims] == 1 && sizes[i] != 1) {
+      reduce_dims.push_back(i);
+    }
+  }
+
+  if (!reduce_dims.empty()) {
+    tensor = tensor.sum(reduce_dims, /*keepdim=*/true);
+  }
+
+  if (always_return_non_view) {
+    // This is only actually used by the functionalization pass.
+    // We want to be able to guarantee that this function doesn't return a view
+    // of the input.
+    return leading_dims > 0 ? at::symint::view_copy(tensor, shape)
+                            : tensor.clone();
+  } else {
+    return leading_dims > 0 ? at::symint::view(tensor, shape) : tensor;
+  }
+}
+
+inline Tensor sum_to(
+    Tensor tensor,
+    const c10::SymIntArrayRef shape,
+    bool always_return_non_view = false) {
+  return _sum_to(std::move(tensor), shape, always_return_non_view);
+}
+
+// Sums `tensor` repeatedly to produce a tensor of shape `shape`.
+// Precondition: is_expandable_to(shape, tensor.sizes()) must be true
+inline Tensor sum_to(
+    Tensor tensor,
+    const IntArrayRef shape,
+    bool always_return_non_view = false) {
+  return _sum_to(std::move(tensor), shape, always_return_non_view);
+}
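+
+// Worked example: summing a tensor of shape {2, 3, 4} down to {3, 1} reduces
+// over dims {0, 2} with keepdim=true (giving {1, 3, 1}) and then views the
+// result as {3, 1}.
+//
+//   at::Tensor big = at::rand({2, 3, 4});
+//   at::Tensor small = at::sum_to(big, at::IntArrayRef{3, 1});  // sizes {3, 1}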
+
+static inline bool is_expandable_to(
+    SymIntArrayRef shape,
+    c10::SymIntArrayRef desired) {
+  size_t ndim = shape.size();
+  size_t target_dim = desired.size();
+  if (ndim > target_dim) {
+    return false;
+  }
+  for (const auto i : c10::irange(ndim)) {
+    const auto& size = shape[ndim - i - 1];
+    const auto& target = desired[target_dim - i - 1];
+    if (size != target && size != 1) {
+      return false;
+    }
+  }
+  return true;
+}
+
+static inline bool is_expandable_to(IntArrayRef shape, IntArrayRef desired) {
+  auto sym_shape = c10::SymIntArrayRef(
+      reinterpret_cast<const c10::SymInt*>(shape.data()), shape.size());
+  auto sym_desired = c10::SymIntArrayRef(
+      reinterpret_cast<const c10::SymInt*>(desired.data()), desired.size());
+  return is_expandable_to(sym_shape, sym_desired);
+}
+
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/Formatting.h b/MLPY/Lib/site-packages/torch/include/ATen/Formatting.h
new file mode 100644
index 0000000000000000000000000000000000000000..e23b27ffd373180a1857a5491694eff11705f9a1
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/Formatting.h
@@ -0,0 +1 @@
+#include 
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/FuncTorchTLS.h b/MLPY/Lib/site-packages/torch/include/ATen/FuncTorchTLS.h
new file mode 100644
index 0000000000000000000000000000000000000000..6430caadfa947f57f76f1d7e218b4b4d60140f8b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/FuncTorchTLS.h
@@ -0,0 +1,46 @@
+#pragma once
+
+#include 
+#include 
+
+namespace at::functorch {
+
+// NOTE [functorch TLS in pytorch/pytorch]
+//
+// functorch lives out-of-tree. However, it has some TLS that needs to be
+// propagated. The solution for that is we store a pointer to the TLS
+// inside pytorch/pytorch and extend FuncTorchTLSBase inside functorch to
+// include whatever functorch needs.
+//
+// We need to store a pointer due to the indirection:
+// inside functorch, we will create a subclass of FunctorchTLSBase called
+// FuncTorchTLSImpl that actually contains metadata, like the DynamicLayerStack.
+// FuncTorchTLSBase doesn't have any metadata because it hasn't been defined
+// yet.
+//
+// Here in pytorch/pytorch, we will pass around FuncTorchTLSBase*, but inside
+// functorch, we will assign a FuncTorchTLSImpl* to the FunctorchTLSBase*.
+// We can't directly pass around FunctorchTLSBase (without a pointer) because
+// FuncTorchTLSImpl does not fit inside a FuncTorchTLSBase by virtue of having
+// more elements.
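+//
+// A hypothetical sketch of such a subclass on the functorch side (the name
+// FuncTorchTLSImpl and its member are illustrative, not the real
+// implementation):
+//
+//   struct FuncTorchTLSImpl : public FuncTorchTLSBase {
+//     std::unique_ptr<FuncTorchTLSBase> deepcopy() const override {
+//       return std::make_unique<FuncTorchTLSImpl>(*this);
+//     }
+//     int64_t checkSupportsSingleLevelAutogradFunction() const override { return 0; }
+//     void checkSupportsCppAutogradFunction() const override {}
+//     void checkSupportsInplaceRequiresGrad() const override {}
+//     void checkSupportsRetainGrad() const override {}
+//     std::vector<int64_t> dynamic_layer_stack;  // illustrative metadata
+//   };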
+struct TORCH_API FuncTorchTLSBase {
+  virtual ~FuncTorchTLSBase() = default;
+  virtual std::unique_ptr deepcopy() const = 0;
+
+  virtual int64_t checkSupportsSingleLevelAutogradFunction() const = 0;
+  virtual void checkSupportsCppAutogradFunction() const = 0;
+  virtual void checkSupportsInplaceRequiresGrad() const = 0;
+  virtual void checkSupportsRetainGrad() const = 0;
+};
+
+// returns deepcopy of the functorch tls
+TORCH_API std::unique_ptr getCopyOfFuncTorchTLS();
+
+// sets the functorch tls. always does a deep copy.
+TORCH_API void setFuncTorchTLS(
+    const std::shared_ptr& state);
+
+// get a mutable reference to the functorch tls
+TORCH_API std::unique_ptr& functorchTLSAccessor();
+
+} // namespace at::functorch
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/FunctionalStorageImpl.h b/MLPY/Lib/site-packages/torch/include/ATen/FunctionalStorageImpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..2753121e1da2ae541b5afe748a3c05690f01676e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/FunctionalStorageImpl.h
@@ -0,0 +1,126 @@
+#pragma once
+
+#include 
+
+namespace at::functionalization {
+
+// See Note [Functionalization Pass In Core]
+
+// ViewMeta is a class used by the functionalization pass to navigate between
+// a base tensor and a view tensor.
+// For example, if I call `b = a.view1(...)`
+// the functionalization pass will generate and store a ViewMeta on b that looks
+// like:
+//
+// ViewMeta(
+//   [](const Tensor& base, int64_t mutated_view_idx) {
+//     return base.view1(...);
+//   },
+//   [](const at::Tensor& base, const at::Tensor& mutated_view,
+//   int64_t mutated_view_idx) -> at::Tensor {
+//     return at::functionalization::impl::view1_inverse(base, mutated_view,
+//     ...);
+//   }
+//
+// The forward_fn lambda describes how to replay view1 on a tensor.
+//
+// The reverse_fn lambda describes how, given a tensor that is already a view,
+// how to get the corresponding base tensor. See Note [Functionalization Pass:
+// View Inverses] for details.
+struct ViewMeta {
+  ViewMeta(
+      std::function forward,
+      std::function reverse,
+      bool is_multi_output = false,
+      int64_t out_idx = 0)
+      : forward_fn(std::move(forward)),
+        reverse_fn(std::move(reverse)),
+        out_index(out_idx),
+        is_multi_output(is_multi_output) {}
+
+  std::function forward_fn;
+  std::function reverse_fn;
+  // See Note [out_idx in ViewMeta]
+  int64_t out_index;
+
+  // Tells us if this is a multi-output view
+  bool is_multi_output;
+
+  // Returns a copy of the current ViewMeta, if out_idx matches the current
+  // out_index. Otherwise, returns a new ViewMeta with the same forward/reverse
+  // functions, but a new out index.
+  ViewMeta to_out_idx(int64_t out_idx);
+};
+
+// FunctionalStorageImpl is a subclass of StorageImpl used by the
+// functionalization pass. It has no underlying data (similar to meta storage).
+// It also knows how to reflect mutations to tensors in the absence of a valid
+// data pointer.
+//
+// A storage represents the state shared by (potentially multiple) views of the
+// same tensor. For example, in the following code:
+//
+// b = a.view1(...)
+// c = b.view2(...)
+// b.add_(1)
+// --> storage.add_update(b, {view1_meta})
+//
+// The call to add_(1) will result in a call to alias.add_update(b,
+// {view1_meta}), queueing up the mutation from b onto the alias. Later, suppose
+// c is used in an expression (e.g. you try to print c, or pass it to an
+// operator). Doing so will involve "syncing" c. First we apply any pending
+// updates to the alias, and then we regenerate c by replaying its views off of
+// the updated alias. E.g:
+//
+// print(str(c))
+// --> c.sync_()
+//     --> alias.apply_updates() // after this, the alias will be updated to
+//     reflect the mutation to b
+struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl {
+ public:
+  struct Update {
+    // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
+    const at::Tensor new_val;
+    // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
+    const std::vector view_metas;
+  };
+
+  explicit FunctionalStorageImpl(const Tensor& value);
+
+  void add_update(
+      const Tensor& updated_val,
+      const std::vector& view_metas);
+  bool apply_updates();
+  const Tensor& base() {
+    return base_;
+  }
+  size_t generation() const {
+    return generation_;
+  }
+  void freeze() {
+    frozen_ = true;
+  }
+
+  ~FunctionalStorageImpl() override = default;
+
+ private:
+  // NB: base_ should always point to a tensor BELOW the current
+  // functionalization layer. This is mainly to avoid reference cycles. e.g.
+  // given `b = a.view(...)` Both a.storage_ and b.storage_ are a
+  // FunctionalStorageImpl containing an Alias, which contains a Tensor
+  // `base_`. In this case (where a and b are FunctionalTensorWrapper's), base_
+  // should point not to a, but to a's unwrapped value, a.value_. See Note
+  // [Functionalization: Alias Removal] for a diagram that shows this
+  // visually.
+  at::Tensor base_;
+  std::vector updates_;
+  // generation_ gets incremented every time a mutation is queued onto the
+  // alias. It is used to determine if a given tensor is "up to date", or if it
+  // needs to be regenerated from the alias.
+  size_t generation_ = 0;
+  // If frozen, no more mutations are allowed on this storage.  Once frozen, a
+  // storage cannot be unfrozen.
+  bool frozen_ = false;
+};
+
+} // namespace at::functionalization
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/FunctionalTensorWrapper.h b/MLPY/Lib/site-packages/torch/include/ATen/FunctionalTensorWrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..ef40ae5e931bc8eda6f00c16cac9b67eed7276ba
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/FunctionalTensorWrapper.h
@@ -0,0 +1,408 @@
+
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+namespace at {
+
+// Note [Functionalization Pass In Core]
+// The Functionalization pass is used to remove aliasing from a pytorch program.
+//
+// This is useful for backends that don't support aliasing, like XLA and Vulkan.
+// It's also necessary in order to remove mutation from a program, which is
+// needed in Functorch.
+//
+// Consider this program:
+// a = torch.ones(...)
+// b = a.view(...)
+// b.add_(1)
+//
+// In this program, b is meant to alias with a due to the use of view(). At the
+// end of the program, both a and b are full of 2's. However, backends that
+// don't support aliasing aren't able to correctly implement the view()
+// operator. Instead, they can opt into the Functionalization pass, which will
+// sit between the user and the backend, and provide the necessary aliasing
+// logic.
+//
+// The functionalization pass will turn the above program into a slightly
+// different program that has the same semantics, transparently to the user,
+// that backends like XLA/Vulkan are able to implement:
+//
+// a = torch.ones(...)
+// b = a.view_copy(...)  # view() replaced with view_copy(). Backends like
+//                       # XLA/Vulkan can implement this!
+// b.add_(1)
+// a.add_(1)  # Our functionalization pass machinery knows that a and b are
+//            # aliased - it applies b's mutation to a too.
+//
+// So, how does the functionalization pass keep track of which tensors are
+// aliased? The pass works by wrapping EVERY tensor in the program inside of a
+// FunctionalTensorWrapper, which knows about its alias'd tensors.
+//
+// See Note [Functionalization: Alias Removal] for details on the aliasing
+// machinery. See Note [Functionalization: Mutation Removal] for details on
+// mutation removal.
+struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
+  explicit FunctionalTensorWrapper(const Tensor& value);
+  // Additional constructor to create a FunctionalTensorWrapper directly from an
+  // underlying tensor that was created from a view. For example, the code b =
+  // a.view1() will generate a constructor call to FunctionalTensorWrapper(b, a,
+  // view1_meta)
+  explicit FunctionalTensorWrapper(
+      const Tensor& view_value,
+      const FunctionalTensorWrapper* base,
+      const functionalization::ViewMeta& meta);
+
+  // Get the underlying, actual tensor, that doesn't know anything about
+  // functionalization.
+  const Tensor& value() const {
+    return value_;
+  };
+  // The concept of "level" is only ever important to functorch; it's exposed
+  // here as more of a hook for functorch to use.
+  int64_t level() const {
+    return level_;
+  };
+  void set_level(int64_t level) {
+    level_ = level;
+  }
+  bool has_metadata_mutation() const {
+    return has_metadata_mutation_;
+  };
+
+  // Denotes a mutation that's hidden from autograd,
+  // e.g. for the purposes of passing a tensor to a triton kernel
+  void mark_mutation_hidden_from_autograd() {
+    mutation_hidden_from_autograd_counter_++;
+  }
+  void mark_mutation_during_no_grad_or_inference_mode() {
+    mutation_during_no_grad_or_inference_mode_++;
+  }
+  // Are all the mutations happening to the tensor hidden from autograd
+  bool are_all_mutations_hidden_from_autograd() const {
+    return mutation_hidden_from_autograd_counter_ == mutation_counter_;
+  }
+  // Did all mutations happen under no_grad or inference_mode
+  // (We also need to ignore mutations fully hidden from autograd here)
+  bool are_all_mutations_under_no_grad_or_inference_mode() const {
+    return mutation_hidden_from_autograd_counter_ +
+        mutation_during_no_grad_or_inference_mode_ ==
+        mutation_counter_;
+  }
+
+  // Sync's the underlying tensor with its alias, if it's out of date. This
+  // involves two steps: 1) Apply any pending updates/mutations to the alias 2)
+  // Replay the views (if any) to regenerate the current tensor off of the
+  // updated alias.
+  void sync_();
+  // Performs step (1) of the sync. This is its own public API because it's
+  // needed by view_inplace ops like transpose_. See Note [Functionalization
+  // Pass - Inplace View Ops]
+  void regenerate_from_base();
+  // Performs step (2) of the sync. This is its own public API because it's
+  // needed by functorch. functorch wants to make sure that all input tensors to
+  // a functionalized program have been properly synced so it can properly
+  // propagate mutations to inputs. It can't just call sync_(), because the
+  // FunctionalTensorWrapper will look like it has no aliases and sync_ will be
+  // a noop. We use the reference count on storage_ to determine if the wrapper
+  // is aliased, and by the time functorch is ready to propagate updates to
+  // inputs, any intermediate views of the input created by the program will
+  // have been deallocated. This function also returns whether or not the base
+  // actually had any updates to apply.
+  bool apply_updates();
+  // Takes the current state of value_ and snapshots it, sending it as a pending
+  // update to the alias.
+  void commit_update();
+  // When any tensor is mutated, the tensor increments its alias's "generation".
+  // Separately, each tensor maintains its own "generation" counter, which is
+  // used to determine if it's up-to-date with its alias. The act of syncing a
+  // tensor will set a tensor's generation equal to its alias's generation.
+  bool is_up_to_date() const;
+  // Freezes the storage of this tensor, preventing subsequent mutations
+  void freeze_storage() const;
+  // Every FunctionalTensorWrapper contains a vector of ViewMeta objects
+  // describing the series of view ops that ran to generate the current tensor
+  // from the base tensor. This method is used by inplace-view ops like
+  // transpose_. It appends a ViewMeta to the existing stack, and refreshes the
+  // tensor by replaying the views off of the alias.
+  void mutate_view_meta(const at::functionalization::ViewMeta& meta);
+
+  // Custom implementation of self.set_(src)
+  void set__impl(const FunctionalTensorWrapper* other);
+
+  // Returns whether the current tensor's data was ever mutated
+  bool has_data_mutation();
+  //
+  // Returns whether the current FunctionalTensorWrapper
+  // experienced a set_() call.
+  bool was_storage_changed() {
+    return was_storage_changed_;
+  }
+
+  // The functionalization pass can be used to remove mutations.
+  // It does so by replacing any mutation op with its corresponding
+  // out-of-place op, followed by a call to replace_(). e.g:
+  //
+  // a.add_(1)
+  //
+  // will turn into:
+  //
+  // tmp = a.add(1)
+  // a.replace_(tmp)
+  //
+  // replace_() swaps out the wrapped tensor, value_, with tmp.
+  void replace_(const Tensor& other);
+
+  bool is_multi_output_view() {
+    return is_multi_output_view_;
+  }
+
+  // See Note[resize_() in functionalization pass]
+  void maybe_replace_storage(const Tensor& other);
+
+  // Replaces the storage with a new functional storage,
+  // and clears the view_metas_ stack.
+  // WARNING: Calling this function will sever the aliasing relationship between
+  // the current FunctionalTensorWrapper and any of its outstanding aliases.
+  // Please only call if you know what you're doing.
+  void _unsafe_reset_storage();
+
+  c10::intrusive_ptr shallow_copy_and_detach(
+      const c10::VariableVersion& version_counter,
+      bool allow_tensor_metadata_change) const override;
+
+  c10::intrusive_ptr shallow_copy_and_detach(
+      c10::VariableVersion&& version_counter,
+      bool allow_tensor_metadata_change) const override;
+
+  ~FunctionalTensorWrapper() override = default;
+
+  // FunctionalTensorWrapper overrides all custom size/stride functions,
+  // so that if the inner tensor has a custom implementation
+  // we make sure to call that implementation.
+  at::IntArrayRef sizes_custom() const override;
+  at::IntArrayRef strides_custom() const override;
+  int64_t dim_custom() const override;
+  int64_t numel_custom() const override;
+  bool is_contiguous_custom(at::MemoryFormat memory_format) const override;
+  c10::SymIntArrayRef sym_sizes_custom() const override;
+  c10::SymInt sym_size_custom(int64_t d) const override;
+  c10::SymIntArrayRef sym_strides_custom() const override;
+  c10::SymInt sym_storage_offset_custom() const override;
+  c10::Device device_custom() const override;
+
+ private:
+  const char* tensorimpl_type_name() const override;
+  void set_constructor_metadata();
+  functionalization::FunctionalStorageImpl* functional_storage_impl() const;
+
+  // This is used to re-implement shallow_copy_and_detach for
+  // FunctionalTensorWrapper. The implementation is identical, but we just need
+  // to return a subclass instead of a plain TensorImpl.
+  // TODO: maybe it's possible to arrange for that to happen automatically
+  // without an override here?
+  template 
+  c10::intrusive_ptr shallow_copy_and_detach_core(
+      VariableVersion&& version_counter,
+      bool allow_tensor_metadata_change) const;
+
+  void shallow_copy_from(const c10::intrusive_ptr& impl) override;
+  void copy_tensor_metadata_and_refresh(
+      const FunctionalTensorWrapper* src_impl,
+      FunctionalTensorWrapper* dest_impl,
+      const c10::VariableVersion& version_counter,
+      bool allow_tensor_metadata_change) const;
+
+  // Note that value is not taken by reference: internally, the wrapper will
+  // change the value tensor that it points to over time.
+  Tensor value_;
+  int64_t level_{};
+  // These two counters are used for identifying
+  // whether all the mutations on a given tensor are hidden from autograd or
+  // not. If we have an input mutation that is hidden from autograd, then once
+  // we convert the input mutation to a copy_() we know it will be safe to hide
+  // the copy_() from autograd as well.
+  uint64_t mutation_counter_ = 0;
+  uint64_t mutation_hidden_from_autograd_counter_ = 0;
+  uint64_t mutation_during_no_grad_or_inference_mode_ = 0;
+  bool has_metadata_mutation_ = false;
+  bool is_multi_output_view_ = false;
+  // Did the tensor experience a set_() call.
+  bool was_storage_changed_ = false;
+
+  size_t generation_ = 0;
+  std::vector view_metas_;
+
+ protected:
+  static void copy_tensor_metadata(
+      const FunctionalTensorWrapper* src_impl,
+      FunctionalTensorWrapper* dest_impl,
+      const c10::VariableVersion& version_counter,
+      bool allow_tensor_metadata_change);
+};
+
+// Utility functions for the functionalization pass.
+
+namespace functionalization {
+namespace impl {
+
+TORCH_API inline FunctionalTensorWrapper* unsafeGetFunctionalWrapper(
+    const Tensor& tensor) {
+  auto functional_impl =
+      static_cast<FunctionalTensorWrapper*>(tensor.unsafeGetTensorImpl());
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(functional_impl != nullptr);
+  return functional_impl;
+}
+
+TORCH_API bool isFunctionalTensor(const at::Tensor& tensor);
+TORCH_API bool isFunctionalTensor(const c10::optional& t);
+TORCH_API bool isFunctionalTensor(
+    const c10::List>& t_list);
+TORCH_API bool isFunctionalTensor(ITensorListRef list);
+
+TORCH_API Tensor to_functional_tensor(const Tensor& tensor);
+TORCH_API c10::optional to_functional_tensor(
+    const c10::optional& tensor);
+TORCH_API c10::List> to_functional_tensor(
+    const c10::List>& t_list);
+TORCH_API std::vector to_functional_tensor(ITensorListRef t_list);
+
+TORCH_API void freeze_functional_tensor(const Tensor& tensor);
+
+TORCH_API Tensor
+from_functional_tensor(const Tensor& tensor, bool assert_functional = true);
+TORCH_API c10::optional from_functional_tensor(
+    const c10::optional& t,
+    bool assert_functional = true);
+TORCH_API c10::List> from_functional_tensor(
+    const c10::List>& t_list);
+TORCH_API std::vector from_functional_tensor(ITensorListRef t_list);
+
+TORCH_API void sync(const at::Tensor& t);
+TORCH_API void sync(const c10::optional& t);
+TORCH_API void sync(const c10::List>& t_list);
+TORCH_API void sync(ITensorListRef t_list);
+
+TORCH_API void replace_(const Tensor& functional_tensor, const Tensor& other);
+TORCH_API void replace_(
+    const ITensorListRef functional_tensor,
+    ITensorListRef other);
+
+TORCH_API void commit_update(const Tensor& functional_tensor);
+TORCH_API void commit_update(ITensorListRef functional_tensor);
+
+TORCH_API void unsafe_reset_storage(const Tensor& functional_tensor);
+
+TORCH_API void mark_mutation_hidden_from_autograd(
+    const Tensor& functional_tensor);
+
+TORCH_API bool are_all_mutations_hidden_from_autograd(
+    const Tensor& functional_tensor);
+
+TORCH_API bool are_all_mutations_under_no_grad_or_inference_mode(
+    const Tensor& functional_tensor);
+
+// These two methods are XLA-specific logic and are no-ops
+// for the normal functionalization flow.
+TORCH_API void propagate_xla_data(
+    const Tensor& functional_tensor,
+    const Tensor& other);
+TORCH_API void propagate_xla_data(
+    const ITensorListRef functional_tensor,
+    ITensorListRef other);
+
+Tensor create_functional_tensor_with_view_meta(
+    const Tensor& view_to_wrap,
+    const Tensor& base,
+    functionalization::ViewMeta meta,
+    int64_t out_idx = 0);
+std::vector create_functional_tensor_with_view_meta(
+    ITensorListRef view_to_wrap,
+    const Tensor& base,
+    const functionalization::ViewMeta& meta);
+
+void mutate_view_meta(
+    const Tensor& self,
+    const functionalization::ViewMeta& meta);
+
+void set_sizes_strides_offset(const Tensor& out, const Tensor& meta_out);
+void set_sizes_strides_offset(
+    const std::vector& outs,
+    const std::vector& meta_outs);
+
+//  ~~~~~ TLS used in functionalization ~~~~~
+
+TORCH_API bool getFunctionalizationReapplyViewsTLS();
+TORCH_API void setFunctionalizationReapplyViewsTLS(bool reapply_views);
+
+class TORCH_API FunctionalizationReapplyViewsGuard {
+ public:
+  FunctionalizationReapplyViewsGuard(bool reapply_views)
+      : prev_(getFunctionalizationReapplyViewsTLS()) {
+    setFunctionalizationReapplyViewsTLS(reapply_views);
+  }
+
+  ~FunctionalizationReapplyViewsGuard() {
+    setFunctionalizationReapplyViewsTLS(prev_);
+  }
+
+  FunctionalizationReapplyViewsGuard(
+      const FunctionalizationReapplyViewsGuard&) = delete;
+  FunctionalizationReapplyViewsGuard operator=(
+      const FunctionalizationReapplyViewsGuard&) = delete;
+  FunctionalizationReapplyViewsGuard(FunctionalizationReapplyViewsGuard&&) =
+      delete;
+  FunctionalizationReapplyViewsGuard operator=(
+      FunctionalizationReapplyViewsGuard&&) = delete;
+
+ private:
+  bool prev_;
+};
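+
+// Illustrative RAII usage: the previous TLS value is restored when the guard
+// goes out of scope.
+//
+//   {
+//     at::functionalization::impl::FunctionalizationReapplyViewsGuard guard(
+//         /*reapply_views=*/true);
+//     // ... code that should observe getFunctionalizationReapplyViewsTLS() == true
+//   }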
+
+} // namespace impl
+
+// Helper function to call an out-of-place composite aten kernel that may use
+// mutations / views internally, and functionalize them.
+TORCH_API void functionalize_op_helper(
+    const c10::OperatorHandle& op,
+    torch::jit::Stack* stack);
+
+template 
+struct _functionalize_aten_op final {};
+
+template 
+struct _functionalize_aten_op final {
+  static ReturnType call(
+      typename c10::maybe_keep_symint::type... args) {
+    using FuncType = ReturnType(
+        typename c10::maybe_keep_symint::type...);
+    auto op = c10::Dispatcher::singleton()
+                  .findSchemaOrThrow(
+                      (const char*)Op::name, (const char*)Op::overload_name)
+                  .typed();
+
+    return c10::impl::BoxedKernelWrapper::call(
+        c10::BoxedKernel::makeFromFunction(),
+        op,
+        // BoxedKernelWrapper knows to ignore this keyset argument,
+        // because functionalize_op_helper doesn't take in a DispatchKeySet
+        c10::DispatchKeySet(),
+        args...);
+  }
+};
+
+template 
+using functionalize_aten_op =
+    _functionalize_aten_op;
+
+template 
+using functionalize_aten_op_symint =
+    _functionalize_aten_op;
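+
+// Illustrative usage sketch (assumes the ATEN_OP macro from ATen/Operators.h;
+// the operator chosen here is just an example):
+//
+//   at::Tensor out = at::functionalization::functionalize_aten_op<
+//       ATEN_OP(matmul)>::call(a, b);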
+
+} // namespace functionalization
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/Functions.h b/MLPY/Lib/site-packages/torch/include/ATen/Functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..a2ca4df21178cb487fcaca8b2908a150824a28d3
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/Functions.h
@@ -0,0 +1,1427 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Functions.h
+
+#ifdef TORCH_ASSERT_NO_OPERATORS
+#error This change adds a dependency on native_functions.yaml,            \
+  meaning the file will need to be re-compiled every time an operator     \
+  is changed or added. Consider if your change would be better placed in  \
+  another file, or if a more specific header might achieve the same goal. \
+  See NOTE: [Tensor vs. TensorBase]
+#endif
+
+#if defined(AT_PER_OPERATOR_HEADERS) && defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS)
+#error This change adds a dependency on all pytorch operators, meaning the     \
+  file will need to be re-compiled every time an operator is changed or added. \
+  Consider including a specific operator from <ATen/ops/{my_operator}.h> and   \
+  see NOTE [TORCH_ASSERT_ONLY_METHOD_OPERATORS].
+#endif
+
+// NOTE: [TORCH_ASSERT_ONLY_METHOD_OPERATORS]
+//
+// In ATen, certain generated headers files include the definitions of
+// every single operator in PyTorch. Unfortunately this means every
+// time an operator signature is updated or changed in
+// native_functions.yaml, you (and every other PyTorch developer) need
+// to recompile every source file that includes any of these headers.
+//
+// To break up these header dependencies, and improve incremental
+// build times for all PyTorch developers. These headers are split
+// into per-operator headers in the `ATen/ops` folder. This limits
+// incremental builds to only changes to methods of `Tensor`, or files
+// that use the specific operator being changed. With `at::sum` as an
+// example, you should include
+//
+//   #include <ATen/ops/sum.h>               // instead of ATen/Functions.h
+//   #include <ATen/ops/sum_native.h>        // instead of ATen/NativeFunctions.h
+//   #include <ATen/ops/sum_ops.h>           // instead of ATen/Operators.h
+//   #include <ATen/ops/sum_cpu_dispatch.h>  // instead of ATen/CPUFunctions.h
+//
+// However, even if you're careful to use this in your own code.
+// `Functions.h` might be included indirectly through another header
+// without you realising. To avoid this, you can add
+//
+//   #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+//
+// to the top of your source file. This way any time the non-specific
+// headers are included, the compiler will error out.
+//
+// Also, be aware that `ops` are not available in all build
+// configurations (namely fb-internal) so you must guard these
+// includes with `#ifdef AT_PER_OPERATOR_HEADERS`. e.g.
+//
+//   #ifndef AT_PER_OPERATOR_HEADERS
+//   #include 
+//   #else
+//   #include 
+//   #endif
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+
+
+
+// Special C++ only overloads for std()-like functions (See gh-40287)
+// These are needed because int -> bool conversion takes precedence over int -> IntArrayRef
+// So, for example std(0) would select the std(unbiased=False) overload
+TORCH_API inline Tensor var(const Tensor& self, int dim) {
+  return at::var(self, IntArrayRef{dim});
+}
+TORCH_API inline std::tuple<Tensor, Tensor> var_mean(const Tensor& self, int dim) {
+  return at::var_mean(self, IntArrayRef{dim});
+}
+TORCH_API inline Tensor std(const Tensor& self, int dim) {
+  return at::std(self, IntArrayRef{dim});
+}
+TORCH_API inline std::tuple<Tensor, Tensor> std_mean(const Tensor& self, int dim) {
+  return at::std_mean(self, IntArrayRef{dim});
+}
+
+inline int64_t numel(const Tensor& tensor) {
+  return tensor.numel();
+}
+
+inline int64_t size(const Tensor& tensor, int64_t dim) {
+  return tensor.size(dim);
+}
+
+inline int64_t stride(const Tensor& tensor, int64_t dim) {
+  return tensor.stride(dim);
+}
+
+inline bool is_complex(const Tensor& tensor) {
+  return tensor.is_complex();
+}
+
+inline bool is_floating_point(const Tensor& tensor) {
+  return tensor.is_floating_point();
+}
+
+inline bool is_signed(const Tensor& tensor) {
+  return tensor.is_signed();
+}
+
+inline bool is_inference(const Tensor& tensor) {
+  return tensor.is_inference();
+}
+
+inline bool _is_zerotensor(const Tensor& tensor) {
+  return tensor._is_zerotensor();
+}
+
+inline bool is_conj(const Tensor& tensor) {
+  return tensor.is_conj();
+}
+
+inline Tensor conj(const Tensor& tensor) {
+  return tensor.conj();
+}
+
+inline bool is_neg(const Tensor& tensor) {
+  return tensor.is_neg();
+}
+
+}
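Editor's note: a minimal usage sketch of the int-dim overloads above (not part of the vendored header); it assumes only <ATen/ATen.h> and the function name is illustrative.

#include <ATen/ATen.h>
#include <tuple>

void reduction_overload_example() {
  at::Tensor t = at::rand({4, 5});
  // With the overloads above, a bare int selects the dim-reduction form
  // instead of converting to the `unbiased` bool parameter.
  at::Tensor col_std = at::std(t, /*dim=*/0);                        // shape [5]
  std::tuple<at::Tensor, at::Tensor> vm = at::var_mean(t, /*dim=*/1);
  at::Tensor var = std::get<0>(vm);                                  // shape [4]
  at::Tensor mean = std::get<1>(vm);                                 // shape [4]
  (void)col_std; (void)var; (void)mean;
}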
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/Generator.h b/MLPY/Lib/site-packages/torch/include/ATen/Generator.h
new file mode 100644
index 0000000000000000000000000000000000000000..741e39f29dae4cca6cb39f8b1d385bb14ed1b6c5
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/Generator.h
@@ -0,0 +1,2 @@
+#pragma once
+#include 
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/InferSize.h b/MLPY/Lib/site-packages/torch/include/ATen/InferSize.h
new file mode 100644
index 0000000000000000000000000000000000000000..853425357e3d7d4697adfbaf1e67f5c295d2760f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/InferSize.h
@@ -0,0 +1,87 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+
+// Infers the size of a dim with size -1, if it exists. Also checks that new
+// shape is compatible with the number of elements.
+//
+// templated to handle std::vector and DimVector use cases, see
+// below
+//
+template <typename InputArrayRef, typename NumelType, typename ResultVec>
+inline void infer_size_impl(
+    InputArrayRef shape,
+    NumelType numel,
+    ResultVec& res) {
+  NumelType newsize = 1;
+  // N.B. this is an index, not a sym dim!
+  auto infer_dim = c10::optional<int64_t>();
+  for (int64_t dim = 0, ndim = shape.size(); dim != ndim; dim++) {
+    if (shape[dim] == -1) {
+      if (infer_dim) {
+        throw std::runtime_error("only one dimension can be inferred");
+      }
+      infer_dim = dim;
+    } else if (shape[dim] >= 0) {
+      newsize *= shape[dim];
+    } else {
+      AT_ERROR("invalid shape dimension ", shape[dim]);
+    }
+  }
+
+  if (numel == newsize || (infer_dim && newsize > 0 && numel % newsize == 0)) {
+    if (infer_dim) {
+      // We have a degree of freedom here to select the dimension size; follow
+      // NumPy semantics and just bail.  However, a nice error message is needed
+      // because users often use `view` as a way to flatten & unflatten
+      // dimensions and will otherwise be confused why
+      //   empty_tensor.view( 0, 0)
+      // works yet
+      //   empty_tensor.view(-1, 0)
+      // doesn't.
+      TORCH_CHECK(
+          newsize != 0,
+          "cannot reshape tensor of 0 elements into shape ",
+          shape,
+          " because the unspecified dimension size -1 can be any "
+          "value and is ambiguous");
+      res[*infer_dim] = numel / newsize;
+    }
+    return;
+  }
+
+  std::ostringstream ss;
+  ss << "shape '" << shape << "' is invalid for input of size " << numel;
+  throw std::runtime_error(ss.str());
+}
+
+inline std::vector<int64_t> infer_size(IntArrayRef shape, int64_t numel) {
+  auto res = shape.vec();
+  infer_size_impl(shape, numel, res);
+  return res;
+}
+
+inline at::DimVector infer_size_dv(IntArrayRef shape, int64_t numel) {
+  auto res = at::DimVector(shape);
+  infer_size_impl(shape, numel, res);
+  return res;
+}
+
+inline at::SymDimVector infer_size_dv(
+    c10::SymIntArrayRef shape,
+    c10::SymInt numel) {
+  auto res = at::SymDimVector(shape);
+  infer_size_impl<c10::SymIntArrayRef, c10::SymInt, at::SymDimVector>(
+      shape, std::move(numel), res);
+  return res;
+}
+
+} // namespace at
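Editor's note: a short sketch of how infer_size resolves a -1 dimension, mirroring what Tensor::view does; the include path is taken from the file location above and the helper name is illustrative.

#include <ATen/InferSize.h>
#include <cassert>
#include <vector>

void infer_size_example() {
  // 24 elements reshaped to {2, -1, 4}: the -1 slot must be inferred as 3.
  std::vector<int64_t> requested = {2, -1, 4};
  std::vector<int64_t> inferred = at::infer_size(requested, /*numel=*/24);
  assert((inferred == std::vector<int64_t>{2, 3, 4}));
}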
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/InitialTensorOptions.h b/MLPY/Lib/site-packages/torch/include/ATen/InitialTensorOptions.h
new file mode 100644
index 0000000000000000000000000000000000000000..58289fb41c6f66b85ca17297864e1639f0a78441
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/InitialTensorOptions.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include 
+
+namespace at {
+
+// Represents the initial TensorOptions, before the "defaults" are ever changed.
+// This is designed to be used in library code, where the explicit devices,
+// dtypes, etc. are known. NOTE: this is not a stable API.
+inline TensorOptions initialTensorOptions() {
+  return TensorOptions(kCPU).dtype(kFloat).layout(kStrided).requires_grad(
+      false);
+}
+
+} // namespace at
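Editor's note: a hedged sketch of initialTensorOptions() in use; `make_scratch_buffer` is a made-up name for illustration only.

#include <ATen/ATen.h>
#include <ATen/InitialTensorOptions.h>

at::Tensor make_scratch_buffer() {
  // Always CPU / float / strided / requires_grad=false, independent of any
  // user-changed global defaults.
  return at::empty({16}, at::initialTensorOptions());
}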
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/Layout.h b/MLPY/Lib/site-packages/torch/include/ATen/Layout.h
new file mode 100644
index 0000000000000000000000000000000000000000..11bda768d2fc435e5aa32c764097ef158fe4a315
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/Layout.h
@@ -0,0 +1,2 @@
+#pragma once
+#include 
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/LegacyBatchedFallback.h b/MLPY/Lib/site-packages/torch/include/ATen/LegacyBatchedFallback.h
new file mode 100644
index 0000000000000000000000000000000000000000..7a4a1961a5f57d0aed6a4bd9b07ae2ff7e094d8a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/LegacyBatchedFallback.h
@@ -0,0 +1,25 @@
+#pragma once
+#include 
+#include 
+#include 
+
+namespace at {
+
+// If an operator doesn't have a batching rule implemented then we fallback
+// to this implementation. The fallback only works on out-of-place operators
+// that return only tensors with new memory. (e.g., no in-place operators, no
+// view operations).
+//
+// The fallback effectively takes all of the BatchedTensors in `stack`, slices
+// them, and runs `op` on all of the corresponding slices to produce slices
+// of the outputs. The output slices then get `torch.stack`ed to create the
+// final returns.
+//
+// The performance of the fallback is not very good because it introduces an
+// extra copy from stacking the sliced outputs. Because of this, we prefer to
+// write batching rules for operators whenever possible.
+void batchedTensorForLoopFallback(
+    const c10::OperatorHandle& op,
+    torch::jit::Stack* stack);
+
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/LegacyBatchedTensorImpl.h b/MLPY/Lib/site-packages/torch/include/ATen/LegacyBatchedTensorImpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..c606c6c1d423364cfb25d54cab682835d7b3074e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/LegacyBatchedTensorImpl.h
@@ -0,0 +1,160 @@
+#pragma once
+
+#include 
+
+#include 
+#include 
+#include 
+
+namespace at {
+
+// We assume this in a few other places in the codebase,
+// but there isn't a centralized definition.
+constexpr int64_t kVmapMaxTensorDims = 64;
+
+// The valid vmap levels range from [0, 64). This effectively means that we
+// support a maximum of 64 nested vmaps.
+constexpr int64_t kVmapNumLevels = 64;
+
+// Store this number of elements of BatchDims on the stack. Most people will
+// probably use <= 5 nested vmaps, but adjust this number as necessary.
+constexpr int64_t kBatchDimsStackSize = 5;
+
+// a BatchDim represents a "private" dimension on a Tensor created inside of
+// vmap. It is a (level, dim) tuple, with the `dim` indicating which dimension
+// is being vmap'ed over and the `level` being an identifier for which vmap
+// said dimension was created inside. The `dim` corresponds to a "physical
+// dim" - it is a dimension index on the underlying physical tensor that is
+// being vmapped over.
+struct BatchDim {
+  BatchDim(int64_t level, int64_t dim) : dim_(dim), level_(level) {}
+  int64_t dim() const {
+    return dim_;
+  }
+  int64_t level() const {
+    return level_;
+  }
+
+ private:
+  int64_t dim_;
+  int64_t level_;
+};
+
+using BatchDims = SmallVector<BatchDim, kBatchDimsStackSize>;
+using BatchDimsRef = ArrayRef<BatchDim>;
+
+// A BatchedTensorImpl holds an underlying Tensor and a list of BatchDim
+// NB: We use the term "BatchedTensor" to mean a Tensor that is backed with a
+// BatchedTensorImpl.
+//
+// The batch dimensions are treated as being "private"; they are not
+// user-visible. For example, in the following Tensor,
+//    bt = BatchedTensorImpl(ones(2, 3, 5, 7), [(lvl=1, dim=0), (lvl=2, dim=1)])
+// dimensions 0 and 1 are batch dimensions.
+//
+// bt.sizes() returns (5, 7); bt.sum(0) performs a reduction over the (public)
+// dim 0, which is equivalent to dim 3 in the underlying ones(2, 3, 5, 7)
+// tensor.
+struct TORCH_API BatchedTensorImpl : public c10::TensorImpl {
+  explicit BatchedTensorImpl(Tensor value, BatchDims bdims);
+
+  // Returns a reference to BatchDims that represent which dimensions of this
+  // tensor are private.
+  BatchDimsRef bdims() const {
+    return bdims_;
+  }
+
+  // BatchedTensorImpl wraps a Tensor
+  const Tensor& value() const {
+    return value_;
+  };
+
+  // Given a public dimension index, return the dimension index in the
+  // underlying value() tensor. For example, if we have
+  //    bt = BatchedTensorImpl(ones(2, 3, 5, 7), [(lvl=1, dim=0), (lvl=2,
+  //    dim=2)])
+  // bt.actualDim(0) -> 1
+  // bt.actualDim(1) -> 3
+  // bt.actualDim(2) -> Error
+  int64_t actualDim(int64_t dim, bool wrap_dim = true) const;
+
+  // We have to override this because we opted into CustomStrides
+  IntArrayRef strides_custom() const override;
+  // Override a bunch of methods inherited from TensorImpl to return error
+  // messages.
+  bool is_contiguous_custom(at::MemoryFormat memory_format) const override;
+  void set_size(int64_t dim, int64_t new_size) override;
+  void set_stride(int64_t dim, int64_t new_stride) override;
+  void set_storage_offset(int64_t storage_offset) override;
+#ifdef DEBUG
+  bool has_storage() const override;
+#endif
+
+ private:
+  // see NOTE: [BatchedTensorImpl levels invariant]
+  void checkInvariants() const;
+  const char* tensorimpl_type_name() const override;
+
+  Tensor value_;
+
+  // Note: [BatchedTensorImpl levels invariant]
+  // There is an invariant that the BatchDims must be stored in increasing
+  // `level` order. That is, for i < j, bdims_[i].level must be less than
+  // bdims_[j].level.
+  BatchDims bdims_;
+};
+
+// NB: We use the term "BatchedTensor" to mean a Tensor that is backed with a
+// BatchedTensorImpl.
+inline bool isBatchedTensor(const Tensor& tensor) {
+  return tensor.unsafeGetTensorImpl()->key_set().has(DispatchKey::Batched);
+}
+
+// It is unsafe to call this on a Tensor that is not backed by a
+// BatchedTensorImpl. Please use `maybeGetBatchedImpl` whenever possible.
+inline BatchedTensorImpl* unsafeGetBatchedImpl(const Tensor& tensor) {
+  return static_cast<BatchedTensorImpl*>(tensor.unsafeGetTensorImpl());
+}
+
+inline BatchedTensorImpl* maybeGetBatchedImpl(const Tensor& tensor) {
+  if (!isBatchedTensor(tensor)) {
+    return nullptr;
+  }
+  return unsafeGetBatchedImpl(tensor);
+}
+
+// Returns a bitset. If bit i is set, then that means dim i is a batchdim.
+inline std::bitset<kVmapMaxTensorDims> createBatchDimBitset(
+    BatchDimsRef bdims) {
+  std::bitset<kVmapMaxTensorDims> is_bdim;
+  for (const auto& bdim : bdims) {
+    is_bdim.set(bdim.dim());
+  }
+  return is_bdim;
+}
+
+// Creates a bitset for all of the levels present in `bdims`
+inline std::bitset<kVmapNumLevels> createVmapLevelsBitset(BatchDimsRef bdims) {
+  std::bitset<kVmapNumLevels> result;
+  for (const auto& bdim : bdims) {
+    result.set(bdim.level());
+  }
+  return result;
+}
+
+inline std::ostream& operator<<(std::ostream& out, const BatchDim& bdim) {
+  out << "(lvl=" << bdim.level() << ", dim=" << bdim.dim() << ")";
+  return out;
+}
+
+// Use this to construct a BatchedTensor from a regular Tensor
+TORCH_API Tensor makeBatched(const Tensor& tensor, BatchDims bdims);
+
+// Adds a batch dim to `tensor`, returning a BatchedTensor
+TORCH_API Tensor addBatchDim(const Tensor& tensor, int64_t level, int64_t dim);
+
+// Checks if an inplace operation on self and other is "vmap compatible".
+// See NOTE: [vmap-incompatible in-place operations] for the definition of this.
+TORCH_API bool inplaceIsVmapCompatible(const Tensor& self, const Tensor& other);
+
+} // namespace at
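Editor's note: a sketch of the legacy BatchedTensor helpers declared above (internal API; behavior may differ across releases); the function name and the chosen vmap level are illustrative.

#include <ATen/ATen.h>
#include <ATen/LegacyBatchedTensorImpl.h>

void batched_dim_mapping_example() {
  at::Tensor base = at::ones({2, 3, 5, 7});
  // Mark dim 0 as a private batch dim belonging to vmap level 1.
  at::Tensor bt = at::addBatchDim(base, /*level=*/1, /*dim=*/0);

  if (at::BatchedTensorImpl* impl = at::maybeGetBatchedImpl(bt)) {
    // The logical view is (3, 5, 7); logical dim 0 maps to physical dim 1.
    int64_t physical_dim = impl->actualDim(/*dim=*/0);
    (void)physical_dim;
  }
}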
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/LegacyVmapMode.h b/MLPY/Lib/site-packages/torch/include/ATen/LegacyVmapMode.h
new file mode 100644
index 0000000000000000000000000000000000000000..dfb093566ccbe05a23e1d474cad84166496eb402
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/LegacyVmapMode.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#include 
+
+namespace at::impl {
+
+// VmapMode contains a thread local count of how many nested vmaps
+// we are currently inside. That number is known as the `vmap level`.
+// VmapMode is used in the implementation of the Python `torch.vmap` API.
+//
+// NOTE: this is NOT the c++ api for torch.vmap. That doesn't exist yet.
+
+struct TORCH_API VmapMode {
+  // Returns the vmap level, aka the count of how many nested vmaps we're in.
+  static int64_t current_vmap_level();
+
+  // Increment the count of nested vmaps. If this causes the vmap level to be
+  // greater than 0, then it enables DispatchKey::VmapMode on all tensors.
+  static int64_t increment_nesting();
+
+  // Decrements the count of nested vmaps. If this causes the vmap level to be
+  // equal to 0, then it disables DispatchKey::VmapMode on all tensors.
+  static int64_t decrement_nesting();
+};
+
+} // namespace at::impl
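Editor's note: a hypothetical RAII wrapper around the increment/decrement pair above, purely to illustrate how nesting is tracked; no such guard is provided by this header.

#include <ATen/LegacyVmapMode.h>

struct VmapLevelGuard {  // hypothetical, not part of the header above
  VmapLevelGuard() : level_(at::impl::VmapMode::increment_nesting()) {}
  ~VmapLevelGuard() { at::impl::VmapMode::decrement_nesting(); }
  int64_t level() const { return level_; }

 private:
  int64_t level_;
};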
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/LegacyVmapTransforms.h b/MLPY/Lib/site-packages/torch/include/ATen/LegacyVmapTransforms.h
new file mode 100644
index 0000000000000000000000000000000000000000..13af3ad08ad24f59d81bf6d4ade0cb925d3a5b95
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/LegacyVmapTransforms.h
@@ -0,0 +1,183 @@
+#pragma once
+
+#include 
+#include 
+
+namespace at {
+
+// This file contains abstractions used for transforming *logical* vmap
+// arguments into *physical* arguments. (Keep reading for definitions of these
+// terms).
+
+// NOTE: [Logical vs physical args]
+// Consider the following vmap.
+//   vmap(vmap(func, in_dims=(2,)), in_dims=(0,))(torch.ones(2, 3, 4))
+// This would produce a BatchedTensor wrapping a Tensor of size [2, 3, 4],
+// with batch dims 0 and 2:
+//   BatchedTensor(ones(2, 3, 4), bdims=[(lvl=1,dim=0),(lvl=2,dim=2)])
+//
+// We say the *logical* view of the tensor has size [3] -- tensors inside
+// `func` appear to have size [3].
+// However, the *physical* underlying tensor (the one passed to vmap) has size
+// [2, 3, 4].
+//
+// This notion of logical vs physical also extends to non-tensor arguments.
+// Consider the previous tensor; let's assume the user called
+// `torch.sum(tensor, dim=0)` inside of `func`. Then the logical
+// dimension they are reducing over is dim 0 but the physical dim is dim 1
+// (the first non-batch dimension)
+
+// Forward declared; see NOTE: [What is a VmapPhysicalView?]
+struct VmapPhysicalView;
+
+// Most PyTorch operators take 4 or fewer inputs.
+constexpr int64_t kVmapTransformStaticInputSize = 4;
+using VmapPhysicalViewVec =
+    SmallVector<VmapPhysicalView, kVmapTransformStaticInputSize>;
+
+// Pytorch generally advertises good performance for <= 5 dims.
+// (see ATen/core/DimVector.h). We add a few extra dims (~3) for vmap
+// dimensions to get 8. Adjust this number as necessary
+constexpr int64_t kVmapStaticDimVecSize = 8;
+using VmapDimVector = SmallVector<int64_t, kVmapStaticDimVecSize>;
+using VmapSymDimVector = SmallVector<c10::SymInt, kVmapStaticDimVecSize>;
+
+// NOTE: [What is an VmapTransform?]
+// An *VmapTransform* converts logical views of tensors to physical views.
+//
+// Batching rules use VmapTransforms to convert logical arguments to
+// physical arguments, then call one or more at:: operators that handle the
+// physical arguments, and then convert the physical result back to a logical
+// argument.
+
+// VmapTransform for operators that take tensors with multiple batch dims.
+// Given one or more logical views on Tensors, `logicalToPhysical`
+// permutes all of the batch dims to the front of the tensor, aligns
+// and expands the batch dims to match each other (according to their `level`),
+// and returns a VmapPhysicalView on the tensor(s).
+struct TORCH_API MultiBatchVmapTransform {
+  static VmapPhysicalView logicalToPhysical(const Tensor& logical_tensor);
+  static VmapPhysicalViewVec logicalToPhysical(ITensorListRef logical_tensors);
+};
+
+// VmapTransform for operators that broadcast all inputs.
+// Given some logical views on Tensors, `logicalToPhysical`:
+// - permutes all of the batch dims to the front of the tensors
+// - aligns all the batch dims to the collective levels of all of the tensors.
+//   If a tensor does not have a batch dim for a vmap level, then it receives
+//   a size-one dimension for said level.
+// - aligns the non-batch dims to have the same dimensionality, adding extra
+//   size-1 dimensions in between the batch dimensions and the non-batch
+//   dimensions so that the batch dimensions are lined up from the right.
+//
+// For example: given inputs of size (B, 2) and (B, 3, 2) where B is the batch
+// dimension, BroadcastingVmapTransform returns VmapPhysicalViews that wrap
+// tensors of size (B, 1, 2) and (B, 3, 2).
+//
+// Given inputs of size (B, 2) and (2,), BroadcastingVmapTransform returns
+// VmapPhysicalViews wrapping tensors of size (B, 2) and (1, 2). We don't
+// actually *need* to return a tensor of size (1, 2) for the second tensor
+// because the broadcasting operation takes care of that for us, but we do
+// it anyways to keep things simple.
+struct TORCH_API BroadcastingVmapTransform {
+  static VmapPhysicalViewVec logicalToPhysical(TensorList logical_tensors);
+};
+
+// Forward declared, if you're reading this file head to toe, don't worry about
+// it yet.
+struct VmapPhysicalToLogicalMap;
+
+// NOTE: [What is a VmapPhysicalView?]
+// VmapPhysicalView represents a physical view on a Tensor.
+//
+// One can use it to further convert logical dimension indices, logical shapes,
+// and more to their physical variants, or convert a new (physical) tensor into
+// a logical BatchedTensor. (TODO(rzou): some of these are not yet implemented).
+//
+// VmapPhysicalView stores a physical tensor with all of its batch dimensions at
+// the front and some levels that correspond to said batch dimensions.
+//
+// The levels bitset specifies which vmap levels correspond to the batch
+// dimensions at the front of the tensor. In particular, the number of set bits
+// corresponds to the number of batch dimensions on `tensor` and the rightmost
+// bit of `levels` specifies the maximum number of nested vmaps we are in at
+// this point in time.
+// For example, given:
+//   physical_view = VmapPhysicalView(tensor=ones(2, 3, 4, 5, 6), levels={1, 3})
+//
+// Rightmost bit of `levels` is 3 indicating the number of nested vmaps less
+// than or equal to 3.
+//   bitset: 010100
+//              ^
+//              |
+//   levels: 012345
+struct TORCH_API VmapPhysicalView {
+  VmapPhysicalView(Tensor&& tensor, std::bitset<kVmapNumLevels> levels)
+      : levels_(levels), tensor_(std::move(tensor)) {
+    TORCH_INTERNAL_ASSERT(!isBatchedTensor(tensor_));
+  }
+
+  Tensor& tensor() {
+    return tensor_;
+  }
+  const Tensor& tensor() const {
+    return tensor_;
+  }
+
+  // Maps logical dim indices to physical dim indices. Also does dim wrapping.
+  //
+  // For example, given:
+  //   physical_view = VmapPhysicalView(tensor=ones(2, 3, 4, 5), levels={1, 3})
+  //
+  // Then physical_view.getPhysicalDims({0, 1}) returns {2, 3}.
+  // This is because the size of levels tells us that the first two dimensions
+  // of `tensor_` are batch dimensions, so a logical dim of `n` is actually
+  // a physical dim of `n + 2`.
+  VmapDimVector getPhysicalDims(OptionalIntArrayRef logical_dims) const;
+  int64_t getPhysicalDim(int64_t logical_dim) const;
+
+  // Returns a VmapPhysicalToLogicalMap object. This can be used for
+  // mapping a physical tensor to a new logical tensor (BatchedTensor)
+  VmapPhysicalToLogicalMap getPhysicalToLogicalMap() const;
+
+  // Maps a logical shape to a physical shape by pre-pending the batch
+  // sizes to the logical shape.
+  VmapDimVector getPhysicalShape(IntArrayRef logical_shape) const;
+
+  int64_t numBatchDims() const;
+
+ private:
+  int64_t numLogicalDims() const;
+
+  std::bitset<kVmapNumLevels> levels_;
+  Tensor tensor_;
+};
+
+// Convenience struct used for mapping a physical tensor (a non-BatchedTensor)
+// to a logical one (BatchedTensor). It holds some levels that are used to do
+// the mapping and assumes that the batch dimensions in the physical tensor all
+// occur at the front of the tensor.
+struct TORCH_API VmapPhysicalToLogicalMap {
+  VmapPhysicalToLogicalMap(std::bitset<kVmapNumLevels> levels)
+      : levels_(levels) {}
+
+  // Maps a physical tensor to a new logical tensor (BatchedTensor).
+  // Assumes that all of the "batch dimensions" are at the front
+  // of the physical tensor. For example, given:
+  // - x = rank-4 Tensor with size 2, 3, 5, 7
+  // - levels = (2, 4)
+  // Returns:
+  // - BatchedTensor(x, bdims=[(dim=0,lvl=2), (dim=1, lvl=4)])
+  Tensor apply(const Tensor& physical_tensor) const;
+
+  // Given a vector of physical tensors,
+  // 1. maps each tensor to a new logical tensor. Assumes that all of the
+  //    "batch dimensions" are at the front of the physical tensors.
+  // 2. stores the new logical tensors back into the passed-in vector. This is
+  //    to avoid additional dynamic allocations.
+  void applyInplace(std::vector<Tensor>& physical_tensors) const;
+
+  std::bitset<kVmapNumLevels> levels_;
+};
+
+} // namespace at
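Editor's note: a sketch of how a batching rule might use the transforms above to translate a logical dim into a physical one; this is an internal API and the function below is illustrative, not an actual PyTorch batching rule.

#include <ATen/ATen.h>
#include <ATen/LegacyBatchedTensorImpl.h>
#include <ATen/LegacyVmapTransforms.h>

at::Tensor sum_batching_rule_sketch(const at::Tensor& logical, int64_t logical_dim) {
  // Move all batch dims to the front and view the underlying physical tensor.
  auto physical_view = at::MultiBatchVmapTransform::logicalToPhysical(logical);
  // Translate the user-facing (logical) dim into a physical dim index.
  int64_t physical_dim = physical_view.getPhysicalDim(logical_dim);
  at::Tensor physical_result = physical_view.tensor().sum(physical_dim);
  // Re-wrap the physical result as a logical BatchedTensor.
  return physical_view.getPhysicalToLogicalMap().apply(physical_result);
}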
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/LinalgBackend.h b/MLPY/Lib/site-packages/torch/include/ATen/LinalgBackend.h
new file mode 100644
index 0000000000000000000000000000000000000000..3b084d189d7fb61cc0f67ccc0be15614be7e490c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/LinalgBackend.h
@@ -0,0 +1,31 @@
+#pragma once
+
+#include 
+
+#include 
+#include 
+
+namespace at {
+
+enum class LinalgBackend : int8_t { Default, Cusolver, Magma };
+
+inline std::string LinalgBackendToString(at::LinalgBackend backend) {
+  switch (backend) {
+    case LinalgBackend::Default:
+      return "at::LinalgBackend::Default";
+    case LinalgBackend::Cusolver:
+      return "at::LinalgBackend::Cusolver";
+    case LinalgBackend::Magma:
+      return "at::LinalgBackend::Magma";
+    default:
+      TORCH_CHECK(false, "Unknown linalg backend");
+  }
+}
+
+inline std::ostream& operator<<(
+    std::ostream& stream,
+    at::LinalgBackend backend) {
+  return stream << LinalgBackendToString(backend);
+}
+
+} // namespace at
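Editor's note: trivial usage of the stream operator defined above; the wrapper name is illustrative.

#include <ATen/LinalgBackend.h>
#include <iostream>

void print_linalg_backend(at::LinalgBackend backend) {
  std::cout << backend << '\n';  // e.g. "at::LinalgBackend::Magma"
}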
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/MapAllocator.h b/MLPY/Lib/site-packages/torch/include/ATen/MapAllocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..17af2f8947abb4501412cc50057d4889d2a0a237
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/MapAllocator.h
@@ -0,0 +1,139 @@
+#pragma once
+
+#include 
+#include 
+
+namespace at {
+
+enum MappedAllocatorModes {
+  ALLOCATOR_MAPPED_SHARED = 1,
+  ALLOCATOR_MAPPED_SHAREDMEM = 2,
+  ALLOCATOR_MAPPED_EXCLUSIVE = 4,
+  ALLOCATOR_MAPPED_NOCREATE = 8,
+  ALLOCATOR_MAPPED_KEEPFD = 16,
+  ALLOCATOR_MAPPED_FROMFD = 32,
+  ALLOCATOR_MAPPED_UNLINK = 64
+};
+
+// Sentinel value/type to help distinguish the file descriptor constructor from
+// the non-file descriptor constructor
+enum WithFd { WITH_FD };
+
+TORCH_API std::string NewProcessWideShmHandle();
+
+class TORCH_API MapAllocator {
+ public:
+  MapAllocator(c10::string_view filename, int flags, size_t size);
+  MapAllocator(
+      WithFd,
+      c10::string_view filename,
+      int fd,
+      int flags,
+      size_t size);
+  MapAllocator(const MapAllocator&) = delete;
+  MapAllocator& operator=(const MapAllocator&) = delete;
+  MapAllocator(MapAllocator&&) = delete;
+  MapAllocator& operator=(MapAllocator&&) = delete;
+
+  const char* filename() const {
+    return filename_.c_str();
+  }
+  int fd() const {
+#ifdef _WIN32
+    TORCH_CHECK(false, "MapAllocator::fd() is unsupported on Windows");
+#else
+    return fd_;
+#endif
+  }
+  ptrdiff_t size() const {
+    return size_;
+  }
+  // Return a pointer to the actual data for this allocator
+  // (in the case of the refcounted allocator, this is offset
+  // from the base pointer.)
+  virtual void* data() const {
+    return base_ptr_;
+  }
+
+  static MapAllocator* fromDataPtr(const at::DataPtr&);
+  static at::DataPtr makeDataPtr(
+      c10::string_view filename,
+      int flags,
+      size_t size,
+      size_t* actual_size_out);
+  static at::DataPtr makeDataPtr(
+      WithFd,
+      const char* filename,
+      int fd,
+      int flags,
+      size_t size,
+      size_t* actual_size_out);
+
+  // Closes the data.  Helps us avoid destructor shenanigans
+  virtual void close();
+
+  // This is very dangerous.  You have to redefine this destructor for each
+  // subclass
+  virtual ~MapAllocator();
+
+ protected:
+  bool closed_ = false;
+  std::string filename_;
+  int flags_ = 0;
+  ptrdiff_t size_; /* mapped size */
+#ifdef _WIN32
+  void* handle_;
+  void* event_;
+  std::string eventname_;
+#else
+  int fd_ = -1;
+#endif
+  void* base_ptr_ = nullptr;
+};
+
+// Base-from-member idiom
+struct TORCH_API RefcountedMapAllocatorArgCheck {
+  RefcountedMapAllocatorArgCheck(int flags);
+};
+
+class TORCH_API RefcountedMapAllocator : private RefcountedMapAllocatorArgCheck,
+                                         public MapAllocator {
+ public:
+  RefcountedMapAllocator(const char* filename, int flags, size_t size);
+  RefcountedMapAllocator(
+      WithFd,
+      const char* filename,
+      int fd,
+      int flags,
+      size_t size);
+
+  static RefcountedMapAllocator* fromDataPtr(const at::DataPtr&);
+  static at::DataPtr makeDataPtr(
+      const char* filename,
+      int flags,
+      size_t size,
+      size_t* actual_size_out);
+  static at::DataPtr makeDataPtr(
+      WithFd,
+      const char* filename,
+      int fd,
+      int flags,
+      size_t size,
+      size_t* actual_size_out);
+
+  void* data() const override;
+
+  void incref();
+  int decref();
+  void close() override;
+
+  ~RefcountedMapAllocator() override {
+    RefcountedMapAllocator::close();
+  }
+
+ protected:
+  void checkFlags();
+  void initializeAlloc();
+};
+
+} // namespace at
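Editor's note: a sketch of creating a memory-mapped DataPtr with the allocator above; the file path is a placeholder and the flag is just one of the MappedAllocatorModes bits.

#include <ATen/MapAllocator.h>
#include <cstddef>

at::DataPtr map_shared_buffer(size_t nbytes) {
  size_t actual_size = 0;
  // "/tmp/example_mapped_buffer" is a placeholder path for illustration only.
  return at::MapAllocator::makeDataPtr(
      /*filename=*/"/tmp/example_mapped_buffer",
      /*flags=*/at::ALLOCATOR_MAPPED_SHARED,
      /*size=*/nbytes,
      &actual_size);
}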
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/MatrixRef.h b/MLPY/Lib/site-packages/torch/include/ATen/MatrixRef.h
new file mode 100644
index 0000000000000000000000000000000000000000..8e803b09f9dc5f592a301d94aa858021371ddc0d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/MatrixRef.h
@@ -0,0 +1,109 @@
+#pragma once
+#include 
+#include 
+
+#include 
+
+namespace at {
+/// MatrixRef - Like an ArrayRef, but with an extra recorded strides so that
+/// we can easily view it as a multidimensional array.
+///
+/// Like ArrayRef, this class does not own the underlying data, it is expected
+/// to be used in situations where the data resides in some other buffer.
+///
+/// This is intended to be trivially copyable, so it should be passed by
+/// value.
+///
+/// For now, 2D only (so the copies are actually cheap, without having
+/// to write a SmallVector class) and contiguous only (so we can
+/// return non-strided ArrayRef on index).
+///
+/// P.S. dimension 0 indexes rows, dimension 1 indexes columns
+template <typename T>
+class MatrixRef {
+ public:
+  typedef size_t size_type;
+
+ private:
+  /// Underlying ArrayRef
+  ArrayRef<T> arr;
+
+  /// Stride of dim 0 (outer dimension)
+  size_type stride0;
+
+  // Stride of dim 1 is assumed to be 1
+
+ public:
+  /// Construct an empty Matrixref.
+  /*implicit*/ MatrixRef() : arr(nullptr), stride0(0) {}
+
+  /// Construct an MatrixRef from an ArrayRef and outer stride.
+  /*implicit*/ MatrixRef(ArrayRef<T> arr, size_type stride0)
+      : arr(arr), stride0(stride0) {
+    TORCH_CHECK(
+        arr.size() % stride0 == 0,
+        "MatrixRef: ArrayRef size ",
+        arr.size(),
+        " not divisible by stride ",
+        stride0)
+  }
+
+  /// @}
+  /// @name Simple Operations
+  /// @{
+
+  /// empty - Check if the matrix is empty.
+  bool empty() const {
+    return arr.empty();
+  }
+
+  const T* data() const {
+    return arr.data();
+  }
+
+  /// size - Get size a dimension
+  size_t size(size_t dim) const {
+    if (dim == 0) {
+      return arr.size() / stride0;
+    } else if (dim == 1) {
+      return stride0;
+    } else {
+      TORCH_CHECK(
+          0, "MatrixRef: out of bounds dimension ", dim, "; expected 0 or 1");
+    }
+  }
+
+  size_t numel() const {
+    return arr.size();
+  }
+
+  /// equals - Check for element-wise equality.
+  bool equals(MatrixRef RHS) const {
+    return stride0 == RHS.stride0 && arr.equals(RHS.arr);
+  }
+
+  /// @}
+  /// @name Operator Overloads
+  /// @{
+  ArrayRef<T> operator[](size_t Index) const {
+    return arr.slice(Index * stride0, stride0);
+  }
+
+  /// Disallow accidental assignment from a temporary.
+  ///
+  /// The declaration here is extra complicated so that "arrayRef = {}"
+  /// continues to select the move assignment operator.
+  template <typename U>
+  std::enable_if_t<std::is_same_v<U, T>, MatrixRef>& operator=(
+      U&& Temporary) = delete;
+
+  /// Disallow accidental assignment from a temporary.
+  ///
+  /// The declaration here is extra complicated so that "arrayRef = {}"
+  /// continues to select the move assignment operator.
+  template <typename U>
+  std::enable_if_t<std::is_same_v<U, T>, MatrixRef>& operator=(
+      std::initializer_list<U>) = delete;
+};
+
+} // end namespace at
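Editor's note: MatrixRef viewing a flat contiguous buffer as a 2-D matrix, per the class comment above; names and values are illustrative.

#include <ATen/MatrixRef.h>
#include <cassert>
#include <vector>

void matrix_ref_example() {
  std::vector<float> buffer = {1, 2, 3, 4, 5, 6};  // 2 rows x 3 columns
  at::MatrixRef<float> m(c10::ArrayRef<float>(buffer), /*stride0=*/3);
  assert(m.size(0) == 2 && m.size(1) == 3);
  assert(m[1][2] == 6.0f);  // row 1, column 2
}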
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/MemoryOverlap.h b/MLPY/Lib/site-packages/torch/include/ATen/MemoryOverlap.h
new file mode 100644
index 0000000000000000000000000000000000000000..f8427eef13cdd1741262f4dcdb84900389157e22
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/MemoryOverlap.h
@@ -0,0 +1,42 @@
+#pragma once
+
+#include 
+
+namespace c10 {
+struct TensorImpl;
+}
+
+namespace at {
+class TensorBase;
+
+// MemOverlap: Whether or not there is memory overlap
+//
+// No: Absolutely no memory overlap
+// Yes: Absolutely yes memory overlap
+// TooHard: There might be memory overlap, but it was too expensive to compute.
+//
+// NB: Please update the python test for these if you renumber them.
+enum class MemOverlap { No, Yes, TooHard };
+
+enum class MemOverlapStatus { Full, Partial, No, TooHard };
+
+TORCH_API MemOverlap has_internal_overlap(const TensorBase& t);
+TORCH_API MemOverlap has_internal_overlap(c10::TensorImpl* t);
+
+TORCH_API void assert_no_internal_overlap(const TensorBase& t);
+TORCH_API void assert_no_internal_overlap(c10::TensorImpl* t);
+
+TORCH_API MemOverlapStatus
+get_overlap_status(const TensorBase& a, const TensorBase& b);
+TORCH_API MemOverlapStatus
+get_overlap_status(const c10::TensorImpl* a, const c10::TensorImpl* b);
+
+TORCH_API void assert_no_partial_overlap(
+    const TensorBase& a,
+    const TensorBase& b);
+void assert_no_partial_overlap(c10::TensorImpl* a, c10::TensorImpl* b);
+
+TORCH_API void assert_no_overlap(const TensorBase& a, const TensorBase& b);
+TORCH_API void assert_no_overlap(c10::TensorImpl* a, c10::TensorImpl* b);
+
+} // namespace at
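Editor's note: the overlap assertions above as an out-of-place kernel might call them before writing into `out`; the wrapper name is illustrative.

#include <ATen/ATen.h>
#include <ATen/MemoryOverlap.h>

void check_write_safety(const at::Tensor& out, const at::Tensor& input) {
  // Throws if `out` aliases itself (e.g. an expanded view)...
  at::assert_no_internal_overlap(out);
  // ...or if `out` and `input` partially share the same storage region.
  at::assert_no_partial_overlap(out, input);
}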
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/MetaFunctions.h b/MLPY/Lib/site-packages/torch/include/ATen/MetaFunctions.h
new file mode 100644
index 0000000000000000000000000000000000000000..2bd95d4fcf9785268519848d69f378f5e4bbdacb
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/MetaFunctions.h
@@ -0,0 +1,29 @@
+#include 
+
+// TODO Undo all logic introduced for Note [Avoiding Include Cycles In Static Dispatch]
+// Code introduced to avoid cyclic dependency in static dispatch is no longer
+// needed as static dispatch logic is moved from TensorBody.h, which caused cycles in the first place,
+// to Operators.cpp for supporting multiple backends with multiple kernels.
+//
+// Note [Avoiding Include Cycles In Static Dispatch]
+// In order to avoid #include cycles in the static dispatch build, we've carefully split out
+// the static function definition files into {DispatchKey}Functions.h and {DispatchKey}Functions_inl.h.
+//
+// Without this split, the include cycle looks like TensorBody.h -> CPUFunctions.h -> TensorBody.h.
+// - TensorBody.h #includes CPUFunctions.h in the static dispatch build, because the tensor methods
+//   all need to call into the fastpath C++ API defined in CPUFunctions.h. The methods are also all
+//   directly inlined into TensorBody.h.
+// - CPUFunctions.h #includes TensorBody.h because it contains function declarations for the entire C++ API,
+//   which include functions that have defaultable optional arguments.
+//   That requires knowing the full Tensor class definition.
+//
+// We break the cycle by doing the following:
+// - Split out CPUFunction.h into two files: CPUFunctions.h and CPUFunctions_inl.h
+// - CPUFunction.h is a dummy file that just includes the Tensor class and includes CPUFunctions_inl.h,
+// - CPUFunctions_inl.h includes everything else
+// - (only in the static dispatch build) TensorBody.h makes sure to finish defining the Tensor class,
+//   and then it includes CPUFunctions_inl.h.
+// - All other files that want the cpu fastpath functions can include CPUFunctions.h directly.
+// - This also means that in the static dispatch build, CPUFunctions.h only needs to
+//   #include TensorBody.h, and it will automatically bring in CPUFunctions_inl.h.
+#include 
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/MetaFunctions_inl.h b/MLPY/Lib/site-packages/torch/include/ATen/MetaFunctions_inl.h
new file mode 100644
index 0000000000000000000000000000000000000000..614fdf1e725bfe64454b8210e88a28c1e96af529
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/MetaFunctions_inl.h
@@ -0,0 +1,324 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunctions_inl.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include 
+#include 
+#include 
+
+#if defined(AT_PER_OPERATOR_HEADERS) && defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS)
+#error This change adds a dependency on all pytorch operators, meaning the     \
+  file will need to be re-compiled every time an operator is changed or added. \
+  Consider including a specific operator from                                  \
+  .                   \
+  See NOTE [TORCH_ASSERT_ONLY_METHOD_OPERATORS].
+#endif
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+
+
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/MethodOperators.h b/MLPY/Lib/site-packages/torch/include/ATen/MethodOperators.h
new file mode 100644
index 0000000000000000000000000000000000000000..c9848f67d4b24fcb5d69f0396de5930271b4ac64
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/MethodOperators.h
@@ -0,0 +1,443 @@
+#pragma once
+
+// @generated by torchgen/gen.py from MethodOperators.h
+
+#ifdef TORCH_ASSERT_NO_OPERATORS
+#error This change adds a dependency on native_functions.yaml,             \
+  meaning the file will need to be re-compiled every time an operator      \
+  is changed or added. Consider if your change would be better placed in   \
+  another file, or if a more specific header might achieve the same goal.  \
+  See NOTE: [Tensor vs. TensorBase]
+#endif
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+namespace _ops {
+
+} // namespace _ops
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/NamedTensor.h b/MLPY/Lib/site-packages/torch/include/ATen/NamedTensor.h
new file mode 100644
index 0000000000000000000000000000000000000000..b18f8d95b195a19fc5c78cc941b7ce6de28f4534
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/NamedTensor.h
@@ -0,0 +1 @@
+#include 
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/NamedTensorUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/NamedTensorUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..930957b44f168dfaa5a20311756b563b88cb2870
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/NamedTensorUtils.h
@@ -0,0 +1,215 @@
+#pragma once
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+
+namespace at {
+
+using NameVector = SmallVector<Dimname, kDimVectorStaticSize>;
+
+inline bool has_names(const ITensorListRef& tensors) {
+  return std::any_of(tensors.begin(), tensors.end(), [](const Tensor& t) {
+    return t.has_names();
+  });
+}
+
+// Converts dim to a positional index. Errors if `dim` cannot be used to
+// refer to any dimension of tensor.
+TORCH_API int64_t dimname_to_position(const Tensor& tensor, Dimname dim);
+TORCH_API std::vector<int64_t> dimnames_to_positions(
+    const Tensor& tensor,
+    DimnameList dims);
+
+// Unifies two DimnameList to produce a third. This is useful for implementing
+// the named inference rule for binary broadcasting operations like add.
+//
+// There are three main constraints:
+// 1) Check matching: Names must match positionally from the right.
+// 2) Check misaligned: If a name `n` is in `names`, then it must appear at
+//    the same index from the right in other.
+// 3) The output names are obtained by unifying the names individually from the
+// right.
+TORCH_API std::vector<Dimname> unify_from_right(
+    DimnameList names,
+    DimnameList other,
+    const char* action = "broadcast");
+
+[[noreturn]] inline void reportNYIDimnameOverload(const char* op_name) {
+  TORCH_CHECK(
+      false,
+      op_name,
+      ": You passed a dimname (string) to this op in place of a dimension "
+      "index but it does not yet support this behavior. Please pass a dimension "
+      "index to work around this.");
+}
+
+// [NOTE] Writing name inference rules
+//
+// Operators that support named tensors are either composed of operations that
+// support named tensors or implement some name inference rule. An op that
+// implements its own name inference rule generally looks like the following:
+//
+// Tensor op(...) {
+//   perform_shape_checks(...);
+//   # (1)
+//   auto maybe_outnames = compute_outnames(...);
+//   auto result = [&]() {
+//     NoNamesGuard guard;
+//     return op_impl(...);
+//   }();
+//   # (2)
+//   propagate_names_if_nonempty(result, maybe_outnames);
+//
+// Each op has (1) a compute outnames step and (2) a propagate names step.
+//
+// compute_outnames is responsible for checking that input names match and
+// determining what the output names should be. It returns either:
+// - {} (if the inputs tensors are all unnamed)
+// - non-empty outnames.
+//
+// propagate_names_if_nonempty propagates the outnames if they exist to the
+// result tensors.
+//
+// The {} case is an optimization; if the user does not use named tensors they
+// pay no perf cost for it.
+
+namespace namedinference {
+
+const Tensor& propagate_names_if_present_and_nonempty(
+    const Tensor& result,
+    c10::optional<DimnameList> maybe_names,
+    bool validate_names = false);
+// Propagates `names` to `result` if `names` is not empty.
+// `names` can be empty; see [NOTE] Writing name inference rules
+// If `names` is not empty, `names.size()` should equal `result.dim()`.
+// When in doubt, use this overload instead of the others.
+TORCH_API const Tensor& propagate_names_if_nonempty(
+    const Tensor& result,
+    DimnameList maybe_names,
+    bool validate_names = false);
+
+// Propagates `names` to `result`. Only use this if we are certain that there
+// are names to propagate (that names is not empty).
+TORCH_API const Tensor& propagate_names(
+    const Tensor& result,
+    DimnameList names,
+    bool validate_names = false);
+
+// Propagates all names from src to result.
+TORCH_API void propagate_names(const Tensor& result, const Tensor& src);
+
+// Propagates all names except for those at the excluded_idxs.
+TORCH_API void propagate_names_except(
+    const Tensor& result,
+    const Tensor& src,
+    IntArrayRef excluded_idxs);
+
+// Used for reduction ops that have a `keepdim` arg.
+TORCH_API void propagate_names_for_reduction(
+    const Tensor& result,
+    const Tensor& src,
+    IntArrayRef excluded_idxs,
+    bool keepdim);
+
+TORCH_API void propagate_names_for_expand(
+    const Tensor& result,
+    const Tensor& self);
+
+TORCH_API std::vector<Dimname> compute_cat_outnames(
+    const MaterializedITensorListRef& tensors);
+
+TORCH_API std::vector<Dimname> compute_broadcast_outnames(
+    const Tensor& self,
+    const Tensor& other);
+
+TORCH_API std::vector<Dimname> broadcast_to_outnames(
+    const Tensor& tensor,
+    const Tensor& reference_tensor,
+    const char* op_name);
+
+TORCH_API std::vector<Dimname> compute_matmul_outnames(
+    const Tensor& self,
+    const Tensor& other);
+
+TORCH_API std::vector<Dimname> compute_cdist_outnames(
+    const Tensor& self,
+    const Tensor& other);
+
+TORCH_API std::vector<Dimname> compute_bmm_outnames(
+    const Tensor& result,
+    const Tensor& self,
+    const Tensor& other);
+
+TORCH_API std::vector<Dimname> compute_squeeze_outnames(const Tensor& tensor);
+TORCH_API std::vector<Dimname> compute_squeeze_outnames(
+    const Tensor& tensor,
+    std::bitset<dim_bitset_size> dims);
+
+std::vector<Dimname> compute_diagonal_outnames(
+    const Tensor& tensor,
+    int64_t dim1,
+    int64_t dim2);
+
+// TensorImpl* overloads for Legacy TH/THC code. Use these sparingly.
+
+TORCH_API TensorImpl* propagate_names_if_nonempty(
+    TensorImpl* result,
+    DimnameList maybe_names,
+    bool validate_names = false);
+
+TORCH_API TensorImpl* propagate_names(
+    TensorImpl* result,
+    DimnameList names,
+    bool validate_names = false);
+
+TORCH_API void propagate_names(TensorImpl* result, /*const */ TensorImpl* src);
+
+TORCH_API inline void propagate_names(
+    const TensorBase& result,
+    DimnameList names,
+    bool validate_names = false) {
+  propagate_names(result.unsafeGetTensorImpl(), names, validate_names);
+}
+
+TORCH_API inline void propagate_names_if_nonempty(
+    const TensorBase& result,
+    DimnameList names,
+    bool validate_names = false) {
+  propagate_names_if_nonempty(
+      result.unsafeGetTensorImpl(), names, validate_names);
+}
+
+TORCH_API inline void propagate_names(
+    const TensorBase& result,
+    const TensorBase& src) {
+  propagate_names(result.unsafeGetTensorImpl(), src.unsafeGetTensorImpl());
+}
+
+// result = m1 @ m2 + bias
+TORCH_API std::vector<Dimname> propagate_names_for_addmm(
+    const Tensor& m1,
+    const Tensor& m2,
+    const Tensor& bias);
+
+TORCH_API std::vector<Dimname> propagate_names_for_addmv(
+    const Tensor& mat,
+    const Tensor& vec,
+    const Tensor& bias);
+
+TORCH_API void check_names_for_dot(TensorImpl* vec1, TensorImpl* vec2);
+
+TORCH_API std::vector<Dimname> compute_baddbmm_outnames(
+    const Tensor& result,
+    const Tensor& self,
+    const Tensor& other,
+    const Tensor& bias);
+
+TORCH_API bool are_names_equal(TensorImpl* self, TensorImpl* other);
+
+} // namespace namedinference
+
+} // namespace at
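
The name-inference helpers above are C++-internal, but their observable behaviour matches the public named-tensor API. A minimal Python sketch (not part of the diff) illustrating the two cases the header comment describes -- outnames propagated when inputs are named, and the free {} fast path when they are not:

    import torch

    x = torch.randn(3, 4, names=('N', 'C'))
    y = torch.randn(3, 4, names=('N', 'C'))
    print((x + y).names)      # ('N', 'C'): outnames propagated to the result
    print(x.sum('C').names)   # ('N',): reductions drop the reduced dim's name

    u = torch.randn(3, 4)     # unnamed inputs take the {} fast path
    print((u + u).names)      # (None, None): no name checking or propagation
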
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/NativeFunctions.h b/MLPY/Lib/site-packages/torch/include/ATen/NativeFunctions.h
new file mode 100644
index 0000000000000000000000000000000000000000..ee53762fbe68c3754aa9fe321446d24cc38aad84
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/NativeFunctions.h
@@ -0,0 +1,1317 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunctions.h
+
+#ifdef TORCH_ASSERT_NO_OPERATORS
+#error This change adds a dependency on native_functions.yaml,            \
+  meaning the file will need to be re-compiled every time an operator     \
+  is changed or added. Consider if your change would be better placed in  \
+  another file, or if a more specific header might achieve the same goal. \
+  See NOTE: [Tensor vs. TensorBase]
+#endif
+
+#if defined(AT_PER_OPERATOR_HEADERS) && defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS)
+#error This change adds a dependency on all pytorch operators, meaning the      \
+  file will need to be re-compiled every time an operator is changed or added.  \
+  Consider including a specific operator from  \
+  and see NOTE [TORCH_ASSERT_ONLY_METHOD_OPERATORS].
+#endif
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/NativeMetaFunctions.h b/MLPY/Lib/site-packages/torch/include/ATen/NativeMetaFunctions.h
new file mode 100644
index 0000000000000000000000000000000000000000..58fb5c2eee20f90c01cc6ea4afedf6b14b686dc9
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/NativeMetaFunctions.h
@@ -0,0 +1,1303 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeMetaFunctions.h
+
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+
+namespace meta {
+
+
+
+} // namespace meta
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/NestedTensorImpl.h b/MLPY/Lib/site-packages/torch/include/ATen/NestedTensorImpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..af9c50182715f62e4e3991c403a62855e07fab5a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/NestedTensorImpl.h
@@ -0,0 +1,283 @@
+#pragma once
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace at::native {
+struct NestedTensorImpl;
+inline bool nested_tensor_impl_is_contiguous(const NestedTensorImpl* nt);
+int64_t get_numel_from_nested_size_tensor(const at::Tensor& tensor);
+
+struct TORCH_API NestedTensorImpl : public c10::TensorImpl {
+  explicit NestedTensorImpl(
+      Storage storage,
+      c10::DispatchKeySet key_set,
+      const caffe2::TypeMeta data_type,
+      at::Tensor nested_sizes,
+      at::Tensor nested_strides,
+      at::Tensor storage_offsets);
+
+  explicit NestedTensorImpl(
+      const at::Tensor& buffer,
+      at::Tensor nested_sizes,
+      at::Tensor nested_strides,
+      at::Tensor storage_offsets);
+  // assume contiguous, `nested_strides` and `offsets`
+  // can be inferred from `nested_sizes`
+  explicit NestedTensorImpl(
+      const at::Tensor& buffer,
+      const at::Tensor& nested_sizes);
+
+  // This constructor is used when creating view tensors from nested tensors
+  explicit NestedTensorImpl(
+      c10::TensorImpl::ImplType impl_type,
+      const at::Tensor& base_tensor,
+      at::Tensor nested_sizes,
+      at::Tensor nested_strides,
+      at::Tensor storage_offsets);
+
+  // TODO: don't expose private implementation details like this; in
+  // particular, resizing this tensor will mess up our dim() and
+  // callers cannot fix it.
+  const Tensor& get_nested_sizes() const {
+    return nested_sizes_;
+  }
+  // TODO: don't expose private implementation details like this
+  const Tensor& get_nested_strides() const {
+    return nested_strides_;
+  }
+  const Tensor& get_storage_offsets() const {
+    return storage_offsets_;
+  }
+  // Returns nullopt if the ith dimension is irregular. The ith dimension
+  // of a NestedTensor is regular if the unbound tensors match in
+  // size at the (i-1)th dimension.
+  c10::optional<int64_t> opt_size(int64_t d) const;
+
+  int64_t size(int64_t d) const {
+    c10::optional<int64_t> optional_size = this->opt_size(d);
+    TORCH_CHECK(
+        optional_size.has_value(),
+        "Given dimension ",
+        d,
+        " is irregular and does not have a size.");
+    return *optional_size;
+  }
+  /**
+   * Return a view of the nested tensor as a 1 dimensional contiguous tensor.
+   *
+   * The buffer tensor created by this function shares the same storage_impl as
+   * the original nested tensor, and therefore can be seen as a view.
+   *
+   * @return A newly constructed view tensor
+   */
+  at::Tensor get_buffer() const {
+    TORCH_CHECK(
+        nested_tensor_impl_is_contiguous(this),
+        "NestedTensor must be contiguous to get buffer.");
+    return get_unsafe_storage_as_tensor();
+  }
+  /**
+   * If possible use get_buffer() instead. This function returns the storage
+   * as a tensor directly, which is not safe to use in general. If using this
+   * function, the caller must account for nested_sizes,
+   * nested_strides and storage_offsets.
+   *
+   * @return A newly constructed view tensor
+   */
+  at::Tensor get_unsafe_storage_as_tensor() const {
+    auto buffer_key_set_ = generate_buffer_key_set();
+    const auto buffer_size = get_buffer_size();
+    auto buffer_tensor_impl = c10::make_intrusive<TensorImpl>(
+        c10::TensorImpl::VIEW, Storage(storage_), buffer_key_set_, data_type_);
+    buffer_tensor_impl->set_sizes_contiguous(
+        c10::makeArrayRef(static_cast<int64_t>(buffer_size)));
+    return Tensor(buffer_tensor_impl);
+  }
+
+  size_t get_buffer_size() const {
+    return storage_.nbytes() / data_type_.itemsize();
+  }
+
+ protected:
+  const char* tensorimpl_type_name() const override;
+
+  // TODO: numel_custom and is_contiguous_custom can be profitably overridden
+  // with real implementations
+  int64_t numel_custom() const override;
+  c10::SymInt sym_numel_custom() const override;
+  bool is_contiguous_custom(MemoryFormat) const override;
+  int64_t size_custom(int64_t d) const override {
+    return this->size(d);
+  }
+  c10::SymInt sym_size_custom(int64_t d) const override {
+    return c10::SymInt{this->size(d)};
+  }
+  IntArrayRef sizes_custom() const override;
+  c10::SymIntArrayRef sym_sizes_custom() const override;
+  IntArrayRef strides_custom() const override;
+  c10::SymIntArrayRef sym_strides_custom() const override;
+
+  // this one is real
+  int64_t dim_custom() const override;
+
+  c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach(
+      const c10::VariableVersion& version_counter,
+      bool allow_tensor_metadata_change) const override;
+
+  c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach(
+      c10::VariableVersion&& version_counter,
+      bool allow_tensor_metadata_change) const override;
+
+  void shallow_copy_from(const c10::intrusive_ptr<TensorImpl>& impl) override {
+    copy_tensor_metadata(
+        /*src_impl=*/impl.get(),
+        /*dest_impl=*/this,
+        /*version_counter=*/version_counter(),
+        /*allow_tensor_metadata_change=*/allow_tensor_metadata_change());
+  }
+
+ private:
+  // Must be called after any changes to our dim() to sync the state
+  // to TensorImpl.
+  void refresh_dim();
+
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
+  const at::Tensor nested_sizes_, nested_strides_;
+  // The starting positions of the underlying tensors in the contiguous buffer,
+  // i.e. the buffer memory offsets needed to reach the underlying tensors.
+  // The reason to keep this metadata is that, without a strong enough
+  // constraint, it cannot be derived from `nested_sizes_`
+  // and `nested_strides_`:
+  // 1. when the buffer has blanks, e.g. [tensor1, blank, tensor2]
+  //    (this can happen e.g. after slicing a nested tensor)
+  // 2. when multiple tensors share the same memory
+  // 3. when the nesting ordering is changed, e.g. [tensor1, tensor3, tensor2]
+  // Some strong enough constraints are:
+  // 1. every underlying tensor is contiguous in memory
+  //    && nesting in ascending order
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
+  const at::Tensor storage_offsets_;
+  // NOTE: -1 here means the size is missing
+  // Optional to allow it to be computed lazily from nested.
+  // TODO: maybe we can remove this metadata since
+  //       we can compute it from `nested_sizes_`
+  mutable c10::optional<std::vector<int64_t>> opt_sizes_;
+
+  template <typename VariableVersion>
+  c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach_core(
+      VariableVersion&& version_counter,
+      bool allow_tensor_metadata_change) const;
+
+  /**
+   * Generates a non-nested key_set from a nested tensor.
+   *
+   * For many nested tensor kernel implementations, a buffer tensor is
+   * generated and redispatched to a non-nested kernel; this function
+   * generates the key set used by that buffer tensor.
+   *
+   * @return Appropriate key set for non-nested tensor
+   */
+  inline c10::DispatchKeySet generate_buffer_key_set() const {
+    auto buffer_key_set = this->key_set();
+    const bool Autograd = buffer_key_set.has_any(c10::autograd_dispatch_keyset);
+    // Remove nested tensor specific keys
+    buffer_key_set = buffer_key_set -
+        c10::DispatchKeySet{
+            c10::DispatchKey::NestedTensor,
+            c10::DispatchKey::AutogradNestedTensor};
+
+    // Add dense tensor specific keys
+    buffer_key_set =
+        buffer_key_set | c10::DispatchKeySet{c10::DispatchKey::Dense};
+    buffer_key_set = Autograd
+        ? c10::DispatchKeySet{c10::DispatchKey::Autograd} | buffer_key_set
+        : buffer_key_set;
+
+    return buffer_key_set;
+  }
+};
+
+inline NestedTensorImpl* get_nested_tensor_impl_or_null(
+    const at::Tensor& tensor) {
+  if (tensor.is_nested()) {
+    return static_cast<NestedTensorImpl*>(tensor.unsafeGetTensorImpl());
+  }
+  return nullptr;
+}
+
+inline NestedTensorImpl* get_nested_tensor_impl(const at::Tensor& tensor) {
+  TORCH_CHECK(
+      tensor.is_nested(), "get_nested_tensor_impl requires a NestedTensor.");
+  return static_cast<NestedTensorImpl*>(tensor.unsafeGetTensorImpl());
+}
+
+inline bool nested_tensor_impl_is_contiguous(const NestedTensorImpl* nt) {
+  int64_t ntensors = nt->size(0);
+  if (ntensors == 0) {
+    return true;
+  }
+  const Tensor &sizemat = nt->get_nested_sizes(),
+               &stridemat = nt->get_nested_strides();
+  int64_t* offsets_ptr = nt->get_storage_offsets().data_ptr<int64_t>();
+  int64_t orig_dim = sizemat.size(1);
+  // nesting scalars
+  if (orig_dim == 0) {
+    // each scalar must be contiguous
+    // if there is blank memory between underlying scalars
+    for (int64_t i = 0; i < ntensors; i++) {
+      if (offsets_ptr[i] != i) {
+        return false;
+      }
+    }
+  }
+  // nesting tensors
+  else {
+    // if any underlying tensor is non-contiguous
+    const int64_t *sizemat_ptr = sizemat.data_ptr<int64_t>(),
+                  *stridemat_ptr = stridemat.data_ptr<int64_t>();
+    for (int64_t i = 0; i < ntensors; i++) {
+      if (stridemat_ptr[orig_dim - 1] != 1) {
+        return false;
+      }
+      int64_t product = sizemat_ptr[orig_dim - 1];
+      for (int64_t j = orig_dim - 2; j >= 0; j--) {
+        if (stridemat_ptr[j] != product) {
+          return false;
+        }
+        product *= sizemat_ptr[j];
+      }
+      sizemat_ptr += orig_dim;
+      stridemat_ptr += orig_dim;
+    }
+    // if there is blank memory between underlying tensors
+    if (offsets_ptr[0] != 0) {
+      return false;
+    }
+    sizemat_ptr = sizemat.data_ptr<int64_t>();
+    stridemat_ptr = stridemat.data_ptr<int64_t>();
+    for (int64_t i = 1; i < ntensors; i++) {
+      if (offsets_ptr[i] !=
+          offsets_ptr[i - 1] + *sizemat_ptr * *stridemat_ptr) {
+        return false;
+      }
+      sizemat_ptr += orig_dim;
+      stridemat_ptr += orig_dim;
+    }
+  }
+  // everything is fine
+  return true;
+}
+
+inline const at::Tensor& get_nested_sizes(const at::Tensor& tensor) {
+  return get_nested_tensor_impl(tensor)->get_nested_sizes();
+}
+
+} // namespace at::native
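
As a rough illustration of the opt_size()/size() contract documented above, the public nested-tensor API surfaces the same regular-vs-irregular distinction. A sketch (not part of the diff), assuming the torch.nested prototype API is available in this build:

    import torch

    nt = torch.nested.nested_tensor([torch.randn(2, 5), torch.randn(3, 5)])
    print(nt.size(0))   # 2: number of nested components
    print(nt.size(2))   # 5: regular dimension, every component agrees
    # nt.size(1) raises a RuntimeError: dimension 1 is irregular (2 vs. 3),
    # mirroring opt_size() returning nullopt and size() throwing above.
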
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/NumericUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/NumericUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..ccef4482d530839205e4ceec8b0d69c9e1565a15
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/NumericUtils.h
@@ -0,0 +1,203 @@
+#pragma once
+
+#ifdef __HIPCC__
+#include 
+#endif
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+namespace at {
+
+// std::isnan isn't performant to use on integral types; it will
+// (uselessly) convert to floating point and then do the test.
+// This function is.
+
+template <typename T, std::enable_if_t<std::is_integral_v<T>, int> = 0>
+inline C10_HOST_DEVICE bool _isnan(T /*val*/) {
+  return false;
+}
+
+template <typename T, std::enable_if_t<std::is_floating_point_v<T>, int> = 0>
+inline C10_HOST_DEVICE bool _isnan(T val) {
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  return ::isnan(val);
+#else
+  return std::isnan(val);
+#endif
+}
+
+template <typename T, std::enable_if_t<c10::is_complex<T>::value, int> = 0>
+inline C10_HOST_DEVICE bool _isnan(T val) {
+  return std::isnan(val.real()) || std::isnan(val.imag());
+}
+
+template <typename T, std::enable_if_t<std::is_same_v<T, at::Half>, int> = 0>
+inline C10_HOST_DEVICE bool _isnan(T val) {
+  return at::_isnan(static_cast<float>(val));
+}
+
+template <
+    typename T,
+    std::enable_if_t<std::is_same_v<T, at::BFloat16>, int> = 0>
+inline C10_HOST_DEVICE bool _isnan(at::BFloat16 val) {
+  return at::_isnan(static_cast<float>(val));
+}
+
+inline C10_HOST_DEVICE bool _isnan(at::BFloat16 val) {
+  return at::_isnan(static_cast<float>(val));
+}
+
+template <
+    typename T,
+    std::enable_if_t<std::is_same_v<T, at::Float8_e5m2>, int> = 0>
+inline C10_HOST_DEVICE bool _isnan(T val) {
+  return val.isnan();
+}
+
+template <
+    typename T,
+    std::enable_if_t<std::is_same_v<T, at::Float8_e4m3fn>, int> = 0>
+inline C10_HOST_DEVICE bool _isnan(T val) {
+  return val.isnan();
+}
+
+template <
+    typename T,
+    std::enable_if_t<std::is_same_v<T, at::Float8_e5m2fnuz>, int> = 0>
+inline C10_HOST_DEVICE bool _isnan(T val) {
+  return val.isnan();
+}
+
+template <
+    typename T,
+    std::enable_if_t<std::is_same_v<T, at::Float8_e4m3fnuz>, int> = 0>
+inline C10_HOST_DEVICE bool _isnan(T val) {
+  return val.isnan();
+}
+
+// std::isinf isn't performant to use on integral types; it will
+// (uselessly) convert to floating point and then do the test.
+// This function is.
+
+template <typename T, std::enable_if_t<std::is_integral_v<T>, int> = 0>
+inline C10_HOST_DEVICE bool _isinf(T /*val*/) {
+  return false;
+}
+
+template <typename T, std::enable_if_t<std::is_floating_point_v<T>, int> = 0>
+inline C10_HOST_DEVICE bool _isinf(T val) {
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  return ::isinf(val);
+#else
+  return std::isinf(val);
+#endif
+}
+
+inline C10_HOST_DEVICE bool _isinf(at::Half val) {
+  return at::_isinf(static_cast<float>(val));
+}
+
+inline C10_HOST_DEVICE bool _isinf(at::BFloat16 val) {
+  return at::_isinf(static_cast<float>(val));
+}
+
+inline C10_HOST_DEVICE bool _isinf(at::Float8_e5m2 val) {
+  return val.isinf();
+}
+
+inline C10_HOST_DEVICE bool _isinf(at::Float8_e4m3fn val) {
+  return false;
+}
+
+inline C10_HOST_DEVICE bool _isinf(at::Float8_e5m2fnuz val) {
+  return false;
+}
+
+inline C10_HOST_DEVICE bool _isinf(at::Float8_e4m3fnuz val) {
+  return false;
+}
+
+template <typename T>
+C10_HOST_DEVICE inline T exp(T x) {
+  static_assert(
+      !std::is_same_v<T, double>,
+      "this template must be used with float or less precise type");
+#if defined(__CUDA_ARCH__) || defined(__HIP_ARCH__)
+  // use __expf fast approximation for peak bandwidth
+  return __expf(x);
+#else
+  return ::exp(x);
+#endif
+}
+
+template <>
+C10_HOST_DEVICE inline double exp(double x) {
+  return ::exp(x);
+}
+
+template <typename T>
+C10_HOST_DEVICE inline T log(T x) {
+  static_assert(
+      !std::is_same_v<T, double>,
+      "this template must be used with float or less precise type");
+#if defined(__CUDA_ARCH__) || defined(__HIP_ARCH__)
+  // use __logf fast approximation for peak bandwidth
+  return __logf(x);
+#else
+  return ::log(x);
+#endif
+}
+
+template <>
+C10_HOST_DEVICE inline double log(double x) {
+  return ::log(x);
+}
+
+template <typename T>
+C10_HOST_DEVICE inline T log1p(T x) {
+  static_assert(
+      !std::is_same_v<T, double>,
+      "this template must be used with float or less precise type");
+#if defined(__CUDA_ARCH__) || defined(__HIP_ARCH__)
+  // use __logf fast approximation for peak bandwidth
+  // NOTE: There is no __log1pf so unfortunately we lose precision.
+  return __logf(1.0f + x);
+#else
+  return ::log1p(x);
+#endif
+}
+
+template <>
+C10_HOST_DEVICE inline double log1p(double x) {
+  return ::log1p(x);
+}
+
+template <typename T>
+C10_HOST_DEVICE inline T tan(T x) {
+  static_assert(
+      !std::is_same_v<T, double>,
+      "this template must be used with float or less precise type");
+#if defined(__CUDA_ARCH__) || defined(__HIP_ARCH__)
+  // use __tanf fast approximation for peak bandwidth
+  return __tanf(x);
+#else
+  return ::tan(x);
+#endif
+}
+
+template <>
+C10_HOST_DEVICE inline double tan(double x) {
+  return ::tan(x);
+}
+
+} // namespace at
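
The integral overloads above exist so callers can ask "is this NaN/Inf?" generically without paying for a float conversion. The same semantics are visible from Python (a small sketch, not part of the diff):

    import torch

    ints = torch.arange(5)                 # integral dtype can never hold NaN/Inf
    print(torch.isnan(ints).any().item())  # False
    floats = torch.tensor([0.0, float('inf'), float('nan')])
    print(torch.isinf(floats))             # tensor([False,  True, False])
    print(torch.isnan(floats))             # tensor([False, False,  True])
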
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/OpMathType.h b/MLPY/Lib/site-packages/torch/include/ATen/OpMathType.h
new file mode 100644
index 0000000000000000000000000000000000000000..64b1364a8bb72db5916c331835aa76bfd96e7995
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/OpMathType.h
@@ -0,0 +1,69 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+
+// For FP16 or BFloat16 inputs, ops should perform internal math in FP32.
+template <typename scalar_t>
+struct OpMathType {
+  using type = scalar_t;
+};
+template <>
+struct OpMathType<at::Half> {
+  using type = float;
+};
+template <>
+struct OpMathType<at::BFloat16> {
+  using type = float;
+};
+template <>
+struct OpMathType<at::Float8_e5m2> {
+  using type = float;
+};
+template <>
+struct OpMathType<at::Float8_e4m3fn> {
+  using type = float;
+};
+template <>
+struct OpMathType<at::Float8_e5m2fnuz> {
+  using type = float;
+};
+template <>
+struct OpMathType<at::Float8_e4m3fnuz> {
+  using type = float;
+};
+template <>
+struct OpMathType<c10::complex<Half>> {
+  using type = c10::complex<float>;
+};
+
+template <typename scalar_t>
+using opmath_type = typename OpMathType<scalar_t>::type;
+
+namespace {
+
+inline c10::ScalarType toOpMathType(const c10::ScalarType type) {
+  switch (type) {
+#define DEFINE_CASE(scalar_t, TypeNum) \
+  case ScalarType::TypeNum:            \
+    return CppTypeToScalarType<at::opmath_type<scalar_t>>::value;
+
+    AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CASE)
+#undef DEFINE_CASE
+
+    default:
+      TORCH_INTERNAL_ASSERT(false, "Unrecognized ScalarType: ", type);
+  }
+}
+
+} // namespace
+
+} // namespace at
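
One user-visible consequence of opmath_type: on builds where reduction kernels accumulate in the FP32 opmath type, summing many small FP16 values stays accurate even though inputs and output are half precision. A hedged sketch (exact numerics depend on the kernel and device):

    import torch

    x = torch.full((10_000,), 0.01, dtype=torch.float16)
    # Accumulation happens in float32 internally, so the result is ~100.0
    # rather than stalling once the running sum dwarfs each fp16 addend.
    print(x.sum())
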
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/OpaqueTensorImpl.h b/MLPY/Lib/site-packages/torch/include/ATen/OpaqueTensorImpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..23805376faad47f149d4bf57823ad1a473ed16ea
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/OpaqueTensorImpl.h
@@ -0,0 +1,187 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+
+// An "Opaque" TensorImpl -- there are no strides and (for now)
+// even data() is not supported (thus no pointer arithmetic).
+
+// NOTE: We could allow data() in the future, but would have to ensure pointer
+// arithmetic code is properly guarded.
+//
+// NOTE: This does not support resize_ (and other metadata-changing ops) because
+// of `shallow_copy_and_detach`. We would need to define an interface to
+// "shallow copy" in order to add support.
+
+template <typename OpaqueHandle>
+struct TORCH_API OpaqueTensorImpl : public TensorImpl {
+  // public constructor for now...
+  OpaqueTensorImpl(
+      at::DispatchKeySet key_set,
+      const caffe2::TypeMeta data_type,
+      c10::Device device,
+      OpaqueHandle opaque_handle,
+      c10::IntArrayRef sizes,
+      bool is_non_overlapping_and_dense = true)
+      : TensorImpl(key_set, data_type, device),
+        opaque_handle_(std::move(opaque_handle)) {
+    set_storage_access_should_throw();
+    set_custom_sizes_strides(SizesStridesPolicy::CustomStrides);
+    sizes_and_strides_.set_sizes(sizes);
+    refresh_numel();
+    // NOLINTNEXTLINE(cppcoreguidelines-prefer-member-initializer)
+    is_non_overlapping_and_dense_ = is_non_overlapping_and_dense;
+  }
+
+  // Destructor doesn't call release_resources because it's
+  // unnecessary; don't forget to change that if needed!
+  void release_resources() override {
+    TensorImpl::release_resources();
+    opaque_handle_ = {};
+  }
+
+  void set_size(int64_t dim, int64_t new_size) override {
+    AT_ERROR("opaque tensors do not have set_size");
+  }
+
+  void set_stride(int64_t dim, int64_t new_stride) override {
+    AT_ERROR("opaque tensors do not have set_stride");
+  }
+
+  void set_storage_offset(int64_t storage_offset) override {
+    AT_ERROR("opaque tensors do not have set_storage_offset");
+  }
+
+#ifdef DEBUG
+  bool has_storage() const override {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+        !storage_, "OpaqueTensorImpl assumes that storage_ is never set");
+    return false;
+  }
+#endif
+
+  /**
+   * Return a TensorImpl that is a shallow-copy of this TensorImpl.
+   *
+   * For usage of `version_counter` and `allow_tensor_metadata_change`,
+   * see NOTE [ TensorImpl Shallow-Copying ].
+   */
+  c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach(
+      const c10::VariableVersion& version_counter,
+      bool allow_tensor_metadata_change) const override {
+    auto impl = c10::make_intrusive<OpaqueTensorImpl<OpaqueHandle>>(
+        key_set(),
+        dtype(),
+        device(),
+        opaque_handle_,
+        sizes_and_strides_.sizes_arrayref());
+    copy_tensor_metadata(
+        /*src_opaque_impl=*/this,
+        /*dest_opaque_impl=*/impl.get(),
+        /*version_counter=*/version_counter,
+        /*allow_tensor_metadata_change=*/allow_tensor_metadata_change);
+    impl->refresh_numel();
+    return impl;
+  }
+
+  /**
+   * Return a TensorImpl that is a shallow-copy of this TensorImpl.
+   *
+   * For usage of `version_counter` and `allow_tensor_metadata_change`,
+   * see NOTE [ TensorImpl Shallow-Copying ].
+   */
+  c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach(
+      c10::VariableVersion&& version_counter,
+      bool allow_tensor_metadata_change) const override {
+    auto impl = c10::make_intrusive<OpaqueTensorImpl<OpaqueHandle>>(
+        key_set(),
+        dtype(),
+        device(),
+        opaque_handle_,
+        sizes_and_strides_.sizes_arrayref());
+    copy_tensor_metadata(
+        /*src_opaque_impl=*/this,
+        /*dest_opaque_impl=*/impl.get(),
+        /*version_counter=*/std::move(version_counter),
+        /*allow_tensor_metadata_change=*/allow_tensor_metadata_change);
+    impl->refresh_numel();
+    return impl;
+  }
+
+  /**
+   * Shallow-copies data from another TensorImpl into this TensorImpl.
+   *
+   * For why this function doesn't check this TensorImpl's
+   * `allow_tensor_metadata_change_`, see NOTE [ TensorImpl Shallow-Copying ].
+   */
+  void shallow_copy_from(const c10::intrusive_ptr<TensorImpl>& impl) override {
+    AT_ASSERT(has_compatible_shallow_copy_type(impl->key_set()));
+    auto opaque_impl =
+        static_cast<const OpaqueTensorImpl<OpaqueHandle>*>(impl.get());
+    copy_tensor_metadata(
+        /*src_impl=*/opaque_impl,
+        /*dest_impl=*/this,
+        /*version_counter=*/version_counter(),
+        /*allow_tensor_metadata_change=*/allow_tensor_metadata_change());
+    refresh_numel();
+  }
+
+  const OpaqueHandle& opaque_handle() const {
+    return opaque_handle_;
+  }
+
+  OpaqueHandle& unsafe_opaque_handle() {
+    return opaque_handle_;
+  }
+
+ protected:
+  /**
+   * Copy the tensor metadata fields (e.g. sizes / strides / storage pointer /
+   * storage_offset) from one TensorImpl to another TensorImpl.
+   *
+   * For usage of `version_counter` and `allow_tensor_metadata_change`, see NOTE
+   * [ TensorImpl Shallow-Copying ].
+   */
+  static void copy_tensor_metadata(
+      const OpaqueTensorImpl* src_opaque_impl,
+      OpaqueTensorImpl* dest_opaque_impl,
+      const c10::VariableVersion& version_counter,
+      bool allow_tensor_metadata_change) {
+    TensorImpl::copy_tensor_metadata(
+        src_opaque_impl,
+        dest_opaque_impl,
+        version_counter,
+        allow_tensor_metadata_change);
+
+    // OpaqueTensorImpl-specific fields.
+    dest_opaque_impl->opaque_handle_ = src_opaque_impl->opaque_handle_;
+  }
+
+  static void copy_tensor_metadata(
+      const OpaqueTensorImpl* src_opaque_impl,
+      OpaqueTensorImpl* dest_opaque_impl,
+      c10::VariableVersion&& version_counter,
+      bool allow_tensor_metadata_change) {
+    TensorImpl::copy_tensor_metadata(
+        src_opaque_impl,
+        dest_opaque_impl,
+        std::move(version_counter),
+        allow_tensor_metadata_change);
+
+    // OpaqueTensorImpl-specific fields.
+    dest_opaque_impl->opaque_handle_ = src_opaque_impl->opaque_handle_;
+  }
+
+ private:
+  const char* tensorimpl_type_name() const override {
+    return "OpaqueTensorImpl";
+  }
+
+  OpaqueHandle opaque_handle_;
+};
+
+} // namespace at
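
The shallow-copy-and-detach comments above describe an impl that shares the underlying handle while taking on its own metadata and version counter. A rough Python-level analogue for ordinary dense tensors (the mapping to detach() is an approximation, not a claim about this exact code path):

    import torch

    a = torch.randn(3, requires_grad=True)
    b = a.detach()                       # shallow copy of the TensorImpl metadata
    print(b.data_ptr() == a.data_ptr())  # True: the storage is shared
    print(b.requires_grad)               # False: detached autograd identity
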
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/Operators.h b/MLPY/Lib/site-packages/torch/include/ATen/Operators.h
new file mode 100644
index 0000000000000000000000000000000000000000..5e3118f98030fb8d660e55931b771b794e82244d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/Operators.h
@@ -0,0 +1,1358 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operators.h
+
+#ifdef TORCH_ASSERT_NO_OPERATORS
+#error This change adds a dependency on native_functions.yaml,             \
+  meaning the file will need to be re-compiled every time an operator      \
+  is changed or added. Consider if your change would be better placed in   \
+  another file, or if a more specific header might achieve the same goal.  \
+  See NOTE: [Tensor vs. TensorBase]
+#endif
+
+#if defined(AT_PER_OPERATOR_HEADERS) && defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS)
+#error This change adds a dependency on all pytorch operators, meaning the     \
+  file will need to be re-compiled every time an operator is changed or added. \
+  Consider including a specific operator from    \
+  and see NOTE [TORCH_ASSERT_ONLY_METHOD_OPERATORS].
+#endif
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+// Extension writers: do you write wrapper functions? Are you frustrated with
+// resolving overloads of operators? Are you frustrated with dealing with
+// pointer-to-methods and resolving overloads of pointer-to-methods?? Look no
+// further, this is the utility for you.
+//
+// Given an operator schema: aten::op.overload(...
+//
+// Use ATEN_FN2(op, overload) to get a *function* version of the operator
+// that is guaranteed to not be overloaded. This means that you can safely
+// decltype(&ATEN_FN2(op, overload)) it. NB: the 2 means this macro takes 2 args.
+//
+// Given an operator schema without an overload name: aten::op(...
+//
+// Use ATEN_FN(op) to get an unambiguous *function* version of the operator.
+//
+// There is some interesting behavior for out= operations.
+// ATEN_FN2(sin, out) gives a function that is *faithful* to the schema;
+// that is, the order of arguments is exactly what it looks like in the schema.
+
+#define ATEN_FN2(op_name, overload) at::_ops::op_name##_##overload::call
+#define ATEN_FN(op_name) at::_ops::op_name::call
+
+// Separately, ATEN_OP(op) and ATEN_OP2(op, overload) define a class containing compile-time
+// metadata about a given aten operator.
+// Notable data on the class includes:
+// - ATEN_OP2(add, Tensor)::name // returns the string name: "add"
+// - ATEN_OP2(add, Tensor)::overload_name // returns the string overload name: "Tensor"
+// - ATEN_OP2(add, Tensor)::schema // returns the C++ schema type: at::Tensor (const at::Tensor &, const at::Tensor &, const at::Scalar &)
+// - ATEN_OP2(add, Tensor)::schema_str // returns the string jit type: "add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor"
+
+#define ATEN_OP2(op_name, overload) at::_ops::op_name##_##overload
+#define ATEN_OP(op_name) at::_ops::op_name
+
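+// Illustrative usage sketch (editor's note, not part of the generated header;
+// `a` and `b` are hypothetical at::Tensor inputs). Assuming the aten::add.Tensor
+// overload used as the example in the comments above:
+//
+//   auto add_fn = ATEN_FN2(add, Tensor);   // decays to a plain function pointer,
+//                                          // so decltype(&ATEN_FN2(add, Tensor)) works
+//   at::Tensor out = add_fn(a, b, /*alpha=*/1);
+//
+//   // Compile-time metadata for the same overload (see the ATEN_OP2 list above):
+//   //   ATEN_OP2(add, Tensor)::name, ::overload_name, ::schema, ::schema_str
+//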
+// WARNING: Please do not call any of the ops in the _ops namespace directly.
+// Use the ATEN_FN macros. We do not guarantee stability of the naming
+// scheme for the functions in at::_ops
+
+// See Note [The ATen Operators API] for details of the at::_ops namespace
+
+namespace at {
+namespace _ops {
+
+} // namespace _ops
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/PTThreadPool.h b/MLPY/Lib/site-packages/torch/include/ATen/PTThreadPool.h
new file mode 100644
index 0000000000000000000000000000000000000000..d18d80161296db96fc6cc0c89ba4546490b6e5a4
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/PTThreadPool.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include 
+#include 
+
+namespace at {
+
+class TORCH_API PTThreadPool : public c10::ThreadPool {
+ public:
+  explicit PTThreadPool(int pool_size, int numa_node_id = -1)
+      : c10::ThreadPool(pool_size, numa_node_id, []() {
+          c10::setThreadName("PTThreadPool");
+          at::init_num_threads();
+        }) {}
+};
+
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/PadNd.h b/MLPY/Lib/site-packages/torch/include/ATen/PadNd.h
new file mode 100644
index 0000000000000000000000000000000000000000..612631ec6bc042ff7b02955620981e107a2fa8fe
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/PadNd.h
@@ -0,0 +1,28 @@
+#pragma once
+#include 
+#include 
+
+namespace at {
+
+enum class padding_mode {
+  reflect,
+  replicate,
+  circular,
+  constant,
+};
+
+static inline c10::string_view padding_mode_string(padding_mode m) {
+  switch (m) {
+    case padding_mode::reflect:
+      return "reflect";
+    case padding_mode::replicate:
+      return "replicate";
+    case padding_mode::circular:
+      return "circular";
+    case padding_mode::constant:
+      return "constant";
+  }
+  TORCH_CHECK(false, "Invalid padding mode (", static_cast(m), ")");
+}
+
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/Parallel-inl.h b/MLPY/Lib/site-packages/torch/include/ATen/Parallel-inl.h
new file mode 100644
index 0000000000000000000000000000000000000000..966aa4b6371df7442cade150cae890bd772e4491
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/Parallel-inl.h
@@ -0,0 +1,93 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+namespace at {
+
+template <class F>
+inline void parallel_for(
+    const int64_t begin,
+    const int64_t end,
+    const int64_t grain_size,
+    const F& f) {
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(grain_size >= 0);
+  if (begin >= end) {
+    return;
+  }
+
+#ifdef INTRA_OP_PARALLEL
+  at::internal::lazy_init_num_threads();
+  const auto numiter = end - begin;
+  const bool use_parallel =
+      (numiter > grain_size && numiter > 1 && !at::in_parallel_region() &&
+       at::get_num_threads() > 1);
+  if (!use_parallel) {
+    internal::ThreadIdGuard tid_guard(0);
+    c10::ParallelGuard guard(true);
+    f(begin, end);
+    return;
+  }
+
+  internal::invoke_parallel(
+      begin, end, grain_size, [&](int64_t begin, int64_t end) {
+        c10::ParallelGuard guard(true);
+        f(begin, end);
+      });
+#else
+  internal::ThreadIdGuard tid_guard(0);
+  c10::ParallelGuard guard(true);
+  f(begin, end);
+#endif
+}
+
+template <class scalar_t, class F, class SF>
+inline scalar_t parallel_reduce(
+    const int64_t begin,
+    const int64_t end,
+    const int64_t grain_size,
+    const scalar_t ident,
+    const F& f,
+    const SF& sf) {
+  TORCH_CHECK(grain_size >= 0);
+  if (begin >= end) {
+    return ident;
+  }
+
+#ifdef INTRA_OP_PARALLEL
+  at::internal::lazy_init_num_threads();
+  const auto max_threads = at::get_num_threads();
+  const bool use_parallel =
+      ((end - begin) > grain_size && !at::in_parallel_region() &&
+       max_threads > 1);
+  if (!use_parallel) {
+    internal::ThreadIdGuard tid_guard(0);
+    c10::ParallelGuard guard(true);
+    return f(begin, end, ident);
+  }
+
+  c10::SmallVector<scalar_t, 64> results(max_threads, ident);
+  internal::invoke_parallel(
+      begin,
+      end,
+      grain_size,
+      [&](const int64_t my_begin, const int64_t my_end) {
+        const auto tid = at::get_thread_num();
+        c10::ParallelGuard guard(true);
+        results[tid] = f(my_begin, my_end, ident);
+      });
+
+  scalar_t result = ident;
+  for (auto partial_result : results) {
+    result = sf(result, partial_result);
+  }
+  return result;
+#else
+  internal::ThreadIdGuard tid_guard(0);
+  c10::ParallelGuard guard(true);
+  return f(begin, end, ident);
+#endif
+}
+
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/Parallel.h b/MLPY/Lib/site-packages/torch/include/ATen/Parallel.h
new file mode 100644
index 0000000000000000000000000000000000000000..7261fed38968b84d9dfca9f63c30af742cdec4e6
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/Parallel.h
@@ -0,0 +1,160 @@
+#pragma once
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+
+inline int64_t divup(int64_t x, int64_t y) {
+  return (x + y - 1) / y;
+}
+
+// Called during new thread initialization
+TORCH_API void init_num_threads();
+
+// Sets the number of threads to be used in parallel region
+TORCH_API void set_num_threads(int);
+
+// Returns the maximum number of threads that may be used in a parallel region
+TORCH_API int get_num_threads();
+
+// Returns the current thread number (starting from 0)
+// in the current parallel region, or 0 in the sequential region
+TORCH_API int get_thread_num();
+
+// Checks whether the code runs in parallel region
+TORCH_API bool in_parallel_region();
+
+namespace internal {
+
+// Initialise num_threads lazily at first parallel call
+inline void lazy_init_num_threads() {
+  thread_local bool init = false;
+  if (C10_UNLIKELY(!init)) {
+    at::init_num_threads();
+    init = true;
+  }
+}
+
+TORCH_API void set_thread_num(int);
+
+class TORCH_API ThreadIdGuard {
+ public:
+  ThreadIdGuard(int new_id) : old_id_(at::get_thread_num()) {
+    set_thread_num(new_id);
+  }
+
+  ~ThreadIdGuard() {
+    set_thread_num(old_id_);
+  }
+
+ private:
+  int old_id_;
+};
+
+} // namespace internal
+
+/*
+parallel_for
+
+begin: index at which to start applying user function
+
+end: index at which to stop applying user function
+
+grain_size: number of elements per chunk. impacts the degree of parallelization
+
+f: user function applied in parallel to the chunks, signature:
+  void f(int64_t begin, int64_t end)
+
+Warning: parallel_for does NOT copy thread local
+states from the current thread to the worker threads.
+This means for example that Tensor operations CANNOT be used in the
+body of your function, only data pointers.
+*/
+template <class F>
+inline void parallel_for(
+    const int64_t begin,
+    const int64_t end,
+    const int64_t grain_size,
+    const F& f);
+
+/*
+parallel_reduce
+
+begin: index at which to start applying reduction
+
+end: index at which to stop applying reduction
+
+grain_size: number of elements per chunk. impacts number of elements in
+intermediate results tensor and degree of parallelization.
+
+ident: identity for binary combination function sf. sf(ident, x) needs to return
+x.
+
+f: function for reduction over a chunk. f needs to be of signature scalar_t
+f(int64_t partial_begin, int64_t partial_end, scalar_t identity)
+
+sf: function to combine two partial results. sf needs to be of signature
+scalar_t sf(scalar_t x, scalar_t y)
+
+For example, you might have a tensor of 10000 entries and want to sum together
+all the elements. parallel_reduce with a grain_size of 2500 will then allocate
+an intermediate result tensor with 4 elements. Then it will execute the function
+"f" you provide and pass the beginning and end index of these chunks, so
+0-2499, 2500-4999, etc. and the combination identity. It will then write out
+the result from each of these chunks into the intermediate result tensor. After
+that it'll reduce the partial results from each chunk into a single number using
+the combination function sf and the identity ident. For a total summation this
+would be "+" and 0 respectively. This is similar to tbb's approach [1], where
+you need to provide a function to accumulate a subrange, a function to combine
+two partial results and an identity.
+
+Warning: parallel_reduce does NOT copy thread local
+states from the current thread to the worker threads.
+This means for example that Tensor operations CANNOT be used in the
+body of your function, only data pointers.
+
+[1] https://software.intel.com/en-us/node/506154
+*/
+template <class scalar_t, class F, class SF>
+inline scalar_t parallel_reduce(
+    const int64_t begin,
+    const int64_t end,
+    const int64_t grain_size,
+    const scalar_t ident,
+    const F& f,
+    const SF& sf);
+
+// Returns a detailed string describing parallelization settings
+TORCH_API std::string get_parallel_info();
+
+// Sets number of threads used for inter-op parallelism
+TORCH_API void set_num_interop_threads(int);
+
+// Returns the number of threads used for inter-op parallelism
+TORCH_API int get_num_interop_threads();
+
+// Launches inter-op parallel task
+TORCH_API void launch(std::function<void()> func);
+namespace internal {
+void launch_no_thread_state(std::function<void()> fn);
+} // namespace internal
+
+// Launches intra-op parallel task
+TORCH_API void intraop_launch(std::function<void()> func);
+
+// Returns number of intra-op threads used by default
+TORCH_API int intraop_default_num_threads();
+
+} // namespace at
+
+#if AT_PARALLEL_OPENMP
+#include <ATen/ParallelOpenMP.h> // IWYU pragma: keep
+#elif AT_PARALLEL_NATIVE
+#include <ATen/ParallelNative.h> // IWYU pragma: keep
+#elif AT_PARALLEL_NATIVE_TBB
+#include <ATen/ParallelNativeTBB.h> // IWYU pragma: keep
+#endif
+
+#include <ATen/Parallel-inl.h> // IWYU pragma: keep
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ParallelFuture.h b/MLPY/Lib/site-packages/torch/include/ATen/ParallelFuture.h
new file mode 100644
index 0000000000000000000000000000000000000000..f05e79b333c8dd3a7dfb7874c102f534ba354a2c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/ParallelFuture.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+namespace at {
+
+// Launches intra-op parallel task, returns a future
+TORCH_API c10::intrusive_ptr<c10::ivalue::Future> intraop_launch_future(
+    std::function<void()> func);
+
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ParallelNative.h b/MLPY/Lib/site-packages/torch/include/ATen/ParallelNative.h
new file mode 100644
index 0000000000000000000000000000000000000000..dd572a697eb4d7fb0f7bce45b4b887713c6e5534
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/ParallelNative.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+#include 
+
+#define INTRA_OP_PARALLEL
+
+namespace at::internal {
+
+TORCH_API void invoke_parallel(
+    const int64_t begin,
+    const int64_t end,
+    const int64_t grain_size,
+    const std::function& f);
+
+} // namespace at::internal
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ParallelNativeTBB.h b/MLPY/Lib/site-packages/torch/include/ATen/ParallelNativeTBB.h
new file mode 100644
index 0000000000000000000000000000000000000000..0378a733a6a2762838c76cc191e65996beb20747
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/ParallelNativeTBB.h
@@ -0,0 +1,52 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+#include 
+
+#ifdef _WIN32
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+#endif
+#include <tbb/tbb.h>
+
+#define INTRA_OP_PARALLEL
+
+namespace at::internal {
+
+template <typename F>
+inline void invoke_parallel(
+    const int64_t begin,
+    const int64_t end,
+    const int64_t grain_size,
+    const F& f) {
+  // Choose number of tasks based on grain size and number of threads.
+  int64_t chunk_size = divup((end - begin), get_num_threads());
+  // Make sure each task is at least grain_size size.
+  chunk_size = std::max(grain_size, chunk_size);
+
+  std::atomic_flag err_flag = ATOMIC_FLAG_INIT;
+  std::exception_ptr eptr;
+  tbb::parallel_for(
+      tbb::blocked_range<int64_t>(begin, end, chunk_size),
+      [&eptr, &err_flag, f](const tbb::blocked_range<int64_t>& r) {
+        try {
+          internal::ThreadIdGuard tid_guard(
+              tbb::this_task_arena::current_thread_index());
+          f(r.begin(), r.end());
+        } catch (...) {
+          if (!err_flag.test_and_set()) {
+            eptr = std::current_exception();
+          }
+        }
+      },
+      tbb::static_partitioner{});
+  if (eptr) {
+    std::rethrow_exception(eptr);
+  }
+}
+
+} // namespace at::internal
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ParallelOpenMP.h b/MLPY/Lib/site-packages/torch/include/ATen/ParallelOpenMP.h
new file mode 100644
index 0000000000000000000000000000000000000000..40a8830c764543d90f8b0180fa3a91039a537d38
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/ParallelOpenMP.h
@@ -0,0 +1,54 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+#ifdef _OPENMP
+#define INTRA_OP_PARALLEL
+
+#include <omp.h>
+#endif
+
+#ifdef _OPENMP
+namespace at::internal {
+template <typename F>
+inline void invoke_parallel(
+    int64_t begin,
+    int64_t end,
+    int64_t grain_size,
+    const F& f) {
+  std::atomic_flag err_flag = ATOMIC_FLAG_INIT;
+  std::exception_ptr eptr;
+
+#pragma omp parallel
+  {
+    // choose number of tasks based on grain size and number of threads
+    // can't use num_threads clause due to bugs in GOMP's thread pool (See
+    // #32008)
+    int64_t num_threads = omp_get_num_threads();
+    if (grain_size > 0) {
+      num_threads = std::min(num_threads, divup((end - begin), grain_size));
+    }
+
+    int64_t tid = omp_get_thread_num();
+    int64_t chunk_size = divup((end - begin), num_threads);
+    int64_t begin_tid = begin + tid * chunk_size;
+    if (begin_tid < end) {
+      try {
+        internal::ThreadIdGuard tid_guard(tid);
+        f(begin_tid, std::min(end, chunk_size + begin_tid));
+      } catch (...) {
+        if (!err_flag.test_and_set()) {
+          eptr = std::current_exception();
+        }
+      }
+    }
+  }
+  if (eptr) {
+    std::rethrow_exception(eptr);
+  }
+}
+} // namespace at::internal
+#endif // _OPENMP
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/PythonTorchFunctionTLS.h b/MLPY/Lib/site-packages/torch/include/ATen/PythonTorchFunctionTLS.h
new file mode 100644
index 0000000000000000000000000000000000000000..d7ca10fd8895bf40842b6d3d6c7adff367d49fc7
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/PythonTorchFunctionTLS.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include 
+#include 
+
+namespace at::impl {
+
+enum TorchFunctionDisabledState { ENABLED, SUBCLASSES_DISABLED, ALL_DISABLED };
+
+struct TORCH_API PythonTorchFunctionTLS {
+  static void set_disabled_state(TorchFunctionDisabledState disabled_state_);
+  static TorchFunctionDisabledState get_disabled_state();
+
+  static void push_onto_stack(std::shared_ptr<c10::SafePyObject> mode);
+  static const std::shared_ptr<c10::SafePyObject> pop_stack();
+  static const std::shared_ptr<c10::SafePyObject>& get_stack_at(int64_t idx);
+  static int64_t stack_len();
+
+  static const PythonTorchFunctionTLS& get_state();
+  static void set_state(const PythonTorchFunctionTLS& state);
+
+ private:
+  // The mode TLS is split into
+  //   - disabled_state, which says which parts of torch function are disabled
+  //   - stack_, which is a vector of modes representing the stack of user
+  //   defined modes
+  TorchFunctionDisabledState disabled_state_ =
+      TorchFunctionDisabledState::ENABLED;
+  std::vector> stack_;
+};
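+// Illustrative sketch (editor's note, not part of the original header): saving
+// and restoring the whole torch-function TLS around a scoped change, using only
+// the static accessors declared above.
+//
+//   const auto saved = at::impl::PythonTorchFunctionTLS::get_state();
+//   at::impl::PythonTorchFunctionTLS::set_disabled_state(
+//       at::impl::TorchFunctionDisabledState::ALL_DISABLED);
+//   // ... run code with __torch_function__ handling fully disabled ...
+//   at::impl::PythonTorchFunctionTLS::set_state(saved);
+//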
+
+TORCH_API bool torch_function_mode_enabled();
+
+} // namespace at::impl
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/RedispatchFunctions.h b/MLPY/Lib/site-packages/torch/include/ATen/RedispatchFunctions.h
new file mode 100644
index 0000000000000000000000000000000000000000..d9d205bb300f8fb8c6fe5a11b29549a2912c46d5
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/RedispatchFunctions.h
@@ -0,0 +1,24791 @@
+#pragma once
+
+// @generated by torchgen/gen.py from RedispatchFunctions.h
+
+#ifdef TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#error This change adds a dependency on all pytorch operators, meaning the     \
+  file will need to be re-compiled every time an operator is changed or added. \
+  Consider using the at::_ops::{name}::redispatch() interface by including     \
+  the specific operator from 
+#endif
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+
+namespace redispatch {
+    
+    // aten::_cast_Byte(Tensor self, bool non_blocking=False) -> Tensor
+    inline at::Tensor _cast_Byte(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool non_blocking=false) {
+        return at::_ops::_cast_Byte::redispatch(dispatchKeySet, self, non_blocking);
+    }
+    
+    // aten::_cast_Char(Tensor self, bool non_blocking=False) -> Tensor
+    inline at::Tensor _cast_Char(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool non_blocking=false) {
+        return at::_ops::_cast_Char::redispatch(dispatchKeySet, self, non_blocking);
+    }
+    
+    // aten::_cast_Double(Tensor self, bool non_blocking=False) -> Tensor
+    inline at::Tensor _cast_Double(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool non_blocking=false) {
+        return at::_ops::_cast_Double::redispatch(dispatchKeySet, self, non_blocking);
+    }
+    
+    // aten::_cast_Float(Tensor self, bool non_blocking=False) -> Tensor
+    inline at::Tensor _cast_Float(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool non_blocking=false) {
+        return at::_ops::_cast_Float::redispatch(dispatchKeySet, self, non_blocking);
+    }
+    
+    // aten::_cast_Int(Tensor self, bool non_blocking=False) -> Tensor
+    inline at::Tensor _cast_Int(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool non_blocking=false) {
+        return at::_ops::_cast_Int::redispatch(dispatchKeySet, self, non_blocking);
+    }
+    
+    // aten::_cast_Long(Tensor self, bool non_blocking=False) -> Tensor
+    inline at::Tensor _cast_Long(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool non_blocking=false) {
+        return at::_ops::_cast_Long::redispatch(dispatchKeySet, self, non_blocking);
+    }
+    
+    // aten::_cast_Short(Tensor self, bool non_blocking=False) -> Tensor
+    inline at::Tensor _cast_Short(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool non_blocking=false) {
+        return at::_ops::_cast_Short::redispatch(dispatchKeySet, self, non_blocking);
+    }
+    
+    // aten::_cast_Half(Tensor self, bool non_blocking=False) -> Tensor
+    inline at::Tensor _cast_Half(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool non_blocking=false) {
+        return at::_ops::_cast_Half::redispatch(dispatchKeySet, self, non_blocking);
+    }
+    
+    // aten::_backward(Tensor self, Tensor[] inputs, Tensor? gradient=None, bool? retain_graph=None, bool create_graph=False) -> ()
+    inline void __dispatch__backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::TensorList inputs, const c10::optional<at::Tensor> & gradient={}, c10::optional<bool> retain_graph=c10::nullopt, bool create_graph=false) {
+        return at::_ops::_backward::redispatch(dispatchKeySet, self, inputs, gradient, retain_graph, create_graph);
+    }
+    
+    // aten::set_data(Tensor(a!) self, Tensor new_data) -> ()
+    inline void __dispatch_set_data(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & new_data) {
+        return at::_ops::set_data::redispatch(dispatchKeySet, self, new_data);
+    }
+    
+    // aten::data(Tensor self) -> Tensor
+    inline at::Tensor __dispatch_data(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::data::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::is_leaf(Tensor self) -> bool
+    inline bool __dispatch_is_leaf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::is_leaf::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::output_nr(Tensor self) -> int
+    inline int64_t __dispatch_output_nr(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::output_nr::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_version(Tensor self) -> int
+    inline int64_t __dispatch__version(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_version::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::requires_grad_(Tensor(a!) self, bool requires_grad=True) -> Tensor(a!)
+    inline at::Tensor & __dispatch_requires_grad_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, bool requires_grad=true) {
+        return at::_ops::requires_grad_::redispatch(dispatchKeySet, self, requires_grad);
+    }
+    
+    // aten::retain_grad(Tensor(a!) self) -> ()
+    inline void __dispatch_retain_grad(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::retain_grad::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::retains_grad(Tensor self) -> bool
+    inline bool __dispatch_retains_grad(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::retains_grad::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_fw_primal(Tensor(a) self, int level) -> Tensor(a)
+    inline at::Tensor _fw_primal(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t level) {
+        return at::_ops::_fw_primal::redispatch(dispatchKeySet, self, level);
+    }
+    
+    // aten::_make_dual(Tensor(a) primal, Tensor tangent, int level) -> Tensor(a)
+    inline at::Tensor _make_dual(c10::DispatchKeySet dispatchKeySet, const at::Tensor & primal, const at::Tensor & tangent, int64_t level) {
+        return at::_ops::_make_dual::redispatch(dispatchKeySet, primal, tangent, level);
+    }
+    
+    // aten::_unpack_dual(Tensor(a) dual, int level) -> (Tensor(a) primal, Tensor tangent)
+    inline ::std::tuple<at::Tensor,at::Tensor> _unpack_dual(c10::DispatchKeySet dispatchKeySet, const at::Tensor & dual, int64_t level) {
+        return at::_ops::_unpack_dual::redispatch(dispatchKeySet, dual, level);
+    }
+    
+    // aten::_new_zeros_with_same_feature_meta(Tensor self, Tensor other, *, int self_num_batch_dims=0) -> Tensor
+    inline at::Tensor _new_zeros_with_same_feature_meta(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, int64_t self_num_batch_dims=0) {
+        return at::_ops::_new_zeros_with_same_feature_meta::redispatch(dispatchKeySet, self, other, self_num_batch_dims);
+    }
+    
+    // aten::_has_same_storage_numel(Tensor self, Tensor other) -> bool
+    inline bool _has_same_storage_numel(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::_has_same_storage_numel::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::rename_(Tensor(a!) self, Dimname[]? names) -> Tensor(a!)
+    inline at::Tensor & rename_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, c10::optional names) {
+        return at::_ops::rename_::redispatch(dispatchKeySet, self, names);
+    }
+    
+    // aten::rename(Tensor(a) self, Dimname[]? names) -> Tensor(a)
+    inline at::Tensor rename(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::DimnameList> names) {
+        return at::_ops::rename::redispatch(dispatchKeySet, self, names);
+    }
+    
+    // aten::align_to(Tensor(a) self, Dimname[] names) -> Tensor(a)
+    inline at::Tensor align_to(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::DimnameList names) {
+        return at::_ops::align_to::redispatch(dispatchKeySet, self, names);
+    }
+    
+    // aten::align_to.ellipsis_idx(Tensor(a) self, Dimname[] order, int ellipsis_idx) -> Tensor(a)
+    inline at::Tensor align_to(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::DimnameList order, int64_t ellipsis_idx) {
+        return at::_ops::align_to_ellipsis_idx::redispatch(dispatchKeySet, self, order, ellipsis_idx);
+    }
+    
+    // aten::align_as(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor align_as(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::align_as::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::align_tensors(Tensor[] tensors) -> Tensor[]
+    inline ::std::vector<at::Tensor> align_tensors(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors) {
+        return at::_ops::align_tensors::redispatch(dispatchKeySet, tensors);
+    }
+    
+    // aten::_assert_async(Tensor self) -> ()
+    inline void _assert_async(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_assert_async::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_assert_async.msg(Tensor self, str assert_msg) -> ()
+    inline void _assert_async(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::string_view assert_msg) {
+        return at::_ops::_assert_async_msg::redispatch(dispatchKeySet, self, assert_msg);
+    }
+    
+    // aten::_assert_scalar(Scalar self, str assert_msg) -> ()
+    inline void _assert_scalar(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, c10::string_view assert_msg) {
+        return at::_ops::_assert_scalar::redispatch(dispatchKeySet, self, assert_msg);
+    }
+    
+    // aten::_functional_assert_scalar(Scalar self, str assert_msg, Tensor dep_token) -> Tensor
+    inline at::Tensor _functional_assert_scalar(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, c10::string_view assert_msg, const at::Tensor & dep_token) {
+        return at::_ops::_functional_assert_scalar::redispatch(dispatchKeySet, self, assert_msg, dep_token);
+    }
+    
+    // aten::_functional_assert_async.msg(Tensor self, str assert_msg, Tensor dep_token) -> Tensor
+    inline at::Tensor _functional_assert_async(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::string_view assert_msg, const at::Tensor & dep_token) {
+        return at::_ops::_functional_assert_async_msg::redispatch(dispatchKeySet, self, assert_msg, dep_token);
+    }
+    
+    // aten::_assert_tensor_metadata(Tensor a, SymInt[]? size=None, SymInt[]? stride=None, ScalarType? dtype=None) -> ()
+    inline void _assert_tensor_metadata(c10::DispatchKeySet dispatchKeySet, const at::Tensor & a, at::OptionalIntArrayRef size=c10::nullopt, at::OptionalIntArrayRef stride=c10::nullopt, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::_assert_tensor_metadata::redispatch(dispatchKeySet, a, size.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*size)) : c10::nullopt, stride.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*stride)) : c10::nullopt, dtype);
+    }
+    
+    // aten::_assert_tensor_metadata(Tensor a, SymInt[]? size=None, SymInt[]? stride=None, ScalarType? dtype=None) -> ()
+    inline void _assert_tensor_metadata_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & a, at::OptionalSymIntArrayRef size=c10::nullopt, at::OptionalSymIntArrayRef stride=c10::nullopt, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::_assert_tensor_metadata::redispatch(dispatchKeySet, a, size, stride, dtype);
+    }
+    
+    // aten::_print(str s) -> ()
+    inline void _print(c10::DispatchKeySet dispatchKeySet, c10::string_view s) {
+        return at::_ops::_print::redispatch(dispatchKeySet, s);
+    }
+    
+    // aten::sym_constrain_range(Scalar size, *, int? min=None, int? max=None) -> ()
+    inline void sym_constrain_range(c10::DispatchKeySet dispatchKeySet, const at::Scalar & size, c10::optional<int64_t> min=c10::nullopt, c10::optional<int64_t> max=c10::nullopt) {
+        return at::_ops::sym_constrain_range::redispatch(dispatchKeySet, size, min, max);
+    }
+    
+    // aten::sym_constrain_range_for_size(Scalar size, *, int? min=None, int? max=None) -> ()
+    inline void sym_constrain_range_for_size(c10::DispatchKeySet dispatchKeySet, const at::Scalar & size, c10::optional<int64_t> min=c10::nullopt, c10::optional<int64_t> max=c10::nullopt) {
+        return at::_ops::sym_constrain_range_for_size::redispatch(dispatchKeySet, size, min, max);
+    }
+    
+    // aten::_functional_sym_constrain_range(Scalar size, int? min, int? max, Tensor dep_token) -> Tensor
+    inline at::Tensor _functional_sym_constrain_range(c10::DispatchKeySet dispatchKeySet, const at::Scalar & size, c10::optional<int64_t> min, c10::optional<int64_t> max, const at::Tensor & dep_token) {
+        return at::_ops::_functional_sym_constrain_range::redispatch(dispatchKeySet, size, min, max, dep_token);
+    }
+    
+    // aten::_functional_sym_constrain_range_for_size(Scalar size, int? min, int? max, Tensor dep_token) -> Tensor
+    inline at::Tensor _functional_sym_constrain_range_for_size(c10::DispatchKeySet dispatchKeySet, const at::Scalar & size, c10::optional<int64_t> min, c10::optional<int64_t> max, const at::Tensor & dep_token) {
+        return at::_ops::_functional_sym_constrain_range_for_size::redispatch(dispatchKeySet, size, min, max, dep_token);
+    }
+    
+    // aten::_make_dep_token(*, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor _make_dep_token(c10::DispatchKeySet dispatchKeySet, at::TensorOptions options={}, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::_make_dep_token::redispatch(dispatchKeySet, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
+    }
+    
+    // aten::_make_dep_token(*, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor _make_dep_token(c10::DispatchKeySet dispatchKeySet, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
+        return at::_ops::_make_dep_token::redispatch(dispatchKeySet, dtype, layout, device, pin_memory, memory_format);
+    }
+    
+    // aten::refine_names(Tensor(a) self, Dimname[] names) -> Tensor(a)
+    inline at::Tensor refine_names(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::DimnameList names) {
+        return at::_ops::refine_names::redispatch(dispatchKeySet, self, names);
+    }
+    
+    // aten::_use_cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank) -> bool
+    inline bool _use_cudnn_ctc_loss(c10::DispatchKeySet dispatchKeySet, const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank) {
+        return at::_ops::_use_cudnn_ctc_loss::redispatch(dispatchKeySet, log_probs, targets, input_lengths, target_lengths, blank);
+    }
+    
+    // aten::_use_cudnn_ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank) -> bool
+    inline bool _use_cudnn_ctc_loss(c10::DispatchKeySet dispatchKeySet, const at::Tensor & log_probs, const at::Tensor & targets, const at::Tensor & input_lengths, const at::Tensor & target_lengths, int64_t blank) {
+        return at::_ops::_use_cudnn_ctc_loss_Tensor::redispatch(dispatchKeySet, log_probs, targets, input_lengths, target_lengths, blank);
+    }
+    
+    // aten::_cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> _cudnn_ctc_loss(c10::DispatchKeySet dispatchKeySet, const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank, bool deterministic, bool zero_infinity) {
+        return at::_ops::_cudnn_ctc_loss::redispatch(dispatchKeySet, log_probs, targets, input_lengths, target_lengths, blank, deterministic, zero_infinity);
+    }
+    
+    // aten::_cudnn_ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> _cudnn_ctc_loss(c10::DispatchKeySet dispatchKeySet, const at::Tensor & log_probs, const at::Tensor & targets, const at::Tensor & input_lengths, const at::Tensor & target_lengths, int64_t blank, bool deterministic, bool zero_infinity) {
+        return at::_ops::_cudnn_ctc_loss_Tensor::redispatch(dispatchKeySet, log_probs, targets, input_lengths, target_lengths, blank, deterministic, zero_infinity);
+    }
+    
+    // aten::_use_cudnn_rnn_flatten_weight() -> bool
+    inline bool _use_cudnn_rnn_flatten_weight(c10::DispatchKeySet dispatchKeySet) {
+        return at::_ops::_use_cudnn_rnn_flatten_weight::redispatch(dispatchKeySet);
+    }
+    
+    // aten::_cudnn_rnn_flatten_weight(Tensor[] weight_arr, int weight_stride0, SymInt input_size, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, bool bidirectional) -> Tensor
+    inline at::Tensor _cudnn_rnn_flatten_weight(c10::DispatchKeySet dispatchKeySet, at::TensorList weight_arr, int64_t weight_stride0, int64_t input_size, int64_t mode, int64_t hidden_size, int64_t proj_size, int64_t num_layers, bool batch_first, bool bidirectional) {
+        return at::_ops::_cudnn_rnn_flatten_weight::redispatch(dispatchKeySet, weight_arr, weight_stride0, input_size, mode, hidden_size, proj_size, num_layers, batch_first, bidirectional);
+    }
+    
+    // aten::_cudnn_rnn_flatten_weight(Tensor[] weight_arr, int weight_stride0, SymInt input_size, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, bool bidirectional) -> Tensor
+    inline at::Tensor _cudnn_rnn_flatten_weight_symint(c10::DispatchKeySet dispatchKeySet, at::TensorList weight_arr, int64_t weight_stride0, c10::SymInt input_size, int64_t mode, c10::SymInt hidden_size, c10::SymInt proj_size, int64_t num_layers, bool batch_first, bool bidirectional) {
+        return at::_ops::_cudnn_rnn_flatten_weight::redispatch(dispatchKeySet, weight_arr, weight_stride0, input_size, mode, hidden_size, proj_size, num_layers, batch_first, bidirectional);
+    }
+    
+    // aten::_cudnn_rnn(Tensor input, Tensor[] weight, int weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, SymInt[] batch_sizes, Tensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,at::Tensor> _cudnn_rnn(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const c10::optional<at::Tensor> & weight_buf, const at::Tensor & hx, const c10::optional<at::Tensor> & cx, int64_t mode, int64_t hidden_size, int64_t proj_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, at::IntArrayRef batch_sizes, const c10::optional<at::Tensor> & dropout_state) {
+        return at::_ops::_cudnn_rnn::redispatch(dispatchKeySet, input, weight, weight_stride0, weight_buf, hx, cx, mode, hidden_size, proj_size, num_layers, batch_first, dropout, train, bidirectional, c10::fromIntArrayRefSlow(batch_sizes), dropout_state);
+    }
+    
+    // aten::_cudnn_rnn(Tensor input, Tensor[] weight, int weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, SymInt[] batch_sizes, Tensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,at::Tensor> _cudnn_rnn_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const c10::optional<at::Tensor> & weight_buf, const at::Tensor & hx, const c10::optional<at::Tensor> & cx, int64_t mode, c10::SymInt hidden_size, c10::SymInt proj_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, c10::SymIntArrayRef batch_sizes, const c10::optional<at::Tensor> & dropout_state) {
+        return at::_ops::_cudnn_rnn::redispatch(dispatchKeySet, input, weight, weight_stride0, weight_buf, hx, cx, mode, hidden_size, proj_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state);
+    }
+    
+    // aten::_cudnn_rnn_backward(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, SymInt[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask) -> (Tensor, Tensor, Tensor, Tensor[])
+    inline ::std::tuple> _cudnn_rnn_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const at::Tensor & weight_buf, const at::Tensor & hx, const c10::optional & cx, const at::Tensor & output, const c10::optional & grad_output, const c10::optional & grad_hy, const c10::optional & grad_cy, int64_t mode, int64_t hidden_size, int64_t proj_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, at::IntArrayRef batch_sizes, const c10::optional & dropout_state, const at::Tensor & reserve, ::std::array output_mask) {
+        return at::_ops::_cudnn_rnn_backward::redispatch(dispatchKeySet, input, weight, weight_stride0, weight_buf, hx, cx, output, grad_output, grad_hy, grad_cy, mode, hidden_size, proj_size, num_layers, batch_first, dropout, train, bidirectional, c10::fromIntArrayRefSlow(batch_sizes), dropout_state, reserve, output_mask);
+    }
+    
+    // aten::_cudnn_rnn_backward(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, SymInt[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask) -> (Tensor, Tensor, Tensor, Tensor[])
+    inline ::std::tuple> _cudnn_rnn_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const at::Tensor & weight_buf, const at::Tensor & hx, const c10::optional & cx, const at::Tensor & output, const c10::optional & grad_output, const c10::optional & grad_hy, const c10::optional & grad_cy, int64_t mode, c10::SymInt hidden_size, c10::SymInt proj_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, c10::SymIntArrayRef batch_sizes, const c10::optional & dropout_state, const at::Tensor & reserve, ::std::array output_mask) {
+        return at::_ops::_cudnn_rnn_backward::redispatch(dispatchKeySet, input, weight, weight_stride0, weight_buf, hx, cx, output, grad_output, grad_hy, grad_cy, mode, hidden_size, proj_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve, output_mask);
+    }
+    
+    // aten::_cudnn_init_dropout_state(float dropout, bool train, int dropout_seed, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+    inline at::Tensor _cudnn_init_dropout_state(c10::DispatchKeySet dispatchKeySet, double dropout, bool train, int64_t dropout_seed, at::TensorOptions options) {
+        return at::_ops::_cudnn_init_dropout_state::redispatch(dispatchKeySet, dropout, train, dropout_seed, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::_cudnn_init_dropout_state(float dropout, bool train, int dropout_seed, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+    inline at::Tensor _cudnn_init_dropout_state(c10::DispatchKeySet dispatchKeySet, double dropout, bool train, int64_t dropout_seed, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::_cudnn_init_dropout_state::redispatch(dispatchKeySet, dropout, train, dropout_seed, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::_debug_has_internal_overlap(Tensor self) -> int
+    inline int64_t _debug_has_internal_overlap(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_debug_has_internal_overlap::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_fused_dropout(Tensor self, float p, Generator? generator=None) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> _fused_dropout(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double p, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::_fused_dropout::redispatch(dispatchKeySet, self, p, generator);
+    }
+    
+    // aten::_masked_scale(Tensor self, Tensor mask, float scale) -> Tensor
+    inline at::Tensor _masked_scale(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mask, double scale) {
+        return at::_ops::_masked_scale::redispatch(dispatchKeySet, self, mask, scale);
+    }
+    
+    // aten::native_dropout(Tensor input, float p, bool? train) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> native_dropout(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, double p, c10::optional<bool> train) {
+        return at::_ops::native_dropout::redispatch(dispatchKeySet, input, p, train);
+    }
+    
+    // aten::native_dropout_backward(Tensor grad_output, Tensor mask, float scale) -> Tensor
+    inline at::Tensor native_dropout_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & mask, double scale) {
+        return at::_ops::native_dropout_backward::redispatch(dispatchKeySet, grad_output, mask, scale);
+    }
+    
+    // aten::_sobol_engine_draw(Tensor quasi, int n, Tensor sobolstate, int dimension, int num_generated, ScalarType? dtype) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> _sobol_engine_draw(c10::DispatchKeySet dispatchKeySet, const at::Tensor & quasi, int64_t n, const at::Tensor & sobolstate, int64_t dimension, int64_t num_generated, c10::optional<at::ScalarType> dtype) {
+        return at::_ops::_sobol_engine_draw::redispatch(dispatchKeySet, quasi, n, sobolstate, dimension, num_generated, dtype);
+    }
+    
+    // aten::_sobol_engine_ff_(Tensor(a!) self, int n, Tensor sobolstate, int dimension, int num_generated) -> Tensor(a!)
+    inline at::Tensor & _sobol_engine_ff_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t n, const at::Tensor & sobolstate, int64_t dimension, int64_t num_generated) {
+        return at::_ops::_sobol_engine_ff_::redispatch(dispatchKeySet, self, n, sobolstate, dimension, num_generated);
+    }
+    
+    // aten::_sobol_engine_scramble_(Tensor(a!) self, Tensor ltm, int dimension) -> Tensor(a!)
+    inline at::Tensor & _sobol_engine_scramble_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & ltm, int64_t dimension) {
+        return at::_ops::_sobol_engine_scramble_::redispatch(dispatchKeySet, self, ltm, dimension);
+    }
+    
+    // aten::_sobol_engine_initialize_state_(Tensor(a!) self, int dimension) -> Tensor(a!)
+    inline at::Tensor & _sobol_engine_initialize_state_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t dimension) {
+        return at::_ops::_sobol_engine_initialize_state_::redispatch(dispatchKeySet, self, dimension);
+    }
+    
+    // aten::_reshape_from_tensor(Tensor self, Tensor shape) -> Tensor
+    inline at::Tensor _reshape_from_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & shape) {
+        return at::_ops::_reshape_from_tensor::redispatch(dispatchKeySet, self, shape);
+    }
+    
+    // aten::_shape_as_tensor(Tensor self) -> Tensor
+    inline at::Tensor _shape_as_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_shape_as_tensor::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::dropout(Tensor input, float p, bool train) -> Tensor
+    inline at::Tensor dropout(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, double p, bool train) {
+        return at::_ops::dropout::redispatch(dispatchKeySet, input, p, train);
+    }
+    
+    // aten::dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!)
+    inline at::Tensor & dropout_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, double p, bool train) {
+        return at::_ops::dropout_::redispatch(dispatchKeySet, self, p, train);
+    }
+    
+    // aten::feature_dropout(Tensor input, float p, bool train) -> Tensor
+    inline at::Tensor feature_dropout(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, double p, bool train) {
+        return at::_ops::feature_dropout::redispatch(dispatchKeySet, input, p, train);
+    }
+    
+    // aten::feature_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!)
+    inline at::Tensor & feature_dropout_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, double p, bool train) {
+        return at::_ops::feature_dropout_::redispatch(dispatchKeySet, self, p, train);
+    }
+    
+    // aten::alpha_dropout(Tensor input, float p, bool train) -> Tensor
+    inline at::Tensor alpha_dropout(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, double p, bool train) {
+        return at::_ops::alpha_dropout::redispatch(dispatchKeySet, input, p, train);
+    }
+    
+    // aten::alpha_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!)
+    inline at::Tensor & alpha_dropout_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, double p, bool train) {
+        return at::_ops::alpha_dropout_::redispatch(dispatchKeySet, self, p, train);
+    }
+    
+    // aten::feature_alpha_dropout(Tensor input, float p, bool train) -> Tensor
+    inline at::Tensor feature_alpha_dropout(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, double p, bool train) {
+        return at::_ops::feature_alpha_dropout::redispatch(dispatchKeySet, input, p, train);
+    }
+    
+    // aten::feature_alpha_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!)
+    inline at::Tensor & feature_alpha_dropout_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, double p, bool train) {
+        return at::_ops::feature_alpha_dropout_::redispatch(dispatchKeySet, self, p, train);
+    }
+    
+    // aten::abs(Tensor self) -> Tensor
+    inline at::Tensor abs(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::abs::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::abs_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & abs_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::abs_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & abs_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::abs_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & abs_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::abs_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::absolute(Tensor self) -> Tensor
+    inline at::Tensor absolute(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::absolute::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::absolute_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & absolute_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::absolute_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::absolute.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & absolute_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::absolute_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::absolute.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & absolute_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::absolute_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::angle(Tensor self) -> Tensor
+    inline at::Tensor angle(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::angle::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::angle.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & angle_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::angle_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::angle.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & angle_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::angle_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::view_as_real(Tensor(a) self) -> Tensor(a)
+    inline at::Tensor view_as_real(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::view_as_real::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::view_as_complex(Tensor(a) self) -> Tensor(a)
+    inline at::Tensor view_as_complex(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::view_as_complex::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::sgn(Tensor self) -> Tensor
+    inline at::Tensor sgn(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::sgn::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::sgn_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & sgn_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::sgn_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::sgn.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & sgn_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::sgn_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::sgn.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & sgn_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::sgn_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::chalf(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor chalf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::chalf::redispatch(dispatchKeySet, self, memory_format);
+    }
+    
+    // aten::real(Tensor(a) self) -> Tensor(a)
+    inline at::Tensor real(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::real::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::imag(Tensor(a) self) -> Tensor(a)
+    inline at::Tensor imag(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::imag::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_conj(Tensor(a) self) -> Tensor(a)
+    inline at::Tensor _conj(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_conj::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::conj(Tensor(a) self) -> Tensor(a)
+    inline at::Tensor __dispatch_conj(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::conj::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_conj_physical(Tensor self) -> Tensor
+    inline at::Tensor _conj_physical(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_conj_physical::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::conj_physical(Tensor self) -> Tensor
+    inline at::Tensor conj_physical(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::conj_physical::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::conj_physical.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & conj_physical_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::conj_physical_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::conj_physical.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & conj_physical_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::conj_physical_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::conj_physical_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & conj_physical_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::conj_physical_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::resolve_conj(Tensor(a) self) -> Tensor(a)
+    inline at::Tensor resolve_conj(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::resolve_conj::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::resolve_neg(Tensor(a) self) -> Tensor(a)
+    inline at::Tensor resolve_neg(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::resolve_neg::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_neg_view(Tensor(a) self) -> Tensor(a)
+    inline at::Tensor _neg_view(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_neg_view::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::acos(Tensor self) -> Tensor
+    inline at::Tensor acos(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::acos::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::acos_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & acos_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::acos_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::acos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & acos_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::acos_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::acos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & acos_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::acos_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::arccos(Tensor self) -> Tensor
+    inline at::Tensor arccos(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::arccos::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::arccos_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & arccos_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::arccos_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::arccos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & arccos_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::arccos_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::arccos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & arccos_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::arccos_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::avg_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, bool ceil_mode=False, bool count_include_pad=True) -> Tensor
+    inline at::Tensor avg_pool1d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, bool ceil_mode=false, bool count_include_pad=true) {
+        return at::_ops::avg_pool1d::redispatch(dispatchKeySet, self, kernel_size, stride, padding, ceil_mode, count_include_pad);
+    }
+    
+    // aten::adaptive_avg_pool1d(Tensor self, int[1] output_size) -> Tensor
+    inline at::Tensor adaptive_avg_pool1d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size) {
+        return at::_ops::adaptive_avg_pool1d::redispatch(dispatchKeySet, self, output_size);
+    }
+    
+    // aten::adaptive_max_pool1d(Tensor self, int[1] output_size) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> adaptive_max_pool1d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size) {
+        return at::_ops::adaptive_max_pool1d::redispatch(dispatchKeySet, self, output_size);
+    }
+    
+    // aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
+    inline at::Tensor add(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1) {
+        return at::_ops::add_Tensor::redispatch(dispatchKeySet, self, other, alpha);
+    }
+    
+    // aten::add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
+    inline at::Tensor & add_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1) {
+        return at::_ops::add__Tensor::redispatch(dispatchKeySet, self, other, alpha);
+    }
+    
+    // aten::add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & add_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1) {
+        return at::_ops::add_out::redispatch(dispatchKeySet, self, other, alpha, out);
+    }
+    
+    // aten::add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & add_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha, at::Tensor & out) {
+        return at::_ops::add_out::redispatch(dispatchKeySet, self, other, alpha, out);
+    }
+    
+    // aten::_add_relu.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
+    inline at::Tensor _add_relu(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1) {
+        return at::_ops::_add_relu_Tensor::redispatch(dispatchKeySet, self, other, alpha);
+    }
+    
+    // aten::_add_relu_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
+    inline at::Tensor & _add_relu_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1) {
+        return at::_ops::_add_relu__Tensor::redispatch(dispatchKeySet, self, other, alpha);
+    }
+    
+    // aten::_add_relu.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _add_relu_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1) {
+        return at::_ops::_add_relu_out::redispatch(dispatchKeySet, self, other, alpha, out);
+    }
+    
+    // aten::_add_relu.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _add_relu_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha, at::Tensor & out) {
+        return at::_ops::_add_relu_out::redispatch(dispatchKeySet, self, other, alpha, out);
+    }
+    
+    // aten::_add_relu.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
+    inline at::Tensor _add_relu(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha=1) {
+        return at::_ops::_add_relu_Scalar::redispatch(dispatchKeySet, self, other, alpha);
+    }
+    
+    // aten::_add_relu_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)
+    inline at::Tensor & _add_relu_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha=1) {
+        return at::_ops::_add_relu__Scalar::redispatch(dispatchKeySet, self, other, alpha);
+    }
+    
+    // aten::add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
+    inline at::Tensor add(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha=1) {
+        return at::_ops::add_Scalar::redispatch(dispatchKeySet, self, other, alpha);
+    }
+    
+    // aten::add_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)
+    inline at::Tensor & add_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha=1) {
+        return at::_ops::add__Scalar::redispatch(dispatchKeySet, self, other, alpha);
+    }
+    
+    // aten::addmv(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor
+    inline at::Tensor addmv(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat, const at::Tensor & vec, const at::Scalar & beta=1, const at::Scalar & alpha=1) {
+        return at::_ops::addmv::redispatch(dispatchKeySet, self, mat, vec, beta, alpha);
+    }
+    
+    // aten::addmv_(Tensor(a!) self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
+    inline at::Tensor & addmv_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & mat, const at::Tensor & vec, const at::Scalar & beta=1, const at::Scalar & alpha=1) {
+        return at::_ops::addmv_::redispatch(dispatchKeySet, self, mat, vec, beta, alpha);
+    }
+    
+    // aten::addmv.out(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & addmv_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & mat, const at::Tensor & vec, const at::Scalar & beta=1, const at::Scalar & alpha=1) {
+        return at::_ops::addmv_out::redispatch(dispatchKeySet, self, mat, vec, beta, alpha, out);
+    }
+    
+    // aten::addmv.out(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & addmv_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat, const at::Tensor & vec, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out) {
+        return at::_ops::addmv_out::redispatch(dispatchKeySet, self, mat, vec, beta, alpha, out);
+    }
+    
+    // aten::addr(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
+    inline at::Tensor addr(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & vec1, const at::Tensor & vec2, const at::Scalar & beta=1, const at::Scalar & alpha=1) {
+        return at::_ops::addr::redispatch(dispatchKeySet, self, vec1, vec2, beta, alpha);
+    }
+    
+    // aten::addr_(Tensor(a!) self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
+    inline at::Tensor & addr_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & vec1, const at::Tensor & vec2, const at::Scalar & beta=1, const at::Scalar & alpha=1) {
+        return at::_ops::addr_::redispatch(dispatchKeySet, self, vec1, vec2, beta, alpha);
+    }
+    
+    // aten::addr.out(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & addr_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & vec1, const at::Tensor & vec2, const at::Scalar & beta=1, const at::Scalar & alpha=1) {
+        return at::_ops::addr_out::redispatch(dispatchKeySet, self, vec1, vec2, beta, alpha, out);
+    }
+    
+    // aten::addr.out(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & addr_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & vec1, const at::Tensor & vec2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out) {
+        return at::_ops::addr_out::redispatch(dispatchKeySet, self, vec1, vec2, beta, alpha, out);
+    }
+    
+    // aten::affine_grid_generator(Tensor theta, SymInt[] size, bool align_corners) -> Tensor
+    inline at::Tensor affine_grid_generator(c10::DispatchKeySet dispatchKeySet, const at::Tensor & theta, at::IntArrayRef size, bool align_corners) {
+        return at::_ops::affine_grid_generator::redispatch(dispatchKeySet, theta, c10::fromIntArrayRefSlow(size), align_corners);
+    }
+    
+    // aten::affine_grid_generator(Tensor theta, SymInt[] size, bool align_corners) -> Tensor
+    inline at::Tensor affine_grid_generator_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & theta, c10::SymIntArrayRef size, bool align_corners) {
+        return at::_ops::affine_grid_generator::redispatch(dispatchKeySet, theta, size, align_corners);
+    }
+    
+    // aten::affine_grid_generator_backward(Tensor grad, SymInt[] size, bool align_corners) -> Tensor
+    inline at::Tensor affine_grid_generator_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, at::IntArrayRef size, bool align_corners) {
+        return at::_ops::affine_grid_generator_backward::redispatch(dispatchKeySet, grad, c10::fromIntArrayRefSlow(size), align_corners);
+    }
+    
+    // aten::affine_grid_generator_backward(Tensor grad, SymInt[] size, bool align_corners) -> Tensor
+    inline at::Tensor affine_grid_generator_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, c10::SymIntArrayRef size, bool align_corners) {
+        return at::_ops::affine_grid_generator_backward::redispatch(dispatchKeySet, grad, size, align_corners);
+    }
+    
+    // aten::_is_all_true(Tensor self) -> Tensor
+    inline at::Tensor _is_all_true(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_is_all_true::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_is_any_true(Tensor self) -> Tensor
+    inline at::Tensor _is_any_true(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_is_any_true::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_test_check_tensor(Tensor self) -> Tensor
+    inline at::Tensor _test_check_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_test_check_tensor::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_test_functorch_fallback(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor _test_functorch_fallback(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::_test_functorch_fallback::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::all.dim(Tensor self, int dim, bool keepdim=False) -> Tensor
+    inline at::Tensor all(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool keepdim=false) {
+        return at::_ops::all_dim::redispatch(dispatchKeySet, self, dim, keepdim);
+    }
+    
+    // aten::all.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor
+    inline at::Tensor all(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim=false) {
+        return at::_ops::all_dims::redispatch(dispatchKeySet, self, dim, keepdim);
+    }
+    
+    // aten::all.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & all_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim, bool keepdim=false) {
+        return at::_ops::all_out::redispatch(dispatchKeySet, self, dim, keepdim, out);
+    }
+    
+    // aten::all.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & all_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool keepdim, at::Tensor & out) {
+        return at::_ops::all_out::redispatch(dispatchKeySet, self, dim, keepdim, out);
+    }
+    
+    // aten::all.dims_out(Tensor self, int[]? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & all_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim=false) {
+        return at::_ops::all_dims_out::redispatch(dispatchKeySet, self, dim, keepdim, out);
+    }
+    
+    // aten::all.dims_out(Tensor self, int[]? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & all_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim, at::Tensor & out) {
+        return at::_ops::all_dims_out::redispatch(dispatchKeySet, self, dim, keepdim, out);
+    }
+    
+    // aten::all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor
+    inline at::Tensor all(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool keepdim=false) {
+        return at::_ops::all_dimname::redispatch(dispatchKeySet, self, dim, keepdim);
+    }
+    
+    // aten::all.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & all_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::Dimname dim, bool keepdim=false) {
+        return at::_ops::all_dimname_out::redispatch(dispatchKeySet, self, dim, keepdim, out);
+    }
+    
+    // aten::all.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & all_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool keepdim, at::Tensor & out) {
+        return at::_ops::all_dimname_out::redispatch(dispatchKeySet, self, dim, keepdim, out);
+    }
+    
+    // aten::allclose(Tensor self, Tensor other, float rtol=1e-05, float atol=1e-08, bool equal_nan=False) -> bool
+    inline bool allclose(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, double rtol=1e-05, double atol=1e-08, bool equal_nan=false) {
+        return at::_ops::allclose::redispatch(dispatchKeySet, self, other, rtol, atol, equal_nan);
+    }
+    
+    // aten::any.dim(Tensor self, int dim, bool keepdim=False) -> Tensor
+    inline at::Tensor any(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool keepdim=false) {
+        return at::_ops::any_dim::redispatch(dispatchKeySet, self, dim, keepdim);
+    }
+    
+    // aten::any.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor
+    inline at::Tensor any(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim=false) {
+        return at::_ops::any_dims::redispatch(dispatchKeySet, self, dim, keepdim);
+    }
+    
+    // aten::any.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & any_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim, bool keepdim=false) {
+        return at::_ops::any_out::redispatch(dispatchKeySet, self, dim, keepdim, out);
+    }
+    
+    // aten::any.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & any_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool keepdim, at::Tensor & out) {
+        return at::_ops::any_out::redispatch(dispatchKeySet, self, dim, keepdim, out);
+    }
+    
+    // aten::any.dims_out(Tensor self, int[]? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & any_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim=false) {
+        return at::_ops::any_dims_out::redispatch(dispatchKeySet, self, dim, keepdim, out);
+    }
+    
+    // aten::any.dims_out(Tensor self, int[]? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & any_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim, at::Tensor & out) {
+        return at::_ops::any_dims_out::redispatch(dispatchKeySet, self, dim, keepdim, out);
+    }
+    
+    // aten::any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor
+    inline at::Tensor any(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool keepdim=false) {
+        return at::_ops::any_dimname::redispatch(dispatchKeySet, self, dim, keepdim);
+    }
+    
+    // aten::any.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & any_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::Dimname dim, bool keepdim=false) {
+        return at::_ops::any_dimname_out::redispatch(dispatchKeySet, self, dim, keepdim, out);
+    }
+    
+    // aten::any.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & any_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool keepdim, at::Tensor & out) {
+        return at::_ops::any_dimname_out::redispatch(dispatchKeySet, self, dim, keepdim, out);
+    }
+    
+    // aten::arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor arange(c10::DispatchKeySet dispatchKeySet, const at::Scalar & end, at::TensorOptions options={}) {
+        return at::_ops::arange::redispatch(dispatchKeySet, end, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor arange(c10::DispatchKeySet dispatchKeySet, const at::Scalar & end, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::arange::redispatch(dispatchKeySet, end, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor arange(c10::DispatchKeySet dispatchKeySet, const at::Scalar & start, const at::Scalar & end, at::TensorOptions options={}) {
+        return at::_ops::arange_start::redispatch(dispatchKeySet, start, end, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor arange(c10::DispatchKeySet dispatchKeySet, const at::Scalar & start, const at::Scalar & end, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::arange_start::redispatch(dispatchKeySet, start, end, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::arange.start_step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor arange(c10::DispatchKeySet dispatchKeySet, const at::Scalar & start, const at::Scalar & end, const at::Scalar & step, at::TensorOptions options={}) {
+        return at::_ops::arange_start_step::redispatch(dispatchKeySet, start, end, step, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::arange.start_step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor arange(c10::DispatchKeySet dispatchKeySet, const at::Scalar & start, const at::Scalar & end, const at::Scalar & step, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::arange_start_step::redispatch(dispatchKeySet, start, end, step, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & arange_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & end) {
+        return at::_ops::arange_out::redispatch(dispatchKeySet, end, out);
+    }
+    
+    // aten::arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & arange_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & end, at::Tensor & out) {
+        return at::_ops::arange_out::redispatch(dispatchKeySet, end, out);
+    }
+    
+    // aten::arange.start_out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & arange_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & start, const at::Scalar & end, const at::Scalar & step) {
+        return at::_ops::arange_start_out::redispatch(dispatchKeySet, start, end, step, out);
+    }
+    
+    // aten::arange.start_out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & arange_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & start, const at::Scalar & end, const at::Scalar & step, at::Tensor & out) {
+        return at::_ops::arange_start_out::redispatch(dispatchKeySet, start, end, step, out);
+    }
+    
+    // aten::_dim_arange(Tensor like, int dim) -> Tensor
+    inline at::Tensor _dim_arange(c10::DispatchKeySet dispatchKeySet, const at::Tensor & like, int64_t dim) {
+        return at::_ops::_dim_arange::redispatch(dispatchKeySet, like, dim);
+    }
+    
+    // aten::argmax(Tensor self, int? dim=None, bool keepdim=False) -> Tensor
+    inline at::Tensor argmax(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<int64_t> dim=c10::nullopt, bool keepdim=false) {
+        return at::_ops::argmax::redispatch(dispatchKeySet, self, dim, keepdim);
+    }
+    
+    // aten::argmax.out(Tensor self, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & argmax_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<int64_t> dim=c10::nullopt, bool keepdim=false) {
+        return at::_ops::argmax_out::redispatch(dispatchKeySet, self, dim, keepdim, out);
+    }
+    
+    // aten::argmax.out(Tensor self, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & argmax_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<int64_t> dim, bool keepdim, at::Tensor & out) {
+        return at::_ops::argmax_out::redispatch(dispatchKeySet, self, dim, keepdim, out);
+    }
+    
+    // aten::argmin(Tensor self, int? dim=None, bool keepdim=False) -> Tensor
+    inline at::Tensor argmin(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<int64_t> dim=c10::nullopt, bool keepdim=false) {
+        return at::_ops::argmin::redispatch(dispatchKeySet, self, dim, keepdim);
+    }
+    
+    // aten::argmin.out(Tensor self, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & argmin_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<int64_t> dim=c10::nullopt, bool keepdim=false) {
+        return at::_ops::argmin_out::redispatch(dispatchKeySet, self, dim, keepdim, out);
+    }
+    
+    // aten::argmin.out(Tensor self, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & argmin_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<int64_t> dim, bool keepdim, at::Tensor & out) {
+        return at::_ops::argmin_out::redispatch(dispatchKeySet, self, dim, keepdim, out);
+    }
+    
+    // aten::acosh(Tensor self) -> Tensor
+    inline at::Tensor acosh(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::acosh::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::acosh_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & acosh_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::acosh_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::acosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & acosh_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::acosh_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::acosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & acosh_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::acosh_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::arccosh(Tensor self) -> Tensor
+    inline at::Tensor arccosh(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::arccosh::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::arccosh_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & arccosh_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::arccosh_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::arccosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & arccosh_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::arccosh_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::arccosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & arccosh_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::arccosh_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::asinh(Tensor self) -> Tensor
+    inline at::Tensor asinh(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::asinh::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::asinh_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & asinh_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::asinh_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::asinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & asinh_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::asinh_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::asinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & asinh_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::asinh_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::arcsinh(Tensor self) -> Tensor
+    inline at::Tensor arcsinh(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::arcsinh::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::arcsinh_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & arcsinh_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::arcsinh_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::arcsinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & arcsinh_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::arcsinh_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::arcsinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & arcsinh_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::arcsinh_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::atanh(Tensor self) -> Tensor
+    inline at::Tensor atanh(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::atanh::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::atanh_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & atanh_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::atanh_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::atanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & atanh_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::atanh_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::atanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & atanh_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::atanh_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::arctanh(Tensor self) -> Tensor
+    inline at::Tensor arctanh(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::arctanh::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::arctanh_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & arctanh_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::arctanh_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::arctanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & arctanh_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::arctanh_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::arctanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & arctanh_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::arctanh_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::as_strided(Tensor(a) self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor(a)
+    inline at::Tensor as_strided(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride, c10::optional<int64_t> storage_offset=c10::nullopt) {
+        return at::_ops::as_strided::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), storage_offset.has_value() ? c10::make_optional(c10::SymInt(*storage_offset)) : c10::nullopt);
+    }
+    
+    // aten::as_strided(Tensor(a) self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor(a)
+    inline at::Tensor as_strided_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional<c10::SymInt> storage_offset=c10::nullopt) {
+        return at::_ops::as_strided::redispatch(dispatchKeySet, self, size, stride, storage_offset);
+    }
+    
+    // aten::as_strided_(Tensor(a!) self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor(a!)
+    inline const at::Tensor & as_strided_(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride, c10::optional<int64_t> storage_offset=c10::nullopt) {
+        return at::_ops::as_strided_::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), storage_offset.has_value() ? c10::make_optional(c10::SymInt(*storage_offset)) : c10::nullopt);
+    }
+    
+    // aten::as_strided_(Tensor(a!) self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor(a!)
+    inline const at::Tensor & as_strided__symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional<c10::SymInt> storage_offset=c10::nullopt) {
+        return at::_ops::as_strided_::redispatch(dispatchKeySet, self, size, stride, storage_offset);
+    }
+    
+    // aten::asin(Tensor self) -> Tensor
+    inline at::Tensor asin(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::asin::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::asin_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & asin_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::asin_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::asin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & asin_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::asin_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::asin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & asin_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::asin_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::arcsin(Tensor self) -> Tensor
+    inline at::Tensor arcsin(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::arcsin::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::arcsin_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & arcsin_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::arcsin_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::arcsin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & arcsin_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::arcsin_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::arcsin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & arcsin_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::arcsin_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::atan(Tensor self) -> Tensor
+    inline at::Tensor atan(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::atan::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::atan_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & atan_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::atan_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::atan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & atan_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::atan_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::atan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & atan_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::atan_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::arctan(Tensor self) -> Tensor
+    inline at::Tensor arctan(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::arctan::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::arctan_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & arctan_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::arctan_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::arctan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & arctan_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::arctan_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::arctan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & arctan_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::arctan_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::atleast_1d(Tensor self) -> Tensor
+    inline at::Tensor atleast_1d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::atleast_1d::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::atleast_1d.Sequence(Tensor[] tensors) -> Tensor[]
+    inline ::std::vector<at::Tensor> atleast_1d(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors) {
+        return at::_ops::atleast_1d_Sequence::redispatch(dispatchKeySet, tensors);
+    }
+    
+    // aten::atleast_2d(Tensor self) -> Tensor
+    inline at::Tensor atleast_2d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::atleast_2d::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::atleast_2d.Sequence(Tensor[] tensors) -> Tensor[]
+    inline ::std::vector<at::Tensor> atleast_2d(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors) {
+        return at::_ops::atleast_2d_Sequence::redispatch(dispatchKeySet, tensors);
+    }
+    
+    // aten::atleast_3d(Tensor self) -> Tensor
+    inline at::Tensor atleast_3d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::atleast_3d::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::atleast_3d.Sequence(Tensor[] tensors) -> Tensor[]
+    inline ::std::vector<at::Tensor> atleast_3d(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors) {
+        return at::_ops::atleast_3d_Sequence::redispatch(dispatchKeySet, tensors);
+    }
+    
+    // aten::baddbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
+    inline at::Tensor baddbmm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta=1, const at::Scalar & alpha=1) {
+        return at::_ops::baddbmm::redispatch(dispatchKeySet, self, batch1, batch2, beta, alpha);
+    }
+    
+    // aten::baddbmm_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
+    inline at::Tensor & baddbmm_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta=1, const at::Scalar & alpha=1) {
+        return at::_ops::baddbmm_::redispatch(dispatchKeySet, self, batch1, batch2, beta, alpha);
+    }
+    
+    // aten::baddbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & baddbmm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta=1, const at::Scalar & alpha=1) {
+        return at::_ops::baddbmm_out::redispatch(dispatchKeySet, self, batch1, batch2, beta, alpha, out);
+    }
+    
+    // aten::baddbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & baddbmm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out) {
+        return at::_ops::baddbmm_out::redispatch(dispatchKeySet, self, batch1, batch2, beta, alpha, out);
+    }
+    
+    // aten::bartlett_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor bartlett_window(c10::DispatchKeySet dispatchKeySet, int64_t window_length, at::TensorOptions options={}) {
+        return at::_ops::bartlett_window::redispatch(dispatchKeySet, window_length, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::bartlett_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor bartlett_window(c10::DispatchKeySet dispatchKeySet, int64_t window_length, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::bartlett_window::redispatch(dispatchKeySet, window_length, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::bartlett_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor bartlett_window(c10::DispatchKeySet dispatchKeySet, int64_t window_length, bool periodic, at::TensorOptions options={}) {
+        return at::_ops::bartlett_window_periodic::redispatch(dispatchKeySet, window_length, periodic, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::bartlett_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor bartlett_window(c10::DispatchKeySet dispatchKeySet, int64_t window_length, bool periodic, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::bartlett_window_periodic::redispatch(dispatchKeySet, window_length, periodic, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, bool cudnn_enabled) -> Tensor
+    inline at::Tensor batch_norm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, bool training, double momentum, double eps, bool cudnn_enabled) {
+        return at::_ops::batch_norm::redispatch(dispatchKeySet, input, weight, bias, running_mean, running_var, training, momentum, eps, cudnn_enabled);
+    }
+    
+    // aten::quantized_batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor
+    inline at::Tensor quantized_batch_norm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, const at::Tensor & mean, const at::Tensor & var, double eps, double output_scale, int64_t output_zero_point) {
+        return at::_ops::quantized_batch_norm::redispatch(dispatchKeySet, input, weight, bias, mean, var, eps, output_scale, output_zero_point);
+    }
+    
+    // aten::_batch_norm_impl_index(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, bool cudnn_enabled) -> (Tensor, Tensor, Tensor, Tensor, int)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,int64_t> _batch_norm_impl_index(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, bool training, double momentum, double eps, bool cudnn_enabled) {
+        return at::_ops::_batch_norm_impl_index::redispatch(dispatchKeySet, input, weight, bias, running_mean, running_var, training, momentum, eps, cudnn_enabled);
+    }
+    
+    // aten::_batch_norm_impl_index_backward(int impl_index, Tensor input, Tensor grad_output, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var_transform, bool train, float eps, bool[3] output_mask, Tensor reservedSpace) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _batch_norm_impl_index_backward(c10::DispatchKeySet dispatchKeySet, int64_t impl_index, const at::Tensor & input, const at::Tensor & grad_output, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, const c10::optional<at::Tensor> & save_mean, const c10::optional<at::Tensor> & save_var_transform, bool train, double eps, ::std::array<bool,3> output_mask, const at::Tensor & reservedSpace) {
+        return at::_ops::_batch_norm_impl_index_backward::redispatch(dispatchKeySet, impl_index, input, grad_output, weight, running_mean, running_var, save_mean, save_var_transform, train, eps, output_mask, reservedSpace);
+    }
+    
+    // aten::bernoulli(Tensor self, *, Generator? generator=None) -> Tensor
+    inline at::Tensor bernoulli(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::bernoulli::redispatch(dispatchKeySet, self, generator);
+    }
+    
+    // aten::bernoulli.out(Tensor self, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bernoulli_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::bernoulli_out::redispatch(dispatchKeySet, self, generator, out);
+    }
+    
+    // aten::bernoulli.out(Tensor self, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bernoulli_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::bernoulli_out::redispatch(dispatchKeySet, self, generator, out);
+    }
+    
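+    // Note: the generated `_out` wrappers move the out tensor to the front so the trailing
+    // arguments can keep their schema defaults, whereas the `_outf` wrappers preserve the
+    // schema order (out last) and take every argument explicitly; both forward to the same
+    // at::_ops::*_out::redispatch entry point.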
+    // aten::bernoulli_.Tensor(Tensor(a!) self, Tensor p, *, Generator? generator=None) -> Tensor(a!)
+    inline at::Tensor & bernoulli_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & p, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::bernoulli__Tensor::redispatch(dispatchKeySet, self, p, generator);
+    }
+    
+    // aten::bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!)
+    inline at::Tensor & bernoulli_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, double p=0.5, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::bernoulli__float::redispatch(dispatchKeySet, self, p, generator);
+    }
+    
+    // aten::bernoulli.p(Tensor self, float p, *, Generator? generator=None) -> Tensor
+    inline at::Tensor bernoulli(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double p, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::bernoulli_p::redispatch(dispatchKeySet, self, p, generator);
+    }
+    
+    // aten::bilinear(Tensor input1, Tensor input2, Tensor weight, Tensor? bias=None) -> Tensor
+    inline at::Tensor bilinear(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input1, const at::Tensor & input2, const at::Tensor & weight, const c10::optional<at::Tensor> & bias={}) {
+        return at::_ops::bilinear::redispatch(dispatchKeySet, input1, input2, weight, bias);
+    }
+    
+    // aten::binary_cross_entropy(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean) -> Tensor
+    inline at::Tensor binary_cross_entropy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight={}, int64_t reduction=at::Reduction::Mean) {
+        return at::_ops::binary_cross_entropy::redispatch(dispatchKeySet, self, target, weight, reduction);
+    }
+    
+    // aten::binary_cross_entropy.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & binary_cross_entropy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight={}, int64_t reduction=at::Reduction::Mean) {
+        return at::_ops::binary_cross_entropy_out::redispatch(dispatchKeySet, self, target, weight, reduction, out);
+    }
+    
+    // aten::binary_cross_entropy.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & binary_cross_entropy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, at::Tensor & out) {
+        return at::_ops::binary_cross_entropy_out::redispatch(dispatchKeySet, self, target, weight, reduction, out);
+    }
+    
+    // aten::binary_cross_entropy_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean) -> Tensor
+    inline at::Tensor binary_cross_entropy_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight={}, int64_t reduction=at::Reduction::Mean) {
+        return at::_ops::binary_cross_entropy_backward::redispatch(dispatchKeySet, grad_output, self, target, weight, reduction);
+    }
+    
+    // aten::binary_cross_entropy_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & binary_cross_entropy_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight={}, int64_t reduction=at::Reduction::Mean) {
+        return at::_ops::binary_cross_entropy_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, target, weight, reduction, grad_input);
+    }
+    
+    // aten::binary_cross_entropy_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & binary_cross_entropy_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, at::Tensor & grad_input) {
+        return at::_ops::binary_cross_entropy_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, target, weight, reduction, grad_input);
+    }
+    
+    // aten::binary_cross_entropy_with_logits(Tensor self, Tensor target, Tensor? weight=None, Tensor? pos_weight=None, int reduction=Mean) -> Tensor
+    inline at::Tensor binary_cross_entropy_with_logits(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight={}, const c10::optional<at::Tensor> & pos_weight={}, int64_t reduction=at::Reduction::Mean) {
+        return at::_ops::binary_cross_entropy_with_logits::redispatch(dispatchKeySet, self, target, weight, pos_weight, reduction);
+    }
+    
+    // aten::bincount(Tensor self, Tensor? weights=None, int minlength=0) -> Tensor
+    inline at::Tensor bincount(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Tensor> & weights={}, int64_t minlength=0) {
+        return at::_ops::bincount::redispatch(dispatchKeySet, self, weights, minlength);
+    }
+    
+    // aten::bitwise_not(Tensor self) -> Tensor
+    inline at::Tensor bitwise_not(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::bitwise_not::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::bitwise_not_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & bitwise_not_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::bitwise_not_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::bitwise_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bitwise_not_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::bitwise_not_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::bitwise_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bitwise_not_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::bitwise_not_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::copysign.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & copysign_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::copysign_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::copysign.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & copysign_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::copysign_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::copysign.Tensor(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor copysign(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::copysign_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::copysign_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & copysign_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::copysign__Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::copysign.Scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor copysign(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::copysign_Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::copysign_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+    inline at::Tensor & copysign_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::copysign__Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::copysign.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & copysign_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::copysign_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::copysign.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & copysign_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out) {
+        return at::_ops::copysign_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::_lazy_clone(Tensor self) -> Tensor
+    inline at::Tensor _lazy_clone(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_lazy_clone::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::logical_not(Tensor self) -> Tensor
+    inline at::Tensor logical_not(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::logical_not::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::logical_not_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & logical_not_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::logical_not_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::logical_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & logical_not_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::logical_not_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::logical_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & logical_not_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::logical_not_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::logical_xor(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor logical_xor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::logical_xor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::logical_xor_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & logical_xor_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::logical_xor_::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::logical_xor.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & logical_xor_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::logical_xor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::logical_xor.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & logical_xor_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::logical_xor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::logical_and(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor logical_and(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::logical_and::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::logical_and_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & logical_and_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::logical_and_::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::logical_and.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & logical_and_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::logical_and_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::logical_and.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & logical_and_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::logical_and_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::logical_or(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor logical_or(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::logical_or::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::logical_or_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & logical_or_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::logical_or_::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::logical_or.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & logical_or_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::logical_or_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::logical_or.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & logical_or_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::logical_or_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::blackman_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor blackman_window(c10::DispatchKeySet dispatchKeySet, int64_t window_length, at::TensorOptions options={}) {
+        return at::_ops::blackman_window::redispatch(dispatchKeySet, window_length, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::blackman_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor blackman_window(c10::DispatchKeySet dispatchKeySet, int64_t window_length, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::blackman_window::redispatch(dispatchKeySet, window_length, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::blackman_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor blackman_window(c10::DispatchKeySet dispatchKeySet, int64_t window_length, bool periodic, at::TensorOptions options={}) {
+        return at::_ops::blackman_window_periodic::redispatch(dispatchKeySet, window_length, periodic, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::blackman_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor blackman_window(c10::DispatchKeySet dispatchKeySet, int64_t window_length, bool periodic, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::blackman_window_periodic::redispatch(dispatchKeySet, window_length, periodic, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::bmm(Tensor self, Tensor mat2) -> Tensor
+    inline at::Tensor bmm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat2) {
+        return at::_ops::bmm::redispatch(dispatchKeySet, self, mat2);
+    }
+    
+    // aten::bmm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bmm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & mat2) {
+        return at::_ops::bmm_out::redispatch(dispatchKeySet, self, mat2, out);
+    }
+    
+    // aten::bmm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bmm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat2, at::Tensor & out) {
+        return at::_ops::bmm_out::redispatch(dispatchKeySet, self, mat2, out);
+    }
+    
+    // aten::broadcast_tensors(Tensor[] tensors) -> Tensor[]
+    inline ::std::vector<at::Tensor> broadcast_tensors(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors) {
+        return at::_ops::broadcast_tensors::redispatch(dispatchKeySet, tensors);
+    }
+    
+    // aten::broadcast_to(Tensor(a) self, SymInt[] size) -> Tensor(a)
+    inline at::Tensor broadcast_to(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size) {
+        return at::_ops::broadcast_to::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size));
+    }
+    
+    // aten::broadcast_to(Tensor(a) self, SymInt[] size) -> Tensor(a)
+    inline at::Tensor broadcast_to_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size) {
+        return at::_ops::broadcast_to::redispatch(dispatchKeySet, self, size);
+    }
+    
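+    // Note: schemas declared with SymInt[] arguments generate two wrappers: the plain one
+    // accepts at::IntArrayRef and widens it via c10::fromIntArrayRefSlow, while the
+    // `_symint` variant forwards a c10::SymIntArrayRef unchanged so symbolic shape
+    // information is preserved across the redispatch.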
+    // aten::_sparse_broadcast_to(Tensor(a) self, int[] size) -> Tensor(a)
+    inline at::Tensor _sparse_broadcast_to(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size) {
+        return at::_ops::_sparse_broadcast_to::redispatch(dispatchKeySet, self, size);
+    }
+    
+    // aten::cat(Tensor[] tensors, int dim=0) -> Tensor
+    inline at::Tensor cat(c10::DispatchKeySet dispatchKeySet, const at::ITensorListRef & tensors, int64_t dim=0) {
+        return at::_ops::cat::redispatch(dispatchKeySet, tensors, dim);
+    }
+    
+    // aten::cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cat_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::ITensorListRef & tensors, int64_t dim=0) {
+        return at::_ops::cat_out::redispatch(dispatchKeySet, tensors, dim, out);
+    }
+    
+    // aten::cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cat_outf(c10::DispatchKeySet dispatchKeySet, const at::ITensorListRef & tensors, int64_t dim, at::Tensor & out) {
+        return at::_ops::cat_out::redispatch(dispatchKeySet, tensors, dim, out);
+    }
+    
+    // aten::cat.names(Tensor[] tensors, Dimname dim) -> Tensor
+    inline at::Tensor cat(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, at::Dimname dim) {
+        return at::_ops::cat_names::redispatch(dispatchKeySet, tensors, dim);
+    }
+    
+    // aten::cat.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cat_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::TensorList tensors, at::Dimname dim) {
+        return at::_ops::cat_names_out::redispatch(dispatchKeySet, tensors, dim, out);
+    }
+    
+    // aten::cat.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cat_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, at::Dimname dim, at::Tensor & out) {
+        return at::_ops::cat_names_out::redispatch(dispatchKeySet, tensors, dim, out);
+    }
+    
+    // aten::concat(Tensor[] tensors, int dim=0) -> Tensor
+    inline at::Tensor concat(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, int64_t dim=0) {
+        return at::_ops::concat::redispatch(dispatchKeySet, tensors, dim);
+    }
+    
+    // aten::concat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & concat_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::TensorList tensors, int64_t dim=0) {
+        return at::_ops::concat_out::redispatch(dispatchKeySet, tensors, dim, out);
+    }
+    
+    // aten::concat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & concat_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, int64_t dim, at::Tensor & out) {
+        return at::_ops::concat_out::redispatch(dispatchKeySet, tensors, dim, out);
+    }
+    
+    // aten::concat.names(Tensor[] tensors, Dimname dim) -> Tensor
+    inline at::Tensor concat(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, at::Dimname dim) {
+        return at::_ops::concat_names::redispatch(dispatchKeySet, tensors, dim);
+    }
+    
+    // aten::concat.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & concat_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::TensorList tensors, at::Dimname dim) {
+        return at::_ops::concat_names_out::redispatch(dispatchKeySet, tensors, dim, out);
+    }
+    
+    // aten::concat.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & concat_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, at::Dimname dim, at::Tensor & out) {
+        return at::_ops::concat_names_out::redispatch(dispatchKeySet, tensors, dim, out);
+    }
+    
+    // aten::concatenate(Tensor[] tensors, int dim=0) -> Tensor
+    inline at::Tensor concatenate(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, int64_t dim=0) {
+        return at::_ops::concatenate::redispatch(dispatchKeySet, tensors, dim);
+    }
+    
+    // aten::concatenate.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & concatenate_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::TensorList tensors, int64_t dim=0) {
+        return at::_ops::concatenate_out::redispatch(dispatchKeySet, tensors, dim, out);
+    }
+    
+    // aten::concatenate.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & concatenate_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, int64_t dim, at::Tensor & out) {
+        return at::_ops::concatenate_out::redispatch(dispatchKeySet, tensors, dim, out);
+    }
+    
+    // aten::concatenate.names(Tensor[] tensors, Dimname dim) -> Tensor
+    inline at::Tensor concatenate(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, at::Dimname dim) {
+        return at::_ops::concatenate_names::redispatch(dispatchKeySet, tensors, dim);
+    }
+    
+    // aten::concatenate.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & concatenate_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::TensorList tensors, at::Dimname dim) {
+        return at::_ops::concatenate_names_out::redispatch(dispatchKeySet, tensors, dim, out);
+    }
+    
+    // aten::concatenate.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & concatenate_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, at::Dimname dim, at::Tensor & out) {
+        return at::_ops::concatenate_names_out::redispatch(dispatchKeySet, tensors, dim, out);
+    }
+    
+    // aten::block_diag(Tensor[] tensors) -> Tensor
+    inline at::Tensor block_diag(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors) {
+        return at::_ops::block_diag::redispatch(dispatchKeySet, tensors);
+    }
+    
+    // aten::ceil(Tensor self) -> Tensor
+    inline at::Tensor ceil(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::ceil::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::ceil_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & ceil_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::ceil_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::ceil.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & ceil_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::ceil_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::ceil.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & ceil_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::ceil_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::chain_matmul(Tensor[] matrices) -> Tensor
+    inline at::Tensor chain_matmul(c10::DispatchKeySet dispatchKeySet, at::TensorList matrices) {
+        return at::_ops::chain_matmul::redispatch(dispatchKeySet, matrices);
+    }
+    
+    // aten::chain_matmul.out(Tensor[] matrices, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & chain_matmul_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::TensorList matrices) {
+        return at::_ops::chain_matmul_out::redispatch(dispatchKeySet, matrices, out);
+    }
+    
+    // aten::chain_matmul.out(Tensor[] matrices, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & chain_matmul_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList matrices, at::Tensor & out) {
+        return at::_ops::chain_matmul_out::redispatch(dispatchKeySet, matrices, out);
+    }
+    
+    // aten::unsafe_chunk(Tensor self, int chunks, int dim=0) -> Tensor[]
+    inline ::std::vector<at::Tensor> unsafe_chunk(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t chunks, int64_t dim=0) {
+        return at::_ops::unsafe_chunk::redispatch(dispatchKeySet, self, chunks, dim);
+    }
+    
+    // aten::chunk(Tensor(a -> *) self, int chunks, int dim=0) -> Tensor(a)[]
+    inline ::std::vector<at::Tensor> chunk(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t chunks, int64_t dim=0) {
+        return at::_ops::chunk::redispatch(dispatchKeySet, self, chunks, dim);
+    }
+    
+    // aten::tensor_split.sections(Tensor(a -> *) self, SymInt sections, int dim=0) -> Tensor(a)[]
+    inline ::std::vector<at::Tensor> tensor_split(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t sections, int64_t dim=0) {
+        return at::_ops::tensor_split_sections::redispatch(dispatchKeySet, self, sections, dim);
+    }
+    
+    // aten::tensor_split.sections(Tensor(a -> *) self, SymInt sections, int dim=0) -> Tensor(a)[]
+    inline ::std::vector<at::Tensor> tensor_split_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt sections, int64_t dim=0) {
+        return at::_ops::tensor_split_sections::redispatch(dispatchKeySet, self, sections, dim);
+    }
+    
+    // aten::tensor_split.indices(Tensor(a -> *) self, SymInt[] indices, int dim=0) -> Tensor(a)[]
+    inline ::std::vector<at::Tensor> tensor_split(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef indices, int64_t dim=0) {
+        return at::_ops::tensor_split_indices::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(indices), dim);
+    }
+    
+    // aten::tensor_split.indices(Tensor(a -> *) self, SymInt[] indices, int dim=0) -> Tensor(a)[]
+    inline ::std::vector<at::Tensor> tensor_split_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef indices, int64_t dim=0) {
+        return at::_ops::tensor_split_indices::redispatch(dispatchKeySet, self, indices, dim);
+    }
+    
+    // aten::tensor_split.tensor_indices_or_sections(Tensor(a -> *) self, Tensor tensor_indices_or_sections, int dim=0) -> Tensor(a)[]
+    inline ::std::vector<at::Tensor> tensor_split(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & tensor_indices_or_sections, int64_t dim=0) {
+        return at::_ops::tensor_split_tensor_indices_or_sections::redispatch(dispatchKeySet, self, tensor_indices_or_sections, dim);
+    }
+    
+    // aten::clamp(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor
+    inline at::Tensor clamp(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Scalar> & min, const c10::optional<at::Scalar> & max=c10::nullopt) {
+        return at::_ops::clamp::redispatch(dispatchKeySet, self, min, max);
+    }
+    
+    // aten::clamp.Tensor(Tensor self, Tensor? min=None, Tensor? max=None) -> Tensor
+    inline at::Tensor clamp(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Tensor> & min={}, const c10::optional<at::Tensor> & max={}) {
+        return at::_ops::clamp_Tensor::redispatch(dispatchKeySet, self, min, max);
+    }
+    
+    // aten::clamp_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!)
+    inline at::Tensor & clamp_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const c10::optional<at::Scalar> & min, const c10::optional<at::Scalar> & max=c10::nullopt) {
+        return at::_ops::clamp_::redispatch(dispatchKeySet, self, min, max);
+    }
+    
+    // aten::clamp_.Tensor(Tensor(a!) self, Tensor? min=None, Tensor? max=None) -> Tensor(a!)
+    inline at::Tensor & clamp_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const c10::optional<at::Tensor> & min={}, const c10::optional<at::Tensor> & max={}) {
+        return at::_ops::clamp__Tensor::redispatch(dispatchKeySet, self, min, max);
+    }
+    
+    // aten::clamp.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & clamp_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const c10::optional<at::Scalar> & min, const c10::optional<at::Scalar> & max=c10::nullopt) {
+        return at::_ops::clamp_out::redispatch(dispatchKeySet, self, min, max, out);
+    }
+    
+    // aten::clamp.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & clamp_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Scalar> & min, const c10::optional<at::Scalar> & max, at::Tensor & out) {
+        return at::_ops::clamp_out::redispatch(dispatchKeySet, self, min, max, out);
+    }
+    
+    // aten::clamp.Tensor_out(Tensor self, Tensor? min=None, Tensor? max=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & clamp_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const c10::optional<at::Tensor> & min={}, const c10::optional<at::Tensor> & max={}) {
+        return at::_ops::clamp_Tensor_out::redispatch(dispatchKeySet, self, min, max, out);
+    }
+    
+    // aten::clamp.Tensor_out(Tensor self, Tensor? min=None, Tensor? max=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & clamp_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Tensor> & min, const c10::optional<at::Tensor> & max, at::Tensor & out) {
+        return at::_ops::clamp_Tensor_out::redispatch(dispatchKeySet, self, min, max, out);
+    }
+    
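+    // Note: clamp (and clip further down) is exposed with two parameter lists: the base
+    // schema takes optional Scalar bounds, while the `.Tensor` overloads take optional
+    // Tensor bounds, which is why each entry appears with both c10::optional<at::Scalar>
+    // and c10::optional<at::Tensor> arguments.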
+    // aten::clamp_max(Tensor self, Scalar max) -> Tensor
+    inline at::Tensor clamp_max(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & max) {
+        return at::_ops::clamp_max::redispatch(dispatchKeySet, self, max);
+    }
+    
+    // aten::clamp_max.Tensor(Tensor self, Tensor max) -> Tensor
+    inline at::Tensor clamp_max(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & max) {
+        return at::_ops::clamp_max_Tensor::redispatch(dispatchKeySet, self, max);
+    }
+    
+    // aten::clamp_max_(Tensor(a!) self, Scalar max) -> Tensor(a!)
+    inline at::Tensor & clamp_max_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & max) {
+        return at::_ops::clamp_max_::redispatch(dispatchKeySet, self, max);
+    }
+    
+    // aten::clamp_max_.Tensor(Tensor(a!) self, Tensor max) -> Tensor(a!)
+    inline at::Tensor & clamp_max_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & max) {
+        return at::_ops::clamp_max__Tensor::redispatch(dispatchKeySet, self, max);
+    }
+    
+    // aten::clamp_max.out(Tensor self, Scalar max, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & clamp_max_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & max) {
+        return at::_ops::clamp_max_out::redispatch(dispatchKeySet, self, max, out);
+    }
+    
+    // aten::clamp_max.out(Tensor self, Scalar max, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & clamp_max_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & max, at::Tensor & out) {
+        return at::_ops::clamp_max_out::redispatch(dispatchKeySet, self, max, out);
+    }
+    
+    // aten::clamp_max.Tensor_out(Tensor self, Tensor max, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & clamp_max_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & max) {
+        return at::_ops::clamp_max_Tensor_out::redispatch(dispatchKeySet, self, max, out);
+    }
+    
+    // aten::clamp_max.Tensor_out(Tensor self, Tensor max, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & clamp_max_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & max, at::Tensor & out) {
+        return at::_ops::clamp_max_Tensor_out::redispatch(dispatchKeySet, self, max, out);
+    }
+    
+    // aten::clamp_min(Tensor self, Scalar min) -> Tensor
+    inline at::Tensor clamp_min(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & min) {
+        return at::_ops::clamp_min::redispatch(dispatchKeySet, self, min);
+    }
+    
+    // aten::clamp_min.Tensor(Tensor self, Tensor min) -> Tensor
+    inline at::Tensor clamp_min(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & min) {
+        return at::_ops::clamp_min_Tensor::redispatch(dispatchKeySet, self, min);
+    }
+    
+    // aten::clamp_min_(Tensor(a!) self, Scalar min) -> Tensor(a!)
+    inline at::Tensor & clamp_min_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & min) {
+        return at::_ops::clamp_min_::redispatch(dispatchKeySet, self, min);
+    }
+    
+    // aten::clamp_min_.Tensor(Tensor(a!) self, Tensor min) -> Tensor(a!)
+    inline at::Tensor & clamp_min_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & min) {
+        return at::_ops::clamp_min__Tensor::redispatch(dispatchKeySet, self, min);
+    }
+    
+    // aten::clamp_min.out(Tensor self, Scalar min, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & clamp_min_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & min) {
+        return at::_ops::clamp_min_out::redispatch(dispatchKeySet, self, min, out);
+    }
+    
+    // aten::clamp_min.out(Tensor self, Scalar min, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & clamp_min_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & min, at::Tensor & out) {
+        return at::_ops::clamp_min_out::redispatch(dispatchKeySet, self, min, out);
+    }
+    
+    // aten::clamp_min.Tensor_out(Tensor self, Tensor min, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & clamp_min_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & min) {
+        return at::_ops::clamp_min_Tensor_out::redispatch(dispatchKeySet, self, min, out);
+    }
+    
+    // aten::clamp_min.Tensor_out(Tensor self, Tensor min, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & clamp_min_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & min, at::Tensor & out) {
+        return at::_ops::clamp_min_Tensor_out::redispatch(dispatchKeySet, self, min, out);
+    }
+    
+    // aten::clip(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor
+    inline at::Tensor clip(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Scalar> & min, const c10::optional<at::Scalar> & max=c10::nullopt) {
+        return at::_ops::clip::redispatch(dispatchKeySet, self, min, max);
+    }
+    
+    // aten::clip.Tensor(Tensor self, Tensor? min=None, Tensor? max=None) -> Tensor
+    inline at::Tensor clip(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Tensor> & min={}, const c10::optional<at::Tensor> & max={}) {
+        return at::_ops::clip_Tensor::redispatch(dispatchKeySet, self, min, max);
+    }
+    
+    // aten::clip_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!)
+    inline at::Tensor & clip_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const c10::optional<at::Scalar> & min, const c10::optional<at::Scalar> & max=c10::nullopt) {
+        return at::_ops::clip_::redispatch(dispatchKeySet, self, min, max);
+    }
+    
+    // aten::clip_.Tensor(Tensor(a!) self, Tensor? min=None, Tensor? max=None) -> Tensor(a!)
+    inline at::Tensor & clip_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const c10::optional<at::Tensor> & min={}, const c10::optional<at::Tensor> & max={}) {
+        return at::_ops::clip__Tensor::redispatch(dispatchKeySet, self, min, max);
+    }
+    
+    // aten::clip.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & clip_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const c10::optional<at::Scalar> & min, const c10::optional<at::Scalar> & max=c10::nullopt) {
+        return at::_ops::clip_out::redispatch(dispatchKeySet, self, min, max, out);
+    }
+    
+    // aten::clip.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & clip_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Scalar> & min, const c10::optional<at::Scalar> & max, at::Tensor & out) {
+        return at::_ops::clip_out::redispatch(dispatchKeySet, self, min, max, out);
+    }
+    
+    // aten::clip.Tensor_out(Tensor self, Tensor? min=None, Tensor? max=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & clip_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const c10::optional<at::Tensor> & min={}, const c10::optional<at::Tensor> & max={}) {
+        return at::_ops::clip_Tensor_out::redispatch(dispatchKeySet, self, min, max, out);
+    }
+    
+    // aten::clip.Tensor_out(Tensor self, Tensor? min=None, Tensor? max=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & clip_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Tensor> & min, const c10::optional<at::Tensor> & max, at::Tensor & out) {
+        return at::_ops::clip_Tensor_out::redispatch(dispatchKeySet, self, min, max, out);
+    }
+    
+    // aten::cudnn_is_acceptable(Tensor self) -> bool
+    inline bool cudnn_is_acceptable(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::cudnn_is_acceptable::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::complex(Tensor real, Tensor imag) -> Tensor
+    inline at::Tensor complex(c10::DispatchKeySet dispatchKeySet, const at::Tensor & real, const at::Tensor & imag) {
+        return at::_ops::complex::redispatch(dispatchKeySet, real, imag);
+    }
+    
+    // aten::complex.out(Tensor real, Tensor imag, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & complex_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & real, const at::Tensor & imag) {
+        return at::_ops::complex_out::redispatch(dispatchKeySet, real, imag, out);
+    }
+    
+    // aten::complex.out(Tensor real, Tensor imag, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & complex_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & real, const at::Tensor & imag, at::Tensor & out) {
+        return at::_ops::complex_out::redispatch(dispatchKeySet, real, imag, out);
+    }
+    
+    // aten::polar(Tensor abs, Tensor angle) -> Tensor
+    inline at::Tensor polar(c10::DispatchKeySet dispatchKeySet, const at::Tensor & abs, const at::Tensor & angle) {
+        return at::_ops::polar::redispatch(dispatchKeySet, abs, angle);
+    }
+    
+    // aten::polar.out(Tensor abs, Tensor angle, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & polar_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & abs, const at::Tensor & angle) {
+        return at::_ops::polar_out::redispatch(dispatchKeySet, abs, angle, out);
+    }
+    
+    // aten::polar.out(Tensor abs, Tensor angle, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & polar_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & abs, const at::Tensor & angle, at::Tensor & out) {
+        return at::_ops::polar_out::redispatch(dispatchKeySet, abs, angle, out);
+    }
+    
+    // aten::constant_pad_nd(Tensor self, SymInt[] pad, Scalar value=0) -> Tensor
+    inline at::Tensor constant_pad_nd(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef pad, const at::Scalar & value=0) {
+        return at::_ops::constant_pad_nd::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(pad), value);
+    }
+    
+    // aten::constant_pad_nd(Tensor self, SymInt[] pad, Scalar value=0) -> Tensor
+    inline at::Tensor constant_pad_nd_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef pad, const at::Scalar & value=0) {
+        return at::_ops::constant_pad_nd::redispatch(dispatchKeySet, self, pad, value);
+    }
+    
+    // aten::contiguous(Tensor(a) self, *, MemoryFormat memory_format=contiguous_format) -> Tensor(a)
+    inline at::Tensor __dispatch_contiguous(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::MemoryFormat memory_format=MemoryFormat::Contiguous) {
+        return at::_ops::contiguous::redispatch(dispatchKeySet, self, memory_format);
+    }
+    
+    // aten::convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups) -> Tensor
+    inline at::Tensor convolution(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups) {
+        return at::_ops::convolution::redispatch(dispatchKeySet, input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups);
+    }
+    
+    // aten::convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups) -> Tensor
+    inline at::Tensor convolution_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups) {
+        return at::_ops::convolution::redispatch(dispatchKeySet, input, weight, bias, stride, padding, dilation, transposed, output_padding, groups);
+    }
+    
+    // aten::convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> convolution_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalIntArrayRef bias_sizes, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, ::std::array<bool,3> output_mask) {
+        return at::_ops::convolution_backward::redispatch(dispatchKeySet, grad_output, input, weight, bias_sizes.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*bias_sizes)) : c10::nullopt, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, output_mask);
+    }
+    
+    // aten::convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> convolution_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalSymIntArrayRef bias_sizes, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array<bool,3> output_mask) {
+        return at::_ops::convolution_backward::redispatch(dispatchKeySet, grad_output, input, weight, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, output_mask);
+    }
+    
+    // aten::convolution_overrideable(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups) -> Tensor
+    inline at::Tensor convolution_overrideable(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups) {
+        return at::_ops::convolution_overrideable::redispatch(dispatchKeySet, input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups);
+    }
+    
+    // aten::convolution_overrideable(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups) -> Tensor
+    inline at::Tensor convolution_overrideable_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups) {
+        return at::_ops::convolution_overrideable::redispatch(dispatchKeySet, input, weight, bias, stride, padding, dilation, transposed, output_padding, groups);
+    }
+    
+    // aten::convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> convolution_backward_overrideable(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, ::std::array<bool,3> output_mask) {
+        return at::_ops::convolution_backward_overrideable::redispatch(dispatchKeySet, grad_output, input, weight, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, output_mask);
+    }
+    
+    // aten::convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> convolution_backward_overrideable_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array<bool,3> output_mask) {
+        return at::_ops::convolution_backward_overrideable::redispatch(dispatchKeySet, grad_output, input, weight, stride, padding, dilation, transposed, output_padding, groups, output_mask);
+    }
+    
+    // aten::_convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor
+    inline at::Tensor _convolution(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) {
+        return at::_ops::_convolution::redispatch(dispatchKeySet, input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, benchmark, deterministic, cudnn_enabled, allow_tf32);
+    }
+    
+    // aten::_convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor
+    inline at::Tensor _convolution_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) {
+        return at::_ops::_convolution::redispatch(dispatchKeySet, input, weight, bias, stride, padding, dilation, transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled, allow_tf32);
+    }
+    
+    // aten::_convolution.deprecated(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, int[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled) -> Tensor
+    inline at::Tensor _convolution(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled) {
+        return at::_ops::_convolution_deprecated::redispatch(dispatchKeySet, input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled);
+    }
+    
+    // aten::_convolution.deprecated(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, int[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled) -> Tensor
+    inline at::Tensor _convolution_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, c10::SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled) {
+        return at::_ops::_convolution_deprecated::redispatch(dispatchKeySet, input, weight, bias, stride, padding, dilation, transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled);
+    }
+    
+    // aten::_convolution_mode(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, str padding, SymInt[] dilation, SymInt groups) -> Tensor
+    inline at::Tensor _convolution_mode(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, c10::string_view padding, at::IntArrayRef dilation, int64_t groups) {
+        return at::_ops::_convolution_mode::redispatch(dispatchKeySet, input, weight, bias, c10::fromIntArrayRefSlow(stride), padding, c10::fromIntArrayRefSlow(dilation), groups);
+    }
+    
+    // aten::_convolution_mode(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, str padding, SymInt[] dilation, SymInt groups) -> Tensor
+    inline at::Tensor _convolution_mode_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation, c10::SymInt groups) {
+        return at::_ops::_convolution_mode::redispatch(dispatchKeySet, input, weight, bias, stride, padding, dilation, groups);
+    }
+    
+    // aten::_convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? ggb, Tensor gO, Tensor weight, Tensor self, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _convolution_double_backward(c10::DispatchKeySet dispatchKeySet, const c10::optional<at::Tensor> & ggI, const c10::optional<at::Tensor> & ggW, const c10::optional<at::Tensor> & ggb, const at::Tensor & gO, const at::Tensor & weight, const at::Tensor & self, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, ::std::array<bool,3> output_mask) {
+        return at::_ops::_convolution_double_backward::redispatch(dispatchKeySet, ggI, ggW, ggb, gO, weight, self, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, output_mask);
+    }
+    
+    // aten::_convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? ggb, Tensor gO, Tensor weight, Tensor self, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _convolution_double_backward_symint(c10::DispatchKeySet dispatchKeySet, const c10::optional<at::Tensor> & ggI, const c10::optional<at::Tensor> & ggW, const c10::optional<at::Tensor> & ggb, const at::Tensor & gO, const at::Tensor & weight, const at::Tensor & self, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array<bool,3> output_mask) {
+        return at::_ops::_convolution_double_backward::redispatch(dispatchKeySet, ggI, ggW, ggb, gO, weight, self, stride, padding, dilation, transposed, output_padding, groups, output_mask);
+    }
+    
+    // aten::conv1d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[1] stride=1, SymInt[1] padding=0, SymInt[1] dilation=1, SymInt groups=1) -> Tensor
+    inline at::Tensor conv1d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, int64_t groups=1) {
+        return at::_ops::conv1d::redispatch(dispatchKeySet, input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), groups);
+    }
+    
+    // aten::conv1d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[1] stride=1, SymInt[1] padding=0, SymInt[1] dilation=1, SymInt groups=1) -> Tensor
+    inline at::Tensor conv1d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef dilation=c10::SymInt(1), c10::SymInt groups=1) {
+        return at::_ops::conv1d::redispatch(dispatchKeySet, input, weight, bias, stride, padding, dilation, groups);
+    }
+    
+    // aten::conv2d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] dilation=1, SymInt groups=1) -> Tensor
+    inline at::Tensor conv2d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, int64_t groups=1) {
+        return at::_ops::conv2d::redispatch(dispatchKeySet, input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), groups);
+    }
+    
+    // aten::conv2d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] dilation=1, SymInt groups=1) -> Tensor
+    inline at::Tensor conv2d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef dilation=c10::SymInt(1), c10::SymInt groups=1) {
+        return at::_ops::conv2d::redispatch(dispatchKeySet, input, weight, bias, stride, padding, dilation, groups);
+    }
+    
+    // aten::conv3d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] dilation=1, SymInt groups=1) -> Tensor
+    inline at::Tensor conv3d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, int64_t groups=1) {
+        return at::_ops::conv3d::redispatch(dispatchKeySet, input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), groups);
+    }
+    
+    // aten::conv3d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] dilation=1, SymInt groups=1) -> Tensor
+    inline at::Tensor conv3d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef dilation=c10::SymInt(1), c10::SymInt groups=1) {
+        return at::_ops::conv3d::redispatch(dispatchKeySet, input, weight, bias, stride, padding, dilation, groups);
+    }
+    
+    // aten::conv1d.padding(Tensor input, Tensor weight, Tensor? bias=None, SymInt[1] stride=1, str padding="valid", SymInt[1] dilation=1, SymInt groups=1) -> Tensor
+    inline at::Tensor conv1d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, c10::string_view padding, at::IntArrayRef dilation=1, int64_t groups=1) {
+        return at::_ops::conv1d_padding::redispatch(dispatchKeySet, input, weight, bias, c10::fromIntArrayRefSlow(stride), padding, c10::fromIntArrayRefSlow(dilation), groups);
+    }
+    
+    // aten::conv1d.padding(Tensor input, Tensor weight, Tensor? bias=None, SymInt[1] stride=1, str padding="valid", SymInt[1] dilation=1, SymInt groups=1) -> Tensor
+    inline at::Tensor conv1d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation=c10::SymInt(1), c10::SymInt groups=1) {
+        return at::_ops::conv1d_padding::redispatch(dispatchKeySet, input, weight, bias, stride, padding, dilation, groups);
+    }
+    
+    // aten::conv2d.padding(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, str padding="valid", SymInt[2] dilation=1, SymInt groups=1) -> Tensor
+    inline at::Tensor conv2d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, c10::string_view padding, at::IntArrayRef dilation=1, int64_t groups=1) {
+        return at::_ops::conv2d_padding::redispatch(dispatchKeySet, input, weight, bias, c10::fromIntArrayRefSlow(stride), padding, c10::fromIntArrayRefSlow(dilation), groups);
+    }
+    
+    // aten::conv2d.padding(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, str padding="valid", SymInt[2] dilation=1, SymInt groups=1) -> Tensor
+    inline at::Tensor conv2d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation=c10::SymInt(1), c10::SymInt groups=1) {
+        return at::_ops::conv2d_padding::redispatch(dispatchKeySet, input, weight, bias, stride, padding, dilation, groups);
+    }
+    
+    // aten::conv3d.padding(Tensor input, Tensor weight, Tensor? bias=None, SymInt[3] stride=1, str padding="valid", SymInt[3] dilation=1, SymInt groups=1) -> Tensor
+    inline at::Tensor conv3d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, c10::string_view padding, at::IntArrayRef dilation=1, int64_t groups=1) {
+        return at::_ops::conv3d_padding::redispatch(dispatchKeySet, input, weight, bias, c10::fromIntArrayRefSlow(stride), padding, c10::fromIntArrayRefSlow(dilation), groups);
+    }
+    
+    // aten::conv3d.padding(Tensor input, Tensor weight, Tensor? bias=None, SymInt[3] stride=1, str padding="valid", SymInt[3] dilation=1, SymInt groups=1) -> Tensor
+    inline at::Tensor conv3d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation=c10::SymInt(1), c10::SymInt groups=1) {
+        return at::_ops::conv3d_padding::redispatch(dispatchKeySet, input, weight, bias, stride, padding, dilation, groups);
+    }
+    
+    // aten::conv_tbc(Tensor self, Tensor weight, Tensor bias, int pad=0) -> Tensor
+    inline at::Tensor conv_tbc(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const at::Tensor & bias, int64_t pad=0) {
+        return at::_ops::conv_tbc::redispatch(dispatchKeySet, self, weight, bias, pad);
+    }
+    
+    // aten::conv_tbc_backward(Tensor self, Tensor input, Tensor weight, Tensor bias, int pad) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> conv_tbc_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & input, const at::Tensor & weight, const at::Tensor & bias, int64_t pad) {
+        return at::_ops::conv_tbc_backward::redispatch(dispatchKeySet, self, input, weight, bias, pad);
+    }
+    
+    // aten::conv_transpose1d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[1] stride=1, SymInt[1] padding=0, SymInt[1] output_padding=0, SymInt groups=1, SymInt[1] dilation=1) -> Tensor
+    inline at::Tensor conv_transpose1d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef output_padding=0, int64_t groups=1, at::IntArrayRef dilation=1) {
+        return at::_ops::conv_transpose1d::redispatch(dispatchKeySet, input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), groups, c10::fromIntArrayRefSlow(dilation));
+    }
+    
+    // aten::conv_transpose1d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[1] stride=1, SymInt[1] padding=0, SymInt[1] output_padding=0, SymInt groups=1, SymInt[1] dilation=1) -> Tensor
+    inline at::Tensor conv_transpose1d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef output_padding=c10::SymInt(0), c10::SymInt groups=1, c10::SymIntArrayRef dilation=c10::SymInt(1)) {
+        return at::_ops::conv_transpose1d::redispatch(dispatchKeySet, input, weight, bias, stride, padding, output_padding, groups, dilation);
+    }
+    
+    // aten::conv_transpose2d.input(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt groups=1, SymInt[2] dilation=1) -> Tensor
+    inline at::Tensor conv_transpose2d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef output_padding=0, int64_t groups=1, at::IntArrayRef dilation=1) {
+        return at::_ops::conv_transpose2d_input::redispatch(dispatchKeySet, input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), groups, c10::fromIntArrayRefSlow(dilation));
+    }
+    
+    // aten::conv_transpose2d.input(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt groups=1, SymInt[2] dilation=1) -> Tensor
+    inline at::Tensor conv_transpose2d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef output_padding=c10::SymInt(0), c10::SymInt groups=1, c10::SymIntArrayRef dilation=c10::SymInt(1)) {
+        return at::_ops::conv_transpose2d_input::redispatch(dispatchKeySet, input, weight, bias, stride, padding, output_padding, groups, dilation);
+    }
+    
+    // aten::conv_transpose3d.input(Tensor input, Tensor weight, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, SymInt groups=1, SymInt[3] dilation=1) -> Tensor
+    inline at::Tensor conv_transpose3d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef output_padding=0, int64_t groups=1, at::IntArrayRef dilation=1) {
+        return at::_ops::conv_transpose3d_input::redispatch(dispatchKeySet, input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), groups, c10::fromIntArrayRefSlow(dilation));
+    }
+    
+    // aten::conv_transpose3d.input(Tensor input, Tensor weight, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, SymInt groups=1, SymInt[3] dilation=1) -> Tensor
+    inline at::Tensor conv_transpose3d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef output_padding=c10::SymInt(0), c10::SymInt groups=1, c10::SymIntArrayRef dilation=c10::SymInt(1)) {
+        return at::_ops::conv_transpose3d_input::redispatch(dispatchKeySet, input, weight, bias, stride, padding, output_padding, groups, dilation);
+    }
+    
+    // aten::copy(Tensor self, Tensor src, bool non_blocking=False) -> Tensor
+    inline at::Tensor copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & src, bool non_blocking=false) {
+        return at::_ops::copy::redispatch(dispatchKeySet, self, src, non_blocking);
+    }
+    
+    // aten::copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)
+    inline at::Tensor & copy_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & src, bool non_blocking=false) {
+        return at::_ops::copy_::redispatch(dispatchKeySet, self, src, non_blocking);
+    }
+    
+    // aten::_copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor
+    inline at::Tensor _copy_from(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & dst, bool non_blocking=false) {
+        return at::_ops::_copy_from::redispatch(dispatchKeySet, self, dst, non_blocking);
+    }
+    
+    // aten::_copy_from_and_resize(Tensor self, Tensor dst) -> Tensor
+    inline at::Tensor _copy_from_and_resize(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & dst) {
+        return at::_ops::_copy_from_and_resize::redispatch(dispatchKeySet, self, dst);
+    }
+    
+    // aten::cos(Tensor self) -> Tensor
+    inline at::Tensor cos(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::cos::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::cos_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & cos_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::cos_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::cos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cos_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::cos_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::cos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cos_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::cos_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::cosh(Tensor self) -> Tensor
+    inline at::Tensor cosh(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::cosh::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::cosh_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & cosh_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::cosh_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::cosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cosh_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::cosh_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::cosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cosh_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::cosh_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor
+    inline at::Tensor cosine_embedding_loss(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input1, const at::Tensor & input2, const at::Tensor & target, double margin=0.0, int64_t reduction=at::Reduction::Mean) {
+        return at::_ops::cosine_embedding_loss::redispatch(dispatchKeySet, input1, input2, target, margin, reduction);
+    }
+    
+    // aten::count_nonzero.dim_IntList(Tensor self, int[] dim) -> Tensor
+    inline at::Tensor count_nonzero(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim) {
+        return at::_ops::count_nonzero_dim_IntList::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::count_nonzero(Tensor self, int? dim=None) -> Tensor
+    inline at::Tensor count_nonzero(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<int64_t> dim=c10::nullopt) {
+        return at::_ops::count_nonzero::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::cov(Tensor self, *, int correction=1, Tensor? fweights=None, Tensor? aweights=None) -> Tensor
+    inline at::Tensor cov(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t correction=1, const c10::optional<at::Tensor> & fweights={}, const c10::optional<at::Tensor> & aweights={}) {
+        return at::_ops::cov::redispatch(dispatchKeySet, self, correction, fweights, aweights);
+    }
+    
+    // aten::corrcoef(Tensor self) -> Tensor
+    inline at::Tensor corrcoef(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::corrcoef::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::cudnn_affine_grid_generator(Tensor theta, int N, int C, int H, int W) -> Tensor grid
+    inline at::Tensor cudnn_affine_grid_generator(c10::DispatchKeySet dispatchKeySet, const at::Tensor & theta, int64_t N, int64_t C, int64_t H, int64_t W) {
+        return at::_ops::cudnn_affine_grid_generator::redispatch(dispatchKeySet, theta, N, C, H, W);
+    }
+    
+    // aten::cudnn_affine_grid_generator_backward(Tensor grad, int N, int C, int H, int W) -> Tensor grad_theta
+    inline at::Tensor cudnn_affine_grid_generator_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, int64_t N, int64_t C, int64_t H, int64_t W) {
+        return at::_ops::cudnn_affine_grid_generator_backward::redispatch(dispatchKeySet, grad, N, C, H, W);
+    }
+    
+    // aten::cudnn_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor> cudnn_batch_norm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, bool training, double exponential_average_factor, double epsilon) {
+        return at::_ops::cudnn_batch_norm::redispatch(dispatchKeySet, input, weight, bias, running_mean, running_var, training, exponential_average_factor, epsilon);
+    }
+    
+    // aten::cudnn_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, float epsilon, Tensor reserveSpace) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> cudnn_batch_norm_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & grad_output, const at::Tensor & weight, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, const c10::optional<at::Tensor> & save_mean, const c10::optional<at::Tensor> & save_var, double epsilon, const at::Tensor & reserveSpace) {
+        return at::_ops::cudnn_batch_norm_backward::redispatch(dispatchKeySet, input, grad_output, weight, running_mean, running_var, save_mean, save_var, epsilon, reserveSpace);
+    }
+    
+    // aten::cudnn_convolution(Tensor self, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor
+    inline at::Tensor cudnn_convolution(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic, bool allow_tf32) {
+        return at::_ops::cudnn_convolution::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, benchmark, deterministic, allow_tf32);
+    }
+    
+    // aten::cudnn_convolution(Tensor self, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor
+    inline at::Tensor cudnn_convolution_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) {
+        return at::_ops::cudnn_convolution::redispatch(dispatchKeySet, self, weight, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32);
+    }
+    
+    // aten::cudnn_convolution.out(Tensor self, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cudnn_convolution_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic, bool allow_tf32) {
+        return at::_ops::cudnn_convolution_out::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, benchmark, deterministic, allow_tf32, out);
+    }
+    
+    // aten::cudnn_convolution.out(Tensor self, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cudnn_convolution_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic, bool allow_tf32, at::Tensor & out) {
+        return at::_ops::cudnn_convolution_out::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, benchmark, deterministic, allow_tf32, out);
+    }
+    
+    // aten::cudnn_convolution.out(Tensor self, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cudnn_convolution_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) {
+        return at::_ops::cudnn_convolution_out::redispatch(dispatchKeySet, self, weight, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32, out);
+    }
+    
+    // aten::cudnn_convolution.out(Tensor self, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cudnn_convolution_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, bool allow_tf32, at::Tensor & out) {
+        return at::_ops::cudnn_convolution_out::redispatch(dispatchKeySet, self, weight, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32, out);
+    }
+    
+    // aten::cudnn_convolution_transpose(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor
+    inline at::Tensor cudnn_convolution_transpose(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic, bool allow_tf32) {
+        return at::_ops::cudnn_convolution_transpose::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, benchmark, deterministic, allow_tf32);
+    }
+    
+    // aten::cudnn_convolution_transpose(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor
+    inline at::Tensor cudnn_convolution_transpose_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) {
+        return at::_ops::cudnn_convolution_transpose::redispatch(dispatchKeySet, self, weight, padding, output_padding, stride, dilation, groups, benchmark, deterministic, allow_tf32);
+    }
+    
+    // aten::_mps_convolution_transpose(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor
+    inline at::Tensor _mps_convolution_transpose(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups) {
+        return at::_ops::_mps_convolution_transpose::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups);
+    }
+    
+    // aten::_mps_convolution_transpose(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor
+    inline at::Tensor _mps_convolution_transpose_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups) {
+        return at::_ops::_mps_convolution_transpose::redispatch(dispatchKeySet, self, weight, padding, output_padding, stride, dilation, groups);
+    }
+    
+    // aten::mps_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool[2] output_mask) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> mps_convolution_transpose_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, ::std::array<bool,2> output_mask) {
+        return at::_ops::mps_convolution_transpose_backward::redispatch(dispatchKeySet, self, grad_output, weight, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, output_mask);
+    }
+    
+    // aten::mps_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool[2] output_mask) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> mps_convolution_transpose_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, ::std::array<bool,2> output_mask) {
+        return at::_ops::mps_convolution_transpose_backward::redispatch(dispatchKeySet, self, grad_output, weight, padding, output_padding, stride, dilation, groups, output_mask);
+    }
+    
+    // aten::cudnn_convolution_relu(Tensor self, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor
+    inline at::Tensor cudnn_convolution_relu(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups) {
+        return at::_ops::cudnn_convolution_relu::redispatch(dispatchKeySet, self, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), groups);
+    }
+    
+    // aten::cudnn_convolution_relu(Tensor self, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor
+    inline at::Tensor cudnn_convolution_relu_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups) {
+        return at::_ops::cudnn_convolution_relu::redispatch(dispatchKeySet, self, weight, bias, stride, padding, dilation, groups);
+    }
+    
+    // aten::cudnn_convolution_add_relu(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor
+    inline at::Tensor cudnn_convolution_add_relu(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups) {
+        return at::_ops::cudnn_convolution_add_relu::redispatch(dispatchKeySet, self, weight, z, alpha, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), groups);
+    }
+    
+    // aten::cudnn_convolution_add_relu(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor
+    inline at::Tensor cudnn_convolution_add_relu_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups) {
+        return at::_ops::cudnn_convolution_add_relu::redispatch(dispatchKeySet, self, weight, z, alpha, bias, stride, padding, dilation, groups);
+    }
+    
+    // aten::cudnn_grid_sampler(Tensor self, Tensor grid) -> Tensor output
+    inline at::Tensor cudnn_grid_sampler(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & grid) {
+        return at::_ops::cudnn_grid_sampler::redispatch(dispatchKeySet, self, grid);
+    }
+    
+    // aten::cudnn_grid_sampler_backward(Tensor self, Tensor grid, Tensor grad_output) -> (Tensor grad_self, Tensor grad_grid)
+    inline ::std::tuple<at::Tensor,at::Tensor> cudnn_grid_sampler_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & grid, const at::Tensor & grad_output) {
+        return at::_ops::cudnn_grid_sampler_backward::redispatch(dispatchKeySet, self, grid, grad_output);
+    }
+    
+    // aten::cummax(Tensor self, int dim) -> (Tensor values, Tensor indices)
+    inline ::std::tuple<at::Tensor,at::Tensor> cummax(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim) {
+        return at::_ops::cummax::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::cummax.out(Tensor self, int dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> cummax_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & values, at::Tensor & indices, const at::Tensor & self, int64_t dim) {
+        return at::_ops::cummax_out::redispatch(dispatchKeySet, self, dim, values, indices);
+    }
+    
+    // aten::cummax.out(Tensor self, int dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> cummax_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, at::Tensor & values, at::Tensor & indices) {
+        return at::_ops::cummax_out::redispatch(dispatchKeySet, self, dim, values, indices);
+    }
+    
+    // aten::cummax.dimname(Tensor self, Dimname dim) -> (Tensor values, Tensor indices)
+    inline ::std::tuple<at::Tensor,at::Tensor> cummax(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim) {
+        return at::_ops::cummax_dimname::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::cummax.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> cummax_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & values, at::Tensor & indices, const at::Tensor & self, at::Dimname dim) {
+        return at::_ops::cummax_dimname_out::redispatch(dispatchKeySet, self, dim, values, indices);
+    }
+    
+    // aten::cummax.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> cummax_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, at::Tensor & values, at::Tensor & indices) {
+        return at::_ops::cummax_dimname_out::redispatch(dispatchKeySet, self, dim, values, indices);
+    }
+    
+    // aten::_cummax_helper(Tensor self, Tensor(a!) values, Tensor(b!) indices, int dim) -> ()
+    inline void _cummax_helper(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & values, at::Tensor & indices, int64_t dim) {
+        return at::_ops::_cummax_helper::redispatch(dispatchKeySet, self, values, indices, dim);
+    }
+    
+    // aten::cummin(Tensor self, int dim) -> (Tensor values, Tensor indices)
+    inline ::std::tuple<at::Tensor,at::Tensor> cummin(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim) {
+        return at::_ops::cummin::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::cummin.out(Tensor self, int dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> cummin_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & values, at::Tensor & indices, const at::Tensor & self, int64_t dim) {
+        return at::_ops::cummin_out::redispatch(dispatchKeySet, self, dim, values, indices);
+    }
+    
+    // aten::cummin.out(Tensor self, int dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> cummin_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, at::Tensor & values, at::Tensor & indices) {
+        return at::_ops::cummin_out::redispatch(dispatchKeySet, self, dim, values, indices);
+    }
+    
+    // aten::cummin.dimname(Tensor self, Dimname dim) -> (Tensor values, Tensor indices)
+    inline ::std::tuple<at::Tensor,at::Tensor> cummin(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim) {
+        return at::_ops::cummin_dimname::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::cummin.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> cummin_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & values, at::Tensor & indices, const at::Tensor & self, at::Dimname dim) {
+        return at::_ops::cummin_dimname_out::redispatch(dispatchKeySet, self, dim, values, indices);
+    }
+    
+    // aten::cummin.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> cummin_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, at::Tensor & values, at::Tensor & indices) {
+        return at::_ops::cummin_dimname_out::redispatch(dispatchKeySet, self, dim, values, indices);
+    }
+    
+    // aten::_cummin_helper(Tensor self, Tensor(a!) values, Tensor(b!) indices, int dim) -> ()
+    inline void _cummin_helper(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & values, at::Tensor & indices, int64_t dim) {
+        return at::_ops::_cummin_helper::redispatch(dispatchKeySet, self, values, indices, dim);
+    }
+    
+    // aten::cummaxmin_backward(Tensor grad, Tensor input, Tensor indices, int dim) -> Tensor
+    inline at::Tensor cummaxmin_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & input, const at::Tensor & indices, int64_t dim) {
+        return at::_ops::cummaxmin_backward::redispatch(dispatchKeySet, grad, input, indices, dim);
+    }
+    
+    // aten::cumprod(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor cumprod(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::cumprod::redispatch(dispatchKeySet, self, dim, dtype);
+    }
+    
+    // aten::cumprod_(Tensor(a!) self, int dim, *, ScalarType? dtype=None) -> Tensor(a!)
+    inline at::Tensor & cumprod_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t dim, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::cumprod_::redispatch(dispatchKeySet, self, dim, dtype);
+    }
+    
+    // aten::cumprod.out(Tensor self, int dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cumprod_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::cumprod_out::redispatch(dispatchKeySet, self, dim, dtype, out);
+    }
+    
+    // aten::cumprod.out(Tensor self, int dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cumprod_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, c10::optional<at::ScalarType> dtype, at::Tensor & out) {
+        return at::_ops::cumprod_out::redispatch(dispatchKeySet, self, dim, dtype, out);
+    }
+    
+    // aten::cumprod.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor cumprod(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::cumprod_dimname::redispatch(dispatchKeySet, self, dim, dtype);
+    }
+    
+    // aten::cumprod_.dimname(Tensor(a!) self, Dimname dim, *, ScalarType? dtype=None) -> Tensor(a!)
+    inline at::Tensor & cumprod_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, at::Dimname dim, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::cumprod__dimname::redispatch(dispatchKeySet, self, dim, dtype);
+    }
+    
+    // aten::cumprod.dimname_out(Tensor self, Dimname dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cumprod_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::Dimname dim, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::cumprod_dimname_out::redispatch(dispatchKeySet, self, dim, dtype, out);
+    }
+    
+    // aten::cumprod.dimname_out(Tensor self, Dimname dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cumprod_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, c10::optional<at::ScalarType> dtype, at::Tensor & out) {
+        return at::_ops::cumprod_dimname_out::redispatch(dispatchKeySet, self, dim, dtype, out);
+    }
+    
+    // aten::cumprod_backward(Tensor grad, Tensor input, int dim, Tensor output) -> Tensor
+    inline at::Tensor cumprod_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & input, int64_t dim, const at::Tensor & output) {
+        return at::_ops::cumprod_backward::redispatch(dispatchKeySet, grad, input, dim, output);
+    }
+    
+    // aten::cumsum(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor cumsum(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::cumsum::redispatch(dispatchKeySet, self, dim, dtype);
+    }
+    
+    // aten::cumsum_(Tensor(a!) self, int dim, *, ScalarType? dtype=None) -> Tensor(a!)
+    inline at::Tensor & cumsum_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t dim, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::cumsum_::redispatch(dispatchKeySet, self, dim, dtype);
+    }
+    
+    // aten::cumsum.out(Tensor self, int dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cumsum_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::cumsum_out::redispatch(dispatchKeySet, self, dim, dtype, out);
+    }
+    
+    // aten::cumsum.out(Tensor self, int dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cumsum_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, c10::optional<at::ScalarType> dtype, at::Tensor & out) {
+        return at::_ops::cumsum_out::redispatch(dispatchKeySet, self, dim, dtype, out);
+    }
+    
+    // aten::cumsum.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor cumsum(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::cumsum_dimname::redispatch(dispatchKeySet, self, dim, dtype);
+    }
+    
+    // aten::cumsum_.dimname(Tensor(a!) self, Dimname dim, *, ScalarType? dtype=None) -> Tensor(a!)
+    inline at::Tensor & cumsum_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, at::Dimname dim, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::cumsum__dimname::redispatch(dispatchKeySet, self, dim, dtype);
+    }
+    
+    // aten::cumsum.dimname_out(Tensor self, Dimname dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cumsum_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::Dimname dim, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::cumsum_dimname_out::redispatch(dispatchKeySet, self, dim, dtype, out);
+    }
+    
+    // aten::cumsum.dimname_out(Tensor self, Dimname dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cumsum_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, c10::optional<at::ScalarType> dtype, at::Tensor & out) {
+        return at::_ops::cumsum_dimname_out::redispatch(dispatchKeySet, self, dim, dtype, out);
+    }
+    
+    // aten::cumulative_trapezoid.x(Tensor y, Tensor x, *, int dim=-1) -> Tensor
+    inline at::Tensor cumulative_trapezoid(c10::DispatchKeySet dispatchKeySet, const at::Tensor & y, const at::Tensor & x, int64_t dim=-1) {
+        return at::_ops::cumulative_trapezoid_x::redispatch(dispatchKeySet, y, x, dim);
+    }
+    
+    // aten::cumulative_trapezoid.dx(Tensor y, *, Scalar dx=1, int dim=-1) -> Tensor
+    inline at::Tensor cumulative_trapezoid(c10::DispatchKeySet dispatchKeySet, const at::Tensor & y, const at::Scalar & dx=1, int64_t dim=-1) {
+        return at::_ops::cumulative_trapezoid_dx::redispatch(dispatchKeySet, y, dx, dim);
+    }
+    
+    // aten::ctc_loss.IntList(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, int reduction=Mean, bool zero_infinity=False) -> Tensor
+    inline at::Tensor ctc_loss(c10::DispatchKeySet dispatchKeySet, const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank=0, int64_t reduction=at::Reduction::Mean, bool zero_infinity=false) {
+        return at::_ops::ctc_loss_IntList::redispatch(dispatchKeySet, log_probs, targets, input_lengths, target_lengths, blank, reduction, zero_infinity);
+    }
+    
+    // aten::ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank=0, int reduction=Mean, bool zero_infinity=False) -> Tensor
+    inline at::Tensor ctc_loss(c10::DispatchKeySet dispatchKeySet, const at::Tensor & log_probs, const at::Tensor & targets, const at::Tensor & input_lengths, const at::Tensor & target_lengths, int64_t blank=0, int64_t reduction=at::Reduction::Mean, bool zero_infinity=false) {
+        return at::_ops::ctc_loss_Tensor::redispatch(dispatchKeySet, log_probs, targets, input_lengths, target_lengths, blank, reduction, zero_infinity);
+    }
+    
+    // aten::_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> _ctc_loss(c10::DispatchKeySet dispatchKeySet, const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank=0, bool zero_infinity=false) {
+        return at::_ops::_ctc_loss::redispatch(dispatchKeySet, log_probs, targets, input_lengths, target_lengths, blank, zero_infinity);
+    }
+    
+    // aten::_ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> _ctc_loss(c10::DispatchKeySet dispatchKeySet, const at::Tensor & log_probs, const at::Tensor & targets, const at::Tensor & input_lengths, const at::Tensor & target_lengths, int64_t blank=0, bool zero_infinity=false) {
+        return at::_ops::_ctc_loss_Tensor::redispatch(dispatchKeySet, log_probs, targets, input_lengths, target_lengths, blank, zero_infinity);
+    }
+    
+    // aten::_ctc_loss_backward(Tensor grad, Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, bool zero_infinity=False) -> Tensor
+    inline at::Tensor _ctc_loss_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, const at::Tensor & neg_log_likelihood, const at::Tensor & log_alpha, int64_t blank, bool zero_infinity=false) {
+        return at::_ops::_ctc_loss_backward::redispatch(dispatchKeySet, grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, blank, zero_infinity);
+    }
+    
+    // aten::_ctc_loss_backward.Tensor(Tensor grad, Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, bool zero_infinity=False) -> Tensor
+    inline at::Tensor _ctc_loss_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & log_probs, const at::Tensor & targets, const at::Tensor & input_lengths, const at::Tensor & target_lengths, const at::Tensor & neg_log_likelihood, const at::Tensor & log_alpha, int64_t blank, bool zero_infinity=false) {
+        return at::_ops::_ctc_loss_backward_Tensor::redispatch(dispatchKeySet, grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, blank, zero_infinity);
+    }
+    
+    // aten::diag_embed(Tensor self, int offset=0, int dim1=-2, int dim2=-1) -> Tensor
+    inline at::Tensor diag_embed(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t offset=0, int64_t dim1=-2, int64_t dim2=-1) {
+        return at::_ops::diag_embed::redispatch(dispatchKeySet, self, offset, dim1, dim2);
+    }
+    
+    // aten::diagflat(Tensor self, int offset=0) -> Tensor
+    inline at::Tensor diagflat(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t offset=0) {
+        return at::_ops::diagflat::redispatch(dispatchKeySet, self, offset);
+    }
+    
+    // aten::diagonal(Tensor(a) self, int offset=0, int dim1=0, int dim2=1) -> Tensor(a)
+    inline at::Tensor diagonal(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t offset=0, int64_t dim1=0, int64_t dim2=1) {
+        return at::_ops::diagonal::redispatch(dispatchKeySet, self, offset, dim1, dim2);
+    }
+    
+    // aten::linalg_diagonal(Tensor(a) A, *, int offset=0, int dim1=-2, int dim2=-1) -> Tensor(a)
+    inline at::Tensor linalg_diagonal(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, int64_t offset=0, int64_t dim1=-2, int64_t dim2=-1) {
+        return at::_ops::linalg_diagonal::redispatch(dispatchKeySet, A, offset, dim1, dim2);
+    }
+    
+    // aten::diagonal.Dimname(Tensor(a) self, *, Dimname outdim, Dimname dim1, Dimname dim2, int offset=0) -> Tensor(a)
+    inline at::Tensor diagonal(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname outdim, at::Dimname dim1, at::Dimname dim2, int64_t offset=0) {
+        return at::_ops::diagonal_Dimname::redispatch(dispatchKeySet, self, outdim, dim1, dim2, offset);
+    }
+    
+    // aten::diagonal_backward(Tensor grad_output, SymInt[] input_sizes, int offset, int dim1, int dim2) -> Tensor
+    inline at::Tensor diagonal_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, at::IntArrayRef input_sizes, int64_t offset, int64_t dim1, int64_t dim2) {
+        return at::_ops::diagonal_backward::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(input_sizes), offset, dim1, dim2);
+    }
+    
+    // aten::diagonal_backward(Tensor grad_output, SymInt[] input_sizes, int offset, int dim1, int dim2) -> Tensor
+    inline at::Tensor diagonal_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef input_sizes, int64_t offset, int64_t dim1, int64_t dim2) {
+        return at::_ops::diagonal_backward::redispatch(dispatchKeySet, grad_output, input_sizes, offset, dim1, dim2);
+    }
+    
+    // aten::fill_diagonal_(Tensor(a!) self, Scalar fill_value, bool wrap=False) -> Tensor(a!)
+    inline at::Tensor & fill_diagonal_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & fill_value, bool wrap=false) {
+        return at::_ops::fill_diagonal_::redispatch(dispatchKeySet, self, fill_value, wrap);
+    }
+    
+    // aten::diff(Tensor self, int n=1, int dim=-1, Tensor? prepend=None, Tensor? append=None) -> Tensor
+    inline at::Tensor diff(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t n=1, int64_t dim=-1, const c10::optional<at::Tensor> & prepend={}, const c10::optional<at::Tensor> & append={}) {
+        return at::_ops::diff::redispatch(dispatchKeySet, self, n, dim, prepend, append);
+    }
+    
+    // aten::diff.out(Tensor self, int n=1, int dim=-1, Tensor? prepend=None, Tensor? append=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & diff_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t n=1, int64_t dim=-1, const c10::optional<at::Tensor> & prepend={}, const c10::optional<at::Tensor> & append={}) {
+        return at::_ops::diff_out::redispatch(dispatchKeySet, self, n, dim, prepend, append, out);
+    }
+    
+    // aten::diff.out(Tensor self, int n=1, int dim=-1, Tensor? prepend=None, Tensor? append=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & diff_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t n, int64_t dim, const c10::optional<at::Tensor> & prepend, const c10::optional<at::Tensor> & append, at::Tensor & out) {
+        return at::_ops::diff_out::redispatch(dispatchKeySet, self, n, dim, prepend, append, out);
+    }
+    
+    // aten::gradient.scalarint(Tensor self, *, Scalar? spacing=None, int? dim=None, int edge_order=1) -> Tensor[]
+    inline ::std::vector<at::Tensor> gradient(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Scalar> & spacing=c10::nullopt, c10::optional<int64_t> dim=c10::nullopt, int64_t edge_order=1) {
+        return at::_ops::gradient_scalarint::redispatch(dispatchKeySet, self, spacing, dim, edge_order);
+    }
+    
+    // aten::gradient.scalararray(Tensor self, *, Scalar spacing, int[] dim, int edge_order=1) -> Tensor[]
+    inline ::std::vector<at::Tensor> gradient(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & spacing, at::IntArrayRef dim, int64_t edge_order=1) {
+        return at::_ops::gradient_scalararray::redispatch(dispatchKeySet, self, spacing, dim, edge_order);
+    }
+    
+    // aten::gradient.array(Tensor self, *, int[] dim, int edge_order=1) -> Tensor[]
+    inline ::std::vector<at::Tensor> gradient(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, int64_t edge_order=1) {
+        return at::_ops::gradient_array::redispatch(dispatchKeySet, self, dim, edge_order);
+    }
+    
+    // aten::gradient.scalarrayint(Tensor self, *, Scalar[] spacing, int? dim=None, int edge_order=1) -> Tensor[]
+    inline ::std::vector<at::Tensor> gradient(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::ArrayRef<at::Scalar> spacing, c10::optional<int64_t> dim=c10::nullopt, int64_t edge_order=1) {
+        return at::_ops::gradient_scalarrayint::redispatch(dispatchKeySet, self, spacing, dim, edge_order);
+    }
+    
+    // aten::gradient.scalarrayarray(Tensor self, *, Scalar[] spacing, int[] dim, int edge_order=1) -> Tensor[]
+    inline ::std::vector<at::Tensor> gradient(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::ArrayRef<at::Scalar> spacing, at::IntArrayRef dim, int64_t edge_order=1) {
+        return at::_ops::gradient_scalarrayarray::redispatch(dispatchKeySet, self, spacing, dim, edge_order);
+    }
+    
+    // aten::gradient.tensorarrayint(Tensor self, *, Tensor[] spacing, int? dim=None, int edge_order=1) -> Tensor[]
+    inline ::std::vector<at::Tensor> gradient(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::TensorList spacing, c10::optional<int64_t> dim=c10::nullopt, int64_t edge_order=1) {
+        return at::_ops::gradient_tensorarrayint::redispatch(dispatchKeySet, self, spacing, dim, edge_order);
+    }
+    
+    // aten::gradient.tensorarray(Tensor self, *, Tensor[] spacing, int[] dim, int edge_order=1) -> Tensor[]
+    inline ::std::vector<at::Tensor> gradient(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::TensorList spacing, at::IntArrayRef dim, int64_t edge_order=1) {
+        return at::_ops::gradient_tensorarray::redispatch(dispatchKeySet, self, spacing, dim, edge_order);
+    }
+    
+    // aten::div.Tensor(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor div(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::div_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::div_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & div_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::div__Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::div.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & div_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::div_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::div.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & div_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::div_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::div.Tensor_mode(Tensor self, Tensor other, *, str? rounding_mode) -> Tensor
+    inline at::Tensor div(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, c10::optional<c10::string_view> rounding_mode) {
+        return at::_ops::div_Tensor_mode::redispatch(dispatchKeySet, self, other, rounding_mode);
+    }
+    
+    // aten::div_.Tensor_mode(Tensor(a!) self, Tensor other, *, str? rounding_mode) -> Tensor(a!)
+    inline at::Tensor & div_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other, c10::optional<c10::string_view> rounding_mode) {
+        return at::_ops::div__Tensor_mode::redispatch(dispatchKeySet, self, other, rounding_mode);
+    }
+    
+    // aten::div.out_mode(Tensor self, Tensor other, *, str? rounding_mode, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & div_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other, c10::optional<c10::string_view> rounding_mode) {
+        return at::_ops::div_out_mode::redispatch(dispatchKeySet, self, other, rounding_mode, out);
+    }
+    
+    // aten::div.out_mode(Tensor self, Tensor other, *, str? rounding_mode, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & div_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, c10::optional<c10::string_view> rounding_mode, at::Tensor & out) {
+        return at::_ops::div_out_mode::redispatch(dispatchKeySet, self, other, rounding_mode, out);
+    }
+    
+    // aten::div.Scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor div(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::div_Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::div_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+    inline at::Tensor & div_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::div__Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::div.Scalar_mode(Tensor self, Scalar other, *, str? rounding_mode) -> Tensor
+    inline at::Tensor div(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, c10::optional<c10::string_view> rounding_mode) {
+        return at::_ops::div_Scalar_mode::redispatch(dispatchKeySet, self, other, rounding_mode);
+    }
+    
+    // aten::div_.Scalar_mode(Tensor(a!) self, Scalar other, *, str? rounding_mode) -> Tensor(a!)
+    inline at::Tensor & div_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other, c10::optional<c10::string_view> rounding_mode) {
+        return at::_ops::div__Scalar_mode::redispatch(dispatchKeySet, self, other, rounding_mode);
+    }
+    
+    // aten::divide.Tensor(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor divide(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::divide_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & divide_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::divide__Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & divide_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::divide_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & divide_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::divide_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::divide.Scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor divide(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::divide_Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+    inline at::Tensor & divide_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::divide__Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::divide.Tensor_mode(Tensor self, Tensor other, *, str? rounding_mode) -> Tensor
+    inline at::Tensor divide(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, c10::optional<c10::string_view> rounding_mode) {
+        return at::_ops::divide_Tensor_mode::redispatch(dispatchKeySet, self, other, rounding_mode);
+    }
+    
+    // aten::divide_.Tensor_mode(Tensor(a!) self, Tensor other, *, str? rounding_mode) -> Tensor(a!)
+    inline at::Tensor & divide_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other, c10::optional<c10::string_view> rounding_mode) {
+        return at::_ops::divide__Tensor_mode::redispatch(dispatchKeySet, self, other, rounding_mode);
+    }
+    
+    // aten::divide.out_mode(Tensor self, Tensor other, *, str? rounding_mode, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & divide_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other, c10::optional<c10::string_view> rounding_mode) {
+        return at::_ops::divide_out_mode::redispatch(dispatchKeySet, self, other, rounding_mode, out);
+    }
+    
+    // aten::divide.out_mode(Tensor self, Tensor other, *, str? rounding_mode, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & divide_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, c10::optional<c10::string_view> rounding_mode, at::Tensor & out) {
+        return at::_ops::divide_out_mode::redispatch(dispatchKeySet, self, other, rounding_mode, out);
+    }
+    
+    // aten::divide.Scalar_mode(Tensor self, Scalar other, *, str? rounding_mode) -> Tensor
+    inline at::Tensor divide(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, c10::optional<c10::string_view> rounding_mode) {
+        return at::_ops::divide_Scalar_mode::redispatch(dispatchKeySet, self, other, rounding_mode);
+    }
+    
+    // aten::divide_.Scalar_mode(Tensor(a!) self, Scalar other, *, str? rounding_mode) -> Tensor(a!)
+    inline at::Tensor & divide_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other, c10::optional<c10::string_view> rounding_mode) {
+        return at::_ops::divide__Scalar_mode::redispatch(dispatchKeySet, self, other, rounding_mode);
+    }
+    
+    // aten::true_divide.Tensor(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor true_divide(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::true_divide_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::true_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & true_divide_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::true_divide__Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::true_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & true_divide_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::true_divide_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::true_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & true_divide_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::true_divide_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::true_divide.Scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor true_divide(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::true_divide_Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::true_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+    inline at::Tensor & true_divide_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::true_divide__Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::dot(Tensor self, Tensor tensor) -> Tensor
+    inline at::Tensor dot(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & tensor) {
+        return at::_ops::dot::redispatch(dispatchKeySet, self, tensor);
+    }
+    
+    // aten::dot.out(Tensor self, Tensor tensor, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & dot_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & tensor) {
+        return at::_ops::dot_out::redispatch(dispatchKeySet, self, tensor, out);
+    }
+    
+    // aten::dot.out(Tensor self, Tensor tensor, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & dot_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & tensor, at::Tensor & out) {
+        return at::_ops::dot_out::redispatch(dispatchKeySet, self, tensor, out);
+    }
+    
+    // aten::vdot(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor vdot(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::vdot::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::vdot.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & vdot_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::vdot_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::vdot.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & vdot_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::vdot_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::einsum(str equation, Tensor[] tensors, *, int[]? path=None) -> Tensor
+    inline at::Tensor einsum(c10::DispatchKeySet dispatchKeySet, c10::string_view equation, at::TensorList tensors, at::OptionalIntArrayRef path=c10::nullopt) {
+        return at::_ops::einsum::redispatch(dispatchKeySet, equation, tensors, path);
+    }
+    
+    // aten::embedding(Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor
+    inline at::Tensor embedding(c10::DispatchKeySet dispatchKeySet, const at::Tensor & weight, const at::Tensor & indices, int64_t padding_idx=-1, bool scale_grad_by_freq=false, bool sparse=false) {
+        return at::_ops::embedding::redispatch(dispatchKeySet, weight, indices, padding_idx, scale_grad_by_freq, sparse);
+    }
+    
+    // aten::embedding(Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor
+    inline at::Tensor embedding_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & weight, const at::Tensor & indices, c10::SymInt padding_idx=-1, bool scale_grad_by_freq=false, bool sparse=false) {
+        return at::_ops::embedding::redispatch(dispatchKeySet, weight, indices, padding_idx, scale_grad_by_freq, sparse);
+    }
+    
+    // aten::embedding_backward(Tensor grad, Tensor indices, SymInt num_weights, SymInt padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor
+    inline at::Tensor embedding_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq, bool sparse) {
+        return at::_ops::embedding_backward::redispatch(dispatchKeySet, grad, indices, num_weights, padding_idx, scale_grad_by_freq, sparse);
+    }
+    
+    // aten::embedding_backward(Tensor grad, Tensor indices, SymInt num_weights, SymInt padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor
+    inline at::Tensor embedding_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & indices, c10::SymInt num_weights, c10::SymInt padding_idx, bool scale_grad_by_freq, bool sparse) {
+        return at::_ops::embedding_backward::redispatch(dispatchKeySet, grad, indices, num_weights, padding_idx, scale_grad_by_freq, sparse);
+    }
+    
+    // aten::embedding_dense_backward(Tensor grad_output, Tensor indices, SymInt num_weights, SymInt padding_idx, bool scale_grad_by_freq) -> Tensor
+    inline at::Tensor embedding_dense_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq) {
+        return at::_ops::embedding_dense_backward::redispatch(dispatchKeySet, grad_output, indices, num_weights, padding_idx, scale_grad_by_freq);
+    }
+    
+    // aten::embedding_dense_backward(Tensor grad_output, Tensor indices, SymInt num_weights, SymInt padding_idx, bool scale_grad_by_freq) -> Tensor
+    inline at::Tensor embedding_dense_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & indices, c10::SymInt num_weights, c10::SymInt padding_idx, bool scale_grad_by_freq) {
+        return at::_ops::embedding_dense_backward::redispatch(dispatchKeySet, grad_output, indices, num_weights, padding_idx, scale_grad_by_freq);
+    }
+    
+    // aten::embedding_renorm_(Tensor(a!) self, Tensor indices, float max_norm, float norm_type) -> Tensor(a!)
+    inline at::Tensor & embedding_renorm_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & indices, double max_norm, double norm_type) {
+        return at::_ops::embedding_renorm_::redispatch(dispatchKeySet, self, indices, max_norm, norm_type);
+    }
+    
+    // aten::embedding_sparse_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq) -> Tensor
+    inline at::Tensor embedding_sparse_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq) {
+        return at::_ops::embedding_sparse_backward::redispatch(dispatchKeySet, grad, indices, num_weights, padding_idx, scale_grad_by_freq);
+    }
+    
+    // aten::_embedding_bag_forward_only(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False, int padding_idx=-1) -> (Tensor, Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor> _embedding_bag_forward_only(c10::DispatchKeySet dispatchKeySet, const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, bool scale_grad_by_freq=false, int64_t mode=0, bool sparse=false, const c10::optional<at::Tensor> & per_sample_weights={}, bool include_last_offset=false, int64_t padding_idx=-1) {
+        return at::_ops::_embedding_bag_forward_only::redispatch(dispatchKeySet, weight, indices, offsets, scale_grad_by_freq, mode, sparse, per_sample_weights, include_last_offset, padding_idx);
+    }
+    
+    // aten::_rowwise_prune(Tensor weight, Tensor mask, ScalarType compressed_indices_dtype) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> _rowwise_prune(c10::DispatchKeySet dispatchKeySet, const at::Tensor & weight, const at::Tensor & mask, at::ScalarType compressed_indices_dtype) {
+        return at::_ops::_rowwise_prune::redispatch(dispatchKeySet, weight, mask, compressed_indices_dtype);
+    }
+    
+    // aten::row_stack(Tensor[] tensors) -> Tensor
+    inline at::Tensor row_stack(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors) {
+        return at::_ops::row_stack::redispatch(dispatchKeySet, tensors);
+    }
+    
+    // aten::row_stack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & row_stack_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::TensorList tensors) {
+        return at::_ops::row_stack_out::redispatch(dispatchKeySet, tensors, out);
+    }
+    
+    // aten::row_stack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & row_stack_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, at::Tensor & out) {
+        return at::_ops::row_stack_out::redispatch(dispatchKeySet, tensors, out);
+    }
+    
+    // aten::embedding_bag(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False) -> (Tensor, Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor> embedding_bag(c10::DispatchKeySet dispatchKeySet, const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, bool scale_grad_by_freq=false, int64_t mode=0, bool sparse=false, const c10::optional<at::Tensor> & per_sample_weights={}, bool include_last_offset=false) {
+        return at::_ops::embedding_bag::redispatch(dispatchKeySet, weight, indices, offsets, scale_grad_by_freq, mode, sparse, per_sample_weights, include_last_offset);
+    }
+    
+    // aten::embedding_bag.padding_idx(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq, int mode, bool sparse, Tensor? per_sample_weights, bool include_last_offset, int? padding_idx) -> (Tensor, Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor> embedding_bag(c10::DispatchKeySet dispatchKeySet, const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, bool scale_grad_by_freq, int64_t mode, bool sparse, const c10::optional<at::Tensor> & per_sample_weights, bool include_last_offset, c10::optional<int64_t> padding_idx) {
+        return at::_ops::embedding_bag_padding_idx::redispatch(dispatchKeySet, weight, indices, offsets, scale_grad_by_freq, mode, sparse, per_sample_weights, include_last_offset, padding_idx);
+    }
+    
+    // aten::_embedding_bag(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False, int padding_idx=-1) -> (Tensor, Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor> _embedding_bag(c10::DispatchKeySet dispatchKeySet, const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, bool scale_grad_by_freq=false, int64_t mode=0, bool sparse=false, const c10::optional<at::Tensor> & per_sample_weights={}, bool include_last_offset=false, int64_t padding_idx=-1) {
+        return at::_ops::_embedding_bag::redispatch(dispatchKeySet, weight, indices, offsets, scale_grad_by_freq, mode, sparse, per_sample_weights, include_last_offset, padding_idx);
+    }
+    
+    // aten::_embedding_bag_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, bool sparse, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor
+    inline at::Tensor _embedding_bag_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, bool sparse, const c10::optional<at::Tensor> & per_sample_weights, int64_t padding_idx=-1) {
+        return at::_ops::_embedding_bag_backward::redispatch(dispatchKeySet, grad, indices, offsets, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, sparse, per_sample_weights, padding_idx);
+    }
+    
+    // aten::_embedding_bag_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, bool sparse, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor
+    inline at::Tensor _embedding_bag_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, bool sparse, const c10::optional<at::Tensor> & per_sample_weights, int64_t padding_idx=-1) {
+        return at::_ops::_embedding_bag_backward::redispatch(dispatchKeySet, grad, indices, offsets, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, sparse, per_sample_weights, padding_idx);
+    }
+    
+    // aten::_embedding_bag_sparse_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor
+    inline at::Tensor _embedding_bag_sparse_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, const at::Tensor & bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const c10::optional<at::Tensor> & per_sample_weights, int64_t padding_idx=-1) {
+        return at::_ops::_embedding_bag_sparse_backward::redispatch(dispatchKeySet, grad, indices, offsets, offset2bag, bag_size, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx);
+    }
+    
+    // aten::_embedding_bag_sparse_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor
+    inline at::Tensor _embedding_bag_sparse_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, const at::Tensor & bag_size, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const c10::optional<at::Tensor> & per_sample_weights, int64_t padding_idx=-1) {
+        return at::_ops::_embedding_bag_sparse_backward::redispatch(dispatchKeySet, grad, indices, offsets, offset2bag, bag_size, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx);
+    }
+    
+    // aten::_embedding_bag_dense_backward(Tensor grad, Tensor indices, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor
+    inline at::Tensor _embedding_bag_dense_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const c10::optional<at::Tensor> & per_sample_weights, int64_t padding_idx=-1) {
+        return at::_ops::_embedding_bag_dense_backward::redispatch(dispatchKeySet, grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx);
+    }
+    
+    // aten::_embedding_bag_dense_backward(Tensor grad, Tensor indices, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor
+    inline at::Tensor _embedding_bag_dense_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const c10::optional<at::Tensor> & per_sample_weights, int64_t padding_idx=-1) {
+        return at::_ops::_embedding_bag_dense_backward::redispatch(dispatchKeySet, grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx);
+    }
+    
+    // aten::_embedding_bag_per_sample_weights_backward(Tensor grad, Tensor weight, Tensor indices, Tensor offsets, Tensor offset2bag, int mode, int padding_idx=-1) -> Tensor
+    inline at::Tensor _embedding_bag_per_sample_weights_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, int64_t mode, int64_t padding_idx=-1) {
+        return at::_ops::_embedding_bag_per_sample_weights_backward::redispatch(dispatchKeySet, grad, weight, indices, offsets, offset2bag, mode, padding_idx);
+    }
+    
+    // aten::empty.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor empty(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::DimnameList> names, at::TensorOptions options={}, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::empty_names::redispatch(dispatchKeySet, size, names, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
+    }
+    
+    // aten::empty.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor empty(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::DimnameList> names, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
+        return at::_ops::empty_names::redispatch(dispatchKeySet, size, names, dtype, layout, device, pin_memory, memory_format);
+    }
+    
+    // aten::empty.memory_format(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor empty(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, at::TensorOptions options={}, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::empty_memory_format::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
+    }
+    
+    // aten::empty.memory_format(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor empty(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
+        return at::_ops::empty_memory_format::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), dtype, layout, device, pin_memory, memory_format);
+    }
+    
+    // aten::empty.memory_format(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor empty_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, at::TensorOptions options={}, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::empty_memory_format::redispatch(dispatchKeySet, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
+    }
+    
+    // aten::empty.memory_format(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor empty_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
+        return at::_ops::empty_memory_format::redispatch(dispatchKeySet, size, dtype, layout, device, pin_memory, memory_format);
+    }
+    
+    // aten::empty_permuted(SymInt[] size, int[] physical_layout, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor empty_permuted(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, at::IntArrayRef physical_layout, at::TensorOptions options={}) {
+        return at::_ops::empty_permuted::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), physical_layout, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::empty_permuted(SymInt[] size, int[] physical_layout, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor empty_permuted(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, at::IntArrayRef physical_layout, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::empty_permuted::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), physical_layout, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::empty_permuted(SymInt[] size, int[] physical_layout, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor empty_permuted_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, at::IntArrayRef physical_layout, at::TensorOptions options={}) {
+        return at::_ops::empty_permuted::redispatch(dispatchKeySet, size, physical_layout, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::empty_permuted(SymInt[] size, int[] physical_layout, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor empty_permuted_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, at::IntArrayRef physical_layout, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::empty_permuted::redispatch(dispatchKeySet, size, physical_layout, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::new_empty(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor new_empty(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, at::TensorOptions options={}) {
+        return at::_ops::new_empty::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::new_empty(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor new_empty(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::new_empty::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), dtype, layout, device, pin_memory);
+    }
+    
+    // aten::new_empty(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor new_empty_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, at::TensorOptions options={}) {
+        return at::_ops::new_empty::redispatch(dispatchKeySet, self, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::new_empty(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor new_empty_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::new_empty::redispatch(dispatchKeySet, self, size, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::new_empty_strided(Tensor self, SymInt[] size, SymInt[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor new_empty_strided(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride, at::TensorOptions options={}) {
+        return at::_ops::new_empty_strided::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::new_empty_strided(Tensor self, SymInt[] size, SymInt[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor new_empty_strided(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::new_empty_strided::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), dtype, layout, device, pin_memory);
+    }
+    
+    // aten::new_empty_strided(Tensor self, SymInt[] size, SymInt[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor new_empty_strided_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, at::TensorOptions options={}) {
+        return at::_ops::new_empty_strided::redispatch(dispatchKeySet, self, size, stride, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::new_empty_strided(Tensor self, SymInt[] size, SymInt[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor new_empty_strided_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::new_empty_strided::redispatch(dispatchKeySet, self, size, stride, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::new_full(Tensor self, SymInt[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor new_full(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, const at::Scalar & fill_value, at::TensorOptions options={}) {
+        return at::_ops::new_full::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), fill_value, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::new_full(Tensor self, SymInt[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor new_full(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, const at::Scalar & fill_value, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::new_full::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), fill_value, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::new_full(Tensor self, SymInt[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor new_full_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, const at::Scalar & fill_value, at::TensorOptions options={}) {
+        return at::_ops::new_full::redispatch(dispatchKeySet, self, size, fill_value, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::new_full(Tensor self, SymInt[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor new_full_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, const at::Scalar & fill_value, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::new_full::redispatch(dispatchKeySet, self, size, fill_value, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::new_zeros(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor new_zeros(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, at::TensorOptions options={}) {
+        return at::_ops::new_zeros::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::new_zeros(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor new_zeros(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::new_zeros::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), dtype, layout, device, pin_memory);
+    }
+    
+    // aten::new_zeros(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor new_zeros_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, at::TensorOptions options={}) {
+        return at::_ops::new_zeros::redispatch(dispatchKeySet, self, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::new_zeros(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor new_zeros_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::new_zeros::redispatch(dispatchKeySet, self, size, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::new_ones(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor new_ones(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, at::TensorOptions options={}) {
+        return at::_ops::new_ones::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::new_ones(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor new_ones(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::new_ones::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), dtype, layout, device, pin_memory);
+    }
+    
+    // aten::new_ones(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor new_ones_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, at::TensorOptions options={}) {
+        return at::_ops::new_ones::redispatch(dispatchKeySet, self, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::new_ones(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor new_ones_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::new_ones::redispatch(dispatchKeySet, self, size, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::_empty_affine_quantized(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, float scale=1, int zero_point=0, MemoryFormat? memory_format=contiguous_format) -> Tensor
+    inline at::Tensor _empty_affine_quantized(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, at::TensorOptions options={}, double scale=1, int64_t zero_point=0, c10::optional<at::MemoryFormat> memory_format=MemoryFormat::Contiguous) {
+        return at::_ops::_empty_affine_quantized::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), scale, zero_point, c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
+    }
+    
+    // aten::_empty_affine_quantized(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, float scale=1, int zero_point=0, MemoryFormat? memory_format=contiguous_format) -> Tensor
+    inline at::Tensor _empty_affine_quantized(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, double scale, int64_t zero_point, c10::optional<at::MemoryFormat> memory_format) {
+        return at::_ops::_empty_affine_quantized::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), dtype, layout, device, pin_memory, scale, zero_point, memory_format);
+    }
+    
+    // aten::_empty_affine_quantized(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, float scale=1, int zero_point=0, MemoryFormat? memory_format=contiguous_format) -> Tensor
+    inline at::Tensor _empty_affine_quantized_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, at::TensorOptions options={}, double scale=1, int64_t zero_point=0, c10::optional<at::MemoryFormat> memory_format=MemoryFormat::Contiguous) {
+        return at::_ops::_empty_affine_quantized::redispatch(dispatchKeySet, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), scale, zero_point, c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
+    }
+    
+    // aten::_empty_affine_quantized(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, float scale=1, int zero_point=0, MemoryFormat? memory_format=contiguous_format) -> Tensor
+    inline at::Tensor _empty_affine_quantized_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, double scale, int64_t zero_point, c10::optional<at::MemoryFormat> memory_format) {
+        return at::_ops::_empty_affine_quantized::redispatch(dispatchKeySet, size, dtype, layout, device, pin_memory, scale, zero_point, memory_format);
+    }
+    
+    // aten::_empty_per_channel_affine_quantized(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=contiguous_format) -> Tensor
+    inline at::Tensor _empty_per_channel_affine_quantized(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, at::TensorOptions options={}, c10::optional<at::MemoryFormat> memory_format=MemoryFormat::Contiguous) {
+        return at::_ops::_empty_per_channel_affine_quantized::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), scales, zero_points, axis, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
+    }
+    
+    // aten::_empty_per_channel_affine_quantized(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=contiguous_format) -> Tensor
+    inline at::Tensor _empty_per_channel_affine_quantized(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
+        return at::_ops::_empty_per_channel_affine_quantized::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), scales, zero_points, axis, dtype, layout, device, pin_memory, memory_format);
+    }
+    
+    // aten::_empty_per_channel_affine_quantized(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=contiguous_format) -> Tensor
+    inline at::Tensor _empty_per_channel_affine_quantized_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, at::TensorOptions options={}, c10::optional<at::MemoryFormat> memory_format=MemoryFormat::Contiguous) {
+        return at::_ops::_empty_per_channel_affine_quantized::redispatch(dispatchKeySet, size, scales, zero_points, axis, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
+    }
+    
+    // aten::_empty_per_channel_affine_quantized(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=contiguous_format) -> Tensor
+    inline at::Tensor _empty_per_channel_affine_quantized_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
+        return at::_ops::_empty_per_channel_affine_quantized::redispatch(dispatchKeySet, size, scales, zero_points, axis, dtype, layout, device, pin_memory, memory_format);
+    }
+    
+    // aten::resize_(Tensor(a!) self, SymInt[] size, *, MemoryFormat? memory_format=None) -> Tensor(a!)
+    inline const at::Tensor & resize_(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::resize_::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), memory_format);
+    }
+    
+    // aten::resize_(Tensor(a!) self, SymInt[] size, *, MemoryFormat? memory_format=None) -> Tensor(a!)
+    inline const at::Tensor & resize__symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::resize_::redispatch(dispatchKeySet, self, size, memory_format);
+    }
+    
+    // aten::_resize_output_(Tensor(a!) self, SymInt[] size, Device device) -> Tensor(a!)
+    inline const at::Tensor & _resize_output_(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, at::Device device) {
+        return at::_ops::_resize_output_::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), device);
+    }
+    
+    // aten::_resize_output_(Tensor(a!) self, SymInt[] size, Device device) -> Tensor(a!)
+    inline const at::Tensor & _resize_output__symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, at::Device device) {
+        return at::_ops::_resize_output_::redispatch(dispatchKeySet, self, size, device);
+    }
+    
+    // aten::empty_quantized(int[] size, Tensor qtensor, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor empty_quantized(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, const at::Tensor & qtensor, at::TensorOptions options={}, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::empty_quantized::redispatch(dispatchKeySet, size, qtensor, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
+    }
+    
+    // aten::empty_quantized(int[] size, Tensor qtensor, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor empty_quantized(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, const at::Tensor & qtensor, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
+        return at::_ops::empty_quantized::redispatch(dispatchKeySet, size, qtensor, dtype, layout, device, pin_memory, memory_format);
+    }
+    
+    // aten::empty.out(SymInt[] size, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & empty_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::IntArrayRef size, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::empty_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), memory_format, out);
+    }
+    
+    // aten::empty.out(SymInt[] size, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & empty_outf(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::MemoryFormat> memory_format, at::Tensor & out) {
+        return at::_ops::empty_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), memory_format, out);
+    }
+    
+    // aten::empty.out(SymInt[] size, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & empty_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, c10::SymIntArrayRef size, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::empty_out::redispatch(dispatchKeySet, size, memory_format, out);
+    }
+    
+    // aten::empty.out(SymInt[] size, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & empty_symint_outf(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::optional<at::MemoryFormat> memory_format, at::Tensor & out) {
+        return at::_ops::empty_out::redispatch(dispatchKeySet, size, memory_format, out);
+    }
+    
+    // aten::empty_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor empty_like(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::TensorOptions options={}, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::empty_like::redispatch(dispatchKeySet, self, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
+    }
+    
+    // aten::empty_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor empty_like(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
+        return at::_ops::empty_like::redispatch(dispatchKeySet, self, dtype, layout, device, pin_memory, memory_format);
+    }
+    
+    // aten::empty_strided(SymInt[] size, SymInt[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor empty_strided(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, at::IntArrayRef stride, at::TensorOptions options={}) {
+        return at::_ops::empty_strided::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::empty_strided(SymInt[] size, SymInt[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor empty_strided(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, at::IntArrayRef stride, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::empty_strided::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), dtype, layout, device, pin_memory);
+    }
+    
+    // aten::empty_strided(SymInt[] size, SymInt[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor empty_strided_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, at::TensorOptions options={}) {
+        return at::_ops::empty_strided::redispatch(dispatchKeySet, size, stride, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::empty_strided(SymInt[] size, SymInt[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor empty_strided_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::empty_strided::redispatch(dispatchKeySet, size, stride, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::erf(Tensor self) -> Tensor
+    inline at::Tensor erf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::erf::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::erf_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & erf_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::erf_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::erf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & erf_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::erf_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::erf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & erf_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::erf_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::erfc(Tensor self) -> Tensor
+    inline at::Tensor erfc(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::erfc::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::erfc_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & erfc_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::erfc_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::erfc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & erfc_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::erfc_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::erfc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & erfc_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::erfc_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::exp(Tensor self) -> Tensor
+    inline at::Tensor exp(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::exp::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::exp_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & exp_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::exp_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::exp.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & exp_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::exp_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::exp.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & exp_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::exp_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::exp2(Tensor self) -> Tensor
+    inline at::Tensor exp2(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::exp2::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::exp2_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & exp2_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::exp2_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::exp2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & exp2_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::exp2_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::exp2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & exp2_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::exp2_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::expm1(Tensor self) -> Tensor
+    inline at::Tensor expm1(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::expm1::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::expm1_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & expm1_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::expm1_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::expm1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & expm1_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::expm1_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::expm1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & expm1_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::expm1_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::expand(Tensor(a) self, SymInt[] size, *, bool implicit=False) -> Tensor(a)
+    inline at::Tensor expand(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, bool implicit=false) {
+        return at::_ops::expand::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), implicit);
+    }
+    
+    // aten::expand(Tensor(a) self, SymInt[] size, *, bool implicit=False) -> Tensor(a)
+    inline at::Tensor expand_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, bool implicit=false) {
+        return at::_ops::expand::redispatch(dispatchKeySet, self, size, implicit);
+    }
+    
+    // aten::expand_as(Tensor(a) self, Tensor other) -> Tensor(a)
+    inline at::Tensor expand_as(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::expand_as::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::eye(SymInt n, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor eye(c10::DispatchKeySet dispatchKeySet, int64_t n, at::TensorOptions options={}) {
+        return at::_ops::eye::redispatch(dispatchKeySet, n, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::eye(SymInt n, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor eye(c10::DispatchKeySet dispatchKeySet, int64_t n, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::eye::redispatch(dispatchKeySet, n, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::eye(SymInt n, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor eye_symint(c10::DispatchKeySet dispatchKeySet, c10::SymInt n, at::TensorOptions options={}) {
+        return at::_ops::eye::redispatch(dispatchKeySet, n, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::eye(SymInt n, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor eye_symint(c10::DispatchKeySet dispatchKeySet, c10::SymInt n, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::eye::redispatch(dispatchKeySet, n, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::eye.m(SymInt n, SymInt m, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor eye(c10::DispatchKeySet dispatchKeySet, int64_t n, int64_t m, at::TensorOptions options={}) {
+        return at::_ops::eye_m::redispatch(dispatchKeySet, n, m, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::eye.m(SymInt n, SymInt m, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor eye(c10::DispatchKeySet dispatchKeySet, int64_t n, int64_t m, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::eye_m::redispatch(dispatchKeySet, n, m, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::eye.m(SymInt n, SymInt m, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor eye_symint(c10::DispatchKeySet dispatchKeySet, c10::SymInt n, c10::SymInt m, at::TensorOptions options={}) {
+        return at::_ops::eye_m::redispatch(dispatchKeySet, n, m, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::eye.m(SymInt n, SymInt m, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor eye_symint(c10::DispatchKeySet dispatchKeySet, c10::SymInt n, c10::SymInt m, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::eye_m::redispatch(dispatchKeySet, n, m, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::eye.out(SymInt n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & eye_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, int64_t n) {
+        return at::_ops::eye_out::redispatch(dispatchKeySet, n, out);
+    }
+    
+    // aten::eye.out(SymInt n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & eye_outf(c10::DispatchKeySet dispatchKeySet, int64_t n, at::Tensor & out) {
+        return at::_ops::eye_out::redispatch(dispatchKeySet, n, out);
+    }
+    
+    // aten::eye.out(SymInt n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & eye_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, c10::SymInt n) {
+        return at::_ops::eye_out::redispatch(dispatchKeySet, n, out);
+    }
+    
+    // aten::eye.out(SymInt n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & eye_symint_outf(c10::DispatchKeySet dispatchKeySet, c10::SymInt n, at::Tensor & out) {
+        return at::_ops::eye_out::redispatch(dispatchKeySet, n, out);
+    }
+    
+    // aten::eye.m_out(SymInt n, SymInt m, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & eye_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, int64_t n, int64_t m) {
+        return at::_ops::eye_m_out::redispatch(dispatchKeySet, n, m, out);
+    }
+    
+    // aten::eye.m_out(SymInt n, SymInt m, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & eye_outf(c10::DispatchKeySet dispatchKeySet, int64_t n, int64_t m, at::Tensor & out) {
+        return at::_ops::eye_m_out::redispatch(dispatchKeySet, n, m, out);
+    }
+    
+    // aten::eye.m_out(SymInt n, SymInt m, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & eye_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, c10::SymInt n, c10::SymInt m) {
+        return at::_ops::eye_m_out::redispatch(dispatchKeySet, n, m, out);
+    }
+    
+    // aten::eye.m_out(SymInt n, SymInt m, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & eye_symint_outf(c10::DispatchKeySet dispatchKeySet, c10::SymInt n, c10::SymInt m, at::Tensor & out) {
+        return at::_ops::eye_m_out::redispatch(dispatchKeySet, n, m, out);
+    }
+    
+    // aten::flatten.using_ints(Tensor(a) self, int start_dim=0, int end_dim=-1) -> Tensor(a)
+    inline at::Tensor flatten(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t start_dim=0, int64_t end_dim=-1) {
+        return at::_ops::flatten_using_ints::redispatch(dispatchKeySet, self, start_dim, end_dim);
+    }
+    
+    // aten::flatten.named_out_dim(Tensor(a) self, int start_dim, int end_dim, Dimname out_dim) -> Tensor(a)
+    inline at::Tensor flatten(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t start_dim, int64_t end_dim, at::Dimname out_dim) {
+        return at::_ops::flatten_named_out_dim::redispatch(dispatchKeySet, self, start_dim, end_dim, out_dim);
+    }
+    
+    // aten::flatten.using_names(Tensor(a) self, Dimname start_dim, Dimname end_dim, Dimname out_dim) -> Tensor(a)
+    inline at::Tensor flatten(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname start_dim, at::Dimname end_dim, at::Dimname out_dim) {
+        return at::_ops::flatten_using_names::redispatch(dispatchKeySet, self, start_dim, end_dim, out_dim);
+    }
+    
+    // aten::flatten.DimnameList(Tensor(a) self, Dimname[] dims, Dimname out_dim) -> Tensor(a)
+    inline at::Tensor flatten(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::DimnameList dims, at::Dimname out_dim) {
+        return at::_ops::flatten_DimnameList::redispatch(dispatchKeySet, self, dims, out_dim);
+    }
+    
+    // aten::unflatten.int(Tensor(a) self, int dim, SymInt[] sizes) -> Tensor(a)
+    inline at::Tensor unflatten(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, at::IntArrayRef sizes) {
+        return at::_ops::unflatten_int::redispatch(dispatchKeySet, self, dim, c10::fromIntArrayRefSlow(sizes));
+    }
+    
+    // aten::unflatten.int(Tensor(a) self, int dim, SymInt[] sizes) -> Tensor(a)
+    inline at::Tensor unflatten_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, c10::SymIntArrayRef sizes) {
+        return at::_ops::unflatten_int::redispatch(dispatchKeySet, self, dim, sizes);
+    }
+    
+    // aten::unflatten.Dimname(Tensor(a) self, Dimname dim, SymInt[] sizes, Dimname[] names) -> Tensor(a)
+    inline at::Tensor unflatten(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, at::IntArrayRef sizes, at::DimnameList names) {
+        return at::_ops::unflatten_Dimname::redispatch(dispatchKeySet, self, dim, c10::fromIntArrayRefSlow(sizes), names);
+    }
+    
+    // aten::unflatten.Dimname(Tensor(a) self, Dimname dim, SymInt[] sizes, Dimname[] names) -> Tensor(a)
+    inline at::Tensor unflatten_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, c10::SymIntArrayRef sizes, at::DimnameList names) {
+        return at::_ops::unflatten_Dimname::redispatch(dispatchKeySet, self, dim, sizes, names);
+    }
+    
+    // aten::fill.Scalar(Tensor self, Scalar value) -> Tensor
+    inline at::Tensor fill(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & value) {
+        return at::_ops::fill_Scalar::redispatch(dispatchKeySet, self, value);
+    }
+    
+    // aten::fill.Tensor(Tensor self, Tensor value) -> Tensor
+    inline at::Tensor fill(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & value) {
+        return at::_ops::fill_Tensor::redispatch(dispatchKeySet, self, value);
+    }
+    
+    // aten::fill_.Scalar(Tensor(a!) self, Scalar value) -> Tensor(a!)
+    inline at::Tensor & fill_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & value) {
+        return at::_ops::fill__Scalar::redispatch(dispatchKeySet, self, value);
+    }
+    
+    // aten::fill_.Tensor(Tensor(a!) self, Tensor value) -> Tensor(a!)
+    inline at::Tensor & fill_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & value) {
+        return at::_ops::fill__Tensor::redispatch(dispatchKeySet, self, value);
+    }
+    
+    // aten::floor(Tensor self) -> Tensor
+    inline at::Tensor floor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::floor::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::floor_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & floor_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::floor_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::floor.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & floor_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::floor_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::floor.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & floor_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::floor_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::floor_divide(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor floor_divide(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::floor_divide::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & floor_divide_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::floor_divide__Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & floor_divide_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::floor_divide_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & floor_divide_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::floor_divide_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::floor_divide.Scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor floor_divide(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::floor_divide_Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::floor_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+    inline at::Tensor & floor_divide_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::floor_divide__Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::frac(Tensor self) -> Tensor
+    inline at::Tensor frac(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::frac::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::frac_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & frac_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::frac_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::frac.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & frac_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::frac_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::frac.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & frac_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::frac_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::full.names(int[] size, Scalar fill_value, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor full(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, const at::Scalar & fill_value, c10::optional<at::DimnameList> names, at::TensorOptions options={}) {
+        return at::_ops::full_names::redispatch(dispatchKeySet, size, fill_value, names, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::full.names(int[] size, Scalar fill_value, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor full(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, const at::Scalar & fill_value, c10::optional<at::DimnameList> names, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::full_names::redispatch(dispatchKeySet, size, fill_value, names, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::full(SymInt[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor full(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, const at::Scalar & fill_value, at::TensorOptions options={}) {
+        return at::_ops::full::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), fill_value, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::full(SymInt[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor full(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, const at::Scalar & fill_value, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::full::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), fill_value, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::full(SymInt[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor full_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, const at::Scalar & fill_value, at::TensorOptions options={}) {
+        return at::_ops::full::redispatch(dispatchKeySet, size, fill_value, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::full(SymInt[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor full_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, const at::Scalar & fill_value, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::full::redispatch(dispatchKeySet, size, fill_value, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::full.out(SymInt[] size, Scalar fill_value, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & full_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::IntArrayRef size, const at::Scalar & fill_value) {
+        return at::_ops::full_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), fill_value, out);
+    }
+    
+    // aten::full.out(SymInt[] size, Scalar fill_value, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & full_outf(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, const at::Scalar & fill_value, at::Tensor & out) {
+        return at::_ops::full_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), fill_value, out);
+    }
+    
+    // aten::full.out(SymInt[] size, Scalar fill_value, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & full_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, c10::SymIntArrayRef size, const at::Scalar & fill_value) {
+        return at::_ops::full_out::redispatch(dispatchKeySet, size, fill_value, out);
+    }
+    
+    // aten::full.out(SymInt[] size, Scalar fill_value, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & full_symint_outf(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, const at::Scalar & fill_value, at::Tensor & out) {
+        return at::_ops::full_out::redispatch(dispatchKeySet, size, fill_value, out);
+    }
+    
+    // aten::full_like(Tensor self, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor full_like(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & fill_value, at::TensorOptions options={}, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::full_like::redispatch(dispatchKeySet, self, fill_value, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
+    }
+    
+    // aten::full_like(Tensor self, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor full_like(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & fill_value, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
+        return at::_ops::full_like::redispatch(dispatchKeySet, self, fill_value, dtype, layout, device, pin_memory, memory_format);
+    }
+    
+    // aten::from_file(str filename, bool? shared=None, int? size=0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor from_file(c10::DispatchKeySet dispatchKeySet, c10::string_view filename, c10::optional<bool> shared=c10::nullopt, c10::optional<int64_t> size=0, at::TensorOptions options={}) {
+        return at::_ops::from_file::redispatch(dispatchKeySet, filename, shared, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::from_file(str filename, bool? shared=None, int? size=0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor from_file(c10::DispatchKeySet dispatchKeySet, c10::string_view filename, c10::optional<bool> shared, c10::optional<int64_t> size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::from_file::redispatch(dispatchKeySet, filename, shared, size, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::gcd.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & gcd_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::gcd_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::gcd.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & gcd_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::gcd_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::gcd(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor gcd(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::gcd::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::gcd_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & gcd_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::gcd_::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::lcm.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & lcm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::lcm_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::lcm.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & lcm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::lcm_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::lcm(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor lcm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::lcm::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::lcm_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & lcm_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::lcm_::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::grid_sampler(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor
+    inline at::Tensor grid_sampler(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners) {
+        return at::_ops::grid_sampler::redispatch(dispatchKeySet, input, grid, interpolation_mode, padding_mode, align_corners);
+    }
+    
+    // aten::grid_sampler_2d(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor
+    inline at::Tensor grid_sampler_2d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners) {
+        return at::_ops::grid_sampler_2d::redispatch(dispatchKeySet, input, grid, interpolation_mode, padding_mode, align_corners);
+    }
+    
+    // aten::grid_sampler_2d_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, bool[2] output_mask) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> grid_sampler_2d_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, ::std::array<bool,2> output_mask) {
+        return at::_ops::grid_sampler_2d_backward::redispatch(dispatchKeySet, grad_output, input, grid, interpolation_mode, padding_mode, align_corners, output_mask);
+    }
+    
+    // aten::_grid_sampler_2d_cpu_fallback(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor
+    inline at::Tensor _grid_sampler_2d_cpu_fallback(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners) {
+        return at::_ops::_grid_sampler_2d_cpu_fallback::redispatch(dispatchKeySet, input, grid, interpolation_mode, padding_mode, align_corners);
+    }
+    
+    // aten::_grid_sampler_2d_cpu_fallback_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> _grid_sampler_2d_cpu_fallback_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners) {
+        return at::_ops::_grid_sampler_2d_cpu_fallback_backward::redispatch(dispatchKeySet, grad_output, input, grid, interpolation_mode, padding_mode, align_corners);
+    }
+    
+    // aten::grid_sampler_3d(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor
+    inline at::Tensor grid_sampler_3d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners) {
+        return at::_ops::grid_sampler_3d::redispatch(dispatchKeySet, input, grid, interpolation_mode, padding_mode, align_corners);
+    }
+    
+    // aten::grid_sampler_3d_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, bool[2] output_mask) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> grid_sampler_3d_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, ::std::array<bool,2> output_mask) {
+        return at::_ops::grid_sampler_3d_backward::redispatch(dispatchKeySet, grad_output, input, grid, interpolation_mode, padding_mode, align_corners, output_mask);
+    }
+    
+    // aten::hann_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor hann_window(c10::DispatchKeySet dispatchKeySet, int64_t window_length, at::TensorOptions options={}) {
+        return at::_ops::hann_window::redispatch(dispatchKeySet, window_length, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::hann_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor hann_window(c10::DispatchKeySet dispatchKeySet, int64_t window_length, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::hann_window::redispatch(dispatchKeySet, window_length, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::hann_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor hann_window(c10::DispatchKeySet dispatchKeySet, int64_t window_length, bool periodic, at::TensorOptions options={}) {
+        return at::_ops::hann_window_periodic::redispatch(dispatchKeySet, window_length, periodic, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::hann_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor hann_window(c10::DispatchKeySet dispatchKeySet, int64_t window_length, bool periodic, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::hann_window_periodic::redispatch(dispatchKeySet, window_length, periodic, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::hamming_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor hamming_window(c10::DispatchKeySet dispatchKeySet, int64_t window_length, at::TensorOptions options={}) {
+        return at::_ops::hamming_window::redispatch(dispatchKeySet, window_length, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::hamming_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor hamming_window(c10::DispatchKeySet dispatchKeySet, int64_t window_length, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::hamming_window::redispatch(dispatchKeySet, window_length, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::hamming_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor hamming_window(c10::DispatchKeySet dispatchKeySet, int64_t window_length, bool periodic, at::TensorOptions options={}) {
+        return at::_ops::hamming_window_periodic::redispatch(dispatchKeySet, window_length, periodic, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::hamming_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor hamming_window(c10::DispatchKeySet dispatchKeySet, int64_t window_length, bool periodic, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::hamming_window_periodic::redispatch(dispatchKeySet, window_length, periodic, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::hamming_window.periodic_alpha(int window_length, bool periodic, float alpha, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor hamming_window(c10::DispatchKeySet dispatchKeySet, int64_t window_length, bool periodic, double alpha, at::TensorOptions options={}) {
+        return at::_ops::hamming_window_periodic_alpha::redispatch(dispatchKeySet, window_length, periodic, alpha, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::hamming_window.periodic_alpha(int window_length, bool periodic, float alpha, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor hamming_window(c10::DispatchKeySet dispatchKeySet, int64_t window_length, bool periodic, double alpha, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::hamming_window_periodic_alpha::redispatch(dispatchKeySet, window_length, periodic, alpha, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::hamming_window.periodic_alpha_beta(int window_length, bool periodic, float alpha, float beta, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor hamming_window(c10::DispatchKeySet dispatchKeySet, int64_t window_length, bool periodic, double alpha, double beta, at::TensorOptions options={}) {
+        return at::_ops::hamming_window_periodic_alpha_beta::redispatch(dispatchKeySet, window_length, periodic, alpha, beta, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::hamming_window.periodic_alpha_beta(int window_length, bool periodic, float alpha, float beta, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor hamming_window(c10::DispatchKeySet dispatchKeySet, int64_t window_length, bool periodic, double alpha, double beta, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::hamming_window_periodic_alpha_beta::redispatch(dispatchKeySet, window_length, periodic, alpha, beta, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::kaiser_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor kaiser_window(c10::DispatchKeySet dispatchKeySet, int64_t window_length, at::TensorOptions options={}) {
+        return at::_ops::kaiser_window::redispatch(dispatchKeySet, window_length, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::kaiser_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor kaiser_window(c10::DispatchKeySet dispatchKeySet, int64_t window_length, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::kaiser_window::redispatch(dispatchKeySet, window_length, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::kaiser_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor kaiser_window(c10::DispatchKeySet dispatchKeySet, int64_t window_length, bool periodic, at::TensorOptions options={}) {
+        return at::_ops::kaiser_window_periodic::redispatch(dispatchKeySet, window_length, periodic, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::kaiser_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor kaiser_window(c10::DispatchKeySet dispatchKeySet, int64_t window_length, bool periodic, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::kaiser_window_periodic::redispatch(dispatchKeySet, window_length, periodic, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::kaiser_window.beta(int window_length, bool periodic, float beta, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor kaiser_window(c10::DispatchKeySet dispatchKeySet, int64_t window_length, bool periodic, double beta, at::TensorOptions options={}) {
+        return at::_ops::kaiser_window_beta::redispatch(dispatchKeySet, window_length, periodic, beta, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::kaiser_window.beta(int window_length, bool periodic, float beta, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor kaiser_window(c10::DispatchKeySet dispatchKeySet, int64_t window_length, bool periodic, double beta, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::kaiser_window_beta::redispatch(dispatchKeySet, window_length, periodic, beta, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::hinge_embedding_loss(Tensor self, Tensor target, float margin=1.0, int reduction=Mean) -> Tensor
+    inline at::Tensor hinge_embedding_loss(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, double margin=1.0, int64_t reduction=at::Reduction::Mean) {
+        return at::_ops::hinge_embedding_loss::redispatch(dispatchKeySet, self, target, margin, reduction);
+    }
+    
+    // aten::group_norm(Tensor input, int num_groups, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enabled=True) -> Tensor
+    inline at::Tensor group_norm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, int64_t num_groups, const c10::optional<at::Tensor> & weight={}, const c10::optional<at::Tensor> & bias={}, double eps=1e-05, bool cudnn_enabled=true) {
+        return at::_ops::group_norm::redispatch(dispatchKeySet, input, num_groups, weight, bias, eps, cudnn_enabled);
+    }
+    
+    // aten::native_group_norm(Tensor input, Tensor? weight, Tensor? bias, SymInt N, SymInt C, SymInt HxW, int group, float eps) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> native_group_norm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, int64_t N, int64_t C, int64_t HxW, int64_t group, double eps) {
+        return at::_ops::native_group_norm::redispatch(dispatchKeySet, input, weight, bias, N, C, HxW, group, eps);
+    }
+    
+    // aten::native_group_norm(Tensor input, Tensor? weight, Tensor? bias, SymInt N, SymInt C, SymInt HxW, int group, float eps) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> native_group_norm_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, c10::SymInt N, c10::SymInt C, c10::SymInt HxW, int64_t group, double eps) {
+        return at::_ops::native_group_norm::redispatch(dispatchKeySet, input, weight, bias, N, C, HxW, group, eps);
+    }
+    
+    // aten::native_group_norm_backward(Tensor grad_out, Tensor input, Tensor mean, Tensor rstd, Tensor? weight, SymInt N, SymInt C, SymInt HxW, int group, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> native_group_norm_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, int64_t N, int64_t C, int64_t HxW, int64_t group, ::std::array<bool,3> output_mask) {
+        return at::_ops::native_group_norm_backward::redispatch(dispatchKeySet, grad_out, input, mean, rstd, weight, N, C, HxW, group, output_mask);
+    }
+    
+    // aten::native_group_norm_backward(Tensor grad_out, Tensor input, Tensor mean, Tensor rstd, Tensor? weight, SymInt N, SymInt C, SymInt HxW, int group, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> native_group_norm_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, c10::SymInt N, c10::SymInt C, c10::SymInt HxW, int64_t group, ::std::array<bool,3> output_mask) {
+        return at::_ops::native_group_norm_backward::redispatch(dispatchKeySet, grad_out, input, mean, rstd, weight, N, C, HxW, group, output_mask);
+    }
+    
+    // aten::_fft_r2c(Tensor self, int[] dim, int normalization, bool onesided) -> Tensor
+    inline at::Tensor _fft_r2c(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, bool onesided) {
+        return at::_ops::_fft_r2c::redispatch(dispatchKeySet, self, dim, normalization, onesided);
+    }
+    
+    // aten::_fft_r2c.out(Tensor self, int[] dim, int normalization, bool onesided, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _fft_r2c_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, bool onesided) {
+        return at::_ops::_fft_r2c_out::redispatch(dispatchKeySet, self, dim, normalization, onesided, out);
+    }
+    
+    // aten::_fft_r2c.out(Tensor self, int[] dim, int normalization, bool onesided, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _fft_r2c_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, bool onesided, at::Tensor & out) {
+        return at::_ops::_fft_r2c_out::redispatch(dispatchKeySet, self, dim, normalization, onesided, out);
+    }
+    
+    // aten::_fft_c2r(Tensor self, int[] dim, int normalization, SymInt last_dim_size) -> Tensor
+    inline at::Tensor _fft_c2r(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, int64_t last_dim_size) {
+        return at::_ops::_fft_c2r::redispatch(dispatchKeySet, self, dim, normalization, last_dim_size);
+    }
+    
+    // aten::_fft_c2r(Tensor self, int[] dim, int normalization, SymInt last_dim_size) -> Tensor
+    inline at::Tensor _fft_c2r_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, c10::SymInt last_dim_size) {
+        return at::_ops::_fft_c2r::redispatch(dispatchKeySet, self, dim, normalization, last_dim_size);
+    }
+    
+    // aten::_fft_c2r.out(Tensor self, int[] dim, int normalization, SymInt last_dim_size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _fft_c2r_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, int64_t last_dim_size) {
+        return at::_ops::_fft_c2r_out::redispatch(dispatchKeySet, self, dim, normalization, last_dim_size, out);
+    }
+    
+    // aten::_fft_c2r.out(Tensor self, int[] dim, int normalization, SymInt last_dim_size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _fft_c2r_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, int64_t last_dim_size, at::Tensor & out) {
+        return at::_ops::_fft_c2r_out::redispatch(dispatchKeySet, self, dim, normalization, last_dim_size, out);
+    }
+    
+    // aten::_fft_c2r.out(Tensor self, int[] dim, int normalization, SymInt last_dim_size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _fft_c2r_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, c10::SymInt last_dim_size) {
+        return at::_ops::_fft_c2r_out::redispatch(dispatchKeySet, self, dim, normalization, last_dim_size, out);
+    }
+    
+    // aten::_fft_c2r.out(Tensor self, int[] dim, int normalization, SymInt last_dim_size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _fft_c2r_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, c10::SymInt last_dim_size, at::Tensor & out) {
+        return at::_ops::_fft_c2r_out::redispatch(dispatchKeySet, self, dim, normalization, last_dim_size, out);
+    }
+    
+    // aten::_fft_c2c(Tensor self, SymInt[] dim, int normalization, bool forward) -> Tensor
+    inline at::Tensor _fft_c2c(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, bool forward) {
+        return at::_ops::_fft_c2c::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(dim), normalization, forward);
+    }
+    
+    // aten::_fft_c2c(Tensor self, SymInt[] dim, int normalization, bool forward) -> Tensor
+    inline at::Tensor _fft_c2c_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef dim, int64_t normalization, bool forward) {
+        return at::_ops::_fft_c2c::redispatch(dispatchKeySet, self, dim, normalization, forward);
+    }
+    
+    // aten::_fft_c2c.out(Tensor self, SymInt[] dim, int normalization, bool forward, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _fft_c2c_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, bool forward) {
+        return at::_ops::_fft_c2c_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(dim), normalization, forward, out);
+    }
+    
+    // aten::_fft_c2c.out(Tensor self, SymInt[] dim, int normalization, bool forward, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _fft_c2c_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, bool forward, at::Tensor & out) {
+        return at::_ops::_fft_c2c_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(dim), normalization, forward, out);
+    }
+    
+    // aten::_fft_c2c.out(Tensor self, SymInt[] dim, int normalization, bool forward, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _fft_c2c_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef dim, int64_t normalization, bool forward) {
+        return at::_ops::_fft_c2c_out::redispatch(dispatchKeySet, self, dim, normalization, forward, out);
+    }
+    
+    // aten::_fft_c2c.out(Tensor self, SymInt[] dim, int normalization, bool forward, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _fft_c2c_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef dim, int64_t normalization, bool forward, at::Tensor & out) {
+        return at::_ops::_fft_c2c_out::redispatch(dispatchKeySet, self, dim, normalization, forward, out);
+    }
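+    
+    // For schema arguments declared as SymInt[], the IntArrayRef overloads convert their
+    // sizes via c10::fromIntArrayRefSlow before redispatching, while the *_symint overloads
+    // pass the c10::SymIntArrayRef through unchanged.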
+    
+    // aten::_validate_compressed_sparse_indices(bool is_crow, Tensor compressed_idx, Tensor plain_idx, int cdim, int dim, int nnz) -> ()
+    inline void _validate_compressed_sparse_indices(c10::DispatchKeySet dispatchKeySet, bool is_crow, const at::Tensor & compressed_idx, const at::Tensor & plain_idx, int64_t cdim, int64_t dim, int64_t nnz) {
+        return at::_ops::_validate_compressed_sparse_indices::redispatch(dispatchKeySet, is_crow, compressed_idx, plain_idx, cdim, dim, nnz);
+    }
+    
+    // aten::_cufft_get_plan_cache_size(DeviceIndex device_index) -> int
+    inline int64_t _cufft_get_plan_cache_size(c10::DispatchKeySet dispatchKeySet, at::DeviceIndex device_index) {
+        return at::_ops::_cufft_get_plan_cache_size::redispatch(dispatchKeySet, device_index);
+    }
+    
+    // aten::_cufft_get_plan_cache_max_size(DeviceIndex device_index) -> int
+    inline int64_t _cufft_get_plan_cache_max_size(c10::DispatchKeySet dispatchKeySet, at::DeviceIndex device_index) {
+        return at::_ops::_cufft_get_plan_cache_max_size::redispatch(dispatchKeySet, device_index);
+    }
+    
+    // aten::_cufft_set_plan_cache_max_size(DeviceIndex device_index, int max_size) -> ()
+    inline void _cufft_set_plan_cache_max_size(c10::DispatchKeySet dispatchKeySet, at::DeviceIndex device_index, int64_t max_size) {
+        return at::_ops::_cufft_set_plan_cache_max_size::redispatch(dispatchKeySet, device_index, max_size);
+    }
+    
+    // aten::_cufft_clear_plan_cache(DeviceIndex device_index) -> ()
+    inline void _cufft_clear_plan_cache(c10::DispatchKeySet dispatchKeySet, at::DeviceIndex device_index) {
+        return at::_ops::_cufft_clear_plan_cache::redispatch(dispatchKeySet, device_index);
+    }
+    
+    // aten::index.Tensor(Tensor self, Tensor?[] indices) -> Tensor
+    inline at::Tensor index(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::List<c10::optional<at::Tensor>> & indices) {
+        return at::_ops::index_Tensor::redispatch(dispatchKeySet, self, indices);
+    }
+    
+    // aten::index.Tensor_out(Tensor self, Tensor?[] indices, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & index_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const c10::List<c10::optional<at::Tensor>> & indices) {
+        return at::_ops::index_Tensor_out::redispatch(dispatchKeySet, self, indices, out);
+    }
+    
+    // aten::index.Tensor_out(Tensor self, Tensor?[] indices, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & index_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::List<c10::optional<at::Tensor>> & indices, at::Tensor & out) {
+        return at::_ops::index_Tensor_out::redispatch(dispatchKeySet, self, indices, out);
+    }
+    
+    // aten::_unsafe_index.Tensor(Tensor self, Tensor?[] indices) -> Tensor
+    inline at::Tensor _unsafe_index(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::List<c10::optional<at::Tensor>> & indices) {
+        return at::_ops::_unsafe_index_Tensor::redispatch(dispatchKeySet, self, indices);
+    }
+    
+    // aten::index_copy.out(Tensor self, int dim, Tensor index, Tensor source, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & index_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source) {
+        return at::_ops::index_copy_out::redispatch(dispatchKeySet, self, dim, index, source, out);
+    }
+    
+    // aten::index_copy.out(Tensor self, int dim, Tensor index, Tensor source, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & index_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source, at::Tensor & out) {
+        return at::_ops::index_copy_out::redispatch(dispatchKeySet, self, dim, index, source, out);
+    }
+    
+    // aten::index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!)
+    inline at::Tensor & index_copy_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source) {
+        return at::_ops::index_copy_::redispatch(dispatchKeySet, self, dim, index, source);
+    }
+    
+    // aten::index_copy(Tensor self, int dim, Tensor index, Tensor source) -> Tensor
+    inline at::Tensor index_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source) {
+        return at::_ops::index_copy::redispatch(dispatchKeySet, self, dim, index, source);
+    }
+    
+    // aten::index_copy_.dimname(Tensor(a!) self, Dimname dim, Tensor index, Tensor source) -> Tensor(a!)
+    inline at::Tensor & index_copy_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, at::Dimname dim, const at::Tensor & index, const at::Tensor & source) {
+        return at::_ops::index_copy__dimname::redispatch(dispatchKeySet, self, dim, index, source);
+    }
+    
+    // aten::index_copy.dimname(Tensor self, Dimname dim, Tensor index, Tensor source) -> Tensor
+    inline at::Tensor index_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, const at::Tensor & index, const at::Tensor & source) {
+        return at::_ops::index_copy_dimname::redispatch(dispatchKeySet, self, dim, index, source);
+    }
+    
+    // aten::index_put_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor(a!)
+    inline at::Tensor & index_put_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const c10::List<c10::optional<at::Tensor>> & indices, const at::Tensor & values, bool accumulate=false) {
+        return at::_ops::index_put_::redispatch(dispatchKeySet, self, indices, values, accumulate);
+    }
+    
+    // aten::index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor
+    inline at::Tensor index_put(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::List<c10::optional<at::Tensor>> & indices, const at::Tensor & values, bool accumulate=false) {
+        return at::_ops::index_put::redispatch(dispatchKeySet, self, indices, values, accumulate);
+    }
+    
+    // aten::_unsafe_index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor
+    inline at::Tensor _unsafe_index_put(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::List<c10::optional<at::Tensor>> & indices, const at::Tensor & values, bool accumulate=false) {
+        return at::_ops::_unsafe_index_put::redispatch(dispatchKeySet, self, indices, values, accumulate);
+    }
+    
+    // aten::_index_put_impl_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor(a!)
+    inline at::Tensor & _index_put_impl_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const c10::List<c10::optional<at::Tensor>> & indices, const at::Tensor & values, bool accumulate=false, bool unsafe=false) {
+        return at::_ops::_index_put_impl_::redispatch(dispatchKeySet, self, indices, values, accumulate, unsafe);
+    }
+    
+    // aten::instance_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool use_input_stats, float momentum, float eps, bool cudnn_enabled) -> Tensor
+    inline at::Tensor instance_norm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, bool use_input_stats, double momentum, double eps, bool cudnn_enabled) {
+        return at::_ops::instance_norm::redispatch(dispatchKeySet, input, weight, bias, running_mean, running_var, use_input_stats, momentum, eps, cudnn_enabled);
+    }
+    
+    // aten::isclose(Tensor self, Tensor other, float rtol=1e-05, float atol=1e-08, bool equal_nan=False) -> Tensor
+    inline at::Tensor isclose(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, double rtol=1e-05, double atol=1e-08, bool equal_nan=false) {
+        return at::_ops::isclose::redispatch(dispatchKeySet, self, other, rtol, atol, equal_nan);
+    }
+    
+    // aten::isin.Tensor_Tensor_out(Tensor elements, Tensor test_elements, *, bool assume_unique=False, bool invert=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & isin_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & elements, const at::Tensor & test_elements, bool assume_unique=false, bool invert=false) {
+        return at::_ops::isin_Tensor_Tensor_out::redispatch(dispatchKeySet, elements, test_elements, assume_unique, invert, out);
+    }
+    
+    // aten::isin.Tensor_Tensor_out(Tensor elements, Tensor test_elements, *, bool assume_unique=False, bool invert=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & isin_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & elements, const at::Tensor & test_elements, bool assume_unique, bool invert, at::Tensor & out) {
+        return at::_ops::isin_Tensor_Tensor_out::redispatch(dispatchKeySet, elements, test_elements, assume_unique, invert, out);
+    }
+    
+    // aten::isin.Tensor_Tensor(Tensor elements, Tensor test_elements, *, bool assume_unique=False, bool invert=False) -> Tensor
+    inline at::Tensor isin(c10::DispatchKeySet dispatchKeySet, const at::Tensor & elements, const at::Tensor & test_elements, bool assume_unique=false, bool invert=false) {
+        return at::_ops::isin_Tensor_Tensor::redispatch(dispatchKeySet, elements, test_elements, assume_unique, invert);
+    }
+    
+    // aten::isin.Tensor_Scalar_out(Tensor elements, Scalar test_element, *, bool assume_unique=False, bool invert=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & isin_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & elements, const at::Scalar & test_element, bool assume_unique=false, bool invert=false) {
+        return at::_ops::isin_Tensor_Scalar_out::redispatch(dispatchKeySet, elements, test_element, assume_unique, invert, out);
+    }
+    
+    // aten::isin.Tensor_Scalar_out(Tensor elements, Scalar test_element, *, bool assume_unique=False, bool invert=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & isin_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & elements, const at::Scalar & test_element, bool assume_unique, bool invert, at::Tensor & out) {
+        return at::_ops::isin_Tensor_Scalar_out::redispatch(dispatchKeySet, elements, test_element, assume_unique, invert, out);
+    }
+    
+    // aten::isin.Tensor_Scalar(Tensor elements, Scalar test_element, *, bool assume_unique=False, bool invert=False) -> Tensor
+    inline at::Tensor isin(c10::DispatchKeySet dispatchKeySet, const at::Tensor & elements, const at::Scalar & test_element, bool assume_unique=false, bool invert=false) {
+        return at::_ops::isin_Tensor_Scalar::redispatch(dispatchKeySet, elements, test_element, assume_unique, invert);
+    }
+    
+    // aten::isin.Scalar_Tensor_out(Scalar element, Tensor test_elements, *, bool assume_unique=False, bool invert=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & isin_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & element, const at::Tensor & test_elements, bool assume_unique=false, bool invert=false) {
+        return at::_ops::isin_Scalar_Tensor_out::redispatch(dispatchKeySet, element, test_elements, assume_unique, invert, out);
+    }
+    
+    // aten::isin.Scalar_Tensor_out(Scalar element, Tensor test_elements, *, bool assume_unique=False, bool invert=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & isin_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & element, const at::Tensor & test_elements, bool assume_unique, bool invert, at::Tensor & out) {
+        return at::_ops::isin_Scalar_Tensor_out::redispatch(dispatchKeySet, element, test_elements, assume_unique, invert, out);
+    }
+    
+    // aten::isin.Scalar_Tensor(Scalar element, Tensor test_elements, *, bool assume_unique=False, bool invert=False) -> Tensor
+    inline at::Tensor isin(c10::DispatchKeySet dispatchKeySet, const at::Scalar & element, const at::Tensor & test_elements, bool assume_unique=false, bool invert=false) {
+        return at::_ops::isin_Scalar_Tensor::redispatch(dispatchKeySet, element, test_elements, assume_unique, invert);
+    }
+    
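+    // The three isin overload families above cover the Tensor_Tensor, Tensor_Scalar and
+    // Scalar_Tensor schema variants; which at::_ops entry is reached depends only on the
+    // argument types. A hedged caller-side sketch, assuming the public at::isin wrapper
+    // (illustrative only, not a declaration of this header):
+    //
+    //   at::Tensor elements = at::arange(6);                  // [0, 1, 2, 3, 4, 5]
+    //   at::Tensor pool     = at::arange(0, 6, 2);            // [0, 2, 4]
+    //   at::Tensor mask = at::isin(elements, pool);           // Tensor_Tensor variant
+    //   at::Tensor eq3  = at::isin(elements, at::Scalar(3));  // Tensor_Scalar variant
+    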
+    // aten::isnan(Tensor self) -> Tensor
+    inline at::Tensor isnan(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::isnan::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::is_distributed(Tensor self) -> bool
+    inline bool is_distributed(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::is_distributed::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::is_floating_point(Tensor self) -> bool
+    inline bool __dispatch_is_floating_point(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::is_floating_point::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::is_complex(Tensor self) -> bool
+    inline bool __dispatch_is_complex(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::is_complex::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::is_conj(Tensor self) -> bool
+    inline bool __dispatch_is_conj(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::is_conj::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_is_zerotensor(Tensor self) -> bool
+    inline bool __dispatch__is_zerotensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_is_zerotensor::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::is_neg(Tensor self) -> bool
+    inline bool __dispatch_is_neg(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::is_neg::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::isreal(Tensor self) -> Tensor
+    inline at::Tensor isreal(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::isreal::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::is_nonzero(Tensor self) -> bool
+    inline bool is_nonzero(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::is_nonzero::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::is_same_size(Tensor self, Tensor other) -> bool
+    inline bool is_same_size(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::is_same_size::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::is_signed(Tensor self) -> bool
+    inline bool __dispatch_is_signed(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::is_signed::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::is_inference(Tensor self) -> bool
+    inline bool __dispatch_is_inference(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::is_inference::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::kl_div(Tensor self, Tensor target, int reduction=Mean, *, bool log_target=False) -> Tensor
+    inline at::Tensor kl_div(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, int64_t reduction=at::Reduction::Mean, bool log_target=false) {
+        return at::_ops::kl_div::redispatch(dispatchKeySet, self, target, reduction, log_target);
+    }
+    
+    // aten::kron(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor kron(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::kron::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::kron.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & kron_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::kron_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::kron.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & kron_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::kron_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::kthvalue(Tensor self, int k, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)
+    inline ::std::tuple<at::Tensor,at::Tensor> kthvalue(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t k, int64_t dim=-1, bool keepdim=false) {
+        return at::_ops::kthvalue::redispatch(dispatchKeySet, self, k, dim, keepdim);
+    }
+    
+    // aten::kthvalue.values(Tensor self, int k, int dim=-1, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> kthvalue_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & values, at::Tensor & indices, const at::Tensor & self, int64_t k, int64_t dim=-1, bool keepdim=false) {
+        return at::_ops::kthvalue_values::redispatch(dispatchKeySet, self, k, dim, keepdim, values, indices);
+    }
+    
+    // aten::kthvalue.values(Tensor self, int k, int dim=-1, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> kthvalue_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t k, int64_t dim, bool keepdim, at::Tensor & values, at::Tensor & indices) {
+        return at::_ops::kthvalue_values::redispatch(dispatchKeySet, self, k, dim, keepdim, values, indices);
+    }
+    
+    // aten::kthvalue.dimname(Tensor self, int k, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
+    inline ::std::tuple<at::Tensor,at::Tensor> kthvalue(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t k, at::Dimname dim, bool keepdim=false) {
+        return at::_ops::kthvalue_dimname::redispatch(dispatchKeySet, self, k, dim, keepdim);
+    }
+    
+    // aten::kthvalue.dimname_out(Tensor self, int k, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> kthvalue_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & values, at::Tensor & indices, const at::Tensor & self, int64_t k, at::Dimname dim, bool keepdim=false) {
+        return at::_ops::kthvalue_dimname_out::redispatch(dispatchKeySet, self, k, dim, keepdim, values, indices);
+    }
+    
+    // aten::kthvalue.dimname_out(Tensor self, int k, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> kthvalue_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t k, at::Dimname dim, bool keepdim, at::Tensor & values, at::Tensor & indices) {
+        return at::_ops::kthvalue_dimname_out::redispatch(dispatchKeySet, self, k, dim, keepdim, values, indices);
+    }
+    
+    // aten::layer_norm(Tensor input, SymInt[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor
+    inline at::Tensor layer_norm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::IntArrayRef normalized_shape, const c10::optional<at::Tensor> & weight={}, const c10::optional<at::Tensor> & bias={}, double eps=1e-05, bool cudnn_enable=true) {
+        return at::_ops::layer_norm::redispatch(dispatchKeySet, input, c10::fromIntArrayRefSlow(normalized_shape), weight, bias, eps, cudnn_enable);
+    }
+    
+    // aten::layer_norm(Tensor input, SymInt[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor
+    inline at::Tensor layer_norm_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const c10::optional<at::Tensor> & weight={}, const c10::optional<at::Tensor> & bias={}, double eps=1e-05, bool cudnn_enable=true) {
+        return at::_ops::layer_norm::redispatch(dispatchKeySet, input, normalized_shape, weight, bias, eps, cudnn_enable);
+    }
+    
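+    // The `_symint` wrapper above is the symbolic-shape twin of the plain overload: the
+    // IntArrayRef form converts concrete sizes via c10::fromIntArrayRefSlow before
+    // redispatching, while the SymIntArrayRef form forwards symbolic sizes untouched.
+    // A hedged usage sketch with the public at::layer_norm wrapper (illustrative only):
+    //
+    //   at::Tensor x = at::randn({8, 16});
+    //   at::Tensor y = at::layer_norm(x, /*normalized_shape=*/{16});
+    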
+    // aten::native_layer_norm(Tensor input, SymInt[] normalized_shape, Tensor? weight, Tensor? bias, float eps) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> native_layer_norm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::IntArrayRef normalized_shape, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, double eps) {
+        return at::_ops::native_layer_norm::redispatch(dispatchKeySet, input, c10::fromIntArrayRefSlow(normalized_shape), weight, bias, eps);
+    }
+    
+    // aten::native_layer_norm(Tensor input, SymInt[] normalized_shape, Tensor? weight, Tensor? bias, float eps) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> native_layer_norm_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, double eps) {
+        return at::_ops::native_layer_norm::redispatch(dispatchKeySet, input, normalized_shape, weight, bias, eps);
+    }
+    
+    // aten::native_layer_norm_backward(Tensor grad_out, Tensor input, SymInt[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> native_layer_norm_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_out, const at::Tensor & input, at::IntArrayRef normalized_shape, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, ::std::array<bool,3> output_mask) {
+        return at::_ops::native_layer_norm_backward::redispatch(dispatchKeySet, grad_out, input, c10::fromIntArrayRefSlow(normalized_shape), mean, rstd, weight, bias, output_mask);
+    }
+    
+    // aten::native_layer_norm_backward(Tensor grad_out, Tensor input, SymInt[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> native_layer_norm_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_out, const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, ::std::array<bool,3> output_mask) {
+        return at::_ops::native_layer_norm_backward::redispatch(dispatchKeySet, grad_out, input, normalized_shape, mean, rstd, weight, bias, output_mask);
+    }
+    
+    // aten::nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor
+    inline at::Tensor nan_to_num(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<double> nan=c10::nullopt, c10::optional<double> posinf=c10::nullopt, c10::optional<double> neginf=c10::nullopt) {
+        return at::_ops::nan_to_num::redispatch(dispatchKeySet, self, nan, posinf, neginf);
+    }
+    
+    // aten::nan_to_num_(Tensor(a!) self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor(a!)
+    inline at::Tensor & nan_to_num_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, c10::optional<double> nan=c10::nullopt, c10::optional<double> posinf=c10::nullopt, c10::optional<double> neginf=c10::nullopt) {
+        return at::_ops::nan_to_num_::redispatch(dispatchKeySet, self, nan, posinf, neginf);
+    }
+    
+    // aten::nan_to_num.out(Tensor self, float? nan=None, float? posinf=None, float? neginf=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & nan_to_num_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<double> nan=c10::nullopt, c10::optional<double> posinf=c10::nullopt, c10::optional<double> neginf=c10::nullopt) {
+        return at::_ops::nan_to_num_out::redispatch(dispatchKeySet, self, nan, posinf, neginf, out);
+    }
+    
+    // aten::nan_to_num.out(Tensor self, float? nan=None, float? posinf=None, float? neginf=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & nan_to_num_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<double> nan, c10::optional<double> posinf, c10::optional<double> neginf, at::Tensor & out) {
+        return at::_ops::nan_to_num_out::redispatch(dispatchKeySet, self, nan, posinf, neginf, out);
+    }
+    
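+    // nan_to_num takes three independent optional replacement values; passing c10::nullopt
+    // keeps the default substitution (0.0 for NaN, the dtype's largest/smallest finite value
+    // for +inf/-inf). A hedged sketch with the public at::nan_to_num wrapper (illustrative
+    // only; assumes <cmath> for NAN and INFINITY):
+    //
+    //   at::Tensor t = at::tensor({1.0f, NAN, INFINITY});
+    //   at::Tensor cleaned = at::nan_to_num(t, /*nan=*/0.0, /*posinf=*/1e9);
+    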
+    // aten::linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor
+    inline at::Tensor linear(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias={}) {
+        return at::_ops::linear::redispatch(dispatchKeySet, input, weight, bias);
+    }
+    
+    // aten::linear_backward(Tensor self, Tensor grad_output, Tensor weight, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> linear_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, ::std::array<bool,3> output_mask) {
+        return at::_ops::linear_backward::redispatch(dispatchKeySet, self, grad_output, weight, output_mask);
+    }
+    
+    // aten::linear.out(Tensor input, Tensor weight, Tensor? bias=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linear_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias={}) {
+        return at::_ops::linear_out::redispatch(dispatchKeySet, input, weight, bias, out);
+    }
+    
+    // aten::linear.out(Tensor input, Tensor weight, Tensor? bias=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linear_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::Tensor & out) {
+        return at::_ops::linear_out::redispatch(dispatchKeySet, input, weight, bias, out);
+    }
+    
+    // aten::mkldnn_linear(Tensor self, Tensor weight, Tensor? bias=None) -> Tensor
+    inline at::Tensor mkldnn_linear(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias={}) {
+        return at::_ops::mkldnn_linear::redispatch(dispatchKeySet, self, weight, bias);
+    }
+    
+    // aten::mkldnn_linear_backward_input(int[] input_size, Tensor grad_output, Tensor weight) -> Tensor
+    inline at::Tensor mkldnn_linear_backward_input(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef input_size, const at::Tensor & grad_output, const at::Tensor & weight) {
+        return at::_ops::mkldnn_linear_backward_input::redispatch(dispatchKeySet, input_size, grad_output, weight);
+    }
+    
+    // aten::mkldnn_linear_backward_weights(Tensor grad_output, Tensor input, Tensor weight, bool bias_defined) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> mkldnn_linear_backward_weights(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, bool bias_defined) {
+        return at::_ops::mkldnn_linear_backward_weights::redispatch(dispatchKeySet, grad_output, input, weight, bias_defined);
+    }
+    
+    // aten::mkldnn_linear_backward(Tensor self, Tensor grad_output, Tensor weight, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> mkldnn_linear_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, ::std::array<bool,3> output_mask) {
+        return at::_ops::mkldnn_linear_backward::redispatch(dispatchKeySet, self, grad_output, weight, output_mask);
+    }
+    
+    // aten::_cslt_compress(Tensor input) -> Tensor
+    inline at::Tensor _cslt_compress(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input) {
+        return at::_ops::_cslt_compress::redispatch(dispatchKeySet, input);
+    }
+    
+    // aten::_cslt_sparse_mm(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? out_dtype=None, bool transpose_result=False, int alg_id=0) -> Tensor
+    inline at::Tensor _cslt_sparse_mm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & compressed_A, const at::Tensor & dense_B, const c10::optional<at::Tensor> & bias={}, const c10::optional<at::Tensor> & alpha={}, c10::optional<at::ScalarType> out_dtype=c10::nullopt, bool transpose_result=false, int64_t alg_id=0) {
+        return at::_ops::_cslt_sparse_mm::redispatch(dispatchKeySet, compressed_A, dense_B, bias, alpha, out_dtype, transpose_result, alg_id);
+    }
+    
+    // aten::_cslt_sparse_mm_search(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? out_dtype=None, bool transpose_result=False) -> int
+    inline int64_t _cslt_sparse_mm_search(c10::DispatchKeySet dispatchKeySet, const at::Tensor & compressed_A, const at::Tensor & dense_B, const c10::optional<at::Tensor> & bias={}, const c10::optional<at::Tensor> & alpha={}, c10::optional<at::ScalarType> out_dtype=c10::nullopt, bool transpose_result=false) {
+        return at::_ops::_cslt_sparse_mm_search::redispatch(dispatchKeySet, compressed_A, dense_B, bias, alpha, out_dtype, transpose_result);
+    }
+    
+    // aten::_sparse_semi_structured_linear(Tensor input, Tensor weight, Tensor meta, *, Tensor? bias=None, str? activation=None, ScalarType? out_dtype=None) -> Tensor
+    inline at::Tensor _sparse_semi_structured_linear(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const at::Tensor & meta, const c10::optional<at::Tensor> & bias={}, c10::optional<c10::string_view> activation=c10::nullopt, c10::optional<at::ScalarType> out_dtype=c10::nullopt) {
+        return at::_ops::_sparse_semi_structured_linear::redispatch(dispatchKeySet, input, weight, meta, bias, activation, out_dtype);
+    }
+    
+    // aten::_mixed_dtypes_linear(Tensor input, Tensor weight, Tensor scale, *, Tensor? bias=None, str? activation=None) -> Tensor
+    inline at::Tensor _mixed_dtypes_linear(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const at::Tensor & scale, const c10::optional<at::Tensor> & bias={}, c10::optional<c10::string_view> activation=c10::nullopt) {
+        return at::_ops::_mixed_dtypes_linear::redispatch(dispatchKeySet, input, weight, scale, bias, activation);
+    }
+    
+    // aten::fbgemm_linear_int8_weight_fp32_activation(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor
+    inline at::Tensor fbgemm_linear_int8_weight_fp32_activation(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const at::Tensor & packed, const at::Tensor & col_offsets, const at::Scalar & weight_scale, const at::Scalar & weight_zero_point, const at::Tensor & bias) {
+        return at::_ops::fbgemm_linear_int8_weight_fp32_activation::redispatch(dispatchKeySet, input, weight, packed, col_offsets, weight_scale, weight_zero_point, bias);
+    }
+    
+    // aten::fbgemm_linear_int8_weight(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor
+    inline at::Tensor fbgemm_linear_int8_weight(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const at::Tensor & packed, const at::Tensor & col_offsets, const at::Scalar & weight_scale, const at::Scalar & weight_zero_point, const at::Tensor & bias) {
+        return at::_ops::fbgemm_linear_int8_weight::redispatch(dispatchKeySet, input, weight, packed, col_offsets, weight_scale, weight_zero_point, bias);
+    }
+    
+    // aten::fbgemm_linear_quantize_weight(Tensor input) -> (Tensor, Tensor, float, int)
+    inline ::std::tuple<at::Tensor,at::Tensor,double,int64_t> fbgemm_linear_quantize_weight(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input) {
+        return at::_ops::fbgemm_linear_quantize_weight::redispatch(dispatchKeySet, input);
+    }
+    
+    // aten::fbgemm_pack_gemm_matrix_fp16(Tensor input) -> Tensor
+    inline at::Tensor fbgemm_pack_gemm_matrix_fp16(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input) {
+        return at::_ops::fbgemm_pack_gemm_matrix_fp16::redispatch(dispatchKeySet, input);
+    }
+    
+    // aten::fbgemm_linear_fp16_weight_fp32_activation(Tensor input, Tensor packed_weight, Tensor bias) -> Tensor
+    inline at::Tensor fbgemm_linear_fp16_weight_fp32_activation(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & packed_weight, const at::Tensor & bias) {
+        return at::_ops::fbgemm_linear_fp16_weight_fp32_activation::redispatch(dispatchKeySet, input, packed_weight, bias);
+    }
+    
+    // aten::fbgemm_linear_fp16_weight(Tensor input, Tensor packed_weight, Tensor bias) -> Tensor
+    inline at::Tensor fbgemm_linear_fp16_weight(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & packed_weight, const at::Tensor & bias) {
+        return at::_ops::fbgemm_linear_fp16_weight::redispatch(dispatchKeySet, input, packed_weight, bias);
+    }
+    
+    // aten::fbgemm_pack_quantized_matrix(Tensor input) -> Tensor
+    inline at::Tensor fbgemm_pack_quantized_matrix(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input) {
+        return at::_ops::fbgemm_pack_quantized_matrix::redispatch(dispatchKeySet, input);
+    }
+    
+    // aten::fbgemm_pack_quantized_matrix.KN(Tensor input, int K, int N) -> Tensor
+    inline at::Tensor fbgemm_pack_quantized_matrix(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, int64_t K, int64_t N) {
+        return at::_ops::fbgemm_pack_quantized_matrix_KN::redispatch(dispatchKeySet, input, K, N);
+    }
+    
+    // aten::ldexp.Tensor(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor ldexp(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::ldexp_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::ldexp_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & ldexp_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::ldexp_::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::ldexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & ldexp_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::ldexp_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::ldexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & ldexp_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::ldexp_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::linspace(Scalar start, Scalar end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor linspace(c10::DispatchKeySet dispatchKeySet, const at::Scalar & start, const at::Scalar & end, int64_t steps, at::TensorOptions options={}) {
+        return at::_ops::linspace::redispatch(dispatchKeySet, start, end, steps, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::linspace(Scalar start, Scalar end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor linspace(c10::DispatchKeySet dispatchKeySet, const at::Scalar & start, const at::Scalar & end, int64_t steps, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::linspace::redispatch(dispatchKeySet, start, end, steps, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::linspace.Tensor_Tensor(Tensor start, Tensor end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor linspace(c10::DispatchKeySet dispatchKeySet, const at::Tensor & start, const at::Tensor & end, int64_t steps, at::TensorOptions options={}) {
+        return at::_ops::linspace_Tensor_Tensor::redispatch(dispatchKeySet, start, end, steps, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::linspace.Tensor_Tensor(Tensor start, Tensor end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor linspace(c10::DispatchKeySet dispatchKeySet, const at::Tensor & start, const at::Tensor & end, int64_t steps, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::linspace_Tensor_Tensor::redispatch(dispatchKeySet, start, end, steps, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::linspace.Tensor_Scalar(Tensor start, Scalar end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor linspace(c10::DispatchKeySet dispatchKeySet, const at::Tensor & start, const at::Scalar & end, int64_t steps, at::TensorOptions options={}) {
+        return at::_ops::linspace_Tensor_Scalar::redispatch(dispatchKeySet, start, end, steps, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::linspace.Tensor_Scalar(Tensor start, Scalar end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor linspace(c10::DispatchKeySet dispatchKeySet, const at::Tensor & start, const at::Scalar & end, int64_t steps, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::linspace_Tensor_Scalar::redispatch(dispatchKeySet, start, end, steps, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::linspace.Scalar_Tensor(Scalar start, Tensor end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor linspace(c10::DispatchKeySet dispatchKeySet, const at::Scalar & start, const at::Tensor & end, int64_t steps, at::TensorOptions options={}) {
+        return at::_ops::linspace_Scalar_Tensor::redispatch(dispatchKeySet, start, end, steps, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::linspace.Scalar_Tensor(Scalar start, Tensor end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor linspace(c10::DispatchKeySet dispatchKeySet, const at::Scalar & start, const at::Tensor & end, int64_t steps, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::linspace_Scalar_Tensor::redispatch(dispatchKeySet, start, end, steps, dtype, layout, device, pin_memory);
+    }
+    
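+    // Each linspace factory above comes in two forms: one packing dtype/layout/device/
+    // pin_memory into at::TensorOptions, and one taking the four optionals unpacked exactly
+    // as the dispatcher schema does. A hedged sketch using the public at::linspace wrapper
+    // (illustrative only, not a declaration of this header):
+    //
+    //   at::Tensor xs = at::linspace(0.0, 1.0, /*steps=*/5,
+    //                                at::TensorOptions().dtype(at::kDouble));
+    //   // xs == [0.00, 0.25, 0.50, 0.75, 1.00]
+    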
+    // aten::linspace.out(Scalar start, Scalar end, int steps, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linspace_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & start, const at::Scalar & end, int64_t steps) {
+        return at::_ops::linspace_out::redispatch(dispatchKeySet, start, end, steps, out);
+    }
+    
+    // aten::linspace.out(Scalar start, Scalar end, int steps, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linspace_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & start, const at::Scalar & end, int64_t steps, at::Tensor & out) {
+        return at::_ops::linspace_out::redispatch(dispatchKeySet, start, end, steps, out);
+    }
+    
+    // aten::linspace.Tensor_Tensor_out(Tensor start, Tensor end, int steps, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linspace_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & start, const at::Tensor & end, int64_t steps) {
+        return at::_ops::linspace_Tensor_Tensor_out::redispatch(dispatchKeySet, start, end, steps, out);
+    }
+    
+    // aten::linspace.Tensor_Tensor_out(Tensor start, Tensor end, int steps, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linspace_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & start, const at::Tensor & end, int64_t steps, at::Tensor & out) {
+        return at::_ops::linspace_Tensor_Tensor_out::redispatch(dispatchKeySet, start, end, steps, out);
+    }
+    
+    // aten::linspace.Tensor_Scalar_out(Tensor start, Scalar end, int steps, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linspace_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & start, const at::Scalar & end, int64_t steps) {
+        return at::_ops::linspace_Tensor_Scalar_out::redispatch(dispatchKeySet, start, end, steps, out);
+    }
+    
+    // aten::linspace.Tensor_Scalar_out(Tensor start, Scalar end, int steps, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linspace_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & start, const at::Scalar & end, int64_t steps, at::Tensor & out) {
+        return at::_ops::linspace_Tensor_Scalar_out::redispatch(dispatchKeySet, start, end, steps, out);
+    }
+    
+    // aten::linspace.Scalar_Tensor_out(Scalar start, Tensor end, int steps, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linspace_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & start, const at::Tensor & end, int64_t steps) {
+        return at::_ops::linspace_Scalar_Tensor_out::redispatch(dispatchKeySet, start, end, steps, out);
+    }
+    
+    // aten::linspace.Scalar_Tensor_out(Scalar start, Tensor end, int steps, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linspace_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & start, const at::Tensor & end, int64_t steps, at::Tensor & out) {
+        return at::_ops::linspace_Scalar_Tensor_out::redispatch(dispatchKeySet, start, end, steps, out);
+    }
+    
+    // aten::log(Tensor self) -> Tensor
+    inline at::Tensor log(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::log::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::log_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & log_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::log_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::log.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & log_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::log_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::log.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & log_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::log_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::log10(Tensor self) -> Tensor
+    inline at::Tensor log10(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::log10::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::log10_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & log10_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::log10_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::log10.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & log10_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::log10_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::log10.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & log10_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::log10_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::log1p(Tensor self) -> Tensor
+    inline at::Tensor log1p(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::log1p::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::log1p_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & log1p_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::log1p_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::log1p.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & log1p_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::log1p_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::log1p.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & log1p_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::log1p_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::log2(Tensor self) -> Tensor
+    inline at::Tensor log2(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::log2::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::log2_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & log2_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::log2_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::log2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & log2_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::log2_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::log2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & log2_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::log2_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::logaddexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & logaddexp_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::logaddexp_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::logaddexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & logaddexp_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::logaddexp_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::logaddexp(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor logaddexp(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::logaddexp::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::logaddexp2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & logaddexp2_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::logaddexp2_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::logaddexp2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & logaddexp2_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::logaddexp2_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::logaddexp2(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor logaddexp2(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::logaddexp2::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::xlogy.Tensor(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor xlogy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::xlogy_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::xlogy.Scalar_Self(Scalar self, Tensor other) -> Tensor
+    inline at::Tensor xlogy(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & other) {
+        return at::_ops::xlogy_Scalar_Self::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::xlogy.Scalar_Other(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor xlogy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::xlogy_Scalar_Other::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::xlogy_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & xlogy_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::xlogy__Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::xlogy_.Scalar_Other(Tensor(a!) self, Scalar other) -> Tensor(a!)
+    inline at::Tensor & xlogy_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::xlogy__Scalar_Other::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::xlogy.OutTensor(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & xlogy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::xlogy_OutTensor::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::xlogy.OutTensor(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & xlogy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::xlogy_OutTensor::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::xlogy.OutScalar_Self(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & xlogy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & self, const at::Tensor & other) {
+        return at::_ops::xlogy_OutScalar_Self::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::xlogy.OutScalar_Self(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & xlogy_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::xlogy_OutScalar_Self::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::xlogy.OutScalar_Other(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & xlogy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::xlogy_OutScalar_Other::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::xlogy.OutScalar_Other(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & xlogy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out) {
+        return at::_ops::xlogy_OutScalar_Other::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::logspace(Scalar start, Scalar end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor logspace(c10::DispatchKeySet dispatchKeySet, const at::Scalar & start, const at::Scalar & end, int64_t steps, double base=10.0, at::TensorOptions options={}) {
+        return at::_ops::logspace::redispatch(dispatchKeySet, start, end, steps, base, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::logspace(Scalar start, Scalar end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor logspace(c10::DispatchKeySet dispatchKeySet, const at::Scalar & start, const at::Scalar & end, int64_t steps, double base, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::logspace::redispatch(dispatchKeySet, start, end, steps, base, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::logspace.Tensor_Tensor(Tensor start, Tensor end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor logspace(c10::DispatchKeySet dispatchKeySet, const at::Tensor & start, const at::Tensor & end, int64_t steps, double base=10.0, at::TensorOptions options={}) {
+        return at::_ops::logspace_Tensor_Tensor::redispatch(dispatchKeySet, start, end, steps, base, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::logspace.Tensor_Tensor(Tensor start, Tensor end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor logspace(c10::DispatchKeySet dispatchKeySet, const at::Tensor & start, const at::Tensor & end, int64_t steps, double base, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::logspace_Tensor_Tensor::redispatch(dispatchKeySet, start, end, steps, base, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::logspace.Tensor_Scalar(Tensor start, Scalar end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor logspace(c10::DispatchKeySet dispatchKeySet, const at::Tensor & start, const at::Scalar & end, int64_t steps, double base=10.0, at::TensorOptions options={}) {
+        return at::_ops::logspace_Tensor_Scalar::redispatch(dispatchKeySet, start, end, steps, base, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::logspace.Tensor_Scalar(Tensor start, Scalar end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor logspace(c10::DispatchKeySet dispatchKeySet, const at::Tensor & start, const at::Scalar & end, int64_t steps, double base, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::logspace_Tensor_Scalar::redispatch(dispatchKeySet, start, end, steps, base, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::logspace.Scalar_Tensor(Scalar start, Tensor end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor logspace(c10::DispatchKeySet dispatchKeySet, const at::Scalar & start, const at::Tensor & end, int64_t steps, double base=10.0, at::TensorOptions options={}) {
+        return at::_ops::logspace_Scalar_Tensor::redispatch(dispatchKeySet, start, end, steps, base, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::logspace.Scalar_Tensor(Scalar start, Tensor end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor logspace(c10::DispatchKeySet dispatchKeySet, const at::Scalar & start, const at::Tensor & end, int64_t steps, double base, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::logspace_Scalar_Tensor::redispatch(dispatchKeySet, start, end, steps, base, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::logspace.out(Scalar start, Scalar end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & logspace_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & start, const at::Scalar & end, int64_t steps, double base=10.0) {
+        return at::_ops::logspace_out::redispatch(dispatchKeySet, start, end, steps, base, out);
+    }
+    
+    // aten::logspace.out(Scalar start, Scalar end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & logspace_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & start, const at::Scalar & end, int64_t steps, double base, at::Tensor & out) {
+        return at::_ops::logspace_out::redispatch(dispatchKeySet, start, end, steps, base, out);
+    }
+    
+    // aten::logspace.Tensor_Tensor_out(Tensor start, Tensor end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & logspace_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & start, const at::Tensor & end, int64_t steps, double base=10.0) {
+        return at::_ops::logspace_Tensor_Tensor_out::redispatch(dispatchKeySet, start, end, steps, base, out);
+    }
+    
+    // aten::logspace.Tensor_Tensor_out(Tensor start, Tensor end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & logspace_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & start, const at::Tensor & end, int64_t steps, double base, at::Tensor & out) {
+        return at::_ops::logspace_Tensor_Tensor_out::redispatch(dispatchKeySet, start, end, steps, base, out);
+    }
+    
+    // aten::logspace.Tensor_Scalar_out(Tensor start, Scalar end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & logspace_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & start, const at::Scalar & end, int64_t steps, double base=10.0) {
+        return at::_ops::logspace_Tensor_Scalar_out::redispatch(dispatchKeySet, start, end, steps, base, out);
+    }
+    
+    // aten::logspace.Tensor_Scalar_out(Tensor start, Scalar end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & logspace_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & start, const at::Scalar & end, int64_t steps, double base, at::Tensor & out) {
+        return at::_ops::logspace_Tensor_Scalar_out::redispatch(dispatchKeySet, start, end, steps, base, out);
+    }
+    
+    // aten::logspace.Scalar_Tensor_out(Scalar start, Tensor end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & logspace_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & start, const at::Tensor & end, int64_t steps, double base=10.0) {
+        return at::_ops::logspace_Scalar_Tensor_out::redispatch(dispatchKeySet, start, end, steps, base, out);
+    }
+    
+    // aten::logspace.Scalar_Tensor_out(Scalar start, Tensor end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & logspace_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & start, const at::Tensor & end, int64_t steps, double base, at::Tensor & out) {
+        return at::_ops::logspace_Scalar_Tensor_out::redispatch(dispatchKeySet, start, end, steps, base, out);
+    }
+    
+    // aten::log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor log_softmax(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::log_softmax_int::redispatch(dispatchKeySet, self, dim, dtype);
+    }
+    
+    // aten::log_softmax.int_out(Tensor self, int dim, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & log_softmax_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::log_softmax_int_out::redispatch(dispatchKeySet, self, dim, dtype, out);
+    }
+    
+    // aten::log_softmax.int_out(Tensor self, int dim, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & log_softmax_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, c10::optional<at::ScalarType> dtype, at::Tensor & out) {
+        return at::_ops::log_softmax_int_out::redispatch(dispatchKeySet, self, dim, dtype, out);
+    }
+    
+    // aten::log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor log_softmax(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::log_softmax_Dimname::redispatch(dispatchKeySet, self, dim, dtype);
+    }
+    
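+    // log_softmax's optional dtype casts the input before the computation (commonly to
+    // float for half-precision inputs to keep the reduction stable); c10::nullopt keeps
+    // the input dtype. A hedged sketch with the public at::log_softmax wrapper
+    // (illustrative only, not a declaration of this header):
+    //
+    //   at::Tensor logits = at::randn({4, 10}).to(at::kHalf);
+    //   at::Tensor logp = at::log_softmax(logits, /*dim=*/1, at::kFloat);
+    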
+    // aten::_log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor
+    inline at::Tensor _log_softmax(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool half_to_float) {
+        return at::_ops::_log_softmax::redispatch(dispatchKeySet, self, dim, half_to_float);
+    }
+    
+    // aten::_log_softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _log_softmax_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim, bool half_to_float) {
+        return at::_ops::_log_softmax_out::redispatch(dispatchKeySet, self, dim, half_to_float, out);
+    }
+    
+    // aten::_log_softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _log_softmax_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool half_to_float, at::Tensor & out) {
+        return at::_ops::_log_softmax_out::redispatch(dispatchKeySet, self, dim, half_to_float, out);
+    }
+    
+    // aten::_log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor
+    inline at::Tensor _log_softmax_backward_data(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, at::ScalarType input_dtype) {
+        return at::_ops::_log_softmax_backward_data::redispatch(dispatchKeySet, grad_output, output, dim, input_dtype);
+    }
+    
+    // aten::_log_softmax_backward_data.out(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _log_softmax_backward_data_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, at::ScalarType input_dtype) {
+        return at::_ops::_log_softmax_backward_data_out::redispatch(dispatchKeySet, grad_output, output, dim, input_dtype, out);
+    }
+    
+    // aten::_log_softmax_backward_data.out(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _log_softmax_backward_data_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, at::ScalarType input_dtype, at::Tensor & out) {
+        return at::_ops::_log_softmax_backward_data_out::redispatch(dispatchKeySet, grad_output, output, dim, input_dtype, out);
+    }
+    
+    // aten::_logcumsumexp(Tensor self, int dim) -> Tensor
+    inline at::Tensor _logcumsumexp(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim) {
+        return at::_ops::_logcumsumexp::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::_logcumsumexp.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _logcumsumexp_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim) {
+        return at::_ops::_logcumsumexp_out::redispatch(dispatchKeySet, self, dim, out);
+    }
+    
+    // aten::_logcumsumexp.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _logcumsumexp_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, at::Tensor & out) {
+        return at::_ops::_logcumsumexp_out::redispatch(dispatchKeySet, self, dim, out);
+    }
+    
+    // aten::logcumsumexp(Tensor self, int dim) -> Tensor
+    inline at::Tensor logcumsumexp(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim) {
+        return at::_ops::logcumsumexp::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::logcumsumexp.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & logcumsumexp_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim) {
+        return at::_ops::logcumsumexp_out::redispatch(dispatchKeySet, self, dim, out);
+    }
+    
+    // aten::logcumsumexp.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & logcumsumexp_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, at::Tensor & out) {
+        return at::_ops::logcumsumexp_out::redispatch(dispatchKeySet, self, dim, out);
+    }
+    
+    // aten::logcumsumexp.dimname(Tensor self, Dimname dim) -> Tensor
+    inline at::Tensor logcumsumexp(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim) {
+        return at::_ops::logcumsumexp_dimname::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::logcumsumexp.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & logcumsumexp_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::Dimname dim) {
+        return at::_ops::logcumsumexp_dimname_out::redispatch(dispatchKeySet, self, dim, out);
+    }
+    
+    // aten::logcumsumexp.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & logcumsumexp_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, at::Tensor & out) {
+        return at::_ops::logcumsumexp_dimname_out::redispatch(dispatchKeySet, self, dim, out);
+    }
+    
+    // aten::logsumexp(Tensor self, int[1] dim, bool keepdim=False) -> Tensor
+    inline at::Tensor logsumexp(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, bool keepdim=false) {
+        return at::_ops::logsumexp::redispatch(dispatchKeySet, self, dim, keepdim);
+    }
+    
+    // aten::logsumexp.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & logsumexp_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef dim, bool keepdim=false) {
+        return at::_ops::logsumexp_out::redispatch(dispatchKeySet, self, dim, keepdim, out);
+    }
+    
+    // aten::logsumexp.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & logsumexp_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, bool keepdim, at::Tensor & out) {
+        return at::_ops::logsumexp_out::redispatch(dispatchKeySet, self, dim, keepdim, out);
+    }
+    
+    // aten::logsumexp.names(Tensor self, Dimname[1] dim, bool keepdim=False) -> Tensor
+    inline at::Tensor logsumexp(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::DimnameList dim, bool keepdim=false) {
+        return at::_ops::logsumexp_names::redispatch(dispatchKeySet, self, dim, keepdim);
+    }
+    
+    // aten::logsumexp.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & logsumexp_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::DimnameList dim, bool keepdim=false) {
+        return at::_ops::logsumexp_names_out::redispatch(dispatchKeySet, self, dim, keepdim, out);
+    }
+    
+    // aten::logsumexp.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & logsumexp_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::DimnameList dim, bool keepdim, at::Tensor & out) {
+        return at::_ops::logsumexp_names_out::redispatch(dispatchKeySet, self, dim, keepdim, out);
+    }
+    
+    // aten::margin_ranking_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor
+    inline at::Tensor margin_ranking_loss(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input1, const at::Tensor & input2, const at::Tensor & target, double margin=0.0, int64_t reduction=at::Reduction::Mean) {
+        return at::_ops::margin_ranking_loss::redispatch(dispatchKeySet, input1, input2, target, margin, reduction);
+    }
+    
+    // aten::matmul(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor matmul(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::matmul::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::matmul_backward(Tensor grad, Tensor self, Tensor other, bool[2] mask) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> matmul_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & self, const at::Tensor & other, ::std::array<bool,2> mask) {
+        return at::_ops::matmul_backward::redispatch(dispatchKeySet, grad, self, other, mask);
+    }
+    
+    // aten::matmul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & matmul_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::matmul_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::matmul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & matmul_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::matmul_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::matrix_power(Tensor self, int n) -> Tensor
+    inline at::Tensor matrix_power(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t n) {
+        return at::_ops::matrix_power::redispatch(dispatchKeySet, self, n);
+    }
+    
+    // aten::matrix_power.out(Tensor self, int n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & matrix_power_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t n) {
+        return at::_ops::matrix_power_out::redispatch(dispatchKeySet, self, n, out);
+    }
+    
+    // aten::matrix_power.out(Tensor self, int n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & matrix_power_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t n, at::Tensor & out) {
+        return at::_ops::matrix_power_out::redispatch(dispatchKeySet, self, n, out);
+    }
+    
+    // aten::matrix_exp(Tensor self) -> Tensor
+    inline at::Tensor matrix_exp(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::matrix_exp::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::matrix_exp_backward(Tensor self, Tensor grad) -> Tensor
+    inline at::Tensor matrix_exp_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & grad) {
+        return at::_ops::matrix_exp_backward::redispatch(dispatchKeySet, self, grad);
+    }
+    
+    // aten::_aminmax(Tensor self) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> _aminmax(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_aminmax::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_aminmax.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> _aminmax(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool keepdim=false) {
+        return at::_ops::_aminmax_dim::redispatch(dispatchKeySet, self, dim, keepdim);
+    }
+    
+    // aten::aminmax(Tensor self, *, int? dim=None, bool keepdim=False) -> (Tensor min, Tensor max)
+    inline ::std::tuple<at::Tensor,at::Tensor> aminmax(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<int64_t> dim=c10::nullopt, bool keepdim=false) {
+        return at::_ops::aminmax::redispatch(dispatchKeySet, self, dim, keepdim);
+    }
+    
+    // aten::aminmax.out(Tensor self, *, int? dim=None, bool keepdim=False, Tensor(a!) min, Tensor(b!) max) -> (Tensor(a!) min, Tensor(b!) max)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> aminmax_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & min, at::Tensor & max, const at::Tensor & self, c10::optional<int64_t> dim=c10::nullopt, bool keepdim=false) {
+        return at::_ops::aminmax_out::redispatch(dispatchKeySet, self, dim, keepdim, min, max);
+    }
+    
+    // aten::aminmax.out(Tensor self, *, int? dim=None, bool keepdim=False, Tensor(a!) min, Tensor(b!) max) -> (Tensor(a!) min, Tensor(b!) max)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> aminmax_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<int64_t> dim, bool keepdim, at::Tensor & min, at::Tensor & max) {
+        return at::_ops::aminmax_out::redispatch(dispatchKeySet, self, dim, keepdim, min, max);
+    }
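+    // Note (added): aminmax computes the (min, max) pair in a single pass and returns named outputs;
+    // the underscore-prefixed _aminmax overloads above are the older internal variants of the same reduction.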
+    
+    // aten::_compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor
+    inline at::Tensor _compute_linear_combination(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & coefficients) {
+        return at::_ops::_compute_linear_combination::redispatch(dispatchKeySet, input, coefficients);
+    }
+    
+    // aten::_compute_linear_combination.out(Tensor input, Tensor coefficients, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _compute_linear_combination_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & input, const at::Tensor & coefficients) {
+        return at::_ops::_compute_linear_combination_out::redispatch(dispatchKeySet, input, coefficients, out);
+    }
+    
+    // aten::_compute_linear_combination.out(Tensor input, Tensor coefficients, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _compute_linear_combination_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & coefficients, at::Tensor & out) {
+        return at::_ops::_compute_linear_combination_out::redispatch(dispatchKeySet, input, coefficients, out);
+    }
+    
+    // aten::max.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)
+    inline ::std::tuple<at::Tensor,at::Tensor> max(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool keepdim=false) {
+        return at::_ops::max_dim::redispatch(dispatchKeySet, self, dim, keepdim);
+    }
+    
+    // aten::max.dim_max(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> max_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & max, at::Tensor & max_values, const at::Tensor & self, int64_t dim, bool keepdim=false) {
+        return at::_ops::max_dim_max::redispatch(dispatchKeySet, self, dim, keepdim, max, max_values);
+    }
+    
+    // aten::max.dim_max(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> max_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool keepdim, at::Tensor & max, at::Tensor & max_values) {
+        return at::_ops::max_dim_max::redispatch(dispatchKeySet, self, dim, keepdim, max, max_values);
+    }
+    
+    // aten::max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
+    inline ::std::tuple<at::Tensor,at::Tensor> max(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool keepdim=false) {
+        return at::_ops::max_names_dim::redispatch(dispatchKeySet, self, dim, keepdim);
+    }
+    
+    // aten::max.names_dim_max(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> max_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & max, at::Tensor & max_values, const at::Tensor & self, at::Dimname dim, bool keepdim=false) {
+        return at::_ops::max_names_dim_max::redispatch(dispatchKeySet, self, dim, keepdim, max, max_values);
+    }
+    
+    // aten::max.names_dim_max(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> max_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool keepdim, at::Tensor & max, at::Tensor & max_values) {
+        return at::_ops::max_names_dim_max::redispatch(dispatchKeySet, self, dim, keepdim, max, max_values);
+    }
+    
+    // aten::value_selecting_reduction_backward(Tensor grad, int dim, Tensor indices, SymInt[] sizes, bool keepdim) -> Tensor
+    inline at::Tensor value_selecting_reduction_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, int64_t dim, const at::Tensor & indices, at::IntArrayRef sizes, bool keepdim) {
+        return at::_ops::value_selecting_reduction_backward::redispatch(dispatchKeySet, grad, dim, indices, c10::fromIntArrayRefSlow(sizes), keepdim);
+    }
+    
+    // aten::value_selecting_reduction_backward(Tensor grad, int dim, Tensor indices, SymInt[] sizes, bool keepdim) -> Tensor
+    inline at::Tensor value_selecting_reduction_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, int64_t dim, const at::Tensor & indices, c10::SymIntArrayRef sizes, bool keepdim) {
+        return at::_ops::value_selecting_reduction_backward::redispatch(dispatchKeySet, grad, dim, indices, sizes, keepdim);
+    }
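+    // Note (added): ops declared with SymInt arguments get two wrappers here: the plain overload converts
+    // concrete integer sizes with c10::fromIntArrayRefSlow, while the "*_symint" overload passes
+    // c10::SymIntArrayRef through unchanged for symbolic-shape tracing.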
+    
+    // aten::amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor
+    inline at::Tensor amax(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim={}, bool keepdim=false) {
+        return at::_ops::amax::redispatch(dispatchKeySet, self, dim, keepdim);
+    }
+    
+    // aten::amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & amax_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef dim={}, bool keepdim=false) {
+        return at::_ops::amax_out::redispatch(dispatchKeySet, self, dim, keepdim, out);
+    }
+    
+    // aten::amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & amax_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, bool keepdim, at::Tensor & out) {
+        return at::_ops::amax_out::redispatch(dispatchKeySet, self, dim, keepdim, out);
+    }
+    
+    // aten::max_pool1d_with_indices(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> max_pool1d_with_indices(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) {
+        return at::_ops::max_pool1d_with_indices::redispatch(dispatchKeySet, self, kernel_size, stride, padding, dilation, ceil_mode);
+    }
+    
+    // aten::max_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> Tensor
+    inline at::Tensor max_pool1d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) {
+        return at::_ops::max_pool1d::redispatch(dispatchKeySet, self, kernel_size, stride, padding, dilation, ceil_mode);
+    }
+    
+    // aten::max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
+    inline at::Tensor max_pool2d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) {
+        return at::_ops::max_pool2d::redispatch(dispatchKeySet, self, kernel_size, stride, padding, dilation, ceil_mode);
+    }
+    
+    // aten::max_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
+    inline at::Tensor max_pool2d_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) {
+        return at::_ops::max_pool2d_backward::redispatch(dispatchKeySet, grad_output, self, kernel_size, stride, padding, dilation, ceil_mode);
+    }
+    
+    // aten::mkldnn_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
+    inline at::Tensor mkldnn_max_pool2d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) {
+        return at::_ops::mkldnn_max_pool2d::redispatch(dispatchKeySet, self, kernel_size, stride, padding, dilation, ceil_mode);
+    }
+    
+    // aten::mkldnn_max_pool2d_backward(Tensor grad_output, Tensor output, Tensor input, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
+    inline at::Tensor mkldnn_max_pool2d_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & input, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) {
+        return at::_ops::mkldnn_max_pool2d_backward::redispatch(dispatchKeySet, grad_output, output, input, kernel_size, stride, padding, dilation, ceil_mode);
+    }
+    
+    // aten::mkldnn_max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor
+    inline at::Tensor mkldnn_max_pool3d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) {
+        return at::_ops::mkldnn_max_pool3d::redispatch(dispatchKeySet, self, kernel_size, stride, padding, dilation, ceil_mode);
+    }
+    
+    // aten::mkldnn_max_pool3d_backward(Tensor grad_output, Tensor output, Tensor input, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor
+    inline at::Tensor mkldnn_max_pool3d_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & input, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) {
+        return at::_ops::mkldnn_max_pool3d_backward::redispatch(dispatchKeySet, grad_output, output, input, kernel_size, stride, padding, dilation, ceil_mode);
+    }
+    
+    // aten::quantized_max_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> Tensor
+    inline at::Tensor quantized_max_pool1d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) {
+        return at::_ops::quantized_max_pool1d::redispatch(dispatchKeySet, self, kernel_size, stride, padding, dilation, ceil_mode);
+    }
+    
+    // aten::quantized_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
+    inline at::Tensor quantized_max_pool2d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) {
+        return at::_ops::quantized_max_pool2d::redispatch(dispatchKeySet, self, kernel_size, stride, padding, dilation, ceil_mode);
+    }
+    
+    // aten::quantized_max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor
+    inline at::Tensor quantized_max_pool3d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) {
+        return at::_ops::quantized_max_pool3d::redispatch(dispatchKeySet, self, kernel_size, stride, padding, dilation, ceil_mode);
+    }
+    
+    // aten::max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor
+    inline at::Tensor max_pool3d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) {
+        return at::_ops::max_pool3d::redispatch(dispatchKeySet, self, kernel_size, stride, padding, dilation, ceil_mode);
+    }
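+    // Note (added): for the pooling ops above, an empty stride list (the default shown in the schema comments)
+    // is interpreted by the kernels as stride == kernel_size.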
+    
+    // aten::mean(Tensor self, *, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor mean(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::mean::redispatch(dispatchKeySet, self, dtype);
+    }
+    
+    // aten::mean.dim(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor mean(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::mean_dim::redispatch(dispatchKeySet, self, dim, keepdim, dtype);
+    }
+    
+    // aten::mean.out(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mean_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::mean_out::redispatch(dispatchKeySet, self, dim, keepdim, dtype, out);
+    }
+    
+    // aten::mean.out(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mean_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim, c10::optional<at::ScalarType> dtype, at::Tensor & out) {
+        return at::_ops::mean_out::redispatch(dispatchKeySet, self, dim, keepdim, dtype, out);
+    }
+    
+    // aten::mean.names_dim(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor mean(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::DimnameList dim, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::mean_names_dim::redispatch(dispatchKeySet, self, dim, keepdim, dtype);
+    }
+    
+    // aten::mean.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mean_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::DimnameList dim, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::mean_names_out::redispatch(dispatchKeySet, self, dim, keepdim, dtype, out);
+    }
+    
+    // aten::mean.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mean_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::DimnameList dim, bool keepdim, c10::optional<at::ScalarType> dtype, at::Tensor & out) {
+        return at::_ops::mean_names_out::redispatch(dispatchKeySet, self, dim, keepdim, dtype, out);
+    }
+    
+    // aten::nanmean(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor nanmean(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim=c10::nullopt, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::nanmean::redispatch(dispatchKeySet, self, dim, keepdim, dtype);
+    }
+    
+    // aten::nanmean.out(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & nanmean_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef dim=c10::nullopt, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::nanmean_out::redispatch(dispatchKeySet, self, dim, keepdim, dtype, out);
+    }
+    
+    // aten::nanmean.out(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & nanmean_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim, c10::optional<at::ScalarType> dtype, at::Tensor & out) {
+        return at::_ops::nanmean_out::redispatch(dispatchKeySet, self, dim, keepdim, dtype, out);
+    }
+    
+    // aten::median(Tensor self) -> Tensor
+    inline at::Tensor median(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::median::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::median.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)
+    inline ::std::tuple<at::Tensor,at::Tensor> median(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool keepdim=false) {
+        return at::_ops::median_dim::redispatch(dispatchKeySet, self, dim, keepdim);
+    }
+    
+    // aten::median.dim_values(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> median_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & values, at::Tensor & indices, const at::Tensor & self, int64_t dim, bool keepdim=false) {
+        return at::_ops::median_dim_values::redispatch(dispatchKeySet, self, dim, keepdim, values, indices);
+    }
+    
+    // aten::median.dim_values(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> median_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool keepdim, at::Tensor & values, at::Tensor & indices) {
+        return at::_ops::median_dim_values::redispatch(dispatchKeySet, self, dim, keepdim, values, indices);
+    }
+    
+    // aten::median.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
+    inline ::std::tuple<at::Tensor,at::Tensor> median(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool keepdim=false) {
+        return at::_ops::median_names_dim::redispatch(dispatchKeySet, self, dim, keepdim);
+    }
+    
+    // aten::median.names_dim_values(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> median_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & values, at::Tensor & indices, const at::Tensor & self, at::Dimname dim, bool keepdim=false) {
+        return at::_ops::median_names_dim_values::redispatch(dispatchKeySet, self, dim, keepdim, values, indices);
+    }
+    
+    // aten::median.names_dim_values(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> median_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool keepdim, at::Tensor & values, at::Tensor & indices) {
+        return at::_ops::median_names_dim_values::redispatch(dispatchKeySet, self, dim, keepdim, values, indices);
+    }
+    
+    // aten::nanmedian(Tensor self) -> Tensor
+    inline at::Tensor nanmedian(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::nanmedian::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::nanmedian.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)
+    inline ::std::tuple<at::Tensor,at::Tensor> nanmedian(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool keepdim=false) {
+        return at::_ops::nanmedian_dim::redispatch(dispatchKeySet, self, dim, keepdim);
+    }
+    
+    // aten::nanmedian.dim_values(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> nanmedian_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & values, at::Tensor & indices, const at::Tensor & self, int64_t dim, bool keepdim=false) {
+        return at::_ops::nanmedian_dim_values::redispatch(dispatchKeySet, self, dim, keepdim, values, indices);
+    }
+    
+    // aten::nanmedian.dim_values(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> nanmedian_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool keepdim, at::Tensor & values, at::Tensor & indices) {
+        return at::_ops::nanmedian_dim_values::redispatch(dispatchKeySet, self, dim, keepdim, values, indices);
+    }
+    
+    // aten::nanmedian.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
+    inline ::std::tuple<at::Tensor,at::Tensor> nanmedian(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool keepdim=false) {
+        return at::_ops::nanmedian_names_dim::redispatch(dispatchKeySet, self, dim, keepdim);
+    }
+    
+    // aten::nanmedian.names_dim_values(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> nanmedian_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & values, at::Tensor & indices, const at::Tensor & self, at::Dimname dim, bool keepdim=false) {
+        return at::_ops::nanmedian_names_dim_values::redispatch(dispatchKeySet, self, dim, keepdim, values, indices);
+    }
+    
+    // aten::nanmedian.names_dim_values(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> nanmedian_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool keepdim, at::Tensor & values, at::Tensor & indices) {
+        return at::_ops::nanmedian_names_dim_values::redispatch(dispatchKeySet, self, dim, keepdim, values, indices);
+    }
+    
+    // aten::min.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)
+    inline ::std::tuple<at::Tensor,at::Tensor> min(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool keepdim=false) {
+        return at::_ops::min_dim::redispatch(dispatchKeySet, self, dim, keepdim);
+    }
+    
+    // aten::min.dim_min(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> min_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & min, at::Tensor & min_indices, const at::Tensor & self, int64_t dim, bool keepdim=false) {
+        return at::_ops::min_dim_min::redispatch(dispatchKeySet, self, dim, keepdim, min, min_indices);
+    }
+    
+    // aten::min.dim_min(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> min_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool keepdim, at::Tensor & min, at::Tensor & min_indices) {
+        return at::_ops::min_dim_min::redispatch(dispatchKeySet, self, dim, keepdim, min, min_indices);
+    }
+    
+    // aten::min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
+    inline ::std::tuple<at::Tensor,at::Tensor> min(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool keepdim=false) {
+        return at::_ops::min_names_dim::redispatch(dispatchKeySet, self, dim, keepdim);
+    }
+    
+    // aten::min.names_dim_min(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> min_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & min, at::Tensor & min_indices, const at::Tensor & self, at::Dimname dim, bool keepdim=false) {
+        return at::_ops::min_names_dim_min::redispatch(dispatchKeySet, self, dim, keepdim, min, min_indices);
+    }
+    
+    // aten::min.names_dim_min(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> min_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool keepdim, at::Tensor & min, at::Tensor & min_indices) {
+        return at::_ops::min_names_dim_min::redispatch(dispatchKeySet, self, dim, keepdim, min, min_indices);
+    }
+    
+    // aten::amin(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor
+    inline at::Tensor amin(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim={}, bool keepdim=false) {
+        return at::_ops::amin::redispatch(dispatchKeySet, self, dim, keepdim);
+    }
+    
+    // aten::amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & amin_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef dim={}, bool keepdim=false) {
+        return at::_ops::amin_out::redispatch(dispatchKeySet, self, dim, keepdim, out);
+    }
+    
+    // aten::amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & amin_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, bool keepdim, at::Tensor & out) {
+        return at::_ops::amin_out::redispatch(dispatchKeySet, self, dim, keepdim, out);
+    }
+    
+    // aten::_mps_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor
+    inline at::Tensor _mps_convolution(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups) {
+        return at::_ops::_mps_convolution::redispatch(dispatchKeySet, self, weight, bias, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups);
+    }
+    
+    // aten::_mps_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor
+    inline at::Tensor _mps_convolution_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups) {
+        return at::_ops::_mps_convolution::redispatch(dispatchKeySet, self, weight, bias, padding, stride, dilation, groups);
+    }
+    
+    // aten::mps_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> mps_convolution_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, ::std::array<bool,3> output_mask) {
+        return at::_ops::mps_convolution_backward::redispatch(dispatchKeySet, self, grad_output, weight, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, output_mask);
+    }
+    
+    // aten::mps_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> mps_convolution_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, ::std::array<bool,3> output_mask) {
+        return at::_ops::mps_convolution_backward::redispatch(dispatchKeySet, self, grad_output, weight, padding, stride, dilation, groups, output_mask);
+    }
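+    // Note (added): output_mask selects which of the gradients (input, weight, bias) the backward actually
+    // computes; entries set to false are typically returned as undefined tensors.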
+    
+    // aten::mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor
+    inline at::Tensor mkldnn_convolution(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups) {
+        return at::_ops::mkldnn_convolution::redispatch(dispatchKeySet, self, weight, bias, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups);
+    }
+    
+    // aten::mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor
+    inline at::Tensor mkldnn_convolution_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups) {
+        return at::_ops::mkldnn_convolution::redispatch(dispatchKeySet, self, weight, bias, padding, stride, dilation, groups);
+    }
+    
+    // aten::mkldnn_rnn_layer(Tensor input, Tensor weight0, Tensor weight1, Tensor weight2, Tensor weight3, Tensor hx_, Tensor cx_, bool reverse, int[] batch_sizes, int mode, int hidden_size, int num_layers, bool has_biases, bool bidirectional, bool batch_first, bool train) -> (Tensor, Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor> mkldnn_rnn_layer(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight0, const at::Tensor & weight1, const at::Tensor & weight2, const at::Tensor & weight3, const at::Tensor & hx_, const at::Tensor & cx_, bool reverse, at::IntArrayRef batch_sizes, int64_t mode, int64_t hidden_size, int64_t num_layers, bool has_biases, bool bidirectional, bool batch_first, bool train) {
+        return at::_ops::mkldnn_rnn_layer::redispatch(dispatchKeySet, input, weight0, weight1, weight2, weight3, hx_, cx_, reverse, batch_sizes, mode, hidden_size, num_layers, has_biases, bidirectional, batch_first, train);
+    }
+    
+    // aten::mkldnn_rnn_layer_backward(Tensor input, Tensor weight1, Tensor weight2, Tensor weight3, Tensor weight4, Tensor hx_, Tensor cx_tmp, Tensor output, Tensor hy_, Tensor cy_, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, bool reverse, int mode, int hidden_size, int num_layers, bool has_biases, bool train, bool bidirectional, int[] batch_sizes, bool batch_first, Tensor workspace) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,at::Tensor,at::Tensor,at::Tensor> mkldnn_rnn_layer_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight1, const at::Tensor & weight2, const at::Tensor & weight3, const at::Tensor & weight4, const at::Tensor & hx_, const at::Tensor & cx_tmp, const at::Tensor & output, const at::Tensor & hy_, const at::Tensor & cy_, const c10::optional<at::Tensor> & grad_output, const c10::optional<at::Tensor> & grad_hy, const c10::optional<at::Tensor> & grad_cy, bool reverse, int64_t mode, int64_t hidden_size, int64_t num_layers, bool has_biases, bool train, bool bidirectional, at::IntArrayRef batch_sizes, bool batch_first, const at::Tensor & workspace) {
+        return at::_ops::mkldnn_rnn_layer_backward::redispatch(dispatchKeySet, input, weight1, weight2, weight3, weight4, hx_, cx_tmp, output, hy_, cy_, grad_output, grad_hy, grad_cy, reverse, mode, hidden_size, num_layers, has_biases, train, bidirectional, batch_sizes, batch_first, workspace);
+    }
+    
+    // aten::miopen_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> miopen_batch_norm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, bool training, double exponential_average_factor, double epsilon) {
+        return at::_ops::miopen_batch_norm::redispatch(dispatchKeySet, input, weight, bias, running_mean, running_var, training, exponential_average_factor, epsilon);
+    }
+    
+    // aten::miopen_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, float epsilon) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> miopen_batch_norm_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & grad_output, const at::Tensor & weight, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, const c10::optional<at::Tensor> & save_mean, const c10::optional<at::Tensor> & save_var, double epsilon) {
+        return at::_ops::miopen_batch_norm_backward::redispatch(dispatchKeySet, input, grad_output, weight, running_mean, running_var, save_mean, save_var, epsilon);
+    }
+    
+    // aten::miopen_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor
+    inline at::Tensor miopen_convolution(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic) {
+        return at::_ops::miopen_convolution::redispatch(dispatchKeySet, self, weight, bias, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, benchmark, deterministic);
+    }
+    
+    // aten::miopen_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor
+    inline at::Tensor miopen_convolution_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic) {
+        return at::_ops::miopen_convolution::redispatch(dispatchKeySet, self, weight, bias, padding, stride, dilation, groups, benchmark, deterministic);
+    }
+    
+    // aten::miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor
+    inline at::Tensor miopen_convolution_transpose(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic) {
+        return at::_ops::miopen_convolution_transpose::redispatch(dispatchKeySet, self, weight, bias, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, benchmark, deterministic);
+    }
+    
+    // aten::miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor
+    inline at::Tensor miopen_convolution_transpose_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic) {
+        return at::_ops::miopen_convolution_transpose::redispatch(dispatchKeySet, self, weight, bias, padding, output_padding, stride, dilation, groups, benchmark, deterministic);
+    }
+    
+    // aten::miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor
+    inline at::Tensor miopen_depthwise_convolution(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic) {
+        return at::_ops::miopen_depthwise_convolution::redispatch(dispatchKeySet, self, weight, bias, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, benchmark, deterministic);
+    }
+    
+    // aten::miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor
+    inline at::Tensor miopen_depthwise_convolution_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic) {
+        return at::_ops::miopen_depthwise_convolution::redispatch(dispatchKeySet, self, weight, bias, padding, stride, dilation, groups, benchmark, deterministic);
+    }
+    
+    // aten::miopen_convolution_relu(Tensor self, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor
+    inline at::Tensor miopen_convolution_relu(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups) {
+        return at::_ops::miopen_convolution_relu::redispatch(dispatchKeySet, self, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), groups);
+    }
+    
+    // aten::miopen_convolution_relu(Tensor self, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor
+    inline at::Tensor miopen_convolution_relu_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups) {
+        return at::_ops::miopen_convolution_relu::redispatch(dispatchKeySet, self, weight, bias, stride, padding, dilation, groups);
+    }
+    
+    // aten::miopen_convolution_add_relu(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor
+    inline at::Tensor miopen_convolution_add_relu(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups) {
+        return at::_ops::miopen_convolution_add_relu::redispatch(dispatchKeySet, self, weight, z, alpha, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), groups);
+    }
+    
+    // aten::miopen_convolution_add_relu(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor
+    inline at::Tensor miopen_convolution_add_relu_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups) {
+        return at::_ops::miopen_convolution_add_relu::redispatch(dispatchKeySet, self, weight, z, alpha, bias, stride, padding, dilation, groups);
+    }
+    
+    // aten::miopen_rnn(Tensor input, Tensor[] weight, int weight_stride0, Tensor hx, Tensor? cx, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,at::Tensor> miopen_rnn(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const at::Tensor & hx, const c10::optional<at::Tensor> & cx, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, at::IntArrayRef batch_sizes, const c10::optional<at::Tensor> & dropout_state) {
+        return at::_ops::miopen_rnn::redispatch(dispatchKeySet, input, weight, weight_stride0, hx, cx, mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state);
+    }
+    
+    // aten::miopen_rnn_backward(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask) -> (Tensor, Tensor, Tensor, Tensor[])
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,::std::vector<at::Tensor>> miopen_rnn_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const at::Tensor & weight_buf, const at::Tensor & hx, const c10::optional<at::Tensor> & cx, const at::Tensor & output, const c10::optional<at::Tensor> & grad_output, const c10::optional<at::Tensor> & grad_hy, const c10::optional<at::Tensor> & grad_cy, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, at::IntArrayRef batch_sizes, const c10::optional<at::Tensor> & dropout_state, const at::Tensor & reserve, ::std::array<bool,4> output_mask) {
+        return at::_ops::miopen_rnn_backward::redispatch(dispatchKeySet, input, weight, weight_stride0, weight_buf, hx, cx, output, grad_output, grad_hy, grad_cy, mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve, output_mask);
+    }
+    
+    // aten::mm(Tensor self, Tensor mat2) -> Tensor
+    inline at::Tensor mm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat2) {
+        return at::_ops::mm::redispatch(dispatchKeySet, self, mat2);
+    }
+    
+    // aten::mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & mat2) {
+        return at::_ops::mm_out::redispatch(dispatchKeySet, self, mat2, out);
+    }
+    
+    // aten::mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat2, at::Tensor & out) {
+        return at::_ops::mm_out::redispatch(dispatchKeySet, self, mat2, out);
+    }
+    
+    // aten::_int_mm(Tensor self, Tensor mat2) -> Tensor
+    inline at::Tensor _int_mm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat2) {
+        return at::_ops::_int_mm::redispatch(dispatchKeySet, self, mat2);
+    }
+    
+    // aten::_int_mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _int_mm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & mat2) {
+        return at::_ops::_int_mm_out::redispatch(dispatchKeySet, self, mat2, out);
+    }
+    
+    // aten::_int_mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _int_mm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat2, at::Tensor & out) {
+        return at::_ops::_int_mm_out::redispatch(dispatchKeySet, self, mat2, out);
+    }
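+    // Note (added): _int_mm multiplies two int8 matrices and accumulates into an int32 result;
+    // the int4-packed and int8 weight variants below follow the same redispatch pattern.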
+    
+    // aten::_convert_weight_to_int4pack(Tensor self, int innerKTiles) -> Tensor
+    inline at::Tensor _convert_weight_to_int4pack(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t innerKTiles) {
+        return at::_ops::_convert_weight_to_int4pack::redispatch(dispatchKeySet, self, innerKTiles);
+    }
+    
+    // aten::_weight_int4pack_mm(Tensor self, Tensor mat2, int qGroupSize, Tensor qScaleAndZeros) -> Tensor
+    inline at::Tensor _weight_int4pack_mm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat2, int64_t qGroupSize, const at::Tensor & qScaleAndZeros) {
+        return at::_ops::_weight_int4pack_mm::redispatch(dispatchKeySet, self, mat2, qGroupSize, qScaleAndZeros);
+    }
+    
+    // aten::_weight_int8pack_mm(Tensor self, Tensor mat2, Tensor scales) -> Tensor
+    inline at::Tensor _weight_int8pack_mm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat2, const at::Tensor & scales) {
+        return at::_ops::_weight_int8pack_mm::redispatch(dispatchKeySet, self, mat2, scales);
+    }
+    
+    // aten::_sparse_mm(Tensor sparse, Tensor dense) -> Tensor
+    inline at::Tensor _sparse_mm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & sparse, const at::Tensor & dense) {
+        return at::_ops::_sparse_mm::redispatch(dispatchKeySet, sparse, dense);
+    }
+    
+    // aten::_sparse_mm.reduce(Tensor sparse, Tensor dense, str reduce) -> Tensor
+    inline at::Tensor _sparse_mm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & sparse, const at::Tensor & dense, c10::string_view reduce) {
+        return at::_ops::_sparse_mm_reduce::redispatch(dispatchKeySet, sparse, dense, reduce);
+    }
+    
+    // aten::_sparse_sparse_matmul(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor _sparse_sparse_matmul(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::_sparse_sparse_matmul::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)
+    inline ::std::tuple<at::Tensor,at::Tensor> mode(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim=-1, bool keepdim=false) {
+        return at::_ops::mode::redispatch(dispatchKeySet, self, dim, keepdim);
+    }
+    
+    // aten::mode.values(Tensor self, int dim=-1, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> mode_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & values, at::Tensor & indices, const at::Tensor & self, int64_t dim=-1, bool keepdim=false) {
+        return at::_ops::mode_values::redispatch(dispatchKeySet, self, dim, keepdim, values, indices);
+    }
+    
+    // aten::mode.values(Tensor self, int dim=-1, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> mode_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool keepdim, at::Tensor & values, at::Tensor & indices) {
+        return at::_ops::mode_values::redispatch(dispatchKeySet, self, dim, keepdim, values, indices);
+    }
+    
+    // aten::mode.dimname(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
+    inline ::std::tuple<at::Tensor,at::Tensor> mode(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool keepdim=false) {
+        return at::_ops::mode_dimname::redispatch(dispatchKeySet, self, dim, keepdim);
+    }
+    
+    // aten::mode.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> mode_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & values, at::Tensor & indices, const at::Tensor & self, at::Dimname dim, bool keepdim=false) {
+        return at::_ops::mode_dimname_out::redispatch(dispatchKeySet, self, dim, keepdim, values, indices);
+    }
+    
+    // aten::mode.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> mode_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool keepdim, at::Tensor & values, at::Tensor & indices) {
+        return at::_ops::mode_dimname_out::redispatch(dispatchKeySet, self, dim, keepdim, values, indices);
+    }
+    
+    // aten::mul.Tensor(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor mul(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::mul_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & mul_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::mul__Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mul_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::mul_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mul_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::mul_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::mul.Scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor mul(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::mul_Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+    inline at::Tensor & mul_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::mul__Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::multiply.Tensor(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor multiply(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::multiply_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::multiply_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & multiply_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::multiply__Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::multiply.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & multiply_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::multiply_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::multiply.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & multiply_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::multiply_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::multiply.Scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor multiply(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::multiply_Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::multiply_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+    inline at::Tensor & multiply_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::multiply__Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::mv(Tensor self, Tensor vec) -> Tensor
+    inline at::Tensor mv(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & vec) {
+        return at::_ops::mv::redispatch(dispatchKeySet, self, vec);
+    }
+    
+    // aten::mv.out(Tensor self, Tensor vec, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mv_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & vec) {
+        return at::_ops::mv_out::redispatch(dispatchKeySet, self, vec, out);
+    }
+    
+    // aten::mv.out(Tensor self, Tensor vec, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mv_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & vec, at::Tensor & out) {
+        return at::_ops::mv_out::redispatch(dispatchKeySet, self, vec, out);
+    }
+    
+    // aten::mvlgamma.out(Tensor self, int p, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mvlgamma_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t p) {
+        return at::_ops::mvlgamma_out::redispatch(dispatchKeySet, self, p, out);
+    }
+    
+    // aten::mvlgamma.out(Tensor self, int p, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mvlgamma_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t p, at::Tensor & out) {
+        return at::_ops::mvlgamma_out::redispatch(dispatchKeySet, self, p, out);
+    }
+    
+    // aten::mvlgamma(Tensor self, int p) -> Tensor
+    inline at::Tensor mvlgamma(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t p) {
+        return at::_ops::mvlgamma::redispatch(dispatchKeySet, self, p);
+    }
+    
+    // aten::mvlgamma_(Tensor(a!) self, int p) -> Tensor(a!)
+    inline at::Tensor & mvlgamma_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t p) {
+        return at::_ops::mvlgamma_::redispatch(dispatchKeySet, self, p);
+    }
+    
+    // aten::narrow_copy(Tensor self, int dim, SymInt start, SymInt length) -> Tensor
+    inline at::Tensor narrow_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, int64_t start, int64_t length) {
+        return at::_ops::narrow_copy::redispatch(dispatchKeySet, self, dim, start, length);
+    }
+    
+    // aten::narrow_copy(Tensor self, int dim, SymInt start, SymInt length) -> Tensor
+    inline at::Tensor narrow_copy_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, c10::SymInt start, c10::SymInt length) {
+        return at::_ops::narrow_copy::redispatch(dispatchKeySet, self, dim, start, length);
+    }
+    
+    // aten::narrow_copy.out(Tensor self, int dim, SymInt start, SymInt length, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & narrow_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim, int64_t start, int64_t length) {
+        return at::_ops::narrow_copy_out::redispatch(dispatchKeySet, self, dim, start, length, out);
+    }
+    
+    // aten::narrow_copy.out(Tensor self, int dim, SymInt start, SymInt length, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & narrow_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, int64_t start, int64_t length, at::Tensor & out) {
+        return at::_ops::narrow_copy_out::redispatch(dispatchKeySet, self, dim, start, length, out);
+    }
+    
+    // aten::narrow_copy.out(Tensor self, int dim, SymInt start, SymInt length, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & narrow_copy_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim, c10::SymInt start, c10::SymInt length) {
+        return at::_ops::narrow_copy_out::redispatch(dispatchKeySet, self, dim, start, length, out);
+    }
+    
+    // aten::narrow_copy.out(Tensor self, int dim, SymInt start, SymInt length, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & narrow_copy_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, c10::SymInt start, c10::SymInt length, at::Tensor & out) {
+        return at::_ops::narrow_copy_out::redispatch(dispatchKeySet, self, dim, start, length, out);
+    }
+    
+    // aten::narrow(Tensor(a) self, int dim, SymInt start, SymInt length) -> Tensor(a)
+    inline at::Tensor narrow(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, int64_t start, int64_t length) {
+        return at::_ops::narrow::redispatch(dispatchKeySet, self, dim, start, length);
+    }
+    
+    // aten::narrow(Tensor(a) self, int dim, SymInt start, SymInt length) -> Tensor(a)
+    inline at::Tensor narrow_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, c10::SymInt start, c10::SymInt length) {
+        return at::_ops::narrow::redispatch(dispatchKeySet, self, dim, start, length);
+    }
+    
+    // aten::narrow.Tensor(Tensor(a) self, int dim, Tensor start, SymInt length) -> Tensor(a)
+    inline at::Tensor narrow(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & start, int64_t length) {
+        return at::_ops::narrow_Tensor::redispatch(dispatchKeySet, self, dim, start, length);
+    }
+    
+    // aten::narrow.Tensor(Tensor(a) self, int dim, Tensor start, SymInt length) -> Tensor(a)
+    inline at::Tensor narrow_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & start, c10::SymInt length) {
+        return at::_ops::narrow_Tensor::redispatch(dispatchKeySet, self, dim, start, length);
+    }
+    
+    // aten::native_batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> native_batch_norm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, bool training, double momentum, double eps) {
+        return at::_ops::native_batch_norm::redispatch(dispatchKeySet, input, weight, bias, running_mean, running_var, training, momentum, eps);
+    }
+    
+    // aten::native_batch_norm.out(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_batch_norm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::Tensor & save_mean, at::Tensor & save_invstd, const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, bool training, double momentum, double eps) {
+        return at::_ops::native_batch_norm_out::redispatch(dispatchKeySet, input, weight, bias, running_mean, running_var, training, momentum, eps, out, save_mean, save_invstd);
+    }
+    
+    // aten::native_batch_norm.out(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_batch_norm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, bool training, double momentum, double eps, at::Tensor & out, at::Tensor & save_mean, at::Tensor & save_invstd) {
+        return at::_ops::native_batch_norm_out::redispatch(dispatchKeySet, input, weight, bias, running_mean, running_var, training, momentum, eps, out, save_mean, save_invstd);
+    }
+    
+    // aten::_native_batch_norm_legit(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _native_batch_norm_legit(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, at::Tensor & running_mean, at::Tensor & running_var, bool training, double momentum, double eps) {
+        return at::_ops::_native_batch_norm_legit::redispatch(dispatchKeySet, input, weight, bias, running_mean, running_var, training, momentum, eps);
+    }
+    
+    // aten::_native_batch_norm_legit_no_training(Tensor input, Tensor? weight, Tensor? bias, Tensor running_mean, Tensor running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _native_batch_norm_legit_no_training(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, const at::Tensor & running_mean, const at::Tensor & running_var, double momentum, double eps) {
+        return at::_ops::_native_batch_norm_legit_no_training::redispatch(dispatchKeySet, input, weight, bias, running_mean, running_var, momentum, eps);
+    }
+    
+    // aten::_native_batch_norm_legit.out(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, bool training, float momentum, float eps, *, Tensor(d!) out, Tensor(e!) save_mean, Tensor(f!) save_invstd) -> (Tensor(d!), Tensor(e!), Tensor(f!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _native_batch_norm_legit_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::Tensor & save_mean, at::Tensor & save_invstd, const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, at::Tensor & running_mean, at::Tensor & running_var, bool training, double momentum, double eps) {
+        return at::_ops::_native_batch_norm_legit_out::redispatch(dispatchKeySet, input, weight, bias, running_mean, running_var, training, momentum, eps, out, save_mean, save_invstd);
+    }
+    
+    // aten::_native_batch_norm_legit.out(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, bool training, float momentum, float eps, *, Tensor(d!) out, Tensor(e!) save_mean, Tensor(f!) save_invstd) -> (Tensor(d!), Tensor(e!), Tensor(f!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _native_batch_norm_legit_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, at::Tensor & running_mean, at::Tensor & running_var, bool training, double momentum, double eps, at::Tensor & out, at::Tensor & save_mean, at::Tensor & save_invstd) {
+        return at::_ops::_native_batch_norm_legit_out::redispatch(dispatchKeySet, input, weight, bias, running_mean, running_var, training, momentum, eps, out, save_mean, save_invstd);
+    }
+    
+    // aten::_native_batch_norm_legit.no_stats(Tensor input, Tensor? weight, Tensor? bias, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _native_batch_norm_legit(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, bool training, double momentum, double eps) {
+        return at::_ops::_native_batch_norm_legit_no_stats::redispatch(dispatchKeySet, input, weight, bias, training, momentum, eps);
+    }
+    
+    // aten::_native_batch_norm_legit.no_stats_out(Tensor input, Tensor? weight, Tensor? bias, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _native_batch_norm_legit_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::Tensor & save_mean, at::Tensor & save_invstd, const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, bool training, double momentum, double eps) {
+        return at::_ops::_native_batch_norm_legit_no_stats_out::redispatch(dispatchKeySet, input, weight, bias, training, momentum, eps, out, save_mean, save_invstd);
+    }
+    
+    // aten::_native_batch_norm_legit.no_stats_out(Tensor input, Tensor? weight, Tensor? bias, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _native_batch_norm_legit_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, bool training, double momentum, double eps, at::Tensor & out, at::Tensor & save_mean, at::Tensor & save_invstd) {
+        return at::_ops::_native_batch_norm_legit_no_stats_out::redispatch(dispatchKeySet, input, weight, bias, training, momentum, eps, out, save_mean, save_invstd);
+    }
+    
+    // aten::batch_norm_stats(Tensor input, float eps) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> batch_norm_stats(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, double eps) {
+        return at::_ops::batch_norm_stats::redispatch(dispatchKeySet, input, eps);
+    }
+    
+    // aten::batch_norm_elemt(Tensor input, Tensor? weight, Tensor? bias, Tensor mean, Tensor invstd, float eps) -> Tensor
+    inline at::Tensor batch_norm_elemt(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, const at::Tensor & mean, const at::Tensor & invstd, double eps) {
+        return at::_ops::batch_norm_elemt::redispatch(dispatchKeySet, input, weight, bias, mean, invstd, eps);
+    }
+    
+    // aten::batch_norm_elemt.out(Tensor input, Tensor? weight, Tensor? bias, Tensor mean, Tensor invstd, float eps, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & batch_norm_elemt_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, const at::Tensor & mean, const at::Tensor & invstd, double eps) {
+        return at::_ops::batch_norm_elemt_out::redispatch(dispatchKeySet, input, weight, bias, mean, invstd, eps, out);
+    }
+    
+    // aten::batch_norm_elemt.out(Tensor input, Tensor? weight, Tensor? bias, Tensor mean, Tensor invstd, float eps, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & batch_norm_elemt_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, const at::Tensor & mean, const at::Tensor & invstd, double eps, at::Tensor & out) {
+        return at::_ops::batch_norm_elemt_out::redispatch(dispatchKeySet, input, weight, bias, mean, invstd, eps, out);
+    }
+    
+    // aten::batch_norm_gather_stats(Tensor input, Tensor mean, Tensor invstd, Tensor? running_mean, Tensor? running_var, float momentum, float eps, int count) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> batch_norm_gather_stats(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & invstd, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, double momentum, double eps, int64_t count) {
+        return at::_ops::batch_norm_gather_stats::redispatch(dispatchKeySet, input, mean, invstd, running_mean, running_var, momentum, eps, count);
+    }
+    
+    // aten::batch_norm_gather_stats_with_counts(Tensor input, Tensor mean, Tensor invstd, Tensor? running_mean, Tensor? running_var, float momentum, float eps, Tensor counts) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> batch_norm_gather_stats_with_counts(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & invstd, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, double momentum, double eps, const at::Tensor & counts) {
+        return at::_ops::batch_norm_gather_stats_with_counts::redispatch(dispatchKeySet, input, mean, invstd, running_mean, running_var, momentum, eps, counts);
+    }
+    
+    // aten::native_batch_norm_backward(Tensor grad_out, Tensor input, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_invstd, bool train, float eps, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> native_batch_norm_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_out, const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, const c10::optional<at::Tensor> & save_mean, const c10::optional<at::Tensor> & save_invstd, bool train, double eps, ::std::array<bool,3> output_mask) {
+        return at::_ops::native_batch_norm_backward::redispatch(dispatchKeySet, grad_out, input, weight, running_mean, running_var, save_mean, save_invstd, train, eps, output_mask);
+    }
+    
+    // aten::batch_norm_backward_reduce(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, bool input_g, bool weight_g, bool bias_g) -> (Tensor, Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor> batch_norm_backward_reduce(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & invstd, const c10::optional<at::Tensor> & weight, bool input_g, bool weight_g, bool bias_g) {
+        return at::_ops::batch_norm_backward_reduce::redispatch(dispatchKeySet, grad_out, input, mean, invstd, weight, input_g, weight_g, bias_g);
+    }
+    
+    // aten::batch_norm_backward_elemt(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, Tensor sum_dy, Tensor sum_dy_xmu, Tensor count) -> Tensor
+    inline at::Tensor batch_norm_backward_elemt(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & invstd, const c10::optional<at::Tensor> & weight, const at::Tensor & sum_dy, const at::Tensor & sum_dy_xmu, const at::Tensor & count) {
+        return at::_ops::batch_norm_backward_elemt::redispatch(dispatchKeySet, grad_out, input, mean, invstd, weight, sum_dy, sum_dy_xmu, count);
+    }
+    
+    // aten::batch_norm_update_stats(Tensor input, Tensor? running_mean, Tensor? running_var, float momentum) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> batch_norm_update_stats(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, double momentum) {
+        return at::_ops::batch_norm_update_stats::redispatch(dispatchKeySet, input, running_mean, running_var, momentum);
+    }
+    
+    // aten::is_vulkan_available() -> bool
+    inline bool is_vulkan_available(c10::DispatchKeySet dispatchKeySet) {
+        return at::_ops::is_vulkan_available::redispatch(dispatchKeySet);
+    }
+    
+    // aten::_nnpack_available() -> bool
+    inline bool _nnpack_available(c10::DispatchKeySet dispatchKeySet) {
+        return at::_ops::_nnpack_available::redispatch(dispatchKeySet);
+    }
+    
+    // aten::_nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[2] padding, SymInt[2] stride=1) -> Tensor
+    inline at::Tensor _nnpack_spatial_convolution(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef padding, at::IntArrayRef stride=1) {
+        return at::_ops::_nnpack_spatial_convolution::redispatch(dispatchKeySet, input, weight, bias, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride));
+    }
+    
+    // aten::_nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[2] padding, SymInt[2] stride=1) -> Tensor
+    inline at::Tensor _nnpack_spatial_convolution_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride=c10::SymInt(1)) {
+        return at::_ops::_nnpack_spatial_convolution::redispatch(dispatchKeySet, input, weight, bias, padding, stride);
+    }
+    
+    // aten::ones.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor ones(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::DimnameList> names, at::TensorOptions options={}) {
+        return at::_ops::ones_names::redispatch(dispatchKeySet, size, names, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::ones.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor ones(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::DimnameList> names, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::ones_names::redispatch(dispatchKeySet, size, names, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::ones(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor ones(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, at::TensorOptions options={}) {
+        return at::_ops::ones::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::ones(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor ones(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::ones::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), dtype, layout, device, pin_memory);
+    }
+    
+    // aten::ones(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor ones_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, at::TensorOptions options={}) {
+        return at::_ops::ones::redispatch(dispatchKeySet, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::ones(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor ones_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::ones::redispatch(dispatchKeySet, size, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::ones.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & ones_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::IntArrayRef size) {
+        return at::_ops::ones_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), out);
+    }
+    
+    // aten::ones.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & ones_outf(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, at::Tensor & out) {
+        return at::_ops::ones_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), out);
+    }
+    
+    // aten::ones.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & ones_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, c10::SymIntArrayRef size) {
+        return at::_ops::ones_out::redispatch(dispatchKeySet, size, out);
+    }
+    
+    // aten::ones.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & ones_symint_outf(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, at::Tensor & out) {
+        return at::_ops::ones_out::redispatch(dispatchKeySet, size, out);
+    }
+    
+    // aten::ones_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor ones_like(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::TensorOptions options={}, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::ones_like::redispatch(dispatchKeySet, self, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
+    }
+    
+    // aten::ones_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor ones_like(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
+        return at::_ops::ones_like::redispatch(dispatchKeySet, self, dtype, layout, device, pin_memory, memory_format);
+    }
+    
+    // aten::pairwise_distance(Tensor x1, Tensor x2, float p=2, float eps=1e-06, bool keepdim=False) -> Tensor
+    inline at::Tensor pairwise_distance(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x1, const at::Tensor & x2, double p=2, double eps=1e-06, bool keepdim=false) {
+        return at::_ops::pairwise_distance::redispatch(dispatchKeySet, x1, x2, p, eps, keepdim);
+    }
+    
+    // aten::cdist(Tensor x1, Tensor x2, float p=2, int? compute_mode=None) -> Tensor
+    inline at::Tensor cdist(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x1, const at::Tensor & x2, double p=2, c10::optional<int64_t> compute_mode=c10::nullopt) {
+        return at::_ops::cdist::redispatch(dispatchKeySet, x1, x2, p, compute_mode);
+    }
+    
+    // aten::_euclidean_dist(Tensor x1, Tensor x2) -> Tensor
+    inline at::Tensor _euclidean_dist(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x1, const at::Tensor & x2) {
+        return at::_ops::_euclidean_dist::redispatch(dispatchKeySet, x1, x2);
+    }
+    
+    // aten::_cdist_forward(Tensor x1, Tensor x2, float p, int? compute_mode) -> Tensor
+    inline at::Tensor _cdist_forward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x1, const at::Tensor & x2, double p, c10::optional<int64_t> compute_mode) {
+        return at::_ops::_cdist_forward::redispatch(dispatchKeySet, x1, x2, p, compute_mode);
+    }
+    
+    // aten::_cdist_backward(Tensor grad, Tensor x1, Tensor x2, float p, Tensor cdist) -> Tensor
+    inline at::Tensor _cdist_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & x1, const at::Tensor & x2, double p, const at::Tensor & cdist) {
+        return at::_ops::_cdist_backward::redispatch(dispatchKeySet, grad, x1, x2, p, cdist);
+    }
+    
+    // aten::pdist(Tensor self, float p=2) -> Tensor
+    inline at::Tensor pdist(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double p=2) {
+        return at::_ops::pdist::redispatch(dispatchKeySet, self, p);
+    }
+    
+    // aten::_pdist_forward(Tensor self, float p=2) -> Tensor
+    inline at::Tensor _pdist_forward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double p=2) {
+        return at::_ops::_pdist_forward::redispatch(dispatchKeySet, self, p);
+    }
+    
+    // aten::_pdist_backward(Tensor grad, Tensor self, float p, Tensor pdist) -> Tensor
+    inline at::Tensor _pdist_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & self, double p, const at::Tensor & pdist) {
+        return at::_ops::_pdist_backward::redispatch(dispatchKeySet, grad, self, p, pdist);
+    }
+    
+    // aten::cosine_similarity(Tensor x1, Tensor x2, int dim=1, float eps=1e-08) -> Tensor
+    inline at::Tensor cosine_similarity(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x1, const at::Tensor & x2, int64_t dim=1, double eps=1e-08) {
+        return at::_ops::cosine_similarity::redispatch(dispatchKeySet, x1, x2, dim, eps);
+    }
+    
+    // aten::permute(Tensor(a) self, int[] dims) -> Tensor(a)
+    inline at::Tensor permute(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dims) {
+        return at::_ops::permute::redispatch(dispatchKeySet, self, dims);
+    }
+    
+    // aten::movedim.intlist(Tensor(a) self, int[] source, int[] destination) -> Tensor(a)
+    inline at::Tensor movedim(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef source, at::IntArrayRef destination) {
+        return at::_ops::movedim_intlist::redispatch(dispatchKeySet, self, source, destination);
+    }
+    
+    // aten::movedim.int(Tensor(a) self, int source, int destination) -> Tensor(a)
+    inline at::Tensor movedim(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t source, int64_t destination) {
+        return at::_ops::movedim_int::redispatch(dispatchKeySet, self, source, destination);
+    }
+    
+    // aten::moveaxis.intlist(Tensor(a) self, int[] source, int[] destination) -> Tensor(a)
+    inline at::Tensor moveaxis(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef source, at::IntArrayRef destination) {
+        return at::_ops::moveaxis_intlist::redispatch(dispatchKeySet, self, source, destination);
+    }
+    
+    // aten::moveaxis.int(Tensor(a) self, int source, int destination) -> Tensor(a)
+    inline at::Tensor moveaxis(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t source, int64_t destination) {
+        return at::_ops::moveaxis_int::redispatch(dispatchKeySet, self, source, destination);
+    }
+    
+    // aten::numpy_T(Tensor(a) self) -> Tensor(a)
+    inline at::Tensor numpy_T(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::numpy_T::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::matrix_H(Tensor(a) self) -> Tensor(a)
+    inline at::Tensor matrix_H(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::matrix_H::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::mT(Tensor(a) self) -> Tensor(a)
+    inline at::Tensor mT(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::mT::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::mH(Tensor(a) self) -> Tensor(a)
+    inline at::Tensor mH(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::mH::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::adjoint(Tensor(a) self) -> Tensor(a)
+    inline at::Tensor adjoint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::adjoint::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::pixel_shuffle(Tensor self, int upscale_factor) -> Tensor
+    inline at::Tensor pixel_shuffle(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t upscale_factor) {
+        return at::_ops::pixel_shuffle::redispatch(dispatchKeySet, self, upscale_factor);
+    }
+    
+    // aten::pixel_unshuffle(Tensor self, int downscale_factor) -> Tensor
+    inline at::Tensor pixel_unshuffle(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t downscale_factor) {
+        return at::_ops::pixel_unshuffle::redispatch(dispatchKeySet, self, downscale_factor);
+    }
+    
+    // aten::channel_shuffle(Tensor self, SymInt groups) -> Tensor
+    inline at::Tensor channel_shuffle(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t groups) {
+        return at::_ops::channel_shuffle::redispatch(dispatchKeySet, self, groups);
+    }
+    
+    // aten::channel_shuffle(Tensor self, SymInt groups) -> Tensor
+    inline at::Tensor channel_shuffle_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt groups) {
+        return at::_ops::channel_shuffle::redispatch(dispatchKeySet, self, groups);
+    }
+    
+    // aten::native_channel_shuffle(Tensor self, SymInt groups) -> Tensor
+    inline at::Tensor native_channel_shuffle(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t groups) {
+        return at::_ops::native_channel_shuffle::redispatch(dispatchKeySet, self, groups);
+    }
+    
+    // aten::native_channel_shuffle(Tensor self, SymInt groups) -> Tensor
+    inline at::Tensor native_channel_shuffle_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt groups) {
+        return at::_ops::native_channel_shuffle::redispatch(dispatchKeySet, self, groups);
+    }
+    
+    // aten::is_pinned(Tensor self, Device? device=None) -> bool
+    inline bool is_pinned(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::Device> device=c10::nullopt) {
+        return at::_ops::is_pinned::redispatch(dispatchKeySet, self, device);
+    }
+    
+    // aten::pin_memory(Tensor(a) self, Device? device=None) -> Tensor(a)
+    inline at::Tensor pin_memory(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::Device> device=c10::nullopt) {
+        return at::_ops::pin_memory::redispatch(dispatchKeySet, self, device);
+    }
+    
+    // aten::_pin_memory(Tensor self, Device? device=None) -> Tensor
+    inline at::Tensor _pin_memory(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::Device> device=c10::nullopt) {
+        return at::_ops::_pin_memory::redispatch(dispatchKeySet, self, device);
+    }
+    
+    // aten::pinverse(Tensor self, float rcond=1e-15) -> Tensor
+    inline at::Tensor pinverse(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double rcond=1e-15) {
+        return at::_ops::pinverse::redispatch(dispatchKeySet, self, rcond);
+    }
+    
+    // aten::poisson_nll_loss(Tensor input, Tensor target, bool log_input, bool full, float eps, int reduction) -> Tensor
+    inline at::Tensor poisson_nll_loss(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & target, bool log_input, bool full, double eps, int64_t reduction) {
+        return at::_ops::poisson_nll_loss::redispatch(dispatchKeySet, input, target, log_input, full, eps, reduction);
+    }
+    
+    // aten::rad2deg(Tensor self) -> Tensor
+    inline at::Tensor rad2deg(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::rad2deg::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::rad2deg_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & rad2deg_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::rad2deg_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::rad2deg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & rad2deg_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::rad2deg_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::rad2deg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & rad2deg_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::rad2deg_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::deg2rad(Tensor self) -> Tensor
+    inline at::Tensor deg2rad(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::deg2rad::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::deg2rad_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & deg2rad_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::deg2rad_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::deg2rad.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & deg2rad_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::deg2rad_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::deg2rad.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & deg2rad_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::deg2rad_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::scalar_tensor(Scalar s, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor scalar_tensor(c10::DispatchKeySet dispatchKeySet, const at::Scalar & s, at::TensorOptions options={}) {
+        return at::_ops::scalar_tensor::redispatch(dispatchKeySet, s, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::scalar_tensor(Scalar s, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor scalar_tensor(c10::DispatchKeySet dispatchKeySet, const at::Scalar & s, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::scalar_tensor::redispatch(dispatchKeySet, s, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::rand.names(SymInt[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor rand(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::DimnameList> names, at::TensorOptions options={}) {
+        return at::_ops::rand_names::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), names, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::rand.names(SymInt[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor rand(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::DimnameList> names, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::rand_names::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), names, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::rand.names(SymInt[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor rand_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::optional<at::DimnameList> names, at::TensorOptions options={}) {
+        return at::_ops::rand_names::redispatch(dispatchKeySet, size, names, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::rand.names(SymInt[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor rand_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::optional<at::DimnameList> names, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::rand_names::redispatch(dispatchKeySet, size, names, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::rand.generator_with_names(SymInt[] size, *, Generator? generator, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor rand(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::DimnameList> names, at::TensorOptions options={}) {
+        return at::_ops::rand_generator_with_names::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), generator, names, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::rand.generator_with_names(SymInt[] size, *, Generator? generator, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor rand(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::DimnameList> names, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::rand_generator_with_names::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), generator, names, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::rand.generator_with_names(SymInt[] size, *, Generator? generator, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor rand_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::DimnameList> names, at::TensorOptions options={}) {
+        return at::_ops::rand_generator_with_names::redispatch(dispatchKeySet, size, generator, names, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::rand.generator_with_names(SymInt[] size, *, Generator? generator, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor rand_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::DimnameList> names, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::rand_generator_with_names::redispatch(dispatchKeySet, size, generator, names, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::rand(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor rand(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, at::TensorOptions options={}) {
+        return at::_ops::rand::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::rand(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor rand(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::rand::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), dtype, layout, device, pin_memory);
+    }
+    
+    // aten::rand(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor rand_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, at::TensorOptions options={}) {
+        return at::_ops::rand::redispatch(dispatchKeySet, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::rand(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor rand_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::rand::redispatch(dispatchKeySet, size, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::rand.generator(SymInt[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor rand(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::Generator> generator, at::TensorOptions options={}) {
+        return at::_ops::rand_generator::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), generator, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::rand.generator(SymInt[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor rand(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::rand_generator::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), generator, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::rand.generator(SymInt[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor rand_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::optional<at::Generator> generator, at::TensorOptions options={}) {
+        return at::_ops::rand_generator::redispatch(dispatchKeySet, size, generator, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::rand.generator(SymInt[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor rand_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::rand_generator::redispatch(dispatchKeySet, size, generator, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::rand.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & rand_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::IntArrayRef size) {
+        return at::_ops::rand_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), out);
+    }
+    
+    // aten::rand.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & rand_outf(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, at::Tensor & out) {
+        return at::_ops::rand_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), out);
+    }
+    
+    // aten::rand.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & rand_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, c10::SymIntArrayRef size) {
+        return at::_ops::rand_out::redispatch(dispatchKeySet, size, out);
+    }
+    
+    // aten::rand.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & rand_symint_outf(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, at::Tensor & out) {
+        return at::_ops::rand_out::redispatch(dispatchKeySet, size, out);
+    }
+    
+    // aten::rand.generator_out(SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & rand_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::IntArrayRef size, c10::optional<at::Generator> generator) {
+        return at::_ops::rand_generator_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), generator, out);
+    }
+    
+    // aten::rand.generator_out(SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & rand_outf(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::rand_generator_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), generator, out);
+    }
+    
+    // aten::rand.generator_out(SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & rand_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, c10::SymIntArrayRef size, c10::optional<at::Generator> generator) {
+        return at::_ops::rand_generator_out::redispatch(dispatchKeySet, size, generator, out);
+    }
+    
+    // aten::rand.generator_out(SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & rand_symint_outf(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::rand_generator_out::redispatch(dispatchKeySet, size, generator, out);
+    }
+    
+    // aten::rand_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor rand_like(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::TensorOptions options={}, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::rand_like::redispatch(dispatchKeySet, self, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
+    }
+    
+    // aten::rand_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor rand_like(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
+        return at::_ops::rand_like::redispatch(dispatchKeySet, self, dtype, layout, device, pin_memory, memory_format);
+    }
+    
+    // aten::randint(SymInt high, SymInt[] size, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randint(c10::DispatchKeySet dispatchKeySet, int64_t high, at::IntArrayRef size, at::TensorOptions options=at::kLong) {
+        return at::_ops::randint::redispatch(dispatchKeySet, high, c10::fromIntArrayRefSlow(size), c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::randint(SymInt high, SymInt[] size, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randint(c10::DispatchKeySet dispatchKeySet, int64_t high, at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::randint::redispatch(dispatchKeySet, high, c10::fromIntArrayRefSlow(size), dtype, layout, device, pin_memory);
+    }
+    
+    // aten::randint(SymInt high, SymInt[] size, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randint_symint(c10::DispatchKeySet dispatchKeySet, c10::SymInt high, c10::SymIntArrayRef size, at::TensorOptions options=at::kLong) {
+        return at::_ops::randint::redispatch(dispatchKeySet, high, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::randint(SymInt high, SymInt[] size, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randint_symint(c10::DispatchKeySet dispatchKeySet, c10::SymInt high, c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::randint::redispatch(dispatchKeySet, high, size, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::randint.generator(SymInt high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randint(c10::DispatchKeySet dispatchKeySet, int64_t high, at::IntArrayRef size, c10::optional<at::Generator> generator, at::TensorOptions options=at::kLong) {
+        return at::_ops::randint_generator::redispatch(dispatchKeySet, high, c10::fromIntArrayRefSlow(size), generator, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::randint.generator(SymInt high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randint(c10::DispatchKeySet dispatchKeySet, int64_t high, at::IntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::randint_generator::redispatch(dispatchKeySet, high, c10::fromIntArrayRefSlow(size), generator, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::randint.generator(SymInt high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randint_symint(c10::DispatchKeySet dispatchKeySet, c10::SymInt high, c10::SymIntArrayRef size, c10::optional<at::Generator> generator, at::TensorOptions options=at::kLong) {
+        return at::_ops::randint_generator::redispatch(dispatchKeySet, high, size, generator, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::randint.generator(SymInt high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randint_symint(c10::DispatchKeySet dispatchKeySet, c10::SymInt high, c10::SymIntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::randint_generator::redispatch(dispatchKeySet, high, size, generator, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::randint.low(SymInt low, SymInt high, SymInt[] size, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randint(c10::DispatchKeySet dispatchKeySet, int64_t low, int64_t high, at::IntArrayRef size, at::TensorOptions options=at::kLong) {
+        return at::_ops::randint_low::redispatch(dispatchKeySet, low, high, c10::fromIntArrayRefSlow(size), c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::randint.low(SymInt low, SymInt high, SymInt[] size, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randint(c10::DispatchKeySet dispatchKeySet, int64_t low, int64_t high, at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::randint_low::redispatch(dispatchKeySet, low, high, c10::fromIntArrayRefSlow(size), dtype, layout, device, pin_memory);
+    }
+    
+    // aten::randint.low(SymInt low, SymInt high, SymInt[] size, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randint_symint(c10::DispatchKeySet dispatchKeySet, c10::SymInt low, c10::SymInt high, c10::SymIntArrayRef size, at::TensorOptions options=at::kLong) {
+        return at::_ops::randint_low::redispatch(dispatchKeySet, low, high, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::randint.low(SymInt low, SymInt high, SymInt[] size, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randint_symint(c10::DispatchKeySet dispatchKeySet, c10::SymInt low, c10::SymInt high, c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::randint_low::redispatch(dispatchKeySet, low, high, size, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::randint.low_generator(SymInt low, SymInt high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randint(c10::DispatchKeySet dispatchKeySet, int64_t low, int64_t high, at::IntArrayRef size, c10::optional<at::Generator> generator, at::TensorOptions options=at::kLong) {
+        return at::_ops::randint_low_generator::redispatch(dispatchKeySet, low, high, c10::fromIntArrayRefSlow(size), generator, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::randint.low_generator(SymInt low, SymInt high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randint(c10::DispatchKeySet dispatchKeySet, int64_t low, int64_t high, at::IntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::randint_low_generator::redispatch(dispatchKeySet, low, high, c10::fromIntArrayRefSlow(size), generator, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::randint.low_generator(SymInt low, SymInt high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randint_symint(c10::DispatchKeySet dispatchKeySet, c10::SymInt low, c10::SymInt high, c10::SymIntArrayRef size, c10::optional<at::Generator> generator, at::TensorOptions options=at::kLong) {
+        return at::_ops::randint_low_generator::redispatch(dispatchKeySet, low, high, size, generator, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::randint.low_generator(SymInt low, SymInt high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randint_symint(c10::DispatchKeySet dispatchKeySet, c10::SymInt low, c10::SymInt high, c10::SymIntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::randint_low_generator::redispatch(dispatchKeySet, low, high, size, generator, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::randint.out(SymInt high, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, int64_t high, at::IntArrayRef size) {
+        return at::_ops::randint_out::redispatch(dispatchKeySet, high, c10::fromIntArrayRefSlow(size), out);
+    }
+    
+    // aten::randint.out(SymInt high, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randint_outf(c10::DispatchKeySet dispatchKeySet, int64_t high, at::IntArrayRef size, at::Tensor & out) {
+        return at::_ops::randint_out::redispatch(dispatchKeySet, high, c10::fromIntArrayRefSlow(size), out);
+    }
+    
+    // aten::randint.out(SymInt high, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randint_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, c10::SymInt high, c10::SymIntArrayRef size) {
+        return at::_ops::randint_out::redispatch(dispatchKeySet, high, size, out);
+    }
+    
+    // aten::randint.out(SymInt high, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randint_symint_outf(c10::DispatchKeySet dispatchKeySet, c10::SymInt high, c10::SymIntArrayRef size, at::Tensor & out) {
+        return at::_ops::randint_out::redispatch(dispatchKeySet, high, size, out);
+    }
+    
+    // aten::randint.generator_out(SymInt high, SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, int64_t high, at::IntArrayRef size, c10::optional<at::Generator> generator) {
+        return at::_ops::randint_generator_out::redispatch(dispatchKeySet, high, c10::fromIntArrayRefSlow(size), generator, out);
+    }
+    
+    // aten::randint.generator_out(SymInt high, SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randint_outf(c10::DispatchKeySet dispatchKeySet, int64_t high, at::IntArrayRef size, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::randint_generator_out::redispatch(dispatchKeySet, high, c10::fromIntArrayRefSlow(size), generator, out);
+    }
+    
+    // aten::randint.generator_out(SymInt high, SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randint_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, c10::SymInt high, c10::SymIntArrayRef size, c10::optional<at::Generator> generator) {
+        return at::_ops::randint_generator_out::redispatch(dispatchKeySet, high, size, generator, out);
+    }
+    
+    // aten::randint.generator_out(SymInt high, SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randint_symint_outf(c10::DispatchKeySet dispatchKeySet, c10::SymInt high, c10::SymIntArrayRef size, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::randint_generator_out::redispatch(dispatchKeySet, high, size, generator, out);
+    }
+    
+    // aten::randint.low_out(SymInt low, SymInt high, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, int64_t low, int64_t high, at::IntArrayRef size) {
+        return at::_ops::randint_low_out::redispatch(dispatchKeySet, low, high, c10::fromIntArrayRefSlow(size), out);
+    }
+    
+    // aten::randint.low_out(SymInt low, SymInt high, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randint_outf(c10::DispatchKeySet dispatchKeySet, int64_t low, int64_t high, at::IntArrayRef size, at::Tensor & out) {
+        return at::_ops::randint_low_out::redispatch(dispatchKeySet, low, high, c10::fromIntArrayRefSlow(size), out);
+    }
+    
+    // aten::randint.low_out(SymInt low, SymInt high, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randint_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, c10::SymInt low, c10::SymInt high, c10::SymIntArrayRef size) {
+        return at::_ops::randint_low_out::redispatch(dispatchKeySet, low, high, size, out);
+    }
+    
+    // aten::randint.low_out(SymInt low, SymInt high, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randint_symint_outf(c10::DispatchKeySet dispatchKeySet, c10::SymInt low, c10::SymInt high, c10::SymIntArrayRef size, at::Tensor & out) {
+        return at::_ops::randint_low_out::redispatch(dispatchKeySet, low, high, size, out);
+    }
+    
+    // aten::randint.low_generator_out(SymInt low, SymInt high, SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, int64_t low, int64_t high, at::IntArrayRef size, c10::optional<at::Generator> generator) {
+        return at::_ops::randint_low_generator_out::redispatch(dispatchKeySet, low, high, c10::fromIntArrayRefSlow(size), generator, out);
+    }
+    
+    // aten::randint.low_generator_out(SymInt low, SymInt high, SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randint_outf(c10::DispatchKeySet dispatchKeySet, int64_t low, int64_t high, at::IntArrayRef size, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::randint_low_generator_out::redispatch(dispatchKeySet, low, high, c10::fromIntArrayRefSlow(size), generator, out);
+    }
+    
+    // aten::randint.low_generator_out(SymInt low, SymInt high, SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randint_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, c10::SymInt low, c10::SymInt high, c10::SymIntArrayRef size, c10::optional<at::Generator> generator) {
+        return at::_ops::randint_low_generator_out::redispatch(dispatchKeySet, low, high, size, generator, out);
+    }
+    
+    // aten::randint.low_generator_out(SymInt low, SymInt high, SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randint_symint_outf(c10::DispatchKeySet dispatchKeySet, c10::SymInt low, c10::SymInt high, c10::SymIntArrayRef size, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::randint_low_generator_out::redispatch(dispatchKeySet, low, high, size, generator, out);
+    }
+    
+    // aten::randint_like(Tensor self, SymInt high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor randint_like(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t high, at::TensorOptions options={}, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::randint_like::redispatch(dispatchKeySet, self, high, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
+    }
+    
+    // aten::randint_like(Tensor self, SymInt high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor randint_like(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t high, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
+        return at::_ops::randint_like::redispatch(dispatchKeySet, self, high, dtype, layout, device, pin_memory, memory_format);
+    }
+    
+    // aten::randint_like(Tensor self, SymInt high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor randint_like_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt high, at::TensorOptions options={}, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::randint_like::redispatch(dispatchKeySet, self, high, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
+    }
+    
+    // aten::randint_like(Tensor self, SymInt high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor randint_like_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt high, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
+        return at::_ops::randint_like::redispatch(dispatchKeySet, self, high, dtype, layout, device, pin_memory, memory_format);
+    }
+    
+    // aten::randint_like.low_dtype(Tensor self, SymInt low, SymInt high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor randint_like(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t low, int64_t high, at::TensorOptions options={}, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::randint_like_low_dtype::redispatch(dispatchKeySet, self, low, high, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
+    }
+    
+    // aten::randint_like.low_dtype(Tensor self, SymInt low, SymInt high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor randint_like(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t low, int64_t high, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
+        return at::_ops::randint_like_low_dtype::redispatch(dispatchKeySet, self, low, high, dtype, layout, device, pin_memory, memory_format);
+    }
+    
+    // aten::randint_like.low_dtype(Tensor self, SymInt low, SymInt high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor randint_like_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt low, c10::SymInt high, at::TensorOptions options={}, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::randint_like_low_dtype::redispatch(dispatchKeySet, self, low, high, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
+    }
+    
+    // aten::randint_like.low_dtype(Tensor self, SymInt low, SymInt high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor randint_like_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt low, c10::SymInt high, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
+        return at::_ops::randint_like_low_dtype::redispatch(dispatchKeySet, self, low, high, dtype, layout, device, pin_memory, memory_format);
+    }
+    
+    // aten::randn(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randn(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, at::TensorOptions options={}) {
+        return at::_ops::randn::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::randn(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randn(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::randn::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), dtype, layout, device, pin_memory);
+    }
+    
+    // aten::randn(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randn_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, at::TensorOptions options={}) {
+        return at::_ops::randn::redispatch(dispatchKeySet, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::randn(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randn_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::randn::redispatch(dispatchKeySet, size, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::randn.generator(SymInt[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randn(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::Generator> generator, at::TensorOptions options={}) {
+        return at::_ops::randn_generator::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), generator, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::randn.generator(SymInt[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randn(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::randn_generator::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), generator, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::randn.generator(SymInt[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randn_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::optional<at::Generator> generator, at::TensorOptions options={}) {
+        return at::_ops::randn_generator::redispatch(dispatchKeySet, size, generator, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::randn.generator(SymInt[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randn_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::randn_generator::redispatch(dispatchKeySet, size, generator, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::randn.names(SymInt[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randn(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::DimnameList> names, at::TensorOptions options={}) {
+        return at::_ops::randn_names::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), names, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::randn.names(SymInt[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randn(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::DimnameList> names, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::randn_names::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), names, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::randn.names(SymInt[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randn_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::optional<at::DimnameList> names, at::TensorOptions options={}) {
+        return at::_ops::randn_names::redispatch(dispatchKeySet, size, names, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::randn.names(SymInt[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randn_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::optional<at::DimnameList> names, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::randn_names::redispatch(dispatchKeySet, size, names, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::randn.generator_with_names(SymInt[] size, *, Generator? generator, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randn(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::DimnameList> names, at::TensorOptions options={}) {
+        return at::_ops::randn_generator_with_names::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), generator, names, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::randn.generator_with_names(SymInt[] size, *, Generator? generator, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randn(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::DimnameList> names, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::randn_generator_with_names::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), generator, names, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::randn.generator_with_names(SymInt[] size, *, Generator? generator, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randn_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::DimnameList> names, at::TensorOptions options={}) {
+        return at::_ops::randn_generator_with_names::redispatch(dispatchKeySet, size, generator, names, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::randn.generator_with_names(SymInt[] size, *, Generator? generator, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randn_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::DimnameList> names, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::randn_generator_with_names::redispatch(dispatchKeySet, size, generator, names, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::randn.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randn_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::IntArrayRef size) {
+        return at::_ops::randn_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), out);
+    }
+    
+    // aten::randn.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randn_outf(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, at::Tensor & out) {
+        return at::_ops::randn_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), out);
+    }
+    
+    // aten::randn.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randn_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, c10::SymIntArrayRef size) {
+        return at::_ops::randn_out::redispatch(dispatchKeySet, size, out);
+    }
+    
+    // aten::randn.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randn_symint_outf(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, at::Tensor & out) {
+        return at::_ops::randn_out::redispatch(dispatchKeySet, size, out);
+    }
+    
+    // aten::randn.generator_out(SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randn_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::IntArrayRef size, c10::optional<at::Generator> generator) {
+        return at::_ops::randn_generator_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), generator, out);
+    }
+    
+    // aten::randn.generator_out(SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randn_outf(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::randn_generator_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), generator, out);
+    }
+    
+    // aten::randn.generator_out(SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randn_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, c10::SymIntArrayRef size, c10::optional<at::Generator> generator) {
+        return at::_ops::randn_generator_out::redispatch(dispatchKeySet, size, generator, out);
+    }
+    
+    // aten::randn.generator_out(SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randn_symint_outf(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::randn_generator_out::redispatch(dispatchKeySet, size, generator, out);
+    }
+    
+    // aten::randn_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor randn_like(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::TensorOptions options={}, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::randn_like::redispatch(dispatchKeySet, self, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
+    }
+    
+    // aten::randn_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor randn_like(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
+        return at::_ops::randn_like::redispatch(dispatchKeySet, self, dtype, layout, device, pin_memory, memory_format);
+    }
+    
+    // aten::randperm(SymInt n, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randperm(c10::DispatchKeySet dispatchKeySet, int64_t n, at::TensorOptions options=at::kLong) {
+        return at::_ops::randperm::redispatch(dispatchKeySet, n, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::randperm(SymInt n, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randperm(c10::DispatchKeySet dispatchKeySet, int64_t n, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::randperm::redispatch(dispatchKeySet, n, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::randperm(SymInt n, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randperm_symint(c10::DispatchKeySet dispatchKeySet, c10::SymInt n, at::TensorOptions options=at::kLong) {
+        return at::_ops::randperm::redispatch(dispatchKeySet, n, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::randperm(SymInt n, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randperm_symint(c10::DispatchKeySet dispatchKeySet, c10::SymInt n, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::randperm::redispatch(dispatchKeySet, n, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::randperm.generator(SymInt n, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randperm(c10::DispatchKeySet dispatchKeySet, int64_t n, c10::optional<at::Generator> generator, at::TensorOptions options=at::kLong) {
+        return at::_ops::randperm_generator::redispatch(dispatchKeySet, n, generator, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::randperm.generator(SymInt n, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randperm(c10::DispatchKeySet dispatchKeySet, int64_t n, c10::optional<at::Generator> generator, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::randperm_generator::redispatch(dispatchKeySet, n, generator, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::randperm.generator(SymInt n, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randperm_symint(c10::DispatchKeySet dispatchKeySet, c10::SymInt n, c10::optional<at::Generator> generator, at::TensorOptions options=at::kLong) {
+        return at::_ops::randperm_generator::redispatch(dispatchKeySet, n, generator, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::randperm.generator(SymInt n, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor randperm_symint(c10::DispatchKeySet dispatchKeySet, c10::SymInt n, c10::optional<at::Generator> generator, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::randperm_generator::redispatch(dispatchKeySet, n, generator, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::randperm.out(SymInt n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randperm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, int64_t n) {
+        return at::_ops::randperm_out::redispatch(dispatchKeySet, n, out);
+    }
+    
+    // aten::randperm.out(SymInt n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randperm_outf(c10::DispatchKeySet dispatchKeySet, int64_t n, at::Tensor & out) {
+        return at::_ops::randperm_out::redispatch(dispatchKeySet, n, out);
+    }
+    
+    // aten::randperm.out(SymInt n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randperm_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, c10::SymInt n) {
+        return at::_ops::randperm_out::redispatch(dispatchKeySet, n, out);
+    }
+    
+    // aten::randperm.out(SymInt n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randperm_symint_outf(c10::DispatchKeySet dispatchKeySet, c10::SymInt n, at::Tensor & out) {
+        return at::_ops::randperm_out::redispatch(dispatchKeySet, n, out);
+    }
+    
+    // aten::randperm.generator_out(SymInt n, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randperm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, int64_t n, c10::optional<at::Generator> generator) {
+        return at::_ops::randperm_generator_out::redispatch(dispatchKeySet, n, generator, out);
+    }
+    
+    // aten::randperm.generator_out(SymInt n, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randperm_outf(c10::DispatchKeySet dispatchKeySet, int64_t n, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::randperm_generator_out::redispatch(dispatchKeySet, n, generator, out);
+    }
+    
+    // aten::randperm.generator_out(SymInt n, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randperm_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, c10::SymInt n, c10::optional<at::Generator> generator) {
+        return at::_ops::randperm_generator_out::redispatch(dispatchKeySet, n, generator, out);
+    }
+    
+    // aten::randperm.generator_out(SymInt n, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randperm_symint_outf(c10::DispatchKeySet dispatchKeySet, c10::SymInt n, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::randperm_generator_out::redispatch(dispatchKeySet, n, generator, out);
+    }
+    
+    // aten::range.step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor range(c10::DispatchKeySet dispatchKeySet, const at::Scalar & start, const at::Scalar & end, const at::Scalar & step=1, at::TensorOptions options={}) {
+        return at::_ops::range_step::redispatch(dispatchKeySet, start, end, step, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::range.step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor range(c10::DispatchKeySet dispatchKeySet, const at::Scalar & start, const at::Scalar & end, const at::Scalar & step, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::range_step::redispatch(dispatchKeySet, start, end, step, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::range(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor range(c10::DispatchKeySet dispatchKeySet, const at::Scalar & start, const at::Scalar & end, at::TensorOptions options={}) {
+        return at::_ops::range::redispatch(dispatchKeySet, start, end, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::range(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor range(c10::DispatchKeySet dispatchKeySet, const at::Scalar & start, const at::Scalar & end, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::range::redispatch(dispatchKeySet, start, end, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::range.out_(Scalar start, Scalar end, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & range_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & start, const at::Scalar & end) {
+        return at::_ops::range_out_::redispatch(dispatchKeySet, start, end, out);
+    }
+    
+    // aten::range.out_(Scalar start, Scalar end, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & range_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & start, const at::Scalar & end, at::Tensor & out) {
+        return at::_ops::range_out_::redispatch(dispatchKeySet, start, end, out);
+    }
+    
+    // aten::range.out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & range_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & start, const at::Scalar & end, const at::Scalar & step) {
+        return at::_ops::range_out::redispatch(dispatchKeySet, start, end, step, out);
+    }
+    
+    // aten::range.out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & range_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & start, const at::Scalar & end, const at::Scalar & step, at::Tensor & out) {
+        return at::_ops::range_out::redispatch(dispatchKeySet, start, end, step, out);
+    }
+    
+    // aten::ravel(Tensor(a) self) -> Tensor(a)
+    inline at::Tensor ravel(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::ravel::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::reciprocal(Tensor self) -> Tensor
+    inline at::Tensor reciprocal(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::reciprocal::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::reciprocal_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & reciprocal_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::reciprocal_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::reciprocal.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & reciprocal_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::reciprocal_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::reciprocal.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & reciprocal_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::reciprocal_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::neg(Tensor self) -> Tensor
+    inline at::Tensor neg(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::neg::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::neg_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & neg_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::neg_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::neg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & neg_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::neg_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::neg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & neg_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::neg_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::negative(Tensor self) -> Tensor
+    inline at::Tensor negative(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::negative::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::negative_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & negative_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::negative_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::negative.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & negative_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::negative_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::negative.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & negative_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::negative_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::repeat(Tensor self, SymInt[] repeats) -> Tensor
+    inline at::Tensor repeat(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef repeats) {
+        return at::_ops::repeat::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(repeats));
+    }
+    
+    // aten::repeat(Tensor self, SymInt[] repeats) -> Tensor
+    inline at::Tensor repeat_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef repeats) {
+        return at::_ops::repeat::redispatch(dispatchKeySet, self, repeats);
+    }
+    
+    // aten::repeat_interleave.Tensor(Tensor repeats, *, SymInt? output_size=None) -> Tensor
+    inline at::Tensor repeat_interleave(c10::DispatchKeySet dispatchKeySet, const at::Tensor & repeats, c10::optional<int64_t> output_size=c10::nullopt) {
+        return at::_ops::repeat_interleave_Tensor::redispatch(dispatchKeySet, repeats, output_size.has_value() ? c10::make_optional(c10::SymInt(*output_size)) : c10::nullopt);
+    }
+    
+    // aten::repeat_interleave.Tensor(Tensor repeats, *, SymInt? output_size=None) -> Tensor
+    inline at::Tensor repeat_interleave_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & repeats, c10::optional<c10::SymInt> output_size=c10::nullopt) {
+        return at::_ops::repeat_interleave_Tensor::redispatch(dispatchKeySet, repeats, output_size);
+    }
+    
+    // aten::repeat_interleave.self_Tensor(Tensor self, Tensor repeats, int? dim=None, *, SymInt? output_size=None) -> Tensor
+    inline at::Tensor repeat_interleave(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & repeats, c10::optional<int64_t> dim=c10::nullopt, c10::optional<int64_t> output_size=c10::nullopt) {
+        return at::_ops::repeat_interleave_self_Tensor::redispatch(dispatchKeySet, self, repeats, dim, output_size.has_value() ? c10::make_optional(c10::SymInt(*output_size)) : c10::nullopt);
+    }
+    
+    // aten::repeat_interleave.self_Tensor(Tensor self, Tensor repeats, int? dim=None, *, SymInt? output_size=None) -> Tensor
+    inline at::Tensor repeat_interleave_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & repeats, c10::optional<int64_t> dim=c10::nullopt, c10::optional<c10::SymInt> output_size=c10::nullopt) {
+        return at::_ops::repeat_interleave_self_Tensor::redispatch(dispatchKeySet, self, repeats, dim, output_size);
+    }
+    
+    // aten::repeat_interleave.self_int(Tensor self, SymInt repeats, int? dim=None, *, SymInt? output_size=None) -> Tensor
+    inline at::Tensor repeat_interleave(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t repeats, c10::optional<int64_t> dim=c10::nullopt, c10::optional<int64_t> output_size=c10::nullopt) {
+        return at::_ops::repeat_interleave_self_int::redispatch(dispatchKeySet, self, repeats, dim, output_size.has_value() ? c10::make_optional(c10::SymInt(*output_size)) : c10::nullopt);
+    }
+    
+    // aten::repeat_interleave.self_int(Tensor self, SymInt repeats, int? dim=None, *, SymInt? output_size=None) -> Tensor
+    inline at::Tensor repeat_interleave_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt repeats, c10::optional<int64_t> dim=c10::nullopt, c10::optional<c10::SymInt> output_size=c10::nullopt) {
+        return at::_ops::repeat_interleave_self_int::redispatch(dispatchKeySet, self, repeats, dim, output_size);
+    }
+    
+    // aten::reshape(Tensor(a) self, SymInt[] shape) -> Tensor(a)
+    inline at::Tensor reshape(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef shape) {
+        return at::_ops::reshape::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(shape));
+    }
+    
+    // aten::reshape(Tensor(a) self, SymInt[] shape) -> Tensor(a)
+    inline at::Tensor reshape_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef shape) {
+        return at::_ops::reshape::redispatch(dispatchKeySet, self, shape);
+    }
+    
+    // aten::_reshape_copy(Tensor self, SymInt[] size) -> Tensor
+    inline at::Tensor _reshape_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size) {
+        return at::_ops::_reshape_copy::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size));
+    }
+    
+    // aten::_reshape_copy(Tensor self, SymInt[] size) -> Tensor
+    inline at::Tensor _reshape_copy_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size) {
+        return at::_ops::_reshape_copy::redispatch(dispatchKeySet, self, size);
+    }
+    
+    // aten::_reshape_alias(Tensor(a) self, SymInt[] size, SymInt[] stride) -> Tensor(a)
+    inline at::Tensor _reshape_alias(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride) {
+        return at::_ops::_reshape_alias::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride));
+    }
+    
+    // aten::_reshape_alias(Tensor(a) self, SymInt[] size, SymInt[] stride) -> Tensor(a)
+    inline at::Tensor _reshape_alias_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride) {
+        return at::_ops::_reshape_alias::redispatch(dispatchKeySet, self, size, stride);
+    }
+    
+    // aten::_mkldnn_reshape(Tensor self, int[] shape) -> Tensor
+    inline at::Tensor _mkldnn_reshape(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef shape) {
+        return at::_ops::_mkldnn_reshape::redispatch(dispatchKeySet, self, shape);
+    }
+    
+    // aten::reshape_as(Tensor(a) self, Tensor other) -> Tensor(a)
+    inline at::Tensor reshape_as(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::reshape_as::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::round(Tensor self) -> Tensor
+    inline at::Tensor round(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::round::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::round_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & round_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::round_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::round.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & round_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::round_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::round.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & round_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::round_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::round.decimals(Tensor self, *, int decimals) -> Tensor
+    inline at::Tensor round(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t decimals) {
+        return at::_ops::round_decimals::redispatch(dispatchKeySet, self, decimals);
+    }
+    
+    // aten::round_.decimals(Tensor(a!) self, *, int decimals) -> Tensor(a!)
+    inline at::Tensor & round_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t decimals) {
+        return at::_ops::round__decimals::redispatch(dispatchKeySet, self, decimals);
+    }
+    
+    // aten::round.decimals_out(Tensor self, *, int decimals, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & round_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t decimals) {
+        return at::_ops::round_decimals_out::redispatch(dispatchKeySet, self, decimals, out);
+    }
+    
+    // aten::round.decimals_out(Tensor self, *, int decimals, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & round_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t decimals, at::Tensor & out) {
+        return at::_ops::round_decimals_out::redispatch(dispatchKeySet, self, decimals, out);
+    }
+    
+    // aten::rrelu(Tensor self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor
+    inline at::Tensor rrelu(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & lower=0.125, const at::Scalar & upper=0.3333333333333333, bool training=false, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::rrelu::redispatch(dispatchKeySet, self, lower, upper, training, generator);
+    }
+    
+    // aten::rrelu_(Tensor(a!) self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!)
+    inline at::Tensor & rrelu_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & lower=0.125, const at::Scalar & upper=0.3333333333333333, bool training=false, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::rrelu_::redispatch(dispatchKeySet, self, lower, upper, training, generator);
+    }
+    
+    // aten::relu(Tensor self) -> Tensor
+    inline at::Tensor relu(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::relu::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::relu_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & relu_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::relu_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::relu6(Tensor self) -> Tensor
+    inline at::Tensor relu6(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::relu6::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::relu6_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & relu6_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::relu6_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::prelu(Tensor self, Tensor weight) -> Tensor
+    inline at::Tensor prelu(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight) {
+        return at::_ops::prelu::redispatch(dispatchKeySet, self, weight);
+    }
+    
+    // aten::_prelu_kernel(Tensor self, Tensor weight) -> Tensor
+    inline at::Tensor _prelu_kernel(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight) {
+        return at::_ops::_prelu_kernel::redispatch(dispatchKeySet, self, weight);
+    }
+    
+    // aten::_prelu_kernel_backward(Tensor grad_output, Tensor self, Tensor weight) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> _prelu_kernel_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & weight) {
+        return at::_ops::_prelu_kernel_backward::redispatch(dispatchKeySet, grad_output, self, weight);
+    }
+    
+    // aten::gelu.out(Tensor self, *, str approximate='none', Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & gelu_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::string_view approximate="none") {
+        return at::_ops::gelu_out::redispatch(dispatchKeySet, self, approximate, out);
+    }
+    
+    // aten::gelu.out(Tensor self, *, str approximate='none', Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & gelu_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::string_view approximate, at::Tensor & out) {
+        return at::_ops::gelu_out::redispatch(dispatchKeySet, self, approximate, out);
+    }
+    
+    // aten::gelu_(Tensor(a!) self, *, str approximate='none') -> Tensor(a!)
+    inline at::Tensor & gelu_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, c10::string_view approximate="none") {
+        return at::_ops::gelu_::redispatch(dispatchKeySet, self, approximate);
+    }
+    
+    // aten::gelu(Tensor self, *, str approximate='none') -> Tensor
+    inline at::Tensor gelu(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::string_view approximate="none") {
+        return at::_ops::gelu::redispatch(dispatchKeySet, self, approximate);
+    }
+    
+    // aten::gelu_backward.grad_input(Tensor grad_output, Tensor self, *, str approximate='none', Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & gelu_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, c10::string_view approximate="none") {
+        return at::_ops::gelu_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, approximate, grad_input);
+    }
+    
+    // aten::gelu_backward.grad_input(Tensor grad_output, Tensor self, *, str approximate='none', Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & gelu_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, c10::string_view approximate, at::Tensor & grad_input) {
+        return at::_ops::gelu_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, approximate, grad_input);
+    }
+    
+    // aten::gelu_backward(Tensor grad_output, Tensor self, *, str approximate='none') -> Tensor
+    inline at::Tensor gelu_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, c10::string_view approximate="none") {
+        return at::_ops::gelu_backward::redispatch(dispatchKeySet, grad_output, self, approximate);
+    }
+    
+    // aten::infinitely_differentiable_gelu_backward(Tensor grad, Tensor self) -> Tensor
+    inline at::Tensor infinitely_differentiable_gelu_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & self) {
+        return at::_ops::infinitely_differentiable_gelu_backward::redispatch(dispatchKeySet, grad, self);
+    }
+    
+    // aten::hardshrink.out(Tensor self, Scalar lambd=0.5, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & hardshrink_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & lambd=0.5) {
+        return at::_ops::hardshrink_out::redispatch(dispatchKeySet, self, lambd, out);
+    }
+    
+    // aten::hardshrink.out(Tensor self, Scalar lambd=0.5, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & hardshrink_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & lambd, at::Tensor & out) {
+        return at::_ops::hardshrink_out::redispatch(dispatchKeySet, self, lambd, out);
+    }
+    
+    // aten::hardshrink(Tensor self, Scalar lambd=0.5) -> Tensor
+    inline at::Tensor hardshrink(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & lambd=0.5) {
+        return at::_ops::hardshrink::redispatch(dispatchKeySet, self, lambd);
+    }
+    
+    // aten::hardshrink_backward.grad_input(Tensor grad_out, Tensor self, Scalar lambd, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & hardshrink_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_out, const at::Tensor & self, const at::Scalar & lambd) {
+        return at::_ops::hardshrink_backward_grad_input::redispatch(dispatchKeySet, grad_out, self, lambd, grad_input);
+    }
+    
+    // aten::hardshrink_backward.grad_input(Tensor grad_out, Tensor self, Scalar lambd, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & hardshrink_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_out, const at::Tensor & self, const at::Scalar & lambd, at::Tensor & grad_input) {
+        return at::_ops::hardshrink_backward_grad_input::redispatch(dispatchKeySet, grad_out, self, lambd, grad_input);
+    }
+    
+    // aten::hardshrink_backward(Tensor grad_out, Tensor self, Scalar lambd) -> Tensor
+    inline at::Tensor hardshrink_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_out, const at::Tensor & self, const at::Scalar & lambd) {
+        return at::_ops::hardshrink_backward::redispatch(dispatchKeySet, grad_out, self, lambd);
+    }
+    
+    // aten::rsqrt(Tensor self) -> Tensor
+    inline at::Tensor rsqrt(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::rsqrt::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::rsqrt_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & rsqrt_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::rsqrt_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::rsqrt.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & rsqrt_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::rsqrt_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::rsqrt.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & rsqrt_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::rsqrt_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::select.Dimname(Tensor(a) self, Dimname dim, int index) -> Tensor(a)
+    inline at::Tensor select(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, int64_t index) {
+        return at::_ops::select_Dimname::redispatch(dispatchKeySet, self, dim, index);
+    }
+    
+    // aten::select.int(Tensor(a) self, int dim, SymInt index) -> Tensor(a)
+    inline at::Tensor select(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, int64_t index) {
+        return at::_ops::select_int::redispatch(dispatchKeySet, self, dim, index);
+    }
+    
+    // aten::select.int(Tensor(a) self, int dim, SymInt index) -> Tensor(a)
+    inline at::Tensor select_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, c10::SymInt index) {
+        return at::_ops::select_int::redispatch(dispatchKeySet, self, dim, index);
+    }
+    
+    // aten::select_backward(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt index) -> Tensor
+    inline at::Tensor select_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, at::IntArrayRef input_sizes, int64_t dim, int64_t index) {
+        return at::_ops::select_backward::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(input_sizes), dim, index);
+    }
+    
+    // aten::select_backward(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt index) -> Tensor
+    inline at::Tensor select_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef input_sizes, int64_t dim, c10::SymInt index) {
+        return at::_ops::select_backward::redispatch(dispatchKeySet, grad_output, input_sizes, dim, index);
+    }
+    
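+    // Note: ops whose schema uses SymInt are also emitted twice. The plain overload
+    // accepts int64_t / at::IntArrayRef and converts (directly, or via
+    // c10::fromIntArrayRefSlow for arrays) before redispatching, while the `_symint`
+    // overload passes c10::SymInt values through unchanged; both reach the same
+    // at::_ops::select_backward::redispatch call.
+    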
+    // aten::_nested_select_backward(Tensor grad_output, Tensor self, int dim, SymInt index) -> Tensor
+    inline at::Tensor _nested_select_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, int64_t dim, int64_t index) {
+        return at::_ops::_nested_select_backward::redispatch(dispatchKeySet, grad_output, self, dim, index);
+    }
+    
+    // aten::_nested_select_backward(Tensor grad_output, Tensor self, int dim, SymInt index) -> Tensor
+    inline at::Tensor _nested_select_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, int64_t dim, c10::SymInt index) {
+        return at::_ops::_nested_select_backward::redispatch(dispatchKeySet, grad_output, self, dim, index);
+    }
+    
+    // aten::selu(Tensor self) -> Tensor
+    inline at::Tensor selu(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::selu::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::selu_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & selu_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::selu_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::celu(Tensor self, Scalar alpha=1.0) -> Tensor
+    inline at::Tensor celu(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & alpha=1.0) {
+        return at::_ops::celu::redispatch(dispatchKeySet, self, alpha);
+    }
+    
+    // aten::celu_(Tensor(a!) self, Scalar alpha=1.0) -> Tensor(a!)
+    inline at::Tensor & celu_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & alpha=1.0) {
+        return at::_ops::celu_::redispatch(dispatchKeySet, self, alpha);
+    }
+    
+    // aten::silu(Tensor self) -> Tensor
+    inline at::Tensor silu(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::silu::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::silu_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & silu_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::silu_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & silu_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::silu_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & silu_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::silu_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::silu_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & silu_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self) {
+        return at::_ops::silu_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, grad_input);
+    }
+    
+    // aten::silu_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & silu_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & grad_input) {
+        return at::_ops::silu_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, grad_input);
+    }
+    
+    // aten::silu_backward(Tensor grad_output, Tensor self) -> Tensor
+    inline at::Tensor silu_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self) {
+        return at::_ops::silu_backward::redispatch(dispatchKeySet, grad_output, self);
+    }
+    
+    // aten::mish(Tensor self) -> Tensor
+    inline at::Tensor mish(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::mish::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::mish_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & mish_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::mish_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::mish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mish_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::mish_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::mish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mish_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::mish_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::mish_backward(Tensor grad_output, Tensor self) -> Tensor
+    inline at::Tensor mish_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self) {
+        return at::_ops::mish_backward::redispatch(dispatchKeySet, grad_output, self);
+    }
+    
+    // aten::sigmoid(Tensor self) -> Tensor
+    inline at::Tensor sigmoid(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::sigmoid::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::sigmoid_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & sigmoid_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::sigmoid_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & sigmoid_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::sigmoid_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & sigmoid_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::sigmoid_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::logit(Tensor self, float? eps=None) -> Tensor
+    inline at::Tensor logit(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<double> eps=c10::nullopt) {
+        return at::_ops::logit::redispatch(dispatchKeySet, self, eps);
+    }
+    
+    // aten::logit_(Tensor(a!) self, float? eps=None) -> Tensor(a!)
+    inline at::Tensor & logit_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, c10::optional<double> eps=c10::nullopt) {
+        return at::_ops::logit_::redispatch(dispatchKeySet, self, eps);
+    }
+    
+    // aten::logit.out(Tensor self, float? eps=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & logit_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<double> eps=c10::nullopt) {
+        return at::_ops::logit_out::redispatch(dispatchKeySet, self, eps, out);
+    }
+    
+    // aten::logit.out(Tensor self, float? eps=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & logit_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<double> eps, at::Tensor & out) {
+        return at::_ops::logit_out::redispatch(dispatchKeySet, self, eps, out);
+    }
+    
+    // aten::sin(Tensor self) -> Tensor
+    inline at::Tensor sin(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::sin::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::sin_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & sin_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::sin_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::sin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & sin_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::sin_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::sin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & sin_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::sin_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::sinc(Tensor self) -> Tensor
+    inline at::Tensor sinc(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::sinc::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::sinc_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & sinc_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::sinc_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::sinc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & sinc_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::sinc_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::sinc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & sinc_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::sinc_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::sinh(Tensor self) -> Tensor
+    inline at::Tensor sinh(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::sinh::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::sinh_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & sinh_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::sinh_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::sinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & sinh_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::sinh_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::sinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & sinh_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::sinh_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::detach(Tensor(a) self) -> Tensor(a)
+    inline at::Tensor detach(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::detach::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::detach_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & detach_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::detach_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::size.int(Tensor self, int dim) -> int
+    inline int64_t __dispatch_size(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim) {
+        return at::_ops::size_int::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::size.Dimname(Tensor self, Dimname dim) -> int
+    inline int64_t size(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim) {
+        return at::_ops::size_Dimname::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::sym_size.int(Tensor self, int dim) -> SymInt
+    inline c10::SymInt __dispatch_sym_size(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim) {
+        return at::_ops::sym_size_int::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::sym_numel(Tensor self) -> SymInt
+    inline c10::SymInt __dispatch_sym_numel(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::sym_numel::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::sym_storage_offset(Tensor self) -> SymInt
+    inline c10::SymInt __dispatch_sym_storage_offset(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::sym_storage_offset::redispatch(dispatchKeySet, self);
+    }
+    
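+    // Note: size/stride and their sym_* counterparts are emitted under a
+    // `__dispatch_` prefix. The reason is not visible in this hunk; presumably the
+    // unprefixed names are reserved for hand-written helpers elsewhere in ATen, so
+    // the generated redispatch entry points avoid colliding with them.
+    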
+    // aten::slice.Tensor(Tensor(a) self, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a)
+    inline at::Tensor slice(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim=0, c10::optional<int64_t> start=c10::nullopt, c10::optional<int64_t> end=c10::nullopt, int64_t step=1) {
+        return at::_ops::slice_Tensor::redispatch(dispatchKeySet, self, dim, start.has_value() ? c10::make_optional(c10::SymInt(*start)) : c10::nullopt, end.has_value() ? c10::make_optional(c10::SymInt(*end)) : c10::nullopt, step);
+    }
+    
+    // aten::slice.Tensor(Tensor(a) self, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a)
+    inline at::Tensor slice_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim=0, c10::optional<c10::SymInt> start=c10::nullopt, c10::optional<c10::SymInt> end=c10::nullopt, c10::SymInt step=1) {
+        return at::_ops::slice_Tensor::redispatch(dispatchKeySet, self, dim, start, end, step);
+    }
+    
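+    // Note: in the non-symint slice overload above, each optional int64_t bound is
+    // wrapped at the call site
+    // (start.has_value() ? c10::make_optional(c10::SymInt(*start)) : c10::nullopt),
+    // so an absent bound is forwarded as c10::nullopt rather than a concrete value.
+    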
+    // aten::slice_backward(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt start, SymInt end, SymInt step) -> Tensor
+    inline at::Tensor slice_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, at::IntArrayRef input_sizes, int64_t dim, int64_t start, int64_t end, int64_t step) {
+        return at::_ops::slice_backward::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(input_sizes), dim, start, end, step);
+    }
+    
+    // aten::slice_backward(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt start, SymInt end, SymInt step) -> Tensor
+    inline at::Tensor slice_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef input_sizes, int64_t dim, c10::SymInt start, c10::SymInt end, c10::SymInt step) {
+        return at::_ops::slice_backward::redispatch(dispatchKeySet, grad_output, input_sizes, dim, start, end, step);
+    }
+    
+    // aten::slice_inverse(Tensor(a) self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a)
+    inline at::Tensor slice_inverse(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & src, int64_t dim=0, c10::optional<int64_t> start=c10::nullopt, c10::optional<int64_t> end=c10::nullopt, int64_t step=1) {
+        return at::_ops::slice_inverse::redispatch(dispatchKeySet, self, src, dim, start.has_value() ? c10::make_optional(c10::SymInt(*start)) : c10::nullopt, end.has_value() ? c10::make_optional(c10::SymInt(*end)) : c10::nullopt, step);
+    }
+    
+    // aten::slice_inverse(Tensor(a) self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a)
+    inline at::Tensor slice_inverse_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & src, int64_t dim=0, c10::optional<c10::SymInt> start=c10::nullopt, c10::optional<c10::SymInt> end=c10::nullopt, c10::SymInt step=1) {
+        return at::_ops::slice_inverse::redispatch(dispatchKeySet, self, src, dim, start, end, step);
+    }
+    
+    // aten::slice_scatter(Tensor self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor
+    inline at::Tensor slice_scatter(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & src, int64_t dim=0, c10::optional<int64_t> start=c10::nullopt, c10::optional<int64_t> end=c10::nullopt, int64_t step=1) {
+        return at::_ops::slice_scatter::redispatch(dispatchKeySet, self, src, dim, start.has_value() ? c10::make_optional(c10::SymInt(*start)) : c10::nullopt, end.has_value() ? c10::make_optional(c10::SymInt(*end)) : c10::nullopt, step);
+    }
+    
+    // aten::slice_scatter(Tensor self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor
+    inline at::Tensor slice_scatter_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & src, int64_t dim=0, c10::optional<c10::SymInt> start=c10::nullopt, c10::optional<c10::SymInt> end=c10::nullopt, c10::SymInt step=1) {
+        return at::_ops::slice_scatter::redispatch(dispatchKeySet, self, src, dim, start, end, step);
+    }
+    
+    // aten::select_scatter(Tensor self, Tensor src, int dim, SymInt index) -> Tensor
+    inline at::Tensor select_scatter(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & src, int64_t dim, int64_t index) {
+        return at::_ops::select_scatter::redispatch(dispatchKeySet, self, src, dim, index);
+    }
+    
+    // aten::select_scatter(Tensor self, Tensor src, int dim, SymInt index) -> Tensor
+    inline at::Tensor select_scatter_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & src, int64_t dim, c10::SymInt index) {
+        return at::_ops::select_scatter::redispatch(dispatchKeySet, self, src, dim, index);
+    }
+    
+    // aten::diagonal_scatter(Tensor self, Tensor src, int offset=0, int dim1=0, int dim2=1) -> Tensor
+    inline at::Tensor diagonal_scatter(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & src, int64_t offset=0, int64_t dim1=0, int64_t dim2=1) {
+        return at::_ops::diagonal_scatter::redispatch(dispatchKeySet, self, src, offset, dim1, dim2);
+    }
+    
+    // aten::as_strided_scatter(Tensor self, Tensor src, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor
+    inline at::Tensor as_strided_scatter(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & src, at::IntArrayRef size, at::IntArrayRef stride, c10::optional<int64_t> storage_offset=c10::nullopt) {
+        return at::_ops::as_strided_scatter::redispatch(dispatchKeySet, self, src, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), storage_offset.has_value() ? c10::make_optional(c10::SymInt(*storage_offset)) : c10::nullopt);
+    }
+    
+    // aten::as_strided_scatter(Tensor self, Tensor src, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor
+    inline at::Tensor as_strided_scatter_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & src, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional<c10::SymInt> storage_offset=c10::nullopt) {
+        return at::_ops::as_strided_scatter::redispatch(dispatchKeySet, self, src, size, stride, storage_offset);
+    }
+    
+    // aten::smm(Tensor self, Tensor mat2) -> Tensor
+    inline at::Tensor smm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat2) {
+        return at::_ops::smm::redispatch(dispatchKeySet, self, mat2);
+    }
+    
+    // aten::softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor softmax(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::softmax_int::redispatch(dispatchKeySet, self, dim, dtype);
+    }
+    
+    // aten::softmax.int_out(Tensor self, int dim, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & softmax_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::softmax_int_out::redispatch(dispatchKeySet, self, dim, dtype, out);
+    }
+    
+    // aten::softmax.int_out(Tensor self, int dim, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & softmax_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, c10::optional<at::ScalarType> dtype, at::Tensor & out) {
+        return at::_ops::softmax_int_out::redispatch(dispatchKeySet, self, dim, dtype, out);
+    }
+    
+    // aten::softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor softmax(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::softmax_Dimname::redispatch(dispatchKeySet, self, dim, dtype);
+    }
+    
+    // aten::_softmax(Tensor self, int dim, bool half_to_float) -> Tensor
+    inline at::Tensor _softmax(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool half_to_float) {
+        return at::_ops::_softmax::redispatch(dispatchKeySet, self, dim, half_to_float);
+    }
+    
+    // aten::_softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _softmax_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim, bool half_to_float) {
+        return at::_ops::_softmax_out::redispatch(dispatchKeySet, self, dim, half_to_float, out);
+    }
+    
+    // aten::_softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _softmax_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool half_to_float, at::Tensor & out) {
+        return at::_ops::_softmax_out::redispatch(dispatchKeySet, self, dim, half_to_float, out);
+    }
+    
+    // aten::_softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor
+    inline at::Tensor _softmax_backward_data(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, at::ScalarType input_dtype) {
+        return at::_ops::_softmax_backward_data::redispatch(dispatchKeySet, grad_output, output, dim, input_dtype);
+    }
+    
+    // aten::_softmax_backward_data.out(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & _softmax_backward_data_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, at::ScalarType input_dtype) {
+        return at::_ops::_softmax_backward_data_out::redispatch(dispatchKeySet, grad_output, output, dim, input_dtype, grad_input);
+    }
+    
+    // aten::_softmax_backward_data.out(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & _softmax_backward_data_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, at::ScalarType input_dtype, at::Tensor & grad_input) {
+        return at::_ops::_softmax_backward_data_out::redispatch(dispatchKeySet, grad_output, output, dim, input_dtype, grad_input);
+    }
+    
+    // aten::unsafe_split.Tensor(Tensor self, SymInt split_size, int dim=0) -> Tensor[]
+    inline ::std::vector<at::Tensor> unsafe_split(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t split_size, int64_t dim=0) {
+        return at::_ops::unsafe_split_Tensor::redispatch(dispatchKeySet, self, split_size, dim);
+    }
+    
+    // aten::unsafe_split.Tensor(Tensor self, SymInt split_size, int dim=0) -> Tensor[]
+    inline ::std::vector<at::Tensor> unsafe_split_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt split_size, int64_t dim=0) {
+        return at::_ops::unsafe_split_Tensor::redispatch(dispatchKeySet, self, split_size, dim);
+    }
+    
+    // aten::split.Tensor(Tensor(a -> *) self, SymInt split_size, int dim=0) -> Tensor(a)[]
+    inline ::std::vector<at::Tensor> split(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t split_size, int64_t dim=0) {
+        return at::_ops::split_Tensor::redispatch(dispatchKeySet, self, split_size, dim);
+    }
+    
+    // aten::split.Tensor(Tensor(a -> *) self, SymInt split_size, int dim=0) -> Tensor(a)[]
+    inline ::std::vector<at::Tensor> split_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt split_size, int64_t dim=0) {
+        return at::_ops::split_Tensor::redispatch(dispatchKeySet, self, split_size, dim);
+    }
+    
+    // aten::split.sizes(Tensor(a -> *) self, SymInt[] split_size, int dim=0) -> Tensor(a)[]
+    inline ::std::vector<at::Tensor> split(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef split_size, int64_t dim=0) {
+        return at::_ops::split_sizes::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(split_size), dim);
+    }
+    
+    // aten::split.sizes(Tensor(a -> *) self, SymInt[] split_size, int dim=0) -> Tensor(a)[]
+    inline ::std::vector<at::Tensor> split_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef split_size, int64_t dim=0) {
+        return at::_ops::split_sizes::redispatch(dispatchKeySet, self, split_size, dim);
+    }
+    
+    // aten::unsafe_split_with_sizes(Tensor self, SymInt[] split_sizes, int dim=0) -> Tensor[]
+    inline ::std::vector<at::Tensor> unsafe_split_with_sizes(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef split_sizes, int64_t dim=0) {
+        return at::_ops::unsafe_split_with_sizes::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(split_sizes), dim);
+    }
+    
+    // aten::unsafe_split_with_sizes(Tensor self, SymInt[] split_sizes, int dim=0) -> Tensor[]
+    inline ::std::vector<at::Tensor> unsafe_split_with_sizes_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef split_sizes, int64_t dim=0) {
+        return at::_ops::unsafe_split_with_sizes::redispatch(dispatchKeySet, self, split_sizes, dim);
+    }
+    
+    // aten::split_with_sizes(Tensor(a -> *) self, SymInt[] split_sizes, int dim=0) -> Tensor(a)[]
+    inline ::std::vector<at::Tensor> split_with_sizes(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef split_sizes, int64_t dim=0) {
+        return at::_ops::split_with_sizes::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(split_sizes), dim);
+    }
+    
+    // aten::split_with_sizes(Tensor(a -> *) self, SymInt[] split_sizes, int dim=0) -> Tensor(a)[]
+    inline ::std::vector<at::Tensor> split_with_sizes_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef split_sizes, int64_t dim=0) {
+        return at::_ops::split_with_sizes::redispatch(dispatchKeySet, self, split_sizes, dim);
+    }
+    
+    // aten::hsplit.int(Tensor(a -> *) self, int sections) -> Tensor(a)[]
+    inline ::std::vector<at::Tensor> hsplit(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t sections) {
+        return at::_ops::hsplit_int::redispatch(dispatchKeySet, self, sections);
+    }
+    
+    // aten::hsplit.array(Tensor(a -> *) self, int[] indices) -> Tensor(a)[]
+    inline ::std::vector<at::Tensor> hsplit(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef indices) {
+        return at::_ops::hsplit_array::redispatch(dispatchKeySet, self, indices);
+    }
+    
+    // aten::vsplit.int(Tensor(a -> *) self, int sections) -> Tensor(a)[]
+    inline ::std::vector<at::Tensor> vsplit(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t sections) {
+        return at::_ops::vsplit_int::redispatch(dispatchKeySet, self, sections);
+    }
+    
+    // aten::vsplit.array(Tensor(a -> *) self, int[] indices) -> Tensor(a)[]
+    inline ::std::vector<at::Tensor> vsplit(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef indices) {
+        return at::_ops::vsplit_array::redispatch(dispatchKeySet, self, indices);
+    }
+    
+    // aten::dsplit.int(Tensor(a -> *) self, int sections) -> Tensor(a)[]
+    inline ::std::vector<at::Tensor> dsplit(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t sections) {
+        return at::_ops::dsplit_int::redispatch(dispatchKeySet, self, sections);
+    }
+    
+    // aten::dsplit.array(Tensor(a -> *) self, int[] indices) -> Tensor(a)[]
+    inline ::std::vector<at::Tensor> dsplit(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef indices) {
+        return at::_ops::dsplit_array::redispatch(dispatchKeySet, self, indices);
+    }
+    
+    // aten::squeeze(Tensor(a) self) -> Tensor(a)
+    inline at::Tensor squeeze(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::squeeze::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::squeeze.dim(Tensor(a) self, int dim) -> Tensor(a)
+    inline at::Tensor squeeze(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim) {
+        return at::_ops::squeeze_dim::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::squeeze.dimname(Tensor(a) self, Dimname dim) -> Tensor(a)
+    inline at::Tensor squeeze(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim) {
+        return at::_ops::squeeze_dimname::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::squeeze.dims(Tensor(a) self, int[] dim) -> Tensor(a)
+    inline at::Tensor squeeze(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim) {
+        return at::_ops::squeeze_dims::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::squeeze_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & squeeze_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::squeeze_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::squeeze_.dim(Tensor(a!) self, int dim) -> Tensor(a!)
+    inline at::Tensor & squeeze_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t dim) {
+        return at::_ops::squeeze__dim::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::squeeze_.dims(Tensor(a!) self, int[] dim) -> Tensor(a!)
+    inline at::Tensor & squeeze_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, at::IntArrayRef dim) {
+        return at::_ops::squeeze__dims::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::squeeze_.dimname(Tensor(a!) self, Dimname dim) -> Tensor(a!)
+    inline at::Tensor & squeeze_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, at::Dimname dim) {
+        return at::_ops::squeeze__dimname::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::sspaddmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
+    inline at::Tensor sspaddmm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta=1, const at::Scalar & alpha=1) {
+        return at::_ops::sspaddmm::redispatch(dispatchKeySet, self, mat1, mat2, beta, alpha);
+    }
+    
+    // aten::sspaddmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & sspaddmm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta=1, const at::Scalar & alpha=1) {
+        return at::_ops::sspaddmm_out::redispatch(dispatchKeySet, self, mat1, mat2, beta, alpha, out);
+    }
+    
+    // aten::sspaddmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & sspaddmm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out) {
+        return at::_ops::sspaddmm_out::redispatch(dispatchKeySet, self, mat1, mat2, beta, alpha, out);
+    }
+    
+    // aten::_chunk_cat(Tensor[] tensors, int dim, int num_chunks) -> Tensor
+    inline at::Tensor _chunk_cat(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, int64_t dim, int64_t num_chunks) {
+        return at::_ops::_chunk_cat::redispatch(dispatchKeySet, tensors, dim, num_chunks);
+    }
+    
+    // aten::_chunk_cat.out(Tensor[] tensors, int dim, int num_chunks, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _chunk_cat_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::TensorList tensors, int64_t dim, int64_t num_chunks) {
+        return at::_ops::_chunk_cat_out::redispatch(dispatchKeySet, tensors, dim, num_chunks, out);
+    }
+    
+    // aten::_chunk_cat.out(Tensor[] tensors, int dim, int num_chunks, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _chunk_cat_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, int64_t dim, int64_t num_chunks, at::Tensor & out) {
+        return at::_ops::_chunk_cat_out::redispatch(dispatchKeySet, tensors, dim, num_chunks, out);
+    }
+    
+    // aten::stack(Tensor[] tensors, int dim=0) -> Tensor
+    inline at::Tensor stack(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, int64_t dim=0) {
+        return at::_ops::stack::redispatch(dispatchKeySet, tensors, dim);
+    }
+    
+    // aten::stack.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & stack_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::TensorList tensors, int64_t dim=0) {
+        return at::_ops::stack_out::redispatch(dispatchKeySet, tensors, dim, out);
+    }
+    
+    // aten::stack.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & stack_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, int64_t dim, at::Tensor & out) {
+        return at::_ops::stack_out::redispatch(dispatchKeySet, tensors, dim, out);
+    }
+    
+    // aten::_stack(Tensor[] tensors, int dim=0) -> Tensor
+    inline at::Tensor _stack(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, int64_t dim=0) {
+        return at::_ops::_stack::redispatch(dispatchKeySet, tensors, dim);
+    }
+    
+    // aten::_stack.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _stack_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::TensorList tensors, int64_t dim=0) {
+        return at::_ops::_stack_out::redispatch(dispatchKeySet, tensors, dim, out);
+    }
+    
+    // aten::_stack.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _stack_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, int64_t dim, at::Tensor & out) {
+        return at::_ops::_stack_out::redispatch(dispatchKeySet, tensors, dim, out);
+    }
+    
+    // aten::hstack(Tensor[] tensors) -> Tensor
+    inline at::Tensor hstack(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors) {
+        return at::_ops::hstack::redispatch(dispatchKeySet, tensors);
+    }
+    
+    // aten::hstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & hstack_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::TensorList tensors) {
+        return at::_ops::hstack_out::redispatch(dispatchKeySet, tensors, out);
+    }
+    
+    // aten::hstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & hstack_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, at::Tensor & out) {
+        return at::_ops::hstack_out::redispatch(dispatchKeySet, tensors, out);
+    }
+    
+    // aten::vstack(Tensor[] tensors) -> Tensor
+    inline at::Tensor vstack(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors) {
+        return at::_ops::vstack::redispatch(dispatchKeySet, tensors);
+    }
+    
+    // aten::vstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & vstack_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::TensorList tensors) {
+        return at::_ops::vstack_out::redispatch(dispatchKeySet, tensors, out);
+    }
+    
+    // aten::vstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & vstack_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, at::Tensor & out) {
+        return at::_ops::vstack_out::redispatch(dispatchKeySet, tensors, out);
+    }
+    
+    // aten::dstack(Tensor[] tensors) -> Tensor
+    inline at::Tensor dstack(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors) {
+        return at::_ops::dstack::redispatch(dispatchKeySet, tensors);
+    }
+    
+    // aten::dstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & dstack_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::TensorList tensors) {
+        return at::_ops::dstack_out::redispatch(dispatchKeySet, tensors, out);
+    }
+    
+    // aten::dstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & dstack_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, at::Tensor & out) {
+        return at::_ops::dstack_out::redispatch(dispatchKeySet, tensors, out);
+    }
+    
+    // aten::stft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor
+    inline at::Tensor stft(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t n_fft, c10::optional<int64_t> hop_length, c10::optional<int64_t> win_length, const c10::optional<at::Tensor> & window, bool normalized, c10::optional<bool> onesided=c10::nullopt, c10::optional<bool> return_complex=c10::nullopt) {
+        return at::_ops::stft::redispatch(dispatchKeySet, self, n_fft, hop_length, win_length, window, normalized, onesided, return_complex);
+    }
+    
+    // aten::stft.center(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, str pad_mode="reflect", bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor
+    inline at::Tensor stft(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t n_fft, c10::optional<int64_t> hop_length=c10::nullopt, c10::optional<int64_t> win_length=c10::nullopt, const c10::optional<at::Tensor> & window={}, bool center=true, c10::string_view pad_mode="reflect", bool normalized=false, c10::optional<bool> onesided=c10::nullopt, c10::optional<bool> return_complex=c10::nullopt) {
+        return at::_ops::stft_center::redispatch(dispatchKeySet, self, n_fft, hop_length, win_length, window, center, pad_mode, normalized, onesided, return_complex);
+    }
+    
+    // aten::istft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, bool normalized=False, bool? onesided=None, int? length=None, bool return_complex=False) -> Tensor
+    inline at::Tensor istft(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t n_fft, c10::optional<int64_t> hop_length=c10::nullopt, c10::optional<int64_t> win_length=c10::nullopt, const c10::optional<at::Tensor> & window={}, bool center=true, bool normalized=false, c10::optional<bool> onesided=c10::nullopt, c10::optional<int64_t> length=c10::nullopt, bool return_complex=false) {
+        return at::_ops::istft::redispatch(dispatchKeySet, self, n_fft, hop_length, win_length, window, center, normalized, onesided, length, return_complex);
+    }
+    
+    // aten::stride.int(Tensor self, int dim) -> int
+    inline int64_t __dispatch_stride(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim) {
+        return at::_ops::stride_int::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::stride.Dimname(Tensor self, Dimname dim) -> int
+    inline int64_t stride(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim) {
+        return at::_ops::stride_Dimname::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::sym_stride.int(Tensor self, int dim) -> SymInt
+    inline c10::SymInt __dispatch_sym_stride(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim) {
+        return at::_ops::sym_stride_int::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::sum(Tensor self, *, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor sum(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::sum::redispatch(dispatchKeySet, self, dtype);
+    }
+    
+    // aten::sum.dim_IntList(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor sum(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::sum_dim_IntList::redispatch(dispatchKeySet, self, dim, keepdim, dtype);
+    }
+    
+    // aten::sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor sum(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::DimnameList dim, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::sum_dim_DimnameList::redispatch(dispatchKeySet, self, dim, keepdim, dtype);
+    }
+    
+    // aten::sum.IntList_out(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & sum_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::sum_IntList_out::redispatch(dispatchKeySet, self, dim, keepdim, dtype, out);
+    }
+    
+    // aten::sum.IntList_out(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & sum_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim, c10::optional<at::ScalarType> dtype, at::Tensor & out) {
+        return at::_ops::sum_IntList_out::redispatch(dispatchKeySet, self, dim, keepdim, dtype, out);
+    }
+    
+    // aten::sum.DimnameList_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & sum_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::DimnameList dim, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::sum_DimnameList_out::redispatch(dispatchKeySet, self, dim, keepdim, dtype, out);
+    }
+    
+    // aten::sum.DimnameList_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & sum_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::DimnameList dim, bool keepdim, c10::optional<at::ScalarType> dtype, at::Tensor & out) {
+        return at::_ops::sum_DimnameList_out::redispatch(dispatchKeySet, self, dim, keepdim, dtype, out);
+    }
+    
+    // aten::_nested_sum_backward(Tensor grad, Tensor self, int[1]? dim, bool keepdim=False) -> Tensor
+    inline at::Tensor _nested_sum_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim=false) {
+        return at::_ops::_nested_sum_backward::redispatch(dispatchKeySet, grad, self, dim, keepdim);
+    }
+    
+    // aten::nansum(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor nansum(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim=c10::nullopt, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::nansum::redispatch(dispatchKeySet, self, dim, keepdim, dtype);
+    }
+    
+    // aten::nansum.out(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & nansum_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef dim=c10::nullopt, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::nansum_out::redispatch(dispatchKeySet, self, dim, keepdim, dtype, out);
+    }
+    
+    // aten::nansum.out(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & nansum_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim, c10::optional<at::ScalarType> dtype, at::Tensor & out) {
+        return at::_ops::nansum_out::redispatch(dispatchKeySet, self, dim, keepdim, dtype, out);
+    }
+    
+    // aten::sum_to_size(Tensor self, SymInt[] size) -> Tensor
+    inline at::Tensor sum_to_size(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size) {
+        return at::_ops::sum_to_size::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size));
+    }
+    
+    // aten::sum_to_size(Tensor self, SymInt[] size) -> Tensor
+    inline at::Tensor sum_to_size_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size) {
+        return at::_ops::sum_to_size::redispatch(dispatchKeySet, self, size);
+    }
+    
+    // aten::sqrt(Tensor self) -> Tensor
+    inline at::Tensor sqrt(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::sqrt::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::sqrt_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & sqrt_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::sqrt_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::sqrt.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & sqrt_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::sqrt_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::sqrt.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & sqrt_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::sqrt_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::square(Tensor self) -> Tensor
+    inline at::Tensor square(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::square::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::square_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & square_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::square_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::square.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & square_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::square_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::square.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & square_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::square_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::std(Tensor self, bool unbiased=True) -> Tensor
+    inline at::Tensor std(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool unbiased) {
+        return at::_ops::std::redispatch(dispatchKeySet, self, unbiased);
+    }
+    
+    // aten::std.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> Tensor
+    inline at::Tensor std(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim, bool unbiased, bool keepdim=false) {
+        return at::_ops::std_dim::redispatch(dispatchKeySet, self, dim, unbiased, keepdim);
+    }
+    
+    // aten::std.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor
+    inline at::Tensor std(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim=c10::nullopt, const c10::optional<at::Scalar> & correction=c10::nullopt, bool keepdim=false) {
+        return at::_ops::std_correction::redispatch(dispatchKeySet, self, dim, correction, keepdim);
+    }
+    
+    // aten::std_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> std_mean(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool unbiased) {
+        return at::_ops::std_mean::redispatch(dispatchKeySet, self, unbiased);
+    }
+    
+    // aten::std_mean.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> std_mean(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim, bool unbiased, bool keepdim=false) {
+        return at::_ops::std_mean_dim::redispatch(dispatchKeySet, self, dim, unbiased, keepdim);
+    }
+    
+    // aten::std_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> std_mean(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim=c10::nullopt, const c10::optional<at::Scalar> & correction=c10::nullopt, bool keepdim=false) {
+        return at::_ops::std_mean_correction::redispatch(dispatchKeySet, self, dim, correction, keepdim);
+    }
+    
+    // aten::std_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> std_mean(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::DimnameList dim, bool unbiased, bool keepdim=false) {
+        return at::_ops::std_mean_names_dim::redispatch(dispatchKeySet, self, dim, unbiased, keepdim);
+    }
+    
+    // aten::std_mean.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> std_mean(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::DimnameList dim, const c10::optional<at::Scalar> & correction=c10::nullopt, bool keepdim=false) {
+        return at::_ops::std_mean_correction_names::redispatch(dispatchKeySet, self, dim, correction, keepdim);
+    }
+    
+    // aten::std.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & std_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef dim, bool unbiased, bool keepdim=false) {
+        return at::_ops::std_out::redispatch(dispatchKeySet, self, dim, unbiased, keepdim, out);
+    }
+    
+    // aten::std.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & std_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim, bool unbiased, bool keepdim, at::Tensor & out) {
+        return at::_ops::std_out::redispatch(dispatchKeySet, self, dim, unbiased, keepdim, out);
+    }
+    
+    // aten::std.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & std_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef dim=c10::nullopt, const c10::optional<at::Scalar> & correction=c10::nullopt, bool keepdim=false) {
+        return at::_ops::std_correction_out::redispatch(dispatchKeySet, self, dim, correction, keepdim, out);
+    }
+    
+    // aten::std.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & std_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim, const c10::optional<at::Scalar> & correction, bool keepdim, at::Tensor & out) {
+        return at::_ops::std_correction_out::redispatch(dispatchKeySet, self, dim, correction, keepdim, out);
+    }
+    
+    // aten::std.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor
+    inline at::Tensor std(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::DimnameList dim, bool unbiased, bool keepdim=false) {
+        return at::_ops::std_names_dim::redispatch(dispatchKeySet, self, dim, unbiased, keepdim);
+    }
+    
+    // aten::std.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & std_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::DimnameList dim, bool unbiased, bool keepdim=false) {
+        return at::_ops::std_names_out::redispatch(dispatchKeySet, self, dim, unbiased, keepdim, out);
+    }
+    
+    // aten::std.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & std_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::DimnameList dim, bool unbiased, bool keepdim, at::Tensor & out) {
+        return at::_ops::std_names_out::redispatch(dispatchKeySet, self, dim, unbiased, keepdim, out);
+    }
+    
+    // aten::std.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor
+    inline at::Tensor std(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::DimnameList dim, const c10::optional<at::Scalar> & correction=c10::nullopt, bool keepdim=false) {
+        return at::_ops::std_correction_names::redispatch(dispatchKeySet, self, dim, correction, keepdim);
+    }
+    
+    // aten::std.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & std_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::DimnameList dim, const c10::optional<at::Scalar> & correction=c10::nullopt, bool keepdim=false) {
+        return at::_ops::std_correction_names_out::redispatch(dispatchKeySet, self, dim, correction, keepdim, out);
+    }
+    
+    // aten::std.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & std_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::DimnameList dim, const c10::optional<at::Scalar> & correction, bool keepdim, at::Tensor & out) {
+        return at::_ops::std_correction_names_out::redispatch(dispatchKeySet, self, dim, correction, keepdim, out);
+    }
+    
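+    // Editorial note (illustrative only, not part of the generated code): the wrappers
+    // above come in pairs -- `std_out` takes the output tensor first and keeps trailing
+    // defaults, while `std_outf` takes every argument explicitly with `out` last.
+    // A minimal sketch, assuming this header is the at::redispatch layer as in upstream
+    // PyTorch and that `ks`, `self`, and `out` are placeholders held by the caller:
+    //
+    //   at::Tensor out = at::empty({0}, self.options());
+    //   at::redispatch::std_outf(ks, self, /*dim=*/c10::nullopt,
+    //                            /*correction=*/c10::nullopt, /*keepdim=*/false, out);
+    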
+    // aten::prod(Tensor self, *, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor prod(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::prod::redispatch(dispatchKeySet, self, dtype);
+    }
+    
+    // aten::prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor prod(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::prod_dim_int::redispatch(dispatchKeySet, self, dim, keepdim, dtype);
+    }
+    
+    // aten::prod.int_out(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & prod_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::prod_int_out::redispatch(dispatchKeySet, self, dim, keepdim, dtype, out);
+    }
+    
+    // aten::prod.int_out(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & prod_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool keepdim, c10::optional<at::ScalarType> dtype, at::Tensor & out) {
+        return at::_ops::prod_int_out::redispatch(dispatchKeySet, self, dim, keepdim, dtype, out);
+    }
+    
+    // aten::prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor prod(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::prod_dim_Dimname::redispatch(dispatchKeySet, self, dim, keepdim, dtype);
+    }
+    
+    // aten::prod.Dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & prod_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::Dimname dim, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::prod_Dimname_out::redispatch(dispatchKeySet, self, dim, keepdim, dtype, out);
+    }
+    
+    // aten::prod.Dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & prod_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool keepdim, c10::optional<at::ScalarType> dtype, at::Tensor & out) {
+        return at::_ops::prod_Dimname_out::redispatch(dispatchKeySet, self, dim, keepdim, dtype, out);
+    }
+    
+    // aten::t(Tensor(a) self) -> Tensor(a)
+    inline at::Tensor t(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::t::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::t_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & t_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::t_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::tan(Tensor self) -> Tensor
+    inline at::Tensor tan(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::tan::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::tan_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & tan_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::tan_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::tan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & tan_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::tan_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::tan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & tan_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::tan_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::tanh(Tensor self) -> Tensor
+    inline at::Tensor tanh(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::tanh::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::tanh_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & tanh_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::tanh_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::tanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & tanh_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::tanh_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::tanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & tanh_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::tanh_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::tensordot(Tensor self, Tensor other, int[] dims_self, int[] dims_other) -> Tensor
+    inline at::Tensor tensordot(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::IntArrayRef dims_self, at::IntArrayRef dims_other) {
+        return at::_ops::tensordot::redispatch(dispatchKeySet, self, other, dims_self, dims_other);
+    }
+    
+    // aten::tensordot.out(Tensor self, Tensor other, int[] dims_self, int[] dims_other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & tensordot_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other, at::IntArrayRef dims_self, at::IntArrayRef dims_other) {
+        return at::_ops::tensordot_out::redispatch(dispatchKeySet, self, other, dims_self, dims_other, out);
+    }
+    
+    // aten::tensordot.out(Tensor self, Tensor other, int[] dims_self, int[] dims_other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & tensordot_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::IntArrayRef dims_self, at::IntArrayRef dims_other, at::Tensor & out) {
+        return at::_ops::tensordot_out::redispatch(dispatchKeySet, self, other, dims_self, dims_other, out);
+    }
+    
+    // aten::threshold(Tensor self, Scalar threshold, Scalar value) -> Tensor
+    inline at::Tensor threshold(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & threshold, const at::Scalar & value) {
+        return at::_ops::threshold::redispatch(dispatchKeySet, self, threshold, value);
+    }
+    
+    // aten::threshold_(Tensor(a!) self, Scalar threshold, Scalar value) -> Tensor(a!)
+    inline at::Tensor & threshold_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & threshold, const at::Scalar & value) {
+        return at::_ops::threshold_::redispatch(dispatchKeySet, self, threshold, value);
+    }
+    
+    // aten::threshold.out(Tensor self, Scalar threshold, Scalar value, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & threshold_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & threshold, const at::Scalar & value) {
+        return at::_ops::threshold_out::redispatch(dispatchKeySet, self, threshold, value, out);
+    }
+    
+    // aten::threshold.out(Tensor self, Scalar threshold, Scalar value, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & threshold_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & threshold, const at::Scalar & value, at::Tensor & out) {
+        return at::_ops::threshold_out::redispatch(dispatchKeySet, self, threshold, value, out);
+    }
+    
+    // aten::threshold_backward.grad_input(Tensor grad_output, Tensor self, Scalar threshold, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & threshold_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & threshold) {
+        return at::_ops::threshold_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, threshold, grad_input);
+    }
+    
+    // aten::threshold_backward.grad_input(Tensor grad_output, Tensor self, Scalar threshold, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & threshold_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & threshold, at::Tensor & grad_input) {
+        return at::_ops::threshold_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, threshold, grad_input);
+    }
+    
+    // aten::threshold_backward(Tensor grad_output, Tensor self, Scalar threshold) -> Tensor
+    inline at::Tensor threshold_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & threshold) {
+        return at::_ops::threshold_backward::redispatch(dispatchKeySet, grad_output, self, threshold);
+    }
+    
+    // aten::tile(Tensor self, SymInt[] dims) -> Tensor
+    inline at::Tensor tile(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dims) {
+        return at::_ops::tile::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(dims));
+    }
+    
+    // aten::tile(Tensor self, SymInt[] dims) -> Tensor
+    inline at::Tensor tile_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef dims) {
+        return at::_ops::tile::redispatch(dispatchKeySet, self, dims);
+    }
+    
+    // aten::transpose.int(Tensor(a) self, int dim0, int dim1) -> Tensor(a)
+    inline at::Tensor transpose(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim0, int64_t dim1) {
+        return at::_ops::transpose_int::redispatch(dispatchKeySet, self, dim0, dim1);
+    }
+    
+    // aten::transpose.Dimname(Tensor(a) self, Dimname dim0, Dimname dim1) -> Tensor(a)
+    inline at::Tensor transpose(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim0, at::Dimname dim1) {
+        return at::_ops::transpose_Dimname::redispatch(dispatchKeySet, self, dim0, dim1);
+    }
+    
+    // aten::_mkldnn_transpose(Tensor self, int dim0, int dim1) -> Tensor
+    inline at::Tensor _mkldnn_transpose(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim0, int64_t dim1) {
+        return at::_ops::_mkldnn_transpose::redispatch(dispatchKeySet, self, dim0, dim1);
+    }
+    
+    // aten::transpose_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!)
+    inline at::Tensor & transpose_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t dim0, int64_t dim1) {
+        return at::_ops::transpose_::redispatch(dispatchKeySet, self, dim0, dim1);
+    }
+    
+    // aten::_mkldnn_transpose_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!)
+    inline at::Tensor & _mkldnn_transpose_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t dim0, int64_t dim1) {
+        return at::_ops::_mkldnn_transpose_::redispatch(dispatchKeySet, self, dim0, dim1);
+    }
+    
+    // aten::one_hot(Tensor self, int num_classes=-1) -> Tensor
+    inline at::Tensor one_hot(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t num_classes=-1) {
+        return at::_ops::one_hot::redispatch(dispatchKeySet, self, num_classes);
+    }
+    
+    // aten::flip(Tensor self, int[] dims) -> Tensor
+    inline at::Tensor flip(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dims) {
+        return at::_ops::flip::redispatch(dispatchKeySet, self, dims);
+    }
+    
+    // aten::fliplr(Tensor self) -> Tensor
+    inline at::Tensor fliplr(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::fliplr::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::flipud(Tensor self) -> Tensor
+    inline at::Tensor flipud(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::flipud::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::roll(Tensor self, SymInt[1] shifts, int[1] dims=[]) -> Tensor
+    inline at::Tensor roll(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef shifts, at::IntArrayRef dims={}) {
+        return at::_ops::roll::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(shifts), dims);
+    }
+    
+    // aten::roll(Tensor self, SymInt[1] shifts, int[1] dims=[]) -> Tensor
+    inline at::Tensor roll_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef shifts, at::IntArrayRef dims={}) {
+        return at::_ops::roll::redispatch(dispatchKeySet, self, shifts, dims);
+    }
+    
+    // aten::rot90(Tensor self, int k=1, int[] dims=[0,1]) -> Tensor
+    inline at::Tensor rot90(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t k=1, at::IntArrayRef dims={0,1}) {
+        return at::_ops::rot90::redispatch(dispatchKeySet, self, k, dims);
+    }
+    
+    // aten::trapezoid.x(Tensor y, Tensor x, *, int dim=-1) -> Tensor
+    inline at::Tensor trapezoid(c10::DispatchKeySet dispatchKeySet, const at::Tensor & y, const at::Tensor & x, int64_t dim=-1) {
+        return at::_ops::trapezoid_x::redispatch(dispatchKeySet, y, x, dim);
+    }
+    
+    // aten::trapezoid.dx(Tensor y, *, Scalar dx=1, int dim=-1) -> Tensor
+    inline at::Tensor trapezoid(c10::DispatchKeySet dispatchKeySet, const at::Tensor & y, const at::Scalar & dx=1, int64_t dim=-1) {
+        return at::_ops::trapezoid_dx::redispatch(dispatchKeySet, y, dx, dim);
+    }
+    
+    // aten::trapz.x(Tensor y, Tensor x, *, int dim=-1) -> Tensor
+    inline at::Tensor trapz(c10::DispatchKeySet dispatchKeySet, const at::Tensor & y, const at::Tensor & x, int64_t dim=-1) {
+        return at::_ops::trapz_x::redispatch(dispatchKeySet, y, x, dim);
+    }
+    
+    // aten::trapz.dx(Tensor y, *, float dx=1, int dim=-1) -> Tensor
+    inline at::Tensor trapz(c10::DispatchKeySet dispatchKeySet, const at::Tensor & y, double dx=1, int64_t dim=-1) {
+        return at::_ops::trapz_dx::redispatch(dispatchKeySet, y, dx, dim);
+    }
+    
+    // aten::_transform_bias_rescale_qkv(Tensor qkv, Tensor qkv_bias, int num_heads) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _transform_bias_rescale_qkv(c10::DispatchKeySet dispatchKeySet, const at::Tensor & qkv, const at::Tensor & qkv_bias, int64_t num_heads) {
+        return at::_ops::_transform_bias_rescale_qkv::redispatch(dispatchKeySet, qkv, qkv_bias, num_heads);
+    }
+    
+    // aten::_nested_tensor_from_mask(Tensor t, Tensor mask, bool mask_check=True) -> Tensor
+    inline at::Tensor _nested_tensor_from_mask(c10::DispatchKeySet dispatchKeySet, const at::Tensor & t, const at::Tensor & mask, bool mask_check=true) {
+        return at::_ops::_nested_tensor_from_mask::redispatch(dispatchKeySet, t, mask, mask_check);
+    }
+    
+    // aten::_nested_tensor_from_mask_left_aligned(Tensor t, Tensor mask) -> bool
+    inline bool _nested_tensor_from_mask_left_aligned(c10::DispatchKeySet dispatchKeySet, const at::Tensor & t, const at::Tensor & mask) {
+        return at::_ops::_nested_tensor_from_mask_left_aligned::redispatch(dispatchKeySet, t, mask);
+    }
+    
+    // aten::_nested_from_padded(Tensor padded, Tensor cpu_nested_shape_example, bool fuse_transform_0213=False) -> Tensor
+    inline at::Tensor _nested_from_padded(c10::DispatchKeySet dispatchKeySet, const at::Tensor & padded, const at::Tensor & cpu_nested_shape_example, bool fuse_transform_0213=false) {
+        return at::_ops::_nested_from_padded::redispatch(dispatchKeySet, padded, cpu_nested_shape_example, fuse_transform_0213);
+    }
+    
+    // aten::_nested_tensor_size(Tensor self) -> Tensor
+    inline at::Tensor _nested_tensor_size(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_nested_tensor_size::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_nested_tensor_strides(Tensor self) -> Tensor
+    inline at::Tensor _nested_tensor_strides(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_nested_tensor_strides::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_nested_tensor_storage_offsets(Tensor self) -> Tensor
+    inline at::Tensor _nested_tensor_storage_offsets(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_nested_tensor_storage_offsets::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_nested_from_padded_and_nested_example(Tensor padded, Tensor nt_example) -> Tensor
+    inline at::Tensor _nested_from_padded_and_nested_example(c10::DispatchKeySet dispatchKeySet, const at::Tensor & padded, const at::Tensor & nt_example) {
+        return at::_ops::_nested_from_padded_and_nested_example::redispatch(dispatchKeySet, padded, nt_example);
+    }
+    
+    // aten::_nested_view_from_buffer(Tensor(a) self, Tensor nested_size, Tensor nested_strides, Tensor offsets) -> Tensor(a)
+    inline at::Tensor _nested_view_from_buffer(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & nested_size, const at::Tensor & nested_strides, const at::Tensor & offsets) {
+        return at::_ops::_nested_view_from_buffer::redispatch(dispatchKeySet, self, nested_size, nested_strides, offsets);
+    }
+    
+    // aten::_nested_view_from_buffer_copy(Tensor self, Tensor nested_size, Tensor nested_strides, Tensor offsets) -> Tensor
+    inline at::Tensor _nested_view_from_buffer_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & nested_size, const at::Tensor & nested_strides, const at::Tensor & offsets) {
+        return at::_ops::_nested_view_from_buffer_copy::redispatch(dispatchKeySet, self, nested_size, nested_strides, offsets);
+    }
+    
+    // aten::_nested_view_from_jagged(Tensor(a) self, Tensor offsets, Tensor dummy, Tensor? lengths=None, int ragged_idx=1) -> Tensor(a)
+    inline at::Tensor _nested_view_from_jagged(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & offsets, const at::Tensor & dummy, const c10::optional<at::Tensor> & lengths={}, int64_t ragged_idx=1) {
+        return at::_ops::_nested_view_from_jagged::redispatch(dispatchKeySet, self, offsets, dummy, lengths, ragged_idx);
+    }
+    
+    // aten::_nested_view_from_jagged_copy(Tensor self, Tensor offsets, Tensor dummy, Tensor? lengths=None, int ragged_idx=1) -> Tensor
+    inline at::Tensor _nested_view_from_jagged_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & offsets, const at::Tensor & dummy, const c10::optional<at::Tensor> & lengths={}, int64_t ragged_idx=1) {
+        return at::_ops::_nested_view_from_jagged_copy::redispatch(dispatchKeySet, self, offsets, dummy, lengths, ragged_idx);
+    }
+    
+    // aten::_nested_get_values(Tensor(a) self) -> Tensor(a)
+    inline at::Tensor _nested_get_values(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_nested_get_values::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_nested_get_values_copy(Tensor self) -> Tensor
+    inline at::Tensor _nested_get_values_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_nested_get_values_copy::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_nested_get_offsets(Tensor self) -> Tensor
+    inline at::Tensor _nested_get_offsets(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_nested_get_offsets::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_nested_get_lengths(Tensor self) -> Tensor
+    inline at::Tensor _nested_get_lengths(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_nested_get_lengths::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_nested_get_ragged_idx(Tensor self) -> int
+    inline int64_t _nested_get_ragged_idx(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_nested_get_ragged_idx::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_nested_get_jagged_dummy(Tensor any) -> Tensor
+    inline at::Tensor _nested_get_jagged_dummy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & any) {
+        return at::_ops::_nested_get_jagged_dummy::redispatch(dispatchKeySet, any);
+    }
+    
+    // aten::_trilinear(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1) -> Tensor
+    inline at::Tensor _trilinear(c10::DispatchKeySet dispatchKeySet, const at::Tensor & i1, const at::Tensor & i2, const at::Tensor & i3, at::IntArrayRef expand1, at::IntArrayRef expand2, at::IntArrayRef expand3, at::IntArrayRef sumdim, int64_t unroll_dim=1) {
+        return at::_ops::_trilinear::redispatch(dispatchKeySet, i1, i2, i3, expand1, expand2, expand3, sumdim, unroll_dim);
+    }
+    
+    // aten::triplet_margin_loss(Tensor anchor, Tensor positive, Tensor negative, float margin=1.0, float p=2, float eps=1e-06, bool swap=False, int reduction=Mean) -> Tensor
+    inline at::Tensor triplet_margin_loss(c10::DispatchKeySet dispatchKeySet, const at::Tensor & anchor, const at::Tensor & positive, const at::Tensor & negative, double margin=1.0, double p=2, double eps=1e-06, bool swap=false, int64_t reduction=at::Reduction::Mean) {
+        return at::_ops::triplet_margin_loss::redispatch(dispatchKeySet, anchor, positive, negative, margin, p, eps, swap, reduction);
+    }
+    
+    // aten::trunc(Tensor self) -> Tensor
+    inline at::Tensor trunc(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::trunc::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::trunc_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & trunc_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::trunc_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::trunc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & trunc_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::trunc_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::trunc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & trunc_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::trunc_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::fix(Tensor self) -> Tensor
+    inline at::Tensor fix(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::fix::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::fix_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & fix_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::fix_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::fix.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fix_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::fix_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::fix.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fix_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::fix_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::type_as(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor type_as(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::type_as::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::_has_compatible_shallow_copy_type(Tensor self, Tensor from) -> bool
+    inline bool _has_compatible_shallow_copy_type(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & from) {
+        return at::_ops::_has_compatible_shallow_copy_type::redispatch(dispatchKeySet, self, from);
+    }
+    
+    // aten::_unique(Tensor self, bool sorted=True, bool return_inverse=False) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> _unique(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool sorted=true, bool return_inverse=false) {
+        return at::_ops::_unique::redispatch(dispatchKeySet, self, sorted, return_inverse);
+    }
+    
+    // aten::unique_dim(Tensor self, int dim, bool sorted=True, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> unique_dim(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool sorted=true, bool return_inverse=false, bool return_counts=false) {
+        return at::_ops::unique_dim::redispatch(dispatchKeySet, self, dim, sorted, return_inverse, return_counts);
+    }
+    
+    // aten::unique_consecutive(Tensor self, bool return_inverse=False, bool return_counts=False, int? dim=None) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> unique_consecutive(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool return_inverse=false, bool return_counts=false, c10::optional<int64_t> dim=c10::nullopt) {
+        return at::_ops::unique_consecutive::redispatch(dispatchKeySet, self, return_inverse, return_counts, dim);
+    }
+    
+    // aten::unique_dim_consecutive(Tensor self, int dim, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> unique_dim_consecutive(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool return_inverse=false, bool return_counts=false) {
+        return at::_ops::unique_dim_consecutive::redispatch(dispatchKeySet, self, dim, return_inverse, return_counts);
+    }
+    
+    // aten::_unique2(Tensor self, bool sorted=True, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _unique2(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool sorted=true, bool return_inverse=false, bool return_counts=false) {
+        return at::_ops::_unique2::redispatch(dispatchKeySet, self, sorted, return_inverse, return_counts);
+    }
+    
+    // aten::_unsafe_view(Tensor self, SymInt[] size) -> Tensor
+    inline at::Tensor _unsafe_view(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size) {
+        return at::_ops::_unsafe_view::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size));
+    }
+    
+    // aten::_unsafe_view(Tensor self, SymInt[] size) -> Tensor
+    inline at::Tensor _unsafe_view_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size) {
+        return at::_ops::_unsafe_view::redispatch(dispatchKeySet, self, size);
+    }
+    
+    // aten::unsqueeze(Tensor(a) self, int dim) -> Tensor(a)
+    inline at::Tensor unsqueeze(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim) {
+        return at::_ops::unsqueeze::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::unsqueeze_(Tensor(a!) self, int dim) -> Tensor(a!)
+    inline at::Tensor & unsqueeze_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t dim) {
+        return at::_ops::unsqueeze_::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::vander(Tensor x, int? N=None, bool increasing=False) -> Tensor
+    inline at::Tensor vander(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, c10::optional<int64_t> N=c10::nullopt, bool increasing=false) {
+        return at::_ops::vander::redispatch(dispatchKeySet, x, N, increasing);
+    }
+    
+    // aten::var(Tensor self, bool unbiased=True) -> Tensor
+    inline at::Tensor var(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool unbiased) {
+        return at::_ops::var::redispatch(dispatchKeySet, self, unbiased);
+    }
+    
+    // aten::var.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> Tensor
+    inline at::Tensor var(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim, bool unbiased, bool keepdim=false) {
+        return at::_ops::var_dim::redispatch(dispatchKeySet, self, dim, unbiased, keepdim);
+    }
+    
+    // aten::var.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor
+    inline at::Tensor var(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim=c10::nullopt, const c10::optional<at::Scalar> & correction=c10::nullopt, bool keepdim=false) {
+        return at::_ops::var_correction::redispatch(dispatchKeySet, self, dim, correction, keepdim);
+    }
+    
+    // aten::var.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & var_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef dim, bool unbiased, bool keepdim=false) {
+        return at::_ops::var_out::redispatch(dispatchKeySet, self, dim, unbiased, keepdim, out);
+    }
+    
+    // aten::var.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & var_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim, bool unbiased, bool keepdim, at::Tensor & out) {
+        return at::_ops::var_out::redispatch(dispatchKeySet, self, dim, unbiased, keepdim, out);
+    }
+    
+    // aten::var.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & var_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef dim=c10::nullopt, const c10::optional<at::Scalar> & correction=c10::nullopt, bool keepdim=false) {
+        return at::_ops::var_correction_out::redispatch(dispatchKeySet, self, dim, correction, keepdim, out);
+    }
+    
+    // aten::var.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & var_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim, const c10::optional<at::Scalar> & correction, bool keepdim, at::Tensor & out) {
+        return at::_ops::var_correction_out::redispatch(dispatchKeySet, self, dim, correction, keepdim, out);
+    }
+    
+    // aten::var.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor
+    inline at::Tensor var(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::DimnameList dim, bool unbiased, bool keepdim=false) {
+        return at::_ops::var_names_dim::redispatch(dispatchKeySet, self, dim, unbiased, keepdim);
+    }
+    
+    // aten::var.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & var_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::DimnameList dim, bool unbiased, bool keepdim=false) {
+        return at::_ops::var_names_out::redispatch(dispatchKeySet, self, dim, unbiased, keepdim, out);
+    }
+    
+    // aten::var.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & var_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::DimnameList dim, bool unbiased, bool keepdim, at::Tensor & out) {
+        return at::_ops::var_names_out::redispatch(dispatchKeySet, self, dim, unbiased, keepdim, out);
+    }
+    
+    // aten::var.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor
+    inline at::Tensor var(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::DimnameList dim, const c10::optional<at::Scalar> & correction=c10::nullopt, bool keepdim=false) {
+        return at::_ops::var_correction_names::redispatch(dispatchKeySet, self, dim, correction, keepdim);
+    }
+    
+    // aten::var.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & var_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::DimnameList dim, const c10::optional<at::Scalar> & correction=c10::nullopt, bool keepdim=false) {
+        return at::_ops::var_correction_names_out::redispatch(dispatchKeySet, self, dim, correction, keepdim, out);
+    }
+    
+    // aten::var.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & var_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::DimnameList dim, const c10::optional<at::Scalar> & correction, bool keepdim, at::Tensor & out) {
+        return at::_ops::var_correction_names_out::redispatch(dispatchKeySet, self, dim, correction, keepdim, out);
+    }
+    
+    // aten::var_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> var_mean(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool unbiased) {
+        return at::_ops::var_mean::redispatch(dispatchKeySet, self, unbiased);
+    }
+    
+    // aten::var_mean.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> var_mean(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim, bool unbiased, bool keepdim=false) {
+        return at::_ops::var_mean_dim::redispatch(dispatchKeySet, self, dim, unbiased, keepdim);
+    }
+    
+    // aten::var_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> var_mean(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim=c10::nullopt, const c10::optional<at::Scalar> & correction=c10::nullopt, bool keepdim=false) {
+        return at::_ops::var_mean_correction::redispatch(dispatchKeySet, self, dim, correction, keepdim);
+    }
+    
+    // aten::var_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> var_mean(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::DimnameList dim, bool unbiased, bool keepdim=false) {
+        return at::_ops::var_mean_names_dim::redispatch(dispatchKeySet, self, dim, unbiased, keepdim);
+    }
+    
+    // aten::var_mean.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> var_mean(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::DimnameList dim, const c10::optional<at::Scalar> & correction=c10::nullopt, bool keepdim=false) {
+        return at::_ops::var_mean_correction_names::redispatch(dispatchKeySet, self, dim, correction, keepdim);
+    }
+    
+    // aten::view_as(Tensor(a) self, Tensor other) -> Tensor(a)
+    inline at::Tensor view_as(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::view_as::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::where.self(Tensor condition, Tensor self, Tensor other) -> Tensor
+    inline at::Tensor where(c10::DispatchKeySet dispatchKeySet, const at::Tensor & condition, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::where_self::redispatch(dispatchKeySet, condition, self, other);
+    }
+    
+    // aten::where.self_out(Tensor condition, Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & where_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & condition, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::where_self_out::redispatch(dispatchKeySet, condition, self, other, out);
+    }
+    
+    // aten::where.self_out(Tensor condition, Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & where_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & condition, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::where_self_out::redispatch(dispatchKeySet, condition, self, other, out);
+    }
+    
+    // aten::where.ScalarSelf(Tensor condition, Scalar self, Tensor other) -> Tensor
+    inline at::Tensor where(c10::DispatchKeySet dispatchKeySet, const at::Tensor & condition, const at::Scalar & self, const at::Tensor & other) {
+        return at::_ops::where_ScalarSelf::redispatch(dispatchKeySet, condition, self, other);
+    }
+    
+    // aten::where.ScalarOther(Tensor condition, Tensor self, Scalar other) -> Tensor
+    inline at::Tensor where(c10::DispatchKeySet dispatchKeySet, const at::Tensor & condition, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::where_ScalarOther::redispatch(dispatchKeySet, condition, self, other);
+    }
+    
+    // aten::where.Scalar(Tensor condition, Scalar self, Scalar other) -> Tensor
+    inline at::Tensor where(c10::DispatchKeySet dispatchKeySet, const at::Tensor & condition, const at::Scalar & self, const at::Scalar & other) {
+        return at::_ops::where_Scalar::redispatch(dispatchKeySet, condition, self, other);
+    }
+    
+    // aten::where(Tensor condition) -> Tensor[]
+    inline ::std::vector<at::Tensor> where(c10::DispatchKeySet dispatchKeySet, const at::Tensor & condition) {
+        return at::_ops::where::redispatch(dispatchKeySet, condition);
+    }
+    
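+    // Editorial note (illustrative only, not part of the generated code): `where` is
+    // overloaded on Tensor/Scalar operands, plus a single-argument form that returns the
+    // nonzero coordinates as ::std::vector<at::Tensor>, one index tensor per dimension.
+    // A hedged sketch, assuming the usual at::redispatch namespace and placeholder
+    // variables `ks`, `cond`, `a`, `b` supplied by the caller:
+    //
+    //   auto picked  = at::redispatch::where(ks, cond, a, b);   // elementwise select
+    //   auto indices = at::redispatch::where(ks, cond);         // coordinate tensors
+    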
+    // aten::norm_except_dim(Tensor v, int pow=2, int dim=0) -> Tensor
+    inline at::Tensor norm_except_dim(c10::DispatchKeySet dispatchKeySet, const at::Tensor & v, int64_t pow=2, int64_t dim=0) {
+        return at::_ops::norm_except_dim::redispatch(dispatchKeySet, v, pow, dim);
+    }
+    
+    // aten::_weight_norm(Tensor v, Tensor g, int dim=0) -> Tensor
+    inline at::Tensor _weight_norm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & v, const at::Tensor & g, int64_t dim=0) {
+        return at::_ops::_weight_norm::redispatch(dispatchKeySet, v, g, dim);
+    }
+    
+    // aten::_weight_norm_interface(Tensor v, Tensor g, int dim=0) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> _weight_norm_interface(c10::DispatchKeySet dispatchKeySet, const at::Tensor & v, const at::Tensor & g, int64_t dim=0) {
+        return at::_ops::_weight_norm_interface::redispatch(dispatchKeySet, v, g, dim);
+    }
+    
+    // aten::_weight_norm_interface_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> _weight_norm_interface_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_w, const at::Tensor & saved_v, const at::Tensor & saved_g, const at::Tensor & saved_norms, int64_t dim) {
+        return at::_ops::_weight_norm_interface_backward::redispatch(dispatchKeySet, grad_w, saved_v, saved_g, saved_norms, dim);
+    }
+    
+    // aten::_weight_norm_differentiable_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> _weight_norm_differentiable_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_w, const at::Tensor & saved_v, const at::Tensor & saved_g, const at::Tensor & saved_norms, int64_t dim) {
+        return at::_ops::_weight_norm_differentiable_backward::redispatch(dispatchKeySet, grad_w, saved_v, saved_g, saved_norms, dim);
+    }
+    
+    // aten::zeros.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor zeros(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::DimnameList> names, at::TensorOptions options={}) {
+        return at::_ops::zeros_names::redispatch(dispatchKeySet, size, names, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::zeros.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor zeros(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::DimnameList> names, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::zeros_names::redispatch(dispatchKeySet, size, names, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::_efficientzerotensor(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor _efficientzerotensor(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, at::TensorOptions options={}) {
+        return at::_ops::_efficientzerotensor::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::_efficientzerotensor(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor _efficientzerotensor(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::_efficientzerotensor::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), dtype, layout, device, pin_memory);
+    }
+    
+    // aten::_efficientzerotensor(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor _efficientzerotensor_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, at::TensorOptions options={}) {
+        return at::_ops::_efficientzerotensor::redispatch(dispatchKeySet, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::_efficientzerotensor(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor _efficientzerotensor_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::_efficientzerotensor::redispatch(dispatchKeySet, size, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::zeros(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor zeros(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, at::TensorOptions options={}) {
+        return at::_ops::zeros::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::zeros(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor zeros(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::zeros::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), dtype, layout, device, pin_memory);
+    }
+    
+    // aten::zeros(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor zeros_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, at::TensorOptions options={}) {
+        return at::_ops::zeros::redispatch(dispatchKeySet, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::zeros(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor zeros_symint(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::zeros::redispatch(dispatchKeySet, size, dtype, layout, device, pin_memory);
+    }
+    
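+    // Editorial note (illustrative only, not part of the generated code): each factory
+    // above is emitted twice, once taking a packed at::TensorOptions and once taking the
+    // unpacked optional dtype/layout/device/pin_memory fields, with *_symint variants
+    // for symbolic shapes. A minimal sketch of equivalent calls, assuming the
+    // at::redispatch namespace from upstream PyTorch and a placeholder DispatchKeySet `ks`:
+    //
+    //   auto a = at::redispatch::zeros(ks, {2, 3}, at::TensorOptions().dtype(at::kFloat));
+    //   auto b = at::redispatch::zeros(ks, {2, 3}, at::kFloat, c10::nullopt,
+    //                                  c10::nullopt, c10::nullopt);
+    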
+    // aten::zeros.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & zeros_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::IntArrayRef size) {
+        return at::_ops::zeros_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), out);
+    }
+    
+    // aten::zeros.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & zeros_outf(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, at::Tensor & out) {
+        return at::_ops::zeros_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), out);
+    }
+    
+    // aten::zeros.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & zeros_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, c10::SymIntArrayRef size) {
+        return at::_ops::zeros_out::redispatch(dispatchKeySet, size, out);
+    }
+    
+    // aten::zeros.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & zeros_symint_outf(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, at::Tensor & out) {
+        return at::_ops::zeros_out::redispatch(dispatchKeySet, size, out);
+    }
+    
+    // aten::zeros_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor zeros_like(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::TensorOptions options={}, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::zeros_like::redispatch(dispatchKeySet, self, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
+    }
+    
+    // aten::zeros_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor zeros_like(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
+        return at::_ops::zeros_like::redispatch(dispatchKeySet, self, dtype, layout, device, pin_memory, memory_format);
+    }
+    
+    // aten::_standard_gamma_grad(Tensor self, Tensor output) -> Tensor
+    inline at::Tensor _standard_gamma_grad(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & output) {
+        return at::_ops::_standard_gamma_grad::redispatch(dispatchKeySet, self, output);
+    }
+    
+    // aten::_standard_gamma(Tensor self, Generator? generator=None) -> Tensor
+    inline at::Tensor _standard_gamma(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::_standard_gamma::redispatch(dispatchKeySet, self, generator);
+    }
+    
+    // aten::_dirichlet_grad(Tensor x, Tensor alpha, Tensor total) -> Tensor
+    inline at::Tensor _dirichlet_grad(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & alpha, const at::Tensor & total) {
+        return at::_ops::_dirichlet_grad::redispatch(dispatchKeySet, x, alpha, total);
+    }
+    
+    // aten::_sample_dirichlet(Tensor self, Generator? generator=None) -> Tensor
+    inline at::Tensor _sample_dirichlet(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::_sample_dirichlet::redispatch(dispatchKeySet, self, generator);
+    }
+    
+    // aten::poisson(Tensor self, Generator? generator=None) -> Tensor
+    inline at::Tensor poisson(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::poisson::redispatch(dispatchKeySet, self, generator);
+    }
+    
+    // aten::binomial(Tensor count, Tensor prob, Generator? generator=None) -> Tensor
+    inline at::Tensor binomial(c10::DispatchKeySet dispatchKeySet, const at::Tensor & count, const at::Tensor & prob, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::binomial::redispatch(dispatchKeySet, count, prob, generator);
+    }
+    
+    // aten::native_norm(Tensor self, Scalar p=2) -> Tensor
+    inline at::Tensor native_norm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & p=2) {
+        return at::_ops::native_norm::redispatch(dispatchKeySet, self, p);
+    }
+    
+    // aten::native_norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, ScalarType? dtype) -> Tensor
+    inline at::Tensor native_norm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Scalar> & p, at::IntArrayRef dim, bool keepdim, c10::optional<at::ScalarType> dtype) {
+        return at::_ops::native_norm_ScalarOpt_dim_dtype::redispatch(dispatchKeySet, self, p, dim, keepdim, dtype);
+    }
+    
+    // aten::_sparse_sum(Tensor self) -> Tensor
+    inline at::Tensor _sparse_sum(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_sparse_sum::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_sparse_sum.dtype(Tensor self, *, ScalarType dtype) -> Tensor
+    inline at::Tensor _sparse_sum(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::ScalarType dtype) {
+        return at::_ops::_sparse_sum_dtype::redispatch(dispatchKeySet, self, dtype);
+    }
+    
+    // aten::_sparse_sum.dim(Tensor self, int[1] dim) -> Tensor
+    inline at::Tensor _sparse_sum(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim) {
+        return at::_ops::_sparse_sum_dim::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::_sparse_sum.dim_dtype(Tensor self, int[1] dim, *, ScalarType dtype) -> Tensor
+    inline at::Tensor _sparse_sum(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, at::ScalarType dtype) {
+        return at::_ops::_sparse_sum_dim_dtype::redispatch(dispatchKeySet, self, dim, dtype);
+    }
+    
+    // aten::_sparse_sum_backward(Tensor grad, Tensor self, int[] dim) -> Tensor
+    inline at::Tensor _sparse_sum_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & self, at::IntArrayRef dim) {
+        return at::_ops::_sparse_sum_backward::redispatch(dispatchKeySet, grad, self, dim);
+    }
+    
+    // aten::_sparse_csr_sum.dim_dtype(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor _sparse_csr_sum(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::_sparse_csr_sum_dim_dtype::redispatch(dispatchKeySet, self, dim, keepdim, dtype);
+    }
+    
+    // aten::_sparse_csr_prod.dim_dtype(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor _sparse_csr_prod(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::_sparse_csr_prod_dim_dtype::redispatch(dispatchKeySet, self, dim, keepdim, dtype);
+    }
+    
+    // aten::_sparse_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor _sparse_softmax(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::_sparse_softmax_int::redispatch(dispatchKeySet, self, dim, dtype);
+    }
+    
+    // aten::_sparse_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor _sparse_softmax(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::_sparse_softmax_Dimname::redispatch(dispatchKeySet, self, dim, dtype);
+    }
+    
+    // aten::_sparse_softmax(Tensor self, int dim, bool half_to_float) -> Tensor
+    inline at::Tensor _sparse_softmax(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool half_to_float) {
+        return at::_ops::_sparse_softmax::redispatch(dispatchKeySet, self, dim, half_to_float);
+    }
+    
+    // aten::_sparse_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor
+    inline at::Tensor _sparse_softmax_backward_data(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, const at::Tensor & self) {
+        return at::_ops::_sparse_softmax_backward_data::redispatch(dispatchKeySet, grad_output, output, dim, self);
+    }
+    
+    // aten::_sparse_log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor _sparse_log_softmax(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::_sparse_log_softmax_int::redispatch(dispatchKeySet, self, dim, dtype);
+    }
+    
+    // aten::_sparse_log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor _sparse_log_softmax(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::_sparse_log_softmax_Dimname::redispatch(dispatchKeySet, self, dim, dtype);
+    }
+    
+    // aten::_sparse_log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor
+    inline at::Tensor _sparse_log_softmax(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool half_to_float) {
+        return at::_ops::_sparse_log_softmax::redispatch(dispatchKeySet, self, dim, half_to_float);
+    }
+    
+    // aten::_sparse_log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor
+    inline at::Tensor _sparse_log_softmax_backward_data(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, const at::Tensor & self) {
+        return at::_ops::_sparse_log_softmax_backward_data::redispatch(dispatchKeySet, grad_output, output, dim, self);
+    }
+    
+    // aten::_spdiags(Tensor diagonals, Tensor offsets, int[] shape, Layout? layout=None) -> Tensor
+    inline at::Tensor _spdiags(c10::DispatchKeySet dispatchKeySet, const at::Tensor & diagonals, const at::Tensor & offsets, at::IntArrayRef shape, c10::optional<at::Layout> layout=c10::nullopt) {
+        return at::_ops::_spdiags::redispatch(dispatchKeySet, diagonals, offsets, shape, layout);
+    }
+    
+    // aten::norm.ScalarOpt_dtype(Tensor self, Scalar? p, *, ScalarType dtype) -> Tensor
+    inline at::Tensor norm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Scalar> & p, at::ScalarType dtype) {
+        return at::_ops::norm_ScalarOpt_dtype::redispatch(dispatchKeySet, self, p, dtype);
+    }
+    
+    // aten::norm.Scalar(Tensor self, Scalar p=2) -> Tensor
+    inline at::Tensor norm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & p=2) {
+        return at::_ops::norm_Scalar::redispatch(dispatchKeySet, self, p);
+    }
+    
+    // aten::norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor
+    inline at::Tensor norm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Scalar> & p, at::IntArrayRef dim, bool keepdim, at::ScalarType dtype) {
+        return at::_ops::norm_ScalarOpt_dim_dtype::redispatch(dispatchKeySet, self, p, dim, keepdim, dtype);
+    }
+    
+    // aten::norm.ScalarOpt_dim(Tensor self, Scalar? p, int[1] dim, bool keepdim=False) -> Tensor
+    inline at::Tensor norm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Scalar> & p, at::IntArrayRef dim, bool keepdim=false) {
+        return at::_ops::norm_ScalarOpt_dim::redispatch(dispatchKeySet, self, p, dim, keepdim);
+    }
+    
+    // aten::norm.dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & norm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const c10::optional<at::Scalar> & p, at::IntArrayRef dim, bool keepdim, at::ScalarType dtype) {
+        return at::_ops::norm_dtype_out::redispatch(dispatchKeySet, self, p, dim, keepdim, dtype, out);
+    }
+    
+    // aten::norm.dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & norm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Scalar> & p, at::IntArrayRef dim, bool keepdim, at::ScalarType dtype, at::Tensor & out) {
+        return at::_ops::norm_dtype_out::redispatch(dispatchKeySet, self, p, dim, keepdim, dtype, out);
+    }
+    
+    // aten::norm.out(Tensor self, Scalar? p, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & norm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const c10::optional<at::Scalar> & p, at::IntArrayRef dim, bool keepdim=false) {
+        return at::_ops::norm_out::redispatch(dispatchKeySet, self, p, dim, keepdim, out);
+    }
+    
+    // aten::norm.out(Tensor self, Scalar? p, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & norm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Scalar> & p, at::IntArrayRef dim, bool keepdim, at::Tensor & out) {
+        return at::_ops::norm_out::redispatch(dispatchKeySet, self, p, dim, keepdim, out);
+    }
+    
+    // aten::norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor
+    inline at::Tensor norm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Scalar> & p, at::DimnameList dim, bool keepdim, at::ScalarType dtype) {
+        return at::_ops::norm_names_ScalarOpt_dim_dtype::redispatch(dispatchKeySet, self, p, dim, keepdim, dtype);
+    }
+    
+    // aten::norm.names_ScalarOpt_dim(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False) -> Tensor
+    inline at::Tensor norm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Scalar> & p, at::DimnameList dim, bool keepdim=false) {
+        return at::_ops::norm_names_ScalarOpt_dim::redispatch(dispatchKeySet, self, p, dim, keepdim);
+    }
+    
+    // aten::norm.names_dtype_out(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & norm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const c10::optional<at::Scalar> & p, at::DimnameList dim, bool keepdim, at::ScalarType dtype) {
+        return at::_ops::norm_names_dtype_out::redispatch(dispatchKeySet, self, p, dim, keepdim, dtype, out);
+    }
+    
+    // aten::norm.names_dtype_out(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & norm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Scalar> & p, at::DimnameList dim, bool keepdim, at::ScalarType dtype, at::Tensor & out) {
+        return at::_ops::norm_names_dtype_out::redispatch(dispatchKeySet, self, p, dim, keepdim, dtype, out);
+    }
+    
+    // aten::norm.names_out(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & norm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const c10::optional<at::Scalar> & p, at::DimnameList dim, bool keepdim=false) {
+        return at::_ops::norm_names_out::redispatch(dispatchKeySet, self, p, dim, keepdim, out);
+    }
+    
+    // aten::norm.names_out(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & norm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Scalar> & p, at::DimnameList dim, bool keepdim, at::Tensor & out) {
+        return at::_ops::norm_names_out::redispatch(dispatchKeySet, self, p, dim, keepdim, out);
+    }
+    
+    // aten::frexp.Tensor(Tensor self) -> (Tensor mantissa, Tensor exponent)
+    inline ::std::tuple<at::Tensor,at::Tensor> frexp(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::frexp_Tensor::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::frexp.Tensor_out(Tensor self, *, Tensor(a!) mantissa, Tensor(b!) exponent) -> (Tensor(a!) mantissa, Tensor(b!) exponent)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> frexp_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & mantissa, at::Tensor & exponent, const at::Tensor & self) {
+        return at::_ops::frexp_Tensor_out::redispatch(dispatchKeySet, self, mantissa, exponent);
+    }
+    
+    // aten::frexp.Tensor_out(Tensor self, *, Tensor(a!) mantissa, Tensor(b!) exponent) -> (Tensor(a!) mantissa, Tensor(b!) exponent)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> frexp_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & mantissa, at::Tensor & exponent) {
+        return at::_ops::frexp_Tensor_out::redispatch(dispatchKeySet, self, mantissa, exponent);
+    }
+    
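+    // Illustrative note (not part of the generated API): the wrappers in this header
+    // forward to at::_ops::<op>::redispatch with an explicit DispatchKeySet, which is
+    // how a kernel re-enters the dispatcher below its own key. A hedged usage sketch,
+    // assuming these functions live in the at::redispatch namespace (as in upstream
+    // ATen) and that `ks` is the DispatchKeySet handed to the calling kernel:
+    //
+    //   auto [mantissa, exponent] = at::redispatch::frexp(ks, x);
+    //   // elementwise: x == mantissa * exp2(exponent)
+    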
+    // aten::frobenius_norm.dim(Tensor self, int[1] dim, bool keepdim=False) -> Tensor
+    inline at::Tensor frobenius_norm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, bool keepdim=false) {
+        return at::_ops::frobenius_norm_dim::redispatch(dispatchKeySet, self, dim, keepdim);
+    }
+    
+    // aten::frobenius_norm.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & frobenius_norm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef dim, bool keepdim=false) {
+        return at::_ops::frobenius_norm_out::redispatch(dispatchKeySet, self, dim, keepdim, out);
+    }
+    
+    // aten::frobenius_norm.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & frobenius_norm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, bool keepdim, at::Tensor & out) {
+        return at::_ops::frobenius_norm_out::redispatch(dispatchKeySet, self, dim, keepdim, out);
+    }
+    
+    // aten::nuclear_norm(Tensor self, bool keepdim=False) -> Tensor
+    inline at::Tensor nuclear_norm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool keepdim=false) {
+        return at::_ops::nuclear_norm::redispatch(dispatchKeySet, self, keepdim);
+    }
+    
+    // aten::nuclear_norm.out(Tensor self, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & nuclear_norm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, bool keepdim=false) {
+        return at::_ops::nuclear_norm_out::redispatch(dispatchKeySet, self, keepdim, out);
+    }
+    
+    // aten::nuclear_norm.out(Tensor self, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & nuclear_norm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool keepdim, at::Tensor & out) {
+        return at::_ops::nuclear_norm_out::redispatch(dispatchKeySet, self, keepdim, out);
+    }
+    
+    // aten::nuclear_norm.dim(Tensor self, int[2] dim, bool keepdim=False) -> Tensor
+    inline at::Tensor nuclear_norm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, bool keepdim=false) {
+        return at::_ops::nuclear_norm_dim::redispatch(dispatchKeySet, self, dim, keepdim);
+    }
+    
+    // aten::nuclear_norm.dim_out(Tensor self, int[2] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & nuclear_norm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef dim, bool keepdim=false) {
+        return at::_ops::nuclear_norm_dim_out::redispatch(dispatchKeySet, self, dim, keepdim, out);
+    }
+    
+    // aten::nuclear_norm.dim_out(Tensor self, int[2] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & nuclear_norm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, bool keepdim, at::Tensor & out) {
+        return at::_ops::nuclear_norm_dim_out::redispatch(dispatchKeySet, self, dim, keepdim, out);
+    }
+    
+    // aten::clone(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor clone(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::clone::redispatch(dispatchKeySet, self, memory_format);
+    }
+    
+    // aten::positive(Tensor(a) self) -> Tensor(a)
+    inline at::Tensor positive(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::positive::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::resize_as_(Tensor(a!) self, Tensor the_template, *, MemoryFormat? memory_format=None) -> Tensor(a!)
+    inline const at::Tensor & resize_as_(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & the_template, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::resize_as_::redispatch(dispatchKeySet, self, the_template, memory_format);
+    }
+    
+    // aten::resize_as_sparse_(Tensor(a!) self, Tensor the_template) -> Tensor(a!)
+    inline const at::Tensor & resize_as_sparse_(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & the_template) {
+        return at::_ops::resize_as_sparse_::redispatch(dispatchKeySet, self, the_template);
+    }
+    
+    // aten::zero_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & zero_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::zero_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::sub.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & sub_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1) {
+        return at::_ops::sub_out::redispatch(dispatchKeySet, self, other, alpha, out);
+    }
+    
+    // aten::sub.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & sub_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha, at::Tensor & out) {
+        return at::_ops::sub_out::redispatch(dispatchKeySet, self, other, alpha, out);
+    }
+    
+    // aten::sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
+    inline at::Tensor sub(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1) {
+        return at::_ops::sub_Tensor::redispatch(dispatchKeySet, self, other, alpha);
+    }
+    
+    // aten::sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
+    inline at::Tensor & sub_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1) {
+        return at::_ops::sub__Tensor::redispatch(dispatchKeySet, self, other, alpha);
+    }
+    
+    // aten::sub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
+    inline at::Tensor sub(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha=1) {
+        return at::_ops::sub_Scalar::redispatch(dispatchKeySet, self, other, alpha);
+    }
+    
+    // aten::sub_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)
+    inline at::Tensor & sub_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha=1) {
+        return at::_ops::sub__Scalar::redispatch(dispatchKeySet, self, other, alpha);
+    }
+    
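+    // Illustrative note (not part of the generated API): sub computes
+    // self - alpha * other, and subtract below is an alias of sub. A hedged sketch,
+    // assuming the at::redispatch namespace, a DispatchKeySet `ks` and tensors `a`, `b`:
+    //
+    //   at::Tensor d = at::redispatch::sub(ks, a, b, /*alpha=*/2);  // d = a - 2*b
+    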
+    // aten::subtract.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & subtract_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1) {
+        return at::_ops::subtract_out::redispatch(dispatchKeySet, self, other, alpha, out);
+    }
+    
+    // aten::subtract.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & subtract_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha, at::Tensor & out) {
+        return at::_ops::subtract_out::redispatch(dispatchKeySet, self, other, alpha, out);
+    }
+    
+    // aten::subtract.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
+    inline at::Tensor subtract(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1) {
+        return at::_ops::subtract_Tensor::redispatch(dispatchKeySet, self, other, alpha);
+    }
+    
+    // aten::subtract_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
+    inline at::Tensor & subtract_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1) {
+        return at::_ops::subtract__Tensor::redispatch(dispatchKeySet, self, other, alpha);
+    }
+    
+    // aten::subtract.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
+    inline at::Tensor subtract(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha=1) {
+        return at::_ops::subtract_Scalar::redispatch(dispatchKeySet, self, other, alpha);
+    }
+    
+    // aten::subtract_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)
+    inline at::Tensor & subtract_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha=1) {
+        return at::_ops::subtract__Scalar::redispatch(dispatchKeySet, self, other, alpha);
+    }
+    
+    // aten::rsub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
+    inline at::Tensor rsub(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1) {
+        return at::_ops::rsub_Tensor::redispatch(dispatchKeySet, self, other, alpha);
+    }
+    
+    // aten::heaviside.out(Tensor self, Tensor values, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & heaviside_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & values) {
+        return at::_ops::heaviside_out::redispatch(dispatchKeySet, self, values, out);
+    }
+    
+    // aten::heaviside.out(Tensor self, Tensor values, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & heaviside_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & values, at::Tensor & out) {
+        return at::_ops::heaviside_out::redispatch(dispatchKeySet, self, values, out);
+    }
+    
+    // aten::heaviside(Tensor self, Tensor values) -> Tensor
+    inline at::Tensor heaviside(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & values) {
+        return at::_ops::heaviside::redispatch(dispatchKeySet, self, values);
+    }
+    
+    // aten::heaviside_(Tensor(a!) self, Tensor values) -> Tensor(a!)
+    inline at::Tensor & heaviside_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & values) {
+        return at::_ops::heaviside_::redispatch(dispatchKeySet, self, values);
+    }
+    
+    // aten::rsub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
+    inline at::Tensor rsub(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha=1) {
+        return at::_ops::rsub_Scalar::redispatch(dispatchKeySet, self, other, alpha);
+    }
+    
+    // aten::_sparse_addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
+    inline at::Tensor _sparse_addmm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta=1, const at::Scalar & alpha=1) {
+        return at::_ops::_sparse_addmm::redispatch(dispatchKeySet, self, mat1, mat2, beta, alpha);
+    }
+    
+    // aten::sparse_sampled_addmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & sparse_sampled_addmm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta=1, const at::Scalar & alpha=1) {
+        return at::_ops::sparse_sampled_addmm_out::redispatch(dispatchKeySet, self, mat1, mat2, beta, alpha, out);
+    }
+    
+    // aten::sparse_sampled_addmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & sparse_sampled_addmm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out) {
+        return at::_ops::sparse_sampled_addmm_out::redispatch(dispatchKeySet, self, mat1, mat2, beta, alpha, out);
+    }
+    
+    // aten::sparse_sampled_addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
+    inline at::Tensor sparse_sampled_addmm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta=1, const at::Scalar & alpha=1) {
+        return at::_ops::sparse_sampled_addmm::redispatch(dispatchKeySet, self, mat1, mat2, beta, alpha);
+    }
+    
+    // aten::_sparse_mm_reduce_impl(Tensor self, Tensor other, str reduce) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> _sparse_mm_reduce_impl(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, c10::string_view reduce) {
+        return at::_ops::_sparse_mm_reduce_impl::redispatch(dispatchKeySet, self, other, reduce);
+    }
+    
+    // aten::_sparse_mm_reduce_impl_backward(Tensor self, Tensor grad_out, Tensor weight, str reduce, Tensor arg_out, bool[2] output_mask) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> _sparse_mm_reduce_impl_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & grad_out, const at::Tensor & weight, c10::string_view reduce, const at::Tensor & arg_out, ::std::array<bool,2> output_mask) {
+        return at::_ops::_sparse_mm_reduce_impl_backward::redispatch(dispatchKeySet, self, grad_out, weight, reduce, arg_out, output_mask);
+    }
+    
+    // aten::addmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & addmm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta=1, const at::Scalar & alpha=1) {
+        return at::_ops::addmm_out::redispatch(dispatchKeySet, self, mat1, mat2, beta, alpha, out);
+    }
+    
+    // aten::addmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & addmm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out) {
+        return at::_ops::addmm_out::redispatch(dispatchKeySet, self, mat1, mat2, beta, alpha, out);
+    }
+    
+    // aten::addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
+    inline at::Tensor addmm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta=1, const at::Scalar & alpha=1) {
+        return at::_ops::addmm::redispatch(dispatchKeySet, self, mat1, mat2, beta, alpha);
+    }
+    
+    // aten::addmm_(Tensor(a!) self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
+    inline at::Tensor & addmm_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta=1, const at::Scalar & alpha=1) {
+        return at::_ops::addmm_::redispatch(dispatchKeySet, self, mat1, mat2, beta, alpha);
+    }
+    
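+    // Illustrative note (not part of the generated API): addmm computes
+    // beta * self + alpha * (mat1 @ mat2); the _out/_outf pair differ only in where
+    // the destination tensor appears in the argument list. A hedged sketch, assuming
+    // the at::redispatch namespace, a DispatchKeySet `ks`, a bias matrix `c` and
+    // factors `m1`, `m2`:
+    //
+    //   at::Tensor y = at::redispatch::addmm(ks, c, m1, m2, /*beta=*/1, /*alpha=*/1);
+    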
+    // aten::_addmm_activation.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, bool use_gelu=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _addmm_activation_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta=1, const at::Scalar & alpha=1, bool use_gelu=false) {
+        return at::_ops::_addmm_activation_out::redispatch(dispatchKeySet, self, mat1, mat2, beta, alpha, use_gelu, out);
+    }
+    
+    // aten::_addmm_activation.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, bool use_gelu=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _addmm_activation_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, bool use_gelu, at::Tensor & out) {
+        return at::_ops::_addmm_activation_out::redispatch(dispatchKeySet, self, mat1, mat2, beta, alpha, use_gelu, out);
+    }
+    
+    // aten::_addmm_activation(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, bool use_gelu=False) -> Tensor
+    inline at::Tensor _addmm_activation(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta=1, const at::Scalar & alpha=1, bool use_gelu=false) {
+        return at::_ops::_addmm_activation::redispatch(dispatchKeySet, self, mat1, mat2, beta, alpha, use_gelu);
+    }
+    
+    // aten::_scaled_mm(Tensor self, Tensor mat2, *, Tensor? bias=None, ScalarType? out_dtype=None, Tensor? scale_a=None, Tensor? scale_b=None, Tensor? scale_result=None, bool use_fast_accum=False) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> _scaled_mm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat2, const c10::optional<at::Tensor> & bias={}, c10::optional<at::ScalarType> out_dtype=c10::nullopt, const c10::optional<at::Tensor> & scale_a={}, const c10::optional<at::Tensor> & scale_b={}, const c10::optional<at::Tensor> & scale_result={}, bool use_fast_accum=false) {
+        return at::_ops::_scaled_mm::redispatch(dispatchKeySet, self, mat2, bias, out_dtype, scale_a, scale_b, scale_result, use_fast_accum);
+    }
+    
+    // aten::_scaled_mm.out(Tensor self, Tensor mat2, *, Tensor? bias=None, ScalarType? out_dtype=None, Tensor? scale_a=None, Tensor? scale_b=None, Tensor? scale_result=None, bool use_fast_accum=False, Tensor(a!) out, Tensor(b!) out_amax) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> _scaled_mm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::Tensor & out_amax, const at::Tensor & self, const at::Tensor & mat2, const c10::optional<at::Tensor> & bias={}, c10::optional<at::ScalarType> out_dtype=c10::nullopt, const c10::optional<at::Tensor> & scale_a={}, const c10::optional<at::Tensor> & scale_b={}, const c10::optional<at::Tensor> & scale_result={}, bool use_fast_accum=false) {
+        return at::_ops::_scaled_mm_out::redispatch(dispatchKeySet, self, mat2, bias, out_dtype, scale_a, scale_b, scale_result, use_fast_accum, out, out_amax);
+    }
+    
+    // aten::_scaled_mm.out(Tensor self, Tensor mat2, *, Tensor? bias=None, ScalarType? out_dtype=None, Tensor? scale_a=None, Tensor? scale_b=None, Tensor? scale_result=None, bool use_fast_accum=False, Tensor(a!) out, Tensor(b!) out_amax) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> _scaled_mm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat2, const c10::optional<at::Tensor> & bias, c10::optional<at::ScalarType> out_dtype, const c10::optional<at::Tensor> & scale_a, const c10::optional<at::Tensor> & scale_b, const c10::optional<at::Tensor> & scale_result, bool use_fast_accum, at::Tensor & out, at::Tensor & out_amax) {
+        return at::_ops::_scaled_mm_out::redispatch(dispatchKeySet, self, mat2, bias, out_dtype, scale_a, scale_b, scale_result, use_fast_accum, out, out_amax);
+    }
+    
+    // aten::sparse_compressed_tensor.comp_plain_value_size(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+    inline at::Tensor sparse_compressed_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, at::IntArrayRef size, at::TensorOptions options) {
+        return at::_ops::sparse_compressed_tensor_comp_plain_value_size::redispatch(dispatchKeySet, compressed_indices, plain_indices, values, c10::fromIntArrayRefSlow(size), c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::sparse_compressed_tensor.comp_plain_value_size(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+    inline at::Tensor sparse_compressed_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::sparse_compressed_tensor_comp_plain_value_size::redispatch(dispatchKeySet, compressed_indices, plain_indices, values, c10::fromIntArrayRefSlow(size), dtype, layout, device, pin_memory);
+    }
+    
+    // aten::sparse_compressed_tensor.comp_plain_value_size(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+    inline at::Tensor sparse_compressed_tensor_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, c10::SymIntArrayRef size, at::TensorOptions options) {
+        return at::_ops::sparse_compressed_tensor_comp_plain_value_size::redispatch(dispatchKeySet, compressed_indices, plain_indices, values, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::sparse_compressed_tensor.comp_plain_value_size(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+    inline at::Tensor sparse_compressed_tensor_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::sparse_compressed_tensor_comp_plain_value_size::redispatch(dispatchKeySet, compressed_indices, plain_indices, values, size, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::sparse_csr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+    inline at::Tensor sparse_csr_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::IntArrayRef size, at::TensorOptions options) {
+        return at::_ops::sparse_csr_tensor_crow_col_value_size::redispatch(dispatchKeySet, crow_indices, col_indices, values, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::sparse_csr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+    inline at::Tensor sparse_csr_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::sparse_csr_tensor_crow_col_value_size::redispatch(dispatchKeySet, crow_indices, col_indices, values, size, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::sparse_csc_tensor.ccol_row_value_size(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+    inline at::Tensor sparse_csc_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & ccol_indices, const at::Tensor & row_indices, const at::Tensor & values, at::IntArrayRef size, at::TensorOptions options) {
+        return at::_ops::sparse_csc_tensor_ccol_row_value_size::redispatch(dispatchKeySet, ccol_indices, row_indices, values, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::sparse_csc_tensor.ccol_row_value_size(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+    inline at::Tensor sparse_csc_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & ccol_indices, const at::Tensor & row_indices, const at::Tensor & values, at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::sparse_csc_tensor_ccol_row_value_size::redispatch(dispatchKeySet, ccol_indices, row_indices, values, size, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::sparse_bsr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+    inline at::Tensor sparse_bsr_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::IntArrayRef size, at::TensorOptions options) {
+        return at::_ops::sparse_bsr_tensor_crow_col_value_size::redispatch(dispatchKeySet, crow_indices, col_indices, values, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::sparse_bsr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+    inline at::Tensor sparse_bsr_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::sparse_bsr_tensor_crow_col_value_size::redispatch(dispatchKeySet, crow_indices, col_indices, values, size, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::sparse_bsc_tensor.ccol_row_value_size(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+    inline at::Tensor sparse_bsc_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & ccol_indices, const at::Tensor & row_indices, const at::Tensor & values, at::IntArrayRef size, at::TensorOptions options) {
+        return at::_ops::sparse_bsc_tensor_ccol_row_value_size::redispatch(dispatchKeySet, ccol_indices, row_indices, values, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::sparse_bsc_tensor.ccol_row_value_size(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+    inline at::Tensor sparse_bsc_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & ccol_indices, const at::Tensor & row_indices, const at::Tensor & values, at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::sparse_bsc_tensor_ccol_row_value_size::redispatch(dispatchKeySet, ccol_indices, row_indices, values, size, dtype, layout, device, pin_memory);
+    }
+    
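+    // Illustrative note (not part of the generated API): each compressed-sparse
+    // factory above is emitted twice, once taking a packed at::TensorOptions and once
+    // taking the unpacked dtype/layout/device/pin_memory optionals. A hedged sketch of
+    // the packed form, assuming the at::redispatch namespace, a DispatchKeySet `ks`,
+    // and CSR component tensors `crow`, `col`, `vals` describing a 2x3 matrix:
+    //
+    //   at::Tensor csr = at::redispatch::sparse_csr_tensor(
+    //       ks, crow, col, vals, {2, 3}, at::TensorOptions().dtype(at::kFloat));
+    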
+    // aten::sparse_compressed_tensor.comp_plain_value(Tensor compressed_indices, Tensor plain_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+    inline at::Tensor sparse_compressed_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, at::TensorOptions options) {
+        return at::_ops::sparse_compressed_tensor_comp_plain_value::redispatch(dispatchKeySet, compressed_indices, plain_indices, values, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::sparse_compressed_tensor.comp_plain_value(Tensor compressed_indices, Tensor plain_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+    inline at::Tensor sparse_compressed_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::sparse_compressed_tensor_comp_plain_value::redispatch(dispatchKeySet, compressed_indices, plain_indices, values, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::sparse_csr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+    inline at::Tensor sparse_csr_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::TensorOptions options) {
+        return at::_ops::sparse_csr_tensor_crow_col_value::redispatch(dispatchKeySet, crow_indices, col_indices, values, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::sparse_csr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+    inline at::Tensor sparse_csr_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::sparse_csr_tensor_crow_col_value::redispatch(dispatchKeySet, crow_indices, col_indices, values, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::sparse_csc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+    inline at::Tensor sparse_csc_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & ccol_indices, const at::Tensor & row_indices, const at::Tensor & values, at::TensorOptions options) {
+        return at::_ops::sparse_csc_tensor_ccol_row_value::redispatch(dispatchKeySet, ccol_indices, row_indices, values, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::sparse_csc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+    inline at::Tensor sparse_csc_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & ccol_indices, const at::Tensor & row_indices, const at::Tensor & values, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::sparse_csc_tensor_ccol_row_value::redispatch(dispatchKeySet, ccol_indices, row_indices, values, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::sparse_bsr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+    inline at::Tensor sparse_bsr_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::TensorOptions options) {
+        return at::_ops::sparse_bsr_tensor_crow_col_value::redispatch(dispatchKeySet, crow_indices, col_indices, values, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::sparse_bsr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+    inline at::Tensor sparse_bsr_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::sparse_bsr_tensor_crow_col_value::redispatch(dispatchKeySet, crow_indices, col_indices, values, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::sparse_bsc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+    inline at::Tensor sparse_bsc_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & ccol_indices, const at::Tensor & row_indices, const at::Tensor & values, at::TensorOptions options) {
+        return at::_ops::sparse_bsc_tensor_ccol_row_value::redispatch(dispatchKeySet, ccol_indices, row_indices, values, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::sparse_bsc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+    inline at::Tensor sparse_bsc_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & ccol_indices, const at::Tensor & row_indices, const at::Tensor & values, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::sparse_bsc_tensor_ccol_row_value::redispatch(dispatchKeySet, ccol_indices, row_indices, values, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::_sparse_compressed_tensor_unsafe(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor _sparse_compressed_tensor_unsafe(c10::DispatchKeySet dispatchKeySet, const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, at::IntArrayRef size, at::TensorOptions options={}) {
+        return at::_ops::_sparse_compressed_tensor_unsafe::redispatch(dispatchKeySet, compressed_indices, plain_indices, values, c10::fromIntArrayRefSlow(size), c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::_sparse_compressed_tensor_unsafe(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor _sparse_compressed_tensor_unsafe(c10::DispatchKeySet dispatchKeySet, const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::_sparse_compressed_tensor_unsafe::redispatch(dispatchKeySet, compressed_indices, plain_indices, values, c10::fromIntArrayRefSlow(size), dtype, layout, device, pin_memory);
+    }
+    
+    // aten::_sparse_compressed_tensor_unsafe(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor _sparse_compressed_tensor_unsafe_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, c10::SymIntArrayRef size, at::TensorOptions options={}) {
+        return at::_ops::_sparse_compressed_tensor_unsafe::redispatch(dispatchKeySet, compressed_indices, plain_indices, values, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::_sparse_compressed_tensor_unsafe(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor _sparse_compressed_tensor_unsafe_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::_sparse_compressed_tensor_unsafe::redispatch(dispatchKeySet, compressed_indices, plain_indices, values, size, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::_sparse_csr_tensor_unsafe(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor _sparse_csr_tensor_unsafe(c10::DispatchKeySet dispatchKeySet, const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::IntArrayRef size, at::TensorOptions options={}) {
+        return at::_ops::_sparse_csr_tensor_unsafe::redispatch(dispatchKeySet, crow_indices, col_indices, values, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::_sparse_csr_tensor_unsafe(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor _sparse_csr_tensor_unsafe(c10::DispatchKeySet dispatchKeySet, const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::_sparse_csr_tensor_unsafe::redispatch(dispatchKeySet, crow_indices, col_indices, values, size, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::_sparse_csc_tensor_unsafe(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor _sparse_csc_tensor_unsafe(c10::DispatchKeySet dispatchKeySet, const at::Tensor & ccol_indices, const at::Tensor & row_indices, const at::Tensor & values, at::IntArrayRef size, at::TensorOptions options={}) {
+        return at::_ops::_sparse_csc_tensor_unsafe::redispatch(dispatchKeySet, ccol_indices, row_indices, values, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::_sparse_csc_tensor_unsafe(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor _sparse_csc_tensor_unsafe(c10::DispatchKeySet dispatchKeySet, const at::Tensor & ccol_indices, const at::Tensor & row_indices, const at::Tensor & values, at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::_sparse_csc_tensor_unsafe::redispatch(dispatchKeySet, ccol_indices, row_indices, values, size, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::_sparse_bsr_tensor_unsafe(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor _sparse_bsr_tensor_unsafe(c10::DispatchKeySet dispatchKeySet, const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::IntArrayRef size, at::TensorOptions options={}) {
+        return at::_ops::_sparse_bsr_tensor_unsafe::redispatch(dispatchKeySet, crow_indices, col_indices, values, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::_sparse_bsr_tensor_unsafe(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor _sparse_bsr_tensor_unsafe(c10::DispatchKeySet dispatchKeySet, const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::_sparse_bsr_tensor_unsafe::redispatch(dispatchKeySet, crow_indices, col_indices, values, size, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::_sparse_bsc_tensor_unsafe(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor _sparse_bsc_tensor_unsafe(c10::DispatchKeySet dispatchKeySet, const at::Tensor & ccol_indices, const at::Tensor & row_indices, const at::Tensor & values, at::IntArrayRef size, at::TensorOptions options={}) {
+        return at::_ops::_sparse_bsc_tensor_unsafe::redispatch(dispatchKeySet, ccol_indices, row_indices, values, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::_sparse_bsc_tensor_unsafe(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor _sparse_bsc_tensor_unsafe(c10::DispatchKeySet dispatchKeySet, const at::Tensor & ccol_indices, const at::Tensor & row_indices, const at::Tensor & values, at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::_sparse_bsc_tensor_unsafe::redispatch(dispatchKeySet, ccol_indices, row_indices, values, size, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::sparse_coo_tensor.size(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+    inline at::Tensor sparse_coo_tensor(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, at::TensorOptions options) {
+        return at::_ops::sparse_coo_tensor_size::redispatch(dispatchKeySet, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::sparse_coo_tensor.size(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+    inline at::Tensor sparse_coo_tensor(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::sparse_coo_tensor_size::redispatch(dispatchKeySet, size, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::sparse_coo_tensor.indices(Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor
+    inline at::Tensor sparse_coo_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & indices, const at::Tensor & values, at::TensorOptions options={}, c10::optional<bool> is_coalesced=c10::nullopt) {
+        return at::_ops::sparse_coo_tensor_indices::redispatch(dispatchKeySet, indices, values, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), is_coalesced);
+    }
+    
+    // aten::sparse_coo_tensor.indices(Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor
+    inline at::Tensor sparse_coo_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & indices, const at::Tensor & values, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<bool> is_coalesced) {
+        return at::_ops::sparse_coo_tensor_indices::redispatch(dispatchKeySet, indices, values, dtype, layout, device, pin_memory, is_coalesced);
+    }
+    
+    // aten::sparse_coo_tensor.indices_size(Tensor indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor
+    inline at::Tensor sparse_coo_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & indices, const at::Tensor & values, at::IntArrayRef size, at::TensorOptions options={}, c10::optional<bool> is_coalesced=c10::nullopt) {
+        return at::_ops::sparse_coo_tensor_indices_size::redispatch(dispatchKeySet, indices, values, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), is_coalesced);
+    }
+    
+    // aten::sparse_coo_tensor.indices_size(Tensor indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor
+    inline at::Tensor sparse_coo_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & indices, const at::Tensor & values, at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<bool> is_coalesced) {
+        return at::_ops::sparse_coo_tensor_indices_size::redispatch(dispatchKeySet, indices, values, size, dtype, layout, device, pin_memory, is_coalesced);
+    }
+    
+    // aten::_sparse_coo_tensor_unsafe(Tensor indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor
+    inline at::Tensor _sparse_coo_tensor_unsafe(c10::DispatchKeySet dispatchKeySet, const at::Tensor & indices, const at::Tensor & values, at::IntArrayRef size, at::TensorOptions options={}, c10::optional<bool> is_coalesced=c10::nullopt) {
+        return at::_ops::_sparse_coo_tensor_unsafe::redispatch(dispatchKeySet, indices, values, c10::fromIntArrayRefSlow(size), c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), is_coalesced);
+    }
+    
+    // aten::_sparse_coo_tensor_unsafe(Tensor indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor
+    inline at::Tensor _sparse_coo_tensor_unsafe(c10::DispatchKeySet dispatchKeySet, const at::Tensor & indices, const at::Tensor & values, at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<bool> is_coalesced) {
+        return at::_ops::_sparse_coo_tensor_unsafe::redispatch(dispatchKeySet, indices, values, c10::fromIntArrayRefSlow(size), dtype, layout, device, pin_memory, is_coalesced);
+    }
+    
+    // aten::_sparse_coo_tensor_unsafe(Tensor indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor
+    inline at::Tensor _sparse_coo_tensor_unsafe_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & indices, const at::Tensor & values, c10::SymIntArrayRef size, at::TensorOptions options={}, c10::optional<bool> is_coalesced=c10::nullopt) {
+        return at::_ops::_sparse_coo_tensor_unsafe::redispatch(dispatchKeySet, indices, values, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), is_coalesced);
+    }
+    
+    // aten::_sparse_coo_tensor_unsafe(Tensor indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor
+    inline at::Tensor _sparse_coo_tensor_unsafe_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & indices, const at::Tensor & values, c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<bool> is_coalesced) {
+        return at::_ops::_sparse_coo_tensor_unsafe::redispatch(dispatchKeySet, indices, values, size, dtype, layout, device, pin_memory, is_coalesced);
+    }
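+    // [editorial note, not part of the generated file] Every operator in this redispatch header appears
+    // in the same shapes: a convenience overload that unpacks at::TensorOptions into the explicit
+    // dtype/layout/device/pin_memory optionals, an overload taking those optionals directly, and a
+    // *_symint variant that accepts c10::SymIntArrayRef sizes for symbolic shapes. Illustrative call only
+    // (assumes a valid c10::DispatchKeySet `ks` and suitable `indices`/`values` tensors):
+    //   at::Tensor t = _sparse_coo_tensor_unsafe(ks, indices, values, {3, 3}, at::TensorOptions().dtype(at::kFloat));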
+    
+    // aten::_validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size, bool? is_coalesced=None) -> ()
+    inline void _validate_sparse_coo_tensor_args(c10::DispatchKeySet dispatchKeySet, const at::Tensor & indices, const at::Tensor & values, at::IntArrayRef size, c10::optional<bool> is_coalesced=c10::nullopt) {
+        return at::_ops::_validate_sparse_coo_tensor_args::redispatch(dispatchKeySet, indices, values, size, is_coalesced);
+    }
+    
+    // aten::_validate_sparse_compressed_tensor_args(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, Layout layout) -> ()
+    inline void _validate_sparse_compressed_tensor_args(c10::DispatchKeySet dispatchKeySet, const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, at::IntArrayRef size, at::Layout layout) {
+        return at::_ops::_validate_sparse_compressed_tensor_args::redispatch(dispatchKeySet, compressed_indices, plain_indices, values, size, layout);
+    }
+    
+    // aten::_validate_sparse_csr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size) -> ()
+    inline void _validate_sparse_csr_tensor_args(c10::DispatchKeySet dispatchKeySet, const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::IntArrayRef size) {
+        return at::_ops::_validate_sparse_csr_tensor_args::redispatch(dispatchKeySet, crow_indices, col_indices, values, size);
+    }
+    
+    // aten::_validate_sparse_csc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size) -> ()
+    inline void _validate_sparse_csc_tensor_args(c10::DispatchKeySet dispatchKeySet, const at::Tensor & ccol_indices, const at::Tensor & row_indices, const at::Tensor & values, at::IntArrayRef size) {
+        return at::_ops::_validate_sparse_csc_tensor_args::redispatch(dispatchKeySet, ccol_indices, row_indices, values, size);
+    }
+    
+    // aten::_validate_sparse_bsr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size) -> ()
+    inline void _validate_sparse_bsr_tensor_args(c10::DispatchKeySet dispatchKeySet, const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::IntArrayRef size) {
+        return at::_ops::_validate_sparse_bsr_tensor_args::redispatch(dispatchKeySet, crow_indices, col_indices, values, size);
+    }
+    
+    // aten::_validate_sparse_bsc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size) -> ()
+    inline void _validate_sparse_bsc_tensor_args(c10::DispatchKeySet dispatchKeySet, const at::Tensor & ccol_indices, const at::Tensor & row_indices, const at::Tensor & values, at::IntArrayRef size) {
+        return at::_ops::_validate_sparse_bsc_tensor_args::redispatch(dispatchKeySet, ccol_indices, row_indices, values, size);
+    }
+    
+    // aten::_sparse_coo_tensor_with_dims(int sparse_dim, int dense_dim, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+    inline at::Tensor _sparse_coo_tensor_with_dims(c10::DispatchKeySet dispatchKeySet, int64_t sparse_dim, int64_t dense_dim, at::IntArrayRef size, at::TensorOptions options) {
+        return at::_ops::_sparse_coo_tensor_with_dims::redispatch(dispatchKeySet, sparse_dim, dense_dim, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::_sparse_coo_tensor_with_dims(int sparse_dim, int dense_dim, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+    inline at::Tensor _sparse_coo_tensor_with_dims(c10::DispatchKeySet dispatchKeySet, int64_t sparse_dim, int64_t dense_dim, at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::_sparse_coo_tensor_with_dims::redispatch(dispatchKeySet, sparse_dim, dense_dim, size, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::_sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False, bool? is_coalesced=None) -> Tensor
+    inline at::Tensor _sparse_coo_tensor_with_dims_and_tensors(c10::DispatchKeySet dispatchKeySet, int64_t sparse_dim, int64_t dense_dim, at::IntArrayRef size, const at::Tensor & indices, const at::Tensor & values, at::TensorOptions options, c10::optional<bool> is_coalesced=c10::nullopt) {
+        return at::_ops::_sparse_coo_tensor_with_dims_and_tensors::redispatch(dispatchKeySet, sparse_dim, dense_dim, c10::fromIntArrayRefSlow(size), indices, values, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), is_coalesced);
+    }
+    
+    // aten::_sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False, bool? is_coalesced=None) -> Tensor
+    inline at::Tensor _sparse_coo_tensor_with_dims_and_tensors(c10::DispatchKeySet dispatchKeySet, int64_t sparse_dim, int64_t dense_dim, at::IntArrayRef size, const at::Tensor & indices, const at::Tensor & values, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<bool> is_coalesced) {
+        return at::_ops::_sparse_coo_tensor_with_dims_and_tensors::redispatch(dispatchKeySet, sparse_dim, dense_dim, c10::fromIntArrayRefSlow(size), indices, values, dtype, layout, device, pin_memory, is_coalesced);
+    }
+    
+    // aten::_sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False, bool? is_coalesced=None) -> Tensor
+    inline at::Tensor _sparse_coo_tensor_with_dims_and_tensors_symint(c10::DispatchKeySet dispatchKeySet, int64_t sparse_dim, int64_t dense_dim, c10::SymIntArrayRef size, const at::Tensor & indices, const at::Tensor & values, at::TensorOptions options, c10::optional<bool> is_coalesced=c10::nullopt) {
+        return at::_ops::_sparse_coo_tensor_with_dims_and_tensors::redispatch(dispatchKeySet, sparse_dim, dense_dim, size, indices, values, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), is_coalesced);
+    }
+    
+    // aten::_sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False, bool? is_coalesced=None) -> Tensor
+    inline at::Tensor _sparse_coo_tensor_with_dims_and_tensors_symint(c10::DispatchKeySet dispatchKeySet, int64_t sparse_dim, int64_t dense_dim, c10::SymIntArrayRef size, const at::Tensor & indices, const at::Tensor & values, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<bool> is_coalesced) {
+        return at::_ops::_sparse_coo_tensor_with_dims_and_tensors::redispatch(dispatchKeySet, sparse_dim, dense_dim, size, indices, values, dtype, layout, device, pin_memory, is_coalesced);
+    }
+    
+    // aten::sparse_resize_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!)
+    inline const at::Tensor & sparse_resize_(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, int64_t sparse_dim, int64_t dense_dim) {
+        return at::_ops::sparse_resize_::redispatch(dispatchKeySet, self, size, sparse_dim, dense_dim);
+    }
+    
+    // aten::sparse_resize_and_clear_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!)
+    inline const at::Tensor & sparse_resize_and_clear_(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, int64_t sparse_dim, int64_t dense_dim) {
+        return at::_ops::sparse_resize_and_clear_::redispatch(dispatchKeySet, self, size, sparse_dim, dense_dim);
+    }
+    
+    // aten::sparse_mask(Tensor self, Tensor mask) -> Tensor
+    inline at::Tensor sparse_mask(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mask) {
+        return at::_ops::sparse_mask::redispatch(dispatchKeySet, self, mask);
+    }
+    
+    // aten::_sparse_mask_projection(Tensor self, Tensor mask, bool accumulate_matches=False) -> Tensor
+    inline at::Tensor _sparse_mask_projection(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mask, bool accumulate_matches=false) {
+        return at::_ops::_sparse_mask_projection::redispatch(dispatchKeySet, self, mask, accumulate_matches);
+    }
+    
+    // aten::_to_cpu(Tensor[] tensors) -> Tensor[]
+    inline ::std::vector<at::Tensor> _to_cpu(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors) {
+        return at::_ops::_to_cpu::redispatch(dispatchKeySet, tensors);
+    }
+    
+    // aten::to_dense(Tensor self, ScalarType? dtype=None, *, bool? masked_grad=None) -> Tensor
+    inline at::Tensor to_dense(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::ScalarType> dtype=c10::nullopt, c10::optional<bool> masked_grad=c10::nullopt) {
+        return at::_ops::to_dense::redispatch(dispatchKeySet, self, dtype, masked_grad);
+    }
+    
+    // aten::_to_dense(Tensor self, ScalarType? dtype=None, bool? masked_grad=None) -> Tensor
+    inline at::Tensor _to_dense(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::ScalarType> dtype=c10::nullopt, c10::optional<bool> masked_grad=c10::nullopt) {
+        return at::_ops::_to_dense::redispatch(dispatchKeySet, self, dtype, masked_grad);
+    }
+    
+    // aten::to_dense_backward(Tensor grad, Tensor input, bool? masked_grad=None) -> Tensor
+    inline at::Tensor to_dense_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & input, c10::optional<bool> masked_grad=c10::nullopt) {
+        return at::_ops::to_dense_backward::redispatch(dispatchKeySet, grad, input, masked_grad);
+    }
+    
+    // aten::sparse_dim(Tensor self) -> int
+    inline int64_t sparse_dim(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::sparse_dim::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_dimI(Tensor self) -> int
+    inline int64_t _dimI(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_dimI::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::dense_dim(Tensor self) -> int
+    inline int64_t dense_dim(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::dense_dim::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_dimV(Tensor self) -> int
+    inline int64_t _dimV(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_dimV::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_nnz(Tensor self) -> int
+    inline int64_t _nnz(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_nnz::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::coalesce(Tensor(a) self) -> Tensor(a)
+    inline at::Tensor coalesce(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::coalesce::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_coalesce(Tensor self) -> Tensor
+    inline at::Tensor _coalesce(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_coalesce::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::is_coalesced(Tensor self) -> bool
+    inline bool is_coalesced(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::is_coalesced::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_indices(Tensor(a) self) -> Tensor(a)
+    inline at::Tensor _indices(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_indices::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_values(Tensor(a) self) -> Tensor(a)
+    inline at::Tensor _values(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_values::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_coalesced_(Tensor(a!) self, bool coalesced) -> Tensor(a!)
+    inline at::Tensor & _coalesced_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, bool coalesced) {
+        return at::_ops::_coalesced_::redispatch(dispatchKeySet, self, coalesced);
+    }
+    
+    // aten::indices(Tensor(a) self) -> Tensor(a)
+    inline at::Tensor indices(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::indices::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::values(Tensor(a) self) -> Tensor(a)
+    inline at::Tensor values(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::values::redispatch(dispatchKeySet, self);
+    }
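+    // [editorial note, not part of the generated file] For sparse COO tensors, _indices()/_values()
+    // return the raw (possibly duplicated) index/value buffers without requiring coalescence, while
+    // indices()/values() expect an already-coalesced tensor; coalesce() produces the canonical form
+    // with sorted, de-duplicated indices and is_coalesced() reports the cached flag. Hedged sketch,
+    // assuming `sp` is a sparse COO tensor and `ks` a valid DispatchKeySet:
+    //   at::Tensor c = coalesce(ks, sp);
+    //   at::Tensor idx = indices(ks, c), val = values(ks, c);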
+    
+    // aten::crow_indices(Tensor(a) self) -> Tensor(a)
+    inline at::Tensor crow_indices(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::crow_indices::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::col_indices(Tensor(a) self) -> Tensor(a)
+    inline at::Tensor col_indices(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::col_indices::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::ccol_indices(Tensor(a) self) -> Tensor(a)
+    inline at::Tensor ccol_indices(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::ccol_indices::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::row_indices(Tensor(a) self) -> Tensor(a)
+    inline at::Tensor row_indices(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::row_indices::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::hspmm.out(Tensor mat1, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & hspmm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & mat1, const at::Tensor & mat2) {
+        return at::_ops::hspmm_out::redispatch(dispatchKeySet, mat1, mat2, out);
+    }
+    
+    // aten::hspmm.out(Tensor mat1, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & hspmm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & mat1, const at::Tensor & mat2, at::Tensor & out) {
+        return at::_ops::hspmm_out::redispatch(dispatchKeySet, mat1, mat2, out);
+    }
+    
+    // aten::hspmm(Tensor mat1, Tensor mat2) -> Tensor
+    inline at::Tensor hspmm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & mat1, const at::Tensor & mat2) {
+        return at::_ops::hspmm::redispatch(dispatchKeySet, mat1, mat2);
+    }
+    
+    // aten::copy_sparse_to_sparse_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)
+    inline at::Tensor & copy_sparse_to_sparse_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & src, bool non_blocking=false) {
+        return at::_ops::copy_sparse_to_sparse_::redispatch(dispatchKeySet, self, src, non_blocking);
+    }
+    
+    // aten::unbind.int(Tensor(a -> *) self, int dim=0) -> Tensor(a)[]
+    inline ::std::vector<at::Tensor> unbind(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim=0) {
+        return at::_ops::unbind_int::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::unbind.Dimname(Tensor(a -> *) self, Dimname dim) -> Tensor(a)[]
+    inline ::std::vector<at::Tensor> unbind(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim) {
+        return at::_ops::unbind_Dimname::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::to_sparse.sparse_dim(Tensor self, int sparse_dim) -> Tensor
+    inline at::Tensor to_sparse(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t sparse_dim) {
+        return at::_ops::to_sparse_sparse_dim::redispatch(dispatchKeySet, self, sparse_dim);
+    }
+    
+    // aten::_to_sparse.sparse_dim(Tensor self, int sparse_dim) -> Tensor
+    inline at::Tensor _to_sparse(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t sparse_dim) {
+        return at::_ops::_to_sparse_sparse_dim::redispatch(dispatchKeySet, self, sparse_dim);
+    }
+    
+    // aten::to_sparse(Tensor self, *, Layout? layout=None, int[2]? blocksize=None, int? dense_dim=None) -> Tensor
+    inline at::Tensor to_sparse(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::Layout> layout=c10::nullopt, at::OptionalIntArrayRef blocksize=c10::nullopt, c10::optional<int64_t> dense_dim=c10::nullopt) {
+        return at::_ops::to_sparse::redispatch(dispatchKeySet, self, layout, blocksize, dense_dim);
+    }
+    
+    // aten::_to_sparse(Tensor self, *, Layout? layout=None, int[2]? blocksize=None, int? dense_dim=None) -> Tensor
+    inline at::Tensor _to_sparse(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::Layout> layout=c10::nullopt, at::OptionalIntArrayRef blocksize=c10::nullopt, c10::optional<int64_t> dense_dim=c10::nullopt) {
+        return at::_ops::_to_sparse::redispatch(dispatchKeySet, self, layout, blocksize, dense_dim);
+    }
+    
+    // aten::to_sparse_csr(Tensor self, int? dense_dim=None) -> Tensor
+    inline at::Tensor to_sparse_csr(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<int64_t> dense_dim=c10::nullopt) {
+        return at::_ops::to_sparse_csr::redispatch(dispatchKeySet, self, dense_dim);
+    }
+    
+    // aten::_to_sparse_csr(Tensor self, int? dense_dim=None) -> Tensor
+    inline at::Tensor _to_sparse_csr(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<int64_t> dense_dim=c10::nullopt) {
+        return at::_ops::_to_sparse_csr::redispatch(dispatchKeySet, self, dense_dim);
+    }
+    
+    // aten::to_sparse_csc(Tensor self, int? dense_dim=None) -> Tensor
+    inline at::Tensor to_sparse_csc(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<int64_t> dense_dim=c10::nullopt) {
+        return at::_ops::to_sparse_csc::redispatch(dispatchKeySet, self, dense_dim);
+    }
+    
+    // aten::_to_sparse_csc(Tensor self, int? dense_dim=None) -> Tensor
+    inline at::Tensor _to_sparse_csc(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<int64_t> dense_dim=c10::nullopt) {
+        return at::_ops::_to_sparse_csc::redispatch(dispatchKeySet, self, dense_dim);
+    }
+    
+    // aten::to_sparse_bsr(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor
+    inline at::Tensor to_sparse_bsr(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef blocksize, c10::optional<int64_t> dense_dim=c10::nullopt) {
+        return at::_ops::to_sparse_bsr::redispatch(dispatchKeySet, self, blocksize, dense_dim);
+    }
+    
+    // aten::_to_sparse_bsr(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor
+    inline at::Tensor _to_sparse_bsr(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef blocksize, c10::optional<int64_t> dense_dim=c10::nullopt) {
+        return at::_ops::_to_sparse_bsr::redispatch(dispatchKeySet, self, blocksize, dense_dim);
+    }
+    
+    // aten::to_sparse_bsc(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor
+    inline at::Tensor to_sparse_bsc(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef blocksize, c10::optional<int64_t> dense_dim=c10::nullopt) {
+        return at::_ops::to_sparse_bsc::redispatch(dispatchKeySet, self, blocksize, dense_dim);
+    }
+    
+    // aten::_to_sparse_bsc(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor
+    inline at::Tensor _to_sparse_bsc(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef blocksize, c10::optional<int64_t> dense_dim=c10::nullopt) {
+        return at::_ops::_to_sparse_bsc::redispatch(dispatchKeySet, self, blocksize, dense_dim);
+    }
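+    // [editorial note, not part of the generated file] The to_sparse_* family converts between sparse
+    // layouts: the CSR/CSC variants take an optional dense_dim, while the blocked BSR/BSC variants
+    // additionally require a 2-element blocksize that is expected to evenly divide the corresponding
+    // dimensions. Illustrative only, assuming `mat` has dimensions divisible by 2:
+    //   at::Tensor bsr = to_sparse_bsr(ks, mat, {2, 2});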
+    
+    // aten::_to_sparse_semi_structured(Tensor dense) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> _to_sparse_semi_structured(c10::DispatchKeySet dispatchKeySet, const at::Tensor & dense) {
+        return at::_ops::_to_sparse_semi_structured::redispatch(dispatchKeySet, dense);
+    }
+    
+    // aten::to_mkldnn(Tensor self, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor to_mkldnn(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::to_mkldnn::redispatch(dispatchKeySet, self, dtype);
+    }
+    
+    // aten::mkldnn_reorder_conv2d_weight(Tensor self, SymInt[2] padding=0, SymInt[2] stride=1, SymInt[2] dilation=1, SymInt groups=1, SymInt[]? input_size=None) -> Tensor
+    inline at::Tensor mkldnn_reorder_conv2d_weight(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef padding=0, at::IntArrayRef stride=1, at::IntArrayRef dilation=1, int64_t groups=1, at::OptionalIntArrayRef input_size=c10::nullopt) {
+        return at::_ops::mkldnn_reorder_conv2d_weight::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, input_size.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*input_size)) : c10::nullopt);
+    }
+    
+    // aten::mkldnn_reorder_conv2d_weight(Tensor self, SymInt[2] padding=0, SymInt[2] stride=1, SymInt[2] dilation=1, SymInt groups=1, SymInt[]? input_size=None) -> Tensor
+    inline at::Tensor mkldnn_reorder_conv2d_weight_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef dilation=c10::SymInt(1), c10::SymInt groups=1, at::OptionalSymIntArrayRef input_size=c10::nullopt) {
+        return at::_ops::mkldnn_reorder_conv2d_weight::redispatch(dispatchKeySet, self, padding, stride, dilation, groups, input_size);
+    }
+    
+    // aten::mkldnn_reorder_conv3d_weight(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1) -> Tensor
+    inline at::Tensor mkldnn_reorder_conv3d_weight(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef padding=0, at::IntArrayRef stride=1, at::IntArrayRef dilation=1, int64_t groups=1) {
+        return at::_ops::mkldnn_reorder_conv3d_weight::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups);
+    }
+    
+    // aten::mkldnn_reorder_conv3d_weight(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1) -> Tensor
+    inline at::Tensor mkldnn_reorder_conv3d_weight_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef dilation=c10::SymInt(1), c10::SymInt groups=1) {
+        return at::_ops::mkldnn_reorder_conv3d_weight::redispatch(dispatchKeySet, self, padding, stride, dilation, groups);
+    }
+    
+    // aten::to_mkldnn_backward(Tensor grad, Tensor input) -> Tensor
+    inline at::Tensor to_mkldnn_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & input) {
+        return at::_ops::to_mkldnn_backward::redispatch(dispatchKeySet, grad, input);
+    }
+    
+    // aten::quantize_per_tensor_dynamic(Tensor self, ScalarType dtype, bool reduce_range) -> Tensor
+    inline at::Tensor quantize_per_tensor_dynamic(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::ScalarType dtype, bool reduce_range) {
+        return at::_ops::quantize_per_tensor_dynamic::redispatch(dispatchKeySet, self, dtype, reduce_range);
+    }
+    
+    // aten::quantize_per_tensor(Tensor self, float scale, int zero_point, ScalarType dtype) -> Tensor
+    inline at::Tensor quantize_per_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double scale, int64_t zero_point, at::ScalarType dtype) {
+        return at::_ops::quantize_per_tensor::redispatch(dispatchKeySet, self, scale, zero_point, dtype);
+    }
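+    // [editorial note, not part of the generated file] quantize_per_tensor performs affine quantization,
+    // roughly q = clamp(round(x / scale) + zero_point, qmin, qmax), with qmin/qmax implied by the target
+    // dtype (e.g. quint8 -> [0, 255]); dequantize recovers x ~= (q - zero_point) * scale. Illustrative only:
+    //   at::Tensor q = quantize_per_tensor(ks, x, /*scale=*/0.1, /*zero_point=*/128, at::kQUInt8);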
+    
+    // aten::quantize_per_tensor.tensor_qparams(Tensor self, Tensor scale, Tensor zero_point, ScalarType dtype) -> Tensor
+    inline at::Tensor quantize_per_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, at::ScalarType dtype) {
+        return at::_ops::quantize_per_tensor_tensor_qparams::redispatch(dispatchKeySet, self, scale, zero_point, dtype);
+    }
+    
+    // aten::quantize_per_tensor.tensors(Tensor[] tensors, Tensor scales, Tensor zero_points, ScalarType dtype) -> Tensor[]
+    inline ::std::vector<at::Tensor> quantize_per_tensor(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, const at::Tensor & scales, const at::Tensor & zero_points, at::ScalarType dtype) {
+        return at::_ops::quantize_per_tensor_tensors::redispatch(dispatchKeySet, tensors, scales, zero_points, dtype);
+    }
+    
+    // aten::quantize_per_channel(Tensor self, Tensor scales, Tensor zero_points, int axis, ScalarType dtype) -> Tensor
+    inline at::Tensor quantize_per_channel(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, at::ScalarType dtype) {
+        return at::_ops::quantize_per_channel::redispatch(dispatchKeySet, self, scales, zero_points, axis, dtype);
+    }
+    
+    // aten::dequantize.self(Tensor self) -> Tensor
+    inline at::Tensor dequantize(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::dequantize_self::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::dequantize.tensors(Tensor[] tensors) -> Tensor[]
+    inline ::std::vector<at::Tensor> dequantize(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors) {
+        return at::_ops::dequantize_tensors::redispatch(dispatchKeySet, tensors);
+    }
+    
+    // aten::q_scale(Tensor self) -> float
+    inline double q_scale(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::q_scale::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::q_zero_point(Tensor self) -> int
+    inline int64_t q_zero_point(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::q_zero_point::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::q_per_channel_scales(Tensor self) -> Tensor
+    inline at::Tensor q_per_channel_scales(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::q_per_channel_scales::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::q_per_channel_zero_points(Tensor self) -> Tensor
+    inline at::Tensor q_per_channel_zero_points(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::q_per_channel_zero_points::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::q_per_channel_axis(Tensor self) -> int
+    inline int64_t q_per_channel_axis(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::q_per_channel_axis::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::int_repr(Tensor self) -> Tensor
+    inline at::Tensor int_repr(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::int_repr::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_make_per_tensor_quantized_tensor(Tensor self, float scale, int zero_point) -> Tensor
+    inline at::Tensor _make_per_tensor_quantized_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double scale, int64_t zero_point) {
+        return at::_ops::_make_per_tensor_quantized_tensor::redispatch(dispatchKeySet, self, scale, zero_point);
+    }
+    
+    // aten::_make_per_channel_quantized_tensor(Tensor self, Tensor scale, Tensor zero_point, int axis) -> Tensor
+    inline at::Tensor _make_per_channel_quantized_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t axis) {
+        return at::_ops::_make_per_channel_quantized_tensor::redispatch(dispatchKeySet, self, scale, zero_point, axis);
+    }
+    
+    // aten::qscheme(Tensor self) -> QScheme
+    inline at::QScheme qscheme(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::qscheme::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::fake_quantize_per_tensor_affine(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor
+    inline at::Tensor fake_quantize_per_tensor_affine(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double scale, int64_t zero_point, int64_t quant_min, int64_t quant_max) {
+        return at::_ops::fake_quantize_per_tensor_affine::redispatch(dispatchKeySet, self, scale, zero_point, quant_min, quant_max);
+    }
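+    // [editorial note, not part of the generated file] fake_quantize_per_tensor_affine simulates
+    // quantization in floating point: out = (clamp(round(x / scale) + zero_point, quant_min, quant_max)
+    // - zero_point) * scale. The *_cachemask variants below also return a bool mask of the elements that
+    // stayed inside [quant_min, quant_max]; the matching *_backward ops use that mask to zero gradients
+    // for clamped elements.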
+    
+    // aten::fake_quantize_per_tensor_affine.tensor_qparams(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> Tensor
+    inline at::Tensor fake_quantize_per_tensor_affine(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t quant_min, int64_t quant_max) {
+        return at::_ops::fake_quantize_per_tensor_affine_tensor_qparams::redispatch(dispatchKeySet, self, scale, zero_point, quant_min, quant_max);
+    }
+    
+    // aten::fake_quantize_per_tensor_affine_cachemask(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> (Tensor output, Tensor mask)
+    inline ::std::tuple<at::Tensor,at::Tensor> fake_quantize_per_tensor_affine_cachemask(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double scale, int64_t zero_point, int64_t quant_min, int64_t quant_max) {
+        return at::_ops::fake_quantize_per_tensor_affine_cachemask::redispatch(dispatchKeySet, self, scale, zero_point, quant_min, quant_max);
+    }
+    
+    // aten::_fake_quantize_per_tensor_affine_cachemask_tensor_qparams(Tensor self, Tensor scale, Tensor zero_point, Tensor fake_quant_enabled, int quant_min, int quant_max) -> (Tensor output, Tensor mask)
+    inline ::std::tuple<at::Tensor,at::Tensor> _fake_quantize_per_tensor_affine_cachemask_tensor_qparams(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, const at::Tensor & fake_quant_enabled, int64_t quant_min, int64_t quant_max) {
+        return at::_ops::_fake_quantize_per_tensor_affine_cachemask_tensor_qparams::redispatch(dispatchKeySet, self, scale, zero_point, fake_quant_enabled, quant_min, quant_max);
+    }
+    
+    // aten::fake_quantize_per_tensor_affine_cachemask_backward(Tensor grad, Tensor mask) -> Tensor
+    inline at::Tensor fake_quantize_per_tensor_affine_cachemask_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & mask) {
+        return at::_ops::fake_quantize_per_tensor_affine_cachemask_backward::redispatch(dispatchKeySet, grad, mask);
+    }
+    
+    // aten::_fake_quantize_learnable_per_tensor_affine(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max, float grad_factor=1.0) -> Tensor
+    inline at::Tensor _fake_quantize_learnable_per_tensor_affine(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t quant_min, int64_t quant_max, double grad_factor=1.0) {
+        return at::_ops::_fake_quantize_learnable_per_tensor_affine::redispatch(dispatchKeySet, self, scale, zero_point, quant_min, quant_max, grad_factor);
+    }
+    
+    // aten::_fake_quantize_learnable_per_tensor_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max, float grad_factor=1.0) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _fake_quantize_learnable_per_tensor_affine_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t quant_min, int64_t quant_max, double grad_factor=1.0) {
+        return at::_ops::_fake_quantize_learnable_per_tensor_affine_backward::redispatch(dispatchKeySet, grad, self, scale, zero_point, quant_min, quant_max, grad_factor);
+    }
+    
+    // aten::fake_quantize_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor
+    inline at::Tensor fake_quantize_per_channel_affine(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t axis, int64_t quant_min, int64_t quant_max) {
+        return at::_ops::fake_quantize_per_channel_affine::redispatch(dispatchKeySet, self, scale, zero_point, axis, quant_min, quant_max);
+    }
+    
+    // aten::fake_quantize_per_channel_affine_cachemask(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> (Tensor output, Tensor mask)
+    inline ::std::tuple<at::Tensor,at::Tensor> fake_quantize_per_channel_affine_cachemask(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t axis, int64_t quant_min, int64_t quant_max) {
+        return at::_ops::fake_quantize_per_channel_affine_cachemask::redispatch(dispatchKeySet, self, scale, zero_point, axis, quant_min, quant_max);
+    }
+    
+    // aten::fake_quantize_per_channel_affine_cachemask_backward(Tensor grad, Tensor mask) -> Tensor
+    inline at::Tensor fake_quantize_per_channel_affine_cachemask_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & mask) {
+        return at::_ops::fake_quantize_per_channel_affine_cachemask_backward::redispatch(dispatchKeySet, grad, mask);
+    }
+    
+    // aten::_fake_quantize_learnable_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max, float grad_factor=1.0) -> Tensor
+    inline at::Tensor _fake_quantize_learnable_per_channel_affine(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t axis, int64_t quant_min, int64_t quant_max, double grad_factor=1.0) {
+        return at::_ops::_fake_quantize_learnable_per_channel_affine::redispatch(dispatchKeySet, self, scale, zero_point, axis, quant_min, quant_max, grad_factor);
+    }
+    
+    // aten::_fake_quantize_learnable_per_channel_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max, float grad_factor=1.0) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _fake_quantize_learnable_per_channel_affine_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t axis, int64_t quant_min, int64_t quant_max, double grad_factor=1.0) {
+        return at::_ops::_fake_quantize_learnable_per_channel_affine_backward::redispatch(dispatchKeySet, grad, self, scale, zero_point, axis, quant_min, quant_max, grad_factor);
+    }
+    
+    // aten::fused_moving_avg_obs_fake_quant(Tensor self, Tensor observer_on, Tensor fake_quant_on, Tensor(a!) running_min, Tensor(b!) running_max, Tensor(c!) scale, Tensor(d!) zero_point, float averaging_const, int quant_min, int quant_max, int ch_axis, bool per_row_fake_quant=False, bool symmetric_quant=False) -> Tensor
+    inline at::Tensor fused_moving_avg_obs_fake_quant(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & observer_on, const at::Tensor & fake_quant_on, at::Tensor & running_min, at::Tensor & running_max, at::Tensor & scale, at::Tensor & zero_point, double averaging_const, int64_t quant_min, int64_t quant_max, int64_t ch_axis, bool per_row_fake_quant=false, bool symmetric_quant=false) {
+        return at::_ops::fused_moving_avg_obs_fake_quant::redispatch(dispatchKeySet, self, observer_on, fake_quant_on, running_min, running_max, scale, zero_point, averaging_const, quant_min, quant_max, ch_axis, per_row_fake_quant, symmetric_quant);
+    }
+    
+    // aten::_fused_moving_avg_obs_fq_helper(Tensor self, Tensor observer_on, Tensor fake_quant_on, Tensor(a!) running_min, Tensor(b!) running_max, Tensor(c!) scale, Tensor(d!) zero_point, float averaging_const, int quant_min, int quant_max, int ch_axis, bool per_row_fake_quant=False, bool symmetric_quant=False) -> (Tensor output, Tensor mask)
+    inline ::std::tuple<at::Tensor,at::Tensor> _fused_moving_avg_obs_fq_helper(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & observer_on, const at::Tensor & fake_quant_on, at::Tensor & running_min, at::Tensor & running_max, at::Tensor & scale, at::Tensor & zero_point, double averaging_const, int64_t quant_min, int64_t quant_max, int64_t ch_axis, bool per_row_fake_quant=false, bool symmetric_quant=false) {
+        return at::_ops::_fused_moving_avg_obs_fq_helper::redispatch(dispatchKeySet, self, observer_on, fake_quant_on, running_min, running_max, scale, zero_point, averaging_const, quant_min, quant_max, ch_axis, per_row_fake_quant, symmetric_quant);
+    }
+    
+    // aten::_choose_qparams_per_tensor(Tensor self, bool reduce_range=False) -> (float, int)
+    inline ::std::tuple<double,int64_t> _choose_qparams_per_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool reduce_range=false) {
+        return at::_ops::_choose_qparams_per_tensor::redispatch(dispatchKeySet, self, reduce_range);
+    }
+    
+    // aten::_saturate_weight_to_fp16(Tensor weight) -> Tensor
+    inline at::Tensor _saturate_weight_to_fp16(c10::DispatchKeySet dispatchKeySet, const at::Tensor & weight) {
+        return at::_ops::_saturate_weight_to_fp16::redispatch(dispatchKeySet, weight);
+    }
+    
+    // aten::choose_qparams_optimized(Tensor input, int numel, int n_bins, float ratio, int bit_width) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> choose_qparams_optimized(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, int64_t numel, int64_t n_bins, double ratio, int64_t bit_width) {
+        return at::_ops::choose_qparams_optimized::redispatch(dispatchKeySet, input, numel, n_bins, ratio, bit_width);
+    }
+    
+    // aten::_autocast_to_reduced_precision(Tensor(a) self, bool cuda_enabled, bool cpu_enabled, ScalarType cuda_dtype, ScalarType cpu_dtype) -> Tensor(a)
+    inline at::Tensor _autocast_to_reduced_precision(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool cuda_enabled, bool cpu_enabled, at::ScalarType cuda_dtype, at::ScalarType cpu_dtype) {
+        return at::_ops::_autocast_to_reduced_precision::redispatch(dispatchKeySet, self, cuda_enabled, cpu_enabled, cuda_dtype, cpu_dtype);
+    }
+    
+    // aten::_autocast_to_full_precision(Tensor(a) self, bool cuda_enabled, bool cpu_enabled) -> Tensor(a)
+    inline at::Tensor _autocast_to_full_precision(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool cuda_enabled, bool cpu_enabled) {
+        return at::_ops::_autocast_to_full_precision::redispatch(dispatchKeySet, self, cuda_enabled, cpu_enabled);
+    }
+    
+    // aten::_to_copy(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor _to_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::TensorOptions options={}, bool non_blocking=false, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::_to_copy::redispatch(dispatchKeySet, self, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), non_blocking, c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
+    }
+    
+    // aten::_to_copy(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor _to_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, bool non_blocking, c10::optional<at::MemoryFormat> memory_format) {
+        return at::_ops::_to_copy::redispatch(dispatchKeySet, self, dtype, layout, device, pin_memory, non_blocking, memory_format);
+    }
+    
+    // aten::to.dtype_layout(Tensor(a) self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor(a)
+    inline at::Tensor to(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::TensorOptions options={}, bool non_blocking=false, bool copy=false, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::to_dtype_layout::redispatch(dispatchKeySet, self, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), non_blocking, copy, c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
+    }
+    
+    // aten::to.dtype_layout(Tensor(a) self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor(a)
+    inline at::Tensor to(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, bool non_blocking, bool copy, c10::optional<at::MemoryFormat> memory_format) {
+        return at::_ops::to_dtype_layout::redispatch(dispatchKeySet, self, dtype, layout, device, pin_memory, non_blocking, copy, memory_format);
+    }
+    
+    // aten::to.device(Tensor(a) self, Device device, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor(a)
+    inline at::Tensor to(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Device device, at::ScalarType dtype, bool non_blocking=false, bool copy=false, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::to_device::redispatch(dispatchKeySet, self, device, dtype, non_blocking, copy, memory_format);
+    }
+    
+    // aten::to.dtype(Tensor(a) self, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor(a)
+    inline at::Tensor to(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::ScalarType dtype, bool non_blocking=false, bool copy=false, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::to_dtype::redispatch(dispatchKeySet, self, dtype, non_blocking, copy, memory_format);
+    }
+    
+    // aten::to.other(Tensor(a) self, Tensor other, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor(a)
+    inline at::Tensor to(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, bool non_blocking=false, bool copy=false, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::to_other::redispatch(dispatchKeySet, self, other, non_blocking, copy, memory_format);
+    }
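+    // [editorial note, not part of the generated file] The to(...) overloads above mirror Tensor.to():
+    // the dtype_layout form unpacks TensorOptions, and the device/dtype/other forms convert to a target
+    // device, dtype, or another tensor's options. Unlike _to_copy, to() may return `self` unchanged when
+    // no conversion is needed and copy=false. Illustrative only:
+    //   at::Tensor y = to(ks, x, at::kFloat, /*non_blocking=*/false, /*copy=*/false);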
+    
+    // aten::meshgrid(Tensor[] tensors) -> Tensor[]
+    inline ::std::vector<at::Tensor> meshgrid(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors) {
+        return at::_ops::meshgrid::redispatch(dispatchKeySet, tensors);
+    }
+    
+    // aten::meshgrid.indexing(Tensor[] tensors, *, str indexing) -> Tensor[]
+    inline ::std::vector<at::Tensor> meshgrid(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, c10::string_view indexing) {
+        return at::_ops::meshgrid_indexing::redispatch(dispatchKeySet, tensors, indexing);
+    }
+    
+    // aten::cartesian_prod(Tensor[] tensors) -> Tensor
+    inline at::Tensor cartesian_prod(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors) {
+        return at::_ops::cartesian_prod::redispatch(dispatchKeySet, tensors);
+    }
+    
+    // aten::combinations(Tensor self, int r=2, bool with_replacement=False) -> Tensor
+    inline at::Tensor combinations(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t r=2, bool with_replacement=false) {
+        return at::_ops::combinations::redispatch(dispatchKeySet, self, r, with_replacement);
+    }
+    
+    // aten::item(Tensor self) -> Scalar
+    inline at::Scalar item(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::item::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::result_type.Tensor(Tensor tensor, Tensor other) -> ScalarType
+    inline at::ScalarType result_type(c10::DispatchKeySet dispatchKeySet, const at::Tensor & tensor, const at::Tensor & other) {
+        return at::_ops::result_type_Tensor::redispatch(dispatchKeySet, tensor, other);
+    }
+    
+    // aten::result_type.Scalar(Tensor tensor, Scalar other) -> ScalarType
+    inline at::ScalarType result_type(c10::DispatchKeySet dispatchKeySet, const at::Tensor & tensor, const at::Scalar & other) {
+        return at::_ops::result_type_Scalar::redispatch(dispatchKeySet, tensor, other);
+    }
+    
+    // aten::result_type.Scalar_Tensor(Scalar scalar, Tensor tensor) -> ScalarType
+    inline at::ScalarType result_type(c10::DispatchKeySet dispatchKeySet, const at::Scalar & scalar, const at::Tensor & tensor) {
+        return at::_ops::result_type_Scalar_Tensor::redispatch(dispatchKeySet, scalar, tensor);
+    }
+    
+    // aten::result_type.Scalar_Scalar(Scalar scalar1, Scalar scalar2) -> ScalarType
+    inline at::ScalarType result_type(c10::DispatchKeySet dispatchKeySet, const at::Scalar & scalar1, const at::Scalar & scalar2) {
+        return at::_ops::result_type_Scalar_Scalar::redispatch(dispatchKeySet, scalar1, scalar2);
+    }
+    
+    // aten::can_cast(ScalarType from, ScalarType to) -> bool
+    inline bool can_cast(c10::DispatchKeySet dispatchKeySet, at::ScalarType from, at::ScalarType to) {
+        return at::_ops::can_cast::redispatch(dispatchKeySet, from, to);
+    }
+    
+    // aten::promote_types(ScalarType type1, ScalarType type2) -> ScalarType
+    inline at::ScalarType promote_types(c10::DispatchKeySet dispatchKeySet, at::ScalarType type1, at::ScalarType type2) {
+        return at::_ops::promote_types::redispatch(dispatchKeySet, type1, type2);
+    }
+    
+    // aten::_local_scalar_dense(Tensor self) -> Scalar
+    inline at::Scalar _local_scalar_dense(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_local_scalar_dense::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_lstm_mps(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,at::Tensor,at::Tensor> _lstm_mps(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::TensorList hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) {
+        return at::_ops::_lstm_mps::redispatch(dispatchKeySet, input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first);
+    }
+    
+    // aten::lstm_mps_backward(Tensor? grad_y, Tensor? grad_hy, Tensor? grad_cy, Tensor z_state, Tensor cell_state_fwd, Tensor input, Tensor layersOutputs, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor[], Tensor[])
+    inline ::std::tuple<at::Tensor,::std::vector<at::Tensor>,::std::vector<at::Tensor>> lstm_mps_backward(c10::DispatchKeySet dispatchKeySet, const c10::optional<at::Tensor> & grad_y, const c10::optional<at::Tensor> & grad_hy, const c10::optional<at::Tensor> & grad_cy, const at::Tensor & z_state, const at::Tensor & cell_state_fwd, const at::Tensor & input, const at::Tensor & layersOutputs, at::TensorList hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) {
+        return at::_ops::lstm_mps_backward::redispatch(dispatchKeySet, grad_y, grad_hy, grad_cy, z_state, cell_state_fwd, input, layersOutputs, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first);
+    }
+    
+    // aten::_thnn_fused_lstm_cell(Tensor input_gates, Tensor hidden_gates, Tensor cx, Tensor? input_bias=None, Tensor? hidden_bias=None) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _thnn_fused_lstm_cell(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input_gates, const at::Tensor & hidden_gates, const at::Tensor & cx, const c10::optional<at::Tensor> & input_bias={}, const c10::optional<at::Tensor> & hidden_bias={}) {
+        return at::_ops::_thnn_fused_lstm_cell::redispatch(dispatchKeySet, input_gates, hidden_gates, cx, input_bias, hidden_bias);
+    }
+    
+    // aten::_thnn_fused_lstm_cell_backward_impl(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _thnn_fused_lstm_cell_backward_impl(c10::DispatchKeySet dispatchKeySet, const c10::optional<at::Tensor> & grad_hy, const c10::optional<at::Tensor> & grad_cy, const at::Tensor & cx, const at::Tensor & cy, const at::Tensor & workspace, bool has_bias) {
+        return at::_ops::_thnn_fused_lstm_cell_backward_impl::redispatch(dispatchKeySet, grad_hy, grad_cy, cx, cy, workspace, has_bias);
+    }
+    
+    // aten::_thnn_fused_lstm_cell_backward(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,at::Tensor> _thnn_fused_lstm_cell_backward(c10::DispatchKeySet dispatchKeySet, const c10::optional<at::Tensor> & grad_hy, const c10::optional<at::Tensor> & grad_cy, const at::Tensor & cx, const at::Tensor & cy, const at::Tensor & workspace, bool has_bias) {
+        return at::_ops::_thnn_fused_lstm_cell_backward::redispatch(dispatchKeySet, grad_hy, grad_cy, cx, cy, workspace, has_bias);
+    }
+    
+    // aten::_thnn_differentiable_lstm_cell_backward(Tensor? grad_hy, Tensor? grad_cy, Tensor input_gates, Tensor hidden_gates, Tensor? input_bias, Tensor? hidden_bias, Tensor cx, Tensor cy) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,at::Tensor> _thnn_differentiable_lstm_cell_backward(c10::DispatchKeySet dispatchKeySet, const c10::optional<at::Tensor> & grad_hy, const c10::optional<at::Tensor> & grad_cy, const at::Tensor & input_gates, const at::Tensor & hidden_gates, const c10::optional<at::Tensor> & input_bias, const c10::optional<at::Tensor> & hidden_bias, const at::Tensor & cx, const at::Tensor & cy) {
+        return at::_ops::_thnn_differentiable_lstm_cell_backward::redispatch(dispatchKeySet, grad_hy, grad_cy, input_gates, hidden_gates, input_bias, hidden_bias, cx, cy);
+    }
+    
+    // aten::_thnn_fused_gru_cell(Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias=None, Tensor? hidden_bias=None) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> _thnn_fused_gru_cell(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input_gates, const at::Tensor & hidden_gates, const at::Tensor & hx, const c10::optional<at::Tensor> & input_bias={}, const c10::optional<at::Tensor> & hidden_bias={}) {
+        return at::_ops::_thnn_fused_gru_cell::redispatch(dispatchKeySet, input_gates, hidden_gates, hx, input_bias, hidden_bias);
+    }
+    
+    // aten::_thnn_fused_gru_cell_backward(Tensor grad_hy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,at::Tensor> _thnn_fused_gru_cell_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_hy, const at::Tensor & workspace, bool has_bias) {
+        return at::_ops::_thnn_fused_gru_cell_backward::redispatch(dispatchKeySet, grad_hy, workspace, has_bias);
+    }
+    
+    // aten::_thnn_differentiable_gru_cell_backward(Tensor grad_hy, Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias, Tensor? hidden_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,at::Tensor> _thnn_differentiable_gru_cell_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_hy, const at::Tensor & input_gates, const at::Tensor & hidden_gates, const at::Tensor & hx, const c10::optional<at::Tensor> & input_bias, const c10::optional<at::Tensor> & hidden_bias) {
+        return at::_ops::_thnn_differentiable_gru_cell_backward::redispatch(dispatchKeySet, grad_hy, input_gates, hidden_gates, hx, input_bias, hidden_bias);
+    }
+    
+    // aten::lstm.input(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> lstm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::TensorList hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) {
+        return at::_ops::lstm_input::redispatch(dispatchKeySet, input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first);
+    }
+    
+    // aten::lstm.data(Tensor data, Tensor batch_sizes, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> lstm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & data, const at::Tensor & batch_sizes, at::TensorList hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional) {
+        return at::_ops::lstm_data::redispatch(dispatchKeySet, data, batch_sizes, hx, params, has_biases, num_layers, dropout, train, bidirectional);
+    }
+    
+    // aten::gru.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> gru(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) {
+        return at::_ops::gru_input::redispatch(dispatchKeySet, input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first);
+    }
+    
+    // aten::gru.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> gru(c10::DispatchKeySet dispatchKeySet, const at::Tensor & data, const at::Tensor & batch_sizes, const at::Tensor & hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional) {
+        return at::_ops::gru_data::redispatch(dispatchKeySet, data, batch_sizes, hx, params, has_biases, num_layers, dropout, train, bidirectional);
+    }
+    
+    // aten::rnn_tanh.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> rnn_tanh(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) {
+        return at::_ops::rnn_tanh_input::redispatch(dispatchKeySet, input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first);
+    }
+    
+    // aten::rnn_tanh.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> rnn_tanh(c10::DispatchKeySet dispatchKeySet, const at::Tensor & data, const at::Tensor & batch_sizes, const at::Tensor & hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional) {
+        return at::_ops::rnn_tanh_data::redispatch(dispatchKeySet, data, batch_sizes, hx, params, has_biases, num_layers, dropout, train, bidirectional);
+    }
+    
+    // aten::rnn_relu.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> rnn_relu(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) {
+        return at::_ops::rnn_relu_input::redispatch(dispatchKeySet, input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first);
+    }
+    
+    // aten::rnn_relu.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> rnn_relu(c10::DispatchKeySet dispatchKeySet, const at::Tensor & data, const at::Tensor & batch_sizes, const at::Tensor & hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional) {
+        return at::_ops::rnn_relu_data::redispatch(dispatchKeySet, data, batch_sizes, hx, params, has_biases, num_layers, dropout, train, bidirectional);
+    }
+    
+    // aten::lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> lstm_cell(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::TensorList hx, const at::Tensor & w_ih, const at::Tensor & w_hh, const c10::optional<at::Tensor> & b_ih={}, const c10::optional<at::Tensor> & b_hh={}) {
+        return at::_ops::lstm_cell::redispatch(dispatchKeySet, input, hx, w_ih, w_hh, b_ih, b_hh);
+    }
+    
+    // aten::gru_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> Tensor
+    inline at::Tensor gru_cell(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & hx, const at::Tensor & w_ih, const at::Tensor & w_hh, const c10::optional<at::Tensor> & b_ih={}, const c10::optional<at::Tensor> & b_hh={}) {
+        return at::_ops::gru_cell::redispatch(dispatchKeySet, input, hx, w_ih, w_hh, b_ih, b_hh);
+    }
+    
+    // aten::rnn_tanh_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> Tensor
+    inline at::Tensor rnn_tanh_cell(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & hx, const at::Tensor & w_ih, const at::Tensor & w_hh, const c10::optional<at::Tensor> & b_ih={}, const c10::optional<at::Tensor> & b_hh={}) {
+        return at::_ops::rnn_tanh_cell::redispatch(dispatchKeySet, input, hx, w_ih, w_hh, b_ih, b_hh);
+    }
+    
+    // aten::rnn_relu_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> Tensor
+    inline at::Tensor rnn_relu_cell(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & hx, const at::Tensor & w_ih, const at::Tensor & w_hh, const c10::optional<at::Tensor> & b_ih={}, const c10::optional<at::Tensor> & b_hh={}) {
+        return at::_ops::rnn_relu_cell::redispatch(dispatchKeySet, input, hx, w_ih, w_hh, b_ih, b_hh);
+    }
+    
+    // aten::quantized_lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> quantized_lstm_cell(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::TensorList hx, const at::Tensor & w_ih, const at::Tensor & w_hh, const at::Tensor & b_ih, const at::Tensor & b_hh, const at::Tensor & packed_ih, const at::Tensor & packed_hh, const at::Tensor & col_offsets_ih, const at::Tensor & col_offsets_hh, const at::Scalar & scale_ih, const at::Scalar & scale_hh, const at::Scalar & zero_point_ih, const at::Scalar & zero_point_hh) {
+        return at::_ops::quantized_lstm_cell::redispatch(dispatchKeySet, input, hx, w_ih, w_hh, b_ih, b_hh, packed_ih, packed_hh, col_offsets_ih, col_offsets_hh, scale_ih, scale_hh, zero_point_ih, zero_point_hh);
+    }
+    
+    // aten::quantized_gru_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> Tensor
+    inline at::Tensor quantized_gru_cell(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & hx, const at::Tensor & w_ih, const at::Tensor & w_hh, const at::Tensor & b_ih, const at::Tensor & b_hh, const at::Tensor & packed_ih, const at::Tensor & packed_hh, const at::Tensor & col_offsets_ih, const at::Tensor & col_offsets_hh, const at::Scalar & scale_ih, const at::Scalar & scale_hh, const at::Scalar & zero_point_ih, const at::Scalar & zero_point_hh) {
+        return at::_ops::quantized_gru_cell::redispatch(dispatchKeySet, input, hx, w_ih, w_hh, b_ih, b_hh, packed_ih, packed_hh, col_offsets_ih, col_offsets_hh, scale_ih, scale_hh, zero_point_ih, zero_point_hh);
+    }
+    
+    // aten::quantized_rnn_relu_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> Tensor
+    inline at::Tensor quantized_rnn_relu_cell(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & hx, const at::Tensor & w_ih, const at::Tensor & w_hh, const at::Tensor & b_ih, const at::Tensor & b_hh, const at::Tensor & packed_ih, const at::Tensor & packed_hh, const at::Tensor & col_offsets_ih, const at::Tensor & col_offsets_hh, const at::Scalar & scale_ih, const at::Scalar & scale_hh, const at::Scalar & zero_point_ih, const at::Scalar & zero_point_hh) {
+        return at::_ops::quantized_rnn_relu_cell::redispatch(dispatchKeySet, input, hx, w_ih, w_hh, b_ih, b_hh, packed_ih, packed_hh, col_offsets_ih, col_offsets_hh, scale_ih, scale_hh, zero_point_ih, zero_point_hh);
+    }
+    
+    // aten::quantized_rnn_tanh_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> Tensor
+    inline at::Tensor quantized_rnn_tanh_cell(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & hx, const at::Tensor & w_ih, const at::Tensor & w_hh, const at::Tensor & b_ih, const at::Tensor & b_hh, const at::Tensor & packed_ih, const at::Tensor & packed_hh, const at::Tensor & col_offsets_ih, const at::Tensor & col_offsets_hh, const at::Scalar & scale_ih, const at::Scalar & scale_hh, const at::Scalar & zero_point_ih, const at::Scalar & zero_point_hh) {
+        return at::_ops::quantized_rnn_tanh_cell::redispatch(dispatchKeySet, input, hx, w_ih, w_hh, b_ih, b_hh, packed_ih, packed_hh, col_offsets_ih, col_offsets_hh, scale_ih, scale_hh, zero_point_ih, zero_point_hh);
+    }
+    
+    // aten::_pack_padded_sequence(Tensor input, Tensor lengths, bool batch_first) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> _pack_padded_sequence(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & lengths, bool batch_first) {
+        return at::_ops::_pack_padded_sequence::redispatch(dispatchKeySet, input, lengths, batch_first);
+    }
+    
+    // aten::_pack_padded_sequence_backward(Tensor grad, SymInt[] input_size, Tensor batch_sizes, bool batch_first) -> Tensor
+    inline at::Tensor _pack_padded_sequence_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, at::IntArrayRef input_size, const at::Tensor & batch_sizes, bool batch_first) {
+        return at::_ops::_pack_padded_sequence_backward::redispatch(dispatchKeySet, grad, c10::fromIntArrayRefSlow(input_size), batch_sizes, batch_first);
+    }
+    
+    // aten::_pack_padded_sequence_backward(Tensor grad, SymInt[] input_size, Tensor batch_sizes, bool batch_first) -> Tensor
+    inline at::Tensor _pack_padded_sequence_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, c10::SymIntArrayRef input_size, const at::Tensor & batch_sizes, bool batch_first) {
+        return at::_ops::_pack_padded_sequence_backward::redispatch(dispatchKeySet, grad, input_size, batch_sizes, batch_first);
+    }
+    
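+    // [Editorial sketch, not part of the generated file] The two overloads above show the
+    // SymInt convention used throughout this header: the plain overload takes concrete
+    // at::IntArrayRef sizes and widens them with c10::fromIntArrayRefSlow, while the
+    // *_symint overload forwards c10::SymIntArrayRef unchanged so symbolic shapes survive
+    // tracing. A minimal caller sketch, with hypothetical function and variable names,
+    // assuming it sits in the same namespace as these wrappers:
+    //
+    //   at::Tensor pack_backward_static(c10::DispatchKeySet ks, const at::Tensor & grad,
+    //                                   at::IntArrayRef input_size, const at::Tensor & batch_sizes,
+    //                                   bool batch_first) {
+    //       // Concrete sizes: the non-symint overload widens them to SymInt internally.
+    //       return _pack_padded_sequence_backward(ks, grad, input_size, batch_sizes, batch_first);
+    //   }
+    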
+    // aten::_pad_packed_sequence(Tensor data, Tensor batch_sizes, bool batch_first, Scalar padding_value, int total_length) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> _pad_packed_sequence(c10::DispatchKeySet dispatchKeySet, const at::Tensor & data, const at::Tensor & batch_sizes, bool batch_first, const at::Scalar & padding_value, int64_t total_length) {
+        return at::_ops::_pad_packed_sequence::redispatch(dispatchKeySet, data, batch_sizes, batch_first, padding_value, total_length);
+    }
+    
+    // aten::set_.source_Storage(Tensor(a!) self, Storage source) -> Tensor(a!)
+    inline at::Tensor & set_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, at::Storage source) {
+        return at::_ops::set__source_Storage::redispatch(dispatchKeySet, self, source);
+    }
+    
+    // aten::set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor(a!)
+    inline at::Tensor & set_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, at::Storage source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride={}) {
+        return at::_ops::set__source_Storage_storage_offset::redispatch(dispatchKeySet, self, source, storage_offset, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride));
+    }
+    
+    // aten::set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor(a!)
+    inline at::Tensor & set__symint(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride={}) {
+        return at::_ops::set__source_Storage_storage_offset::redispatch(dispatchKeySet, self, source, storage_offset, size, stride);
+    }
+    
+    // aten::set_.source_Tensor_storage_offset(Tensor(a!) self, Tensor source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor(a!)
+    inline at::Tensor & set_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride={}) {
+        return at::_ops::set__source_Tensor_storage_offset::redispatch(dispatchKeySet, self, source, storage_offset, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride));
+    }
+    
+    // aten::set_.source_Tensor_storage_offset(Tensor(a!) self, Tensor source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor(a!)
+    inline at::Tensor & set__symint(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride={}) {
+        return at::_ops::set__source_Tensor_storage_offset::redispatch(dispatchKeySet, self, source, storage_offset, size, stride);
+    }
+    
+    // aten::set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!)
+    inline at::Tensor & set_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & source) {
+        return at::_ops::set__source_Tensor::redispatch(dispatchKeySet, self, source);
+    }
+    
+    // aten::set_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & set_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::set_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::lift(Tensor self) -> Tensor
+    inline at::Tensor lift(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::lift::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::lift_fresh(Tensor(a) self) -> Tensor(a)
+    inline at::Tensor lift_fresh(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::lift_fresh::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::lift_fresh_copy(Tensor self) -> Tensor
+    inline at::Tensor lift_fresh_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::lift_fresh_copy::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::is_set_to(Tensor self, Tensor tensor) -> bool
+    inline bool is_set_to(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & tensor) {
+        return at::_ops::is_set_to::redispatch(dispatchKeySet, self, tensor);
+    }
+    
+    // aten::masked_fill_.Scalar(Tensor(a!) self, Tensor mask, Scalar value) -> Tensor(a!)
+    inline at::Tensor & masked_fill_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & mask, const at::Scalar & value) {
+        return at::_ops::masked_fill__Scalar::redispatch(dispatchKeySet, self, mask, value);
+    }
+    
+    // aten::masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor
+    inline at::Tensor masked_fill(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mask, const at::Scalar & value) {
+        return at::_ops::masked_fill_Scalar::redispatch(dispatchKeySet, self, mask, value);
+    }
+    
+    // aten::masked_fill_.Tensor(Tensor(a!) self, Tensor mask, Tensor value) -> Tensor(a!)
+    inline at::Tensor & masked_fill_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & mask, const at::Tensor & value) {
+        return at::_ops::masked_fill__Tensor::redispatch(dispatchKeySet, self, mask, value);
+    }
+    
+    // aten::masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor
+    inline at::Tensor masked_fill(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mask, const at::Tensor & value) {
+        return at::_ops::masked_fill_Tensor::redispatch(dispatchKeySet, self, mask, value);
+    }
+    
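+    // [Editorial sketch, not part of the generated file] Every wrapper in this header takes an
+    // explicit c10::DispatchKeySet and re-enters the dispatcher with it, which is how a kernel
+    // registered for one dispatch key can forward to the remaining backends without recursing
+    // into itself. A hedged illustration only; the key and function names below are hypothetical:
+    //
+    //   at::Tensor masked_fill_forwarding(c10::DispatchKeySet ks, const at::Tensor & self,
+    //                                     const at::Tensor & mask, const at::Scalar & value) {
+    //       // Drop our own (hypothetical) key so the call proceeds to the next kernel in line.
+    //       c10::DispatchKeySet lower = ks.remove(c10::DispatchKey::TESTING_ONLY_GenericWrapper);
+    //       return masked_fill(lower, self, mask, value);
+    //   }
+    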
+    // aten::masked_scatter_(Tensor(a!) self, Tensor mask, Tensor source) -> Tensor(a!)
+    inline at::Tensor & masked_scatter_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & mask, const at::Tensor & source) {
+        return at::_ops::masked_scatter_::redispatch(dispatchKeySet, self, mask, source);
+    }
+    
+    // aten::masked_scatter(Tensor self, Tensor mask, Tensor source) -> Tensor
+    inline at::Tensor masked_scatter(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mask, const at::Tensor & source) {
+        return at::_ops::masked_scatter::redispatch(dispatchKeySet, self, mask, source);
+    }
+    
+    // aten::masked_scatter_backward(Tensor grad_output, Tensor mask, SymInt[] sizes) -> Tensor
+    inline at::Tensor masked_scatter_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & mask, at::IntArrayRef sizes) {
+        return at::_ops::masked_scatter_backward::redispatch(dispatchKeySet, grad_output, mask, c10::fromIntArrayRefSlow(sizes));
+    }
+    
+    // aten::masked_scatter_backward(Tensor grad_output, Tensor mask, SymInt[] sizes) -> Tensor
+    inline at::Tensor masked_scatter_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & mask, c10::SymIntArrayRef sizes) {
+        return at::_ops::masked_scatter_backward::redispatch(dispatchKeySet, grad_output, mask, sizes);
+    }
+    
+    // aten::_masked_softmax(Tensor self, Tensor mask, int? dim=None, int? mask_type=None) -> Tensor
+    inline at::Tensor _masked_softmax(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mask, c10::optional<int64_t> dim=c10::nullopt, c10::optional<int64_t> mask_type=c10::nullopt) {
+        return at::_ops::_masked_softmax::redispatch(dispatchKeySet, self, mask, dim, mask_type);
+    }
+    
+    // aten::_masked_softmax_backward(Tensor grad_output, Tensor output, Tensor mask, int? dim=None) -> Tensor
+    inline at::Tensor _masked_softmax_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & mask, c10::optional<int64_t> dim=c10::nullopt) {
+        return at::_ops::_masked_softmax_backward::redispatch(dispatchKeySet, grad_output, output, mask, dim);
+    }
+    
+    // aten::view(Tensor(a) self, SymInt[] size) -> Tensor(a)
+    inline at::Tensor view(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size) {
+        return at::_ops::view::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size));
+    }
+    
+    // aten::view(Tensor(a) self, SymInt[] size) -> Tensor(a)
+    inline at::Tensor view_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size) {
+        return at::_ops::view::redispatch(dispatchKeySet, self, size);
+    }
+    
+    // aten::view.dtype(Tensor(a) self, ScalarType dtype) -> Tensor(a)
+    inline at::Tensor view(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::ScalarType dtype) {
+        return at::_ops::view_dtype::redispatch(dispatchKeySet, self, dtype);
+    }
+    
+    // aten::put_(Tensor(a!) self, Tensor index, Tensor source, bool accumulate=False) -> Tensor(a!)
+    inline at::Tensor & put_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & index, const at::Tensor & source, bool accumulate=false) {
+        return at::_ops::put_::redispatch(dispatchKeySet, self, index, source, accumulate);
+    }
+    
+    // aten::put(Tensor self, Tensor index, Tensor source, bool accumulate=False) -> Tensor
+    inline at::Tensor put(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & index, const at::Tensor & source, bool accumulate=false) {
+        return at::_ops::put::redispatch(dispatchKeySet, self, index, source, accumulate);
+    }
+    
+    // aten::index_add.out(Tensor self, int dim, Tensor index, Tensor source, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & index_add_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source, const at::Scalar & alpha=1) {
+        return at::_ops::index_add_out::redispatch(dispatchKeySet, self, dim, index, source, alpha, out);
+    }
+    
+    // aten::index_add.out(Tensor self, int dim, Tensor index, Tensor source, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & index_add_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source, const at::Scalar & alpha, at::Tensor & out) {
+        return at::_ops::index_add_out::redispatch(dispatchKeySet, self, dim, index, source, alpha, out);
+    }
+    
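+    // [Editorial sketch, not part of the generated file] index_add_out and index_add_outf above
+    // expose the same operator with two argument orders: *_out takes the out tensor first so the
+    // trailing default (alpha=1) stays usable, while *_outf keeps the schema order with out last.
+    // A hedged usage sketch (variable names are hypothetical):
+    //
+    //   // Given a pre-allocated `result`, these two calls are equivalent:
+    //   //   index_add_out(ks, result, self, /*dim=*/0, index, source);              // alpha defaults to 1
+    //   //   index_add_outf(ks, self, /*dim=*/0, index, source, /*alpha=*/1, result);
+    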
+    // aten::index_add_(Tensor(a!) self, int dim, Tensor index, Tensor source, *, Scalar alpha=1) -> Tensor(a!)
+    inline at::Tensor & index_add_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source, const at::Scalar & alpha=1) {
+        return at::_ops::index_add_::redispatch(dispatchKeySet, self, dim, index, source, alpha);
+    }
+    
+    // aten::index_add(Tensor self, int dim, Tensor index, Tensor source, *, Scalar alpha=1) -> Tensor
+    inline at::Tensor index_add(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source, const at::Scalar & alpha=1) {
+        return at::_ops::index_add::redispatch(dispatchKeySet, self, dim, index, source, alpha);
+    }
+    
+    // aten::index_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor source, *, Scalar alpha=1) -> Tensor
+    inline at::Tensor index_add(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, const at::Tensor & index, const at::Tensor & source, const at::Scalar & alpha=1) {
+        return at::_ops::index_add_dimname::redispatch(dispatchKeySet, self, dim, index, source, alpha);
+    }
+    
+    // aten::index_reduce.out(Tensor self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & index_reduce_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source, c10::string_view reduce, bool include_self=true) {
+        return at::_ops::index_reduce_out::redispatch(dispatchKeySet, self, dim, index, source, reduce, include_self, out);
+    }
+    
+    // aten::index_reduce.out(Tensor self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & index_reduce_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source, c10::string_view reduce, bool include_self, at::Tensor & out) {
+        return at::_ops::index_reduce_out::redispatch(dispatchKeySet, self, dim, index, source, reduce, include_self, out);
+    }
+    
+    // aten::index_reduce_(Tensor(a!) self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True) -> Tensor(a!)
+    inline at::Tensor & index_reduce_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source, c10::string_view reduce, bool include_self=true) {
+        return at::_ops::index_reduce_::redispatch(dispatchKeySet, self, dim, index, source, reduce, include_self);
+    }
+    
+    // aten::index_reduce(Tensor self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True) -> Tensor
+    inline at::Tensor index_reduce(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source, c10::string_view reduce, bool include_self=true) {
+        return at::_ops::index_reduce::redispatch(dispatchKeySet, self, dim, index, source, reduce, include_self);
+    }
+    
+    // aten::index_fill_.int_Scalar(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!)
+    inline at::Tensor & index_fill_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Scalar & value) {
+        return at::_ops::index_fill__int_Scalar::redispatch(dispatchKeySet, self, dim, index, value);
+    }
+    
+    // aten::index_fill.int_Scalar(Tensor self, int dim, Tensor index, Scalar value) -> Tensor
+    inline at::Tensor index_fill(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Scalar & value) {
+        return at::_ops::index_fill_int_Scalar::redispatch(dispatchKeySet, self, dim, index, value);
+    }
+    
+    // aten::index_fill_.int_Tensor(Tensor(a!) self, int dim, Tensor index, Tensor value) -> Tensor(a!)
+    inline at::Tensor & index_fill_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & value) {
+        return at::_ops::index_fill__int_Tensor::redispatch(dispatchKeySet, self, dim, index, value);
+    }
+    
+    // aten::index_fill.int_Tensor(Tensor self, int dim, Tensor index, Tensor value) -> Tensor
+    inline at::Tensor index_fill(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & value) {
+        return at::_ops::index_fill_int_Tensor::redispatch(dispatchKeySet, self, dim, index, value);
+    }
+    
+    // aten::index_fill_.Dimname_Scalar(Tensor(a!) self, Dimname dim, Tensor index, Scalar value) -> Tensor(a!)
+    inline at::Tensor & index_fill_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, at::Dimname dim, const at::Tensor & index, const at::Scalar & value) {
+        return at::_ops::index_fill__Dimname_Scalar::redispatch(dispatchKeySet, self, dim, index, value);
+    }
+    
+    // aten::index_fill_.Dimname_Tensor(Tensor(a!) self, Dimname dim, Tensor index, Tensor value) -> Tensor(a!)
+    inline at::Tensor & index_fill_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, at::Dimname dim, const at::Tensor & index, const at::Tensor & value) {
+        return at::_ops::index_fill__Dimname_Tensor::redispatch(dispatchKeySet, self, dim, index, value);
+    }
+    
+    // aten::index_fill.Dimname_Scalar(Tensor self, Dimname dim, Tensor index, Scalar value) -> Tensor
+    inline at::Tensor index_fill(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, const at::Tensor & index, const at::Scalar & value) {
+        return at::_ops::index_fill_Dimname_Scalar::redispatch(dispatchKeySet, self, dim, index, value);
+    }
+    
+    // aten::index_fill.Dimname_Tensor(Tensor self, Dimname dim, Tensor index, Tensor value) -> Tensor
+    inline at::Tensor index_fill(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, const at::Tensor & index, const at::Tensor & value) {
+        return at::_ops::index_fill_Dimname_Tensor::redispatch(dispatchKeySet, self, dim, index, value);
+    }
+    
+    // aten::scatter.src(Tensor self, int dim, Tensor index, Tensor src) -> Tensor
+    inline at::Tensor scatter(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src) {
+        return at::_ops::scatter_src::redispatch(dispatchKeySet, self, dim, index, src);
+    }
+    
+    // aten::scatter_.src(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!)
+    inline at::Tensor & scatter_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src) {
+        return at::_ops::scatter__src::redispatch(dispatchKeySet, self, dim, index, src);
+    }
+    
+    // aten::scatter.src_out(Tensor self, int dim, Tensor index, Tensor src, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & scatter_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src) {
+        return at::_ops::scatter_src_out::redispatch(dispatchKeySet, self, dim, index, src, out);
+    }
+    
+    // aten::scatter.src_out(Tensor self, int dim, Tensor index, Tensor src, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & scatter_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, at::Tensor & out) {
+        return at::_ops::scatter_src_out::redispatch(dispatchKeySet, self, dim, index, src, out);
+    }
+    
+    // aten::scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> Tensor
+    inline at::Tensor scatter(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Scalar & value) {
+        return at::_ops::scatter_value::redispatch(dispatchKeySet, self, dim, index, value);
+    }
+    
+    // aten::scatter_.value(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!)
+    inline at::Tensor & scatter_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Scalar & value) {
+        return at::_ops::scatter__value::redispatch(dispatchKeySet, self, dim, index, value);
+    }
+    
+    // aten::scatter.value_out(Tensor self, int dim, Tensor index, Scalar value, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & scatter_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Scalar & value) {
+        return at::_ops::scatter_value_out::redispatch(dispatchKeySet, self, dim, index, value, out);
+    }
+    
+    // aten::scatter.value_out(Tensor self, int dim, Tensor index, Scalar value, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & scatter_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Scalar & value, at::Tensor & out) {
+        return at::_ops::scatter_value_out::redispatch(dispatchKeySet, self, dim, index, value, out);
+    }
+    
+    // aten::scatter.reduce(Tensor self, int dim, Tensor index, Tensor src, *, str reduce) -> Tensor
+    inline at::Tensor scatter(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce) {
+        return at::_ops::scatter_reduce::redispatch(dispatchKeySet, self, dim, index, src, reduce);
+    }
+    
+    // aten::scatter_.reduce(Tensor(a!) self, int dim, Tensor index, Tensor src, *, str reduce) -> Tensor(a!)
+    inline at::Tensor & scatter_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce) {
+        return at::_ops::scatter__reduce::redispatch(dispatchKeySet, self, dim, index, src, reduce);
+    }
+    
+    // aten::scatter.reduce_out(Tensor self, int dim, Tensor index, Tensor src, *, str reduce, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & scatter_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce) {
+        return at::_ops::scatter_reduce_out::redispatch(dispatchKeySet, self, dim, index, src, reduce, out);
+    }
+    
+    // aten::scatter.reduce_out(Tensor self, int dim, Tensor index, Tensor src, *, str reduce, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & scatter_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, at::Tensor & out) {
+        return at::_ops::scatter_reduce_out::redispatch(dispatchKeySet, self, dim, index, src, reduce, out);
+    }
+    
+    // aten::scatter.value_reduce(Tensor self, int dim, Tensor index, Scalar value, *, str reduce) -> Tensor
+    inline at::Tensor scatter(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Scalar & value, c10::string_view reduce) {
+        return at::_ops::scatter_value_reduce::redispatch(dispatchKeySet, self, dim, index, value, reduce);
+    }
+    
+    // aten::scatter_.value_reduce(Tensor(a!) self, int dim, Tensor index, Scalar value, *, str reduce) -> Tensor(a!)
+    inline at::Tensor & scatter_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Scalar & value, c10::string_view reduce) {
+        return at::_ops::scatter__value_reduce::redispatch(dispatchKeySet, self, dim, index, value, reduce);
+    }
+    
+    // aten::scatter.value_reduce_out(Tensor self, int dim, Tensor index, Scalar value, *, str reduce, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & scatter_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Scalar & value, c10::string_view reduce) {
+        return at::_ops::scatter_value_reduce_out::redispatch(dispatchKeySet, self, dim, index, value, reduce, out);
+    }
+    
+    // aten::scatter.value_reduce_out(Tensor self, int dim, Tensor index, Scalar value, *, str reduce, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & scatter_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Scalar & value, c10::string_view reduce, at::Tensor & out) {
+        return at::_ops::scatter_value_reduce_out::redispatch(dispatchKeySet, self, dim, index, value, reduce, out);
+    }
+    
+    // aten::scatter.dimname_src(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor
+    inline at::Tensor scatter(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, const at::Tensor & index, const at::Tensor & src) {
+        return at::_ops::scatter_dimname_src::redispatch(dispatchKeySet, self, dim, index, src);
+    }
+    
+    // aten::scatter.dimname_value(Tensor self, Dimname dim, Tensor index, Scalar value) -> Tensor
+    inline at::Tensor scatter(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, const at::Tensor & index, const at::Scalar & value) {
+        return at::_ops::scatter_dimname_value::redispatch(dispatchKeySet, self, dim, index, value);
+    }
+    
+    // aten::scatter_add(Tensor self, int dim, Tensor index, Tensor src) -> Tensor
+    inline at::Tensor scatter_add(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src) {
+        return at::_ops::scatter_add::redispatch(dispatchKeySet, self, dim, index, src);
+    }
+    
+    // aten::scatter_add_(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!)
+    inline at::Tensor & scatter_add_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src) {
+        return at::_ops::scatter_add_::redispatch(dispatchKeySet, self, dim, index, src);
+    }
+    
+    // aten::scatter_add.out(Tensor self, int dim, Tensor index, Tensor src, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & scatter_add_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src) {
+        return at::_ops::scatter_add_out::redispatch(dispatchKeySet, self, dim, index, src, out);
+    }
+    
+    // aten::scatter_add.out(Tensor self, int dim, Tensor index, Tensor src, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & scatter_add_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, at::Tensor & out) {
+        return at::_ops::scatter_add_out::redispatch(dispatchKeySet, self, dim, index, src, out);
+    }
+    
+    // aten::scatter_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor
+    inline at::Tensor scatter_add(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, const at::Tensor & index, const at::Tensor & src) {
+        return at::_ops::scatter_add_dimname::redispatch(dispatchKeySet, self, dim, index, src);
+    }
+    
+    // aten::scatter_reduce.two(Tensor self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor
+    inline at::Tensor scatter_reduce(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self=true) {
+        return at::_ops::scatter_reduce_two::redispatch(dispatchKeySet, self, dim, index, src, reduce, include_self);
+    }
+    
+    // aten::scatter_reduce_.two(Tensor(a!) self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor(a!)
+    inline at::Tensor & scatter_reduce_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self=true) {
+        return at::_ops::scatter_reduce__two::redispatch(dispatchKeySet, self, dim, index, src, reduce, include_self);
+    }
+    
+    // aten::scatter_reduce.two_out(Tensor self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & scatter_reduce_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self=true) {
+        return at::_ops::scatter_reduce_two_out::redispatch(dispatchKeySet, self, dim, index, src, reduce, include_self, out);
+    }
+    
+    // aten::scatter_reduce.two_out(Tensor self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & scatter_reduce_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self, at::Tensor & out) {
+        return at::_ops::scatter_reduce_two_out::redispatch(dispatchKeySet, self, dim, index, src, reduce, include_self, out);
+    }
+    
+    // aten::eq_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+    inline at::Tensor & eq_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::eq__Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::eq_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & eq_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::eq__Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::bitwise_and.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bitwise_and_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::bitwise_and_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::bitwise_and.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bitwise_and_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::bitwise_and_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::bitwise_and.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bitwise_and_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::bitwise_and_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::bitwise_and.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bitwise_and_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out) {
+        return at::_ops::bitwise_and_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::bitwise_and.Scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor bitwise_and(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::bitwise_and_Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::bitwise_and.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
+    inline at::Tensor bitwise_and(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & other) {
+        return at::_ops::bitwise_and_Scalar_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::bitwise_and.Tensor(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor bitwise_and(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::bitwise_and_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::bitwise_and_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+    inline at::Tensor & bitwise_and_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::bitwise_and__Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::bitwise_and_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & bitwise_and_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::bitwise_and__Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::__and__.Scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor __and__(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::__and___Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::__and__.Tensor(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor __and__(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::__and___Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::__iand__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+    inline at::Tensor & __iand__(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::__iand___Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::__iand__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & __iand__(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::__iand___Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::bitwise_or.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bitwise_or_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::bitwise_or_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::bitwise_or.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bitwise_or_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::bitwise_or_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::bitwise_or.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bitwise_or_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::bitwise_or_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::bitwise_or.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bitwise_or_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out) {
+        return at::_ops::bitwise_or_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::bitwise_or.Scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor bitwise_or(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::bitwise_or_Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::bitwise_or.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
+    inline at::Tensor bitwise_or(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & other) {
+        return at::_ops::bitwise_or_Scalar_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::bitwise_or.Tensor(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor bitwise_or(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::bitwise_or_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::bitwise_or_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+    inline at::Tensor & bitwise_or_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::bitwise_or__Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::bitwise_or_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & bitwise_or_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::bitwise_or__Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::__or__.Scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor __or__(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::__or___Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::__or__.Tensor(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor __or__(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::__or___Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::__ior__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+    inline at::Tensor & __ior__(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::__ior___Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::__ior__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & __ior__(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::__ior___Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::bitwise_xor.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bitwise_xor_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::bitwise_xor_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::bitwise_xor.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bitwise_xor_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::bitwise_xor_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::bitwise_xor.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bitwise_xor_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::bitwise_xor_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::bitwise_xor.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bitwise_xor_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out) {
+        return at::_ops::bitwise_xor_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::bitwise_xor.Scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor bitwise_xor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::bitwise_xor_Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::bitwise_xor.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
+    inline at::Tensor bitwise_xor(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & other) {
+        return at::_ops::bitwise_xor_Scalar_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::bitwise_xor.Tensor(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor bitwise_xor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::bitwise_xor_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::bitwise_xor_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+    inline at::Tensor & bitwise_xor_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::bitwise_xor__Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::bitwise_xor_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & bitwise_xor_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::bitwise_xor__Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::__xor__.Scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor __xor__(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::__xor___Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::__xor__.Tensor(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor __xor__(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::__xor___Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::__ixor__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+    inline at::Tensor & __ixor__(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::__ixor___Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::__ixor__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & __ixor__(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::__ixor___Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::__lshift__.Scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor __lshift__(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::__lshift___Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::__lshift__.Tensor(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor __lshift__(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::__lshift___Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::__ilshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+    inline at::Tensor & __ilshift__(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::__ilshift___Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::__ilshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & __ilshift__(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::__ilshift___Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::bitwise_left_shift.Tensor(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor bitwise_left_shift(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::bitwise_left_shift_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::bitwise_left_shift_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & bitwise_left_shift_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::bitwise_left_shift__Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::bitwise_left_shift.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bitwise_left_shift_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::bitwise_left_shift_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::bitwise_left_shift.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bitwise_left_shift_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::bitwise_left_shift_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::bitwise_left_shift.Tensor_Scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor bitwise_left_shift(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::bitwise_left_shift_Tensor_Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::bitwise_left_shift_.Tensor_Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+    inline at::Tensor & bitwise_left_shift_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::bitwise_left_shift__Tensor_Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::bitwise_left_shift.Tensor_Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bitwise_left_shift_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::bitwise_left_shift_Tensor_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::bitwise_left_shift.Tensor_Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bitwise_left_shift_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out) {
+        return at::_ops::bitwise_left_shift_Tensor_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::bitwise_left_shift.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
+    inline at::Tensor bitwise_left_shift(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & other) {
+        return at::_ops::bitwise_left_shift_Scalar_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::__rshift__.Scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor __rshift__(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::__rshift___Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::__rshift__.Tensor(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor __rshift__(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::__rshift___Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::__irshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+    inline at::Tensor & __irshift__(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::__irshift___Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::__irshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & __irshift__(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::__irshift___Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::bitwise_right_shift.Tensor(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor bitwise_right_shift(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::bitwise_right_shift_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::bitwise_right_shift_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & bitwise_right_shift_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::bitwise_right_shift__Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::bitwise_right_shift.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bitwise_right_shift_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::bitwise_right_shift_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::bitwise_right_shift.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bitwise_right_shift_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::bitwise_right_shift_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::bitwise_right_shift.Tensor_Scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor bitwise_right_shift(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::bitwise_right_shift_Tensor_Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::bitwise_right_shift_.Tensor_Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+    inline at::Tensor & bitwise_right_shift_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::bitwise_right_shift__Tensor_Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::bitwise_right_shift.Tensor_Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bitwise_right_shift_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::bitwise_right_shift_Tensor_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::bitwise_right_shift.Tensor_Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bitwise_right_shift_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out) {
+        return at::_ops::bitwise_right_shift_Tensor_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::bitwise_right_shift.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
+    inline at::Tensor bitwise_right_shift(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & other) {
+        return at::_ops::bitwise_right_shift_Scalar_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!)
+    inline at::Tensor & tril_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t diagonal=0) {
+        return at::_ops::tril_::redispatch(dispatchKeySet, self, diagonal);
+    }
+    
+    // aten::triu_(Tensor(a!) self, int diagonal=0) -> Tensor(a!)
+    inline at::Tensor & triu_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t diagonal=0) {
+        return at::_ops::triu_::redispatch(dispatchKeySet, self, diagonal);
+    }
+    
+    // aten::digamma_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & digamma_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::digamma_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::lerp_.Scalar(Tensor(a!) self, Tensor end, Scalar weight) -> Tensor(a!)
+    inline at::Tensor & lerp_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & end, const at::Scalar & weight) {
+        return at::_ops::lerp__Scalar::redispatch(dispatchKeySet, self, end, weight);
+    }
+    
+    // aten::lerp_.Tensor(Tensor(a!) self, Tensor end, Tensor weight) -> Tensor(a!)
+    inline at::Tensor & lerp_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & end, const at::Tensor & weight) {
+        return at::_ops::lerp__Tensor::redispatch(dispatchKeySet, self, end, weight);
+    }
+    
+    // aten::addbmm_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
+    inline at::Tensor & addbmm_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta=1, const at::Scalar & alpha=1) {
+        return at::_ops::addbmm_::redispatch(dispatchKeySet, self, batch1, batch2, beta, alpha);
+    }
+    
+    // aten::addbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & addbmm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta=1, const at::Scalar & alpha=1) {
+        return at::_ops::addbmm_out::redispatch(dispatchKeySet, self, batch1, batch2, beta, alpha, out);
+    }
+    
+    // aten::addbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & addbmm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out) {
+        return at::_ops::addbmm_out::redispatch(dispatchKeySet, self, batch1, batch2, beta, alpha, out);
+    }
+    
+    // aten::addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
+    inline at::Tensor addbmm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta=1, const at::Scalar & alpha=1) {
+        return at::_ops::addbmm::redispatch(dispatchKeySet, self, batch1, batch2, beta, alpha);
+    }
+    
+    // aten::random_.from(Tensor(a!) self, int from, int? to, *, Generator? generator=None) -> Tensor(a!)
+    inline at::Tensor & random_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t from, c10::optional<int64_t> to, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::random__from::redispatch(dispatchKeySet, self, from, to, generator);
+    }
+    
+    // aten::random_.to(Tensor(a!) self, int to, *, Generator? generator=None) -> Tensor(a!)
+    inline at::Tensor & random_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t to, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::random__to::redispatch(dispatchKeySet, self, to, generator);
+    }
+    
+    // aten::random_(Tensor(a!) self, *, Generator? generator=None) -> Tensor(a!)
+    inline at::Tensor & random_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::random_::redispatch(dispatchKeySet, self, generator);
+    }
+    
+    // aten::uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!)
+    inline at::Tensor & uniform_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, double from=0, double to=1, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::uniform_::redispatch(dispatchKeySet, self, from, to, generator);
+    }
+    
+    // aten::cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!)
+    inline at::Tensor & cauchy_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, double median=0, double sigma=1, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::cauchy_::redispatch(dispatchKeySet, self, median, sigma, generator);
+    }
+    
+    // aten::log_normal_(Tensor(a!) self, float mean=1, float std=2, *, Generator? generator=None) -> Tensor(a!)
+    inline at::Tensor & log_normal_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, double mean=1, double std=2, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::log_normal_::redispatch(dispatchKeySet, self, mean, std, generator);
+    }
+    
+    // aten::exponential_(Tensor(a!) self, float lambd=1, *, Generator? generator=None) -> Tensor(a!)
+    inline at::Tensor & exponential_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, double lambd=1, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::exponential_::redispatch(dispatchKeySet, self, lambd, generator);
+    }
+    
+    // aten::geometric_(Tensor(a!) self, float p, *, Generator? generator=None) -> Tensor(a!)
+    inline at::Tensor & geometric_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, double p, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::geometric_::redispatch(dispatchKeySet, self, p, generator);
+    }
+    
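+    // Illustrative usage sketch -- not part of the generated op surface. It assumes these
+    // wrappers live in the at::redispatch namespace of ATen/RedispatchFunctions.h and that
+    // c10::after_autograd_keyset is available, as in the generated autograd kernels. Every
+    // in-place sampler above does one thing: forward its arguments, plus the caller-supplied
+    // DispatchKeySet, to at::_ops::<op>::redispatch, so a kernel can re-enter the dispatcher
+    // below its own key.
+    inline at::Tensor & example_redispatch_uniform_(c10::DispatchKeySet ks, at::Tensor & self) {
+        // Mask out the autograd keys, then redispatch aten::uniform_ with its default
+        // range [0, 1) and no explicit generator.
+        return uniform_(ks & c10::after_autograd_keyset, self, /*from=*/0, /*to=*/1, c10::nullopt);
+    }
+    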
+    // aten::diag.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & diag_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t diagonal=0) {
+        return at::_ops::diag_out::redispatch(dispatchKeySet, self, diagonal, out);
+    }
+    
+    // aten::diag.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & diag_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t diagonal, at::Tensor & out) {
+        return at::_ops::diag_out::redispatch(dispatchKeySet, self, diagonal, out);
+    }
+    
+    // aten::diag(Tensor self, int diagonal=0) -> Tensor
+    inline at::Tensor diag(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t diagonal=0) {
+        return at::_ops::diag::redispatch(dispatchKeySet, self, diagonal);
+    }
+    
+    // aten::cross.out(Tensor self, Tensor other, int? dim=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cross_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other, c10::optional<int64_t> dim=c10::nullopt) {
+        return at::_ops::cross_out::redispatch(dispatchKeySet, self, other, dim, out);
+    }
+    
+    // aten::cross.out(Tensor self, Tensor other, int? dim=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cross_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, c10::optional<int64_t> dim, at::Tensor & out) {
+        return at::_ops::cross_out::redispatch(dispatchKeySet, self, other, dim, out);
+    }
+    
+    // aten::cross(Tensor self, Tensor other, int? dim=None) -> Tensor
+    inline at::Tensor cross(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, c10::optional<int64_t> dim=c10::nullopt) {
+        return at::_ops::cross::redispatch(dispatchKeySet, self, other, dim);
+    }
+    
+    // aten::triu.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & triu_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t diagonal=0) {
+        return at::_ops::triu_out::redispatch(dispatchKeySet, self, diagonal, out);
+    }
+    
+    // aten::triu.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & triu_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t diagonal, at::Tensor & out) {
+        return at::_ops::triu_out::redispatch(dispatchKeySet, self, diagonal, out);
+    }
+    
+    // aten::triu(Tensor self, int diagonal=0) -> Tensor
+    inline at::Tensor triu(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t diagonal=0) {
+        return at::_ops::triu::redispatch(dispatchKeySet, self, diagonal);
+    }
+    
+    // aten::tril.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & tril_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t diagonal=0) {
+        return at::_ops::tril_out::redispatch(dispatchKeySet, self, diagonal, out);
+    }
+    
+    // aten::tril.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & tril_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t diagonal, at::Tensor & out) {
+        return at::_ops::tril_out::redispatch(dispatchKeySet, self, diagonal, out);
+    }
+    
+    // aten::tril(Tensor self, int diagonal=0) -> Tensor
+    inline at::Tensor tril(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t diagonal=0) {
+        return at::_ops::tril::redispatch(dispatchKeySet, self, diagonal);
+    }
+    
+    // aten::tril_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor tril_indices(c10::DispatchKeySet dispatchKeySet, int64_t row, int64_t col, int64_t offset=0, at::TensorOptions options=at::kLong) {
+        return at::_ops::tril_indices::redispatch(dispatchKeySet, row, col, offset, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::tril_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor tril_indices(c10::DispatchKeySet dispatchKeySet, int64_t row, int64_t col, int64_t offset, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::tril_indices::redispatch(dispatchKeySet, row, col, offset, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::triu_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor triu_indices(c10::DispatchKeySet dispatchKeySet, int64_t row, int64_t col, int64_t offset=0, at::TensorOptions options=at::kLong) {
+        return at::_ops::triu_indices::redispatch(dispatchKeySet, row, col, offset, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::triu_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor triu_indices(c10::DispatchKeySet dispatchKeySet, int64_t row, int64_t col, int64_t offset, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::triu_indices::redispatch(dispatchKeySet, row, col, offset, dtype, layout, device, pin_memory);
+    }
+    
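+    // Illustrative sketch (same namespace assumption as the sketch above, and the function
+    // name is hypothetical): the factory ops tril_indices / triu_indices expose the same
+    // information twice -- once packed into an at::TensorOptions, once as the four optionals
+    // the dispatcher schema actually carries. The packed overload simply unpacks into the
+    // optional overload, so the two calls below produce the same tensor.
+    inline at::Tensor example_tril_indices_long(c10::DispatchKeySet ks, int64_t row, int64_t col) {
+        // Packed form; kLong matches the schema default dtype=long.
+        at::Tensor a = tril_indices(ks, row, col, /*offset=*/0, at::TensorOptions().dtype(at::kLong));
+        // Unpacked form; only dtype is set, the remaining optionals stay unset.
+        at::Tensor b = tril_indices(ks, row, col, /*offset=*/0, at::ScalarType::Long, c10::nullopt, c10::nullopt, c10::nullopt);
+        return a.equal(b) ? a : b;
+    }
+    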
+    // aten::trace(Tensor self) -> Tensor
+    inline at::Tensor trace(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::trace::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::trace_backward(Tensor grad, SymInt[] sizes) -> Tensor
+    inline at::Tensor trace_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, at::IntArrayRef sizes) {
+        return at::_ops::trace_backward::redispatch(dispatchKeySet, grad, c10::fromIntArrayRefSlow(sizes));
+    }
+    
+    // aten::trace_backward(Tensor grad, SymInt[] sizes) -> Tensor
+    inline at::Tensor trace_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, c10::SymIntArrayRef sizes) {
+        return at::_ops::trace_backward::redispatch(dispatchKeySet, grad, sizes);
+    }
+    
+    // aten::ne.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & ne_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::ne_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::ne.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & ne_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out) {
+        return at::_ops::ne_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::ne.Scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor ne(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::ne_Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::ne.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & ne_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::ne_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::ne.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & ne_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::ne_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::ne.Tensor(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor ne(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::ne_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::ne_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+    inline at::Tensor & ne_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::ne__Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::ne_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & ne_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::ne__Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::not_equal.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & not_equal_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::not_equal_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::not_equal.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & not_equal_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out) {
+        return at::_ops::not_equal_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::not_equal.Scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor not_equal(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::not_equal_Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::not_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & not_equal_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::not_equal_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::not_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & not_equal_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::not_equal_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::not_equal.Tensor(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor not_equal(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::not_equal_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::not_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+    inline at::Tensor & not_equal_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::not_equal__Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::not_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & not_equal_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::not_equal__Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::eq.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & eq_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::eq_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::eq.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & eq_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out) {
+        return at::_ops::eq_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::eq.Scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor eq(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::eq_Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::eq.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & eq_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::eq_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::eq.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & eq_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::eq_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::eq.Tensor(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor eq(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::eq_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::ge.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & ge_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::ge_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::ge.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & ge_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out) {
+        return at::_ops::ge_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::ge.Scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor ge(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::ge_Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::ge.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & ge_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::ge_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::ge.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & ge_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::ge_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::ge.Tensor(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor ge(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::ge_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::ge_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+    inline at::Tensor & ge_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::ge__Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::ge_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & ge_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::ge__Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::greater_equal.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & greater_equal_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::greater_equal_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::greater_equal.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & greater_equal_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out) {
+        return at::_ops::greater_equal_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::greater_equal.Scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor greater_equal(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::greater_equal_Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::greater_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & greater_equal_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::greater_equal_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::greater_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & greater_equal_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::greater_equal_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::greater_equal.Tensor(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor greater_equal(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::greater_equal_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::greater_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+    inline at::Tensor & greater_equal_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::greater_equal__Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::greater_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & greater_equal_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::greater_equal__Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::le.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & le_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::le_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::le.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & le_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out) {
+        return at::_ops::le_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::le.Scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor le(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::le_Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::le.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & le_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::le_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::le.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & le_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::le_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::le.Tensor(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor le(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::le_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::le_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+    inline at::Tensor & le_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::le__Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::le_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & le_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::le__Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::less_equal.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & less_equal_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::less_equal_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::less_equal.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & less_equal_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out) {
+        return at::_ops::less_equal_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::less_equal.Scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor less_equal(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::less_equal_Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::less_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & less_equal_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::less_equal_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::less_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & less_equal_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::less_equal_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::less_equal.Tensor(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor less_equal(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::less_equal_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::less_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+    inline at::Tensor & less_equal_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::less_equal__Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::less_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & less_equal_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::less_equal__Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::gt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & gt_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::gt_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::gt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & gt_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out) {
+        return at::_ops::gt_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::gt.Scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor gt(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::gt_Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::gt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & gt_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::gt_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::gt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & gt_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::gt_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::gt.Tensor(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor gt(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::gt_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::gt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+    inline at::Tensor & gt_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::gt__Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::gt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & gt_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::gt__Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::greater.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & greater_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::greater_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::greater.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & greater_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out) {
+        return at::_ops::greater_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::greater.Scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor greater(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::greater_Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::greater.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & greater_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::greater_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::greater.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & greater_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::greater_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::greater.Tensor(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor greater(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::greater_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::greater_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+    inline at::Tensor & greater_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::greater__Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::greater_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & greater_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::greater__Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::lt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & lt_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::lt_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::lt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & lt_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out) {
+        return at::_ops::lt_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::lt.Scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor lt(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::lt_Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::lt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & lt_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::lt_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::lt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & lt_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::lt_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::lt.Tensor(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor lt(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::lt_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::lt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+    inline at::Tensor & lt_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::lt__Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::lt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & lt_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::lt__Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::less.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & less_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::less_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::less.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & less_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out) {
+        return at::_ops::less_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::less.Scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor less(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::less_Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::less.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & less_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::less_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::less.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & less_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::less_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::less.Tensor(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor less(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::less_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::less_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+    inline at::Tensor & less_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::less__Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::less_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & less_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::less__Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::take.out(Tensor self, Tensor index, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & take_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & index) {
+        return at::_ops::take_out::redispatch(dispatchKeySet, self, index, out);
+    }
+    
+    // aten::take.out(Tensor self, Tensor index, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & take_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & index, at::Tensor & out) {
+        return at::_ops::take_out::redispatch(dispatchKeySet, self, index, out);
+    }
+    
+    // aten::take(Tensor self, Tensor index) -> Tensor
+    inline at::Tensor take(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & index) {
+        return at::_ops::take::redispatch(dispatchKeySet, self, index);
+    }
+    
+    // aten::take_along_dim.out(Tensor self, Tensor indices, int? dim=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & take_along_dim_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & indices, c10::optional<int64_t> dim=c10::nullopt) {
+        return at::_ops::take_along_dim_out::redispatch(dispatchKeySet, self, indices, dim, out);
+    }
+    
+    // aten::take_along_dim.out(Tensor self, Tensor indices, int? dim=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & take_along_dim_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & indices, c10::optional<int64_t> dim, at::Tensor & out) {
+        return at::_ops::take_along_dim_out::redispatch(dispatchKeySet, self, indices, dim, out);
+    }
+    
+    // aten::take_along_dim(Tensor self, Tensor indices, int? dim=None) -> Tensor
+    inline at::Tensor take_along_dim(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & indices, c10::optional<int64_t> dim=c10::nullopt) {
+        return at::_ops::take_along_dim::redispatch(dispatchKeySet, self, indices, dim);
+    }
+    
+    // aten::index_select.out(Tensor self, int dim, Tensor index, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & index_select_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index) {
+        return at::_ops::index_select_out::redispatch(dispatchKeySet, self, dim, index, out);
+    }
+    
+    // aten::index_select.out(Tensor self, int dim, Tensor index, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & index_select_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, at::Tensor & out) {
+        return at::_ops::index_select_out::redispatch(dispatchKeySet, self, dim, index, out);
+    }
+    
+    // aten::index_select(Tensor self, int dim, Tensor index) -> Tensor
+    inline at::Tensor index_select(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index) {
+        return at::_ops::index_select::redispatch(dispatchKeySet, self, dim, index);
+    }
+    
+    // aten::index_select.dimname_out(Tensor self, Dimname dim, Tensor index, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & index_select_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::Dimname dim, const at::Tensor & index) {
+        return at::_ops::index_select_dimname_out::redispatch(dispatchKeySet, self, dim, index, out);
+    }
+    
+    // aten::index_select.dimname_out(Tensor self, Dimname dim, Tensor index, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & index_select_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, const at::Tensor & index, at::Tensor & out) {
+        return at::_ops::index_select_dimname_out::redispatch(dispatchKeySet, self, dim, index, out);
+    }
+    
+    // aten::index_select.dimname(Tensor self, Dimname dim, Tensor index) -> Tensor
+    inline at::Tensor index_select(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, const at::Tensor & index) {
+        return at::_ops::index_select_dimname::redispatch(dispatchKeySet, self, dim, index);
+    }
+    
+    // aten::index_select_backward(Tensor grad, SymInt[] self_sizes, int dim, Tensor index) -> Tensor
+    inline at::Tensor index_select_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, at::IntArrayRef self_sizes, int64_t dim, const at::Tensor & index) {
+        return at::_ops::index_select_backward::redispatch(dispatchKeySet, grad, c10::fromIntArrayRefSlow(self_sizes), dim, index);
+    }
+    
+    // aten::index_select_backward(Tensor grad, SymInt[] self_sizes, int dim, Tensor index) -> Tensor
+    inline at::Tensor index_select_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, c10::SymIntArrayRef self_sizes, int64_t dim, const at::Tensor & index) {
+        return at::_ops::index_select_backward::redispatch(dispatchKeySet, grad, self_sizes, dim, index);
+    }
+    
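+    // Illustrative sketch (same namespace assumption; the function name is hypothetical):
+    // ops whose schema uses SymInt[] get two wrappers -- a concrete at::IntArrayRef one that
+    // converts through c10::fromIntArrayRefSlow, and a *_symint one that forwards symbolic
+    // sizes untouched. With concrete shapes the two are interchangeable.
+    inline at::Tensor example_index_select_backward(c10::DispatchKeySet ks, const at::Tensor & grad, at::IntArrayRef self_sizes, int64_t dim, const at::Tensor & index) {
+        // Concrete path: plain int64_t sizes are wrapped into SymInts by the wrapper itself.
+        // The symbolic path would be index_select_backward_symint(ks, grad,
+        // c10::fromIntArrayRefSlow(self_sizes), dim, index) and yields the same result here.
+        return index_select_backward(ks, grad, self_sizes, dim, index);
+    }
+    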
+    // aten::masked_select.out(Tensor self, Tensor mask, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & masked_select_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & mask) {
+        return at::_ops::masked_select_out::redispatch(dispatchKeySet, self, mask, out);
+    }
+    
+    // aten::masked_select.out(Tensor self, Tensor mask, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & masked_select_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mask, at::Tensor & out) {
+        return at::_ops::masked_select_out::redispatch(dispatchKeySet, self, mask, out);
+    }
+    
+    // aten::masked_select(Tensor self, Tensor mask) -> Tensor
+    inline at::Tensor masked_select(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mask) {
+        return at::_ops::masked_select::redispatch(dispatchKeySet, self, mask);
+    }
+    
+    // aten::masked_select_backward(Tensor grad, Tensor input, Tensor mask) -> Tensor
+    inline at::Tensor masked_select_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & input, const at::Tensor & mask) {
+        return at::_ops::masked_select_backward::redispatch(dispatchKeySet, grad, input, mask);
+    }
+    
+    // aten::nonzero.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & nonzero_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::nonzero_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::nonzero.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & nonzero_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::nonzero_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::nonzero(Tensor self) -> Tensor
+    inline at::Tensor nonzero(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::nonzero::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::nonzero_static.out(Tensor self, *, int size, int fill_value=-1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & nonzero_static_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t size, int64_t fill_value=-1) {
+        return at::_ops::nonzero_static_out::redispatch(dispatchKeySet, self, size, fill_value, out);
+    }
+    
+    // aten::nonzero_static.out(Tensor self, *, int size, int fill_value=-1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & nonzero_static_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t size, int64_t fill_value, at::Tensor & out) {
+        return at::_ops::nonzero_static_out::redispatch(dispatchKeySet, self, size, fill_value, out);
+    }
+    
+    // aten::nonzero_static(Tensor self, *, int size, int fill_value=-1) -> Tensor
+    inline at::Tensor nonzero_static(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t size, int64_t fill_value=-1) {
+        return at::_ops::nonzero_static::redispatch(dispatchKeySet, self, size, fill_value);
+    }
+    
+    // aten::nonzero_numpy(Tensor self) -> Tensor[]
+    inline ::std::vector<at::Tensor> nonzero_numpy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::nonzero_numpy::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::argwhere(Tensor self) -> Tensor
+    inline at::Tensor argwhere(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::argwhere::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::gather.out(Tensor self, int dim, Tensor index, *, bool sparse_grad=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & gather_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index, bool sparse_grad=false) {
+        return at::_ops::gather_out::redispatch(dispatchKeySet, self, dim, index, sparse_grad, out);
+    }
+    
+    // aten::gather.out(Tensor self, int dim, Tensor index, *, bool sparse_grad=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & gather_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, bool sparse_grad, at::Tensor & out) {
+        return at::_ops::gather_out::redispatch(dispatchKeySet, self, dim, index, sparse_grad, out);
+    }
+    
+    // aten::gather(Tensor self, int dim, Tensor index, *, bool sparse_grad=False) -> Tensor
+    inline at::Tensor gather(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, bool sparse_grad=false) {
+        return at::_ops::gather::redispatch(dispatchKeySet, self, dim, index, sparse_grad);
+    }
+    
+    // aten::gather_backward(Tensor grad, Tensor self, int dim, Tensor index, bool sparse_grad) -> Tensor
+    inline at::Tensor gather_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & self, int64_t dim, const at::Tensor & index, bool sparse_grad) {
+        return at::_ops::gather_backward::redispatch(dispatchKeySet, grad, self, dim, index, sparse_grad);
+    }
+    
+    // aten::gather.dimname_out(Tensor self, Dimname dim, Tensor index, *, bool sparse_grad=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & gather_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::Dimname dim, const at::Tensor & index, bool sparse_grad=false) {
+        return at::_ops::gather_dimname_out::redispatch(dispatchKeySet, self, dim, index, sparse_grad, out);
+    }
+    
+    // aten::gather.dimname_out(Tensor self, Dimname dim, Tensor index, *, bool sparse_grad=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & gather_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, const at::Tensor & index, bool sparse_grad, at::Tensor & out) {
+        return at::_ops::gather_dimname_out::redispatch(dispatchKeySet, self, dim, index, sparse_grad, out);
+    }
+    
+    // aten::gather.dimname(Tensor self, Dimname dim, Tensor index, *, bool sparse_grad=False) -> Tensor
+    inline at::Tensor gather(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, const at::Tensor & index, bool sparse_grad=false) {
+        return at::_ops::gather_dimname::redispatch(dispatchKeySet, self, dim, index, sparse_grad);
+    }
+    
+    // aten::_gather_sparse_backward(Tensor self, int dim, Tensor index, Tensor grad) -> Tensor
+    inline at::Tensor _gather_sparse_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & grad) {
+        return at::_ops::_gather_sparse_backward::redispatch(dispatchKeySet, self, dim, index, grad);
+    }
+    
+    // aten::addcmul.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & addcmul_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1) {
+        return at::_ops::addcmul_out::redispatch(dispatchKeySet, self, tensor1, tensor2, value, out);
+    }
+    
+    // aten::addcmul.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & addcmul_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value, at::Tensor & out) {
+        return at::_ops::addcmul_out::redispatch(dispatchKeySet, self, tensor1, tensor2, value, out);
+    }
+    
+    // aten::addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor
+    inline at::Tensor addcmul(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1) {
+        return at::_ops::addcmul::redispatch(dispatchKeySet, self, tensor1, tensor2, value);
+    }
+    
+    // aten::addcmul_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!)
+    inline at::Tensor & addcmul_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1) {
+        return at::_ops::addcmul_::redispatch(dispatchKeySet, self, tensor1, tensor2, value);
+    }
+    
+    // aten::addcdiv.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & addcdiv_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1) {
+        return at::_ops::addcdiv_out::redispatch(dispatchKeySet, self, tensor1, tensor2, value, out);
+    }
+    
+    // aten::addcdiv.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & addcdiv_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value, at::Tensor & out) {
+        return at::_ops::addcdiv_out::redispatch(dispatchKeySet, self, tensor1, tensor2, value, out);
+    }
+    
+    // aten::addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor
+    inline at::Tensor addcdiv(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1) {
+        return at::_ops::addcdiv::redispatch(dispatchKeySet, self, tensor1, tensor2, value);
+    }
+    
+    // aten::addcdiv_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!)
+    inline at::Tensor & addcdiv_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1) {
+        return at::_ops::addcdiv_::redispatch(dispatchKeySet, self, tensor1, tensor2, value);
+    }
+    
+    // aten::cross_entropy_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100, float label_smoothing=0.0) -> Tensor
+    inline at::Tensor cross_entropy_loss(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight={}, int64_t reduction=at::Reduction::Mean, int64_t ignore_index=-100, double label_smoothing=0.0) {
+        return at::_ops::cross_entropy_loss::redispatch(dispatchKeySet, self, target, weight, reduction, ignore_index, label_smoothing);
+    }
+    
+    // aten::cross_entropy_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100, float label_smoothing=0.0) -> Tensor
+    inline at::Tensor cross_entropy_loss_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight={}, int64_t reduction=at::Reduction::Mean, c10::SymInt ignore_index=-100, double label_smoothing=0.0) {
+        return at::_ops::cross_entropy_loss::redispatch(dispatchKeySet, self, target, weight, reduction, ignore_index, label_smoothing);
+    }
+    
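+    // Illustrative sketch (same namespace assumption; the function name is hypothetical):
+    // optional tensor arguments such as `weight` travel as c10::optional<at::Tensor>, where
+    // {} means "no weight", and the remaining defaults mirror the schema
+    // (reduction=Mean, ignore_index=-100, label_smoothing=0.0).
+    inline at::Tensor example_unweighted_cross_entropy(c10::DispatchKeySet ks, const at::Tensor & logits, const at::Tensor & target) {
+        // Unweighted, mean-reduced cross entropy with the schema defaults spelled out.
+        return cross_entropy_loss(ks, logits, target, /*weight=*/{}, at::Reduction::Mean, /*ignore_index=*/-100, /*label_smoothing=*/0.0);
+    }
+    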
+    // aten::triangular_solve.X(Tensor self, Tensor A, bool upper=True, bool transpose=False, bool unitriangular=False, *, Tensor(a!) X, Tensor(b!) M) -> (Tensor(a!) solution, Tensor(b!) cloned_coefficient)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> triangular_solve_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & X, at::Tensor & M, const at::Tensor & self, const at::Tensor & A, bool upper=true, bool transpose=false, bool unitriangular=false) {
+        return at::_ops::triangular_solve_X::redispatch(dispatchKeySet, self, A, upper, transpose, unitriangular, X, M);
+    }
+    
+    // aten::triangular_solve.X(Tensor self, Tensor A, bool upper=True, bool transpose=False, bool unitriangular=False, *, Tensor(a!) X, Tensor(b!) M) -> (Tensor(a!) solution, Tensor(b!) cloned_coefficient)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> triangular_solve_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & A, bool upper, bool transpose, bool unitriangular, at::Tensor & X, at::Tensor & M) {
+        return at::_ops::triangular_solve_X::redispatch(dispatchKeySet, self, A, upper, transpose, unitriangular, X, M);
+    }
+    
+    // aten::triangular_solve(Tensor self, Tensor A, bool upper=True, bool transpose=False, bool unitriangular=False) -> (Tensor solution, Tensor cloned_coefficient)
+    inline ::std::tuple<at::Tensor,at::Tensor> triangular_solve(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & A, bool upper=true, bool transpose=false, bool unitriangular=false) {
+        return at::_ops::triangular_solve::redispatch(dispatchKeySet, self, A, upper, transpose, unitriangular);
+    }
+    
+    // aten::_linalg_check_errors(Tensor info, str api_name, *, bool is_matrix) -> ()
+    inline void _linalg_check_errors(c10::DispatchKeySet dispatchKeySet, const at::Tensor & info, c10::string_view api_name, bool is_matrix) {
+        return at::_ops::_linalg_check_errors::redispatch(dispatchKeySet, info, api_name, is_matrix);
+    }
+    
+    // aten::linalg_solve_triangular.out(Tensor self, Tensor B, *, bool upper, bool left=True, bool unitriangular=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_solve_triangular_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & B, bool upper, bool left=true, bool unitriangular=false) {
+        return at::_ops::linalg_solve_triangular_out::redispatch(dispatchKeySet, self, B, upper, left, unitriangular, out);
+    }
+    
+    // aten::linalg_solve_triangular.out(Tensor self, Tensor B, *, bool upper, bool left=True, bool unitriangular=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_solve_triangular_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & B, bool upper, bool left, bool unitriangular, at::Tensor & out) {
+        return at::_ops::linalg_solve_triangular_out::redispatch(dispatchKeySet, self, B, upper, left, unitriangular, out);
+    }
+    
+    // aten::linalg_solve_triangular(Tensor self, Tensor B, *, bool upper, bool left=True, bool unitriangular=False) -> Tensor
+    inline at::Tensor linalg_solve_triangular(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & B, bool upper, bool left=true, bool unitriangular=false) {
+        return at::_ops::linalg_solve_triangular::redispatch(dispatchKeySet, self, B, upper, left, unitriangular);
+    }
+    
+    // aten::linalg_vander(Tensor x, *, SymInt? N=None) -> Tensor
+    inline at::Tensor linalg_vander(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, c10::optional<int64_t> N=c10::nullopt) {
+        return at::_ops::linalg_vander::redispatch(dispatchKeySet, x, N.has_value() ? c10::make_optional(c10::SymInt(*N)) : c10::nullopt);
+    }
+    
+    // aten::linalg_vander(Tensor x, *, SymInt? N=None) -> Tensor
+    inline at::Tensor linalg_vander_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, c10::optional<c10::SymInt> N=c10::nullopt) {
+        return at::_ops::linalg_vander::redispatch(dispatchKeySet, x, N);
+    }
+    
+    // aten::svd.U(Tensor self, bool some=True, bool compute_uv=True, *, Tensor(a!) U, Tensor(b!) S, Tensor(c!) V) -> (Tensor(a!) U, Tensor(b!) S, Tensor(c!) V)
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> svd_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & U, at::Tensor & S, at::Tensor & V, const at::Tensor & self, bool some=true, bool compute_uv=true) {
+        return at::_ops::svd_U::redispatch(dispatchKeySet, self, some, compute_uv, U, S, V);
+    }
+    
+    // aten::svd.U(Tensor self, bool some=True, bool compute_uv=True, *, Tensor(a!) U, Tensor(b!) S, Tensor(c!) V) -> (Tensor(a!) U, Tensor(b!) S, Tensor(c!) V)
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> svd_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool some, bool compute_uv, at::Tensor & U, at::Tensor & S, at::Tensor & V) {
+        return at::_ops::svd_U::redispatch(dispatchKeySet, self, some, compute_uv, U, S, V);
+    }
+    
+    // aten::svd(Tensor self, bool some=True, bool compute_uv=True) -> (Tensor U, Tensor S, Tensor V)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> svd(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool some=true, bool compute_uv=true) {
+        return at::_ops::svd::redispatch(dispatchKeySet, self, some, compute_uv);
+    }
+    
+    // aten::swapaxes(Tensor(a) self, int axis0, int axis1) -> Tensor(a)
+    inline at::Tensor swapaxes(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t axis0, int64_t axis1) {
+        return at::_ops::swapaxes::redispatch(dispatchKeySet, self, axis0, axis1);
+    }
+    
+    // aten::swapaxes_(Tensor(a!) self, int axis0, int axis1) -> Tensor(a!)
+    inline at::Tensor & swapaxes_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t axis0, int64_t axis1) {
+        return at::_ops::swapaxes_::redispatch(dispatchKeySet, self, axis0, axis1);
+    }
+    
+    // aten::swapdims(Tensor(a) self, int dim0, int dim1) -> Tensor(a)
+    inline at::Tensor swapdims(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim0, int64_t dim1) {
+        return at::_ops::swapdims::redispatch(dispatchKeySet, self, dim0, dim1);
+    }
+    
+    // aten::swapdims_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!)
+    inline at::Tensor & swapdims_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t dim0, int64_t dim1) {
+        return at::_ops::swapdims_::redispatch(dispatchKeySet, self, dim0, dim1);
+    }
+    
+    // aten::cholesky.out(Tensor self, bool upper=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cholesky_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, bool upper=false) {
+        return at::_ops::cholesky_out::redispatch(dispatchKeySet, self, upper, out);
+    }
+    
+    // aten::cholesky.out(Tensor self, bool upper=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cholesky_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool upper, at::Tensor & out) {
+        return at::_ops::cholesky_out::redispatch(dispatchKeySet, self, upper, out);
+    }
+    
+    // aten::cholesky(Tensor self, bool upper=False) -> Tensor
+    inline at::Tensor cholesky(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool upper=false) {
+        return at::_ops::cholesky::redispatch(dispatchKeySet, self, upper);
+    }
+    
+    // aten::cholesky_solve.out(Tensor self, Tensor input2, bool upper=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cholesky_solve_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & input2, bool upper=false) {
+        return at::_ops::cholesky_solve_out::redispatch(dispatchKeySet, self, input2, upper, out);
+    }
+    
+    // aten::cholesky_solve.out(Tensor self, Tensor input2, bool upper=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cholesky_solve_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & input2, bool upper, at::Tensor & out) {
+        return at::_ops::cholesky_solve_out::redispatch(dispatchKeySet, self, input2, upper, out);
+    }
+    
+    // aten::cholesky_solve(Tensor self, Tensor input2, bool upper=False) -> Tensor
+    inline at::Tensor cholesky_solve(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & input2, bool upper=false) {
+        return at::_ops::cholesky_solve::redispatch(dispatchKeySet, self, input2, upper);
+    }
+    
+    // aten::_cholesky_solve_helper(Tensor self, Tensor A, bool upper) -> Tensor
+    inline at::Tensor _cholesky_solve_helper(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & A, bool upper) {
+        return at::_ops::_cholesky_solve_helper::redispatch(dispatchKeySet, self, A, upper);
+    }
+    
+    // aten::cholesky_inverse(Tensor self, bool upper=False) -> Tensor
+    inline at::Tensor cholesky_inverse(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool upper=false) {
+        return at::_ops::cholesky_inverse::redispatch(dispatchKeySet, self, upper);
+    }
+    
+    // aten::cholesky_inverse.out(Tensor self, bool upper=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cholesky_inverse_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, bool upper=false) {
+        return at::_ops::cholesky_inverse_out::redispatch(dispatchKeySet, self, upper, out);
+    }
+    
+    // aten::cholesky_inverse.out(Tensor self, bool upper=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cholesky_inverse_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool upper, at::Tensor & out) {
+        return at::_ops::cholesky_inverse_out::redispatch(dispatchKeySet, self, upper, out);
+    }
+    
+    // aten::qr.Q(Tensor self, bool some=True, *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> qr_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & Q, at::Tensor & R, const at::Tensor & self, bool some=true) {
+        return at::_ops::qr_Q::redispatch(dispatchKeySet, self, some, Q, R);
+    }
+    
+    // aten::qr.Q(Tensor self, bool some=True, *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> qr_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool some, at::Tensor & Q, at::Tensor & R) {
+        return at::_ops::qr_Q::redispatch(dispatchKeySet, self, some, Q, R);
+    }
+    
+    // aten::qr(Tensor self, bool some=True) -> (Tensor Q, Tensor R)
+    inline ::std::tuple<at::Tensor,at::Tensor> qr(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool some=true) {
+        return at::_ops::qr::redispatch(dispatchKeySet, self, some);
+    }
+    
+    // aten::geqrf.a(Tensor self, *, Tensor(a!) a, Tensor(b!) tau) -> (Tensor(a!) a, Tensor(b!) tau)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> geqrf_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & a, at::Tensor & tau, const at::Tensor & self) {
+        return at::_ops::geqrf_a::redispatch(dispatchKeySet, self, a, tau);
+    }
+    
+    // aten::geqrf.a(Tensor self, *, Tensor(a!) a, Tensor(b!) tau) -> (Tensor(a!) a, Tensor(b!) tau)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> geqrf_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & a, at::Tensor & tau) {
+        return at::_ops::geqrf_a::redispatch(dispatchKeySet, self, a, tau);
+    }
+    
+    // aten::geqrf(Tensor self) -> (Tensor a, Tensor tau)
+    inline ::std::tuple<at::Tensor,at::Tensor> geqrf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::geqrf::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::orgqr(Tensor self, Tensor input2) -> Tensor
+    inline at::Tensor orgqr(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & input2) {
+        return at::_ops::orgqr::redispatch(dispatchKeySet, self, input2);
+    }
+    
+    // aten::orgqr.out(Tensor self, Tensor input2, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & orgqr_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & input2) {
+        return at::_ops::orgqr_out::redispatch(dispatchKeySet, self, input2, out);
+    }
+    
+    // aten::orgqr.out(Tensor self, Tensor input2, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & orgqr_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & input2, at::Tensor & out) {
+        return at::_ops::orgqr_out::redispatch(dispatchKeySet, self, input2, out);
+    }
+    
+    // aten::ormqr.out(Tensor self, Tensor input2, Tensor input3, bool left=True, bool transpose=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & ormqr_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & input2, const at::Tensor & input3, bool left=true, bool transpose=false) {
+        return at::_ops::ormqr_out::redispatch(dispatchKeySet, self, input2, input3, left, transpose, out);
+    }
+    
+    // aten::ormqr.out(Tensor self, Tensor input2, Tensor input3, bool left=True, bool transpose=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & ormqr_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & input2, const at::Tensor & input3, bool left, bool transpose, at::Tensor & out) {
+        return at::_ops::ormqr_out::redispatch(dispatchKeySet, self, input2, input3, left, transpose, out);
+    }
+    
+    // aten::ormqr(Tensor self, Tensor input2, Tensor input3, bool left=True, bool transpose=False) -> Tensor
+    inline at::Tensor ormqr(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & input2, const at::Tensor & input3, bool left=true, bool transpose=false) {
+        return at::_ops::ormqr::redispatch(dispatchKeySet, self, input2, input3, left, transpose);
+    }
+    
+    // aten::_lu_with_info(Tensor self, bool pivot=True, bool check_errors=True) -> (Tensor LU, Tensor pivots, Tensor info)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _lu_with_info(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool pivot=true, bool check_errors=true) {
+        return at::_ops::_lu_with_info::redispatch(dispatchKeySet, self, pivot, check_errors);
+    }
+    
+    // aten::lu_solve.out(Tensor self, Tensor LU_data, Tensor LU_pivots, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & lu_solve_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & LU_data, const at::Tensor & LU_pivots) {
+        return at::_ops::lu_solve_out::redispatch(dispatchKeySet, self, LU_data, LU_pivots, out);
+    }
+    
+    // aten::lu_solve.out(Tensor self, Tensor LU_data, Tensor LU_pivots, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & lu_solve_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & LU_data, const at::Tensor & LU_pivots, at::Tensor & out) {
+        return at::_ops::lu_solve_out::redispatch(dispatchKeySet, self, LU_data, LU_pivots, out);
+    }
+    
+    // aten::lu_solve(Tensor self, Tensor LU_data, Tensor LU_pivots) -> Tensor
+    inline at::Tensor lu_solve(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & LU_data, const at::Tensor & LU_pivots) {
+        return at::_ops::lu_solve::redispatch(dispatchKeySet, self, LU_data, LU_pivots);
+    }
+    
+    // aten::lu_unpack(Tensor LU_data, Tensor LU_pivots, bool unpack_data=True, bool unpack_pivots=True) -> (Tensor P, Tensor L, Tensor U)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> lu_unpack(c10::DispatchKeySet dispatchKeySet, const at::Tensor & LU_data, const at::Tensor & LU_pivots, bool unpack_data=true, bool unpack_pivots=true) {
+        return at::_ops::lu_unpack::redispatch(dispatchKeySet, LU_data, LU_pivots, unpack_data, unpack_pivots);
+    }
+    
+    // aten::lu_unpack.out(Tensor LU_data, Tensor LU_pivots, bool unpack_data=True, bool unpack_pivots=True, *, Tensor(a!) P, Tensor(b!) L, Tensor(c!) U) -> (Tensor(a!) P, Tensor(b!) L, Tensor(c!) U)
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> lu_unpack_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & P, at::Tensor & L, at::Tensor & U, const at::Tensor & LU_data, const at::Tensor & LU_pivots, bool unpack_data=true, bool unpack_pivots=true) {
+        return at::_ops::lu_unpack_out::redispatch(dispatchKeySet, LU_data, LU_pivots, unpack_data, unpack_pivots, P, L, U);
+    }
+    
+    // aten::lu_unpack.out(Tensor LU_data, Tensor LU_pivots, bool unpack_data=True, bool unpack_pivots=True, *, Tensor(a!) P, Tensor(b!) L, Tensor(c!) U) -> (Tensor(a!) P, Tensor(b!) L, Tensor(c!) U)
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> lu_unpack_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & LU_data, const at::Tensor & LU_pivots, bool unpack_data, bool unpack_pivots, at::Tensor & P, at::Tensor & L, at::Tensor & U) {
+        return at::_ops::lu_unpack_out::redispatch(dispatchKeySet, LU_data, LU_pivots, unpack_data, unpack_pivots, P, L, U);
+    }
+    
+    // aten::multinomial.out(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & multinomial_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t num_samples, bool replacement=false, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::multinomial_out::redispatch(dispatchKeySet, self, num_samples, replacement, generator, out);
+    }
+    
+    // aten::multinomial.out(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & multinomial_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t num_samples, bool replacement, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::multinomial_out::redispatch(dispatchKeySet, self, num_samples, replacement, generator, out);
+    }
+    
+    // aten::multinomial(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None) -> Tensor
+    inline at::Tensor multinomial(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t num_samples, bool replacement=false, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::multinomial::redispatch(dispatchKeySet, self, num_samples, replacement, generator);
+    }
+    
+    // aten::lgamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & lgamma_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::lgamma_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::lgamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & lgamma_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::lgamma_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::lgamma_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & lgamma_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::lgamma_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::lgamma(Tensor self) -> Tensor
+    inline at::Tensor lgamma(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::lgamma::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::digamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & digamma_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::digamma_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::digamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & digamma_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::digamma_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::digamma(Tensor self) -> Tensor
+    inline at::Tensor digamma(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::digamma::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & polygamma_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, int64_t n, const at::Tensor & self) {
+        return at::_ops::polygamma_out::redispatch(dispatchKeySet, n, self, out);
+    }
+    
+    // aten::polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & polygamma_outf(c10::DispatchKeySet dispatchKeySet, int64_t n, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::polygamma_out::redispatch(dispatchKeySet, n, self, out);
+    }
+    
+    // aten::polygamma(int n, Tensor self) -> Tensor
+    inline at::Tensor polygamma(c10::DispatchKeySet dispatchKeySet, int64_t n, const at::Tensor & self) {
+        return at::_ops::polygamma::redispatch(dispatchKeySet, n, self);
+    }
+    
+    // aten::polygamma_(Tensor(a!) self, int n) -> Tensor(a!)
+    inline at::Tensor & polygamma_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t n) {
+        return at::_ops::polygamma_::redispatch(dispatchKeySet, self, n);
+    }
+    
+    // aten::erfinv(Tensor self) -> Tensor
+    inline at::Tensor erfinv(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::erfinv::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::erfinv_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & erfinv_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::erfinv_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::erfinv.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & erfinv_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::erfinv_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::erfinv.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & erfinv_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::erfinv_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::i0(Tensor self) -> Tensor
+    inline at::Tensor i0(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::i0::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::i0_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & i0_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::i0_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & i0_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::i0_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & i0_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::i0_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::sign(Tensor self) -> Tensor
+    inline at::Tensor sign(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::sign::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::sign_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & sign_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::sign_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::sign.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & sign_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::sign_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::sign.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & sign_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::sign_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::signbit(Tensor self) -> Tensor
+    inline at::Tensor signbit(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::signbit::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::signbit.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & signbit_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::signbit_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::signbit.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & signbit_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::signbit_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::dist(Tensor self, Tensor other, Scalar p=2) -> Tensor
+    inline at::Tensor dist(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, const at::Scalar & p=2) {
+        return at::_ops::dist::redispatch(dispatchKeySet, self, other, p);
+    }
+    
+    // aten::atan2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & atan2_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::atan2_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::atan2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & atan2_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::atan2_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & atan2_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::atan2_::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::atan2(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor atan2(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::atan2::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::arctan2(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor arctan2(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::arctan2::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::arctan2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & arctan2_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::arctan2_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::arctan2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & arctan2_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::arctan2_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::arctan2_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & arctan2_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::arctan2_::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::lerp.Scalar_out(Tensor self, Tensor end, Scalar weight, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & lerp_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & end, const at::Scalar & weight) {
+        return at::_ops::lerp_Scalar_out::redispatch(dispatchKeySet, self, end, weight, out);
+    }
+    
+    // aten::lerp.Scalar_out(Tensor self, Tensor end, Scalar weight, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & lerp_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & end, const at::Scalar & weight, at::Tensor & out) {
+        return at::_ops::lerp_Scalar_out::redispatch(dispatchKeySet, self, end, weight, out);
+    }
+    
+    // aten::lerp.Tensor_out(Tensor self, Tensor end, Tensor weight, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & lerp_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & end, const at::Tensor & weight) {
+        return at::_ops::lerp_Tensor_out::redispatch(dispatchKeySet, self, end, weight, out);
+    }
+    
+    // aten::lerp.Tensor_out(Tensor self, Tensor end, Tensor weight, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & lerp_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & end, const at::Tensor & weight, at::Tensor & out) {
+        return at::_ops::lerp_Tensor_out::redispatch(dispatchKeySet, self, end, weight, out);
+    }
+    
+    // aten::lerp.Scalar(Tensor self, Tensor end, Scalar weight) -> Tensor
+    inline at::Tensor lerp(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & end, const at::Scalar & weight) {
+        return at::_ops::lerp_Scalar::redispatch(dispatchKeySet, self, end, weight);
+    }
+    
+    // aten::lerp.Tensor(Tensor self, Tensor end, Tensor weight) -> Tensor
+    inline at::Tensor lerp(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & end, const at::Tensor & weight) {
+        return at::_ops::lerp_Tensor::redispatch(dispatchKeySet, self, end, weight);
+    }
+    
+    // aten::histc.out(Tensor self, int bins=100, Scalar min=0, Scalar max=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & histc_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t bins=100, const at::Scalar & min=0, const at::Scalar & max=0) {
+        return at::_ops::histc_out::redispatch(dispatchKeySet, self, bins, min, max, out);
+    }
+    
+    // aten::histc.out(Tensor self, int bins=100, Scalar min=0, Scalar max=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & histc_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t bins, const at::Scalar & min, const at::Scalar & max, at::Tensor & out) {
+        return at::_ops::histc_out::redispatch(dispatchKeySet, self, bins, min, max, out);
+    }
+    
+    // aten::histc(Tensor self, int bins=100, Scalar min=0, Scalar max=0) -> Tensor
+    inline at::Tensor histc(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t bins=100, const at::Scalar & min=0, const at::Scalar & max=0) {
+        return at::_ops::histc::redispatch(dispatchKeySet, self, bins, min, max);
+    }
+    
+    // aten::histogram.bins_tensor_out(Tensor self, Tensor bins, *, Tensor? weight=None, bool density=False, Tensor(a!) hist, Tensor(b!) bin_edges) -> (Tensor(a!) hist, Tensor(b!) bin_edges)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> histogram_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & hist, at::Tensor & bin_edges, const at::Tensor & self, const at::Tensor & bins, const c10::optional<at::Tensor> & weight={}, bool density=false) {
+        return at::_ops::histogram_bins_tensor_out::redispatch(dispatchKeySet, self, bins, weight, density, hist, bin_edges);
+    }
+    
+    // aten::histogram.bins_tensor_out(Tensor self, Tensor bins, *, Tensor? weight=None, bool density=False, Tensor(a!) hist, Tensor(b!) bin_edges) -> (Tensor(a!) hist, Tensor(b!) bin_edges)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> histogram_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & bins, const c10::optional<at::Tensor> & weight, bool density, at::Tensor & hist, at::Tensor & bin_edges) {
+        return at::_ops::histogram_bins_tensor_out::redispatch(dispatchKeySet, self, bins, weight, density, hist, bin_edges);
+    }
+    
+    // aten::histogram.bins_tensor(Tensor self, Tensor bins, *, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor bin_edges)
+    inline ::std::tuple<at::Tensor,at::Tensor> histogram(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & bins, const c10::optional<at::Tensor> & weight={}, bool density=false) {
+        return at::_ops::histogram_bins_tensor::redispatch(dispatchKeySet, self, bins, weight, density);
+    }
+    
+    // aten::histogram.bin_ct_out(Tensor self, int bins=100, *, float[]? range=None, Tensor? weight=None, bool density=False, Tensor(a!) hist, Tensor(b!) bin_edges) -> (Tensor(a!) hist, Tensor(b!) bin_edges)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> histogram_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & hist, at::Tensor & bin_edges, const at::Tensor & self, int64_t bins=100, c10::optional<at::ArrayRef<double>> range=c10::nullopt, const c10::optional<at::Tensor> & weight={}, bool density=false) {
+        return at::_ops::histogram_bin_ct_out::redispatch(dispatchKeySet, self, bins, range, weight, density, hist, bin_edges);
+    }
+    
+    // aten::histogram.bin_ct_out(Tensor self, int bins=100, *, float[]? range=None, Tensor? weight=None, bool density=False, Tensor(a!) hist, Tensor(b!) bin_edges) -> (Tensor(a!) hist, Tensor(b!) bin_edges)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> histogram_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t bins, c10::optional<at::ArrayRef<double>> range, const c10::optional<at::Tensor> & weight, bool density, at::Tensor & hist, at::Tensor & bin_edges) {
+        return at::_ops::histogram_bin_ct_out::redispatch(dispatchKeySet, self, bins, range, weight, density, hist, bin_edges);
+    }
+    
+    // aten::histogram.bin_ct(Tensor self, int bins=100, *, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor bin_edges)
+    inline ::std::tuple<at::Tensor,at::Tensor> histogram(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t bins=100, c10::optional<at::ArrayRef<double>> range=c10::nullopt, const c10::optional<at::Tensor> & weight={}, bool density=false) {
+        return at::_ops::histogram_bin_ct::redispatch(dispatchKeySet, self, bins, range, weight, density);
+    }
+    
+    // aten::_histogramdd_bin_edges(Tensor self, int[] bins, *, float[]? range=None, Tensor? weight=None, bool density=False) -> Tensor[]
+    inline ::std::vector<at::Tensor> _histogramdd_bin_edges(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef bins, c10::optional<at::ArrayRef<double>> range=c10::nullopt, const c10::optional<at::Tensor> & weight={}, bool density=false) {
+        return at::_ops::_histogramdd_bin_edges::redispatch(dispatchKeySet, self, bins, range, weight, density);
+    }
+    
+    // aten::_histogramdd_from_bin_cts(Tensor self, int[] bins, *, float[]? range=None, Tensor? weight=None, bool density=False) -> Tensor
+    inline at::Tensor _histogramdd_from_bin_cts(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef bins, c10::optional<at::ArrayRef<double>> range=c10::nullopt, const c10::optional<at::Tensor> & weight={}, bool density=false) {
+        return at::_ops::_histogramdd_from_bin_cts::redispatch(dispatchKeySet, self, bins, range, weight, density);
+    }
+    
+    // aten::_histogramdd_from_bin_tensors(Tensor self, Tensor[] bins, *, Tensor? weight=None, bool density=False) -> Tensor
+    inline at::Tensor _histogramdd_from_bin_tensors(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::TensorList bins, const c10::optional<at::Tensor> & weight={}, bool density=false) {
+        return at::_ops::_histogramdd_from_bin_tensors::redispatch(dispatchKeySet, self, bins, weight, density);
+    }
+    
+    // aten::histogramdd(Tensor self, int[] bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges)
+    inline ::std::tuple<at::Tensor,::std::vector<at::Tensor>> histogramdd(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef bins, c10::optional<at::ArrayRef<double>> range=c10::nullopt, const c10::optional<at::Tensor> & weight={}, bool density=false) {
+        return at::_ops::histogramdd::redispatch(dispatchKeySet, self, bins, range, weight, density);
+    }
+    
+    // aten::histogramdd.int_bins(Tensor self, int bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges)
+    inline ::std::tuple<at::Tensor,::std::vector<at::Tensor>> histogramdd(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t bins, c10::optional<at::ArrayRef<double>> range=c10::nullopt, const c10::optional<at::Tensor> & weight={}, bool density=false) {
+        return at::_ops::histogramdd_int_bins::redispatch(dispatchKeySet, self, bins, range, weight, density);
+    }
+    
+    // aten::histogramdd.TensorList_bins(Tensor self, Tensor[] bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges)
+    inline ::std::tuple<at::Tensor,::std::vector<at::Tensor>> histogramdd(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::TensorList bins, c10::optional<at::ArrayRef<double>> range=c10::nullopt, const c10::optional<at::Tensor> & weight={}, bool density=false) {
+        return at::_ops::histogramdd_TensorList_bins::redispatch(dispatchKeySet, self, bins, range, weight, density);
+    }
+    
+    // aten::fmod.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fmod_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::fmod_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::fmod.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fmod_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out) {
+        return at::_ops::fmod_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::fmod.Scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor fmod(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::fmod_Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::fmod_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+    inline at::Tensor & fmod_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::fmod__Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::fmod.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fmod_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::fmod_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::fmod.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fmod_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::fmod_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::fmod.Tensor(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor fmod(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::fmod_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::fmod_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & fmod_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::fmod__Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::hypot.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & hypot_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::hypot_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::hypot.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & hypot_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::hypot_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::hypot(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor hypot(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::hypot::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::hypot_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & hypot_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::hypot_::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::igamma.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & igamma_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::igamma_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::igamma.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & igamma_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::igamma_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::igamma(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor igamma(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::igamma::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::igamma_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & igamma_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::igamma_::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::igammac.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & igammac_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::igammac_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::igammac.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & igammac_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::igammac_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::igammac(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor igammac(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::igammac::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::igammac_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & igammac_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::igammac_::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::nextafter.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & nextafter_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::nextafter_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::nextafter.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & nextafter_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::nextafter_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::nextafter(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor nextafter(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::nextafter::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::nextafter_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & nextafter_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::nextafter_::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::remainder.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & remainder_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::remainder_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::remainder.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & remainder_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out) {
+        return at::_ops::remainder_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::remainder.Scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor remainder(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::remainder_Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::remainder_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+    inline at::Tensor & remainder_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::remainder__Scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::remainder.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & remainder_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::remainder_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::remainder.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & remainder_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::remainder_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::remainder.Tensor(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor remainder(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::remainder_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::remainder_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+    inline at::Tensor & remainder_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::remainder__Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::remainder.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
+    inline at::Tensor remainder(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & other) {
+        return at::_ops::remainder_Scalar_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::min(Tensor self) -> Tensor
+    inline at::Tensor min(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::min::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::min.unary_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & min_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::min_unary_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::min.unary_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & min_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::min_unary_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::fmin(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor fmin(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::fmin::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::fmin.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fmin_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::fmin_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::fmin.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fmin_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::fmin_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::max(Tensor self) -> Tensor
+    inline at::Tensor max(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::max::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::fmax(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor fmax(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::fmax::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::fmax.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fmax_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::fmax_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::fmax.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fmax_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::fmax_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::maximum(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor maximum(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::maximum::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::maximum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & maximum_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::maximum_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::maximum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & maximum_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::maximum_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::max.other(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor max(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::max_other::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::max.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & max_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::max_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::max.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & max_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::max_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::max.unary_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & max_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::max_unary_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::max.unary_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & max_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::max_unary_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::minimum(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor minimum(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::minimum::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::minimum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & minimum_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::minimum_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::minimum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & minimum_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::minimum_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::min.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & min_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::min_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::min.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & min_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::min_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::min.other(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor min(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::min_other::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::quantile(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, str interpolation='linear') -> Tensor
+    inline at::Tensor quantile(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & q, c10::optional<int64_t> dim=c10::nullopt, bool keepdim=false, c10::string_view interpolation="linear") {
+        return at::_ops::quantile::redispatch(dispatchKeySet, self, q, dim, keepdim, interpolation);
+    }
+    
+    // aten::quantile.out(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, str interpolation='linear', Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & quantile_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & q, c10::optional<int64_t> dim=c10::nullopt, bool keepdim=false, c10::string_view interpolation="linear") {
+        return at::_ops::quantile_out::redispatch(dispatchKeySet, self, q, dim, keepdim, interpolation, out);
+    }
+    
+    // aten::quantile.out(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, str interpolation='linear', Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & quantile_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & q, c10::optional<int64_t> dim, bool keepdim, c10::string_view interpolation, at::Tensor & out) {
+        return at::_ops::quantile_out::redispatch(dispatchKeySet, self, q, dim, keepdim, interpolation, out);
+    }
+    
+    // aten::quantile.scalar(Tensor self, float q, int? dim=None, bool keepdim=False, *, str interpolation='linear') -> Tensor
+    inline at::Tensor quantile(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double q, c10::optional<int64_t> dim=c10::nullopt, bool keepdim=false, c10::string_view interpolation="linear") {
+        return at::_ops::quantile_scalar::redispatch(dispatchKeySet, self, q, dim, keepdim, interpolation);
+    }
+    
+    // aten::quantile.scalar_out(Tensor self, float q, int? dim=None, bool keepdim=False, *, str interpolation='linear', Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & quantile_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, double q, c10::optional<int64_t> dim=c10::nullopt, bool keepdim=false, c10::string_view interpolation="linear") {
+        return at::_ops::quantile_scalar_out::redispatch(dispatchKeySet, self, q, dim, keepdim, interpolation, out);
+    }
+    
+    // aten::quantile.scalar_out(Tensor self, float q, int? dim=None, bool keepdim=False, *, str interpolation='linear', Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & quantile_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double q, c10::optional<int64_t> dim, bool keepdim, c10::string_view interpolation, at::Tensor & out) {
+        return at::_ops::quantile_scalar_out::redispatch(dispatchKeySet, self, q, dim, keepdim, interpolation, out);
+    }
+    
+    // aten::nanquantile(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, str interpolation='linear') -> Tensor
+    inline at::Tensor nanquantile(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & q, c10::optional<int64_t> dim=c10::nullopt, bool keepdim=false, c10::string_view interpolation="linear") {
+        return at::_ops::nanquantile::redispatch(dispatchKeySet, self, q, dim, keepdim, interpolation);
+    }
+    
+    // aten::nanquantile.out(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, str interpolation='linear', Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & nanquantile_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & q, c10::optional<int64_t> dim=c10::nullopt, bool keepdim=false, c10::string_view interpolation="linear") {
+        return at::_ops::nanquantile_out::redispatch(dispatchKeySet, self, q, dim, keepdim, interpolation, out);
+    }
+    
+    // aten::nanquantile.out(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, str interpolation='linear', Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & nanquantile_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & q, c10::optional<int64_t> dim, bool keepdim, c10::string_view interpolation, at::Tensor & out) {
+        return at::_ops::nanquantile_out::redispatch(dispatchKeySet, self, q, dim, keepdim, interpolation, out);
+    }
+    
+    // aten::nanquantile.scalar(Tensor self, float q, int? dim=None, bool keepdim=False, *, str interpolation='linear') -> Tensor
+    inline at::Tensor nanquantile(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double q, c10::optional<int64_t> dim=c10::nullopt, bool keepdim=false, c10::string_view interpolation="linear") {
+        return at::_ops::nanquantile_scalar::redispatch(dispatchKeySet, self, q, dim, keepdim, interpolation);
+    }
+    
+    // aten::nanquantile.scalar_out(Tensor self, float q, int? dim=None, bool keepdim=False, *, str interpolation='linear', Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & nanquantile_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, double q, c10::optional<int64_t> dim=c10::nullopt, bool keepdim=false, c10::string_view interpolation="linear") {
+        return at::_ops::nanquantile_scalar_out::redispatch(dispatchKeySet, self, q, dim, keepdim, interpolation, out);
+    }
+    
+    // aten::nanquantile.scalar_out(Tensor self, float q, int? dim=None, bool keepdim=False, *, str interpolation='linear', Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & nanquantile_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double q, c10::optional<int64_t> dim, bool keepdim, c10::string_view interpolation, at::Tensor & out) {
+        return at::_ops::nanquantile_scalar_out::redispatch(dispatchKeySet, self, q, dim, keepdim, interpolation, out);
+    }
+    
+    // aten::sort.values(Tensor self, int dim=-1, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> sort_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & values, at::Tensor & indices, const at::Tensor & self, int64_t dim=-1, bool descending=false) {
+        return at::_ops::sort_values::redispatch(dispatchKeySet, self, dim, descending, values, indices);
+    }
+    
+    // aten::sort.values(Tensor self, int dim=-1, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> sort_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool descending, at::Tensor & values, at::Tensor & indices) {
+        return at::_ops::sort_values::redispatch(dispatchKeySet, self, dim, descending, values, indices);
+    }
+    
+    // aten::sort.values_stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> sort_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & values, at::Tensor & indices, const at::Tensor & self, c10::optional<bool> stable, int64_t dim=-1, bool descending=false) {
+        return at::_ops::sort_values_stable::redispatch(dispatchKeySet, self, stable, dim, descending, values, indices);
+    }
+    
+    // aten::sort.values_stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> sort_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<bool> stable, int64_t dim, bool descending, at::Tensor & values, at::Tensor & indices) {
+        return at::_ops::sort_values_stable::redispatch(dispatchKeySet, self, stable, dim, descending, values, indices);
+    }
+    
+    // aten::sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices)
+    inline ::std::tuple<at::Tensor,at::Tensor> sort(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim=-1, bool descending=false) {
+        return at::_ops::sort::redispatch(dispatchKeySet, self, dim, descending);
+    }
+    
+    // aten::sort.stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices)
+    inline ::std::tuple<at::Tensor,at::Tensor> sort(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<bool> stable, int64_t dim=-1, bool descending=false) {
+        return at::_ops::sort_stable::redispatch(dispatchKeySet, self, stable, dim, descending);
+    }
+    
+    // aten::sort.dimname_values(Tensor self, Dimname dim, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> sort_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & values, at::Tensor & indices, const at::Tensor & self, at::Dimname dim, bool descending=false) {
+        return at::_ops::sort_dimname_values::redispatch(dispatchKeySet, self, dim, descending, values, indices);
+    }
+    
+    // aten::sort.dimname_values(Tensor self, Dimname dim, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> sort_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool descending, at::Tensor & values, at::Tensor & indices) {
+        return at::_ops::sort_dimname_values::redispatch(dispatchKeySet, self, dim, descending, values, indices);
+    }
+    
+    // aten::sort.dimname_values_stable(Tensor self, *, bool? stable, Dimname dim, bool descending=False, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> sort_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & values, at::Tensor & indices, const at::Tensor & self, c10::optional<bool> stable, at::Dimname dim, bool descending=false) {
+        return at::_ops::sort_dimname_values_stable::redispatch(dispatchKeySet, self, stable, dim, descending, values, indices);
+    }
+    
+    // aten::sort.dimname_values_stable(Tensor self, *, bool? stable, Dimname dim, bool descending=False, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> sort_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<bool> stable, at::Dimname dim, bool descending, at::Tensor & values, at::Tensor & indices) {
+        return at::_ops::sort_dimname_values_stable::redispatch(dispatchKeySet, self, stable, dim, descending, values, indices);
+    }
+    
+    // aten::sort.dimname(Tensor self, Dimname dim, bool descending=False) -> (Tensor values, Tensor indices)
+    inline ::std::tuple<at::Tensor,at::Tensor> sort(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool descending=false) {
+        return at::_ops::sort_dimname::redispatch(dispatchKeySet, self, dim, descending);
+    }
+    
+    // aten::sort.dimname_stable(Tensor self, *, bool? stable, Dimname dim, bool descending=False) -> (Tensor values, Tensor indices)
+    inline ::std::tuple<at::Tensor,at::Tensor> sort(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<bool> stable, at::Dimname dim, bool descending=false) {
+        return at::_ops::sort_dimname_stable::redispatch(dispatchKeySet, self, stable, dim, descending);
+    }
+    
+    // aten::msort.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & msort_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::msort_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::msort.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & msort_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::msort_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::msort(Tensor self) -> Tensor
+    inline at::Tensor msort(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::msort::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::argsort(Tensor self, int dim=-1, bool descending=False) -> Tensor
+    inline at::Tensor argsort(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim=-1, bool descending=false) {
+        return at::_ops::argsort::redispatch(dispatchKeySet, self, dim, descending);
+    }
+    
+    // aten::argsort.stable(Tensor self, *, bool stable, int dim=-1, bool descending=False) -> Tensor
+    inline at::Tensor argsort(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool stable, int64_t dim=-1, bool descending=false) {
+        return at::_ops::argsort_stable::redispatch(dispatchKeySet, self, stable, dim, descending);
+    }
+    
+    // aten::argsort.dimname(Tensor self, Dimname dim, bool descending=False) -> Tensor
+    inline at::Tensor argsort(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool descending=false) {
+        return at::_ops::argsort_dimname::redispatch(dispatchKeySet, self, dim, descending);
+    }
+    
+    // aten::topk.values(Tensor self, SymInt k, int dim=-1, bool largest=True, bool sorted=True, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> topk_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & values, at::Tensor & indices, const at::Tensor & self, int64_t k, int64_t dim=-1, bool largest=true, bool sorted=true) {
+        return at::_ops::topk_values::redispatch(dispatchKeySet, self, k, dim, largest, sorted, values, indices);
+    }
+    
+    // aten::topk.values(Tensor self, SymInt k, int dim=-1, bool largest=True, bool sorted=True, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> topk_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t k, int64_t dim, bool largest, bool sorted, at::Tensor & values, at::Tensor & indices) {
+        return at::_ops::topk_values::redispatch(dispatchKeySet, self, k, dim, largest, sorted, values, indices);
+    }
+    
+    // aten::topk.values(Tensor self, SymInt k, int dim=-1, bool largest=True, bool sorted=True, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> topk_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & values, at::Tensor & indices, const at::Tensor & self, c10::SymInt k, int64_t dim=-1, bool largest=true, bool sorted=true) {
+        return at::_ops::topk_values::redispatch(dispatchKeySet, self, k, dim, largest, sorted, values, indices);
+    }
+    
+    // aten::topk.values(Tensor self, SymInt k, int dim=-1, bool largest=True, bool sorted=True, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> topk_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt k, int64_t dim, bool largest, bool sorted, at::Tensor & values, at::Tensor & indices) {
+        return at::_ops::topk_values::redispatch(dispatchKeySet, self, k, dim, largest, sorted, values, indices);
+    }
+    
+    // aten::topk(Tensor self, SymInt k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices)
+    inline ::std::tuple<at::Tensor,at::Tensor> topk(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t k, int64_t dim=-1, bool largest=true, bool sorted=true) {
+        return at::_ops::topk::redispatch(dispatchKeySet, self, k, dim, largest, sorted);
+    }
+    
+    // aten::topk(Tensor self, SymInt k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices)
+    inline ::std::tuple<at::Tensor,at::Tensor> topk_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt k, int64_t dim=-1, bool largest=true, bool sorted=true) {
+        return at::_ops::topk::redispatch(dispatchKeySet, self, k, dim, largest, sorted);
+    }
+    
+    // aten::all(Tensor self) -> Tensor
+    inline at::Tensor all(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::all::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::all.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & all_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::all_all_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::all.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & all_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::all_all_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::any(Tensor self) -> Tensor
+    inline at::Tensor any(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::any::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::any.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & any_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::any_all_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::any.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & any_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::any_all_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::renorm.out(Tensor self, Scalar p, int dim, Scalar maxnorm, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & renorm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & p, int64_t dim, const at::Scalar & maxnorm) {
+        return at::_ops::renorm_out::redispatch(dispatchKeySet, self, p, dim, maxnorm, out);
+    }
+    
+    // aten::renorm.out(Tensor self, Scalar p, int dim, Scalar maxnorm, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & renorm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & p, int64_t dim, const at::Scalar & maxnorm, at::Tensor & out) {
+        return at::_ops::renorm_out::redispatch(dispatchKeySet, self, p, dim, maxnorm, out);
+    }
+    
+    // aten::renorm(Tensor self, Scalar p, int dim, Scalar maxnorm) -> Tensor
+    inline at::Tensor renorm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & p, int64_t dim, const at::Scalar & maxnorm) {
+        return at::_ops::renorm::redispatch(dispatchKeySet, self, p, dim, maxnorm);
+    }
+    
+    // aten::renorm_(Tensor(a!) self, Scalar p, int dim, Scalar maxnorm) -> Tensor(a!)
+    inline at::Tensor & renorm_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & p, int64_t dim, const at::Scalar & maxnorm) {
+        return at::_ops::renorm_::redispatch(dispatchKeySet, self, p, dim, maxnorm);
+    }
+    
+    // aten::unfold(Tensor(a) self, int dimension, int size, int step) -> Tensor(a)
+    inline at::Tensor unfold(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dimension, int64_t size, int64_t step) {
+        return at::_ops::unfold::redispatch(dispatchKeySet, self, dimension, size, step);
+    }
+    
+    // aten::unfold_backward(Tensor grad_in, SymInt[] input_sizes, int dim, int size, int step) -> Tensor
+    inline at::Tensor unfold_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_in, at::IntArrayRef input_sizes, int64_t dim, int64_t size, int64_t step) {
+        return at::_ops::unfold_backward::redispatch(dispatchKeySet, grad_in, c10::fromIntArrayRefSlow(input_sizes), dim, size, step);
+    }
+    
+    // aten::unfold_backward(Tensor grad_in, SymInt[] input_sizes, int dim, int size, int step) -> Tensor
+    inline at::Tensor unfold_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_in, c10::SymIntArrayRef input_sizes, int64_t dim, int64_t size, int64_t step) {
+        return at::_ops::unfold_backward::redispatch(dispatchKeySet, grad_in, input_sizes, dim, size, step);
+    }
+    
+    // aten::equal(Tensor self, Tensor other) -> bool
+    inline bool equal(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::equal::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::pow.Tensor_Tensor_out(Tensor self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & pow_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & exponent) {
+        return at::_ops::pow_Tensor_Tensor_out::redispatch(dispatchKeySet, self, exponent, out);
+    }
+    
+    // aten::pow.Tensor_Tensor_out(Tensor self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & pow_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & exponent, at::Tensor & out) {
+        return at::_ops::pow_Tensor_Tensor_out::redispatch(dispatchKeySet, self, exponent, out);
+    }
+    
+    // aten::pow.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor
+    inline at::Tensor pow(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & exponent) {
+        return at::_ops::pow_Tensor_Tensor::redispatch(dispatchKeySet, self, exponent);
+    }
+    
+    // aten::pow.Scalar_out(Scalar self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & pow_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & self, const at::Tensor & exponent) {
+        return at::_ops::pow_Scalar_out::redispatch(dispatchKeySet, self, exponent, out);
+    }
+    
+    // aten::pow.Scalar_out(Scalar self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & pow_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & exponent, at::Tensor & out) {
+        return at::_ops::pow_Scalar_out::redispatch(dispatchKeySet, self, exponent, out);
+    }
+    
+    // aten::pow.Scalar(Scalar self, Tensor exponent) -> Tensor
+    inline at::Tensor pow(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & exponent) {
+        return at::_ops::pow_Scalar::redispatch(dispatchKeySet, self, exponent);
+    }
+    
+    // aten::pow.Tensor_Scalar_out(Tensor self, Scalar exponent, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & pow_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & exponent) {
+        return at::_ops::pow_Tensor_Scalar_out::redispatch(dispatchKeySet, self, exponent, out);
+    }
+    
+    // aten::pow.Tensor_Scalar_out(Tensor self, Scalar exponent, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & pow_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & exponent, at::Tensor & out) {
+        return at::_ops::pow_Tensor_Scalar_out::redispatch(dispatchKeySet, self, exponent, out);
+    }
+    
+    // aten::pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor
+    inline at::Tensor pow(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & exponent) {
+        return at::_ops::pow_Tensor_Scalar::redispatch(dispatchKeySet, self, exponent);
+    }
+    
+    // aten::pow_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!)
+    inline at::Tensor & pow_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & exponent) {
+        return at::_ops::pow__Scalar::redispatch(dispatchKeySet, self, exponent);
+    }
+    
+    // aten::pow_.Tensor(Tensor(a!) self, Tensor exponent) -> Tensor(a!)
+    inline at::Tensor & pow_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & exponent) {
+        return at::_ops::pow__Tensor::redispatch(dispatchKeySet, self, exponent);
+    }
+    
+    // aten::float_power.Tensor_Tensor_out(Tensor self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & float_power_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & exponent) {
+        return at::_ops::float_power_Tensor_Tensor_out::redispatch(dispatchKeySet, self, exponent, out);
+    }
+    
+    // aten::float_power.Tensor_Tensor_out(Tensor self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & float_power_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & exponent, at::Tensor & out) {
+        return at::_ops::float_power_Tensor_Tensor_out::redispatch(dispatchKeySet, self, exponent, out);
+    }
+    
+    // aten::float_power.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor
+    inline at::Tensor float_power(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & exponent) {
+        return at::_ops::float_power_Tensor_Tensor::redispatch(dispatchKeySet, self, exponent);
+    }
+    
+    // aten::float_power.Scalar_out(Scalar self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & float_power_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & self, const at::Tensor & exponent) {
+        return at::_ops::float_power_Scalar_out::redispatch(dispatchKeySet, self, exponent, out);
+    }
+    
+    // aten::float_power.Scalar_out(Scalar self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & float_power_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & exponent, at::Tensor & out) {
+        return at::_ops::float_power_Scalar_out::redispatch(dispatchKeySet, self, exponent, out);
+    }
+    
+    // aten::float_power.Scalar(Scalar self, Tensor exponent) -> Tensor
+    inline at::Tensor float_power(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & exponent) {
+        return at::_ops::float_power_Scalar::redispatch(dispatchKeySet, self, exponent);
+    }
+    
+    // aten::float_power.Tensor_Scalar_out(Tensor self, Scalar exponent, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & float_power_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & exponent) {
+        return at::_ops::float_power_Tensor_Scalar_out::redispatch(dispatchKeySet, self, exponent, out);
+    }
+    
+    // aten::float_power.Tensor_Scalar_out(Tensor self, Scalar exponent, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & float_power_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & exponent, at::Tensor & out) {
+        return at::_ops::float_power_Tensor_Scalar_out::redispatch(dispatchKeySet, self, exponent, out);
+    }
+    
+    // aten::float_power.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor
+    inline at::Tensor float_power(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & exponent) {
+        return at::_ops::float_power_Tensor_Scalar::redispatch(dispatchKeySet, self, exponent);
+    }
+    
+    // aten::float_power_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!)
+    inline at::Tensor & float_power_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & exponent) {
+        return at::_ops::float_power__Scalar::redispatch(dispatchKeySet, self, exponent);
+    }
+    
+    // aten::float_power_.Tensor(Tensor(a!) self, Tensor exponent) -> Tensor(a!)
+    inline at::Tensor & float_power_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & exponent) {
+        return at::_ops::float_power__Tensor::redispatch(dispatchKeySet, self, exponent);
+    }
+    
+    // aten::normal_(Tensor(a!) self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor(a!)
+    inline at::Tensor & normal_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, double mean=0, double std=1, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::normal_::redispatch(dispatchKeySet, self, mean, std, generator);
+    }
+    
+    // aten::normal_functional(Tensor self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor
+    inline at::Tensor normal_functional(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double mean=0, double std=1, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::normal_functional::redispatch(dispatchKeySet, self, mean, std, generator);
+    }
+    
+    // aten::normal.Tensor_float_out(Tensor mean, float std=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & normal_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & mean, double std=1, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::normal_Tensor_float_out::redispatch(dispatchKeySet, mean, std, generator, out);
+    }
+    
+    // aten::normal.Tensor_float_out(Tensor mean, float std=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & normal_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & mean, double std, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::normal_Tensor_float_out::redispatch(dispatchKeySet, mean, std, generator, out);
+    }
+    
+    // aten::normal.Tensor_float(Tensor mean, float std=1, *, Generator? generator=None) -> Tensor
+    inline at::Tensor normal(c10::DispatchKeySet dispatchKeySet, const at::Tensor & mean, double std=1, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::normal_Tensor_float::redispatch(dispatchKeySet, mean, std, generator);
+    }
+    
+    // aten::normal.float_Tensor_out(float mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & normal_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, double mean, const at::Tensor & std, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::normal_float_Tensor_out::redispatch(dispatchKeySet, mean, std, generator, out);
+    }
+    
+    // aten::normal.float_Tensor_out(float mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & normal_outf(c10::DispatchKeySet dispatchKeySet, double mean, const at::Tensor & std, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::normal_float_Tensor_out::redispatch(dispatchKeySet, mean, std, generator, out);
+    }
+    
+    // aten::normal.float_Tensor(float mean, Tensor std, *, Generator? generator=None) -> Tensor
+    inline at::Tensor normal(c10::DispatchKeySet dispatchKeySet, double mean, const at::Tensor & std, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::normal_float_Tensor::redispatch(dispatchKeySet, mean, std, generator);
+    }
+    
+    // aten::normal.Tensor_Tensor_out(Tensor mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & normal_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & mean, const at::Tensor & std, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::normal_Tensor_Tensor_out::redispatch(dispatchKeySet, mean, std, generator, out);
+    }
+    
+    // aten::normal.Tensor_Tensor_out(Tensor mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & normal_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & mean, const at::Tensor & std, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::normal_Tensor_Tensor_out::redispatch(dispatchKeySet, mean, std, generator, out);
+    }
+    
+    // aten::normal.Tensor_Tensor(Tensor mean, Tensor std, *, Generator? generator=None) -> Tensor
+    inline at::Tensor normal(c10::DispatchKeySet dispatchKeySet, const at::Tensor & mean, const at::Tensor & std, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::normal_Tensor_Tensor::redispatch(dispatchKeySet, mean, std, generator);
+    }
+    
+    // aten::normal.float_float(float mean, float std, SymInt[] size, *, Generator? generator=None, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor normal(c10::DispatchKeySet dispatchKeySet, double mean, double std, at::IntArrayRef size, c10::optional<at::Generator> generator=c10::nullopt, at::TensorOptions options={}) {
+        return at::_ops::normal_float_float::redispatch(dispatchKeySet, mean, std, c10::fromIntArrayRefSlow(size), generator, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::normal.float_float(float mean, float std, SymInt[] size, *, Generator? generator=None, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor normal(c10::DispatchKeySet dispatchKeySet, double mean, double std, at::IntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::normal_float_float::redispatch(dispatchKeySet, mean, std, c10::fromIntArrayRefSlow(size), generator, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::normal.float_float(float mean, float std, SymInt[] size, *, Generator? generator=None, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor normal_symint(c10::DispatchKeySet dispatchKeySet, double mean, double std, c10::SymIntArrayRef size, c10::optional<at::Generator> generator=c10::nullopt, at::TensorOptions options={}) {
+        return at::_ops::normal_float_float::redispatch(dispatchKeySet, mean, std, size, generator, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::normal.float_float(float mean, float std, SymInt[] size, *, Generator? generator=None, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor normal_symint(c10::DispatchKeySet dispatchKeySet, double mean, double std, c10::SymIntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::normal_float_float::redispatch(dispatchKeySet, mean, std, size, generator, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::normal.float_float_out(float mean, float std, SymInt[] size, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & normal_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, double mean, double std, at::IntArrayRef size, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::normal_float_float_out::redispatch(dispatchKeySet, mean, std, c10::fromIntArrayRefSlow(size), generator, out);
+    }
+    
+    // aten::normal.float_float_out(float mean, float std, SymInt[] size, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & normal_outf(c10::DispatchKeySet dispatchKeySet, double mean, double std, at::IntArrayRef size, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::normal_float_float_out::redispatch(dispatchKeySet, mean, std, c10::fromIntArrayRefSlow(size), generator, out);
+    }
+    
+    // aten::normal.float_float_out(float mean, float std, SymInt[] size, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & normal_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, double mean, double std, c10::SymIntArrayRef size, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::normal_float_float_out::redispatch(dispatchKeySet, mean, std, size, generator, out);
+    }
+    
+    // aten::normal.float_float_out(float mean, float std, SymInt[] size, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & normal_symint_outf(c10::DispatchKeySet dispatchKeySet, double mean, double std, c10::SymIntArrayRef size, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::normal_float_float_out::redispatch(dispatchKeySet, mean, std, size, generator, out);
+    }
+    
+    // aten::alias(Tensor(a) self) -> Tensor(a)
+    inline at::Tensor alias(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::alias::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_amp_foreach_non_finite_check_and_unscale_(Tensor(a!)[] self, Tensor(b!) found_inf, Tensor inv_scale) -> ()
+    inline void _amp_foreach_non_finite_check_and_unscale_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::Tensor & found_inf, const at::Tensor & inv_scale) {
+        return at::_ops::_amp_foreach_non_finite_check_and_unscale_::redispatch(dispatchKeySet, self, found_inf, inv_scale);
+    }
+    
+    // aten::_amp_update_scale_(Tensor(a!) self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor(a!)
+    inline at::Tensor & _amp_update_scale_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, at::Tensor & growth_tracker, const at::Tensor & found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval) {
+        return at::_ops::_amp_update_scale_::redispatch(dispatchKeySet, self, growth_tracker, found_inf, scale_growth_factor, scale_backoff_factor, growth_interval);
+    }
+    
+    // aten::_foreach_add.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_add(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Scalar & scalar) {
+        return at::_ops::_foreach_add_Scalar::redispatch(dispatchKeySet, self, scalar);
+    }
+    
+    // aten::_foreach_add_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
+    inline void _foreach_add_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Scalar & scalar) {
+        return at::_ops::_foreach_add__Scalar::redispatch(dispatchKeySet, self, scalar);
+    }
+    
+    // aten::_foreach_add.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_add(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList other, const at::Scalar & alpha=1) {
+        return at::_ops::_foreach_add_List::redispatch(dispatchKeySet, self, other, alpha);
+    }
+    
+    // aten::_foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> ()
+    inline void _foreach_add_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList other, const at::Scalar & alpha=1) {
+        return at::_ops::_foreach_add__List::redispatch(dispatchKeySet, self, other, alpha);
+    }
+    
+    // aten::_foreach_add.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_add(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::ArrayRef<at::Scalar> scalars) {
+        return at::_ops::_foreach_add_ScalarList::redispatch(dispatchKeySet, self, scalars);
+    }
+    
+    // aten::_foreach_add_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
+    inline void _foreach_add_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::ArrayRef<at::Scalar> scalars) {
+        return at::_ops::_foreach_add__ScalarList::redispatch(dispatchKeySet, self, scalars);
+    }
+    
+    // aten::_foreach_add.Tensor(Tensor[] self, Tensor other, *, Scalar alpha=1) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_add(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Tensor & other, const at::Scalar & alpha=1) {
+        return at::_ops::_foreach_add_Tensor::redispatch(dispatchKeySet, self, other, alpha);
+    }
+    
+    // aten::_foreach_add_.Tensor(Tensor(a!)[] self, Tensor other, *, Scalar alpha=1) -> ()
+    inline void _foreach_add_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Tensor & other, const at::Scalar & alpha=1) {
+        return at::_ops::_foreach_add__Tensor::redispatch(dispatchKeySet, self, other, alpha);
+    }
+    
+    // aten::_foreach_sub.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_sub(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Scalar & scalar) {
+        return at::_ops::_foreach_sub_Scalar::redispatch(dispatchKeySet, self, scalar);
+    }
+    
+    // aten::_foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
+    inline void _foreach_sub_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Scalar & scalar) {
+        return at::_ops::_foreach_sub__Scalar::redispatch(dispatchKeySet, self, scalar);
+    }
+    
+    // aten::_foreach_sub.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_sub(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList other, const at::Scalar & alpha=1) {
+        return at::_ops::_foreach_sub_List::redispatch(dispatchKeySet, self, other, alpha);
+    }
+    
+    // aten::_foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> ()
+    inline void _foreach_sub_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList other, const at::Scalar & alpha=1) {
+        return at::_ops::_foreach_sub__List::redispatch(dispatchKeySet, self, other, alpha);
+    }
+    
+    // aten::_foreach_sub.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_sub(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::ArrayRef<at::Scalar> scalars) {
+        return at::_ops::_foreach_sub_ScalarList::redispatch(dispatchKeySet, self, scalars);
+    }
+    
+    // aten::_foreach_sub_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
+    inline void _foreach_sub_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::ArrayRef<at::Scalar> scalars) {
+        return at::_ops::_foreach_sub__ScalarList::redispatch(dispatchKeySet, self, scalars);
+    }
+    
+    // aten::_foreach_mul.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_mul(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Scalar & scalar) {
+        return at::_ops::_foreach_mul_Scalar::redispatch(dispatchKeySet, self, scalar);
+    }
+    
+    // aten::_foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
+    inline void _foreach_mul_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Scalar & scalar) {
+        return at::_ops::_foreach_mul__Scalar::redispatch(dispatchKeySet, self, scalar);
+    }
+    
+    // aten::_foreach_mul.List(Tensor[] self, Tensor[] other) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_mul(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList other) {
+        return at::_ops::_foreach_mul_List::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::_foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> ()
+    inline void _foreach_mul_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList other) {
+        return at::_ops::_foreach_mul__List::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::_foreach_mul.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_mul(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::ArrayRef<at::Scalar> scalars) {
+        return at::_ops::_foreach_mul_ScalarList::redispatch(dispatchKeySet, self, scalars);
+    }
+    
+    // aten::_foreach_mul_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
+    inline void _foreach_mul_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::ArrayRef<at::Scalar> scalars) {
+        return at::_ops::_foreach_mul__ScalarList::redispatch(dispatchKeySet, self, scalars);
+    }
+    
+    // aten::_foreach_mul.Tensor(Tensor[] self, Tensor other) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_mul(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Tensor & other) {
+        return at::_ops::_foreach_mul_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::_foreach_mul_.Tensor(Tensor(a!)[] self, Tensor other) -> ()
+    inline void _foreach_mul_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Tensor & other) {
+        return at::_ops::_foreach_mul__Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::_foreach_div.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_div(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Scalar & scalar) {
+        return at::_ops::_foreach_div_Scalar::redispatch(dispatchKeySet, self, scalar);
+    }
+    
+    // aten::_foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
+    inline void _foreach_div_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Scalar & scalar) {
+        return at::_ops::_foreach_div__Scalar::redispatch(dispatchKeySet, self, scalar);
+    }
+    
+    // aten::_foreach_div.List(Tensor[] self, Tensor[] other) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_div(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList other) {
+        return at::_ops::_foreach_div_List::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::_foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> ()
+    inline void _foreach_div_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList other) {
+        return at::_ops::_foreach_div__List::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::_foreach_div.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_div(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::ArrayRef<at::Scalar> scalars) {
+        return at::_ops::_foreach_div_ScalarList::redispatch(dispatchKeySet, self, scalars);
+    }
+    
+    // aten::_foreach_div_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
+    inline void _foreach_div_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::ArrayRef<at::Scalar> scalars) {
+        return at::_ops::_foreach_div__ScalarList::redispatch(dispatchKeySet, self, scalars);
+    }
+    
+    // aten::_foreach_div.Tensor(Tensor[] self, Tensor other) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_div(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Tensor & other) {
+        return at::_ops::_foreach_div_Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::_foreach_div_.Tensor(Tensor(a!)[] self, Tensor other) -> ()
+    inline void _foreach_div_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Tensor & other) {
+        return at::_ops::_foreach_div__Tensor::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::_foreach_clamp_max.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_clamp_max(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Scalar & scalar) {
+        return at::_ops::_foreach_clamp_max_Scalar::redispatch(dispatchKeySet, self, scalar);
+    }
+    
+    // aten::_foreach_clamp_max_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
+    inline void _foreach_clamp_max_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Scalar & scalar) {
+        return at::_ops::_foreach_clamp_max__Scalar::redispatch(dispatchKeySet, self, scalar);
+    }
+    
+    // aten::_foreach_clamp_max.List(Tensor[] self, Tensor[] other) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_clamp_max(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList other) {
+        return at::_ops::_foreach_clamp_max_List::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::_foreach_clamp_max_.List(Tensor(a!)[] self, Tensor[] other) -> ()
+    inline void _foreach_clamp_max_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList other) {
+        return at::_ops::_foreach_clamp_max__List::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::_foreach_clamp_max.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_clamp_max(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::ArrayRef<at::Scalar> scalars) {
+        return at::_ops::_foreach_clamp_max_ScalarList::redispatch(dispatchKeySet, self, scalars);
+    }
+    
+    // aten::_foreach_clamp_max_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
+    inline void _foreach_clamp_max_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::ArrayRef<at::Scalar> scalars) {
+        return at::_ops::_foreach_clamp_max__ScalarList::redispatch(dispatchKeySet, self, scalars);
+    }
+    
+    // aten::_foreach_clamp_min.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_clamp_min(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Scalar & scalar) {
+        return at::_ops::_foreach_clamp_min_Scalar::redispatch(dispatchKeySet, self, scalar);
+    }
+    
+    // aten::_foreach_clamp_min_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
+    inline void _foreach_clamp_min_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Scalar & scalar) {
+        return at::_ops::_foreach_clamp_min__Scalar::redispatch(dispatchKeySet, self, scalar);
+    }
+    
+    // aten::_foreach_clamp_min.List(Tensor[] self, Tensor[] other) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_clamp_min(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList other) {
+        return at::_ops::_foreach_clamp_min_List::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::_foreach_clamp_min_.List(Tensor(a!)[] self, Tensor[] other) -> ()
+    inline void _foreach_clamp_min_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList other) {
+        return at::_ops::_foreach_clamp_min__List::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::_foreach_clamp_min.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_clamp_min(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::ArrayRef<at::Scalar> scalars) {
+        return at::_ops::_foreach_clamp_min_ScalarList::redispatch(dispatchKeySet, self, scalars);
+    }
+    
+    // aten::_foreach_clamp_min_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
+    inline void _foreach_clamp_min_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::ArrayRef<at::Scalar> scalars) {
+        return at::_ops::_foreach_clamp_min__ScalarList::redispatch(dispatchKeySet, self, scalars);
+    }
+    
+    // aten::_foreach_maximum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_maximum(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Scalar & scalar) {
+        return at::_ops::_foreach_maximum_Scalar::redispatch(dispatchKeySet, self, scalar);
+    }
+    
+    // aten::_foreach_maximum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
+    inline void _foreach_maximum_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Scalar & scalar) {
+        return at::_ops::_foreach_maximum__Scalar::redispatch(dispatchKeySet, self, scalar);
+    }
+    
+    // aten::_foreach_maximum.List(Tensor[] self, Tensor[] other) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_maximum(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList other) {
+        return at::_ops::_foreach_maximum_List::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::_foreach_maximum_.List(Tensor(a!)[] self, Tensor[] other) -> ()
+    inline void _foreach_maximum_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList other) {
+        return at::_ops::_foreach_maximum__List::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::_foreach_maximum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_maximum(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::ArrayRef<at::Scalar> scalars) {
+        return at::_ops::_foreach_maximum_ScalarList::redispatch(dispatchKeySet, self, scalars);
+    }
+    
+    // aten::_foreach_maximum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
+    inline void _foreach_maximum_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::ArrayRef<at::Scalar> scalars) {
+        return at::_ops::_foreach_maximum__ScalarList::redispatch(dispatchKeySet, self, scalars);
+    }
+    
+    // aten::_foreach_minimum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_minimum(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Scalar & scalar) {
+        return at::_ops::_foreach_minimum_Scalar::redispatch(dispatchKeySet, self, scalar);
+    }
+    
+    // aten::_foreach_minimum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
+    inline void _foreach_minimum_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Scalar & scalar) {
+        return at::_ops::_foreach_minimum__Scalar::redispatch(dispatchKeySet, self, scalar);
+    }
+    
+    // aten::_foreach_minimum.List(Tensor[] self, Tensor[] other) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_minimum(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList other) {
+        return at::_ops::_foreach_minimum_List::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::_foreach_minimum_.List(Tensor(a!)[] self, Tensor[] other) -> ()
+    inline void _foreach_minimum_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList other) {
+        return at::_ops::_foreach_minimum__List::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::_foreach_minimum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_minimum(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::ArrayRef<at::Scalar> scalars) {
+        return at::_ops::_foreach_minimum_ScalarList::redispatch(dispatchKeySet, self, scalars);
+    }
+    
+    // aten::_foreach_minimum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
+    inline void _foreach_minimum_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::ArrayRef<at::Scalar> scalars) {
+        return at::_ops::_foreach_minimum__ScalarList::redispatch(dispatchKeySet, self, scalars);
+    }
+    
+    // aten::_foreach_addcdiv.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_addcdiv(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Scalar & value=1) {
+        return at::_ops::_foreach_addcdiv_Scalar::redispatch(dispatchKeySet, self, tensor1, tensor2, value);
+    }
+    
+    // aten::_foreach_addcdiv.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_addcdiv(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, at::ArrayRef<at::Scalar> scalars) {
+        return at::_ops::_foreach_addcdiv_ScalarList::redispatch(dispatchKeySet, self, tensor1, tensor2, scalars);
+    }
+    
+    // aten::_foreach_addcdiv.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_addcdiv(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Tensor & scalars) {
+        return at::_ops::_foreach_addcdiv_Tensor::redispatch(dispatchKeySet, self, tensor1, tensor2, scalars);
+    }
+    
+    // aten::_foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()
+    inline void _foreach_addcdiv_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Scalar & value=1) {
+        return at::_ops::_foreach_addcdiv__Scalar::redispatch(dispatchKeySet, self, tensor1, tensor2, value);
+    }
+    
+    // aten::_foreach_addcdiv_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> ()
+    inline void _foreach_addcdiv_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, at::ArrayRef<at::Scalar> scalars) {
+        return at::_ops::_foreach_addcdiv__ScalarList::redispatch(dispatchKeySet, self, tensor1, tensor2, scalars);
+    }
+    
+    // aten::_foreach_addcdiv_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> ()
+    inline void _foreach_addcdiv_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Tensor & scalars) {
+        return at::_ops::_foreach_addcdiv__Tensor::redispatch(dispatchKeySet, self, tensor1, tensor2, scalars);
+    }
+    
+    // aten::_foreach_addcmul.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_addcmul(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Scalar & value=1) {
+        return at::_ops::_foreach_addcmul_Scalar::redispatch(dispatchKeySet, self, tensor1, tensor2, value);
+    }
+    
+    // aten::_foreach_addcmul.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_addcmul(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, at::ArrayRef<at::Scalar> scalars) {
+        return at::_ops::_foreach_addcmul_ScalarList::redispatch(dispatchKeySet, self, tensor1, tensor2, scalars);
+    }
+    
+    // aten::_foreach_addcmul.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_addcmul(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Tensor & scalars) {
+        return at::_ops::_foreach_addcmul_Tensor::redispatch(dispatchKeySet, self, tensor1, tensor2, scalars);
+    }
+    
+    // aten::_foreach_addcmul_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()
+    inline void _foreach_addcmul_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Scalar & value=1) {
+        return at::_ops::_foreach_addcmul__Scalar::redispatch(dispatchKeySet, self, tensor1, tensor2, value);
+    }
+    
+    // aten::_foreach_addcmul_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> ()
+    inline void _foreach_addcmul_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, at::ArrayRef<at::Scalar> scalars) {
+        return at::_ops::_foreach_addcmul__ScalarList::redispatch(dispatchKeySet, self, tensor1, tensor2, scalars);
+    }
+    
+    // aten::_foreach_addcmul_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> ()
+    inline void _foreach_addcmul_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Tensor & scalars) {
+        return at::_ops::_foreach_addcmul__Tensor::redispatch(dispatchKeySet, self, tensor1, tensor2, scalars);
+    }
+    
+    // aten::_foreach_abs(Tensor[] self) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_abs(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_abs::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_abs_(Tensor(a!)[] self) -> ()
+    inline void _foreach_abs_(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_abs_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_acos(Tensor[] self) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_acos(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_acos::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_acos_(Tensor(a!)[] self) -> ()
+    inline void _foreach_acos_(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_acos_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_asin(Tensor[] self) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_asin(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_asin::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_asin_(Tensor(a!)[] self) -> ()
+    inline void _foreach_asin_(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_asin_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_atan(Tensor[] self) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_atan(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_atan::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_atan_(Tensor(a!)[] self) -> ()
+    inline void _foreach_atan_(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_atan_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_ceil(Tensor[] self) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_ceil(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_ceil::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_ceil_(Tensor(a!)[] self) -> ()
+    inline void _foreach_ceil_(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_ceil_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_cos(Tensor[] self) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_cos(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_cos::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_cos_(Tensor(a!)[] self) -> ()
+    inline void _foreach_cos_(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_cos_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_cosh(Tensor[] self) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_cosh(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_cosh::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_cosh_(Tensor(a!)[] self) -> ()
+    inline void _foreach_cosh_(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_cosh_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_erf(Tensor[] self) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_erf(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_erf::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_erf_(Tensor(a!)[] self) -> ()
+    inline void _foreach_erf_(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_erf_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_erfc(Tensor[] self) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_erfc(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_erfc::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_erfc_(Tensor(a!)[] self) -> ()
+    inline void _foreach_erfc_(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_erfc_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_exp(Tensor[] self) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_exp(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_exp::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_exp_(Tensor(a!)[] self) -> ()
+    inline void _foreach_exp_(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_exp_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_expm1(Tensor[] self) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_expm1(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_expm1::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_expm1_(Tensor(a!)[] self) -> ()
+    inline void _foreach_expm1_(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_expm1_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_floor(Tensor[] self) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_floor(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_floor::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_floor_(Tensor(a!)[] self) -> ()
+    inline void _foreach_floor_(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_floor_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_frac(Tensor[] self) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_frac(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_frac::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_frac_(Tensor(a!)[] self) -> ()
+    inline void _foreach_frac_(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_frac_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_lerp.List(Tensor[] self, Tensor[] tensors1, Tensor[] weights) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_lerp(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensors1, at::TensorList weights) {
+        return at::_ops::_foreach_lerp_List::redispatch(dispatchKeySet, self, tensors1, weights);
+    }
+    
+    // aten::_foreach_lerp_.List(Tensor(a!)[] self, Tensor[] tensors1, Tensor[] weights) -> ()
+    inline void _foreach_lerp_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensors1, at::TensorList weights) {
+        return at::_ops::_foreach_lerp__List::redispatch(dispatchKeySet, self, tensors1, weights);
+    }
+    
+    // aten::_foreach_lerp.Scalar(Tensor[] self, Tensor[] tensors1, Scalar weight) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_lerp(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensors1, const at::Scalar & weight) {
+        return at::_ops::_foreach_lerp_Scalar::redispatch(dispatchKeySet, self, tensors1, weight);
+    }
+    
+    // aten::_foreach_lerp_.Scalar(Tensor(a!)[] self, Tensor[] tensors1, Scalar weight) -> ()
+    inline void _foreach_lerp_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensors1, const at::Scalar & weight) {
+        return at::_ops::_foreach_lerp__Scalar::redispatch(dispatchKeySet, self, tensors1, weight);
+    }
+    
+    // aten::_foreach_lgamma(Tensor[] self) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_lgamma(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_lgamma::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_lgamma_(Tensor(a!)[] self) -> ()
+    inline void _foreach_lgamma_(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_lgamma_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_log(Tensor[] self) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_log(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_log::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_log_(Tensor(a!)[] self) -> ()
+    inline void _foreach_log_(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_log_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_log10(Tensor[] self) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_log10(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_log10::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_log10_(Tensor(a!)[] self) -> ()
+    inline void _foreach_log10_(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_log10_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_log1p(Tensor[] self) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_log1p(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_log1p::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_log1p_(Tensor(a!)[] self) -> ()
+    inline void _foreach_log1p_(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_log1p_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_log2(Tensor[] self) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_log2(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_log2::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_log2_(Tensor(a!)[] self) -> ()
+    inline void _foreach_log2_(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_log2_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_neg(Tensor[] self) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_neg(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_neg::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_neg_(Tensor(a!)[] self) -> ()
+    inline void _foreach_neg_(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_neg_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_norm.Scalar(Tensor[] self, Scalar ord=2) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_norm(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Scalar & ord=2) {
+        return at::_ops::_foreach_norm_Scalar::redispatch(dispatchKeySet, self, ord);
+    }
+    
+    // aten::_foreach_pow.List(Tensor[] self, Tensor[] exponent) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_pow(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList exponent) {
+        return at::_ops::_foreach_pow_List::redispatch(dispatchKeySet, self, exponent);
+    }
+    
+    // aten::_foreach_pow.Scalar(Tensor[] self, Scalar exponent) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_pow(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Scalar & exponent) {
+        return at::_ops::_foreach_pow_Scalar::redispatch(dispatchKeySet, self, exponent);
+    }
+    
+    // aten::_foreach_pow.ScalarList(Tensor[] self, Scalar[] exponent) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_pow(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::ArrayRef<at::Scalar> exponent) {
+        return at::_ops::_foreach_pow_ScalarList::redispatch(dispatchKeySet, self, exponent);
+    }
+    
+    // aten::_foreach_pow.ScalarAndTensor(Scalar self, Tensor[] exponent) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_pow(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, at::TensorList exponent) {
+        return at::_ops::_foreach_pow_ScalarAndTensor::redispatch(dispatchKeySet, self, exponent);
+    }
+    
+    // aten::_foreach_pow_.List(Tensor(a!)[] self, Tensor[] exponent) -> ()
+    inline void _foreach_pow_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList exponent) {
+        return at::_ops::_foreach_pow__List::redispatch(dispatchKeySet, self, exponent);
+    }
+    
+    // aten::_foreach_pow_.Scalar(Tensor(a!)[] self, Scalar exponent) -> ()
+    inline void _foreach_pow_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Scalar & exponent) {
+        return at::_ops::_foreach_pow__Scalar::redispatch(dispatchKeySet, self, exponent);
+    }
+    
+    // aten::_foreach_pow_.ScalarList(Tensor(a!)[] self, Scalar[] exponent) -> ()
+    inline void _foreach_pow_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::ArrayRef<at::Scalar> exponent) {
+        return at::_ops::_foreach_pow__ScalarList::redispatch(dispatchKeySet, self, exponent);
+    }
+    
+    // aten::_foreach_reciprocal(Tensor[] self) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_reciprocal(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_reciprocal::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_reciprocal_(Tensor(a!)[] self) -> ()
+    inline void _foreach_reciprocal_(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_reciprocal_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_round(Tensor[] self) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_round(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_round::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_round_(Tensor(a!)[] self) -> ()
+    inline void _foreach_round_(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_round_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_sigmoid(Tensor[] self) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_sigmoid(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_sigmoid::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_sigmoid_(Tensor(a!)[] self) -> ()
+    inline void _foreach_sigmoid_(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_sigmoid_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_sign(Tensor[] self) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_sign(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_sign::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_sign_(Tensor(a!)[] self) -> ()
+    inline void _foreach_sign_(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_sign_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_sin(Tensor[] self) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_sin(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_sin::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_sin_(Tensor(a!)[] self) -> ()
+    inline void _foreach_sin_(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_sin_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_sinh(Tensor[] self) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_sinh(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_sinh::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_sinh_(Tensor(a!)[] self) -> ()
+    inline void _foreach_sinh_(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_sinh_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_sqrt(Tensor[] self) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_sqrt(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_sqrt::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_sqrt_(Tensor(a!)[] self) -> ()
+    inline void _foreach_sqrt_(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_sqrt_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_tan(Tensor[] self) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_tan(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_tan::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_tan_(Tensor(a!)[] self) -> ()
+    inline void _foreach_tan_(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_tan_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_tanh(Tensor[] self) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_tanh(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_tanh::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_tanh_(Tensor(a!)[] self) -> ()
+    inline void _foreach_tanh_(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_tanh_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_trunc(Tensor[] self) -> Tensor[]
+    inline ::std::vector<at::Tensor> _foreach_trunc(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_trunc::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_trunc_(Tensor(a!)[] self) -> ()
+    inline void _foreach_trunc_(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_trunc_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_zero_(Tensor(a!)[] self) -> ()
+    inline void _foreach_zero_(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_zero_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_copy_(Tensor(a!)[] self, Tensor[] src, bool non_blocking=False) -> ()
+    inline void _foreach_copy_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList src, bool non_blocking=false) {
+        return at::_ops::_foreach_copy_::redispatch(dispatchKeySet, self, src, non_blocking);
+    }
+    
+    // aten::bucketize.Tensor(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor
+    inline at::Tensor bucketize(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & boundaries, bool out_int32=false, bool right=false) {
+        return at::_ops::bucketize_Tensor::redispatch(dispatchKeySet, self, boundaries, out_int32, right);
+    }
+    
+    // aten::bucketize.Tensor_out(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bucketize_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & boundaries, bool out_int32=false, bool right=false) {
+        return at::_ops::bucketize_Tensor_out::redispatch(dispatchKeySet, self, boundaries, out_int32, right, out);
+    }
+    
+    // aten::bucketize.Tensor_out(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bucketize_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & boundaries, bool out_int32, bool right, at::Tensor & out) {
+        return at::_ops::bucketize_Tensor_out::redispatch(dispatchKeySet, self, boundaries, out_int32, right, out);
+    }
+    
+    // aten::bucketize.Scalar(Scalar self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor
+    inline at::Tensor bucketize(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & boundaries, bool out_int32=false, bool right=false) {
+        return at::_ops::bucketize_Scalar::redispatch(dispatchKeySet, self, boundaries, out_int32, right);
+    }
+    
+    // aten::searchsorted.Tensor(Tensor sorted_sequence, Tensor self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None) -> Tensor
+    inline at::Tensor searchsorted(c10::DispatchKeySet dispatchKeySet, const at::Tensor & sorted_sequence, const at::Tensor & self, bool out_int32=false, bool right=false, c10::optional<c10::string_view> side=c10::nullopt, const c10::optional<at::Tensor> & sorter={}) {
+        return at::_ops::searchsorted_Tensor::redispatch(dispatchKeySet, sorted_sequence, self, out_int32, right, side, sorter);
+    }
+    
+    // aten::searchsorted.Tensor_out(Tensor sorted_sequence, Tensor self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & searchsorted_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & sorted_sequence, const at::Tensor & self, bool out_int32=false, bool right=false, c10::optional<c10::string_view> side=c10::nullopt, const c10::optional<at::Tensor> & sorter={}) {
+        return at::_ops::searchsorted_Tensor_out::redispatch(dispatchKeySet, sorted_sequence, self, out_int32, right, side, sorter, out);
+    }
+    
+    // aten::searchsorted.Tensor_out(Tensor sorted_sequence, Tensor self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & searchsorted_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & sorted_sequence, const at::Tensor & self, bool out_int32, bool right, c10::optional<c10::string_view> side, const c10::optional<at::Tensor> & sorter, at::Tensor & out) {
+        return at::_ops::searchsorted_Tensor_out::redispatch(dispatchKeySet, sorted_sequence, self, out_int32, right, side, sorter, out);
+    }
+    
+    // aten::searchsorted.Scalar(Tensor sorted_sequence, Scalar self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None) -> Tensor
+    inline at::Tensor searchsorted(c10::DispatchKeySet dispatchKeySet, const at::Tensor & sorted_sequence, const at::Scalar & self, bool out_int32=false, bool right=false, c10::optional<c10::string_view> side=c10::nullopt, const c10::optional<at::Tensor> & sorter={}) {
+        return at::_ops::searchsorted_Scalar::redispatch(dispatchKeySet, sorted_sequence, self, out_int32, right, side, sorter);
+    }
+    
+    // aten::searchsorted.Scalar_out(Tensor sorted_sequence, Scalar self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & searchsorted_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & sorted_sequence, const at::Scalar & self, bool out_int32=false, bool right=false, c10::optional<c10::string_view> side=c10::nullopt, const c10::optional<at::Tensor> & sorter={}) {
+        return at::_ops::searchsorted_Scalar_out::redispatch(dispatchKeySet, sorted_sequence, self, out_int32, right, side, sorter, out);
+    }
+    
+    // aten::searchsorted.Scalar_out(Tensor sorted_sequence, Scalar self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & searchsorted_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & sorted_sequence, const at::Scalar & self, bool out_int32, bool right, c10::optional<c10::string_view> side, const c10::optional<at::Tensor> & sorter, at::Tensor & out) {
+        return at::_ops::searchsorted_Scalar_out::redispatch(dispatchKeySet, sorted_sequence, self, out_int32, right, side, sorter, out);
+    }
+    
+    // aten::_convert_indices_from_coo_to_csr(Tensor self, int size, *, bool out_int32=False) -> Tensor
+    inline at::Tensor _convert_indices_from_coo_to_csr(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t size, bool out_int32=false) {
+        return at::_ops::_convert_indices_from_coo_to_csr::redispatch(dispatchKeySet, self, size, out_int32);
+    }
+    
+    // aten::_convert_indices_from_coo_to_csr.out(Tensor self, int size, *, bool out_int32=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _convert_indices_from_coo_to_csr_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t size, bool out_int32=false) {
+        return at::_ops::_convert_indices_from_coo_to_csr_out::redispatch(dispatchKeySet, self, size, out_int32, out);
+    }
+    
+    // aten::_convert_indices_from_coo_to_csr.out(Tensor self, int size, *, bool out_int32=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _convert_indices_from_coo_to_csr_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t size, bool out_int32, at::Tensor & out) {
+        return at::_ops::_convert_indices_from_coo_to_csr_out::redispatch(dispatchKeySet, self, size, out_int32, out);
+    }
+    
+    // aten::_convert_indices_from_csr_to_coo(Tensor crow_indices, Tensor col_indices, *, bool out_int32=False, bool transpose=False) -> Tensor
+    inline at::Tensor _convert_indices_from_csr_to_coo(c10::DispatchKeySet dispatchKeySet, const at::Tensor & crow_indices, const at::Tensor & col_indices, bool out_int32=false, bool transpose=false) {
+        return at::_ops::_convert_indices_from_csr_to_coo::redispatch(dispatchKeySet, crow_indices, col_indices, out_int32, transpose);
+    }
+    
+    // aten::_convert_indices_from_csr_to_coo.out(Tensor crow_indices, Tensor col_indices, *, bool out_int32=False, bool transpose=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _convert_indices_from_csr_to_coo_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & crow_indices, const at::Tensor & col_indices, bool out_int32=false, bool transpose=false) {
+        return at::_ops::_convert_indices_from_csr_to_coo_out::redispatch(dispatchKeySet, crow_indices, col_indices, out_int32, transpose, out);
+    }
+    
+    // aten::_convert_indices_from_csr_to_coo.out(Tensor crow_indices, Tensor col_indices, *, bool out_int32=False, bool transpose=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _convert_indices_from_csr_to_coo_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & crow_indices, const at::Tensor & col_indices, bool out_int32, bool transpose, at::Tensor & out) {
+        return at::_ops::_convert_indices_from_csr_to_coo_out::redispatch(dispatchKeySet, crow_indices, col_indices, out_int32, transpose, out);
+    }
+    
+    // aten::mse_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mse_loss_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & target, int64_t reduction=at::Reduction::Mean) {
+        return at::_ops::mse_loss_out::redispatch(dispatchKeySet, self, target, reduction, out);
+    }
+    
+    // aten::mse_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mse_loss_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, int64_t reduction, at::Tensor & out) {
+        return at::_ops::mse_loss_out::redispatch(dispatchKeySet, self, target, reduction, out);
+    }
+    
+    // aten::mse_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor
+    inline at::Tensor mse_loss(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, int64_t reduction=at::Reduction::Mean) {
+        return at::_ops::mse_loss::redispatch(dispatchKeySet, self, target, reduction);
+    }
+    
+    // aten::mse_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & mse_loss_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction) {
+        return at::_ops::mse_loss_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, target, reduction, grad_input);
+    }
+    
+    // aten::mse_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & mse_loss_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction, at::Tensor & grad_input) {
+        return at::_ops::mse_loss_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, target, reduction, grad_input);
+    }
+    
+    // aten::mse_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor
+    inline at::Tensor mse_loss_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction) {
+        return at::_ops::mse_loss_backward::redispatch(dispatchKeySet, grad_output, self, target, reduction);
+    }
+    
+    // aten::l1_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor
+    inline at::Tensor l1_loss(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, int64_t reduction=at::Reduction::Mean) {
+        return at::_ops::l1_loss::redispatch(dispatchKeySet, self, target, reduction);
+    }
+    
+    // aten::multi_margin_loss.out(Tensor self, Tensor target, Scalar p=1, Scalar margin=1, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & multi_margin_loss_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & target, const at::Scalar & p=1, const at::Scalar & margin=1, const c10::optional<at::Tensor> & weight={}, int64_t reduction=at::Reduction::Mean) {
+        return at::_ops::multi_margin_loss_out::redispatch(dispatchKeySet, self, target, p, margin, weight, reduction, out);
+    }
+    
+    // aten::multi_margin_loss.out(Tensor self, Tensor target, Scalar p=1, Scalar margin=1, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & multi_margin_loss_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const at::Scalar & p, const at::Scalar & margin, const c10::optional<at::Tensor> & weight, int64_t reduction, at::Tensor & out) {
+        return at::_ops::multi_margin_loss_out::redispatch(dispatchKeySet, self, target, p, margin, weight, reduction, out);
+    }
+    
+    // aten::multi_margin_loss(Tensor self, Tensor target, Scalar p=1, Scalar margin=1, Tensor? weight=None, int reduction=Mean) -> Tensor
+    inline at::Tensor multi_margin_loss(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const at::Scalar & p=1, const at::Scalar & margin=1, const c10::optional<at::Tensor> & weight={}, int64_t reduction=at::Reduction::Mean) {
+        return at::_ops::multi_margin_loss::redispatch(dispatchKeySet, self, target, p, margin, weight, reduction);
+    }
+    
+    // aten::multi_margin_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Scalar p, Scalar margin, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & multi_margin_loss_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const at::Scalar & p, const at::Scalar & margin, const c10::optional<at::Tensor> & weight={}, int64_t reduction=at::Reduction::Mean) {
+        return at::_ops::multi_margin_loss_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, target, p, margin, weight, reduction, grad_input);
+    }
+    
+    // aten::multi_margin_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Scalar p, Scalar margin, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & multi_margin_loss_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const at::Scalar & p, const at::Scalar & margin, const c10::optional<at::Tensor> & weight, int64_t reduction, at::Tensor & grad_input) {
+        return at::_ops::multi_margin_loss_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, target, p, margin, weight, reduction, grad_input);
+    }
+    
+    // aten::multi_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, Scalar p, Scalar margin, Tensor? weight=None, int reduction=Mean) -> Tensor
+    inline at::Tensor multi_margin_loss_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const at::Scalar & p, const at::Scalar & margin, const c10::optional<at::Tensor> & weight={}, int64_t reduction=at::Reduction::Mean) {
+        return at::_ops::multi_margin_loss_backward::redispatch(dispatchKeySet, grad_output, self, target, p, margin, weight, reduction);
+    }
+    
+    // aten::multilabel_margin_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & multilabel_margin_loss_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & target, int64_t reduction=at::Reduction::Mean) {
+        return at::_ops::multilabel_margin_loss_out::redispatch(dispatchKeySet, self, target, reduction, out);
+    }
+    
+    // aten::multilabel_margin_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & multilabel_margin_loss_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, int64_t reduction, at::Tensor & out) {
+        return at::_ops::multilabel_margin_loss_out::redispatch(dispatchKeySet, self, target, reduction, out);
+    }
+    
+    // aten::multilabel_margin_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor
+    inline at::Tensor multilabel_margin_loss(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, int64_t reduction=at::Reduction::Mean) {
+        return at::_ops::multilabel_margin_loss::redispatch(dispatchKeySet, self, target, reduction);
+    }
+    
+    // aten::multilabel_margin_loss_forward.output(Tensor self, Tensor target, int reduction, *, Tensor(a!) output, Tensor(b!) is_target) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> multilabel_margin_loss_forward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & output, at::Tensor & is_target, const at::Tensor & self, const at::Tensor & target, int64_t reduction) {
+        return at::_ops::multilabel_margin_loss_forward_output::redispatch(dispatchKeySet, self, target, reduction, output, is_target);
+    }
+    
+    // aten::multilabel_margin_loss_forward.output(Tensor self, Tensor target, int reduction, *, Tensor(a!) output, Tensor(b!) is_target) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> multilabel_margin_loss_forward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, int64_t reduction, at::Tensor & output, at::Tensor & is_target) {
+        return at::_ops::multilabel_margin_loss_forward_output::redispatch(dispatchKeySet, self, target, reduction, output, is_target);
+    }
+    
+    // aten::multilabel_margin_loss_forward(Tensor self, Tensor target, int reduction) -> (Tensor output, Tensor is_target)
+    inline ::std::tuple<at::Tensor,at::Tensor> multilabel_margin_loss_forward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, int64_t reduction) {
+        return at::_ops::multilabel_margin_loss_forward::redispatch(dispatchKeySet, self, target, reduction);
+    }
+    
+    // aten::multilabel_margin_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, Tensor is_target, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & multilabel_margin_loss_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction, const at::Tensor & is_target) {
+        return at::_ops::multilabel_margin_loss_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, target, reduction, is_target, grad_input);
+    }
+    
+    // aten::multilabel_margin_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, Tensor is_target, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & multilabel_margin_loss_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction, const at::Tensor & is_target, at::Tensor & grad_input) {
+        return at::_ops::multilabel_margin_loss_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, target, reduction, is_target, grad_input);
+    }
+    
+    // aten::multilabel_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, Tensor is_target) -> Tensor
+    inline at::Tensor multilabel_margin_loss_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction, const at::Tensor & is_target) {
+        return at::_ops::multilabel_margin_loss_backward::redispatch(dispatchKeySet, grad_output, self, target, reduction, is_target);
+    }
+    
+    // aten::nll_loss.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & nll_loss_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight={}, int64_t reduction=at::Reduction::Mean, int64_t ignore_index=-100) {
+        return at::_ops::nll_loss_out::redispatch(dispatchKeySet, self, target, weight, reduction, ignore_index, out);
+    }
+    
+    // aten::nll_loss.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & nll_loss_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, int64_t ignore_index, at::Tensor & out) {
+        return at::_ops::nll_loss_out::redispatch(dispatchKeySet, self, target, weight, reduction, ignore_index, out);
+    }
+    
+    // aten::nll_loss.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & nll_loss_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight={}, int64_t reduction=at::Reduction::Mean, c10::SymInt ignore_index=-100) {
+        return at::_ops::nll_loss_out::redispatch(dispatchKeySet, self, target, weight, reduction, ignore_index, out);
+    }
+    
+    // aten::nll_loss.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & nll_loss_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, c10::SymInt ignore_index, at::Tensor & out) {
+        return at::_ops::nll_loss_out::redispatch(dispatchKeySet, self, target, weight, reduction, ignore_index, out);
+    }
+    
+    // aten::nll_loss_nd(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100) -> Tensor
+    inline at::Tensor nll_loss_nd(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight={}, int64_t reduction=at::Reduction::Mean, int64_t ignore_index=-100) {
+        return at::_ops::nll_loss_nd::redispatch(dispatchKeySet, self, target, weight, reduction, ignore_index);
+    }
+    
+    // aten::nll_loss_nd(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100) -> Tensor
+    inline at::Tensor nll_loss_nd_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight={}, int64_t reduction=at::Reduction::Mean, c10::SymInt ignore_index=-100) {
+        return at::_ops::nll_loss_nd::redispatch(dispatchKeySet, self, target, weight, reduction, ignore_index);
+    }
+    
+    // aten::nll_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100) -> Tensor
+    inline at::Tensor nll_loss(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight={}, int64_t reduction=at::Reduction::Mean, int64_t ignore_index=-100) {
+        return at::_ops::nll_loss::redispatch(dispatchKeySet, self, target, weight, reduction, ignore_index);
+    }
+    
+    // aten::nll_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100) -> Tensor
+    inline at::Tensor nll_loss_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight={}, int64_t reduction=at::Reduction::Mean, c10::SymInt ignore_index=-100) {
+        return at::_ops::nll_loss::redispatch(dispatchKeySet, self, target, weight, reduction, ignore_index);
+    }
+    
+    // aten::nll_loss_forward.output(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, *, Tensor(a!) output, Tensor(b!) total_weight) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> nll_loss_forward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & output, at::Tensor & total_weight, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, int64_t ignore_index) {
+        return at::_ops::nll_loss_forward_output::redispatch(dispatchKeySet, self, target, weight, reduction, ignore_index, output, total_weight);
+    }
+    
+    // aten::nll_loss_forward.output(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, *, Tensor(a!) output, Tensor(b!) total_weight) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> nll_loss_forward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, int64_t ignore_index, at::Tensor & output, at::Tensor & total_weight) {
+        return at::_ops::nll_loss_forward_output::redispatch(dispatchKeySet, self, target, weight, reduction, ignore_index, output, total_weight);
+    }
+    
+    // aten::nll_loss_forward.output(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, *, Tensor(a!) output, Tensor(b!) total_weight) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> nll_loss_forward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & output, at::Tensor & total_weight, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, c10::SymInt ignore_index) {
+        return at::_ops::nll_loss_forward_output::redispatch(dispatchKeySet, self, target, weight, reduction, ignore_index, output, total_weight);
+    }
+    
+    // aten::nll_loss_forward.output(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, *, Tensor(a!) output, Tensor(b!) total_weight) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> nll_loss_forward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, c10::SymInt ignore_index, at::Tensor & output, at::Tensor & total_weight) {
+        return at::_ops::nll_loss_forward_output::redispatch(dispatchKeySet, self, target, weight, reduction, ignore_index, output, total_weight);
+    }
+    
+    // aten::nll_loss_forward(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index) -> (Tensor output, Tensor total_weight)
+    inline ::std::tuple<at::Tensor,at::Tensor> nll_loss_forward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, int64_t ignore_index) {
+        return at::_ops::nll_loss_forward::redispatch(dispatchKeySet, self, target, weight, reduction, ignore_index);
+    }
+    
+    // aten::nll_loss_forward(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index) -> (Tensor output, Tensor total_weight)
+    inline ::std::tuple<at::Tensor,at::Tensor> nll_loss_forward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, c10::SymInt ignore_index) {
+        return at::_ops::nll_loss_forward::redispatch(dispatchKeySet, self, target, weight, reduction, ignore_index);
+    }
+    
+    // aten::nll_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & nll_loss_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, int64_t ignore_index, const at::Tensor & total_weight) {
+        return at::_ops::nll_loss_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, target, weight, reduction, ignore_index, total_weight, grad_input);
+    }
+    
+    // aten::nll_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & nll_loss_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, int64_t ignore_index, const at::Tensor & total_weight, at::Tensor & grad_input) {
+        return at::_ops::nll_loss_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, target, weight, reduction, ignore_index, total_weight, grad_input);
+    }
+    
+    // aten::nll_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & nll_loss_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, c10::SymInt ignore_index, const at::Tensor & total_weight) {
+        return at::_ops::nll_loss_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, target, weight, reduction, ignore_index, total_weight, grad_input);
+    }
+    
+    // aten::nll_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & nll_loss_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, c10::SymInt ignore_index, const at::Tensor & total_weight, at::Tensor & grad_input) {
+        return at::_ops::nll_loss_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, target, weight, reduction, ignore_index, total_weight, grad_input);
+    }
+    
+    // aten::nll_loss_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight) -> Tensor
+    inline at::Tensor nll_loss_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, int64_t ignore_index, const at::Tensor & total_weight) {
+        return at::_ops::nll_loss_backward::redispatch(dispatchKeySet, grad_output, self, target, weight, reduction, ignore_index, total_weight);
+    }
+    
+    // aten::nll_loss_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight) -> Tensor
+    inline at::Tensor nll_loss_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, c10::SymInt ignore_index, const at::Tensor & total_weight) {
+        return at::_ops::nll_loss_backward::redispatch(dispatchKeySet, grad_output, self, target, weight, reduction, ignore_index, total_weight);
+    }
+    
+    // aten::nll_loss2d.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & nll_loss2d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight={}, int64_t reduction=at::Reduction::Mean, int64_t ignore_index=-100) {
+        return at::_ops::nll_loss2d_out::redispatch(dispatchKeySet, self, target, weight, reduction, ignore_index, out);
+    }
+    
+    // aten::nll_loss2d.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & nll_loss2d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, int64_t ignore_index, at::Tensor & out) {
+        return at::_ops::nll_loss2d_out::redispatch(dispatchKeySet, self, target, weight, reduction, ignore_index, out);
+    }
+    
+    // aten::nll_loss2d.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & nll_loss2d_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight={}, int64_t reduction=at::Reduction::Mean, c10::SymInt ignore_index=-100) {
+        return at::_ops::nll_loss2d_out::redispatch(dispatchKeySet, self, target, weight, reduction, ignore_index, out);
+    }
+    
+    // aten::nll_loss2d.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & nll_loss2d_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, c10::SymInt ignore_index, at::Tensor & out) {
+        return at::_ops::nll_loss2d_out::redispatch(dispatchKeySet, self, target, weight, reduction, ignore_index, out);
+    }
+    
+    // aten::nll_loss2d(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100) -> Tensor
+    inline at::Tensor nll_loss2d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight={}, int64_t reduction=at::Reduction::Mean, int64_t ignore_index=-100) {
+        return at::_ops::nll_loss2d::redispatch(dispatchKeySet, self, target, weight, reduction, ignore_index);
+    }
+    
+    // aten::nll_loss2d(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100) -> Tensor
+    inline at::Tensor nll_loss2d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight={}, int64_t reduction=at::Reduction::Mean, c10::SymInt ignore_index=-100) {
+        return at::_ops::nll_loss2d::redispatch(dispatchKeySet, self, target, weight, reduction, ignore_index);
+    }
+    
+    // aten::nll_loss2d_forward.output(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, *, Tensor(a!) output, Tensor(b!) total_weight) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> nll_loss2d_forward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & output, at::Tensor & total_weight, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, int64_t ignore_index) {
+        return at::_ops::nll_loss2d_forward_output::redispatch(dispatchKeySet, self, target, weight, reduction, ignore_index, output, total_weight);
+    }
+    
+    // aten::nll_loss2d_forward.output(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, *, Tensor(a!) output, Tensor(b!) total_weight) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> nll_loss2d_forward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, int64_t ignore_index, at::Tensor & output, at::Tensor & total_weight) {
+        return at::_ops::nll_loss2d_forward_output::redispatch(dispatchKeySet, self, target, weight, reduction, ignore_index, output, total_weight);
+    }
+    
+    // aten::nll_loss2d_forward.output(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, *, Tensor(a!) output, Tensor(b!) total_weight) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> nll_loss2d_forward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & output, at::Tensor & total_weight, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, c10::SymInt ignore_index) {
+        return at::_ops::nll_loss2d_forward_output::redispatch(dispatchKeySet, self, target, weight, reduction, ignore_index, output, total_weight);
+    }
+    
+    // aten::nll_loss2d_forward.output(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, *, Tensor(a!) output, Tensor(b!) total_weight) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> nll_loss2d_forward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, c10::SymInt ignore_index, at::Tensor & output, at::Tensor & total_weight) {
+        return at::_ops::nll_loss2d_forward_output::redispatch(dispatchKeySet, self, target, weight, reduction, ignore_index, output, total_weight);
+    }
+    
+    // aten::nll_loss2d_forward(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index) -> (Tensor output, Tensor total_weight)
+    inline ::std::tuple<at::Tensor,at::Tensor> nll_loss2d_forward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, int64_t ignore_index) {
+        return at::_ops::nll_loss2d_forward::redispatch(dispatchKeySet, self, target, weight, reduction, ignore_index);
+    }
+    
+    // aten::nll_loss2d_forward(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index) -> (Tensor output, Tensor total_weight)
+    inline ::std::tuple<at::Tensor,at::Tensor> nll_loss2d_forward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, c10::SymInt ignore_index) {
+        return at::_ops::nll_loss2d_forward::redispatch(dispatchKeySet, self, target, weight, reduction, ignore_index);
+    }
+    
+    // aten::nll_loss2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & nll_loss2d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, int64_t ignore_index, const at::Tensor & total_weight) {
+        return at::_ops::nll_loss2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, target, weight, reduction, ignore_index, total_weight, grad_input);
+    }
+    
+    // aten::nll_loss2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & nll_loss2d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, int64_t ignore_index, const at::Tensor & total_weight, at::Tensor & grad_input) {
+        return at::_ops::nll_loss2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, target, weight, reduction, ignore_index, total_weight, grad_input);
+    }
+    
+    // aten::nll_loss2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & nll_loss2d_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, c10::SymInt ignore_index, const at::Tensor & total_weight) {
+        return at::_ops::nll_loss2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, target, weight, reduction, ignore_index, total_weight, grad_input);
+    }
+    
+    // aten::nll_loss2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & nll_loss2d_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, c10::SymInt ignore_index, const at::Tensor & total_weight, at::Tensor & grad_input) {
+        return at::_ops::nll_loss2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, target, weight, reduction, ignore_index, total_weight, grad_input);
+    }
+    
+    // aten::nll_loss2d_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight) -> Tensor
+    inline at::Tensor nll_loss2d_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, int64_t ignore_index, const at::Tensor & total_weight) {
+        return at::_ops::nll_loss2d_backward::redispatch(dispatchKeySet, grad_output, self, target, weight, reduction, ignore_index, total_weight);
+    }
+    
+    // aten::nll_loss2d_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight) -> Tensor
+    inline at::Tensor nll_loss2d_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, c10::SymInt ignore_index, const at::Tensor & total_weight) {
+        return at::_ops::nll_loss2d_backward::redispatch(dispatchKeySet, grad_output, self, target, weight, reduction, ignore_index, total_weight);
+    }
+    
+    // aten::smooth_l1_loss.out(Tensor self, Tensor target, int reduction=Mean, float beta=1.0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & smooth_l1_loss_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & target, int64_t reduction=at::Reduction::Mean, double beta=1.0) {
+        return at::_ops::smooth_l1_loss_out::redispatch(dispatchKeySet, self, target, reduction, beta, out);
+    }
+    
+    // aten::smooth_l1_loss.out(Tensor self, Tensor target, int reduction=Mean, float beta=1.0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & smooth_l1_loss_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, int64_t reduction, double beta, at::Tensor & out) {
+        return at::_ops::smooth_l1_loss_out::redispatch(dispatchKeySet, self, target, reduction, beta, out);
+    }
+    
+    // aten::smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean, float beta=1.0) -> Tensor
+    inline at::Tensor smooth_l1_loss(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, int64_t reduction=at::Reduction::Mean, double beta=1.0) {
+        return at::_ops::smooth_l1_loss::redispatch(dispatchKeySet, self, target, reduction, beta);
+    }
+    
+    // aten::smooth_l1_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & smooth_l1_loss_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction, double beta) {
+        return at::_ops::smooth_l1_loss_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, target, reduction, beta, grad_input);
+    }
+    
+    // aten::smooth_l1_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & smooth_l1_loss_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction, double beta, at::Tensor & grad_input) {
+        return at::_ops::smooth_l1_loss_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, target, reduction, beta, grad_input);
+    }
+    
+    // aten::smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta) -> Tensor
+    inline at::Tensor smooth_l1_loss_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction, double beta) {
+        return at::_ops::smooth_l1_loss_backward::redispatch(dispatchKeySet, grad_output, self, target, reduction, beta);
+    }
+    
+    // aten::huber_loss.out(Tensor self, Tensor target, int reduction=Mean, float delta=1.0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & huber_loss_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & target, int64_t reduction=at::Reduction::Mean, double delta=1.0) {
+        return at::_ops::huber_loss_out::redispatch(dispatchKeySet, self, target, reduction, delta, out);
+    }
+    
+    // aten::huber_loss.out(Tensor self, Tensor target, int reduction=Mean, float delta=1.0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & huber_loss_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, int64_t reduction, double delta, at::Tensor & out) {
+        return at::_ops::huber_loss_out::redispatch(dispatchKeySet, self, target, reduction, delta, out);
+    }
+    
+    // aten::huber_loss(Tensor self, Tensor target, int reduction=Mean, float delta=1.0) -> Tensor
+    inline at::Tensor huber_loss(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, int64_t reduction=at::Reduction::Mean, double delta=1.0) {
+        return at::_ops::huber_loss::redispatch(dispatchKeySet, self, target, reduction, delta);
+    }
+    
+    // aten::huber_loss_backward.out(Tensor grad_output, Tensor self, Tensor target, int reduction, float delta, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & huber_loss_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction, double delta) {
+        return at::_ops::huber_loss_backward_out::redispatch(dispatchKeySet, grad_output, self, target, reduction, delta, grad_input);
+    }
+    
+    // aten::huber_loss_backward.out(Tensor grad_output, Tensor self, Tensor target, int reduction, float delta, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & huber_loss_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction, double delta, at::Tensor & grad_input) {
+        return at::_ops::huber_loss_backward_out::redispatch(dispatchKeySet, grad_output, self, target, reduction, delta, grad_input);
+    }
+    
+    // aten::huber_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float delta) -> Tensor
+    inline at::Tensor huber_loss_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction, double delta) {
+        return at::_ops::huber_loss_backward::redispatch(dispatchKeySet, grad_output, self, target, reduction, delta);
+    }
+    
+    // aten::soft_margin_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & soft_margin_loss_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & target, int64_t reduction=at::Reduction::Mean) {
+        return at::_ops::soft_margin_loss_out::redispatch(dispatchKeySet, self, target, reduction, out);
+    }
+    
+    // aten::soft_margin_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & soft_margin_loss_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, int64_t reduction, at::Tensor & out) {
+        return at::_ops::soft_margin_loss_out::redispatch(dispatchKeySet, self, target, reduction, out);
+    }
+    
+    // aten::soft_margin_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor
+    inline at::Tensor soft_margin_loss(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, int64_t reduction=at::Reduction::Mean) {
+        return at::_ops::soft_margin_loss::redispatch(dispatchKeySet, self, target, reduction);
+    }
+    
+    // aten::soft_margin_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & soft_margin_loss_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction) {
+        return at::_ops::soft_margin_loss_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, target, reduction, grad_input);
+    }
+    
+    // aten::soft_margin_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & soft_margin_loss_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction, at::Tensor & grad_input) {
+        return at::_ops::soft_margin_loss_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, target, reduction, grad_input);
+    }
+    
+    // aten::soft_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor
+    inline at::Tensor soft_margin_loss_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction) {
+        return at::_ops::soft_margin_loss_backward::redispatch(dispatchKeySet, grad_output, self, target, reduction);
+    }
+    
+    // aten::elu.out(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & elu_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & alpha=1, const at::Scalar & scale=1, const at::Scalar & input_scale=1) {
+        return at::_ops::elu_out::redispatch(dispatchKeySet, self, alpha, scale, input_scale, out);
+    }
+    
+    // aten::elu.out(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & elu_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & alpha, const at::Scalar & scale, const at::Scalar & input_scale, at::Tensor & out) {
+        return at::_ops::elu_out::redispatch(dispatchKeySet, self, alpha, scale, input_scale, out);
+    }
+    
+    // aten::elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor
+    inline at::Tensor elu(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & alpha=1, const at::Scalar & scale=1, const at::Scalar & input_scale=1) {
+        return at::_ops::elu::redispatch(dispatchKeySet, self, alpha, scale, input_scale);
+    }
+    
+    // aten::elu_backward.grad_input(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, bool is_result, Tensor self_or_result, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & elu_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Scalar & alpha, const at::Scalar & scale, const at::Scalar & input_scale, bool is_result, const at::Tensor & self_or_result) {
+        return at::_ops::elu_backward_grad_input::redispatch(dispatchKeySet, grad_output, alpha, scale, input_scale, is_result, self_or_result, grad_input);
+    }
+    
+    // aten::elu_backward.grad_input(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, bool is_result, Tensor self_or_result, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & elu_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Scalar & alpha, const at::Scalar & scale, const at::Scalar & input_scale, bool is_result, const at::Tensor & self_or_result, at::Tensor & grad_input) {
+        return at::_ops::elu_backward_grad_input::redispatch(dispatchKeySet, grad_output, alpha, scale, input_scale, is_result, self_or_result, grad_input);
+    }
+    
+    // aten::elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, bool is_result, Tensor self_or_result) -> Tensor
+    inline at::Tensor elu_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Scalar & alpha, const at::Scalar & scale, const at::Scalar & input_scale, bool is_result, const at::Tensor & self_or_result) {
+        return at::_ops::elu_backward::redispatch(dispatchKeySet, grad_output, alpha, scale, input_scale, is_result, self_or_result);
+    }
+    
+    // aten::elu_(Tensor(a!) self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor(a!)
+    inline at::Tensor & elu_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & alpha=1, const at::Scalar & scale=1, const at::Scalar & input_scale=1) {
+        return at::_ops::elu_::redispatch(dispatchKeySet, self, alpha, scale, input_scale);
+    }
+    
+    // aten::glu.out(Tensor self, int dim=-1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & glu_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim=-1) {
+        return at::_ops::glu_out::redispatch(dispatchKeySet, self, dim, out);
+    }
+    
+    // aten::glu.out(Tensor self, int dim=-1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & glu_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, at::Tensor & out) {
+        return at::_ops::glu_out::redispatch(dispatchKeySet, self, dim, out);
+    }
+    
+    // aten::glu(Tensor self, int dim=-1) -> Tensor
+    inline at::Tensor glu(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim=-1) {
+        return at::_ops::glu::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::glu_backward.grad_input(Tensor grad_output, Tensor self, int dim, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & glu_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, int64_t dim) {
+        return at::_ops::glu_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, dim, grad_input);
+    }
+    
+    // aten::glu_backward.grad_input(Tensor grad_output, Tensor self, int dim, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & glu_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, int64_t dim, at::Tensor & grad_input) {
+        return at::_ops::glu_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, dim, grad_input);
+    }
+    
+    // aten::glu_backward(Tensor grad_output, Tensor self, int dim) -> Tensor
+    inline at::Tensor glu_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, int64_t dim) {
+        return at::_ops::glu_backward::redispatch(dispatchKeySet, grad_output, self, dim);
+    }
+    
+    // aten::glu_jvp(Tensor glu, Tensor x, Tensor dx, int dim) -> Tensor
+    inline at::Tensor glu_jvp(c10::DispatchKeySet dispatchKeySet, const at::Tensor & glu, const at::Tensor & x, const at::Tensor & dx, int64_t dim) {
+        return at::_ops::glu_jvp::redispatch(dispatchKeySet, glu, x, dx, dim);
+    }
+    
+    // aten::glu_backward_jvp(Tensor grad_x, Tensor grad_glu, Tensor x, Tensor dgrad_glu, Tensor dx, int dim) -> Tensor
+    inline at::Tensor glu_backward_jvp(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_x, const at::Tensor & grad_glu, const at::Tensor & x, const at::Tensor & dgrad_glu, const at::Tensor & dx, int64_t dim) {
+        return at::_ops::glu_backward_jvp::redispatch(dispatchKeySet, grad_x, grad_glu, x, dgrad_glu, dx, dim);
+    }
+    
+    // aten::hardsigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & hardsigmoid_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::hardsigmoid_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::hardsigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & hardsigmoid_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::hardsigmoid_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::hardsigmoid(Tensor self) -> Tensor
+    inline at::Tensor hardsigmoid(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::hardsigmoid::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::hardsigmoid_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & hardsigmoid_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::hardsigmoid_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::hardsigmoid_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & hardsigmoid_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self) {
+        return at::_ops::hardsigmoid_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, grad_input);
+    }
+    
+    // aten::hardsigmoid_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & hardsigmoid_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & grad_input) {
+        return at::_ops::hardsigmoid_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, grad_input);
+    }
+    
+    // aten::hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor
+    inline at::Tensor hardsigmoid_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self) {
+        return at::_ops::hardsigmoid_backward::redispatch(dispatchKeySet, grad_output, self);
+    }
+    
+    // aten::hardtanh.out(Tensor self, Scalar min_val=-1, Scalar max_val=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & hardtanh_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & min_val=-1, const at::Scalar & max_val=1) {
+        return at::_ops::hardtanh_out::redispatch(dispatchKeySet, self, min_val, max_val, out);
+    }
+    
+    // aten::hardtanh.out(Tensor self, Scalar min_val=-1, Scalar max_val=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & hardtanh_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & min_val, const at::Scalar & max_val, at::Tensor & out) {
+        return at::_ops::hardtanh_out::redispatch(dispatchKeySet, self, min_val, max_val, out);
+    }
+    
+    // aten::hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> Tensor
+    inline at::Tensor hardtanh(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & min_val=-1, const at::Scalar & max_val=1) {
+        return at::_ops::hardtanh::redispatch(dispatchKeySet, self, min_val, max_val);
+    }
+    
+    // aten::hardtanh_backward.grad_input(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & hardtanh_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & min_val, const at::Scalar & max_val) {
+        return at::_ops::hardtanh_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, min_val, max_val, grad_input);
+    }
+    
+    // aten::hardtanh_backward.grad_input(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & hardtanh_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & min_val, const at::Scalar & max_val, at::Tensor & grad_input) {
+        return at::_ops::hardtanh_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, min_val, max_val, grad_input);
+    }
+    
+    // aten::hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor
+    inline at::Tensor hardtanh_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & min_val, const at::Scalar & max_val) {
+        return at::_ops::hardtanh_backward::redispatch(dispatchKeySet, grad_output, self, min_val, max_val);
+    }
+    
+    // aten::hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!)
+    inline at::Tensor & hardtanh_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & min_val=-1, const at::Scalar & max_val=1) {
+        return at::_ops::hardtanh_::redispatch(dispatchKeySet, self, min_val, max_val);
+    }
+    
+    // aten::hardswish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & hardswish_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::hardswish_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::hardswish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & hardswish_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::hardswish_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::hardswish(Tensor self) -> Tensor
+    inline at::Tensor hardswish(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::hardswish::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::hardswish_(Tensor(a!) self) -> Tensor(a!)
+    inline at::Tensor & hardswish_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self) {
+        return at::_ops::hardswish_::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::hardswish_backward(Tensor grad_output, Tensor self) -> Tensor
+    inline at::Tensor hardswish_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self) {
+        return at::_ops::hardswish_backward::redispatch(dispatchKeySet, grad_output, self);
+    }
+    
+    // aten::leaky_relu.out(Tensor self, Scalar negative_slope=0.01, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & leaky_relu_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & negative_slope=0.01) {
+        return at::_ops::leaky_relu_out::redispatch(dispatchKeySet, self, negative_slope, out);
+    }
+    
+    // aten::leaky_relu.out(Tensor self, Scalar negative_slope=0.01, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & leaky_relu_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & negative_slope, at::Tensor & out) {
+        return at::_ops::leaky_relu_out::redispatch(dispatchKeySet, self, negative_slope, out);
+    }
+    
+    // aten::leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor
+    inline at::Tensor leaky_relu(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & negative_slope=0.01) {
+        return at::_ops::leaky_relu::redispatch(dispatchKeySet, self, negative_slope);
+    }
+    
+    // aten::leaky_relu_backward.grad_input(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & leaky_relu_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & negative_slope, bool self_is_result) {
+        return at::_ops::leaky_relu_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, negative_slope, self_is_result, grad_input);
+    }
+    
+    // aten::leaky_relu_backward.grad_input(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & leaky_relu_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & negative_slope, bool self_is_result, at::Tensor & grad_input) {
+        return at::_ops::leaky_relu_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, negative_slope, self_is_result, grad_input);
+    }
+    
+    // aten::leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor
+    inline at::Tensor leaky_relu_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & negative_slope, bool self_is_result) {
+        return at::_ops::leaky_relu_backward::redispatch(dispatchKeySet, grad_output, self, negative_slope, self_is_result);
+    }
+    
+    // aten::leaky_relu_(Tensor(a!) self, Scalar negative_slope=0.01) -> Tensor(a!)
+    inline at::Tensor & leaky_relu_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & negative_slope=0.01) {
+        return at::_ops::leaky_relu_::redispatch(dispatchKeySet, self, negative_slope);
+    }
+    
+    // aten::log_sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & log_sigmoid_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::log_sigmoid_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::log_sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & log_sigmoid_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::log_sigmoid_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::log_sigmoid(Tensor self) -> Tensor
+    inline at::Tensor log_sigmoid(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::log_sigmoid::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::log_sigmoid_forward.output(Tensor self, *, Tensor(a!) output, Tensor(b!) buffer) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> log_sigmoid_forward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & output, at::Tensor & buffer, const at::Tensor & self) {
+        return at::_ops::log_sigmoid_forward_output::redispatch(dispatchKeySet, self, output, buffer);
+    }
+    
+    // aten::log_sigmoid_forward.output(Tensor self, *, Tensor(a!) output, Tensor(b!) buffer) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> log_sigmoid_forward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & output, at::Tensor & buffer) {
+        return at::_ops::log_sigmoid_forward_output::redispatch(dispatchKeySet, self, output, buffer);
+    }
+    
+    // aten::log_sigmoid_forward(Tensor self) -> (Tensor output, Tensor buffer)
+    inline ::std::tuple<at::Tensor,at::Tensor> log_sigmoid_forward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::log_sigmoid_forward::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::log_sigmoid_backward.grad_input(Tensor grad_output, Tensor self, Tensor buffer, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & log_sigmoid_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & buffer) {
+        return at::_ops::log_sigmoid_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, buffer, grad_input);
+    }
+    
+    // aten::log_sigmoid_backward.grad_input(Tensor grad_output, Tensor self, Tensor buffer, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & log_sigmoid_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & buffer, at::Tensor & grad_input) {
+        return at::_ops::log_sigmoid_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, buffer, grad_input);
+    }
+    
+    // aten::log_sigmoid_backward(Tensor grad_output, Tensor self, Tensor buffer) -> Tensor
+    inline at::Tensor log_sigmoid_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & buffer) {
+        return at::_ops::log_sigmoid_backward::redispatch(dispatchKeySet, grad_output, self, buffer);
+    }
+    
+    // aten::rrelu_with_noise.out(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & rrelu_with_noise_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & noise, const at::Scalar & lower=0.125, const at::Scalar & upper=0.3333333333333333, bool training=false, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::rrelu_with_noise_out::redispatch(dispatchKeySet, self, noise, lower, upper, training, generator, out);
+    }
+    
+    // aten::rrelu_with_noise.out(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & rrelu_with_noise_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & noise, const at::Scalar & lower, const at::Scalar & upper, bool training, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::rrelu_with_noise_out::redispatch(dispatchKeySet, self, noise, lower, upper, training, generator, out);
+    }
+    
+    // aten::rrelu_with_noise(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor
+    inline at::Tensor rrelu_with_noise(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & noise, const at::Scalar & lower=0.125, const at::Scalar & upper=0.3333333333333333, bool training=false, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::rrelu_with_noise::redispatch(dispatchKeySet, self, noise, lower, upper, training, generator);
+    }
+    
+    // aten::rrelu_with_noise_backward(Tensor grad_output, Tensor self, Tensor noise, Scalar lower, Scalar upper, bool training, bool self_is_result) -> Tensor
+    inline at::Tensor rrelu_with_noise_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & noise, const at::Scalar & lower, const at::Scalar & upper, bool training, bool self_is_result) {
+        return at::_ops::rrelu_with_noise_backward::redispatch(dispatchKeySet, grad_output, self, noise, lower, upper, training, self_is_result);
+    }
+    
+    // aten::rrelu_with_noise_(Tensor(a!) self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!)
+    inline at::Tensor & rrelu_with_noise_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & noise, const at::Scalar & lower=0.125, const at::Scalar & upper=0.3333333333333333, bool training=false, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::rrelu_with_noise_::redispatch(dispatchKeySet, self, noise, lower, upper, training, generator);
+    }
+    
+    // aten::softplus.out(Tensor self, Scalar beta=1, Scalar threshold=20, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & softplus_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & beta=1, const at::Scalar & threshold=20) {
+        return at::_ops::softplus_out::redispatch(dispatchKeySet, self, beta, threshold, out);
+    }
+    
+    // aten::softplus.out(Tensor self, Scalar beta=1, Scalar threshold=20, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & softplus_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold, at::Tensor & out) {
+        return at::_ops::softplus_out::redispatch(dispatchKeySet, self, beta, threshold, out);
+    }
+    
+    // aten::softplus(Tensor self, Scalar beta=1, Scalar threshold=20) -> Tensor
+    inline at::Tensor softplus(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & beta=1, const at::Scalar & threshold=20) {
+        return at::_ops::softplus::redispatch(dispatchKeySet, self, beta, threshold);
+    }
+    
+    // aten::softplus_backward.grad_input(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & softplus_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold) {
+        return at::_ops::softplus_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, beta, threshold, grad_input);
+    }
+    
+    // aten::softplus_backward.grad_input(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & softplus_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold, at::Tensor & grad_input) {
+        return at::_ops::softplus_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, beta, threshold, grad_input);
+    }
+    
+    // aten::softplus_backward(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold) -> Tensor
+    inline at::Tensor softplus_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold) {
+        return at::_ops::softplus_backward::redispatch(dispatchKeySet, grad_output, self, beta, threshold);
+    }
+    
+    // aten::softshrink.out(Tensor self, Scalar lambd=0.5, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & softshrink_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & lambd=0.5) {
+        return at::_ops::softshrink_out::redispatch(dispatchKeySet, self, lambd, out);
+    }
+    
+    // aten::softshrink.out(Tensor self, Scalar lambd=0.5, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & softshrink_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & lambd, at::Tensor & out) {
+        return at::_ops::softshrink_out::redispatch(dispatchKeySet, self, lambd, out);
+    }
+    
+    // aten::softshrink(Tensor self, Scalar lambd=0.5) -> Tensor
+    inline at::Tensor softshrink(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & lambd=0.5) {
+        return at::_ops::softshrink::redispatch(dispatchKeySet, self, lambd);
+    }
+    
+    // aten::softshrink_backward.grad_input(Tensor grad_output, Tensor self, Scalar lambd, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & softshrink_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & lambd) {
+        return at::_ops::softshrink_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, lambd, grad_input);
+    }
+    
+    // aten::softshrink_backward.grad_input(Tensor grad_output, Tensor self, Scalar lambd, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & softshrink_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & lambd, at::Tensor & grad_input) {
+        return at::_ops::softshrink_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, lambd, grad_input);
+    }
+    
+    // aten::softshrink_backward(Tensor grad_output, Tensor self, Scalar lambd) -> Tensor
+    inline at::Tensor softshrink_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & lambd) {
+        return at::_ops::softshrink_backward::redispatch(dispatchKeySet, grad_output, self, lambd);
+    }
+    
+    // aten::adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & adaptive_avg_pool2d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size) {
+        return at::_ops::adaptive_avg_pool2d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), out);
+    }
+    
+    // aten::adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & adaptive_avg_pool2d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out) {
+        return at::_ops::adaptive_avg_pool2d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), out);
+    }
+    
+    // aten::adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & adaptive_avg_pool2d_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size) {
+        return at::_ops::adaptive_avg_pool2d_out::redispatch(dispatchKeySet, self, output_size, out);
+    }
+    
+    // aten::adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & adaptive_avg_pool2d_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, at::Tensor & out) {
+        return at::_ops::adaptive_avg_pool2d_out::redispatch(dispatchKeySet, self, output_size, out);
+    }
+    
+    // aten::adaptive_avg_pool2d(Tensor self, SymInt[2] output_size) -> Tensor
+    inline at::Tensor adaptive_avg_pool2d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size) {
+        return at::_ops::adaptive_avg_pool2d::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size));
+    }
+    
+    // aten::adaptive_avg_pool2d(Tensor self, SymInt[2] output_size) -> Tensor
+    inline at::Tensor adaptive_avg_pool2d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size) {
+        return at::_ops::adaptive_avg_pool2d::redispatch(dispatchKeySet, self, output_size);
+    }
+    
+    // aten::mkldnn_adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor
+    inline at::Tensor mkldnn_adaptive_avg_pool2d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size) {
+        return at::_ops::mkldnn_adaptive_avg_pool2d::redispatch(dispatchKeySet, self, output_size);
+    }
+    
+    // aten::mkldnn_adaptive_avg_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mkldnn_adaptive_avg_pool2d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size) {
+        return at::_ops::mkldnn_adaptive_avg_pool2d_out::redispatch(dispatchKeySet, self, output_size, out);
+    }
+    
+    // aten::mkldnn_adaptive_avg_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mkldnn_adaptive_avg_pool2d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out) {
+        return at::_ops::mkldnn_adaptive_avg_pool2d_out::redispatch(dispatchKeySet, self, output_size, out);
+    }
+    
+    // aten::mkldnn_adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor
+    inline at::Tensor mkldnn_adaptive_avg_pool2d_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self) {
+        return at::_ops::mkldnn_adaptive_avg_pool2d_backward::redispatch(dispatchKeySet, grad_output, self);
+    }
+    
+    // aten::_adaptive_avg_pool2d(Tensor self, SymInt[2] output_size) -> Tensor
+    inline at::Tensor _adaptive_avg_pool2d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size) {
+        return at::_ops::_adaptive_avg_pool2d::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size));
+    }
+    
+    // aten::_adaptive_avg_pool2d(Tensor self, SymInt[2] output_size) -> Tensor
+    inline at::Tensor _adaptive_avg_pool2d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size) {
+        return at::_ops::_adaptive_avg_pool2d::redispatch(dispatchKeySet, self, output_size);
+    }
+    
+    // aten::_adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor
+    inline at::Tensor _adaptive_avg_pool2d_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self) {
+        return at::_ops::_adaptive_avg_pool2d_backward::redispatch(dispatchKeySet, grad_output, self);
+    }
+    
+    // aten::adaptive_avg_pool3d.out(Tensor self, SymInt[3] output_size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & adaptive_avg_pool3d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size) {
+        return at::_ops::adaptive_avg_pool3d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), out);
+    }
+    
+    // aten::adaptive_avg_pool3d.out(Tensor self, SymInt[3] output_size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & adaptive_avg_pool3d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out) {
+        return at::_ops::adaptive_avg_pool3d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), out);
+    }
+    
+    // aten::adaptive_avg_pool3d.out(Tensor self, SymInt[3] output_size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & adaptive_avg_pool3d_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size) {
+        return at::_ops::adaptive_avg_pool3d_out::redispatch(dispatchKeySet, self, output_size, out);
+    }
+    
+    // aten::adaptive_avg_pool3d.out(Tensor self, SymInt[3] output_size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & adaptive_avg_pool3d_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, at::Tensor & out) {
+        return at::_ops::adaptive_avg_pool3d_out::redispatch(dispatchKeySet, self, output_size, out);
+    }
+    
+    // aten::adaptive_avg_pool3d(Tensor self, SymInt[3] output_size) -> Tensor
+    inline at::Tensor adaptive_avg_pool3d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size) {
+        return at::_ops::adaptive_avg_pool3d::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size));
+    }
+    
+    // aten::adaptive_avg_pool3d(Tensor self, SymInt[3] output_size) -> Tensor
+    inline at::Tensor adaptive_avg_pool3d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size) {
+        return at::_ops::adaptive_avg_pool3d::redispatch(dispatchKeySet, self, output_size);
+    }
+    
+    // aten::_adaptive_avg_pool3d(Tensor self, SymInt[3] output_size) -> Tensor
+    inline at::Tensor _adaptive_avg_pool3d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size) {
+        return at::_ops::_adaptive_avg_pool3d::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size));
+    }
+    
+    // aten::_adaptive_avg_pool3d(Tensor self, SymInt[3] output_size) -> Tensor
+    inline at::Tensor _adaptive_avg_pool3d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size) {
+        return at::_ops::_adaptive_avg_pool3d::redispatch(dispatchKeySet, self, output_size);
+    }
+    
+    // aten::adaptive_avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & adaptive_avg_pool3d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self) {
+        return at::_ops::adaptive_avg_pool3d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, grad_input);
+    }
+    
+    // aten::adaptive_avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & adaptive_avg_pool3d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & grad_input) {
+        return at::_ops::adaptive_avg_pool3d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, grad_input);
+    }
+    
+    // aten::_adaptive_avg_pool3d_backward(Tensor grad_output, Tensor self) -> Tensor
+    inline at::Tensor _adaptive_avg_pool3d_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self) {
+        return at::_ops::_adaptive_avg_pool3d_backward::redispatch(dispatchKeySet, grad_output, self);
+    }
+    
+    // aten::adaptive_max_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> adaptive_max_pool2d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::Tensor & indices, const at::Tensor & self, at::IntArrayRef output_size) {
+        return at::_ops::adaptive_max_pool2d_out::redispatch(dispatchKeySet, self, output_size, out, indices);
+    }
+    
+    // aten::adaptive_max_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> adaptive_max_pool2d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out, at::Tensor & indices) {
+        return at::_ops::adaptive_max_pool2d_out::redispatch(dispatchKeySet, self, output_size, out, indices);
+    }
+    
+    // aten::adaptive_max_pool2d(Tensor self, int[2] output_size) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> adaptive_max_pool2d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size) {
+        return at::_ops::adaptive_max_pool2d::redispatch(dispatchKeySet, self, output_size);
+    }
+    
+    // aten::adaptive_max_pool2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & adaptive_max_pool2d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices) {
+        return at::_ops::adaptive_max_pool2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, indices, grad_input);
+    }
+    
+    // aten::adaptive_max_pool2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & adaptive_max_pool2d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices, at::Tensor & grad_input) {
+        return at::_ops::adaptive_max_pool2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, indices, grad_input);
+    }
+    
+    // aten::adaptive_max_pool2d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor
+    inline at::Tensor adaptive_max_pool2d_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices) {
+        return at::_ops::adaptive_max_pool2d_backward::redispatch(dispatchKeySet, grad_output, self, indices);
+    }
+    
+    // aten::adaptive_max_pool3d.out(Tensor self, int[3] output_size, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> adaptive_max_pool3d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::Tensor & indices, const at::Tensor & self, at::IntArrayRef output_size) {
+        return at::_ops::adaptive_max_pool3d_out::redispatch(dispatchKeySet, self, output_size, out, indices);
+    }
+    
+    // aten::adaptive_max_pool3d.out(Tensor self, int[3] output_size, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> adaptive_max_pool3d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out, at::Tensor & indices) {
+        return at::_ops::adaptive_max_pool3d_out::redispatch(dispatchKeySet, self, output_size, out, indices);
+    }
+    
+    // aten::adaptive_max_pool3d(Tensor self, int[3] output_size) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> adaptive_max_pool3d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size) {
+        return at::_ops::adaptive_max_pool3d::redispatch(dispatchKeySet, self, output_size);
+    }
+    
+    // aten::adaptive_max_pool3d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & adaptive_max_pool3d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices) {
+        return at::_ops::adaptive_max_pool3d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, indices, grad_input);
+    }
+    
+    // aten::adaptive_max_pool3d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & adaptive_max_pool3d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices, at::Tensor & grad_input) {
+        return at::_ops::adaptive_max_pool3d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, indices, grad_input);
+    }
+    
+    // aten::adaptive_max_pool3d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor
+    inline at::Tensor adaptive_max_pool3d_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices) {
+        return at::_ops::adaptive_max_pool3d_backward::redispatch(dispatchKeySet, grad_output, self, indices);
+    }
+    
+    // aten::avg_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & avg_pool2d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, bool ceil_mode=false, bool count_include_pad=true, c10::optional<int64_t> divisor_override=c10::nullopt) {
+        return at::_ops::avg_pool2d_out::redispatch(dispatchKeySet, self, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override, out);
+    }
+    
+    // aten::avg_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & avg_pool2d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, c10::optional<int64_t> divisor_override, at::Tensor & out) {
+        return at::_ops::avg_pool2d_out::redispatch(dispatchKeySet, self, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override, out);
+    }
+    
+    // aten::avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor
+    inline at::Tensor avg_pool2d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, bool ceil_mode=false, bool count_include_pad=true, c10::optional<int64_t> divisor_override=c10::nullopt) {
+        return at::_ops::avg_pool2d::redispatch(dispatchKeySet, self, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override);
+    }
+    
+    // aten::avg_pool2d_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & avg_pool2d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, c10::optional<int64_t> divisor_override) {
+        return at::_ops::avg_pool2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override, grad_input);
+    }
+    
+    // aten::avg_pool2d_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & avg_pool2d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, c10::optional<int64_t> divisor_override, at::Tensor & grad_input) {
+        return at::_ops::avg_pool2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override, grad_input);
+    }
+    
+    // aten::avg_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor
+    inline at::Tensor avg_pool2d_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, c10::optional<int64_t> divisor_override) {
+        return at::_ops::avg_pool2d_backward::redispatch(dispatchKeySet, grad_output, self, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override);
+    }
+    
+    // aten::avg_pool3d.out(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & avg_pool3d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, bool ceil_mode=false, bool count_include_pad=true, c10::optional<int64_t> divisor_override=c10::nullopt) {
+        return at::_ops::avg_pool3d_out::redispatch(dispatchKeySet, self, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override, out);
+    }
+    
+    // aten::avg_pool3d.out(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & avg_pool3d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, c10::optional<int64_t> divisor_override, at::Tensor & out) {
+        return at::_ops::avg_pool3d_out::redispatch(dispatchKeySet, self, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override, out);
+    }
+    
+    // aten::avg_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor
+    inline at::Tensor avg_pool3d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, bool ceil_mode=false, bool count_include_pad=true, c10::optional<int64_t> divisor_override=c10::nullopt) {
+        return at::_ops::avg_pool3d::redispatch(dispatchKeySet, self, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override);
+    }
+    
+    // aten::avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, bool ceil_mode, bool count_include_pad, int? divisor_override, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & avg_pool3d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, c10::optional<int64_t> divisor_override) {
+        return at::_ops::avg_pool3d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override, grad_input);
+    }
+    
+    // aten::avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, bool ceil_mode, bool count_include_pad, int? divisor_override, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & avg_pool3d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, c10::optional<int64_t> divisor_override, at::Tensor & grad_input) {
+        return at::_ops::avg_pool3d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override, grad_input);
+    }
+    
+    // aten::avg_pool3d_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor
+    inline at::Tensor avg_pool3d_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, c10::optional<int64_t> divisor_override) {
+        return at::_ops::avg_pool3d_backward::redispatch(dispatchKeySet, grad_output, self, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override);
+    }
+    
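+    // NOTE (illustrative sketch, not part of the generated file): every wrapper here forwards to
+    // the matching at::_ops::<op>::redispatch entry with the caller-supplied DispatchKeySet, so
+    // the operator is re-entered at the next key in that set. Assuming these wrappers live in the
+    // at::redispatch namespace, a hypothetical call site inside a fallback kernel might be:
+    //
+    //   // `ks` is the DispatchKeySet handed to the fallback; `self` is the pooled input.
+    //   at::Tensor y = at::redispatch::avg_pool3d(ks, self, /*kernel_size=*/{2, 2, 2});
+    //
+    // The defaulted arguments (stride={}, padding=0, ceil_mode=false, count_include_pad=true,
+    // divisor_override=c10::nullopt) mirror the schema defaults shown in the aten:: comments.
+    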
+    // aten::fractional_max_pool2d.output(Tensor self, int[2] kernel_size, int[2] output_size, Tensor random_samples, *, Tensor(a!) output, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> fractional_max_pool2d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & output, at::Tensor & indices, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & random_samples) {
+        return at::_ops::fractional_max_pool2d_output::redispatch(dispatchKeySet, self, kernel_size, output_size, random_samples, output, indices);
+    }
+    
+    // aten::fractional_max_pool2d.output(Tensor self, int[2] kernel_size, int[2] output_size, Tensor random_samples, *, Tensor(a!) output, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> fractional_max_pool2d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & random_samples, at::Tensor & output, at::Tensor & indices) {
+        return at::_ops::fractional_max_pool2d_output::redispatch(dispatchKeySet, self, kernel_size, output_size, random_samples, output, indices);
+    }
+    
+    // aten::fractional_max_pool2d(Tensor self, int[2] kernel_size, int[2] output_size, Tensor random_samples) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> fractional_max_pool2d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & random_samples) {
+        return at::_ops::fractional_max_pool2d::redispatch(dispatchKeySet, self, kernel_size, output_size, random_samples);
+    }
+    
+    // aten::fractional_max_pool2d_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] output_size, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & fractional_max_pool2d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & indices) {
+        return at::_ops::fractional_max_pool2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, kernel_size, output_size, indices, grad_input);
+    }
+    
+    // aten::fractional_max_pool2d_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] output_size, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & fractional_max_pool2d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & indices, at::Tensor & grad_input) {
+        return at::_ops::fractional_max_pool2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, kernel_size, output_size, indices, grad_input);
+    }
+    
+    // aten::fractional_max_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] output_size, Tensor indices) -> Tensor
+    inline at::Tensor fractional_max_pool2d_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & indices) {
+        return at::_ops::fractional_max_pool2d_backward::redispatch(dispatchKeySet, grad_output, self, kernel_size, output_size, indices);
+    }
+    
+    // aten::fractional_max_pool3d.output(Tensor self, int[3] kernel_size, int[3] output_size, Tensor random_samples, *, Tensor(a!) output, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> fractional_max_pool3d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & output, at::Tensor & indices, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & random_samples) {
+        return at::_ops::fractional_max_pool3d_output::redispatch(dispatchKeySet, self, kernel_size, output_size, random_samples, output, indices);
+    }
+    
+    // aten::fractional_max_pool3d.output(Tensor self, int[3] kernel_size, int[3] output_size, Tensor random_samples, *, Tensor(a!) output, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> fractional_max_pool3d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & random_samples, at::Tensor & output, at::Tensor & indices) {
+        return at::_ops::fractional_max_pool3d_output::redispatch(dispatchKeySet, self, kernel_size, output_size, random_samples, output, indices);
+    }
+    
+    // aten::fractional_max_pool3d(Tensor self, int[3] kernel_size, int[3] output_size, Tensor random_samples) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> fractional_max_pool3d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & random_samples) {
+        return at::_ops::fractional_max_pool3d::redispatch(dispatchKeySet, self, kernel_size, output_size, random_samples);
+    }
+    
+    // aten::fractional_max_pool3d_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] output_size, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & fractional_max_pool3d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & indices) {
+        return at::_ops::fractional_max_pool3d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, kernel_size, output_size, indices, grad_input);
+    }
+    
+    // aten::fractional_max_pool3d_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] output_size, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & fractional_max_pool3d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & indices, at::Tensor & grad_input) {
+        return at::_ops::fractional_max_pool3d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, kernel_size, output_size, indices, grad_input);
+    }
+    
+    // aten::fractional_max_pool3d_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] output_size, Tensor indices) -> Tensor
+    inline at::Tensor fractional_max_pool3d_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & indices) {
+        return at::_ops::fractional_max_pool3d_backward::redispatch(dispatchKeySet, grad_output, self, kernel_size, output_size, indices);
+    }
+    
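+    // NOTE (illustrative sketch, not part of the generated file): each op with an out variant is
+    // emitted twice: an "_out" overload taking the output tensor(s) first, and an "_outf" overload
+    // taking them last in schema order; both redispatch to the same _ops::<op>_out entry. A hedged
+    // example, assuming `output` and `indices` were allocated elsewhere:
+    //
+    //   at::redispatch::fractional_max_pool2d_out(ks, output, indices, self, {2, 2}, {5, 5}, samples);
+    //   at::redispatch::fractional_max_pool2d_outf(ks, self, {2, 2}, {5, 5}, samples, output, indices);
+    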
+    // aten::max_pool2d_with_indices.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> max_pool2d_with_indices_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::Tensor & indices, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) {
+        return at::_ops::max_pool2d_with_indices_out::redispatch(dispatchKeySet, self, kernel_size, stride, padding, dilation, ceil_mode, out, indices);
+    }
+    
+    // aten::max_pool2d_with_indices.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> max_pool2d_with_indices_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out, at::Tensor & indices) {
+        return at::_ops::max_pool2d_with_indices_out::redispatch(dispatchKeySet, self, kernel_size, stride, padding, dilation, ceil_mode, out, indices);
+    }
+    
+    // aten::max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> max_pool2d_with_indices(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) {
+        return at::_ops::max_pool2d_with_indices::redispatch(dispatchKeySet, self, kernel_size, stride, padding, dilation, ceil_mode);
+    }
+    
+    // aten::max_pool2d_with_indices_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & max_pool2d_with_indices_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, const at::Tensor & indices) {
+        return at::_ops::max_pool2d_with_indices_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, kernel_size, stride, padding, dilation, ceil_mode, indices, grad_input);
+    }
+    
+    // aten::max_pool2d_with_indices_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & max_pool2d_with_indices_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, const at::Tensor & indices, at::Tensor & grad_input) {
+        return at::_ops::max_pool2d_with_indices_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, kernel_size, stride, padding, dilation, ceil_mode, indices, grad_input);
+    }
+    
+    // aten::max_pool2d_with_indices_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices) -> Tensor
+    inline at::Tensor max_pool2d_with_indices_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, const at::Tensor & indices) {
+        return at::_ops::max_pool2d_with_indices_backward::redispatch(dispatchKeySet, grad_output, self, kernel_size, stride, padding, dilation, ceil_mode, indices);
+    }
+    
+    // aten::max_pool3d_with_indices.out(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> max_pool3d_with_indices_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::Tensor & indices, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) {
+        return at::_ops::max_pool3d_with_indices_out::redispatch(dispatchKeySet, self, kernel_size, stride, padding, dilation, ceil_mode, out, indices);
+    }
+    
+    // aten::max_pool3d_with_indices.out(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> max_pool3d_with_indices_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out, at::Tensor & indices) {
+        return at::_ops::max_pool3d_with_indices_out::redispatch(dispatchKeySet, self, kernel_size, stride, padding, dilation, ceil_mode, out, indices);
+    }
+    
+    // aten::max_pool3d_with_indices(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> max_pool3d_with_indices(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) {
+        return at::_ops::max_pool3d_with_indices::redispatch(dispatchKeySet, self, kernel_size, stride, padding, dilation, ceil_mode);
+    }
+    
+    // aten::max_pool3d_with_indices_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, int[3] dilation, bool ceil_mode, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & max_pool3d_with_indices_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, const at::Tensor & indices) {
+        return at::_ops::max_pool3d_with_indices_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, kernel_size, stride, padding, dilation, ceil_mode, indices, grad_input);
+    }
+    
+    // aten::max_pool3d_with_indices_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, int[3] dilation, bool ceil_mode, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & max_pool3d_with_indices_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, const at::Tensor & indices, at::Tensor & grad_input) {
+        return at::_ops::max_pool3d_with_indices_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, kernel_size, stride, padding, dilation, ceil_mode, indices, grad_input);
+    }
+    
+    // aten::max_pool3d_with_indices_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, int[3] dilation, bool ceil_mode, Tensor indices) -> Tensor
+    inline at::Tensor max_pool3d_with_indices_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, const at::Tensor & indices) {
+        return at::_ops::max_pool3d_with_indices_backward::redispatch(dispatchKeySet, grad_output, self, kernel_size, stride, padding, dilation, ceil_mode, indices);
+    }
+    
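+    // NOTE (illustrative sketch, not part of the generated file): the *_with_indices ops return a
+    // (values, int64 indices) pair, and the backward variants consume those saved indices to route
+    // grad_output back to the argmax locations. A hedged example (C++17 structured bindings):
+    //
+    //   auto [pooled, idx] = at::redispatch::max_pool2d_with_indices(ks, self, /*kernel_size=*/{3, 3});
+    //   at::Tensor grad_self = at::redispatch::max_pool2d_with_indices_backward(
+    //       ks, grad_out, self, {3, 3}, /*stride=*/{3, 3}, /*padding=*/{0, 0},
+    //       /*dilation=*/{1, 1}, /*ceil_mode=*/false, idx);
+    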
+    // aten::max_unpool2d.out(Tensor self, Tensor indices, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & max_unpool2d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & indices, at::IntArrayRef output_size) {
+        return at::_ops::max_unpool2d_out::redispatch(dispatchKeySet, self, indices, c10::fromIntArrayRefSlow(output_size), out);
+    }
+    
+    // aten::max_unpool2d.out(Tensor self, Tensor indices, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & max_unpool2d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & indices, at::IntArrayRef output_size, at::Tensor & out) {
+        return at::_ops::max_unpool2d_out::redispatch(dispatchKeySet, self, indices, c10::fromIntArrayRefSlow(output_size), out);
+    }
+    
+    // aten::max_unpool2d.out(Tensor self, Tensor indices, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & max_unpool2d_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & indices, c10::SymIntArrayRef output_size) {
+        return at::_ops::max_unpool2d_out::redispatch(dispatchKeySet, self, indices, output_size, out);
+    }
+    
+    // aten::max_unpool2d.out(Tensor self, Tensor indices, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & max_unpool2d_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & indices, c10::SymIntArrayRef output_size, at::Tensor & out) {
+        return at::_ops::max_unpool2d_out::redispatch(dispatchKeySet, self, indices, output_size, out);
+    }
+    
+    // aten::max_unpool2d(Tensor self, Tensor indices, SymInt[2] output_size) -> Tensor
+    inline at::Tensor max_unpool2d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & indices, at::IntArrayRef output_size) {
+        return at::_ops::max_unpool2d::redispatch(dispatchKeySet, self, indices, c10::fromIntArrayRefSlow(output_size));
+    }
+    
+    // aten::max_unpool2d(Tensor self, Tensor indices, SymInt[2] output_size) -> Tensor
+    inline at::Tensor max_unpool2d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & indices, c10::SymIntArrayRef output_size) {
+        return at::_ops::max_unpool2d::redispatch(dispatchKeySet, self, indices, output_size);
+    }
+    
+    // aten::max_unpool3d.out(Tensor self, Tensor indices, SymInt[3] output_size, int[3] stride, int[3] padding, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & max_unpool3d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & indices, at::IntArrayRef output_size, at::IntArrayRef stride, at::IntArrayRef padding) {
+        return at::_ops::max_unpool3d_out::redispatch(dispatchKeySet, self, indices, c10::fromIntArrayRefSlow(output_size), stride, padding, out);
+    }
+    
+    // aten::max_unpool3d.out(Tensor self, Tensor indices, SymInt[3] output_size, int[3] stride, int[3] padding, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & max_unpool3d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & indices, at::IntArrayRef output_size, at::IntArrayRef stride, at::IntArrayRef padding, at::Tensor & out) {
+        return at::_ops::max_unpool3d_out::redispatch(dispatchKeySet, self, indices, c10::fromIntArrayRefSlow(output_size), stride, padding, out);
+    }
+    
+    // aten::max_unpool3d.out(Tensor self, Tensor indices, SymInt[3] output_size, int[3] stride, int[3] padding, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & max_unpool3d_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & indices, c10::SymIntArrayRef output_size, at::IntArrayRef stride, at::IntArrayRef padding) {
+        return at::_ops::max_unpool3d_out::redispatch(dispatchKeySet, self, indices, output_size, stride, padding, out);
+    }
+    
+    // aten::max_unpool3d.out(Tensor self, Tensor indices, SymInt[3] output_size, int[3] stride, int[3] padding, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & max_unpool3d_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & indices, c10::SymIntArrayRef output_size, at::IntArrayRef stride, at::IntArrayRef padding, at::Tensor & out) {
+        return at::_ops::max_unpool3d_out::redispatch(dispatchKeySet, self, indices, output_size, stride, padding, out);
+    }
+    
+    // aten::max_unpool3d(Tensor self, Tensor indices, SymInt[3] output_size, int[3] stride, int[3] padding) -> Tensor
+    inline at::Tensor max_unpool3d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & indices, at::IntArrayRef output_size, at::IntArrayRef stride, at::IntArrayRef padding) {
+        return at::_ops::max_unpool3d::redispatch(dispatchKeySet, self, indices, c10::fromIntArrayRefSlow(output_size), stride, padding);
+    }
+    
+    // aten::max_unpool3d(Tensor self, Tensor indices, SymInt[3] output_size, int[3] stride, int[3] padding) -> Tensor
+    inline at::Tensor max_unpool3d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & indices, c10::SymIntArrayRef output_size, at::IntArrayRef stride, at::IntArrayRef padding) {
+        return at::_ops::max_unpool3d::redispatch(dispatchKeySet, self, indices, output_size, stride, padding);
+    }
+    
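+    // NOTE (illustrative sketch, not part of the generated file): ops whose schema uses SymInt[]
+    // get two overloads: the plain at::IntArrayRef one, which widens concrete sizes through
+    // c10::fromIntArrayRefSlow before redispatching, and a *_symint one that forwards a
+    // c10::SymIntArrayRef untouched (useful when sizes are symbolic, e.g. under tracing/export).
+    //
+    //   at::Tensor u  = at::redispatch::max_unpool2d(ks, pooled, idx, /*output_size=*/{8, 8});
+    //   // equivalent symbolic form, assuming `sym_sizes` is a c10::SymIntArrayRef:
+    //   at::Tensor u2 = at::redispatch::max_unpool2d_symint(ks, pooled, idx, sym_sizes);
+    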
+    // aten::reflection_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & reflection_pad1d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef padding) {
+        return at::_ops::reflection_pad1d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(padding), out);
+    }
+    
+    // aten::reflection_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & reflection_pad1d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef padding, at::Tensor & out) {
+        return at::_ops::reflection_pad1d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(padding), out);
+    }
+    
+    // aten::reflection_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & reflection_pad1d_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef padding) {
+        return at::_ops::reflection_pad1d_out::redispatch(dispatchKeySet, self, padding, out);
+    }
+    
+    // aten::reflection_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & reflection_pad1d_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & out) {
+        return at::_ops::reflection_pad1d_out::redispatch(dispatchKeySet, self, padding, out);
+    }
+    
+    // aten::reflection_pad1d(Tensor self, SymInt[2] padding) -> Tensor
+    inline at::Tensor reflection_pad1d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef padding) {
+        return at::_ops::reflection_pad1d::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(padding));
+    }
+    
+    // aten::reflection_pad1d(Tensor self, SymInt[2] padding) -> Tensor
+    inline at::Tensor reflection_pad1d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef padding) {
+        return at::_ops::reflection_pad1d::redispatch(dispatchKeySet, self, padding);
+    }
+    
+    // aten::reflection_pad1d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[2] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & reflection_pad1d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding) {
+        return at::_ops::reflection_pad1d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, c10::fromIntArrayRefSlow(padding), grad_input);
+    }
+    
+    // aten::reflection_pad1d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[2] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & reflection_pad1d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding, at::Tensor & grad_input) {
+        return at::_ops::reflection_pad1d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, c10::fromIntArrayRefSlow(padding), grad_input);
+    }
+    
+    // aten::reflection_pad1d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[2] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & reflection_pad1d_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding) {
+        return at::_ops::reflection_pad1d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, padding, grad_input);
+    }
+    
+    // aten::reflection_pad1d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[2] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & reflection_pad1d_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & grad_input) {
+        return at::_ops::reflection_pad1d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, padding, grad_input);
+    }
+    
+    // aten::reflection_pad1d_backward(Tensor grad_output, Tensor self, SymInt[2] padding) -> Tensor
+    inline at::Tensor reflection_pad1d_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding) {
+        return at::_ops::reflection_pad1d_backward::redispatch(dispatchKeySet, grad_output, self, c10::fromIntArrayRefSlow(padding));
+    }
+    
+    // aten::reflection_pad1d_backward(Tensor grad_output, Tensor self, SymInt[2] padding) -> Tensor
+    inline at::Tensor reflection_pad1d_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding) {
+        return at::_ops::reflection_pad1d_backward::redispatch(dispatchKeySet, grad_output, self, padding);
+    }
+    
+    // aten::reflection_pad2d.out(Tensor self, SymInt[4] padding, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & reflection_pad2d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef padding) {
+        return at::_ops::reflection_pad2d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(padding), out);
+    }
+    
+    // aten::reflection_pad2d.out(Tensor self, SymInt[4] padding, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & reflection_pad2d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef padding, at::Tensor & out) {
+        return at::_ops::reflection_pad2d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(padding), out);
+    }
+    
+    // aten::reflection_pad2d.out(Tensor self, SymInt[4] padding, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & reflection_pad2d_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef padding) {
+        return at::_ops::reflection_pad2d_out::redispatch(dispatchKeySet, self, padding, out);
+    }
+    
+    // aten::reflection_pad2d.out(Tensor self, SymInt[4] padding, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & reflection_pad2d_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & out) {
+        return at::_ops::reflection_pad2d_out::redispatch(dispatchKeySet, self, padding, out);
+    }
+    
+    // aten::reflection_pad2d(Tensor self, SymInt[4] padding) -> Tensor
+    inline at::Tensor reflection_pad2d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef padding) {
+        return at::_ops::reflection_pad2d::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(padding));
+    }
+    
+    // aten::reflection_pad2d(Tensor self, SymInt[4] padding) -> Tensor
+    inline at::Tensor reflection_pad2d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef padding) {
+        return at::_ops::reflection_pad2d::redispatch(dispatchKeySet, self, padding);
+    }
+    
+    // aten::reflection_pad2d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & reflection_pad2d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding) {
+        return at::_ops::reflection_pad2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, c10::fromIntArrayRefSlow(padding), grad_input);
+    }
+    
+    // aten::reflection_pad2d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & reflection_pad2d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding, at::Tensor & grad_input) {
+        return at::_ops::reflection_pad2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, c10::fromIntArrayRefSlow(padding), grad_input);
+    }
+    
+    // aten::reflection_pad2d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & reflection_pad2d_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding) {
+        return at::_ops::reflection_pad2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, padding, grad_input);
+    }
+    
+    // aten::reflection_pad2d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & reflection_pad2d_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & grad_input) {
+        return at::_ops::reflection_pad2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, padding, grad_input);
+    }
+    
+    // aten::reflection_pad2d_backward(Tensor grad_output, Tensor self, SymInt[4] padding) -> Tensor
+    inline at::Tensor reflection_pad2d_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding) {
+        return at::_ops::reflection_pad2d_backward::redispatch(dispatchKeySet, grad_output, self, c10::fromIntArrayRefSlow(padding));
+    }
+    
+    // aten::reflection_pad2d_backward(Tensor grad_output, Tensor self, SymInt[4] padding) -> Tensor
+    inline at::Tensor reflection_pad2d_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding) {
+        return at::_ops::reflection_pad2d_backward::redispatch(dispatchKeySet, grad_output, self, padding);
+    }
+    
+    // aten::reflection_pad3d.out(Tensor self, SymInt[6] padding, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & reflection_pad3d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef padding) {
+        return at::_ops::reflection_pad3d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(padding), out);
+    }
+    
+    // aten::reflection_pad3d.out(Tensor self, SymInt[6] padding, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & reflection_pad3d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef padding, at::Tensor & out) {
+        return at::_ops::reflection_pad3d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(padding), out);
+    }
+    
+    // aten::reflection_pad3d.out(Tensor self, SymInt[6] padding, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & reflection_pad3d_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef padding) {
+        return at::_ops::reflection_pad3d_out::redispatch(dispatchKeySet, self, padding, out);
+    }
+    
+    // aten::reflection_pad3d.out(Tensor self, SymInt[6] padding, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & reflection_pad3d_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & out) {
+        return at::_ops::reflection_pad3d_out::redispatch(dispatchKeySet, self, padding, out);
+    }
+    
+    // aten::reflection_pad3d(Tensor self, SymInt[6] padding) -> Tensor
+    inline at::Tensor reflection_pad3d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef padding) {
+        return at::_ops::reflection_pad3d::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(padding));
+    }
+    
+    // aten::reflection_pad3d(Tensor self, SymInt[6] padding) -> Tensor
+    inline at::Tensor reflection_pad3d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef padding) {
+        return at::_ops::reflection_pad3d::redispatch(dispatchKeySet, self, padding);
+    }
+    
+    // aten::reflection_pad3d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[6] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & reflection_pad3d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding) {
+        return at::_ops::reflection_pad3d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, c10::fromIntArrayRefSlow(padding), grad_input);
+    }
+    
+    // aten::reflection_pad3d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[6] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & reflection_pad3d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding, at::Tensor & grad_input) {
+        return at::_ops::reflection_pad3d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, c10::fromIntArrayRefSlow(padding), grad_input);
+    }
+    
+    // aten::reflection_pad3d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[6] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & reflection_pad3d_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding) {
+        return at::_ops::reflection_pad3d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, padding, grad_input);
+    }
+    
+    // aten::reflection_pad3d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[6] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & reflection_pad3d_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & grad_input) {
+        return at::_ops::reflection_pad3d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, padding, grad_input);
+    }
+    
+    // aten::reflection_pad3d_backward(Tensor grad_output, Tensor self, SymInt[6] padding) -> Tensor
+    inline at::Tensor reflection_pad3d_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding) {
+        return at::_ops::reflection_pad3d_backward::redispatch(dispatchKeySet, grad_output, self, c10::fromIntArrayRefSlow(padding));
+    }
+    
+    // aten::reflection_pad3d_backward(Tensor grad_output, Tensor self, SymInt[6] padding) -> Tensor
+    inline at::Tensor reflection_pad3d_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding) {
+        return at::_ops::reflection_pad3d_backward::redispatch(dispatchKeySet, grad_output, self, padding);
+    }
+    
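+    // NOTE (illustrative sketch, not part of the generated file): reflection padding mirrors the
+    // input without repeating the border element, so each padding amount must be smaller than the
+    // corresponding input dimension; replication padding (below) repeats the edge value and has no
+    // such restriction. Hedged example for an (N, C, H, W) input:
+    //
+    //   // padding order is {left, right, top, bottom}
+    //   at::Tensor r = at::redispatch::reflection_pad2d(ks, x, /*padding=*/{1, 1, 2, 2});
+    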
+    // aten::replication_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & replication_pad1d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef padding) {
+        return at::_ops::replication_pad1d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(padding), out);
+    }
+    
+    // aten::replication_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & replication_pad1d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef padding, at::Tensor & out) {
+        return at::_ops::replication_pad1d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(padding), out);
+    }
+    
+    // aten::replication_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & replication_pad1d_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef padding) {
+        return at::_ops::replication_pad1d_out::redispatch(dispatchKeySet, self, padding, out);
+    }
+    
+    // aten::replication_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & replication_pad1d_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & out) {
+        return at::_ops::replication_pad1d_out::redispatch(dispatchKeySet, self, padding, out);
+    }
+    
+    // aten::replication_pad1d(Tensor self, SymInt[2] padding) -> Tensor
+    inline at::Tensor replication_pad1d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef padding) {
+        return at::_ops::replication_pad1d::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(padding));
+    }
+    
+    // aten::replication_pad1d(Tensor self, SymInt[2] padding) -> Tensor
+    inline at::Tensor replication_pad1d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef padding) {
+        return at::_ops::replication_pad1d::redispatch(dispatchKeySet, self, padding);
+    }
+    
+    // aten::replication_pad1d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[2] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & replication_pad1d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding) {
+        return at::_ops::replication_pad1d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, c10::fromIntArrayRefSlow(padding), grad_input);
+    }
+    
+    // aten::replication_pad1d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[2] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & replication_pad1d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding, at::Tensor & grad_input) {
+        return at::_ops::replication_pad1d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, c10::fromIntArrayRefSlow(padding), grad_input);
+    }
+    
+    // aten::replication_pad1d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[2] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & replication_pad1d_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding) {
+        return at::_ops::replication_pad1d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, padding, grad_input);
+    }
+    
+    // aten::replication_pad1d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[2] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & replication_pad1d_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & grad_input) {
+        return at::_ops::replication_pad1d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, padding, grad_input);
+    }
+    
+    // aten::replication_pad1d_backward(Tensor grad_output, Tensor self, SymInt[2] padding) -> Tensor
+    inline at::Tensor replication_pad1d_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding) {
+        return at::_ops::replication_pad1d_backward::redispatch(dispatchKeySet, grad_output, self, c10::fromIntArrayRefSlow(padding));
+    }
+    
+    // aten::replication_pad1d_backward(Tensor grad_output, Tensor self, SymInt[2] padding) -> Tensor
+    inline at::Tensor replication_pad1d_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding) {
+        return at::_ops::replication_pad1d_backward::redispatch(dispatchKeySet, grad_output, self, padding);
+    }
+    
+    // aten::replication_pad2d.out(Tensor self, SymInt[4] padding, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & replication_pad2d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef padding) {
+        return at::_ops::replication_pad2d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(padding), out);
+    }
+    
+    // aten::replication_pad2d.out(Tensor self, SymInt[4] padding, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & replication_pad2d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef padding, at::Tensor & out) {
+        return at::_ops::replication_pad2d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(padding), out);
+    }
+    
+    // aten::replication_pad2d.out(Tensor self, SymInt[4] padding, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & replication_pad2d_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef padding) {
+        return at::_ops::replication_pad2d_out::redispatch(dispatchKeySet, self, padding, out);
+    }
+    
+    // aten::replication_pad2d.out(Tensor self, SymInt[4] padding, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & replication_pad2d_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & out) {
+        return at::_ops::replication_pad2d_out::redispatch(dispatchKeySet, self, padding, out);
+    }
+    
+    // aten::replication_pad2d(Tensor self, SymInt[4] padding) -> Tensor
+    inline at::Tensor replication_pad2d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef padding) {
+        return at::_ops::replication_pad2d::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(padding));
+    }
+    
+    // aten::replication_pad2d(Tensor self, SymInt[4] padding) -> Tensor
+    inline at::Tensor replication_pad2d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef padding) {
+        return at::_ops::replication_pad2d::redispatch(dispatchKeySet, self, padding);
+    }
+    
+    // aten::replication_pad2d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & replication_pad2d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding) {
+        return at::_ops::replication_pad2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, c10::fromIntArrayRefSlow(padding), grad_input);
+    }
+    
+    // aten::replication_pad2d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & replication_pad2d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding, at::Tensor & grad_input) {
+        return at::_ops::replication_pad2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, c10::fromIntArrayRefSlow(padding), grad_input);
+    }
+    
+    // aten::replication_pad2d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & replication_pad2d_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding) {
+        return at::_ops::replication_pad2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, padding, grad_input);
+    }
+    
+    // aten::replication_pad2d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & replication_pad2d_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & grad_input) {
+        return at::_ops::replication_pad2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, padding, grad_input);
+    }
+    
+    // aten::replication_pad2d_backward(Tensor grad_output, Tensor self, SymInt[4] padding) -> Tensor
+    inline at::Tensor replication_pad2d_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding) {
+        return at::_ops::replication_pad2d_backward::redispatch(dispatchKeySet, grad_output, self, c10::fromIntArrayRefSlow(padding));
+    }
+    
+    // aten::replication_pad2d_backward(Tensor grad_output, Tensor self, SymInt[4] padding) -> Tensor
+    inline at::Tensor replication_pad2d_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding) {
+        return at::_ops::replication_pad2d_backward::redispatch(dispatchKeySet, grad_output, self, padding);
+    }
+    
+    // aten::replication_pad3d.out(Tensor self, SymInt[6] padding, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & replication_pad3d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef padding) {
+        return at::_ops::replication_pad3d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(padding), out);
+    }
+    
+    // aten::replication_pad3d.out(Tensor self, SymInt[6] padding, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & replication_pad3d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef padding, at::Tensor & out) {
+        return at::_ops::replication_pad3d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(padding), out);
+    }
+    
+    // aten::replication_pad3d.out(Tensor self, SymInt[6] padding, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & replication_pad3d_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef padding) {
+        return at::_ops::replication_pad3d_out::redispatch(dispatchKeySet, self, padding, out);
+    }
+    
+    // aten::replication_pad3d.out(Tensor self, SymInt[6] padding, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & replication_pad3d_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & out) {
+        return at::_ops::replication_pad3d_out::redispatch(dispatchKeySet, self, padding, out);
+    }
+    
+    // aten::replication_pad3d(Tensor self, SymInt[6] padding) -> Tensor
+    inline at::Tensor replication_pad3d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef padding) {
+        return at::_ops::replication_pad3d::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(padding));
+    }
+    
+    // aten::replication_pad3d(Tensor self, SymInt[6] padding) -> Tensor
+    inline at::Tensor replication_pad3d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef padding) {
+        return at::_ops::replication_pad3d::redispatch(dispatchKeySet, self, padding);
+    }
+    
+    // aten::replication_pad3d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[6] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & replication_pad3d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding) {
+        return at::_ops::replication_pad3d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, c10::fromIntArrayRefSlow(padding), grad_input);
+    }
+    
+    // aten::replication_pad3d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[6] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & replication_pad3d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding, at::Tensor & grad_input) {
+        return at::_ops::replication_pad3d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, c10::fromIntArrayRefSlow(padding), grad_input);
+    }
+    
+    // aten::replication_pad3d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[6] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & replication_pad3d_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding) {
+        return at::_ops::replication_pad3d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, padding, grad_input);
+    }
+    
+    // aten::replication_pad3d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[6] padding, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & replication_pad3d_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & grad_input) {
+        return at::_ops::replication_pad3d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, padding, grad_input);
+    }
+    
+    // aten::replication_pad3d_backward(Tensor grad_output, Tensor self, SymInt[6] padding) -> Tensor
+    inline at::Tensor replication_pad3d_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding) {
+        return at::_ops::replication_pad3d_backward::redispatch(dispatchKeySet, grad_output, self, c10::fromIntArrayRefSlow(padding));
+    }
+    
+    // aten::replication_pad3d_backward(Tensor grad_output, Tensor self, SymInt[6] padding) -> Tensor
+    inline at::Tensor replication_pad3d_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding) {
+        return at::_ops::replication_pad3d_backward::redispatch(dispatchKeySet, grad_output, self, padding);
+    }
+    
+    // aten::_pad_circular(Tensor self, SymInt[] pad) -> Tensor
+    inline at::Tensor _pad_circular(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef pad) {
+        return at::_ops::_pad_circular::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(pad));
+    }
+    
+    // aten::_pad_circular(Tensor self, SymInt[] pad) -> Tensor
+    inline at::Tensor _pad_circular_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef pad) {
+        return at::_ops::_pad_circular::redispatch(dispatchKeySet, self, pad);
+    }
+    
+    // aten::_pad_enum(Tensor self, SymInt[] pad, int mode, float? value=None) -> Tensor
+    inline at::Tensor _pad_enum(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef pad, int64_t mode, c10::optional<double> value=c10::nullopt) {
+        return at::_ops::_pad_enum::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(pad), mode, value);
+    }
+    
+    // aten::_pad_enum(Tensor self, SymInt[] pad, int mode, float? value=None) -> Tensor
+    inline at::Tensor _pad_enum_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef pad, int64_t mode, c10::optional<double> value=c10::nullopt) {
+        return at::_ops::_pad_enum::redispatch(dispatchKeySet, self, pad, mode, value);
+    }
+    
+    // aten::pad(Tensor self, SymInt[] pad, str mode="constant", float? value=None) -> Tensor
+    inline at::Tensor pad(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef pad, c10::string_view mode="constant", c10::optional<double> value=c10::nullopt) {
+        return at::_ops::pad::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(pad), mode, value);
+    }
+    
+    // aten::pad(Tensor self, SymInt[] pad, str mode="constant", float? value=None) -> Tensor
+    inline at::Tensor pad_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef pad, c10::string_view mode="constant", c10::optional<double> value=c10::nullopt) {
+        return at::_ops::pad::redispatch(dispatchKeySet, self, pad, mode, value);
+    }
+    
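+    // NOTE (illustrative sketch, not part of the generated file): aten::pad is the generic entry
+    // point behind torch.nn.functional.pad; `pad` lists amounts starting from the last dimension
+    // ({last_left, last_right, ...}), `mode` is one of "constant", "reflect", "replicate" or
+    // "circular", and `value` applies only to constant padding. Hedged example:
+    //
+    //   at::Tensor a = at::redispatch::pad(ks, self, /*pad=*/{1, 1, 2, 2}, "reflect");
+    //   at::Tensor b = at::redispatch::pad(ks, self, {1, 1}, "constant", /*value=*/0.0);
+    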
+    // aten::upsample_linear1d.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
+    inline at::Tensor upsample_linear1d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::OptionalIntArrayRef output_size, bool align_corners, c10::optional<at::ArrayRef<double>> scale_factors) {
+        return at::_ops::upsample_linear1d_vec::redispatch(dispatchKeySet, input, output_size.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*output_size)) : c10::nullopt, align_corners, scale_factors);
+    }
+    
+    // aten::upsample_linear1d.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
+    inline at::Tensor upsample_linear1d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::OptionalSymIntArrayRef output_size, bool align_corners, c10::optional<at::ArrayRef<double>> scale_factors) {
+        return at::_ops::upsample_linear1d_vec::redispatch(dispatchKeySet, input, output_size, align_corners, scale_factors);
+    }
+    
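+    // NOTE (illustrative sketch, not part of the generated file): the .vec upsample overloads take
+    // an optional output_size and optional per-dimension scale_factors; callers are expected to
+    // supply exactly one of the two. The IntArrayRef overload wraps a present output_size through
+    // c10::fromIntArrayRefSlow before redispatching. Hedged example for the 1-D case:
+    //
+    //   // by explicit output size:
+    //   at::Tensor a = at::redispatch::upsample_linear1d(ks, input, at::IntArrayRef{64},
+    //                                                    /*align_corners=*/false, c10::nullopt);
+    //   // by scale factor instead:
+    //   const double scales[] = {2.0};
+    //   at::Tensor b = at::redispatch::upsample_linear1d(ks, input, c10::nullopt,
+    //                                                    /*align_corners=*/false, at::ArrayRef<double>(scales));
+    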
+    // aten::upsample_bilinear2d.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
+    inline at::Tensor upsample_bilinear2d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::OptionalIntArrayRef output_size, bool align_corners, c10::optional<at::ArrayRef<double>> scale_factors) {
+        return at::_ops::upsample_bilinear2d_vec::redispatch(dispatchKeySet, input, output_size.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*output_size)) : c10::nullopt, align_corners, scale_factors);
+    }
+    
+    // aten::upsample_bilinear2d.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
+    inline at::Tensor upsample_bilinear2d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::OptionalSymIntArrayRef output_size, bool align_corners, c10::optional<at::ArrayRef<double>> scale_factors) {
+        return at::_ops::upsample_bilinear2d_vec::redispatch(dispatchKeySet, input, output_size, align_corners, scale_factors);
+    }
+    
+    // aten::_upsample_bilinear2d_aa.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
+    inline at::Tensor _upsample_bilinear2d_aa(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::OptionalIntArrayRef output_size, bool align_corners, c10::optional<at::ArrayRef<double>> scale_factors) {
+        return at::_ops::_upsample_bilinear2d_aa_vec::redispatch(dispatchKeySet, input, output_size.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*output_size)) : c10::nullopt, align_corners, scale_factors);
+    }
+    
+    // aten::_upsample_bilinear2d_aa.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
+    inline at::Tensor _upsample_bilinear2d_aa_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::OptionalSymIntArrayRef output_size, bool align_corners, c10::optional<at::ArrayRef<double>> scale_factors) {
+        return at::_ops::_upsample_bilinear2d_aa_vec::redispatch(dispatchKeySet, input, output_size, align_corners, scale_factors);
+    }
+    
+    // aten::upsample_trilinear3d.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
+    inline at::Tensor upsample_trilinear3d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::OptionalIntArrayRef output_size, bool align_corners, c10::optional<at::ArrayRef<double>> scale_factors) {
+        return at::_ops::upsample_trilinear3d_vec::redispatch(dispatchKeySet, input, output_size.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*output_size)) : c10::nullopt, align_corners, scale_factors);
+    }
+    
+    // aten::upsample_trilinear3d.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
+    inline at::Tensor upsample_trilinear3d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::OptionalSymIntArrayRef output_size, bool align_corners, c10::optional<at::ArrayRef<double>> scale_factors) {
+        return at::_ops::upsample_trilinear3d_vec::redispatch(dispatchKeySet, input, output_size, align_corners, scale_factors);
+    }
+    
+    // aten::upsample_bicubic2d.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
+    inline at::Tensor upsample_bicubic2d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::OptionalIntArrayRef output_size, bool align_corners, c10::optional<at::ArrayRef<double>> scale_factors) {
+        return at::_ops::upsample_bicubic2d_vec::redispatch(dispatchKeySet, input, output_size.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*output_size)) : c10::nullopt, align_corners, scale_factors);
+    }
+    
+    // aten::upsample_bicubic2d.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
+    inline at::Tensor upsample_bicubic2d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::OptionalSymIntArrayRef output_size, bool align_corners, c10::optional<at::ArrayRef<double>> scale_factors) {
+        return at::_ops::upsample_bicubic2d_vec::redispatch(dispatchKeySet, input, output_size, align_corners, scale_factors);
+    }
+    
+    // aten::_upsample_bicubic2d_aa.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
+    inline at::Tensor _upsample_bicubic2d_aa(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::OptionalIntArrayRef output_size, bool align_corners, c10::optional<at::ArrayRef<double>> scale_factors) {
+        return at::_ops::_upsample_bicubic2d_aa_vec::redispatch(dispatchKeySet, input, output_size.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*output_size)) : c10::nullopt, align_corners, scale_factors);
+    }
+    
+    // aten::_upsample_bicubic2d_aa.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
+    inline at::Tensor _upsample_bicubic2d_aa_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::OptionalSymIntArrayRef output_size, bool align_corners, c10::optional<at::ArrayRef<double>> scale_factors) {
+        return at::_ops::_upsample_bicubic2d_aa_vec::redispatch(dispatchKeySet, input, output_size, align_corners, scale_factors);
+    }
+    
+    // aten::upsample_nearest1d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
+    inline at::Tensor upsample_nearest1d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::OptionalIntArrayRef output_size, c10::optional<at::ArrayRef<double>> scale_factors) {
+        return at::_ops::upsample_nearest1d_vec::redispatch(dispatchKeySet, input, output_size.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*output_size)) : c10::nullopt, scale_factors);
+    }
+    
+    // aten::upsample_nearest1d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
+    inline at::Tensor upsample_nearest1d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::OptionalSymIntArrayRef output_size, c10::optional<at::ArrayRef<double>> scale_factors) {
+        return at::_ops::upsample_nearest1d_vec::redispatch(dispatchKeySet, input, output_size, scale_factors);
+    }
+    
+    // aten::_upsample_nearest_exact1d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
+    inline at::Tensor _upsample_nearest_exact1d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::OptionalIntArrayRef output_size, c10::optional<at::ArrayRef<double>> scale_factors) {
+        return at::_ops::_upsample_nearest_exact1d_vec::redispatch(dispatchKeySet, input, output_size.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*output_size)) : c10::nullopt, scale_factors);
+    }
+    
+    // aten::_upsample_nearest_exact1d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
+    inline at::Tensor _upsample_nearest_exact1d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::OptionalSymIntArrayRef output_size, c10::optional<at::ArrayRef<double>> scale_factors) {
+        return at::_ops::_upsample_nearest_exact1d_vec::redispatch(dispatchKeySet, input, output_size, scale_factors);
+    }
+    
+    // aten::upsample_nearest2d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
+    inline at::Tensor upsample_nearest2d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::OptionalIntArrayRef output_size, c10::optional<at::ArrayRef<double>> scale_factors) {
+        return at::_ops::upsample_nearest2d_vec::redispatch(dispatchKeySet, input, output_size.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*output_size)) : c10::nullopt, scale_factors);
+    }
+    
+    // aten::upsample_nearest2d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
+    inline at::Tensor upsample_nearest2d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::OptionalSymIntArrayRef output_size, c10::optional<at::ArrayRef<double>> scale_factors) {
+        return at::_ops::upsample_nearest2d_vec::redispatch(dispatchKeySet, input, output_size, scale_factors);
+    }
+    
+    // aten::_upsample_nearest_exact2d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
+    inline at::Tensor _upsample_nearest_exact2d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::OptionalIntArrayRef output_size, c10::optional<at::ArrayRef<double>> scale_factors) {
+        return at::_ops::_upsample_nearest_exact2d_vec::redispatch(dispatchKeySet, input, output_size.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*output_size)) : c10::nullopt, scale_factors);
+    }
+    
+    // aten::_upsample_nearest_exact2d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
+    inline at::Tensor _upsample_nearest_exact2d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::OptionalSymIntArrayRef output_size, c10::optional<at::ArrayRef<double>> scale_factors) {
+        return at::_ops::_upsample_nearest_exact2d_vec::redispatch(dispatchKeySet, input, output_size, scale_factors);
+    }
+    
+    // aten::upsample_nearest3d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
+    inline at::Tensor upsample_nearest3d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::OptionalIntArrayRef output_size, c10::optional<at::ArrayRef<double>> scale_factors) {
+        return at::_ops::upsample_nearest3d_vec::redispatch(dispatchKeySet, input, output_size.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*output_size)) : c10::nullopt, scale_factors);
+    }
+    
+    // aten::upsample_nearest3d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
+    inline at::Tensor upsample_nearest3d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::OptionalSymIntArrayRef output_size, c10::optional<at::ArrayRef<double>> scale_factors) {
+        return at::_ops::upsample_nearest3d_vec::redispatch(dispatchKeySet, input, output_size, scale_factors);
+    }
+    
+    // aten::_upsample_nearest_exact3d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
+    inline at::Tensor _upsample_nearest_exact3d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::OptionalIntArrayRef output_size, c10::optional<at::ArrayRef<double>> scale_factors) {
+        return at::_ops::_upsample_nearest_exact3d_vec::redispatch(dispatchKeySet, input, output_size.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*output_size)) : c10::nullopt, scale_factors);
+    }
+    
+    // aten::_upsample_nearest_exact3d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
+    inline at::Tensor _upsample_nearest_exact3d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::OptionalSymIntArrayRef output_size, c10::optional<at::ArrayRef<double>> scale_factors) {
+        return at::_ops::_upsample_nearest_exact3d_vec::redispatch(dispatchKeySet, input, output_size, scale_factors);
+    }
+    
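+    // NB: The fixed-rank overloads below (SymInt[1], SymInt[2], ...) follow the
+    // usual out-variant convention: `*_out` takes `out` first with the optional
+    // scale arguments defaulted, `*_outf` takes every argument explicitly with
+    // `out` last, and the `_symint` forms repeat both shapes for
+    // c10::SymIntArrayRef sizes. All of them redispatch to the same `.out` op.
+    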
+    // aten::upsample_linear1d.out(Tensor self, SymInt[1] output_size, bool align_corners, float? scales=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & upsample_linear1d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, c10::optional<double> scales=c10::nullopt) {
+        return at::_ops::upsample_linear1d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), align_corners, scales, out);
+    }
+    
+    // aten::upsample_linear1d.out(Tensor self, SymInt[1] output_size, bool align_corners, float? scales=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & upsample_linear1d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, c10::optional<double> scales, at::Tensor & out) {
+        return at::_ops::upsample_linear1d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), align_corners, scales, out);
+    }
+    
+    // aten::upsample_linear1d.out(Tensor self, SymInt[1] output_size, bool align_corners, float? scales=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & upsample_linear1d_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales=c10::nullopt) {
+        return at::_ops::upsample_linear1d_out::redispatch(dispatchKeySet, self, output_size, align_corners, scales, out);
+    }
+    
+    // aten::upsample_linear1d.out(Tensor self, SymInt[1] output_size, bool align_corners, float? scales=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & upsample_linear1d_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales, at::Tensor & out) {
+        return at::_ops::upsample_linear1d_out::redispatch(dispatchKeySet, self, output_size, align_corners, scales, out);
+    }
+    
+    // aten::upsample_linear1d(Tensor self, SymInt[1] output_size, bool align_corners, float? scales=None) -> Tensor
+    inline at::Tensor upsample_linear1d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, c10::optional<double> scales=c10::nullopt) {
+        return at::_ops::upsample_linear1d::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), align_corners, scales);
+    }
+    
+    // aten::upsample_linear1d(Tensor self, SymInt[1] output_size, bool align_corners, float? scales=None) -> Tensor
+    inline at::Tensor upsample_linear1d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales=c10::nullopt) {
+        return at::_ops::upsample_linear1d::redispatch(dispatchKeySet, self, output_size, align_corners, scales);
+    }
+    
+    // aten::upsample_linear1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, bool align_corners, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & upsample_linear1d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales=c10::nullopt) {
+        return at::_ops::upsample_linear1d_backward_grad_input::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales, grad_input);
+    }
+    
+    // aten::upsample_linear1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, bool align_corners, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & upsample_linear1d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales, at::Tensor & grad_input) {
+        return at::_ops::upsample_linear1d_backward_grad_input::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales, grad_input);
+    }
+    
+    // aten::upsample_linear1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, bool align_corners, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & upsample_linear1d_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales=c10::nullopt) {
+        return at::_ops::upsample_linear1d_backward_grad_input::redispatch(dispatchKeySet, grad_output, output_size, input_size, align_corners, scales, grad_input);
+    }
+    
+    // aten::upsample_linear1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, bool align_corners, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & upsample_linear1d_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales, at::Tensor & grad_input) {
+        return at::_ops::upsample_linear1d_backward_grad_input::redispatch(dispatchKeySet, grad_output, output_size, input_size, align_corners, scales, grad_input);
+    }
+    
+    // aten::upsample_linear1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, bool align_corners, float? scales=None) -> Tensor
+    inline at::Tensor upsample_linear1d_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales=c10::nullopt) {
+        return at::_ops::upsample_linear1d_backward::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales);
+    }
+    
+    // aten::upsample_linear1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, bool align_corners, float? scales=None) -> Tensor
+    inline at::Tensor upsample_linear1d_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales=c10::nullopt) {
+        return at::_ops::upsample_linear1d_backward::redispatch(dispatchKeySet, grad_output, output_size, input_size, align_corners, scales);
+    }
+    
+    // aten::upsample_bilinear2d.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & upsample_bilinear2d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_bilinear2d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), align_corners, scales_h, scales_w, out);
+    }
+    
+    // aten::upsample_bilinear2d.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & upsample_bilinear2d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & out) {
+        return at::_ops::upsample_bilinear2d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), align_corners, scales_h, scales_w, out);
+    }
+    
+    // aten::upsample_bilinear2d.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & upsample_bilinear2d_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_bilinear2d_out::redispatch(dispatchKeySet, self, output_size, align_corners, scales_h, scales_w, out);
+    }
+    
+    // aten::upsample_bilinear2d.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & upsample_bilinear2d_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & out) {
+        return at::_ops::upsample_bilinear2d_out::redispatch(dispatchKeySet, self, output_size, align_corners, scales_h, scales_w, out);
+    }
+    
+    // aten::upsample_bilinear2d(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor upsample_bilinear2d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_bilinear2d::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), align_corners, scales_h, scales_w);
+    }
+    
+    // aten::upsample_bilinear2d(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor upsample_bilinear2d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_bilinear2d::redispatch(dispatchKeySet, self, output_size, align_corners, scales_h, scales_w);
+    }
+    
+    // aten::upsample_bilinear2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & upsample_bilinear2d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_bilinear2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w, grad_input);
+    }
+    
+    // aten::upsample_bilinear2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & upsample_bilinear2d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & grad_input) {
+        return at::_ops::upsample_bilinear2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w, grad_input);
+    }
+    
+    // aten::upsample_bilinear2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & upsample_bilinear2d_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_bilinear2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, output_size, input_size, align_corners, scales_h, scales_w, grad_input);
+    }
+    
+    // aten::upsample_bilinear2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & upsample_bilinear2d_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & grad_input) {
+        return at::_ops::upsample_bilinear2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, output_size, input_size, align_corners, scales_h, scales_w, grad_input);
+    }
+    
+    // aten::upsample_bilinear2d_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor upsample_bilinear2d_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_bilinear2d_backward::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w);
+    }
+    
+    // aten::upsample_bilinear2d_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor upsample_bilinear2d_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_bilinear2d_backward::redispatch(dispatchKeySet, grad_output, output_size, input_size, align_corners, scales_h, scales_w);
+    }
+    
+    // aten::_upsample_bilinear2d_aa.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _upsample_bilinear2d_aa_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::_upsample_bilinear2d_aa_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), align_corners, scales_h, scales_w, out);
+    }
+    
+    // aten::_upsample_bilinear2d_aa.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _upsample_bilinear2d_aa_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & out) {
+        return at::_ops::_upsample_bilinear2d_aa_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), align_corners, scales_h, scales_w, out);
+    }
+    
+    // aten::_upsample_bilinear2d_aa.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _upsample_bilinear2d_aa_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::_upsample_bilinear2d_aa_out::redispatch(dispatchKeySet, self, output_size, align_corners, scales_h, scales_w, out);
+    }
+    
+    // aten::_upsample_bilinear2d_aa.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _upsample_bilinear2d_aa_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & out) {
+        return at::_ops::_upsample_bilinear2d_aa_out::redispatch(dispatchKeySet, self, output_size, align_corners, scales_h, scales_w, out);
+    }
+    
+    // aten::_upsample_bilinear2d_aa(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor _upsample_bilinear2d_aa(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::_upsample_bilinear2d_aa::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), align_corners, scales_h, scales_w);
+    }
+    
+    // aten::_upsample_bilinear2d_aa(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor _upsample_bilinear2d_aa_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::_upsample_bilinear2d_aa::redispatch(dispatchKeySet, self, output_size, align_corners, scales_h, scales_w);
+    }
+    
+    // aten::_upsample_bilinear2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & _upsample_bilinear2d_aa_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::_upsample_bilinear2d_aa_backward_grad_input::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w, grad_input);
+    }
+    
+    // aten::_upsample_bilinear2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & _upsample_bilinear2d_aa_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & grad_input) {
+        return at::_ops::_upsample_bilinear2d_aa_backward_grad_input::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w, grad_input);
+    }
+    
+    // aten::_upsample_bilinear2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & _upsample_bilinear2d_aa_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::_upsample_bilinear2d_aa_backward_grad_input::redispatch(dispatchKeySet, grad_output, output_size, input_size, align_corners, scales_h, scales_w, grad_input);
+    }
+    
+    // aten::_upsample_bilinear2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & _upsample_bilinear2d_aa_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & grad_input) {
+        return at::_ops::_upsample_bilinear2d_aa_backward_grad_input::redispatch(dispatchKeySet, grad_output, output_size, input_size, align_corners, scales_h, scales_w, grad_input);
+    }
+    
+    // aten::_upsample_bilinear2d_aa_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor _upsample_bilinear2d_aa_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::_upsample_bilinear2d_aa_backward::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w);
+    }
+    
+    // aten::_upsample_bilinear2d_aa_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor _upsample_bilinear2d_aa_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::_upsample_bilinear2d_aa_backward::redispatch(dispatchKeySet, grad_output, output_size, input_size, align_corners, scales_h, scales_w);
+    }
+    
+    // aten::upsample_bicubic2d.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & upsample_bicubic2d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_bicubic2d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), align_corners, scales_h, scales_w, out);
+    }
+    
+    // aten::upsample_bicubic2d.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & upsample_bicubic2d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & out) {
+        return at::_ops::upsample_bicubic2d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), align_corners, scales_h, scales_w, out);
+    }
+    
+    // aten::upsample_bicubic2d.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & upsample_bicubic2d_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_bicubic2d_out::redispatch(dispatchKeySet, self, output_size, align_corners, scales_h, scales_w, out);
+    }
+    
+    // aten::upsample_bicubic2d.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & upsample_bicubic2d_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & out) {
+        return at::_ops::upsample_bicubic2d_out::redispatch(dispatchKeySet, self, output_size, align_corners, scales_h, scales_w, out);
+    }
+    
+    // aten::upsample_bicubic2d(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor upsample_bicubic2d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_bicubic2d::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), align_corners, scales_h, scales_w);
+    }
+    
+    // aten::upsample_bicubic2d(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor upsample_bicubic2d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_bicubic2d::redispatch(dispatchKeySet, self, output_size, align_corners, scales_h, scales_w);
+    }
+    
+    // aten::upsample_bicubic2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & upsample_bicubic2d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_bicubic2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w, grad_input);
+    }
+    
+    // aten::upsample_bicubic2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & upsample_bicubic2d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & grad_input) {
+        return at::_ops::upsample_bicubic2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w, grad_input);
+    }
+    
+    // aten::upsample_bicubic2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & upsample_bicubic2d_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_bicubic2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, output_size, input_size, align_corners, scales_h, scales_w, grad_input);
+    }
+    
+    // aten::upsample_bicubic2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & upsample_bicubic2d_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & grad_input) {
+        return at::_ops::upsample_bicubic2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, output_size, input_size, align_corners, scales_h, scales_w, grad_input);
+    }
+    
+    // aten::upsample_bicubic2d_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor upsample_bicubic2d_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_bicubic2d_backward::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w);
+    }
+    
+    // aten::upsample_bicubic2d_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor upsample_bicubic2d_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_bicubic2d_backward::redispatch(dispatchKeySet, grad_output, output_size, input_size, align_corners, scales_h, scales_w);
+    }
+    
+    // aten::_upsample_bicubic2d_aa.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _upsample_bicubic2d_aa_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::_upsample_bicubic2d_aa_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), align_corners, scales_h, scales_w, out);
+    }
+    
+    // aten::_upsample_bicubic2d_aa.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _upsample_bicubic2d_aa_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & out) {
+        return at::_ops::_upsample_bicubic2d_aa_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), align_corners, scales_h, scales_w, out);
+    }
+    
+    // aten::_upsample_bicubic2d_aa.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _upsample_bicubic2d_aa_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::_upsample_bicubic2d_aa_out::redispatch(dispatchKeySet, self, output_size, align_corners, scales_h, scales_w, out);
+    }
+    
+    // aten::_upsample_bicubic2d_aa.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _upsample_bicubic2d_aa_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & out) {
+        return at::_ops::_upsample_bicubic2d_aa_out::redispatch(dispatchKeySet, self, output_size, align_corners, scales_h, scales_w, out);
+    }
+    
+    // aten::_upsample_bicubic2d_aa(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor _upsample_bicubic2d_aa(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::_upsample_bicubic2d_aa::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), align_corners, scales_h, scales_w);
+    }
+    
+    // aten::_upsample_bicubic2d_aa(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor _upsample_bicubic2d_aa_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::_upsample_bicubic2d_aa::redispatch(dispatchKeySet, self, output_size, align_corners, scales_h, scales_w);
+    }
+    
+    // aten::_upsample_bicubic2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & _upsample_bicubic2d_aa_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::_upsample_bicubic2d_aa_backward_grad_input::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w, grad_input);
+    }
+    
+    // aten::_upsample_bicubic2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & _upsample_bicubic2d_aa_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & grad_input) {
+        return at::_ops::_upsample_bicubic2d_aa_backward_grad_input::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w, grad_input);
+    }
+    
+    // aten::_upsample_bicubic2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & _upsample_bicubic2d_aa_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::_upsample_bicubic2d_aa_backward_grad_input::redispatch(dispatchKeySet, grad_output, output_size, input_size, align_corners, scales_h, scales_w, grad_input);
+    }
+    
+    // aten::_upsample_bicubic2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & _upsample_bicubic2d_aa_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & grad_input) {
+        return at::_ops::_upsample_bicubic2d_aa_backward_grad_input::redispatch(dispatchKeySet, grad_output, output_size, input_size, align_corners, scales_h, scales_w, grad_input);
+    }
+    
+    // aten::_upsample_bicubic2d_aa_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor _upsample_bicubic2d_aa_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::_upsample_bicubic2d_aa_backward::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w);
+    }
+    
+    // aten::_upsample_bicubic2d_aa_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor _upsample_bicubic2d_aa_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::_upsample_bicubic2d_aa_backward::redispatch(dispatchKeySet, grad_output, output_size, input_size, align_corners, scales_h, scales_w);
+    }
+    
+    // aten::upsample_trilinear3d.out(Tensor self, SymInt[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & upsample_trilinear3d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, c10::optional<double> scales_d=c10::nullopt, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_trilinear3d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), align_corners, scales_d, scales_h, scales_w, out);
+    }
+    
+    // aten::upsample_trilinear3d.out(Tensor self, SymInt[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & upsample_trilinear3d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, c10::optional<double> scales_d, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & out) {
+        return at::_ops::upsample_trilinear3d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), align_corners, scales_d, scales_h, scales_w, out);
+    }
+    
+    // aten::upsample_trilinear3d.out(Tensor self, SymInt[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & upsample_trilinear3d_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_d=c10::nullopt, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_trilinear3d_out::redispatch(dispatchKeySet, self, output_size, align_corners, scales_d, scales_h, scales_w, out);
+    }
+    
+    // aten::upsample_trilinear3d.out(Tensor self, SymInt[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & upsample_trilinear3d_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_d, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & out) {
+        return at::_ops::upsample_trilinear3d_out::redispatch(dispatchKeySet, self, output_size, align_corners, scales_d, scales_h, scales_w, out);
+    }
+    
+    // aten::upsample_trilinear3d(Tensor self, SymInt[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor upsample_trilinear3d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, c10::optional<double> scales_d=c10::nullopt, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_trilinear3d::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), align_corners, scales_d, scales_h, scales_w);
+    }
+    
+    // aten::upsample_trilinear3d(Tensor self, SymInt[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor upsample_trilinear3d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_d=c10::nullopt, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_trilinear3d::redispatch(dispatchKeySet, self, output_size, align_corners, scales_d, scales_h, scales_w);
+    }
+    
+    // aten::upsample_trilinear3d_backward.grad_input(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & upsample_trilinear3d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_d=c10::nullopt, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_trilinear3d_backward_grad_input::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_d, scales_h, scales_w, grad_input);
+    }
+    
+    // aten::upsample_trilinear3d_backward.grad_input(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & upsample_trilinear3d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_d, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & grad_input) {
+        return at::_ops::upsample_trilinear3d_backward_grad_input::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_d, scales_h, scales_w, grad_input);
+    }
+    
+    // aten::upsample_trilinear3d_backward.grad_input(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & upsample_trilinear3d_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_d=c10::nullopt, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_trilinear3d_backward_grad_input::redispatch(dispatchKeySet, grad_output, output_size, input_size, align_corners, scales_d, scales_h, scales_w, grad_input);
+    }
+    
+    // aten::upsample_trilinear3d_backward.grad_input(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & upsample_trilinear3d_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_d, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & grad_input) {
+        return at::_ops::upsample_trilinear3d_backward_grad_input::redispatch(dispatchKeySet, grad_output, output_size, input_size, align_corners, scales_d, scales_h, scales_w, grad_input);
+    }
+    
+    // aten::upsample_trilinear3d_backward(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor upsample_trilinear3d_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_d=c10::nullopt, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_trilinear3d_backward::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_d, scales_h, scales_w);
+    }
+    
+    // aten::upsample_trilinear3d_backward(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor upsample_trilinear3d_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_d=c10::nullopt, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_trilinear3d_backward::redispatch(dispatchKeySet, grad_output, output_size, input_size, align_corners, scales_d, scales_h, scales_w);
+    }
+    
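+    // NB: The nearest-neighbor wrappers below drop `align_corners` and take a
+    // single optional `scales` in the 1d case, but otherwise mirror the
+    // out/outf/_symint structure of the linear and cubic wrappers above.
+    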
+    // aten::upsample_nearest1d.out(Tensor self, SymInt[1] output_size, float? scales=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & upsample_nearest1d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, c10::optional<double> scales=c10::nullopt) {
+        return at::_ops::upsample_nearest1d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), scales, out);
+    }
+    
+    // aten::upsample_nearest1d.out(Tensor self, SymInt[1] output_size, float? scales=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & upsample_nearest1d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, c10::optional<double> scales, at::Tensor & out) {
+        return at::_ops::upsample_nearest1d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), scales, out);
+    }
+    
+    // aten::upsample_nearest1d.out(Tensor self, SymInt[1] output_size, float? scales=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & upsample_nearest1d_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, c10::optional<double> scales=c10::nullopt) {
+        return at::_ops::upsample_nearest1d_out::redispatch(dispatchKeySet, self, output_size, scales, out);
+    }
+    
+    // aten::upsample_nearest1d.out(Tensor self, SymInt[1] output_size, float? scales=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & upsample_nearest1d_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, c10::optional<double> scales, at::Tensor & out) {
+        return at::_ops::upsample_nearest1d_out::redispatch(dispatchKeySet, self, output_size, scales, out);
+    }
+    
+    // aten::_upsample_nearest_exact1d.out(Tensor self, SymInt[1] output_size, float? scales=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _upsample_nearest_exact1d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, c10::optional<double> scales=c10::nullopt) {
+        return at::_ops::_upsample_nearest_exact1d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), scales, out);
+    }
+    
+    // aten::_upsample_nearest_exact1d.out(Tensor self, SymInt[1] output_size, float? scales=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _upsample_nearest_exact1d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, c10::optional<double> scales, at::Tensor & out) {
+        return at::_ops::_upsample_nearest_exact1d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), scales, out);
+    }
+    
+    // aten::_upsample_nearest_exact1d.out(Tensor self, SymInt[1] output_size, float? scales=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _upsample_nearest_exact1d_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, c10::optional<double> scales=c10::nullopt) {
+        return at::_ops::_upsample_nearest_exact1d_out::redispatch(dispatchKeySet, self, output_size, scales, out);
+    }
+    
+    // aten::_upsample_nearest_exact1d.out(Tensor self, SymInt[1] output_size, float? scales=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _upsample_nearest_exact1d_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, c10::optional<double> scales, at::Tensor & out) {
+        return at::_ops::_upsample_nearest_exact1d_out::redispatch(dispatchKeySet, self, output_size, scales, out);
+    }
+    
+    // aten::upsample_nearest1d(Tensor self, SymInt[1] output_size, float? scales=None) -> Tensor
+    inline at::Tensor upsample_nearest1d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, c10::optional<double> scales=c10::nullopt) {
+        return at::_ops::upsample_nearest1d::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), scales);
+    }
+    
+    // aten::upsample_nearest1d(Tensor self, SymInt[1] output_size, float? scales=None) -> Tensor
+    inline at::Tensor upsample_nearest1d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, c10::optional<double> scales=c10::nullopt) {
+        return at::_ops::upsample_nearest1d::redispatch(dispatchKeySet, self, output_size, scales);
+    }
+    
+    // aten::_upsample_nearest_exact1d(Tensor self, SymInt[1] output_size, float? scales=None) -> Tensor
+    inline at::Tensor _upsample_nearest_exact1d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, c10::optional<double> scales=c10::nullopt) {
+        return at::_ops::_upsample_nearest_exact1d::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), scales);
+    }
+    
+    // aten::_upsample_nearest_exact1d(Tensor self, SymInt[1] output_size, float? scales=None) -> Tensor
+    inline at::Tensor _upsample_nearest_exact1d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, c10::optional<double> scales=c10::nullopt) {
+        return at::_ops::_upsample_nearest_exact1d::redispatch(dispatchKeySet, self, output_size, scales);
+    }
+    
+    // aten::upsample_nearest1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & upsample_nearest1d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, c10::optional<double> scales=c10::nullopt) {
+        return at::_ops::upsample_nearest1d_backward_grad_input::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), scales, grad_input);
+    }
+    
+    // aten::upsample_nearest1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & upsample_nearest1d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, c10::optional<double> scales, at::Tensor & grad_input) {
+        return at::_ops::upsample_nearest1d_backward_grad_input::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), scales, grad_input);
+    }
+    
+    // aten::upsample_nearest1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & upsample_nearest1d_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional<double> scales=c10::nullopt) {
+        return at::_ops::upsample_nearest1d_backward_grad_input::redispatch(dispatchKeySet, grad_output, output_size, input_size, scales, grad_input);
+    }
+    
+    // aten::upsample_nearest1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & upsample_nearest1d_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional<double> scales, at::Tensor & grad_input) {
+        return at::_ops::upsample_nearest1d_backward_grad_input::redispatch(dispatchKeySet, grad_output, output_size, input_size, scales, grad_input);
+    }
+    
+    // aten::_upsample_nearest_exact1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & _upsample_nearest_exact1d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, c10::optional<double> scales=c10::nullopt) {
+        return at::_ops::_upsample_nearest_exact1d_backward_grad_input::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), scales, grad_input);
+    }
+    
+    // aten::_upsample_nearest_exact1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & _upsample_nearest_exact1d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, c10::optional<double> scales, at::Tensor & grad_input) {
+        return at::_ops::_upsample_nearest_exact1d_backward_grad_input::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), scales, grad_input);
+    }
+    
+    // aten::_upsample_nearest_exact1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & _upsample_nearest_exact1d_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional<double> scales=c10::nullopt) {
+        return at::_ops::_upsample_nearest_exact1d_backward_grad_input::redispatch(dispatchKeySet, grad_output, output_size, input_size, scales, grad_input);
+    }
+    
+    // aten::_upsample_nearest_exact1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & _upsample_nearest_exact1d_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional<double> scales, at::Tensor & grad_input) {
+        return at::_ops::_upsample_nearest_exact1d_backward_grad_input::redispatch(dispatchKeySet, grad_output, output_size, input_size, scales, grad_input);
+    }
+    
+    // aten::upsample_nearest1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None) -> Tensor
+    inline at::Tensor upsample_nearest1d_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, c10::optional<double> scales=c10::nullopt) {
+        return at::_ops::upsample_nearest1d_backward::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), scales);
+    }
+    
+    // aten::upsample_nearest1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None) -> Tensor
+    inline at::Tensor upsample_nearest1d_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional<double> scales=c10::nullopt) {
+        return at::_ops::upsample_nearest1d_backward::redispatch(dispatchKeySet, grad_output, output_size, input_size, scales);
+    }
+    
+    // aten::_upsample_nearest_exact1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None) -> Tensor
+    inline at::Tensor _upsample_nearest_exact1d_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, c10::optional<double> scales=c10::nullopt) {
+        return at::_ops::_upsample_nearest_exact1d_backward::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), scales);
+    }
+    
+    // aten::_upsample_nearest_exact1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None) -> Tensor
+    inline at::Tensor _upsample_nearest_exact1d_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional<double> scales=c10::nullopt) {
+        return at::_ops::_upsample_nearest_exact1d_backward::redispatch(dispatchKeySet, grad_output, output_size, input_size, scales);
+    }
+    
+    // aten::upsample_nearest2d.out(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & upsample_nearest2d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_nearest2d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), scales_h, scales_w, out);
+    }
+    
+    // aten::upsample_nearest2d.out(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & upsample_nearest2d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & out) {
+        return at::_ops::upsample_nearest2d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), scales_h, scales_w, out);
+    }
+    
+    // aten::upsample_nearest2d.out(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & upsample_nearest2d_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_nearest2d_out::redispatch(dispatchKeySet, self, output_size, scales_h, scales_w, out);
+    }
+    
+    // aten::upsample_nearest2d.out(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & upsample_nearest2d_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & out) {
+        return at::_ops::upsample_nearest2d_out::redispatch(dispatchKeySet, self, output_size, scales_h, scales_w, out);
+    }
+    
+    // aten::_upsample_nearest_exact2d.out(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _upsample_nearest_exact2d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::_upsample_nearest_exact2d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), scales_h, scales_w, out);
+    }
+    
+    // aten::_upsample_nearest_exact2d.out(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _upsample_nearest_exact2d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & out) {
+        return at::_ops::_upsample_nearest_exact2d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), scales_h, scales_w, out);
+    }
+    
+    // aten::_upsample_nearest_exact2d.out(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _upsample_nearest_exact2d_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::_upsample_nearest_exact2d_out::redispatch(dispatchKeySet, self, output_size, scales_h, scales_w, out);
+    }
+    
+    // aten::_upsample_nearest_exact2d.out(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _upsample_nearest_exact2d_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & out) {
+        return at::_ops::_upsample_nearest_exact2d_out::redispatch(dispatchKeySet, self, output_size, scales_h, scales_w, out);
+    }
+    
+    // aten::upsample_nearest2d(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor upsample_nearest2d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_nearest2d::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), scales_h, scales_w);
+    }
+    
+    // aten::upsample_nearest2d(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor upsample_nearest2d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_nearest2d::redispatch(dispatchKeySet, self, output_size, scales_h, scales_w);
+    }
+    
+    // aten::_upsample_nearest_exact2d(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor _upsample_nearest_exact2d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::_upsample_nearest_exact2d::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), scales_h, scales_w);
+    }
+    
+    // aten::_upsample_nearest_exact2d(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor _upsample_nearest_exact2d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::_upsample_nearest_exact2d::redispatch(dispatchKeySet, self, output_size, scales_h, scales_w);
+    }
+    
+    // aten::upsample_nearest2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & upsample_nearest2d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_nearest2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), scales_h, scales_w, grad_input);
+    }
+    
+    // aten::upsample_nearest2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & upsample_nearest2d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & grad_input) {
+        return at::_ops::upsample_nearest2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), scales_h, scales_w, grad_input);
+    }
+    
+    // aten::upsample_nearest2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & upsample_nearest2d_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_nearest2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, output_size, input_size, scales_h, scales_w, grad_input);
+    }
+    
+    // aten::upsample_nearest2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & upsample_nearest2d_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & grad_input) {
+        return at::_ops::upsample_nearest2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, output_size, input_size, scales_h, scales_w, grad_input);
+    }
+    
+    // aten::_upsample_nearest_exact2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & _upsample_nearest_exact2d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::_upsample_nearest_exact2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), scales_h, scales_w, grad_input);
+    }
+    
+    // aten::_upsample_nearest_exact2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & _upsample_nearest_exact2d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & grad_input) {
+        return at::_ops::_upsample_nearest_exact2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), scales_h, scales_w, grad_input);
+    }
+    
+    // aten::_upsample_nearest_exact2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & _upsample_nearest_exact2d_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::_upsample_nearest_exact2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, output_size, input_size, scales_h, scales_w, grad_input);
+    }
+    
+    // aten::_upsample_nearest_exact2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & _upsample_nearest_exact2d_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & grad_input) {
+        return at::_ops::_upsample_nearest_exact2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, output_size, input_size, scales_h, scales_w, grad_input);
+    }
+    
+    // aten::upsample_nearest2d_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor upsample_nearest2d_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_nearest2d_backward::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), scales_h, scales_w);
+    }
+    
+    // aten::upsample_nearest2d_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor upsample_nearest2d_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_nearest2d_backward::redispatch(dispatchKeySet, grad_output, output_size, input_size, scales_h, scales_w);
+    }
+    
+    // aten::_upsample_nearest_exact2d_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor _upsample_nearest_exact2d_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::_upsample_nearest_exact2d_backward::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), scales_h, scales_w);
+    }
+    
+    // aten::_upsample_nearest_exact2d_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor _upsample_nearest_exact2d_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::_upsample_nearest_exact2d_backward::redispatch(dispatchKeySet, grad_output, output_size, input_size, scales_h, scales_w);
+    }
+    
+    // aten::upsample_nearest3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & upsample_nearest3d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, c10::optional<double> scales_d=c10::nullopt, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_nearest3d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), scales_d, scales_h, scales_w, out);
+    }
+    
+    // aten::upsample_nearest3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & upsample_nearest3d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, c10::optional<double> scales_d, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & out) {
+        return at::_ops::upsample_nearest3d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), scales_d, scales_h, scales_w, out);
+    }
+    
+    // aten::upsample_nearest3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & upsample_nearest3d_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, c10::optional<double> scales_d=c10::nullopt, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_nearest3d_out::redispatch(dispatchKeySet, self, output_size, scales_d, scales_h, scales_w, out);
+    }
+    
+    // aten::upsample_nearest3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & upsample_nearest3d_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, c10::optional<double> scales_d, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & out) {
+        return at::_ops::upsample_nearest3d_out::redispatch(dispatchKeySet, self, output_size, scales_d, scales_h, scales_w, out);
+    }
+    
+    // aten::_upsample_nearest_exact3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _upsample_nearest_exact3d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, c10::optional<double> scales_d=c10::nullopt, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::_upsample_nearest_exact3d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), scales_d, scales_h, scales_w, out);
+    }
+    
+    // aten::_upsample_nearest_exact3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _upsample_nearest_exact3d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, c10::optional<double> scales_d, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & out) {
+        return at::_ops::_upsample_nearest_exact3d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), scales_d, scales_h, scales_w, out);
+    }
+    
+    // aten::_upsample_nearest_exact3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _upsample_nearest_exact3d_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, c10::optional<double> scales_d=c10::nullopt, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::_upsample_nearest_exact3d_out::redispatch(dispatchKeySet, self, output_size, scales_d, scales_h, scales_w, out);
+    }
+    
+    // aten::_upsample_nearest_exact3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _upsample_nearest_exact3d_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, c10::optional<double> scales_d, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & out) {
+        return at::_ops::_upsample_nearest_exact3d_out::redispatch(dispatchKeySet, self, output_size, scales_d, scales_h, scales_w, out);
+    }
+    
+    // aten::upsample_nearest3d(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor upsample_nearest3d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, c10::optional<double> scales_d=c10::nullopt, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_nearest3d::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), scales_d, scales_h, scales_w);
+    }
+    
+    // aten::upsample_nearest3d(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor upsample_nearest3d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, c10::optional<double> scales_d=c10::nullopt, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_nearest3d::redispatch(dispatchKeySet, self, output_size, scales_d, scales_h, scales_w);
+    }
+    
+    // aten::_upsample_nearest_exact3d(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor _upsample_nearest_exact3d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, c10::optional<double> scales_d=c10::nullopt, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::_upsample_nearest_exact3d::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), scales_d, scales_h, scales_w);
+    }
+    
+    // aten::_upsample_nearest_exact3d(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor _upsample_nearest_exact3d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, c10::optional<double> scales_d=c10::nullopt, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::_upsample_nearest_exact3d::redispatch(dispatchKeySet, self, output_size, scales_d, scales_h, scales_w);
+    }
+    
+    // aten::upsample_nearest3d_backward.grad_input(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & upsample_nearest3d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, c10::optional<double> scales_d=c10::nullopt, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_nearest3d_backward_grad_input::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), scales_d, scales_h, scales_w, grad_input);
+    }
+    
+    // aten::upsample_nearest3d_backward.grad_input(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & upsample_nearest3d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, c10::optional<double> scales_d, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & grad_input) {
+        return at::_ops::upsample_nearest3d_backward_grad_input::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), scales_d, scales_h, scales_w, grad_input);
+    }
+    
+    // aten::upsample_nearest3d_backward.grad_input(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & upsample_nearest3d_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional<double> scales_d=c10::nullopt, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_nearest3d_backward_grad_input::redispatch(dispatchKeySet, grad_output, output_size, input_size, scales_d, scales_h, scales_w, grad_input);
+    }
+    
+    // aten::upsample_nearest3d_backward.grad_input(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & upsample_nearest3d_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional<double> scales_d, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & grad_input) {
+        return at::_ops::upsample_nearest3d_backward_grad_input::redispatch(dispatchKeySet, grad_output, output_size, input_size, scales_d, scales_h, scales_w, grad_input);
+    }
+    
+    // aten::_upsample_nearest_exact3d_backward.grad_input(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & _upsample_nearest_exact3d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, c10::optional<double> scales_d=c10::nullopt, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::_upsample_nearest_exact3d_backward_grad_input::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), scales_d, scales_h, scales_w, grad_input);
+    }
+    
+    // aten::_upsample_nearest_exact3d_backward.grad_input(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & _upsample_nearest_exact3d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, c10::optional<double> scales_d, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & grad_input) {
+        return at::_ops::_upsample_nearest_exact3d_backward_grad_input::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), scales_d, scales_h, scales_w, grad_input);
+    }
+    
+    // aten::_upsample_nearest_exact3d_backward.grad_input(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & _upsample_nearest_exact3d_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional<double> scales_d=c10::nullopt, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::_upsample_nearest_exact3d_backward_grad_input::redispatch(dispatchKeySet, grad_output, output_size, input_size, scales_d, scales_h, scales_w, grad_input);
+    }
+    
+    // aten::_upsample_nearest_exact3d_backward.grad_input(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & _upsample_nearest_exact3d_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional<double> scales_d, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & grad_input) {
+        return at::_ops::_upsample_nearest_exact3d_backward_grad_input::redispatch(dispatchKeySet, grad_output, output_size, input_size, scales_d, scales_h, scales_w, grad_input);
+    }
+    
+    // aten::upsample_nearest3d_backward(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor upsample_nearest3d_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, c10::optional<double> scales_d=c10::nullopt, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_nearest3d_backward::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), scales_d, scales_h, scales_w);
+    }
+    
+    // aten::upsample_nearest3d_backward(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor upsample_nearest3d_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional<double> scales_d=c10::nullopt, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::upsample_nearest3d_backward::redispatch(dispatchKeySet, grad_output, output_size, input_size, scales_d, scales_h, scales_w);
+    }
+    
+    // aten::_upsample_nearest_exact3d_backward(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor _upsample_nearest_exact3d_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, c10::optional<double> scales_d=c10::nullopt, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::_upsample_nearest_exact3d_backward::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), scales_d, scales_h, scales_w);
+    }
+    
+    // aten::_upsample_nearest_exact3d_backward(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
+    inline at::Tensor _upsample_nearest_exact3d_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional<double> scales_d=c10::nullopt, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+        return at::_ops::_upsample_nearest_exact3d_backward::redispatch(dispatchKeySet, grad_output, output_size, input_size, scales_d, scales_h, scales_w);
+    }
+    
+    // aten::sigmoid_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & sigmoid_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & output) {
+        return at::_ops::sigmoid_backward_grad_input::redispatch(dispatchKeySet, grad_output, output, grad_input);
+    }
+    
+    // aten::sigmoid_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & sigmoid_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & output, at::Tensor & grad_input) {
+        return at::_ops::sigmoid_backward_grad_input::redispatch(dispatchKeySet, grad_output, output, grad_input);
+    }
+    
+    // aten::sigmoid_backward(Tensor grad_output, Tensor output) -> Tensor
+    inline at::Tensor sigmoid_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & output) {
+        return at::_ops::sigmoid_backward::redispatch(dispatchKeySet, grad_output, output);
+    }
+    
+    // aten::logit_backward.grad_input(Tensor grad_output, Tensor self, float? eps=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & logit_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, c10::optional<double> eps=c10::nullopt) {
+        return at::_ops::logit_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, eps, grad_input);
+    }
+    
+    // aten::logit_backward.grad_input(Tensor grad_output, Tensor self, float? eps=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & logit_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, c10::optional<double> eps, at::Tensor & grad_input) {
+        return at::_ops::logit_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, eps, grad_input);
+    }
+    
+    // aten::logit_backward(Tensor grad_output, Tensor self, float? eps=None) -> Tensor
+    inline at::Tensor logit_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, c10::optional<double> eps=c10::nullopt) {
+        return at::_ops::logit_backward::redispatch(dispatchKeySet, grad_output, self, eps);
+    }
+    
+    // aten::tanh_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & tanh_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & output) {
+        return at::_ops::tanh_backward_grad_input::redispatch(dispatchKeySet, grad_output, output, grad_input);
+    }
+    
+    // aten::tanh_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!)
+    inline at::Tensor & tanh_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & output, at::Tensor & grad_input) {
+        return at::_ops::tanh_backward_grad_input::redispatch(dispatchKeySet, grad_output, output, grad_input);
+    }
+    
+    // aten::tanh_backward(Tensor grad_output, Tensor output) -> Tensor
+    inline at::Tensor tanh_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & output) {
+        return at::_ops::tanh_backward::redispatch(dispatchKeySet, grad_output, output);
+    }
+    
+    // aten::slow_conv_transpose2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt[2] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & slow_conv_transpose2d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef output_padding=0, at::IntArrayRef dilation=1) {
+        return at::_ops::slow_conv_transpose2d_out::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(kernel_size), bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), c10::fromIntArrayRefSlow(dilation), out);
+    }
+    
+    // aten::slow_conv_transpose2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt[2] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & slow_conv_transpose2d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef dilation, at::Tensor & out) {
+        return at::_ops::slow_conv_transpose2d_out::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(kernel_size), bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), c10::fromIntArrayRefSlow(dilation), out);
+    }
+    
+    // aten::slow_conv_transpose2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt[2] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & slow_conv_transpose2d_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef output_padding=c10::SymInt(0), c10::SymIntArrayRef dilation=c10::SymInt(1)) {
+        return at::_ops::slow_conv_transpose2d_out::redispatch(dispatchKeySet, self, weight, kernel_size, bias, stride, padding, output_padding, dilation, out);
+    }
+    
+    // aten::slow_conv_transpose2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt[2] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & slow_conv_transpose2d_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef dilation, at::Tensor & out) {
+        return at::_ops::slow_conv_transpose2d_out::redispatch(dispatchKeySet, self, weight, kernel_size, bias, stride, padding, output_padding, dilation, out);
+    }
+    
+    // aten::slow_conv_transpose2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt[2] dilation=1) -> Tensor
+    inline at::Tensor slow_conv_transpose2d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef output_padding=0, at::IntArrayRef dilation=1) {
+        return at::_ops::slow_conv_transpose2d::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(kernel_size), bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), c10::fromIntArrayRefSlow(dilation));
+    }
+    
+    // aten::slow_conv_transpose2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt[2] dilation=1) -> Tensor
+    inline at::Tensor slow_conv_transpose2d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef output_padding=c10::SymInt(0), c10::SymIntArrayRef dilation=c10::SymInt(1)) {
+        return at::_ops::slow_conv_transpose2d::redispatch(dispatchKeySet, self, weight, kernel_size, bias, stride, padding, output_padding, dilation);
+    }
+    
+    // aten::slow_conv_transpose3d.out(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, SymInt[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & slow_conv_transpose3d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef output_padding=0, at::IntArrayRef dilation=1) {
+        return at::_ops::slow_conv_transpose3d_out::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(kernel_size), bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), c10::fromIntArrayRefSlow(dilation), out);
+    }
+    
+    // aten::slow_conv_transpose3d.out(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, SymInt[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & slow_conv_transpose3d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef dilation, at::Tensor & out) {
+        return at::_ops::slow_conv_transpose3d_out::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(kernel_size), bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), c10::fromIntArrayRefSlow(dilation), out);
+    }
+    
+    // aten::slow_conv_transpose3d.out(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, SymInt[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & slow_conv_transpose3d_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef output_padding=c10::SymInt(0), c10::SymIntArrayRef dilation=c10::SymInt(1)) {
+        return at::_ops::slow_conv_transpose3d_out::redispatch(dispatchKeySet, self, weight, kernel_size, bias, stride, padding, output_padding, dilation, out);
+    }
+    
+    // aten::slow_conv_transpose3d.out(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, SymInt[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & slow_conv_transpose3d_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef dilation, at::Tensor & out) {
+        return at::_ops::slow_conv_transpose3d_out::redispatch(dispatchKeySet, self, weight, kernel_size, bias, stride, padding, output_padding, dilation, out);
+    }
+    
+    // aten::slow_conv_transpose3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, SymInt[3] dilation=1) -> Tensor
+    inline at::Tensor slow_conv_transpose3d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef output_padding=0, at::IntArrayRef dilation=1) {
+        return at::_ops::slow_conv_transpose3d::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(kernel_size), bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), c10::fromIntArrayRefSlow(dilation));
+    }
+    
+    // aten::slow_conv_transpose3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, SymInt[3] dilation=1) -> Tensor
+    inline at::Tensor slow_conv_transpose3d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef output_padding=c10::SymInt(0), c10::SymIntArrayRef dilation=c10::SymInt(1)) {
+        return at::_ops::slow_conv_transpose3d::redispatch(dispatchKeySet, self, weight, kernel_size, bias, stride, padding, output_padding, dilation);
+    }
+    
+    // aten::thnn_conv2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & thnn_conv2d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0) {
+        return at::_ops::thnn_conv2d_out::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(kernel_size), bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), out);
+    }
+    
+    // aten::thnn_conv2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & thnn_conv2d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::Tensor & out) {
+        return at::_ops::thnn_conv2d_out::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(kernel_size), bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), out);
+    }
+    
+    // aten::thnn_conv2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & thnn_conv2d_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0)) {
+        return at::_ops::thnn_conv2d_out::redispatch(dispatchKeySet, self, weight, kernel_size, bias, stride, padding, out);
+    }
+    
+    // aten::thnn_conv2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & thnn_conv2d_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, at::Tensor & out) {
+        return at::_ops::thnn_conv2d_out::redispatch(dispatchKeySet, self, weight, kernel_size, bias, stride, padding, out);
+    }
+    
+    // aten::thnn_conv2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0) -> Tensor
+    inline at::Tensor thnn_conv2d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0) {
+        return at::_ops::thnn_conv2d::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(kernel_size), bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding));
+    }
+    
+    // aten::thnn_conv2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0) -> Tensor
+    inline at::Tensor thnn_conv2d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0)) {
+        return at::_ops::thnn_conv2d::redispatch(dispatchKeySet, self, weight, kernel_size, bias, stride, padding);
+    }
+    
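+    // NOTE: illustrative sketch, not part of the generated API surface. These
+    // wrappers forward to at::_ops::<op>::redispatch with an explicit
+    // DispatchKeySet. Assuming they live in the usual at::redispatch namespace,
+    // a caller that already holds a key set (e.g. a custom kernel that wants to
+    // re-enter the dispatcher below the keys it has already handled) might
+    // invoke one roughly like this:
+    //
+    //   at::Tensor my_conv(c10::DispatchKeySet ks, const at::Tensor & input,
+    //                      const at::Tensor & weight) {
+    //       // kernel_size {3, 3}; bias/stride/padding fall back to defaults
+    //       return at::redispatch::thnn_conv2d(ks, input, weight, {3, 3});
+    //   }
+    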
+    // aten::_slow_conv2d_forward.output(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, *, Tensor(a!) output) -> Tensor(a!)
+    inline at::Tensor & _slow_conv2d_forward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & output, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding) {
+        return at::_ops::_slow_conv2d_forward_output::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(kernel_size), bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), output);
+    }
+    
+    // aten::_slow_conv2d_forward.output(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, *, Tensor(a!) output) -> Tensor(a!)
+    inline at::Tensor & _slow_conv2d_forward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::Tensor & output) {
+        return at::_ops::_slow_conv2d_forward_output::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(kernel_size), bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), output);
+    }
+    
+    // aten::_slow_conv2d_forward.output(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, *, Tensor(a!) output) -> Tensor(a!)
+    inline at::Tensor & _slow_conv2d_forward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & output, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding) {
+        return at::_ops::_slow_conv2d_forward_output::redispatch(dispatchKeySet, self, weight, kernel_size, bias, stride, padding, output);
+    }
+    
+    // aten::_slow_conv2d_forward.output(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, *, Tensor(a!) output) -> Tensor(a!)
+    inline at::Tensor & _slow_conv2d_forward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, at::Tensor & output) {
+        return at::_ops::_slow_conv2d_forward_output::redispatch(dispatchKeySet, self, weight, kernel_size, bias, stride, padding, output);
+    }
+    
+    // aten::_slow_conv2d_forward(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding) -> Tensor
+    inline at::Tensor _slow_conv2d_forward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding) {
+        return at::_ops::_slow_conv2d_forward::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(kernel_size), bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding));
+    }
+    
+    // aten::_slow_conv2d_forward(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding) -> Tensor
+    inline at::Tensor _slow_conv2d_forward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding) {
+        return at::_ops::_slow_conv2d_forward::redispatch(dispatchKeySet, self, weight, kernel_size, bias, stride, padding);
+    }
+    
+    // aten::_slow_conv2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, SymInt[2] kernel_size, SymInt[2] stride, SymInt[2] padding, *, Tensor(a!) grad_input, Tensor(b!) grad_weight, Tensor(c!) grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _slow_conv2d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, at::Tensor & grad_weight, at::Tensor & grad_bias, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding) {
+        return at::_ops::_slow_conv2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, weight, c10::fromIntArrayRefSlow(kernel_size), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), grad_input, grad_weight, grad_bias);
+    }
+    
+    // aten::_slow_conv2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, SymInt[2] kernel_size, SymInt[2] stride, SymInt[2] padding, *, Tensor(a!) grad_input, Tensor(b!) grad_weight, Tensor(c!) grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _slow_conv2d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::Tensor & grad_input, at::Tensor & grad_weight, at::Tensor & grad_bias) {
+        return at::_ops::_slow_conv2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, weight, c10::fromIntArrayRefSlow(kernel_size), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), grad_input, grad_weight, grad_bias);
+    }
+    
+    // aten::_slow_conv2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, SymInt[2] kernel_size, SymInt[2] stride, SymInt[2] padding, *, Tensor(a!) grad_input, Tensor(b!) grad_weight, Tensor(c!) grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _slow_conv2d_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & grad_input, at::Tensor & grad_weight, at::Tensor & grad_bias, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding) {
+        return at::_ops::_slow_conv2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, weight, kernel_size, stride, padding, grad_input, grad_weight, grad_bias);
+    }
+    
+    // aten::_slow_conv2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, SymInt[2] kernel_size, SymInt[2] stride, SymInt[2] padding, *, Tensor(a!) grad_input, Tensor(b!) grad_weight, Tensor(c!) grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _slow_conv2d_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, at::Tensor & grad_input, at::Tensor & grad_weight, at::Tensor & grad_bias) {
+        return at::_ops::_slow_conv2d_backward_grad_input::redispatch(dispatchKeySet, grad_output, self, weight, kernel_size, stride, padding, grad_input, grad_weight, grad_bias);
+    }
+    
+    // aten::_slow_conv2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, SymInt[2] kernel_size, SymInt[2] stride, SymInt[2] padding, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _slow_conv2d_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, ::std::array<bool,3> output_mask) {
+        return at::_ops::_slow_conv2d_backward_output_mask::redispatch(dispatchKeySet, grad_output, self, weight, c10::fromIntArrayRefSlow(kernel_size), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), output_mask);
+    }
+    
+    // aten::_slow_conv2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, SymInt[2] kernel_size, SymInt[2] stride, SymInt[2] padding, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _slow_conv2d_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, ::std::array<bool,3> output_mask) {
+        return at::_ops::_slow_conv2d_backward_output_mask::redispatch(dispatchKeySet, grad_output, self, weight, kernel_size, stride, padding, output_mask);
+    }
+    
+    // aten::_conv_depthwise2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, SymInt[2] dilation, *, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & _conv_depthwise2d_out(c10::DispatchKeySet dispatchKeySet, const at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation) {
+        return at::_ops::_conv_depthwise2d_out::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(kernel_size), bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), out);
+    }
+    
+    // aten::_conv_depthwise2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, SymInt[2] dilation, *, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & _conv_depthwise2d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, const at::Tensor & out) {
+        return at::_ops::_conv_depthwise2d_out::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(kernel_size), bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), out);
+    }
+    
+    // aten::_conv_depthwise2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, SymInt[2] dilation, *, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & _conv_depthwise2d_symint_out(c10::DispatchKeySet dispatchKeySet, const at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation) {
+        return at::_ops::_conv_depthwise2d_out::redispatch(dispatchKeySet, self, weight, kernel_size, bias, stride, padding, dilation, out);
+    }
+    
+    // aten::_conv_depthwise2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, SymInt[2] dilation, *, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & _conv_depthwise2d_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, const at::Tensor & out) {
+        return at::_ops::_conv_depthwise2d_out::redispatch(dispatchKeySet, self, weight, kernel_size, bias, stride, padding, dilation, out);
+    }
+    
+    // aten::_conv_depthwise2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, SymInt[2] dilation) -> Tensor
+    inline at::Tensor _conv_depthwise2d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation) {
+        return at::_ops::_conv_depthwise2d::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(kernel_size), bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation));
+    }
+    
+    // aten::_conv_depthwise2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, SymInt[2] dilation) -> Tensor
+    inline at::Tensor _conv_depthwise2d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation) {
+        return at::_ops::_conv_depthwise2d::redispatch(dispatchKeySet, self, weight, kernel_size, bias, stride, padding, dilation);
+    }
+    
+    // aten::conv_depthwise3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding, SymInt[3] dilation) -> Tensor
+    inline at::Tensor conv_depthwise3d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation) {
+        return at::_ops::conv_depthwise3d::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(kernel_size), bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation));
+    }
+    
+    // aten::conv_depthwise3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding, SymInt[3] dilation) -> Tensor
+    inline at::Tensor conv_depthwise3d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation) {
+        return at::_ops::conv_depthwise3d::redispatch(dispatchKeySet, self, weight, kernel_size, bias, stride, padding, dilation);
+    }
+    
+    // aten::slow_conv3d.out(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & slow_conv3d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0) {
+        return at::_ops::slow_conv3d_out::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(kernel_size), bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), out);
+    }
+    
+    // aten::slow_conv3d.out(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & slow_conv3d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::Tensor & out) {
+        return at::_ops::slow_conv3d_out::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(kernel_size), bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), out);
+    }
+    
+    // aten::slow_conv3d.out(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & slow_conv3d_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0)) {
+        return at::_ops::slow_conv3d_out::redispatch(dispatchKeySet, self, weight, kernel_size, bias, stride, padding, out);
+    }
+    
+    // aten::slow_conv3d.out(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & slow_conv3d_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, at::Tensor & out) {
+        return at::_ops::slow_conv3d_out::redispatch(dispatchKeySet, self, weight, kernel_size, bias, stride, padding, out);
+    }
+    
+    // aten::slow_conv3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0) -> Tensor
+    inline at::Tensor slow_conv3d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0) {
+        return at::_ops::slow_conv3d::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(kernel_size), bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding));
+    }
+    
+    // aten::slow_conv3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0) -> Tensor
+    inline at::Tensor slow_conv3d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0)) {
+        return at::_ops::slow_conv3d::redispatch(dispatchKeySet, self, weight, kernel_size, bias, stride, padding);
+    }
+    
+    // aten::slow_conv3d_forward.output(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding, *, Tensor(a!) output) -> Tensor(a!)
+    inline at::Tensor & slow_conv3d_forward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & output, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding) {
+        return at::_ops::slow_conv3d_forward_output::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(kernel_size), bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), output);
+    }
+    
+    // aten::slow_conv3d_forward.output(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding, *, Tensor(a!) output) -> Tensor(a!)
+    inline at::Tensor & slow_conv3d_forward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::Tensor & output) {
+        return at::_ops::slow_conv3d_forward_output::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(kernel_size), bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), output);
+    }
+    
+    // aten::slow_conv3d_forward.output(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding, *, Tensor(a!) output) -> Tensor(a!)
+    inline at::Tensor & slow_conv3d_forward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & output, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding) {
+        return at::_ops::slow_conv3d_forward_output::redispatch(dispatchKeySet, self, weight, kernel_size, bias, stride, padding, output);
+    }
+    
+    // aten::slow_conv3d_forward.output(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding, *, Tensor(a!) output) -> Tensor(a!)
+    inline at::Tensor & slow_conv3d_forward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, at::Tensor & output) {
+        return at::_ops::slow_conv3d_forward_output::redispatch(dispatchKeySet, self, weight, kernel_size, bias, stride, padding, output);
+    }
+    
+    // aten::slow_conv3d_forward(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding) -> Tensor
+    inline at::Tensor slow_conv3d_forward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding) {
+        return at::_ops::slow_conv3d_forward::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(kernel_size), bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding));
+    }
+    
+    // aten::slow_conv3d_forward(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding) -> Tensor
+    inline at::Tensor slow_conv3d_forward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding) {
+        return at::_ops::slow_conv3d_forward::redispatch(dispatchKeySet, self, weight, kernel_size, bias, stride, padding);
+    }
+    
+    // aten::slow_conv_dilated2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] dilation=1) -> Tensor
+    inline at::Tensor slow_conv_dilated2d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef dilation=1) {
+        return at::_ops::slow_conv_dilated2d::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(kernel_size), bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation));
+    }
+    
+    // aten::slow_conv_dilated2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] dilation=1) -> Tensor
+    inline at::Tensor slow_conv_dilated2d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef dilation=c10::SymInt(1)) {
+        return at::_ops::slow_conv_dilated2d::redispatch(dispatchKeySet, self, weight, kernel_size, bias, stride, padding, dilation);
+    }
+    
+    // aten::slow_conv_dilated3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] dilation=1) -> Tensor
+    inline at::Tensor slow_conv_dilated3d(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef dilation=1) {
+        return at::_ops::slow_conv_dilated3d::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(kernel_size), bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation));
+    }
+    
+    // aten::slow_conv_dilated3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] dilation=1) -> Tensor
+    inline at::Tensor slow_conv_dilated3d_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef dilation=c10::SymInt(1)) {
+        return at::_ops::slow_conv_dilated3d::redispatch(dispatchKeySet, self, weight, kernel_size, bias, stride, padding, dilation);
+    }
+    
+    // aten::col2im.out(Tensor self, SymInt[2] output_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & col2im_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, at::IntArrayRef kernel_size, at::IntArrayRef dilation, at::IntArrayRef padding, at::IntArrayRef stride) {
+        return at::_ops::col2im_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), kernel_size, dilation, padding, stride, out);
+    }
+    
+    // aten::col2im.out(Tensor self, SymInt[2] output_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & col2im_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, at::IntArrayRef kernel_size, at::IntArrayRef dilation, at::IntArrayRef padding, at::IntArrayRef stride, at::Tensor & out) {
+        return at::_ops::col2im_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), kernel_size, dilation, padding, stride, out);
+    }
+    
+    // aten::col2im.out(Tensor self, SymInt[2] output_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & col2im_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, at::IntArrayRef kernel_size, at::IntArrayRef dilation, at::IntArrayRef padding, at::IntArrayRef stride) {
+        return at::_ops::col2im_out::redispatch(dispatchKeySet, self, output_size, kernel_size, dilation, padding, stride, out);
+    }
+    
+    // aten::col2im.out(Tensor self, SymInt[2] output_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & col2im_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, at::IntArrayRef kernel_size, at::IntArrayRef dilation, at::IntArrayRef padding, at::IntArrayRef stride, at::Tensor & out) {
+        return at::_ops::col2im_out::redispatch(dispatchKeySet, self, output_size, kernel_size, dilation, padding, stride, out);
+    }
+    
+    // aten::col2im(Tensor self, SymInt[2] output_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor
+    inline at::Tensor col2im(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, at::IntArrayRef kernel_size, at::IntArrayRef dilation, at::IntArrayRef padding, at::IntArrayRef stride) {
+        return at::_ops::col2im::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), kernel_size, dilation, padding, stride);
+    }
+    
+    // aten::col2im(Tensor self, SymInt[2] output_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor
+    inline at::Tensor col2im_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, at::IntArrayRef kernel_size, at::IntArrayRef dilation, at::IntArrayRef padding, at::IntArrayRef stride) {
+        return at::_ops::col2im::redispatch(dispatchKeySet, self, output_size, kernel_size, dilation, padding, stride);
+    }
+    
+    // aten::column_stack(Tensor[] tensors) -> Tensor
+    inline at::Tensor column_stack(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors) {
+        return at::_ops::column_stack::redispatch(dispatchKeySet, tensors);
+    }
+    
+    // aten::column_stack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & column_stack_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::TensorList tensors) {
+        return at::_ops::column_stack_out::redispatch(dispatchKeySet, tensors, out);
+    }
+    
+    // aten::column_stack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & column_stack_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, at::Tensor & out) {
+        return at::_ops::column_stack_out::redispatch(dispatchKeySet, tensors, out);
+    }
+    
+    // aten::im2col.out(Tensor self, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & im2col_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef dilation, at::IntArrayRef padding, at::IntArrayRef stride) {
+        return at::_ops::im2col_out::redispatch(dispatchKeySet, self, kernel_size, dilation, padding, stride, out);
+    }
+    
+    // aten::im2col.out(Tensor self, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & im2col_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef dilation, at::IntArrayRef padding, at::IntArrayRef stride, at::Tensor & out) {
+        return at::_ops::im2col_out::redispatch(dispatchKeySet, self, kernel_size, dilation, padding, stride, out);
+    }
+    
+    // aten::im2col(Tensor self, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor
+    inline at::Tensor im2col(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef dilation, at::IntArrayRef padding, at::IntArrayRef stride) {
+        return at::_ops::im2col::redispatch(dispatchKeySet, self, kernel_size, dilation, padding, stride);
+    }
+    
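+    // NOTE: illustrative sketch, not part of the generated header. im2col and
+    // col2im above are (up to summation over overlaps) inverse operations:
+    // im2col unfolds sliding local blocks of the input into columns, while
+    // col2im folds columns back onto the spatial grid, summing values that
+    // land on the same position. Using the public at:: wrappers:
+    //
+    //   // x: [N, C, H, W]
+    //   auto cols   = at::im2col(x, /*kernel_size=*/{3, 3}, /*dilation=*/{1, 1},
+    //                            /*padding=*/{1, 1}, /*stride=*/{1, 1});
+    //   auto folded = at::col2im(cols, /*output_size=*/{x.size(2), x.size(3)},
+    //                            /*kernel_size=*/{3, 3}, /*dilation=*/{1, 1},
+    //                            /*padding=*/{1, 1}, /*stride=*/{1, 1});
+    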
+    // aten::isfinite(Tensor self) -> Tensor
+    inline at::Tensor isfinite(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::isfinite::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::isinf(Tensor self) -> Tensor
+    inline at::Tensor isinf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::isinf::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::record_stream(Tensor(a!) self, Stream s) -> ()
+    inline void record_stream(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, at::Stream s) {
+        return at::_ops::record_stream::redispatch(dispatchKeySet, self, s);
+    }
+    
+    // aten::isposinf(Tensor self) -> Tensor
+    inline at::Tensor isposinf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::isposinf::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::isposinf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & isposinf_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::isposinf_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::isposinf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & isposinf_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::isposinf_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::isneginf(Tensor self) -> Tensor
+    inline at::Tensor isneginf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::isneginf::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::isneginf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & isneginf_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::isneginf_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::isneginf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & isneginf_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::isneginf_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_add_batch_dim(Tensor self, int batch_dim, int level) -> Tensor
+    inline at::Tensor _add_batch_dim(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t batch_dim, int64_t level) {
+        return at::_ops::_add_batch_dim::redispatch(dispatchKeySet, self, batch_dim, level);
+    }
+    
+    // aten::_remove_batch_dim(Tensor self, int level, int batch_size, int out_dim) -> Tensor
+    inline at::Tensor _remove_batch_dim(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t level, int64_t batch_size, int64_t out_dim) {
+        return at::_ops::_remove_batch_dim::redispatch(dispatchKeySet, self, level, batch_size, out_dim);
+    }
+    
+    // aten::special_entr(Tensor self) -> Tensor
+    inline at::Tensor special_entr(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::special_entr::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::special_entr.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_entr_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::special_entr_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_entr.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_entr_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::special_entr_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_ndtri(Tensor self) -> Tensor
+    inline at::Tensor special_ndtri(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::special_ndtri::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::special_ndtri.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_ndtri_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::special_ndtri_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_ndtri.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_ndtri_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::special_ndtri_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_log_ndtr(Tensor self) -> Tensor
+    inline at::Tensor special_log_ndtr(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::special_log_ndtr::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::special_log_ndtr.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_log_ndtr_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::special_log_ndtr_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_log_ndtr.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_log_ndtr_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::special_log_ndtr_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_expm1(Tensor self) -> Tensor
+    inline at::Tensor special_expm1(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::special_expm1::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::special_expm1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_expm1_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::special_expm1_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_expm1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_expm1_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::special_expm1_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_exp2(Tensor self) -> Tensor
+    inline at::Tensor special_exp2(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::special_exp2::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::special_exp2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_exp2_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::special_exp2_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_exp2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_exp2_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::special_exp2_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_psi(Tensor self) -> Tensor
+    inline at::Tensor special_psi(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::special_psi::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::special_psi.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_psi_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::special_psi_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_psi.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_psi_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::special_psi_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_digamma(Tensor self) -> Tensor
+    inline at::Tensor special_digamma(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::special_digamma::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::special_digamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_digamma_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::special_digamma_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_digamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_digamma_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::special_digamma_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_gammaln(Tensor self) -> Tensor
+    inline at::Tensor special_gammaln(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::special_gammaln::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::special_gammaln.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_gammaln_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::special_gammaln_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_gammaln.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_gammaln_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::special_gammaln_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_erf(Tensor self) -> Tensor
+    inline at::Tensor special_erf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::special_erf::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::special_erf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_erf_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::special_erf_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_erf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_erf_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::special_erf_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_erfc(Tensor self) -> Tensor
+    inline at::Tensor special_erfc(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::special_erfc::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::special_erfc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_erfc_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::special_erfc_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_erfc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_erfc_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::special_erfc_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_erfcx(Tensor self) -> Tensor
+    inline at::Tensor special_erfcx(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::special_erfcx::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::special_erfcx.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_erfcx_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::special_erfcx_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_erfcx.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_erfcx_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::special_erfcx_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_erfinv(Tensor self) -> Tensor
+    inline at::Tensor special_erfinv(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::special_erfinv::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::special_erfinv.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_erfinv_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::special_erfinv_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_erfinv.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_erfinv_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::special_erfinv_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_ndtr(Tensor self) -> Tensor
+    inline at::Tensor special_ndtr(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::special_ndtr::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::special_ndtr.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_ndtr_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::special_ndtr_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_ndtr.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_ndtr_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::special_ndtr_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_xlog1py(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor special_xlog1py(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::special_xlog1py::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::special_xlog1py.self_scalar(Scalar self, Tensor other) -> Tensor
+    inline at::Tensor special_xlog1py(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & other) {
+        return at::_ops::special_xlog1py_self_scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::special_xlog1py.other_scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor special_xlog1py(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::special_xlog1py_other_scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::special_xlog1py.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_xlog1py_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::special_xlog1py_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::special_xlog1py.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_xlog1py_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::special_xlog1py_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::special_xlog1py.self_scalar_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_xlog1py_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & self, const at::Tensor & other) {
+        return at::_ops::special_xlog1py_self_scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::special_xlog1py.self_scalar_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_xlog1py_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::special_xlog1py_self_scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::special_xlog1py.other_scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_xlog1py_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::special_xlog1py_other_scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::special_xlog1py.other_scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_xlog1py_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out) {
+        return at::_ops::special_xlog1py_other_scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::special_xlogy(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor special_xlogy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::special_xlogy::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::special_xlogy.self_scalar(Scalar self, Tensor other) -> Tensor
+    inline at::Tensor special_xlogy(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & other) {
+        return at::_ops::special_xlogy_self_scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::special_xlogy.other_scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor special_xlogy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::special_xlogy_other_scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::special_xlogy.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_xlogy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::special_xlogy_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::special_xlogy.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_xlogy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::special_xlogy_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::special_xlogy.self_scalar_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_xlogy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & self, const at::Tensor & other) {
+        return at::_ops::special_xlogy_self_scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::special_xlogy.self_scalar_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_xlogy_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::special_xlogy_self_scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::special_xlogy.other_scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_xlogy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::special_xlogy_other_scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::special_xlogy.other_scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_xlogy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out) {
+        return at::_ops::special_xlogy_other_scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::special_zeta(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor special_zeta(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::special_zeta::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::special_zeta.self_scalar(Scalar self, Tensor other) -> Tensor
+    inline at::Tensor special_zeta(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & other) {
+        return at::_ops::special_zeta_self_scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::special_zeta.other_scalar(Tensor self, Scalar other) -> Tensor
+    inline at::Tensor special_zeta(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::special_zeta_other_scalar::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::special_zeta.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_zeta_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::special_zeta_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::special_zeta.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_zeta_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::special_zeta_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::special_zeta.self_scalar_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_zeta_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & self, const at::Tensor & other) {
+        return at::_ops::special_zeta_self_scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::special_zeta.self_scalar_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_zeta_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::special_zeta_self_scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::special_zeta.other_scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_zeta_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::special_zeta_other_scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::special_zeta.other_scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_zeta_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out) {
+        return at::_ops::special_zeta_other_scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::special_i0(Tensor self) -> Tensor
+    inline at::Tensor special_i0(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::special_i0::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::special_i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_i0_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::special_i0_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_i0_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::special_i0_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_i0e(Tensor self) -> Tensor
+    inline at::Tensor special_i0e(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::special_i0e::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::special_i0e.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_i0e_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::special_i0e_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_i0e.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_i0e_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::special_i0e_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_i1(Tensor self) -> Tensor
+    inline at::Tensor special_i1(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::special_i1::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::special_i1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_i1_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::special_i1_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_i1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_i1_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::special_i1_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_i1e(Tensor self) -> Tensor
+    inline at::Tensor special_i1e(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::special_i1e::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::special_i1e.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_i1e_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::special_i1e_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_i1e.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_i1e_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::special_i1e_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_logit(Tensor self, float? eps=None) -> Tensor
+    inline at::Tensor special_logit(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<double> eps=c10::nullopt) {
+        return at::_ops::special_logit::redispatch(dispatchKeySet, self, eps);
+    }
+    
+    // aten::special_logit.out(Tensor self, float? eps=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_logit_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<double> eps=c10::nullopt) {
+        return at::_ops::special_logit_out::redispatch(dispatchKeySet, self, eps, out);
+    }
+    
+    // aten::special_logit.out(Tensor self, float? eps=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_logit_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<double> eps, at::Tensor & out) {
+        return at::_ops::special_logit_out::redispatch(dispatchKeySet, self, eps, out);
+    }
+    
+    // aten::special_polygamma(int n, Tensor self) -> Tensor
+    inline at::Tensor special_polygamma(c10::DispatchKeySet dispatchKeySet, int64_t n, const at::Tensor & self) {
+        return at::_ops::special_polygamma::redispatch(dispatchKeySet, n, self);
+    }
+    
+    // aten::special_polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_polygamma_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, int64_t n, const at::Tensor & self) {
+        return at::_ops::special_polygamma_out::redispatch(dispatchKeySet, n, self, out);
+    }
+    
+    // aten::special_polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_polygamma_outf(c10::DispatchKeySet dispatchKeySet, int64_t n, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::special_polygamma_out::redispatch(dispatchKeySet, n, self, out);
+    }
+    
+    // aten::special_logsumexp(Tensor self, int[1] dim, bool keepdim=False) -> Tensor
+    inline at::Tensor special_logsumexp(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, bool keepdim=false) {
+        return at::_ops::special_logsumexp::redispatch(dispatchKeySet, self, dim, keepdim);
+    }
+    
+    // aten::special_logsumexp.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_logsumexp_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef dim, bool keepdim=false) {
+        return at::_ops::special_logsumexp_out::redispatch(dispatchKeySet, self, dim, keepdim, out);
+    }
+    
+    // aten::special_logsumexp.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_logsumexp_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, bool keepdim, at::Tensor & out) {
+        return at::_ops::special_logsumexp_out::redispatch(dispatchKeySet, self, dim, keepdim, out);
+    }
+    
+    // aten::special_expit(Tensor self) -> Tensor
+    inline at::Tensor special_expit(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::special_expit::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::special_expit.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_expit_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::special_expit_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_expit.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_expit_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::special_expit_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_sinc(Tensor self) -> Tensor
+    inline at::Tensor special_sinc(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::special_sinc::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::special_sinc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_sinc_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::special_sinc_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_sinc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_sinc_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::special_sinc_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_round(Tensor self, *, int decimals=0) -> Tensor
+    inline at::Tensor special_round(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t decimals=0) {
+        return at::_ops::special_round::redispatch(dispatchKeySet, self, decimals);
+    }
+    
+    // aten::special_round.out(Tensor self, *, int decimals=0, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_round_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t decimals=0) {
+        return at::_ops::special_round_out::redispatch(dispatchKeySet, self, decimals, out);
+    }
+    
+    // aten::special_round.out(Tensor self, *, int decimals=0, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_round_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t decimals, at::Tensor & out) {
+        return at::_ops::special_round_out::redispatch(dispatchKeySet, self, decimals, out);
+    }
+    
+    // aten::special_log1p(Tensor self) -> Tensor
+    inline at::Tensor special_log1p(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::special_log1p::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::special_log1p.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_log1p_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::special_log1p_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_log1p.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_log1p_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::special_log1p_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_log_softmax(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor special_log_softmax(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::special_log_softmax::redispatch(dispatchKeySet, self, dim, dtype);
+    }
+    
+    // aten::special_gammainc.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_gammainc_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::special_gammainc_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::special_gammainc.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_gammainc_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::special_gammainc_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::special_gammainc(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor special_gammainc(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::special_gammainc::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::special_gammaincc.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_gammaincc_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::special_gammaincc_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::special_gammaincc.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_gammaincc_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::special_gammaincc_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::special_gammaincc(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor special_gammaincc(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::special_gammaincc::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::special_multigammaln(Tensor self, int p) -> Tensor
+    inline at::Tensor special_multigammaln(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t p) {
+        return at::_ops::special_multigammaln::redispatch(dispatchKeySet, self, p);
+    }
+    
+    // aten::special_multigammaln.out(Tensor self, int p, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_multigammaln_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t p) {
+        return at::_ops::special_multigammaln_out::redispatch(dispatchKeySet, self, p, out);
+    }
+    
+    // aten::special_multigammaln.out(Tensor self, int p, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_multigammaln_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t p, at::Tensor & out) {
+        return at::_ops::special_multigammaln_out::redispatch(dispatchKeySet, self, p, out);
+    }
+    
+    // aten::special_softmax(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor special_softmax(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::special_softmax::redispatch(dispatchKeySet, self, dim, dtype);
+    }
+    
+    // aten::fft_fft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
+    inline at::Tensor fft_fft(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<int64_t> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_fft::redispatch(dispatchKeySet, self, n.has_value() ? c10::make_optional(c10::SymInt(*n)) : c10::nullopt, dim, norm);
+    }
+    
+    // aten::fft_fft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
+    inline at::Tensor fft_fft_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<c10::SymInt> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_fft::redispatch(dispatchKeySet, self, n, dim, norm);
+    }
+    
+    // aten::fft_fft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_fft_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<int64_t> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_fft_out::redispatch(dispatchKeySet, self, n.has_value() ? c10::make_optional(c10::SymInt(*n)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_fft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_fft_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<int64_t> n, int64_t dim, c10::optional<c10::string_view> norm, at::Tensor & out) {
+        return at::_ops::fft_fft_out::redispatch(dispatchKeySet, self, n.has_value() ? c10::make_optional(c10::SymInt(*n)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_fft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_fft_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<c10::SymInt> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_fft_out::redispatch(dispatchKeySet, self, n, dim, norm, out);
+    }
+    
+    // aten::fft_fft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_fft_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<c10::SymInt> n, int64_t dim, c10::optional<c10::string_view> norm, at::Tensor & out) {
+        return at::_ops::fft_fft_out::redispatch(dispatchKeySet, self, n, dim, norm, out);
+    }
+    
+    // aten::fft_ifft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
+    inline at::Tensor fft_ifft(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<int64_t> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_ifft::redispatch(dispatchKeySet, self, n.has_value() ? c10::make_optional(c10::SymInt(*n)) : c10::nullopt, dim, norm);
+    }
+    
+    // aten::fft_ifft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
+    inline at::Tensor fft_ifft_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<c10::SymInt> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_ifft::redispatch(dispatchKeySet, self, n, dim, norm);
+    }
+    
+    // aten::fft_ifft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_ifft_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<int64_t> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_ifft_out::redispatch(dispatchKeySet, self, n.has_value() ? c10::make_optional(c10::SymInt(*n)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_ifft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_ifft_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<int64_t> n, int64_t dim, c10::optional<c10::string_view> norm, at::Tensor & out) {
+        return at::_ops::fft_ifft_out::redispatch(dispatchKeySet, self, n.has_value() ? c10::make_optional(c10::SymInt(*n)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_ifft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_ifft_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<c10::SymInt> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_ifft_out::redispatch(dispatchKeySet, self, n, dim, norm, out);
+    }
+    
+    // aten::fft_ifft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_ifft_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<c10::SymInt> n, int64_t dim, c10::optional<c10::string_view> norm, at::Tensor & out) {
+        return at::_ops::fft_ifft_out::redispatch(dispatchKeySet, self, n, dim, norm, out);
+    }
+    
+    // aten::fft_rfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
+    inline at::Tensor fft_rfft(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<int64_t> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_rfft::redispatch(dispatchKeySet, self, n.has_value() ? c10::make_optional(c10::SymInt(*n)) : c10::nullopt, dim, norm);
+    }
+    
+    // aten::fft_rfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
+    inline at::Tensor fft_rfft_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<c10::SymInt> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_rfft::redispatch(dispatchKeySet, self, n, dim, norm);
+    }
+    
+    // aten::fft_rfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_rfft_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<int64_t> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_rfft_out::redispatch(dispatchKeySet, self, n.has_value() ? c10::make_optional(c10::SymInt(*n)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_rfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_rfft_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<int64_t> n, int64_t dim, c10::optional<c10::string_view> norm, at::Tensor & out) {
+        return at::_ops::fft_rfft_out::redispatch(dispatchKeySet, self, n.has_value() ? c10::make_optional(c10::SymInt(*n)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_rfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_rfft_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<c10::SymInt> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_rfft_out::redispatch(dispatchKeySet, self, n, dim, norm, out);
+    }
+    
+    // aten::fft_rfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_rfft_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<c10::SymInt> n, int64_t dim, c10::optional<c10::string_view> norm, at::Tensor & out) {
+        return at::_ops::fft_rfft_out::redispatch(dispatchKeySet, self, n, dim, norm, out);
+    }
+    
+    // aten::fft_irfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
+    inline at::Tensor fft_irfft(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<int64_t> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_irfft::redispatch(dispatchKeySet, self, n.has_value() ? c10::make_optional(c10::SymInt(*n)) : c10::nullopt, dim, norm);
+    }
+    
+    // aten::fft_irfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
+    inline at::Tensor fft_irfft_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<c10::SymInt> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_irfft::redispatch(dispatchKeySet, self, n, dim, norm);
+    }
+    
+    // aten::fft_irfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_irfft_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<int64_t> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_irfft_out::redispatch(dispatchKeySet, self, n.has_value() ? c10::make_optional(c10::SymInt(*n)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_irfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_irfft_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<int64_t> n, int64_t dim, c10::optional<c10::string_view> norm, at::Tensor & out) {
+        return at::_ops::fft_irfft_out::redispatch(dispatchKeySet, self, n.has_value() ? c10::make_optional(c10::SymInt(*n)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_irfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_irfft_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<c10::SymInt> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_irfft_out::redispatch(dispatchKeySet, self, n, dim, norm, out);
+    }
+    
+    // aten::fft_irfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_irfft_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<c10::SymInt> n, int64_t dim, c10::optional<c10::string_view> norm, at::Tensor & out) {
+        return at::_ops::fft_irfft_out::redispatch(dispatchKeySet, self, n, dim, norm, out);
+    }
+    
+    // aten::fft_hfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
+    inline at::Tensor fft_hfft(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<int64_t> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_hfft::redispatch(dispatchKeySet, self, n.has_value() ? c10::make_optional(c10::SymInt(*n)) : c10::nullopt, dim, norm);
+    }
+    
+    // aten::fft_hfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
+    inline at::Tensor fft_hfft_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<c10::SymInt> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_hfft::redispatch(dispatchKeySet, self, n, dim, norm);
+    }
+    
+    // aten::fft_hfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_hfft_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<int64_t> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_hfft_out::redispatch(dispatchKeySet, self, n.has_value() ? c10::make_optional(c10::SymInt(*n)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_hfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_hfft_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<int64_t> n, int64_t dim, c10::optional<c10::string_view> norm, at::Tensor & out) {
+        return at::_ops::fft_hfft_out::redispatch(dispatchKeySet, self, n.has_value() ? c10::make_optional(c10::SymInt(*n)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_hfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_hfft_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<c10::SymInt> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_hfft_out::redispatch(dispatchKeySet, self, n, dim, norm, out);
+    }
+    
+    // aten::fft_hfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_hfft_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<c10::SymInt> n, int64_t dim, c10::optional<c10::string_view> norm, at::Tensor & out) {
+        return at::_ops::fft_hfft_out::redispatch(dispatchKeySet, self, n, dim, norm, out);
+    }
+    
+    // aten::fft_ihfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
+    inline at::Tensor fft_ihfft(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<int64_t> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_ihfft::redispatch(dispatchKeySet, self, n.has_value() ? c10::make_optional(c10::SymInt(*n)) : c10::nullopt, dim, norm);
+    }
+    
+    // aten::fft_ihfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
+    inline at::Tensor fft_ihfft_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<c10::SymInt> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_ihfft::redispatch(dispatchKeySet, self, n, dim, norm);
+    }
+    
+    // aten::fft_ihfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_ihfft_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<int64_t> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_ihfft_out::redispatch(dispatchKeySet, self, n.has_value() ? c10::make_optional(c10::SymInt(*n)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_ihfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_ihfft_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<int64_t> n, int64_t dim, c10::optional<c10::string_view> norm, at::Tensor & out) {
+        return at::_ops::fft_ihfft_out::redispatch(dispatchKeySet, self, n.has_value() ? c10::make_optional(c10::SymInt(*n)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_ihfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_ihfft_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<c10::SymInt> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_ihfft_out::redispatch(dispatchKeySet, self, n, dim, norm, out);
+    }
+    
+    // aten::fft_ihfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_ihfft_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<c10::SymInt> n, int64_t dim, c10::optional<c10::string_view> norm, at::Tensor & out) {
+        return at::_ops::fft_ihfft_out::redispatch(dispatchKeySet, self, n, dim, norm, out);
+    }
+    
+    // aten::fft_fft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
+    inline at::Tensor fft_fft2(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef s=c10::nullopt, at::IntArrayRef dim={-2,-1}, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_fft2::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm);
+    }
+    
+    // aten::fft_fft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
+    inline at::Tensor fft_fft2_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s=c10::nullopt, at::IntArrayRef dim={-2,-1}, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_fft2::redispatch(dispatchKeySet, self, s, dim, norm);
+    }
+    
+    // aten::fft_fft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_fft2_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef s=c10::nullopt, at::IntArrayRef dim={-2,-1}, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_fft2_out::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_fft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_fft2_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef s, at::IntArrayRef dim, c10::optional<c10::string_view> norm, at::Tensor & out) {
+        return at::_ops::fft_fft2_out::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_fft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_fft2_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::OptionalSymIntArrayRef s=c10::nullopt, at::IntArrayRef dim={-2,-1}, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_fft2_out::redispatch(dispatchKeySet, self, s, dim, norm, out);
+    }
+    
+    // aten::fft_fft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_fft2_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s, at::IntArrayRef dim, c10::optional<c10::string_view> norm, at::Tensor & out) {
+        return at::_ops::fft_fft2_out::redispatch(dispatchKeySet, self, s, dim, norm, out);
+    }
+    
+    // aten::fft_ifft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
+    inline at::Tensor fft_ifft2(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef s=c10::nullopt, at::IntArrayRef dim={-2,-1}, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_ifft2::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm);
+    }
+    
+    // aten::fft_ifft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
+    inline at::Tensor fft_ifft2_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s=c10::nullopt, at::IntArrayRef dim={-2,-1}, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_ifft2::redispatch(dispatchKeySet, self, s, dim, norm);
+    }
+    
+    // aten::fft_ifft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_ifft2_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef s=c10::nullopt, at::IntArrayRef dim={-2,-1}, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_ifft2_out::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_ifft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_ifft2_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef s, at::IntArrayRef dim, c10::optional<c10::string_view> norm, at::Tensor & out) {
+        return at::_ops::fft_ifft2_out::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_ifft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_ifft2_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::OptionalSymIntArrayRef s=c10::nullopt, at::IntArrayRef dim={-2,-1}, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_ifft2_out::redispatch(dispatchKeySet, self, s, dim, norm, out);
+    }
+    
+    // aten::fft_ifft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_ifft2_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s, at::IntArrayRef dim, c10::optional<c10::string_view> norm, at::Tensor & out) {
+        return at::_ops::fft_ifft2_out::redispatch(dispatchKeySet, self, s, dim, norm, out);
+    }
+    
+    // aten::fft_rfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
+    inline at::Tensor fft_rfft2(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef s=c10::nullopt, at::IntArrayRef dim={-2,-1}, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_rfft2::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm);
+    }
+    
+    // aten::fft_rfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
+    inline at::Tensor fft_rfft2_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s=c10::nullopt, at::IntArrayRef dim={-2,-1}, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_rfft2::redispatch(dispatchKeySet, self, s, dim, norm);
+    }
+    
+    // aten::fft_rfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_rfft2_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef s=c10::nullopt, at::IntArrayRef dim={-2,-1}, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_rfft2_out::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_rfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_rfft2_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef s, at::IntArrayRef dim, c10::optional<c10::string_view> norm, at::Tensor & out) {
+        return at::_ops::fft_rfft2_out::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_rfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_rfft2_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::OptionalSymIntArrayRef s=c10::nullopt, at::IntArrayRef dim={-2,-1}, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_rfft2_out::redispatch(dispatchKeySet, self, s, dim, norm, out);
+    }
+    
+    // aten::fft_rfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_rfft2_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s, at::IntArrayRef dim, c10::optional<c10::string_view> norm, at::Tensor & out) {
+        return at::_ops::fft_rfft2_out::redispatch(dispatchKeySet, self, s, dim, norm, out);
+    }
+    
+    // aten::fft_irfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
+    inline at::Tensor fft_irfft2(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef s=c10::nullopt, at::IntArrayRef dim={-2,-1}, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_irfft2::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm);
+    }
+    
+    // aten::fft_irfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
+    inline at::Tensor fft_irfft2_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s=c10::nullopt, at::IntArrayRef dim={-2,-1}, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_irfft2::redispatch(dispatchKeySet, self, s, dim, norm);
+    }
+    
+    // aten::fft_irfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_irfft2_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef s=c10::nullopt, at::IntArrayRef dim={-2,-1}, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_irfft2_out::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_irfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_irfft2_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef s, at::IntArrayRef dim, c10::optional<c10::string_view> norm, at::Tensor & out) {
+        return at::_ops::fft_irfft2_out::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_irfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_irfft2_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::OptionalSymIntArrayRef s=c10::nullopt, at::IntArrayRef dim={-2,-1}, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_irfft2_out::redispatch(dispatchKeySet, self, s, dim, norm, out);
+    }
+    
+    // aten::fft_irfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_irfft2_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s, at::IntArrayRef dim, c10::optional<c10::string_view> norm, at::Tensor & out) {
+        return at::_ops::fft_irfft2_out::redispatch(dispatchKeySet, self, s, dim, norm, out);
+    }
+    
+    // aten::fft_hfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
+    inline at::Tensor fft_hfft2(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef s=c10::nullopt, at::IntArrayRef dim={-2,-1}, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_hfft2::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm);
+    }
+    
+    // aten::fft_hfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
+    inline at::Tensor fft_hfft2_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s=c10::nullopt, at::IntArrayRef dim={-2,-1}, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_hfft2::redispatch(dispatchKeySet, self, s, dim, norm);
+    }
+    
+    // aten::fft_hfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & fft_hfft2_out(c10::DispatchKeySet dispatchKeySet, const at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef s=c10::nullopt, at::IntArrayRef dim={-2,-1}, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_hfft2_out::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_hfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & fft_hfft2_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef s, at::IntArrayRef dim, c10::optional<c10::string_view> norm, const at::Tensor & out) {
+        return at::_ops::fft_hfft2_out::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_hfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & fft_hfft2_symint_out(c10::DispatchKeySet dispatchKeySet, const at::Tensor & out, const at::Tensor & self, at::OptionalSymIntArrayRef s=c10::nullopt, at::IntArrayRef dim={-2,-1}, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_hfft2_out::redispatch(dispatchKeySet, self, s, dim, norm, out);
+    }
+    
+    // aten::fft_hfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & fft_hfft2_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s, at::IntArrayRef dim, c10::optional<c10::string_view> norm, const at::Tensor & out) {
+        return at::_ops::fft_hfft2_out::redispatch(dispatchKeySet, self, s, dim, norm, out);
+    }
+    
+    // aten::fft_ihfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
+    inline at::Tensor fft_ihfft2(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef s=c10::nullopt, at::IntArrayRef dim={-2,-1}, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_ihfft2::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm);
+    }
+    
+    // aten::fft_ihfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
+    inline at::Tensor fft_ihfft2_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s=c10::nullopt, at::IntArrayRef dim={-2,-1}, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_ihfft2::redispatch(dispatchKeySet, self, s, dim, norm);
+    }
+    
+    // aten::fft_ihfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & fft_ihfft2_out(c10::DispatchKeySet dispatchKeySet, const at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef s=c10::nullopt, at::IntArrayRef dim={-2,-1}, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_ihfft2_out::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_ihfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & fft_ihfft2_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef s, at::IntArrayRef dim, c10::optional<c10::string_view> norm, const at::Tensor & out) {
+        return at::_ops::fft_ihfft2_out::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_ihfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & fft_ihfft2_symint_out(c10::DispatchKeySet dispatchKeySet, const at::Tensor & out, const at::Tensor & self, at::OptionalSymIntArrayRef s=c10::nullopt, at::IntArrayRef dim={-2,-1}, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_ihfft2_out::redispatch(dispatchKeySet, self, s, dim, norm, out);
+    }
+    
+    // aten::fft_ihfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & fft_ihfft2_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s, at::IntArrayRef dim, c10::optional<c10::string_view> norm, const at::Tensor & out) {
+        return at::_ops::fft_ihfft2_out::redispatch(dispatchKeySet, self, s, dim, norm, out);
+    }
+    
+    // aten::fft_fftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
+    inline at::Tensor fft_fftn(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef s=c10::nullopt, at::OptionalIntArrayRef dim=c10::nullopt, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_fftn::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm);
+    }
+    
+    // aten::fft_fftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
+    inline at::Tensor fft_fftn_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s=c10::nullopt, at::OptionalIntArrayRef dim=c10::nullopt, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_fftn::redispatch(dispatchKeySet, self, s, dim, norm);
+    }
+    
+    // aten::fft_fftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_fftn_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef s=c10::nullopt, at::OptionalIntArrayRef dim=c10::nullopt, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_fftn_out::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_fftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_fftn_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef s, at::OptionalIntArrayRef dim, c10::optional<c10::string_view> norm, at::Tensor & out) {
+        return at::_ops::fft_fftn_out::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_fftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_fftn_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::OptionalSymIntArrayRef s=c10::nullopt, at::OptionalIntArrayRef dim=c10::nullopt, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_fftn_out::redispatch(dispatchKeySet, self, s, dim, norm, out);
+    }
+    
+    // aten::fft_fftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_fftn_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, c10::optional<c10::string_view> norm, at::Tensor & out) {
+        return at::_ops::fft_fftn_out::redispatch(dispatchKeySet, self, s, dim, norm, out);
+    }
+    
+    // aten::fft_ifftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
+    inline at::Tensor fft_ifftn(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef s=c10::nullopt, at::OptionalIntArrayRef dim=c10::nullopt, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_ifftn::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm);
+    }
+    
+    // aten::fft_ifftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
+    inline at::Tensor fft_ifftn_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s=c10::nullopt, at::OptionalIntArrayRef dim=c10::nullopt, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_ifftn::redispatch(dispatchKeySet, self, s, dim, norm);
+    }
+    
+    // aten::fft_ifftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_ifftn_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef s=c10::nullopt, at::OptionalIntArrayRef dim=c10::nullopt, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_ifftn_out::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_ifftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_ifftn_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef s, at::OptionalIntArrayRef dim, c10::optional<c10::string_view> norm, at::Tensor & out) {
+        return at::_ops::fft_ifftn_out::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_ifftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_ifftn_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::OptionalSymIntArrayRef s=c10::nullopt, at::OptionalIntArrayRef dim=c10::nullopt, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_ifftn_out::redispatch(dispatchKeySet, self, s, dim, norm, out);
+    }
+    
+    // aten::fft_ifftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_ifftn_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, c10::optional<c10::string_view> norm, at::Tensor & out) {
+        return at::_ops::fft_ifftn_out::redispatch(dispatchKeySet, self, s, dim, norm, out);
+    }
+    
+    // aten::fft_rfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
+    inline at::Tensor fft_rfftn(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef s=c10::nullopt, at::OptionalIntArrayRef dim=c10::nullopt, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_rfftn::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm);
+    }
+    
+    // aten::fft_rfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
+    inline at::Tensor fft_rfftn_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s=c10::nullopt, at::OptionalIntArrayRef dim=c10::nullopt, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_rfftn::redispatch(dispatchKeySet, self, s, dim, norm);
+    }
+    
+    // aten::fft_rfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_rfftn_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef s=c10::nullopt, at::OptionalIntArrayRef dim=c10::nullopt, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_rfftn_out::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_rfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_rfftn_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef s, at::OptionalIntArrayRef dim, c10::optional<c10::string_view> norm, at::Tensor & out) {
+        return at::_ops::fft_rfftn_out::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_rfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_rfftn_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::OptionalSymIntArrayRef s=c10::nullopt, at::OptionalIntArrayRef dim=c10::nullopt, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_rfftn_out::redispatch(dispatchKeySet, self, s, dim, norm, out);
+    }
+    
+    // aten::fft_rfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_rfftn_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, c10::optional<c10::string_view> norm, at::Tensor & out) {
+        return at::_ops::fft_rfftn_out::redispatch(dispatchKeySet, self, s, dim, norm, out);
+    }
+    
+    // aten::fft_irfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
+    inline at::Tensor fft_irfftn(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef s=c10::nullopt, at::OptionalIntArrayRef dim=c10::nullopt, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_irfftn::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm);
+    }
+    
+    // aten::fft_irfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
+    inline at::Tensor fft_irfftn_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s=c10::nullopt, at::OptionalIntArrayRef dim=c10::nullopt, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_irfftn::redispatch(dispatchKeySet, self, s, dim, norm);
+    }
+    
+    // aten::fft_irfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_irfftn_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef s=c10::nullopt, at::OptionalIntArrayRef dim=c10::nullopt, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_irfftn_out::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_irfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_irfftn_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef s, at::OptionalIntArrayRef dim, c10::optional<c10::string_view> norm, at::Tensor & out) {
+        return at::_ops::fft_irfftn_out::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_irfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_irfftn_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::OptionalSymIntArrayRef s=c10::nullopt, at::OptionalIntArrayRef dim=c10::nullopt, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_irfftn_out::redispatch(dispatchKeySet, self, s, dim, norm, out);
+    }
+    
+    // aten::fft_irfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_irfftn_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, c10::optional<c10::string_view> norm, at::Tensor & out) {
+        return at::_ops::fft_irfftn_out::redispatch(dispatchKeySet, self, s, dim, norm, out);
+    }
+    
+    // aten::fft_hfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
+    inline at::Tensor fft_hfftn(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef s=c10::nullopt, at::OptionalIntArrayRef dim=c10::nullopt, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_hfftn::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm);
+    }
+    
+    // aten::fft_hfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
+    inline at::Tensor fft_hfftn_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s=c10::nullopt, at::OptionalIntArrayRef dim=c10::nullopt, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_hfftn::redispatch(dispatchKeySet, self, s, dim, norm);
+    }
+    
+    // aten::fft_hfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & fft_hfftn_out(c10::DispatchKeySet dispatchKeySet, const at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef s=c10::nullopt, at::OptionalIntArrayRef dim=c10::nullopt, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_hfftn_out::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_hfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & fft_hfftn_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef s, at::OptionalIntArrayRef dim, c10::optional<c10::string_view> norm, const at::Tensor & out) {
+        return at::_ops::fft_hfftn_out::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_hfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & fft_hfftn_symint_out(c10::DispatchKeySet dispatchKeySet, const at::Tensor & out, const at::Tensor & self, at::OptionalSymIntArrayRef s=c10::nullopt, at::OptionalIntArrayRef dim=c10::nullopt, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_hfftn_out::redispatch(dispatchKeySet, self, s, dim, norm, out);
+    }
+    
+    // aten::fft_hfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & fft_hfftn_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, c10::optional<c10::string_view> norm, const at::Tensor & out) {
+        return at::_ops::fft_hfftn_out::redispatch(dispatchKeySet, self, s, dim, norm, out);
+    }
+    
+    // aten::fft_ihfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
+    inline at::Tensor fft_ihfftn(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef s=c10::nullopt, at::OptionalIntArrayRef dim=c10::nullopt, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_ihfftn::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm);
+    }
+    
+    // aten::fft_ihfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
+    inline at::Tensor fft_ihfftn_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s=c10::nullopt, at::OptionalIntArrayRef dim=c10::nullopt, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_ihfftn::redispatch(dispatchKeySet, self, s, dim, norm);
+    }
+    
+    // aten::fft_ihfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & fft_ihfftn_out(c10::DispatchKeySet dispatchKeySet, const at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef s=c10::nullopt, at::OptionalIntArrayRef dim=c10::nullopt, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_ihfftn_out::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_ihfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & fft_ihfftn_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef s, at::OptionalIntArrayRef dim, c10::optional<c10::string_view> norm, const at::Tensor & out) {
+        return at::_ops::fft_ihfftn_out::redispatch(dispatchKeySet, self, s.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*s)) : c10::nullopt, dim, norm, out);
+    }
+    
+    // aten::fft_ihfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & fft_ihfftn_symint_out(c10::DispatchKeySet dispatchKeySet, const at::Tensor & out, const at::Tensor & self, at::OptionalSymIntArrayRef s=c10::nullopt, at::OptionalIntArrayRef dim=c10::nullopt, c10::optional<c10::string_view> norm=c10::nullopt) {
+        return at::_ops::fft_ihfftn_out::redispatch(dispatchKeySet, self, s, dim, norm, out);
+    }
+    
+    // aten::fft_ihfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & fft_ihfftn_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, c10::optional<c10::string_view> norm, const at::Tensor & out) {
+        return at::_ops::fft_ihfftn_out::redispatch(dispatchKeySet, self, s, dim, norm, out);
+    }
+    
+    // aten::fft_fftfreq(int n, float d=1.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor fft_fftfreq(c10::DispatchKeySet dispatchKeySet, int64_t n, double d=1.0, at::TensorOptions options={}) {
+        return at::_ops::fft_fftfreq::redispatch(dispatchKeySet, n, d, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::fft_fftfreq(int n, float d=1.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor fft_fftfreq(c10::DispatchKeySet dispatchKeySet, int64_t n, double d, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::fft_fftfreq::redispatch(dispatchKeySet, n, d, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::fft_fftfreq.out(int n, float d=1.0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_fftfreq_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, int64_t n, double d=1.0) {
+        return at::_ops::fft_fftfreq_out::redispatch(dispatchKeySet, n, d, out);
+    }
+    
+    // aten::fft_fftfreq.out(int n, float d=1.0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_fftfreq_outf(c10::DispatchKeySet dispatchKeySet, int64_t n, double d, at::Tensor & out) {
+        return at::_ops::fft_fftfreq_out::redispatch(dispatchKeySet, n, d, out);
+    }
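+    
+    // Illustrative usage sketch (not part of the generated API; variable names are placeholders):
+    // user code normally reaches these wrappers through the public at:: namespace rather than
+    // calling redispatch directly, e.g.
+    //
+    //   at::Tensor freqs = at::fft_fftfreq(8, /*d=*/0.5, at::TensorOptions().dtype(at::kFloat));
+    //
+    // The TensorOptions overload above unpacks dtype/layout/device/pin_memory before redispatching,
+    // while the second overload forwards those optionals verbatim.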
+    
+    // aten::fft_rfftfreq(int n, float d=1.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor fft_rfftfreq(c10::DispatchKeySet dispatchKeySet, int64_t n, double d=1.0, at::TensorOptions options={}) {
+        return at::_ops::fft_rfftfreq::redispatch(dispatchKeySet, n, d, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+    }
+    
+    // aten::fft_rfftfreq(int n, float d=1.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor fft_rfftfreq(c10::DispatchKeySet dispatchKeySet, int64_t n, double d, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+        return at::_ops::fft_rfftfreq::redispatch(dispatchKeySet, n, d, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::fft_rfftfreq.out(int n, float d=1.0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_rfftfreq_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, int64_t n, double d=1.0) {
+        return at::_ops::fft_rfftfreq_out::redispatch(dispatchKeySet, n, d, out);
+    }
+    
+    // aten::fft_rfftfreq.out(int n, float d=1.0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fft_rfftfreq_outf(c10::DispatchKeySet dispatchKeySet, int64_t n, double d, at::Tensor & out) {
+        return at::_ops::fft_rfftfreq_out::redispatch(dispatchKeySet, n, d, out);
+    }
+    
+    // aten::fft_fftshift(Tensor self, int[1]? dim=None) -> Tensor
+    inline at::Tensor fft_fftshift(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim=c10::nullopt) {
+        return at::_ops::fft_fftshift::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::fft_ifftshift(Tensor self, int[1]? dim=None) -> Tensor
+    inline at::Tensor fft_ifftshift(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim=c10::nullopt) {
+        return at::_ops::fft_ifftshift::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::linalg_cholesky_ex(Tensor self, *, bool upper=False, bool check_errors=False) -> (Tensor L, Tensor info)
+    inline ::std::tuple<at::Tensor,at::Tensor> linalg_cholesky_ex(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool upper=false, bool check_errors=false) {
+        return at::_ops::linalg_cholesky_ex::redispatch(dispatchKeySet, self, upper, check_errors);
+    }
+    
+    // aten::linalg_cholesky_ex.L(Tensor self, *, bool upper=False, bool check_errors=False, Tensor(a!) L, Tensor(b!) info) -> (Tensor(a!) L, Tensor(b!) info)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> linalg_cholesky_ex_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & L, at::Tensor & info, const at::Tensor & self, bool upper=false, bool check_errors=false) {
+        return at::_ops::linalg_cholesky_ex_L::redispatch(dispatchKeySet, self, upper, check_errors, L, info);
+    }
+    
+    // aten::linalg_cholesky_ex.L(Tensor self, *, bool upper=False, bool check_errors=False, Tensor(a!) L, Tensor(b!) info) -> (Tensor(a!) L, Tensor(b!) info)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> linalg_cholesky_ex_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool upper, bool check_errors, at::Tensor & L, at::Tensor & info) {
+        return at::_ops::linalg_cholesky_ex_L::redispatch(dispatchKeySet, self, upper, check_errors, L, info);
+    }
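+    
+    // Illustrative usage sketch (not part of the generated API; A is a placeholder tensor): the
+    // factor/info pair is normally consumed through the public wrapper, e.g.
+    //
+    //   auto [L, info] = at::linalg_cholesky_ex(A, /*upper=*/false, /*check_errors=*/false);
+    //
+    // Note the *_out/*_outf convention used throughout this header: *_out takes the output tensors
+    // first so the trailing arguments can keep their defaults, while *_outf keeps the schema order
+    // with the outputs last.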
+    
+    // aten::linalg_cholesky(Tensor self, *, bool upper=False) -> Tensor
+    inline at::Tensor linalg_cholesky(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool upper=false) {
+        return at::_ops::linalg_cholesky::redispatch(dispatchKeySet, self, upper);
+    }
+    
+    // aten::linalg_cholesky.out(Tensor self, *, bool upper=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_cholesky_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, bool upper=false) {
+        return at::_ops::linalg_cholesky_out::redispatch(dispatchKeySet, self, upper, out);
+    }
+    
+    // aten::linalg_cholesky.out(Tensor self, *, bool upper=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_cholesky_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool upper, at::Tensor & out) {
+        return at::_ops::linalg_cholesky_out::redispatch(dispatchKeySet, self, upper, out);
+    }
+    
+    // aten::linalg_cross(Tensor self, Tensor other, *, int dim=-1) -> Tensor
+    inline at::Tensor linalg_cross(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, int64_t dim=-1) {
+        return at::_ops::linalg_cross::redispatch(dispatchKeySet, self, other, dim);
+    }
+    
+    // aten::linalg_cross.out(Tensor self, Tensor other, *, int dim=-1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_cross_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other, int64_t dim=-1) {
+        return at::_ops::linalg_cross_out::redispatch(dispatchKeySet, self, other, dim, out);
+    }
+    
+    // aten::linalg_cross.out(Tensor self, Tensor other, *, int dim=-1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_cross_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, int64_t dim, at::Tensor & out) {
+        return at::_ops::linalg_cross_out::redispatch(dispatchKeySet, self, other, dim, out);
+    }
+    
+    // aten::linalg_lu_factor(Tensor A, *, bool pivot=True) -> (Tensor LU, Tensor pivots)
+    inline ::std::tuple<at::Tensor,at::Tensor> linalg_lu_factor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, bool pivot=true) {
+        return at::_ops::linalg_lu_factor::redispatch(dispatchKeySet, A, pivot);
+    }
+    
+    // aten::linalg_lu_factor.out(Tensor A, *, bool pivot=True, Tensor(a!) LU, Tensor(b!) pivots) -> (Tensor(a!) LU, Tensor(b!) pivots)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> linalg_lu_factor_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & LU, at::Tensor & pivots, const at::Tensor & A, bool pivot=true) {
+        return at::_ops::linalg_lu_factor_out::redispatch(dispatchKeySet, A, pivot, LU, pivots);
+    }
+    
+    // aten::linalg_lu_factor.out(Tensor A, *, bool pivot=True, Tensor(a!) LU, Tensor(b!) pivots) -> (Tensor(a!) LU, Tensor(b!) pivots)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> linalg_lu_factor_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, bool pivot, at::Tensor & LU, at::Tensor & pivots) {
+        return at::_ops::linalg_lu_factor_out::redispatch(dispatchKeySet, A, pivot, LU, pivots);
+    }
+    
+    // aten::linalg_lu_factor_ex(Tensor A, *, bool pivot=True, bool check_errors=False) -> (Tensor LU, Tensor pivots, Tensor info)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> linalg_lu_factor_ex(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, bool pivot=true, bool check_errors=false) {
+        return at::_ops::linalg_lu_factor_ex::redispatch(dispatchKeySet, A, pivot, check_errors);
+    }
+    
+    // aten::linalg_lu_factor_ex.out(Tensor A, *, bool pivot=True, bool check_errors=False, Tensor(a!) LU, Tensor(b!) pivots, Tensor(c!) info) -> (Tensor(a!) LU, Tensor(b!) pivots, Tensor(c!) info)
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> linalg_lu_factor_ex_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & LU, at::Tensor & pivots, at::Tensor & info, const at::Tensor & A, bool pivot=true, bool check_errors=false) {
+        return at::_ops::linalg_lu_factor_ex_out::redispatch(dispatchKeySet, A, pivot, check_errors, LU, pivots, info);
+    }
+    
+    // aten::linalg_lu_factor_ex.out(Tensor A, *, bool pivot=True, bool check_errors=False, Tensor(a!) LU, Tensor(b!) pivots, Tensor(c!) info) -> (Tensor(a!) LU, Tensor(b!) pivots, Tensor(c!) info)
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> linalg_lu_factor_ex_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, bool pivot, bool check_errors, at::Tensor & LU, at::Tensor & pivots, at::Tensor & info) {
+        return at::_ops::linalg_lu_factor_ex_out::redispatch(dispatchKeySet, A, pivot, check_errors, LU, pivots, info);
+    }
+    
+    // aten::linalg_lu(Tensor A, *, bool pivot=True) -> (Tensor P, Tensor L, Tensor U)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> linalg_lu(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, bool pivot=true) {
+        return at::_ops::linalg_lu::redispatch(dispatchKeySet, A, pivot);
+    }
+    
+    // aten::linalg_lu.out(Tensor A, *, bool pivot=True, Tensor(a!) P, Tensor(b!) L, Tensor(c!) U) -> (Tensor(a!) P, Tensor(b!) L, Tensor(c!) U)
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> linalg_lu_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & P, at::Tensor & L, at::Tensor & U, const at::Tensor & A, bool pivot=true) {
+        return at::_ops::linalg_lu_out::redispatch(dispatchKeySet, A, pivot, P, L, U);
+    }
+    
+    // aten::linalg_lu.out(Tensor A, *, bool pivot=True, Tensor(a!) P, Tensor(b!) L, Tensor(c!) U) -> (Tensor(a!) P, Tensor(b!) L, Tensor(c!) U)
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> linalg_lu_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, bool pivot, at::Tensor & P, at::Tensor & L, at::Tensor & U) {
+        return at::_ops::linalg_lu_out::redispatch(dispatchKeySet, A, pivot, P, L, U);
+    }
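+    
+    // Illustrative usage sketch (not part of the generated API; A is a placeholder tensor):
+    //
+    //   auto [LU, pivots] = at::linalg_lu_factor(A);          // compact LU with partial pivoting
+    //   auto [P, L, U]    = at::linalg_lu(A, /*pivot=*/true);  // explicit P, L, U factors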
+    
+    // aten::linalg_lu_solve(Tensor LU, Tensor pivots, Tensor B, *, bool left=True, bool adjoint=False) -> Tensor
+    inline at::Tensor linalg_lu_solve(c10::DispatchKeySet dispatchKeySet, const at::Tensor & LU, const at::Tensor & pivots, const at::Tensor & B, bool left=true, bool adjoint=false) {
+        return at::_ops::linalg_lu_solve::redispatch(dispatchKeySet, LU, pivots, B, left, adjoint);
+    }
+    
+    // aten::linalg_lu_solve.out(Tensor LU, Tensor pivots, Tensor B, *, bool left=True, bool adjoint=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_lu_solve_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & LU, const at::Tensor & pivots, const at::Tensor & B, bool left=true, bool adjoint=false) {
+        return at::_ops::linalg_lu_solve_out::redispatch(dispatchKeySet, LU, pivots, B, left, adjoint, out);
+    }
+    
+    // aten::linalg_lu_solve.out(Tensor LU, Tensor pivots, Tensor B, *, bool left=True, bool adjoint=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_lu_solve_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & LU, const at::Tensor & pivots, const at::Tensor & B, bool left, bool adjoint, at::Tensor & out) {
+        return at::_ops::linalg_lu_solve_out::redispatch(dispatchKeySet, LU, pivots, B, left, adjoint, out);
+    }
+    
+    // aten::_linalg_det(Tensor A) -> (Tensor result, Tensor LU, Tensor pivots)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _linalg_det(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A) {
+        return at::_ops::_linalg_det::redispatch(dispatchKeySet, A);
+    }
+    
+    // aten::_linalg_det.result(Tensor A, *, Tensor(a!) result, Tensor(b!) LU, Tensor(c!) pivots) -> (Tensor(a!) result, Tensor(b!) LU, Tensor(c!) pivots)
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _linalg_det_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & result, at::Tensor & LU, at::Tensor & pivots, const at::Tensor & A) {
+        return at::_ops::_linalg_det_result::redispatch(dispatchKeySet, A, result, LU, pivots);
+    }
+    
+    // aten::_linalg_det.result(Tensor A, *, Tensor(a!) result, Tensor(b!) LU, Tensor(c!) pivots) -> (Tensor(a!) result, Tensor(b!) LU, Tensor(c!) pivots)
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _linalg_det_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, at::Tensor & result, at::Tensor & LU, at::Tensor & pivots) {
+        return at::_ops::_linalg_det_result::redispatch(dispatchKeySet, A, result, LU, pivots);
+    }
+    
+    // aten::linalg_det(Tensor A) -> Tensor
+    inline at::Tensor linalg_det(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A) {
+        return at::_ops::linalg_det::redispatch(dispatchKeySet, A);
+    }
+    
+    // aten::linalg_det.out(Tensor A, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_det_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & A) {
+        return at::_ops::linalg_det_out::redispatch(dispatchKeySet, A, out);
+    }
+    
+    // aten::linalg_det.out(Tensor A, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_det_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, at::Tensor & out) {
+        return at::_ops::linalg_det_out::redispatch(dispatchKeySet, A, out);
+    }
+    
+    // aten::det(Tensor self) -> Tensor
+    inline at::Tensor det(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::det::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::linalg_ldl_factor_ex(Tensor self, *, bool hermitian=False, bool check_errors=False) -> (Tensor LD, Tensor pivots, Tensor info)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> linalg_ldl_factor_ex(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool hermitian=false, bool check_errors=false) {
+        return at::_ops::linalg_ldl_factor_ex::redispatch(dispatchKeySet, self, hermitian, check_errors);
+    }
+    
+    // aten::linalg_ldl_factor_ex.out(Tensor self, *, bool hermitian=False, bool check_errors=False, Tensor(a!) LD, Tensor(b!) pivots, Tensor(c!) info) -> (Tensor(a!) LD, Tensor(b!) pivots, Tensor(c!) info)
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> linalg_ldl_factor_ex_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & LD, at::Tensor & pivots, at::Tensor & info, const at::Tensor & self, bool hermitian=false, bool check_errors=false) {
+        return at::_ops::linalg_ldl_factor_ex_out::redispatch(dispatchKeySet, self, hermitian, check_errors, LD, pivots, info);
+    }
+    
+    // aten::linalg_ldl_factor_ex.out(Tensor self, *, bool hermitian=False, bool check_errors=False, Tensor(a!) LD, Tensor(b!) pivots, Tensor(c!) info) -> (Tensor(a!) LD, Tensor(b!) pivots, Tensor(c!) info)
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> linalg_ldl_factor_ex_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool hermitian, bool check_errors, at::Tensor & LD, at::Tensor & pivots, at::Tensor & info) {
+        return at::_ops::linalg_ldl_factor_ex_out::redispatch(dispatchKeySet, self, hermitian, check_errors, LD, pivots, info);
+    }
+    
+    // aten::linalg_ldl_factor(Tensor self, *, bool hermitian=False) -> (Tensor LD, Tensor pivots)
+    inline ::std::tuple<at::Tensor,at::Tensor> linalg_ldl_factor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool hermitian=false) {
+        return at::_ops::linalg_ldl_factor::redispatch(dispatchKeySet, self, hermitian);
+    }
+    
+    // aten::linalg_ldl_factor.out(Tensor self, *, bool hermitian=False, Tensor(a!) LD, Tensor(b!) pivots) -> (Tensor(a!) LD, Tensor(b!) pivots)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> linalg_ldl_factor_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & LD, at::Tensor & pivots, const at::Tensor & self, bool hermitian=false) {
+        return at::_ops::linalg_ldl_factor_out::redispatch(dispatchKeySet, self, hermitian, LD, pivots);
+    }
+    
+    // aten::linalg_ldl_factor.out(Tensor self, *, bool hermitian=False, Tensor(a!) LD, Tensor(b!) pivots) -> (Tensor(a!) LD, Tensor(b!) pivots)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> linalg_ldl_factor_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool hermitian, at::Tensor & LD, at::Tensor & pivots) {
+        return at::_ops::linalg_ldl_factor_out::redispatch(dispatchKeySet, self, hermitian, LD, pivots);
+    }
+    
+    // aten::linalg_ldl_solve(Tensor LD, Tensor pivots, Tensor B, *, bool hermitian=False) -> Tensor
+    inline at::Tensor linalg_ldl_solve(c10::DispatchKeySet dispatchKeySet, const at::Tensor & LD, const at::Tensor & pivots, const at::Tensor & B, bool hermitian=false) {
+        return at::_ops::linalg_ldl_solve::redispatch(dispatchKeySet, LD, pivots, B, hermitian);
+    }
+    
+    // aten::linalg_ldl_solve.out(Tensor LD, Tensor pivots, Tensor B, *, bool hermitian=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_ldl_solve_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & LD, const at::Tensor & pivots, const at::Tensor & B, bool hermitian=false) {
+        return at::_ops::linalg_ldl_solve_out::redispatch(dispatchKeySet, LD, pivots, B, hermitian, out);
+    }
+    
+    // aten::linalg_ldl_solve.out(Tensor LD, Tensor pivots, Tensor B, *, bool hermitian=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_ldl_solve_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & LD, const at::Tensor & pivots, const at::Tensor & B, bool hermitian, at::Tensor & out) {
+        return at::_ops::linalg_ldl_solve_out::redispatch(dispatchKeySet, LD, pivots, B, hermitian, out);
+    }
+    
+    // aten::linalg_lstsq(Tensor self, Tensor b, float? rcond=None, *, str? driver=None) -> (Tensor solution, Tensor residuals, Tensor rank, Tensor singular_values)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor> linalg_lstsq(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & b, c10::optional<double> rcond=c10::nullopt, c10::optional<c10::string_view> driver=c10::nullopt) {
+        return at::_ops::linalg_lstsq::redispatch(dispatchKeySet, self, b, rcond, driver);
+    }
+    
+    // aten::linalg_lstsq.out(Tensor self, Tensor b, float? rcond=None, *, str? driver=None, Tensor(a!) solution, Tensor(b!) residuals, Tensor(c!) rank, Tensor(d!) singular_values) -> (Tensor(a!) solution, Tensor(b!) residuals, Tensor(c!) rank, Tensor(d!) singular_values)
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> linalg_lstsq_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & solution, at::Tensor & residuals, at::Tensor & rank, at::Tensor & singular_values, const at::Tensor & self, const at::Tensor & b, c10::optional<double> rcond=c10::nullopt, c10::optional<c10::string_view> driver=c10::nullopt) {
+        return at::_ops::linalg_lstsq_out::redispatch(dispatchKeySet, self, b, rcond, driver, solution, residuals, rank, singular_values);
+    }
+    
+    // aten::linalg_lstsq.out(Tensor self, Tensor b, float? rcond=None, *, str? driver=None, Tensor(a!) solution, Tensor(b!) residuals, Tensor(c!) rank, Tensor(d!) singular_values) -> (Tensor(a!) solution, Tensor(b!) residuals, Tensor(c!) rank, Tensor(d!) singular_values)
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> linalg_lstsq_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & b, c10::optional<double> rcond, c10::optional<c10::string_view> driver, at::Tensor & solution, at::Tensor & residuals, at::Tensor & rank, at::Tensor & singular_values) {
+        return at::_ops::linalg_lstsq_out::redispatch(dispatchKeySet, self, b, rcond, driver, solution, residuals, rank, singular_values);
+    }
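+    
+    // Illustrative usage sketch (not part of the generated API; A and b are placeholder tensors):
+    // the schema's optional float/str arguments surface here as c10::optional<double> and
+    // c10::optional<c10::string_view>, e.g.
+    //
+    //   auto [solution, residuals, rank, singular_values] =
+    //       at::linalg_lstsq(A, b, /*rcond=*/c10::nullopt, /*driver=*/"gelsd");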
+    
+    // aten::linalg_matmul(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor linalg_matmul(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::linalg_matmul::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::linalg_matmul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_matmul_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::linalg_matmul_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::linalg_matmul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_matmul_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::linalg_matmul_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::linalg_vecdot(Tensor x, Tensor y, *, int dim=-1) -> Tensor
+    inline at::Tensor linalg_vecdot(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & y, int64_t dim=-1) {
+        return at::_ops::linalg_vecdot::redispatch(dispatchKeySet, x, y, dim);
+    }
+    
+    // aten::linalg_vecdot.out(Tensor x, Tensor y, *, int dim=-1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_vecdot_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & x, const at::Tensor & y, int64_t dim=-1) {
+        return at::_ops::linalg_vecdot_out::redispatch(dispatchKeySet, x, y, dim, out);
+    }
+    
+    // aten::linalg_vecdot.out(Tensor x, Tensor y, *, int dim=-1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_vecdot_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & y, int64_t dim, at::Tensor & out) {
+        return at::_ops::linalg_vecdot_out::redispatch(dispatchKeySet, x, y, dim, out);
+    }
+    
+    // aten::linalg_matrix_exp(Tensor self) -> Tensor
+    inline at::Tensor linalg_matrix_exp(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::linalg_matrix_exp::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_linalg_slogdet(Tensor A) -> (Tensor sign, Tensor logabsdet, Tensor LU, Tensor pivots)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor> _linalg_slogdet(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A) {
+        return at::_ops::_linalg_slogdet::redispatch(dispatchKeySet, A);
+    }
+    
+    // aten::_linalg_slogdet.sign(Tensor A, *, Tensor(a!) sign, Tensor(b!) logabsdet, Tensor(c!) LU, Tensor(d!) pivots) -> (Tensor(a!) sign, Tensor(b!) logabsdet, Tensor(c!) LU, Tensor(d!) pivots)
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> _linalg_slogdet_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & sign, at::Tensor & logabsdet, at::Tensor & LU, at::Tensor & pivots, const at::Tensor & A) {
+        return at::_ops::_linalg_slogdet_sign::redispatch(dispatchKeySet, A, sign, logabsdet, LU, pivots);
+    }
+    
+    // aten::_linalg_slogdet.sign(Tensor A, *, Tensor(a!) sign, Tensor(b!) logabsdet, Tensor(c!) LU, Tensor(d!) pivots) -> (Tensor(a!) sign, Tensor(b!) logabsdet, Tensor(c!) LU, Tensor(d!) pivots)
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> _linalg_slogdet_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, at::Tensor & sign, at::Tensor & logabsdet, at::Tensor & LU, at::Tensor & pivots) {
+        return at::_ops::_linalg_slogdet_sign::redispatch(dispatchKeySet, A, sign, logabsdet, LU, pivots);
+    }
+    
+    // aten::linalg_slogdet(Tensor A) -> (Tensor sign, Tensor logabsdet)
+    inline ::std::tuple<at::Tensor,at::Tensor> linalg_slogdet(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A) {
+        return at::_ops::linalg_slogdet::redispatch(dispatchKeySet, A);
+    }
+    
+    // aten::linalg_slogdet.out(Tensor A, *, Tensor(a!) sign, Tensor(b!) logabsdet) -> (Tensor(a!) sign, Tensor(b!) logabsdet)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> linalg_slogdet_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & sign, at::Tensor & logabsdet, const at::Tensor & A) {
+        return at::_ops::linalg_slogdet_out::redispatch(dispatchKeySet, A, sign, logabsdet);
+    }
+    
+    // aten::linalg_slogdet.out(Tensor A, *, Tensor(a!) sign, Tensor(b!) logabsdet) -> (Tensor(a!) sign, Tensor(b!) logabsdet)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> linalg_slogdet_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, at::Tensor & sign, at::Tensor & logabsdet) {
+        return at::_ops::linalg_slogdet_out::redispatch(dispatchKeySet, A, sign, logabsdet);
+    }
+    
+    // aten::slogdet(Tensor self) -> (Tensor sign, Tensor logabsdet)
+    inline ::std::tuple<at::Tensor,at::Tensor> slogdet(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::slogdet::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::slogdet.out(Tensor self, *, Tensor(a!) sign, Tensor(b!) logabsdet) -> (Tensor(a!) sign, Tensor(b!) logabsdet)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> slogdet_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & sign, at::Tensor & logabsdet, const at::Tensor & self) {
+        return at::_ops::slogdet_out::redispatch(dispatchKeySet, self, sign, logabsdet);
+    }
+    
+    // aten::slogdet.out(Tensor self, *, Tensor(a!) sign, Tensor(b!) logabsdet) -> (Tensor(a!) sign, Tensor(b!) logabsdet)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> slogdet_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & sign, at::Tensor & logabsdet) {
+        return at::_ops::slogdet_out::redispatch(dispatchKeySet, self, sign, logabsdet);
+    }
+    
+    // aten::logdet(Tensor self) -> Tensor
+    inline at::Tensor logdet(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::logdet::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::linalg_eig(Tensor self) -> (Tensor eigenvalues, Tensor eigenvectors)
+    inline ::std::tuple<at::Tensor,at::Tensor> linalg_eig(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::linalg_eig::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::linalg_eig.out(Tensor self, *, Tensor(a!) eigenvalues, Tensor(b!) eigenvectors) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> linalg_eig_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & eigenvalues, at::Tensor & eigenvectors, const at::Tensor & self) {
+        return at::_ops::linalg_eig_out::redispatch(dispatchKeySet, self, eigenvalues, eigenvectors);
+    }
+    
+    // aten::linalg_eig.out(Tensor self, *, Tensor(a!) eigenvalues, Tensor(b!) eigenvectors) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> linalg_eig_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & eigenvalues, at::Tensor & eigenvectors) {
+        return at::_ops::linalg_eig_out::redispatch(dispatchKeySet, self, eigenvalues, eigenvectors);
+    }
+    
+    // aten::_linalg_eigvals(Tensor self) -> Tensor
+    inline at::Tensor _linalg_eigvals(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_linalg_eigvals::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::linalg_eigvals(Tensor self) -> Tensor
+    inline at::Tensor linalg_eigvals(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::linalg_eigvals::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::linalg_eigvals.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_eigvals_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::linalg_eigvals_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::linalg_eigvals.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_eigvals_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::linalg_eigvals_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_linalg_eigh(Tensor A, str UPLO="L", bool compute_v=True) -> (Tensor eigenvalues, Tensor eigenvectors)
+    inline ::std::tuple<at::Tensor,at::Tensor> _linalg_eigh(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, c10::string_view UPLO="L", bool compute_v=true) {
+        return at::_ops::_linalg_eigh::redispatch(dispatchKeySet, A, UPLO, compute_v);
+    }
+    
+    // aten::_linalg_eigh.eigenvalues(Tensor A, str UPLO="L", bool compute_v=True, *, Tensor(a!) eigenvalues, Tensor(b!) eigenvectors) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> _linalg_eigh_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & eigenvalues, at::Tensor & eigenvectors, const at::Tensor & A, c10::string_view UPLO="L", bool compute_v=true) {
+        return at::_ops::_linalg_eigh_eigenvalues::redispatch(dispatchKeySet, A, UPLO, compute_v, eigenvalues, eigenvectors);
+    }
+    
+    // aten::_linalg_eigh.eigenvalues(Tensor A, str UPLO="L", bool compute_v=True, *, Tensor(a!) eigenvalues, Tensor(b!) eigenvectors) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> _linalg_eigh_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, c10::string_view UPLO, bool compute_v, at::Tensor & eigenvalues, at::Tensor & eigenvectors) {
+        return at::_ops::_linalg_eigh_eigenvalues::redispatch(dispatchKeySet, A, UPLO, compute_v, eigenvalues, eigenvectors);
+    }
+    
+    // aten::linalg_eigh(Tensor self, str UPLO="L") -> (Tensor eigenvalues, Tensor eigenvectors)
+    inline ::std::tuple<at::Tensor,at::Tensor> linalg_eigh(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::string_view UPLO="L") {
+        return at::_ops::linalg_eigh::redispatch(dispatchKeySet, self, UPLO);
+    }
+    
+    // aten::linalg_eigh.eigvals(Tensor self, str UPLO="L", *, Tensor(a!) eigvals, Tensor(b!) eigvecs) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> linalg_eigh_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & eigvals, at::Tensor & eigvecs, const at::Tensor & self, c10::string_view UPLO="L") {
+        return at::_ops::linalg_eigh_eigvals::redispatch(dispatchKeySet, self, UPLO, eigvals, eigvecs);
+    }
+    
+    // aten::linalg_eigh.eigvals(Tensor self, str UPLO="L", *, Tensor(a!) eigvals, Tensor(b!) eigvecs) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> linalg_eigh_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::string_view UPLO, at::Tensor & eigvals, at::Tensor & eigvecs) {
+        return at::_ops::linalg_eigh_eigvals::redispatch(dispatchKeySet, self, UPLO, eigvals, eigvecs);
+    }
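+    
+    // Illustrative usage sketch (not part of the generated API; A is a placeholder Hermitian tensor):
+    // UPLO selects which triangle of the input is read ("L" or "U"), e.g.
+    //
+    //   auto [eigenvalues, eigenvectors] = at::linalg_eigh(A, /*UPLO=*/"L");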
+    
+    // aten::linalg_eigvalsh(Tensor self, str UPLO="L") -> Tensor
+    inline at::Tensor linalg_eigvalsh(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::string_view UPLO="L") {
+        return at::_ops::linalg_eigvalsh::redispatch(dispatchKeySet, self, UPLO);
+    }
+    
+    // aten::linalg_eigvalsh.out(Tensor self, str UPLO="L", *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_eigvalsh_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::string_view UPLO="L") {
+        return at::_ops::linalg_eigvalsh_out::redispatch(dispatchKeySet, self, UPLO, out);
+    }
+    
+    // aten::linalg_eigvalsh.out(Tensor self, str UPLO="L", *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_eigvalsh_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::string_view UPLO, at::Tensor & out) {
+        return at::_ops::linalg_eigvalsh_out::redispatch(dispatchKeySet, self, UPLO, out);
+    }
+    
+    // aten::linalg_householder_product(Tensor input, Tensor tau) -> Tensor
+    inline at::Tensor linalg_householder_product(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & tau) {
+        return at::_ops::linalg_householder_product::redispatch(dispatchKeySet, input, tau);
+    }
+    
+    // aten::linalg_householder_product.out(Tensor input, Tensor tau, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_householder_product_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & input, const at::Tensor & tau) {
+        return at::_ops::linalg_householder_product_out::redispatch(dispatchKeySet, input, tau, out);
+    }
+    
+    // aten::linalg_householder_product.out(Tensor input, Tensor tau, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_householder_product_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & tau, at::Tensor & out) {
+        return at::_ops::linalg_householder_product_out::redispatch(dispatchKeySet, input, tau, out);
+    }
+    
+    // aten::linalg_inv_ex(Tensor A, *, bool check_errors=False) -> (Tensor inverse, Tensor info)
+    inline ::std::tuple<at::Tensor,at::Tensor> linalg_inv_ex(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, bool check_errors=false) {
+        return at::_ops::linalg_inv_ex::redispatch(dispatchKeySet, A, check_errors);
+    }
+    
+    // aten::linalg_inv_ex.inverse(Tensor A, *, bool check_errors=False, Tensor(a!) inverse, Tensor(b!) info) -> (Tensor(a!) inverse, Tensor(b!) info)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> linalg_inv_ex_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & inverse, at::Tensor & info, const at::Tensor & A, bool check_errors=false) {
+        return at::_ops::linalg_inv_ex_inverse::redispatch(dispatchKeySet, A, check_errors, inverse, info);
+    }
+    
+    // aten::linalg_inv_ex.inverse(Tensor A, *, bool check_errors=False, Tensor(a!) inverse, Tensor(b!) info) -> (Tensor(a!) inverse, Tensor(b!) info)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> linalg_inv_ex_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, bool check_errors, at::Tensor & inverse, at::Tensor & info) {
+        return at::_ops::linalg_inv_ex_inverse::redispatch(dispatchKeySet, A, check_errors, inverse, info);
+    }
+    
+    // aten::linalg_inv(Tensor A) -> Tensor
+    inline at::Tensor linalg_inv(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A) {
+        return at::_ops::linalg_inv::redispatch(dispatchKeySet, A);
+    }
+    
+    // aten::linalg_inv.out(Tensor A, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_inv_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & A) {
+        return at::_ops::linalg_inv_out::redispatch(dispatchKeySet, A, out);
+    }
+    
+    // aten::linalg_inv.out(Tensor A, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_inv_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, at::Tensor & out) {
+        return at::_ops::linalg_inv_out::redispatch(dispatchKeySet, A, out);
+    }
+    
+    // aten::inverse(Tensor self) -> Tensor
+    inline at::Tensor inverse(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::inverse::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::inverse.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & inverse_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::inverse_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::inverse.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & inverse_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::inverse_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::inner(Tensor self, Tensor other) -> Tensor
+    inline at::Tensor inner(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::inner::redispatch(dispatchKeySet, self, other);
+    }
+    
+    // aten::inner.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & inner_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::inner_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::inner.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & inner_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::inner_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::outer(Tensor self, Tensor vec2) -> Tensor
+    inline at::Tensor outer(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & vec2) {
+        return at::_ops::outer::redispatch(dispatchKeySet, self, vec2);
+    }
+    
+    // aten::outer.out(Tensor self, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & outer_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & vec2) {
+        return at::_ops::outer_out::redispatch(dispatchKeySet, self, vec2, out);
+    }
+    
+    // aten::outer.out(Tensor self, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & outer_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & vec2, at::Tensor & out) {
+        return at::_ops::outer_out::redispatch(dispatchKeySet, self, vec2, out);
+    }
+    
+    // aten::ger(Tensor self, Tensor vec2) -> Tensor
+    inline at::Tensor ger(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & vec2) {
+        return at::_ops::ger::redispatch(dispatchKeySet, self, vec2);
+    }
+    
+    // aten::ger.out(Tensor self, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & ger_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & vec2) {
+        return at::_ops::ger_out::redispatch(dispatchKeySet, self, vec2, out);
+    }
+    
+    // aten::ger.out(Tensor self, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & ger_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & vec2, at::Tensor & out) {
+        return at::_ops::ger_out::redispatch(dispatchKeySet, self, vec2, out);
+    }
+    
+    // aten::linalg_norm(Tensor self, Scalar? ord=None, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor linalg_norm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Scalar> & ord=c10::nullopt, at::OptionalIntArrayRef dim=c10::nullopt, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::linalg_norm::redispatch(dispatchKeySet, self, ord, dim, keepdim, dtype);
+    }
+    
+    // aten::linalg_norm.ord_str(Tensor self, str ord, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor linalg_norm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::string_view ord, at::OptionalIntArrayRef dim=c10::nullopt, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::linalg_norm_ord_str::redispatch(dispatchKeySet, self, ord, dim, keepdim, dtype);
+    }
+    
+    // aten::linalg_norm.out(Tensor self, Scalar? ord=None, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_norm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const c10::optional<at::Scalar> & ord=c10::nullopt, at::OptionalIntArrayRef dim=c10::nullopt, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::linalg_norm_out::redispatch(dispatchKeySet, self, ord, dim, keepdim, dtype, out);
+    }
+    
+    // aten::linalg_norm.out(Tensor self, Scalar? ord=None, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_norm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Scalar> & ord, at::OptionalIntArrayRef dim, bool keepdim, c10::optional<at::ScalarType> dtype, at::Tensor & out) {
+        return at::_ops::linalg_norm_out::redispatch(dispatchKeySet, self, ord, dim, keepdim, dtype, out);
+    }
+    
+    // aten::linalg_norm.ord_str_out(Tensor self, str ord, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_norm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::string_view ord, at::OptionalIntArrayRef dim=c10::nullopt, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::linalg_norm_ord_str_out::redispatch(dispatchKeySet, self, ord, dim, keepdim, dtype, out);
+    }
+    
+    // aten::linalg_norm.ord_str_out(Tensor self, str ord, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_norm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::string_view ord, at::OptionalIntArrayRef dim, bool keepdim, c10::optional<at::ScalarType> dtype, at::Tensor & out) {
+        return at::_ops::linalg_norm_ord_str_out::redispatch(dispatchKeySet, self, ord, dim, keepdim, dtype, out);
+    }
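+    
+    // Illustrative usage sketch (not part of the generated API; x is a placeholder tensor): the
+    // Scalar-ord overloads bind aten::linalg_norm, while the string-ord overloads bind the
+    // linalg_norm.ord_str variant, e.g.
+    //
+    //   at::Tensor n2  = at::linalg_norm(x);          // ord=None -> 2-norm / Frobenius norm
+    //   at::Tensor nuc = at::linalg_norm(x, "nuc");   // string ords route to .ord_str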
+    
+    // aten::linalg_vector_norm(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor linalg_vector_norm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & ord=2, at::OptionalIntArrayRef dim=c10::nullopt, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::linalg_vector_norm::redispatch(dispatchKeySet, self, ord, dim, keepdim, dtype);
+    }
+    
+    // aten::linalg_vector_norm.out(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_vector_norm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & ord=2, at::OptionalIntArrayRef dim=c10::nullopt, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::linalg_vector_norm_out::redispatch(dispatchKeySet, self, ord, dim, keepdim, dtype, out);
+    }
+    
+    // aten::linalg_vector_norm.out(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_vector_norm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & ord, at::OptionalIntArrayRef dim, bool keepdim, c10::optional<at::ScalarType> dtype, at::Tensor & out) {
+        return at::_ops::linalg_vector_norm_out::redispatch(dispatchKeySet, self, ord, dim, keepdim, dtype, out);
+    }
+    
+    // aten::linalg_matrix_norm(Tensor self, Scalar ord, int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor linalg_matrix_norm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & ord, at::IntArrayRef dim={-2,-1}, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::linalg_matrix_norm::redispatch(dispatchKeySet, self, ord, dim, keepdim, dtype);
+    }
+    
+    // aten::linalg_matrix_norm.out(Tensor self, Scalar ord, int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_matrix_norm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & ord, at::IntArrayRef dim={-2,-1}, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::linalg_matrix_norm_out::redispatch(dispatchKeySet, self, ord, dim, keepdim, dtype, out);
+    }
+    
+    // aten::linalg_matrix_norm.out(Tensor self, Scalar ord, int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_matrix_norm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & ord, at::IntArrayRef dim, bool keepdim, c10::optional<at::ScalarType> dtype, at::Tensor & out) {
+        return at::_ops::linalg_matrix_norm_out::redispatch(dispatchKeySet, self, ord, dim, keepdim, dtype, out);
+    }
+    
+    // aten::linalg_matrix_norm.str_ord(Tensor self, str ord='fro', int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+    inline at::Tensor linalg_matrix_norm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::string_view ord="fro", at::IntArrayRef dim={-2,-1}, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::linalg_matrix_norm_str_ord::redispatch(dispatchKeySet, self, ord, dim, keepdim, dtype);
+    }
+    
+    // aten::linalg_matrix_norm.str_ord_out(Tensor self, str ord='fro', int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_matrix_norm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::string_view ord="fro", at::IntArrayRef dim={-2,-1}, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::linalg_matrix_norm_str_ord_out::redispatch(dispatchKeySet, self, ord, dim, keepdim, dtype, out);
+    }
+    
+    // aten::linalg_matrix_norm.str_ord_out(Tensor self, str ord='fro', int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_matrix_norm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::string_view ord, at::IntArrayRef dim, bool keepdim, c10::optional<at::ScalarType> dtype, at::Tensor & out) {
+        return at::_ops::linalg_matrix_norm_str_ord_out::redispatch(dispatchKeySet, self, ord, dim, keepdim, dtype, out);
+    }
+    
+    // aten::_linalg_svd(Tensor A, bool full_matrices=False, bool compute_uv=True, *, str? driver=None) -> (Tensor U, Tensor S, Tensor Vh)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _linalg_svd(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, bool full_matrices=false, bool compute_uv=true, c10::optional<c10::string_view> driver=c10::nullopt) {
+        return at::_ops::_linalg_svd::redispatch(dispatchKeySet, A, full_matrices, compute_uv, driver);
+    }
+    
+    // aten::_linalg_svd.U(Tensor A, bool full_matrices=False, bool compute_uv=True, *, str? driver=None, Tensor(a!) U, Tensor(b!) S, Tensor(c!) Vh) -> (Tensor(a!) U, Tensor(b!) S, Tensor(c!) Vh)
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _linalg_svd_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & U, at::Tensor & S, at::Tensor & Vh, const at::Tensor & A, bool full_matrices=false, bool compute_uv=true, c10::optional<c10::string_view> driver=c10::nullopt) {
+        return at::_ops::_linalg_svd_U::redispatch(dispatchKeySet, A, full_matrices, compute_uv, driver, U, S, Vh);
+    }
+    
+    // aten::_linalg_svd.U(Tensor A, bool full_matrices=False, bool compute_uv=True, *, str? driver=None, Tensor(a!) U, Tensor(b!) S, Tensor(c!) Vh) -> (Tensor(a!) U, Tensor(b!) S, Tensor(c!) Vh)
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _linalg_svd_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, bool full_matrices, bool compute_uv, c10::optional<c10::string_view> driver, at::Tensor & U, at::Tensor & S, at::Tensor & Vh) {
+        return at::_ops::_linalg_svd_U::redispatch(dispatchKeySet, A, full_matrices, compute_uv, driver, U, S, Vh);
+    }
+    
+    // aten::linalg_svd(Tensor A, bool full_matrices=True, *, str? driver=None) -> (Tensor U, Tensor S, Tensor Vh)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> linalg_svd(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, bool full_matrices=true, c10::optional<c10::string_view> driver=c10::nullopt) {
+        return at::_ops::linalg_svd::redispatch(dispatchKeySet, A, full_matrices, driver);
+    }
+    
+    // aten::linalg_svd.U(Tensor A, bool full_matrices=True, *, str? driver=None, Tensor(a!) U, Tensor(b!) S, Tensor(c!) Vh) -> (Tensor(a!) U, Tensor(b!) S, Tensor(c!) Vh)
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> linalg_svd_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & U, at::Tensor & S, at::Tensor & Vh, const at::Tensor & A, bool full_matrices=true, c10::optional<c10::string_view> driver=c10::nullopt) {
+        return at::_ops::linalg_svd_U::redispatch(dispatchKeySet, A, full_matrices, driver, U, S, Vh);
+    }
+    
+    // aten::linalg_svd.U(Tensor A, bool full_matrices=True, *, str? driver=None, Tensor(a!) U, Tensor(b!) S, Tensor(c!) Vh) -> (Tensor(a!) U, Tensor(b!) S, Tensor(c!) Vh)
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> linalg_svd_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, bool full_matrices, c10::optional<c10::string_view> driver, at::Tensor & U, at::Tensor & S, at::Tensor & Vh) {
+        return at::_ops::linalg_svd_U::redispatch(dispatchKeySet, A, full_matrices, driver, U, S, Vh);
+    }
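+    
+    // Illustrative usage sketch (not part of the generated API; A is a placeholder tensor):
+    // linalg_svd returns (U, S, Vh), while _linalg_svd additionally exposes compute_uv for
+    // internal callers, e.g.
+    //
+    //   auto [U, S, Vh] = at::linalg_svd(A, /*full_matrices=*/false);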
+    
+    // aten::linalg_svdvals(Tensor A, *, str? driver=None) -> Tensor
+    inline at::Tensor linalg_svdvals(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, c10::optional<c10::string_view> driver=c10::nullopt) {
+        return at::_ops::linalg_svdvals::redispatch(dispatchKeySet, A, driver);
+    }
+    
+    // aten::linalg_svdvals.out(Tensor A, *, str? driver=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_svdvals_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & A, c10::optional<c10::string_view> driver=c10::nullopt) {
+        return at::_ops::linalg_svdvals_out::redispatch(dispatchKeySet, A, driver, out);
+    }
+    
+    // aten::linalg_svdvals.out(Tensor A, *, str? driver=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_svdvals_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, c10::optional<c10::string_view> driver, at::Tensor & out) {
+        return at::_ops::linalg_svdvals_out::redispatch(dispatchKeySet, A, driver, out);
+    }
+    
+    // aten::linalg_cond(Tensor self, Scalar? p=None) -> Tensor
+    inline at::Tensor linalg_cond(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Scalar> & p=c10::nullopt) {
+        return at::_ops::linalg_cond::redispatch(dispatchKeySet, self, p);
+    }
+    
+    // aten::linalg_cond.out(Tensor self, Scalar? p=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_cond_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const c10::optional<at::Scalar> & p=c10::nullopt) {
+        return at::_ops::linalg_cond_out::redispatch(dispatchKeySet, self, p, out);
+    }
+    
+    // aten::linalg_cond.out(Tensor self, Scalar? p=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_cond_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Scalar> & p, at::Tensor & out) {
+        return at::_ops::linalg_cond_out::redispatch(dispatchKeySet, self, p, out);
+    }
+    
+    // aten::linalg_cond.p_str(Tensor self, str p) -> Tensor
+    inline at::Tensor linalg_cond(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::string_view p) {
+        return at::_ops::linalg_cond_p_str::redispatch(dispatchKeySet, self, p);
+    }
+    
+    // aten::linalg_cond.p_str_out(Tensor self, str p, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_cond_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::string_view p) {
+        return at::_ops::linalg_cond_p_str_out::redispatch(dispatchKeySet, self, p, out);
+    }
+    
+    // aten::linalg_cond.p_str_out(Tensor self, str p, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_cond_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::string_view p, at::Tensor & out) {
+        return at::_ops::linalg_cond_p_str_out::redispatch(dispatchKeySet, self, p, out);
+    }
+    
+    // aten::linalg_pinv.atol_rtol_tensor(Tensor self, *, Tensor? atol=None, Tensor? rtol=None, bool hermitian=False) -> Tensor
+    inline at::Tensor linalg_pinv(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Tensor> & atol={}, const c10::optional<at::Tensor> & rtol={}, bool hermitian=false) {
+        return at::_ops::linalg_pinv_atol_rtol_tensor::redispatch(dispatchKeySet, self, atol, rtol, hermitian);
+    }
+    
+    // aten::linalg_pinv.atol_rtol_tensor_out(Tensor self, *, Tensor? atol=None, Tensor? rtol=None, bool hermitian=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_pinv_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const c10::optional<at::Tensor> & atol={}, const c10::optional<at::Tensor> & rtol={}, bool hermitian=false) {
+        return at::_ops::linalg_pinv_atol_rtol_tensor_out::redispatch(dispatchKeySet, self, atol, rtol, hermitian, out);
+    }
+    
+    // aten::linalg_pinv.atol_rtol_tensor_out(Tensor self, *, Tensor? atol=None, Tensor? rtol=None, bool hermitian=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_pinv_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Tensor> & atol, const c10::optional<at::Tensor> & rtol, bool hermitian, at::Tensor & out) {
+        return at::_ops::linalg_pinv_atol_rtol_tensor_out::redispatch(dispatchKeySet, self, atol, rtol, hermitian, out);
+    }
+    
+    // aten::linalg_pinv.atol_rtol_float(Tensor self, *, float? atol=None, float? rtol=None, bool hermitian=False) -> Tensor
+    inline at::Tensor linalg_pinv(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<double> atol, c10::optional<double> rtol, bool hermitian=false) {
+        return at::_ops::linalg_pinv_atol_rtol_float::redispatch(dispatchKeySet, self, atol, rtol, hermitian);
+    }
+    
+    // aten::linalg_pinv.atol_rtol_float_out(Tensor self, *, float? atol=None, float? rtol=None, bool hermitian=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_pinv_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<double> atol, c10::optional<double> rtol, bool hermitian=false) {
+        return at::_ops::linalg_pinv_atol_rtol_float_out::redispatch(dispatchKeySet, self, atol, rtol, hermitian, out);
+    }
+    
+    // aten::linalg_pinv.atol_rtol_float_out(Tensor self, *, float? atol=None, float? rtol=None, bool hermitian=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_pinv_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<double> atol, c10::optional<double> rtol, bool hermitian, at::Tensor & out) {
+        return at::_ops::linalg_pinv_atol_rtol_float_out::redispatch(dispatchKeySet, self, atol, rtol, hermitian, out);
+    }
+    
+    // aten::linalg_pinv(Tensor self, float rcond, bool hermitian=False) -> Tensor
+    inline at::Tensor linalg_pinv(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double rcond, bool hermitian=false) {
+        return at::_ops::linalg_pinv::redispatch(dispatchKeySet, self, rcond, hermitian);
+    }
+    
+    // aten::linalg_pinv.rcond_tensor(Tensor self, Tensor rcond, bool hermitian=False) -> Tensor
+    inline at::Tensor linalg_pinv(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & rcond, bool hermitian=false) {
+        return at::_ops::linalg_pinv_rcond_tensor::redispatch(dispatchKeySet, self, rcond, hermitian);
+    }
+    
+    // aten::linalg_pinv.out(Tensor self, float rcond, bool hermitian=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_pinv_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, double rcond, bool hermitian=false) {
+        return at::_ops::linalg_pinv_out::redispatch(dispatchKeySet, self, rcond, hermitian, out);
+    }
+    
+    // aten::linalg_pinv.out(Tensor self, float rcond, bool hermitian=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_pinv_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double rcond, bool hermitian, at::Tensor & out) {
+        return at::_ops::linalg_pinv_out::redispatch(dispatchKeySet, self, rcond, hermitian, out);
+    }
+    
+    // aten::linalg_pinv.out_rcond_tensor(Tensor self, Tensor rcond, bool hermitian=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_pinv_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & rcond, bool hermitian=false) {
+        return at::_ops::linalg_pinv_out_rcond_tensor::redispatch(dispatchKeySet, self, rcond, hermitian, out);
+    }
+    
+    // aten::linalg_pinv.out_rcond_tensor(Tensor self, Tensor rcond, bool hermitian=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_pinv_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & rcond, bool hermitian, at::Tensor & out) {
+        return at::_ops::linalg_pinv_out_rcond_tensor::redispatch(dispatchKeySet, self, rcond, hermitian, out);
+    }
+    
+    // aten::_linalg_solve_ex(Tensor A, Tensor B, *, bool left=True, bool check_errors=False) -> (Tensor result, Tensor LU, Tensor pivots, Tensor info)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor> _linalg_solve_ex(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, const at::Tensor & B, bool left=true, bool check_errors=false) {
+        return at::_ops::_linalg_solve_ex::redispatch(dispatchKeySet, A, B, left, check_errors);
+    }
+    
+    // aten::_linalg_solve_ex.result(Tensor A, Tensor B, *, bool left=True, bool check_errors=False, Tensor(a!) result, Tensor(b!) LU, Tensor(c!) pivots, Tensor(d!) info) -> (Tensor(a!) result, Tensor(b!) LU, Tensor(c!) pivots, Tensor(d!) info)
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> _linalg_solve_ex_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & result, at::Tensor & LU, at::Tensor & pivots, at::Tensor & info, const at::Tensor & A, const at::Tensor & B, bool left=true, bool check_errors=false) {
+        return at::_ops::_linalg_solve_ex_result::redispatch(dispatchKeySet, A, B, left, check_errors, result, LU, pivots, info);
+    }
+    
+    // aten::_linalg_solve_ex.result(Tensor A, Tensor B, *, bool left=True, bool check_errors=False, Tensor(a!) result, Tensor(b!) LU, Tensor(c!) pivots, Tensor(d!) info) -> (Tensor(a!) result, Tensor(b!) LU, Tensor(c!) pivots, Tensor(d!) info)
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> _linalg_solve_ex_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, const at::Tensor & B, bool left, bool check_errors, at::Tensor & result, at::Tensor & LU, at::Tensor & pivots, at::Tensor & info) {
+        return at::_ops::_linalg_solve_ex_result::redispatch(dispatchKeySet, A, B, left, check_errors, result, LU, pivots, info);
+    }
+    
+    // aten::linalg_solve_ex(Tensor A, Tensor B, *, bool left=True, bool check_errors=False) -> (Tensor result, Tensor info)
+    inline ::std::tuple<at::Tensor,at::Tensor> linalg_solve_ex(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, const at::Tensor & B, bool left=true, bool check_errors=false) {
+        return at::_ops::linalg_solve_ex::redispatch(dispatchKeySet, A, B, left, check_errors);
+    }
+    
+    // aten::linalg_solve_ex.out(Tensor A, Tensor B, *, bool left=True, bool check_errors=False, Tensor(a!) result, Tensor(b!) info) -> (Tensor(a!) result, Tensor(b!) info)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> linalg_solve_ex_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & result, at::Tensor & info, const at::Tensor & A, const at::Tensor & B, bool left=true, bool check_errors=false) {
+        return at::_ops::linalg_solve_ex_out::redispatch(dispatchKeySet, A, B, left, check_errors, result, info);
+    }
+    
+    // aten::linalg_solve_ex.out(Tensor A, Tensor B, *, bool left=True, bool check_errors=False, Tensor(a!) result, Tensor(b!) info) -> (Tensor(a!) result, Tensor(b!) info)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> linalg_solve_ex_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, const at::Tensor & B, bool left, bool check_errors, at::Tensor & result, at::Tensor & info) {
+        return at::_ops::linalg_solve_ex_out::redispatch(dispatchKeySet, A, B, left, check_errors, result, info);
+    }
+    
+    // aten::linalg_solve(Tensor A, Tensor B, *, bool left=True) -> Tensor
+    inline at::Tensor linalg_solve(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, const at::Tensor & B, bool left=true) {
+        return at::_ops::linalg_solve::redispatch(dispatchKeySet, A, B, left);
+    }
+    
+    // aten::linalg_solve.out(Tensor A, Tensor B, *, bool left=True, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_solve_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & A, const at::Tensor & B, bool left=true) {
+        return at::_ops::linalg_solve_out::redispatch(dispatchKeySet, A, B, left, out);
+    }
+    
+    // aten::linalg_solve.out(Tensor A, Tensor B, *, bool left=True, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_solve_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, const at::Tensor & B, bool left, at::Tensor & out) {
+        return at::_ops::linalg_solve_out::redispatch(dispatchKeySet, A, B, left, out);
+    }
+    
+    // aten::linalg_tensorinv(Tensor self, int ind=2) -> Tensor
+    inline at::Tensor linalg_tensorinv(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t ind=2) {
+        return at::_ops::linalg_tensorinv::redispatch(dispatchKeySet, self, ind);
+    }
+    
+    // aten::linalg_tensorinv.out(Tensor self, int ind=2, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_tensorinv_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t ind=2) {
+        return at::_ops::linalg_tensorinv_out::redispatch(dispatchKeySet, self, ind, out);
+    }
+    
+    // aten::linalg_tensorinv.out(Tensor self, int ind=2, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_tensorinv_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t ind, at::Tensor & out) {
+        return at::_ops::linalg_tensorinv_out::redispatch(dispatchKeySet, self, ind, out);
+    }
+    
+    // aten::linalg_tensorsolve(Tensor self, Tensor other, int[]? dims=None) -> Tensor
+    inline at::Tensor linalg_tensorsolve(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::OptionalIntArrayRef dims=c10::nullopt) {
+        return at::_ops::linalg_tensorsolve::redispatch(dispatchKeySet, self, other, dims);
+    }
+    
+    // aten::linalg_tensorsolve.out(Tensor self, Tensor other, int[]? dims=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_tensorsolve_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other, at::OptionalIntArrayRef dims=c10::nullopt) {
+        return at::_ops::linalg_tensorsolve_out::redispatch(dispatchKeySet, self, other, dims, out);
+    }
+    
+    // aten::linalg_tensorsolve.out(Tensor self, Tensor other, int[]? dims=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_tensorsolve_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::OptionalIntArrayRef dims, at::Tensor & out) {
+        return at::_ops::linalg_tensorsolve_out::redispatch(dispatchKeySet, self, other, dims, out);
+    }
+    
+    // aten::linalg_qr(Tensor A, str mode='reduced') -> (Tensor Q, Tensor R)
+    inline ::std::tuple<at::Tensor,at::Tensor> linalg_qr(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, c10::string_view mode="reduced") {
+        return at::_ops::linalg_qr::redispatch(dispatchKeySet, A, mode);
+    }
+    
+    // aten::linalg_qr.out(Tensor A, str mode='reduced', *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> linalg_qr_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & Q, at::Tensor & R, const at::Tensor & A, c10::string_view mode="reduced") {
+        return at::_ops::linalg_qr_out::redispatch(dispatchKeySet, A, mode, Q, R);
+    }
+    
+    // aten::linalg_qr.out(Tensor A, str mode='reduced', *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R)
+    inline ::std::tuple<at::Tensor &,at::Tensor &> linalg_qr_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, c10::string_view mode, at::Tensor & Q, at::Tensor & R) {
+        return at::_ops::linalg_qr_out::redispatch(dispatchKeySet, A, mode, Q, R);
+    }
+    
+    // aten::linalg_matrix_power(Tensor self, int n) -> Tensor
+    inline at::Tensor linalg_matrix_power(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t n) {
+        return at::_ops::linalg_matrix_power::redispatch(dispatchKeySet, self, n);
+    }
+    
+    // aten::linalg_matrix_power.out(Tensor self, int n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_matrix_power_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t n) {
+        return at::_ops::linalg_matrix_power_out::redispatch(dispatchKeySet, self, n, out);
+    }
+    
+    // aten::linalg_matrix_power.out(Tensor self, int n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_matrix_power_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t n, at::Tensor & out) {
+        return at::_ops::linalg_matrix_power_out::redispatch(dispatchKeySet, self, n, out);
+    }
+    
+    // aten::linalg_matrix_rank.atol_rtol_tensor(Tensor input, *, Tensor? atol=None, Tensor? rtol=None, bool hermitian=False) -> Tensor
+    inline at::Tensor linalg_matrix_rank(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const c10::optional<at::Tensor> & atol={}, const c10::optional<at::Tensor> & rtol={}, bool hermitian=false) {
+        return at::_ops::linalg_matrix_rank_atol_rtol_tensor::redispatch(dispatchKeySet, input, atol, rtol, hermitian);
+    }
+    
+    // aten::linalg_matrix_rank.atol_rtol_tensor_out(Tensor input, *, Tensor? atol=None, Tensor? rtol=None, bool hermitian=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_matrix_rank_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & input, const c10::optional<at::Tensor> & atol={}, const c10::optional<at::Tensor> & rtol={}, bool hermitian=false) {
+        return at::_ops::linalg_matrix_rank_atol_rtol_tensor_out::redispatch(dispatchKeySet, input, atol, rtol, hermitian, out);
+    }
+    
+    // aten::linalg_matrix_rank.atol_rtol_tensor_out(Tensor input, *, Tensor? atol=None, Tensor? rtol=None, bool hermitian=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_matrix_rank_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const c10::optional<at::Tensor> & atol, const c10::optional<at::Tensor> & rtol, bool hermitian, at::Tensor & out) {
+        return at::_ops::linalg_matrix_rank_atol_rtol_tensor_out::redispatch(dispatchKeySet, input, atol, rtol, hermitian, out);
+    }
+    
+    // aten::linalg_matrix_rank.atol_rtol_float(Tensor self, *, float? atol=None, float? rtol=None, bool hermitian=False) -> Tensor
+    inline at::Tensor linalg_matrix_rank(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<double> atol, c10::optional<double> rtol, bool hermitian=false) {
+        return at::_ops::linalg_matrix_rank_atol_rtol_float::redispatch(dispatchKeySet, self, atol, rtol, hermitian);
+    }
+    
+    // aten::linalg_matrix_rank.atol_rtol_float_out(Tensor self, *, float? atol=None, float? rtol=None, bool hermitian=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_matrix_rank_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<double> atol, c10::optional<double> rtol, bool hermitian=false) {
+        return at::_ops::linalg_matrix_rank_atol_rtol_float_out::redispatch(dispatchKeySet, self, atol, rtol, hermitian, out);
+    }
+    
+    // aten::linalg_matrix_rank.atol_rtol_float_out(Tensor self, *, float? atol=None, float? rtol=None, bool hermitian=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_matrix_rank_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<double> atol, c10::optional<double> rtol, bool hermitian, at::Tensor & out) {
+        return at::_ops::linalg_matrix_rank_atol_rtol_float_out::redispatch(dispatchKeySet, self, atol, rtol, hermitian, out);
+    }
+    
+    // aten::linalg_matrix_rank(Tensor self, float tol, bool hermitian=False) -> Tensor
+    inline at::Tensor linalg_matrix_rank(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double tol, bool hermitian=false) {
+        return at::_ops::linalg_matrix_rank::redispatch(dispatchKeySet, self, tol, hermitian);
+    }
+    
+    // aten::linalg_matrix_rank.out(Tensor self, float tol, bool hermitian=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_matrix_rank_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, double tol, bool hermitian=false) {
+        return at::_ops::linalg_matrix_rank_out::redispatch(dispatchKeySet, self, tol, hermitian, out);
+    }
+    
+    // aten::linalg_matrix_rank.out(Tensor self, float tol, bool hermitian=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_matrix_rank_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double tol, bool hermitian, at::Tensor & out) {
+        return at::_ops::linalg_matrix_rank_out::redispatch(dispatchKeySet, self, tol, hermitian, out);
+    }
+    
+    // aten::linalg_matrix_rank.tol_tensor(Tensor input, Tensor tol, bool hermitian=False) -> Tensor
+    inline at::Tensor linalg_matrix_rank(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & tol, bool hermitian=false) {
+        return at::_ops::linalg_matrix_rank_tol_tensor::redispatch(dispatchKeySet, input, tol, hermitian);
+    }
+    
+    // aten::linalg_matrix_rank.out_tol_tensor(Tensor input, Tensor tol, bool hermitian=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_matrix_rank_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & input, const at::Tensor & tol, bool hermitian=false) {
+        return at::_ops::linalg_matrix_rank_out_tol_tensor::redispatch(dispatchKeySet, input, tol, hermitian, out);
+    }
+    
+    // aten::linalg_matrix_rank.out_tol_tensor(Tensor input, Tensor tol, bool hermitian=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_matrix_rank_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & tol, bool hermitian, at::Tensor & out) {
+        return at::_ops::linalg_matrix_rank_out_tol_tensor::redispatch(dispatchKeySet, input, tol, hermitian, out);
+    }
+    
+    // aten::linalg_multi_dot(Tensor[] tensors) -> Tensor
+    inline at::Tensor linalg_multi_dot(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors) {
+        return at::_ops::linalg_multi_dot::redispatch(dispatchKeySet, tensors);
+    }
+    
+    // aten::linalg_multi_dot.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_multi_dot_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::TensorList tensors) {
+        return at::_ops::linalg_multi_dot_out::redispatch(dispatchKeySet, tensors, out);
+    }
+    
+    // aten::linalg_multi_dot.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_multi_dot_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, at::Tensor & out) {
+        return at::_ops::linalg_multi_dot_out::redispatch(dispatchKeySet, tensors, out);
+    }
+    
+    // aten::nested_to_padded_tensor(Tensor self, float padding, int[]? output_size=None) -> Tensor
+    inline at::Tensor nested_to_padded_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double padding, at::OptionalIntArrayRef output_size=c10::nullopt) {
+        return at::_ops::nested_to_padded_tensor::redispatch(dispatchKeySet, self, padding, output_size);
+    }
+    
+    // aten::_test_serialization_subcmul(Tensor self, Tensor other, Scalar alpha=1) -> Tensor
+    inline at::Tensor _test_serialization_subcmul(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1) {
+        return at::_ops::_test_serialization_subcmul::redispatch(dispatchKeySet, self, other, alpha);
+    }
+    
+    // aten::_test_parallel_materialize(Tensor self, int num_parallel, bool skip_first=False) -> Tensor
+    inline at::Tensor _test_parallel_materialize(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t num_parallel, bool skip_first=false) {
+        return at::_ops::_test_parallel_materialize::redispatch(dispatchKeySet, self, num_parallel, skip_first);
+    }
+    
+    // aten::_test_optional_intlist(Tensor values, int[]? addends) -> Tensor
+    inline at::Tensor _test_optional_intlist(c10::DispatchKeySet dispatchKeySet, const at::Tensor & values, at::OptionalIntArrayRef addends) {
+        return at::_ops::_test_optional_intlist::redispatch(dispatchKeySet, values, addends);
+    }
+    
+    // aten::_test_optional_filled_intlist(Tensor values, int[2]? addends) -> Tensor
+    inline at::Tensor _test_optional_filled_intlist(c10::DispatchKeySet dispatchKeySet, const at::Tensor & values, at::OptionalIntArrayRef addends) {
+        return at::_ops::_test_optional_filled_intlist::redispatch(dispatchKeySet, values, addends);
+    }
+    
+    // aten::_test_optional_floatlist(Tensor values, float[]? addends) -> Tensor
+    inline at::Tensor _test_optional_floatlist(c10::DispatchKeySet dispatchKeySet, const at::Tensor & values, c10::optional<at::ArrayRef<double>> addends) {
+        return at::_ops::_test_optional_floatlist::redispatch(dispatchKeySet, values, addends);
+    }
+    
+    // aten::_test_string_default(Tensor dummy, str a="\"'\\", str b='"\'\\') -> Tensor
+    inline at::Tensor _test_string_default(c10::DispatchKeySet dispatchKeySet, const at::Tensor & dummy, c10::string_view a="\"'\\", c10::string_view b="\"'\\") {
+        return at::_ops::_test_string_default::redispatch(dispatchKeySet, dummy, a, b);
+    }
+    
+    // aten::_test_ambiguous_defaults.a(Tensor dummy, int a=1, int b=1) -> Tensor
+    inline at::Tensor _test_ambiguous_defaults(c10::DispatchKeySet dispatchKeySet, const at::Tensor & dummy, int64_t a=1, int64_t b=1) {
+        return at::_ops::_test_ambiguous_defaults_a::redispatch(dispatchKeySet, dummy, a, b);
+    }
+    
+    // aten::_test_ambiguous_defaults.b(Tensor dummy, int a=2, str b="2") -> Tensor
+    inline at::Tensor _test_ambiguous_defaults(c10::DispatchKeySet dispatchKeySet, const at::Tensor & dummy, int64_t a, c10::string_view b) {
+        return at::_ops::_test_ambiguous_defaults_b::redispatch(dispatchKeySet, dummy, a, b);
+    }
+    
+    // aten::_test_warn_in_autograd(Tensor self) -> Tensor
+    inline at::Tensor _test_warn_in_autograd(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_test_warn_in_autograd::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_test_autograd_multiple_dispatch.fullcoverage(Tensor self) -> Tensor
+    inline at::Tensor _test_autograd_multiple_dispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_test_autograd_multiple_dispatch_fullcoverage::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_test_autograd_multiple_dispatch.ntonly(Tensor self, bool b) -> Tensor
+    inline at::Tensor _test_autograd_multiple_dispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool b) {
+        return at::_ops::_test_autograd_multiple_dispatch_ntonly::redispatch(dispatchKeySet, self, b);
+    }
+    
+    // aten::_test_autograd_multiple_dispatch_view(Tensor(a) self) -> Tensor(a)
+    inline at::Tensor _test_autograd_multiple_dispatch_view(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_test_autograd_multiple_dispatch_view::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_test_autograd_multiple_dispatch_view_copy(Tensor self) -> Tensor
+    inline at::Tensor _test_autograd_multiple_dispatch_view_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_test_autograd_multiple_dispatch_view_copy::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::segment_reduce(Tensor data, str reduce, *, Tensor? lengths=None, Tensor? indices=None, Tensor? offsets=None, int axis=0, bool unsafe=False, Scalar? initial=None) -> Tensor
+    inline at::Tensor segment_reduce(c10::DispatchKeySet dispatchKeySet, const at::Tensor & data, c10::string_view reduce, const c10::optional<at::Tensor> & lengths={}, const c10::optional<at::Tensor> & indices={}, const c10::optional<at::Tensor> & offsets={}, int64_t axis=0, bool unsafe=false, const c10::optional<at::Scalar> & initial=c10::nullopt) {
+        return at::_ops::segment_reduce::redispatch(dispatchKeySet, data, reduce, lengths, indices, offsets, axis, unsafe, initial);
+    }
+    
+    // aten::_segment_reduce_backward(Tensor grad, Tensor output, Tensor data, str reduce, *, Tensor? lengths=None, Tensor? offsets=None, int axis=0, Scalar? initial=None) -> Tensor
+    inline at::Tensor _segment_reduce_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & output, const at::Tensor & data, c10::string_view reduce, const c10::optional<at::Tensor> & lengths={}, const c10::optional<at::Tensor> & offsets={}, int64_t axis=0, const c10::optional<at::Scalar> & initial=c10::nullopt) {
+        return at::_ops::_segment_reduce_backward::redispatch(dispatchKeySet, grad, output, data, reduce, lengths, offsets, axis, initial);
+    }
+    
+    // aten::pad_sequence(Tensor[] sequences, bool batch_first=False, float padding_value=0.0) -> Tensor
+    inline at::Tensor pad_sequence(c10::DispatchKeySet dispatchKeySet, at::TensorList sequences, bool batch_first=false, double padding_value=0.0) {
+        return at::_ops::pad_sequence::redispatch(dispatchKeySet, sequences, batch_first, padding_value);
+    }
+    
+    // aten::flatten_dense_tensors(Tensor[] tensors) -> Tensor
+    inline at::Tensor flatten_dense_tensors(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors) {
+        return at::_ops::flatten_dense_tensors::redispatch(dispatchKeySet, tensors);
+    }
+    
+    // aten::unflatten_dense_tensors(Tensor flat, Tensor[] tensors) -> Tensor[]
+    inline ::std::vector<at::Tensor> unflatten_dense_tensors(c10::DispatchKeySet dispatchKeySet, const at::Tensor & flat, at::TensorList tensors) {
+        return at::_ops::unflatten_dense_tensors::redispatch(dispatchKeySet, flat, tensors);
+    }
+    
+    // aten::_nested_tensor_from_tensor_list(Tensor[] list, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+    inline at::Tensor _nested_tensor_from_tensor_list(c10::DispatchKeySet dispatchKeySet, at::TensorList list, c10::optional<at::ScalarType> dtype=c10::nullopt, c10::optional<at::Layout> layout=c10::nullopt, c10::optional<at::Device> device=c10::nullopt, c10::optional<bool> pin_memory=c10::nullopt) {
+        return at::_ops::_nested_tensor_from_tensor_list::redispatch(dispatchKeySet, list, dtype, layout, device, pin_memory);
+    }
+    
+    // aten::_fw_primal_copy(Tensor self, int level) -> Tensor
+    inline at::Tensor _fw_primal_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t level) {
+        return at::_ops::_fw_primal_copy::redispatch(dispatchKeySet, self, level);
+    }
+    
+    // aten::_make_dual_copy(Tensor primal, Tensor tangent, int level) -> Tensor
+    inline at::Tensor _make_dual_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & primal, const at::Tensor & tangent, int64_t level) {
+        return at::_ops::_make_dual_copy::redispatch(dispatchKeySet, primal, tangent, level);
+    }
+    
+    // aten::view_as_real_copy(Tensor self) -> Tensor
+    inline at::Tensor view_as_real_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::view_as_real_copy::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::view_as_complex_copy(Tensor self) -> Tensor
+    inline at::Tensor view_as_complex_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::view_as_complex_copy::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_conj_copy(Tensor self) -> Tensor
+    inline at::Tensor _conj_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_conj_copy::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_neg_view_copy(Tensor self) -> Tensor
+    inline at::Tensor _neg_view_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_neg_view_copy::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::as_strided_copy(Tensor self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor
+    inline at::Tensor as_strided_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride, c10::optional<int64_t> storage_offset=c10::nullopt) {
+        return at::_ops::as_strided_copy::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), storage_offset.has_value() ? c10::make_optional(c10::SymInt(*storage_offset)) : c10::nullopt);
+    }
+    
+    // aten::as_strided_copy(Tensor self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor
+    inline at::Tensor as_strided_copy_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional<c10::SymInt> storage_offset=c10::nullopt) {
+        return at::_ops::as_strided_copy::redispatch(dispatchKeySet, self, size, stride, storage_offset);
+    }
+    
+    // aten::_sparse_broadcast_to_copy(Tensor self, int[] size) -> Tensor
+    inline at::Tensor _sparse_broadcast_to_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size) {
+        return at::_ops::_sparse_broadcast_to_copy::redispatch(dispatchKeySet, self, size);
+    }
+    
+    // aten::diagonal_copy(Tensor self, int offset=0, int dim1=0, int dim2=1) -> Tensor
+    inline at::Tensor diagonal_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t offset=0, int64_t dim1=0, int64_t dim2=1) {
+        return at::_ops::diagonal_copy::redispatch(dispatchKeySet, self, offset, dim1, dim2);
+    }
+    
+    // aten::expand_copy(Tensor self, SymInt[] size, *, bool implicit=False) -> Tensor
+    inline at::Tensor expand_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, bool implicit=false) {
+        return at::_ops::expand_copy::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), implicit);
+    }
+    
+    // aten::expand_copy(Tensor self, SymInt[] size, *, bool implicit=False) -> Tensor
+    inline at::Tensor expand_copy_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, bool implicit=false) {
+        return at::_ops::expand_copy::redispatch(dispatchKeySet, self, size, implicit);
+    }
+    
+    // aten::permute_copy(Tensor self, int[] dims) -> Tensor
+    inline at::Tensor permute_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dims) {
+        return at::_ops::permute_copy::redispatch(dispatchKeySet, self, dims);
+    }
+    
+    // aten::_reshape_alias_copy(Tensor self, SymInt[] size, SymInt[] stride) -> Tensor
+    inline at::Tensor _reshape_alias_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride) {
+        return at::_ops::_reshape_alias_copy::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride));
+    }
+    
+    // aten::_reshape_alias_copy(Tensor self, SymInt[] size, SymInt[] stride) -> Tensor
+    inline at::Tensor _reshape_alias_copy_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride) {
+        return at::_ops::_reshape_alias_copy::redispatch(dispatchKeySet, self, size, stride);
+    }
+    
+    // aten::select_copy.int(Tensor self, int dim, SymInt index) -> Tensor
+    inline at::Tensor select_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, int64_t index) {
+        return at::_ops::select_copy_int::redispatch(dispatchKeySet, self, dim, index);
+    }
+    
+    // aten::select_copy.int(Tensor self, int dim, SymInt index) -> Tensor
+    inline at::Tensor select_copy_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, c10::SymInt index) {
+        return at::_ops::select_copy_int::redispatch(dispatchKeySet, self, dim, index);
+    }
+    
+    // aten::detach_copy(Tensor self) -> Tensor
+    inline at::Tensor detach_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::detach_copy::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::slice_copy.Tensor(Tensor self, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor
+    inline at::Tensor slice_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim=0, c10::optional<int64_t> start=c10::nullopt, c10::optional<int64_t> end=c10::nullopt, int64_t step=1) {
+        return at::_ops::slice_copy_Tensor::redispatch(dispatchKeySet, self, dim, start.has_value() ? c10::make_optional(c10::SymInt(*start)) : c10::nullopt, end.has_value() ? c10::make_optional(c10::SymInt(*end)) : c10::nullopt, step);
+    }
+    
+    // aten::slice_copy.Tensor(Tensor self, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor
+    inline at::Tensor slice_copy_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim=0, c10::optional<c10::SymInt> start=c10::nullopt, c10::optional<c10::SymInt> end=c10::nullopt, c10::SymInt step=1) {
+        return at::_ops::slice_copy_Tensor::redispatch(dispatchKeySet, self, dim, start, end, step);
+    }
+    
+    // aten::split_copy.Tensor(Tensor self, SymInt split_size, int dim=0) -> Tensor[]
+    inline ::std::vector<at::Tensor> split_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t split_size, int64_t dim=0) {
+        return at::_ops::split_copy_Tensor::redispatch(dispatchKeySet, self, split_size, dim);
+    }
+    
+    // aten::split_copy.Tensor(Tensor self, SymInt split_size, int dim=0) -> Tensor[]
+    inline ::std::vector<at::Tensor> split_copy_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt split_size, int64_t dim=0) {
+        return at::_ops::split_copy_Tensor::redispatch(dispatchKeySet, self, split_size, dim);
+    }
+    
+    // aten::split_with_sizes_copy(Tensor self, SymInt[] split_sizes, int dim=0) -> Tensor[]
+    inline ::std::vector<at::Tensor> split_with_sizes_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef split_sizes, int64_t dim=0) {
+        return at::_ops::split_with_sizes_copy::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(split_sizes), dim);
+    }
+    
+    // aten::split_with_sizes_copy(Tensor self, SymInt[] split_sizes, int dim=0) -> Tensor[]
+    inline ::std::vector<at::Tensor> split_with_sizes_copy_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef split_sizes, int64_t dim=0) {
+        return at::_ops::split_with_sizes_copy::redispatch(dispatchKeySet, self, split_sizes, dim);
+    }
+    
+    // aten::squeeze_copy(Tensor self) -> Tensor
+    inline at::Tensor squeeze_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::squeeze_copy::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::squeeze_copy.dim(Tensor self, int dim) -> Tensor
+    inline at::Tensor squeeze_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim) {
+        return at::_ops::squeeze_copy_dim::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::squeeze_copy.dims(Tensor self, int[] dim) -> Tensor
+    inline at::Tensor squeeze_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim) {
+        return at::_ops::squeeze_copy_dims::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::t_copy(Tensor self) -> Tensor
+    inline at::Tensor t_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::t_copy::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::transpose_copy.int(Tensor self, int dim0, int dim1) -> Tensor
+    inline at::Tensor transpose_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim0, int64_t dim1) {
+        return at::_ops::transpose_copy_int::redispatch(dispatchKeySet, self, dim0, dim1);
+    }
+    
+    // aten::unsqueeze_copy(Tensor self, int dim) -> Tensor
+    inline at::Tensor unsqueeze_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim) {
+        return at::_ops::unsqueeze_copy::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::_indices_copy(Tensor self) -> Tensor
+    inline at::Tensor _indices_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_indices_copy::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_values_copy(Tensor self) -> Tensor
+    inline at::Tensor _values_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::_values_copy::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::indices_copy(Tensor self) -> Tensor
+    inline at::Tensor indices_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::indices_copy::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::values_copy(Tensor self) -> Tensor
+    inline at::Tensor values_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::values_copy::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::crow_indices_copy(Tensor self) -> Tensor
+    inline at::Tensor crow_indices_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::crow_indices_copy::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::col_indices_copy(Tensor self) -> Tensor
+    inline at::Tensor col_indices_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::col_indices_copy::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::ccol_indices_copy(Tensor self) -> Tensor
+    inline at::Tensor ccol_indices_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::ccol_indices_copy::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::row_indices_copy(Tensor self) -> Tensor
+    inline at::Tensor row_indices_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::row_indices_copy::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::unbind_copy.int(Tensor self, int dim=0) -> Tensor[]
+    inline ::std::vector<at::Tensor> unbind_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim=0) {
+        return at::_ops::unbind_copy_int::redispatch(dispatchKeySet, self, dim);
+    }
+    
+    // aten::unbind_copy.int_out(Tensor self, int dim=0, *, Tensor(a!)[] out) -> ()
+    inline void unbind_copy_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, const at::Tensor & self, int64_t dim=0) {
+        return at::_ops::unbind_copy_int_out::redispatch(dispatchKeySet, self, dim, out);
+    }
+    
+    // aten::unbind_copy.int_out(Tensor self, int dim=0, *, Tensor(a!)[] out) -> ()
+    inline void unbind_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, at::TensorList out) {
+        return at::_ops::unbind_copy_int_out::redispatch(dispatchKeySet, self, dim, out);
+    }
+    
+    // aten::split_copy.Tensor_out(Tensor self, SymInt split_size, int dim=0, *, Tensor(a!)[] out) -> ()
+    inline void split_copy_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, const at::Tensor & self, int64_t split_size, int64_t dim=0) {
+        return at::_ops::split_copy_Tensor_out::redispatch(dispatchKeySet, self, split_size, dim, out);
+    }
+    
+    // aten::split_copy.Tensor_out(Tensor self, SymInt split_size, int dim=0, *, Tensor(a!)[] out) -> ()
+    inline void split_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t split_size, int64_t dim, at::TensorList out) {
+        return at::_ops::split_copy_Tensor_out::redispatch(dispatchKeySet, self, split_size, dim, out);
+    }
+    
+    // aten::split_copy.Tensor_out(Tensor self, SymInt split_size, int dim=0, *, Tensor(a!)[] out) -> ()
+    inline void split_copy_symint_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, const at::Tensor & self, c10::SymInt split_size, int64_t dim=0) {
+        return at::_ops::split_copy_Tensor_out::redispatch(dispatchKeySet, self, split_size, dim, out);
+    }
+    
+    // aten::split_copy.Tensor_out(Tensor self, SymInt split_size, int dim=0, *, Tensor(a!)[] out) -> ()
+    inline void split_copy_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt split_size, int64_t dim, at::TensorList out) {
+        return at::_ops::split_copy_Tensor_out::redispatch(dispatchKeySet, self, split_size, dim, out);
+    }
+    
+    // aten::split_with_sizes_copy.out(Tensor self, SymInt[] split_sizes, int dim=0, *, Tensor(a!)[] out) -> ()
+    inline void split_with_sizes_copy_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, const at::Tensor & self, at::IntArrayRef split_sizes, int64_t dim=0) {
+        return at::_ops::split_with_sizes_copy_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(split_sizes), dim, out);
+    }
+    
+    // aten::split_with_sizes_copy.out(Tensor self, SymInt[] split_sizes, int dim=0, *, Tensor(a!)[] out) -> ()
+    inline void split_with_sizes_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef split_sizes, int64_t dim, at::TensorList out) {
+        return at::_ops::split_with_sizes_copy_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(split_sizes), dim, out);
+    }
+    
+    // aten::split_with_sizes_copy.out(Tensor self, SymInt[] split_sizes, int dim=0, *, Tensor(a!)[] out) -> ()
+    inline void split_with_sizes_copy_symint_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, const at::Tensor & self, c10::SymIntArrayRef split_sizes, int64_t dim=0) {
+        return at::_ops::split_with_sizes_copy_out::redispatch(dispatchKeySet, self, split_sizes, dim, out);
+    }
+    
+    // aten::split_with_sizes_copy.out(Tensor self, SymInt[] split_sizes, int dim=0, *, Tensor(a!)[] out) -> ()
+    inline void split_with_sizes_copy_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef split_sizes, int64_t dim, at::TensorList out) {
+        return at::_ops::split_with_sizes_copy_out::redispatch(dispatchKeySet, self, split_sizes, dim, out);
+    }
+    
+    // aten::view_copy(Tensor self, SymInt[] size) -> Tensor
+    inline at::Tensor view_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size) {
+        return at::_ops::view_copy::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size));
+    }
+    
+    // aten::view_copy(Tensor self, SymInt[] size) -> Tensor
+    inline at::Tensor view_copy_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size) {
+        return at::_ops::view_copy::redispatch(dispatchKeySet, self, size);
+    }
+    
+    // aten::view_copy.dtype(Tensor self, ScalarType dtype) -> Tensor
+    inline at::Tensor view_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::ScalarType dtype) {
+        return at::_ops::view_copy_dtype::redispatch(dispatchKeySet, self, dtype);
+    }
+    
+    // aten::unfold_copy(Tensor self, int dimension, int size, int step) -> Tensor
+    inline at::Tensor unfold_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dimension, int64_t size, int64_t step) {
+        return at::_ops::unfold_copy::redispatch(dispatchKeySet, self, dimension, size, step);
+    }
+    
+    // aten::alias_copy(Tensor self) -> Tensor
+    inline at::Tensor alias_copy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::alias_copy::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::to_padded_tensor(Tensor self, float padding, SymInt[]? output_size=None) -> Tensor
+    inline at::Tensor to_padded_tensor(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double padding, at::OptionalIntArrayRef output_size=c10::nullopt) {
+        return at::_ops::to_padded_tensor::redispatch(dispatchKeySet, self, padding, output_size.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*output_size)) : c10::nullopt);
+    }
+    
+    // aten::to_padded_tensor(Tensor self, float padding, SymInt[]? output_size=None) -> Tensor
+    inline at::Tensor to_padded_tensor_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double padding, at::OptionalSymIntArrayRef output_size=c10::nullopt) {
+        return at::_ops::to_padded_tensor::redispatch(dispatchKeySet, self, padding, output_size);
+    }
+    
+    // aten::_nested_tensor_softmax_with_shape(Tensor self, Tensor query) -> Tensor
+    inline at::Tensor _nested_tensor_softmax_with_shape(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & query) {
+        return at::_ops::_nested_tensor_softmax_with_shape::redispatch(dispatchKeySet, self, query);
+    }
+    
+    // aten::_transformer_encoder_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, int? mask_type=None) -> Tensor
+    inline at::Tensor _transformer_encoder_layer_fwd(c10::DispatchKeySet dispatchKeySet, const at::Tensor & src, int64_t embed_dim, int64_t num_heads, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, bool use_gelu, bool norm_first, double eps, const at::Tensor & norm_weight_1, const at::Tensor & norm_bias_1, const at::Tensor & norm_weight_2, const at::Tensor & norm_bias_2, const at::Tensor & ffn_weight_1, const at::Tensor & ffn_bias_1, const at::Tensor & ffn_weight_2, const at::Tensor & ffn_bias_2, const c10::optional<at::Tensor> & mask={}, c10::optional<int64_t> mask_type=c10::nullopt) {
+        return at::_ops::_transformer_encoder_layer_fwd::redispatch(dispatchKeySet, src, embed_dim, num_heads, qkv_weight, qkv_bias, proj_weight, proj_bias, use_gelu, norm_first, eps, norm_weight_1, norm_bias_1, norm_weight_2, norm_bias_2, ffn_weight_1, ffn_bias_1, ffn_weight_2, ffn_bias_2, mask, mask_type);
+    }
+    
+    // aten::_native_multi_head_attention(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None, bool need_weights=True, bool average_attn_weights=True, int? mask_type=None) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> _native_multi_head_attention(c10::DispatchKeySet dispatchKeySet, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, int64_t embed_dim, int64_t num_head, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, const c10::optional<at::Tensor> & mask={}, bool need_weights=true, bool average_attn_weights=true, c10::optional<int64_t> mask_type=c10::nullopt) {
+        return at::_ops::_native_multi_head_attention::redispatch(dispatchKeySet, query, key, value, embed_dim, num_head, qkv_weight, qkv_bias, proj_weight, proj_bias, mask, need_weights, average_attn_weights, mask_type);
+    }
+    
+    // aten::scaled_dot_product_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> Tensor
+    inline at::Tensor scaled_dot_product_attention(c10::DispatchKeySet dispatchKeySet, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const c10::optional<at::Tensor> & attn_mask={}, double dropout_p=0.0, bool is_causal=false, c10::optional<double> scale=c10::nullopt) {
+        return at::_ops::scaled_dot_product_attention::redispatch(dispatchKeySet, query, key, value, attn_mask, dropout_p, is_causal, scale);
+    }
+    
+    // aten::_fused_sdp_choice(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> int
+    inline int64_t _fused_sdp_choice(c10::DispatchKeySet dispatchKeySet, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const c10::optional<at::Tensor> & attn_mask={}, double dropout_p=0.0, bool is_causal=false, c10::optional<double> scale=c10::nullopt) {
+        return at::_ops::_fused_sdp_choice::redispatch(dispatchKeySet, query, key, value, attn_mask, dropout_p, is_causal, scale);
+    }
+    
+    // aten::_scaled_dot_product_attention_math(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, Tensor? dropout_mask=None, *, float? scale=None) -> (Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor> _scaled_dot_product_attention_math(c10::DispatchKeySet dispatchKeySet, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const c10::optional<at::Tensor> & attn_mask={}, double dropout_p=0.0, bool is_causal=false, const c10::optional<at::Tensor> & dropout_mask={}, c10::optional<double> scale=c10::nullopt) {
+        return at::_ops::_scaled_dot_product_attention_math::redispatch(dispatchKeySet, query, key, value, attn_mask, dropout_p, is_causal, dropout_mask, scale);
+    }
+    
+    // aten::_scaled_dot_product_flash_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,c10::SymInt,c10::SymInt,at::Tensor,at::Tensor,at::Tensor> _scaled_dot_product_flash_attention(c10::DispatchKeySet dispatchKeySet, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, double dropout_p=0.0, bool is_causal=false, bool return_debug_mask=false, c10::optional<double> scale=c10::nullopt) {
+        return at::_ops::_scaled_dot_product_flash_attention::redispatch(dispatchKeySet, query, key, value, dropout_p, is_causal, return_debug_mask, scale);
+    }
+    
+    // aten::_scaled_dot_product_flash_attention_for_cpu(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, *, Tensor? attn_mask=None, float? scale=None) -> (Tensor output, Tensor logsumexp)
+    inline ::std::tuple<at::Tensor,at::Tensor> _scaled_dot_product_flash_attention_for_cpu(c10::DispatchKeySet dispatchKeySet, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, double dropout_p=0.0, bool is_causal=false, const c10::optional<at::Tensor> & attn_mask={}, c10::optional<double> scale=c10::nullopt) {
+        return at::_ops::_scaled_dot_product_flash_attention_for_cpu::redispatch(dispatchKeySet, query, key, value, dropout_p, is_causal, attn_mask, scale);
+    }
+    
+    // aten::_scaled_dot_product_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _scaled_dot_product_flash_attention_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_out, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const at::Tensor & out, const at::Tensor & logsumexp, const at::Tensor & cum_seq_q, const at::Tensor & cum_seq_k, int64_t max_q, int64_t max_k, double dropout_p, bool is_causal, const at::Tensor & philox_seed, const at::Tensor & philox_offset, c10::optional<double> scale=c10::nullopt) {
+        return at::_ops::_scaled_dot_product_flash_attention_backward::redispatch(dispatchKeySet, grad_out, query, key, value, out, logsumexp, cum_seq_q, cum_seq_k, max_q, max_k, dropout_p, is_causal, philox_seed, philox_offset, scale);
+    }
+    
+    // aten::_scaled_dot_product_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _scaled_dot_product_flash_attention_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_out, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const at::Tensor & out, const at::Tensor & logsumexp, const at::Tensor & cum_seq_q, const at::Tensor & cum_seq_k, c10::SymInt max_q, c10::SymInt max_k, double dropout_p, bool is_causal, const at::Tensor & philox_seed, const at::Tensor & philox_offset, c10::optional<double> scale=c10::nullopt) {
+        return at::_ops::_scaled_dot_product_flash_attention_backward::redispatch(dispatchKeySet, grad_out, query, key, value, out, logsumexp, cum_seq_q, cum_seq_k, max_q, max_k, dropout_p, is_causal, philox_seed, philox_offset, scale);
+    }
+    
+    // aten::_scaled_dot_product_flash_attention_for_cpu_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, float dropout_p, bool is_causal, *, Tensor? attn_mask=None, float? scale=None) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _scaled_dot_product_flash_attention_for_cpu_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_out, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const at::Tensor & out, const at::Tensor & logsumexp, double dropout_p, bool is_causal, const c10::optional<at::Tensor> & attn_mask={}, c10::optional<double> scale=c10::nullopt) {
+        return at::_ops::_scaled_dot_product_flash_attention_for_cpu_backward::redispatch(dispatchKeySet, grad_out, query, key, value, out, logsumexp, dropout_p, is_causal, attn_mask, scale);
+    }
+    
+    // aten::_scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> (Tensor output, Tensor log_sumexp, Tensor philox_seed, Tensor philox_offset)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor> _scaled_dot_product_efficient_attention(c10::DispatchKeySet dispatchKeySet, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const c10::optional<at::Tensor> & attn_bias, bool compute_log_sumexp, double dropout_p=0.0, bool is_causal=false, c10::optional<double> scale=c10::nullopt) {
+        return at::_ops::_scaled_dot_product_efficient_attention::redispatch(dispatchKeySet, query, key, value, attn_bias, compute_log_sumexp, dropout_p, is_causal, scale);
+    }
+    
+    // aten::_scaled_dot_product_efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor attn_bias, Tensor out, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, float dropout_p, bool[4] grad_input_mask, bool is_causal=False, *, float? scale=None) -> (Tensor, Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor> _scaled_dot_product_efficient_attention_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_out_, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const at::Tensor & attn_bias, const at::Tensor & out, const at::Tensor & logsumexp, const at::Tensor & philox_seed, const at::Tensor & philox_offset, double dropout_p, ::std::array<bool,4> grad_input_mask, bool is_causal=false, c10::optional<double> scale=c10::nullopt) {
+        return at::_ops::_scaled_dot_product_efficient_attention_backward::redispatch(dispatchKeySet, grad_out_, query, key, value, attn_bias, out, logsumexp, philox_seed, philox_offset, dropout_p, grad_input_mask, is_causal, scale);
+    }
+    
+    // aten::_scaled_dot_product_cudnn_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor> _scaled_dot_product_cudnn_attention(c10::DispatchKeySet dispatchKeySet, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, double dropout_p=0.0, bool is_causal=false, bool return_debug_mask=false, c10::optional<double> scale=c10::nullopt) {
+        return at::_ops::_scaled_dot_product_cudnn_attention::redispatch(dispatchKeySet, query, key, value, dropout_p, is_causal, return_debug_mask, scale);
+    }
+    
+    // aten::_flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,at::Tensor> _flash_attention_forward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const c10::optional<at::Tensor> & cum_seq_q, const c10::optional<at::Tensor> & cum_seq_k, int64_t max_q, int64_t max_k, double dropout_p, bool is_causal, bool return_debug_mask, c10::optional<double> scale=c10::nullopt) {
+        return at::_ops::_flash_attention_forward::redispatch(dispatchKeySet, query, key, value, cum_seq_q, cum_seq_k, max_q, max_k, dropout_p, is_causal, return_debug_mask, scale);
+    }
+    
+    // aten::_flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,at::Tensor> _flash_attention_forward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const c10::optional<at::Tensor> & cum_seq_q, const c10::optional<at::Tensor> & cum_seq_k, c10::SymInt max_q, c10::SymInt max_k, double dropout_p, bool is_causal, bool return_debug_mask, c10::optional<double> scale=c10::nullopt) {
+        return at::_ops::_flash_attention_forward::redispatch(dispatchKeySet, query, key, value, cum_seq_q, cum_seq_k, max_q, max_k, dropout_p, is_causal, return_debug_mask, scale);
+    }
+    
+    // aten::_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _flash_attention_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_out, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const at::Tensor & out, const at::Tensor & logsumexp, const at::Tensor & cum_seq_q, const at::Tensor & cum_seq_k, int64_t max_q, int64_t max_k, double dropout_p, bool is_causal, const at::Tensor & philox_seed, const at::Tensor & philox_offset, c10::optional<double> scale=c10::nullopt) {
+        return at::_ops::_flash_attention_backward::redispatch(dispatchKeySet, grad_out, query, key, value, out, logsumexp, cum_seq_q, cum_seq_k, max_q, max_k, dropout_p, is_causal, philox_seed, philox_offset, scale);
+    }
+    
+    // aten::_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _flash_attention_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_out, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const at::Tensor & out, const at::Tensor & logsumexp, const at::Tensor & cum_seq_q, const at::Tensor & cum_seq_k, c10::SymInt max_q, c10::SymInt max_k, double dropout_p, bool is_causal, const at::Tensor & philox_seed, const at::Tensor & philox_offset, c10::optional<double> scale=c10::nullopt) {
+        return at::_ops::_flash_attention_backward::redispatch(dispatchKeySet, grad_out, query, key, value, out, logsumexp, cum_seq_q, cum_seq_k, max_q, max_k, dropout_p, is_causal, philox_seed, philox_offset, scale);
+    }
+    
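+    // Illustrative note (not part of the generated schema): the *_symint overloads above take
+    // c10::SymInt sizes and redispatch to the same at::_ops entry as their int64_t counterparts;
+    // concrete integers convert implicitly to c10::SymInt. A minimal hedged sketch, assuming a
+    // DispatchKeySet `ks` and already-built attention tensors from a prior forward call:
+    //
+    //   auto grads = _flash_attention_backward_symint(
+    //       ks, grad_out, query, key, value, out, logsumexp, cum_seq_q, cum_seq_k,
+    //       /*max_q=*/c10::SymInt(128), /*max_k=*/c10::SymInt(128),
+    //       /*dropout_p=*/0.0, /*is_causal=*/false, philox_seed, philox_offset);
+    //
+    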
+    // aten::_efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, int? max_seqlen_k, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? causal_diagonal=None, Tensor? seqlen_k=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, SymInt max_seqlen_batch_q, SymInt max_seqlen_batch_k)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,c10::SymInt,c10::SymInt> _efficient_attention_forward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const c10::optional<at::Tensor> & bias, const c10::optional<at::Tensor> & cu_seqlens_q, const c10::optional<at::Tensor> & cu_seqlens_k, c10::optional<int64_t> max_seqlen_q, c10::optional<int64_t> max_seqlen_k, double dropout_p, int64_t custom_mask_type, bool compute_log_sumexp=false, c10::optional<double> scale=c10::nullopt, const c10::optional<at::Tensor> & causal_diagonal={}, const c10::optional<at::Tensor> & seqlen_k={}) {
+        return at::_ops::_efficient_attention_forward::redispatch(dispatchKeySet, query, key, value, bias, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, dropout_p, custom_mask_type, compute_log_sumexp, scale, causal_diagonal, seqlen_k);
+    }
+    
+    // aten::_efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor out, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt max_seqlen_q, SymInt max_seqlen_k, Tensor logsumexp, float dropout_p, Tensor philox_seed, Tensor philox_offset, int custom_mask_type, bool bias_requires_grad, *, float? scale=None, int? num_splits_key=None) -> (Tensor, Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor> _efficient_attention_backward(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_out_, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const c10::optional<at::Tensor> & bias, const at::Tensor & out, const c10::optional<at::Tensor> & cu_seqlens_q, const c10::optional<at::Tensor> & cu_seqlens_k, int64_t max_seqlen_q, int64_t max_seqlen_k, const at::Tensor & logsumexp, double dropout_p, const at::Tensor & philox_seed, const at::Tensor & philox_offset, int64_t custom_mask_type, bool bias_requires_grad, c10::optional<double> scale=c10::nullopt, c10::optional<int64_t> num_splits_key=c10::nullopt) {
+        return at::_ops::_efficient_attention_backward::redispatch(dispatchKeySet, grad_out_, query, key, value, bias, out, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, logsumexp, dropout_p, philox_seed, philox_offset, custom_mask_type, bias_requires_grad, scale, num_splits_key);
+    }
+    
+    // aten::_efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor out, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt max_seqlen_q, SymInt max_seqlen_k, Tensor logsumexp, float dropout_p, Tensor philox_seed, Tensor philox_offset, int custom_mask_type, bool bias_requires_grad, *, float? scale=None, int? num_splits_key=None) -> (Tensor, Tensor, Tensor, Tensor)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor> _efficient_attention_backward_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_out_, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const c10::optional<at::Tensor> & bias, const at::Tensor & out, const c10::optional<at::Tensor> & cu_seqlens_q, const c10::optional<at::Tensor> & cu_seqlens_k, c10::SymInt max_seqlen_q, c10::SymInt max_seqlen_k, const at::Tensor & logsumexp, double dropout_p, const at::Tensor & philox_seed, const at::Tensor & philox_offset, int64_t custom_mask_type, bool bias_requires_grad, c10::optional<double> scale=c10::nullopt, c10::optional<int64_t> num_splits_key=c10::nullopt) {
+        return at::_ops::_efficient_attention_backward::redispatch(dispatchKeySet, grad_out_, query, key, value, bias, out, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, logsumexp, dropout_p, philox_seed, philox_offset, custom_mask_type, bias_requires_grad, scale, num_splits_key);
+    }
+    
+    // aten::_triton_scaled_dot_attention(Tensor q, Tensor k, Tensor v, float dropout_p=0.0) -> Tensor
+    inline at::Tensor _triton_scaled_dot_attention(c10::DispatchKeySet dispatchKeySet, const at::Tensor & q, const at::Tensor & k, const at::Tensor & v, double dropout_p=0.0) {
+        return at::_ops::_triton_scaled_dot_attention::redispatch(dispatchKeySet, q, k, v, dropout_p);
+    }
+    
+    // aten::_fill_mem_eff_dropout_mask_(Tensor(a!) self, float dropout_p, int seed, int offset) -> Tensor(a!)
+    inline at::Tensor & _fill_mem_eff_dropout_mask_(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, double dropout_p, int64_t seed, int64_t offset) {
+        return at::_ops::_fill_mem_eff_dropout_mask_::redispatch(dispatchKeySet, self, dropout_p, seed, offset);
+    }
+    
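+    // Illustrative note (not part of the generated schema): wrappers whose aten name ends in `_`
+    // (such as _fill_mem_eff_dropout_mask_ above) mutate their first tensor argument in place and
+    // return it by reference, mirroring the Tensor(a!) annotation in the schema comment. A hedged
+    // usage sketch, assuming a DispatchKeySet `ks` and a preallocated mask tensor `self`:
+    //
+    //   at::Tensor & same = _fill_mem_eff_dropout_mask_(ks, self, /*dropout_p=*/0.1,
+    //                                                   /*seed=*/0, /*offset=*/0);
+    //   // `same` aliases `self`; no new storage is allocated.
+    //
+    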
+    // aten::_triton_multi_head_attention(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None) -> Tensor
+    inline at::Tensor _triton_multi_head_attention(c10::DispatchKeySet dispatchKeySet, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, int64_t embed_dim, int64_t num_head, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, const c10::optional<at::Tensor> & mask={}) {
+        return at::_ops::_triton_multi_head_attention::redispatch(dispatchKeySet, query, key, value, embed_dim, num_head, qkv_weight, qkv_bias, proj_weight, proj_bias, mask);
+    }
+    
+    // aten::special_airy_ai(Tensor x) -> Tensor
+    inline at::Tensor special_airy_ai(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x) {
+        return at::_ops::special_airy_ai::redispatch(dispatchKeySet, x);
+    }
+    
+    // aten::special_airy_ai.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_airy_ai_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & x) {
+        return at::_ops::special_airy_ai_out::redispatch(dispatchKeySet, x, out);
+    }
+    
+    // aten::special_airy_ai.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_airy_ai_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, at::Tensor & out) {
+        return at::_ops::special_airy_ai_out::redispatch(dispatchKeySet, x, out);
+    }
+    
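+    // Illustrative note (not part of the generated schema): each op above is emitted in three
+    // shapes: a functional wrapper that allocates its result, an `_out` wrapper that takes the
+    // destination first, and an `_outf` wrapper that keeps the schema order with `out` last. All
+    // three redispatch to the same at::_ops entry. A hedged sketch, assuming a DispatchKeySet `ks`
+    // and tensors `x`, `out`:
+    //
+    //   at::Tensor y = special_airy_ai(ks, x);        // functional form, allocates y
+    //   special_airy_ai_out(ks, out, x);              // out-first variant
+    //   special_airy_ai_outf(ks, x, out);             // schema-order variant, writes into out
+    //
+    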
+    // aten::special_bessel_j0(Tensor self) -> Tensor
+    inline at::Tensor special_bessel_j0(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::special_bessel_j0::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::special_bessel_j0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_bessel_j0_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::special_bessel_j0_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_bessel_j0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_bessel_j0_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::special_bessel_j0_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_bessel_j1(Tensor self) -> Tensor
+    inline at::Tensor special_bessel_j1(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::special_bessel_j1::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::special_bessel_j1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_bessel_j1_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::special_bessel_j1_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_bessel_j1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_bessel_j1_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::special_bessel_j1_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_bessel_y0(Tensor self) -> Tensor
+    inline at::Tensor special_bessel_y0(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::special_bessel_y0::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::special_bessel_y0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_bessel_y0_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::special_bessel_y0_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_bessel_y0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_bessel_y0_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::special_bessel_y0_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_bessel_y1(Tensor self) -> Tensor
+    inline at::Tensor special_bessel_y1(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::special_bessel_y1::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::special_bessel_y1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_bessel_y1_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::special_bessel_y1_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_bessel_y1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_bessel_y1_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::special_bessel_y1_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_chebyshev_polynomial_t(Tensor x, Tensor n) -> Tensor
+    inline at::Tensor special_chebyshev_polynomial_t(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & n) {
+        return at::_ops::special_chebyshev_polynomial_t::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_chebyshev_polynomial_t.x_scalar(Scalar x, Tensor n) -> Tensor
+    inline at::Tensor special_chebyshev_polynomial_t(c10::DispatchKeySet dispatchKeySet, const at::Scalar & x, const at::Tensor & n) {
+        return at::_ops::special_chebyshev_polynomial_t_x_scalar::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_chebyshev_polynomial_t.n_scalar(Tensor x, Scalar n) -> Tensor
+    inline at::Tensor special_chebyshev_polynomial_t(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Scalar & n) {
+        return at::_ops::special_chebyshev_polynomial_t_n_scalar::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_chebyshev_polynomial_t.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_chebyshev_polynomial_t_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & x, const at::Tensor & n) {
+        return at::_ops::special_chebyshev_polynomial_t_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_chebyshev_polynomial_t.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_chebyshev_polynomial_t_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & n, at::Tensor & out) {
+        return at::_ops::special_chebyshev_polynomial_t_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_chebyshev_polynomial_t.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_chebyshev_polynomial_t_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & x, const at::Tensor & n) {
+        return at::_ops::special_chebyshev_polynomial_t_x_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_chebyshev_polynomial_t.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_chebyshev_polynomial_t_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & x, const at::Tensor & n, at::Tensor & out) {
+        return at::_ops::special_chebyshev_polynomial_t_x_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_chebyshev_polynomial_t.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_chebyshev_polynomial_t_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & x, const at::Scalar & n) {
+        return at::_ops::special_chebyshev_polynomial_t_n_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_chebyshev_polynomial_t.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_chebyshev_polynomial_t_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Scalar & n, at::Tensor & out) {
+        return at::_ops::special_chebyshev_polynomial_t_n_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
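+    // Illustrative note (not part of the generated schema): the Chebyshev, Hermite, Laguerre, and
+    // Legendre polynomial families in this file each expose three input combinations: (Tensor x,
+    // Tensor n), (Scalar x, Tensor n) via the `_x_scalar` op, and (Tensor x, Scalar n) via the
+    // `_n_scalar` op, plus matching `_out`/`_outf` forms. A hedged sketch, assuming a
+    // DispatchKeySet `ks` and a tensor `x`:
+    //
+    //   at::Tensor t3 = special_chebyshev_polynomial_t(ks, x, at::Scalar(3));   // n as Scalar
+    //   at::Tensor tx = special_chebyshev_polynomial_t(ks, at::Scalar(0.5), x); // x as Scalar
+    //
+    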
+    // aten::special_chebyshev_polynomial_u(Tensor x, Tensor n) -> Tensor
+    inline at::Tensor special_chebyshev_polynomial_u(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & n) {
+        return at::_ops::special_chebyshev_polynomial_u::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_chebyshev_polynomial_u.x_scalar(Scalar x, Tensor n) -> Tensor
+    inline at::Tensor special_chebyshev_polynomial_u(c10::DispatchKeySet dispatchKeySet, const at::Scalar & x, const at::Tensor & n) {
+        return at::_ops::special_chebyshev_polynomial_u_x_scalar::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_chebyshev_polynomial_u.n_scalar(Tensor x, Scalar n) -> Tensor
+    inline at::Tensor special_chebyshev_polynomial_u(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Scalar & n) {
+        return at::_ops::special_chebyshev_polynomial_u_n_scalar::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_chebyshev_polynomial_u.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_chebyshev_polynomial_u_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & x, const at::Tensor & n) {
+        return at::_ops::special_chebyshev_polynomial_u_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_chebyshev_polynomial_u.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_chebyshev_polynomial_u_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & n, at::Tensor & out) {
+        return at::_ops::special_chebyshev_polynomial_u_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_chebyshev_polynomial_u.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_chebyshev_polynomial_u_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & x, const at::Tensor & n) {
+        return at::_ops::special_chebyshev_polynomial_u_x_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_chebyshev_polynomial_u.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_chebyshev_polynomial_u_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & x, const at::Tensor & n, at::Tensor & out) {
+        return at::_ops::special_chebyshev_polynomial_u_x_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_chebyshev_polynomial_u.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_chebyshev_polynomial_u_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & x, const at::Scalar & n) {
+        return at::_ops::special_chebyshev_polynomial_u_n_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_chebyshev_polynomial_u.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_chebyshev_polynomial_u_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Scalar & n, at::Tensor & out) {
+        return at::_ops::special_chebyshev_polynomial_u_n_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_chebyshev_polynomial_v(Tensor x, Tensor n) -> Tensor
+    inline at::Tensor special_chebyshev_polynomial_v(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & n) {
+        return at::_ops::special_chebyshev_polynomial_v::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_chebyshev_polynomial_v.x_scalar(Scalar x, Tensor n) -> Tensor
+    inline at::Tensor special_chebyshev_polynomial_v(c10::DispatchKeySet dispatchKeySet, const at::Scalar & x, const at::Tensor & n) {
+        return at::_ops::special_chebyshev_polynomial_v_x_scalar::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_chebyshev_polynomial_v.n_scalar(Tensor x, Scalar n) -> Tensor
+    inline at::Tensor special_chebyshev_polynomial_v(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Scalar & n) {
+        return at::_ops::special_chebyshev_polynomial_v_n_scalar::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_chebyshev_polynomial_v.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_chebyshev_polynomial_v_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & x, const at::Tensor & n) {
+        return at::_ops::special_chebyshev_polynomial_v_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_chebyshev_polynomial_v.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_chebyshev_polynomial_v_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & n, at::Tensor & out) {
+        return at::_ops::special_chebyshev_polynomial_v_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_chebyshev_polynomial_v.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_chebyshev_polynomial_v_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & x, const at::Tensor & n) {
+        return at::_ops::special_chebyshev_polynomial_v_x_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_chebyshev_polynomial_v.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_chebyshev_polynomial_v_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & x, const at::Tensor & n, at::Tensor & out) {
+        return at::_ops::special_chebyshev_polynomial_v_x_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_chebyshev_polynomial_v.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_chebyshev_polynomial_v_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & x, const at::Scalar & n) {
+        return at::_ops::special_chebyshev_polynomial_v_n_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_chebyshev_polynomial_v.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_chebyshev_polynomial_v_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Scalar & n, at::Tensor & out) {
+        return at::_ops::special_chebyshev_polynomial_v_n_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_chebyshev_polynomial_w(Tensor x, Tensor n) -> Tensor
+    inline at::Tensor special_chebyshev_polynomial_w(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & n) {
+        return at::_ops::special_chebyshev_polynomial_w::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_chebyshev_polynomial_w.x_scalar(Scalar x, Tensor n) -> Tensor
+    inline at::Tensor special_chebyshev_polynomial_w(c10::DispatchKeySet dispatchKeySet, const at::Scalar & x, const at::Tensor & n) {
+        return at::_ops::special_chebyshev_polynomial_w_x_scalar::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_chebyshev_polynomial_w.n_scalar(Tensor x, Scalar n) -> Tensor
+    inline at::Tensor special_chebyshev_polynomial_w(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Scalar & n) {
+        return at::_ops::special_chebyshev_polynomial_w_n_scalar::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_chebyshev_polynomial_w.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_chebyshev_polynomial_w_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & x, const at::Tensor & n) {
+        return at::_ops::special_chebyshev_polynomial_w_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_chebyshev_polynomial_w.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_chebyshev_polynomial_w_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & n, at::Tensor & out) {
+        return at::_ops::special_chebyshev_polynomial_w_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_chebyshev_polynomial_w.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_chebyshev_polynomial_w_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & x, const at::Tensor & n) {
+        return at::_ops::special_chebyshev_polynomial_w_x_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_chebyshev_polynomial_w.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_chebyshev_polynomial_w_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & x, const at::Tensor & n, at::Tensor & out) {
+        return at::_ops::special_chebyshev_polynomial_w_x_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_chebyshev_polynomial_w.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_chebyshev_polynomial_w_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & x, const at::Scalar & n) {
+        return at::_ops::special_chebyshev_polynomial_w_n_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_chebyshev_polynomial_w.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_chebyshev_polynomial_w_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Scalar & n, at::Tensor & out) {
+        return at::_ops::special_chebyshev_polynomial_w_n_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_hermite_polynomial_h(Tensor x, Tensor n) -> Tensor
+    inline at::Tensor special_hermite_polynomial_h(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & n) {
+        return at::_ops::special_hermite_polynomial_h::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_hermite_polynomial_h.x_scalar(Scalar x, Tensor n) -> Tensor
+    inline at::Tensor special_hermite_polynomial_h(c10::DispatchKeySet dispatchKeySet, const at::Scalar & x, const at::Tensor & n) {
+        return at::_ops::special_hermite_polynomial_h_x_scalar::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_hermite_polynomial_h.n_scalar(Tensor x, Scalar n) -> Tensor
+    inline at::Tensor special_hermite_polynomial_h(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Scalar & n) {
+        return at::_ops::special_hermite_polynomial_h_n_scalar::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_hermite_polynomial_h.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_hermite_polynomial_h_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & x, const at::Tensor & n) {
+        return at::_ops::special_hermite_polynomial_h_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_hermite_polynomial_h.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_hermite_polynomial_h_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & n, at::Tensor & out) {
+        return at::_ops::special_hermite_polynomial_h_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_hermite_polynomial_h.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_hermite_polynomial_h_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & x, const at::Tensor & n) {
+        return at::_ops::special_hermite_polynomial_h_x_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_hermite_polynomial_h.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_hermite_polynomial_h_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & x, const at::Tensor & n, at::Tensor & out) {
+        return at::_ops::special_hermite_polynomial_h_x_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_hermite_polynomial_h.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_hermite_polynomial_h_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & x, const at::Scalar & n) {
+        return at::_ops::special_hermite_polynomial_h_n_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_hermite_polynomial_h.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_hermite_polynomial_h_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Scalar & n, at::Tensor & out) {
+        return at::_ops::special_hermite_polynomial_h_n_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_hermite_polynomial_he(Tensor x, Tensor n) -> Tensor
+    inline at::Tensor special_hermite_polynomial_he(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & n) {
+        return at::_ops::special_hermite_polynomial_he::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_hermite_polynomial_he.x_scalar(Scalar x, Tensor n) -> Tensor
+    inline at::Tensor special_hermite_polynomial_he(c10::DispatchKeySet dispatchKeySet, const at::Scalar & x, const at::Tensor & n) {
+        return at::_ops::special_hermite_polynomial_he_x_scalar::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_hermite_polynomial_he.n_scalar(Tensor x, Scalar n) -> Tensor
+    inline at::Tensor special_hermite_polynomial_he(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Scalar & n) {
+        return at::_ops::special_hermite_polynomial_he_n_scalar::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_hermite_polynomial_he.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_hermite_polynomial_he_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & x, const at::Tensor & n) {
+        return at::_ops::special_hermite_polynomial_he_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_hermite_polynomial_he.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_hermite_polynomial_he_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & n, at::Tensor & out) {
+        return at::_ops::special_hermite_polynomial_he_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_hermite_polynomial_he.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_hermite_polynomial_he_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & x, const at::Tensor & n) {
+        return at::_ops::special_hermite_polynomial_he_x_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_hermite_polynomial_he.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_hermite_polynomial_he_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & x, const at::Tensor & n, at::Tensor & out) {
+        return at::_ops::special_hermite_polynomial_he_x_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_hermite_polynomial_he.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_hermite_polynomial_he_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & x, const at::Scalar & n) {
+        return at::_ops::special_hermite_polynomial_he_n_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_hermite_polynomial_he.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_hermite_polynomial_he_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Scalar & n, at::Tensor & out) {
+        return at::_ops::special_hermite_polynomial_he_n_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_laguerre_polynomial_l(Tensor x, Tensor n) -> Tensor
+    inline at::Tensor special_laguerre_polynomial_l(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & n) {
+        return at::_ops::special_laguerre_polynomial_l::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_laguerre_polynomial_l.x_scalar(Scalar x, Tensor n) -> Tensor
+    inline at::Tensor special_laguerre_polynomial_l(c10::DispatchKeySet dispatchKeySet, const at::Scalar & x, const at::Tensor & n) {
+        return at::_ops::special_laguerre_polynomial_l_x_scalar::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_laguerre_polynomial_l.n_scalar(Tensor x, Scalar n) -> Tensor
+    inline at::Tensor special_laguerre_polynomial_l(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Scalar & n) {
+        return at::_ops::special_laguerre_polynomial_l_n_scalar::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_laguerre_polynomial_l.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_laguerre_polynomial_l_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & x, const at::Tensor & n) {
+        return at::_ops::special_laguerre_polynomial_l_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_laguerre_polynomial_l.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_laguerre_polynomial_l_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & n, at::Tensor & out) {
+        return at::_ops::special_laguerre_polynomial_l_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_laguerre_polynomial_l.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_laguerre_polynomial_l_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & x, const at::Tensor & n) {
+        return at::_ops::special_laguerre_polynomial_l_x_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_laguerre_polynomial_l.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_laguerre_polynomial_l_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & x, const at::Tensor & n, at::Tensor & out) {
+        return at::_ops::special_laguerre_polynomial_l_x_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_laguerre_polynomial_l.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_laguerre_polynomial_l_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & x, const at::Scalar & n) {
+        return at::_ops::special_laguerre_polynomial_l_n_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_laguerre_polynomial_l.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_laguerre_polynomial_l_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Scalar & n, at::Tensor & out) {
+        return at::_ops::special_laguerre_polynomial_l_n_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_legendre_polynomial_p(Tensor x, Tensor n) -> Tensor
+    inline at::Tensor special_legendre_polynomial_p(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & n) {
+        return at::_ops::special_legendre_polynomial_p::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_legendre_polynomial_p.x_scalar(Scalar x, Tensor n) -> Tensor
+    inline at::Tensor special_legendre_polynomial_p(c10::DispatchKeySet dispatchKeySet, const at::Scalar & x, const at::Tensor & n) {
+        return at::_ops::special_legendre_polynomial_p_x_scalar::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_legendre_polynomial_p.n_scalar(Tensor x, Scalar n) -> Tensor
+    inline at::Tensor special_legendre_polynomial_p(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Scalar & n) {
+        return at::_ops::special_legendre_polynomial_p_n_scalar::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_legendre_polynomial_p.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_legendre_polynomial_p_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & x, const at::Tensor & n) {
+        return at::_ops::special_legendre_polynomial_p_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_legendre_polynomial_p.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_legendre_polynomial_p_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & n, at::Tensor & out) {
+        return at::_ops::special_legendre_polynomial_p_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_legendre_polynomial_p.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_legendre_polynomial_p_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & x, const at::Tensor & n) {
+        return at::_ops::special_legendre_polynomial_p_x_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_legendre_polynomial_p.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_legendre_polynomial_p_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & x, const at::Tensor & n, at::Tensor & out) {
+        return at::_ops::special_legendre_polynomial_p_x_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_legendre_polynomial_p.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_legendre_polynomial_p_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & x, const at::Scalar & n) {
+        return at::_ops::special_legendre_polynomial_p_n_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_legendre_polynomial_p.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_legendre_polynomial_p_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Scalar & n, at::Tensor & out) {
+        return at::_ops::special_legendre_polynomial_p_n_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_modified_bessel_i0(Tensor self) -> Tensor
+    inline at::Tensor special_modified_bessel_i0(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::special_modified_bessel_i0::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::special_modified_bessel_i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_modified_bessel_i0_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::special_modified_bessel_i0_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_modified_bessel_i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_modified_bessel_i0_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::special_modified_bessel_i0_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_modified_bessel_i1(Tensor self) -> Tensor
+    inline at::Tensor special_modified_bessel_i1(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::special_modified_bessel_i1::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::special_modified_bessel_i1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_modified_bessel_i1_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::special_modified_bessel_i1_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_modified_bessel_i1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_modified_bessel_i1_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::special_modified_bessel_i1_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_modified_bessel_k0(Tensor self) -> Tensor
+    inline at::Tensor special_modified_bessel_k0(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::special_modified_bessel_k0::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::special_modified_bessel_k0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_modified_bessel_k0_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::special_modified_bessel_k0_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_modified_bessel_k0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_modified_bessel_k0_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::special_modified_bessel_k0_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_modified_bessel_k1(Tensor self) -> Tensor
+    inline at::Tensor special_modified_bessel_k1(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::special_modified_bessel_k1::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::special_modified_bessel_k1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_modified_bessel_k1_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::special_modified_bessel_k1_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_modified_bessel_k1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_modified_bessel_k1_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::special_modified_bessel_k1_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::special_scaled_modified_bessel_k0(Tensor x) -> Tensor
+    inline at::Tensor special_scaled_modified_bessel_k0(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x) {
+        return at::_ops::special_scaled_modified_bessel_k0::redispatch(dispatchKeySet, x);
+    }
+    
+    // aten::special_scaled_modified_bessel_k0.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_scaled_modified_bessel_k0_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & x) {
+        return at::_ops::special_scaled_modified_bessel_k0_out::redispatch(dispatchKeySet, x, out);
+    }
+    
+    // aten::special_scaled_modified_bessel_k0.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_scaled_modified_bessel_k0_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, at::Tensor & out) {
+        return at::_ops::special_scaled_modified_bessel_k0_out::redispatch(dispatchKeySet, x, out);
+    }
+    
+    // aten::special_scaled_modified_bessel_k1(Tensor x) -> Tensor
+    inline at::Tensor special_scaled_modified_bessel_k1(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x) {
+        return at::_ops::special_scaled_modified_bessel_k1::redispatch(dispatchKeySet, x);
+    }
+    
+    // aten::special_scaled_modified_bessel_k1.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_scaled_modified_bessel_k1_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & x) {
+        return at::_ops::special_scaled_modified_bessel_k1_out::redispatch(dispatchKeySet, x, out);
+    }
+    
+    // aten::special_scaled_modified_bessel_k1.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_scaled_modified_bessel_k1_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, at::Tensor & out) {
+        return at::_ops::special_scaled_modified_bessel_k1_out::redispatch(dispatchKeySet, x, out);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_t(Tensor x, Tensor n) -> Tensor
+    inline at::Tensor special_shifted_chebyshev_polynomial_t(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & n) {
+        return at::_ops::special_shifted_chebyshev_polynomial_t::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_t.x_scalar(Scalar x, Tensor n) -> Tensor
+    inline at::Tensor special_shifted_chebyshev_polynomial_t(c10::DispatchKeySet dispatchKeySet, const at::Scalar & x, const at::Tensor & n) {
+        return at::_ops::special_shifted_chebyshev_polynomial_t_x_scalar::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_t.n_scalar(Tensor x, Scalar n) -> Tensor
+    inline at::Tensor special_shifted_chebyshev_polynomial_t(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Scalar & n) {
+        return at::_ops::special_shifted_chebyshev_polynomial_t_n_scalar::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_t.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_shifted_chebyshev_polynomial_t_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & x, const at::Tensor & n) {
+        return at::_ops::special_shifted_chebyshev_polynomial_t_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_t.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_shifted_chebyshev_polynomial_t_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & n, at::Tensor & out) {
+        return at::_ops::special_shifted_chebyshev_polynomial_t_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_t.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_shifted_chebyshev_polynomial_t_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & x, const at::Tensor & n) {
+        return at::_ops::special_shifted_chebyshev_polynomial_t_x_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_t.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_shifted_chebyshev_polynomial_t_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & x, const at::Tensor & n, at::Tensor & out) {
+        return at::_ops::special_shifted_chebyshev_polynomial_t_x_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_t.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_shifted_chebyshev_polynomial_t_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & x, const at::Scalar & n) {
+        return at::_ops::special_shifted_chebyshev_polynomial_t_n_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_t.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_shifted_chebyshev_polynomial_t_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Scalar & n, at::Tensor & out) {
+        return at::_ops::special_shifted_chebyshev_polynomial_t_n_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_u(Tensor x, Tensor n) -> Tensor
+    inline at::Tensor special_shifted_chebyshev_polynomial_u(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & n) {
+        return at::_ops::special_shifted_chebyshev_polynomial_u::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_u.x_scalar(Scalar x, Tensor n) -> Tensor
+    inline at::Tensor special_shifted_chebyshev_polynomial_u(c10::DispatchKeySet dispatchKeySet, const at::Scalar & x, const at::Tensor & n) {
+        return at::_ops::special_shifted_chebyshev_polynomial_u_x_scalar::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_u.n_scalar(Tensor x, Scalar n) -> Tensor
+    inline at::Tensor special_shifted_chebyshev_polynomial_u(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Scalar & n) {
+        return at::_ops::special_shifted_chebyshev_polynomial_u_n_scalar::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_u.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_shifted_chebyshev_polynomial_u_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & x, const at::Tensor & n) {
+        return at::_ops::special_shifted_chebyshev_polynomial_u_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_u.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_shifted_chebyshev_polynomial_u_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & n, at::Tensor & out) {
+        return at::_ops::special_shifted_chebyshev_polynomial_u_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_u.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_shifted_chebyshev_polynomial_u_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & x, const at::Tensor & n) {
+        return at::_ops::special_shifted_chebyshev_polynomial_u_x_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_u.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_shifted_chebyshev_polynomial_u_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & x, const at::Tensor & n, at::Tensor & out) {
+        return at::_ops::special_shifted_chebyshev_polynomial_u_x_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_u.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_shifted_chebyshev_polynomial_u_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & x, const at::Scalar & n) {
+        return at::_ops::special_shifted_chebyshev_polynomial_u_n_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_u.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_shifted_chebyshev_polynomial_u_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Scalar & n, at::Tensor & out) {
+        return at::_ops::special_shifted_chebyshev_polynomial_u_n_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_v(Tensor x, Tensor n) -> Tensor
+    inline at::Tensor special_shifted_chebyshev_polynomial_v(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & n) {
+        return at::_ops::special_shifted_chebyshev_polynomial_v::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_v.x_scalar(Scalar x, Tensor n) -> Tensor
+    inline at::Tensor special_shifted_chebyshev_polynomial_v(c10::DispatchKeySet dispatchKeySet, const at::Scalar & x, const at::Tensor & n) {
+        return at::_ops::special_shifted_chebyshev_polynomial_v_x_scalar::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_v.n_scalar(Tensor x, Scalar n) -> Tensor
+    inline at::Tensor special_shifted_chebyshev_polynomial_v(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Scalar & n) {
+        return at::_ops::special_shifted_chebyshev_polynomial_v_n_scalar::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_v.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_shifted_chebyshev_polynomial_v_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & x, const at::Tensor & n) {
+        return at::_ops::special_shifted_chebyshev_polynomial_v_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_v.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_shifted_chebyshev_polynomial_v_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & n, at::Tensor & out) {
+        return at::_ops::special_shifted_chebyshev_polynomial_v_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_v.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_shifted_chebyshev_polynomial_v_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & x, const at::Tensor & n) {
+        return at::_ops::special_shifted_chebyshev_polynomial_v_x_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_v.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_shifted_chebyshev_polynomial_v_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & x, const at::Tensor & n, at::Tensor & out) {
+        return at::_ops::special_shifted_chebyshev_polynomial_v_x_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_v.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_shifted_chebyshev_polynomial_v_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & x, const at::Scalar & n) {
+        return at::_ops::special_shifted_chebyshev_polynomial_v_n_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_v.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_shifted_chebyshev_polynomial_v_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Scalar & n, at::Tensor & out) {
+        return at::_ops::special_shifted_chebyshev_polynomial_v_n_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_w(Tensor x, Tensor n) -> Tensor
+    inline at::Tensor special_shifted_chebyshev_polynomial_w(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & n) {
+        return at::_ops::special_shifted_chebyshev_polynomial_w::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_w.x_scalar(Scalar x, Tensor n) -> Tensor
+    inline at::Tensor special_shifted_chebyshev_polynomial_w(c10::DispatchKeySet dispatchKeySet, const at::Scalar & x, const at::Tensor & n) {
+        return at::_ops::special_shifted_chebyshev_polynomial_w_x_scalar::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_w.n_scalar(Tensor x, Scalar n) -> Tensor
+    inline at::Tensor special_shifted_chebyshev_polynomial_w(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Scalar & n) {
+        return at::_ops::special_shifted_chebyshev_polynomial_w_n_scalar::redispatch(dispatchKeySet, x, n);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_w.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_shifted_chebyshev_polynomial_w_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & x, const at::Tensor & n) {
+        return at::_ops::special_shifted_chebyshev_polynomial_w_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_w.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_shifted_chebyshev_polynomial_w_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & n, at::Tensor & out) {
+        return at::_ops::special_shifted_chebyshev_polynomial_w_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_w.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_shifted_chebyshev_polynomial_w_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & x, const at::Tensor & n) {
+        return at::_ops::special_shifted_chebyshev_polynomial_w_x_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_w.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_shifted_chebyshev_polynomial_w_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & x, const at::Tensor & n, at::Tensor & out) {
+        return at::_ops::special_shifted_chebyshev_polynomial_w_x_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_w.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_shifted_chebyshev_polynomial_w_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & x, const at::Scalar & n) {
+        return at::_ops::special_shifted_chebyshev_polynomial_w_n_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_shifted_chebyshev_polynomial_w.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_shifted_chebyshev_polynomial_w_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Scalar & n, at::Tensor & out) {
+        return at::_ops::special_shifted_chebyshev_polynomial_w_n_scalar_out::redispatch(dispatchKeySet, x, n, out);
+    }
+    
+    // aten::special_spherical_bessel_j0(Tensor x) -> Tensor
+    inline at::Tensor special_spherical_bessel_j0(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x) {
+        return at::_ops::special_spherical_bessel_j0::redispatch(dispatchKeySet, x);
+    }
+    
+    // aten::special_spherical_bessel_j0.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_spherical_bessel_j0_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & x) {
+        return at::_ops::special_spherical_bessel_j0_out::redispatch(dispatchKeySet, x, out);
+    }
+    
+    // aten::special_spherical_bessel_j0.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & special_spherical_bessel_j0_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, at::Tensor & out) {
+        return at::_ops::special_spherical_bessel_j0_out::redispatch(dispatchKeySet, x, out);
+    }
+    
+    // aten::_foobar(Tensor self, bool arg1=True, bool arg2=True, *, bool arg3=True) -> Tensor
+    inline at::Tensor _foobar(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool arg1=true, bool arg2=true, bool arg3=true) {
+        return at::_ops::_foobar::redispatch(dispatchKeySet, self, arg1, arg2, arg3);
+    }
+    
+    // aten::_fused_adam_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
+    inline void _fused_adam_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList grads, at::TensorList exp_avgs, at::TensorList exp_avg_sqs, at::TensorList max_exp_avg_sqs, at::TensorList state_steps, double lr, double beta1, double beta2, double weight_decay, double eps, bool amsgrad, bool maximize, const c10::optional<at::Tensor> & grad_scale={}, const c10::optional<at::Tensor> & found_inf={}) {
+        return at::_ops::_fused_adam_::redispatch(dispatchKeySet, self, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, lr, beta1, beta2, weight_decay, eps, amsgrad, maximize, grad_scale, found_inf);
+    }
+    
+    // aten::_fused_adam_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, Tensor lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
+    inline void _fused_adam_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList grads, at::TensorList exp_avgs, at::TensorList exp_avg_sqs, at::TensorList max_exp_avg_sqs, at::TensorList state_steps, const at::Tensor & lr, double beta1, double beta2, double weight_decay, double eps, bool amsgrad, bool maximize, const c10::optional<at::Tensor> & grad_scale={}, const c10::optional<at::Tensor> & found_inf={}) {
+        return at::_ops::_fused_adam__tensor_lr::redispatch(dispatchKeySet, self, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, lr, beta1, beta2, weight_decay, eps, amsgrad, maximize, grad_scale, found_inf);
+    }
+    
+    // aten::_fused_adamw_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
+    inline void _fused_adamw_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList grads, at::TensorList exp_avgs, at::TensorList exp_avg_sqs, at::TensorList max_exp_avg_sqs, at::TensorList state_steps, double lr, double beta1, double beta2, double weight_decay, double eps, bool amsgrad, bool maximize, const c10::optional<at::Tensor> & grad_scale={}, const c10::optional<at::Tensor> & found_inf={}) {
+        return at::_ops::_fused_adamw_::redispatch(dispatchKeySet, self, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, lr, beta1, beta2, weight_decay, eps, amsgrad, maximize, grad_scale, found_inf);
+    }
+    
+    // aten::_fused_adamw_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, Tensor lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
+    inline void _fused_adamw_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList grads, at::TensorList exp_avgs, at::TensorList exp_avg_sqs, at::TensorList max_exp_avg_sqs, at::TensorList state_steps, const at::Tensor & lr, double beta1, double beta2, double weight_decay, double eps, bool amsgrad, bool maximize, const c10::optional<at::Tensor> & grad_scale={}, const c10::optional<at::Tensor> & found_inf={}) {
+        return at::_ops::_fused_adamw__tensor_lr::redispatch(dispatchKeySet, self, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, lr, beta1, beta2, weight_decay, eps, amsgrad, maximize, grad_scale, found_inf);
+    }
+    
+    // aten::_fused_sgd_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] momentum_buffer_list, *, float weight_decay, float momentum, float lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
+    inline void _fused_sgd_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList grads, at::TensorList momentum_buffer_list, double weight_decay, double momentum, double lr, double dampening, bool nesterov, bool maximize, bool is_first_step, const c10::optional<at::Tensor> & grad_scale={}, const c10::optional<at::Tensor> & found_inf={}) {
+        return at::_ops::_fused_sgd_::redispatch(dispatchKeySet, self, grads, momentum_buffer_list, weight_decay, momentum, lr, dampening, nesterov, maximize, is_first_step, grad_scale, found_inf);
+    }
+    
+    // aten::_fused_sgd_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] momentum_buffer_list, *, float weight_decay, float momentum, Tensor lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()
+    inline void _fused_sgd_(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList grads, at::TensorList momentum_buffer_list, double weight_decay, double momentum, const at::Tensor & lr, double dampening, bool nesterov, bool maximize, bool is_first_step, const c10::optional<at::Tensor> & grad_scale={}, const c10::optional<at::Tensor> & found_inf={}) {
+        return at::_ops::_fused_sgd__tensor_lr::redispatch(dispatchKeySet, self, grads, momentum_buffer_list, weight_decay, momentum, lr, dampening, nesterov, maximize, is_first_step, grad_scale, found_inf);
+    }
+    
+    // aten::_propagate_xla_data(Tensor input, Tensor output) -> ()
+    inline void _propagate_xla_data(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & output) {
+        return at::_ops::_propagate_xla_data::redispatch(dispatchKeySet, input, output);
+    }
+    
+    // aten::_new_zeros_with_same_feature_meta.out(Tensor self, Tensor other, *, int self_num_batch_dims=0, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _new_zeros_with_same_feature_meta_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other, int64_t self_num_batch_dims=0) {
+        return at::_ops::_new_zeros_with_same_feature_meta_out::redispatch(dispatchKeySet, self, other, self_num_batch_dims, out);
+    }
+    
+    // aten::_new_zeros_with_same_feature_meta.out(Tensor self, Tensor other, *, int self_num_batch_dims=0, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _new_zeros_with_same_feature_meta_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, int64_t self_num_batch_dims, at::Tensor & out) {
+        return at::_ops::_new_zeros_with_same_feature_meta_out::redispatch(dispatchKeySet, self, other, self_num_batch_dims, out);
+    }
+    
+    // aten::_cudnn_ctc_loss.out(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank, bool deterministic, bool zero_infinity, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> _cudnn_ctc_loss_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank, bool deterministic, bool zero_infinity) {
+        return at::_ops::_cudnn_ctc_loss_out::redispatch(dispatchKeySet, log_probs, targets, input_lengths, target_lengths, blank, deterministic, zero_infinity, out0, out1);
+    }
+    
+    // aten::_cudnn_ctc_loss.out(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank, bool deterministic, bool zero_infinity, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> _cudnn_ctc_loss_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank, bool deterministic, bool zero_infinity, at::Tensor & out0, at::Tensor & out1) {
+        return at::_ops::_cudnn_ctc_loss_out::redispatch(dispatchKeySet, log_probs, targets, input_lengths, target_lengths, blank, deterministic, zero_infinity, out0, out1);
+    }
+    
+    // aten::_cudnn_rnn_flatten_weight.out(Tensor[] weight_arr, int weight_stride0, SymInt input_size, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, bool bidirectional, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _cudnn_rnn_flatten_weight_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::TensorList weight_arr, int64_t weight_stride0, int64_t input_size, int64_t mode, int64_t hidden_size, int64_t proj_size, int64_t num_layers, bool batch_first, bool bidirectional) {
+        return at::_ops::_cudnn_rnn_flatten_weight_out::redispatch(dispatchKeySet, weight_arr, weight_stride0, input_size, mode, hidden_size, proj_size, num_layers, batch_first, bidirectional, out);
+    }
+    
+    // aten::_cudnn_rnn_flatten_weight.out(Tensor[] weight_arr, int weight_stride0, SymInt input_size, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, bool bidirectional, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _cudnn_rnn_flatten_weight_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList weight_arr, int64_t weight_stride0, int64_t input_size, int64_t mode, int64_t hidden_size, int64_t proj_size, int64_t num_layers, bool batch_first, bool bidirectional, at::Tensor & out) {
+        return at::_ops::_cudnn_rnn_flatten_weight_out::redispatch(dispatchKeySet, weight_arr, weight_stride0, input_size, mode, hidden_size, proj_size, num_layers, batch_first, bidirectional, out);
+    }
+    
+    // aten::_cudnn_rnn_flatten_weight.out(Tensor[] weight_arr, int weight_stride0, SymInt input_size, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, bool bidirectional, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _cudnn_rnn_flatten_weight_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::TensorList weight_arr, int64_t weight_stride0, c10::SymInt input_size, int64_t mode, c10::SymInt hidden_size, c10::SymInt proj_size, int64_t num_layers, bool batch_first, bool bidirectional) {
+        return at::_ops::_cudnn_rnn_flatten_weight_out::redispatch(dispatchKeySet, weight_arr, weight_stride0, input_size, mode, hidden_size, proj_size, num_layers, batch_first, bidirectional, out);
+    }
+    
+    // aten::_cudnn_rnn_flatten_weight.out(Tensor[] weight_arr, int weight_stride0, SymInt input_size, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, bool bidirectional, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _cudnn_rnn_flatten_weight_symint_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList weight_arr, int64_t weight_stride0, c10::SymInt input_size, int64_t mode, c10::SymInt hidden_size, c10::SymInt proj_size, int64_t num_layers, bool batch_first, bool bidirectional, at::Tensor & out) {
+        return at::_ops::_cudnn_rnn_flatten_weight_out::redispatch(dispatchKeySet, weight_arr, weight_stride0, input_size, mode, hidden_size, proj_size, num_layers, batch_first, bidirectional, out);
+    }
+    
+    // aten::_cudnn_rnn.out(Tensor input, Tensor[] weight, int weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, SymInt[] batch_sizes, Tensor? dropout_state, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3, Tensor(e!) out4) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!), Tensor(e!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> _cudnn_rnn_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, at::Tensor & out4, const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const c10::optional<at::Tensor> & weight_buf, const at::Tensor & hx, const c10::optional<at::Tensor> & cx, int64_t mode, int64_t hidden_size, int64_t proj_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, at::IntArrayRef batch_sizes, const c10::optional<at::Tensor> & dropout_state) {
+        return at::_ops::_cudnn_rnn_out::redispatch(dispatchKeySet, input, weight, weight_stride0, weight_buf, hx, cx, mode, hidden_size, proj_size, num_layers, batch_first, dropout, train, bidirectional, c10::fromIntArrayRefSlow(batch_sizes), dropout_state, out0, out1, out2, out3, out4);
+    }
+    
+    // aten::_cudnn_rnn.out(Tensor input, Tensor[] weight, int weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, SymInt[] batch_sizes, Tensor? dropout_state, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3, Tensor(e!) out4) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!), Tensor(e!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> _cudnn_rnn_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const c10::optional<at::Tensor> & weight_buf, const at::Tensor & hx, const c10::optional<at::Tensor> & cx, int64_t mode, int64_t hidden_size, int64_t proj_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, at::IntArrayRef batch_sizes, const c10::optional<at::Tensor> & dropout_state, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, at::Tensor & out4) {
+        return at::_ops::_cudnn_rnn_out::redispatch(dispatchKeySet, input, weight, weight_stride0, weight_buf, hx, cx, mode, hidden_size, proj_size, num_layers, batch_first, dropout, train, bidirectional, c10::fromIntArrayRefSlow(batch_sizes), dropout_state, out0, out1, out2, out3, out4);
+    }
+    
+    // aten::_cudnn_rnn.out(Tensor input, Tensor[] weight, int weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, SymInt[] batch_sizes, Tensor? dropout_state, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3, Tensor(e!) out4) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!), Tensor(e!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> _cudnn_rnn_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, at::Tensor & out4, const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const c10::optional<at::Tensor> & weight_buf, const at::Tensor & hx, const c10::optional<at::Tensor> & cx, int64_t mode, c10::SymInt hidden_size, c10::SymInt proj_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, c10::SymIntArrayRef batch_sizes, const c10::optional<at::Tensor> & dropout_state) {
+        return at::_ops::_cudnn_rnn_out::redispatch(dispatchKeySet, input, weight, weight_stride0, weight_buf, hx, cx, mode, hidden_size, proj_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, out0, out1, out2, out3, out4);
+    }
+    
+    // aten::_cudnn_rnn.out(Tensor input, Tensor[] weight, int weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, SymInt[] batch_sizes, Tensor? dropout_state, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3, Tensor(e!) out4) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!), Tensor(e!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> _cudnn_rnn_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const c10::optional<at::Tensor> & weight_buf, const at::Tensor & hx, const c10::optional<at::Tensor> & cx, int64_t mode, c10::SymInt hidden_size, c10::SymInt proj_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, c10::SymIntArrayRef batch_sizes, const c10::optional<at::Tensor> & dropout_state, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, at::Tensor & out4) {
+        return at::_ops::_cudnn_rnn_out::redispatch(dispatchKeySet, input, weight, weight_stride0, weight_buf, hx, cx, mode, hidden_size, proj_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, out0, out1, out2, out3, out4);
+    }
+    
+    // aten::_cudnn_rnn_backward.out(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, SymInt[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!)[] out3) -> ()
+    inline void _cudnn_rnn_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::TensorList out3, const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const at::Tensor & weight_buf, const at::Tensor & hx, const c10::optional<at::Tensor> & cx, const at::Tensor & output, const c10::optional<at::Tensor> & grad_output, const c10::optional<at::Tensor> & grad_hy, const c10::optional<at::Tensor> & grad_cy, int64_t mode, int64_t hidden_size, int64_t proj_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, at::IntArrayRef batch_sizes, const c10::optional<at::Tensor> & dropout_state, const at::Tensor & reserve, ::std::array<bool,4> output_mask) {
+        return at::_ops::_cudnn_rnn_backward_out::redispatch(dispatchKeySet, input, weight, weight_stride0, weight_buf, hx, cx, output, grad_output, grad_hy, grad_cy, mode, hidden_size, proj_size, num_layers, batch_first, dropout, train, bidirectional, c10::fromIntArrayRefSlow(batch_sizes), dropout_state, reserve, output_mask, out0, out1, out2, out3);
+    }
+    
+    // aten::_cudnn_rnn_backward.out(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, SymInt[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!)[] out3) -> ()
+    inline void _cudnn_rnn_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const at::Tensor & weight_buf, const at::Tensor & hx, const c10::optional<at::Tensor> & cx, const at::Tensor & output, const c10::optional<at::Tensor> & grad_output, const c10::optional<at::Tensor> & grad_hy, const c10::optional<at::Tensor> & grad_cy, int64_t mode, int64_t hidden_size, int64_t proj_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, at::IntArrayRef batch_sizes, const c10::optional<at::Tensor> & dropout_state, const at::Tensor & reserve, ::std::array<bool,4> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::TensorList out3) {
+        return at::_ops::_cudnn_rnn_backward_out::redispatch(dispatchKeySet, input, weight, weight_stride0, weight_buf, hx, cx, output, grad_output, grad_hy, grad_cy, mode, hidden_size, proj_size, num_layers, batch_first, dropout, train, bidirectional, c10::fromIntArrayRefSlow(batch_sizes), dropout_state, reserve, output_mask, out0, out1, out2, out3);
+    }
+    
+    // aten::_cudnn_rnn_backward.out(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, SymInt[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!)[] out3) -> ()
+    inline void _cudnn_rnn_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::TensorList out3, const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const at::Tensor & weight_buf, const at::Tensor & hx, const c10::optional<at::Tensor> & cx, const at::Tensor & output, const c10::optional<at::Tensor> & grad_output, const c10::optional<at::Tensor> & grad_hy, const c10::optional<at::Tensor> & grad_cy, int64_t mode, c10::SymInt hidden_size, c10::SymInt proj_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, c10::SymIntArrayRef batch_sizes, const c10::optional<at::Tensor> & dropout_state, const at::Tensor & reserve, ::std::array<bool,4> output_mask) {
+        return at::_ops::_cudnn_rnn_backward_out::redispatch(dispatchKeySet, input, weight, weight_stride0, weight_buf, hx, cx, output, grad_output, grad_hy, grad_cy, mode, hidden_size, proj_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve, output_mask, out0, out1, out2, out3);
+    }
+    
+    // aten::_cudnn_rnn_backward.out(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, SymInt[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!)[] out3) -> ()
+    inline void _cudnn_rnn_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const at::Tensor & weight_buf, const at::Tensor & hx, const c10::optional<at::Tensor> & cx, const at::Tensor & output, const c10::optional<at::Tensor> & grad_output, const c10::optional<at::Tensor> & grad_hy, const c10::optional<at::Tensor> & grad_cy, int64_t mode, c10::SymInt hidden_size, c10::SymInt proj_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, c10::SymIntArrayRef batch_sizes, const c10::optional<at::Tensor> & dropout_state, const at::Tensor & reserve, ::std::array<bool,4> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::TensorList out3) {
+        return at::_ops::_cudnn_rnn_backward_out::redispatch(dispatchKeySet, input, weight, weight_stride0, weight_buf, hx, cx, output, grad_output, grad_hy, grad_cy, mode, hidden_size, proj_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve, output_mask, out0, out1, out2, out3);
+    }
+    
+    // aten::_cudnn_init_dropout_state.out(float dropout, bool train, int dropout_seed, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _cudnn_init_dropout_state_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, double dropout, bool train, int64_t dropout_seed) {
+        return at::_ops::_cudnn_init_dropout_state_out::redispatch(dispatchKeySet, dropout, train, dropout_seed, out);
+    }
+    
+    // aten::_cudnn_init_dropout_state.out(float dropout, bool train, int dropout_seed, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _cudnn_init_dropout_state_outf(c10::DispatchKeySet dispatchKeySet, double dropout, bool train, int64_t dropout_seed, at::Tensor & out) {
+        return at::_ops::_cudnn_init_dropout_state_out::redispatch(dispatchKeySet, dropout, train, dropout_seed, out);
+    }
+    
+    // aten::_fused_dropout.out(Tensor self, float p, Generator? generator=None, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> _fused_dropout_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, const at::Tensor & self, double p, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::_fused_dropout_out::redispatch(dispatchKeySet, self, p, generator, out0, out1);
+    }
+    
+    // aten::_fused_dropout.out(Tensor self, float p, Generator? generator=None, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> _fused_dropout_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double p, c10::optional<at::Generator> generator, at::Tensor & out0, at::Tensor & out1) {
+        return at::_ops::_fused_dropout_out::redispatch(dispatchKeySet, self, p, generator, out0, out1);
+    }
+    
+    // aten::_masked_scale.out(Tensor self, Tensor mask, float scale, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _masked_scale_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & mask, double scale) {
+        return at::_ops::_masked_scale_out::redispatch(dispatchKeySet, self, mask, scale, out);
+    }
+    
+    // aten::_masked_scale.out(Tensor self, Tensor mask, float scale, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _masked_scale_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mask, double scale, at::Tensor & out) {
+        return at::_ops::_masked_scale_out::redispatch(dispatchKeySet, self, mask, scale, out);
+    }
+    
+    // aten::native_dropout.out(Tensor input, float p, bool? train, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> native_dropout_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, const at::Tensor & input, double p, c10::optional<bool> train) {
+        return at::_ops::native_dropout_out::redispatch(dispatchKeySet, input, p, train, out0, out1);
+    }
+    
+    // aten::native_dropout.out(Tensor input, float p, bool? train, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> native_dropout_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, double p, c10::optional<bool> train, at::Tensor & out0, at::Tensor & out1) {
+        return at::_ops::native_dropout_out::redispatch(dispatchKeySet, input, p, train, out0, out1);
+    }
+    
+    // aten::native_dropout_backward.out(Tensor grad_output, Tensor mask, float scale, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & native_dropout_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad_output, const at::Tensor & mask, double scale) {
+        return at::_ops::native_dropout_backward_out::redispatch(dispatchKeySet, grad_output, mask, scale, out);
+    }
+    
+    // aten::native_dropout_backward.out(Tensor grad_output, Tensor mask, float scale, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & native_dropout_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & mask, double scale, at::Tensor & out) {
+        return at::_ops::native_dropout_backward_out::redispatch(dispatchKeySet, grad_output, mask, scale, out);
+    }
+    
+    // aten::_conj_physical.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _conj_physical_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::_conj_physical_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_conj_physical.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _conj_physical_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::_conj_physical_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_add_relu.Scalar_out(Tensor self, Scalar other, Scalar alpha=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _add_relu_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha=1) {
+        return at::_ops::_add_relu_Scalar_out::redispatch(dispatchKeySet, self, other, alpha, out);
+    }
+    
+    // aten::_add_relu.Scalar_out(Tensor self, Scalar other, Scalar alpha=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _add_relu_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha, at::Tensor & out) {
+        return at::_ops::_add_relu_Scalar_out::redispatch(dispatchKeySet, self, other, alpha, out);
+    }
+    
+    // aten::add.Scalar_out(Tensor self, Scalar other, Scalar alpha=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & add_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha=1) {
+        return at::_ops::add_Scalar_out::redispatch(dispatchKeySet, self, other, alpha, out);
+    }
+    
+    // aten::add.Scalar_out(Tensor self, Scalar other, Scalar alpha=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & add_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha, at::Tensor & out) {
+        return at::_ops::add_Scalar_out::redispatch(dispatchKeySet, self, other, alpha, out);
+    }
+    
+    // aten::affine_grid_generator.out(Tensor theta, SymInt[] size, bool align_corners, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & affine_grid_generator_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & theta, at::IntArrayRef size, bool align_corners) {
+        return at::_ops::affine_grid_generator_out::redispatch(dispatchKeySet, theta, c10::fromIntArrayRefSlow(size), align_corners, out);
+    }
+    
+    // aten::affine_grid_generator.out(Tensor theta, SymInt[] size, bool align_corners, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & affine_grid_generator_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & theta, at::IntArrayRef size, bool align_corners, at::Tensor & out) {
+        return at::_ops::affine_grid_generator_out::redispatch(dispatchKeySet, theta, c10::fromIntArrayRefSlow(size), align_corners, out);
+    }
+    
+    // aten::affine_grid_generator.out(Tensor theta, SymInt[] size, bool align_corners, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & affine_grid_generator_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & theta, c10::SymIntArrayRef size, bool align_corners) {
+        return at::_ops::affine_grid_generator_out::redispatch(dispatchKeySet, theta, size, align_corners, out);
+    }
+    
+    // aten::affine_grid_generator.out(Tensor theta, SymInt[] size, bool align_corners, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & affine_grid_generator_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & theta, c10::SymIntArrayRef size, bool align_corners, at::Tensor & out) {
+        return at::_ops::affine_grid_generator_out::redispatch(dispatchKeySet, theta, size, align_corners, out);
+    }
+    
+    // aten::_test_functorch_fallback.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _test_functorch_fallback_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::_test_functorch_fallback_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::_test_functorch_fallback.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _test_functorch_fallback_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::_test_functorch_fallback_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::bartlett_window.out(int window_length, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bartlett_window_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, int64_t window_length) {
+        return at::_ops::bartlett_window_out::redispatch(dispatchKeySet, window_length, out);
+    }
+    
+    // aten::bartlett_window.out(int window_length, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bartlett_window_outf(c10::DispatchKeySet dispatchKeySet, int64_t window_length, at::Tensor & out) {
+        return at::_ops::bartlett_window_out::redispatch(dispatchKeySet, window_length, out);
+    }
+    
+    // aten::bartlett_window.periodic_out(int window_length, bool periodic, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bartlett_window_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, int64_t window_length, bool periodic) {
+        return at::_ops::bartlett_window_periodic_out::redispatch(dispatchKeySet, window_length, periodic, out);
+    }
+    
+    // aten::bartlett_window.periodic_out(int window_length, bool periodic, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bartlett_window_outf(c10::DispatchKeySet dispatchKeySet, int64_t window_length, bool periodic, at::Tensor & out) {
+        return at::_ops::bartlett_window_periodic_out::redispatch(dispatchKeySet, window_length, periodic, out);
+    }
+    
+    // aten::quantized_batch_norm.out(Tensor input, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & quantized_batch_norm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, const at::Tensor & mean, const at::Tensor & var, double eps, double output_scale, int64_t output_zero_point) {
+        return at::_ops::quantized_batch_norm_out::redispatch(dispatchKeySet, input, weight, bias, mean, var, eps, output_scale, output_zero_point, out);
+    }
+    
+    // aten::quantized_batch_norm.out(Tensor input, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & quantized_batch_norm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, const at::Tensor & mean, const at::Tensor & var, double eps, double output_scale, int64_t output_zero_point, at::Tensor & out) {
+        return at::_ops::quantized_batch_norm_out::redispatch(dispatchKeySet, input, weight, bias, mean, var, eps, output_scale, output_zero_point, out);
+    }
+    
+    // aten::bernoulli.Tensor_out(Tensor self, Tensor p, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bernoulli_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & p, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::bernoulli_Tensor_out::redispatch(dispatchKeySet, self, p, generator, out);
+    }
+    
+    // aten::bernoulli.Tensor_out(Tensor self, Tensor p, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bernoulli_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & p, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::bernoulli_Tensor_out::redispatch(dispatchKeySet, self, p, generator, out);
+    }
+    
+    // aten::bernoulli.Tensor(Tensor self, Tensor p, *, Generator? generator=None) -> Tensor
+    inline at::Tensor bernoulli(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & p, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::bernoulli_Tensor::redispatch(dispatchKeySet, self, p, generator);
+    }
+    
+    // aten::bernoulli.float_out(Tensor self, float p=0.5, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bernoulli_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, double p=0.5, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::bernoulli_float_out::redispatch(dispatchKeySet, self, p, generator, out);
+    }
+    
+    // aten::bernoulli.float_out(Tensor self, float p=0.5, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bernoulli_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double p, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::bernoulli_float_out::redispatch(dispatchKeySet, self, p, generator, out);
+    }
+    
+    // aten::binary_cross_entropy_with_logits.out(Tensor self, Tensor target, Tensor? weight=None, Tensor? pos_weight=None, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & binary_cross_entropy_with_logits_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight={}, const c10::optional<at::Tensor> & pos_weight={}, int64_t reduction=at::Reduction::Mean) {
+        return at::_ops::binary_cross_entropy_with_logits_out::redispatch(dispatchKeySet, self, target, weight, pos_weight, reduction, out);
+    }
+    
+    // aten::binary_cross_entropy_with_logits.out(Tensor self, Tensor target, Tensor? weight=None, Tensor? pos_weight=None, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & binary_cross_entropy_with_logits_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & pos_weight, int64_t reduction, at::Tensor & out) {
+        return at::_ops::binary_cross_entropy_with_logits_out::redispatch(dispatchKeySet, self, target, weight, pos_weight, reduction, out);
+    }
+    
+    // aten::bincount.out(Tensor self, Tensor? weights=None, int minlength=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bincount_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const c10::optional<at::Tensor> & weights={}, int64_t minlength=0) {
+        return at::_ops::bincount_out::redispatch(dispatchKeySet, self, weights, minlength, out);
+    }
+    
+    // aten::bincount.out(Tensor self, Tensor? weights=None, int minlength=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bincount_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Tensor> & weights, int64_t minlength, at::Tensor & out) {
+        return at::_ops::bincount_out::redispatch(dispatchKeySet, self, weights, minlength, out);
+    }
+    
+    // aten::blackman_window.out(int window_length, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & blackman_window_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, int64_t window_length) {
+        return at::_ops::blackman_window_out::redispatch(dispatchKeySet, window_length, out);
+    }
+    
+    // aten::blackman_window.out(int window_length, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & blackman_window_outf(c10::DispatchKeySet dispatchKeySet, int64_t window_length, at::Tensor & out) {
+        return at::_ops::blackman_window_out::redispatch(dispatchKeySet, window_length, out);
+    }
+    
+    // aten::blackman_window.periodic_out(int window_length, bool periodic, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & blackman_window_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, int64_t window_length, bool periodic) {
+        return at::_ops::blackman_window_periodic_out::redispatch(dispatchKeySet, window_length, periodic, out);
+    }
+    
+    // aten::blackman_window.periodic_out(int window_length, bool periodic, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & blackman_window_outf(c10::DispatchKeySet dispatchKeySet, int64_t window_length, bool periodic, at::Tensor & out) {
+        return at::_ops::blackman_window_periodic_out::redispatch(dispatchKeySet, window_length, periodic, out);
+    }
+    
+    // aten::block_diag.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & block_diag_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::TensorList tensors) {
+        return at::_ops::block_diag_out::redispatch(dispatchKeySet, tensors, out);
+    }
+    
+    // aten::block_diag.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & block_diag_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, at::Tensor & out) {
+        return at::_ops::block_diag_out::redispatch(dispatchKeySet, tensors, out);
+    }
+    
+    // aten::constant_pad_nd.out(Tensor self, SymInt[] pad, Scalar value=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & constant_pad_nd_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef pad, const at::Scalar & value=0) {
+        return at::_ops::constant_pad_nd_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(pad), value, out);
+    }
+    
+    // aten::constant_pad_nd.out(Tensor self, SymInt[] pad, Scalar value=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & constant_pad_nd_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef pad, const at::Scalar & value, at::Tensor & out) {
+        return at::_ops::constant_pad_nd_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(pad), value, out);
+    }
+    
+    // aten::constant_pad_nd.out(Tensor self, SymInt[] pad, Scalar value=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & constant_pad_nd_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef pad, const at::Scalar & value=0) {
+        return at::_ops::constant_pad_nd_out::redispatch(dispatchKeySet, self, pad, value, out);
+    }
+    
+    // aten::constant_pad_nd.out(Tensor self, SymInt[] pad, Scalar value=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & constant_pad_nd_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef pad, const at::Scalar & value, at::Tensor & out) {
+        return at::_ops::constant_pad_nd_out::redispatch(dispatchKeySet, self, pad, value, out);
+    }
+    
+    // aten::convolution.out(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & convolution_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups) {
+        return at::_ops::convolution_out::redispatch(dispatchKeySet, input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, out);
+    }
+    
+    // aten::convolution.out(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & convolution_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, at::Tensor & out) {
+        return at::_ops::convolution_out::redispatch(dispatchKeySet, input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, out);
+    }
+    
+    // aten::convolution.out(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & convolution_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups) {
+        return at::_ops::convolution_out::redispatch(dispatchKeySet, input, weight, bias, stride, padding, dilation, transposed, output_padding, groups, out);
+    }
+    
+    // aten::convolution.out(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & convolution_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, at::Tensor & out) {
+        return at::_ops::convolution_out::redispatch(dispatchKeySet, input, weight, bias, stride, padding, dilation, transposed, output_padding, groups, out);
+    }
+    
+    // aten::convolution_backward.out(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> convolution_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalIntArrayRef bias_sizes, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, ::std::array<bool,3> output_mask) {
+        return at::_ops::convolution_backward_out::redispatch(dispatchKeySet, grad_output, input, weight, bias_sizes.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*bias_sizes)) : c10::nullopt, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, output_mask, out0, out1, out2);
+    }
+    
+    // aten::convolution_backward.out(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> convolution_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalIntArrayRef bias_sizes, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+        return at::_ops::convolution_backward_out::redispatch(dispatchKeySet, grad_output, input, weight, bias_sizes.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*bias_sizes)) : c10::nullopt, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, output_mask, out0, out1, out2);
+    }
+    
+    // aten::convolution_backward.out(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> convolution_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalSymIntArrayRef bias_sizes, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array<bool,3> output_mask) {
+        return at::_ops::convolution_backward_out::redispatch(dispatchKeySet, grad_output, input, weight, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, output_mask, out0, out1, out2);
+    }
+    
+    // aten::convolution_backward.out(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> convolution_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalSymIntArrayRef bias_sizes, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+        return at::_ops::convolution_backward_out::redispatch(dispatchKeySet, grad_output, input, weight, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, output_mask, out0, out1, out2);
+    }
+    
+    // aten::convolution_overrideable.out(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & convolution_overrideable_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups) {
+        return at::_ops::convolution_overrideable_out::redispatch(dispatchKeySet, input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, out);
+    }
+    
+    // aten::convolution_overrideable.out(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & convolution_overrideable_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, at::Tensor & out) {
+        return at::_ops::convolution_overrideable_out::redispatch(dispatchKeySet, input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, out);
+    }
+    
+    // aten::convolution_overrideable.out(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & convolution_overrideable_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups) {
+        return at::_ops::convolution_overrideable_out::redispatch(dispatchKeySet, input, weight, bias, stride, padding, dilation, transposed, output_padding, groups, out);
+    }
+    
+    // aten::convolution_overrideable.out(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & convolution_overrideable_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, at::Tensor & out) {
+        return at::_ops::convolution_overrideable_out::redispatch(dispatchKeySet, input, weight, bias, stride, padding, dilation, transposed, output_padding, groups, out);
+    }
+    
+    // aten::convolution_backward_overrideable.out(Tensor grad_output, Tensor input, Tensor weight, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> convolution_backward_overrideable_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, ::std::array<bool,3> output_mask) {
+        return at::_ops::convolution_backward_overrideable_out::redispatch(dispatchKeySet, grad_output, input, weight, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, output_mask, out0, out1, out2);
+    }
+    
+    // aten::convolution_backward_overrideable.out(Tensor grad_output, Tensor input, Tensor weight, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> convolution_backward_overrideable_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+        return at::_ops::convolution_backward_overrideable_out::redispatch(dispatchKeySet, grad_output, input, weight, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, output_mask, out0, out1, out2);
+    }
+    
+    // aten::convolution_backward_overrideable.out(Tensor grad_output, Tensor input, Tensor weight, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> convolution_backward_overrideable_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array<bool,3> output_mask) {
+        return at::_ops::convolution_backward_overrideable_out::redispatch(dispatchKeySet, grad_output, input, weight, stride, padding, dilation, transposed, output_padding, groups, output_mask, out0, out1, out2);
+    }
+    
+    // aten::convolution_backward_overrideable.out(Tensor grad_output, Tensor input, Tensor weight, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> convolution_backward_overrideable_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+        return at::_ops::convolution_backward_overrideable_out::redispatch(dispatchKeySet, grad_output, input, weight, stride, padding, dilation, transposed, output_padding, groups, output_mask, out0, out1, out2);
+    }
+    
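+    // The wrappers in this section follow a common pattern: `*_out` overloads take the output
+    // tensor(s) first and keep the schema defaults, `*_outf` overloads take the output
+    // tensor(s) last with every argument spelled out, and, where the schema is declared with
+    // SymInt arguments, additional `*_symint_*` overloads accept c10::SymInt /
+    // c10::SymIntArrayRef directly while the plain overloads convert their at::IntArrayRef
+    // arguments via c10::fromIntArrayRefSlow before redispatching to the same _ops entry.
+    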
+    // aten::_convolution.out(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _convolution_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) {
+        return at::_ops::_convolution_out::redispatch(dispatchKeySet, input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, benchmark, deterministic, cudnn_enabled, allow_tf32, out);
+    }
+    
+    // aten::_convolution.out(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _convolution_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32, at::Tensor & out) {
+        return at::_ops::_convolution_out::redispatch(dispatchKeySet, input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, benchmark, deterministic, cudnn_enabled, allow_tf32, out);
+    }
+    
+    // aten::_convolution.out(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _convolution_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) {
+        return at::_ops::_convolution_out::redispatch(dispatchKeySet, input, weight, bias, stride, padding, dilation, transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled, allow_tf32, out);
+    }
+    
+    // aten::_convolution.out(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _convolution_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32, at::Tensor & out) {
+        return at::_ops::_convolution_out::redispatch(dispatchKeySet, input, weight, bias, stride, padding, dilation, transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled, allow_tf32, out);
+    }
+    
+    // aten::conv_tbc.out(Tensor self, Tensor weight, Tensor bias, int pad=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & conv_tbc_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, const at::Tensor & bias, int64_t pad=0) {
+        return at::_ops::conv_tbc_out::redispatch(dispatchKeySet, self, weight, bias, pad, out);
+    }
+    
+    // aten::conv_tbc.out(Tensor self, Tensor weight, Tensor bias, int pad=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & conv_tbc_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const at::Tensor & bias, int64_t pad, at::Tensor & out) {
+        return at::_ops::conv_tbc_out::redispatch(dispatchKeySet, self, weight, bias, pad, out);
+    }
+    
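+    // Illustrative call comparison for the conv_tbc pair above (ks, out, self, weight and
+    // bias are hypothetical caller-provided values):
+    //   conv_tbc_out(ks, out, self, weight, bias);             // schema default pad=0 applies
+    //   conv_tbc_outf(ks, self, weight, bias, /*pad=*/0, out); // every argument is explicit
+    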
+    // aten::copy.out(Tensor self, Tensor src, bool non_blocking=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & src, bool non_blocking=false) {
+        return at::_ops::copy_out::redispatch(dispatchKeySet, self, src, non_blocking, out);
+    }
+    
+    // aten::copy.out(Tensor self, Tensor src, bool non_blocking=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & src, bool non_blocking, at::Tensor & out) {
+        return at::_ops::copy_out::redispatch(dispatchKeySet, self, src, non_blocking, out);
+    }
+    
+    // aten::_copy_from.out(Tensor self, Tensor dst, bool non_blocking=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _copy_from_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & dst, bool non_blocking=false) {
+        return at::_ops::_copy_from_out::redispatch(dispatchKeySet, self, dst, non_blocking, out);
+    }
+    
+    // aten::_copy_from.out(Tensor self, Tensor dst, bool non_blocking=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _copy_from_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & dst, bool non_blocking, at::Tensor & out) {
+        return at::_ops::_copy_from_out::redispatch(dispatchKeySet, self, dst, non_blocking, out);
+    }
+    
+    // aten::_copy_from_and_resize.out(Tensor self, Tensor dst, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _copy_from_and_resize_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & dst) {
+        return at::_ops::_copy_from_and_resize_out::redispatch(dispatchKeySet, self, dst, out);
+    }
+    
+    // aten::_copy_from_and_resize.out(Tensor self, Tensor dst, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _copy_from_and_resize_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & dst, at::Tensor & out) {
+        return at::_ops::_copy_from_and_resize_out::redispatch(dispatchKeySet, self, dst, out);
+    }
+    
+    // aten::count_nonzero.dim_IntList_out(Tensor self, int[] dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & count_nonzero_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef dim) {
+        return at::_ops::count_nonzero_dim_IntList_out::redispatch(dispatchKeySet, self, dim, out);
+    }
+    
+    // aten::count_nonzero.dim_IntList_out(Tensor self, int[] dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & count_nonzero_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, at::Tensor & out) {
+        return at::_ops::count_nonzero_dim_IntList_out::redispatch(dispatchKeySet, self, dim, out);
+    }
+    
+    // aten::count_nonzero.out(Tensor self, int? dim=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & count_nonzero_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<int64_t> dim=c10::nullopt) {
+        return at::_ops::count_nonzero_out::redispatch(dispatchKeySet, self, dim, out);
+    }
+    
+    // aten::count_nonzero.out(Tensor self, int? dim=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & count_nonzero_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<int64_t> dim, at::Tensor & out) {
+        return at::_ops::count_nonzero_out::redispatch(dispatchKeySet, self, dim, out);
+    }
+    
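+    // Note that the two count_nonzero_out/count_nonzero_outf pairs above are C++ overloads of
+    // the same names: the at::IntArrayRef form redispatches to count_nonzero_dim_IntList_out,
+    // while the optional single-dim form redispatches to count_nonzero_out.
+    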
+    // aten::cudnn_affine_grid_generator.out(Tensor theta, int N, int C, int H, int W, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cudnn_affine_grid_generator_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & theta, int64_t N, int64_t C, int64_t H, int64_t W) {
+        return at::_ops::cudnn_affine_grid_generator_out::redispatch(dispatchKeySet, theta, N, C, H, W, out);
+    }
+    
+    // aten::cudnn_affine_grid_generator.out(Tensor theta, int N, int C, int H, int W, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cudnn_affine_grid_generator_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & theta, int64_t N, int64_t C, int64_t H, int64_t W, at::Tensor & out) {
+        return at::_ops::cudnn_affine_grid_generator_out::redispatch(dispatchKeySet, theta, N, C, H, W, out);
+    }
+    
+    // aten::cudnn_affine_grid_generator_backward.out(Tensor grad, int N, int C, int H, int W, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cudnn_affine_grid_generator_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad, int64_t N, int64_t C, int64_t H, int64_t W) {
+        return at::_ops::cudnn_affine_grid_generator_backward_out::redispatch(dispatchKeySet, grad, N, C, H, W, out);
+    }
+    
+    // aten::cudnn_affine_grid_generator_backward.out(Tensor grad, int N, int C, int H, int W, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cudnn_affine_grid_generator_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, int64_t N, int64_t C, int64_t H, int64_t W, at::Tensor & out) {
+        return at::_ops::cudnn_affine_grid_generator_backward_out::redispatch(dispatchKeySet, grad, N, C, H, W, out);
+    }
+    
+    // aten::cudnn_batch_norm.out(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> cudnn_batch_norm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, bool training, double exponential_average_factor, double epsilon) {
+        return at::_ops::cudnn_batch_norm_out::redispatch(dispatchKeySet, input, weight, bias, running_mean, running_var, training, exponential_average_factor, epsilon, out0, out1, out2, out3);
+    }
+    
+    // aten::cudnn_batch_norm.out(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> cudnn_batch_norm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, bool training, double exponential_average_factor, double epsilon, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3) {
+        return at::_ops::cudnn_batch_norm_out::redispatch(dispatchKeySet, input, weight, bias, running_mean, running_var, training, exponential_average_factor, epsilon, out0, out1, out2, out3);
+    }
+    
+    // aten::cudnn_batch_norm_backward.out(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, float epsilon, Tensor reserveSpace, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> cudnn_batch_norm_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & input, const at::Tensor & grad_output, const at::Tensor & weight, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, const c10::optional<at::Tensor> & save_mean, const c10::optional<at::Tensor> & save_var, double epsilon, const at::Tensor & reserveSpace) {
+        return at::_ops::cudnn_batch_norm_backward_out::redispatch(dispatchKeySet, input, grad_output, weight, running_mean, running_var, save_mean, save_var, epsilon, reserveSpace, out0, out1, out2);
+    }
+    
+    // aten::cudnn_batch_norm_backward.out(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, float epsilon, Tensor reserveSpace, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> cudnn_batch_norm_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & grad_output, const at::Tensor & weight, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, const c10::optional<at::Tensor> & save_mean, const c10::optional<at::Tensor> & save_var, double epsilon, const at::Tensor & reserveSpace, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+        return at::_ops::cudnn_batch_norm_backward_out::redispatch(dispatchKeySet, input, grad_output, weight, running_mean, running_var, save_mean, save_var, epsilon, reserveSpace, out0, out1, out2);
+    }
+    
+    // aten::cudnn_convolution_transpose.out(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cudnn_convolution_transpose_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic, bool allow_tf32) {
+        return at::_ops::cudnn_convolution_transpose_out::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, benchmark, deterministic, allow_tf32, out);
+    }
+    
+    // aten::cudnn_convolution_transpose.out(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cudnn_convolution_transpose_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic, bool allow_tf32, at::Tensor & out) {
+        return at::_ops::cudnn_convolution_transpose_out::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, benchmark, deterministic, allow_tf32, out);
+    }
+    
+    // aten::cudnn_convolution_transpose.out(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cudnn_convolution_transpose_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) {
+        return at::_ops::cudnn_convolution_transpose_out::redispatch(dispatchKeySet, self, weight, padding, output_padding, stride, dilation, groups, benchmark, deterministic, allow_tf32, out);
+    }
+    
+    // aten::cudnn_convolution_transpose.out(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cudnn_convolution_transpose_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, bool allow_tf32, at::Tensor & out) {
+        return at::_ops::cudnn_convolution_transpose_out::redispatch(dispatchKeySet, self, weight, padding, output_padding, stride, dilation, groups, benchmark, deterministic, allow_tf32, out);
+    }
+    
+    // aten::_mps_convolution_transpose.out(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _mps_convolution_transpose_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups) {
+        return at::_ops::_mps_convolution_transpose_out::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, out);
+    }
+    
+    // aten::_mps_convolution_transpose.out(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _mps_convolution_transpose_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, at::Tensor & out) {
+        return at::_ops::_mps_convolution_transpose_out::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, out);
+    }
+    
+    // aten::_mps_convolution_transpose.out(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _mps_convolution_transpose_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups) {
+        return at::_ops::_mps_convolution_transpose_out::redispatch(dispatchKeySet, self, weight, padding, output_padding, stride, dilation, groups, out);
+    }
+    
+    // aten::_mps_convolution_transpose.out(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _mps_convolution_transpose_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, at::Tensor & out) {
+        return at::_ops::_mps_convolution_transpose_out::redispatch(dispatchKeySet, self, weight, padding, output_padding, stride, dilation, groups, out);
+    }
+    
+    // aten::mps_convolution_transpose_backward.out(Tensor self, Tensor grad_output, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool[2] output_mask, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> mps_convolution_transpose_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, ::std::array<bool,2> output_mask) {
+        return at::_ops::mps_convolution_transpose_backward_out::redispatch(dispatchKeySet, self, grad_output, weight, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, output_mask, out0, out1);
+    }
+    
+    // aten::mps_convolution_transpose_backward.out(Tensor self, Tensor grad_output, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool[2] output_mask, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> mps_convolution_transpose_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, ::std::array<bool,2> output_mask, at::Tensor & out0, at::Tensor & out1) {
+        return at::_ops::mps_convolution_transpose_backward_out::redispatch(dispatchKeySet, self, grad_output, weight, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, output_mask, out0, out1);
+    }
+    
+    // aten::mps_convolution_transpose_backward.out(Tensor self, Tensor grad_output, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool[2] output_mask, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> mps_convolution_transpose_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, ::std::array<bool,2> output_mask) {
+        return at::_ops::mps_convolution_transpose_backward_out::redispatch(dispatchKeySet, self, grad_output, weight, padding, output_padding, stride, dilation, groups, output_mask, out0, out1);
+    }
+    
+    // aten::mps_convolution_transpose_backward.out(Tensor self, Tensor grad_output, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool[2] output_mask, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> mps_convolution_transpose_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, ::std::array<bool,2> output_mask, at::Tensor & out0, at::Tensor & out1) {
+        return at::_ops::mps_convolution_transpose_backward_out::redispatch(dispatchKeySet, self, grad_output, weight, padding, output_padding, stride, dilation, groups, output_mask, out0, out1);
+    }
+    
+    // aten::cudnn_convolution_relu.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cudnn_convolution_relu_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups) {
+        return at::_ops::cudnn_convolution_relu_out::redispatch(dispatchKeySet, self, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), groups, out);
+    }
+    
+    // aten::cudnn_convolution_relu.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cudnn_convolution_relu_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups, at::Tensor & out) {
+        return at::_ops::cudnn_convolution_relu_out::redispatch(dispatchKeySet, self, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), groups, out);
+    }
+    
+    // aten::cudnn_convolution_relu.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cudnn_convolution_relu_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups) {
+        return at::_ops::cudnn_convolution_relu_out::redispatch(dispatchKeySet, self, weight, bias, stride, padding, dilation, groups, out);
+    }
+    
+    // aten::cudnn_convolution_relu.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cudnn_convolution_relu_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups, at::Tensor & out) {
+        return at::_ops::cudnn_convolution_relu_out::redispatch(dispatchKeySet, self, weight, bias, stride, padding, dilation, groups, out);
+    }
+    
+    // aten::cudnn_convolution_add_relu.out(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cudnn_convolution_add_relu_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups) {
+        return at::_ops::cudnn_convolution_add_relu_out::redispatch(dispatchKeySet, self, weight, z, alpha, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), groups, out);
+    }
+    
+    // aten::cudnn_convolution_add_relu.out(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cudnn_convolution_add_relu_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups, at::Tensor & out) {
+        return at::_ops::cudnn_convolution_add_relu_out::redispatch(dispatchKeySet, self, weight, z, alpha, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), groups, out);
+    }
+    
+    // aten::cudnn_convolution_add_relu.out(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cudnn_convolution_add_relu_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups) {
+        return at::_ops::cudnn_convolution_add_relu_out::redispatch(dispatchKeySet, self, weight, z, alpha, bias, stride, padding, dilation, groups, out);
+    }
+    
+    // aten::cudnn_convolution_add_relu.out(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cudnn_convolution_add_relu_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups, at::Tensor & out) {
+        return at::_ops::cudnn_convolution_add_relu_out::redispatch(dispatchKeySet, self, weight, z, alpha, bias, stride, padding, dilation, groups, out);
+    }
+    
+    // aten::cudnn_grid_sampler.out(Tensor self, Tensor grid, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cudnn_grid_sampler_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & grid) {
+        return at::_ops::cudnn_grid_sampler_out::redispatch(dispatchKeySet, self, grid, out);
+    }
+    
+    // aten::cudnn_grid_sampler.out(Tensor self, Tensor grid, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cudnn_grid_sampler_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & grid, at::Tensor & out) {
+        return at::_ops::cudnn_grid_sampler_out::redispatch(dispatchKeySet, self, grid, out);
+    }
+    
+    // aten::cudnn_grid_sampler_backward.out(Tensor self, Tensor grid, Tensor grad_output, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> cudnn_grid_sampler_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, const at::Tensor & self, const at::Tensor & grid, const at::Tensor & grad_output) {
+        return at::_ops::cudnn_grid_sampler_backward_out::redispatch(dispatchKeySet, self, grid, grad_output, out0, out1);
+    }
+    
+    // aten::cudnn_grid_sampler_backward.out(Tensor self, Tensor grid, Tensor grad_output, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> cudnn_grid_sampler_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & grid, const at::Tensor & grad_output, at::Tensor & out0, at::Tensor & out1) {
+        return at::_ops::cudnn_grid_sampler_backward_out::redispatch(dispatchKeySet, self, grid, grad_output, out0, out1);
+    }
+    
+    // aten::_ctc_loss.out(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, bool zero_infinity=False, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> _ctc_loss_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank=0, bool zero_infinity=false) {
+        return at::_ops::_ctc_loss_out::redispatch(dispatchKeySet, log_probs, targets, input_lengths, target_lengths, blank, zero_infinity, out0, out1);
+    }
+    
+    // aten::_ctc_loss.out(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, bool zero_infinity=False, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> _ctc_loss_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank, bool zero_infinity, at::Tensor & out0, at::Tensor & out1) {
+        return at::_ops::_ctc_loss_out::redispatch(dispatchKeySet, log_probs, targets, input_lengths, target_lengths, blank, zero_infinity, out0, out1);
+    }
+    
+    // aten::_ctc_loss.Tensor_out(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank=0, bool zero_infinity=False, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> _ctc_loss_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, const at::Tensor & log_probs, const at::Tensor & targets, const at::Tensor & input_lengths, const at::Tensor & target_lengths, int64_t blank=0, bool zero_infinity=false) {
+        return at::_ops::_ctc_loss_Tensor_out::redispatch(dispatchKeySet, log_probs, targets, input_lengths, target_lengths, blank, zero_infinity, out0, out1);
+    }
+    
+    // aten::_ctc_loss.Tensor_out(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank=0, bool zero_infinity=False, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> _ctc_loss_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & log_probs, const at::Tensor & targets, const at::Tensor & input_lengths, const at::Tensor & target_lengths, int64_t blank, bool zero_infinity, at::Tensor & out0, at::Tensor & out1) {
+        return at::_ops::_ctc_loss_Tensor_out::redispatch(dispatchKeySet, log_probs, targets, input_lengths, target_lengths, blank, zero_infinity, out0, out1);
+    }
+    
+    // aten::_ctc_loss_backward.out(Tensor grad, Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, bool zero_infinity=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _ctc_loss_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad, const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, const at::Tensor & neg_log_likelihood, const at::Tensor & log_alpha, int64_t blank, bool zero_infinity=false) {
+        return at::_ops::_ctc_loss_backward_out::redispatch(dispatchKeySet, grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, blank, zero_infinity, out);
+    }
+    
+    // aten::_ctc_loss_backward.out(Tensor grad, Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, bool zero_infinity=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _ctc_loss_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, const at::Tensor & neg_log_likelihood, const at::Tensor & log_alpha, int64_t blank, bool zero_infinity, at::Tensor & out) {
+        return at::_ops::_ctc_loss_backward_out::redispatch(dispatchKeySet, grad, log_probs, targets, input_lengths, target_lengths, neg_log_likelihood, log_alpha, blank, zero_infinity, out);
+    }
+    
+    // aten::diag_embed.out(Tensor self, int offset=0, int dim1=-2, int dim2=-1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & diag_embed_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t offset=0, int64_t dim1=-2, int64_t dim2=-1) {
+        return at::_ops::diag_embed_out::redispatch(dispatchKeySet, self, offset, dim1, dim2, out);
+    }
+    
+    // aten::diag_embed.out(Tensor self, int offset=0, int dim1=-2, int dim2=-1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & diag_embed_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t offset, int64_t dim1, int64_t dim2, at::Tensor & out) {
+        return at::_ops::diag_embed_out::redispatch(dispatchKeySet, self, offset, dim1, dim2, out);
+    }
+    
+    // aten::diagonal_backward.out(Tensor grad_output, SymInt[] input_sizes, int offset, int dim1, int dim2, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & diagonal_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad_output, at::IntArrayRef input_sizes, int64_t offset, int64_t dim1, int64_t dim2) {
+        return at::_ops::diagonal_backward_out::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(input_sizes), offset, dim1, dim2, out);
+    }
+    
+    // aten::diagonal_backward.out(Tensor grad_output, SymInt[] input_sizes, int offset, int dim1, int dim2, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & diagonal_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, at::IntArrayRef input_sizes, int64_t offset, int64_t dim1, int64_t dim2, at::Tensor & out) {
+        return at::_ops::diagonal_backward_out::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(input_sizes), offset, dim1, dim2, out);
+    }
+    
+    // aten::diagonal_backward.out(Tensor grad_output, SymInt[] input_sizes, int offset, int dim1, int dim2, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & diagonal_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad_output, c10::SymIntArrayRef input_sizes, int64_t offset, int64_t dim1, int64_t dim2) {
+        return at::_ops::diagonal_backward_out::redispatch(dispatchKeySet, grad_output, input_sizes, offset, dim1, dim2, out);
+    }
+    
+    // aten::diagonal_backward.out(Tensor grad_output, SymInt[] input_sizes, int offset, int dim1, int dim2, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & diagonal_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef input_sizes, int64_t offset, int64_t dim1, int64_t dim2, at::Tensor & out) {
+        return at::_ops::diagonal_backward_out::redispatch(dispatchKeySet, grad_output, input_sizes, offset, dim1, dim2, out);
+    }
+    
+    // aten::div.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & div_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::div_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::div.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & div_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out) {
+        return at::_ops::div_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::div.Scalar_mode_out(Tensor self, Scalar other, *, str? rounding_mode, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & div_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other, c10::optional<c10::string_view> rounding_mode) {
+        return at::_ops::div_Scalar_mode_out::redispatch(dispatchKeySet, self, other, rounding_mode, out);
+    }
+    
+    // aten::div.Scalar_mode_out(Tensor self, Scalar other, *, str? rounding_mode, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & div_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, c10::optional<c10::string_view> rounding_mode, at::Tensor & out) {
+        return at::_ops::div_Scalar_mode_out::redispatch(dispatchKeySet, self, other, rounding_mode, out);
+    }
+    
+    // aten::embedding.out(Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & embedding_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & weight, const at::Tensor & indices, int64_t padding_idx=-1, bool scale_grad_by_freq=false, bool sparse=false) {
+        return at::_ops::embedding_out::redispatch(dispatchKeySet, weight, indices, padding_idx, scale_grad_by_freq, sparse, out);
+    }
+    
+    // aten::embedding.out(Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & embedding_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & weight, const at::Tensor & indices, int64_t padding_idx, bool scale_grad_by_freq, bool sparse, at::Tensor & out) {
+        return at::_ops::embedding_out::redispatch(dispatchKeySet, weight, indices, padding_idx, scale_grad_by_freq, sparse, out);
+    }
+    
+    // aten::embedding.out(Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & embedding_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & weight, const at::Tensor & indices, c10::SymInt padding_idx=-1, bool scale_grad_by_freq=false, bool sparse=false) {
+        return at::_ops::embedding_out::redispatch(dispatchKeySet, weight, indices, padding_idx, scale_grad_by_freq, sparse, out);
+    }
+    
+    // aten::embedding.out(Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & embedding_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & weight, const at::Tensor & indices, c10::SymInt padding_idx, bool scale_grad_by_freq, bool sparse, at::Tensor & out) {
+        return at::_ops::embedding_out::redispatch(dispatchKeySet, weight, indices, padding_idx, scale_grad_by_freq, sparse, out);
+    }
+    
+    // aten::embedding_dense_backward.out(Tensor grad_output, Tensor indices, SymInt num_weights, SymInt padding_idx, bool scale_grad_by_freq, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & embedding_dense_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad_output, const at::Tensor & indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq) {
+        return at::_ops::embedding_dense_backward_out::redispatch(dispatchKeySet, grad_output, indices, num_weights, padding_idx, scale_grad_by_freq, out);
+    }
+    
+    // aten::embedding_dense_backward.out(Tensor grad_output, Tensor indices, SymInt num_weights, SymInt padding_idx, bool scale_grad_by_freq, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & embedding_dense_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq, at::Tensor & out) {
+        return at::_ops::embedding_dense_backward_out::redispatch(dispatchKeySet, grad_output, indices, num_weights, padding_idx, scale_grad_by_freq, out);
+    }
+    
+    // aten::embedding_dense_backward.out(Tensor grad_output, Tensor indices, SymInt num_weights, SymInt padding_idx, bool scale_grad_by_freq, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & embedding_dense_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad_output, const at::Tensor & indices, c10::SymInt num_weights, c10::SymInt padding_idx, bool scale_grad_by_freq) {
+        return at::_ops::embedding_dense_backward_out::redispatch(dispatchKeySet, grad_output, indices, num_weights, padding_idx, scale_grad_by_freq, out);
+    }
+    
+    // aten::embedding_dense_backward.out(Tensor grad_output, Tensor indices, SymInt num_weights, SymInt padding_idx, bool scale_grad_by_freq, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & embedding_dense_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & indices, c10::SymInt num_weights, c10::SymInt padding_idx, bool scale_grad_by_freq, at::Tensor & out) {
+        return at::_ops::embedding_dense_backward_out::redispatch(dispatchKeySet, grad_output, indices, num_weights, padding_idx, scale_grad_by_freq, out);
+    }
+    
+    // aten::embedding_renorm.out(Tensor self, Tensor indices, float max_norm, float norm_type, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & embedding_renorm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & indices, double max_norm, double norm_type) {
+        return at::_ops::embedding_renorm_out::redispatch(dispatchKeySet, self, indices, max_norm, norm_type, out);
+    }
+    
+    // aten::embedding_renorm.out(Tensor self, Tensor indices, float max_norm, float norm_type, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & embedding_renorm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & indices, double max_norm, double norm_type, at::Tensor & out) {
+        return at::_ops::embedding_renorm_out::redispatch(dispatchKeySet, self, indices, max_norm, norm_type, out);
+    }
+    
+    // aten::embedding_renorm(Tensor self, Tensor indices, float max_norm, float norm_type) -> Tensor
+    inline at::Tensor embedding_renorm(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & indices, double max_norm, double norm_type) {
+        return at::_ops::embedding_renorm::redispatch(dispatchKeySet, self, indices, max_norm, norm_type);
+    }
+    
+    // aten::_embedding_bag_forward_only.out(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False, int padding_idx=-1, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> _embedding_bag_forward_only_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, bool scale_grad_by_freq=false, int64_t mode=0, bool sparse=false, const c10::optional<at::Tensor> & per_sample_weights={}, bool include_last_offset=false, int64_t padding_idx=-1) {
+        return at::_ops::_embedding_bag_forward_only_out::redispatch(dispatchKeySet, weight, indices, offsets, scale_grad_by_freq, mode, sparse, per_sample_weights, include_last_offset, padding_idx, out0, out1, out2, out3);
+    }
+    
+    // aten::_embedding_bag_forward_only.out(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False, int padding_idx=-1, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> _embedding_bag_forward_only_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, bool scale_grad_by_freq, int64_t mode, bool sparse, const c10::optional<at::Tensor> & per_sample_weights, bool include_last_offset, int64_t padding_idx, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3) {
+        return at::_ops::_embedding_bag_forward_only_out::redispatch(dispatchKeySet, weight, indices, offsets, scale_grad_by_freq, mode, sparse, per_sample_weights, include_last_offset, padding_idx, out0, out1, out2, out3);
+    }
+    
+    // aten::_embedding_bag.out(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False, int padding_idx=-1, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> _embedding_bag_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, bool scale_grad_by_freq=false, int64_t mode=0, bool sparse=false, const c10::optional<at::Tensor> & per_sample_weights={}, bool include_last_offset=false, int64_t padding_idx=-1) {
+        return at::_ops::_embedding_bag_out::redispatch(dispatchKeySet, weight, indices, offsets, scale_grad_by_freq, mode, sparse, per_sample_weights, include_last_offset, padding_idx, out0, out1, out2, out3);
+    }
+    
+    // aten::_embedding_bag.out(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False, int padding_idx=-1, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> _embedding_bag_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, bool scale_grad_by_freq, int64_t mode, bool sparse, const c10::optional<at::Tensor> & per_sample_weights, bool include_last_offset, int64_t padding_idx, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3) {
+        return at::_ops::_embedding_bag_out::redispatch(dispatchKeySet, weight, indices, offsets, scale_grad_by_freq, mode, sparse, per_sample_weights, include_last_offset, padding_idx, out0, out1, out2, out3);
+    }
+    
+    // aten::_embedding_bag_dense_backward.out(Tensor grad, Tensor indices, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _embedding_bag_dense_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const c10::optional<at::Tensor> & per_sample_weights, int64_t padding_idx=-1) {
+        return at::_ops::_embedding_bag_dense_backward_out::redispatch(dispatchKeySet, grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx, out);
+    }
+    
+    // aten::_embedding_bag_dense_backward.out(Tensor grad, Tensor indices, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _embedding_bag_dense_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const c10::optional<at::Tensor> & per_sample_weights, int64_t padding_idx, at::Tensor & out) {
+        return at::_ops::_embedding_bag_dense_backward_out::redispatch(dispatchKeySet, grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx, out);
+    }
+    
+    // aten::_embedding_bag_dense_backward.out(Tensor grad, Tensor indices, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _embedding_bag_dense_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const c10::optional<at::Tensor> & per_sample_weights, int64_t padding_idx=-1) {
+        return at::_ops::_embedding_bag_dense_backward_out::redispatch(dispatchKeySet, grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx, out);
+    }
+    
+    // aten::_embedding_bag_dense_backward.out(Tensor grad, Tensor indices, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _embedding_bag_dense_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const c10::optional<at::Tensor> & per_sample_weights, int64_t padding_idx, at::Tensor & out) {
+        return at::_ops::_embedding_bag_dense_backward_out::redispatch(dispatchKeySet, grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx, out);
+    }
+    
+    // aten::_embedding_bag_per_sample_weights_backward.out(Tensor grad, Tensor weight, Tensor indices, Tensor offsets, Tensor offset2bag, int mode, int padding_idx=-1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _embedding_bag_per_sample_weights_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad, const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, int64_t mode, int64_t padding_idx=-1) {
+        return at::_ops::_embedding_bag_per_sample_weights_backward_out::redispatch(dispatchKeySet, grad, weight, indices, offsets, offset2bag, mode, padding_idx, out);
+    }
+    
+    // aten::_embedding_bag_per_sample_weights_backward.out(Tensor grad, Tensor weight, Tensor indices, Tensor offsets, Tensor offset2bag, int mode, int padding_idx=-1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _embedding_bag_per_sample_weights_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, int64_t mode, int64_t padding_idx, at::Tensor & out) {
+        return at::_ops::_embedding_bag_per_sample_weights_backward_out::redispatch(dispatchKeySet, grad, weight, indices, offsets, offset2bag, mode, padding_idx, out);
+    }
+    
+    // aten::empty.names_out(int[] size, *, Dimname[]? names, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & empty_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::IntArrayRef size, c10::optional<at::DimnameList> names, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::empty_names_out::redispatch(dispatchKeySet, size, names, memory_format, out);
+    }
+    
+    // aten::empty.names_out(int[] size, *, Dimname[]? names, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & empty_outf(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::DimnameList> names, c10::optional<at::MemoryFormat> memory_format, at::Tensor & out) {
+        return at::_ops::empty_names_out::redispatch(dispatchKeySet, size, names, memory_format, out);
+    }
+    
+    // aten::empty_permuted.out(SymInt[] size, int[] physical_layout, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & empty_permuted_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::IntArrayRef size, at::IntArrayRef physical_layout) {
+        return at::_ops::empty_permuted_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), physical_layout, out);
+    }
+    
+    // aten::empty_permuted.out(SymInt[] size, int[] physical_layout, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & empty_permuted_outf(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, at::IntArrayRef physical_layout, at::Tensor & out) {
+        return at::_ops::empty_permuted_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), physical_layout, out);
+    }
+    
+    // aten::empty_permuted.out(SymInt[] size, int[] physical_layout, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & empty_permuted_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, c10::SymIntArrayRef size, at::IntArrayRef physical_layout) {
+        return at::_ops::empty_permuted_out::redispatch(dispatchKeySet, size, physical_layout, out);
+    }
+    
+    // aten::empty_permuted.out(SymInt[] size, int[] physical_layout, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & empty_permuted_symint_outf(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, at::IntArrayRef physical_layout, at::Tensor & out) {
+        return at::_ops::empty_permuted_out::redispatch(dispatchKeySet, size, physical_layout, out);
+    }
+    
+    // aten::new_empty.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & new_empty_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef size) {
+        return at::_ops::new_empty_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), out);
+    }
+    
+    // aten::new_empty.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & new_empty_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, at::Tensor & out) {
+        return at::_ops::new_empty_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), out);
+    }
+    
+    // aten::new_empty.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & new_empty_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef size) {
+        return at::_ops::new_empty_out::redispatch(dispatchKeySet, self, size, out);
+    }
+    
+    // aten::new_empty.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & new_empty_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, at::Tensor & out) {
+        return at::_ops::new_empty_out::redispatch(dispatchKeySet, self, size, out);
+    }
+    
+    // aten::new_empty_strided.out(Tensor self, SymInt[] size, SymInt[] stride, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & new_empty_strided_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride) {
+        return at::_ops::new_empty_strided_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), out);
+    }
+    
+    // aten::new_empty_strided.out(Tensor self, SymInt[] size, SymInt[] stride, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & new_empty_strided_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride, at::Tensor & out) {
+        return at::_ops::new_empty_strided_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), out);
+    }
+    
+    // aten::new_empty_strided.out(Tensor self, SymInt[] size, SymInt[] stride, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & new_empty_strided_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride) {
+        return at::_ops::new_empty_strided_out::redispatch(dispatchKeySet, self, size, stride, out);
+    }
+    
+    // aten::new_empty_strided.out(Tensor self, SymInt[] size, SymInt[] stride, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & new_empty_strided_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, at::Tensor & out) {
+        return at::_ops::new_empty_strided_out::redispatch(dispatchKeySet, self, size, stride, out);
+    }
+    
+    // aten::new_full.out(Tensor self, SymInt[] size, Scalar fill_value, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & new_full_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef size, const at::Scalar & fill_value) {
+        return at::_ops::new_full_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), fill_value, out);
+    }
+    
+    // aten::new_full.out(Tensor self, SymInt[] size, Scalar fill_value, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & new_full_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, const at::Scalar & fill_value, at::Tensor & out) {
+        return at::_ops::new_full_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), fill_value, out);
+    }
+    
+    // aten::new_full.out(Tensor self, SymInt[] size, Scalar fill_value, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & new_full_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef size, const at::Scalar & fill_value) {
+        return at::_ops::new_full_out::redispatch(dispatchKeySet, self, size, fill_value, out);
+    }
+    
+    // aten::new_full.out(Tensor self, SymInt[] size, Scalar fill_value, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & new_full_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, const at::Scalar & fill_value, at::Tensor & out) {
+        return at::_ops::new_full_out::redispatch(dispatchKeySet, self, size, fill_value, out);
+    }
+    
+    // aten::new_zeros.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & new_zeros_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef size) {
+        return at::_ops::new_zeros_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), out);
+    }
+    
+    // aten::new_zeros.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & new_zeros_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, at::Tensor & out) {
+        return at::_ops::new_zeros_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), out);
+    }
+    
+    // aten::new_zeros.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & new_zeros_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef size) {
+        return at::_ops::new_zeros_out::redispatch(dispatchKeySet, self, size, out);
+    }
+    
+    // aten::new_zeros.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & new_zeros_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, at::Tensor & out) {
+        return at::_ops::new_zeros_out::redispatch(dispatchKeySet, self, size, out);
+    }
+    
+    // aten::new_ones.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & new_ones_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef size) {
+        return at::_ops::new_ones_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), out);
+    }
+    
+    // aten::new_ones.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & new_ones_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, at::Tensor & out) {
+        return at::_ops::new_ones_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), out);
+    }
+    
+    // aten::new_ones.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & new_ones_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef size) {
+        return at::_ops::new_ones_out::redispatch(dispatchKeySet, self, size, out);
+    }
+    
+    // aten::new_ones.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & new_ones_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, at::Tensor & out) {
+        return at::_ops::new_ones_out::redispatch(dispatchKeySet, self, size, out);
+    }
+    
+    // aten::_empty_affine_quantized.out(SymInt[] size, *, float scale=1, int zero_point=0, MemoryFormat? memory_format=contiguous_format, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _empty_affine_quantized_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::IntArrayRef size, double scale=1, int64_t zero_point=0, c10::optional<at::MemoryFormat> memory_format=MemoryFormat::Contiguous) {
+        return at::_ops::_empty_affine_quantized_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), scale, zero_point, memory_format, out);
+    }
+    
+    // aten::_empty_affine_quantized.out(SymInt[] size, *, float scale=1, int zero_point=0, MemoryFormat? memory_format=contiguous_format, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _empty_affine_quantized_outf(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, double scale, int64_t zero_point, c10::optional<at::MemoryFormat> memory_format, at::Tensor & out) {
+        return at::_ops::_empty_affine_quantized_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), scale, zero_point, memory_format, out);
+    }
+    
+    // aten::_empty_affine_quantized.out(SymInt[] size, *, float scale=1, int zero_point=0, MemoryFormat? memory_format=contiguous_format, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _empty_affine_quantized_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, c10::SymIntArrayRef size, double scale=1, int64_t zero_point=0, c10::optional<at::MemoryFormat> memory_format=MemoryFormat::Contiguous) {
+        return at::_ops::_empty_affine_quantized_out::redispatch(dispatchKeySet, size, scale, zero_point, memory_format, out);
+    }
+    
+    // aten::_empty_affine_quantized.out(SymInt[] size, *, float scale=1, int zero_point=0, MemoryFormat? memory_format=contiguous_format, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _empty_affine_quantized_symint_outf(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, double scale, int64_t zero_point, c10::optional<at::MemoryFormat> memory_format, at::Tensor & out) {
+        return at::_ops::_empty_affine_quantized_out::redispatch(dispatchKeySet, size, scale, zero_point, memory_format, out);
+    }
+    
+    // aten::_empty_per_channel_affine_quantized.out(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, MemoryFormat? memory_format=contiguous_format, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _empty_per_channel_affine_quantized_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::IntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, c10::optional<at::MemoryFormat> memory_format=MemoryFormat::Contiguous) {
+        return at::_ops::_empty_per_channel_affine_quantized_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), scales, zero_points, axis, memory_format, out);
+    }
+    
+    // aten::_empty_per_channel_affine_quantized.out(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, MemoryFormat? memory_format=contiguous_format, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _empty_per_channel_affine_quantized_outf(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, c10::optional<at::MemoryFormat> memory_format, at::Tensor & out) {
+        return at::_ops::_empty_per_channel_affine_quantized_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), scales, zero_points, axis, memory_format, out);
+    }
+    
+    // aten::_empty_per_channel_affine_quantized.out(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, MemoryFormat? memory_format=contiguous_format, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _empty_per_channel_affine_quantized_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, c10::SymIntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, c10::optional<at::MemoryFormat> memory_format=MemoryFormat::Contiguous) {
+        return at::_ops::_empty_per_channel_affine_quantized_out::redispatch(dispatchKeySet, size, scales, zero_points, axis, memory_format, out);
+    }
+    
+    // aten::_empty_per_channel_affine_quantized.out(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, MemoryFormat? memory_format=contiguous_format, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _empty_per_channel_affine_quantized_symint_outf(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, c10::optional<at::MemoryFormat> memory_format, at::Tensor & out) {
+        return at::_ops::_empty_per_channel_affine_quantized_out::redispatch(dispatchKeySet, size, scales, zero_points, axis, memory_format, out);
+    }
+    
+    // aten::resize.out(Tensor self, SymInt[] size, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & resize_out(c10::DispatchKeySet dispatchKeySet, const at::Tensor & out, const at::Tensor & self, at::IntArrayRef size, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::resize_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), memory_format, out);
+    }
+    
+    // aten::resize.out(Tensor self, SymInt[] size, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & resize_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, c10::optional<at::MemoryFormat> memory_format, const at::Tensor & out) {
+        return at::_ops::resize_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), memory_format, out);
+    }
+    
+    // aten::resize.out(Tensor self, SymInt[] size, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & resize_symint_out(c10::DispatchKeySet dispatchKeySet, const at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef size, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::resize_out::redispatch(dispatchKeySet, self, size, memory_format, out);
+    }
+    
+    // aten::resize.out(Tensor self, SymInt[] size, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & resize_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, c10::optional<at::MemoryFormat> memory_format, const at::Tensor & out) {
+        return at::_ops::resize_out::redispatch(dispatchKeySet, self, size, memory_format, out);
+    }
+    
+    // aten::resize(Tensor self, SymInt[] size, *, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor resize(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::resize::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), memory_format);
+    }
+    
+    // aten::resize(Tensor self, SymInt[] size, *, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor resize_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::resize::redispatch(dispatchKeySet, self, size, memory_format);
+    }
+    
+    // aten::_resize_output.out(Tensor self, SymInt[] size, Device device, *, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & _resize_output_out(c10::DispatchKeySet dispatchKeySet, const at::Tensor & out, const at::Tensor & self, at::IntArrayRef size, at::Device device) {
+        return at::_ops::_resize_output_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), device, out);
+    }
+    
+    // aten::_resize_output.out(Tensor self, SymInt[] size, Device device, *, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & _resize_output_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, at::Device device, const at::Tensor & out) {
+        return at::_ops::_resize_output_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), device, out);
+    }
+    
+    // aten::_resize_output.out(Tensor self, SymInt[] size, Device device, *, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & _resize_output_symint_out(c10::DispatchKeySet dispatchKeySet, const at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef size, at::Device device) {
+        return at::_ops::_resize_output_out::redispatch(dispatchKeySet, self, size, device, out);
+    }
+    
+    // aten::_resize_output.out(Tensor self, SymInt[] size, Device device, *, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & _resize_output_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, at::Device device, const at::Tensor & out) {
+        return at::_ops::_resize_output_out::redispatch(dispatchKeySet, self, size, device, out);
+    }
+    
+    // aten::_resize_output(Tensor self, SymInt[] size, Device device) -> Tensor
+    inline at::Tensor _resize_output(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, at::Device device) {
+        return at::_ops::_resize_output::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), device);
+    }
+    
+    // aten::_resize_output(Tensor self, SymInt[] size, Device device) -> Tensor
+    inline at::Tensor _resize_output_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, at::Device device) {
+        return at::_ops::_resize_output::redispatch(dispatchKeySet, self, size, device);
+    }
+    
+    // aten::empty_quantized.out(int[] size, Tensor qtensor, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & empty_quantized_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::IntArrayRef size, const at::Tensor & qtensor, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::empty_quantized_out::redispatch(dispatchKeySet, size, qtensor, memory_format, out);
+    }
+    
+    // aten::empty_quantized.out(int[] size, Tensor qtensor, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & empty_quantized_outf(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, const at::Tensor & qtensor, c10::optional<at::MemoryFormat> memory_format, at::Tensor & out) {
+        return at::_ops::empty_quantized_out::redispatch(dispatchKeySet, size, qtensor, memory_format, out);
+    }
+    
+    // aten::empty_like.out(Tensor self, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & empty_like_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::empty_like_out::redispatch(dispatchKeySet, self, memory_format, out);
+    }
+    
+    // aten::empty_like.out(Tensor self, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & empty_like_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::MemoryFormat> memory_format, at::Tensor & out) {
+        return at::_ops::empty_like_out::redispatch(dispatchKeySet, self, memory_format, out);
+    }
+    
+    // aten::empty_strided.out(SymInt[] size, SymInt[] stride, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & empty_strided_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::IntArrayRef size, at::IntArrayRef stride) {
+        return at::_ops::empty_strided_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), out);
+    }
+    
+    // aten::empty_strided.out(SymInt[] size, SymInt[] stride, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & empty_strided_outf(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, at::IntArrayRef stride, at::Tensor & out) {
+        return at::_ops::empty_strided_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), out);
+    }
+    
+    // aten::empty_strided.out(SymInt[] size, SymInt[] stride, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & empty_strided_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, c10::SymIntArrayRef size, c10::SymIntArrayRef stride) {
+        return at::_ops::empty_strided_out::redispatch(dispatchKeySet, size, stride, out);
+    }
+    
+    // aten::empty_strided.out(SymInt[] size, SymInt[] stride, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & empty_strided_symint_outf(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, at::Tensor & out) {
+        return at::_ops::empty_strided_out::redispatch(dispatchKeySet, size, stride, out);
+    }
+    
+    // aten::fill.Scalar_out(Tensor self, Scalar value, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fill_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & value) {
+        return at::_ops::fill_Scalar_out::redispatch(dispatchKeySet, self, value, out);
+    }
+    
+    // aten::fill.Scalar_out(Tensor self, Scalar value, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fill_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & value, at::Tensor & out) {
+        return at::_ops::fill_Scalar_out::redispatch(dispatchKeySet, self, value, out);
+    }
+    
+    // aten::fill.Tensor_out(Tensor self, Tensor value, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fill_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & value) {
+        return at::_ops::fill_Tensor_out::redispatch(dispatchKeySet, self, value, out);
+    }
+    
+    // aten::fill.Tensor_out(Tensor self, Tensor value, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & fill_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & value, at::Tensor & out) {
+        return at::_ops::fill_Tensor_out::redispatch(dispatchKeySet, self, value, out);
+    }
+    
+    // aten::floor_divide.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & floor_divide_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::floor_divide_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::floor_divide.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & floor_divide_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out) {
+        return at::_ops::floor_divide_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::full.names_out(int[] size, Scalar fill_value, *, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & full_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::IntArrayRef size, const at::Scalar & fill_value, c10::optional<at::DimnameList> names) {
+        return at::_ops::full_names_out::redispatch(dispatchKeySet, size, fill_value, names, out);
+    }
+    
+    // aten::full.names_out(int[] size, Scalar fill_value, *, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & full_outf(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, const at::Scalar & fill_value, c10::optional<at::DimnameList> names, at::Tensor & out) {
+        return at::_ops::full_names_out::redispatch(dispatchKeySet, size, fill_value, names, out);
+    }
+    
+    // aten::full_like.out(Tensor self, Scalar fill_value, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & full_like_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & fill_value, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::full_like_out::redispatch(dispatchKeySet, self, fill_value, memory_format, out);
+    }
+    
+    // aten::full_like.out(Tensor self, Scalar fill_value, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & full_like_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & fill_value, c10::optional<at::MemoryFormat> memory_format, at::Tensor & out) {
+        return at::_ops::full_like_out::redispatch(dispatchKeySet, self, fill_value, memory_format, out);
+    }
+    
+    // aten::from_file.out(str filename, bool? shared=None, int? size=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & from_file_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, c10::string_view filename, c10::optional<bool> shared=c10::nullopt, c10::optional<int64_t> size=0) {
+        return at::_ops::from_file_out::redispatch(dispatchKeySet, filename, shared, size, out);
+    }
+    
+    // aten::from_file.out(str filename, bool? shared=None, int? size=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & from_file_outf(c10::DispatchKeySet dispatchKeySet, c10::string_view filename, c10::optional<bool> shared, c10::optional<int64_t> size, at::Tensor & out) {
+        return at::_ops::from_file_out::redispatch(dispatchKeySet, filename, shared, size, out);
+    }
+    
+    // aten::grid_sampler_2d.out(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & grid_sampler_2d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners) {
+        return at::_ops::grid_sampler_2d_out::redispatch(dispatchKeySet, input, grid, interpolation_mode, padding_mode, align_corners, out);
+    }
+    
+    // aten::grid_sampler_2d.out(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & grid_sampler_2d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, at::Tensor & out) {
+        return at::_ops::grid_sampler_2d_out::redispatch(dispatchKeySet, input, grid, interpolation_mode, padding_mode, align_corners, out);
+    }
+    
+    // aten::grid_sampler_2d_backward.out(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, bool[2] output_mask, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> grid_sampler_2d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, ::std::array<bool,2> output_mask) {
+        return at::_ops::grid_sampler_2d_backward_out::redispatch(dispatchKeySet, grad_output, input, grid, interpolation_mode, padding_mode, align_corners, output_mask, out0, out1);
+    }
+    
+    // aten::grid_sampler_2d_backward.out(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, bool[2] output_mask, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> grid_sampler_2d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, ::std::array<bool,2> output_mask, at::Tensor & out0, at::Tensor & out1) {
+        return at::_ops::grid_sampler_2d_backward_out::redispatch(dispatchKeySet, grad_output, input, grid, interpolation_mode, padding_mode, align_corners, output_mask, out0, out1);
+    }
+    
+    // aten::_grid_sampler_2d_cpu_fallback.out(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _grid_sampler_2d_cpu_fallback_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners) {
+        return at::_ops::_grid_sampler_2d_cpu_fallback_out::redispatch(dispatchKeySet, input, grid, interpolation_mode, padding_mode, align_corners, out);
+    }
+    
+    // aten::_grid_sampler_2d_cpu_fallback.out(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _grid_sampler_2d_cpu_fallback_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, at::Tensor & out) {
+        return at::_ops::_grid_sampler_2d_cpu_fallback_out::redispatch(dispatchKeySet, input, grid, interpolation_mode, padding_mode, align_corners, out);
+    }
+    
+    // aten::grid_sampler_3d.out(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & grid_sampler_3d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners) {
+        return at::_ops::grid_sampler_3d_out::redispatch(dispatchKeySet, input, grid, interpolation_mode, padding_mode, align_corners, out);
+    }
+    
+    // aten::grid_sampler_3d.out(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & grid_sampler_3d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, at::Tensor & out) {
+        return at::_ops::grid_sampler_3d_out::redispatch(dispatchKeySet, input, grid, interpolation_mode, padding_mode, align_corners, out);
+    }
+    
+    // aten::grid_sampler_3d_backward.out(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, bool[2] output_mask, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> grid_sampler_3d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, ::std::array<bool,2> output_mask) {
+        return at::_ops::grid_sampler_3d_backward_out::redispatch(dispatchKeySet, grad_output, input, grid, interpolation_mode, padding_mode, align_corners, output_mask, out0, out1);
+    }
+    
+    // aten::grid_sampler_3d_backward.out(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, bool[2] output_mask, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> grid_sampler_3d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, ::std::array<bool,2> output_mask, at::Tensor & out0, at::Tensor & out1) {
+        return at::_ops::grid_sampler_3d_backward_out::redispatch(dispatchKeySet, grad_output, input, grid, interpolation_mode, padding_mode, align_corners, output_mask, out0, out1);
+    }
+    
+    // aten::hann_window.out(int window_length, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & hann_window_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, int64_t window_length) {
+        return at::_ops::hann_window_out::redispatch(dispatchKeySet, window_length, out);
+    }
+    
+    // aten::hann_window.out(int window_length, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & hann_window_outf(c10::DispatchKeySet dispatchKeySet, int64_t window_length, at::Tensor & out) {
+        return at::_ops::hann_window_out::redispatch(dispatchKeySet, window_length, out);
+    }
+    
+    // aten::hann_window.periodic_out(int window_length, bool periodic, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & hann_window_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, int64_t window_length, bool periodic) {
+        return at::_ops::hann_window_periodic_out::redispatch(dispatchKeySet, window_length, periodic, out);
+    }
+    
+    // aten::hann_window.periodic_out(int window_length, bool periodic, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & hann_window_outf(c10::DispatchKeySet dispatchKeySet, int64_t window_length, bool periodic, at::Tensor & out) {
+        return at::_ops::hann_window_periodic_out::redispatch(dispatchKeySet, window_length, periodic, out);
+    }
+    
+    // aten::hamming_window.out(int window_length, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & hamming_window_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, int64_t window_length) {
+        return at::_ops::hamming_window_out::redispatch(dispatchKeySet, window_length, out);
+    }
+    
+    // aten::hamming_window.out(int window_length, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & hamming_window_outf(c10::DispatchKeySet dispatchKeySet, int64_t window_length, at::Tensor & out) {
+        return at::_ops::hamming_window_out::redispatch(dispatchKeySet, window_length, out);
+    }
+    
+    // aten::hamming_window.periodic_out(int window_length, bool periodic, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & hamming_window_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, int64_t window_length, bool periodic) {
+        return at::_ops::hamming_window_periodic_out::redispatch(dispatchKeySet, window_length, periodic, out);
+    }
+    
+    // aten::hamming_window.periodic_out(int window_length, bool periodic, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & hamming_window_outf(c10::DispatchKeySet dispatchKeySet, int64_t window_length, bool periodic, at::Tensor & out) {
+        return at::_ops::hamming_window_periodic_out::redispatch(dispatchKeySet, window_length, periodic, out);
+    }
+    
+    // aten::hamming_window.periodic_alpha_out(int window_length, bool periodic, float alpha, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & hamming_window_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, int64_t window_length, bool periodic, double alpha) {
+        return at::_ops::hamming_window_periodic_alpha_out::redispatch(dispatchKeySet, window_length, periodic, alpha, out);
+    }
+    
+    // aten::hamming_window.periodic_alpha_out(int window_length, bool periodic, float alpha, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & hamming_window_outf(c10::DispatchKeySet dispatchKeySet, int64_t window_length, bool periodic, double alpha, at::Tensor & out) {
+        return at::_ops::hamming_window_periodic_alpha_out::redispatch(dispatchKeySet, window_length, periodic, alpha, out);
+    }
+    
+    // aten::hamming_window.periodic_alpha_beta_out(int window_length, bool periodic, float alpha, float beta, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & hamming_window_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, int64_t window_length, bool periodic, double alpha, double beta) {
+        return at::_ops::hamming_window_periodic_alpha_beta_out::redispatch(dispatchKeySet, window_length, periodic, alpha, beta, out);
+    }
+    
+    // aten::hamming_window.periodic_alpha_beta_out(int window_length, bool periodic, float alpha, float beta, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & hamming_window_outf(c10::DispatchKeySet dispatchKeySet, int64_t window_length, bool periodic, double alpha, double beta, at::Tensor & out) {
+        return at::_ops::hamming_window_periodic_alpha_beta_out::redispatch(dispatchKeySet, window_length, periodic, alpha, beta, out);
+    }
+    
+    // aten::kaiser_window.out(int window_length, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & kaiser_window_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, int64_t window_length) {
+        return at::_ops::kaiser_window_out::redispatch(dispatchKeySet, window_length, out);
+    }
+    
+    // aten::kaiser_window.out(int window_length, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & kaiser_window_outf(c10::DispatchKeySet dispatchKeySet, int64_t window_length, at::Tensor & out) {
+        return at::_ops::kaiser_window_out::redispatch(dispatchKeySet, window_length, out);
+    }
+    
+    // aten::kaiser_window.periodic_out(int window_length, bool periodic, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & kaiser_window_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, int64_t window_length, bool periodic) {
+        return at::_ops::kaiser_window_periodic_out::redispatch(dispatchKeySet, window_length, periodic, out);
+    }
+    
+    // aten::kaiser_window.periodic_out(int window_length, bool periodic, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & kaiser_window_outf(c10::DispatchKeySet dispatchKeySet, int64_t window_length, bool periodic, at::Tensor & out) {
+        return at::_ops::kaiser_window_periodic_out::redispatch(dispatchKeySet, window_length, periodic, out);
+    }
+    
+    // aten::kaiser_window.beta_out(int window_length, bool periodic, float beta, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & kaiser_window_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, int64_t window_length, bool periodic, double beta) {
+        return at::_ops::kaiser_window_beta_out::redispatch(dispatchKeySet, window_length, periodic, beta, out);
+    }
+    
+    // aten::kaiser_window.beta_out(int window_length, bool periodic, float beta, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & kaiser_window_outf(c10::DispatchKeySet dispatchKeySet, int64_t window_length, bool periodic, double beta, at::Tensor & out) {
+        return at::_ops::kaiser_window_beta_out::redispatch(dispatchKeySet, window_length, periodic, beta, out);
+    }
+    
+    // aten::native_group_norm.out(Tensor input, Tensor? weight, Tensor? bias, SymInt N, SymInt C, SymInt HxW, int group, float eps, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_group_norm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, int64_t N, int64_t C, int64_t HxW, int64_t group, double eps) {
+        return at::_ops::native_group_norm_out::redispatch(dispatchKeySet, input, weight, bias, N, C, HxW, group, eps, out0, out1, out2);
+    }
+    
+    // aten::native_group_norm.out(Tensor input, Tensor? weight, Tensor? bias, SymInt N, SymInt C, SymInt HxW, int group, float eps, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_group_norm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, int64_t N, int64_t C, int64_t HxW, int64_t group, double eps, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+        return at::_ops::native_group_norm_out::redispatch(dispatchKeySet, input, weight, bias, N, C, HxW, group, eps, out0, out1, out2);
+    }
+    
+    // aten::native_group_norm.out(Tensor input, Tensor? weight, Tensor? bias, SymInt N, SymInt C, SymInt HxW, int group, float eps, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_group_norm_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, c10::SymInt N, c10::SymInt C, c10::SymInt HxW, int64_t group, double eps) {
+        return at::_ops::native_group_norm_out::redispatch(dispatchKeySet, input, weight, bias, N, C, HxW, group, eps, out0, out1, out2);
+    }
+    
+    // aten::native_group_norm.out(Tensor input, Tensor? weight, Tensor? bias, SymInt N, SymInt C, SymInt HxW, int group, float eps, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_group_norm_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, c10::SymInt N, c10::SymInt C, c10::SymInt HxW, int64_t group, double eps, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+        return at::_ops::native_group_norm_out::redispatch(dispatchKeySet, input, weight, bias, N, C, HxW, group, eps, out0, out1, out2);
+    }
+    
+    // aten::native_group_norm_backward.out(Tensor grad_out, Tensor input, Tensor mean, Tensor rstd, Tensor? weight, SymInt N, SymInt C, SymInt HxW, int group, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_group_norm_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, int64_t N, int64_t C, int64_t HxW, int64_t group, ::std::array<bool,3> output_mask) {
+        return at::_ops::native_group_norm_backward_out::redispatch(dispatchKeySet, grad_out, input, mean, rstd, weight, N, C, HxW, group, output_mask, out0, out1, out2);
+    }
+    
+    // aten::native_group_norm_backward.out(Tensor grad_out, Tensor input, Tensor mean, Tensor rstd, Tensor? weight, SymInt N, SymInt C, SymInt HxW, int group, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_group_norm_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, int64_t N, int64_t C, int64_t HxW, int64_t group, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+        return at::_ops::native_group_norm_backward_out::redispatch(dispatchKeySet, grad_out, input, mean, rstd, weight, N, C, HxW, group, output_mask, out0, out1, out2);
+    }
+    
+    // aten::native_group_norm_backward.out(Tensor grad_out, Tensor input, Tensor mean, Tensor rstd, Tensor? weight, SymInt N, SymInt C, SymInt HxW, int group, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_group_norm_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, c10::SymInt N, c10::SymInt C, c10::SymInt HxW, int64_t group, ::std::array<bool,3> output_mask) {
+        return at::_ops::native_group_norm_backward_out::redispatch(dispatchKeySet, grad_out, input, mean, rstd, weight, N, C, HxW, group, output_mask, out0, out1, out2);
+    }
+    
+    // aten::native_group_norm_backward.out(Tensor grad_out, Tensor input, Tensor mean, Tensor rstd, Tensor? weight, SymInt N, SymInt C, SymInt HxW, int group, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_group_norm_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, c10::SymInt N, c10::SymInt C, c10::SymInt HxW, int64_t group, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+        return at::_ops::native_group_norm_backward_out::redispatch(dispatchKeySet, grad_out, input, mean, rstd, weight, N, C, HxW, group, output_mask, out0, out1, out2);
+    }
+    
+    // aten::index_put.out(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & index_put_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const c10::List<c10::optional<at::Tensor>> & indices, const at::Tensor & values, bool accumulate=false) {
+        return at::_ops::index_put_out::redispatch(dispatchKeySet, self, indices, values, accumulate, out);
+    }
+    
+    // aten::index_put.out(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & index_put_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::List<c10::optional<at::Tensor>> & indices, const at::Tensor & values, bool accumulate, at::Tensor & out) {
+        return at::_ops::index_put_out::redispatch(dispatchKeySet, self, indices, values, accumulate, out);
+    }
+    
+    // aten::_index_put_impl.out(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False, bool unsafe=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _index_put_impl_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const c10::List<c10::optional<at::Tensor>> & indices, const at::Tensor & values, bool accumulate=false, bool unsafe=false) {
+        return at::_ops::_index_put_impl_out::redispatch(dispatchKeySet, self, indices, values, accumulate, unsafe, out);
+    }
+    
+    // aten::_index_put_impl.out(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False, bool unsafe=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _index_put_impl_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::List<c10::optional<at::Tensor>> & indices, const at::Tensor & values, bool accumulate, bool unsafe, at::Tensor & out) {
+        return at::_ops::_index_put_impl_out::redispatch(dispatchKeySet, self, indices, values, accumulate, unsafe, out);
+    }
+    
+    // aten::_index_put_impl(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor
+    inline at::Tensor _index_put_impl(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::List<c10::optional<at::Tensor>> & indices, const at::Tensor & values, bool accumulate=false, bool unsafe=false) {
+        return at::_ops::_index_put_impl::redispatch(dispatchKeySet, self, indices, values, accumulate, unsafe);
+    }
+    
+    // aten::isnan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & isnan_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::isnan_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::isnan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & isnan_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::isnan_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::native_layer_norm.out(Tensor input, SymInt[] normalized_shape, Tensor? weight, Tensor? bias, float eps, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_layer_norm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & input, at::IntArrayRef normalized_shape, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, double eps) {
+        return at::_ops::native_layer_norm_out::redispatch(dispatchKeySet, input, c10::fromIntArrayRefSlow(normalized_shape), weight, bias, eps, out0, out1, out2);
+    }
+    
+    // aten::native_layer_norm.out(Tensor input, SymInt[] normalized_shape, Tensor? weight, Tensor? bias, float eps, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_layer_norm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::IntArrayRef normalized_shape, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, double eps, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+        return at::_ops::native_layer_norm_out::redispatch(dispatchKeySet, input, c10::fromIntArrayRefSlow(normalized_shape), weight, bias, eps, out0, out1, out2);
+    }
+    
+    // aten::native_layer_norm.out(Tensor input, SymInt[] normalized_shape, Tensor? weight, Tensor? bias, float eps, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_layer_norm_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, double eps) {
+        return at::_ops::native_layer_norm_out::redispatch(dispatchKeySet, input, normalized_shape, weight, bias, eps, out0, out1, out2);
+    }
+    
+    // aten::native_layer_norm.out(Tensor input, SymInt[] normalized_shape, Tensor? weight, Tensor? bias, float eps, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_layer_norm_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, double eps, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+        return at::_ops::native_layer_norm_out::redispatch(dispatchKeySet, input, normalized_shape, weight, bias, eps, out0, out1, out2);
+    }
+    
+    // aten::native_layer_norm_backward.out(Tensor grad_out, Tensor input, SymInt[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_layer_norm_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & grad_out, const at::Tensor & input, at::IntArrayRef normalized_shape, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, ::std::array<bool,3> output_mask) {
+        return at::_ops::native_layer_norm_backward_out::redispatch(dispatchKeySet, grad_out, input, c10::fromIntArrayRefSlow(normalized_shape), mean, rstd, weight, bias, output_mask, out0, out1, out2);
+    }
+    
+    // aten::native_layer_norm_backward.out(Tensor grad_out, Tensor input, SymInt[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_layer_norm_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_out, const at::Tensor & input, at::IntArrayRef normalized_shape, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+        return at::_ops::native_layer_norm_backward_out::redispatch(dispatchKeySet, grad_out, input, c10::fromIntArrayRefSlow(normalized_shape), mean, rstd, weight, bias, output_mask, out0, out1, out2);
+    }
+    
+    // aten::native_layer_norm_backward.out(Tensor grad_out, Tensor input, SymInt[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_layer_norm_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & grad_out, const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, ::std::array<bool,3> output_mask) {
+        return at::_ops::native_layer_norm_backward_out::redispatch(dispatchKeySet, grad_out, input, normalized_shape, mean, rstd, weight, bias, output_mask, out0, out1, out2);
+    }
+    
+    // aten::native_layer_norm_backward.out(Tensor grad_out, Tensor input, SymInt[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_layer_norm_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_out, const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+        return at::_ops::native_layer_norm_backward_out::redispatch(dispatchKeySet, grad_out, input, normalized_shape, mean, rstd, weight, bias, output_mask, out0, out1, out2);
+    }
+    
+    // aten::linear_backward.out(Tensor self, Tensor grad_output, Tensor weight, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> linear_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, ::std::array<bool,3> output_mask) {
+        return at::_ops::linear_backward_out::redispatch(dispatchKeySet, self, grad_output, weight, output_mask, out0, out1, out2);
+    }
+    
+    // aten::linear_backward.out(Tensor self, Tensor grad_output, Tensor weight, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> linear_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+        return at::_ops::linear_backward_out::redispatch(dispatchKeySet, self, grad_output, weight, output_mask, out0, out1, out2);
+    }
+    
+    // aten::mkldnn_linear.out(Tensor self, Tensor weight, Tensor? bias=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mkldnn_linear_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias={}) {
+        return at::_ops::mkldnn_linear_out::redispatch(dispatchKeySet, self, weight, bias, out);
+    }
+    
+    // aten::mkldnn_linear.out(Tensor self, Tensor weight, Tensor? bias=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mkldnn_linear_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::Tensor & out) {
+        return at::_ops::mkldnn_linear_out::redispatch(dispatchKeySet, self, weight, bias, out);
+    }
+    
+    // aten::mkldnn_linear_backward_input.out(int[] input_size, Tensor grad_output, Tensor weight, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mkldnn_linear_backward_input_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::IntArrayRef input_size, const at::Tensor & grad_output, const at::Tensor & weight) {
+        return at::_ops::mkldnn_linear_backward_input_out::redispatch(dispatchKeySet, input_size, grad_output, weight, out);
+    }
+    
+    // aten::mkldnn_linear_backward_input.out(int[] input_size, Tensor grad_output, Tensor weight, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mkldnn_linear_backward_input_outf(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef input_size, const at::Tensor & grad_output, const at::Tensor & weight, at::Tensor & out) {
+        return at::_ops::mkldnn_linear_backward_input_out::redispatch(dispatchKeySet, input_size, grad_output, weight, out);
+    }
+    
+    // aten::mkldnn_linear_backward_weights.out(Tensor grad_output, Tensor input, Tensor weight, bool bias_defined, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> mkldnn_linear_backward_weights_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, bool bias_defined) {
+        return at::_ops::mkldnn_linear_backward_weights_out::redispatch(dispatchKeySet, grad_output, input, weight, bias_defined, out0, out1);
+    }
+    
+    // aten::mkldnn_linear_backward_weights.out(Tensor grad_output, Tensor input, Tensor weight, bool bias_defined, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> mkldnn_linear_backward_weights_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, bool bias_defined, at::Tensor & out0, at::Tensor & out1) {
+        return at::_ops::mkldnn_linear_backward_weights_out::redispatch(dispatchKeySet, grad_output, input, weight, bias_defined, out0, out1);
+    }
+    
+    // aten::mkldnn_linear_backward.out(Tensor self, Tensor grad_output, Tensor weight, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> mkldnn_linear_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, ::std::array<bool,3> output_mask) {
+        return at::_ops::mkldnn_linear_backward_out::redispatch(dispatchKeySet, self, grad_output, weight, output_mask, out0, out1, out2);
+    }
+    
+    // aten::mkldnn_linear_backward.out(Tensor self, Tensor grad_output, Tensor weight, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> mkldnn_linear_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+        return at::_ops::mkldnn_linear_backward_out::redispatch(dispatchKeySet, self, grad_output, weight, output_mask, out0, out1, out2);
+    }
+    
+    // aten::matmul_backward.out(Tensor grad, Tensor self, Tensor other, bool[2] mask, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> matmul_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, const at::Tensor & grad, const at::Tensor & self, const at::Tensor & other, ::std::array<bool,2> mask) {
+        return at::_ops::matmul_backward_out::redispatch(dispatchKeySet, grad, self, other, mask, out0, out1);
+    }
+    
+    // aten::matmul_backward.out(Tensor grad, Tensor self, Tensor other, bool[2] mask, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> matmul_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & self, const at::Tensor & other, ::std::array<bool,2> mask, at::Tensor & out0, at::Tensor & out1) {
+        return at::_ops::matmul_backward_out::redispatch(dispatchKeySet, grad, self, other, mask, out0, out1);
+    }
+    
+    // aten::_aminmax.out(Tensor self, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> _aminmax_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, const at::Tensor & self) {
+        return at::_ops::_aminmax_out::redispatch(dispatchKeySet, self, out0, out1);
+    }
+    
+    // aten::_aminmax.out(Tensor self, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> _aminmax_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out0, at::Tensor & out1) {
+        return at::_ops::_aminmax_out::redispatch(dispatchKeySet, self, out0, out1);
+    }
+    
+    // aten::_aminmax.dim_out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> _aminmax_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, const at::Tensor & self, int64_t dim, bool keepdim=false) {
+        return at::_ops::_aminmax_dim_out::redispatch(dispatchKeySet, self, dim, keepdim, out0, out1);
+    }
+    
+    // aten::_aminmax.dim_out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> _aminmax_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool keepdim, at::Tensor & out0, at::Tensor & out1) {
+        return at::_ops::_aminmax_dim_out::redispatch(dispatchKeySet, self, dim, keepdim, out0, out1);
+    }
+    
+    // aten::max_pool2d_backward.out(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & max_pool2d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) {
+        return at::_ops::max_pool2d_backward_out::redispatch(dispatchKeySet, grad_output, self, kernel_size, stride, padding, dilation, ceil_mode, out);
+    }
+    
+    // aten::max_pool2d_backward.out(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & max_pool2d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out) {
+        return at::_ops::max_pool2d_backward_out::redispatch(dispatchKeySet, grad_output, self, kernel_size, stride, padding, dilation, ceil_mode, out);
+    }
+    
+    // aten::mkldnn_max_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mkldnn_max_pool2d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) {
+        return at::_ops::mkldnn_max_pool2d_out::redispatch(dispatchKeySet, self, kernel_size, stride, padding, dilation, ceil_mode, out);
+    }
+    
+    // aten::mkldnn_max_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mkldnn_max_pool2d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out) {
+        return at::_ops::mkldnn_max_pool2d_out::redispatch(dispatchKeySet, self, kernel_size, stride, padding, dilation, ceil_mode, out);
+    }
+    
+    // aten::mkldnn_max_pool2d_backward.out(Tensor grad_output, Tensor output, Tensor input, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mkldnn_max_pool2d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & input, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) {
+        return at::_ops::mkldnn_max_pool2d_backward_out::redispatch(dispatchKeySet, grad_output, output, input, kernel_size, stride, padding, dilation, ceil_mode, out);
+    }
+    
+    // aten::mkldnn_max_pool2d_backward.out(Tensor grad_output, Tensor output, Tensor input, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mkldnn_max_pool2d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & input, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out) {
+        return at::_ops::mkldnn_max_pool2d_backward_out::redispatch(dispatchKeySet, grad_output, output, input, kernel_size, stride, padding, dilation, ceil_mode, out);
+    }
+    
+    // aten::mkldnn_max_pool3d.out(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mkldnn_max_pool3d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) {
+        return at::_ops::mkldnn_max_pool3d_out::redispatch(dispatchKeySet, self, kernel_size, stride, padding, dilation, ceil_mode, out);
+    }
+    
+    // aten::mkldnn_max_pool3d.out(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mkldnn_max_pool3d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out) {
+        return at::_ops::mkldnn_max_pool3d_out::redispatch(dispatchKeySet, self, kernel_size, stride, padding, dilation, ceil_mode, out);
+    }
+    
+    // aten::mkldnn_max_pool3d_backward.out(Tensor grad_output, Tensor output, Tensor input, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mkldnn_max_pool3d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & input, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) {
+        return at::_ops::mkldnn_max_pool3d_backward_out::redispatch(dispatchKeySet, grad_output, output, input, kernel_size, stride, padding, dilation, ceil_mode, out);
+    }
+    
+    // aten::mkldnn_max_pool3d_backward.out(Tensor grad_output, Tensor output, Tensor input, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mkldnn_max_pool3d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & input, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out) {
+        return at::_ops::mkldnn_max_pool3d_backward_out::redispatch(dispatchKeySet, grad_output, output, input, kernel_size, stride, padding, dilation, ceil_mode, out);
+    }
+    
+    // aten::quantized_max_pool1d.out(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & quantized_max_pool1d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) {
+        return at::_ops::quantized_max_pool1d_out::redispatch(dispatchKeySet, self, kernel_size, stride, padding, dilation, ceil_mode, out);
+    }
+    
+    // aten::quantized_max_pool1d.out(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & quantized_max_pool1d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out) {
+        return at::_ops::quantized_max_pool1d_out::redispatch(dispatchKeySet, self, kernel_size, stride, padding, dilation, ceil_mode, out);
+    }
+    
+    // aten::quantized_max_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & quantized_max_pool2d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) {
+        return at::_ops::quantized_max_pool2d_out::redispatch(dispatchKeySet, self, kernel_size, stride, padding, dilation, ceil_mode, out);
+    }
+    
+    // aten::quantized_max_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & quantized_max_pool2d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out) {
+        return at::_ops::quantized_max_pool2d_out::redispatch(dispatchKeySet, self, kernel_size, stride, padding, dilation, ceil_mode, out);
+    }
+    
+    // aten::quantized_max_pool3d.out(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & quantized_max_pool3d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) {
+        return at::_ops::quantized_max_pool3d_out::redispatch(dispatchKeySet, self, kernel_size, stride, padding, dilation, ceil_mode, out);
+    }
+    
+    // aten::quantized_max_pool3d.out(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & quantized_max_pool3d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out) {
+        return at::_ops::quantized_max_pool3d_out::redispatch(dispatchKeySet, self, kernel_size, stride, padding, dilation, ceil_mode, out);
+    }
+    
+    // aten::median.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & median_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::median_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::median.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & median_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::median_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::nanmedian.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & nanmedian_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::nanmedian_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::nanmedian.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & nanmedian_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::nanmedian_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_mps_convolution.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _mps_convolution_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups) {
+        return at::_ops::_mps_convolution_out::redispatch(dispatchKeySet, self, weight, bias, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, out);
+    }
+    
+    // aten::_mps_convolution.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _mps_convolution_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, at::Tensor & out) {
+        return at::_ops::_mps_convolution_out::redispatch(dispatchKeySet, self, weight, bias, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, out);
+    }
+    
+    // aten::_mps_convolution.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _mps_convolution_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups) {
+        return at::_ops::_mps_convolution_out::redispatch(dispatchKeySet, self, weight, bias, padding, stride, dilation, groups, out);
+    }
+    
+    // aten::_mps_convolution.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _mps_convolution_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, at::Tensor & out) {
+        return at::_ops::_mps_convolution_out::redispatch(dispatchKeySet, self, weight, bias, padding, stride, dilation, groups, out);
+    }
+    
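+    // Note: for schemas declared with SymInt arguments, the IntArrayRef overloads convert their
+    // integer arrays with c10::fromIntArrayRefSlow before redispatching, whereas the `_symint_`
+    // overloads forward the SymInt arguments unchanged.
+    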
+    // aten::mps_convolution_backward.out(Tensor self, Tensor grad_output, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> mps_convolution_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, ::std::array<bool,3> output_mask) {
+        return at::_ops::mps_convolution_backward_out::redispatch(dispatchKeySet, self, grad_output, weight, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, output_mask, out0, out1, out2);
+    }
+    
+    // aten::mps_convolution_backward.out(Tensor self, Tensor grad_output, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> mps_convolution_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+        return at::_ops::mps_convolution_backward_out::redispatch(dispatchKeySet, self, grad_output, weight, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, output_mask, out0, out1, out2);
+    }
+    
+    // aten::mps_convolution_backward.out(Tensor self, Tensor grad_output, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> mps_convolution_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, ::std::array<bool,3> output_mask) {
+        return at::_ops::mps_convolution_backward_out::redispatch(dispatchKeySet, self, grad_output, weight, padding, stride, dilation, groups, output_mask, out0, out1, out2);
+    }
+    
+    // aten::mps_convolution_backward.out(Tensor self, Tensor grad_output, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> mps_convolution_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+        return at::_ops::mps_convolution_backward_out::redispatch(dispatchKeySet, self, grad_output, weight, padding, stride, dilation, groups, output_mask, out0, out1, out2);
+    }
+    
+    // aten::mkldnn_convolution.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mkldnn_convolution_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups) {
+        return at::_ops::mkldnn_convolution_out::redispatch(dispatchKeySet, self, weight, bias, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, out);
+    }
+    
+    // aten::mkldnn_convolution.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mkldnn_convolution_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, at::Tensor & out) {
+        return at::_ops::mkldnn_convolution_out::redispatch(dispatchKeySet, self, weight, bias, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, out);
+    }
+    
+    // aten::mkldnn_convolution.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mkldnn_convolution_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups) {
+        return at::_ops::mkldnn_convolution_out::redispatch(dispatchKeySet, self, weight, bias, padding, stride, dilation, groups, out);
+    }
+    
+    // aten::mkldnn_convolution.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mkldnn_convolution_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, at::Tensor & out) {
+        return at::_ops::mkldnn_convolution_out::redispatch(dispatchKeySet, self, weight, bias, padding, stride, dilation, groups, out);
+    }
+    
+    // aten::mkldnn_rnn_layer.out(Tensor input, Tensor weight0, Tensor weight1, Tensor weight2, Tensor weight3, Tensor hx_, Tensor cx_, bool reverse, int[] batch_sizes, int mode, int hidden_size, int num_layers, bool has_biases, bool bidirectional, bool batch_first, bool train, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> mkldnn_rnn_layer_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, const at::Tensor & input, const at::Tensor & weight0, const at::Tensor & weight1, const at::Tensor & weight2, const at::Tensor & weight3, const at::Tensor & hx_, const at::Tensor & cx_, bool reverse, at::IntArrayRef batch_sizes, int64_t mode, int64_t hidden_size, int64_t num_layers, bool has_biases, bool bidirectional, bool batch_first, bool train) {
+        return at::_ops::mkldnn_rnn_layer_out::redispatch(dispatchKeySet, input, weight0, weight1, weight2, weight3, hx_, cx_, reverse, batch_sizes, mode, hidden_size, num_layers, has_biases, bidirectional, batch_first, train, out0, out1, out2, out3);
+    }
+    
+    // aten::mkldnn_rnn_layer.out(Tensor input, Tensor weight0, Tensor weight1, Tensor weight2, Tensor weight3, Tensor hx_, Tensor cx_, bool reverse, int[] batch_sizes, int mode, int hidden_size, int num_layers, bool has_biases, bool bidirectional, bool batch_first, bool train, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> mkldnn_rnn_layer_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight0, const at::Tensor & weight1, const at::Tensor & weight2, const at::Tensor & weight3, const at::Tensor & hx_, const at::Tensor & cx_, bool reverse, at::IntArrayRef batch_sizes, int64_t mode, int64_t hidden_size, int64_t num_layers, bool has_biases, bool bidirectional, bool batch_first, bool train, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3) {
+        return at::_ops::mkldnn_rnn_layer_out::redispatch(dispatchKeySet, input, weight0, weight1, weight2, weight3, hx_, cx_, reverse, batch_sizes, mode, hidden_size, num_layers, has_biases, bidirectional, batch_first, train, out0, out1, out2, out3);
+    }
+    
+    // aten::mkldnn_rnn_layer_backward.out(Tensor input, Tensor weight1, Tensor weight2, Tensor weight3, Tensor weight4, Tensor hx_, Tensor cx_tmp, Tensor output, Tensor hy_, Tensor cy_, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, bool reverse, int mode, int hidden_size, int num_layers, bool has_biases, bool train, bool bidirectional, int[] batch_sizes, bool batch_first, Tensor workspace, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3, Tensor(e!) out4, Tensor(f!) out5, Tensor(g!) out6) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!), Tensor(e!), Tensor(f!), Tensor(g!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> mkldnn_rnn_layer_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, at::Tensor & out4, at::Tensor & out5, at::Tensor & out6, const at::Tensor & input, const at::Tensor & weight1, const at::Tensor & weight2, const at::Tensor & weight3, const at::Tensor & weight4, const at::Tensor & hx_, const at::Tensor & cx_tmp, const at::Tensor & output, const at::Tensor & hy_, const at::Tensor & cy_, const c10::optional<at::Tensor> & grad_output, const c10::optional<at::Tensor> & grad_hy, const c10::optional<at::Tensor> & grad_cy, bool reverse, int64_t mode, int64_t hidden_size, int64_t num_layers, bool has_biases, bool train, bool bidirectional, at::IntArrayRef batch_sizes, bool batch_first, const at::Tensor & workspace) {
+        return at::_ops::mkldnn_rnn_layer_backward_out::redispatch(dispatchKeySet, input, weight1, weight2, weight3, weight4, hx_, cx_tmp, output, hy_, cy_, grad_output, grad_hy, grad_cy, reverse, mode, hidden_size, num_layers, has_biases, train, bidirectional, batch_sizes, batch_first, workspace, out0, out1, out2, out3, out4, out5, out6);
+    }
+    
+    // aten::mkldnn_rnn_layer_backward.out(Tensor input, Tensor weight1, Tensor weight2, Tensor weight3, Tensor weight4, Tensor hx_, Tensor cx_tmp, Tensor output, Tensor hy_, Tensor cy_, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, bool reverse, int mode, int hidden_size, int num_layers, bool has_biases, bool train, bool bidirectional, int[] batch_sizes, bool batch_first, Tensor workspace, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3, Tensor(e!) out4, Tensor(f!) out5, Tensor(g!) out6) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!), Tensor(e!), Tensor(f!), Tensor(g!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> mkldnn_rnn_layer_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight1, const at::Tensor & weight2, const at::Tensor & weight3, const at::Tensor & weight4, const at::Tensor & hx_, const at::Tensor & cx_tmp, const at::Tensor & output, const at::Tensor & hy_, const at::Tensor & cy_, const c10::optional<at::Tensor> & grad_output, const c10::optional<at::Tensor> & grad_hy, const c10::optional<at::Tensor> & grad_cy, bool reverse, int64_t mode, int64_t hidden_size, int64_t num_layers, bool has_biases, bool train, bool bidirectional, at::IntArrayRef batch_sizes, bool batch_first, const at::Tensor & workspace, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, at::Tensor & out4, at::Tensor & out5, at::Tensor & out6) {
+        return at::_ops::mkldnn_rnn_layer_backward_out::redispatch(dispatchKeySet, input, weight1, weight2, weight3, weight4, hx_, cx_tmp, output, hy_, cy_, grad_output, grad_hy, grad_cy, reverse, mode, hidden_size, num_layers, has_biases, train, bidirectional, batch_sizes, batch_first, workspace, out0, out1, out2, out3, out4, out5, out6);
+    }
+    
+    // aten::miopen_batch_norm.out(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> miopen_batch_norm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, bool training, double exponential_average_factor, double epsilon) {
+        return at::_ops::miopen_batch_norm_out::redispatch(dispatchKeySet, input, weight, bias, running_mean, running_var, training, exponential_average_factor, epsilon, out0, out1, out2);
+    }
+    
+    // aten::miopen_batch_norm.out(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> miopen_batch_norm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, bool training, double exponential_average_factor, double epsilon, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+        return at::_ops::miopen_batch_norm_out::redispatch(dispatchKeySet, input, weight, bias, running_mean, running_var, training, exponential_average_factor, epsilon, out0, out1, out2);
+    }
+    
+    // aten::miopen_batch_norm_backward.out(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, float epsilon, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> miopen_batch_norm_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & input, const at::Tensor & grad_output, const at::Tensor & weight, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, const c10::optional<at::Tensor> & save_mean, const c10::optional<at::Tensor> & save_var, double epsilon) {
+        return at::_ops::miopen_batch_norm_backward_out::redispatch(dispatchKeySet, input, grad_output, weight, running_mean, running_var, save_mean, save_var, epsilon, out0, out1, out2);
+    }
+    
+    // aten::miopen_batch_norm_backward.out(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, float epsilon, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> miopen_batch_norm_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & grad_output, const at::Tensor & weight, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, const c10::optional<at::Tensor> & save_mean, const c10::optional<at::Tensor> & save_var, double epsilon, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+        return at::_ops::miopen_batch_norm_backward_out::redispatch(dispatchKeySet, input, grad_output, weight, running_mean, running_var, save_mean, save_var, epsilon, out0, out1, out2);
+    }
+    
+    // aten::miopen_convolution.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & miopen_convolution_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic) {
+        return at::_ops::miopen_convolution_out::redispatch(dispatchKeySet, self, weight, bias, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, benchmark, deterministic, out);
+    }
+    
+    // aten::miopen_convolution.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & miopen_convolution_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic, at::Tensor & out) {
+        return at::_ops::miopen_convolution_out::redispatch(dispatchKeySet, self, weight, bias, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, benchmark, deterministic, out);
+    }
+    
+    // aten::miopen_convolution.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & miopen_convolution_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic) {
+        return at::_ops::miopen_convolution_out::redispatch(dispatchKeySet, self, weight, bias, padding, stride, dilation, groups, benchmark, deterministic, out);
+    }
+    
+    // aten::miopen_convolution.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & miopen_convolution_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, at::Tensor & out) {
+        return at::_ops::miopen_convolution_out::redispatch(dispatchKeySet, self, weight, bias, padding, stride, dilation, groups, benchmark, deterministic, out);
+    }
+    
+    // aten::miopen_convolution_transpose.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & miopen_convolution_transpose_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic) {
+        return at::_ops::miopen_convolution_transpose_out::redispatch(dispatchKeySet, self, weight, bias, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, benchmark, deterministic, out);
+    }
+    
+    // aten::miopen_convolution_transpose.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & miopen_convolution_transpose_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic, at::Tensor & out) {
+        return at::_ops::miopen_convolution_transpose_out::redispatch(dispatchKeySet, self, weight, bias, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, benchmark, deterministic, out);
+    }
+    
+    // aten::miopen_convolution_transpose.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & miopen_convolution_transpose_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic) {
+        return at::_ops::miopen_convolution_transpose_out::redispatch(dispatchKeySet, self, weight, bias, padding, output_padding, stride, dilation, groups, benchmark, deterministic, out);
+    }
+    
+    // aten::miopen_convolution_transpose.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & miopen_convolution_transpose_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, at::Tensor & out) {
+        return at::_ops::miopen_convolution_transpose_out::redispatch(dispatchKeySet, self, weight, bias, padding, output_padding, stride, dilation, groups, benchmark, deterministic, out);
+    }
+    
+    // aten::miopen_depthwise_convolution.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & miopen_depthwise_convolution_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic) {
+        return at::_ops::miopen_depthwise_convolution_out::redispatch(dispatchKeySet, self, weight, bias, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, benchmark, deterministic, out);
+    }
+    
+    // aten::miopen_depthwise_convolution.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & miopen_depthwise_convolution_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic, at::Tensor & out) {
+        return at::_ops::miopen_depthwise_convolution_out::redispatch(dispatchKeySet, self, weight, bias, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, benchmark, deterministic, out);
+    }
+    
+    // aten::miopen_depthwise_convolution.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & miopen_depthwise_convolution_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic) {
+        return at::_ops::miopen_depthwise_convolution_out::redispatch(dispatchKeySet, self, weight, bias, padding, stride, dilation, groups, benchmark, deterministic, out);
+    }
+    
+    // aten::miopen_depthwise_convolution.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & miopen_depthwise_convolution_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, at::Tensor & out) {
+        return at::_ops::miopen_depthwise_convolution_out::redispatch(dispatchKeySet, self, weight, bias, padding, stride, dilation, groups, benchmark, deterministic, out);
+    }
+    
+    // aten::miopen_rnn.out(Tensor input, Tensor[] weight, int weight_stride0, Tensor hx, Tensor? cx, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3, Tensor(e!) out4) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!), Tensor(e!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> miopen_rnn_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, at::Tensor & out4, const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const at::Tensor & hx, const c10::optional<at::Tensor> & cx, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, at::IntArrayRef batch_sizes, const c10::optional<at::Tensor> & dropout_state) {
+        return at::_ops::miopen_rnn_out::redispatch(dispatchKeySet, input, weight, weight_stride0, hx, cx, mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, out0, out1, out2, out3, out4);
+    }
+    
+    // aten::miopen_rnn.out(Tensor input, Tensor[] weight, int weight_stride0, Tensor hx, Tensor? cx, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3, Tensor(e!) out4) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!), Tensor(e!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> miopen_rnn_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const at::Tensor & hx, const c10::optional<at::Tensor> & cx, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, at::IntArrayRef batch_sizes, const c10::optional<at::Tensor> & dropout_state, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, at::Tensor & out4) {
+        return at::_ops::miopen_rnn_out::redispatch(dispatchKeySet, input, weight, weight_stride0, hx, cx, mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, out0, out1, out2, out3, out4);
+    }
+    
+    // aten::miopen_rnn_backward.out(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!)[] out3) -> ()
+    inline void miopen_rnn_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::TensorList out3, const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const at::Tensor & weight_buf, const at::Tensor & hx, const c10::optional<at::Tensor> & cx, const at::Tensor & output, const c10::optional<at::Tensor> & grad_output, const c10::optional<at::Tensor> & grad_hy, const c10::optional<at::Tensor> & grad_cy, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, at::IntArrayRef batch_sizes, const c10::optional<at::Tensor> & dropout_state, const at::Tensor & reserve, ::std::array<bool,4> output_mask) {
+        return at::_ops::miopen_rnn_backward_out::redispatch(dispatchKeySet, input, weight, weight_stride0, weight_buf, hx, cx, output, grad_output, grad_hy, grad_cy, mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve, output_mask, out0, out1, out2, out3);
+    }
+    
+    // aten::miopen_rnn_backward.out(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!)[] out3) -> ()
+    inline void miopen_rnn_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const at::Tensor & weight_buf, const at::Tensor & hx, const c10::optional<at::Tensor> & cx, const at::Tensor & output, const c10::optional<at::Tensor> & grad_output, const c10::optional<at::Tensor> & grad_hy, const c10::optional<at::Tensor> & grad_cy, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, at::IntArrayRef batch_sizes, const c10::optional<at::Tensor> & dropout_state, const at::Tensor & reserve, ::std::array<bool,4> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::TensorList out3) {
+        return at::_ops::miopen_rnn_backward_out::redispatch(dispatchKeySet, input, weight, weight_stride0, weight_buf, hx, cx, output, grad_output, grad_hy, grad_cy, mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve, output_mask, out0, out1, out2, out3);
+    }
+    
+    // aten::_sparse_sparse_matmul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _sparse_sparse_matmul_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::_sparse_sparse_matmul_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::_sparse_sparse_matmul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _sparse_sparse_matmul_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::_sparse_sparse_matmul_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::mul.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mul_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::mul_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::mul.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mul_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out) {
+        return at::_ops::mul_Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::_native_batch_norm_legit_functional(Tensor input, Tensor? weight, Tensor? bias, Tensor running_mean, Tensor running_var, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor, Tensor running_mean_out, Tensor running_var_out)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,at::Tensor> _native_batch_norm_legit_functional(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, const at::Tensor & running_mean, const at::Tensor & running_var, bool training, double momentum, double eps) {
+        return at::_ops::_native_batch_norm_legit_functional::redispatch(dispatchKeySet, input, weight, bias, running_mean, running_var, training, momentum, eps);
+    }
+    
+    // aten::_native_batch_norm_legit_no_training.out(Tensor input, Tensor? weight, Tensor? bias, Tensor running_mean, Tensor running_var, float momentum, float eps, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _native_batch_norm_legit_no_training_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, const at::Tensor & running_mean, const at::Tensor & running_var, double momentum, double eps) {
+        return at::_ops::_native_batch_norm_legit_no_training_out::redispatch(dispatchKeySet, input, weight, bias, running_mean, running_var, momentum, eps, out0, out1, out2);
+    }
+    
+    // aten::_native_batch_norm_legit_no_training.out(Tensor input, Tensor? weight, Tensor? bias, Tensor running_mean, Tensor running_var, float momentum, float eps, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _native_batch_norm_legit_no_training_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, const at::Tensor & running_mean, const at::Tensor & running_var, double momentum, double eps, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+        return at::_ops::_native_batch_norm_legit_no_training_out::redispatch(dispatchKeySet, input, weight, bias, running_mean, running_var, momentum, eps, out0, out1, out2);
+    }
+    
+    // aten::batch_norm_stats.out(Tensor input, float eps, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> batch_norm_stats_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, const at::Tensor & input, double eps) {
+        return at::_ops::batch_norm_stats_out::redispatch(dispatchKeySet, input, eps, out0, out1);
+    }
+    
+    // aten::batch_norm_stats.out(Tensor input, float eps, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> batch_norm_stats_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, double eps, at::Tensor & out0, at::Tensor & out1) {
+        return at::_ops::batch_norm_stats_out::redispatch(dispatchKeySet, input, eps, out0, out1);
+    }
+    
+    // aten::batch_norm_gather_stats.out(Tensor input, Tensor mean, Tensor invstd, Tensor? running_mean, Tensor? running_var, float momentum, float eps, int count, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> batch_norm_gather_stats_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & invstd, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, double momentum, double eps, int64_t count) {
+        return at::_ops::batch_norm_gather_stats_out::redispatch(dispatchKeySet, input, mean, invstd, running_mean, running_var, momentum, eps, count, out0, out1);
+    }
+    
+    // aten::batch_norm_gather_stats.out(Tensor input, Tensor mean, Tensor invstd, Tensor? running_mean, Tensor? running_var, float momentum, float eps, int count, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> batch_norm_gather_stats_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & invstd, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, double momentum, double eps, int64_t count, at::Tensor & out0, at::Tensor & out1) {
+        return at::_ops::batch_norm_gather_stats_out::redispatch(dispatchKeySet, input, mean, invstd, running_mean, running_var, momentum, eps, count, out0, out1);
+    }
+    
+    // aten::batch_norm_gather_stats_with_counts.out(Tensor input, Tensor mean, Tensor invstd, Tensor? running_mean, Tensor? running_var, float momentum, float eps, Tensor counts, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> batch_norm_gather_stats_with_counts_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & invstd, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, double momentum, double eps, const at::Tensor & counts) {
+        return at::_ops::batch_norm_gather_stats_with_counts_out::redispatch(dispatchKeySet, input, mean, invstd, running_mean, running_var, momentum, eps, counts, out0, out1);
+    }
+    
+    // aten::batch_norm_gather_stats_with_counts.out(Tensor input, Tensor mean, Tensor invstd, Tensor? running_mean, Tensor? running_var, float momentum, float eps, Tensor counts, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> batch_norm_gather_stats_with_counts_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & invstd, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, double momentum, double eps, const at::Tensor & counts, at::Tensor & out0, at::Tensor & out1) {
+        return at::_ops::batch_norm_gather_stats_with_counts_out::redispatch(dispatchKeySet, input, mean, invstd, running_mean, running_var, momentum, eps, counts, out0, out1);
+    }
+    
+    // aten::native_batch_norm_backward.out(Tensor grad_out, Tensor input, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_invstd, bool train, float eps, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_batch_norm_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & grad_out, const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, const c10::optional<at::Tensor> & save_mean, const c10::optional<at::Tensor> & save_invstd, bool train, double eps, ::std::array<bool,3> output_mask) {
+        return at::_ops::native_batch_norm_backward_out::redispatch(dispatchKeySet, grad_out, input, weight, running_mean, running_var, save_mean, save_invstd, train, eps, output_mask, out0, out1, out2);
+    }
+    
+    // aten::native_batch_norm_backward.out(Tensor grad_out, Tensor input, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_invstd, bool train, float eps, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_batch_norm_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_out, const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, const c10::optional<at::Tensor> & save_mean, const c10::optional<at::Tensor> & save_invstd, bool train, double eps, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+        return at::_ops::native_batch_norm_backward_out::redispatch(dispatchKeySet, grad_out, input, weight, running_mean, running_var, save_mean, save_invstd, train, eps, output_mask, out0, out1, out2);
+    }
+    
+    // aten::batch_norm_backward_reduce.out(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, bool input_g, bool weight_g, bool bias_g, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> batch_norm_backward_reduce_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & invstd, const c10::optional<at::Tensor> & weight, bool input_g, bool weight_g, bool bias_g) {
+        return at::_ops::batch_norm_backward_reduce_out::redispatch(dispatchKeySet, grad_out, input, mean, invstd, weight, input_g, weight_g, bias_g, out0, out1, out2, out3);
+    }
+    
+    // aten::batch_norm_backward_reduce.out(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, bool input_g, bool weight_g, bool bias_g, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> batch_norm_backward_reduce_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & invstd, const c10::optional<at::Tensor> & weight, bool input_g, bool weight_g, bool bias_g, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3) {
+        return at::_ops::batch_norm_backward_reduce_out::redispatch(dispatchKeySet, grad_out, input, mean, invstd, weight, input_g, weight_g, bias_g, out0, out1, out2, out3);
+    }
+    
+    // aten::batch_norm_backward_elemt.out(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, Tensor sum_dy, Tensor sum_dy_xmu, Tensor count, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & batch_norm_backward_elemt_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & invstd, const c10::optional<at::Tensor> & weight, const at::Tensor & sum_dy, const at::Tensor & sum_dy_xmu, const at::Tensor & count) {
+        return at::_ops::batch_norm_backward_elemt_out::redispatch(dispatchKeySet, grad_out, input, mean, invstd, weight, sum_dy, sum_dy_xmu, count, out);
+    }
+    
+    // aten::batch_norm_backward_elemt.out(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, Tensor sum_dy, Tensor sum_dy_xmu, Tensor count, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & batch_norm_backward_elemt_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & invstd, const c10::optional<at::Tensor> & weight, const at::Tensor & sum_dy, const at::Tensor & sum_dy_xmu, const at::Tensor & count, at::Tensor & out) {
+        return at::_ops::batch_norm_backward_elemt_out::redispatch(dispatchKeySet, grad_out, input, mean, invstd, weight, sum_dy, sum_dy_xmu, count, out);
+    }
+    
+    // aten::batch_norm_update_stats.out(Tensor input, Tensor? running_mean, Tensor? running_var, float momentum, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> batch_norm_update_stats_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, const at::Tensor & input, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, double momentum) {
+        return at::_ops::batch_norm_update_stats_out::redispatch(dispatchKeySet, input, running_mean, running_var, momentum, out0, out1);
+    }
+    
+    // aten::batch_norm_update_stats.out(Tensor input, Tensor? running_mean, Tensor? running_var, float momentum, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> batch_norm_update_stats_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, double momentum, at::Tensor & out0, at::Tensor & out1) {
+        return at::_ops::batch_norm_update_stats_out::redispatch(dispatchKeySet, input, running_mean, running_var, momentum, out0, out1);
+    }
+    
+    // aten::_nnpack_spatial_convolution.out(Tensor input, Tensor weight, Tensor? bias, SymInt[2] padding, SymInt[2] stride=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _nnpack_spatial_convolution_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef padding, at::IntArrayRef stride=1) {
+        return at::_ops::_nnpack_spatial_convolution_out::redispatch(dispatchKeySet, input, weight, bias, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), out);
+    }
+    
+    // aten::_nnpack_spatial_convolution.out(Tensor input, Tensor weight, Tensor? bias, SymInt[2] padding, SymInt[2] stride=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _nnpack_spatial_convolution_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef padding, at::IntArrayRef stride, at::Tensor & out) {
+        return at::_ops::_nnpack_spatial_convolution_out::redispatch(dispatchKeySet, input, weight, bias, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), out);
+    }
+    
+    // aten::_nnpack_spatial_convolution.out(Tensor input, Tensor weight, Tensor? bias, SymInt[2] padding, SymInt[2] stride=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _nnpack_spatial_convolution_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride=c10::SymInt(1)) {
+        return at::_ops::_nnpack_spatial_convolution_out::redispatch(dispatchKeySet, input, weight, bias, padding, stride, out);
+    }
+    
+    // aten::_nnpack_spatial_convolution.out(Tensor input, Tensor weight, Tensor? bias, SymInt[2] padding, SymInt[2] stride=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _nnpack_spatial_convolution_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, at::Tensor & out) {
+        return at::_ops::_nnpack_spatial_convolution_out::redispatch(dispatchKeySet, input, weight, bias, padding, stride, out);
+    }
+    
+    // aten::ones.names_out(int[] size, *, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & ones_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::IntArrayRef size, c10::optional<at::DimnameList> names) {
+        return at::_ops::ones_names_out::redispatch(dispatchKeySet, size, names, out);
+    }
+    
+    // aten::ones.names_out(int[] size, *, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & ones_outf(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::DimnameList> names, at::Tensor & out) {
+        return at::_ops::ones_names_out::redispatch(dispatchKeySet, size, names, out);
+    }
+    
+    // aten::ones_like.out(Tensor self, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & ones_like_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::ones_like_out::redispatch(dispatchKeySet, self, memory_format, out);
+    }
+    
+    // aten::ones_like.out(Tensor self, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & ones_like_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::MemoryFormat> memory_format, at::Tensor & out) {
+        return at::_ops::ones_like_out::redispatch(dispatchKeySet, self, memory_format, out);
+    }
+    
+    // aten::_euclidean_dist.out(Tensor x1, Tensor x2, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _euclidean_dist_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & x1, const at::Tensor & x2) {
+        return at::_ops::_euclidean_dist_out::redispatch(dispatchKeySet, x1, x2, out);
+    }
+    
+    // aten::_euclidean_dist.out(Tensor x1, Tensor x2, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _euclidean_dist_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x1, const at::Tensor & x2, at::Tensor & out) {
+        return at::_ops::_euclidean_dist_out::redispatch(dispatchKeySet, x1, x2, out);
+    }
+    
+    // aten::_cdist_forward.out(Tensor x1, Tensor x2, float p, int? compute_mode, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _cdist_forward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & x1, const at::Tensor & x2, double p, c10::optional<int64_t> compute_mode) {
+        return at::_ops::_cdist_forward_out::redispatch(dispatchKeySet, x1, x2, p, compute_mode, out);
+    }
+    
+    // aten::_cdist_forward.out(Tensor x1, Tensor x2, float p, int? compute_mode, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _cdist_forward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x1, const at::Tensor & x2, double p, c10::optional<int64_t> compute_mode, at::Tensor & out) {
+        return at::_ops::_cdist_forward_out::redispatch(dispatchKeySet, x1, x2, p, compute_mode, out);
+    }
+    
+    // aten::_cdist_backward.out(Tensor grad, Tensor x1, Tensor x2, float p, Tensor cdist, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _cdist_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad, const at::Tensor & x1, const at::Tensor & x2, double p, const at::Tensor & cdist) {
+        return at::_ops::_cdist_backward_out::redispatch(dispatchKeySet, grad, x1, x2, p, cdist, out);
+    }
+    
+    // aten::_cdist_backward.out(Tensor grad, Tensor x1, Tensor x2, float p, Tensor cdist, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _cdist_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & x1, const at::Tensor & x2, double p, const at::Tensor & cdist, at::Tensor & out) {
+        return at::_ops::_cdist_backward_out::redispatch(dispatchKeySet, grad, x1, x2, p, cdist, out);
+    }
+    
+    // aten::_pdist_forward.out(Tensor self, float p=2, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _pdist_forward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, double p=2) {
+        return at::_ops::_pdist_forward_out::redispatch(dispatchKeySet, self, p, out);
+    }
+    
+    // aten::_pdist_forward.out(Tensor self, float p=2, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _pdist_forward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double p, at::Tensor & out) {
+        return at::_ops::_pdist_forward_out::redispatch(dispatchKeySet, self, p, out);
+    }
+    
+    // aten::_pdist_backward.out(Tensor grad, Tensor self, float p, Tensor pdist, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _pdist_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad, const at::Tensor & self, double p, const at::Tensor & pdist) {
+        return at::_ops::_pdist_backward_out::redispatch(dispatchKeySet, grad, self, p, pdist, out);
+    }
+    
+    // aten::_pdist_backward.out(Tensor grad, Tensor self, float p, Tensor pdist, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _pdist_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & self, double p, const at::Tensor & pdist, at::Tensor & out) {
+        return at::_ops::_pdist_backward_out::redispatch(dispatchKeySet, grad, self, p, pdist, out);
+    }
+    
+    // aten::pixel_shuffle.out(Tensor self, int upscale_factor, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & pixel_shuffle_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t upscale_factor) {
+        return at::_ops::pixel_shuffle_out::redispatch(dispatchKeySet, self, upscale_factor, out);
+    }
+    
+    // aten::pixel_shuffle.out(Tensor self, int upscale_factor, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & pixel_shuffle_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t upscale_factor, at::Tensor & out) {
+        return at::_ops::pixel_shuffle_out::redispatch(dispatchKeySet, self, upscale_factor, out);
+    }
+    
+    // aten::pixel_unshuffle.out(Tensor self, int downscale_factor, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & pixel_unshuffle_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t downscale_factor) {
+        return at::_ops::pixel_unshuffle_out::redispatch(dispatchKeySet, self, downscale_factor, out);
+    }
+    
+    // aten::pixel_unshuffle.out(Tensor self, int downscale_factor, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & pixel_unshuffle_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t downscale_factor, at::Tensor & out) {
+        return at::_ops::pixel_unshuffle_out::redispatch(dispatchKeySet, self, downscale_factor, out);
+    }
+    
+    // aten::channel_shuffle.out(Tensor self, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & channel_shuffle_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t groups) {
+        return at::_ops::channel_shuffle_out::redispatch(dispatchKeySet, self, groups, out);
+    }
+    
+    // aten::channel_shuffle.out(Tensor self, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & channel_shuffle_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t groups, at::Tensor & out) {
+        return at::_ops::channel_shuffle_out::redispatch(dispatchKeySet, self, groups, out);
+    }
+    
+    // aten::channel_shuffle.out(Tensor self, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & channel_shuffle_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymInt groups) {
+        return at::_ops::channel_shuffle_out::redispatch(dispatchKeySet, self, groups, out);
+    }
+    
+    // aten::channel_shuffle.out(Tensor self, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & channel_shuffle_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt groups, at::Tensor & out) {
+        return at::_ops::channel_shuffle_out::redispatch(dispatchKeySet, self, groups, out);
+    }
+    
+    // aten::_pin_memory.out(Tensor self, Device? device=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _pin_memory_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<at::Device> device=c10::nullopt) {
+        return at::_ops::_pin_memory_out::redispatch(dispatchKeySet, self, device, out);
+    }
+    
+    // aten::_pin_memory.out(Tensor self, Device? device=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _pin_memory_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::Device> device, at::Tensor & out) {
+        return at::_ops::_pin_memory_out::redispatch(dispatchKeySet, self, device, out);
+    }
+    
+    // aten::scalar_tensor.out(Scalar s, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & scalar_tensor_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & s) {
+        return at::_ops::scalar_tensor_out::redispatch(dispatchKeySet, s, out);
+    }
+    
+    // aten::scalar_tensor.out(Scalar s, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & scalar_tensor_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & s, at::Tensor & out) {
+        return at::_ops::scalar_tensor_out::redispatch(dispatchKeySet, s, out);
+    }
+    
+    // aten::rand.names_out(SymInt[] size, *, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & rand_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::IntArrayRef size, c10::optional<at::DimnameList> names) {
+        return at::_ops::rand_names_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), names, out);
+    }
+    
+    // aten::rand.names_out(SymInt[] size, *, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & rand_outf(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::DimnameList> names, at::Tensor & out) {
+        return at::_ops::rand_names_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), names, out);
+    }
+    
+    // aten::rand.names_out(SymInt[] size, *, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & rand_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, c10::SymIntArrayRef size, c10::optional<at::DimnameList> names) {
+        return at::_ops::rand_names_out::redispatch(dispatchKeySet, size, names, out);
+    }
+    
+    // aten::rand.names_out(SymInt[] size, *, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & rand_symint_outf(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::optional<at::DimnameList> names, at::Tensor & out) {
+        return at::_ops::rand_names_out::redispatch(dispatchKeySet, size, names, out);
+    }
+    
+    // aten::rand.generator_with_names_out(SymInt[] size, *, Generator? generator, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & rand_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::IntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::DimnameList> names) {
+        return at::_ops::rand_generator_with_names_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), generator, names, out);
+    }
+    
+    // aten::rand.generator_with_names_out(SymInt[] size, *, Generator? generator, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & rand_outf(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::DimnameList> names, at::Tensor & out) {
+        return at::_ops::rand_generator_with_names_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), generator, names, out);
+    }
+    
+    // aten::rand.generator_with_names_out(SymInt[] size, *, Generator? generator, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & rand_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, c10::SymIntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::DimnameList> names) {
+        return at::_ops::rand_generator_with_names_out::redispatch(dispatchKeySet, size, generator, names, out);
+    }
+    
+    // aten::rand.generator_with_names_out(SymInt[] size, *, Generator? generator, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & rand_symint_outf(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::DimnameList> names, at::Tensor & out) {
+        return at::_ops::rand_generator_with_names_out::redispatch(dispatchKeySet, size, generator, names, out);
+    }
+    
+    // aten::rand_like.out(Tensor self, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & rand_like_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::rand_like_out::redispatch(dispatchKeySet, self, memory_format, out);
+    }
+    
+    // aten::rand_like.out(Tensor self, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & rand_like_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::MemoryFormat> memory_format, at::Tensor & out) {
+        return at::_ops::rand_like_out::redispatch(dispatchKeySet, self, memory_format, out);
+    }
+    
+    // aten::randint_like.out(Tensor self, SymInt high, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randint_like_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t high, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::randint_like_out::redispatch(dispatchKeySet, self, high, memory_format, out);
+    }
+    
+    // aten::randint_like.out(Tensor self, SymInt high, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randint_like_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t high, c10::optional<at::MemoryFormat> memory_format, at::Tensor & out) {
+        return at::_ops::randint_like_out::redispatch(dispatchKeySet, self, high, memory_format, out);
+    }
+    
+    // aten::randint_like.out(Tensor self, SymInt high, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randint_like_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymInt high, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::randint_like_out::redispatch(dispatchKeySet, self, high, memory_format, out);
+    }
+    
+    // aten::randint_like.out(Tensor self, SymInt high, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randint_like_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt high, c10::optional<at::MemoryFormat> memory_format, at::Tensor & out) {
+        return at::_ops::randint_like_out::redispatch(dispatchKeySet, self, high, memory_format, out);
+    }
+    
+    // aten::randint_like.low_dtype_out(Tensor self, SymInt low, SymInt high, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randint_like_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t low, int64_t high, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::randint_like_low_dtype_out::redispatch(dispatchKeySet, self, low, high, memory_format, out);
+    }
+    
+    // aten::randint_like.low_dtype_out(Tensor self, SymInt low, SymInt high, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randint_like_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t low, int64_t high, c10::optional<at::MemoryFormat> memory_format, at::Tensor & out) {
+        return at::_ops::randint_like_low_dtype_out::redispatch(dispatchKeySet, self, low, high, memory_format, out);
+    }
+    
+    // aten::randint_like.low_dtype_out(Tensor self, SymInt low, SymInt high, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randint_like_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymInt low, c10::SymInt high, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::randint_like_low_dtype_out::redispatch(dispatchKeySet, self, low, high, memory_format, out);
+    }
+    
+    // aten::randint_like.low_dtype_out(Tensor self, SymInt low, SymInt high, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randint_like_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt low, c10::SymInt high, c10::optional<at::MemoryFormat> memory_format, at::Tensor & out) {
+        return at::_ops::randint_like_low_dtype_out::redispatch(dispatchKeySet, self, low, high, memory_format, out);
+    }
+    
+    // aten::randn.names_out(SymInt[] size, *, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randn_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::IntArrayRef size, c10::optional<at::DimnameList> names) {
+        return at::_ops::randn_names_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), names, out);
+    }
+    
+    // aten::randn.names_out(SymInt[] size, *, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randn_outf(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::DimnameList> names, at::Tensor & out) {
+        return at::_ops::randn_names_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), names, out);
+    }
+    
+    // aten::randn.names_out(SymInt[] size, *, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randn_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, c10::SymIntArrayRef size, c10::optional<at::DimnameList> names) {
+        return at::_ops::randn_names_out::redispatch(dispatchKeySet, size, names, out);
+    }
+    
+    // aten::randn.names_out(SymInt[] size, *, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randn_symint_outf(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::optional<at::DimnameList> names, at::Tensor & out) {
+        return at::_ops::randn_names_out::redispatch(dispatchKeySet, size, names, out);
+    }
+    
+    // aten::randn.generator_with_names_out(SymInt[] size, *, Generator? generator, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randn_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::IntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::DimnameList> names) {
+        return at::_ops::randn_generator_with_names_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), generator, names, out);
+    }
+    
+    // aten::randn.generator_with_names_out(SymInt[] size, *, Generator? generator, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randn_outf(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::DimnameList> names, at::Tensor & out) {
+        return at::_ops::randn_generator_with_names_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), generator, names, out);
+    }
+    
+    // aten::randn.generator_with_names_out(SymInt[] size, *, Generator? generator, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randn_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, c10::SymIntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::DimnameList> names) {
+        return at::_ops::randn_generator_with_names_out::redispatch(dispatchKeySet, size, generator, names, out);
+    }
+    
+    // aten::randn.generator_with_names_out(SymInt[] size, *, Generator? generator, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randn_symint_outf(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::DimnameList> names, at::Tensor & out) {
+        return at::_ops::randn_generator_with_names_out::redispatch(dispatchKeySet, size, generator, names, out);
+    }
+    
+    // aten::randn_like.out(Tensor self, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randn_like_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::randn_like_out::redispatch(dispatchKeySet, self, memory_format, out);
+    }
+    
+    // aten::randn_like.out(Tensor self, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & randn_like_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::MemoryFormat> memory_format, at::Tensor & out) {
+        return at::_ops::randn_like_out::redispatch(dispatchKeySet, self, memory_format, out);
+    }
+    
+    // aten::repeat.out(Tensor self, SymInt[] repeats, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & repeat_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef repeats) {
+        return at::_ops::repeat_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(repeats), out);
+    }
+    
+    // aten::repeat.out(Tensor self, SymInt[] repeats, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & repeat_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef repeats, at::Tensor & out) {
+        return at::_ops::repeat_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(repeats), out);
+    }
+    
+    // aten::repeat.out(Tensor self, SymInt[] repeats, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & repeat_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef repeats) {
+        return at::_ops::repeat_out::redispatch(dispatchKeySet, self, repeats, out);
+    }
+    
+    // aten::repeat.out(Tensor self, SymInt[] repeats, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & repeat_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef repeats, at::Tensor & out) {
+        return at::_ops::repeat_out::redispatch(dispatchKeySet, self, repeats, out);
+    }
+    
+    // aten::repeat_interleave.Tensor_out(Tensor repeats, *, SymInt? output_size=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & repeat_interleave_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & repeats, c10::optional<int64_t> output_size=c10::nullopt) {
+        return at::_ops::repeat_interleave_Tensor_out::redispatch(dispatchKeySet, repeats, output_size.has_value() ? c10::make_optional(c10::SymInt(*output_size)) : c10::nullopt, out);
+    }
+    
+    // aten::repeat_interleave.Tensor_out(Tensor repeats, *, SymInt? output_size=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & repeat_interleave_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & repeats, c10::optional<int64_t> output_size, at::Tensor & out) {
+        return at::_ops::repeat_interleave_Tensor_out::redispatch(dispatchKeySet, repeats, output_size.has_value() ? c10::make_optional(c10::SymInt(*output_size)) : c10::nullopt, out);
+    }
+    
+    // aten::repeat_interleave.Tensor_out(Tensor repeats, *, SymInt? output_size=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & repeat_interleave_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & repeats, c10::optional<c10::SymInt> output_size=c10::nullopt) {
+        return at::_ops::repeat_interleave_Tensor_out::redispatch(dispatchKeySet, repeats, output_size, out);
+    }
+    
+    // aten::repeat_interleave.Tensor_out(Tensor repeats, *, SymInt? output_size=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & repeat_interleave_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & repeats, c10::optional<c10::SymInt> output_size, at::Tensor & out) {
+        return at::_ops::repeat_interleave_Tensor_out::redispatch(dispatchKeySet, repeats, output_size, out);
+    }
+    
+    // aten::_mkldnn_reshape.out(Tensor self, int[] shape, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _mkldnn_reshape_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef shape) {
+        return at::_ops::_mkldnn_reshape_out::redispatch(dispatchKeySet, self, shape, out);
+    }
+    
+    // aten::_mkldnn_reshape.out(Tensor self, int[] shape, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _mkldnn_reshape_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef shape, at::Tensor & out) {
+        return at::_ops::_mkldnn_reshape_out::redispatch(dispatchKeySet, self, shape, out);
+    }
+    
+    // aten::relu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & relu_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::relu_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::relu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & relu_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::relu_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::select_backward.out(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt index, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & select_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad_output, at::IntArrayRef input_sizes, int64_t dim, int64_t index) {
+        return at::_ops::select_backward_out::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(input_sizes), dim, index, out);
+    }
+    
+    // aten::select_backward.out(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt index, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & select_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, at::IntArrayRef input_sizes, int64_t dim, int64_t index, at::Tensor & out) {
+        return at::_ops::select_backward_out::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(input_sizes), dim, index, out);
+    }
+    
+    // aten::select_backward.out(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt index, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & select_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad_output, c10::SymIntArrayRef input_sizes, int64_t dim, c10::SymInt index) {
+        return at::_ops::select_backward_out::redispatch(dispatchKeySet, grad_output, input_sizes, dim, index, out);
+    }
+    
+    // aten::select_backward.out(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt index, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & select_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef input_sizes, int64_t dim, c10::SymInt index, at::Tensor & out) {
+        return at::_ops::select_backward_out::redispatch(dispatchKeySet, grad_output, input_sizes, dim, index, out);
+    }
+    
+    // aten::celu.out(Tensor self, Scalar alpha=1.0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & celu_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & alpha=1.0) {
+        return at::_ops::celu_out::redispatch(dispatchKeySet, self, alpha, out);
+    }
+    
+    // aten::celu.out(Tensor self, Scalar alpha=1.0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & celu_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & alpha, at::Tensor & out) {
+        return at::_ops::celu_out::redispatch(dispatchKeySet, self, alpha, out);
+    }
+    
+    // aten::slice_backward.out(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt start, SymInt end, SymInt step, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & slice_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad_output, at::IntArrayRef input_sizes, int64_t dim, int64_t start, int64_t end, int64_t step) {
+        return at::_ops::slice_backward_out::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(input_sizes), dim, start, end, step, out);
+    }
+    
+    // aten::slice_backward.out(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt start, SymInt end, SymInt step, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & slice_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, at::IntArrayRef input_sizes, int64_t dim, int64_t start, int64_t end, int64_t step, at::Tensor & out) {
+        return at::_ops::slice_backward_out::redispatch(dispatchKeySet, grad_output, c10::fromIntArrayRefSlow(input_sizes), dim, start, end, step, out);
+    }
+    
+    // aten::slice_backward.out(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt start, SymInt end, SymInt step, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & slice_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad_output, c10::SymIntArrayRef input_sizes, int64_t dim, c10::SymInt start, c10::SymInt end, c10::SymInt step) {
+        return at::_ops::slice_backward_out::redispatch(dispatchKeySet, grad_output, input_sizes, dim, start, end, step, out);
+    }
+    
+    // aten::slice_backward.out(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt start, SymInt end, SymInt step, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & slice_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef input_sizes, int64_t dim, c10::SymInt start, c10::SymInt end, c10::SymInt step, at::Tensor & out) {
+        return at::_ops::slice_backward_out::redispatch(dispatchKeySet, grad_output, input_sizes, dim, start, end, step, out);
+    }
+    
+    // aten::slice_scatter.out(Tensor self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & slice_scatter_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & src, int64_t dim=0, c10::optional<int64_t> start=c10::nullopt, c10::optional<int64_t> end=c10::nullopt, int64_t step=1) {
+        return at::_ops::slice_scatter_out::redispatch(dispatchKeySet, self, src, dim, start.has_value() ? c10::make_optional(c10::SymInt(*start)) : c10::nullopt, end.has_value() ? c10::make_optional(c10::SymInt(*end)) : c10::nullopt, step, out);
+    }
+    
+    // aten::slice_scatter.out(Tensor self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & slice_scatter_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & src, int64_t dim, c10::optional<int64_t> start, c10::optional<int64_t> end, int64_t step, at::Tensor & out) {
+        return at::_ops::slice_scatter_out::redispatch(dispatchKeySet, self, src, dim, start.has_value() ? c10::make_optional(c10::SymInt(*start)) : c10::nullopt, end.has_value() ? c10::make_optional(c10::SymInt(*end)) : c10::nullopt, step, out);
+    }
+    
+    // aten::slice_scatter.out(Tensor self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & slice_scatter_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & src, int64_t dim=0, c10::optional<c10::SymInt> start=c10::nullopt, c10::optional<c10::SymInt> end=c10::nullopt, c10::SymInt step=1) {
+        return at::_ops::slice_scatter_out::redispatch(dispatchKeySet, self, src, dim, start, end, step, out);
+    }
+    
+    // aten::slice_scatter.out(Tensor self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & slice_scatter_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & src, int64_t dim, c10::optional<c10::SymInt> start, c10::optional<c10::SymInt> end, c10::SymInt step, at::Tensor & out) {
+        return at::_ops::slice_scatter_out::redispatch(dispatchKeySet, self, src, dim, start, end, step, out);
+    }
+    
+    // aten::select_scatter.out(Tensor self, Tensor src, int dim, SymInt index, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & select_scatter_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & src, int64_t dim, int64_t index) {
+        return at::_ops::select_scatter_out::redispatch(dispatchKeySet, self, src, dim, index, out);
+    }
+    
+    // aten::select_scatter.out(Tensor self, Tensor src, int dim, SymInt index, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & select_scatter_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & src, int64_t dim, int64_t index, at::Tensor & out) {
+        return at::_ops::select_scatter_out::redispatch(dispatchKeySet, self, src, dim, index, out);
+    }
+    
+    // aten::select_scatter.out(Tensor self, Tensor src, int dim, SymInt index, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & select_scatter_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & src, int64_t dim, c10::SymInt index) {
+        return at::_ops::select_scatter_out::redispatch(dispatchKeySet, self, src, dim, index, out);
+    }
+    
+    // aten::select_scatter.out(Tensor self, Tensor src, int dim, SymInt index, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & select_scatter_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & src, int64_t dim, c10::SymInt index, at::Tensor & out) {
+        return at::_ops::select_scatter_out::redispatch(dispatchKeySet, self, src, dim, index, out);
+    }
+    
+    // aten::diagonal_scatter.out(Tensor self, Tensor src, int offset=0, int dim1=0, int dim2=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & diagonal_scatter_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & src, int64_t offset=0, int64_t dim1=0, int64_t dim2=1) {
+        return at::_ops::diagonal_scatter_out::redispatch(dispatchKeySet, self, src, offset, dim1, dim2, out);
+    }
+    
+    // aten::diagonal_scatter.out(Tensor self, Tensor src, int offset=0, int dim1=0, int dim2=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & diagonal_scatter_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & src, int64_t offset, int64_t dim1, int64_t dim2, at::Tensor & out) {
+        return at::_ops::diagonal_scatter_out::redispatch(dispatchKeySet, self, src, offset, dim1, dim2, out);
+    }
+    
+    // aten::as_strided_scatter.out(Tensor self, Tensor src, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & as_strided_scatter_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & src, at::IntArrayRef size, at::IntArrayRef stride, c10::optional<int64_t> storage_offset=c10::nullopt) {
+        return at::_ops::as_strided_scatter_out::redispatch(dispatchKeySet, self, src, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), storage_offset.has_value() ? c10::make_optional(c10::SymInt(*storage_offset)) : c10::nullopt, out);
+    }
+    
+    // aten::as_strided_scatter.out(Tensor self, Tensor src, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & as_strided_scatter_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & src, at::IntArrayRef size, at::IntArrayRef stride, c10::optional<int64_t> storage_offset, at::Tensor & out) {
+        return at::_ops::as_strided_scatter_out::redispatch(dispatchKeySet, self, src, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), storage_offset.has_value() ? c10::make_optional(c10::SymInt(*storage_offset)) : c10::nullopt, out);
+    }
+    
+    // aten::as_strided_scatter.out(Tensor self, Tensor src, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & as_strided_scatter_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & src, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional<c10::SymInt> storage_offset=c10::nullopt) {
+        return at::_ops::as_strided_scatter_out::redispatch(dispatchKeySet, self, src, size, stride, storage_offset, out);
+    }
+    
+    // aten::as_strided_scatter.out(Tensor self, Tensor src, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & as_strided_scatter_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & src, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional<c10::SymInt> storage_offset, at::Tensor & out) {
+        return at::_ops::as_strided_scatter_out::redispatch(dispatchKeySet, self, src, size, stride, storage_offset, out);
+    }
+    
+    // aten::unsafe_split.Tensor_out(Tensor self, SymInt split_size, int dim=0, *, Tensor(a!)[] out) -> ()
+    inline void unsafe_split_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, const at::Tensor & self, int64_t split_size, int64_t dim=0) {
+        return at::_ops::unsafe_split_Tensor_out::redispatch(dispatchKeySet, self, split_size, dim, out);
+    }
+    
+    // aten::unsafe_split.Tensor_out(Tensor self, SymInt split_size, int dim=0, *, Tensor(a!)[] out) -> ()
+    inline void unsafe_split_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t split_size, int64_t dim, at::TensorList out) {
+        return at::_ops::unsafe_split_Tensor_out::redispatch(dispatchKeySet, self, split_size, dim, out);
+    }
+    
+    // aten::unsafe_split.Tensor_out(Tensor self, SymInt split_size, int dim=0, *, Tensor(a!)[] out) -> ()
+    inline void unsafe_split_symint_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, const at::Tensor & self, c10::SymInt split_size, int64_t dim=0) {
+        return at::_ops::unsafe_split_Tensor_out::redispatch(dispatchKeySet, self, split_size, dim, out);
+    }
+    
+    // aten::unsafe_split.Tensor_out(Tensor self, SymInt split_size, int dim=0, *, Tensor(a!)[] out) -> ()
+    inline void unsafe_split_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt split_size, int64_t dim, at::TensorList out) {
+        return at::_ops::unsafe_split_Tensor_out::redispatch(dispatchKeySet, self, split_size, dim, out);
+    }
+    
+    // aten::unsafe_split_with_sizes.out(Tensor self, SymInt[] split_sizes, int dim=0, *, Tensor(a!)[] out) -> ()
+    inline void unsafe_split_with_sizes_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, const at::Tensor & self, at::IntArrayRef split_sizes, int64_t dim=0) {
+        return at::_ops::unsafe_split_with_sizes_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(split_sizes), dim, out);
+    }
+    
+    // aten::unsafe_split_with_sizes.out(Tensor self, SymInt[] split_sizes, int dim=0, *, Tensor(a!)[] out) -> ()
+    inline void unsafe_split_with_sizes_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef split_sizes, int64_t dim, at::TensorList out) {
+        return at::_ops::unsafe_split_with_sizes_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(split_sizes), dim, out);
+    }
+    
+    // aten::unsafe_split_with_sizes.out(Tensor self, SymInt[] split_sizes, int dim=0, *, Tensor(a!)[] out) -> ()
+    inline void unsafe_split_with_sizes_symint_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, const at::Tensor & self, c10::SymIntArrayRef split_sizes, int64_t dim=0) {
+        return at::_ops::unsafe_split_with_sizes_out::redispatch(dispatchKeySet, self, split_sizes, dim, out);
+    }
+    
+    // aten::unsafe_split_with_sizes.out(Tensor self, SymInt[] split_sizes, int dim=0, *, Tensor(a!)[] out) -> ()
+    inline void unsafe_split_with_sizes_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef split_sizes, int64_t dim, at::TensorList out) {
+        return at::_ops::unsafe_split_with_sizes_out::redispatch(dispatchKeySet, self, split_sizes, dim, out);
+    }
+    
+    // aten::sum.out(Tensor self, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & sum_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::sum_out::redispatch(dispatchKeySet, self, dtype, out);
+    }
+    
+    // aten::sum.out(Tensor self, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & sum_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::ScalarType> dtype, at::Tensor & out) {
+        return at::_ops::sum_out::redispatch(dispatchKeySet, self, dtype, out);
+    }
+    
+    // aten::std_mean.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> std_mean_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, const at::Tensor & self, at::OptionalIntArrayRef dim=c10::nullopt, const c10::optional<at::Scalar> & correction=c10::nullopt, bool keepdim=false) {
+        return at::_ops::std_mean_correction_out::redispatch(dispatchKeySet, self, dim, correction, keepdim, out0, out1);
+    }
+    
+    // aten::std_mean.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> std_mean_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim, const c10::optional<at::Scalar> & correction, bool keepdim, at::Tensor & out0, at::Tensor & out1) {
+        return at::_ops::std_mean_correction_out::redispatch(dispatchKeySet, self, dim, correction, keepdim, out0, out1);
+    }
+    
+    // aten::prod.out(Tensor self, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & prod_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::prod_out::redispatch(dispatchKeySet, self, dtype, out);
+    }
+    
+    // aten::prod.out(Tensor self, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & prod_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::ScalarType> dtype, at::Tensor & out) {
+        return at::_ops::prod_out::redispatch(dispatchKeySet, self, dtype, out);
+    }
+    
+    // aten::_mkldnn_transpose.out(Tensor self, int dim0, int dim1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _mkldnn_transpose_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim0, int64_t dim1) {
+        return at::_ops::_mkldnn_transpose_out::redispatch(dispatchKeySet, self, dim0, dim1, out);
+    }
+    
+    // aten::_mkldnn_transpose.out(Tensor self, int dim0, int dim1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _mkldnn_transpose_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim0, int64_t dim1, at::Tensor & out) {
+        return at::_ops::_mkldnn_transpose_out::redispatch(dispatchKeySet, self, dim0, dim1, out);
+    }
+    
+    // aten::flip.out(Tensor self, int[] dims, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & flip_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef dims) {
+        return at::_ops::flip_out::redispatch(dispatchKeySet, self, dims, out);
+    }
+    
+    // aten::flip.out(Tensor self, int[] dims, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & flip_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dims, at::Tensor & out) {
+        return at::_ops::flip_out::redispatch(dispatchKeySet, self, dims, out);
+    }
+    
+    // aten::roll.out(Tensor self, SymInt[1] shifts, int[1] dims=[], *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & roll_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef shifts, at::IntArrayRef dims={}) {
+        return at::_ops::roll_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(shifts), dims, out);
+    }
+    
+    // aten::roll.out(Tensor self, SymInt[1] shifts, int[1] dims=[], *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & roll_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef shifts, at::IntArrayRef dims, at::Tensor & out) {
+        return at::_ops::roll_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(shifts), dims, out);
+    }
+    
+    // aten::roll.out(Tensor self, SymInt[1] shifts, int[1] dims=[], *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & roll_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef shifts, at::IntArrayRef dims={}) {
+        return at::_ops::roll_out::redispatch(dispatchKeySet, self, shifts, dims, out);
+    }
+    
+    // aten::roll.out(Tensor self, SymInt[1] shifts, int[1] dims=[], *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & roll_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef shifts, at::IntArrayRef dims, at::Tensor & out) {
+        return at::_ops::roll_out::redispatch(dispatchKeySet, self, shifts, dims, out);
+    }
+    
+    // aten::rot90.out(Tensor self, int k=1, int[] dims=[0,1], *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & rot90_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t k=1, at::IntArrayRef dims={0,1}) {
+        return at::_ops::rot90_out::redispatch(dispatchKeySet, self, k, dims, out);
+    }
+    
+    // aten::rot90.out(Tensor self, int k=1, int[] dims=[0,1], *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & rot90_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t k, at::IntArrayRef dims, at::Tensor & out) {
+        return at::_ops::rot90_out::redispatch(dispatchKeySet, self, k, dims, out);
+    }
+    
+    // aten::_transform_bias_rescale_qkv.out(Tensor qkv, Tensor qkv_bias, int num_heads, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _transform_bias_rescale_qkv_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & qkv, const at::Tensor & qkv_bias, int64_t num_heads) {
+        return at::_ops::_transform_bias_rescale_qkv_out::redispatch(dispatchKeySet, qkv, qkv_bias, num_heads, out0, out1, out2);
+    }
+    
+    // aten::_transform_bias_rescale_qkv.out(Tensor qkv, Tensor qkv_bias, int num_heads, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _transform_bias_rescale_qkv_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & qkv, const at::Tensor & qkv_bias, int64_t num_heads, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+        return at::_ops::_transform_bias_rescale_qkv_out::redispatch(dispatchKeySet, qkv, qkv_bias, num_heads, out0, out1, out2);
+    }
+    
+    // aten::_nested_tensor_from_mask.out(Tensor t, Tensor mask, bool mask_check=True, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _nested_tensor_from_mask_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & t, const at::Tensor & mask, bool mask_check=true) {
+        return at::_ops::_nested_tensor_from_mask_out::redispatch(dispatchKeySet, t, mask, mask_check, out);
+    }
+    
+    // aten::_nested_tensor_from_mask.out(Tensor t, Tensor mask, bool mask_check=True, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _nested_tensor_from_mask_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & t, const at::Tensor & mask, bool mask_check, at::Tensor & out) {
+        return at::_ops::_nested_tensor_from_mask_out::redispatch(dispatchKeySet, t, mask, mask_check, out);
+    }
+    
+    // aten::_nested_from_padded.out(Tensor padded, Tensor cpu_nested_shape_example, bool fuse_transform_0213=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _nested_from_padded_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & padded, const at::Tensor & cpu_nested_shape_example, bool fuse_transform_0213=false) {
+        return at::_ops::_nested_from_padded_out::redispatch(dispatchKeySet, padded, cpu_nested_shape_example, fuse_transform_0213, out);
+    }
+    
+    // aten::_nested_from_padded.out(Tensor padded, Tensor cpu_nested_shape_example, bool fuse_transform_0213=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _nested_from_padded_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & padded, const at::Tensor & cpu_nested_shape_example, bool fuse_transform_0213, at::Tensor & out) {
+        return at::_ops::_nested_from_padded_out::redispatch(dispatchKeySet, padded, cpu_nested_shape_example, fuse_transform_0213, out);
+    }
+    
+    // aten::_nested_tensor_size.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _nested_tensor_size_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::_nested_tensor_size_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_nested_tensor_size.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _nested_tensor_size_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::_nested_tensor_size_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_nested_tensor_strides.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _nested_tensor_strides_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::_nested_tensor_strides_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_nested_tensor_strides.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _nested_tensor_strides_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::_nested_tensor_strides_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_nested_tensor_storage_offsets.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _nested_tensor_storage_offsets_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::_nested_tensor_storage_offsets_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_nested_tensor_storage_offsets.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _nested_tensor_storage_offsets_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::_nested_tensor_storage_offsets_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_nested_from_padded_and_nested_example.out(Tensor padded, Tensor nt_example, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _nested_from_padded_and_nested_example_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & padded, const at::Tensor & nt_example) {
+        return at::_ops::_nested_from_padded_and_nested_example_out::redispatch(dispatchKeySet, padded, nt_example, out);
+    }
+    
+    // aten::_nested_from_padded_and_nested_example.out(Tensor padded, Tensor nt_example, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _nested_from_padded_and_nested_example_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & padded, const at::Tensor & nt_example, at::Tensor & out) {
+        return at::_ops::_nested_from_padded_and_nested_example_out::redispatch(dispatchKeySet, padded, nt_example, out);
+    }
+    
+    // aten::_nested_view_from_buffer_copy.out(Tensor self, Tensor nested_size, Tensor nested_strides, Tensor offsets, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _nested_view_from_buffer_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & nested_size, const at::Tensor & nested_strides, const at::Tensor & offsets) {
+        return at::_ops::_nested_view_from_buffer_copy_out::redispatch(dispatchKeySet, self, nested_size, nested_strides, offsets, out);
+    }
+    
+    // aten::_nested_view_from_buffer_copy.out(Tensor self, Tensor nested_size, Tensor nested_strides, Tensor offsets, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _nested_view_from_buffer_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & nested_size, const at::Tensor & nested_strides, const at::Tensor & offsets, at::Tensor & out) {
+        return at::_ops::_nested_view_from_buffer_copy_out::redispatch(dispatchKeySet, self, nested_size, nested_strides, offsets, out);
+    }
+    
+    // aten::_nested_view_from_jagged_copy.out(Tensor self, Tensor offsets, Tensor dummy, Tensor? lengths=None, int ragged_idx=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _nested_view_from_jagged_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & offsets, const at::Tensor & dummy, const c10::optional<at::Tensor> & lengths={}, int64_t ragged_idx=1) {
+        return at::_ops::_nested_view_from_jagged_copy_out::redispatch(dispatchKeySet, self, offsets, dummy, lengths, ragged_idx, out);
+    }
+    
+    // aten::_nested_view_from_jagged_copy.out(Tensor self, Tensor offsets, Tensor dummy, Tensor? lengths=None, int ragged_idx=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _nested_view_from_jagged_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & offsets, const at::Tensor & dummy, const c10::optional<at::Tensor> & lengths, int64_t ragged_idx, at::Tensor & out) {
+        return at::_ops::_nested_view_from_jagged_copy_out::redispatch(dispatchKeySet, self, offsets, dummy, lengths, ragged_idx, out);
+    }
+    
+    // aten::_nested_get_values_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _nested_get_values_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::_nested_get_values_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_nested_get_values_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _nested_get_values_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::_nested_get_values_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_trilinear.out(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _trilinear_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & i1, const at::Tensor & i2, const at::Tensor & i3, at::IntArrayRef expand1, at::IntArrayRef expand2, at::IntArrayRef expand3, at::IntArrayRef sumdim, int64_t unroll_dim=1) {
+        return at::_ops::_trilinear_out::redispatch(dispatchKeySet, i1, i2, i3, expand1, expand2, expand3, sumdim, unroll_dim, out);
+    }
+    
+    // aten::_trilinear.out(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _trilinear_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & i1, const at::Tensor & i2, const at::Tensor & i3, at::IntArrayRef expand1, at::IntArrayRef expand2, at::IntArrayRef expand3, at::IntArrayRef sumdim, int64_t unroll_dim, at::Tensor & out) {
+        return at::_ops::_trilinear_out::redispatch(dispatchKeySet, i1, i2, i3, expand1, expand2, expand3, sumdim, unroll_dim, out);
+    }
+    
+    // aten::_unique.out(Tensor self, bool sorted=True, bool return_inverse=False, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> _unique_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, const at::Tensor & self, bool sorted=true, bool return_inverse=false) {
+        return at::_ops::_unique_out::redispatch(dispatchKeySet, self, sorted, return_inverse, out0, out1);
+    }
+    
+    // aten::_unique.out(Tensor self, bool sorted=True, bool return_inverse=False, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> _unique_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool sorted, bool return_inverse, at::Tensor & out0, at::Tensor & out1) {
+        return at::_ops::_unique_out::redispatch(dispatchKeySet, self, sorted, return_inverse, out0, out1);
+    }
+    
+    // aten::unique_dim.out(Tensor self, int dim, bool sorted=True, bool return_inverse=False, bool return_counts=False, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> unique_dim_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & self, int64_t dim, bool sorted=true, bool return_inverse=false, bool return_counts=false) {
+        return at::_ops::unique_dim_out::redispatch(dispatchKeySet, self, dim, sorted, return_inverse, return_counts, out0, out1, out2);
+    }
+    
+    // aten::unique_dim.out(Tensor self, int dim, bool sorted=True, bool return_inverse=False, bool return_counts=False, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> unique_dim_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool sorted, bool return_inverse, bool return_counts, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+        return at::_ops::unique_dim_out::redispatch(dispatchKeySet, self, dim, sorted, return_inverse, return_counts, out0, out1, out2);
+    }
+    
+    // aten::unique_consecutive.out(Tensor self, bool return_inverse=False, bool return_counts=False, int? dim=None, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> unique_consecutive_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & self, bool return_inverse=false, bool return_counts=false, c10::optional<int64_t> dim=c10::nullopt) {
+        return at::_ops::unique_consecutive_out::redispatch(dispatchKeySet, self, return_inverse, return_counts, dim, out0, out1, out2);
+    }
+    
+    // aten::unique_consecutive.out(Tensor self, bool return_inverse=False, bool return_counts=False, int? dim=None, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> unique_consecutive_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool return_inverse, bool return_counts, c10::optional<int64_t> dim, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+        return at::_ops::unique_consecutive_out::redispatch(dispatchKeySet, self, return_inverse, return_counts, dim, out0, out1, out2);
+    }
+    
+    // aten::unique_dim_consecutive.out(Tensor self, int dim, bool return_inverse=False, bool return_counts=False, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> unique_dim_consecutive_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & self, int64_t dim, bool return_inverse=false, bool return_counts=false) {
+        return at::_ops::unique_dim_consecutive_out::redispatch(dispatchKeySet, self, dim, return_inverse, return_counts, out0, out1, out2);
+    }
+    
+    // aten::unique_dim_consecutive.out(Tensor self, int dim, bool return_inverse=False, bool return_counts=False, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> unique_dim_consecutive_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool return_inverse, bool return_counts, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+        return at::_ops::unique_dim_consecutive_out::redispatch(dispatchKeySet, self, dim, return_inverse, return_counts, out0, out1, out2);
+    }
+    
+    // aten::_unique2.out(Tensor self, bool sorted=True, bool return_inverse=False, bool return_counts=False, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _unique2_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & self, bool sorted=true, bool return_inverse=false, bool return_counts=false) {
+        return at::_ops::_unique2_out::redispatch(dispatchKeySet, self, sorted, return_inverse, return_counts, out0, out1, out2);
+    }
+    
+    // aten::_unique2.out(Tensor self, bool sorted=True, bool return_inverse=False, bool return_counts=False, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _unique2_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool sorted, bool return_inverse, bool return_counts, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+        return at::_ops::_unique2_out::redispatch(dispatchKeySet, self, sorted, return_inverse, return_counts, out0, out1, out2);
+    }
+    
+    // aten::_unsafe_view.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _unsafe_view_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef size) {
+        return at::_ops::_unsafe_view_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), out);
+    }
+    
+    // aten::_unsafe_view.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _unsafe_view_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, at::Tensor & out) {
+        return at::_ops::_unsafe_view_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), out);
+    }
+    
+    // aten::_unsafe_view.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _unsafe_view_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef size) {
+        return at::_ops::_unsafe_view_out::redispatch(dispatchKeySet, self, size, out);
+    }
+    
+    // aten::_unsafe_view.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _unsafe_view_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, at::Tensor & out) {
+        return at::_ops::_unsafe_view_out::redispatch(dispatchKeySet, self, size, out);
+    }
+    
+    // aten::var_mean.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> var_mean_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, const at::Tensor & self, at::OptionalIntArrayRef dim=c10::nullopt, const c10::optional<at::Scalar> & correction=c10::nullopt, bool keepdim=false) {
+        return at::_ops::var_mean_correction_out::redispatch(dispatchKeySet, self, dim, correction, keepdim, out0, out1);
+    }
+    
+    // aten::var_mean.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> var_mean_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim, const c10::optional<at::Scalar> & correction, bool keepdim, at::Tensor & out0, at::Tensor & out1) {
+        return at::_ops::var_mean_correction_out::redispatch(dispatchKeySet, self, dim, correction, keepdim, out0, out1);
+    }
+    
+    // aten::_weight_norm_interface.out(Tensor v, Tensor g, int dim=0, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> _weight_norm_interface_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, const at::Tensor & v, const at::Tensor & g, int64_t dim=0) {
+        return at::_ops::_weight_norm_interface_out::redispatch(dispatchKeySet, v, g, dim, out0, out1);
+    }
+    
+    // aten::_weight_norm_interface.out(Tensor v, Tensor g, int dim=0, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> _weight_norm_interface_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & v, const at::Tensor & g, int64_t dim, at::Tensor & out0, at::Tensor & out1) {
+        return at::_ops::_weight_norm_interface_out::redispatch(dispatchKeySet, v, g, dim, out0, out1);
+    }
+    
+    // aten::_weight_norm_interface_backward.out(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> _weight_norm_interface_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, const at::Tensor & grad_w, const at::Tensor & saved_v, const at::Tensor & saved_g, const at::Tensor & saved_norms, int64_t dim) {
+        return at::_ops::_weight_norm_interface_backward_out::redispatch(dispatchKeySet, grad_w, saved_v, saved_g, saved_norms, dim, out0, out1);
+    }
+    
+    // aten::_weight_norm_interface_backward.out(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> _weight_norm_interface_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_w, const at::Tensor & saved_v, const at::Tensor & saved_g, const at::Tensor & saved_norms, int64_t dim, at::Tensor & out0, at::Tensor & out1) {
+        return at::_ops::_weight_norm_interface_backward_out::redispatch(dispatchKeySet, grad_w, saved_v, saved_g, saved_norms, dim, out0, out1);
+    }
+    
+    // aten::zeros.names_out(int[] size, *, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & zeros_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::IntArrayRef size, c10::optional<at::DimnameList> names) {
+        return at::_ops::zeros_names_out::redispatch(dispatchKeySet, size, names, out);
+    }
+    
+    // aten::zeros.names_out(int[] size, *, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & zeros_outf(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, c10::optional<at::DimnameList> names, at::Tensor & out) {
+        return at::_ops::zeros_names_out::redispatch(dispatchKeySet, size, names, out);
+    }
+    
+    // aten::_efficientzerotensor.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _efficientzerotensor_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::IntArrayRef size) {
+        return at::_ops::_efficientzerotensor_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), out);
+    }
+    
+    // aten::_efficientzerotensor.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _efficientzerotensor_outf(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, at::Tensor & out) {
+        return at::_ops::_efficientzerotensor_out::redispatch(dispatchKeySet, c10::fromIntArrayRefSlow(size), out);
+    }
+    
+    // aten::_efficientzerotensor.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _efficientzerotensor_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, c10::SymIntArrayRef size) {
+        return at::_ops::_efficientzerotensor_out::redispatch(dispatchKeySet, size, out);
+    }
+    
+    // aten::_efficientzerotensor.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _efficientzerotensor_symint_outf(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, at::Tensor & out) {
+        return at::_ops::_efficientzerotensor_out::redispatch(dispatchKeySet, size, out);
+    }
+    
+    // aten::zeros_like.out(Tensor self, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & zeros_like_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::zeros_like_out::redispatch(dispatchKeySet, self, memory_format, out);
+    }
+    
+    // aten::zeros_like.out(Tensor self, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & zeros_like_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::MemoryFormat> memory_format, at::Tensor & out) {
+        return at::_ops::zeros_like_out::redispatch(dispatchKeySet, self, memory_format, out);
+    }
+    
+    // aten::_standard_gamma_grad.out(Tensor self, Tensor output, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _standard_gamma_grad_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & output) {
+        return at::_ops::_standard_gamma_grad_out::redispatch(dispatchKeySet, self, output, out);
+    }
+    
+    // aten::_standard_gamma_grad.out(Tensor self, Tensor output, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _standard_gamma_grad_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & output, at::Tensor & out) {
+        return at::_ops::_standard_gamma_grad_out::redispatch(dispatchKeySet, self, output, out);
+    }
+    
+    // aten::_standard_gamma.out(Tensor self, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _standard_gamma_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::_standard_gamma_out::redispatch(dispatchKeySet, self, generator, out);
+    }
+    
+    // aten::_standard_gamma.out(Tensor self, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _standard_gamma_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::_standard_gamma_out::redispatch(dispatchKeySet, self, generator, out);
+    }
+    
+    // aten::_dirichlet_grad.out(Tensor x, Tensor alpha, Tensor total, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _dirichlet_grad_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & x, const at::Tensor & alpha, const at::Tensor & total) {
+        return at::_ops::_dirichlet_grad_out::redispatch(dispatchKeySet, x, alpha, total, out);
+    }
+    
+    // aten::_dirichlet_grad.out(Tensor x, Tensor alpha, Tensor total, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _dirichlet_grad_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & alpha, const at::Tensor & total, at::Tensor & out) {
+        return at::_ops::_dirichlet_grad_out::redispatch(dispatchKeySet, x, alpha, total, out);
+    }
+    
+    // aten::_sample_dirichlet.out(Tensor self, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _sample_dirichlet_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::_sample_dirichlet_out::redispatch(dispatchKeySet, self, generator, out);
+    }
+    
+    // aten::_sample_dirichlet.out(Tensor self, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _sample_dirichlet_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::_sample_dirichlet_out::redispatch(dispatchKeySet, self, generator, out);
+    }
+    
+    // aten::poisson.out(Tensor self, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & poisson_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::poisson_out::redispatch(dispatchKeySet, self, generator, out);
+    }
+    
+    // aten::poisson.out(Tensor self, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & poisson_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::poisson_out::redispatch(dispatchKeySet, self, generator, out);
+    }
+    
+    // aten::binomial.out(Tensor count, Tensor prob, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & binomial_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & count, const at::Tensor & prob, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::binomial_out::redispatch(dispatchKeySet, count, prob, generator, out);
+    }
+    
+    // aten::binomial.out(Tensor count, Tensor prob, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & binomial_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & count, const at::Tensor & prob, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::binomial_out::redispatch(dispatchKeySet, count, prob, generator, out);
+    }
+    
+    // aten::native_norm.out(Tensor self, Scalar p=2, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & native_norm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & p=2) {
+        return at::_ops::native_norm_out::redispatch(dispatchKeySet, self, p, out);
+    }
+    
+    // aten::native_norm.out(Tensor self, Scalar p=2, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & native_norm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & p, at::Tensor & out) {
+        return at::_ops::native_norm_out::redispatch(dispatchKeySet, self, p, out);
+    }
+    
+    // aten::native_norm.ScalarOpt_dim_dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, ScalarType? dtype, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & native_norm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const c10::optional<at::Scalar> & p, at::IntArrayRef dim, bool keepdim, c10::optional<at::ScalarType> dtype) {
+        return at::_ops::native_norm_ScalarOpt_dim_dtype_out::redispatch(dispatchKeySet, self, p, dim, keepdim, dtype, out);
+    }
+    
+    // aten::native_norm.ScalarOpt_dim_dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, ScalarType? dtype, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & native_norm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Scalar> & p, at::IntArrayRef dim, bool keepdim, c10::optional<at::ScalarType> dtype, at::Tensor & out) {
+        return at::_ops::native_norm_ScalarOpt_dim_dtype_out::redispatch(dispatchKeySet, self, p, dim, keepdim, dtype, out);
+    }
+    
+    // aten::_sparse_sum.dim_out(Tensor self, int[1] dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _sparse_sum_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef dim) {
+        return at::_ops::_sparse_sum_dim_out::redispatch(dispatchKeySet, self, dim, out);
+    }
+    
+    // aten::_sparse_sum.dim_out(Tensor self, int[1] dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _sparse_sum_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, at::Tensor & out) {
+        return at::_ops::_sparse_sum_dim_out::redispatch(dispatchKeySet, self, dim, out);
+    }
+    
+    // aten::_sparse_sum_backward.out(Tensor grad, Tensor self, int[] dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _sparse_sum_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad, const at::Tensor & self, at::IntArrayRef dim) {
+        return at::_ops::_sparse_sum_backward_out::redispatch(dispatchKeySet, grad, self, dim, out);
+    }
+    
+    // aten::_sparse_sum_backward.out(Tensor grad, Tensor self, int[] dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _sparse_sum_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & self, at::IntArrayRef dim, at::Tensor & out) {
+        return at::_ops::_sparse_sum_backward_out::redispatch(dispatchKeySet, grad, self, dim, out);
+    }
+    
+    // aten::_sparse_csr_sum.dim_dtype_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _sparse_csr_sum_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef dim, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::_sparse_csr_sum_dim_dtype_out::redispatch(dispatchKeySet, self, dim, keepdim, dtype, out);
+    }
+    
+    // aten::_sparse_csr_sum.dim_dtype_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _sparse_csr_sum_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, bool keepdim, c10::optional<at::ScalarType> dtype, at::Tensor & out) {
+        return at::_ops::_sparse_csr_sum_dim_dtype_out::redispatch(dispatchKeySet, self, dim, keepdim, dtype, out);
+    }
+    
+    // aten::_sparse_csr_prod.dim_dtype_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _sparse_csr_prod_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef dim, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::_sparse_csr_prod_dim_dtype_out::redispatch(dispatchKeySet, self, dim, keepdim, dtype, out);
+    }
+    
+    // aten::_sparse_csr_prod.dim_dtype_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _sparse_csr_prod_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, bool keepdim, c10::optional<at::ScalarType> dtype, at::Tensor & out) {
+        return at::_ops::_sparse_csr_prod_dim_dtype_out::redispatch(dispatchKeySet, self, dim, keepdim, dtype, out);
+    }
+    
+    // aten::_sparse_softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _sparse_softmax_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim, bool half_to_float) {
+        return at::_ops::_sparse_softmax_out::redispatch(dispatchKeySet, self, dim, half_to_float, out);
+    }
+    
+    // aten::_sparse_softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _sparse_softmax_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool half_to_float, at::Tensor & out) {
+        return at::_ops::_sparse_softmax_out::redispatch(dispatchKeySet, self, dim, half_to_float, out);
+    }
+    
+    // aten::_sparse_softmax_backward_data.out(Tensor grad_output, Tensor output, int dim, Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _sparse_softmax_backward_data_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, const at::Tensor & self) {
+        return at::_ops::_sparse_softmax_backward_data_out::redispatch(dispatchKeySet, grad_output, output, dim, self, out);
+    }
+    
+    // aten::_sparse_softmax_backward_data.out(Tensor grad_output, Tensor output, int dim, Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _sparse_softmax_backward_data_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::_sparse_softmax_backward_data_out::redispatch(dispatchKeySet, grad_output, output, dim, self, out);
+    }
+    
+    // aten::_sparse_log_softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _sparse_log_softmax_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim, bool half_to_float) {
+        return at::_ops::_sparse_log_softmax_out::redispatch(dispatchKeySet, self, dim, half_to_float, out);
+    }
+    
+    // aten::_sparse_log_softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _sparse_log_softmax_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool half_to_float, at::Tensor & out) {
+        return at::_ops::_sparse_log_softmax_out::redispatch(dispatchKeySet, self, dim, half_to_float, out);
+    }
+    
+    // aten::_sparse_log_softmax_backward_data.out(Tensor grad_output, Tensor output, int dim, Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _sparse_log_softmax_backward_data_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, const at::Tensor & self) {
+        return at::_ops::_sparse_log_softmax_backward_data_out::redispatch(dispatchKeySet, grad_output, output, dim, self, out);
+    }
+    
+    // aten::_sparse_log_softmax_backward_data.out(Tensor grad_output, Tensor output, int dim, Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _sparse_log_softmax_backward_data_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::_sparse_log_softmax_backward_data_out::redispatch(dispatchKeySet, grad_output, output, dim, self, out);
+    }
+    
+    // aten::_spdiags.out(Tensor diagonals, Tensor offsets, int[] shape, Layout? layout=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _spdiags_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & diagonals, const at::Tensor & offsets, at::IntArrayRef shape, c10::optional<at::Layout> layout=c10::nullopt) {
+        return at::_ops::_spdiags_out::redispatch(dispatchKeySet, diagonals, offsets, shape, layout, out);
+    }
+    
+    // aten::_spdiags.out(Tensor diagonals, Tensor offsets, int[] shape, Layout? layout=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _spdiags_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & diagonals, const at::Tensor & offsets, at::IntArrayRef shape, c10::optional<at::Layout> layout, at::Tensor & out) {
+        return at::_ops::_spdiags_out::redispatch(dispatchKeySet, diagonals, offsets, shape, layout, out);
+    }
+    
+    // aten::norm.ScalarOpt_dtype_out(Tensor self, Scalar? p, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & norm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const c10::optional<at::Scalar> & p, at::ScalarType dtype) {
+        return at::_ops::norm_ScalarOpt_dtype_out::redispatch(dispatchKeySet, self, p, dtype, out);
+    }
+    
+    // aten::norm.ScalarOpt_dtype_out(Tensor self, Scalar? p, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & norm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Scalar> & p, at::ScalarType dtype, at::Tensor & out) {
+        return at::_ops::norm_ScalarOpt_dtype_out::redispatch(dispatchKeySet, self, p, dtype, out);
+    }
+    
+    // aten::norm.Scalar_out(Tensor self, Scalar p=2, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & norm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & p=2) {
+        return at::_ops::norm_Scalar_out::redispatch(dispatchKeySet, self, p, out);
+    }
+    
+    // aten::norm.Scalar_out(Tensor self, Scalar p=2, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & norm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & p, at::Tensor & out) {
+        return at::_ops::norm_Scalar_out::redispatch(dispatchKeySet, self, p, out);
+    }
+    
+    // aten::clone.out(Tensor self, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & clone_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::clone_out::redispatch(dispatchKeySet, self, memory_format, out);
+    }
+    
+    // aten::clone.out(Tensor self, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & clone_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::MemoryFormat> memory_format, at::Tensor & out) {
+        return at::_ops::clone_out::redispatch(dispatchKeySet, self, memory_format, out);
+    }
+    
+    // aten::resize_as.out(Tensor self, Tensor the_template, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & resize_as_out(c10::DispatchKeySet dispatchKeySet, const at::Tensor & out, const at::Tensor & self, const at::Tensor & the_template, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::resize_as_out::redispatch(dispatchKeySet, self, the_template, memory_format, out);
+    }
+    
+    // aten::resize_as.out(Tensor self, Tensor the_template, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & resize_as_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & the_template, c10::optional<at::MemoryFormat> memory_format, const at::Tensor & out) {
+        return at::_ops::resize_as_out::redispatch(dispatchKeySet, self, the_template, memory_format, out);
+    }
+    
+    // aten::resize_as(Tensor self, Tensor the_template, *, MemoryFormat? memory_format=None) -> Tensor
+    inline at::Tensor resize_as(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & the_template, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::resize_as::redispatch(dispatchKeySet, self, the_template, memory_format);
+    }
+    
+    // aten::resize_as_sparse.out(Tensor self, Tensor the_template, *, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & resize_as_sparse_out(c10::DispatchKeySet dispatchKeySet, const at::Tensor & out, const at::Tensor & self, const at::Tensor & the_template) {
+        return at::_ops::resize_as_sparse_out::redispatch(dispatchKeySet, self, the_template, out);
+    }
+    
+    // aten::resize_as_sparse.out(Tensor self, Tensor the_template, *, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & resize_as_sparse_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & the_template, const at::Tensor & out) {
+        return at::_ops::resize_as_sparse_out::redispatch(dispatchKeySet, self, the_template, out);
+    }
+    
+    // aten::resize_as_sparse(Tensor self, Tensor the_template) -> Tensor
+    inline at::Tensor resize_as_sparse(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & the_template) {
+        return at::_ops::resize_as_sparse::redispatch(dispatchKeySet, self, the_template);
+    }
+    
+    // aten::zero.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & zero_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::zero_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::zero.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & zero_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::zero_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::zero(Tensor self) -> Tensor
+    inline at::Tensor zero(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::zero::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::sub.Scalar_out(Tensor self, Scalar other, Scalar alpha=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & sub_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha=1) {
+        return at::_ops::sub_Scalar_out::redispatch(dispatchKeySet, self, other, alpha, out);
+    }
+    
+    // aten::sub.Scalar_out(Tensor self, Scalar other, Scalar alpha=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & sub_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha, at::Tensor & out) {
+        return at::_ops::sub_Scalar_out::redispatch(dispatchKeySet, self, other, alpha, out);
+    }
+    
+    // aten::rsub.Tensor_out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & rsub_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1) {
+        return at::_ops::rsub_Tensor_out::redispatch(dispatchKeySet, self, other, alpha, out);
+    }
+    
+    // aten::rsub.Tensor_out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & rsub_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha, at::Tensor & out) {
+        return at::_ops::rsub_Tensor_out::redispatch(dispatchKeySet, self, other, alpha, out);
+    }
+    
+    // aten::rsub.Scalar_out(Tensor self, Scalar other, Scalar alpha=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & rsub_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha=1) {
+        return at::_ops::rsub_Scalar_out::redispatch(dispatchKeySet, self, other, alpha, out);
+    }
+    
+    // aten::rsub.Scalar_out(Tensor self, Scalar other, Scalar alpha=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & rsub_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha, at::Tensor & out) {
+        return at::_ops::rsub_Scalar_out::redispatch(dispatchKeySet, self, other, alpha, out);
+    }
+    
+    // aten::_sparse_addmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _sparse_addmm_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta=1, const at::Scalar & alpha=1) {
+        return at::_ops::_sparse_addmm_out::redispatch(dispatchKeySet, self, mat1, mat2, beta, alpha, out);
+    }
+    
+    // aten::_sparse_addmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _sparse_addmm_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out) {
+        return at::_ops::_sparse_addmm_out::redispatch(dispatchKeySet, self, mat1, mat2, beta, alpha, out);
+    }
+    
+    // aten::sparse_coo_tensor.size_out(int[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & sparse_coo_tensor_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::IntArrayRef size) {
+        return at::_ops::sparse_coo_tensor_size_out::redispatch(dispatchKeySet, size, out);
+    }
+    
+    // aten::sparse_coo_tensor.size_out(int[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & sparse_coo_tensor_outf(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, at::Tensor & out) {
+        return at::_ops::sparse_coo_tensor_size_out::redispatch(dispatchKeySet, size, out);
+    }
+    
+    // aten::_sparse_coo_tensor_with_dims.out(int sparse_dim, int dense_dim, int[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _sparse_coo_tensor_with_dims_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, int64_t sparse_dim, int64_t dense_dim, at::IntArrayRef size) {
+        return at::_ops::_sparse_coo_tensor_with_dims_out::redispatch(dispatchKeySet, sparse_dim, dense_dim, size, out);
+    }
+    
+    // aten::_sparse_coo_tensor_with_dims.out(int sparse_dim, int dense_dim, int[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _sparse_coo_tensor_with_dims_outf(c10::DispatchKeySet dispatchKeySet, int64_t sparse_dim, int64_t dense_dim, at::IntArrayRef size, at::Tensor & out) {
+        return at::_ops::_sparse_coo_tensor_with_dims_out::redispatch(dispatchKeySet, sparse_dim, dense_dim, size, out);
+    }
+    
+    // aten::_sparse_coo_tensor_with_dims_and_tensors.out(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, bool? is_coalesced=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _sparse_coo_tensor_with_dims_and_tensors_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, int64_t sparse_dim, int64_t dense_dim, at::IntArrayRef size, const at::Tensor & indices, const at::Tensor & values, c10::optional<bool> is_coalesced=c10::nullopt) {
+        return at::_ops::_sparse_coo_tensor_with_dims_and_tensors_out::redispatch(dispatchKeySet, sparse_dim, dense_dim, c10::fromIntArrayRefSlow(size), indices, values, is_coalesced, out);
+    }
+    
+    // aten::_sparse_coo_tensor_with_dims_and_tensors.out(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, bool? is_coalesced=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _sparse_coo_tensor_with_dims_and_tensors_outf(c10::DispatchKeySet dispatchKeySet, int64_t sparse_dim, int64_t dense_dim, at::IntArrayRef size, const at::Tensor & indices, const at::Tensor & values, c10::optional<bool> is_coalesced, at::Tensor & out) {
+        return at::_ops::_sparse_coo_tensor_with_dims_and_tensors_out::redispatch(dispatchKeySet, sparse_dim, dense_dim, c10::fromIntArrayRefSlow(size), indices, values, is_coalesced, out);
+    }
+    
+    // aten::_sparse_coo_tensor_with_dims_and_tensors.out(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, bool? is_coalesced=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _sparse_coo_tensor_with_dims_and_tensors_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, int64_t sparse_dim, int64_t dense_dim, c10::SymIntArrayRef size, const at::Tensor & indices, const at::Tensor & values, c10::optional<bool> is_coalesced=c10::nullopt) {
+        return at::_ops::_sparse_coo_tensor_with_dims_and_tensors_out::redispatch(dispatchKeySet, sparse_dim, dense_dim, size, indices, values, is_coalesced, out);
+    }
+    
+    // aten::_sparse_coo_tensor_with_dims_and_tensors.out(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, bool? is_coalesced=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _sparse_coo_tensor_with_dims_and_tensors_symint_outf(c10::DispatchKeySet dispatchKeySet, int64_t sparse_dim, int64_t dense_dim, c10::SymIntArrayRef size, const at::Tensor & indices, const at::Tensor & values, c10::optional<bool> is_coalesced, at::Tensor & out) {
+        return at::_ops::_sparse_coo_tensor_with_dims_and_tensors_out::redispatch(dispatchKeySet, sparse_dim, dense_dim, size, indices, values, is_coalesced, out);
+    }
+    
+    // aten::sparse_resize.out(Tensor self, int[] size, int sparse_dim, int dense_dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & sparse_resize_out(c10::DispatchKeySet dispatchKeySet, const at::Tensor & out, const at::Tensor & self, at::IntArrayRef size, int64_t sparse_dim, int64_t dense_dim) {
+        return at::_ops::sparse_resize_out::redispatch(dispatchKeySet, self, size, sparse_dim, dense_dim, out);
+    }
+    
+    // aten::sparse_resize.out(Tensor self, int[] size, int sparse_dim, int dense_dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & sparse_resize_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, int64_t sparse_dim, int64_t dense_dim, const at::Tensor & out) {
+        return at::_ops::sparse_resize_out::redispatch(dispatchKeySet, self, size, sparse_dim, dense_dim, out);
+    }
+    
+    // aten::sparse_resize(Tensor self, int[] size, int sparse_dim, int dense_dim) -> Tensor
+    inline at::Tensor sparse_resize(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, int64_t sparse_dim, int64_t dense_dim) {
+        return at::_ops::sparse_resize::redispatch(dispatchKeySet, self, size, sparse_dim, dense_dim);
+    }
+    
+    // aten::sparse_resize_and_clear.out(Tensor self, int[] size, int sparse_dim, int dense_dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & sparse_resize_and_clear_out(c10::DispatchKeySet dispatchKeySet, const at::Tensor & out, const at::Tensor & self, at::IntArrayRef size, int64_t sparse_dim, int64_t dense_dim) {
+        return at::_ops::sparse_resize_and_clear_out::redispatch(dispatchKeySet, self, size, sparse_dim, dense_dim, out);
+    }
+    
+    // aten::sparse_resize_and_clear.out(Tensor self, int[] size, int sparse_dim, int dense_dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline const at::Tensor & sparse_resize_and_clear_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, int64_t sparse_dim, int64_t dense_dim, const at::Tensor & out) {
+        return at::_ops::sparse_resize_and_clear_out::redispatch(dispatchKeySet, self, size, sparse_dim, dense_dim, out);
+    }
+    
+    // aten::sparse_resize_and_clear(Tensor self, int[] size, int sparse_dim, int dense_dim) -> Tensor
+    inline at::Tensor sparse_resize_and_clear(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, int64_t sparse_dim, int64_t dense_dim) {
+        return at::_ops::sparse_resize_and_clear::redispatch(dispatchKeySet, self, size, sparse_dim, dense_dim);
+    }
+    
+    // aten::sparse_mask.out(Tensor self, Tensor mask, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & sparse_mask_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & mask) {
+        return at::_ops::sparse_mask_out::redispatch(dispatchKeySet, self, mask, out);
+    }
+    
+    // aten::sparse_mask.out(Tensor self, Tensor mask, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & sparse_mask_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mask, at::Tensor & out) {
+        return at::_ops::sparse_mask_out::redispatch(dispatchKeySet, self, mask, out);
+    }
+    
+    // aten::_sparse_mask_projection.out(Tensor self, Tensor mask, bool accumulate_matches=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _sparse_mask_projection_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & mask, bool accumulate_matches=false) {
+        return at::_ops::_sparse_mask_projection_out::redispatch(dispatchKeySet, self, mask, accumulate_matches, out);
+    }
+    
+    // aten::_sparse_mask_projection.out(Tensor self, Tensor mask, bool accumulate_matches=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _sparse_mask_projection_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mask, bool accumulate_matches, at::Tensor & out) {
+        return at::_ops::_sparse_mask_projection_out::redispatch(dispatchKeySet, self, mask, accumulate_matches, out);
+    }
+    
+    // aten::_to_dense.out(Tensor self, ScalarType? dtype=None, bool? masked_grad=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _to_dense_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<at::ScalarType> dtype=c10::nullopt, c10::optional<bool> masked_grad=c10::nullopt) {
+        return at::_ops::_to_dense_out::redispatch(dispatchKeySet, self, dtype, masked_grad, out);
+    }
+    
+    // aten::_to_dense.out(Tensor self, ScalarType? dtype=None, bool? masked_grad=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _to_dense_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::ScalarType> dtype, c10::optional<bool> masked_grad, at::Tensor & out) {
+        return at::_ops::_to_dense_out::redispatch(dispatchKeySet, self, dtype, masked_grad, out);
+    }
+    
+    // aten::_coalesce.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _coalesce_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::_coalesce_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_coalesce.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _coalesce_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::_coalesce_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_coalesced.out(Tensor self, bool coalesced, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _coalesced_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, bool coalesced) {
+        return at::_ops::_coalesced_out::redispatch(dispatchKeySet, self, coalesced, out);
+    }
+    
+    // aten::_coalesced.out(Tensor self, bool coalesced, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _coalesced_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool coalesced, at::Tensor & out) {
+        return at::_ops::_coalesced_out::redispatch(dispatchKeySet, self, coalesced, out);
+    }
+    
+    // aten::_coalesced(Tensor self, bool coalesced) -> Tensor
+    inline at::Tensor _coalesced(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool coalesced) {
+        return at::_ops::_coalesced::redispatch(dispatchKeySet, self, coalesced);
+    }
+    
+    // aten::copy_sparse_to_sparse.out(Tensor self, Tensor src, bool non_blocking=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & copy_sparse_to_sparse_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & src, bool non_blocking=false) {
+        return at::_ops::copy_sparse_to_sparse_out::redispatch(dispatchKeySet, self, src, non_blocking, out);
+    }
+    
+    // aten::copy_sparse_to_sparse.out(Tensor self, Tensor src, bool non_blocking=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & copy_sparse_to_sparse_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & src, bool non_blocking, at::Tensor & out) {
+        return at::_ops::copy_sparse_to_sparse_out::redispatch(dispatchKeySet, self, src, non_blocking, out);
+    }
+    
+    // aten::copy_sparse_to_sparse(Tensor self, Tensor src, bool non_blocking=False) -> Tensor
+    inline at::Tensor copy_sparse_to_sparse(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & src, bool non_blocking=false) {
+        return at::_ops::copy_sparse_to_sparse::redispatch(dispatchKeySet, self, src, non_blocking);
+    }
+    
+    // aten::_to_sparse.sparse_dim_out(Tensor self, int sparse_dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _to_sparse_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t sparse_dim) {
+        return at::_ops::_to_sparse_sparse_dim_out::redispatch(dispatchKeySet, self, sparse_dim, out);
+    }
+    
+    // aten::_to_sparse.sparse_dim_out(Tensor self, int sparse_dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _to_sparse_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t sparse_dim, at::Tensor & out) {
+        return at::_ops::_to_sparse_sparse_dim_out::redispatch(dispatchKeySet, self, sparse_dim, out);
+    }
+    
+    // aten::_to_sparse.out(Tensor self, *, Layout? layout=None, int[2]? blocksize=None, int? dense_dim=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _to_sparse_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<at::Layout> layout=c10::nullopt, at::OptionalIntArrayRef blocksize=c10::nullopt, c10::optional<int64_t> dense_dim=c10::nullopt) {
+        return at::_ops::_to_sparse_out::redispatch(dispatchKeySet, self, layout, blocksize, dense_dim, out);
+    }
+    
+    // aten::_to_sparse.out(Tensor self, *, Layout? layout=None, int[2]? blocksize=None, int? dense_dim=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _to_sparse_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::Layout> layout, at::OptionalIntArrayRef blocksize, c10::optional<int64_t> dense_dim, at::Tensor & out) {
+        return at::_ops::_to_sparse_out::redispatch(dispatchKeySet, self, layout, blocksize, dense_dim, out);
+    }
+    
+    // aten::_to_sparse_csr.out(Tensor self, int? dense_dim=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _to_sparse_csr_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<int64_t> dense_dim=c10::nullopt) {
+        return at::_ops::_to_sparse_csr_out::redispatch(dispatchKeySet, self, dense_dim, out);
+    }
+    
+    // aten::_to_sparse_csr.out(Tensor self, int? dense_dim=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _to_sparse_csr_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<int64_t> dense_dim, at::Tensor & out) {
+        return at::_ops::_to_sparse_csr_out::redispatch(dispatchKeySet, self, dense_dim, out);
+    }
+    
+    // aten::_to_sparse_csc.out(Tensor self, int? dense_dim=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _to_sparse_csc_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<int64_t> dense_dim=c10::nullopt) {
+        return at::_ops::_to_sparse_csc_out::redispatch(dispatchKeySet, self, dense_dim, out);
+    }
+    
+    // aten::_to_sparse_csc.out(Tensor self, int? dense_dim=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _to_sparse_csc_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<int64_t> dense_dim, at::Tensor & out) {
+        return at::_ops::_to_sparse_csc_out::redispatch(dispatchKeySet, self, dense_dim, out);
+    }
+    
+    // aten::_to_sparse_bsr.out(Tensor self, int[2] blocksize, int? dense_dim=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _to_sparse_bsr_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef blocksize, c10::optional<int64_t> dense_dim=c10::nullopt) {
+        return at::_ops::_to_sparse_bsr_out::redispatch(dispatchKeySet, self, blocksize, dense_dim, out);
+    }
+    
+    // aten::_to_sparse_bsr.out(Tensor self, int[2] blocksize, int? dense_dim=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _to_sparse_bsr_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef blocksize, c10::optional<int64_t> dense_dim, at::Tensor & out) {
+        return at::_ops::_to_sparse_bsr_out::redispatch(dispatchKeySet, self, blocksize, dense_dim, out);
+    }
+    
+    // aten::_to_sparse_bsc.out(Tensor self, int[2] blocksize, int? dense_dim=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _to_sparse_bsc_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef blocksize, c10::optional<int64_t> dense_dim=c10::nullopt) {
+        return at::_ops::_to_sparse_bsc_out::redispatch(dispatchKeySet, self, blocksize, dense_dim, out);
+    }
+    
+    // aten::_to_sparse_bsc.out(Tensor self, int[2] blocksize, int? dense_dim=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _to_sparse_bsc_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef blocksize, c10::optional<int64_t> dense_dim, at::Tensor & out) {
+        return at::_ops::_to_sparse_bsc_out::redispatch(dispatchKeySet, self, blocksize, dense_dim, out);
+    }
+    
+    // aten::to_mkldnn.out(Tensor self, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & to_mkldnn_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<at::ScalarType> dtype=c10::nullopt) {
+        return at::_ops::to_mkldnn_out::redispatch(dispatchKeySet, self, dtype, out);
+    }
+    
+    // aten::to_mkldnn.out(Tensor self, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & to_mkldnn_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::ScalarType> dtype, at::Tensor & out) {
+        return at::_ops::to_mkldnn_out::redispatch(dispatchKeySet, self, dtype, out);
+    }
+    
+    // aten::mkldnn_reorder_conv2d_weight.out(Tensor self, SymInt[2] padding=0, SymInt[2] stride=1, SymInt[2] dilation=1, SymInt groups=1, SymInt[]? input_size=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mkldnn_reorder_conv2d_weight_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef padding=0, at::IntArrayRef stride=1, at::IntArrayRef dilation=1, int64_t groups=1, at::OptionalIntArrayRef input_size=c10::nullopt) {
+        return at::_ops::mkldnn_reorder_conv2d_weight_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, input_size.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*input_size)) : c10::nullopt, out);
+    }
+    
+    // aten::mkldnn_reorder_conv2d_weight.out(Tensor self, SymInt[2] padding=0, SymInt[2] stride=1, SymInt[2] dilation=1, SymInt groups=1, SymInt[]? input_size=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mkldnn_reorder_conv2d_weight_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, at::OptionalIntArrayRef input_size, at::Tensor & out) {
+        return at::_ops::mkldnn_reorder_conv2d_weight_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, input_size.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*input_size)) : c10::nullopt, out);
+    }
+    
+    // aten::mkldnn_reorder_conv2d_weight.out(Tensor self, SymInt[2] padding=0, SymInt[2] stride=1, SymInt[2] dilation=1, SymInt groups=1, SymInt[]? input_size=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mkldnn_reorder_conv2d_weight_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef dilation=c10::SymInt(1), c10::SymInt groups=1, at::OptionalSymIntArrayRef input_size=c10::nullopt) {
+        return at::_ops::mkldnn_reorder_conv2d_weight_out::redispatch(dispatchKeySet, self, padding, stride, dilation, groups, input_size, out);
+    }
+    
+    // aten::mkldnn_reorder_conv2d_weight.out(Tensor self, SymInt[2] padding=0, SymInt[2] stride=1, SymInt[2] dilation=1, SymInt groups=1, SymInt[]? input_size=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mkldnn_reorder_conv2d_weight_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, at::OptionalSymIntArrayRef input_size, at::Tensor & out) {
+        return at::_ops::mkldnn_reorder_conv2d_weight_out::redispatch(dispatchKeySet, self, padding, stride, dilation, groups, input_size, out);
+    }
+    
+    // aten::mkldnn_reorder_conv3d_weight.out(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mkldnn_reorder_conv3d_weight_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef padding=0, at::IntArrayRef stride=1, at::IntArrayRef dilation=1, int64_t groups=1) {
+        return at::_ops::mkldnn_reorder_conv3d_weight_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, out);
+    }
+    
+    // aten::mkldnn_reorder_conv3d_weight.out(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mkldnn_reorder_conv3d_weight_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, at::Tensor & out) {
+        return at::_ops::mkldnn_reorder_conv3d_weight_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, out);
+    }
+    
+    // aten::mkldnn_reorder_conv3d_weight.out(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mkldnn_reorder_conv3d_weight_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef dilation=c10::SymInt(1), c10::SymInt groups=1) {
+        return at::_ops::mkldnn_reorder_conv3d_weight_out::redispatch(dispatchKeySet, self, padding, stride, dilation, groups, out);
+    }
+    
+    // aten::mkldnn_reorder_conv3d_weight.out(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mkldnn_reorder_conv3d_weight_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, at::Tensor & out) {
+        return at::_ops::mkldnn_reorder_conv3d_weight_out::redispatch(dispatchKeySet, self, padding, stride, dilation, groups, out);
+    }
+    
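+    // Note: operators whose schema uses SymInt arguments get two C++ spellings.
+    // The plain overloads accept at::IntArrayRef / int64_t and convert through
+    // c10::fromIntArrayRefSlow before redispatching; the `_symint_` overloads pass
+    // c10::SymIntArrayRef / c10::SymInt through unchanged, which is what symbolic-
+    // shape tracing paths use. Illustrative only (ks, out, weight, sym_padding are
+    // placeholders, not defined in this header):
+    //
+    //   // concrete integer shapes
+    //   at::redispatch::mkldnn_reorder_conv2d_weight_out(ks, out, weight, /*padding=*/{1, 1});
+    //   // symbolic shapes
+    //   at::redispatch::mkldnn_reorder_conv2d_weight_symint_out(ks, out, weight, sym_padding);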
+    // aten::quantize_per_tensor_dynamic.out(Tensor self, ScalarType dtype, bool reduce_range, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & quantize_per_tensor_dynamic_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::ScalarType dtype, bool reduce_range) {
+        return at::_ops::quantize_per_tensor_dynamic_out::redispatch(dispatchKeySet, self, dtype, reduce_range, out);
+    }
+    
+    // aten::quantize_per_tensor_dynamic.out(Tensor self, ScalarType dtype, bool reduce_range, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & quantize_per_tensor_dynamic_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::ScalarType dtype, bool reduce_range, at::Tensor & out) {
+        return at::_ops::quantize_per_tensor_dynamic_out::redispatch(dispatchKeySet, self, dtype, reduce_range, out);
+    }
+    
+    // aten::quantize_per_tensor.out(Tensor self, float scale, int zero_point, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & quantize_per_tensor_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, double scale, int64_t zero_point, at::ScalarType dtype) {
+        return at::_ops::quantize_per_tensor_out::redispatch(dispatchKeySet, self, scale, zero_point, dtype, out);
+    }
+    
+    // aten::quantize_per_tensor.out(Tensor self, float scale, int zero_point, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & quantize_per_tensor_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double scale, int64_t zero_point, at::ScalarType dtype, at::Tensor & out) {
+        return at::_ops::quantize_per_tensor_out::redispatch(dispatchKeySet, self, scale, zero_point, dtype, out);
+    }
+    
+    // aten::quantize_per_tensor.tensor_qparams_out(Tensor self, Tensor scale, Tensor zero_point, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & quantize_per_tensor_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, at::ScalarType dtype) {
+        return at::_ops::quantize_per_tensor_tensor_qparams_out::redispatch(dispatchKeySet, self, scale, zero_point, dtype, out);
+    }
+    
+    // aten::quantize_per_tensor.tensor_qparams_out(Tensor self, Tensor scale, Tensor zero_point, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & quantize_per_tensor_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, at::ScalarType dtype, at::Tensor & out) {
+        return at::_ops::quantize_per_tensor_tensor_qparams_out::redispatch(dispatchKeySet, self, scale, zero_point, dtype, out);
+    }
+    
+    // aten::quantize_per_tensor.tensors_out(Tensor[] tensors, Tensor scales, Tensor zero_points, ScalarType dtype, *, Tensor(a!)[] out) -> ()
+    inline void quantize_per_tensor_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList tensors, const at::Tensor & scales, const at::Tensor & zero_points, at::ScalarType dtype) {
+        return at::_ops::quantize_per_tensor_tensors_out::redispatch(dispatchKeySet, tensors, scales, zero_points, dtype, out);
+    }
+    
+    // aten::quantize_per_tensor.tensors_out(Tensor[] tensors, Tensor scales, Tensor zero_points, ScalarType dtype, *, Tensor(a!)[] out) -> ()
+    inline void quantize_per_tensor_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, const at::Tensor & scales, const at::Tensor & zero_points, at::ScalarType dtype, at::TensorList out) {
+        return at::_ops::quantize_per_tensor_tensors_out::redispatch(dispatchKeySet, tensors, scales, zero_points, dtype, out);
+    }
+    
+    // aten::quantize_per_channel.out(Tensor self, Tensor scales, Tensor zero_points, int axis, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & quantize_per_channel_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, at::ScalarType dtype) {
+        return at::_ops::quantize_per_channel_out::redispatch(dispatchKeySet, self, scales, zero_points, axis, dtype, out);
+    }
+    
+    // aten::quantize_per_channel.out(Tensor self, Tensor scales, Tensor zero_points, int axis, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & quantize_per_channel_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, at::ScalarType dtype, at::Tensor & out) {
+        return at::_ops::quantize_per_channel_out::redispatch(dispatchKeySet, self, scales, zero_points, axis, dtype, out);
+    }
+    
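+    // The quantize_* out variants follow the same pattern: the caller supplies a
+    // preallocated `out` tensor and the op is redispatched with the remaining
+    // schema arguments. A hedged sketch (ks, q_out, input are placeholders, and
+    // q_out must already be an appropriately allocated quantized tensor):
+    //
+    //   at::redispatch::quantize_per_tensor_out(ks, q_out, input,
+    //                                           /*scale=*/0.1, /*zero_point=*/0,
+    //                                           at::kQInt8);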
+    // aten::dequantize.self_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & dequantize_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::dequantize_self_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::dequantize.self_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & dequantize_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::dequantize_self_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::dequantize.tensors_out(Tensor[] tensors, *, Tensor(a!)[] out) -> ()
+    inline void dequantize_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList tensors) {
+        return at::_ops::dequantize_tensors_out::redispatch(dispatchKeySet, tensors, out);
+    }
+    
+    // aten::dequantize.tensors_out(Tensor[] tensors, *, Tensor(a!)[] out) -> ()
+    inline void dequantize_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, at::TensorList out) {
+        return at::_ops::dequantize_tensors_out::redispatch(dispatchKeySet, tensors, out);
+    }
+    
+    // aten::q_per_channel_scales.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & q_per_channel_scales_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::q_per_channel_scales_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::q_per_channel_scales.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & q_per_channel_scales_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::q_per_channel_scales_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::q_per_channel_zero_points.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & q_per_channel_zero_points_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::q_per_channel_zero_points_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::q_per_channel_zero_points.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & q_per_channel_zero_points_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::q_per_channel_zero_points_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::int_repr.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & int_repr_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::int_repr_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::int_repr.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & int_repr_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::int_repr_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_make_per_tensor_quantized_tensor.out(Tensor self, float scale, int zero_point, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _make_per_tensor_quantized_tensor_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, double scale, int64_t zero_point) {
+        return at::_ops::_make_per_tensor_quantized_tensor_out::redispatch(dispatchKeySet, self, scale, zero_point, out);
+    }
+    
+    // aten::_make_per_tensor_quantized_tensor.out(Tensor self, float scale, int zero_point, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _make_per_tensor_quantized_tensor_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double scale, int64_t zero_point, at::Tensor & out) {
+        return at::_ops::_make_per_tensor_quantized_tensor_out::redispatch(dispatchKeySet, self, scale, zero_point, out);
+    }
+    
+    // aten::_make_per_channel_quantized_tensor.out(Tensor self, Tensor scale, Tensor zero_point, int axis, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _make_per_channel_quantized_tensor_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t axis) {
+        return at::_ops::_make_per_channel_quantized_tensor_out::redispatch(dispatchKeySet, self, scale, zero_point, axis, out);
+    }
+    
+    // aten::_make_per_channel_quantized_tensor.out(Tensor self, Tensor scale, Tensor zero_point, int axis, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _make_per_channel_quantized_tensor_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t axis, at::Tensor & out) {
+        return at::_ops::_make_per_channel_quantized_tensor_out::redispatch(dispatchKeySet, self, scale, zero_point, axis, out);
+    }
+    
+    // aten::fake_quantize_per_tensor_affine_cachemask.out(Tensor self, float scale, int zero_point, int quant_min, int quant_max, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> fake_quantize_per_tensor_affine_cachemask_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, const at::Tensor & self, double scale, int64_t zero_point, int64_t quant_min, int64_t quant_max) {
+        return at::_ops::fake_quantize_per_tensor_affine_cachemask_out::redispatch(dispatchKeySet, self, scale, zero_point, quant_min, quant_max, out0, out1);
+    }
+    
+    // aten::fake_quantize_per_tensor_affine_cachemask.out(Tensor self, float scale, int zero_point, int quant_min, int quant_max, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> fake_quantize_per_tensor_affine_cachemask_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double scale, int64_t zero_point, int64_t quant_min, int64_t quant_max, at::Tensor & out0, at::Tensor & out1) {
+        return at::_ops::fake_quantize_per_tensor_affine_cachemask_out::redispatch(dispatchKeySet, self, scale, zero_point, quant_min, quant_max, out0, out1);
+    }
+    
+    // aten::_fake_quantize_per_tensor_affine_cachemask_tensor_qparams.out(Tensor self, Tensor scale, Tensor zero_point, Tensor fake_quant_enabled, int quant_min, int quant_max, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> _fake_quantize_per_tensor_affine_cachemask_tensor_qparams_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, const at::Tensor & fake_quant_enabled, int64_t quant_min, int64_t quant_max) {
+        return at::_ops::_fake_quantize_per_tensor_affine_cachemask_tensor_qparams_out::redispatch(dispatchKeySet, self, scale, zero_point, fake_quant_enabled, quant_min, quant_max, out0, out1);
+    }
+    
+    // aten::_fake_quantize_per_tensor_affine_cachemask_tensor_qparams.out(Tensor self, Tensor scale, Tensor zero_point, Tensor fake_quant_enabled, int quant_min, int quant_max, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> _fake_quantize_per_tensor_affine_cachemask_tensor_qparams_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, const at::Tensor & fake_quant_enabled, int64_t quant_min, int64_t quant_max, at::Tensor & out0, at::Tensor & out1) {
+        return at::_ops::_fake_quantize_per_tensor_affine_cachemask_tensor_qparams_out::redispatch(dispatchKeySet, self, scale, zero_point, fake_quant_enabled, quant_min, quant_max, out0, out1);
+    }
+    
+    // aten::_fake_quantize_learnable_per_tensor_affine.out(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max, float grad_factor=1.0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _fake_quantize_learnable_per_tensor_affine_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t quant_min, int64_t quant_max, double grad_factor=1.0) {
+        return at::_ops::_fake_quantize_learnable_per_tensor_affine_out::redispatch(dispatchKeySet, self, scale, zero_point, quant_min, quant_max, grad_factor, out);
+    }
+    
+    // aten::_fake_quantize_learnable_per_tensor_affine.out(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max, float grad_factor=1.0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _fake_quantize_learnable_per_tensor_affine_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t quant_min, int64_t quant_max, double grad_factor, at::Tensor & out) {
+        return at::_ops::_fake_quantize_learnable_per_tensor_affine_out::redispatch(dispatchKeySet, self, scale, zero_point, quant_min, quant_max, grad_factor, out);
+    }
+    
+    // aten::fake_quantize_per_channel_affine_cachemask.out(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> fake_quantize_per_channel_affine_cachemask_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t axis, int64_t quant_min, int64_t quant_max) {
+        return at::_ops::fake_quantize_per_channel_affine_cachemask_out::redispatch(dispatchKeySet, self, scale, zero_point, axis, quant_min, quant_max, out0, out1);
+    }
+    
+    // aten::fake_quantize_per_channel_affine_cachemask.out(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> fake_quantize_per_channel_affine_cachemask_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t axis, int64_t quant_min, int64_t quant_max, at::Tensor & out0, at::Tensor & out1) {
+        return at::_ops::fake_quantize_per_channel_affine_cachemask_out::redispatch(dispatchKeySet, self, scale, zero_point, axis, quant_min, quant_max, out0, out1);
+    }
+    
+    // aten::_fake_quantize_learnable_per_channel_affine.out(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max, float grad_factor=1.0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _fake_quantize_learnable_per_channel_affine_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t axis, int64_t quant_min, int64_t quant_max, double grad_factor=1.0) {
+        return at::_ops::_fake_quantize_learnable_per_channel_affine_out::redispatch(dispatchKeySet, self, scale, zero_point, axis, quant_min, quant_max, grad_factor, out);
+    }
+    
+    // aten::_fake_quantize_learnable_per_channel_affine.out(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max, float grad_factor=1.0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _fake_quantize_learnable_per_channel_affine_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t axis, int64_t quant_min, int64_t quant_max, double grad_factor, at::Tensor & out) {
+        return at::_ops::_fake_quantize_learnable_per_channel_affine_out::redispatch(dispatchKeySet, self, scale, zero_point, axis, quant_min, quant_max, grad_factor, out);
+    }
+    
+    // aten::_fused_moving_avg_obs_fq_helper.out(Tensor self, Tensor observer_on, Tensor fake_quant_on, Tensor(a!) running_min, Tensor(b!) running_max, Tensor(c!) scale, Tensor(d!) zero_point, float averaging_const, int quant_min, int quant_max, int ch_axis, bool per_row_fake_quant=False, bool symmetric_quant=False, *, Tensor(e!) out0, Tensor(f!) out1) -> (Tensor(e!), Tensor(f!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> _fused_moving_avg_obs_fq_helper_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, const at::Tensor & self, const at::Tensor & observer_on, const at::Tensor & fake_quant_on, at::Tensor & running_min, at::Tensor & running_max, at::Tensor & scale, at::Tensor & zero_point, double averaging_const, int64_t quant_min, int64_t quant_max, int64_t ch_axis, bool per_row_fake_quant=false, bool symmetric_quant=false) {
+        return at::_ops::_fused_moving_avg_obs_fq_helper_out::redispatch(dispatchKeySet, self, observer_on, fake_quant_on, running_min, running_max, scale, zero_point, averaging_const, quant_min, quant_max, ch_axis, per_row_fake_quant, symmetric_quant, out0, out1);
+    }
+    
+    // aten::_fused_moving_avg_obs_fq_helper.out(Tensor self, Tensor observer_on, Tensor fake_quant_on, Tensor(a!) running_min, Tensor(b!) running_max, Tensor(c!) scale, Tensor(d!) zero_point, float averaging_const, int quant_min, int quant_max, int ch_axis, bool per_row_fake_quant=False, bool symmetric_quant=False, *, Tensor(e!) out0, Tensor(f!) out1) -> (Tensor(e!), Tensor(f!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> _fused_moving_avg_obs_fq_helper_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & observer_on, const at::Tensor & fake_quant_on, at::Tensor & running_min, at::Tensor & running_max, at::Tensor & scale, at::Tensor & zero_point, double averaging_const, int64_t quant_min, int64_t quant_max, int64_t ch_axis, bool per_row_fake_quant, bool symmetric_quant, at::Tensor & out0, at::Tensor & out1) {
+        return at::_ops::_fused_moving_avg_obs_fq_helper_out::redispatch(dispatchKeySet, self, observer_on, fake_quant_on, running_min, running_max, scale, zero_point, averaging_const, quant_min, quant_max, ch_axis, per_row_fake_quant, symmetric_quant, out0, out1);
+    }
+    
+    // aten::_fused_moving_avg_obs_fq_helper_functional(Tensor self, Tensor observer_on, Tensor fake_quant_on, Tensor running_min, Tensor running_max, Tensor scale, Tensor zero_point, float averaging_const, int quant_min, int quant_max, int ch_axis, bool per_row_fake_quant=False, bool symmetric_quant=False) -> (Tensor output, Tensor mask, Tensor running_min_out, Tensor running_max_out, Tensor scale_out, Tensor zero_point_out)
+    inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,at::Tensor,at::Tensor> _fused_moving_avg_obs_fq_helper_functional(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & observer_on, const at::Tensor & fake_quant_on, const at::Tensor & running_min, const at::Tensor & running_max, const at::Tensor & scale, const at::Tensor & zero_point, double averaging_const, int64_t quant_min, int64_t quant_max, int64_t ch_axis, bool per_row_fake_quant=false, bool symmetric_quant=false) {
+        return at::_ops::_fused_moving_avg_obs_fq_helper_functional::redispatch(dispatchKeySet, self, observer_on, fake_quant_on, running_min, running_max, scale, zero_point, averaging_const, quant_min, quant_max, ch_axis, per_row_fake_quant, symmetric_quant);
+    }
+    
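+    // _fused_moving_avg_obs_fq_helper comes in two flavours: the `.out` overloads
+    // above mutate running_min/running_max/scale/zero_point in place and write into
+    // out0/out1, while the `_functional` overload leaves its inputs untouched and
+    // returns all six results (output, mask, running_min_out, running_max_out,
+    // scale_out, zero_point_out) as fresh tensors. A hedged sketch, with `ks` and
+    // the argument tensors as placeholders:
+    //
+    //   auto [y, mask, rmin, rmax, s, zp] = at::redispatch::_fused_moving_avg_obs_fq_helper_functional(
+    //       ks, self, observer_on, fake_quant_on, running_min, running_max, scale, zero_point,
+    //       /*averaging_const=*/0.01, /*quant_min=*/0, /*quant_max=*/255, /*ch_axis=*/0);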
+    // aten::_to_copy.out(Tensor self, *, bool non_blocking=False, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _to_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, bool non_blocking=false, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+        return at::_ops::_to_copy_out::redispatch(dispatchKeySet, self, non_blocking, memory_format, out);
+    }
+    
+    // aten::_to_copy.out(Tensor self, *, bool non_blocking=False, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _to_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool non_blocking, c10::optional<at::MemoryFormat> memory_format, at::Tensor & out) {
+        return at::_ops::_to_copy_out::redispatch(dispatchKeySet, self, non_blocking, memory_format, out);
+    }
+    
+    // aten::_lstm_mps.out(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3, Tensor(e!) out4, Tensor(f!) out5) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!), Tensor(e!), Tensor(f!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> _lstm_mps_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, at::Tensor & out4, at::Tensor & out5, const at::Tensor & input, at::TensorList hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) {
+        return at::_ops::_lstm_mps_out::redispatch(dispatchKeySet, input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first, out0, out1, out2, out3, out4, out5);
+    }
+    
+    // aten::_lstm_mps.out(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3, Tensor(e!) out4, Tensor(f!) out5) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!), Tensor(e!), Tensor(f!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> _lstm_mps_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::TensorList hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, at::Tensor & out4, at::Tensor & out5) {
+        return at::_ops::_lstm_mps_out::redispatch(dispatchKeySet, input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first, out0, out1, out2, out3, out4, out5);
+    }
+    
+    // aten::lstm_mps_backward.out(Tensor? grad_y, Tensor? grad_hy, Tensor? grad_cy, Tensor z_state, Tensor cell_state_fwd, Tensor input, Tensor layersOutputs, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, *, Tensor(a!) out0, Tensor(b!)[] out1, Tensor(c!)[] out2) -> ()
+    inline void lstm_mps_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::TensorList out1, at::TensorList out2, const c10::optional<at::Tensor> & grad_y, const c10::optional<at::Tensor> & grad_hy, const c10::optional<at::Tensor> & grad_cy, const at::Tensor & z_state, const at::Tensor & cell_state_fwd, const at::Tensor & input, const at::Tensor & layersOutputs, at::TensorList hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) {
+        return at::_ops::lstm_mps_backward_out::redispatch(dispatchKeySet, grad_y, grad_hy, grad_cy, z_state, cell_state_fwd, input, layersOutputs, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first, out0, out1, out2);
+    }
+    
+    // aten::lstm_mps_backward.out(Tensor? grad_y, Tensor? grad_hy, Tensor? grad_cy, Tensor z_state, Tensor cell_state_fwd, Tensor input, Tensor layersOutputs, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, *, Tensor(a!) out0, Tensor(b!)[] out1, Tensor(c!)[] out2) -> ()
+    inline void lstm_mps_backward_outf(c10::DispatchKeySet dispatchKeySet, const c10::optional<at::Tensor> & grad_y, const c10::optional<at::Tensor> & grad_hy, const c10::optional<at::Tensor> & grad_cy, const at::Tensor & z_state, const at::Tensor & cell_state_fwd, const at::Tensor & input, const at::Tensor & layersOutputs, at::TensorList hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first, at::Tensor & out0, at::TensorList out1, at::TensorList out2) {
+        return at::_ops::lstm_mps_backward_out::redispatch(dispatchKeySet, grad_y, grad_hy, grad_cy, z_state, cell_state_fwd, input, layersOutputs, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first, out0, out1, out2);
+    }
+    
+    // aten::_thnn_fused_lstm_cell.out(Tensor input_gates, Tensor hidden_gates, Tensor cx, Tensor? input_bias=None, Tensor? hidden_bias=None, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _thnn_fused_lstm_cell_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & input_gates, const at::Tensor & hidden_gates, const at::Tensor & cx, const c10::optional<at::Tensor> & input_bias={}, const c10::optional<at::Tensor> & hidden_bias={}) {
+        return at::_ops::_thnn_fused_lstm_cell_out::redispatch(dispatchKeySet, input_gates, hidden_gates, cx, input_bias, hidden_bias, out0, out1, out2);
+    }
+    
+    // aten::_thnn_fused_lstm_cell.out(Tensor input_gates, Tensor hidden_gates, Tensor cx, Tensor? input_bias=None, Tensor? hidden_bias=None, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _thnn_fused_lstm_cell_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input_gates, const at::Tensor & hidden_gates, const at::Tensor & cx, const c10::optional<at::Tensor> & input_bias, const c10::optional<at::Tensor> & hidden_bias, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+        return at::_ops::_thnn_fused_lstm_cell_out::redispatch(dispatchKeySet, input_gates, hidden_gates, cx, input_bias, hidden_bias, out0, out1, out2);
+    }
+    
+    // aten::_thnn_fused_lstm_cell_backward_impl.out(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _thnn_fused_lstm_cell_backward_impl_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const c10::optional<at::Tensor> & grad_hy, const c10::optional<at::Tensor> & grad_cy, const at::Tensor & cx, const at::Tensor & cy, const at::Tensor & workspace, bool has_bias) {
+        return at::_ops::_thnn_fused_lstm_cell_backward_impl_out::redispatch(dispatchKeySet, grad_hy, grad_cy, cx, cy, workspace, has_bias, out0, out1, out2);
+    }
+    
+    // aten::_thnn_fused_lstm_cell_backward_impl.out(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _thnn_fused_lstm_cell_backward_impl_outf(c10::DispatchKeySet dispatchKeySet, const c10::optional<at::Tensor> & grad_hy, const c10::optional<at::Tensor> & grad_cy, const at::Tensor & cx, const at::Tensor & cy, const at::Tensor & workspace, bool has_bias, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+        return at::_ops::_thnn_fused_lstm_cell_backward_impl_out::redispatch(dispatchKeySet, grad_hy, grad_cy, cx, cy, workspace, has_bias, out0, out1, out2);
+    }
+    
+    // aten::_thnn_fused_gru_cell.out(Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias=None, Tensor? hidden_bias=None, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> _thnn_fused_gru_cell_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, const at::Tensor & input_gates, const at::Tensor & hidden_gates, const at::Tensor & hx, const c10::optional<at::Tensor> & input_bias={}, const c10::optional<at::Tensor> & hidden_bias={}) {
+        return at::_ops::_thnn_fused_gru_cell_out::redispatch(dispatchKeySet, input_gates, hidden_gates, hx, input_bias, hidden_bias, out0, out1);
+    }
+    
+    // aten::_thnn_fused_gru_cell.out(Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias=None, Tensor? hidden_bias=None, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> _thnn_fused_gru_cell_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input_gates, const at::Tensor & hidden_gates, const at::Tensor & hx, const c10::optional<at::Tensor> & input_bias, const c10::optional<at::Tensor> & hidden_bias, at::Tensor & out0, at::Tensor & out1) {
+        return at::_ops::_thnn_fused_gru_cell_out::redispatch(dispatchKeySet, input_gates, hidden_gates, hx, input_bias, hidden_bias, out0, out1);
+    }
+    
+    // aten::_thnn_fused_gru_cell_backward.out(Tensor grad_hy, Tensor workspace, bool has_bias, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3, Tensor(e!) out4) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!), Tensor(e!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> _thnn_fused_gru_cell_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, at::Tensor & out4, const at::Tensor & grad_hy, const at::Tensor & workspace, bool has_bias) {
+        return at::_ops::_thnn_fused_gru_cell_backward_out::redispatch(dispatchKeySet, grad_hy, workspace, has_bias, out0, out1, out2, out3, out4);
+    }
+    
+    // aten::_thnn_fused_gru_cell_backward.out(Tensor grad_hy, Tensor workspace, bool has_bias, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3, Tensor(e!) out4) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!), Tensor(e!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> _thnn_fused_gru_cell_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_hy, const at::Tensor & workspace, bool has_bias, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, at::Tensor & out4) {
+        return at::_ops::_thnn_fused_gru_cell_backward_out::redispatch(dispatchKeySet, grad_hy, workspace, has_bias, out0, out1, out2, out3, out4);
+    }
+    
+    // aten::_pack_padded_sequence.out(Tensor input, Tensor lengths, bool batch_first, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> _pack_padded_sequence_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, const at::Tensor & input, const at::Tensor & lengths, bool batch_first) {
+        return at::_ops::_pack_padded_sequence_out::redispatch(dispatchKeySet, input, lengths, batch_first, out0, out1);
+    }
+    
+    // aten::_pack_padded_sequence.out(Tensor input, Tensor lengths, bool batch_first, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> _pack_padded_sequence_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & lengths, bool batch_first, at::Tensor & out0, at::Tensor & out1) {
+        return at::_ops::_pack_padded_sequence_out::redispatch(dispatchKeySet, input, lengths, batch_first, out0, out1);
+    }
+    
+    // aten::set.source_Storage_out(Tensor self, Storage source, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & set_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::Storage source) {
+        return at::_ops::set_source_Storage_out::redispatch(dispatchKeySet, self, source, out);
+    }
+    
+    // aten::set.source_Storage_out(Tensor self, Storage source, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & set_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Storage source, at::Tensor & out) {
+        return at::_ops::set_source_Storage_out::redispatch(dispatchKeySet, self, source, out);
+    }
+    
+    // aten::set.source_Storage(Tensor self, Storage source) -> Tensor
+    inline at::Tensor set(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Storage source) {
+        return at::_ops::set_source_Storage::redispatch(dispatchKeySet, self, source);
+    }
+    
+    // aten::set.source_Storage_storage_offset_out(Tensor self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[], *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & set_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::Storage source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride={}) {
+        return at::_ops::set_source_Storage_storage_offset_out::redispatch(dispatchKeySet, self, source, storage_offset, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), out);
+    }
+    
+    // aten::set.source_Storage_storage_offset_out(Tensor self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[], *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & set_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Storage source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride, at::Tensor & out) {
+        return at::_ops::set_source_Storage_storage_offset_out::redispatch(dispatchKeySet, self, source, storage_offset, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), out);
+    }
+    
+    // aten::set.source_Storage_storage_offset_out(Tensor self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[], *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & set_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride={}) {
+        return at::_ops::set_source_Storage_storage_offset_out::redispatch(dispatchKeySet, self, source, storage_offset, size, stride, out);
+    }
+    
+    // aten::set.source_Storage_storage_offset_out(Tensor self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[], *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & set_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, at::Tensor & out) {
+        return at::_ops::set_source_Storage_storage_offset_out::redispatch(dispatchKeySet, self, source, storage_offset, size, stride, out);
+    }
+    
+    // aten::set.source_Storage_storage_offset(Tensor self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor
+    inline at::Tensor set(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Storage source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride={}) {
+        return at::_ops::set_source_Storage_storage_offset::redispatch(dispatchKeySet, self, source, storage_offset, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride));
+    }
+    
+    // aten::set.source_Storage_storage_offset(Tensor self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor
+    inline at::Tensor set_symint(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride={}) {
+        return at::_ops::set_source_Storage_storage_offset::redispatch(dispatchKeySet, self, source, storage_offset, size, stride);
+    }
+    
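+    // The set.source_Storage_storage_offset family rebinds a tensor to an existing
+    // Storage at a given offset/size/stride; as with the SymInt ops above, the
+    // plain overloads take int64_t/IntArrayRef and convert via fromIntArrayRefSlow,
+    // while the `_symint` overloads accept SymInt values directly. Illustrative
+    // call shape only (ks, self, storage are placeholders, not defined here):
+    //
+    //   at::redispatch::set(ks, self, storage, /*storage_offset=*/0,
+    //                       /*size=*/{2, 3}, /*stride=*/{3, 1});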
+    // aten::set.source_Tensor_out(Tensor self, Tensor source, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & set_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & source) {
+        return at::_ops::set_source_Tensor_out::redispatch(dispatchKeySet, self, source, out);
+    }
+    
+    // aten::set.source_Tensor_out(Tensor self, Tensor source, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & set_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & source, at::Tensor & out) {
+        return at::_ops::set_source_Tensor_out::redispatch(dispatchKeySet, self, source, out);
+    }
+    
+    // aten::set.source_Tensor(Tensor self, Tensor source) -> Tensor
+    inline at::Tensor set(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & source) {
+        return at::_ops::set_source_Tensor::redispatch(dispatchKeySet, self, source);
+    }
+    
+    // aten::set.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & set_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::set_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::set.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & set_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::set_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::set(Tensor self) -> Tensor
+    inline at::Tensor set(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self) {
+        return at::_ops::set::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::lift.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & lift_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::lift_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::lift.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & lift_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::lift_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::lift_fresh_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & lift_fresh_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::lift_fresh_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::lift_fresh_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & lift_fresh_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::lift_fresh_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::masked_fill.Scalar_out(Tensor self, Tensor mask, Scalar value, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & masked_fill_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & mask, const at::Scalar & value) {
+        return at::_ops::masked_fill_Scalar_out::redispatch(dispatchKeySet, self, mask, value, out);
+    }
+    
+    // aten::masked_fill.Scalar_out(Tensor self, Tensor mask, Scalar value, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & masked_fill_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mask, const at::Scalar & value, at::Tensor & out) {
+        return at::_ops::masked_fill_Scalar_out::redispatch(dispatchKeySet, self, mask, value, out);
+    }
+    
+    // aten::masked_fill.Tensor_out(Tensor self, Tensor mask, Tensor value, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & masked_fill_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & mask, const at::Tensor & value) {
+        return at::_ops::masked_fill_Tensor_out::redispatch(dispatchKeySet, self, mask, value, out);
+    }
+    
+    // aten::masked_fill.Tensor_out(Tensor self, Tensor mask, Tensor value, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & masked_fill_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mask, const at::Tensor & value, at::Tensor & out) {
+        return at::_ops::masked_fill_Tensor_out::redispatch(dispatchKeySet, self, mask, value, out);
+    }
+    
+    // aten::masked_scatter.out(Tensor self, Tensor mask, Tensor source, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & masked_scatter_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & mask, const at::Tensor & source) {
+        return at::_ops::masked_scatter_out::redispatch(dispatchKeySet, self, mask, source, out);
+    }
+    
+    // aten::masked_scatter.out(Tensor self, Tensor mask, Tensor source, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & masked_scatter_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mask, const at::Tensor & source, at::Tensor & out) {
+        return at::_ops::masked_scatter_out::redispatch(dispatchKeySet, self, mask, source, out);
+    }
+    
+    // aten::_masked_softmax.out(Tensor self, Tensor mask, int? dim=None, int? mask_type=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _masked_softmax_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & mask, c10::optional<int64_t> dim=c10::nullopt, c10::optional<int64_t> mask_type=c10::nullopt) {
+        return at::_ops::_masked_softmax_out::redispatch(dispatchKeySet, self, mask, dim, mask_type, out);
+    }
+    
+    // aten::_masked_softmax.out(Tensor self, Tensor mask, int? dim=None, int? mask_type=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _masked_softmax_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mask, c10::optional<int64_t> dim, c10::optional<int64_t> mask_type, at::Tensor & out) {
+        return at::_ops::_masked_softmax_out::redispatch(dispatchKeySet, self, mask, dim, mask_type, out);
+    }
+    
+    // aten::_masked_softmax_backward.out(Tensor grad_output, Tensor output, Tensor mask, int? dim=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _masked_softmax_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & mask, c10::optional<int64_t> dim=c10::nullopt) {
+        return at::_ops::_masked_softmax_backward_out::redispatch(dispatchKeySet, grad_output, output, mask, dim, out);
+    }
+    
+    // aten::_masked_softmax_backward.out(Tensor grad_output, Tensor output, Tensor mask, int? dim=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _masked_softmax_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & mask, c10::optional<int64_t> dim, at::Tensor & out) {
+        return at::_ops::_masked_softmax_backward_out::redispatch(dispatchKeySet, grad_output, output, mask, dim, out);
+    }
+    
+    // aten::put.out(Tensor self, Tensor index, Tensor source, bool accumulate=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & put_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & index, const at::Tensor & source, bool accumulate=false) {
+        return at::_ops::put_out::redispatch(dispatchKeySet, self, index, source, accumulate, out);
+    }
+    
+    // aten::put.out(Tensor self, Tensor index, Tensor source, bool accumulate=False, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & put_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & index, const at::Tensor & source, bool accumulate, at::Tensor & out) {
+        return at::_ops::put_out::redispatch(dispatchKeySet, self, index, source, accumulate, out);
+    }
+    
+    // aten::index_fill.int_Scalar_out(Tensor self, int dim, Tensor index, Scalar value, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & index_fill_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Scalar & value) {
+        return at::_ops::index_fill_int_Scalar_out::redispatch(dispatchKeySet, self, dim, index, value, out);
+    }
+    
+    // aten::index_fill.int_Scalar_out(Tensor self, int dim, Tensor index, Scalar value, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & index_fill_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Scalar & value, at::Tensor & out) {
+        return at::_ops::index_fill_int_Scalar_out::redispatch(dispatchKeySet, self, dim, index, value, out);
+    }
+    
+    // aten::index_fill.int_Tensor_out(Tensor self, int dim, Tensor index, Tensor value, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & index_fill_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & value) {
+        return at::_ops::index_fill_int_Tensor_out::redispatch(dispatchKeySet, self, dim, index, value, out);
+    }
+    
+    // aten::index_fill.int_Tensor_out(Tensor self, int dim, Tensor index, Tensor value, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & index_fill_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & value, at::Tensor & out) {
+        return at::_ops::index_fill_int_Tensor_out::redispatch(dispatchKeySet, self, dim, index, value, out);
+    }
+    
+    // aten::bitwise_and.Scalar_Tensor_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bitwise_and_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & self, const at::Tensor & other) {
+        return at::_ops::bitwise_and_Scalar_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::bitwise_and.Scalar_Tensor_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bitwise_and_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::bitwise_and_Scalar_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::bitwise_or.Scalar_Tensor_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bitwise_or_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & self, const at::Tensor & other) {
+        return at::_ops::bitwise_or_Scalar_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::bitwise_or.Scalar_Tensor_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bitwise_or_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::bitwise_or_Scalar_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::bitwise_xor.Scalar_Tensor_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bitwise_xor_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & self, const at::Tensor & other) {
+        return at::_ops::bitwise_xor_Scalar_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::bitwise_xor.Scalar_Tensor_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bitwise_xor_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::bitwise_xor_Scalar_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::__lshift__.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & __lshift___out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::__lshift___Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::__lshift__.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & __lshift___outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out) {
+        return at::_ops::__lshift___Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::__lshift__.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & __lshift___out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::__lshift___Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::__lshift__.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & __lshift___outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::__lshift___Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::bitwise_left_shift.Scalar_Tensor_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bitwise_left_shift_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & self, const at::Tensor & other) {
+        return at::_ops::bitwise_left_shift_Scalar_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::bitwise_left_shift.Scalar_Tensor_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bitwise_left_shift_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::bitwise_left_shift_Scalar_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::__rshift__.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & __rshift___out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Scalar & other) {
+        return at::_ops::__rshift___Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::__rshift__.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & __rshift___outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out) {
+        return at::_ops::__rshift___Scalar_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::__rshift__.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & __rshift___out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+        return at::_ops::__rshift___Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::__rshift__.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & __rshift___outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::__rshift___Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::bitwise_right_shift.Scalar_Tensor_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bitwise_right_shift_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & self, const at::Tensor & other) {
+        return at::_ops::bitwise_right_shift_Scalar_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::bitwise_right_shift.Scalar_Tensor_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bitwise_right_shift_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::bitwise_right_shift_Scalar_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::random.from_out(Tensor self, int from, int? to, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & random_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t from, c10::optional<int64_t> to, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::random_from_out::redispatch(dispatchKeySet, self, from, to, generator, out);
+    }
+    
+    // aten::random.from_out(Tensor self, int from, int? to, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & random_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t from, c10::optional<int64_t> to, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::random_from_out::redispatch(dispatchKeySet, self, from, to, generator, out);
+    }
+    
+    // aten::random.from(Tensor self, int from, int? to, *, Generator? generator=None) -> Tensor
+    inline at::Tensor random(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t from, c10::optional<int64_t> to, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::random_from::redispatch(dispatchKeySet, self, from, to, generator);
+    }
+    
+    // aten::random.to_out(Tensor self, int to, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & random_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t to, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::random_to_out::redispatch(dispatchKeySet, self, to, generator, out);
+    }
+    
+    // aten::random.to_out(Tensor self, int to, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & random_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t to, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::random_to_out::redispatch(dispatchKeySet, self, to, generator, out);
+    }
+    
+    // aten::random.to(Tensor self, int to, *, Generator? generator=None) -> Tensor
+    inline at::Tensor random(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t to, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::random_to::redispatch(dispatchKeySet, self, to, generator);
+    }
+    
+    // aten::random.out(Tensor self, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & random_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::random_out::redispatch(dispatchKeySet, self, generator, out);
+    }
+    
+    // aten::random.out(Tensor self, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & random_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::random_out::redispatch(dispatchKeySet, self, generator, out);
+    }
+    
+    // aten::random(Tensor self, *, Generator? generator=None) -> Tensor
+    inline at::Tensor random(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::random::redispatch(dispatchKeySet, self, generator);
+    }
+    
+    // aten::uniform.out(Tensor self, float from=0, float to=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & uniform_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, double from=0, double to=1, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::uniform_out::redispatch(dispatchKeySet, self, from, to, generator, out);
+    }
+    
+    // aten::uniform.out(Tensor self, float from=0, float to=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & uniform_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double from, double to, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::uniform_out::redispatch(dispatchKeySet, self, from, to, generator, out);
+    }
+    
+    // aten::uniform(Tensor self, float from=0, float to=1, *, Generator? generator=None) -> Tensor
+    inline at::Tensor uniform(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double from=0, double to=1, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::uniform::redispatch(dispatchKeySet, self, from, to, generator);
+    }
+    
+    // aten::cauchy.out(Tensor self, float median=0, float sigma=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cauchy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, double median=0, double sigma=1, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::cauchy_out::redispatch(dispatchKeySet, self, median, sigma, generator, out);
+    }
+    
+    // aten::cauchy.out(Tensor self, float median=0, float sigma=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & cauchy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double median, double sigma, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::cauchy_out::redispatch(dispatchKeySet, self, median, sigma, generator, out);
+    }
+    
+    // aten::cauchy(Tensor self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor
+    inline at::Tensor cauchy(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double median=0, double sigma=1, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::cauchy::redispatch(dispatchKeySet, self, median, sigma, generator);
+    }
+    
+    // aten::log_normal.out(Tensor self, float mean=1, float std=2, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & log_normal_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, double mean=1, double std=2, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::log_normal_out::redispatch(dispatchKeySet, self, mean, std, generator, out);
+    }
+    
+    // aten::log_normal.out(Tensor self, float mean=1, float std=2, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & log_normal_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double mean, double std, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::log_normal_out::redispatch(dispatchKeySet, self, mean, std, generator, out);
+    }
+    
+    // aten::log_normal(Tensor self, float mean=1, float std=2, *, Generator? generator=None) -> Tensor
+    inline at::Tensor log_normal(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double mean=1, double std=2, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::log_normal::redispatch(dispatchKeySet, self, mean, std, generator);
+    }
+    
+    // aten::exponential.out(Tensor self, float lambd=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & exponential_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, double lambd=1, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::exponential_out::redispatch(dispatchKeySet, self, lambd, generator, out);
+    }
+    
+    // aten::exponential.out(Tensor self, float lambd=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & exponential_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double lambd, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::exponential_out::redispatch(dispatchKeySet, self, lambd, generator, out);
+    }
+    
+    // aten::exponential(Tensor self, float lambd=1, *, Generator? generator=None) -> Tensor
+    inline at::Tensor exponential(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double lambd=1, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::exponential::redispatch(dispatchKeySet, self, lambd, generator);
+    }
+    
+    // aten::geometric.out(Tensor self, float p, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & geometric_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, double p, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::geometric_out::redispatch(dispatchKeySet, self, p, generator, out);
+    }
+    
+    // aten::geometric.out(Tensor self, float p, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & geometric_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double p, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::geometric_out::redispatch(dispatchKeySet, self, p, generator, out);
+    }
+    
+    // aten::geometric(Tensor self, float p, *, Generator? generator=None) -> Tensor
+    inline at::Tensor geometric(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double p, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::geometric::redispatch(dispatchKeySet, self, p, generator);
+    }
+    
+    // aten::tril_indices.out(int row, int col, int offset=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & tril_indices_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, int64_t row, int64_t col, int64_t offset=0) {
+        return at::_ops::tril_indices_out::redispatch(dispatchKeySet, row, col, offset, out);
+    }
+    
+    // aten::tril_indices.out(int row, int col, int offset=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & tril_indices_outf(c10::DispatchKeySet dispatchKeySet, int64_t row, int64_t col, int64_t offset, at::Tensor & out) {
+        return at::_ops::tril_indices_out::redispatch(dispatchKeySet, row, col, offset, out);
+    }
+    
+    // aten::triu_indices.out(int row, int col, int offset=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & triu_indices_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, int64_t row, int64_t col, int64_t offset=0) {
+        return at::_ops::triu_indices_out::redispatch(dispatchKeySet, row, col, offset, out);
+    }
+    
+    // aten::triu_indices.out(int row, int col, int offset=0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & triu_indices_outf(c10::DispatchKeySet dispatchKeySet, int64_t row, int64_t col, int64_t offset, at::Tensor & out) {
+        return at::_ops::triu_indices_out::redispatch(dispatchKeySet, row, col, offset, out);
+    }
+    
+    // aten::trace.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & trace_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::trace_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::trace.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & trace_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::trace_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_cholesky_solve_helper.out(Tensor self, Tensor A, bool upper, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _cholesky_solve_helper_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & A, bool upper) {
+        return at::_ops::_cholesky_solve_helper_out::redispatch(dispatchKeySet, self, A, upper, out);
+    }
+    
+    // aten::_cholesky_solve_helper.out(Tensor self, Tensor A, bool upper, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _cholesky_solve_helper_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & A, bool upper, at::Tensor & out) {
+        return at::_ops::_cholesky_solve_helper_out::redispatch(dispatchKeySet, self, A, upper, out);
+    }
+    
+    // aten::dist.out(Tensor self, Tensor other, Scalar p=2, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & dist_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & other, const at::Scalar & p=2) {
+        return at::_ops::dist_out::redispatch(dispatchKeySet, self, other, p, out);
+    }
+    
+    // aten::dist.out(Tensor self, Tensor other, Scalar p=2, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & dist_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, const at::Scalar & p, at::Tensor & out) {
+        return at::_ops::dist_out::redispatch(dispatchKeySet, self, other, p, out);
+    }
+    
+    // aten::_histogramdd_bin_edges.out(Tensor self, int[] bins, *, float[]? range=None, Tensor? weight=None, bool density=False, Tensor(a!)[] out) -> ()
+    inline void _histogramdd_bin_edges_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, const at::Tensor & self, at::IntArrayRef bins, c10::optional<at::ArrayRef<double>> range=c10::nullopt, const c10::optional<at::Tensor> & weight={}, bool density=false) {
+        return at::_ops::_histogramdd_bin_edges_out::redispatch(dispatchKeySet, self, bins, range, weight, density, out);
+    }
+    
+    // aten::_histogramdd_bin_edges.out(Tensor self, int[] bins, *, float[]? range=None, Tensor? weight=None, bool density=False, Tensor(a!)[] out) -> ()
+    inline void _histogramdd_bin_edges_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef bins, c10::optional<at::ArrayRef<double>> range, const c10::optional<at::Tensor> & weight, bool density, at::TensorList out) {
+        return at::_ops::_histogramdd_bin_edges_out::redispatch(dispatchKeySet, self, bins, range, weight, density, out);
+    }
+    
+    // aten::_histogramdd_from_bin_cts.out(Tensor self, int[] bins, *, float[]? range=None, Tensor? weight=None, bool density=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _histogramdd_from_bin_cts_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef bins, c10::optional<at::ArrayRef<double>> range=c10::nullopt, const c10::optional<at::Tensor> & weight={}, bool density=false) {
+        return at::_ops::_histogramdd_from_bin_cts_out::redispatch(dispatchKeySet, self, bins, range, weight, density, out);
+    }
+    
+    // aten::_histogramdd_from_bin_cts.out(Tensor self, int[] bins, *, float[]? range=None, Tensor? weight=None, bool density=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _histogramdd_from_bin_cts_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef bins, c10::optional<at::ArrayRef<double>> range, const c10::optional<at::Tensor> & weight, bool density, at::Tensor & out) {
+        return at::_ops::_histogramdd_from_bin_cts_out::redispatch(dispatchKeySet, self, bins, range, weight, density, out);
+    }
+    
+    // aten::_histogramdd_from_bin_tensors.out(Tensor self, Tensor[] bins, *, Tensor? weight=None, bool density=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _histogramdd_from_bin_tensors_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::TensorList bins, const c10::optional<at::Tensor> & weight={}, bool density=false) {
+        return at::_ops::_histogramdd_from_bin_tensors_out::redispatch(dispatchKeySet, self, bins, weight, density, out);
+    }
+    
+    // aten::_histogramdd_from_bin_tensors.out(Tensor self, Tensor[] bins, *, Tensor? weight=None, bool density=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _histogramdd_from_bin_tensors_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::TensorList bins, const c10::optional<at::Tensor> & weight, bool density, at::Tensor & out) {
+        return at::_ops::_histogramdd_from_bin_tensors_out::redispatch(dispatchKeySet, self, bins, weight, density, out);
+    }
+    
+    // aten::remainder.Scalar_Tensor_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & remainder_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & self, const at::Tensor & other) {
+        return at::_ops::remainder_Scalar_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::remainder.Scalar_Tensor_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & remainder_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & other, at::Tensor & out) {
+        return at::_ops::remainder_Scalar_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::argsort.stable_out(Tensor self, *, bool stable, int dim=-1, bool descending=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & argsort_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, bool stable, int64_t dim=-1, bool descending=false) {
+        return at::_ops::argsort_stable_out::redispatch(dispatchKeySet, self, stable, dim, descending, out);
+    }
+    
+    // aten::argsort.stable_out(Tensor self, *, bool stable, int dim=-1, bool descending=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & argsort_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool stable, int64_t dim, bool descending, at::Tensor & out) {
+        return at::_ops::argsort_stable_out::redispatch(dispatchKeySet, self, stable, dim, descending, out);
+    }
+    
+    // aten::unfold_backward.out(Tensor grad_in, SymInt[] input_sizes, int dim, int size, int step, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & unfold_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad_in, at::IntArrayRef input_sizes, int64_t dim, int64_t size, int64_t step) {
+        return at::_ops::unfold_backward_out::redispatch(dispatchKeySet, grad_in, c10::fromIntArrayRefSlow(input_sizes), dim, size, step, out);
+    }
+    
+    // aten::unfold_backward.out(Tensor grad_in, SymInt[] input_sizes, int dim, int size, int step, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & unfold_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_in, at::IntArrayRef input_sizes, int64_t dim, int64_t size, int64_t step, at::Tensor & out) {
+        return at::_ops::unfold_backward_out::redispatch(dispatchKeySet, grad_in, c10::fromIntArrayRefSlow(input_sizes), dim, size, step, out);
+    }
+    
+    // aten::unfold_backward.out(Tensor grad_in, SymInt[] input_sizes, int dim, int size, int step, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & unfold_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad_in, c10::SymIntArrayRef input_sizes, int64_t dim, int64_t size, int64_t step) {
+        return at::_ops::unfold_backward_out::redispatch(dispatchKeySet, grad_in, input_sizes, dim, size, step, out);
+    }
+    
+    // aten::unfold_backward.out(Tensor grad_in, SymInt[] input_sizes, int dim, int size, int step, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & unfold_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_in, c10::SymIntArrayRef input_sizes, int64_t dim, int64_t size, int64_t step, at::Tensor & out) {
+        return at::_ops::unfold_backward_out::redispatch(dispatchKeySet, grad_in, input_sizes, dim, size, step, out);
+    }
+    
+    // aten::normal.out(Tensor self, float mean=0, float std=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & normal_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, double mean=0, double std=1, c10::optional<at::Generator> generator=c10::nullopt) {
+        return at::_ops::normal_out::redispatch(dispatchKeySet, self, mean, std, generator, out);
+    }
+    
+    // aten::normal.out(Tensor self, float mean=0, float std=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & normal_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double mean, double std, c10::optional<at::Generator> generator, at::Tensor & out) {
+        return at::_ops::normal_out::redispatch(dispatchKeySet, self, mean, std, generator, out);
+    }
+    
+    // aten::_amp_foreach_non_finite_check_and_unscale.out(Tensor[] self, Tensor(b!) found_inf, Tensor inv_scale, *, Tensor(a!)[] out) -> ()
+    inline void _amp_foreach_non_finite_check_and_unscale_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::Tensor & found_inf, const at::Tensor & inv_scale) {
+        return at::_ops::_amp_foreach_non_finite_check_and_unscale_out::redispatch(dispatchKeySet, self, found_inf, inv_scale, out);
+    }
+    
+    // aten::_amp_foreach_non_finite_check_and_unscale.out(Tensor[] self, Tensor(b!) found_inf, Tensor inv_scale, *, Tensor(a!)[] out) -> ()
+    inline void _amp_foreach_non_finite_check_and_unscale_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::Tensor & found_inf, const at::Tensor & inv_scale, at::TensorList out) {
+        return at::_ops::_amp_foreach_non_finite_check_and_unscale_out::redispatch(dispatchKeySet, self, found_inf, inv_scale, out);
+    }
+    
+    // aten::_amp_foreach_non_finite_check_and_unscale(Tensor[] self, Tensor found_inf, Tensor inv_scale) -> (Tensor[] self_out, Tensor found_inf_out)
+    inline ::std::tuple<::std::vector<at::Tensor>,at::Tensor> _amp_foreach_non_finite_check_and_unscale(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Tensor & found_inf, const at::Tensor & inv_scale) {
+        return at::_ops::_amp_foreach_non_finite_check_and_unscale::redispatch(dispatchKeySet, self, found_inf, inv_scale);
+    }
+    
+    // aten::_amp_update_scale.out(Tensor self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _amp_update_scale_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::Tensor & growth_tracker, const at::Tensor & found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval) {
+        return at::_ops::_amp_update_scale_out::redispatch(dispatchKeySet, self, growth_tracker, found_inf, scale_growth_factor, scale_backoff_factor, growth_interval, out);
+    }
+    
+    // aten::_amp_update_scale.out(Tensor self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _amp_update_scale_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & growth_tracker, const at::Tensor & found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval, at::Tensor & out) {
+        return at::_ops::_amp_update_scale_out::redispatch(dispatchKeySet, self, growth_tracker, found_inf, scale_growth_factor, scale_backoff_factor, growth_interval, out);
+    }
+    
+    // aten::_amp_update_scale(Tensor self, Tensor growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> (Tensor, Tensor growth_tracker_out)
+    inline ::std::tuple<at::Tensor,at::Tensor> _amp_update_scale(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & growth_tracker, const at::Tensor & found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval) {
+        return at::_ops::_amp_update_scale::redispatch(dispatchKeySet, self, growth_tracker, found_inf, scale_growth_factor, scale_backoff_factor, growth_interval);
+    }
+    
+    // aten::_foreach_add.Scalar_out(Tensor[] self, Scalar scalar, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_add_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, const at::Scalar & scalar) {
+        return at::_ops::_foreach_add_Scalar_out::redispatch(dispatchKeySet, self, scalar, out);
+    }
+    
+    // aten::_foreach_add.Scalar_out(Tensor[] self, Scalar scalar, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_add_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Scalar & scalar, at::TensorList out) {
+        return at::_ops::_foreach_add_Scalar_out::redispatch(dispatchKeySet, self, scalar, out);
+    }
+    
+    // aten::_foreach_add.List_out(Tensor[] self, Tensor[] other, *, Scalar alpha=1, Tensor(a!)[] out) -> ()
+    inline void _foreach_add_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::TensorList other, const at::Scalar & alpha=1) {
+        return at::_ops::_foreach_add_List_out::redispatch(dispatchKeySet, self, other, alpha, out);
+    }
+    
+    // aten::_foreach_add.List_out(Tensor[] self, Tensor[] other, *, Scalar alpha=1, Tensor(a!)[] out) -> ()
+    inline void _foreach_add_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList other, const at::Scalar & alpha, at::TensorList out) {
+        return at::_ops::_foreach_add_List_out::redispatch(dispatchKeySet, self, other, alpha, out);
+    }
+    
+    // aten::_foreach_add.ScalarList_out(Tensor[] self, Scalar[] scalars, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_add_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::ArrayRef<at::Scalar> scalars) {
+        return at::_ops::_foreach_add_ScalarList_out::redispatch(dispatchKeySet, self, scalars, out);
+    }
+    
+    // aten::_foreach_add.ScalarList_out(Tensor[] self, Scalar[] scalars, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_add_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::ArrayRef<at::Scalar> scalars, at::TensorList out) {
+        return at::_ops::_foreach_add_ScalarList_out::redispatch(dispatchKeySet, self, scalars, out);
+    }
+    
+    // aten::_foreach_add.Tensor_out(Tensor[] self, Tensor other, *, Scalar alpha=1, Tensor(a!)[] out) -> ()
+    inline void _foreach_add_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, const at::Tensor & other, const at::Scalar & alpha=1) {
+        return at::_ops::_foreach_add_Tensor_out::redispatch(dispatchKeySet, self, other, alpha, out);
+    }
+    
+    // aten::_foreach_add.Tensor_out(Tensor[] self, Tensor other, *, Scalar alpha=1, Tensor(a!)[] out) -> ()
+    inline void _foreach_add_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Tensor & other, const at::Scalar & alpha, at::TensorList out) {
+        return at::_ops::_foreach_add_Tensor_out::redispatch(dispatchKeySet, self, other, alpha, out);
+    }
+    
+    // aten::_foreach_sub.Scalar_out(Tensor[] self, Scalar scalar, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_sub_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, const at::Scalar & scalar) {
+        return at::_ops::_foreach_sub_Scalar_out::redispatch(dispatchKeySet, self, scalar, out);
+    }
+    
+    // aten::_foreach_sub.Scalar_out(Tensor[] self, Scalar scalar, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_sub_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Scalar & scalar, at::TensorList out) {
+        return at::_ops::_foreach_sub_Scalar_out::redispatch(dispatchKeySet, self, scalar, out);
+    }
+    
+    // aten::_foreach_sub.List_out(Tensor[] self, Tensor[] other, *, Scalar alpha=1, Tensor(a!)[] out) -> ()
+    inline void _foreach_sub_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::TensorList other, const at::Scalar & alpha=1) {
+        return at::_ops::_foreach_sub_List_out::redispatch(dispatchKeySet, self, other, alpha, out);
+    }
+    
+    // aten::_foreach_sub.List_out(Tensor[] self, Tensor[] other, *, Scalar alpha=1, Tensor(a!)[] out) -> ()
+    inline void _foreach_sub_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList other, const at::Scalar & alpha, at::TensorList out) {
+        return at::_ops::_foreach_sub_List_out::redispatch(dispatchKeySet, self, other, alpha, out);
+    }
+    
+    // aten::_foreach_sub.ScalarList_out(Tensor[] self, Scalar[] scalars, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_sub_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::ArrayRef<at::Scalar> scalars) {
+        return at::_ops::_foreach_sub_ScalarList_out::redispatch(dispatchKeySet, self, scalars, out);
+    }
+    
+    // aten::_foreach_sub.ScalarList_out(Tensor[] self, Scalar[] scalars, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_sub_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::ArrayRef<at::Scalar> scalars, at::TensorList out) {
+        return at::_ops::_foreach_sub_ScalarList_out::redispatch(dispatchKeySet, self, scalars, out);
+    }
+    
+    // aten::_foreach_mul.Scalar_out(Tensor[] self, Scalar scalar, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_mul_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, const at::Scalar & scalar) {
+        return at::_ops::_foreach_mul_Scalar_out::redispatch(dispatchKeySet, self, scalar, out);
+    }
+    
+    // aten::_foreach_mul.Scalar_out(Tensor[] self, Scalar scalar, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_mul_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Scalar & scalar, at::TensorList out) {
+        return at::_ops::_foreach_mul_Scalar_out::redispatch(dispatchKeySet, self, scalar, out);
+    }
+    
+    // aten::_foreach_mul.List_out(Tensor[] self, Tensor[] other, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_mul_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::TensorList other) {
+        return at::_ops::_foreach_mul_List_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::_foreach_mul.List_out(Tensor[] self, Tensor[] other, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_mul_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList other, at::TensorList out) {
+        return at::_ops::_foreach_mul_List_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::_foreach_mul.ScalarList_out(Tensor[] self, Scalar[] scalars, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_mul_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::ArrayRef<at::Scalar> scalars) {
+        return at::_ops::_foreach_mul_ScalarList_out::redispatch(dispatchKeySet, self, scalars, out);
+    }
+    
+    // aten::_foreach_mul.ScalarList_out(Tensor[] self, Scalar[] scalars, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_mul_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::ArrayRef<at::Scalar> scalars, at::TensorList out) {
+        return at::_ops::_foreach_mul_ScalarList_out::redispatch(dispatchKeySet, self, scalars, out);
+    }
+    
+    // aten::_foreach_mul.Tensor_out(Tensor[] self, Tensor other, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_mul_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, const at::Tensor & other) {
+        return at::_ops::_foreach_mul_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::_foreach_mul.Tensor_out(Tensor[] self, Tensor other, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_mul_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Tensor & other, at::TensorList out) {
+        return at::_ops::_foreach_mul_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::_foreach_div.Scalar_out(Tensor[] self, Scalar scalar, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_div_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, const at::Scalar & scalar) {
+        return at::_ops::_foreach_div_Scalar_out::redispatch(dispatchKeySet, self, scalar, out);
+    }
+    
+    // aten::_foreach_div.Scalar_out(Tensor[] self, Scalar scalar, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_div_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Scalar & scalar, at::TensorList out) {
+        return at::_ops::_foreach_div_Scalar_out::redispatch(dispatchKeySet, self, scalar, out);
+    }
+    
+    // aten::_foreach_div.List_out(Tensor[] self, Tensor[] other, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_div_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::TensorList other) {
+        return at::_ops::_foreach_div_List_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::_foreach_div.List_out(Tensor[] self, Tensor[] other, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_div_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList other, at::TensorList out) {
+        return at::_ops::_foreach_div_List_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::_foreach_div.ScalarList_out(Tensor[] self, Scalar[] scalars, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_div_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::ArrayRef<at::Scalar> scalars) {
+        return at::_ops::_foreach_div_ScalarList_out::redispatch(dispatchKeySet, self, scalars, out);
+    }
+    
+    // aten::_foreach_div.ScalarList_out(Tensor[] self, Scalar[] scalars, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_div_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::ArrayRef<at::Scalar> scalars, at::TensorList out) {
+        return at::_ops::_foreach_div_ScalarList_out::redispatch(dispatchKeySet, self, scalars, out);
+    }
+    
+    // aten::_foreach_div.Tensor_out(Tensor[] self, Tensor other, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_div_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, const at::Tensor & other) {
+        return at::_ops::_foreach_div_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::_foreach_div.Tensor_out(Tensor[] self, Tensor other, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_div_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Tensor & other, at::TensorList out) {
+        return at::_ops::_foreach_div_Tensor_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::_foreach_clamp_max.Scalar_out(Tensor[] self, Scalar scalar, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_clamp_max_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, const at::Scalar & scalar) {
+        return at::_ops::_foreach_clamp_max_Scalar_out::redispatch(dispatchKeySet, self, scalar, out);
+    }
+    
+    // aten::_foreach_clamp_max.Scalar_out(Tensor[] self, Scalar scalar, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_clamp_max_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Scalar & scalar, at::TensorList out) {
+        return at::_ops::_foreach_clamp_max_Scalar_out::redispatch(dispatchKeySet, self, scalar, out);
+    }
+    
+    // aten::_foreach_clamp_max.List_out(Tensor[] self, Tensor[] other, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_clamp_max_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::TensorList other) {
+        return at::_ops::_foreach_clamp_max_List_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::_foreach_clamp_max.List_out(Tensor[] self, Tensor[] other, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_clamp_max_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList other, at::TensorList out) {
+        return at::_ops::_foreach_clamp_max_List_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::_foreach_clamp_max.ScalarList_out(Tensor[] self, Scalar[] scalars, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_clamp_max_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::ArrayRef<at::Scalar> scalars) {
+        return at::_ops::_foreach_clamp_max_ScalarList_out::redispatch(dispatchKeySet, self, scalars, out);
+    }
+    
+    // aten::_foreach_clamp_max.ScalarList_out(Tensor[] self, Scalar[] scalars, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_clamp_max_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::ArrayRef<at::Scalar> scalars, at::TensorList out) {
+        return at::_ops::_foreach_clamp_max_ScalarList_out::redispatch(dispatchKeySet, self, scalars, out);
+    }
+    
+    // aten::_foreach_clamp_min.Scalar_out(Tensor[] self, Scalar scalar, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_clamp_min_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, const at::Scalar & scalar) {
+        return at::_ops::_foreach_clamp_min_Scalar_out::redispatch(dispatchKeySet, self, scalar, out);
+    }
+    
+    // aten::_foreach_clamp_min.Scalar_out(Tensor[] self, Scalar scalar, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_clamp_min_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Scalar & scalar, at::TensorList out) {
+        return at::_ops::_foreach_clamp_min_Scalar_out::redispatch(dispatchKeySet, self, scalar, out);
+    }
+    
+    // aten::_foreach_clamp_min.List_out(Tensor[] self, Tensor[] other, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_clamp_min_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::TensorList other) {
+        return at::_ops::_foreach_clamp_min_List_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::_foreach_clamp_min.List_out(Tensor[] self, Tensor[] other, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_clamp_min_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList other, at::TensorList out) {
+        return at::_ops::_foreach_clamp_min_List_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::_foreach_clamp_min.ScalarList_out(Tensor[] self, Scalar[] scalars, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_clamp_min_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::ArrayRef<at::Scalar> scalars) {
+        return at::_ops::_foreach_clamp_min_ScalarList_out::redispatch(dispatchKeySet, self, scalars, out);
+    }
+    
+    // aten::_foreach_clamp_min.ScalarList_out(Tensor[] self, Scalar[] scalars, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_clamp_min_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::ArrayRef<at::Scalar> scalars, at::TensorList out) {
+        return at::_ops::_foreach_clamp_min_ScalarList_out::redispatch(dispatchKeySet, self, scalars, out);
+    }
+    
+    // aten::_foreach_maximum.Scalar_out(Tensor[] self, Scalar scalar, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_maximum_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, const at::Scalar & scalar) {
+        return at::_ops::_foreach_maximum_Scalar_out::redispatch(dispatchKeySet, self, scalar, out);
+    }
+    
+    // aten::_foreach_maximum.Scalar_out(Tensor[] self, Scalar scalar, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_maximum_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Scalar & scalar, at::TensorList out) {
+        return at::_ops::_foreach_maximum_Scalar_out::redispatch(dispatchKeySet, self, scalar, out);
+    }
+    
+    // aten::_foreach_maximum.List_out(Tensor[] self, Tensor[] other, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_maximum_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::TensorList other) {
+        return at::_ops::_foreach_maximum_List_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::_foreach_maximum.List_out(Tensor[] self, Tensor[] other, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_maximum_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList other, at::TensorList out) {
+        return at::_ops::_foreach_maximum_List_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::_foreach_maximum.ScalarList_out(Tensor[] self, Scalar[] scalars, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_maximum_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::ArrayRef<at::Scalar> scalars) {
+        return at::_ops::_foreach_maximum_ScalarList_out::redispatch(dispatchKeySet, self, scalars, out);
+    }
+    
+    // aten::_foreach_maximum.ScalarList_out(Tensor[] self, Scalar[] scalars, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_maximum_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::ArrayRef<at::Scalar> scalars, at::TensorList out) {
+        return at::_ops::_foreach_maximum_ScalarList_out::redispatch(dispatchKeySet, self, scalars, out);
+    }
+    
+    // aten::_foreach_minimum.Scalar_out(Tensor[] self, Scalar scalar, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_minimum_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, const at::Scalar & scalar) {
+        return at::_ops::_foreach_minimum_Scalar_out::redispatch(dispatchKeySet, self, scalar, out);
+    }
+    
+    // aten::_foreach_minimum.Scalar_out(Tensor[] self, Scalar scalar, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_minimum_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Scalar & scalar, at::TensorList out) {
+        return at::_ops::_foreach_minimum_Scalar_out::redispatch(dispatchKeySet, self, scalar, out);
+    }
+    
+    // aten::_foreach_minimum.List_out(Tensor[] self, Tensor[] other, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_minimum_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::TensorList other) {
+        return at::_ops::_foreach_minimum_List_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::_foreach_minimum.List_out(Tensor[] self, Tensor[] other, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_minimum_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList other, at::TensorList out) {
+        return at::_ops::_foreach_minimum_List_out::redispatch(dispatchKeySet, self, other, out);
+    }
+    
+    // aten::_foreach_minimum.ScalarList_out(Tensor[] self, Scalar[] scalars, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_minimum_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::ArrayRef<at::Scalar> scalars) {
+        return at::_ops::_foreach_minimum_ScalarList_out::redispatch(dispatchKeySet, self, scalars, out);
+    }
+    
+    // aten::_foreach_minimum.ScalarList_out(Tensor[] self, Scalar[] scalars, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_minimum_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::ArrayRef<at::Scalar> scalars, at::TensorList out) {
+        return at::_ops::_foreach_minimum_ScalarList_out::redispatch(dispatchKeySet, self, scalars, out);
+    }
+    
+    // aten::_foreach_addcdiv.Scalar_out(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_addcdiv_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Scalar & value=1) {
+        return at::_ops::_foreach_addcdiv_Scalar_out::redispatch(dispatchKeySet, self, tensor1, tensor2, value, out);
+    }
+    
+    // aten::_foreach_addcdiv.Scalar_out(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_addcdiv_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Scalar & value, at::TensorList out) {
+        return at::_ops::_foreach_addcdiv_Scalar_out::redispatch(dispatchKeySet, self, tensor1, tensor2, value, out);
+    }
+    
+    // aten::_foreach_addcdiv.ScalarList_out(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_addcdiv_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, at::ArrayRef<at::Scalar> scalars) {
+        return at::_ops::_foreach_addcdiv_ScalarList_out::redispatch(dispatchKeySet, self, tensor1, tensor2, scalars, out);
+    }
+    
+    // aten::_foreach_addcdiv.ScalarList_out(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_addcdiv_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, at::ArrayRef<at::Scalar> scalars, at::TensorList out) {
+        return at::_ops::_foreach_addcdiv_ScalarList_out::redispatch(dispatchKeySet, self, tensor1, tensor2, scalars, out);
+    }
+    
+    // aten::_foreach_addcdiv.Tensor_out(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_addcdiv_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Tensor & scalars) {
+        return at::_ops::_foreach_addcdiv_Tensor_out::redispatch(dispatchKeySet, self, tensor1, tensor2, scalars, out);
+    }
+    
+    // aten::_foreach_addcdiv.Tensor_out(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_addcdiv_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Tensor & scalars, at::TensorList out) {
+        return at::_ops::_foreach_addcdiv_Tensor_out::redispatch(dispatchKeySet, self, tensor1, tensor2, scalars, out);
+    }
+    
+    // aten::_foreach_addcmul.Scalar_out(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_addcmul_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Scalar & value=1) {
+        return at::_ops::_foreach_addcmul_Scalar_out::redispatch(dispatchKeySet, self, tensor1, tensor2, value, out);
+    }
+    
+    // aten::_foreach_addcmul.Scalar_out(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_addcmul_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Scalar & value, at::TensorList out) {
+        return at::_ops::_foreach_addcmul_Scalar_out::redispatch(dispatchKeySet, self, tensor1, tensor2, value, out);
+    }
+    
+    // aten::_foreach_addcmul.ScalarList_out(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_addcmul_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, at::ArrayRef<at::Scalar> scalars) {
+        return at::_ops::_foreach_addcmul_ScalarList_out::redispatch(dispatchKeySet, self, tensor1, tensor2, scalars, out);
+    }
+    
+    // aten::_foreach_addcmul.ScalarList_out(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_addcmul_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, at::ArrayRef<at::Scalar> scalars, at::TensorList out) {
+        return at::_ops::_foreach_addcmul_ScalarList_out::redispatch(dispatchKeySet, self, tensor1, tensor2, scalars, out);
+    }
+    
+    // aten::_foreach_addcmul.Tensor_out(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_addcmul_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Tensor & scalars) {
+        return at::_ops::_foreach_addcmul_Tensor_out::redispatch(dispatchKeySet, self, tensor1, tensor2, scalars, out);
+    }
+    
+    // aten::_foreach_addcmul.Tensor_out(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_addcmul_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Tensor & scalars, at::TensorList out) {
+        return at::_ops::_foreach_addcmul_Tensor_out::redispatch(dispatchKeySet, self, tensor1, tensor2, scalars, out);
+    }
+    
+    // aten::_foreach_abs.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_abs_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self) {
+        return at::_ops::_foreach_abs_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_abs.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_abs_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out) {
+        return at::_ops::_foreach_abs_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_acos.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_acos_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self) {
+        return at::_ops::_foreach_acos_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_acos.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_acos_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out) {
+        return at::_ops::_foreach_acos_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_asin.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_asin_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self) {
+        return at::_ops::_foreach_asin_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_asin.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_asin_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out) {
+        return at::_ops::_foreach_asin_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_atan.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_atan_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self) {
+        return at::_ops::_foreach_atan_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_atan.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_atan_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out) {
+        return at::_ops::_foreach_atan_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_ceil.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_ceil_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self) {
+        return at::_ops::_foreach_ceil_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_ceil.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_ceil_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out) {
+        return at::_ops::_foreach_ceil_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_cos.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_cos_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self) {
+        return at::_ops::_foreach_cos_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_cos.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_cos_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out) {
+        return at::_ops::_foreach_cos_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_cosh.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_cosh_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self) {
+        return at::_ops::_foreach_cosh_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_cosh.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_cosh_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out) {
+        return at::_ops::_foreach_cosh_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_erf.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_erf_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self) {
+        return at::_ops::_foreach_erf_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_erf.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_erf_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out) {
+        return at::_ops::_foreach_erf_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_erfc.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_erfc_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self) {
+        return at::_ops::_foreach_erfc_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_erfc.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_erfc_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out) {
+        return at::_ops::_foreach_erfc_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_exp.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_exp_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self) {
+        return at::_ops::_foreach_exp_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_exp.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_exp_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out) {
+        return at::_ops::_foreach_exp_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_expm1.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_expm1_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self) {
+        return at::_ops::_foreach_expm1_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_expm1.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_expm1_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out) {
+        return at::_ops::_foreach_expm1_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_floor.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_floor_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self) {
+        return at::_ops::_foreach_floor_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_floor.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_floor_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out) {
+        return at::_ops::_foreach_floor_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_frac.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_frac_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self) {
+        return at::_ops::_foreach_frac_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_frac.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_frac_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out) {
+        return at::_ops::_foreach_frac_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_lerp.List_out(Tensor[] self, Tensor[] tensors1, Tensor[] weights, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_lerp_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::TensorList tensors1, at::TensorList weights) {
+        return at::_ops::_foreach_lerp_List_out::redispatch(dispatchKeySet, self, tensors1, weights, out);
+    }
+    
+    // aten::_foreach_lerp.List_out(Tensor[] self, Tensor[] tensors1, Tensor[] weights, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_lerp_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensors1, at::TensorList weights, at::TensorList out) {
+        return at::_ops::_foreach_lerp_List_out::redispatch(dispatchKeySet, self, tensors1, weights, out);
+    }
+    
+    // aten::_foreach_lerp.Scalar_out(Tensor[] self, Tensor[] tensors1, Scalar weight, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_lerp_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::TensorList tensors1, const at::Scalar & weight) {
+        return at::_ops::_foreach_lerp_Scalar_out::redispatch(dispatchKeySet, self, tensors1, weight, out);
+    }
+    
+    // aten::_foreach_lerp.Scalar_out(Tensor[] self, Tensor[] tensors1, Scalar weight, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_lerp_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensors1, const at::Scalar & weight, at::TensorList out) {
+        return at::_ops::_foreach_lerp_Scalar_out::redispatch(dispatchKeySet, self, tensors1, weight, out);
+    }
+    
+    // aten::_foreach_lgamma.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_lgamma_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self) {
+        return at::_ops::_foreach_lgamma_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_lgamma.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_lgamma_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out) {
+        return at::_ops::_foreach_lgamma_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_log.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_log_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self) {
+        return at::_ops::_foreach_log_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_log.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_log_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out) {
+        return at::_ops::_foreach_log_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_log10.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_log10_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self) {
+        return at::_ops::_foreach_log10_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_log10.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_log10_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out) {
+        return at::_ops::_foreach_log10_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_log1p.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_log1p_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self) {
+        return at::_ops::_foreach_log1p_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_log1p.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_log1p_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out) {
+        return at::_ops::_foreach_log1p_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_log2.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_log2_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self) {
+        return at::_ops::_foreach_log2_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_log2.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_log2_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out) {
+        return at::_ops::_foreach_log2_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_neg.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_neg_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self) {
+        return at::_ops::_foreach_neg_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_neg.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_neg_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out) {
+        return at::_ops::_foreach_neg_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_norm.Scalar_out(Tensor[] self, Scalar ord=2, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_norm_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, const at::Scalar & ord=2) {
+        return at::_ops::_foreach_norm_Scalar_out::redispatch(dispatchKeySet, self, ord, out);
+    }
+    
+    // aten::_foreach_norm.Scalar_out(Tensor[] self, Scalar ord=2, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_norm_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Scalar & ord, at::TensorList out) {
+        return at::_ops::_foreach_norm_Scalar_out::redispatch(dispatchKeySet, self, ord, out);
+    }
+    
+    // aten::_foreach_pow.List_out(Tensor[] self, Tensor[] exponent, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_pow_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::TensorList exponent) {
+        return at::_ops::_foreach_pow_List_out::redispatch(dispatchKeySet, self, exponent, out);
+    }
+    
+    // aten::_foreach_pow.List_out(Tensor[] self, Tensor[] exponent, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_pow_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList exponent, at::TensorList out) {
+        return at::_ops::_foreach_pow_List_out::redispatch(dispatchKeySet, self, exponent, out);
+    }
+    
+    // aten::_foreach_pow.Scalar_out(Tensor[] self, Scalar exponent, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_pow_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, const at::Scalar & exponent) {
+        return at::_ops::_foreach_pow_Scalar_out::redispatch(dispatchKeySet, self, exponent, out);
+    }
+    
+    // aten::_foreach_pow.Scalar_out(Tensor[] self, Scalar exponent, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_pow_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, const at::Scalar & exponent, at::TensorList out) {
+        return at::_ops::_foreach_pow_Scalar_out::redispatch(dispatchKeySet, self, exponent, out);
+    }
+    
+    // aten::_foreach_pow.ScalarList_out(Tensor[] self, Scalar[] exponent, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_pow_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::ArrayRef<at::Scalar> exponent) {
+        return at::_ops::_foreach_pow_ScalarList_out::redispatch(dispatchKeySet, self, exponent, out);
+    }
+    
+    // aten::_foreach_pow.ScalarList_out(Tensor[] self, Scalar[] exponent, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_pow_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::ArrayRef<at::Scalar> exponent, at::TensorList out) {
+        return at::_ops::_foreach_pow_ScalarList_out::redispatch(dispatchKeySet, self, exponent, out);
+    }
+    
+    // aten::_foreach_reciprocal.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_reciprocal_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self) {
+        return at::_ops::_foreach_reciprocal_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_reciprocal.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_reciprocal_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out) {
+        return at::_ops::_foreach_reciprocal_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_round.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_round_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self) {
+        return at::_ops::_foreach_round_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_round.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_round_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out) {
+        return at::_ops::_foreach_round_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_sigmoid.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_sigmoid_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self) {
+        return at::_ops::_foreach_sigmoid_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_sigmoid.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_sigmoid_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out) {
+        return at::_ops::_foreach_sigmoid_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_sign.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_sign_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self) {
+        return at::_ops::_foreach_sign_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_sign.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_sign_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out) {
+        return at::_ops::_foreach_sign_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_sin.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_sin_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self) {
+        return at::_ops::_foreach_sin_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_sin.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_sin_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out) {
+        return at::_ops::_foreach_sin_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_sinh.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_sinh_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self) {
+        return at::_ops::_foreach_sinh_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_sinh.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_sinh_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out) {
+        return at::_ops::_foreach_sinh_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_sqrt.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_sqrt_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self) {
+        return at::_ops::_foreach_sqrt_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_sqrt.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_sqrt_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out) {
+        return at::_ops::_foreach_sqrt_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_tan.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_tan_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self) {
+        return at::_ops::_foreach_tan_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_tan.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_tan_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out) {
+        return at::_ops::_foreach_tan_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_tanh.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_tanh_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self) {
+        return at::_ops::_foreach_tanh_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_tanh.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_tanh_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out) {
+        return at::_ops::_foreach_tanh_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_trunc.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_trunc_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self) {
+        return at::_ops::_foreach_trunc_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_trunc.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_trunc_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out) {
+        return at::_ops::_foreach_trunc_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_zero.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_zero_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self) {
+        return at::_ops::_foreach_zero_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_zero.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_zero_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out) {
+        return at::_ops::_foreach_zero_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_foreach_zero(Tensor[] self) -> Tensor[] self_out
+    inline ::std::vector<at::Tensor> _foreach_zero(c10::DispatchKeySet dispatchKeySet, at::TensorList self) {
+        return at::_ops::_foreach_zero::redispatch(dispatchKeySet, self);
+    }
+    
+    // aten::_foreach_copy.out(Tensor[] self, Tensor[] src, bool non_blocking=False, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_copy_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::TensorList src, bool non_blocking=false) {
+        return at::_ops::_foreach_copy_out::redispatch(dispatchKeySet, self, src, non_blocking, out);
+    }
+    
+    // aten::_foreach_copy.out(Tensor[] self, Tensor[] src, bool non_blocking=False, *, Tensor(a!)[] out) -> ()
+    inline void _foreach_copy_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList src, bool non_blocking, at::TensorList out) {
+        return at::_ops::_foreach_copy_out::redispatch(dispatchKeySet, self, src, non_blocking, out);
+    }
+    
+    // aten::_foreach_copy(Tensor[] self, Tensor[] src, bool non_blocking=False) -> Tensor[] self_out
+    inline ::std::vector<at::Tensor> _foreach_copy(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList src, bool non_blocking=false) {
+        return at::_ops::_foreach_copy::redispatch(dispatchKeySet, self, src, non_blocking);
+    }
+    
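+    // Note on the wrapper pattern used throughout this header: each out-variant op is
+    // exposed as a pair that differs only in argument order. The `_out` form takes the
+    // destination tensor(s) right after the DispatchKeySet, while the `_outf` form takes
+    // them last, matching the schema shown in the comment above each function; both
+    // forward to the same at::_ops::<op>::redispatch call. A minimal usage sketch
+    // (variable names here are hypothetical, not part of this header):
+    //
+    //   c10::DispatchKeySet ks(c10::DispatchKey::CPU);
+    //   _foreach_copy_out(ks, outs, selfs, srcs);                           // destination list first
+    //   _foreach_copy_outf(ks, selfs, srcs, /*non_blocking=*/false, outs);  // destination list last
+    //
+    // For ops whose schema declares SymInt[] arguments, the `_symint` overloads accept
+    // c10::SymIntArrayRef directly, while the plain overloads convert at::IntArrayRef
+    // via c10::fromIntArrayRefSlow before redispatching.
+    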
+    // aten::bucketize.Scalar_out(Scalar self, Tensor boundaries, *, bool out_int32=False, bool right=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bucketize_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Scalar & self, const at::Tensor & boundaries, bool out_int32=false, bool right=false) {
+        return at::_ops::bucketize_Scalar_out::redispatch(dispatchKeySet, self, boundaries, out_int32, right, out);
+    }
+    
+    // aten::bucketize.Scalar_out(Scalar self, Tensor boundaries, *, bool out_int32=False, bool right=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & bucketize_outf(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & boundaries, bool out_int32, bool right, at::Tensor & out) {
+        return at::_ops::bucketize_Scalar_out::redispatch(dispatchKeySet, self, boundaries, out_int32, right, out);
+    }
+    
+    // aten::glu_jvp.out(Tensor glu, Tensor x, Tensor dx, int dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & glu_jvp_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & glu, const at::Tensor & x, const at::Tensor & dx, int64_t dim) {
+        return at::_ops::glu_jvp_out::redispatch(dispatchKeySet, glu, x, dx, dim, out);
+    }
+    
+    // aten::glu_jvp.out(Tensor glu, Tensor x, Tensor dx, int dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & glu_jvp_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & glu, const at::Tensor & x, const at::Tensor & dx, int64_t dim, at::Tensor & out) {
+        return at::_ops::glu_jvp_out::redispatch(dispatchKeySet, glu, x, dx, dim, out);
+    }
+    
+    // aten::glu_backward_jvp.out(Tensor grad_x, Tensor grad_glu, Tensor x, Tensor dgrad_glu, Tensor dx, int dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & glu_backward_jvp_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad_x, const at::Tensor & grad_glu, const at::Tensor & x, const at::Tensor & dgrad_glu, const at::Tensor & dx, int64_t dim) {
+        return at::_ops::glu_backward_jvp_out::redispatch(dispatchKeySet, grad_x, grad_glu, x, dgrad_glu, dx, dim, out);
+    }
+    
+    // aten::glu_backward_jvp.out(Tensor grad_x, Tensor grad_glu, Tensor x, Tensor dgrad_glu, Tensor dx, int dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & glu_backward_jvp_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_x, const at::Tensor & grad_glu, const at::Tensor & x, const at::Tensor & dgrad_glu, const at::Tensor & dx, int64_t dim, at::Tensor & out) {
+        return at::_ops::glu_backward_jvp_out::redispatch(dispatchKeySet, grad_x, grad_glu, x, dgrad_glu, dx, dim, out);
+    }
+    
+    // aten::hardswish_backward.out(Tensor grad_output, Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & hardswish_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad_output, const at::Tensor & self) {
+        return at::_ops::hardswish_backward_out::redispatch(dispatchKeySet, grad_output, self, out);
+    }
+    
+    // aten::hardswish_backward.out(Tensor grad_output, Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & hardswish_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::hardswish_backward_out::redispatch(dispatchKeySet, grad_output, self, out);
+    }
+    
+    // aten::rrelu_with_noise_backward.out(Tensor grad_output, Tensor self, Tensor noise, Scalar lower, Scalar upper, bool training, bool self_is_result, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & rrelu_with_noise_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & noise, const at::Scalar & lower, const at::Scalar & upper, bool training, bool self_is_result) {
+        return at::_ops::rrelu_with_noise_backward_out::redispatch(dispatchKeySet, grad_output, self, noise, lower, upper, training, self_is_result, out);
+    }
+    
+    // aten::rrelu_with_noise_backward.out(Tensor grad_output, Tensor self, Tensor noise, Scalar lower, Scalar upper, bool training, bool self_is_result, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & rrelu_with_noise_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & noise, const at::Scalar & lower, const at::Scalar & upper, bool training, bool self_is_result, at::Tensor & out) {
+        return at::_ops::rrelu_with_noise_backward_out::redispatch(dispatchKeySet, grad_output, self, noise, lower, upper, training, self_is_result, out);
+    }
+    
+    // aten::mkldnn_adaptive_avg_pool2d_backward.out(Tensor grad_output, Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mkldnn_adaptive_avg_pool2d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad_output, const at::Tensor & self) {
+        return at::_ops::mkldnn_adaptive_avg_pool2d_backward_out::redispatch(dispatchKeySet, grad_output, self, out);
+    }
+    
+    // aten::mkldnn_adaptive_avg_pool2d_backward.out(Tensor grad_output, Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & mkldnn_adaptive_avg_pool2d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::mkldnn_adaptive_avg_pool2d_backward_out::redispatch(dispatchKeySet, grad_output, self, out);
+    }
+    
+    // aten::_adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _adaptive_avg_pool2d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size) {
+        return at::_ops::_adaptive_avg_pool2d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), out);
+    }
+    
+    // aten::_adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _adaptive_avg_pool2d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out) {
+        return at::_ops::_adaptive_avg_pool2d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), out);
+    }
+    
+    // aten::_adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _adaptive_avg_pool2d_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size) {
+        return at::_ops::_adaptive_avg_pool2d_out::redispatch(dispatchKeySet, self, output_size, out);
+    }
+    
+    // aten::_adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _adaptive_avg_pool2d_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, at::Tensor & out) {
+        return at::_ops::_adaptive_avg_pool2d_out::redispatch(dispatchKeySet, self, output_size, out);
+    }
+    
+    // aten::_adaptive_avg_pool2d_backward.out(Tensor grad_output, Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _adaptive_avg_pool2d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad_output, const at::Tensor & self) {
+        return at::_ops::_adaptive_avg_pool2d_backward_out::redispatch(dispatchKeySet, grad_output, self, out);
+    }
+    
+    // aten::_adaptive_avg_pool2d_backward.out(Tensor grad_output, Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _adaptive_avg_pool2d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::_adaptive_avg_pool2d_backward_out::redispatch(dispatchKeySet, grad_output, self, out);
+    }
+    
+    // aten::_adaptive_avg_pool3d.out(Tensor self, SymInt[3] output_size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _adaptive_avg_pool3d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size) {
+        return at::_ops::_adaptive_avg_pool3d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), out);
+    }
+    
+    // aten::_adaptive_avg_pool3d.out(Tensor self, SymInt[3] output_size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _adaptive_avg_pool3d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out) {
+        return at::_ops::_adaptive_avg_pool3d_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(output_size), out);
+    }
+    
+    // aten::_adaptive_avg_pool3d.out(Tensor self, SymInt[3] output_size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _adaptive_avg_pool3d_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size) {
+        return at::_ops::_adaptive_avg_pool3d_out::redispatch(dispatchKeySet, self, output_size, out);
+    }
+    
+    // aten::_adaptive_avg_pool3d.out(Tensor self, SymInt[3] output_size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _adaptive_avg_pool3d_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, at::Tensor & out) {
+        return at::_ops::_adaptive_avg_pool3d_out::redispatch(dispatchKeySet, self, output_size, out);
+    }
+    
+    // aten::_adaptive_avg_pool3d_backward.out(Tensor grad_output, Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _adaptive_avg_pool3d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad_output, const at::Tensor & self) {
+        return at::_ops::_adaptive_avg_pool3d_backward_out::redispatch(dispatchKeySet, grad_output, self, out);
+    }
+    
+    // aten::_adaptive_avg_pool3d_backward.out(Tensor grad_output, Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _adaptive_avg_pool3d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::_adaptive_avg_pool3d_backward_out::redispatch(dispatchKeySet, grad_output, self, out);
+    }
+    
+    // aten::_slow_conv2d_backward.output_mask_out(Tensor grad_output, Tensor self, Tensor weight, SymInt[2] kernel_size, SymInt[2] stride, SymInt[2] padding, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _slow_conv2d_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, ::std::array<bool,3> output_mask) {
+        return at::_ops::_slow_conv2d_backward_output_mask_out::redispatch(dispatchKeySet, grad_output, self, weight, c10::fromIntArrayRefSlow(kernel_size), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), output_mask, out0, out1, out2);
+    }
+    
+    // aten::_slow_conv2d_backward.output_mask_out(Tensor grad_output, Tensor self, Tensor weight, SymInt[2] kernel_size, SymInt[2] stride, SymInt[2] padding, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _slow_conv2d_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+        return at::_ops::_slow_conv2d_backward_output_mask_out::redispatch(dispatchKeySet, grad_output, self, weight, c10::fromIntArrayRefSlow(kernel_size), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), output_mask, out0, out1, out2);
+    }
+    
+    // aten::_slow_conv2d_backward.output_mask_out(Tensor grad_output, Tensor self, Tensor weight, SymInt[2] kernel_size, SymInt[2] stride, SymInt[2] padding, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _slow_conv2d_backward_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, ::std::array<bool,3> output_mask) {
+        return at::_ops::_slow_conv2d_backward_output_mask_out::redispatch(dispatchKeySet, grad_output, self, weight, kernel_size, stride, padding, output_mask, out0, out1, out2);
+    }
+    
+    // aten::_slow_conv2d_backward.output_mask_out(Tensor grad_output, Tensor self, Tensor weight, SymInt[2] kernel_size, SymInt[2] stride, SymInt[2] padding, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _slow_conv2d_backward_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+        return at::_ops::_slow_conv2d_backward_output_mask_out::redispatch(dispatchKeySet, grad_output, self, weight, kernel_size, stride, padding, output_mask, out0, out1, out2);
+    }
+    
+    // aten::conv_depthwise3d.out(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding, SymInt[3] dilation, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & conv_depthwise3d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation) {
+        return at::_ops::conv_depthwise3d_out::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(kernel_size), bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), out);
+    }
+    
+    // aten::conv_depthwise3d.out(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding, SymInt[3] dilation, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & conv_depthwise3d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, at::Tensor & out) {
+        return at::_ops::conv_depthwise3d_out::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(kernel_size), bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), out);
+    }
+    
+    // aten::conv_depthwise3d.out(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding, SymInt[3] dilation, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & conv_depthwise3d_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation) {
+        return at::_ops::conv_depthwise3d_out::redispatch(dispatchKeySet, self, weight, kernel_size, bias, stride, padding, dilation, out);
+    }
+    
+    // aten::conv_depthwise3d.out(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding, SymInt[3] dilation, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & conv_depthwise3d_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, at::Tensor & out) {
+        return at::_ops::conv_depthwise3d_out::redispatch(dispatchKeySet, self, weight, kernel_size, bias, stride, padding, dilation, out);
+    }
+    
+    // aten::slow_conv_dilated2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & slow_conv_dilated2d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef dilation=1) {
+        return at::_ops::slow_conv_dilated2d_out::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(kernel_size), bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), out);
+    }
+    
+    // aten::slow_conv_dilated2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & slow_conv_dilated2d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, at::Tensor & out) {
+        return at::_ops::slow_conv_dilated2d_out::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(kernel_size), bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), out);
+    }
+    
+    // aten::slow_conv_dilated2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & slow_conv_dilated2d_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef dilation=c10::SymInt(1)) {
+        return at::_ops::slow_conv_dilated2d_out::redispatch(dispatchKeySet, self, weight, kernel_size, bias, stride, padding, dilation, out);
+    }
+    
+    // aten::slow_conv_dilated2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & slow_conv_dilated2d_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, at::Tensor & out) {
+        return at::_ops::slow_conv_dilated2d_out::redispatch(dispatchKeySet, self, weight, kernel_size, bias, stride, padding, dilation, out);
+    }
+    
+    // aten::slow_conv_dilated3d.out(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & slow_conv_dilated3d_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef dilation=1) {
+        return at::_ops::slow_conv_dilated3d_out::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(kernel_size), bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), out);
+    }
+    
+    // aten::slow_conv_dilated3d.out(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & slow_conv_dilated3d_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, at::Tensor & out) {
+        return at::_ops::slow_conv_dilated3d_out::redispatch(dispatchKeySet, self, weight, c10::fromIntArrayRefSlow(kernel_size), bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), out);
+    }
+    
+    // aten::slow_conv_dilated3d.out(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & slow_conv_dilated3d_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef dilation=c10::SymInt(1)) {
+        return at::_ops::slow_conv_dilated3d_out::redispatch(dispatchKeySet, self, weight, kernel_size, bias, stride, padding, dilation, out);
+    }
+    
+    // aten::slow_conv_dilated3d.out(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & slow_conv_dilated3d_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, at::Tensor & out) {
+        return at::_ops::slow_conv_dilated3d_out::redispatch(dispatchKeySet, self, weight, kernel_size, bias, stride, padding, dilation, out);
+    }
+    
+    // aten::isinf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & isinf_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::isinf_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::isinf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & isinf_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::isinf_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::linalg_matrix_exp.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_matrix_exp_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::linalg_matrix_exp_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::linalg_matrix_exp.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & linalg_matrix_exp_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::linalg_matrix_exp_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_test_optional_intlist.out(Tensor values, int[]? addends, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _test_optional_intlist_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & values, at::OptionalIntArrayRef addends) {
+        return at::_ops::_test_optional_intlist_out::redispatch(dispatchKeySet, values, addends, out);
+    }
+    
+    // aten::_test_optional_intlist.out(Tensor values, int[]? addends, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _test_optional_intlist_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & values, at::OptionalIntArrayRef addends, at::Tensor & out) {
+        return at::_ops::_test_optional_intlist_out::redispatch(dispatchKeySet, values, addends, out);
+    }
+    
+    // aten::_test_optional_filled_intlist.out(Tensor values, int[2]? addends, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _test_optional_filled_intlist_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & values, at::OptionalIntArrayRef addends) {
+        return at::_ops::_test_optional_filled_intlist_out::redispatch(dispatchKeySet, values, addends, out);
+    }
+    
+    // aten::_test_optional_filled_intlist.out(Tensor values, int[2]? addends, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _test_optional_filled_intlist_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & values, at::OptionalIntArrayRef addends, at::Tensor & out) {
+        return at::_ops::_test_optional_filled_intlist_out::redispatch(dispatchKeySet, values, addends, out);
+    }
+    
+    // aten::_test_optional_floatlist.out(Tensor values, float[]? addends, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _test_optional_floatlist_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & values, c10::optional<at::ArrayRef<double>> addends) {
+        return at::_ops::_test_optional_floatlist_out::redispatch(dispatchKeySet, values, addends, out);
+    }
+    
+    // aten::_test_optional_floatlist.out(Tensor values, float[]? addends, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _test_optional_floatlist_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & values, c10::optional<at::ArrayRef<double>> addends, at::Tensor & out) {
+        return at::_ops::_test_optional_floatlist_out::redispatch(dispatchKeySet, values, addends, out);
+    }
+    
+    // aten::_test_warn_in_autograd.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _test_warn_in_autograd_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::_test_warn_in_autograd_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_test_warn_in_autograd.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _test_warn_in_autograd_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::_test_warn_in_autograd_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_test_autograd_multiple_dispatch.fullcoverage_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _test_autograd_multiple_dispatch_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::_test_autograd_multiple_dispatch_fullcoverage_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_test_autograd_multiple_dispatch.fullcoverage_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _test_autograd_multiple_dispatch_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::_test_autograd_multiple_dispatch_fullcoverage_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_test_autograd_multiple_dispatch_view_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _test_autograd_multiple_dispatch_view_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::_test_autograd_multiple_dispatch_view_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_test_autograd_multiple_dispatch_view_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _test_autograd_multiple_dispatch_view_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::_test_autograd_multiple_dispatch_view_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::segment_reduce.out(Tensor data, str reduce, *, Tensor? lengths=None, Tensor? indices=None, Tensor? offsets=None, int axis=0, bool unsafe=False, Scalar? initial=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & segment_reduce_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & data, c10::string_view reduce, const c10::optional<at::Tensor> & lengths={}, const c10::optional<at::Tensor> & indices={}, const c10::optional<at::Tensor> & offsets={}, int64_t axis=0, bool unsafe=false, const c10::optional<at::Scalar> & initial=c10::nullopt) {
+        return at::_ops::segment_reduce_out::redispatch(dispatchKeySet, data, reduce, lengths, indices, offsets, axis, unsafe, initial, out);
+    }
+    
+    // aten::segment_reduce.out(Tensor data, str reduce, *, Tensor? lengths=None, Tensor? indices=None, Tensor? offsets=None, int axis=0, bool unsafe=False, Scalar? initial=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & segment_reduce_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & data, c10::string_view reduce, const c10::optional<at::Tensor> & lengths, const c10::optional<at::Tensor> & indices, const c10::optional<at::Tensor> & offsets, int64_t axis, bool unsafe, const c10::optional<at::Scalar> & initial, at::Tensor & out) {
+        return at::_ops::segment_reduce_out::redispatch(dispatchKeySet, data, reduce, lengths, indices, offsets, axis, unsafe, initial, out);
+    }
+    
+    // aten::_segment_reduce_backward.out(Tensor grad, Tensor output, Tensor data, str reduce, *, Tensor? lengths=None, Tensor? offsets=None, int axis=0, Scalar? initial=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _segment_reduce_backward_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & grad, const at::Tensor & output, const at::Tensor & data, c10::string_view reduce, const c10::optional<at::Tensor> & lengths={}, const c10::optional<at::Tensor> & offsets={}, int64_t axis=0, const c10::optional<at::Scalar> & initial=c10::nullopt) {
+        return at::_ops::_segment_reduce_backward_out::redispatch(dispatchKeySet, grad, output, data, reduce, lengths, offsets, axis, initial, out);
+    }
+    
+    // aten::_segment_reduce_backward.out(Tensor grad, Tensor output, Tensor data, str reduce, *, Tensor? lengths=None, Tensor? offsets=None, int axis=0, Scalar? initial=None, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _segment_reduce_backward_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & output, const at::Tensor & data, c10::string_view reduce, const c10::optional<at::Tensor> & lengths, const c10::optional<at::Tensor> & offsets, int64_t axis, const c10::optional<at::Scalar> & initial, at::Tensor & out) {
+        return at::_ops::_segment_reduce_backward_out::redispatch(dispatchKeySet, grad, output, data, reduce, lengths, offsets, axis, initial, out);
+    }
+    
+    // aten::_nested_tensor_from_tensor_list.out(Tensor[] list, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _nested_tensor_from_tensor_list_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, at::TensorList list, c10::optional<at::ScalarType> dtype=c10::nullopt, c10::optional<at::Layout> layout=c10::nullopt, c10::optional<at::Device> device=c10::nullopt, c10::optional<bool> pin_memory=c10::nullopt) {
+        return at::_ops::_nested_tensor_from_tensor_list_out::redispatch(dispatchKeySet, list, dtype, layout, device, pin_memory, out);
+    }
+    
+    // aten::_nested_tensor_from_tensor_list.out(Tensor[] list, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _nested_tensor_from_tensor_list_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList list, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, at::Tensor & out) {
+        return at::_ops::_nested_tensor_from_tensor_list_out::redispatch(dispatchKeySet, list, dtype, layout, device, pin_memory, out);
+    }
+    
+    // aten::_fw_primal_copy.out(Tensor self, int level, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _fw_primal_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t level) {
+        return at::_ops::_fw_primal_copy_out::redispatch(dispatchKeySet, self, level, out);
+    }
+    
+    // aten::_fw_primal_copy.out(Tensor self, int level, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _fw_primal_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t level, at::Tensor & out) {
+        return at::_ops::_fw_primal_copy_out::redispatch(dispatchKeySet, self, level, out);
+    }
+    
+    // aten::_make_dual_copy.out(Tensor primal, Tensor tangent, int level, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _make_dual_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & primal, const at::Tensor & tangent, int64_t level) {
+        return at::_ops::_make_dual_copy_out::redispatch(dispatchKeySet, primal, tangent, level, out);
+    }
+    
+    // aten::_make_dual_copy.out(Tensor primal, Tensor tangent, int level, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _make_dual_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & primal, const at::Tensor & tangent, int64_t level, at::Tensor & out) {
+        return at::_ops::_make_dual_copy_out::redispatch(dispatchKeySet, primal, tangent, level, out);
+    }
+    
+    // aten::view_as_real_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & view_as_real_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::view_as_real_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::view_as_real_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & view_as_real_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::view_as_real_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::view_as_complex_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & view_as_complex_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::view_as_complex_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::view_as_complex_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & view_as_complex_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::view_as_complex_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_conj_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _conj_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::_conj_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_conj_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _conj_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::_conj_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_neg_view_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _neg_view_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::_neg_view_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_neg_view_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _neg_view_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::_neg_view_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::as_strided_copy.out(Tensor self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & as_strided_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride, c10::optional<int64_t> storage_offset=c10::nullopt) {
+        return at::_ops::as_strided_copy_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), storage_offset.has_value() ? c10::make_optional(c10::SymInt(*storage_offset)) : c10::nullopt, out);
+    }
+    
+    // aten::as_strided_copy.out(Tensor self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & as_strided_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride, c10::optional<int64_t> storage_offset, at::Tensor & out) {
+        return at::_ops::as_strided_copy_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), storage_offset.has_value() ? c10::make_optional(c10::SymInt(*storage_offset)) : c10::nullopt, out);
+    }
+    
+    // aten::as_strided_copy.out(Tensor self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & as_strided_copy_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional<c10::SymInt> storage_offset=c10::nullopt) {
+        return at::_ops::as_strided_copy_out::redispatch(dispatchKeySet, self, size, stride, storage_offset, out);
+    }
+    
+    // aten::as_strided_copy.out(Tensor self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & as_strided_copy_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional<c10::SymInt> storage_offset, at::Tensor & out) {
+        return at::_ops::as_strided_copy_out::redispatch(dispatchKeySet, self, size, stride, storage_offset, out);
+    }
+    
+    // aten::_sparse_broadcast_to_copy.out(Tensor self, int[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _sparse_broadcast_to_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef size) {
+        return at::_ops::_sparse_broadcast_to_copy_out::redispatch(dispatchKeySet, self, size, out);
+    }
+    
+    // aten::_sparse_broadcast_to_copy.out(Tensor self, int[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _sparse_broadcast_to_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, at::Tensor & out) {
+        return at::_ops::_sparse_broadcast_to_copy_out::redispatch(dispatchKeySet, self, size, out);
+    }
+    
+    // aten::diagonal_copy.out(Tensor self, int offset=0, int dim1=0, int dim2=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & diagonal_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t offset=0, int64_t dim1=0, int64_t dim2=1) {
+        return at::_ops::diagonal_copy_out::redispatch(dispatchKeySet, self, offset, dim1, dim2, out);
+    }
+    
+    // aten::diagonal_copy.out(Tensor self, int offset=0, int dim1=0, int dim2=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & diagonal_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t offset, int64_t dim1, int64_t dim2, at::Tensor & out) {
+        return at::_ops::diagonal_copy_out::redispatch(dispatchKeySet, self, offset, dim1, dim2, out);
+    }
+    
+    // aten::expand_copy.out(Tensor self, SymInt[] size, *, bool implicit=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & expand_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef size, bool implicit=false) {
+        return at::_ops::expand_copy_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), implicit, out);
+    }
+    
+    // aten::expand_copy.out(Tensor self, SymInt[] size, *, bool implicit=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & expand_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, bool implicit, at::Tensor & out) {
+        return at::_ops::expand_copy_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), implicit, out);
+    }
+    
+    // aten::expand_copy.out(Tensor self, SymInt[] size, *, bool implicit=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & expand_copy_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef size, bool implicit=false) {
+        return at::_ops::expand_copy_out::redispatch(dispatchKeySet, self, size, implicit, out);
+    }
+    
+    // aten::expand_copy.out(Tensor self, SymInt[] size, *, bool implicit=False, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & expand_copy_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, bool implicit, at::Tensor & out) {
+        return at::_ops::expand_copy_out::redispatch(dispatchKeySet, self, size, implicit, out);
+    }
+    
+    // aten::permute_copy.out(Tensor self, int[] dims, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & permute_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef dims) {
+        return at::_ops::permute_copy_out::redispatch(dispatchKeySet, self, dims, out);
+    }
+    
+    // aten::permute_copy.out(Tensor self, int[] dims, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & permute_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dims, at::Tensor & out) {
+        return at::_ops::permute_copy_out::redispatch(dispatchKeySet, self, dims, out);
+    }
+    
+    // aten::_reshape_alias_copy.out(Tensor self, SymInt[] size, SymInt[] stride, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _reshape_alias_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride) {
+        return at::_ops::_reshape_alias_copy_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), out);
+    }
+    
+    // aten::_reshape_alias_copy.out(Tensor self, SymInt[] size, SymInt[] stride, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _reshape_alias_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride, at::Tensor & out) {
+        return at::_ops::_reshape_alias_copy_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), out);
+    }
+    
+    // aten::_reshape_alias_copy.out(Tensor self, SymInt[] size, SymInt[] stride, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _reshape_alias_copy_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride) {
+        return at::_ops::_reshape_alias_copy_out::redispatch(dispatchKeySet, self, size, stride, out);
+    }
+    
+    // aten::_reshape_alias_copy.out(Tensor self, SymInt[] size, SymInt[] stride, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _reshape_alias_copy_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, at::Tensor & out) {
+        return at::_ops::_reshape_alias_copy_out::redispatch(dispatchKeySet, self, size, stride, out);
+    }
+    
+    // aten::select_copy.int_out(Tensor self, int dim, SymInt index, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & select_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim, int64_t index) {
+        return at::_ops::select_copy_int_out::redispatch(dispatchKeySet, self, dim, index, out);
+    }
+    
+    // aten::select_copy.int_out(Tensor self, int dim, SymInt index, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & select_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, int64_t index, at::Tensor & out) {
+        return at::_ops::select_copy_int_out::redispatch(dispatchKeySet, self, dim, index, out);
+    }
+    
+    // aten::select_copy.int_out(Tensor self, int dim, SymInt index, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & select_copy_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim, c10::SymInt index) {
+        return at::_ops::select_copy_int_out::redispatch(dispatchKeySet, self, dim, index, out);
+    }
+    
+    // aten::select_copy.int_out(Tensor self, int dim, SymInt index, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & select_copy_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, c10::SymInt index, at::Tensor & out) {
+        return at::_ops::select_copy_int_out::redispatch(dispatchKeySet, self, dim, index, out);
+    }
+    
+    // aten::detach_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & detach_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::detach_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::detach_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & detach_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::detach_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::slice_copy.Tensor_out(Tensor self, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & slice_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim=0, c10::optional<int64_t> start=c10::nullopt, c10::optional<int64_t> end=c10::nullopt, int64_t step=1) {
+        return at::_ops::slice_copy_Tensor_out::redispatch(dispatchKeySet, self, dim, start.has_value() ? c10::make_optional(c10::SymInt(*start)) : c10::nullopt, end.has_value() ? c10::make_optional(c10::SymInt(*end)) : c10::nullopt, step, out);
+    }
+    
+    // aten::slice_copy.Tensor_out(Tensor self, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & slice_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, c10::optional<int64_t> start, c10::optional<int64_t> end, int64_t step, at::Tensor & out) {
+        return at::_ops::slice_copy_Tensor_out::redispatch(dispatchKeySet, self, dim, start.has_value() ? c10::make_optional(c10::SymInt(*start)) : c10::nullopt, end.has_value() ? c10::make_optional(c10::SymInt(*end)) : c10::nullopt, step, out);
+    }
+    
+    // aten::slice_copy.Tensor_out(Tensor self, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & slice_copy_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim=0, c10::optional<c10::SymInt> start=c10::nullopt, c10::optional<c10::SymInt> end=c10::nullopt, c10::SymInt step=1) {
+        return at::_ops::slice_copy_Tensor_out::redispatch(dispatchKeySet, self, dim, start, end, step, out);
+    }
+    
+    // aten::slice_copy.Tensor_out(Tensor self, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & slice_copy_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, c10::optional<c10::SymInt> start, c10::optional<c10::SymInt> end, c10::SymInt step, at::Tensor & out) {
+        return at::_ops::slice_copy_Tensor_out::redispatch(dispatchKeySet, self, dim, start, end, step, out);
+    }
+    
+    // aten::squeeze_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & squeeze_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::squeeze_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::squeeze_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & squeeze_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::squeeze_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::squeeze_copy.dim_out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & squeeze_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim) {
+        return at::_ops::squeeze_copy_dim_out::redispatch(dispatchKeySet, self, dim, out);
+    }
+    
+    // aten::squeeze_copy.dim_out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & squeeze_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, at::Tensor & out) {
+        return at::_ops::squeeze_copy_dim_out::redispatch(dispatchKeySet, self, dim, out);
+    }
+    
+    // aten::squeeze_copy.dims_out(Tensor self, int[] dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & squeeze_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef dim) {
+        return at::_ops::squeeze_copy_dims_out::redispatch(dispatchKeySet, self, dim, out);
+    }
+    
+    // aten::squeeze_copy.dims_out(Tensor self, int[] dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & squeeze_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, at::Tensor & out) {
+        return at::_ops::squeeze_copy_dims_out::redispatch(dispatchKeySet, self, dim, out);
+    }
+    
+    // aten::t_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & t_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::t_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::t_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & t_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::t_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::transpose_copy.int_out(Tensor self, int dim0, int dim1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & transpose_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim0, int64_t dim1) {
+        return at::_ops::transpose_copy_int_out::redispatch(dispatchKeySet, self, dim0, dim1, out);
+    }
+    
+    // aten::transpose_copy.int_out(Tensor self, int dim0, int dim1, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & transpose_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim0, int64_t dim1, at::Tensor & out) {
+        return at::_ops::transpose_copy_int_out::redispatch(dispatchKeySet, self, dim0, dim1, out);
+    }
+    
+    // aten::unsqueeze_copy.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & unsqueeze_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dim) {
+        return at::_ops::unsqueeze_copy_out::redispatch(dispatchKeySet, self, dim, out);
+    }
+    
+    // aten::unsqueeze_copy.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & unsqueeze_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, at::Tensor & out) {
+        return at::_ops::unsqueeze_copy_out::redispatch(dispatchKeySet, self, dim, out);
+    }
+    
+    // aten::_indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _indices_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::_indices_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _indices_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::_indices_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_values_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _values_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::_values_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::_values_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _values_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::_values_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & indices_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::indices_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & indices_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::indices_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::values_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & values_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::values_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::values_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & values_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::values_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::crow_indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & crow_indices_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::crow_indices_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::crow_indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & crow_indices_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::crow_indices_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::col_indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & col_indices_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::col_indices_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::col_indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & col_indices_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::col_indices_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::ccol_indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & ccol_indices_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::ccol_indices_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::ccol_indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & ccol_indices_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::ccol_indices_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::row_indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & row_indices_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::row_indices_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::row_indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & row_indices_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::row_indices_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::view_copy.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & view_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::IntArrayRef size) {
+        return at::_ops::view_copy_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), out);
+    }
+    
+    // aten::view_copy.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & view_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, at::Tensor & out) {
+        return at::_ops::view_copy_out::redispatch(dispatchKeySet, self, c10::fromIntArrayRefSlow(size), out);
+    }
+    
+    // aten::view_copy.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & view_copy_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef size) {
+        return at::_ops::view_copy_out::redispatch(dispatchKeySet, self, size, out);
+    }
+    
+    // aten::view_copy.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & view_copy_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, at::Tensor & out) {
+        return at::_ops::view_copy_out::redispatch(dispatchKeySet, self, size, out);
+    }
+    
+    // aten::view_copy.dtype_out(Tensor self, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & view_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, at::ScalarType dtype) {
+        return at::_ops::view_copy_dtype_out::redispatch(dispatchKeySet, self, dtype, out);
+    }
+    
+    // aten::view_copy.dtype_out(Tensor self, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & view_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::ScalarType dtype, at::Tensor & out) {
+        return at::_ops::view_copy_dtype_out::redispatch(dispatchKeySet, self, dtype, out);
+    }
+    
+    // aten::unfold_copy.out(Tensor self, int dimension, int size, int step, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & unfold_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, int64_t dimension, int64_t size, int64_t step) {
+        return at::_ops::unfold_copy_out::redispatch(dispatchKeySet, self, dimension, size, step, out);
+    }
+    
+    // aten::unfold_copy.out(Tensor self, int dimension, int size, int step, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & unfold_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dimension, int64_t size, int64_t step, at::Tensor & out) {
+        return at::_ops::unfold_copy_out::redispatch(dispatchKeySet, self, dimension, size, step, out);
+    }
+    
+    // aten::alias_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & alias_copy_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self) {
+        return at::_ops::alias_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::alias_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & alias_copy_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out) {
+        return at::_ops::alias_copy_out::redispatch(dispatchKeySet, self, out);
+    }
+    
+    // aten::to_padded_tensor.out(Tensor self, float padding, SymInt[]? output_size=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & to_padded_tensor_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, double padding, at::OptionalIntArrayRef output_size=c10::nullopt) {
+        return at::_ops::to_padded_tensor_out::redispatch(dispatchKeySet, self, padding, output_size.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*output_size)) : c10::nullopt, out);
+    }
+    
+    // aten::to_padded_tensor.out(Tensor self, float padding, SymInt[]? output_size=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & to_padded_tensor_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double padding, at::OptionalIntArrayRef output_size, at::Tensor & out) {
+        return at::_ops::to_padded_tensor_out::redispatch(dispatchKeySet, self, padding, output_size.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*output_size)) : c10::nullopt, out);
+    }
+    
+    // aten::to_padded_tensor.out(Tensor self, float padding, SymInt[]? output_size=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & to_padded_tensor_symint_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, double padding, at::OptionalSymIntArrayRef output_size=c10::nullopt) {
+        return at::_ops::to_padded_tensor_out::redispatch(dispatchKeySet, self, padding, output_size, out);
+    }
+    
+    // aten::to_padded_tensor.out(Tensor self, float padding, SymInt[]? output_size=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & to_padded_tensor_symint_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double padding, at::OptionalSymIntArrayRef output_size, at::Tensor & out) {
+        return at::_ops::to_padded_tensor_out::redispatch(dispatchKeySet, self, padding, output_size, out);
+    }
+    
+    // aten::_transformer_encoder_layer_fwd.out(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, int? mask_type=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _transformer_encoder_layer_fwd_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & src, int64_t embed_dim, int64_t num_heads, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, bool use_gelu, bool norm_first, double eps, const at::Tensor & norm_weight_1, const at::Tensor & norm_bias_1, const at::Tensor & norm_weight_2, const at::Tensor & norm_bias_2, const at::Tensor & ffn_weight_1, const at::Tensor & ffn_bias_1, const at::Tensor & ffn_weight_2, const at::Tensor & ffn_bias_2, const c10::optional<at::Tensor> & mask={}, c10::optional<int64_t> mask_type=c10::nullopt) {
+        return at::_ops::_transformer_encoder_layer_fwd_out::redispatch(dispatchKeySet, src, embed_dim, num_heads, qkv_weight, qkv_bias, proj_weight, proj_bias, use_gelu, norm_first, eps, norm_weight_1, norm_bias_1, norm_weight_2, norm_bias_2, ffn_weight_1, ffn_bias_1, ffn_weight_2, ffn_bias_2, mask, mask_type, out);
+    }
+    
+    // aten::_transformer_encoder_layer_fwd.out(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, int? mask_type=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _transformer_encoder_layer_fwd_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & src, int64_t embed_dim, int64_t num_heads, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, bool use_gelu, bool norm_first, double eps, const at::Tensor & norm_weight_1, const at::Tensor & norm_bias_1, const at::Tensor & norm_weight_2, const at::Tensor & norm_bias_2, const at::Tensor & ffn_weight_1, const at::Tensor & ffn_bias_1, const at::Tensor & ffn_weight_2, const at::Tensor & ffn_bias_2, const c10::optional<at::Tensor> & mask, c10::optional<int64_t> mask_type, at::Tensor & out) {
+        return at::_ops::_transformer_encoder_layer_fwd_out::redispatch(dispatchKeySet, src, embed_dim, num_heads, qkv_weight, qkv_bias, proj_weight, proj_bias, use_gelu, norm_first, eps, norm_weight_1, norm_bias_1, norm_weight_2, norm_bias_2, ffn_weight_1, ffn_bias_1, ffn_weight_2, ffn_bias_2, mask, mask_type, out);
+    }
+    
+    // aten::_native_multi_head_attention.out(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None, bool need_weights=True, bool average_attn_weights=True, int? mask_type=None, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> _native_multi_head_attention_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out0, at::Tensor & out1, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, int64_t embed_dim, int64_t num_head, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, const c10::optional<at::Tensor> & mask={}, bool need_weights=true, bool average_attn_weights=true, c10::optional<int64_t> mask_type=c10::nullopt) {
+        return at::_ops::_native_multi_head_attention_out::redispatch(dispatchKeySet, query, key, value, embed_dim, num_head, qkv_weight, qkv_bias, proj_weight, proj_bias, mask, need_weights, average_attn_weights, mask_type, out0, out1);
+    }
+    
+    // aten::_native_multi_head_attention.out(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None, bool need_weights=True, bool average_attn_weights=True, int? mask_type=None, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+    inline ::std::tuple<at::Tensor &,at::Tensor &> _native_multi_head_attention_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, int64_t embed_dim, int64_t num_head, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, const c10::optional<at::Tensor> & mask, bool need_weights, bool average_attn_weights, c10::optional<int64_t> mask_type, at::Tensor & out0, at::Tensor & out1) {
+        return at::_ops::_native_multi_head_attention_out::redispatch(dispatchKeySet, query, key, value, embed_dim, num_head, qkv_weight, qkv_bias, proj_weight, proj_bias, mask, need_weights, average_attn_weights, mask_type, out0, out1);
+    }
+    
+    // aten::_triton_scaled_dot_attention.out(Tensor q, Tensor k, Tensor v, float dropout_p=0.0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _triton_scaled_dot_attention_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & q, const at::Tensor & k, const at::Tensor & v, double dropout_p=0.0) {
+        return at::_ops::_triton_scaled_dot_attention_out::redispatch(dispatchKeySet, q, k, v, dropout_p, out);
+    }
+    
+    // aten::_triton_scaled_dot_attention.out(Tensor q, Tensor k, Tensor v, float dropout_p=0.0, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _triton_scaled_dot_attention_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & q, const at::Tensor & k, const at::Tensor & v, double dropout_p, at::Tensor & out) {
+        return at::_ops::_triton_scaled_dot_attention_out::redispatch(dispatchKeySet, q, k, v, dropout_p, out);
+    }
+    
+    // aten::_triton_multi_head_attention.out(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _triton_multi_head_attention_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, int64_t embed_dim, int64_t num_head, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, const c10::optional<at::Tensor> & mask={}) {
+        return at::_ops::_triton_multi_head_attention_out::redispatch(dispatchKeySet, query, key, value, embed_dim, num_head, qkv_weight, qkv_bias, proj_weight, proj_bias, mask, out);
+    }
+    
+    // aten::_triton_multi_head_attention.out(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None, *, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _triton_multi_head_attention_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, int64_t embed_dim, int64_t num_head, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, const c10::optional<at::Tensor> & mask, at::Tensor & out) {
+        return at::_ops::_triton_multi_head_attention_out::redispatch(dispatchKeySet, query, key, value, embed_dim, num_head, qkv_weight, qkv_bias, proj_weight, proj_bias, mask, out);
+    }
+    
+    // aten::_foobar.out(Tensor self, bool arg1=True, bool arg2=True, *, bool arg3=True, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _foobar_out(c10::DispatchKeySet dispatchKeySet, at::Tensor & out, const at::Tensor & self, bool arg1=true, bool arg2=true, bool arg3=true) {
+        return at::_ops::_foobar_out::redispatch(dispatchKeySet, self, arg1, arg2, arg3, out);
+    }
+    
+    // aten::_foobar.out(Tensor self, bool arg1=True, bool arg2=True, *, bool arg3=True, Tensor(a!) out) -> Tensor(a!)
+    inline at::Tensor & _foobar_outf(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool arg1, bool arg2, bool arg3, at::Tensor & out) {
+        return at::_ops::_foobar_out::redispatch(dispatchKeySet, self, arg1, arg2, arg3, out);
+    }
+    
+    // aten::_fused_adam.out(Tensor[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None, Tensor(a!)[] out) -> ()
+    inline void _fused_adam_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::TensorList grads, at::TensorList exp_avgs, at::TensorList exp_avg_sqs, at::TensorList max_exp_avg_sqs, at::TensorList state_steps, double lr, double beta1, double beta2, double weight_decay, double eps, bool amsgrad, bool maximize, const c10::optional<at::Tensor> & grad_scale={}, const c10::optional<at::Tensor> & found_inf={}) {
+        return at::_ops::_fused_adam_out::redispatch(dispatchKeySet, self, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, lr, beta1, beta2, weight_decay, eps, amsgrad, maximize, grad_scale, found_inf, out);
+    }
+    
+    // aten::_fused_adam.out(Tensor[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None, Tensor(a!)[] out) -> ()
+    inline void _fused_adam_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList grads, at::TensorList exp_avgs, at::TensorList exp_avg_sqs, at::TensorList max_exp_avg_sqs, at::TensorList state_steps, double lr, double beta1, double beta2, double weight_decay, double eps, bool amsgrad, bool maximize, const c10::optional<at::Tensor> & grad_scale, const c10::optional<at::Tensor> & found_inf, at::TensorList out) {
+        return at::_ops::_fused_adam_out::redispatch(dispatchKeySet, self, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, lr, beta1, beta2, weight_decay, eps, amsgrad, maximize, grad_scale, found_inf, out);
+    }
+    
+    // aten::_fused_adam(Tensor[] self, Tensor[] grads, Tensor[] exp_avgs, Tensor[] exp_avg_sqs, Tensor[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> (Tensor[] self_out, Tensor[] grads_out, Tensor[] exp_avgs_out, Tensor[] exp_avg_sqs_out, Tensor[] max_exp_avg_sqs_out)
+    inline ::std::tuple<::std::vector<at::Tensor>,::std::vector<at::Tensor>,::std::vector<at::Tensor>,::std::vector<at::Tensor>,::std::vector<at::Tensor>> _fused_adam(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList grads, at::TensorList exp_avgs, at::TensorList exp_avg_sqs, at::TensorList max_exp_avg_sqs, at::TensorList state_steps, double lr, double beta1, double beta2, double weight_decay, double eps, bool amsgrad, bool maximize, const c10::optional<at::Tensor> & grad_scale={}, const c10::optional<at::Tensor> & found_inf={}) {
+        return at::_ops::_fused_adam::redispatch(dispatchKeySet, self, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, lr, beta1, beta2, weight_decay, eps, amsgrad, maximize, grad_scale, found_inf);
+    }
+    
+    // aten::_fused_adam.tensor_lr_out(Tensor[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, Tensor lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None, Tensor(a!)[] out) -> ()
+    inline void _fused_adam_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::TensorList grads, at::TensorList exp_avgs, at::TensorList exp_avg_sqs, at::TensorList max_exp_avg_sqs, at::TensorList state_steps, const at::Tensor & lr, double beta1, double beta2, double weight_decay, double eps, bool amsgrad, bool maximize, const c10::optional<at::Tensor> & grad_scale={}, const c10::optional<at::Tensor> & found_inf={}) {
+        return at::_ops::_fused_adam_tensor_lr_out::redispatch(dispatchKeySet, self, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, lr, beta1, beta2, weight_decay, eps, amsgrad, maximize, grad_scale, found_inf, out);
+    }
+    
+    // aten::_fused_adam.tensor_lr_out(Tensor[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, Tensor lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None, Tensor(a!)[] out) -> ()
+    inline void _fused_adam_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList grads, at::TensorList exp_avgs, at::TensorList exp_avg_sqs, at::TensorList max_exp_avg_sqs, at::TensorList state_steps, const at::Tensor & lr, double beta1, double beta2, double weight_decay, double eps, bool amsgrad, bool maximize, const c10::optional<at::Tensor> & grad_scale, const c10::optional<at::Tensor> & found_inf, at::TensorList out) {
+        return at::_ops::_fused_adam_tensor_lr_out::redispatch(dispatchKeySet, self, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, lr, beta1, beta2, weight_decay, eps, amsgrad, maximize, grad_scale, found_inf, out);
+    }
+    
+    // aten::_fused_adam.tensor_lr(Tensor[] self, Tensor[] grads, Tensor[] exp_avgs, Tensor[] exp_avg_sqs, Tensor[] max_exp_avg_sqs, Tensor[] state_steps, *, Tensor lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> (Tensor[] self_out, Tensor[] grads_out, Tensor[] exp_avgs_out, Tensor[] exp_avg_sqs_out, Tensor[] max_exp_avg_sqs_out)
+    inline ::std::tuple<::std::vector<at::Tensor>,::std::vector<at::Tensor>,::std::vector<at::Tensor>,::std::vector<at::Tensor>,::std::vector<at::Tensor>> _fused_adam(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList grads, at::TensorList exp_avgs, at::TensorList exp_avg_sqs, at::TensorList max_exp_avg_sqs, at::TensorList state_steps, const at::Tensor & lr, double beta1, double beta2, double weight_decay, double eps, bool amsgrad, bool maximize, const c10::optional<at::Tensor> & grad_scale={}, const c10::optional<at::Tensor> & found_inf={}) {
+        return at::_ops::_fused_adam_tensor_lr::redispatch(dispatchKeySet, self, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, lr, beta1, beta2, weight_decay, eps, amsgrad, maximize, grad_scale, found_inf);
+    }
+    
+    // aten::_fused_adamw.out(Tensor[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None, Tensor(a!)[] out) -> ()
+    inline void _fused_adamw_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::TensorList grads, at::TensorList exp_avgs, at::TensorList exp_avg_sqs, at::TensorList max_exp_avg_sqs, at::TensorList state_steps, double lr, double beta1, double beta2, double weight_decay, double eps, bool amsgrad, bool maximize, const c10::optional<at::Tensor> & grad_scale={}, const c10::optional<at::Tensor> & found_inf={}) {
+        return at::_ops::_fused_adamw_out::redispatch(dispatchKeySet, self, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, lr, beta1, beta2, weight_decay, eps, amsgrad, maximize, grad_scale, found_inf, out);
+    }
+    
+    // aten::_fused_adamw.out(Tensor[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None, Tensor(a!)[] out) -> ()
+    inline void _fused_adamw_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList grads, at::TensorList exp_avgs, at::TensorList exp_avg_sqs, at::TensorList max_exp_avg_sqs, at::TensorList state_steps, double lr, double beta1, double beta2, double weight_decay, double eps, bool amsgrad, bool maximize, const c10::optional<at::Tensor> & grad_scale, const c10::optional<at::Tensor> & found_inf, at::TensorList out) {
+        return at::_ops::_fused_adamw_out::redispatch(dispatchKeySet, self, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, lr, beta1, beta2, weight_decay, eps, amsgrad, maximize, grad_scale, found_inf, out);
+    }
+    
+    // aten::_fused_adamw(Tensor[] self, Tensor[] grads, Tensor[] exp_avgs, Tensor[] exp_avg_sqs, Tensor[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> (Tensor[] self_out, Tensor[] grads_out, Tensor[] exp_avgs_out, Tensor[] exp_avg_sqs_out, Tensor[] max_exp_avg_sqs_out)
+    inline ::std::tuple<::std::vector<at::Tensor>,::std::vector<at::Tensor>,::std::vector<at::Tensor>,::std::vector<at::Tensor>,::std::vector<at::Tensor>> _fused_adamw(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList grads, at::TensorList exp_avgs, at::TensorList exp_avg_sqs, at::TensorList max_exp_avg_sqs, at::TensorList state_steps, double lr, double beta1, double beta2, double weight_decay, double eps, bool amsgrad, bool maximize, const c10::optional<at::Tensor> & grad_scale={}, const c10::optional<at::Tensor> & found_inf={}) {
+        return at::_ops::_fused_adamw::redispatch(dispatchKeySet, self, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, lr, beta1, beta2, weight_decay, eps, amsgrad, maximize, grad_scale, found_inf);
+    }
+    
+    // aten::_fused_adamw.tensor_lr_out(Tensor[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, Tensor lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None, Tensor(a!)[] out) -> ()
+    inline void _fused_adamw_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::TensorList grads, at::TensorList exp_avgs, at::TensorList exp_avg_sqs, at::TensorList max_exp_avg_sqs, at::TensorList state_steps, const at::Tensor & lr, double beta1, double beta2, double weight_decay, double eps, bool amsgrad, bool maximize, const c10::optional<at::Tensor> & grad_scale={}, const c10::optional<at::Tensor> & found_inf={}) {
+        return at::_ops::_fused_adamw_tensor_lr_out::redispatch(dispatchKeySet, self, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, lr, beta1, beta2, weight_decay, eps, amsgrad, maximize, grad_scale, found_inf, out);
+    }
+    
+    // aten::_fused_adamw.tensor_lr_out(Tensor[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, Tensor lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None, Tensor(a!)[] out) -> ()
+    inline void _fused_adamw_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList grads, at::TensorList exp_avgs, at::TensorList exp_avg_sqs, at::TensorList max_exp_avg_sqs, at::TensorList state_steps, const at::Tensor & lr, double beta1, double beta2, double weight_decay, double eps, bool amsgrad, bool maximize, const c10::optional<at::Tensor> & grad_scale, const c10::optional<at::Tensor> & found_inf, at::TensorList out) {
+        return at::_ops::_fused_adamw_tensor_lr_out::redispatch(dispatchKeySet, self, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, lr, beta1, beta2, weight_decay, eps, amsgrad, maximize, grad_scale, found_inf, out);
+    }
+    
+    // aten::_fused_adamw.tensor_lr(Tensor[] self, Tensor[] grads, Tensor[] exp_avgs, Tensor[] exp_avg_sqs, Tensor[] max_exp_avg_sqs, Tensor[] state_steps, *, Tensor lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> (Tensor[] self_out, Tensor[] grads_out, Tensor[] exp_avgs_out, Tensor[] exp_avg_sqs_out, Tensor[] max_exp_avg_sqs_out)
+    inline ::std::tuple<::std::vector<at::Tensor>,::std::vector<at::Tensor>,::std::vector<at::Tensor>,::std::vector<at::Tensor>,::std::vector<at::Tensor>> _fused_adamw(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList grads, at::TensorList exp_avgs, at::TensorList exp_avg_sqs, at::TensorList max_exp_avg_sqs, at::TensorList state_steps, const at::Tensor & lr, double beta1, double beta2, double weight_decay, double eps, bool amsgrad, bool maximize, const c10::optional<at::Tensor> & grad_scale={}, const c10::optional<at::Tensor> & found_inf={}) {
+        return at::_ops::_fused_adamw_tensor_lr::redispatch(dispatchKeySet, self, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, lr, beta1, beta2, weight_decay, eps, amsgrad, maximize, grad_scale, found_inf);
+    }
+    
+    // aten::_fused_sgd.out(Tensor[] self, Tensor(b!)[] grads, Tensor(c!)[] momentum_buffer_list, *, float weight_decay, float momentum, float lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None, Tensor(a!)[] out) -> ()
+    inline void _fused_sgd_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::TensorList grads, at::TensorList momentum_buffer_list, double weight_decay, double momentum, double lr, double dampening, bool nesterov, bool maximize, bool is_first_step, const c10::optional<at::Tensor> & grad_scale={}, const c10::optional<at::Tensor> & found_inf={}) {
+        return at::_ops::_fused_sgd_out::redispatch(dispatchKeySet, self, grads, momentum_buffer_list, weight_decay, momentum, lr, dampening, nesterov, maximize, is_first_step, grad_scale, found_inf, out);
+    }
+    
+    // aten::_fused_sgd.out(Tensor[] self, Tensor(b!)[] grads, Tensor(c!)[] momentum_buffer_list, *, float weight_decay, float momentum, float lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None, Tensor(a!)[] out) -> ()
+    inline void _fused_sgd_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList grads, at::TensorList momentum_buffer_list, double weight_decay, double momentum, double lr, double dampening, bool nesterov, bool maximize, bool is_first_step, const c10::optional<at::Tensor> & grad_scale, const c10::optional<at::Tensor> & found_inf, at::TensorList out) {
+        return at::_ops::_fused_sgd_out::redispatch(dispatchKeySet, self, grads, momentum_buffer_list, weight_decay, momentum, lr, dampening, nesterov, maximize, is_first_step, grad_scale, found_inf, out);
+    }
+    
+    // aten::_fused_sgd(Tensor[] self, Tensor[] grads, Tensor[] momentum_buffer_list, *, float weight_decay, float momentum, float lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None) -> (Tensor[] self_out, Tensor[] grads_out, Tensor[] momentum_buffer_list_out)
+    inline ::std::tuple<::std::vector<at::Tensor>,::std::vector<at::Tensor>,::std::vector<at::Tensor>> _fused_sgd(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList grads, at::TensorList momentum_buffer_list, double weight_decay, double momentum, double lr, double dampening, bool nesterov, bool maximize, bool is_first_step, const c10::optional<at::Tensor> & grad_scale={}, const c10::optional<at::Tensor> & found_inf={}) {
+        return at::_ops::_fused_sgd::redispatch(dispatchKeySet, self, grads, momentum_buffer_list, weight_decay, momentum, lr, dampening, nesterov, maximize, is_first_step, grad_scale, found_inf);
+    }
+    
+    // aten::_fused_sgd.tensor_lr_out(Tensor[] self, Tensor(b!)[] grads, Tensor(c!)[] momentum_buffer_list, *, float weight_decay, float momentum, Tensor lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None, Tensor(a!)[] out) -> ()
+    inline void _fused_sgd_out(c10::DispatchKeySet dispatchKeySet, at::TensorList out, at::TensorList self, at::TensorList grads, at::TensorList momentum_buffer_list, double weight_decay, double momentum, const at::Tensor & lr, double dampening, bool nesterov, bool maximize, bool is_first_step, const c10::optional<at::Tensor> & grad_scale={}, const c10::optional<at::Tensor> & found_inf={}) {
+        return at::_ops::_fused_sgd_tensor_lr_out::redispatch(dispatchKeySet, self, grads, momentum_buffer_list, weight_decay, momentum, lr, dampening, nesterov, maximize, is_first_step, grad_scale, found_inf, out);
+    }
+    
+    // aten::_fused_sgd.tensor_lr_out(Tensor[] self, Tensor(b!)[] grads, Tensor(c!)[] momentum_buffer_list, *, float weight_decay, float momentum, Tensor lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None, Tensor(a!)[] out) -> ()
+    inline void _fused_sgd_outf(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList grads, at::TensorList momentum_buffer_list, double weight_decay, double momentum, const at::Tensor & lr, double dampening, bool nesterov, bool maximize, bool is_first_step, const c10::optional<at::Tensor> & grad_scale, const c10::optional<at::Tensor> & found_inf, at::TensorList out) {
+        return at::_ops::_fused_sgd_tensor_lr_out::redispatch(dispatchKeySet, self, grads, momentum_buffer_list, weight_decay, momentum, lr, dampening, nesterov, maximize, is_first_step, grad_scale, found_inf, out);
+    }
+    
+    // aten::_fused_sgd.tensor_lr(Tensor[] self, Tensor[] grads, Tensor[] momentum_buffer_list, *, float weight_decay, float momentum, Tensor lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None) -> (Tensor[] self_out, Tensor[] grads_out, Tensor[] momentum_buffer_list_out)
+    inline ::std::tuple<::std::vector<at::Tensor>,::std::vector<at::Tensor>,::std::vector<at::Tensor>> _fused_sgd(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList grads, at::TensorList momentum_buffer_list, double weight_decay, double momentum, const at::Tensor & lr, double dampening, bool nesterov, bool maximize, bool is_first_step, const c10::optional<at::Tensor> & grad_scale={}, const c10::optional<at::Tensor> & found_inf={}) {
+        return at::_ops::_fused_sgd_tensor_lr::redispatch(dispatchKeySet, self, grads, momentum_buffer_list, weight_decay, momentum, lr, dampening, nesterov, maximize, is_first_step, grad_scale, found_inf);
+    }
+} // namespace redispatch
+
+}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/RegistrationDeclarations.h b/MLPY/Lib/site-packages/torch/include/ATen/RegistrationDeclarations.h
new file mode 100644
index 0000000000000000000000000000000000000000..efd957c9e256baddec2135a1408f57a202aa1242
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/RegistrationDeclarations.h
@@ -0,0 +1,3099 @@
+// This file contains all native_functions that can be registered to
+// and the schema string that they should be registered with
+
+Tensor _cast_Byte(const Tensor & self, bool non_blocking); // {"schema": "aten::_cast_Byte(Tensor self, bool non_blocking=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _cast_Char(const Tensor & self, bool non_blocking); // {"schema": "aten::_cast_Char(Tensor self, bool non_blocking=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _cast_Double(const Tensor & self, bool non_blocking); // {"schema": "aten::_cast_Double(Tensor self, bool non_blocking=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _cast_Float(const Tensor & self, bool non_blocking); // {"schema": "aten::_cast_Float(Tensor self, bool non_blocking=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _cast_Int(const Tensor & self, bool non_blocking); // {"schema": "aten::_cast_Int(Tensor self, bool non_blocking=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _cast_Long(const Tensor & self, bool non_blocking); // {"schema": "aten::_cast_Long(Tensor self, bool non_blocking=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _cast_Short(const Tensor & self, bool non_blocking); // {"schema": "aten::_cast_Short(Tensor self, bool non_blocking=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _cast_Half(const Tensor & self, bool non_blocking); // {"schema": "aten::_cast_Half(Tensor self, bool non_blocking=False) -> Tensor", "dispatch": "False", "default": "True"}
+void _backward(const Tensor & self, TensorList inputs, const c10::optional<Tensor> & gradient, c10::optional<bool> retain_graph, bool create_graph); // {"schema": "aten::_backward(Tensor self, Tensor[] inputs, Tensor? gradient=None, bool? retain_graph=None, bool create_graph=False) -> ()", "dispatch": "False", "default": "True"}
+void set_data(Tensor & self, const Tensor & new_data); // {"schema": "aten::set_data(Tensor(a!) self, Tensor new_data) -> ()", "dispatch": "False", "default": "True"}
+Tensor data(const Tensor & self); // {"schema": "aten::data(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+bool is_leaf(const Tensor & self); // {"schema": "aten::is_leaf(Tensor self) -> bool", "dispatch": "False", "default": "True"}
+int64_t output_nr(const Tensor & self); // {"schema": "aten::output_nr(Tensor self) -> int", "dispatch": "False", "default": "True"}
+int64_t _version(const Tensor & self); // {"schema": "aten::_version(Tensor self) -> int", "dispatch": "False", "default": "True"}
+Tensor & requires_grad_(Tensor & self, bool requires_grad); // {"schema": "aten::requires_grad_(Tensor(a!) self, bool requires_grad=True) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+void retain_grad(Tensor & self); // {"schema": "aten::retain_grad(Tensor(a!) self) -> ()", "dispatch": "False", "default": "True"}
+bool retains_grad(const Tensor & self); // {"schema": "aten::retains_grad(Tensor self) -> bool", "dispatch": "False", "default": "True"}
+Tensor _fw_primal(const Tensor & self, int64_t level); // {"schema": "aten::_fw_primal(Tensor(a) self, int level) -> Tensor(a)", "dispatch": "True", "default": "True"}
+Tensor _make_dual(const Tensor & primal, const Tensor & tangent, int64_t level); // {"schema": "aten::_make_dual(Tensor(a) primal, Tensor tangent, int level) -> Tensor(a)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor,Tensor> _unpack_dual(const Tensor & dual, int64_t level); // {"schema": "aten::_unpack_dual(Tensor(a) dual, int level) -> (Tensor(a) primal, Tensor tangent)", "dispatch": "False", "default": "True"}
+Tensor _new_zeros_with_same_feature_meta(const Tensor & self, const Tensor & other, int64_t self_num_batch_dims); // {"schema": "aten::_new_zeros_with_same_feature_meta(Tensor self, Tensor other, *, int self_num_batch_dims=0) -> Tensor", "dispatch": "True", "default": "True"}
+bool _has_same_storage_numel(const Tensor & self, const Tensor & other); // {"schema": "aten::_has_same_storage_numel(Tensor self, Tensor other) -> bool", "dispatch": "True", "default": "True"}
+Tensor & rename_(Tensor & self, c10::optional<DimnameList> names); // {"schema": "aten::rename_(Tensor(a!) self, Dimname[]? names) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor rename(const Tensor & self, c10::optional<DimnameList> names); // {"schema": "aten::rename(Tensor(a) self, Dimname[]? names) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor align_to(const Tensor & self, DimnameList names); // {"schema": "aten::align_to(Tensor(a) self, Dimname[] names) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor align_to(const Tensor & self, DimnameList order, int64_t ellipsis_idx); // {"schema": "aten::align_to.ellipsis_idx(Tensor(a) self, Dimname[] order, int ellipsis_idx) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor align_as(const Tensor & self, const Tensor & other); // {"schema": "aten::align_as(Tensor self, Tensor other) -> Tensor", "dispatch": "False", "default": "True"}
+::std::vector<Tensor> align_tensors(TensorList tensors); // {"schema": "aten::align_tensors(Tensor[] tensors) -> Tensor[]", "dispatch": "False", "default": "True"}
+void _assert_async(const Tensor & self); // {"schema": "aten::_assert_async(Tensor self) -> ()", "dispatch": "True", "default": "False"}
+void _assert_async(const Tensor & self, c10::string_view assert_msg); // {"schema": "aten::_assert_async.msg(Tensor self, str assert_msg) -> ()", "dispatch": "True", "default": "False"}
+void _assert_scalar(const Scalar & self, c10::string_view assert_msg); // {"schema": "aten::_assert_scalar(Scalar self, str assert_msg) -> ()", "dispatch": "True", "default": "True"}
+Tensor _functional_assert_scalar(const Scalar & self, c10::string_view assert_msg, const Tensor & dep_token); // {"schema": "aten::_functional_assert_scalar(Scalar self, str assert_msg, Tensor dep_token) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _functional_assert_async(const Tensor & self, c10::string_view assert_msg, const Tensor & dep_token); // {"schema": "aten::_functional_assert_async.msg(Tensor self, str assert_msg, Tensor dep_token) -> Tensor", "dispatch": "True", "default": "False"}
+void _assert_tensor_metadata(const Tensor & a, OptionalSymIntArrayRef size, OptionalSymIntArrayRef stride, c10::optional<ScalarType> dtype); // {"schema": "aten::_assert_tensor_metadata(Tensor a, SymInt[]? size=None, SymInt[]? stride=None, ScalarType? dtype=None) -> ()", "dispatch": "False", "default": "True"}
+void _print(c10::string_view s); // {"schema": "aten::_print(str s) -> ()", "dispatch": "True", "default": "True"}
+void sym_constrain_range(const Scalar & size, c10::optional<int64_t> min, c10::optional<int64_t> max); // {"schema": "aten::sym_constrain_range(Scalar size, *, int? min=None, int? max=None) -> ()", "dispatch": "True", "default": "True"}
+void sym_constrain_range_for_size(const Scalar & size, c10::optional<int64_t> min, c10::optional<int64_t> max); // {"schema": "aten::sym_constrain_range_for_size(Scalar size, *, int? min=None, int? max=None) -> ()", "dispatch": "True", "default": "True"}
+Tensor _functional_sym_constrain_range(const Scalar & size, c10::optional<int64_t> min, c10::optional<int64_t> max, const Tensor & dep_token); // {"schema": "aten::_functional_sym_constrain_range(Scalar size, int? min, int? max, Tensor dep_token) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _functional_sym_constrain_range_for_size(const Scalar & size, c10::optional<int64_t> min, c10::optional<int64_t> max, const Tensor & dep_token); // {"schema": "aten::_functional_sym_constrain_range_for_size(Scalar size, int? min, int? max, Tensor dep_token) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _make_dep_token(c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory, c10::optional memory_format); // {"schema": "aten::_make_dep_token(*, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor refine_names(const Tensor & self, DimnameList names); // {"schema": "aten::refine_names(Tensor(a) self, Dimname[] names) -> Tensor(a)", "dispatch": "False", "default": "True"}
+bool _use_cudnn_ctc_loss(const Tensor & log_probs, const Tensor & targets, IntArrayRef input_lengths, IntArrayRef target_lengths, int64_t blank); // {"schema": "aten::_use_cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank) -> bool", "dispatch": "True", "default": "False"}
+bool _use_cudnn_ctc_loss(const Tensor & log_probs, const Tensor & targets, const Tensor & input_lengths, const Tensor & target_lengths, int64_t blank); // {"schema": "aten::_use_cudnn_ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank) -> bool", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> _cudnn_ctc_loss(const Tensor & log_probs, const Tensor & targets, IntArrayRef input_lengths, IntArrayRef target_lengths, int64_t blank, bool deterministic, bool zero_infinity); // {"schema": "aten::_cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> _cudnn_ctc_loss(const Tensor & log_probs, const Tensor & targets, const Tensor & input_lengths, const Tensor & target_lengths, int64_t blank, bool deterministic, bool zero_infinity); // {"schema": "aten::_cudnn_ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor)", "dispatch": "True", "default": "False"}
+bool _use_cudnn_rnn_flatten_weight(); // {"schema": "aten::_use_cudnn_rnn_flatten_weight() -> bool", "dispatch": "False", "default": "True"}
+Tensor _cudnn_rnn_flatten_weight(TensorList weight_arr, int64_t weight_stride0, c10::SymInt input_size, int64_t mode, c10::SymInt hidden_size, c10::SymInt proj_size, int64_t num_layers, bool batch_first, bool bidirectional); // {"schema": "aten::_cudnn_rnn_flatten_weight(Tensor[] weight_arr, int weight_stride0, SymInt input_size, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, bool bidirectional) -> Tensor", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor,Tensor,Tensor,Tensor> _cudnn_rnn(const Tensor & input, TensorList weight, int64_t weight_stride0, const c10::optional<Tensor> & weight_buf, const Tensor & hx, const c10::optional<Tensor> & cx, int64_t mode, c10::SymInt hidden_size, c10::SymInt proj_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, c10::SymIntArrayRef batch_sizes, const c10::optional<Tensor> & dropout_state); // {"schema": "aten::_cudnn_rnn(Tensor input, Tensor[] weight, int weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, SymInt[] batch_sizes, Tensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor,Tensor,::std::vector<Tensor>> _cudnn_rnn_backward(const Tensor & input, TensorList weight, int64_t weight_stride0, const Tensor & weight_buf, const Tensor & hx, const c10::optional<Tensor> & cx, const Tensor & output, const c10::optional<Tensor> & grad_output, const c10::optional<Tensor> & grad_hy, const c10::optional<Tensor> & grad_cy, int64_t mode, c10::SymInt hidden_size, c10::SymInt proj_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, c10::SymIntArrayRef batch_sizes, const c10::optional<Tensor> & dropout_state, const Tensor & reserve, ::std::array<bool,4> output_mask); // {"schema": "aten::_cudnn_rnn_backward(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, SymInt[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask) -> (Tensor, Tensor, Tensor, Tensor[])", "dispatch": "True", "default": "False"}
+Tensor _cudnn_init_dropout_state(double dropout, bool train, int64_t dropout_seed, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::_cudnn_init_dropout_state(float dropout, bool train, int dropout_seed, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor", "dispatch": "True", "default": "False"}
+int64_t _debug_has_internal_overlap(const Tensor & self); // {"schema": "aten::_debug_has_internal_overlap(Tensor self) -> int", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> _fused_dropout(const Tensor & self, double p, c10::optional<Generator> generator); // {"schema": "aten::_fused_dropout(Tensor self, float p, Generator? generator=None) -> (Tensor, Tensor)", "dispatch": "True", "default": "False"}
+Tensor _masked_scale(const Tensor & self, const Tensor & mask, double scale); // {"schema": "aten::_masked_scale(Tensor self, Tensor mask, float scale) -> Tensor", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> native_dropout(const Tensor & input, double p, c10::optional<bool> train); // {"schema": "aten::native_dropout(Tensor input, float p, bool? train) -> (Tensor, Tensor)", "dispatch": "True", "default": "False"}
+Tensor native_dropout_backward(const Tensor & grad_output, const Tensor & mask, double scale); // {"schema": "aten::native_dropout_backward(Tensor grad_output, Tensor mask, float scale) -> Tensor", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> _sobol_engine_draw(const Tensor & quasi, int64_t n, const Tensor & sobolstate, int64_t dimension, int64_t num_generated, c10::optional<ScalarType> dtype); // {"schema": "aten::_sobol_engine_draw(Tensor quasi, int n, Tensor sobolstate, int dimension, int num_generated, ScalarType? dtype) -> (Tensor, Tensor)", "dispatch": "False", "default": "True"}
+Tensor & _sobol_engine_ff_(Tensor & self, int64_t n, const Tensor & sobolstate, int64_t dimension, int64_t num_generated); // {"schema": "aten::_sobol_engine_ff_(Tensor(a!) self, int n, Tensor sobolstate, int dimension, int num_generated) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & _sobol_engine_scramble_(Tensor & self, const Tensor & ltm, int64_t dimension); // {"schema": "aten::_sobol_engine_scramble_(Tensor(a!) self, Tensor ltm, int dimension) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & _sobol_engine_initialize_state_(Tensor & self, int64_t dimension); // {"schema": "aten::_sobol_engine_initialize_state_(Tensor(a!) self, int dimension) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor _reshape_from_tensor(const Tensor & self, const Tensor & shape); // {"schema": "aten::_reshape_from_tensor(Tensor self, Tensor shape) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _shape_as_tensor(const Tensor & self); // {"schema": "aten::_shape_as_tensor(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor dropout(const Tensor & input, double p, bool train); // {"schema": "aten::dropout(Tensor input, float p, bool train) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & dropout_(Tensor & self, double p, bool train); // {"schema": "aten::dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor feature_dropout(const Tensor & input, double p, bool train); // {"schema": "aten::feature_dropout(Tensor input, float p, bool train) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & feature_dropout_(Tensor & self, double p, bool train); // {"schema": "aten::feature_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor alpha_dropout(const Tensor & input, double p, bool train); // {"schema": "aten::alpha_dropout(Tensor input, float p, bool train) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & alpha_dropout_(Tensor & self, double p, bool train); // {"schema": "aten::alpha_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor feature_alpha_dropout(const Tensor & input, double p, bool train); // {"schema": "aten::feature_alpha_dropout(Tensor input, float p, bool train) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & feature_alpha_dropout_(Tensor & self, double p, bool train); // {"schema": "aten::feature_alpha_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor abs(const Tensor & self); // {"schema": "aten::abs(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & abs_(Tensor & self); // {"schema": "aten::abs_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & abs_out(const Tensor & self, Tensor & out); // {"schema": "aten::abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor absolute(const Tensor & self); // {"schema": "aten::absolute(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & absolute_(Tensor & self); // {"schema": "aten::absolute_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & absolute_out(const Tensor & self, Tensor & out); // {"schema": "aten::absolute.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor angle(const Tensor & self); // {"schema": "aten::angle(Tensor self) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & angle_out(const Tensor & self, Tensor & out); // {"schema": "aten::angle.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor view_as_real(const Tensor & self); // {"schema": "aten::view_as_real(Tensor(a) self) -> Tensor(a)", "dispatch": "True", "default": "False"}
+Tensor view_as_complex(const Tensor & self); // {"schema": "aten::view_as_complex(Tensor(a) self) -> Tensor(a)", "dispatch": "True", "default": "False"}
+Tensor sgn(const Tensor & self); // {"schema": "aten::sgn(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & sgn_(Tensor & self); // {"schema": "aten::sgn_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & sgn_out(const Tensor & self, Tensor & out); // {"schema": "aten::sgn.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor chalf(const Tensor & self, c10::optional<MemoryFormat> memory_format); // {"schema": "aten::chalf(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor real(const Tensor & self); // {"schema": "aten::real(Tensor(a) self) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor imag(const Tensor & self); // {"schema": "aten::imag(Tensor(a) self) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor _conj(const Tensor & self); // {"schema": "aten::_conj(Tensor(a) self) -> Tensor(a)", "dispatch": "True", "default": "True"}
+Tensor conj(const Tensor & self); // {"schema": "aten::conj(Tensor(a) self) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor _conj_physical(const Tensor & self); // {"schema": "aten::_conj_physical(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor conj_physical(const Tensor & self); // {"schema": "aten::conj_physical(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & conj_physical_out(const Tensor & self, Tensor & out); // {"schema": "aten::conj_physical.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & conj_physical_(Tensor & self); // {"schema": "aten::conj_physical_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor resolve_conj(const Tensor & self); // {"schema": "aten::resolve_conj(Tensor(a) self) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor resolve_neg(const Tensor & self); // {"schema": "aten::resolve_neg(Tensor(a) self) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor _neg_view(const Tensor & self); // {"schema": "aten::_neg_view(Tensor(a) self) -> Tensor(a)", "dispatch": "True", "default": "True"}
+Tensor acos(const Tensor & self); // {"schema": "aten::acos(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & acos_(Tensor & self); // {"schema": "aten::acos_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & acos_out(const Tensor & self, Tensor & out); // {"schema": "aten::acos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor arccos(const Tensor & self); // {"schema": "aten::arccos(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & arccos_(Tensor & self); // {"schema": "aten::arccos_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & arccos_out(const Tensor & self, Tensor & out); // {"schema": "aten::arccos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor avg_pool1d(const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, bool ceil_mode, bool count_include_pad); // {"schema": "aten::avg_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, bool ceil_mode=False, bool count_include_pad=True) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor adaptive_avg_pool1d(const Tensor & self, IntArrayRef output_size); // {"schema": "aten::adaptive_avg_pool1d(Tensor self, int[1] output_size) -> Tensor", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> adaptive_max_pool1d(const Tensor & self, IntArrayRef output_size); // {"schema": "aten::adaptive_max_pool1d(Tensor self, int[1] output_size) -> (Tensor, Tensor)", "dispatch": "False", "default": "True"}
+Tensor add(const Tensor & self, const Tensor & other, const Scalar & alpha); // {"schema": "aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & add_(Tensor & self, const Tensor & other, const Scalar & alpha); // {"schema": "aten::add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & add_out(const Tensor & self, const Tensor & other, const Scalar & alpha, Tensor & out); // {"schema": "aten::add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor _add_relu(const Tensor & self, const Tensor & other, const Scalar & alpha); // {"schema": "aten::_add_relu.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & _add_relu_(Tensor & self, const Tensor & other, const Scalar & alpha); // {"schema": "aten::_add_relu_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & _add_relu_out(const Tensor & self, const Tensor & other, const Scalar & alpha, Tensor & out); // {"schema": "aten::_add_relu.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor _add_relu(const Tensor & self, const Scalar & other, const Scalar & alpha); // {"schema": "aten::_add_relu.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & _add_relu_(Tensor & self, const Scalar & other, const Scalar & alpha); // {"schema": "aten::_add_relu_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor add(const Tensor & self, const Scalar & other, const Scalar & alpha); // {"schema": "aten::add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & add_(Tensor & self, const Scalar & other, const Scalar & alpha); // {"schema": "aten::add_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor addmv(const Tensor & self, const Tensor & mat, const Tensor & vec, const Scalar & beta, const Scalar & alpha); // {"schema": "aten::addmv(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & addmv_(Tensor & self, const Tensor & mat, const Tensor & vec, const Scalar & beta, const Scalar & alpha); // {"schema": "aten::addmv_(Tensor(a!) self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & addmv_out(const Tensor & self, const Tensor & mat, const Tensor & vec, const Scalar & beta, const Scalar & alpha, Tensor & out); // {"schema": "aten::addmv.out(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, const Scalar & beta, const Scalar & alpha); // {"schema": "aten::addr(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & addr_(Tensor & self, const Tensor & vec1, const Tensor & vec2, const Scalar & beta, const Scalar & alpha); // {"schema": "aten::addr_(Tensor(a!) self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & addr_out(const Tensor & self, const Tensor & vec1, const Tensor & vec2, const Scalar & beta, const Scalar & alpha, Tensor & out); // {"schema": "aten::addr.out(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor affine_grid_generator(const Tensor & theta, c10::SymIntArrayRef size, bool align_corners); // {"schema": "aten::affine_grid_generator(Tensor theta, SymInt[] size, bool align_corners) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor affine_grid_generator_backward(const Tensor & grad, c10::SymIntArrayRef size, bool align_corners); // {"schema": "aten::affine_grid_generator_backward(Tensor grad, SymInt[] size, bool align_corners) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _is_all_true(const Tensor & self); // {"schema": "aten::_is_all_true(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _is_any_true(const Tensor & self); // {"schema": "aten::_is_any_true(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _test_check_tensor(const Tensor & self); // {"schema": "aten::_test_check_tensor(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _test_functorch_fallback(const Tensor & self, const Tensor & other); // {"schema": "aten::_test_functorch_fallback(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor all(const Tensor & self, int64_t dim, bool keepdim); // {"schema": "aten::all.dim(Tensor self, int dim, bool keepdim=False) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor all(const Tensor & self, OptionalIntArrayRef dim, bool keepdim); // {"schema": "aten::all.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & all_out(const Tensor & self, int64_t dim, bool keepdim, Tensor & out); // {"schema": "aten::all.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & all_out(const Tensor & self, OptionalIntArrayRef dim, bool keepdim, Tensor & out); // {"schema": "aten::all.dims_out(Tensor self, int[]? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor all(const Tensor & self, Dimname dim, bool keepdim); // {"schema": "aten::all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & all_out(const Tensor & self, Dimname dim, bool keepdim, Tensor & out); // {"schema": "aten::all.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+bool allclose(const Tensor & self, const Tensor & other, double rtol, double atol, bool equal_nan); // {"schema": "aten::allclose(Tensor self, Tensor other, float rtol=1e-05, float atol=1e-08, bool equal_nan=False) -> bool", "dispatch": "True", "default": "True"}
+Tensor any(const Tensor & self, int64_t dim, bool keepdim); // {"schema": "aten::any.dim(Tensor self, int dim, bool keepdim=False) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor any(const Tensor & self, OptionalIntArrayRef dim, bool keepdim); // {"schema": "aten::any.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & any_out(const Tensor & self, int64_t dim, bool keepdim, Tensor & out); // {"schema": "aten::any.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & any_out(const Tensor & self, OptionalIntArrayRef dim, bool keepdim, Tensor & out); // {"schema": "aten::any.dims_out(Tensor self, int[]? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor any(const Tensor & self, Dimname dim, bool keepdim); // {"schema": "aten::any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & any_out(const Tensor & self, Dimname dim, bool keepdim, Tensor & out); // {"schema": "aten::any.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor arange(const Scalar & end, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor arange(const Scalar & start, const Scalar & end, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::arange.start(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor arange(const Scalar & start, const Scalar & end, const Scalar & step, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::arange.start_step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & arange_out(const Scalar & end, Tensor & out); // {"schema": "aten::arange.out(Scalar end, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & arange_out(const Scalar & start, const Scalar & end, const Scalar & step, Tensor & out); // {"schema": "aten::arange.start_out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor _dim_arange(const Tensor & like, int64_t dim); // {"schema": "aten::_dim_arange(Tensor like, int dim) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor argmax(const Tensor & self, c10::optional<int64_t> dim, bool keepdim); // {"schema": "aten::argmax(Tensor self, int? dim=None, bool keepdim=False) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & argmax_out(const Tensor & self, c10::optional<int64_t> dim, bool keepdim, Tensor & out); // {"schema": "aten::argmax.out(Tensor self, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor argmin(const Tensor & self, c10::optional<int64_t> dim, bool keepdim); // {"schema": "aten::argmin(Tensor self, int? dim=None, bool keepdim=False) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & argmin_out(const Tensor & self, c10::optional<int64_t> dim, bool keepdim, Tensor & out); // {"schema": "aten::argmin.out(Tensor self, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor acosh(const Tensor & self); // {"schema": "aten::acosh(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & acosh_(Tensor & self); // {"schema": "aten::acosh_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & acosh_out(const Tensor & self, Tensor & out); // {"schema": "aten::acosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor arccosh(const Tensor & self); // {"schema": "aten::arccosh(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & arccosh_(Tensor & self); // {"schema": "aten::arccosh_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & arccosh_out(const Tensor & self, Tensor & out); // {"schema": "aten::arccosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor asinh(const Tensor & self); // {"schema": "aten::asinh(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & asinh_(Tensor & self); // {"schema": "aten::asinh_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & asinh_out(const Tensor & self, Tensor & out); // {"schema": "aten::asinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor arcsinh(const Tensor & self); // {"schema": "aten::arcsinh(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & arcsinh_(Tensor & self); // {"schema": "aten::arcsinh_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & arcsinh_out(const Tensor & self, Tensor & out); // {"schema": "aten::arcsinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor atanh(const Tensor & self); // {"schema": "aten::atanh(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & atanh_(Tensor & self); // {"schema": "aten::atanh_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & atanh_out(const Tensor & self, Tensor & out); // {"schema": "aten::atanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor arctanh(const Tensor & self); // {"schema": "aten::arctanh(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & arctanh_(Tensor & self); // {"schema": "aten::arctanh_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & arctanh_out(const Tensor & self, Tensor & out); // {"schema": "aten::arctanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor as_strided(const Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional<c10::SymInt> storage_offset); // {"schema": "aten::as_strided(Tensor(a) self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor(a)", "dispatch": "True", "default": "False"}
+const Tensor & as_strided_(const Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional<c10::SymInt> storage_offset); // {"schema": "aten::as_strided_(Tensor(a!) self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor asin(const Tensor & self); // {"schema": "aten::asin(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & asin_(Tensor & self); // {"schema": "aten::asin_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & asin_out(const Tensor & self, Tensor & out); // {"schema": "aten::asin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor arcsin(const Tensor & self); // {"schema": "aten::arcsin(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & arcsin_(Tensor & self); // {"schema": "aten::arcsin_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & arcsin_out(const Tensor & self, Tensor & out); // {"schema": "aten::arcsin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor atan(const Tensor & self); // {"schema": "aten::atan(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & atan_(Tensor & self); // {"schema": "aten::atan_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & atan_out(const Tensor & self, Tensor & out); // {"schema": "aten::atan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor arctan(const Tensor & self); // {"schema": "aten::arctan(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & arctan_(Tensor & self); // {"schema": "aten::arctan_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & arctan_out(const Tensor & self, Tensor & out); // {"schema": "aten::arctan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor atleast_1d(const Tensor & self); // {"schema": "aten::atleast_1d(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+::std::vector<Tensor> atleast_1d(TensorList tensors); // {"schema": "aten::atleast_1d.Sequence(Tensor[] tensors) -> Tensor[]", "dispatch": "False", "default": "True"}
+Tensor atleast_2d(const Tensor & self); // {"schema": "aten::atleast_2d(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+::std::vector<Tensor> atleast_2d(TensorList tensors); // {"schema": "aten::atleast_2d.Sequence(Tensor[] tensors) -> Tensor[]", "dispatch": "False", "default": "True"}
+Tensor atleast_3d(const Tensor & self); // {"schema": "aten::atleast_3d(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+::std::vector<Tensor> atleast_3d(TensorList tensors); // {"schema": "aten::atleast_3d.Sequence(Tensor[] tensors) -> Tensor[]", "dispatch": "False", "default": "True"}
+Tensor baddbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, const Scalar & beta, const Scalar & alpha); // {"schema": "aten::baddbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & baddbmm_(Tensor & self, const Tensor & batch1, const Tensor & batch2, const Scalar & beta, const Scalar & alpha); // {"schema": "aten::baddbmm_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & baddbmm_out(const Tensor & self, const Tensor & batch1, const Tensor & batch2, const Scalar & beta, const Scalar & alpha, Tensor & out); // {"schema": "aten::baddbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor bartlett_window(int64_t window_length, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::bartlett_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor bartlett_window(int64_t window_length, bool periodic, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::bartlett_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor batch_norm(const Tensor & input, const c10::optional<Tensor> & weight, const c10::optional<Tensor> & bias, const c10::optional<Tensor> & running_mean, const c10::optional<Tensor> & running_var, bool training, double momentum, double eps, bool cudnn_enabled); // {"schema": "aten::batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, bool cudnn_enabled) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor quantized_batch_norm(const Tensor & input, const c10::optional<Tensor> & weight, const c10::optional<Tensor> & bias, const Tensor & mean, const Tensor & var, double eps, double output_scale, int64_t output_zero_point); // {"schema": "aten::quantized_batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor,Tensor,Tensor,int64_t> _batch_norm_impl_index(const Tensor & input, const c10::optional<Tensor> & weight, const c10::optional<Tensor> & bias, const c10::optional<Tensor> & running_mean, const c10::optional<Tensor> & running_var, bool training, double momentum, double eps, bool cudnn_enabled); // {"schema": "aten::_batch_norm_impl_index(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, bool cudnn_enabled) -> (Tensor, Tensor, Tensor, Tensor, int)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor,Tensor> _batch_norm_impl_index_backward(int64_t impl_index, const Tensor & input, const Tensor & grad_output, const c10::optional<Tensor> & weight, const c10::optional<Tensor> & running_mean, const c10::optional<Tensor> & running_var, const c10::optional<Tensor> & save_mean, const c10::optional<Tensor> & save_var_transform, bool train, double eps, ::std::array<bool,3> output_mask, const Tensor & reservedSpace); // {"schema": "aten::_batch_norm_impl_index_backward(int impl_index, Tensor input, Tensor grad_output, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var_transform, bool train, float eps, bool[3] output_mask, Tensor reservedSpace) -> (Tensor, Tensor, Tensor)", "dispatch": "False", "default": "True"}
+Tensor bernoulli(const Tensor & self, c10::optional<Generator> generator); // {"schema": "aten::bernoulli(Tensor self, *, Generator? generator=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & bernoulli_out(const Tensor & self, c10::optional<Generator> generator, Tensor & out); // {"schema": "aten::bernoulli.out(Tensor self, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & bernoulli_(Tensor & self, const Tensor & p, c10::optional<Generator> generator); // {"schema": "aten::bernoulli_.Tensor(Tensor(a!) self, Tensor p, *, Generator? generator=None) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & bernoulli_(Tensor & self, double p, c10::optional<Generator> generator); // {"schema": "aten::bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor bernoulli(const Tensor & self, double p, c10::optional<Generator> generator); // {"schema": "aten::bernoulli.p(Tensor self, float p, *, Generator? generator=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor bilinear(const Tensor & input1, const Tensor & input2, const Tensor & weight, const c10::optional<Tensor> & bias); // {"schema": "aten::bilinear(Tensor input1, Tensor input2, Tensor weight, Tensor? bias=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor binary_cross_entropy(const Tensor & self, const Tensor & target, const c10::optional<Tensor> & weight, int64_t reduction); // {"schema": "aten::binary_cross_entropy(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & binary_cross_entropy_out(const Tensor & self, const Tensor & target, const c10::optional<Tensor> & weight, int64_t reduction, Tensor & out); // {"schema": "aten::binary_cross_entropy.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor binary_cross_entropy_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const c10::optional<Tensor> & weight, int64_t reduction); // {"schema": "aten::binary_cross_entropy_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & binary_cross_entropy_backward_out(const Tensor & grad_output, const Tensor & self, const Tensor & target, const c10::optional<Tensor> & weight, int64_t reduction, Tensor & grad_input); // {"schema": "aten::binary_cross_entropy_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor binary_cross_entropy_with_logits(const Tensor & self, const Tensor & target, const c10::optional<Tensor> & weight, const c10::optional<Tensor> & pos_weight, int64_t reduction); // {"schema": "aten::binary_cross_entropy_with_logits(Tensor self, Tensor target, Tensor? weight=None, Tensor? pos_weight=None, int reduction=Mean) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor bincount(const Tensor & self, const c10::optional<Tensor> & weights, int64_t minlength); // {"schema": "aten::bincount(Tensor self, Tensor? weights=None, int minlength=0) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor bitwise_not(const Tensor & self); // {"schema": "aten::bitwise_not(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & bitwise_not_(Tensor & self); // {"schema": "aten::bitwise_not_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & bitwise_not_out(const Tensor & self, Tensor & out); // {"schema": "aten::bitwise_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & copysign_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::copysign.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor copysign(const Tensor & self, const Tensor & other); // {"schema": "aten::copysign.Tensor(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & copysign_(Tensor & self, const Tensor & other); // {"schema": "aten::copysign_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor copysign(const Tensor & self, const Scalar & other); // {"schema": "aten::copysign.Scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & copysign_(Tensor & self, const Scalar & other); // {"schema": "aten::copysign_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & copysign_out(const Tensor & self, const Scalar & other, Tensor & out); // {"schema": "aten::copysign.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor _lazy_clone(const Tensor & self); // {"schema": "aten::_lazy_clone(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor logical_not(const Tensor & self); // {"schema": "aten::logical_not(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & logical_not_(Tensor & self); // {"schema": "aten::logical_not_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & logical_not_out(const Tensor & self, Tensor & out); // {"schema": "aten::logical_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor logical_xor(const Tensor & self, const Tensor & other); // {"schema": "aten::logical_xor(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & logical_xor_(Tensor & self, const Tensor & other); // {"schema": "aten::logical_xor_(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & logical_xor_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::logical_xor.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor logical_and(const Tensor & self, const Tensor & other); // {"schema": "aten::logical_and(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & logical_and_(Tensor & self, const Tensor & other); // {"schema": "aten::logical_and_(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & logical_and_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::logical_and.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor logical_or(const Tensor & self, const Tensor & other); // {"schema": "aten::logical_or(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & logical_or_(Tensor & self, const Tensor & other); // {"schema": "aten::logical_or_(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & logical_or_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::logical_or.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor blackman_window(int64_t window_length, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::blackman_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor blackman_window(int64_t window_length, bool periodic, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::blackman_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor bmm(const Tensor & self, const Tensor & mat2); // {"schema": "aten::bmm(Tensor self, Tensor mat2) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & bmm_out(const Tensor & self, const Tensor & mat2, Tensor & out); // {"schema": "aten::bmm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> broadcast_tensors(TensorList tensors); // {"schema": "aten::broadcast_tensors(Tensor[] tensors) -> Tensor[]", "dispatch": "False", "default": "True"}
+Tensor broadcast_to(const Tensor & self, c10::SymIntArrayRef size); // {"schema": "aten::broadcast_to(Tensor(a) self, SymInt[] size) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor _sparse_broadcast_to(const Tensor & self, IntArrayRef size); // {"schema": "aten::_sparse_broadcast_to(Tensor(a) self, int[] size) -> Tensor(a)", "dispatch": "True", "default": "False"}
+Tensor cat(const ITensorListRef & tensors, int64_t dim); // {"schema": "aten::cat(Tensor[] tensors, int dim=0) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & cat_out(const ITensorListRef & tensors, int64_t dim, Tensor & out); // {"schema": "aten::cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor cat(TensorList tensors, Dimname dim); // {"schema": "aten::cat.names(Tensor[] tensors, Dimname dim) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & cat_out(TensorList tensors, Dimname dim, Tensor & out); // {"schema": "aten::cat.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor concat(TensorList tensors, int64_t dim); // {"schema": "aten::concat(Tensor[] tensors, int dim=0) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & concat_out(TensorList tensors, int64_t dim, Tensor & out); // {"schema": "aten::concat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor concat(TensorList tensors, Dimname dim); // {"schema": "aten::concat.names(Tensor[] tensors, Dimname dim) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & concat_out(TensorList tensors, Dimname dim, Tensor & out); // {"schema": "aten::concat.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor concatenate(TensorList tensors, int64_t dim); // {"schema": "aten::concatenate(Tensor[] tensors, int dim=0) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & concatenate_out(TensorList tensors, int64_t dim, Tensor & out); // {"schema": "aten::concatenate.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor concatenate(TensorList tensors, Dimname dim); // {"schema": "aten::concatenate.names(Tensor[] tensors, Dimname dim) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & concatenate_out(TensorList tensors, Dimname dim, Tensor & out); // {"schema": "aten::concatenate.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor block_diag(TensorList tensors); // {"schema": "aten::block_diag(Tensor[] tensors) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor ceil(const Tensor & self); // {"schema": "aten::ceil(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & ceil_(Tensor & self); // {"schema": "aten::ceil_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & ceil_out(const Tensor & self, Tensor & out); // {"schema": "aten::ceil.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor chain_matmul(TensorList matrices); // {"schema": "aten::chain_matmul(Tensor[] matrices) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & chain_matmul_out(TensorList matrices, Tensor & out); // {"schema": "aten::chain_matmul.out(Tensor[] matrices, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+::std::vector<Tensor> unsafe_chunk(const Tensor & self, int64_t chunks, int64_t dim); // {"schema": "aten::unsafe_chunk(Tensor self, int chunks, int dim=0) -> Tensor[]", "dispatch": "False", "default": "True"}
+::std::vector<Tensor> chunk(const Tensor & self, int64_t chunks, int64_t dim); // {"schema": "aten::chunk(Tensor(a -> *) self, int chunks, int dim=0) -> Tensor(a)[]", "dispatch": "True", "default": "True"}
+::std::vector<Tensor> tensor_split(const Tensor & self, c10::SymInt sections, int64_t dim); // {"schema": "aten::tensor_split.sections(Tensor(a -> *) self, SymInt sections, int dim=0) -> Tensor(a)[]", "dispatch": "False", "default": "True"}
+::std::vector<Tensor> tensor_split(const Tensor & self, c10::SymIntArrayRef indices, int64_t dim); // {"schema": "aten::tensor_split.indices(Tensor(a -> *) self, SymInt[] indices, int dim=0) -> Tensor(a)[]", "dispatch": "False", "default": "True"}
+::std::vector<Tensor> tensor_split(const Tensor & self, const Tensor & tensor_indices_or_sections, int64_t dim); // {"schema": "aten::tensor_split.tensor_indices_or_sections(Tensor(a -> *) self, Tensor tensor_indices_or_sections, int dim=0) -> Tensor(a)[]", "dispatch": "False", "default": "True"}
+Tensor clamp(const Tensor & self, const c10::optional<Scalar> & min, const c10::optional<Scalar> & max); // {"schema": "aten::clamp(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor clamp(const Tensor & self, const c10::optional<Tensor> & min, const c10::optional<Tensor> & max); // {"schema": "aten::clamp.Tensor(Tensor self, Tensor? min=None, Tensor? max=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & clamp_(Tensor & self, const c10::optional<Scalar> & min, const c10::optional<Scalar> & max); // {"schema": "aten::clamp_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & clamp_(Tensor & self, const c10::optional<Tensor> & min, const c10::optional<Tensor> & max); // {"schema": "aten::clamp_.Tensor(Tensor(a!) self, Tensor? min=None, Tensor? max=None) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & clamp_out(const Tensor & self, const c10::optional<Scalar> & min, const c10::optional<Scalar> & max, Tensor & out); // {"schema": "aten::clamp.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & clamp_out(const Tensor & self, const c10::optional<Tensor> & min, const c10::optional<Tensor> & max, Tensor & out); // {"schema": "aten::clamp.Tensor_out(Tensor self, Tensor? min=None, Tensor? max=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor clamp_max(const Tensor & self, const Scalar & max); // {"schema": "aten::clamp_max(Tensor self, Scalar max) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor clamp_max(const Tensor & self, const Tensor & max); // {"schema": "aten::clamp_max.Tensor(Tensor self, Tensor max) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & clamp_max_(Tensor & self, const Scalar & max); // {"schema": "aten::clamp_max_(Tensor(a!) self, Scalar max) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & clamp_max_(Tensor & self, const Tensor & max); // {"schema": "aten::clamp_max_.Tensor(Tensor(a!) self, Tensor max) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & clamp_max_out(const Tensor & self, const Scalar & max, Tensor & out); // {"schema": "aten::clamp_max.out(Tensor self, Scalar max, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & clamp_max_out(const Tensor & self, const Tensor & max, Tensor & out); // {"schema": "aten::clamp_max.Tensor_out(Tensor self, Tensor max, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor clamp_min(const Tensor & self, const Scalar & min); // {"schema": "aten::clamp_min(Tensor self, Scalar min) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor clamp_min(const Tensor & self, const Tensor & min); // {"schema": "aten::clamp_min.Tensor(Tensor self, Tensor min) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & clamp_min_(Tensor & self, const Scalar & min); // {"schema": "aten::clamp_min_(Tensor(a!) self, Scalar min) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & clamp_min_(Tensor & self, const Tensor & min); // {"schema": "aten::clamp_min_.Tensor(Tensor(a!) self, Tensor min) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & clamp_min_out(const Tensor & self, const Scalar & min, Tensor & out); // {"schema": "aten::clamp_min.out(Tensor self, Scalar min, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & clamp_min_out(const Tensor & self, const Tensor & min, Tensor & out); // {"schema": "aten::clamp_min.Tensor_out(Tensor self, Tensor min, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor clip(const Tensor & self, const c10::optional<Scalar> & min, const c10::optional<Scalar> & max); // {"schema": "aten::clip(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor clip(const Tensor & self, const c10::optional<Tensor> & min, const c10::optional<Tensor> & max); // {"schema": "aten::clip.Tensor(Tensor self, Tensor? min=None, Tensor? max=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & clip_(Tensor & self, const c10::optional<Scalar> & min, const c10::optional<Scalar> & max); // {"schema": "aten::clip_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & clip_(Tensor & self, const c10::optional<Tensor> & min, const c10::optional<Tensor> & max); // {"schema": "aten::clip_.Tensor(Tensor(a!) self, Tensor? min=None, Tensor? max=None) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & clip_out(const Tensor & self, const c10::optional<Scalar> & min, const c10::optional<Scalar> & max, Tensor & out); // {"schema": "aten::clip.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & clip_out(const Tensor & self, const c10::optional<Tensor> & min, const c10::optional<Tensor> & max, Tensor & out); // {"schema": "aten::clip.Tensor_out(Tensor self, Tensor? min=None, Tensor? max=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+bool cudnn_is_acceptable(const Tensor & self); // {"schema": "aten::cudnn_is_acceptable(Tensor self) -> bool", "dispatch": "False", "default": "True"}
+Tensor complex(const Tensor & real, const Tensor & imag); // {"schema": "aten::complex(Tensor real, Tensor imag) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & complex_out(const Tensor & real, const Tensor & imag, Tensor & out); // {"schema": "aten::complex.out(Tensor real, Tensor imag, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor polar(const Tensor & abs, const Tensor & angle); // {"schema": "aten::polar(Tensor abs, Tensor angle) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & polar_out(const Tensor & abs, const Tensor & angle, Tensor & out); // {"schema": "aten::polar.out(Tensor abs, Tensor angle, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor constant_pad_nd(const Tensor & self, c10::SymIntArrayRef pad, const Scalar & value); // {"schema": "aten::constant_pad_nd(Tensor self, SymInt[] pad, Scalar value=0) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor contiguous(const Tensor & self, MemoryFormat memory_format); // {"schema": "aten::contiguous(Tensor(a) self, *, MemoryFormat memory_format=contiguous_format) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor convolution(const Tensor & input, const Tensor & weight, const c10::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups); // {"schema": "aten::convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups) -> Tensor", "dispatch": "True", "default": "True"}
+::std::tuple convolution_backward(const Tensor & grad_output, const Tensor & input, const Tensor & weight, OptionalSymIntArrayRef bias_sizes, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array output_mask); // {"schema": "aten::convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)", "dispatch": "True", "default": "True"}
+Tensor convolution_overrideable(const Tensor & input, const Tensor & weight, const c10::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups); // {"schema": "aten::convolution_overrideable(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups) -> Tensor", "dispatch": "True", "default": "True"}
+::std::tuple convolution_backward_overrideable(const Tensor & grad_output, const Tensor & input, const Tensor & weight, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array output_mask); // {"schema": "aten::convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)", "dispatch": "True", "default": "True"}
+Tensor _convolution(const Tensor & input, const Tensor & weight, const c10::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32); // {"schema": "aten::_convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _convolution(const Tensor & input, const Tensor & weight, const c10::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, IntArrayRef output_padding, c10::SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled); // {"schema": "aten::_convolution.deprecated(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, int[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _convolution_mode(const Tensor & input, const Tensor & weight, const c10::optional & bias, c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation, c10::SymInt groups); // {"schema": "aten::_convolution_mode(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, str padding, SymInt[] dilation, SymInt groups) -> Tensor", "dispatch": "False", "default": "True"}
+::std::tuple _convolution_double_backward(const c10::optional & ggI, const c10::optional & ggW, const c10::optional & ggb, const Tensor & gO, const Tensor & weight, const Tensor & self, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array output_mask); // {"schema": "aten::_convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? ggb, Tensor gO, Tensor weight, Tensor self, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)", "dispatch": "False", "default": "True"}
+Tensor conv1d(const Tensor & input, const Tensor & weight, const c10::optional<Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups); // {"schema": "aten::conv1d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[1] stride=1, SymInt[1] padding=0, SymInt[1] dilation=1, SymInt groups=1) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor conv2d(const Tensor & input, const Tensor & weight, const c10::optional<Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups); // {"schema": "aten::conv2d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] dilation=1, SymInt groups=1) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor conv3d(const Tensor & input, const Tensor & weight, const c10::optional<Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups); // {"schema": "aten::conv3d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] dilation=1, SymInt groups=1) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor conv1d(const Tensor & input, const Tensor & weight, const c10::optional<Tensor> & bias, c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation, c10::SymInt groups); // {"schema": "aten::conv1d.padding(Tensor input, Tensor weight, Tensor? bias=None, SymInt[1] stride=1, str padding=\"valid\", SymInt[1] dilation=1, SymInt groups=1) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor conv2d(const Tensor & input, const Tensor & weight, const c10::optional<Tensor> & bias, c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation, c10::SymInt groups); // {"schema": "aten::conv2d.padding(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, str padding=\"valid\", SymInt[2] dilation=1, SymInt groups=1) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor conv3d(const Tensor & input, const Tensor & weight, const c10::optional<Tensor> & bias, c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation, c10::SymInt groups); // {"schema": "aten::conv3d.padding(Tensor input, Tensor weight, Tensor? bias=None, SymInt[3] stride=1, str padding=\"valid\", SymInt[3] dilation=1, SymInt groups=1) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor conv_tbc(const Tensor & self, const Tensor & weight, const Tensor & bias, int64_t pad); // {"schema": "aten::conv_tbc(Tensor self, Tensor weight, Tensor bias, int pad=0) -> Tensor", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor,Tensor,Tensor> conv_tbc_backward(const Tensor & self, const Tensor & input, const Tensor & weight, const Tensor & bias, int64_t pad); // {"schema": "aten::conv_tbc_backward(Tensor self, Tensor input, Tensor weight, Tensor bias, int pad) -> (Tensor, Tensor, Tensor)", "dispatch": "False", "default": "True"}
+Tensor conv_transpose1d(const Tensor & input, const Tensor & weight, const c10::optional<Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymInt groups, c10::SymIntArrayRef dilation); // {"schema": "aten::conv_transpose1d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[1] stride=1, SymInt[1] padding=0, SymInt[1] output_padding=0, SymInt groups=1, SymInt[1] dilation=1) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor conv_transpose2d(const Tensor & input, const Tensor & weight, const c10::optional<Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymInt groups, c10::SymIntArrayRef dilation); // {"schema": "aten::conv_transpose2d.input(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt groups=1, SymInt[2] dilation=1) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor conv_transpose3d(const Tensor & input, const Tensor & weight, const c10::optional<Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymInt groups, c10::SymIntArrayRef dilation); // {"schema": "aten::conv_transpose3d.input(Tensor input, Tensor weight, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, SymInt groups=1, SymInt[3] dilation=1) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor copy(const Tensor & self, const Tensor & src, bool non_blocking); // {"schema": "aten::copy(Tensor self, Tensor src, bool non_blocking=False) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & copy_(Tensor & self, const Tensor & src, bool non_blocking); // {"schema": "aten::copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor _copy_from(const Tensor & self, const Tensor & dst, bool non_blocking); // {"schema": "aten::_copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _copy_from_and_resize(const Tensor & self, const Tensor & dst); // {"schema": "aten::_copy_from_and_resize(Tensor self, Tensor dst) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor cos(const Tensor & self); // {"schema": "aten::cos(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & cos_(Tensor & self); // {"schema": "aten::cos_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & cos_out(const Tensor & self, Tensor & out); // {"schema": "aten::cos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor cosh(const Tensor & self); // {"schema": "aten::cosh(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & cosh_(Tensor & self); // {"schema": "aten::cosh_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & cosh_out(const Tensor & self, Tensor & out); // {"schema": "aten::cosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor cosine_embedding_loss(const Tensor & input1, const Tensor & input2, const Tensor & target, double margin, int64_t reduction); // {"schema": "aten::cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor count_nonzero(const Tensor & self, IntArrayRef dim); // {"schema": "aten::count_nonzero.dim_IntList(Tensor self, int[] dim) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor count_nonzero(const Tensor & self, c10::optional<int64_t> dim); // {"schema": "aten::count_nonzero(Tensor self, int? dim=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor cov(const Tensor & self, int64_t correction, const c10::optional<Tensor> & fweights, const c10::optional<Tensor> & aweights); // {"schema": "aten::cov(Tensor self, *, int correction=1, Tensor? fweights=None, Tensor? aweights=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor corrcoef(const Tensor & self); // {"schema": "aten::corrcoef(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor cudnn_affine_grid_generator(const Tensor & theta, int64_t N, int64_t C, int64_t H, int64_t W); // {"schema": "aten::cudnn_affine_grid_generator(Tensor theta, int N, int C, int H, int W) -> Tensor grid", "dispatch": "True", "default": "False"}
+Tensor cudnn_affine_grid_generator_backward(const Tensor & grad, int64_t N, int64_t C, int64_t H, int64_t W); // {"schema": "aten::cudnn_affine_grid_generator_backward(Tensor grad, int N, int C, int H, int W) -> Tensor grad_theta", "dispatch": "True", "default": "False"}
+::std::tuple cudnn_batch_norm(const Tensor & input, const Tensor & weight, const c10::optional & bias, const c10::optional & running_mean, const c10::optional & running_var, bool training, double exponential_average_factor, double epsilon); // {"schema": "aten::cudnn_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple cudnn_batch_norm_backward(const Tensor & input, const Tensor & grad_output, const Tensor & weight, const c10::optional & running_mean, const c10::optional & running_var, const c10::optional & save_mean, const c10::optional & save_var, double epsilon, const Tensor & reserveSpace); // {"schema": "aten::cudnn_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, float epsilon, Tensor reserveSpace) -> (Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+Tensor cudnn_convolution(const Tensor & self, const Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, bool allow_tf32); // {"schema": "aten::cudnn_convolution(Tensor self, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & cudnn_convolution_out(const Tensor & self, const Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, bool allow_tf32, Tensor & out); // {"schema": "aten::cudnn_convolution.out(Tensor self, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor cudnn_convolution_transpose(const Tensor & self, const Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, bool allow_tf32); // {"schema": "aten::cudnn_convolution_transpose(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _mps_convolution_transpose(const Tensor & self, const Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups); // {"schema": "aten::_mps_convolution_transpose(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor", "dispatch": "True", "default": "False"}
+::std::tuple mps_convolution_transpose_backward(const Tensor & self, const Tensor & grad_output, const Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, ::std::array output_mask); // {"schema": "aten::mps_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool[2] output_mask) -> (Tensor, Tensor)", "dispatch": "True", "default": "False"}
+Tensor cudnn_convolution_relu(const Tensor & self, const Tensor & weight, const c10::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups); // {"schema": "aten::cudnn_convolution_relu(Tensor self, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor cudnn_convolution_add_relu(const Tensor & self, const Tensor & weight, const Tensor & z, const c10::optional & alpha, const c10::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups); // {"schema": "aten::cudnn_convolution_add_relu(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor cudnn_grid_sampler(const Tensor & self, const Tensor & grid); // {"schema": "aten::cudnn_grid_sampler(Tensor self, Tensor grid) -> Tensor output", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> cudnn_grid_sampler_backward(const Tensor & self, const Tensor & grid, const Tensor & grad_output); // {"schema": "aten::cudnn_grid_sampler_backward(Tensor self, Tensor grid, Tensor grad_output) -> (Tensor grad_self, Tensor grad_grid)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> cummax(const Tensor & self, int64_t dim); // {"schema": "aten::cummax(Tensor self, int dim) -> (Tensor values, Tensor indices)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> cummax_out(const Tensor & self, int64_t dim, Tensor & values, Tensor & indices); // {"schema": "aten::cummax.out(Tensor self, int dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor,Tensor> cummax(const Tensor & self, Dimname dim); // {"schema": "aten::cummax.dimname(Tensor self, Dimname dim) -> (Tensor values, Tensor indices)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor &,Tensor &> cummax_out(const Tensor & self, Dimname dim, Tensor & values, Tensor & indices); // {"schema": "aten::cummax.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)", "dispatch": "False", "default": "True"}
+void _cummax_helper(const Tensor & self, Tensor & values, Tensor & indices, int64_t dim); // {"schema": "aten::_cummax_helper(Tensor self, Tensor(a!) values, Tensor(b!) indices, int dim) -> ()", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> cummin(const Tensor & self, int64_t dim); // {"schema": "aten::cummin(Tensor self, int dim) -> (Tensor values, Tensor indices)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> cummin_out(const Tensor & self, int64_t dim, Tensor & values, Tensor & indices); // {"schema": "aten::cummin.out(Tensor self, int dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor,Tensor> cummin(const Tensor & self, Dimname dim); // {"schema": "aten::cummin.dimname(Tensor self, Dimname dim) -> (Tensor values, Tensor indices)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor &,Tensor &> cummin_out(const Tensor & self, Dimname dim, Tensor & values, Tensor & indices); // {"schema": "aten::cummin.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)", "dispatch": "False", "default": "True"}
+void _cummin_helper(const Tensor & self, Tensor & values, Tensor & indices, int64_t dim); // {"schema": "aten::_cummin_helper(Tensor self, Tensor(a!) values, Tensor(b!) indices, int dim) -> ()", "dispatch": "True", "default": "False"}
+Tensor cummaxmin_backward(const Tensor & grad, const Tensor & input, const Tensor & indices, int64_t dim); // {"schema": "aten::cummaxmin_backward(Tensor grad, Tensor input, Tensor indices, int dim) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor cumprod(const Tensor & self, int64_t dim, c10::optional<ScalarType> dtype); // {"schema": "aten::cumprod(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & cumprod_(Tensor & self, int64_t dim, c10::optional<ScalarType> dtype); // {"schema": "aten::cumprod_(Tensor(a!) self, int dim, *, ScalarType? dtype=None) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & cumprod_out(const Tensor & self, int64_t dim, c10::optional<ScalarType> dtype, Tensor & out); // {"schema": "aten::cumprod.out(Tensor self, int dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor cumprod(const Tensor & self, Dimname dim, c10::optional<ScalarType> dtype); // {"schema": "aten::cumprod.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & cumprod_(Tensor & self, Dimname dim, c10::optional<ScalarType> dtype); // {"schema": "aten::cumprod_.dimname(Tensor(a!) self, Dimname dim, *, ScalarType? dtype=None) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & cumprod_out(const Tensor & self, Dimname dim, c10::optional<ScalarType> dtype, Tensor & out); // {"schema": "aten::cumprod.dimname_out(Tensor self, Dimname dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor cumprod_backward(const Tensor & grad, const Tensor & input, int64_t dim, const Tensor & output); // {"schema": "aten::cumprod_backward(Tensor grad, Tensor input, int dim, Tensor output) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor cumsum(const Tensor & self, int64_t dim, c10::optional<ScalarType> dtype); // {"schema": "aten::cumsum(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & cumsum_(Tensor & self, int64_t dim, c10::optional<ScalarType> dtype); // {"schema": "aten::cumsum_(Tensor(a!) self, int dim, *, ScalarType? dtype=None) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & cumsum_out(const Tensor & self, int64_t dim, c10::optional<ScalarType> dtype, Tensor & out); // {"schema": "aten::cumsum.out(Tensor self, int dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor cumsum(const Tensor & self, Dimname dim, c10::optional<ScalarType> dtype); // {"schema": "aten::cumsum.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & cumsum_(Tensor & self, Dimname dim, c10::optional<ScalarType> dtype); // {"schema": "aten::cumsum_.dimname(Tensor(a!) self, Dimname dim, *, ScalarType? dtype=None) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & cumsum_out(const Tensor & self, Dimname dim, c10::optional<ScalarType> dtype, Tensor & out); // {"schema": "aten::cumsum.dimname_out(Tensor self, Dimname dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor cumulative_trapezoid(const Tensor & y, const Tensor & x, int64_t dim); // {"schema": "aten::cumulative_trapezoid.x(Tensor y, Tensor x, *, int dim=-1) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor cumulative_trapezoid(const Tensor & y, const Scalar & dx, int64_t dim); // {"schema": "aten::cumulative_trapezoid.dx(Tensor y, *, Scalar dx=1, int dim=-1) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor ctc_loss(const Tensor & log_probs, const Tensor & targets, IntArrayRef input_lengths, IntArrayRef target_lengths, int64_t blank, int64_t reduction, bool zero_infinity); // {"schema": "aten::ctc_loss.IntList(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, int reduction=Mean, bool zero_infinity=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor ctc_loss(const Tensor & log_probs, const Tensor & targets, const Tensor & input_lengths, const Tensor & target_lengths, int64_t blank, int64_t reduction, bool zero_infinity); // {"schema": "aten::ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank=0, int reduction=Mean, bool zero_infinity=False) -> Tensor", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> _ctc_loss(const Tensor & log_probs, const Tensor & targets, IntArrayRef input_lengths, IntArrayRef target_lengths, int64_t blank, bool zero_infinity); // {"schema": "aten::_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> _ctc_loss(const Tensor & log_probs, const Tensor & targets, const Tensor & input_lengths, const Tensor & target_lengths, int64_t blank, bool zero_infinity); // {"schema": "aten::_ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor, Tensor)", "dispatch": "True", "default": "False"}
+Tensor _ctc_loss_backward(const Tensor & grad, const Tensor & log_probs, const Tensor & targets, IntArrayRef input_lengths, IntArrayRef target_lengths, const Tensor & neg_log_likelihood, const Tensor & log_alpha, int64_t blank, bool zero_infinity); // {"schema": "aten::_ctc_loss_backward(Tensor grad, Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, bool zero_infinity=False) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _ctc_loss_backward(const Tensor & grad, const Tensor & log_probs, const Tensor & targets, const Tensor & input_lengths, const Tensor & target_lengths, const Tensor & neg_log_likelihood, const Tensor & log_alpha, int64_t blank, bool zero_infinity); // {"schema": "aten::_ctc_loss_backward.Tensor(Tensor grad, Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, bool zero_infinity=False) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor diag_embed(const Tensor & self, int64_t offset, int64_t dim1, int64_t dim2); // {"schema": "aten::diag_embed(Tensor self, int offset=0, int dim1=-2, int dim2=-1) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor diagflat(const Tensor & self, int64_t offset); // {"schema": "aten::diagflat(Tensor self, int offset=0) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor diagonal(const Tensor & self, int64_t offset, int64_t dim1, int64_t dim2); // {"schema": "aten::diagonal(Tensor(a) self, int offset=0, int dim1=0, int dim2=1) -> Tensor(a)", "dispatch": "True", "default": "True"}
+Tensor linalg_diagonal(const Tensor & A, int64_t offset, int64_t dim1, int64_t dim2); // {"schema": "aten::linalg_diagonal(Tensor(a) A, *, int offset=0, int dim1=-2, int dim2=-1) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor diagonal(const Tensor & self, Dimname outdim, Dimname dim1, Dimname dim2, int64_t offset); // {"schema": "aten::diagonal.Dimname(Tensor(a) self, *, Dimname outdim, Dimname dim1, Dimname dim2, int offset=0) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor diagonal_backward(const Tensor & grad_output, c10::SymIntArrayRef input_sizes, int64_t offset, int64_t dim1, int64_t dim2); // {"schema": "aten::diagonal_backward(Tensor grad_output, SymInt[] input_sizes, int offset, int dim1, int dim2) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & fill_diagonal_(Tensor & self, const Scalar & fill_value, bool wrap); // {"schema": "aten::fill_diagonal_(Tensor(a!) self, Scalar fill_value, bool wrap=False) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor diff(const Tensor & self, int64_t n, int64_t dim, const c10::optional<Tensor> & prepend, const c10::optional<Tensor> & append); // {"schema": "aten::diff(Tensor self, int n=1, int dim=-1, Tensor? prepend=None, Tensor? append=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & diff_out(const Tensor & self, int64_t n, int64_t dim, const c10::optional<Tensor> & prepend, const c10::optional<Tensor> & append, Tensor & out); // {"schema": "aten::diff.out(Tensor self, int n=1, int dim=-1, Tensor? prepend=None, Tensor? append=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+::std::vector<Tensor> gradient(const Tensor & self, const c10::optional<Scalar> & spacing, c10::optional<int64_t> dim, int64_t edge_order); // {"schema": "aten::gradient.scalarint(Tensor self, *, Scalar? spacing=None, int? dim=None, int edge_order=1) -> Tensor[]", "dispatch": "False", "default": "True"}
+::std::vector<Tensor> gradient(const Tensor & self, const Scalar & spacing, IntArrayRef dim, int64_t edge_order); // {"schema": "aten::gradient.scalararray(Tensor self, *, Scalar spacing, int[] dim, int edge_order=1) -> Tensor[]", "dispatch": "False", "default": "True"}
+::std::vector<Tensor> gradient(const Tensor & self, IntArrayRef dim, int64_t edge_order); // {"schema": "aten::gradient.array(Tensor self, *, int[] dim, int edge_order=1) -> Tensor[]", "dispatch": "False", "default": "True"}
+::std::vector<Tensor> gradient(const Tensor & self, ArrayRef<Scalar> spacing, c10::optional<int64_t> dim, int64_t edge_order); // {"schema": "aten::gradient.scalarrayint(Tensor self, *, Scalar[] spacing, int? dim=None, int edge_order=1) -> Tensor[]", "dispatch": "False", "default": "True"}
+::std::vector<Tensor> gradient(const Tensor & self, ArrayRef<Scalar> spacing, IntArrayRef dim, int64_t edge_order); // {"schema": "aten::gradient.scalarrayarray(Tensor self, *, Scalar[] spacing, int[] dim, int edge_order=1) -> Tensor[]", "dispatch": "False", "default": "True"}
+::std::vector<Tensor> gradient(const Tensor & self, TensorList spacing, c10::optional<int64_t> dim, int64_t edge_order); // {"schema": "aten::gradient.tensorarrayint(Tensor self, *, Tensor[] spacing, int? dim=None, int edge_order=1) -> Tensor[]", "dispatch": "False", "default": "True"}
+::std::vector<Tensor> gradient(const Tensor & self, TensorList spacing, IntArrayRef dim, int64_t edge_order); // {"schema": "aten::gradient.tensorarray(Tensor self, *, Tensor[] spacing, int[] dim, int edge_order=1) -> Tensor[]", "dispatch": "False", "default": "True"}
+Tensor div(const Tensor & self, const Tensor & other); // {"schema": "aten::div.Tensor(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & div_(Tensor & self, const Tensor & other); // {"schema": "aten::div_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & div_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::div.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor div(const Tensor & self, const Tensor & other, c10::optional<c10::string_view> rounding_mode); // {"schema": "aten::div.Tensor_mode(Tensor self, Tensor other, *, str? rounding_mode) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & div_(Tensor & self, const Tensor & other, c10::optional<c10::string_view> rounding_mode); // {"schema": "aten::div_.Tensor_mode(Tensor(a!) self, Tensor other, *, str? rounding_mode) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & div_out(const Tensor & self, const Tensor & other, c10::optional<c10::string_view> rounding_mode, Tensor & out); // {"schema": "aten::div.out_mode(Tensor self, Tensor other, *, str? rounding_mode, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor div(const Tensor & self, const Scalar & other); // {"schema": "aten::div.Scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & div_(Tensor & self, const Scalar & other); // {"schema": "aten::div_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor div(const Tensor & self, const Scalar & other, c10::optional<c10::string_view> rounding_mode); // {"schema": "aten::div.Scalar_mode(Tensor self, Scalar other, *, str? rounding_mode) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & div_(Tensor & self, const Scalar & other, c10::optional<c10::string_view> rounding_mode); // {"schema": "aten::div_.Scalar_mode(Tensor(a!) self, Scalar other, *, str? rounding_mode) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor divide(const Tensor & self, const Tensor & other); // {"schema": "aten::divide.Tensor(Tensor self, Tensor other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & divide_(Tensor & self, const Tensor & other); // {"schema": "aten::divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & divide_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor divide(const Tensor & self, const Scalar & other); // {"schema": "aten::divide.Scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & divide_(Tensor & self, const Scalar & other); // {"schema": "aten::divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor divide(const Tensor & self, const Tensor & other, c10::optional<c10::string_view> rounding_mode); // {"schema": "aten::divide.Tensor_mode(Tensor self, Tensor other, *, str? rounding_mode) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & divide_(Tensor & self, const Tensor & other, c10::optional<c10::string_view> rounding_mode); // {"schema": "aten::divide_.Tensor_mode(Tensor(a!) self, Tensor other, *, str? rounding_mode) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & divide_out(const Tensor & self, const Tensor & other, c10::optional<c10::string_view> rounding_mode, Tensor & out); // {"schema": "aten::divide.out_mode(Tensor self, Tensor other, *, str? rounding_mode, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor divide(const Tensor & self, const Scalar & other, c10::optional<c10::string_view> rounding_mode); // {"schema": "aten::divide.Scalar_mode(Tensor self, Scalar other, *, str? rounding_mode) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & divide_(Tensor & self, const Scalar & other, c10::optional<c10::string_view> rounding_mode); // {"schema": "aten::divide_.Scalar_mode(Tensor(a!) self, Scalar other, *, str? rounding_mode) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor true_divide(const Tensor & self, const Tensor & other); // {"schema": "aten::true_divide.Tensor(Tensor self, Tensor other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & true_divide_(Tensor & self, const Tensor & other); // {"schema": "aten::true_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & true_divide_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::true_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor true_divide(const Tensor & self, const Scalar & other); // {"schema": "aten::true_divide.Scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & true_divide_(Tensor & self, const Scalar & other); // {"schema": "aten::true_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor dot(const Tensor & self, const Tensor & tensor); // {"schema": "aten::dot(Tensor self, Tensor tensor) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & dot_out(const Tensor & self, const Tensor & tensor, Tensor & out); // {"schema": "aten::dot.out(Tensor self, Tensor tensor, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor vdot(const Tensor & self, const Tensor & other); // {"schema": "aten::vdot(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & vdot_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::vdot.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor einsum(c10::string_view equation, TensorList tensors, OptionalIntArrayRef path); // {"schema": "aten::einsum(str equation, Tensor[] tensors, *, int[]? path=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor embedding(const Tensor & weight, const Tensor & indices, c10::SymInt padding_idx, bool scale_grad_by_freq, bool sparse); // {"schema": "aten::embedding(Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor embedding_backward(const Tensor & grad, const Tensor & indices, c10::SymInt num_weights, c10::SymInt padding_idx, bool scale_grad_by_freq, bool sparse); // {"schema": "aten::embedding_backward(Tensor grad, Tensor indices, SymInt num_weights, SymInt padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor embedding_dense_backward(const Tensor & grad_output, const Tensor & indices, c10::SymInt num_weights, c10::SymInt padding_idx, bool scale_grad_by_freq); // {"schema": "aten::embedding_dense_backward(Tensor grad_output, Tensor indices, SymInt num_weights, SymInt padding_idx, bool scale_grad_by_freq) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & embedding_renorm_(Tensor & self, const Tensor & indices, double max_norm, double norm_type); // {"schema": "aten::embedding_renorm_(Tensor(a!) self, Tensor indices, float max_norm, float norm_type) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor embedding_sparse_backward(const Tensor & grad, const Tensor & indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq); // {"schema": "aten::embedding_sparse_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq) -> Tensor", "dispatch": "False", "default": "True"}
+::std::tuple _embedding_bag_forward_only(const Tensor & weight, const Tensor & indices, const Tensor & offsets, bool scale_grad_by_freq, int64_t mode, bool sparse, const c10::optional & per_sample_weights, bool include_last_offset, int64_t padding_idx); // {"schema": "aten::_embedding_bag_forward_only(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False, int padding_idx=-1) -> (Tensor, Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple _rowwise_prune(const Tensor & weight, const Tensor & mask, ScalarType compressed_indices_dtype); // {"schema": "aten::_rowwise_prune(Tensor weight, Tensor mask, ScalarType compressed_indices_dtype) -> (Tensor, Tensor)", "dispatch": "False", "default": "True"}
+Tensor row_stack(TensorList tensors); // {"schema": "aten::row_stack(Tensor[] tensors) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & row_stack_out(TensorList tensors, Tensor & out); // {"schema": "aten::row_stack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+::std::tuple embedding_bag(const Tensor & weight, const Tensor & indices, const Tensor & offsets, bool scale_grad_by_freq, int64_t mode, bool sparse, const c10::optional & per_sample_weights, bool include_last_offset); // {"schema": "aten::embedding_bag(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False) -> (Tensor, Tensor, Tensor, Tensor)", "dispatch": "False", "default": "True"}
+::std::tuple embedding_bag(const Tensor & weight, const Tensor & indices, const Tensor & offsets, bool scale_grad_by_freq, int64_t mode, bool sparse, const c10::optional & per_sample_weights, bool include_last_offset, c10::optional padding_idx); // {"schema": "aten::embedding_bag.padding_idx(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq, int mode, bool sparse, Tensor? per_sample_weights, bool include_last_offset, int? padding_idx) -> (Tensor, Tensor, Tensor, Tensor)", "dispatch": "False", "default": "True"}
+::std::tuple _embedding_bag(const Tensor & weight, const Tensor & indices, const Tensor & offsets, bool scale_grad_by_freq, int64_t mode, bool sparse, const c10::optional & per_sample_weights, bool include_last_offset, int64_t padding_idx); // {"schema": "aten::_embedding_bag(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False, int padding_idx=-1) -> (Tensor, Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+Tensor _embedding_bag_backward(const Tensor & grad, const Tensor & indices, const Tensor & offsets, const Tensor & offset2bag, const Tensor & bag_size, const Tensor & maximum_indices, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, bool sparse, const c10::optional & per_sample_weights, int64_t padding_idx); // {"schema": "aten::_embedding_bag_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, bool sparse, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _embedding_bag_sparse_backward(const Tensor & grad, const Tensor & indices, const Tensor & offsets, const Tensor & offset2bag, const Tensor & bag_size, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const c10::optional & per_sample_weights, int64_t padding_idx); // {"schema": "aten::_embedding_bag_sparse_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _embedding_bag_dense_backward(const Tensor & grad, const Tensor & indices, const Tensor & offset2bag, const Tensor & bag_size, const Tensor & maximum_indices, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const c10::optional & per_sample_weights, int64_t padding_idx); // {"schema": "aten::_embedding_bag_dense_backward(Tensor grad, Tensor indices, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _embedding_bag_per_sample_weights_backward(const Tensor & grad, const Tensor & weight, const Tensor & indices, const Tensor & offsets, const Tensor & offset2bag, int64_t mode, int64_t padding_idx); // {"schema": "aten::_embedding_bag_per_sample_weights_backward(Tensor grad, Tensor weight, Tensor indices, Tensor offsets, Tensor offset2bag, int mode, int padding_idx=-1) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor empty(IntArrayRef size, c10::optional names, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory, c10::optional memory_format); // {"schema": "aten::empty.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor empty(c10::SymIntArrayRef size, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory, c10::optional memory_format); // {"schema": "aten::empty.memory_format(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor empty_permuted(c10::SymIntArrayRef size, IntArrayRef physical_layout, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory); // {"schema": "aten::empty_permuted(SymInt[] size, int[] physical_layout, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor new_empty(const Tensor & self, c10::SymIntArrayRef size, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory); // {"schema": "aten::new_empty(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor new_empty_strided(const Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory); // {"schema": "aten::new_empty_strided(Tensor self, SymInt[] size, SymInt[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor new_full(const Tensor & self, c10::SymIntArrayRef size, const Scalar & fill_value, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory); // {"schema": "aten::new_full(Tensor self, SymInt[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor new_zeros(const Tensor & self, c10::SymIntArrayRef size, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory); // {"schema": "aten::new_zeros(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor new_ones(const Tensor & self, c10::SymIntArrayRef size, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory); // {"schema": "aten::new_ones(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _empty_affine_quantized(c10::SymIntArrayRef size, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory, double scale, int64_t zero_point, c10::optional memory_format); // {"schema": "aten::_empty_affine_quantized(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, float scale=1, int zero_point=0, MemoryFormat? memory_format=contiguous_format) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _empty_per_channel_affine_quantized(c10::SymIntArrayRef size, const Tensor & scales, const Tensor & zero_points, int64_t axis, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory, c10::optional memory_format); // {"schema": "aten::_empty_per_channel_affine_quantized(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=contiguous_format) -> Tensor", "dispatch": "True", "default": "False"}
+const Tensor & resize_(const Tensor & self, c10::SymIntArrayRef size, c10::optional memory_format); // {"schema": "aten::resize_(Tensor(a!) self, SymInt[] size, *, MemoryFormat? memory_format=None) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+const Tensor & _resize_output_(const Tensor & self, c10::SymIntArrayRef size, Device device); // {"schema": "aten::_resize_output_(Tensor(a!) self, SymInt[] size, Device device) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor empty_quantized(IntArrayRef size, const Tensor & qtensor, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory, c10::optional memory_format); // {"schema": "aten::empty_quantized(int[] size, Tensor qtensor, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & empty_out(c10::SymIntArrayRef size, c10::optional memory_format, Tensor & out); // {"schema": "aten::empty.out(SymInt[] size, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor empty_like(const Tensor & self, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory, c10::optional memory_format); // {"schema": "aten::empty_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor empty_strided(c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory); // {"schema": "aten::empty_strided(SymInt[] size, SymInt[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor erf(const Tensor & self); // {"schema": "aten::erf(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & erf_(Tensor & self); // {"schema": "aten::erf_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & erf_out(const Tensor & self, Tensor & out); // {"schema": "aten::erf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor erfc(const Tensor & self); // {"schema": "aten::erfc(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & erfc_(Tensor & self); // {"schema": "aten::erfc_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & erfc_out(const Tensor & self, Tensor & out); // {"schema": "aten::erfc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor exp(const Tensor & self); // {"schema": "aten::exp(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & exp_(Tensor & self); // {"schema": "aten::exp_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & exp_out(const Tensor & self, Tensor & out); // {"schema": "aten::exp.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor exp2(const Tensor & self); // {"schema": "aten::exp2(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & exp2_(Tensor & self); // {"schema": "aten::exp2_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & exp2_out(const Tensor & self, Tensor & out); // {"schema": "aten::exp2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor expm1(const Tensor & self); // {"schema": "aten::expm1(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & expm1_(Tensor & self); // {"schema": "aten::expm1_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & expm1_out(const Tensor & self, Tensor & out); // {"schema": "aten::expm1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor expand(const Tensor & self, c10::SymIntArrayRef size, bool implicit); // {"schema": "aten::expand(Tensor(a) self, SymInt[] size, *, bool implicit=False) -> Tensor(a)", "dispatch": "True", "default": "True"}
+Tensor expand_as(const Tensor & self, const Tensor & other); // {"schema": "aten::expand_as(Tensor(a) self, Tensor other) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor eye(c10::SymInt n, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::eye(SymInt n, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor eye(c10::SymInt n, c10::SymInt m, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::eye.m(SymInt n, SymInt m, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & eye_out(c10::SymInt n, Tensor & out); // {"schema": "aten::eye.out(SymInt n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & eye_out(c10::SymInt n, c10::SymInt m, Tensor & out); // {"schema": "aten::eye.m_out(SymInt n, SymInt m, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor flatten(const Tensor & self, int64_t start_dim, int64_t end_dim); // {"schema": "aten::flatten.using_ints(Tensor(a) self, int start_dim=0, int end_dim=-1) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor flatten(const Tensor & self, int64_t start_dim, int64_t end_dim, Dimname out_dim); // {"schema": "aten::flatten.named_out_dim(Tensor(a) self, int start_dim, int end_dim, Dimname out_dim) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor flatten(const Tensor & self, Dimname start_dim, Dimname end_dim, Dimname out_dim); // {"schema": "aten::flatten.using_names(Tensor(a) self, Dimname start_dim, Dimname end_dim, Dimname out_dim) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor flatten(const Tensor & self, DimnameList dims, Dimname out_dim); // {"schema": "aten::flatten.DimnameList(Tensor(a) self, Dimname[] dims, Dimname out_dim) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor unflatten(const Tensor & self, int64_t dim, c10::SymIntArrayRef sizes); // {"schema": "aten::unflatten.int(Tensor(a) self, int dim, SymInt[] sizes) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor unflatten(const Tensor & self, Dimname dim, c10::SymIntArrayRef sizes, DimnameList names); // {"schema": "aten::unflatten.Dimname(Tensor(a) self, Dimname dim, SymInt[] sizes, Dimname[] names) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor fill(const Tensor & self, const Scalar & value); // {"schema": "aten::fill.Scalar(Tensor self, Scalar value) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor fill(const Tensor & self, const Tensor & value); // {"schema": "aten::fill.Tensor(Tensor self, Tensor value) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & fill_(Tensor & self, const Scalar & value); // {"schema": "aten::fill_.Scalar(Tensor(a!) self, Scalar value) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & fill_(Tensor & self, const Tensor & value); // {"schema": "aten::fill_.Tensor(Tensor(a!) self, Tensor value) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor floor(const Tensor & self); // {"schema": "aten::floor(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & floor_(Tensor & self); // {"schema": "aten::floor_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & floor_out(const Tensor & self, Tensor & out); // {"schema": "aten::floor.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor floor_divide(const Tensor & self, const Tensor & other); // {"schema": "aten::floor_divide(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & floor_divide_(Tensor & self, const Tensor & other); // {"schema": "aten::floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & floor_divide_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor floor_divide(const Tensor & self, const Scalar & other); // {"schema": "aten::floor_divide.Scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & floor_divide_(Tensor & self, const Scalar & other); // {"schema": "aten::floor_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor frac(const Tensor & self); // {"schema": "aten::frac(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & frac_(Tensor & self); // {"schema": "aten::frac_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & frac_out(const Tensor & self, Tensor & out); // {"schema": "aten::frac.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor full(IntArrayRef size, const Scalar & fill_value, c10::optional names, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory); // {"schema": "aten::full.names(int[] size, Scalar fill_value, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor full(c10::SymIntArrayRef size, const Scalar & fill_value, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory); // {"schema": "aten::full(SymInt[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & full_out(c10::SymIntArrayRef size, const Scalar & fill_value, Tensor & out); // {"schema": "aten::full.out(SymInt[] size, Scalar fill_value, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor full_like(const Tensor & self, const Scalar & fill_value, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory, c10::optional memory_format); // {"schema": "aten::full_like(Tensor self, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor from_file(c10::string_view filename, c10::optional shared, c10::optional size, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory); // {"schema": "aten::from_file(str filename, bool? shared=None, int? size=0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & gcd_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::gcd.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor gcd(const Tensor & self, const Tensor & other); // {"schema": "aten::gcd(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & gcd_(Tensor & self, const Tensor & other); // {"schema": "aten::gcd_(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & lcm_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::lcm.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor lcm(const Tensor & self, const Tensor & other); // {"schema": "aten::lcm(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & lcm_(Tensor & self, const Tensor & other); // {"schema": "aten::lcm_(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor grid_sampler(const Tensor & input, const Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners); // {"schema": "aten::grid_sampler(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor grid_sampler_2d(const Tensor & input, const Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners); // {"schema": "aten::grid_sampler_2d(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> grid_sampler_2d_backward(const Tensor & grad_output, const Tensor & input, const Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, ::std::array<bool,2> output_mask); // {"schema": "aten::grid_sampler_2d_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, bool[2] output_mask) -> (Tensor, Tensor)", "dispatch": "True", "default": "False"}
+Tensor _grid_sampler_2d_cpu_fallback(const Tensor & input, const Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners); // {"schema": "aten::_grid_sampler_2d_cpu_fallback(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor,Tensor> _grid_sampler_2d_cpu_fallback_backward(const Tensor & grad_output, const Tensor & input, const Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners); // {"schema": "aten::_grid_sampler_2d_cpu_fallback_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> (Tensor, Tensor)", "dispatch": "False", "default": "True"}
+Tensor grid_sampler_3d(const Tensor & input, const Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners); // {"schema": "aten::grid_sampler_3d(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> grid_sampler_3d_backward(const Tensor & grad_output, const Tensor & input, const Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, ::std::array<bool,2> output_mask); // {"schema": "aten::grid_sampler_3d_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, bool[2] output_mask) -> (Tensor, Tensor)", "dispatch": "True", "default": "False"}
+Tensor hann_window(int64_t window_length, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::hann_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor hann_window(int64_t window_length, bool periodic, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::hann_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor hamming_window(int64_t window_length, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::hamming_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor hamming_window(int64_t window_length, bool periodic, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::hamming_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor hamming_window(int64_t window_length, bool periodic, double alpha, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::hamming_window.periodic_alpha(int window_length, bool periodic, float alpha, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor hamming_window(int64_t window_length, bool periodic, double alpha, double beta, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::hamming_window.periodic_alpha_beta(int window_length, bool periodic, float alpha, float beta, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor kaiser_window(int64_t window_length, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::kaiser_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor kaiser_window(int64_t window_length, bool periodic, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::kaiser_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor kaiser_window(int64_t window_length, bool periodic, double beta, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::kaiser_window.beta(int window_length, bool periodic, float beta, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor hinge_embedding_loss(const Tensor & self, const Tensor & target, double margin, int64_t reduction); // {"schema": "aten::hinge_embedding_loss(Tensor self, Tensor target, float margin=1.0, int reduction=Mean) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor group_norm(const Tensor & input, int64_t num_groups, const c10::optional<Tensor> & weight, const c10::optional<Tensor> & bias, double eps, bool cudnn_enabled); // {"schema": "aten::group_norm(Tensor input, int num_groups, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enabled=True) -> Tensor", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor,Tensor> native_group_norm(const Tensor & input, const c10::optional<Tensor> & weight, const c10::optional<Tensor> & bias, c10::SymInt N, c10::SymInt C, c10::SymInt HxW, int64_t group, double eps); // {"schema": "aten::native_group_norm(Tensor input, Tensor? weight, Tensor? bias, SymInt N, SymInt C, SymInt HxW, int group, float eps) -> (Tensor, Tensor, Tensor)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor,Tensor,Tensor> native_group_norm_backward(const Tensor & grad_out, const Tensor & input, const Tensor & mean, const Tensor & rstd, const c10::optional<Tensor> & weight, c10::SymInt N, c10::SymInt C, c10::SymInt HxW, int64_t group, ::std::array<bool,3> output_mask); // {"schema": "aten::native_group_norm_backward(Tensor grad_out, Tensor input, Tensor mean, Tensor rstd, Tensor? weight, SymInt N, SymInt C, SymInt HxW, int group, bool[3] output_mask) -> (Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+Tensor _fft_r2c(const Tensor & self, IntArrayRef dim, int64_t normalization, bool onesided); // {"schema": "aten::_fft_r2c(Tensor self, int[] dim, int normalization, bool onesided) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & _fft_r2c_out(const Tensor & self, IntArrayRef dim, int64_t normalization, bool onesided, Tensor & out); // {"schema": "aten::_fft_r2c.out(Tensor self, int[] dim, int normalization, bool onesided, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor _fft_c2r(const Tensor & self, IntArrayRef dim, int64_t normalization, c10::SymInt last_dim_size); // {"schema": "aten::_fft_c2r(Tensor self, int[] dim, int normalization, SymInt last_dim_size) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & _fft_c2r_out(const Tensor & self, IntArrayRef dim, int64_t normalization, c10::SymInt last_dim_size, Tensor & out); // {"schema": "aten::_fft_c2r.out(Tensor self, int[] dim, int normalization, SymInt last_dim_size, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor _fft_c2c(const Tensor & self, c10::SymIntArrayRef dim, int64_t normalization, bool forward); // {"schema": "aten::_fft_c2c(Tensor self, SymInt[] dim, int normalization, bool forward) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & _fft_c2c_out(const Tensor & self, c10::SymIntArrayRef dim, int64_t normalization, bool forward, Tensor & out); // {"schema": "aten::_fft_c2c.out(Tensor self, SymInt[] dim, int normalization, bool forward, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+void _validate_compressed_sparse_indices(bool is_crow, const Tensor & compressed_idx, const Tensor & plain_idx, int64_t cdim, int64_t dim, int64_t nnz); // {"schema": "aten::_validate_compressed_sparse_indices(bool is_crow, Tensor compressed_idx, Tensor plain_idx, int cdim, int dim, int nnz) -> ()", "dispatch": "True", "default": "False"}
+int64_t _cufft_get_plan_cache_size(DeviceIndex device_index); // {"schema": "aten::_cufft_get_plan_cache_size(DeviceIndex device_index) -> int", "dispatch": "False", "default": "True"}
+int64_t _cufft_get_plan_cache_max_size(DeviceIndex device_index); // {"schema": "aten::_cufft_get_plan_cache_max_size(DeviceIndex device_index) -> int", "dispatch": "False", "default": "True"}
+void _cufft_set_plan_cache_max_size(DeviceIndex device_index, int64_t max_size); // {"schema": "aten::_cufft_set_plan_cache_max_size(DeviceIndex device_index, int max_size) -> ()", "dispatch": "False", "default": "True"}
+void _cufft_clear_plan_cache(DeviceIndex device_index); // {"schema": "aten::_cufft_clear_plan_cache(DeviceIndex device_index) -> ()", "dispatch": "False", "default": "True"}
+Tensor index(const Tensor & self, const c10::List<c10::optional<Tensor>> & indices); // {"schema": "aten::index.Tensor(Tensor self, Tensor?[] indices) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & index_out(const Tensor & self, const c10::List<c10::optional<Tensor>> & indices, Tensor & out); // {"schema": "aten::index.Tensor_out(Tensor self, Tensor?[] indices, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor _unsafe_index(const Tensor & self, const c10::List<c10::optional<Tensor>> & indices); // {"schema": "aten::_unsafe_index.Tensor(Tensor self, Tensor?[] indices) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & index_copy_out(const Tensor & self, int64_t dim, const Tensor & index, const Tensor & source, Tensor & out); // {"schema": "aten::index_copy.out(Tensor self, int dim, Tensor index, Tensor source, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source); // {"schema": "aten::index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor index_copy(const Tensor & self, int64_t dim, const Tensor & index, const Tensor & source); // {"schema": "aten::index_copy(Tensor self, int dim, Tensor index, Tensor source) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & index_copy_(Tensor & self, Dimname dim, const Tensor & index, const Tensor & source); // {"schema": "aten::index_copy_.dimname(Tensor(a!) self, Dimname dim, Tensor index, Tensor source) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor index_copy(const Tensor & self, Dimname dim, const Tensor & index, const Tensor & source); // {"schema": "aten::index_copy.dimname(Tensor self, Dimname dim, Tensor index, Tensor source) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & index_put_(Tensor & self, const c10::List<c10::optional<Tensor>> & indices, const Tensor & values, bool accumulate); // {"schema": "aten::index_put_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor index_put(const Tensor & self, const c10::List<c10::optional<Tensor>> & indices, const Tensor & values, bool accumulate); // {"schema": "aten::index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _unsafe_index_put(const Tensor & self, const c10::List<c10::optional<Tensor>> & indices, const Tensor & values, bool accumulate); // {"schema": "aten::_unsafe_index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & _index_put_impl_(Tensor & self, const c10::List<c10::optional<Tensor>> & indices, const Tensor & values, bool accumulate, bool unsafe); // {"schema": "aten::_index_put_impl_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor instance_norm(const Tensor & input, const c10::optional<Tensor> & weight, const c10::optional<Tensor> & bias, const c10::optional<Tensor> & running_mean, const c10::optional<Tensor> & running_var, bool use_input_stats, double momentum, double eps, bool cudnn_enabled); // {"schema": "aten::instance_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool use_input_stats, float momentum, float eps, bool cudnn_enabled) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor isclose(const Tensor & self, const Tensor & other, double rtol, double atol, bool equal_nan); // {"schema": "aten::isclose(Tensor self, Tensor other, float rtol=1e-05, float atol=1e-08, bool equal_nan=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & isin_out(const Tensor & elements, const Tensor & test_elements, bool assume_unique, bool invert, Tensor & out); // {"schema": "aten::isin.Tensor_Tensor_out(Tensor elements, Tensor test_elements, *, bool assume_unique=False, bool invert=False, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor isin(const Tensor & elements, const Tensor & test_elements, bool assume_unique, bool invert); // {"schema": "aten::isin.Tensor_Tensor(Tensor elements, Tensor test_elements, *, bool assume_unique=False, bool invert=False) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & isin_out(const Tensor & elements, const Scalar & test_element, bool assume_unique, bool invert, Tensor & out); // {"schema": "aten::isin.Tensor_Scalar_out(Tensor elements, Scalar test_element, *, bool assume_unique=False, bool invert=False, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor isin(const Tensor & elements, const Scalar & test_element, bool assume_unique, bool invert); // {"schema": "aten::isin.Tensor_Scalar(Tensor elements, Scalar test_element, *, bool assume_unique=False, bool invert=False) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & isin_out(const Scalar & element, const Tensor & test_elements, bool assume_unique, bool invert, Tensor & out); // {"schema": "aten::isin.Scalar_Tensor_out(Scalar element, Tensor test_elements, *, bool assume_unique=False, bool invert=False, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor isin(const Scalar & element, const Tensor & test_elements, bool assume_unique, bool invert); // {"schema": "aten::isin.Scalar_Tensor(Scalar element, Tensor test_elements, *, bool assume_unique=False, bool invert=False) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor isnan(const Tensor & self); // {"schema": "aten::isnan(Tensor self) -> Tensor", "dispatch": "True", "default": "False"}
+bool is_distributed(const Tensor & self); // {"schema": "aten::is_distributed(Tensor self) -> bool", "dispatch": "False", "default": "True"}
+bool is_floating_point(const Tensor & self); // {"schema": "aten::is_floating_point(Tensor self) -> bool", "dispatch": "False", "default": "True"}
+bool is_complex(const Tensor & self); // {"schema": "aten::is_complex(Tensor self) -> bool", "dispatch": "False", "default": "True"}
+bool is_conj(const Tensor & self); // {"schema": "aten::is_conj(Tensor self) -> bool", "dispatch": "False", "default": "True"}
+bool _is_zerotensor(const Tensor & self); // {"schema": "aten::_is_zerotensor(Tensor self) -> bool", "dispatch": "False", "default": "True"}
+bool is_neg(const Tensor & self); // {"schema": "aten::is_neg(Tensor self) -> bool", "dispatch": "False", "default": "True"}
+Tensor isreal(const Tensor & self); // {"schema": "aten::isreal(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+bool is_nonzero(const Tensor & self); // {"schema": "aten::is_nonzero(Tensor self) -> bool", "dispatch": "False", "default": "True"}
+bool is_same_size(const Tensor & self, const Tensor & other); // {"schema": "aten::is_same_size(Tensor self, Tensor other) -> bool", "dispatch": "True", "default": "True"}
+bool is_signed(const Tensor & self); // {"schema": "aten::is_signed(Tensor self) -> bool", "dispatch": "False", "default": "True"}
+bool is_inference(const Tensor & self); // {"schema": "aten::is_inference(Tensor self) -> bool", "dispatch": "False", "default": "True"}
+Tensor kl_div(const Tensor & self, const Tensor & target, int64_t reduction, bool log_target); // {"schema": "aten::kl_div(Tensor self, Tensor target, int reduction=Mean, *, bool log_target=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor kron(const Tensor & self, const Tensor & other); // {"schema": "aten::kron(Tensor self, Tensor other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & kron_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::kron.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> kthvalue(const Tensor & self, int64_t k, int64_t dim, bool keepdim); // {"schema": "aten::kthvalue(Tensor self, int k, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> kthvalue_out(const Tensor & self, int64_t k, int64_t dim, bool keepdim, Tensor & values, Tensor & indices); // {"schema": "aten::kthvalue.values(Tensor self, int k, int dim=-1, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> kthvalue(const Tensor & self, int64_t k, Dimname dim, bool keepdim); // {"schema": "aten::kthvalue.dimname(Tensor self, int k, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor &,Tensor &> kthvalue_out(const Tensor & self, int64_t k, Dimname dim, bool keepdim, Tensor & values, Tensor & indices); // {"schema": "aten::kthvalue.dimname_out(Tensor self, int k, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)", "dispatch": "False", "default": "True"}
+Tensor layer_norm(const Tensor & input, c10::SymIntArrayRef normalized_shape, const c10::optional<Tensor> & weight, const c10::optional<Tensor> & bias, double eps, bool cudnn_enable); // {"schema": "aten::layer_norm(Tensor input, SymInt[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor,Tensor> native_layer_norm(const Tensor & input, c10::SymIntArrayRef normalized_shape, const c10::optional<Tensor> & weight, const c10::optional<Tensor> & bias, double eps); // {"schema": "aten::native_layer_norm(Tensor input, SymInt[] normalized_shape, Tensor? weight, Tensor? bias, float eps) -> (Tensor, Tensor, Tensor)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor,Tensor,Tensor> native_layer_norm_backward(const Tensor & grad_out, const Tensor & input, c10::SymIntArrayRef normalized_shape, const Tensor & mean, const Tensor & rstd, const c10::optional<Tensor> & weight, const c10::optional<Tensor> & bias, ::std::array<bool,3> output_mask); // {"schema": "aten::native_layer_norm_backward(Tensor grad_out, Tensor input, SymInt[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask) -> (Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+Tensor nan_to_num(const Tensor & self, c10::optional<double> nan, c10::optional<double> posinf, c10::optional<double> neginf); // {"schema": "aten::nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & nan_to_num_(Tensor & self, c10::optional<double> nan, c10::optional<double> posinf, c10::optional<double> neginf); // {"schema": "aten::nan_to_num_(Tensor(a!) self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & nan_to_num_out(const Tensor & self, c10::optional<double> nan, c10::optional<double> posinf, c10::optional<double> neginf, Tensor & out); // {"schema": "aten::nan_to_num.out(Tensor self, float? nan=None, float? posinf=None, float? neginf=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor linear(const Tensor & input, const Tensor & weight, const c10::optional<Tensor> & bias); // {"schema": "aten::linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor,Tensor,Tensor> linear_backward(const Tensor & self, const Tensor & grad_output, const Tensor & weight, ::std::array<bool,3> output_mask); // {"schema": "aten::linear_backward(Tensor self, Tensor grad_output, Tensor weight, bool[3] output_mask) -> (Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+Tensor & linear_out(const Tensor & input, const Tensor & weight, const c10::optional<Tensor> & bias, Tensor & out); // {"schema": "aten::linear.out(Tensor input, Tensor weight, Tensor? bias=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor mkldnn_linear(const Tensor & self, const Tensor & weight, const c10::optional<Tensor> & bias); // {"schema": "aten::mkldnn_linear(Tensor self, Tensor weight, Tensor? bias=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor mkldnn_linear_backward_input(IntArrayRef input_size, const Tensor & grad_output, const Tensor & weight); // {"schema": "aten::mkldnn_linear_backward_input(int[] input_size, Tensor grad_output, Tensor weight) -> Tensor", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> mkldnn_linear_backward_weights(const Tensor & grad_output, const Tensor & input, const Tensor & weight, bool bias_defined); // {"schema": "aten::mkldnn_linear_backward_weights(Tensor grad_output, Tensor input, Tensor weight, bool bias_defined) -> (Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor,Tensor> mkldnn_linear_backward(const Tensor & self, const Tensor & grad_output, const Tensor & weight, ::std::array<bool,3> output_mask); // {"schema": "aten::mkldnn_linear_backward(Tensor self, Tensor grad_output, Tensor weight, bool[3] output_mask) -> (Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+Tensor _cslt_compress(const Tensor & input); // {"schema": "aten::_cslt_compress(Tensor input) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _cslt_sparse_mm(const Tensor & compressed_A, const Tensor & dense_B, const c10::optional<Tensor> & bias, const c10::optional<Tensor> & alpha, c10::optional<ScalarType> out_dtype, bool transpose_result, int64_t alg_id); // {"schema": "aten::_cslt_sparse_mm(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? out_dtype=None, bool transpose_result=False, int alg_id=0) -> Tensor", "dispatch": "True", "default": "False"}
+int64_t _cslt_sparse_mm_search(const Tensor & compressed_A, const Tensor & dense_B, const c10::optional<Tensor> & bias, const c10::optional<Tensor> & alpha, c10::optional<ScalarType> out_dtype, bool transpose_result); // {"schema": "aten::_cslt_sparse_mm_search(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? out_dtype=None, bool transpose_result=False) -> int", "dispatch": "True", "default": "False"}
+Tensor _sparse_semi_structured_linear(const Tensor & input, const Tensor & weight, const Tensor & meta, const c10::optional<Tensor> & bias, c10::optional<c10::string_view> activation, c10::optional<ScalarType> out_dtype); // {"schema": "aten::_sparse_semi_structured_linear(Tensor input, Tensor weight, Tensor meta, *, Tensor? bias=None, str? activation=None, ScalarType? out_dtype=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _mixed_dtypes_linear(const Tensor & input, const Tensor & weight, const Tensor & scale, const c10::optional<Tensor> & bias, c10::optional<c10::string_view> activation); // {"schema": "aten::_mixed_dtypes_linear(Tensor input, Tensor weight, Tensor scale, *, Tensor? bias=None, str? activation=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor fbgemm_linear_int8_weight_fp32_activation(const Tensor & input, const Tensor & weight, const Tensor & packed, const Tensor & col_offsets, const Scalar & weight_scale, const Scalar & weight_zero_point, const Tensor & bias); // {"schema": "aten::fbgemm_linear_int8_weight_fp32_activation(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor fbgemm_linear_int8_weight(const Tensor & input, const Tensor & weight, const Tensor & packed, const Tensor & col_offsets, const Scalar & weight_scale, const Scalar & weight_zero_point, const Tensor & bias); // {"schema": "aten::fbgemm_linear_int8_weight(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor,double,int64_t> fbgemm_linear_quantize_weight(const Tensor & input); // {"schema": "aten::fbgemm_linear_quantize_weight(Tensor input) -> (Tensor, Tensor, float, int)", "dispatch": "False", "default": "True"}
+Tensor fbgemm_pack_gemm_matrix_fp16(const Tensor & input); // {"schema": "aten::fbgemm_pack_gemm_matrix_fp16(Tensor input) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor fbgemm_linear_fp16_weight_fp32_activation(const Tensor & input, const Tensor & packed_weight, const Tensor & bias); // {"schema": "aten::fbgemm_linear_fp16_weight_fp32_activation(Tensor input, Tensor packed_weight, Tensor bias) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor fbgemm_linear_fp16_weight(const Tensor & input, const Tensor & packed_weight, const Tensor & bias); // {"schema": "aten::fbgemm_linear_fp16_weight(Tensor input, Tensor packed_weight, Tensor bias) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor fbgemm_pack_quantized_matrix(const Tensor & input); // {"schema": "aten::fbgemm_pack_quantized_matrix(Tensor input) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor fbgemm_pack_quantized_matrix(const Tensor & input, int64_t K, int64_t N); // {"schema": "aten::fbgemm_pack_quantized_matrix.KN(Tensor input, int K, int N) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor ldexp(const Tensor & self, const Tensor & other); // {"schema": "aten::ldexp.Tensor(Tensor self, Tensor other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & ldexp_(Tensor & self, const Tensor & other); // {"schema": "aten::ldexp_(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & ldexp_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::ldexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor linspace(const Scalar & start, const Scalar & end, int64_t steps, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::linspace(Scalar start, Scalar end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor linspace(const Tensor & start, const Tensor & end, int64_t steps, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::linspace.Tensor_Tensor(Tensor start, Tensor end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor linspace(const Tensor & start, const Scalar & end, int64_t steps, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::linspace.Tensor_Scalar(Tensor start, Scalar end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor linspace(const Scalar & start, const Tensor & end, int64_t steps, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::linspace.Scalar_Tensor(Scalar start, Tensor end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & linspace_out(const Scalar & start, const Scalar & end, int64_t steps, Tensor & out); // {"schema": "aten::linspace.out(Scalar start, Scalar end, int steps, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & linspace_out(const Tensor & start, const Tensor & end, int64_t steps, Tensor & out); // {"schema": "aten::linspace.Tensor_Tensor_out(Tensor start, Tensor end, int steps, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & linspace_out(const Tensor & start, const Scalar & end, int64_t steps, Tensor & out); // {"schema": "aten::linspace.Tensor_Scalar_out(Tensor start, Scalar end, int steps, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & linspace_out(const Scalar & start, const Tensor & end, int64_t steps, Tensor & out); // {"schema": "aten::linspace.Scalar_Tensor_out(Scalar start, Tensor end, int steps, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor log(const Tensor & self); // {"schema": "aten::log(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & log_(Tensor & self); // {"schema": "aten::log_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & log_out(const Tensor & self, Tensor & out); // {"schema": "aten::log.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor log10(const Tensor & self); // {"schema": "aten::log10(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & log10_(Tensor & self); // {"schema": "aten::log10_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & log10_out(const Tensor & self, Tensor & out); // {"schema": "aten::log10.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor log1p(const Tensor & self); // {"schema": "aten::log1p(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & log1p_(Tensor & self); // {"schema": "aten::log1p_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & log1p_out(const Tensor & self, Tensor & out); // {"schema": "aten::log1p.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor log2(const Tensor & self); // {"schema": "aten::log2(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & log2_(Tensor & self); // {"schema": "aten::log2_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & log2_out(const Tensor & self, Tensor & out); // {"schema": "aten::log2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & logaddexp_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::logaddexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor logaddexp(const Tensor & self, const Tensor & other); // {"schema": "aten::logaddexp(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & logaddexp2_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::logaddexp2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor logaddexp2(const Tensor & self, const Tensor & other); // {"schema": "aten::logaddexp2(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor xlogy(const Tensor & self, const Tensor & other); // {"schema": "aten::xlogy.Tensor(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor xlogy(const Scalar & self, const Tensor & other); // {"schema": "aten::xlogy.Scalar_Self(Scalar self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor xlogy(const Tensor & self, const Scalar & other); // {"schema": "aten::xlogy.Scalar_Other(Tensor self, Scalar other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & xlogy_(Tensor & self, const Tensor & other); // {"schema": "aten::xlogy_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & xlogy_(Tensor & self, const Scalar & other); // {"schema": "aten::xlogy_.Scalar_Other(Tensor(a!) self, Scalar other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & xlogy_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::xlogy.OutTensor(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & xlogy_out(const Scalar & self, const Tensor & other, Tensor & out); // {"schema": "aten::xlogy.OutScalar_Self(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & xlogy_out(const Tensor & self, const Scalar & other, Tensor & out); // {"schema": "aten::xlogy.OutScalar_Other(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor logspace(const Scalar & start, const Scalar & end, int64_t steps, double base, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::logspace(Scalar start, Scalar end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor logspace(const Tensor & start, const Tensor & end, int64_t steps, double base, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::logspace.Tensor_Tensor(Tensor start, Tensor end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor logspace(const Tensor & start, const Scalar & end, int64_t steps, double base, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::logspace.Tensor_Scalar(Tensor start, Scalar end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor logspace(const Scalar & start, const Tensor & end, int64_t steps, double base, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::logspace.Scalar_Tensor(Scalar start, Tensor end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & logspace_out(const Scalar & start, const Scalar & end, int64_t steps, double base, Tensor & out); // {"schema": "aten::logspace.out(Scalar start, Scalar end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & logspace_out(const Tensor & start, const Tensor & end, int64_t steps, double base, Tensor & out); // {"schema": "aten::logspace.Tensor_Tensor_out(Tensor start, Tensor end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & logspace_out(const Tensor & start, const Scalar & end, int64_t steps, double base, Tensor & out); // {"schema": "aten::logspace.Tensor_Scalar_out(Tensor start, Scalar end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & logspace_out(const Scalar & start, const Tensor & end, int64_t steps, double base, Tensor & out); // {"schema": "aten::logspace.Scalar_Tensor_out(Scalar start, Tensor end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor log_softmax(const Tensor & self, int64_t dim, c10::optional<ScalarType> dtype); // {"schema": "aten::log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & log_softmax_out(const Tensor & self, int64_t dim, c10::optional<ScalarType> dtype, Tensor & out); // {"schema": "aten::log_softmax.int_out(Tensor self, int dim, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor log_softmax(const Tensor & self, Dimname dim, c10::optional<ScalarType> dtype); // {"schema": "aten::log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _log_softmax(const Tensor & self, int64_t dim, bool half_to_float); // {"schema": "aten::_log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & _log_softmax_out(const Tensor & self, int64_t dim, bool half_to_float, Tensor & out); // {"schema": "aten::_log_softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor _log_softmax_backward_data(const Tensor & grad_output, const Tensor & output, int64_t dim, ScalarType input_dtype); // {"schema": "aten::_log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & _log_softmax_backward_data_out(const Tensor & grad_output, const Tensor & output, int64_t dim, ScalarType input_dtype, Tensor & out); // {"schema": "aten::_log_softmax_backward_data.out(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor _logcumsumexp(const Tensor & self, int64_t dim); // {"schema": "aten::_logcumsumexp(Tensor self, int dim) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & _logcumsumexp_out(const Tensor & self, int64_t dim, Tensor & out); // {"schema": "aten::_logcumsumexp.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor logcumsumexp(const Tensor & self, int64_t dim); // {"schema": "aten::logcumsumexp(Tensor self, int dim) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & logcumsumexp_out(const Tensor & self, int64_t dim, Tensor & out); // {"schema": "aten::logcumsumexp.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor logcumsumexp(const Tensor & self, Dimname dim); // {"schema": "aten::logcumsumexp.dimname(Tensor self, Dimname dim) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & logcumsumexp_out(const Tensor & self, Dimname dim, Tensor & out); // {"schema": "aten::logcumsumexp.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor logsumexp(const Tensor & self, IntArrayRef dim, bool keepdim); // {"schema": "aten::logsumexp(Tensor self, int[1] dim, bool keepdim=False) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & logsumexp_out(const Tensor & self, IntArrayRef dim, bool keepdim, Tensor & out); // {"schema": "aten::logsumexp.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor logsumexp(const Tensor & self, DimnameList dim, bool keepdim); // {"schema": "aten::logsumexp.names(Tensor self, Dimname[1] dim, bool keepdim=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & logsumexp_out(const Tensor & self, DimnameList dim, bool keepdim, Tensor & out); // {"schema": "aten::logsumexp.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor margin_ranking_loss(const Tensor & input1, const Tensor & input2, const Tensor & target, double margin, int64_t reduction); // {"schema": "aten::margin_ranking_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor matmul(const Tensor & self, const Tensor & other); // {"schema": "aten::matmul(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor,Tensor> matmul_backward(const Tensor & grad, const Tensor & self, const Tensor & other, ::std::array<bool,2> mask); // {"schema": "aten::matmul_backward(Tensor grad, Tensor self, Tensor other, bool[2] mask) -> (Tensor, Tensor)", "dispatch": "True", "default": "False"}
+Tensor & matmul_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::matmul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor matrix_power(const Tensor & self, int64_t n); // {"schema": "aten::matrix_power(Tensor self, int n) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & matrix_power_out(const Tensor & self, int64_t n, Tensor & out); // {"schema": "aten::matrix_power.out(Tensor self, int n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor matrix_exp(const Tensor & self); // {"schema": "aten::matrix_exp(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor matrix_exp_backward(const Tensor & self, const Tensor & grad); // {"schema": "aten::matrix_exp_backward(Tensor self, Tensor grad) -> Tensor", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> _aminmax(const Tensor & self); // {"schema": "aten::_aminmax(Tensor self) -> (Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> _aminmax(const Tensor & self, int64_t dim, bool keepdim); // {"schema": "aten::_aminmax.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> aminmax(const Tensor & self, c10::optional<int64_t> dim, bool keepdim); // {"schema": "aten::aminmax(Tensor self, *, int? dim=None, bool keepdim=False) -> (Tensor min, Tensor max)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> aminmax_out(const Tensor & self, c10::optional<int64_t> dim, bool keepdim, Tensor & min, Tensor & max); // {"schema": "aten::aminmax.out(Tensor self, *, int? dim=None, bool keepdim=False, Tensor(a!) min, Tensor(b!) max) -> (Tensor(a!) min, Tensor(b!) max)", "dispatch": "True", "default": "False"}
+Tensor _compute_linear_combination(const Tensor & input, const Tensor & coefficients); // {"schema": "aten::_compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & _compute_linear_combination_out(const Tensor & input, const Tensor & coefficients, Tensor & out); // {"schema": "aten::_compute_linear_combination.out(Tensor input, Tensor coefficients, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> max(const Tensor & self, int64_t dim, bool keepdim); // {"schema": "aten::max.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> max_out(const Tensor & self, int64_t dim, bool keepdim, Tensor & max, Tensor & max_values); // {"schema": "aten::max.dim_max(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> max(const Tensor & self, Dimname dim, bool keepdim); // {"schema": "aten::max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor &,Tensor &> max_out(const Tensor & self, Dimname dim, bool keepdim, Tensor & max, Tensor & max_values); // {"schema": "aten::max.names_dim_max(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices)", "dispatch": "False", "default": "True"}
+Tensor value_selecting_reduction_backward(const Tensor & grad, int64_t dim, const Tensor & indices, c10::SymIntArrayRef sizes, bool keepdim); // {"schema": "aten::value_selecting_reduction_backward(Tensor grad, int dim, Tensor indices, SymInt[] sizes, bool keepdim) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor amax(const Tensor & self, IntArrayRef dim, bool keepdim); // {"schema": "aten::amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & amax_out(const Tensor & self, IntArrayRef dim, bool keepdim, Tensor & out); // {"schema": "aten::amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> max_pool1d_with_indices(const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode); // {"schema": "aten::max_pool1d_with_indices(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)", "dispatch": "False", "default": "True"}
+Tensor max_pool1d(const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode); // {"schema": "aten::max_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor max_pool2d(const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode); // {"schema": "aten::max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor max_pool2d_backward(const Tensor & grad_output, const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode); // {"schema": "aten::max_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor mkldnn_max_pool2d(const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode); // {"schema": "aten::mkldnn_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor mkldnn_max_pool2d_backward(const Tensor & grad_output, const Tensor & output, const Tensor & input, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode); // {"schema": "aten::mkldnn_max_pool2d_backward(Tensor grad_output, Tensor output, Tensor input, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor mkldnn_max_pool3d(const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode); // {"schema": "aten::mkldnn_max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor mkldnn_max_pool3d_backward(const Tensor & grad_output, const Tensor & output, const Tensor & input, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode); // {"schema": "aten::mkldnn_max_pool3d_backward(Tensor grad_output, Tensor output, Tensor input, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor quantized_max_pool1d(const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode); // {"schema": "aten::quantized_max_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor quantized_max_pool2d(const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode); // {"schema": "aten::quantized_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor quantized_max_pool3d(const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode); // {"schema": "aten::quantized_max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor max_pool3d(const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode); // {"schema": "aten::max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor mean(const Tensor & self, c10::optional<ScalarType> dtype); // {"schema": "aten::mean(Tensor self, *, ScalarType? dtype=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor mean(const Tensor & self, OptionalIntArrayRef dim, bool keepdim, c10::optional<ScalarType> dtype); // {"schema": "aten::mean.dim(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & mean_out(const Tensor & self, OptionalIntArrayRef dim, bool keepdim, c10::optional<ScalarType> dtype, Tensor & out); // {"schema": "aten::mean.out(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor mean(const Tensor & self, DimnameList dim, bool keepdim, c10::optional<ScalarType> dtype); // {"schema": "aten::mean.names_dim(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & mean_out(const Tensor & self, DimnameList dim, bool keepdim, c10::optional<ScalarType> dtype, Tensor & out); // {"schema": "aten::mean.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor nanmean(const Tensor & self, OptionalIntArrayRef dim, bool keepdim, c10::optional<ScalarType> dtype); // {"schema": "aten::nanmean(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & nanmean_out(const Tensor & self, OptionalIntArrayRef dim, bool keepdim, c10::optional<ScalarType> dtype, Tensor & out); // {"schema": "aten::nanmean.out(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor median(const Tensor & self); // {"schema": "aten::median(Tensor self) -> Tensor", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> median(const Tensor & self, int64_t dim, bool keepdim); // {"schema": "aten::median.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> median_out(const Tensor & self, int64_t dim, bool keepdim, Tensor & values, Tensor & indices); // {"schema": "aten::median.dim_values(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> median(const Tensor & self, Dimname dim, bool keepdim); // {"schema": "aten::median.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor &,Tensor &> median_out(const Tensor & self, Dimname dim, bool keepdim, Tensor & values, Tensor & indices); // {"schema": "aten::median.names_dim_values(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)", "dispatch": "False", "default": "True"}
+Tensor nanmedian(const Tensor & self); // {"schema": "aten::nanmedian(Tensor self) -> Tensor", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> nanmedian(const Tensor & self, int64_t dim, bool keepdim); // {"schema": "aten::nanmedian.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> nanmedian_out(const Tensor & self, int64_t dim, bool keepdim, Tensor & values, Tensor & indices); // {"schema": "aten::nanmedian.dim_values(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> nanmedian(const Tensor & self, Dimname dim, bool keepdim); // {"schema": "aten::nanmedian.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor &,Tensor &> nanmedian_out(const Tensor & self, Dimname dim, bool keepdim, Tensor & values, Tensor & indices); // {"schema": "aten::nanmedian.names_dim_values(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> min(const Tensor & self, int64_t dim, bool keepdim); // {"schema": "aten::min.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> min_out(const Tensor & self, int64_t dim, bool keepdim, Tensor & min, Tensor & min_indices); // {"schema": "aten::min.dim_min(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> min(const Tensor & self, Dimname dim, bool keepdim); // {"schema": "aten::min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor &,Tensor &> min_out(const Tensor & self, Dimname dim, bool keepdim, Tensor & min, Tensor & min_indices); // {"schema": "aten::min.names_dim_min(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices)", "dispatch": "False", "default": "True"}
+Tensor amin(const Tensor & self, IntArrayRef dim, bool keepdim); // {"schema": "aten::amin(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & amin_out(const Tensor & self, IntArrayRef dim, bool keepdim, Tensor & out); // {"schema": "aten::amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor _mps_convolution(const Tensor & self, const Tensor & weight, const c10::optional<Tensor> & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups); // {"schema": "aten::_mps_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor,Tensor> mps_convolution_backward(const Tensor & self, const Tensor & grad_output, const Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, ::std::array<bool,3> output_mask); // {"schema": "aten::mps_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+Tensor mkldnn_convolution(const Tensor & self, const Tensor & weight, const c10::optional<Tensor> & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups); // {"schema": "aten::mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor,Tensor,Tensor,Tensor> mkldnn_rnn_layer(const Tensor & input, const Tensor & weight0, const Tensor & weight1, const Tensor & weight2, const Tensor & weight3, const Tensor & hx_, const Tensor & cx_, bool reverse, IntArrayRef batch_sizes, int64_t mode, int64_t hidden_size, int64_t num_layers, bool has_biases, bool bidirectional, bool batch_first, bool train); // {"schema": "aten::mkldnn_rnn_layer(Tensor input, Tensor weight0, Tensor weight1, Tensor weight2, Tensor weight3, Tensor hx_, Tensor cx_, bool reverse, int[] batch_sizes, int mode, int hidden_size, int num_layers, bool has_biases, bool bidirectional, bool batch_first, bool train) -> (Tensor, Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor,Tensor,Tensor,Tensor,Tensor,Tensor> mkldnn_rnn_layer_backward(const Tensor & input, const Tensor & weight1, const Tensor & weight2, const Tensor & weight3, const Tensor & weight4, const Tensor & hx_, const Tensor & cx_tmp, const Tensor & output, const Tensor & hy_, const Tensor & cy_, const c10::optional<Tensor> & grad_output, const c10::optional<Tensor> & grad_hy, const c10::optional<Tensor> & grad_cy, bool reverse, int64_t mode, int64_t hidden_size, int64_t num_layers, bool has_biases, bool train, bool bidirectional, IntArrayRef batch_sizes, bool batch_first, const Tensor & workspace); // {"schema": "aten::mkldnn_rnn_layer_backward(Tensor input, Tensor weight1, Tensor weight2, Tensor weight3, Tensor weight4, Tensor hx_, Tensor cx_tmp, Tensor output, Tensor hy_, Tensor cy_, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, bool reverse, int mode, int hidden_size, int num_layers, bool has_biases, bool train, bool bidirectional, int[] batch_sizes, bool batch_first, Tensor workspace) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor,Tensor> miopen_batch_norm(const Tensor & input, const Tensor & weight, const c10::optional<Tensor> & bias, const c10::optional<Tensor> & running_mean, const c10::optional<Tensor> & running_var, bool training, double exponential_average_factor, double epsilon); // {"schema": "aten::miopen_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor,Tensor> miopen_batch_norm_backward(const Tensor & input, const Tensor & grad_output, const Tensor & weight, const c10::optional<Tensor> & running_mean, const c10::optional<Tensor> & running_var, const c10::optional<Tensor> & save_mean, const c10::optional<Tensor> & save_var, double epsilon); // {"schema": "aten::miopen_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, float epsilon) -> (Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+Tensor miopen_convolution(const Tensor & self, const Tensor & weight, const c10::optional<Tensor> & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic); // {"schema": "aten::miopen_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor miopen_convolution_transpose(const Tensor & self, const Tensor & weight, const c10::optional<Tensor> & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic); // {"schema": "aten::miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor miopen_depthwise_convolution(const Tensor & self, const Tensor & weight, const c10::optional<Tensor> & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic); // {"schema": "aten::miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor miopen_convolution_relu(const Tensor & self, const Tensor & weight, const c10::optional<Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups); // {"schema": "aten::miopen_convolution_relu(Tensor self, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor miopen_convolution_add_relu(const Tensor & self, const Tensor & weight, const Tensor & z, const c10::optional<Scalar> & alpha, const c10::optional<Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups); // {"schema": "aten::miopen_convolution_add_relu(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor,Tensor,Tensor,Tensor> miopen_rnn(const Tensor & input, TensorList weight, int64_t weight_stride0, const Tensor & hx, const c10::optional<Tensor> & cx, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntArrayRef batch_sizes, const c10::optional<Tensor> & dropout_state); // {"schema": "aten::miopen_rnn(Tensor input, Tensor[] weight, int weight_stride0, Tensor hx, Tensor? cx, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor,Tensor,::std::vector<Tensor>> miopen_rnn_backward(const Tensor & input, TensorList weight, int64_t weight_stride0, const Tensor & weight_buf, const Tensor & hx, const c10::optional<Tensor> & cx, const Tensor & output, const c10::optional<Tensor> & grad_output, const c10::optional<Tensor> & grad_hy, const c10::optional<Tensor> & grad_cy, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntArrayRef batch_sizes, const c10::optional<Tensor> & dropout_state, const Tensor & reserve, ::std::array<bool,4> output_mask); // {"schema": "aten::miopen_rnn_backward(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask) -> (Tensor, Tensor, Tensor, Tensor[])", "dispatch": "True", "default": "False"}
+Tensor mm(const Tensor & self, const Tensor & mat2); // {"schema": "aten::mm(Tensor self, Tensor mat2) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & mm_out(const Tensor & self, const Tensor & mat2, Tensor & out); // {"schema": "aten::mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor _int_mm(const Tensor & self, const Tensor & mat2); // {"schema": "aten::_int_mm(Tensor self, Tensor mat2) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & _int_mm_out(const Tensor & self, const Tensor & mat2, Tensor & out); // {"schema": "aten::_int_mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor _convert_weight_to_int4pack(const Tensor & self, int64_t innerKTiles); // {"schema": "aten::_convert_weight_to_int4pack(Tensor self, int innerKTiles) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _weight_int4pack_mm(const Tensor & self, const Tensor & mat2, int64_t qGroupSize, const Tensor & qScaleAndZeros); // {"schema": "aten::_weight_int4pack_mm(Tensor self, Tensor mat2, int qGroupSize, Tensor qScaleAndZeros) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _weight_int8pack_mm(const Tensor & self, const Tensor & mat2, const Tensor & scales); // {"schema": "aten::_weight_int8pack_mm(Tensor self, Tensor mat2, Tensor scales) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _sparse_mm(const Tensor & sparse, const Tensor & dense); // {"schema": "aten::_sparse_mm(Tensor sparse, Tensor dense) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _sparse_mm(const Tensor & sparse, const Tensor & dense, c10::string_view reduce); // {"schema": "aten::_sparse_mm.reduce(Tensor sparse, Tensor dense, str reduce) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _sparse_sparse_matmul(const Tensor & self, const Tensor & other); // {"schema": "aten::_sparse_sparse_matmul(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> mode(const Tensor & self, int64_t dim, bool keepdim); // {"schema": "aten::mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor &,Tensor &> mode_out(const Tensor & self, int64_t dim, bool keepdim, Tensor & values, Tensor & indices); // {"schema": "aten::mode.values(Tensor self, int dim=-1, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor,Tensor> mode(const Tensor & self, Dimname dim, bool keepdim); // {"schema": "aten::mode.dimname(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor &,Tensor &> mode_out(const Tensor & self, Dimname dim, bool keepdim, Tensor & values, Tensor & indices); // {"schema": "aten::mode.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)", "dispatch": "False", "default": "True"}
+Tensor mul(const Tensor & self, const Tensor & other); // {"schema": "aten::mul.Tensor(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & mul_(Tensor & self, const Tensor & other); // {"schema": "aten::mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & mul_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor mul(const Tensor & self, const Scalar & other); // {"schema": "aten::mul.Scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & mul_(Tensor & self, const Scalar & other); // {"schema": "aten::mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor multiply(const Tensor & self, const Tensor & other); // {"schema": "aten::multiply.Tensor(Tensor self, Tensor other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & multiply_(Tensor & self, const Tensor & other); // {"schema": "aten::multiply_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & multiply_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::multiply.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor multiply(const Tensor & self, const Scalar & other); // {"schema": "aten::multiply.Scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & multiply_(Tensor & self, const Scalar & other); // {"schema": "aten::multiply_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor mv(const Tensor & self, const Tensor & vec); // {"schema": "aten::mv(Tensor self, Tensor vec) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & mv_out(const Tensor & self, const Tensor & vec, Tensor & out); // {"schema": "aten::mv.out(Tensor self, Tensor vec, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & mvlgamma_out(const Tensor & self, int64_t p, Tensor & out); // {"schema": "aten::mvlgamma.out(Tensor self, int p, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor mvlgamma(const Tensor & self, int64_t p); // {"schema": "aten::mvlgamma(Tensor self, int p) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & mvlgamma_(Tensor & self, int64_t p); // {"schema": "aten::mvlgamma_(Tensor(a!) self, int p) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor narrow_copy(const Tensor & self, int64_t dim, c10::SymInt start, c10::SymInt length); // {"schema": "aten::narrow_copy(Tensor self, int dim, SymInt start, SymInt length) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & narrow_copy_out(const Tensor & self, int64_t dim, c10::SymInt start, c10::SymInt length, Tensor & out); // {"schema": "aten::narrow_copy.out(Tensor self, int dim, SymInt start, SymInt length, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor narrow(const Tensor & self, int64_t dim, c10::SymInt start, c10::SymInt length); // {"schema": "aten::narrow(Tensor(a) self, int dim, SymInt start, SymInt length) -> Tensor(a)", "dispatch": "True", "default": "True"}
+Tensor narrow(const Tensor & self, int64_t dim, const Tensor & start, c10::SymInt length); // {"schema": "aten::narrow.Tensor(Tensor(a) self, int dim, Tensor start, SymInt length) -> Tensor(a)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor,Tensor> native_batch_norm(const Tensor & input, const c10::optional<Tensor> & weight, const c10::optional<Tensor> & bias, const c10::optional<Tensor> & running_mean, const c10::optional<Tensor> & running_var, bool training, double momentum, double eps); // {"schema": "aten::native_batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor &,Tensor &,Tensor &> native_batch_norm_out(const Tensor & input, const c10::optional<Tensor> & weight, const c10::optional<Tensor> & bias, const c10::optional<Tensor> & running_mean, const c10::optional<Tensor> & running_var, bool training, double momentum, double eps, Tensor & out, Tensor & save_mean, Tensor & save_invstd); // {"schema": "aten::native_batch_norm.out(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!))", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor,Tensor> _native_batch_norm_legit(const Tensor & input, const c10::optional<Tensor> & weight, const c10::optional<Tensor> & bias, Tensor & running_mean, Tensor & running_var, bool training, double momentum, double eps); // {"schema": "aten::_native_batch_norm_legit(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor,Tensor> _native_batch_norm_legit_no_training(const Tensor & input, const c10::optional<Tensor> & weight, const c10::optional<Tensor> & bias, const Tensor & running_mean, const Tensor & running_var, double momentum, double eps); // {"schema": "aten::_native_batch_norm_legit_no_training(Tensor input, Tensor? weight, Tensor? bias, Tensor running_mean, Tensor running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &,Tensor &> _native_batch_norm_legit_out(const Tensor & input, const c10::optional<Tensor> & weight, const c10::optional<Tensor> & bias, Tensor & running_mean, Tensor & running_var, bool training, double momentum, double eps, Tensor & out, Tensor & save_mean, Tensor & save_invstd); // {"schema": "aten::_native_batch_norm_legit.out(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, bool training, float momentum, float eps, *, Tensor(d!) out, Tensor(e!) save_mean, Tensor(f!) save_invstd) -> (Tensor(d!), Tensor(e!), Tensor(f!))", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor,Tensor> _native_batch_norm_legit(const Tensor & input, const c10::optional<Tensor> & weight, const c10::optional<Tensor> & bias, bool training, double momentum, double eps); // {"schema": "aten::_native_batch_norm_legit.no_stats(Tensor input, Tensor? weight, Tensor? bias, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor &,Tensor &,Tensor &> _native_batch_norm_legit_out(const Tensor & input, const c10::optional<Tensor> & weight, const c10::optional<Tensor> & bias, bool training, double momentum, double eps, Tensor & out, Tensor & save_mean, Tensor & save_invstd); // {"schema": "aten::_native_batch_norm_legit.no_stats_out(Tensor input, Tensor? weight, Tensor? bias, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!))", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> batch_norm_stats(const Tensor & input, double eps); // {"schema": "aten::batch_norm_stats(Tensor input, float eps) -> (Tensor, Tensor)", "dispatch": "True", "default": "False"}
+Tensor batch_norm_elemt(const Tensor & input, const c10::optional<Tensor> & weight, const c10::optional<Tensor> & bias, const Tensor & mean, const Tensor & invstd, double eps); // {"schema": "aten::batch_norm_elemt(Tensor input, Tensor? weight, Tensor? bias, Tensor mean, Tensor invstd, float eps) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & batch_norm_elemt_out(const Tensor & input, const c10::optional<Tensor> & weight, const c10::optional<Tensor> & bias, const Tensor & mean, const Tensor & invstd, double eps, Tensor & out); // {"schema": "aten::batch_norm_elemt.out(Tensor input, Tensor? weight, Tensor? bias, Tensor mean, Tensor invstd, float eps, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> batch_norm_gather_stats(const Tensor & input, const Tensor & mean, const Tensor & invstd, const c10::optional<Tensor> & running_mean, const c10::optional<Tensor> & running_var, double momentum, double eps, int64_t count); // {"schema": "aten::batch_norm_gather_stats(Tensor input, Tensor mean, Tensor invstd, Tensor? running_mean, Tensor? running_var, float momentum, float eps, int count) -> (Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> batch_norm_gather_stats_with_counts(const Tensor & input, const Tensor & mean, const Tensor & invstd, const c10::optional<Tensor> & running_mean, const c10::optional<Tensor> & running_var, double momentum, double eps, const Tensor & counts); // {"schema": "aten::batch_norm_gather_stats_with_counts(Tensor input, Tensor mean, Tensor invstd, Tensor? running_mean, Tensor? running_var, float momentum, float eps, Tensor counts) -> (Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor,Tensor> native_batch_norm_backward(const Tensor & grad_out, const Tensor & input, const c10::optional<Tensor> & weight, const c10::optional<Tensor> & running_mean, const c10::optional<Tensor> & running_var, const c10::optional<Tensor> & save_mean, const c10::optional<Tensor> & save_invstd, bool train, double eps, ::std::array<bool,3> output_mask); // {"schema": "aten::native_batch_norm_backward(Tensor grad_out, Tensor input, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_invstd, bool train, float eps, bool[3] output_mask) -> (Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor,Tensor,Tensor> batch_norm_backward_reduce(const Tensor & grad_out, const Tensor & input, const Tensor & mean, const Tensor & invstd, const c10::optional<Tensor> & weight, bool input_g, bool weight_g, bool bias_g); // {"schema": "aten::batch_norm_backward_reduce(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, bool input_g, bool weight_g, bool bias_g) -> (Tensor, Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+Tensor batch_norm_backward_elemt(const Tensor & grad_out, const Tensor & input, const Tensor & mean, const Tensor & invstd, const c10::optional<Tensor> & weight, const Tensor & sum_dy, const Tensor & sum_dy_xmu, const Tensor & count); // {"schema": "aten::batch_norm_backward_elemt(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, Tensor sum_dy, Tensor sum_dy_xmu, Tensor count) -> Tensor", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> batch_norm_update_stats(const Tensor & input, const c10::optional<Tensor> & running_mean, const c10::optional<Tensor> & running_var, double momentum); // {"schema": "aten::batch_norm_update_stats(Tensor input, Tensor? running_mean, Tensor? running_var, float momentum) -> (Tensor, Tensor)", "dispatch": "True", "default": "False"}
+bool is_vulkan_available(); // {"schema": "aten::is_vulkan_available() -> bool", "dispatch": "False", "default": "True"}
+bool _nnpack_available(); // {"schema": "aten::_nnpack_available() -> bool", "dispatch": "False", "default": "True"}
+Tensor _nnpack_spatial_convolution(const Tensor & input, const Tensor & weight, const c10::optional<Tensor> & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride); // {"schema": "aten::_nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[2] padding, SymInt[2] stride=1) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor ones(IntArrayRef size, c10::optional<DimnameList> names, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::ones.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor ones(c10::SymIntArrayRef size, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::ones(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & ones_out(c10::SymIntArrayRef size, Tensor & out); // {"schema": "aten::ones.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor ones_like(const Tensor & self, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory, c10::optional<MemoryFormat> memory_format); // {"schema": "aten::ones_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor pairwise_distance(const Tensor & x1, const Tensor & x2, double p, double eps, bool keepdim); // {"schema": "aten::pairwise_distance(Tensor x1, Tensor x2, float p=2, float eps=1e-06, bool keepdim=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor cdist(const Tensor & x1, const Tensor & x2, double p, c10::optional<int64_t> compute_mode); // {"schema": "aten::cdist(Tensor x1, Tensor x2, float p=2, int? compute_mode=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _euclidean_dist(const Tensor & x1, const Tensor & x2); // {"schema": "aten::_euclidean_dist(Tensor x1, Tensor x2) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _cdist_forward(const Tensor & x1, const Tensor & x2, double p, c10::optional<int64_t> compute_mode); // {"schema": "aten::_cdist_forward(Tensor x1, Tensor x2, float p, int? compute_mode) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _cdist_backward(const Tensor & grad, const Tensor & x1, const Tensor & x2, double p, const Tensor & cdist); // {"schema": "aten::_cdist_backward(Tensor grad, Tensor x1, Tensor x2, float p, Tensor cdist) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor pdist(const Tensor & self, double p); // {"schema": "aten::pdist(Tensor self, float p=2) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _pdist_forward(const Tensor & self, double p); // {"schema": "aten::_pdist_forward(Tensor self, float p=2) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _pdist_backward(const Tensor & grad, const Tensor & self, double p, const Tensor & pdist); // {"schema": "aten::_pdist_backward(Tensor grad, Tensor self, float p, Tensor pdist) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor cosine_similarity(const Tensor & x1, const Tensor & x2, int64_t dim, double eps); // {"schema": "aten::cosine_similarity(Tensor x1, Tensor x2, int dim=1, float eps=1e-08) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor permute(const Tensor & self, IntArrayRef dims); // {"schema": "aten::permute(Tensor(a) self, int[] dims) -> Tensor(a)", "dispatch": "True", "default": "True"}
+Tensor movedim(const Tensor & self, IntArrayRef source, IntArrayRef destination); // {"schema": "aten::movedim.intlist(Tensor(a) self, int[] source, int[] destination) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor movedim(const Tensor & self, int64_t source, int64_t destination); // {"schema": "aten::movedim.int(Tensor(a) self, int source, int destination) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor moveaxis(const Tensor & self, IntArrayRef source, IntArrayRef destination); // {"schema": "aten::moveaxis.intlist(Tensor(a) self, int[] source, int[] destination) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor moveaxis(const Tensor & self, int64_t source, int64_t destination); // {"schema": "aten::moveaxis.int(Tensor(a) self, int source, int destination) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor numpy_T(const Tensor & self); // {"schema": "aten::numpy_T(Tensor(a) self) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor matrix_H(const Tensor & self); // {"schema": "aten::matrix_H(Tensor(a) self) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor mT(const Tensor & self); // {"schema": "aten::mT(Tensor(a) self) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor mH(const Tensor & self); // {"schema": "aten::mH(Tensor(a) self) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor adjoint(const Tensor & self); // {"schema": "aten::adjoint(Tensor(a) self) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor pixel_shuffle(const Tensor & self, int64_t upscale_factor); // {"schema": "aten::pixel_shuffle(Tensor self, int upscale_factor) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor pixel_unshuffle(const Tensor & self, int64_t downscale_factor); // {"schema": "aten::pixel_unshuffle(Tensor self, int downscale_factor) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor channel_shuffle(const Tensor & self, c10::SymInt groups); // {"schema": "aten::channel_shuffle(Tensor self, SymInt groups) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor native_channel_shuffle(const Tensor & self, c10::SymInt groups); // {"schema": "aten::native_channel_shuffle(Tensor self, SymInt groups) -> Tensor", "dispatch": "True", "default": "True"}
+bool is_pinned(const Tensor & self, c10::optional<Device> device); // {"schema": "aten::is_pinned(Tensor self, Device? device=None) -> bool", "dispatch": "True", "default": "True"}
+Tensor pin_memory(const Tensor & self, c10::optional<Device> device); // {"schema": "aten::pin_memory(Tensor(a) self, Device? device=None) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor _pin_memory(const Tensor & self, c10::optional<Device> device); // {"schema": "aten::_pin_memory(Tensor self, Device? device=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor pinverse(const Tensor & self, double rcond); // {"schema": "aten::pinverse(Tensor self, float rcond=1e-15) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor poisson_nll_loss(const Tensor & input, const Tensor & target, bool log_input, bool full, double eps, int64_t reduction); // {"schema": "aten::poisson_nll_loss(Tensor input, Tensor target, bool log_input, bool full, float eps, int reduction) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor rad2deg(const Tensor & self); // {"schema": "aten::rad2deg(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & rad2deg_(Tensor & self); // {"schema": "aten::rad2deg_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & rad2deg_out(const Tensor & self, Tensor & out); // {"schema": "aten::rad2deg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor deg2rad(const Tensor & self); // {"schema": "aten::deg2rad(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & deg2rad_(Tensor & self); // {"schema": "aten::deg2rad_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & deg2rad_out(const Tensor & self, Tensor & out); // {"schema": "aten::deg2rad.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor scalar_tensor(const Scalar & s, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::scalar_tensor(Scalar s, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor rand(c10::SymIntArrayRef size, c10::optional<DimnameList> names, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::rand.names(SymInt[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor rand(c10::SymIntArrayRef size, c10::optional<Generator> generator, c10::optional<DimnameList> names, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::rand.generator_with_names(SymInt[] size, *, Generator? generator, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor rand(c10::SymIntArrayRef size, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::rand(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor rand(c10::SymIntArrayRef size, c10::optional<Generator> generator, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::rand.generator(SymInt[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & rand_out(c10::SymIntArrayRef size, Tensor & out); // {"schema": "aten::rand.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & rand_out(c10::SymIntArrayRef size, c10::optional<Generator> generator, Tensor & out); // {"schema": "aten::rand.generator_out(SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor rand_like(const Tensor & self, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory, c10::optional<MemoryFormat> memory_format); // {"schema": "aten::rand_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor randint(c10::SymInt high, c10::SymIntArrayRef size, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::randint(SymInt high, SymInt[] size, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor randint(c10::SymInt high, c10::SymIntArrayRef size, c10::optional<Generator> generator, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::randint.generator(SymInt high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor randint(c10::SymInt low, c10::SymInt high, c10::SymIntArrayRef size, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::randint.low(SymInt low, SymInt high, SymInt[] size, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor randint(c10::SymInt low, c10::SymInt high, c10::SymIntArrayRef size, c10::optional<Generator> generator, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::randint.low_generator(SymInt low, SymInt high, SymInt[] size, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & randint_out(c10::SymInt high, c10::SymIntArrayRef size, Tensor & out); // {"schema": "aten::randint.out(SymInt high, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & randint_out(c10::SymInt high, c10::SymIntArrayRef size, c10::optional<Generator> generator, Tensor & out); // {"schema": "aten::randint.generator_out(SymInt high, SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & randint_out(c10::SymInt low, c10::SymInt high, c10::SymIntArrayRef size, Tensor & out); // {"schema": "aten::randint.low_out(SymInt low, SymInt high, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & randint_out(c10::SymInt low, c10::SymInt high, c10::SymIntArrayRef size, c10::optional<Generator> generator, Tensor & out); // {"schema": "aten::randint.low_generator_out(SymInt low, SymInt high, SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor randint_like(const Tensor & self, c10::SymInt high, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory, c10::optional<MemoryFormat> memory_format); // {"schema": "aten::randint_like(Tensor self, SymInt high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor randint_like(const Tensor & self, c10::SymInt low, c10::SymInt high, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory, c10::optional<MemoryFormat> memory_format); // {"schema": "aten::randint_like.low_dtype(Tensor self, SymInt low, SymInt high, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor randn(c10::SymIntArrayRef size, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::randn(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor randn(c10::SymIntArrayRef size, c10::optional<Generator> generator, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::randn.generator(SymInt[] size, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor randn(c10::SymIntArrayRef size, c10::optional<DimnameList> names, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::randn.names(SymInt[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor randn(c10::SymIntArrayRef size, c10::optional<Generator> generator, c10::optional<DimnameList> names, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::randn.generator_with_names(SymInt[] size, *, Generator? generator, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & randn_out(c10::SymIntArrayRef size, Tensor & out); // {"schema": "aten::randn.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & randn_out(c10::SymIntArrayRef size, c10::optional<Generator> generator, Tensor & out); // {"schema": "aten::randn.generator_out(SymInt[] size, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor randn_like(const Tensor & self, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory, c10::optional<MemoryFormat> memory_format); // {"schema": "aten::randn_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor randperm(c10::SymInt n, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::randperm(SymInt n, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor randperm(c10::SymInt n, c10::optional<Generator> generator, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::randperm.generator(SymInt n, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & randperm_out(c10::SymInt n, Tensor & out); // {"schema": "aten::randperm.out(SymInt n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & randperm_out(c10::SymInt n, c10::optional<Generator> generator, Tensor & out); // {"schema": "aten::randperm.generator_out(SymInt n, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor range(const Scalar & start, const Scalar & end, const Scalar & step, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::range.step(Scalar start, Scalar end, Scalar step=1, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor range(const Scalar & start, const Scalar & end, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::range(Scalar start, Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & range_out(const Scalar & start, const Scalar & end, Tensor & out); // {"schema": "aten::range.out_(Scalar start, Scalar end, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & range_out(const Scalar & start, const Scalar & end, const Scalar & step, Tensor & out); // {"schema": "aten::range.out(Scalar start, Scalar end, Scalar step=1, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor ravel(const Tensor & self); // {"schema": "aten::ravel(Tensor(a) self) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor reciprocal(const Tensor & self); // {"schema": "aten::reciprocal(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & reciprocal_(Tensor & self); // {"schema": "aten::reciprocal_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & reciprocal_out(const Tensor & self, Tensor & out); // {"schema": "aten::reciprocal.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor neg(const Tensor & self); // {"schema": "aten::neg(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & neg_(Tensor & self); // {"schema": "aten::neg_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & neg_out(const Tensor & self, Tensor & out); // {"schema": "aten::neg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor negative(const Tensor & self); // {"schema": "aten::negative(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & negative_(Tensor & self); // {"schema": "aten::negative_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & negative_out(const Tensor & self, Tensor & out); // {"schema": "aten::negative.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor repeat(const Tensor & self, c10::SymIntArrayRef repeats); // {"schema": "aten::repeat(Tensor self, SymInt[] repeats) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor repeat_interleave(const Tensor & repeats, c10::optional<c10::SymInt> output_size); // {"schema": "aten::repeat_interleave.Tensor(Tensor repeats, *, SymInt? output_size=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor repeat_interleave(const Tensor & self, const Tensor & repeats, c10::optional<int64_t> dim, c10::optional<c10::SymInt> output_size); // {"schema": "aten::repeat_interleave.self_Tensor(Tensor self, Tensor repeats, int? dim=None, *, SymInt? output_size=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor repeat_interleave(const Tensor & self, c10::SymInt repeats, c10::optional<int64_t> dim, c10::optional<c10::SymInt> output_size); // {"schema": "aten::repeat_interleave.self_int(Tensor self, SymInt repeats, int? dim=None, *, SymInt? output_size=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor reshape(const Tensor & self, c10::SymIntArrayRef shape); // {"schema": "aten::reshape(Tensor(a) self, SymInt[] shape) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor _reshape_copy(const Tensor & self, c10::SymIntArrayRef size); // {"schema": "aten::_reshape_copy(Tensor self, SymInt[] size) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _reshape_alias(const Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride); // {"schema": "aten::_reshape_alias(Tensor(a) self, SymInt[] size, SymInt[] stride) -> Tensor(a)", "dispatch": "True", "default": "False"}
+Tensor _mkldnn_reshape(const Tensor & self, IntArrayRef shape); // {"schema": "aten::_mkldnn_reshape(Tensor self, int[] shape) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor reshape_as(const Tensor & self, const Tensor & other); // {"schema": "aten::reshape_as(Tensor(a) self, Tensor other) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor round(const Tensor & self); // {"schema": "aten::round(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & round_(Tensor & self); // {"schema": "aten::round_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & round_out(const Tensor & self, Tensor & out); // {"schema": "aten::round.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor round(const Tensor & self, int64_t decimals); // {"schema": "aten::round.decimals(Tensor self, *, int decimals) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & round_(Tensor & self, int64_t decimals); // {"schema": "aten::round_.decimals(Tensor(a!) self, *, int decimals) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & round_out(const Tensor & self, int64_t decimals, Tensor & out); // {"schema": "aten::round.decimals_out(Tensor self, *, int decimals, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor rrelu(const Tensor & self, const Scalar & lower, const Scalar & upper, bool training, c10::optional<Generator> generator); // {"schema": "aten::rrelu(Tensor self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & rrelu_(Tensor & self, const Scalar & lower, const Scalar & upper, bool training, c10::optional<Generator> generator); // {"schema": "aten::rrelu_(Tensor(a!) self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor relu(const Tensor & self); // {"schema": "aten::relu(Tensor self) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & relu_(Tensor & self); // {"schema": "aten::relu_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor relu6(const Tensor & self); // {"schema": "aten::relu6(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & relu6_(Tensor & self); // {"schema": "aten::relu6_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor prelu(const Tensor & self, const Tensor & weight); // {"schema": "aten::prelu(Tensor self, Tensor weight) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _prelu_kernel(const Tensor & self, const Tensor & weight); // {"schema": "aten::_prelu_kernel(Tensor self, Tensor weight) -> Tensor", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> _prelu_kernel_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight); // {"schema": "aten::_prelu_kernel_backward(Tensor grad_output, Tensor self, Tensor weight) -> (Tensor, Tensor)", "dispatch": "True", "default": "False"}
+Tensor & gelu_out(const Tensor & self, c10::string_view approximate, Tensor & out); // {"schema": "aten::gelu.out(Tensor self, *, str approximate='none', Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & gelu_(Tensor & self, c10::string_view approximate); // {"schema": "aten::gelu_(Tensor(a!) self, *, str approximate='none') -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor gelu(const Tensor & self, c10::string_view approximate); // {"schema": "aten::gelu(Tensor self, *, str approximate='none') -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & gelu_backward_out(const Tensor & grad_output, const Tensor & self, c10::string_view approximate, Tensor & grad_input); // {"schema": "aten::gelu_backward.grad_input(Tensor grad_output, Tensor self, *, str approximate='none', Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor gelu_backward(const Tensor & grad_output, const Tensor & self, c10::string_view approximate); // {"schema": "aten::gelu_backward(Tensor grad_output, Tensor self, *, str approximate='none') -> Tensor", "dispatch": "True", "default": "True"}
+Tensor infinitely_differentiable_gelu_backward(const Tensor & grad, const Tensor & self); // {"schema": "aten::infinitely_differentiable_gelu_backward(Tensor grad, Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & hardshrink_out(const Tensor & self, const Scalar & lambd, Tensor & out); // {"schema": "aten::hardshrink.out(Tensor self, Scalar lambd=0.5, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor hardshrink(const Tensor & self, const Scalar & lambd); // {"schema": "aten::hardshrink(Tensor self, Scalar lambd=0.5) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & hardshrink_backward_out(const Tensor & grad_out, const Tensor & self, const Scalar & lambd, Tensor & grad_input); // {"schema": "aten::hardshrink_backward.grad_input(Tensor grad_out, Tensor self, Scalar lambd, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor hardshrink_backward(const Tensor & grad_out, const Tensor & self, const Scalar & lambd); // {"schema": "aten::hardshrink_backward(Tensor grad_out, Tensor self, Scalar lambd) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor rsqrt(const Tensor & self); // {"schema": "aten::rsqrt(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & rsqrt_(Tensor & self); // {"schema": "aten::rsqrt_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & rsqrt_out(const Tensor & self, Tensor & out); // {"schema": "aten::rsqrt.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor select(const Tensor & self, Dimname dim, int64_t index); // {"schema": "aten::select.Dimname(Tensor(a) self, Dimname dim, int index) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor select(const Tensor & self, int64_t dim, c10::SymInt index); // {"schema": "aten::select.int(Tensor(a) self, int dim, SymInt index) -> Tensor(a)", "dispatch": "True", "default": "True"}
+Tensor select_backward(const Tensor & grad_output, c10::SymIntArrayRef input_sizes, int64_t dim, c10::SymInt index); // {"schema": "aten::select_backward(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt index) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _nested_select_backward(const Tensor & grad_output, const Tensor & self, int64_t dim, c10::SymInt index); // {"schema": "aten::_nested_select_backward(Tensor grad_output, Tensor self, int dim, SymInt index) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor selu(const Tensor & self); // {"schema": "aten::selu(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & selu_(Tensor & self); // {"schema": "aten::selu_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor celu(const Tensor & self, const Scalar & alpha); // {"schema": "aten::celu(Tensor self, Scalar alpha=1.0) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & celu_(Tensor & self, const Scalar & alpha); // {"schema": "aten::celu_(Tensor(a!) self, Scalar alpha=1.0) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor silu(const Tensor & self); // {"schema": "aten::silu(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & silu_(Tensor & self); // {"schema": "aten::silu_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & silu_out(const Tensor & self, Tensor & out); // {"schema": "aten::silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & silu_backward_out(const Tensor & grad_output, const Tensor & self, Tensor & grad_input); // {"schema": "aten::silu_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor silu_backward(const Tensor & grad_output, const Tensor & self); // {"schema": "aten::silu_backward(Tensor grad_output, Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor mish(const Tensor & self); // {"schema": "aten::mish(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & mish_(Tensor & self); // {"schema": "aten::mish_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & mish_out(const Tensor & self, Tensor & out); // {"schema": "aten::mish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor mish_backward(const Tensor & grad_output, const Tensor & self); // {"schema": "aten::mish_backward(Tensor grad_output, Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor sigmoid(const Tensor & self); // {"schema": "aten::sigmoid(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & sigmoid_(Tensor & self); // {"schema": "aten::sigmoid_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & sigmoid_out(const Tensor & self, Tensor & out); // {"schema": "aten::sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor logit(const Tensor & self, c10::optional<double> eps); // {"schema": "aten::logit(Tensor self, float? eps=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & logit_(Tensor & self, c10::optional<double> eps); // {"schema": "aten::logit_(Tensor(a!) self, float? eps=None) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & logit_out(const Tensor & self, c10::optional<double> eps, Tensor & out); // {"schema": "aten::logit.out(Tensor self, float? eps=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor sin(const Tensor & self); // {"schema": "aten::sin(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & sin_(Tensor & self); // {"schema": "aten::sin_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & sin_out(const Tensor & self, Tensor & out); // {"schema": "aten::sin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor sinc(const Tensor & self); // {"schema": "aten::sinc(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & sinc_(Tensor & self); // {"schema": "aten::sinc_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & sinc_out(const Tensor & self, Tensor & out); // {"schema": "aten::sinc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor sinh(const Tensor & self); // {"schema": "aten::sinh(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & sinh_(Tensor & self); // {"schema": "aten::sinh_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & sinh_out(const Tensor & self, Tensor & out); // {"schema": "aten::sinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor detach(const Tensor & self); // {"schema": "aten::detach(Tensor(a) self) -> Tensor(a)", "dispatch": "True", "default": "True"}
+Tensor & detach_(Tensor & self); // {"schema": "aten::detach_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+int64_t size(const Tensor & self, int64_t dim); // {"schema": "aten::size.int(Tensor self, int dim) -> int", "dispatch": "False", "default": "True"}
+int64_t size(const Tensor & self, Dimname dim); // {"schema": "aten::size.Dimname(Tensor self, Dimname dim) -> int", "dispatch": "False", "default": "True"}
+c10::SymInt sym_size(const Tensor & self, int64_t dim); // {"schema": "aten::sym_size.int(Tensor self, int dim) -> SymInt", "dispatch": "False", "default": "True"}
+c10::SymInt sym_numel(const Tensor & self); // {"schema": "aten::sym_numel(Tensor self) -> SymInt", "dispatch": "False", "default": "True"}
+c10::SymInt sym_storage_offset(const Tensor & self); // {"schema": "aten::sym_storage_offset(Tensor self) -> SymInt", "dispatch": "False", "default": "True"}
+Tensor slice(const Tensor & self, int64_t dim, c10::optional<c10::SymInt> start, c10::optional<c10::SymInt> end, c10::SymInt step); // {"schema": "aten::slice.Tensor(Tensor(a) self, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a)", "dispatch": "True", "default": "True"}
+Tensor slice_backward(const Tensor & grad_output, c10::SymIntArrayRef input_sizes, int64_t dim, c10::SymInt start, c10::SymInt end, c10::SymInt step); // {"schema": "aten::slice_backward(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt start, SymInt end, SymInt step) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor slice_inverse(const Tensor & self, const Tensor & src, int64_t dim, c10::optional<c10::SymInt> start, c10::optional<c10::SymInt> end, c10::SymInt step); // {"schema": "aten::slice_inverse(Tensor(a) self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a)", "dispatch": "True", "default": "True"}
+Tensor slice_scatter(const Tensor & self, const Tensor & src, int64_t dim, c10::optional<c10::SymInt> start, c10::optional<c10::SymInt> end, c10::SymInt step); // {"schema": "aten::slice_scatter(Tensor self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor select_scatter(const Tensor & self, const Tensor & src, int64_t dim, c10::SymInt index); // {"schema": "aten::select_scatter(Tensor self, Tensor src, int dim, SymInt index) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor diagonal_scatter(const Tensor & self, const Tensor & src, int64_t offset, int64_t dim1, int64_t dim2); // {"schema": "aten::diagonal_scatter(Tensor self, Tensor src, int offset=0, int dim1=0, int dim2=1) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor as_strided_scatter(const Tensor & self, const Tensor & src, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional<c10::SymInt> storage_offset); // {"schema": "aten::as_strided_scatter(Tensor self, Tensor src, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor smm(const Tensor & self, const Tensor & mat2); // {"schema": "aten::smm(Tensor self, Tensor mat2) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor softmax(const Tensor & self, int64_t dim, c10::optional<ScalarType> dtype); // {"schema": "aten::softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & softmax_out(const Tensor & self, int64_t dim, c10::optional<ScalarType> dtype, Tensor & out); // {"schema": "aten::softmax.int_out(Tensor self, int dim, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor softmax(const Tensor & self, Dimname dim, c10::optional<ScalarType> dtype); // {"schema": "aten::softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _softmax(const Tensor & self, int64_t dim, bool half_to_float); // {"schema": "aten::_softmax(Tensor self, int dim, bool half_to_float) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & _softmax_out(const Tensor & self, int64_t dim, bool half_to_float, Tensor & out); // {"schema": "aten::_softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor _softmax_backward_data(const Tensor & grad_output, const Tensor & output, int64_t dim, ScalarType input_dtype); // {"schema": "aten::_softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & _softmax_backward_data_out(const Tensor & grad_output, const Tensor & output, int64_t dim, ScalarType input_dtype, Tensor & grad_input); // {"schema": "aten::_softmax_backward_data.out(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> unsafe_split(const Tensor & self, c10::SymInt split_size, int64_t dim); // {"schema": "aten::unsafe_split.Tensor(Tensor self, SymInt split_size, int dim=0) -> Tensor[]", "dispatch": "True", "default": "True"}
+::std::vector<Tensor> split(const Tensor & self, c10::SymInt split_size, int64_t dim); // {"schema": "aten::split.Tensor(Tensor(a -> *) self, SymInt split_size, int dim=0) -> Tensor(a)[]", "dispatch": "True", "default": "True"}
+::std::vector<Tensor> split(const Tensor & self, c10::SymIntArrayRef split_size, int64_t dim); // {"schema": "aten::split.sizes(Tensor(a -> *) self, SymInt[] split_size, int dim=0) -> Tensor(a)[]", "dispatch": "False", "default": "True"}
+::std::vector<Tensor> unsafe_split_with_sizes(const Tensor & self, c10::SymIntArrayRef split_sizes, int64_t dim); // {"schema": "aten::unsafe_split_with_sizes(Tensor self, SymInt[] split_sizes, int dim=0) -> Tensor[]", "dispatch": "True", "default": "True"}
+::std::vector<Tensor> split_with_sizes(const Tensor & self, c10::SymIntArrayRef split_sizes, int64_t dim); // {"schema": "aten::split_with_sizes(Tensor(a -> *) self, SymInt[] split_sizes, int dim=0) -> Tensor(a)[]", "dispatch": "True", "default": "True"}
+::std::vector<Tensor> hsplit(const Tensor & self, int64_t sections); // {"schema": "aten::hsplit.int(Tensor(a -> *) self, int sections) -> Tensor(a)[]", "dispatch": "False", "default": "True"}
+::std::vector<Tensor> hsplit(const Tensor & self, IntArrayRef indices); // {"schema": "aten::hsplit.array(Tensor(a -> *) self, int[] indices) -> Tensor(a)[]", "dispatch": "False", "default": "True"}
+::std::vector<Tensor> vsplit(const Tensor & self, int64_t sections); // {"schema": "aten::vsplit.int(Tensor(a -> *) self, int sections) -> Tensor(a)[]", "dispatch": "False", "default": "True"}
+::std::vector<Tensor> vsplit(const Tensor & self, IntArrayRef indices); // {"schema": "aten::vsplit.array(Tensor(a -> *) self, int[] indices) -> Tensor(a)[]", "dispatch": "False", "default": "True"}
+::std::vector<Tensor> dsplit(const Tensor & self, int64_t sections); // {"schema": "aten::dsplit.int(Tensor(a -> *) self, int sections) -> Tensor(a)[]", "dispatch": "False", "default": "True"}
+::std::vector<Tensor> dsplit(const Tensor & self, IntArrayRef indices); // {"schema": "aten::dsplit.array(Tensor(a -> *) self, int[] indices) -> Tensor(a)[]", "dispatch": "False", "default": "True"}
+Tensor squeeze(const Tensor & self); // {"schema": "aten::squeeze(Tensor(a) self) -> Tensor(a)", "dispatch": "True", "default": "True"}
+Tensor squeeze(const Tensor & self, int64_t dim); // {"schema": "aten::squeeze.dim(Tensor(a) self, int dim) -> Tensor(a)", "dispatch": "True", "default": "True"}
+Tensor squeeze(const Tensor & self, Dimname dim); // {"schema": "aten::squeeze.dimname(Tensor(a) self, Dimname dim) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor squeeze(const Tensor & self, IntArrayRef dim); // {"schema": "aten::squeeze.dims(Tensor(a) self, int[] dim) -> Tensor(a)", "dispatch": "True", "default": "True"}
+Tensor & squeeze_(Tensor & self); // {"schema": "aten::squeeze_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & squeeze_(Tensor & self, int64_t dim); // {"schema": "aten::squeeze_.dim(Tensor(a!) self, int dim) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & squeeze_(Tensor & self, IntArrayRef dim); // {"schema": "aten::squeeze_.dims(Tensor(a!) self, int[] dim) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & squeeze_(Tensor & self, Dimname dim); // {"schema": "aten::squeeze_.dimname(Tensor(a!) self, Dimname dim) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor sspaddmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, const Scalar & beta, const Scalar & alpha); // {"schema": "aten::sspaddmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & sspaddmm_out(const Tensor & self, const Tensor & mat1, const Tensor & mat2, const Scalar & beta, const Scalar & alpha, Tensor & out); // {"schema": "aten::sspaddmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor _chunk_cat(TensorList tensors, int64_t dim, int64_t num_chunks); // {"schema": "aten::_chunk_cat(Tensor[] tensors, int dim, int num_chunks) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & _chunk_cat_out(TensorList tensors, int64_t dim, int64_t num_chunks, Tensor & out); // {"schema": "aten::_chunk_cat.out(Tensor[] tensors, int dim, int num_chunks, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor stack(TensorList tensors, int64_t dim); // {"schema": "aten::stack(Tensor[] tensors, int dim=0) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & stack_out(TensorList tensors, int64_t dim, Tensor & out); // {"schema": "aten::stack.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor _stack(TensorList tensors, int64_t dim); // {"schema": "aten::_stack(Tensor[] tensors, int dim=0) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & _stack_out(TensorList tensors, int64_t dim, Tensor & out); // {"schema": "aten::_stack.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor hstack(TensorList tensors); // {"schema": "aten::hstack(Tensor[] tensors) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & hstack_out(TensorList tensors, Tensor & out); // {"schema": "aten::hstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor vstack(TensorList tensors); // {"schema": "aten::vstack(Tensor[] tensors) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & vstack_out(TensorList tensors, Tensor & out); // {"schema": "aten::vstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor dstack(TensorList tensors); // {"schema": "aten::dstack(Tensor[] tensors) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & dstack_out(TensorList tensors, Tensor & out); // {"schema": "aten::dstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor stft(const Tensor & self, int64_t n_fft, c10::optional<int64_t> hop_length, c10::optional<int64_t> win_length, const c10::optional<Tensor> & window, bool normalized, c10::optional<bool> onesided, c10::optional<bool> return_complex); // {"schema": "aten::stft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor stft(const Tensor & self, int64_t n_fft, c10::optional<int64_t> hop_length, c10::optional<int64_t> win_length, const c10::optional<Tensor> & window, bool center, c10::string_view pad_mode, bool normalized, c10::optional<bool> onesided, c10::optional<bool> return_complex); // {"schema": "aten::stft.center(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, str pad_mode=\"reflect\", bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor istft(const Tensor & self, int64_t n_fft, c10::optional<int64_t> hop_length, c10::optional<int64_t> win_length, const c10::optional<Tensor> & window, bool center, bool normalized, c10::optional<bool> onesided, c10::optional<int64_t> length, bool return_complex); // {"schema": "aten::istft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, bool normalized=False, bool? onesided=None, int? length=None, bool return_complex=False) -> Tensor", "dispatch": "False", "default": "True"}
+int64_t stride(const Tensor & self, int64_t dim); // {"schema": "aten::stride.int(Tensor self, int dim) -> int", "dispatch": "False", "default": "True"}
+int64_t stride(const Tensor & self, Dimname dim); // {"schema": "aten::stride.Dimname(Tensor self, Dimname dim) -> int", "dispatch": "False", "default": "True"}
+c10::SymInt sym_stride(const Tensor & self, int64_t dim); // {"schema": "aten::sym_stride.int(Tensor self, int dim) -> SymInt", "dispatch": "False", "default": "True"}
+Tensor sum(const Tensor & self, c10::optional<ScalarType> dtype); // {"schema": "aten::sum(Tensor self, *, ScalarType? dtype=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor sum(const Tensor & self, OptionalIntArrayRef dim, bool keepdim, c10::optional<ScalarType> dtype); // {"schema": "aten::sum.dim_IntList(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor sum(const Tensor & self, DimnameList dim, bool keepdim, c10::optional<ScalarType> dtype); // {"schema": "aten::sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & sum_out(const Tensor & self, OptionalIntArrayRef dim, bool keepdim, c10::optional<ScalarType> dtype, Tensor & out); // {"schema": "aten::sum.IntList_out(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & sum_out(const Tensor & self, DimnameList dim, bool keepdim, c10::optional<ScalarType> dtype, Tensor & out); // {"schema": "aten::sum.DimnameList_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor _nested_sum_backward(const Tensor & grad, const Tensor & self, OptionalIntArrayRef dim, bool keepdim); // {"schema": "aten::_nested_sum_backward(Tensor grad, Tensor self, int[1]? dim, bool keepdim=False) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor nansum(const Tensor & self, OptionalIntArrayRef dim, bool keepdim, c10::optional<ScalarType> dtype); // {"schema": "aten::nansum(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & nansum_out(const Tensor & self, OptionalIntArrayRef dim, bool keepdim, c10::optional<ScalarType> dtype, Tensor & out); // {"schema": "aten::nansum.out(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor sum_to_size(const Tensor & self, c10::SymIntArrayRef size); // {"schema": "aten::sum_to_size(Tensor self, SymInt[] size) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor sqrt(const Tensor & self); // {"schema": "aten::sqrt(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & sqrt_(Tensor & self); // {"schema": "aten::sqrt_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & sqrt_out(const Tensor & self, Tensor & out); // {"schema": "aten::sqrt.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor square(const Tensor & self); // {"schema": "aten::square(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & square_(Tensor & self); // {"schema": "aten::square_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & square_out(const Tensor & self, Tensor & out); // {"schema": "aten::square.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor std(const Tensor & self, bool unbiased); // {"schema": "aten::std(Tensor self, bool unbiased=True) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor std(const Tensor & self, OptionalIntArrayRef dim, bool unbiased, bool keepdim); // {"schema": "aten::std.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor std(const Tensor & self, OptionalIntArrayRef dim, const c10::optional<Scalar> & correction, bool keepdim); // {"schema": "aten::std.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> std_mean(const Tensor & self, bool unbiased); // {"schema": "aten::std_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> std_mean(const Tensor & self, OptionalIntArrayRef dim, bool unbiased, bool keepdim); // {"schema": "aten::std_mean.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> std_mean(const Tensor & self, OptionalIntArrayRef dim, const c10::optional<Scalar> & correction, bool keepdim); // {"schema": "aten::std_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> std_mean(const Tensor & self, DimnameList dim, bool unbiased, bool keepdim); // {"schema": "aten::std_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> std_mean(const Tensor & self, DimnameList dim, const c10::optional<Scalar> & correction, bool keepdim); // {"schema": "aten::std_mean.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)", "dispatch": "False", "default": "True"}
+Tensor & std_out(const Tensor & self, OptionalIntArrayRef dim, bool unbiased, bool keepdim, Tensor & out); // {"schema": "aten::std.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & std_out(const Tensor & self, OptionalIntArrayRef dim, const c10::optional<Scalar> & correction, bool keepdim, Tensor & out); // {"schema": "aten::std.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor std(const Tensor & self, DimnameList dim, bool unbiased, bool keepdim); // {"schema": "aten::std.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & std_out(const Tensor & self, DimnameList dim, bool unbiased, bool keepdim, Tensor & out); // {"schema": "aten::std.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor std(const Tensor & self, DimnameList dim, const c10::optional<Scalar> & correction, bool keepdim); // {"schema": "aten::std.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & std_out(const Tensor & self, DimnameList dim, const c10::optional<Scalar> & correction, bool keepdim, Tensor & out); // {"schema": "aten::std.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor prod(const Tensor & self, c10::optional<ScalarType> dtype); // {"schema": "aten::prod(Tensor self, *, ScalarType? dtype=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor prod(const Tensor & self, int64_t dim, bool keepdim, c10::optional<ScalarType> dtype); // {"schema": "aten::prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & prod_out(const Tensor & self, int64_t dim, bool keepdim, c10::optional<ScalarType> dtype, Tensor & out); // {"schema": "aten::prod.int_out(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor prod(const Tensor & self, Dimname dim, bool keepdim, c10::optional<ScalarType> dtype); // {"schema": "aten::prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & prod_out(const Tensor & self, Dimname dim, bool keepdim, c10::optional<ScalarType> dtype, Tensor & out); // {"schema": "aten::prod.Dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor t(const Tensor & self); // {"schema": "aten::t(Tensor(a) self) -> Tensor(a)", "dispatch": "True", "default": "True"}
+Tensor & t_(Tensor & self); // {"schema": "aten::t_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor tan(const Tensor & self); // {"schema": "aten::tan(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & tan_(Tensor & self); // {"schema": "aten::tan_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & tan_out(const Tensor & self, Tensor & out); // {"schema": "aten::tan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor tanh(const Tensor & self); // {"schema": "aten::tanh(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & tanh_(Tensor & self); // {"schema": "aten::tanh_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & tanh_out(const Tensor & self, Tensor & out); // {"schema": "aten::tanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor tensordot(const Tensor & self, const Tensor & other, IntArrayRef dims_self, IntArrayRef dims_other); // {"schema": "aten::tensordot(Tensor self, Tensor other, int[] dims_self, int[] dims_other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & tensordot_out(const Tensor & self, const Tensor & other, IntArrayRef dims_self, IntArrayRef dims_other, Tensor & out); // {"schema": "aten::tensordot.out(Tensor self, Tensor other, int[] dims_self, int[] dims_other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor threshold(const Tensor & self, const Scalar & threshold, const Scalar & value); // {"schema": "aten::threshold(Tensor self, Scalar threshold, Scalar value) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & threshold_(Tensor & self, const Scalar & threshold, const Scalar & value); // {"schema": "aten::threshold_(Tensor(a!) self, Scalar threshold, Scalar value) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & threshold_out(const Tensor & self, const Scalar & threshold, const Scalar & value, Tensor & out); // {"schema": "aten::threshold.out(Tensor self, Scalar threshold, Scalar value, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & threshold_backward_out(const Tensor & grad_output, const Tensor & self, const Scalar & threshold, Tensor & grad_input); // {"schema": "aten::threshold_backward.grad_input(Tensor grad_output, Tensor self, Scalar threshold, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor threshold_backward(const Tensor & grad_output, const Tensor & self, const Scalar & threshold); // {"schema": "aten::threshold_backward(Tensor grad_output, Tensor self, Scalar threshold) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor tile(const Tensor & self, c10::SymIntArrayRef dims); // {"schema": "aten::tile(Tensor self, SymInt[] dims) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor transpose(const Tensor & self, int64_t dim0, int64_t dim1); // {"schema": "aten::transpose.int(Tensor(a) self, int dim0, int dim1) -> Tensor(a)", "dispatch": "True", "default": "True"}
+Tensor transpose(const Tensor & self, Dimname dim0, Dimname dim1); // {"schema": "aten::transpose.Dimname(Tensor(a) self, Dimname dim0, Dimname dim1) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor _mkldnn_transpose(const Tensor & self, int64_t dim0, int64_t dim1); // {"schema": "aten::_mkldnn_transpose(Tensor self, int dim0, int dim1) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & transpose_(Tensor & self, int64_t dim0, int64_t dim1); // {"schema": "aten::transpose_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _mkldnn_transpose_(Tensor & self, int64_t dim0, int64_t dim1); // {"schema": "aten::_mkldnn_transpose_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor one_hot(const Tensor & self, int64_t num_classes); // {"schema": "aten::one_hot(Tensor self, int num_classes=-1) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor flip(const Tensor & self, IntArrayRef dims); // {"schema": "aten::flip(Tensor self, int[] dims) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor fliplr(const Tensor & self); // {"schema": "aten::fliplr(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor flipud(const Tensor & self); // {"schema": "aten::flipud(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor roll(const Tensor & self, c10::SymIntArrayRef shifts, IntArrayRef dims); // {"schema": "aten::roll(Tensor self, SymInt[1] shifts, int[1] dims=[]) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor rot90(const Tensor & self, int64_t k, IntArrayRef dims); // {"schema": "aten::rot90(Tensor self, int k=1, int[] dims=[0,1]) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor trapezoid(const Tensor & y, const Tensor & x, int64_t dim); // {"schema": "aten::trapezoid.x(Tensor y, Tensor x, *, int dim=-1) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor trapezoid(const Tensor & y, const Scalar & dx, int64_t dim); // {"schema": "aten::trapezoid.dx(Tensor y, *, Scalar dx=1, int dim=-1) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor trapz(const Tensor & y, const Tensor & x, int64_t dim); // {"schema": "aten::trapz.x(Tensor y, Tensor x, *, int dim=-1) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor trapz(const Tensor & y, double dx, int64_t dim); // {"schema": "aten::trapz.dx(Tensor y, *, float dx=1, int dim=-1) -> Tensor", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor,Tensor> _transform_bias_rescale_qkv(const Tensor & qkv, const Tensor & qkv_bias, int64_t num_heads); // {"schema": "aten::_transform_bias_rescale_qkv(Tensor qkv, Tensor qkv_bias, int num_heads) -> (Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+Tensor _nested_tensor_from_mask(const Tensor & t, const Tensor & mask, bool mask_check); // {"schema": "aten::_nested_tensor_from_mask(Tensor t, Tensor mask, bool mask_check=True) -> Tensor", "dispatch": "True", "default": "False"}
+bool _nested_tensor_from_mask_left_aligned(const Tensor & t, const Tensor & mask); // {"schema": "aten::_nested_tensor_from_mask_left_aligned(Tensor t, Tensor mask) -> bool", "dispatch": "True", "default": "False"}
+Tensor _nested_from_padded(const Tensor & padded, const Tensor & cpu_nested_shape_example, bool fuse_transform_0213); // {"schema": "aten::_nested_from_padded(Tensor padded, Tensor cpu_nested_shape_example, bool fuse_transform_0213=False) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _nested_tensor_size(const Tensor & self); // {"schema": "aten::_nested_tensor_size(Tensor self) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _nested_tensor_strides(const Tensor & self); // {"schema": "aten::_nested_tensor_strides(Tensor self) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _nested_tensor_storage_offsets(const Tensor & self); // {"schema": "aten::_nested_tensor_storage_offsets(Tensor self) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _nested_from_padded_and_nested_example(const Tensor & padded, const Tensor & nt_example); // {"schema": "aten::_nested_from_padded_and_nested_example(Tensor padded, Tensor nt_example) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _nested_view_from_buffer(const Tensor & self, const Tensor & nested_size, const Tensor & nested_strides, const Tensor & offsets); // {"schema": "aten::_nested_view_from_buffer(Tensor(a) self, Tensor nested_size, Tensor nested_strides, Tensor offsets) -> Tensor(a)", "dispatch": "True", "default": "False"}
+Tensor _nested_view_from_buffer_copy(const Tensor & self, const Tensor & nested_size, const Tensor & nested_strides, const Tensor & offsets); // {"schema": "aten::_nested_view_from_buffer_copy(Tensor self, Tensor nested_size, Tensor nested_strides, Tensor offsets) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _nested_view_from_jagged(const Tensor & self, const Tensor & offsets, const Tensor & dummy, const c10::optional<Tensor> & lengths, int64_t ragged_idx); // {"schema": "aten::_nested_view_from_jagged(Tensor(a) self, Tensor offsets, Tensor dummy, Tensor? lengths=None, int ragged_idx=1) -> Tensor(a)", "dispatch": "True", "default": "False"}
+Tensor _nested_view_from_jagged_copy(const Tensor & self, const Tensor & offsets, const Tensor & dummy, const c10::optional<Tensor> & lengths, int64_t ragged_idx); // {"schema": "aten::_nested_view_from_jagged_copy(Tensor self, Tensor offsets, Tensor dummy, Tensor? lengths=None, int ragged_idx=1) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _nested_get_values(const Tensor & self); // {"schema": "aten::_nested_get_values(Tensor(a) self) -> Tensor(a)", "dispatch": "True", "default": "False"}
+Tensor _nested_get_values_copy(const Tensor & self); // {"schema": "aten::_nested_get_values_copy(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _nested_get_offsets(const Tensor & self); // {"schema": "aten::_nested_get_offsets(Tensor self) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _nested_get_lengths(const Tensor & self); // {"schema": "aten::_nested_get_lengths(Tensor self) -> Tensor", "dispatch": "True", "default": "False"}
+int64_t _nested_get_ragged_idx(const Tensor & self); // {"schema": "aten::_nested_get_ragged_idx(Tensor self) -> int", "dispatch": "True", "default": "False"}
+Tensor _nested_get_jagged_dummy(const Tensor & any); // {"schema": "aten::_nested_get_jagged_dummy(Tensor any) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _trilinear(const Tensor & i1, const Tensor & i2, const Tensor & i3, IntArrayRef expand1, IntArrayRef expand2, IntArrayRef expand3, IntArrayRef sumdim, int64_t unroll_dim); // {"schema": "aten::_trilinear(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor triplet_margin_loss(const Tensor & anchor, const Tensor & positive, const Tensor & negative, double margin, double p, double eps, bool swap, int64_t reduction); // {"schema": "aten::triplet_margin_loss(Tensor anchor, Tensor positive, Tensor negative, float margin=1.0, float p=2, float eps=1e-06, bool swap=False, int reduction=Mean) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor trunc(const Tensor & self); // {"schema": "aten::trunc(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & trunc_(Tensor & self); // {"schema": "aten::trunc_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & trunc_out(const Tensor & self, Tensor & out); // {"schema": "aten::trunc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor fix(const Tensor & self); // {"schema": "aten::fix(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & fix_(Tensor & self); // {"schema": "aten::fix_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & fix_out(const Tensor & self, Tensor & out); // {"schema": "aten::fix.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor type_as(const Tensor & self, const Tensor & other); // {"schema": "aten::type_as(Tensor self, Tensor other) -> Tensor", "dispatch": "False", "default": "True"}
+bool _has_compatible_shallow_copy_type(const Tensor & self, const Tensor & from); // {"schema": "aten::_has_compatible_shallow_copy_type(Tensor self, Tensor from) -> bool", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> _unique(const Tensor & self, bool sorted, bool return_inverse); // {"schema": "aten::_unique(Tensor self, bool sorted=True, bool return_inverse=False) -> (Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor,Tensor> unique_dim(const Tensor & self, int64_t dim, bool sorted, bool return_inverse, bool return_counts); // {"schema": "aten::unique_dim(Tensor self, int dim, bool sorted=True, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor,Tensor> unique_consecutive(const Tensor & self, bool return_inverse, bool return_counts, c10::optional<int64_t> dim); // {"schema": "aten::unique_consecutive(Tensor self, bool return_inverse=False, bool return_counts=False, int? dim=None) -> (Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor,Tensor> unique_dim_consecutive(const Tensor & self, int64_t dim, bool return_inverse, bool return_counts); // {"schema": "aten::unique_dim_consecutive(Tensor self, int dim, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor,Tensor> _unique2(const Tensor & self, bool sorted, bool return_inverse, bool return_counts); // {"schema": "aten::_unique2(Tensor self, bool sorted=True, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+Tensor _unsafe_view(const Tensor & self, c10::SymIntArrayRef size); // {"schema": "aten::_unsafe_view(Tensor self, SymInt[] size) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor unsqueeze(const Tensor & self, int64_t dim); // {"schema": "aten::unsqueeze(Tensor(a) self, int dim) -> Tensor(a)", "dispatch": "True", "default": "True"}
+Tensor & unsqueeze_(Tensor & self, int64_t dim); // {"schema": "aten::unsqueeze_(Tensor(a!) self, int dim) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor vander(const Tensor & x, c10::optional<int64_t> N, bool increasing); // {"schema": "aten::vander(Tensor x, int? N=None, bool increasing=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor var(const Tensor & self, bool unbiased); // {"schema": "aten::var(Tensor self, bool unbiased=True) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor var(const Tensor & self, OptionalIntArrayRef dim, bool unbiased, bool keepdim); // {"schema": "aten::var.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor var(const Tensor & self, OptionalIntArrayRef dim, const c10::optional<Scalar> & correction, bool keepdim); // {"schema": "aten::var.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & var_out(const Tensor & self, OptionalIntArrayRef dim, bool unbiased, bool keepdim, Tensor & out); // {"schema": "aten::var.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & var_out(const Tensor & self, OptionalIntArrayRef dim, const c10::optional<Scalar> & correction, bool keepdim, Tensor & out); // {"schema": "aten::var.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor var(const Tensor & self, DimnameList dim, bool unbiased, bool keepdim); // {"schema": "aten::var.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & var_out(const Tensor & self, DimnameList dim, bool unbiased, bool keepdim, Tensor & out); // {"schema": "aten::var.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor var(const Tensor & self, DimnameList dim, const c10::optional<Scalar> & correction, bool keepdim); // {"schema": "aten::var.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & var_out(const Tensor & self, DimnameList dim, const c10::optional<Scalar> & correction, bool keepdim, Tensor & out); // {"schema": "aten::var.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> var_mean(const Tensor & self, bool unbiased); // {"schema": "aten::var_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> var_mean(const Tensor & self, OptionalIntArrayRef dim, bool unbiased, bool keepdim); // {"schema": "aten::var_mean.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> var_mean(const Tensor & self, OptionalIntArrayRef dim, const c10::optional<Scalar> & correction, bool keepdim); // {"schema": "aten::var_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> var_mean(const Tensor & self, DimnameList dim, bool unbiased, bool keepdim); // {"schema": "aten::var_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> var_mean(const Tensor & self, DimnameList dim, const c10::optional<Scalar> & correction, bool keepdim); // {"schema": "aten::var_mean.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)", "dispatch": "False", "default": "True"}
+Tensor view_as(const Tensor & self, const Tensor & other); // {"schema": "aten::view_as(Tensor(a) self, Tensor other) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor where(const Tensor & condition, const Tensor & self, const Tensor & other); // {"schema": "aten::where.self(Tensor condition, Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & where_out(const Tensor & condition, const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::where.self_out(Tensor condition, Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor where(const Tensor & condition, const Scalar & self, const Tensor & other); // {"schema": "aten::where.ScalarSelf(Tensor condition, Scalar self, Tensor other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor where(const Tensor & condition, const Tensor & self, const Scalar & other); // {"schema": "aten::where.ScalarOther(Tensor condition, Tensor self, Scalar other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor where(const Tensor & condition, const Scalar & self, const Scalar & other); // {"schema": "aten::where.Scalar(Tensor condition, Scalar self, Scalar other) -> Tensor", "dispatch": "False", "default": "True"}
+::std::vector<Tensor> where(const Tensor & condition); // {"schema": "aten::where(Tensor condition) -> Tensor[]", "dispatch": "False", "default": "True"}
+Tensor norm_except_dim(const Tensor & v, int64_t pow, int64_t dim); // {"schema": "aten::norm_except_dim(Tensor v, int pow=2, int dim=0) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _weight_norm(const Tensor & v, const Tensor & g, int64_t dim); // {"schema": "aten::_weight_norm(Tensor v, Tensor g, int dim=0) -> Tensor", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> _weight_norm_interface(const Tensor & v, const Tensor & g, int64_t dim); // {"schema": "aten::_weight_norm_interface(Tensor v, Tensor g, int dim=0) -> (Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> _weight_norm_interface_backward(const Tensor & grad_w, const Tensor & saved_v, const Tensor & saved_g, const Tensor & saved_norms, int64_t dim); // {"schema": "aten::_weight_norm_interface_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> _weight_norm_differentiable_backward(const Tensor & grad_w, const Tensor & saved_v, const Tensor & saved_g, const Tensor & saved_norms, int64_t dim); // {"schema": "aten::_weight_norm_differentiable_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor)", "dispatch": "False", "default": "True"}
+Tensor zeros(IntArrayRef size, c10::optional<DimnameList> names, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::zeros.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _efficientzerotensor(c10::SymIntArrayRef size, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::_efficientzerotensor(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor zeros(c10::SymIntArrayRef size, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::zeros(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & zeros_out(c10::SymIntArrayRef size, Tensor & out); // {"schema": "aten::zeros.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor zeros_like(const Tensor & self, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory, c10::optional<MemoryFormat> memory_format); // {"schema": "aten::zeros_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _standard_gamma_grad(const Tensor & self, const Tensor & output); // {"schema": "aten::_standard_gamma_grad(Tensor self, Tensor output) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _standard_gamma(const Tensor & self, c10::optional<Generator> generator); // {"schema": "aten::_standard_gamma(Tensor self, Generator? generator=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _dirichlet_grad(const Tensor & x, const Tensor & alpha, const Tensor & total); // {"schema": "aten::_dirichlet_grad(Tensor x, Tensor alpha, Tensor total) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _sample_dirichlet(const Tensor & self, c10::optional<Generator> generator); // {"schema": "aten::_sample_dirichlet(Tensor self, Generator? generator=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor poisson(const Tensor & self, c10::optional<Generator> generator); // {"schema": "aten::poisson(Tensor self, Generator? generator=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor binomial(const Tensor & count, const Tensor & prob, c10::optional<Generator> generator); // {"schema": "aten::binomial(Tensor count, Tensor prob, Generator? generator=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor native_norm(const Tensor & self, const Scalar & p); // {"schema": "aten::native_norm(Tensor self, Scalar p=2) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor native_norm(const Tensor & self, const c10::optional<Scalar> & p, IntArrayRef dim, bool keepdim, c10::optional<ScalarType> dtype); // {"schema": "aten::native_norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, ScalarType? dtype) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _sparse_sum(const Tensor & self); // {"schema": "aten::_sparse_sum(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _sparse_sum(const Tensor & self, ScalarType dtype); // {"schema": "aten::_sparse_sum.dtype(Tensor self, *, ScalarType dtype) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _sparse_sum(const Tensor & self, IntArrayRef dim); // {"schema": "aten::_sparse_sum.dim(Tensor self, int[1] dim) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _sparse_sum(const Tensor & self, IntArrayRef dim, ScalarType dtype); // {"schema": "aten::_sparse_sum.dim_dtype(Tensor self, int[1] dim, *, ScalarType dtype) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _sparse_sum_backward(const Tensor & grad, const Tensor & self, IntArrayRef dim); // {"schema": "aten::_sparse_sum_backward(Tensor grad, Tensor self, int[] dim) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _sparse_csr_sum(const Tensor & self, IntArrayRef dim, bool keepdim, c10::optional<ScalarType> dtype); // {"schema": "aten::_sparse_csr_sum.dim_dtype(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _sparse_csr_prod(const Tensor & self, IntArrayRef dim, bool keepdim, c10::optional<ScalarType> dtype); // {"schema": "aten::_sparse_csr_prod.dim_dtype(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _sparse_softmax(const Tensor & self, int64_t dim, c10::optional<ScalarType> dtype); // {"schema": "aten::_sparse_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _sparse_softmax(const Tensor & self, Dimname dim, c10::optional<ScalarType> dtype); // {"schema": "aten::_sparse_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _sparse_softmax(const Tensor & self, int64_t dim, bool half_to_float); // {"schema": "aten::_sparse_softmax(Tensor self, int dim, bool half_to_float) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _sparse_softmax_backward_data(const Tensor & grad_output, const Tensor & output, int64_t dim, const Tensor & self); // {"schema": "aten::_sparse_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _sparse_log_softmax(const Tensor & self, int64_t dim, c10::optional<ScalarType> dtype); // {"schema": "aten::_sparse_log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _sparse_log_softmax(const Tensor & self, Dimname dim, c10::optional<ScalarType> dtype); // {"schema": "aten::_sparse_log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _sparse_log_softmax(const Tensor & self, int64_t dim, bool half_to_float); // {"schema": "aten::_sparse_log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _sparse_log_softmax_backward_data(const Tensor & grad_output, const Tensor & output, int64_t dim, const Tensor & self); // {"schema": "aten::_sparse_log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _spdiags(const Tensor & diagonals, const Tensor & offsets, IntArrayRef shape, c10::optional<Layout> layout); // {"schema": "aten::_spdiags(Tensor diagonals, Tensor offsets, int[] shape, Layout? layout=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor norm(const Tensor & self, const c10::optional<Scalar> & p, ScalarType dtype); // {"schema": "aten::norm.ScalarOpt_dtype(Tensor self, Scalar? p, *, ScalarType dtype) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor norm(const Tensor & self, const Scalar & p); // {"schema": "aten::norm.Scalar(Tensor self, Scalar p=2) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor norm(const Tensor & self, const c10::optional<Scalar> & p, IntArrayRef dim, bool keepdim, ScalarType dtype); // {"schema": "aten::norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor norm(const Tensor & self, const c10::optional<Scalar> & p, IntArrayRef dim, bool keepdim); // {"schema": "aten::norm.ScalarOpt_dim(Tensor self, Scalar? p, int[1] dim, bool keepdim=False) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & norm_out(const Tensor & self, const c10::optional<Scalar> & p, IntArrayRef dim, bool keepdim, ScalarType dtype, Tensor & out); // {"schema": "aten::norm.dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & norm_out(const Tensor & self, const c10::optional<Scalar> & p, IntArrayRef dim, bool keepdim, Tensor & out); // {"schema": "aten::norm.out(Tensor self, Scalar? p, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor norm(const Tensor & self, const c10::optional<Scalar> & p, DimnameList dim, bool keepdim, ScalarType dtype); // {"schema": "aten::norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor norm(const Tensor & self, const c10::optional<Scalar> & p, DimnameList dim, bool keepdim); // {"schema": "aten::norm.names_ScalarOpt_dim(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & norm_out(const Tensor & self, const c10::optional<Scalar> & p, DimnameList dim, bool keepdim, ScalarType dtype, Tensor & out); // {"schema": "aten::norm.names_dtype_out(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & norm_out(const Tensor & self, const c10::optional<Scalar> & p, DimnameList dim, bool keepdim, Tensor & out); // {"schema": "aten::norm.names_out(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> frexp(const Tensor & self); // {"schema": "aten::frexp.Tensor(Tensor self) -> (Tensor mantissa, Tensor exponent)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> frexp_out(const Tensor & self, Tensor & mantissa, Tensor & exponent); // {"schema": "aten::frexp.Tensor_out(Tensor self, *, Tensor(a!) mantissa, Tensor(b!) exponent) -> (Tensor(a!) mantissa, Tensor(b!) exponent)", "dispatch": "True", "default": "False"}
+Tensor frobenius_norm(const Tensor & self, IntArrayRef dim, bool keepdim); // {"schema": "aten::frobenius_norm.dim(Tensor self, int[1] dim, bool keepdim=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & frobenius_norm_out(const Tensor & self, IntArrayRef dim, bool keepdim, Tensor & out); // {"schema": "aten::frobenius_norm.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor nuclear_norm(const Tensor & self, bool keepdim); // {"schema": "aten::nuclear_norm(Tensor self, bool keepdim=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & nuclear_norm_out(const Tensor & self, bool keepdim, Tensor & out); // {"schema": "aten::nuclear_norm.out(Tensor self, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor nuclear_norm(const Tensor & self, IntArrayRef dim, bool keepdim); // {"schema": "aten::nuclear_norm.dim(Tensor self, int[2] dim, bool keepdim=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & nuclear_norm_out(const Tensor & self, IntArrayRef dim, bool keepdim, Tensor & out); // {"schema": "aten::nuclear_norm.dim_out(Tensor self, int[2] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor clone(const Tensor & self, c10::optional<MemoryFormat> memory_format); // {"schema": "aten::clone(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor positive(const Tensor & self); // {"schema": "aten::positive(Tensor(a) self) -> Tensor(a)", "dispatch": "False", "default": "True"}
+const Tensor & resize_as_(const Tensor & self, const Tensor & the_template, c10::optional<MemoryFormat> memory_format); // {"schema": "aten::resize_as_(Tensor(a!) self, Tensor the_template, *, MemoryFormat? memory_format=None) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+const Tensor & resize_as_sparse_(const Tensor & self, const Tensor & the_template); // {"schema": "aten::resize_as_sparse_(Tensor(a!) self, Tensor the_template) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & zero_(Tensor & self); // {"schema": "aten::zero_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & sub_out(const Tensor & self, const Tensor & other, const Scalar & alpha, Tensor & out); // {"schema": "aten::sub.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor sub(const Tensor & self, const Tensor & other, const Scalar & alpha); // {"schema": "aten::sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & sub_(Tensor & self, const Tensor & other, const Scalar & alpha); // {"schema": "aten::sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor sub(const Tensor & self, const Scalar & other, const Scalar & alpha); // {"schema": "aten::sub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & sub_(Tensor & self, const Scalar & other, const Scalar & alpha); // {"schema": "aten::sub_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & subtract_out(const Tensor & self, const Tensor & other, const Scalar & alpha, Tensor & out); // {"schema": "aten::subtract.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor subtract(const Tensor & self, const Tensor & other, const Scalar & alpha); // {"schema": "aten::subtract.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & subtract_(Tensor & self, const Tensor & other, const Scalar & alpha); // {"schema": "aten::subtract_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor subtract(const Tensor & self, const Scalar & other, const Scalar & alpha); // {"schema": "aten::subtract.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & subtract_(Tensor & self, const Scalar & other, const Scalar & alpha); // {"schema": "aten::subtract_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor rsub(const Tensor & self, const Tensor & other, const Scalar & alpha); // {"schema": "aten::rsub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & heaviside_out(const Tensor & self, const Tensor & values, Tensor & out); // {"schema": "aten::heaviside.out(Tensor self, Tensor values, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor heaviside(const Tensor & self, const Tensor & values); // {"schema": "aten::heaviside(Tensor self, Tensor values) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & heaviside_(Tensor & self, const Tensor & values); // {"schema": "aten::heaviside_(Tensor(a!) self, Tensor values) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor rsub(const Tensor & self, const Scalar & other, const Scalar & alpha); // {"schema": "aten::rsub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _sparse_addmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, const Scalar & beta, const Scalar & alpha); // {"schema": "aten::_sparse_addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & sparse_sampled_addmm_out(const Tensor & self, const Tensor & mat1, const Tensor & mat2, const Scalar & beta, const Scalar & alpha, Tensor & out); // {"schema": "aten::sparse_sampled_addmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor sparse_sampled_addmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, const Scalar & beta, const Scalar & alpha); // {"schema": "aten::sparse_sampled_addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> _sparse_mm_reduce_impl(const Tensor & self, const Tensor & other, c10::string_view reduce); // {"schema": "aten::_sparse_mm_reduce_impl(Tensor self, Tensor other, str reduce) -> (Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> _sparse_mm_reduce_impl_backward(const Tensor & self, const Tensor & grad_out, const Tensor & weight, c10::string_view reduce, const Tensor & arg_out, ::std::array<bool,2> output_mask); // {"schema": "aten::_sparse_mm_reduce_impl_backward(Tensor self, Tensor grad_out, Tensor weight, str reduce, Tensor arg_out, bool[2] output_mask) -> (Tensor, Tensor)", "dispatch": "True", "default": "False"}
+Tensor & addmm_out(const Tensor & self, const Tensor & mat1, const Tensor & mat2, const Scalar & beta, const Scalar & alpha, Tensor & out); // {"schema": "aten::addmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor addmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, const Scalar & beta, const Scalar & alpha); // {"schema": "aten::addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & addmm_(Tensor & self, const Tensor & mat1, const Tensor & mat2, const Scalar & beta, const Scalar & alpha); // {"schema": "aten::addmm_(Tensor(a!) self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _addmm_activation_out(const Tensor & self, const Tensor & mat1, const Tensor & mat2, const Scalar & beta, const Scalar & alpha, bool use_gelu, Tensor & out); // {"schema": "aten::_addmm_activation.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, bool use_gelu=False, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor _addmm_activation(const Tensor & self, const Tensor & mat1, const Tensor & mat2, const Scalar & beta, const Scalar & alpha, bool use_gelu); // {"schema": "aten::_addmm_activation(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, bool use_gelu=False) -> Tensor", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor,Tensor> _scaled_mm(const Tensor & self, const Tensor & mat2, const c10::optional<Tensor> & bias, c10::optional<ScalarType> out_dtype, const c10::optional<Tensor> & scale_a, const c10::optional<Tensor> & scale_b, const c10::optional<Tensor> & scale_result, bool use_fast_accum); // {"schema": "aten::_scaled_mm(Tensor self, Tensor mat2, *, Tensor? bias=None, ScalarType? out_dtype=None, Tensor? scale_a=None, Tensor? scale_b=None, Tensor? scale_result=None, bool use_fast_accum=False) -> (Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor &,Tensor &> _scaled_mm_out(const Tensor & self, const Tensor & mat2, const c10::optional<Tensor> & bias, c10::optional<ScalarType> out_dtype, const c10::optional<Tensor> & scale_a, const c10::optional<Tensor> & scale_b, const c10::optional<Tensor> & scale_result, bool use_fast_accum, Tensor & out, Tensor & out_amax); // {"schema": "aten::_scaled_mm.out(Tensor self, Tensor mat2, *, Tensor? bias=None, ScalarType? out_dtype=None, Tensor? scale_a=None, Tensor? scale_b=None, Tensor? scale_result=None, bool use_fast_accum=False, Tensor(a!) out, Tensor(b!) out_amax) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "False"}
+Tensor sparse_compressed_tensor(const Tensor & compressed_indices, const Tensor & plain_indices, const Tensor & values, c10::SymIntArrayRef size, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::sparse_compressed_tensor.comp_plain_value_size(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor sparse_csr_tensor(const Tensor & crow_indices, const Tensor & col_indices, const Tensor & values, IntArrayRef size, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::sparse_csr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor sparse_csc_tensor(const Tensor & ccol_indices, const Tensor & row_indices, const Tensor & values, IntArrayRef size, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::sparse_csc_tensor.ccol_row_value_size(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor sparse_bsr_tensor(const Tensor & crow_indices, const Tensor & col_indices, const Tensor & values, IntArrayRef size, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::sparse_bsr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor sparse_bsc_tensor(const Tensor & ccol_indices, const Tensor & row_indices, const Tensor & values, IntArrayRef size, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::sparse_bsc_tensor.ccol_row_value_size(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor sparse_compressed_tensor(const Tensor & compressed_indices, const Tensor & plain_indices, const Tensor & values, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::sparse_compressed_tensor.comp_plain_value(Tensor compressed_indices, Tensor plain_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor sparse_csr_tensor(const Tensor & crow_indices, const Tensor & col_indices, const Tensor & values, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::sparse_csr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor sparse_csc_tensor(const Tensor & ccol_indices, const Tensor & row_indices, const Tensor & values, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::sparse_csc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor sparse_bsr_tensor(const Tensor & crow_indices, const Tensor & col_indices, const Tensor & values, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::sparse_bsr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor sparse_bsc_tensor(const Tensor & ccol_indices, const Tensor & row_indices, const Tensor & values, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::sparse_bsc_tensor.ccol_row_value(Tensor ccol_indices, Tensor row_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _sparse_compressed_tensor_unsafe(const Tensor & compressed_indices, const Tensor & plain_indices, const Tensor & values, c10::SymIntArrayRef size, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::_sparse_compressed_tensor_unsafe(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _sparse_csr_tensor_unsafe(const Tensor & crow_indices, const Tensor & col_indices, const Tensor & values, IntArrayRef size, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::_sparse_csr_tensor_unsafe(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _sparse_csc_tensor_unsafe(const Tensor & ccol_indices, const Tensor & row_indices, const Tensor & values, IntArrayRef size, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::_sparse_csc_tensor_unsafe(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _sparse_bsr_tensor_unsafe(const Tensor & crow_indices, const Tensor & col_indices, const Tensor & values, IntArrayRef size, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::_sparse_bsr_tensor_unsafe(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _sparse_bsc_tensor_unsafe(const Tensor & ccol_indices, const Tensor & row_indices, const Tensor & values, IntArrayRef size, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::_sparse_bsc_tensor_unsafe(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor sparse_coo_tensor(IntArrayRef size, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::sparse_coo_tensor.size(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor sparse_coo_tensor(const Tensor & indices, const Tensor & values, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory, c10::optional<bool> is_coalesced); // {"schema": "aten::sparse_coo_tensor.indices(Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor sparse_coo_tensor(const Tensor & indices, const Tensor & values, IntArrayRef size, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory, c10::optional<bool> is_coalesced); // {"schema": "aten::sparse_coo_tensor.indices_size(Tensor indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _sparse_coo_tensor_unsafe(const Tensor & indices, const Tensor & values, c10::SymIntArrayRef size, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory, c10::optional<bool> is_coalesced); // {"schema": "aten::_sparse_coo_tensor_unsafe(Tensor indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor", "dispatch": "False", "default": "True"}
+void _validate_sparse_coo_tensor_args(const Tensor & indices, const Tensor & values, IntArrayRef size, c10::optional<bool> is_coalesced); // {"schema": "aten::_validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size, bool? is_coalesced=None) -> ()", "dispatch": "False", "default": "True"}
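+// Illustrative sketch (not part of the generated declarations; assumes ATen is available): building a COO
+// tensor from an indices/values pair as declared above, then coalescing duplicate coordinates.
+//   at::Tensor indices = at::randint(0, 2, {2, 3});                    // 2 x nnz coordinate matrix (int64)
+//   at::Tensor values  = at::ones({3});
+//   at::_validate_sparse_coo_tensor_args(indices, values, {2, 2});     // throws on malformed inputs
+//   at::Tensor sp = at::sparse_coo_tensor(indices, values, {2, 2}).coalesce();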
+void _validate_sparse_compressed_tensor_args(const Tensor & compressed_indices, const Tensor & plain_indices, const Tensor & values, IntArrayRef size, Layout layout); // {"schema": "aten::_validate_sparse_compressed_tensor_args(Tensor compressed_indices, Tensor plain_indices, Tensor values, int[] size, Layout layout) -> ()", "dispatch": "False", "default": "True"}
+void _validate_sparse_csr_tensor_args(const Tensor & crow_indices, const Tensor & col_indices, const Tensor & values, IntArrayRef size); // {"schema": "aten::_validate_sparse_csr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size) -> ()", "dispatch": "False", "default": "True"}
+void _validate_sparse_csc_tensor_args(const Tensor & ccol_indices, const Tensor & row_indices, const Tensor & values, IntArrayRef size); // {"schema": "aten::_validate_sparse_csc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size) -> ()", "dispatch": "False", "default": "True"}
+void _validate_sparse_bsr_tensor_args(const Tensor & crow_indices, const Tensor & col_indices, const Tensor & values, IntArrayRef size); // {"schema": "aten::_validate_sparse_bsr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size) -> ()", "dispatch": "False", "default": "True"}
+void _validate_sparse_bsc_tensor_args(const Tensor & ccol_indices, const Tensor & row_indices, const Tensor & values, IntArrayRef size); // {"schema": "aten::_validate_sparse_bsc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size) -> ()", "dispatch": "False", "default": "True"}
+Tensor _sparse_coo_tensor_with_dims(int64_t sparse_dim, int64_t dense_dim, IntArrayRef size, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::_sparse_coo_tensor_with_dims(int sparse_dim, int dense_dim, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _sparse_coo_tensor_with_dims_and_tensors(int64_t sparse_dim, int64_t dense_dim, c10::SymIntArrayRef size, const Tensor & indices, const Tensor & values, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory, c10::optional<bool> is_coalesced); // {"schema": "aten::_sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False, bool? is_coalesced=None) -> Tensor", "dispatch": "True", "default": "False"}
+const Tensor & sparse_resize_(const Tensor & self, IntArrayRef size, int64_t sparse_dim, int64_t dense_dim); // {"schema": "aten::sparse_resize_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+const Tensor & sparse_resize_and_clear_(const Tensor & self, IntArrayRef size, int64_t sparse_dim, int64_t dense_dim); // {"schema": "aten::sparse_resize_and_clear_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor sparse_mask(const Tensor & self, const Tensor & mask); // {"schema": "aten::sparse_mask(Tensor self, Tensor mask) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _sparse_mask_projection(const Tensor & self, const Tensor & mask, bool accumulate_matches); // {"schema": "aten::_sparse_mask_projection(Tensor self, Tensor mask, bool accumulate_matches=False) -> Tensor", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _to_cpu(TensorList tensors); // {"schema": "aten::_to_cpu(Tensor[] tensors) -> Tensor[]", "dispatch": "False", "default": "True"}
+Tensor to_dense(const Tensor & self, c10::optional<ScalarType> dtype, c10::optional<bool> masked_grad); // {"schema": "aten::to_dense(Tensor self, ScalarType? dtype=None, *, bool? masked_grad=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _to_dense(const Tensor & self, c10::optional<ScalarType> dtype, c10::optional<bool> masked_grad); // {"schema": "aten::_to_dense(Tensor self, ScalarType? dtype=None, bool? masked_grad=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor to_dense_backward(const Tensor & grad, const Tensor & input, c10::optional<bool> masked_grad); // {"schema": "aten::to_dense_backward(Tensor grad, Tensor input, bool? masked_grad=None) -> Tensor", "dispatch": "False", "default": "True"}
+int64_t sparse_dim(const Tensor & self); // {"schema": "aten::sparse_dim(Tensor self) -> int", "dispatch": "True", "default": "False"}
+int64_t _dimI(const Tensor & self); // {"schema": "aten::_dimI(Tensor self) -> int", "dispatch": "True", "default": "False"}
+int64_t dense_dim(const Tensor & self); // {"schema": "aten::dense_dim(Tensor self) -> int", "dispatch": "True", "default": "False"}
+int64_t _dimV(const Tensor & self); // {"schema": "aten::_dimV(Tensor self) -> int", "dispatch": "True", "default": "False"}
+int64_t _nnz(const Tensor & self); // {"schema": "aten::_nnz(Tensor self) -> int", "dispatch": "True", "default": "False"}
+Tensor coalesce(const Tensor & self); // {"schema": "aten::coalesce(Tensor(a) self) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor _coalesce(const Tensor & self); // {"schema": "aten::_coalesce(Tensor self) -> Tensor", "dispatch": "True", "default": "False"}
+bool is_coalesced(const Tensor & self); // {"schema": "aten::is_coalesced(Tensor self) -> bool", "dispatch": "True", "default": "True"}
+Tensor _indices(const Tensor & self); // {"schema": "aten::_indices(Tensor(a) self) -> Tensor(a)", "dispatch": "True", "default": "False"}
+Tensor _values(const Tensor & self); // {"schema": "aten::_values(Tensor(a) self) -> Tensor(a)", "dispatch": "True", "default": "False"}
+Tensor & _coalesced_(Tensor & self, bool coalesced); // {"schema": "aten::_coalesced_(Tensor(a!) self, bool coalesced) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor indices(const Tensor & self); // {"schema": "aten::indices(Tensor(a) self) -> Tensor(a)", "dispatch": "True", "default": "True"}
+Tensor values(const Tensor & self); // {"schema": "aten::values(Tensor(a) self) -> Tensor(a)", "dispatch": "True", "default": "True"}
+Tensor crow_indices(const Tensor & self); // {"schema": "aten::crow_indices(Tensor(a) self) -> Tensor(a)", "dispatch": "True", "default": "True"}
+Tensor col_indices(const Tensor & self); // {"schema": "aten::col_indices(Tensor(a) self) -> Tensor(a)", "dispatch": "True", "default": "True"}
+Tensor ccol_indices(const Tensor & self); // {"schema": "aten::ccol_indices(Tensor(a) self) -> Tensor(a)", "dispatch": "True", "default": "True"}
+Tensor row_indices(const Tensor & self); // {"schema": "aten::row_indices(Tensor(a) self) -> Tensor(a)", "dispatch": "True", "default": "True"}
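+// Illustrative sketch (not part of the generated declarations; assumes ATen is available): the accessor ops
+// above expose sparse metadata; indices()/values() require a coalesced COO tensor.
+//   at::Tensor sp = at::rand({4, 4}).to_sparse().coalesce();
+//   int64_t nnz = sp._nnz();
+//   at::Tensor idx = sp.indices();   // sparse_dim() x nnz
+//   at::Tensor val = sp.values();    // nnz (x dense dims, if any)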
+Tensor & hspmm_out(const Tensor & mat1, const Tensor & mat2, Tensor & out); // {"schema": "aten::hspmm.out(Tensor mat1, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor hspmm(const Tensor & mat1, const Tensor & mat2); // {"schema": "aten::hspmm(Tensor mat1, Tensor mat2) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & copy_sparse_to_sparse_(Tensor & self, const Tensor & src, bool non_blocking); // {"schema": "aten::copy_sparse_to_sparse_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> unbind(const Tensor & self, int64_t dim); // {"schema": "aten::unbind.int(Tensor(a -> *) self, int dim=0) -> Tensor(a)[]", "dispatch": "True", "default": "True"}
+::std::vector<Tensor> unbind(const Tensor & self, Dimname dim); // {"schema": "aten::unbind.Dimname(Tensor(a -> *) self, Dimname dim) -> Tensor(a)[]", "dispatch": "False", "default": "True"}
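+// Illustrative sketch (not part of the generated declarations; assumes ATen is available): aten::unbind.int
+// returns one view per slice along the chosen dimension.
+//   at::Tensor t = at::arange(6).reshape({2, 3});
+//   std::vector<at::Tensor> rows = at::unbind(t, /*dim=*/0);   // two tensors of shape {3}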
+Tensor to_sparse(const Tensor & self, int64_t sparse_dim); // {"schema": "aten::to_sparse.sparse_dim(Tensor self, int sparse_dim) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _to_sparse(const Tensor & self, int64_t sparse_dim); // {"schema": "aten::_to_sparse.sparse_dim(Tensor self, int sparse_dim) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor to_sparse(const Tensor & self, c10::optional<Layout> layout, OptionalIntArrayRef blocksize, c10::optional<int64_t> dense_dim); // {"schema": "aten::to_sparse(Tensor self, *, Layout? layout=None, int[2]? blocksize=None, int? dense_dim=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _to_sparse(const Tensor & self, c10::optional<Layout> layout, OptionalIntArrayRef blocksize, c10::optional<int64_t> dense_dim); // {"schema": "aten::_to_sparse(Tensor self, *, Layout? layout=None, int[2]? blocksize=None, int? dense_dim=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor to_sparse_csr(const Tensor & self, c10::optional<int64_t> dense_dim); // {"schema": "aten::to_sparse_csr(Tensor self, int? dense_dim=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _to_sparse_csr(const Tensor & self, c10::optional<int64_t> dense_dim); // {"schema": "aten::_to_sparse_csr(Tensor self, int? dense_dim=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor to_sparse_csc(const Tensor & self, c10::optional<int64_t> dense_dim); // {"schema": "aten::to_sparse_csc(Tensor self, int? dense_dim=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _to_sparse_csc(const Tensor & self, c10::optional<int64_t> dense_dim); // {"schema": "aten::_to_sparse_csc(Tensor self, int? dense_dim=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor to_sparse_bsr(const Tensor & self, IntArrayRef blocksize, c10::optional<int64_t> dense_dim); // {"schema": "aten::to_sparse_bsr(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _to_sparse_bsr(const Tensor & self, IntArrayRef blocksize, c10::optional<int64_t> dense_dim); // {"schema": "aten::_to_sparse_bsr(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor to_sparse_bsc(const Tensor & self, IntArrayRef blocksize, c10::optional<int64_t> dense_dim); // {"schema": "aten::to_sparse_bsc(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _to_sparse_bsc(const Tensor & self, IntArrayRef blocksize, c10::optional<int64_t> dense_dim); // {"schema": "aten::_to_sparse_bsc(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> _to_sparse_semi_structured(const Tensor & dense); // {"schema": "aten::_to_sparse_semi_structured(Tensor dense) -> (Tensor, Tensor)", "dispatch": "True", "default": "False"}
+Tensor to_mkldnn(const Tensor & self, c10::optional<ScalarType> dtype); // {"schema": "aten::to_mkldnn(Tensor self, ScalarType? dtype=None) -> Tensor", "dispatch": "True", "default": "False"}
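+// Illustrative sketch (not part of the generated declarations; assumes ATen is available): round-tripping a
+// dense tensor through the layout-conversion ops declared above.
+//   at::Tensor dense = at::rand({4, 6});
+//   at::Tensor csr   = dense.to_sparse_csr();                          // aten::to_sparse_csr
+//   at::Tensor crow  = csr.crow_indices(), col = csr.col_indices();
+//   at::Tensor back  = csr.to_dense();                                 // aten::to_dense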
+Tensor mkldnn_reorder_conv2d_weight(const Tensor & self, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, OptionalSymIntArrayRef input_size); // {"schema": "aten::mkldnn_reorder_conv2d_weight(Tensor self, SymInt[2] padding=0, SymInt[2] stride=1, SymInt[2] dilation=1, SymInt groups=1, SymInt[]? input_size=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor mkldnn_reorder_conv3d_weight(const Tensor & self, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups); // {"schema": "aten::mkldnn_reorder_conv3d_weight(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor to_mkldnn_backward(const Tensor & grad, const Tensor & input); // {"schema": "aten::to_mkldnn_backward(Tensor grad, Tensor input) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor quantize_per_tensor_dynamic(const Tensor & self, ScalarType dtype, bool reduce_range); // {"schema": "aten::quantize_per_tensor_dynamic(Tensor self, ScalarType dtype, bool reduce_range) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor quantize_per_tensor(const Tensor & self, double scale, int64_t zero_point, ScalarType dtype); // {"schema": "aten::quantize_per_tensor(Tensor self, float scale, int zero_point, ScalarType dtype) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor quantize_per_tensor(const Tensor & self, const Tensor & scale, const Tensor & zero_point, ScalarType dtype); // {"schema": "aten::quantize_per_tensor.tensor_qparams(Tensor self, Tensor scale, Tensor zero_point, ScalarType dtype) -> Tensor", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> quantize_per_tensor(TensorList tensors, const Tensor & scales, const Tensor & zero_points, ScalarType dtype); // {"schema": "aten::quantize_per_tensor.tensors(Tensor[] tensors, Tensor scales, Tensor zero_points, ScalarType dtype) -> Tensor[]", "dispatch": "True", "default": "False"}
+Tensor quantize_per_channel(const Tensor & self, const Tensor & scales, const Tensor & zero_points, int64_t axis, ScalarType dtype); // {"schema": "aten::quantize_per_channel(Tensor self, Tensor scales, Tensor zero_points, int axis, ScalarType dtype) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor dequantize(const Tensor & self); // {"schema": "aten::dequantize.self(Tensor self) -> Tensor", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> dequantize(TensorList tensors); // {"schema": "aten::dequantize.tensors(Tensor[] tensors) -> Tensor[]", "dispatch": "True", "default": "False"}
+double q_scale(const Tensor & self); // {"schema": "aten::q_scale(Tensor self) -> float", "dispatch": "True", "default": "False"}
+int64_t q_zero_point(const Tensor & self); // {"schema": "aten::q_zero_point(Tensor self) -> int", "dispatch": "True", "default": "False"}
+Tensor q_per_channel_scales(const Tensor & self); // {"schema": "aten::q_per_channel_scales(Tensor self) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor q_per_channel_zero_points(const Tensor & self); // {"schema": "aten::q_per_channel_zero_points(Tensor self) -> Tensor", "dispatch": "True", "default": "False"}
+int64_t q_per_channel_axis(const Tensor & self); // {"schema": "aten::q_per_channel_axis(Tensor self) -> int", "dispatch": "True", "default": "False"}
+Tensor int_repr(const Tensor & self); // {"schema": "aten::int_repr(Tensor self) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _make_per_tensor_quantized_tensor(const Tensor & self, double scale, int64_t zero_point); // {"schema": "aten::_make_per_tensor_quantized_tensor(Tensor self, float scale, int zero_point) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _make_per_channel_quantized_tensor(const Tensor & self, const Tensor & scale, const Tensor & zero_point, int64_t axis); // {"schema": "aten::_make_per_channel_quantized_tensor(Tensor self, Tensor scale, Tensor zero_point, int axis) -> Tensor", "dispatch": "True", "default": "False"}
+QScheme qscheme(const Tensor & self); // {"schema": "aten::qscheme(Tensor self) -> QScheme", "dispatch": "True", "default": "False"}
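+// Illustrative sketch (not part of the generated declarations; assumes ATen is available): quantizing a float
+// tensor per-tensor and reading back the qparams exposed by the ops above.
+//   at::Tensor x   = at::rand({2, 3});
+//   at::Tensor q   = at::quantize_per_tensor(x, /*scale=*/0.1, /*zero_point=*/0, at::kQInt8);
+//   double s       = q.q_scale();        // 0.1
+//   int64_t zp     = q.q_zero_point();   // 0
+//   at::Tensor raw = q.int_repr();       // underlying int8 representation
+//   at::Tensor y   = at::dequantize(q);  // back to float, with quantization error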
+Tensor fake_quantize_per_tensor_affine(const Tensor & self, double scale, int64_t zero_point, int64_t quant_min, int64_t quant_max); // {"schema": "aten::fake_quantize_per_tensor_affine(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor fake_quantize_per_tensor_affine(const Tensor & self, const Tensor & scale, const Tensor & zero_point, int64_t quant_min, int64_t quant_max); // {"schema": "aten::fake_quantize_per_tensor_affine.tensor_qparams(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> Tensor", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> fake_quantize_per_tensor_affine_cachemask(const Tensor & self, double scale, int64_t zero_point, int64_t quant_min, int64_t quant_max); // {"schema": "aten::fake_quantize_per_tensor_affine_cachemask(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> (Tensor output, Tensor mask)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> _fake_quantize_per_tensor_affine_cachemask_tensor_qparams(const Tensor & self, const Tensor & scale, const Tensor & zero_point, const Tensor & fake_quant_enabled, int64_t quant_min, int64_t quant_max); // {"schema": "aten::_fake_quantize_per_tensor_affine_cachemask_tensor_qparams(Tensor self, Tensor scale, Tensor zero_point, Tensor fake_quant_enabled, int quant_min, int quant_max) -> (Tensor output, Tensor mask)", "dispatch": "True", "default": "False"}
+Tensor fake_quantize_per_tensor_affine_cachemask_backward(const Tensor & grad, const Tensor & mask); // {"schema": "aten::fake_quantize_per_tensor_affine_cachemask_backward(Tensor grad, Tensor mask) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _fake_quantize_learnable_per_tensor_affine(const Tensor & self, const Tensor & scale, const Tensor & zero_point, int64_t quant_min, int64_t quant_max, double grad_factor); // {"schema": "aten::_fake_quantize_learnable_per_tensor_affine(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max, float grad_factor=1.0) -> Tensor", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor,Tensor> _fake_quantize_learnable_per_tensor_affine_backward(const Tensor & grad, const Tensor & self, const Tensor & scale, const Tensor & zero_point, int64_t quant_min, int64_t quant_max, double grad_factor); // {"schema": "aten::_fake_quantize_learnable_per_tensor_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max, float grad_factor=1.0) -> (Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+Tensor fake_quantize_per_channel_affine(const Tensor & self, const Tensor & scale, const Tensor & zero_point, int64_t axis, int64_t quant_min, int64_t quant_max); // {"schema": "aten::fake_quantize_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> fake_quantize_per_channel_affine_cachemask(const Tensor & self, const Tensor & scale, const Tensor & zero_point, int64_t axis, int64_t quant_min, int64_t quant_max); // {"schema": "aten::fake_quantize_per_channel_affine_cachemask(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> (Tensor output, Tensor mask)", "dispatch": "True", "default": "False"}
+Tensor fake_quantize_per_channel_affine_cachemask_backward(const Tensor & grad, const Tensor & mask); // {"schema": "aten::fake_quantize_per_channel_affine_cachemask_backward(Tensor grad, Tensor mask) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _fake_quantize_learnable_per_channel_affine(const Tensor & self, const Tensor & scale, const Tensor & zero_point, int64_t axis, int64_t quant_min, int64_t quant_max, double grad_factor); // {"schema": "aten::_fake_quantize_learnable_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max, float grad_factor=1.0) -> Tensor", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor,Tensor> _fake_quantize_learnable_per_channel_affine_backward(const Tensor & grad, const Tensor & self, const Tensor & scale, const Tensor & zero_point, int64_t axis, int64_t quant_min, int64_t quant_max, double grad_factor); // {"schema": "aten::_fake_quantize_learnable_per_channel_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max, float grad_factor=1.0) -> (Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
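+// Illustrative sketch (not part of the generated declarations; assumes ATen is available): fake-quantize keeps
+// the float dtype but snaps values to the grid defined by (scale, zero_point, quant_min, quant_max).
+//   at::Tensor w  = at::randn({8});
+//   at::Tensor fq = at::fake_quantize_per_tensor_affine(w, /*scale=*/0.05, /*zero_point=*/0,
+//                                                       /*quant_min=*/-128, /*quant_max=*/127);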
+Tensor fused_moving_avg_obs_fake_quant(const Tensor & self, const Tensor & observer_on, const Tensor & fake_quant_on, Tensor & running_min, Tensor & running_max, Tensor & scale, Tensor & zero_point, double averaging_const, int64_t quant_min, int64_t quant_max, int64_t ch_axis, bool per_row_fake_quant, bool symmetric_quant); // {"schema": "aten::fused_moving_avg_obs_fake_quant(Tensor self, Tensor observer_on, Tensor fake_quant_on, Tensor(a!) running_min, Tensor(b!) running_max, Tensor(c!) scale, Tensor(d!) zero_point, float averaging_const, int quant_min, int quant_max, int ch_axis, bool per_row_fake_quant=False, bool symmetric_quant=False) -> Tensor", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> _fused_moving_avg_obs_fq_helper(const Tensor & self, const Tensor & observer_on, const Tensor & fake_quant_on, Tensor & running_min, Tensor & running_max, Tensor & scale, Tensor & zero_point, double averaging_const, int64_t quant_min, int64_t quant_max, int64_t ch_axis, bool per_row_fake_quant, bool symmetric_quant); // {"schema": "aten::_fused_moving_avg_obs_fq_helper(Tensor self, Tensor observer_on, Tensor fake_quant_on, Tensor(a!) running_min, Tensor(b!) running_max, Tensor(c!) scale, Tensor(d!) zero_point, float averaging_const, int quant_min, int quant_max, int ch_axis, bool per_row_fake_quant=False, bool symmetric_quant=False) -> (Tensor output, Tensor mask)", "dispatch": "True", "default": "False"}
+::std::tuple<double,int64_t> _choose_qparams_per_tensor(const Tensor & self, bool reduce_range); // {"schema": "aten::_choose_qparams_per_tensor(Tensor self, bool reduce_range=False) -> (float, int)", "dispatch": "False", "default": "True"}
+Tensor _saturate_weight_to_fp16(const Tensor & weight); // {"schema": "aten::_saturate_weight_to_fp16(Tensor weight) -> Tensor", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> choose_qparams_optimized(const Tensor & input, int64_t numel, int64_t n_bins, double ratio, int64_t bit_width); // {"schema": "aten::choose_qparams_optimized(Tensor input, int numel, int n_bins, float ratio, int bit_width) -> (Tensor, Tensor)", "dispatch": "False", "default": "True"}
+Tensor _autocast_to_reduced_precision(const Tensor & self, bool cuda_enabled, bool cpu_enabled, ScalarType cuda_dtype, ScalarType cpu_dtype); // {"schema": "aten::_autocast_to_reduced_precision(Tensor(a) self, bool cuda_enabled, bool cpu_enabled, ScalarType cuda_dtype, ScalarType cpu_dtype) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor _autocast_to_full_precision(const Tensor & self, bool cuda_enabled, bool cpu_enabled); // {"schema": "aten::_autocast_to_full_precision(Tensor(a) self, bool cuda_enabled, bool cpu_enabled) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor _to_copy(const Tensor & self, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory, bool non_blocking, c10::optional<MemoryFormat> memory_format); // {"schema": "aten::_to_copy(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, MemoryFormat? memory_format=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor to(const Tensor & self, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory, bool non_blocking, bool copy, c10::optional<MemoryFormat> memory_format); // {"schema": "aten::to.dtype_layout(Tensor(a) self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor to(const Tensor & self, Device device, ScalarType dtype, bool non_blocking, bool copy, c10::optional<MemoryFormat> memory_format); // {"schema": "aten::to.device(Tensor(a) self, Device device, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor to(const Tensor & self, ScalarType dtype, bool non_blocking, bool copy, c10::optional<MemoryFormat> memory_format); // {"schema": "aten::to.dtype(Tensor(a) self, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor to(const Tensor & self, const Tensor & other, bool non_blocking, bool copy, c10::optional<MemoryFormat> memory_format); // {"schema": "aten::to.other(Tensor(a) self, Tensor other, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor(a)", "dispatch": "False", "default": "True"}
+::std::vector<Tensor> meshgrid(TensorList tensors); // {"schema": "aten::meshgrid(Tensor[] tensors) -> Tensor[]", "dispatch": "False", "default": "True"}
+::std::vector<Tensor> meshgrid(TensorList tensors, c10::string_view indexing); // {"schema": "aten::meshgrid.indexing(Tensor[] tensors, *, str indexing) -> Tensor[]", "dispatch": "False", "default": "True"}
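+// Illustrative sketch (not part of the generated declarations; assumes ATen is available): the aten::to
+// overloads above cover dtype/device/layout conversion; meshgrid expands 1-D tensors into coordinate grids.
+//   at::Tensor x = at::rand({2, 2});
+//   at::Tensor h = x.to(at::kHalf);                                                       // aten::to.dtype
+//   at::Tensor d = x.to(at::TensorOptions().dtype(at::kDouble), /*non_blocking=*/false, /*copy=*/true);
+//   auto grids   = at::meshgrid({at::arange(3), at::arange(2)}, "ij");                    // aten::meshgrid.indexing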
+Tensor cartesian_prod(TensorList tensors); // {"schema": "aten::cartesian_prod(Tensor[] tensors) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor combinations(const Tensor & self, int64_t r, bool with_replacement); // {"schema": "aten::combinations(Tensor self, int r=2, bool with_replacement=False) -> Tensor", "dispatch": "False", "default": "True"}
+Scalar item(const Tensor & self); // {"schema": "aten::item(Tensor self) -> Scalar", "dispatch": "False", "default": "True"}
+ScalarType result_type(const Tensor & tensor, const Tensor & other); // {"schema": "aten::result_type.Tensor(Tensor tensor, Tensor other) -> ScalarType", "dispatch": "False", "default": "True"}
+ScalarType result_type(const Tensor & tensor, const Scalar & other); // {"schema": "aten::result_type.Scalar(Tensor tensor, Scalar other) -> ScalarType", "dispatch": "False", "default": "True"}
+ScalarType result_type(const Scalar & scalar, const Tensor & tensor); // {"schema": "aten::result_type.Scalar_Tensor(Scalar scalar, Tensor tensor) -> ScalarType", "dispatch": "False", "default": "True"}
+ScalarType result_type(const Scalar & scalar1, const Scalar & scalar2); // {"schema": "aten::result_type.Scalar_Scalar(Scalar scalar1, Scalar scalar2) -> ScalarType", "dispatch": "False", "default": "True"}
+bool can_cast(ScalarType from, ScalarType to); // {"schema": "aten::can_cast(ScalarType from, ScalarType to) -> bool", "dispatch": "False", "default": "True"}
+ScalarType promote_types(ScalarType type1, ScalarType type2); // {"schema": "aten::promote_types(ScalarType type1, ScalarType type2) -> ScalarType", "dispatch": "False", "default": "True"}
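+// Illustrative sketch (not part of the generated declarations; assumes ATen is available): the type-promotion
+// helpers above mirror Python's torch.result_type / torch.promote_types / torch.can_cast.
+//   at::ScalarType p = at::promote_types(at::kInt, at::kFloat);            // kFloat
+//   bool ok          = at::can_cast(/*from=*/at::kFloat, /*to=*/at::kInt); // false
+//   at::ScalarType r = at::result_type(at::rand({2}), at::Scalar(2.5));    // kFloat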
+Scalar _local_scalar_dense(const Tensor & self); // {"schema": "aten::_local_scalar_dense(Tensor self) -> Scalar", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor,Tensor,Tensor,Tensor,Tensor> _lstm_mps(const Tensor & input, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first); // {"schema": "aten::_lstm_mps(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,::std::vector<Tensor>,::std::vector<Tensor>> lstm_mps_backward(const c10::optional<Tensor> & grad_y, const c10::optional<Tensor> & grad_hy, const c10::optional<Tensor> & grad_cy, const Tensor & z_state, const Tensor & cell_state_fwd, const Tensor & input, const Tensor & layersOutputs, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first); // {"schema": "aten::lstm_mps_backward(Tensor? grad_y, Tensor? grad_hy, Tensor? grad_cy, Tensor z_state, Tensor cell_state_fwd, Tensor input, Tensor layersOutputs, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor[], Tensor[])", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor,Tensor> _thnn_fused_lstm_cell(const Tensor & input_gates, const Tensor & hidden_gates, const Tensor & cx, const c10::optional<Tensor> & input_bias, const c10::optional<Tensor> & hidden_bias); // {"schema": "aten::_thnn_fused_lstm_cell(Tensor input_gates, Tensor hidden_gates, Tensor cx, Tensor? input_bias=None, Tensor? hidden_bias=None) -> (Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor,Tensor> _thnn_fused_lstm_cell_backward_impl(const c10::optional<Tensor> & grad_hy, const c10::optional<Tensor> & grad_cy, const Tensor & cx, const Tensor & cy, const Tensor & workspace, bool has_bias); // {"schema": "aten::_thnn_fused_lstm_cell_backward_impl(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor,Tensor,Tensor,Tensor> _thnn_fused_lstm_cell_backward(const c10::optional<Tensor> & grad_hy, const c10::optional<Tensor> & grad_cy, const Tensor & cx, const Tensor & cy, const Tensor & workspace, bool has_bias); // {"schema": "aten::_thnn_fused_lstm_cell_backward(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor,Tensor,Tensor,Tensor> _thnn_differentiable_lstm_cell_backward(const c10::optional<Tensor> & grad_hy, const c10::optional<Tensor> & grad_cy, const Tensor & input_gates, const Tensor & hidden_gates, const c10::optional<Tensor> & input_bias, const c10::optional<Tensor> & hidden_bias, const Tensor & cx, const Tensor & cy); // {"schema": "aten::_thnn_differentiable_lstm_cell_backward(Tensor? grad_hy, Tensor? grad_cy, Tensor input_gates, Tensor hidden_gates, Tensor? input_bias, Tensor? hidden_bias, Tensor cx, Tensor cy) -> (Tensor, Tensor, Tensor, Tensor, Tensor)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> _thnn_fused_gru_cell(const Tensor & input_gates, const Tensor & hidden_gates, const Tensor & hx, const c10::optional<Tensor> & input_bias, const c10::optional<Tensor> & hidden_bias); // {"schema": "aten::_thnn_fused_gru_cell(Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias=None, Tensor? hidden_bias=None) -> (Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor,Tensor,Tensor,Tensor> _thnn_fused_gru_cell_backward(const Tensor & grad_hy, const Tensor & workspace, bool has_bias); // {"schema": "aten::_thnn_fused_gru_cell_backward(Tensor grad_hy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor,Tensor,Tensor,Tensor> _thnn_differentiable_gru_cell_backward(const Tensor & grad_hy, const Tensor & input_gates, const Tensor & hidden_gates, const Tensor & hx, const c10::optional<Tensor> & input_bias, const c10::optional<Tensor> & hidden_bias); // {"schema": "aten::_thnn_differentiable_gru_cell_backward(Tensor grad_hy, Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias, Tensor? hidden_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor,Tensor> lstm(const Tensor & input, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first); // {"schema": "aten::lstm.input(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor,Tensor> lstm(const Tensor & data, const Tensor & batch_sizes, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional); // {"schema": "aten::lstm.data(Tensor data, Tensor batch_sizes, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor, Tensor)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> gru(const Tensor & input, const Tensor & hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first); // {"schema": "aten::gru.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> gru(const Tensor & data, const Tensor & batch_sizes, const Tensor & hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional); // {"schema": "aten::gru.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> rnn_tanh(const Tensor & input, const Tensor & hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first); // {"schema": "aten::rnn_tanh.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> rnn_tanh(const Tensor & data, const Tensor & batch_sizes, const Tensor & hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional); // {"schema": "aten::rnn_tanh.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> rnn_relu(const Tensor & input, const Tensor & hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first); // {"schema": "aten::rnn_relu.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> rnn_relu(const Tensor & data, const Tensor & batch_sizes, const Tensor & hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional); // {"schema": "aten::rnn_relu.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> lstm_cell(const Tensor & input, TensorList hx, const Tensor & w_ih, const Tensor & w_hh, const c10::optional<Tensor> & b_ih, const c10::optional<Tensor> & b_hh); // {"schema": "aten::lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> (Tensor, Tensor)", "dispatch": "False", "default": "True"}
+Tensor gru_cell(const Tensor & input, const Tensor & hx, const Tensor & w_ih, const Tensor & w_hh, const c10::optional<Tensor> & b_ih, const c10::optional<Tensor> & b_hh); // {"schema": "aten::gru_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor rnn_tanh_cell(const Tensor & input, const Tensor & hx, const Tensor & w_ih, const Tensor & w_hh, const c10::optional<Tensor> & b_ih, const c10::optional<Tensor> & b_hh); // {"schema": "aten::rnn_tanh_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor rnn_relu_cell(const Tensor & input, const Tensor & hx, const Tensor & w_ih, const Tensor & w_hh, const c10::optional<Tensor> & b_ih, const c10::optional<Tensor> & b_hh); // {"schema": "aten::rnn_relu_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> Tensor", "dispatch": "False", "default": "True"}
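+// Illustrative sketch (not part of the generated declarations; assumes ATen is available): one LSTM cell step
+// with the cell ops above; weights use the {4*hidden, input} / {4*hidden, hidden} layout, biases are optional.
+//   int64_t batch = 2, in_sz = 4, hidden = 3;
+//   at::Tensor x    = at::randn({batch, in_sz});
+//   at::Tensor h0   = at::zeros({batch, hidden}), c0 = at::zeros({batch, hidden});
+//   at::Tensor w_ih = at::randn({4 * hidden, in_sz}), w_hh = at::randn({4 * hidden, hidden});
+//   auto hc = at::lstm_cell(x, {h0, c0}, w_ih, w_hh);            // returns (h1, c1)
+//   at::Tensor h1 = std::get<0>(hc), c1 = std::get<1>(hc);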
+::std::tuple<Tensor,Tensor> quantized_lstm_cell(const Tensor & input, TensorList hx, const Tensor & w_ih, const Tensor & w_hh, const Tensor & b_ih, const Tensor & b_hh, const Tensor & packed_ih, const Tensor & packed_hh, const Tensor & col_offsets_ih, const Tensor & col_offsets_hh, const Scalar & scale_ih, const Scalar & scale_hh, const Scalar & zero_point_ih, const Scalar & zero_point_hh); // {"schema": "aten::quantized_lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> (Tensor, Tensor)", "dispatch": "False", "default": "True"}
+Tensor quantized_gru_cell(const Tensor & input, const Tensor & hx, const Tensor & w_ih, const Tensor & w_hh, const Tensor & b_ih, const Tensor & b_hh, const Tensor & packed_ih, const Tensor & packed_hh, const Tensor & col_offsets_ih, const Tensor & col_offsets_hh, const Scalar & scale_ih, const Scalar & scale_hh, const Scalar & zero_point_ih, const Scalar & zero_point_hh); // {"schema": "aten::quantized_gru_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor quantized_rnn_relu_cell(const Tensor & input, const Tensor & hx, const Tensor & w_ih, const Tensor & w_hh, const Tensor & b_ih, const Tensor & b_hh, const Tensor & packed_ih, const Tensor & packed_hh, const Tensor & col_offsets_ih, const Tensor & col_offsets_hh, const Scalar & scale_ih, const Scalar & scale_hh, const Scalar & zero_point_ih, const Scalar & zero_point_hh); // {"schema": "aten::quantized_rnn_relu_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor quantized_rnn_tanh_cell(const Tensor & input, const Tensor & hx, const Tensor & w_ih, const Tensor & w_hh, const Tensor & b_ih, const Tensor & b_hh, const Tensor & packed_ih, const Tensor & packed_hh, const Tensor & col_offsets_ih, const Tensor & col_offsets_hh, const Scalar & scale_ih, const Scalar & scale_hh, const Scalar & zero_point_ih, const Scalar & zero_point_hh); // {"schema": "aten::quantized_rnn_tanh_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> Tensor", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> _pack_padded_sequence(const Tensor & input, const Tensor & lengths, bool batch_first); // {"schema": "aten::_pack_padded_sequence(Tensor input, Tensor lengths, bool batch_first) -> (Tensor, Tensor)", "dispatch": "True", "default": "True"}
+Tensor _pack_padded_sequence_backward(const Tensor & grad, c10::SymIntArrayRef input_size, const Tensor & batch_sizes, bool batch_first); // {"schema": "aten::_pack_padded_sequence_backward(Tensor grad, SymInt[] input_size, Tensor batch_sizes, bool batch_first) -> Tensor", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> _pad_packed_sequence(const Tensor & data, const Tensor & batch_sizes, bool batch_first, const Scalar & padding_value, int64_t total_length); // {"schema": "aten::_pad_packed_sequence(Tensor data, Tensor batch_sizes, bool batch_first, Scalar padding_value, int total_length) -> (Tensor, Tensor)", "dispatch": "False", "default": "True"}
+Tensor & set_(Tensor & self, Storage source); // {"schema": "aten::set_.source_Storage(Tensor(a!) self, Storage source) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & set_(Tensor & self, Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride); // {"schema": "aten::set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & set_(Tensor & self, const Tensor & source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride); // {"schema": "aten::set_.source_Tensor_storage_offset(Tensor(a!) self, Tensor source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & set_(Tensor & self, const Tensor & source); // {"schema": "aten::set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & set_(Tensor & self); // {"schema": "aten::set_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor lift(const Tensor & self); // {"schema": "aten::lift(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor lift_fresh(const Tensor & self); // {"schema": "aten::lift_fresh(Tensor(a) self) -> Tensor(a)", "dispatch": "True", "default": "True"}
+Tensor lift_fresh_copy(const Tensor & self); // {"schema": "aten::lift_fresh_copy(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+bool is_set_to(const Tensor & self, const Tensor & tensor); // {"schema": "aten::is_set_to(Tensor self, Tensor tensor) -> bool", "dispatch": "True", "default": "False"}
+Tensor & masked_fill_(Tensor & self, const Tensor & mask, const Scalar & value); // {"schema": "aten::masked_fill_.Scalar(Tensor(a!) self, Tensor mask, Scalar value) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor masked_fill(const Tensor & self, const Tensor & mask, const Scalar & value); // {"schema": "aten::masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & masked_fill_(Tensor & self, const Tensor & mask, const Tensor & value); // {"schema": "aten::masked_fill_.Tensor(Tensor(a!) self, Tensor mask, Tensor value) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor masked_fill(const Tensor & self, const Tensor & mask, const Tensor & value); // {"schema": "aten::masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & masked_scatter_(Tensor & self, const Tensor & mask, const Tensor & source); // {"schema": "aten::masked_scatter_(Tensor(a!) self, Tensor mask, Tensor source) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor masked_scatter(const Tensor & self, const Tensor & mask, const Tensor & source); // {"schema": "aten::masked_scatter(Tensor self, Tensor mask, Tensor source) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor masked_scatter_backward(const Tensor & grad_output, const Tensor & mask, c10::SymIntArrayRef sizes); // {"schema": "aten::masked_scatter_backward(Tensor grad_output, Tensor mask, SymInt[] sizes) -> Tensor", "dispatch": "True", "default": "True"}
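+// Illustrative sketch (not part of the generated declarations; assumes ATen is available): masked_fill writes a
+// scalar wherever the boolean mask is true; masked_scatter_ copies elements from a source tensor instead.
+//   at::Tensor x      = at::zeros({2, 3});
+//   at::Tensor mask   = x.eq(0.0);                                   // all true here
+//   at::Tensor filled = x.masked_fill(mask, -1.0);                   // aten::masked_fill.Scalar
+//   x.masked_scatter_(mask, at::arange(6, at::kFloat).reshape({2, 3}));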
+Tensor _masked_softmax(const Tensor & self, const Tensor & mask, c10::optional<int64_t> dim, c10::optional<int64_t> mask_type); // {"schema": "aten::_masked_softmax(Tensor self, Tensor mask, int? dim=None, int? mask_type=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _masked_softmax_backward(const Tensor & grad_output, const Tensor & output, const Tensor & mask, c10::optional<int64_t> dim); // {"schema": "aten::_masked_softmax_backward(Tensor grad_output, Tensor output, Tensor mask, int? dim=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor view(const Tensor & self, c10::SymIntArrayRef size); // {"schema": "aten::view(Tensor(a) self, SymInt[] size) -> Tensor(a)", "dispatch": "True", "default": "False"}
+Tensor view(const Tensor & self, ScalarType dtype); // {"schema": "aten::view.dtype(Tensor(a) self, ScalarType dtype) -> Tensor(a)", "dispatch": "True", "default": "True"}
+Tensor & put_(Tensor & self, const Tensor & index, const Tensor & source, bool accumulate); // {"schema": "aten::put_(Tensor(a!) self, Tensor index, Tensor source, bool accumulate=False) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor put(const Tensor & self, const Tensor & index, const Tensor & source, bool accumulate); // {"schema": "aten::put(Tensor self, Tensor index, Tensor source, bool accumulate=False) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & index_add_out(const Tensor & self, int64_t dim, const Tensor & index, const Tensor & source, const Scalar & alpha, Tensor & out); // {"schema": "aten::index_add.out(Tensor self, int dim, Tensor index, Tensor source, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & index_add_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source, const Scalar & alpha); // {"schema": "aten::index_add_(Tensor(a!) self, int dim, Tensor index, Tensor source, *, Scalar alpha=1) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor index_add(const Tensor & self, int64_t dim, const Tensor & index, const Tensor & source, const Scalar & alpha); // {"schema": "aten::index_add(Tensor self, int dim, Tensor index, Tensor source, *, Scalar alpha=1) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor index_add(const Tensor & self, Dimname dim, const Tensor & index, const Tensor & source, const Scalar & alpha); // {"schema": "aten::index_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor source, *, Scalar alpha=1) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & index_reduce_out(const Tensor & self, int64_t dim, const Tensor & index, const Tensor & source, c10::string_view reduce, bool include_self, Tensor & out); // {"schema": "aten::index_reduce.out(Tensor self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & index_reduce_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source, c10::string_view reduce, bool include_self); // {"schema": "aten::index_reduce_(Tensor(a!) self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor index_reduce(const Tensor & self, int64_t dim, const Tensor & index, const Tensor & source, c10::string_view reduce, bool include_self); // {"schema": "aten::index_reduce(Tensor self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True) -> Tensor", "dispatch": "True", "default": "True"}
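+// Illustrative sketch (not part of the generated declarations; assumes ATen is available): index_add
+// accumulates source rows into the rows of self selected by index; index_reduce applies a named reduction.
+//   at::Tensor self  = at::zeros({5, 3});
+//   at::Tensor index = at::arange(0, 5, 2);                           // {0, 2, 4}, int64
+//   at::Tensor src   = at::ones({3, 3});
+//   at::Tensor added = self.index_add(0, index, src, /*alpha=*/2);    // rows 0,2,4 += 2 * src rows
+//   at::Tensor red   = self.index_reduce(0, index, src, "amax", /*include_self=*/true);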
+Tensor & index_fill_(Tensor & self, int64_t dim, const Tensor & index, const Scalar & value); // {"schema": "aten::index_fill_.int_Scalar(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor index_fill(const Tensor & self, int64_t dim, const Tensor & index, const Scalar & value); // {"schema": "aten::index_fill.int_Scalar(Tensor self, int dim, Tensor index, Scalar value) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & index_fill_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & value); // {"schema": "aten::index_fill_.int_Tensor(Tensor(a!) self, int dim, Tensor index, Tensor value) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor index_fill(const Tensor & self, int64_t dim, const Tensor & index, const Tensor & value); // {"schema": "aten::index_fill.int_Tensor(Tensor self, int dim, Tensor index, Tensor value) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & index_fill_(Tensor & self, Dimname dim, const Tensor & index, const Scalar & value); // {"schema": "aten::index_fill_.Dimname_Scalar(Tensor(a!) self, Dimname dim, Tensor index, Scalar value) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & index_fill_(Tensor & self, Dimname dim, const Tensor & index, const Tensor & value); // {"schema": "aten::index_fill_.Dimname_Tensor(Tensor(a!) self, Dimname dim, Tensor index, Tensor value) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor index_fill(const Tensor & self, Dimname dim, const Tensor & index, const Scalar & value); // {"schema": "aten::index_fill.Dimname_Scalar(Tensor self, Dimname dim, Tensor index, Scalar value) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor index_fill(const Tensor & self, Dimname dim, const Tensor & index, const Tensor & value); // {"schema": "aten::index_fill.Dimname_Tensor(Tensor self, Dimname dim, Tensor index, Tensor value) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor scatter(const Tensor & self, int64_t dim, const Tensor & index, const Tensor & src); // {"schema": "aten::scatter.src(Tensor self, int dim, Tensor index, Tensor src) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & scatter_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & src); // {"schema": "aten::scatter_.src(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & scatter_out(const Tensor & self, int64_t dim, const Tensor & index, const Tensor & src, Tensor & out); // {"schema": "aten::scatter.src_out(Tensor self, int dim, Tensor index, Tensor src, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor scatter(const Tensor & self, int64_t dim, const Tensor & index, const Scalar & value); // {"schema": "aten::scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & scatter_(Tensor & self, int64_t dim, const Tensor & index, const Scalar & value); // {"schema": "aten::scatter_.value(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & scatter_out(const Tensor & self, int64_t dim, const Tensor & index, const Scalar & value, Tensor & out); // {"schema": "aten::scatter.value_out(Tensor self, int dim, Tensor index, Scalar value, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor scatter(const Tensor & self, int64_t dim, const Tensor & index, const Tensor & src, c10::string_view reduce); // {"schema": "aten::scatter.reduce(Tensor self, int dim, Tensor index, Tensor src, *, str reduce) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & scatter_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & src, c10::string_view reduce); // {"schema": "aten::scatter_.reduce(Tensor(a!) self, int dim, Tensor index, Tensor src, *, str reduce) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & scatter_out(const Tensor & self, int64_t dim, const Tensor & index, const Tensor & src, c10::string_view reduce, Tensor & out); // {"schema": "aten::scatter.reduce_out(Tensor self, int dim, Tensor index, Tensor src, *, str reduce, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor scatter(const Tensor & self, int64_t dim, const Tensor & index, const Scalar & value, c10::string_view reduce); // {"schema": "aten::scatter.value_reduce(Tensor self, int dim, Tensor index, Scalar value, *, str reduce) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & scatter_(Tensor & self, int64_t dim, const Tensor & index, const Scalar & value, c10::string_view reduce); // {"schema": "aten::scatter_.value_reduce(Tensor(a!) self, int dim, Tensor index, Scalar value, *, str reduce) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & scatter_out(const Tensor & self, int64_t dim, const Tensor & index, const Scalar & value, c10::string_view reduce, Tensor & out); // {"schema": "aten::scatter.value_reduce_out(Tensor self, int dim, Tensor index, Scalar value, *, str reduce, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor scatter(const Tensor & self, Dimname dim, const Tensor & index, const Tensor & src); // {"schema": "aten::scatter.dimname_src(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor scatter(const Tensor & self, Dimname dim, const Tensor & index, const Scalar & value); // {"schema": "aten::scatter.dimname_value(Tensor self, Dimname dim, Tensor index, Scalar value) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor scatter_add(const Tensor & self, int64_t dim, const Tensor & index, const Tensor & src); // {"schema": "aten::scatter_add(Tensor self, int dim, Tensor index, Tensor src) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & scatter_add_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & src); // {"schema": "aten::scatter_add_(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & scatter_add_out(const Tensor & self, int64_t dim, const Tensor & index, const Tensor & src, Tensor & out); // {"schema": "aten::scatter_add.out(Tensor self, int dim, Tensor index, Tensor src, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor scatter_add(const Tensor & self, Dimname dim, const Tensor & index, const Tensor & src); // {"schema": "aten::scatter_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor scatter_reduce(const Tensor & self, int64_t dim, const Tensor & index, const Tensor & src, c10::string_view reduce, bool include_self); // {"schema": "aten::scatter_reduce.two(Tensor self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & scatter_reduce_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & src, c10::string_view reduce, bool include_self); // {"schema": "aten::scatter_reduce_.two(Tensor(a!) self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & scatter_reduce_out(const Tensor & self, int64_t dim, const Tensor & index, const Tensor & src, c10::string_view reduce, bool include_self, Tensor & out); // {"schema": "aten::scatter_reduce.two_out(Tensor self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & eq_(Tensor & self, const Scalar & other); // {"schema": "aten::eq_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & eq_(Tensor & self, const Tensor & other); // {"schema": "aten::eq_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & bitwise_and_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::bitwise_and.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & bitwise_and_out(const Tensor & self, const Scalar & other, Tensor & out); // {"schema": "aten::bitwise_and.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor bitwise_and(const Tensor & self, const Scalar & other); // {"schema": "aten::bitwise_and.Scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor bitwise_and(const Scalar & self, const Tensor & other); // {"schema": "aten::bitwise_and.Scalar_Tensor(Scalar self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor bitwise_and(const Tensor & self, const Tensor & other); // {"schema": "aten::bitwise_and.Tensor(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & bitwise_and_(Tensor & self, const Scalar & other); // {"schema": "aten::bitwise_and_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & bitwise_and_(Tensor & self, const Tensor & other); // {"schema": "aten::bitwise_and_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor __and__(const Tensor & self, const Scalar & other); // {"schema": "aten::__and__.Scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor __and__(const Tensor & self, const Tensor & other); // {"schema": "aten::__and__.Tensor(Tensor self, Tensor other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & __iand__(Tensor & self, const Scalar & other); // {"schema": "aten::__iand__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & __iand__(Tensor & self, const Tensor & other); // {"schema": "aten::__iand__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & bitwise_or_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::bitwise_or.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & bitwise_or_out(const Tensor & self, const Scalar & other, Tensor & out); // {"schema": "aten::bitwise_or.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor bitwise_or(const Tensor & self, const Scalar & other); // {"schema": "aten::bitwise_or.Scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor bitwise_or(const Scalar & self, const Tensor & other); // {"schema": "aten::bitwise_or.Scalar_Tensor(Scalar self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor bitwise_or(const Tensor & self, const Tensor & other); // {"schema": "aten::bitwise_or.Tensor(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & bitwise_or_(Tensor & self, const Scalar & other); // {"schema": "aten::bitwise_or_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & bitwise_or_(Tensor & self, const Tensor & other); // {"schema": "aten::bitwise_or_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor __or__(const Tensor & self, const Scalar & other); // {"schema": "aten::__or__.Scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor __or__(const Tensor & self, const Tensor & other); // {"schema": "aten::__or__.Tensor(Tensor self, Tensor other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & __ior__(Tensor & self, const Scalar & other); // {"schema": "aten::__ior__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & __ior__(Tensor & self, const Tensor & other); // {"schema": "aten::__ior__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & bitwise_xor_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::bitwise_xor.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & bitwise_xor_out(const Tensor & self, const Scalar & other, Tensor & out); // {"schema": "aten::bitwise_xor.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor bitwise_xor(const Tensor & self, const Scalar & other); // {"schema": "aten::bitwise_xor.Scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor bitwise_xor(const Scalar & self, const Tensor & other); // {"schema": "aten::bitwise_xor.Scalar_Tensor(Scalar self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor bitwise_xor(const Tensor & self, const Tensor & other); // {"schema": "aten::bitwise_xor.Tensor(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & bitwise_xor_(Tensor & self, const Scalar & other); // {"schema": "aten::bitwise_xor_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & bitwise_xor_(Tensor & self, const Tensor & other); // {"schema": "aten::bitwise_xor_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor __xor__(const Tensor & self, const Scalar & other); // {"schema": "aten::__xor__.Scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor __xor__(const Tensor & self, const Tensor & other); // {"schema": "aten::__xor__.Tensor(Tensor self, Tensor other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & __ixor__(Tensor & self, const Scalar & other); // {"schema": "aten::__ixor__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & __ixor__(Tensor & self, const Tensor & other); // {"schema": "aten::__ixor__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor __lshift__(const Tensor & self, const Scalar & other); // {"schema": "aten::__lshift__.Scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor __lshift__(const Tensor & self, const Tensor & other); // {"schema": "aten::__lshift__.Tensor(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & __ilshift__(Tensor & self, const Scalar & other); // {"schema": "aten::__ilshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & __ilshift__(Tensor & self, const Tensor & other); // {"schema": "aten::__ilshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor bitwise_left_shift(const Tensor & self, const Tensor & other); // {"schema": "aten::bitwise_left_shift.Tensor(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & bitwise_left_shift_(Tensor & self, const Tensor & other); // {"schema": "aten::bitwise_left_shift_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & bitwise_left_shift_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::bitwise_left_shift.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor bitwise_left_shift(const Tensor & self, const Scalar & other); // {"schema": "aten::bitwise_left_shift.Tensor_Scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & bitwise_left_shift_(Tensor & self, const Scalar & other); // {"schema": "aten::bitwise_left_shift_.Tensor_Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & bitwise_left_shift_out(const Tensor & self, const Scalar & other, Tensor & out); // {"schema": "aten::bitwise_left_shift.Tensor_Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor bitwise_left_shift(const Scalar & self, const Tensor & other); // {"schema": "aten::bitwise_left_shift.Scalar_Tensor(Scalar self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor __rshift__(const Tensor & self, const Scalar & other); // {"schema": "aten::__rshift__.Scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor __rshift__(const Tensor & self, const Tensor & other); // {"schema": "aten::__rshift__.Tensor(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & __irshift__(Tensor & self, const Scalar & other); // {"schema": "aten::__irshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & __irshift__(Tensor & self, const Tensor & other); // {"schema": "aten::__irshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor bitwise_right_shift(const Tensor & self, const Tensor & other); // {"schema": "aten::bitwise_right_shift.Tensor(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & bitwise_right_shift_(Tensor & self, const Tensor & other); // {"schema": "aten::bitwise_right_shift_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & bitwise_right_shift_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::bitwise_right_shift.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor bitwise_right_shift(const Tensor & self, const Scalar & other); // {"schema": "aten::bitwise_right_shift.Tensor_Scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & bitwise_right_shift_(Tensor & self, const Scalar & other); // {"schema": "aten::bitwise_right_shift_.Tensor_Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & bitwise_right_shift_out(const Tensor & self, const Scalar & other, Tensor & out); // {"schema": "aten::bitwise_right_shift.Tensor_Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor bitwise_right_shift(const Scalar & self, const Tensor & other); // {"schema": "aten::bitwise_right_shift.Scalar_Tensor(Scalar self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & tril_(Tensor & self, int64_t diagonal); // {"schema": "aten::tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & triu_(Tensor & self, int64_t diagonal); // {"schema": "aten::triu_(Tensor(a!) self, int diagonal=0) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & digamma_(Tensor & self); // {"schema": "aten::digamma_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & lerp_(Tensor & self, const Tensor & end, const Scalar & weight); // {"schema": "aten::lerp_.Scalar(Tensor(a!) self, Tensor end, Scalar weight) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & lerp_(Tensor & self, const Tensor & end, const Tensor & weight); // {"schema": "aten::lerp_.Tensor(Tensor(a!) self, Tensor end, Tensor weight) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & addbmm_(Tensor & self, const Tensor & batch1, const Tensor & batch2, const Scalar & beta, const Scalar & alpha); // {"schema": "aten::addbmm_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & addbmm_out(const Tensor & self, const Tensor & batch1, const Tensor & batch2, const Scalar & beta, const Scalar & alpha, Tensor & out); // {"schema": "aten::addbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor addbmm(const Tensor & self, const Tensor & batch1, const Tensor & batch2, const Scalar & beta, const Scalar & alpha); // {"schema": "aten::addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & random_(Tensor & self, int64_t from, c10::optional<int64_t> to, c10::optional<Generator> generator); // {"schema": "aten::random_.from(Tensor(a!) self, int from, int? to, *, Generator? generator=None) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & random_(Tensor & self, int64_t to, c10::optional<Generator> generator); // {"schema": "aten::random_.to(Tensor(a!) self, int to, *, Generator? generator=None) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & random_(Tensor & self, c10::optional<Generator> generator); // {"schema": "aten::random_(Tensor(a!) self, *, Generator? generator=None) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & uniform_(Tensor & self, double from, double to, c10::optional<Generator> generator); // {"schema": "aten::uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & cauchy_(Tensor & self, double median, double sigma, c10::optional<Generator> generator); // {"schema": "aten::cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & log_normal_(Tensor & self, double mean, double std, c10::optional<Generator> generator); // {"schema": "aten::log_normal_(Tensor(a!) self, float mean=1, float std=2, *, Generator? generator=None) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & exponential_(Tensor & self, double lambd, c10::optional<Generator> generator); // {"schema": "aten::exponential_(Tensor(a!) self, float lambd=1, *, Generator? generator=None) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & geometric_(Tensor & self, double p, c10::optional<Generator> generator); // {"schema": "aten::geometric_(Tensor(a!) self, float p, *, Generator? generator=None) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & diag_out(const Tensor & self, int64_t diagonal, Tensor & out); // {"schema": "aten::diag.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor diag(const Tensor & self, int64_t diagonal); // {"schema": "aten::diag(Tensor self, int diagonal=0) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & cross_out(const Tensor & self, const Tensor & other, c10::optional<int64_t> dim, Tensor & out); // {"schema": "aten::cross.out(Tensor self, Tensor other, int? dim=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor cross(const Tensor & self, const Tensor & other, c10::optional<int64_t> dim); // {"schema": "aten::cross(Tensor self, Tensor other, int? dim=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & triu_out(const Tensor & self, int64_t diagonal, Tensor & out); // {"schema": "aten::triu.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor triu(const Tensor & self, int64_t diagonal); // {"schema": "aten::triu(Tensor self, int diagonal=0) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & tril_out(const Tensor & self, int64_t diagonal, Tensor & out); // {"schema": "aten::tril.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor tril(const Tensor & self, int64_t diagonal); // {"schema": "aten::tril(Tensor self, int diagonal=0) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor tril_indices(int64_t row, int64_t col, int64_t offset, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::tril_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor triu_indices(int64_t row, int64_t col, int64_t offset, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::triu_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor trace(const Tensor & self); // {"schema": "aten::trace(Tensor self) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor trace_backward(const Tensor & grad, c10::SymIntArrayRef sizes); // {"schema": "aten::trace_backward(Tensor grad, SymInt[] sizes) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & ne_out(const Tensor & self, const Scalar & other, Tensor & out); // {"schema": "aten::ne.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor ne(const Tensor & self, const Scalar & other); // {"schema": "aten::ne.Scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & ne_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::ne.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor ne(const Tensor & self, const Tensor & other); // {"schema": "aten::ne.Tensor(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & ne_(Tensor & self, const Scalar & other); // {"schema": "aten::ne_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & ne_(Tensor & self, const Tensor & other); // {"schema": "aten::ne_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & not_equal_out(const Tensor & self, const Scalar & other, Tensor & out); // {"schema": "aten::not_equal.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor not_equal(const Tensor & self, const Scalar & other); // {"schema": "aten::not_equal.Scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & not_equal_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::not_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor not_equal(const Tensor & self, const Tensor & other); // {"schema": "aten::not_equal.Tensor(Tensor self, Tensor other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & not_equal_(Tensor & self, const Scalar & other); // {"schema": "aten::not_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & not_equal_(Tensor & self, const Tensor & other); // {"schema": "aten::not_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & eq_out(const Tensor & self, const Scalar & other, Tensor & out); // {"schema": "aten::eq.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor eq(const Tensor & self, const Scalar & other); // {"schema": "aten::eq.Scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & eq_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::eq.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor eq(const Tensor & self, const Tensor & other); // {"schema": "aten::eq.Tensor(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & ge_out(const Tensor & self, const Scalar & other, Tensor & out); // {"schema": "aten::ge.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor ge(const Tensor & self, const Scalar & other); // {"schema": "aten::ge.Scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & ge_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::ge.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor ge(const Tensor & self, const Tensor & other); // {"schema": "aten::ge.Tensor(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & ge_(Tensor & self, const Scalar & other); // {"schema": "aten::ge_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & ge_(Tensor & self, const Tensor & other); // {"schema": "aten::ge_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & greater_equal_out(const Tensor & self, const Scalar & other, Tensor & out); // {"schema": "aten::greater_equal.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor greater_equal(const Tensor & self, const Scalar & other); // {"schema": "aten::greater_equal.Scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & greater_equal_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::greater_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor greater_equal(const Tensor & self, const Tensor & other); // {"schema": "aten::greater_equal.Tensor(Tensor self, Tensor other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & greater_equal_(Tensor & self, const Scalar & other); // {"schema": "aten::greater_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & greater_equal_(Tensor & self, const Tensor & other); // {"schema": "aten::greater_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & le_out(const Tensor & self, const Scalar & other, Tensor & out); // {"schema": "aten::le.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor le(const Tensor & self, const Scalar & other); // {"schema": "aten::le.Scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & le_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::le.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor le(const Tensor & self, const Tensor & other); // {"schema": "aten::le.Tensor(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & le_(Tensor & self, const Scalar & other); // {"schema": "aten::le_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & le_(Tensor & self, const Tensor & other); // {"schema": "aten::le_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & less_equal_out(const Tensor & self, const Scalar & other, Tensor & out); // {"schema": "aten::less_equal.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor less_equal(const Tensor & self, const Scalar & other); // {"schema": "aten::less_equal.Scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & less_equal_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::less_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor less_equal(const Tensor & self, const Tensor & other); // {"schema": "aten::less_equal.Tensor(Tensor self, Tensor other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & less_equal_(Tensor & self, const Scalar & other); // {"schema": "aten::less_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & less_equal_(Tensor & self, const Tensor & other); // {"schema": "aten::less_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & gt_out(const Tensor & self, const Scalar & other, Tensor & out); // {"schema": "aten::gt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor gt(const Tensor & self, const Scalar & other); // {"schema": "aten::gt.Scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & gt_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::gt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor gt(const Tensor & self, const Tensor & other); // {"schema": "aten::gt.Tensor(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & gt_(Tensor & self, const Scalar & other); // {"schema": "aten::gt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & gt_(Tensor & self, const Tensor & other); // {"schema": "aten::gt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & greater_out(const Tensor & self, const Scalar & other, Tensor & out); // {"schema": "aten::greater.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor greater(const Tensor & self, const Scalar & other); // {"schema": "aten::greater.Scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & greater_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::greater.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor greater(const Tensor & self, const Tensor & other); // {"schema": "aten::greater.Tensor(Tensor self, Tensor other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & greater_(Tensor & self, const Scalar & other); // {"schema": "aten::greater_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & greater_(Tensor & self, const Tensor & other); // {"schema": "aten::greater_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & lt_out(const Tensor & self, const Scalar & other, Tensor & out); // {"schema": "aten::lt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor lt(const Tensor & self, const Scalar & other); // {"schema": "aten::lt.Scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & lt_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::lt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor lt(const Tensor & self, const Tensor & other); // {"schema": "aten::lt.Tensor(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & lt_(Tensor & self, const Scalar & other); // {"schema": "aten::lt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & lt_(Tensor & self, const Tensor & other); // {"schema": "aten::lt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & less_out(const Tensor & self, const Scalar & other, Tensor & out); // {"schema": "aten::less.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor less(const Tensor & self, const Scalar & other); // {"schema": "aten::less.Scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & less_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::less.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor less(const Tensor & self, const Tensor & other); // {"schema": "aten::less.Tensor(Tensor self, Tensor other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & less_(Tensor & self, const Scalar & other); // {"schema": "aten::less_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & less_(Tensor & self, const Tensor & other); // {"schema": "aten::less_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & take_out(const Tensor & self, const Tensor & index, Tensor & out); // {"schema": "aten::take.out(Tensor self, Tensor index, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor take(const Tensor & self, const Tensor & index); // {"schema": "aten::take(Tensor self, Tensor index) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & take_along_dim_out(const Tensor & self, const Tensor & indices, c10::optional<int64_t> dim, Tensor & out); // {"schema": "aten::take_along_dim.out(Tensor self, Tensor indices, int? dim=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor take_along_dim(const Tensor & self, const Tensor & indices, c10::optional<int64_t> dim); // {"schema": "aten::take_along_dim(Tensor self, Tensor indices, int? dim=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & index_select_out(const Tensor & self, int64_t dim, const Tensor & index, Tensor & out); // {"schema": "aten::index_select.out(Tensor self, int dim, Tensor index, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor index_select(const Tensor & self, int64_t dim, const Tensor & index); // {"schema": "aten::index_select(Tensor self, int dim, Tensor index) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & index_select_out(const Tensor & self, Dimname dim, const Tensor & index, Tensor & out); // {"schema": "aten::index_select.dimname_out(Tensor self, Dimname dim, Tensor index, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor index_select(const Tensor & self, Dimname dim, const Tensor & index); // {"schema": "aten::index_select.dimname(Tensor self, Dimname dim, Tensor index) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor index_select_backward(const Tensor & grad, c10::SymIntArrayRef self_sizes, int64_t dim, const Tensor & index); // {"schema": "aten::index_select_backward(Tensor grad, SymInt[] self_sizes, int dim, Tensor index) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & masked_select_out(const Tensor & self, const Tensor & mask, Tensor & out); // {"schema": "aten::masked_select.out(Tensor self, Tensor mask, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor masked_select(const Tensor & self, const Tensor & mask); // {"schema": "aten::masked_select(Tensor self, Tensor mask) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor masked_select_backward(const Tensor & grad, const Tensor & input, const Tensor & mask); // {"schema": "aten::masked_select_backward(Tensor grad, Tensor input, Tensor mask) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & nonzero_out(const Tensor & self, Tensor & out); // {"schema": "aten::nonzero.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor nonzero(const Tensor & self); // {"schema": "aten::nonzero(Tensor self) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & nonzero_static_out(const Tensor & self, int64_t size, int64_t fill_value, Tensor & out); // {"schema": "aten::nonzero_static.out(Tensor self, *, int size, int fill_value=-1, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor nonzero_static(const Tensor & self, int64_t size, int64_t fill_value); // {"schema": "aten::nonzero_static(Tensor self, *, int size, int fill_value=-1) -> Tensor", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> nonzero_numpy(const Tensor & self); // {"schema": "aten::nonzero_numpy(Tensor self) -> Tensor[]", "dispatch": "False", "default": "True"}
+Tensor argwhere(const Tensor & self); // {"schema": "aten::argwhere(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & gather_out(const Tensor & self, int64_t dim, const Tensor & index, bool sparse_grad, Tensor & out); // {"schema": "aten::gather.out(Tensor self, int dim, Tensor index, *, bool sparse_grad=False, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor gather(const Tensor & self, int64_t dim, const Tensor & index, bool sparse_grad); // {"schema": "aten::gather(Tensor self, int dim, Tensor index, *, bool sparse_grad=False) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor gather_backward(const Tensor & grad, const Tensor & self, int64_t dim, const Tensor & index, bool sparse_grad); // {"schema": "aten::gather_backward(Tensor grad, Tensor self, int dim, Tensor index, bool sparse_grad) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & gather_out(const Tensor & self, Dimname dim, const Tensor & index, bool sparse_grad, Tensor & out); // {"schema": "aten::gather.dimname_out(Tensor self, Dimname dim, Tensor index, *, bool sparse_grad=False, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor gather(const Tensor & self, Dimname dim, const Tensor & index, bool sparse_grad); // {"schema": "aten::gather.dimname(Tensor self, Dimname dim, Tensor index, *, bool sparse_grad=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _gather_sparse_backward(const Tensor & self, int64_t dim, const Tensor & index, const Tensor & grad); // {"schema": "aten::_gather_sparse_backward(Tensor self, int dim, Tensor index, Tensor grad) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & addcmul_out(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, const Scalar & value, Tensor & out); // {"schema": "aten::addcmul.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor addcmul(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, const Scalar & value); // {"schema": "aten::addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & addcmul_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, const Scalar & value); // {"schema": "aten::addcmul_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & addcdiv_out(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, const Scalar & value, Tensor & out); // {"schema": "aten::addcdiv.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor addcdiv(const Tensor & self, const Tensor & tensor1, const Tensor & tensor2, const Scalar & value); // {"schema": "aten::addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & addcdiv_(Tensor & self, const Tensor & tensor1, const Tensor & tensor2, const Scalar & value); // {"schema": "aten::addcdiv_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor cross_entropy_loss(const Tensor & self, const Tensor & target, const c10::optional<Tensor> & weight, int64_t reduction, c10::SymInt ignore_index, double label_smoothing); // {"schema": "aten::cross_entropy_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100, float label_smoothing=0.0) -> Tensor", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor &,Tensor &> triangular_solve_out(const Tensor & self, const Tensor & A, bool upper, bool transpose, bool unitriangular, Tensor & X, Tensor & M); // {"schema": "aten::triangular_solve.X(Tensor self, Tensor A, bool upper=True, bool transpose=False, bool unitriangular=False, *, Tensor(a!) X, Tensor(b!) M) -> (Tensor(a!) solution, Tensor(b!) cloned_coefficient)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> triangular_solve(const Tensor & self, const Tensor & A, bool upper, bool transpose, bool unitriangular); // {"schema": "aten::triangular_solve(Tensor self, Tensor A, bool upper=True, bool transpose=False, bool unitriangular=False) -> (Tensor solution, Tensor cloned_coefficient)", "dispatch": "True", "default": "True"}
+void _linalg_check_errors(const Tensor & info, c10::string_view api_name, bool is_matrix); // {"schema": "aten::_linalg_check_errors(Tensor info, str api_name, *, bool is_matrix) -> ()", "dispatch": "True", "default": "True"}
+Tensor & linalg_solve_triangular_out(const Tensor & self, const Tensor & B, bool upper, bool left, bool unitriangular, Tensor & out); // {"schema": "aten::linalg_solve_triangular.out(Tensor self, Tensor B, *, bool upper, bool left=True, bool unitriangular=False, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor linalg_solve_triangular(const Tensor & self, const Tensor & B, bool upper, bool left, bool unitriangular); // {"schema": "aten::linalg_solve_triangular(Tensor self, Tensor B, *, bool upper, bool left=True, bool unitriangular=False) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor linalg_vander(const Tensor & x, c10::optional<c10::SymInt> N); // {"schema": "aten::linalg_vander(Tensor x, *, SymInt? N=None) -> Tensor", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor &,Tensor &,Tensor &> svd_out(const Tensor & self, bool some, bool compute_uv, Tensor & U, Tensor & S, Tensor & V); // {"schema": "aten::svd.U(Tensor self, bool some=True, bool compute_uv=True, *, Tensor(a!) U, Tensor(b!) S, Tensor(c!) V) -> (Tensor(a!) U, Tensor(b!) S, Tensor(c!) V)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor,Tensor> svd(const Tensor & self, bool some, bool compute_uv); // {"schema": "aten::svd(Tensor self, bool some=True, bool compute_uv=True) -> (Tensor U, Tensor S, Tensor V)", "dispatch": "False", "default": "True"}
+Tensor swapaxes(const Tensor & self, int64_t axis0, int64_t axis1); // {"schema": "aten::swapaxes(Tensor(a) self, int axis0, int axis1) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor & swapaxes_(Tensor & self, int64_t axis0, int64_t axis1); // {"schema": "aten::swapaxes_(Tensor(a!) self, int axis0, int axis1) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor swapdims(const Tensor & self, int64_t dim0, int64_t dim1); // {"schema": "aten::swapdims(Tensor(a) self, int dim0, int dim1) -> Tensor(a)", "dispatch": "False", "default": "True"}
+Tensor & swapdims_(Tensor & self, int64_t dim0, int64_t dim1); // {"schema": "aten::swapdims_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & cholesky_out(const Tensor & self, bool upper, Tensor & out); // {"schema": "aten::cholesky.out(Tensor self, bool upper=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor cholesky(const Tensor & self, bool upper); // {"schema": "aten::cholesky(Tensor self, bool upper=False) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & cholesky_solve_out(const Tensor & self, const Tensor & input2, bool upper, Tensor & out); // {"schema": "aten::cholesky_solve.out(Tensor self, Tensor input2, bool upper=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor cholesky_solve(const Tensor & self, const Tensor & input2, bool upper); // {"schema": "aten::cholesky_solve(Tensor self, Tensor input2, bool upper=False) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _cholesky_solve_helper(const Tensor & self, const Tensor & A, bool upper); // {"schema": "aten::_cholesky_solve_helper(Tensor self, Tensor A, bool upper) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor cholesky_inverse(const Tensor & self, bool upper); // {"schema": "aten::cholesky_inverse(Tensor self, bool upper=False) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & cholesky_inverse_out(const Tensor & self, bool upper, Tensor & out); // {"schema": "aten::cholesky_inverse.out(Tensor self, bool upper=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor &,Tensor &> qr_out(const Tensor & self, bool some, Tensor & Q, Tensor & R); // {"schema": "aten::qr.Q(Tensor self, bool some=True, *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> qr(const Tensor & self, bool some); // {"schema": "aten::qr(Tensor self, bool some=True) -> (Tensor Q, Tensor R)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor &,Tensor &> geqrf_out(const Tensor & self, Tensor & a, Tensor & tau); // {"schema": "aten::geqrf.a(Tensor self, *, Tensor(a!) a, Tensor(b!) tau) -> (Tensor(a!) a, Tensor(b!) tau)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> geqrf(const Tensor & self); // {"schema": "aten::geqrf(Tensor self) -> (Tensor a, Tensor tau)", "dispatch": "True", "default": "False"}
+Tensor orgqr(const Tensor & self, const Tensor & input2); // {"schema": "aten::orgqr(Tensor self, Tensor input2) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & orgqr_out(const Tensor & self, const Tensor & input2, Tensor & out); // {"schema": "aten::orgqr.out(Tensor self, Tensor input2, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & ormqr_out(const Tensor & self, const Tensor & input2, const Tensor & input3, bool left, bool transpose, Tensor & out); // {"schema": "aten::ormqr.out(Tensor self, Tensor input2, Tensor input3, bool left=True, bool transpose=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor ormqr(const Tensor & self, const Tensor & input2, const Tensor & input3, bool left, bool transpose); // {"schema": "aten::ormqr(Tensor self, Tensor input2, Tensor input3, bool left=True, bool transpose=False) -> Tensor", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor,Tensor> _lu_with_info(const Tensor & self, bool pivot, bool check_errors); // {"schema": "aten::_lu_with_info(Tensor self, bool pivot=True, bool check_errors=True) -> (Tensor LU, Tensor pivots, Tensor info)", "dispatch": "False", "default": "True"}
+Tensor & lu_solve_out(const Tensor & self, const Tensor & LU_data, const Tensor & LU_pivots, Tensor & out); // {"schema": "aten::lu_solve.out(Tensor self, Tensor LU_data, Tensor LU_pivots, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor lu_solve(const Tensor & self, const Tensor & LU_data, const Tensor & LU_pivots); // {"schema": "aten::lu_solve(Tensor self, Tensor LU_data, Tensor LU_pivots) -> Tensor", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor,Tensor> lu_unpack(const Tensor & LU_data, const Tensor & LU_pivots, bool unpack_data, bool unpack_pivots); // {"schema": "aten::lu_unpack(Tensor LU_data, Tensor LU_pivots, bool unpack_data=True, bool unpack_pivots=True) -> (Tensor P, Tensor L, Tensor U)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &,Tensor &> lu_unpack_out(const Tensor & LU_data, const Tensor & LU_pivots, bool unpack_data, bool unpack_pivots, Tensor & P, Tensor & L, Tensor & U); // {"schema": "aten::lu_unpack.out(Tensor LU_data, Tensor LU_pivots, bool unpack_data=True, bool unpack_pivots=True, *, Tensor(a!) P, Tensor(b!) L, Tensor(c!) U) -> (Tensor(a!) P, Tensor(b!) L, Tensor(c!) U)", "dispatch": "True", "default": "False"}
+Tensor & multinomial_out(const Tensor & self, int64_t num_samples, bool replacement, c10::optional<Generator> generator, Tensor & out); // {"schema": "aten::multinomial.out(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor multinomial(const Tensor & self, int64_t num_samples, bool replacement, c10::optional<Generator> generator); // {"schema": "aten::multinomial(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & lgamma_out(const Tensor & self, Tensor & out); // {"schema": "aten::lgamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & lgamma_(Tensor & self); // {"schema": "aten::lgamma_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor lgamma(const Tensor & self); // {"schema": "aten::lgamma(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & digamma_out(const Tensor & self, Tensor & out); // {"schema": "aten::digamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor digamma(const Tensor & self); // {"schema": "aten::digamma(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & polygamma_out(int64_t n, const Tensor & self, Tensor & out); // {"schema": "aten::polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor polygamma(int64_t n, const Tensor & self); // {"schema": "aten::polygamma(int n, Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & polygamma_(Tensor & self, int64_t n); // {"schema": "aten::polygamma_(Tensor(a!) self, int n) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor erfinv(const Tensor & self); // {"schema": "aten::erfinv(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & erfinv_(Tensor & self); // {"schema": "aten::erfinv_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & erfinv_out(const Tensor & self, Tensor & out); // {"schema": "aten::erfinv.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor i0(const Tensor & self); // {"schema": "aten::i0(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & i0_(Tensor & self); // {"schema": "aten::i0_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & i0_out(const Tensor & self, Tensor & out); // {"schema": "aten::i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor sign(const Tensor & self); // {"schema": "aten::sign(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & sign_(Tensor & self); // {"schema": "aten::sign_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & sign_out(const Tensor & self, Tensor & out); // {"schema": "aten::sign.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor signbit(const Tensor & self); // {"schema": "aten::signbit(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & signbit_out(const Tensor & self, Tensor & out); // {"schema": "aten::signbit.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor dist(const Tensor & self, const Tensor & other, const Scalar & p); // {"schema": "aten::dist(Tensor self, Tensor other, Scalar p=2) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & atan2_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::atan2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & atan2_(Tensor & self, const Tensor & other); // {"schema": "aten::atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor atan2(const Tensor & self, const Tensor & other); // {"schema": "aten::atan2(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor arctan2(const Tensor & self, const Tensor & other); // {"schema": "aten::arctan2(Tensor self, Tensor other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & arctan2_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::arctan2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & arctan2_(Tensor & self, const Tensor & other); // {"schema": "aten::arctan2_(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & lerp_out(const Tensor & self, const Tensor & end, const Scalar & weight, Tensor & out); // {"schema": "aten::lerp.Scalar_out(Tensor self, Tensor end, Scalar weight, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & lerp_out(const Tensor & self, const Tensor & end, const Tensor & weight, Tensor & out); // {"schema": "aten::lerp.Tensor_out(Tensor self, Tensor end, Tensor weight, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor lerp(const Tensor & self, const Tensor & end, const Scalar & weight); // {"schema": "aten::lerp.Scalar(Tensor self, Tensor end, Scalar weight) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor lerp(const Tensor & self, const Tensor & end, const Tensor & weight); // {"schema": "aten::lerp.Tensor(Tensor self, Tensor end, Tensor weight) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & histc_out(const Tensor & self, int64_t bins, const Scalar & min, const Scalar & max, Tensor & out); // {"schema": "aten::histc.out(Tensor self, int bins=100, Scalar min=0, Scalar max=0, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor histc(const Tensor & self, int64_t bins, const Scalar & min, const Scalar & max); // {"schema": "aten::histc(Tensor self, int bins=100, Scalar min=0, Scalar max=0) -> Tensor", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor &,Tensor &> histogram_out(const Tensor & self, const Tensor & bins, const c10::optional<Tensor> & weight, bool density, Tensor & hist, Tensor & bin_edges); // {"schema": "aten::histogram.bins_tensor_out(Tensor self, Tensor bins, *, Tensor? weight=None, bool density=False, Tensor(a!) hist, Tensor(b!) bin_edges) -> (Tensor(a!) hist, Tensor(b!) bin_edges)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> histogram(const Tensor & self, const Tensor & bins, const c10::optional<Tensor> & weight, bool density); // {"schema": "aten::histogram.bins_tensor(Tensor self, Tensor bins, *, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor bin_edges)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor &,Tensor &> histogram_out(const Tensor & self, int64_t bins, c10::optional<ArrayRef<double>> range, const c10::optional<Tensor> & weight, bool density, Tensor & hist, Tensor & bin_edges); // {"schema": "aten::histogram.bin_ct_out(Tensor self, int bins=100, *, float[]? range=None, Tensor? weight=None, bool density=False, Tensor(a!) hist, Tensor(b!) bin_edges) -> (Tensor(a!) hist, Tensor(b!) bin_edges)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> histogram(const Tensor & self, int64_t bins, c10::optional<ArrayRef<double>> range, const c10::optional<Tensor> & weight, bool density); // {"schema": "aten::histogram.bin_ct(Tensor self, int bins=100, *, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor bin_edges)", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _histogramdd_bin_edges(const Tensor & self, IntArrayRef bins, c10::optional<ArrayRef<double>> range, const c10::optional<Tensor> & weight, bool density); // {"schema": "aten::_histogramdd_bin_edges(Tensor self, int[] bins, *, float[]? range=None, Tensor? weight=None, bool density=False) -> Tensor[]", "dispatch": "True", "default": "False"}
+Tensor _histogramdd_from_bin_cts(const Tensor & self, IntArrayRef bins, c10::optional<ArrayRef<double>> range, const c10::optional<Tensor> & weight, bool density); // {"schema": "aten::_histogramdd_from_bin_cts(Tensor self, int[] bins, *, float[]? range=None, Tensor? weight=None, bool density=False) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _histogramdd_from_bin_tensors(const Tensor & self, TensorList bins, const c10::optional<Tensor> & weight, bool density); // {"schema": "aten::_histogramdd_from_bin_tensors(Tensor self, Tensor[] bins, *, Tensor? weight=None, bool density=False) -> Tensor", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,::std::vector<Tensor>> histogramdd(const Tensor & self, IntArrayRef bins, c10::optional<ArrayRef<double>> range, const c10::optional<Tensor> & weight, bool density); // {"schema": "aten::histogramdd(Tensor self, int[] bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,::std::vector<Tensor>> histogramdd(const Tensor & self, int64_t bins, c10::optional<ArrayRef<double>> range, const c10::optional<Tensor> & weight, bool density); // {"schema": "aten::histogramdd.int_bins(Tensor self, int bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,::std::vector<Tensor>> histogramdd(const Tensor & self, TensorList bins, c10::optional<ArrayRef<double>> range, const c10::optional<Tensor> & weight, bool density); // {"schema": "aten::histogramdd.TensorList_bins(Tensor self, Tensor[] bins, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor[] bin_edges)", "dispatch": "False", "default": "True"}
+Tensor & fmod_out(const Tensor & self, const Scalar & other, Tensor & out); // {"schema": "aten::fmod.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor fmod(const Tensor & self, const Scalar & other); // {"schema": "aten::fmod.Scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & fmod_(Tensor & self, const Scalar & other); // {"schema": "aten::fmod_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & fmod_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::fmod.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor fmod(const Tensor & self, const Tensor & other); // {"schema": "aten::fmod.Tensor(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & fmod_(Tensor & self, const Tensor & other); // {"schema": "aten::fmod_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & hypot_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::hypot.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor hypot(const Tensor & self, const Tensor & other); // {"schema": "aten::hypot(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & hypot_(Tensor & self, const Tensor & other); // {"schema": "aten::hypot_(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & igamma_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::igamma.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor igamma(const Tensor & self, const Tensor & other); // {"schema": "aten::igamma(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & igamma_(Tensor & self, const Tensor & other); // {"schema": "aten::igamma_(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & igammac_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::igammac.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor igammac(const Tensor & self, const Tensor & other); // {"schema": "aten::igammac(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & igammac_(Tensor & self, const Tensor & other); // {"schema": "aten::igammac_(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & nextafter_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::nextafter.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor nextafter(const Tensor & self, const Tensor & other); // {"schema": "aten::nextafter(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & nextafter_(Tensor & self, const Tensor & other); // {"schema": "aten::nextafter_(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & remainder_out(const Tensor & self, const Scalar & other, Tensor & out); // {"schema": "aten::remainder.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor remainder(const Tensor & self, const Scalar & other); // {"schema": "aten::remainder.Scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & remainder_(Tensor & self, const Scalar & other); // {"schema": "aten::remainder_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & remainder_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::remainder.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor remainder(const Tensor & self, const Tensor & other); // {"schema": "aten::remainder.Tensor(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & remainder_(Tensor & self, const Tensor & other); // {"schema": "aten::remainder_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor remainder(const Scalar & self, const Tensor & other); // {"schema": "aten::remainder.Scalar_Tensor(Scalar self, Tensor other) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor min(const Tensor & self); // {"schema": "aten::min(Tensor self) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & min_out(const Tensor & self, Tensor & out); // {"schema": "aten::min.unary_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor fmin(const Tensor & self, const Tensor & other); // {"schema": "aten::fmin(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & fmin_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::fmin.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor max(const Tensor & self); // {"schema": "aten::max(Tensor self) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor fmax(const Tensor & self, const Tensor & other); // {"schema": "aten::fmax(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & fmax_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::fmax.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor maximum(const Tensor & self, const Tensor & other); // {"schema": "aten::maximum(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & maximum_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::maximum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor max(const Tensor & self, const Tensor & other); // {"schema": "aten::max.other(Tensor self, Tensor other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & max_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::max.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & max_out(const Tensor & self, Tensor & out); // {"schema": "aten::max.unary_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor minimum(const Tensor & self, const Tensor & other); // {"schema": "aten::minimum(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & minimum_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::minimum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & min_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::min.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor min(const Tensor & self, const Tensor & other); // {"schema": "aten::min.other(Tensor self, Tensor other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor quantile(const Tensor & self, const Tensor & q, c10::optional<int64_t> dim, bool keepdim, c10::string_view interpolation); // {"schema": "aten::quantile(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, str interpolation='linear') -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & quantile_out(const Tensor & self, const Tensor & q, c10::optional<int64_t> dim, bool keepdim, c10::string_view interpolation, Tensor & out); // {"schema": "aten::quantile.out(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, str interpolation='linear', Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor quantile(const Tensor & self, double q, c10::optional<int64_t> dim, bool keepdim, c10::string_view interpolation); // {"schema": "aten::quantile.scalar(Tensor self, float q, int? dim=None, bool keepdim=False, *, str interpolation='linear') -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & quantile_out(const Tensor & self, double q, c10::optional<int64_t> dim, bool keepdim, c10::string_view interpolation, Tensor & out); // {"schema": "aten::quantile.scalar_out(Tensor self, float q, int? dim=None, bool keepdim=False, *, str interpolation='linear', Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor nanquantile(const Tensor & self, const Tensor & q, c10::optional<int64_t> dim, bool keepdim, c10::string_view interpolation); // {"schema": "aten::nanquantile(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, str interpolation='linear') -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & nanquantile_out(const Tensor & self, const Tensor & q, c10::optional<int64_t> dim, bool keepdim, c10::string_view interpolation, Tensor & out); // {"schema": "aten::nanquantile.out(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, str interpolation='linear', Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor nanquantile(const Tensor & self, double q, c10::optional<int64_t> dim, bool keepdim, c10::string_view interpolation); // {"schema": "aten::nanquantile.scalar(Tensor self, float q, int? dim=None, bool keepdim=False, *, str interpolation='linear') -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & nanquantile_out(const Tensor & self, double q, c10::optional<int64_t> dim, bool keepdim, c10::string_view interpolation, Tensor & out); // {"schema": "aten::nanquantile.scalar_out(Tensor self, float q, int? dim=None, bool keepdim=False, *, str interpolation='linear', Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor &,Tensor &> sort_out(const Tensor & self, int64_t dim, bool descending, Tensor & values, Tensor & indices); // {"schema": "aten::sort.values(Tensor self, int dim=-1, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> sort_out(const Tensor & self, c10::optional<bool> stable, int64_t dim, bool descending, Tensor & values, Tensor & indices); // {"schema": "aten::sort.values_stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> sort(const Tensor & self, int64_t dim, bool descending); // {"schema": "aten::sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor,Tensor> sort(const Tensor & self, c10::optional<bool> stable, int64_t dim, bool descending); // {"schema": "aten::sort.stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> sort_out(const Tensor & self, Dimname dim, bool descending, Tensor & values, Tensor & indices); // {"schema": "aten::sort.dimname_values(Tensor self, Dimname dim, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor &,Tensor &> sort_out(const Tensor & self, c10::optional<bool> stable, Dimname dim, bool descending, Tensor & values, Tensor & indices); // {"schema": "aten::sort.dimname_values_stable(Tensor self, *, bool? stable, Dimname dim, bool descending=False, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> sort(const Tensor & self, Dimname dim, bool descending); // {"schema": "aten::sort.dimname(Tensor self, Dimname dim, bool descending=False) -> (Tensor values, Tensor indices)", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor,Tensor> sort(const Tensor & self, c10::optional<bool> stable, Dimname dim, bool descending); // {"schema": "aten::sort.dimname_stable(Tensor self, *, bool? stable, Dimname dim, bool descending=False) -> (Tensor values, Tensor indices)", "dispatch": "False", "default": "True"}
+Tensor & msort_out(const Tensor & self, Tensor & out); // {"schema": "aten::msort.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor msort(const Tensor & self); // {"schema": "aten::msort(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor argsort(const Tensor & self, int64_t dim, bool descending); // {"schema": "aten::argsort(Tensor self, int dim=-1, bool descending=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor argsort(const Tensor & self, bool stable, int64_t dim, bool descending); // {"schema": "aten::argsort.stable(Tensor self, *, bool stable, int dim=-1, bool descending=False) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor argsort(const Tensor & self, Dimname dim, bool descending); // {"schema": "aten::argsort.dimname(Tensor self, Dimname dim, bool descending=False) -> Tensor", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor &,Tensor &> topk_out(const Tensor & self, c10::SymInt k, int64_t dim, bool largest, bool sorted, Tensor & values, Tensor & indices); // {"schema": "aten::topk.values(Tensor self, SymInt k, int dim=-1, bool largest=True, bool sorted=True, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> topk(const Tensor & self, c10::SymInt k, int64_t dim, bool largest, bool sorted); // {"schema": "aten::topk(Tensor self, SymInt k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices)", "dispatch": "True", "default": "True"}
+Tensor all(const Tensor & self); // {"schema": "aten::all(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & all_out(const Tensor & self, Tensor & out); // {"schema": "aten::all.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor any(const Tensor & self); // {"schema": "aten::any(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & any_out(const Tensor & self, Tensor & out); // {"schema": "aten::any.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & renorm_out(const Tensor & self, const Scalar & p, int64_t dim, const Scalar & maxnorm, Tensor & out); // {"schema": "aten::renorm.out(Tensor self, Scalar p, int dim, Scalar maxnorm, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor renorm(const Tensor & self, const Scalar & p, int64_t dim, const Scalar & maxnorm); // {"schema": "aten::renorm(Tensor self, Scalar p, int dim, Scalar maxnorm) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & renorm_(Tensor & self, const Scalar & p, int64_t dim, const Scalar & maxnorm); // {"schema": "aten::renorm_(Tensor(a!) self, Scalar p, int dim, Scalar maxnorm) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor unfold(const Tensor & self, int64_t dimension, int64_t size, int64_t step); // {"schema": "aten::unfold(Tensor(a) self, int dimension, int size, int step) -> Tensor(a)", "dispatch": "True", "default": "False"}
+Tensor unfold_backward(const Tensor & grad_in, c10::SymIntArrayRef input_sizes, int64_t dim, int64_t size, int64_t step); // {"schema": "aten::unfold_backward(Tensor grad_in, SymInt[] input_sizes, int dim, int size, int step) -> Tensor", "dispatch": "True", "default": "False"}
+bool equal(const Tensor & self, const Tensor & other); // {"schema": "aten::equal(Tensor self, Tensor other) -> bool", "dispatch": "True", "default": "False"}
+Tensor & pow_out(const Tensor & self, const Tensor & exponent, Tensor & out); // {"schema": "aten::pow.Tensor_Tensor_out(Tensor self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor pow(const Tensor & self, const Tensor & exponent); // {"schema": "aten::pow.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & pow_out(const Scalar & self, const Tensor & exponent, Tensor & out); // {"schema": "aten::pow.Scalar_out(Scalar self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor pow(const Scalar & self, const Tensor & exponent); // {"schema": "aten::pow.Scalar(Scalar self, Tensor exponent) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & pow_out(const Tensor & self, const Scalar & exponent, Tensor & out); // {"schema": "aten::pow.Tensor_Scalar_out(Tensor self, Scalar exponent, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor pow(const Tensor & self, const Scalar & exponent); // {"schema": "aten::pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & pow_(Tensor & self, const Scalar & exponent); // {"schema": "aten::pow_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & pow_(Tensor & self, const Tensor & exponent); // {"schema": "aten::pow_.Tensor(Tensor(a!) self, Tensor exponent) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & float_power_out(const Tensor & self, const Tensor & exponent, Tensor & out); // {"schema": "aten::float_power.Tensor_Tensor_out(Tensor self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor float_power(const Tensor & self, const Tensor & exponent); // {"schema": "aten::float_power.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & float_power_out(const Scalar & self, const Tensor & exponent, Tensor & out); // {"schema": "aten::float_power.Scalar_out(Scalar self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor float_power(const Scalar & self, const Tensor & exponent); // {"schema": "aten::float_power.Scalar(Scalar self, Tensor exponent) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & float_power_out(const Tensor & self, const Scalar & exponent, Tensor & out); // {"schema": "aten::float_power.Tensor_Scalar_out(Tensor self, Scalar exponent, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor float_power(const Tensor & self, const Scalar & exponent); // {"schema": "aten::float_power.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & float_power_(Tensor & self, const Scalar & exponent); // {"schema": "aten::float_power_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & float_power_(Tensor & self, const Tensor & exponent); // {"schema": "aten::float_power_.Tensor(Tensor(a!) self, Tensor exponent) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & normal_(Tensor & self, double mean, double std, c10::optional<Generator> generator); // {"schema": "aten::normal_(Tensor(a!) self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor normal_functional(const Tensor & self, double mean, double std, c10::optional<Generator> generator); // {"schema": "aten::normal_functional(Tensor self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & normal_out(const Tensor & mean, double std, c10::optional<Generator> generator, Tensor & out); // {"schema": "aten::normal.Tensor_float_out(Tensor mean, float std=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor normal(const Tensor & mean, double std, c10::optional<Generator> generator); // {"schema": "aten::normal.Tensor_float(Tensor mean, float std=1, *, Generator? generator=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & normal_out(double mean, const Tensor & std, c10::optional<Generator> generator, Tensor & out); // {"schema": "aten::normal.float_Tensor_out(float mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor normal(double mean, const Tensor & std, c10::optional<Generator> generator); // {"schema": "aten::normal.float_Tensor(float mean, Tensor std, *, Generator? generator=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & normal_out(const Tensor & mean, const Tensor & std, c10::optional<Generator> generator, Tensor & out); // {"schema": "aten::normal.Tensor_Tensor_out(Tensor mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor normal(const Tensor & mean, const Tensor & std, c10::optional<Generator> generator); // {"schema": "aten::normal.Tensor_Tensor(Tensor mean, Tensor std, *, Generator? generator=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor normal(double mean, double std, c10::SymIntArrayRef size, c10::optional<Generator> generator, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory); // {"schema": "aten::normal.float_float(float mean, float std, SymInt[] size, *, Generator? generator=None, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & normal_out(double mean, double std, c10::SymIntArrayRef size, c10::optional<Generator> generator, Tensor & out); // {"schema": "aten::normal.float_float_out(float mean, float std, SymInt[] size, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor alias(const Tensor & self); // {"schema": "aten::alias(Tensor(a) self) -> Tensor(a)", "dispatch": "True", "default": "True"}
+void _amp_foreach_non_finite_check_and_unscale_(TensorList self, Tensor & found_inf, const Tensor & inv_scale); // {"schema": "aten::_amp_foreach_non_finite_check_and_unscale_(Tensor(a!)[] self, Tensor(b!) found_inf, Tensor inv_scale) -> ()", "dispatch": "True", "default": "False"}
+Tensor & _amp_update_scale_(Tensor & self, Tensor & growth_tracker, const Tensor & found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval); // {"schema": "aten::_amp_update_scale_(Tensor(a!) self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_add(TensorList self, const Scalar & scalar); // {"schema": "aten::_foreach_add.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_add_(TensorList self, const Scalar & scalar); // {"schema": "aten::_foreach_add_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_add(TensorList self, TensorList other, const Scalar & alpha); // {"schema": "aten::_foreach_add.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_add_(TensorList self, TensorList other, const Scalar & alpha); // {"schema": "aten::_foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_add(TensorList self, ArrayRef<Scalar> scalars); // {"schema": "aten::_foreach_add.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_add_(TensorList self, ArrayRef<Scalar> scalars); // {"schema": "aten::_foreach_add_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_add(TensorList self, const Tensor & other, const Scalar & alpha); // {"schema": "aten::_foreach_add.Tensor(Tensor[] self, Tensor other, *, Scalar alpha=1) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_add_(TensorList self, const Tensor & other, const Scalar & alpha); // {"schema": "aten::_foreach_add_.Tensor(Tensor(a!)[] self, Tensor other, *, Scalar alpha=1) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_sub(TensorList self, const Scalar & scalar); // {"schema": "aten::_foreach_sub.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_sub_(TensorList self, const Scalar & scalar); // {"schema": "aten::_foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_sub(TensorList self, TensorList other, const Scalar & alpha); // {"schema": "aten::_foreach_sub.List(Tensor[] self, Tensor[] other, *, Scalar alpha=1) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_sub_(TensorList self, TensorList other, const Scalar & alpha); // {"schema": "aten::_foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_sub(TensorList self, ArrayRef<Scalar> scalars); // {"schema": "aten::_foreach_sub.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_sub_(TensorList self, ArrayRef<Scalar> scalars); // {"schema": "aten::_foreach_sub_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_mul(TensorList self, const Scalar & scalar); // {"schema": "aten::_foreach_mul.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_mul_(TensorList self, const Scalar & scalar); // {"schema": "aten::_foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_mul(TensorList self, TensorList other); // {"schema": "aten::_foreach_mul.List(Tensor[] self, Tensor[] other) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_mul_(TensorList self, TensorList other); // {"schema": "aten::_foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_mul(TensorList self, ArrayRef<Scalar> scalars); // {"schema": "aten::_foreach_mul.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_mul_(TensorList self, ArrayRef<Scalar> scalars); // {"schema": "aten::_foreach_mul_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_mul(TensorList self, const Tensor & other); // {"schema": "aten::_foreach_mul.Tensor(Tensor[] self, Tensor other) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_mul_(TensorList self, const Tensor & other); // {"schema": "aten::_foreach_mul_.Tensor(Tensor(a!)[] self, Tensor other) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_div(TensorList self, const Scalar & scalar); // {"schema": "aten::_foreach_div.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_div_(TensorList self, const Scalar & scalar); // {"schema": "aten::_foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_div(TensorList self, TensorList other); // {"schema": "aten::_foreach_div.List(Tensor[] self, Tensor[] other) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_div_(TensorList self, TensorList other); // {"schema": "aten::_foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_div(TensorList self, ArrayRef<Scalar> scalars); // {"schema": "aten::_foreach_div.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_div_(TensorList self, ArrayRef<Scalar> scalars); // {"schema": "aten::_foreach_div_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_div(TensorList self, const Tensor & other); // {"schema": "aten::_foreach_div.Tensor(Tensor[] self, Tensor other) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_div_(TensorList self, const Tensor & other); // {"schema": "aten::_foreach_div_.Tensor(Tensor(a!)[] self, Tensor other) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_clamp_max(TensorList self, const Scalar & scalar); // {"schema": "aten::_foreach_clamp_max.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_clamp_max_(TensorList self, const Scalar & scalar); // {"schema": "aten::_foreach_clamp_max_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_clamp_max(TensorList self, TensorList other); // {"schema": "aten::_foreach_clamp_max.List(Tensor[] self, Tensor[] other) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_clamp_max_(TensorList self, TensorList other); // {"schema": "aten::_foreach_clamp_max_.List(Tensor(a!)[] self, Tensor[] other) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_clamp_max(TensorList self, ArrayRef<Scalar> scalars); // {"schema": "aten::_foreach_clamp_max.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_clamp_max_(TensorList self, ArrayRef<Scalar> scalars); // {"schema": "aten::_foreach_clamp_max_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_clamp_min(TensorList self, const Scalar & scalar); // {"schema": "aten::_foreach_clamp_min.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_clamp_min_(TensorList self, const Scalar & scalar); // {"schema": "aten::_foreach_clamp_min_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_clamp_min(TensorList self, TensorList other); // {"schema": "aten::_foreach_clamp_min.List(Tensor[] self, Tensor[] other) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_clamp_min_(TensorList self, TensorList other); // {"schema": "aten::_foreach_clamp_min_.List(Tensor(a!)[] self, Tensor[] other) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_clamp_min(TensorList self, ArrayRef<Scalar> scalars); // {"schema": "aten::_foreach_clamp_min.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_clamp_min_(TensorList self, ArrayRef<Scalar> scalars); // {"schema": "aten::_foreach_clamp_min_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_maximum(TensorList self, const Scalar & scalar); // {"schema": "aten::_foreach_maximum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_maximum_(TensorList self, const Scalar & scalar); // {"schema": "aten::_foreach_maximum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_maximum(TensorList self, TensorList other); // {"schema": "aten::_foreach_maximum.List(Tensor[] self, Tensor[] other) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_maximum_(TensorList self, TensorList other); // {"schema": "aten::_foreach_maximum_.List(Tensor(a!)[] self, Tensor[] other) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_maximum(TensorList self, ArrayRef<Scalar> scalars); // {"schema": "aten::_foreach_maximum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_maximum_(TensorList self, ArrayRef<Scalar> scalars); // {"schema": "aten::_foreach_maximum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_minimum(TensorList self, const Scalar & scalar); // {"schema": "aten::_foreach_minimum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_minimum_(TensorList self, const Scalar & scalar); // {"schema": "aten::_foreach_minimum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_minimum(TensorList self, TensorList other); // {"schema": "aten::_foreach_minimum.List(Tensor[] self, Tensor[] other) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_minimum_(TensorList self, TensorList other); // {"schema": "aten::_foreach_minimum_.List(Tensor(a!)[] self, Tensor[] other) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_minimum(TensorList self, ArrayRef<Scalar> scalars); // {"schema": "aten::_foreach_minimum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_minimum_(TensorList self, ArrayRef<Scalar> scalars); // {"schema": "aten::_foreach_minimum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_addcdiv(TensorList self, TensorList tensor1, TensorList tensor2, const Scalar & value); // {"schema": "aten::_foreach_addcdiv.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[]", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_addcdiv(TensorList self, TensorList tensor1, TensorList tensor2, ArrayRef<Scalar> scalars); // {"schema": "aten::_foreach_addcdiv.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[]", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_addcdiv(TensorList self, TensorList tensor1, TensorList tensor2, const Tensor & scalars); // {"schema": "aten::_foreach_addcdiv.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_addcdiv_(TensorList self, TensorList tensor1, TensorList tensor2, const Scalar & value); // {"schema": "aten::_foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()", "dispatch": "True", "default": "False"}
+void _foreach_addcdiv_(TensorList self, TensorList tensor1, TensorList tensor2, ArrayRef<Scalar> scalars); // {"schema": "aten::_foreach_addcdiv_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> ()", "dispatch": "True", "default": "False"}
+void _foreach_addcdiv_(TensorList self, TensorList tensor1, TensorList tensor2, const Tensor & scalars); // {"schema": "aten::_foreach_addcdiv_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_addcmul(TensorList self, TensorList tensor1, TensorList tensor2, const Scalar & value); // {"schema": "aten::_foreach_addcmul.Scalar(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[]", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_addcmul(TensorList self, TensorList tensor1, TensorList tensor2, ArrayRef<Scalar> scalars); // {"schema": "aten::_foreach_addcmul.ScalarList(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> Tensor[]", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_addcmul(TensorList self, TensorList tensor1, TensorList tensor2, const Tensor & scalars); // {"schema": "aten::_foreach_addcmul.Tensor(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_addcmul_(TensorList self, TensorList tensor1, TensorList tensor2, const Scalar & value); // {"schema": "aten::_foreach_addcmul_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> ()", "dispatch": "True", "default": "False"}
+void _foreach_addcmul_(TensorList self, TensorList tensor1, TensorList tensor2, ArrayRef<Scalar> scalars); // {"schema": "aten::_foreach_addcmul_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars) -> ()", "dispatch": "True", "default": "False"}
+void _foreach_addcmul_(TensorList self, TensorList tensor1, TensorList tensor2, const Tensor & scalars); // {"schema": "aten::_foreach_addcmul_.Tensor(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_abs(TensorList self); // {"schema": "aten::_foreach_abs(Tensor[] self) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_abs_(TensorList self); // {"schema": "aten::_foreach_abs_(Tensor(a!)[] self) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_acos(TensorList self); // {"schema": "aten::_foreach_acos(Tensor[] self) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_acos_(TensorList self); // {"schema": "aten::_foreach_acos_(Tensor(a!)[] self) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_asin(TensorList self); // {"schema": "aten::_foreach_asin(Tensor[] self) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_asin_(TensorList self); // {"schema": "aten::_foreach_asin_(Tensor(a!)[] self) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_atan(TensorList self); // {"schema": "aten::_foreach_atan(Tensor[] self) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_atan_(TensorList self); // {"schema": "aten::_foreach_atan_(Tensor(a!)[] self) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_ceil(TensorList self); // {"schema": "aten::_foreach_ceil(Tensor[] self) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_ceil_(TensorList self); // {"schema": "aten::_foreach_ceil_(Tensor(a!)[] self) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_cos(TensorList self); // {"schema": "aten::_foreach_cos(Tensor[] self) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_cos_(TensorList self); // {"schema": "aten::_foreach_cos_(Tensor(a!)[] self) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_cosh(TensorList self); // {"schema": "aten::_foreach_cosh(Tensor[] self) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_cosh_(TensorList self); // {"schema": "aten::_foreach_cosh_(Tensor(a!)[] self) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_erf(TensorList self); // {"schema": "aten::_foreach_erf(Tensor[] self) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_erf_(TensorList self); // {"schema": "aten::_foreach_erf_(Tensor(a!)[] self) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_erfc(TensorList self); // {"schema": "aten::_foreach_erfc(Tensor[] self) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_erfc_(TensorList self); // {"schema": "aten::_foreach_erfc_(Tensor(a!)[] self) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_exp(TensorList self); // {"schema": "aten::_foreach_exp(Tensor[] self) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_exp_(TensorList self); // {"schema": "aten::_foreach_exp_(Tensor(a!)[] self) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_expm1(TensorList self); // {"schema": "aten::_foreach_expm1(Tensor[] self) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_expm1_(TensorList self); // {"schema": "aten::_foreach_expm1_(Tensor(a!)[] self) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_floor(TensorList self); // {"schema": "aten::_foreach_floor(Tensor[] self) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_floor_(TensorList self); // {"schema": "aten::_foreach_floor_(Tensor(a!)[] self) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_frac(TensorList self); // {"schema": "aten::_foreach_frac(Tensor[] self) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_frac_(TensorList self); // {"schema": "aten::_foreach_frac_(Tensor(a!)[] self) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_lerp(TensorList self, TensorList tensors1, TensorList weights); // {"schema": "aten::_foreach_lerp.List(Tensor[] self, Tensor[] tensors1, Tensor[] weights) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_lerp_(TensorList self, TensorList tensors1, TensorList weights); // {"schema": "aten::_foreach_lerp_.List(Tensor(a!)[] self, Tensor[] tensors1, Tensor[] weights) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_lerp(TensorList self, TensorList tensors1, const Scalar & weight); // {"schema": "aten::_foreach_lerp.Scalar(Tensor[] self, Tensor[] tensors1, Scalar weight) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_lerp_(TensorList self, TensorList tensors1, const Scalar & weight); // {"schema": "aten::_foreach_lerp_.Scalar(Tensor(a!)[] self, Tensor[] tensors1, Scalar weight) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_lgamma(TensorList self); // {"schema": "aten::_foreach_lgamma(Tensor[] self) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_lgamma_(TensorList self); // {"schema": "aten::_foreach_lgamma_(Tensor(a!)[] self) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_log(TensorList self); // {"schema": "aten::_foreach_log(Tensor[] self) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_log_(TensorList self); // {"schema": "aten::_foreach_log_(Tensor(a!)[] self) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_log10(TensorList self); // {"schema": "aten::_foreach_log10(Tensor[] self) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_log10_(TensorList self); // {"schema": "aten::_foreach_log10_(Tensor(a!)[] self) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_log1p(TensorList self); // {"schema": "aten::_foreach_log1p(Tensor[] self) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_log1p_(TensorList self); // {"schema": "aten::_foreach_log1p_(Tensor(a!)[] self) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_log2(TensorList self); // {"schema": "aten::_foreach_log2(Tensor[] self) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_log2_(TensorList self); // {"schema": "aten::_foreach_log2_(Tensor(a!)[] self) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_neg(TensorList self); // {"schema": "aten::_foreach_neg(Tensor[] self) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_neg_(TensorList self); // {"schema": "aten::_foreach_neg_(Tensor(a!)[] self) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_norm(TensorList self, const Scalar & ord); // {"schema": "aten::_foreach_norm.Scalar(Tensor[] self, Scalar ord=2) -> Tensor[]", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_pow(TensorList self, TensorList exponent); // {"schema": "aten::_foreach_pow.List(Tensor[] self, Tensor[] exponent) -> Tensor[]", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_pow(TensorList self, const Scalar & exponent); // {"schema": "aten::_foreach_pow.Scalar(Tensor[] self, Scalar exponent) -> Tensor[]", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_pow(TensorList self, ArrayRef<Scalar> exponent); // {"schema": "aten::_foreach_pow.ScalarList(Tensor[] self, Scalar[] exponent) -> Tensor[]", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_pow(const Scalar & self, TensorList exponent); // {"schema": "aten::_foreach_pow.ScalarAndTensor(Scalar self, Tensor[] exponent) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_pow_(TensorList self, TensorList exponent); // {"schema": "aten::_foreach_pow_.List(Tensor(a!)[] self, Tensor[] exponent) -> ()", "dispatch": "True", "default": "False"}
+void _foreach_pow_(TensorList self, const Scalar & exponent); // {"schema": "aten::_foreach_pow_.Scalar(Tensor(a!)[] self, Scalar exponent) -> ()", "dispatch": "True", "default": "False"}
+void _foreach_pow_(TensorList self, ArrayRef<Scalar> exponent); // {"schema": "aten::_foreach_pow_.ScalarList(Tensor(a!)[] self, Scalar[] exponent) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_reciprocal(TensorList self); // {"schema": "aten::_foreach_reciprocal(Tensor[] self) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_reciprocal_(TensorList self); // {"schema": "aten::_foreach_reciprocal_(Tensor(a!)[] self) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_round(TensorList self); // {"schema": "aten::_foreach_round(Tensor[] self) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_round_(TensorList self); // {"schema": "aten::_foreach_round_(Tensor(a!)[] self) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_sigmoid(TensorList self); // {"schema": "aten::_foreach_sigmoid(Tensor[] self) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_sigmoid_(TensorList self); // {"schema": "aten::_foreach_sigmoid_(Tensor(a!)[] self) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_sign(TensorList self); // {"schema": "aten::_foreach_sign(Tensor[] self) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_sign_(TensorList self); // {"schema": "aten::_foreach_sign_(Tensor(a!)[] self) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_sin(TensorList self); // {"schema": "aten::_foreach_sin(Tensor[] self) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_sin_(TensorList self); // {"schema": "aten::_foreach_sin_(Tensor(a!)[] self) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_sinh(TensorList self); // {"schema": "aten::_foreach_sinh(Tensor[] self) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_sinh_(TensorList self); // {"schema": "aten::_foreach_sinh_(Tensor(a!)[] self) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_sqrt(TensorList self); // {"schema": "aten::_foreach_sqrt(Tensor[] self) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_sqrt_(TensorList self); // {"schema": "aten::_foreach_sqrt_(Tensor(a!)[] self) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_tan(TensorList self); // {"schema": "aten::_foreach_tan(Tensor[] self) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_tan_(TensorList self); // {"schema": "aten::_foreach_tan_(Tensor(a!)[] self) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_tanh(TensorList self); // {"schema": "aten::_foreach_tanh(Tensor[] self) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_tanh_(TensorList self); // {"schema": "aten::_foreach_tanh_(Tensor(a!)[] self) -> ()", "dispatch": "True", "default": "False"}
+::std::vector<Tensor> _foreach_trunc(TensorList self); // {"schema": "aten::_foreach_trunc(Tensor[] self) -> Tensor[]", "dispatch": "True", "default": "False"}
+void _foreach_trunc_(TensorList self); // {"schema": "aten::_foreach_trunc_(Tensor(a!)[] self) -> ()", "dispatch": "True", "default": "False"}
+void _foreach_zero_(TensorList self); // {"schema": "aten::_foreach_zero_(Tensor(a!)[] self) -> ()", "dispatch": "True", "default": "False"}
+void _foreach_copy_(TensorList self, TensorList src, bool non_blocking); // {"schema": "aten::_foreach_copy_(Tensor(a!)[] self, Tensor[] src, bool non_blocking=False) -> ()", "dispatch": "True", "default": "False"}
+Tensor bucketize(const Tensor & self, const Tensor & boundaries, bool out_int32, bool right); // {"schema": "aten::bucketize.Tensor(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & bucketize_out(const Tensor & self, const Tensor & boundaries, bool out_int32, bool right, Tensor & out); // {"schema": "aten::bucketize.Tensor_out(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor bucketize(const Scalar & self, const Tensor & boundaries, bool out_int32, bool right); // {"schema": "aten::bucketize.Scalar(Scalar self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor searchsorted(const Tensor & sorted_sequence, const Tensor & self, bool out_int32, bool right, c10::optional<c10::string_view> side, const c10::optional<Tensor> & sorter); // {"schema": "aten::searchsorted.Tensor(Tensor sorted_sequence, Tensor self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & searchsorted_out(const Tensor & sorted_sequence, const Tensor & self, bool out_int32, bool right, c10::optional<c10::string_view> side, const c10::optional<Tensor> & sorter, Tensor & out); // {"schema": "aten::searchsorted.Tensor_out(Tensor sorted_sequence, Tensor self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor searchsorted(const Tensor & sorted_sequence, const Scalar & self, bool out_int32, bool right, c10::optional<c10::string_view> side, const c10::optional<Tensor> & sorter); // {"schema": "aten::searchsorted.Scalar(Tensor sorted_sequence, Scalar self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & searchsorted_out(const Tensor & sorted_sequence, const Scalar & self, bool out_int32, bool right, c10::optional<c10::string_view> side, const c10::optional<Tensor> & sorter, Tensor & out); // {"schema": "aten::searchsorted.Scalar_out(Tensor sorted_sequence, Scalar self, *, bool out_int32=False, bool right=False, str? side=None, Tensor? sorter=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor _convert_indices_from_coo_to_csr(const Tensor & self, int64_t size, bool out_int32); // {"schema": "aten::_convert_indices_from_coo_to_csr(Tensor self, int size, *, bool out_int32=False) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & _convert_indices_from_coo_to_csr_out(const Tensor & self, int64_t size, bool out_int32, Tensor & out); // {"schema": "aten::_convert_indices_from_coo_to_csr.out(Tensor self, int size, *, bool out_int32=False, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor _convert_indices_from_csr_to_coo(const Tensor & crow_indices, const Tensor & col_indices, bool out_int32, bool transpose); // {"schema": "aten::_convert_indices_from_csr_to_coo(Tensor crow_indices, Tensor col_indices, *, bool out_int32=False, bool transpose=False) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & _convert_indices_from_csr_to_coo_out(const Tensor & crow_indices, const Tensor & col_indices, bool out_int32, bool transpose, Tensor & out); // {"schema": "aten::_convert_indices_from_csr_to_coo.out(Tensor crow_indices, Tensor col_indices, *, bool out_int32=False, bool transpose=False, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & mse_loss_out(const Tensor & self, const Tensor & target, int64_t reduction, Tensor & out); // {"schema": "aten::mse_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor mse_loss(const Tensor & self, const Tensor & target, int64_t reduction); // {"schema": "aten::mse_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & mse_loss_backward_out(const Tensor & grad_output, const Tensor & self, const Tensor & target, int64_t reduction, Tensor & grad_input); // {"schema": "aten::mse_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor mse_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, int64_t reduction); // {"schema": "aten::mse_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor l1_loss(const Tensor & self, const Tensor & target, int64_t reduction); // {"schema": "aten::l1_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & multi_margin_loss_out(const Tensor & self, const Tensor & target, const Scalar & p, const Scalar & margin, const c10::optional<Tensor> & weight, int64_t reduction, Tensor & out); // {"schema": "aten::multi_margin_loss.out(Tensor self, Tensor target, Scalar p=1, Scalar margin=1, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor multi_margin_loss(const Tensor & self, const Tensor & target, const Scalar & p, const Scalar & margin, const c10::optional<Tensor> & weight, int64_t reduction); // {"schema": "aten::multi_margin_loss(Tensor self, Tensor target, Scalar p=1, Scalar margin=1, Tensor? weight=None, int reduction=Mean) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & multi_margin_loss_backward_out(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Scalar & p, const Scalar & margin, const c10::optional<Tensor> & weight, int64_t reduction, Tensor & grad_input); // {"schema": "aten::multi_margin_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Scalar p, Scalar margin, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor multi_margin_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const Scalar & p, const Scalar & margin, const c10::optional<Tensor> & weight, int64_t reduction); // {"schema": "aten::multi_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, Scalar p, Scalar margin, Tensor? weight=None, int reduction=Mean) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & multilabel_margin_loss_out(const Tensor & self, const Tensor & target, int64_t reduction, Tensor & out); // {"schema": "aten::multilabel_margin_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor multilabel_margin_loss(const Tensor & self, const Tensor & target, int64_t reduction); // {"schema": "aten::multilabel_margin_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor &,Tensor &> multilabel_margin_loss_forward_out(const Tensor & self, const Tensor & target, int64_t reduction, Tensor & output, Tensor & is_target); // {"schema": "aten::multilabel_margin_loss_forward.output(Tensor self, Tensor target, int reduction, *, Tensor(a!) output, Tensor(b!) is_target) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> multilabel_margin_loss_forward(const Tensor & self, const Tensor & target, int64_t reduction); // {"schema": "aten::multilabel_margin_loss_forward(Tensor self, Tensor target, int reduction) -> (Tensor output, Tensor is_target)", "dispatch": "True", "default": "False"}
+Tensor & multilabel_margin_loss_backward_out(const Tensor & grad_output, const Tensor & self, const Tensor & target, int64_t reduction, const Tensor & is_target, Tensor & grad_input); // {"schema": "aten::multilabel_margin_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, Tensor is_target, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor multilabel_margin_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, int64_t reduction, const Tensor & is_target); // {"schema": "aten::multilabel_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, Tensor is_target) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & nll_loss_out(const Tensor & self, const Tensor & target, const c10::optional<Tensor> & weight, int64_t reduction, c10::SymInt ignore_index, Tensor & out); // {"schema": "aten::nll_loss.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor nll_loss_nd(const Tensor & self, const Tensor & target, const c10::optional<Tensor> & weight, int64_t reduction, c10::SymInt ignore_index); // {"schema": "aten::nll_loss_nd(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor nll_loss(const Tensor & self, const Tensor & target, const c10::optional<Tensor> & weight, int64_t reduction, c10::SymInt ignore_index); // {"schema": "aten::nll_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100) -> Tensor", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor &,Tensor &> nll_loss_forward_out(const Tensor & self, const Tensor & target, const c10::optional<Tensor> & weight, int64_t reduction, c10::SymInt ignore_index, Tensor & output, Tensor & total_weight); // {"schema": "aten::nll_loss_forward.output(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, *, Tensor(a!) output, Tensor(b!) total_weight) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> nll_loss_forward(const Tensor & self, const Tensor & target, const c10::optional<Tensor> & weight, int64_t reduction, c10::SymInt ignore_index); // {"schema": "aten::nll_loss_forward(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index) -> (Tensor output, Tensor total_weight)", "dispatch": "True", "default": "True"}
+Tensor & nll_loss_backward_out(const Tensor & grad_output, const Tensor & self, const Tensor & target, const c10::optional<Tensor> & weight, int64_t reduction, c10::SymInt ignore_index, const Tensor & total_weight, Tensor & grad_input); // {"schema": "aten::nll_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor nll_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const c10::optional<Tensor> & weight, int64_t reduction, c10::SymInt ignore_index, const Tensor & total_weight); // {"schema": "aten::nll_loss_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & nll_loss2d_out(const Tensor & self, const Tensor & target, const c10::optional<Tensor> & weight, int64_t reduction, c10::SymInt ignore_index, Tensor & out); // {"schema": "aten::nll_loss2d.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor nll_loss2d(const Tensor & self, const Tensor & target, const c10::optional<Tensor> & weight, int64_t reduction, c10::SymInt ignore_index); // {"schema": "aten::nll_loss2d(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100) -> Tensor", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor &,Tensor &> nll_loss2d_forward_out(const Tensor & self, const Tensor & target, const c10::optional<Tensor> & weight, int64_t reduction, c10::SymInt ignore_index, Tensor & output, Tensor & total_weight); // {"schema": "aten::nll_loss2d_forward.output(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, *, Tensor(a!) output, Tensor(b!) total_weight) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> nll_loss2d_forward(const Tensor & self, const Tensor & target, const c10::optional<Tensor> & weight, int64_t reduction, c10::SymInt ignore_index); // {"schema": "aten::nll_loss2d_forward(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index) -> (Tensor output, Tensor total_weight)", "dispatch": "True", "default": "False"}
+Tensor & nll_loss2d_backward_out(const Tensor & grad_output, const Tensor & self, const Tensor & target, const c10::optional<Tensor> & weight, int64_t reduction, c10::SymInt ignore_index, const Tensor & total_weight, Tensor & grad_input); // {"schema": "aten::nll_loss2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor nll_loss2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, const c10::optional<Tensor> & weight, int64_t reduction, c10::SymInt ignore_index, const Tensor & total_weight); // {"schema": "aten::nll_loss2d_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & smooth_l1_loss_out(const Tensor & self, const Tensor & target, int64_t reduction, double beta, Tensor & out); // {"schema": "aten::smooth_l1_loss.out(Tensor self, Tensor target, int reduction=Mean, float beta=1.0, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor smooth_l1_loss(const Tensor & self, const Tensor & target, int64_t reduction, double beta); // {"schema": "aten::smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean, float beta=1.0) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & smooth_l1_loss_backward_out(const Tensor & grad_output, const Tensor & self, const Tensor & target, int64_t reduction, double beta, Tensor & grad_input); // {"schema": "aten::smooth_l1_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor smooth_l1_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, int64_t reduction, double beta); // {"schema": "aten::smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & huber_loss_out(const Tensor & self, const Tensor & target, int64_t reduction, double delta, Tensor & out); // {"schema": "aten::huber_loss.out(Tensor self, Tensor target, int reduction=Mean, float delta=1.0, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor huber_loss(const Tensor & self, const Tensor & target, int64_t reduction, double delta); // {"schema": "aten::huber_loss(Tensor self, Tensor target, int reduction=Mean, float delta=1.0) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & huber_loss_backward_out(const Tensor & grad_output, const Tensor & self, const Tensor & target, int64_t reduction, double delta, Tensor & grad_input); // {"schema": "aten::huber_loss_backward.out(Tensor grad_output, Tensor self, Tensor target, int reduction, float delta, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor huber_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, int64_t reduction, double delta); // {"schema": "aten::huber_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float delta) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & soft_margin_loss_out(const Tensor & self, const Tensor & target, int64_t reduction, Tensor & out); // {"schema": "aten::soft_margin_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor soft_margin_loss(const Tensor & self, const Tensor & target, int64_t reduction); // {"schema": "aten::soft_margin_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & soft_margin_loss_backward_out(const Tensor & grad_output, const Tensor & self, const Tensor & target, int64_t reduction, Tensor & grad_input); // {"schema": "aten::soft_margin_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor soft_margin_loss_backward(const Tensor & grad_output, const Tensor & self, const Tensor & target, int64_t reduction); // {"schema": "aten::soft_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & elu_out(const Tensor & self, const Scalar & alpha, const Scalar & scale, const Scalar & input_scale, Tensor & out); // {"schema": "aten::elu.out(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor elu(const Tensor & self, const Scalar & alpha, const Scalar & scale, const Scalar & input_scale); // {"schema": "aten::elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & elu_backward_out(const Tensor & grad_output, const Scalar & alpha, const Scalar & scale, const Scalar & input_scale, bool is_result, const Tensor & self_or_result, Tensor & grad_input); // {"schema": "aten::elu_backward.grad_input(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, bool is_result, Tensor self_or_result, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor elu_backward(const Tensor & grad_output, const Scalar & alpha, const Scalar & scale, const Scalar & input_scale, bool is_result, const Tensor & self_or_result); // {"schema": "aten::elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, bool is_result, Tensor self_or_result) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & elu_(Tensor & self, const Scalar & alpha, const Scalar & scale, const Scalar & input_scale); // {"schema": "aten::elu_(Tensor(a!) self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & glu_out(const Tensor & self, int64_t dim, Tensor & out); // {"schema": "aten::glu.out(Tensor self, int dim=-1, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor glu(const Tensor & self, int64_t dim); // {"schema": "aten::glu(Tensor self, int dim=-1) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & glu_backward_out(const Tensor & grad_output, const Tensor & self, int64_t dim, Tensor & grad_input); // {"schema": "aten::glu_backward.grad_input(Tensor grad_output, Tensor self, int dim, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor glu_backward(const Tensor & grad_output, const Tensor & self, int64_t dim); // {"schema": "aten::glu_backward(Tensor grad_output, Tensor self, int dim) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor glu_jvp(const Tensor & glu, const Tensor & x, const Tensor & dx, int64_t dim); // {"schema": "aten::glu_jvp(Tensor glu, Tensor x, Tensor dx, int dim) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor glu_backward_jvp(const Tensor & grad_x, const Tensor & grad_glu, const Tensor & x, const Tensor & dgrad_glu, const Tensor & dx, int64_t dim); // {"schema": "aten::glu_backward_jvp(Tensor grad_x, Tensor grad_glu, Tensor x, Tensor dgrad_glu, Tensor dx, int dim) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & hardsigmoid_out(const Tensor & self, Tensor & out); // {"schema": "aten::hardsigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor hardsigmoid(const Tensor & self); // {"schema": "aten::hardsigmoid(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & hardsigmoid_(Tensor & self); // {"schema": "aten::hardsigmoid_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & hardsigmoid_backward_out(const Tensor & grad_output, const Tensor & self, Tensor & grad_input); // {"schema": "aten::hardsigmoid_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor hardsigmoid_backward(const Tensor & grad_output, const Tensor & self); // {"schema": "aten::hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & hardtanh_out(const Tensor & self, const Scalar & min_val, const Scalar & max_val, Tensor & out); // {"schema": "aten::hardtanh.out(Tensor self, Scalar min_val=-1, Scalar max_val=1, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor hardtanh(const Tensor & self, const Scalar & min_val, const Scalar & max_val); // {"schema": "aten::hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & hardtanh_backward_out(const Tensor & grad_output, const Tensor & self, const Scalar & min_val, const Scalar & max_val, Tensor & grad_input); // {"schema": "aten::hardtanh_backward.grad_input(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor hardtanh_backward(const Tensor & grad_output, const Tensor & self, const Scalar & min_val, const Scalar & max_val); // {"schema": "aten::hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & hardtanh_(Tensor & self, const Scalar & min_val, const Scalar & max_val); // {"schema": "aten::hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & hardswish_out(const Tensor & self, Tensor & out); // {"schema": "aten::hardswish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor hardswish(const Tensor & self); // {"schema": "aten::hardswish(Tensor self) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & hardswish_(Tensor & self); // {"schema": "aten::hardswish_(Tensor(a!) self) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor hardswish_backward(const Tensor & grad_output, const Tensor & self); // {"schema": "aten::hardswish_backward(Tensor grad_output, Tensor self) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & leaky_relu_out(const Tensor & self, const Scalar & negative_slope, Tensor & out); // {"schema": "aten::leaky_relu.out(Tensor self, Scalar negative_slope=0.01, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor leaky_relu(const Tensor & self, const Scalar & negative_slope); // {"schema": "aten::leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & leaky_relu_backward_out(const Tensor & grad_output, const Tensor & self, const Scalar & negative_slope, bool self_is_result, Tensor & grad_input); // {"schema": "aten::leaky_relu_backward.grad_input(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor leaky_relu_backward(const Tensor & grad_output, const Tensor & self, const Scalar & negative_slope, bool self_is_result); // {"schema": "aten::leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & leaky_relu_(Tensor & self, const Scalar & negative_slope); // {"schema": "aten::leaky_relu_(Tensor(a!) self, Scalar negative_slope=0.01) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & log_sigmoid_out(const Tensor & self, Tensor & out); // {"schema": "aten::log_sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor log_sigmoid(const Tensor & self); // {"schema": "aten::log_sigmoid(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+::std::tuple<Tensor &,Tensor &> log_sigmoid_forward_out(const Tensor & self, Tensor & output, Tensor & buffer); // {"schema": "aten::log_sigmoid_forward.output(Tensor self, *, Tensor(a!) output, Tensor(b!) buffer) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> log_sigmoid_forward(const Tensor & self); // {"schema": "aten::log_sigmoid_forward(Tensor self) -> (Tensor output, Tensor buffer)", "dispatch": "True", "default": "False"}
+Tensor & log_sigmoid_backward_out(const Tensor & grad_output, const Tensor & self, const Tensor & buffer, Tensor & grad_input); // {"schema": "aten::log_sigmoid_backward.grad_input(Tensor grad_output, Tensor self, Tensor buffer, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor log_sigmoid_backward(const Tensor & grad_output, const Tensor & self, const Tensor & buffer); // {"schema": "aten::log_sigmoid_backward(Tensor grad_output, Tensor self, Tensor buffer) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & rrelu_with_noise_out(const Tensor & self, const Tensor & noise, const Scalar & lower, const Scalar & upper, bool training, c10::optional<Generator> generator, Tensor & out); // {"schema": "aten::rrelu_with_noise.out(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor rrelu_with_noise(const Tensor & self, const Tensor & noise, const Scalar & lower, const Scalar & upper, bool training, c10::optional<Generator> generator); // {"schema": "aten::rrelu_with_noise(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor rrelu_with_noise_backward(const Tensor & grad_output, const Tensor & self, const Tensor & noise, const Scalar & lower, const Scalar & upper, bool training, bool self_is_result); // {"schema": "aten::rrelu_with_noise_backward(Tensor grad_output, Tensor self, Tensor noise, Scalar lower, Scalar upper, bool training, bool self_is_result) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & rrelu_with_noise_(Tensor & self, const Tensor & noise, const Scalar & lower, const Scalar & upper, bool training, c10::optional<Generator> generator); // {"schema": "aten::rrelu_with_noise_(Tensor(a!) self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & softplus_out(const Tensor & self, const Scalar & beta, const Scalar & threshold, Tensor & out); // {"schema": "aten::softplus.out(Tensor self, Scalar beta=1, Scalar threshold=20, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor softplus(const Tensor & self, const Scalar & beta, const Scalar & threshold); // {"schema": "aten::softplus(Tensor self, Scalar beta=1, Scalar threshold=20) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & softplus_backward_out(const Tensor & grad_output, const Tensor & self, const Scalar & beta, const Scalar & threshold, Tensor & grad_input); // {"schema": "aten::softplus_backward.grad_input(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor softplus_backward(const Tensor & grad_output, const Tensor & self, const Scalar & beta, const Scalar & threshold); // {"schema": "aten::softplus_backward(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & softshrink_out(const Tensor & self, const Scalar & lambd, Tensor & out); // {"schema": "aten::softshrink.out(Tensor self, Scalar lambd=0.5, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor softshrink(const Tensor & self, const Scalar & lambd); // {"schema": "aten::softshrink(Tensor self, Scalar lambd=0.5) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & softshrink_backward_out(const Tensor & grad_output, const Tensor & self, const Scalar & lambd, Tensor & grad_input); // {"schema": "aten::softshrink_backward.grad_input(Tensor grad_output, Tensor self, Scalar lambd, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor softshrink_backward(const Tensor & grad_output, const Tensor & self, const Scalar & lambd); // {"schema": "aten::softshrink_backward(Tensor grad_output, Tensor self, Scalar lambd) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & adaptive_avg_pool2d_out(const Tensor & self, c10::SymIntArrayRef output_size, Tensor & out); // {"schema": "aten::adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor adaptive_avg_pool2d(const Tensor & self, c10::SymIntArrayRef output_size); // {"schema": "aten::adaptive_avg_pool2d(Tensor self, SymInt[2] output_size) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor mkldnn_adaptive_avg_pool2d(const Tensor & self, IntArrayRef output_size); // {"schema": "aten::mkldnn_adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & mkldnn_adaptive_avg_pool2d_out(const Tensor & self, IntArrayRef output_size, Tensor & out); // {"schema": "aten::mkldnn_adaptive_avg_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor mkldnn_adaptive_avg_pool2d_backward(const Tensor & grad_output, const Tensor & self); // {"schema": "aten::mkldnn_adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _adaptive_avg_pool2d(const Tensor & self, c10::SymIntArrayRef output_size); // {"schema": "aten::_adaptive_avg_pool2d(Tensor self, SymInt[2] output_size) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _adaptive_avg_pool2d_backward(const Tensor & grad_output, const Tensor & self); // {"schema": "aten::_adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & adaptive_avg_pool3d_out(const Tensor & self, c10::SymIntArrayRef output_size, Tensor & out); // {"schema": "aten::adaptive_avg_pool3d.out(Tensor self, SymInt[3] output_size, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor adaptive_avg_pool3d(const Tensor & self, c10::SymIntArrayRef output_size); // {"schema": "aten::adaptive_avg_pool3d(Tensor self, SymInt[3] output_size) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _adaptive_avg_pool3d(const Tensor & self, c10::SymIntArrayRef output_size); // {"schema": "aten::_adaptive_avg_pool3d(Tensor self, SymInt[3] output_size) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & adaptive_avg_pool3d_backward_out(const Tensor & grad_output, const Tensor & self, Tensor & grad_input); // {"schema": "aten::adaptive_avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor _adaptive_avg_pool3d_backward(const Tensor & grad_output, const Tensor & self); // {"schema": "aten::_adaptive_avg_pool3d_backward(Tensor grad_output, Tensor self) -> Tensor", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor &,Tensor &> adaptive_max_pool2d_out(const Tensor & self, IntArrayRef output_size, Tensor & out, Tensor & indices); // {"schema": "aten::adaptive_max_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> adaptive_max_pool2d(const Tensor & self, IntArrayRef output_size); // {"schema": "aten::adaptive_max_pool2d(Tensor self, int[2] output_size) -> (Tensor, Tensor)", "dispatch": "True", "default": "True"}
+Tensor & adaptive_max_pool2d_backward_out(const Tensor & grad_output, const Tensor & self, const Tensor & indices, Tensor & grad_input); // {"schema": "aten::adaptive_max_pool2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor adaptive_max_pool2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & indices); // {"schema": "aten::adaptive_max_pool2d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> adaptive_max_pool3d_out(const Tensor & self, IntArrayRef output_size, Tensor & out, Tensor & indices); // {"schema": "aten::adaptive_max_pool3d.out(Tensor self, int[3] output_size, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> adaptive_max_pool3d(const Tensor & self, IntArrayRef output_size); // {"schema": "aten::adaptive_max_pool3d(Tensor self, int[3] output_size) -> (Tensor, Tensor)", "dispatch": "True", "default": "True"}
+Tensor & adaptive_max_pool3d_backward_out(const Tensor & grad_output, const Tensor & self, const Tensor & indices, Tensor & grad_input); // {"schema": "aten::adaptive_max_pool3d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor adaptive_max_pool3d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & indices); // {"schema": "aten::adaptive_max_pool3d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & avg_pool2d_out(const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, bool ceil_mode, bool count_include_pad, c10::optional<int64_t> divisor_override, Tensor & out); // {"schema": "aten::avg_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor avg_pool2d(const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, bool ceil_mode, bool count_include_pad, c10::optional<int64_t> divisor_override); // {"schema": "aten::avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & avg_pool2d_backward_out(const Tensor & grad_output, const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, bool ceil_mode, bool count_include_pad, c10::optional<int64_t> divisor_override, Tensor & grad_input); // {"schema": "aten::avg_pool2d_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor avg_pool2d_backward(const Tensor & grad_output, const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, bool ceil_mode, bool count_include_pad, c10::optional<int64_t> divisor_override); // {"schema": "aten::avg_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & avg_pool3d_out(const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, bool ceil_mode, bool count_include_pad, c10::optional<int64_t> divisor_override, Tensor & out); // {"schema": "aten::avg_pool3d.out(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor avg_pool3d(const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, bool ceil_mode, bool count_include_pad, c10::optional<int64_t> divisor_override); // {"schema": "aten::avg_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & avg_pool3d_backward_out(const Tensor & grad_output, const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, bool ceil_mode, bool count_include_pad, c10::optional<int64_t> divisor_override, Tensor & grad_input); // {"schema": "aten::avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, bool ceil_mode, bool count_include_pad, int? divisor_override, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor avg_pool3d_backward(const Tensor & grad_output, const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, bool ceil_mode, bool count_include_pad, c10::optional<int64_t> divisor_override); // {"schema": "aten::avg_pool3d_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> fractional_max_pool2d_out(const Tensor & self, IntArrayRef kernel_size, IntArrayRef output_size, const Tensor & random_samples, Tensor & output, Tensor & indices); // {"schema": "aten::fractional_max_pool2d.output(Tensor self, int[2] kernel_size, int[2] output_size, Tensor random_samples, *, Tensor(a!) output, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> fractional_max_pool2d(const Tensor & self, IntArrayRef kernel_size, IntArrayRef output_size, const Tensor & random_samples); // {"schema": "aten::fractional_max_pool2d(Tensor self, int[2] kernel_size, int[2] output_size, Tensor random_samples) -> (Tensor, Tensor)", "dispatch": "True", "default": "True"}
+Tensor & fractional_max_pool2d_backward_out(const Tensor & grad_output, const Tensor & self, IntArrayRef kernel_size, IntArrayRef output_size, const Tensor & indices, Tensor & grad_input); // {"schema": "aten::fractional_max_pool2d_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] output_size, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor fractional_max_pool2d_backward(const Tensor & grad_output, const Tensor & self, IntArrayRef kernel_size, IntArrayRef output_size, const Tensor & indices); // {"schema": "aten::fractional_max_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] output_size, Tensor indices) -> Tensor", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> fractional_max_pool3d_out(const Tensor & self, IntArrayRef kernel_size, IntArrayRef output_size, const Tensor & random_samples, Tensor & output, Tensor & indices); // {"schema": "aten::fractional_max_pool3d.output(Tensor self, int[3] kernel_size, int[3] output_size, Tensor random_samples, *, Tensor(a!) output, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> fractional_max_pool3d(const Tensor & self, IntArrayRef kernel_size, IntArrayRef output_size, const Tensor & random_samples); // {"schema": "aten::fractional_max_pool3d(Tensor self, int[3] kernel_size, int[3] output_size, Tensor random_samples) -> (Tensor, Tensor)", "dispatch": "True", "default": "True"}
+Tensor & fractional_max_pool3d_backward_out(const Tensor & grad_output, const Tensor & self, IntArrayRef kernel_size, IntArrayRef output_size, const Tensor & indices, Tensor & grad_input); // {"schema": "aten::fractional_max_pool3d_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] output_size, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor fractional_max_pool3d_backward(const Tensor & grad_output, const Tensor & self, IntArrayRef kernel_size, IntArrayRef output_size, const Tensor & indices); // {"schema": "aten::fractional_max_pool3d_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] output_size, Tensor indices) -> Tensor", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor &,Tensor &> max_pool2d_with_indices_out(const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode, Tensor & out, Tensor & indices); // {"schema": "aten::max_pool2d_with_indices.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> max_pool2d_with_indices(const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode); // {"schema": "aten::max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)", "dispatch": "True", "default": "True"}
+Tensor & max_pool2d_with_indices_backward_out(const Tensor & grad_output, const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode, const Tensor & indices, Tensor & grad_input); // {"schema": "aten::max_pool2d_with_indices_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor max_pool2d_with_indices_backward(const Tensor & grad_output, const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode, const Tensor & indices); // {"schema": "aten::max_pool2d_with_indices_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices) -> Tensor", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> max_pool3d_with_indices_out(const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode, Tensor & out, Tensor & indices); // {"schema": "aten::max_pool3d_with_indices.out(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "False"}
+::std::tuple<Tensor,Tensor> max_pool3d_with_indices(const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode); // {"schema": "aten::max_pool3d_with_indices(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)", "dispatch": "True", "default": "False"}
+Tensor & max_pool3d_with_indices_backward_out(const Tensor & grad_output, const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode, const Tensor & indices, Tensor & grad_input); // {"schema": "aten::max_pool3d_with_indices_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, int[3] dilation, bool ceil_mode, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor max_pool3d_with_indices_backward(const Tensor & grad_output, const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode, const Tensor & indices); // {"schema": "aten::max_pool3d_with_indices_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, int[3] dilation, bool ceil_mode, Tensor indices) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & max_unpool2d_out(const Tensor & self, const Tensor & indices, c10::SymIntArrayRef output_size, Tensor & out); // {"schema": "aten::max_unpool2d.out(Tensor self, Tensor indices, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor max_unpool2d(const Tensor & self, const Tensor & indices, c10::SymIntArrayRef output_size); // {"schema": "aten::max_unpool2d(Tensor self, Tensor indices, SymInt[2] output_size) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & max_unpool3d_out(const Tensor & self, const Tensor & indices, c10::SymIntArrayRef output_size, IntArrayRef stride, IntArrayRef padding, Tensor & out); // {"schema": "aten::max_unpool3d.out(Tensor self, Tensor indices, SymInt[3] output_size, int[3] stride, int[3] padding, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor max_unpool3d(const Tensor & self, const Tensor & indices, c10::SymIntArrayRef output_size, IntArrayRef stride, IntArrayRef padding); // {"schema": "aten::max_unpool3d(Tensor self, Tensor indices, SymInt[3] output_size, int[3] stride, int[3] padding) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & reflection_pad1d_out(const Tensor & self, c10::SymIntArrayRef padding, Tensor & out); // {"schema": "aten::reflection_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor reflection_pad1d(const Tensor & self, c10::SymIntArrayRef padding); // {"schema": "aten::reflection_pad1d(Tensor self, SymInt[2] padding) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & reflection_pad1d_backward_out(const Tensor & grad_output, const Tensor & self, c10::SymIntArrayRef padding, Tensor & grad_input); // {"schema": "aten::reflection_pad1d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[2] padding, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor reflection_pad1d_backward(const Tensor & grad_output, const Tensor & self, c10::SymIntArrayRef padding); // {"schema": "aten::reflection_pad1d_backward(Tensor grad_output, Tensor self, SymInt[2] padding) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & reflection_pad2d_out(const Tensor & self, c10::SymIntArrayRef padding, Tensor & out); // {"schema": "aten::reflection_pad2d.out(Tensor self, SymInt[4] padding, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor reflection_pad2d(const Tensor & self, c10::SymIntArrayRef padding); // {"schema": "aten::reflection_pad2d(Tensor self, SymInt[4] padding) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & reflection_pad2d_backward_out(const Tensor & grad_output, const Tensor & self, c10::SymIntArrayRef padding, Tensor & grad_input); // {"schema": "aten::reflection_pad2d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor reflection_pad2d_backward(const Tensor & grad_output, const Tensor & self, c10::SymIntArrayRef padding); // {"schema": "aten::reflection_pad2d_backward(Tensor grad_output, Tensor self, SymInt[4] padding) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & reflection_pad3d_out(const Tensor & self, c10::SymIntArrayRef padding, Tensor & out); // {"schema": "aten::reflection_pad3d.out(Tensor self, SymInt[6] padding, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor reflection_pad3d(const Tensor & self, c10::SymIntArrayRef padding); // {"schema": "aten::reflection_pad3d(Tensor self, SymInt[6] padding) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & reflection_pad3d_backward_out(const Tensor & grad_output, const Tensor & self, c10::SymIntArrayRef padding, Tensor & grad_input); // {"schema": "aten::reflection_pad3d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[6] padding, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor reflection_pad3d_backward(const Tensor & grad_output, const Tensor & self, c10::SymIntArrayRef padding); // {"schema": "aten::reflection_pad3d_backward(Tensor grad_output, Tensor self, SymInt[6] padding) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & replication_pad1d_out(const Tensor & self, c10::SymIntArrayRef padding, Tensor & out); // {"schema": "aten::replication_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor replication_pad1d(const Tensor & self, c10::SymIntArrayRef padding); // {"schema": "aten::replication_pad1d(Tensor self, SymInt[2] padding) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & replication_pad1d_backward_out(const Tensor & grad_output, const Tensor & self, c10::SymIntArrayRef padding, Tensor & grad_input); // {"schema": "aten::replication_pad1d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[2] padding, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor replication_pad1d_backward(const Tensor & grad_output, const Tensor & self, c10::SymIntArrayRef padding); // {"schema": "aten::replication_pad1d_backward(Tensor grad_output, Tensor self, SymInt[2] padding) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & replication_pad2d_out(const Tensor & self, c10::SymIntArrayRef padding, Tensor & out); // {"schema": "aten::replication_pad2d.out(Tensor self, SymInt[4] padding, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor replication_pad2d(const Tensor & self, c10::SymIntArrayRef padding); // {"schema": "aten::replication_pad2d(Tensor self, SymInt[4] padding) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & replication_pad2d_backward_out(const Tensor & grad_output, const Tensor & self, c10::SymIntArrayRef padding, Tensor & grad_input); // {"schema": "aten::replication_pad2d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor replication_pad2d_backward(const Tensor & grad_output, const Tensor & self, c10::SymIntArrayRef padding); // {"schema": "aten::replication_pad2d_backward(Tensor grad_output, Tensor self, SymInt[4] padding) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & replication_pad3d_out(const Tensor & self, c10::SymIntArrayRef padding, Tensor & out); // {"schema": "aten::replication_pad3d.out(Tensor self, SymInt[6] padding, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor replication_pad3d(const Tensor & self, c10::SymIntArrayRef padding); // {"schema": "aten::replication_pad3d(Tensor self, SymInt[6] padding) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & replication_pad3d_backward_out(const Tensor & grad_output, const Tensor & self, c10::SymIntArrayRef padding, Tensor & grad_input); // {"schema": "aten::replication_pad3d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[6] padding, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor replication_pad3d_backward(const Tensor & grad_output, const Tensor & self, c10::SymIntArrayRef padding); // {"schema": "aten::replication_pad3d_backward(Tensor grad_output, Tensor self, SymInt[6] padding) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _pad_circular(const Tensor & self, c10::SymIntArrayRef pad); // {"schema": "aten::_pad_circular(Tensor self, SymInt[] pad) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _pad_enum(const Tensor & self, c10::SymIntArrayRef pad, int64_t mode, c10::optional<double> value); // {"schema": "aten::_pad_enum(Tensor self, SymInt[] pad, int mode, float? value=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor pad(const Tensor & self, c10::SymIntArrayRef pad, c10::string_view mode, c10::optional<double> value); // {"schema": "aten::pad(Tensor self, SymInt[] pad, str mode=\"constant\", float? value=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor upsample_linear1d(const Tensor & input, OptionalSymIntArrayRef output_size, bool align_corners, c10::optional<ArrayRef<double>> scale_factors); // {"schema": "aten::upsample_linear1d.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor upsample_bilinear2d(const Tensor & input, OptionalSymIntArrayRef output_size, bool align_corners, c10::optional<ArrayRef<double>> scale_factors); // {"schema": "aten::upsample_bilinear2d.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _upsample_bilinear2d_aa(const Tensor & input, OptionalSymIntArrayRef output_size, bool align_corners, c10::optional<ArrayRef<double>> scale_factors); // {"schema": "aten::_upsample_bilinear2d_aa.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor upsample_trilinear3d(const Tensor & input, OptionalSymIntArrayRef output_size, bool align_corners, c10::optional<ArrayRef<double>> scale_factors); // {"schema": "aten::upsample_trilinear3d.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor upsample_bicubic2d(const Tensor & input, OptionalSymIntArrayRef output_size, bool align_corners, c10::optional<ArrayRef<double>> scale_factors); // {"schema": "aten::upsample_bicubic2d.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _upsample_bicubic2d_aa(const Tensor & input, OptionalSymIntArrayRef output_size, bool align_corners, c10::optional<ArrayRef<double>> scale_factors); // {"schema": "aten::_upsample_bicubic2d_aa.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor upsample_nearest1d(const Tensor & input, OptionalSymIntArrayRef output_size, c10::optional<ArrayRef<double>> scale_factors); // {"schema": "aten::upsample_nearest1d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _upsample_nearest_exact1d(const Tensor & input, OptionalSymIntArrayRef output_size, c10::optional<ArrayRef<double>> scale_factors); // {"schema": "aten::_upsample_nearest_exact1d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor upsample_nearest2d(const Tensor & input, OptionalSymIntArrayRef output_size, c10::optional<ArrayRef<double>> scale_factors); // {"schema": "aten::upsample_nearest2d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _upsample_nearest_exact2d(const Tensor & input, OptionalSymIntArrayRef output_size, c10::optional<ArrayRef<double>> scale_factors); // {"schema": "aten::_upsample_nearest_exact2d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor upsample_nearest3d(const Tensor & input, OptionalSymIntArrayRef output_size, c10::optional<ArrayRef<double>> scale_factors); // {"schema": "aten::upsample_nearest3d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _upsample_nearest_exact3d(const Tensor & input, OptionalSymIntArrayRef output_size, c10::optional<ArrayRef<double>> scale_factors); // {"schema": "aten::_upsample_nearest_exact3d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & upsample_linear1d_out(const Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales, Tensor & out); // {"schema": "aten::upsample_linear1d.out(Tensor self, SymInt[1] output_size, bool align_corners, float? scales=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor upsample_linear1d(const Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales); // {"schema": "aten::upsample_linear1d(Tensor self, SymInt[1] output_size, bool align_corners, float? scales=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & upsample_linear1d_backward_out(const Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales, Tensor & grad_input); // {"schema": "aten::upsample_linear1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, bool align_corners, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor upsample_linear1d_backward(const Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales); // {"schema": "aten::upsample_linear1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, bool align_corners, float? scales=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & upsample_bilinear2d_out(const Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, Tensor & out); // {"schema": "aten::upsample_bilinear2d.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor upsample_bilinear2d(const Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w); // {"schema": "aten::upsample_bilinear2d(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & upsample_bilinear2d_backward_out(const Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, Tensor & grad_input); // {"schema": "aten::upsample_bilinear2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor upsample_bilinear2d_backward(const Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w); // {"schema": "aten::upsample_bilinear2d_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & _upsample_bilinear2d_aa_out(const Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, Tensor & out); // {"schema": "aten::_upsample_bilinear2d_aa.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor _upsample_bilinear2d_aa(const Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w); // {"schema": "aten::_upsample_bilinear2d_aa(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & _upsample_bilinear2d_aa_backward_out(const Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, Tensor & grad_input); // {"schema": "aten::_upsample_bilinear2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor _upsample_bilinear2d_aa_backward(const Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w); // {"schema": "aten::_upsample_bilinear2d_aa_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & upsample_bicubic2d_out(const Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, Tensor & out); // {"schema": "aten::upsample_bicubic2d.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor upsample_bicubic2d(const Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w); // {"schema": "aten::upsample_bicubic2d(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & upsample_bicubic2d_backward_out(const Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, Tensor & grad_input); // {"schema": "aten::upsample_bicubic2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor upsample_bicubic2d_backward(const Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w); // {"schema": "aten::upsample_bicubic2d_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & _upsample_bicubic2d_aa_out(const Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, Tensor & out); // {"schema": "aten::_upsample_bicubic2d_aa.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor _upsample_bicubic2d_aa(const Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w); // {"schema": "aten::_upsample_bicubic2d_aa(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & _upsample_bicubic2d_aa_backward_out(const Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, Tensor & grad_input); // {"schema": "aten::_upsample_bicubic2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor _upsample_bicubic2d_aa_backward(const Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w); // {"schema": "aten::_upsample_bicubic2d_aa_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & upsample_trilinear3d_out(const Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_d, c10::optional<double> scales_h, c10::optional<double> scales_w, Tensor & out); // {"schema": "aten::upsample_trilinear3d.out(Tensor self, SymInt[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor upsample_trilinear3d(const Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_d, c10::optional<double> scales_h, c10::optional<double> scales_w); // {"schema": "aten::upsample_trilinear3d(Tensor self, SymInt[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & upsample_trilinear3d_backward_out(const Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_d, c10::optional<double> scales_h, c10::optional<double> scales_w, Tensor & grad_input); // {"schema": "aten::upsample_trilinear3d_backward.grad_input(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor upsample_trilinear3d_backward(const Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_d, c10::optional<double> scales_h, c10::optional<double> scales_w); // {"schema": "aten::upsample_trilinear3d_backward(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & upsample_nearest1d_out(const Tensor & self, c10::SymIntArrayRef output_size, c10::optional scales, Tensor & out); // {"schema": "aten::upsample_nearest1d.out(Tensor self, SymInt[1] output_size, float? scales=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & _upsample_nearest_exact1d_out(const Tensor & self, c10::SymIntArrayRef output_size, c10::optional scales, Tensor & out); // {"schema": "aten::_upsample_nearest_exact1d.out(Tensor self, SymInt[1] output_size, float? scales=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor upsample_nearest1d(const Tensor & self, c10::SymIntArrayRef output_size, c10::optional scales); // {"schema": "aten::upsample_nearest1d(Tensor self, SymInt[1] output_size, float? scales=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _upsample_nearest_exact1d(const Tensor & self, c10::SymIntArrayRef output_size, c10::optional scales); // {"schema": "aten::_upsample_nearest_exact1d(Tensor self, SymInt[1] output_size, float? scales=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & upsample_nearest1d_backward_out(const Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional scales, Tensor & grad_input); // {"schema": "aten::upsample_nearest1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & _upsample_nearest_exact1d_backward_out(const Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional scales, Tensor & grad_input); // {"schema": "aten::_upsample_nearest_exact1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor upsample_nearest1d_backward(const Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional scales); // {"schema": "aten::upsample_nearest1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _upsample_nearest_exact1d_backward(const Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional scales); // {"schema": "aten::_upsample_nearest_exact1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & upsample_nearest2d_out(const Tensor & self, c10::SymIntArrayRef output_size, c10::optional scales_h, c10::optional scales_w, Tensor & out); // {"schema": "aten::upsample_nearest2d.out(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & _upsample_nearest_exact2d_out(const Tensor & self, c10::SymIntArrayRef output_size, c10::optional scales_h, c10::optional scales_w, Tensor & out); // {"schema": "aten::_upsample_nearest_exact2d.out(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor upsample_nearest2d(const Tensor & self, c10::SymIntArrayRef output_size, c10::optional scales_h, c10::optional scales_w); // {"schema": "aten::upsample_nearest2d(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _upsample_nearest_exact2d(const Tensor & self, c10::SymIntArrayRef output_size, c10::optional scales_h, c10::optional scales_w); // {"schema": "aten::_upsample_nearest_exact2d(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & upsample_nearest2d_backward_out(const Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional scales_h, c10::optional scales_w, Tensor & grad_input); // {"schema": "aten::upsample_nearest2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & _upsample_nearest_exact2d_backward_out(const Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional scales_h, c10::optional scales_w, Tensor & grad_input); // {"schema": "aten::_upsample_nearest_exact2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor upsample_nearest2d_backward(const Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional scales_h, c10::optional scales_w); // {"schema": "aten::upsample_nearest2d_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, float? scales_h=None, float? scales_w=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _upsample_nearest_exact2d_backward(const Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional scales_h, c10::optional scales_w); // {"schema": "aten::_upsample_nearest_exact2d_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, float? scales_h=None, float? scales_w=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & upsample_nearest3d_out(const Tensor & self, c10::SymIntArrayRef output_size, c10::optional scales_d, c10::optional scales_h, c10::optional scales_w, Tensor & out); // {"schema": "aten::upsample_nearest3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & _upsample_nearest_exact3d_out(const Tensor & self, c10::SymIntArrayRef output_size, c10::optional scales_d, c10::optional scales_h, c10::optional scales_w, Tensor & out); // {"schema": "aten::_upsample_nearest_exact3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor upsample_nearest3d(const Tensor & self, c10::SymIntArrayRef output_size, c10::optional scales_d, c10::optional scales_h, c10::optional scales_w); // {"schema": "aten::upsample_nearest3d(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _upsample_nearest_exact3d(const Tensor & self, c10::SymIntArrayRef output_size, c10::optional scales_d, c10::optional scales_h, c10::optional scales_w); // {"schema": "aten::_upsample_nearest_exact3d(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & upsample_nearest3d_backward_out(const Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional scales_d, c10::optional scales_h, c10::optional scales_w, Tensor & grad_input); // {"schema": "aten::upsample_nearest3d_backward.grad_input(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & _upsample_nearest_exact3d_backward_out(const Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional scales_d, c10::optional scales_h, c10::optional scales_w, Tensor & grad_input); // {"schema": "aten::_upsample_nearest_exact3d_backward.grad_input(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor upsample_nearest3d_backward(const Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional scales_d, c10::optional scales_h, c10::optional scales_w); // {"schema": "aten::upsample_nearest3d_backward(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _upsample_nearest_exact3d_backward(const Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional scales_d, c10::optional scales_h, c10::optional scales_w); // {"schema": "aten::_upsample_nearest_exact3d_backward(Tensor grad_output, SymInt[3] output_size, SymInt[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & sigmoid_backward_out(const Tensor & grad_output, const Tensor & output, Tensor & grad_input); // {"schema": "aten::sigmoid_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor sigmoid_backward(const Tensor & grad_output, const Tensor & output); // {"schema": "aten::sigmoid_backward(Tensor grad_output, Tensor output) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & logit_backward_out(const Tensor & grad_output, const Tensor & self, c10::optional eps, Tensor & grad_input); // {"schema": "aten::logit_backward.grad_input(Tensor grad_output, Tensor self, float? eps=None, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor logit_backward(const Tensor & grad_output, const Tensor & self, c10::optional eps); // {"schema": "aten::logit_backward(Tensor grad_output, Tensor self, float? eps=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & tanh_backward_out(const Tensor & grad_output, const Tensor & output, Tensor & grad_input); // {"schema": "aten::tanh_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor tanh_backward(const Tensor & grad_output, const Tensor & output); // {"schema": "aten::tanh_backward(Tensor grad_output, Tensor output) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & slow_conv_transpose2d_out(const Tensor & self, const Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef dilation, Tensor & out); // {"schema": "aten::slow_conv_transpose2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt[2] dilation=1, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor slow_conv_transpose2d(const Tensor & self, const Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef dilation); // {"schema": "aten::slow_conv_transpose2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt[2] dilation=1) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & slow_conv_transpose3d_out(const Tensor & self, const Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef dilation, Tensor & out); // {"schema": "aten::slow_conv_transpose3d.out(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, SymInt[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor slow_conv_transpose3d(const Tensor & self, const Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef dilation); // {"schema": "aten::slow_conv_transpose3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, SymInt[3] dilation=1) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & thnn_conv2d_out(const Tensor & self, const Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, Tensor & out); // {"schema": "aten::thnn_conv2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor thnn_conv2d(const Tensor & self, const Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding); // {"schema": "aten::thnn_conv2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & _slow_conv2d_forward_out(const Tensor & self, const Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, Tensor & output); // {"schema": "aten::_slow_conv2d_forward.output(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, *, Tensor(a!) output) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor _slow_conv2d_forward(const Tensor & self, const Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding); // {"schema": "aten::_slow_conv2d_forward(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding) -> Tensor", "dispatch": "True", "default": "False"}
+::std::tuple _slow_conv2d_backward_out(const Tensor & grad_output, const Tensor & self, const Tensor & weight, c10::SymIntArrayRef kernel_size, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, Tensor & grad_input, Tensor & grad_weight, Tensor & grad_bias); // {"schema": "aten::_slow_conv2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, SymInt[2] kernel_size, SymInt[2] stride, SymInt[2] padding, *, Tensor(a!) grad_input, Tensor(b!) grad_weight, Tensor(c!) grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!))", "dispatch": "True", "default": "False"}
+::std::tuple _slow_conv2d_backward(const Tensor & grad_output, const Tensor & self, const Tensor & weight, c10::SymIntArrayRef kernel_size, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, ::std::array output_mask); // {"schema": "aten::_slow_conv2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, SymInt[2] kernel_size, SymInt[2] stride, SymInt[2] padding, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)", "dispatch": "True", "default": "False"}
+const Tensor & _conv_depthwise2d_out(const Tensor & self, const Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, const Tensor & out); // {"schema": "aten::_conv_depthwise2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, SymInt[2] dilation, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor _conv_depthwise2d(const Tensor & self, const Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation); // {"schema": "aten::_conv_depthwise2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, SymInt[2] dilation) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor conv_depthwise3d(const Tensor & self, const Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation); // {"schema": "aten::conv_depthwise3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding, SymInt[3] dilation) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & slow_conv3d_out(const Tensor & self, const Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, Tensor & out); // {"schema": "aten::slow_conv3d.out(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor slow_conv3d(const Tensor & self, const Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding); // {"schema": "aten::slow_conv3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & slow_conv3d_forward_out(const Tensor & self, const Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, Tensor & output); // {"schema": "aten::slow_conv3d_forward.output(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding, *, Tensor(a!) output) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor slow_conv3d_forward(const Tensor & self, const Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding); // {"schema": "aten::slow_conv3d_forward(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor slow_conv_dilated2d(const Tensor & self, const Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation); // {"schema": "aten::slow_conv_dilated2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] dilation=1) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor slow_conv_dilated3d(const Tensor & self, const Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation); // {"schema": "aten::slow_conv_dilated3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] dilation=1) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & col2im_out(const Tensor & self, c10::SymIntArrayRef output_size, IntArrayRef kernel_size, IntArrayRef dilation, IntArrayRef padding, IntArrayRef stride, Tensor & out); // {"schema": "aten::col2im.out(Tensor self, SymInt[2] output_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor col2im(const Tensor & self, c10::SymIntArrayRef output_size, IntArrayRef kernel_size, IntArrayRef dilation, IntArrayRef padding, IntArrayRef stride); // {"schema": "aten::col2im(Tensor self, SymInt[2] output_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor column_stack(TensorList tensors); // {"schema": "aten::column_stack(Tensor[] tensors) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & column_stack_out(TensorList tensors, Tensor & out); // {"schema": "aten::column_stack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & im2col_out(const Tensor & self, IntArrayRef kernel_size, IntArrayRef dilation, IntArrayRef padding, IntArrayRef stride, Tensor & out); // {"schema": "aten::im2col.out(Tensor self, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor im2col(const Tensor & self, IntArrayRef kernel_size, IntArrayRef dilation, IntArrayRef padding, IntArrayRef stride); // {"schema": "aten::im2col(Tensor self, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor isfinite(const Tensor & self); // {"schema": "aten::isfinite(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor isinf(const Tensor & self); // {"schema": "aten::isinf(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+void record_stream(Tensor & self, Stream s); // {"schema": "aten::record_stream(Tensor(a!) self, Stream s) -> ()", "dispatch": "True", "default": "False"}
+Tensor isposinf(const Tensor & self); // {"schema": "aten::isposinf(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & isposinf_out(const Tensor & self, Tensor & out); // {"schema": "aten::isposinf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor isneginf(const Tensor & self); // {"schema": "aten::isneginf(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & isneginf_out(const Tensor & self, Tensor & out); // {"schema": "aten::isneginf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor _add_batch_dim(const Tensor & self, int64_t batch_dim, int64_t level); // {"schema": "aten::_add_batch_dim(Tensor self, int batch_dim, int level) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _remove_batch_dim(const Tensor & self, int64_t level, int64_t batch_size, int64_t out_dim); // {"schema": "aten::_remove_batch_dim(Tensor self, int level, int batch_size, int out_dim) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor special_entr(const Tensor & self); // {"schema": "aten::special_entr(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_entr_out(const Tensor & self, Tensor & out); // {"schema": "aten::special_entr.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor special_ndtri(const Tensor & self); // {"schema": "aten::special_ndtri(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_ndtri_out(const Tensor & self, Tensor & out); // {"schema": "aten::special_ndtri.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor special_log_ndtr(const Tensor & self); // {"schema": "aten::special_log_ndtr(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_log_ndtr_out(const Tensor & self, Tensor & out); // {"schema": "aten::special_log_ndtr.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor special_expm1(const Tensor & self); // {"schema": "aten::special_expm1(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & special_expm1_out(const Tensor & self, Tensor & out); // {"schema": "aten::special_expm1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor special_exp2(const Tensor & self); // {"schema": "aten::special_exp2(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & special_exp2_out(const Tensor & self, Tensor & out); // {"schema": "aten::special_exp2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor special_psi(const Tensor & self); // {"schema": "aten::special_psi(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & special_psi_out(const Tensor & self, Tensor & out); // {"schema": "aten::special_psi.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor special_digamma(const Tensor & self); // {"schema": "aten::special_digamma(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & special_digamma_out(const Tensor & self, Tensor & out); // {"schema": "aten::special_digamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor special_gammaln(const Tensor & self); // {"schema": "aten::special_gammaln(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & special_gammaln_out(const Tensor & self, Tensor & out); // {"schema": "aten::special_gammaln.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor special_erf(const Tensor & self); // {"schema": "aten::special_erf(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & special_erf_out(const Tensor & self, Tensor & out); // {"schema": "aten::special_erf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor special_erfc(const Tensor & self); // {"schema": "aten::special_erfc(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & special_erfc_out(const Tensor & self, Tensor & out); // {"schema": "aten::special_erfc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor special_erfcx(const Tensor & self); // {"schema": "aten::special_erfcx(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_erfcx_out(const Tensor & self, Tensor & out); // {"schema": "aten::special_erfcx.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor special_erfinv(const Tensor & self); // {"schema": "aten::special_erfinv(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & special_erfinv_out(const Tensor & self, Tensor & out); // {"schema": "aten::special_erfinv.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor special_ndtr(const Tensor & self); // {"schema": "aten::special_ndtr(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & special_ndtr_out(const Tensor & self, Tensor & out); // {"schema": "aten::special_ndtr.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor special_xlog1py(const Tensor & self, const Tensor & other); // {"schema": "aten::special_xlog1py(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor special_xlog1py(const Scalar & self, const Tensor & other); // {"schema": "aten::special_xlog1py.self_scalar(Scalar self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor special_xlog1py(const Tensor & self, const Scalar & other); // {"schema": "aten::special_xlog1py.other_scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_xlog1py_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::special_xlog1py.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & special_xlog1py_out(const Scalar & self, const Tensor & other, Tensor & out); // {"schema": "aten::special_xlog1py.self_scalar_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & special_xlog1py_out(const Tensor & self, const Scalar & other, Tensor & out); // {"schema": "aten::special_xlog1py.other_scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor special_xlogy(const Tensor & self, const Tensor & other); // {"schema": "aten::special_xlogy(Tensor self, Tensor other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor special_xlogy(const Scalar & self, const Tensor & other); // {"schema": "aten::special_xlogy.self_scalar(Scalar self, Tensor other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor special_xlogy(const Tensor & self, const Scalar & other); // {"schema": "aten::special_xlogy.other_scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & special_xlogy_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::special_xlogy.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & special_xlogy_out(const Scalar & self, const Tensor & other, Tensor & out); // {"schema": "aten::special_xlogy.self_scalar_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & special_xlogy_out(const Tensor & self, const Scalar & other, Tensor & out); // {"schema": "aten::special_xlogy.other_scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor special_zeta(const Tensor & self, const Tensor & other); // {"schema": "aten::special_zeta(Tensor self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor special_zeta(const Scalar & self, const Tensor & other); // {"schema": "aten::special_zeta.self_scalar(Scalar self, Tensor other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor special_zeta(const Tensor & self, const Scalar & other); // {"schema": "aten::special_zeta.other_scalar(Tensor self, Scalar other) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_zeta_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::special_zeta.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & special_zeta_out(const Scalar & self, const Tensor & other, Tensor & out); // {"schema": "aten::special_zeta.self_scalar_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & special_zeta_out(const Tensor & self, const Scalar & other, Tensor & out); // {"schema": "aten::special_zeta.other_scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor special_i0(const Tensor & self); // {"schema": "aten::special_i0(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & special_i0_out(const Tensor & self, Tensor & out); // {"schema": "aten::special_i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor special_i0e(const Tensor & self); // {"schema": "aten::special_i0e(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_i0e_out(const Tensor & self, Tensor & out); // {"schema": "aten::special_i0e.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor special_i1(const Tensor & self); // {"schema": "aten::special_i1(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_i1_out(const Tensor & self, Tensor & out); // {"schema": "aten::special_i1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor special_i1e(const Tensor & self); // {"schema": "aten::special_i1e(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_i1e_out(const Tensor & self, Tensor & out); // {"schema": "aten::special_i1e.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor special_logit(const Tensor & self, c10::optional eps); // {"schema": "aten::special_logit(Tensor self, float? eps=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & special_logit_out(const Tensor & self, c10::optional eps, Tensor & out); // {"schema": "aten::special_logit.out(Tensor self, float? eps=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor special_polygamma(int64_t n, const Tensor & self); // {"schema": "aten::special_polygamma(int n, Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & special_polygamma_out(int64_t n, const Tensor & self, Tensor & out); // {"schema": "aten::special_polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor special_logsumexp(const Tensor & self, IntArrayRef dim, bool keepdim); // {"schema": "aten::special_logsumexp(Tensor self, int[1] dim, bool keepdim=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & special_logsumexp_out(const Tensor & self, IntArrayRef dim, bool keepdim, Tensor & out); // {"schema": "aten::special_logsumexp.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor special_expit(const Tensor & self); // {"schema": "aten::special_expit(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & special_expit_out(const Tensor & self, Tensor & out); // {"schema": "aten::special_expit.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor special_sinc(const Tensor & self); // {"schema": "aten::special_sinc(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & special_sinc_out(const Tensor & self, Tensor & out); // {"schema": "aten::special_sinc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor special_round(const Tensor & self, int64_t decimals); // {"schema": "aten::special_round(Tensor self, *, int decimals=0) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & special_round_out(const Tensor & self, int64_t decimals, Tensor & out); // {"schema": "aten::special_round.out(Tensor self, *, int decimals=0, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor special_log1p(const Tensor & self); // {"schema": "aten::special_log1p(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & special_log1p_out(const Tensor & self, Tensor & out); // {"schema": "aten::special_log1p.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor special_log_softmax(const Tensor & self, int64_t dim, c10::optional dtype); // {"schema": "aten::special_log_softmax(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & special_gammainc_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::special_gammainc.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor special_gammainc(const Tensor & self, const Tensor & other); // {"schema": "aten::special_gammainc(Tensor self, Tensor other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & special_gammaincc_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::special_gammaincc.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor special_gammaincc(const Tensor & self, const Tensor & other); // {"schema": "aten::special_gammaincc(Tensor self, Tensor other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor special_multigammaln(const Tensor & self, int64_t p); // {"schema": "aten::special_multigammaln(Tensor self, int p) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & special_multigammaln_out(const Tensor & self, int64_t p, Tensor & out); // {"schema": "aten::special_multigammaln.out(Tensor self, int p, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor special_softmax(const Tensor & self, int64_t dim, c10::optional dtype); // {"schema": "aten::special_softmax(Tensor self, int dim, ScalarType? dtype=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor fft_fft(const Tensor & self, c10::optional n, int64_t dim, c10::optional norm); // {"schema": "aten::fft_fft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & fft_fft_out(const Tensor & self, c10::optional n, int64_t dim, c10::optional norm, Tensor & out); // {"schema": "aten::fft_fft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor fft_ifft(const Tensor & self, c10::optional n, int64_t dim, c10::optional norm); // {"schema": "aten::fft_ifft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & fft_ifft_out(const Tensor & self, c10::optional n, int64_t dim, c10::optional norm, Tensor & out); // {"schema": "aten::fft_ifft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor fft_rfft(const Tensor & self, c10::optional n, int64_t dim, c10::optional norm); // {"schema": "aten::fft_rfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & fft_rfft_out(const Tensor & self, c10::optional n, int64_t dim, c10::optional norm, Tensor & out); // {"schema": "aten::fft_rfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor fft_irfft(const Tensor & self, c10::optional n, int64_t dim, c10::optional norm); // {"schema": "aten::fft_irfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & fft_irfft_out(const Tensor & self, c10::optional n, int64_t dim, c10::optional norm, Tensor & out); // {"schema": "aten::fft_irfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor fft_hfft(const Tensor & self, c10::optional n, int64_t dim, c10::optional norm); // {"schema": "aten::fft_hfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & fft_hfft_out(const Tensor & self, c10::optional n, int64_t dim, c10::optional norm, Tensor & out); // {"schema": "aten::fft_hfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor fft_ihfft(const Tensor & self, c10::optional n, int64_t dim, c10::optional norm); // {"schema": "aten::fft_ihfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & fft_ihfft_out(const Tensor & self, c10::optional n, int64_t dim, c10::optional norm, Tensor & out); // {"schema": "aten::fft_ihfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor fft_fft2(const Tensor & self, OptionalSymIntArrayRef s, IntArrayRef dim, c10::optional norm); // {"schema": "aten::fft_fft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & fft_fft2_out(const Tensor & self, OptionalSymIntArrayRef s, IntArrayRef dim, c10::optional norm, Tensor & out); // {"schema": "aten::fft_fft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor fft_ifft2(const Tensor & self, OptionalSymIntArrayRef s, IntArrayRef dim, c10::optional norm); // {"schema": "aten::fft_ifft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & fft_ifft2_out(const Tensor & self, OptionalSymIntArrayRef s, IntArrayRef dim, c10::optional norm, Tensor & out); // {"schema": "aten::fft_ifft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor fft_rfft2(const Tensor & self, OptionalSymIntArrayRef s, IntArrayRef dim, c10::optional norm); // {"schema": "aten::fft_rfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & fft_rfft2_out(const Tensor & self, OptionalSymIntArrayRef s, IntArrayRef dim, c10::optional norm, Tensor & out); // {"schema": "aten::fft_rfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor fft_irfft2(const Tensor & self, OptionalSymIntArrayRef s, IntArrayRef dim, c10::optional norm); // {"schema": "aten::fft_irfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & fft_irfft2_out(const Tensor & self, OptionalSymIntArrayRef s, IntArrayRef dim, c10::optional norm, Tensor & out); // {"schema": "aten::fft_irfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor fft_hfft2(const Tensor & self, OptionalSymIntArrayRef s, IntArrayRef dim, c10::optional norm); // {"schema": "aten::fft_hfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor", "dispatch": "False", "default": "True"}
+const Tensor & fft_hfft2_out(const Tensor & self, OptionalSymIntArrayRef s, IntArrayRef dim, c10::optional norm, const Tensor & out); // {"schema": "aten::fft_hfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor fft_ihfft2(const Tensor & self, OptionalSymIntArrayRef s, IntArrayRef dim, c10::optional norm); // {"schema": "aten::fft_ihfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor", "dispatch": "False", "default": "True"}
+const Tensor & fft_ihfft2_out(const Tensor & self, OptionalSymIntArrayRef s, IntArrayRef dim, c10::optional norm, const Tensor & out); // {"schema": "aten::fft_ihfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor fft_fftn(const Tensor & self, OptionalSymIntArrayRef s, OptionalIntArrayRef dim, c10::optional norm); // {"schema": "aten::fft_fftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & fft_fftn_out(const Tensor & self, OptionalSymIntArrayRef s, OptionalIntArrayRef dim, c10::optional norm, Tensor & out); // {"schema": "aten::fft_fftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor fft_ifftn(const Tensor & self, OptionalSymIntArrayRef s, OptionalIntArrayRef dim, c10::optional norm); // {"schema": "aten::fft_ifftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & fft_ifftn_out(const Tensor & self, OptionalSymIntArrayRef s, OptionalIntArrayRef dim, c10::optional norm, Tensor & out); // {"schema": "aten::fft_ifftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor fft_rfftn(const Tensor & self, OptionalSymIntArrayRef s, OptionalIntArrayRef dim, c10::optional norm); // {"schema": "aten::fft_rfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & fft_rfftn_out(const Tensor & self, OptionalSymIntArrayRef s, OptionalIntArrayRef dim, c10::optional norm, Tensor & out); // {"schema": "aten::fft_rfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor fft_irfftn(const Tensor & self, OptionalSymIntArrayRef s, OptionalIntArrayRef dim, c10::optional norm); // {"schema": "aten::fft_irfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & fft_irfftn_out(const Tensor & self, OptionalSymIntArrayRef s, OptionalIntArrayRef dim, c10::optional norm, Tensor & out); // {"schema": "aten::fft_irfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor fft_hfftn(const Tensor & self, OptionalSymIntArrayRef s, OptionalIntArrayRef dim, c10::optional norm); // {"schema": "aten::fft_hfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor", "dispatch": "False", "default": "True"}
+const Tensor & fft_hfftn_out(const Tensor & self, OptionalSymIntArrayRef s, OptionalIntArrayRef dim, c10::optional norm, const Tensor & out); // {"schema": "aten::fft_hfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor fft_ihfftn(const Tensor & self, OptionalSymIntArrayRef s, OptionalIntArrayRef dim, c10::optional norm); // {"schema": "aten::fft_ihfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor", "dispatch": "False", "default": "True"}
+const Tensor & fft_ihfftn_out(const Tensor & self, OptionalSymIntArrayRef s, OptionalIntArrayRef dim, c10::optional norm, const Tensor & out); // {"schema": "aten::fft_ihfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor fft_fftfreq(int64_t n, double d, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory); // {"schema": "aten::fft_fftfreq(int n, float d=1.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & fft_fftfreq_out(int64_t n, double d, Tensor & out); // {"schema": "aten::fft_fftfreq.out(int n, float d=1.0, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor fft_rfftfreq(int64_t n, double d, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory); // {"schema": "aten::fft_rfftfreq(int n, float d=1.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & fft_rfftfreq_out(int64_t n, double d, Tensor & out); // {"schema": "aten::fft_rfftfreq.out(int n, float d=1.0, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor fft_fftshift(const Tensor & self, OptionalIntArrayRef dim); // {"schema": "aten::fft_fftshift(Tensor self, int[1]? dim=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor fft_ifftshift(const Tensor & self, OptionalIntArrayRef dim); // {"schema": "aten::fft_ifftshift(Tensor self, int[1]? dim=None) -> Tensor", "dispatch": "False", "default": "True"}
+::std::tuple linalg_cholesky_ex(const Tensor & self, bool upper, bool check_errors); // {"schema": "aten::linalg_cholesky_ex(Tensor self, *, bool upper=False, bool check_errors=False) -> (Tensor L, Tensor info)", "dispatch": "True", "default": "True"}
+::std::tuple linalg_cholesky_ex_out(const Tensor & self, bool upper, bool check_errors, Tensor & L, Tensor & info); // {"schema": "aten::linalg_cholesky_ex.L(Tensor self, *, bool upper=False, bool check_errors=False, Tensor(a!) L, Tensor(b!) info) -> (Tensor(a!) L, Tensor(b!) info)", "dispatch": "True", "default": "False"}
+Tensor linalg_cholesky(const Tensor & self, bool upper); // {"schema": "aten::linalg_cholesky(Tensor self, *, bool upper=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & linalg_cholesky_out(const Tensor & self, bool upper, Tensor & out); // {"schema": "aten::linalg_cholesky.out(Tensor self, *, bool upper=False, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor linalg_cross(const Tensor & self, const Tensor & other, int64_t dim); // {"schema": "aten::linalg_cross(Tensor self, Tensor other, *, int dim=-1) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & linalg_cross_out(const Tensor & self, const Tensor & other, int64_t dim, Tensor & out); // {"schema": "aten::linalg_cross.out(Tensor self, Tensor other, *, int dim=-1, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+::std::tuple linalg_lu_factor(const Tensor & A, bool pivot); // {"schema": "aten::linalg_lu_factor(Tensor A, *, bool pivot=True) -> (Tensor LU, Tensor pivots)", "dispatch": "False", "default": "True"}
+::std::tuple linalg_lu_factor_out(const Tensor & A, bool pivot, Tensor & LU, Tensor & pivots); // {"schema": "aten::linalg_lu_factor.out(Tensor A, *, bool pivot=True, Tensor(a!) LU, Tensor(b!) pivots) -> (Tensor(a!) LU, Tensor(b!) pivots)", "dispatch": "False", "default": "True"}
+::std::tuple linalg_lu_factor_ex(const Tensor & A, bool pivot, bool check_errors); // {"schema": "aten::linalg_lu_factor_ex(Tensor A, *, bool pivot=True, bool check_errors=False) -> (Tensor LU, Tensor pivots, Tensor info)", "dispatch": "True", "default": "True"}
+::std::tuple linalg_lu_factor_ex_out(const Tensor & A, bool pivot, bool check_errors, Tensor & LU, Tensor & pivots, Tensor & info); // {"schema": "aten::linalg_lu_factor_ex.out(Tensor A, *, bool pivot=True, bool check_errors=False, Tensor(a!) LU, Tensor(b!) pivots, Tensor(c!) info) -> (Tensor(a!) LU, Tensor(b!) pivots, Tensor(c!) info)", "dispatch": "True", "default": "False"}
+::std::tuple linalg_lu(const Tensor & A, bool pivot); // {"schema": "aten::linalg_lu(Tensor A, *, bool pivot=True) -> (Tensor P, Tensor L, Tensor U)", "dispatch": "True", "default": "True"}
+::std::tuple linalg_lu_out(const Tensor & A, bool pivot, Tensor & P, Tensor & L, Tensor & U); // {"schema": "aten::linalg_lu.out(Tensor A, *, bool pivot=True, Tensor(a!) P, Tensor(b!) L, Tensor(c!) U) -> (Tensor(a!) P, Tensor(b!) L, Tensor(c!) U)", "dispatch": "True", "default": "False"}
+Tensor linalg_lu_solve(const Tensor & LU, const Tensor & pivots, const Tensor & B, bool left, bool adjoint); // {"schema": "aten::linalg_lu_solve(Tensor LU, Tensor pivots, Tensor B, *, bool left=True, bool adjoint=False) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & linalg_lu_solve_out(const Tensor & LU, const Tensor & pivots, const Tensor & B, bool left, bool adjoint, Tensor & out); // {"schema": "aten::linalg_lu_solve.out(Tensor LU, Tensor pivots, Tensor B, *, bool left=True, bool adjoint=False, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+::std::tuple _linalg_det(const Tensor & A); // {"schema": "aten::_linalg_det(Tensor A) -> (Tensor result, Tensor LU, Tensor pivots)", "dispatch": "True", "default": "True"}
+::std::tuple _linalg_det_out(const Tensor & A, Tensor & result, Tensor & LU, Tensor & pivots); // {"schema": "aten::_linalg_det.result(Tensor A, *, Tensor(a!) result, Tensor(b!) LU, Tensor(c!) pivots) -> (Tensor(a!) result, Tensor(b!) LU, Tensor(c!) pivots)", "dispatch": "True", "default": "False"}
+Tensor linalg_det(const Tensor & A); // {"schema": "aten::linalg_det(Tensor A) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & linalg_det_out(const Tensor & A, Tensor & out); // {"schema": "aten::linalg_det.out(Tensor A, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor det(const Tensor & self); // {"schema": "aten::det(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+::std::tuple linalg_ldl_factor_ex(const Tensor & self, bool hermitian, bool check_errors); // {"schema": "aten::linalg_ldl_factor_ex(Tensor self, *, bool hermitian=False, bool check_errors=False) -> (Tensor LD, Tensor pivots, Tensor info)", "dispatch": "True", "default": "True"}
+::std::tuple linalg_ldl_factor_ex_out(const Tensor & self, bool hermitian, bool check_errors, Tensor & LD, Tensor & pivots, Tensor & info); // {"schema": "aten::linalg_ldl_factor_ex.out(Tensor self, *, bool hermitian=False, bool check_errors=False, Tensor(a!) LD, Tensor(b!) pivots, Tensor(c!) info) -> (Tensor(a!) LD, Tensor(b!) pivots, Tensor(c!) info)", "dispatch": "True", "default": "False"}
+::std::tuple linalg_ldl_factor(const Tensor & self, bool hermitian); // {"schema": "aten::linalg_ldl_factor(Tensor self, *, bool hermitian=False) -> (Tensor LD, Tensor pivots)", "dispatch": "False", "default": "True"}
+::std::tuple linalg_ldl_factor_out(const Tensor & self, bool hermitian, Tensor & LD, Tensor & pivots); // {"schema": "aten::linalg_ldl_factor.out(Tensor self, *, bool hermitian=False, Tensor(a!) LD, Tensor(b!) pivots) -> (Tensor(a!) LD, Tensor(b!) pivots)", "dispatch": "False", "default": "True"}
+Tensor linalg_ldl_solve(const Tensor & LD, const Tensor & pivots, const Tensor & B, bool hermitian); // {"schema": "aten::linalg_ldl_solve(Tensor LD, Tensor pivots, Tensor B, *, bool hermitian=False) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & linalg_ldl_solve_out(const Tensor & LD, const Tensor & pivots, const Tensor & B, bool hermitian, Tensor & out); // {"schema": "aten::linalg_ldl_solve.out(Tensor LD, Tensor pivots, Tensor B, *, bool hermitian=False, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+::std::tuple linalg_lstsq(const Tensor & self, const Tensor & b, c10::optional rcond, c10::optional driver); // {"schema": "aten::linalg_lstsq(Tensor self, Tensor b, float? rcond=None, *, str? driver=None) -> (Tensor solution, Tensor residuals, Tensor rank, Tensor singular_values)", "dispatch": "True", "default": "True"}
+::std::tuple linalg_lstsq_out(const Tensor & self, const Tensor & b, c10::optional rcond, c10::optional driver, Tensor & solution, Tensor & residuals, Tensor & rank, Tensor & singular_values); // {"schema": "aten::linalg_lstsq.out(Tensor self, Tensor b, float? rcond=None, *, str? driver=None, Tensor(a!) solution, Tensor(b!) residuals, Tensor(c!) rank, Tensor(d!) singular_values) -> (Tensor(a!) solution, Tensor(b!) residuals, Tensor(c!) rank, Tensor(d!) singular_values)", "dispatch": "True", "default": "False"}
+Tensor linalg_matmul(const Tensor & self, const Tensor & other); // {"schema": "aten::linalg_matmul(Tensor self, Tensor other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & linalg_matmul_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::linalg_matmul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor linalg_vecdot(const Tensor & x, const Tensor & y, int64_t dim); // {"schema": "aten::linalg_vecdot(Tensor x, Tensor y, *, int dim=-1) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & linalg_vecdot_out(const Tensor & x, const Tensor & y, int64_t dim, Tensor & out); // {"schema": "aten::linalg_vecdot.out(Tensor x, Tensor y, *, int dim=-1, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor linalg_matrix_exp(const Tensor & self); // {"schema": "aten::linalg_matrix_exp(Tensor self) -> Tensor", "dispatch": "True", "default": "False"}
+::std::tuple _linalg_slogdet(const Tensor & A); // {"schema": "aten::_linalg_slogdet(Tensor A) -> (Tensor sign, Tensor logabsdet, Tensor LU, Tensor pivots)", "dispatch": "True", "default": "True"}
+::std::tuple _linalg_slogdet_out(const Tensor & A, Tensor & sign, Tensor & logabsdet, Tensor & LU, Tensor & pivots); // {"schema": "aten::_linalg_slogdet.sign(Tensor A, *, Tensor(a!) sign, Tensor(b!) logabsdet, Tensor(c!) LU, Tensor(d!) pivots) -> (Tensor(a!) sign, Tensor(b!) logabsdet, Tensor(c!) LU, Tensor(d!) pivots)", "dispatch": "True", "default": "False"}
+::std::tuple linalg_slogdet(const Tensor & A); // {"schema": "aten::linalg_slogdet(Tensor A) -> (Tensor sign, Tensor logabsdet)", "dispatch": "False", "default": "True"}
+::std::tuple linalg_slogdet_out(const Tensor & A, Tensor & sign, Tensor & logabsdet); // {"schema": "aten::linalg_slogdet.out(Tensor A, *, Tensor(a!) sign, Tensor(b!) logabsdet) -> (Tensor(a!) sign, Tensor(b!) logabsdet)", "dispatch": "False", "default": "True"}
+::std::tuple slogdet(const Tensor & self); // {"schema": "aten::slogdet(Tensor self) -> (Tensor sign, Tensor logabsdet)", "dispatch": "False", "default": "True"}
+::std::tuple slogdet_out(const Tensor & self, Tensor & sign, Tensor & logabsdet); // {"schema": "aten::slogdet.out(Tensor self, *, Tensor(a!) sign, Tensor(b!) logabsdet) -> (Tensor(a!) sign, Tensor(b!) logabsdet)", "dispatch": "False", "default": "True"}
+Tensor logdet(const Tensor & self); // {"schema": "aten::logdet(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
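+// [Editor's note: illustrative sketch, not part of the generated header.] For the
+// determinant family above, linalg_slogdet is the numerically safer entry point;
+// assuming the at:: bindings (<ATen/ATen.h>):
+//
+//   at::Tensor A = at::randn({4, 4});
+//   auto [sign, logabsdet] = at::linalg_slogdet(A);  // det(A) == sign * exp(logabsdet)
+//   at::Tensor d = at::linalg_det(A);                // direct determinant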
+::std::tuple linalg_eig(const Tensor & self); // {"schema": "aten::linalg_eig(Tensor self) -> (Tensor eigenvalues, Tensor eigenvectors)", "dispatch": "True", "default": "False"}
+::std::tuple linalg_eig_out(const Tensor & self, Tensor & eigenvalues, Tensor & eigenvectors); // {"schema": "aten::linalg_eig.out(Tensor self, *, Tensor(a!) eigenvalues, Tensor(b!) eigenvectors) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors)", "dispatch": "True", "default": "False"}
+Tensor _linalg_eigvals(const Tensor & self); // {"schema": "aten::_linalg_eigvals(Tensor self) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor linalg_eigvals(const Tensor & self); // {"schema": "aten::linalg_eigvals(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & linalg_eigvals_out(const Tensor & self, Tensor & out); // {"schema": "aten::linalg_eigvals.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+::std::tuple _linalg_eigh(const Tensor & A, c10::string_view UPLO, bool compute_v); // {"schema": "aten::_linalg_eigh(Tensor A, str UPLO=\"L\", bool compute_v=True) -> (Tensor eigenvalues, Tensor eigenvectors)", "dispatch": "True", "default": "True"}
+::std::tuple _linalg_eigh_out(const Tensor & A, c10::string_view UPLO, bool compute_v, Tensor & eigenvalues, Tensor & eigenvectors); // {"schema": "aten::_linalg_eigh.eigenvalues(Tensor A, str UPLO=\"L\", bool compute_v=True, *, Tensor(a!) eigenvalues, Tensor(b!) eigenvectors) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors)", "dispatch": "True", "default": "False"}
+::std::tuple linalg_eigh(const Tensor & self, c10::string_view UPLO); // {"schema": "aten::linalg_eigh(Tensor self, str UPLO=\"L\") -> (Tensor eigenvalues, Tensor eigenvectors)", "dispatch": "False", "default": "True"}
+::std::tuple linalg_eigh_out(const Tensor & self, c10::string_view UPLO, Tensor & eigvals, Tensor & eigvecs); // {"schema": "aten::linalg_eigh.eigvals(Tensor self, str UPLO=\"L\", *, Tensor(a!) eigvals, Tensor(b!) eigvecs) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors)", "dispatch": "False", "default": "True"}
+Tensor linalg_eigvalsh(const Tensor & self, c10::string_view UPLO); // {"schema": "aten::linalg_eigvalsh(Tensor self, str UPLO=\"L\") -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & linalg_eigvalsh_out(const Tensor & self, c10::string_view UPLO, Tensor & out); // {"schema": "aten::linalg_eigvalsh.out(Tensor self, str UPLO=\"L\", *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
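+// [Editor's note: illustrative sketch, not part of the generated header.]
+// linalg_eigh / linalg_eigvalsh above expect a symmetric (or Hermitian) input and
+// only read the triangle selected by UPLO; assuming the at:: bindings:
+//
+//   at::Tensor M = at::randn({5, 5});
+//   at::Tensor S = (M + M.t()) / 2;             // symmetrize
+//   auto [w, V] = at::linalg_eigh(S);           // UPLO="L" by default
+//   at::Tensor w_only = at::linalg_eigvalsh(S); // eigenvalues only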
+Tensor linalg_householder_product(const Tensor & input, const Tensor & tau); // {"schema": "aten::linalg_householder_product(Tensor input, Tensor tau) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & linalg_householder_product_out(const Tensor & input, const Tensor & tau, Tensor & out); // {"schema": "aten::linalg_householder_product.out(Tensor input, Tensor tau, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+::std::tuple linalg_inv_ex(const Tensor & A, bool check_errors); // {"schema": "aten::linalg_inv_ex(Tensor A, *, bool check_errors=False) -> (Tensor inverse, Tensor info)", "dispatch": "True", "default": "True"}
+::std::tuple linalg_inv_ex_out(const Tensor & A, bool check_errors, Tensor & inverse, Tensor & info); // {"schema": "aten::linalg_inv_ex.inverse(Tensor A, *, bool check_errors=False, Tensor(a!) inverse, Tensor(b!) info) -> (Tensor(a!) inverse, Tensor(b!) info)", "dispatch": "True", "default": "False"}
+Tensor linalg_inv(const Tensor & A); // {"schema": "aten::linalg_inv(Tensor A) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & linalg_inv_out(const Tensor & A, Tensor & out); // {"schema": "aten::linalg_inv.out(Tensor A, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor inverse(const Tensor & self); // {"schema": "aten::inverse(Tensor self) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & inverse_out(const Tensor & self, Tensor & out); // {"schema": "aten::inverse.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor inner(const Tensor & self, const Tensor & other); // {"schema": "aten::inner(Tensor self, Tensor other) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & inner_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::inner.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor outer(const Tensor & self, const Tensor & vec2); // {"schema": "aten::outer(Tensor self, Tensor vec2) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & outer_out(const Tensor & self, const Tensor & vec2, Tensor & out); // {"schema": "aten::outer.out(Tensor self, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor ger(const Tensor & self, const Tensor & vec2); // {"schema": "aten::ger(Tensor self, Tensor vec2) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & ger_out(const Tensor & self, const Tensor & vec2, Tensor & out); // {"schema": "aten::ger.out(Tensor self, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor linalg_norm(const Tensor & self, const c10::optional & ord, OptionalIntArrayRef dim, bool keepdim, c10::optional dtype); // {"schema": "aten::linalg_norm(Tensor self, Scalar? ord=None, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor linalg_norm(const Tensor & self, c10::string_view ord, OptionalIntArrayRef dim, bool keepdim, c10::optional dtype); // {"schema": "aten::linalg_norm.ord_str(Tensor self, str ord, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & linalg_norm_out(const Tensor & self, const c10::optional & ord, OptionalIntArrayRef dim, bool keepdim, c10::optional dtype, Tensor & out); // {"schema": "aten::linalg_norm.out(Tensor self, Scalar? ord=None, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & linalg_norm_out(const Tensor & self, c10::string_view ord, OptionalIntArrayRef dim, bool keepdim, c10::optional dtype, Tensor & out); // {"schema": "aten::linalg_norm.ord_str_out(Tensor self, str ord, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor linalg_vector_norm(const Tensor & self, const Scalar & ord, OptionalIntArrayRef dim, bool keepdim, c10::optional dtype); // {"schema": "aten::linalg_vector_norm(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & linalg_vector_norm_out(const Tensor & self, const Scalar & ord, OptionalIntArrayRef dim, bool keepdim, c10::optional dtype, Tensor & out); // {"schema": "aten::linalg_vector_norm.out(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor linalg_matrix_norm(const Tensor & self, const Scalar & ord, IntArrayRef dim, bool keepdim, c10::optional dtype); // {"schema": "aten::linalg_matrix_norm(Tensor self, Scalar ord, int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & linalg_matrix_norm_out(const Tensor & self, const Scalar & ord, IntArrayRef dim, bool keepdim, c10::optional dtype, Tensor & out); // {"schema": "aten::linalg_matrix_norm.out(Tensor self, Scalar ord, int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor linalg_matrix_norm(const Tensor & self, c10::string_view ord, IntArrayRef dim, bool keepdim, c10::optional dtype); // {"schema": "aten::linalg_matrix_norm.str_ord(Tensor self, str ord='fro', int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & linalg_matrix_norm_out(const Tensor & self, c10::string_view ord, IntArrayRef dim, bool keepdim, c10::optional dtype, Tensor & out); // {"schema": "aten::linalg_matrix_norm.str_ord_out(Tensor self, str ord='fro', int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+::std::tuple _linalg_svd(const Tensor & A, bool full_matrices, bool compute_uv, c10::optional driver); // {"schema": "aten::_linalg_svd(Tensor A, bool full_matrices=False, bool compute_uv=True, *, str? driver=None) -> (Tensor U, Tensor S, Tensor Vh)", "dispatch": "True", "default": "True"}
+::std::tuple _linalg_svd_out(const Tensor & A, bool full_matrices, bool compute_uv, c10::optional driver, Tensor & U, Tensor & S, Tensor & Vh); // {"schema": "aten::_linalg_svd.U(Tensor A, bool full_matrices=False, bool compute_uv=True, *, str? driver=None, Tensor(a!) U, Tensor(b!) S, Tensor(c!) Vh) -> (Tensor(a!) U, Tensor(b!) S, Tensor(c!) Vh)", "dispatch": "True", "default": "False"}
+::std::tuple linalg_svd(const Tensor & A, bool full_matrices, c10::optional driver); // {"schema": "aten::linalg_svd(Tensor A, bool full_matrices=True, *, str? driver=None) -> (Tensor U, Tensor S, Tensor Vh)", "dispatch": "False", "default": "True"}
+::std::tuple linalg_svd_out(const Tensor & A, bool full_matrices, c10::optional driver, Tensor & U, Tensor & S, Tensor & Vh); // {"schema": "aten::linalg_svd.U(Tensor A, bool full_matrices=True, *, str? driver=None, Tensor(a!) U, Tensor(b!) S, Tensor(c!) Vh) -> (Tensor(a!) U, Tensor(b!) S, Tensor(c!) Vh)", "dispatch": "False", "default": "True"}
+Tensor linalg_svdvals(const Tensor & A, c10::optional driver); // {"schema": "aten::linalg_svdvals(Tensor A, *, str? driver=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & linalg_svdvals_out(const Tensor & A, c10::optional driver, Tensor & out); // {"schema": "aten::linalg_svdvals.out(Tensor A, *, str? driver=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
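+// [Editor's note: illustrative sketch, not part of the generated header.]
+// linalg_svd defaults to full_matrices=True; the reduced factorization is the
+// common choice. Assuming the at:: bindings:
+//
+//   at::Tensor A = at::randn({6, 4});
+//   auto [U, S, Vh] = at::linalg_svd(A, /*full_matrices=*/false);
+//   at::Tensor sv = at::linalg_svdvals(A);  // singular values only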
+Tensor linalg_cond(const Tensor & self, const c10::optional & p); // {"schema": "aten::linalg_cond(Tensor self, Scalar? p=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & linalg_cond_out(const Tensor & self, const c10::optional & p, Tensor & out); // {"schema": "aten::linalg_cond.out(Tensor self, Scalar? p=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor linalg_cond(const Tensor & self, c10::string_view p); // {"schema": "aten::linalg_cond.p_str(Tensor self, str p) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & linalg_cond_out(const Tensor & self, c10::string_view p, Tensor & out); // {"schema": "aten::linalg_cond.p_str_out(Tensor self, str p, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor linalg_pinv(const Tensor & self, const c10::optional & atol, const c10::optional & rtol, bool hermitian); // {"schema": "aten::linalg_pinv.atol_rtol_tensor(Tensor self, *, Tensor? atol=None, Tensor? rtol=None, bool hermitian=False) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & linalg_pinv_out(const Tensor & self, const c10::optional & atol, const c10::optional & rtol, bool hermitian, Tensor & out); // {"schema": "aten::linalg_pinv.atol_rtol_tensor_out(Tensor self, *, Tensor? atol=None, Tensor? rtol=None, bool hermitian=False, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor linalg_pinv(const Tensor & self, c10::optional atol, c10::optional rtol, bool hermitian); // {"schema": "aten::linalg_pinv.atol_rtol_float(Tensor self, *, float? atol=None, float? rtol=None, bool hermitian=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & linalg_pinv_out(const Tensor & self, c10::optional atol, c10::optional rtol, bool hermitian, Tensor & out); // {"schema": "aten::linalg_pinv.atol_rtol_float_out(Tensor self, *, float? atol=None, float? rtol=None, bool hermitian=False, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor linalg_pinv(const Tensor & self, double rcond, bool hermitian); // {"schema": "aten::linalg_pinv(Tensor self, float rcond, bool hermitian=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor linalg_pinv(const Tensor & self, const Tensor & rcond, bool hermitian); // {"schema": "aten::linalg_pinv.rcond_tensor(Tensor self, Tensor rcond, bool hermitian=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & linalg_pinv_out(const Tensor & self, double rcond, bool hermitian, Tensor & out); // {"schema": "aten::linalg_pinv.out(Tensor self, float rcond, bool hermitian=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor & linalg_pinv_out(const Tensor & self, const Tensor & rcond, bool hermitian, Tensor & out); // {"schema": "aten::linalg_pinv.out_rcond_tensor(Tensor self, Tensor rcond, bool hermitian=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+::std::tuple _linalg_solve_ex(const Tensor & A, const Tensor & B, bool left, bool check_errors); // {"schema": "aten::_linalg_solve_ex(Tensor A, Tensor B, *, bool left=True, bool check_errors=False) -> (Tensor result, Tensor LU, Tensor pivots, Tensor info)", "dispatch": "True", "default": "True"}
+::std::tuple _linalg_solve_ex_out(const Tensor & A, const Tensor & B, bool left, bool check_errors, Tensor & result, Tensor & LU, Tensor & pivots, Tensor & info); // {"schema": "aten::_linalg_solve_ex.result(Tensor A, Tensor B, *, bool left=True, bool check_errors=False, Tensor(a!) result, Tensor(b!) LU, Tensor(c!) pivots, Tensor(d!) info) -> (Tensor(a!) result, Tensor(b!) LU, Tensor(c!) pivots, Tensor(d!) info)", "dispatch": "True", "default": "False"}
+::std::tuple linalg_solve_ex(const Tensor & A, const Tensor & B, bool left, bool check_errors); // {"schema": "aten::linalg_solve_ex(Tensor A, Tensor B, *, bool left=True, bool check_errors=False) -> (Tensor result, Tensor info)", "dispatch": "False", "default": "True"}
+::std::tuple linalg_solve_ex_out(const Tensor & A, const Tensor & B, bool left, bool check_errors, Tensor & result, Tensor & info); // {"schema": "aten::linalg_solve_ex.out(Tensor A, Tensor B, *, bool left=True, bool check_errors=False, Tensor(a!) result, Tensor(b!) info) -> (Tensor(a!) result, Tensor(b!) info)", "dispatch": "False", "default": "True"}
+Tensor linalg_solve(const Tensor & A, const Tensor & B, bool left); // {"schema": "aten::linalg_solve(Tensor A, Tensor B, *, bool left=True) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & linalg_solve_out(const Tensor & A, const Tensor & B, bool left, Tensor & out); // {"schema": "aten::linalg_solve.out(Tensor A, Tensor B, *, bool left=True, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
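+// [Editor's note: illustrative sketch, not part of the generated header.]
+// linalg_solve solves A X = B (or X A = B with left=false) and, unlike
+// linalg_solve_ex, throws on a singular A. Assuming the at:: bindings:
+//
+//   at::Tensor A = at::randn({3, 3});
+//   at::Tensor b = at::randn({3});
+//   at::Tensor x = at::linalg_solve(A, b);        // left=true by default
+//   auto [x2, info] = at::linalg_solve_ex(A, b);  // nonzero info flags a failed factorization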
+Tensor linalg_tensorinv(const Tensor & self, int64_t ind); // {"schema": "aten::linalg_tensorinv(Tensor self, int ind=2) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & linalg_tensorinv_out(const Tensor & self, int64_t ind, Tensor & out); // {"schema": "aten::linalg_tensorinv.out(Tensor self, int ind=2, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor linalg_tensorsolve(const Tensor & self, const Tensor & other, OptionalIntArrayRef dims); // {"schema": "aten::linalg_tensorsolve(Tensor self, Tensor other, int[]? dims=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & linalg_tensorsolve_out(const Tensor & self, const Tensor & other, OptionalIntArrayRef dims, Tensor & out); // {"schema": "aten::linalg_tensorsolve.out(Tensor self, Tensor other, int[]? dims=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+::std::tuple linalg_qr(const Tensor & A, c10::string_view mode); // {"schema": "aten::linalg_qr(Tensor A, str mode='reduced') -> (Tensor Q, Tensor R)", "dispatch": "True", "default": "True"}
+::std::tuple linalg_qr_out(const Tensor & A, c10::string_view mode, Tensor & Q, Tensor & R); // {"schema": "aten::linalg_qr.out(Tensor A, str mode='reduced', *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R)", "dispatch": "True", "default": "False"}
+Tensor linalg_matrix_power(const Tensor & self, int64_t n); // {"schema": "aten::linalg_matrix_power(Tensor self, int n) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & linalg_matrix_power_out(const Tensor & self, int64_t n, Tensor & out); // {"schema": "aten::linalg_matrix_power.out(Tensor self, int n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor linalg_matrix_rank(const Tensor & input, const c10::optional & atol, const c10::optional & rtol, bool hermitian); // {"schema": "aten::linalg_matrix_rank.atol_rtol_tensor(Tensor input, *, Tensor? atol=None, Tensor? rtol=None, bool hermitian=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & linalg_matrix_rank_out(const Tensor & input, const c10::optional & atol, const c10::optional & rtol, bool hermitian, Tensor & out); // {"schema": "aten::linalg_matrix_rank.atol_rtol_tensor_out(Tensor input, *, Tensor? atol=None, Tensor? rtol=None, bool hermitian=False, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor linalg_matrix_rank(const Tensor & self, c10::optional atol, c10::optional rtol, bool hermitian); // {"schema": "aten::linalg_matrix_rank.atol_rtol_float(Tensor self, *, float? atol=None, float? rtol=None, bool hermitian=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & linalg_matrix_rank_out(const Tensor & self, c10::optional atol, c10::optional rtol, bool hermitian, Tensor & out); // {"schema": "aten::linalg_matrix_rank.atol_rtol_float_out(Tensor self, *, float? atol=None, float? rtol=None, bool hermitian=False, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor linalg_matrix_rank(const Tensor & self, double tol, bool hermitian); // {"schema": "aten::linalg_matrix_rank(Tensor self, float tol, bool hermitian=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & linalg_matrix_rank_out(const Tensor & self, double tol, bool hermitian, Tensor & out); // {"schema": "aten::linalg_matrix_rank.out(Tensor self, float tol, bool hermitian=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor linalg_matrix_rank(const Tensor & input, const Tensor & tol, bool hermitian); // {"schema": "aten::linalg_matrix_rank.tol_tensor(Tensor input, Tensor tol, bool hermitian=False) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & linalg_matrix_rank_out(const Tensor & input, const Tensor & tol, bool hermitian, Tensor & out); // {"schema": "aten::linalg_matrix_rank.out_tol_tensor(Tensor input, Tensor tol, bool hermitian=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor linalg_multi_dot(TensorList tensors); // {"schema": "aten::linalg_multi_dot(Tensor[] tensors) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor & linalg_multi_dot_out(TensorList tensors, Tensor & out); // {"schema": "aten::linalg_multi_dot.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "False", "default": "True"}
+Tensor nested_to_padded_tensor(const Tensor & self, double padding, OptionalIntArrayRef output_size); // {"schema": "aten::nested_to_padded_tensor(Tensor self, float padding, int[]? output_size=None) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _test_serialization_subcmul(const Tensor & self, const Tensor & other, const Scalar & alpha); // {"schema": "aten::_test_serialization_subcmul(Tensor self, Tensor other, Scalar alpha=1) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _test_parallel_materialize(const Tensor & self, int64_t num_parallel, bool skip_first); // {"schema": "aten::_test_parallel_materialize(Tensor self, int num_parallel, bool skip_first=False) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _test_optional_intlist(const Tensor & values, OptionalIntArrayRef addends); // {"schema": "aten::_test_optional_intlist(Tensor values, int[]? addends) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _test_optional_filled_intlist(const Tensor & values, OptionalIntArrayRef addends); // {"schema": "aten::_test_optional_filled_intlist(Tensor values, int[2]? addends) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _test_optional_floatlist(const Tensor & values, c10::optional<ArrayRef<double>> addends); // {"schema": "aten::_test_optional_floatlist(Tensor values, float[]? addends) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _test_string_default(const Tensor & dummy, c10::string_view a, c10::string_view b); // {"schema": "aten::_test_string_default(Tensor dummy, str a=\"\\\"'\\\\\", str b='\"\\'\\\\') -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _test_ambiguous_defaults(const Tensor & dummy, int64_t a, int64_t b); // {"schema": "aten::_test_ambiguous_defaults.a(Tensor dummy, int a=1, int b=1) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _test_ambiguous_defaults(const Tensor & dummy, int64_t a, c10::string_view b); // {"schema": "aten::_test_ambiguous_defaults.b(Tensor dummy, int a=2, str b=\"2\") -> Tensor", "dispatch": "False", "default": "True"}
+Tensor _test_warn_in_autograd(const Tensor & self); // {"schema": "aten::_test_warn_in_autograd(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _test_autograd_multiple_dispatch(const Tensor & self); // {"schema": "aten::_test_autograd_multiple_dispatch.fullcoverage(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _test_autograd_multiple_dispatch(const Tensor & self, bool b); // {"schema": "aten::_test_autograd_multiple_dispatch.ntonly(Tensor self, bool b) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _test_autograd_multiple_dispatch_view(const Tensor & self); // {"schema": "aten::_test_autograd_multiple_dispatch_view(Tensor(a) self) -> Tensor(a)", "dispatch": "True", "default": "True"}
+Tensor _test_autograd_multiple_dispatch_view_copy(const Tensor & self); // {"schema": "aten::_test_autograd_multiple_dispatch_view_copy(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor segment_reduce(const Tensor & data, c10::string_view reduce, const c10::optional & lengths, const c10::optional & indices, const c10::optional & offsets, int64_t axis, bool unsafe, const c10::optional & initial); // {"schema": "aten::segment_reduce(Tensor data, str reduce, *, Tensor? lengths=None, Tensor? indices=None, Tensor? offsets=None, int axis=0, bool unsafe=False, Scalar? initial=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _segment_reduce_backward(const Tensor & grad, const Tensor & output, const Tensor & data, c10::string_view reduce, const c10::optional & lengths, const c10::optional & offsets, int64_t axis, const c10::optional & initial); // {"schema": "aten::_segment_reduce_backward(Tensor grad, Tensor output, Tensor data, str reduce, *, Tensor? lengths=None, Tensor? offsets=None, int axis=0, Scalar? initial=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor pad_sequence(TensorList sequences, bool batch_first, double padding_value); // {"schema": "aten::pad_sequence(Tensor[] sequences, bool batch_first=False, float padding_value=0.0) -> Tensor", "dispatch": "False", "default": "True"}
+Tensor flatten_dense_tensors(TensorList tensors); // {"schema": "aten::flatten_dense_tensors(Tensor[] tensors) -> Tensor", "dispatch": "False", "default": "True"}
+::std::vector unflatten_dense_tensors(const Tensor & flat, TensorList tensors); // {"schema": "aten::unflatten_dense_tensors(Tensor flat, Tensor[] tensors) -> Tensor[]", "dispatch": "False", "default": "True"}
+Tensor _nested_tensor_from_tensor_list(TensorList list, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory); // {"schema": "aten::_nested_tensor_from_tensor_list(Tensor[] list, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _fw_primal_copy(const Tensor & self, int64_t level); // {"schema": "aten::_fw_primal_copy(Tensor self, int level) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _make_dual_copy(const Tensor & primal, const Tensor & tangent, int64_t level); // {"schema": "aten::_make_dual_copy(Tensor primal, Tensor tangent, int level) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor view_as_real_copy(const Tensor & self); // {"schema": "aten::view_as_real_copy(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor view_as_complex_copy(const Tensor & self); // {"schema": "aten::view_as_complex_copy(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _conj_copy(const Tensor & self); // {"schema": "aten::_conj_copy(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _neg_view_copy(const Tensor & self); // {"schema": "aten::_neg_view_copy(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor as_strided_copy(const Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional storage_offset); // {"schema": "aten::as_strided_copy(Tensor self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _sparse_broadcast_to_copy(const Tensor & self, IntArrayRef size); // {"schema": "aten::_sparse_broadcast_to_copy(Tensor self, int[] size) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor diagonal_copy(const Tensor & self, int64_t offset, int64_t dim1, int64_t dim2); // {"schema": "aten::diagonal_copy(Tensor self, int offset=0, int dim1=0, int dim2=1) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor expand_copy(const Tensor & self, c10::SymIntArrayRef size, bool implicit); // {"schema": "aten::expand_copy(Tensor self, SymInt[] size, *, bool implicit=False) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor permute_copy(const Tensor & self, IntArrayRef dims); // {"schema": "aten::permute_copy(Tensor self, int[] dims) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _reshape_alias_copy(const Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride); // {"schema": "aten::_reshape_alias_copy(Tensor self, SymInt[] size, SymInt[] stride) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor select_copy(const Tensor & self, int64_t dim, c10::SymInt index); // {"schema": "aten::select_copy.int(Tensor self, int dim, SymInt index) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor detach_copy(const Tensor & self); // {"schema": "aten::detach_copy(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor slice_copy(const Tensor & self, int64_t dim, c10::optional start, c10::optional end, c10::SymInt step); // {"schema": "aten::slice_copy.Tensor(Tensor self, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor", "dispatch": "True", "default": "True"}
+::std::vector split_copy(const Tensor & self, c10::SymInt split_size, int64_t dim); // {"schema": "aten::split_copy.Tensor(Tensor self, SymInt split_size, int dim=0) -> Tensor[]", "dispatch": "True", "default": "True"}
+::std::vector split_with_sizes_copy(const Tensor & self, c10::SymIntArrayRef split_sizes, int64_t dim); // {"schema": "aten::split_with_sizes_copy(Tensor self, SymInt[] split_sizes, int dim=0) -> Tensor[]", "dispatch": "True", "default": "True"}
+Tensor squeeze_copy(const Tensor & self); // {"schema": "aten::squeeze_copy(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor squeeze_copy(const Tensor & self, int64_t dim); // {"schema": "aten::squeeze_copy.dim(Tensor self, int dim) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor squeeze_copy(const Tensor & self, IntArrayRef dim); // {"schema": "aten::squeeze_copy.dims(Tensor self, int[] dim) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor t_copy(const Tensor & self); // {"schema": "aten::t_copy(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor transpose_copy(const Tensor & self, int64_t dim0, int64_t dim1); // {"schema": "aten::transpose_copy.int(Tensor self, int dim0, int dim1) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor unsqueeze_copy(const Tensor & self, int64_t dim); // {"schema": "aten::unsqueeze_copy(Tensor self, int dim) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _indices_copy(const Tensor & self); // {"schema": "aten::_indices_copy(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor _values_copy(const Tensor & self); // {"schema": "aten::_values_copy(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor indices_copy(const Tensor & self); // {"schema": "aten::indices_copy(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor values_copy(const Tensor & self); // {"schema": "aten::values_copy(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor crow_indices_copy(const Tensor & self); // {"schema": "aten::crow_indices_copy(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor col_indices_copy(const Tensor & self); // {"schema": "aten::col_indices_copy(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor ccol_indices_copy(const Tensor & self); // {"schema": "aten::ccol_indices_copy(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor row_indices_copy(const Tensor & self); // {"schema": "aten::row_indices_copy(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+::std::vector unbind_copy(const Tensor & self, int64_t dim); // {"schema": "aten::unbind_copy.int(Tensor self, int dim=0) -> Tensor[]", "dispatch": "True", "default": "True"}
+void unbind_copy_out(const Tensor & self, int64_t dim, TensorList out); // {"schema": "aten::unbind_copy.int_out(Tensor self, int dim=0, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void split_copy_out(const Tensor & self, c10::SymInt split_size, int64_t dim, TensorList out); // {"schema": "aten::split_copy.Tensor_out(Tensor self, SymInt split_size, int dim=0, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void split_with_sizes_copy_out(const Tensor & self, c10::SymIntArrayRef split_sizes, int64_t dim, TensorList out); // {"schema": "aten::split_with_sizes_copy.out(Tensor self, SymInt[] split_sizes, int dim=0, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+Tensor view_copy(const Tensor & self, c10::SymIntArrayRef size); // {"schema": "aten::view_copy(Tensor self, SymInt[] size) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor view_copy(const Tensor & self, ScalarType dtype); // {"schema": "aten::view_copy.dtype(Tensor self, ScalarType dtype) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor unfold_copy(const Tensor & self, int64_t dimension, int64_t size, int64_t step); // {"schema": "aten::unfold_copy(Tensor self, int dimension, int size, int step) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor alias_copy(const Tensor & self); // {"schema": "aten::alias_copy(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
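+// [Editor's note: illustrative sketch, not part of the generated header.] The
+// *_copy ops above are the non-aliasing counterparts of the usual view ops: they
+// return fresh tensors instead of views, which functionalized/export paths rely
+// on. Assuming the at:: bindings:
+//
+//   at::Tensor A = at::randn({4, 4});
+//   at::Tensor d = at::diagonal_copy(A);  // owns its storage, unlike A.diagonal()
+//   at::Tensor t = at::t_copy(A);         // transposed copy, not a view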
+Tensor to_padded_tensor(const Tensor & self, double padding, OptionalSymIntArrayRef output_size); // {"schema": "aten::to_padded_tensor(Tensor self, float padding, SymInt[]? output_size=None) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _nested_tensor_softmax_with_shape(const Tensor & self, const Tensor & query); // {"schema": "aten::_nested_tensor_softmax_with_shape(Tensor self, Tensor query) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor _transformer_encoder_layer_fwd(const Tensor & src, int64_t embed_dim, int64_t num_heads, const Tensor & qkv_weight, const Tensor & qkv_bias, const Tensor & proj_weight, const Tensor & proj_bias, bool use_gelu, bool norm_first, double eps, const Tensor & norm_weight_1, const Tensor & norm_bias_1, const Tensor & norm_weight_2, const Tensor & norm_bias_2, const Tensor & ffn_weight_1, const Tensor & ffn_bias_1, const Tensor & ffn_weight_2, const Tensor & ffn_bias_2, const c10::optional & mask, c10::optional mask_type); // {"schema": "aten::_transformer_encoder_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, int? mask_type=None) -> Tensor", "dispatch": "True", "default": "False"}
+::std::tuple _native_multi_head_attention(const Tensor & query, const Tensor & key, const Tensor & value, int64_t embed_dim, int64_t num_head, const Tensor & qkv_weight, const Tensor & qkv_bias, const Tensor & proj_weight, const Tensor & proj_bias, const c10::optional & mask, bool need_weights, bool average_attn_weights, c10::optional mask_type); // {"schema": "aten::_native_multi_head_attention(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None, bool need_weights=True, bool average_attn_weights=True, int? mask_type=None) -> (Tensor, Tensor)", "dispatch": "True", "default": "False"}
+Tensor scaled_dot_product_attention(const Tensor & query, const Tensor & key, const Tensor & value, const c10::optional & attn_mask, double dropout_p, bool is_causal, c10::optional scale); // {"schema": "aten::scaled_dot_product_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> Tensor", "dispatch": "False", "default": "True"}
+int64_t _fused_sdp_choice(const Tensor & query, const Tensor & key, const Tensor & value, const c10::optional & attn_mask, double dropout_p, bool is_causal, c10::optional scale); // {"schema": "aten::_fused_sdp_choice(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> int", "dispatch": "True", "default": "False"}
+::std::tuple _scaled_dot_product_attention_math(const Tensor & query, const Tensor & key, const Tensor & value, const c10::optional & attn_mask, double dropout_p, bool is_causal, const c10::optional & dropout_mask, c10::optional scale); // {"schema": "aten::_scaled_dot_product_attention_math(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, Tensor? dropout_mask=None, *, float? scale=None) -> (Tensor, Tensor)", "dispatch": "False", "default": "True"}
+::std::tuple _scaled_dot_product_flash_attention(const Tensor & query, const Tensor & key, const Tensor & value, double dropout_p, bool is_causal, bool return_debug_mask, c10::optional scale); // {"schema": "aten::_scaled_dot_product_flash_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)", "dispatch": "True", "default": "False"}
+::std::tuple _scaled_dot_product_flash_attention_for_cpu(const Tensor & query, const Tensor & key, const Tensor & value, double dropout_p, bool is_causal, const c10::optional & attn_mask, c10::optional scale); // {"schema": "aten::_scaled_dot_product_flash_attention_for_cpu(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, *, Tensor? attn_mask=None, float? scale=None) -> (Tensor output, Tensor logsumexp)", "dispatch": "True", "default": "False"}
+::std::tuple _scaled_dot_product_flash_attention_backward(const Tensor & grad_out, const Tensor & query, const Tensor & key, const Tensor & value, const Tensor & out, const Tensor & logsumexp, const Tensor & cum_seq_q, const Tensor & cum_seq_k, c10::SymInt max_q, c10::SymInt max_k, double dropout_p, bool is_causal, const Tensor & philox_seed, const Tensor & philox_offset, c10::optional scale); // {"schema": "aten::_scaled_dot_product_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value)", "dispatch": "True", "default": "False"}
+::std::tuple _scaled_dot_product_flash_attention_for_cpu_backward(const Tensor & grad_out, const Tensor & query, const Tensor & key, const Tensor & value, const Tensor & out, const Tensor & logsumexp, double dropout_p, bool is_causal, const c10::optional & attn_mask, c10::optional scale); // {"schema": "aten::_scaled_dot_product_flash_attention_for_cpu_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, float dropout_p, bool is_causal, *, Tensor? attn_mask=None, float? scale=None) -> (Tensor grad_query, Tensor grad_key, Tensor grad_value)", "dispatch": "True", "default": "False"}
+::std::tuple _scaled_dot_product_efficient_attention(const Tensor & query, const Tensor & key, const Tensor & value, const c10::optional & attn_bias, bool compute_log_sumexp, double dropout_p, bool is_causal, c10::optional scale); // {"schema": "aten::_scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> (Tensor output, Tensor log_sumexp, Tensor philox_seed, Tensor philox_offset)", "dispatch": "True", "default": "False"}
+::std::tuple _scaled_dot_product_efficient_attention_backward(const Tensor & grad_out_, const Tensor & query, const Tensor & key, const Tensor & value, const Tensor & attn_bias, const Tensor & out, const Tensor & logsumexp, const Tensor & philox_seed, const Tensor & philox_offset, double dropout_p, ::std::array grad_input_mask, bool is_causal, c10::optional scale); // {"schema": "aten::_scaled_dot_product_efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor attn_bias, Tensor out, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, float dropout_p, bool[4] grad_input_mask, bool is_causal=False, *, float? scale=None) -> (Tensor, Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple _scaled_dot_product_cudnn_attention(const Tensor & query, const Tensor & key, const Tensor & value, double dropout_p, bool is_causal, bool return_debug_mask, c10::optional scale); // {"schema": "aten::_scaled_dot_product_cudnn_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset)", "dispatch": "True", "default": "False"}
+::std::tuple _flash_attention_forward(const Tensor & query, const Tensor & key, const Tensor & value, const c10::optional & cum_seq_q, const c10::optional & cum_seq_k, c10::SymInt max_q, c10::SymInt max_k, double dropout_p, bool is_causal, bool return_debug_mask, c10::optional scale); // {"schema": "aten::_flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)", "dispatch": "True", "default": "False"}
+::std::tuple _flash_attention_backward(const Tensor & grad_out, const Tensor & query, const Tensor & key, const Tensor & value, const Tensor & out, const Tensor & logsumexp, const Tensor & cum_seq_q, const Tensor & cum_seq_k, c10::SymInt max_q, c10::SymInt max_k, double dropout_p, bool is_causal, const Tensor & philox_seed, const Tensor & philox_offset, c10::optional scale); // {"schema": "aten::_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+::std::tuple _efficient_attention_forward(const Tensor & query, const Tensor & key, const Tensor & value, const c10::optional & bias, const c10::optional & cu_seqlens_q, const c10::optional & cu_seqlens_k, c10::optional max_seqlen_q, c10::optional max_seqlen_k, double dropout_p, int64_t custom_mask_type, bool compute_log_sumexp, c10::optional scale, const c10::optional & causal_diagonal, const c10::optional & seqlen_k); // {"schema": "aten::_efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, int? max_seqlen_k, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? causal_diagonal=None, Tensor? seqlen_k=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, SymInt max_seqlen_batch_q, SymInt max_seqlen_batch_k)", "dispatch": "True", "default": "False"}
+::std::tuple _efficient_attention_backward(const Tensor & grad_out_, const Tensor & query, const Tensor & key, const Tensor & value, const c10::optional & bias, const Tensor & out, const c10::optional & cu_seqlens_q, const c10::optional & cu_seqlens_k, c10::SymInt max_seqlen_q, c10::SymInt max_seqlen_k, const Tensor & logsumexp, double dropout_p, const Tensor & philox_seed, const Tensor & philox_offset, int64_t custom_mask_type, bool bias_requires_grad, c10::optional scale, c10::optional num_splits_key); // {"schema": "aten::_efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor out, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt max_seqlen_q, SymInt max_seqlen_k, Tensor logsumexp, float dropout_p, Tensor philox_seed, Tensor philox_offset, int custom_mask_type, bool bias_requires_grad, *, float? scale=None, int? num_splits_key=None) -> (Tensor, Tensor, Tensor, Tensor)", "dispatch": "True", "default": "False"}
+Tensor _triton_scaled_dot_attention(const Tensor & q, const Tensor & k, const Tensor & v, double dropout_p); // {"schema": "aten::_triton_scaled_dot_attention(Tensor q, Tensor k, Tensor v, float dropout_p=0.0) -> Tensor", "dispatch": "True", "default": "False"}
+Tensor & _fill_mem_eff_dropout_mask_(Tensor & self, double dropout_p, int64_t seed, int64_t offset); // {"schema": "aten::_fill_mem_eff_dropout_mask_(Tensor(a!) self, float dropout_p, int seed, int offset) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor _triton_multi_head_attention(const Tensor & query, const Tensor & key, const Tensor & value, int64_t embed_dim, int64_t num_head, const Tensor & qkv_weight, const Tensor & qkv_bias, const Tensor & proj_weight, const Tensor & proj_bias, const c10::optional & mask); // {"schema": "aten::_triton_multi_head_attention(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None) -> Tensor", "dispatch": "True", "default": "False"}
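+// [Editor's note: illustrative sketch, not part of the generated header.] Of the
+// attention ops above, scaled_dot_product_attention is the public entry point;
+// the _scaled_dot_product_* / _flash_attention_* / _efficient_attention_* kernels
+// are backend-selected implementations. Assuming the at:: bindings:
+//
+//   at::Tensor q = at::randn({2, 8, 128, 64});  // (batch, heads, seq, head_dim)
+//   at::Tensor k = at::randn({2, 8, 128, 64});
+//   at::Tensor v = at::randn({2, 8, 128, 64});
+//   at::Tensor o = at::scaled_dot_product_attention(q, k, v);  // no mask, dropout_p=0, is_causal=false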
+Tensor special_airy_ai(const Tensor & x); // {"schema": "aten::special_airy_ai(Tensor x) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_airy_ai_out(const Tensor & x, Tensor & out); // {"schema": "aten::special_airy_ai.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor special_bessel_j0(const Tensor & self); // {"schema": "aten::special_bessel_j0(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_bessel_j0_out(const Tensor & self, Tensor & out); // {"schema": "aten::special_bessel_j0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor special_bessel_j1(const Tensor & self); // {"schema": "aten::special_bessel_j1(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_bessel_j1_out(const Tensor & self, Tensor & out); // {"schema": "aten::special_bessel_j1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor special_bessel_y0(const Tensor & self); // {"schema": "aten::special_bessel_y0(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_bessel_y0_out(const Tensor & self, Tensor & out); // {"schema": "aten::special_bessel_y0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor special_bessel_y1(const Tensor & self); // {"schema": "aten::special_bessel_y1(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_bessel_y1_out(const Tensor & self, Tensor & out); // {"schema": "aten::special_bessel_y1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor special_chebyshev_polynomial_t(const Tensor & x, const Tensor & n); // {"schema": "aten::special_chebyshev_polynomial_t(Tensor x, Tensor n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor special_chebyshev_polynomial_t(const Scalar & x, const Tensor & n); // {"schema": "aten::special_chebyshev_polynomial_t.x_scalar(Scalar x, Tensor n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor special_chebyshev_polynomial_t(const Tensor & x, const Scalar & n); // {"schema": "aten::special_chebyshev_polynomial_t.n_scalar(Tensor x, Scalar n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_chebyshev_polynomial_t_out(const Tensor & x, const Tensor & n, Tensor & out); // {"schema": "aten::special_chebyshev_polynomial_t.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & special_chebyshev_polynomial_t_out(const Scalar & x, const Tensor & n, Tensor & out); // {"schema": "aten::special_chebyshev_polynomial_t.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & special_chebyshev_polynomial_t_out(const Tensor & x, const Scalar & n, Tensor & out); // {"schema": "aten::special_chebyshev_polynomial_t.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor special_chebyshev_polynomial_u(const Tensor & x, const Tensor & n); // {"schema": "aten::special_chebyshev_polynomial_u(Tensor x, Tensor n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor special_chebyshev_polynomial_u(const Scalar & x, const Tensor & n); // {"schema": "aten::special_chebyshev_polynomial_u.x_scalar(Scalar x, Tensor n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor special_chebyshev_polynomial_u(const Tensor & x, const Scalar & n); // {"schema": "aten::special_chebyshev_polynomial_u.n_scalar(Tensor x, Scalar n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_chebyshev_polynomial_u_out(const Tensor & x, const Tensor & n, Tensor & out); // {"schema": "aten::special_chebyshev_polynomial_u.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & special_chebyshev_polynomial_u_out(const Scalar & x, const Tensor & n, Tensor & out); // {"schema": "aten::special_chebyshev_polynomial_u.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & special_chebyshev_polynomial_u_out(const Tensor & x, const Scalar & n, Tensor & out); // {"schema": "aten::special_chebyshev_polynomial_u.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor special_chebyshev_polynomial_v(const Tensor & x, const Tensor & n); // {"schema": "aten::special_chebyshev_polynomial_v(Tensor x, Tensor n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor special_chebyshev_polynomial_v(const Scalar & x, const Tensor & n); // {"schema": "aten::special_chebyshev_polynomial_v.x_scalar(Scalar x, Tensor n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor special_chebyshev_polynomial_v(const Tensor & x, const Scalar & n); // {"schema": "aten::special_chebyshev_polynomial_v.n_scalar(Tensor x, Scalar n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_chebyshev_polynomial_v_out(const Tensor & x, const Tensor & n, Tensor & out); // {"schema": "aten::special_chebyshev_polynomial_v.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & special_chebyshev_polynomial_v_out(const Scalar & x, const Tensor & n, Tensor & out); // {"schema": "aten::special_chebyshev_polynomial_v.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & special_chebyshev_polynomial_v_out(const Tensor & x, const Scalar & n, Tensor & out); // {"schema": "aten::special_chebyshev_polynomial_v.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor special_chebyshev_polynomial_w(const Tensor & x, const Tensor & n); // {"schema": "aten::special_chebyshev_polynomial_w(Tensor x, Tensor n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor special_chebyshev_polynomial_w(const Scalar & x, const Tensor & n); // {"schema": "aten::special_chebyshev_polynomial_w.x_scalar(Scalar x, Tensor n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor special_chebyshev_polynomial_w(const Tensor & x, const Scalar & n); // {"schema": "aten::special_chebyshev_polynomial_w.n_scalar(Tensor x, Scalar n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_chebyshev_polynomial_w_out(const Tensor & x, const Tensor & n, Tensor & out); // {"schema": "aten::special_chebyshev_polynomial_w.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & special_chebyshev_polynomial_w_out(const Scalar & x, const Tensor & n, Tensor & out); // {"schema": "aten::special_chebyshev_polynomial_w.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & special_chebyshev_polynomial_w_out(const Tensor & x, const Scalar & n, Tensor & out); // {"schema": "aten::special_chebyshev_polynomial_w.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor special_hermite_polynomial_h(const Tensor & x, const Tensor & n); // {"schema": "aten::special_hermite_polynomial_h(Tensor x, Tensor n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor special_hermite_polynomial_h(const Scalar & x, const Tensor & n); // {"schema": "aten::special_hermite_polynomial_h.x_scalar(Scalar x, Tensor n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor special_hermite_polynomial_h(const Tensor & x, const Scalar & n); // {"schema": "aten::special_hermite_polynomial_h.n_scalar(Tensor x, Scalar n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_hermite_polynomial_h_out(const Tensor & x, const Tensor & n, Tensor & out); // {"schema": "aten::special_hermite_polynomial_h.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & special_hermite_polynomial_h_out(const Scalar & x, const Tensor & n, Tensor & out); // {"schema": "aten::special_hermite_polynomial_h.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & special_hermite_polynomial_h_out(const Tensor & x, const Scalar & n, Tensor & out); // {"schema": "aten::special_hermite_polynomial_h.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor special_hermite_polynomial_he(const Tensor & x, const Tensor & n); // {"schema": "aten::special_hermite_polynomial_he(Tensor x, Tensor n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor special_hermite_polynomial_he(const Scalar & x, const Tensor & n); // {"schema": "aten::special_hermite_polynomial_he.x_scalar(Scalar x, Tensor n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor special_hermite_polynomial_he(const Tensor & x, const Scalar & n); // {"schema": "aten::special_hermite_polynomial_he.n_scalar(Tensor x, Scalar n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_hermite_polynomial_he_out(const Tensor & x, const Tensor & n, Tensor & out); // {"schema": "aten::special_hermite_polynomial_he.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & special_hermite_polynomial_he_out(const Scalar & x, const Tensor & n, Tensor & out); // {"schema": "aten::special_hermite_polynomial_he.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & special_hermite_polynomial_he_out(const Tensor & x, const Scalar & n, Tensor & out); // {"schema": "aten::special_hermite_polynomial_he.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor special_laguerre_polynomial_l(const Tensor & x, const Tensor & n); // {"schema": "aten::special_laguerre_polynomial_l(Tensor x, Tensor n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor special_laguerre_polynomial_l(const Scalar & x, const Tensor & n); // {"schema": "aten::special_laguerre_polynomial_l.x_scalar(Scalar x, Tensor n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor special_laguerre_polynomial_l(const Tensor & x, const Scalar & n); // {"schema": "aten::special_laguerre_polynomial_l.n_scalar(Tensor x, Scalar n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_laguerre_polynomial_l_out(const Tensor & x, const Tensor & n, Tensor & out); // {"schema": "aten::special_laguerre_polynomial_l.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & special_laguerre_polynomial_l_out(const Scalar & x, const Tensor & n, Tensor & out); // {"schema": "aten::special_laguerre_polynomial_l.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & special_laguerre_polynomial_l_out(const Tensor & x, const Scalar & n, Tensor & out); // {"schema": "aten::special_laguerre_polynomial_l.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor special_legendre_polynomial_p(const Tensor & x, const Tensor & n); // {"schema": "aten::special_legendre_polynomial_p(Tensor x, Tensor n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor special_legendre_polynomial_p(const Scalar & x, const Tensor & n); // {"schema": "aten::special_legendre_polynomial_p.x_scalar(Scalar x, Tensor n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor special_legendre_polynomial_p(const Tensor & x, const Scalar & n); // {"schema": "aten::special_legendre_polynomial_p.n_scalar(Tensor x, Scalar n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_legendre_polynomial_p_out(const Tensor & x, const Tensor & n, Tensor & out); // {"schema": "aten::special_legendre_polynomial_p.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & special_legendre_polynomial_p_out(const Scalar & x, const Tensor & n, Tensor & out); // {"schema": "aten::special_legendre_polynomial_p.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & special_legendre_polynomial_p_out(const Tensor & x, const Scalar & n, Tensor & out); // {"schema": "aten::special_legendre_polynomial_p.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor special_modified_bessel_i0(const Tensor & self); // {"schema": "aten::special_modified_bessel_i0(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_modified_bessel_i0_out(const Tensor & self, Tensor & out); // {"schema": "aten::special_modified_bessel_i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor special_modified_bessel_i1(const Tensor & self); // {"schema": "aten::special_modified_bessel_i1(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_modified_bessel_i1_out(const Tensor & self, Tensor & out); // {"schema": "aten::special_modified_bessel_i1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor special_modified_bessel_k0(const Tensor & self); // {"schema": "aten::special_modified_bessel_k0(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_modified_bessel_k0_out(const Tensor & self, Tensor & out); // {"schema": "aten::special_modified_bessel_k0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor special_modified_bessel_k1(const Tensor & self); // {"schema": "aten::special_modified_bessel_k1(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_modified_bessel_k1_out(const Tensor & self, Tensor & out); // {"schema": "aten::special_modified_bessel_k1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor special_scaled_modified_bessel_k0(const Tensor & x); // {"schema": "aten::special_scaled_modified_bessel_k0(Tensor x) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_scaled_modified_bessel_k0_out(const Tensor & x, Tensor & out); // {"schema": "aten::special_scaled_modified_bessel_k0.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor special_scaled_modified_bessel_k1(const Tensor & x); // {"schema": "aten::special_scaled_modified_bessel_k1(Tensor x) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_scaled_modified_bessel_k1_out(const Tensor & x, Tensor & out); // {"schema": "aten::special_scaled_modified_bessel_k1.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor special_shifted_chebyshev_polynomial_t(const Tensor & x, const Tensor & n); // {"schema": "aten::special_shifted_chebyshev_polynomial_t(Tensor x, Tensor n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor special_shifted_chebyshev_polynomial_t(const Scalar & x, const Tensor & n); // {"schema": "aten::special_shifted_chebyshev_polynomial_t.x_scalar(Scalar x, Tensor n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor special_shifted_chebyshev_polynomial_t(const Tensor & x, const Scalar & n); // {"schema": "aten::special_shifted_chebyshev_polynomial_t.n_scalar(Tensor x, Scalar n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_shifted_chebyshev_polynomial_t_out(const Tensor & x, const Tensor & n, Tensor & out); // {"schema": "aten::special_shifted_chebyshev_polynomial_t.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & special_shifted_chebyshev_polynomial_t_out(const Scalar & x, const Tensor & n, Tensor & out); // {"schema": "aten::special_shifted_chebyshev_polynomial_t.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & special_shifted_chebyshev_polynomial_t_out(const Tensor & x, const Scalar & n, Tensor & out); // {"schema": "aten::special_shifted_chebyshev_polynomial_t.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor special_shifted_chebyshev_polynomial_u(const Tensor & x, const Tensor & n); // {"schema": "aten::special_shifted_chebyshev_polynomial_u(Tensor x, Tensor n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor special_shifted_chebyshev_polynomial_u(const Scalar & x, const Tensor & n); // {"schema": "aten::special_shifted_chebyshev_polynomial_u.x_scalar(Scalar x, Tensor n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor special_shifted_chebyshev_polynomial_u(const Tensor & x, const Scalar & n); // {"schema": "aten::special_shifted_chebyshev_polynomial_u.n_scalar(Tensor x, Scalar n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_shifted_chebyshev_polynomial_u_out(const Tensor & x, const Tensor & n, Tensor & out); // {"schema": "aten::special_shifted_chebyshev_polynomial_u.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & special_shifted_chebyshev_polynomial_u_out(const Scalar & x, const Tensor & n, Tensor & out); // {"schema": "aten::special_shifted_chebyshev_polynomial_u.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & special_shifted_chebyshev_polynomial_u_out(const Tensor & x, const Scalar & n, Tensor & out); // {"schema": "aten::special_shifted_chebyshev_polynomial_u.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor special_shifted_chebyshev_polynomial_v(const Tensor & x, const Tensor & n); // {"schema": "aten::special_shifted_chebyshev_polynomial_v(Tensor x, Tensor n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor special_shifted_chebyshev_polynomial_v(const Scalar & x, const Tensor & n); // {"schema": "aten::special_shifted_chebyshev_polynomial_v.x_scalar(Scalar x, Tensor n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor special_shifted_chebyshev_polynomial_v(const Tensor & x, const Scalar & n); // {"schema": "aten::special_shifted_chebyshev_polynomial_v.n_scalar(Tensor x, Scalar n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_shifted_chebyshev_polynomial_v_out(const Tensor & x, const Tensor & n, Tensor & out); // {"schema": "aten::special_shifted_chebyshev_polynomial_v.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & special_shifted_chebyshev_polynomial_v_out(const Scalar & x, const Tensor & n, Tensor & out); // {"schema": "aten::special_shifted_chebyshev_polynomial_v.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & special_shifted_chebyshev_polynomial_v_out(const Tensor & x, const Scalar & n, Tensor & out); // {"schema": "aten::special_shifted_chebyshev_polynomial_v.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor special_shifted_chebyshev_polynomial_w(const Tensor & x, const Tensor & n); // {"schema": "aten::special_shifted_chebyshev_polynomial_w(Tensor x, Tensor n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor special_shifted_chebyshev_polynomial_w(const Scalar & x, const Tensor & n); // {"schema": "aten::special_shifted_chebyshev_polynomial_w.x_scalar(Scalar x, Tensor n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor special_shifted_chebyshev_polynomial_w(const Tensor & x, const Scalar & n); // {"schema": "aten::special_shifted_chebyshev_polynomial_w.n_scalar(Tensor x, Scalar n) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_shifted_chebyshev_polynomial_w_out(const Tensor & x, const Tensor & n, Tensor & out); // {"schema": "aten::special_shifted_chebyshev_polynomial_w.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor & special_shifted_chebyshev_polynomial_w_out(const Scalar & x, const Tensor & n, Tensor & out); // {"schema": "aten::special_shifted_chebyshev_polynomial_w.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & special_shifted_chebyshev_polynomial_w_out(const Tensor & x, const Scalar & n, Tensor & out); // {"schema": "aten::special_shifted_chebyshev_polynomial_w.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor special_spherical_bessel_j0(const Tensor & x); // {"schema": "aten::special_spherical_bessel_j0(Tensor x) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & special_spherical_bessel_j0_out(const Tensor & x, Tensor & out); // {"schema": "aten::special_spherical_bessel_j0.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "False"}
+Tensor _foobar(const Tensor & self, bool arg1, bool arg2, bool arg3); // {"schema": "aten::_foobar(Tensor self, bool arg1=True, bool arg2=True, *, bool arg3=True) -> Tensor", "dispatch": "True", "default": "False"}
+void _fused_adam_(TensorList self, TensorList grads, TensorList exp_avgs, TensorList exp_avg_sqs, TensorList max_exp_avg_sqs, TensorList state_steps, double lr, double beta1, double beta2, double weight_decay, double eps, bool amsgrad, bool maximize, const c10::optional<Tensor> & grad_scale, const c10::optional<Tensor> & found_inf); // {"schema": "aten::_fused_adam_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()", "dispatch": "True", "default": "False"}
+void _fused_adam_(TensorList self, TensorList grads, TensorList exp_avgs, TensorList exp_avg_sqs, TensorList max_exp_avg_sqs, TensorList state_steps, const Tensor & lr, double beta1, double beta2, double weight_decay, double eps, bool amsgrad, bool maximize, const c10::optional<Tensor> & grad_scale, const c10::optional<Tensor> & found_inf); // {"schema": "aten::_fused_adam_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, Tensor lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()", "dispatch": "True", "default": "False"}
+void _fused_adamw_(TensorList self, TensorList grads, TensorList exp_avgs, TensorList exp_avg_sqs, TensorList max_exp_avg_sqs, TensorList state_steps, double lr, double beta1, double beta2, double weight_decay, double eps, bool amsgrad, bool maximize, const c10::optional<Tensor> & grad_scale, const c10::optional<Tensor> & found_inf); // {"schema": "aten::_fused_adamw_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()", "dispatch": "True", "default": "False"}
+void _fused_adamw_(TensorList self, TensorList grads, TensorList exp_avgs, TensorList exp_avg_sqs, TensorList max_exp_avg_sqs, TensorList state_steps, const Tensor & lr, double beta1, double beta2, double weight_decay, double eps, bool amsgrad, bool maximize, const c10::optional<Tensor> & grad_scale, const c10::optional<Tensor> & found_inf); // {"schema": "aten::_fused_adamw_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, Tensor lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()", "dispatch": "True", "default": "False"}
+void _fused_sgd_(TensorList self, TensorList grads, TensorList momentum_buffer_list, double weight_decay, double momentum, double lr, double dampening, bool nesterov, bool maximize, bool is_first_step, const c10::optional<Tensor> & grad_scale, const c10::optional<Tensor> & found_inf); // {"schema": "aten::_fused_sgd_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] momentum_buffer_list, *, float weight_decay, float momentum, float lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()", "dispatch": "True", "default": "False"}
+void _fused_sgd_(TensorList self, TensorList grads, TensorList momentum_buffer_list, double weight_decay, double momentum, const Tensor & lr, double dampening, bool nesterov, bool maximize, bool is_first_step, const c10::optional<Tensor> & grad_scale, const c10::optional<Tensor> & found_inf); // {"schema": "aten::_fused_sgd_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] momentum_buffer_list, *, float weight_decay, float momentum, Tensor lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None) -> ()", "dispatch": "True", "default": "False"}
+void _propagate_xla_data(const Tensor & input, const Tensor & output); // {"schema": "aten::_propagate_xla_data(Tensor input, Tensor output) -> ()", "dispatch": "False", "default": "True"}
+Tensor & _new_zeros_with_same_feature_meta_out(const Tensor & self, const Tensor & other, int64_t self_num_batch_dims, Tensor & out); // {"schema": "aten::_new_zeros_with_same_feature_meta.out(Tensor self, Tensor other, *, int self_num_batch_dims=0, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> _cudnn_ctc_loss_out(const Tensor & log_probs, const Tensor & targets, IntArrayRef input_lengths, IntArrayRef target_lengths, int64_t blank, bool deterministic, bool zero_infinity, Tensor & out0, Tensor & out1); // {"schema": "aten::_cudnn_ctc_loss.out(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank, bool deterministic, bool zero_infinity, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "True"}
+Tensor & _cudnn_rnn_flatten_weight_out(TensorList weight_arr, int64_t weight_stride0, c10::SymInt input_size, int64_t mode, c10::SymInt hidden_size, c10::SymInt proj_size, int64_t num_layers, bool batch_first, bool bidirectional, Tensor & out); // {"schema": "aten::_cudnn_rnn_flatten_weight.out(Tensor[] weight_arr, int weight_stride0, SymInt input_size, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, bool bidirectional, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &,Tensor &,Tensor &,Tensor &> _cudnn_rnn_out(const Tensor & input, TensorList weight, int64_t weight_stride0, const c10::optional<Tensor> & weight_buf, const Tensor & hx, const c10::optional<Tensor> & cx, int64_t mode, c10::SymInt hidden_size, c10::SymInt proj_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, c10::SymIntArrayRef batch_sizes, const c10::optional<Tensor> & dropout_state, Tensor & out0, Tensor & out1, Tensor & out2, Tensor & out3, Tensor & out4); // {"schema": "aten::_cudnn_rnn.out(Tensor input, Tensor[] weight, int weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, SymInt[] batch_sizes, Tensor? dropout_state, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3, Tensor(e!) out4) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!), Tensor(e!))", "dispatch": "True", "default": "True"}
+void _cudnn_rnn_backward_out(const Tensor & input, TensorList weight, int64_t weight_stride0, const Tensor & weight_buf, const Tensor & hx, const c10::optional<Tensor> & cx, const Tensor & output, const c10::optional<Tensor> & grad_output, const c10::optional<Tensor> & grad_hy, const c10::optional<Tensor> & grad_cy, int64_t mode, c10::SymInt hidden_size, c10::SymInt proj_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, c10::SymIntArrayRef batch_sizes, const c10::optional<Tensor> & dropout_state, const Tensor & reserve, ::std::array<bool,4> output_mask, Tensor & out0, Tensor & out1, Tensor & out2, TensorList out3); // {"schema": "aten::_cudnn_rnn_backward.out(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, SymInt[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!)[] out3) -> ()", "dispatch": "True", "default": "True"}
+Tensor & _cudnn_init_dropout_state_out(double dropout, bool train, int64_t dropout_seed, Tensor & out); // {"schema": "aten::_cudnn_init_dropout_state.out(float dropout, bool train, int dropout_seed, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> _fused_dropout_out(const Tensor & self, double p, c10::optional<Generator> generator, Tensor & out0, Tensor & out1); // {"schema": "aten::_fused_dropout.out(Tensor self, float p, Generator? generator=None, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "True"}
+Tensor & _masked_scale_out(const Tensor & self, const Tensor & mask, double scale, Tensor & out); // {"schema": "aten::_masked_scale.out(Tensor self, Tensor mask, float scale, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> native_dropout_out(const Tensor & input, double p, c10::optional<bool> train, Tensor & out0, Tensor & out1); // {"schema": "aten::native_dropout.out(Tensor input, float p, bool? train, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "True"}
+Tensor & native_dropout_backward_out(const Tensor & grad_output, const Tensor & mask, double scale, Tensor & out); // {"schema": "aten::native_dropout_backward.out(Tensor grad_output, Tensor mask, float scale, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _conj_physical_out(const Tensor & self, Tensor & out); // {"schema": "aten::_conj_physical.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _add_relu_out(const Tensor & self, const Scalar & other, const Scalar & alpha, Tensor & out); // {"schema": "aten::_add_relu.Scalar_out(Tensor self, Scalar other, Scalar alpha=1, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & add_out(const Tensor & self, const Scalar & other, const Scalar & alpha, Tensor & out); // {"schema": "aten::add.Scalar_out(Tensor self, Scalar other, Scalar alpha=1, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & affine_grid_generator_out(const Tensor & theta, c10::SymIntArrayRef size, bool align_corners, Tensor & out); // {"schema": "aten::affine_grid_generator.out(Tensor theta, SymInt[] size, bool align_corners, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _test_functorch_fallback_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::_test_functorch_fallback.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & bartlett_window_out(int64_t window_length, Tensor & out); // {"schema": "aten::bartlett_window.out(int window_length, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & bartlett_window_out(int64_t window_length, bool periodic, Tensor & out); // {"schema": "aten::bartlett_window.periodic_out(int window_length, bool periodic, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & quantized_batch_norm_out(const Tensor & input, const c10::optional<Tensor> & weight, const c10::optional<Tensor> & bias, const Tensor & mean, const Tensor & var, double eps, double output_scale, int64_t output_zero_point, Tensor & out); // {"schema": "aten::quantized_batch_norm.out(Tensor input, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & bernoulli_out(const Tensor & self, const Tensor & p, c10::optional<Generator> generator, Tensor & out); // {"schema": "aten::bernoulli.Tensor_out(Tensor self, Tensor p, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor bernoulli(const Tensor & self, const Tensor & p, c10::optional<Generator> generator); // {"schema": "aten::bernoulli.Tensor(Tensor self, Tensor p, *, Generator? generator=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & bernoulli_out(const Tensor & self, double p, c10::optional<Generator> generator, Tensor & out); // {"schema": "aten::bernoulli.float_out(Tensor self, float p=0.5, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & binary_cross_entropy_with_logits_out(const Tensor & self, const Tensor & target, const c10::optional<Tensor> & weight, const c10::optional<Tensor> & pos_weight, int64_t reduction, Tensor & out); // {"schema": "aten::binary_cross_entropy_with_logits.out(Tensor self, Tensor target, Tensor? weight=None, Tensor? pos_weight=None, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & bincount_out(const Tensor & self, const c10::optional<Tensor> & weights, int64_t minlength, Tensor & out); // {"schema": "aten::bincount.out(Tensor self, Tensor? weights=None, int minlength=0, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & blackman_window_out(int64_t window_length, Tensor & out); // {"schema": "aten::blackman_window.out(int window_length, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & blackman_window_out(int64_t window_length, bool periodic, Tensor & out); // {"schema": "aten::blackman_window.periodic_out(int window_length, bool periodic, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & block_diag_out(TensorList tensors, Tensor & out); // {"schema": "aten::block_diag.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & constant_pad_nd_out(const Tensor & self, c10::SymIntArrayRef pad, const Scalar & value, Tensor & out); // {"schema": "aten::constant_pad_nd.out(Tensor self, SymInt[] pad, Scalar value=0, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & convolution_out(const Tensor & input, const Tensor & weight, const c10::optional<Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, Tensor & out); // {"schema": "aten::convolution.out(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &,Tensor &> convolution_backward_out(const Tensor & grad_output, const Tensor & input, const Tensor & weight, OptionalSymIntArrayRef bias_sizes, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array<bool,3> output_mask, Tensor & out0, Tensor & out1, Tensor & out2); // {"schema": "aten::convolution_backward.out(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))", "dispatch": "True", "default": "True"}
+Tensor & convolution_overrideable_out(const Tensor & input, const Tensor & weight, const c10::optional<Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, Tensor & out); // {"schema": "aten::convolution_overrideable.out(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &,Tensor &> convolution_backward_overrideable_out(const Tensor & grad_output, const Tensor & input, const Tensor & weight, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array<bool,3> output_mask, Tensor & out0, Tensor & out1, Tensor & out2); // {"schema": "aten::convolution_backward_overrideable.out(Tensor grad_output, Tensor input, Tensor weight, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))", "dispatch": "True", "default": "True"}
+Tensor & _convolution_out(const Tensor & input, const Tensor & weight, const c10::optional<Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32, Tensor & out); // {"schema": "aten::_convolution.out(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & conv_tbc_out(const Tensor & self, const Tensor & weight, const Tensor & bias, int64_t pad, Tensor & out); // {"schema": "aten::conv_tbc.out(Tensor self, Tensor weight, Tensor bias, int pad=0, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & copy_out(const Tensor & self, const Tensor & src, bool non_blocking, Tensor & out); // {"schema": "aten::copy.out(Tensor self, Tensor src, bool non_blocking=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _copy_from_out(const Tensor & self, const Tensor & dst, bool non_blocking, Tensor & out); // {"schema": "aten::_copy_from.out(Tensor self, Tensor dst, bool non_blocking=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _copy_from_and_resize_out(const Tensor & self, const Tensor & dst, Tensor & out); // {"schema": "aten::_copy_from_and_resize.out(Tensor self, Tensor dst, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & count_nonzero_out(const Tensor & self, IntArrayRef dim, Tensor & out); // {"schema": "aten::count_nonzero.dim_IntList_out(Tensor self, int[] dim, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & count_nonzero_out(const Tensor & self, c10::optional<int64_t> dim, Tensor & out); // {"schema": "aten::count_nonzero.out(Tensor self, int? dim=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & cudnn_affine_grid_generator_out(const Tensor & theta, int64_t N, int64_t C, int64_t H, int64_t W, Tensor & out); // {"schema": "aten::cudnn_affine_grid_generator.out(Tensor theta, int N, int C, int H, int W, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & cudnn_affine_grid_generator_backward_out(const Tensor & grad, int64_t N, int64_t C, int64_t H, int64_t W, Tensor & out); // {"schema": "aten::cudnn_affine_grid_generator_backward.out(Tensor grad, int N, int C, int H, int W, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &,Tensor &,Tensor &> cudnn_batch_norm_out(const Tensor & input, const Tensor & weight, const c10::optional<Tensor> & bias, const c10::optional<Tensor> & running_mean, const c10::optional<Tensor> & running_var, bool training, double exponential_average_factor, double epsilon, Tensor & out0, Tensor & out1, Tensor & out2, Tensor & out3); // {"schema": "aten::cudnn_batch_norm.out(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!))", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &,Tensor &> cudnn_batch_norm_backward_out(const Tensor & input, const Tensor & grad_output, const Tensor & weight, const c10::optional<Tensor> & running_mean, const c10::optional<Tensor> & running_var, const c10::optional<Tensor> & save_mean, const c10::optional<Tensor> & save_var, double epsilon, const Tensor & reserveSpace, Tensor & out0, Tensor & out1, Tensor & out2); // {"schema": "aten::cudnn_batch_norm_backward.out(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, float epsilon, Tensor reserveSpace, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))", "dispatch": "True", "default": "True"}
+Tensor & cudnn_convolution_transpose_out(const Tensor & self, const Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, bool allow_tf32, Tensor & out); // {"schema": "aten::cudnn_convolution_transpose.out(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _mps_convolution_transpose_out(const Tensor & self, const Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, Tensor & out); // {"schema": "aten::_mps_convolution_transpose.out(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> mps_convolution_transpose_backward_out(const Tensor & self, const Tensor & grad_output, const Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, ::std::array<bool,2> output_mask, Tensor & out0, Tensor & out1); // {"schema": "aten::mps_convolution_transpose_backward.out(Tensor self, Tensor grad_output, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool[2] output_mask, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "True"}
+Tensor & cudnn_convolution_relu_out(const Tensor & self, const Tensor & weight, const c10::optional<Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups, Tensor & out); // {"schema": "aten::cudnn_convolution_relu.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & cudnn_convolution_add_relu_out(const Tensor & self, const Tensor & weight, const Tensor & z, const c10::optional<Scalar> & alpha, const c10::optional<Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups, Tensor & out); // {"schema": "aten::cudnn_convolution_add_relu.out(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & cudnn_grid_sampler_out(const Tensor & self, const Tensor & grid, Tensor & out); // {"schema": "aten::cudnn_grid_sampler.out(Tensor self, Tensor grid, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> cudnn_grid_sampler_backward_out(const Tensor & self, const Tensor & grid, const Tensor & grad_output, Tensor & out0, Tensor & out1); // {"schema": "aten::cudnn_grid_sampler_backward.out(Tensor self, Tensor grid, Tensor grad_output, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> _ctc_loss_out(const Tensor & log_probs, const Tensor & targets, IntArrayRef input_lengths, IntArrayRef target_lengths, int64_t blank, bool zero_infinity, Tensor & out0, Tensor & out1); // {"schema": "aten::_ctc_loss.out(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, bool zero_infinity=False, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> _ctc_loss_out(const Tensor & log_probs, const Tensor & targets, const Tensor & input_lengths, const Tensor & target_lengths, int64_t blank, bool zero_infinity, Tensor & out0, Tensor & out1); // {"schema": "aten::_ctc_loss.Tensor_out(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank=0, bool zero_infinity=False, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "True"}
+Tensor & _ctc_loss_backward_out(const Tensor & grad, const Tensor & log_probs, const Tensor & targets, IntArrayRef input_lengths, IntArrayRef target_lengths, const Tensor & neg_log_likelihood, const Tensor & log_alpha, int64_t blank, bool zero_infinity, Tensor & out); // {"schema": "aten::_ctc_loss_backward.out(Tensor grad, Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, bool zero_infinity=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & diag_embed_out(const Tensor & self, int64_t offset, int64_t dim1, int64_t dim2, Tensor & out); // {"schema": "aten::diag_embed.out(Tensor self, int offset=0, int dim1=-2, int dim2=-1, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & diagonal_backward_out(const Tensor & grad_output, c10::SymIntArrayRef input_sizes, int64_t offset, int64_t dim1, int64_t dim2, Tensor & out); // {"schema": "aten::diagonal_backward.out(Tensor grad_output, SymInt[] input_sizes, int offset, int dim1, int dim2, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & div_out(const Tensor & self, const Scalar & other, Tensor & out); // {"schema": "aten::div.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & div_out(const Tensor & self, const Scalar & other, c10::optional<c10::string_view> rounding_mode, Tensor & out); // {"schema": "aten::div.Scalar_mode_out(Tensor self, Scalar other, *, str? rounding_mode, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & embedding_out(const Tensor & weight, const Tensor & indices, c10::SymInt padding_idx, bool scale_grad_by_freq, bool sparse, Tensor & out); // {"schema": "aten::embedding.out(Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & embedding_dense_backward_out(const Tensor & grad_output, const Tensor & indices, c10::SymInt num_weights, c10::SymInt padding_idx, bool scale_grad_by_freq, Tensor & out); // {"schema": "aten::embedding_dense_backward.out(Tensor grad_output, Tensor indices, SymInt num_weights, SymInt padding_idx, bool scale_grad_by_freq, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & embedding_renorm_out(const Tensor & self, const Tensor & indices, double max_norm, double norm_type, Tensor & out); // {"schema": "aten::embedding_renorm.out(Tensor self, Tensor indices, float max_norm, float norm_type, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor embedding_renorm(const Tensor & self, const Tensor & indices, double max_norm, double norm_type); // {"schema": "aten::embedding_renorm(Tensor self, Tensor indices, float max_norm, float norm_type) -> Tensor", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &,Tensor &,Tensor &> _embedding_bag_forward_only_out(const Tensor & weight, const Tensor & indices, const Tensor & offsets, bool scale_grad_by_freq, int64_t mode, bool sparse, const c10::optional<Tensor> & per_sample_weights, bool include_last_offset, int64_t padding_idx, Tensor & out0, Tensor & out1, Tensor & out2, Tensor & out3); // {"schema": "aten::_embedding_bag_forward_only.out(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False, int padding_idx=-1, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!))", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &,Tensor &,Tensor &> _embedding_bag_out(const Tensor & weight, const Tensor & indices, const Tensor & offsets, bool scale_grad_by_freq, int64_t mode, bool sparse, const c10::optional<Tensor> & per_sample_weights, bool include_last_offset, int64_t padding_idx, Tensor & out0, Tensor & out1, Tensor & out2, Tensor & out3); // {"schema": "aten::_embedding_bag.out(Tensor weight, Tensor indices, Tensor offsets, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False, int padding_idx=-1, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!))", "dispatch": "True", "default": "True"}
+Tensor & _embedding_bag_dense_backward_out(const Tensor & grad, const Tensor & indices, const Tensor & offset2bag, const Tensor & bag_size, const Tensor & maximum_indices, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const c10::optional<Tensor> & per_sample_weights, int64_t padding_idx, Tensor & out); // {"schema": "aten::_embedding_bag_dense_backward.out(Tensor grad, Tensor indices, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _embedding_bag_per_sample_weights_backward_out(const Tensor & grad, const Tensor & weight, const Tensor & indices, const Tensor & offsets, const Tensor & offset2bag, int64_t mode, int64_t padding_idx, Tensor & out); // {"schema": "aten::_embedding_bag_per_sample_weights_backward.out(Tensor grad, Tensor weight, Tensor indices, Tensor offsets, Tensor offset2bag, int mode, int padding_idx=-1, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & empty_out(IntArrayRef size, c10::optional<DimnameList> names, c10::optional<MemoryFormat> memory_format, Tensor & out); // {"schema": "aten::empty.names_out(int[] size, *, Dimname[]? names, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & empty_permuted_out(c10::SymIntArrayRef size, IntArrayRef physical_layout, Tensor & out); // {"schema": "aten::empty_permuted.out(SymInt[] size, int[] physical_layout, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & new_empty_out(const Tensor & self, c10::SymIntArrayRef size, Tensor & out); // {"schema": "aten::new_empty.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & new_empty_strided_out(const Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, Tensor & out); // {"schema": "aten::new_empty_strided.out(Tensor self, SymInt[] size, SymInt[] stride, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & new_full_out(const Tensor & self, c10::SymIntArrayRef size, const Scalar & fill_value, Tensor & out); // {"schema": "aten::new_full.out(Tensor self, SymInt[] size, Scalar fill_value, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & new_zeros_out(const Tensor & self, c10::SymIntArrayRef size, Tensor & out); // {"schema": "aten::new_zeros.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & new_ones_out(const Tensor & self, c10::SymIntArrayRef size, Tensor & out); // {"schema": "aten::new_ones.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _empty_affine_quantized_out(c10::SymIntArrayRef size, double scale, int64_t zero_point, c10::optional<MemoryFormat> memory_format, Tensor & out); // {"schema": "aten::_empty_affine_quantized.out(SymInt[] size, *, float scale=1, int zero_point=0, MemoryFormat? memory_format=contiguous_format, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _empty_per_channel_affine_quantized_out(c10::SymIntArrayRef size, const Tensor & scales, const Tensor & zero_points, int64_t axis, c10::optional<MemoryFormat> memory_format, Tensor & out); // {"schema": "aten::_empty_per_channel_affine_quantized.out(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, MemoryFormat? memory_format=contiguous_format, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+const Tensor & resize_out(const Tensor & self, c10::SymIntArrayRef size, c10::optional<MemoryFormat> memory_format, const Tensor & out); // {"schema": "aten::resize.out(Tensor self, SymInt[] size, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor resize(const Tensor & self, c10::SymIntArrayRef size, c10::optional<MemoryFormat> memory_format); // {"schema": "aten::resize(Tensor self, SymInt[] size, *, MemoryFormat? memory_format=None) -> Tensor", "dispatch": "True", "default": "True"}
+const Tensor & _resize_output_out(const Tensor & self, c10::SymIntArrayRef size, Device device, const Tensor & out); // {"schema": "aten::_resize_output.out(Tensor self, SymInt[] size, Device device, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor _resize_output(const Tensor & self, c10::SymIntArrayRef size, Device device); // {"schema": "aten::_resize_output(Tensor self, SymInt[] size, Device device) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & empty_quantized_out(IntArrayRef size, const Tensor & qtensor, c10::optional<MemoryFormat> memory_format, Tensor & out); // {"schema": "aten::empty_quantized.out(int[] size, Tensor qtensor, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & empty_like_out(const Tensor & self, c10::optional<MemoryFormat> memory_format, Tensor & out); // {"schema": "aten::empty_like.out(Tensor self, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & empty_strided_out(c10::SymIntArrayRef size, c10::SymIntArrayRef stride, Tensor & out); // {"schema": "aten::empty_strided.out(SymInt[] size, SymInt[] stride, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & fill_out(const Tensor & self, const Scalar & value, Tensor & out); // {"schema": "aten::fill.Scalar_out(Tensor self, Scalar value, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & fill_out(const Tensor & self, const Tensor & value, Tensor & out); // {"schema": "aten::fill.Tensor_out(Tensor self, Tensor value, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & floor_divide_out(const Tensor & self, const Scalar & other, Tensor & out); // {"schema": "aten::floor_divide.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & full_out(IntArrayRef size, const Scalar & fill_value, c10::optional<DimnameList> names, Tensor & out); // {"schema": "aten::full.names_out(int[] size, Scalar fill_value, *, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & full_like_out(const Tensor & self, const Scalar & fill_value, c10::optional<MemoryFormat> memory_format, Tensor & out); // {"schema": "aten::full_like.out(Tensor self, Scalar fill_value, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & from_file_out(c10::string_view filename, c10::optional<bool> shared, c10::optional<int64_t> size, Tensor & out); // {"schema": "aten::from_file.out(str filename, bool? shared=None, int? size=0, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & grid_sampler_2d_out(const Tensor & input, const Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, Tensor & out); // {"schema": "aten::grid_sampler_2d.out(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> grid_sampler_2d_backward_out(const Tensor & grad_output, const Tensor & input, const Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, ::std::array<bool,2> output_mask, Tensor & out0, Tensor & out1); // {"schema": "aten::grid_sampler_2d_backward.out(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, bool[2] output_mask, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "True"}
+Tensor & _grid_sampler_2d_cpu_fallback_out(const Tensor & input, const Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, Tensor & out); // {"schema": "aten::_grid_sampler_2d_cpu_fallback.out(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & grid_sampler_3d_out(const Tensor & input, const Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, Tensor & out); // {"schema": "aten::grid_sampler_3d.out(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> grid_sampler_3d_backward_out(const Tensor & grad_output, const Tensor & input, const Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, ::std::array<bool,2> output_mask, Tensor & out0, Tensor & out1); // {"schema": "aten::grid_sampler_3d_backward.out(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, bool[2] output_mask, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "True"}
+Tensor & hann_window_out(int64_t window_length, Tensor & out); // {"schema": "aten::hann_window.out(int window_length, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & hann_window_out(int64_t window_length, bool periodic, Tensor & out); // {"schema": "aten::hann_window.periodic_out(int window_length, bool periodic, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & hamming_window_out(int64_t window_length, Tensor & out); // {"schema": "aten::hamming_window.out(int window_length, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & hamming_window_out(int64_t window_length, bool periodic, Tensor & out); // {"schema": "aten::hamming_window.periodic_out(int window_length, bool periodic, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & hamming_window_out(int64_t window_length, bool periodic, double alpha, Tensor & out); // {"schema": "aten::hamming_window.periodic_alpha_out(int window_length, bool periodic, float alpha, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & hamming_window_out(int64_t window_length, bool periodic, double alpha, double beta, Tensor & out); // {"schema": "aten::hamming_window.periodic_alpha_beta_out(int window_length, bool periodic, float alpha, float beta, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & kaiser_window_out(int64_t window_length, Tensor & out); // {"schema": "aten::kaiser_window.out(int window_length, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & kaiser_window_out(int64_t window_length, bool periodic, Tensor & out); // {"schema": "aten::kaiser_window.periodic_out(int window_length, bool periodic, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & kaiser_window_out(int64_t window_length, bool periodic, double beta, Tensor & out); // {"schema": "aten::kaiser_window.beta_out(int window_length, bool periodic, float beta, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &,Tensor &> native_group_norm_out(const Tensor & input, const c10::optional<Tensor> & weight, const c10::optional<Tensor> & bias, c10::SymInt N, c10::SymInt C, c10::SymInt HxW, int64_t group, double eps, Tensor & out0, Tensor & out1, Tensor & out2); // {"schema": "aten::native_group_norm.out(Tensor input, Tensor? weight, Tensor? bias, SymInt N, SymInt C, SymInt HxW, int group, float eps, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &,Tensor &> native_group_norm_backward_out(const Tensor & grad_out, const Tensor & input, const Tensor & mean, const Tensor & rstd, const c10::optional<Tensor> & weight, c10::SymInt N, c10::SymInt C, c10::SymInt HxW, int64_t group, ::std::array<bool,3> output_mask, Tensor & out0, Tensor & out1, Tensor & out2); // {"schema": "aten::native_group_norm_backward.out(Tensor grad_out, Tensor input, Tensor mean, Tensor rstd, Tensor? weight, SymInt N, SymInt C, SymInt HxW, int group, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))", "dispatch": "True", "default": "True"}
+Tensor & index_put_out(const Tensor & self, const c10::List<c10::optional<Tensor>> & indices, const Tensor & values, bool accumulate, Tensor & out); // {"schema": "aten::index_put.out(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _index_put_impl_out(const Tensor & self, const c10::List<c10::optional<Tensor>> & indices, const Tensor & values, bool accumulate, bool unsafe, Tensor & out); // {"schema": "aten::_index_put_impl.out(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False, bool unsafe=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor _index_put_impl(const Tensor & self, const c10::List<c10::optional<Tensor>> & indices, const Tensor & values, bool accumulate, bool unsafe); // {"schema": "aten::_index_put_impl(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & isnan_out(const Tensor & self, Tensor & out); // {"schema": "aten::isnan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &,Tensor &> native_layer_norm_out(const Tensor & input, c10::SymIntArrayRef normalized_shape, const c10::optional<Tensor> & weight, const c10::optional<Tensor> & bias, double eps, Tensor & out0, Tensor & out1, Tensor & out2); // {"schema": "aten::native_layer_norm.out(Tensor input, SymInt[] normalized_shape, Tensor? weight, Tensor? bias, float eps, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &,Tensor &> native_layer_norm_backward_out(const Tensor & grad_out, const Tensor & input, c10::SymIntArrayRef normalized_shape, const Tensor & mean, const Tensor & rstd, const c10::optional<Tensor> & weight, const c10::optional<Tensor> & bias, ::std::array<bool,3> output_mask, Tensor & out0, Tensor & out1, Tensor & out2); // {"schema": "aten::native_layer_norm_backward.out(Tensor grad_out, Tensor input, SymInt[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &,Tensor &> linear_backward_out(const Tensor & self, const Tensor & grad_output, const Tensor & weight, ::std::array<bool,3> output_mask, Tensor & out0, Tensor & out1, Tensor & out2); // {"schema": "aten::linear_backward.out(Tensor self, Tensor grad_output, Tensor weight, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))", "dispatch": "True", "default": "True"}
+Tensor & mkldnn_linear_out(const Tensor & self, const Tensor & weight, const c10::optional<Tensor> & bias, Tensor & out); // {"schema": "aten::mkldnn_linear.out(Tensor self, Tensor weight, Tensor? bias=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & mkldnn_linear_backward_input_out(IntArrayRef input_size, const Tensor & grad_output, const Tensor & weight, Tensor & out); // {"schema": "aten::mkldnn_linear_backward_input.out(int[] input_size, Tensor grad_output, Tensor weight, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> mkldnn_linear_backward_weights_out(const Tensor & grad_output, const Tensor & input, const Tensor & weight, bool bias_defined, Tensor & out0, Tensor & out1); // {"schema": "aten::mkldnn_linear_backward_weights.out(Tensor grad_output, Tensor input, Tensor weight, bool bias_defined, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &,Tensor &> mkldnn_linear_backward_out(const Tensor & self, const Tensor & grad_output, const Tensor & weight, ::std::array<bool,3> output_mask, Tensor & out0, Tensor & out1, Tensor & out2); // {"schema": "aten::mkldnn_linear_backward.out(Tensor self, Tensor grad_output, Tensor weight, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> matmul_backward_out(const Tensor & grad, const Tensor & self, const Tensor & other, ::std::array<bool,2> mask, Tensor & out0, Tensor & out1); // {"schema": "aten::matmul_backward.out(Tensor grad, Tensor self, Tensor other, bool[2] mask, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> _aminmax_out(const Tensor & self, Tensor & out0, Tensor & out1); // {"schema": "aten::_aminmax.out(Tensor self, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> _aminmax_out(const Tensor & self, int64_t dim, bool keepdim, Tensor & out0, Tensor & out1); // {"schema": "aten::_aminmax.dim_out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "True"}
+Tensor & max_pool2d_backward_out(const Tensor & grad_output, const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode, Tensor & out); // {"schema": "aten::max_pool2d_backward.out(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & mkldnn_max_pool2d_out(const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode, Tensor & out); // {"schema": "aten::mkldnn_max_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & mkldnn_max_pool2d_backward_out(const Tensor & grad_output, const Tensor & output, const Tensor & input, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode, Tensor & out); // {"schema": "aten::mkldnn_max_pool2d_backward.out(Tensor grad_output, Tensor output, Tensor input, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & mkldnn_max_pool3d_out(const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode, Tensor & out); // {"schema": "aten::mkldnn_max_pool3d.out(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & mkldnn_max_pool3d_backward_out(const Tensor & grad_output, const Tensor & output, const Tensor & input, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode, Tensor & out); // {"schema": "aten::mkldnn_max_pool3d_backward.out(Tensor grad_output, Tensor output, Tensor input, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & quantized_max_pool1d_out(const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode, Tensor & out); // {"schema": "aten::quantized_max_pool1d.out(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & quantized_max_pool2d_out(const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode, Tensor & out); // {"schema": "aten::quantized_max_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & quantized_max_pool3d_out(const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, bool ceil_mode, Tensor & out); // {"schema": "aten::quantized_max_pool3d.out(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & median_out(const Tensor & self, Tensor & out); // {"schema": "aten::median.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & nanmedian_out(const Tensor & self, Tensor & out); // {"schema": "aten::nanmedian.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
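+// Editorial note (hedged, not part of the generated header): each declaration above carries
+// its ATen schema string plus two flags in a JSON trailer. This style of generated header is
+// typically consumed by out-of-tree backends; "dispatch": "True" is understood to mean the
+// operator has backend-specific kernels registered, and "default": "True" that a composite
+// default implementation exists which external backends may fall back to.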
+Tensor & _mps_convolution_out(const Tensor & self, const Tensor & weight, const c10::optional & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, Tensor & out); // {"schema": "aten::_mps_convolution.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+::std::tuple mps_convolution_backward_out(const Tensor & self, const Tensor & grad_output, const Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, ::std::array output_mask, Tensor & out0, Tensor & out1, Tensor & out2); // {"schema": "aten::mps_convolution_backward.out(Tensor self, Tensor grad_output, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))", "dispatch": "True", "default": "True"}
+Tensor & mkldnn_convolution_out(const Tensor & self, const Tensor & weight, const c10::optional & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, Tensor & out); // {"schema": "aten::mkldnn_convolution.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+::std::tuple mkldnn_rnn_layer_out(const Tensor & input, const Tensor & weight0, const Tensor & weight1, const Tensor & weight2, const Tensor & weight3, const Tensor & hx_, const Tensor & cx_, bool reverse, IntArrayRef batch_sizes, int64_t mode, int64_t hidden_size, int64_t num_layers, bool has_biases, bool bidirectional, bool batch_first, bool train, Tensor & out0, Tensor & out1, Tensor & out2, Tensor & out3); // {"schema": "aten::mkldnn_rnn_layer.out(Tensor input, Tensor weight0, Tensor weight1, Tensor weight2, Tensor weight3, Tensor hx_, Tensor cx_, bool reverse, int[] batch_sizes, int mode, int hidden_size, int num_layers, bool has_biases, bool bidirectional, bool batch_first, bool train, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!))", "dispatch": "True", "default": "True"}
+::std::tuple mkldnn_rnn_layer_backward_out(const Tensor & input, const Tensor & weight1, const Tensor & weight2, const Tensor & weight3, const Tensor & weight4, const Tensor & hx_, const Tensor & cx_tmp, const Tensor & output, const Tensor & hy_, const Tensor & cy_, const c10::optional & grad_output, const c10::optional & grad_hy, const c10::optional & grad_cy, bool reverse, int64_t mode, int64_t hidden_size, int64_t num_layers, bool has_biases, bool train, bool bidirectional, IntArrayRef batch_sizes, bool batch_first, const Tensor & workspace, Tensor & out0, Tensor & out1, Tensor & out2, Tensor & out3, Tensor & out4, Tensor & out5, Tensor & out6); // {"schema": "aten::mkldnn_rnn_layer_backward.out(Tensor input, Tensor weight1, Tensor weight2, Tensor weight3, Tensor weight4, Tensor hx_, Tensor cx_tmp, Tensor output, Tensor hy_, Tensor cy_, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, bool reverse, int mode, int hidden_size, int num_layers, bool has_biases, bool train, bool bidirectional, int[] batch_sizes, bool batch_first, Tensor workspace, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3, Tensor(e!) out4, Tensor(f!) out5, Tensor(g!) out6) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!), Tensor(e!), Tensor(f!), Tensor(g!))", "dispatch": "True", "default": "True"}
+::std::tuple miopen_batch_norm_out(const Tensor & input, const Tensor & weight, const c10::optional & bias, const c10::optional & running_mean, const c10::optional & running_var, bool training, double exponential_average_factor, double epsilon, Tensor & out0, Tensor & out1, Tensor & out2); // {"schema": "aten::miopen_batch_norm.out(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))", "dispatch": "True", "default": "True"}
+::std::tuple miopen_batch_norm_backward_out(const Tensor & input, const Tensor & grad_output, const Tensor & weight, const c10::optional & running_mean, const c10::optional & running_var, const c10::optional & save_mean, const c10::optional & save_var, double epsilon, Tensor & out0, Tensor & out1, Tensor & out2); // {"schema": "aten::miopen_batch_norm_backward.out(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, float epsilon, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))", "dispatch": "True", "default": "True"}
+Tensor & miopen_convolution_out(const Tensor & self, const Tensor & weight, const c10::optional & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, Tensor & out); // {"schema": "aten::miopen_convolution.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & miopen_convolution_transpose_out(const Tensor & self, const Tensor & weight, const c10::optional & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, Tensor & out); // {"schema": "aten::miopen_convolution_transpose.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & miopen_depthwise_convolution_out(const Tensor & self, const Tensor & weight, const c10::optional & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, Tensor & out); // {"schema": "aten::miopen_depthwise_convolution.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+::std::tuple miopen_rnn_out(const Tensor & input, TensorList weight, int64_t weight_stride0, const Tensor & hx, const c10::optional & cx, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntArrayRef batch_sizes, const c10::optional & dropout_state, Tensor & out0, Tensor & out1, Tensor & out2, Tensor & out3, Tensor & out4); // {"schema": "aten::miopen_rnn.out(Tensor input, Tensor[] weight, int weight_stride0, Tensor hx, Tensor? cx, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3, Tensor(e!) out4) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!), Tensor(e!))", "dispatch": "True", "default": "True"}
+void miopen_rnn_backward_out(const Tensor & input, TensorList weight, int64_t weight_stride0, const Tensor & weight_buf, const Tensor & hx, const c10::optional & cx, const Tensor & output, const c10::optional & grad_output, const c10::optional & grad_hy, const c10::optional & grad_cy, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntArrayRef batch_sizes, const c10::optional & dropout_state, const Tensor & reserve, ::std::array output_mask, Tensor & out0, Tensor & out1, Tensor & out2, TensorList out3); // {"schema": "aten::miopen_rnn_backward.out(Tensor input, Tensor[] weight, int weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, int mode, int hidden_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, int[] batch_sizes, Tensor? dropout_state, Tensor reserve, bool[4] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!)[] out3) -> ()", "dispatch": "True", "default": "True"}
+Tensor & _sparse_sparse_matmul_out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::_sparse_sparse_matmul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & mul_out(const Tensor & self, const Scalar & other, Tensor & out); // {"schema": "aten::mul.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor,Tensor,Tensor,Tensor,Tensor> _native_batch_norm_legit_functional(const Tensor & input, const c10::optional<Tensor> & weight, const c10::optional<Tensor> & bias, const Tensor & running_mean, const Tensor & running_var, bool training, double momentum, double eps); // {"schema": "aten::_native_batch_norm_legit_functional(Tensor input, Tensor? weight, Tensor? bias, Tensor running_mean, Tensor running_var, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor, Tensor running_mean_out, Tensor running_var_out)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &,Tensor &> _native_batch_norm_legit_no_training_out(const Tensor & input, const c10::optional<Tensor> & weight, const c10::optional<Tensor> & bias, const Tensor & running_mean, const Tensor & running_var, double momentum, double eps, Tensor & out0, Tensor & out1, Tensor & out2); // {"schema": "aten::_native_batch_norm_legit_no_training.out(Tensor input, Tensor? weight, Tensor? bias, Tensor running_mean, Tensor running_var, float momentum, float eps, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> batch_norm_stats_out(const Tensor & input, double eps, Tensor & out0, Tensor & out1); // {"schema": "aten::batch_norm_stats.out(Tensor input, float eps, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> batch_norm_gather_stats_out(const Tensor & input, const Tensor & mean, const Tensor & invstd, const c10::optional<Tensor> & running_mean, const c10::optional<Tensor> & running_var, double momentum, double eps, int64_t count, Tensor & out0, Tensor & out1); // {"schema": "aten::batch_norm_gather_stats.out(Tensor input, Tensor mean, Tensor invstd, Tensor? running_mean, Tensor? running_var, float momentum, float eps, int count, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> batch_norm_gather_stats_with_counts_out(const Tensor & input, const Tensor & mean, const Tensor & invstd, const c10::optional<Tensor> & running_mean, const c10::optional<Tensor> & running_var, double momentum, double eps, const Tensor & counts, Tensor & out0, Tensor & out1); // {"schema": "aten::batch_norm_gather_stats_with_counts.out(Tensor input, Tensor mean, Tensor invstd, Tensor? running_mean, Tensor? running_var, float momentum, float eps, Tensor counts, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &,Tensor &> native_batch_norm_backward_out(const Tensor & grad_out, const Tensor & input, const c10::optional<Tensor> & weight, const c10::optional<Tensor> & running_mean, const c10::optional<Tensor> & running_var, const c10::optional<Tensor> & save_mean, const c10::optional<Tensor> & save_invstd, bool train, double eps, ::std::array<bool,3> output_mask, Tensor & out0, Tensor & out1, Tensor & out2); // {"schema": "aten::native_batch_norm_backward.out(Tensor grad_out, Tensor input, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_invstd, bool train, float eps, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &,Tensor &,Tensor &> batch_norm_backward_reduce_out(const Tensor & grad_out, const Tensor & input, const Tensor & mean, const Tensor & invstd, const c10::optional<Tensor> & weight, bool input_g, bool weight_g, bool bias_g, Tensor & out0, Tensor & out1, Tensor & out2, Tensor & out3); // {"schema": "aten::batch_norm_backward_reduce.out(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, bool input_g, bool weight_g, bool bias_g, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!))", "dispatch": "True", "default": "True"}
+Tensor & batch_norm_backward_elemt_out(const Tensor & grad_out, const Tensor & input, const Tensor & mean, const Tensor & invstd, const c10::optional<Tensor> & weight, const Tensor & sum_dy, const Tensor & sum_dy_xmu, const Tensor & count, Tensor & out); // {"schema": "aten::batch_norm_backward_elemt.out(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, Tensor sum_dy, Tensor sum_dy_xmu, Tensor count, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> batch_norm_update_stats_out(const Tensor & input, const c10::optional<Tensor> & running_mean, const c10::optional<Tensor> & running_var, double momentum, Tensor & out0, Tensor & out1); // {"schema": "aten::batch_norm_update_stats.out(Tensor input, Tensor? running_mean, Tensor? running_var, float momentum, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "True"}
+Tensor & _nnpack_spatial_convolution_out(const Tensor & input, const Tensor & weight, const c10::optional<Tensor> & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, Tensor & out); // {"schema": "aten::_nnpack_spatial_convolution.out(Tensor input, Tensor weight, Tensor? bias, SymInt[2] padding, SymInt[2] stride=1, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & ones_out(IntArrayRef size, c10::optional<DimnameList> names, Tensor & out); // {"schema": "aten::ones.names_out(int[] size, *, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & ones_like_out(const Tensor & self, c10::optional<MemoryFormat> memory_format, Tensor & out); // {"schema": "aten::ones_like.out(Tensor self, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _euclidean_dist_out(const Tensor & x1, const Tensor & x2, Tensor & out); // {"schema": "aten::_euclidean_dist.out(Tensor x1, Tensor x2, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _cdist_forward_out(const Tensor & x1, const Tensor & x2, double p, c10::optional<int64_t> compute_mode, Tensor & out); // {"schema": "aten::_cdist_forward.out(Tensor x1, Tensor x2, float p, int? compute_mode, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _cdist_backward_out(const Tensor & grad, const Tensor & x1, const Tensor & x2, double p, const Tensor & cdist, Tensor & out); // {"schema": "aten::_cdist_backward.out(Tensor grad, Tensor x1, Tensor x2, float p, Tensor cdist, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _pdist_forward_out(const Tensor & self, double p, Tensor & out); // {"schema": "aten::_pdist_forward.out(Tensor self, float p=2, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _pdist_backward_out(const Tensor & grad, const Tensor & self, double p, const Tensor & pdist, Tensor & out); // {"schema": "aten::_pdist_backward.out(Tensor grad, Tensor self, float p, Tensor pdist, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & pixel_shuffle_out(const Tensor & self, int64_t upscale_factor, Tensor & out); // {"schema": "aten::pixel_shuffle.out(Tensor self, int upscale_factor, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & pixel_unshuffle_out(const Tensor & self, int64_t downscale_factor, Tensor & out); // {"schema": "aten::pixel_unshuffle.out(Tensor self, int downscale_factor, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & channel_shuffle_out(const Tensor & self, c10::SymInt groups, Tensor & out); // {"schema": "aten::channel_shuffle.out(Tensor self, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _pin_memory_out(const Tensor & self, c10::optional<Device> device, Tensor & out); // {"schema": "aten::_pin_memory.out(Tensor self, Device? device=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & scalar_tensor_out(const Scalar & s, Tensor & out); // {"schema": "aten::scalar_tensor.out(Scalar s, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & rand_out(c10::SymIntArrayRef size, c10::optional<DimnameList> names, Tensor & out); // {"schema": "aten::rand.names_out(SymInt[] size, *, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & rand_out(c10::SymIntArrayRef size, c10::optional<Generator> generator, c10::optional<DimnameList> names, Tensor & out); // {"schema": "aten::rand.generator_with_names_out(SymInt[] size, *, Generator? generator, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & rand_like_out(const Tensor & self, c10::optional<MemoryFormat> memory_format, Tensor & out); // {"schema": "aten::rand_like.out(Tensor self, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & randint_like_out(const Tensor & self, c10::SymInt high, c10::optional<MemoryFormat> memory_format, Tensor & out); // {"schema": "aten::randint_like.out(Tensor self, SymInt high, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & randint_like_out(const Tensor & self, c10::SymInt low, c10::SymInt high, c10::optional<MemoryFormat> memory_format, Tensor & out); // {"schema": "aten::randint_like.low_dtype_out(Tensor self, SymInt low, SymInt high, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & randn_out(c10::SymIntArrayRef size, c10::optional<DimnameList> names, Tensor & out); // {"schema": "aten::randn.names_out(SymInt[] size, *, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & randn_out(c10::SymIntArrayRef size, c10::optional<Generator> generator, c10::optional<DimnameList> names, Tensor & out); // {"schema": "aten::randn.generator_with_names_out(SymInt[] size, *, Generator? generator, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & randn_like_out(const Tensor & self, c10::optional<MemoryFormat> memory_format, Tensor & out); // {"schema": "aten::randn_like.out(Tensor self, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & repeat_out(const Tensor & self, c10::SymIntArrayRef repeats, Tensor & out); // {"schema": "aten::repeat.out(Tensor self, SymInt[] repeats, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & repeat_interleave_out(const Tensor & repeats, c10::optional<c10::SymInt> output_size, Tensor & out); // {"schema": "aten::repeat_interleave.Tensor_out(Tensor repeats, *, SymInt? output_size=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _mkldnn_reshape_out(const Tensor & self, IntArrayRef shape, Tensor & out); // {"schema": "aten::_mkldnn_reshape.out(Tensor self, int[] shape, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & relu_out(const Tensor & self, Tensor & out); // {"schema": "aten::relu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & select_backward_out(const Tensor & grad_output, c10::SymIntArrayRef input_sizes, int64_t dim, c10::SymInt index, Tensor & out); // {"schema": "aten::select_backward.out(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt index, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & celu_out(const Tensor & self, const Scalar & alpha, Tensor & out); // {"schema": "aten::celu.out(Tensor self, Scalar alpha=1.0, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & slice_backward_out(const Tensor & grad_output, c10::SymIntArrayRef input_sizes, int64_t dim, c10::SymInt start, c10::SymInt end, c10::SymInt step, Tensor & out); // {"schema": "aten::slice_backward.out(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt start, SymInt end, SymInt step, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & slice_scatter_out(const Tensor & self, const Tensor & src, int64_t dim, c10::optional<c10::SymInt> start, c10::optional<c10::SymInt> end, c10::SymInt step, Tensor & out); // {"schema": "aten::slice_scatter.out(Tensor self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & select_scatter_out(const Tensor & self, const Tensor & src, int64_t dim, c10::SymInt index, Tensor & out); // {"schema": "aten::select_scatter.out(Tensor self, Tensor src, int dim, SymInt index, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & diagonal_scatter_out(const Tensor & self, const Tensor & src, int64_t offset, int64_t dim1, int64_t dim2, Tensor & out); // {"schema": "aten::diagonal_scatter.out(Tensor self, Tensor src, int offset=0, int dim1=0, int dim2=1, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & as_strided_scatter_out(const Tensor & self, const Tensor & src, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional<c10::SymInt> storage_offset, Tensor & out); // {"schema": "aten::as_strided_scatter.out(Tensor self, Tensor src, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+void unsafe_split_out(const Tensor & self, c10::SymInt split_size, int64_t dim, TensorList out); // {"schema": "aten::unsafe_split.Tensor_out(Tensor self, SymInt split_size, int dim=0, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void unsafe_split_with_sizes_out(const Tensor & self, c10::SymIntArrayRef split_sizes, int64_t dim, TensorList out); // {"schema": "aten::unsafe_split_with_sizes.out(Tensor self, SymInt[] split_sizes, int dim=0, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+Tensor & sum_out(const Tensor & self, c10::optional<ScalarType> dtype, Tensor & out); // {"schema": "aten::sum.out(Tensor self, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> std_mean_out(const Tensor & self, OptionalIntArrayRef dim, const c10::optional<Scalar> & correction, bool keepdim, Tensor & out0, Tensor & out1); // {"schema": "aten::std_mean.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "True"}
+Tensor & prod_out(const Tensor & self, c10::optional<ScalarType> dtype, Tensor & out); // {"schema": "aten::prod.out(Tensor self, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _mkldnn_transpose_out(const Tensor & self, int64_t dim0, int64_t dim1, Tensor & out); // {"schema": "aten::_mkldnn_transpose.out(Tensor self, int dim0, int dim1, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & flip_out(const Tensor & self, IntArrayRef dims, Tensor & out); // {"schema": "aten::flip.out(Tensor self, int[] dims, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & roll_out(const Tensor & self, c10::SymIntArrayRef shifts, IntArrayRef dims, Tensor & out); // {"schema": "aten::roll.out(Tensor self, SymInt[1] shifts, int[1] dims=[], *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & rot90_out(const Tensor & self, int64_t k, IntArrayRef dims, Tensor & out); // {"schema": "aten::rot90.out(Tensor self, int k=1, int[] dims=[0,1], *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &,Tensor &> _transform_bias_rescale_qkv_out(const Tensor & qkv, const Tensor & qkv_bias, int64_t num_heads, Tensor & out0, Tensor & out1, Tensor & out2); // {"schema": "aten::_transform_bias_rescale_qkv.out(Tensor qkv, Tensor qkv_bias, int num_heads, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))", "dispatch": "True", "default": "True"}
+Tensor & _nested_tensor_from_mask_out(const Tensor & t, const Tensor & mask, bool mask_check, Tensor & out); // {"schema": "aten::_nested_tensor_from_mask.out(Tensor t, Tensor mask, bool mask_check=True, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _nested_from_padded_out(const Tensor & padded, const Tensor & cpu_nested_shape_example, bool fuse_transform_0213, Tensor & out); // {"schema": "aten::_nested_from_padded.out(Tensor padded, Tensor cpu_nested_shape_example, bool fuse_transform_0213=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _nested_tensor_size_out(const Tensor & self, Tensor & out); // {"schema": "aten::_nested_tensor_size.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _nested_tensor_strides_out(const Tensor & self, Tensor & out); // {"schema": "aten::_nested_tensor_strides.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _nested_tensor_storage_offsets_out(const Tensor & self, Tensor & out); // {"schema": "aten::_nested_tensor_storage_offsets.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _nested_from_padded_and_nested_example_out(const Tensor & padded, const Tensor & nt_example, Tensor & out); // {"schema": "aten::_nested_from_padded_and_nested_example.out(Tensor padded, Tensor nt_example, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _nested_view_from_buffer_copy_out(const Tensor & self, const Tensor & nested_size, const Tensor & nested_strides, const Tensor & offsets, Tensor & out); // {"schema": "aten::_nested_view_from_buffer_copy.out(Tensor self, Tensor nested_size, Tensor nested_strides, Tensor offsets, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _nested_view_from_jagged_copy_out(const Tensor & self, const Tensor & offsets, const Tensor & dummy, const c10::optional<Tensor> & lengths, int64_t ragged_idx, Tensor & out); // {"schema": "aten::_nested_view_from_jagged_copy.out(Tensor self, Tensor offsets, Tensor dummy, Tensor? lengths=None, int ragged_idx=1, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _nested_get_values_copy_out(const Tensor & self, Tensor & out); // {"schema": "aten::_nested_get_values_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _trilinear_out(const Tensor & i1, const Tensor & i2, const Tensor & i3, IntArrayRef expand1, IntArrayRef expand2, IntArrayRef expand3, IntArrayRef sumdim, int64_t unroll_dim, Tensor & out); // {"schema": "aten::_trilinear.out(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
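+// Hedged usage sketch (not part of the generated file): the multi-output "*_out" variants
+// declared below, such as _unique_out, bind one caller-allocated tensor per result. Through
+// the ATen C++ frontend the out tensors lead the argument list, e.g.:
+//   at::Tensor self = at::randint(0, 5, {10});
+//   at::Tensor values = at::empty({0}, self.options());
+//   at::Tensor inverse = at::empty({0}, self.options().dtype(at::kLong));
+//   at::_unique_out(values, inverse, self, /*sorted=*/true, /*return_inverse=*/true);
+// The trailing out0/out1 parameters shown in these declarations reflect the internal
+// dispatcher ordering, which the frontend exposes via the generated "_outf" overloads.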
+::std::tuple<Tensor &,Tensor &> _unique_out(const Tensor & self, bool sorted, bool return_inverse, Tensor & out0, Tensor & out1); // {"schema": "aten::_unique.out(Tensor self, bool sorted=True, bool return_inverse=False, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &,Tensor &> unique_dim_out(const Tensor & self, int64_t dim, bool sorted, bool return_inverse, bool return_counts, Tensor & out0, Tensor & out1, Tensor & out2); // {"schema": "aten::unique_dim.out(Tensor self, int dim, bool sorted=True, bool return_inverse=False, bool return_counts=False, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &,Tensor &> unique_consecutive_out(const Tensor & self, bool return_inverse, bool return_counts, c10::optional<int64_t> dim, Tensor & out0, Tensor & out1, Tensor & out2); // {"schema": "aten::unique_consecutive.out(Tensor self, bool return_inverse=False, bool return_counts=False, int? dim=None, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &,Tensor &> unique_dim_consecutive_out(const Tensor & self, int64_t dim, bool return_inverse, bool return_counts, Tensor & out0, Tensor & out1, Tensor & out2); // {"schema": "aten::unique_dim_consecutive.out(Tensor self, int dim, bool return_inverse=False, bool return_counts=False, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &,Tensor &> _unique2_out(const Tensor & self, bool sorted, bool return_inverse, bool return_counts, Tensor & out0, Tensor & out1, Tensor & out2); // {"schema": "aten::_unique2.out(Tensor self, bool sorted=True, bool return_inverse=False, bool return_counts=False, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))", "dispatch": "True", "default": "True"}
+Tensor & _unsafe_view_out(const Tensor & self, c10::SymIntArrayRef size, Tensor & out); // {"schema": "aten::_unsafe_view.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> var_mean_out(const Tensor & self, OptionalIntArrayRef dim, const c10::optional<Scalar> & correction, bool keepdim, Tensor & out0, Tensor & out1); // {"schema": "aten::var_mean.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> _weight_norm_interface_out(const Tensor & v, const Tensor & g, int64_t dim, Tensor & out0, Tensor & out1); // {"schema": "aten::_weight_norm_interface.out(Tensor v, Tensor g, int dim=0, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> _weight_norm_interface_backward_out(const Tensor & grad_w, const Tensor & saved_v, const Tensor & saved_g, const Tensor & saved_norms, int64_t dim, Tensor & out0, Tensor & out1); // {"schema": "aten::_weight_norm_interface_backward.out(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "True"}
+Tensor & zeros_out(IntArrayRef size, c10::optional<DimnameList> names, Tensor & out); // {"schema": "aten::zeros.names_out(int[] size, *, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _efficientzerotensor_out(c10::SymIntArrayRef size, Tensor & out); // {"schema": "aten::_efficientzerotensor.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & zeros_like_out(const Tensor & self, c10::optional<MemoryFormat> memory_format, Tensor & out); // {"schema": "aten::zeros_like.out(Tensor self, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _standard_gamma_grad_out(const Tensor & self, const Tensor & output, Tensor & out); // {"schema": "aten::_standard_gamma_grad.out(Tensor self, Tensor output, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _standard_gamma_out(const Tensor & self, c10::optional<Generator> generator, Tensor & out); // {"schema": "aten::_standard_gamma.out(Tensor self, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _dirichlet_grad_out(const Tensor & x, const Tensor & alpha, const Tensor & total, Tensor & out); // {"schema": "aten::_dirichlet_grad.out(Tensor x, Tensor alpha, Tensor total, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _sample_dirichlet_out(const Tensor & self, c10::optional<Generator> generator, Tensor & out); // {"schema": "aten::_sample_dirichlet.out(Tensor self, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & poisson_out(const Tensor & self, c10::optional<Generator> generator, Tensor & out); // {"schema": "aten::poisson.out(Tensor self, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & binomial_out(const Tensor & count, const Tensor & prob, c10::optional<Generator> generator, Tensor & out); // {"schema": "aten::binomial.out(Tensor count, Tensor prob, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & native_norm_out(const Tensor & self, const Scalar & p, Tensor & out); // {"schema": "aten::native_norm.out(Tensor self, Scalar p=2, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & native_norm_out(const Tensor & self, const c10::optional<Scalar> & p, IntArrayRef dim, bool keepdim, c10::optional<ScalarType> dtype, Tensor & out); // {"schema": "aten::native_norm.ScalarOpt_dim_dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, ScalarType? dtype, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _sparse_sum_out(const Tensor & self, IntArrayRef dim, Tensor & out); // {"schema": "aten::_sparse_sum.dim_out(Tensor self, int[1] dim, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _sparse_sum_backward_out(const Tensor & grad, const Tensor & self, IntArrayRef dim, Tensor & out); // {"schema": "aten::_sparse_sum_backward.out(Tensor grad, Tensor self, int[] dim, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _sparse_csr_sum_out(const Tensor & self, IntArrayRef dim, bool keepdim, c10::optional<ScalarType> dtype, Tensor & out); // {"schema": "aten::_sparse_csr_sum.dim_dtype_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _sparse_csr_prod_out(const Tensor & self, IntArrayRef dim, bool keepdim, c10::optional<ScalarType> dtype, Tensor & out); // {"schema": "aten::_sparse_csr_prod.dim_dtype_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _sparse_softmax_out(const Tensor & self, int64_t dim, bool half_to_float, Tensor & out); // {"schema": "aten::_sparse_softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _sparse_softmax_backward_data_out(const Tensor & grad_output, const Tensor & output, int64_t dim, const Tensor & self, Tensor & out); // {"schema": "aten::_sparse_softmax_backward_data.out(Tensor grad_output, Tensor output, int dim, Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _sparse_log_softmax_out(const Tensor & self, int64_t dim, bool half_to_float, Tensor & out); // {"schema": "aten::_sparse_log_softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _sparse_log_softmax_backward_data_out(const Tensor & grad_output, const Tensor & output, int64_t dim, const Tensor & self, Tensor & out); // {"schema": "aten::_sparse_log_softmax_backward_data.out(Tensor grad_output, Tensor output, int dim, Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _spdiags_out(const Tensor & diagonals, const Tensor & offsets, IntArrayRef shape, c10::optional<Layout> layout, Tensor & out); // {"schema": "aten::_spdiags.out(Tensor diagonals, Tensor offsets, int[] shape, Layout? layout=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & norm_out(const Tensor & self, const c10::optional<Scalar> & p, ScalarType dtype, Tensor & out); // {"schema": "aten::norm.ScalarOpt_dtype_out(Tensor self, Scalar? p, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & norm_out(const Tensor & self, const Scalar & p, Tensor & out); // {"schema": "aten::norm.Scalar_out(Tensor self, Scalar p=2, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & clone_out(const Tensor & self, c10::optional<MemoryFormat> memory_format, Tensor & out); // {"schema": "aten::clone.out(Tensor self, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+const Tensor & resize_as_out(const Tensor & self, const Tensor & the_template, c10::optional<MemoryFormat> memory_format, const Tensor & out); // {"schema": "aten::resize_as.out(Tensor self, Tensor the_template, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor resize_as(const Tensor & self, const Tensor & the_template, c10::optional<MemoryFormat> memory_format); // {"schema": "aten::resize_as(Tensor self, Tensor the_template, *, MemoryFormat? memory_format=None) -> Tensor", "dispatch": "True", "default": "True"}
+const Tensor & resize_as_sparse_out(const Tensor & self, const Tensor & the_template, const Tensor & out); // {"schema": "aten::resize_as_sparse.out(Tensor self, Tensor the_template, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor resize_as_sparse(const Tensor & self, const Tensor & the_template); // {"schema": "aten::resize_as_sparse(Tensor self, Tensor the_template) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & zero_out(const Tensor & self, Tensor & out); // {"schema": "aten::zero.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor zero(const Tensor & self); // {"schema": "aten::zero(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & sub_out(const Tensor & self, const Scalar & other, const Scalar & alpha, Tensor & out); // {"schema": "aten::sub.Scalar_out(Tensor self, Scalar other, Scalar alpha=1, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & rsub_out(const Tensor & self, const Tensor & other, const Scalar & alpha, Tensor & out); // {"schema": "aten::rsub.Tensor_out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & rsub_out(const Tensor & self, const Scalar & other, const Scalar & alpha, Tensor & out); // {"schema": "aten::rsub.Scalar_out(Tensor self, Scalar other, Scalar alpha=1, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _sparse_addmm_out(const Tensor & self, const Tensor & mat1, const Tensor & mat2, const Scalar & beta, const Scalar & alpha, Tensor & out); // {"schema": "aten::_sparse_addmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & sparse_coo_tensor_out(IntArrayRef size, Tensor & out); // {"schema": "aten::sparse_coo_tensor.size_out(int[] size, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _sparse_coo_tensor_with_dims_out(int64_t sparse_dim, int64_t dense_dim, IntArrayRef size, Tensor & out); // {"schema": "aten::_sparse_coo_tensor_with_dims.out(int sparse_dim, int dense_dim, int[] size, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _sparse_coo_tensor_with_dims_and_tensors_out(int64_t sparse_dim, int64_t dense_dim, c10::SymIntArrayRef size, const Tensor & indices, const Tensor & values, c10::optional<bool> is_coalesced, Tensor & out); // {"schema": "aten::_sparse_coo_tensor_with_dims_and_tensors.out(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, bool? is_coalesced=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+const Tensor & sparse_resize_out(const Tensor & self, IntArrayRef size, int64_t sparse_dim, int64_t dense_dim, const Tensor & out); // {"schema": "aten::sparse_resize.out(Tensor self, int[] size, int sparse_dim, int dense_dim, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor sparse_resize(const Tensor & self, IntArrayRef size, int64_t sparse_dim, int64_t dense_dim); // {"schema": "aten::sparse_resize(Tensor self, int[] size, int sparse_dim, int dense_dim) -> Tensor", "dispatch": "True", "default": "True"}
+const Tensor & sparse_resize_and_clear_out(const Tensor & self, IntArrayRef size, int64_t sparse_dim, int64_t dense_dim, const Tensor & out); // {"schema": "aten::sparse_resize_and_clear.out(Tensor self, int[] size, int sparse_dim, int dense_dim, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor sparse_resize_and_clear(const Tensor & self, IntArrayRef size, int64_t sparse_dim, int64_t dense_dim); // {"schema": "aten::sparse_resize_and_clear(Tensor self, int[] size, int sparse_dim, int dense_dim) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & sparse_mask_out(const Tensor & self, const Tensor & mask, Tensor & out); // {"schema": "aten::sparse_mask.out(Tensor self, Tensor mask, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _sparse_mask_projection_out(const Tensor & self, const Tensor & mask, bool accumulate_matches, Tensor & out); // {"schema": "aten::_sparse_mask_projection.out(Tensor self, Tensor mask, bool accumulate_matches=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _to_dense_out(const Tensor & self, c10::optional<ScalarType> dtype, c10::optional<bool> masked_grad, Tensor & out); // {"schema": "aten::_to_dense.out(Tensor self, ScalarType? dtype=None, bool? masked_grad=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _coalesce_out(const Tensor & self, Tensor & out); // {"schema": "aten::_coalesce.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _coalesced_out(const Tensor & self, bool coalesced, Tensor & out); // {"schema": "aten::_coalesced.out(Tensor self, bool coalesced, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor _coalesced(const Tensor & self, bool coalesced); // {"schema": "aten::_coalesced(Tensor self, bool coalesced) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & copy_sparse_to_sparse_out(const Tensor & self, const Tensor & src, bool non_blocking, Tensor & out); // {"schema": "aten::copy_sparse_to_sparse.out(Tensor self, Tensor src, bool non_blocking=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor copy_sparse_to_sparse(const Tensor & self, const Tensor & src, bool non_blocking); // {"schema": "aten::copy_sparse_to_sparse(Tensor self, Tensor src, bool non_blocking=False) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & _to_sparse_out(const Tensor & self, int64_t sparse_dim, Tensor & out); // {"schema": "aten::_to_sparse.sparse_dim_out(Tensor self, int sparse_dim, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _to_sparse_out(const Tensor & self, c10::optional<Layout> layout, OptionalIntArrayRef blocksize, c10::optional<int64_t> dense_dim, Tensor & out); // {"schema": "aten::_to_sparse.out(Tensor self, *, Layout? layout=None, int[2]? blocksize=None, int? dense_dim=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _to_sparse_csr_out(const Tensor & self, c10::optional<int64_t> dense_dim, Tensor & out); // {"schema": "aten::_to_sparse_csr.out(Tensor self, int? dense_dim=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _to_sparse_csc_out(const Tensor & self, c10::optional<int64_t> dense_dim, Tensor & out); // {"schema": "aten::_to_sparse_csc.out(Tensor self, int? dense_dim=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _to_sparse_bsr_out(const Tensor & self, IntArrayRef blocksize, c10::optional<int64_t> dense_dim, Tensor & out); // {"schema": "aten::_to_sparse_bsr.out(Tensor self, int[2] blocksize, int? dense_dim=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _to_sparse_bsc_out(const Tensor & self, IntArrayRef blocksize, c10::optional<int64_t> dense_dim, Tensor & out); // {"schema": "aten::_to_sparse_bsc.out(Tensor self, int[2] blocksize, int? dense_dim=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & to_mkldnn_out(const Tensor & self, c10::optional<ScalarType> dtype, Tensor & out); // {"schema": "aten::to_mkldnn.out(Tensor self, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & mkldnn_reorder_conv2d_weight_out(const Tensor & self, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, OptionalSymIntArrayRef input_size, Tensor & out); // {"schema": "aten::mkldnn_reorder_conv2d_weight.out(Tensor self, SymInt[2] padding=0, SymInt[2] stride=1, SymInt[2] dilation=1, SymInt groups=1, SymInt[]? input_size=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & mkldnn_reorder_conv3d_weight_out(const Tensor & self, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, Tensor & out); // {"schema": "aten::mkldnn_reorder_conv3d_weight.out(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & quantize_per_tensor_dynamic_out(const Tensor & self, ScalarType dtype, bool reduce_range, Tensor & out); // {"schema": "aten::quantize_per_tensor_dynamic.out(Tensor self, ScalarType dtype, bool reduce_range, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & quantize_per_tensor_out(const Tensor & self, double scale, int64_t zero_point, ScalarType dtype, Tensor & out); // {"schema": "aten::quantize_per_tensor.out(Tensor self, float scale, int zero_point, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & quantize_per_tensor_out(const Tensor & self, const Tensor & scale, const Tensor & zero_point, ScalarType dtype, Tensor & out); // {"schema": "aten::quantize_per_tensor.tensor_qparams_out(Tensor self, Tensor scale, Tensor zero_point, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+void quantize_per_tensor_out(TensorList tensors, const Tensor & scales, const Tensor & zero_points, ScalarType dtype, TensorList out); // {"schema": "aten::quantize_per_tensor.tensors_out(Tensor[] tensors, Tensor scales, Tensor zero_points, ScalarType dtype, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+Tensor & quantize_per_channel_out(const Tensor & self, const Tensor & scales, const Tensor & zero_points, int64_t axis, ScalarType dtype, Tensor & out); // {"schema": "aten::quantize_per_channel.out(Tensor self, Tensor scales, Tensor zero_points, int axis, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & dequantize_out(const Tensor & self, Tensor & out); // {"schema": "aten::dequantize.self_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+void dequantize_out(TensorList tensors, TensorList out); // {"schema": "aten::dequantize.tensors_out(Tensor[] tensors, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+Tensor & q_per_channel_scales_out(const Tensor & self, Tensor & out); // {"schema": "aten::q_per_channel_scales.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & q_per_channel_zero_points_out(const Tensor & self, Tensor & out); // {"schema": "aten::q_per_channel_zero_points.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & int_repr_out(const Tensor & self, Tensor & out); // {"schema": "aten::int_repr.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _make_per_tensor_quantized_tensor_out(const Tensor & self, double scale, int64_t zero_point, Tensor & out); // {"schema": "aten::_make_per_tensor_quantized_tensor.out(Tensor self, float scale, int zero_point, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _make_per_channel_quantized_tensor_out(const Tensor & self, const Tensor & scale, const Tensor & zero_point, int64_t axis, Tensor & out); // {"schema": "aten::_make_per_channel_quantized_tensor.out(Tensor self, Tensor scale, Tensor zero_point, int axis, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> fake_quantize_per_tensor_affine_cachemask_out(const Tensor & self, double scale, int64_t zero_point, int64_t quant_min, int64_t quant_max, Tensor & out0, Tensor & out1); // {"schema": "aten::fake_quantize_per_tensor_affine_cachemask.out(Tensor self, float scale, int zero_point, int quant_min, int quant_max, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> _fake_quantize_per_tensor_affine_cachemask_tensor_qparams_out(const Tensor & self, const Tensor & scale, const Tensor & zero_point, const Tensor & fake_quant_enabled, int64_t quant_min, int64_t quant_max, Tensor & out0, Tensor & out1); // {"schema": "aten::_fake_quantize_per_tensor_affine_cachemask_tensor_qparams.out(Tensor self, Tensor scale, Tensor zero_point, Tensor fake_quant_enabled, int quant_min, int quant_max, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "True"}
+Tensor & _fake_quantize_learnable_per_tensor_affine_out(const Tensor & self, const Tensor & scale, const Tensor & zero_point, int64_t quant_min, int64_t quant_max, double grad_factor, Tensor & out); // {"schema": "aten::_fake_quantize_learnable_per_tensor_affine.out(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max, float grad_factor=1.0, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> fake_quantize_per_channel_affine_cachemask_out(const Tensor & self, const Tensor & scale, const Tensor & zero_point, int64_t axis, int64_t quant_min, int64_t quant_max, Tensor & out0, Tensor & out1); // {"schema": "aten::fake_quantize_per_channel_affine_cachemask.out(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "True"}
+Tensor & _fake_quantize_learnable_per_channel_affine_out(const Tensor & self, const Tensor & scale, const Tensor & zero_point, int64_t axis, int64_t quant_min, int64_t quant_max, double grad_factor, Tensor & out); // {"schema": "aten::_fake_quantize_learnable_per_channel_affine.out(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max, float grad_factor=1.0, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> _fused_moving_avg_obs_fq_helper_out(const Tensor & self, const Tensor & observer_on, const Tensor & fake_quant_on, Tensor & running_min, Tensor & running_max, Tensor & scale, Tensor & zero_point, double averaging_const, int64_t quant_min, int64_t quant_max, int64_t ch_axis, bool per_row_fake_quant, bool symmetric_quant, Tensor & out0, Tensor & out1); // {"schema": "aten::_fused_moving_avg_obs_fq_helper.out(Tensor self, Tensor observer_on, Tensor fake_quant_on, Tensor(a!) running_min, Tensor(b!) running_max, Tensor(c!) scale, Tensor(d!) zero_point, float averaging_const, int quant_min, int quant_max, int ch_axis, bool per_row_fake_quant=False, bool symmetric_quant=False, *, Tensor(e!) out0, Tensor(f!) out1) -> (Tensor(e!), Tensor(f!))", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor,Tensor,Tensor,Tensor,Tensor,Tensor> _fused_moving_avg_obs_fq_helper_functional(const Tensor & self, const Tensor & observer_on, const Tensor & fake_quant_on, const Tensor & running_min, const Tensor & running_max, const Tensor & scale, const Tensor & zero_point, double averaging_const, int64_t quant_min, int64_t quant_max, int64_t ch_axis, bool per_row_fake_quant, bool symmetric_quant); // {"schema": "aten::_fused_moving_avg_obs_fq_helper_functional(Tensor self, Tensor observer_on, Tensor fake_quant_on, Tensor running_min, Tensor running_max, Tensor scale, Tensor zero_point, float averaging_const, int quant_min, int quant_max, int ch_axis, bool per_row_fake_quant=False, bool symmetric_quant=False) -> (Tensor output, Tensor mask, Tensor running_min_out, Tensor running_max_out, Tensor scale_out, Tensor zero_point_out)", "dispatch": "True", "default": "True"}
+Tensor & _to_copy_out(const Tensor & self, bool non_blocking, c10::optional<MemoryFormat> memory_format, Tensor & out); // {"schema": "aten::_to_copy.out(Tensor self, *, bool non_blocking=False, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &,Tensor &,Tensor &,Tensor &,Tensor &> _lstm_mps_out(const Tensor & input, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first, Tensor & out0, Tensor & out1, Tensor & out2, Tensor & out3, Tensor & out4, Tensor & out5); // {"schema": "aten::_lstm_mps.out(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3, Tensor(e!) out4, Tensor(f!) out5) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!), Tensor(e!), Tensor(f!))", "dispatch": "True", "default": "True"}
+void lstm_mps_backward_out(const c10::optional<Tensor> & grad_y, const c10::optional<Tensor> & grad_hy, const c10::optional<Tensor> & grad_cy, const Tensor & z_state, const Tensor & cell_state_fwd, const Tensor & input, const Tensor & layersOutputs, TensorList hx, TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first, Tensor & out0, TensorList out1, TensorList out2); // {"schema": "aten::lstm_mps_backward.out(Tensor? grad_y, Tensor? grad_hy, Tensor? grad_cy, Tensor z_state, Tensor cell_state_fwd, Tensor input, Tensor layersOutputs, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, *, Tensor(a!) out0, Tensor(b!)[] out1, Tensor(c!)[] out2) -> ()", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &,Tensor &> _thnn_fused_lstm_cell_out(const Tensor & input_gates, const Tensor & hidden_gates, const Tensor & cx, const c10::optional<Tensor> & input_bias, const c10::optional<Tensor> & hidden_bias, Tensor & out0, Tensor & out1, Tensor & out2); // {"schema": "aten::_thnn_fused_lstm_cell.out(Tensor input_gates, Tensor hidden_gates, Tensor cx, Tensor? input_bias=None, Tensor? hidden_bias=None, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &,Tensor &> _thnn_fused_lstm_cell_backward_impl_out(const c10::optional<Tensor> & grad_hy, const c10::optional<Tensor> & grad_cy, const Tensor & cx, const Tensor & cy, const Tensor & workspace, bool has_bias, Tensor & out0, Tensor & out1, Tensor & out2); // {"schema": "aten::_thnn_fused_lstm_cell_backward_impl.out(Tensor? grad_hy, Tensor? grad_cy, Tensor cx, Tensor cy, Tensor workspace, bool has_bias, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> _thnn_fused_gru_cell_out(const Tensor & input_gates, const Tensor & hidden_gates, const Tensor & hx, const c10::optional<Tensor> & input_bias, const c10::optional<Tensor> & hidden_bias, Tensor & out0, Tensor & out1); // {"schema": "aten::_thnn_fused_gru_cell.out(Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias=None, Tensor? hidden_bias=None, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &,Tensor &,Tensor &,Tensor &> _thnn_fused_gru_cell_backward_out(const Tensor & grad_hy, const Tensor & workspace, bool has_bias, Tensor & out0, Tensor & out1, Tensor & out2, Tensor & out3, Tensor & out4); // {"schema": "aten::_thnn_fused_gru_cell_backward.out(Tensor grad_hy, Tensor workspace, bool has_bias, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3, Tensor(e!) out4) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!), Tensor(e!))", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> _pack_padded_sequence_out(const Tensor & input, const Tensor & lengths, bool batch_first, Tensor & out0, Tensor & out1); // {"schema": "aten::_pack_padded_sequence.out(Tensor input, Tensor lengths, bool batch_first, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "True"}
+Tensor & set_out(const Tensor & self, Storage source, Tensor & out); // {"schema": "aten::set.source_Storage_out(Tensor self, Storage source, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor set(const Tensor & self, Storage source); // {"schema": "aten::set.source_Storage(Tensor self, Storage source) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & set_out(const Tensor & self, Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, Tensor & out); // {"schema": "aten::set.source_Storage_storage_offset_out(Tensor self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[], *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor set(const Tensor & self, Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride); // {"schema": "aten::set.source_Storage_storage_offset(Tensor self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & set_out(const Tensor & self, const Tensor & source, Tensor & out); // {"schema": "aten::set.source_Tensor_out(Tensor self, Tensor source, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor set(const Tensor & self, const Tensor & source); // {"schema": "aten::set.source_Tensor(Tensor self, Tensor source) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & set_out(const Tensor & self, Tensor & out); // {"schema": "aten::set.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor set(const Tensor & self); // {"schema": "aten::set(Tensor self) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & lift_out(const Tensor & self, Tensor & out); // {"schema": "aten::lift.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & lift_fresh_copy_out(const Tensor & self, Tensor & out); // {"schema": "aten::lift_fresh_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & masked_fill_out(const Tensor & self, const Tensor & mask, const Scalar & value, Tensor & out); // {"schema": "aten::masked_fill.Scalar_out(Tensor self, Tensor mask, Scalar value, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & masked_fill_out(const Tensor & self, const Tensor & mask, const Tensor & value, Tensor & out); // {"schema": "aten::masked_fill.Tensor_out(Tensor self, Tensor mask, Tensor value, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & masked_scatter_out(const Tensor & self, const Tensor & mask, const Tensor & source, Tensor & out); // {"schema": "aten::masked_scatter.out(Tensor self, Tensor mask, Tensor source, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _masked_softmax_out(const Tensor & self, const Tensor & mask, c10::optional<int64_t> dim, c10::optional<int64_t> mask_type, Tensor & out); // {"schema": "aten::_masked_softmax.out(Tensor self, Tensor mask, int? dim=None, int? mask_type=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _masked_softmax_backward_out(const Tensor & grad_output, const Tensor & output, const Tensor & mask, c10::optional<int64_t> dim, Tensor & out); // {"schema": "aten::_masked_softmax_backward.out(Tensor grad_output, Tensor output, Tensor mask, int? dim=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & put_out(const Tensor & self, const Tensor & index, const Tensor & source, bool accumulate, Tensor & out); // {"schema": "aten::put.out(Tensor self, Tensor index, Tensor source, bool accumulate=False, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & index_fill_out(const Tensor & self, int64_t dim, const Tensor & index, const Scalar & value, Tensor & out); // {"schema": "aten::index_fill.int_Scalar_out(Tensor self, int dim, Tensor index, Scalar value, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & index_fill_out(const Tensor & self, int64_t dim, const Tensor & index, const Tensor & value, Tensor & out); // {"schema": "aten::index_fill.int_Tensor_out(Tensor self, int dim, Tensor index, Tensor value, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & bitwise_and_out(const Scalar & self, const Tensor & other, Tensor & out); // {"schema": "aten::bitwise_and.Scalar_Tensor_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & bitwise_or_out(const Scalar & self, const Tensor & other, Tensor & out); // {"schema": "aten::bitwise_or.Scalar_Tensor_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & bitwise_xor_out(const Scalar & self, const Tensor & other, Tensor & out); // {"schema": "aten::bitwise_xor.Scalar_Tensor_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & __lshift___out(const Tensor & self, const Scalar & other, Tensor & out); // {"schema": "aten::__lshift__.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & __lshift___out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::__lshift__.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & bitwise_left_shift_out(const Scalar & self, const Tensor & other, Tensor & out); // {"schema": "aten::bitwise_left_shift.Scalar_Tensor_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & __rshift___out(const Tensor & self, const Scalar & other, Tensor & out); // {"schema": "aten::__rshift__.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & __rshift___out(const Tensor & self, const Tensor & other, Tensor & out); // {"schema": "aten::__rshift__.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & bitwise_right_shift_out(const Scalar & self, const Tensor & other, Tensor & out); // {"schema": "aten::bitwise_right_shift.Scalar_Tensor_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & random_out(const Tensor & self, int64_t from, c10::optional<int64_t> to, c10::optional<Generator> generator, Tensor & out); // {"schema": "aten::random.from_out(Tensor self, int from, int? to, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor random(const Tensor & self, int64_t from, c10::optional<int64_t> to, c10::optional<Generator> generator); // {"schema": "aten::random.from(Tensor self, int from, int? to, *, Generator? generator=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & random_out(const Tensor & self, int64_t to, c10::optional<Generator> generator, Tensor & out); // {"schema": "aten::random.to_out(Tensor self, int to, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor random(const Tensor & self, int64_t to, c10::optional<Generator> generator); // {"schema": "aten::random.to(Tensor self, int to, *, Generator? generator=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & random_out(const Tensor & self, c10::optional<Generator> generator, Tensor & out); // {"schema": "aten::random.out(Tensor self, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor random(const Tensor & self, c10::optional<Generator> generator); // {"schema": "aten::random(Tensor self, *, Generator? generator=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & uniform_out(const Tensor & self, double from, double to, c10::optional<Generator> generator, Tensor & out); // {"schema": "aten::uniform.out(Tensor self, float from=0, float to=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor uniform(const Tensor & self, double from, double to, c10::optional<Generator> generator); // {"schema": "aten::uniform(Tensor self, float from=0, float to=1, *, Generator? generator=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & cauchy_out(const Tensor & self, double median, double sigma, c10::optional<Generator> generator, Tensor & out); // {"schema": "aten::cauchy.out(Tensor self, float median=0, float sigma=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor cauchy(const Tensor & self, double median, double sigma, c10::optional<Generator> generator); // {"schema": "aten::cauchy(Tensor self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & log_normal_out(const Tensor & self, double mean, double std, c10::optional<Generator> generator, Tensor & out); // {"schema": "aten::log_normal.out(Tensor self, float mean=1, float std=2, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor log_normal(const Tensor & self, double mean, double std, c10::optional<Generator> generator); // {"schema": "aten::log_normal(Tensor self, float mean=1, float std=2, *, Generator? generator=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & exponential_out(const Tensor & self, double lambd, c10::optional<Generator> generator, Tensor & out); // {"schema": "aten::exponential.out(Tensor self, float lambd=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor exponential(const Tensor & self, double lambd, c10::optional<Generator> generator); // {"schema": "aten::exponential(Tensor self, float lambd=1, *, Generator? generator=None) -> Tensor", "dispatch": "True", "default": "True"}
+Tensor & geometric_out(const Tensor & self, double p, c10::optional<Generator> generator, Tensor & out); // {"schema": "aten::geometric.out(Tensor self, float p, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor geometric(const Tensor & self, double p, c10::optional<Generator> generator); // {"schema": "aten::geometric(Tensor self, float p, *, Generator? generator=None) -> Tensor", "dispatch": "True", "default": "True"}
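+// NOTE (editorial sketch, not part of the upstream generated file): the random/uniform/cauchy/
+// log_normal/exponential/geometric declarations above are the out-of-place counterparts of the
+// in-place RNG methods (Tensor::uniform_, Tensor::exponential_, ...). A kernel registered against
+// the uniform.out signature above would be invoked roughly like this (names are illustrative):
+//
+//   Tensor src = at::zeros({4});
+//   Tensor dst = at::empty_like(src);
+//   // fill dst with samples from U(0, 1); the generator may be c10::nullopt for the default RNG
+//   uniform_out(src, /*from=*/0.0, /*to=*/1.0, c10::nullopt, dst);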
+Tensor & tril_indices_out(int64_t row, int64_t col, int64_t offset, Tensor & out); // {"schema": "aten::tril_indices.out(int row, int col, int offset=0, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & triu_indices_out(int64_t row, int64_t col, int64_t offset, Tensor & out); // {"schema": "aten::triu_indices.out(int row, int col, int offset=0, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & trace_out(const Tensor & self, Tensor & out); // {"schema": "aten::trace.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _cholesky_solve_helper_out(const Tensor & self, const Tensor & A, bool upper, Tensor & out); // {"schema": "aten::_cholesky_solve_helper.out(Tensor self, Tensor A, bool upper, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & dist_out(const Tensor & self, const Tensor & other, const Scalar & p, Tensor & out); // {"schema": "aten::dist.out(Tensor self, Tensor other, Scalar p=2, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+void _histogramdd_bin_edges_out(const Tensor & self, IntArrayRef bins, c10::optional<ArrayRef<double>> range, const c10::optional<Tensor> & weight, bool density, TensorList out); // {"schema": "aten::_histogramdd_bin_edges.out(Tensor self, int[] bins, *, float[]? range=None, Tensor? weight=None, bool density=False, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+Tensor & _histogramdd_from_bin_cts_out(const Tensor & self, IntArrayRef bins, c10::optional<ArrayRef<double>> range, const c10::optional<Tensor> & weight, bool density, Tensor & out); // {"schema": "aten::_histogramdd_from_bin_cts.out(Tensor self, int[] bins, *, float[]? range=None, Tensor? weight=None, bool density=False, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _histogramdd_from_bin_tensors_out(const Tensor & self, TensorList bins, const c10::optional<Tensor> & weight, bool density, Tensor & out); // {"schema": "aten::_histogramdd_from_bin_tensors.out(Tensor self, Tensor[] bins, *, Tensor? weight=None, bool density=False, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & remainder_out(const Scalar & self, const Tensor & other, Tensor & out); // {"schema": "aten::remainder.Scalar_Tensor_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & argsort_out(const Tensor & self, bool stable, int64_t dim, bool descending, Tensor & out); // {"schema": "aten::argsort.stable_out(Tensor self, *, bool stable, int dim=-1, bool descending=False, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & unfold_backward_out(const Tensor & grad_in, c10::SymIntArrayRef input_sizes, int64_t dim, int64_t size, int64_t step, Tensor & out); // {"schema": "aten::unfold_backward.out(Tensor grad_in, SymInt[] input_sizes, int dim, int size, int step, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & normal_out(const Tensor & self, double mean, double std, c10::optional<Generator> generator, Tensor & out); // {"schema": "aten::normal.out(Tensor self, float mean=0, float std=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+void _amp_foreach_non_finite_check_and_unscale_out(TensorList self, Tensor & found_inf, const Tensor & inv_scale, TensorList out); // {"schema": "aten::_amp_foreach_non_finite_check_and_unscale.out(Tensor[] self, Tensor(b!) found_inf, Tensor inv_scale, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+::std::tuple<::std::vector<Tensor>,Tensor> _amp_foreach_non_finite_check_and_unscale(TensorList self, const Tensor & found_inf, const Tensor & inv_scale); // {"schema": "aten::_amp_foreach_non_finite_check_and_unscale(Tensor[] self, Tensor found_inf, Tensor inv_scale) -> (Tensor[] self_out, Tensor found_inf_out)", "dispatch": "True", "default": "True"}
+Tensor & _amp_update_scale_out(const Tensor & self, Tensor & growth_tracker, const Tensor & found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval, Tensor & out); // {"schema": "aten::_amp_update_scale.out(Tensor self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor,Tensor> _amp_update_scale(const Tensor & self, const Tensor & growth_tracker, const Tensor & found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval); // {"schema": "aten::_amp_update_scale(Tensor self, Tensor growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> (Tensor, Tensor growth_tracker_out)", "dispatch": "True", "default": "True"}
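+// NOTE (editorial): _amp_foreach_non_finite_check_and_unscale and _amp_update_scale back
+// torch.cuda.amp's GradScaler: the first unscales a whole TensorList of gradients in one fused
+// pass and sets found_inf if any inf/NaN is seen, the second grows or backs off the loss scale
+// based on that flag. The .out / functional pairs above are the non-mutating forms used by
+// functionalization; only the aliasing differs, not the math.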
+void _foreach_add_out(TensorList self, const Scalar & scalar, TensorList out); // {"schema": "aten::_foreach_add.Scalar_out(Tensor[] self, Scalar scalar, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_add_out(TensorList self, TensorList other, const Scalar & alpha, TensorList out); // {"schema": "aten::_foreach_add.List_out(Tensor[] self, Tensor[] other, *, Scalar alpha=1, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_add_out(TensorList self, ArrayRef<Scalar> scalars, TensorList out); // {"schema": "aten::_foreach_add.ScalarList_out(Tensor[] self, Scalar[] scalars, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_add_out(TensorList self, const Tensor & other, const Scalar & alpha, TensorList out); // {"schema": "aten::_foreach_add.Tensor_out(Tensor[] self, Tensor other, *, Scalar alpha=1, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_sub_out(TensorList self, const Scalar & scalar, TensorList out); // {"schema": "aten::_foreach_sub.Scalar_out(Tensor[] self, Scalar scalar, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_sub_out(TensorList self, TensorList other, const Scalar & alpha, TensorList out); // {"schema": "aten::_foreach_sub.List_out(Tensor[] self, Tensor[] other, *, Scalar alpha=1, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_sub_out(TensorList self, ArrayRef<Scalar> scalars, TensorList out); // {"schema": "aten::_foreach_sub.ScalarList_out(Tensor[] self, Scalar[] scalars, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_mul_out(TensorList self, const Scalar & scalar, TensorList out); // {"schema": "aten::_foreach_mul.Scalar_out(Tensor[] self, Scalar scalar, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_mul_out(TensorList self, TensorList other, TensorList out); // {"schema": "aten::_foreach_mul.List_out(Tensor[] self, Tensor[] other, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_mul_out(TensorList self, ArrayRef<Scalar> scalars, TensorList out); // {"schema": "aten::_foreach_mul.ScalarList_out(Tensor[] self, Scalar[] scalars, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_mul_out(TensorList self, const Tensor & other, TensorList out); // {"schema": "aten::_foreach_mul.Tensor_out(Tensor[] self, Tensor other, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_div_out(TensorList self, const Scalar & scalar, TensorList out); // {"schema": "aten::_foreach_div.Scalar_out(Tensor[] self, Scalar scalar, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_div_out(TensorList self, TensorList other, TensorList out); // {"schema": "aten::_foreach_div.List_out(Tensor[] self, Tensor[] other, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_div_out(TensorList self, ArrayRef<Scalar> scalars, TensorList out); // {"schema": "aten::_foreach_div.ScalarList_out(Tensor[] self, Scalar[] scalars, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_div_out(TensorList self, const Tensor & other, TensorList out); // {"schema": "aten::_foreach_div.Tensor_out(Tensor[] self, Tensor other, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_clamp_max_out(TensorList self, const Scalar & scalar, TensorList out); // {"schema": "aten::_foreach_clamp_max.Scalar_out(Tensor[] self, Scalar scalar, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_clamp_max_out(TensorList self, TensorList other, TensorList out); // {"schema": "aten::_foreach_clamp_max.List_out(Tensor[] self, Tensor[] other, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_clamp_max_out(TensorList self, ArrayRef<Scalar> scalars, TensorList out); // {"schema": "aten::_foreach_clamp_max.ScalarList_out(Tensor[] self, Scalar[] scalars, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_clamp_min_out(TensorList self, const Scalar & scalar, TensorList out); // {"schema": "aten::_foreach_clamp_min.Scalar_out(Tensor[] self, Scalar scalar, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_clamp_min_out(TensorList self, TensorList other, TensorList out); // {"schema": "aten::_foreach_clamp_min.List_out(Tensor[] self, Tensor[] other, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_clamp_min_out(TensorList self, ArrayRef<Scalar> scalars, TensorList out); // {"schema": "aten::_foreach_clamp_min.ScalarList_out(Tensor[] self, Scalar[] scalars, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_maximum_out(TensorList self, const Scalar & scalar, TensorList out); // {"schema": "aten::_foreach_maximum.Scalar_out(Tensor[] self, Scalar scalar, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_maximum_out(TensorList self, TensorList other, TensorList out); // {"schema": "aten::_foreach_maximum.List_out(Tensor[] self, Tensor[] other, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_maximum_out(TensorList self, ArrayRef<Scalar> scalars, TensorList out); // {"schema": "aten::_foreach_maximum.ScalarList_out(Tensor[] self, Scalar[] scalars, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_minimum_out(TensorList self, const Scalar & scalar, TensorList out); // {"schema": "aten::_foreach_minimum.Scalar_out(Tensor[] self, Scalar scalar, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_minimum_out(TensorList self, TensorList other, TensorList out); // {"schema": "aten::_foreach_minimum.List_out(Tensor[] self, Tensor[] other, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_minimum_out(TensorList self, ArrayRef<Scalar> scalars, TensorList out); // {"schema": "aten::_foreach_minimum.ScalarList_out(Tensor[] self, Scalar[] scalars, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_addcdiv_out(TensorList self, TensorList tensor1, TensorList tensor2, const Scalar & value, TensorList out); // {"schema": "aten::_foreach_addcdiv.Scalar_out(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_addcdiv_out(TensorList self, TensorList tensor1, TensorList tensor2, ArrayRef<Scalar> scalars, TensorList out); // {"schema": "aten::_foreach_addcdiv.ScalarList_out(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_addcdiv_out(TensorList self, TensorList tensor1, TensorList tensor2, const Tensor & scalars, TensorList out); // {"schema": "aten::_foreach_addcdiv.Tensor_out(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_addcmul_out(TensorList self, TensorList tensor1, TensorList tensor2, const Scalar & value, TensorList out); // {"schema": "aten::_foreach_addcmul.Scalar_out(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_addcmul_out(TensorList self, TensorList tensor1, TensorList tensor2, ArrayRef<Scalar> scalars, TensorList out); // {"schema": "aten::_foreach_addcmul.ScalarList_out(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar[] scalars, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_addcmul_out(TensorList self, TensorList tensor1, TensorList tensor2, const Tensor & scalars, TensorList out); // {"schema": "aten::_foreach_addcmul.Tensor_out(Tensor[] self, Tensor[] tensor1, Tensor[] tensor2, Tensor scalars, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_abs_out(TensorList self, TensorList out); // {"schema": "aten::_foreach_abs.out(Tensor[] self, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_acos_out(TensorList self, TensorList out); // {"schema": "aten::_foreach_acos.out(Tensor[] self, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_asin_out(TensorList self, TensorList out); // {"schema": "aten::_foreach_asin.out(Tensor[] self, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_atan_out(TensorList self, TensorList out); // {"schema": "aten::_foreach_atan.out(Tensor[] self, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_ceil_out(TensorList self, TensorList out); // {"schema": "aten::_foreach_ceil.out(Tensor[] self, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_cos_out(TensorList self, TensorList out); // {"schema": "aten::_foreach_cos.out(Tensor[] self, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_cosh_out(TensorList self, TensorList out); // {"schema": "aten::_foreach_cosh.out(Tensor[] self, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_erf_out(TensorList self, TensorList out); // {"schema": "aten::_foreach_erf.out(Tensor[] self, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_erfc_out(TensorList self, TensorList out); // {"schema": "aten::_foreach_erfc.out(Tensor[] self, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_exp_out(TensorList self, TensorList out); // {"schema": "aten::_foreach_exp.out(Tensor[] self, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_expm1_out(TensorList self, TensorList out); // {"schema": "aten::_foreach_expm1.out(Tensor[] self, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_floor_out(TensorList self, TensorList out); // {"schema": "aten::_foreach_floor.out(Tensor[] self, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_frac_out(TensorList self, TensorList out); // {"schema": "aten::_foreach_frac.out(Tensor[] self, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_lerp_out(TensorList self, TensorList tensors1, TensorList weights, TensorList out); // {"schema": "aten::_foreach_lerp.List_out(Tensor[] self, Tensor[] tensors1, Tensor[] weights, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_lerp_out(TensorList self, TensorList tensors1, const Scalar & weight, TensorList out); // {"schema": "aten::_foreach_lerp.Scalar_out(Tensor[] self, Tensor[] tensors1, Scalar weight, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_lgamma_out(TensorList self, TensorList out); // {"schema": "aten::_foreach_lgamma.out(Tensor[] self, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_log_out(TensorList self, TensorList out); // {"schema": "aten::_foreach_log.out(Tensor[] self, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_log10_out(TensorList self, TensorList out); // {"schema": "aten::_foreach_log10.out(Tensor[] self, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_log1p_out(TensorList self, TensorList out); // {"schema": "aten::_foreach_log1p.out(Tensor[] self, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_log2_out(TensorList self, TensorList out); // {"schema": "aten::_foreach_log2.out(Tensor[] self, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_neg_out(TensorList self, TensorList out); // {"schema": "aten::_foreach_neg.out(Tensor[] self, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_norm_out(TensorList self, const Scalar & ord, TensorList out); // {"schema": "aten::_foreach_norm.Scalar_out(Tensor[] self, Scalar ord=2, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_pow_out(TensorList self, TensorList exponent, TensorList out); // {"schema": "aten::_foreach_pow.List_out(Tensor[] self, Tensor[] exponent, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_pow_out(TensorList self, const Scalar & exponent, TensorList out); // {"schema": "aten::_foreach_pow.Scalar_out(Tensor[] self, Scalar exponent, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_pow_out(TensorList self, ArrayRef<Scalar> exponent, TensorList out); // {"schema": "aten::_foreach_pow.ScalarList_out(Tensor[] self, Scalar[] exponent, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_reciprocal_out(TensorList self, TensorList out); // {"schema": "aten::_foreach_reciprocal.out(Tensor[] self, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_round_out(TensorList self, TensorList out); // {"schema": "aten::_foreach_round.out(Tensor[] self, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_sigmoid_out(TensorList self, TensorList out); // {"schema": "aten::_foreach_sigmoid.out(Tensor[] self, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_sign_out(TensorList self, TensorList out); // {"schema": "aten::_foreach_sign.out(Tensor[] self, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_sin_out(TensorList self, TensorList out); // {"schema": "aten::_foreach_sin.out(Tensor[] self, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_sinh_out(TensorList self, TensorList out); // {"schema": "aten::_foreach_sinh.out(Tensor[] self, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_sqrt_out(TensorList self, TensorList out); // {"schema": "aten::_foreach_sqrt.out(Tensor[] self, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_tan_out(TensorList self, TensorList out); // {"schema": "aten::_foreach_tan.out(Tensor[] self, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_tanh_out(TensorList self, TensorList out); // {"schema": "aten::_foreach_tanh.out(Tensor[] self, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_trunc_out(TensorList self, TensorList out); // {"schema": "aten::_foreach_trunc.out(Tensor[] self, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+void _foreach_zero_out(TensorList self, TensorList out); // {"schema": "aten::_foreach_zero.out(Tensor[] self, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+::std::vector<Tensor> _foreach_zero(TensorList self); // {"schema": "aten::_foreach_zero(Tensor[] self) -> Tensor[] self_out", "dispatch": "True", "default": "True"}
+void _foreach_copy_out(TensorList self, TensorList src, bool non_blocking, TensorList out); // {"schema": "aten::_foreach_copy.out(Tensor[] self, Tensor[] src, bool non_blocking=False, *, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+::std::vector<Tensor> _foreach_copy(TensorList self, TensorList src, bool non_blocking); // {"schema": "aten::_foreach_copy(Tensor[] self, Tensor[] src, bool non_blocking=False) -> Tensor[] self_out", "dispatch": "True", "default": "True"}
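+// NOTE (editorial sketch): the _foreach_* declarations above are fused "horizontal" ops that
+// apply one elementwise operation across an entire TensorList in a few kernel launches instead
+// of one launch per tensor; torch.optim's foreach code paths dispatch to them. Calling the
+// Scalar overload of _foreach_add.out declared above directly would look roughly like:
+//
+//   std::vector<Tensor> params = {at::ones({2}), at::ones({3})};
+//   std::vector<Tensor> outs   = {at::empty({2}), at::empty({3})};   // pre-allocated outputs
+//   _foreach_add_out(params, /*scalar=*/1, outs);                    // outs[i] = params[i] + 1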
+Tensor & bucketize_out(const Scalar & self, const Tensor & boundaries, bool out_int32, bool right, Tensor & out); // {"schema": "aten::bucketize.Scalar_out(Scalar self, Tensor boundaries, *, bool out_int32=False, bool right=False, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & glu_jvp_out(const Tensor & glu, const Tensor & x, const Tensor & dx, int64_t dim, Tensor & out); // {"schema": "aten::glu_jvp.out(Tensor glu, Tensor x, Tensor dx, int dim, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & glu_backward_jvp_out(const Tensor & grad_x, const Tensor & grad_glu, const Tensor & x, const Tensor & dgrad_glu, const Tensor & dx, int64_t dim, Tensor & out); // {"schema": "aten::glu_backward_jvp.out(Tensor grad_x, Tensor grad_glu, Tensor x, Tensor dgrad_glu, Tensor dx, int dim, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & hardswish_backward_out(const Tensor & grad_output, const Tensor & self, Tensor & out); // {"schema": "aten::hardswish_backward.out(Tensor grad_output, Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & rrelu_with_noise_backward_out(const Tensor & grad_output, const Tensor & self, const Tensor & noise, const Scalar & lower, const Scalar & upper, bool training, bool self_is_result, Tensor & out); // {"schema": "aten::rrelu_with_noise_backward.out(Tensor grad_output, Tensor self, Tensor noise, Scalar lower, Scalar upper, bool training, bool self_is_result, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & mkldnn_adaptive_avg_pool2d_backward_out(const Tensor & grad_output, const Tensor & self, Tensor & out); // {"schema": "aten::mkldnn_adaptive_avg_pool2d_backward.out(Tensor grad_output, Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _adaptive_avg_pool2d_out(const Tensor & self, c10::SymIntArrayRef output_size, Tensor & out); // {"schema": "aten::_adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _adaptive_avg_pool2d_backward_out(const Tensor & grad_output, const Tensor & self, Tensor & out); // {"schema": "aten::_adaptive_avg_pool2d_backward.out(Tensor grad_output, Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _adaptive_avg_pool3d_out(const Tensor & self, c10::SymIntArrayRef output_size, Tensor & out); // {"schema": "aten::_adaptive_avg_pool3d.out(Tensor self, SymInt[3] output_size, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _adaptive_avg_pool3d_backward_out(const Tensor & grad_output, const Tensor & self, Tensor & out); // {"schema": "aten::_adaptive_avg_pool3d_backward.out(Tensor grad_output, Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &,Tensor &> _slow_conv2d_backward_out(const Tensor & grad_output, const Tensor & self, const Tensor & weight, c10::SymIntArrayRef kernel_size, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, ::std::array<bool,3> output_mask, Tensor & out0, Tensor & out1, Tensor & out2); // {"schema": "aten::_slow_conv2d_backward.output_mask_out(Tensor grad_output, Tensor self, Tensor weight, SymInt[2] kernel_size, SymInt[2] stride, SymInt[2] padding, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))", "dispatch": "True", "default": "True"}
+Tensor & conv_depthwise3d_out(const Tensor & self, const Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, Tensor & out); // {"schema": "aten::conv_depthwise3d.out(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding, SymInt[3] dilation, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & slow_conv_dilated2d_out(const Tensor & self, const Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, Tensor & out); // {"schema": "aten::slow_conv_dilated2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] dilation=1, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & slow_conv_dilated3d_out(const Tensor & self, const Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, Tensor & out); // {"schema": "aten::slow_conv_dilated3d.out(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & isinf_out(const Tensor & self, Tensor & out); // {"schema": "aten::isinf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & linalg_matrix_exp_out(const Tensor & self, Tensor & out); // {"schema": "aten::linalg_matrix_exp.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _test_optional_intlist_out(const Tensor & values, OptionalIntArrayRef addends, Tensor & out); // {"schema": "aten::_test_optional_intlist.out(Tensor values, int[]? addends, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _test_optional_filled_intlist_out(const Tensor & values, OptionalIntArrayRef addends, Tensor & out); // {"schema": "aten::_test_optional_filled_intlist.out(Tensor values, int[2]? addends, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _test_optional_floatlist_out(const Tensor & values, c10::optional<ArrayRef<double>> addends, Tensor & out); // {"schema": "aten::_test_optional_floatlist.out(Tensor values, float[]? addends, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _test_warn_in_autograd_out(const Tensor & self, Tensor & out); // {"schema": "aten::_test_warn_in_autograd.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _test_autograd_multiple_dispatch_out(const Tensor & self, Tensor & out); // {"schema": "aten::_test_autograd_multiple_dispatch.fullcoverage_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _test_autograd_multiple_dispatch_view_copy_out(const Tensor & self, Tensor & out); // {"schema": "aten::_test_autograd_multiple_dispatch_view_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & segment_reduce_out(const Tensor & data, c10::string_view reduce, const c10::optional<Tensor> & lengths, const c10::optional<Tensor> & indices, const c10::optional<Tensor> & offsets, int64_t axis, bool unsafe, const c10::optional<Scalar> & initial, Tensor & out); // {"schema": "aten::segment_reduce.out(Tensor data, str reduce, *, Tensor? lengths=None, Tensor? indices=None, Tensor? offsets=None, int axis=0, bool unsafe=False, Scalar? initial=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _segment_reduce_backward_out(const Tensor & grad, const Tensor & output, const Tensor & data, c10::string_view reduce, const c10::optional<Tensor> & lengths, const c10::optional<Tensor> & offsets, int64_t axis, const c10::optional<Scalar> & initial, Tensor & out); // {"schema": "aten::_segment_reduce_backward.out(Tensor grad, Tensor output, Tensor data, str reduce, *, Tensor? lengths=None, Tensor? offsets=None, int axis=0, Scalar? initial=None, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _nested_tensor_from_tensor_list_out(TensorList list, c10::optional<ScalarType> dtype, c10::optional<Layout> layout, c10::optional<Device> device, c10::optional<bool> pin_memory, Tensor & out); // {"schema": "aten::_nested_tensor_from_tensor_list.out(Tensor[] list, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _fw_primal_copy_out(const Tensor & self, int64_t level, Tensor & out); // {"schema": "aten::_fw_primal_copy.out(Tensor self, int level, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _make_dual_copy_out(const Tensor & primal, const Tensor & tangent, int64_t level, Tensor & out); // {"schema": "aten::_make_dual_copy.out(Tensor primal, Tensor tangent, int level, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & view_as_real_copy_out(const Tensor & self, Tensor & out); // {"schema": "aten::view_as_real_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & view_as_complex_copy_out(const Tensor & self, Tensor & out); // {"schema": "aten::view_as_complex_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _conj_copy_out(const Tensor & self, Tensor & out); // {"schema": "aten::_conj_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _neg_view_copy_out(const Tensor & self, Tensor & out); // {"schema": "aten::_neg_view_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & as_strided_copy_out(const Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional<c10::SymInt> storage_offset, Tensor & out); // {"schema": "aten::as_strided_copy.out(Tensor self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _sparse_broadcast_to_copy_out(const Tensor & self, IntArrayRef size, Tensor & out); // {"schema": "aten::_sparse_broadcast_to_copy.out(Tensor self, int[] size, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & diagonal_copy_out(const Tensor & self, int64_t offset, int64_t dim1, int64_t dim2, Tensor & out); // {"schema": "aten::diagonal_copy.out(Tensor self, int offset=0, int dim1=0, int dim2=1, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & expand_copy_out(const Tensor & self, c10::SymIntArrayRef size, bool implicit, Tensor & out); // {"schema": "aten::expand_copy.out(Tensor self, SymInt[] size, *, bool implicit=False, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & permute_copy_out(const Tensor & self, IntArrayRef dims, Tensor & out); // {"schema": "aten::permute_copy.out(Tensor self, int[] dims, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _reshape_alias_copy_out(const Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, Tensor & out); // {"schema": "aten::_reshape_alias_copy.out(Tensor self, SymInt[] size, SymInt[] stride, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & select_copy_out(const Tensor & self, int64_t dim, c10::SymInt index, Tensor & out); // {"schema": "aten::select_copy.int_out(Tensor self, int dim, SymInt index, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & detach_copy_out(const Tensor & self, Tensor & out); // {"schema": "aten::detach_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & slice_copy_out(const Tensor & self, int64_t dim, c10::optional<c10::SymInt> start, c10::optional<c10::SymInt> end, c10::SymInt step, Tensor & out); // {"schema": "aten::slice_copy.Tensor_out(Tensor self, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & squeeze_copy_out(const Tensor & self, Tensor & out); // {"schema": "aten::squeeze_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & squeeze_copy_out(const Tensor & self, int64_t dim, Tensor & out); // {"schema": "aten::squeeze_copy.dim_out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & squeeze_copy_out(const Tensor & self, IntArrayRef dim, Tensor & out); // {"schema": "aten::squeeze_copy.dims_out(Tensor self, int[] dim, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & t_copy_out(const Tensor & self, Tensor & out); // {"schema": "aten::t_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & transpose_copy_out(const Tensor & self, int64_t dim0, int64_t dim1, Tensor & out); // {"schema": "aten::transpose_copy.int_out(Tensor self, int dim0, int dim1, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & unsqueeze_copy_out(const Tensor & self, int64_t dim, Tensor & out); // {"schema": "aten::unsqueeze_copy.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _indices_copy_out(const Tensor & self, Tensor & out); // {"schema": "aten::_indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _values_copy_out(const Tensor & self, Tensor & out); // {"schema": "aten::_values_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & indices_copy_out(const Tensor & self, Tensor & out); // {"schema": "aten::indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & values_copy_out(const Tensor & self, Tensor & out); // {"schema": "aten::values_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & crow_indices_copy_out(const Tensor & self, Tensor & out); // {"schema": "aten::crow_indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & col_indices_copy_out(const Tensor & self, Tensor & out); // {"schema": "aten::col_indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & ccol_indices_copy_out(const Tensor & self, Tensor & out); // {"schema": "aten::ccol_indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & row_indices_copy_out(const Tensor & self, Tensor & out); // {"schema": "aten::row_indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & view_copy_out(const Tensor & self, c10::SymIntArrayRef size, Tensor & out); // {"schema": "aten::view_copy.out(Tensor self, SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & view_copy_out(const Tensor & self, ScalarType dtype, Tensor & out); // {"schema": "aten::view_copy.dtype_out(Tensor self, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & unfold_copy_out(const Tensor & self, int64_t dimension, int64_t size, int64_t step, Tensor & out); // {"schema": "aten::unfold_copy.out(Tensor self, int dimension, int size, int step, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & alias_copy_out(const Tensor & self, Tensor & out); // {"schema": "aten::alias_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & to_padded_tensor_out(const Tensor & self, double padding, OptionalSymIntArrayRef output_size, Tensor & out); // {"schema": "aten::to_padded_tensor.out(Tensor self, float padding, SymInt[]? output_size=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _transformer_encoder_layer_fwd_out(const Tensor & src, int64_t embed_dim, int64_t num_heads, const Tensor & qkv_weight, const Tensor & qkv_bias, const Tensor & proj_weight, const Tensor & proj_bias, bool use_gelu, bool norm_first, double eps, const Tensor & norm_weight_1, const Tensor & norm_bias_1, const Tensor & norm_weight_2, const Tensor & norm_bias_2, const Tensor & ffn_weight_1, const Tensor & ffn_bias_1, const Tensor & ffn_weight_2, const Tensor & ffn_bias_2, const c10::optional<Tensor> & mask, c10::optional<int64_t> mask_type, Tensor & out); // {"schema": "aten::_transformer_encoder_layer_fwd.out(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, int? mask_type=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+::std::tuple<Tensor &,Tensor &> _native_multi_head_attention_out(const Tensor & query, const Tensor & key, const Tensor & value, int64_t embed_dim, int64_t num_head, const Tensor & qkv_weight, const Tensor & qkv_bias, const Tensor & proj_weight, const Tensor & proj_bias, const c10::optional<Tensor> & mask, bool need_weights, bool average_attn_weights, c10::optional<int64_t> mask_type, Tensor & out0, Tensor & out1); // {"schema": "aten::_native_multi_head_attention.out(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None, bool need_weights=True, bool average_attn_weights=True, int? mask_type=None, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))", "dispatch": "True", "default": "True"}
+Tensor & _triton_scaled_dot_attention_out(const Tensor & q, const Tensor & k, const Tensor & v, double dropout_p, Tensor & out); // {"schema": "aten::_triton_scaled_dot_attention.out(Tensor q, Tensor k, Tensor v, float dropout_p=0.0, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _triton_multi_head_attention_out(const Tensor & query, const Tensor & key, const Tensor & value, int64_t embed_dim, int64_t num_head, const Tensor & qkv_weight, const Tensor & qkv_bias, const Tensor & proj_weight, const Tensor & proj_bias, const c10::optional<Tensor> & mask, Tensor & out); // {"schema": "aten::_triton_multi_head_attention.out(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None, *, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+Tensor & _foobar_out(const Tensor & self, bool arg1, bool arg2, bool arg3, Tensor & out); // {"schema": "aten::_foobar.out(Tensor self, bool arg1=True, bool arg2=True, *, bool arg3=True, Tensor(a!) out) -> Tensor(a!)", "dispatch": "True", "default": "True"}
+void _fused_adam_out(TensorList self, TensorList grads, TensorList exp_avgs, TensorList exp_avg_sqs, TensorList max_exp_avg_sqs, TensorList state_steps, double lr, double beta1, double beta2, double weight_decay, double eps, bool amsgrad, bool maximize, const c10::optional<Tensor> & grad_scale, const c10::optional<Tensor> & found_inf, TensorList out); // {"schema": "aten::_fused_adam.out(Tensor[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+::std::tuple<::std::vector<Tensor>,::std::vector<Tensor>,::std::vector<Tensor>,::std::vector<Tensor>,::std::vector<Tensor>> _fused_adam(TensorList self, TensorList grads, TensorList exp_avgs, TensorList exp_avg_sqs, TensorList max_exp_avg_sqs, TensorList state_steps, double lr, double beta1, double beta2, double weight_decay, double eps, bool amsgrad, bool maximize, const c10::optional<Tensor> & grad_scale, const c10::optional<Tensor> & found_inf); // {"schema": "aten::_fused_adam(Tensor[] self, Tensor[] grads, Tensor[] exp_avgs, Tensor[] exp_avg_sqs, Tensor[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> (Tensor[] self_out, Tensor[] grads_out, Tensor[] exp_avgs_out, Tensor[] exp_avg_sqs_out, Tensor[] max_exp_avg_sqs_out)", "dispatch": "True", "default": "True"}
+void _fused_adam_out(TensorList self, TensorList grads, TensorList exp_avgs, TensorList exp_avg_sqs, TensorList max_exp_avg_sqs, TensorList state_steps, const Tensor & lr, double beta1, double beta2, double weight_decay, double eps, bool amsgrad, bool maximize, const c10::optional<Tensor> & grad_scale, const c10::optional<Tensor> & found_inf, TensorList out); // {"schema": "aten::_fused_adam.tensor_lr_out(Tensor[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, Tensor lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+::std::tuple<::std::vector<Tensor>,::std::vector<Tensor>,::std::vector<Tensor>,::std::vector<Tensor>,::std::vector<Tensor>> _fused_adam(TensorList self, TensorList grads, TensorList exp_avgs, TensorList exp_avg_sqs, TensorList max_exp_avg_sqs, TensorList state_steps, const Tensor & lr, double beta1, double beta2, double weight_decay, double eps, bool amsgrad, bool maximize, const c10::optional<Tensor> & grad_scale, const c10::optional<Tensor> & found_inf); // {"schema": "aten::_fused_adam.tensor_lr(Tensor[] self, Tensor[] grads, Tensor[] exp_avgs, Tensor[] exp_avg_sqs, Tensor[] max_exp_avg_sqs, Tensor[] state_steps, *, Tensor lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> (Tensor[] self_out, Tensor[] grads_out, Tensor[] exp_avgs_out, Tensor[] exp_avg_sqs_out, Tensor[] max_exp_avg_sqs_out)", "dispatch": "True", "default": "True"}
+void _fused_adamw_out(TensorList self, TensorList grads, TensorList exp_avgs, TensorList exp_avg_sqs, TensorList max_exp_avg_sqs, TensorList state_steps, double lr, double beta1, double beta2, double weight_decay, double eps, bool amsgrad, bool maximize, const c10::optional<Tensor> & grad_scale, const c10::optional<Tensor> & found_inf, TensorList out); // {"schema": "aten::_fused_adamw.out(Tensor[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+::std::tuple<::std::vector<Tensor>,::std::vector<Tensor>,::std::vector<Tensor>,::std::vector<Tensor>,::std::vector<Tensor>> _fused_adamw(TensorList self, TensorList grads, TensorList exp_avgs, TensorList exp_avg_sqs, TensorList max_exp_avg_sqs, TensorList state_steps, double lr, double beta1, double beta2, double weight_decay, double eps, bool amsgrad, bool maximize, const c10::optional<Tensor> & grad_scale, const c10::optional<Tensor> & found_inf); // {"schema": "aten::_fused_adamw(Tensor[] self, Tensor[] grads, Tensor[] exp_avgs, Tensor[] exp_avg_sqs, Tensor[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> (Tensor[] self_out, Tensor[] grads_out, Tensor[] exp_avgs_out, Tensor[] exp_avg_sqs_out, Tensor[] max_exp_avg_sqs_out)", "dispatch": "True", "default": "True"}
+void _fused_adamw_out(TensorList self, TensorList grads, TensorList exp_avgs, TensorList exp_avg_sqs, TensorList max_exp_avg_sqs, TensorList state_steps, const Tensor & lr, double beta1, double beta2, double weight_decay, double eps, bool amsgrad, bool maximize, const c10::optional<Tensor> & grad_scale, const c10::optional<Tensor> & found_inf, TensorList out); // {"schema": "aten::_fused_adamw.tensor_lr_out(Tensor[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, Tensor lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+::std::tuple<::std::vector,::std::vector,::std::vector,::std::vector,::std::vector> _fused_adamw(TensorList self, TensorList grads, TensorList exp_avgs, TensorList exp_avg_sqs, TensorList max_exp_avg_sqs, TensorList state_steps, const Tensor & lr, double beta1, double beta2, double weight_decay, double eps, bool amsgrad, bool maximize, const c10::optional & grad_scale, const c10::optional & found_inf); // {"schema": "aten::_fused_adamw.tensor_lr(Tensor[] self, Tensor[] grads, Tensor[] exp_avgs, Tensor[] exp_avg_sqs, Tensor[] max_exp_avg_sqs, Tensor[] state_steps, *, Tensor lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> (Tensor[] self_out, Tensor[] grads_out, Tensor[] exp_avgs_out, Tensor[] exp_avg_sqs_out, Tensor[] max_exp_avg_sqs_out)", "dispatch": "True", "default": "True"}
+void _fused_sgd_out(TensorList self, TensorList grads, TensorList momentum_buffer_list, double weight_decay, double momentum, double lr, double dampening, bool nesterov, bool maximize, bool is_first_step, const c10::optional & grad_scale, const c10::optional & found_inf, TensorList out); // {"schema": "aten::_fused_sgd.out(Tensor[] self, Tensor(b!)[] grads, Tensor(c!)[] momentum_buffer_list, *, float weight_decay, float momentum, float lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+::std::tuple<::std::vector,::std::vector,::std::vector> _fused_sgd(TensorList self, TensorList grads, TensorList momentum_buffer_list, double weight_decay, double momentum, double lr, double dampening, bool nesterov, bool maximize, bool is_first_step, const c10::optional & grad_scale, const c10::optional & found_inf); // {"schema": "aten::_fused_sgd(Tensor[] self, Tensor[] grads, Tensor[] momentum_buffer_list, *, float weight_decay, float momentum, float lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None) -> (Tensor[] self_out, Tensor[] grads_out, Tensor[] momentum_buffer_list_out)", "dispatch": "True", "default": "True"}
+void _fused_sgd_out(TensorList self, TensorList grads, TensorList momentum_buffer_list, double weight_decay, double momentum, const Tensor & lr, double dampening, bool nesterov, bool maximize, bool is_first_step, const c10::optional & grad_scale, const c10::optional & found_inf, TensorList out); // {"schema": "aten::_fused_sgd.tensor_lr_out(Tensor[] self, Tensor(b!)[] grads, Tensor(c!)[] momentum_buffer_list, *, float weight_decay, float momentum, Tensor lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None, Tensor(a!)[] out) -> ()", "dispatch": "True", "default": "True"}
+::std::tuple<::std::vector,::std::vector,::std::vector> _fused_sgd(TensorList self, TensorList grads, TensorList momentum_buffer_list, double weight_decay, double momentum, const Tensor & lr, double dampening, bool nesterov, bool maximize, bool is_first_step, const c10::optional & grad_scale, const c10::optional & found_inf); // {"schema": "aten::_fused_sgd.tensor_lr(Tensor[] self, Tensor[] grads, Tensor[] momentum_buffer_list, *, float weight_decay, float momentum, Tensor lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None) -> (Tensor[] self_out, Tensor[] grads_out, Tensor[] momentum_buffer_list_out)", "dispatch": "True", "default": "True"}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/SavedTensorHooks.h b/MLPY/Lib/site-packages/torch/include/ATen/SavedTensorHooks.h
new file mode 100644
index 0000000000000000000000000000000000000000..8e5708ce33e4718344562a0996472ddbed438737
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/SavedTensorHooks.h
@@ -0,0 +1,52 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+namespace at {
+
+namespace impl {
+
+struct TORCH_API SavedTensorDefaultHooksTLS {
+  // PyObject is defined in c10/util/python_stub.h
+  std::stack<std::pair<PyObject*, PyObject*>> stack;
+
+  // See NOTE: [Disabling SavedTensorDefaultHooks] for context
+  // NOTE: [disabled_error_message invariant]
+  // disabled_error_message is nullopt IFF Saved Tensor hooks are enabled
+  // We did this for efficiency (so we didn't have to keep a separate bool
+  // around)
+  c10::optional<std::string> disabled_error_message;
+};
+
+} // namespace impl
+
+struct TORCH_API SavedTensorDefaultHooks {
+  static void push_hooks(PyObject* pack_hook, PyObject* unpack_hook);
+  static void pop_hooks();
+  static std::pair<PyObject*, PyObject*> get_hooks();
+  static void lazy_initialize();
+  static std::stack<std::pair<PyObject*, PyObject*>> get_stack();
+  static void set_stack(std::stack<std::pair<PyObject*, PyObject*>>);
+
+  static const impl::SavedTensorDefaultHooksTLS& get_tls_state();
+  static void set_tls_state(const impl::SavedTensorDefaultHooksTLS& tls);
+
+  // NOTE: [Disabling SavedTensorDefaultHooks]
+  // A developer of a PyTorch feature may choose to disable SavedTensorDefault
+  // hooks, especially if their feature does not work with it. If they are
+  // disabled, then the following will raise an error:
+  // - Attempting to push_hooks
+  // - calling disable(message) with a non-zero stack (from get_stack) size
+  static void disable(const std::string& error_message);
+  static void enable();
+  static bool is_enabled();
+  static const c10::optional<std::string>& get_disabled_error_message();
+};
+
+} // namespace at
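The header above only declares the hook-management API; the following is a minimal, hypothetical sketch (not part of the patch) showing how a feature might use `disable`/`enable` to opt out of saved-tensor hooks. The function name `run_without_saved_tensor_hooks` and the error text are illustrative.

```cpp
#include <ATen/SavedTensorHooks.h>

// Hypothetical helper: while disabled, push_hooks() raises the given message.
void run_without_saved_tensor_hooks() {
  at::SavedTensorDefaultHooks::disable(
      "this feature does not support saved-tensor hooks");
  // ... feature-specific work that must not interact with pack/unpack hooks ...
  if (!at::SavedTensorDefaultHooks::is_enabled()) {
    at::SavedTensorDefaultHooks::enable();  // restore normal behavior
  }
}
```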
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/Scalar.h b/MLPY/Lib/site-packages/torch/include/ATen/Scalar.h
new file mode 100644
index 0000000000000000000000000000000000000000..6dec39dd3c32cef073fec4891ab16a71c58e8077
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/Scalar.h
@@ -0,0 +1,3 @@
+#pragma once
+
+#include 
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ScalarOps.h b/MLPY/Lib/site-packages/torch/include/ATen/ScalarOps.h
new file mode 100644
index 0000000000000000000000000000000000000000..533ba88771c62e3fc4fd95490d1c1a1421c412f4
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/ScalarOps.h
@@ -0,0 +1,53 @@
+#pragma once
+
+#include 
+#include 
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include 
+#else
+#include 
+#endif
+
+namespace at::detail {
+// When filling a 1-element CPU tensor with a number, we want to skip
+// everything else and manipulate the data ptr directly.
+// Ideally this fast pass should be implemented in TensorIterator,
+// but we also want to skip compute_types, which is not avoidable
+// in TensorIterator for now.
+Tensor& scalar_fill(Tensor& self, const Scalar& value);
+TORCH_API Tensor scalar_tensor_static(
+    const Scalar& s,
+    c10::optional<ScalarType> dtype_opt,
+    c10::optional<Device> device_opt);
+} // namespace at::detail
+
+// This is in the c10 namespace because we use ADL to find the functions in it.
+namespace c10 {
+
+// FIXME: this should be (and was) Scalar::toTensor, but there is currently no
+// way to implement this without going through Derived Types (which are not part
+// of core).
+inline at::Tensor scalar_to_tensor(
+    const Scalar& s,
+    const Device device = at::kCPU) {
+  // This is the fast track we have for CPU scalar tensors.
+  if (device == at::kCPU) {
+    return at::detail::scalar_tensor_static(s, s.type(), at::kCPU);
+  }
+  return at::scalar_tensor(s, at::device(device).dtype(s.type()));
+}
+
+} // namespace c10
+
+namespace at::native {
+
+inline Tensor wrapped_scalar_tensor(
+    const Scalar& scalar,
+    const Device device = at::kCPU) {
+  auto tensor = scalar_to_tensor(scalar, device);
+  tensor.unsafeGetTensorImpl()->set_wrapped_number(true);
+  return tensor;
+}
+
+} // namespace at::native
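As a usage note (not part of the patch), here is a hypothetical sketch of the helpers declared above; `as_wrapped_number` is an illustrative name.

```cpp
#include <ATen/ScalarOps.h>

// Turn a C++ Scalar into a 0-dim CPU tensor flagged as a "wrapped number",
// which is how type promotion distinguishes literals from real tensors.
at::Tensor as_wrapped_number(const at::Scalar& s) {
  // c10::scalar_to_tensor(s) alone would return the plain 0-dim tensor via the
  // fast CPU path above; wrapped_scalar_tensor additionally sets the flag.
  return at::native::wrapped_scalar_tensor(s);
}
```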
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/autocast_mode.h b/MLPY/Lib/site-packages/torch/include/ATen/autocast_mode.h
new file mode 100644
index 0000000000000000000000000000000000000000..b29bd694747b7ea2c9f21d4a9603292b84215654
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/autocast_mode.h
@@ -0,0 +1,647 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+namespace at::autocast {
+
+TORCH_API bool is_enabled();
+TORCH_API void set_enabled(bool enabled);
+TORCH_API void clear_cache();
+TORCH_API int increment_nesting();
+TORCH_API int decrement_nesting();
+TORCH_API bool is_cpu_enabled();
+TORCH_API void set_cpu_enabled(bool enabled);
+TORCH_API at::ScalarType get_autocast_gpu_dtype();
+TORCH_API at::ScalarType get_autocast_cpu_dtype();
+TORCH_API void set_autocast_gpu_dtype(at::ScalarType dtype);
+TORCH_API void set_autocast_cpu_dtype(at::ScalarType dtype);
+TORCH_API bool is_xpu_enabled();
+TORCH_API void set_xpu_enabled(bool enabled);
+TORCH_API at::ScalarType get_autocast_xpu_dtype();
+TORCH_API void set_autocast_xpu_dtype(at::ScalarType dtype);
+TORCH_API bool is_ipu_enabled();
+TORCH_API void set_ipu_enabled(bool enabled);
+TORCH_API at::ScalarType get_autocast_ipu_dtype();
+TORCH_API void set_autocast_ipu_dtype(at::ScalarType dtype);
+TORCH_API bool is_hpu_enabled();
+TORCH_API void set_hpu_enabled(bool enabled);
+TORCH_API at::ScalarType get_autocast_hpu_dtype();
+TORCH_API void set_autocast_hpu_dtype(at::ScalarType dtype);
+TORCH_API bool is_xla_enabled();
+TORCH_API void set_xla_enabled(bool enabled);
+TORCH_API at::ScalarType get_autocast_xla_dtype();
+TORCH_API void set_autocast_xla_dtype(at::ScalarType dtype);
+TORCH_API bool is_privateuseone_enabled();
+TORCH_API void set_privateuseone_enabled(bool enabled);
+TORCH_API at::ScalarType get_autocast_privateuseone_dtype();
+TORCH_API void set_autocast_privateuseone_dtype(at::ScalarType dtype);
+TORCH_API bool is_autocast_cache_enabled();
+TORCH_API void set_autocast_cache_enabled(bool enabled);
+
+namespace {
+inline bool is_autocast_eligible(
+    const Tensor& tensor,
+    c10::DeviceType device_type) {
+  switch (device_type) {
+    case c10::DeviceType::CUDA:
+      return (tensor.is_cuda() || tensor.is_xla()) &&
+          tensor.is_floating_point();
+    case c10::DeviceType::CPU:
+      return (tensor.is_cpu() || tensor.is_mkldnn()) &&
+          tensor.is_floating_point();
+    case c10::DeviceType::XPU:
+      return tensor.is_xpu() && tensor.is_floating_point();
+    case c10::DeviceType::IPU:
+      return tensor.is_ipu() && tensor.is_floating_point();
+    case c10::DeviceType::HPU:
+      return tensor.is_hpu() && tensor.is_floating_point();
+    case c10::DeviceType::XLA:
+      return tensor.is_xla() && tensor.is_floating_point();
+    case c10::DeviceType::PrivateUse1:
+      return tensor.is_privateuseone() && tensor.is_floating_point();
+    default:
+      return false;
+  }
+}
+} // namespace
+
+inline DispatchKey get_autocast_dispatch_key_from_device_type(
+    c10::DeviceType device_type) {
+  switch (device_type) {
+    case c10::DeviceType::CUDA:
+      return DispatchKey::Autocast;
+    case c10::DeviceType::CPU:
+      return DispatchKey::AutocastCPU;
+    case c10::DeviceType::XPU:
+      return DispatchKey::AutocastXPU;
+    case c10::DeviceType::IPU:
+      return DispatchKey::AutocastIPU;
+    case c10::DeviceType::HPU:
+      return DispatchKey::AutocastHPU;
+    case c10::DeviceType::XLA:
+      return DispatchKey::AutocastXLA;
+    case c10::DeviceType::PrivateUse1:
+      return DispatchKey::AutocastPrivateUse1;
+    default:
+      throw std::runtime_error(
+          "unknown device type for autocast in get_autocast_dispatch_key_from_device_type");
+  }
+}
+
+inline at::ScalarType get_lower_precision_fp_from_device_type(
+    c10::DeviceType device_type) {
+  switch (device_type) {
+    case c10::DeviceType::CUDA:
+      return get_autocast_gpu_dtype();
+    case c10::DeviceType::CPU:
+      return get_autocast_cpu_dtype();
+    case c10::DeviceType::XPU:
+      return get_autocast_xpu_dtype();
+    case c10::DeviceType::IPU:
+      return get_autocast_ipu_dtype();
+    case c10::DeviceType::HPU:
+      return get_autocast_hpu_dtype();
+    case c10::DeviceType::XLA:
+      return get_autocast_xla_dtype();
+    case c10::DeviceType::PrivateUse1:
+      return get_autocast_privateuseone_dtype();
+    default:
+      throw std::runtime_error(
+          "unknown device type for autocast in get_lower_precision_fp_from_device_type");
+  }
+}
+
+/********************************************************************
+Logic to extract the promote type from any Tensor or TensorList args.
+********************************************************************/
+
+// Overload to catch Tensor args.
+// If nextArg is floating-point, compare its scalar_type with our
+// current best guess for the promote type, and update if necessary.
+inline at::ScalarType prioritize(
+    at::ScalarType current,
+    const Tensor& nextArg,
+    c10::DeviceType device_type = c10::DeviceType::CUDA) {
+  if (current == at::kDouble) {
+    AT_ERROR("promote type is double in at::autocast::prioritize");
+    return current;
+  }
+  at::ScalarType lower_precision_fp =
+      get_lower_precision_fp_from_device_type(device_type);
+  if (is_autocast_eligible(nextArg, device_type)) {
+    auto next = nextArg.scalar_type();
+    if (next == at::kDouble) {
+      return current; // ignores double tensors
+    } else if (current == at::kFloat || next == at::kFloat) {
+      return at::kFloat; // prioritizes float over lower_precision_fp
+    } else if (current == lower_precision_fp && next == lower_precision_fp) {
+      return lower_precision_fp;
+    } else {
+      AT_ERROR("Unexpected floating ScalarType in at::autocast::prioritize");
+      return current;
+    }
+  } else {
+    return current;
+  }
+}
+
+// Overload to catch TensorList args (for e.g. cat, stack).
+// Reuses the overload above to process each Tensor in the list.
+inline at::ScalarType prioritize(
+    at::ScalarType current,
+    const TensorList& list,
+    c10::DeviceType device_type = c10::DeviceType::CUDA) {
+  for (const auto& tensor : list) {
+    current = prioritize(current, tensor, device_type);
+  }
+  return current;
+}
+
+inline at::ScalarType prioritize(
+    at::ScalarType current,
+    const ITensorListRef& list,
+    c10::DeviceType device_type = c10::DeviceType::CUDA) {
+  for (const auto& tensor : list) {
+    current = prioritize(current, tensor, device_type);
+  }
+  return current;
+}
+
+// Template to catch non-Tensor args (no-op that returns current best guess)
+template <typename T>
+inline at::ScalarType prioritize(
+    at::ScalarType current,
+    T nextArg,
+    c10::DeviceType device_type = c10::DeviceType::CUDA) {
+  return current;
+}
+
+// Overload for the tail case.
+inline at::ScalarType promote_type(
+    at::ScalarType current,
+    c10::DeviceType device_type) {
+  return current;
+}
+
+// Unpack args and determine if incoming lower_precision_fp tensors need to be
+// promoted to float32. Non-Tensor arguments are ignored.
+template <typename Arg0, typename... Args>
+inline at::ScalarType promote_type(
+    at::ScalarType current,
+    c10::DeviceType device_type,
+    Arg0 arg0,
+    Args... args) {
+  auto new_current = prioritize(current, arg0, device_type);
+  return promote_type(new_current, device_type, args...);
+}
+
+/****************************************************
+Logic to apply cached casting to any Tensor argument.
+****************************************************/
+inline bool is_eligible(
+    const Tensor& arg,
+    c10::DeviceType device_type = c10::DeviceType::CUDA) {
+  return (
+      arg.defined() && is_autocast_eligible(arg, device_type) &&
+      (arg.scalar_type() != at::kDouble));
+}
+
+// Overload to catch Tensor args
+TORCH_API Tensor cached_cast(
+    at::ScalarType to_type,
+    const Tensor& arg,
+    c10::DeviceType device_type = c10::DeviceType::CUDA);
+
+// Overload to process optional<Tensor>
+inline c10::optional<Tensor> cached_cast(
+    at::ScalarType to_type,
+    const c10::optional<Tensor>& arg,
+    c10::DeviceType device_type = c10::DeviceType::CUDA) {
+  if (arg.has_value()) {
+    return cached_cast(to_type, *arg, device_type);
+  } else {
+    return c10::nullopt;
+  }
+}
+
+// Overload to process TensorLists
+inline std::vector<Tensor> cached_cast(
+    at::ScalarType to_type,
+    const TensorList& arg,
+    c10::DeviceType device_type = c10::DeviceType::CUDA) {
+  std::vector<Tensor> vec;
+  vec.reserve(arg.size());
+  for (const auto& t : arg) {
+    vec.emplace_back(cached_cast(to_type, t, device_type));
+  }
+  return vec;
+}
+
+inline std::vector<Tensor> cached_cast(
+    at::ScalarType to_type,
+    const ITensorListRef& arg,
+    c10::DeviceType device_type = c10::DeviceType::CUDA) {
+  std::vector<Tensor> vec;
+  vec.reserve(arg.size());
+  for (const auto& t : arg) {
+    vec.emplace_back(cached_cast(to_type, t, device_type));
+  }
+  return vec;
+}
+
+// Template to catch non-Tensor args.
+template <typename T>
+inline T cached_cast(
+    at::ScalarType to_type,
+    T arg,
+    c10::DeviceType device_type = c10::DeviceType::CUDA) {
+  return arg;
+}
+
+/*******************************************************
+Logic to flip an output dtype flag.
+Keep it simple for now by assuming only one such flag is
+present in the argument list.  If I ever need a function
+with more than one flag I'll figure out something else.
+The policy is:
+If the user has explicitly specified a dtype, respect it.
+Otherwise, set it to the autocast type.
+********************************************************/
+
+// Overload to catch dtype flags
+c10::optional<ScalarType> inline set_opt_dtype(
+    at::ScalarType to_type,
+    const c10::optional<ScalarType>& dtype) {
+  return dtype.has_value() ? dtype : to_type;
+}
+
+// Template to catch other args
+template <typename T>
+inline T set_opt_dtype(at::ScalarType to_type, T arg) {
+  return arg;
+}
+
+template <typename... Args>
+inline bool firstarg_is_eligible(
+    c10::DeviceType device_type,
+    const Tensor& arg,
+    Args... args) {
+  return is_eligible(arg, device_type);
+}
+
+template <typename... Args>
+inline at::ScalarType type_from_firstarg(
+    c10::DeviceType device_type,
+    at::ScalarType to_type,
+    const Tensor& arg,
+    Args... args) {
+  return (is_eligible(arg, device_type) ? to_type : arg.scalar_type());
+}
+
+// Policies correspond to op categories that need code-divergent handling.
+// Wrapper templates below are specialized based on a policy template parameter.
+enum class CastPolicy : uint8_t {
+  lower_precision_fp = 0, // Cast all inputs to lower_precision_fp before
+                          // running the op. Currently, lower_precision_fp is
+                          // fp16 for AutocastCUDA, and is defined by user
+                          // (default bf16) for AutocastCPU or other device.
+  fp32, // Cast all inputs to at::kFloat before running the op.
+  fp32_set_opt_dtype, // Treats functions (like softmax) that
+                      //  1. we'd like to run in fp32 and
+                      //  2. have a c10::optional<ScalarType> arg that controls
+                      //  the output type.
+                      // fp32_set_opt_dtype wrappers' policy is: if the output
+                      // type is already set, don't touch it, otherwise, set
+                      // it to at::kFloat.
+  fp32_append_dtype, // Treats functions (like norm) that
+                     //  1. we'd like to run in fp32 and
+                     //  2. have some overloads that accept an output type and
+                     //  other overloads that don't.
+                     // fp32_append_dtype wrappers wrap the overloads that don't
+                     // have an output dtype.
+                     // The wrapper policy is:  append at::kFloat to the args,
+                     // and redispatch to the type-aware overload.
+  promote, // Run in the widest dtype among several args.
+};
+
+/********************************************************************************************************
+Templates to provide wrapper functions
+
+I'm copying the pattern used in core/boxing/impl/WrapFunctionIntoFunctor.h to
+extract args and return type. (see also
+https://stackoverflow.com/questions/46533698/how-to-deduce-argument-list-from-function-pointer)
+
+This strategy uses an exterior "WrapFunction" that extracts arguments on behalf
+of (in my case several specializations of) an interior "WrapFunction_".
+Interior WrapFunction_ specializations are defined for each CastPolicy.
+********************************************************************************************************/
+
+// Base template for WrapFunction_, which is specialized to contain a "call"
+// method for each CastPolicy
+template <
+    CastPolicy policy,
+    c10::DeviceType device_type,
+    class Redispatch,
+    Redispatch* F,
+    class Ret,
+    class ArgList>
+struct WrapFunction_ {};
+
+// CastPolicy::lower_precision_fp General_DeviceType
+template <
+    c10::DeviceType device_type,
+    class Redispatch,
+    Redispatch* F,
+    class Ret,
+    class... Args>
+struct WrapFunction_<
+    CastPolicy::lower_precision_fp,
+    device_type,
+    Redispatch,
+    F,
+    Ret,
+    guts::typelist::typelist<Args...>> {
+  static Ret call(Args... args) {
+    c10::impl::ExcludeDispatchKeyGuard no_autocast(
+        get_autocast_dispatch_key_from_device_type(device_type));
+    return (*F)(cached_cast(
+        get_lower_precision_fp_from_device_type(device_type),
+        args,
+        device_type)...);
+  }
+};
+
+// CastPolicy::fp32 General_DeviceType
+template <
+    c10::DeviceType device_type,
+    class Redispatch,
+    Redispatch* F,
+    class Ret,
+    class... Args>
+struct WrapFunction_<
+    CastPolicy::fp32,
+    device_type,
+    Redispatch,
+    F,
+    Ret,
+    guts::typelist::typelist<Args...>> {
+  static Ret call(Args... args) {
+    c10::impl::ExcludeDispatchKeyGuard no_autocast(
+        get_autocast_dispatch_key_from_device_type(device_type));
+    return (*F)(cached_cast(at::kFloat, args, device_type)...);
+  }
+};
+
+// CastPolicy::fp32_set_opt_dtype General_DeviceType
+template <
+    c10::DeviceType device_type,
+    class Redispatch,
+    Redispatch* F,
+    class Ret,
+    class... Args>
+struct WrapFunction_<
+    CastPolicy::fp32_set_opt_dtype,
+    device_type,
+    Redispatch,
+    F,
+    Ret,
+    guts::typelist::typelist<Args...>> {
+  static Ret call(Args... args) {
+    c10::impl::ExcludeDispatchKeyGuard no_autocast(
+        get_autocast_dispatch_key_from_device_type(device_type));
+    if (firstarg_is_eligible(device_type, args...)) {
+      return (*F)(set_opt_dtype(at::kFloat, args)...);
+    } else {
+      // If ineligible, calls F with unaltered args.  Does not set opt dtype,
+      // because setting opt dtype explicitly may interfere with internal
+      // implicit promotion decisions.
+      return (*F)(args...);
+    }
+  }
+};
+
+// CastPolicy::fp32_append_dtype General_DeviceType
+template <
+    c10::DeviceType device_type,
+    class Redispatch,
+    Redispatch* F,
+    class Ret,
+    class... Args>
+struct WrapFunction_<
+    CastPolicy::fp32_append_dtype,
+    device_type,
+    Redispatch,
+    F,
+    Ret,
+    guts::typelist::typelist<Args...>> {
+  static Ret call(Args... args) {
+    c10::impl::ExcludeDispatchKeyGuard no_autocast(
+        get_autocast_dispatch_key_from_device_type(device_type));
+    at::ScalarType out_type =
+        type_from_firstarg(device_type, at::kFloat, args...);
+    return (*F)(args..., out_type);
+  }
+};
+
+// CastPolicy::promote General_DeviceType
+template <
+    c10::DeviceType device_type,
+    class Redispatch,
+    Redispatch* F,
+    class Ret,
+    class... Args>
+struct WrapFunction_<
+    CastPolicy::promote,
+    device_type,
+    Redispatch,
+    F,
+    Ret,
+    guts::typelist::typelist<Args...>> {
+  static Ret call(Args... args) {
+    c10::impl::ExcludeDispatchKeyGuard no_autocast(
+        get_autocast_dispatch_key_from_device_type(device_type));
+    auto to_type = promote_type(
+        get_lower_precision_fp_from_device_type(device_type),
+        device_type,
+        args...);
+    return (*F)(cached_cast(to_type, args, device_type)...);
+  }
+};
+
+// Wrapper to infer return_type and parameter_types for WrapFunction_ (imitating
+// core/boxing/impl/WrapFunctionIntoFunctor.h)
+template <
+    CastPolicy policy,
+    c10::DeviceType device_type,
+    class Registered, // The signature for which we're registering.  The
+                      // dispatcher's calling code invokes our registered
+                      // functions with arguments matching Registered, so we
+                      // register WrapFunction_::call methods with a matching
+                      // signature to properly field those arguments.
+    // guts::function_traits below extracts return_type and
+    // parameter_types from Registered, which WrapFunction_
+    // templates above use to declare their call methods.
+    class Redispatch, // The signature for the function we're redispatching to.
+                      // In most cases this is the same as Registered, but for
+                      // some ops (for example, ops where we append a dtype)
+                      // it's useful to redispatch to a function with a
+                      // different signature.
+    Redispatch* F> // The actual function we're redispatching to.
+struct WrapFunction final {
+  using type = WrapFunction_<
+      policy,
+      device_type,
+      Redispatch,
+      F,
+      typename guts::function_traits<Registered>::return_type,
+      typename guts::function_traits<Registered>::parameter_types>;
+};
+
+/*****************************************************************************************************************
+This section performs load-time registration for autocast wrappers.
+
+It's debatable at what level operations should be patched.  We'd like casts to
+be autograd-exposed and precede autograd history recording, so that for
+lower_precision_fp ops, input tensors are saved for backward in
+lower_precision_fp rather than fp32.  Saving inputs in lower_precision_fp
+can significantly reduce a model's memory footprint.
+
+Option 1 (strawman):  Patch only at the level of explicit calls into
+cudnn/cublas (cudnn_convolution, etc), because those are the code paths that are
+guaranteed to use Tensor Cores, therefore they're the ones that will benefit
+most from lower_precision_fp.   Potential pitfall:  convolutions (and other ops)
+are wrapped in several layers of at::* calls.  If one of those happens to record
+autograd history, then we've lost the opportunity to save inputs in
+lower_precision_fp.
+
+Option 2:  Patch the Python-exposed surface of calls, to make 100% sure autograd
+history recording can't sneak in ahead of autocast.  This mirrors Apex most
+closely.
+
+I think Option 2 is the right answer for all ops, not just convolutions. Option
+2 is what I implement here.
+*****************************************************************************************************************/
+
+/********************************************************************************************************************
+Explicit registration for out-of-place ops
+
+The stuff below could be codegenned.  Ed said
+> you are going to have to write the function definition at some point, I
+wouldn't try to get clever about it. Therefore, for the moment, this is all
+copy-pasted in from VariableTypeEverything.cpp with appropriate substitutions.
+********************************************************************************************************************/
+
+} // namespace at::autocast
+
+#define ADD_NS(RAW_OP) at::RAW_OP
+
+// Common cases where registration signature matches redispatch signature
+// (that's why SIGNATURE is repeated in the WrapFunction instantiation)
+#define KERNEL(DISPATCHKEY, OP, POLICY)       \
+  m.impl(                                     \
+      TORCH_SELECTIVE_NAME("aten::" #OP),     \
+      &::at::autocast::WrapFunction<          \
+          ::at::autocast::CastPolicy::POLICY, \
+          DISPATCHKEY,                        \
+          decltype(ATEN_FN(OP)),              \
+          decltype(ATEN_FN(OP)),              \
+          &ATEN_FN(OP)>::type::call);
+
+#define KERNEL2(DISPATCHKEY, OP, OVERLOAD, POLICY)      \
+  m.impl(                                               \
+      TORCH_SELECTIVE_NAME("aten::" #OP "." #OVERLOAD), \
+      &::at::autocast::WrapFunction<                    \
+          ::at::autocast::CastPolicy::POLICY,           \
+          DISPATCHKEY,                                  \
+          decltype(ATEN_FN2(OP, OVERLOAD)),             \
+          decltype(ATEN_FN2(OP, OVERLOAD)),             \
+          &ATEN_FN2(OP, OVERLOAD)>::type::call);
+
+// Less-common but still useful case: redispatching to a function
+// with a new signature (e.g. appending a dtype)
+#define KERNEL_DIFFERENT_REDISPATCH_SIGNATURE(      \
+    DISPATCHKEY,                                    \
+    REDISPATCH_FUNC,                                \
+    REGISTER_NAME,                                  \
+    REGISTER_SIGNATURE,                             \
+    REDISPATCH_SIGNATURE,                           \
+    POLICY)                                         \
+  m.impl(                                           \
+      TORCH_SELECTIVE_NAME("aten::" REGISTER_NAME), \
+      &::at::autocast::WrapFunction<                \
+          ::at::autocast::CastPolicy::POLICY,       \
+          DISPATCHKEY,                              \
+          REGISTER_SIGNATURE,                       \
+          REDISPATCH_SIGNATURE,                     \
+          &REDISPATCH_FUNC>::type::call);
+
+// KERNEL_CPU/KERNEL_CPU2/KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_CPU
+// registration for AutocastCPU
+#define KERNEL_CPU(OP, POLICY) KERNEL(c10::DeviceType::CPU, OP, POLICY)
+
+#define KERNEL_CPU2(OP, OVERLOAD, POLICY) \
+  KERNEL2(c10::DeviceType::CPU, OP, OVERLOAD, POLICY)
+
+#define KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_CPU( \
+    REDISPATCH_FUNC,                               \
+    REGISTER_NAME,                                 \
+    REGISTER_SIGNATURE,                            \
+    REDISPATCH_SIGNATURE,                          \
+    POLICY)                                        \
+  KERNEL_DIFFERENT_REDISPATCH_SIGNATURE(           \
+      c10::DeviceType::CPU,                        \
+      REDISPATCH_FUNC,                             \
+      REGISTER_NAME,                               \
+      REGISTER_SIGNATURE,                          \
+      REDISPATCH_SIGNATURE,                        \
+      POLICY)
+
+// KERNEL_CUDA/KERNEL_CUDA2/KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_CUDA
+// registration for AutocastCUDA
+#define KERNEL_CUDA(OP, POLICY) KERNEL(c10::DeviceType::CUDA, OP, POLICY)
+
+#define KERNEL_CUDA2(OP, OVERLOAD, POLICY) \
+  KERNEL2(c10::DeviceType::CUDA, OP, OVERLOAD, POLICY)
+
+#define KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_CUDA( \
+    REDISPATCH_FUNC,                                \
+    REGISTER_NAME,                                  \
+    REGISTER_SIGNATURE,                             \
+    REDISPATCH_SIGNATURE,                           \
+    POLICY)                                         \
+  KERNEL_DIFFERENT_REDISPATCH_SIGNATURE(            \
+      c10::DeviceType::CUDA,                        \
+      REDISPATCH_FUNC,                              \
+      REGISTER_NAME,                                \
+      REGISTER_SIGNATURE,                           \
+      REDISPATCH_SIGNATURE,                         \
+      POLICY)
+
+// KERNEL_PRIVATEUSEONE/KERNEL_PRIVATEUSEONE2/
+// KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_PRIVATEUSEONE
+// registration for AutocastPrivateUse1
+#define KERNEL_PRIVATEUSEONE(OP, POLICY) \
+  KERNEL(c10::DeviceType::PrivateUse1, OP, POLICY)
+
+#define KERNEL_PRIVATEUSEONE2(OP, OVERLOAD, POLICY) \
+  KERNEL2(c10::DeviceType::PrivateUse1, OP, OVERLOAD, POLICY)
+
+#define KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_PRIVATEUSEONE( \
+    REDISPATCH_FUNC,                                         \
+    REGISTER_NAME,                                           \
+    REGISTER_SIGNATURE,                                      \
+    REDISPATCH_SIGNATURE,                                    \
+    POLICY)                                                  \
+  KERNEL_DIFFERENT_REDISPATCH_SIGNATURE(                     \
+      c10::DeviceType::PrivateUse1,                          \
+      REDISPATCH_FUNC,                                       \
+      REGISTER_NAME,                                         \
+      REGISTER_SIGNATURE,                                    \
+      REDISPATCH_SIGNATURE,                                  \
+      POLICY)
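For context (not part of the patch), a minimal sketch of how the KERNEL_* macros above are typically consumed, assuming `TORCH_LIBRARY_IMPL` from `<torch/library.h>` and `ATEN_FN` from `<ATen/Operators.h>`; the chosen ops and policies are illustrative.

```cpp
#include <ATen/Operators.h>
#include <ATen/autocast_mode.h>
#include <torch/library.h>

TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) {
  // Run matmul in the lower-precision autocast dtype (bf16 by default on CPU).
  KERNEL_CPU(mm, lower_precision_fp)
  // Force cos to execute in fp32 under autocast.
  KERNEL_CPU(cos, fp32)
}
```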
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ceil_div.h b/MLPY/Lib/site-packages/torch/include/ATen/ceil_div.h
new file mode 100644
index 0000000000000000000000000000000000000000..7eb9940e57d8bd97cef964acb8650466d663da17
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/ceil_div.h
@@ -0,0 +1,24 @@
+#pragma once
+#include 
+#include 
+
+namespace at {
+
+/**
+   Computes ceil(a / b)
+*/
+template <typename T, typename = std::enable_if_t<std::is_integral_v<T>>>
+C10_ALWAYS_INLINE C10_HOST_DEVICE T ceil_div(T a, T b) {
+  return (a + b - 1) / b;
+}
+
+/**
+   Computes ceil(a / b) * b; i.e., rounds up `a` to the next highest
+   multiple of b
+*/
+template <typename T>
+C10_ALWAYS_INLINE C10_HOST_DEVICE T round_up(T a, T b) {
+  return ceil_div(a, b) * b;
+}
+
+} // namespace at
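A brief usage sketch (not part of the patch) of the two helpers above, e.g. for sizing a launch grid; the function names are illustrative.

```cpp
#include <ATen/ceil_div.h>

int64_t blocks_for(int64_t n, int64_t block) {
  return at::ceil_div(n, block);   // e.g. ceil_div(1000, 128) == 8
}

int64_t padded_size(int64_t n, int64_t block) {
  return at::round_up(n, block);   // e.g. round_up(1000, 128) == 1024
}
```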
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/code_template.h b/MLPY/Lib/site-packages/torch/include/ATen/code_template.h
new file mode 100644
index 0000000000000000000000000000000000000000..45872bb07daedbecfb59ca46dc6c507dc16a6aac
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/code_template.h
@@ -0,0 +1,243 @@
+#pragma once
+
+#include 
+
+#include 
+#include 
+#include 
+#include 
+
+namespace at::jit {
+
+// A template environment is a mapping from template variable names, e.g.,
+// identifier (corresponding to $identifier) to their expansions.
+//
+// This template environment supports storing strings, numbers and lists
+// of strings, and can be chained together (so that lookup proceeds in
+// in the top level environment, and then recurses into a parent
+// environment if the key is not found.)
+struct TemplateEnv {
+  TemplateEnv() = default;
+  TemplateEnv(TemplateEnv& parent) : parent(&parent) {}
+
+  using string_list = std::vector<std::string>;
+
+  // Add a string 'v' to the map at key 'k'.
+  void s(const std::string& k, const std::string& v) {
+    strings_[k] = v;
+    lists_.erase(k);
+  }
+
+  // Add a number 'v' to the map at key 'k'
+  template <typename T>
+  void d(const std::string& k, const T& v) {
+    strings_[k] = c10::to_string(v);
+    lists_.erase(k);
+  }
+
+  // Retrieve the string representation of the value stored at 'k' from the map.
+  // Raises an exception if the key is not found.
+  const std::string& s(const std::string& k) const {
+    if (strings_.count(k) == 0) {
+      if (parent) {
+        return parent->s(k);
+      }
+      notFound(k);
+    }
+    return strings_.at(k);
+  }
+
+  // Store a list of strings 'v' in the map at 'k'.
+  void v(const std::string& k, const string_list& v) {
+    lists_[k] = v;
+    strings_.erase(k);
+  }
+
+  // Retrieve a list of strings stored at 'k' from the map.
+  // Raises an exception if the key is not found.
+  const string_list& v(const std::string& k) const {
+    if (lists_.count(k) == 0) {
+      if (parent) {
+        return parent->v(k);
+      }
+      notFound(k);
+    }
+    return lists_.at(k);
+  }
+
+  // Test if the value stored at 'k' is a string (as opposed to a list).
+  bool keyIsString(const std::string& k) const {
+    if (strings_.count(k) > 0)
+      return true;
+    if (lists_.count(k) > 0)
+      return false;
+    if (parent)
+      return parent->keyIsString(k);
+    notFound(k);
+  }
+
+ private:
+  [[noreturn]] void notFound(const std::string& k) const {
+    std::stringstream ss;
+    ss << "key not found: " << k;
+    throw std::logic_error(ss.str());
+  }
+
+  std::unordered_map<std::string, std::string> strings_;
+  std::unordered_map<std::string, string_list> lists_;
+  TemplateEnv* parent{nullptr};
+};
+
+/*
+# Match $identifier or ${identifier} and replace with the value in env.
+# If this identifier is at the beginning of whitespace on a line
+# and its value is a list then it is treated as
+# block substitution by indenting all lines of all elements.
+# If the identifier is on a line starting with non-whitespace and a list
+# then it is comma separated. ${,foo} will insert a comma before the list
+# if this list is not empty and ${foo,} will insert one after.
+*/
+struct CodeTemplate {
+  /* implicit */ CodeTemplate(std::string t) : template_text(std::move(t)) {}
+
+  std::string format(const TemplateEnv& env) const {
+    std::stringstream out;
+    size_t pos = 0;
+    size_t indent = 0;
+    bool all_whitespace = true;
+    while (pos < template_text.size()) {
+      char c = template_text[pos];
+      if (c == '$') {
+        std::stringstream kss;
+        // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+        bool comma_before;
+        // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
+        bool comma_after;
+        size_t new_pos = parseKey(pos, kss, comma_before, comma_after);
+        std::string k = kss.str();
+        bool is_string = env.keyIsString(k);
+        if (all_whitespace) {
+          if (is_string)
+            emitStringWithIndents(out, indent, env.s(k));
+          else
+            emitLinesIndented(out, indent, env.v(k));
+        } else {
+          if (is_string)
+            out << env.s(k);
+          else
+            emitCommaSeparatedList(out, env.v(k), comma_before, comma_after);
+        }
+        all_whitespace = false;
+        pos = new_pos;
+      } else {
+        out << c;
+        if (!isspace(c))
+          all_whitespace = false;
+        indent++;
+        if (c == '\n') {
+          indent = 0;
+          all_whitespace = true;
+        }
+        pos++;
+      }
+    }
+    return out.str();
+  }
+
+ private:
+  using string_list = std::vector<std::string>;
+  char charAt(size_t p) const {
+    if (p >= template_text.size())
+      throw std::logic_error("EOS found in key");
+    return template_text[p];
+  }
+  size_t parseKey(
+      size_t pos,
+      std::ostream& k,
+      bool& comma_before,
+      bool& comma_after) const {
+    comma_before = false;
+    comma_after = false;
+    pos++;
+    if (charAt(pos) == '{') {
+      pos++;
+      if (charAt(pos) == ',') {
+        comma_before = true;
+        pos++;
+      }
+      pos = parseIdent(pos, k);
+      if (charAt(pos) == ',') {
+        comma_after = true;
+        pos++;
+      }
+      if (charAt(pos) != '}')
+        throw std::logic_error("missing terminating '}'");
+      pos++;
+      return pos;
+    } else {
+      return parseIdent(pos, k);
+    }
+  }
+  size_t parseIdent(size_t pos, std::ostream& k) const {
+    while (pos < template_text.size() &&
+           (isalnum(template_text[pos]) || template_text[pos] == '_')) {
+      k << template_text[pos];
+      pos++;
+    }
+    return pos;
+  }
+  void emitCommaSeparatedList(
+      std::ostream& out,
+      const string_list& strings,
+      bool comma_before,
+      bool comma_after) const {
+    if (comma_before && !strings.empty())
+      out << ", ";
+    for (const auto i : c10::irange(strings.size())) {
+      if (i > 0)
+        out << ", ";
+      out << strings[i];
+    }
+    if (comma_after && !strings.empty())
+      out << ", ";
+  }
+  // These indentation functions follow the convention that they never emit
+  // leading or trailing newlines when the input string does not have leading
+  // or trailing newlines. It's the responsibility of the calling function
+  // to indent correctly in the context.
+  void emitIndent(std::ostream& out, size_t indent) const {
+    for (C10_UNUSED const auto i : c10::irange(indent)) {
+      out << " ";
+    }
+  }
+  void emitStringWithIndents(
+      std::ostream& out,
+      size_t indent,
+      const std::string& str) const {
+    for (auto c : str) {
+      out << c;
+      if (c == '\n') {
+        emitIndent(out, indent);
+      }
+    }
+  }
+  void emitLinesIndented(
+      std::stringstream& out,
+      size_t indent,
+      const string_list& strings) const {
+    for (const auto i : c10::irange(strings.size())) {
+      if (i > 0)
+        emitIndent(out, indent);
+      emitStringWithIndents(out, indent, strings[i]);
+      if (i + 1 != strings.size())
+        out << "\n";
+    }
+  }
+  std::string template_text;
+};
+
+static inline std::string format(const std::string& fmt, TemplateEnv& env) {
+  return CodeTemplate(fmt).format(env);
+}
+
+} // namespace at::jit
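To illustrate the substitution rules described in the comment above, a small hypothetical sketch (not part of the patch); `emit_decl` and the template text are illustrative.

```cpp
#include <ATen/code_template.h>

std::string emit_decl() {
  at::jit::TemplateEnv env;
  env.s("name", "my_kernel");                         // $name -> string
  env.v("args", {"float* out", "const float* in"});   // ${args} -> comma-separated list
  env.d("n", 4);                                      // $n -> number stored as a string
  at::jit::CodeTemplate decl("void $name(${args});  // unroll factor $n");
  return decl.format(env);
  // -> "void my_kernel(float* out, const float* in);  // unroll factor 4"
}
```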
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/ATenGeneral.h b/MLPY/Lib/site-packages/torch/include/ATen/core/ATenGeneral.h
new file mode 100644
index 0000000000000000000000000000000000000000..8f411e535837a17c272762ccbd2714e15a1466cd
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/ATenGeneral.h
@@ -0,0 +1,3 @@
+#pragma once
+
+#include 
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/ATenOpList.h b/MLPY/Lib/site-packages/torch/include/ATen/core/ATenOpList.h
new file mode 100644
index 0000000000000000000000000000000000000000..6dfed2b9398544bb43938cdcc8243cfb10d9be32
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/ATenOpList.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include 
+
+namespace c10 {
+struct OperatorName;
+}
+
+namespace at {
+
+// check if an op is a custom op (i.e. did not come from native_functions.yaml)
+TORCH_API bool is_custom_op(const c10::OperatorName& opName);
+}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/ATen_fwd.h b/MLPY/Lib/site-packages/torch/include/ATen/core/ATen_fwd.h
new file mode 100644
index 0000000000000000000000000000000000000000..263e339c5bd6c7d4362771bc078ca8d980e042ec
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/ATen_fwd.h
@@ -0,0 +1,46 @@
+#pragma once
+#include 
+
+// Forward declarations of core ATen types used in dispatch functions
+namespace c10 {
+
+template<typename T>
+class List;
+template<typename T>
+class IListRef;
+class Stream;
+class Scalar;
+class SymInt;
+class SymIntList;
+struct Storage;
+struct TensorOptions;
+template <typename T>
+class ArrayRef;
+template <typename T>
+class OptionalArrayRef;
+
+}  // namespace c10
+
+namespace at {
+
+class Tensor;
+class OptionalTensorRef;
+struct Dimname;
+struct Generator;
+using TensorList = c10::ArrayRef<Tensor>;
+using ITensorListRef = c10::IListRef<Tensor>;
+using IOptTensorListRef = c10::IListRef<OptionalTensorRef>;
+using DimnameList = c10::ArrayRef<Dimname>;
+using IntArrayRef = c10::ArrayRef<int64_t>;
+using OptionalIntArrayRef = c10::OptionalArrayRef<int64_t>;
+using OptionalSymIntArrayRef = c10::OptionalArrayRef<c10::SymInt>;
+
+using c10::Stream;
+using c10::Storage;
+using c10::QScheme;
+using c10::Scalar;
+using c10::SymInt;
+using c10::SymIntList;
+using c10::TensorOptions;
+
+}  // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/ATen_pch.h b/MLPY/Lib/site-packages/torch/include/ATen/core/ATen_pch.h
new file mode 100644
index 0000000000000000000000000000000000000000..a0f32460ebe4ead8ca3c01d2c1f58bfce900942a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/ATen_pch.h
@@ -0,0 +1,165 @@
+// This global header must not depend on native_functions.yaml or
+// incremental builds will be next to useless
+#pragma push_macro("TORCH_ASSERT_NO_OPERATORS")
+#define TORCH_ASSERT_NO_OPERATORS
+
+// This macro doesn't work if defined after the first time inttypes.h
+// is included, so won't work anywhere if not defined here.
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+#include 
+
+// This list of headers was generated using a script that finds
+// high-impact headers and then manually tweaked to remove OS specific
+// or duplicate headers (e.g.  and ) and to remove
+// "impl" headers (e.g BFloat16-inl.h or complex_math.h in c10).
+
+// To generate the initial list:
+// 1. Build pytorch from scratch with all build caching disabled
+// 2. Generate a build trace with ninjatracing (https://github.com/nico/ninjatracing)
+//    $ ninjatracing /path/to/pytorch/build/.ninja_log > trace_all.json
+// 3. Run pch_gen.py from https://github.com/peterbell10/build_analysis/
+//    $ python pch_gen.py --threshold .80 --target torch_cpu --build_dir /path/to/pytorch/build --trace trace_all.json
+//    Where the threshold can be tweaked until c10 and some of ATen
+//    core are included but TORCH_ASSERT_NO_OPERATORS still passes.
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#pragma pop_macro("TORCH_ASSERT_NO_OPERATORS")
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/Array.h b/MLPY/Lib/site-packages/torch/include/ATen/core/Array.h
new file mode 100644
index 0000000000000000000000000000000000000000..c81a3cffbfd59e277bc5b7be6b1aca77db52c1ce
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/Array.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// A fixed-size array type usable from both host and
+// device code.
+
+#include 
+#include 
+
+namespace at { namespace detail {
+
+template <typename T, int size_>
+struct Array {
+  T data[size_];
+
+  C10_HOST_DEVICE T operator[](int i) const {
+    return data[i];
+  }
+  C10_HOST_DEVICE T& operator[](int i) {
+    return data[i];
+  }
+#if defined(USE_ROCM)
+  C10_HOST_DEVICE Array() = default;
+  C10_HOST_DEVICE Array(const Array&) = default;
+  C10_HOST_DEVICE Array& operator=(const Array&) = default;
+#else
+  Array() = default;
+  Array(const Array&) = default;
+  Array& operator=(const Array&) = default;
+#endif
+  static constexpr int size(){return size_;}
+  // Fill the array with x.
+  C10_HOST_DEVICE Array(T x) {
+    for (int i = 0; i < size_; i++) {
+      data[i] = x;
+    }
+  }
+};
+
+}}
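A short sketch (not part of the patch) of the fixed-size array above, e.g. for passing a small shape to a kernel by value; `make_shape` is an illustrative name.

```cpp
#include <ATen/core/Array.h>

at::detail::Array<int, 3> make_shape() {
  at::detail::Array<int, 3> shape(1);   // fill constructor: {1, 1, 1}
  shape[0] = 128;
  static_assert(at::detail::Array<int, 3>::size() == 3, "compile-time size");
  return shape;
}
```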
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/Backtrace.h b/MLPY/Lib/site-packages/torch/include/ATen/core/Backtrace.h
new file mode 100644
index 0000000000000000000000000000000000000000..684825dc2ba32d0dd84284f08591ec0ec314980f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/Backtrace.h
@@ -0,0 +1,2 @@
+#include 
+#include 
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/CheckMemoryFormat.h b/MLPY/Lib/site-packages/torch/include/ATen/core/CheckMemoryFormat.h
new file mode 100644
index 0000000000000000000000000000000000000000..ce83c43497192016cdb26de022a52ee30020b28b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/CheckMemoryFormat.h
@@ -0,0 +1,25 @@
+#include 
+
+namespace c10 { namespace impl {
+
+inline c10::optional<MemoryFormat>
+check_tensor_options_and_extract_memory_format(
+    const TensorOptions& options,
+    c10::optional<MemoryFormat> memory_format) {
+  TORCH_CHECK(
+      options.requires_grad_opt() == c10::nullopt ||
+      options.requires_grad_opt().value() == false,
+      "Operators taking TensorOptions cannot take a TensorOptions with "
+      "options.requires_grad set as true. This isn't implemented yet.");
+  TORCH_CHECK(
+      !(options.has_memory_format() && memory_format.has_value()),
+      "Cannot set memory_format both in TensorOptions and explicit argument; please delete "
+      "the redundant setter.");
+  if (memory_format.has_value()) {
+    return memory_format;
+  } else {
+    return options.memory_format_opt();
+  }
+}
+
+}} // namespace impl namespace c10
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/DeprecatedTypeProperties.h b/MLPY/Lib/site-packages/torch/include/ATen/core/DeprecatedTypeProperties.h
new file mode 100644
index 0000000000000000000000000000000000000000..5c95fc31149c7cde43bf62d114408e17270cef62
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/DeprecatedTypeProperties.h
@@ -0,0 +1,139 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+
+namespace at {
+
+class Tensor;
+
+// This class specifies a Backend and a ScalarType. Currently, it primarily
+// serves as a replacement return value for Tensor::type(). Previously,
+// Tensor::type() returned Type&, but we are changing Type to not be
+// dtype-specific.
+class TORCH_API DeprecatedTypeProperties {
+ public:
+  DeprecatedTypeProperties(Backend backend, ScalarType scalar_type)
+    : backend_(backend), scalar_type_(scalar_type) {}
+
+  Backend backend() const {
+    return backend_;
+  }
+
+  Layout layout() const {
+    return layout_from_backend(backend_);
+  }
+
+  bool is_sparse() const {
+    return layout_from_backend(backend()) == kSparse;
+  }
+
+  bool is_sparse_csr() const {
+    return layout_from_backend(backend()) == kSparseCsr;
+  }
+
+  c10::DeviceType device_type() const {
+    return backendToDeviceType(backend_);
+  }
+
+  bool is_cuda() const {
+    return backendToDeviceType(backend_) == kCUDA;
+  }
+
+  ScalarType scalarType() const {
+    return scalar_type_;
+  }
+
+  caffe2::TypeMeta typeMeta() const {
+    return scalarTypeToTypeMeta(scalar_type_);
+  }
+
+  bool operator==(const DeprecatedTypeProperties& other) const {
+    return backend_ == other.backend() && scalar_type_ == other.scalarType();
+  }
+
+  bool operator!=(const DeprecatedTypeProperties& other) const {
+    return !(*this == other);
+  }
+
+  std::string toString() const {
+    std::string base_str;
+    if (backend_ == Backend::Undefined || scalar_type_ == ScalarType::Undefined) {
+      base_str = "UndefinedType";
+    } else {
+      base_str = std::string(at::toString(backend_)) + at::toString(scalar_type_) + "Type";
+    }
+    return base_str;
+  }
+
+  DeprecatedTypeProperties & toBackend(Backend b) const {
+    return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
+        b, scalar_type_);
+  }
+
+  DeprecatedTypeProperties & toScalarType(ScalarType s) const {
+    return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
+        backend_, s);
+  }
+
+  DeprecatedTypeProperties & cpu() const {
+    return toBackend(Backend::CPU);
+  }
+
+  DeprecatedTypeProperties & cuda() const {
+    return toBackend(Backend::CUDA);
+  }
+
+  DeprecatedTypeProperties & hip() const {
+    return toBackend(Backend::HIP);
+  }
+
+  DeprecatedTypeProperties & privateUser1() const {
+    return toBackend(Backend::PrivateUse1);
+  }
+
+  /// Constructs the `TensorOptions` from a type and a `device_index`.
+  TensorOptions options(int16_t device_index = -1) const {
+    return TensorOptions().dtype(typeMeta())
+                          .device(device_type(), static_cast<c10::DeviceIndex>(device_index))
+                          .layout(layout());
+  }
+
+  /// Constructs the `TensorOptions` from a type and a Device.  Asserts that
+  /// the device type matches the device type of the type.
+  TensorOptions options(c10::optional<Device> device_opt) const {
+    if (!device_opt.has_value()) {
+      return options(-1);
+    } else {
+      Device device = device_opt.value();
+      AT_ASSERT(device.type() == device_type());
+      return options(device.index());
+    }
+  }
+
+  operator TensorOptions() const {
+    return options();
+  }
+
+  int64_t id() const {
+    return static_cast<int64_t>(backend()) *
+        static_cast<int64_t>(ScalarType::NumOptions) +
+        static_cast<int64_t>(scalarType());
+  }
+
+  Tensor unsafeTensorFromTH(void * th_pointer, bool retain) const;
+  Storage unsafeStorageFromTH(void * th_pointer, bool retain) const;
+  Tensor copy(const Tensor & src, bool non_blocking=false, c10::optional<c10::Device> to_device={}) const;
+
+ private:
+  Backend backend_;
+  ScalarType scalar_type_;
+};
+
+}  // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/DeprecatedTypePropertiesRegistry.h b/MLPY/Lib/site-packages/torch/include/ATen/core/DeprecatedTypePropertiesRegistry.h
new file mode 100644
index 0000000000000000000000000000000000000000..fcf7a88f8d0ad26f1dc21a687a6696f48108af5f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/DeprecatedTypePropertiesRegistry.h
@@ -0,0 +1,32 @@
+#pragma once
+
+// In order to preserve bc, we make DeprecatedTypeProperties instances unique
+// just like they are for Type.
+
+#include 
+#include 
+#include 
+
+namespace at {
+
+class DeprecatedTypeProperties;
+
+struct TORCH_API DeprecatedTypePropertiesDeleter {
+  void operator()(DeprecatedTypeProperties * ptr);
+};
+
+class TORCH_API DeprecatedTypePropertiesRegistry {
+ public:
+  DeprecatedTypePropertiesRegistry();
+
+  DeprecatedTypeProperties& getDeprecatedTypeProperties(Backend p, ScalarType s) const;
+
+private:
+  std::unique_ptr<DeprecatedTypeProperties, DeprecatedTypePropertiesDeleter> registry
+    [static_cast<int>(Backend::NumOptions)]
+    [static_cast<int>(ScalarType::NumOptions)];
+};
+
+TORCH_API DeprecatedTypePropertiesRegistry& globalDeprecatedTypePropertiesRegistry();
+
+} // namespace at
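A short sketch (assumed usage, not part of the header) of how the registry hands out the canonical per-(Backend, ScalarType) instance:

```cpp
#include <ATen/core/DeprecatedTypeProperties.h>
#include <ATen/core/DeprecatedTypePropertiesRegistry.h>

// One instance exists per (Backend, ScalarType) pair, so reference identity
// of the returned objects behaves like the old per-Type uniqueness.
at::DeprecatedTypeProperties& cpu_float =
    at::globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
        at::Backend::CPU, at::ScalarType::Float);
```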
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/Dict.h b/MLPY/Lib/site-packages/torch/include/ATen/core/Dict.h
new file mode 100644
index 0000000000000000000000000000000000000000..7808d52d32f9348b96b2195119a3255cd4f9b276
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/Dict.h
@@ -0,0 +1,397 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace c10 {
+struct IValue;
+template<class Key, class Value> class Dict;
+struct Type;
+
+namespace impl {
+
+using valid_dict_key_types = guts::typelist::typelist<
+  int64_t,
+  std::string,
+  double,
+  c10::complex<double>,
+  bool,
+  at::Tensor
+>;
+}
+
+namespace detail {
+
+struct DictKeyHash {
+  size_t operator()(const IValue& ivalue) const;
+};
+
+struct DictKeyEqualTo {
+  bool operator()(const IValue& lhs, const IValue& rhs) const;
+};
+
+struct DictImpl final : public c10::intrusive_ptr_target {
+  using dict_map_type = ska_ordered::order_preserving_flat_hash_map<IValue, IValue, DictKeyHash, DictKeyEqualTo>;
+  struct DictElementTypes final {
+    TypePtr keyType;
+    TypePtr valueType;
+  };
+
+  explicit DictImpl(dict_map_type dict_, DictElementTypes elementTypes_)
+  : dict(std::move(dict_))
+  , elementTypes(std::move(elementTypes_)) {}
+  dict_map_type dict;
+
+  DictElementTypes elementTypes;
+
+  intrusive_ptr<DictImpl> copy() const;
+  friend TORCH_API bool operator==(const DictImpl& lhs, const DictImpl& rhs);
+};
+
+}
+
+namespace impl {
+template<class Key, class Value, class Iterator> class DictIterator;
+
+/**
+ * A reference to an entry in the Dict.
+ * Use the `key()` and `value()` methods to read the element.
+ */
+template<class Key, class Value, class Iterator>
+class DictEntryRef final {
+public:
+  explicit DictEntryRef(Iterator iterator)
+  : iterator_(std::move(iterator)) {}
+
+  decltype(auto) key() const {
+    return iterator_->first.template to<Key>();
+  }
+
+  decltype(auto) value() const {
+    return iterator_->second.template to<Value>();
+  }
+
+  template<class Value_>
+  void setValue(Value_&& value) const {
+    static_assert(std::is_constructible<Value, Value_>::value, "Wrong type for the value argument of setValue()");
+    iterator_->second = Value(std::forward<Value_>(value));
+  }
+
+private:
+  // allow copying and moving, but only our friends (i.e. the Dict class) can do
+  // it. Copying/moving this reference wrapper would be too ambiguous to allow it
+  // in the public API.
+  DictEntryRef(const DictEntryRef&) = default;
+  DictEntryRef& operator=(const DictEntryRef&) = default;
+  DictEntryRef(DictEntryRef&&) noexcept = default;
+  DictEntryRef& operator=(DictEntryRef&& rhs) & noexcept = default;
+
+  Iterator iterator_;
+  friend class DictIterator<Key, Value, Iterator>;
+  friend class Dict<Key, Value>;
+};
+
+// this wraps map_type::iterator to make sure user code can't rely
+// on it being the type of the underlying map.
+template<class Key, class Value, class Iterator>
+class DictIterator final {
+public:
+   // C++17 friendly std::iterator implementation
+  using iterator_category = std::forward_iterator_tag;
+  using value_type = DictEntryRef<Key, Value, Iterator>;
+  using difference_type = std::ptrdiff_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+
+  explicit DictIterator() = default;
+  ~DictIterator() = default;
+
+  DictIterator(const DictIterator& rhs): entryRef_(rhs.entryRef_) {}
+  DictIterator(DictIterator&& rhs) noexcept: entryRef_(std::move(rhs.entryRef_)) {}
+  DictIterator& operator=(const DictIterator& rhs) {
+    entryRef_ = rhs.entryRef_;
+    return *this;
+  }
+  DictIterator& operator=(DictIterator&& rhs) noexcept {
+    entryRef_ = std::move(rhs.entryRef_);
+    return *this;
+  }
+
+  DictIterator& operator++() {
+      ++entryRef_.iterator_;
+      return *this;
+  }
+
+  DictIterator operator++(int) {
+      DictIterator copy(*this);
+      ++*this;
+      return copy;
+  }
+
+  const DictEntryRef<Key, Value, Iterator>& operator*() const {
+      return entryRef_;
+  }
+
+  const DictEntryRef<Key, Value, Iterator>* operator->() const {
+    return &entryRef_;
+  }
+
+  friend difference_type operator-(const DictIterator& lhs, const DictIterator& rhs) {
+    return lhs.entryRef_.iterator_ - rhs.entryRef_.iterator_;
+  }
+
+private:
+  explicit DictIterator(Iterator iterator): entryRef_(std::move(iterator)) {}
+
+  const Iterator& get_iterator_() const {
+    return entryRef_.iterator_;
+  }
+
+  friend bool operator==(const DictIterator& lhs, const DictIterator& rhs) {
+    return lhs.get_iterator_() == rhs.get_iterator_();
+  }
+
+  friend bool operator!=(const DictIterator& lhs, const DictIterator& rhs) {
+    return lhs.get_iterator_() != rhs.get_iterator_();
+  }
+
+  friend bool operator<(const DictIterator& lhs, const DictIterator& rhs) {
+    return lhs.get_iterator_() < rhs.get_iterator_();
+  }
+
+  friend bool operator<=(const DictIterator& lhs, const DictIterator& rhs) {
+    return lhs.get_iterator_() <= rhs.get_iterator_();
+  }
+
+  friend bool operator>(const DictIterator& lhs, const DictIterator& rhs) {
+    return lhs.get_iterator_() > rhs.get_iterator_();
+  }
+
+  friend bool operator>=(const DictIterator& lhs, const DictIterator& rhs) {
+    return lhs.get_iterator_() >= rhs.get_iterator_();
+  }
+
+  DictEntryRef<Key, Value, Iterator> entryRef_;
+
+  friend class DictIterator<Key, Value, typename c10::detail::DictImpl::dict_map_type::iterator>;
+  friend class Dict<Key, Value>;
+};
+
+template<class Key, class Value> Dict<Key, Value> toTypedDict(Dict<IValue, IValue> dict);
+template<class Key, class Value> Dict<IValue, IValue> toGenericDict(Dict<Key, Value> dict);
+}
+
+/**
+ * An object of this class stores a map from Key to Value.
+ *
+ * This is a pointer type. After a copy, both Dicts
+ * will share the same storage:
+ *
+ * > Dict<int, string> a;
+ * > Dict<int, string> b = a;
+ * > b.insert(3, "three");
+ * > ASSERT("three" == a.at(3));
+ *
+ * We use this class in the PyTorch kernel API because that
+ * allows us to do optimizations and switch out the underlying
+ * map implementation without breaking backwards compatibility
+ * for the kernel API.
+ */
+template<class Key, class Value>
+class Dict final {
+private:
+  static_assert((std::is_same<IValue, Key>::value && std::is_same<IValue, Value>::value) || guts::typelist::contains<impl::valid_dict_key_types, Key>::value, "Invalid Key type for Dict. We only support int64_t, double, bool, and string.");
+
+  // impl_ stores the underlying map as a ska_ordered::order_preserving_flat_hash_map.
+  // We intentionally don't offer conversion from/to
+  // order_preserving_flat_hash_map, return references to it or something like that,
+  // because such operations would get expensive if we switch out
+  // the actual map implementation.
+  // This is an intrusive_ptr because Dict is a pointer type.
+  // Invariant: This will never be a nullptr, there will always be a valid
+  // DictImpl.
+  c10::intrusive_ptr<detail::DictImpl> impl_;
+
+  explicit Dict(c10::intrusive_ptr<detail::DictImpl>&& impl);
+  friend struct IValue;
+  template<class K, class V> friend Dict<K, V> impl::toTypedDict(Dict<IValue, IValue>);
+  template<class K, class V> friend Dict<IValue, IValue> impl::toGenericDict(Dict<K, V>);
+
+public:
+  using key_type = Key;
+  using mapped_type = Value;
+  using size_type = typename detail::DictImpl::dict_map_type::size_type;
+  using iterator = impl::DictIterator<Key, Value, typename detail::DictImpl::dict_map_type::iterator>;
+
+  /**
+   * Creates an empty dict.
+   */
+  explicit Dict();
+
+  /**
+   * Create a generic dict with runtime type information.
+   * This only works for c10::impl::GenericDict and is not part of the public API
+   * but only supposed to be used internally by PyTorch.
+   */
+  explicit Dict(TypePtr keyType, TypePtr valueType);
+
+  ~Dict() = default;
+
+  Dict(const Dict&) = default;
+  Dict& operator=(const Dict&) = default;
+
+  /**
+   * Create a new Dict pointing to a deep copy of the same data.
+   * The Dict returned is a new dict with separate storage.
+   * Changes in it are not reflected in the original dict or vice versa.
+   */
+  Dict copy() const;
+
+  /**
+   * Returns an iterator to the first element of the container.
+   * If the container is empty, the returned iterator will be equal to end().
+   */
+  iterator begin() const;
+
+  /**
+   * Returns an iterator to the element following the last element of the container.
+   * This element acts as a placeholder; attempting to access it results in undefined behavior.
+   */
+  iterator end() const;
+
+  /**
+   * Checks if the container has no elements.
+   */
+  bool empty() const;
+
+  /**
+   * Returns the number of elements in the container.
+   */
+  size_type size() const;
+
+  /**
+   * Erases all elements from the container. After this call, size() returns zero.
+   * Invalidates any references, pointers, or iterators referring to contained elements. May also invalidate past-the-end iterators.
+   */
+  void clear() const;
+
+  /**
+   * Inserts element(s) into the container, if the container doesn't already contain an element with an equivalent key.
+   * May invalidate any references, pointers, or iterators referring to contained elements.
+   *
+   * @return A pair consisting of an iterator to the inserted element (or to the element that prevented the insertion) and a bool denoting whether the insertion took place.
+   */
+  template<class Key_, class Value_>
+  std::pair<iterator, bool> insert(Key_&& key, Value_&& value) const;
+
+  /**
+   * If an element with the given key already exists, it is overwritten with the given value.
+   * Otherwise, a new element with the given key and value are inserted.
+   * May invalidate any references, pointers, or iterators referring to contained elements.
+   *
+   * @return The bool component is true if the insertion took place and false if the assignment took place. The iterator component is pointing at the element that was inserted or updated.
+   */
+  template<class Key_, class Value_>
+  std::pair<iterator, bool> insert_or_assign(Key_&& key, Value_&& value) const;
+
+  /**
+   * Removes the element pointed to by iter.
+   * May invalidate any references, pointers, or iterators referring to contained elements.
+   * The iterator iter must be valid and dereferenceable. Thus the end() iterator (which is valid, but is not dereferenceable) cannot be used as a value for iter.
+   */
+  void erase(iterator iter) const;
+
+  /**
+   * Removes the element with the given key, if it exists.
+   * May invalidate any references, pointers, or iterators referring to contained elements.
+   *
+   * @return The number of elements removed. This is either '1' if an element with the key existed, or '0' if it didn't.
+   */
+  C10_NODISCARD size_t erase(const Key& key) const;
+
+  /**
+   * Returns the mapped value of the element with key equivalent to key.
+   * If no such element exists, an exception of type std::out_of_range is thrown.
+   */
+  Value at(const Key& key) const;
+
+  /**
+   * Finds an element with key equivalent to key.
+   *
+   * @return Iterator to an element with key equivalent to key.
+   *         If no such element is found, past-the-end (see end()) iterator is returned.
+   */
+  iterator find(const Key& key) const;
+
+  /**
+   * Checks if there is an element with key equivalent to key in the container.
+   *
+   * @return true if there is such an element, otherwise false.
+   */
+  bool contains(const Key& key) const;
+
+  /**
+   * Increase the capacity so that at least count elements can be stored without
+   * having to reallocate or rehash.
+   */
+  void reserve(size_type count) const;
+
+  /**
+   * Value equality comparison. This function implements Python-like semantics for
+   * equality: two dicts with the same identity (e.g. same pointer) trivially
+   * compare equal, otherwise each element is compared for equality.
+   */
+  template <class Key_, class Value_>
+  friend bool operator==(
+      const Dict<Key_, Value_>& lhs,
+      const Dict<Key_, Value_>& rhs);
+  template <class Key_, class Value_>
+  friend bool operator!=(
+      const Dict<Key_, Value_>& lhs,
+      const Dict<Key_, Value_>& rhs);
+
+  /**
+   * Identity comparison. Returns true if and only if `rhs` represents the same
+   * Dict object as `this`.
+   */
+  bool is(const Dict& rhs) const;
+
+  // private API for now because the return type will change to TypePtr
+  // instead of optional once types are mandatory.
+  TypePtr keyType() const;
+  TypePtr valueType() const;
+
+  // [unsafe set type]
+  // These functions mutate the tagged type of this dictionary in place.
+  // There is no checking that the members of the dictionary are instances
+  // of the new types, nor is there a check that other IValues which
+  // hold references to this dictionary have the right static type.
+  // This functionality is used only in the unpickler, where at
+  // creation type the real type of the dictionary is unknown, but
+  // then later recovered from the static type information of the
+  // unpickled object.
+  void unsafeSetKeyType(TypePtr t);
+  void unsafeSetValueType(TypePtr t);
+};
+
+namespace impl {
+// GenericDict is how IValue stores dicts. It is, however, not part of the
+// public API. Kernels should use Dicts with concrete Key, Value types instead
+// (maybe except for some internal prim ops).
+using GenericDict = Dict<IValue, IValue>;
+
+}
+}
+
+namespace torch {
+  template<class Key, class Value> using Dict = c10::Dict<Key, Value>;
+}
+
+#include <ATen/core/Dict_inl.h>  // IWYU pragma: keep
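A hedged usage sketch (not part of the header) illustrating the pointer semantics and `copy()` behavior described in the class comment above:

```cpp
#include <ATen/core/Dict.h>
#include <string>

void dict_semantics() {
  c10::Dict<int64_t, std::string> a;
  a.insert(3, "three");

  c10::Dict<int64_t, std::string> b = a;          // shares storage: pointer semantics
  b.insert_or_assign(4, "four");
  bool visible_through_a = a.contains(4);         // true

  c10::Dict<int64_t, std::string> c = a.copy();   // independent deep copy
  c.insert(5, "five");
  bool a_unchanged = !a.contains(5);              // true
}
```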
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/Dict_inl.h b/MLPY/Lib/site-packages/torch/include/ATen/core/Dict_inl.h
new file mode 100644
index 0000000000000000000000000000000000000000..9340a06ac7479838f30d26dafb30beb16c2f4c9d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/Dict_inl.h
@@ -0,0 +1,209 @@
+#pragma once
+
+#include 
+#include 
+
+namespace c10 {
+namespace detail {
+inline bool DictKeyEqualTo::operator()(const IValue& lhs, const IValue& rhs) const {
+  if (lhs.isTensor() && rhs.isTensor()) {
+    // for tensors, we compare only by identity (following how it's done in Python).
+    return lhs.is(rhs);
+  }
+  // Otherwise, we first compare by identity for efficiency, then by value (see:
+  // [container equality])
+  return _fastEqualsForContainer(lhs, rhs);
+}
+}
+
+template<class T> decltype(auto) getTypePtr();
+std::string toString(const Type& type);
+
+namespace impl {
+
+template<class Key, class Value>
+Dict<Key, Value> toTypedDict(GenericDict dict) {
+  TORCH_INTERNAL_ASSERT(*getTypePtr<Key>() == *dict.impl_->elementTypes.keyType, "Tried to cast a Dict<", toString(*dict.impl_->elementTypes.keyType), ", ", toString(*dict.impl_->elementTypes.valueType) ,"> to a Dict<", toString(*getTypePtr<Key>()), ", ", toString(*getTypePtr<Value>()), ">. Key types mismatch.");
+  TORCH_INTERNAL_ASSERT(*getTypePtr<Value>() == *dict.impl_->elementTypes.valueType, "Tried to cast a Dict<", toString(*dict.impl_->elementTypes.keyType), ", ", toString(*dict.impl_->elementTypes.valueType) ,"> to a Dict<", toString(*getTypePtr<Key>()), ", ", toString(*getTypePtr<Value>()), ">. Value types mismatch.");
+
+  return Dict<Key, Value>(std::move(dict.impl_));
+}
+
+template<class Key, class Value>
+GenericDict toGenericDict(Dict<Key, Value> dict) {
+  return GenericDict(std::move(dict.impl_));
+}
+}
+
+namespace detail {
+
+inline size_t DictKeyHash::operator()(const IValue& ivalue) const {
+  if (ivalue.isInt()) {
+    return std::hash<int64_t>()(ivalue.toInt());
+  } else if (ivalue.isString()) {
+    return std::hash<c10::string_view>()(ivalue.toStringView());
+  } else if (ivalue.isDouble()) {
+    return std::hash<double>()(ivalue.toDouble());
+  } else if (ivalue.isComplexDouble()) {
+    return c10::hash<c10::complex<double>>()(ivalue.toComplexDouble());
+  } else if (ivalue.isBool()) {
+    return std::hash<bool>()(ivalue.toBool());
+  } else if (ivalue.isTensor()) {
+    return std::hash<TensorImpl*>()(ivalue.toTensor().unsafeGetTensorImpl());
+  } else if (ivalue.isDevice()) {
+    return std::hash<Device>()(ivalue.toDevice());
+  } else {
+    throw std::runtime_error(
+        "Can't hash IValues with tag '" + ivalue.tagKind() + "'");
+  }
+}
+
+inline intrusive_ptr<DictImpl> DictImpl::copy() const {
+  return make_intrusive<DictImpl>(dict, elementTypes);
+}
+
+}
+
+template<class Key, class Value>
+Dict<Key, Value>::Dict()
+  :Dict(make_intrusive<detail::DictImpl>(
+      detail::DictImpl::dict_map_type(),
+      detail::DictImpl::DictElementTypes{getTypePtr<Key>(), getTypePtr<Value>()})) {
+  static_assert(!std::is_same<Key, IValue>::value, "This constructor is not valid for Dict<IValue, _>. Please use c10::impl::GenericDict(keyType, valueType) instead.");
+  static_assert(!std::is_same<Value, IValue>::value, "This constructor is not valid for Dict<_, IValue>. Please use c10::impl::GenericDict(keyType, valueType) instead.");
+}
+
+template
+Dict::Dict(TypePtr keyType, TypePtr valueType)
+: Dict(make_intrusive(
+    detail::DictImpl::dict_map_type(),
+    detail::DictImpl::DictElementTypes {std::move(keyType), std::move(valueType)})) {
+  static_assert(std::is_same::value, "This constructor is only valid for c10::impl::GenericDict.");
+  static_assert(std::is_same::value, "This constructor is only valid for c10::impl::GenericDict.");
+}
+
+template
+Dict::Dict(c10::intrusive_ptr&& impl): impl_(std::move(impl)) {}
+
+template
+Dict Dict::copy() const {
+  return Dict(impl_->copy());
+}
+
+template
+typename Dict::iterator Dict::begin() const {
+  return iterator{impl_->dict.begin()};
+}
+
+template
+typename Dict::iterator Dict::end() const {
+  return iterator{impl_->dict.end()};
+}
+
+template
+bool Dict::empty() const {
+  return impl_->dict.empty();
+}
+
+template
+typename Dict::size_type Dict::size() const {
+  return impl_->dict.size();
+}
+
+template
+void Dict::clear() const {
+  impl_->dict.clear();
+}
+
+template
+template
+std::pair::iterator, bool> Dict::insert(Key_&& key, Value_&& value) const {
+  static_assert(std::is_constructible::value, "Wrong type for the key argument of Dict::insert");
+  static_assert(std::is_constructible::value, "Wrong type for the value argument of Dict::insert");
+  auto inserted = impl_->dict.emplace(
+      Key(std::forward(key)),
+      Value(std::forward(value)));
+  return {iterator{inserted.first}, inserted.second};
+}
+
+template
+template
+std::pair::iterator, bool> Dict::insert_or_assign(Key_&& key, Value_&& value) const {
+  static_assert(std::is_constructible::value, "Wrong type for the key argument of Dict::insert_or_assign");
+  static_assert(std::is_constructible::value, "Wrong type for the value argument of Dict::insert_or_assign");
+  auto inserted = impl_->dict.insert_or_assign(
+    Key(std::forward(key)),
+    Value(std::forward(value)));
+  return {iterator{inserted.first}, inserted.second};
+}
+
+template
+void Dict::erase(iterator iter) const {
+  impl_->dict.erase(iter.entryRef_.iterator_);
+}
+
+template
+C10_NODISCARD size_t Dict::erase(const Key& key) const {
+  return impl_->dict.erase(key);
+}
+
+template
+Value Dict::at(const Key& key) const {
+  return impl_->dict.at(key).template to();
+}
+
+template
+typename Dict::iterator Dict::find(const Key& key) const {
+  return iterator{impl_->dict.find(key)};
+}
+
+template
+bool Dict::contains(const Key& key) const {
+  return end() != find(key);
+}
+
+template
+void Dict::reserve(size_type count) const {
+  impl_->dict.reserve(count);
+}
+
+template
+TypePtr Dict::keyType() const {
+  return impl_->elementTypes.keyType;
+}
+
+template
+TypePtr Dict::valueType() const {
+  return impl_->elementTypes.valueType;
+}
+template 
+void Dict::unsafeSetKeyType(TypePtr t) {
+  impl_->elementTypes.keyType = std::move(t);
+}
+
+template 
+void Dict::unsafeSetValueType(TypePtr t) {
+  impl_->elementTypes.valueType = std::move(t);
+}
+
+template 
+bool operator==(const Dict& lhs, const Dict& rhs) {
+  // Dicts with the same identity trivially compare equal.
+  if (lhs.impl_ == rhs.impl_) {
+    return true;
+  }
+
+  // Otherwise compare the values
+  return *lhs.impl_ == *rhs.impl_;
+}
+
+template 
+bool operator!=(const Dict& lhs, const Dict& rhs) {
+  return !(lhs == rhs);
+}
+
+template 
+bool Dict::is(const Dict& rhs) const {
+  return this->impl_ == rhs.impl_;
+}
+}
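A small sketch (assumed usage) of the key semantics implemented above: tensor keys are hashed and compared by identity, mirroring Python dict behavior:

```cpp
#include <ATen/ATen.h>
#include <ATen/core/Dict.h>

void tensor_keyed_dict() {
  c10::Dict<at::Tensor, int64_t> d;
  at::Tensor k = at::zeros({2});
  d.insert(k, 1);
  bool same_object = d.contains(k);                // true: same tensor identity
  bool equal_values = d.contains(at::zeros({2}));  // false: distinct tensor, even if equal
}
```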
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/DimVector.h b/MLPY/Lib/site-packages/torch/include/ATen/core/DimVector.h
new file mode 100644
index 0000000000000000000000000000000000000000..9d0318b7e3bd6b6207c9b2e333b6fdf99eaf0585
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/DimVector.h
@@ -0,0 +1,13 @@
+#pragma once
+#include 
+
+namespace at {
+
+// Re-declaring 'DimVector' type and size inside 'at' namespace.
+// This is done to avoid modifying every use into their 'c10'
+// equivalent.
+
+using c10::kDimVectorStaticSize;
+using c10::DimVector;
+
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/Dimname.h b/MLPY/Lib/site-packages/torch/include/ATen/core/Dimname.h
new file mode 100644
index 0000000000000000000000000000000000000000..9ac2abe3ac0ae8a78af55a426a325b072e32439d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/Dimname.h
@@ -0,0 +1,48 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+
+enum class NameType: uint8_t { BASIC, WILDCARD };
+
+struct TORCH_API Dimname {
+  static Dimname fromSymbol(Symbol name);
+  static Dimname wildcard();
+  static bool isValidName(const std::string& name);
+
+  NameType type() const { return type_; }
+  Symbol symbol() const { return name_; }
+
+  bool isBasic() const { return type_ == NameType::BASIC; }
+  bool isWildcard() const { return type_ == NameType::WILDCARD; }
+
+  bool matches(Dimname other) const;
+  c10::optional<Dimname> unify(Dimname other) const;
+
+ private:
+  Dimname(Symbol name)
+    : name_(name), type_(NameType::BASIC) {}
+  Dimname(Symbol name, NameType type)
+    : name_(name), type_(type) {}
+
+  Symbol name_;
+  NameType type_;
+};
+
+using DimnameList = c10::ArrayRef<Dimname>;
+
+TORCH_API std::ostream& operator<<(std::ostream& out, const Dimname& dimname);
+
+inline bool operator==(const Dimname& lhs, const Dimname& rhs) {
+  return lhs.symbol() == rhs.symbol();
+}
+
+inline bool operator!=(const Dimname& lhs, const Dimname& rhs) {
+  return !(lhs == rhs);
+}
+
+} // namespace at
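A brief sketch (assumed usage; `at::Symbol::dimname` is taken from the interned-strings API and is an assumption here) of basic vs. wildcard names:

```cpp
#include <ATen/core/Dimname.h>

void dimname_example() {
  at::Dimname n = at::Dimname::fromSymbol(at::Symbol::dimname("channels"));
  at::Dimname w = at::Dimname::wildcard();
  bool ok = n.matches(w);                     // a wildcard matches any name
  c10::optional<at::Dimname> u = n.unify(w);  // unification keeps "channels"
}
```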
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/DistributionsHelper.h b/MLPY/Lib/site-packages/torch/include/ATen/core/DistributionsHelper.h
new file mode 100644
index 0000000000000000000000000000000000000000..ae4a73662fc74d6b75177ea87bbe533034011696
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/DistributionsHelper.h
@@ -0,0 +1,337 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+
+/**
+ * Distributions kernel adapted from THRandom.cpp
+ * The kernels try to follow std::random distributions signature
+ * For instance: in ATen
+ *      auto gen = at::detail::createCPUGenerator();
+ *      at::uniform_real_distribution<double> uniform(0, 1);
+ *      auto sample = uniform(gen.get());
+ *
+ *      vs std::random
+ *
+ *      std::mt19937 gen;
+ *      std::uniform_real_distribution<double> uniform(0, 1);
+ *      auto sample = uniform(gen);
+ */
+
+
+namespace at {
+namespace {
+
+/**
+ * Samples a discrete uniform distribution in the range [base, base+range) of type T
+ */
+template 
+struct uniform_int_from_to_distribution {
+
+  C10_HOST_DEVICE inline uniform_int_from_to_distribution(uint64_t range, int64_t base) : range_(range), base_(base) {}
+
+  template 
+  C10_HOST_DEVICE inline T operator()(RNG generator) {
+    if ((
+      std::is_same<T, int64_t>::value ||
+      std::is_same<T, double>::value ||
+      std::is_same<T, float>::value ||
+      std::is_same<T, at::BFloat16>::value) && range_ >= 1ULL << 32)
+    {
+      return transformation::uniform_int_from_to<T>(generator->random64(), range_, base_);
+    } else {
+      return transformation::uniform_int_from_to<T>(generator->random(), range_, base_);
+    }
+  }
+
+  private:
+    uint64_t range_;
+    int64_t base_;
+};
+
+/**
+ * Samples a discrete uniform distribution in the range [min_value(int64_t), max_value(int64_t)]
+ */
+template 
+struct uniform_int_full_range_distribution {
+
+  template 
+  C10_HOST_DEVICE inline T operator()(RNG generator) {
+    return transformation::uniform_int_full_range<T>(generator->random64());
+  }
+
+};
+
+/**
+ * Samples a discrete uniform distribution in the range [0, max_value(T)] for integral types
+ * and [0, 2^mantissa] for floating-point types.
+ */
+template 
+struct uniform_int_distribution {
+
+  template 
+  C10_HOST_DEVICE inline T operator()(RNG generator) {
+    if constexpr (std::is_same_v<T, double> || std::is_same_v<T, int64_t>) {
+      return transformation::uniform_int<T>(generator->random64());
+    } else {
+      return transformation::uniform_int<T>(generator->random());
+    }
+  }
+
+};
+
+/**
+ * Samples a uniform distribution in the range [from, to) of type T
+ */
+template 
+struct uniform_real_distribution {
+
+  C10_HOST_DEVICE inline uniform_real_distribution(T from, T to) {
+    TORCH_CHECK_IF_NOT_ON_CUDA(from <= to);
+    TORCH_CHECK_IF_NOT_ON_CUDA(to - from <= std::numeric_limits<T>::max());
+    from_ = from;
+    to_ = to;
+  }
+
+  template 
+  C10_HOST_DEVICE inline dist_acctype<T> operator()(RNG generator){
+    if constexpr (std::is_same_v<T, double>) {
+      return transformation::uniform_real<T>(generator->random64(), from_, to_);
+    } else {
+      return transformation::uniform_real<T>(generator->random(), from_, to_);
+    }
+  }
+
+  private:
+    T from_;
+    T to_;
+};
+
+// The SFINAE checks introduced in #39816 looks overcomplicated and must revisited
+// https://github.com/pytorch/pytorch/issues/40052
+#define DISTRIBUTION_HELPER_GENERATE_HAS_MEMBER(member)              \
+template                                                 \
+struct has_member_##member                                           \
+{                                                                    \
+    typedef char yes;                                                \
+    typedef long no;                                                 \
+    template  static yes test(decltype(&U::member));     \
+    template  static no test(...);                       \
+    static constexpr bool value = sizeof(test(0)) == sizeof(yes); \
+}
+
+DISTRIBUTION_HELPER_GENERATE_HAS_MEMBER(next_double_normal_sample);
+DISTRIBUTION_HELPER_GENERATE_HAS_MEMBER(set_next_double_normal_sample);
+DISTRIBUTION_HELPER_GENERATE_HAS_MEMBER(next_float_normal_sample);
+DISTRIBUTION_HELPER_GENERATE_HAS_MEMBER(set_next_float_normal_sample);
+
+#define DISTRIBUTION_HELPER_GENERATE_NEXT_NORMAL_METHODS(TYPE)                                      \
+                                                                                                    \
+template ::value &&                                   \
+            has_member_set_next_##TYPE##_normal_sample::value                                  \
+          ), int> = 0>                                                                              \
+C10_HOST_DEVICE inline bool maybe_get_next_##TYPE##_normal_sample(RNG* generator, ret_type* ret) {  \
+  if (generator->next_##TYPE##_normal_sample()) {                                                   \
+    *ret = *(generator->next_##TYPE##_normal_sample());                                             \
+    generator->set_next_##TYPE##_normal_sample(c10::optional());                              \
+    return true;                                                                                    \
+  }                                                                                                 \
+  return false;                                                                                     \
+}                                                                                                   \
+                                                                                                    \
+template ::value ||                                  \
+            !has_member_set_next_##TYPE##_normal_sample::value                                 \
+          ), int> = 0>                                                                              \
+C10_HOST_DEVICE inline bool maybe_get_next_##TYPE##_normal_sample(RNG* /*generator*/, ret_type* /*ret*/) {  \
+  return false;                                                                                     \
+}                                                                                                   \
+                                                                                                    \
+template ::value                                  \
+          ), int> = 0>                                                                              \
+C10_HOST_DEVICE inline void maybe_set_next_##TYPE##_normal_sample(RNG* generator, ret_type cache) { \
+  generator->set_next_##TYPE##_normal_sample(cache);                                                \
+}                                                                                                   \
+                                                                                                    \
+template ::value                                 \
+          ), int> = 0>                                                                              \
+C10_HOST_DEVICE inline void maybe_set_next_##TYPE##_normal_sample(RNG* /*generator*/, ret_type /*cache*/) { \
+}
+
+DISTRIBUTION_HELPER_GENERATE_NEXT_NORMAL_METHODS(double);
+DISTRIBUTION_HELPER_GENERATE_NEXT_NORMAL_METHODS(float);
+
+/**
+ * Samples a normal distribution using the Box-Muller method
+ * Takes mean and standard deviation as inputs
+ * Note that Box-muller method returns two samples at a time.
+ * Hence, we cache the "next" sample in the CPUGeneratorImpl class.
+ */
+template 
+struct normal_distribution {
+
+  C10_HOST_DEVICE inline normal_distribution(T mean_in, T stdv_in) {
+    TORCH_CHECK_IF_NOT_ON_CUDA(stdv_in >= 0, "stdv_in must be positive: ", stdv_in);
+    mean = mean_in;
+    stdv = stdv_in;
+  }
+
+  template 
+  C10_HOST_DEVICE inline dist_acctype<T> operator()(RNG generator){
+    dist_acctype<T> ret;
+    // return cached values if available
+    if constexpr (std::is_same_v<T, double>) {
+      if (maybe_get_next_double_normal_sample(generator, &ret)) {
+        return transformation::normal(ret, mean, stdv);
+      }
+    } else {
+      if (maybe_get_next_float_normal_sample(generator, &ret)) {
+        return transformation::normal(ret, mean, stdv);
+      }
+    }
+    // otherwise generate new normal values
+    uniform_real_distribution<T> uniform(0.0, 1.0);
+    const dist_acctype<T> u1 = uniform(generator);
+    const dist_acctype<T> u2 = uniform(generator);
+    const dist_acctype<T> r = ::sqrt(static_cast<T>(-2.0) * ::log1p(-u2));
+    const dist_acctype<T> theta = static_cast<T>(2.0) * c10::pi<T> * u1;
+    if constexpr (std::is_same_v<T, double>) {
+      maybe_set_next_double_normal_sample(generator, r * ::sin(theta));
+    } else {
+      maybe_set_next_float_normal_sample(generator, r * ::sin(theta));
+    }
+    ret = r * ::cos(theta);
+    return transformation::normal(ret, mean, stdv);
+  }
+
+  private:
+    T mean;
+    T stdv;
+};
+
+template <typename T>
+struct DiscreteDistributionType { using type = float; };
+
+template <> struct DiscreteDistributionType<double> { using type = double; };
+
+/**
+ * Samples a bernoulli distribution given a probability input
+ */
+template 
+struct bernoulli_distribution {
+
+  C10_HOST_DEVICE inline bernoulli_distribution(T p_in) {
+    TORCH_CHECK_IF_NOT_ON_CUDA(p_in >= 0 && p_in <= 1);
+    p = p_in;
+  }
+
+  template 
+  C10_HOST_DEVICE inline T operator()(RNG generator) {
+    uniform_real_distribution<T> uniform(0.0, 1.0);
+    return transformation::bernoulli<T>(uniform(generator), p);
+  }
+
+  private:
+    T p;
+};
+
+/**
+ * Samples a geometric distribution given a probability input
+ */
+template 
+struct geometric_distribution {
+
+  C10_HOST_DEVICE inline geometric_distribution(T p_in) {
+    TORCH_CHECK_IF_NOT_ON_CUDA(p_in > 0 && p_in < 1);
+    p = p_in;
+  }
+
+  template 
+  C10_HOST_DEVICE inline T operator()(RNG generator) {
+    uniform_real_distribution<T> uniform(0.0, 1.0);
+    return transformation::geometric<T>(uniform(generator), p);
+  }
+
+  private:
+    T p;
+};
+
+/**
+ * Samples an exponential distribution given a lambda input
+ */
+template 
+struct exponential_distribution {
+
+  C10_HOST_DEVICE inline exponential_distribution(T lambda_in) : lambda(lambda_in) {}
+
+  template 
+  C10_HOST_DEVICE inline T operator()(RNG generator) {
+    uniform_real_distribution<T> uniform(0.0, 1.0);
+    return transformation::exponential<T>(uniform(generator), lambda);
+  }
+
+  private:
+    T lambda;
+};
+
+/**
+ * Samples a cauchy distribution given median and sigma as inputs
+ */
+template 
+struct cauchy_distribution {
+
+  C10_HOST_DEVICE inline cauchy_distribution(T median_in, T sigma_in) : median(median_in), sigma(sigma_in) {}
+
+  template 
+  C10_HOST_DEVICE inline T operator()(RNG generator) {
+    uniform_real_distribution<T> uniform(0.0, 1.0);
+    return transformation::cauchy<T>(uniform(generator), median, sigma);
+  }
+
+  private:
+    T median;
+    T sigma;
+};
+
+/**
+ * Samples a lognormal distribution
+ * Takes mean and standard deviation as inputs
+ * Outputs two samples at a time
+ */
+template 
+struct lognormal_distribution {
+
+  C10_HOST_DEVICE inline lognormal_distribution(T mean_in, T stdv_in) {
+    TORCH_CHECK_IF_NOT_ON_CUDA(stdv_in > 0);
+    mean = mean_in;
+    stdv = stdv_in;
+  }
+
+  template
+  C10_HOST_DEVICE inline T operator()(RNG generator){
+    normal_distribution<T> normal(mean, stdv);
+    return transformation::log_normal<T>(normal(generator));
+  }
+
+  private:
+    T mean;
+    T stdv;
+};
+}
+} // namespace at
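A sketch of the usage pattern from the header comment (assuming the CPU generator factory declared in `ATen/CPUGeneratorImpl.h`); the lock follows Note [Acquire lock when using random generators]:

```cpp
#include <ATen/CPUGeneratorImpl.h>
#include <ATen/core/DistributionsHelper.h>
#include <mutex>

void sample_example() {
  at::Generator gen = at::detail::createCPUGenerator(/*seed_val=*/42);
  std::lock_guard<std::mutex> lock(gen.mutex());
  auto* rng = at::check_generator<at::CPUGeneratorImpl>(gen);

  at::uniform_real_distribution<double> uniform(0.0, 1.0);
  double u = uniform(rng);                          // sample in [0, 1)

  at::normal_distribution<double> normal(/*mean_in=*/0.0, /*stdv_in=*/1.0);
  double n = normal(rng);                           // Box-Muller; second sample is cached
}
```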
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/Formatting.h b/MLPY/Lib/site-packages/torch/include/ATen/core/Formatting.h
new file mode 100644
index 0000000000000000000000000000000000000000..05b22d474582873e382a8bfde55758c951e154e7
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/Formatting.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include 
+#include 
+
+#include 
+#include 
+
+namespace c10 {
+TORCH_API std::ostream& operator<<(std::ostream& out, Backend b);
+TORCH_API std::ostream& operator<<(std::ostream & out, const Scalar& s);
+TORCH_API std::string toString(const Scalar& s);
+}
+namespace at {
+
+TORCH_API std::ostream& operator<<(std::ostream& out, const DeprecatedTypeProperties& t);
+TORCH_API std::ostream& print(
+    std::ostream& stream,
+    const Tensor& tensor,
+    int64_t linesize);
+static inline std::ostream& operator<<(std::ostream & out, const Tensor & t) {
+  return print(out,t,80);
+}
+TORCH_API void print(const Tensor & t, int64_t linesize=80);
+}
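A tiny sketch (assumed usage) of the printing helpers declared above:

```cpp
#include <ATen/ATen.h>
#include <iostream>

void print_example(const at::Tensor& t) {
  std::cout << t << '\n';           // operator<< forwards to print() with linesize 80
  at::print(t, /*linesize=*/120);   // allow wider rows before wrapping
  std::cout << t.type() << '\n';    // DeprecatedTypeProperties, e.g. "CPUFloatType"
}
```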
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/Generator.h b/MLPY/Lib/site-packages/torch/include/ATen/core/Generator.h
new file mode 100644
index 0000000000000000000000000000000000000000..0dfefa1217177ad0d309f2913e25b1651c25cc56
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/Generator.h
@@ -0,0 +1,190 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+
+// For the record I don't think this is a correct pimpl idiom.
+// Including Impl header in interface header defeats the purpose
+// because you can't change Impl private members without forcing
+// everything that included the interface to rebuild.
+// Impl should be forward-declared in the interface header instead.
+#include 
+
+/**
+ * Note [Generator]
+ * ~~~~~~~~~~~~~~~~
+ * A Pseudo Random Number Generator (PRNG) is an engine that uses an algorithm to
+ * generate a seemingly random sequence of numbers that may later be used to create
+ * a random distribution. Such an engine almost always maintains a state and requires a
+ * seed to start off the creation of random numbers. Often times, users have
+ * found it beneficial to be able to explicitly create, retain, and destroy
+ * PRNG states and also be able to have control over the seed value.
+ *
+ * A Generator in ATen gives users the ability to read, write and modify a PRNG engine.
+ * For instance, it does so by letting users seed a PRNG engine, fork the state of the
+ * engine, etc.
+ *
+ * By default, there is one generator per device, and a device's generator is
+ * lazily created. A user can use the torch.Generator() api to create their own generator.
+ */
+
+/**
+ * Note [Acquire lock when using random generators]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * Generator and its derived classes are NOT thread-safe. Please note that most of the
+ * places where we have inserted locking for generators are historically based, and we
+ * haven't actually checked that everything is truly thread safe (and it probably isn't).
+ * Please use the public mutex_ when using any methods from these classes, except for the
+ * read-only methods. You can learn about the usage by looking into the unittests
+ * (aten/src/ATen/cpu_generator_test.cpp) and other places where we have used lock_guard.
+ *
+ * TODO: Look into changing the threading semantics of Generators in ATen (e.g., making
+ * them non-thread safe and instead making the generator state splittable, to accommodate
+ * forks into other threads).
+ */
+
+namespace at {
+
+class Tensor;
+
+struct TORCH_API Generator {
+  Generator() = default;
+
+  explicit Generator(c10::intrusive_ptr<c10::GeneratorImpl> gen_impl)
+   : impl_(std::move(gen_impl)) {
+    if (impl_.get() == nullptr) {
+      throw std::runtime_error("GeneratorImpl with nullptr is not supported");
+    }
+  }
+
+  bool operator==(const Generator& rhs) const {
+    return this->impl_ == rhs.impl_;
+  }
+
+  bool operator!=(const Generator& rhs) const {
+    return !((*this) == rhs);
+  }
+
+  bool defined() const {
+    return static_cast<bool>(impl_);
+  }
+
+  c10::GeneratorImpl* unsafeGetGeneratorImpl() const {
+    return impl_.get();
+  }
+
+  c10::GeneratorImpl* unsafeReleaseGeneratorImpl() {
+    return impl_.release();
+  }
+
+  const c10::intrusive_ptr<c10::GeneratorImpl>& getIntrusivePtr() const {
+    return impl_;
+  }
+
+  void set_current_seed(uint64_t seed) { impl_->set_current_seed(seed); }
+  // Sets the offset of Generator state to the desired offset. This is currently
+  // supported for only Philox based Generators, i.e., CUDA and MPS.
+  void set_offset(uint64_t offset) { impl_->set_offset(offset); }
+
+  // Returns the offset of Generator state. This is currently supported for only
+  // Philox based Generators, i.e., CUDA and MPS.
+  uint64_t get_offset() const { return impl_->get_offset(); }
+
+  uint64_t current_seed() const { return impl_->current_seed(); }
+
+  uint64_t seed() { return impl_->seed(); }
+
+  // Implementation not inlined to prevent cycle reference between
+  // `ATen/core/Generator.h` and `ATen/core/Tensor.h`
+  void set_state(const at::Tensor& new_state);
+
+  at::Tensor get_state() const;
+
+  std::mutex& mutex() {
+    return impl_->mutex_;
+  }
+
+  DispatchKeySet key_set() const {
+    return impl_->key_set();
+  }
+
+  Device device() const { return impl_->device(); }
+
+  inline void set_pyobj(PyObject* pyobj) const noexcept {
+    impl_->set_pyobj(pyobj);
+  }
+
+  inline PyObject* pyobj() const noexcept {
+    return impl_->pyobj();
+  }
+
+  template<typename T>
+  T* get() const { return static_cast<T*>(impl_.get()); }
+
+  Generator clone() const {
+    return Generator(impl_->clone());
+  }
+
+ private:
+  c10::intrusive_ptr<c10::GeneratorImpl> impl_;
+};
+
+template<class Impl, class... Args>
+Generator make_generator(Args&&... args) {
+  return Generator(c10::make_intrusive<Impl>(std::forward<Args>(args)...));
+}
+
+/**
+ * Utility function to static cast input Generator* to
+ * the backend generator type (CPU/CUDAGeneratorImpl etc.)
+ */
+template <typename T>
+static inline T * check_generator(c10::optional<Generator> gen) {
+  TORCH_CHECK(gen.has_value(), "Expected Generator but received nullopt");
+  TORCH_CHECK(gen->defined(), "Generator with undefined implementation is not allowed");
+  TORCH_CHECK(T::device_type() == gen->device().type(), "Expected a '", T::device_type(), "' device type for generator but found '", gen->device().type(), "'");
+  return gen->get<T>();
+}
+
+/**
+ * Utility function used in tensor implementations, which
+ * supplies the default generator to tensors, if an input generator
+ * is not supplied. The input Generator* is also static casted to
+ * the backend generator type (CPU/CUDAGeneratorImpl etc.)
+ */
+template <typename T>
+static inline T* get_generator_or_default(const c10::optional<Generator>& gen, const Generator& default_gen) {
+  return gen.has_value() && gen->defined() ? check_generator<T>(gen) : check_generator<T>(default_gen);
+}
+
+namespace detail {
+
+/**
+ * Helper function for checking the validity of new random generator
+ * state. Right now following conditions are checked:
+ *
+ * - The new state tensor must be a torch.ByteTensor
+ * - Data of the new state tensor must be contiguous
+ */
+static inline void check_rng_state(const c10::TensorImpl& new_state) {
+  TORCH_CHECK_TYPE(
+    new_state.layout() == kStrided && new_state.device().type() == kCPU && new_state.dtype() == kByte,
+    "RNG state must be a torch.ByteTensor"
+  );
+
+  TORCH_CHECK(new_state.is_contiguous(), "RNG state must be contiguous");
+}
+
+} // namespace detail
+
+} // namespace at
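A minimal sketch (assuming the CPU backend implementation in `ATen/CPUGeneratorImpl.h`) of creating, seeding, and cloning a generator as described in the notes above:

```cpp
#include <ATen/CPUGeneratorImpl.h>
#include <ATen/core/Generator.h>
#include <mutex>

void generator_example() {
  at::Generator gen = at::make_generator<at::CPUGeneratorImpl>(/*seed_in=*/123);
  {
    // Generators are not thread-safe; hold the public mutex around state changes.
    std::lock_guard<std::mutex> lock(gen.mutex());
    gen.set_current_seed(7);
  }
  uint64_t seed = gen.current_seed();            // 7
  at::Generator forked = gen.clone();            // independent copy of the RNG state
  auto* impl = gen.get<at::CPUGeneratorImpl>();  // backend-specific access
}
```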
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/GeneratorForPrivateuseone.h b/MLPY/Lib/site-packages/torch/include/ATen/core/GeneratorForPrivateuseone.h
new file mode 100644
index 0000000000000000000000000000000000000000..2daa607f02ec720d30a3d9a10c8b429bb1703716
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/GeneratorForPrivateuseone.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include 
+#include 
+
+namespace at {
+
+using GeneratorFuncType = std::function<at::Generator(c10::DeviceIndex)>;
+
+c10::optional<GeneratorFuncType>& GetGeneratorPrivate();
+
+class TORCH_API _GeneratorRegister {
+ public:
+  explicit _GeneratorRegister(const GeneratorFuncType& func);
+};
+
+TORCH_API at::Generator GetGeneratorForPrivateuse1(
+    c10::DeviceIndex device_index);
+
+/**
+ * This is used to register Generator to PyTorch for `privateuse1` key.
+ *
+ * Usage: REGISTER_GENERATOR_PRIVATEUSE1(MakeGeneratorForPrivateuse1)
+ *
+ * class CustomGeneratorImpl : public c10::GeneratorImpl {
+ *   CustomGeneratorImpl(DeviceIndex device_index = -1);
+ *   explicit ~CustomGeneratorImpl() override = default;
+ *   ...
+ * };
+ *
+ * at::Generator MakeGeneratorForPrivateuse1(c10::DeviceIndex id) {
+ *   return at::make_generator<CustomGeneratorImpl>(id);
+ * }
+ */
+
+#define REGISTER_GENERATOR_PRIVATEUSE1(GeneratorPrivate) \
+  static auto temp##GeneratorPrivate = at::_GeneratorRegister(GeneratorPrivate);
+
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/IListRef.h b/MLPY/Lib/site-packages/torch/include/ATen/core/IListRef.h
new file mode 100644
index 0000000000000000000000000000000000000000..d0e1f9e063e3d87e729c037d2b61c913288619f0
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/IListRef.h
@@ -0,0 +1,631 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+
+/*
+ * [Note: IListRef]
+ * Wrapper around different API containers (e.g. boxed and unboxed).
+ *
+ * What is it?
+ * ===========
+ * It is a tagged union of both boxed and unboxed API containers.
+ * Working implementations:
+ *
+ * - `IListRef<at::Tensor>`
+ * - `IListRef<at::OptionalTensorRef>`
+ *
+ * Note that `IListRef` is a view type. Meaning that it won't own the
+ * tensors it holds. It's intended to be used only as argument parameters.
+ * Specifically, where these 2 worlds overlap.
+ *
+ * What is this for?
+ * =================
+ * Historically, PyTorch has maintained 2 different APIs: the unboxed
+ * (called from C++ API and Python eager mode) and boxed APIs (called
+ * from the TorchScript JIT, mobile interpreter, and boxed fallbacks).
+ *
+ * Calling unboxed kernels from the boxed "world" and vice-versa may
+ * result in non-negligible overhead. Lists are one of those types:
+ *
+ * - Boxed world: `c10::List<Tensor>`
+ * - Unboxed world: `c10::ArrayRef<Tensor>`
+ *
+ * In this context, `c10::IListRef` solves this problem by wrapping those
+ * 2 container types, so that we don't need to convert from one to
+ * the other.
+ *
+ * (see https://github.com/pytorch/pytorch/issues/66328)
+ *
+ * What does it do?
+ * ================
+ * This container wraps around the different tagged containers
+ * (currently, only boxed and unboxed), without incurring in extra
+ * overhead for converting from one to another. It does so while
+ * exposing usual container methods, which dispatch to corresponding
+ * implementations.
+ *
+ * While it works with different container types, it introduces
+ * overhead for repeatedly calling member functions (since those will
+ * get dispatched, again). Therefore, you should only use it to iterate
+ * through the list up to one time. If you need to do more complex things,
+ * call `materialize()` first.
+ *
+ * Adding support for a new Tag
+ * ============================
+ * Suppose we want to add a new tag: `Chest`. Here are the steps
+ * we would have to go through:
+ *
+ * 1. Add a line for it in the macro `TORCH_ILISTREF_FORALL_TAGS`.
+ *
+ *   #define TORCH_ILISTREF_FORALL_TAGS(_, ...) \
+ *     ...
+ *     _(Chest, ##__VA_ARGS__)
+ *
+ * 2. Add type aliases, union members, and constructors.
+ *
+ *   template 
+ *   class IListRef {
+ *     ...
+ *     using chest_type =
+ *       typename detail::IListRefTagImpl::list_type;
+ *     ...
+ *     IListRef(...) : tag_(IListRefTag::Chest) {
+ *       ...
+ *     }
+ *     ...
+ *     union Payload {
+ *       ...
+ *       chest_type chest;
+ *       ...
+ *     };
+ *     ...
+ *   };
+ *
+ * 3. Add a default implementation for it (in 'IListRef_inl.h'). It's
+ *    preferable to make the default implementation work for `T = Tensor`
+ *    (both `Unboxed` and `Boxed` do it).
+ *
+ *   template 
+ *   class IListRefTagImplBase {
+ *    public:
+ *     using elem_type = ListElemT;
+ *     using list_type = ChestContainer;
+ *
+ *     static const list_type& unwrap(const IListRef& ilist) { ... }
+ *
+ *     static typename list_type::const_iterator& unwrap(
+ *         IListRefIterator& it) { ... }
+ *
+ *     static const typename list_type::const_iterator& unwrap(
+ *         const IListRefIterator& it) { ... }
+ *
+ *     static IListRefConstRef iterator_get(
+ *         const typename list_type::const_iterator& it) { ... }
+ *   }
+ *
+ * 4. Add an specialization for each of the already supported types.
+ *    Finally, for consistency, add them to the tracking list.
+ *    (see [Note: IListRefTagImpl Specializations])
+ *
+ *   template <>
+ *   class IListRefTagImpl
+ *       : public IListRefTagImplBase {};
+ *
+ * Adding support for a new Type
+ * =============================
+ * Suppose we want to add support for a new type: `Matrix`.
+ * Here are the steps we would have to go through:
+ *
+ * 1. Add an specialization for each of the existing tags.
+ *    For consistency, add them to the tracking list.
+ *    (see [Note: IListRefTagImpl Specializations])
+ *
+ *   template <>
+ *   class IListRefTagImpl
+ *       : public IListRefTagImplBase {};
+ *
+ *   template <>
+ *   class IListRefTagImpl
+ *       : public IListRefTagImplBase {};
+ *
+ * Common Problems
+ * ===============
+ * 1. One of `IListRef(Iterator)` methods are failing to compile.
+ *
+ *     That may be happening because the container type you added
+ *     is not compatible with the code written for that method. If
+ *     that's true, then you might have to transform that code into
+ *     a static method call (see `List::operator[]` method).
+ *
+ * 2. Can't make `IListRefIterator::operator*` return a const-reference.
+ *
+ *    First, keep in mind that we assume that boxed containers will
+ *    have to deal with `IValue` (e.g. `c10::List`). In this context,
+ *    what may be happening is that `IValue` doesn't store internally
+ *    your type `T`. Instead, it constructs a new `T` every time
+ *    you try to get `T` for it (see `IListRef`).
+ */
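A hedged sketch of a kernel-style consumer of this wrapper (the common `at::ITensorListRef` alias for `c10::IListRef<at::Tensor>` is assumed from the ATen headers):

```cpp
#include <ATen/ATen.h>

// Single pass over either a boxed c10::List<at::Tensor> or an unboxed
// ArrayRef of tensors, without converting between the two representations.
int64_t count_defined(c10::IListRef<at::Tensor> tensors) {
  int64_t n = 0;
  for (const at::Tensor& t : tensors) {  // each dereference dispatches on the tag
    if (t.defined()) {
      ++n;
    }
  }
  // For repeated or random access, call tensors.materialize() once instead.
  return n;
}
```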
+
+namespace c10 {
+template 
+class IListRef;
+
+/*
+ * Applies arbitrary macros to each `IListRefTag`.
+ */
+#define TORCH_ILISTREF_FORALL_TAGS(_, ...) \
+  _(Unboxed, ##__VA_ARGS__)                \
+  _(Boxed, ##__VA_ARGS__)                  \
+  _(Materialized, ##__VA_ARGS__)
+
+/*
+ * Defines a "switch-case" for `TAG`. Inside, it executes `BODY`,
+ * while bringing to scope:
+ *
+ * - `ImplT`: the implementation class for `TAG`
+ * - `this_`: the result of unwrapping `this`
+ */
+#define TORCH_ILISTREF_UNWRAP_CASE(TAG, BODY)                        \
+  case c10::IListRefTag::TAG: {                                      \
+    using ImplT = c10::detail::IListRefTagImpl; \
+    auto& this_ = ImplT::unwrap(*this);                              \
+    BODY                                                             \
+  } break;
+
+/*
+ * Dispatches the unwrap call, depending on `TAG`, followed by
+ * the execution of `BODY`. It aborts if `TAG` is not a `IListRefTag`.
+ *
+ * This macro is useful because it allows us to handle different
+ * types (that correspond to different tags) to be implemented
+ * only once. We can do it even when the implementation of the
+ * different tags aren't syntatically the same, by dispatching
+ * it to a function (e.g. `ImplT::(this_)`).
+ */
+#define TORCH_ILISTREF_UNWRAP(TAG, BODY)                         \
+  switch (TAG) {                                                 \
+    TORCH_ILISTREF_FORALL_TAGS(TORCH_ILISTREF_UNWRAP_CASE, BODY) \
+    break;                                                       \
+    default:                                                     \
+      TORCH_INTERNAL_ASSERT(false, "invalid IListRef tag.");     \
+  }
+
+enum class IListRefTag {
+#define DEFINE_TAG(tag, ...) tag,
+  TORCH_ILISTREF_FORALL_TAGS(DEFINE_TAG)
+#undef DEFINE_TAG
+      None
+};
+
+namespace detail {
+/*
+ * Type alias that specifies whether we return a reference or a copy of `T`.
+ *
+ * What is this for?
+ * =================
+ * Since values in the boxed world are represented by an `IValue`, we also
+ * depend on whether it can be converted to a const-reference (`Tensor`) or
+ * has to create a new copy of `T` (`OptionalTensorRef`).
+ */
+template <typename T>
+using IListRefConstRef = typename ivalue_to_const_ref_overload_return<T>::type;
+
+/*
+ * Interface that implements key functions for each `IListRefTag` type.
+ *
+ * What is this for?
+ * =================
+ * Given an `IListRef(Iterator)`, some methods have to be implemented
+ * differently for each `TAG`. Therefore, the methods inside this class
+ * are used as dispatch targets for the different `IListRefTag` values.
+ *
+ * You should create an specialization of this class for each possible
+ * combination of `IListRefTag` type (except `None`) and element types
+ * (e.g. `Tensor`).
+ *
+ * What does it do?
+ * ================
+ * 1. defines static methods to be used as dispatch targets by both
+ *    `IListRef` and `IListRefIterator` (see the implementation of
+ *    `IListRefTagImplBase`).
+ *
+ * 2. defines the `elem_type` and `list_type` aliases that will be
+ *    used in the definition of `IListRef`. In general, we should do
+ *    so by inheriting from `IListRefTagImplBase`.
+ *
+ * [Note: IListRefTagImpl Specialization]
+ * ======================================
+ * For `IListRef(Iterator)`:
+ * - 
+ * - 
+ * - 
+ *
+ * For `IListRef(Iterator)`:
+ * - 
+ * - 
+ * - 
+ */
+template <IListRefTag TAG, typename T>
+class IListRefTagImpl {};
+
+/*
+ * Base implementation of `IListRefTagImpl` methods.
+ *
+ * What is this for?
+ * =================
+ * This should make adding specializations for new types easier. For
+ * example, one should be able to add a new type just by making its
+ * `IListRefTagImpl` specialization inherit from `IListRefTagImplBase`.
+ *
+ * You should create a partial specialization for this class only if
+ * you introduce a new `IListRefTag`. The idea being that there is one
+ * default implementation for each possible value of `IListRefTag`.
+ *
+ * What does it do?
+ * ================
+ * 1. defines `elem_type` as an alias to `ListElemT`.
+ *
+ * 1. defines `list_type` as an alias to the default container type
+ *    that will hold a collection of `elem_type`. The idea being that
+ *    all types tagged as `TAG` will have `list_type` as its container,
+ *    with different `elem_type`.
+ *
+ * 3. defines the default implementation for each of the methods that
+ *    are supposed to be defined on `IListRefTagImpl` specializations.
+ *
+ * 4. inheriting from `IListRefTagImplBase` also means
+ *    that the payload of the type `IListRef` will be of type `list_type`
+ *    when it is tagged as `TAG`.
+ */
+template <IListRefTag TAG, typename T, typename ListElemT>
+class IListRefTagImplBase {};
+
+/*
+ * Materialized container for `IListRef`.
+ *
+ * What is this for?
+ * =================
+ * Container that groups `T` references together. This exchanges the
+ * overhead of every method call from `IListRef` for a dynamic allocation.
+ *
+ * You should use this container instead of `IListRef` if:
+ *
+ *   - You are going to iterate the list more than once
+ *   - You need to repeatedly access arbitrary elements (using `operator[]`)
+ * What does it do?
+
+ * ================
+ * Removes the reference (&) from the type, and wraps it into a
+ * `std::reference_wrapper`. If `IListRefConstRef` is not a
+ * reference type, then it's left unchanged.
+ */
+template <typename T>
+using _MaterializedIListRefElem = typename std::conditional<
+    std::is_reference<T>::value,
+    typename std::reference_wrapper<typename std::remove_reference<T>::type>,
+    T>::type;
+
+template <typename T>
+using MaterializedIListRefElem = _MaterializedIListRefElem<IListRefConstRef<T>>;
+
+template <typename T>
+using MaterializedIListRef = std::vector<MaterializedIListRefElem<T>>;
+
+} // namespace detail
+
+/*
+ * Iterator for `IListRef`.
+ *
+ * What is it?
+ * ===========
+ * Currently, a `std::bidirectional_iterator` that wraps the iterator
+ * types defined for each of the `IListRefTag`.
+ *
+ * One should be able to use it, as if it were the unwrapped
+ * iterators themselves.
+
+ * What does it do?
+ * ================
+ * Similarly to `IListRef`, this is a wrapper class. Specifically, it
+ * wraps each container's `const_iterator` type alias. So, for example,
+ * given that the container for `IListRefTag::Boxed` is `c10::List`, this
+ * iterator will wrap a `c10::List::const_iterator`.
+ *
+ * [Note: MSVC Iterator Debug]
+ * ===========================
+ * MSVC `vector::iterator` implementation (used in the boxed variant)
+ * makes it so this union's destructor, copy-constructor (assignment), and
+ * move-constructor (assignment) are implicitly deleted.
+ *
+ * Therefore, we need to explicitly define them as needed. Follows a list
+ * of places where these are needed and their reason:
+ *
+ *   - `Payload` destructor:
+ *     it is deleted only if the macro `_ITERATOR_DEBUG_LEVEL` is set to 2.
+ *
+ *   - `IListRefIterator` destructor:
+ *     same as above. However, we need to explicitly call the variant
+ *     destructor explicitly.
+ *
+ *   - `IListRefIterator` copy-constructor:
+ *     it is deleted only if the macro `_ITERATOR_DEBUG_LEVEL` is different
+ *     than 0.
+ */
+template <typename T>
+class IListRefIterator {
+ private:
+#define DEFINE_FRIEND_CLASS(TAG, ...)                        \
+  friend class detail::IListRefTagImpl; \
+  friend class detail::IListRefTagImplBase<                  \
+      IListRefTag::TAG,                                      \
+      T,                                                     \
+      typename detail::IListRefTagImpl::elem_type>;
+  TORCH_ILISTREF_FORALL_TAGS(DEFINE_FRIEND_CLASS)
+#undef DEFINE_FRIEND_CLASS
+
+ public:
+  // C++17 friendly std::iterator implementation
+  using iterator_category = std::bidirectional_iterator_tag;
+  using value_type = T;
+  using difference_type = std::ptrdiff_t;
+  using pointer = T*;
+  using reference = T&;
+
+  using unboxed_iterator_type = typename detail::
+      IListRefTagImpl<IListRefTag::Unboxed, T>::list_type::const_iterator;
+  using boxed_iterator_type = typename detail::
+      IListRefTagImpl<IListRefTag::Boxed, T>::list_type::const_iterator;
+  using materialized_iterator_type =
+      typename detail::MaterializedIListRef<T>::const_iterator;
+
+  IListRefIterator() : tag_(IListRefTag::None) {}
+
+#if defined(_MSC_VER) && _ITERATOR_DEBUG_LEVEL != 0
+  // See [Note: MSVC Iterator Debug]
+  IListRefIterator(const IListRefIterator& iterator)
+      : tag_(iterator.tag_) {
+    switch (tag_) {
+      case IListRefTag::Boxed:
+        payload_.boxed_iterator = iterator.payload_.boxed_iterator;
+        break;
+      case IListRefTag::Unboxed:
+        payload_.unboxed_iterator = iterator.payload_.unboxed_iterator;
+        break;
+      case IListRefTag::Materialized:
+        payload_.materialized_iterator = iterator.payload_.materialized_iterator;
+        break;
+      default:
+        TORCH_INTERNAL_ASSERT(false, "invalid IListRef tag.");
+    }
+  }
+#endif
+
+#if defined(_MSC_VER) && _ITERATOR_DEBUG_LEVEL == 2
+  // See [Note: MSVC Iterator Debug]
+  ~IListRefIterator() noexcept(false) {
+    switch (tag_) {
+      case IListRefTag::Boxed:
+        payload_.boxed_iterator.~boxed_iterator_type();
+        break;
+      case IListRefTag::Unboxed:
+        payload_.unboxed_iterator.~unboxed_iterator_type();
+        break;
+      case IListRefTag::Materialized:
+        payload_.materialized_iterator.~materialized_iterator_type();
+        break;
+      default:
+        TORCH_INTERNAL_ASSERT(false, "invalid IListRef tag.");
+    }
+  }
+#endif
+
+  IListRefIterator(boxed_iterator_type boxed) : tag_(IListRefTag::Boxed) {
+    payload_.boxed_iterator = boxed;
+  }
+
+  IListRefIterator(unboxed_iterator_type unboxed) : tag_(IListRefTag::Unboxed) {
+    payload_.unboxed_iterator = unboxed;
+  }
+
+  IListRefIterator(materialized_iterator_type materialized) : tag_(IListRefTag::Materialized) {
+    payload_.materialized_iterator = materialized;
+  }
+
+  detail::IListRefConstRef operator*() const {
+    TORCH_ILISTREF_UNWRAP(tag_, { return ImplT::iterator_get(this_); });
+  }
+
+  IListRefIterator& operator++() {
+    TORCH_ILISTREF_UNWRAP(tag_, { ++this_; });
+    return *this;
+  }
+
+  IListRefIterator operator++(int) {
+    auto old = *this;
+    TORCH_ILISTREF_UNWRAP(tag_, { ++this_; });
+    return old;
+  }
+
+  IListRefIterator& operator--() {
+    TORCH_ILISTREF_UNWRAP(tag_, { --this_; });
+    return *this;
+  }
+
+  IListRefIterator operator--(int) {
+    auto old = *this;
+    TORCH_ILISTREF_UNWRAP(tag_, { --this_; });
+    return old;
+  }
+
+  bool operator==(const IListRefIterator& rhs) const {
+    if (tag_ != rhs.tag_) {
+      return false;
+    }
+    TORCH_ILISTREF_UNWRAP(tag_, {
+      auto& rhs_it = ImplT::unwrap(rhs);
+      return this_ == rhs_it;
+    });
+  }
+
+  bool operator!=(const IListRefIterator& rhs) const {
+    return !(*this == rhs);
+  }
+
+ private:
+  union Payload {
+    boxed_iterator_type boxed_iterator;
+    unboxed_iterator_type unboxed_iterator;
+    materialized_iterator_type materialized_iterator;
+    void* _init_ptr;
+    Payload() : _init_ptr(nullptr) {}
+#if defined(_MSC_VER)
+    // See [Note: MSVC Iterator Debug]
+    ~Payload() {}
+#endif
+  };
+
+  Payload payload_;
+  IListRefTag tag_;
+};
+
+/*
+ * See [Note: IListRef]
+ */
+template 
+class IListRef {
+ private:
+#define DEFINE_FRIEND_CLASS(TAG, ...)                        \
+  friend class detail::IListRefTagImpl; \
+  friend class detail::IListRefTagImplBase<                  \
+      IListRefTag::TAG,                                      \
+      T,                                                     \
+      typename detail::IListRefTagImpl::elem_type>;
+  TORCH_ILISTREF_FORALL_TAGS(DEFINE_FRIEND_CLASS)
+#undef DEFINE_FRIEND_CLASS
+
+ public:
+  using unboxed_type =
+      typename detail::IListRefTagImpl::list_type;
+  using boxed_type =
+      typename detail::IListRefTagImpl::list_type;
+  using materialized_type =
+      typename detail::MaterializedIListRef;
+
+  using iterator = IListRefIterator;
+  using const_iterator = IListRefIterator;
+  using reverse_iterator = std::reverse_iterator;
+  using value_type = typename iterator::value_type;
+
+  IListRef() : tag_(IListRefTag::None) {}
+
+  IListRef(const boxed_type& boxed) : tag_(IListRefTag::Boxed) {
+    payload_.boxed = &boxed;
+  }
+
+  IListRef(const unboxed_type& unboxed) : tag_(IListRefTag::Unboxed) {
+    payload_.unboxed = unboxed;
+  }
+
+  IListRef(const std::initializer_list& list) : tag_(IListRefTag::Unboxed) {
+    payload_.unboxed = at::ArrayRef(list);
+  }
+
+  template <
+      typename... UnboxedConstructorArgs,
+      typename = std::enable_if_t<
+          std::is_constructible::value>>
+  IListRef(UnboxedConstructorArgs&&... args) : tag_(IListRefTag::Unboxed) {
+    payload_.unboxed = unboxed_type(std::forward(args)...);
+  }
+
+  IListRef(const materialized_type& materialized) : tag_(IListRefTag::Materialized) {
+    payload_.materialized = &materialized;
+  }
+
+  size_t size() const {
+    TORCH_ILISTREF_UNWRAP(tag_, { return this_.size(); });
+  }
+
+  bool empty() const {
+    return size() == 0;
+  }
+
+  iterator begin() const {
+    TORCH_ILISTREF_UNWRAP(tag_, { return this_.begin(); });
+  }
+
+  iterator end() const {
+    TORCH_ILISTREF_UNWRAP(tag_, { return this_.end(); });
+  }
+
+  detail::IListRefConstRef front() const {
+    TORCH_ILISTREF_UNWRAP(tag_, { return ImplT::front(this_); });
+  }
+
+  /*
+   * Materializes the `IListRef` into a `std::vector`.
+   *
+   * This should be used when one wishes to either:
+   *
+   *   - iterate over the list more than once: each `IListRefIterator`
+   *     member function call has to go through a switch, introducing
+   *     non-negligible overhead
+   *
+   *   - randomly access an arbitrary element using `operator[]`:
+   *     same reason as above
+   */
+  detail::MaterializedIListRef materialize() const {
+    if (isMaterialized()) {
+      return toMaterialized();
+    }
+
+    detail::MaterializedIListRef materialized;
+    materialized.reserve(size());
+    for (const auto& t : *this) {
+      materialized.emplace_back(t);
+    }
+    return materialized;
+  }
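+
+  /*
+   * A minimal usage sketch of `materialize()`, assuming a hypothetical
+   * `two_pass_sum` helper that needs to traverse the same list twice:
+   *
+   * ```
+   *  int64_t two_pass_sum(at::ITensorListRef tensors) {
+   *    // Materialize once, so the per-element tag switch inside
+   *    // `IListRefIterator` is not paid again on the second traversal.
+   *    auto materialized = tensors.materialize();
+   *    int64_t total = 0;
+   *    for (const at::Tensor& t : materialized) total += t.numel();
+   *    for (const at::Tensor& t : materialized) total += t.dim();
+   *    return total;
+   *  }
+   * ```
+   */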
+
+#define DEFINE_CHECK(TAG, ...)    \
+  bool is##TAG() const {          \
+    return tag_ == IListRefTag::TAG; \
+  }
+  TORCH_ILISTREF_FORALL_TAGS(DEFINE_CHECK);
+#undef DEFINE_CHECK
+
+  bool isNone() const {
+    return tag_ == IListRefTag::None;
+  }
+
+#define DEFINE_CASTING(TAG, ...)                                          \
+  const typename detail::IListRefTagImpl::list_type& \
+      to##TAG() const {                                                   \
+    TORCH_INTERNAL_ASSERT(is##TAG());                                     \
+    return detail::IListRefTagImpl::unwrap(*this);   \
+  }
+  TORCH_ILISTREF_FORALL_TAGS(DEFINE_CASTING);
+#undef DEFINE_CASTING
+
+ private:
+  union Payload {
+    const boxed_type* boxed;
+    unboxed_type unboxed;
+    const materialized_type* materialized;
+    Payload() : boxed(nullptr) {}
+  };
+
+  Payload payload_;
+  IListRefTag tag_;
+};
+
+} // namespace c10
+
+#include 
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/IListRef_inl.h b/MLPY/Lib/site-packages/torch/include/ATen/core/IListRef_inl.h
new file mode 100644
index 0000000000000000000000000000000000000000..bee60a6bafedf55bb24704040dd7e4db2ce9b795
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/IListRef_inl.h
@@ -0,0 +1,201 @@
+#pragma once
+
+#include 
+#include 
+
+namespace at {
+class Tensor;
+class OptionalTensorRef;
+}
+
+namespace c10 {
+namespace detail {
+
+/*
+ * Specializations of `IListRefTagImplBase` that implement the default
+ * implementation for `IListRefTag::Unboxed`.
+ */
+template 
+class IListRefTagImplBase {
+ public:
+  using elem_type = ListElemT;
+  using list_type = ArrayRef;
+
+  /*
+   * These `unwrap` static methods unwrap the inner containers out
+   * of `IListRef` (and `IListRefIterator`). They are required when
+   * the macro `TORCH_ILISTREF_UNWRAP` is called.
+   */
+  static const list_type& unwrap(const IListRef& ilist) {
+    return ilist.payload_.unboxed;
+  }
+
+  static typename list_type::const_iterator& unwrap(IListRefIterator& it) {
+    return it.payload_.unboxed_iterator;
+  }
+
+  static const typename list_type::const_iterator& unwrap(
+      const IListRefIterator& it) {
+    return it.payload_.unboxed_iterator;
+  }
+
+  /*
+   * We have these functions (besides the `unwrap`s above) because the
+   * implementations of both `IListRef::operator[]` and `IListRefIterator::operator*`
+   * weren't syntactically equal for the existing tags at the time
+   * (`Unboxed` and `Boxed`).
+   */
+  static IListRefConstRef front(const list_type& lst) {
+    return lst.front();
+  }
+
+  static IListRefConstRef iterator_get(
+      const typename list_type::const_iterator& it) {
+    return *it;
+  }
+};
+
+/*
+ * Specializations of `IListRefTagImplBase` that implement the default
+ * implementation for `IListRefTag::Boxed`.
+ */
+template 
+class IListRefTagImplBase {
+ public:
+  using elem_type = ListElemT;
+  using list_type = List;
+
+  static const list_type& unwrap(const IListRef& ilist) {
+    return *ilist.payload_.boxed;
+  }
+
+  static typename list_type::const_iterator& unwrap(IListRefIterator& it) {
+    return it.payload_.boxed_iterator;
+  }
+
+  static const typename list_type::const_iterator& unwrap(
+      const IListRefIterator& it) {
+    return it.payload_.boxed_iterator;
+  }
+
+  static IListRefConstRef front(const list_type& lst) {
+    return lst[0];
+  }
+
+  static IListRefConstRef iterator_get(
+      const typename list_type::const_iterator& it) {
+    return (*it).get().toTensor();
+  }
+};
+
+/*
+ * Specializations of `IListRefTagImplBase` that implement the default
+ * implementation for `IListRefTag::Materialized`.
+ */
+template 
+class IListRefTagImplBase> {
+ public:
+  using elem_type = MaterializedIListRefElem;
+  using list_type = MaterializedIListRef;
+
+  static const list_type& unwrap(const IListRef& ilist) {
+    return *ilist.payload_.materialized;
+  }
+
+  static typename list_type::const_iterator& unwrap(IListRefIterator& it) {
+    return it.payload_.materialized_iterator;
+  }
+
+  static const typename list_type::const_iterator& unwrap(
+      const IListRefIterator& it) {
+    return it.payload_.materialized_iterator;
+  }
+
+  static IListRefConstRef front(const list_type& lst) {
+    return lst[0];
+  }
+
+  static IListRefConstRef iterator_get(
+      const typename list_type::const_iterator& it) {
+    return *it;
+  }
+};
+
+/*
+ * [Note: ITensorListRef]
+ * Specializations necessary for the `IListRef<at::Tensor>` type.
+ *
+ * Since the default implementations are usually done with supporting
+ * `Tensor` in mind, we only have to inherit from the base implementations.
+ */
+template <>
+class IListRefTagImpl
+    : public IListRefTagImplBase {};
+
+template <>
+class IListRefTagImpl
+    : public IListRefTagImplBase {};
+
+template <>
+class IListRefTagImpl
+    : public IListRefTagImplBase<
+          IListRefTag::Materialized,
+          at::Tensor,
+          MaterializedIListRefElem> {};
+
+/*
+ * [Note: IOptTensorListRef]
+ * Specializations necessary for the `IListRef<at::OptionalTensorRef>` type.
+ *
+ * We can't get an `at::OptionalTensorRef` directly from an instance of
+ * `List<optional<Tensor>>` (the type that corresponds to the boxed world).
+ *
+ * So, the default implementation won't help us. Thus, we have to implement
+ * this method ourselves.
+ */
+template <>
+class IListRefTagImpl
+    : public IListRefTagImplBase {};
+
+template <>
+class IListRefTagImpl
+    : public IListRefTagImplBase> {
+
+ public:
+  /*
+   * Given an instance of the types corresponding to the `Boxed` tag, we override
+   * the default implementation, so that we can return a `at::OptionalTensorRef`.
+   */
+  static IListRefConstRef iterator_get(
+      const typename list_type::const_iterator& it) {
+    const auto& ivalue = (*it).get();
+    if (!ivalue.isNone()) {
+        const auto& tensor = ivalue.toTensor();
+        return (tensor.defined()) ? tensor : at::OptionalTensorRef{};
+    }
+    return {};
+  }
+};
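+
+/*
+ * A small sketch of how the specialization above is observed from the
+ * caller's side, assuming a hypothetical `count_defined` helper:
+ *
+ * ```
+ *  size_t count_defined(at::IOptTensorListRef tensors) {
+ *    size_t n = 0;
+ *    // Dereferencing the iterator yields an `at::OptionalTensorRef`,
+ *    // which is empty for None / undefined entries.
+ *    for (const auto& opt : tensors) {
+ *      if (opt.has_value()) ++n;
+ *    }
+ *    return n;
+ *  }
+ * ```
+ */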
+
+template <>
+class IListRefTagImpl
+    : public IListRefTagImplBase<
+          IListRefTag::Materialized,
+          at::OptionalTensorRef,
+          MaterializedIListRefElem> {};
+
+} // namespace detail
+} // namespace c10
+
+namespace at {
+
+// [Note: ITensorListRef]
+using ITensorListRef = c10::IListRef<at::Tensor>;
+using ITensorListRefIterator = c10::IListRefIterator<at::Tensor>;
+using MaterializedITensorListRef = c10::detail::MaterializedIListRef<at::Tensor>;
+// [Note: IOptTensorListRef]
+using IOptTensorListRef = c10::IListRef<at::OptionalTensorRef>;
+using IOptTensorListRefIterator = c10::IListRefIterator<at::OptionalTensorRef>;
+using MaterializedIOptTensorListRef = c10::detail::MaterializedIListRef<at::OptionalTensorRef>;
+
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/LegacyTypeDispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/core/LegacyTypeDispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..1244d0fda87ffd3846c6c6352ed46a7de45b5678
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/LegacyTypeDispatch.h
@@ -0,0 +1,111 @@
+#pragma once
+
+// The legacy mechanism for dispatching operators in ATen is a Type
+// object, which is essentially a giant virtual dispatch table
+// for every operation we support dynamically dispatching over.
+//
+// This has been deprecated in favor of ATenDispatch, and in the future,
+// c10 dispatcher.
+// TODO: Clean up what remains here
+
+#include 
+
+namespace at {
+
+// A RAII, thread local (!) guard that will disable dispatch to variable
+// handler.
+//
+// NOTE [ Treating Variables as non-Variables in type dispatch ]
+//
+// What exactly does AutoDispatchBelowAutograd do?  The short answer is, it causes
+// dispatches on ATen functions to go to the non-variable implementation,
+// bypassing autograd handling (and also profiling and tracing).
+//
+// To understand why this guard exists, it's helpful to understand the history
+// behind how Variable was implemented.  Previously, Variables were implemented
+// as a wrapper on Tensors; so the act of processing a Variable involved
+// unwrapping the underlying Tensor, and then calling the underlying base
+// operation on /that/ unwrapped Tensor.
+//
+// However, after the Variable/Tensor merge, there is no concept of unwrapping
+// a tensor anymore.  If you just call the operation on the same variable
+// again inside your VariableType handler, you'll dispatch back to
+// VariableType, which is not what we want.
+//
+// The solution to the above problem is to add `at::AutoDispatchBelowAutograd`, which
+// when enabled will cause `legacyTensorType()` and `getType()` to always return
+// non-Variable type, even if the tensor being called on is a variable.
+
+/* Note [AutoDispatchBelowAutograd]
+ * AutoDispatchBelowAutograd is **INTERNAL ONLY**; it should only be used
+ * for kernel implementations and customized C++ kernels.
+ * If you are looking for a guard to run a workload in inference mode, please use
+ * the c10::InferenceMode RAII guard, which is the user-facing API.
+ * In the past, AutoDispatchBelowAutograd (or its older version, AutoNonVariableTypeMode)
+ * was used in user code for inference-only workloads, but this risked silently
+ * producing wrong results in some edge cases. For example:
+ * ```
+ *  torch::Tensor s = torch::ones({1, 2, 3}).set_requires_grad(true);
+ *  torch::Tensor out = s * s;
+ *  {
+ *    at::AutoDispatchBelowAutograd guard;
+ *    s.add_(1);  // Skips version bump on `s`.
+ *  }
+ *  // WRONG GRADIENT! s.grad() are now computed using `s` value after the
+ *  // inplace update.
+ *  out.backward(torch::ones_like(out));
+ * ```
+ * Users should use `c10::InferenceMode` here so that it'll properly throw an
+ * error saying "one of the variables needed for gradient computation has been modified."
+ */
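+
+/*
+ * A sketch of the recommended replacement for the user-code pattern above,
+ * using the user-facing guard instead of AutoDispatchBelowAutograd:
+ * ```
+ *  torch::Tensor s = torch::ones({1, 2, 3}).set_requires_grad(true);
+ *  torch::Tensor out = s * s;
+ *  {
+ *    c10::InferenceMode guard;
+ *    s.add_(1);  // The version bump on `s` is not skipped here.
+ *  }
+ *  // Throws instead of silently computing a wrong gradient:
+ *  // "one of the variables needed for gradient computation has been modified"
+ *  out.backward(torch::ones_like(out));
+ * ```
+ */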
+struct TORCH_API AutoDispatchBelowAutograd {
+  AutoDispatchBelowAutograd() :
+    autograd_guard_(c10::autograd_dispatch_keyset) {
+  }
+
+  // disable all autograd dispatch keys
+  c10::impl::ExcludeDispatchKeyGuard autograd_guard_;
+};
+
+// TODO: AutoNonVariableTypeMode should be removed in release 1.10.
+struct TORCH_API AutoNonVariableTypeMode {
+  AutoNonVariableTypeMode(bool enabled = true) :
+    autograd_guard_(c10::autograd_dispatch_keyset) {
+    TORCH_WARN_ONCE("AutoNonVariableTypeMode is deprecated and will be removed in 1.10 release. "
+        "For kernel implementations please use AutoDispatchBelowADInplaceOrView instead, "
+        "If you are looking for a user facing API to enable running your inference-only "
+        "workload, please use c10::InferenceMode. Using AutoDispatchBelowADInplaceOrView in user code "
+        "is under risk of producing silent wrong result in some edge cases. "
+        "See Note [AutoDispatchBelowAutograd] for more details.");
+    TORCH_INTERNAL_ASSERT(enabled);
+  }
+
+  // disable all autograd dispatch keys
+  c10::impl::ExcludeDispatchKeyGuard autograd_guard_;
+};
+
+struct TORCH_API AutoDispatchSkipFunctionalize {
+  AutoDispatchSkipFunctionalize() :
+    dispatch_key_guard_(c10::DispatchKeySet(c10::DispatchKey::Functionalize)) {
+  }
+  c10::impl::ExcludeDispatchKeyGuard dispatch_key_guard_;
+};
+
+/* Note [AutoDispatchBelowADInplaceOrView]
+ * AutoDispatchBelowADInplaceOrView is equivalent to AutoNonVariableTypeMode
+ * before we split inplace & view ops out of VariableType kernel.
+ * Note this guard is used in VariableType kernels for functional ops
+ * as well as ADInplaceOrView kernels for inplace/view ops to enforce the
+ * Invariant:
+ *   Once you are in VariableType/ADInplaceOrView kernel for an op,
+ *   you never go back to a kernel on same dispatch key until
+ *   you finish the current op.
+ */
+struct TORCH_API AutoDispatchBelowADInplaceOrView {
+  AutoDispatchBelowADInplaceOrView() :
+    dispatch_key_guard_(c10::autograd_dispatch_keyset_with_ADInplaceOrView) {
+  }
+  // disable Autograd & ADInplaceOrView dispatch keys
+  c10::impl::ExcludeDispatchKeyGuard dispatch_key_guard_;
+};
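+
+/*
+ * A sketch of how a kernel might use this guard, assuming a hypothetical
+ * functional op `my_op` whose autograd kernel redispatches to a backend
+ * implementation `my_op_impl`:
+ * ```
+ *  at::Tensor my_op_autograd(const at::Tensor& self) {
+ *    // ... set up whatever autograd metadata is needed ...
+ *    at::AutoDispatchBelowADInplaceOrView guard;
+ *    // The redispatch below skips the Autograd and ADInplaceOrView keys,
+ *    // upholding the invariant described in the note above.
+ *    return my_op_impl(self);
+ *  }
+ * ```
+ */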
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/List.h b/MLPY/Lib/site-packages/torch/include/ATen/core/List.h
new file mode 100644
index 0000000000000000000000000000000000000000..30316e388457d1cb1148127dd90c78b860589478
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/List.h
@@ -0,0 +1,490 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+class Tensor;
+}
+namespace c10 {
+struct IValue;
+template class List;
+struct Type;
+
+namespace detail {
+
+struct ListImpl final : public c10::intrusive_ptr_target {
+  using list_type = std::vector;
+
+  explicit TORCH_API ListImpl(list_type list_, TypePtr elementType_);
+
+  list_type list;
+
+  TypePtr elementType;
+
+  intrusive_ptr copy() const {
+    return make_intrusive(list, elementType);
+  }
+  friend TORCH_API bool operator==(const ListImpl& lhs, const ListImpl& rhs);
+};
+}
+
+namespace impl {
+
+template class ListIterator;
+
+template class ListElementReference;
+
+template
+void swap(ListElementReference&& lhs, ListElementReference&& rhs);
+
+template
+bool operator==(const ListElementReference& lhs, const T& rhs);
+
+template
+bool operator==(const T& lhs, const ListElementReference& rhs);
+
+template
+struct ListElementConstReferenceTraits {
+  // In the general case, we use IValue::to().
+  using const_reference = typename c10::detail::ivalue_to_const_ref_overload_return::type;
+};
+
+// There is no to() overload for c10::optional.
+template<>
+struct ListElementConstReferenceTraits> {
+  using const_reference = c10::optional>;
+};
+
+template
+class ListElementReference final {
+public:
+  operator std::conditional_t<
+      std::is_reference::type>::value,
+      const T&,
+      T>() const;
+
+  ListElementReference& operator=(T&& new_value) &&;
+
+  ListElementReference& operator=(const T& new_value) &&;
+
+  // assigning another ref to this assigns the underlying value
+  ListElementReference& operator=(ListElementReference&& rhs) && noexcept;
+
+  const IValue& get() const& {
+    return *iterator_;
+  }
+
+  friend void swap(ListElementReference&& lhs, ListElementReference&& rhs);
+
+  ListElementReference(const ListElementReference&) = delete;
+  ListElementReference& operator=(const ListElementReference&) = delete;
+
+private:
+  ListElementReference(Iterator iter)
+  : iterator_(iter) {}
+
+  // allow moving, but only our friends (i.e. the List class) can move us
+  ListElementReference(ListElementReference&&) noexcept = default;
+  ListElementReference& operator=(ListElementReference&& rhs) & noexcept {
+    iterator_ = std::move(rhs.iterator_);
+    return *this;
+  }
+
+  friend class List;
+  friend class ListIterator;
+
+  Iterator iterator_;
+};
+
+// this wraps vector::iterator to make sure user code can't rely
+// on it being the type of the underlying vector.
+template 
+class ListIterator final {
+ public:
+   // C++17 friendly std::iterator implementation
+  using iterator_category = std::random_access_iterator_tag;
+  using value_type = T;
+  using difference_type = std::ptrdiff_t;
+  using pointer = T*;
+  using reference = ListElementReference;
+
+  explicit ListIterator() = default;
+  ~ListIterator() = default;
+
+  ListIterator(const ListIterator&) = default;
+  ListIterator(ListIterator&&) noexcept = default;
+  ListIterator& operator=(const ListIterator&) = default;
+  ListIterator& operator=(ListIterator&&) noexcept = default;
+
+  ListIterator& operator++() {
+      ++iterator_;
+      return *this;
+  }
+
+  ListIterator operator++(int) {
+      ListIterator copy(*this);
+      ++*this;
+      return copy;
+  }
+
+  ListIterator& operator--() {
+      --iterator_;
+      return *this;
+  }
+
+  ListIterator operator--(int) {
+      ListIterator copy(*this);
+      --*this;
+      return copy;
+  }
+
+  ListIterator& operator+=(typename List::size_type offset) {
+      iterator_ += offset;
+      return *this;
+  }
+
+  ListIterator& operator-=(typename List::size_type offset) {
+      iterator_ -= offset;
+      return *this;
+  }
+
+  ListIterator operator+(typename List::size_type offset) const {
+    return ListIterator{iterator_ + offset};
+  }
+
+  ListIterator operator-(typename List::size_type offset) const {
+    return ListIterator{iterator_ - offset};
+  }
+
+  friend difference_type operator-(const ListIterator& lhs, const ListIterator& rhs) {
+    return lhs.iterator_ - rhs.iterator_;
+  }
+
+  ListElementReference operator*() const {
+    return {iterator_};
+  }
+
+  ListElementReference operator[](typename List::size_type offset) const {
+    return {iterator_ + offset};
+  }
+
+private:
+  explicit ListIterator(Iterator iterator): iterator_(std::move(iterator)) {}
+
+  Iterator iterator_;
+
+  friend bool operator==(const ListIterator& lhs, const ListIterator& rhs) {
+    return lhs.iterator_ == rhs.iterator_;
+  }
+
+  friend bool operator!=(const ListIterator& lhs, const ListIterator& rhs) {
+    return !(lhs == rhs);
+  }
+
+  friend bool operator<(const ListIterator& lhs, const ListIterator& rhs) {
+    return lhs.iterator_ < rhs.iterator_;
+  }
+
+  friend bool operator<=(const ListIterator& lhs, const ListIterator& rhs) {
+    return lhs.iterator_ <= rhs.iterator_;
+  }
+
+  friend bool operator>(const ListIterator& lhs, const ListIterator& rhs) {
+    return lhs.iterator_ > rhs.iterator_;
+  }
+
+  friend bool operator>=(const ListIterator& lhs, const ListIterator& rhs) {
+    return lhs.iterator_ >= rhs.iterator_;
+  }
+
+  friend class ListIterator;
+  friend class List;
+};
+
+template <class T> List<T> toTypedList(List<IValue> list);
+template <class T> List<IValue> toList(List<T>&& list);
+template <class T> List<IValue> toList(const List<T>& list);
+const IValue* ptr_to_first_element(const List<IValue>& list);
+}
+
+/**
+ * An object of this class stores a list of values of type T.
+ *
+ * This is a pointer type. After a copy, both Lists
+ * will share the same storage:
+ *
+ * > List<string> a;
+ * > List<string> b = a;
+ * > b.push_back("three");
+ * > ASSERT("three" == a.get(0));
+ *
+ * We use this class in the PyTorch kernel API instead of
+ * std::vector<T>, because that allows us to do optimizations
+ * and switch out the underlying list implementation without
+ * breaking backwards compatibility for the kernel API.
+ */
+template
+class List final {
+private:
+  // This is an intrusive_ptr because List is a pointer type.
+  // Invariant: This will never be a nullptr, there will always be a valid
+  // ListImpl.
+  c10::intrusive_ptr impl_;
+
+  using internal_reference_type = impl::ListElementReference;
+  using internal_const_reference_type = typename impl::ListElementConstReferenceTraits::const_reference;
+
+public:
+  using value_type = T;
+  using size_type = typename c10::detail::ListImpl::list_type::size_type;
+  using iterator = impl::ListIterator;
+  using const_iterator = impl::ListIterator;
+  using reverse_iterator = impl::ListIterator;
+
+  /**
+   * Constructs an empty list.
+   */
+  explicit List();
+
+  /**
+   * Constructs a list with some initial values.
+   * Example:
+   *   List a({2, 3, 4});
+   */
+  List(std::initializer_list initial_values);
+  explicit List(ArrayRef initial_values);
+
+  /**
+   * Create a generic list with runtime type information.
+   * This only works for c10::impl::GenericList and is not part of the public API
+   * but only supposed to be used internally by PyTorch.
+   */
+  explicit List(TypePtr elementType);
+
+  List(const List&) = default;
+  List& operator=(const List&) = default;
+
+  /**
+   * Create a new List pointing to a deep copy of the same data.
+   * The List returned is a new list with separate storage.
+   * Changes in it are not reflected in the original list or vice versa.
+   */
+  List copy() const;
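+
+  /*
+   * A short sketch contrasting the shared-storage semantics described above
+   * with `copy()`, assuming `int64_t` elements:
+   *
+   * > List<int64_t> a({1, 2, 3});
+   * > List<int64_t> b = a;        // shares storage with `a`
+   * > List<int64_t> c = a.copy(); // separate storage
+   * > b.push_back(4);             // visible through `a` as well
+   * > c.push_back(5);             // not visible through `a`
+   */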
+
+  /**
+   * Returns the element at specified location pos, with bounds checking.
+   * If pos is not within the range of the container, an exception of type std::out_of_range is thrown.
+   */
+  internal_const_reference_type get(size_type pos) const;
+
+  /**
+   * Moves out the element at the specified location pos and returns it, with bounds checking.
+   * If pos is not within the range of the container, an exception of type std::out_of_range is thrown.
+   * The list contains an invalid element at position pos afterwards. Any operations
+   * on it before re-setting it are invalid.
+   */
+  value_type extract(size_type pos) const;
+
+  /**
+   * Returns a reference to the element at specified location pos, with bounds checking.
+   * If pos is not within the range of the container, an exception of type std::out_of_range is thrown.
+   *
+   * You cannot store the reference, but you can read it and assign new values to it:
+   *
+   *   List list = ...;
+   *   list[2] = 5;
+   *   int64_t v = list[1];
+   */
+  internal_const_reference_type operator[](size_type pos) const;
+
+  internal_reference_type operator[](size_type pos);
+
+  /**
+   * Assigns a new value to the element at location pos.
+   */
+  void set(size_type pos, const value_type& value) const;
+
+  /**
+   * Assigns a new value to the element at location pos.
+   */
+  void set(size_type pos, value_type&& value) const;
+
+  /**
+   * Returns an iterator to the first element of the container.
+   * If the container is empty, the returned iterator will be equal to end().
+   */
+  iterator begin() const;
+
+  /**
+   * Returns an iterator to the element following the last element of the container.
+   * This element acts as a placeholder; attempting to access it results in undefined behavior.
+   */
+  iterator end() const;
+
+  /**
+   * Checks if the container has no elements.
+   */
+  bool empty() const;
+
+  /**
+   * Returns the number of elements in the container
+   */
+  size_type size() const;
+
+  /**
+   * Increase the capacity of the vector to a value that's greater or equal to new_cap.
+   */
+  void reserve(size_type new_cap) const;
+
+  /**
+   * Erases all elements from the container. After this call, size() returns zero.
+   * Invalidates any references, pointers, or iterators referring to contained elements. Any past-the-end iterators are also invalidated.
+   */
+  void clear() const;
+
+  /**
+   * Inserts value before pos.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  iterator insert(iterator pos, const T& value) const;
+
+  /**
+   * Inserts value before pos.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  iterator insert(iterator pos, T&& value) const;
+
+  /**
+   * Inserts a new element into the container directly before pos.
+   * The new element is constructed with the given arguments.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  template
+  iterator emplace(iterator pos, Args&&... value) const;
+
+  /**
+   * Appends the given element value to the end of the container.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  void push_back(const T& value) const;
+
+  /**
+   * Appends the given element value to the end of the container.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  void push_back(T&& value) const;
+
+  /**
+   * Appends the given list to the end of the container. Uses at most one memory allocation.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  void append(List lst) const;
+
+  /**
+   * Appends the given element value to the end of the container.
+   * The new element is constructed with the given arguments.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  template
+  void emplace_back(Args&&... args) const;
+
+  /**
+   * Removes the element at pos.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  iterator erase(iterator pos) const;
+
+  /**
+   * Removes the elements in the range [first, last).
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  iterator erase(iterator first, iterator last) const;
+
+  /**
+   * Removes the last element of the container.
+   * Calling pop_back on an empty container is undefined.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  void pop_back() const;
+
+  /**
+   * Resizes the container to contain count elements.
+   * If the current size is less than count, additional default-inserted elements are appended.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  void resize(size_type count) const;
+
+  /**
+   * Resizes the container to contain count elements.
+   * If the current size is less than count, additional copies of value are appended.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  void resize(size_type count, const T& value) const;
+
+  /**
+   * Value equality comparison. This function implements Python-like semantics for
+   * equality: two lists with the same identity (e.g. same pointer) trivially
+   * compare equal, otherwise each element is compared for equality.
+   */
+  template 
+  friend bool operator==(const List& lhs, const List& rhs);
+
+  template 
+  friend bool operator!=(const List& lhs, const List& rhs);
+
+  /**
+   * Identity comparison. Returns true if and only if `rhs` represents the same
+   * List object as `this`.
+   */
+  bool is(const List& rhs) const;
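+
+  /*
+   * A small sketch of the difference between value equality and identity,
+   * assuming `int64_t` elements:
+   *
+   * > List<int64_t> a({1, 2});
+   * > List<int64_t> b({1, 2});
+   * > bool same_values   = (a == b);  // true: elements compare equal
+   * > bool same_identity = a.is(b);   // false: different underlying storage
+   */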
+
+  std::vector vec() const;
+
+  /**
+   * Returns the number of Lists currently pointing to this same list.
+   * If this is the only instance pointing to this list, returns 1.
+   */
+  // TODO Test use_count
+  size_t use_count() const;
+
+  TypePtr elementType() const;
+
+  // See [unsafe set type] for why this exists.
+  void unsafeSetElementType(TypePtr t);
+
+private:
+  explicit List(c10::intrusive_ptr&& elements);
+  explicit List(const c10::intrusive_ptr& elements);
+  friend struct IValue;
+  template friend List impl::toTypedList(List);
+  template friend List impl::toList(List&&);
+  template friend List impl::toList(const List&);
+  friend const IValue* impl::ptr_to_first_element(const List& list);
+};
+
+namespace impl {
+// GenericList is how IValue stores lists. It is, however, not part of the
+// public API. Kernels should use Lists with concrete types instead
+// (maybe except for some internal prim ops).
+using GenericList = List<IValue>;
+
+const IValue* ptr_to_first_element(const GenericList& list);
+
+}
+}
+
+namespace torch {
+  template <class T> using List = c10::List<T>;
+}
+
+#include   // IWYU pragma: keep
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/List_inl.h b/MLPY/Lib/site-packages/torch/include/ATen/core/List_inl.h
new file mode 100644
index 0000000000000000000000000000000000000000..3866f77ad866551432b08b975bb30382edbf8ef5
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/List_inl.h
@@ -0,0 +1,360 @@
+#pragma once
+
+#include 
+#include 
+
+namespace c10 {
+
+template decltype(auto) getTypePtr();
+std::string toString(const Type& type);
+
+template
+List::List(c10::intrusive_ptr&& elements)
+: impl_(std::move(elements)) {}
+
+template
+List::List(const c10::intrusive_ptr& elements)
+: impl_(elements) {}
+
+template
+List::List()
+: List(make_intrusive(
+  typename c10::detail::ListImpl::list_type(),
+  getTypePtr())) {
+  static_assert(!std::is_same::value, "This constructor is not valid for List. Please use c10::impl::GenericList(elementType) instead.");
+}
+
+template
+List::List(ArrayRef values)
+: List(make_intrusive(
+    typename c10::detail::ListImpl::list_type(),
+    getTypePtr())) {
+  static_assert(!std::is_same::value, "This constructor is not valid for List. Please use c10::impl::GenericList(elementType).");
+  impl_->list.reserve(values.size());
+  for (const T& element : values) {
+    impl_->list.push_back(element);
+  }
+}
+
+template
+List::List(std::initializer_list initial_values)
+: List(ArrayRef(initial_values)) {
+  static_assert(!std::is_same::value, "This constructor is not valid for List. Please use c10::impl::GenericList(elementType).");
+}
+
+template
+List::List(TypePtr elementType)
+: List(make_intrusive(
+    typename c10::detail::ListImpl::list_type(),
+    std::move(elementType))) {
+  static_assert(std::is_same::value || std::is_same>::value,
+                "This constructor is only valid for c10::impl::GenericList or List.");
+}
+
+namespace impl {
+template
+List toTypedList(impl::GenericList list) {
+  // If there are other instances of the list (i.e. list.use_count() > 1), then we have to be invariant
+  // because upcasting would allow people to add types into the new list that would break the old list.
+  // However, if there aren't any other instances of this list (i.e. list.use_count() == 1), then we can
+  // allow upcasting. This can be a perf improvement since we can cast List<T> to List<optional<T>>
+  // without having to copy it. This is also used to provide backwards compatibility with some old models
+  // that serialized the index arguments to aten::index, aten::index_put, aten::index_put_ and aten::index_put_impl_
+  // as List<Tensor> before we changed that argument to be List<optional<Tensor>>. When deserializing, we
+  // have list.use_count() == 1 and can deserialize the List<Tensor> directly as List<optional<Tensor>>.
+  TORCH_CHECK(*list.impl_->elementType == *getTypePtr()
+    || (list.use_count() == 1 && list.impl_->elementType->isSubtypeOf(*getTypePtr()))
+    , "Tried to cast a List<", toString(*list.impl_->elementType), "> to a List<", toString(*getTypePtr()), ">. Types mismatch.");
+  return List(std::move(list.impl_));
+}
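+
+/*
+ * A sketch of the covariant cast that the check above permits, assuming a
+ * generic list of Tensors with a single owner (use_count() == 1), e.g. one
+ * freshly produced by deserialization:
+ *
+ * > c10::impl::GenericList generic(c10::TensorType::get());
+ * > // Sole owner, so upcasting the element type cannot break another list:
+ * > auto typed =
+ * >     c10::impl::toTypedList<c10::optional<at::Tensor>>(std::move(generic));
+ */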
+
+template
+impl::GenericList toList(List&& list) {
+  return GenericList(std::move(list.impl_));
+}
+template
+impl::GenericList toList(const List& list) {
+  return GenericList(list.impl_);
+}
+}
+
+template
+List List::copy() const {
+  return List(impl_->copy());
+}
+
+namespace detail {
+  template
+  T list_element_to(T element) {
+    return element;
+  }
+  template
+  T list_element_to(const IValue& element) {
+    return element.template to();
+  }
+  template
+  T list_element_to(IValue&& element) {
+    return std::move(element).template to();
+  }
+  template
+  struct ListElementFrom {
+    static IValue from(const T& element) {
+      return element;
+    }
+    static IValue from(T&& element) {
+      return std::move(element);
+    }
+  };
+  template<>
+  struct ListElementFrom {
+    static const IValue& from(const IValue& element) {
+      return element;
+    }
+    static IValue&& from(IValue&& element) {
+      return std::move(element);
+    }
+  };
+}
+
+namespace impl {
+
+template 
+ListElementReference::operator std::conditional_t<
+    std::is_reference::type>::value,
+    const T&,
+    T>() const {
+  return iterator_->template to();
+}
+
+template
+ListElementReference& ListElementReference::operator=(T&& new_value) && {
+  *iterator_ = c10::detail::ListElementFrom::from(std::move(new_value));
+  return *this;
+}
+
+template
+ListElementReference& ListElementReference::operator=(const T& new_value) && {
+  *iterator_ = c10::detail::ListElementFrom::from(new_value);
+  return *this;
+}
+
+template
+ListElementReference& ListElementReference::operator=(ListElementReference&& rhs) && noexcept {
+  *iterator_ = *rhs.iterator_;
+  return *this;
+}
+
+template
+void swap(ListElementReference&& lhs, ListElementReference&& rhs) {
+  std::swap(*lhs.iterator_, *rhs.iterator_);
+}
+
+template
+bool operator==(const ListElementReference& lhs, const T& rhs) {
+  const T& lhs_tmp = lhs;
+  return lhs_tmp == rhs;
+}
+
+template
+inline bool operator==(const T& lhs, const ListElementReference& rhs) {
+  return rhs == lhs;
+}
+
+template
+inline typename ListElementConstReferenceTraits::const_reference
+list_element_to_const_ref(const IValue& element) {
+  return element.template to();
+}
+
+template<>
+inline typename ListElementConstReferenceTraits>::const_reference
+list_element_to_const_ref>(const IValue& element) {
+  return element.toOptionalStringRef();
+}
+
+} // namespace impl
+
+template
+void List::set(size_type pos, const value_type& value) const {
+  impl_->list.at(pos) = c10::detail::ListElementFrom::from(value);
+}
+
+template
+void List::set(size_type pos, value_type&& value) const {
+  impl_->list.at(pos) = c10::detail::ListElementFrom::from(std::move(value));
+}
+
+template
+typename List::internal_const_reference_type List::get(size_type pos) const {
+  return operator[](pos);
+}
+
+template
+typename List::internal_const_reference_type List::operator[](size_type pos) const {
+  return c10::impl::list_element_to_const_ref(impl_->list.at(pos));
+}
+
+template
+typename List::internal_reference_type List::operator[](size_type pos) {
+  static_cast<void>(impl_->list.at(pos)); // Throw the exception if it is out of range.
+  return {impl_->list.begin() + static_cast<typename decltype(impl_->list)::difference_type>(pos)};
+}
+
+template
+typename List::value_type List::extract(size_type pos) const {
+  auto& elem = impl_->list.at(pos);
+  auto result = c10::detail::list_element_to(std::move(elem));
+  // Reset the list element to a T() instead of None to keep it correctly typed
+  elem = c10::detail::ListElementFrom::from(T{});
+  return result;
+}
+
+template
+typename List::iterator List::begin() const {
+  return iterator(impl_->list.begin());
+}
+
+template
+typename List::iterator List::end() const {
+  return iterator(impl_->list.end());
+}
+
+template
+bool List::empty() const {
+  return impl_->list.empty();
+}
+
+template
+typename List::size_type List::size() const {
+  return impl_->list.size();
+}
+
+template
+void List::reserve(size_type new_cap) const {
+  impl_->list.reserve(new_cap);
+}
+
+template
+void List::clear() const {
+  impl_->list.clear();
+}
+
+template
+typename List::iterator List::insert(iterator pos, const T& value) const {
+  return iterator { impl_->list.insert(pos.iterator_, c10::detail::ListElementFrom::from(value)) };
+}
+
+template
+typename List::iterator List::insert(iterator pos, T&& value) const {
+  return iterator { impl_->list.insert(pos.iterator_, c10::detail::ListElementFrom::from(std::move(value))) };
+}
+
+template
+template
+typename List::iterator List::emplace(iterator pos, Args&&... value) const {
+  // TODO Use list_element_from?
+  return iterator { impl_->list.emplace(pos.iterator_, std::forward(value)...) };
+}
+
+template
+void List::push_back(const T& value) const {
+  impl_->list.push_back(c10::detail::ListElementFrom::from(value));
+}
+
+template
+void List::push_back(T&& value) const {
+  impl_->list.push_back(c10::detail::ListElementFrom::from(std::move(value)));
+}
+
+template
+void List::append(List b) const {
+  if (b.use_count() == 1) {
+    impl_->list.insert(impl_->list.end(), make_move_iterator(b.impl_->list.begin()), make_move_iterator(b.impl_->list.end()));
+  } else {
+    impl_->list.insert(impl_->list.end(), b.impl_->list.begin(), b.impl_->list.end());
+  }
+}
+
+template
+template
+void List::emplace_back(Args&&... args) const {
+  // TODO Use list_element_from?
+  impl_->list.push_back(T(std::forward(args)...));
+}
+
+template
+typename List::iterator List::erase(iterator pos) const {
+  return iterator { impl_->list.erase(pos.iterator_) };
+}
+
+template
+typename List::iterator List::erase(iterator first, iterator last) const {
+  return iterator { impl_->list.erase(first.iterator_, last.iterator_) };
+}
+
+template
+void List::pop_back() const {
+  impl_->list.pop_back();
+}
+
+template
+void List::resize(size_type count) const {
+  impl_->list.resize(count, T{});
+}
+
+template
+void List::resize(size_type count, const T& value) const {
+  impl_->list.resize(count, value);
+}
+
+template
+bool operator==(const List& lhs, const List& rhs) {
+  // Lists with the same identity trivially compare equal.
+  if (lhs.impl_ == rhs.impl_) {
+    return true;
+  }
+
+  // Otherwise, just compare values directly.
+  return *lhs.impl_ == *rhs.impl_;
+}
+
+template
+bool operator!=(const List& lhs, const List& rhs) {
+  return !(lhs == rhs);
+}
+
+template
+bool List::is(const List& rhs) const {
+  return this->impl_ == rhs.impl_;
+}
+
+template
+std::vector List::vec() const {
+  std::vector result(begin(), end());
+  return result;
+}
+
+template
+size_t List::use_count() const {
+  return impl_.use_count();
+}
+
+template 
+TypePtr List::elementType() const {
+  return impl_->elementType;
+}
+
+template 
+void List::unsafeSetElementType(TypePtr t) {
+  impl_->elementType = std::move(t);
+}
+
+namespace impl {
+
+inline const IValue* ptr_to_first_element(const GenericList& list) {
+  return &list.impl_->list[0];
+}
+
+}
+}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/MT19937RNGEngine.h b/MLPY/Lib/site-packages/torch/include/ATen/core/MT19937RNGEngine.h
new file mode 100644
index 0000000000000000000000000000000000000000..7aaebf8289e5c3ce80411846ea8f24ca29c3f620
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/MT19937RNGEngine.h
@@ -0,0 +1,194 @@
+#pragma once
+
+#include 
+
+// define constants like M_PI and C keywords for MSVC
+#ifdef _MSC_VER
+#ifndef _USE_MATH_DEFINES
+#define _USE_MATH_DEFINES
+#endif
+#include 
+#endif
+
+#include 
+#include 
+#include 
+
+namespace at {
+
+constexpr int MERSENNE_STATE_N = 624;
+constexpr int MERSENNE_STATE_M = 397;
+constexpr uint32_t MATRIX_A = 0x9908b0df;
+constexpr uint32_t UMASK = 0x80000000;
+constexpr uint32_t LMASK = 0x7fffffff;
+
+/**
+ * Note [Mt19937 Engine implementation]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * Originally implemented in:
+ * http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/MT2002/CODES/MTARCOK/mt19937ar-cok.c
+ * and modified with C++ constructs. Moreover the state array of the engine
+ * has been modified to hold 32 bit uints instead of 64 bits.
+ *
+ * Note that we reimplemented mt19937 instead of using std::mt19937 because,
+ * at::mt19937 turns out to be faster in the pytorch codebase. PyTorch builds with -O2
+ * by default and following are the benchmark numbers (benchmark code can be found at
+ * https://github.com/syed-ahmed/benchmark-rngs):
+ *
+ * with -O2
+ * Time to get 100000000 philox randoms with at::uniform_real_distribution = 0.462759s
+ * Time to get 100000000 at::mt19937 randoms with at::uniform_real_distribution = 0.39628s
+ * Time to get 100000000 std::mt19937 randoms with std::uniform_real_distribution = 0.352087s
+ * Time to get 100000000 std::mt19937 randoms with at::uniform_real_distribution = 0.419454s
+ *
+ * std::mt19937 is faster when used in conjunction with std::uniform_real_distribution,
+ * however we can't use std::uniform_real_distribution because of this bug:
+ * http://open-std.org/JTC1/SC22/WG21/docs/lwg-active.html#2524. Plus, even if we used
+ * std::uniform_real_distribution and filtered out the 1's, it is a different algorithm
+ * than what's in pytorch currently and that messes up the tests in tests_distributions.py.
+ * The other option, using std::mt19937 with at::uniform_real_distribution is a tad bit slower
+ * than at::mt19937 with at::uniform_real_distribution and hence, we went with the latter.
+ *
+ * Copyright notice:
+ * A C-program for MT19937, with initialization improved 2002/2/10.
+ * Coded by Takuji Nishimura and Makoto Matsumoto.
+ * This is a faster version by taking Shawn Cokus's optimization,
+ * Matthe Bellew's simplification, Isaku Wada's real version.
+ *
+ * Before using, initialize the state by using init_genrand(seed)
+ * or init_by_array(init_key, key_length).
+ *
+ * Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   1. Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ *
+ *   2. Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ *
+ *   3. The names of its contributors may not be used to endorse or promote
+ *   products derived from this software without specific prior written
+ *   permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * Any feedback is very welcome.
+ * http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
+ * email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space)
+ */
+
+/**
+ * mt19937_data_pod is used to get POD data in and out
+ * of mt19937_engine. Used in torch.get_rng_state and
+ * torch.set_rng_state functions.
+ */
+struct mt19937_data_pod {
+  uint64_t seed_;
+  int left_;
+  bool seeded_;
+  uint32_t next_;
+  std::array<uint32_t, MERSENNE_STATE_N> state_;
+};
+
+class mt19937_engine {
+public:
+
+  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
+  inline explicit mt19937_engine(uint64_t seed = 5489) {
+    init_with_uint32(seed);
+  }
+
+  inline mt19937_data_pod data() const {
+    return data_;
+  }
+
+  inline void set_data(const mt19937_data_pod& data) {
+    data_ = data;
+  }
+
+  inline uint64_t seed() const {
+    return data_.seed_;
+  }
+
+  inline bool is_valid() {
+    if ((data_.seeded_ == true)
+      && (data_.left_ > 0 && data_.left_ <= MERSENNE_STATE_N)
+      && (data_.next_ <= MERSENNE_STATE_N)) {
+      return true;
+    }
+    return false;
+  }
+
+  inline uint32_t operator()() {
+    if (--(data_.left_) == 0) {
+        next_state();
+    }
+    uint32_t y = *(data_.state_.data() + data_.next_++);
+    y ^= (y >> 11);
+    y ^= (y << 7) & 0x9d2c5680;
+    y ^= (y << 15) & 0xefc60000;
+    y ^= (y >> 18);
+
+    return y;
+  }
+
+private:
+  mt19937_data_pod data_;
+
+  inline void init_with_uint32(uint64_t seed) {
+    data_.seed_ = seed;
+    data_.seeded_ = true;
+    data_.state_[0] = seed & 0xffffffff;
+    for (const auto j : c10::irange(1, MERSENNE_STATE_N)) {
+      data_.state_[j] = (1812433253 * (data_.state_[j-1] ^ (data_.state_[j-1] >> 30)) + j);
+    }
+    data_.left_ = 1;
+    data_.next_ = 0;
+  }
+
+  inline uint32_t mix_bits(uint32_t u, uint32_t v) {
+    return (u & UMASK) | (v & LMASK);
+  }
+
+  inline uint32_t twist(uint32_t u, uint32_t v) {
+    return (mix_bits(u,v) >> 1) ^ (v & 1 ? MATRIX_A : 0);
+  }
+
+  inline void next_state() {
+    uint32_t* p = data_.state_.data();
+    data_.left_ = MERSENNE_STATE_N;
+    data_.next_ = 0;
+
+    for(int j = MERSENNE_STATE_N - MERSENNE_STATE_M + 1; --j; p++) {
+      *p = p[MERSENNE_STATE_M] ^ twist(p[0], p[1]);
+    }
+
+    for(int j = MERSENNE_STATE_M; --j; p++) {
+      *p = p[MERSENNE_STATE_M - MERSENNE_STATE_N] ^ twist(p[0], p[1]);
+    }
+
+    *p = p[MERSENNE_STATE_M - MERSENNE_STATE_N] ^ twist(p[0], data_.state_[0]);
+  }
+
+};
+
+typedef mt19937_engine mt19937;
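+
+/*
+ * A minimal usage sketch of the engine defined above:
+ *
+ * > at::mt19937 gen(5489);  // construct with an explicit seed
+ * > uint32_t r = gen();     // draw one 32-bit random number
+ * >
+ * > // The POD state can be captured and restored, which is how
+ * > // torch.get_rng_state / torch.set_rng_state round-trip the generator.
+ * > at::mt19937_data_pod snapshot = gen.data();
+ * > gen.set_data(snapshot);
+ */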
+
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/NamedTensor.h b/MLPY/Lib/site-packages/torch/include/ATen/core/NamedTensor.h
new file mode 100644
index 0000000000000000000000000000000000000000..55db7ca5aae53673fbe90b47e3c90a3c88784afe
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/NamedTensor.h
@@ -0,0 +1,139 @@
+#pragma once
+
+#include 
+#include 
+
+namespace at {
+
+class TensorBase;
+
+// XXX: This file exists because TensorImpl is in c10, but Dimname is in ATen.
+// Due to the c10/ATen library split, TensorImpl cannot depend on Dimname,
+// so we have a couple of workarounds.
+//
+// In the long term, we'll move Dimname to c10 and everything in this file
+// can be refactored out. The main blocker for that is that "c10::Symbol"
+// actually exists outside of c10 and needs to be moved in.
+
+// TensorImpl has a unique_ptr<NamedTensorMetaInterface> field.
+// XXX: Ideally we would just put optional<vector<Dimname>> into TensorImpl.
+//
+// This class has an important invariant: there must be at least ONE
+// non-wildcard dimname.
+struct TORCH_API NamedTensorMeta final : public c10::NamedTensorMetaInterface {
+  // This enum is to remind people that the invariant on constructors is that
+  // the list of dimnames must have at least one non-wildcard
+  enum HAS_NON_WILDCARD {
+    HasNonWildcard
+  };
+
+  explicit NamedTensorMeta(HAS_NON_WILDCARD, DimnameList names)
+    : names_(names.vec()) {
+    check_invariants();
+  }
+  explicit NamedTensorMeta(HAS_NON_WILDCARD, std::vector&& names)
+    : names_(std::move(names)) {
+    check_invariants();
+  }
+
+  std::unique_ptr clone() const override {
+    return std::make_unique(HasNonWildcard, names_);
+  }
+
+  DimnameList names() const { return names_; }
+
+  // Used for an assertion in TensorImpl.h
+  int64_t slow_dim() const override {
+    return names_.size();
+  }
+
+  void check_invariants() const {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      std::any_of(names_.begin(), names_.end(), [](const Dimname& n) { return !n.isWildcard(); }));
+  }
+
+  void set_names(HAS_NON_WILDCARD, DimnameList new_names) {
+    TORCH_INTERNAL_ASSERT(new_names.size() == names_.size());
+    std::copy(new_names.begin(), new_names.end(), names_.begin());
+    check_invariants();
+  }
+
+  void set_names(HAS_NON_WILDCARD, std::vector&& new_names) {
+    TORCH_INTERNAL_ASSERT(new_names.size() == names_.size());
+    names_ = std::move(new_names);
+    check_invariants();
+  }
+
+  // INVARIANT: at least one Dimname is non-WILDCARD
+  std::vector names_;
+};
+
+// When NamesMode is disabled, then all operations ignore tensors' names fields.
+// Concretely speaking, all tensors are treated as having nullopt names.
+struct TORCH_API NamesMode {
+  static bool is_enabled();
+  static void set_enabled(bool enabled);
+};
+
+
+// A RAII, thread local (!) guard that enables or disables names upon
+// construction, and sets it back to the original value upon destruction.
+struct TORCH_API NoNamesGuard {
+  NoNamesGuard() : prev_mode(NamesMode::is_enabled()), initialized(true) {
+    NamesMode::set_enabled(false);
+  }
+  ~NoNamesGuard() {
+    if (initialized) {
+      reset();
+    }
+  }
+  void reset() {
+    TORCH_INTERNAL_ASSERT(initialized);
+    NamesMode::set_enabled(prev_mode);
+  }
+ private:
+  bool prev_mode;
+  bool initialized;
+};
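+
+/*
+ * A small sketch of the guard above, for code that must temporarily ignore
+ * dimension names:
+ *
+ * > {
+ * >   at::NoNamesGuard guard;  // NamesMode disabled from here on
+ * >   // ... ops in this scope behave as if all tensors were unnamed ...
+ * > }                          // previous NamesMode value restored
+ */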
+
+void check_names_valid_for(const TensorBase& tensor, DimnameList names);
+void check_names_valid_for(size_t tensor_dim, DimnameList names);
+
+// Sets the names of `tensor` to be `names`.
+TORCH_API const TensorBase& internal_set_names_inplace(const TensorBase& tensor, c10::optional names);
+TORCH_API const TensorBase& internal_set_names_inplace(const TensorBase& tensor, std::vector&& names, bool validate_names);
+
+constexpr size_t kMaxNamedTensorDim = 64;
+
+DimnameList default_names(size_t len);
+
+namespace impl {
+
+// Some helper functions on TensorImpl. Useful for working with names in TH.
+// XXX: Ideally these would exist as methods on TensorImpl
+TORCH_API void internal_set_names_inplace(TensorImpl* impl, c10::optional names, bool validate_names);
+TORCH_API void internal_set_names_inplace(TensorImpl* impl, std::vector&& names, bool validate_names);
+
+void check_names_valid_for(TensorImpl* impl, DimnameList names);
+
+// Returns true if the tensor's names exist and are not all 'None'.
+// Returns false if the tensor's names don't exist (were not allocated),
+// or if all names are 'None'.
+// We treat not-allocated-names the same as allocated names that are all 'None'.
+TORCH_API bool has_names(const TensorImpl* impl);
+
+// Returns the names of the tensor's dimensions.
+// Unnamed tensors are treated as having 'None' in all dimensions; this method
+// would return a DimnameList of all 'None's for an unnamed tensor.
+TORCH_API DimnameList get_names(const TensorImpl* impl);
+
+// This is more of an implementation detail; one should use impl::get_names /
+// Tensor::names() whenever possible because it provides a cleaner API.
+// Returns the names of the tensor if they have been allocated; returns nullopt
+// instead if they haven't been. The names of a tensor are not allocated if a
+// tensor is constructed with names=None.
+TORCH_API c10::optional get_opt_names(const TensorImpl* impl);
+
+} // namespace impl
+
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/NestedIntSymNodeImpl.h b/MLPY/Lib/site-packages/torch/include/ATen/core/NestedIntSymNodeImpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..067e18717dcd51b755e4babb096e1fa3a52ce43c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/NestedIntSymNodeImpl.h
@@ -0,0 +1,186 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace c10 {
+
+// The motivating usecase for this is to represent the ragged size structure
+// of a jagged tensor [B, [s_0, s_1, s_2], D] as a single integer j0. This
+// allows us to simply return [B, j0, D] if someone queries for the size of our
+// tensor.
+//
+// Morally we define comparison between two nested ints to return true if
+// that comparison holds for all corresponding elements of the arrays they
+// represent. Comparison between a nested int and a plain int is defined
+// similarly.
+//
+// To simulate this desired behavior but also avoid the O(N) cost of checking,
+// we associate each raggedness pattern with an integer "id" that can be used as
+// a proxy to evaluate equality. We also constrain the range of values for this
+// as to enable inequality checks.
+//
+// We also support a positive integer scalar "coeff" that is used for computing
+// strides. For example, given a [B, j0, D] tensor, it can be strided in two
+// different ways: [D * j0, D, 1] and [j0, 1, sum(j0)]. The coeff is used to
+// differentiate the two cases.
+//
+// During tracing the strides of the outputs need to be a function of the size
+// and strides of the inputs so it is important that NestedIntSymNode itself is
+// able to express this.
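+//
+// A small sketch of the semantics above (values are illustrative): with
+// j0 = NestedIntSymNodeImpl(/*val=*/0, /*coeff=*/1) standing in for the
+// ragged dimension,
+//
+//   j0.str()  returns "j0"
+//   j0 >= 1   is True  (every nested int is known to be >= 2, see the note
+//             on inequalities below)
+//   j0 == 0   is False
+//   j0 <  3   is indeterminate and errors out, as explained in that note.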
+class TORCH_API NestedIntSymNodeImpl : public SymNodeImpl {
+ public:
+  // CAUTION: you should probably not be constructing these directly; please
+  // use the higher-level API in Python instead (TODO: actually introduce that).
+  explicit NestedIntSymNodeImpl(int64_t val, int64_t coeff)
+      : val_(val), coeff_(coeff) {}
+
+  bool bool_() override {
+    return false;
+  }
+
+  bool is_int() override {
+    return true;
+  }
+
+  bool is_float() override {
+    return false;
+  }
+
+  bool is_bool() override {
+    return false;
+  }
+
+  bool is_nested_int() const override {
+    return true;
+  }
+
+  bool has_hint() override {
+    return true;
+  }
+
+  c10::SymNode wrap_int(int64_t num) override {
+    return SymNode(c10::make_intrusive>(num));
+  };
+
+  int64_t guard_int(const char* file, int64_t line) override {
+    TORCH_CHECK(false);
+  }
+
+  double guard_float(const char* file, int64_t line) override {
+    TORCH_CHECK(false, "not a float");
+  }
+
+  bool guard_bool(const char* file, int64_t line) override {
+    TORCH_CHECK(false, "not a bool");
+  }
+
+  int64_t int_() override {
+    TORCH_CHECK(false);
+  }
+
+  std::string str() override {
+    if (coeff_ == 1) {
+      return "j" + std::to_string(val_);
+    }
+    return std::to_string(coeff_) + "*j" + std::to_string(val_);
+  }
+
+  // NOTE [ Inequalities with nested int ]
+  //
+  // The semantics of nested int when it comes to relations is that it is
+  // treated as integer known to be within a certain range,
+  //
+  //     j0 \in [2, int64_t::max]
+  //
+  // allowing us to answer queries like j0 >= 1 (True), and j0 == 0 (False).
+  // This is a useful default range for the raggedness pattern of a jagged
+  // tensor (1) since sizes are non-negative, and (2) we need to get past 0/1
+  // specialization checks.
+  //
+  // [ Indeterminate inequalities error out ]
+  //
+  // Given the semantics defined above, certain relations like j0 < 3 are thus
+  // indeterminate. In our impl today, evaluating such relations errors out.
+  //
+  // It may seem convenient to just define indeterminate relations to return
+  // False, but the implementation we maintain in parallel using sympy does not
+  // allow this.
+  //
+  // Sympy only allows overriding of Ge. The other relations (Lt, Gt, Le) are,
+  // by consequence, all derived from Ge e.g., Lt(a, b) := !Ge(a, b). This
+  // would mean that if we define the indeterminate j0 >= 3 to be
+  // False, the (also indeterminate) j0 < 3 would evaluate to True!
+  //
+  // [ Coefficients are assumed positive ]
+  //
+  // For the purpose of computing inequalities, we consider the coefficient of
+  // the nested int to be a positive integer.
+  //
+  // Thus, no modifications are needed to the logic since
+  // j0 >= k implies coeff * j0 >= k
+  //
+  c10::SymNode eq(const c10::SymNode& other) override;
+  c10::SymNode ne(const c10::SymNode& other) override;
+  c10::SymNode ge(const c10::SymNode& other) override;
+  c10::SymNode gt(const c10::SymNode& other) override;
+  c10::SymNode lt(const c10::SymNode& other) override;
+  c10::SymNode le(const c10::SymNode& other) override;
+  c10::SymNode mul(const c10::SymNode& other) override;
+
+  c10::optional nested_int() override {
+    return val_;
+  }
+
+  c10::optional nested_int_coeff() override {
+    return coeff_;
+  }
+
+  bool is_symbolic() override {
+    return false;
+  }
+
+#define DEFINE_BINARY_NOT_SUPPORTED(name)                           \
+  c10::SymNode name(const c10::SymNode& other) override {           \
+    TORCH_CHECK(false, #name " not supported by NestedIntSymNode"); \
+  }
+
+  DEFINE_BINARY_NOT_SUPPORTED(add)
+  DEFINE_BINARY_NOT_SUPPORTED(sub)
+  DEFINE_BINARY_NOT_SUPPORTED(truediv)
+  DEFINE_BINARY_NOT_SUPPORTED(pow)
+  DEFINE_BINARY_NOT_SUPPORTED(floordiv)
+  DEFINE_BINARY_NOT_SUPPORTED(mod)
+  DEFINE_BINARY_NOT_SUPPORTED(sym_min)
+  DEFINE_BINARY_NOT_SUPPORTED(sym_max)
+  DEFINE_BINARY_NOT_SUPPORTED(sym_and)
+  DEFINE_BINARY_NOT_SUPPORTED(sym_or)
+
+#undef DEFINE_BINARY_NOT_SUPPORTED
+
+#define DEFINE_NOT_SUPPORTED(name)                                     \
+  c10::SymNode name() override {                                       \
+    TORCH_CHECK(false, #name " is not supported by NestedIntSymNode"); \
+  }
+
+  DEFINE_NOT_SUPPORTED(sym_not)
+  DEFINE_NOT_SUPPORTED(ceil)
+  DEFINE_NOT_SUPPORTED(floor)
+  DEFINE_NOT_SUPPORTED(neg)
+  DEFINE_NOT_SUPPORTED(clone)
+  DEFINE_NOT_SUPPORTED(sym_float)
+
+#undef DEFINE_NOT_SUPPORTED
+
+ private:
+  int64_t val_;
+  int64_t coeff_;
+};
+
+} // namespace c10
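A brief editorial sketch of the comparison semantics described in the notes above. nested_int_demo is a hypothetical function, and constructing the node directly is itself an assumption (the header advises using a higher-level API instead).

#include <ATen/core/NestedIntSymNodeImpl.h>
#include <iostream>

// Hypothetical demo of the documented semantics: j0 behaves like an integer
// known to lie in [2, INT64_MAX], with an optional positive coefficient.
void nested_int_demo() {
  c10::SymNode j0 =
      c10::make_intrusive<c10::NestedIntSymNodeImpl>(/*val=*/0, /*coeff=*/1);
  std::cout << j0->str() << std::endl;  // prints "j0"

  // Determinate query: j0 >= 2 holds over the whole assumed range, so ge()
  // yields a constant-true node.
  c10::SymNode ge_two = j0->ge(j0->wrap_int(2));
  (void)ge_two;

  // Indeterminate query: per the note above, j0 < 3 cannot be decided and is
  // expected to raise a TORCH_CHECK error rather than return false.
  // c10::SymNode lt_three = j0->lt(j0->wrap_int(3));
}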
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/PhiloxRNGEngine.h b/MLPY/Lib/site-packages/torch/include/ATen/core/PhiloxRNGEngine.h
new file mode 100644
index 0000000000000000000000000000000000000000..e061933486045108de1c672c4343c6fa08ae6629
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/PhiloxRNGEngine.h
@@ -0,0 +1,242 @@
+#pragma once
+
+// define constants like M_PI and C keywords for MSVC
+#ifdef _MSC_VER
+#define _USE_MATH_DEFINES
+#include 
+#endif
+
+
+#ifdef __CUDACC__
+#include 
+#endif
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+
+// typedefs for holding vector data
+namespace detail {
+
+typedef at::detail::Array UINT4;
+typedef at::detail::Array UINT2;
+typedef at::detail::Array DOUBLE2;
+typedef at::detail::Array FLOAT2;
+
+} // namespace detail
+
+/**
+ * Note [Philox Engine implementation]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * Originally implemented in PyTorch's fusion compiler
+ * Refer to: http://www.thesalmons.org/john/random123/papers/random123sc11.pdf
+ * for details regarding the engine.
+ *
+ * Note that currently this implementation of the philox engine is not used
+ * anywhere except for tests in cpu_generator_test.cpp. However, this engine
+ * will replace curandStatePhilox4_32_10_t in the future.
+ *
+ * The philox engine takes a seed value, a subsequence
+ * for starting the generation and an offset for the subsequence.
+ * Think of this engine as an algorithm producing a huge array. We are
+ * parallelizing this array by partitioning the huge array and assigning
+ * a thread index to each partition. In other words, each seed value
+ * (there are 2^64 possible seed values) gives a sub array of size
+ * 2^128 (each element in that array is a 128-bit number). The reasoning
+ * behind the array being of size 2^128 is that there are 2^64 possible
+ * thread index values and an array of size 2^64 for each of those
+ * thread indices. Hence 2^64 * 2^64 = 2^128 for each seed value.
+ *
+ * In short, this generator can produce 2^64 (seed values) * 2^128 (number
+ * of elements in an array given by a seed value) = 2^192 values.
+ *
+ * Arguments:
+ * seed:        Seed values could be any number from 0 to 2^64-1.
+ * subsequence: Subsequence is just the cuda thread indexing with:
+ *              - blockIdx.x * blockDim.x + threadIdx.x
+ * offset:      The offset variable in PhiloxEngine decides how many 128-bit
+ *              random numbers to skip (i.e. how many groups of 4, 32-bit numbers to skip)
+ *              and hence determines the total number of random numbers that can be
+ *              generated for the given subsequence.
+ */
+
+class philox_engine {
+public:
+
+  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
+  C10_HOST_DEVICE inline explicit philox_engine(uint64_t seed = 67280421310721,
+                                 uint64_t subsequence = 0,
+                                 uint64_t offset = 0) {
+
+    reset_state(seed, subsequence);
+    incr_n(offset);
+  }
+
+  C10_HOST_DEVICE inline void reset_state(uint64_t seed = 67280421310721,
+                                 uint64_t subsequence = 0) {
+    key_[0] = static_cast(seed);
+    key_[1] = static_cast(seed >> 32);
+    counter_ = detail::UINT4(0);
+    counter_[2] = static_cast(subsequence);
+    counter_[3] = static_cast(subsequence >> 32);
+    STATE = 0;
+  }
+
+  /**
+   * Set the offset field of Philox Generator to the desired offset.
+   */
+  C10_HOST_DEVICE inline void set_offset(uint64_t offset) {
+    counter_[0] = static_cast(offset);
+    counter_[1] = static_cast(offset >> 32);
+  }
+
+  /**
+   * Gets the current offset of the Philox Generator.
+   */
+  C10_HOST_DEVICE uint64_t get_offset() const {
+    uint64_t lo = static_cast(counter_[0]);
+    uint64_t hi = static_cast(counter_[1]) << 32;
+    return lo | hi;
+  }
+
+  /**
+   * Produces a unique 32-bit pseudo-random number on every invocation. Bookkeeps state to avoid waste.
+   */
+  C10_HOST_DEVICE inline uint32_t operator()(int32_t n_rounds = 10) { // 10 here to preserve back-compat behavior
+    if(STATE == 0) {
+      detail::UINT4 counter = counter_;
+      detail::UINT2 key = key_;
+      output_ = rand(counter, key, n_rounds);
+      incr();
+    }
+    uint32_t ret = output_[static_cast(STATE)];
+    STATE = (STATE + 1) & 3;
+    return ret;
+  }
+
+  inline float randn(uint32_t n_rounds) {
+    #ifdef __CUDA_ARCH__
+    AT_ASSERT(false, "Unsupported invocation of randn on CUDA");
+    #endif
+    if(STATE == 0) {
+      detail::UINT4 counter = counter_;
+      detail::UINT2 key = key_;
+      output_ = rand(counter, key, n_rounds);
+      incr();
+    }
+    // TODO(min-jean-cho) change to Polar method, a more efficient version of Box-Muller method
+    // TODO(voz) We use std:: below, and thus need a separate impl for CUDA.
+    float u1 = 1 - uint32_to_uniform_float(output_[0]); // uint32_to_uniform_float returns [0,1), we need (0,1] to avoid passing 0 to log.
+    float u2 = 1 - uint32_to_uniform_float(output_[1]);
+    return static_cast(std::sqrt(-2.0 * std::log(u1)) * std::cos(2.0 * M_PI * u2));
+  }
+
+  /**
+   * Skips N 128-bit numbers in a subsequence.
+   */
+  C10_HOST_DEVICE inline void incr_n(uint64_t n) {
+    uint32_t nlo = static_cast(n);
+    uint32_t nhi = static_cast(n >> 32);
+    counter_[0] += nlo;
+    // if overflow in x has occurred, carry over to nhi
+    if (counter_[0] < nlo) {
+      nhi++;
+      // if overflow in nhi has occurred during carry over,
+      // propagate that overflow to y and exit to increment z
+      // otherwise return
+      counter_[1] += nhi;
+      if(nhi != 0) {
+        if (nhi <= counter_[1]) {
+          return;
+        }
+      }
+    } else {
+      // if overflow in y has occurred during addition,
+      // exit to increment z
+      // otherwise return
+      counter_[1] += nhi;
+      if (nhi <= counter_[1]) {
+        return;
+      }
+    }
+    if (++counter_[2])
+      return;
+    ++counter_[3];
+  }
+
+  /**
+   * Skips one 128-bit number in a subsequence.
+   */
+  C10_HOST_DEVICE inline void incr() {
+    if (++counter_[0])
+      return;
+    if (++counter_[1])
+      return;
+    if (++counter_[2]) {
+      return;
+    }
+    ++counter_[3];
+  }
+
+private:
+  detail::UINT4 counter_;
+  detail::UINT4 output_;
+  detail::UINT2 key_;
+  uint32_t STATE;
+
+  C10_HOST_DEVICE inline uint32_t mulhilo32(uint32_t a, uint32_t b,
+                                    uint32_t *result_high) {
+    #ifdef __CUDA_ARCH__
+      *result_high = __umulhi(a, b);
+      return a*b;
+    #else
+      const uint64_t product = static_cast(a) * b;
+      *result_high = static_cast(product >> 32);
+      return static_cast(product);
+    #endif
+  }
+
+  C10_HOST_DEVICE inline detail::UINT4 single_round(detail::UINT4 ctr, detail::UINT2 in_key) {
+    uint32_t hi0 = 0;
+    uint32_t hi1 = 0;
+    uint32_t lo0 = mulhilo32(kPhiloxSA, ctr[0], &hi0);
+    uint32_t lo1 = mulhilo32(kPhiloxSB, ctr[2], &hi1);
+    detail::UINT4 ret;
+    ret[0] = hi1 ^ ctr[1] ^ in_key[0];
+    ret[1] = lo1;
+    ret[2] = hi0 ^ ctr[3] ^ in_key[1];
+    ret[3] = lo0;
+    return ret;
+  }
+
+  C10_HOST_DEVICE constexpr float uint32_to_uniform_float(uint32_t value) {
+      // maximum value such that `MAX_INT * scale < 1.0` (with float rounding)
+      constexpr float scale = 4.6566127342e-10;
+      return static_cast(value & 0x7FFFFFFF) * scale;
+  }
+
+
+
+  C10_HOST_DEVICE inline detail::UINT4 rand(detail::UINT4& counter, detail::UINT2& key, uint32_t n_rounds) {
+    for (uint32_t round = 0; round < (n_rounds - 1); round++) {
+        counter = single_round(counter, key);
+        key[0] += (kPhilox10A); key[1] += (kPhilox10B);
+      }
+    return single_round(counter, key);
+  }
+
+
+  static const uint32_t kPhilox10A = 0x9E3779B9;
+  static const uint32_t kPhilox10B = 0xBB67AE85;
+  static const uint32_t kPhiloxSA = 0xD2511F53;
+  static const uint32_t kPhiloxSB = 0xCD9E8D57;
+};
+
+typedef philox_engine Philox4_32;
+
+} // namespace at
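An editorial usage sketch of the engine documented above; sample_stream is a hypothetical helper showing how seed, subsequence, and offset select and advance a stream.

#include <ATen/core/PhiloxRNGEngine.h>
#include <cstdint>

// Hypothetical helper: each (seed, subsequence) pair selects an independent
// stream; the offset skips whole 128-bit blocks within that stream.
uint32_t sample_stream(uint64_t seed, uint64_t thread_idx) {
  at::Philox4_32 engine(seed, /*subsequence=*/thread_idx, /*offset=*/0);
  uint32_t a = engine();  // first 32-bit value of a 128-bit block
  uint32_t b = engine();  // second value from the same block, no recompute
  return a ^ b;
}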
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/PythonFallbackKernel.h b/MLPY/Lib/site-packages/torch/include/ATen/core/PythonFallbackKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..eab730ccef58e4b548eb12469cb8428860dc483d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/PythonFallbackKernel.h
@@ -0,0 +1,28 @@
+#pragma once
+#include 
+
+namespace at {
+namespace impl {
+
+struct TORCH_API RestorePythonTLSSnapshot {
+  RestorePythonTLSSnapshot();
+  ~RestorePythonTLSSnapshot();
+
+private:
+  c10::impl::LocalDispatchKeySet saved_;
+  c10::impl::ForceDispatchKeyGuard guard_;
+};
+
+
+// RAII guard to make working with the above TLS safer.
+struct TORCH_API MaybeSetTLSOnEntryGuard {
+public:
+  MaybeSetTLSOnEntryGuard();
+  ~MaybeSetTLSOnEntryGuard();
+
+private:
+  bool value_set_;
+};
+
+} // namespace impl
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/PythonOpRegistrationTrampoline.h b/MLPY/Lib/site-packages/torch/include/ATen/core/PythonOpRegistrationTrampoline.h
new file mode 100644
index 0000000000000000000000000000000000000000..979a21ef13a5647a9e839ca1b2632adbfbd2cf4d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/PythonOpRegistrationTrampoline.h
@@ -0,0 +1,23 @@
+#pragma once
+
+#include 
+
+// TODO: this can probably live in c10
+
+namespace at {
+namespace impl {
+
+class TORCH_API PythonOpRegistrationTrampoline final {
+  static std::atomic interpreter_;
+
+public:
+  //  Returns true if you successfully registered yourself (that means
+  //  you are in the hot seat for doing the operator registrations!)
+  static bool registerInterpreter(c10::impl::PyInterpreter*);
+
+  // Returns nullptr if no interpreter has been registered yet.
+  static c10::impl::PyInterpreter* getInterpreter();
+};
+
+} // namespace impl
+} // namespace at
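An editorial sketch of the registration contract described above; maybe_register is a hypothetical caller.

#include <ATen/core/PythonOpRegistrationTrampoline.h>

// Hypothetical caller: only the first interpreter to register "wins" and is
// responsible for performing the Python operator registrations.
void maybe_register(c10::impl::PyInterpreter* interp) {
  if (at::impl::PythonOpRegistrationTrampoline::registerInterpreter(interp)) {
    // This interpreter is in the hot seat: do the op registrations here.
  } else {
    // Someone else registered first; their interpreter can be queried.
    c10::impl::PyInterpreter* owner =
        at::impl::PythonOpRegistrationTrampoline::getInterpreter();
    (void)owner;
  }
}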
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/QuantizerBase.h b/MLPY/Lib/site-packages/torch/include/ATen/core/QuantizerBase.h
new file mode 100644
index 0000000000000000000000000000000000000000..320bb0e859785c0813e763412dbf92dfb92b98a3
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/QuantizerBase.h
@@ -0,0 +1,83 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+namespace at {
+
+class Tensor;
+struct QTensorImpl;
+struct Quantizer;
+using ConstQuantizerPtr = const c10::intrusive_ptr&;
+using QuantizerPtr = c10::intrusive_ptr;
+
+/**
+ * Quantizer is the class for storing all the information
+ * that's necessary to perform quantize and dequantize
+ * operation.
+ *
+ * We might have different types of quantization schemes and this is
+ * the base class for all quantizers.
+ *
+ * QTensorImpl will hold a pointer to Quantizer so that we can support
+ * different quantization schemes on Tensor.
+ *
+ * For example, the most common quantization scheme, Affine Quantization,
+ * requires scale and zero_point as parameters; we store scale and zero_point
+ * inside the instance and use them to quantize a float Tensor or
+ * dequantize a quantized Tensor.
+ *
+ * When you add a new type of leaf Quantizer class, please also
+ * make sure to add a corresponding QScheme enum value, since
+ * they should have a one-to-one mapping.
+ *
+ * Note about intrusive_ptr:
+ * A quantized Tensor holds an intrusive_ptr to a Quantizer, and multiple Tensors can
+ * share the same Quantizer. Quantizer should be immutable.
+ */
+struct TORCH_API Quantizer : public c10::intrusive_ptr_target {
+  const ScalarType scalar_type_;
+  explicit Quantizer(ScalarType scalar_type) : scalar_type_(scalar_type) {}
+  ~Quantizer() override;
+
+  // Copied from torch/csrc/jit/ir/scope.h
+  QuantizerPtr intrusive_from_this() {
+    c10::raw::intrusive_ptr::incref(this); // we are creating a new pointer
+                                           // from a raw `this` pointer
+                                           // so we need to bump the refcount
+                                           // to account for this ownership
+    return c10::intrusive_ptr::reclaim(this);
+  }
+
+  /**
+   * Each concrete Quantizer type should have a unique QScheme type.
+   */
+  virtual QScheme qscheme() const = 0;
+
+  ScalarType scalar_type() const {
+    return scalar_type_;
+  }
+
+  /**
+   * quantize a float Tensor into a quantized Tensor.
+   */
+  virtual Tensor quantize(const Tensor& t) = 0;
+
+  /**
+   * dequantize a quantized Tensor into a float Tensor.
+   */
+  virtual Tensor dequantize(const Tensor& t) = 0;
+
+  /**
+   * dequantize a quantized Tensor into a float Tensor, out= variant
+   */
+  virtual Tensor& dequantize_out(Tensor& out, const Tensor& t) = 0;
+
+  /**
+   * Compare against `other` for equality.
+   */
+  virtual bool equalTo(QuantizerPtr other) const = 0;
+};
+
+} // namespace at
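An editorial sketch of what a concrete leaf quantizer implementing the interface above could look like. MyAffineQuantizer is hypothetical; its quantize/dequantize bodies are deliberately elided, and it only illustrates overriding the virtuals and pairing with an existing QScheme value, as the note above requires.

#include <ATen/core/QuantizerBase.h>
#include <c10/util/Exception.h>

// Hypothetical per-tensor affine quantizer: stores scale/zero_point and pairs
// with the existing kPerTensorAffine QScheme.
struct MyAffineQuantizer : public at::Quantizer {
  double scale_;
  int64_t zero_point_;

  MyAffineQuantizer(at::ScalarType t, double scale, int64_t zero_point)
      : at::Quantizer(t), scale_(scale), zero_point_(zero_point) {}

  c10::QScheme qscheme() const override {
    return c10::kPerTensorAffine;
  }

  at::Tensor quantize(const at::Tensor& /*t*/) override {
    // A real implementation would compute round(t / scale_) + zero_point_ and
    // pack into the quantized dtype; elided in this sketch.
    TORCH_CHECK(false, "sketch only");
  }
  at::Tensor dequantize(const at::Tensor& /*t*/) override {
    TORCH_CHECK(false, "sketch only");
  }
  at::Tensor& dequantize_out(at::Tensor& /*out*/, const at::Tensor& /*t*/) override {
    TORCH_CHECK(false, "sketch only");
  }
  bool equalTo(at::QuantizerPtr other) const override {
    return other && other->qscheme() == qscheme() &&
           other->scalar_type() == scalar_type();
  }
};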
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/Range.h b/MLPY/Lib/site-packages/torch/include/ATen/core/Range.h
new file mode 100644
index 0000000000000000000000000000000000000000..eb79331a2fa8e6520929badeeab10868d9f6f23e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/Range.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include 
+#include 
+
+namespace at {
+
+struct Range {
+  Range(int64_t begin, int64_t end)
+    : begin(begin)
+    , end(end) {}
+
+  int64_t size() const { return end - begin; }
+
+  Range operator/(int64_t divisor) {
+    return Range(begin / divisor, end / divisor);
+  }
+
+  int64_t begin;
+  int64_t end;
+};
+
+std::ostream& operator<<(std::ostream& out, const Range& range);
+
+}  // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/Reduction.h b/MLPY/Lib/site-packages/torch/include/ATen/core/Reduction.h
new file mode 100644
index 0000000000000000000000000000000000000000..04a94e25e9fc6014bc13ffa0749a3791cceedf94
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/Reduction.h
@@ -0,0 +1,16 @@
+#pragma once
+
+namespace at {
+namespace Reduction {
+
+// NB: Keep this in sync with Reduction class in torch/nn/_reduction.py
+// These constants control the reduction behavior of loss functions.
+// Ideally, this would be a scoped enum, but jit doesn't support that
+enum Reduction {
+  None,             // Do not reduce
+  Mean,             // (Possibly weighted) mean of losses
+  Sum,              // Sum losses
+  END
+};
+} // namespace Reduction
+} // namespace at
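An editorial sketch of how these constants are typically consumed; apply_reduction is a hypothetical helper, not an ATen function.

#include <ATen/ATen.h>

// Hypothetical helper: finish a tensor of per-element losses according to the
// requested Reduction constant.
at::Tensor apply_reduction(const at::Tensor& losses, int64_t reduction) {
  switch (reduction) {
    case at::Reduction::None:
      return losses;
    case at::Reduction::Mean:
      return losses.mean();
    case at::Reduction::Sum:
      return losses.sum();
    default:
      TORCH_CHECK(false, "unknown reduction: ", reduction);
  }
}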
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/Scalar.h b/MLPY/Lib/site-packages/torch/include/ATen/core/Scalar.h
new file mode 100644
index 0000000000000000000000000000000000000000..7c1649491a2b69236e633783b794d97faf3546b1
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/Scalar.h
@@ -0,0 +1 @@
+#include 
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/ScalarType.h b/MLPY/Lib/site-packages/torch/include/ATen/core/ScalarType.h
new file mode 100644
index 0000000000000000000000000000000000000000..b83740b82dc25709e2aa8d2252c7fad88b4638dd
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/ScalarType.h
@@ -0,0 +1 @@
+#include 
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/Tensor.h b/MLPY/Lib/site-packages/torch/include/ATen/core/Tensor.h
new file mode 100644
index 0000000000000000000000000000000000000000..0c3c53f9577c0b9aa544e8c8975bb8ef5b3fb228
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/Tensor.h
@@ -0,0 +1,92 @@
+#pragma once
+
+#include 
+#include 
+
+namespace at {
+class TORCH_API OptionalTensorRef {
+ public:
+  OptionalTensorRef() = default;
+
+  ~OptionalTensorRef() {
+    ref_.unsafeReleaseTensorImpl();
+  }
+
+  OptionalTensorRef(const TensorBase& src)
+      : ref_(Tensor::unsafe_borrow_t{}, src) {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src.defined());
+  }
+
+  OptionalTensorRef(const OptionalTensorRef& rhs)
+      : ref_(Tensor::unsafe_borrow_t{}, rhs.ref_) {}
+
+  OptionalTensorRef& operator=(OptionalTensorRef rhs) {
+    std::swap(ref_, rhs.ref_);
+    return *this;
+  }
+
+  bool has_value() const {
+    return ref_.defined();
+  }
+
+  const Tensor& getTensorRef() const & {
+    return ref_;
+  }
+
+  const Tensor& operator*() const & {
+    return ref_;
+  }
+
+  const Tensor* operator->() const & {
+    return &ref_;
+  }
+
+  operator bool() const {
+    return ref_.defined();
+  }
+
+ private:
+  Tensor ref_;
+};
+
+// Used to convert a TensorBase (that may be undefined) to an at::Tensor
+// without bumping refcount.
+class TORCH_API TensorRef {
+ public:
+  ~TensorRef() {
+    ref_.unsafeReleaseTensorImpl();
+  }
+
+  TensorRef(const TensorBase& src)
+      : ref_(Tensor::unsafe_borrow_t{}, src) {}
+
+  const Tensor& operator*() const & {
+    return ref_;
+  }
+ private:
+  Tensor ref_;
+};
+
+template 
+auto Tensor::register_hook(T&& hook) const -> Tensor::hook_return_void_t {
+  // Return the grad argument in case of a hook with void return type to have an
+  // std::function with Tensor return type
+  static_assert(std::is_same::value,
+                "Expected hook to return void");
+  return _register_hook([fn=std::forward(hook)](const TensorBase& grad_base) {
+    TensorRef grad(grad_base);
+    fn(*grad);
+    return Tensor();
+  });
+}
+
+template 
+auto Tensor::register_hook(T&& hook) const -> Tensor::hook_return_var_t {
+  return _register_hook([fn=std::forward(hook)](const TensorBase& grad_base) {
+    TensorRef grad(grad_base);
+    Tensor ret = fn(*grad);
+    return TensorBase(std::move(ret));
+  });
+}
+
+} // namespace at
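An editorial sketch of the two hook flavours wrapped above; attach_hooks is hypothetical and assumes the tensor requires grad (hooks are only meaningful with autograd enabled).

#include <ATen/ATen.h>
#include <iostream>

// Hypothetical usage of the two register_hook overloads above. The tensor is
// assumed to require grad; hooks fire when its gradient is computed.
void attach_hooks(const at::Tensor& t) {
  // void-returning hook: the wrapper above supplies the empty Tensor return.
  t.register_hook([](const at::Tensor& grad) {
    std::cout << "grad norm: " << grad.norm().item<double>() << std::endl;
  });

  // Tensor-returning hook: the returned Tensor replaces the gradient.
  t.register_hook([](const at::Tensor& grad) { return grad * 2; });
}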
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/TensorAccessor.h b/MLPY/Lib/site-packages/torch/include/ATen/core/TensorAccessor.h
new file mode 100644
index 0000000000000000000000000000000000000000..f5e4cbf7b991dcb7f3033f70a63f2190c85fc278
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/TensorAccessor.h
@@ -0,0 +1,276 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+
+// The PtrTraits argument to the TensorAccessor/GenericPackedTensorAccessor
+// is used to enable the __restrict__ keyword/modifier for the data
+// passed to cuda.
+template 
+struct DefaultPtrTraits {
+  typedef T* PtrType;
+};
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+template 
+struct RestrictPtrTraits {
+  typedef T* __restrict__ PtrType;
+};
+#endif
+
+// TensorAccessorBase and TensorAccessor are used for both CPU and CUDA tensors.
+// For CUDA tensors they are used in device code (only). This means that we restrict ourselves
+// to functions and types available there (e.g. IntArrayRef isn't).
+
+// The PtrTraits argument is only relevant to cuda to support `__restrict__` pointers.
+template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+class TensorAccessorBase {
+public:
+  typedef typename PtrTraits::PtrType PtrType;
+
+  C10_HOST_DEVICE TensorAccessorBase(
+      PtrType data_,
+      const index_t* sizes_,
+      const index_t* strides_)
+      : data_(data_), sizes_(sizes_), strides_(strides_) {}
+  C10_HOST IntArrayRef sizes() const {
+    return IntArrayRef(sizes_,N);
+  }
+  C10_HOST IntArrayRef strides() const {
+    return IntArrayRef(strides_,N);
+  }
+  C10_HOST_DEVICE index_t stride(index_t i) const {
+    return strides_[i];
+  }
+  C10_HOST_DEVICE index_t size(index_t i) const {
+    return sizes_[i];
+  }
+  C10_HOST_DEVICE PtrType data() {
+    return data_;
+  }
+  C10_HOST_DEVICE const PtrType data() const {
+    return data_;
+  }
+protected:
+  PtrType data_;
+  const index_t* sizes_;
+  const index_t* strides_;
+};
+
+// The `TensorAccessor` is typically instantiated for CPU `Tensor`s using
+// `Tensor.accessor()`.
+// For CUDA `Tensor`s, `GenericPackedTensorAccessor` is used on the host and only
+// indexing on the device uses `TensorAccessor`s.
+template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+class TensorAccessor : public TensorAccessorBase {
+public:
+  typedef typename PtrTraits::PtrType PtrType;
+
+  C10_HOST_DEVICE TensorAccessor(
+      PtrType data_,
+      const index_t* sizes_,
+      const index_t* strides_)
+      : TensorAccessorBase(data_,sizes_,strides_) {}
+
+  C10_HOST_DEVICE TensorAccessor operator[](index_t i) {
+    return TensorAccessor(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1);
+  }
+
+  C10_HOST_DEVICE const TensorAccessor operator[](index_t i) const {
+    return TensorAccessor(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1);
+  }
+};
+
+template class PtrTraits, typename index_t>
+class TensorAccessor : public TensorAccessorBase {
+public:
+  typedef typename PtrTraits::PtrType PtrType;
+
+  C10_HOST_DEVICE TensorAccessor(
+      PtrType data_,
+      const index_t* sizes_,
+      const index_t* strides_)
+      : TensorAccessorBase(data_,sizes_,strides_) {}
+  C10_HOST_DEVICE T & operator[](index_t i) {
+    // NOLINTNEXTLINE(clang-analyzer-core.NullDereference)
+    return this->data_[this->strides_[0]*i];
+  }
+  C10_HOST_DEVICE const T & operator[](index_t i) const {
+    return this->data_[this->strides_[0]*i];
+  }
+};
+
+
+// GenericPackedTensorAccessorBase and GenericPackedTensorAccessor are used for
+// CUDA `Tensor`s on the host.
+// In contrast to `TensorAccessor`s, they copy the strides and sizes on instantiation (on the host)
+// in order to transfer them to the device when calling kernels.
+// On the device, indexing into multidimensional tensors yields `TensorAccessor`s.
+// Use RestrictPtrTraits as PtrTraits if you want the tensor's data pointer to be marked as __restrict__.
+// Instantiation from data, sizes, strides is only needed on the host and std::copy isn't available
+// on the device, so those functions are host only.
+template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+class GenericPackedTensorAccessorBase {
+public:
+  typedef typename PtrTraits::PtrType PtrType;
+  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
+  C10_HOST GenericPackedTensorAccessorBase(
+      PtrType data_,
+      const index_t* sizes_,
+      const index_t* strides_)
+      : data_(data_) {
+    std::copy(sizes_, sizes_ + N, std::begin(this->sizes_));
+    std::copy(strides_, strides_ + N, std::begin(this->strides_));
+  }
+
+  // if index_t is not int64_t, we want to have an int64_t constructor
+  template ::value>::type>
+  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
+  C10_HOST GenericPackedTensorAccessorBase(
+      PtrType data_,
+      const source_index_t* sizes_,
+      const source_index_t* strides_)
+      : data_(data_) {
+    for (const auto i : c10::irange(N)) {
+      this->sizes_[i] = sizes_[i];
+      this->strides_[i] = strides_[i];
+    }
+  }
+
+  C10_HOST_DEVICE index_t stride(index_t i) const {
+    return strides_[i];
+  }
+  C10_HOST_DEVICE index_t size(index_t i) const {
+    return sizes_[i];
+  }
+  C10_HOST_DEVICE PtrType data() {
+    return data_;
+  }
+  C10_HOST_DEVICE const PtrType data() const {
+    return data_;
+  }
+protected:
+  PtrType data_;
+  // NOLINTNEXTLINE(*c-arrays*)
+  index_t sizes_[N];
+  // NOLINTNEXTLINE(*c-arrays*)
+  index_t strides_[N];
+  C10_HOST void bounds_check_(index_t i) const {
+    TORCH_CHECK_INDEX(
+        0 <= i && i < index_t{N},
+        "Index ",
+        i,
+        " is not within bounds of a tensor of dimension ",
+        N);
+  }
+};
+
+template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+class GenericPackedTensorAccessor : public GenericPackedTensorAccessorBase {
+public:
+  typedef typename PtrTraits::PtrType PtrType;
+
+  C10_HOST GenericPackedTensorAccessor(
+      PtrType data_,
+      const index_t* sizes_,
+      const index_t* strides_)
+      : GenericPackedTensorAccessorBase(data_, sizes_, strides_) {}
+
+  // if index_t is not int64_t, we want to have an int64_t constructor
+  template ::value>::type>
+  C10_HOST GenericPackedTensorAccessor(
+      PtrType data_,
+      const source_index_t* sizes_,
+      const source_index_t* strides_)
+      : GenericPackedTensorAccessorBase(data_, sizes_, strides_) {}
+
+  C10_DEVICE TensorAccessor operator[](index_t i) {
+    index_t* new_sizes = this->sizes_ + 1;
+    index_t* new_strides = this->strides_ + 1;
+    return TensorAccessor(this->data_ + this->strides_[0]*i, new_sizes, new_strides);
+  }
+
+  C10_DEVICE const TensorAccessor operator[](index_t i) const {
+    const index_t* new_sizes = this->sizes_ + 1;
+    const index_t* new_strides = this->strides_ + 1;
+    return TensorAccessor(this->data_ + this->strides_[0]*i, new_sizes, new_strides);
+  }
+
+  /// Returns a PackedTensorAccessor of the same dimension after transposing the
+  /// two dimensions given. Does not actually move elements; transposition is
+  /// made by permuting the size/stride arrays. If the dimensions are not valid,
+  /// asserts.
+  C10_HOST GenericPackedTensorAccessor transpose(
+      index_t dim1,
+      index_t dim2) const {
+    this->bounds_check_(dim1);
+    this->bounds_check_(dim2);
+    GenericPackedTensorAccessor result(
+        this->data_, this->sizes_, this->strides_);
+    std::swap(result.strides_[dim1], result.strides_[dim2]);
+    std::swap(result.sizes_[dim1], result.sizes_[dim2]);
+    return result;
+  }
+};
+
+template class PtrTraits, typename index_t>
+class GenericPackedTensorAccessor : public GenericPackedTensorAccessorBase {
+public:
+  typedef typename PtrTraits::PtrType PtrType;
+  C10_HOST GenericPackedTensorAccessor(
+      PtrType data_,
+      const index_t* sizes_,
+      const index_t* strides_)
+      : GenericPackedTensorAccessorBase(data_, sizes_, strides_) {}
+
+  // if index_t is not int64_t, we want to have an int64_t constructor
+  template ::value>::type>
+  C10_HOST GenericPackedTensorAccessor(
+      PtrType data_,
+      const source_index_t* sizes_,
+      const source_index_t* strides_)
+      : GenericPackedTensorAccessorBase(data_, sizes_, strides_) {}
+
+  C10_DEVICE T & operator[](index_t i) {
+    return this->data_[this->strides_[0] * i];
+  }
+  C10_DEVICE const T& operator[](index_t i) const {
+    return this->data_[this->strides_[0]*i];
+  }
+
+  // Same as in the general N-dimensional case, but note that in the
+  // 1-dimensional case the returned PackedTensorAccessor will always be an
+  // identical copy of the original
+  C10_HOST GenericPackedTensorAccessor transpose(
+      index_t dim1,
+      index_t dim2) const {
+    this->bounds_check_(dim1);
+    this->bounds_check_(dim2);
+    return GenericPackedTensorAccessor(
+        this->data_, this->sizes_, this->strides_);
+  }
+};
+
+
+// Can't put this directly into the macro function args because of commas
+#define AT_X GenericPackedTensorAccessor
+
+// Old name for `GenericPackedTensorAccessor`
+template  class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+C10_DEFINE_DEPRECATED_USING(PackedTensorAccessor, AT_X)
+
+#undef AT_X
+
+template  class PtrTraits = DefaultPtrTraits>
+using PackedTensorAccessor32 = GenericPackedTensorAccessor;
+
+template  class PtrTraits = DefaultPtrTraits>
+using PackedTensorAccessor64 = GenericPackedTensorAccessor;
+} // namespace at
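An editorial sketch of the two entry points mentioned in the comments above; trace_cpu is hypothetical, and the CUDA half is shown only as comments since it would need a .cu kernel.

#include <ATen/ATen.h>
#include <algorithm>

// Hypothetical CPU example: Tensor::accessor checks dtype and rank, then
// indexes through sizes/strides without further dispatch.
float trace_cpu(const at::Tensor& t) {
  auto a = t.accessor<float, 2>();
  float sum = 0;
  for (int64_t i = 0; i < std::min(a.size(0), a.size(1)); ++i) {
    sum += a[i][i];
  }
  return sum;
}

// CUDA sketch (comments only): the packed accessor is built on the host and
// passed by value to the kernel, where operator[] yields TensorAccessors.
//   __global__ void kernel(at::PackedTensorAccessor32<float, 2> a) { ... }
//   kernel<<<grid, block>>>(t.packed_accessor32<float, 2>());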
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/TensorBase.h b/MLPY/Lib/site-packages/torch/include/ATen/core/TensorBase.h
new file mode 100644
index 0000000000000000000000000000000000000000..8102105aef7acd0eec70b22ae471e91fafd14192
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/TensorBase.h
@@ -0,0 +1,1055 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+
+namespace c10 {
+class Scalar;
+}
+
+namespace torch { namespace autograd {
+
+struct Node;
+
+}} // namespace torch::autograd
+
+namespace at {
+
+class Tensor;
+class TensorBase;
+
+// Convert Tensor to TensorBase without any need to include Tensor.h
+TORCH_API const TensorBase& get_tensor_base(const Tensor& t);
+
+namespace impl {
+inline bool variable_excluded_from_dispatch() {
+#ifdef C10_MOBILE
+  // Please read the comment in `VariableFallbackKernel.cpp` about the background of this change.
+  return true;
+#else
+  return c10::impl::tls_local_dispatch_key_set().excluded_.isSupersetOf(c10::autograd_dispatch_keyset);
+#endif
+}
+
+}
+
+// NOTE: [Tensor vs. TensorBase]
+//
+// Tensor, being the central data structure in PyTorch, gets used and
+// its header included almost everywhere. Unfortunately this means
+// every time an operator signature is updated or changed in
+// native_functions.yaml, you (and every other PyTorch developer) need
+// to recompile all of ATen and its dependencies.
+//
+// TensorBase aims to break up these header dependencies, and improve
+// incremental build times for all PyTorch developers. TensorBase
+// represents a reference counted handle to TensorImpl, exactly the
+// same as Tensor. However, TensorBase doesn't have code generated
+// methods in its API and thus no dependence on native_functions.yaml.
+//
+// Usage tips
+// ----------
+// - You can `#define TORCH_ASSERT_NO_OPERATORS` at the top of a .cpp
+//   or .cu file to ensure it has no header dependencies on
+//   native_functions.yaml (direct or indirect).
+// - Tensor inherits from TensorBase, so functions taking
+//   `const TensorBase &` are callable with Tensor as well.
+// - TensorBase can be converted to a Tensor with `Tensor(tensor_base)`,
+//   but this requires a reference-count bump. OptionalTensorRef on
+//   the other hand can materialize a `const Tensor &` without
+//   touching the reference-count.
+class TORCH_API TensorBase {
+ public:
+  struct unsafe_borrow_t { explicit unsafe_borrow_t() = default; };
+
+ protected:
+  // Create a Tensor with a +0 reference count. Special care must be
+  // taken to avoid decrementing this reference count at destruction
+  // time. Intended to support MaybeOwnedTraits.
+  explicit TensorBase(unsafe_borrow_t, const TensorBase& rhs)
+      : impl_(c10::intrusive_ptr::reclaim(rhs.impl_.get())) {}
+  friend MaybeOwnedTraits;
+
+ public:
+  TensorBase() = default;
+  // This constructor should not be used by end users and is an implementation
+  // detail invoked by autogenerated code.
+  explicit TensorBase(
+      c10::intrusive_ptr tensor_impl)
+      : impl_(std::move(tensor_impl)) {
+    if (impl_.get() == nullptr) {
+      throw std::runtime_error("TensorImpl with nullptr is not supported");
+    }
+  }
+  TensorBase(const TensorBase&) = default;
+  TensorBase(TensorBase&&) noexcept = default;
+
+ public:
+  // Creates a new wrapper from TensorImpl. Intentionally a free method because
+  // it should be used with care. Checks necessary invariants
+  static TensorBase wrap_tensor_impl(
+      c10::intrusive_ptr tensor_impl) {
+    TensorBase r(std::move(tensor_impl));
+    r.enforce_invariants();
+    return r;
+  }
+
+  int64_t dim() const {
+    return impl_->dim();
+  }
+  int64_t storage_offset() const {
+    return impl_->storage_offset();
+  }
+
+  TensorBase contiguous(MemoryFormat memory_format=MemoryFormat::Contiguous) const {
+    if (is_contiguous(memory_format)) {
+      return *this;
+    } else {
+      return __dispatch_contiguous(memory_format);
+    }
+  }
+
+  /// Should be used if *this can reasonably be expected to be contiguous and
+  /// performance is important.
+  /// Compared to contiguous, it saves a reference count
+  /// increment/decrement if *this is already contiguous, at the cost
+  /// in all cases of an extra pointer of stack usage, an extra branch
+  /// to access, and an extra branch at destruction time.
+  c10::MaybeOwned expect_contiguous(
+      MemoryFormat memory_format=MemoryFormat::Contiguous) const &;
+
+  // Use .contiguous() instead. Trying to borrow from a prvalue
+  // will only lead to trouble and dangling references.
+  c10::MaybeOwned expect_contiguous(
+      MemoryFormat memory_format=MemoryFormat::Contiguous) && = delete;
+
+  const TensorBase& fill_(const c10::Scalar& scalar) const;
+  const TensorBase& zero_() const;
+
+  TensorBase to(at::TensorOptions options={}, bool non_blocking=false, bool copy=false, c10::optional memory_format=c10::nullopt) const;
+
+  bool is_complex() const {
+    return at::isComplexType(this->scalar_type());
+  }
+
+  bool is_floating_point() const {
+    return at::isFloatingType(this->scalar_type());
+  }
+
+  bool is_signed() const {
+    return at::isSignedType(this->scalar_type());
+  }
+
+  c10::SymInt sym_size(int64_t dim) const {
+    return impl_->sym_size(dim);
+  }
+
+  c10::SymInt sym_stride(int64_t dim) const {
+    const auto sizes = this->sym_strides();
+    const auto ndim = static_cast(sizes.size());
+    // false is passed to maybe_wrap_dim so behavior is identical to array access (but with wrapping)
+    return sizes[c10::maybe_wrap_dim(dim, ndim, /*wrap_scalar=*/false)];
+
+  }
+
+  int64_t size(int64_t dim) const {
+    return impl_->size(dim);
+  }
+
+  int64_t stride(int64_t dim) const {
+    const auto strides = this->strides();
+    const auto ndim = static_cast(strides.size());
+    // false is passed to maybe_wrap_dim so behavior is identical to array access (but with wrapping)
+    return strides[c10::maybe_wrap_dim(dim, ndim, /*wrap_scalar=*/false)];
+  }
+
+  TensorImpl * unsafeGetTensorImpl() const {
+    return impl_.get();
+  }
+  TensorImpl * unsafeReleaseTensorImpl() {
+    return impl_.release();
+  }
+  const c10::intrusive_ptr& getIntrusivePtr() const {
+    return impl_;
+  }
+
+  c10::intrusive_ptr unsafeReleaseIntrusivePtr() {
+    return std::move(impl_);
+  }
+
+  bool defined() const {
+    return impl_;
+  }
+
+  void reset() {
+    impl_.reset();
+  }
+
+#if defined (_MSC_VER)
+  TensorBase& operator=(const TensorBase& x) & {
+    impl_ = x.impl_;
+    return *this;
+  };
+  TensorBase& operator=(TensorBase&& x) & noexcept {
+    impl_ = std::move(x.impl_);
+    return *this;
+  }
+#else
+  TensorBase& operator=(const TensorBase& x) & = default;
+  TensorBase& operator=(TensorBase&& x) & noexcept = default;
+#endif
+
+  // Ban assignment to rvalues, since at::Tensor (weirdly) performs a deep copy here
+  TensorBase& operator=(const TensorBase&) && = delete;
+  TensorBase& operator=(TensorBase&&) && noexcept = delete;
+
+  bool is_same(const TensorBase& other) const noexcept {
+    return impl_ == other.impl_;
+  }
+  size_t use_count() const noexcept {
+    return impl_.use_count();
+  }
+  size_t weak_use_count() const noexcept {
+    return impl_.weak_use_count();
+  }
+
+  std::string toString() const;
+
+  IntArrayRef sizes() const {
+    return impl_->sizes();
+  }
+  c10::SymIntArrayRef sym_sizes() const {
+    return impl_->sym_sizes();
+  }
+  c10::SymIntArrayRef sym_strides() const {
+    return impl_->sym_strides();
+  }
+  IntArrayRef strides() const {
+    return impl_->strides();
+  }
+  // See impl::get_opt_names in ATen/NamedTensor.h for docs.
+  c10::optional opt_names() const {
+    return impl::get_opt_names(unsafeGetTensorImpl());
+  }
+  // See impl::get_names in ATen/NamedTensor.h for docs.
+  DimnameList names() const {
+    return impl::get_names(unsafeGetTensorImpl());
+  }
+  int64_t ndimension() const {
+    return dim();
+  }
+
+  bool is_contiguous(at::MemoryFormat memory_format=at::MemoryFormat::Contiguous) const {
+    return impl_->is_contiguous(memory_format);
+  }
+
+  bool is_non_overlapping_and_dense() const {
+    return impl_->is_non_overlapping_and_dense();
+  }
+
+  at::MemoryFormat suggest_memory_format(
+      bool channels_last_strides_exact_match = false) const {
+    // Setting channels_last_strides_exact_match to true forces the function to
+    // also check the strides of 0- and 1-sized dimensions.
+    if (layout() == at::kStrided) {
+      if (impl_->is_strides_like_channels_last()) {
+        if (!channels_last_strides_exact_match ||
+            get_channels_last_strides_2d(sizes()) == strides()) {
+          return at::MemoryFormat::ChannelsLast;
+        }
+      }
+      else if (impl_->is_strides_like_channels_last_3d()) {
+        if (!channels_last_strides_exact_match ||
+            get_channels_last_strides_3d(sizes()) == strides()) {
+          return at::MemoryFormat::ChannelsLast3d;
+        }
+      }
+    }
+    return at::MemoryFormat::Contiguous;
+  }
+
+  // Total bytes consumed by the "view" of elements of the array.  Does not
+  // include size of metadata.  The number reported here does not necessarily
+  // correspond to the true physical memory consumed by a tensor; instead,
+  // it reports the memory the tensor would take *if* it were contiguous.
+  // Defined to be numel() * itemsize()
+  size_t nbytes() const {
+    TORCH_CHECK(layout () != at::kSparse,
+                "nbytes is not defined for sparse tensors.  If you want the size of the constituent " \
+                "tensors, add the nbytes of the indices and values.  If you want the size of the  " \
+                "equivalent dense tensor, multiply numel() by element_size()");
+    return impl_->numel() * impl_->itemsize();
+  }
+
+  c10::SymInt sym_nbytes() const {
+    TORCH_CHECK(layout () != at::kSparse,
+                "nbytes is not defined for sparse tensors.  If you want the size of the constituent " \
+                "tensors, add the nbytes of the indices and values.  If you want the size of the  " \
+                "equivalent dense tensor, multiply numel() by element_size()");
+    return impl_->sym_numel() * impl_->itemsize();
+  }
+
+  int64_t numel() const {
+    return impl_->numel();
+  }
+
+  c10::SymInt sym_numel() const {
+    return impl_->sym_numel();
+  }
+
+  c10::SymInt sym_storage_offset() const {
+    return impl_->sym_storage_offset();
+  }
+
+  // Length of one array element in bytes.  This is the traditional
+  // Numpy naming.
+  size_t itemsize() const {
+    return impl_->itemsize();
+  }
+
+  // Same as itemsize().  This is the PyTorch naming.
+  int64_t element_size() const {
+    return static_cast(impl_->itemsize());
+  }
+
+  DispatchKeySet key_set() const {
+    return impl_->key_set();
+  }
+  ScalarType scalar_type() const {
+    return typeMetaToScalarType(impl_->dtype());
+  }
+  bool has_storage() const {
+    return defined() && impl_->has_storage();
+  }
+  const Storage& storage() const {
+    return impl_->storage();
+  }
+  bool is_alias_of(const at::TensorBase& other) const{
+    return impl_->storage().is_alias_of(other.storage());
+  }
+
+  // Move the storage backend to shm based
+  // to enable memory sharing across processes.
+  //
+  // NB1: the ideal behavior of this API still requires further discussion
+  // but for now we are inclined to keep it consistent with existing THP behavior
+  // https://github.com/pytorch/pytorch/blob/4dca9bde0552afc67b5b74f4a0696fe6055709c4/torch/storage.py#L196-L212
+  // so we don't assert on anything here and rely on caller knowing
+  // what it's doing.
+  //
+  // NB2: this currently provides Linux fd based shm support only
+  // to simplify the storage lifetime management logic in ATen
+  // and similarly for now we are not adding support for file system based
+  // shm support like in THP due to additional GC manager support needed
+  // to prevent leaks.
+  // As such, calling this from non supported systems (e.g. Windows) would fail.
+  void share_memory_() {
+    at::share_memory_(*this);
+  }
+
+  inline bool _is_zerotensor() const {
+    return impl_->_is_zerotensor();
+  }
+
+  inline void _set_zero(bool zero) const {
+    impl_->_set_zero(zero);
+  }
+
+  inline bool is_conj() const {
+    return impl_->is_conj();
+  }
+
+  // sets the conjugate bit of a tensor.
+  // NOTE: The conjugate bit is supposed to be a read-only field. Only change this if you are sure
+  // that's what you want. Changing this might lead to incorrect behavior since conjugation is
+  // a lazy operation and we rely on this bit to determine if a conjugation needs to be materialized.
+  inline void _set_conj(bool conjugate) const {
+    impl_->_set_conj(conjugate);
+  }
+
+  inline bool is_neg() const {
+    return impl_->is_neg();
+  }
+
+  // sets the negative bit of a tensor.
+  // NOTE: The negative bit is supposed to be a read-only field. Only change this if you are sure
+  // that's what you want. Changing this might lead to incorrect behavior since we rely on this
+  // bit to determine if a negation needs to be materialized.
+  inline void _set_neg(bool negative) const {
+    impl_->_set_neg(negative);
+  }
+
+  /// Returns a `Tensor`'s layout.
+  Layout layout() const {
+    return impl_->layout();
+  }
+
+  /// Returns a `Tensor`'s dtype (`TypeMeta`).
+  caffe2::TypeMeta dtype() const {
+    return impl_->dtype();
+  }
+
+  /// Returns a `Tensor`'s device.
+  inline Device device() const {
+    return impl_->device();
+  }
+
+  /// Returns a `Tensor`'s device index.
+  DeviceIndex get_device() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->get_device();
+  }
+
+  /// Returns if a `Tensor` has CPU backend.
+  bool is_cpu() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->is_cpu();
+  }
+
+  /// Returns if a `Tensor` has CUDA backend.
+  bool is_cuda() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->is_cuda();
+  }
+
+  /// Returns if a `Tensor` has IPU backend.
+  bool is_ipu() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->is_ipu();
+  }
+
+  /// Returns if a `Tensor` has XPU backend.
+  bool is_xpu() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->is_xpu();
+  }
+
+  /// Returns if a `Tensor` has XLA backend.
+  bool is_xla() const {
+    return impl_->is_xla();
+  }
+
+  /// Returns if a `Tensor` has MTIA backend.
+  bool is_mtia() const {
+    return impl_->is_mtia();
+  }
+
+  /// Returns if a `Tensor` has HPU backend.
+  bool is_hpu() const {
+    return impl_->is_hpu();
+  }
+
+  /// Returns if a `Tensor` has Lazy backend.
+  bool is_lazy() const {
+    return impl_->is_lazy();
+  }
+
+  /// Returns if a `Tensor` has HIP backend.
+  bool is_hip() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->is_hip();
+  }
+
+  /// Returns if a `Tensor` has VE backend.
+  bool is_ve() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->is_ve();
+  }
+
+  /// Returns if a `Tensor` has PrivateUse1 backend.
+  bool is_privateuseone() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->is_privateuseone();
+  }
+
+  /// Returns if a `Tensor` has sparse backend.
+  bool is_sparse() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->is_sparse();
+  }
+
+  /// Returns if a `Tensor` has a sparse CSR backend.
+  bool is_sparse_csr() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->is_sparse_csr();
+  }
+
+  /// Returns if a `Tensor` is an mkldnn tensor.
+  bool is_mkldnn() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->is_mkldnn();
+  }
+
+  /// Returns if a `Tensor` is an MPS tensor.
+  bool is_mps() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->is_mps();
+  }
+
+  /// Returns if a `Tensor` is an ORT tensor.
+  bool is_ort() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->is_ort();
+  }
+
+  /// Returns if a `Tensor` is a Vulkan tensor.
+  bool is_vulkan() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->is_vulkan();
+  }
+
+  /// Returns if a `Tensor` is a Metal tensor.
+  bool is_metal() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->is_metal();
+  }
+
+  /// Returns if a `Tensor` has quantized backend.
+  bool is_quantized() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->is_quantized();
+  }
+
+  /// Returns if a `Tensor` is a meta tensor.  Meta tensors can
+  /// also have other designations.
+  bool is_meta() const {
+    return impl_->is_meta();
+  }
+
+  /// Returns if a `Tensor` is an inference tensor.
+  bool is_inference() const {
+    return impl_->is_inference();
+  }
+
+  // Returns if a `Tensor` is a NestedTensor.
+  bool is_nested() const {
+    return impl_->is_nested();
+  }
+
+  /// If a tensor is a quantized tensor, returns its quantizer
+  /// TODO: it's not in native_functions.yaml yet as it's not exposed to python
+  QuantizerPtr quantizer() const;
+
+  /// Returns if a `Tensor` has any dimension names
+  bool has_names() const {
+    // If a user is using unnamed tensors, then we can short-circuit right here.
+    // Otherwise, impl::has_names attempts to retrieve names.
+    if (!impl_->has_named_tensor_meta()) {
+      return false;
+    }
+    return impl::has_names(unsafeGetTensorImpl());
+  }
+
+  /// Returns a `Tensor`'s dimension names data structure
+  const NamedTensorMeta* get_named_tensor_meta() const {
+    return static_cast(impl_->named_tensor_meta());
+  }
+
+  NamedTensorMeta* get_named_tensor_meta() {
+    return static_cast(impl_->named_tensor_meta());
+  }
+
+  /// Returns the `TensorOptions` corresponding to this `Tensor`. Defined in
+  /// TensorOptions.h.
+  TensorOptions options() const {
+    return TensorOptions().dtype(dtype())
+                          .device(device())
+                          .layout(layout());
+  }
+
+  const void* const_data_ptr() const {
+    return this->unsafeGetTensorImpl()->data();
+  }
+
+  void* mutable_data_ptr() const {
+    return this->unsafeGetTensorImpl()->mutable_data();
+  }
+
+  // TODO(#97856) Make this return a const pointer. This currently
+  //              returns a non-const pointer because of the large
+  //              number of clients that we still want to audit before
+  //              migrating to mutable_data_ptr().
+  void* data_ptr() const {
+    return mutable_data_ptr();
+  }
+
+  template ::value, int> = 0>
+  const T* const_data_ptr() const;
+
+  template ::value, int> = 0>
+  const std::remove_const_t* const_data_ptr() const;
+
+  template 
+  T* mutable_data_ptr() const;
+
+  // Legacy interface during the migration to indicate that a callsite
+  // has not been audited for mutability.
+  //
+  // Do not add new uses of this, use const_data_ptr() if possible,
+  // mutable_data_ptr() otherwise.
+  //
+  // TODO(#97856) Make this return a const pointer. This is currently
+  //              const because of the vast number of clients that
+  //              rely on this.
+  template 
+  T* data_ptr() const;
+
+  // Purposely not defined here to avoid inlining
+  void print() const;
+
+  // Return a `TensorAccessor` for CPU `Tensor`s. You have to specify scalar type and
+  // dimension.
+  template
+  TensorAccessor accessor() const& {
+    static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data_ptr()");
+    TORCH_CHECK(dim() == N, "TensorAccessor expected ", N, " dims but tensor has ", dim());
+    T* ptr = nullptr;
+    if constexpr (std::is_const::value) {
+      ptr = const_data_ptr();
+    } else {
+      ptr = mutable_data_ptr();
+    }
+    return TensorAccessor(ptr,sizes().data(),strides().data());
+  }
+  template
+  TensorAccessor accessor() && = delete;
+
+  // Return a `GenericPackedTensorAccessor` for CUDA `Tensor`s. You have to specify scalar type and
+  // dimension. You can optionally specify RestrictPtrTraits as a template parameter to
+  // cast the data pointer to a __restrict__ pointer.
+  // In order to use this, your CUDA kernel has to take a corresponding GenericPackedTensorAccessor
+  // as an argument.
+  template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+  GenericPackedTensorAccessor generic_packed_accessor() const& {
+    static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data_ptr()");
+    TORCH_CHECK(dim() == N, "TensorAccessor expected ", N, " dims but tensor has ", dim());
+    T* ptr = nullptr;
+    if constexpr (std::is_const::value) {
+      ptr = const_data_ptr();
+    } else {
+      ptr = mutable_data_ptr();
+    }
+    return GenericPackedTensorAccessor(static_cast::PtrType>(ptr),sizes().data(),strides().data());
+  }
+  template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+  GenericPackedTensorAccessor generic_packed_accessor() && = delete;
+
+  template class PtrTraits = DefaultPtrTraits>
+  PackedTensorAccessor32 packed_accessor32() const& {
+    TORCH_CHECK(
+        impl_->numel() <=
+            static_cast(std::numeric_limits::max()),
+        "numel needs to be smaller than int32_t max; otherwise, please use packed_accessor64");
+    return generic_packed_accessor();
+  }
+  template class PtrTraits = DefaultPtrTraits>
+  PackedTensorAccessor32 packed_accessor32() && = delete;
+
+  template class PtrTraits = DefaultPtrTraits>
+  PackedTensorAccessor64 packed_accessor64() const& {
+    return generic_packed_accessor();
+  }
+  template class PtrTraits = DefaultPtrTraits>
+  PackedTensorAccessor64 packed_accessor64() && = delete;
+
+  // ~~~~~ Autograd API ~~~~~
+
+  /// \fn bool is_leaf() const;
+  ///
+  /// All Tensors that have `requires_grad()` which is ``false`` will be leaf Tensors by convention.
+  ///
+  /// For Tensors that have `requires_grad()` which is ``true``, they will be leaf Tensors if they were
+  /// created by the user. This means that they are not the result of an operation and so
+  /// `grad_fn()` is `nullptr`.
+  ///
+  /// Only leaf Tensors will have their `grad()` populated during a call to `backward()`.
+  /// To get `grad()` populated for non-leaf Tensors, you can use `retain_grad()`.
+  ///
+  /// Example:
+  /// @code
+  /// auto a = torch::rand(10, torch::requires_grad());
+  /// std::cout << a.is_leaf() << std::endl; // prints `true`
+  ///
+  /// auto b = torch::rand(10, torch::requires_grad()).to(torch::kCUDA);
+  /// std::cout << b.is_leaf() << std::endl; // prints `false`
+  /// // b was created by the operation that cast a cpu Tensor into a cuda Tensor
+  ///
+  /// auto c = torch::rand(10, torch::requires_grad()) + 2;
+  /// std::cout << c.is_leaf() << std::endl; // prints `false`
+  /// // c was created by the addition operation
+  ///
+  /// auto d = torch::rand(10).cuda();
+  /// std::cout << d.is_leaf() << std::endl; // prints `true`
+  /// // d does not require gradients and so has no operation creating it (that is tracked by the autograd engine)
+  ///
+  /// auto e = torch::rand(10).cuda().requires_grad_();
+  /// std::cout << e.is_leaf() << std::endl; // prints `true`
+  /// // e requires gradients and has no operations creating it
+  ///
+  /// auto f = torch::rand(10, torch::device(torch::kCUDA).requires_grad(true));
+  /// std::cout << f.is_leaf() << std::endl; // prints `true`
+  /// // f requires grad, has no operation creating it
+  /// @endcode
+
+  /// \fn void backward(const Tensor & gradient={}, c10::optional retain_graph=c10::nullopt, bool create_graph=false, c10::optional inputs=c10::nullopt) const;
+  ///
+  /// Computes the gradient of current tensor with respect to graph leaves.
+  ///
+  /// The graph is differentiated using the chain rule. If the tensor is
+  /// non-scalar (i.e. its data has more than one element) and requires
+  /// gradient, the function additionally requires specifying ``gradient``.
+  /// It should be a tensor of matching type and location, that contains
+  /// the gradient of the differentiated function w.r.t. this Tensor.
+  ///
+  /// This function accumulates gradients in the leaves - you might need to
+  /// zero them before calling it.
+  ///
+  /// \param gradient Gradient w.r.t. the
+  ///     tensor. If it is a tensor, it will be automatically converted
+  ///     to a Tensor that does not require grad unless ``create_graph`` is True.
+  ///     None values can be specified for scalar Tensors or ones that
+  ///     don't require grad. If a None value would be acceptable then
+  ///     this argument is optional.
+  /// \param retain_graph If ``false``, the graph used to compute
+  ///     the grads will be freed. Note that in nearly all cases setting
+  ///     this option to True is not needed and often can be worked around
+  ///     in a much more efficient way. Defaults to the value of
+  ///     ``create_graph``.
+  /// \param create_graph If ``true``, the graph of the derivative will
+  ///     be constructed, allowing higher order derivative
+  ///     products to be computed. Defaults to ``false``.
+  /// \param inputs Inputs w.r.t. which the gradient will be accumulated into
+  ///     ``at::Tensor::grad``. All other Tensors will be ignored. If not
+  ///     provided, the gradient is accumulated into all the leaf Tensors
+  ///     that were used to compute the current tensor.
+  ///     When inputs are provided and a given input is not a leaf,
+  ///     the current implementation will call its grad_fn (even though it is not strictly needed to get these gradients).
+  ///     It is an implementation detail on which the user should not rely.
+  ///     See https://github.com/pytorch/pytorch/pull/60521#issuecomment-867061780 for more details.
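+  ///
+  /// A minimal usage sketch (illustrative only, not part of the upstream
+  /// documentation): a non-scalar output needs an explicit ``gradient`` of the
+  /// same shape, while a scalar output does not.
+  /// @code
+  /// auto x = torch::rand({3}, torch::requires_grad());
+  /// auto y = x * 2;                   // non-scalar, so backward() needs a gradient
+  /// y.backward(torch::ones_like(y));  // accumulates into x.grad()
+  ///
+  /// auto loss = (x * x).sum();        // scalar output: no gradient argument needed
+  /// loss.backward();
+  /// @endcode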
+
+  /// \fn Tensor detach() const;
+  ///
+  /// Returns a new Tensor, detached from the current graph.
+  /// The result will never require gradient.
+
+  /// \fn Tensor & detach_() const;
+  ///
+  /// Detaches the Tensor from the graph that created it, making it a leaf.
+  /// Views cannot be detached in-place.
+
+  /// \fn void retain_grad() const;
+  ///
+  /// Enables this Tensor to have its :attr:`grad` populated during
+  /// :func:`backward`. This is a no-op for leaf tensors.
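+  ///
+  /// A minimal sketch (illustrative only): retain_grad() lets a non-leaf Tensor
+  /// keep its gradient after a call to backward().
+  /// @code
+  /// auto x = torch::rand({3}, torch::requires_grad());
+  /// auto y = x * 2;      // non-leaf
+  /// y.retain_grad();
+  /// y.sum().backward();
+  /// std::cout << y.grad() << std::endl; // a tensor of ones instead of an undefined tensor
+  /// @endcode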
+
+  /// \fn bool retains_grad() const;
+  ///
+  /// Is ``true`` if this Tensor is non-leaf and its :attr:`grad` is enabled to be
+  /// populated during :func:`backward`, ``false`` otherwise.
+
+  const TensorBase& set_requires_grad(bool requires_grad) const {
+    impl_->set_requires_grad(requires_grad);
+    return *this;
+  }
+  bool requires_grad() const {
+    return impl_->requires_grad();
+  }
+
+  // The Forward AD API functions below are low level and are not to be used by end
+  // users who should use the API provided in torch/csrc/autograd.h
+
+  /// This function returns the forward gradient for this Tensor at the given level.
+  const Tensor& _fw_grad(uint64_t level) const {
+    return impl_->_fw_grad(level, *this);
+  }
+
+  /// This function can be used to set the value of the forward grad.
+  /// Note that the given new_grad might not be used directly if it has different
+  /// metadata (size/stride/storage offset) compared to this Tensor. In that case,
+  /// new_grad content will be copied into a new Tensor
+  void _set_fw_grad(const TensorBase& new_grad, uint64_t level, bool is_inplace_op) const {
+    impl_->_set_fw_grad(new_grad, *this, level, is_inplace_op);
+  }
+
+  /// NOTE: This is similar to the legacy `.data()` function on `Variable`, and is intended
+  /// to be used from functions that need to access the `Variable`'s equivalent `Tensor`
+  /// (i.e. `Tensor` that shares the same storage and tensor metadata with the `Variable`).
+  ///
+  /// One notable difference with the legacy `.data()` function is that changes to the
+  /// returned `Tensor`'s tensor metadata (e.g. sizes / strides / storage / storage_offset)
+  /// will not update the original `Variable`, due to the fact that this function
+  /// shallow-copies the `Variable`'s underlying TensorImpl.
+  at::TensorBase tensor_data() const;
+
+  /// NOTE: `var.variable_data()` in C++ has the same semantics as `tensor.data`
+  /// in Python, which create a new `Variable` that shares the same storage and
+  /// tensor metadata with the original `Variable`, but with a completely new
+  /// autograd history.
+  ///
+  /// NOTE: If we change the tensor metadata (e.g. sizes / strides /
+  /// storage / storage_offset) of a variable created from `var.variable_data()`, those
+  /// changes will not update the original variable `var`. In `.variable_data()`, we set
+  /// `allow_tensor_metadata_change_` to false to make such changes explicitly illegal,
+  /// in order to prevent users from changing metadata of `var.variable_data()`
+  /// and expecting the original variable `var` to also be updated.
+  at::TensorBase variable_data() const;
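+
+  // A minimal sketch of the semantics described above (illustrative only):
+  //   at::TensorBase v = torch::rand({2, 2}, torch::requires_grad());
+  //   at::TensorBase d = v.variable_data();  // same storage, fresh autograd history
+  //   // d.is_alias_of(v) == true, d.requires_grad() == false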
+
+  // Gradient Node and Edges
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+  /// Gets the gradient function of the `Variable`. If this is a leaf variable,
+  /// the pointer returned will be null.
+  ///
+  /// For View Variables:
+  /// Gets the up-to-date grad_fn. If the shared data or base was modified, we
+  /// re-create the grad_fn to express the up-to-date view relationship between
+  /// this and the base Variable.
+  const std::shared_ptr<torch::autograd::Node>& grad_fn() const;
+
+  // Hooks
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+  template 
+  using hook_return_void_t = std::enable_if_t>::value, unsigned>;
+  template 
+  using hook_return_var_t = std::enable_if_t, TensorBase>::value, unsigned>;
+
+  /// Registers a backward hook.
+  ///
+  /// The hook will be called every time a gradient with respect to the Tensor is computed.
+  /// The hook should have one of the following signatures:
+  /// ```
+  /// hook(TensorBase grad) -> TensorBase
+  /// ```
+  /// ```
+  /// hook(TensorBase grad) -> void
+  /// ```
+  /// The hook should not modify its argument, but it can optionally return a new gradient
+  /// which will be used in place of `grad`.
+  ///
+  /// This function returns the index of the hook in the list, which can be used to remove the hook.
+  ///
+  /// Example:
+  /// @code
+  /// auto v = torch::tensor({0., 0., 0.}, torch::requires_grad());
+  /// auto h = v.register_hook([](torch::Tensor grad){ return grad * 2; }); // double the gradient
+  /// v.backward(torch::tensor({1., 2., 3.}));
+  /// // This prints:
+  /// // ```
+  /// //  2
+  /// //  4
+  /// //  6
+  /// // [ CPUFloatType{3} ]
+  /// // ```
+  /// std::cout << v.grad() << std::endl;
+  /// v.remove_hook(h);  // removes the hook
+  /// @endcode
+  template <typename T>
+  hook_return_void_t<T> register_hook(T&& hook) const;
+  template <typename T>
+  hook_return_var_t<T> register_hook(T&& hook) const;
+
+protected:
+  unsigned _register_hook(std::function<TensorBase(const TensorBase&)> hook) const;
+
+public:
+
+  /// Remove hook at given position
+  void remove_hook(unsigned pos) const;
+
+  // Variable methods
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+  bool is_leaf() const;
+
+  int64_t output_nr() const;
+
+  void set_data(const TensorBase & new_data) const;
+
+  TensorBase data() const;
+
+  int64_t _version() const;
+
+  void retain_grad() const;
+
+  bool retains_grad() const;
+
+  const TensorBase& requires_grad_(bool _requires_grad=true) const;
+
+  // View Variables
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+  /// Returns true if this `Variable` is a view of another `Variable`.
+  bool is_view() const;
+
+  /// Returns the `Variable` that this `Variable` is a view of. If this
+  /// `Variable` is not a view, throw a `std::runtime_error`.
+  const TensorBase& _base() const;
+
+  // Miscellaneous
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+  const std::string& name() const;
+
+protected:
+  void enforce_invariants();
+  c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> impl_;
+
+private:
+  TensorBase __dispatch_contiguous(c10::MemoryFormat) const;
+};
+
+inline DeviceIndex get_device(const TensorBase& self) {
+  return self.get_device();
+}
+
+template <typename T>
+auto TensorBase::register_hook(T&& hook) const -> TensorBase::hook_return_void_t<T> {
+  // Return the grad argument in case of a hook with void return type to have an
+  // std::function with Tensor return type
+  static_assert(std::is_same<decltype(hook(TensorBase())), void>::value,
+                "Expected hook to return void");
+  return _register_hook([fn=std::forward<T>(hook)](const TensorBase& grad) {
+    fn(grad);
+    return TensorBase();
+  });
+}
+
+template <typename T>
+auto TensorBase::register_hook(T&& hook) const -> TensorBase::hook_return_var_t<T> {
+  return _register_hook(std::forward<T>(hook));
+}
+
+namespace detail {
+// Helper creator for the Tensor class which doesn't require the user to pass
+// in an intrusive_ptr; instead it converts the arguments passed in to the
+// requested intrusive_ptr type.
+template <typename T, typename... Args>
+TensorBase make_tensor_base(Args&&... args) {
+  return TensorBase(c10::make_intrusive<T>(std::forward<Args>(args)...));
+}
+
+} // namespace detail
+
+static inline DispatchKey legacyExtractDispatchKey(const TensorBase& t) {
+  return legacyExtractDispatchKey(t.key_set());
+}
+
+} // namespace at
+
+namespace c10 {
+template <>
+struct MaybeOwnedTraits<at::TensorBase> {
+  using owned_type = at::TensorBase;
+  using borrow_type = at::TensorBase;
+
+  static borrow_type createBorrow(const owned_type& from) {
+    // NOTE: this can be implemented without the special
+    // unsafe_borrow_t Tensor constructor as
+    //
+    // return borrow_type(c10::intrusive_ptr::reclaim(from.unsafeGetTensorImpl()));
+    //
+    // but that hurts inlining due to the nullptr check in the
+    // Tensor(c10::intrusive_ptr<...>) constructor. We already know
+    // that from.impl_ isn't null because from is a valid Tensor, so
+    // we needn't do the check again. (using __builtin_assume can
+    // avoid this, but wouldn't be portable to MSVC.)
+    return borrow_type(borrow_type::unsafe_borrow_t{}, from);
+  }
+
+  static void assignBorrow(borrow_type& lhs, const borrow_type& rhs) {
+    lhs.unsafeReleaseTensorImpl();
+    // See above note: this can be implemented with public API
+    // similarly to createBorrow(), but that would hurt inlining.
+    lhs = borrow_type(borrow_type::unsafe_borrow_t{}, rhs);
+  }
+
+  static void destroyBorrow(borrow_type& toDestroy) {
+    toDestroy.unsafeReleaseTensorImpl(); // "leak" it, but it was already +0.
+  }
+
+  static const owned_type& referenceFromBorrow(const borrow_type& borrow) {
+    return borrow;
+  }
+
+  static const owned_type* pointerFromBorrow(const borrow_type& borrow) {
+    return &borrow;
+  }
+
+  static bool debugBorrowIsValid(const borrow_type& /*borrow*/) {
+    return true;
+  }
+};
+
+template <>
+struct ExclusivelyOwnedTraits<at::TensorBase> : public c10::ExclusivelyOwnedTensorTraits<at::TensorBase> {};
+} // namespace c10
+
+namespace at {
+
+inline c10::MaybeOwned<TensorBase> borrow_from_optional_tensor(
+    const c10::optional<TensorBase>& opt) {
+  return opt.has_value()
+    ? c10::MaybeOwned<TensorBase>::borrowed(*opt)
+    : c10::MaybeOwned<TensorBase>::owned(std::in_place);
+}
+
+inline c10::MaybeOwned<TensorBase> TensorBase::expect_contiguous(MemoryFormat memory_format) const & {
+  if (is_contiguous(memory_format)) {
+    return c10::MaybeOwned<TensorBase>::borrowed(*this);
+  } else {
+    return c10::MaybeOwned<TensorBase>::owned(__dispatch_contiguous(memory_format));
+  }
+}
+
+namespace symint {
+
+template <typename T>
+using enable_if_symint = std::enable_if_t<std::is_same<T, c10::SymInt>::value>;
+template <typename T>
+using enable_if_int = std::enable_if_t<std::is_same<T, int64_t>::value>;
+
+template <typename T, typename = enable_if_symint<T>>
+c10::SymIntArrayRef sizes(const TensorBase& t) { return t.sym_sizes(); }
+template <typename T, typename = enable_if_int<T>>
+IntArrayRef sizes(const TensorBase& t) { return t.sizes(); }
+
+template <typename T, typename = enable_if_symint<T>>
+c10::SymInt size(const TensorBase& t, int64_t dim) { return t.sym_size(dim); }
+template <typename T, typename = enable_if_int<T>>
+int64_t size(const TensorBase& t, int64_t dim) { return t.size(dim); }
+
+template <typename T, typename = enable_if_symint<T>>
+c10::SymIntArrayRef strides(const TensorBase& t) { return t.sym_strides(); }
+template <typename T, typename = enable_if_int<T>>
+IntArrayRef strides(const TensorBase& t) { return t.strides(); }
+
+template <typename T, typename = enable_if_symint<T>>
+c10::SymInt numel(const TensorBase& t) { return t.sym_numel(); }
+template <typename T, typename = enable_if_int<T>>
+int64_t numel(const TensorBase& t) { return t.numel(); }
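+
+// A minimal usage sketch (illustrative only, for some TensorBase `t`): the
+// overload is selected purely by the template argument, so callers (e.g.
+// generated code) can be templated over the index type without branching.
+//   at::symint::sizes<int64_t>(t);      // IntArrayRef via t.sizes()
+//   at::symint::sizes<c10::SymInt>(t);  // c10::SymIntArrayRef via t.sym_sizes()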
+
+} // namespace symint
+
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/TensorBody.h b/MLPY/Lib/site-packages/torch/include/ATen/core/TensorBody.h
new file mode 100644
index 0000000000000000000000000000000000000000..41e3dc0fc3e04bd9db687af8d199022bbb8b7160
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/TensorBody.h
@@ -0,0 +1,5731 @@
+#pragma once
+
+#ifdef TORCH_ASSERT_NO_OPERATORS
+#error This change adds a dependency on native_functions.yaml,            \
+  meaning the file will need to be re-compiled every time an operator     \
+  is changed or added. Consider if your change would be better placed in  \
+  another file, or if a more specific header might achieve the same goal. \
+  See NOTE: [Tensor vs. TensorBase]
+#endif
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+
+#include 
+
+namespace c10{
+template<class T> class List;
+template<class T> class IListRef;
+}
+namespace at {
+struct Generator;
+struct Type;
+class DeprecatedTypeProperties;
+class Tensor;
+} // namespace at
+namespace at {
+namespace indexing {
+struct TensorIndex;
+} // namespace indexing
+} // namespace at
+
+namespace torch { namespace autograd {
+
+struct Node;
+
+}} // namespace torch::autograd
+
+namespace at {
+
+class OptionalTensorRef;
+class TensorRef;
+class Tensor;
+using TensorList = ArrayRef<Tensor>;
+using ITensorList = c10::IListRef<Tensor>;
+
+using Stream = c10::Stream;
+
+// Tensor is a "generic" object holding a pointer to the underlying TensorImpl object, which
+// has an embedded reference count. In this way, Tensor is similar to boost::intrusive_ptr.
+//
+// For example:
+//
+// void func(Tensor a) {
+//   Tensor b = a;
+//   ...
+// }
+//
+// In this example, when we say Tensor b = a, we are creating a new object that points to the
+// same underlying TensorImpl, and bumps its reference count. When b goes out of scope, the
+// destructor decrements the reference count by calling release() on the TensorImpl it points to.
+// The existing constructors, operator overloads, etc. take care to implement the correct semantics.
+//
+// Note that Tensor can also be NULL, i.e. it is not associated with any underlying TensorImpl, and
+// special care must be taken to handle this.
+class TORCH_API Tensor: public TensorBase {
+ protected:
+  // Create a Tensor with a +0 reference count. Special care must be
+  // taken to avoid decrementing this reference count at destruction
+  // time. Intended to support MaybeOwnedTraits.
+  explicit Tensor(unsafe_borrow_t, const TensorBase& rhs): TensorBase(unsafe_borrow_t{}, rhs) {}
+  friend MaybeOwnedTraits<Tensor>;
+  friend OptionalTensorRef;
+  friend TensorRef;
+
+ public:
+  Tensor() = default;
+  // This constructor should not be used by end users and is an implementation
+  // detail invoked by autogenerated code.
+  explicit Tensor(
+      c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> tensor_impl)
+      : TensorBase(std::move(tensor_impl)) {}
+  Tensor(const Tensor &tensor) = default;
+  Tensor(Tensor &&tensor) = default;
+
+  // Implicitly move-constructible from TensorBase, but must be explicit to increase refcount
+  explicit Tensor(const TensorBase &base): TensorBase(base) {}
+  /*implicit*/ Tensor(TensorBase &&base): TensorBase(std::move(base)) {}
+
+  // Creates a new wrapper from TensorImpl. Intentionally a free method because
+  // it should be used with care. Checks necessary invariants
+  static Tensor wrap_tensor_impl(
+      c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> tensor_impl) {
+    return TensorBase::wrap_tensor_impl(std::move(tensor_impl));
+  }
+
+  Tensor contiguous(MemoryFormat memory_format=MemoryFormat::Contiguous) const {
+    return TensorBase::contiguous(memory_format);
+  }
+
+  Tensor conj() const {
+    if (!this->is_complex()) {
+      return *this;
+    }
+
+    switch (this->layout()) {
+      case at::kSparse:
+      case at::kSparseCsr:
+      case at::kSparseCsc:
+      case at::kSparseBsr:
+      case at::kSparseBsc:
+        return this->conj_physical();
+      default:
+        return this->_conj();
+    }
+  }
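+
+  // A minimal usage sketch (illustrative only): for dense complex tensors,
+  // conj() is a lazy view that can be materialized on demand.
+  //   auto z = torch::randn({2}, torch::kComplexFloat);
+  //   auto zc = z.conj();            // zc.is_conj() == true, no data copied
+  //   auto zm = zc.resolve_conj();   // materializes the conjugated values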
+
+  // Aliased by Dimname overloads, so need explicit using
+  using TensorBase::size;
+  using TensorBase::sym_size;
+  using TensorBase::stride;
+
+  /// Should be used if *this can reasonably be expected to be contiguous and
+  /// performance is important.
+  /// Compared to contiguous, it saves a reference count
+  /// increment/decrement if *this is already contiguous, at the cost
+  /// in all cases of an extra pointer of stack usage, an extra branch
+  /// to access, and an extra branch at destruction time.
+  c10::MaybeOwned<Tensor> expect_contiguous(MemoryFormat memory_format=MemoryFormat::Contiguous) const &;
+
+  // Use .contiguous() instead. Trying to borrow from a prvalue Tensor
+  // will only lead to trouble and dangling references.
+  c10::MaybeOwned<Tensor> expect_contiguous(MemoryFormat memory_format=MemoryFormat::Contiguous) && = delete;
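+
+  // A minimal usage sketch (illustrative only):
+  //   void process(const at::Tensor& t) {
+  //     c10::MaybeOwned<at::Tensor> c = t.expect_contiguous();
+  //     // *c borrows t when it is already contiguous, otherwise owns a
+  //     // contiguous copy; either way c->data_ptr() is safe to read here.
+  //   }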
+
+  // The following overloads are very intriguing.  Consider the following
+  // program:
+  //
+  //    x[1] = 3;
+  //
+  // We would expect that the first entry of x is written to 3.  But how can we
+  // actually achieve this?  x[1] evaluates to a tensor...
+  //
+  // The answer is, using a ref-qualifier.  x[1] is an rvalue, which cannot be
+  // (profitably) assigned to in the traditional sense, so we overload
+  // assignment to mean, "Actually, copy 3 into the tensor data."  This is done
+  // with an rvalue-reference ref-qualified overload (the methods with && at the
+  // end of their type.)
+  //
+  // There's one more fly in the ointment: We also want
+  //
+  //    Tensor x = y;
+  //
+  // to work, and we want it NOT to copy.  So we need a traditional operator=
+  // overload.  But we MUST specify a mutable lvalue ref-qualifier, to
+  // disambiguate the traditional overload from the rvalue-reference
+  // ref-qualified overload.  Otherwise, it will be ambiguous, because
+  // a non ref-qualified method is eligible for all situations.
+
+  // Unfortunately, we have to write these constructors out manually
+  // to work around an MSVC bug:
+  //    error C2580: 'at::Tensor &at::Tensor::operator =(const at::Tensor &) &':
+  //    multiple versions of a defaulted special member functions are not allowed
+  // Tensor& operator=(const Tensor&) & = default;
+  // Tensor& operator=(Tensor&&) & = default;
+
+  // Also MSVC will wrongly issue the following warning with the aforementioned fix
+  //    warning C4522: 'at::Tensor': multiple assignment operators specified
+  // Let's just skip the warning.
+  //
+  // TODO: temporarily disabled
+
+  Tensor& operator=(const TensorBase& x) & {
+    impl_ = x.getIntrusivePtr();
+    return *this;
+  }
+  Tensor& operator=(TensorBase&& x) & noexcept {
+    impl_ = x.unsafeReleaseIntrusivePtr();
+    return *this;
+  }
+
+  Tensor& operator=(const Tensor &x) & {
+    return operator=(static_cast<const TensorBase&>(x));
+  }
+  Tensor& operator=(Tensor &&x) & noexcept {
+    return operator=(static_cast<TensorBase&&>(x));
+  }
+
+  Tensor& operator=(const Scalar &v) && {
+    return fill_(v);
+  }
+  Tensor& operator=(const Tensor &rhs) && {
+    return copy_(rhs);
+  }
+  Tensor& operator=(Tensor&& rhs) && {
+    return copy_(rhs);
+  }
+
+  C10_DEPRECATED_MESSAGE("Tensor.type() is deprecated. Instead use Tensor.options(), which in many cases (e.g. in a constructor) is a drop-in replacement. If you were using data from type(), that is now available from Tensor itself, so instead of tensor.type().scalar_type(), use tensor.scalar_type() instead and instead of tensor.type().backend() use tensor.device().")
+  DeprecatedTypeProperties & type() const {
+    return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
+        dispatchKeyToBackend(legacyExtractDispatchKey(key_set())),
+        scalar_type());
+  }
+
+  Tensor toType(ScalarType t) const {
+    return to(options().dtype(t), /*non_blocking*/ false, /*copy*/ false);
+  }
+
+  // TODO: Deprecate me
+  Tensor toBackend(Backend b) const {
+    return to(options().device(backendToDeviceType(b)).layout(layout_from_backend(b)), /*non_blocking*/ false, /*copy*/ false);
+  }
+
+  C10_DEPRECATED_MESSAGE("Tensor.is_variable() is deprecated; everything is a variable now. (If you want to assert that variable has been appropriately handled already, use at::impl::variable_excluded_from_dispatch())")
+  bool is_variable() const noexcept {
+    return !at::impl::variable_excluded_from_dispatch();
+  }
+
+  template <typename T>
+  C10_DEPRECATED_MESSAGE("Tensor.data<T>() is deprecated. Please use Tensor.data_ptr<T>() instead.")
+  T * data() const {
+    return data_ptr<T>();
+  }
+
+  template <typename T>
+  T item() const;
+
+  template <typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+  C10_DEPRECATED_MESSAGE("packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead")
+  GenericPackedTensorAccessor<T,N,PtrTraits,index_t> packed_accessor() const & {
+    return generic_packed_accessor<T,N,PtrTraits,index_t>();
+  }
+  template <typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+  C10_DEPRECATED_MESSAGE("packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead")
+  GenericPackedTensorAccessor<T,N,PtrTraits,index_t> packed_accessor() && = delete;
+
+  Tensor operator~() const {
+    return bitwise_not();
+  }
+  Tensor operator-() const {
+    return neg();
+  }
+  Tensor& operator+=(const Tensor & other) {
+    return add_(other);
+  }
+  Tensor& operator+=(const Scalar & other) {
+    return add_(other);
+  }
+  Tensor& operator-=(const Tensor & other) {
+    return sub_(other);
+  }
+  Tensor& operator-=(const Scalar & other) {
+    return sub_(other);
+  }
+  Tensor& operator*=(const Tensor & other) {
+    return mul_(other);
+  }
+  Tensor& operator*=(const Scalar & other) {
+    return mul_(other);
+  }
+  Tensor& operator/=(const Tensor & other) {
+    return div_(other);
+  }
+  Tensor& operator/=(const Scalar & other) {
+    return div_(other);
+  }
+  Tensor& operator&=(const Tensor & other) {
+    return bitwise_and_(other);
+  }
+  Tensor& operator|=(const Tensor & other) {
+    return bitwise_or_(other);
+  }
+  Tensor& operator^=(const Tensor & other) {
+    return bitwise_xor_(other);
+  }
+  Tensor operator[](const Scalar & index) const {
+    if (!index.isIntegral(false)) {
+      TORCH_CHECK_INDEX(false, "Can only index tensors with integral scalars");
+    }
+    return this->operator[](index.toLong());
+  }
+  Tensor operator[](const Tensor & index) const {
+    // These properties are checked in the Scalar constructor, but we already
+    // check them here to provide more useful diagnostics for the user.
+    if (!index.defined()) {
+      TORCH_CHECK_INDEX(false, "Can only index with tensors that are defined");
+    }
+    if (index.dim() != 0) {
+      TORCH_CHECK_INDEX(false,
+                        "Can only index with tensors that are scalars (zero-dim)");
+    }
+    // The Scalar(Tensor) constructor is explicit, so we need to call it.
+    return this->operator[](index.item<int64_t>());
+  }
+  Tensor operator[](int64_t index) const {
+    return select(0, index);
+  }
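+
+  // A minimal usage sketch (illustrative only, for some Tensor `t`): all three
+  // overloads end up in select(0, index), so these are equivalent ways to take
+  // the first row.
+  //   auto row0 = t[0];
+  //   auto row0_again = t.select(0, 0);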
+
+  Tensor index(ArrayRef<at::indexing::TensorIndex> indices) const;
+  Tensor index(std::initializer_list<at::indexing::TensorIndex> indices) const;
+
+  Tensor & index_put_(ArrayRef<at::indexing::TensorIndex> indices, Tensor const & rhs);
+  Tensor & index_put_(ArrayRef<at::indexing::TensorIndex> indices, const Scalar& v);
+  Tensor & index_put_(std::initializer_list<at::indexing::TensorIndex> indices, Tensor const & rhs);
+  Tensor & index_put_(std::initializer_list<at::indexing::TensorIndex> indices, const Scalar& v);
+
+  Tensor cpu() const {
+    return to(options().device(c10::DeviceType::CPU), /*non_blocking*/ false, /*copy*/ false);
+  }
+
+  // TODO: The Python version also accepts arguments
+  Tensor cuda() const {
+    return to(options().device(c10::DeviceType::CUDA), /*non_blocking*/ false, /*copy*/ false);
+  }
+
+  Tensor hip() const {
+    return to(options().device(c10::DeviceType::HIP), /*non_blocking*/ false, /*copy*/ false);
+  }
+
+  Tensor ve() const {
+    return to(options().device(c10::DeviceType::VE), /*non_blocking*/ false, /*copy*/ false);
+  }
+
+  Tensor vulkan() const {
+    return to(options().device(c10::DeviceType::Vulkan), /*non_blocking*/ false, /*copy*/ false);
+  }
+
+  Tensor metal() const {
+    return to(options().device(c10::DeviceType::Metal), /*non_blocking*/ false, /*copy*/ false);
+  }
+
+  Tensor meta() const {
+    return to(options().device(c10::DeviceType::Meta), /*non_blocking*/ false, /*copy*/ false);
+  }
+
+  // ~~~~~ Autograd API ~~~~~
+
+  /// \fn bool is_leaf() const;
+  ///
+  /// All Tensors that have `requires_grad()` which is ``false`` will be leaf Tensors by convention.
+  ///
+  /// For Tensors that have `requires_grad()` which is ``true``, they will be leaf Tensors if they were
+  /// created by the user. This means that they are not the result of an operation and so
+  /// `grad_fn()` is `nullptr`.
+  ///
+  /// Only leaf Tensors will have their `grad()` populated during a call to `backward()`.
+  /// To get `grad()` populated for non-leaf Tensors, you can use `retain_grad()`.
+  ///
+  /// Example:
+  /// @code
+  /// auto a = torch::rand(10, torch::requires_grad());
+  /// std::cout << a.is_leaf() << std::endl; // prints `true`
+  ///
+  /// auto b = torch::rand(10, torch::requires_grad()).to(torch::kCUDA);
+  /// std::cout << b.is_leaf() << std::endl; // prints `false`
+  /// // b was created by the operation that cast a cpu Tensor into a cuda Tensor
+  ///
+  /// auto c = torch::rand(10, torch::requires_grad()) + 2;
+  /// std::cout << c.is_leaf() << std::endl; // prints `false`
+  /// // c was created by the addition operation
+  ///
+  /// auto d = torch::rand(10).cuda();
+  /// std::cout << d.is_leaf() << std::endl; // prints `true`
+  /// // d does not require gradients and so has no operation creating it (that is tracked by the autograd engine)
+  ///
+  /// auto e = torch::rand(10).cuda().requires_grad_();
+  /// std::cout << e.is_leaf() << std::endl; // prints `true`
+  /// // e requires gradients and has no operations creating it
+  ///
+  /// auto f = torch::rand(10, torch::device(torch::kCUDA).requires_grad(true));
+  /// std::cout << f.is_leaf() << std::endl; // prints `true`
+  /// // f requires grad, has no operation creating it
+  /// @endcode
+
+  /// \fn void backward(const Tensor & gradient={}, c10::optional<bool> retain_graph=c10::nullopt, bool create_graph=false, c10::optional<TensorList> inputs=c10::nullopt) const;
+  ///
+  /// Computes the gradient of current tensor with respect to graph leaves.
+  ///
+  /// The graph is differentiated using the chain rule. If the tensor is
+  /// non-scalar (i.e. its data has more than one element) and requires
+  /// gradient, the function additionally requires specifying ``gradient``.
+  /// It should be a tensor of matching type and location, that contains
+  /// the gradient of the differentiated function w.r.t. this Tensor.
+  ///
+  /// This function accumulates gradients in the leaves - you might need to
+  /// zero them before calling it.
+  ///
+  /// \param gradient Gradient w.r.t. the
+  ///     tensor. If it is a tensor, it will be automatically converted
+  ///     to a Tensor that does not require grad unless ``create_graph`` is True.
+  ///     None values can be specified for scalar Tensors or ones that
+  ///     don't require grad. If a None value would be acceptable then
+  ///     this argument is optional.
+  /// \param retain_graph If ``false``, the graph used to compute
+  ///     the grads will be freed. Note that in nearly all cases setting
+  ///     this option to True is not needed and often can be worked around
+  ///     in a much more efficient way. Defaults to the value of
+  ///     ``create_graph``.
+  /// \param create_graph If ``true``, the graph of the derivative will
+  ///     be constructed, allowing higher order derivative
+  ///     products to be computed. Defaults to ``false``.
+  /// \param inputs Inputs w.r.t. which the gradient will be accumulated into
+  ///     ``at::Tensor::grad``. All other Tensors will be ignored. If not
+  ///     provided, the gradient is accumulated into all the leaf Tensors
+  ///     that were used to compute the current tensor.
+  ///     When inputs are provided and a given input is not a leaf,
+  ///     the current implementation will call its grad_fn (even though it is not strictly needed to get these gradients).
+  ///     It is an implementation detail on which the user should not rely.
+  ///     See https://github.com/pytorch/pytorch/pull/60521#issuecomment-867061780 for more details.
+  void backward(const Tensor & gradient={}, c10::optional<bool> retain_graph=c10::nullopt, bool create_graph=false, c10::optional<TensorList> inputs=c10::nullopt) const {
+    // NB: Adding this wrapper to _backward here because we'd like our
+    // 'backwards' api to accept the 'inputs' argument optionally. Since code gen
+    // currently does not support optional of TensorList our approach is to replace
+    // backward in native_functions.yaml with _backward and call it here instead.
+    if (inputs.has_value()) {
+      TORCH_CHECK(inputs.value().size() > 0, "'inputs' argument to backward cannot be empty")
+      this->_backward(inputs.value(), gradient, retain_graph, create_graph);
+    } else {
+      this->_backward({}, gradient, retain_graph, create_graph);
+    }
+  }
+
+  /// \fn Tensor detach() const;
+  ///
+  /// Returns a new Tensor, detached from the current graph.
+  /// The result will never require gradient.
+
+  /// \fn Tensor & detach_() const;
+  ///
+  /// Detaches the Tensor from the graph that created it, making it a leaf.
+  /// Views cannot be detached in-place.
+
+  /// \fn void retain_grad() const;
+  ///
+  /// Enables this Tensor to have its :attr:`grad` populated during
+  /// :func:`backward`. This is a no-op for leaf tensors.
+
+  /// \fn bool retains_grad() const;
+  ///
+  /// Is ``true`` if this Tensor is non-leaf and its :attr:`grad` is enabled to be
+  /// populated during :func:`backward`, ``false`` otherwise.
+
+  const Tensor& set_requires_grad(bool requires_grad) const {
+    TensorBase::set_requires_grad(requires_grad);
+    return *this;
+  }
+
+  /// Return a mutable reference to the gradient. This is conventionally
+  /// used as `t.grad() = x` to set a gradient to a completely new tensor.
+  /// Note that this function works with a non-const Tensor and is not
+  /// thread safe.
+  Tensor& mutable_grad() const {
+    return impl_->mutable_grad();
+  }
+
+  /// This function returns an undefined tensor by default and returns a defined tensor
+  /// the first time a call to `backward()` computes gradients for this Tensor.
+  /// The attribute will then contain the gradients computed and future calls
+  /// to `backward()` will accumulate (add) gradients into it.
+  const Tensor& grad() const {
+    const Tensor& maybe_grad = impl_->grad();
+    if (!is_leaf() && !retains_grad() && !maybe_grad.defined()) {
+      TORCH_WARN(
+        "The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad "
+        "attribute won't be populated during autograd.backward(). If you indeed want the .grad "
+        "field to be populated for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. "
+        "If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor "
+        "instead. See github.com/pytorch/pytorch/pull/30531 for more informations.");
+    }
+    return maybe_grad;
+  }
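+
+  // A minimal usage sketch (illustrative only): grad() stays undefined until
+  // backward() has populated it for a leaf Tensor.
+  //   auto w = torch::ones({3}, torch::requires_grad());
+  //   (w * 2).sum().backward();
+  //   std::cout << w.grad() << std::endl;  // a tensor of 2s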
+
+  // The Forward AD API functions below are low level and are not to be used by end
+  // users who should use the API provided in torch/csrc/autograd.h
+
+  /// This function returns the forward gradient for this Tensor at the given level.
+  const Tensor& _fw_grad(uint64_t level) const {
+    return impl_->_fw_grad(level, *this);
+  }
+
+  /// This function can be used to set the value of the forward grad.
+  /// Note that the given new_grad might not be used directly if it has different
+  /// metadata (size/stride/storage offset) compared to this Tensor. In that case,
+  /// new_grad content will be copied into a new Tensor
+  void _set_fw_grad(const TensorBase& new_grad, uint64_t level, bool is_inplace_op) const {
+    impl_->_set_fw_grad(new_grad, *this, level, is_inplace_op);
+  }
+
+
+  // STOP.  Thinking of adding a method here, which only makes use
+  // of other ATen methods?  Define it in native_functions.yaml.
+
+  //example
+  //Tensor * add(Tensor & b);
+  void __dispatch__backward(at::TensorList inputs, const c10::optional & gradient={}, c10::optional retain_graph=c10::nullopt, bool create_graph=false) const;
+  void __dispatch_set_data(const at::Tensor & new_data) const;
+  at::Tensor __dispatch_data() const;
+  bool __dispatch_is_leaf() const;
+  int64_t __dispatch_output_nr() const;
+  int64_t __dispatch__version() const;
+  at::Tensor & __dispatch_requires_grad_(bool requires_grad=true) const;
+  void __dispatch_retain_grad() const;
+  bool __dispatch_retains_grad() const;
+  at::Tensor _fw_primal(int64_t level) const;
+  at::Tensor & rename_(c10::optional names) const;
+  at::Tensor rename(c10::optional names) const;
+  at::Tensor align_to(at::DimnameList names) const;
+  at::Tensor align_to(at::DimnameList order, int64_t ellipsis_idx) const;
+  at::Tensor align_as(const at::Tensor & other) const;
+  at::Tensor refine_names(at::DimnameList names) const;
+  at::Tensor abs() const;
+  at::Tensor & abs_() const;
+  at::Tensor absolute() const;
+  at::Tensor & absolute_() const;
+  at::Tensor angle() const;
+  at::Tensor sgn() const;
+  at::Tensor & sgn_() const;
+  at::Tensor chalf(c10::optional memory_format=c10::nullopt) const;
+  at::Tensor _conj() const;
+  at::Tensor __dispatch_conj() const;
+  at::Tensor _conj_physical() const;
+  at::Tensor conj_physical() const;
+  at::Tensor & conj_physical_() const;
+  at::Tensor resolve_conj() const;
+  at::Tensor resolve_neg() const;
+  at::Tensor _neg_view() const;
+  at::Tensor acos() const;
+  at::Tensor & acos_() const;
+  at::Tensor arccos() const;
+  at::Tensor & arccos_() const;
+  at::Tensor add(const at::Tensor & other, const at::Scalar & alpha=1) const;
+  at::Tensor & add_(const at::Tensor & other, const at::Scalar & alpha=1) const;
+  at::Tensor add(const at::Scalar & other, const at::Scalar & alpha=1) const;
+  at::Tensor & add_(const at::Scalar & other, const at::Scalar & alpha=1) const;
+  at::Tensor addmv(const at::Tensor & mat, const at::Tensor & vec, const at::Scalar & beta=1, const at::Scalar & alpha=1) const;
+  at::Tensor & addmv_(const at::Tensor & mat, const at::Tensor & vec, const at::Scalar & beta=1, const at::Scalar & alpha=1) const;
+  at::Tensor addr(const at::Tensor & vec1, const at::Tensor & vec2, const at::Scalar & beta=1, const at::Scalar & alpha=1) const;
+  at::Tensor & addr_(const at::Tensor & vec1, const at::Tensor & vec2, const at::Scalar & beta=1, const at::Scalar & alpha=1) const;
+  at::Tensor _is_all_true() const;
+  at::Tensor _is_any_true() const;
+  at::Tensor all(int64_t dim, bool keepdim=false) const;
+  at::Tensor all(at::OptionalIntArrayRef dim, bool keepdim=false) const;
+  at::Tensor all(at::Dimname dim, bool keepdim=false) const;
+  bool allclose(const at::Tensor & other, double rtol=1e-05, double atol=1e-08, bool equal_nan=false) const;
+  at::Tensor any(int64_t dim, bool keepdim=false) const;
+  at::Tensor any(at::OptionalIntArrayRef dim, bool keepdim=false) const;
+  at::Tensor any(at::Dimname dim, bool keepdim=false) const;
+  at::Tensor argmax(c10::optional dim=c10::nullopt, bool keepdim=false) const;
+  at::Tensor argmin(c10::optional dim=c10::nullopt, bool keepdim=false) const;
+  at::Tensor acosh() const;
+  at::Tensor & acosh_() const;
+  at::Tensor arccosh() const;
+  at::Tensor & arccosh_() const;
+  at::Tensor asinh() const;
+  at::Tensor & asinh_() const;
+  at::Tensor arcsinh() const;
+  at::Tensor & arcsinh_() const;
+  at::Tensor atanh() const;
+  at::Tensor & atanh_() const;
+  at::Tensor arctanh() const;
+  at::Tensor & arctanh_() const;
+  at::Tensor as_strided(at::IntArrayRef size, at::IntArrayRef stride, c10::optional storage_offset=c10::nullopt) const;
+  at::Tensor as_strided_symint(c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional storage_offset=c10::nullopt) const;
+  const at::Tensor & as_strided_(at::IntArrayRef size, at::IntArrayRef stride, c10::optional storage_offset=c10::nullopt) const;
+  const at::Tensor & as_strided__symint(c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional storage_offset=c10::nullopt) const;
+  at::Tensor asin() const;
+  at::Tensor & asin_() const;
+  at::Tensor arcsin() const;
+  at::Tensor & arcsin_() const;
+  at::Tensor atan() const;
+  at::Tensor & atan_() const;
+  at::Tensor arctan() const;
+  at::Tensor & arctan_() const;
+  at::Tensor baddbmm(const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta=1, const at::Scalar & alpha=1) const;
+  at::Tensor & baddbmm_(const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta=1, const at::Scalar & alpha=1) const;
+  at::Tensor bernoulli(c10::optional generator=c10::nullopt) const;
+  at::Tensor & bernoulli_(const at::Tensor & p, c10::optional generator=c10::nullopt) const;
+  at::Tensor & bernoulli_(double p=0.5, c10::optional generator=c10::nullopt) const;
+  at::Tensor bernoulli(double p, c10::optional generator=c10::nullopt) const;
+  at::Tensor bincount(const c10::optional & weights={}, int64_t minlength=0) const;
+  at::Tensor bitwise_not() const;
+  at::Tensor & bitwise_not_() const;
+  at::Tensor copysign(const at::Tensor & other) const;
+  at::Tensor & copysign_(const at::Tensor & other) const;
+  at::Tensor copysign(const at::Scalar & other) const;
+  at::Tensor & copysign_(const at::Scalar & other) const;
+  at::Tensor _lazy_clone() const;
+  at::Tensor logical_not() const;
+  at::Tensor & logical_not_() const;
+  at::Tensor logical_xor(const at::Tensor & other) const;
+  at::Tensor & logical_xor_(const at::Tensor & other) const;
+  at::Tensor logical_and(const at::Tensor & other) const;
+  at::Tensor & logical_and_(const at::Tensor & other) const;
+  at::Tensor logical_or(const at::Tensor & other) const;
+  at::Tensor & logical_or_(const at::Tensor & other) const;
+  at::Tensor bmm(const at::Tensor & mat2) const;
+  at::Tensor broadcast_to(at::IntArrayRef size) const;
+  at::Tensor broadcast_to_symint(c10::SymIntArrayRef size) const;
+  at::Tensor ceil() const;
+  at::Tensor & ceil_() const;
+  ::std::vector unsafe_chunk(int64_t chunks, int64_t dim=0) const;
+  ::std::vector chunk(int64_t chunks, int64_t dim=0) const;
+  ::std::vector tensor_split(int64_t sections, int64_t dim=0) const;
+  ::std::vector tensor_split_symint(c10::SymInt sections, int64_t dim=0) const;
+  ::std::vector tensor_split(at::IntArrayRef indices, int64_t dim=0) const;
+  ::std::vector tensor_split_symint(c10::SymIntArrayRef indices, int64_t dim=0) const;
+  ::std::vector tensor_split(const at::Tensor & tensor_indices_or_sections, int64_t dim=0) const;
+  at::Tensor clamp(const c10::optional & min, const c10::optional & max=c10::nullopt) const;
+  at::Tensor clamp(const c10::optional & min={}, const c10::optional & max={}) const;
+  at::Tensor & clamp_(const c10::optional & min, const c10::optional & max=c10::nullopt) const;
+  at::Tensor & clamp_(const c10::optional & min={}, const c10::optional & max={}) const;
+  at::Tensor clamp_max(const at::Scalar & max) const;
+  at::Tensor clamp_max(const at::Tensor & max) const;
+  at::Tensor & clamp_max_(const at::Scalar & max) const;
+  at::Tensor & clamp_max_(const at::Tensor & max) const;
+  at::Tensor clamp_min(const at::Scalar & min) const;
+  at::Tensor clamp_min(const at::Tensor & min) const;
+  at::Tensor & clamp_min_(const at::Scalar & min) const;
+  at::Tensor & clamp_min_(const at::Tensor & min) const;
+  at::Tensor clip(const c10::optional & min, const c10::optional & max=c10::nullopt) const;
+  at::Tensor clip(const c10::optional & min={}, const c10::optional & max={}) const;
+  at::Tensor & clip_(const c10::optional & min, const c10::optional & max=c10::nullopt) const;
+  at::Tensor & clip_(const c10::optional & min={}, const c10::optional & max={}) const;
+  at::Tensor __dispatch_contiguous(at::MemoryFormat memory_format=MemoryFormat::Contiguous) const;
+  at::Tensor & copy_(const at::Tensor & src, bool non_blocking=false) const;
+  at::Tensor cos() const;
+  at::Tensor & cos_() const;
+  at::Tensor cosh() const;
+  at::Tensor & cosh_() const;
+  at::Tensor count_nonzero(at::IntArrayRef dim) const;
+  at::Tensor count_nonzero(c10::optional dim=c10::nullopt) const;
+  at::Tensor cov(int64_t correction=1, const c10::optional & fweights={}, const c10::optional & aweights={}) const;
+  at::Tensor corrcoef() const;
+  ::std::tuple cummax(int64_t dim) const;
+  ::std::tuple cummax(at::Dimname dim) const;
+  ::std::tuple cummin(int64_t dim) const;
+  ::std::tuple cummin(at::Dimname dim) const;
+  at::Tensor cumprod(int64_t dim, c10::optional dtype=c10::nullopt) const;
+  at::Tensor & cumprod_(int64_t dim, c10::optional dtype=c10::nullopt) const;
+  at::Tensor cumprod(at::Dimname dim, c10::optional dtype=c10::nullopt) const;
+  at::Tensor & cumprod_(at::Dimname dim, c10::optional dtype=c10::nullopt) const;
+  at::Tensor cumsum(int64_t dim, c10::optional dtype=c10::nullopt) const;
+  at::Tensor & cumsum_(int64_t dim, c10::optional dtype=c10::nullopt) const;
+  at::Tensor cumsum(at::Dimname dim, c10::optional dtype=c10::nullopt) const;
+  at::Tensor & cumsum_(at::Dimname dim, c10::optional dtype=c10::nullopt) const;
+  at::Tensor diag_embed(int64_t offset=0, int64_t dim1=-2, int64_t dim2=-1) const;
+  at::Tensor diagflat(int64_t offset=0) const;
+  at::Tensor diagonal(int64_t offset=0, int64_t dim1=0, int64_t dim2=1) const;
+  at::Tensor diagonal(at::Dimname outdim, at::Dimname dim1, at::Dimname dim2, int64_t offset=0) const;
+  at::Tensor & fill_diagonal_(const at::Scalar & fill_value, bool wrap=false) const;
+  at::Tensor diff(int64_t n=1, int64_t dim=-1, const c10::optional & prepend={}, const c10::optional & append={}) const;
+  at::Tensor div(const at::Tensor & other) const;
+  at::Tensor & div_(const at::Tensor & other) const;
+  at::Tensor div(const at::Tensor & other, c10::optional rounding_mode) const;
+  at::Tensor & div_(const at::Tensor & other, c10::optional rounding_mode) const;
+  at::Tensor div(const at::Scalar & other) const;
+  at::Tensor & div_(const at::Scalar & other) const;
+  at::Tensor div(const at::Scalar & other, c10::optional rounding_mode) const;
+  at::Tensor & div_(const at::Scalar & other, c10::optional rounding_mode) const;
+  at::Tensor divide(const at::Tensor & other) const;
+  at::Tensor & divide_(const at::Tensor & other) const;
+  at::Tensor divide(const at::Scalar & other) const;
+  at::Tensor & divide_(const at::Scalar & other) const;
+  at::Tensor divide(const at::Tensor & other, c10::optional rounding_mode) const;
+  at::Tensor & divide_(const at::Tensor & other, c10::optional rounding_mode) const;
+  at::Tensor divide(const at::Scalar & other, c10::optional rounding_mode) const;
+  at::Tensor & divide_(const at::Scalar & other, c10::optional rounding_mode) const;
+  at::Tensor true_divide(const at::Tensor & other) const;
+  at::Tensor & true_divide_(const at::Tensor & other) const;
+  at::Tensor true_divide(const at::Scalar & other) const;
+  at::Tensor & true_divide_(const at::Scalar & other) const;
+  at::Tensor dot(const at::Tensor & tensor) const;
+  at::Tensor vdot(const at::Tensor & other) const;
+  at::Tensor new_empty(at::IntArrayRef size, at::TensorOptions options={}) const;
+  at::Tensor new_empty(at::IntArrayRef size, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory) const;
+  at::Tensor new_empty_symint(c10::SymIntArrayRef size, at::TensorOptions options={}) const;
+  at::Tensor new_empty_symint(c10::SymIntArrayRef size, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory) const;
+  at::Tensor new_empty_strided(at::IntArrayRef size, at::IntArrayRef stride, at::TensorOptions options={}) const;
+  at::Tensor new_empty_strided(at::IntArrayRef size, at::IntArrayRef stride, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory) const;
+  at::Tensor new_empty_strided_symint(c10::SymIntArrayRef size, c10::SymIntArrayRef stride, at::TensorOptions options={}) const;
+  at::Tensor new_empty_strided_symint(c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory) const;
+  at::Tensor new_full(at::IntArrayRef size, const at::Scalar & fill_value, at::TensorOptions options={}) const;
+  at::Tensor new_full(at::IntArrayRef size, const at::Scalar & fill_value, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory) const;
+  at::Tensor new_full_symint(c10::SymIntArrayRef size, const at::Scalar & fill_value, at::TensorOptions options={}) const;
+  at::Tensor new_full_symint(c10::SymIntArrayRef size, const at::Scalar & fill_value, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory) const;
+  at::Tensor new_zeros(at::IntArrayRef size, at::TensorOptions options={}) const;
+  at::Tensor new_zeros(at::IntArrayRef size, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory) const;
+  at::Tensor new_zeros_symint(c10::SymIntArrayRef size, at::TensorOptions options={}) const;
+  at::Tensor new_zeros_symint(c10::SymIntArrayRef size, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory) const;
+  at::Tensor new_ones(at::IntArrayRef size, at::TensorOptions options={}) const;
+  at::Tensor new_ones(at::IntArrayRef size, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory) const;
+  at::Tensor new_ones_symint(c10::SymIntArrayRef size, at::TensorOptions options={}) const;
+  at::Tensor new_ones_symint(c10::SymIntArrayRef size, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory) const;
+  const at::Tensor & resize_(at::IntArrayRef size, c10::optional memory_format=c10::nullopt) const;
+  const at::Tensor & resize__symint(c10::SymIntArrayRef size, c10::optional memory_format=c10::nullopt) const;
+  at::Tensor erf() const;
+  at::Tensor & erf_() const;
+  at::Tensor erfc() const;
+  at::Tensor & erfc_() const;
+  at::Tensor exp() const;
+  at::Tensor & exp_() const;
+  at::Tensor exp2() const;
+  at::Tensor & exp2_() const;
+  at::Tensor expm1() const;
+  at::Tensor & expm1_() const;
+  at::Tensor expand(at::IntArrayRef size, bool implicit=false) const;
+  at::Tensor expand_symint(c10::SymIntArrayRef size, bool implicit=false) const;
+  at::Tensor expand_as(const at::Tensor & other) const;
+  at::Tensor flatten(int64_t start_dim=0, int64_t end_dim=-1) const;
+  at::Tensor flatten(int64_t start_dim, int64_t end_dim, at::Dimname out_dim) const;
+  at::Tensor flatten(at::Dimname start_dim, at::Dimname end_dim, at::Dimname out_dim) const;
+  at::Tensor flatten(at::DimnameList dims, at::Dimname out_dim) const;
+  at::Tensor unflatten(int64_t dim, at::IntArrayRef sizes) const;
+  at::Tensor unflatten_symint(int64_t dim, c10::SymIntArrayRef sizes) const;
+  at::Tensor unflatten(at::Dimname dim, at::IntArrayRef sizes, at::DimnameList names) const;
+  at::Tensor unflatten_symint(at::Dimname dim, c10::SymIntArrayRef sizes, at::DimnameList names) const;
+  at::Tensor & fill_(const at::Scalar & value) const;
+  at::Tensor & fill_(const at::Tensor & value) const;
+  at::Tensor floor() const;
+  at::Tensor & floor_() const;
+  at::Tensor floor_divide(const at::Tensor & other) const;
+  at::Tensor & floor_divide_(const at::Tensor & other) const;
+  at::Tensor floor_divide(const at::Scalar & other) const;
+  at::Tensor & floor_divide_(const at::Scalar & other) const;
+  at::Tensor frac() const;
+  at::Tensor & frac_() const;
+  at::Tensor gcd(const at::Tensor & other) const;
+  at::Tensor & gcd_(const at::Tensor & other) const;
+  at::Tensor lcm(const at::Tensor & other) const;
+  at::Tensor & lcm_(const at::Tensor & other) const;
+  at::Tensor index(const c10::List> & indices) const;
+  at::Tensor & index_copy_(int64_t dim, const at::Tensor & index, const at::Tensor & source) const;
+  at::Tensor index_copy(int64_t dim, const at::Tensor & index, const at::Tensor & source) const;
+  at::Tensor & index_copy_(at::Dimname dim, const at::Tensor & index, const at::Tensor & source) const;
+  at::Tensor index_copy(at::Dimname dim, const at::Tensor & index, const at::Tensor & source) const;
+  at::Tensor & index_put_(const c10::List> & indices, const at::Tensor & values, bool accumulate=false) const;
+  at::Tensor index_put(const c10::List> & indices, const at::Tensor & values, bool accumulate=false) const;
+  at::Tensor isclose(const at::Tensor & other, double rtol=1e-05, double atol=1e-08, bool equal_nan=false) const;
+  at::Tensor isnan() const;
+  bool is_distributed() const;
+  bool __dispatch_is_floating_point() const;
+  bool __dispatch_is_complex() const;
+  bool __dispatch_is_conj() const;
+  bool __dispatch__is_zerotensor() const;
+  bool __dispatch_is_neg() const;
+  at::Tensor isreal() const;
+  bool is_nonzero() const;
+  bool is_same_size(const at::Tensor & other) const;
+  bool __dispatch_is_signed() const;
+  bool __dispatch_is_inference() const;
+  at::Tensor kron(const at::Tensor & other) const;
+  ::std::tuple kthvalue(int64_t k, int64_t dim=-1, bool keepdim=false) const;
+  ::std::tuple kthvalue(int64_t k, at::Dimname dim, bool keepdim=false) const;
+  at::Tensor nan_to_num(c10::optional nan=c10::nullopt, c10::optional posinf=c10::nullopt, c10::optional neginf=c10::nullopt) const;
+  at::Tensor & nan_to_num_(c10::optional nan=c10::nullopt, c10::optional posinf=c10::nullopt, c10::optional neginf=c10::nullopt) const;
+  at::Tensor ldexp(const at::Tensor & other) const;
+  at::Tensor & ldexp_(const at::Tensor & other) const;
+  at::Tensor log() const;
+  at::Tensor & log_() const;
+  at::Tensor log10() const;
+  at::Tensor & log10_() const;
+  at::Tensor log1p() const;
+  at::Tensor & log1p_() const;
+  at::Tensor log2() const;
+  at::Tensor & log2_() const;
+  at::Tensor logaddexp(const at::Tensor & other) const;
+  at::Tensor logaddexp2(const at::Tensor & other) const;
+  at::Tensor xlogy(const at::Tensor & other) const;
+  at::Tensor xlogy(const at::Scalar & other) const;
+  at::Tensor & xlogy_(const at::Tensor & other) const;
+  at::Tensor & xlogy_(const at::Scalar & other) const;
+  at::Tensor log_softmax(int64_t dim, c10::optional dtype=c10::nullopt) const;
+  at::Tensor log_softmax(at::Dimname dim, c10::optional dtype=c10::nullopt) const;
+  at::Tensor logcumsumexp(int64_t dim) const;
+  at::Tensor logcumsumexp(at::Dimname dim) const;
+  at::Tensor logsumexp(at::IntArrayRef dim, bool keepdim=false) const;
+  at::Tensor logsumexp(at::DimnameList dim, bool keepdim=false) const;
+  at::Tensor matmul(const at::Tensor & other) const;
+  at::Tensor matrix_power(int64_t n) const;
+  at::Tensor matrix_exp() const;
+  ::std::tuple aminmax(c10::optional dim=c10::nullopt, bool keepdim=false) const;
+  ::std::tuple max(int64_t dim, bool keepdim=false) const;
+  ::std::tuple max(at::Dimname dim, bool keepdim=false) const;
+  at::Tensor amax(at::IntArrayRef dim={}, bool keepdim=false) const;
+  at::Tensor mean(c10::optional dtype=c10::nullopt) const;
+  at::Tensor mean(at::OptionalIntArrayRef dim, bool keepdim=false, c10::optional dtype=c10::nullopt) const;
+  at::Tensor mean(at::DimnameList dim, bool keepdim=false, c10::optional dtype=c10::nullopt) const;
+  at::Tensor nanmean(at::OptionalIntArrayRef dim=c10::nullopt, bool keepdim=false, c10::optional dtype=c10::nullopt) const;
+  at::Tensor median() const;
+  ::std::tuple median(int64_t dim, bool keepdim=false) const;
+  ::std::tuple median(at::Dimname dim, bool keepdim=false) const;
+  at::Tensor nanmedian() const;
+  ::std::tuple nanmedian(int64_t dim, bool keepdim=false) const;
+  ::std::tuple nanmedian(at::Dimname dim, bool keepdim=false) const;
+  ::std::tuple min(int64_t dim, bool keepdim=false) const;
+  ::std::tuple min(at::Dimname dim, bool keepdim=false) const;
+  at::Tensor amin(at::IntArrayRef dim={}, bool keepdim=false) const;
+  at::Tensor mm(const at::Tensor & mat2) const;
+  ::std::tuple mode(int64_t dim=-1, bool keepdim=false) const;
+  ::std::tuple mode(at::Dimname dim, bool keepdim=false) const;
+  at::Tensor mul(const at::Tensor & other) const;
+  at::Tensor & mul_(const at::Tensor & other) const;
+  at::Tensor mul(const at::Scalar & other) const;
+  at::Tensor & mul_(const at::Scalar & other) const;
+  at::Tensor multiply(const at::Tensor & other) const;
+  at::Tensor & multiply_(const at::Tensor & other) const;
+  at::Tensor multiply(const at::Scalar & other) const;
+  at::Tensor & multiply_(const at::Scalar & other) const;
+  at::Tensor mv(const at::Tensor & vec) const;
+  at::Tensor mvlgamma(int64_t p) const;
+  at::Tensor & mvlgamma_(int64_t p) const;
+  at::Tensor narrow_copy(int64_t dim, int64_t start, int64_t length) const;
+  at::Tensor narrow_copy_symint(int64_t dim, c10::SymInt start, c10::SymInt length) const;
+  at::Tensor narrow(int64_t dim, int64_t start, int64_t length) const;
+  at::Tensor narrow_symint(int64_t dim, c10::SymInt start, c10::SymInt length) const;
+  at::Tensor narrow(int64_t dim, const at::Tensor & start, int64_t length) const;
+  at::Tensor narrow_symint(int64_t dim, const at::Tensor & start, c10::SymInt length) const;
+  at::Tensor permute(at::IntArrayRef dims) const;
+  at::Tensor movedim(at::IntArrayRef source, at::IntArrayRef destination) const;
+  at::Tensor movedim(int64_t source, int64_t destination) const;
+  at::Tensor moveaxis(at::IntArrayRef source, at::IntArrayRef destination) const;
+  at::Tensor moveaxis(int64_t source, int64_t destination) const;
+  at::Tensor numpy_T() const;
+  at::Tensor matrix_H() const;
+  at::Tensor mT() const;
+  at::Tensor mH() const;
+  at::Tensor adjoint() const;
+  bool is_pinned(c10::optional device=c10::nullopt) const;
+  at::Tensor pin_memory(c10::optional device=c10::nullopt) const;
+  at::Tensor pinverse(double rcond=1e-15) const;
+  at::Tensor rad2deg() const;
+  at::Tensor & rad2deg_() const;
+  at::Tensor deg2rad() const;
+  at::Tensor & deg2rad_() const;
+  at::Tensor ravel() const;
+  at::Tensor reciprocal() const;
+  at::Tensor & reciprocal_() const;
+  at::Tensor neg() const;
+  at::Tensor & neg_() const;
+  at::Tensor negative() const;
+  at::Tensor & negative_() const;
+  at::Tensor repeat(at::IntArrayRef repeats) const;
+  at::Tensor repeat_symint(c10::SymIntArrayRef repeats) const;
+  at::Tensor repeat_interleave(const at::Tensor & repeats, c10::optional dim=c10::nullopt, c10::optional output_size=c10::nullopt) const;
+  at::Tensor repeat_interleave_symint(const at::Tensor & repeats, c10::optional dim=c10::nullopt, c10::optional output_size=c10::nullopt) const;
+  at::Tensor repeat_interleave(int64_t repeats, c10::optional dim=c10::nullopt, c10::optional output_size=c10::nullopt) const;
+  at::Tensor repeat_interleave_symint(c10::SymInt repeats, c10::optional dim=c10::nullopt, c10::optional output_size=c10::nullopt) const;
+  at::Tensor reshape(at::IntArrayRef shape) const;
+  at::Tensor reshape_symint(c10::SymIntArrayRef shape) const;
+  at::Tensor _reshape_alias(at::IntArrayRef size, at::IntArrayRef stride) const;
+  at::Tensor _reshape_alias_symint(c10::SymIntArrayRef size, c10::SymIntArrayRef stride) const;
+  at::Tensor reshape_as(const at::Tensor & other) const;
+  at::Tensor round() const;
+  at::Tensor & round_() const;
+  at::Tensor round(int64_t decimals) const;
+  at::Tensor & round_(int64_t decimals) const;
+  at::Tensor relu() const;
+  at::Tensor & relu_() const;
+  at::Tensor prelu(const at::Tensor & weight) const;
+  at::Tensor hardshrink(const at::Scalar & lambd=0.5) const;
+  at::Tensor hardshrink_backward(const at::Tensor & grad_out, const at::Scalar & lambd) const;
+  at::Tensor rsqrt() const;
+  at::Tensor & rsqrt_() const;
+  at::Tensor select(at::Dimname dim, int64_t index) const;
+  at::Tensor select(int64_t dim, int64_t index) const;
+  at::Tensor select_symint(int64_t dim, c10::SymInt index) const;
+  at::Tensor sigmoid() const;
+  at::Tensor & sigmoid_() const;
+  at::Tensor logit(c10::optional eps=c10::nullopt) const;
+  at::Tensor & logit_(c10::optional eps=c10::nullopt) const;
+  at::Tensor sin() const;
+  at::Tensor & sin_() const;
+  at::Tensor sinc() const;
+  at::Tensor & sinc_() const;
+  at::Tensor sinh() const;
+  at::Tensor & sinh_() const;
+  at::Tensor detach() const;
+  at::Tensor & detach_() const;
+  int64_t size(at::Dimname dim) const;
+  at::Tensor slice(int64_t dim=0, c10::optional<int64_t> start=c10::nullopt, c10::optional<int64_t> end=c10::nullopt, int64_t step=1) const;
+  at::Tensor slice_symint(int64_t dim=0, c10::optional<c10::SymInt> start=c10::nullopt, c10::optional<c10::SymInt> end=c10::nullopt, c10::SymInt step=1) const;
+  at::Tensor slice_inverse(const at::Tensor & src, int64_t dim=0, c10::optional<int64_t> start=c10::nullopt, c10::optional<int64_t> end=c10::nullopt, int64_t step=1) const;
+  at::Tensor slice_inverse_symint(const at::Tensor & src, int64_t dim=0, c10::optional<c10::SymInt> start=c10::nullopt, c10::optional<c10::SymInt> end=c10::nullopt, c10::SymInt step=1) const;
+  at::Tensor slice_scatter(const at::Tensor & src, int64_t dim=0, c10::optional<int64_t> start=c10::nullopt, c10::optional<int64_t> end=c10::nullopt, int64_t step=1) const;
+  at::Tensor slice_scatter_symint(const at::Tensor & src, int64_t dim=0, c10::optional<c10::SymInt> start=c10::nullopt, c10::optional<c10::SymInt> end=c10::nullopt, c10::SymInt step=1) const;
+  at::Tensor select_scatter(const at::Tensor & src, int64_t dim, int64_t index) const;
+  at::Tensor select_scatter_symint(const at::Tensor & src, int64_t dim, c10::SymInt index) const;
+  at::Tensor diagonal_scatter(const at::Tensor & src, int64_t offset=0, int64_t dim1=0, int64_t dim2=1) const;
+  at::Tensor as_strided_scatter(const at::Tensor & src, at::IntArrayRef size, at::IntArrayRef stride, c10::optional storage_offset=c10::nullopt) const;
+  at::Tensor as_strided_scatter_symint(const at::Tensor & src, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional storage_offset=c10::nullopt) const;
+  at::Tensor smm(const at::Tensor & mat2) const;
+  at::Tensor softmax(int64_t dim, c10::optional<at::ScalarType> dtype=c10::nullopt) const;
+  at::Tensor softmax(at::Dimname dim, c10::optional<at::ScalarType> dtype=c10::nullopt) const;
+  ::std::vector<at::Tensor> unsafe_split(int64_t split_size, int64_t dim=0) const;
+  ::std::vector<at::Tensor> unsafe_split_symint(c10::SymInt split_size, int64_t dim=0) const;
+  ::std::vector<at::Tensor> split(int64_t split_size, int64_t dim=0) const;
+  ::std::vector<at::Tensor> split_symint(c10::SymInt split_size, int64_t dim=0) const;
+  ::std::vector<at::Tensor> split(at::IntArrayRef split_size, int64_t dim=0) const;
+  ::std::vector<at::Tensor> split_symint(c10::SymIntArrayRef split_size, int64_t dim=0) const;
+  ::std::vector<at::Tensor> unsafe_split_with_sizes(at::IntArrayRef split_sizes, int64_t dim=0) const;
+  ::std::vector<at::Tensor> unsafe_split_with_sizes_symint(c10::SymIntArrayRef split_sizes, int64_t dim=0) const;
+  ::std::vector<at::Tensor> split_with_sizes(at::IntArrayRef split_sizes, int64_t dim=0) const;
+  ::std::vector<at::Tensor> split_with_sizes_symint(c10::SymIntArrayRef split_sizes, int64_t dim=0) const;
+  ::std::vector<at::Tensor> hsplit(int64_t sections) const;
+  ::std::vector<at::Tensor> hsplit(at::IntArrayRef indices) const;
+  ::std::vector<at::Tensor> vsplit(int64_t sections) const;
+  ::std::vector<at::Tensor> vsplit(at::IntArrayRef indices) const;
+  ::std::vector<at::Tensor> dsplit(int64_t sections) const;
+  ::std::vector<at::Tensor> dsplit(at::IntArrayRef indices) const;
+  at::Tensor squeeze() const;
+  at::Tensor squeeze(int64_t dim) const;
+  at::Tensor squeeze(at::Dimname dim) const;
+  at::Tensor squeeze(at::IntArrayRef dim) const;
+  at::Tensor & squeeze_() const;
+  at::Tensor & squeeze_(int64_t dim) const;
+  at::Tensor & squeeze_(at::IntArrayRef dim) const;
+  at::Tensor & squeeze_(at::Dimname dim) const;
+  at::Tensor sspaddmm(const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta=1, const at::Scalar & alpha=1) const;
+  at::Tensor stft(int64_t n_fft, c10::optional hop_length, c10::optional win_length, const c10::optional & window, bool normalized, c10::optional onesided=c10::nullopt, c10::optional return_complex=c10::nullopt) const;
+  at::Tensor stft(int64_t n_fft, c10::optional hop_length=c10::nullopt, c10::optional win_length=c10::nullopt, const c10::optional & window={}, bool center=true, c10::string_view pad_mode="reflect", bool normalized=false, c10::optional onesided=c10::nullopt, c10::optional return_complex=c10::nullopt) const;
+  at::Tensor istft(int64_t n_fft, c10::optional hop_length=c10::nullopt, c10::optional win_length=c10::nullopt, const c10::optional & window={}, bool center=true, bool normalized=false, c10::optional onesided=c10::nullopt, c10::optional length=c10::nullopt, bool return_complex=false) const;
+  int64_t stride(at::Dimname dim) const;
+  at::Tensor sum(c10::optional dtype=c10::nullopt) const;
+  at::Tensor sum(at::OptionalIntArrayRef dim, bool keepdim=false, c10::optional dtype=c10::nullopt) const;
+  at::Tensor sum(at::DimnameList dim, bool keepdim=false, c10::optional dtype=c10::nullopt) const;
+  at::Tensor nansum(at::OptionalIntArrayRef dim=c10::nullopt, bool keepdim=false, c10::optional dtype=c10::nullopt) const;
+  at::Tensor sum_to_size(at::IntArrayRef size) const;
+  at::Tensor sum_to_size_symint(c10::SymIntArrayRef size) const;
+  at::Tensor sqrt() const;
+  at::Tensor & sqrt_() const;
+  at::Tensor square() const;
+  at::Tensor & square_() const;
+  at::Tensor std(bool unbiased) const;
+  at::Tensor std(at::OptionalIntArrayRef dim, bool unbiased, bool keepdim=false) const;
+  at::Tensor std(at::OptionalIntArrayRef dim=c10::nullopt, const c10::optional & correction=c10::nullopt, bool keepdim=false) const;
+  at::Tensor std(at::DimnameList dim, bool unbiased, bool keepdim=false) const;
+  at::Tensor std(at::DimnameList dim, const c10::optional & correction=c10::nullopt, bool keepdim=false) const;
+  at::Tensor prod(c10::optional dtype=c10::nullopt) const;
+  at::Tensor prod(int64_t dim, bool keepdim=false, c10::optional dtype=c10::nullopt) const;
+  at::Tensor prod(at::Dimname dim, bool keepdim=false, c10::optional dtype=c10::nullopt) const;
+  at::Tensor t() const;
+  at::Tensor & t_() const;
+  at::Tensor tan() const;
+  at::Tensor & tan_() const;
+  at::Tensor tanh() const;
+  at::Tensor & tanh_() const;
+  at::Tensor tile(at::IntArrayRef dims) const;
+  at::Tensor tile_symint(c10::SymIntArrayRef dims) const;
+  at::Tensor transpose(int64_t dim0, int64_t dim1) const;
+  at::Tensor transpose(at::Dimname dim0, at::Dimname dim1) const;
+  at::Tensor & transpose_(int64_t dim0, int64_t dim1) const;
+  at::Tensor flip(at::IntArrayRef dims) const;
+  at::Tensor fliplr() const;
+  at::Tensor flipud() const;
+  at::Tensor roll(at::IntArrayRef shifts, at::IntArrayRef dims={}) const;
+  at::Tensor roll_symint(c10::SymIntArrayRef shifts, at::IntArrayRef dims={}) const;
+  at::Tensor rot90(int64_t k=1, at::IntArrayRef dims={0,1}) const;
+  at::Tensor _nested_tensor_size() const;
+  at::Tensor _nested_tensor_strides() const;
+  at::Tensor _nested_tensor_storage_offsets() const;
+  at::Tensor trunc() const;
+  at::Tensor & trunc_() const;
+  at::Tensor fix() const;
+  at::Tensor & fix_() const;
+  at::Tensor type_as(const at::Tensor & other) const;
+  at::Tensor unsqueeze(int64_t dim) const;
+  at::Tensor & unsqueeze_(int64_t dim) const;
+  at::Tensor var(bool unbiased) const;
+  at::Tensor var(at::OptionalIntArrayRef dim, bool unbiased, bool keepdim=false) const;
+  at::Tensor var(at::OptionalIntArrayRef dim=c10::nullopt, const c10::optional & correction=c10::nullopt, bool keepdim=false) const;
+  at::Tensor var(at::DimnameList dim, bool unbiased, bool keepdim=false) const;
+  at::Tensor var(at::DimnameList dim, const c10::optional & correction=c10::nullopt, bool keepdim=false) const;
+  at::Tensor view_as(const at::Tensor & other) const;
+  at::Tensor where(const at::Tensor & condition, const at::Tensor & other) const;
+  at::Tensor where(const at::Tensor & condition, const at::Scalar & other) const;
+  at::Tensor norm(const c10::optional & p, at::ScalarType dtype) const;
+  at::Tensor norm(const at::Scalar & p=2) const;
+  at::Tensor norm(const c10::optional & p, at::IntArrayRef dim, bool keepdim, at::ScalarType dtype) const;
+  at::Tensor norm(const c10::optional & p, at::IntArrayRef dim, bool keepdim=false) const;
+  at::Tensor norm(const c10::optional & p, at::DimnameList dim, bool keepdim, at::ScalarType dtype) const;
+  at::Tensor norm(const c10::optional & p, at::DimnameList dim, bool keepdim=false) const;
+  ::std::tuple<at::Tensor,at::Tensor> frexp() const;
+  at::Tensor clone(c10::optional<at::MemoryFormat> memory_format=c10::nullopt) const;
+  at::Tensor positive() const;
+  const at::Tensor & resize_as_(const at::Tensor & the_template, c10::optional memory_format=c10::nullopt) const;
+  const at::Tensor & resize_as_sparse_(const at::Tensor & the_template) const;
+  at::Tensor & zero_() const;
+  at::Tensor sub(const at::Tensor & other, const at::Scalar & alpha=1) const;
+  at::Tensor & sub_(const at::Tensor & other, const at::Scalar & alpha=1) const;
+  at::Tensor sub(const at::Scalar & other, const at::Scalar & alpha=1) const;
+  at::Tensor & sub_(const at::Scalar & other, const at::Scalar & alpha=1) const;
+  at::Tensor subtract(const at::Tensor & other, const at::Scalar & alpha=1) const;
+  at::Tensor & subtract_(const at::Tensor & other, const at::Scalar & alpha=1) const;
+  at::Tensor subtract(const at::Scalar & other, const at::Scalar & alpha=1) const;
+  at::Tensor & subtract_(const at::Scalar & other, const at::Scalar & alpha=1) const;
+  at::Tensor heaviside(const at::Tensor & values) const;
+  at::Tensor & heaviside_(const at::Tensor & values) const;
+  at::Tensor addmm(const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta=1, const at::Scalar & alpha=1) const;
+  at::Tensor & addmm_(const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta=1, const at::Scalar & alpha=1) const;
+  at::Tensor _addmm_activation(const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta=1, const at::Scalar & alpha=1, bool use_gelu=false) const;
+  const at::Tensor & sparse_resize_(at::IntArrayRef size, int64_t sparse_dim, int64_t dense_dim) const;
+  const at::Tensor & sparse_resize_and_clear_(at::IntArrayRef size, int64_t sparse_dim, int64_t dense_dim) const;
+  at::Tensor sparse_mask(const at::Tensor & mask) const;
+  at::Tensor _sparse_mask_projection(const at::Tensor & mask, bool accumulate_matches=false) const;
+  at::Tensor to_dense(c10::optional dtype=c10::nullopt, c10::optional masked_grad=c10::nullopt) const;
+  at::Tensor _to_dense(c10::optional dtype=c10::nullopt, c10::optional masked_grad=c10::nullopt) const;
+  int64_t sparse_dim() const;
+  int64_t _dimI() const;
+  int64_t dense_dim() const;
+  int64_t _dimV() const;
+  int64_t _nnz() const;
+  at::Tensor coalesce() const;
+  bool is_coalesced() const;
+  at::Tensor _indices() const;
+  at::Tensor _values() const;
+  at::Tensor & _coalesced_(bool coalesced) const;
+  at::Tensor indices() const;
+  at::Tensor values() const;
+  at::Tensor crow_indices() const;
+  at::Tensor col_indices() const;
+  at::Tensor ccol_indices() const;
+  at::Tensor row_indices() const;
+  ::std::vector<at::Tensor> unbind(int64_t dim=0) const;
+  ::std::vector<at::Tensor> unbind(at::Dimname dim) const;
+  at::Tensor to_sparse(int64_t sparse_dim) const;
+  at::Tensor _to_sparse(int64_t sparse_dim) const;
+  at::Tensor to_sparse(c10::optional layout=c10::nullopt, at::OptionalIntArrayRef blocksize=c10::nullopt, c10::optional dense_dim=c10::nullopt) const;
+  at::Tensor _to_sparse(c10::optional layout=c10::nullopt, at::OptionalIntArrayRef blocksize=c10::nullopt, c10::optional dense_dim=c10::nullopt) const;
+  at::Tensor to_sparse_csr(c10::optional dense_dim=c10::nullopt) const;
+  at::Tensor _to_sparse_csr(c10::optional dense_dim=c10::nullopt) const;
+  at::Tensor to_sparse_csc(c10::optional dense_dim=c10::nullopt) const;
+  at::Tensor _to_sparse_csc(c10::optional dense_dim=c10::nullopt) const;
+  at::Tensor to_sparse_bsr(at::IntArrayRef blocksize, c10::optional dense_dim=c10::nullopt) const;
+  at::Tensor _to_sparse_bsr(at::IntArrayRef blocksize, c10::optional dense_dim=c10::nullopt) const;
+  at::Tensor to_sparse_bsc(at::IntArrayRef blocksize, c10::optional dense_dim=c10::nullopt) const;
+  at::Tensor _to_sparse_bsc(at::IntArrayRef blocksize, c10::optional dense_dim=c10::nullopt) const;
+  at::Tensor to_mkldnn(c10::optional dtype=c10::nullopt) const;
+  at::Tensor dequantize() const;
+  double q_scale() const;
+  int64_t q_zero_point() const;
+  at::Tensor q_per_channel_scales() const;
+  at::Tensor q_per_channel_zero_points() const;
+  int64_t q_per_channel_axis() const;
+  at::Tensor int_repr() const;
+  at::QScheme qscheme() const;
+  at::Tensor _autocast_to_reduced_precision(bool cuda_enabled, bool cpu_enabled, at::ScalarType cuda_dtype, at::ScalarType cpu_dtype) const;
+  at::Tensor _autocast_to_full_precision(bool cuda_enabled, bool cpu_enabled) const;
+  at::Tensor to(at::TensorOptions options={}, bool non_blocking=false, bool copy=false, c10::optional memory_format=c10::nullopt) const;
+  at::Tensor to(c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory, bool non_blocking, bool copy, c10::optional memory_format) const;
+  at::Tensor to(at::Device device, at::ScalarType dtype, bool non_blocking=false, bool copy=false, c10::optional memory_format=c10::nullopt) const;
+  at::Tensor to(at::ScalarType dtype, bool non_blocking=false, bool copy=false, c10::optional memory_format=c10::nullopt) const;
+  at::Tensor to(const at::Tensor & other, bool non_blocking=false, bool copy=false, c10::optional memory_format=c10::nullopt) const;
+  at::Scalar item() const;
+  at::Tensor & set_(at::Storage source) const;
+  at::Tensor & set_(at::Storage source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride={}) const;
+  at::Tensor & set__symint(at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride={}) const;
+  at::Tensor & set_(const at::Tensor & source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride={}) const;
+  at::Tensor & set__symint(const at::Tensor & source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride={}) const;
+  at::Tensor & set_(const at::Tensor & source) const;
+  at::Tensor & set_() const;
+  bool is_set_to(const at::Tensor & tensor) const;
+  at::Tensor & masked_fill_(const at::Tensor & mask, const at::Scalar & value) const;
+  at::Tensor masked_fill(const at::Tensor & mask, const at::Scalar & value) const;
+  at::Tensor & masked_fill_(const at::Tensor & mask, const at::Tensor & value) const;
+  at::Tensor masked_fill(const at::Tensor & mask, const at::Tensor & value) const;
+  at::Tensor & masked_scatter_(const at::Tensor & mask, const at::Tensor & source) const;
+  at::Tensor masked_scatter(const at::Tensor & mask, const at::Tensor & source) const;
+  at::Tensor view(at::IntArrayRef size) const;
+  at::Tensor view_symint(c10::SymIntArrayRef size) const;
+  at::Tensor view(at::ScalarType dtype) const;
+  at::Tensor & put_(const at::Tensor & index, const at::Tensor & source, bool accumulate=false) const;
+  at::Tensor put(const at::Tensor & index, const at::Tensor & source, bool accumulate=false) const;
+  at::Tensor & index_add_(int64_t dim, const at::Tensor & index, const at::Tensor & source, const at::Scalar & alpha=1) const;
+  at::Tensor index_add(int64_t dim, const at::Tensor & index, const at::Tensor & source, const at::Scalar & alpha=1) const;
+  at::Tensor index_add(at::Dimname dim, const at::Tensor & index, const at::Tensor & source, const at::Scalar & alpha=1) const;
+  at::Tensor & index_reduce_(int64_t dim, const at::Tensor & index, const at::Tensor & source, c10::string_view reduce, bool include_self=true) const;
+  at::Tensor index_reduce(int64_t dim, const at::Tensor & index, const at::Tensor & source, c10::string_view reduce, bool include_self=true) const;
+  at::Tensor & index_fill_(int64_t dim, const at::Tensor & index, const at::Scalar & value) const;
+  at::Tensor index_fill(int64_t dim, const at::Tensor & index, const at::Scalar & value) const;
+  at::Tensor & index_fill_(int64_t dim, const at::Tensor & index, const at::Tensor & value) const;
+  at::Tensor index_fill(int64_t dim, const at::Tensor & index, const at::Tensor & value) const;
+  at::Tensor & index_fill_(at::Dimname dim, const at::Tensor & index, const at::Scalar & value) const;
+  at::Tensor & index_fill_(at::Dimname dim, const at::Tensor & index, const at::Tensor & value) const;
+  at::Tensor index_fill(at::Dimname dim, const at::Tensor & index, const at::Scalar & value) const;
+  at::Tensor index_fill(at::Dimname dim, const at::Tensor & index, const at::Tensor & value) const;
+  at::Tensor scatter(int64_t dim, const at::Tensor & index, const at::Tensor & src) const;
+  at::Tensor & scatter_(int64_t dim, const at::Tensor & index, const at::Tensor & src) const;
+  at::Tensor scatter(int64_t dim, const at::Tensor & index, const at::Scalar & value) const;
+  at::Tensor & scatter_(int64_t dim, const at::Tensor & index, const at::Scalar & value) const;
+  at::Tensor scatter(int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce) const;
+  at::Tensor & scatter_(int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce) const;
+  at::Tensor scatter(int64_t dim, const at::Tensor & index, const at::Scalar & value, c10::string_view reduce) const;
+  at::Tensor & scatter_(int64_t dim, const at::Tensor & index, const at::Scalar & value, c10::string_view reduce) const;
+  at::Tensor scatter(at::Dimname dim, const at::Tensor & index, const at::Tensor & src) const;
+  at::Tensor scatter(at::Dimname dim, const at::Tensor & index, const at::Scalar & value) const;
+  at::Tensor scatter_add(int64_t dim, const at::Tensor & index, const at::Tensor & src) const;
+  at::Tensor & scatter_add_(int64_t dim, const at::Tensor & index, const at::Tensor & src) const;
+  at::Tensor scatter_add(at::Dimname dim, const at::Tensor & index, const at::Tensor & src) const;
+  at::Tensor scatter_reduce(int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self=true) const;
+  at::Tensor & scatter_reduce_(int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self=true) const;
+  at::Tensor & eq_(const at::Scalar & other) const;
+  at::Tensor & eq_(const at::Tensor & other) const;
+  at::Tensor bitwise_and(const at::Scalar & other) const;
+  at::Tensor bitwise_and(const at::Tensor & other) const;
+  at::Tensor & bitwise_and_(const at::Scalar & other) const;
+  at::Tensor & bitwise_and_(const at::Tensor & other) const;
+  at::Tensor __and__(const at::Scalar & other) const;
+  at::Tensor __and__(const at::Tensor & other) const;
+  at::Tensor & __iand__(const at::Scalar & other) const;
+  at::Tensor & __iand__(const at::Tensor & other) const;
+  at::Tensor bitwise_or(const at::Scalar & other) const;
+  at::Tensor bitwise_or(const at::Tensor & other) const;
+  at::Tensor & bitwise_or_(const at::Scalar & other) const;
+  at::Tensor & bitwise_or_(const at::Tensor & other) const;
+  at::Tensor __or__(const at::Scalar & other) const;
+  at::Tensor __or__(const at::Tensor & other) const;
+  at::Tensor & __ior__(const at::Scalar & other) const;
+  at::Tensor & __ior__(const at::Tensor & other) const;
+  at::Tensor bitwise_xor(const at::Scalar & other) const;
+  at::Tensor bitwise_xor(const at::Tensor & other) const;
+  at::Tensor & bitwise_xor_(const at::Scalar & other) const;
+  at::Tensor & bitwise_xor_(const at::Tensor & other) const;
+  at::Tensor __xor__(const at::Scalar & other) const;
+  at::Tensor __xor__(const at::Tensor & other) const;
+  at::Tensor & __ixor__(const at::Scalar & other) const;
+  at::Tensor & __ixor__(const at::Tensor & other) const;
+  at::Tensor __lshift__(const at::Scalar & other) const;
+  at::Tensor __lshift__(const at::Tensor & other) const;
+  at::Tensor & __ilshift__(const at::Scalar & other) const;
+  at::Tensor & __ilshift__(const at::Tensor & other) const;
+  at::Tensor bitwise_left_shift(const at::Tensor & other) const;
+  at::Tensor & bitwise_left_shift_(const at::Tensor & other) const;
+  at::Tensor bitwise_left_shift(const at::Scalar & other) const;
+  at::Tensor & bitwise_left_shift_(const at::Scalar & other) const;
+  at::Tensor __rshift__(const at::Scalar & other) const;
+  at::Tensor __rshift__(const at::Tensor & other) const;
+  at::Tensor & __irshift__(const at::Scalar & other) const;
+  at::Tensor & __irshift__(const at::Tensor & other) const;
+  at::Tensor bitwise_right_shift(const at::Tensor & other) const;
+  at::Tensor & bitwise_right_shift_(const at::Tensor & other) const;
+  at::Tensor bitwise_right_shift(const at::Scalar & other) const;
+  at::Tensor & bitwise_right_shift_(const at::Scalar & other) const;
+  at::Tensor & tril_(int64_t diagonal=0) const;
+  at::Tensor & triu_(int64_t diagonal=0) const;
+  at::Tensor & digamma_() const;
+  at::Tensor & lerp_(const at::Tensor & end, const at::Scalar & weight) const;
+  at::Tensor & lerp_(const at::Tensor & end, const at::Tensor & weight) const;
+  at::Tensor & addbmm_(const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta=1, const at::Scalar & alpha=1) const;
+  at::Tensor addbmm(const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta=1, const at::Scalar & alpha=1) const;
+  at::Tensor & random_(int64_t from, c10::optional<int64_t> to, c10::optional<at::Generator> generator=c10::nullopt) const;
+  at::Tensor & random_(int64_t to, c10::optional<at::Generator> generator=c10::nullopt) const;
+  at::Tensor & random_(c10::optional<at::Generator> generator=c10::nullopt) const;
+  at::Tensor & uniform_(double from=0, double to=1, c10::optional<at::Generator> generator=c10::nullopt) const;
+  at::Tensor & cauchy_(double median=0, double sigma=1, c10::optional<at::Generator> generator=c10::nullopt) const;
+  at::Tensor & log_normal_(double mean=1, double std=2, c10::optional<at::Generator> generator=c10::nullopt) const;
+  at::Tensor & exponential_(double lambd=1, c10::optional<at::Generator> generator=c10::nullopt) const;
+  at::Tensor & geometric_(double p, c10::optional<at::Generator> generator=c10::nullopt) const;
+  at::Tensor diag(int64_t diagonal=0) const;
+  at::Tensor cross(const at::Tensor & other, c10::optional dim=c10::nullopt) const;
+  at::Tensor triu(int64_t diagonal=0) const;
+  at::Tensor tril(int64_t diagonal=0) const;
+  at::Tensor trace() const;
+  at::Tensor ne(const at::Scalar & other) const;
+  at::Tensor ne(const at::Tensor & other) const;
+  at::Tensor & ne_(const at::Scalar & other) const;
+  at::Tensor & ne_(const at::Tensor & other) const;
+  at::Tensor not_equal(const at::Scalar & other) const;
+  at::Tensor not_equal(const at::Tensor & other) const;
+  at::Tensor & not_equal_(const at::Scalar & other) const;
+  at::Tensor & not_equal_(const at::Tensor & other) const;
+  at::Tensor eq(const at::Scalar & other) const;
+  at::Tensor eq(const at::Tensor & other) const;
+  at::Tensor ge(const at::Scalar & other) const;
+  at::Tensor ge(const at::Tensor & other) const;
+  at::Tensor & ge_(const at::Scalar & other) const;
+  at::Tensor & ge_(const at::Tensor & other) const;
+  at::Tensor greater_equal(const at::Scalar & other) const;
+  at::Tensor greater_equal(const at::Tensor & other) const;
+  at::Tensor & greater_equal_(const at::Scalar & other) const;
+  at::Tensor & greater_equal_(const at::Tensor & other) const;
+  at::Tensor le(const at::Scalar & other) const;
+  at::Tensor le(const at::Tensor & other) const;
+  at::Tensor & le_(const at::Scalar & other) const;
+  at::Tensor & le_(const at::Tensor & other) const;
+  at::Tensor less_equal(const at::Scalar & other) const;
+  at::Tensor less_equal(const at::Tensor & other) const;
+  at::Tensor & less_equal_(const at::Scalar & other) const;
+  at::Tensor & less_equal_(const at::Tensor & other) const;
+  at::Tensor gt(const at::Scalar & other) const;
+  at::Tensor gt(const at::Tensor & other) const;
+  at::Tensor & gt_(const at::Scalar & other) const;
+  at::Tensor & gt_(const at::Tensor & other) const;
+  at::Tensor greater(const at::Scalar & other) const;
+  at::Tensor greater(const at::Tensor & other) const;
+  at::Tensor & greater_(const at::Scalar & other) const;
+  at::Tensor & greater_(const at::Tensor & other) const;
+  at::Tensor lt(const at::Scalar & other) const;
+  at::Tensor lt(const at::Tensor & other) const;
+  at::Tensor & lt_(const at::Scalar & other) const;
+  at::Tensor & lt_(const at::Tensor & other) const;
+  at::Tensor less(const at::Scalar & other) const;
+  at::Tensor less(const at::Tensor & other) const;
+  at::Tensor & less_(const at::Scalar & other) const;
+  at::Tensor & less_(const at::Tensor & other) const;
+  at::Tensor take(const at::Tensor & index) const;
+  at::Tensor take_along_dim(const at::Tensor & indices, c10::optional dim=c10::nullopt) const;
+  at::Tensor index_select(int64_t dim, const at::Tensor & index) const;
+  at::Tensor index_select(at::Dimname dim, const at::Tensor & index) const;
+  at::Tensor masked_select(const at::Tensor & mask) const;
+  at::Tensor nonzero() const;
+  at::Tensor nonzero_static(int64_t size, int64_t fill_value=-1) const;
+  ::std::vector<at::Tensor> nonzero_numpy() const;
+  at::Tensor argwhere() const;
+  at::Tensor gather(int64_t dim, const at::Tensor & index, bool sparse_grad=false) const;
+  at::Tensor gather(at::Dimname dim, const at::Tensor & index, bool sparse_grad=false) const;
+  at::Tensor addcmul(const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1) const;
+  at::Tensor & addcmul_(const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1) const;
+  at::Tensor addcdiv(const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1) const;
+  at::Tensor & addcdiv_(const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1) const;
+  ::std::tuple<at::Tensor,at::Tensor> triangular_solve(const at::Tensor & A, bool upper=true, bool transpose=false, bool unitriangular=false) const;
+  ::std::tuple<at::Tensor,at::Tensor,at::Tensor> svd(bool some=true, bool compute_uv=true) const;
+  at::Tensor swapaxes(int64_t axis0, int64_t axis1) const;
+  at::Tensor & swapaxes_(int64_t axis0, int64_t axis1) const;
+  at::Tensor swapdims(int64_t dim0, int64_t dim1) const;
+  at::Tensor & swapdims_(int64_t dim0, int64_t dim1) const;
+  at::Tensor cholesky(bool upper=false) const;
+  at::Tensor cholesky_solve(const at::Tensor & input2, bool upper=false) const;
+  at::Tensor cholesky_inverse(bool upper=false) const;
+  ::std::tuple<at::Tensor,at::Tensor> qr(bool some=true) const;
+  ::std::tuple<at::Tensor,at::Tensor> geqrf() const;
+  at::Tensor orgqr(const at::Tensor & input2) const;
+  at::Tensor ormqr(const at::Tensor & input2, const at::Tensor & input3, bool left=true, bool transpose=false) const;
+  at::Tensor lu_solve(const at::Tensor & LU_data, const at::Tensor & LU_pivots) const;
+  at::Tensor multinomial(int64_t num_samples, bool replacement=false, c10::optional<at::Generator> generator=c10::nullopt) const;
+  at::Tensor & lgamma_() const;
+  at::Tensor lgamma() const;
+  at::Tensor digamma() const;
+  at::Tensor polygamma(int64_t n) const;
+  at::Tensor & polygamma_(int64_t n) const;
+  at::Tensor erfinv() const;
+  at::Tensor & erfinv_() const;
+  at::Tensor i0() const;
+  at::Tensor & i0_() const;
+  at::Tensor sign() const;
+  at::Tensor & sign_() const;
+  at::Tensor signbit() const;
+  at::Tensor dist(const at::Tensor & other, const at::Scalar & p=2) const;
+  at::Tensor & atan2_(const at::Tensor & other) const;
+  at::Tensor atan2(const at::Tensor & other) const;
+  at::Tensor arctan2(const at::Tensor & other) const;
+  at::Tensor & arctan2_(const at::Tensor & other) const;
+  at::Tensor lerp(const at::Tensor & end, const at::Scalar & weight) const;
+  at::Tensor lerp(const at::Tensor & end, const at::Tensor & weight) const;
+  at::Tensor histc(int64_t bins=100, const at::Scalar & min=0, const at::Scalar & max=0) const;
+  ::std::tuple<at::Tensor,at::Tensor> histogram(const at::Tensor & bins, const c10::optional<at::Tensor> & weight={}, bool density=false) const;
+  ::std::tuple<at::Tensor,at::Tensor> histogram(int64_t bins=100, c10::optional<at::ArrayRef<double>> range=c10::nullopt, const c10::optional<at::Tensor> & weight={}, bool density=false) const;
+  at::Tensor fmod(const at::Scalar & other) const;
+  at::Tensor & fmod_(const at::Scalar & other) const;
+  at::Tensor fmod(const at::Tensor & other) const;
+  at::Tensor & fmod_(const at::Tensor & other) const;
+  at::Tensor hypot(const at::Tensor & other) const;
+  at::Tensor & hypot_(const at::Tensor & other) const;
+  at::Tensor igamma(const at::Tensor & other) const;
+  at::Tensor & igamma_(const at::Tensor & other) const;
+  at::Tensor igammac(const at::Tensor & other) const;
+  at::Tensor & igammac_(const at::Tensor & other) const;
+  at::Tensor nextafter(const at::Tensor & other) const;
+  at::Tensor & nextafter_(const at::Tensor & other) const;
+  at::Tensor remainder(const at::Scalar & other) const;
+  at::Tensor & remainder_(const at::Scalar & other) const;
+  at::Tensor remainder(const at::Tensor & other) const;
+  at::Tensor & remainder_(const at::Tensor & other) const;
+  at::Tensor min() const;
+  at::Tensor fmin(const at::Tensor & other) const;
+  at::Tensor max() const;
+  at::Tensor fmax(const at::Tensor & other) const;
+  at::Tensor maximum(const at::Tensor & other) const;
+  at::Tensor max(const at::Tensor & other) const;
+  at::Tensor minimum(const at::Tensor & other) const;
+  at::Tensor min(const at::Tensor & other) const;
+  at::Tensor quantile(const at::Tensor & q, c10::optional<int64_t> dim=c10::nullopt, bool keepdim=false, c10::string_view interpolation="linear") const;
+  at::Tensor quantile(double q, c10::optional<int64_t> dim=c10::nullopt, bool keepdim=false, c10::string_view interpolation="linear") const;
+  at::Tensor nanquantile(const at::Tensor & q, c10::optional<int64_t> dim=c10::nullopt, bool keepdim=false, c10::string_view interpolation="linear") const;
+  at::Tensor nanquantile(double q, c10::optional<int64_t> dim=c10::nullopt, bool keepdim=false, c10::string_view interpolation="linear") const;
+  ::std::tuple<at::Tensor,at::Tensor> sort(int64_t dim=-1, bool descending=false) const;
+  ::std::tuple<at::Tensor,at::Tensor> sort(c10::optional<bool> stable, int64_t dim=-1, bool descending=false) const;
+  ::std::tuple<at::Tensor,at::Tensor> sort(at::Dimname dim, bool descending=false) const;
+  ::std::tuple<at::Tensor,at::Tensor> sort(c10::optional<bool> stable, at::Dimname dim, bool descending=false) const;
+  at::Tensor msort() const;
+  at::Tensor argsort(int64_t dim=-1, bool descending=false) const;
+  at::Tensor argsort(bool stable, int64_t dim=-1, bool descending=false) const;
+  at::Tensor argsort(at::Dimname dim, bool descending=false) const;
+  ::std::tuple<at::Tensor,at::Tensor> topk(int64_t k, int64_t dim=-1, bool largest=true, bool sorted=true) const;
+  ::std::tuple<at::Tensor,at::Tensor> topk_symint(c10::SymInt k, int64_t dim=-1, bool largest=true, bool sorted=true) const;
+  at::Tensor all() const;
+  at::Tensor any() const;
+  at::Tensor renorm(const at::Scalar & p, int64_t dim, const at::Scalar & maxnorm) const;
+  at::Tensor & renorm_(const at::Scalar & p, int64_t dim, const at::Scalar & maxnorm) const;
+  at::Tensor unfold(int64_t dimension, int64_t size, int64_t step) const;
+  bool equal(const at::Tensor & other) const;
+  at::Tensor pow(const at::Tensor & exponent) const;
+  at::Tensor pow(const at::Scalar & exponent) const;
+  at::Tensor & pow_(const at::Scalar & exponent) const;
+  at::Tensor & pow_(const at::Tensor & exponent) const;
+  at::Tensor float_power(const at::Tensor & exponent) const;
+  at::Tensor float_power(const at::Scalar & exponent) const;
+  at::Tensor & float_power_(const at::Scalar & exponent) const;
+  at::Tensor & float_power_(const at::Tensor & exponent) const;
+  at::Tensor & normal_(double mean=0, double std=1, c10::optional<at::Generator> generator=c10::nullopt) const;
+  at::Tensor alias() const;
+  at::Tensor isfinite() const;
+  at::Tensor isinf() const;
+  void record_stream(at::Stream s) const;
+  at::Tensor isposinf() const;
+  at::Tensor isneginf() const;
+  at::Tensor det() const;
+  ::std::tuple<at::Tensor,at::Tensor> slogdet() const;
+  at::Tensor logdet() const;
+  at::Tensor inverse() const;
+  at::Tensor inner(const at::Tensor & other) const;
+  at::Tensor outer(const at::Tensor & vec2) const;
+  at::Tensor ger(const at::Tensor & vec2) const;
+  at::Tensor to_padded_tensor(double padding, at::OptionalIntArrayRef output_size=c10::nullopt) const;
+  at::Tensor to_padded_tensor_symint(double padding, at::OptionalSymIntArrayRef output_size=c10::nullopt) const;
+
+  // Special C++ only overloads for std()-like functions (See gh-40287)
+  // These are needed because int -> bool conversion takes precedence over int -> IntArrayRef
+  // So, for example std(0) would select the std(unbiased=False) overload
+
+  Tensor var(int dim) const {
+    return var(IntArrayRef{dim});
+  }
+
+  Tensor std(int dim) const {
+    return std(IntArrayRef{dim});
+  }
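+
+  // A hedged usage sketch (not part of the generated declarations above): with
+  // these overloads, an int argument reduces over that dimension instead of
+  // converting to bool and selecting the `unbiased` overload.
+  //
+  //   torch::Tensor t = torch::rand({3, 4});
+  //   auto v = t.var(0);  // dispatches to var(IntArrayRef{0}), reducing dim 0
+  //   auto s = t.std(0);  // likewise picks the dim-based std overload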
+
+  // We changed .dtype() to return a TypeMeta in #12766. Ideally, we want the
+  // at::kDouble and its friends to be TypeMeta's, but that hasn't happened yet.
+  // Before that change, we make this method to maintain BC for C++ usage like
+  // `x.to(y.dtype)`.
+  // TODO: remove following two after at::kDouble and its friends are TypeMeta's.
+  inline Tensor to(caffe2::TypeMeta type_meta, bool non_blocking=false, bool copy=false) const {
+    return this->to(/*scalar_type=*/typeMetaToScalarType(type_meta), non_blocking, copy);
+  }
+  inline Tensor to(Device device, caffe2::TypeMeta type_meta, bool non_blocking=false, bool copy=false) const {
+    return this->to(device, /*scalar_type=*/typeMetaToScalarType(type_meta), non_blocking, copy);
+  }
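+
+  // A hedged usage sketch (not part of the generated declarations): since
+  // Tensor::dtype() returns a caffe2::TypeMeta, these overloads keep code like
+  // the following compiling until at::kDouble and friends become TypeMeta's.
+  //
+  //   torch::Tensor x = torch::ones({2});
+  //   torch::Tensor y = torch::zeros({2}, torch::kDouble);
+  //   auto z = x.to(y.dtype());  // routed through typeMetaToScalarType(...)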
+
+  template <typename F, typename... Args>
+  decltype(auto) m(F func, Args&&... params) const {
+    return func(*this, std::forward<Args>(params)...);
+  }
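+
+  // A hedged usage sketch (not part of the generated declarations): `m` merely
+  // forwards `*this` as the first argument of `func`, so a callable can be
+  // applied in a fluent, method-like style.
+  //
+  //   torch::Tensor t = torch::randn({3});
+  //   auto r = t.m([](const torch::Tensor& x) { return x.relu(); });
+  //   auto c = t.m([](const torch::Tensor& x, double lo, double hi) {
+  //     return x.clamp(lo, hi);
+  //   }, -1.0, 1.0);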
+
+  /// NOTE: This is similar to the legacy `.data()` function on `Variable`, and is intended
+  /// to be used from functions that need to access the `Variable`'s equivalent `Tensor`
+  /// (i.e. `Tensor` that shares the same storage and tensor metadata with the `Variable`).
+  ///
+  /// One notable difference with the legacy `.data()` function is that changes to the
+  /// returned `Tensor`'s tensor metadata (e.g. sizes / strides / storage / storage_offset)
+  /// will not update the original `Variable`, due to the fact that this function
+  /// shallow-copies the `Variable`'s underlying TensorImpl.
+  at::Tensor tensor_data() const {
+    return TensorBase::tensor_data();
+  }
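+
+  // A hedged usage sketch (not part of the generated declarations): the result
+  // shares storage with the Variable, but metadata changes on it do not
+  // propagate back.
+  //
+  //   auto v = torch::zeros({2, 3}, torch::requires_grad());
+  //   auto t = v.tensor_data();  // same storage, no autograd history
+  //   t.resize_({6});            // metadata change: `v` still reports sizes {2, 3}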
+
+  /// NOTE: `var.variable_data()` in C++ has the same semantics as `tensor.data`
+  /// in Python, which create a new `Variable` that shares the same storage and
+  /// tensor metadata with the original `Variable`, but with a completely new
+  /// autograd history.
+  ///
+  /// NOTE: If we change the tensor metadata (e.g. sizes / strides /
+  /// storage / storage_offset) of a variable created from `var.variable_data()`, those
+  /// changes will not update the original variable `var`. In `.variable_data()`, we set
+  /// `allow_tensor_metadata_change_` to false to make such changes explicitly illegal,
+  /// in order to prevent users from changing metadata of `var.variable_data()`
+  /// and expecting the original variable `var` to also be updated.
+  at::Tensor variable_data() const {
+    return TensorBase::variable_data();
+  }
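+
+  // A hedged usage sketch (not part of the generated declarations), mirroring
+  // Python's `tensor.data`: the result shares storage but starts a fresh
+  // autograd history, and its metadata is locked against modification.
+  //
+  //   auto v = torch::randn({4}, torch::requires_grad());
+  //   auto d = v.variable_data();
+  //   TORCH_CHECK(!d.requires_grad());  // detached from v's autograd graph
+  //   d.mul_(2);  // writes through the shared storage without tracking grad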
+
+  // Hooks
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+  template <typename T>
+  using hook_return_void_t = std::enable_if_t<std::is_void<typename c10::invoke_result_t<T&, Tensor>>::value, unsigned>;
+  template <typename T>
+  using hook_return_var_t = std::enable_if_t<std::is_same<typename c10::invoke_result_t<T&, Tensor>, Tensor>::value, unsigned>;
+
+  /// Registers a backward hook.
+  ///
+  /// The hook will be called every time a gradient with respect to the Tensor is computed.
+  /// The hook should have one of the following signatures:
+  /// ```
+  /// hook(Tensor grad) -> Tensor
+  /// ```
+  /// ```
+  /// hook(Tensor grad) -> void
+  /// ```
+  /// The hook should not modify its argument, but it can optionally return a new gradient
+  /// which will be used in place of `grad`.
+  ///
+  /// This function returns the index of the hook in the list, which can be used to remove the hook.
+  ///
+  /// Example:
+  /// @code
+  /// auto v = torch::tensor({0., 0., 0.}, torch::requires_grad());
+  /// auto h = v.register_hook([](torch::Tensor grad){ return grad * 2; }); // double the gradient
+  /// v.backward(torch::tensor({1., 2., 3.}));
+  /// // This prints:
+  /// // ```
+  /// //  2
+  /// //  4
+  /// //  6
+  /// // [ CPUFloatType{3} ]
+  /// // ```
+  /// std::cout << v.grad() << std::endl;
+  /// v.remove_hook(h);  // removes the hook
+  /// @endcode
+  template <typename T>
+  hook_return_void_t<T> register_hook(T&& hook) const;
+  template <typename T>
+  hook_return_var_t<T> register_hook(T&& hook) const;
+
+  // Variable methods
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+  Tensor data() const {
+    return TensorBase::data();
+  }
+
+  void _backward(TensorList inputs, const c10::optional<Tensor>& gradient, c10::optional<bool> keep_graph, bool create_graph) const;
+
+  const Tensor& requires_grad_(bool _requires_grad=true) const {
+    TensorBase::requires_grad_(_requires_grad);
+    return *this;
+  }
+};
+
+namespace detail {
+// Helper creator for the Tensor class which doesn't require users to pass in
+// an intrusive_ptr; instead, it converts the arguments passed to the
+// requested intrusive_ptr type.
+template <typename T, typename... Args>
+Tensor make_tensor(Args&&... args) {
+  return Tensor(c10::make_intrusive<T>(std::forward<Args>(args)...));
+}
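+
+// A hedged usage sketch (not part of the generated code); the TensorImpl
+// constructor arguments (storage, key_set, dtype) are illustrative placeholders.
+//
+//   at::Tensor t = at::detail::make_tensor<c10::TensorImpl>(
+//       std::move(storage), key_set, dtype);
+//   // equivalent to at::Tensor(c10::make_intrusive<c10::TensorImpl>(...))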
+
+} // namespace detail
+
+} // namespace at
+
+
+namespace at {
+
+// aten::_backward(Tensor self, Tensor[] inputs, Tensor? gradient=None, bool? retain_graph=None, bool create_graph=False) -> ()
+inline void Tensor::__dispatch__backward(at::TensorList inputs, const c10::optional<at::Tensor> & gradient, c10::optional<bool> retain_graph, bool create_graph) const {
+    return at::_ops::_backward::call(const_cast<Tensor&>(*this), inputs, gradient, retain_graph, create_graph);
+}
+
+// aten::set_data(Tensor(a!) self, Tensor new_data) -> ()
+inline void Tensor::__dispatch_set_data(const at::Tensor & new_data) const {
+    return at::_ops::set_data::call(const_cast(*this), new_data);
+}
+
+// aten::data(Tensor self) -> Tensor
+inline at::Tensor Tensor::__dispatch_data() const {
+    return at::_ops::data::call(const_cast(*this));
+}
+
+// aten::is_leaf(Tensor self) -> bool
+inline bool Tensor::__dispatch_is_leaf() const {
+    return at::_ops::is_leaf::call(const_cast(*this));
+}
+
+// aten::output_nr(Tensor self) -> int
+inline int64_t Tensor::__dispatch_output_nr() const {
+    return at::_ops::output_nr::call(const_cast(*this));
+}
+
+// aten::_version(Tensor self) -> int
+inline int64_t Tensor::__dispatch__version() const {
+    return at::_ops::_version::call(const_cast(*this));
+}
+
+// aten::requires_grad_(Tensor(a!) self, bool requires_grad=True) -> Tensor(a!)
+inline at::Tensor & Tensor::__dispatch_requires_grad_(bool requires_grad) const {
+    return at::_ops::requires_grad_::call(const_cast(*this), requires_grad);
+}
+
+// aten::retain_grad(Tensor(a!) self) -> ()
+inline void Tensor::__dispatch_retain_grad() const {
+    return at::_ops::retain_grad::call(const_cast(*this));
+}
+
+// aten::retains_grad(Tensor self) -> bool
+inline bool Tensor::__dispatch_retains_grad() const {
+    return at::_ops::retains_grad::call(const_cast(*this));
+}
+
+// aten::_fw_primal(Tensor(a) self, int level) -> Tensor(a)
+inline at::Tensor Tensor::_fw_primal(int64_t level) const {
+    return at::_ops::_fw_primal::call(const_cast(*this), level);
+}
+
+// aten::rename_(Tensor(a!) self, Dimname[]? names) -> Tensor(a!)
+inline at::Tensor & Tensor::rename_(c10::optional<at::DimnameList> names) const {
+    return at::_ops::rename_::call(const_cast<Tensor&>(*this), names);
+}
+
+// aten::rename(Tensor(a) self, Dimname[]? names) -> Tensor(a)
+inline at::Tensor Tensor::rename(c10::optional<at::DimnameList> names) const {
+    return at::_ops::rename::call(const_cast<Tensor&>(*this), names);
+}
+
+// aten::align_to(Tensor(a) self, Dimname[] names) -> Tensor(a)
+inline at::Tensor Tensor::align_to(at::DimnameList names) const {
+    return at::_ops::align_to::call(const_cast(*this), names);
+}
+
+// aten::align_to.ellipsis_idx(Tensor(a) self, Dimname[] order, int ellipsis_idx) -> Tensor(a)
+inline at::Tensor Tensor::align_to(at::DimnameList order, int64_t ellipsis_idx) const {
+    return at::_ops::align_to_ellipsis_idx::call(const_cast(*this), order, ellipsis_idx);
+}
+
+// aten::align_as(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::align_as(const at::Tensor & other) const {
+    return at::_ops::align_as::call(const_cast(*this), other);
+}
+
+// aten::refine_names(Tensor(a) self, Dimname[] names) -> Tensor(a)
+inline at::Tensor Tensor::refine_names(at::DimnameList names) const {
+    return at::_ops::refine_names::call(const_cast(*this), names);
+}
+
+// aten::abs(Tensor self) -> Tensor
+inline at::Tensor Tensor::abs() const {
+    return at::_ops::abs::call(const_cast(*this));
+}
+
+// aten::abs_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::abs_() const {
+    return at::_ops::abs_::call(const_cast(*this));
+}
+
+// aten::absolute(Tensor self) -> Tensor
+inline at::Tensor Tensor::absolute() const {
+    return at::_ops::absolute::call(const_cast(*this));
+}
+
+// aten::absolute_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::absolute_() const {
+    return at::_ops::absolute_::call(const_cast(*this));
+}
+
+// aten::angle(Tensor self) -> Tensor
+inline at::Tensor Tensor::angle() const {
+    return at::_ops::angle::call(const_cast(*this));
+}
+
+// aten::sgn(Tensor self) -> Tensor
+inline at::Tensor Tensor::sgn() const {
+    return at::_ops::sgn::call(const_cast(*this));
+}
+
+// aten::sgn_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::sgn_() const {
+    return at::_ops::sgn_::call(const_cast(*this));
+}
+
+// aten::chalf(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor
+inline at::Tensor Tensor::chalf(c10::optional<at::MemoryFormat> memory_format) const {
+    return at::_ops::chalf::call(const_cast<Tensor&>(*this), memory_format);
+}
+
+// aten::_conj(Tensor(a) self) -> Tensor(a)
+inline at::Tensor Tensor::_conj() const {
+    return at::_ops::_conj::call(const_cast(*this));
+}
+
+// aten::conj(Tensor(a) self) -> Tensor(a)
+inline at::Tensor Tensor::__dispatch_conj() const {
+    return at::_ops::conj::call(const_cast(*this));
+}
+
+// aten::_conj_physical(Tensor self) -> Tensor
+inline at::Tensor Tensor::_conj_physical() const {
+    return at::_ops::_conj_physical::call(const_cast(*this));
+}
+
+// aten::conj_physical(Tensor self) -> Tensor
+inline at::Tensor Tensor::conj_physical() const {
+    return at::_ops::conj_physical::call(const_cast(*this));
+}
+
+// aten::conj_physical_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::conj_physical_() const {
+    return at::_ops::conj_physical_::call(const_cast(*this));
+}
+
+// aten::resolve_conj(Tensor(a) self) -> Tensor(a)
+inline at::Tensor Tensor::resolve_conj() const {
+    return at::_ops::resolve_conj::call(const_cast(*this));
+}
+
+// aten::resolve_neg(Tensor(a) self) -> Tensor(a)
+inline at::Tensor Tensor::resolve_neg() const {
+    return at::_ops::resolve_neg::call(const_cast(*this));
+}
+
+// aten::_neg_view(Tensor(a) self) -> Tensor(a)
+inline at::Tensor Tensor::_neg_view() const {
+    return at::_ops::_neg_view::call(const_cast(*this));
+}
+
+// aten::acos(Tensor self) -> Tensor
+inline at::Tensor Tensor::acos() const {
+    return at::_ops::acos::call(const_cast(*this));
+}
+
+// aten::acos_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::acos_() const {
+    return at::_ops::acos_::call(const_cast(*this));
+}
+
+// aten::arccos(Tensor self) -> Tensor
+inline at::Tensor Tensor::arccos() const {
+    return at::_ops::arccos::call(const_cast(*this));
+}
+
+// aten::arccos_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::arccos_() const {
+    return at::_ops::arccos_::call(const_cast(*this));
+}
+
+// aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
+inline at::Tensor Tensor::add(const at::Tensor & other, const at::Scalar & alpha) const {
+    return at::_ops::add_Tensor::call(const_cast(*this), other, alpha);
+}
+
+// aten::add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
+inline at::Tensor & Tensor::add_(const at::Tensor & other, const at::Scalar & alpha) const {
+    return at::_ops::add__Tensor::call(const_cast(*this), other, alpha);
+}
+
+// aten::add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
+inline at::Tensor Tensor::add(const at::Scalar & other, const at::Scalar & alpha) const {
+    return at::_ops::add_Scalar::call(const_cast(*this), other, alpha);
+}
+
+// aten::add_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)
+inline at::Tensor & Tensor::add_(const at::Scalar & other, const at::Scalar & alpha) const {
+    return at::_ops::add__Scalar::call(const_cast(*this), other, alpha);
+}
+
+// aten::addmv(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor
+inline at::Tensor Tensor::addmv(const at::Tensor & mat, const at::Tensor & vec, const at::Scalar & beta, const at::Scalar & alpha) const {
+    return at::_ops::addmv::call(const_cast(*this), mat, vec, beta, alpha);
+}
+
+// aten::addmv_(Tensor(a!) self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
+inline at::Tensor & Tensor::addmv_(const at::Tensor & mat, const at::Tensor & vec, const at::Scalar & beta, const at::Scalar & alpha) const {
+    return at::_ops::addmv_::call(const_cast(*this), mat, vec, beta, alpha);
+}
+
+// aten::addr(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
+inline at::Tensor Tensor::addr(const at::Tensor & vec1, const at::Tensor & vec2, const at::Scalar & beta, const at::Scalar & alpha) const {
+    return at::_ops::addr::call(const_cast(*this), vec1, vec2, beta, alpha);
+}
+
+// aten::addr_(Tensor(a!) self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
+inline at::Tensor & Tensor::addr_(const at::Tensor & vec1, const at::Tensor & vec2, const at::Scalar & beta, const at::Scalar & alpha) const {
+    return at::_ops::addr_::call(const_cast(*this), vec1, vec2, beta, alpha);
+}
+
+// aten::_is_all_true(Tensor self) -> Tensor
+inline at::Tensor Tensor::_is_all_true() const {
+    return at::_ops::_is_all_true::call(const_cast(*this));
+}
+
+// aten::_is_any_true(Tensor self) -> Tensor
+inline at::Tensor Tensor::_is_any_true() const {
+    return at::_ops::_is_any_true::call(const_cast(*this));
+}
+
+// aten::all.dim(Tensor self, int dim, bool keepdim=False) -> Tensor
+inline at::Tensor Tensor::all(int64_t dim, bool keepdim) const {
+    return at::_ops::all_dim::call(const_cast(*this), dim, keepdim);
+}
+
+// aten::all.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor
+inline at::Tensor Tensor::all(at::OptionalIntArrayRef dim, bool keepdim) const {
+    return at::_ops::all_dims::call(const_cast(*this), dim, keepdim);
+}
+
+// aten::all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor
+inline at::Tensor Tensor::all(at::Dimname dim, bool keepdim) const {
+    return at::_ops::all_dimname::call(const_cast(*this), dim, keepdim);
+}
+
+// aten::allclose(Tensor self, Tensor other, float rtol=1e-05, float atol=1e-08, bool equal_nan=False) -> bool
+inline bool Tensor::allclose(const at::Tensor & other, double rtol, double atol, bool equal_nan) const {
+    return at::_ops::allclose::call(const_cast(*this), other, rtol, atol, equal_nan);
+}
+
+// aten::any.dim(Tensor self, int dim, bool keepdim=False) -> Tensor
+inline at::Tensor Tensor::any(int64_t dim, bool keepdim) const {
+    return at::_ops::any_dim::call(const_cast(*this), dim, keepdim);
+}
+
+// aten::any.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor
+inline at::Tensor Tensor::any(at::OptionalIntArrayRef dim, bool keepdim) const {
+    return at::_ops::any_dims::call(const_cast(*this), dim, keepdim);
+}
+
+// aten::any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor
+inline at::Tensor Tensor::any(at::Dimname dim, bool keepdim) const {
+    return at::_ops::any_dimname::call(const_cast(*this), dim, keepdim);
+}
+
+// aten::argmax(Tensor self, int? dim=None, bool keepdim=False) -> Tensor
+inline at::Tensor Tensor::argmax(c10::optional<int64_t> dim, bool keepdim) const {
+    return at::_ops::argmax::call(const_cast<Tensor&>(*this), dim, keepdim);
+}
+
+// aten::argmin(Tensor self, int? dim=None, bool keepdim=False) -> Tensor
+inline at::Tensor Tensor::argmin(c10::optional<int64_t> dim, bool keepdim) const {
+    return at::_ops::argmin::call(const_cast<Tensor&>(*this), dim, keepdim);
+}
+
+// aten::acosh(Tensor self) -> Tensor
+inline at::Tensor Tensor::acosh() const {
+    return at::_ops::acosh::call(const_cast(*this));
+}
+
+// aten::acosh_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::acosh_() const {
+    return at::_ops::acosh_::call(const_cast(*this));
+}
+
+// aten::arccosh(Tensor self) -> Tensor
+inline at::Tensor Tensor::arccosh() const {
+    return at::_ops::arccosh::call(const_cast(*this));
+}
+
+// aten::arccosh_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::arccosh_() const {
+    return at::_ops::arccosh_::call(const_cast(*this));
+}
+
+// aten::asinh(Tensor self) -> Tensor
+inline at::Tensor Tensor::asinh() const {
+    return at::_ops::asinh::call(const_cast(*this));
+}
+
+// aten::asinh_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::asinh_() const {
+    return at::_ops::asinh_::call(const_cast(*this));
+}
+
+// aten::arcsinh(Tensor self) -> Tensor
+inline at::Tensor Tensor::arcsinh() const {
+    return at::_ops::arcsinh::call(const_cast(*this));
+}
+
+// aten::arcsinh_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::arcsinh_() const {
+    return at::_ops::arcsinh_::call(const_cast(*this));
+}
+
+// aten::atanh(Tensor self) -> Tensor
+inline at::Tensor Tensor::atanh() const {
+    return at::_ops::atanh::call(const_cast(*this));
+}
+
+// aten::atanh_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::atanh_() const {
+    return at::_ops::atanh_::call(const_cast(*this));
+}
+
+// aten::arctanh(Tensor self) -> Tensor
+inline at::Tensor Tensor::arctanh() const {
+    return at::_ops::arctanh::call(const_cast(*this));
+}
+
+// aten::arctanh_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::arctanh_() const {
+    return at::_ops::arctanh_::call(const_cast(*this));
+}
+
+// aten::as_strided(Tensor(a) self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor(a)
+inline at::Tensor Tensor::as_strided(at::IntArrayRef size, at::IntArrayRef stride, c10::optional<int64_t> storage_offset) const {
+    return at::_ops::as_strided::call(const_cast<Tensor&>(*this), c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), storage_offset.has_value() ? c10::make_optional(c10::SymInt(*storage_offset)) : c10::nullopt);
+}
+
+// aten::as_strided(Tensor(a) self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor(a)
+inline at::Tensor Tensor::as_strided_symint(c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional<c10::SymInt> storage_offset) const {
+    return at::_ops::as_strided::call(const_cast<Tensor&>(*this), size, stride, storage_offset);
+}
+
+// aten::as_strided_(Tensor(a!) self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor(a!)
+inline const at::Tensor & Tensor::as_strided_(at::IntArrayRef size, at::IntArrayRef stride, c10::optional<int64_t> storage_offset) const {
+    return at::_ops::as_strided_::call(const_cast<Tensor&>(*this), c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), storage_offset.has_value() ? c10::make_optional(c10::SymInt(*storage_offset)) : c10::nullopt);
+}
+
+// aten::as_strided_(Tensor(a!) self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor(a!)
+inline const at::Tensor & Tensor::as_strided__symint(c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional<c10::SymInt> storage_offset) const {
+    return at::_ops::as_strided_::call(const_cast<Tensor&>(*this), size, stride, storage_offset);
+}
+
+// aten::asin(Tensor self) -> Tensor
+inline at::Tensor Tensor::asin() const {
+    return at::_ops::asin::call(const_cast(*this));
+}
+
+// aten::asin_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::asin_() const {
+    return at::_ops::asin_::call(const_cast(*this));
+}
+
+// aten::arcsin(Tensor self) -> Tensor
+inline at::Tensor Tensor::arcsin() const {
+    return at::_ops::arcsin::call(const_cast(*this));
+}
+
+// aten::arcsin_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::arcsin_() const {
+    return at::_ops::arcsin_::call(const_cast(*this));
+}
+
+// aten::atan(Tensor self) -> Tensor
+inline at::Tensor Tensor::atan() const {
+    return at::_ops::atan::call(const_cast(*this));
+}
+
+// aten::atan_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::atan_() const {
+    return at::_ops::atan_::call(const_cast(*this));
+}
+
+// aten::arctan(Tensor self) -> Tensor
+inline at::Tensor Tensor::arctan() const {
+    return at::_ops::arctan::call(const_cast(*this));
+}
+
+// aten::arctan_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::arctan_() const {
+    return at::_ops::arctan_::call(const_cast(*this));
+}
+
+// aten::baddbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
+inline at::Tensor Tensor::baddbmm(const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta, const at::Scalar & alpha) const {
+    return at::_ops::baddbmm::call(const_cast(*this), batch1, batch2, beta, alpha);
+}
+
+// aten::baddbmm_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
+inline at::Tensor & Tensor::baddbmm_(const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta, const at::Scalar & alpha) const {
+    return at::_ops::baddbmm_::call(const_cast(*this), batch1, batch2, beta, alpha);
+}
+
+// aten::bernoulli(Tensor self, *, Generator? generator=None) -> Tensor
+inline at::Tensor Tensor::bernoulli(c10::optional<at::Generator> generator) const {
+    return at::_ops::bernoulli::call(const_cast<Tensor&>(*this), generator);
+}
+
+// aten::bernoulli_.Tensor(Tensor(a!) self, Tensor p, *, Generator? generator=None) -> Tensor(a!)
+inline at::Tensor & Tensor::bernoulli_(const at::Tensor & p, c10::optional<at::Generator> generator) const {
+    return at::_ops::bernoulli__Tensor::call(const_cast<Tensor&>(*this), p, generator);
+}
+
+// aten::bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!)
+inline at::Tensor & Tensor::bernoulli_(double p, c10::optional<at::Generator> generator) const {
+    return at::_ops::bernoulli__float::call(const_cast<Tensor&>(*this), p, generator);
+}
+
+// aten::bernoulli.p(Tensor self, float p, *, Generator? generator=None) -> Tensor
+inline at::Tensor Tensor::bernoulli(double p, c10::optional<at::Generator> generator) const {
+    return at::_ops::bernoulli_p::call(const_cast<Tensor&>(*this), p, generator);
+}
+
+// aten::bincount(Tensor self, Tensor? weights=None, int minlength=0) -> Tensor
+inline at::Tensor Tensor::bincount(const c10::optional<at::Tensor> & weights, int64_t minlength) const {
+    return at::_ops::bincount::call(const_cast<Tensor&>(*this), weights, minlength);
+}
+
+// aten::bitwise_not(Tensor self) -> Tensor
+inline at::Tensor Tensor::bitwise_not() const {
+    return at::_ops::bitwise_not::call(const_cast(*this));
+}
+
+// aten::bitwise_not_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::bitwise_not_() const {
+    return at::_ops::bitwise_not_::call(const_cast(*this));
+}
+
+// aten::copysign.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::copysign(const at::Tensor & other) const {
+    return at::_ops::copysign_Tensor::call(const_cast(*this), other);
+}
+
+// aten::copysign_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::copysign_(const at::Tensor & other) const {
+    return at::_ops::copysign__Tensor::call(const_cast(*this), other);
+}
+
+// aten::copysign.Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor Tensor::copysign(const at::Scalar & other) const {
+    return at::_ops::copysign_Scalar::call(const_cast(*this), other);
+}
+
+// aten::copysign_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+inline at::Tensor & Tensor::copysign_(const at::Scalar & other) const {
+    return at::_ops::copysign__Scalar::call(const_cast(*this), other);
+}
+
+// aten::_lazy_clone(Tensor self) -> Tensor
+inline at::Tensor Tensor::_lazy_clone() const {
+    return at::_ops::_lazy_clone::call(const_cast(*this));
+}
+
+// aten::logical_not(Tensor self) -> Tensor
+inline at::Tensor Tensor::logical_not() const {
+    return at::_ops::logical_not::call(const_cast<Tensor&>(*this));
+}
+
+// aten::logical_not_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::logical_not_() const {
+    return at::_ops::logical_not_::call(const_cast<Tensor&>(*this));
+}
+
+// aten::logical_xor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::logical_xor(const at::Tensor & other) const {
+    return at::_ops::logical_xor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::logical_xor_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::logical_xor_(const at::Tensor & other) const {
+    return at::_ops::logical_xor_::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::logical_and(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::logical_and(const at::Tensor & other) const {
+    return at::_ops::logical_and::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::logical_and_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::logical_and_(const at::Tensor & other) const {
+    return at::_ops::logical_and_::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::logical_or(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::logical_or(const at::Tensor & other) const {
+    return at::_ops::logical_or::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::logical_or_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::logical_or_(const at::Tensor & other) const {
+    return at::_ops::logical_or_::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::bmm(Tensor self, Tensor mat2) -> Tensor
+inline at::Tensor Tensor::bmm(const at::Tensor & mat2) const {
+    return at::_ops::bmm::call(const_cast<Tensor&>(*this), mat2);
+}
+
+// aten::broadcast_to(Tensor(a) self, SymInt[] size) -> Tensor(a)
+inline at::Tensor Tensor::broadcast_to(at::IntArrayRef size) const {
+    return at::_ops::broadcast_to::call(const_cast<Tensor&>(*this), c10::fromIntArrayRefSlow(size));
+}
+
+// aten::broadcast_to(Tensor(a) self, SymInt[] size) -> Tensor(a)
+inline at::Tensor Tensor::broadcast_to_symint(c10::SymIntArrayRef size) const {
+    return at::_ops::broadcast_to::call(const_cast<Tensor&>(*this), size);
+}
+
+// aten::ceil(Tensor self) -> Tensor
+inline at::Tensor Tensor::ceil() const {
+    return at::_ops::ceil::call(const_cast<Tensor&>(*this));
+}
+
+// aten::ceil_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::ceil_() const {
+    return at::_ops::ceil_::call(const_cast<Tensor&>(*this));
+}
+
+// aten::unsafe_chunk(Tensor self, int chunks, int dim=0) -> Tensor[]
+inline ::std::vector<at::Tensor> Tensor::unsafe_chunk(int64_t chunks, int64_t dim) const {
+    return at::_ops::unsafe_chunk::call(const_cast<Tensor&>(*this), chunks, dim);
+}
+
+// aten::chunk(Tensor(a -> *) self, int chunks, int dim=0) -> Tensor(a)[]
+inline ::std::vector<at::Tensor> Tensor::chunk(int64_t chunks, int64_t dim) const {
+    return at::_ops::chunk::call(const_cast<Tensor&>(*this), chunks, dim);
+}
+
+// aten::tensor_split.sections(Tensor(a -> *) self, SymInt sections, int dim=0) -> Tensor(a)[]
+inline ::std::vector<at::Tensor> Tensor::tensor_split(int64_t sections, int64_t dim) const {
+    return at::_ops::tensor_split_sections::call(const_cast<Tensor&>(*this), sections, dim);
+}
+
+// aten::tensor_split.sections(Tensor(a -> *) self, SymInt sections, int dim=0) -> Tensor(a)[]
+inline ::std::vector<at::Tensor> Tensor::tensor_split_symint(c10::SymInt sections, int64_t dim) const {
+    return at::_ops::tensor_split_sections::call(const_cast<Tensor&>(*this), sections, dim);
+}
+
+// aten::tensor_split.indices(Tensor(a -> *) self, SymInt[] indices, int dim=0) -> Tensor(a)[]
+inline ::std::vector<at::Tensor> Tensor::tensor_split(at::IntArrayRef indices, int64_t dim) const {
+    return at::_ops::tensor_split_indices::call(const_cast<Tensor&>(*this), c10::fromIntArrayRefSlow(indices), dim);
+}
+
+// aten::tensor_split.indices(Tensor(a -> *) self, SymInt[] indices, int dim=0) -> Tensor(a)[]
+inline ::std::vector<at::Tensor> Tensor::tensor_split_symint(c10::SymIntArrayRef indices, int64_t dim) const {
+    return at::_ops::tensor_split_indices::call(const_cast<Tensor&>(*this), indices, dim);
+}
+
+// aten::tensor_split.tensor_indices_or_sections(Tensor(a -> *) self, Tensor tensor_indices_or_sections, int dim=0) -> Tensor(a)[]
+inline ::std::vector<at::Tensor> Tensor::tensor_split(const at::Tensor & tensor_indices_or_sections, int64_t dim) const {
+    return at::_ops::tensor_split_tensor_indices_or_sections::call(const_cast<Tensor&>(*this), tensor_indices_or_sections, dim);
+}
+
+// aten::clamp(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor
+inline at::Tensor Tensor::clamp(const c10::optional<at::Scalar> & min, const c10::optional<at::Scalar> & max) const {
+    return at::_ops::clamp::call(const_cast<Tensor&>(*this), min, max);
+}
+
+// aten::clamp.Tensor(Tensor self, Tensor? min=None, Tensor? max=None) -> Tensor
+inline at::Tensor Tensor::clamp(const c10::optional<at::Tensor> & min, const c10::optional<at::Tensor> & max) const {
+    return at::_ops::clamp_Tensor::call(const_cast<Tensor&>(*this), min, max);
+}
+
+// aten::clamp_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!)
+inline at::Tensor & Tensor::clamp_(const c10::optional<at::Scalar> & min, const c10::optional<at::Scalar> & max) const {
+    return at::_ops::clamp_::call(const_cast<Tensor&>(*this), min, max);
+}
+
+// aten::clamp_.Tensor(Tensor(a!) self, Tensor? min=None, Tensor? max=None) -> Tensor(a!)
+inline at::Tensor & Tensor::clamp_(const c10::optional<at::Tensor> & min, const c10::optional<at::Tensor> & max) const {
+    return at::_ops::clamp__Tensor::call(const_cast<Tensor&>(*this), min, max);
+}
+
+// aten::clamp_max(Tensor self, Scalar max) -> Tensor
+inline at::Tensor Tensor::clamp_max(const at::Scalar & max) const {
+    return at::_ops::clamp_max::call(const_cast<Tensor&>(*this), max);
+}
+
+// aten::clamp_max.Tensor(Tensor self, Tensor max) -> Tensor
+inline at::Tensor Tensor::clamp_max(const at::Tensor & max) const {
+    return at::_ops::clamp_max_Tensor::call(const_cast<Tensor&>(*this), max);
+}
+
+// aten::clamp_max_(Tensor(a!) self, Scalar max) -> Tensor(a!)
+inline at::Tensor & Tensor::clamp_max_(const at::Scalar & max) const {
+    return at::_ops::clamp_max_::call(const_cast<Tensor&>(*this), max);
+}
+
+// aten::clamp_max_.Tensor(Tensor(a!) self, Tensor max) -> Tensor(a!)
+inline at::Tensor & Tensor::clamp_max_(const at::Tensor & max) const {
+    return at::_ops::clamp_max__Tensor::call(const_cast<Tensor&>(*this), max);
+}
+
+// aten::clamp_min(Tensor self, Scalar min) -> Tensor
+inline at::Tensor Tensor::clamp_min(const at::Scalar & min) const {
+    return at::_ops::clamp_min::call(const_cast<Tensor&>(*this), min);
+}
+
+// aten::clamp_min.Tensor(Tensor self, Tensor min) -> Tensor
+inline at::Tensor Tensor::clamp_min(const at::Tensor & min) const {
+    return at::_ops::clamp_min_Tensor::call(const_cast<Tensor&>(*this), min);
+}
+
+// aten::clamp_min_(Tensor(a!) self, Scalar min) -> Tensor(a!)
+inline at::Tensor & Tensor::clamp_min_(const at::Scalar & min) const {
+    return at::_ops::clamp_min_::call(const_cast<Tensor&>(*this), min);
+}
+
+// aten::clamp_min_.Tensor(Tensor(a!) self, Tensor min) -> Tensor(a!)
+inline at::Tensor & Tensor::clamp_min_(const at::Tensor & min) const {
+    return at::_ops::clamp_min__Tensor::call(const_cast<Tensor&>(*this), min);
+}
+
+// aten::clip(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor
+inline at::Tensor Tensor::clip(const c10::optional<at::Scalar> & min, const c10::optional<at::Scalar> & max) const {
+    return at::_ops::clip::call(const_cast<Tensor&>(*this), min, max);
+}
+
+// aten::clip.Tensor(Tensor self, Tensor? min=None, Tensor? max=None) -> Tensor
+inline at::Tensor Tensor::clip(const c10::optional<at::Tensor> & min, const c10::optional<at::Tensor> & max) const {
+    return at::_ops::clip_Tensor::call(const_cast<Tensor&>(*this), min, max);
+}
+
+// aten::clip_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!)
+inline at::Tensor & Tensor::clip_(const c10::optional<at::Scalar> & min, const c10::optional<at::Scalar> & max) const {
+    return at::_ops::clip_::call(const_cast<Tensor&>(*this), min, max);
+}
+
+// aten::clip_.Tensor(Tensor(a!) self, Tensor? min=None, Tensor? max=None) -> Tensor(a!)
+inline at::Tensor & Tensor::clip_(const c10::optional<at::Tensor> & min, const c10::optional<at::Tensor> & max) const {
+    return at::_ops::clip__Tensor::call(const_cast<Tensor&>(*this), min, max);
+}
+
+// aten::contiguous(Tensor(a) self, *, MemoryFormat memory_format=contiguous_format) -> Tensor(a)
+inline at::Tensor Tensor::__dispatch_contiguous(at::MemoryFormat memory_format) const {
+    return at::_ops::contiguous::call(const_cast<Tensor&>(*this), memory_format);
+}
+
+// aten::copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)
+inline at::Tensor & Tensor::copy_(const at::Tensor & src, bool non_blocking) const {
+    return at::_ops::copy_::call(const_cast<Tensor&>(*this), src, non_blocking);
+}
+
+// aten::cos(Tensor self) -> Tensor
+inline at::Tensor Tensor::cos() const {
+    return at::_ops::cos::call(const_cast<Tensor&>(*this));
+}
+
+// aten::cos_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::cos_() const {
+    return at::_ops::cos_::call(const_cast<Tensor&>(*this));
+}
+
+// aten::cosh(Tensor self) -> Tensor
+inline at::Tensor Tensor::cosh() const {
+    return at::_ops::cosh::call(const_cast<Tensor&>(*this));
+}
+
+// aten::cosh_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::cosh_() const {
+    return at::_ops::cosh_::call(const_cast<Tensor&>(*this));
+}
+
+// aten::count_nonzero.dim_IntList(Tensor self, int[] dim) -> Tensor
+inline at::Tensor Tensor::count_nonzero(at::IntArrayRef dim) const {
+    return at::_ops::count_nonzero_dim_IntList::call(const_cast<Tensor&>(*this), dim);
+}
+
+// aten::count_nonzero(Tensor self, int? dim=None) -> Tensor
+inline at::Tensor Tensor::count_nonzero(c10::optional<int64_t> dim) const {
+    return at::_ops::count_nonzero::call(const_cast<Tensor&>(*this), dim);
+}
+
+// aten::cov(Tensor self, *, int correction=1, Tensor? fweights=None, Tensor? aweights=None) -> Tensor
+inline at::Tensor Tensor::cov(int64_t correction, const c10::optional<at::Tensor> & fweights, const c10::optional<at::Tensor> & aweights) const {
+    return at::_ops::cov::call(const_cast<Tensor&>(*this), correction, fweights, aweights);
+}
+
+// aten::corrcoef(Tensor self) -> Tensor
+inline at::Tensor Tensor::corrcoef() const {
+    return at::_ops::corrcoef::call(const_cast<Tensor&>(*this));
+}
+
+// aten::cummax(Tensor self, int dim) -> (Tensor values, Tensor indices)
+inline ::std::tuple<at::Tensor,at::Tensor> Tensor::cummax(int64_t dim) const {
+    return at::_ops::cummax::call(const_cast<Tensor&>(*this), dim);
+}
+
+// aten::cummax.dimname(Tensor self, Dimname dim) -> (Tensor values, Tensor indices)
+inline ::std::tuple<at::Tensor,at::Tensor> Tensor::cummax(at::Dimname dim) const {
+    return at::_ops::cummax_dimname::call(const_cast<Tensor&>(*this), dim);
+}
+
+// aten::cummin(Tensor self, int dim) -> (Tensor values, Tensor indices)
+inline ::std::tuple<at::Tensor,at::Tensor> Tensor::cummin(int64_t dim) const {
+    return at::_ops::cummin::call(const_cast<Tensor&>(*this), dim);
+}
+
+// aten::cummin.dimname(Tensor self, Dimname dim) -> (Tensor values, Tensor indices)
+inline ::std::tuple<at::Tensor,at::Tensor> Tensor::cummin(at::Dimname dim) const {
+    return at::_ops::cummin_dimname::call(const_cast<Tensor&>(*this), dim);
+}
+
+// aten::cumprod(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor
+inline at::Tensor Tensor::cumprod(int64_t dim, c10::optional<at::ScalarType> dtype) const {
+    return at::_ops::cumprod::call(const_cast<Tensor&>(*this), dim, dtype);
+}
+
+// aten::cumprod_(Tensor(a!) self, int dim, *, ScalarType? dtype=None) -> Tensor(a!)
+inline at::Tensor & Tensor::cumprod_(int64_t dim, c10::optional<at::ScalarType> dtype) const {
+    return at::_ops::cumprod_::call(const_cast<Tensor&>(*this), dim, dtype);
+}
+
+// aten::cumprod.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
+inline at::Tensor Tensor::cumprod(at::Dimname dim, c10::optional<at::ScalarType> dtype) const {
+    return at::_ops::cumprod_dimname::call(const_cast<Tensor&>(*this), dim, dtype);
+}
+
+// aten::cumprod_.dimname(Tensor(a!) self, Dimname dim, *, ScalarType? dtype=None) -> Tensor(a!)
+inline at::Tensor & Tensor::cumprod_(at::Dimname dim, c10::optional<at::ScalarType> dtype) const {
+    return at::_ops::cumprod__dimname::call(const_cast<Tensor&>(*this), dim, dtype);
+}
+
+// aten::cumsum(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor
+inline at::Tensor Tensor::cumsum(int64_t dim, c10::optional<at::ScalarType> dtype) const {
+    return at::_ops::cumsum::call(const_cast<Tensor&>(*this), dim, dtype);
+}
+
+// aten::cumsum_(Tensor(a!) self, int dim, *, ScalarType? dtype=None) -> Tensor(a!)
+inline at::Tensor & Tensor::cumsum_(int64_t dim, c10::optional<at::ScalarType> dtype) const {
+    return at::_ops::cumsum_::call(const_cast<Tensor&>(*this), dim, dtype);
+}
+
+// aten::cumsum.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
+inline at::Tensor Tensor::cumsum(at::Dimname dim, c10::optional<at::ScalarType> dtype) const {
+    return at::_ops::cumsum_dimname::call(const_cast<Tensor&>(*this), dim, dtype);
+}
+
+// aten::cumsum_.dimname(Tensor(a!) self, Dimname dim, *, ScalarType? dtype=None) -> Tensor(a!)
+inline at::Tensor & Tensor::cumsum_(at::Dimname dim, c10::optional<at::ScalarType> dtype) const {
+    return at::_ops::cumsum__dimname::call(const_cast<Tensor&>(*this), dim, dtype);
+}
+
+// aten::diag_embed(Tensor self, int offset=0, int dim1=-2, int dim2=-1) -> Tensor
+inline at::Tensor Tensor::diag_embed(int64_t offset, int64_t dim1, int64_t dim2) const {
+    return at::_ops::diag_embed::call(const_cast<Tensor&>(*this), offset, dim1, dim2);
+}
+
+// aten::diagflat(Tensor self, int offset=0) -> Tensor
+inline at::Tensor Tensor::diagflat(int64_t offset) const {
+    return at::_ops::diagflat::call(const_cast<Tensor&>(*this), offset);
+}
+
+// aten::diagonal(Tensor(a) self, int offset=0, int dim1=0, int dim2=1) -> Tensor(a)
+inline at::Tensor Tensor::diagonal(int64_t offset, int64_t dim1, int64_t dim2) const {
+    return at::_ops::diagonal::call(const_cast<Tensor&>(*this), offset, dim1, dim2);
+}
+
+// aten::diagonal.Dimname(Tensor(a) self, *, Dimname outdim, Dimname dim1, Dimname dim2, int offset=0) -> Tensor(a)
+inline at::Tensor Tensor::diagonal(at::Dimname outdim, at::Dimname dim1, at::Dimname dim2, int64_t offset) const {
+    return at::_ops::diagonal_Dimname::call(const_cast<Tensor&>(*this), outdim, dim1, dim2, offset);
+}
+
+// aten::fill_diagonal_(Tensor(a!) self, Scalar fill_value, bool wrap=False) -> Tensor(a!)
+inline at::Tensor & Tensor::fill_diagonal_(const at::Scalar & fill_value, bool wrap) const {
+    return at::_ops::fill_diagonal_::call(const_cast<Tensor&>(*this), fill_value, wrap);
+}
+
+// aten::diff(Tensor self, int n=1, int dim=-1, Tensor? prepend=None, Tensor? append=None) -> Tensor
+inline at::Tensor Tensor::diff(int64_t n, int64_t dim, const c10::optional<at::Tensor> & prepend, const c10::optional<at::Tensor> & append) const {
+    return at::_ops::diff::call(const_cast<Tensor&>(*this), n, dim, prepend, append);
+}
+
+// aten::div.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::div(const at::Tensor & other) const {
+    return at::_ops::div_Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::div_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::div_(const at::Tensor & other) const {
+    return at::_ops::div__Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::div.Tensor_mode(Tensor self, Tensor other, *, str? rounding_mode) -> Tensor
+inline at::Tensor Tensor::div(const at::Tensor & other, c10::optional<c10::string_view> rounding_mode) const {
+    return at::_ops::div_Tensor_mode::call(const_cast<Tensor&>(*this), other, rounding_mode);
+}
+
+// aten::div_.Tensor_mode(Tensor(a!) self, Tensor other, *, str? rounding_mode) -> Tensor(a!)
+inline at::Tensor & Tensor::div_(const at::Tensor & other, c10::optional<c10::string_view> rounding_mode) const {
+    return at::_ops::div__Tensor_mode::call(const_cast<Tensor&>(*this), other, rounding_mode);
+}
+
+// aten::div.Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor Tensor::div(const at::Scalar & other) const {
+    return at::_ops::div_Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::div_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+inline at::Tensor & Tensor::div_(const at::Scalar & other) const {
+    return at::_ops::div__Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::div.Scalar_mode(Tensor self, Scalar other, *, str? rounding_mode) -> Tensor
+inline at::Tensor Tensor::div(const at::Scalar & other, c10::optional<c10::string_view> rounding_mode) const {
+    return at::_ops::div_Scalar_mode::call(const_cast<Tensor&>(*this), other, rounding_mode);
+}
+
+// aten::div_.Scalar_mode(Tensor(a!) self, Scalar other, *, str? rounding_mode) -> Tensor(a!)
+inline at::Tensor & Tensor::div_(const at::Scalar & other, c10::optional<c10::string_view> rounding_mode) const {
+    return at::_ops::div__Scalar_mode::call(const_cast<Tensor&>(*this), other, rounding_mode);
+}
+
+// aten::divide.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::divide(const at::Tensor & other) const {
+    return at::_ops::divide_Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::divide_(const at::Tensor & other) const {
+    return at::_ops::divide__Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::divide.Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor Tensor::divide(const at::Scalar & other) const {
+    return at::_ops::divide_Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+inline at::Tensor & Tensor::divide_(const at::Scalar & other) const {
+    return at::_ops::divide__Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::divide.Tensor_mode(Tensor self, Tensor other, *, str? rounding_mode) -> Tensor
+inline at::Tensor Tensor::divide(const at::Tensor & other, c10::optional<c10::string_view> rounding_mode) const {
+    return at::_ops::divide_Tensor_mode::call(const_cast<Tensor&>(*this), other, rounding_mode);
+}
+
+// aten::divide_.Tensor_mode(Tensor(a!) self, Tensor other, *, str? rounding_mode) -> Tensor(a!)
+inline at::Tensor & Tensor::divide_(const at::Tensor & other, c10::optional<c10::string_view> rounding_mode) const {
+    return at::_ops::divide__Tensor_mode::call(const_cast<Tensor&>(*this), other, rounding_mode);
+}
+
+// aten::divide.Scalar_mode(Tensor self, Scalar other, *, str? rounding_mode) -> Tensor
+inline at::Tensor Tensor::divide(const at::Scalar & other, c10::optional<c10::string_view> rounding_mode) const {
+    return at::_ops::divide_Scalar_mode::call(const_cast<Tensor&>(*this), other, rounding_mode);
+}
+
+// aten::divide_.Scalar_mode(Tensor(a!) self, Scalar other, *, str? rounding_mode) -> Tensor(a!)
+inline at::Tensor & Tensor::divide_(const at::Scalar & other, c10::optional<c10::string_view> rounding_mode) const {
+    return at::_ops::divide__Scalar_mode::call(const_cast<Tensor&>(*this), other, rounding_mode);
+}
+
+// aten::true_divide.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::true_divide(const at::Tensor & other) const {
+    return at::_ops::true_divide_Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::true_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::true_divide_(const at::Tensor & other) const {
+    return at::_ops::true_divide__Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::true_divide.Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor Tensor::true_divide(const at::Scalar & other) const {
+    return at::_ops::true_divide_Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::true_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+inline at::Tensor & Tensor::true_divide_(const at::Scalar & other) const {
+    return at::_ops::true_divide__Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::dot(Tensor self, Tensor tensor) -> Tensor
+inline at::Tensor Tensor::dot(const at::Tensor & tensor) const {
+    return at::_ops::dot::call(const_cast<Tensor&>(*this), tensor);
+}
+
+// aten::vdot(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::vdot(const at::Tensor & other) const {
+    return at::_ops::vdot::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::new_empty(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor Tensor::new_empty(at::IntArrayRef size, at::TensorOptions options) const {
+    return at::_ops::new_empty::call(const_cast<Tensor&>(*this), c10::fromIntArrayRefSlow(size), c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+}
+
+// aten::new_empty(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor Tensor::new_empty(at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) const {
+    return at::_ops::new_empty::call(const_cast<Tensor&>(*this), c10::fromIntArrayRefSlow(size), dtype, layout, device, pin_memory);
+}
+
+// aten::new_empty(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor Tensor::new_empty_symint(c10::SymIntArrayRef size, at::TensorOptions options) const {
+    return at::_ops::new_empty::call(const_cast<Tensor&>(*this), size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+}
+
+// aten::new_empty(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor Tensor::new_empty_symint(c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) const {
+    return at::_ops::new_empty::call(const_cast<Tensor&>(*this), size, dtype, layout, device, pin_memory);
+}
+
+// aten::new_empty_strided(Tensor self, SymInt[] size, SymInt[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor Tensor::new_empty_strided(at::IntArrayRef size, at::IntArrayRef stride, at::TensorOptions options) const {
+    return at::_ops::new_empty_strided::call(const_cast<Tensor&>(*this), c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+}
+
+// aten::new_empty_strided(Tensor self, SymInt[] size, SymInt[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor Tensor::new_empty_strided(at::IntArrayRef size, at::IntArrayRef stride, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) const {
+    return at::_ops::new_empty_strided::call(const_cast<Tensor&>(*this), c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), dtype, layout, device, pin_memory);
+}
+
+// aten::new_empty_strided(Tensor self, SymInt[] size, SymInt[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor Tensor::new_empty_strided_symint(c10::SymIntArrayRef size, c10::SymIntArrayRef stride, at::TensorOptions options) const {
+    return at::_ops::new_empty_strided::call(const_cast<Tensor&>(*this), size, stride, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+}
+
+// aten::new_empty_strided(Tensor self, SymInt[] size, SymInt[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor Tensor::new_empty_strided_symint(c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) const {
+    return at::_ops::new_empty_strided::call(const_cast<Tensor&>(*this), size, stride, dtype, layout, device, pin_memory);
+}
+
+// aten::new_full(Tensor self, SymInt[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor Tensor::new_full(at::IntArrayRef size, const at::Scalar & fill_value, at::TensorOptions options) const {
+    return at::_ops::new_full::call(const_cast<Tensor&>(*this), c10::fromIntArrayRefSlow(size), fill_value, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+}
+
+// aten::new_full(Tensor self, SymInt[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor Tensor::new_full(at::IntArrayRef size, const at::Scalar & fill_value, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) const {
+    return at::_ops::new_full::call(const_cast<Tensor&>(*this), c10::fromIntArrayRefSlow(size), fill_value, dtype, layout, device, pin_memory);
+}
+
+// aten::new_full(Tensor self, SymInt[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor Tensor::new_full_symint(c10::SymIntArrayRef size, const at::Scalar & fill_value, at::TensorOptions options) const {
+    return at::_ops::new_full::call(const_cast<Tensor&>(*this), size, fill_value, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+}
+
+// aten::new_full(Tensor self, SymInt[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor Tensor::new_full_symint(c10::SymIntArrayRef size, const at::Scalar & fill_value, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) const {
+    return at::_ops::new_full::call(const_cast<Tensor&>(*this), size, fill_value, dtype, layout, device, pin_memory);
+}
+
+// aten::new_zeros(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor Tensor::new_zeros(at::IntArrayRef size, at::TensorOptions options) const {
+    return at::_ops::new_zeros::call(const_cast<Tensor&>(*this), c10::fromIntArrayRefSlow(size), c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+}
+
+// aten::new_zeros(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor Tensor::new_zeros(at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) const {
+    return at::_ops::new_zeros::call(const_cast<Tensor&>(*this), c10::fromIntArrayRefSlow(size), dtype, layout, device, pin_memory);
+}
+
+// aten::new_zeros(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor Tensor::new_zeros_symint(c10::SymIntArrayRef size, at::TensorOptions options) const {
+    return at::_ops::new_zeros::call(const_cast<Tensor&>(*this), size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+}
+
+// aten::new_zeros(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor Tensor::new_zeros_symint(c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) const {
+    return at::_ops::new_zeros::call(const_cast<Tensor&>(*this), size, dtype, layout, device, pin_memory);
+}
+
+// aten::new_ones(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor Tensor::new_ones(at::IntArrayRef size, at::TensorOptions options) const {
+    return at::_ops::new_ones::call(const_cast<Tensor&>(*this), c10::fromIntArrayRefSlow(size), c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+}
+
+// aten::new_ones(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor Tensor::new_ones(at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) const {
+    return at::_ops::new_ones::call(const_cast<Tensor&>(*this), c10::fromIntArrayRefSlow(size), dtype, layout, device, pin_memory);
+}
+
+// aten::new_ones(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor Tensor::new_ones_symint(c10::SymIntArrayRef size, at::TensorOptions options) const {
+    return at::_ops::new_ones::call(const_cast<Tensor&>(*this), size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+}
+
+// aten::new_ones(Tensor self, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor Tensor::new_ones_symint(c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) const {
+    return at::_ops::new_ones::call(const_cast<Tensor&>(*this), size, dtype, layout, device, pin_memory);
+}
+
+// aten::resize_(Tensor(a!) self, SymInt[] size, *, MemoryFormat? memory_format=None) -> Tensor(a!)
+inline const at::Tensor & Tensor::resize_(at::IntArrayRef size, c10::optional<at::MemoryFormat> memory_format) const {
+    return at::_ops::resize_::call(const_cast<Tensor&>(*this), c10::fromIntArrayRefSlow(size), memory_format);
+}
+
+// aten::resize_(Tensor(a!) self, SymInt[] size, *, MemoryFormat? memory_format=None) -> Tensor(a!)
+inline const at::Tensor & Tensor::resize__symint(c10::SymIntArrayRef size, c10::optional<at::MemoryFormat> memory_format) const {
+    return at::_ops::resize_::call(const_cast<Tensor&>(*this), size, memory_format);
+}
+
+// aten::erf(Tensor self) -> Tensor
+inline at::Tensor Tensor::erf() const {
+    return at::_ops::erf::call(const_cast<Tensor&>(*this));
+}
+
+// aten::erf_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::erf_() const {
+    return at::_ops::erf_::call(const_cast<Tensor&>(*this));
+}
+
+// aten::erfc(Tensor self) -> Tensor
+inline at::Tensor Tensor::erfc() const {
+    return at::_ops::erfc::call(const_cast<Tensor&>(*this));
+}
+
+// aten::erfc_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::erfc_() const {
+    return at::_ops::erfc_::call(const_cast<Tensor&>(*this));
+}
+
+// aten::exp(Tensor self) -> Tensor
+inline at::Tensor Tensor::exp() const {
+    return at::_ops::exp::call(const_cast<Tensor&>(*this));
+}
+
+// aten::exp_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::exp_() const {
+    return at::_ops::exp_::call(const_cast<Tensor&>(*this));
+}
+
+// aten::exp2(Tensor self) -> Tensor
+inline at::Tensor Tensor::exp2() const {
+    return at::_ops::exp2::call(const_cast<Tensor&>(*this));
+}
+
+// aten::exp2_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::exp2_() const {
+    return at::_ops::exp2_::call(const_cast<Tensor&>(*this));
+}
+
+// aten::expm1(Tensor self) -> Tensor
+inline at::Tensor Tensor::expm1() const {
+    return at::_ops::expm1::call(const_cast<Tensor&>(*this));
+}
+
+// aten::expm1_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::expm1_() const {
+    return at::_ops::expm1_::call(const_cast<Tensor&>(*this));
+}
+
+// aten::expand(Tensor(a) self, SymInt[] size, *, bool implicit=False) -> Tensor(a)
+inline at::Tensor Tensor::expand(at::IntArrayRef size, bool implicit) const {
+    return at::_ops::expand::call(const_cast<Tensor&>(*this), c10::fromIntArrayRefSlow(size), implicit);
+}
+
+// aten::expand(Tensor(a) self, SymInt[] size, *, bool implicit=False) -> Tensor(a)
+inline at::Tensor Tensor::expand_symint(c10::SymIntArrayRef size, bool implicit) const {
+    return at::_ops::expand::call(const_cast<Tensor&>(*this), size, implicit);
+}
+
+// aten::expand_as(Tensor(a) self, Tensor other) -> Tensor(a)
+inline at::Tensor Tensor::expand_as(const at::Tensor & other) const {
+    return at::_ops::expand_as::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::flatten.using_ints(Tensor(a) self, int start_dim=0, int end_dim=-1) -> Tensor(a)
+inline at::Tensor Tensor::flatten(int64_t start_dim, int64_t end_dim) const {
+    return at::_ops::flatten_using_ints::call(const_cast<Tensor&>(*this), start_dim, end_dim);
+}
+
+// aten::flatten.named_out_dim(Tensor(a) self, int start_dim, int end_dim, Dimname out_dim) -> Tensor(a)
+inline at::Tensor Tensor::flatten(int64_t start_dim, int64_t end_dim, at::Dimname out_dim) const {
+    return at::_ops::flatten_named_out_dim::call(const_cast<Tensor&>(*this), start_dim, end_dim, out_dim);
+}
+
+// aten::flatten.using_names(Tensor(a) self, Dimname start_dim, Dimname end_dim, Dimname out_dim) -> Tensor(a)
+inline at::Tensor Tensor::flatten(at::Dimname start_dim, at::Dimname end_dim, at::Dimname out_dim) const {
+    return at::_ops::flatten_using_names::call(const_cast<Tensor&>(*this), start_dim, end_dim, out_dim);
+}
+
+// aten::flatten.DimnameList(Tensor(a) self, Dimname[] dims, Dimname out_dim) -> Tensor(a)
+inline at::Tensor Tensor::flatten(at::DimnameList dims, at::Dimname out_dim) const {
+    return at::_ops::flatten_DimnameList::call(const_cast<Tensor&>(*this), dims, out_dim);
+}
+
+// aten::unflatten.int(Tensor(a) self, int dim, SymInt[] sizes) -> Tensor(a)
+inline at::Tensor Tensor::unflatten(int64_t dim, at::IntArrayRef sizes) const {
+    return at::_ops::unflatten_int::call(const_cast<Tensor&>(*this), dim, c10::fromIntArrayRefSlow(sizes));
+}
+
+// aten::unflatten.int(Tensor(a) self, int dim, SymInt[] sizes) -> Tensor(a)
+inline at::Tensor Tensor::unflatten_symint(int64_t dim, c10::SymIntArrayRef sizes) const {
+    return at::_ops::unflatten_int::call(const_cast<Tensor&>(*this), dim, sizes);
+}
+
+// aten::unflatten.Dimname(Tensor(a) self, Dimname dim, SymInt[] sizes, Dimname[] names) -> Tensor(a)
+inline at::Tensor Tensor::unflatten(at::Dimname dim, at::IntArrayRef sizes, at::DimnameList names) const {
+    return at::_ops::unflatten_Dimname::call(const_cast<Tensor&>(*this), dim, c10::fromIntArrayRefSlow(sizes), names);
+}
+
+// aten::unflatten.Dimname(Tensor(a) self, Dimname dim, SymInt[] sizes, Dimname[] names) -> Tensor(a)
+inline at::Tensor Tensor::unflatten_symint(at::Dimname dim, c10::SymIntArrayRef sizes, at::DimnameList names) const {
+    return at::_ops::unflatten_Dimname::call(const_cast<Tensor&>(*this), dim, sizes, names);
+}
+
+// aten::fill_.Scalar(Tensor(a!) self, Scalar value) -> Tensor(a!)
+inline at::Tensor & Tensor::fill_(const at::Scalar & value) const {
+    return at::_ops::fill__Scalar::call(const_cast<Tensor&>(*this), value);
+}
+
+// aten::fill_.Tensor(Tensor(a!) self, Tensor value) -> Tensor(a!)
+inline at::Tensor & Tensor::fill_(const at::Tensor & value) const {
+    return at::_ops::fill__Tensor::call(const_cast<Tensor&>(*this), value);
+}
+
+// aten::floor(Tensor self) -> Tensor
+inline at::Tensor Tensor::floor() const {
+    return at::_ops::floor::call(const_cast<Tensor&>(*this));
+}
+
+// aten::floor_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::floor_() const {
+    return at::_ops::floor_::call(const_cast<Tensor&>(*this));
+}
+
+// aten::floor_divide(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::floor_divide(const at::Tensor & other) const {
+    return at::_ops::floor_divide::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::floor_divide_(const at::Tensor & other) const {
+    return at::_ops::floor_divide__Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::floor_divide.Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor Tensor::floor_divide(const at::Scalar & other) const {
+    return at::_ops::floor_divide_Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::floor_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+inline at::Tensor & Tensor::floor_divide_(const at::Scalar & other) const {
+    return at::_ops::floor_divide__Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::frac(Tensor self) -> Tensor
+inline at::Tensor Tensor::frac() const {
+    return at::_ops::frac::call(const_cast<Tensor&>(*this));
+}
+
+// aten::frac_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::frac_() const {
+    return at::_ops::frac_::call(const_cast<Tensor&>(*this));
+}
+
+// aten::gcd(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::gcd(const at::Tensor & other) const {
+    return at::_ops::gcd::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::gcd_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::gcd_(const at::Tensor & other) const {
+    return at::_ops::gcd_::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::lcm(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::lcm(const at::Tensor & other) const {
+    return at::_ops::lcm::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::lcm_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::lcm_(const at::Tensor & other) const {
+    return at::_ops::lcm_::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::index.Tensor(Tensor self, Tensor?[] indices) -> Tensor
+inline at::Tensor Tensor::index(const c10::List<c10::optional<at::Tensor>> & indices) const {
+    return at::_ops::index_Tensor::call(const_cast<Tensor&>(*this), indices);
+}
+
+// aten::index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!)
+inline at::Tensor & Tensor::index_copy_(int64_t dim, const at::Tensor & index, const at::Tensor & source) const {
+    return at::_ops::index_copy_::call(const_cast<Tensor&>(*this), dim, index, source);
+}
+
+// aten::index_copy(Tensor self, int dim, Tensor index, Tensor source) -> Tensor
+inline at::Tensor Tensor::index_copy(int64_t dim, const at::Tensor & index, const at::Tensor & source) const {
+    return at::_ops::index_copy::call(const_cast<Tensor&>(*this), dim, index, source);
+}
+
+// aten::index_copy_.dimname(Tensor(a!) self, Dimname dim, Tensor index, Tensor source) -> Tensor(a!)
+inline at::Tensor & Tensor::index_copy_(at::Dimname dim, const at::Tensor & index, const at::Tensor & source) const {
+    return at::_ops::index_copy__dimname::call(const_cast<Tensor&>(*this), dim, index, source);
+}
+
+// aten::index_copy.dimname(Tensor self, Dimname dim, Tensor index, Tensor source) -> Tensor
+inline at::Tensor Tensor::index_copy(at::Dimname dim, const at::Tensor & index, const at::Tensor & source) const {
+    return at::_ops::index_copy_dimname::call(const_cast<Tensor&>(*this), dim, index, source);
+}
+
+// aten::index_put_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor(a!)
+inline at::Tensor & Tensor::index_put_(const c10::List<c10::optional<at::Tensor>> & indices, const at::Tensor & values, bool accumulate) const {
+    return at::_ops::index_put_::call(const_cast<Tensor&>(*this), indices, values, accumulate);
+}
+
+// aten::index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor
+inline at::Tensor Tensor::index_put(const c10::List<c10::optional<at::Tensor>> & indices, const at::Tensor & values, bool accumulate) const {
+    return at::_ops::index_put::call(const_cast<Tensor&>(*this), indices, values, accumulate);
+}
+
+// aten::isclose(Tensor self, Tensor other, float rtol=1e-05, float atol=1e-08, bool equal_nan=False) -> Tensor
+inline at::Tensor Tensor::isclose(const at::Tensor & other, double rtol, double atol, bool equal_nan) const {
+    return at::_ops::isclose::call(const_cast<Tensor&>(*this), other, rtol, atol, equal_nan);
+}
+
+// aten::isnan(Tensor self) -> Tensor
+inline at::Tensor Tensor::isnan() const {
+    return at::_ops::isnan::call(const_cast<Tensor&>(*this));
+}
+
+// aten::is_distributed(Tensor self) -> bool
+inline bool Tensor::is_distributed() const {
+    return at::_ops::is_distributed::call(const_cast<Tensor&>(*this));
+}
+
+// aten::is_floating_point(Tensor self) -> bool
+inline bool Tensor::__dispatch_is_floating_point() const {
+    return at::_ops::is_floating_point::call(const_cast<Tensor&>(*this));
+}
+
+// aten::is_complex(Tensor self) -> bool
+inline bool Tensor::__dispatch_is_complex() const {
+    return at::_ops::is_complex::call(const_cast<Tensor&>(*this));
+}
+
+// aten::is_conj(Tensor self) -> bool
+inline bool Tensor::__dispatch_is_conj() const {
+    return at::_ops::is_conj::call(const_cast<Tensor&>(*this));
+}
+
+// aten::_is_zerotensor(Tensor self) -> bool
+inline bool Tensor::__dispatch__is_zerotensor() const {
+    return at::_ops::_is_zerotensor::call(const_cast<Tensor&>(*this));
+}
+
+// aten::is_neg(Tensor self) -> bool
+inline bool Tensor::__dispatch_is_neg() const {
+    return at::_ops::is_neg::call(const_cast<Tensor&>(*this));
+}
+
+// aten::isreal(Tensor self) -> Tensor
+inline at::Tensor Tensor::isreal() const {
+    return at::_ops::isreal::call(const_cast<Tensor&>(*this));
+}
+
+// aten::is_nonzero(Tensor self) -> bool
+inline bool Tensor::is_nonzero() const {
+    return at::_ops::is_nonzero::call(const_cast<Tensor&>(*this));
+}
+
+// aten::is_same_size(Tensor self, Tensor other) -> bool
+inline bool Tensor::is_same_size(const at::Tensor & other) const {
+    return at::_ops::is_same_size::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::is_signed(Tensor self) -> bool
+inline bool Tensor::__dispatch_is_signed() const {
+    return at::_ops::is_signed::call(const_cast<Tensor&>(*this));
+}
+
+// aten::is_inference(Tensor self) -> bool
+inline bool Tensor::__dispatch_is_inference() const {
+    return at::_ops::is_inference::call(const_cast<Tensor&>(*this));
+}
+
+// aten::kron(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::kron(const at::Tensor & other) const {
+    return at::_ops::kron::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::kthvalue(Tensor self, int k, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)
+inline ::std::tuple<at::Tensor,at::Tensor> Tensor::kthvalue(int64_t k, int64_t dim, bool keepdim) const {
+    return at::_ops::kthvalue::call(const_cast<Tensor&>(*this), k, dim, keepdim);
+}
+
+// aten::kthvalue.dimname(Tensor self, int k, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
+inline ::std::tuple<at::Tensor,at::Tensor> Tensor::kthvalue(int64_t k, at::Dimname dim, bool keepdim) const {
+    return at::_ops::kthvalue_dimname::call(const_cast<Tensor&>(*this), k, dim, keepdim);
+}
+
+// aten::nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor
+inline at::Tensor Tensor::nan_to_num(c10::optional<double> nan, c10::optional<double> posinf, c10::optional<double> neginf) const {
+    return at::_ops::nan_to_num::call(const_cast<Tensor&>(*this), nan, posinf, neginf);
+}
+
+// aten::nan_to_num_(Tensor(a!) self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor(a!)
+inline at::Tensor & Tensor::nan_to_num_(c10::optional<double> nan, c10::optional<double> posinf, c10::optional<double> neginf) const {
+    return at::_ops::nan_to_num_::call(const_cast<Tensor&>(*this), nan, posinf, neginf);
+}
+
+// aten::ldexp.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::ldexp(const at::Tensor & other) const {
+    return at::_ops::ldexp_Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::ldexp_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::ldexp_(const at::Tensor & other) const {
+    return at::_ops::ldexp_::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::log(Tensor self) -> Tensor
+inline at::Tensor Tensor::log() const {
+    return at::_ops::log::call(const_cast<Tensor&>(*this));
+}
+
+// aten::log_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::log_() const {
+    return at::_ops::log_::call(const_cast<Tensor&>(*this));
+}
+
+// aten::log10(Tensor self) -> Tensor
+inline at::Tensor Tensor::log10() const {
+    return at::_ops::log10::call(const_cast<Tensor&>(*this));
+}
+
+// aten::log10_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::log10_() const {
+    return at::_ops::log10_::call(const_cast<Tensor&>(*this));
+}
+
+// aten::log1p(Tensor self) -> Tensor
+inline at::Tensor Tensor::log1p() const {
+    return at::_ops::log1p::call(const_cast<Tensor&>(*this));
+}
+
+// aten::log1p_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::log1p_() const {
+    return at::_ops::log1p_::call(const_cast<Tensor&>(*this));
+}
+
+// aten::log2(Tensor self) -> Tensor
+inline at::Tensor Tensor::log2() const {
+    return at::_ops::log2::call(const_cast<Tensor&>(*this));
+}
+
+// aten::log2_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::log2_() const {
+    return at::_ops::log2_::call(const_cast<Tensor&>(*this));
+}
+
+// aten::logaddexp(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::logaddexp(const at::Tensor & other) const {
+    return at::_ops::logaddexp::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::logaddexp2(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::logaddexp2(const at::Tensor & other) const {
+    return at::_ops::logaddexp2::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::xlogy.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::xlogy(const at::Tensor & other) const {
+    return at::_ops::xlogy_Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::xlogy.Scalar_Other(Tensor self, Scalar other) -> Tensor
+inline at::Tensor Tensor::xlogy(const at::Scalar & other) const {
+    return at::_ops::xlogy_Scalar_Other::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::xlogy_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::xlogy_(const at::Tensor & other) const {
+    return at::_ops::xlogy__Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::xlogy_.Scalar_Other(Tensor(a!) self, Scalar other) -> Tensor(a!)
+inline at::Tensor & Tensor::xlogy_(const at::Scalar & other) const {
+    return at::_ops::xlogy__Scalar_Other::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
+inline at::Tensor Tensor::log_softmax(int64_t dim, c10::optional<at::ScalarType> dtype) const {
+    return at::_ops::log_softmax_int::call(const_cast<Tensor&>(*this), dim, dtype);
+}
+
+// aten::log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
+inline at::Tensor Tensor::log_softmax(at::Dimname dim, c10::optional<at::ScalarType> dtype) const {
+    return at::_ops::log_softmax_Dimname::call(const_cast<Tensor&>(*this), dim, dtype);
+}
+
+// aten::logcumsumexp(Tensor self, int dim) -> Tensor
+inline at::Tensor Tensor::logcumsumexp(int64_t dim) const {
+    return at::_ops::logcumsumexp::call(const_cast<Tensor&>(*this), dim);
+}
+
+// aten::logcumsumexp.dimname(Tensor self, Dimname dim) -> Tensor
+inline at::Tensor Tensor::logcumsumexp(at::Dimname dim) const {
+    return at::_ops::logcumsumexp_dimname::call(const_cast<Tensor&>(*this), dim);
+}
+
+// aten::logsumexp(Tensor self, int[1] dim, bool keepdim=False) -> Tensor
+inline at::Tensor Tensor::logsumexp(at::IntArrayRef dim, bool keepdim) const {
+    return at::_ops::logsumexp::call(const_cast<Tensor&>(*this), dim, keepdim);
+}
+
+// aten::logsumexp.names(Tensor self, Dimname[1] dim, bool keepdim=False) -> Tensor
+inline at::Tensor Tensor::logsumexp(at::DimnameList dim, bool keepdim) const {
+    return at::_ops::logsumexp_names::call(const_cast<Tensor&>(*this), dim, keepdim);
+}
+
+// aten::matmul(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::matmul(const at::Tensor & other) const {
+    return at::_ops::matmul::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::matrix_power(Tensor self, int n) -> Tensor
+inline at::Tensor Tensor::matrix_power(int64_t n) const {
+    return at::_ops::matrix_power::call(const_cast<Tensor&>(*this), n);
+}
+
+// aten::matrix_exp(Tensor self) -> Tensor
+inline at::Tensor Tensor::matrix_exp() const {
+    return at::_ops::matrix_exp::call(const_cast<Tensor&>(*this));
+}
+
+// aten::aminmax(Tensor self, *, int? dim=None, bool keepdim=False) -> (Tensor min, Tensor max)
+inline ::std::tuple<at::Tensor,at::Tensor> Tensor::aminmax(c10::optional<int64_t> dim, bool keepdim) const {
+    return at::_ops::aminmax::call(const_cast<Tensor&>(*this), dim, keepdim);
+}
+
+// aten::max.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)
+inline ::std::tuple<at::Tensor,at::Tensor> Tensor::max(int64_t dim, bool keepdim) const {
+    return at::_ops::max_dim::call(const_cast<Tensor&>(*this), dim, keepdim);
+}
+
+// aten::max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
+inline ::std::tuple<at::Tensor,at::Tensor> Tensor::max(at::Dimname dim, bool keepdim) const {
+    return at::_ops::max_names_dim::call(const_cast<Tensor&>(*this), dim, keepdim);
+}
+
+// aten::amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor
+inline at::Tensor Tensor::amax(at::IntArrayRef dim, bool keepdim) const {
+    return at::_ops::amax::call(const_cast<Tensor&>(*this), dim, keepdim);
+}
+
+// aten::mean(Tensor self, *, ScalarType? dtype=None) -> Tensor
+inline at::Tensor Tensor::mean(c10::optional<at::ScalarType> dtype) const {
+    return at::_ops::mean::call(const_cast<Tensor&>(*this), dtype);
+}
+
+// aten::mean.dim(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+inline at::Tensor Tensor::mean(at::OptionalIntArrayRef dim, bool keepdim, c10::optional<at::ScalarType> dtype) const {
+    return at::_ops::mean_dim::call(const_cast<Tensor&>(*this), dim, keepdim, dtype);
+}
+
+// aten::mean.names_dim(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+inline at::Tensor Tensor::mean(at::DimnameList dim, bool keepdim, c10::optional<at::ScalarType> dtype) const {
+    return at::_ops::mean_names_dim::call(const_cast<Tensor&>(*this), dim, keepdim, dtype);
+}
+
+// aten::nanmean(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+inline at::Tensor Tensor::nanmean(at::OptionalIntArrayRef dim, bool keepdim, c10::optional<at::ScalarType> dtype) const {
+    return at::_ops::nanmean::call(const_cast<Tensor&>(*this), dim, keepdim, dtype);
+}
+
+// aten::median(Tensor self) -> Tensor
+inline at::Tensor Tensor::median() const {
+    return at::_ops::median::call(const_cast<Tensor&>(*this));
+}
+
+// aten::median.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)
+inline ::std::tuple<at::Tensor,at::Tensor> Tensor::median(int64_t dim, bool keepdim) const {
+    return at::_ops::median_dim::call(const_cast<Tensor&>(*this), dim, keepdim);
+}
+
+// aten::median.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
+inline ::std::tuple<at::Tensor,at::Tensor> Tensor::median(at::Dimname dim, bool keepdim) const {
+    return at::_ops::median_names_dim::call(const_cast<Tensor&>(*this), dim, keepdim);
+}
+
+// aten::nanmedian(Tensor self) -> Tensor
+inline at::Tensor Tensor::nanmedian() const {
+    return at::_ops::nanmedian::call(const_cast<Tensor&>(*this));
+}
+
+// aten::nanmedian.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)
+inline ::std::tuple<at::Tensor,at::Tensor> Tensor::nanmedian(int64_t dim, bool keepdim) const {
+    return at::_ops::nanmedian_dim::call(const_cast<Tensor&>(*this), dim, keepdim);
+}
+
+// aten::nanmedian.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
+inline ::std::tuple<at::Tensor,at::Tensor> Tensor::nanmedian(at::Dimname dim, bool keepdim) const {
+    return at::_ops::nanmedian_names_dim::call(const_cast<Tensor&>(*this), dim, keepdim);
+}
+
+// aten::min.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)
+inline ::std::tuple<at::Tensor,at::Tensor> Tensor::min(int64_t dim, bool keepdim) const {
+    return at::_ops::min_dim::call(const_cast<Tensor&>(*this), dim, keepdim);
+}
+
+// aten::min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
+inline ::std::tuple<at::Tensor,at::Tensor> Tensor::min(at::Dimname dim, bool keepdim) const {
+    return at::_ops::min_names_dim::call(const_cast<Tensor&>(*this), dim, keepdim);
+}
+
+// aten::amin(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor
+inline at::Tensor Tensor::amin(at::IntArrayRef dim, bool keepdim) const {
+    return at::_ops::amin::call(const_cast<Tensor&>(*this), dim, keepdim);
+}
+
+// aten::mm(Tensor self, Tensor mat2) -> Tensor
+inline at::Tensor Tensor::mm(const at::Tensor & mat2) const {
+    return at::_ops::mm::call(const_cast<Tensor&>(*this), mat2);
+}
+
+// aten::mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)
+inline ::std::tuple<at::Tensor,at::Tensor> Tensor::mode(int64_t dim, bool keepdim) const {
+    return at::_ops::mode::call(const_cast<Tensor&>(*this), dim, keepdim);
+}
+
+// aten::mode.dimname(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
+inline ::std::tuple<at::Tensor,at::Tensor> Tensor::mode(at::Dimname dim, bool keepdim) const {
+    return at::_ops::mode_dimname::call(const_cast<Tensor&>(*this), dim, keepdim);
+}
+
+// aten::mul.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::mul(const at::Tensor & other) const {
+    return at::_ops::mul_Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::mul_(const at::Tensor & other) const {
+    return at::_ops::mul__Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::mul.Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor Tensor::mul(const at::Scalar & other) const {
+    return at::_ops::mul_Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+inline at::Tensor & Tensor::mul_(const at::Scalar & other) const {
+    return at::_ops::mul__Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::multiply.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::multiply(const at::Tensor & other) const {
+    return at::_ops::multiply_Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::multiply_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::multiply_(const at::Tensor & other) const {
+    return at::_ops::multiply__Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::multiply.Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor Tensor::multiply(const at::Scalar & other) const {
+    return at::_ops::multiply_Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::multiply_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+inline at::Tensor & Tensor::multiply_(const at::Scalar & other) const {
+    return at::_ops::multiply__Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::mv(Tensor self, Tensor vec) -> Tensor
+inline at::Tensor Tensor::mv(const at::Tensor & vec) const {
+    return at::_ops::mv::call(const_cast<Tensor&>(*this), vec);
+}
+
+// aten::mvlgamma(Tensor self, int p) -> Tensor
+inline at::Tensor Tensor::mvlgamma(int64_t p) const {
+    return at::_ops::mvlgamma::call(const_cast<Tensor&>(*this), p);
+}
+
+// aten::mvlgamma_(Tensor(a!) self, int p) -> Tensor(a!)
+inline at::Tensor & Tensor::mvlgamma_(int64_t p) const {
+    return at::_ops::mvlgamma_::call(const_cast<Tensor&>(*this), p);
+}
+
+// aten::narrow_copy(Tensor self, int dim, SymInt start, SymInt length) -> Tensor
+inline at::Tensor Tensor::narrow_copy(int64_t dim, int64_t start, int64_t length) const {
+    return at::_ops::narrow_copy::call(const_cast<Tensor&>(*this), dim, start, length);
+}
+
+// aten::narrow_copy(Tensor self, int dim, SymInt start, SymInt length) -> Tensor
+inline at::Tensor Tensor::narrow_copy_symint(int64_t dim, c10::SymInt start, c10::SymInt length) const {
+    return at::_ops::narrow_copy::call(const_cast<Tensor&>(*this), dim, start, length);
+}
+
+// aten::narrow(Tensor(a) self, int dim, SymInt start, SymInt length) -> Tensor(a)
+inline at::Tensor Tensor::narrow(int64_t dim, int64_t start, int64_t length) const {
+    return at::_ops::narrow::call(const_cast<Tensor&>(*this), dim, start, length);
+}
+
+// aten::narrow(Tensor(a) self, int dim, SymInt start, SymInt length) -> Tensor(a)
+inline at::Tensor Tensor::narrow_symint(int64_t dim, c10::SymInt start, c10::SymInt length) const {
+    return at::_ops::narrow::call(const_cast<Tensor&>(*this), dim, start, length);
+}
+
+// aten::narrow.Tensor(Tensor(a) self, int dim, Tensor start, SymInt length) -> Tensor(a)
+inline at::Tensor Tensor::narrow(int64_t dim, const at::Tensor & start, int64_t length) const {
+    return at::_ops::narrow_Tensor::call(const_cast<Tensor&>(*this), dim, start, length);
+}
+
+// aten::narrow.Tensor(Tensor(a) self, int dim, Tensor start, SymInt length) -> Tensor(a)
+inline at::Tensor Tensor::narrow_symint(int64_t dim, const at::Tensor & start, c10::SymInt length) const {
+    return at::_ops::narrow_Tensor::call(const_cast<Tensor&>(*this), dim, start, length);
+}
+
+// aten::permute(Tensor(a) self, int[] dims) -> Tensor(a)
+inline at::Tensor Tensor::permute(at::IntArrayRef dims) const {
+    return at::_ops::permute::call(const_cast<Tensor&>(*this), dims);
+}
+
+// aten::movedim.intlist(Tensor(a) self, int[] source, int[] destination) -> Tensor(a)
+inline at::Tensor Tensor::movedim(at::IntArrayRef source, at::IntArrayRef destination) const {
+    return at::_ops::movedim_intlist::call(const_cast<Tensor&>(*this), source, destination);
+}
+
+// aten::movedim.int(Tensor(a) self, int source, int destination) -> Tensor(a)
+inline at::Tensor Tensor::movedim(int64_t source, int64_t destination) const {
+    return at::_ops::movedim_int::call(const_cast<Tensor&>(*this), source, destination);
+}
+
+// aten::moveaxis.intlist(Tensor(a) self, int[] source, int[] destination) -> Tensor(a)
+inline at::Tensor Tensor::moveaxis(at::IntArrayRef source, at::IntArrayRef destination) const {
+    return at::_ops::moveaxis_intlist::call(const_cast<Tensor&>(*this), source, destination);
+}
+
+// aten::moveaxis.int(Tensor(a) self, int source, int destination) -> Tensor(a)
+inline at::Tensor Tensor::moveaxis(int64_t source, int64_t destination) const {
+    return at::_ops::moveaxis_int::call(const_cast<Tensor&>(*this), source, destination);
+}
+
+// aten::numpy_T(Tensor(a) self) -> Tensor(a)
+inline at::Tensor Tensor::numpy_T() const {
+    return at::_ops::numpy_T::call(const_cast<Tensor&>(*this));
+}
+
+// aten::matrix_H(Tensor(a) self) -> Tensor(a)
+inline at::Tensor Tensor::matrix_H() const {
+    return at::_ops::matrix_H::call(const_cast<Tensor&>(*this));
+}
+
+// aten::mT(Tensor(a) self) -> Tensor(a)
+inline at::Tensor Tensor::mT() const {
+    return at::_ops::mT::call(const_cast<Tensor&>(*this));
+}
+
+// aten::mH(Tensor(a) self) -> Tensor(a)
+inline at::Tensor Tensor::mH() const {
+    return at::_ops::mH::call(const_cast<Tensor&>(*this));
+}
+
+// aten::adjoint(Tensor(a) self) -> Tensor(a)
+inline at::Tensor Tensor::adjoint() const {
+    return at::_ops::adjoint::call(const_cast<Tensor&>(*this));
+}
+
+// aten::is_pinned(Tensor self, Device? device=None) -> bool
+inline bool Tensor::is_pinned(c10::optional<at::Device> device) const {
+    return at::_ops::is_pinned::call(const_cast<Tensor&>(*this), device);
+}
+
+// aten::pin_memory(Tensor(a) self, Device? device=None) -> Tensor(a)
+inline at::Tensor Tensor::pin_memory(c10::optional<at::Device> device) const {
+    return at::_ops::pin_memory::call(const_cast<Tensor&>(*this), device);
+}
+
+// aten::pinverse(Tensor self, float rcond=1e-15) -> Tensor
+inline at::Tensor Tensor::pinverse(double rcond) const {
+    return at::_ops::pinverse::call(const_cast<Tensor&>(*this), rcond);
+}
+
+// aten::rad2deg(Tensor self) -> Tensor
+inline at::Tensor Tensor::rad2deg() const {
+    return at::_ops::rad2deg::call(const_cast(*this));
+}
+
+// aten::rad2deg_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::rad2deg_() const {
+    return at::_ops::rad2deg_::call(const_cast(*this));
+}
+
+// aten::deg2rad(Tensor self) -> Tensor
+inline at::Tensor Tensor::deg2rad() const {
+    return at::_ops::deg2rad::call(const_cast(*this));
+}
+
+// aten::deg2rad_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::deg2rad_() const {
+    return at::_ops::deg2rad_::call(const_cast(*this));
+}
+
+// aten::ravel(Tensor(a) self) -> Tensor(a)
+inline at::Tensor Tensor::ravel() const {
+    return at::_ops::ravel::call(const_cast(*this));
+}
+
+// aten::reciprocal(Tensor self) -> Tensor
+inline at::Tensor Tensor::reciprocal() const {
+    return at::_ops::reciprocal::call(const_cast(*this));
+}
+
+// aten::reciprocal_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::reciprocal_() const {
+    return at::_ops::reciprocal_::call(const_cast(*this));
+}
+
+// aten::neg(Tensor self) -> Tensor
+inline at::Tensor Tensor::neg() const {
+    return at::_ops::neg::call(const_cast(*this));
+}
+
+// aten::neg_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::neg_() const {
+    return at::_ops::neg_::call(const_cast(*this));
+}
+
+// aten::negative(Tensor self) -> Tensor
+inline at::Tensor Tensor::negative() const {
+    return at::_ops::negative::call(const_cast(*this));
+}
+
+// aten::negative_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::negative_() const {
+    return at::_ops::negative_::call(const_cast(*this));
+}
+
+// aten::repeat(Tensor self, SymInt[] repeats) -> Tensor
+inline at::Tensor Tensor::repeat(at::IntArrayRef repeats) const {
+    return at::_ops::repeat::call(const_cast(*this), c10::fromIntArrayRefSlow(repeats));
+}
+
+// aten::repeat(Tensor self, SymInt[] repeats) -> Tensor
+inline at::Tensor Tensor::repeat_symint(c10::SymIntArrayRef repeats) const {
+    return at::_ops::repeat::call(const_cast(*this), repeats);
+}
+
+// aten::repeat_interleave.self_Tensor(Tensor self, Tensor repeats, int? dim=None, *, SymInt? output_size=None) -> Tensor
+inline at::Tensor Tensor::repeat_interleave(const at::Tensor & repeats, c10::optional<int64_t> dim, c10::optional<int64_t> output_size) const {
+    return at::_ops::repeat_interleave_self_Tensor::call(const_cast<Tensor&>(*this), repeats, dim, output_size.has_value() ? c10::make_optional(c10::SymInt(*output_size)) : c10::nullopt);
+}
+
+// aten::repeat_interleave.self_Tensor(Tensor self, Tensor repeats, int? dim=None, *, SymInt? output_size=None) -> Tensor
+inline at::Tensor Tensor::repeat_interleave_symint(const at::Tensor & repeats, c10::optional<int64_t> dim, c10::optional<c10::SymInt> output_size) const {
+    return at::_ops::repeat_interleave_self_Tensor::call(const_cast<Tensor&>(*this), repeats, dim, output_size);
+}
+
+// aten::repeat_interleave.self_int(Tensor self, SymInt repeats, int? dim=None, *, SymInt? output_size=None) -> Tensor
+inline at::Tensor Tensor::repeat_interleave(int64_t repeats, c10::optional<int64_t> dim, c10::optional<int64_t> output_size) const {
+    return at::_ops::repeat_interleave_self_int::call(const_cast<Tensor&>(*this), repeats, dim, output_size.has_value() ? c10::make_optional(c10::SymInt(*output_size)) : c10::nullopt);
+}
+
+// aten::repeat_interleave.self_int(Tensor self, SymInt repeats, int? dim=None, *, SymInt? output_size=None) -> Tensor
+inline at::Tensor Tensor::repeat_interleave_symint(c10::SymInt repeats, c10::optional<int64_t> dim, c10::optional<c10::SymInt> output_size) const {
+    return at::_ops::repeat_interleave_self_int::call(const_cast<Tensor&>(*this), repeats, dim, output_size);
+}
+
+// aten::reshape(Tensor(a) self, SymInt[] shape) -> Tensor(a)
+inline at::Tensor Tensor::reshape(at::IntArrayRef shape) const {
+    return at::_ops::reshape::call(const_cast(*this), c10::fromIntArrayRefSlow(shape));
+}
+
+// aten::reshape(Tensor(a) self, SymInt[] shape) -> Tensor(a)
+inline at::Tensor Tensor::reshape_symint(c10::SymIntArrayRef shape) const {
+    return at::_ops::reshape::call(const_cast(*this), shape);
+}
+
+// aten::_reshape_alias(Tensor(a) self, SymInt[] size, SymInt[] stride) -> Tensor(a)
+inline at::Tensor Tensor::_reshape_alias(at::IntArrayRef size, at::IntArrayRef stride) const {
+    return at::_ops::_reshape_alias::call(const_cast(*this), c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride));
+}
+
+// aten::_reshape_alias(Tensor(a) self, SymInt[] size, SymInt[] stride) -> Tensor(a)
+inline at::Tensor Tensor::_reshape_alias_symint(c10::SymIntArrayRef size, c10::SymIntArrayRef stride) const {
+    return at::_ops::_reshape_alias::call(const_cast(*this), size, stride);
+}
+
+// aten::reshape_as(Tensor(a) self, Tensor other) -> Tensor(a)
+inline at::Tensor Tensor::reshape_as(const at::Tensor & other) const {
+    return at::_ops::reshape_as::call(const_cast(*this), other);
+}
+
+// aten::round(Tensor self) -> Tensor
+inline at::Tensor Tensor::round() const {
+    return at::_ops::round::call(const_cast(*this));
+}
+
+// aten::round_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::round_() const {
+    return at::_ops::round_::call(const_cast(*this));
+}
+
+// aten::round.decimals(Tensor self, *, int decimals) -> Tensor
+inline at::Tensor Tensor::round(int64_t decimals) const {
+    return at::_ops::round_decimals::call(const_cast(*this), decimals);
+}
+
+// aten::round_.decimals(Tensor(a!) self, *, int decimals) -> Tensor(a!)
+inline at::Tensor & Tensor::round_(int64_t decimals) const {
+    return at::_ops::round__decimals::call(const_cast(*this), decimals);
+}
+
+// aten::relu(Tensor self) -> Tensor
+inline at::Tensor Tensor::relu() const {
+    return at::_ops::relu::call(const_cast(*this));
+}
+
+// aten::relu_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::relu_() const {
+    return at::_ops::relu_::call(const_cast(*this));
+}
+
+// aten::prelu(Tensor self, Tensor weight) -> Tensor
+inline at::Tensor Tensor::prelu(const at::Tensor & weight) const {
+    return at::_ops::prelu::call(const_cast(*this), weight);
+}
+
+// aten::hardshrink(Tensor self, Scalar lambd=0.5) -> Tensor
+inline at::Tensor Tensor::hardshrink(const at::Scalar & lambd) const {
+    return at::_ops::hardshrink::call(const_cast(*this), lambd);
+}
+
+// aten::hardshrink_backward(Tensor grad_out, Tensor self, Scalar lambd) -> Tensor
+inline at::Tensor Tensor::hardshrink_backward(const at::Tensor & grad_out, const at::Scalar & lambd) const {
+    return at::_ops::hardshrink_backward::call(grad_out, const_cast(*this), lambd);
+}
+
+// aten::rsqrt(Tensor self) -> Tensor
+inline at::Tensor Tensor::rsqrt() const {
+    return at::_ops::rsqrt::call(const_cast(*this));
+}
+
+// aten::rsqrt_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::rsqrt_() const {
+    return at::_ops::rsqrt_::call(const_cast(*this));
+}
+
+// aten::select.Dimname(Tensor(a) self, Dimname dim, int index) -> Tensor(a)
+inline at::Tensor Tensor::select(at::Dimname dim, int64_t index) const {
+    return at::_ops::select_Dimname::call(const_cast(*this), dim, index);
+}
+
+// aten::select.int(Tensor(a) self, int dim, SymInt index) -> Tensor(a)
+inline at::Tensor Tensor::select(int64_t dim, int64_t index) const {
+    return at::_ops::select_int::call(const_cast(*this), dim, index);
+}
+
+// aten::select.int(Tensor(a) self, int dim, SymInt index) -> Tensor(a)
+inline at::Tensor Tensor::select_symint(int64_t dim, c10::SymInt index) const {
+    return at::_ops::select_int::call(const_cast(*this), dim, index);
+}
+
+// aten::sigmoid(Tensor self) -> Tensor
+inline at::Tensor Tensor::sigmoid() const {
+    return at::_ops::sigmoid::call(const_cast(*this));
+}
+
+// aten::sigmoid_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::sigmoid_() const {
+    return at::_ops::sigmoid_::call(const_cast(*this));
+}
+
+// aten::logit(Tensor self, float? eps=None) -> Tensor
+inline at::Tensor Tensor::logit(c10::optional<double> eps) const {
+    return at::_ops::logit::call(const_cast<Tensor&>(*this), eps);
+}
+
+// aten::logit_(Tensor(a!) self, float? eps=None) -> Tensor(a!)
+inline at::Tensor & Tensor::logit_(c10::optional<double> eps) const {
+    return at::_ops::logit_::call(const_cast<Tensor&>(*this), eps);
+}
+
+// aten::sin(Tensor self) -> Tensor
+inline at::Tensor Tensor::sin() const {
+    return at::_ops::sin::call(const_cast(*this));
+}
+
+// aten::sin_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::sin_() const {
+    return at::_ops::sin_::call(const_cast(*this));
+}
+
+// aten::sinc(Tensor self) -> Tensor
+inline at::Tensor Tensor::sinc() const {
+    return at::_ops::sinc::call(const_cast(*this));
+}
+
+// aten::sinc_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::sinc_() const {
+    return at::_ops::sinc_::call(const_cast(*this));
+}
+
+// aten::sinh(Tensor self) -> Tensor
+inline at::Tensor Tensor::sinh() const {
+    return at::_ops::sinh::call(const_cast(*this));
+}
+
+// aten::sinh_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::sinh_() const {
+    return at::_ops::sinh_::call(const_cast(*this));
+}
+
+// aten::detach(Tensor(a) self) -> Tensor(a)
+inline at::Tensor Tensor::detach() const {
+    return at::_ops::detach::call(const_cast(*this));
+}
+
+// aten::detach_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::detach_() const {
+    return at::_ops::detach_::call(const_cast(*this));
+}
+
+// aten::size.Dimname(Tensor self, Dimname dim) -> int
+inline int64_t Tensor::size(at::Dimname dim) const {
+    return at::_ops::size_Dimname::call(const_cast(*this), dim);
+}
+
+// aten::slice.Tensor(Tensor(a) self, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a)
+inline at::Tensor Tensor::slice(int64_t dim, c10::optional<int64_t> start, c10::optional<int64_t> end, int64_t step) const {
+    return at::_ops::slice_Tensor::call(const_cast<Tensor&>(*this), dim, start.has_value() ? c10::make_optional(c10::SymInt(*start)) : c10::nullopt, end.has_value() ? c10::make_optional(c10::SymInt(*end)) : c10::nullopt, step);
+}
+
+// aten::slice.Tensor(Tensor(a) self, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a)
+inline at::Tensor Tensor::slice_symint(int64_t dim, c10::optional<c10::SymInt> start, c10::optional<c10::SymInt> end, c10::SymInt step) const {
+    return at::_ops::slice_Tensor::call(const_cast<Tensor&>(*this), dim, start, end, step);
+}
+
+// aten::slice_inverse(Tensor(a) self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a)
+inline at::Tensor Tensor::slice_inverse(const at::Tensor & src, int64_t dim, c10::optional<int64_t> start, c10::optional<int64_t> end, int64_t step) const {
+    return at::_ops::slice_inverse::call(const_cast<Tensor&>(*this), src, dim, start.has_value() ? c10::make_optional(c10::SymInt(*start)) : c10::nullopt, end.has_value() ? c10::make_optional(c10::SymInt(*end)) : c10::nullopt, step);
+}
+
+// aten::slice_inverse(Tensor(a) self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a)
+inline at::Tensor Tensor::slice_inverse_symint(const at::Tensor & src, int64_t dim, c10::optional<c10::SymInt> start, c10::optional<c10::SymInt> end, c10::SymInt step) const {
+    return at::_ops::slice_inverse::call(const_cast<Tensor&>(*this), src, dim, start, end, step);
+}
+
+// aten::slice_scatter(Tensor self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor
+inline at::Tensor Tensor::slice_scatter(const at::Tensor & src, int64_t dim, c10::optional<int64_t> start, c10::optional<int64_t> end, int64_t step) const {
+    return at::_ops::slice_scatter::call(const_cast<Tensor&>(*this), src, dim, start.has_value() ? c10::make_optional(c10::SymInt(*start)) : c10::nullopt, end.has_value() ? c10::make_optional(c10::SymInt(*end)) : c10::nullopt, step);
+}
+
+// aten::slice_scatter(Tensor self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor
+inline at::Tensor Tensor::slice_scatter_symint(const at::Tensor & src, int64_t dim, c10::optional<c10::SymInt> start, c10::optional<c10::SymInt> end, c10::SymInt step) const {
+    return at::_ops::slice_scatter::call(const_cast<Tensor&>(*this), src, dim, start, end, step);
+}
+
+// aten::select_scatter(Tensor self, Tensor src, int dim, SymInt index) -> Tensor
+inline at::Tensor Tensor::select_scatter(const at::Tensor & src, int64_t dim, int64_t index) const {
+    return at::_ops::select_scatter::call(const_cast<Tensor&>(*this), src, dim, index);
+}
+
+// aten::select_scatter(Tensor self, Tensor src, int dim, SymInt index) -> Tensor
+inline at::Tensor Tensor::select_scatter_symint(const at::Tensor & src, int64_t dim, c10::SymInt index) const {
+    return at::_ops::select_scatter::call(const_cast<Tensor&>(*this), src, dim, index);
+}
+
+// aten::diagonal_scatter(Tensor self, Tensor src, int offset=0, int dim1=0, int dim2=1) -> Tensor
+inline at::Tensor Tensor::diagonal_scatter(const at::Tensor & src, int64_t offset, int64_t dim1, int64_t dim2) const {
+    return at::_ops::diagonal_scatter::call(const_cast<Tensor&>(*this), src, offset, dim1, dim2);
+}
+
+// aten::as_strided_scatter(Tensor self, Tensor src, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor
+inline at::Tensor Tensor::as_strided_scatter(const at::Tensor & src, at::IntArrayRef size, at::IntArrayRef stride, c10::optional<int64_t> storage_offset) const {
+    return at::_ops::as_strided_scatter::call(const_cast<Tensor&>(*this), src, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), storage_offset.has_value() ? c10::make_optional(c10::SymInt(*storage_offset)) : c10::nullopt);
+}
+
+// aten::as_strided_scatter(Tensor self, Tensor src, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor
+inline at::Tensor Tensor::as_strided_scatter_symint(const at::Tensor & src, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional<c10::SymInt> storage_offset) const {
+    return at::_ops::as_strided_scatter::call(const_cast<Tensor&>(*this), src, size, stride, storage_offset);
+}
+
+// aten::smm(Tensor self, Tensor mat2) -> Tensor
+inline at::Tensor Tensor::smm(const at::Tensor & mat2) const {
+    return at::_ops::smm::call(const_cast(*this), mat2);
+}
+
+// aten::softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
+inline at::Tensor Tensor::softmax(int64_t dim, c10::optional<at::ScalarType> dtype) const {
+    return at::_ops::softmax_int::call(const_cast<Tensor&>(*this), dim, dtype);
+}
+
+// aten::softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
+inline at::Tensor Tensor::softmax(at::Dimname dim, c10::optional<at::ScalarType> dtype) const {
+    return at::_ops::softmax_Dimname::call(const_cast<Tensor&>(*this), dim, dtype);
+}
+
+// aten::unsafe_split.Tensor(Tensor self, SymInt split_size, int dim=0) -> Tensor[]
+inline ::std::vector<at::Tensor> Tensor::unsafe_split(int64_t split_size, int64_t dim) const {
+    return at::_ops::unsafe_split_Tensor::call(const_cast<Tensor&>(*this), split_size, dim);
+}
+
+// aten::unsafe_split.Tensor(Tensor self, SymInt split_size, int dim=0) -> Tensor[]
+inline ::std::vector<at::Tensor> Tensor::unsafe_split_symint(c10::SymInt split_size, int64_t dim) const {
+    return at::_ops::unsafe_split_Tensor::call(const_cast<Tensor&>(*this), split_size, dim);
+}
+
+// aten::split.Tensor(Tensor(a -> *) self, SymInt split_size, int dim=0) -> Tensor(a)[]
+inline ::std::vector<at::Tensor> Tensor::split(int64_t split_size, int64_t dim) const {
+    return at::_ops::split_Tensor::call(const_cast<Tensor&>(*this), split_size, dim);
+}
+
+// aten::split.Tensor(Tensor(a -> *) self, SymInt split_size, int dim=0) -> Tensor(a)[]
+inline ::std::vector<at::Tensor> Tensor::split_symint(c10::SymInt split_size, int64_t dim) const {
+    return at::_ops::split_Tensor::call(const_cast<Tensor&>(*this), split_size, dim);
+}
+
+// aten::split.sizes(Tensor(a -> *) self, SymInt[] split_size, int dim=0) -> Tensor(a)[]
+inline ::std::vector<at::Tensor> Tensor::split(at::IntArrayRef split_size, int64_t dim) const {
+    return at::_ops::split_sizes::call(const_cast<Tensor&>(*this), c10::fromIntArrayRefSlow(split_size), dim);
+}
+
+// aten::split.sizes(Tensor(a -> *) self, SymInt[] split_size, int dim=0) -> Tensor(a)[]
+inline ::std::vector<at::Tensor> Tensor::split_symint(c10::SymIntArrayRef split_size, int64_t dim) const {
+    return at::_ops::split_sizes::call(const_cast<Tensor&>(*this), split_size, dim);
+}
+
+// aten::unsafe_split_with_sizes(Tensor self, SymInt[] split_sizes, int dim=0) -> Tensor[]
+inline ::std::vector<at::Tensor> Tensor::unsafe_split_with_sizes(at::IntArrayRef split_sizes, int64_t dim) const {
+    return at::_ops::unsafe_split_with_sizes::call(const_cast<Tensor&>(*this), c10::fromIntArrayRefSlow(split_sizes), dim);
+}
+
+// aten::unsafe_split_with_sizes(Tensor self, SymInt[] split_sizes, int dim=0) -> Tensor[]
+inline ::std::vector<at::Tensor> Tensor::unsafe_split_with_sizes_symint(c10::SymIntArrayRef split_sizes, int64_t dim) const {
+    return at::_ops::unsafe_split_with_sizes::call(const_cast<Tensor&>(*this), split_sizes, dim);
+}
+
+// aten::split_with_sizes(Tensor(a -> *) self, SymInt[] split_sizes, int dim=0) -> Tensor(a)[]
+inline ::std::vector<at::Tensor> Tensor::split_with_sizes(at::IntArrayRef split_sizes, int64_t dim) const {
+    return at::_ops::split_with_sizes::call(const_cast<Tensor&>(*this), c10::fromIntArrayRefSlow(split_sizes), dim);
+}
+
+// aten::split_with_sizes(Tensor(a -> *) self, SymInt[] split_sizes, int dim=0) -> Tensor(a)[]
+inline ::std::vector<at::Tensor> Tensor::split_with_sizes_symint(c10::SymIntArrayRef split_sizes, int64_t dim) const {
+    return at::_ops::split_with_sizes::call(const_cast<Tensor&>(*this), split_sizes, dim);
+}
+
+// aten::hsplit.int(Tensor(a -> *) self, int sections) -> Tensor(a)[]
+inline ::std::vector<at::Tensor> Tensor::hsplit(int64_t sections) const {
+    return at::_ops::hsplit_int::call(const_cast<Tensor&>(*this), sections);
+}
+
+// aten::hsplit.array(Tensor(a -> *) self, int[] indices) -> Tensor(a)[]
+inline ::std::vector<at::Tensor> Tensor::hsplit(at::IntArrayRef indices) const {
+    return at::_ops::hsplit_array::call(const_cast<Tensor&>(*this), indices);
+}
+
+// aten::vsplit.int(Tensor(a -> *) self, int sections) -> Tensor(a)[]
+inline ::std::vector<at::Tensor> Tensor::vsplit(int64_t sections) const {
+    return at::_ops::vsplit_int::call(const_cast<Tensor&>(*this), sections);
+}
+
+// aten::vsplit.array(Tensor(a -> *) self, int[] indices) -> Tensor(a)[]
+inline ::std::vector<at::Tensor> Tensor::vsplit(at::IntArrayRef indices) const {
+    return at::_ops::vsplit_array::call(const_cast<Tensor&>(*this), indices);
+}
+
+// aten::dsplit.int(Tensor(a -> *) self, int sections) -> Tensor(a)[]
+inline ::std::vector<at::Tensor> Tensor::dsplit(int64_t sections) const {
+    return at::_ops::dsplit_int::call(const_cast<Tensor&>(*this), sections);
+}
+
+// aten::dsplit.array(Tensor(a -> *) self, int[] indices) -> Tensor(a)[]
+inline ::std::vector<at::Tensor> Tensor::dsplit(at::IntArrayRef indices) const {
+    return at::_ops::dsplit_array::call(const_cast<Tensor&>(*this), indices);
+}
+
+// aten::squeeze(Tensor(a) self) -> Tensor(a)
+inline at::Tensor Tensor::squeeze() const {
+    return at::_ops::squeeze::call(const_cast(*this));
+}
+
+// aten::squeeze.dim(Tensor(a) self, int dim) -> Tensor(a)
+inline at::Tensor Tensor::squeeze(int64_t dim) const {
+    return at::_ops::squeeze_dim::call(const_cast(*this), dim);
+}
+
+// aten::squeeze.dimname(Tensor(a) self, Dimname dim) -> Tensor(a)
+inline at::Tensor Tensor::squeeze(at::Dimname dim) const {
+    return at::_ops::squeeze_dimname::call(const_cast(*this), dim);
+}
+
+// aten::squeeze.dims(Tensor(a) self, int[] dim) -> Tensor(a)
+inline at::Tensor Tensor::squeeze(at::IntArrayRef dim) const {
+    return at::_ops::squeeze_dims::call(const_cast(*this), dim);
+}
+
+// aten::squeeze_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::squeeze_() const {
+    return at::_ops::squeeze_::call(const_cast(*this));
+}
+
+// aten::squeeze_.dim(Tensor(a!) self, int dim) -> Tensor(a!)
+inline at::Tensor & Tensor::squeeze_(int64_t dim) const {
+    return at::_ops::squeeze__dim::call(const_cast(*this), dim);
+}
+
+// aten::squeeze_.dims(Tensor(a!) self, int[] dim) -> Tensor(a!)
+inline at::Tensor & Tensor::squeeze_(at::IntArrayRef dim) const {
+    return at::_ops::squeeze__dims::call(const_cast(*this), dim);
+}
+
+// aten::squeeze_.dimname(Tensor(a!) self, Dimname dim) -> Tensor(a!)
+inline at::Tensor & Tensor::squeeze_(at::Dimname dim) const {
+    return at::_ops::squeeze__dimname::call(const_cast(*this), dim);
+}
+
+// aten::sspaddmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
+inline at::Tensor Tensor::sspaddmm(const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha) const {
+    return at::_ops::sspaddmm::call(const_cast(*this), mat1, mat2, beta, alpha);
+}
+
+// aten::stft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor
+inline at::Tensor Tensor::stft(int64_t n_fft, c10::optional<int64_t> hop_length, c10::optional<int64_t> win_length, const c10::optional<at::Tensor> & window, bool normalized, c10::optional<bool> onesided, c10::optional<bool> return_complex) const {
+    return at::_ops::stft::call(const_cast<Tensor&>(*this), n_fft, hop_length, win_length, window, normalized, onesided, return_complex);
+}
+
+// aten::stft.center(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, str pad_mode="reflect", bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor
+inline at::Tensor Tensor::stft(int64_t n_fft, c10::optional<int64_t> hop_length, c10::optional<int64_t> win_length, const c10::optional<at::Tensor> & window, bool center, c10::string_view pad_mode, bool normalized, c10::optional<bool> onesided, c10::optional<bool> return_complex) const {
+    return at::_ops::stft_center::call(const_cast<Tensor&>(*this), n_fft, hop_length, win_length, window, center, pad_mode, normalized, onesided, return_complex);
+}
+
+// aten::istft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, bool normalized=False, bool? onesided=None, int? length=None, bool return_complex=False) -> Tensor
+inline at::Tensor Tensor::istft(int64_t n_fft, c10::optional<int64_t> hop_length, c10::optional<int64_t> win_length, const c10::optional<at::Tensor> & window, bool center, bool normalized, c10::optional<bool> onesided, c10::optional<int64_t> length, bool return_complex) const {
+    return at::_ops::istft::call(const_cast<Tensor&>(*this), n_fft, hop_length, win_length, window, center, normalized, onesided, length, return_complex);
+}
+
+// aten::stride.Dimname(Tensor self, Dimname dim) -> int
+inline int64_t Tensor::stride(at::Dimname dim) const {
+    return at::_ops::stride_Dimname::call(const_cast(*this), dim);
+}
+
+// aten::sum(Tensor self, *, ScalarType? dtype=None) -> Tensor
+inline at::Tensor Tensor::sum(c10::optional<at::ScalarType> dtype) const {
+    return at::_ops::sum::call(const_cast<Tensor&>(*this), dtype);
+}
+
+// aten::sum.dim_IntList(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+inline at::Tensor Tensor::sum(at::OptionalIntArrayRef dim, bool keepdim, c10::optional<at::ScalarType> dtype) const {
+    return at::_ops::sum_dim_IntList::call(const_cast<Tensor&>(*this), dim, keepdim, dtype);
+}
+
+// aten::sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+inline at::Tensor Tensor::sum(at::DimnameList dim, bool keepdim, c10::optional<at::ScalarType> dtype) const {
+    return at::_ops::sum_dim_DimnameList::call(const_cast<Tensor&>(*this), dim, keepdim, dtype);
+}
+
+// aten::nansum(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+inline at::Tensor Tensor::nansum(at::OptionalIntArrayRef dim, bool keepdim, c10::optional<at::ScalarType> dtype) const {
+    return at::_ops::nansum::call(const_cast<Tensor&>(*this), dim, keepdim, dtype);
+}
+
+// aten::sum_to_size(Tensor self, SymInt[] size) -> Tensor
+inline at::Tensor Tensor::sum_to_size(at::IntArrayRef size) const {
+    return at::_ops::sum_to_size::call(const_cast<Tensor&>(*this), c10::fromIntArrayRefSlow(size));
+}
+
+// aten::sum_to_size(Tensor self, SymInt[] size) -> Tensor
+inline at::Tensor Tensor::sum_to_size_symint(c10::SymIntArrayRef size) const {
+    return at::_ops::sum_to_size::call(const_cast<Tensor&>(*this), size);
+}
+
+// aten::sqrt(Tensor self) -> Tensor
+inline at::Tensor Tensor::sqrt() const {
+    return at::_ops::sqrt::call(const_cast(*this));
+}
+
+// aten::sqrt_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::sqrt_() const {
+    return at::_ops::sqrt_::call(const_cast(*this));
+}
+
+// aten::square(Tensor self) -> Tensor
+inline at::Tensor Tensor::square() const {
+    return at::_ops::square::call(const_cast(*this));
+}
+
+// aten::square_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::square_() const {
+    return at::_ops::square_::call(const_cast(*this));
+}
+
+// aten::std(Tensor self, bool unbiased=True) -> Tensor
+inline at::Tensor Tensor::std(bool unbiased) const {
+    return at::_ops::std::call(const_cast<Tensor&>(*this), unbiased);
+}
+
+// aten::std.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> Tensor
+inline at::Tensor Tensor::std(at::OptionalIntArrayRef dim, bool unbiased, bool keepdim) const {
+    return at::_ops::std_dim::call(const_cast<Tensor&>(*this), dim, unbiased, keepdim);
+}
+
+// aten::std.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor
+inline at::Tensor Tensor::std(at::OptionalIntArrayRef dim, const c10::optional<at::Scalar> & correction, bool keepdim) const {
+    return at::_ops::std_correction::call(const_cast<Tensor&>(*this), dim, correction, keepdim);
+}
+
+// aten::std.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor
+inline at::Tensor Tensor::std(at::DimnameList dim, bool unbiased, bool keepdim) const {
+    return at::_ops::std_names_dim::call(const_cast<Tensor&>(*this), dim, unbiased, keepdim);
+}
+
+// aten::std.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor
+inline at::Tensor Tensor::std(at::DimnameList dim, const c10::optional<at::Scalar> & correction, bool keepdim) const {
+    return at::_ops::std_correction_names::call(const_cast<Tensor&>(*this), dim, correction, keepdim);
+}
+
+// aten::prod(Tensor self, *, ScalarType? dtype=None) -> Tensor
+inline at::Tensor Tensor::prod(c10::optional<at::ScalarType> dtype) const {
+    return at::_ops::prod::call(const_cast<Tensor&>(*this), dtype);
+}
+
+// aten::prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+inline at::Tensor Tensor::prod(int64_t dim, bool keepdim, c10::optional<at::ScalarType> dtype) const {
+    return at::_ops::prod_dim_int::call(const_cast<Tensor&>(*this), dim, keepdim, dtype);
+}
+
+// aten::prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
+inline at::Tensor Tensor::prod(at::Dimname dim, bool keepdim, c10::optional<at::ScalarType> dtype) const {
+    return at::_ops::prod_dim_Dimname::call(const_cast<Tensor&>(*this), dim, keepdim, dtype);
+}
+
+// aten::t(Tensor(a) self) -> Tensor(a)
+inline at::Tensor Tensor::t() const {
+    return at::_ops::t::call(const_cast(*this));
+}
+
+// aten::t_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::t_() const {
+    return at::_ops::t_::call(const_cast(*this));
+}
+
+// aten::tan(Tensor self) -> Tensor
+inline at::Tensor Tensor::tan() const {
+    return at::_ops::tan::call(const_cast(*this));
+}
+
+// aten::tan_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::tan_() const {
+    return at::_ops::tan_::call(const_cast(*this));
+}
+
+// aten::tanh(Tensor self) -> Tensor
+inline at::Tensor Tensor::tanh() const {
+    return at::_ops::tanh::call(const_cast(*this));
+}
+
+// aten::tanh_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::tanh_() const {
+    return at::_ops::tanh_::call(const_cast(*this));
+}
+
+// aten::tile(Tensor self, SymInt[] dims) -> Tensor
+inline at::Tensor Tensor::tile(at::IntArrayRef dims) const {
+    return at::_ops::tile::call(const_cast(*this), c10::fromIntArrayRefSlow(dims));
+}
+
+// aten::tile(Tensor self, SymInt[] dims) -> Tensor
+inline at::Tensor Tensor::tile_symint(c10::SymIntArrayRef dims) const {
+    return at::_ops::tile::call(const_cast(*this), dims);
+}
+
+// aten::transpose.int(Tensor(a) self, int dim0, int dim1) -> Tensor(a)
+inline at::Tensor Tensor::transpose(int64_t dim0, int64_t dim1) const {
+    return at::_ops::transpose_int::call(const_cast(*this), dim0, dim1);
+}
+
+// aten::transpose.Dimname(Tensor(a) self, Dimname dim0, Dimname dim1) -> Tensor(a)
+inline at::Tensor Tensor::transpose(at::Dimname dim0, at::Dimname dim1) const {
+    return at::_ops::transpose_Dimname::call(const_cast(*this), dim0, dim1);
+}
+
+// aten::transpose_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!)
+inline at::Tensor & Tensor::transpose_(int64_t dim0, int64_t dim1) const {
+    return at::_ops::transpose_::call(const_cast(*this), dim0, dim1);
+}
+
+// aten::flip(Tensor self, int[] dims) -> Tensor
+inline at::Tensor Tensor::flip(at::IntArrayRef dims) const {
+    return at::_ops::flip::call(const_cast(*this), dims);
+}
+
+// aten::fliplr(Tensor self) -> Tensor
+inline at::Tensor Tensor::fliplr() const {
+    return at::_ops::fliplr::call(const_cast(*this));
+}
+
+// aten::flipud(Tensor self) -> Tensor
+inline at::Tensor Tensor::flipud() const {
+    return at::_ops::flipud::call(const_cast(*this));
+}
+
+// aten::roll(Tensor self, SymInt[1] shifts, int[1] dims=[]) -> Tensor
+inline at::Tensor Tensor::roll(at::IntArrayRef shifts, at::IntArrayRef dims) const {
+    return at::_ops::roll::call(const_cast(*this), c10::fromIntArrayRefSlow(shifts), dims);
+}
+
+// aten::roll(Tensor self, SymInt[1] shifts, int[1] dims=[]) -> Tensor
+inline at::Tensor Tensor::roll_symint(c10::SymIntArrayRef shifts, at::IntArrayRef dims) const {
+    return at::_ops::roll::call(const_cast(*this), shifts, dims);
+}
+
+// aten::rot90(Tensor self, int k=1, int[] dims=[0,1]) -> Tensor
+inline at::Tensor Tensor::rot90(int64_t k, at::IntArrayRef dims) const {
+    return at::_ops::rot90::call(const_cast(*this), k, dims);
+}
+
+// aten::_nested_tensor_size(Tensor self) -> Tensor
+inline at::Tensor Tensor::_nested_tensor_size() const {
+    return at::_ops::_nested_tensor_size::call(const_cast(*this));
+}
+
+// aten::_nested_tensor_strides(Tensor self) -> Tensor
+inline at::Tensor Tensor::_nested_tensor_strides() const {
+    return at::_ops::_nested_tensor_strides::call(const_cast(*this));
+}
+
+// aten::_nested_tensor_storage_offsets(Tensor self) -> Tensor
+inline at::Tensor Tensor::_nested_tensor_storage_offsets() const {
+    return at::_ops::_nested_tensor_storage_offsets::call(const_cast(*this));
+}
+
+// aten::trunc(Tensor self) -> Tensor
+inline at::Tensor Tensor::trunc() const {
+    return at::_ops::trunc::call(const_cast(*this));
+}
+
+// aten::trunc_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::trunc_() const {
+    return at::_ops::trunc_::call(const_cast(*this));
+}
+
+// aten::fix(Tensor self) -> Tensor
+inline at::Tensor Tensor::fix() const {
+    return at::_ops::fix::call(const_cast(*this));
+}
+
+// aten::fix_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::fix_() const {
+    return at::_ops::fix_::call(const_cast(*this));
+}
+
+// aten::type_as(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::type_as(const at::Tensor & other) const {
+    return at::_ops::type_as::call(const_cast(*this), other);
+}
+
+// aten::unsqueeze(Tensor(a) self, int dim) -> Tensor(a)
+inline at::Tensor Tensor::unsqueeze(int64_t dim) const {
+    return at::_ops::unsqueeze::call(const_cast(*this), dim);
+}
+
+// aten::unsqueeze_(Tensor(a!) self, int dim) -> Tensor(a!)
+inline at::Tensor & Tensor::unsqueeze_(int64_t dim) const {
+    return at::_ops::unsqueeze_::call(const_cast(*this), dim);
+}
+
+// aten::var(Tensor self, bool unbiased=True) -> Tensor
+inline at::Tensor Tensor::var(bool unbiased) const {
+    return at::_ops::var::call(const_cast<Tensor&>(*this), unbiased);
+}
+
+// aten::var.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> Tensor
+inline at::Tensor Tensor::var(at::OptionalIntArrayRef dim, bool unbiased, bool keepdim) const {
+    return at::_ops::var_dim::call(const_cast<Tensor&>(*this), dim, unbiased, keepdim);
+}
+
+// aten::var.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor
+inline at::Tensor Tensor::var(at::OptionalIntArrayRef dim, const c10::optional<at::Scalar> & correction, bool keepdim) const {
+    return at::_ops::var_correction::call(const_cast<Tensor&>(*this), dim, correction, keepdim);
+}
+
+// aten::var.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor
+inline at::Tensor Tensor::var(at::DimnameList dim, bool unbiased, bool keepdim) const {
+    return at::_ops::var_names_dim::call(const_cast<Tensor&>(*this), dim, unbiased, keepdim);
+}
+
+// aten::var.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor
+inline at::Tensor Tensor::var(at::DimnameList dim, const c10::optional<at::Scalar> & correction, bool keepdim) const {
+    return at::_ops::var_correction_names::call(const_cast<Tensor&>(*this), dim, correction, keepdim);
+}
+
+// aten::view_as(Tensor(a) self, Tensor other) -> Tensor(a)
+inline at::Tensor Tensor::view_as(const at::Tensor & other) const {
+    return at::_ops::view_as::call(const_cast(*this), other);
+}
+
+// aten::where.self(Tensor condition, Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::where(const at::Tensor & condition, const at::Tensor & other) const {
+    return at::_ops::where_self::call(condition, const_cast(*this), other);
+}
+
+// aten::where.ScalarOther(Tensor condition, Tensor self, Scalar other) -> Tensor
+inline at::Tensor Tensor::where(const at::Tensor & condition, const at::Scalar & other) const {
+    return at::_ops::where_ScalarOther::call(condition, const_cast(*this), other);
+}
+
+// aten::norm.ScalarOpt_dtype(Tensor self, Scalar? p, *, ScalarType dtype) -> Tensor
+inline at::Tensor Tensor::norm(const c10::optional<at::Scalar> & p, at::ScalarType dtype) const {
+    return at::_ops::norm_ScalarOpt_dtype::call(const_cast<Tensor&>(*this), p, dtype);
+}
+
+// aten::norm.Scalar(Tensor self, Scalar p=2) -> Tensor
+inline at::Tensor Tensor::norm(const at::Scalar & p) const {
+    return at::_ops::norm_Scalar::call(const_cast<Tensor&>(*this), p);
+}
+
+// aten::norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor
+inline at::Tensor Tensor::norm(const c10::optional<at::Scalar> & p, at::IntArrayRef dim, bool keepdim, at::ScalarType dtype) const {
+    return at::_ops::norm_ScalarOpt_dim_dtype::call(const_cast<Tensor&>(*this), p, dim, keepdim, dtype);
+}
+
+// aten::norm.ScalarOpt_dim(Tensor self, Scalar? p, int[1] dim, bool keepdim=False) -> Tensor
+inline at::Tensor Tensor::norm(const c10::optional<at::Scalar> & p, at::IntArrayRef dim, bool keepdim) const {
+    return at::_ops::norm_ScalarOpt_dim::call(const_cast<Tensor&>(*this), p, dim, keepdim);
+}
+
+// aten::norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor
+inline at::Tensor Tensor::norm(const c10::optional<at::Scalar> & p, at::DimnameList dim, bool keepdim, at::ScalarType dtype) const {
+    return at::_ops::norm_names_ScalarOpt_dim_dtype::call(const_cast<Tensor&>(*this), p, dim, keepdim, dtype);
+}
+
+// aten::norm.names_ScalarOpt_dim(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False) -> Tensor
+inline at::Tensor Tensor::norm(const c10::optional<at::Scalar> & p, at::DimnameList dim, bool keepdim) const {
+    return at::_ops::norm_names_ScalarOpt_dim::call(const_cast<Tensor&>(*this), p, dim, keepdim);
+}
+
+// aten::frexp.Tensor(Tensor self) -> (Tensor mantissa, Tensor exponent)
+inline ::std::tuple<at::Tensor,at::Tensor> Tensor::frexp() const {
+    return at::_ops::frexp_Tensor::call(const_cast<Tensor&>(*this));
+}
+
+// aten::clone(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor
+inline at::Tensor Tensor::clone(c10::optional<at::MemoryFormat> memory_format) const {
+    return at::_ops::clone::call(const_cast<Tensor&>(*this), memory_format);
+}
+
+// aten::positive(Tensor(a) self) -> Tensor(a)
+inline at::Tensor Tensor::positive() const {
+    return at::_ops::positive::call(const_cast(*this));
+}
+
+// aten::resize_as_(Tensor(a!) self, Tensor the_template, *, MemoryFormat? memory_format=None) -> Tensor(a!)
+inline const at::Tensor & Tensor::resize_as_(const at::Tensor & the_template, c10::optional<at::MemoryFormat> memory_format) const {
+    return at::_ops::resize_as_::call(const_cast<Tensor&>(*this), the_template, memory_format);
+}
+
+// aten::resize_as_sparse_(Tensor(a!) self, Tensor the_template) -> Tensor(a!)
+inline const at::Tensor & Tensor::resize_as_sparse_(const at::Tensor & the_template) const {
+    return at::_ops::resize_as_sparse_::call(const_cast(*this), the_template);
+}
+
+// aten::zero_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::zero_() const {
+    return at::_ops::zero_::call(const_cast(*this));
+}
+
+// aten::sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
+inline at::Tensor Tensor::sub(const at::Tensor & other, const at::Scalar & alpha) const {
+    return at::_ops::sub_Tensor::call(const_cast(*this), other, alpha);
+}
+
+// aten::sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
+inline at::Tensor & Tensor::sub_(const at::Tensor & other, const at::Scalar & alpha) const {
+    return at::_ops::sub__Tensor::call(const_cast(*this), other, alpha);
+}
+
+// aten::sub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
+inline at::Tensor Tensor::sub(const at::Scalar & other, const at::Scalar & alpha) const {
+    return at::_ops::sub_Scalar::call(const_cast(*this), other, alpha);
+}
+
+// aten::sub_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)
+inline at::Tensor & Tensor::sub_(const at::Scalar & other, const at::Scalar & alpha) const {
+    return at::_ops::sub__Scalar::call(const_cast(*this), other, alpha);
+}
+
+// aten::subtract.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
+inline at::Tensor Tensor::subtract(const at::Tensor & other, const at::Scalar & alpha) const {
+    return at::_ops::subtract_Tensor::call(const_cast(*this), other, alpha);
+}
+
+// aten::subtract_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
+inline at::Tensor & Tensor::subtract_(const at::Tensor & other, const at::Scalar & alpha) const {
+    return at::_ops::subtract__Tensor::call(const_cast(*this), other, alpha);
+}
+
+// aten::subtract.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor
+inline at::Tensor Tensor::subtract(const at::Scalar & other, const at::Scalar & alpha) const {
+    return at::_ops::subtract_Scalar::call(const_cast(*this), other, alpha);
+}
+
+// aten::subtract_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)
+inline at::Tensor & Tensor::subtract_(const at::Scalar & other, const at::Scalar & alpha) const {
+    return at::_ops::subtract__Scalar::call(const_cast(*this), other, alpha);
+}
+
+// aten::heaviside(Tensor self, Tensor values) -> Tensor
+inline at::Tensor Tensor::heaviside(const at::Tensor & values) const {
+    return at::_ops::heaviside::call(const_cast(*this), values);
+}
+
+// aten::heaviside_(Tensor(a!) self, Tensor values) -> Tensor(a!)
+inline at::Tensor & Tensor::heaviside_(const at::Tensor & values) const {
+    return at::_ops::heaviside_::call(const_cast(*this), values);
+}
+
+// aten::addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
+inline at::Tensor Tensor::addmm(const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha) const {
+    return at::_ops::addmm::call(const_cast(*this), mat1, mat2, beta, alpha);
+}
+
+// aten::addmm_(Tensor(a!) self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
+inline at::Tensor & Tensor::addmm_(const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha) const {
+    return at::_ops::addmm_::call(const_cast(*this), mat1, mat2, beta, alpha);
+}
+
+// aten::_addmm_activation(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, bool use_gelu=False) -> Tensor
+inline at::Tensor Tensor::_addmm_activation(const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, bool use_gelu) const {
+    return at::_ops::_addmm_activation::call(const_cast(*this), mat1, mat2, beta, alpha, use_gelu);
+}
+
+// aten::sparse_resize_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!)
+inline const at::Tensor & Tensor::sparse_resize_(at::IntArrayRef size, int64_t sparse_dim, int64_t dense_dim) const {
+    return at::_ops::sparse_resize_::call(const_cast(*this), size, sparse_dim, dense_dim);
+}
+
+// aten::sparse_resize_and_clear_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!)
+inline const at::Tensor & Tensor::sparse_resize_and_clear_(at::IntArrayRef size, int64_t sparse_dim, int64_t dense_dim) const {
+    return at::_ops::sparse_resize_and_clear_::call(const_cast(*this), size, sparse_dim, dense_dim);
+}
+
+// aten::sparse_mask(Tensor self, Tensor mask) -> Tensor
+inline at::Tensor Tensor::sparse_mask(const at::Tensor & mask) const {
+    return at::_ops::sparse_mask::call(const_cast(*this), mask);
+}
+
+// aten::_sparse_mask_projection(Tensor self, Tensor mask, bool accumulate_matches=False) -> Tensor
+inline at::Tensor Tensor::_sparse_mask_projection(const at::Tensor & mask, bool accumulate_matches) const {
+    return at::_ops::_sparse_mask_projection::call(const_cast(*this), mask, accumulate_matches);
+}
+
+// aten::to_dense(Tensor self, ScalarType? dtype=None, *, bool? masked_grad=None) -> Tensor
+inline at::Tensor Tensor::to_dense(c10::optional<at::ScalarType> dtype, c10::optional<bool> masked_grad) const {
+    return at::_ops::to_dense::call(const_cast<Tensor&>(*this), dtype, masked_grad);
+}
+
+// aten::_to_dense(Tensor self, ScalarType? dtype=None, bool? masked_grad=None) -> Tensor
+inline at::Tensor Tensor::_to_dense(c10::optional<at::ScalarType> dtype, c10::optional<bool> masked_grad) const {
+    return at::_ops::_to_dense::call(const_cast<Tensor&>(*this), dtype, masked_grad);
+}
+
+// aten::sparse_dim(Tensor self) -> int
+inline int64_t Tensor::sparse_dim() const {
+    return at::_ops::sparse_dim::call(const_cast(*this));
+}
+
+// aten::_dimI(Tensor self) -> int
+inline int64_t Tensor::_dimI() const {
+    return at::_ops::_dimI::call(const_cast(*this));
+}
+
+// aten::dense_dim(Tensor self) -> int
+inline int64_t Tensor::dense_dim() const {
+    return at::_ops::dense_dim::call(const_cast(*this));
+}
+
+// aten::_dimV(Tensor self) -> int
+inline int64_t Tensor::_dimV() const {
+    return at::_ops::_dimV::call(const_cast(*this));
+}
+
+// aten::_nnz(Tensor self) -> int
+inline int64_t Tensor::_nnz() const {
+    return at::_ops::_nnz::call(const_cast(*this));
+}
+
+// aten::coalesce(Tensor(a) self) -> Tensor(a)
+inline at::Tensor Tensor::coalesce() const {
+    return at::_ops::coalesce::call(const_cast(*this));
+}
+
+// aten::is_coalesced(Tensor self) -> bool
+inline bool Tensor::is_coalesced() const {
+    return at::_ops::is_coalesced::call(const_cast(*this));
+}
+
+// aten::_indices(Tensor(a) self) -> Tensor(a)
+inline at::Tensor Tensor::_indices() const {
+    return at::_ops::_indices::call(const_cast(*this));
+}
+
+// aten::_values(Tensor(a) self) -> Tensor(a)
+inline at::Tensor Tensor::_values() const {
+    return at::_ops::_values::call(const_cast(*this));
+}
+
+// aten::_coalesced_(Tensor(a!) self, bool coalesced) -> Tensor(a!)
+inline at::Tensor & Tensor::_coalesced_(bool coalesced) const {
+    return at::_ops::_coalesced_::call(const_cast(*this), coalesced);
+}
+
+// aten::indices(Tensor(a) self) -> Tensor(a)
+inline at::Tensor Tensor::indices() const {
+    return at::_ops::indices::call(const_cast(*this));
+}
+
+// aten::values(Tensor(a) self) -> Tensor(a)
+inline at::Tensor Tensor::values() const {
+    return at::_ops::values::call(const_cast(*this));
+}
+
+// aten::crow_indices(Tensor(a) self) -> Tensor(a)
+inline at::Tensor Tensor::crow_indices() const {
+    return at::_ops::crow_indices::call(const_cast(*this));
+}
+
+// aten::col_indices(Tensor(a) self) -> Tensor(a)
+inline at::Tensor Tensor::col_indices() const {
+    return at::_ops::col_indices::call(const_cast(*this));
+}
+
+// aten::ccol_indices(Tensor(a) self) -> Tensor(a)
+inline at::Tensor Tensor::ccol_indices() const {
+    return at::_ops::ccol_indices::call(const_cast(*this));
+}
+
+// aten::row_indices(Tensor(a) self) -> Tensor(a)
+inline at::Tensor Tensor::row_indices() const {
+    return at::_ops::row_indices::call(const_cast(*this));
+}
+
+// aten::unbind.int(Tensor(a -> *) self, int dim=0) -> Tensor(a)[]
+inline ::std::vector<at::Tensor> Tensor::unbind(int64_t dim) const {
+    return at::_ops::unbind_int::call(const_cast<Tensor&>(*this), dim);
+}
+
+// aten::unbind.Dimname(Tensor(a -> *) self, Dimname dim) -> Tensor(a)[]
+inline ::std::vector<at::Tensor> Tensor::unbind(at::Dimname dim) const {
+    return at::_ops::unbind_Dimname::call(const_cast<Tensor&>(*this), dim);
+}
+
+// aten::to_sparse.sparse_dim(Tensor self, int sparse_dim) -> Tensor
+inline at::Tensor Tensor::to_sparse(int64_t sparse_dim) const {
+    return at::_ops::to_sparse_sparse_dim::call(const_cast<Tensor&>(*this), sparse_dim);
+}
+
+// aten::_to_sparse.sparse_dim(Tensor self, int sparse_dim) -> Tensor
+inline at::Tensor Tensor::_to_sparse(int64_t sparse_dim) const {
+    return at::_ops::_to_sparse_sparse_dim::call(const_cast<Tensor&>(*this), sparse_dim);
+}
+
+// aten::to_sparse(Tensor self, *, Layout? layout=None, int[2]? blocksize=None, int? dense_dim=None) -> Tensor
+inline at::Tensor Tensor::to_sparse(c10::optional<at::Layout> layout, at::OptionalIntArrayRef blocksize, c10::optional<int64_t> dense_dim) const {
+    return at::_ops::to_sparse::call(const_cast<Tensor&>(*this), layout, blocksize, dense_dim);
+}
+
+// aten::_to_sparse(Tensor self, *, Layout? layout=None, int[2]? blocksize=None, int? dense_dim=None) -> Tensor
+inline at::Tensor Tensor::_to_sparse(c10::optional<at::Layout> layout, at::OptionalIntArrayRef blocksize, c10::optional<int64_t> dense_dim) const {
+    return at::_ops::_to_sparse::call(const_cast<Tensor&>(*this), layout, blocksize, dense_dim);
+}
+
+// aten::to_sparse_csr(Tensor self, int? dense_dim=None) -> Tensor
+inline at::Tensor Tensor::to_sparse_csr(c10::optional<int64_t> dense_dim) const {
+    return at::_ops::to_sparse_csr::call(const_cast<Tensor&>(*this), dense_dim);
+}
+
+// aten::_to_sparse_csr(Tensor self, int? dense_dim=None) -> Tensor
+inline at::Tensor Tensor::_to_sparse_csr(c10::optional<int64_t> dense_dim) const {
+    return at::_ops::_to_sparse_csr::call(const_cast<Tensor&>(*this), dense_dim);
+}
+
+// aten::to_sparse_csc(Tensor self, int? dense_dim=None) -> Tensor
+inline at::Tensor Tensor::to_sparse_csc(c10::optional<int64_t> dense_dim) const {
+    return at::_ops::to_sparse_csc::call(const_cast<Tensor&>(*this), dense_dim);
+}
+
+// aten::_to_sparse_csc(Tensor self, int? dense_dim=None) -> Tensor
+inline at::Tensor Tensor::_to_sparse_csc(c10::optional<int64_t> dense_dim) const {
+    return at::_ops::_to_sparse_csc::call(const_cast<Tensor&>(*this), dense_dim);
+}
+
+// aten::to_sparse_bsr(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor
+inline at::Tensor Tensor::to_sparse_bsr(at::IntArrayRef blocksize, c10::optional<int64_t> dense_dim) const {
+    return at::_ops::to_sparse_bsr::call(const_cast<Tensor&>(*this), blocksize, dense_dim);
+}
+
+// aten::_to_sparse_bsr(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor
+inline at::Tensor Tensor::_to_sparse_bsr(at::IntArrayRef blocksize, c10::optional<int64_t> dense_dim) const {
+    return at::_ops::_to_sparse_bsr::call(const_cast<Tensor&>(*this), blocksize, dense_dim);
+}
+
+// aten::to_sparse_bsc(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor
+inline at::Tensor Tensor::to_sparse_bsc(at::IntArrayRef blocksize, c10::optional<int64_t> dense_dim) const {
+    return at::_ops::to_sparse_bsc::call(const_cast<Tensor&>(*this), blocksize, dense_dim);
+}
+
+// aten::_to_sparse_bsc(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor
+inline at::Tensor Tensor::_to_sparse_bsc(at::IntArrayRef blocksize, c10::optional<int64_t> dense_dim) const {
+    return at::_ops::_to_sparse_bsc::call(const_cast<Tensor&>(*this), blocksize, dense_dim);
+}
+
+// aten::to_mkldnn(Tensor self, ScalarType? dtype=None) -> Tensor
+inline at::Tensor Tensor::to_mkldnn(c10::optional<at::ScalarType> dtype) const {
+    return at::_ops::to_mkldnn::call(const_cast<Tensor&>(*this), dtype);
+}
+
+// aten::dequantize.self(Tensor self) -> Tensor
+inline at::Tensor Tensor::dequantize() const {
+    return at::_ops::dequantize_self::call(const_cast(*this));
+}
+
+// aten::q_scale(Tensor self) -> float
+inline double Tensor::q_scale() const {
+    return at::_ops::q_scale::call(const_cast(*this));
+}
+
+// aten::q_zero_point(Tensor self) -> int
+inline int64_t Tensor::q_zero_point() const {
+    return at::_ops::q_zero_point::call(const_cast(*this));
+}
+
+// aten::q_per_channel_scales(Tensor self) -> Tensor
+inline at::Tensor Tensor::q_per_channel_scales() const {
+    return at::_ops::q_per_channel_scales::call(const_cast(*this));
+}
+
+// aten::q_per_channel_zero_points(Tensor self) -> Tensor
+inline at::Tensor Tensor::q_per_channel_zero_points() const {
+    return at::_ops::q_per_channel_zero_points::call(const_cast(*this));
+}
+
+// aten::q_per_channel_axis(Tensor self) -> int
+inline int64_t Tensor::q_per_channel_axis() const {
+    return at::_ops::q_per_channel_axis::call(const_cast(*this));
+}
+
+// aten::int_repr(Tensor self) -> Tensor
+inline at::Tensor Tensor::int_repr() const {
+    return at::_ops::int_repr::call(const_cast(*this));
+}
+
+// aten::qscheme(Tensor self) -> QScheme
+inline at::QScheme Tensor::qscheme() const {
+    return at::_ops::qscheme::call(const_cast(*this));
+}
+
+// aten::_autocast_to_reduced_precision(Tensor(a) self, bool cuda_enabled, bool cpu_enabled, ScalarType cuda_dtype, ScalarType cpu_dtype) -> Tensor(a)
+inline at::Tensor Tensor::_autocast_to_reduced_precision(bool cuda_enabled, bool cpu_enabled, at::ScalarType cuda_dtype, at::ScalarType cpu_dtype) const {
+    return at::_ops::_autocast_to_reduced_precision::call(const_cast(*this), cuda_enabled, cpu_enabled, cuda_dtype, cpu_dtype);
+}
+
+// aten::_autocast_to_full_precision(Tensor(a) self, bool cuda_enabled, bool cpu_enabled) -> Tensor(a)
+inline at::Tensor Tensor::_autocast_to_full_precision(bool cuda_enabled, bool cpu_enabled) const {
+    return at::_ops::_autocast_to_full_precision::call(const_cast(*this), cuda_enabled, cpu_enabled);
+}
+
+// aten::to.dtype_layout(Tensor(a) self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor(a)
+inline at::Tensor Tensor::to(at::TensorOptions options, bool non_blocking, bool copy, c10::optional<at::MemoryFormat> memory_format) const {
+    return at::_ops::to_dtype_layout::call(const_cast<Tensor&>(*this), c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), non_blocking, copy, c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
+}
+
+// aten::to.dtype_layout(Tensor(a) self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor(a)
+inline at::Tensor Tensor::to(c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, bool non_blocking, bool copy, c10::optional<at::MemoryFormat> memory_format) const {
+    return at::_ops::to_dtype_layout::call(const_cast<Tensor&>(*this), dtype, layout, device, pin_memory, non_blocking, copy, memory_format);
+}
+
+// aten::to.device(Tensor(a) self, Device device, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor(a)
+inline at::Tensor Tensor::to(at::Device device, at::ScalarType dtype, bool non_blocking, bool copy, c10::optional<at::MemoryFormat> memory_format) const {
+    return at::_ops::to_device::call(const_cast<Tensor&>(*this), device, dtype, non_blocking, copy, memory_format);
+}
+
+// aten::to.dtype(Tensor(a) self, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor(a)
+inline at::Tensor Tensor::to(at::ScalarType dtype, bool non_blocking, bool copy, c10::optional<at::MemoryFormat> memory_format) const {
+    return at::_ops::to_dtype::call(const_cast<Tensor&>(*this), dtype, non_blocking, copy, memory_format);
+}
+
+// aten::to.other(Tensor(a) self, Tensor other, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor(a)
+inline at::Tensor Tensor::to(const at::Tensor & other, bool non_blocking, bool copy, c10::optional<at::MemoryFormat> memory_format) const {
+    return at::_ops::to_other::call(const_cast<Tensor&>(*this), other, non_blocking, copy, memory_format);
+}
+
+// aten::item(Tensor self) -> Scalar
+inline at::Scalar Tensor::item() const {
+    return at::_ops::item::call(const_cast(*this));
+}
+
+// aten::set_.source_Storage(Tensor(a!) self, Storage source) -> Tensor(a!)
+inline at::Tensor & Tensor::set_(at::Storage source) const {
+    return at::_ops::set__source_Storage::call(const_cast(*this), source);
+}
+
+// aten::set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor(a!)
+inline at::Tensor & Tensor::set_(at::Storage source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride) const {
+    return at::_ops::set__source_Storage_storage_offset::call(const_cast(*this), source, storage_offset, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride));
+}
+
+// aten::set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor(a!)
+inline at::Tensor & Tensor::set__symint(at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride) const {
+    return at::_ops::set__source_Storage_storage_offset::call(const_cast(*this), source, storage_offset, size, stride);
+}
+
+// aten::set_.source_Tensor_storage_offset(Tensor(a!) self, Tensor source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor(a!)
+inline at::Tensor & Tensor::set_(const at::Tensor & source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride) const {
+    return at::_ops::set__source_Tensor_storage_offset::call(const_cast(*this), source, storage_offset, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride));
+}
+
+// aten::set_.source_Tensor_storage_offset(Tensor(a!) self, Tensor source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor(a!)
+inline at::Tensor & Tensor::set__symint(const at::Tensor & source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride) const {
+    return at::_ops::set__source_Tensor_storage_offset::call(const_cast(*this), source, storage_offset, size, stride);
+}
+
+// aten::set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!)
+inline at::Tensor & Tensor::set_(const at::Tensor & source) const {
+    return at::_ops::set__source_Tensor::call(const_cast(*this), source);
+}
+
+// aten::set_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::set_() const {
+    return at::_ops::set_::call(const_cast(*this));
+}
+
+// aten::is_set_to(Tensor self, Tensor tensor) -> bool
+inline bool Tensor::is_set_to(const at::Tensor & tensor) const {
+    return at::_ops::is_set_to::call(const_cast(*this), tensor);
+}
+
+// aten::masked_fill_.Scalar(Tensor(a!) self, Tensor mask, Scalar value) -> Tensor(a!)
+inline at::Tensor & Tensor::masked_fill_(const at::Tensor & mask, const at::Scalar & value) const {
+    return at::_ops::masked_fill__Scalar::call(const_cast(*this), mask, value);
+}
+
+// aten::masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor
+inline at::Tensor Tensor::masked_fill(const at::Tensor & mask, const at::Scalar & value) const {
+    return at::_ops::masked_fill_Scalar::call(const_cast(*this), mask, value);
+}
+
+// aten::masked_fill_.Tensor(Tensor(a!) self, Tensor mask, Tensor value) -> Tensor(a!)
+inline at::Tensor & Tensor::masked_fill_(const at::Tensor & mask, const at::Tensor & value) const {
+    return at::_ops::masked_fill__Tensor::call(const_cast(*this), mask, value);
+}
+
+// aten::masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor
+inline at::Tensor Tensor::masked_fill(const at::Tensor & mask, const at::Tensor & value) const {
+    return at::_ops::masked_fill_Tensor::call(const_cast(*this), mask, value);
+}
+
+// aten::masked_scatter_(Tensor(a!) self, Tensor mask, Tensor source) -> Tensor(a!)
+inline at::Tensor & Tensor::masked_scatter_(const at::Tensor & mask, const at::Tensor & source) const {
+    return at::_ops::masked_scatter_::call(const_cast(*this), mask, source);
+}
+
+// aten::masked_scatter(Tensor self, Tensor mask, Tensor source) -> Tensor
+inline at::Tensor Tensor::masked_scatter(const at::Tensor & mask, const at::Tensor & source) const {
+    return at::_ops::masked_scatter::call(const_cast<Tensor&>(*this), mask, source);
+}
+
+// aten::view(Tensor(a) self, SymInt[] size) -> Tensor(a)
+inline at::Tensor Tensor::view(at::IntArrayRef size) const {
+    return at::_ops::view::call(const_cast<Tensor&>(*this), c10::fromIntArrayRefSlow(size));
+}
+
+// aten::view(Tensor(a) self, SymInt[] size) -> Tensor(a)
+inline at::Tensor Tensor::view_symint(c10::SymIntArrayRef size) const {
+    return at::_ops::view::call(const_cast<Tensor&>(*this), size);
+}
+
+// aten::view.dtype(Tensor(a) self, ScalarType dtype) -> Tensor(a)
+inline at::Tensor Tensor::view(at::ScalarType dtype) const {
+    return at::_ops::view_dtype::call(const_cast<Tensor&>(*this), dtype);
+}
+
+// aten::put_(Tensor(a!) self, Tensor index, Tensor source, bool accumulate=False) -> Tensor(a!)
+inline at::Tensor & Tensor::put_(const at::Tensor & index, const at::Tensor & source, bool accumulate) const {
+    return at::_ops::put_::call(const_cast<Tensor&>(*this), index, source, accumulate);
+}
+
+// aten::put(Tensor self, Tensor index, Tensor source, bool accumulate=False) -> Tensor
+inline at::Tensor Tensor::put(const at::Tensor & index, const at::Tensor & source, bool accumulate) const {
+    return at::_ops::put::call(const_cast<Tensor&>(*this), index, source, accumulate);
+}
+
+// aten::index_add_(Tensor(a!) self, int dim, Tensor index, Tensor source, *, Scalar alpha=1) -> Tensor(a!)
+inline at::Tensor & Tensor::index_add_(int64_t dim, const at::Tensor & index, const at::Tensor & source, const at::Scalar & alpha) const {
+    return at::_ops::index_add_::call(const_cast<Tensor&>(*this), dim, index, source, alpha);
+}
+
+// aten::index_add(Tensor self, int dim, Tensor index, Tensor source, *, Scalar alpha=1) -> Tensor
+inline at::Tensor Tensor::index_add(int64_t dim, const at::Tensor & index, const at::Tensor & source, const at::Scalar & alpha) const {
+    return at::_ops::index_add::call(const_cast<Tensor&>(*this), dim, index, source, alpha);
+}
+
+// aten::index_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor source, *, Scalar alpha=1) -> Tensor
+inline at::Tensor Tensor::index_add(at::Dimname dim, const at::Tensor & index, const at::Tensor & source, const at::Scalar & alpha) const {
+    return at::_ops::index_add_dimname::call(const_cast<Tensor&>(*this), dim, index, source, alpha);
+}
+
+// aten::index_reduce_(Tensor(a!) self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True) -> Tensor(a!)
+inline at::Tensor & Tensor::index_reduce_(int64_t dim, const at::Tensor & index, const at::Tensor & source, c10::string_view reduce, bool include_self) const {
+    return at::_ops::index_reduce_::call(const_cast<Tensor&>(*this), dim, index, source, reduce, include_self);
+}
+
+// aten::index_reduce(Tensor self, int dim, Tensor index, Tensor source, str reduce, *, bool include_self=True) -> Tensor
+inline at::Tensor Tensor::index_reduce(int64_t dim, const at::Tensor & index, const at::Tensor & source, c10::string_view reduce, bool include_self) const {
+    return at::_ops::index_reduce::call(const_cast<Tensor&>(*this), dim, index, source, reduce, include_self);
+}
+
+// aten::index_fill_.int_Scalar(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!)
+inline at::Tensor & Tensor::index_fill_(int64_t dim, const at::Tensor & index, const at::Scalar & value) const {
+    return at::_ops::index_fill__int_Scalar::call(const_cast<Tensor&>(*this), dim, index, value);
+}
+
+// aten::index_fill.int_Scalar(Tensor self, int dim, Tensor index, Scalar value) -> Tensor
+inline at::Tensor Tensor::index_fill(int64_t dim, const at::Tensor & index, const at::Scalar & value) const {
+    return at::_ops::index_fill_int_Scalar::call(const_cast<Tensor&>(*this), dim, index, value);
+}
+
+// aten::index_fill_.int_Tensor(Tensor(a!) self, int dim, Tensor index, Tensor value) -> Tensor(a!)
+inline at::Tensor & Tensor::index_fill_(int64_t dim, const at::Tensor & index, const at::Tensor & value) const {
+    return at::_ops::index_fill__int_Tensor::call(const_cast<Tensor&>(*this), dim, index, value);
+}
+
+// aten::index_fill.int_Tensor(Tensor self, int dim, Tensor index, Tensor value) -> Tensor
+inline at::Tensor Tensor::index_fill(int64_t dim, const at::Tensor & index, const at::Tensor & value) const {
+    return at::_ops::index_fill_int_Tensor::call(const_cast<Tensor&>(*this), dim, index, value);
+}
+
+// aten::index_fill_.Dimname_Scalar(Tensor(a!) self, Dimname dim, Tensor index, Scalar value) -> Tensor(a!)
+inline at::Tensor & Tensor::index_fill_(at::Dimname dim, const at::Tensor & index, const at::Scalar & value) const {
+    return at::_ops::index_fill__Dimname_Scalar::call(const_cast<Tensor&>(*this), dim, index, value);
+}
+
+// aten::index_fill_.Dimname_Tensor(Tensor(a!) self, Dimname dim, Tensor index, Tensor value) -> Tensor(a!)
+inline at::Tensor & Tensor::index_fill_(at::Dimname dim, const at::Tensor & index, const at::Tensor & value) const {
+    return at::_ops::index_fill__Dimname_Tensor::call(const_cast<Tensor&>(*this), dim, index, value);
+}
+
+// aten::index_fill.Dimname_Scalar(Tensor self, Dimname dim, Tensor index, Scalar value) -> Tensor
+inline at::Tensor Tensor::index_fill(at::Dimname dim, const at::Tensor & index, const at::Scalar & value) const {
+    return at::_ops::index_fill_Dimname_Scalar::call(const_cast<Tensor&>(*this), dim, index, value);
+}
+
+// aten::index_fill.Dimname_Tensor(Tensor self, Dimname dim, Tensor index, Tensor value) -> Tensor
+inline at::Tensor Tensor::index_fill(at::Dimname dim, const at::Tensor & index, const at::Tensor & value) const {
+    return at::_ops::index_fill_Dimname_Tensor::call(const_cast<Tensor&>(*this), dim, index, value);
+}
+
+// aten::scatter.src(Tensor self, int dim, Tensor index, Tensor src) -> Tensor
+inline at::Tensor Tensor::scatter(int64_t dim, const at::Tensor & index, const at::Tensor & src) const {
+    return at::_ops::scatter_src::call(const_cast<Tensor&>(*this), dim, index, src);
+}
+
+// aten::scatter_.src(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!)
+inline at::Tensor & Tensor::scatter_(int64_t dim, const at::Tensor & index, const at::Tensor & src) const {
+    return at::_ops::scatter__src::call(const_cast<Tensor&>(*this), dim, index, src);
+}
+
+// aten::scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> Tensor
+inline at::Tensor Tensor::scatter(int64_t dim, const at::Tensor & index, const at::Scalar & value) const {
+    return at::_ops::scatter_value::call(const_cast<Tensor&>(*this), dim, index, value);
+}
+
+// aten::scatter_.value(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!)
+inline at::Tensor & Tensor::scatter_(int64_t dim, const at::Tensor & index, const at::Scalar & value) const {
+    return at::_ops::scatter__value::call(const_cast<Tensor&>(*this), dim, index, value);
+}
+
+// aten::scatter.reduce(Tensor self, int dim, Tensor index, Tensor src, *, str reduce) -> Tensor
+inline at::Tensor Tensor::scatter(int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce) const {
+    return at::_ops::scatter_reduce::call(const_cast<Tensor&>(*this), dim, index, src, reduce);
+}
+
+// aten::scatter_.reduce(Tensor(a!) self, int dim, Tensor index, Tensor src, *, str reduce) -> Tensor(a!)
+inline at::Tensor & Tensor::scatter_(int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce) const {
+    return at::_ops::scatter__reduce::call(const_cast<Tensor&>(*this), dim, index, src, reduce);
+}
+
+// aten::scatter.value_reduce(Tensor self, int dim, Tensor index, Scalar value, *, str reduce) -> Tensor
+inline at::Tensor Tensor::scatter(int64_t dim, const at::Tensor & index, const at::Scalar & value, c10::string_view reduce) const {
+    return at::_ops::scatter_value_reduce::call(const_cast<Tensor&>(*this), dim, index, value, reduce);
+}
+
+// aten::scatter_.value_reduce(Tensor(a!) self, int dim, Tensor index, Scalar value, *, str reduce) -> Tensor(a!)
+inline at::Tensor & Tensor::scatter_(int64_t dim, const at::Tensor & index, const at::Scalar & value, c10::string_view reduce) const {
+    return at::_ops::scatter__value_reduce::call(const_cast<Tensor&>(*this), dim, index, value, reduce);
+}
+
+// aten::scatter.dimname_src(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor
+inline at::Tensor Tensor::scatter(at::Dimname dim, const at::Tensor & index, const at::Tensor & src) const {
+    return at::_ops::scatter_dimname_src::call(const_cast<Tensor&>(*this), dim, index, src);
+}
+
+// aten::scatter.dimname_value(Tensor self, Dimname dim, Tensor index, Scalar value) -> Tensor
+inline at::Tensor Tensor::scatter(at::Dimname dim, const at::Tensor & index, const at::Scalar & value) const {
+    return at::_ops::scatter_dimname_value::call(const_cast<Tensor&>(*this), dim, index, value);
+}
+
+// aten::scatter_add(Tensor self, int dim, Tensor index, Tensor src) -> Tensor
+inline at::Tensor Tensor::scatter_add(int64_t dim, const at::Tensor & index, const at::Tensor & src) const {
+    return at::_ops::scatter_add::call(const_cast<Tensor&>(*this), dim, index, src);
+}
+
+// aten::scatter_add_(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!)
+inline at::Tensor & Tensor::scatter_add_(int64_t dim, const at::Tensor & index, const at::Tensor & src) const {
+    return at::_ops::scatter_add_::call(const_cast<Tensor&>(*this), dim, index, src);
+}
+
+// aten::scatter_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor
+inline at::Tensor Tensor::scatter_add(at::Dimname dim, const at::Tensor & index, const at::Tensor & src) const {
+    return at::_ops::scatter_add_dimname::call(const_cast<Tensor&>(*this), dim, index, src);
+}
+
+// aten::scatter_reduce.two(Tensor self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor
+inline at::Tensor Tensor::scatter_reduce(int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self) const {
+    return at::_ops::scatter_reduce_two::call(const_cast<Tensor&>(*this), dim, index, src, reduce, include_self);
+}
+
+// aten::scatter_reduce_.two(Tensor(a!) self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor(a!)
+inline at::Tensor & Tensor::scatter_reduce_(int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self) const {
+    return at::_ops::scatter_reduce__two::call(const_cast<Tensor&>(*this), dim, index, src, reduce, include_self);
+}
+
+// aten::eq_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+inline at::Tensor & Tensor::eq_(const at::Scalar & other) const {
+    return at::_ops::eq__Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::eq_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::eq_(const at::Tensor & other) const {
+    return at::_ops::eq__Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::bitwise_and.Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor Tensor::bitwise_and(const at::Scalar & other) const {
+    return at::_ops::bitwise_and_Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::bitwise_and.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::bitwise_and(const at::Tensor & other) const {
+    return at::_ops::bitwise_and_Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::bitwise_and_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+inline at::Tensor & Tensor::bitwise_and_(const at::Scalar & other) const {
+    return at::_ops::bitwise_and__Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::bitwise_and_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::bitwise_and_(const at::Tensor & other) const {
+    return at::_ops::bitwise_and__Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::__and__.Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor Tensor::__and__(const at::Scalar & other) const {
+    return at::_ops::__and___Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::__and__.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::__and__(const at::Tensor & other) const {
+    return at::_ops::__and___Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::__iand__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+inline at::Tensor & Tensor::__iand__(const at::Scalar & other) const {
+    return at::_ops::__iand___Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::__iand__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::__iand__(const at::Tensor & other) const {
+    return at::_ops::__iand___Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::bitwise_or.Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor Tensor::bitwise_or(const at::Scalar & other) const {
+    return at::_ops::bitwise_or_Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::bitwise_or.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::bitwise_or(const at::Tensor & other) const {
+    return at::_ops::bitwise_or_Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::bitwise_or_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+inline at::Tensor & Tensor::bitwise_or_(const at::Scalar & other) const {
+    return at::_ops::bitwise_or__Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::bitwise_or_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::bitwise_or_(const at::Tensor & other) const {
+    return at::_ops::bitwise_or__Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::__or__.Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor Tensor::__or__(const at::Scalar & other) const {
+    return at::_ops::__or___Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::__or__.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::__or__(const at::Tensor & other) const {
+    return at::_ops::__or___Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::__ior__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+inline at::Tensor & Tensor::__ior__(const at::Scalar & other) const {
+    return at::_ops::__ior___Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::__ior__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::__ior__(const at::Tensor & other) const {
+    return at::_ops::__ior___Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::bitwise_xor.Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor Tensor::bitwise_xor(const at::Scalar & other) const {
+    return at::_ops::bitwise_xor_Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::bitwise_xor.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::bitwise_xor(const at::Tensor & other) const {
+    return at::_ops::bitwise_xor_Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::bitwise_xor_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+inline at::Tensor & Tensor::bitwise_xor_(const at::Scalar & other) const {
+    return at::_ops::bitwise_xor__Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::bitwise_xor_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::bitwise_xor_(const at::Tensor & other) const {
+    return at::_ops::bitwise_xor__Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::__xor__.Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor Tensor::__xor__(const at::Scalar & other) const {
+    return at::_ops::__xor___Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::__xor__.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::__xor__(const at::Tensor & other) const {
+    return at::_ops::__xor___Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::__ixor__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+inline at::Tensor & Tensor::__ixor__(const at::Scalar & other) const {
+    return at::_ops::__ixor___Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::__ixor__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::__ixor__(const at::Tensor & other) const {
+    return at::_ops::__ixor___Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::__lshift__.Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor Tensor::__lshift__(const at::Scalar & other) const {
+    return at::_ops::__lshift___Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::__lshift__.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::__lshift__(const at::Tensor & other) const {
+    return at::_ops::__lshift___Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::__ilshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+inline at::Tensor & Tensor::__ilshift__(const at::Scalar & other) const {
+    return at::_ops::__ilshift___Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::__ilshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::__ilshift__(const at::Tensor & other) const {
+    return at::_ops::__ilshift___Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::bitwise_left_shift.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::bitwise_left_shift(const at::Tensor & other) const {
+    return at::_ops::bitwise_left_shift_Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::bitwise_left_shift_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::bitwise_left_shift_(const at::Tensor & other) const {
+    return at::_ops::bitwise_left_shift__Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::bitwise_left_shift.Tensor_Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor Tensor::bitwise_left_shift(const at::Scalar & other) const {
+    return at::_ops::bitwise_left_shift_Tensor_Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::bitwise_left_shift_.Tensor_Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+inline at::Tensor & Tensor::bitwise_left_shift_(const at::Scalar & other) const {
+    return at::_ops::bitwise_left_shift__Tensor_Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::__rshift__.Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor Tensor::__rshift__(const at::Scalar & other) const {
+    return at::_ops::__rshift___Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::__rshift__.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::__rshift__(const at::Tensor & other) const {
+    return at::_ops::__rshift___Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::__irshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+inline at::Tensor & Tensor::__irshift__(const at::Scalar & other) const {
+    return at::_ops::__irshift___Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::__irshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::__irshift__(const at::Tensor & other) const {
+    return at::_ops::__irshift___Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::bitwise_right_shift.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::bitwise_right_shift(const at::Tensor & other) const {
+    return at::_ops::bitwise_right_shift_Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::bitwise_right_shift_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::bitwise_right_shift_(const at::Tensor & other) const {
+    return at::_ops::bitwise_right_shift__Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::bitwise_right_shift.Tensor_Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor Tensor::bitwise_right_shift(const at::Scalar & other) const {
+    return at::_ops::bitwise_right_shift_Tensor_Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::bitwise_right_shift_.Tensor_Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+inline at::Tensor & Tensor::bitwise_right_shift_(const at::Scalar & other) const {
+    return at::_ops::bitwise_right_shift__Tensor_Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!)
+inline at::Tensor & Tensor::tril_(int64_t diagonal) const {
+    return at::_ops::tril_::call(const_cast<Tensor&>(*this), diagonal);
+}
+
+// aten::triu_(Tensor(a!) self, int diagonal=0) -> Tensor(a!)
+inline at::Tensor & Tensor::triu_(int64_t diagonal) const {
+    return at::_ops::triu_::call(const_cast<Tensor&>(*this), diagonal);
+}
+
+// aten::digamma_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::digamma_() const {
+    return at::_ops::digamma_::call(const_cast<Tensor&>(*this));
+}
+
+// aten::lerp_.Scalar(Tensor(a!) self, Tensor end, Scalar weight) -> Tensor(a!)
+inline at::Tensor & Tensor::lerp_(const at::Tensor & end, const at::Scalar & weight) const {
+    return at::_ops::lerp__Scalar::call(const_cast<Tensor&>(*this), end, weight);
+}
+
+// aten::lerp_.Tensor(Tensor(a!) self, Tensor end, Tensor weight) -> Tensor(a!)
+inline at::Tensor & Tensor::lerp_(const at::Tensor & end, const at::Tensor & weight) const {
+    return at::_ops::lerp__Tensor::call(const_cast<Tensor&>(*this), end, weight);
+}
+
+// aten::addbmm_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
+inline at::Tensor & Tensor::addbmm_(const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta, const at::Scalar & alpha) const {
+    return at::_ops::addbmm_::call(const_cast<Tensor&>(*this), batch1, batch2, beta, alpha);
+}
+
+// aten::addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
+inline at::Tensor Tensor::addbmm(const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta, const at::Scalar & alpha) const {
+    return at::_ops::addbmm::call(const_cast<Tensor&>(*this), batch1, batch2, beta, alpha);
+}
+
+// aten::random_.from(Tensor(a!) self, int from, int? to, *, Generator? generator=None) -> Tensor(a!)
+inline at::Tensor & Tensor::random_(int64_t from, c10::optional<int64_t> to, c10::optional<at::Generator> generator) const {
+    return at::_ops::random__from::call(const_cast<Tensor&>(*this), from, to, generator);
+}
+
+// aten::random_.to(Tensor(a!) self, int to, *, Generator? generator=None) -> Tensor(a!)
+inline at::Tensor & Tensor::random_(int64_t to, c10::optional<at::Generator> generator) const {
+    return at::_ops::random__to::call(const_cast<Tensor&>(*this), to, generator);
+}
+
+// aten::random_(Tensor(a!) self, *, Generator? generator=None) -> Tensor(a!)
+inline at::Tensor & Tensor::random_(c10::optional<at::Generator> generator) const {
+    return at::_ops::random_::call(const_cast<Tensor&>(*this), generator);
+}
+
+// aten::uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!)
+inline at::Tensor & Tensor::uniform_(double from, double to, c10::optional<at::Generator> generator) const {
+    return at::_ops::uniform_::call(const_cast<Tensor&>(*this), from, to, generator);
+}
+
+// aten::cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!)
+inline at::Tensor & Tensor::cauchy_(double median, double sigma, c10::optional<at::Generator> generator) const {
+    return at::_ops::cauchy_::call(const_cast<Tensor&>(*this), median, sigma, generator);
+}
+
+// aten::log_normal_(Tensor(a!) self, float mean=1, float std=2, *, Generator? generator=None) -> Tensor(a!)
+inline at::Tensor & Tensor::log_normal_(double mean, double std, c10::optional<at::Generator> generator) const {
+    return at::_ops::log_normal_::call(const_cast<Tensor&>(*this), mean, std, generator);
+}
+
+// aten::exponential_(Tensor(a!) self, float lambd=1, *, Generator? generator=None) -> Tensor(a!)
+inline at::Tensor & Tensor::exponential_(double lambd, c10::optional<at::Generator> generator) const {
+    return at::_ops::exponential_::call(const_cast<Tensor&>(*this), lambd, generator);
+}
+
+// aten::geometric_(Tensor(a!) self, float p, *, Generator? generator=None) -> Tensor(a!)
+inline at::Tensor & Tensor::geometric_(double p, c10::optional<at::Generator> generator) const {
+    return at::_ops::geometric_::call(const_cast<Tensor&>(*this), p, generator);
+}
+
+// aten::diag(Tensor self, int diagonal=0) -> Tensor
+inline at::Tensor Tensor::diag(int64_t diagonal) const {
+    return at::_ops::diag::call(const_cast<Tensor&>(*this), diagonal);
+}
+
+// aten::cross(Tensor self, Tensor other, int? dim=None) -> Tensor
+inline at::Tensor Tensor::cross(const at::Tensor & other, c10::optional<int64_t> dim) const {
+    return at::_ops::cross::call(const_cast<Tensor&>(*this), other, dim);
+}
+
+// aten::triu(Tensor self, int diagonal=0) -> Tensor
+inline at::Tensor Tensor::triu(int64_t diagonal) const {
+    return at::_ops::triu::call(const_cast<Tensor&>(*this), diagonal);
+}
+
+// aten::tril(Tensor self, int diagonal=0) -> Tensor
+inline at::Tensor Tensor::tril(int64_t diagonal) const {
+    return at::_ops::tril::call(const_cast<Tensor&>(*this), diagonal);
+}
+
+// aten::trace(Tensor self) -> Tensor
+inline at::Tensor Tensor::trace() const {
+    return at::_ops::trace::call(const_cast<Tensor&>(*this));
+}
+
+// aten::ne.Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor Tensor::ne(const at::Scalar & other) const {
+    return at::_ops::ne_Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::ne.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::ne(const at::Tensor & other) const {
+    return at::_ops::ne_Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::ne_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+inline at::Tensor & Tensor::ne_(const at::Scalar & other) const {
+    return at::_ops::ne__Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::ne_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::ne_(const at::Tensor & other) const {
+    return at::_ops::ne__Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::not_equal.Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor Tensor::not_equal(const at::Scalar & other) const {
+    return at::_ops::not_equal_Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::not_equal.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::not_equal(const at::Tensor & other) const {
+    return at::_ops::not_equal_Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::not_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+inline at::Tensor & Tensor::not_equal_(const at::Scalar & other) const {
+    return at::_ops::not_equal__Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::not_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::not_equal_(const at::Tensor & other) const {
+    return at::_ops::not_equal__Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::eq.Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor Tensor::eq(const at::Scalar & other) const {
+    return at::_ops::eq_Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::eq.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::eq(const at::Tensor & other) const {
+    return at::_ops::eq_Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::ge.Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor Tensor::ge(const at::Scalar & other) const {
+    return at::_ops::ge_Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::ge.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::ge(const at::Tensor & other) const {
+    return at::_ops::ge_Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::ge_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+inline at::Tensor & Tensor::ge_(const at::Scalar & other) const {
+    return at::_ops::ge__Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::ge_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::ge_(const at::Tensor & other) const {
+    return at::_ops::ge__Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::greater_equal.Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor Tensor::greater_equal(const at::Scalar & other) const {
+    return at::_ops::greater_equal_Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::greater_equal.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::greater_equal(const at::Tensor & other) const {
+    return at::_ops::greater_equal_Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::greater_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+inline at::Tensor & Tensor::greater_equal_(const at::Scalar & other) const {
+    return at::_ops::greater_equal__Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::greater_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::greater_equal_(const at::Tensor & other) const {
+    return at::_ops::greater_equal__Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::le.Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor Tensor::le(const at::Scalar & other) const {
+    return at::_ops::le_Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::le.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::le(const at::Tensor & other) const {
+    return at::_ops::le_Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::le_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+inline at::Tensor & Tensor::le_(const at::Scalar & other) const {
+    return at::_ops::le__Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::le_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::le_(const at::Tensor & other) const {
+    return at::_ops::le__Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::less_equal.Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor Tensor::less_equal(const at::Scalar & other) const {
+    return at::_ops::less_equal_Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::less_equal.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::less_equal(const at::Tensor & other) const {
+    return at::_ops::less_equal_Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::less_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+inline at::Tensor & Tensor::less_equal_(const at::Scalar & other) const {
+    return at::_ops::less_equal__Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::less_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::less_equal_(const at::Tensor & other) const {
+    return at::_ops::less_equal__Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::gt.Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor Tensor::gt(const at::Scalar & other) const {
+    return at::_ops::gt_Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::gt.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::gt(const at::Tensor & other) const {
+    return at::_ops::gt_Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::gt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+inline at::Tensor & Tensor::gt_(const at::Scalar & other) const {
+    return at::_ops::gt__Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::gt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::gt_(const at::Tensor & other) const {
+    return at::_ops::gt__Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::greater.Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor Tensor::greater(const at::Scalar & other) const {
+    return at::_ops::greater_Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::greater.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::greater(const at::Tensor & other) const {
+    return at::_ops::greater_Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::greater_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+inline at::Tensor & Tensor::greater_(const at::Scalar & other) const {
+    return at::_ops::greater__Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::greater_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::greater_(const at::Tensor & other) const {
+    return at::_ops::greater__Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::lt.Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor Tensor::lt(const at::Scalar & other) const {
+    return at::_ops::lt_Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::lt.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::lt(const at::Tensor & other) const {
+    return at::_ops::lt_Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::lt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+inline at::Tensor & Tensor::lt_(const at::Scalar & other) const {
+    return at::_ops::lt__Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::lt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::lt_(const at::Tensor & other) const {
+    return at::_ops::lt__Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::less.Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor Tensor::less(const at::Scalar & other) const {
+    return at::_ops::less_Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::less.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::less(const at::Tensor & other) const {
+    return at::_ops::less_Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::less_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+inline at::Tensor & Tensor::less_(const at::Scalar & other) const {
+    return at::_ops::less__Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::less_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::less_(const at::Tensor & other) const {
+    return at::_ops::less__Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::take(Tensor self, Tensor index) -> Tensor
+inline at::Tensor Tensor::take(const at::Tensor & index) const {
+    return at::_ops::take::call(const_cast<Tensor&>(*this), index);
+}
+
+// aten::take_along_dim(Tensor self, Tensor indices, int? dim=None) -> Tensor
+inline at::Tensor Tensor::take_along_dim(const at::Tensor & indices, c10::optional<int64_t> dim) const {
+    return at::_ops::take_along_dim::call(const_cast<Tensor&>(*this), indices, dim);
+}
+
+// aten::index_select(Tensor self, int dim, Tensor index) -> Tensor
+inline at::Tensor Tensor::index_select(int64_t dim, const at::Tensor & index) const {
+    return at::_ops::index_select::call(const_cast<Tensor&>(*this), dim, index);
+}
+
+// aten::index_select.dimname(Tensor self, Dimname dim, Tensor index) -> Tensor
+inline at::Tensor Tensor::index_select(at::Dimname dim, const at::Tensor & index) const {
+    return at::_ops::index_select_dimname::call(const_cast<Tensor&>(*this), dim, index);
+}
+
+// aten::masked_select(Tensor self, Tensor mask) -> Tensor
+inline at::Tensor Tensor::masked_select(const at::Tensor & mask) const {
+    return at::_ops::masked_select::call(const_cast<Tensor&>(*this), mask);
+}
+
+// aten::nonzero(Tensor self) -> Tensor
+inline at::Tensor Tensor::nonzero() const {
+    return at::_ops::nonzero::call(const_cast<Tensor&>(*this));
+}
+
+// aten::nonzero_static(Tensor self, *, int size, int fill_value=-1) -> Tensor
+inline at::Tensor Tensor::nonzero_static(int64_t size, int64_t fill_value) const {
+    return at::_ops::nonzero_static::call(const_cast<Tensor&>(*this), size, fill_value);
+}
+
+// aten::nonzero_numpy(Tensor self) -> Tensor[]
+inline ::std::vector<at::Tensor> Tensor::nonzero_numpy() const {
+    return at::_ops::nonzero_numpy::call(const_cast<Tensor&>(*this));
+}
+
+// aten::argwhere(Tensor self) -> Tensor
+inline at::Tensor Tensor::argwhere() const {
+    return at::_ops::argwhere::call(const_cast<Tensor&>(*this));
+}
+
+// aten::gather(Tensor self, int dim, Tensor index, *, bool sparse_grad=False) -> Tensor
+inline at::Tensor Tensor::gather(int64_t dim, const at::Tensor & index, bool sparse_grad) const {
+    return at::_ops::gather::call(const_cast<Tensor&>(*this), dim, index, sparse_grad);
+}
+
+// aten::gather.dimname(Tensor self, Dimname dim, Tensor index, *, bool sparse_grad=False) -> Tensor
+inline at::Tensor Tensor::gather(at::Dimname dim, const at::Tensor & index, bool sparse_grad) const {
+    return at::_ops::gather_dimname::call(const_cast<Tensor&>(*this), dim, index, sparse_grad);
+}
+
+// aten::addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor
+inline at::Tensor Tensor::addcmul(const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value) const {
+    return at::_ops::addcmul::call(const_cast<Tensor&>(*this), tensor1, tensor2, value);
+}
+
+// aten::addcmul_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!)
+inline at::Tensor & Tensor::addcmul_(const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value) const {
+    return at::_ops::addcmul_::call(const_cast<Tensor&>(*this), tensor1, tensor2, value);
+}
+
+// aten::addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor
+inline at::Tensor Tensor::addcdiv(const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value) const {
+    return at::_ops::addcdiv::call(const_cast<Tensor&>(*this), tensor1, tensor2, value);
+}
+
+// aten::addcdiv_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!)
+inline at::Tensor & Tensor::addcdiv_(const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value) const {
+    return at::_ops::addcdiv_::call(const_cast<Tensor&>(*this), tensor1, tensor2, value);
+}
+
+// aten::triangular_solve(Tensor self, Tensor A, bool upper=True, bool transpose=False, bool unitriangular=False) -> (Tensor solution, Tensor cloned_coefficient)
+inline ::std::tuple<at::Tensor,at::Tensor> Tensor::triangular_solve(const at::Tensor & A, bool upper, bool transpose, bool unitriangular) const {
+    return at::_ops::triangular_solve::call(const_cast<Tensor&>(*this), A, upper, transpose, unitriangular);
+}
+
+// aten::svd(Tensor self, bool some=True, bool compute_uv=True) -> (Tensor U, Tensor S, Tensor V)
+inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> Tensor::svd(bool some, bool compute_uv) const {
+    return at::_ops::svd::call(const_cast<Tensor&>(*this), some, compute_uv);
+}
+
+// aten::swapaxes(Tensor(a) self, int axis0, int axis1) -> Tensor(a)
+inline at::Tensor Tensor::swapaxes(int64_t axis0, int64_t axis1) const {
+    return at::_ops::swapaxes::call(const_cast<Tensor&>(*this), axis0, axis1);
+}
+
+// aten::swapaxes_(Tensor(a!) self, int axis0, int axis1) -> Tensor(a!)
+inline at::Tensor & Tensor::swapaxes_(int64_t axis0, int64_t axis1) const {
+    return at::_ops::swapaxes_::call(const_cast<Tensor&>(*this), axis0, axis1);
+}
+
+// aten::swapdims(Tensor(a) self, int dim0, int dim1) -> Tensor(a)
+inline at::Tensor Tensor::swapdims(int64_t dim0, int64_t dim1) const {
+    return at::_ops::swapdims::call(const_cast<Tensor&>(*this), dim0, dim1);
+}
+
+// aten::swapdims_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!)
+inline at::Tensor & Tensor::swapdims_(int64_t dim0, int64_t dim1) const {
+    return at::_ops::swapdims_::call(const_cast<Tensor&>(*this), dim0, dim1);
+}
+
+// aten::cholesky(Tensor self, bool upper=False) -> Tensor
+inline at::Tensor Tensor::cholesky(bool upper) const {
+    return at::_ops::cholesky::call(const_cast<Tensor&>(*this), upper);
+}
+
+// aten::cholesky_solve(Tensor self, Tensor input2, bool upper=False) -> Tensor
+inline at::Tensor Tensor::cholesky_solve(const at::Tensor & input2, bool upper) const {
+    return at::_ops::cholesky_solve::call(const_cast<Tensor&>(*this), input2, upper);
+}
+
+// aten::cholesky_inverse(Tensor self, bool upper=False) -> Tensor
+inline at::Tensor Tensor::cholesky_inverse(bool upper) const {
+    return at::_ops::cholesky_inverse::call(const_cast<Tensor&>(*this), upper);
+}
+
+// aten::qr(Tensor self, bool some=True) -> (Tensor Q, Tensor R)
+inline ::std::tuple<at::Tensor,at::Tensor> Tensor::qr(bool some) const {
+    return at::_ops::qr::call(const_cast<Tensor&>(*this), some);
+}
+
+// aten::geqrf(Tensor self) -> (Tensor a, Tensor tau)
+inline ::std::tuple<at::Tensor,at::Tensor> Tensor::geqrf() const {
+    return at::_ops::geqrf::call(const_cast<Tensor&>(*this));
+}
+
+// aten::orgqr(Tensor self, Tensor input2) -> Tensor
+inline at::Tensor Tensor::orgqr(const at::Tensor & input2) const {
+    return at::_ops::orgqr::call(const_cast<Tensor&>(*this), input2);
+}
+
+// aten::ormqr(Tensor self, Tensor input2, Tensor input3, bool left=True, bool transpose=False) -> Tensor
+inline at::Tensor Tensor::ormqr(const at::Tensor & input2, const at::Tensor & input3, bool left, bool transpose) const {
+    return at::_ops::ormqr::call(const_cast<Tensor&>(*this), input2, input3, left, transpose);
+}
+
+// aten::lu_solve(Tensor self, Tensor LU_data, Tensor LU_pivots) -> Tensor
+inline at::Tensor Tensor::lu_solve(const at::Tensor & LU_data, const at::Tensor & LU_pivots) const {
+    return at::_ops::lu_solve::call(const_cast<Tensor&>(*this), LU_data, LU_pivots);
+}
+
+// aten::multinomial(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None) -> Tensor
+inline at::Tensor Tensor::multinomial(int64_t num_samples, bool replacement, c10::optional<at::Generator> generator) const {
+    return at::_ops::multinomial::call(const_cast<Tensor&>(*this), num_samples, replacement, generator);
+}
+
+// aten::lgamma_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::lgamma_() const {
+    return at::_ops::lgamma_::call(const_cast<Tensor&>(*this));
+}
+
+// aten::lgamma(Tensor self) -> Tensor
+inline at::Tensor Tensor::lgamma() const {
+    return at::_ops::lgamma::call(const_cast<Tensor&>(*this));
+}
+
+// aten::digamma(Tensor self) -> Tensor
+inline at::Tensor Tensor::digamma() const {
+    return at::_ops::digamma::call(const_cast<Tensor&>(*this));
+}
+
+// aten::polygamma(int n, Tensor self) -> Tensor
+inline at::Tensor Tensor::polygamma(int64_t n) const {
+    return at::_ops::polygamma::call(n, const_cast<Tensor&>(*this));
+}
+
+// aten::polygamma_(Tensor(a!) self, int n) -> Tensor(a!)
+inline at::Tensor & Tensor::polygamma_(int64_t n) const {
+    return at::_ops::polygamma_::call(const_cast<Tensor&>(*this), n);
+}
+
+// aten::erfinv(Tensor self) -> Tensor
+inline at::Tensor Tensor::erfinv() const {
+    return at::_ops::erfinv::call(const_cast<Tensor&>(*this));
+}
+
+// aten::erfinv_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::erfinv_() const {
+    return at::_ops::erfinv_::call(const_cast<Tensor&>(*this));
+}
+
+// aten::i0(Tensor self) -> Tensor
+inline at::Tensor Tensor::i0() const {
+    return at::_ops::i0::call(const_cast<Tensor&>(*this));
+}
+
+// aten::i0_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::i0_() const {
+    return at::_ops::i0_::call(const_cast<Tensor&>(*this));
+}
+
+// aten::sign(Tensor self) -> Tensor
+inline at::Tensor Tensor::sign() const {
+    return at::_ops::sign::call(const_cast<Tensor&>(*this));
+}
+
+// aten::sign_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & Tensor::sign_() const {
+    return at::_ops::sign_::call(const_cast<Tensor&>(*this));
+}
+
+// aten::signbit(Tensor self) -> Tensor
+inline at::Tensor Tensor::signbit() const {
+    return at::_ops::signbit::call(const_cast<Tensor&>(*this));
+}
+
+// aten::dist(Tensor self, Tensor other, Scalar p=2) -> Tensor
+inline at::Tensor Tensor::dist(const at::Tensor & other, const at::Scalar & p) const {
+    return at::_ops::dist::call(const_cast<Tensor&>(*this), other, p);
+}
+
+// aten::atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::atan2_(const at::Tensor & other) const {
+    return at::_ops::atan2_::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::atan2(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::atan2(const at::Tensor & other) const {
+    return at::_ops::atan2::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::arctan2(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::arctan2(const at::Tensor & other) const {
+    return at::_ops::arctan2::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::arctan2_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::arctan2_(const at::Tensor & other) const {
+    return at::_ops::arctan2_::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::lerp.Scalar(Tensor self, Tensor end, Scalar weight) -> Tensor
+inline at::Tensor Tensor::lerp(const at::Tensor & end, const at::Scalar & weight) const {
+    return at::_ops::lerp_Scalar::call(const_cast<Tensor&>(*this), end, weight);
+}
+
+// aten::lerp.Tensor(Tensor self, Tensor end, Tensor weight) -> Tensor
+inline at::Tensor Tensor::lerp(const at::Tensor & end, const at::Tensor & weight) const {
+    return at::_ops::lerp_Tensor::call(const_cast<Tensor&>(*this), end, weight);
+}
+
+// aten::histc(Tensor self, int bins=100, Scalar min=0, Scalar max=0) -> Tensor
+inline at::Tensor Tensor::histc(int64_t bins, const at::Scalar & min, const at::Scalar & max) const {
+    return at::_ops::histc::call(const_cast<Tensor&>(*this), bins, min, max);
+}
+
+// aten::histogram.bins_tensor(Tensor self, Tensor bins, *, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor bin_edges)
+inline ::std::tuple<at::Tensor,at::Tensor> Tensor::histogram(const at::Tensor & bins, const c10::optional<at::Tensor> & weight, bool density) const {
+    return at::_ops::histogram_bins_tensor::call(const_cast<Tensor&>(*this), bins, weight, density);
+}
+
+// aten::histogram.bin_ct(Tensor self, int bins=100, *, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor bin_edges)
+inline ::std::tuple<at::Tensor,at::Tensor> Tensor::histogram(int64_t bins, c10::optional<at::ArrayRef<double>> range, const c10::optional<at::Tensor> & weight, bool density) const {
+    return at::_ops::histogram_bin_ct::call(const_cast<Tensor&>(*this), bins, range, weight, density);
+}
+
+// aten::fmod.Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor Tensor::fmod(const at::Scalar & other) const {
+    return at::_ops::fmod_Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::fmod_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+inline at::Tensor & Tensor::fmod_(const at::Scalar & other) const {
+    return at::_ops::fmod__Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::fmod.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::fmod(const at::Tensor & other) const {
+    return at::_ops::fmod_Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::fmod_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::fmod_(const at::Tensor & other) const {
+    return at::_ops::fmod__Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::hypot(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::hypot(const at::Tensor & other) const {
+    return at::_ops::hypot::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::hypot_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::hypot_(const at::Tensor & other) const {
+    return at::_ops::hypot_::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::igamma(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::igamma(const at::Tensor & other) const {
+    return at::_ops::igamma::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::igamma_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::igamma_(const at::Tensor & other) const {
+    return at::_ops::igamma_::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::igammac(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::igammac(const at::Tensor & other) const {
+    return at::_ops::igammac::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::igammac_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::igammac_(const at::Tensor & other) const {
+    return at::_ops::igammac_::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::nextafter(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::nextafter(const at::Tensor & other) const {
+    return at::_ops::nextafter::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::nextafter_(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::nextafter_(const at::Tensor & other) const {
+    return at::_ops::nextafter_::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::remainder.Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor Tensor::remainder(const at::Scalar & other) const {
+    return at::_ops::remainder_Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::remainder_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
+inline at::Tensor & Tensor::remainder_(const at::Scalar & other) const {
+    return at::_ops::remainder__Scalar::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::remainder.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::remainder(const at::Tensor & other) const {
+    return at::_ops::remainder_Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::remainder_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
+inline at::Tensor & Tensor::remainder_(const at::Tensor & other) const {
+    return at::_ops::remainder__Tensor::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::min(Tensor self) -> Tensor
+inline at::Tensor Tensor::min() const {
+    return at::_ops::min::call(const_cast<Tensor&>(*this));
+}
+
+// aten::fmin(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::fmin(const at::Tensor & other) const {
+    return at::_ops::fmin::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::max(Tensor self) -> Tensor
+inline at::Tensor Tensor::max() const {
+    return at::_ops::max::call(const_cast<Tensor&>(*this));
+}
+
+// aten::fmax(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::fmax(const at::Tensor & other) const {
+    return at::_ops::fmax::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::maximum(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::maximum(const at::Tensor & other) const {
+    return at::_ops::maximum::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::max.other(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::max(const at::Tensor & other) const {
+    return at::_ops::max_other::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::minimum(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::minimum(const at::Tensor & other) const {
+    return at::_ops::minimum::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::min.other(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::min(const at::Tensor & other) const {
+    return at::_ops::min_other::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::quantile(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, str interpolation='linear') -> Tensor
+inline at::Tensor Tensor::quantile(const at::Tensor & q, c10::optional<int64_t> dim, bool keepdim, c10::string_view interpolation) const {
+    return at::_ops::quantile::call(const_cast<Tensor&>(*this), q, dim, keepdim, interpolation);
+}
+
+// aten::quantile.scalar(Tensor self, float q, int? dim=None, bool keepdim=False, *, str interpolation='linear') -> Tensor
+inline at::Tensor Tensor::quantile(double q, c10::optional<int64_t> dim, bool keepdim, c10::string_view interpolation) const {
+    return at::_ops::quantile_scalar::call(const_cast<Tensor&>(*this), q, dim, keepdim, interpolation);
+}
+
+// aten::nanquantile(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, str interpolation='linear') -> Tensor
+inline at::Tensor Tensor::nanquantile(const at::Tensor & q, c10::optional<int64_t> dim, bool keepdim, c10::string_view interpolation) const {
+    return at::_ops::nanquantile::call(const_cast<Tensor&>(*this), q, dim, keepdim, interpolation);
+}
+
+// aten::nanquantile.scalar(Tensor self, float q, int? dim=None, bool keepdim=False, *, str interpolation='linear') -> Tensor
+inline at::Tensor Tensor::nanquantile(double q, c10::optional<int64_t> dim, bool keepdim, c10::string_view interpolation) const {
+    return at::_ops::nanquantile_scalar::call(const_cast<Tensor&>(*this), q, dim, keepdim, interpolation);
+}
+
+// aten::sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices)
+inline ::std::tuple<at::Tensor,at::Tensor> Tensor::sort(int64_t dim, bool descending) const {
+    return at::_ops::sort::call(const_cast<Tensor&>(*this), dim, descending);
+}
+
+// aten::sort.stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices)
+inline ::std::tuple<at::Tensor,at::Tensor> Tensor::sort(c10::optional<bool> stable, int64_t dim, bool descending) const {
+    return at::_ops::sort_stable::call(const_cast<Tensor&>(*this), stable, dim, descending);
+}
+
+// aten::sort.dimname(Tensor self, Dimname dim, bool descending=False) -> (Tensor values, Tensor indices)
+inline ::std::tuple<at::Tensor,at::Tensor> Tensor::sort(at::Dimname dim, bool descending) const {
+    return at::_ops::sort_dimname::call(const_cast<Tensor&>(*this), dim, descending);
+}
+
+// aten::sort.dimname_stable(Tensor self, *, bool? stable, Dimname dim, bool descending=False) -> (Tensor values, Tensor indices)
+inline ::std::tuple<at::Tensor,at::Tensor> Tensor::sort(c10::optional<bool> stable, at::Dimname dim, bool descending) const {
+    return at::_ops::sort_dimname_stable::call(const_cast<Tensor&>(*this), stable, dim, descending);
+}
+
+// aten::msort(Tensor self) -> Tensor
+inline at::Tensor Tensor::msort() const {
+    return at::_ops::msort::call(const_cast<Tensor&>(*this));
+}
+
+// aten::argsort(Tensor self, int dim=-1, bool descending=False) -> Tensor
+inline at::Tensor Tensor::argsort(int64_t dim, bool descending) const {
+    return at::_ops::argsort::call(const_cast<Tensor&>(*this), dim, descending);
+}
+
+// aten::argsort.stable(Tensor self, *, bool stable, int dim=-1, bool descending=False) -> Tensor
+inline at::Tensor Tensor::argsort(bool stable, int64_t dim, bool descending) const {
+    return at::_ops::argsort_stable::call(const_cast<Tensor&>(*this), stable, dim, descending);
+}
+
+// aten::argsort.dimname(Tensor self, Dimname dim, bool descending=False) -> Tensor
+inline at::Tensor Tensor::argsort(at::Dimname dim, bool descending) const {
+    return at::_ops::argsort_dimname::call(const_cast<Tensor&>(*this), dim, descending);
+}
+
+// aten::topk(Tensor self, SymInt k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices)
+inline ::std::tuple<at::Tensor,at::Tensor> Tensor::topk(int64_t k, int64_t dim, bool largest, bool sorted) const {
+    return at::_ops::topk::call(const_cast<Tensor&>(*this), k, dim, largest, sorted);
+}
+
+// aten::topk(Tensor self, SymInt k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices)
+inline ::std::tuple<at::Tensor,at::Tensor> Tensor::topk_symint(c10::SymInt k, int64_t dim, bool largest, bool sorted) const {
+    return at::_ops::topk::call(const_cast<Tensor&>(*this), k, dim, largest, sorted);
+}
+
+// aten::all(Tensor self) -> Tensor
+inline at::Tensor Tensor::all() const {
+    return at::_ops::all::call(const_cast<Tensor&>(*this));
+}
+
+// aten::any(Tensor self) -> Tensor
+inline at::Tensor Tensor::any() const {
+    return at::_ops::any::call(const_cast<Tensor&>(*this));
+}
+
+// aten::renorm(Tensor self, Scalar p, int dim, Scalar maxnorm) -> Tensor
+inline at::Tensor Tensor::renorm(const at::Scalar & p, int64_t dim, const at::Scalar & maxnorm) const {
+    return at::_ops::renorm::call(const_cast<Tensor&>(*this), p, dim, maxnorm);
+}
+
+// aten::renorm_(Tensor(a!) self, Scalar p, int dim, Scalar maxnorm) -> Tensor(a!)
+inline at::Tensor & Tensor::renorm_(const at::Scalar & p, int64_t dim, const at::Scalar & maxnorm) const {
+    return at::_ops::renorm_::call(const_cast<Tensor&>(*this), p, dim, maxnorm);
+}
+
+// aten::unfold(Tensor(a) self, int dimension, int size, int step) -> Tensor(a)
+inline at::Tensor Tensor::unfold(int64_t dimension, int64_t size, int64_t step) const {
+    return at::_ops::unfold::call(const_cast<Tensor&>(*this), dimension, size, step);
+}
+
+// aten::equal(Tensor self, Tensor other) -> bool
+inline bool Tensor::equal(const at::Tensor & other) const {
+    return at::_ops::equal::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::pow.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor
+inline at::Tensor Tensor::pow(const at::Tensor & exponent) const {
+    return at::_ops::pow_Tensor_Tensor::call(const_cast<Tensor&>(*this), exponent);
+}
+
+// aten::pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor
+inline at::Tensor Tensor::pow(const at::Scalar & exponent) const {
+    return at::_ops::pow_Tensor_Scalar::call(const_cast<Tensor&>(*this), exponent);
+}
+
+// aten::pow_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!)
+inline at::Tensor & Tensor::pow_(const at::Scalar & exponent) const {
+    return at::_ops::pow__Scalar::call(const_cast<Tensor&>(*this), exponent);
+}
+
+// aten::pow_.Tensor(Tensor(a!) self, Tensor exponent) -> Tensor(a!)
+inline at::Tensor & Tensor::pow_(const at::Tensor & exponent) const {
+    return at::_ops::pow__Tensor::call(const_cast<Tensor&>(*this), exponent);
+}
+
+// aten::float_power.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor
+inline at::Tensor Tensor::float_power(const at::Tensor & exponent) const {
+    return at::_ops::float_power_Tensor_Tensor::call(const_cast<Tensor&>(*this), exponent);
+}
+
+// aten::float_power.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor
+inline at::Tensor Tensor::float_power(const at::Scalar & exponent) const {
+    return at::_ops::float_power_Tensor_Scalar::call(const_cast<Tensor&>(*this), exponent);
+}
+
+// aten::float_power_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!)
+inline at::Tensor & Tensor::float_power_(const at::Scalar & exponent) const {
+    return at::_ops::float_power__Scalar::call(const_cast<Tensor&>(*this), exponent);
+}
+
+// aten::float_power_.Tensor(Tensor(a!) self, Tensor exponent) -> Tensor(a!)
+inline at::Tensor & Tensor::float_power_(const at::Tensor & exponent) const {
+    return at::_ops::float_power__Tensor::call(const_cast<Tensor&>(*this), exponent);
+}
+
+// aten::normal_(Tensor(a!) self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor(a!)
+inline at::Tensor & Tensor::normal_(double mean, double std, c10::optional<at::Generator> generator) const {
+    return at::_ops::normal_::call(const_cast<Tensor&>(*this), mean, std, generator);
+}
+
+// aten::alias(Tensor(a) self) -> Tensor(a)
+inline at::Tensor Tensor::alias() const {
+    return at::_ops::alias::call(const_cast<Tensor&>(*this));
+}
+
+// aten::isfinite(Tensor self) -> Tensor
+inline at::Tensor Tensor::isfinite() const {
+    return at::_ops::isfinite::call(const_cast<Tensor&>(*this));
+}
+
+// aten::isinf(Tensor self) -> Tensor
+inline at::Tensor Tensor::isinf() const {
+    return at::_ops::isinf::call(const_cast<Tensor&>(*this));
+}
+
+// aten::record_stream(Tensor(a!) self, Stream s) -> ()
+inline void Tensor::record_stream(at::Stream s) const {
+    return at::_ops::record_stream::call(const_cast<Tensor&>(*this), s);
+}
+
+// aten::isposinf(Tensor self) -> Tensor
+inline at::Tensor Tensor::isposinf() const {
+    return at::_ops::isposinf::call(const_cast<Tensor&>(*this));
+}
+
+// aten::isneginf(Tensor self) -> Tensor
+inline at::Tensor Tensor::isneginf() const {
+    return at::_ops::isneginf::call(const_cast<Tensor&>(*this));
+}
+
+// aten::det(Tensor self) -> Tensor
+inline at::Tensor Tensor::det() const {
+    return at::_ops::det::call(const_cast<Tensor&>(*this));
+}
+
+// aten::slogdet(Tensor self) -> (Tensor sign, Tensor logabsdet)
+inline ::std::tuple<at::Tensor,at::Tensor> Tensor::slogdet() const {
+    return at::_ops::slogdet::call(const_cast<Tensor&>(*this));
+}
+
+// aten::logdet(Tensor self) -> Tensor
+inline at::Tensor Tensor::logdet() const {
+    return at::_ops::logdet::call(const_cast<Tensor&>(*this));
+}
+
+// aten::inverse(Tensor self) -> Tensor
+inline at::Tensor Tensor::inverse() const {
+    return at::_ops::inverse::call(const_cast<Tensor&>(*this));
+}
+
+// aten::inner(Tensor self, Tensor other) -> Tensor
+inline at::Tensor Tensor::inner(const at::Tensor & other) const {
+    return at::_ops::inner::call(const_cast<Tensor&>(*this), other);
+}
+
+// aten::outer(Tensor self, Tensor vec2) -> Tensor
+inline at::Tensor Tensor::outer(const at::Tensor & vec2) const {
+    return at::_ops::outer::call(const_cast<Tensor&>(*this), vec2);
+}
+
+// aten::ger(Tensor self, Tensor vec2) -> Tensor
+inline at::Tensor Tensor::ger(const at::Tensor & vec2) const {
+    return at::_ops::ger::call(const_cast<Tensor&>(*this), vec2);
+}
+
+// aten::to_padded_tensor(Tensor self, float padding, SymInt[]? output_size=None) -> Tensor
+inline at::Tensor Tensor::to_padded_tensor(double padding, at::OptionalIntArrayRef output_size) const {
+    return at::_ops::to_padded_tensor::call(const_cast<Tensor&>(*this), padding, output_size.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*output_size)) : c10::nullopt);
+}
+
+// aten::to_padded_tensor(Tensor self, float padding, SymInt[]? output_size=None) -> Tensor
+inline at::Tensor Tensor::to_padded_tensor_symint(double padding, at::OptionalSymIntArrayRef output_size) const {
+    return at::_ops::to_padded_tensor::call(const_cast<Tensor&>(*this), padding, output_size);
+}
+} // namespace at
+
+
+namespace c10 {
+template <>
+struct MaybeOwnedTraits<at::Tensor> {
+  using owned_type = at::Tensor;
+  using borrow_type = at::Tensor;
+
+  static borrow_type createBorrow(const owned_type& from) {
+    // NOTE: this can be implemented without the special
+    // unsafe_borrow_t Tensor constructor as
+    //
+    // return borrow_type(c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(from.unsafeGetTensorImpl()));
+    //
+    // but that hurts inlining due to the nullptr check in the
+    // Tensor(c10::intrusive_ptr<...>) constructor. We already know
+    // that from.impl_ isn't null because from is a valid Tensor, so
+    // we needn't do the check again. (using __builtin_assume can
+    // avoid this, but wouldn't be portable to MSVC.)
+    return borrow_type(borrow_type::unsafe_borrow_t{}, from);
+  }
+
+  static void assignBorrow(borrow_type& lhs, const borrow_type& rhs) {
+    lhs.unsafeReleaseTensorImpl();
+    // See above note: this can be implemented with public API
+    // similarly to createBorrow(), but that would hurt inlining.
+    lhs = borrow_type(borrow_type::unsafe_borrow_t{}, rhs);
+  }
+
+  static void destroyBorrow(borrow_type& toDestroy) {
+    toDestroy.unsafeReleaseTensorImpl(); // "leak" it, but it was already +0.
+  }
+
+  static const owned_type& referenceFromBorrow(const borrow_type& borrow) {
+    return borrow;
+  }
+
+  static const owned_type* pointerFromBorrow(const borrow_type& borrow) {
+    return &borrow;
+  }
+
+  static bool debugBorrowIsValid(const borrow_type& /*borrow*/) {
+    return true;
+  }
+};
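+
+// Usage sketch (illustrative only; maybe_owned_borrow_example is not part of the
+// generated header): with the traits above, c10::MaybeOwned<at::Tensor> can hold
+// either a borrowed view of an existing Tensor (no refcount bump) or an owned one.
+inline bool maybe_owned_borrow_example(const at::Tensor& t) {
+  // Borrowing aliases the caller's TensorImpl without touching its refcount.
+  c10::MaybeOwned<at::Tensor> borrowed = c10::MaybeOwned<at::Tensor>::borrowed(t);
+  // Owning stores a refcounted at::Tensor inside the MaybeOwned.
+  c10::MaybeOwned<at::Tensor> owned = c10::MaybeOwned<at::Tensor>::owned(at::Tensor(t));
+  return borrowed->unsafeGetTensorImpl() == t.unsafeGetTensorImpl() && owned->defined() == t.defined();
+}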
+
+template <>
+struct ExclusivelyOwnedTraits<at::Tensor> {
+  using repr_type = at::Tensor;
+  using pointer_type = at::Tensor*;
+  using const_pointer_type = const at::Tensor*;
+
+  static repr_type nullRepr() {
+    return at::Tensor();
+  }
+
+  template <class... Args>
+  static repr_type createInPlace(Args&&... args) {
+    return at::Tensor(std::forward<Args>(args)...);
+  }
+
+  static repr_type moveToRepr(at::Tensor&& x) {
+    return std::move(x);
+  }
+
+  static void destroyOwned(at::Tensor& x) {
+    return ExclusivelyOwnedTraits<at::TensorBase>::destroyOwned(x);
+  }
+
+  static at::Tensor take(at::Tensor& x) {
+    return std::move(x);
+  }
+
+  static pointer_type getImpl(repr_type& x) {
+    return &x;
+  }
+
+  static const_pointer_type getImpl(const repr_type& x) {
+    return &x;
+  }
+};
+} // namespace c10
+
+namespace at {
+
+inline c10::MaybeOwned<Tensor> borrow_from_optional_tensor(
+    const c10::optional<Tensor>& opt) {
+  return opt.has_value()
+    ? c10::MaybeOwned<Tensor>::borrowed(*opt)
+    : c10::MaybeOwned<Tensor>::owned(std::in_place);
+}
+
+inline c10::MaybeOwned<Tensor> Tensor::expect_contiguous(MemoryFormat memory_format) const & {
+  if (is_contiguous(memory_format)) {
+    return c10::MaybeOwned<Tensor>::borrowed(*this);
+  } else {
+    return c10::MaybeOwned<Tensor>::owned(__dispatch_contiguous(memory_format));
+  }
+}
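+
+// Usage sketch (illustrative only; example_bias_numel is hypothetical): kernels
+// that take an optional bias and need contiguous memory typically combine the two
+// helpers above so the common case does no refcount bump and no copy.
+inline int64_t example_bias_numel(const c10::optional<Tensor>& bias_opt) {
+  // Borrowed when a bias is present, an owned undefined Tensor otherwise.
+  c10::MaybeOwned<Tensor> bias = at::borrow_from_optional_tensor(bias_opt);
+  if (!bias->defined()) {
+    return 0;
+  }
+  // Borrows when already contiguous, otherwise owns a contiguous copy.
+  c10::MaybeOwned<Tensor> contig = bias->expect_contiguous(c10::MemoryFormat::Contiguous);
+  return contig->numel();
+}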
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/TorchDispatchUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/core/TorchDispatchUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..ad5d009e70553593a4d20106eacc8908717d8ab3
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/TorchDispatchUtils.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+namespace impl {
+
+TORCH_API bool tensor_has_dispatch(const at::Tensor& t);
+TORCH_API bool tensorlist_has_dispatch(at::ITensorListRef li);
+TORCH_API bool tensorlist_has_dispatch(const c10::List<c10::optional<at::Tensor>>& li);
+using c10::impl::dispatch_mode_enabled;
+
+}}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/TransformationHelper.h b/MLPY/Lib/site-packages/torch/include/ATen/core/TransformationHelper.h
new file mode 100644
index 0000000000000000000000000000000000000000..2052e4c47df6a17159070091a323c1e604512590
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/TransformationHelper.h
@@ -0,0 +1,173 @@
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+
+// Using DistAccumType in accumulate types for distributions.
+// Note: Ideally we'd be using ATen/AccumulateType.h but looks
+// like there is some inconsistency in how accumulate types
+// are mapped currently, e.g. for the cpu side, float is mapped
+// to double.
+template <typename T>
+struct DistAccumType {  };
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+template <> struct DistAccumType<half> { using type = float; };
+#endif
+template <> struct DistAccumType<BFloat16> { using type = float; };
+template <> struct DistAccumType<Half> { using type = float; };
+template <> struct DistAccumType<float> { using type = float; };
+template <> struct DistAccumType<double> { using type = double; };
+
+template <typename T>
+using dist_acctype = typename DistAccumType<T>::type;
+
+namespace transformation {
+
+/**
+ * A transformation function for `torch.Tensor.random_()`, when both `from` and `to` are specified.
+ * `range` is `to - from`
+ * `base` is `from`
+ */
+template <typename T, typename V>
+C10_HOST_DEVICE inline T uniform_int_from_to(V val, uint64_t range, int64_t base) {
+  return static_cast<T>(static_cast<int64_t>((val % range) + base));
+}
+
+/**
+ * A transformation function for `torch.Tensor.random_()`, when `from=min_value(int64_t)` and to=None
+ */
+template <typename T, typename V>
+C10_HOST_DEVICE inline T uniform_int_full_range(V val) {
+  return static_cast<T>(static_cast<int64_t>(val));
+}
+
+/**
+ * A transformation function for `torch.Tensor.random_()`, when used without specifying `from` and `to`.
+ * In order to prevent compiler warnings reported in GitHub issue 46391, T can't be float or double
+ * in this overloaded version
+ */
+template <typename T, typename V>
+C10_HOST_DEVICE inline typename std::enable_if<!(std::is_floating_point<T>::value), T>::type uniform_int(V val) {
+  if constexpr (std::is_same_v<T, bool>) {
+    return static_cast<bool>(val & 1);
+  } else if constexpr (std::is_same_v<T, int64_t>) {
+    return static_cast<T>(val % (static_cast<uint64_t>(std::numeric_limits<T>::max()) + 1));
+  } else if constexpr (std::is_same_v<T, at::Half> || std::is_same<T, at::BFloat16>::value) {
+    return static_cast<T>(val % static_cast<uint64_t>((1ULL << std::numeric_limits<T>::digits) + 1));
+  } else if constexpr (std::is_integral_v<T>) {
+    return static_cast<T>(val % (static_cast<uint64_t>(std::numeric_limits<T>::max()) + 1));
+  } else {
+    assert(false);
+    return 0;
+  }
+}
+
+/**
+ * An overloaded transformation function for `torch.Tensor.random_()`, when used without specifying `from` and `to`,
+ * added to fix compiler warnings reported in GitHub issue 46391. T is either float or double in this version.
+ */
+template<typename T, typename V>
+C10_HOST_DEVICE inline typename std::enable_if<std::is_floating_point<T>::value, T>::type uniform_int(V val) {
+  return static_cast<T>(val % static_cast<uint64_t>((1ULL << std::numeric_limits<T>::digits) + 1));
+}
+
+template <typename T, typename V>
+C10_HOST_DEVICE inline dist_acctype<T> uniform_real(V val, T from, T to) {
+  constexpr auto MASK = static_cast<V>((static_cast<uint64_t>(1) << std::numeric_limits<T>::digits) - 1);
+  constexpr auto DIVISOR = static_cast<dist_acctype<T>>(1) / (static_cast<uint64_t>(1) << std::numeric_limits<T>::digits);
+  dist_acctype<T> x = (val & MASK) * DIVISOR;
+  return (x * (to - from) + from);
+}
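+
+// Worked sketch (illustrative only; uniform_real_example is not part of this
+// header): for T = double, std::numeric_limits<double>::digits is 53, so MASK
+// keeps the low 53 bits of the raw draw and DIVISOR equals 2^-53; a raw value
+// with all of those bits set maps to just under `to`.
+inline double uniform_real_example(uint64_t raw_bits) {
+  // Maps one raw 64-bit engine output to a double in [10.0, 20.0).
+  return uniform_real<double, uint64_t>(raw_bits, 10.0, 20.0);
+}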
+
+/**
+ * Transforms normally distributed `val` with mean 0.0 and standard deviation 1.0 to
+ * normally distributed with `mean` and standard deviation `std`.
+ */
+template <typename T>
+C10_HOST_DEVICE inline T normal(T val, T mean, T std) {
+  return val * std + mean;
+}
+
+/**
+ * Transforms uniformly distributed `val` between 0.0 and 1.0 to
+ * Cauchy distribution with location parameter `median` and scale parameter `sigma`.
+ */
+template <typename T>
+C10_HOST_DEVICE inline T cauchy(T val, T median, T sigma) {
+  // https://en.wikipedia.org/wiki/Cauchy_distribution#Cumulative_distribution_function
+  // __tanf overflows and returns `inf/-inf` when (val > 1 - eps) or (val < 0 + eps),
+  // thus we clip those values.
+  constexpr T eps = std::numeric_limits<T>::epsilon();
+  constexpr T one_minus_eps = 1 - eps;
+  constexpr T zero_plus_eps = 0 + eps;
+  val = (val > one_minus_eps ? one_minus_eps : val);
+  val = (val < zero_plus_eps ? zero_plus_eps : val);
+  return median + sigma * at::tan(c10::pi<T> * (val - static_cast<T>(0.5)));
+}
+
+template <>
+C10_HOST_DEVICE inline double cauchy(double val, double median, double sigma) {
+  // https://en.wikipedia.org/wiki/Cauchy_distribution#Cumulative_distribution_function
+  return median + sigma * at::tan(c10::pi<double> * (val - static_cast<double>(0.5)));
+}
+
+/**
+ * Transforms uniformly distributed `val` between 0.0 and 1.0 to
+ * exponentially distributed with `lambda` parameter of the distribution.
+ */
+template <typename T>
+C10_HOST_DEVICE inline T exponential(T val, T lambda) {
+  // https://en.wikipedia.org/wiki/Exponential_distribution#Generating_exponential_variates
+  // Different implementations for CUDA and CPU to preserve original logic
+  // TODO: must be investigated and unified!!!
+  // https://github.com/pytorch/pytorch/issues/38662
+#if defined(__CUDACC__) || defined(__HIPCC__)
+      // BEFORE TOUCHING THIS CODE READ: https://github.com/pytorch/pytorch/issues/16706
+      // curand_uniform has (0,1] bounds. log(1) is 0 and exponential excludes 0.
+      // we need log to be not 0, and not underflow when converted to half
+      // fast __logf approximation can underflow, so set log to -epsilon/2 for 1 or close to 1 args
+  auto log = val >= static_cast<T>(1.) - std::numeric_limits<T>::epsilon() / 2
+      ? -std::numeric_limits<T>::epsilon() / 2
+      : at::log(val);
+  return static_cast<T>(-1.0) / lambda * log;
+#else
+  return static_cast<T>(-1.0) / lambda * at::log1p(-val);
+#endif
+}
+
+/**
+ * Transforms uniformly distributed `val` between 0.0 and 1.0 to
+ * geometrically distributed with success probability `p`.
+ */
+template <typename T>
+C10_HOST_DEVICE inline T geometric(T val, T p) {
+  // https://en.wikipedia.org/wiki/Geometric_distribution#Related_distributions
+  return static_cast<T>(::ceil(at::log(val) / at::log1p(-p)));
+}
+
+/**
+ * Transforms normally distributed `val` to log-normally distributed.
+ */
+template <typename T>
+C10_HOST_DEVICE inline T log_normal(T val) {
+  // https://en.wikipedia.org/wiki/Log-normal_distribution#Mode,_median,_quantiles
+  return at::exp(val);
+}
+
+/**
+ * Transforms uniformly distributed `val` between 0.0 and 1.0 to
+ * bernoulli distributed with success probability `p`.
+ */
+template <typename T>
+C10_HOST_DEVICE inline T bernoulli(T val, T p) {
+  return val < p;
+}
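+
+// Usage sketch (illustrative only; transformation_pipeline_example is not part of
+// this header): the transformations above are pure functions of a uniform draw, so
+// any 64-bit engine output can drive them; raw_bits stands in for a generator value.
+inline bool transformation_pipeline_example(uint64_t raw_bits) {
+  // One raw 64-bit value -> uniform double in [0, 1).
+  double u = uniform_real<double, uint64_t>(raw_bits, 0.0, 1.0);
+  // Exponential(lambda = 2.0) via the inverse-CDF form above, plus a Bernoulli(0.5) flip.
+  double e = exponential<double>(u, 2.0);
+  bool heads = bernoulli<double>(u, 0.5) != 0.0;
+  return heads && e >= 0.0;
+}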
+
+}} // namespace at::transformation
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/UndefinedTensorImpl.h b/MLPY/Lib/site-packages/torch/include/ATen/core/UndefinedTensorImpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..49612392cc4f66224d30e9480522acca886fd293
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/UndefinedTensorImpl.h
@@ -0,0 +1 @@
+#include 
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/UnsafeFromTH.h b/MLPY/Lib/site-packages/torch/include/ATen/core/UnsafeFromTH.h
new file mode 100644
index 0000000000000000000000000000000000000000..a47ad1586d70587faf7dea99d50b20dbea3a344f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/UnsafeFromTH.h
@@ -0,0 +1,21 @@
+#pragma once
+#include 
+
+namespace at {
+
+inline Tensor unsafeTensorFromTH(void * th_pointer, bool retain) {
+  auto tensor_impl = c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(static_cast<TensorImpl*>(th_pointer));
+  if (retain && tensor_impl.get() != UndefinedTensorImpl::singleton()) {
+    c10::raw::intrusive_ptr::incref(tensor_impl.get());
+  }
+  return Tensor(std::move(tensor_impl));
+}
+
+inline Storage unsafeStorageFromTH(void * th_pointer, bool retain) {
+  if (retain && th_pointer) {
+    c10::raw::intrusive_ptr::incref(static_cast<StorageImpl*>(th_pointer));
+  }
+  return Storage(c10::intrusive_ptr<StorageImpl>::reclaim(static_cast<StorageImpl*>(th_pointer)));
+}
+
+}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/VariableHooksInterface.h b/MLPY/Lib/site-packages/torch/include/ATen/core/VariableHooksInterface.h
new file mode 100644
index 0000000000000000000000000000000000000000..f315a092b63b7993527e6483782a8678bb1583be
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/VariableHooksInterface.h
@@ -0,0 +1,75 @@
+#pragma once
+
+#include 
+#include 
+
+// A little explanation about why this file exists at all.  We have
+// a few methods on Tensor class which require access to reified access to
+// AutogradMeta.  In open source, this isn't a big deal: we just access
+// torch/csrc/autograd/variable.h from aten/src/ATen/core/Tensor.cpp and
+// we can put the definitions inline.  This is because everything gets balled
+// into a single dynamic library in the end.
+//
+// However, inside our Facebook internal version of our build system, we
+// have a split between aten and torch/csrc.  So we cannot simply just
+// cross this boundary.  "Now wait," you might say, "Why don't we just
+// merge the libraries inside Facebook".  Well, the problem is that there
+// are some downstream applications which are at binary size limit, and
+// incorporating all of the extra code from libtorch would push them
+// over (admarket/adreview/service:adreviewservice, see also
+// https://github.com/pytorch/pytorch/pull/29299)  So if you want to do that,
+// we have to fix all of the services like this.
+//
+// I didn't want to block eliminating Tensor-Variable on this work, so I
+// had to introduce another dynamic dispatch to get to the variable
+// implementations (which live in torch/csrc/autograd/variable.cpp, FYI).
+//
+// I also considered using our existing dynamic dispatch mechanism, c10
+// dispatcher, to do this.  However, (1) some of the functions on Tensor
+// have weird signatures that are not supported by autograd, and (2)
+// see this bug https://github.com/pytorch/pytorch/issues/30102
+
+namespace torch { namespace autograd {
+
+struct Node;
+
+}} // namespace torch::autograd
+
+namespace at {
+namespace impl {
+
+struct TORCH_API VariableHooksInterface {
+  virtual ~VariableHooksInterface() = default;
+  virtual TensorBase tensor_data(const TensorBase&) const = 0;
+  virtual TensorBase variable_data(const TensorBase&) const = 0;
+  virtual const std::shared_ptr<torch::autograd::Node>& grad_fn(const TensorBase&) const = 0;
+  virtual unsigned _register_hook(
+      const TensorBase&,
+      std::function<TensorBase(const TensorBase&)> hook) const = 0;
+  virtual void remove_hook(const TensorBase&, unsigned pos) const = 0;
+  virtual bool is_view(const TensorBase&) const = 0;
+  virtual const TensorBase& base(const TensorBase&) const = 0;
+  virtual const std::string& name(const TensorBase&) const = 0;
+  virtual bool is_leaf(const TensorBase&) const = 0;
+  virtual int64_t output_nr(const TensorBase&) const = 0;
+  virtual void set_data(const TensorBase&, const TensorBase&) const = 0;
+  virtual TensorBase data(const TensorBase&) const = 0;
+  virtual int64_t _version(const TensorBase&) const = 0;
+  virtual void retain_grad(const TensorBase&) const = 0;
+  virtual bool retains_grad(const TensorBase&) const = 0;
+  virtual void _backward(const Tensor&, TensorList, const c10::optional<Tensor>&, c10::optional<bool>, bool) const = 0;
+  virtual void requires_grad_(const TensorBase&, bool) const = 0;
+  virtual void basic_autograd_not_implemented_fallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatch_keys, torch::jit::Stack* stack) const = 0;
+};
+
+TORCH_API void SetVariableHooks(VariableHooksInterface* hooks);
+TORCH_API VariableHooksInterface* GetVariableHooks();
+TORCH_API bool HasVariableHooks();
+
+struct TORCH_API VariableHooksRegisterer {
+  explicit VariableHooksRegisterer(VariableHooksInterface* hooks) {
+    SetVariableHooks(hooks);
+  }
+};
+
+}} // namespace at::impl
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/Variadic.h b/MLPY/Lib/site-packages/torch/include/ATen/core/Variadic.h
new file mode 100644
index 0000000000000000000000000000000000000000..22007cf39eff2b832732d68c7bb0bd9429a49865
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/Variadic.h
@@ -0,0 +1,95 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+namespace at {
+
+// This class allows you to write variadic functions which
+// call a (possibly overloaded) function on each argument,
+// in order.  This is most commonly used in autogenerated code,
+// where it is convenient to have a function that can uniformly
+// take arguments of different types.  If your arguments
+// are homogeneous, consider using a std::initializer_list instead.
+//
+// For examples of this in use, see torch/csrc/utils/variadic.h
+template <typename F>
+struct IterArgs {
+  template <typename... Args>
+  inline F& apply() {
+    return self();
+  }
+
+  // NB: Use perfect forwarding here, otherwise we'll make value
+  // copies of all arguments!
+  template <typename T, typename... Args>
+  inline F& apply(T&& arg, Args&&... args) {
+    self()(std::forward<T>(arg));
+    if (self().short_circuit()) {
+      return self();
+    } else {
+      return apply(std::forward<Args>(args)...);
+    }
+  }
+
+  // Here are some handy overloads which provide sensible
+  // defaults for container-like structures that one might
+  // be interested in recursing into.  You can enable them
+  // by adding:
+  //
+  //    using IterArgs<YourStructName>::operator()
+  //
+  // to your struct.  These are not enabled by default because
+  // you may be able to process these structures more efficiently
+  // than handling them one-by-one.
+
+  template <typename T>
+  void operator()(c10::IListRef<T> args) {
+    for (const auto& arg : args) {
+      self()(arg);
+      if (self().short_circuit())
+        return;
+    }
+  }
+
+  template <typename T>
+  void operator()(at::ArrayRef<T> args) {
+    for (const auto& arg : args) {
+      self()(arg);
+      if (self().short_circuit())
+        return;
+    }
+  }
+
+  template <typename T>
+  void operator()(const torch::List<T>& args) {
+    for (const auto& arg : args) {
+      self()(arg);
+      if (self().short_circuit())
+        return;
+    }
+  }
+
+  // NB: we need to specify std::vector manually as C++ won't
+  // do an implicit conversion to make a template deduction go through.
+  template <typename T>
+  void operator()(const std::vector<T>& args) {
+    self()(at::ArrayRef<T>{args});
+  }
+
+  constexpr bool short_circuit() const {
+    return false;
+  }
+
+ private:
+  inline F& self() {
+    return *static_cast<F*>(this);
+  }
+};
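+
+// Minimal CRTP sketch (illustrative only; SumInts is not part of this header):
+// a visitor derived from IterArgs that folds every argument into a running total.
+struct SumInts : IterArgs<SumInts> {
+  int64_t total = 0;
+  void operator()(int64_t x) {
+    total += x;
+  }
+};
+// apply() forwards each argument to operator() in order and returns the visitor,
+// so SumInts().apply(1, 2, 3).total evaluates to 6.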
+
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/Vitals.h b/MLPY/Lib/site-packages/torch/include/ATen/core/Vitals.h
new file mode 100644
index 0000000000000000000000000000000000000000..0fbaa61f37c9f32951c52855b22efac8d73ac74d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/Vitals.h
@@ -0,0 +1,96 @@
+#pragma once
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+namespace at {
+namespace vitals {
+
+TORCH_API bool torchVitalEnabled();
+
+struct TORCH_API TorchVitalAttr {
+  // always initialized to empty
+  std::string value = "";
+  template <typename T>
+  TorchVitalAttr& operator<<(const T& t) {
+    if (torchVitalEnabled()) {
+      std::stringstream ss;
+      ss << t;
+      value += ss.str();
+    }
+    return *this;
+  }
+
+  template <typename T>
+  void write(const T& t, bool force) {
+    if (force || torchVitalEnabled()) {
+      std::stringstream ss;
+      ss << t;
+      value = ss.str();
+    }
+  }
+};
+
+struct TORCH_API TorchVital {
+  std::string name;
+  std::unordered_map<std::string, TorchVitalAttr> attrs;
+
+  explicit TorchVital(std::string n) : name(std::move(n)) {}
+  TorchVital(const TorchVital&) = default;
+  TorchVital(TorchVital&&) = default;
+  TorchVital() = delete;
+
+  TorchVitalAttr& create(const std::string& attr);
+  TorchVitalAttr& create(const std::string& attr, bool force);
+  friend std::ostream& operator<<(std::ostream& os, const TorchVital& dt);
+
+  ~TorchVital();
+};
+
+std::ostream& operator<<(std::ostream& os, TorchVital const& tv);
+
+// A way to access vitals by string names instead of by global reference.
+// This enables access to vitals from the PythonAPI.
+class TORCH_API APIVitals {
+ public:
+  bool vitals_enabled;
+
+  // Set any vital sign that was added to the map.
+  bool setVital(
+      const std::string& vital_name,
+      const std::string& attr_name,
+      const std::string& value,
+      bool force = false);
+  std::string readVitals();
+
+  APIVitals();
+
+  // Ensure this stays a singleton
+  APIVitals(APIVitals const& other) = delete;
+  APIVitals(APIVitals&& other) = delete;
+  APIVitals& operator=(const APIVitals&) = delete;
+  APIVitals& operator=(APIVitals&&) = delete;
+
+ private:
+  std::unordered_map<std::string, TorchVital> name_map_;
+};
+
+extern TORCH_API APIVitals VitalsAPI;
+
+} // namespace vitals
+} // namespace at
+
+#define TORCH_VITAL_DECLARE(name) \
+  TORCH_API at::vitals::TorchVital TorchVital_##name;
+
+#define TORCH_VITAL_DEFINE(name) \
+  TORCH_API at::vitals::TorchVital TorchVital_##name(#name);
+
+#define TORCH_VITAL_BASE(name) TorchVital_##name
+
+#define TORCH_VITAL(name, attr) TORCH_VITAL_BASE(name).create(#attr)
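+
+// Usage sketch (comments only, illustrative): one translation unit defines a vital,
+// then streams attribute text into it; writes are no-ops unless torchVitalEnabled()
+// returns true or force is passed.
+//
+//   TORCH_VITAL_DEFINE(Example);                        // at namespace scope
+//   TORCH_VITAL(Example, used) << true;                 // appends "1" to the attr
+//   TORCH_VITAL_BASE(Example).create("version", /*force=*/true).write("2.1", true);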
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/alias_info.h b/MLPY/Lib/site-packages/torch/include/ATen/core/alias_info.h
new file mode 100644
index 0000000000000000000000000000000000000000..9670e92646c44d7ca23700010f7ea971bc0b7989
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/alias_info.h
@@ -0,0 +1,151 @@
+#pragma once
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace c10 {
+/**
+ * class AliasInfo
+ *
+ * Data structure to hold aliasing information for an `Argument`. They can be
+ * nested to represent aliasing information on contained types.
+ *
+ * There is a `beforeSet` which describes the aliasing information before the
+ * operator executes, and an `afterSet` that describes aliasing info
+ * after execution.
+ */
+class AliasInfo {
+ public:
+  // Symbol for the set that can alias anything
+  static Symbol wildcardSet() {
+    static const Symbol wc = Symbol::fromQualString("alias::*");
+    return wc;
+  }
+
+  void setIsWrite(bool isWrite) {
+    isWrite_ = isWrite;
+  }
+
+  bool isWrite() const {
+    return isWrite_;
+  }
+
+  void addBeforeSet(Symbol aliasSet) {
+    beforeSets_.insert(aliasSet);
+  }
+
+  void addAfterSet(Symbol aliasSet) {
+    afterSets_.insert(aliasSet);
+  }
+
+  const std::unordered_set<Symbol>& beforeSets() const {
+    return beforeSets_;
+  }
+
+  const std::unordered_set<Symbol>& afterSets() const {
+    return afterSets_;
+  }
+
+  Symbol beforeSet() const {
+    AT_ASSERT(beforeSets_.size() == 1);
+    return *beforeSets_.begin();
+  }
+
+  bool isWildcardBefore() const {
+    return beforeSets_.count(wildcardSet()) != 0;
+  }
+
+  bool isWildcardAfter() const {
+    return afterSets_.count(wildcardSet()) != 0;
+  }
+
+  // the alias info for the contained types of the type
+  // e.g. if this is an annotation on List[T], `sets` refers to
+  // the alias sets that the list may be in
+  // while containedTypes()[0] refers to the sets that members of the list
+  // may be in
+  void addContainedType(AliasInfo aliasInfo) {
+    containedTypes_.push_back(std::move(aliasInfo));
+  }
+  const std::vector<AliasInfo>& containedTypes() const {
+    return containedTypes_;
+  }
+
+ private:
+  std::unordered_set<Symbol> beforeSets_;
+  std::unordered_set<Symbol> afterSets_;
+  std::vector<AliasInfo> containedTypes_;
+  bool isWrite_ = false;
+};
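+
+// Construction sketch (illustrative only; make_written_alias_example is not part of
+// this header): the schema annotation `Tensor(a!)` corresponds to an AliasInfo that
+// belongs to alias set `a` before and after the call and writes to its argument.
+inline AliasInfo make_written_alias_example() {
+  AliasInfo info;
+  info.addBeforeSet(Symbol::fromQualString("alias::a"));
+  info.addAfterSet(Symbol::fromQualString("alias::a"));
+  info.setIsWrite(true);
+  return info;
+}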
+
+inline bool operator==(const AliasInfo& lhs, const AliasInfo& rhs) {
+  return lhs.isWrite() == rhs.isWrite()
+      && lhs.beforeSets() == rhs.beforeSets()
+      && lhs.afterSets() == rhs.afterSets()
+      && lhs.containedTypes() == rhs.containedTypes();
+}
+
+// this does match the way things are represented in the schema
+inline std::ostream& operator<<(std::ostream& out, const AliasInfo& aliasInfo) {
+  out << "(";
+  bool first = true;
+  for (const auto& set : aliasInfo.beforeSets()) {
+    if (first) {
+      first = false;
+    } else {
+      out << "|";
+    }
+    out << set.toUnqualString();
+  }
+  if (aliasInfo.isWrite()) {
+    out << "!";
+  }
+  if (aliasInfo.beforeSets() != aliasInfo.afterSets()) {
+    out << " -> ";
+    first = true;
+    for (const auto& set : aliasInfo.afterSets()) {
+      if (first) {
+        first = false;
+      } else {
+        out << "|";
+      }
+      out << set.toUnqualString();
+    }
+  }
+  out << ")";
+  return out;
+}
+} // namespace c10
+
+namespace std {
+template <>
+  struct hash<c10::AliasInfo> {
+    size_t operator()(const c10::AliasInfo& aliasInfo) const {
+      auto hash = std::hash<bool>()(aliasInfo.isWrite());
+
+      // NOTE: for unordered_set hashes, we couldn't use hash_combine
+      // because hash_combine is order dependent. Instead, we choose to
+      // use XOR as the combining function as XOR is commutative.
+      size_t before_set_hash_seed = 0;
+      for (auto &e: aliasInfo.beforeSets()) {
+        auto symbol_hash = std::hash<c10::Symbol>()(e);
+        before_set_hash_seed = before_set_hash_seed ^ symbol_hash;
+      }
+      size_t after_set_hash_seed = 0;
+      for (auto &e: aliasInfo.afterSets()) {
+        auto symbol_hash = std::hash<c10::Symbol>()(e);
+        after_set_hash_seed = after_set_hash_seed ^ symbol_hash;
+      }
+
+      hash = c10::hash_combine(hash, before_set_hash_seed);
+      hash = c10::hash_combine(hash, after_set_hash_seed);
+      for (auto &e: aliasInfo.containedTypes()) {
+        auto contained_type_hash = std::hash<c10::AliasInfo>()(e);
+        hash = c10::hash_combine(hash, contained_type_hash);
+      }
+      return hash;
+    }
+  };
+}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/aten_interned_strings.h b/MLPY/Lib/site-packages/torch/include/ATen/core/aten_interned_strings.h
new file mode 100644
index 0000000000000000000000000000000000000000..8348b554d6f189e1b35d86a087822f67286dc235
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/aten_interned_strings.h
@@ -0,0 +1,2213 @@
+#pragma once
+
+// @generated by torchgen/gen.py from aten_interned_strings.h
+
+#if defined(TORCH_ASSERT_NO_OPERATORS) || defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS)
+#error This change adds a dependency on native_functions.yaml,          \
+  meaning the file will need to be re-compiled every time an operator   \
+  is changed or added. Consider if including <ATen/core/symbol.h> for   \
+  the c10::Symbol class would be sufficient, or if your change would be \
+  better placed in another file.
+#endif
+
+// ATen symbols correspond exactly to operators defined in ATen. Every
+// symbol here corresponds exactly to an ATen operation defined in
+// native_functions.yaml; attributes are in one-to-one correspondence
+// with their ATen name.
+
+#define FORALL_ATEN_BASE_SYMBOLS(_) \
+_(aten, __and__) \
+_(aten, __iand__) \
+_(aten, __ilshift__) \
+_(aten, __ior__) \
+_(aten, __irshift__) \
+_(aten, __ixor__) \
+_(aten, __lshift__) \
+_(aten, __or__) \
+_(aten, __rshift__) \
+_(aten, __xor__) \
+_(aten, _adaptive_avg_pool2d) \
+_(aten, _adaptive_avg_pool2d_backward) \
+_(aten, _adaptive_avg_pool3d) \
+_(aten, _adaptive_avg_pool3d_backward) \
+_(aten, _add_batch_dim) \
+_(aten, _add_relu) \
+_(aten, _add_relu_) \
+_(aten, _addmm_activation) \
+_(aten, _aminmax) \
+_(aten, _amp_foreach_non_finite_check_and_unscale) \
+_(aten, _amp_foreach_non_finite_check_and_unscale_) \
+_(aten, _amp_update_scale) \
+_(aten, _amp_update_scale_) \
+_(aten, _assert_async) \
+_(aten, _assert_scalar) \
+_(aten, _assert_tensor_metadata) \
+_(aten, _autocast_to_full_precision) \
+_(aten, _autocast_to_reduced_precision) \
+_(aten, _backward) \
+_(aten, _batch_norm_impl_index) \
+_(aten, _batch_norm_impl_index_backward) \
+_(aten, _cast_Byte) \
+_(aten, _cast_Char) \
+_(aten, _cast_Double) \
+_(aten, _cast_Float) \
+_(aten, _cast_Half) \
+_(aten, _cast_Int) \
+_(aten, _cast_Long) \
+_(aten, _cast_Short) \
+_(aten, _cdist_backward) \
+_(aten, _cdist_forward) \
+_(aten, _cholesky_solve_helper) \
+_(aten, _choose_qparams_per_tensor) \
+_(aten, _chunk_cat) \
+_(aten, _coalesce) \
+_(aten, _coalesced) \
+_(aten, _coalesced_) \
+_(aten, _compute_linear_combination) \
+_(aten, _conj) \
+_(aten, _conj_copy) \
+_(aten, _conj_physical) \
+_(aten, _conv_depthwise2d) \
+_(aten, _convert_indices_from_coo_to_csr) \
+_(aten, _convert_indices_from_csr_to_coo) \
+_(aten, _convert_weight_to_int4pack) \
+_(aten, _convolution) \
+_(aten, _convolution_double_backward) \
+_(aten, _convolution_mode) \
+_(aten, _copy_from) \
+_(aten, _copy_from_and_resize) \
+_(aten, _cslt_compress) \
+_(aten, _cslt_sparse_mm) \
+_(aten, _cslt_sparse_mm_search) \
+_(aten, _ctc_loss) \
+_(aten, _ctc_loss_backward) \
+_(aten, _cudnn_ctc_loss) \
+_(aten, _cudnn_init_dropout_state) \
+_(aten, _cudnn_rnn) \
+_(aten, _cudnn_rnn_backward) \
+_(aten, _cudnn_rnn_flatten_weight) \
+_(aten, _cufft_clear_plan_cache) \
+_(aten, _cufft_get_plan_cache_max_size) \
+_(aten, _cufft_get_plan_cache_size) \
+_(aten, _cufft_set_plan_cache_max_size) \
+_(aten, _cummax_helper) \
+_(aten, _cummin_helper) \
+_(aten, _debug_has_internal_overlap) \
+_(aten, _dimI) \
+_(aten, _dimV) \
+_(aten, _dim_arange) \
+_(aten, _dirichlet_grad) \
+_(aten, _efficient_attention_backward) \
+_(aten, _efficient_attention_forward) \
+_(aten, _efficientzerotensor) \
+_(aten, _embedding_bag) \
+_(aten, _embedding_bag_backward) \
+_(aten, _embedding_bag_dense_backward) \
+_(aten, _embedding_bag_forward_only) \
+_(aten, _embedding_bag_per_sample_weights_backward) \
+_(aten, _embedding_bag_sparse_backward) \
+_(aten, _empty_affine_quantized) \
+_(aten, _empty_per_channel_affine_quantized) \
+_(aten, _euclidean_dist) \
+_(aten, _fake_quantize_learnable_per_channel_affine) \
+_(aten, _fake_quantize_learnable_per_channel_affine_backward) \
+_(aten, _fake_quantize_learnable_per_tensor_affine) \
+_(aten, _fake_quantize_learnable_per_tensor_affine_backward) \
+_(aten, _fake_quantize_per_tensor_affine_cachemask_tensor_qparams) \
+_(aten, _fft_c2c) \
+_(aten, _fft_c2r) \
+_(aten, _fft_r2c) \
+_(aten, _fill_mem_eff_dropout_mask) \
+_(aten, _fill_mem_eff_dropout_mask_) \
+_(aten, _flash_attention_backward) \
+_(aten, _flash_attention_forward) \
+_(aten, _foobar) \
+_(aten, _foreach_abs) \
+_(aten, _foreach_abs_) \
+_(aten, _foreach_acos) \
+_(aten, _foreach_acos_) \
+_(aten, _foreach_add) \
+_(aten, _foreach_add_) \
+_(aten, _foreach_addcdiv) \
+_(aten, _foreach_addcdiv_) \
+_(aten, _foreach_addcmul) \
+_(aten, _foreach_addcmul_) \
+_(aten, _foreach_asin) \
+_(aten, _foreach_asin_) \
+_(aten, _foreach_atan) \
+_(aten, _foreach_atan_) \
+_(aten, _foreach_ceil) \
+_(aten, _foreach_ceil_) \
+_(aten, _foreach_clamp_max) \
+_(aten, _foreach_clamp_max_) \
+_(aten, _foreach_clamp_min) \
+_(aten, _foreach_clamp_min_) \
+_(aten, _foreach_copy) \
+_(aten, _foreach_copy_) \
+_(aten, _foreach_cos) \
+_(aten, _foreach_cos_) \
+_(aten, _foreach_cosh) \
+_(aten, _foreach_cosh_) \
+_(aten, _foreach_div) \
+_(aten, _foreach_div_) \
+_(aten, _foreach_erf) \
+_(aten, _foreach_erf_) \
+_(aten, _foreach_erfc) \
+_(aten, _foreach_erfc_) \
+_(aten, _foreach_exp) \
+_(aten, _foreach_exp_) \
+_(aten, _foreach_expm1) \
+_(aten, _foreach_expm1_) \
+_(aten, _foreach_floor) \
+_(aten, _foreach_floor_) \
+_(aten, _foreach_frac) \
+_(aten, _foreach_frac_) \
+_(aten, _foreach_lerp) \
+_(aten, _foreach_lerp_) \
+_(aten, _foreach_lgamma) \
+_(aten, _foreach_lgamma_) \
+_(aten, _foreach_log) \
+_(aten, _foreach_log10) \
+_(aten, _foreach_log10_) \
+_(aten, _foreach_log1p) \
+_(aten, _foreach_log1p_) \
+_(aten, _foreach_log2) \
+_(aten, _foreach_log2_) \
+_(aten, _foreach_log_) \
+_(aten, _foreach_maximum) \
+_(aten, _foreach_maximum_) \
+_(aten, _foreach_minimum) \
+_(aten, _foreach_minimum_) \
+_(aten, _foreach_mul) \
+_(aten, _foreach_mul_) \
+_(aten, _foreach_neg) \
+_(aten, _foreach_neg_) \
+_(aten, _foreach_norm) \
+_(aten, _foreach_pow) \
+_(aten, _foreach_pow_) \
+_(aten, _foreach_reciprocal) \
+_(aten, _foreach_reciprocal_) \
+_(aten, _foreach_round) \
+_(aten, _foreach_round_) \
+_(aten, _foreach_sigmoid) \
+_(aten, _foreach_sigmoid_) \
+_(aten, _foreach_sign) \
+_(aten, _foreach_sign_) \
+_(aten, _foreach_sin) \
+_(aten, _foreach_sin_) \
+_(aten, _foreach_sinh) \
+_(aten, _foreach_sinh_) \
+_(aten, _foreach_sqrt) \
+_(aten, _foreach_sqrt_) \
+_(aten, _foreach_sub) \
+_(aten, _foreach_sub_) \
+_(aten, _foreach_tan) \
+_(aten, _foreach_tan_) \
+_(aten, _foreach_tanh) \
+_(aten, _foreach_tanh_) \
+_(aten, _foreach_trunc) \
+_(aten, _foreach_trunc_) \
+_(aten, _foreach_zero) \
+_(aten, _foreach_zero_) \
+_(aten, _functional_assert_async) \
+_(aten, _functional_assert_scalar) \
+_(aten, _functional_sym_constrain_range) \
+_(aten, _functional_sym_constrain_range_for_size) \
+_(aten, _fused_adam) \
+_(aten, _fused_adam_) \
+_(aten, _fused_adamw) \
+_(aten, _fused_adamw_) \
+_(aten, _fused_dropout) \
+_(aten, _fused_moving_avg_obs_fq_helper) \
+_(aten, _fused_moving_avg_obs_fq_helper_functional) \
+_(aten, _fused_sdp_choice) \
+_(aten, _fused_sgd) \
+_(aten, _fused_sgd_) \
+_(aten, _fw_primal) \
+_(aten, _fw_primal_copy) \
+_(aten, _gather_sparse_backward) \
+_(aten, _grid_sampler_2d_cpu_fallback) \
+_(aten, _grid_sampler_2d_cpu_fallback_backward) \
+_(aten, _has_compatible_shallow_copy_type) \
+_(aten, _has_same_storage_numel) \
+_(aten, _histogramdd_bin_edges) \
+_(aten, _histogramdd_from_bin_cts) \
+_(aten, _histogramdd_from_bin_tensors) \
+_(aten, _index_put_impl) \
+_(aten, _index_put_impl_) \
+_(aten, _indices) \
+_(aten, _indices_copy) \
+_(aten, _int_mm) \
+_(aten, _is_all_true) \
+_(aten, _is_any_true) \
+_(aten, _is_zerotensor) \
+_(aten, _lazy_clone) \
+_(aten, _linalg_check_errors) \
+_(aten, _linalg_det) \
+_(aten, _linalg_eigh) \
+_(aten, _linalg_eigvals) \
+_(aten, _linalg_slogdet) \
+_(aten, _linalg_solve_ex) \
+_(aten, _linalg_svd) \
+_(aten, _local_scalar_dense) \
+_(aten, _log_softmax) \
+_(aten, _log_softmax_backward_data) \
+_(aten, _logcumsumexp) \
+_(aten, _lstm_mps) \
+_(aten, _lu_with_info) \
+_(aten, _make_dep_token) \
+_(aten, _make_dual) \
+_(aten, _make_dual_copy) \
+_(aten, _make_per_channel_quantized_tensor) \
+_(aten, _make_per_tensor_quantized_tensor) \
+_(aten, _masked_scale) \
+_(aten, _masked_softmax) \
+_(aten, _masked_softmax_backward) \
+_(aten, _mixed_dtypes_linear) \
+_(aten, _mkldnn_reshape) \
+_(aten, _mkldnn_transpose) \
+_(aten, _mkldnn_transpose_) \
+_(aten, _mps_convolution) \
+_(aten, _mps_convolution_transpose) \
+_(aten, _native_batch_norm_legit) \
+_(aten, _native_batch_norm_legit_functional) \
+_(aten, _native_batch_norm_legit_no_training) \
+_(aten, _native_multi_head_attention) \
+_(aten, _neg_view) \
+_(aten, _neg_view_copy) \
+_(aten, _nested_from_padded) \
+_(aten, _nested_from_padded_and_nested_example) \
+_(aten, _nested_get_jagged_dummy) \
+_(aten, _nested_get_lengths) \
+_(aten, _nested_get_offsets) \
+_(aten, _nested_get_ragged_idx) \
+_(aten, _nested_get_values) \
+_(aten, _nested_get_values_copy) \
+_(aten, _nested_select_backward) \
+_(aten, _nested_sum_backward) \
+_(aten, _nested_tensor_from_mask) \
+_(aten, _nested_tensor_from_mask_left_aligned) \
+_(aten, _nested_tensor_from_tensor_list) \
+_(aten, _nested_tensor_size) \
+_(aten, _nested_tensor_softmax_with_shape) \
+_(aten, _nested_tensor_storage_offsets) \
+_(aten, _nested_tensor_strides) \
+_(aten, _nested_view_from_buffer) \
+_(aten, _nested_view_from_buffer_copy) \
+_(aten, _nested_view_from_jagged) \
+_(aten, _nested_view_from_jagged_copy) \
+_(aten, _new_zeros_with_same_feature_meta) \
+_(aten, _nnpack_available) \
+_(aten, _nnpack_spatial_convolution) \
+_(aten, _nnz) \
+_(aten, _pack_padded_sequence) \
+_(aten, _pack_padded_sequence_backward) \
+_(aten, _pad_circular) \
+_(aten, _pad_enum) \
+_(aten, _pad_packed_sequence) \
+_(aten, _pdist_backward) \
+_(aten, _pdist_forward) \
+_(aten, _pin_memory) \
+_(aten, _prelu_kernel) \
+_(aten, _prelu_kernel_backward) \
+_(aten, _print) \
+_(aten, _propagate_xla_data) \
+_(aten, _remove_batch_dim) \
+_(aten, _reshape_alias) \
+_(aten, _reshape_alias_copy) \
+_(aten, _reshape_copy) \
+_(aten, _reshape_from_tensor) \
+_(aten, _resize_output) \
+_(aten, _resize_output_) \
+_(aten, _rowwise_prune) \
+_(aten, _sample_dirichlet) \
+_(aten, _saturate_weight_to_fp16) \
+_(aten, _scaled_dot_product_attention_math) \
+_(aten, _scaled_dot_product_cudnn_attention) \
+_(aten, _scaled_dot_product_efficient_attention) \
+_(aten, _scaled_dot_product_efficient_attention_backward) \
+_(aten, _scaled_dot_product_flash_attention) \
+_(aten, _scaled_dot_product_flash_attention_backward) \
+_(aten, _scaled_dot_product_flash_attention_for_cpu) \
+_(aten, _scaled_dot_product_flash_attention_for_cpu_backward) \
+_(aten, _scaled_mm) \
+_(aten, _segment_reduce_backward) \
+_(aten, _shape_as_tensor) \
+_(aten, _slow_conv2d_backward) \
+_(aten, _slow_conv2d_forward) \
+_(aten, _sobol_engine_draw) \
+_(aten, _sobol_engine_ff) \
+_(aten, _sobol_engine_ff_) \
+_(aten, _sobol_engine_initialize_state) \
+_(aten, _sobol_engine_initialize_state_) \
+_(aten, _sobol_engine_scramble) \
+_(aten, _sobol_engine_scramble_) \
+_(aten, _softmax) \
+_(aten, _softmax_backward_data) \
+_(aten, _sparse_addmm) \
+_(aten, _sparse_broadcast_to) \
+_(aten, _sparse_broadcast_to_copy) \
+_(aten, _sparse_bsc_tensor_unsafe) \
+_(aten, _sparse_bsr_tensor_unsafe) \
+_(aten, _sparse_compressed_tensor_unsafe) \
+_(aten, _sparse_coo_tensor_unsafe) \
+_(aten, _sparse_coo_tensor_with_dims) \
+_(aten, _sparse_coo_tensor_with_dims_and_tensors) \
+_(aten, _sparse_csc_tensor_unsafe) \
+_(aten, _sparse_csr_prod) \
+_(aten, _sparse_csr_sum) \
+_(aten, _sparse_csr_tensor_unsafe) \
+_(aten, _sparse_log_softmax) \
+_(aten, _sparse_log_softmax_backward_data) \
+_(aten, _sparse_mask_projection) \
+_(aten, _sparse_mm) \
+_(aten, _sparse_mm_reduce_impl) \
+_(aten, _sparse_mm_reduce_impl_backward) \
+_(aten, _sparse_semi_structured_linear) \
+_(aten, _sparse_softmax) \
+_(aten, _sparse_softmax_backward_data) \
+_(aten, _sparse_sparse_matmul) \
+_(aten, _sparse_sum) \
+_(aten, _sparse_sum_backward) \
+_(aten, _spdiags) \
+_(aten, _stack) \
+_(aten, _standard_gamma) \
+_(aten, _standard_gamma_grad) \
+_(aten, _test_ambiguous_defaults) \
+_(aten, _test_autograd_multiple_dispatch) \
+_(aten, _test_autograd_multiple_dispatch_view) \
+_(aten, _test_autograd_multiple_dispatch_view_copy) \
+_(aten, _test_check_tensor) \
+_(aten, _test_functorch_fallback) \
+_(aten, _test_optional_filled_intlist) \
+_(aten, _test_optional_floatlist) \
+_(aten, _test_optional_intlist) \
+_(aten, _test_parallel_materialize) \
+_(aten, _test_serialization_subcmul) \
+_(aten, _test_string_default) \
+_(aten, _test_warn_in_autograd) \
+_(aten, _thnn_differentiable_gru_cell_backward) \
+_(aten, _thnn_differentiable_lstm_cell_backward) \
+_(aten, _thnn_fused_gru_cell) \
+_(aten, _thnn_fused_gru_cell_backward) \
+_(aten, _thnn_fused_lstm_cell) \
+_(aten, _thnn_fused_lstm_cell_backward) \
+_(aten, _thnn_fused_lstm_cell_backward_impl) \
+_(aten, _to_copy) \
+_(aten, _to_cpu) \
+_(aten, _to_dense) \
+_(aten, _to_sparse) \
+_(aten, _to_sparse_bsc) \
+_(aten, _to_sparse_bsr) \
+_(aten, _to_sparse_csc) \
+_(aten, _to_sparse_csr) \
+_(aten, _to_sparse_semi_structured) \
+_(aten, _transform_bias_rescale_qkv) \
+_(aten, _transformer_encoder_layer_fwd) \
+_(aten, _trilinear) \
+_(aten, _triton_multi_head_attention) \
+_(aten, _triton_scaled_dot_attention) \
+_(aten, _unique) \
+_(aten, _unique2) \
+_(aten, _unpack_dual) \
+_(aten, _unsafe_index) \
+_(aten, _unsafe_index_put) \
+_(aten, _unsafe_view) \
+_(aten, _upsample_bicubic2d_aa) \
+_(aten, _upsample_bicubic2d_aa_backward) \
+_(aten, _upsample_bilinear2d_aa) \
+_(aten, _upsample_bilinear2d_aa_backward) \
+_(aten, _upsample_nearest_exact1d) \
+_(aten, _upsample_nearest_exact1d_backward) \
+_(aten, _upsample_nearest_exact2d) \
+_(aten, _upsample_nearest_exact2d_backward) \
+_(aten, _upsample_nearest_exact3d) \
+_(aten, _upsample_nearest_exact3d_backward) \
+_(aten, _use_cudnn_ctc_loss) \
+_(aten, _use_cudnn_rnn_flatten_weight) \
+_(aten, _validate_compressed_sparse_indices) \
+_(aten, _validate_sparse_bsc_tensor_args) \
+_(aten, _validate_sparse_bsr_tensor_args) \
+_(aten, _validate_sparse_compressed_tensor_args) \
+_(aten, _validate_sparse_coo_tensor_args) \
+_(aten, _validate_sparse_csc_tensor_args) \
+_(aten, _validate_sparse_csr_tensor_args) \
+_(aten, _values) \
+_(aten, _values_copy) \
+_(aten, _version) \
+_(aten, _weight_int4pack_mm) \
+_(aten, _weight_int8pack_mm) \
+_(aten, _weight_norm) \
+_(aten, _weight_norm_differentiable_backward) \
+_(aten, _weight_norm_interface) \
+_(aten, _weight_norm_interface_backward) \
+_(aten, abs) \
+_(aten, abs_) \
+_(aten, absolute) \
+_(aten, absolute_) \
+_(aten, acos) \
+_(aten, acos_) \
+_(aten, acosh) \
+_(aten, acosh_) \
+_(aten, adaptive_avg_pool1d) \
+_(aten, adaptive_avg_pool2d) \
+_(aten, adaptive_avg_pool3d) \
+_(aten, adaptive_avg_pool3d_backward) \
+_(aten, adaptive_max_pool1d) \
+_(aten, adaptive_max_pool2d) \
+_(aten, adaptive_max_pool2d_backward) \
+_(aten, adaptive_max_pool3d) \
+_(aten, adaptive_max_pool3d_backward) \
+_(aten, add) \
+_(aten, add_) \
+_(aten, addbmm) \
+_(aten, addbmm_) \
+_(aten, addcdiv) \
+_(aten, addcdiv_) \
+_(aten, addcmul) \
+_(aten, addcmul_) \
+_(aten, addmm) \
+_(aten, addmm_) \
+_(aten, addmv) \
+_(aten, addmv_) \
+_(aten, addr) \
+_(aten, addr_) \
+_(aten, adjoint) \
+_(aten, affine_grid_generator) \
+_(aten, affine_grid_generator_backward) \
+_(aten, alias) \
+_(aten, alias_copy) \
+_(aten, align_as) \
+_(aten, align_tensors) \
+_(aten, align_to) \
+_(aten, all) \
+_(aten, allclose) \
+_(aten, alpha_dropout) \
+_(aten, alpha_dropout_) \
+_(aten, amax) \
+_(aten, amin) \
+_(aten, aminmax) \
+_(aten, angle) \
+_(aten, any) \
+_(aten, arange) \
+_(aten, arccos) \
+_(aten, arccos_) \
+_(aten, arccosh) \
+_(aten, arccosh_) \
+_(aten, arcsin) \
+_(aten, arcsin_) \
+_(aten, arcsinh) \
+_(aten, arcsinh_) \
+_(aten, arctan) \
+_(aten, arctan2) \
+_(aten, arctan2_) \
+_(aten, arctan_) \
+_(aten, arctanh) \
+_(aten, arctanh_) \
+_(aten, argmax) \
+_(aten, argmin) \
+_(aten, argsort) \
+_(aten, argwhere) \
+_(aten, as_strided) \
+_(aten, as_strided_) \
+_(aten, as_strided_copy) \
+_(aten, as_strided_scatter) \
+_(aten, asin) \
+_(aten, asin_) \
+_(aten, asinh) \
+_(aten, asinh_) \
+_(aten, atan) \
+_(aten, atan2) \
+_(aten, atan2_) \
+_(aten, atan_) \
+_(aten, atanh) \
+_(aten, atanh_) \
+_(aten, atleast_1d) \
+_(aten, atleast_2d) \
+_(aten, atleast_3d) \
+_(aten, avg_pool1d) \
+_(aten, avg_pool2d) \
+_(aten, avg_pool2d_backward) \
+_(aten, avg_pool3d) \
+_(aten, avg_pool3d_backward) \
+_(aten, baddbmm) \
+_(aten, baddbmm_) \
+_(aten, bartlett_window) \
+_(aten, batch_norm) \
+_(aten, batch_norm_backward_elemt) \
+_(aten, batch_norm_backward_reduce) \
+_(aten, batch_norm_elemt) \
+_(aten, batch_norm_gather_stats) \
+_(aten, batch_norm_gather_stats_with_counts) \
+_(aten, batch_norm_stats) \
+_(aten, batch_norm_update_stats) \
+_(aten, bernoulli) \
+_(aten, bernoulli_) \
+_(aten, bilinear) \
+_(aten, binary_cross_entropy) \
+_(aten, binary_cross_entropy_backward) \
+_(aten, binary_cross_entropy_with_logits) \
+_(aten, bincount) \
+_(aten, binomial) \
+_(aten, bitwise_and) \
+_(aten, bitwise_and_) \
+_(aten, bitwise_left_shift) \
+_(aten, bitwise_left_shift_) \
+_(aten, bitwise_not) \
+_(aten, bitwise_not_) \
+_(aten, bitwise_or) \
+_(aten, bitwise_or_) \
+_(aten, bitwise_right_shift) \
+_(aten, bitwise_right_shift_) \
+_(aten, bitwise_xor) \
+_(aten, bitwise_xor_) \
+_(aten, blackman_window) \
+_(aten, block_diag) \
+_(aten, bmm) \
+_(aten, broadcast_tensors) \
+_(aten, broadcast_to) \
+_(aten, bucketize) \
+_(aten, can_cast) \
+_(aten, cartesian_prod) \
+_(aten, cat) \
+_(aten, cauchy) \
+_(aten, cauchy_) \
+_(aten, ccol_indices) \
+_(aten, ccol_indices_copy) \
+_(aten, cdist) \
+_(aten, ceil) \
+_(aten, ceil_) \
+_(aten, celu) \
+_(aten, celu_) \
+_(aten, chain_matmul) \
+_(aten, chalf) \
+_(aten, channel_shuffle) \
+_(aten, cholesky) \
+_(aten, cholesky_inverse) \
+_(aten, cholesky_solve) \
+_(aten, choose_qparams_optimized) \
+_(aten, chunk) \
+_(aten, clamp) \
+_(aten, clamp_) \
+_(aten, clamp_max) \
+_(aten, clamp_max_) \
+_(aten, clamp_min) \
+_(aten, clamp_min_) \
+_(aten, clip) \
+_(aten, clip_) \
+_(aten, clone) \
+_(aten, coalesce) \
+_(aten, col2im) \
+_(aten, col_indices) \
+_(aten, col_indices_copy) \
+_(aten, column_stack) \
+_(aten, combinations) \
+_(aten, complex) \
+_(aten, concat) \
+_(aten, concatenate) \
+_(aten, conj) \
+_(aten, conj_physical) \
+_(aten, conj_physical_) \
+_(aten, constant_pad_nd) \
+_(aten, contiguous) \
+_(aten, conv1d) \
+_(aten, conv2d) \
+_(aten, conv3d) \
+_(aten, conv_depthwise3d) \
+_(aten, conv_tbc) \
+_(aten, conv_tbc_backward) \
+_(aten, conv_transpose1d) \
+_(aten, conv_transpose2d) \
+_(aten, conv_transpose3d) \
+_(aten, convolution) \
+_(aten, convolution_backward) \
+_(aten, convolution_backward_overrideable) \
+_(aten, convolution_overrideable) \
+_(aten, copy) \
+_(aten, copy_) \
+_(aten, copy_sparse_to_sparse) \
+_(aten, copy_sparse_to_sparse_) \
+_(aten, copysign) \
+_(aten, copysign_) \
+_(aten, corrcoef) \
+_(aten, cos) \
+_(aten, cos_) \
+_(aten, cosh) \
+_(aten, cosh_) \
+_(aten, cosine_embedding_loss) \
+_(aten, cosine_similarity) \
+_(aten, count_nonzero) \
+_(aten, cov) \
+_(aten, cross) \
+_(aten, cross_entropy_loss) \
+_(aten, crow_indices) \
+_(aten, crow_indices_copy) \
+_(aten, ctc_loss) \
+_(aten, cudnn_affine_grid_generator) \
+_(aten, cudnn_affine_grid_generator_backward) \
+_(aten, cudnn_batch_norm) \
+_(aten, cudnn_batch_norm_backward) \
+_(aten, cudnn_convolution) \
+_(aten, cudnn_convolution_add_relu) \
+_(aten, cudnn_convolution_relu) \
+_(aten, cudnn_convolution_transpose) \
+_(aten, cudnn_grid_sampler) \
+_(aten, cudnn_grid_sampler_backward) \
+_(aten, cudnn_is_acceptable) \
+_(aten, cummax) \
+_(aten, cummaxmin_backward) \
+_(aten, cummin) \
+_(aten, cumprod) \
+_(aten, cumprod_) \
+_(aten, cumprod_backward) \
+_(aten, cumsum) \
+_(aten, cumsum_) \
+_(aten, cumulative_trapezoid) \
+_(aten, data) \
+_(aten, deg2rad) \
+_(aten, deg2rad_) \
+_(aten, dense_dim) \
+_(aten, dequantize) \
+_(aten, det) \
+_(aten, detach) \
+_(aten, detach_) \
+_(aten, detach_copy) \
+_(aten, diag) \
+_(aten, diag_embed) \
+_(aten, diagflat) \
+_(aten, diagonal) \
+_(aten, diagonal_backward) \
+_(aten, diagonal_copy) \
+_(aten, diagonal_scatter) \
+_(aten, diff) \
+_(aten, digamma) \
+_(aten, digamma_) \
+_(aten, dist) \
+_(aten, div) \
+_(aten, div_) \
+_(aten, divide) \
+_(aten, divide_) \
+_(aten, dot) \
+_(aten, dropout) \
+_(aten, dropout_) \
+_(aten, dsplit) \
+_(aten, dstack) \
+_(aten, einsum) \
+_(aten, elu) \
+_(aten, elu_) \
+_(aten, elu_backward) \
+_(aten, embedding) \
+_(aten, embedding_backward) \
+_(aten, embedding_bag) \
+_(aten, embedding_dense_backward) \
+_(aten, embedding_renorm) \
+_(aten, embedding_renorm_) \
+_(aten, embedding_sparse_backward) \
+_(aten, empty) \
+_(aten, empty_like) \
+_(aten, empty_permuted) \
+_(aten, empty_quantized) \
+_(aten, empty_strided) \
+_(aten, eq) \
+_(aten, eq_) \
+_(aten, equal) \
+_(aten, erf) \
+_(aten, erf_) \
+_(aten, erfc) \
+_(aten, erfc_) \
+_(aten, erfinv) \
+_(aten, erfinv_) \
+_(aten, exp) \
+_(aten, exp2) \
+_(aten, exp2_) \
+_(aten, exp_) \
+_(aten, expand) \
+_(aten, expand_as) \
+_(aten, expand_copy) \
+_(aten, expm1) \
+_(aten, expm1_) \
+_(aten, exponential) \
+_(aten, exponential_) \
+_(aten, eye) \
+_(aten, fake_quantize_per_channel_affine) \
+_(aten, fake_quantize_per_channel_affine_cachemask) \
+_(aten, fake_quantize_per_channel_affine_cachemask_backward) \
+_(aten, fake_quantize_per_tensor_affine) \
+_(aten, fake_quantize_per_tensor_affine_cachemask) \
+_(aten, fake_quantize_per_tensor_affine_cachemask_backward) \
+_(aten, fbgemm_linear_fp16_weight) \
+_(aten, fbgemm_linear_fp16_weight_fp32_activation) \
+_(aten, fbgemm_linear_int8_weight) \
+_(aten, fbgemm_linear_int8_weight_fp32_activation) \
+_(aten, fbgemm_linear_quantize_weight) \
+_(aten, fbgemm_pack_gemm_matrix_fp16) \
+_(aten, fbgemm_pack_quantized_matrix) \
+_(aten, feature_alpha_dropout) \
+_(aten, feature_alpha_dropout_) \
+_(aten, feature_dropout) \
+_(aten, feature_dropout_) \
+_(aten, fft_fft) \
+_(aten, fft_fft2) \
+_(aten, fft_fftfreq) \
+_(aten, fft_fftn) \
+_(aten, fft_fftshift) \
+_(aten, fft_hfft) \
+_(aten, fft_hfft2) \
+_(aten, fft_hfftn) \
+_(aten, fft_ifft) \
+_(aten, fft_ifft2) \
+_(aten, fft_ifftn) \
+_(aten, fft_ifftshift) \
+_(aten, fft_ihfft) \
+_(aten, fft_ihfft2) \
+_(aten, fft_ihfftn) \
+_(aten, fft_irfft) \
+_(aten, fft_irfft2) \
+_(aten, fft_irfftn) \
+_(aten, fft_rfft) \
+_(aten, fft_rfft2) \
+_(aten, fft_rfftfreq) \
+_(aten, fft_rfftn) \
+_(aten, fill) \
+_(aten, fill_) \
+_(aten, fill_diagonal) \
+_(aten, fill_diagonal_) \
+_(aten, fix) \
+_(aten, fix_) \
+_(aten, flatten) \
+_(aten, flatten_dense_tensors) \
+_(aten, flip) \
+_(aten, fliplr) \
+_(aten, flipud) \
+_(aten, float_power) \
+_(aten, float_power_) \
+_(aten, floor) \
+_(aten, floor_) \
+_(aten, floor_divide) \
+_(aten, floor_divide_) \
+_(aten, fmax) \
+_(aten, fmin) \
+_(aten, fmod) \
+_(aten, fmod_) \
+_(aten, frac) \
+_(aten, frac_) \
+_(aten, fractional_max_pool2d) \
+_(aten, fractional_max_pool2d_backward) \
+_(aten, fractional_max_pool3d) \
+_(aten, fractional_max_pool3d_backward) \
+_(aten, frexp) \
+_(aten, frobenius_norm) \
+_(aten, from_file) \
+_(aten, full) \
+_(aten, full_like) \
+_(aten, fused_moving_avg_obs_fake_quant) \
+_(aten, gather) \
+_(aten, gather_backward) \
+_(aten, gcd) \
+_(aten, gcd_) \
+_(aten, ge) \
+_(aten, ge_) \
+_(aten, gelu) \
+_(aten, gelu_) \
+_(aten, gelu_backward) \
+_(aten, geometric) \
+_(aten, geometric_) \
+_(aten, geqrf) \
+_(aten, ger) \
+_(aten, glu) \
+_(aten, glu_backward) \
+_(aten, glu_backward_jvp) \
+_(aten, glu_jvp) \
+_(aten, gradient) \
+_(aten, greater) \
+_(aten, greater_) \
+_(aten, greater_equal) \
+_(aten, greater_equal_) \
+_(aten, grid_sampler) \
+_(aten, grid_sampler_2d) \
+_(aten, grid_sampler_2d_backward) \
+_(aten, grid_sampler_3d) \
+_(aten, grid_sampler_3d_backward) \
+_(aten, group_norm) \
+_(aten, gru) \
+_(aten, gru_cell) \
+_(aten, gt) \
+_(aten, gt_) \
+_(aten, hamming_window) \
+_(aten, hann_window) \
+_(aten, hardshrink) \
+_(aten, hardshrink_backward) \
+_(aten, hardsigmoid) \
+_(aten, hardsigmoid_) \
+_(aten, hardsigmoid_backward) \
+_(aten, hardswish) \
+_(aten, hardswish_) \
+_(aten, hardswish_backward) \
+_(aten, hardtanh) \
+_(aten, hardtanh_) \
+_(aten, hardtanh_backward) \
+_(aten, heaviside) \
+_(aten, heaviside_) \
+_(aten, hinge_embedding_loss) \
+_(aten, histc) \
+_(aten, histogram) \
+_(aten, histogramdd) \
+_(aten, hsplit) \
+_(aten, hspmm) \
+_(aten, hstack) \
+_(aten, huber_loss) \
+_(aten, huber_loss_backward) \
+_(aten, hypot) \
+_(aten, hypot_) \
+_(aten, i0) \
+_(aten, i0_) \
+_(aten, igamma) \
+_(aten, igamma_) \
+_(aten, igammac) \
+_(aten, igammac_) \
+_(aten, im2col) \
+_(aten, imag) \
+_(aten, index) \
+_(aten, index_add) \
+_(aten, index_add_) \
+_(aten, index_copy) \
+_(aten, index_copy_) \
+_(aten, index_fill) \
+_(aten, index_fill_) \
+_(aten, index_put) \
+_(aten, index_put_) \
+_(aten, index_reduce) \
+_(aten, index_reduce_) \
+_(aten, index_select) \
+_(aten, index_select_backward) \
+_(aten, indices) \
+_(aten, indices_copy) \
+_(aten, infinitely_differentiable_gelu_backward) \
+_(aten, inner) \
+_(aten, instance_norm) \
+_(aten, int_repr) \
+_(aten, inverse) \
+_(aten, is_coalesced) \
+_(aten, is_complex) \
+_(aten, is_conj) \
+_(aten, is_distributed) \
+_(aten, is_floating_point) \
+_(aten, is_inference) \
+_(aten, is_leaf) \
+_(aten, is_neg) \
+_(aten, is_nonzero) \
+_(aten, is_pinned) \
+_(aten, is_same_size) \
+_(aten, is_set_to) \
+_(aten, is_signed) \
+_(aten, is_vulkan_available) \
+_(aten, isclose) \
+_(aten, isfinite) \
+_(aten, isin) \
+_(aten, isinf) \
+_(aten, isnan) \
+_(aten, isneginf) \
+_(aten, isposinf) \
+_(aten, isreal) \
+_(aten, istft) \
+_(aten, item) \
+_(aten, kaiser_window) \
+_(aten, kl_div) \
+_(aten, kron) \
+_(aten, kthvalue) \
+_(aten, l1_loss) \
+_(aten, layer_norm) \
+_(aten, lcm) \
+_(aten, lcm_) \
+_(aten, ldexp) \
+_(aten, ldexp_) \
+_(aten, le) \
+_(aten, le_) \
+_(aten, leaky_relu) \
+_(aten, leaky_relu_) \
+_(aten, leaky_relu_backward) \
+_(aten, lerp) \
+_(aten, lerp_) \
+_(aten, less) \
+_(aten, less_) \
+_(aten, less_equal) \
+_(aten, less_equal_) \
+_(aten, lgamma) \
+_(aten, lgamma_) \
+_(aten, lift) \
+_(aten, lift_fresh) \
+_(aten, lift_fresh_copy) \
+_(aten, linalg_cholesky) \
+_(aten, linalg_cholesky_ex) \
+_(aten, linalg_cond) \
+_(aten, linalg_cross) \
+_(aten, linalg_det) \
+_(aten, linalg_diagonal) \
+_(aten, linalg_eig) \
+_(aten, linalg_eigh) \
+_(aten, linalg_eigvals) \
+_(aten, linalg_eigvalsh) \
+_(aten, linalg_householder_product) \
+_(aten, linalg_inv) \
+_(aten, linalg_inv_ex) \
+_(aten, linalg_ldl_factor) \
+_(aten, linalg_ldl_factor_ex) \
+_(aten, linalg_ldl_solve) \
+_(aten, linalg_lstsq) \
+_(aten, linalg_lu) \
+_(aten, linalg_lu_factor) \
+_(aten, linalg_lu_factor_ex) \
+_(aten, linalg_lu_solve) \
+_(aten, linalg_matmul) \
+_(aten, linalg_matrix_exp) \
+_(aten, linalg_matrix_norm) \
+_(aten, linalg_matrix_power) \
+_(aten, linalg_matrix_rank) \
+_(aten, linalg_multi_dot) \
+_(aten, linalg_norm) \
+_(aten, linalg_pinv) \
+_(aten, linalg_qr) \
+_(aten, linalg_slogdet) \
+_(aten, linalg_solve) \
+_(aten, linalg_solve_ex) \
+_(aten, linalg_solve_triangular) \
+_(aten, linalg_svd) \
+_(aten, linalg_svdvals) \
+_(aten, linalg_tensorinv) \
+_(aten, linalg_tensorsolve) \
+_(aten, linalg_vander) \
+_(aten, linalg_vecdot) \
+_(aten, linalg_vector_norm) \
+_(aten, linear) \
+_(aten, linear_backward) \
+_(aten, linspace) \
+_(aten, log) \
+_(aten, log10) \
+_(aten, log10_) \
+_(aten, log1p) \
+_(aten, log1p_) \
+_(aten, log2) \
+_(aten, log2_) \
+_(aten, log_) \
+_(aten, log_normal) \
+_(aten, log_normal_) \
+_(aten, log_sigmoid) \
+_(aten, log_sigmoid_backward) \
+_(aten, log_sigmoid_forward) \
+_(aten, log_softmax) \
+_(aten, logaddexp) \
+_(aten, logaddexp2) \
+_(aten, logcumsumexp) \
+_(aten, logdet) \
+_(aten, logical_and) \
+_(aten, logical_and_) \
+_(aten, logical_not) \
+_(aten, logical_not_) \
+_(aten, logical_or) \
+_(aten, logical_or_) \
+_(aten, logical_xor) \
+_(aten, logical_xor_) \
+_(aten, logit) \
+_(aten, logit_) \
+_(aten, logit_backward) \
+_(aten, logspace) \
+_(aten, logsumexp) \
+_(aten, lshift) \
+_(aten, lstm) \
+_(aten, lstm_cell) \
+_(aten, lstm_mps_backward) \
+_(aten, lt) \
+_(aten, lt_) \
+_(aten, lu_solve) \
+_(aten, lu_unpack) \
+_(aten, mH) \
+_(aten, mT) \
+_(aten, margin_ranking_loss) \
+_(aten, masked_fill) \
+_(aten, masked_fill_) \
+_(aten, masked_scatter) \
+_(aten, masked_scatter_) \
+_(aten, masked_scatter_backward) \
+_(aten, masked_select) \
+_(aten, masked_select_backward) \
+_(aten, matmul) \
+_(aten, matmul_backward) \
+_(aten, matrix_H) \
+_(aten, matrix_exp) \
+_(aten, matrix_exp_backward) \
+_(aten, matrix_power) \
+_(aten, max) \
+_(aten, max_pool1d) \
+_(aten, max_pool1d_with_indices) \
+_(aten, max_pool2d) \
+_(aten, max_pool2d_backward) \
+_(aten, max_pool2d_with_indices) \
+_(aten, max_pool2d_with_indices_backward) \
+_(aten, max_pool3d) \
+_(aten, max_pool3d_with_indices) \
+_(aten, max_pool3d_with_indices_backward) \
+_(aten, max_unpool2d) \
+_(aten, max_unpool3d) \
+_(aten, maximum) \
+_(aten, mean) \
+_(aten, median) \
+_(aten, meshgrid) \
+_(aten, min) \
+_(aten, minimum) \
+_(aten, miopen_batch_norm) \
+_(aten, miopen_batch_norm_backward) \
+_(aten, miopen_convolution) \
+_(aten, miopen_convolution_add_relu) \
+_(aten, miopen_convolution_relu) \
+_(aten, miopen_convolution_transpose) \
+_(aten, miopen_depthwise_convolution) \
+_(aten, miopen_rnn) \
+_(aten, miopen_rnn_backward) \
+_(aten, mish) \
+_(aten, mish_) \
+_(aten, mish_backward) \
+_(aten, mkldnn_adaptive_avg_pool2d) \
+_(aten, mkldnn_adaptive_avg_pool2d_backward) \
+_(aten, mkldnn_convolution) \
+_(aten, mkldnn_linear) \
+_(aten, mkldnn_linear_backward) \
+_(aten, mkldnn_linear_backward_input) \
+_(aten, mkldnn_linear_backward_weights) \
+_(aten, mkldnn_max_pool2d) \
+_(aten, mkldnn_max_pool2d_backward) \
+_(aten, mkldnn_max_pool3d) \
+_(aten, mkldnn_max_pool3d_backward) \
+_(aten, mkldnn_reorder_conv2d_weight) \
+_(aten, mkldnn_reorder_conv3d_weight) \
+_(aten, mkldnn_rnn_layer) \
+_(aten, mkldnn_rnn_layer_backward) \
+_(aten, mm) \
+_(aten, mode) \
+_(aten, moveaxis) \
+_(aten, movedim) \
+_(aten, mps_convolution_backward) \
+_(aten, mps_convolution_transpose_backward) \
+_(aten, mse_loss) \
+_(aten, mse_loss_backward) \
+_(aten, msort) \
+_(aten, mul) \
+_(aten, mul_) \
+_(aten, multi_margin_loss) \
+_(aten, multi_margin_loss_backward) \
+_(aten, multilabel_margin_loss) \
+_(aten, multilabel_margin_loss_backward) \
+_(aten, multilabel_margin_loss_forward) \
+_(aten, multinomial) \
+_(aten, multiply) \
+_(aten, multiply_) \
+_(aten, mv) \
+_(aten, mvlgamma) \
+_(aten, mvlgamma_) \
+_(aten, nan_to_num) \
+_(aten, nan_to_num_) \
+_(aten, nanmean) \
+_(aten, nanmedian) \
+_(aten, nanquantile) \
+_(aten, nansum) \
+_(aten, narrow) \
+_(aten, narrow_copy) \
+_(aten, native_batch_norm) \
+_(aten, native_batch_norm_backward) \
+_(aten, native_channel_shuffle) \
+_(aten, native_dropout) \
+_(aten, native_dropout_backward) \
+_(aten, native_group_norm) \
+_(aten, native_group_norm_backward) \
+_(aten, native_layer_norm) \
+_(aten, native_layer_norm_backward) \
+_(aten, native_norm) \
+_(aten, ne) \
+_(aten, ne_) \
+_(aten, neg) \
+_(aten, neg_) \
+_(aten, negative) \
+_(aten, negative_) \
+_(aten, nested_to_padded_tensor) \
+_(aten, new_empty) \
+_(aten, new_empty_strided) \
+_(aten, new_full) \
+_(aten, new_ones) \
+_(aten, new_zeros) \
+_(aten, nextafter) \
+_(aten, nextafter_) \
+_(aten, nll_loss) \
+_(aten, nll_loss2d) \
+_(aten, nll_loss2d_backward) \
+_(aten, nll_loss2d_forward) \
+_(aten, nll_loss_backward) \
+_(aten, nll_loss_forward) \
+_(aten, nll_loss_nd) \
+_(aten, nonzero) \
+_(aten, nonzero_numpy) \
+_(aten, nonzero_static) \
+_(aten, norm) \
+_(aten, norm_except_dim) \
+_(aten, normal) \
+_(aten, normal_) \
+_(aten, normal_functional) \
+_(aten, not_equal) \
+_(aten, not_equal_) \
+_(aten, nuclear_norm) \
+_(aten, numpy_T) \
+_(aten, one_hot) \
+_(aten, ones) \
+_(aten, ones_like) \
+_(aten, orgqr) \
+_(aten, ormqr) \
+_(aten, outer) \
+_(aten, output_nr) \
+_(aten, pad) \
+_(aten, pad_sequence) \
+_(aten, pairwise_distance) \
+_(aten, pdist) \
+_(aten, permute) \
+_(aten, permute_copy) \
+_(aten, pin_memory) \
+_(aten, pinverse) \
+_(aten, pixel_shuffle) \
+_(aten, pixel_unshuffle) \
+_(aten, poisson) \
+_(aten, poisson_nll_loss) \
+_(aten, polar) \
+_(aten, polygamma) \
+_(aten, polygamma_) \
+_(aten, positive) \
+_(aten, pow) \
+_(aten, pow_) \
+_(aten, prelu) \
+_(aten, prod) \
+_(aten, promote_types) \
+_(aten, put) \
+_(aten, put_) \
+_(aten, q_per_channel_axis) \
+_(aten, q_per_channel_scales) \
+_(aten, q_per_channel_zero_points) \
+_(aten, q_scale) \
+_(aten, q_zero_point) \
+_(aten, qr) \
+_(aten, qscheme) \
+_(aten, quantile) \
+_(aten, quantize_per_channel) \
+_(aten, quantize_per_tensor) \
+_(aten, quantize_per_tensor_dynamic) \
+_(aten, quantized_batch_norm) \
+_(aten, quantized_gru_cell) \
+_(aten, quantized_lstm_cell) \
+_(aten, quantized_max_pool1d) \
+_(aten, quantized_max_pool2d) \
+_(aten, quantized_max_pool3d) \
+_(aten, quantized_rnn_relu_cell) \
+_(aten, quantized_rnn_tanh_cell) \
+_(aten, rad2deg) \
+_(aten, rad2deg_) \
+_(aten, rand) \
+_(aten, rand_like) \
+_(aten, randint) \
+_(aten, randint_like) \
+_(aten, randn) \
+_(aten, randn_like) \
+_(aten, random) \
+_(aten, random_) \
+_(aten, randperm) \
+_(aten, range) \
+_(aten, ravel) \
+_(aten, real) \
+_(aten, reciprocal) \
+_(aten, reciprocal_) \
+_(aten, record_stream) \
+_(aten, refine_names) \
+_(aten, reflection_pad1d) \
+_(aten, reflection_pad1d_backward) \
+_(aten, reflection_pad2d) \
+_(aten, reflection_pad2d_backward) \
+_(aten, reflection_pad3d) \
+_(aten, reflection_pad3d_backward) \
+_(aten, relu) \
+_(aten, relu6) \
+_(aten, relu6_) \
+_(aten, relu_) \
+_(aten, remainder) \
+_(aten, remainder_) \
+_(aten, rename) \
+_(aten, rename_) \
+_(aten, renorm) \
+_(aten, renorm_) \
+_(aten, repeat) \
+_(aten, repeat_interleave) \
+_(aten, replication_pad1d) \
+_(aten, replication_pad1d_backward) \
+_(aten, replication_pad2d) \
+_(aten, replication_pad2d_backward) \
+_(aten, replication_pad3d) \
+_(aten, replication_pad3d_backward) \
+_(aten, requires_grad) \
+_(aten, requires_grad_) \
+_(aten, reshape) \
+_(aten, reshape_as) \
+_(aten, resize) \
+_(aten, resize_) \
+_(aten, resize_as) \
+_(aten, resize_as_) \
+_(aten, resize_as_sparse) \
+_(aten, resize_as_sparse_) \
+_(aten, resolve_conj) \
+_(aten, resolve_neg) \
+_(aten, result_type) \
+_(aten, retain_grad) \
+_(aten, retains_grad) \
+_(aten, rnn_relu) \
+_(aten, rnn_relu_cell) \
+_(aten, rnn_tanh) \
+_(aten, rnn_tanh_cell) \
+_(aten, roll) \
+_(aten, rot90) \
+_(aten, round) \
+_(aten, round_) \
+_(aten, row_indices) \
+_(aten, row_indices_copy) \
+_(aten, row_stack) \
+_(aten, rrelu) \
+_(aten, rrelu_) \
+_(aten, rrelu_with_noise) \
+_(aten, rrelu_with_noise_) \
+_(aten, rrelu_with_noise_backward) \
+_(aten, rshift) \
+_(aten, rsqrt) \
+_(aten, rsqrt_) \
+_(aten, rsub) \
+_(aten, scalar_tensor) \
+_(aten, scaled_dot_product_attention) \
+_(aten, scatter) \
+_(aten, scatter_) \
+_(aten, scatter_add) \
+_(aten, scatter_add_) \
+_(aten, scatter_reduce) \
+_(aten, scatter_reduce_) \
+_(aten, searchsorted) \
+_(aten, segment_reduce) \
+_(aten, select) \
+_(aten, select_backward) \
+_(aten, select_copy) \
+_(aten, select_scatter) \
+_(aten, selu) \
+_(aten, selu_) \
+_(aten, set) \
+_(aten, set_) \
+_(aten, set_data) \
+_(aten, sgn) \
+_(aten, sgn_) \
+_(aten, sigmoid) \
+_(aten, sigmoid_) \
+_(aten, sigmoid_backward) \
+_(aten, sign) \
+_(aten, sign_) \
+_(aten, signbit) \
+_(aten, silu) \
+_(aten, silu_) \
+_(aten, silu_backward) \
+_(aten, sin) \
+_(aten, sin_) \
+_(aten, sinc) \
+_(aten, sinc_) \
+_(aten, sinh) \
+_(aten, sinh_) \
+_(aten, size) \
+_(aten, slice) \
+_(aten, slice_backward) \
+_(aten, slice_copy) \
+_(aten, slice_inverse) \
+_(aten, slice_scatter) \
+_(aten, slogdet) \
+_(aten, slow_conv3d) \
+_(aten, slow_conv3d_forward) \
+_(aten, slow_conv_dilated2d) \
+_(aten, slow_conv_dilated3d) \
+_(aten, slow_conv_transpose2d) \
+_(aten, slow_conv_transpose3d) \
+_(aten, smm) \
+_(aten, smooth_l1_loss) \
+_(aten, smooth_l1_loss_backward) \
+_(aten, soft_margin_loss) \
+_(aten, soft_margin_loss_backward) \
+_(aten, softmax) \
+_(aten, softplus) \
+_(aten, softplus_backward) \
+_(aten, softshrink) \
+_(aten, softshrink_backward) \
+_(aten, sort) \
+_(aten, sparse_bsc_tensor) \
+_(aten, sparse_bsr_tensor) \
+_(aten, sparse_compressed_tensor) \
+_(aten, sparse_coo_tensor) \
+_(aten, sparse_csc_tensor) \
+_(aten, sparse_csr_tensor) \
+_(aten, sparse_dim) \
+_(aten, sparse_mask) \
+_(aten, sparse_resize) \
+_(aten, sparse_resize_) \
+_(aten, sparse_resize_and_clear) \
+_(aten, sparse_resize_and_clear_) \
+_(aten, sparse_sampled_addmm) \
+_(aten, special_airy_ai) \
+_(aten, special_bessel_j0) \
+_(aten, special_bessel_j1) \
+_(aten, special_bessel_y0) \
+_(aten, special_bessel_y1) \
+_(aten, special_chebyshev_polynomial_t) \
+_(aten, special_chebyshev_polynomial_u) \
+_(aten, special_chebyshev_polynomial_v) \
+_(aten, special_chebyshev_polynomial_w) \
+_(aten, special_digamma) \
+_(aten, special_entr) \
+_(aten, special_erf) \
+_(aten, special_erfc) \
+_(aten, special_erfcx) \
+_(aten, special_erfinv) \
+_(aten, special_exp2) \
+_(aten, special_expit) \
+_(aten, special_expm1) \
+_(aten, special_gammainc) \
+_(aten, special_gammaincc) \
+_(aten, special_gammaln) \
+_(aten, special_hermite_polynomial_h) \
+_(aten, special_hermite_polynomial_he) \
+_(aten, special_i0) \
+_(aten, special_i0e) \
+_(aten, special_i1) \
+_(aten, special_i1e) \
+_(aten, special_laguerre_polynomial_l) \
+_(aten, special_legendre_polynomial_p) \
+_(aten, special_log1p) \
+_(aten, special_log_ndtr) \
+_(aten, special_log_softmax) \
+_(aten, special_logit) \
+_(aten, special_logsumexp) \
+_(aten, special_modified_bessel_i0) \
+_(aten, special_modified_bessel_i1) \
+_(aten, special_modified_bessel_k0) \
+_(aten, special_modified_bessel_k1) \
+_(aten, special_multigammaln) \
+_(aten, special_ndtr) \
+_(aten, special_ndtri) \
+_(aten, special_polygamma) \
+_(aten, special_psi) \
+_(aten, special_round) \
+_(aten, special_scaled_modified_bessel_k0) \
+_(aten, special_scaled_modified_bessel_k1) \
+_(aten, special_shifted_chebyshev_polynomial_t) \
+_(aten, special_shifted_chebyshev_polynomial_u) \
+_(aten, special_shifted_chebyshev_polynomial_v) \
+_(aten, special_shifted_chebyshev_polynomial_w) \
+_(aten, special_sinc) \
+_(aten, special_softmax) \
+_(aten, special_spherical_bessel_j0) \
+_(aten, special_xlog1py) \
+_(aten, special_xlogy) \
+_(aten, special_zeta) \
+_(aten, split) \
+_(aten, split_copy) \
+_(aten, split_with_sizes) \
+_(aten, split_with_sizes_copy) \
+_(aten, sqrt) \
+_(aten, sqrt_) \
+_(aten, square) \
+_(aten, square_) \
+_(aten, squeeze) \
+_(aten, squeeze_) \
+_(aten, squeeze_copy) \
+_(aten, sspaddmm) \
+_(aten, stack) \
+_(aten, std) \
+_(aten, std_mean) \
+_(aten, stft) \
+_(aten, stride) \
+_(aten, sub) \
+_(aten, sub_) \
+_(aten, subtract) \
+_(aten, subtract_) \
+_(aten, sum) \
+_(aten, sum_to_size) \
+_(aten, svd) \
+_(aten, swapaxes) \
+_(aten, swapaxes_) \
+_(aten, swapdims) \
+_(aten, swapdims_) \
+_(aten, sym_constrain_range) \
+_(aten, sym_constrain_range_for_size) \
+_(aten, sym_numel) \
+_(aten, sym_size) \
+_(aten, sym_storage_offset) \
+_(aten, sym_stride) \
+_(aten, t) \
+_(aten, t_) \
+_(aten, t_copy) \
+_(aten, take) \
+_(aten, take_along_dim) \
+_(aten, tan) \
+_(aten, tan_) \
+_(aten, tanh) \
+_(aten, tanh_) \
+_(aten, tanh_backward) \
+_(aten, tensor_split) \
+_(aten, tensordot) \
+_(aten, thnn_conv2d) \
+_(aten, threshold) \
+_(aten, threshold_) \
+_(aten, threshold_backward) \
+_(aten, tile) \
+_(aten, to) \
+_(aten, to_dense) \
+_(aten, to_dense_backward) \
+_(aten, to_mkldnn) \
+_(aten, to_mkldnn_backward) \
+_(aten, to_padded_tensor) \
+_(aten, to_sparse) \
+_(aten, to_sparse_bsc) \
+_(aten, to_sparse_bsr) \
+_(aten, to_sparse_csc) \
+_(aten, to_sparse_csr) \
+_(aten, topk) \
+_(aten, trace) \
+_(aten, trace_backward) \
+_(aten, transpose) \
+_(aten, transpose_) \
+_(aten, transpose_copy) \
+_(aten, trapezoid) \
+_(aten, trapz) \
+_(aten, triangular_solve) \
+_(aten, tril) \
+_(aten, tril_) \
+_(aten, tril_indices) \
+_(aten, triplet_margin_loss) \
+_(aten, triu) \
+_(aten, triu_) \
+_(aten, triu_indices) \
+_(aten, true_divide) \
+_(aten, true_divide_) \
+_(aten, trunc) \
+_(aten, trunc_) \
+_(aten, type_as) \
+_(aten, unbind) \
+_(aten, unbind_copy) \
+_(aten, unflatten) \
+_(aten, unflatten_dense_tensors) \
+_(aten, unfold) \
+_(aten, unfold_backward) \
+_(aten, unfold_copy) \
+_(aten, uniform) \
+_(aten, uniform_) \
+_(aten, unique_consecutive) \
+_(aten, unique_dim) \
+_(aten, unique_dim_consecutive) \
+_(aten, unsafe_chunk) \
+_(aten, unsafe_split) \
+_(aten, unsafe_split_with_sizes) \
+_(aten, unsqueeze) \
+_(aten, unsqueeze_) \
+_(aten, unsqueeze_copy) \
+_(aten, upsample_bicubic2d) \
+_(aten, upsample_bicubic2d_backward) \
+_(aten, upsample_bilinear2d) \
+_(aten, upsample_bilinear2d_backward) \
+_(aten, upsample_linear1d) \
+_(aten, upsample_linear1d_backward) \
+_(aten, upsample_nearest1d) \
+_(aten, upsample_nearest1d_backward) \
+_(aten, upsample_nearest2d) \
+_(aten, upsample_nearest2d_backward) \
+_(aten, upsample_nearest3d) \
+_(aten, upsample_nearest3d_backward) \
+_(aten, upsample_trilinear3d) \
+_(aten, upsample_trilinear3d_backward) \
+_(aten, value_selecting_reduction_backward) \
+_(aten, values) \
+_(aten, values_copy) \
+_(aten, vander) \
+_(aten, var) \
+_(aten, var_mean) \
+_(aten, vdot) \
+_(aten, view) \
+_(aten, view_as) \
+_(aten, view_as_complex) \
+_(aten, view_as_complex_copy) \
+_(aten, view_as_real) \
+_(aten, view_as_real_copy) \
+_(aten, view_copy) \
+_(aten, vsplit) \
+_(aten, vstack) \
+_(aten, where) \
+_(aten, xlogy) \
+_(aten, xlogy_) \
+_(aten, zero) \
+_(aten, zero_) \
+_(aten, zeros) \
+_(aten, zeros_like)
+
+#define FORALL_ATTR_BASE_SYMBOLS(_) \
+_(attr, A) \
+_(attr, B) \
+_(attr, C) \
+_(attr, H) \
+_(attr, HxW) \
+_(attr, K) \
+_(attr, L) \
+_(attr, LD) \
+_(attr, LU) \
+_(attr, LU_data) \
+_(attr, LU_pivots) \
+_(attr, M) \
+_(attr, N) \
+_(attr, P) \
+_(attr, Q) \
+_(attr, R) \
+_(attr, S) \
+_(attr, U) \
+_(attr, UPLO) \
+_(attr, V) \
+_(attr, Vh) \
+_(attr, W) \
+_(attr, X) \
+_(attr, a) \
+_(attr, abs) \
+_(attr, accumulate) \
+_(attr, accumulate_matches) \
+_(attr, activation) \
+_(attr, addends) \
+_(attr, adjoint) \
+_(attr, alg_id) \
+_(attr, align_corners) \
+_(attr, allow_tf32) \
+_(attr, alpha) \
+_(attr, amsgrad) \
+_(attr, anchor) \
+_(attr, angle) \
+_(attr, any) \
+_(attr, api_name) \
+_(attr, append) \
+_(attr, approximate) \
+_(attr, arg1) \
+_(attr, arg2) \
+_(attr, arg3) \
+_(attr, arg_out) \
+_(attr, assert_msg) \
+_(attr, assume_unique) \
+_(attr, atol) \
+_(attr, attn_bias) \
+_(attr, attn_mask) \
+_(attr, average_attn_weights) \
+_(attr, averaging_const) \
+_(attr, aweights) \
+_(attr, axis) \
+_(attr, axis0) \
+_(attr, axis1) \
+_(attr, b) \
+_(attr, b_hh) \
+_(attr, b_ih) \
+_(attr, bag_size) \
+_(attr, base) \
+_(attr, batch1) \
+_(attr, batch2) \
+_(attr, batch_dim) \
+_(attr, batch_first) \
+_(attr, batch_size) \
+_(attr, batch_sizes) \
+_(attr, benchmark) \
+_(attr, beta) \
+_(attr, beta1) \
+_(attr, beta2) \
+_(attr, bias) \
+_(attr, bias_defined) \
+_(attr, bias_g) \
+_(attr, bias_requires_grad) \
+_(attr, bias_sizes) \
+_(attr, bidirectional) \
+_(attr, bin_edges) \
+_(attr, bins) \
+_(attr, bit_width) \
+_(attr, blank) \
+_(attr, blocksize) \
+_(attr, boundaries) \
+_(attr, buffer) \
+_(attr, causal_diagonal) \
+_(attr, ccol_indices) \
+_(attr, cdim) \
+_(attr, cdist) \
+_(attr, ceil_mode) \
+_(attr, cell_state_fwd) \
+_(attr, center) \
+_(attr, ch_axis) \
+_(attr, check_errors) \
+_(attr, chunks) \
+_(attr, coalesced) \
+_(attr, coefficients) \
+_(attr, col) \
+_(attr, col_indices) \
+_(attr, col_offsets) \
+_(attr, col_offsets_hh) \
+_(attr, col_offsets_ih) \
+_(attr, compressed_A) \
+_(attr, compressed_idx) \
+_(attr, compressed_indices) \
+_(attr, compressed_indices_dtype) \
+_(attr, compute_log_sumexp) \
+_(attr, compute_mode) \
+_(attr, compute_uv) \
+_(attr, compute_v) \
+_(attr, condition) \
+_(attr, copy) \
+_(attr, correction) \
+_(attr, count) \
+_(attr, count_include_pad) \
+_(attr, counts) \
+_(attr, cpu_dtype) \
+_(attr, cpu_enabled) \
+_(attr, cpu_nested_shape_example) \
+_(attr, create_graph) \
+_(attr, crow_indices) \
+_(attr, cu_seqlens_k) \
+_(attr, cu_seqlens_q) \
+_(attr, cuda_dtype) \
+_(attr, cuda_enabled) \
+_(attr, cudnn_enable) \
+_(attr, cudnn_enabled) \
+_(attr, cum_seq_k) \
+_(attr, cum_seq_q) \
+_(attr, custom_mask_type) \
+_(attr, cx) \
+_(attr, cx_) \
+_(attr, cx_tmp) \
+_(attr, cy) \
+_(attr, cy_) \
+_(attr, d) \
+_(attr, dampening) \
+_(attr, data) \
+_(attr, decimals) \
+_(attr, delta) \
+_(attr, dense) \
+_(attr, dense_B) \
+_(attr, dense_dim) \
+_(attr, density) \
+_(attr, dep_token) \
+_(attr, descending) \
+_(attr, destination) \
+_(attr, deterministic) \
+_(attr, device) \
+_(attr, device_index) \
+_(attr, dgrad_glu) \
+_(attr, diagonal) \
+_(attr, diagonals) \
+_(attr, dilation) \
+_(attr, dim) \
+_(attr, dim0) \
+_(attr, dim1) \
+_(attr, dim2) \
+_(attr, dimension) \
+_(attr, dims) \
+_(attr, dims_other) \
+_(attr, dims_self) \
+_(attr, divisor_override) \
+_(attr, downscale_factor) \
+_(attr, driver) \
+_(attr, dropout) \
+_(attr, dropout_mask) \
+_(attr, dropout_p) \
+_(attr, dropout_seed) \
+_(attr, dropout_state) \
+_(attr, dst) \
+_(attr, dtype) \
+_(attr, dual) \
+_(attr, dummy) \
+_(attr, dx) \
+_(attr, edge_order) \
+_(attr, eigenvalues) \
+_(attr, eigenvectors) \
+_(attr, eigvals) \
+_(attr, eigvecs) \
+_(attr, element) \
+_(attr, elements) \
+_(attr, ellipsis_idx) \
+_(attr, embed_dim) \
+_(attr, end) \
+_(attr, end_dim) \
+_(attr, eps) \
+_(attr, epsilon) \
+_(attr, equal_nan) \
+_(attr, equation) \
+_(attr, exp_avg_sqs) \
+_(attr, exp_avgs) \
+_(attr, expand1) \
+_(attr, expand2) \
+_(attr, expand3) \
+_(attr, exponent) \
+_(attr, exponential_average_factor) \
+_(attr, fake_quant_enabled) \
+_(attr, fake_quant_on) \
+_(attr, ffn_bias_1) \
+_(attr, ffn_bias_2) \
+_(attr, ffn_weight_1) \
+_(attr, ffn_weight_2) \
+_(attr, filename) \
+_(attr, fill_value) \
+_(attr, flat) \
+_(attr, forward) \
+_(attr, found_inf) \
+_(attr, from) \
+_(attr, full) \
+_(attr, full_matrices) \
+_(attr, fuse_transform_0213) \
+_(attr, fweights) \
+_(attr, g) \
+_(attr, gO) \
+_(attr, generator) \
+_(attr, ggI) \
+_(attr, ggW) \
+_(attr, ggb) \
+_(attr, glu) \
+_(attr, grad) \
+_(attr, grad_bias) \
+_(attr, grad_cy) \
+_(attr, grad_factor) \
+_(attr, grad_glu) \
+_(attr, grad_hy) \
+_(attr, grad_in) \
+_(attr, grad_input) \
+_(attr, grad_input_mask) \
+_(attr, grad_out) \
+_(attr, grad_out_) \
+_(attr, grad_output) \
+_(attr, grad_scale) \
+_(attr, grad_w) \
+_(attr, grad_weight) \
+_(attr, grad_x) \
+_(attr, grad_y) \
+_(attr, gradient) \
+_(attr, grads) \
+_(attr, grid) \
+_(attr, group) \
+_(attr, groups) \
+_(attr, growth_interval) \
+_(attr, growth_tracker) \
+_(attr, half_to_float) \
+_(attr, has_bias) \
+_(attr, has_biases) \
+_(attr, hermitian) \
+_(attr, hidden_bias) \
+_(attr, hidden_gates) \
+_(attr, hidden_size) \
+_(attr, high) \
+_(attr, hist) \
+_(attr, hop_length) \
+_(attr, hx) \
+_(attr, hx_) \
+_(attr, hy_) \
+_(attr, i1) \
+_(attr, i2) \
+_(attr, i3) \
+_(attr, ignore_index) \
+_(attr, imag) \
+_(attr, impl_index) \
+_(attr, implicit) \
+_(attr, include_last_offset) \
+_(attr, include_self) \
+_(attr, increasing) \
+_(attr, ind) \
+_(attr, index) \
+_(attr, indexing) \
+_(attr, indices) \
+_(attr, info) \
+_(attr, initial) \
+_(attr, innerKTiles) \
+_(attr, input) \
+_(attr, input1) \
+_(attr, input2) \
+_(attr, input3) \
+_(attr, input_bias) \
+_(attr, input_dtype) \
+_(attr, input_g) \
+_(attr, input_gates) \
+_(attr, input_lengths) \
+_(attr, input_scale) \
+_(attr, input_size) \
+_(attr, input_sizes) \
+_(attr, inputs) \
+_(attr, interpolation) \
+_(attr, interpolation_mode) \
+_(attr, inv_scale) \
+_(attr, inverse) \
+_(attr, invert) \
+_(attr, invstd) \
+_(attr, is_causal) \
+_(attr, is_coalesced) \
+_(attr, is_crow) \
+_(attr, is_first_step) \
+_(attr, is_matrix) \
+_(attr, is_result) \
+_(attr, is_target) \
+_(attr, k) \
+_(attr, keepdim) \
+_(attr, kernel_size) \
+_(attr, key) \
+_(attr, label_smoothing) \
+_(attr, lambd) \
+_(attr, largest) \
+_(attr, last_dim_size) \
+_(attr, layersOutputs) \
+_(attr, layout) \
+_(attr, left) \
+_(attr, length) \
+_(attr, lengths) \
+_(attr, level) \
+_(attr, like) \
+_(attr, list) \
+_(attr, log_alpha) \
+_(attr, log_input) \
+_(attr, log_probs) \
+_(attr, log_target) \
+_(attr, logabsdet) \
+_(attr, logsumexp) \
+_(attr, low) \
+_(attr, lower) \
+_(attr, lr) \
+_(attr, ltm) \
+_(attr, m) \
+_(attr, mantissa) \
+_(attr, margin) \
+_(attr, mask) \
+_(attr, mask_check) \
+_(attr, mask_type) \
+_(attr, masked_grad) \
+_(attr, mat) \
+_(attr, mat1) \
+_(attr, mat2) \
+_(attr, matrices) \
+_(attr, max) \
+_(attr, max_exp_avg_sqs) \
+_(attr, max_k) \
+_(attr, max_norm) \
+_(attr, max_q) \
+_(attr, max_seqlen_k) \
+_(attr, max_seqlen_q) \
+_(attr, max_size) \
+_(attr, max_val) \
+_(attr, max_values) \
+_(attr, maximize) \
+_(attr, maximum_indices) \
+_(attr, maxnorm) \
+_(attr, mean) \
+_(attr, median) \
+_(attr, memory_format) \
+_(attr, meta) \
+_(attr, min) \
+_(attr, min_indices) \
+_(attr, min_val) \
+_(attr, minlength) \
+_(attr, mode) \
+_(attr, momentum) \
+_(attr, momentum_buffer_list) \
+_(attr, n) \
+_(attr, n_bins) \
+_(attr, n_fft) \
+_(attr, names) \
+_(attr, nan) \
+_(attr, need_weights) \
+_(attr, neg_log_likelihood) \
+_(attr, negative) \
+_(attr, negative_slope) \
+_(attr, neginf) \
+_(attr, nested_size) \
+_(attr, nested_strides) \
+_(attr, nesterov) \
+_(attr, new_data) \
+_(attr, nnz) \
+_(attr, noise) \
+_(attr, non_blocking) \
+_(attr, norm) \
+_(attr, norm_bias_1) \
+_(attr, norm_bias_2) \
+_(attr, norm_first) \
+_(attr, norm_type) \
+_(attr, norm_weight_1) \
+_(attr, norm_weight_2) \
+_(attr, normalization) \
+_(attr, normalized) \
+_(attr, normalized_shape) \
+_(attr, nt_example) \
+_(attr, num_chunks) \
+_(attr, num_classes) \
+_(attr, num_generated) \
+_(attr, num_groups) \
+_(attr, num_head) \
+_(attr, num_heads) \
+_(attr, num_layers) \
+_(attr, num_parallel) \
+_(attr, num_samples) \
+_(attr, num_splits_key) \
+_(attr, num_weights) \
+_(attr, numel) \
+_(attr, observer_on) \
+_(attr, offset) \
+_(attr, offset2bag) \
+_(attr, offsets) \
+_(attr, onesided) \
+_(attr, ord) \
+_(attr, order) \
+_(attr, other) \
+_(attr, out) \
+_(attr, out0) \
+_(attr, out1) \
+_(attr, out2) \
+_(attr, out3) \
+_(attr, out4) \
+_(attr, out5) \
+_(attr, out6) \
+_(attr, out_amax) \
+_(attr, out_dim) \
+_(attr, out_dtype) \
+_(attr, out_int32) \
+_(attr, outdim) \
+_(attr, output) \
+_(attr, output_mask) \
+_(attr, output_padding) \
+_(attr, output_scale) \
+_(attr, output_size) \
+_(attr, output_zero_point) \
+_(attr, p) \
+_(attr, packed) \
+_(attr, packed_hh) \
+_(attr, packed_ih) \
+_(attr, packed_weight) \
+_(attr, pad) \
+_(attr, pad_mode) \
+_(attr, padded) \
+_(attr, padding) \
+_(attr, padding_idx) \
+_(attr, padding_mode) \
+_(attr, padding_value) \
+_(attr, params) \
+_(attr, path) \
+_(attr, pdist) \
+_(attr, per_row_fake_quant) \
+_(attr, per_sample_weights) \
+_(attr, periodic) \
+_(attr, philox_offset) \
+_(attr, philox_seed) \
+_(attr, physical_layout) \
+_(attr, pin_memory) \
+_(attr, pivot) \
+_(attr, pivots) \
+_(attr, plain_idx) \
+_(attr, plain_indices) \
+_(attr, pos_weight) \
+_(attr, posinf) \
+_(attr, positive) \
+_(attr, pow) \
+_(attr, prepend) \
+_(attr, primal) \
+_(attr, prob) \
+_(attr, proj_bias) \
+_(attr, proj_size) \
+_(attr, proj_weight) \
+_(attr, q) \
+_(attr, qGroupSize) \
+_(attr, qScaleAndZeros) \
+_(attr, qkv) \
+_(attr, qkv_bias) \
+_(attr, qkv_weight) \
+_(attr, qtensor) \
+_(attr, quant_max) \
+_(attr, quant_min) \
+_(attr, quasi) \
+_(attr, query) \
+_(attr, r) \
+_(attr, ragged_idx) \
+_(attr, random_samples) \
+_(attr, range) \
+_(attr, rank) \
+_(attr, ratio) \
+_(attr, rcond) \
+_(attr, real) \
+_(attr, reduce) \
+_(attr, reduce_range) \
+_(attr, reduction) \
+_(attr, repeats) \
+_(attr, replacement) \
+_(attr, requires_grad) \
+_(attr, reserve) \
+_(attr, reserveSpace) \
+_(attr, reservedSpace) \
+_(attr, residuals) \
+_(attr, result) \
+_(attr, retain_graph) \
+_(attr, return_complex) \
+_(attr, return_counts) \
+_(attr, return_debug_mask) \
+_(attr, return_inverse) \
+_(attr, reverse) \
+_(attr, right) \
+_(attr, rounding_mode) \
+_(attr, row) \
+_(attr, row_indices) \
+_(attr, rstd) \
+_(attr, rtol) \
+_(attr, running_max) \
+_(attr, running_mean) \
+_(attr, running_min) \
+_(attr, running_var) \
+_(attr, s) \
+_(attr, save_invstd) \
+_(attr, save_mean) \
+_(attr, save_var) \
+_(attr, save_var_transform) \
+_(attr, saved_g) \
+_(attr, saved_norms) \
+_(attr, saved_v) \
+_(attr, scalar) \
+_(attr, scalar1) \
+_(attr, scalar2) \
+_(attr, scalars) \
+_(attr, scale) \
+_(attr, scale_a) \
+_(attr, scale_b) \
+_(attr, scale_backoff_factor) \
+_(attr, scale_factors) \
+_(attr, scale_grad_by_freq) \
+_(attr, scale_growth_factor) \
+_(attr, scale_hh) \
+_(attr, scale_ih) \
+_(attr, scale_result) \
+_(attr, scales) \
+_(attr, scales_d) \
+_(attr, scales_h) \
+_(attr, scales_w) \
+_(attr, sections) \
+_(attr, seed) \
+_(attr, self) \
+_(attr, self_is_result) \
+_(attr, self_num_batch_dims) \
+_(attr, self_or_result) \
+_(attr, self_sizes) \
+_(attr, seqlen_k) \
+_(attr, sequences) \
+_(attr, shape) \
+_(attr, shared) \
+_(attr, shifts) \
+_(attr, side) \
+_(attr, sigma) \
+_(attr, sign) \
+_(attr, singular_values) \
+_(attr, size) \
+_(attr, sizes) \
+_(attr, skip_first) \
+_(attr, sobolstate) \
+_(attr, solution) \
+_(attr, some) \
+_(attr, sorted) \
+_(attr, sorted_sequence) \
+_(attr, sorter) \
+_(attr, source) \
+_(attr, spacing) \
+_(attr, sparse) \
+_(attr, sparse_dim) \
+_(attr, sparse_grad) \
+_(attr, split_size) \
+_(attr, split_sizes) \
+_(attr, src) \
+_(attr, stable) \
+_(attr, start) \
+_(attr, start_dim) \
+_(attr, state_steps) \
+_(attr, std) \
+_(attr, step) \
+_(attr, steps) \
+_(attr, storage_offset) \
+_(attr, stride) \
+_(attr, sum_dy) \
+_(attr, sum_dy_xmu) \
+_(attr, sumdim) \
+_(attr, swap) \
+_(attr, symmetric_quant) \
+_(attr, t) \
+_(attr, tangent) \
+_(attr, target) \
+_(attr, target_lengths) \
+_(attr, targets) \
+_(attr, tau) \
+_(attr, tensor) \
+_(attr, tensor1) \
+_(attr, tensor2) \
+_(attr, tensor_indices_or_sections) \
+_(attr, tensors) \
+_(attr, tensors1) \
+_(attr, test_element) \
+_(attr, test_elements) \
+_(attr, the_template) \
+_(attr, theta) \
+_(attr, threshold) \
+_(attr, to) \
+_(attr, tol) \
+_(attr, total) \
+_(attr, total_length) \
+_(attr, total_weight) \
+_(attr, train) \
+_(attr, training) \
+_(attr, transpose) \
+_(attr, transpose_result) \
+_(attr, transposed) \
+_(attr, type1) \
+_(attr, type2) \
+_(attr, unbiased) \
+_(attr, unitriangular) \
+_(attr, unpack_data) \
+_(attr, unpack_pivots) \
+_(attr, unroll_dim) \
+_(attr, unsafe) \
+_(attr, upper) \
+_(attr, upscale_factor) \
+_(attr, use_fast_accum) \
+_(attr, use_gelu) \
+_(attr, use_input_stats) \
+_(attr, v) \
+_(attr, value) \
+_(attr, values) \
+_(attr, var) \
+_(attr, vec) \
+_(attr, vec1) \
+_(attr, vec2) \
+_(attr, w_hh) \
+_(attr, w_ih) \
+_(attr, weight) \
+_(attr, weight0) \
+_(attr, weight1) \
+_(attr, weight2) \
+_(attr, weight3) \
+_(attr, weight4) \
+_(attr, weight_arr) \
+_(attr, weight_buf) \
+_(attr, weight_decay) \
+_(attr, weight_g) \
+_(attr, weight_scale) \
+_(attr, weight_stride0) \
+_(attr, weight_zero_point) \
+_(attr, weights) \
+_(attr, win_length) \
+_(attr, window) \
+_(attr, window_length) \
+_(attr, with_replacement) \
+_(attr, workspace) \
+_(attr, wrap) \
+_(attr, x) \
+_(attr, x1) \
+_(attr, x2) \
+_(attr, y) \
+_(attr, z) \
+_(attr, z_state) \
+_(attr, zero_infinity) \
+_(attr, zero_point) \
+_(attr, zero_point_hh) \
+_(attr, zero_point_ih) \
+_(attr, zero_points)
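+
+// The FORALL_* lists above follow the X-macro pattern: a consumer supplies
+// the `_` macro and the list stamps out one declaration per entry. A minimal
+// sketch, assuming a hypothetical consumer macro MY_DECLARE (illustrative
+// only, not part of this header):
+//
+//   #define MY_DECLARE(ns, s) constexpr const char* ns##_##s = #ns "::" #s;
+//   FORALL_ATTR_BASE_SYMBOLS(MY_DECLARE)  // defines attr_A, attr_B, ...
+//   #undef MY_DECLARE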
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/blob.h b/MLPY/Lib/site-packages/torch/include/ATen/core/blob.h
new file mode 100644
index 0000000000000000000000000000000000000000..7aa52ea67a6b52894d30f2c020f4f64952ad7af6
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/blob.h
@@ -0,0 +1,208 @@
+#pragma once
+
+#include <cstddef>
+#include <sstream>
+#include <type_traits>
+#include <typeinfo>
+#include <vector>
+
+#include <c10/util/intrusive_ptr.h>
+#include <c10/util/typeid.h>
+#include <c10/macros/Macros.h>
+
+namespace caffe2 {
+
+class Tensor;
+
+/**
+ * @brief Blob is a general container that hosts a typed pointer.
+ *
+ * A Blob hosts a pointer as well as its type, and takes charge of deleting it
+ * properly when the blob is deallocated or re-allocated with a new type. A blob
+ * could contain anything, although the most common case is to contain a Tensor.
+ */
+class TORCH_API Blob final : public c10::intrusive_ptr_target {
+ public:
+  /**
+   * Initializes an empty Blob.
+   */
+  Blob() noexcept : meta_(), pointer_(nullptr), has_ownership_(false) {}
+  ~Blob() override {
+    Reset();
+  }
+
+  Blob(Blob&& other) noexcept : Blob() {
+    swap(other);
+  }
+
+  Blob& operator=(Blob&& other) noexcept {
+    Blob(std::move(other)).swap(*this);
+    return *this;
+  }
+
+  /**
+   * Checks if the content stored in the blob is of type T.
+   */
+  template <class T>
+  bool IsType() const noexcept {
+    return meta_.Match<T>();
+  }
+
+  /**
+   * Returns the meta info of the blob.
+   */
+  const TypeMeta meta() const noexcept {
+    return meta_;
+  }
+
+  /**
+   * Returns a printable typename of the blob.
+   */
+  c10::string_view TypeName() const noexcept {
+    return meta_.name();
+  }
+
+  /**
+   * @brief Gets the const reference of the stored object. The code checks if
+   * the stored object is of the desired type.
+   */
+  // TODO(jerryzh): add a Get(c10::DeviceType) function?
+  template <class T>
+  const T& Get() const {
+    TORCH_INTERNAL_ASSERT(
+        IsType<T>(),
+        "wrong type for the Blob instance. Blob contains ",
+        meta_.name(),
+        " while caller expects ",
+        TypeMeta::TypeName<T>());
+    // TODO: after we add Get(c10::DeviceType)
+    // and changed all the callsites, we can add
+    // a static assert here to enforce T != Tensor
+    return *static_cast<const T*>(pointer_);
+  }
+
+  const void* GetRaw() const noexcept {
+    return pointer_;
+  }
+  void* GetRaw() noexcept {
+    return pointer_;
+  }
+
+  /**
+   * @brief Gets a mutable pointer to the stored object.
+   *
+   * If the current object is not of the right type, a new object is created
+   * and the old object is freed. Note that type T should have a default
+   * constructor. Otherwise, create the object yourself first, and use
+   * Reset().
+   */
+  template <class T>
+  T* GetMutable() {
+    static_assert(
+        std::is_default_constructible<T>::value,
+        "GetMutable can't be called with non-default-constructible types. "
+        "Try using specialized methods");
+    if (IsType<T>()) {
+      return static_cast<T*>(pointer_);
+    } else {
+      // TODO Re-enable logging
+      // VLOG(1) << "Create new mutable object " << TypeMeta::TypeName<T>();
+      return Reset(new T());
+    }
+  }
+
+  template <class T>
+  T* GetMutableOrNull() {
+    if (IsType<T>()) {
+      return static_cast<T*>(pointer_);
+    } else {
+      return nullptr;
+    }
+  }
+
+  /**
+   * Sets the underlying object to the allocated one. The Blob then takes over
+   * the ownership of the passed in pointer. If there is already an object in
+   * the Blob, the old object is freed.
+   *
+   * This is used when the underlying class T does not have a default ctor, or
+   * complex initializations needs to be done outside the blob.
+   */
+  template <class T>
+  T* Reset(T* allocated) {
+    free_();
+    meta_ = TypeMeta::Make<T>();
+    pointer_ = static_cast<void*>(allocated);
+    has_ownership_ = true;
+    return allocated;
+  }
+
+  /**
+   * Sets the underlying object to the allocated one, but does not take over
+   * the ownership of the passed in pointer. If there is already an object in
+   * the Blob, the old object is freed.
+   *
+   * Unlike Reset, this does not take over the ownership of the pointer and the
+   * caller is responsible for making sure that the lifetime of the allocated
+   * blob outlasts the lifetime of any access to this blob, until another Reset
+   * call is made or the blob is destructed.
+   */
+  template <class T>
+  typename std::remove_const<T>::type* ShareExternal(
+      typename std::remove_const<T>::type* allocated) {
+    return static_cast<T*>(ShareExternal(
+        static_cast<void*>(allocated),
+        TypeMeta::Make<typename std::remove_const<T>::type>()));
+  }
+
+  void* ShareExternal(void* allocated, const TypeMeta meta) {
+    free_();
+    meta_ = meta;
+    pointer_ = allocated;
+    has_ownership_ = false;
+    return allocated;
+  }
+
+  /**
+   * Resets the Blob to an empty one.
+   */
+  void Reset() {
+    free_();
+    pointer_ = nullptr;
+    meta_ = TypeMeta();
+    has_ownership_ = false;
+  }
+
+  /**
+   * @brief Swaps the underlying storage of two blobs.
+   */
+  void swap(Blob& rhs) {
+    using std::swap;
+    swap(meta_, rhs.meta_);
+    swap(pointer_, rhs.pointer_);
+    swap(has_ownership_, rhs.has_ownership_);
+  }
+
+ private:
+  void free_() {
+    if (has_ownership_ && pointer_ != nullptr) {
+      (*meta_.deleteFn())(pointer_);
+    }
+  }
+
+  TypeMeta meta_;
+  void* pointer_;
+  bool has_ownership_;
+
+  C10_DISABLE_COPY_AND_ASSIGN(Blob);
+};
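+
+// A minimal usage sketch of the Blob API above, assuming a hypothetical
+// default-constructible payload type MyPayload (illustrative only):
+//
+//   caffe2::Blob blob;
+//   MyPayload* p = blob.GetMutable<MyPayload>();  // creates and owns a MyPayload
+//   if (blob.IsType<MyPayload>()) {
+//     const MyPayload& ref = blob.Get<MyPayload>();
+//   }
+//   blob.Reset(new MyPayload());  // replaces the payload; Blob owns the new pointer
+//   blob.Reset();                 // frees the payload and empties the blob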
+
+inline void swap(Blob& lhs, Blob& rhs) {
+  lhs.swap(rhs);
+}
+
+inline std::ostream& operator<<(std::ostream& out, const Blob& v) {
+  return out << "Blob[" << v.TypeName() << "]";
+}
+
+} // namespace caffe2
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/boxing/BoxedKernel.h b/MLPY/Lib/site-packages/torch/include/ATen/core/boxing/BoxedKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..b0adf28a2f1937cc728a9efe484cbd47d7937b87
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/boxing/BoxedKernel.h
@@ -0,0 +1,176 @@
+#pragma once
+
+#include <ATen/core/boxing/OperatorKernel.h>
+#include <c10/core/DispatchKeySet.h>
+#include <c10/util/intrusive_ptr.h>
+
+namespace c10 {
+
+struct IValue;
+using Stack = std::vector<IValue>;
+
+class OperatorHandle;
+class KernelFunction;
+
+// This kernel implements the behavior of falling through to the next available
+// registered dispatch key.  The implementation of this function is FAST; it is
+// no overhead to fallthrough to the next key.  See cpp file for some more
+// implementation notes; notably, this does NOT actually go through the
+// boxing/unboxing codepath.
+TORCH_API void fallthrough_kernel(OperatorKernel*, const OperatorHandle&, DispatchKeySet, Stack*);
+
+// Note [Ambiguity in AutogradOther kernel]
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// This error-reporting kernel is registered to the AutogradOther entry in the
+// dispatch table when there is both a CompositeImplicitAutograd kernel and a
+// backend kernel for ANY backend that maps to AutogradOther.  To see why
+// this is necessary in the AutogradOther case, it's helpful to first see
+// why everything works out fine for a backend that has a reserved Autograd
+// entry (see rule 2.2 in [Note] DispatchTable computation):
+//
+//    CPU   AutogradCPU
+//    reg?  registers with...
+//    -------------------------------------------------
+//    y     Autograd registration takes precedence
+//          over CompositeImplicitAutograd.
+//          This is good, because the CPU specific backend
+//          implementation is more specialized and typically better;
+//          if we used the composite, we would bypass it.
+//          (NB: the Autograd key is guaranteed to exist because
+//          the autograd codegen requires it!)
+//
+//    n     CompositeImplicitAutograd takes precedence.
+//          This is also good, because the Autograd
+//          registration (if it exists) would try to redispatch
+//          to the (non-existent) CPU implementation; by
+//          using the composite, we ensure the operator
+//          actually works.
+//
+// As you can see, when we have a specific Autograd key (AutogradCPU), we can
+// decide whether or not to use the CompositeImplicitAutograd kernel or the
+// Autograd kernel based on whether or not the backend kernel exists.
+//
+// However, for AutogradOther (which is the catchall autograd kernel for
+// everything that doesn't have a specific Autograd key), we can't do this
+// trick because there isn't any unique backend to peek at to disambiguate;
+// if there are some backends that have implementations they prefer Autograd,
+// but unimplemented backends would prefer CompositeImplicitAutograd.  Rather
+// than arbitrarily pick one or the other, we just register a kernel that raises
+// an error and let the user decide how to proceed.
+TORCH_API void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle&, DispatchKeySet, Stack*);
+
+// Note [named_not_supported_kernel]
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// This kernel implements reporting an error message saying that named tensor is
+// not supported.  This kernel doesn't rely on the Stack, and so it is special
+// cased in the dispatcher to be triggered before we attempt boxing (so we can
+// give a good error message in cases when boxing is not supported).  When
+// boxing is universally supported this can be removed.
+[[noreturn]] TORCH_API void named_not_supported_kernel(OperatorKernel*, const OperatorHandle&, DispatchKeySet, Stack*);
+
+/**
+ * BoxedKernel is similar to a std::function storing a boxed kernel.
+ */
+class TORCH_API BoxedKernel final {
+public:
+  // This is how boxed kernels are actually stored
+  //
+  // Note [Plumbing Keys Through The Dispatcher]
+  // Benchmarks have shown that it is expensive for the dispatcher to read from thread-local storage (TLS)
+  // upon every dispatch call into order to compute which kernel to dispatch to.
+  //
+  // To mitigate this, we've updated the calling convention inside the dispatcher to expect every kernel that it stores
+  // to have a first argument of type DispatchKeySet.
+  //
+  // What are the invariants of the DispatchKeySet when it gets passed to a kernel?
+  // - All keys to the left of the current dispatch key have been masked out.
+  //   (e.g. a Tracing kernel that takes in the DispatchKeySet will expect the highest bit to be DispatchKey::Tracer)
+  // - All other keys that dispatcher normally would have computed through TLS + global state + op arguments
+  //   are still in the set.
+  //
+  // Kernels can then opt into using this keyset to save the dispatcher from doing repeated work during redispatches:
+  // recalculating the highest-priority dispatch key, which involves reading from TLS. Instead, the kernels that opt in will
+  // calculate an updated DispatchKeySet directly from the old one, and pass the updated set directly into the dispatcher
+  // upon redispatching.
+  //
+  // This is an opt-in mechanism: Kernels can automatically opt in by setting the first argument in their signature
+  // to be of type DispatchKeySet. See the kernels in VariableTypeEverything.cpp and TraceTypeEverything.cpp for examples.
+  //
+  // The mechanism for optionally passing that DispatchKeySet into the kernel lives in make_boxed_from_unboxed_functor.h.
+  // See Note [Plumbing Keys Through The Dispatcher 2] for details.
+  using InternalBoxedKernelFunction = void(OperatorKernel*, const OperatorHandle&, DispatchKeySet, Stack*);
+  // This is the public API for how boxed kernels are defined
+  using BoxedKernelFunction = void(const OperatorHandle&, Stack*);
+  using BoxedKernelFunction_withDispatchKeys = void(const OperatorHandle&, DispatchKeySet, Stack*);
+
+  BoxedKernel();
+
+  // Fast path for dispatch to allow not touching the boxed kernel in
+  // the common case where unboxed is available.
+  bool isValid() const;
+  bool isFallthrough() const;
+
+  /**
+   * Call the function with boxed arguments.
+   */
+  void callBoxed(const OperatorHandle& opHandle, DispatchKeySet dispatchKeySet, Stack* stack) const;
+
+  /**
+   * Create a KernelFunction from a boxed function.
+   *
+   * Example:
+   *
+   * > void boxed_func(OperatorKernel*, Stack* stack) {...}
+   * > BoxedFunction func = BoxedKernel::makeFromFunction<&boxed_func>();
+   */
+  template<BoxedKernelFunction* func>
+  static BoxedKernel makeFromFunction();
+
+  /**
+   * TODO: This will only be useful if we write a backend fallback that plumbs dispatch keys (currently there are none)
+   * See Note [Plumbing Keys Through The Dispatcher] for details.
+   */
+  template<BoxedKernelFunction_withDispatchKeys* func>
+  static BoxedKernel makeFromFunction();
+
+  /**
+   * Create a KernelFunction from a boxed functor.
+   *
+   * Example:
+   *
+   * > class MyFunctor final : public c10::OperatorKernel {
+   * >   public:
+   * >     void operator()(const OperatorHandle&, DispatchKeySet, Stack*) {...}
+   * > };
+   * > BoxedKernel func = BoxedKernel::makeFromFunctor(std::make_unique<MyFunctor>());
+   */
+  template<class KernelFunctor>
+  static BoxedKernel makeFromFunctor(std::unique_ptr<KernelFunctor> kernelFunctor);
+
+
+  static BoxedKernel makeFallthrough();
+  static BoxedKernel makeAmbiguousAutogradOther();
+  static BoxedKernel makeNamedNotSupported();
+
+private:
+
+  friend class KernelFunction;
+
+  template<BoxedKernelFunction* func>
+  static void make_boxed_function(OperatorKernel*, const OperatorHandle& opHandle, DispatchKeySet, Stack* stack);
+
+  template<BoxedKernelFunction_withDispatchKeys* func>
+  static void make_boxed_function(OperatorKernel*, const OperatorHandle& opHandle, DispatchKeySet, Stack* stack);
+
+  explicit BoxedKernel(std::unique_ptr<OperatorKernel> functor, InternalBoxedKernelFunction* boxed_kernel_func);
+
+  OperatorKernel* getFunctor() const;
+  InternalBoxedKernelFunction* getFnPtr() const;
+
+  c10::intrusive_ptr<OperatorKernel> functor_;
+  InternalBoxedKernelFunction* boxed_kernel_func_;
+};
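+
+// A minimal sketch of defining and wrapping a boxed kernel with the API
+// above, assuming a hypothetical kernel name my_boxed_kernel (illustrative
+// only; the op-specific stack handling is elided):
+//
+//   void my_boxed_kernel(const c10::OperatorHandle& op, c10::Stack* stack) {
+//     // pop the arguments (IValues) off the stack, compute, and push the
+//     // results back in schema order
+//   }
+//   auto kernel = c10::BoxedKernel::makeFromFunction<&my_boxed_kernel>();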
+
+}  // namespace c10
+
+#include <ATen/core/boxing/BoxedKernel_impl.h>
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/boxing/BoxedKernel_impl.h b/MLPY/Lib/site-packages/torch/include/ATen/core/boxing/BoxedKernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..4520f0de4585bf50b2b786396afbec3bfa9ff782
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/boxing/BoxedKernel_impl.h
@@ -0,0 +1,99 @@
+#pragma once
+
+namespace c10 {
+
+inline BoxedKernel::BoxedKernel()
+    : functor_()
+, boxed_kernel_func_(nullptr)
+{}
+
+inline BoxedKernel::BoxedKernel(std::unique_ptr<OperatorKernel> functor, InternalBoxedKernelFunction* boxed_kernel_func)
+: functor_(std::move(functor))
+, boxed_kernel_func_(boxed_kernel_func)
+{}
+
+template<BoxedKernel::BoxedKernelFunction* func>
+inline void BoxedKernel::make_boxed_function(OperatorKernel*, const OperatorHandle& opHandle, DispatchKeySet, Stack* stack) {
+    // Note that we're dropping the DispatchKeySet argument.
+    // See Note [Plumbing Keys Through The Dispatcher 2] for details.
+    func(opHandle, stack);
+}
+
+template<BoxedKernel::BoxedKernelFunction_withDispatchKeys* func>
+inline void BoxedKernel::make_boxed_function(OperatorKernel*, const OperatorHandle& opHandle, DispatchKeySet ks, Stack* stack) {
+    // See Note [Plumbing Keys Through The Dispatcher 2] for details.
+    func(opHandle, ks, stack);
+}
+
+inline bool BoxedKernel::isValid() const {
+    return boxed_kernel_func_ != nullptr;
+}
+
+inline bool BoxedKernel::isFallthrough() const {
+    return boxed_kernel_func_ == &fallthrough_kernel;
+}
+
+inline void BoxedKernel::callBoxed(const OperatorHandle& opHandle, DispatchKeySet dispatchKeySet, Stack* stack) const {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+        boxed_kernel_func_ != nullptr,
+        "Tried to call BoxedKernel::callBoxed() on an uninitialized BoxedKernel."
+    );
+    (*boxed_kernel_func_)(functor_.get(), opHandle, dispatchKeySet, stack);
+}
+
+template<BoxedKernel::BoxedKernelFunction* func>
+inline BoxedKernel BoxedKernel::makeFromFunction() {
+    return BoxedKernel(
+        nullptr,  // no functor_ object
+        &make_boxed_function<func>
+    );
+}
+
+template<BoxedKernel::BoxedKernelFunction_withDispatchKeys* func>
+inline BoxedKernel BoxedKernel::makeFromFunction() {
+    return BoxedKernel(
+        nullptr,  // no functor_ object
+        &make_boxed_function<func>
+    );
+}
+
+inline BoxedKernel BoxedKernel::makeFallthrough() {
+    return BoxedKernel(
+        nullptr,  // no functor_ object
+        &fallthrough_kernel
+    );
+}
+
+inline BoxedKernel BoxedKernel::makeAmbiguousAutogradOther() {
+    return BoxedKernel(
+        nullptr,  // no functor_ object
+        &ambiguous_autogradother_kernel
+    );
+}
+
+inline BoxedKernel BoxedKernel::makeNamedNotSupported() {
+    return BoxedKernel(
+        nullptr,  // no functor_ object
+        &named_not_supported_kernel
+    );
+}
+
+template<class KernelFunctor>
+inline BoxedKernel BoxedKernel::makeFromFunctor(std::unique_ptr<KernelFunctor> kernelFunctor) {
+    static_assert(std::is_base_of<OperatorKernel, KernelFunctor>::value, "Tried to call BoxedKernel::makeFromFunctor, but the functor doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it.");
+    return BoxedKernel(
+        std::move(kernelFunctor),
+        [](OperatorKernel* kernel, const OperatorHandle& op, DispatchKeySet ks, Stack* stack) {
+          (*static_cast<KernelFunctor*>(kernel))(op, ks, stack);
+        }
+    );
+}
+
+inline OperatorKernel* BoxedKernel::getFunctor() const {
+  return functor_.get();
+}
+inline BoxedKernel::InternalBoxedKernelFunction* BoxedKernel::getFnPtr() const {
+  return boxed_kernel_func_;
+}
+
+}  // namespace c10
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/boxing/KernelFunction.h b/MLPY/Lib/site-packages/torch/include/ATen/core/boxing/KernelFunction.h
new file mode 100644
index 0000000000000000000000000000000000000000..41d9467e03d561c6ad46d0ff2e9d095a76b12ef3
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/boxing/KernelFunction.h
@@ -0,0 +1,260 @@
+#pragma once
+
+#include <ATen/core/ATen_fwd.h>
+#include <ATen/core/boxing/BoxedKernel.h>
+#include <ATen/core/stack.h>
+#include <c10/core/DispatchKeySet.h>
+#include <c10/util/intrusive_ptr.h>
+#include <c10/util/TypeList.h>
+#include <type_traits>
+
+namespace c10 {
+
+using Stack = torch::jit::Stack; // TODO Instead of this, move torch::jit::Stack to the c10 namespace.
+
+class OperatorHandle;
+struct OperatorKernel;
+class KernelFunction;
+
+template <typename T>
+using has_symint =
+  std::disjunction<
+    std::is_same<c10::SymInt, T>,
+    std::is_same<c10::SymIntArrayRef, T>,
+    std::is_same<at::OptionalSymIntArrayRef, T>,
+    std::is_same<c10::optional<c10::SymInt>, T>
+  >;
+
+template <typename T>
+struct remove_symint {
+  using type = T;
+};
+
+template <>
+struct remove_symint<c10::SymInt> {
+  using type = int64_t;
+};
+
+template <>
+struct remove_symint<at::OptionalSymIntArrayRef> {
+  using type = OptionalIntArrayRef;
+};
+
+template <>
+struct remove_symint<c10::SymIntArrayRef> {
+  using type = c10::IntArrayRef;
+};
+
+template <>
+struct remove_symint<c10::optional<c10::SymInt>> {
+  using type = c10::optional<int64_t>;
+};
+
+
+template <bool symint, typename T>
+struct maybe_keep_symint final {};
+
+template <typename T>
+struct maybe_keep_symint<true, T> { using type = T; };
+
+template <typename T>
+struct maybe_keep_symint<false, T> { using type = typename remove_symint<T>::type; };
+
+template <typename FuncType>
+using fn_has_symint = typename guts::typelist::true_for_any_type<
+  has_symint,
+  typename guts::infer_function_traits<FuncType>::type::parameter_types
+>;
+
+template <typename FuncType>
+struct fn_remove_symint;
+
+template <typename Ret, typename... Args>
+struct fn_remove_symint<Ret(Args...)> {
+  using type = Ret(typename remove_symint<Args>::type...);
+};
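+
+// Illustrative examples of how the traits above rewrite SymInt-carrying
+// signatures into their plain-int counterparts (sketch, not exhaustive):
+//   has_symint<c10::SymInt>::value                        == true
+//   has_symint<int64_t>::value                            == false
+//   remove_symint<c10::SymIntArrayRef>::type              == c10::IntArrayRef
+//   fn_remove_symint<void(c10::SymInt, at::Tensor)>::type == void(int64_t, at::Tensor)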
+
+/**
+ * KernelFunction is similar to std::function but stores a kernel function.
+ * You can create a KernelFunction from a boxed or unboxed function/functor/lambda
+ * and call it in a boxed or unboxed way. If the way it was created doesn't
+ * match the way it was called, it will do boxing or unboxing as necessary.
+ */
+class TORCH_API KernelFunction final {
+public:
+  using InternalBoxedKernelFunction = BoxedKernel::InternalBoxedKernelFunction;
+  using BoxedKernelFunction = BoxedKernel::BoxedKernelFunction;
+  using BoxedKernelFunction_withDispatchKeys = BoxedKernel::BoxedKernelFunction_withDispatchKeys;
+
+  KernelFunction();
+
+  // Fast path for dispatch to allow not touching the boxed kernel in
+  // the common case where unboxed is available.
+  bool isValidUnboxed() const;
+  bool isValidSymUnboxed() const;
+  bool isValid() const;
+  bool isFallthrough() const;
+
+  /**
+   * Call the function in a boxed way.
+   * If the kernel function was created with an unboxed function,
+   * this will call an unboxing wrapper which then calls into that
+   * unboxed function.
+   *
+   * Example:
+   *
+   * > void boxed_func(OperatorKernel*, Stack* stack) {...}
+   * > KernelFunction func = KernelFunction::makeFromBoxedFunction(&boxed_func);
+   * > Tensor result = func.callBoxed(stack);
+   *
+   * Or, with an unboxed implementation:
+   *
+   * > KernelFunction func = KernelFunction::makeFromUnboxedLambda(
+   * >      [] (Tensor a, bool b) -> Tensor {...});
+   * > Tensor result = func.callBoxed(stack);
+   */
+  void callBoxed(const OperatorHandle& opHandle, DispatchKeySet dispatchKeySet, Stack* stack) const;
+
+  /**
+   * Call the function in an unboxed way.
+   * If the kernel function was created with a boxed function,
+   * this will box all inputs and then call into that boxed function.
+   *
+   * Note that this doesn't work for all types yet.
+   *
+   * Example:
+   *
+   * > KernelFunction func = KernelFunction::makeFromUnboxedLambda(
+   * >      [] (Tensor a, bool b) -> Tensor {...});
+   * > Tensor result = func.call(tensor1, true);
+   *
+   * Or, with a boxed implementation:
+   *
+   * > void boxed_func(OperatorKernel*, Stack* stack) {...}
+   * > KernelFunction func = KernelFunction::makeFromBoxedFunction(&boxed_func);
+   * > Tensor result = func.call(tensor1, true);
+   */
+  template<class Return, class... Args>
+  Return call(const OperatorHandle& opHandle, DispatchKeySet dispatchKeySet, Args... args) const;
+
+  /**
+   * Create a KernelFunction from a BoxedKernel.
+   */
+  static KernelFunction makeFromBoxedKernel(BoxedKernel boxed_fn);
+
+  /**
+   * Create a KernelFunction from a boxed function.
+   *
+   * Example:
+   *
+   * > void boxed_func(OperatorKernel*, Stack* stack) {...}
+   * > KernelFunction func = KernelFunction::makeFromBoxedFunction<&boxed_func>();
+   */
+  template<BoxedKernelFunction* func>
+  static KernelFunction makeFromBoxedFunction();
+
+  /**
+   * TODO: This will only be useful if we write a backend fallback that plumbs dispatch keys (currently there are none)
+   * See Note [Plumbing Keys Through The Dispatcher] for details.
+   */
+  template<BoxedKernelFunction_withDispatchKeys* func>
+  static KernelFunction makeFromBoxedFunction();
+
+  /**
+   * Create a KernelFunction from an unboxed functor.
+   *
+   * Example:
+   *
+   * > class MyFunctor final : public c10::OperatorKernel {
+   * >   public:
+   * >     Tensor operator()(Tensor a, Tensor b) {...}
+   * > };
+   * > KernelFunction func = KernelFunction::makeFromUnboxedFunctor(std::make_unique<MyFunctor>());
+   */
+  template<bool AllowLegacyTypes = false, class KernelFunctor>
+  static KernelFunction makeFromUnboxedFunctor(std::unique_ptr<OperatorKernel> kernelFunctor);
+
+  /**
+   * Create a KernelFunction from a boxed functor.
+   *
+   * Example:
+   *
+   * > class MyFunctor final : public c10::OperatorKernel {
+   * >   public:
+   * >     void operator()(const OperatorHandle&, DispatchKeySet, Stack*) {...}
+   * > };
+   * > KernelFunction func = KernelFunction::makeFromBoxedFunctor(std::make_unique<MyFunctor>());
+   */
+  template<class KernelFunctor>
+  static KernelFunction makeFromBoxedFunctor(std::unique_ptr<KernelFunctor> kernelFunctor);
+
+  /**
+   * Create a KernelFunction from an unboxed function.
+   * This is usually better than KernelFunction::makeFromUnboxedRuntimeFunction
+   * because knowing the function pointer as a template argument (i.e. at
+   * compile time) allows the compiler to inline the function into its
+   * unboxing wrapper and yields better performance when calling the function.
+   *
+   * Example:
+   *
+   * > Tensor unboxed_func(Tensor a, Tensor b) {...}
+   * > KernelFunction func = KernelFunction::makeFromUnboxedFunction(TORCH_FN(unboxed_func));
+   */
+  template<class FuncPtr, bool AllowLegacyTypes = false>
+  static KernelFunction makeFromUnboxedFunction(FuncPtr);
+
+  /**
+   * Create a KernelFunction from an unboxed function.
+   * KernelFunction::makeFromUnboxedFunction is usually a better choice than
+   * this if you know the function pointer at compile time, see doc comment
+   * there for an explanation.
+   *
+   * Example:
+   *
+   * > Tensor unboxed_func(Tensor a, Tensor b) {...}
+   * > KernelFunction func = KernelFunction::makeFromUnboxedRuntimeFunction(&unboxed_func);
+   */
+  template<bool AllowLegacyTypes = false, class FuncType>
+  static KernelFunction makeFromUnboxedRuntimeFunction(FuncType* func);
+
+  static KernelFunction makeFallthrough();
+  static KernelFunction makeAmbiguousAutogradOther();
+  static KernelFunction makeNamedNotSupported();
+
+  /**
+   * Create a KernelFunction from an unboxed lambda.
+   *
+   * Example:
+   *
+   * > KernelFunction func = KernelFunction::makeFromUnboxedLambda(
+   * >      [] (Tensor a, bool b) -> Tensor {...});
+   */
+  template<bool AllowLegacyTypes = false, class Lambda>
+  static std::enable_if_t<guts::is_stateless_lambda<std::decay_t<Lambda>>::value, KernelFunction> makeFromUnboxedLambda(Lambda&& lambda);
+  template<bool AllowLegacyTypes = false, class Lambda>
+  static std::enable_if_t<!guts::is_stateless_lambda<std::decay_t<Lambda>>::value, KernelFunction> makeFromUnboxedLambda(Lambda&& lambda);
+
+  std::string dumpState() const;
+  // For testing internal invariants only
+  bool _equalsBoxedAndUnboxed(const KernelFunction&) const;
+
+private:
+
+  explicit KernelFunction(
+      std::unique_ptr<OperatorKernel> functor,
+      InternalBoxedKernelFunction* boxed_kernel_func,
+      void* unboxed_kernel_func,
+      void* sym_unboxed_kernel_func);
+  explicit KernelFunction(
+      BoxedKernel boxed_fn,
+      void* unboxed_kernel_func,
+      void* sym_unboxed_kernel_func);
+
+  BoxedKernel boxed_kernel_func_;
+  void* unboxed_kernel_func_;
+  void* sym_unboxed_kernel_func_;
+};
+
+}
+
+#include <ATen/core/boxing/KernelFunction_impl.h>
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/boxing/KernelFunction_impl.h b/MLPY/Lib/site-packages/torch/include/ATen/core/boxing/KernelFunction_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe5b8d39f72054dbe034b3dc35e247c66eefb2cd
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/boxing/KernelFunction_impl.h
@@ -0,0 +1,229 @@
+#include <ATen/core/boxing/impl/boxing.h>
+#include <ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h>
+#include <ATen/core/boxing/impl/WrapFunctionIntoFunctor.h>
+#include <ATen/core/boxing/impl/WrapFunctionIntoRuntimeFunctor.h>
+
+#include <c10/util/C++17.h>
+#include <type_traits>
+
+namespace c10 {
+
+inline KernelFunction::KernelFunction()
+    : boxed_kernel_func_()
+    , unboxed_kernel_func_(nullptr)
+    , sym_unboxed_kernel_func_(nullptr)
+{}
+
+inline KernelFunction::KernelFunction(std::unique_ptr<OperatorKernel> functor, InternalBoxedKernelFunction* boxed_kernel_func, void* unboxed_kernel_func, void* sym_unboxed_kernel_func = nullptr)
+  : boxed_kernel_func_(std::move(functor), boxed_kernel_func)
+  , unboxed_kernel_func_(unboxed_kernel_func)
+  , sym_unboxed_kernel_func_(sym_unboxed_kernel_func)
+{}
+
+inline KernelFunction::KernelFunction(BoxedKernel boxed_fn, void* unboxed_kernel_func, void* sym_unboxed_kernel_func = nullptr)
+  : boxed_kernel_func_(std::move(boxed_fn))
+  , unboxed_kernel_func_(unboxed_kernel_func)
+  , sym_unboxed_kernel_func_(sym_unboxed_kernel_func)
+{}
+
+inline bool KernelFunction::isValidUnboxed() const {
+  return unboxed_kernel_func_ != nullptr;
+}
+
+inline bool KernelFunction::isValidSymUnboxed() const {
+  return sym_unboxed_kernel_func_ != nullptr;
+}
+
+inline bool KernelFunction::isValid() const {
+  return boxed_kernel_func_.isValid();
+}
+
+inline bool KernelFunction::isFallthrough() const {
+  return boxed_kernel_func_.isFallthrough();
+}
+
+inline void KernelFunction::callBoxed(const OperatorHandle& opHandle, DispatchKeySet dispatchKeySet, Stack* stack) const {
+  boxed_kernel_func_.callBoxed(opHandle, dispatchKeySet, stack);
+}
+
+template<class Return, class... Args>
+inline Return callUnboxedKernelFunction(void* unboxed_kernel_func, OperatorKernel* functor, DispatchKeySet dispatchKeySet, Args&&... args) {
+    using ActualSignature = Return (OperatorKernel*, DispatchKeySet, Args...);
+    ActualSignature* func = reinterpret_cast<ActualSignature*>(unboxed_kernel_func);
+    return (*func)(functor, dispatchKeySet, std::forward<Args>(args)...);
+}
+
+// This template requires you to explicitly specify the argument you want to
+// forward; it doesn't work if you try to deduce it
+// NB: keep this in sync with cloneWithRealTypes in function_schema.cpp
+
+template <typename T>
+inline typename remove_symint<T>::type unpackSymInt(T x) { return x; }
+
+template <>
+inline typename remove_symint<c10::SymInt>::type unpackSymInt(c10::SymInt x) {
+  return x.guard_int(__FILE__, __LINE__);
+}
+
+template <>
+inline typename remove_symint<c10::SymIntArrayRef>::type unpackSymInt(c10::SymIntArrayRef x) {
+  return C10_AS_INTARRAYREF_SLOW(x);
+}
+
+template <>
+inline typename remove_symint<c10::optional<c10::SymInt>>::type unpackSymInt(c10::optional<c10::SymInt> x) {
+  return x.has_value() ? c10::make_optional(x->guard_int(__FILE__, __LINE__)) : c10::nullopt;
+}
+
+template <>
+inline typename remove_symint<at::OptionalSymIntArrayRef>::type unpackSymInt(at::OptionalSymIntArrayRef x) {
+  return x.has_value() ? c10::make_optional(C10_AS_INTARRAYREF_SLOW(*x)) : c10::nullopt;
+}
+
+template<class Return, class... Args>
+C10_ALWAYS_INLINE Return KernelFunction::call(const OperatorHandle& opHandle, DispatchKeySet dispatchKeySet, Args... args) const {
+    // note: Args above is intentionally not Args&&. We don't want perfect
+    // forwarding, which would require Args to be deduced, but instead we
+    // want callers to explicitly specify the Args.
+
+    if constexpr (std::disjunction_v<has_symint<std::decay_t<Args>>...>) {
+      if (sym_unboxed_kernel_func_ != nullptr) {
+        auto *functor = boxed_kernel_func_.getFunctor();
+        return callUnboxedKernelFunction<Return, Args...>(
+            sym_unboxed_kernel_func_, functor, dispatchKeySet, std::forward<Args>(args)...);
+      }
+
+      if (unboxed_kernel_func_ != nullptr) {
+        auto *functor = boxed_kernel_func_.getFunctor();
+        return callUnboxedKernelFunction<Return, typename remove_symint<Args>::type...>(
+            unboxed_kernel_func_, functor, dispatchKeySet, unpackSymInt<Args>(args)...);
+      }
+    } else {
+      if (C10_LIKELY(unboxed_kernel_func_ != nullptr)) {
+        auto *functor = boxed_kernel_func_.getFunctor();
+        return callUnboxedKernelFunction<Return, Args...>(
+            unboxed_kernel_func_, functor, dispatchKeySet, std::forward<Args>(args)...);
+      }
+    }
+
+    return impl::BoxedKernelWrapper<Return(Args...)>::call(
+        boxed_kernel_func_,
+        opHandle,
+        dispatchKeySet,
+        std::forward<Args>(args)...
+    );
+}
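+
+// Illustrative sketch (not part of the upstream header): because Args is not
+// deduced, callers spell out Return and Args so they match the registered
+// kernel's signature exactly. Operator and handle names are placeholders.
+//
+// > at::Tensor out = kernel.call<at::Tensor, const at::Tensor&, const at::Tensor&>(
+// >     opHandle, dispatchKeySet, self, other);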
+
+inline KernelFunction KernelFunction::makeFromBoxedKernel(BoxedKernel boxed_fn) {
+  return KernelFunction(std::move(boxed_fn), nullptr);  // no unboxed function pointer
+}
+
+template<BoxedKernelFunction* func>
+inline KernelFunction KernelFunction::makeFromBoxedFunction() {
+  return KernelFunction::makeFromBoxedKernel(
+      BoxedKernel::makeFromFunction<func>());
+}
+
+template<BoxedKernelFunction_withDispatchKeys* func>
+inline KernelFunction KernelFunction::makeFromBoxedFunction() {
+  return KernelFunction::makeFromBoxedKernel(
+      BoxedKernel::makeFromFunction<func>());
+}
+
+inline KernelFunction KernelFunction::makeFallthrough() {
+  return KernelFunction::makeFromBoxedKernel(
+      BoxedKernel::makeFallthrough());
+}
+
+inline KernelFunction KernelFunction::makeAmbiguousAutogradOther() {
+  return KernelFunction::makeFromBoxedKernel(
+      BoxedKernel::makeAmbiguousAutogradOther());
+}
+
+inline KernelFunction KernelFunction::makeNamedNotSupported() {
+  return KernelFunction::makeFromBoxedKernel(
+      BoxedKernel::makeNamedNotSupported());
+}
+
+template<bool AllowLegacyTypes, class KernelFunctor>
+inline KernelFunction KernelFunction::makeFromUnboxedFunctor(std::unique_ptr<OperatorKernel> kernelFunctor) {
+#ifndef NDEBUG
+  // This assertion is costly for build time so it's debug-gated.
+    static_assert(guts::is_functor<KernelFunctor>::value, "Tried to call KernelFunction::makeFromUnboxedFunctor<KernelFunctor> but the argument is not a functor.");
+#endif
+    static_assert(std::is_base_of<OperatorKernel, KernelFunctor>::value, "Tried to call KernelFunction::makeFromUnboxedFunctor<KernelFunctor>, but the functor doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it.");
+
+    auto* unboxed_fn = &impl::wrap_kernel_functor_unboxed<KernelFunctor>::call;
+    void* void_unboxed_fn = reinterpret_cast<void*>(unboxed_fn);
+    bool is_symint = fn_has_symint<decltype(unboxed_fn)>::value;
+    return KernelFunction(
+        std::move(kernelFunctor),
+        &impl::make_boxed_from_unboxed_functor<KernelFunctor, AllowLegacyTypes>::call,
+        is_symint ? nullptr : void_unboxed_fn,
+        is_symint ? void_unboxed_fn : nullptr
+    );
+}
+
+template<class KernelFunctor>
+inline KernelFunction KernelFunction::makeFromBoxedFunctor(std::unique_ptr<KernelFunctor> kernelFunctor) {
+  return KernelFunction::makeFromBoxedKernel(
+      BoxedKernel::makeFromFunctor(std::move(kernelFunctor)));
+}
+
+template<class FuncPtr, bool AllowLegacyTypes>
+inline KernelFunction KernelFunction::makeFromUnboxedFunction(FuncPtr func_ptr) {
+    static_assert(is_compile_time_function_pointer<FuncPtr>::value, "Tried to call KernelFunction::makeFromUnboxedFunction with an invalid parameter. It must be a function pointer created with TORCH_FN.");
+    static_assert(!std::is_same<typename FuncPtr::FuncType, BoxedKernelFunction>::value, "Tried to call KernelFunction::makeFromUnboxedFunction with a boxed function pointer. Please use KernelFunction::makeFromBoxedFunction instead.");
+    static_assert(FuncPtr::func_ptr() != nullptr, "Kernel function cannot be nullptr");
+
+#if !defined(C10_MOBILE)
+    (void)func_ptr; // Suppress unused variable warning
+    return makeFromUnboxedFunctor<AllowLegacyTypes, typename impl::WrapFunctionIntoFunctor<FuncPtr>::type>(
+        guts::make_unique_base<OperatorKernel, typename impl::WrapFunctionIntoFunctor<FuncPtr>::type>()
+    );
+#else
+    // On mobile, we rather want to optimize for binary size than for performance,
+    // so let's not inline the kernel into the wrapper but use makeFromUnboxedRuntimeFunction
+    // instead.
+    return makeFromUnboxedRuntimeFunction(func_ptr.func_ptr());
+#endif
+}
+
+template<bool AllowLegacyTypes, class FuncType>
+inline KernelFunction KernelFunction::makeFromUnboxedRuntimeFunction(FuncType* func) {
+    static_assert(guts::is_function_type<FuncType>::value, "Tried to call KernelFunction::makeFromUnboxedRuntimeFunction with a non-function type.");
+    static_assert(!std::is_same<FuncType, BoxedKernelFunction>::value, "Tried to call KernelFunction::makeFromUnboxedRuntimeFunction with a boxed function pointer. Please use KernelFunction::makeFromBoxedFunction instead.");
+    TORCH_INTERNAL_ASSERT(func != nullptr, "Kernel function cannot be nullptr");
+
+    return makeFromUnboxedFunctor<AllowLegacyTypes, impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<FuncType>>>(
+        guts::make_unique_base<OperatorKernel, impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<FuncType>>>(func)
+    );
+}
+
+template<bool AllowLegacyTypes, class Lambda>
+inline std::enable_if_t<guts::is_stateless_lambda<std::decay_t<Lambda>>::value, KernelFunction> KernelFunction::makeFromUnboxedLambda(Lambda&& lambda) {
+    static_assert(guts::is_functor<std::decay_t<Lambda>>::value, "Tried to call KernelFunction::makeFromUnboxedLambda with a non-lambda type.");
+
+#if !defined(C10_MOBILE)
+    return makeFromUnboxedFunctor<AllowLegacyTypes, impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<Lambda>>>(
+        guts::make_unique_base<OperatorKernel, impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<Lambda>>>(std::forward<Lambda>(lambda))
+    );
+#else
+    // On mobile, we rather want to optimize for binary size than for performance,
+    // so let's not inline the kernel into the wrapper but use makeFromUnboxedRuntimeFunction
+    // instead.
+    using FuncType = typename guts::infer_function_traits_t<std::decay_t<Lambda>>::func_type;
+    return makeFromUnboxedRuntimeFunction<AllowLegacyTypes, FuncType>(lambda);
+#endif
+}
+
+template<bool AllowLegacyTypes, class Lambda>
+inline std::enable_if_t<!guts::is_stateless_lambda<std::decay_t<Lambda>>::value, KernelFunction> KernelFunction::makeFromUnboxedLambda(Lambda&& lambda) {
+    static_assert(guts::is_functor<std::decay_t<Lambda>>::value, "Tried to call KernelFunction::makeFromUnboxedLambda with a non-lambda type.");
+
+    return makeFromUnboxedFunctor<AllowLegacyTypes, impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<Lambda>>>(
+        guts::make_unique_base<OperatorKernel, impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<Lambda>>>(std::forward<Lambda>(lambda))
+    );
+}
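+
+// Illustrative sketch (not part of the upstream header) of how these factory
+// functions are typically used; `my_add` is a placeholder free function and the
+// default template arguments are assumed:
+//
+// > at::Tensor my_add(const at::Tensor& a, const at::Tensor& b) { return a + b; }
+// > auto k1 = KernelFunction::makeFromUnboxedFunction(TORCH_FN(my_add));
+// > auto k2 = KernelFunction::makeFromUnboxedLambda(
+// >     [](const at::Tensor& a, const at::Tensor& b) { return a + b; });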
+
+}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/boxing/OperatorKernel.h b/MLPY/Lib/site-packages/torch/include/ATen/core/boxing/OperatorKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..7bda1820b4ffdf210f59df8443a9f725564a6104
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/boxing/OperatorKernel.h
@@ -0,0 +1,27 @@
+#pragma once
+#include <c10/util/intrusive_ptr.h>
+
+namespace c10 {
+
+/**
+ * Inherit from OperatorKernel to implement a c10 kernel.
+ *
+ * Example:
+ * > namespace {
+ * >   class my_kernel_cpu final : public c10::OperatorKernel {
+ * >   public:
+ * >     Tensor operator()(Tensor a, Tensor b) {...}
+ * >   };
+ * > }
+ *
+ * The kernel class is allowed to have members but these are equivalent
+ * to global variables. The kernel implementation is responsible for
+ * preventing race conditions on them.
+ *
+ * See below for how to register this kernel with PyTorch.
+ */
+struct TORCH_API OperatorKernel : public c10::intrusive_ptr_target {
+  ~OperatorKernel() override = default;
+};
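+
+// Illustrative sketch (not part of the upstream header): one common way such a
+// functor has been registered, assuming the torch::RegisterOperators API is
+// available; the schema string and namespace are placeholders.
+//
+// > static auto registry = torch::RegisterOperators().op(
+// >     "my_namespace::my_kernel(Tensor a, Tensor b) -> Tensor",
+// >     torch::RegisterOperators::options().kernel<my_kernel_cpu>(c10::DispatchKey::CPU));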
+
+}  // namespace c10
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/boxing/impl/WrapFunctionIntoFunctor.h b/MLPY/Lib/site-packages/torch/include/ATen/core/boxing/impl/WrapFunctionIntoFunctor.h
new file mode 100644
index 0000000000000000000000000000000000000000..fa4811722a47e932eafebdcceac4e844c43fc2e2
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/boxing/impl/WrapFunctionIntoFunctor.h
@@ -0,0 +1,32 @@
+#pragma once
+
+#include 
+
+namespace c10 {
+namespace impl {
+  namespace detail {
+    template<class FuncPtr, class ReturnType, class ParameterList> class WrapFunctionIntoFunctor_ {};
+    template<class FuncPtr, class ReturnType, class... Parameters>
+    class WrapFunctionIntoFunctor_<FuncPtr, ReturnType, guts::typelist::typelist<Parameters...>> final : public c10::OperatorKernel {
+    public:
+      C10_ALWAYS_INLINE decltype(auto) operator()(Parameters... args) {
+        return (*FuncPtr::func_ptr())(std::forward<Parameters>(args)...);
+      }
+    };
+  }
+
+  // WrapFunctionIntoFunctor: Wraps a compile time function pointer into a kernel functor.
+  // Since it is a compile time function pointer, many compilers can inline it
+  // into the wrapper and you don't get any performance overhead for wrapping.
+  template<class FuncPtr>
+  struct WrapFunctionIntoFunctor final {
+    static_assert(c10::is_compile_time_function_pointer<FuncPtr>::value, "WrapFunctionIntoFunctor can only wrap functions created with TORCH_FN.");
+    using type = detail::WrapFunctionIntoFunctor_<
+        FuncPtr,
+        typename guts::function_traits<typename FuncPtr::FuncType>::return_type,
+        typename guts::function_traits<typename FuncPtr::FuncType>::parameter_types
+    >;
+  };
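+
+  // Illustrative sketch (not part of the upstream header); `my_add` is a
+  // placeholder free function:
+  //
+  // > at::Tensor my_add(const at::Tensor& a, const at::Tensor& b);
+  // > using MyAddFunctor = WrapFunctionIntoFunctor<TORCH_FN(my_add)>::type;
+  // > // MyAddFunctor is an OperatorKernel whose operator() forwards to my_add.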
+}
+
+}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/boxing/impl/WrapFunctionIntoRuntimeFunctor.h b/MLPY/Lib/site-packages/torch/include/ATen/core/boxing/impl/WrapFunctionIntoRuntimeFunctor.h
new file mode 100644
index 0000000000000000000000000000000000000000..a12160b47f494b3deb455205956e7d271bece967
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/boxing/impl/WrapFunctionIntoRuntimeFunctor.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include 
+
+namespace c10 {
+
+namespace impl {
+  namespace detail {
+    template<class FuncType, class ReturnType, class ParameterList> class WrapFunctionIntoRuntimeFunctor_ {};
+    template<class FuncType, class ReturnType, class... Parameters>
+    class WrapFunctionIntoRuntimeFunctor_<FuncType, ReturnType, guts::typelist::typelist<Parameters...>> final : public c10::OperatorKernel {
+    public:
+      template<class FuncType_>
+      explicit WrapFunctionIntoRuntimeFunctor_(FuncType_&& kernel_func)
+      : kernel_func_(std::forward<FuncType_>(kernel_func)) {}
+
+      decltype(auto) operator()(Parameters... args) {
+        return kernel_func_(std::forward<Parameters>(args)...);
+      }
+
+    private:
+      FuncType kernel_func_;
+    };
+  }
+
+  // WrapFunctionIntoRuntimeFunctor: Wraps any runtime functor into a functor that
+  // inherits from c10::OperatorKernel, so it can be used as a c10 kernel.
+  // This can, for example, be used for lambdas, functors or even function pointers.
+  // In the case of function pointers, since it is a runtime function pointer,
+  // there is an overhead for calling it whenever the kernel is invoked.
+  template<class FuncType>
+  using WrapFunctionIntoRuntimeFunctor = detail::WrapFunctionIntoRuntimeFunctor_<
+      FuncType,
+      typename guts::infer_function_traits_t<FuncType>::return_type,
+      typename guts::infer_function_traits_t<FuncType>::parameter_types
+  >;
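+
+  // Illustrative sketch (not part of the upstream header): wrapping a lambda,
+  // which pays one indirect call each time the kernel is invoked:
+  //
+  // > auto lambda = [](const at::Tensor& a) { return a * 2; };
+  // > WrapFunctionIntoRuntimeFunctor<decltype(lambda)> functor(std::move(lambda));
+  // > at::Tensor out = functor(input);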
+}
+
+}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/boxing/impl/boxing.h b/MLPY/Lib/site-packages/torch/include/ATen/core/boxing/impl/boxing.h
new file mode 100644
index 0000000000000000000000000000000000000000..041b261031c4496105eb2b58dd54d7e8b570165b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/boxing/impl/boxing.h
@@ -0,0 +1,387 @@
+#pragma once
+
+// This file contains boxing (not unboxing) logic,
+// i.e. how to make a vector<IValue> from a set of concrete arguments.
+
+#include 
+#include 
+#include 
+
+#include 
+
+#include 
+#include 
+
+namespace c10 {
+namespace impl {
+
+//
+// utils
+//
+
+// is_mutable_tensor_ref
+template <class T> struct is_mutable_tensor_ref : std::false_type {};
+template <> struct is_mutable_tensor_ref<at::Tensor&> : std::true_type {};
+
+// is_tuple_of_mutable_tensor_refs
+//
+template <class T, class Enable = void>
+struct is_tuple_of_mutable_tensor_refs : std::false_type {};
+
+template <class T>
+struct is_tuple_of_mutable_tensor_refs<T, std::enable_if_t<guts::is_instantiation_of<std::tuple, T>::value, void>>
+: guts::typelist::all<is_mutable_tensor_ref, guts::typelist::from_tuple_t<T>>
+{};
+
+// has_ivalue_to<T> tests the presence/absence of instance method IValue::to<T>()
+//
+template <class T, class Enable = void>
+struct has_ivalue_to : std::false_type {};
+
+template <class T>
+struct has_ivalue_to<T, std::void_t<decltype(std::declval<IValue>().to<T>())>>
+: std::true_type
+{};
+
+//
+// boxing predicates
+//
+
+// A boxable arg type is one that IValue has a constructor for.
+template <typename T>
+using can_box =
+  std::disjunction<
+    std::is_constructible<IValue, std::decay_t<T>>,
+    // TensorOptions are not directly constructible into IValue,
+    // but torch::jit::push knows how to handle them
+    std::is_same<TensorOptions, std::decay_t<T>>
+  >;
+
+template <typename... Ts>
+using can_box_all = std::conjunction<can_box<Ts>...>;
+
+// an unboxable result is one that can be extracted from an IValue
+template <typename T>
+using can_unbox =
+   std::conjunction<
+    std::disjunction<
+      has_ivalue_to<T>,
+      // void returns are ok
+      std::is_same<void, T>
+    >,
+    std::negation<std::is_lvalue_reference<T>>
+  >;
+
+//
+// boxArgs - utility for pushing unboxed args onto IValue stack
+//
+template <class... Args>
+torch::jit::Stack boxArgs(Args... args) {
+  // TODO Reuse stack vector instead of allocating?
+  torch::jit::Stack stack;
+  stack.reserve(sizeof...(Args));
+  torch::jit::push(stack, std::forward<Args>(args)...);
+  return stack;
+}
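+
+// Illustrative sketch (not part of the upstream header): boxing two tensors and
+// an integer into an IValue stack (argument names are placeholders):
+//
+// > torch::jit::Stack stack = boxArgs<const at::Tensor&, const at::Tensor&, int64_t>(a, b, 2);
+// > // stack now holds three IValues, in argument order.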
+
+template <typename T>
+static inline constexpr size_t boxed_size_one() {
+  static_assert(!std::is_same<std::decay_t<T>, c10::TensorOptions>::value, "need to patch this path to support TensorOptions passed by reference");
+  return 1;
+}
+
+// torch::jit::push pushes 4 values for a TensorOptions; this needs to
+// be kept in sync.
+template <>
+inline constexpr size_t boxed_size_one<c10::TensorOptions>() {
+  return 4;
+}
+
+// NOTE: this could probably be simplified with C++17 fold expressions.
+template <typename... Types>
+struct BoxedSize : std::integral_constant<size_t, 0> {};
+template <typename T, typename... Types>
+struct BoxedSize<T, Types...> : std::integral_constant<size_t, boxed_size_one<T>() + BoxedSize<Types...>::value> {};
+
+template <typename... Types>
+static inline constexpr size_t boxed_size() {
+  return BoxedSize<Types...>::value;
+}
+
+using IValueAlignedStorage = std::aligned_storage_t<sizeof(IValue), alignof(IValue)>;
+
+template <typename T>
+C10_ALWAYS_INLINE_UNLESS_MOBILE void boxToStack(IValueAlignedStorage* dest, T& arg, int& lastIdx) {
+  new (&dest[lastIdx]) IValue(arg);
+  lastIdx++;
+}
+
+C10_ALWAYS_INLINE_UNLESS_MOBILE void boxToStack(IValueAlignedStorage* dest, c10::TensorOptions options, int& lastIdx) {
+  new (&dest[lastIdx++]) IValue(c10::typeMetaToScalarType(options.dtype()));
+  new (&dest[lastIdx++]) IValue(options.layout());
+  new (&dest[lastIdx++]) IValue(options.device());
+  new (&dest[lastIdx++]) IValue(options.pinned_memory());
+}
+
+inline void boxArgsToStack(IValueAlignedStorage*, int&) {}
+
+template<typename T, typename... Args>
+C10_ALWAYS_INLINE_UNLESS_MOBILE void boxArgsToStack(IValueAlignedStorage* dest, int& lastIdx, T& arg, Args &... args) {
+  boxToStack(dest, arg, lastIdx);
+  boxArgsToStack(dest, lastIdx, args...);
+}
+
+//
+// PopResult is a helper class whose specializations handle popping single and
+// multiple return values, respectively.
+//
+template 
+struct PopResult final {
+  static Result call(Stack& stack) {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      stack.size() == 1,
+      "Boxed kernel was expected to return one value on the stack, ",
+      "but instead pushed ", stack.size(), " values."
+    );
+    return std::move(stack[0]).to();
+  }
+};
+
+template 
+struct PopResult> final {
+  using Result = std::tuple;
+
+  static Result call(Stack& stack) {
+    // for tuple return types, boxed kernel has pushed multiple values onto the stack
+    constexpr int RetCount = sizeof...(Types);
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      stack.size() == RetCount,
+      "Boxed kernel was expected to return ", RetCount, " values on the stack, ",
+      "but instead pushed ", stack.size(), " values."
+    );
+    return pop_to_tuple_impl(stack, std::make_index_sequence());
+  }
+private:
+  // note: this has been moved into its own helper only to avoid a parse error on `indices` otherwise.
+  // I'm sure there's an incantation that slips it past the parser but eh
+  template 
+  static Result pop_to_tuple_impl(Stack& stack, std::index_sequence) {
+    return std::make_tuple((std::move(stack[indices]).to())...);
+  }
+};
+
+//
+// BoxedKernelWrapper
+//
+// For a given function type FT, BoxedKernelWrapper implements
+// a `call` method that
+// - takes a boxed kernel and unboxed arguments as specified by FT,
+// - calls `boxArgs` to box the arguments
+// - calls the boxed kernel
+// - unboxes and returns the result
+//
+// The partial specializations below handle various cases: in
+// particular, not all types appearing in op signatures are supported,
+// and ops returning references have nonstandard wrapper implementations.
+//
+
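+// Illustrative sketch (not part of the upstream header) of the flow the
+// specializations below implement, for a kernel with signature
+// at::Tensor(const at::Tensor&, const at::Scalar&):
+//
+// >   1. boxArgs(self, alpha)                -> Stack with 2 IValues
+// >   2. boxed_kernel_func.callBoxed(...)    -> kernel pops inputs, pushes 1 result
+// >   3. PopResult<at::Tensor>::call(stack)  -> at::Tensor returned to the caller
+//
+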
+// 1. The base specialization of BoxedKernelWrapper should never be instantiated.
+// A "no call method defined on BoxedKernelWrapper" compile error means that
+// an op signature has failed to trigger any of the partial specializations
+// that follow this one.
+//
+template 
+struct BoxedKernelWrapper {
+  // The reason we're not just doing straight up static_assert(false, ...) here:
+  // Basically, the way to make sure a static_assert only fires if a template
+  // is actually instantiated (rather than every time the file is parsed) is to use
+  // template parameters in the expression, e.g. FuncType here. However, since
+  // `sizeof(FuncType) != sizeof(FuncType)` is always false, this has the same
+  // effect.
+  static_assert(sizeof(FuncType) != sizeof(FuncType),
+     "Function signature contains one or more unsupported parameter and/or return types. "
+     "Look for a nearby error like "
+     "\"'call' is not a member of 'c10::impl::BoxedKernelWrapper<(your function type), void>'\" "
+     "- (your function type) is the unsupported signature.");
+};
+
+//
+// 2. Supported signatures, other than those involving non-const Tensor refs -
+// i.e., "functional" ops.
+//
+
+template 
+struct BoxedKernelWrapper<
+  Result(Args...),
+  std::enable_if_t<
+    can_box_all::value && can_unbox::value && !is_tuple_of_mutable_tensor_refs::value,
+    void
+  >
+> {
+  static Result call(
+    const BoxedKernel& boxed_kernel_func,
+    const OperatorHandle& opHandle,
+    DispatchKeySet dispatchKeySet,
+    Args... args
+  ) {
+    torch::jit::Stack stack = boxArgs(std::forward(args)...);
+    boxed_kernel_func.callBoxed(opHandle, dispatchKeySet, &stack);
+
+    if constexpr (!std::is_same_v) {
+        // op has pushed one or more values onto the stack.
+        return PopResult::call(stack);
+    } else {
+      // op returns void, boxed kernel has pushed nothing onto stack.
+      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+          stack.empty(),
+          "Boxed kernel was expected to return no values on the stack, ",
+          "but instead returned ", stack.size(), " values."
+      );
+    }
+  }
+};
+
+//
+// 3. in-place ops take a single non-const Tensor reference
+// as their first argument, and return it.
+//
+// Note: all signatures matching this pattern are assumed to be for such ops.
+// Because of this, the generated BoxedKernelWrapper specializations simply
+// return the in-place argument.
+//
+
+template 
+struct BoxedKernelWrapper<
+  at::Tensor&(at::Tensor&, OtherArgs...),
+  std::enable_if_t::value, void>
+> {
+  static at::Tensor& call(
+    const BoxedKernel& boxed_kernel_func,
+    const OperatorHandle& opHandle,
+    DispatchKeySet dispatchKeySet,
+    at::Tensor& outArg, OtherArgs... otherArgs
+  ) {
+    torch::jit::Stack stack = boxArgs(outArg, std::forward(otherArgs)...);
+    boxed_kernel_func.callBoxed(opHandle, dispatchKeySet, &stack);
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      stack.size() == 1,
+      "Boxed kernel was expected to return a single value on the stack, ",
+      "but instead returned ", stack.size(), " values."
+    );
+
+    return outArg;
+  }
+};
+
+//
+// 3.5. In-process migration to make in-place ops take and return
+// const references instead.
+template 
+struct BoxedKernelWrapper<
+  const at::Tensor&(const at::Tensor&, OtherArgs...),
+  std::enable_if_t::value, void>
+> {
+  static const at::Tensor& call(
+    const BoxedKernel& boxed_kernel_func,
+    const OperatorHandle& opHandle,
+    DispatchKeySet dispatchKeySet,
+    const at::Tensor& outArg, OtherArgs... otherArgs
+  ) {
+    torch::jit::Stack stack = boxArgs(outArg, otherArgs...);
+    boxed_kernel_func.callBoxed(opHandle, dispatchKeySet, &stack);
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      stack.size() == 1,
+      "Boxed kernel was expected to return a single value on the stack, ",
+      "but instead returned ", stack.size(), " values."
+    );
+
+    return outArg;
+  }
+};
+
+//
+// 4. out of place ops that take a single non-const Tensor reference as their
+// final argument, and also return it.
+//
+// Note: all signatures matching this pattern are assumed to be for such ops.
+// This assumption permits the generated BoxedKernelWrapper specializations to simply
+// return out arguments.
+//
+template 
+struct BoxedKernelWrapper<
+  at::Tensor&(FirstArg, RestArgs...),
+  std::enable_if_t<
+    can_box_all::value
+    // this skips over in-place kernels with a non-const Tensor
+    // arg at the front, so those can unambiguously trigger the preceding specialization.
+    && !is_mutable_tensor_ref::value,
+    void
+  >
+> {
+  static at::Tensor& call(
+    const BoxedKernel& boxed_kernel_func,
+    const OperatorHandle& opHandle,
+    DispatchKeySet dispatchKeySet,
+    FirstArg firstArg, RestArgs... restArgs
+  ) {
+    torch::jit::Stack stack = boxArgs(std::forward(firstArg), std::forward(restArgs)...);
+    boxed_kernel_func.callBoxed(opHandle, dispatchKeySet, &stack);
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      stack.size() == 1,
+      "Boxed kernel was expected to return a single value on the stack, ",
+      "but instead returned ", stack.size(), " values."
+    );
+
+    // reusing restArgs after it has been forwarded here is ok because we know
+    // that the last element is of type `Tensor&`.
+    return std::get(std::tuple{restArgs...});
+  }
+};
+
+//
+// 5. out of place ops that take multiple non-const Tensor references as their
+// final arguments, and return them in a std::tuple.
+//
+// Note: all signatures matching this pattern are assumed to be for such ops.
+// This assumption permits the generated BoxedKernelWrapper specializations to simply
+// return the out arguments.
+//
+template 
+struct BoxedKernelWrapper<
+  Result(Args...),
+  std::enable_if_t<
+    can_box_all::value && is_tuple_of_mutable_tensor_refs::value,
+    void
+  >
+> {
+  static Result call(
+    const BoxedKernel& boxed_kernel_func,
+    const OperatorHandle& opHandle,
+    DispatchKeySet dispatchKeySet,
+    Args... args
+  ) {
+    using ArgTuple = std::tuple;
+    constexpr int RetCount = std::tuple_size();
+
+    torch::jit::Stack stack = boxArgs(std::forward(args)...);
+    boxed_kernel_func.callBoxed(opHandle, dispatchKeySet, &stack);
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      stack.size() == RetCount,
+      "Boxed kernel was expected to return ", RetCount, " values on the stack, ",
+      "but instead returned ", stack.size(), " values."
+    );
+
+    // reusing args after it has been forwarded here is ok because we know
+    // that the last RetCount elements are of type `Tensor&`.
+    auto result = guts::tuple_take(ArgTuple{std::forward(args)...});
+    static_assert(
+        std::is_same::value,
+        "The parameter list of an op returning a tuple of Tensor references "
+            "must end with an equal number of Tensor reference parameters."
+    );
+    return result;
+  }
+};
+
+} // impl
+} // c10
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h b/MLPY/Lib/site-packages/torch/include/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h
new file mode 100644
index 0000000000000000000000000000000000000000..91bf0bff104adfd08bb85d05f744e7886370148a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h
@@ -0,0 +1,600 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+namespace c10 {
+
+using Stack = torch::jit::Stack; // TODO Instead of this, move torch::jit::Stack to the c10 namespace.
+class OperatorHandle;
+
+/*
+ * [Note: Argument forwarding in the dispatcher]
+ *
+ * The dispatcher uses a somewhat unusual way to forward arguments through several layers of
+ * wrapper functions. This can be confusing because an experienced C++ programmer would look at this
+ * and think "oh this is supposed to be forwarding a universal reference but the && is missing. This is a bug.".
+ * It is not a bug. The common way in C++ to forward arguments is to use universal references:
+ *
+ * > template<class T> void func(T&& arg) { func2(std::forward<T>(arg)); }
+ *
+ * but that relies on inferring the correct reference type (i.e. value vs & vs &&) from the argument.
+ * In our case, we cannot rely on the argument as supplied by the caller, because that could infer a
+ * different reference type than was used in the kernel function. The correct reference type
+ * is dictated by the kernel signature and must be identical since we cast function pointers
+ * through void* pointers and mismatches would be UB. So we need a forwarding pattern that determines
+ * the reference type to use by looking at the explicitly supplied operator signature, not by looking at
+ * the argument we're calling it with.
+ *
+ * What does std::forward do, exactly?
+ * ------------------------------------
+ * std::forward<T>(t) is a way to cast t to the reference type supplied in T.
+ * Let's assume decay_t<T> == U and T is either U or some reference of U.
+ *  - std::forward<U&>(t) will return U&, no matter what kind of reference t is.
+ *  - std::forward<U&&>(t) will return U&&, no matter what kind of reference t is.
+ *  - std::forward<U>(t) will return U&& (not U!), no matter what kind of reference t is.
+ *
+ * For universal references, that means that in the following function
+ * > template<class T> void func(T&& arg) { func2(std::forward<T>(arg)); }
+ *
+ *  - when called with arg being an rvalue reference or non-reference value, T gets inferred to be
+ *    a non-reference U, and std::forward<T>(arg) will return U&&, correctly moving the argument.
+ *  - when called with arg being an lvalue reference, T gets inferred to be U& because that's the only
+ *    way to match the signature (in C++, a type that is (T&)&& will collapse to T&).
+ *    That means std::forward<T>(arg) will return U& and the value will not be moved but passed on as
+ *    an lvalue reference.
+ *
+ * How do we use that?
+ * ------------------------------------
+ * But std::forward can also be used outside of the common "universal forwarding" pattern to change
+ * reference types. So instead of following the common C++ pattern, we notice what
+ * std::forward<T>() actually does, and that is it takes a value and changes its reference to the
+ * type of reference passed in as T. If we don't infer T but explicitly specify it, we can use this
+ * to forward based on an explicitly specified reference type instead of the inferred argument type.
+ *
+ * This is why many of the dispatcher functions look like
+ * > template<class T> void func(T t) { func2(std::forward<T>(t)); }
+ * instead of the common
+ * > template<class T> void func(T&& t) { func2(std::forward<T>(t)); }
+ *
+ * and are expected to be called by explicitly specifying the template parameters in a way that matches
+ * the expected operator signature at each call site.
+ */
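+
+// Illustrative sketch (not part of the upstream header) of the pattern described
+// above; all names are placeholders:
+//
+// > template<class T> void inner(T&& t);
+// > template<class T> void outer(T t) {
+// >   // T is specified explicitly at the call site, so this casts `t` to the
+// >   // reference type dictated by the kernel signature, not by the caller:
+// >   inner(std::forward<T>(t));
+// > }
+// > // call site: outer<const at::Tensor&>(tensor);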
+
+namespace impl {
+  // supported_primitive_arg_types defines which primitive types we allow in
+  // kernel functions as arguments or returns.
+  // Additionally, we support lists, dicts and optionals containing these types.
+  using supported_primitive_arg_types = guts::typelist::typelist<
+    int64_t,
+    double,
+    bool,
+    c10::string_view,
+    at::Tensor,
+    at::Scalar,
+    c10::QScheme,
+    c10::ScalarType,
+    c10::Device,
+    c10::DeviceIndex,
+    c10::Layout,
+    c10::MemoryFormat,
+    at::Dimname
+  >;
+
+  // We have an unboxed functor in hand that takes C++ arguments, and
+  // we're building a boxed functor wrapper for it that takes IValues.
+  // So "outside" is boxed and "inside" is unboxed.
+  //
+  // So a valid input type is one that our boxed functor wrapper can
+  // unbox from an IValue into a C++ value.
+  //
+  // Whereas a valid output type is one that our wrapper can receive
+  // as a C++ value from the unboxed functor, and box into an IValue.
+
+  //
+  // assert_is_valid_input_type
+  // checks that T can be unboxed from an IValue into a C++ value.
+  //
+
+  template
+  struct assert_is_valid_input_type {
+    assert_is_valid_input_type() {
+      if constexpr (guts::typelist::contains::value) {
+        /* everything is ok, this is a primitive type */
+      } else {
+        /* otherwise this must be an instance of a valid custom class, since it can only
+           have been created via IValue(x), which ensures this. */
+      }
+    }
+  };
+
+  template
+  struct assert_is_valid_input_type, AllowDeprecatedTypes>
+  : assert_is_valid_input_type {};
+
+  template 
+  struct TypeCheckHelper;
+
+  template 
+  struct TypeCheckHelper {};
+
+  template 
+  struct TypeCheckHelper
+  : TypeCheckHelper {
+    assert_is_valid_input_type check;
+  };
+
+  template
+  struct assert_is_valid_input_type, AllowDeprecatedTypes>
+  : TypeCheckHelper {};
+
+  template
+  struct assert_is_valid_input_type, AllowDeprecatedTypes>
+  : assert_is_valid_input_type {
+    static_assert(guts::typelist::contains::value,
+      "You tried to register a kernel with an unsupported input type: Dict where Key is invalid. We only support int64_t, double, bool, and string.");
+  };
+
+  template
+  struct assert_is_valid_input_type, AllowDeprecatedTypes>
+  : assert_is_valid_input_type {
+    static_assert(AllowDeprecatedTypes,
+      "You tried to register a kernel with an unsupported input type: std::unordered_map. Please use Dict instead.");
+    static_assert(guts::typelist::contains::value,
+      "You tried to register a kernel with an unsupported input type: std::unordered_map where Key is invalid. We only support int64_t, double, bool, and string.");
+  };
+
+  template
+  struct assert_is_valid_input_type, AllowDeprecatedTypes>
+  : assert_is_valid_input_type {
+    static_assert(!std::is_same::value,
+      "You tried to register a kernel with an unsupported input type: List. Please use List, List or Tensor instead.");
+  };
+
+  template
+  struct assert_is_valid_input_type, AllowDeprecatedTypes>
+  : assert_is_valid_input_type {
+    static_assert(!std::is_same::value,
+      "You tried to register a kernel with an unsupported input type: ArrayRef. Please use List, List or Tensor instead.");
+  };
+
+  template
+  struct assert_is_valid_input_type, AllowDeprecatedTypes>
+  : assert_is_valid_input_type {
+    static_assert(!std::is_same::value,
+      "You tried to register a kernel with an unsupported input type: OptionalArrayRef. Please use List, List or Tensor instead.");
+  };
+
+  template
+  struct assert_is_valid_input_type, AllowDeprecatedTypes>
+  : assert_is_valid_input_type {
+    static_assert(!std::is_same::value,
+      "You tried to register a kernel with an unsupported input type: std::array. Please use std::array instead.");
+  };
+
+  template
+  struct assert_is_valid_input_type::value>> {
+    // There is no reason to support float when we have double. Keep the API lean.
+    static_assert(guts::false_t::value,
+      "You tried to register a kernel with an unsupported input type: float. Please use double instead.");
+  };
+  template
+  struct assert_is_valid_input_type::value>> {
+    static_assert(guts::false_t::value,
+      "You tried to register a kernel with an unsupported input type: const char*. Please use c10::string_view instead.");
+  };
+  template
+  struct assert_is_valid_input_type, T>::value>> {
+    static_assert(guts::false_t::value,
+      "You tried to register a kernel with an unsupported input type: vector. Please use List instead.");
+  };
+  template
+  struct assert_is_valid_input_type::value && !guts::typelist::contains::value>> {
+    static_assert(guts::false_t::value,
+      "You tried to register a kernel with an unsupported integral input type. Please use int64_t instead.");
+  };
+  template
+  struct assert_is_valid_input_type::value>> {
+    static_assert(guts::false_t::value,
+      "You tried to register a kernel taking c10::SymInt by reference. Please accept it by value instead.");
+  };
+
+  // TODO: it probably would be good to tighten this up quite a bit more with
+  // an explicit list for everything
+
+  //
+  // assert_is_valid_output_type
+  //
+
+  template
+  struct assert_is_valid_output_type {
+    assert_is_valid_output_type() {
+      if constexpr(guts::typelist::contains::value) {
+        /* everything is ok, this is a primitive type */
+      } else {
+        /* otherwise T is verified to be a registered custom class in the IValue
+          constructor, so no benefit in double-checking here */
+      }
+    }
+  };
+
+  template
+  struct assert_is_valid_output_type, AllowDeprecatedTypes>
+  : assert_is_valid_output_type {};
+
+  template
+  struct assert_is_valid_output_type, AllowDeprecatedTypes>
+  : assert_is_valid_output_type {};
+
+  template
+  struct assert_is_valid_output_type, AllowDeprecatedTypes>
+  : assert_is_valid_output_type {
+    static_assert(guts::typelist::contains::value,
+      "You tried to register a kernel with an unsupported output type: Dict where Key is invalid. We only support int64_t, double, bool, and string.");
+    static_assert(!std::is_same::value,
+      "You tried to register a kernel with an unsupported output type: Dict. Please use Dict or Dict.");
+  };
+
+  template
+  struct assert_is_valid_output_type, AllowDeprecatedTypes>
+  : assert_is_valid_output_type {
+    static_assert(AllowDeprecatedTypes,
+      "You tried to register a kernel with an unsupported output type: std::unordered_map. Please use Dict instead.");
+    static_assert(guts::typelist::contains::value,
+      "You tried to register a kernel with an unsupported output type: std::unordered_map where Key is invalid. We only support int64_t, double, bool, and string.");
+    static_assert(!std::is_same::value,
+      "You tried to register a kernel with an unsupported output type: std::unordered_map. Please use Dict or Dict.");
+  };
+
+  template
+  struct assert_is_valid_output_type, AllowDeprecatedTypes>
+  : assert_is_valid_output_type {
+    static_assert(!std::is_same::value,
+      "You tried to register a kernel with an unsupported output type: List. Please use List, List or Tensor instead.");
+  };
+
+  template
+  struct assert_is_valid_output_type, AllowDeprecatedTypes>
+  : assert_is_valid_output_type {
+    static_assert(!std::is_same::value,
+      "You tried to register a kernel with an unsupported output type: std::vector. Please use List, List or Tensor instead.");
+    // TODO static_assert(AllowDeprecatedTypes, "You tried to register a kernel with an unsupported output type: std::vector. Please use List instead.");
+  };
+
+  template
+  struct assert_is_valid_output_type, AllowDeprecatedTypes>
+  : assert_is_valid_output_type {
+    static_assert(!std::is_same::value,
+      "You tried to register a kernel with an unsupported output type: std::array. Please use std::array instead.");
+  };
+
+  // The following specialisations of assert_is_valid_output_type are technically not
+  // necessary since we would hit the base case and show an error message
+  // there if they didn't exist, but we can show a better error message
+  // in some common error scenarios.
+  template
+  struct assert_is_valid_output_type::value>> {
+    // There is no reason to support float when we have double. Keep the API lean.
+    static_assert(guts::false_t::value,
+      "You tried to register a kernel with an unsupported output type: float. Please use double instead.");
+  };
+  template
+  struct assert_is_valid_output_type::value>> {
+    static_assert(guts::false_t::value,
+      "You tried to register a kernel with an unsupported output type: const char*. Please use c10::string_view instead.");
+  };
+  template
+  struct assert_is_valid_output_type, T>::value>> {
+    static_assert(guts::false_t::value,
+      "You tried to register a kernel with an unsupported output type: vector. Please use List instead.");
+  };
+  template
+  struct assert_is_valid_output_type::value && !guts::typelist::contains::value>> {
+    static_assert(guts::false_t::value,
+      "You tried to register a kernel with an unsupported integral output type. Please use int64_t instead.");
+  };
+
+  // ivalue_to_arg
+
+  template
+  struct decay_if_not_tensor final {
+    using type = std::decay_t;
+  };
+
+  template<>
+  struct decay_if_not_tensor final {
+    using type = at::Tensor&;
+  };
+
+  template<>
+  struct decay_if_not_tensor final {
+    using type = const at::Tensor&;
+  };
+
+  template
+  struct ivalue_to_arg final {
+    static decltype(auto) call(IValue& v) {
+      assert_is_valid_input_type();
+      return std::move(v).to();
+    }
+  };
+
+  // The following two specializations take advantage of specialized
+  // `toTensor()` overloads on IValue to avoid copying.
+  template
+  struct ivalue_to_arg final {
+    // We cannot use the default implementation if they asked for a
+    // `at::Tensor&` because it moves from the IValue, so it can't get
+    // an lvalue reference.
+    static at::Tensor& call(IValue& v) {
+      // Tensor& is valid, don't bother asserting
+      return v.toTensor();
+    }
+  };
+
+  template
+  struct ivalue_to_arg final {
+    // We should not use the default implementation if they asked for
+    // a `const at::Tensor&` because it moves from the IValue and they
+    // didn't ask for that.
+    static const at::Tensor& call(IValue& v) {
+      // const Tensor& is valid, don't bother asserting
+      return v.toTensor();
+    }
+  };
+
+  template
+  struct ivalue_to_arg final {
+    static List call(IValue& v) {
+      return v.toTensorList();
+    }
+  };
+
+  template
+  struct ivalue_to_arg, AllowDeprecatedTypes> final {
+    // If an argument is ArrayRef, convert the IValue to a std::vector and pass that
+    // to the operator. std::vector is implicitly convertible to ArrayRef.
+    static std::vector call(IValue& v) {
+      return ivalue_to_arg, AllowDeprecatedTypes>::call(v);
+    }
+  };
+  template
+  struct ivalue_to_arg final {
+    static std::vector call(IValue& v) {
+      if (v.isIntList()) {
+        std::vector r;
+        auto src = v.toIntList();
+        std::transform(src.begin(), src.end(), std::back_inserter(r), [](int64_t i) { return c10::SymInt(i); });
+        return r;
+      } else {
+        return ivalue_to_arg, AllowDeprecatedTypes>::call(v);
+      }
+    }
+  };
+  template
+  struct ivalue_to_arg, AllowDeprecatedTypes> final {
+    static OptionalArray call(IValue& v) {
+      if (v.isIntList()) {
+        std::vector r;
+        auto src = v.toIntList();
+        std::transform(src.begin(), src.end(), std::back_inserter(r), [](int64_t i) { return c10::SymInt(i); });
+        return OptionalArray(std::move(r));
+      } else {
+        return std::move(v).to>();
+      }
+    }
+  };
+  template
+  struct ivalue_to_arg>, AllowDeprecatedTypes> final {
+    // If an argument is optional>, convert the IValue to an optional> and pass that
+    // to the operator. OptionalArray is basically a optional> but implicitly convertible
+    // to optional>.
+    static OptionalArray call(IValue& v) {
+      return ivalue_to_arg, AllowDeprecatedTypes>::call(v);
+    }
+  };
+
+  template
+  struct ivalue_to_arg, AllowDeprecatedTypes> final {
+    // If an argument is OptionalArrayRef, convert the IValue to an
+    // optional> and pass that to the operator. OptionalArray
+    // is basically a optional> but implicitly convertible to
+    // OptionalArrayRef
+    static OptionalArray call(IValue& v) {
+      return ivalue_to_arg, AllowDeprecatedTypes>::call(v);
+    }
+  };
+
+  // return_to_ivalue
+  template
+  struct return_to_ivalue final {};
+
+  template
+  struct return_to_ivalue::value>> final {
+    static IValue call(T&& v) {
+      assert_is_valid_output_type();
+      return c10::ivalue::from(std::move(v));
+    }
+    static IValue copy(const T& v) {
+      assert_is_valid_output_type();
+      return IValue(v);
+    }
+  };
+
+  // Special case to allow kernels to return `Tensor&`.
+  // TODO Delete this once kernels don't do that anymore
+  template
+  struct return_to_ivalue final {
+    static IValue call(at::Tensor& v) {
+      return c10::ivalue::from(v);
+    }
+    static IValue copy(at::Tensor& v) {
+      return IValue(v);
+    }
+  };
+
+  // wrap_kernel_functor_unboxed_
+
+  template<class KernelFunctor, class OpSignature>
+  struct wrap_kernel_functor_unboxed_ final {};
+
+  // This specialization is for kernels with a first argument that is NOT of type DispatchKeySet
+  // This includes kernels with 0 arguments.
+  template<class KernelFunctor, class ReturnType, class... ParameterTypes>
+  struct wrap_kernel_functor_unboxed_<KernelFunctor, ReturnType(ParameterTypes...)> final {
+    static_assert(std::is_same<ReturnType, typename guts::infer_function_traits_t<KernelFunctor>::return_type>::value,
+      "Return type mismatch");
+    static_assert(std::is_same<guts::typelist::typelist<ParameterTypes...>, typename guts::infer_function_traits_t<KernelFunctor>::parameter_types>::value,
+      "Parameter types mismatch");
+
+    // See [Note: Argument forwarding in the dispatcher] for why ParameterTypes doesn't use &&
+    static ReturnType call(OperatorKernel* functor, DispatchKeySet, ParameterTypes... args) {
+      KernelFunctor* functor_ = static_cast<KernelFunctor*>(functor);
+      // Note [Plumbing Keys Through The Dispatcher 2]
+      // See Note [Plumbing Keys Through The Dispatcher] for the background.
+      // This functor explicitly takes in a dispatchKeySet and drops it on the floor- it does not forward it to the registered kernel.
+      //
+      // This is due to the calling convention within the dispatcher, which expects all registered kernels to have a first argument of type
+      // DispatchKeySet.
+      // This is not the case for pretty much all manually written kernels, however- this functor serves to separate the calling convention
+      // of the dispatcher from the calling convention of manually written kernels.
+      return (*functor_)(std::forward<ParameterTypes>(args)...);
+    }
+  };
+
+  // This specialization is for kernels with a first argument of type DispatchKeySet
+  template<class KernelFunctor, class ReturnType, class... ParameterTypes>
+  struct wrap_kernel_functor_unboxed_<KernelFunctor, ReturnType(DispatchKeySet, ParameterTypes...)> final {
+    static_assert(std::is_same<ReturnType, typename guts::infer_function_traits_t<KernelFunctor>::return_type>::value,
+      "Return type mismatch");
+    static_assert(std::is_same<guts::typelist::typelist<DispatchKeySet, ParameterTypes...>, typename guts::infer_function_traits_t<KernelFunctor>::parameter_types>::value,
+      "Parameter types mismatch");
+
+    // See [Note: Argument forwarding in the dispatcher] for why ParameterTypes doesn't use &&
+    static ReturnType call(OperatorKernel* functor, DispatchKeySet dispatchKeySet, ParameterTypes... args) {
+      KernelFunctor* functor_ = static_cast<KernelFunctor*>(functor);
+      // We're explicitly taking in a dispatchKeySet and forwarding it to the registered kernel.
+      // See Note [Plumbing Keys Through The Dispatcher 2] for details.
+      return (*functor_)(dispatchKeySet, std::forward<ParameterTypes>(args)...);
+    }
+  };
+
+  template<class KernelFunctor>
+  using wrap_kernel_functor_unboxed = wrap_kernel_functor_unboxed_<KernelFunctor, typename guts::infer_function_traits_t<KernelFunctor>::func_type>;
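+
+  // Illustrative sketch (not part of the upstream header): a functor that opts in
+  // to receiving the DispatchKeySet, matched by the second specialization above:
+  //
+  // > struct MyRedispatchingKernel final : c10::OperatorKernel {
+  // >   at::Tensor operator()(c10::DispatchKeySet ks, const at::Tensor& a) {
+  // >     // may use `ks` to redispatch below the current dispatch key
+  // >     return a;
+  // >   }
+  // > };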
+
+  // call_functor_with_args_from_stack
+
+  template
+  std::decay_t::return_type>
+  call_functor_with_args_from_stack_(OperatorKernel* functor, DispatchKeySet dispatchKeySet, Stack* stack, std::index_sequence, guts::typelist::typelist*) {
+    (void)(stack); // when sizeof...(ivalue_arg_indices) == 0, this argument would be unused and we have to silence the compiler warning.
+
+    // We're explicitly filtering out DispatchKeySet from the argument list.
+    // Some kernels take a DispatchKeySet as their first argument in order to plumb keys through the dispatcher.
+    // We don't want to expose the DispatchKeySet type to jit, so we don't include this argument on the stack.
+    // See Note [Plumbing Keys Through The Dispatcher] for the background.
+    return wrap_kernel_functor_unboxed::call(functor, dispatchKeySet,
+      ivalue_to_arg::type, AllowDeprecatedTypes>::call(
+        torch::jit::peek(*stack, ivalue_arg_indices, sizeof...(ivalue_arg_indices))
+    )...);
+  }
+
+  template
+  std::decay_t::return_type>
+  call_functor_with_args_from_stack(OperatorKernel* functor, DispatchKeySet dispatchKeySet, Stack* stack) {
+    // We're explicitly filtering out DispatchKeySet from the argument list.
+    // Some kernels take a DispatchKeySet as their first argument in order to plumb keys through the dispatcher.
+    // We don't want to expose the DispatchKeySet type to jit, so we don't include this argument on the stack.
+    // See Note [Plumbing Keys Through The Dispatcher] for the background.
+    using ArgTypes = typename c10::remove_DispatchKeySet_arg_from_func::parameter_types;
+    constexpr size_t num_ivalue_args = guts::typelist::size::value;
+    return call_functor_with_args_from_stack_(functor, dispatchKeySet, stack, std::make_index_sequence(), static_cast(nullptr));
+  }
+
+  // push_outputs
+
+  template
+  struct push_outputs final {
+    // Contrary to [Note: Argument forwarding in the dispatcher], we use OutputType&& here
+    // to avoid one extra call to the move constructor in this case. This is still not a
+    // universal reference though because OutputType is an explicitly specified class
+    // template parameter.
+    static void call(OutputType&& output, Stack* stack) {
+      torch::jit::push(*stack, return_to_ivalue::call(std::forward(output)));
+    }
+    static void copy(const OutputType& output, Stack* stack) {
+      torch::jit::push(*stack, return_to_ivalue::copy(output));
+    }
+  };
+  template
+  struct push_outputs, AllowDeprecatedTypes> final {
+    static void call(std::tuple&& output, Stack* stack) {
+      call_(std::move(output), stack, std::make_index_sequence());
+    }
+    static void copy(const std::tuple& output, Stack* stack) {
+      copy_(output, stack, std::make_index_sequence());
+    }
+
+  private:
+    template
+    static void call_(std::tuple&& output, Stack* stack, std::index_sequence) {
+      torch::jit::push(*stack, return_to_ivalue::call(std::forward(std::get(output)))...);
+    }
+    template
+    static void copy_(const std::tuple& output, Stack* stack, std::index_sequence) {
+      torch::jit::push(*stack, return_to_ivalue::copy(std::get(output))...);
+    }
+  };
+  template
+  struct push_outputs final {
+    static void call(int /*dummy*/, Stack* /*stack*/) {
+    }
+    static void copy(int /*dummy*/, Stack* /*stack*/) {
+    }
+  };
+
+  // make_boxed_from_unboxed_functor
+
+  template
+  struct make_boxed_from_unboxed_functor final {
+    static_assert(std::is_base_of::value,
+      "Tried to register a kernel functor using the kernel() API, but it doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it.");
+
+    static void call(OperatorKernel* functor, const OperatorHandle&, DispatchKeySet dispatchKeySet, Stack* stack) {
+      using ReturnType = typename guts::infer_function_traits_t::return_type;
+      // We're explicitly filtering out DispatchKeySet from the argument list.
+      // Some kernels take a DispatchKeySet as their first argument in order to plumb keys through the dispatcher.
+      // We don't want to expose the DispatchKeySet type to jit, so we don't include this argument on the stack.
+      // See Note [Plumbing Keys Through The Dispatcher] for the background.
+      using ArgTypes = typename c10::remove_DispatchKeySet_arg_from_func::parameter_types;
+      constexpr bool has_outputs = !std::is_same::value;
+      constexpr size_t num_inputs = guts::typelist::size::value;
+      if constexpr (has_outputs) {
+        // Decay ReturnType to ReturnType_ so that if a reference gets returned, we actually store it by value
+        // and don't get a dangling reference. This is only required because some kernels still return `Tensor&`.
+        // [Note: VC++ and 'std': ambiguous symbol]
+        using ReturnType_ = ::std::decay_t;
+        ReturnType_ output = call_functor_with_args_from_stack(functor, dispatchKeySet, stack);
+        torch::jit::drop(*stack, num_inputs);
+        // See note [ VC++ and 'std': ambiguous symbol]
+        push_outputs::call(::std::move(output), stack);
+      } else {
+        call_functor_with_args_from_stack(functor, dispatchKeySet, stack);
+        torch::jit::drop(*stack, num_inputs);
+      }
+    }
+  };
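+
+  // Illustrative sketch (not part of the upstream header) of the boxed calling
+  // convention this wrapper implements, for a kernel at::Tensor(Tensor, Scalar):
+  //
+  // > // stack before call: [self, alpha]   (num_inputs == 2)
+  // > // stack after call:  [result]        (inputs dropped, one output pushed)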
+} // namespace impl
+
+} // namespace c10
+
+namespace torch {
+  using OperatorKernel = c10::OperatorKernel;
+}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/boxing/impl/test_helpers.h b/MLPY/Lib/site-packages/torch/include/ATen/core/boxing/impl/test_helpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..7d6d8134698c649114a31f5bed05419e51a1d7fa
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/boxing/impl/test_helpers.h
@@ -0,0 +1,124 @@
+#pragma once
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+template
+inline std::vector makeStack(Inputs&&... inputs) {
+  return {std::forward(inputs)...};
+}
+
+inline at::Tensor dummyTensor(c10::DispatchKeySet ks, bool requires_grad=false) {
+  auto* allocator = c10::GetCPUAllocator();
+  int64_t nelements = 1;
+  auto dtype = caffe2::TypeMeta::Make();
+  int64_t size_bytes = nelements * dtype.itemsize();
+  auto storage_impl = c10::make_intrusive(
+      c10::StorageImpl::use_byte_size_t(),
+      size_bytes,
+      allocator->allocate(size_bytes),
+      allocator,
+      /*resizable=*/true);
+  at::Tensor t = at::detail::make_tensor(storage_impl, ks, dtype);
+  // TODO: We add this to simulate the ideal case where we only have Autograd backend keys
+  //       on Tensor when it requires grad. But currently Autograd keys are added in TensorImpl
+  //       constructor by default.
+  if (!requires_grad) {
+    t.unsafeGetTensorImpl()->remove_autograd_key();
+  }
+  return t;
+}
+
+inline at::Tensor dummyTensor(c10::DispatchKey dispatch_key, bool requires_grad=false) {
+  return dummyTensor(c10::DispatchKeySet(dispatch_key), requires_grad);
+}
+
+template
+inline std::vector callOp(const c10::OperatorHandle& op, Args... args) {
+  auto stack = makeStack(std::forward(args)...);
+  op.callBoxed(&stack);
+  return stack;
+}
+
+template
+inline Result callOpUnboxed(const c10::OperatorHandle& op, Args... args) {
+  return op.typed().call(std::forward(args)...);
+}
+
+template
+inline Result callOpUnboxedWithDispatchKey(const c10::OperatorHandle& op, c10::DispatchKey dispatchKey, Args... args) {
+  return op.typed().callWithDispatchKey(dispatchKey, std::forward(args)...);
+}
+
+template
+inline Result callOpUnboxedWithPrecomputedDispatchKeySet(const c10::OperatorHandle& op, c10::DispatchKeySet ks, Args... args) {
+  return op.typed().redispatch(ks, std::forward(args)...);
+}
+
+inline void expectDoesntFindKernel(const char* op_name, c10::DispatchKey dispatch_key) {
+  auto op = c10::Dispatcher::singleton().findSchema({op_name, ""});
+  EXPECT_ANY_THROW(
+    callOp(*op, dummyTensor(dispatch_key), 5);
+  );
+}
+
+inline void expectDoesntFindOperator(const char* op_name) {
+  auto op = c10::Dispatcher::singleton().findSchema({op_name, ""});
+  EXPECT_FALSE(op.has_value());
+}
+
+template
+inline void expectThrows(Functor&& functor, const char* expectMessageContains) {
+  try {
+    std::forward(functor)();
+  } catch (const Exception& e) {
+    EXPECT_THAT(e.what(), testing::HasSubstr(expectMessageContains));
+    return;
+  }
+  ADD_FAILURE() << "Expected to throw exception containing \""
+    << expectMessageContains << "\" but didn't throw";
+}
+
+template
+void expectListEquals(c10::ArrayRef expected, std::array actual) {
+  EXPECT_EQ(expected.size(), actual.size());
+  for (const auto i : c10::irange(expected.size())) {
+    EXPECT_EQ(expected[i], actual[i]);
+  }
+}
+
+template
+void expectListEquals(c10::ArrayRef expected, c10::ArrayRef actual) {
+  EXPECT_EQ(expected.size(), actual.size());
+  for (const auto i : c10::irange(expected.size())) {
+    EXPECT_EQ(expected[i], actual[i]);
+  }
+}
+
+template
+void expectListEquals(c10::ArrayRef expected, c10::List actual) {
+  EXPECT_EQ(expected.size(), actual.size());
+  for (const auto i : c10::irange(expected.size())) {
+    EXPECT_EQ(expected[i], actual.get(i));
+  }
+}
+
+template
+void expectListEquals(c10::ArrayRef expected, std::vector actual) {
+  EXPECT_EQ(expected.size(), actual.size());
+  for (const auto i : c10::irange(expected.size())) {
+    EXPECT_EQ(expected[i], actual[i]);
+  }
+}
+
+// NB: This is not really sound, but all of the type sets constructed here
+// are singletons so it's fine
+static inline c10::DispatchKey extractDispatchKey(const at::Tensor& t) {
+  return legacyExtractDispatchKey(t.key_set());
+}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/builtin_function.h b/MLPY/Lib/site-packages/torch/include/ATen/core/builtin_function.h
new file mode 100644
index 0000000000000000000000000000000000000000..19b1f2d579d7acde75f22cff1f1ba6d2bf318725
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/builtin_function.h
@@ -0,0 +1,88 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace torch {
+namespace jit {
+
+struct BuiltinOpFunction : public Function {
+  BuiltinOpFunction(
+      c10::QualifiedName qualname,
+      c10::FunctionSchema schema,
+      std::function<void(Stack&)> callable,
+      std::string doc_string = "")
+      : name_(std::move(qualname)),
+        callable_(std::move(callable)),
+        schema_(std::move(schema)),
+        doc_string_(std::move(doc_string)) {
+    TORCH_INTERNAL_ASSERT(schema_.returns().size() == 1);
+  }
+
+  c10::string_view doc_string() const override {
+    return doc_string_;
+  }
+
+  void run(Stack& stack) override {
+    callable_(stack);
+  }
+
+  c10::intrusive_ptr runAsync(
+      Stack& stack,
+      TaskLauncher /* not used */) override {
+    run(stack);
+    auto res = c10::make_intrusive(stack.front().type());
+    res->markCompleted(std::move(stack.front()));
+    return res;
+  }
+
+  const c10::QualifiedName& qualname() const override {
+    return name_;
+  }
+
+  // if this isn't yet defined, run its method_creator function
+  void ensure_defined() override {
+    // nop
+  }
+
+  const c10::FunctionSchema& getSchema() const override {
+    return schema_;
+  }
+
+  size_t num_inputs() const override {
+    return schema_.arguments().size();
+  }
+
+  Function& setSchema(c10::FunctionSchema schema) override {
+    schema_ = std::move(schema);
+    return *this;
+  }
+
+  bool call(Stack& stack, c10::optional, c10::function_ref) override {
+    run(stack);
+    return false;
+  }
+
+  bool call(Stack& stack, c10::function_ref) override {
+    run(stack);
+    return false;
+  }
+
+  ~BuiltinOpFunction() override = default;
+
+ private:
+  c10::QualifiedName name_;
+
+  std::function<void(Stack&)> callable_;
+
+  c10::FunctionSchema schema_;
+
+  std::string doc_string_;
+};
+
+} // namespace jit
+} // namespace torch
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/class_type.h b/MLPY/Lib/site-packages/torch/include/ATen/core/class_type.h
new file mode 100644
index 0000000000000000000000000000000000000000..bd40c149e784ba7a593a5314e381a4c35db0a73d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/class_type.h
@@ -0,0 +1,441 @@
+#pragma once
+
+#include 
+
+#include 
+#include 
+#include 
+
+namespace torch {
+namespace jit {
+struct CompilationUnit;
+struct Function;
+} // namespace jit
+} // namespace torch
+
+namespace c10 {
+
+struct FunctionSchema;
+
+// This enumerator represents the 'kind' of an attribute - a buffer, a parameter, or neither.
+// This state is mutually exclusive. Buffers and Parameters can only appear on modules.
+enum class AttributeKind {
+  BUFFER,
+  PARAMETER,
+  REGULAR_ATTRIBUTE
+};
+
+// This structure represents all notional booking entities in a class attribute: name, kind (see: AttributeKind), and type (see: TypePtr).
+// Note: This structure does not represent the value of the attribute.
+struct TORCH_API ClassAttribute {
+  public:
+  ClassAttribute(AttributeKind kind,
+  TypePtr attributeType,
+  std::string attributeName) :
+    kind_(kind),
+    attributeType_(std::move(attributeType)),
+    attributeName_(std::move(attributeName)) {}
+
+  AttributeKind getKind() const {
+    return kind_;
+  }
+
+  const TypePtr& getType() const {
+    return attributeType_;
+  }
+
+  const std::string& getName() const {
+    return attributeName_;
+  }
+
+  private:
+  AttributeKind kind_;
+  TypePtr attributeType_;
+  std::string attributeName_;
+};
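+
+// Illustrative sketch (not part of the upstream header): describing a module
+// parameter named "weight" of Tensor type:
+//
+// > ClassAttribute attr(AttributeKind::PARAMETER, TensorType::get(), "weight");
+// > TORCH_INTERNAL_ASSERT(attr.getKind() == AttributeKind::PARAMETER);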
+
+/**
+ * User Defined Types
+ */
+
+struct ClassType;
+using ClassTypePtr = std::shared_ptr;
+using ::torch::jit::CompilationUnit;
+
+// This represents a class in TorchScript.
+struct TORCH_API ClassType : public NamedType {
+  // This represents an attribute of a class; a name associated with an attribute, and a
+  // getter and (optional) setter for that attribute.
+  struct Property {
+    std::string name;
+    torch::jit::Function* getter;
+    torch::jit::Function* setter;
+  };
+
+  // Create a class type with name `name` and its methods stored in `cu`.
+  static ClassTypePtr create(
+      c10::optional qualifiedName,
+      std::weak_ptr cu,
+      bool is_module = false,
+      std::string doc_string = "",
+      std::vector unresolved_class_attributes = {});
+
+  bool equals(const Type& rhs) const override {
+    if (this == &rhs) {
+      return true;
+    }
+    if (auto user_rhs = rhs.castRaw()) {
+      const auto& lhs_name = name().value();
+      const auto& rhs_name = user_rhs->name().value();
+
+      return lhs_name == rhs_name &&
+          this->compilation_unit() == user_rhs->compilation_unit();
+    }
+    return false;
+  }
+
+  std::string str() const override {
+     return annotation_str();
+  }
+
+  std::string repr_str() const override {
+    std::stringstream ss;
+    ss << str()
+       << " (of Python compilation unit at: " << compilation_unit().get() << ")";
+    return ss.str();
+  }
+
+  const std::vector& methods() const;
+
+  TypePtr findAttribute(const std::string& name) const {
+    size_t pos = 0;
+    for (const auto& attr : attributes_) {
+      if (name == attr.getName()) {
+        break;
+      }
+      ++pos;
+    }
+
+    if (pos >= attributes_.size()) {
+      return nullptr;
+    }
+    return attributes_[pos].getType();
+  }
+
+  const TypePtr& getAttribute(const std::string& name) const {
+    auto slot = findAttributeSlot(name);
+    TORCH_CHECK(
+        slot,
+        repr_str(),
+        " does not have an attribute with name '",
+        name,
+        "'");
+    return attributes_[*slot].getType();
+  }
+
+  size_t numAttributes() const {
+    return attributes_.size();
+  }
+
+  const TypePtr& getAttribute(size_t slot) const {
+    AT_ASSERT(slot < attributes_.size());
+    return attributes_.at(slot).getType();
+  }
+
+  const std::string getAttributeName(size_t slot) const {
+    AT_ASSERT(slot < attributes_.size());
+    return attributes_[slot].getName();
+  }
+
+  void checkNotExist(const std::string& name, const std::string& what) const;
+
+  // Attributes are stored in a specific slot at runtime for efficiency.
+  // When emitting instructions we specify the slot so that attribute access is
+  // a constant-time lookup.
+  c10::optional<size_t> findAttributeSlot(const std::string& name) const {
+    size_t slot = 0;
+    for (const auto& attr : attributes_) {
+      if (name == attr.getName()) {
+        return slot;
+      }
+      slot++;
+    }
+    return c10::nullopt;
+  }
+  size_t getAttributeSlot(const std::string& name) const {
+    if (auto r = findAttributeSlot(name)) {
+      return *r;
+    }
+    TORCH_CHECK(
+        false,
+        repr_str(),
+        " does not have an attribute with name '",
+        name,
+        "'");
+  }
+
+  bool hasAttribute(const std::string& name) const {
+    return std::find_if(
+               attributes_.cbegin(),
+               attributes_.cend(),
+               [&](const ClassAttribute& attr) { return attr.getName() == name; }) !=
+        attributes_.cend();
+  }
+
+  bool isUnresolvedClassAttribute(const std::string& name) const;
+
+  at::ArrayRef<TypePtr> containedTypes() const override {
+    return attributeTypes_;
+  }
+
+  size_t addAttribute(
+      const std::string& name,
+      TypePtr type,
+      bool is_parameter = false,
+      bool is_buffer = false);
+
+  // [Internal Only] Remove an attribute from the ClassType.
+  // The caller is responsible for making sure the modification is safe:
+  // it is unsafe to keep existing allocations
+  // of this object around, and any code that works on
+  // the attribute is now invalid. Only newly created code is
+  // valid again.
+  void unsafeRemoveAttribute(const std::string& name);
+
+  // [Internal Only] Change the type of an attribute of the ClassType,
+  // The caller is responsible to make sure the modification is safe:
+  // it is unsafe to maintain uses of the old type of the attribute,
+  // and any code that works on the attribute is now invalid.
+  // Only newly created code is valid again.
+  void unsafeChangeAttributeType(const std::string& name, const TypePtr& new_ty);
+
+  // Add attribute \p NAME if it doesn't exist or verify that it has a
+  // compatible type otherwise.
+  size_t addOrCheckAttribute(
+      const std::string& name,
+      TypePtr ty,
+      bool is_parameter = false,
+      bool is_buffer = false) {
+    auto slot_idx = findAttributeSlot(name);
+    if (!slot_idx) {
+      return addAttribute(name, std::move(ty), is_parameter, is_buffer);
+    }
+
+    TORCH_CHECK(
+        is_parameter == this->is_parameter(*slot_idx),
+        "Parameter field mismatch for the field '",
+        name,
+        "'");
+    const TypePtr& atype = getAttribute(*slot_idx);
+    TORCH_CHECK(
+      ty->isSubtypeOf(*atype),
+      ty->repr_str(),
+      " is not compatible with the type ",
+      atype->repr_str(),
+      " for the field '",
+      name,
+      "'");
+    return *slot_idx;
+  }
+
+  // Get the property with the given \p name, if it exists on the class.
+  c10::optional<Property> getProperty(const std::string& name);
+  // Add a property named \p name with \p getter and \p setter as its getter and setter.
+  void addProperty(const std::string& name, torch::jit::Function* getter, torch::jit::Function* setter);
+  // Get a list of all properties.
+  const std::vector<Property>& properties() const {
+    return properties_;
+  }
+
+  bool hasConstant(const std::string& name) const {
+    return std::find_if(
+               constantNames_.cbegin(),
+               constantNames_.cend(),
+               [&](const std::string& constant) { return constant == name; }) !=
+        constantNames_.cend();
+  }
+
+  size_t addConstant(const std::string& name, const IValue& value);
+
+  c10::optional<size_t> findConstantSlot(const std::string& name) const;
+
+  size_t getConstantSlot(const std::string& name) const {
+    if (auto r = findConstantSlot(name)) {
+      return *r;
+    }
+    TORCH_CHECK(
+        false,
+        repr_str(),
+        " does not have constant field with the name '",
+        name,
+        "'");
+  }
+
+  const std::string& getConstantName(size_t slot) const;
+
+  const std::string& doc_string() const {
+    return doc_string_;
+  }
+
+  IValue getConstant(const std::string& name) const;
+
+  IValue getConstant(size_t slot) const;
+
+  c10::optional<IValue> findConstant(const std::string& name) const;
+
+  size_t numConstants() const;
+
+  at::ArrayRef<std::string> constantNames() const {
+    return constantNames_;
+  }
+
+  at::ArrayRef<IValue> constantValues() const;
+
+  // [Internal Only] Remove a constant from the ClassType.
+  // The caller is responsible for making sure the modification is safe:
+  // it is unsafe to keep existing allocations
+  // of this object around, and any code that works on
+  // the constant is now invalid. Only newly created code is
+  // valid again.
+  void unsafeRemoveConstant(const std::string& name);
+
+  TypePtr createWithContained(std::vector<TypePtr> contained_types) const override {
+    auto ptr = ClassType::create(name(), compilation_unit_, is_module());
+    AT_ASSERT(numAttributes() == contained_types.size());
+    for(size_t i = 0; i < attributes_.size(); ++i) {
+      AT_ASSERT(attributes_[i].getType()->isSubtypeOf(*contained_types[i]));
+      ptr->addAttribute(attributes_[i].getName(), std::move(contained_types[i]));
+    }
+    // Copy methods over
+    for (const auto& method : methods()) {
+      ptr->addMethod(method);
+    }
+    return ptr;
+  }
+
+  bool is_module() const override {
+    return isModule_;
+  }
+
+  const std::vector<ClassAttribute>& getAttributes() const {
+    return attributes_;
+  }
+
+  bool is_parameter(size_t slot) const {
+    TORCH_INTERNAL_ASSERT(
+        is_module(), "asking for parameterSlots of non-Module");
+    return attributes_.at(slot).getKind() == AttributeKind::PARAMETER;
+  }
+
+  bool is_buffer(size_t slot) const {
+    TORCH_INTERNAL_ASSERT(
+        is_module(), "asking for bufferWrittenSlots of non-Module");
+    return attributes_.at(slot).getKind() == AttributeKind::BUFFER;
+  }
+
+  void addForwardPreHook(torch::jit::Function* pre_hook_ptr);
+  void addForwardHook(torch::jit::Function* hook_ptr);
+  torch::jit::Function* findForwardPreHook(const std::string& name) const;
+  torch::jit::Function* findForwardHook(const std::string& name) const;
+  const std::vector<torch::jit::Function*>& getForwardHooks() const;
+  const std::vector<torch::jit::Function*>& getForwardPreHooks() const;
+
+  void checkForwardPreHookSchema(
+      int pre_hook_idx,
+      const FunctionSchema& pre_hook_schema) const;
+  void checkForwardHookSchema(
+      int hook_idx,
+      const FunctionSchema& hook_schema) const;
+
+  void addMethod(torch::jit::Function* method);
+  torch::jit::Function* findMethod(const std::string& name) const;
+  torch::jit::Function& getMethod(const std::string& name) const;
+  torch::jit::Function* findHook(const std::string& name) const;
+  torch::jit::Function& getHook(const std::string& name) const;
+  bool hasMethod(const std::string& name) const;
+
+  torch::jit::Function* findStaticMethod(const std::string& name) const;
+  void addStaticMethod(torch::jit::Function* method);
+
+  // [Internal Only] Remove a method from the ClassType.
+  // The caller is responsible for making sure the modification is safe:
+  // it is unsafe to keep existing allocations
+  // of this object around, and any code that works on
+  // the method is now invalid. Only newly created code is
+  // valid again.
+  // Note this method is intended for freezing only.
+  void unsafeRemoveMethod(const std::string& name);
+
+  std::shared_ptr<CompilationUnit> compilation_unit();
+
+  std::shared_ptr<const CompilationUnit> compilation_unit() const;
+
+  // Generate a refined version of this class.
+  // It has the same name but the slot Types are subtypes of
+  // the original slots. It is only valid to refine a class type in a context
+  // where it is known that there are no assignments to the object's slots
+  // that would invalidate the refinement.
+  // These variants are not registered in the global class table.
+  ClassTypePtr refine(at::ArrayRef<TypePtr> refined_slots) const;
+
+  bool isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const override;
+
+  static const TypeKind Kind = TypeKind::ClassType;
+
+ private:
+  ClassType(
+      c10::optional<QualifiedName> name,
+      std::weak_ptr<CompilationUnit> cu,
+      bool is_module = false,
+      std::string doc_string = "",
+      std::vector<std::string> unresolved_class_attributes = {});
+
+  std::string annotation_str_impl(C10_UNUSED TypePrinter printer = nullptr) const override {
+    const auto& n = name().value();
+    return n.qualifiedName();
+  }
+
+  void addAttribute(ClassAttribute classAttribute);
+  std::string getForwardPreHookErrorMessage(int pre_hook_idx) const;
+  std::string getForwardHookErrorMessage(int hook_idx) const;
+
+  // Mapping of attribute names -> their type.
+  // NOTE: this does not contain methods, which are stored in the module
+  // TODO: once modules support arbitrary ivalue attributes, we don't need this
+  // anymore.
+  // TODO: This is better represented as an OrderedDict, but alas it is not yet
+  // available from c10
+
+  // Mapping of constant names -> their value.
+  std::vector<std::string> constantNames_;
+  std::vector<IValue> constantValues_;
+  // Holds method attributes
+  std::weak_ptr<CompilationUnit> compilation_unit_;
+
+  // Holds all attributes; attribute details are found on ClassAttribute.
+  std::vector<ClassAttribute> attributes_;
+  // Mirrors attributes_; it only exists because the `containedTypes()` method returns an ArrayRef.
+  // Never fill this without using the appropriate provideNewClassAttribute method.
+  std::vector<TypePtr> attributeTypes_;
+
+  // List of methods associated with this class.
+  std::vector<torch::jit::Function*> methods_;
+  std::vector<torch::jit::Function*> staticmethods_;
+
+  // List of hooks to be run before/after forward.
+  std::vector<torch::jit::Function*> forward_hooks_;
+  std::vector<torch::jit::Function*> forward_pre_hooks_;
+
+  // List of properties exposed by this class.
+  std::vector<Property> properties_;
+
+  bool isModule_ = false;
+
+  // Doc string of class.
+  std::string doc_string_ = "";
+
+  // For error reporting accesses to class level attributes.
+  std::vector<std::string> unresolved_class_attributes_;
+};
+
+}
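+
+// Example (illustrative sketch; `cu` and the class/attribute names below are
+// assumptions, not taken from this header). Given a
+// shared_ptr<torch::jit::CompilationUnit> `cu`, a module type with one
+// parameter attribute could be built and queried by slot like this:
+//
+//   auto cls = c10::ClassType::create(
+//       c10::QualifiedName("__torch__.MyModule"), cu, /*is_module=*/true);
+//   cls->addAttribute("weight", c10::TensorType::get(), /*is_parameter=*/true);
+//   size_t slot = cls->getAttributeSlot("weight");    // constant-time lookup
+//   const c10::TypePtr& ty = cls->getAttribute(slot); // the TensorType above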
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/custom_class.h b/MLPY/Lib/site-packages/torch/include/ATen/core/custom_class.h
new file mode 100644
index 0000000000000000000000000000000000000000..601af3eb48c1222edfb07a5f98b3d02f4e4c5a57
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/custom_class.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+
+namespace c10 {
+
+struct ClassType;
+using ClassTypePtr = std::shared_ptr<ClassType>;
+
+TORCH_API c10::ClassTypePtr getCustomClassTypeImpl(const std::type_index &tindex);
+
+template <typename T>
+const c10::ClassTypePtr& getCustomClassType() {
+  // Classes are never unregistered from getCustomClassTypeMap and the
+  // hash lookup can be a hot path, so just cache.
+  // For the same reason, it's fine if this ends up getting duplicated across
+  // DSO boundaries for whatever reason.
+  static c10::ClassTypePtr cache = getCustomClassTypeImpl(
+      std::type_index(typeid(T)));
+  return cache;
+}
+
+}
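+
+// Example (illustrative sketch; `MyQueue` is a hypothetical custom class, not
+// defined here). Assuming it was registered as a TorchBind custom class, its
+// cached ClassType can be looked up through the intrusive_ptr type:
+//
+//   const c10::ClassTypePtr& t =
+//       c10::getCustomClassType<c10::intrusive_ptr<MyQueue>>();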
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/dispatch/CppSignature.h b/MLPY/Lib/site-packages/torch/include/ATen/core/dispatch/CppSignature.h
new file mode 100644
index 0000000000000000000000000000000000000000..7ffa8df7e7bef13ff96e5473591e529845409b11
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/dispatch/CppSignature.h
@@ -0,0 +1,65 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace c10 {
+namespace impl {
+
+// A CppSignature object holds RTTI information about a C++ function signature at runtime
+// and can compare them or get a debug-printable name.
+class TORCH_API CppSignature final {
+public:
+    CppSignature(const CppSignature&) = default;
+    CppSignature(CppSignature&&) noexcept = default;
+    CppSignature& operator=(const CppSignature&) = default;
+    CppSignature& operator=(CppSignature&&) noexcept = default;
+
+    template<class FuncType>
+    static CppSignature make() {
+        // Normalize functors, lambdas, function pointers, etc. into the plain function type
+        // The first argument of the schema might be of type DispatchKeySet, in which case we remove it.
+        // We do this to guarantee that all CppSignature's for an operator will match, even if they're registered
+        // with different calling conventions.
+        // See Note [Plumbing Keys Through The Dispatcher]
+        using decayed_function_type = typename c10::remove_DispatchKeySet_arg_from_func<std::decay_t<FuncType>>::func_type;
+
+        return CppSignature(std::type_index(typeid(decayed_function_type)));
+    }
+
+    std::string name() const {
+        return c10::demangle(signature_.name());
+    }
+
+    friend bool operator==(const CppSignature& lhs, const CppSignature& rhs) {
+        if (lhs.signature_ == rhs.signature_) {
+            return true;
+        }
+        // Without RTLD_GLOBAL, the type_index comparison could yield false because
+        // they point to different instances of the RTTI data, but the types would
+        // still be the same. Let's check for that case too.
+        // Note that there still is a case where this might not work, i.e. when
+        // linking libraries of different compilers together, they might have
+        // different ways to serialize a type name. That, together with a missing
+        // RTLD_GLOBAL, would still fail this.
+        if (0 == strcmp(lhs.signature_.name(), rhs.signature_.name())) {
+            return true;
+        }
+
+        return false;
+    }
+
+private:
+    explicit CppSignature(std::type_index signature): signature_(std::move(signature)) {}
+    std::type_index signature_;
+};
+
+inline bool operator!=(const CppSignature& lhs, const CppSignature& rhs) {
+    return !(lhs == rhs);
+}
+
+}
+}
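+
+// Example (illustrative sketch): assuming the normalization described in
+// make() above, a lambda and the plain function type it models compare equal:
+//
+//   auto lambda = [](int a, int b) -> int { return a + b; };
+//   auto sig_functor = c10::impl::CppSignature::make<decltype(lambda)>();
+//   auto sig_plain   = c10::impl::CppSignature::make<int(int, int)>();
+//   TORCH_INTERNAL_ASSERT(sig_functor == sig_plain);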
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/dispatch/DispatchKeyExtractor.h b/MLPY/Lib/site-packages/torch/include/ATen/core/dispatch/DispatchKeyExtractor.h
new file mode 100644
index 0000000000000000000000000000000000000000..8c39b5f0a4bf12244d0d1b38ba4e2f33d72463ea
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/dispatch/DispatchKeyExtractor.h
@@ -0,0 +1,242 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace c10 {
+
+namespace impl {
+
+// Take a DispatchKeySet for a Tensor and determine what the actual dispatch
+// DispatchKey should be, taking into account TLS, and skipping backends which
+// fall through.
+//
+// Unlike Tensor::key_set(), the value of this on a tensor can change depending
+// on TLS.
+//
+// NB: If there is no valid dispatch key, this will return Undefined
+static inline DispatchKeySet computeDispatchKeySet(
+    DispatchKeySet ks,
+    // The key mask lets us eliminate (by zero entries) keys which should not
+    // be considered for dispatch.  There are two cases when we use this:
+    //
+    // - If an operator's dispatch table contains a fallthrough entry, we
+    //   should bypass it entirely when finding the key
+    // - If a user invokes with redispatch, the mask lets us
+    //   zero out the key at which the user asked us to stop.
+    //
+    // These excluded backends are NOT tracked in the TLS, but must be applied
+    // AFTER TLS (since the backend may have been introduced for consideration
+    // by the included TLS), which is why you have to pass them in to this
+    // function (as opposed to just applying it to the input 'ks').
+    DispatchKeySet key_mask
+) {
+  c10::impl::LocalDispatchKeySet local = c10::impl::tls_local_dispatch_key_set();
+  // TODO: It's a bit irritating that we have to do logical ORs here, it would
+  // be nice to only do one.  Can always_included be folded into the TLS?  Well,
+  // it's a bit troublesome, because fastpath TLS access requires the type of
+  // the TLS in question to be zero-initialized, so you don't actually win
+  // anything in that case.
+  return (((ks | local.included_) - local.excluded_) & key_mask);
+}
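+
+// Worked example (illustrative; the key sets below are made up): if the
+// tensor's keys are {CPU, AutogradCPU}, the TLS excludes {AutogradCPU}
+// (e.g. inside an autograd-disabled region), nothing extra is included, and
+// the key mask is full, then
+//   ((ks | included) - excluded) & mask = {CPU}
+// so dispatch proceeds directly to the CPU kernel.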
+
+}
+
+namespace detail {
+  // A small gadget to extract the DispatchKeySet from types which are known
+  // to have it.  Used to extract dispatch keys from unboxed calls.
+  struct MultiDispatchKeySet : at::IterArgs<MultiDispatchKeySet> {
+    DispatchKeySet ts;
+    void operator()(const at::Tensor& x) {
+      ts = ts | x.key_set();
+    }
+    void operator()(const c10::optional<at::Tensor>& x) {
+      if (x.has_value()) {
+        ts = ts | x->key_set();
+      }
+    }
+    void operator()(at::ArrayRef<at::Tensor> xs) {
+      for (const auto& x : xs) {
+        ts = ts | x.key_set();
+      }
+    }
+    // Tensor?[] translates to this case.
+    void operator()(const c10::List<c10::optional<at::Tensor>>& xs) {
+      for (c10::optional<at::Tensor> x : xs) {
+        if (x.has_value()) {
+          ts = ts | x.value().key_set();
+        }
+      }
+    }
+    // Structured Tensor[] translates to this case
+    void operator()(const at::ITensorListRef& xs) {
+      for (const auto& x : xs) {
+        ts = ts | x.key_set();
+      }
+    }
+    [[noreturn]] void operator()(at::ArrayRef<c10::optional<at::Tensor>>) {
+      // Just checking that the handling of Tensor?[] didn't change.
+      TORCH_INTERNAL_ASSERT(false);
+    }
+    void operator()(const at::Generator& gen) {
+      if (gen.defined()) {
+        ts = ts | gen.key_set();
+      }
+    }
+    void operator()(const c10::optional<at::Generator>& gen) {
+      if (gen.has_value() && gen->defined()) {
+        ts = ts | gen->key_set();
+      }
+    }
+    template <typename T>
+    void operator()(const T&) {
+      // do nothing
+    }
+  };
+
+  // NB: take by const reference (Don't do universal forwarding here! You
+  // don't want to move into this function!)
+  template <typename... Args>
+  DispatchKeySet multi_dispatch_key_set(const Args&... args) {
+    return MultiDispatchKeySet().apply(args...).ts;
+  }
+}
+
+/**
+ * An instance of DispatchKeyExtractor knows how to get a dispatch key given
+ * a list of arguments for an operator call.
+ *
+ * The instance is specific for a certain operator as:
+ *  - In boxed dispatch, different operators have different ways to extract
+ *    the dispatch key (e.g. different numbers of arguments), and we precompute
+ *    the stack locations we should look at; and
+ *  - In all dispatch, some backends should be excluded from dispatch because
+ *    they have been registered as fallthrough.  The set of excluded backends
+ *    varies from operator to operator, as some operators may have overridden the
+ *    fallthrough with custom behavior.
+ *
+ *   Note - this should maintain identical impl to the py dispatcher key extraction logic
+ *   at pytorch/torch/dispatcher.py
+ */
+struct TORCH_API DispatchKeyExtractor final {
+public:
+  static DispatchKeyExtractor make(const FunctionSchema& schema) {
+    return DispatchKeyExtractor(makeBitsetForDispatchArgs(schema));
+  }
+
+  static DispatchKeyExtractor makeUninitialized() {
+    return DispatchKeyExtractor(c10::utils::bitset());
+  }
+
+  void registerSchema(const FunctionSchema& schema) {
+    TORCH_INTERNAL_ASSERT(dispatch_arg_indices_reverse_.is_entirely_unset());
+    dispatch_arg_indices_reverse_ = makeBitsetForDispatchArgs(schema);
+  }
+  void deregisterSchema() {
+    dispatch_arg_indices_reverse_ = c10::utils::bitset();
+  }
+
+  DispatchKeySet getDispatchKeySetBoxed(const torch::jit::Stack* stack) const {
+    DispatchKeySet ks;
+    dispatch_arg_indices_reverse_.for_each_set_bit([&] (size_t reverse_arg_index) {
+      const auto& ivalue = torch::jit::peek(*stack, 0, reverse_arg_index + 1);
+      if (C10_LIKELY(ivalue.isTensor())) {
+        // NB: Take care not to introduce a refcount bump (there's
+        // no safe toTensorRef method, alas)
+        ks = ks | ivalue.unsafeToTensorImpl()->key_set();
+      } else if (C10_UNLIKELY(ivalue.isTensorList())) {
+        for (const at::Tensor& tensor : ivalue.toTensorList()) {
+          ks = ks | tensor.key_set();
+        }
+      }
+      // Tensor?[] translates to a c10::List so we need to peek inside
+      else if (C10_UNLIKELY(ivalue.isList())) {
+        for (const auto& elt : ivalue.toListRef()) {
+          if (elt.isTensor()) {
+            ks = ks | elt.toTensor().key_set();
+          }
+        }
+      }
+    });
+    // Keys that are fallthrough should be skipped
+    if (requiresBitsetPerBackend_) {
+      auto backend_idx = ks.getBackendIndex();
+      return impl::computeDispatchKeySet(ks, nonFallthroughKeysPerBackend_[backend_idx]);
+    } else {
+      return impl::computeDispatchKeySet(ks, nonFallthroughKeys_);
+    }
+  }
+
+  template<class... Args>
+  DispatchKeySet getDispatchKeySetUnboxed(const Args&... args) const {
+    auto ks = detail::multi_dispatch_key_set(args...);
+    // Keys that are fallthrough should be skipped
+    if (requiresBitsetPerBackend_) {
+      auto backend_idx = ks.getBackendIndex();
+      return impl::computeDispatchKeySet(ks, nonFallthroughKeysPerBackend_[backend_idx]);
+    } else {
+      return impl::computeDispatchKeySet(ks, nonFallthroughKeys_);
+    }
+  }
+
+  void setOperatorHasFallthroughForKey(DispatchKey k, bool has_fallthrough);
+
+  std::string dumpState() const;
+  void checkInvariants(const FunctionSchema& schema) const;
+
+private:
+  static c10::utils::bitset makeBitsetForDispatchArgs(const FunctionSchema& schema) {
+    TORCH_CHECK(schema.arguments().size() <= c10::utils::bitset::NUM_BITS(),
+        "The function schema has ", schema.arguments().size(),
+        " arguments but this PyTorch build only supports ", c10::utils::bitset::NUM_BITS());
+    c10::utils::bitset dispatch_arg_indices_reverse;
+    for (const auto index : c10::irange(schema.arguments().size())) {
+      if (schema.arguments()[index].type()->isSubtypeOf(*TensorType::get()) ||
+          schema.arguments()[index].type()->isSubtypeOf(
+              *ListType::ofTensors()) ||
+          schema.arguments()[index].type()->isSubtypeOf(
+              *ListType::ofOptionalTensors()) ||
+          schema.arguments()[index].type()->isSubtypeOf(
+              *OptionalType::ofTensor())) {
+        dispatch_arg_indices_reverse.set(schema.arguments().size() - 1 - index);
+      }
+    }
+    return dispatch_arg_indices_reverse;
+  }
+
+  explicit DispatchKeyExtractor(c10::utils::bitset dispatch_arg_indices_reverse)
+  : dispatch_arg_indices_reverse_(dispatch_arg_indices_reverse)
+  , nonFallthroughKeys_(DispatchKeySet::FULL)
+  , requiresBitsetPerBackend_(false) {
+    for (const auto i : c10::irange(nonFallthroughKeysPerBackend_.size())) {
+      nonFallthroughKeysPerBackend_[i] = DispatchKeySet::FULL;
+    }
+  }
+
+  // this is a bitset that has ones for each argument index which has to be
+  // considered for dispatch. This avoids having to iterate over the stack
+  // to find all the tensors. The bits are stored in reverse order, i.e.
+  // dispatch_arg_indices_reverse_[i] == true, then the i-th argument from
+  // the top of the stack (i.e. the i-th last argument of the function)
+  // is relevant for dispatch.
+  // dispatch_arg_indices_reverse_ is allowed to have zero bits set; that just means you must do the
+  // fallthrough
+  c10::utils::bitset dispatch_arg_indices_reverse_;
+
+  // Set of functionality keys for which the operator does NOT have fallthrough kernel.
+  DispatchKeySet nonFallthroughKeys_;
+  // Set of functionality keys for which the operator does NOT have fallthrough kernel, defined PER BACKEND.
+  // This is only needed if we know that the operator has a different set of fallthroughs defined for some backends.
+  std::array<DispatchKeySet, num_backends> nonFallthroughKeysPerBackend_;
+  // Flag to tell us if we can use the single set of nonFallthroughKeys_ (fast path),
+  // or if we need to fall back to the slower path and check nonFallthroughKeysPerBackend_
+  bool requiresBitsetPerBackend_;
+};
+
+}
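+
+// Example (illustrative sketch; the schema below is hypothetical):
+//   my::op(Tensor self, int n, Tensor other) -> Tensor
+// makeBitsetForDispatchArgs() sets the bits for the reversed indices of the
+// two Tensor arguments (positions 2 and 0 counted from the top of the stack),
+// so getDispatchKeySetBoxed() only peeks at those two stack slots when
+// computing the dispatch key.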
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/dispatch/Dispatcher.h b/MLPY/Lib/site-packages/torch/include/ATen/core/dispatch/Dispatcher.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc08b28e9bd80cdb882c9c04dfdac12c45cf516d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/dispatch/Dispatcher.h
@@ -0,0 +1,795 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+#ifndef NDEBUG
+#include 
+#endif
+
+namespace c10 {
+
+TORCH_API bool show_dispatch_trace();
+TORCH_API void dispatch_trace_nesting_incr();
+TORCH_API void dispatch_trace_nesting_decr();
+TORCH_API int64_t dispatch_trace_nesting_value();
+
+struct DispatchTraceNestingGuard {
+  DispatchTraceNestingGuard() { dispatch_trace_nesting_incr(); }
+  ~DispatchTraceNestingGuard() { dispatch_trace_nesting_decr(); }
+};
+
+class TORCH_API OperatorHandle;
+template<class FuncType> class TypedOperatorHandle;
+
+/**
+ * Implement this interface and register your instance with the dispatcher
+ * to get notified when operators are registered or deregistered with
+ * the dispatcher.
+ *
+ * NB: registration events only occur when a 'def' occurs; we don't trigger
+ * on 'impl' or 'fallback' calls.
+ */
+class TORCH_API OpRegistrationListener {
+public:
+  virtual ~OpRegistrationListener();
+
+  virtual void onOperatorRegistered(const OperatorHandle& op) = 0;
+  virtual void onOperatorDeregistered(const OperatorHandle& op) = 0;
+};
+
+namespace detail {
+class RegistrationListenerList;
+}
+class SchemaRegistrationHandleRAII;
+
+/**
+ * Top-level dispatch interface for dispatching via the dynamic dispatcher.
+ * Most end users shouldn't use this directly; if you're trying to register
+ * ops look in op_registration
+ */
+class TORCH_API Dispatcher final {
+private:
+  // For direct access to backend fallback information
+  friend class impl::OperatorEntry;
+
+  struct OperatorDef final {
+    explicit OperatorDef(OperatorName&& op_name)
+    : op(std::move(op_name)) {}
+
+    impl::OperatorEntry op;
+
+    // These refer to the number of outstanding RegistrationHandleRAII
+    // for this operator.  def_count reflects only def() registrations
+    // (in the new world, this should only ever be 1, but old style
+    // registrations may register the schema multiple times, which
+    // will increase this count).  def_and_impl_count reflects the number
+    // of combined def() and impl() registrations.  When the last def() gets
+    // unregistered, we must immediately call the Deregistered listeners, but we
+    // must not actually delete the handle as there are other outstanding RAII
+    // destructors which will try to destruct and they had better still have a
+    // working operator handle in this case
+    size_t def_count = 0;
+    size_t def_and_impl_count = 0;
+  };
+  friend class OperatorHandle;
+  template<class FuncType> friend class TypedOperatorHandle;
+
+  struct Guard final {
+    Guard() : alive(true), mutex() {}
+    std::atomic alive;
+    std::mutex mutex;
+  };
+
+public:
+  ~Dispatcher();
+
+  // Implementation note: this class abstracts over the fact that we have per-operator
+  // dispatch tables.  This could be easily adjusted to have a single global hash
+  // table.
+  static Dispatcher& realSingleton();
+
+  C10_ALWAYS_INLINE static Dispatcher& singleton() {
+#if !defined C10_MOBILE
+    // Implemented inline so that steady-state code needn't incur
+    // function-call overhead. We can't just inline `realSingleton`
+    // because the function-local static would get duplicated across
+    // all DSOs that include & use this header, leading to multiple
+    // singleton instances.
+    static Dispatcher& s = realSingleton();
+    return s;
+#else
+    // For C10_MOBILE, we should never inline a static function that
+    // has a static member, since the generated code calls
+    // __cxa_guard_acquire and __cxa_guard_release which help
+    // implement exactly once semantics for the initialization of the
+    // static Dispatcher& s above (for the non-mobile case). That
+    // additional code when duplicated across all operator stubs
+    // for every backend results in a lot of additional code
+    // being generated by the compiler.
+    return realSingleton();
+#endif
+  }
+
+  // ------------------------------------------------------------------------
+  //
+  // Accessing operators by schema
+  //
+  // ------------------------------------------------------------------------
+
+  /**
+   * Looks for an operator schema with the given name and overload name
+   * and returns it if it is registered WITH A SCHEMA.
+   * Returns nullopt otherwise.
+   */
+  c10::optional<OperatorHandle> findSchema(const OperatorName& operator_name);
+
+  /**
+   * Variant of findSchema that results in less code generated at the call site.
+   * It (1) takes const char* pointer rather than OperatorName (so we skip
+   * generating std::string constructor calls at the call site), and (2)
+   * it raises an exception if the operator is not found (so we skip
+   * generating exception raising code at the call site)
+   *
+   * Irritatingly, we still have to generate the handful of instructions
+   * for dealing with an exception being thrown during static initialization
+   * (e.g. __cxa_guard_abort).  If we could annotate this method noexcept we
+   * could avoid this code too, but as the name of the function suggests,
+   * it does throw exceptions.
+   */
+  OperatorHandle findSchemaOrThrow(const char* name, const char* overload_name);
+
+  // Like findSchema, but also returns OperatorHandle even if there is no schema
+  c10::optional<OperatorHandle> findOp(const OperatorName& operator_name);
+
+  // Returns a list of all operator names present in the operatorLookupTable_
+  const std::vector<OperatorName> getAllOpNames();
+
+  // ------------------------------------------------------------------------
+  //
+  // Invoking operators
+  //
+  // ------------------------------------------------------------------------
+
+  template<class Return, class... Args>
+  Return call(const TypedOperatorHandle<Return(Args...)>& op, Args... args) const;
+
+
+  template<class Return, class... Args>
+  static Return callWithDispatchKeySlowPath(const TypedOperatorHandle<Return(Args...)>& op, at::StepCallbacks& stepCallbacks, DispatchKeySet dispatchKeySet, const KernelFunction& kernel, Args... args);
+
+  // Like call, but intended for use in a redispatch in kernels that have explicitly performed the DispatchKey update calculation.
+  // This will take the DispatchKeySet completely as is and dispatch to the kernel of the corresponding highest priority key in the set.
+  // Note that this version of redispatch treats the inputted DispatchKeySet *as is*, and does NOT mask out the highest priority key.
+  // See Note [Plumbing Keys Through The Dispatcher]
+  template<class Return, class... Args>
+  Return redispatch(const TypedOperatorHandle<Return(Args...)>& op, DispatchKeySet currentDispatchKeySet, Args... args) const;
+
+  // Invoke an operator via the boxed calling convention using an IValue stack
+  void callBoxed(const OperatorHandle& op, Stack* stack) const;
+  void callBoxedForDispatchKey(const OperatorHandle& op, DispatchKey dk, Stack* stack) const;
+
+  // TODO: This will only be useful if we write a backend fallback that plumbs dispatch keys (currently there are none)
+  // See Note [Plumbing Keys Through The Dispatcher]
+  void redispatchBoxed(const OperatorHandle& op, DispatchKeySet dispatchKeySet, Stack* stack) const;
+
+  bool hasBackendFallbackForDispatchKey(DispatchKey dk) {
+    auto dispatch_ix = getDispatchTableIndexForDispatchKey(dk);
+    if (dispatch_ix < 0) return false;
+    return backendFallbackKernels_[dispatch_ix].kernel.isValid();
+  }
+
+  // Used by torchdeploy/multipy for multiple interpreters racing.
+  void waitForDef(const FunctionSchema& schema);
+  void waitForImpl(const OperatorName& op_name, c10::optional<DispatchKey> dispatch_key);
+
+  // ------------------------------------------------------------------------
+  //
+  // Performing registrations (NON user public; use op_registration)
+  //
+  // ------------------------------------------------------------------------
+
+  /**
+   * Register a new operator schema.
+   *
+   * If a schema with the same operator name and overload name already exists,
+   * this function will check that both schemas are exactly identical.
+   */
+  RegistrationHandleRAII registerDef(FunctionSchema schema, std::string debug, std::vector<at::Tag> tags = {});
+
+  /**
+   * Register a kernel to the dispatch table for an operator.
+   * If dispatch_key is nullopt, then this registers a fallback kernel.
+   *
+   * @return A RAII object that manages the lifetime of the registration.
+   *         Once that object is destructed, the kernel will be deregistered.
+   */
+  // NB: steals the inferred function schema, as we may need to hold on to
+  // it for a bit until the real schema turns up
+  RegistrationHandleRAII registerImpl(OperatorName op_name, c10::optional<DispatchKey> dispatch_key, KernelFunction kernel, c10::optional<impl::CppSignature> cpp_signature, std::unique_ptr<FunctionSchema> inferred_function_schema, std::string debug);
+
+  /**
+   * Given an operator, tells the Dispatcher that we have implemented an abstract impl
+   * for this op in the given Python module. Call this a "pystub".
+   */
+  RegistrationHandleRAII registerAbstractImplPyStub(const OperatorName& op_name, const char* pymodule, const char* context);
+
+  /**
+   * Given an operator, throws if we have an abstract impl pystub.
+   */
+  void throwIfHasAbstractImplPyStub(OperatorName op_name);
+
+  c10::optional<std::pair<const char*, const char*>> getAbstractImplPyStub(OperatorName op_name);
+
+  /**
+   * Register a new operator by name.
+   */
+  RegistrationHandleRAII registerName(OperatorName op_name);
+
+  /**
+   * Register a fallback kernel for a backend.
+   * If an operator is called but there is no concrete kernel for the dispatch
+   * key of the given operator arguments, it will check if there is such a
+   * fallback kernel for the given dispatch key and, if yes, call that one.
+   */
+  RegistrationHandleRAII registerFallback(DispatchKey dispatch_key, KernelFunction kernel, std::string debug);
+
+  /**
+   * Use to register whenever we had a TORCH_LIBRARY declaration in the frontend
+   * API.  These invocations are only permitted once per program, so we raise
+   * an error if this is called again for the same namespace.
+   */
+  RegistrationHandleRAII registerLibrary(std::string ns, std::string debug);
+
+  // ------------------------------------------------------------------------
+  //
+  // Listeners on registrations
+  //
+  // ------------------------------------------------------------------------
+
+  /**
+   * Add a listener that gets called whenever a new op is registered or an existing
+   * op is deregistered. Immediately after registering, this listener gets called
+   * for all previously registered ops, so it can be used to keep track of ops
+   * registered with this dispatcher.
+   */
+  RegistrationHandleRAII addRegistrationListener(std::unique_ptr<OpRegistrationListener> listener);
+
+  void checkInvariants() const;
+
+  //
+  // ------------------------------------------------------------------------
+  //
+  // Assertions
+  //
+  // ------------------------------------------------------------------------
+
+  /**
+   * For testing purposes.
+   * Returns a list of all operators that were created through calls to registerImpl(),
+   * without any corresponding calls to registerDef(). After static initialization
+   * is done this is almost certainly a bug, as the created OperatorHandle won't have
+   * any schema associated with it and users calling the op through the dispatcher
+   * won't be able to access it
+   *
+   * Note that we cannot enforce this invariant "as we go" during static initialization,
+   * due to undefined static initialization order - we have no guarantees over the order
+   * in which .def() and .impl() calls are registered in the dispatcher at static
+   * initialization time. So this function should only be called after static initialization.
+   */
+  std::vector<OperatorHandle> findDanglingImpls() const;
+
+  /**
+   * Useful for inspecting global Dispatcher registration state.
+   * Returns the names of all operators with a kernel registered for the specified DispatchKey.
+   * If no DispatchKey is specified, it returns all registered operators.
+   */
+  std::vector<OperatorName> getRegistrationsForDispatchKey(c10::optional<DispatchKey> k) const;
+
+private:
+  Dispatcher();
+
+  static int64_t sequenceNumberForRunningRecordFunction(DispatchKey dispatchKey);
+  static void runRecordFunction(at::RecordFunction& guard, at::RecordFunction::schema_ref_t schema_ref, DispatchKey dispatchKey);
+  static void runRecordFunction(at::RecordFunction& guard, at::RecordFunction::schema_ref_t schema_ref, DispatchKey dispatchKey, c10::ArrayRef<const c10::IValue> args);
+
+  #ifdef FBCODE_CAFFE2
+  static bool profilingOperatorEvents();
+  static void fireOpStartUSDT(at::RecordFunction::schema_ref_t schema_ref);
+  static void fireOpEndUSDT(at::RecordFunction::schema_ref_t schema_ref);
+  #endif // FBCODE_CAFFE2
+
+  OperatorHandle findOrRegisterSchema_(FunctionSchema&& schema);
+  OperatorHandle findOrRegisterName_(const OperatorName& op_name);
+
+  void deregisterDef_(const OperatorHandle& op, const OperatorName& op_name);
+  void deregisterImpl_(
+    const OperatorHandle& op,
+    const OperatorName& op_name,
+    c10::optional<DispatchKey> dispatch_key,
+    impl::OperatorEntry::AnnotatedKernelContainerIterator kernel_handle);
+  void deregisterName_(const OperatorHandle& op, const OperatorName& op_name);
+  void deregisterFallback_(DispatchKey dispatchKey);
+  void deregisterLibrary_(const std::string& ns);
+  void cleanup(const OperatorHandle& op, const OperatorName& op_name);
+  void checkSchemaCompatibility(const OperatorHandle& op, const FunctionSchema& schema, const std::string& debug);
+
+  std::list<OperatorDef> operators_;
+#if !defined(C10_MOBILE)
+  LeftRight<ska::flat_hash_map<OperatorName, OperatorHandle>> operatorLookupTable_;
+#else
+  RWSafeLeftRightWrapper<ska::flat_hash_map<OperatorName, OperatorHandle>> operatorLookupTable_;
+#endif
+  // Map from namespace to debug string (saying, e.g., where the library was defined)
+  ska::flat_hash_map<std::string, std::string> libraries_;
+
+  std::array backendFallbackKernels_;
+
+  std::unique_ptr<detail::RegistrationListenerList> listeners_;
+
+  // This condition variable gets notified whenever we add a new def/impl to the
+  // dispatch table.  This is primarily used by multipy/torchdeploy, when
+  // we have multiple interpreters trying to register to the dispatch table.
+  // In this situation, whenever the non-primary interpreter would have tried
+  // to register to the dispatch table, instead it will check to see if the
+  // expected registration has already been made, and if it hasn't, wait on
+  // this condition variable to see if it was just racing with the primary
+  // interpreter.
+  //
+  // We expect it to be rare for there to be any waiters on this condition
+  // variable.  This is mostly just to help give better diagnostics if
+  // something goes horribly wrong
+  std::condition_variable cond_var_;
+
+  // Protect concurrent access to the dispatcher.  We store this in a
+  // `shared_ptr` as we return callbacks that call back into dispatcher methods,
+  // and we need to be able to handle and guard against the event when the
+  // `Dispatcher` has been destroyed before the callbacks fire.
+  std::shared_ptr guard_;
+};
+
+/**
+ * This is a handle to an operator schema registered with the dispatcher.
+ * This handle can be used to register kernels with the dispatcher or
+ * to lookup a kernel for a certain set of arguments.
+ */
+class TORCH_API OperatorHandle {
+  template <typename T> friend struct std::hash;
+
+public:
+  OperatorHandle(OperatorHandle&&) noexcept = default;
+  OperatorHandle& operator=(OperatorHandle&&) noexcept = default;
+  OperatorHandle(const OperatorHandle&) = default;
+  OperatorHandle& operator=(const OperatorHandle&) = default;
+  // NOLINTNEXTLINE(performance-trivially-destructible)
+  ~OperatorHandle();
+
+  const OperatorName& operator_name() const {
+    return operatorDef_->op.operator_name();
+  }
+
+  bool hasSchema() const {
+    return operatorDef_->op.hasSchema();
+  }
+
+  const FunctionSchema& schema() const {
+    return operatorDef_->op.schema();
+  }
+
+  const std::string& debug() const {
+    return operatorDef_->op.debug();
+  }
+
+  std::string dumpState() const {
+    return operatorDef_->op.dumpState();
+  }
+
+  bool hasKernelForDispatchKey(DispatchKey k) const {
+    return operatorDef_->op.hasKernelForDispatchKey(k);
+  }
+
+  bool hasKernelForAnyDispatchKey(DispatchKeySet k) const {
+    return operatorDef_->op.hasKernelForAnyDispatchKey(k);
+  }
+
+  bool hasComputedKernelForDispatchKey(DispatchKey k) const {
+    return operatorDef_->op.hasComputedKernelForDispatchKey(k);
+  }
+
+  std::string dumpComputedTable() const {
+    return operatorDef_->op.dumpComputedTable();
+  }
+
+  void checkInvariants() const {
+    return operatorDef_->op.checkInvariants();
+  }
+
+  c10::ArrayRef<at::Tag> getTags() const {
+    return operatorDef_->op.getTags();
+  }
+
+  void setReportErrorCallback_(std::unique_ptr<c10::SafePyObject> callback) {
+    operatorDef_->op.setReportErrorCallback_(std::move(callback));
+  }
+
+  bool hasTag(const at::Tag& tag) const {
+    for(const auto& tag_: getTags()) {
+      if (tag == tag_) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  template<class FuncType>
+  TypedOperatorHandle<FuncType> typed() const {
+    // NB: This assert is not 100% sound: you can retrieve a typed() operator
+    // handle prior to ANY C++ signature being registered on the operator
+    // and the check will say everything is OK (at which point you can then
+    // smuggle in a kernel that is typed incorrectly).  For everything
+    // in core library this won't happen, because all the static registrations
+    // will be done by the time a typed() handle is acquired.
+#if !defined C10_MOBILE
+    operatorDef_->op.assertSignatureIsCorrect<FuncType>();
+    if (fn_has_symint<FuncType>::value) {
+      operatorDef_->op.assertSignatureIsCorrect<typename fn_remove_symint<FuncType>::type>();
+    }
+#endif
+    return TypedOperatorHandle<FuncType>(operatorIterator_);
+  }
+
+  void callBoxed(Stack* stack) const {
+    c10::Dispatcher::singleton().callBoxed(*this, stack);
+  }
+
+  void callBoxed(Stack& stack) const {
+    callBoxed(&stack);
+  }
+
+  void callBoxedForDispatchKey(DispatchKey dk, Stack& stack) const {
+    c10::Dispatcher::singleton().callBoxedForDispatchKey(*this, dk, &stack);
+  }
+
+  void redispatchBoxed(DispatchKeySet ks, Stack* stack) const {
+    c10::Dispatcher::singleton().redispatchBoxed(*this, ks, stack);
+  }
+
+  template <typename F>
+  PyObject* getPythonOp(c10::impl::PyInterpreter* self_interpreter, F slow_accessor) const {
+    return operatorDef_->op.getPythonOp(self_interpreter, slow_accessor);
+  }
+
+  bool operator==(const OperatorHandle& other) const {
+    return operatorDef_ == other.operatorDef_;
+  }
+
+  bool operator!=(const OperatorHandle& other) const {
+    return operatorDef_ != other.operatorDef_;
+  }
+
+private:
+  explicit OperatorHandle(std::list<Dispatcher::OperatorDef>::iterator operatorIterator)
+  : operatorDef_(&*operatorIterator), operatorIterator_(operatorIterator)  {}
+  friend class Dispatcher;
+  template<class FuncType> friend class TypedOperatorHandle;
+
+  // Storing a direct pointer to the OperatorDef even though we
+  // already have the iterator saves an instruction in the critical
+  // dispatch path. The iterator is effectively a
+  // pointer-to-std::list-node, and (at least in libstdc++'s
+  // implementation) the element is at an offset 16 bytes from that,
+  // because the prev/next pointers come first in the list node
+  // struct. So, an add instruction would be necessary to convert from the
+  // iterator to an OperatorDef*.
+  Dispatcher::OperatorDef* operatorDef_;
+
+  // We need to store this iterator in order to make
+  // Dispatcher::cleanup() fast -- it runs a lot on program
+  // termination (and presumably library unloading).
+  std::list<Dispatcher::OperatorDef>::iterator operatorIterator_;
+};
+
+/**
+ * This is a handle to an operator schema registered with the dispatcher.
+ * It holds the same information as an OperatorHandle, but it is templated
+ * on the operator arguments and allows calling the operator in an
+ * unboxed way.
+ */
+template<class FuncType>
+class TypedOperatorHandle final {
+  static_assert(guts::false_t<FuncType>(), "FuncType in OperatorHandle::typed was not a valid function type");
+};
+template<class Return, class... Args>
+class TypedOperatorHandle<Return (Args...)> final : public OperatorHandle {
+public:
+  TypedOperatorHandle(TypedOperatorHandle&&) noexcept = default;
+  TypedOperatorHandle& operator=(TypedOperatorHandle&&) noexcept = default;
+  TypedOperatorHandle(const TypedOperatorHandle&) = default;
+  TypedOperatorHandle& operator=(const TypedOperatorHandle&) = default;
+
+  // See [Note: Argument forwarding in the dispatcher] for why Args doesn't use &&
+  C10_ALWAYS_INLINE Return call(Args... args) const {
+    return c10::Dispatcher::singleton().call<Return, Args...>(*this, std::forward<Args>(args)...);
+  }
+
+  // See [Note: Argument forwarding in the dispatcher] for why Args doesn't use &&
+  C10_ALWAYS_INLINE Return redispatch(DispatchKeySet currentDispatchKeySet, Args... args) const {
+    return c10::Dispatcher::singleton().redispatch<Return, Args...>(*this, currentDispatchKeySet, std::forward<Args>(args)...);
+  }
+
+private:
+  explicit TypedOperatorHandle(std::list<Dispatcher::OperatorDef>::iterator operatorIterator)
+  : OperatorHandle(operatorIterator) {}
+  friend class OperatorHandle;
+};
+
+namespace detail {
+template<class... Args> inline void unused_arg_(const Args&...) {}
+
+// CaptureKernelCall is intended to capture return values from Dispatcher
+// unboxed kernel calls. A record function may request to get outputs from the
+// kernel calls. For boxed kernels, it's straightforward, the returned values
+// are in the stack object. The stack can be passed to record functions. For
+// unboxed kernels, we need to handle different kinds of return values, cache
+// them temporarily, then release the values for the actual function call
+// return.
+template <typename ReturnType>
+struct CaptureKernelCall {
+  template <typename F, typename... Args>
+  CaptureKernelCall(
+      const F& kernel,
+      const TypedOperatorHandle<ReturnType(Args...)>& op,
+      const DispatchKeySet& dispatchKeySet,
+      Args&&... args)
+      // Calls the kernel and captures the result in output_.
+      : output_{kernel.template call<ReturnType, Args...>(
+            op,
+            dispatchKeySet,
+            std::forward<Args>(args)...)} {}
+  // Wraps the return values in a Stack.
+  Stack getOutputs() {
+    Stack stack;
+    impl::push_outputs<ReturnType>::copy(output_, &stack);
+    return stack;
+  }
+  // Since we are returning the output_, we don't expect the output_ to be used
+  // afterward. Copy elision and RVO do not apply to class data members. Using
+  // move semantics to avoid copies when possible.
+  ReturnType release() && {
+    return std::move(output_);
+  }
+
+ private:
+  ReturnType output_;
+};
+
+// Handle the lvalue reference differently since it should not be moved.
+template <>
+inline at::Tensor& CaptureKernelCall<at::Tensor&>::release() && {
+  return output_;
+}
+
+// Handle case where the kernel returns void.
+template <>
+struct CaptureKernelCall<void> {
+  template <typename F, typename... Args>
+  CaptureKernelCall(
+      const F& kernel,
+      const TypedOperatorHandle<void(Args...)>& op,
+      const DispatchKeySet& dispatchKeySet,
+      Args&&... args) {
+    // Call the kernel; there is no return value to capture for void.
+    kernel.template call<void, Args...>(
+        op, dispatchKeySet, std::forward<Args>(args)...);
+  }
+  Stack getOutputs() {
+    return Stack();
+  }
+  void release() && {}
+};
+
+} // namespace detail
+
+// See [Note: Argument forwarding in the dispatcher] for why Args doesn't use &&
+template<class Return, class... Args>
+inline Return Dispatcher::callWithDispatchKeySlowPath(const TypedOperatorHandle<Return(Args...)>& op, at::StepCallbacks& stepCallbacks, DispatchKeySet dispatchKeySet, const KernelFunction& kernel, Args... args) {
+  // If callbacks need inputs, we box the arguments and pass them to the guard.
+  // Note: For perf reasons we wouldn't want to prematurely box the arguments.
+  at::RecordFunction guard(std::move(stepCallbacks));
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(op.operatorDef_->op.isObserved());
+  auto dispatchKey = dispatchKeySet.highestPriorityTypeId();
+  auto& schema = op.schema();
+  auto schema_ref = std::reference_wrapper(schema);
+  constexpr auto num_boxed_args = impl::boxed_size<Args...>();
+  if constexpr (num_boxed_args != 0) {
+    if (guard.needsInputs()) {
+      // If we used std::array here, we would
+      // have to spend time default constructing the IValues in
+      // boxedArgs. aligned_storage has no such requirement.
+      impl::IValueAlignedStorage boxedArgs[num_boxed_args];
+      // For debugging only; could be removed (but the compiler will do
+      // that for us and it's nice to have the extra assurance of
+      // correctness from our debug builds).
+      int lastArgIdx = 0;
+      impl::boxArgsToStack(boxedArgs, lastArgIdx, args...);
+      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(lastArgIdx == num_boxed_args);
+      // I don't *think* we need std::launder here, because IValue has
+      // no subclasses and no const or reference fields.
+      runRecordFunction(guard, schema_ref, dispatchKey, c10::ArrayRef<const c10::IValue>(reinterpret_cast<IValue*>(boxedArgs), num_boxed_args));
+      for (size_t ii = 0; ii < num_boxed_args; ++ii) {
+        reinterpret_cast<IValue*>(&boxedArgs[ii])->~IValue();
+      }
+    } else {
+      runRecordFunction(guard, schema_ref, dispatchKey);
+    }
+  } else {
+    runRecordFunction(guard, schema_ref, dispatchKey);
+  }
+
+  if (C10_UNLIKELY(guard.needsOutputs())) {
+    // Calls the kernel and captures the output temporarily to pass to
+    // RecordFunction.
+    detail::CaptureKernelCall<Return> captureKernelCall(
+        kernel, op, dispatchKeySet, std::forward<Args>(args)...);
+    guard.setOutputs(captureKernelCall.getOutputs());
+    // Releases the captured output to return to caller.
+    return std::move(captureKernelCall).release();
+  }
+
+  // keeping the guard alive while executing the kernel
+  return kernel.template call<Return, Args...>(op, dispatchKeySet, std::forward<Args>(args)...);
+}
+
+// See [Note: Argument forwarding in the dispatcher] for why Args doesn't use &&
+template<class Return, class... Args>
+C10_ALWAYS_INLINE_UNLESS_MOBILE Return Dispatcher::call(const TypedOperatorHandle<Return(Args...)>& op, Args... args) const {
+  detail::unused_arg_(args...);  // workaround for a false-positive warning about unused parameters in gcc 5
+  auto dispatchKeySet = op.operatorDef_->op.dispatchKeyExtractor()
+    .template getDispatchKeySetUnboxed<Args...>(args...);
+#ifndef NDEBUG
+  DispatchTraceNestingGuard debug_guard;
+  if (show_dispatch_trace()) {
+      auto nesting_value = dispatch_trace_nesting_value();
+      for (int64_t i = 0; i < nesting_value; ++i) std::cerr << " ";
+      std::cerr << "[call] op=[" << op.operator_name() << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << "]" << std::endl;
+  }
+#endif
+  const KernelFunction& kernel = op.operatorDef_->op.lookup(dispatchKeySet);
+#ifndef PYTORCH_DISABLE_PER_OP_PROFILING
+  auto step_callbacks = at::getStepCallbacksUnlessEmpty(at::RecordScope::FUNCTION);
+  if (C10_UNLIKELY(step_callbacks.has_value() && op.operatorDef_->op.isObserved())) {
+    return callWithDispatchKeySlowPath<Return, Args...>(op, *step_callbacks, dispatchKeySet, kernel, std::forward<Args>(args)...);
+  }
+#endif  // PYTORCH_DISABLE_PER_OP_PROFILING
+
+#ifdef FBCODE_CAFFE2
+  if(profilingOperatorEvents()) {
+    struct FireOpRAII {
+       FireOpRAII(at::RecordFunction::schema_ref_t schema_ref) : schema_ref_(schema_ref) {
+           fireOpStartUSDT(schema_ref);
+        }
+       ~FireOpRAII() { fireOpEndUSDT(schema_ref_); }
+       at::RecordFunction::schema_ref_t schema_ref_;
+    } event(op.schema());
+    return kernel.template call<Return, Args...>(op, dispatchKeySet, std::forward<Args>(args)...);
+  } else {
+    return kernel.template call<Return, Args...>(op, dispatchKeySet, std::forward<Args>(args)...);
+  }
+#else
+    return kernel.template call<Return, Args...>(op, dispatchKeySet, std::forward<Args>(args)...);
+#endif // FBCODE_CAFFE2
+}
+
+// See [Note: Argument forwarding in the dispatcher] for why Args doesn't use &&
+template<class Return, class... Args>
+inline Return Dispatcher::redispatch(const TypedOperatorHandle<Return(Args...)>& op, DispatchKeySet currentDispatchKeySet, Args... args) const {
+  detail::unused_arg_(args...);  // workaround for a false-positive warning about unused parameters in gcc 5
+  // do not use RecordFunction on redispatch
+#ifndef NDEBUG
+  DispatchTraceNestingGuard debug_guard;
+  if (show_dispatch_trace()) {
+      auto nesting_value = dispatch_trace_nesting_value();
+      for (int64_t i = 0; i < nesting_value; ++i) std::cerr << " ";
+      std::cerr << "[redispatch] op=[" << op.operator_name() << "], key=[" << toString(currentDispatchKeySet.highestPriorityTypeId()) << "]" << std::endl;
+  }
+#endif
+  const KernelFunction& kernel = op.operatorDef_->op.lookup(currentDispatchKeySet);
+  return kernel.template call<Return, Args...>(op, currentDispatchKeySet, std::forward<Args>(args)...);
+}
+
+inline void Dispatcher::callBoxed(const OperatorHandle& op, Stack* stack) const {
+  // note: this doesn't need the mutex because write operations on the list keep iterators intact.
+  const auto& entry = op.operatorDef_->op;
+  auto dispatchKeySet = entry.dispatchKeyExtractor().getDispatchKeySetBoxed(stack);
+#ifndef NDEBUG
+  DispatchTraceNestingGuard debug_guard;
+  if (show_dispatch_trace()) {
+      auto nesting_value = dispatch_trace_nesting_value();
+      for (int64_t i = 0; i < nesting_value; ++i) std::cerr << " ";
+      std::cerr << "[callBoxed] op=[" << op.operator_name() << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << "]" << std::endl;
+  }
+#endif
+  const auto& kernel = entry.lookup(dispatchKeySet);
+#ifndef PYTORCH_DISABLE_PER_OP_PROFILING
+  auto step_callbacks = at::getStepCallbacksUnlessEmpty(at::RecordScope::FUNCTION);
+  if (C10_UNLIKELY(step_callbacks.has_value() && entry.isObserved())) {
+    at::RecordFunction guard(std::move(*step_callbacks));
+    auto dispatchKey = dispatchKeySet.highestPriorityTypeId();
+    auto& schema = op.schema();
+    auto schema_ref = std::reference_wrapper(schema);
+    guard.needsInputs() ? runRecordFunction(guard, schema_ref, dispatchKey, c10::ArrayRef<const c10::IValue>(stack->data(), stack->size()))
+                        : runRecordFunction(guard, schema_ref, dispatchKey);
+
+    // keeping the guard alive while executing the kernel
+    kernel.callBoxed(op, dispatchKeySet, stack);
+
+    if (C10_UNLIKELY(guard.needsOutputs())) {
+      guard.setOutputs(*stack);
+    }
+    return;
+  }
+#endif  // PYTORCH_DISABLE_PER_OP_PROFILING
+  kernel.callBoxed(op, dispatchKeySet, stack);
+}
+
+// NB: this doesn't count as a "true" dispatcher jump, so no instrumentation
+inline void Dispatcher::callBoxedForDispatchKey(const OperatorHandle& op, DispatchKey dk, Stack* stack) const {
+  // note: this doesn't need the mutex because write operations on the list keep iterators intact.
+  const auto& entry = op.operatorDef_->op;
+  // We still compute this as we're obligated to pass it on to the internal
+  // kernel, if it is a boxed fallback
+  auto dispatchKeySet = entry.dispatchKeyExtractor().getDispatchKeySetBoxed(stack);
+  const auto& kernel = ([&]() {
+    if (op.hasKernelForDispatchKey(dk)) {
+      return entry.kernelForDispatchKey(dk);
+    } else {
+      auto idx = getDispatchTableIndexForDispatchKey(dk);
+      TORCH_INTERNAL_ASSERT(idx >= 0);
+      return backendFallbackKernels_[idx].kernel;
+    }
+  })();
+  kernel.callBoxed(op, dispatchKeySet, stack);
+}
+
+inline void Dispatcher::redispatchBoxed(const OperatorHandle& op, DispatchKeySet dispatchKeySet, Stack* stack) const {
+  // note: this doesn't need the mutex because write operations on the list keep iterators intact.
+  const auto& entry = op.operatorDef_->op;
+#ifndef NDEBUG
+  DispatchTraceNestingGuard debug_guard;
+  if (show_dispatch_trace()) {
+      auto nesting_value = dispatch_trace_nesting_value();
+      for (int64_t i = 0; i < nesting_value; ++i) std::cerr << " ";
+      std::cerr << "[redispatchBoxed] op=[" << op.operator_name() << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << "]" << std::endl;
+  }
+#endif
+  const auto& kernel = entry.lookup(dispatchKeySet);
+  return kernel.callBoxed(op, dispatchKeySet, stack);
+}
+
+} // namespace c10
+
+namespace std {
+
+template <>
+struct hash<c10::OperatorHandle> {
+  size_t operator()(const c10::OperatorHandle& op) const noexcept {
+    return std::hash<void*>{}(static_cast<void*>(op.operatorDef_));
+  }
+};
+
+} // namespace std
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/dispatch/ObservedOperators.h b/MLPY/Lib/site-packages/torch/include/ATen/core/dispatch/ObservedOperators.h
new file mode 100644
index 0000000000000000000000000000000000000000..ef2efd55af04ee5c9d2bb01683a31d4688435ccd
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/dispatch/ObservedOperators.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+namespace c10 {
+
+struct TORCH_API ObservedOperators {
+  ObservedOperators() = delete;
+
+  static bool isObserved(const OperatorName& name);
+
+  static std::unordered_set<std::string>& getUnobservedOperatorList();
+};
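+
+// Illustrative usage sketch (hypothetical call site): isObserved() is keyed
+// by OperatorName, so a typical query looks like
+//
+//   bool observed = c10::ObservedOperators::isObserved(
+//       c10::OperatorName("aten::add", "Tensor"));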
+
+} // namespace c10
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/dispatch/OperatorEntry.h b/MLPY/Lib/site-packages/torch/include/ATen/core/dispatch/OperatorEntry.h
new file mode 100644
index 0000000000000000000000000000000000000000..8ebdca6edee2b3534b0f3b8fbd6ddb7018c2b465
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/dispatch/OperatorEntry.h
@@ -0,0 +1,313 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+#ifdef C10_MOBILE
+#define C10_DISPATCHER_ONE_KERNEL_PER_DISPATCH_KEY
+#endif
+
+namespace c10 {
+
+class Dispatcher;
+
+namespace impl {
+
+// This data structure represents a kernel that was registered to us from a
+// user.  Unlike KernelFunction, AnnotatedKernel contains some extra metadata
+// about the kernel that isn't necessary for actual dispatching (this is why
+// we don't put AnnotatedKernel in the actual DispatchTable), but is useful for
+// giving good error messages.
+struct AnnotatedKernel final {
+  AnnotatedKernel(KernelFunction k, std::unique_ptr<FunctionSchema> s, std::string d)
+    : kernel(std::move(k))
+    , inferred_function_schema(std::move(s))
+    , debug(std::move(d))
+    {}
+  AnnotatedKernel() = default;
+  KernelFunction kernel;
+  std::unique_ptr<FunctionSchema> inferred_function_schema;
+  // A little debug string to help us identify the kernel in question.
+  // Most importantly it records the TORCH_LIBRARY block that did the
+  // registration.
+  std::string debug;
+};
+
+// This data structure represents operator schema, with metadata specifying
+// where the registration of this schema occurred
+struct AnnotatedSchema final {
+  AnnotatedSchema(FunctionSchema s, std::string d)
+    : schema(std::move(s))
+    , debug(std::move(d))
+    {}
+  FunctionSchema schema;
+  std::string debug;
+};
+
+// Internal data structure that records information about a specific operator.
+// It's not part of the public API; typically, users will interact with
+// OperatorHandle instead.
+//
+// Concurrent writes to OperatorEntry are protected by the GLOBAL Dispatcher
+// lock (this is important because some methods in OperatorEntry access
+// dispatcher state)
+class TORCH_API OperatorEntry final {
+public:
+  explicit OperatorEntry(OperatorName&& operator_name);
+
+  OperatorEntry(const OperatorEntry&) = delete;
+  OperatorEntry(OperatorEntry&&) noexcept = delete;
+  OperatorEntry& operator=(const OperatorEntry&) = delete;
+  OperatorEntry& operator=(OperatorEntry&&) noexcept = delete;
+
+  const FunctionSchema& schema() const {
+    TORCH_INTERNAL_ASSERT(schema_.has_value(), "Tried to access the schema for ", name_, " which doesn't have a schema registered yet");
+    return schema_->schema;
+  }
+  const std::string& debug() const {
+    TORCH_INTERNAL_ASSERT(schema_.has_value());
+    return schema_->debug;
+  }
+  bool hasSchema() const {
+    return schema_.has_value();
+  }
+
+  bool isObserved() const {
+    return is_observed_;
+  }
+
+  // We may allocate an OperatorEntry for an operator even when we don't
+  // have a schema.  When we receive the schema registration, we post
+  // facto register a schema.
+  //
+  // NB: registerSchema/deregisterSchema are not idempotent; if you
+  // attempt to register a schema when one is already present or vice
+  // versa that is an error.  (Refcounting for the registrations is
+  // handled in the OperatorHandle in Dispatcher)
+  void registerSchema(FunctionSchema&&, std::string&& debug, std::vector<at::Tag> tags = {});
+  void deregisterSchema();
+
+  const OperatorName& operator_name() const {
+    return name_;
+  }
+
+#ifdef C10_DISPATCHER_ONE_KERNEL_PER_DISPATCH_KEY
+  using AnnotatedKernelContainer = std::array<AnnotatedKernel, 1>;
+#else
+  using AnnotatedKernelContainer = std::list<AnnotatedKernel>;
+#endif
+  using AnnotatedKernelContainerIterator = AnnotatedKernelContainer::iterator;
+
+  // Why are kernels and fallback asymmetric?  It has to do with ownership.
+  // Kernels and the computed dispatch tables for them are canonically
+  // owned by OperatorEntry, but backend fallbacks are specified once
+  // and apply for all operators, so they should be owned by Dispatcher.
+  // However, the registration of a backend fallback affects the
+  // state of the computed dispatch table, so when a backend fallback
+  // is updated, we need to update the operator tables too.  Thus,
+  // registerKernel is the mechanism by which we give kernels to
+  // operator entry to own (and update dispatch table), but we only
+  // need a non-owning mechanism to update fallback.
+
+  // Precondition: Dispatcher::mutex_ is held
+  // Postcondition: caller is responsible for disposing of the kernel
+  AnnotatedKernelContainerIterator registerKernel(
+    const Dispatcher& dispatcher,
+    c10::optional<DispatchKey> dispatch_key,
+    KernelFunction kernel,
+    c10::optional<CppSignature> cpp_signature,
+    std::unique_ptr<FunctionSchema> inferred_function_schema,
+    std::string debug
+  );
+
+  // Precondition: Dispatcher::mutex_ is held
+  void deregisterKernel_(
+    const Dispatcher& dispatcher,
+    c10::optional<DispatchKey> dispatch_key,
+    AnnotatedKernelContainerIterator kernel
+  );
+
+  // Precondition: Dispatcher::mutex_ is held
+  void updateFallback(
+    const Dispatcher& dispatcher,
+    DispatchKey dispatch_key
+  );
+
+  // Precondition: Dispatcher::mutex_ is held
+  void updateSchemaAliasAnalysis(AliasAnalysisKind a) {
+    TORCH_INTERNAL_ASSERT(schema_.has_value());
+    schema_->schema.setAliasAnalysis(a);
+  }
+
+  std::string dumpComputedTable() const;
+  std::string dumpState() const;
+  void checkInvariants() const;
+
+  const DispatchKeyExtractor& dispatchKeyExtractor() const { return dispatchKeyExtractor_; }
+
+  // Asserts that the given FuncType is correct for calling this operator in an unboxed way.
+  template<class FuncType>
+  inline void assertSignatureIsCorrect() {
+    assertSignatureIsCorrect(CppSignature::make<FuncType>(), fn_has_symint<FuncType>::value);
+  }
+
+  void assertSignatureIsCorrect(const CppSignature& call_signature, bool has_symint) const;
+
+  [[noreturn]] void reportError(DispatchKey dispatchKey) const;
+
+  const KernelFunction& lookup(DispatchKeySet ks) const {
+    const auto idx = ks.getDispatchTableIndexForDispatchKeySet();
+    if (C10_UNLIKELY(idx == -1)) {
+      reportError(ks.highestPriorityTypeId());
+    }
+    const auto& kernel = dispatchTable_[idx];
+    // A valid kernel *always* has a boxed kernel and *may* have an
+    // unboxed kernel. However, we typically do unboxed calls in at::
+    // APIs, where the kernel 1) will very likely be valid and 2)
+    // should have an unboxed kernel. Checking the unboxed kernel
+    // first will allow us to avoid touching the boxed kernel at all
+    // in the common case.
+    if (C10_UNLIKELY(!kernel.isValidUnboxed())) {
+      if (!kernel.isValid()) {
+        reportError(ks.highestPriorityTypeId());
+      }
+    }
+    return kernel;
+  }
+
+  std::string listAllDispatchKeys() const;
+
+  // Returns true if kernel_ has entry for any key in ks.
+  //
+  // Invariant: There are no alias keys in the passed-in dispatch key set.
+  // Note [No Alias Keys in DispatchKeySet]
+  // Alias keys should be checked using `hasKernelForDispatchKey`
+  // Alias keys shouldn't go inside of a DispatchKeySet, since they can technically
+  // have a value > 63 (causing overflow).
+  bool hasKernelForAnyDispatchKey(DispatchKeySet ks) const;
+  // Returns true if kernel_ has entry for a particular key.
+  bool hasKernelForDispatchKey(DispatchKey k) const;
+  // Retrieves the kernel entry at a particular key.  Symmetric with
+  // hasKernelForDispatchKey.  To get the AnnotatedKernel, see
+  // getKernelForDispatchKey (private)
+  const KernelFunction& kernelForDispatchKey(DispatchKey k) const;
+  // Returns true if the "computed table" has an entry for a particular key.
+  bool hasComputedKernelForDispatchKey(DispatchKey k) const;
+  // Returns all the operator tags added at the time of registration
+  const std::vector<at::Tag>& getTags() const;
+  void setReportErrorCallback_(std::unique_ptr<c10::SafePyObject> callback);
+
+  template <typename F>
+  PyObject* getPythonOp(PyInterpreter* self_interpreter, F slow_accessor) const {
+    return py_cache_.ptr_or(self_interpreter, slow_accessor);
+  }
+
+private:
+
+  OperatorName name_;
+  c10::optional<AnnotatedSchema> schema_;
+  #ifndef C10_MOBILE
+    std::vector<at::Tag> tags_;
+  #endif
+  std::array<KernelFunction, c10::num_runtime_entries> dispatchTable_;
+  DispatchKeyExtractor dispatchKeyExtractor_;
+  // Pointer to the torch.ops.ns.op.overload object for speed
+  c10::PyHandleCache py_cache_;
+
+  // kernels_ stores all registered kernels for the corresponding dispatch key
+  // and catchAllKernels_ stores the catch-all kernels.
+  // If an operator library gets loaded that overwrites an already existing kernel,
+  // both kernels will be in that list but only the newer one will be in
+  // dispatchTable. If any of the kernels go away (say the library gets
+  // unloaded), we remove the kernel from this list and update the
+  // dispatchTable if necessary.
+  // Kernels in the list are ordered by registration time descendingly,
+  // newer registrations are before older registrations.
+  // We do not combine dispatchTable and kernels into one hash map because
+  // kernels is a larger data structure and accessed quite infrequently
+  // while dispatchTable is accessed often and should be kept small to fit
+  // into CPU caches.
+  // Invariants:
+  //  - dispatchTable[dispatch_key] == kernels_[dispatch_key].front()
+  //  - dispatchTable[dispatch_key] does not exist if and only if
+  //    kernels_[dispatch_key] does not exist
+  //  - If kernels_[dispatch_key] exists, then it has elements.
+  //    It is never an empty list.
+  //
+  // Why do we do that?
+  // -----
+  // We mostly do this to enable Jupyter notebooks where a cell registering
+  // a kernel could be executed multiple times and the later execution
+  // should overwrite the earlier one. Note that this still fails when the
+  // function schema changed between the executions, but it works as long
+  // as the function schema didn't change. A better solution would be to
+  // unload the old extension library from the Jupyter cell when the cell is
+  // re-executed and then only allow one kernel here, i.e. error if a kernel
+  // is already registered, but that's a lot of effort to implement and
+  // currently not high-pri.
+  ska::flat_hash_map<DispatchKey,
+#ifdef C10_DISPATCHER_ONE_KERNEL_PER_DISPATCH_KEY
+                     std::array<AnnotatedKernel, 1>
+#else
+                     std::list<AnnotatedKernel>
+#endif
+                     > kernels_;
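+
+  // For illustration (hypothetical state, not real code): after two libraries
+  // have registered a CPU kernel for this operator,
+  //
+  //   kernels_[DispatchKey::CPU]  == { newer_kernel, older_kernel }
+  //   dispatchTable_[CPU index]   == newer_kernel.kernel
+  //
+  // and removing newer_kernel pops it from the list and restores
+  // older_kernel.kernel into dispatchTable_.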
+
+  const AnnotatedKernel& missingKernel() const;
+  const AnnotatedKernel& ambiguousAutogradOtherKernel() const;
+
+  // cpp_signature_ stores function signature if any of
+  // the kernels was created in a way that allowed us to know the function
+  // signature (i.e. by supplying an unboxed C++ kernel function).
+  // If this is set, it will be used to check that future kernel
+  // registrations match and it will be used in unboxed function calls
+  // to verify their arguments against the known function signature.
+  struct CppSignatureWithDebug {
+    CppSignature signature;
+    std::string debug;
+    c10::optional<DispatchKey> dispatch_key;
+  };
+  c10::optional<CppSignatureWithDebug> cpp_signature_;
+  c10::optional<CppSignatureWithDebug> sym_cpp_signature_;
+
+  // A Python custom error handler for OperatorEntry::reportError
+  std::unique_ptr<c10::SafePyObject> report_error_callback_;
+
+  // Whether this operator needs to be observed with RecordFunction
+  const bool is_observed_;
+
+  [[noreturn]] void reportSignatureError(const CppSignature& call_signature, const CppSignatureWithDebug& saved_signature) const;
+  const KernelFunction& computeDispatchTableEntry(const c10::Dispatcher& dispatcher, DispatchKey dispatch_key) const;
+  std::pair<const AnnotatedKernel&, const char*> computeDispatchTableEntryWithDebug(
+    const c10::Dispatcher& dispatcher, DispatchKey dispatch_key
+  ) const;
+  // This function re-establishes the invariant that dispatchTable
+  // contains the front element from the kernels list for a given runtime dispatch key.
+  void updateDispatchTableEntry_(const c10::Dispatcher& dispatcher, DispatchKey dispatch_key);
+  // Like above, but also handles alias dispatch keys.
+  void updateDispatchTable_(const c10::Dispatcher& dispatcher, DispatchKey dispatch_key);
+  // Like above, but for ALL entries in the dispatch table.
+  void updateDispatchTableFull_(const c10::Dispatcher& dispatcher);
+  // Retrieves a pointer to AnnotatedKernel at kernels_.at(dispatch_key).front().
+  const AnnotatedKernel* getKernelForDispatchKey(DispatchKey dispatch_key) const;
+};
+
+} // namespace impl
+} // namespace c10
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/dispatch/OperatorOptions.h b/MLPY/Lib/site-packages/torch/include/ATen/core/dispatch/OperatorOptions.h
new file mode 100644
index 0000000000000000000000000000000000000000..d542bc942279d06f791306177e7a4462cd917caf
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/dispatch/OperatorOptions.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include 
+
+namespace c10 {
+
+enum class AliasAnalysisKind : uint8_t {
+  INTERNAL_SPECIAL_CASE,
+  CONSERVATIVE, // The most conservative alias analysis type, assumes
+                // side-effects. This is the default analysis.
+  FROM_SCHEMA,
+  PURE_FUNCTION
+};
+
+#if !defined(_MSC_VER)
+constexpr // Our current MSVC version has a bug that doesn't allow this to be constexpr.
+#endif
+inline const char* toString(AliasAnalysisKind aliasAnalysisKind) {
+  return (aliasAnalysisKind == AliasAnalysisKind::CONSERVATIVE)
+      ? "CONSERVATIVE"
+      : (aliasAnalysisKind == AliasAnalysisKind::FROM_SCHEMA)
+          ? "FROM_SCHEMA"
+          : (aliasAnalysisKind == AliasAnalysisKind::PURE_FUNCTION)
+              ? "PURE_FUNCTION"
+              : (aliasAnalysisKind == AliasAnalysisKind::INTERNAL_SPECIAL_CASE)
+                  ? "INTERNAL_SPECIAL_CASE"
+                  : "UNKNOWN";
+}
+
+} // namespace c10
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/dispatch/RegistrationHandleRAII.h b/MLPY/Lib/site-packages/torch/include/ATen/core/dispatch/RegistrationHandleRAII.h
new file mode 100644
index 0000000000000000000000000000000000000000..a26f491a0ce9c84b0e1d5d8ef3ead0d0592d4b31
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/dispatch/RegistrationHandleRAII.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include <functional>
+
+namespace c10 {
+
+class RegistrationHandleRAII final {
+public:
+  explicit RegistrationHandleRAII(std::function<void()> onDestruction)
+      : onDestruction_(std::move(onDestruction)) {}
+
+  ~RegistrationHandleRAII() {
+    if (onDestruction_) {
+      onDestruction_();
+    }
+  }
+
+  RegistrationHandleRAII(const RegistrationHandleRAII&) = delete;
+  RegistrationHandleRAII& operator=(const RegistrationHandleRAII&) = delete;
+
+  RegistrationHandleRAII(RegistrationHandleRAII&& rhs) noexcept
+      : onDestruction_(std::move(rhs.onDestruction_)) {
+    rhs.onDestruction_ = nullptr;
+  }
+
+  RegistrationHandleRAII& operator=(RegistrationHandleRAII&& rhs) noexcept {
+    onDestruction_ = std::move(rhs.onDestruction_);
+    rhs.onDestruction_ = nullptr;
+    return *this;
+  }
+
+private:
+  std::function<void()> onDestruction_;
+};
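+
+// Illustrative sketch (hypothetical callback): registration APIs hand back
+// this RAII object so that dropping the handle undoes the registration.
+//
+//   {
+//     c10::RegistrationHandleRAII handle(
+//         [] { /* deregister the kernel/schema here */ });
+//     // ... the registration stays alive within this scope ...
+//   } // handle destroyed -> onDestruction_() runs and deregisters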
+
+}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/dynamic_type.h b/MLPY/Lib/site-packages/torch/include/ATen/core/dynamic_type.h
new file mode 100644
index 0000000000000000000000000000000000000000..4b2e3970670c20c3dd2e8f3246cb59998451e313
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/dynamic_type.h
@@ -0,0 +1,239 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+namespace c10 {
+
+using DynamicTypeBits = std::uint32_t;
+#define DYNAMIC_TYPE_BIT(x) (1u << x)
+
+constexpr DynamicTypeBits kDynamicCovariantTypeBit = DYNAMIC_TYPE_BIT(31);
+constexpr DynamicTypeBits kDynamicAnyTypeBit = DYNAMIC_TYPE_BIT(30);
+
+constexpr DynamicTypeBits kDynamicNoneTypeBit = DYNAMIC_TYPE_BIT(1);
+constexpr DynamicTypeBits kDynamicIntTypeBit = DYNAMIC_TYPE_BIT(3);
+constexpr DynamicTypeBits kDynamicFloatTypeBit = DYNAMIC_TYPE_BIT(4);
+constexpr DynamicTypeBits kDynamicComplexTypeBit = DYNAMIC_TYPE_BIT(5);
+constexpr DynamicTypeBits kDynamicListTypeBit = DYNAMIC_TYPE_BIT(7);
+constexpr DynamicTypeBits kDynamicTupleTypeBit = DYNAMIC_TYPE_BIT(8);
+constexpr DynamicTypeBits kDynamicClassTypeBit = DYNAMIC_TYPE_BIT(10);
+
+#define FORALL_DYNAMIC_TYPES(_)                                              \
+  _(Tensor, DYNAMIC_TYPE_BIT(0), 1)                                          \
+  _(None, kDynamicNoneTypeBit, 1)                                            \
+  _(Bool, DYNAMIC_TYPE_BIT(2), 1)                                            \
+  _(Int, kDynamicIntTypeBit, 1)                                              \
+  _(Float, kDynamicFloatTypeBit, 1)                                          \
+  _(Complex, kDynamicComplexTypeBit, 1)                                      \
+  _(Number,                                                                  \
+    (kDynamicIntTypeBit | kDynamicFloatTypeBit | kDynamicComplexTypeBit),    \
+    1)                                                                       \
+  _(String, DYNAMIC_TYPE_BIT(6), 1)                                          \
+  _(List, kDynamicListTypeBit, 0)                                            \
+  _(Tuple, (kDynamicTupleTypeBit | kDynamicCovariantTypeBit), 0)             \
+  _(Dict, DYNAMIC_TYPE_BIT(9), 0)                                            \
+  _(Class, kDynamicClassTypeBit, 0)                                          \
+  _(Optional,                                                                \
+    (DYNAMIC_TYPE_BIT(11) | kDynamicNoneTypeBit | kDynamicCovariantTypeBit), \
+    0)                                                                       \
+  _(AnyList, (kDynamicListTypeBit | kDynamicAnyTypeBit), 1)                  \
+  _(AnyTuple,                                                                \
+    (kDynamicTupleTypeBit | kDynamicCovariantTypeBit | kDynamicAnyTypeBit),  \
+    1)                                                                       \
+  _(DeviceObj, DYNAMIC_TYPE_BIT(12), 1)                                      \
+  _(StreamObj, DYNAMIC_TYPE_BIT(13), 1)                                      \
+  _(Capsule, DYNAMIC_TYPE_BIT(14), 1)                                        \
+  _(Generator, DYNAMIC_TYPE_BIT(15), 1)                                      \
+  _(Storage, DYNAMIC_TYPE_BIT(16), 1)                                        \
+  _(Var, DYNAMIC_TYPE_BIT(17), 0)                                            \
+  _(AnyClass, (kDynamicClassTypeBit | kDynamicAnyTypeBit), 1)                \
+  _(QScheme, DYNAMIC_TYPE_BIT(18), 1)                                        \
+  _(Quantizer, DYNAMIC_TYPE_BIT(19), 1)                                      \
+  _(AnyEnum, DYNAMIC_TYPE_BIT(20), 1)                                        \
+  _(RRef, DYNAMIC_TYPE_BIT(21), 0)                                           \
+  _(Future, DYNAMIC_TYPE_BIT(22), 0)                                         \
+  _(Await, DYNAMIC_TYPE_BIT(23), 0)                                          \
+  _(Any, 0xffffffff, 1)
+
+#define FORALL_DYNAMIC_TYPES_FAKE(_) \
+  _(ScalarType, kDynamicIntTypeBit, 1)                                \
+  _(Layout, kDynamicIntTypeBit, 1)                                        \
+  _(SymInt, kDynamicIntTypeBit, 1)                                        \
+  _(MemoryFormat, kDynamicIntTypeBit, 1)
+
+#define FORWARD_DECL_TYPE(NAME, _, __) struct NAME ## Type;
+  FORALL_DYNAMIC_TYPES(FORWARD_DECL_TYPE)
+  FORALL_DYNAMIC_TYPES_FAKE(FORWARD_DECL_TYPE)
+#undef FORWARD_DECL_TYPE
+
+class DynamicType;
+using DynamicTypePtr = std::shared_ptr<DynamicType>;
+
+/**
+ * DynamicType is designed as a low dependency type system for TorchScript. The
+ * existing JIT types are used for both compilation and runtime, which makes
+ * sense for server contexts because we often compile and run the model in
+ * the same process. However, this doesn't hold for mobile devices, where we
+ * always compile a model ahead of time, so dependencies that are not actually
+ * needed still get built into the mobile runtime and bloat the binary size by
+ * design. Every basic type like Int, Bool or String will bring their
+ * vtable, typeinfo, constructor, destructor and even more data from their
+ * specializations for STL types to the binary causing a long tail bloat.
+ *
+ * The core problem is about the complexity to implement and maintain a single
+ * type system for both analysis and execution purposes. Although they should
+ * have exactly the same semantics, in practice implementing a unified abstraction
+ * adds conceptual and representational overhead for both sides of the world.
+ *
+ * To address the issues, DynamicType implements a minimal subset of JIT types
+ * and uses a generic algorithm to test all subtyping relations. To achieve
+ * this, we assign each dynamic type a single integer tag to represent its
+ * semantics. More specifically, a dynamic type is defined as a set of "control
+ * bits" and "data bits", where control bits describe the special behavior when
+ * testing a type and data bits map to identity of each nominal type. We use bit
+ * operations to perform all the tests.
+ *
+ * For example, a "covariant bit" is a control bit used to describe if a type
+ * is covariant, right now the most used one is tuple type, and in addition to
+ * the control bit, tuple type's data bit is the 8th bit from the LSB. Control
+ * bits start from MSB and data bits start from LSB.
+ *
+ * If two types are equal, then they are subtype of each other, also if the bits
+ * from one type tag is subset of the other tag, it automatically becomes a
+ * subtype of the other. This simplifies the subtyping logic a lot, and over the
+ * long term it is possible to adopt this scheme on the server side as well.
+ * Special cases can be added but they generally should not take too much code
+ * size.
+ *
+ * DynamicType may or may not inherit from c10::Type because it's not the core
+ * requirement of DynamicType to interface with existing JIT types, but we might
+ * want to inherit from c10::Type to reduce the migration cost.
+ */
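+// Illustrative example of the bit scheme described above (values taken from
+// the FORALL_DYNAMIC_TYPES table in this header): Number is the union of the
+// Int, Float and Complex data bits, so a subset test on the tags is enough to
+// conclude that Int is a subtype of Number.
+//
+//   constexpr auto kInt    = kDynamicIntTypeBit;
+//   constexpr auto kNumber =
+//       kDynamicIntTypeBit | kDynamicFloatTypeBit | kDynamicComplexTypeBit;
+//   static_assert((kInt & kNumber) == kInt,
+//                 "Int's bits are a subset of Number's, so Int <: Number");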
+class DynamicType : public SharedType {
+  using ClassTypePtr = std::shared_ptr<ClassType>;
+
+  /**
+   * An implementation detail to support NamedTuple.
+   */
+  struct LabeledDynamicType {
+    c10::optional<std::string> label;
+    DynamicTypePtr ty;
+    explicit LabeledDynamicType(DynamicTypePtr t) : ty(std::move(t)) {}
+
+    bool equals(const LabeledDynamicType& other) const;
+    bool isSubtypeOf(const LabeledDynamicType& other) const;
+  };
+
+ public:
+  // TODO Change Ptr to DynamicTypePtr when all migrations are done.
+  using Ptr = TypePtr;
+  using ElementType = DynamicType;
+  ~DynamicType() override;
+
+  struct Arguments {
+    Arguments() = default;
+    Arguments(c10::ArrayRef<TypePtr>);
+    Arguments(const std::vector<c10::string_view>&, c10::ArrayRef<TypePtr>);
+    std::vector<LabeledDynamicType> elems;
+  };
+
+  enum class Tag : DynamicTypeBits {
+#define DYNAMIC_TYPE_ITEM(NAME, VAL, _) NAME = VAL,
+    FORALL_DYNAMIC_TYPES(DYNAMIC_TYPE_ITEM)
+    FORALL_DYNAMIC_TYPES_FAKE(DYNAMIC_TYPE_ITEM)
+#undef DYNAMIC_TYPE_ITEM
+  };
+
+  bool equals(const Type& rhs) const override;
+  bool isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const override;
+  std::string str() const override;
+  static const TypeKind Kind = TypeKind::DynamicType;
+  static TORCH_API DynamicTypePtr create(Type& ty);
+
+  explicit DynamicType(Tag, Arguments);
+  explicit DynamicType(Tag, c10::string_view, Arguments);
+
+  TypePtr containedType(size_t) const override;
+  size_t containedTypeSize() const override;
+  Tag tag() const {
+    return tag_;
+  }
+  const c10::optional<std::string>& name() const {
+    return name_;
+  }
+  const Arguments& arguments() const {
+    return arguments_;
+  }
+  TORCH_API TypeKind dynamicKind() const;
+
+  // Should be used only on the server side to restore static type information.
+#ifndef C10_MOBILE
+  TORCH_API
+#endif
+  TypePtr fallback() const;
+
+ private:
+  bool symmetric() const override {
+    return false;
+  }
+  friend struct Type;
+  static std::shared_ptr create(const Type& ty);
+  DynamicType(const Type& other);
+  bool equals(const DynamicType& other) const;
+
+  template <typename F>
+  bool compareArguments(const DynamicType& other, F&& f) const {
+    if (arguments_.elems.size() != other.arguments_.elems.size()) {
+      return false;
+    }
+    for (size_t i = 0; i < arguments_.elems.size(); i++) {
+      if (!f(arguments_.elems[i], other.arguments_.elems[i])) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  Tag tag_;
+  c10::optional<std::string> name_;
+  union {
+    Arguments arguments_;
+    ClassTypePtr class_;
+  };
+};
+
+template <typename T>
+struct DynamicTypeTrait {
+  C10_NOINLINE static auto tagValue() {
+    TORCH_CHECK(false);
+    return DynamicType::Tag::Any;
+  }
+};
+
+namespace detail {
+C10_NOINLINE DynamicTypePtr makeBaseType(DynamicType::Tag tag);
+}
+
+#define DYNAMIC_TYPE_TAG_VALUE(NAME, _, IS_BASE_TYPE)      \
+  template <>                                              \
+  struct TORCH_API DynamicTypeTrait<NAME##Type> {        \
+    C10_ERASE static auto tagValue() {                     \
+      return DynamicType::Tag::NAME;                       \
+    }                                                      \
+    static constexpr bool isBaseType = IS_BASE_TYPE;       \
+    template <typename T = const DynamicTypePtr&>         \
+    static std::enable_if_t<isBaseType, T> getBaseType() { \
+      static auto type = detail::makeBaseType(tagValue()); \
+      return type;                                         \
+    }                                                      \
+  }; // namespace c10
+FORALL_DYNAMIC_TYPES(DYNAMIC_TYPE_TAG_VALUE)
+FORALL_DYNAMIC_TYPES_FAKE(DYNAMIC_TYPE_TAG_VALUE)
+#undef DYNAMIC_TYPE_TAG_VALUE
+
+} // namespace c10
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/enum_tag.h b/MLPY/Lib/site-packages/torch/include/ATen/core/enum_tag.h
new file mode 100644
index 0000000000000000000000000000000000000000..0e5448211db5a6a9215f0fd794f07c6b61771e87
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/enum_tag.h
@@ -0,0 +1,20 @@
+#pragma once
+
+// @generated by torchgen/gen.py from enum_tag.h
+
+namespace at {
+    // Enum of valid tags obtained from the entries in tags.yaml
+    enum class Tag {
+        core,
+        data_dependent_output,
+        dynamic_output_shape,
+        generated,
+        inplace_view,
+        needs_fixed_stride_order,
+        nondeterministic_bitwise,
+        nondeterministic_seeded,
+        pointwise,
+        pt2_compliant_tag,
+        view_copy
+    };
+}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/enum_type.h b/MLPY/Lib/site-packages/torch/include/ATen/core/enum_type.h
new file mode 100644
index 0000000000000000000000000000000000000000..3cd67fd89778fa2df32a9b0e8585c3337419c3cb
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/enum_type.h
@@ -0,0 +1,101 @@
+#pragma once
+
+#include 
+
+#include 
+
+namespace c10 {
+
+struct EnumType;
+using EnumTypePtr = std::shared_ptr<EnumType>;
+using EnumNameValue = std::pair<std::string, IValue>;
+struct TORCH_API EnumType : public NamedType {
+  friend struct Type;
+  static const TypeKind Kind = TypeKind::EnumType;
+
+  static EnumTypePtr create(
+      const c10::QualifiedName& qualified_class_name,
+      TypePtr value,
+      std::vector<EnumNameValue> enum_names_values,
+      std::weak_ptr<::torch::jit::CompilationUnit> cu) {
+    switch (value->kind()) {
+      case TypeKind::IntType:
+      case TypeKind::FloatType:
+      case TypeKind::StringType:
+        return EnumTypePtr(new EnumType(
+            qualified_class_name,
+            std::move(value),
+            std::move(enum_names_values),
+            std::move(cu)));
+      default:
+        AT_ERROR(
+            "Cannot create Enum with value type '",
+            value->str(),
+            "', only int, float and string are supported");
+    }
+  }
+
+  std::string str() const override {
+    return "Enum<" + annotation_str() + ">";
+  }
+
+  std::string repr_str() const override {
+    return str();
+  }
+
+  const TypePtr& getValueType() const {
+    return value_type_;
+  }
+
+  bool equals(const Type& rhs) const override {
+    if (auto* enum_rhs = rhs.castRaw<EnumType>()) {
+      return name().value() == enum_rhs->name().value() &&
+          *getValueType() == *(enum_rhs->getValueType()) &&
+          this->compilation_unit() == enum_rhs->compilation_unit();
+    }
+    return false;
+  }
+
+  bool isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const override;
+
+  std::shared_ptr<const ::torch::jit::CompilationUnit> compilation_unit()
+      const {
+    auto cu = cu_.lock();
+    return cu;
+  }
+
+  const QualifiedName& qualifiedClassName() const {
+    return name().value();
+  }
+
+  at::ArrayRef<TypePtr> containedTypes() const override {
+    return value_type_;
+  }
+
+  const at::ArrayRef<EnumNameValue> enumNamesValues() const {
+    return enum_names_values_;
+  }
+
+ private:
+  EnumType(
+      c10::QualifiedName qualified_class_name,
+      TypePtr value_type,
+      std::vector<EnumNameValue> enum_names_values,
+      std::weak_ptr<::torch::jit::CompilationUnit> cu)
+      : NamedType(TypeKind::EnumType, std::move(qualified_class_name)),
+        value_type_(std::move(value_type)),
+        enum_names_values_(std::move(enum_names_values)),
+        cu_(std::move(cu)) {}
+
+  std::string annotation_str_impl(
+      C10_UNUSED TypePrinter printer = nullptr) const override {
+    const auto& n = name().value();
+    return n.qualifiedName();
+  }
+
+  TypePtr value_type_;
+  std::vector<EnumNameValue> enum_names_values_;
+  std::weak_ptr<::torch::jit::CompilationUnit> cu_;
+};
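+
+// Illustrative sketch (hypothetical names): creating an enum type whose values
+// are strings. create() rejects any value type other than int, float, or string.
+//
+//   auto color = c10::EnumType::create(
+//       c10::QualifiedName("mymodule.Color"),
+//       c10::StringType::get(),
+//       {{"RED", c10::IValue("red")}, {"GREEN", c10::IValue("green")}},
+//       std::weak_ptr<torch::jit::CompilationUnit>());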
+
+} // namespace c10
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/function.h b/MLPY/Lib/site-packages/torch/include/ATen/core/function.h
new file mode 100644
index 0000000000000000000000000000000000000000..ef64da980b5c6f855a06cb2f5b12bae14da17da6
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/function.h
@@ -0,0 +1,111 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace c10 {
+struct FunctionSchema;
+};
+
+namespace at {
+TORCH_API void launch(std::function<void()> func);
+}
+
+namespace torch {
+namespace jit {
+
+struct Graph;
+struct Code;
+
+namespace mobile {
+struct Code;
+}
+
+using Stack = std::vector<at::IValue>;
+using Kwargs = std::unordered_map<std::string, at::IValue>;
+struct RecursiveMethodCallError : public std::exception {};
+using TaskLauncher = std::function<void(std::function<void()>)>;
+
+TORCH_API void preoptimizeGraph(std::shared_ptr<Graph>& graph, bool disable_autocast=false);
+
+// A Function is a pure Graph with no implicit `self` object bound.
+// It contains schema information and the executor that manages the
+// execution of the function. Method is a wrapper around an
+// underlying Function that also provides a `self` object.
+struct TORCH_API Function {
+  Function() = default;
+  Function(const Function&) = default;
+  Function& operator=(const Function&) = default;
+  Function(Function&&) noexcept = default;
+  Function& operator=(Function&&) noexcept = default;
+  virtual c10::string_view doc_string() const {
+    static constexpr c10::string_view no_doc_string = "";
+    return no_doc_string;
+  }
+
+  virtual bool isGraphFunction() const {
+    return false;
+  }
+
+  virtual void run(Stack& stack) = 0;
+
+  virtual c10::intrusive_ptr<c10::ivalue::Future> runAsync(
+      Stack& /*stack*/,
+      C10_UNUSED TaskLauncher taskLauncher = at::launch) {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false);
+    return {};
+  }
+
+  at::IValue operator()(
+    Stack stack,
+    const Kwargs& kwargs = Kwargs()) {
+    getSchema().checkAndNormalizeInputs(stack, kwargs);
+    run(stack);
+    return stack.front();
+  }
+
+  virtual const c10::QualifiedName& qualname() const = 0;
+
+  const std::string& name() const {
+    return qualname().name();
+  }
+
+  // if this isn't yet defined, run its method_creator function
+  virtual void ensure_defined() = 0;
+
+  virtual const c10::FunctionSchema& getSchema() const = 0;
+
+  virtual size_t num_inputs() const = 0;
+
+  virtual Function& setSchema(c10::FunctionSchema schema) = 0;
+
+  // call() defines how different interpreter implementations interacts with
+  // Function objects. Basically interpreters need to provide a callback to
+  // communicate to Functions what to do if provided a Code object.
+  // Alternatively we could design the signature to return an optional Code
+  // object, but that requires special handling of the null case in the
+  // interpreter, and the fallback behavior is defined by the Function itself
+  // rather than by the interpreter, so a callback approach is more reasonable than
+  // returning values.
+  // If call() returns true, then callback completes successfully, otherwise
+  // call() returns false.
+
+  // Overload for server interpreter, a bailout size is needed for graph executor.
+  virtual bool call(Stack&, c10::optional<size_t>, c10::function_ref<void(const Code&)>) {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false);
+    return false;
+  }
+
+  // Overload for mobile interpreter.
+  virtual bool call(Stack&, c10::function_ref<void(const mobile::Code&)>) {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false);
+    return false;
+  }
+
+  virtual ~Function() = default;
+};
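+
+// Illustrative sketch (hypothetical function object): operator() checks and
+// normalizes the inputs against the schema, runs the function, and returns the
+// first stack entry. Assuming `fn` refers to a torch::jit::Function whose
+// schema takes two ints:
+//
+//   torch::jit::Stack stack{at::IValue(2), at::IValue(3)};
+//   at::IValue result = fn(stack);   // calls run(stack) after schema checks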
+} // namespace jit
+} // namespace torch
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/function_schema.h b/MLPY/Lib/site-packages/torch/include/ATen/core/function_schema.h
new file mode 100644
index 0000000000000000000000000000000000000000..b0ab8d744da2a55d9bb0f4eb699f66274a04bcc5
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/function_schema.h
@@ -0,0 +1,687 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace c10 {
+
+// schema as used in the compiler for resolving function calls and reporting
+// errors. These objects should be constructed from C10 schema once those
+// are available.
+
+struct Argument;
+struct FunctionSchema;
+
+using AliasTypeSet = std::vector<TypePtr>;
+
+bool operator==(const Argument& lhs, const Argument& rhs);
+
+struct Argument {
+  Argument(
+      std::string name = "",
+      const TypePtr& type = nullptr,
+      c10::optional<int32_t> N = c10::nullopt,
+      c10::optional<IValue> default_value = c10::nullopt,
+      bool kwarg_only = false,
+      c10::optional<AliasInfo> alias_info = c10::nullopt)
+    : Argument(std::move(name), type, type, N, std::move(default_value), kwarg_only, std::move(alias_info)) {}
+
+  Argument(
+      std::string name,
+      TypePtr fake_type,
+      TypePtr real_type,
+      c10::optional<int32_t> N = c10::nullopt,
+      c10::optional<IValue> default_value = c10::nullopt,
+      bool kwarg_only = false,
+      c10::optional<AliasInfo> alias_info = c10::nullopt)
+      : name_(std::move(name)),
+        type_(fake_type ? std::move(fake_type) : TensorType::get()),
+        real_type_(real_type ? std::move(real_type) : type_),
+        N_(N),
+        default_value_(std::move(default_value)),
+        alias_info_(alias_info ? std::make_unique<AliasInfo>(std::move(*alias_info)) : nullptr),
+        kwarg_only_(kwarg_only) {
+    // this is a softly-enforced invariant for out arguments.
+    bool is_alias = alias_info_ != nullptr && alias_info_->isWrite();
+    is_out_ = kwarg_only_ && is_alias;
+  }
+
+  Argument(Argument&& rhs) noexcept = default;
+
+  Argument(const Argument& rhs)
+      : name_(rhs.name_),
+        type_(rhs.type_),
+        real_type_(rhs.real_type_),
+        N_(rhs.N_),
+        default_value_(rhs.default_value_),
+        alias_info_(rhs.alias_info_ ? std::make_unique<AliasInfo>(*rhs.alias_info_) : nullptr),
+        kwarg_only_(rhs.kwarg_only_),
+        is_out_(rhs.is_out_) {}
+
+  Argument& operator=(Argument&& rhs) = default;
+
+  Argument& operator=(const Argument& rhs) {
+    if (this != &rhs) {
+      name_ = rhs.name_;
+      type_ = rhs.type_;
+      real_type_ = rhs.real_type_;
+      N_ = rhs.N_;
+      default_value_ = rhs.default_value_;
+      alias_info_ = rhs.alias_info_ ? std::make_unique<AliasInfo>(*rhs.alias_info_) : nullptr;
+      kwarg_only_ = rhs.kwarg_only_;
+      is_out_ = rhs.is_out_;
+    }
+    return *this;
+  }
+
+  const std::string& name() const {
+    return name_;
+  }
+  const TypePtr& type() const {
+    return type_;
+  }
+  // if type() is non-null, this is guaranteed to be non-null (if no real
+  // type was provided, this takes on type()'s value)
+  const TypePtr& real_type() const {
+    return real_type_;
+  }
+  c10::optional<int32_t> N() const {
+    return N_;
+  }
+  const c10::optional<IValue>& default_value() const {
+    return default_value_;
+  }
+  bool kwarg_only() const {
+    return kwarg_only_;
+  }
+
+  bool is_out() const {
+    return is_out_;
+  }
+
+  C10_NODISCARD const AliasInfo* alias_info() const {
+    return alias_info_.get();
+  }
+
+  bool is_inferred_type() const {
+    bool is_inferred_type = false;
+    TORCH_INTERNAL_ASSERT(type_);
+    if (auto pt = type_->cast<TensorType>()) {
+      if (pt->isInferredType()) {
+        is_inferred_type = true;
+      }
+    }
+    return is_inferred_type;
+  }
+
+  std::string formatTypeMismatchMsg(const std::string& actual_type) const {
+    std::string inferred_type_hint;
+    if (is_inferred_type()) {
+      inferred_type_hint = c10::str(
+          "Inferred '",
+          name(),
+          "' to be of type 'Tensor' ",
+          "because it was not annotated with an explicit type.\n");
+    }
+    return c10::str(
+        "Expected a value of type '",
+        type()->repr_str(),
+        "' for argument '",
+        name(),
+        "' but instead found type '",
+        actual_type,
+        "'.\n",
+        inferred_type_hint);
+  }
+
+  Argument cloneWithType(TypePtr new_type) const {
+    return Argument(
+        name_,
+        std::move(new_type),
+        N_,
+        default_value_,
+        kwarg_only_,
+        alias_info_ ? c10::optional<AliasInfo>(*alias_info_) : c10::nullopt);
+  }
+
+  // this function checks whether this Argument is backward compatible with
+  // the old one. we consider the following cases are backward compatible:
+  //   1) two arguments are equal
+  //   2) this arg's type should be subtype of old
+  //   3) this arg must provide the same default value if old arg has one,
+  bool isBackwardCompatibleWith(
+      const Argument& old,
+      std::ostream* why_not=nullptr) const;
+
+  // this function checks whether this Argument is forward compatible with
+  // the old one. we consider the following cases are forward compatible:
+  //   1) two arguments are equal
+  //   2) this arg's type should be subtype of old
+  //   3) this arg must provide the same default value if old arg has one,
+  bool isForwardCompatibleWith(
+      const Argument& old,
+      std::ostream* why_not = nullptr) const;
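+
+  // For example (hypothetical argument, following rule 3 above): a new
+  // `int dim=0` stays backward compatible with an older `int dim=0`, but
+  // changing the default to `int dim=1` or dropping it breaks backward
+  // compatibility even though the type is unchanged.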
+
+ private:
+  std::string name_;
+  TypePtr type_;
+  TypePtr real_type_; // this is ScalarType, not int, e.g.
+  // for list types, an optional statically known length for the list
+  // e.g. for int[3]: type = ListType::ofInts(), N = 3
+  // If present, this will allow scalars to be broadcast to this length to
+  // become a list.
+  c10::optional<int32_t> N_;
+
+  c10::optional<IValue> default_value_;
+  // AliasInfo is huge, so let's only allocate memory for it if
+  // necessary (which it isn't during schema parsing on startup, to
+  // give a pertinent example).
+  std::unique_ptr<AliasInfo> alias_info_;
+  // is this only specifiable as a keyword argument?
+  bool kwarg_only_;
+  // marks if the argument is out variant of the schema
+  bool is_out_;
+};
+
+inline bool operator==(const Argument& lhs, const Argument& rhs) {
+  return lhs.name() == rhs.name()
+          && *lhs.type() == *rhs.type()
+          && lhs.N() == rhs.N()
+          && lhs.default_value() == rhs.default_value()
+          && lhs.kwarg_only() == rhs.kwarg_only()
+          && (lhs.alias_info() == rhs.alias_info()
+              || (lhs.alias_info() != nullptr && rhs.alias_info() != nullptr
+                   && *lhs.alias_info() == *rhs.alias_info()));
+}
+
+inline bool operator!=(const Argument& lhs, const Argument& rhs) {
+  return !(lhs == rhs);
+}
+
+enum struct TORCH_API SchemaArgType { input, output };
+
+/**
+ * struct SchemaArgument
+ *
+ * Structure used to represent arguments or returns for a schema.
+ */
+struct TORCH_API SchemaArgument {
+  SchemaArgType type;
+  size_t index;
+  SchemaArgument(SchemaArgType tpe, size_t idx) : type(tpe), index(idx) {}
+  bool operator==(const SchemaArgument& rhs) const {
+    return type == rhs.type && index == rhs.index;
+  }
+};
+
+bool operator==(const FunctionSchema& lhs, const FunctionSchema& rhs);
+
+struct TORCH_API FunctionSchema {
+  FunctionSchema(
+      std::string name,
+      std::string overload_name,
+      std::vector<Argument> arguments,
+      std::vector<Argument> returns,
+      bool is_vararg = false,
+      bool is_varret = false)
+      : name_({std::move(name), std::move(overload_name)}),
+        arguments_(std::move(arguments)),
+        returns_(std::move(returns)),
+        is_vararg_(is_vararg),
+        is_varret_(is_varret) {
+    checkSchema();
+  }
+
+  FunctionSchema(
+      Symbol name,
+      std::string overload_name,
+      std::vector<Argument> arguments,
+      std::vector<Argument> returns,
+      bool is_vararg = false,
+      bool is_varret = false)
+      : FunctionSchema(
+            name.toQualString(),
+            std::move(overload_name),
+            std::move(arguments),
+            std::move(returns),
+            is_vararg,
+            is_varret) {
+    checkSchema();
+  }
+
+  // Checks whether this schema is backward compatible with the old one.
+  // The following conditions must be true:
+  // [Function structure] The new schema's name, overload-name, varargs, and
+  //      return arity are the same.
+  // [Output Narrowing] The new schema's output type must be the same class
+  //      or inherit from the old schema's output type.
+  // [Argument count] The new schema must have at least as many arguments as
+  //      the old schema (considering the list of positional and kwargs).
+  // [Arg Compatibility] Every argument in the old schema has a corresponding
+  //      argument in the new schema that:
+  //        * is at the same position.
+  //        * has the same name.
+  //        * is either positional, or kwarg and the old argument was kwarg.
+  //        * has the same type, or the old argument's type inherits from the
+  //          new argument's type.
+  // [Default Values] Every new argument must have a default value.
+  // E.g.
+  //   OK    f_new(a, b, c=1) => f_old(a, b)
+  //   NOK   f_new(a, c=1, *, b) => f_old(a, *, b)
+  //   OK    f_new(a, b, *, c) => f_old(a, *, b, c)
+  //   NOK   f_new(a, *, b, c) -> f_old(a, b, *, c)
+  //   NOK   f_new(a, *, c, b) => f_old(a, *, b, c)
+  //   OK    f_new(a, *, b, c, d=1) => f_old(a, *, b, c)
+  bool isBackwardCompatibleWith(
+      const FunctionSchema& old,
+      std::ostream* why_not = nullptr) const;
+
+  // Checks whether this schema is forward compatible with the old one.
+  // The following conditions must be true:
+  // [Function structure] The new schema's name, overload-name, varargs, and
+  //      return arity are the same.
+  // [Output Narrowing] The new schema's output type must be the same class
+  //      or inherit from the old schema's output type.
+  // [Arg Compatibility] Every argument in the old schema has a corresponding
+  //      argument in the new schema that:
+  //        * is at the same position.
+  //        * has the same name.
+  //        * is either positional, or kwarg and the old argument was kwarg.
+  //        * has the same type, or the old argument's type inherits from the
+  //          new argument's type.
+  // [Default Values] Every new argument must have a default value.
+  //         Each default value type should NOT be a container type.
+  // [Positioning] All defaults arguments MUST go after either old
+  //         default arguments or the end of positional arguments
+  //         and right BEFORE all out arguments
+  bool isForwardCompatibleWith(
+      const FunctionSchema& old,
+      std::ostringstream& why_not) const;
+
+ private:
+  OperatorName name_;
+  std::vector<Argument> arguments_;
+  std::vector<Argument> returns_;
+  // if true then this schema takes an arbitrary number of additional arguments
+  // after the argument specified in arguments
+  // currently this is used primarily to represent 'primitive' operators whose
+  // arguments are not checked by schema
+  bool is_vararg_;
+  bool is_varret_;
+
+  // if no alias information is directly specified, what kind of "default"
+  // alias information should we infer?
+  // NB: due to alias analysis kind merging, this may be nullopt.  Eventually
+  // this should always be set no matter what
+  c10::optional<AliasAnalysisKind> alias_kind_;
+
+  template <typename T>
+  void checkArg(const IValue& value, const Argument& argument, optional<size_t> pos) const;
+
+  void checkSchema() const {
+    bool seen_default_arg = false;
+    for (const auto& arg : arguments()) {
+      if (arg.default_value()) {
+        seen_default_arg = true;
+      } else {
+        // we have historically serialized broadcasting lists wo/default values,
+        // so to not break BC allow lists here
+        if (arg.type()->kind() == ListType::Kind) {
+          continue;
+        }
+        TORCH_INTERNAL_ASSERT(
+            !seen_default_arg || arg.kwarg_only(),
+            "Non-default positional argument follows default argument. Parameter ",
+            arg.name(),
+            " in ",
+            *this);
+      }
+    }
+  }
+
+ public:
+
+  void dump() const;
+
+  const OperatorName& operator_name() const {
+    return name_;
+  }
+  const std::string& name() const {
+    return name_.name;
+  }
+  const std::string& overload_name() const {
+    return name_.overload_name;
+  }
+  const std::vector<Argument>& arguments() const {
+    return arguments_;
+  }
+  const std::vector<Argument>& returns() const {
+    return returns_;
+  }
+  bool is_vararg() const {
+    return is_vararg_;
+  }
+  bool is_varret() const {
+    return is_varret_;
+  }
+  bool is_aliasing(const c10::SchemaArgument &argument) const {
+    TORCH_INTERNAL_ASSERT(
+    argument.index < getCorrectList(argument.type).size(),
+    "Invalid index for schema.");
+    const AliasInfo* aliasInfo = getCorrectList(argument.type)[argument.index].alias_info();
+    return aliasInfo;
+  }
+  bool is_mutable() const {
+    return std::any_of(
+        arguments_.cbegin(), arguments_.cend(), [](const Argument& arg) {
+          const AliasInfo* aliasInfo = arg.alias_info();
+          return aliasInfo && aliasInfo->isWrite();
+        });
+  }
+  bool is_mutable(const c10::SchemaArgument &argument) const {
+    TORCH_INTERNAL_ASSERT(
+        argument.index < getCorrectList(argument.type).size(),
+        "Invalid index for schema.");
+    const AliasInfo* aliasInfo = getCorrectList(argument.type)[argument.index].alias_info();
+    return aliasInfo && aliasInfo->isWrite();
+  }
+  bool is_mutable(c10::string_view name) const {
+    c10::optional<int> index = argumentIndexWithName(name);
+    TORCH_INTERNAL_ASSERT(
+        index != c10::nullopt, "Schema has no argument named ", name);
+
+    return is_mutable({c10::SchemaArgType::input, static_cast<size_t>(*index)});
+  }
+
+  // Returns whether lhs and rhs may alias directly.
+  // This does not account for cases where lhs or rhs are a container that
+  // may contain elements that alias the other argument.
+  // FunctionSchema::may_contain_alias will include that functionality.
+  bool may_alias(const SchemaArgument& lhs, const SchemaArgument& rhs) const;
+
+  // Returns whether lhs and rhs may alias directly or whether lhs/rhs are a container
+  // that may contain elements that alias the other argument.
+  // bidirectional = false only returns whether lhs may contain an alias of rhs
+  // while bidirectional = true returns both directions.
+  bool may_contain_alias(const SchemaArgument& lhs, const SchemaArgument& rhs, bool bidirectional = true) const;
+
+  // Returns whether the two AliasTypeSets contain any similarities
+  // ie: whether the two type sets can alias.
+  bool canAliasTypeSetsAlias(const c10::optional<AliasTypeSet> &lhs, const c10::optional<AliasTypeSet> &rhs) const;
+
+  // Recursively Finds all contained types within the AliasTypeSet.
+  c10::optional<AliasTypeSet> getAliasTypeSetContainedTypes(const c10::optional<AliasTypeSet> &aliasTypeSet) const;
+
+  // Similar to mapTypeToAliasTypeSet defined in alias_analysis.cpp.
+  // Used to map types to a type such that all types that can alias will be mapped to the same type.
+  // For example, calling this method on 'Optional[List[int]]' is the same as calling this method
+  // on 'List[int]'.
+  c10::optional<AliasTypeSet> mapTypeToAliasTypeSet(const TypePtr& type) const;
+
+  // Returns either arguments() or returns() depending on the SchemaArgType
+  // output => returns(), input => arguments()
+  const std::vector<Argument>& getCorrectList(SchemaArgType type) const;
+
+  c10::optional<int> argumentIndexWithName(c10::string_view name) const {
+    for (const auto i : c10::irange(arguments().size())) {
+      if(name == arguments()[i].name())
+        return i;
+    }
+    return c10::nullopt;
+  }
+  FunctionSchema cloneWithName(std::string name, std::string overload_name) const {
+    return FunctionSchema(
+        std::move(name),
+        std::move(overload_name),
+        arguments(),
+        returns(),
+        is_vararg(),
+        is_varret()
+        );
+  }
+  FunctionSchema cloneWithArguments(std::vector<Argument> new_arguments) const {
+    return FunctionSchema(
+        name(),
+        overload_name(),
+        std::move(new_arguments),
+        returns(),
+        is_vararg(),
+        is_varret());
+  }
+  FunctionSchema cloneWithReturns(std::vector<Argument> new_returns) const {
+    return FunctionSchema(
+        name(),
+        overload_name(),
+        arguments(),
+        std::move(new_returns),
+        is_vararg(),
+        is_varret());
+  }
+
+  std::string formatTypeMismatchMsg(
+      const Argument& expected,
+      const std::string& actual_type,
+      c10::optional<size_t> position = c10::nullopt,
+      c10::optional<const IValue*> value = c10::nullopt) const;
+
+  FunctionSchema cloneWithRemappedTypes(
+      const std::function<TypePtr(TypePtr)> type_map) const;
+
+  FunctionSchema cloneWithRealTypes(bool with_symint=true) const;
+
+  // Check that inputs have the correct types and appends any missing default
+  // values.
+  template <typename T = c10::PlatformType>
+  void checkAndNormalizeInputs(
+      std::vector<IValue>& inputs,
+      const std::unordered_map<std::string, IValue>& kwargs =
+          std::unordered_map<std::string, IValue>{}) const;
+
+  std::string findErrorInKwargs(const std::vector<std::string>& kwargs) const;
+
+  bool hasAnyAliasInfo() const {
+    for (const auto& arg : arguments_) {
+      if (arg.alias_info() != nullptr) {
+        return true;
+      }
+    }
+    for (const auto& ret : returns_) {
+      if (ret.alias_info() != nullptr) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+
+  // TODO remove the mutation here
+  bool isDefaultAliasAnalysisKind() const {
+    return !alias_kind_;
+  }
+  AliasAnalysisKind aliasAnalysis() const {
+    return alias_kind_.value_or(AliasAnalysisKind::CONSERVATIVE);
+  }
+  void setAliasAnalysis(AliasAnalysisKind v) {
+    alias_kind_ = v;
+  }
+
+  c10::optional getNamespace() const {
+    return name_.getNamespace();
+  }
+
+  // Returns true if we successfully set the namespace (as there
+  // was none set, and false otherwise)
+  bool setNamespaceIfNotSet(const char* ns) {
+    return name_.setNamespaceIfNotSet(ns);
+  }
+
+  // can a function with this schema be substituted for a function of rhs's
+  // schema and have the program typecheck?
+  // as_method - if true, treat this schema as a method and ignore
+  // the first argument, which will be the object in both cases
+  bool isSubtypeOf(const FunctionSchema& rhs, bool as_method, std::ostream* why_not=nullptr) const;
+};
+
+inline bool operator==(const FunctionSchema& lhs, const FunctionSchema& rhs) {
+  return lhs.name() == rhs.name()
+     && lhs.overload_name() == rhs.overload_name()
+     && lhs.arguments() == rhs.arguments()
+     && lhs.returns() == rhs.returns()
+     && lhs.is_vararg() == rhs.is_vararg()
+     && lhs.is_varret() == rhs.is_varret();
+}
+
+inline bool operator!=(const FunctionSchema& lhs, const FunctionSchema& rhs) {
+  return !(lhs == rhs);
+}
+
+// print out Argument, which is compatible with FunctionSchema parser
+// full format: Type(alias)? name=default_value
+inline std::ostream& operator<<(std::ostream& out, const Argument& arg) {
+
+  // for adjusting the ? position.
+  // in schema, we have Tensor?(a!) input, and t(a!)?.
+  // however, t?(a!) doesn't work with schema parser.
+  // so we always use Type(alias)? format
+  // real_type versus fake_type: in order to be compatible with FunctionSchema
+  // parser, printing an argument with either MemoryFormat or Layout type should
+  // give us the original schema string, hence printing out real_type.
+  auto type = arg.real_type();
+  bool is_opt = type->kind() == OptionalType::Kind;
+  auto unopt_type = is_opt ? type->castRaw<OptionalType>()->getElementType() : type;
+
+  if (unopt_type->kind() == ListType::Kind) {
+    // sized lists get size N from arg, not type
+    auto list = unopt_type->cast<ListType>();
+    out << list->getElementType()->str();
+    if (arg.alias_info() && !arg.alias_info()->containedTypes().empty()){
+      out << arg.alias_info()->containedTypes()[0];
+    }
+    std::string N = "";
+    if (arg.N()) {
+        N = std::to_string(*arg.N());
+    }
+    out << "[" << N << "]";
+  } else {
+    out << unopt_type->str();
+  }
+
+  // print alias info if it has beforeSets.
+  if (arg.alias_info() && !arg.alias_info()->beforeSets().empty()) {
+    out << *arg.alias_info();
+  }
+
+  if (is_opt) {
+    out << "?";
+  }
+
+  if (!arg.name().empty()) {
+    out << " " << arg.name();
+  }
+
+  if (arg.default_value()) {
+    out << "=";
+    if ((type->kind() == c10::TypeKind::StringType ||
+        unopt_type->kind() == c10::TypeKind::StringType) &&
+        arg.default_value().value().isString()) {
+      printQuotedString(out, arg.default_value().value().toStringRef());
+    } else if (type->kind() == TypeKind::ListType && type->castRaw<ListType>()->getElementType()->kind() == c10::TypeKind::IntType) {
+      // We want to faithfully replicate JIT schema.
+      // in native_functions.yaml defaults for int arrays with a single value always look like
+      //   int[2] stride=1
+      // instead of
+      //   int[2] stride=[1, 1]
+      auto default_val = arg.default_value().value().toIntList();
+      if (default_val.size() > 1) {
+        auto all_defaults_the_same = true;
+        for (const auto i : c10::irange(1, default_val.size())) {
+          if (default_val[0] != default_val[i]) all_defaults_the_same = false;
+        }
+        if (all_defaults_the_same) {
+          out << default_val[0];
+        } else {
+          out << arg.default_value().value();
+        }
+      } else {
+        out << arg.default_value().value();
+      }
+    } else {
+      out << arg.default_value().value();
+    }
+  }
+
+  return out;
+}
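+
+// Illustrative examples of the format produced above (hypothetical arguments):
+//
+//   Tensor self        -> plain tensor argument
+//   Tensor?(a!) out    -> optional, alias-annotated tensor argument
+//   int[2] stride=1    -> sized int list whose identical defaults collapse to
+//                         a single value, as noted in the comment above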
+
+inline std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema);
+
+inline std::string toString(const FunctionSchema& schema) {
+  std::ostringstream str;
+  str << schema;
+  return str.str();
+}
+
+} // namespace c10
+
+namespace std {
+template<>
+  struct hash<c10::SchemaArgument> {
+    size_t operator()(const c10::SchemaArgument& arg) const
+    {
+      return c10::hash_combine(std::hash<size_t>()(arg.index), std::hash<size_t>()(static_cast<size_t>(arg.type)));
+    }
+  };
+template<>
+  struct hash<c10::Argument> {
+    size_t operator()(const c10::Argument& arg) const
+    {
+      auto hash = std::hash<std::string>{}(arg.name());
+      auto type_hash = std::hash<c10::TypePtr>{}(arg.type());
+      auto kwarg_only_hash = std::hash<bool>{}(arg.kwarg_only());
+      hash = c10::hash_combine(hash, type_hash);
+      hash = c10::hash_combine(hash, kwarg_only_hash);
+      // hashing optional fields if they exist
+      if (arg.default_value()) {
+        auto default_value_hash = c10::hash<c10::IValue>{}(arg.default_value().value());
+        hash = c10::hash_combine(hash, default_value_hash);
+      }
+      if (arg.N()) {
+        auto N_hash = std::hash{}(*arg.N());
+        hash = c10::hash_combine(hash, N_hash);
+      }
+      if (arg.alias_info()) {
+        auto alias_info_hash = std::hash{}(*arg.alias_info());
+        hash = c10::hash_combine(hash, alias_info_hash);
+      }
+      return hash;
+    }
+  };
+template<>
+  struct hash<c10::FunctionSchema> {
+    size_t operator()(const c10::FunctionSchema& schema) const
+    {
+      auto hash = std::hash<c10::OperatorName>{}(schema.operator_name());
+      auto args_hash = c10::hash<std::vector<c10::Argument>>{}(schema.arguments());
+      auto returns_hash = c10::hash<std::vector<c10::Argument>>{}(schema.returns());
+      auto is_vararg_hash = std::hash<bool>{}(schema.is_vararg());
+      auto is_varret_hash = std::hash<bool>{}(schema.is_varret());
+      hash = c10::hash_combine(hash, args_hash);
+      hash = c10::hash_combine(hash, returns_hash);
+      hash = c10::hash_combine(hash, is_vararg_hash);
+      hash = c10::hash_combine(hash, is_varret_hash);
+      return hash;
+    }
+  };
+} // namespace std
+
+
+#include <ATen/core/function_schema_inl.h>  // IWYU pragma: keep
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/function_schema_inl.h b/MLPY/Lib/site-packages/torch/include/ATen/core/function_schema_inl.h
new file mode 100644
index 0000000000000000000000000000000000000000..34edfa3e7750ae47f7abf5094dffaa63237f6c6d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/function_schema_inl.h
@@ -0,0 +1,483 @@
+#pragma once
+#include <ostream>
+#include <sstream>
+
+// note: the Windows build doesn't find symbols in operator files unless
+// this is a header file
+
+namespace c10 {
+
+inline std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) {
+  // eventually this should look almost identical to python arg parser, but
+  // it is simpler for now to work directly on this schema
+
+  out << schema.name();
+  if (!schema.overload_name().empty()) {
+    out << "." << schema.overload_name();
+  }
+  out << "(";
+
+  bool seen_kwarg_only = false;
+  for (const auto i : c10::irange(schema.arguments().size())) {
+    if (i > 0) out << ", ";
+    if (schema.arguments()[i].kwarg_only() && !seen_kwarg_only) {
+      out << "*, ";
+      seen_kwarg_only = true;
+    }
+    out << schema.arguments()[i];
+  }
+
+  if(schema.is_vararg()) {
+    if(!schema.arguments().empty())
+      out << ", ";
+    out << "...";
+  }
+
+  out << ") -> ";
+
+  const auto& returns = schema.returns();
+
+  /*
+   * We should skip parentheses if we return a single item and it's not varret,
+   * or if we return nothing but varret.
+   *
+   * We need special handling for the schema
+   *   aten::items.str(Dict(str, t) self) -> (str,t)[]
+   * Even though this schema returns a single item, we need to add parentheses.
+   * This is necessary so the printed schema can be parsed by the C++ SchemaParser.
+   * Without the extra parentheses, the parser sees the first parenthesis in '(str,t)' and mistakenly
+   * treats the return type as a tuple. An alternative would be to enhance the Lexer
+   * to look ahead multiple tokens to accurately decide whether the return type is
+   * a tuple.
+   */
+  bool need_paren = !(
+    (returns.size() == 1 && !schema.is_varret()) ||
+    (returns.empty() && schema.is_varret()));
+
+  if (returns.size() == 1 && !schema.is_varret()) {
+    std::stringstream return_ss;
+    return_ss << returns.at(0);
+    auto return_str = return_ss.str();
+
+    // Enclose the single return item in parentheses if the return type
+    // starts with a left parenthesis.
+    //
+    // There are 2 cases:
+    // 1. something like 'aten::items.str(Dict(str, t) self) -> ((str, t)[])'.
+    //    Without the extra parentheses, the C++ schema parser cannot parse it.
+    // 2. something like '-> ((str, str))'. Extra parentheses are needed so the return
+    //    type is a single tuple rather than two strings.
+    // PR (https://github.com/pytorch/pytorch/pull/23204) has more context about
+    // this. test_serialize_and_deserialize (https://github.com/pytorch/pytorch/blob/master/test/test_function_schema.py#L15)
+    // also covers this case.
+    if (!return_str.empty() && return_str.front() == '(') {
+      need_paren = true;
+    }
+  }
+
+  if (need_paren) {
+    out << "(";
+  }
+  for (const auto i : c10::irange(returns.size())) {
+    if (i > 0) {
+      out << ", ";
+    }
+    out << returns.at(i);
+  }
+  if (schema.is_varret()) {
+    if (!returns.empty()) {
+      out << ", ";
+    }
+    out << "...";
+  }
+  if (need_paren) {
+    out << ")";
+  }
+  return out;
+}
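+// Illustrative sketch of the resulting schema strings (derived from the code
+// above; the operators shown are just examples):
+//   aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
+//   aten::items.str(Dict(str, t) self) -> ((str, t)[])   // extra parens, per the comment above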
+
+inline size_t findFirstOutArg(const std::vector<Argument>& args) {
+  // find the start of out args in the schema
+  for (const auto out_start_idx : c10::irange(args.size())) {
+    if (args.at(out_start_idx).is_out()) {
+      return out_start_idx;
+    }
+  }
+  return args.size();
+}
+
+inline bool Argument::isBackwardCompatibleWith(
+      const Argument& old,
+      std::ostream* why_not) const {
+    const Argument* lhs = this;
+    const Argument* rhs = &old;
+    if (!(lhs->name() == rhs->name()
+        && lhs->N() == rhs->N()
+          && (lhs->alias_info() == rhs->alias_info()
+              || (lhs->alias_info() != nullptr && rhs->alias_info() != nullptr
+                  && *lhs->alias_info() == *rhs->alias_info())))) {
+      return false;
+    }
+    if (lhs->kwarg_only() && !rhs->kwarg_only()) {
+      return false;
+    }
+    if (!rhs->type()->isSubtypeOfExt(*lhs->type(), why_not)) {
+      return false;
+    }
+    if (rhs->default_value().has_value() &&
+        lhs->default_value() != rhs->default_value()) {
+      return false;
+    }
+    return true;
+}
+
+inline bool Argument::isForwardCompatibleWith(
+    const Argument& old,
+    std::ostream* why_not) const {
+  const Argument* lhs = this;
+  const Argument* rhs = &old;
+  if (!(lhs->name() == rhs->name()
+      && lhs->N() == rhs->N()
+        && (lhs->alias_info() == rhs->alias_info()
+            || (lhs->alias_info() != nullptr && rhs->alias_info() != nullptr
+                && *lhs->alias_info() == *rhs->alias_info())))) {
+    return false;
+  }
+  if (lhs->kwarg_only() && !rhs->kwarg_only()) {
+    return false;
+  }
+  if (!lhs->type()->isSubtypeOfExt(rhs->type(), why_not)) {
+    return false;
+  }
+  if (rhs->default_value().has_value() &&
+      lhs->default_value() != rhs->default_value()) {
+    return false;
+  }
+  if (lhs->default_value().has_value() && !rhs->default_value().has_value()) {
+    return false;
+  }
+  return true;
+}
+
+inline std::string FunctionSchema::formatTypeMismatchMsg(
+    const Argument& expected,
+    const std::string& actual_type,
+    c10::optional<size_t> position,
+    c10::optional<std::string> value) const {
+  std::string position_str;
+  if (position) {
+    position_str = c10::str("Position: ", *position, "\n");
+  }
+  std::string value_str;
+  if (value) {
+    value_str = c10::str("Value: ", *value, "\n");
+  }
+  return c10::str(
+      name(),
+      "() ",
+      expected.formatTypeMismatchMsg(actual_type),
+      position_str,
+      value_str,
+      "Declaration: ",
+      *this);
+}
+
+inline bool FunctionSchema::isBackwardCompatibleWith(
+    const FunctionSchema& old,
+    std::ostream* why_not) const {
+  if (!(name() == old.name()
+        && overload_name() == old.overload_name()
+        // we are conservative on is_vararg and is_varret,
+        // since they are only used by internal operators
+        && is_vararg() == old.is_vararg()
+        && is_varret() == old.is_varret()
+        && returns().size() == old.returns().size()
+        && arguments().size() >= old.arguments().size())) {
+    return false;
+  }
+  for (const auto i : c10::irange(returns().size())) {
+    // Backwards compatibility requires covariance on argument types
+    // (i.e. more generic), and contravariance on return types (i.e.
+    //  more specific).
+    if (!old.returns().at(i).isBackwardCompatibleWith(
+          returns().at(i),
+          why_not)) {
+      return false;
+    }
+  }
+
+  // we want to test both out and default args separately
+  size_t old_out_start_idx = findFirstOutArg(old.arguments());
+  size_t new_out_start_idx = findFirstOutArg(arguments());
+
+  // make sure the (non-out) default args are backward compatible
+  for (const auto i : c10::irange(old_out_start_idx)) {
+    if (!arguments().at(i).isBackwardCompatibleWith(
+          old.arguments().at(i), why_not)) {
+      return false;
+    }
+  }
+
+  // Validate that every newly added argument provides a default value
+  for (const auto i : c10::irange(old_out_start_idx, new_out_start_idx)) {
+    if (!arguments().at(i).default_value()) {
+      if (why_not) {
+        *why_not
+            << "Function schema not backward compatible since the new argument '"
+            << arguments().at(i).name() << "' of type "
+            << arguments().at(i).type()->str()
+            << " did not provide a default value.";
+      }
+      return false;
+    }
+  }
+
+  // now compare the out args
+  for (const auto i : c10::irange(old_out_start_idx, old.arguments().size())) {
+    if (!arguments()
+             .at(i - old_out_start_idx + new_out_start_idx)
+             .isBackwardCompatibleWith(old.arguments().at(i), why_not)) {
+      return false;
+    }
+  }
+
+  return true;
+}
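+// Example (editorial sketch) of what the check above allows: adding a new
+// trailing non-out argument is backward compatible only if it has a default,
+// e.g. old "foo(Tensor self) -> Tensor" vs. new
+// "foo(Tensor self, int alpha=1) -> Tensor" passes, while a new argument
+// without a default, or a changed number of returns, fails.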
+
+inline bool FunctionSchema::isForwardCompatibleWith(
+    const FunctionSchema& old,
+    std::ostringstream& why_not) const {
+  if (!(name() == old.name() &&
+        overload_name() == old.overload_name()
+        // we are conservative on is_vararg and is_varret,
+        // since they are only used by internal operators
+        && is_vararg() == old.is_vararg() && is_varret() == old.is_varret() &&
+        returns().size() == old.returns().size())) {
+    return false;
+  }
+
+  // we want to test both out and default args separately
+  size_t old_out_start_idx = findFirstOutArg(old.arguments());
+  size_t new_out_start_idx = findFirstOutArg(arguments());
+
+  if (old.arguments().size() - old_out_start_idx !=
+      arguments().size() - new_out_start_idx) {
+    if (why_not) {
+      why_not << "Function schema should have the "
+              << "same number of out arguments";
+    }
+    return false;
+  }
+
+  // make sure the (non-out) default args are forward compatible
+  for (size_t i = 0; i < std::min(old_out_start_idx, new_out_start_idx); i++) {
+    if (!arguments().at(i).isForwardCompatibleWith(old.arguments().at(i))) {
+      if (why_not) {
+        why_not
+            << "'" << arguments().at(i).name() << "'"
+            << " is not forward compatible with the older version of the schema";
+      }
+      return false;
+    }
+  }
+
+  // Validate that every newly added argument provides a default value
+  for (size_t i = old_out_start_idx; i < new_out_start_idx; ++i) {
+    if (!arguments().at(i).default_value()) {
+      if (why_not) {
+        why_not
+            << "Function schema is not forward compatible since the new argument '"
+            << arguments().at(i).name() << "' of type "
+            << arguments().at(i).type()->str()
+            << " did not provide a default value.";
+      }
+      return false;
+    }
+
+    auto default_val = arguments().at(i).default_value().value();
+    if (default_val.isList() || default_val.isGenericDict()) {
+      if (why_not) {
+        why_not
+            << "Function schema is not forward compatible since the new argument '"
+            << arguments().at(i).name() << "' of type "
+            << arguments().at(i).type()->str() << " has a container type "
+            << "as its default value.";
+      }
+      return false;
+    }
+  }
+
+  // now compare the out args
+  for (size_t i = old_out_start_idx; i < old.arguments().size(); i++) {
+    if (!arguments()
+             .at(i - old_out_start_idx + new_out_start_idx)
+             .isForwardCompatibleWith(old.arguments().at(i))) {
+      if (why_not) {
+        why_not << "Out argument '"
+                << "'" << arguments().at(i).name()
+                << " is not FC with the older version of the schema";
+      }
+      return false;
+    }
+  }
+
+  return true;
+}
+
+template <typename T>
+inline void FunctionSchema::checkArg(
+    const IValue& value,
+    const Argument& argument,
+    optional<size_t> pos) const {
+  if (value.isTensor() && argument.type() == TensorType::get()) {
+    // Fast-path for the common case
+    return;
+  }
+  if (!value.type<T>()->isSubtypeOf(*argument.type())) {
+    TORCH_CHECK(
+        false,
+        formatTypeMismatchMsg(
+            argument, value.type<T>()->repr_str(), pos));
+  }
+}
+
+inline std::string FunctionSchema::findErrorInKwargs(const std::vector<std::string>& kwargs) const {
+  // First check if any of the kwargs are unknown, i.e. don't match the name of
+  // any argument in the schema.
+  for (const auto& kwarg : kwargs) {
+    if (!std::count_if(
+            arguments().begin(),
+            arguments().end(),
+            [&kwarg](const Argument& argument) {
+              return argument.name() == kwarg;
+            })) {
+      return c10::str(
+          "Unknown keyword argument '",
+          kwarg,
+          "' for operator '",
+          name(),
+          "'. Schema: ",
+          *this);
+    }
+  }
+  // If there are unconsumed kwargs but none of them were unknown, the first
+  // positional argument present in the kwargs is duplicated.
+  for (const auto& argument : arguments()) {
+    if (std::find(kwargs.begin(), kwargs.end(), argument.name()) != kwargs.end()) {
+      AT_ASSERT(!argument.default_value());
+      return c10::str(
+          "Argument '",
+          argument.name(),
+          "' specified both as positional and ",
+          "keyword argument. Schema: ",
+          *this);
+    }
+  }
+  return "";
+}
+
+template <typename T>
+inline void FunctionSchema::checkAndNormalizeInputs(
+    std::vector<IValue>& inputs,
+    const std::unordered_map<std::string, IValue>& kwargs) const {
+  // Do we have more inputs than the schema accepts?
+  TORCH_CHECK(
+      inputs.size() <= arguments().size(),
+      "Expected at most ",
+      arguments().size(),
+      " argument(s) for operator '",
+      name(),
+      "', but received ",
+      inputs.size(),
+      " argument(s). Declaration: ",
+      *this);
+
+  size_t consumed_kwargs = 0;
+  for (const auto pos : c10::irange(arguments().size())) {
+    const auto& argument = arguments()[pos];
+    if (pos < inputs.size()) {
+      checkArg<T>(inputs[pos], argument, pos);
+      continue;
+    }
+    auto it = kwargs.find(argument.name());
+    if (it != kwargs.end()) {
+      checkArg<T>(it->second, argument, nullopt);
+      inputs.push_back(it->second);
+      consumed_kwargs++;
+      continue;
+    }
+    if (argument.default_value()) {
+      inputs.push_back(*argument.default_value());
+      continue;
+    }
+    AT_ERROR(
+        name(),
+        "() is missing value for argument '",
+        argument.name(),
+        "'. Declaration: ",
+        *this);
+  }
+  if (consumed_kwargs != kwargs.size()) {
+    std::vector<std::string> names;
+    names.reserve(kwargs.size());
+    for(const auto& k : kwargs) {
+      names.emplace_back(k.first);
+    }
+    throw std::runtime_error(findErrorInKwargs(names));
+  }
+}
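+// Editorial usage sketch (hypothetical schema "(Tensor self, int alpha=1)"):
+// after the call, 'stack' holds one IValue per declared argument, in
+// positional order, with kwargs and defaults filled in:
+//   std::vector<c10::IValue> stack = {self_tensor};
+//   std::unordered_map<std::string, c10::IValue> kwargs = {{"alpha", 2}};
+//   schema.checkAndNormalizeInputs(stack, kwargs);  // stack == {self_tensor, 2}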
+
+inline FunctionSchema FunctionSchema::cloneWithRemappedTypes(
+    const std::function<TypePtr(TypePtr)> type_map) const {
+  auto update_args = [&](const std::vector<Argument>& args) {
+    std::vector<Argument> new_args;
+    new_args.reserve(args.size());
+    for(const Argument& arg : args) {
+      new_args.emplace_back(arg.cloneWithType(type_map(arg.type())));
+    }
+    return new_args;
+  };
+  return FunctionSchema(
+      name(),
+      overload_name(),
+      update_args(arguments()),
+      update_args(returns()),
+      is_vararg(),
+      is_varret());
+}
+
+// covariant subtyping of list of Arguments
+inline bool isSubtypeOfList(
+    ArrayRef<Argument> child,
+    ArrayRef<Argument> parent,
+    std::ostream* why_not) {
+  if (child.size() != parent.size()) {
+    return false;
+  }
+  for (const auto i : c10::irange(child.size())) {
+    const Argument& c = child[i];
+    const Argument& p = parent[i];
+    if (c.name() != p.name()) {
+      return false;
+    }
+    if (!c.type()->isSubtypeOfExt(*p.type(), why_not)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+inline bool FunctionSchema::isSubtypeOf(
+    const FunctionSchema& rhs,
+    bool as_method,
+    std::ostream* why_not) const {
+  size_t start = as_method ? 1 : 0;
+  // functions are contravariant in arguments but covariant in returns
+  return isSubtypeOfList(
+             ArrayRef<Argument>(rhs.arguments()).slice(start),
+             ArrayRef<Argument>(arguments()).slice(start),
+             why_not) &&
+      isSubtypeOfList(returns(), rhs.returns(), why_not);
+}
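+// Editorial sketch of the variance rule above (hypothetical schemas):
+//   this: f(Scalar x) -> int   is a subtype of   rhs: f(int x) -> Scalar
+// because arguments are contravariant (int is a subtype of Scalar, so the
+// subtype accepts everything the parent accepts) and returns are covariant
+// (int can be used wherever a Scalar is expected); argument names must match
+// position-for-position.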
+
+} // namespace c10
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/functional.h b/MLPY/Lib/site-packages/torch/include/ATen/core/functional.h
new file mode 100644
index 0000000000000000000000000000000000000000..20e2d60445fe938ceb25f41508ce9761b69c41c5
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/functional.h
@@ -0,0 +1,54 @@
+#pragma once
+
+#include <c10/util/ArrayRef.h>
+#include <vector>
+
+namespace c10 {
+
+// The passed in function must take T by value (T), or by
+// const reference (const T&); taking T by non-const reference
+// will result in an error like:
+//
+//    error: no type named 'type' in 'class std::result_of<F(T)>'
+//
+// No explicit template parameters are required.
+
+// Overload for explicit function and ArrayRef
+template<typename F, typename T>
+inline auto fmap(const T& inputs, const F& fn) -> std::vector<decltype(fn(*inputs.begin()))> {
+  std::vector<decltype(fn(*inputs.begin()))> r;
+  r.reserve(inputs.size());
+  for(const auto & input : inputs)
+    r.push_back(fn(input));
+  return r;
+}
+
+// C++ forbids taking an address of a constructor, so here's a workaround...
+// Overload for constructor (R) application
+template<typename R, typename T>
+inline std::vector<R> fmap(const T& inputs) {
+  std::vector<R> r;
+  r.reserve(inputs.size());
+  for(auto & input : inputs)
+    r.push_back(R(input));
+  return r;
+}
+
+template<typename F, typename T>
+inline std::vector<T> filter(at::ArrayRef<T> inputs, const F& fn) {
+  std::vector<T> r;
+  r.reserve(inputs.size());
+  for(auto & input : inputs) {
+    if (fn(input)) {
+      r.push_back(input);
+    }
+  }
+  return r;
+}
+
+template<typename F, typename T>
+inline std::vector<T> filter(const std::vector<T>& inputs, const F& fn) {
+  return filter<F, T>(static_cast<at::ArrayRef<T>>(inputs), fn);
+}
+
+} // namespace c10
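+// Usage sketch (editorial):
+//   std::vector<int> xs = {1, 2, 3, 4};
+//   auto doubled = c10::fmap(xs, [](int x) { return x * 2; });        // {2, 4, 6, 8}
+//   auto evens   = c10::filter(xs, [](int x) { return x % 2 == 0; }); // {2, 4}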
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/grad_mode.h b/MLPY/Lib/site-packages/torch/include/ATen/core/grad_mode.h
new file mode 100644
index 0000000000000000000000000000000000000000..5e7dc5b0ad1ca9ca11f325cb6c5985ffa9815efc
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/grad_mode.h
@@ -0,0 +1,10 @@
+#pragma once
+
+#include <c10/core/GradMode.h>
+#include <c10/macros/Macros.h>
+
+namespace at {
+  using GradMode = c10::GradMode;
+  using AutoGradMode = c10::AutoGradMode;
+  using NoGradGuard = c10::NoGradGuard;
+}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/interned_strings.h b/MLPY/Lib/site-packages/torch/include/ATen/core/interned_strings.h
new file mode 100644
index 0000000000000000000000000000000000000000..ff02c53f7f52afa3dcde61e7a84af865387c4f63
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/interned_strings.h
@@ -0,0 +1,358 @@
+#pragma once
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include 
+#include 
+
+namespace c10 {
+
+#define FORALL_NS_SYMBOLS(_)         \
+  _(namespaces, prim)                \
+  _(namespaces, prims)               \
+  _(namespaces, nvprims)             \
+  _(namespaces, aten)                \
+  _(namespaces, cuda)                \
+  _(namespaces, onnx)                \
+  _(namespaces, attr)                \
+  _(namespaces, scope)               \
+  _(namespaces, user)                \
+  _(namespaces, _caffe2)             \
+  _(namespaces, dimname)             \
+  _(namespaces, namespaces)          \
+  _(prim, Assign)                    \
+  _(prim, BroadcastingChunk)         \
+  _(prim, BroadcastSizes)            \
+  _(prim, ReductionSizes)            \
+  _(prim, Constant)                  \
+  _(prim, ChunkSizes)                \
+  _(prim, ConstantMKLDNNTensor)      \
+  _(prim, BroadcastMKLDNNTensors)    \
+  _(prim, MKLDNNGroup)               \
+  _(prim, MKLDNNHardSwish)           \
+  _(prim, MKLDNNHardSigmoid)         \
+  _(prim, MKLDNNHardTanh)            \
+  _(prim, MKLDNNClamp)               \
+  _(prim, StaticRuntimeCopyOuts)     \
+  _(prim, Drop)                      \
+  _(prim, Eval)                      \
+  _(prim, Expand) /* onnx */         \
+  _(prim, FusionGroup)               \
+  _(prim, CudaFusionGroup)           \
+  _(prim, CudaFusionGuard)           \
+  _(prim, oneDNNFusionGroup)         \
+  _(prim, oneDNNFusionGuard)         \
+  _(prim, FunctionalGraph)           \
+  _(prim, add_optional)              \
+  _(prim, view_copy)                 \
+  _(prim, permute_copy)              \
+  _(prim, reshape_copy)              \
+  _(prim, squeeze_copy)              \
+  _(prim, t_copy)                    \
+  _(prim, transpose_copy)            \
+  _(prim, unsqueeze_copy)            \
+  _(prim, flatten_copy)              \
+  _(prim, expand_copy)               \
+  _(prim, expand_as_copy)            \
+  _(prim, DifferentiableGraph)       \
+  _(prim, TensorExprGroup)           \
+  _(prim, TensorExprDynamicGroup)    \
+  _(prim, StaticSubgraph)            \
+  _(prim, If)                        \
+  _(prim, Jump) /* debug */          \
+  _(prim, JumpNZ) /* debug */        \
+  _(prim, JumpZ) /* debug */         \
+  _(prim, Load)                      \
+  _(prim, Loop)                      \
+  _(prim, Param)                     \
+  _(prim, PackPadded) /* onnx */     \
+  _(prim, PadPacked) /* onnx */      \
+  _(prim, Placeholder) /* debug */   \
+  _(prim, Print)                     \
+  _(prim, EmptyListLiteral)          \
+  _(prim, LegacyTypedConstructor)    \
+  _(prim, PythonOp)                  \
+  _(prim, IgnoredPythonOp)           \
+  _(prim, Reverse)                   \
+  _(prim, Return)                    \
+  _(prim, ReturnStmt)                \
+  _(prim, BreakStmt)                 \
+  _(prim, ContinueStmt)              \
+  _(prim, ComprehensionScope)        \
+  _(prim, Store)                     \
+  _(prim, AutogradZero)              \
+  _(prim, AutogradAnyNonZero)        \
+  _(prim, AutogradAllNonZero)        \
+  _(prim, AutogradAllZero)           \
+  _(prim, Starred)                   \
+  _(prim, TupleConstruct)            \
+  _(prim, TupleUnpack)               \
+  _(prim, TupleIndex)                \
+  _(prim, TupleSlice)                \
+  _(prim, ListConstruct)             \
+  _(prim, ListUnpack)                \
+  _(prim, DictConstruct)             \
+  _(prim, ModuleContainerIndex)      \
+  _(prim, EnumName)                  \
+  _(prim, EnumValue)                 \
+  _(prim, StringIndex)               \
+  _(prim, NumToTensor)               \
+  _(prim, Uninitialized)             \
+  _(prim, VarConcat)                 \
+  _(prim, VarStack)                  \
+  _(prim, With)                      \
+  _(prim, Enter)                     \
+  _(prim, Exit)                      \
+  _(prim, IfThenElse)                \
+  _(aten, Bool)                      \
+  _(aten, Int)                       \
+  _(aten, FloatImplicit)             \
+  _(aten, ComplexImplicit)           \
+  _(aten, IntImplicit)               \
+  _(aten, ScalarImplicit)            \
+  _(aten, Float)                     \
+  _(aten, Complex)                   \
+  _(aten, str)                       \
+  _(aten, Delete)                    \
+  _(prim, device)                    \
+  _(prim, dtype)                     \
+  _(prim, layout)                    \
+  _(prim, id)                        \
+  _(prim, requires_grad)             \
+  _(prim, MakeTestTensor) /* test */ \
+  _(prim, AutogradAdd)               \
+  _(prim, GradOf)                    \
+  _(aten, grad)                      \
+  _(aten, backward)                  \
+  _(prim, Guard)                     \
+  _(prim, BailOut)                   \
+  _(prim, TypeCheck)                 \
+  _(prim, RequiresGradCheck)         \
+  _(prim, FallbackGraph)             \
+  _(prim, FusedConcat)               \
+  _(prim, ConstantChunk)             \
+  _(prim, MMTreeReduce)              \
+  _(prim, MMBatchSide)               \
+  _(prim, list)                      \
+  _(prim, dict)                      \
+  _(prim, min)                       \
+  _(prim, max)                       \
+  _(prim, abs)                       \
+  _(aten, divmod)                    \
+  _(prim, zip)                       \
+  _(prim, enumerate)                 \
+  _(prim, range)                     \
+  _(prim, rangelist)                 \
+  _(prim, isinstance)                \
+  _(prim, tolist)                    \
+  _(prim, unchecked_cast)            \
+  _(aten, _grad_sum_to_size)         \
+  _(aten, _size_if_not_equal)        \
+  _(aten, _ncf_unsqueeze)            \
+  _(aten, warn)                      \
+  _(aten, sorted)                    \
+  _(aten, floordiv)                  \
+  _(aten, __range_length)            \
+  _(aten, __derive_index)            \
+  _(aten, __round_to_zero_floordiv)  \
+  _(aten, is_scripting)              \
+  _(aten, _unwrap_optional)          \
+  _(prim, fork)                      \
+  _(prim, awaitable)                 \
+  _(prim, forkClosure)               \
+  _(prim, awaitableClosure)          \
+  _(prim, awaitable_nowait)          \
+  _(prim, awaitable_wait)            \
+  _(prim, RaiseException)            \
+  _(prim, Closure)                   \
+  _(prim, CreateObject)              \
+  _(prim, SetAttr)                   \
+  _(prim, GetAttr)                   \
+  _(prim, HasAttr)                   \
+  _(prim, profile)                   \
+  _(prim, profile_ivalue)            \
+  _(prim, AddStatValue)              \
+  _(prim, TimePoint)                 \
+  _(prim, CallFunction)              \
+  _(prim, CallMethod)                \
+  _(prim, LoopContinuation)          \
+  _(prim, annotate)                  \
+  _(prim, TracedModuleForward)       \
+  _(prim, TracedFork)                \
+  _(prim, TracedAttr)                \
+  _(prim, rpc_async)                 \
+  _(prim, rpc_sync)                  \
+  _(prim, rpc_remote)                \
+  _(prim, is_cuda)                   \
+  _(aten, append)                    \
+  _(aten, as_tensor)                 \
+  _(aten, adaptive_avg_pool2d_backward) \
+  _(aten, dim)                       \
+  _(aten, format)                    \
+  _(aten, percentFormat)             \
+  _(aten, __not__)                   \
+  _(aten, __is__)                    \
+  _(aten, __isnot__)                 \
+  _(aten, _ger)                      \
+  _(aten, __getitem__)               \
+  _(aten, _set_item)                 \
+  _(aten, manual_seed)               \
+  _(aten, device)                    \
+  _(aten, hash)                      \
+  _(aten, len)                       \
+  _(aten, list)                      \
+  _(aten, dict)                      \
+  _(aten, wait)                      \
+  _(aten, save)                      \
+  _(aten, keys)                      \
+  _(aten, ord)                       \
+  _(aten, chr)                       \
+  _(aten, hex)                       \
+  _(aten, oct)                       \
+  _(aten, clear)                     \
+  _(aten, setdefault)                \
+  _(aten, bin)                       \
+  _(aten, pop)                       \
+  _(aten, insert)                    \
+  _(aten, tensor)                    \
+  _(prim, unchecked_unwrap_optional) \
+  _(aten, __contains__)              \
+  _(prim, BailoutTemplate)           \
+  _(prim, grad)                      \
+  _(cuda, _set_device)               \
+  _(cuda, set_stream)                \
+  _(cuda, _current_device)           \
+  _(cuda, synchronize)               \
+  _(aten, has_torch_function)        \
+  _(aten, is_autocast_enabled)       \
+  _(aten, is_autocast_cpu_enabled)   \
+  _(aten, is_autocast_xla_enabled)   \
+  FORALL_ATEN_BASE_SYMBOLS(_)        \
+  _(onnx, Add)                       \
+  _(onnx, Concat)                    \
+  _(onnx, Constant)                  \
+  _(onnx, ConstantFill)              \
+  _(onnx, Div)                       \
+  _(onnx, GRU)                       \
+  _(onnx, Gather)                    \
+  _(onnx, Gemm)                      \
+  _(onnx, LSTM)                      \
+  _(onnx, MatMul)                    \
+  _(onnx, Min)                       \
+  _(onnx, Max)                       \
+  _(onnx, Mul)                       \
+  _(onnx, Pow)                       \
+  _(onnx, RNN)                       \
+  _(onnx, Shape)                     \
+  _(onnx, Size)                      \
+  _(onnx, Slice)                     \
+  _(onnx, Softmax)                   \
+  _(onnx, Squeeze)                   \
+  _(onnx, Sub)                       \
+  _(onnx, Transpose)                 \
+  _(onnx, Unsqueeze)                 \
+  _(onnx, Loop)                      \
+  _(onnx, If)                        \
+  _(onnx, Reshape)                   \
+  _(onnx, Expand)                    \
+  _(onnx, Equal)                     \
+  _(onnx, Greater)                   \
+  _(onnx, GreaterOrEqual)            \
+  _(onnx, Less)                      \
+  _(onnx, LessOrEqual)               \
+  _(onnx, Not)                       \
+  _(aten, ATen)                      \
+  _(onnx, Split)                     \
+  _(onnx, ConstantOfShape)           \
+  _(onnx, Cast)                      \
+  _(onnx, Mod)                       \
+  _(onnx, Sqrt)                      \
+  _(onnx, SplitToSequence)           \
+  _(onnx, SequenceAt)                \
+  _(onnx, SequenceConstruct)         \
+  _(onnx, SequenceEmpty)             \
+  _(onnx, SequenceInsert)            \
+  _(onnx, SequenceErase)             \
+  _(onnx, ConcatFromSequence)        \
+  _(onnx, Identity)                  \
+  _(onnx, SoftmaxCrossEntropyLoss)   \
+  _(onnx, NegativeLogLikelihoodLoss) \
+  _(onnx, LogSoftmax)                \
+  _(onnx, ReduceL1)                  \
+  _(onnx, ReduceL2)                  \
+  _(onnx, Conv)                      \
+  _(onnx, BatchNormalization)        \
+  _(onnx, ReduceMean)                \
+  _(onnx, ReduceProd)                \
+  _(onnx, Relu)                      \
+  _(onnx, Neg)                       \
+  _(onnx, NonZero)                   \
+  _(onnx, Range)                     \
+  _(onnx, Tile)                      \
+  _(onnx, Where)                     \
+  _(onnx, Optional)                  \
+  _(onnx, OptionalGetElement)        \
+  _(onnx, OptionalHasElement)        \
+  FORALL_ATTR_BASE_SYMBOLS(_)        \
+  _(attr, Subgraph)                  \
+  _(attr, ReverseSubgraph)           \
+  _(attr, f_real_outputs)            \
+  _(attr, df_input_vjps)             \
+  _(attr, df_input_captured_inputs)  \
+  _(attr, df_input_captured_outputs) \
+  _(attr, df_output_vjps)            \
+  _(attr, axes)                      \
+  _(attr, symbolic_shape_inputs)     \
+  _(attr, allow_stack_outputs)       \
+  _(attr, striding_inputs_desc)      \
+  _(attr, striding_outputs_desc)     \
+  _(attr, broadcast)                 \
+  _(attr, direction)                 \
+  _(attr, ends)                      \
+  _(attr, inplace)                   \
+  _(attr, input_as_shape)            \
+  _(attr, is_zero)                   \
+  _(attr, num_none)                  \
+  _(attr, num_present)               \
+  _(attr, perm)                      \
+  _(attr, starts)                    \
+  _(attr, profiled_type)             \
+  _(attr, transA)                    \
+  _(attr, transB)                    \
+  _(attr, name)                      \
+  _(attr, module)                    \
+  _(attr, beg)                       \
+  _(attr, idx)                       \
+  _(attr, split)                     \
+  _(attr, slot)                      \
+  _(attr, kinds)                     \
+  _(attr, types)                     \
+  _(attr, scope)                     \
+  _(attr, keepdims)                  \
+  _(attr, cache_id)                  \
+  _(attr, new_axis)                  \
+  _(attr, warn_id)                   \
+  _(attr, output_layouts)            \
+  _(attr, allowzero)                 \
+  _(attr, seen_none)                 \
+  _(attr, overload_name)             \
+  _(attr, node_stack_idx)
+
+enum class _keys : unique_t {
+    #define DEFINE_KEY(ns, s) ns##_##s,
+    FORALL_NS_SYMBOLS(DEFINE_KEY)
+    #undef DEFINE_KEY
+    num_symbols
+};
+
+#define DEFINE_SYMBOL(ns, s) \
+  namespace ns { constexpr Symbol s(static_cast<unique_t>(_keys::ns##_##s)); }
+FORALL_NS_SYMBOLS(DEFINE_SYMBOL)
+#undef DEFINE_SYMBOL
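+// Editorial example: the expansion above defines constants such as
+// prim::Constant, aten::add, and attr::name, each a c10::Symbol whose
+// underlying value is the matching _keys enumerator, e.g.
+//   constexpr Symbol k = prim::Constant;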
+
+} // namespace c10
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/interned_strings_class.h b/MLPY/Lib/site-packages/torch/include/ATen/core/interned_strings_class.h
new file mode 100644
index 0000000000000000000000000000000000000000..ee651b41e66729816ce68b20b365689714aab086
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/interned_strings_class.h
@@ -0,0 +1,34 @@
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace c10 {
+
+struct TORCH_API InternedStrings {
+  InternedStrings();
+  Symbol symbol(const std::string& s);
+  std::pair<const char*, const char*> string(Symbol sym);
+  Symbol ns(Symbol sym);
+
+ private:
+  // prereq - holding mutex_
+  Symbol _symbol(const std::string& s);
+  std::pair<const char*, const char*> customString(Symbol sym);
+  std::unordered_map<std::string, Symbol> string_to_sym_;
+
+  struct SymbolInfo {
+    Symbol ns;
+    std::string qual_name;
+    std::string unqual_name;
+  };
+  std::vector<SymbolInfo> sym_to_info_;
+
+  std::mutex mutex_;
+};
+
+} // namespace c10
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/ivalue.h b/MLPY/Lib/site-packages/torch/include/ATen/core/ivalue.h
new file mode 100644
index 0000000000000000000000000000000000000000..3cae44fc00dbc6c6a0f814e641b47781a11a7fa1
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/ivalue.h
@@ -0,0 +1,1555 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace torch {
+class TORCH_API CustomClassHolder : public c10::intrusive_ptr_target {};
+namespace jit {
+using ::torch::CustomClassHolder;
+struct Function;
+struct CompilationUnit;
+struct Module;
+} // namespace jit
+} // namespace torch
+namespace c10 {
+template <class Key, class Value>
+class Dict;
+template <class T>
+class List;
+template <class T>
+class IListRef;
+struct IValue;
+struct ClassType;
+struct Type;
+class RRefInterface;
+
+struct ClassType;
+using ClassTypePtr = std::shared_ptr<ClassType>;
+
+TORCH_API bool _fastEqualsForContainer(const IValue& lhs, const IValue& rhs);
+
+TORCH_API torch::jit::Function* checkObjectSortSchema(
+    const c10::ClassTypePtr& t,
+    std::stringstream& why_not);
+
+// A comparator that checks ordering of two IValues of same type.
+typedef std::function<bool(const IValue& a, const IValue& b)> IValueComparator;
+
+TORCH_API IValueComparator getLessThanComparator(const IValue& v);
+TORCH_API IValueComparator getGreaterThanComparator(const IValue& v);
+
+namespace ivalue {
+struct Tuple;
+struct Future;
+struct Await;
+struct ConstantString;
+struct GenericDict;
+struct Object;
+struct PyObjectHolder;
+struct EnumHolder;
+// We need a ComplexHolder because currently the payloads in the Union
+// only take 64 bits. Since ComplexDouble takes up 128 bits, and is too big
+// to fit in the IValue directly, we indirect complex numbers through an
+// intrusive pointer to ComplexHolder (which contains a c10::complex<double>).
+struct ComplexHolder : c10::intrusive_ptr_target {
+ public:
+  template <typename T>
+  ComplexHolder(c10::complex<T> c) {
+    val = convert<decltype(val), c10::complex<T>>(c);
+  }
+  ComplexHolder() = default;
+  c10::complex<double> val;
+};
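+// Editorial sketch: constructing an IValue from a c10::complex<double> boxes
+// it through a ComplexHolder behind an intrusive_ptr, since the 8-byte
+// payload cannot hold 128 bits directly, e.g.
+//   c10::IValue iv(c10::complex<double>(1.0, 2.0));
+//   auto z = iv.toComplexDouble();  // c10::complex<double>(1.0, 2.0)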
+
+// Similar to ComplexHolder, for StreamData3
+struct StreamData3Holder : c10::intrusive_ptr_target {
+ public:
+  StreamData3Holder(struct c10::StreamData3 d) : val(d) {}
+  StreamData3Holder() = delete;
+  struct c10::StreamData3 val;
+};
+
+} // namespace ivalue
+
+// This is an owning wrapper for a c10::optional<std::vector<T>>
+// that can be implicitly converted to a (non-owning) optional<ArrayRef<T>>.
+// Its purpose is to be used in generated code to keep the vector alive
+// either until the end of a statement (as a temporary), or as a saved arg
+// in autograd.
+template <typename T>
+struct OptionalArray {
+  c10::optional<std::vector<T>> list;
+
+  OptionalArray() = default;
+  OptionalArray(std::vector<T> val) : list(std::move(val)) {}
+
+  // Used when saving an argument for the backwards pass.
+  OptionalArray& operator=(c10::optional<ArrayRef<T>> ref) {
+    if (ref) {
+      list = std::vector<T>(ref->begin(), ref->end());
+    } else {
+      list = nullopt;
+    }
+    return *this;
+  }
+
+  // Used when saving an argument for the backwards pass.
+  OptionalArray& operator=(c10::OptionalArrayRef<T> ref) {
+    if (ref) {
+      list = std::vector<T>(ref->begin(), ref->end());
+    } else {
+      list = nullopt;
+    }
+    return *this;
+  }
+
+  operator c10::optional<c10::ArrayRef<T>>() {
+    if (!list) {
+      return nullopt;
+    }
+    return *list;
+  }
+
+  operator c10::OptionalArrayRef<T>() {
+    if (!list) {
+      return nullopt;
+    }
+    return *list;
+  }
+};
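+// Editorial usage sketch: OptionalArray keeps an optional list alive while
+// still converting to the non-owning view types, e.g.
+//   c10::OptionalArray<int64_t> sizes;
+//   sizes = c10::OptionalArrayRef<int64_t>({2, 3});  // copied into the owned vector
+//   c10::OptionalArrayRef<int64_t> view = sizes;     // non-owning view of the copy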
+
+// Capsule is an internal implementation detail of custom C++ classes. We
+// define it as an owning wrapper for
+// c10::intrusive_ptr<torch::CustomClassHolder>. This wrapper serves as
+// an abstraction over the type-erased custom class object pointer. It also
+// allows pybind11 to treat this as a standalone class to register as a
+// separate type caster, instead of as a custom pointer holder, which the
+// pointer-holder type caster would try to "unwrap" automatically.
+struct Capsule {
+  c10::intrusive_ptr<torch::CustomClassHolder> obj_ptr;
+  explicit Capsule(c10::intrusive_ptr<torch::CustomClassHolder> ptr)
+      : obj_ptr(std::move(ptr)) {}
+};
+
+// IValue is the generic tagged union used by the interpreter to hold
+// all value types.
+// It is a 16-byte object with an 8-byte payload and an 8-byte tag.
+// The tag is currently 4 bytes to determine the type, and 1 byte
+// to mark whether that type is a subtype of c10::intrusive_ptr_target and needs
+// retain/release calls.
+
+#define TORCH_FORALL_TAGS(_) \
+  _(None)                    \
+  _(Tensor)                  \
+  _(Storage)                 \
+  _(Double)                  \
+  _(ComplexDouble)           \
+  _(Int)                     \
+  _(SymInt)                  \
+  _(SymFloat)                \
+  _(SymBool)                 \
+  _(Bool)                    \
+  _(Tuple)                   \
+  _(String)                  \
+  _(Blob)                    \
+  _(GenericList)             \
+  _(GenericDict)             \
+  _(Future)                  \
+  _(Await)                   \
+  _(Device)                  \
+  _(Stream)                  \
+  _(Object)                  \
+  _(PyObject)                \
+  _(Uninitialized)           \
+  _(Capsule)                 \
+  _(RRef)                    \
+  _(Quantizer)               \
+  _(Generator)               \
+  _(Enum)
+
+// [doxygen private]
+// These methods are not actually private but we don't want to document them, so
+// they are marked `@private`, which hides them on the doxygen documentation for
+// this page.
+
+/// IValue (Interpreter Value) is a tagged union over the types
+/// supported by the TorchScript interpreter. IValues contain their
+/// values as an `IValue::Payload`, which holds primitive types
+/// (`int64_t`, `bool`, `double`, `Device`) and `Tensor` as values,
+/// and all other types as a `c10::intrusive_ptr`. In order to
+/// optimize performance of the destructor and related operations by
+/// making the `Tensor` and `c10::intrusive_ptr` paths generate the
+/// same code, we represent a null `c10::intrusive_ptr` as
+/// `UndefinedTensorImpl::singleton()`, *not* `nullptr`.
+///
+/// IValues are used as inputs to and outputs from the TorchScript interpreter.
+/// To retrieve the value contained within an IValue, use the `.toX()` methods,
+/// where `X` is the type you are trying to get. Note that neither the `.toX()`
+/// methods nor the templated `.to` functions do any kind of casting, they
+/// only unwrap the contained value. For example:
+///
+/// \rst
+/// .. code-block:: cpp
+///
+///   // Make the IValue
+///   torch::IValue my_ivalue(26);
+///   std::cout << my_ivalue << "\n";
+///
+///   // Unwrap the IValue
+///   int64_t my_int = my_ivalue.toInt();
+///   std::cout << my_int << "\n";
+///
+///   // This will throw an error!
+///   // `my_ivalue` is tagged as an int and cannot be used as another type
+///   torch::Tensor my_tensor = my_ivalue.toTensor();
+/// \endrst
+struct TORCH_API IValue final {
+  IValue(const IValue& rhs) : IValue(rhs.payload, rhs.tag) {
+    if (isIntrusivePtr() &&
+        payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton()) {
+      c10::raw::intrusive_ptr::incref(payload.u.as_intrusive_ptr);
+    }
+  }
+
+  IValue(IValue&& rhs) noexcept : tag(rhs.tag) {
+    moveFrom(std::move(rhs));
+  }
+
+  /// @private [doxygen private]
+  ~IValue() {
+    destroy();
+  }
+
+  C10_ALWAYS_INLINE IValue& operator=(IValue&& rhs) & noexcept {
+    if (&rhs == this) {
+      return *this;
+    }
+
+    destroy();
+    moveFrom(std::move(rhs));
+    return *this;
+  }
+
+  IValue& operator=(IValue const& rhs) & {
+    *this = IValue(rhs);
+    return *this;
+  }
+
+  void dump() const;
+
+  /**
+   * Equality comparison. The semantics are the same as Python's `==`:
+   * 1. Numerical types are compared by value.
+   * 2. Tensors compute element-wise equality, returning a BoolTensor (see:
+   * `torch.eq()`)
+   * 3. Strings are compared by value.
+   * 4. Sequence types (list, tuple) are compared lexicographically by
+   *    comparing their elements. Different sequence types never compare equal.
+   * 5. Mappings (dict) must have equal (key, value) pairs.
+   * 6. If not listed above, the default behavior is to test identity
+   * equality (e.g. pointer equality).
+   *
+   * Why does this return an IValue instead of a bool? Because in PyTorch,
+   * `tensor1 == tensor2` returns a `BoolTensor`, not a bool.
+   *
+   * NOTE: we (like Python) assume that identity equality implies value equality
+   * for efficiency.
+   * TODO: need to support customizing equality
+   */
+  IValue equals(const IValue& rhs) const;
+  /**
+   * This implements the same semantics as `bool(lhs == rhs)` in Python, which
+   * is the same as `equals()` except for Tensor types.
+   */
+  TORCH_API friend bool operator==(const IValue& lhs, const IValue& rhs);
+  TORCH_API friend bool operator!=(const IValue& lhs, const IValue& rhs);
+
+  /**
+   * Identity comparison. Checks if `this` is the same object as `rhs`. The
+   * semantics are the same as Python's `is` operator.
+   *
+   * NOTE: Like in Python, this operation is poorly defined for primitive types
+   * like numbers and strings. Prefer to use `==` unless you really want to
+   * check identity equality.
+   */
+  bool is(const IValue& rhs) const;
+
+  /**
+   * Hashing for IValues. Returns an IValue-boxed int.
+   *
+   * Some notes:
+   * - Like eager, Tensors are hashed by looking at the pointer. This is not
+   *   strictly correct because two value-equal tensors with different tensor
+   *   pointers will hash differently, but we choose to reproduce the eager
+   *   semantics.
+   * - Hashing is not defined on all built-in IValue types (e.g. list and
+   *   dict), following Python. Calling `hash()` on these types will throw.
+   */
+  IValue hash() const {
+    return (int64_t)IValue::hash(*this);
+  }
+  // This is defined because `c10::hash` dispatches to a function of this
+  // signature. See the member function `hash()`.
+  static size_t hash(const IValue& iv);
+
+  /**
+   * @private [doxygen private]
+   * [container equality]
+   * This is an equality implementation that assumes objects with the same
+   * identity equal themselves, for efficiency reasons. We primarily have this
+   * for consistency, because Python does the same thing. This actually
+   * provokes user-visible changes in behavior due to quirks in torch:
+   *      [tensor1] == [tensor1] -> True (because container equality will first
+   *        compare identity)
+   *      [tensor1] == [tensor1_copy] -> RuntimeError:
+   *        Boolean value of Tensor with more than one value is ambiguous
+   */
+  TORCH_API friend bool _fastEqualsForContainer(
+      const IValue& lhs,
+      const IValue& rhs);
+
+ private:
+  static bool isAliasOf(const at::Tensor& a, const at::Tensor& b) {
+    if (a.is_sparse()) {
+      return isAliasOf(a._values(), b) || isAliasOf(a._indices(), b);
+    }
+    if (b.is_sparse()) {
+      return isAliasOf(a, b._values()) || isAliasOf(a, b._indices());
+    }
+    if (a.is_sparse_csr()) {
+      return isAliasOf(a.values(), b) || isAliasOf(a.crow_indices(), b) ||
+          isAliasOf(a.col_indices(), b);
+    }
+    if (b.is_sparse_csr()) {
+      return isAliasOf(a, b.values()) || isAliasOf(a, b.crow_indices()) ||
+          isAliasOf(a, b.col_indices());
+    }
+
+    // Opaque tensors such as the ones constructed by the MKL-DNN backend
+    // don't have storage so we just compare their TensorImpls.
+    // TODO: Find way to expose alias info for opaque tensors.
+    if (!a.has_storage() || !b.has_storage()) {
+      return a.unsafeGetTensorImpl() == b.unsafeGetTensorImpl();
+    }
+
+    return a.is_alias_of(b);
+  }
+
+  template <typename T>
+  bool isListOf() const;
+
+ public:
+  /// @private [doxygen private]
+  bool isAliasOf(const IValue& rhs) const {
+    if (this->tag != rhs.tag) {
+      // Trivially don't alias if the type is different
+      return false;
+    }
+
+    // Tensors should be compared based on internal storage
+    if (this->isTensor()) {
+      return isAliasOf(this->toTensor(), rhs.toTensor());
+    }
+
+    if (!isIntrusivePtr()) {
+      // Primitive types don't alias anything
+      return false;
+    }
+
+    AT_ASSERT(rhs.isIntrusivePtr());
+
+    // Other types can be compared by their ptr value
+    return this->payload.u.as_intrusive_ptr == rhs.payload.u.as_intrusive_ptr;
+  }
+
+  /// @private [doxygen private]
+  size_t use_count() const noexcept {
+    if (isTensor()) {
+      return payload.as_tensor.use_count();
+    }
+
+    if (!isIntrusivePtrLegacyBehavior()) {
+      return 1;
+    }
+
+    if (payload.u.as_intrusive_ptr == c10::UndefinedTensorImpl::singleton()) {
+      return 0;
+    }
+    return c10::raw::intrusive_ptr::use_count(payload.u.as_intrusive_ptr);
+  }
+
+  /// @private [doxygen private]
+  void swap(IValue& rhs) noexcept {
+    if (isTensor() && rhs.isTensor()) {
+      std::swap(payload.as_tensor, rhs.payload.as_tensor);
+    } else if (isTensor()) {
+      at::Tensor t = std::move(payload.as_tensor);
+      // As far as I can tell, omitting the usual explicit destructor call
+      // is not UB in and of itself, and it's a slight perf win. The
+      // destructor is a no-op, because the moved-from Tensor is
+      // effectively an intrusive_ptr in the null state, so we don't need
+      // the behavior for correctness reasons either. Leaving this
+      // explanatory comment, including commented-out destructor call, to
+      // make this abundantly clear.
+      //
+      // payload.as_tensor.~Tensor();
+      payload.u = rhs.payload.u;
+      new (&rhs.payload.as_tensor) at::Tensor(std::move(t));
+    } else if (rhs.isTensor()) {
+      rhs.swap(*this);
+      return;
+    } else {
+      std::swap(payload.u, rhs.payload.u);
+    }
+    std::swap(tag, rhs.tag);
+  }
+
+  // Accessors for subtypes are arranged together below
+  // While some of these accessors could be generated through templates,
+  // we prefer to write them manually for clarity
+
+  IValue(at::TensorBase t) : tag(Tag::Tensor) {
+    new (&payload.as_tensor) at::Tensor(std::move(t));
+  }
+  bool isTensor() const {
+    return Tag::Tensor == tag;
+  }
+
+ private:
+  // Outlined error path so that toTensor() can be inlined.
+  [[noreturn]] void reportToTensorTypeError() const;
+
+ public:
+  at::Tensor toTensor() &&;
+  at::Tensor& toTensor() &;
+  const at::Tensor& toTensor() const&;
+  at::TensorImpl* unsafeToTensorImpl() const {
+    TORCH_INTERNAL_ASSERT(isTensor());
+    return payload.as_tensor.unsafeGetTensorImpl();
+  }
+
+  IValue(at::Storage s) : tag(Tag::Storage) {
+    payload.u.as_intrusive_ptr =
+        null_to_undefined_tensor(s.unsafeReleaseStorageImpl());
+  }
+  bool isStorage() const {
+    return Tag::Storage == tag;
+  }
+  c10::Storage toStorage() &&;
+  c10::Storage toStorage() const&;
+
+  const IValue& toIValue() const {
+    return *this;
+  }
+  IValue& toIValue() {
+    return *this;
+  }
+
+  /// @private [doxygen private]
+  IValue(intrusive_ptr<caffe2::Blob> blob) : tag(Tag::Blob) {
+    // TODO (after Tensor merge) If we pass in a Blob holding a Tensor, extract
+    // and store it as a Tensor instead.
+    payload.u.as_intrusive_ptr = null_to_undefined_tensor(blob.release());
+  }
+
+  /// @private [doxygen private]
+  bool isBlob() const {
+    return Tag::Blob == tag;
+  }
+
+  /// @private [doxygen private]
+  c10::intrusive_ptr<caffe2::Blob> toBlob() &&;
+
+  /// @private [doxygen private]
+  c10::intrusive_ptr<caffe2::Blob> toBlob() const&;
+
+  // Capsule. No new callsites of these APIs should
+  // be introduced.
+  static inline IValue make_capsule(
+      intrusive_ptr<torch::CustomClassHolder> blob);
+  bool isCapsule() const {
+    return Tag::Capsule == tag;
+  }
+  c10::intrusive_ptr<torch::CustomClassHolder> toCapsule() &&;
+  c10::intrusive_ptr<torch::CustomClassHolder> toCapsule() const&;
+
+  // Custom C++ classes
+  template <
+      typename T,
+      std::enable_if_t<
+          std::is_base_of<torch::CustomClassHolder, T>::value,
+          int> = 0>
+  IValue(intrusive_ptr<T> custom_class);
+  bool isCustomClass() const;
+  template <typename T>
+  c10::intrusive_ptr<T> toCustomClass() &&;
+  template <typename T>
+  c10::intrusive_ptr<T> toCustomClass() const&;
+
+  // Tuple
+  IValue(c10::intrusive_ptr<ivalue::Tuple> v);
+
+  template <
+      typename... Args,
+      std::enable_if_t<
+          !std::disjunction<
+              std::is_lvalue_reference<Args>...,
+              std::negation<std::is_constructible<IValue, Args>>...>::value,
+          std::nullptr_t> = nullptr>
+  IValue(const std::tuple<Args...>& t);
+  template <
+      typename... Args,
+      std::enable_if_t<
+          !std::disjunction<
+              std::is_lvalue_reference<Args>...,
+              std::negation<std::is_constructible<IValue, Args>>...>::value,
+          std::nullptr_t> = nullptr>
+  IValue(std::tuple<Args...>&& t);
+  bool isTuple() const {
+    return Tag::Tuple == tag;
+  }
+  c10::intrusive_ptr<ivalue::Tuple> toTuple() &&;
+  c10::intrusive_ptr<ivalue::Tuple> toTuple() const&;
+  C10_NODISCARD ivalue::Tuple& toTupleRef() const;
+
+  // Double
+  IValue(double d) : tag(Tag::Double) {
+    payload.u.as_double = d;
+  }
+  bool isDouble() const {
+    return Tag::Double == tag;
+  }
+  double toDouble() const {
+    AT_ASSERT(isDouble());
+    return payload.u.as_double;
+  }
+
+  // ComplexDouble
+  template <typename T>
+  IValue(c10::complex<T> c);
+  bool isComplexDouble() const {
+    return Tag::ComplexDouble == tag;
+  }
+  c10::complex<double> toComplexDouble() const;
+
+  // Future
+  IValue(c10::intrusive_ptr<ivalue::Future> v);
+  bool isFuture() const {
+    return Tag::Future == tag;
+  }
+  c10::intrusive_ptr<ivalue::Future> toFuture() &&;
+  c10::intrusive_ptr<ivalue::Future> toFuture() const&;
+
+  IValue(c10::intrusive_ptr<ivalue::Await> v);
+  bool isAwait() const {
+    return Tag::Await == tag;
+  }
+  c10::intrusive_ptr<ivalue::Await> toAwait() &&;
+  c10::intrusive_ptr<ivalue::Await> toAwait() const&;
+
+  // RRef
+  IValue(c10::intrusive_ptr<c10::RRefInterface> v);
+  bool isRRef() const {
+    return Tag::RRef == tag;
+  }
+  c10::intrusive_ptr<c10::RRefInterface> toRRef() &&;
+  c10::intrusive_ptr<c10::RRefInterface> toRRef() const&;
+
+  // Quantizer
+  IValue(c10::intrusive_ptr<at::Quantizer> v);
+  bool isQuantizer() const {
+    return Tag::Quantizer == tag;
+  }
+  c10::intrusive_ptr<at::Quantizer> toQuantizer() &&;
+  c10::intrusive_ptr<at::Quantizer> toQuantizer() const&;
+
+  // Int
+  IValue(int64_t i) : tag(Tag::Int) {
+    payload.u.as_int = i;
+  }
+
+  IValue(const c10::SymInt& i) {
+    if (auto mi = i.maybe_as_int()) {
+      tag = Tag::Int;
+      payload.u.as_int = *mi;
+    } else {
+      tag = Tag::SymInt;
+      payload.u.as_intrusive_ptr = i.toSymNode().release();
+    }
+  }
+
+  bool isSymInt() const {
+    return Tag::SymInt == tag;
+  }
+
+  c10::SymInt toSymInt() &&;
+  c10::SymInt toSymInt() const&;
+
+  IValue(const c10::SymFloat& i) {
+    if (i.is_symbolic()) {
+      tag = Tag::SymFloat;
+      payload.u.as_intrusive_ptr = i.toSymNodeImpl().release();
+    } else {
+      tag = Tag::Double;
+      payload.u.as_double = i.as_float_unchecked();
+    }
+  }
+
+  bool isSymFloat() const {
+    return Tag::SymFloat == tag;
+  }
+
+  c10::SymFloat toSymFloat() &&;
+  c10::SymFloat toSymFloat() const&;
+
+  IValue(const c10::SymBool& i) {
+    if (auto mi = i.maybe_as_bool()) {
+      tag = Tag::Bool;
+      payload.u.as_int = *mi;
+    } else {
+      tag = Tag::SymBool;
+      payload.u.as_intrusive_ptr = i.toSymNodeImpl().release();
+    }
+  }
+
+  bool isSymBool() const {
+    return Tag::SymBool == tag;
+  }
+
+  c10::SymBool toSymBool() &&;
+  c10::SymBool toSymBool() const&;
+
+  // allow you to pass literals (3, 4) without ambiguity
+  IValue(int32_t i) : IValue(static_cast<int64_t>(i)) {}
+
+  bool isInt() const {
+    return Tag::Int == tag;
+  }
+
+  int64_t toInt() const {
+    AT_ASSERT(isInt());
+    return payload.u.as_int;
+  }
+
+  // Bool
+  IValue(bool b) : tag(Tag::Bool) {
+#if defined(__clang__) && defined(__x86_64__)
+    // Initializing the entire payload stops valgrind from reporting
+    // "jump or move depends on uninitialised value" in the IValue copy constructor.
+    // See https://github.com/pytorch/pytorch/issues/37117
+    payload.u.as_int = b;
+#else
+    payload.u.as_bool = b;
+#endif
+  }
+  bool isBool() const {
+    return Tag::Bool == tag;
+  }
+  bool toBool() const {
+    AT_ASSERT(isBool());
+    return payload.u.as_bool;
+  }
+
+  // IntList
+  bool isIntList() const;
+  bool isSymIntList() const;
+  c10::List<int64_t> toIntList() &&;
+  c10::List<int64_t> toIntList() const&;
+  std::vector<int64_t> toIntVector() const;
+  std::vector<c10::SymInt> toSymIntVector() const;
+  at::DimVector toDimVector() const;
+
+  // ConstantString
+  IValue(c10::intrusive_ptr<ivalue::ConstantString> v);
+  IValue(std::string v);
+  IValue(const char* v) : IValue(std::string(v)) {}
+  IValue(c10::string_view v) : IValue(std::string(v)){};
+  bool isString() const {
+    return Tag::String == tag;
+  }
+  c10::intrusive_ptr<ivalue::ConstantString> toString() &&;
+  c10::intrusive_ptr<ivalue::ConstantString> toString() const&;
+  const std::string& toStringRef() const;
+  c10::optional<std::reference_wrapper<const std::string>> toOptionalStringRef()
+      const;
+  c10::string_view toStringView() const;
+
+  // DoubleList
+  bool isDoubleList() const;
+  c10::List<double> toDoubleList() &&;
+  c10::List<double> toDoubleList() const&;
+  std::vector<double> toDoubleVector() const;
+
+  // ComplexDoubleList
+  bool isComplexDoubleList() const;
+  c10::List<c10::complex<double>> toComplexDoubleList() &&;
+  c10::List<c10::complex<double>> toComplexDoubleList() const&;
+  std::vector<c10::complex<double>> toComplexDoubleVector() const;
+
+  // BoolList
+  bool isBoolList() const;
+  c10::List<bool> toBoolList() &&;
+  c10::List<bool> toBoolList() const&;
+
+  // TensorList
+  bool isTensorList() const;
+  c10::List<at::Tensor> toTensorList() &&;
+  c10::List<at::Tensor> toTensorList() const&;
+  std::vector<at::Tensor> toTensorVector() const;
+
+  // OptionalTensorList
+  bool isOptionalTensorList() const;
+  c10::List<c10::optional<at::Tensor>> toOptionalTensorList() &&;
+  c10::List<c10::optional<at::Tensor>> toOptionalTensorList() const&;
+  std::vector<c10::optional<at::Tensor>> toOptionalTensorVector() const;
+
+  // GenericList
+  IValue(c10::List<IValue> v);
+  bool isList() const {
+    return Tag::GenericList == tag;
+  }
+  c10::List<IValue> toList() &&;
+  c10::List<IValue> toList() const&;
+  c10::ArrayRef<IValue> toListRef() const;
+
+  // Some template constructors of IValue calls another constructor recursively.
+  // This SFINAEs the called constructor exists.
+  template 
+  using enable_if_ivalue_constructible =
+      std::enable_if_t::value, std::nullptr_t>;
+
+  // The rule for lists is more complicated; the generic constructor is only
+  // acceptable if your element isn't SymInt.  If you do have a SymInt element,
+  // then you must also, at construction time, check if you can decay the list
+  // into an int list (this is MANDATORY, as at a use site we may expect
+  // toIntList to work even if at the call site you had a SymIntArrayRef
+  // argument).  In practice, only SymIntArrayRef is used this way, so we
+  // didn't bother making it work for the other constructors, we just make sure
+  // they're not selectable.
+  template <class T>
+  using enable_if_list_is_ivalue_constructible = std::enable_if_t<
+      std::is_constructible<IValue, T>::value &&
+          !std::is_same<T, c10::SymInt>::value,
+      std::nullptr_t>;
+
+  template <class T, enable_if_list_is_ivalue_constructible<T> = nullptr>
+  IValue(c10::List<T>&& v);
+  template <class T, enable_if_list_is_ivalue_constructible<T> = nullptr>
+  IValue(const c10::List<T>& v);
+  template <class T, enable_if_list_is_ivalue_constructible<T> = nullptr>
+  IValue(at::ArrayRef<T> v);
+  template <class T, enable_if_list_is_ivalue_constructible<T> = nullptr>
+  IValue(const std::vector<T>& v);
+  template <class T, enable_if_list_is_ivalue_constructible<T> = nullptr>
+  IValue(std::vector<T>&& v);
+  template <class T, size_t N>
+  IValue(std::array<T, N> v);
+
+  // Manual constructors for lists of symints, which decay to int list if
+  // possible.  To avoid ambiguous overload situations, we template them
+  // to prevent implicit conversions
+  template <class T>
+  using enable_if_symint =
+      std::enable_if_t<std::is_same<T, c10::SymInt>::value, std::nullptr_t>;
+
+  template <class T, enable_if_symint<T> = nullptr>
+  IValue(at::ArrayRef<T> v);
+  template <class T, enable_if_symint<T> = nullptr>
+  IValue(at::OptionalArrayRef<T> v);
+  template <class T, enable_if_symint<T> = nullptr>
+  IValue(const std::vector<T>& v);
+  template <class T, enable_if_symint<T> = nullptr>
+  IValue(std::vector<T>&& v);
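+
+  // [Editorial note, illustrative only -- not part of the upstream header]
+  // As the comment above describes, constructing an IValue from a SymInt list
+  // whose elements all carry plain integer hints is expected to decay to an
+  // ordinary int list, so a later toIntList() at a use site still works; only
+  // genuinely symbolic elements keep the SymIntList representation.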
+
+
+  template <class T>
+  using enable_if_ilist_is_ivalue_constructible = std::enable_if_t<
+      std::is_constructible<IValue, T>::value &&
+          std::is_constructible<IValue, typename IListRef<T>::boxed_type>::
+              value &&
+          !std::is_same<T, c10::SymInt>::value,
+      std::nullptr_t>;
+
+  template <class T, enable_if_ilist_is_ivalue_constructible<T> = nullptr>
+  IValue(c10::IListRef<T> v);
+
+  // GenericDict
+  IValue(c10::Dict<IValue, IValue> v);
+  bool isGenericDict() const {
+    return Tag::GenericDict == tag;
+  }
+  c10::Dict<IValue, IValue> toGenericDict() &&;
+  c10::Dict<IValue, IValue> toGenericDict() const&;
+
+  template <class Key, class Value>
+  IValue(c10::Dict<Key, Value> v);
+
+  template <class Key, class Value>
+  /// \cond
+  /// DOXYGEN_CANNOT_HANDLE_CONSTRUCTORS_WITH_MACROS_SO_EXCLUDE_THIS_LINE_FROM_DOXYGEN
+  C10_DEPRECATED_MESSAGE(
+      "IValues based on std::unordered_map are slow and deprecated. Please use c10::Dict instead.")
+      /// \endcond
+      IValue(std::unordered_map<Key, Value> v);
+
+  template <class T, enable_if_ivalue_constructible<T> = nullptr>
+  IValue(c10::optional<T> v);
+  template <class T, enable_if_list_is_ivalue_constructible<T> = nullptr>
+  IValue(c10::OptionalArrayRef<T> v);
+  IValue(c10::nullopt_t);
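+
+  // [Editorial note, illustrative only -- not part of the upstream header]
+  // Sketch of the dict constructors above, using a typed c10::Dict:
+  //   c10::Dict<std::string, int64_t> d;
+  //   d.insert("answer", 42);
+  //   c10::IValue iv(d);                  // boxed as a GenericDict
+  //   auto generic = iv.toGenericDict();  // c10::Dict<IValue, IValue> view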
+
+  // ClassType
+  IValue(c10::intrusive_ptr<ivalue::Object> v);
+  bool isObject() const {
+    return tag == Tag::Object;
+  }
+  c10::intrusive_ptr<ivalue::Object> toObject() &&;
+  c10::intrusive_ptr<ivalue::Object> toObject() const&;
+  ivalue::Object& toObjectRef() const;
+
+  torch::jit::Module toModule() const;
+  bool isModule() const;
+
+  // PyObject
+  IValue(c10::intrusive_ptr<ivalue::PyObjectHolder> v);
+  bool isPyObject() const {
+    return tag == Tag::PyObject;
+  }
+  c10::intrusive_ptr<ivalue::PyObjectHolder> toPyObjectHolder() &&;
+  c10::intrusive_ptr<ivalue::PyObjectHolder> toPyObjectHolder() const&;
+  PyObject* toPyObject() const;
+
+  // Enum
+  explicit IValue(c10::intrusive_ptr<ivalue::EnumHolder> v);
+  bool isEnum() const {
+    return tag == Tag::Enum;
+  }
+  c10::intrusive_ptr<ivalue::EnumHolder> toEnumHolder() &&;
+  c10::intrusive_ptr<ivalue::EnumHolder> toEnumHolder() const&;
+
+  // None
+  IValue() : tag(Tag::None) {}
+  bool isNone() const {
+    return Tag::None == tag;
+  }
+  std::string toNone() const {
+    AT_ASSERT(isNone());
+    return "None";
+  }
+
+  static IValue uninitialized() {
+    auto i = IValue();
+    i.tag = Tag::Uninitialized;
+    return i;
+  }
+
+  // Scalar, which gets encoded as either an Int, a Double or a ComplexDouble
+  IValue(const at::Scalar& s) : IValue() {
+    // NB: do the symbolic versions first, as isFloatingPoint is true
+    // for both SymFloat and double
+    if (s.isSymInt()) {
+      tag = Tag::SymInt;
+      payload.u.as_intrusive_ptr = s.toSymInt().toSymNode().release();
+    } else if (s.isSymFloat()) {
+      tag = Tag::SymFloat;
+      payload.u.as_intrusive_ptr = s.toSymFloat().toSymNodeImpl().release();
+    } else if (s.isSymBool()) {
+      tag = Tag::SymBool;
+      payload.u.as_intrusive_ptr = s.toSymBool().toSymNodeImpl().release();
+    } else if (s.isFloatingPoint()) {
+      tag = Tag::Double;
+      payload.u.as_double = s.toDouble();
+    } else if (s.isComplex()) {
+      *this = s.toComplexDouble();
+    } else if (s.isBoolean()) {
+      tag = Tag::Bool;
+      payload.u.as_bool = s.toBool();
+    } else {
+      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+          s.isIntegral(false), "Unknown type in Scalar");
+      tag = Tag::Int;
+      payload.u.as_int = s.toLong();
+    }
+  }
+
+  bool isScalar() const {
+    return isDouble() || isInt() || isComplexDouble() || isBool() ||
+        isSymInt() || isSymFloat() || isSymBool();
+  }
+
+  at::Scalar toScalar() const {
+    if (isDouble())
+      return toDouble();
+    else if (isInt())
+      return toInt();
+    else if (isComplexDouble())
+      return toComplexDouble();
+    else if (isBool())
+      return toBool();
+    else if (isSymInt())
+      return toSymInt();
+    else if (isSymFloat())
+      return toSymFloat();
+    else if (isSymBool())
+      return toSymBool();
+    throw std::runtime_error("IValue is not a Scalar");
+  }
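+
+  // [Editorial note, illustrative only -- not part of the upstream header]
+  // Round-tripping a Scalar through IValue, e.g. for a double:
+  //   c10::IValue iv(at::Scalar(2.5));  // tagged as Double
+  //   TORCH_INTERNAL_ASSERT(iv.isScalar());
+  //   double d = iv.toScalar().toDouble();  // 2.5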
+
+  // Device
+  IValue(c10::Device d) : tag(Tag::Device) {
+    payload.u.as_device.type = d.type();
+    payload.u.as_device.index = d.index();
+  }
+  bool isDevice() const {
+    return Tag::Device == tag;
+  }
+  c10::Device toDevice() const {
+    AT_ASSERT(isDevice());
+    return c10::Device(payload.u.as_device.type, payload.u.as_device.index);
+  }
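+
+  // [Editorial note, illustrative only -- not part of the upstream header]
+  // Device is stored inline in the payload (type + index), e.g.:
+  //   c10::IValue dev(c10::Device(c10::DeviceType::CPU));
+  //   TORCH_INTERNAL_ASSERT(dev.isDevice() && dev.toDevice().is_cpu());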
+
+  // Stream
+  IValue(c10::Stream s) : tag(Tag::Stream) {
+    auto v = c10::make_intrusive<ivalue::StreamData3Holder>(s.pack3());
+    payload.u.as_intrusive_ptr = v.release();
+  }
+  c10::Stream toStream() &&;
+  c10::Stream toStream() const&;
+  bool isStream() const {
+    return Tag::Stream == tag;
+  }
+
+  // ScalarType
+  IValue(ScalarType t)
+      : IValue(static_cast<std::underlying_type<ScalarType>::type>(t)) {}
+  at::ScalarType toScalarType() const {
+    return static_cast<at::ScalarType>(toInt());
+  }
+
+  // Layout
+  IValue(Layout l)
+      : IValue(static_cast<std::underlying_type<Layout>::type>(l)) {}
+  at::Layout toLayout() const {
+    return static_cast<at::Layout>(toInt());
+  }
+
+  // MemoryFormat
+  IValue(MemoryFormat m)
+      : IValue(static_cast<std::underlying_type<MemoryFormat>::type>(m)) {}
+  at::MemoryFormat toMemoryFormat() const {
+    return static_cast<at::MemoryFormat>(toInt());
+  }
+
+  // QScheme
+  IValue(at::QScheme qscheme) : tag(Tag::Int) {
+    payload.u.as_int = static_cast<int64_t>(qscheme);
+  }
+
+  at::QScheme toQScheme() const {
+    return static_cast<at::QScheme>(toInt());
+  }
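+
+  // [Editorial note, illustrative only -- not part of the upstream header]
+  // ScalarType, Layout, MemoryFormat and QScheme all reuse the Int tag, so:
+  //   c10::IValue st(at::kFloat);
+  //   st.isInt() == true;  st.toScalarType() == at::kFloat;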
+
+  // Dimname
+  IValue(at::Dimname dimname) : IValue(dimname.symbol().toQualString()) {}
+
+  at::Dimname toDimname() const {
+    return at::Dimname::fromSymbol(Symbol::fromQualString(toStringRef()));
+  }
+
+  // Generator
+  IValue(at::Generator g) : tag(Tag::Generator) {
+    payload.u.as_intrusive_ptr =
+        null_to_undefined_tensor(g.unsafeReleaseGeneratorImpl());
+  }
+  bool isGenerator() const {
+    return Tag::Generator == tag;
+  }
+  at::Generator toGenerator() &&;
+  at::Generator toGenerator() const&;
+
+  // for debugging
+  std::string tagKind() const {
+    switch (tag) {
+#define DEFINE_CASE(x) \
+  case Tag::x:         \
+    return #x;
+      TORCH_FORALL_TAGS(DEFINE_CASE)
+#undef DEFINE_CASE
+    }
+    return "InvalidTag(" + std::to_string(static_cast(tag)) + ")";
+  }
+
+  // generic v.to<at::Tensor>() implementations
+  // that can be used in special functions like pop/push
+  // that use template meta-programming.
+  // prefer the directly named methods when you can,
+  // since they are simpler to understand
+
+  // Note: if you get linker errors saying one of these is missing,
+  // change it to ... && = delete; and you will see better error messages for
+  // why. However, we cannot commit this because some compiler versions barf on
+  // it.
+  template <typename T>
+  T to() &&;
+  template <typename T>
+  typename c10::detail::ivalue_to_const_ref_overload_return<T>::type to()
+      const&;
+
+  // ToOptional: convert an IValue to the Optional obj that accepts both T and
+  // None
+  template <typename T>
+  optional<T> toOptional();
+  template <typename T>
+  optional<T> toOptional() const;
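+
+  // [Editorial note, illustrative only -- not part of the upstream header]
+  // The generic accessors above are typically used from templated code:
+  //   c10::IValue iv(1.5);
+  //   double d = iv.to<double>();
+  //   c10::optional<double> od = iv.toOptional<double>();  // nullopt for None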
+
+  /// @private [doxygen private]
+  /// this is a shallow comparison of two IValues to test the object identity
+  bool isSameIdentity(const IValue& rhs) const;
+
+  // Computes the "official" string representation of an IValue. This produces a
+  // TorchScript expression that can be used to recreate an IValue with the same
+  // value (e.g. when we are printing constants in the serializer).
+  //
+  // Callers can use `customFormatter` to override how `repr()` prints out an
+  // IValue. This is useful if you have some other environment where you can
+  // look up values, and you want to print a reference to that environment (like
+  // the serializer's constant table).
+  //
+  // repr() is not necessarily defined on all objects!
+  std::ostream& repr(
+      std::ostream& stream,
+      std::function<bool(std::ostream&, const IValue& v)> customFormatter)
+      const;
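+
+  // [Editorial note, illustrative only -- not part of the upstream header]
+  // A hedged sketch of repr(): the formatter returns true once it has handled
+  // the value, so a formatter that always returns false should leave
+  // everything to the default printing:
+  //   std::ostringstream oss;
+  //   iv.repr(oss, [](std::ostream&, const IValue&) { return false; });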
+
+  // Computes an "informal" string representation of an IValue. This should be
+  // used for debugging, or servicing `print()`-like functions.
+  // This is different from `repr()` in that there is no expectation that we can
+  // exactly reconstruct an IValue from the output; feel free to use a
+  // concise/pretty form
+  TORCH_API friend std::ostream& operator<<(std::ostream& out, const IValue& v);
+
+  bool isPtrType() const {
+    if (isTensor()) {
+      return payload.as_tensor.defined();
+    }
+    return isIntrusivePtrLegacyBehavior();
+  }
+
+  /// @private [doxygen private]
+  const void* internalToPointer() const {
+    TORCH_INTERNAL_ASSERT(
+        isPtrType(), "Can only call internalToPointer() for pointer types");
+    if (isTensor()) {
+      return payload.as_tensor.unsafeGetTensorImpl();
+    } else {
+      return payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton()
+          ? payload.u.as_intrusive_ptr
+          : nullptr;
+    }
+  }
+
+  template <typename T = c10::PlatformType>
+  TypePtr type() const;
+
+  // Detect aliased tensors.
+  struct HashAliasedIValue {
+    size_t hashTensor(const at::Tensor& ten) const {
+      if (ten.is_sparse()) {
+        // COO sparse tensors have a "values" tensor and an "indices" tensor
+        // so this will detect overlap of sparse tensors that share a values
+        // tensor, but not sparse tensors that share an indices tensor.
+        return hashTensor(ten._values());
+      } else if (ten.is_sparse_csr()) {
+        // CSR sparse tensors likewise have a "values" tensor (plus index
+        // tensors), so this will detect overlap of sparse tensors that share
+        // a values tensor, but not ones that only share index tensors.
+        return hashTensor(ten.values());
+      } else if (!ten.has_storage()) {
+        // Opaque tensors such as the ones constructed by the MKL-DNN backend
+        // don't have storage so we just use their TensorImpls.
+        // TODO: Find way to expose alias info for opaque tensors.
+        return reinterpret_cast<size_t>(ten.unsafeGetTensorImpl());
+      } else {
+        return reinterpret_cast<size_t>(ten.storage().unsafeGetStorageImpl());
+      }
+    }
+    size_t operator()(const IValue& val) const {
+      if (val.isTensor()) {
+        return hashTensor(val.toTensor());
+      }
+      // If it is not a Tensor, then two mutable IValues alias each other only
+      // if they are the same pointer.
+      return val.payload.u.as_int;
+    }
+  };
+
+  struct CompAliasedIValues {
+    bool operator()(const IValue& lhs, const IValue& rhs) const {
+      return lhs.isAliasOf(rhs);
+    }
+  };
+
+  using HashAliasedIValues =
+      std::unordered_set<IValue, HashAliasedIValue, CompAliasedIValues>;
+  using HashAliasedIValueMap =
+      std::unordered_map<IValue, IValue, HashAliasedIValue, CompAliasedIValues>;
+
+  // Checks if this and rhs have a subvalue in common.
+  // [t1,t2] and [t2, t3] returns true.
+  bool overlaps(const IValue& rhs) const;
+
+  // Inserts all subvalues of this in subValues.
+  void getSubValues(HashAliasedIValues& subValues) const;
+
+  // Apply visitor to every subvalue.
+  // TODO: There are several places that recurse over IValue. This is fragile.
+  // This visitor should be used to recurse over ivalues.
+  void visit(const std::function<bool(const IValue&)>& visitor) const;
+  IValue deepcopy(c10::optional<at::Device> device = c10::nullopt) const;
+  IValue deepcopy(
+      HashAliasedIValueMap& memo,
+      c10::optional<at::Device> device = c10::nullopt) const;
+
+ private:
+  static c10::intrusive_ptr_target* null_to_undefined_tensor(
+      c10::intrusive_ptr_target* p) {
+    return p ? p
+             : static_cast<c10::intrusive_ptr_target*>(
+                   c10::UndefinedTensorImpl::singleton());
+  }
+
+  static bool ptrEqual(const IValue& lhs, const IValue& rhs);
+  // NOTE: IValue tags are intentionally private. In the future we may encode
+  // this value differently (e.g. using NaN boxing), and this would make it more
+  // costly to determine the tag for all types vs just determining if something
+  // is a particular type. Instead we want clients to use the `isX` methods when
+  // possible. If for perf. reasons you really, absolutely, must have a jump
+  // table, then we can revisit this.
+  enum class Tag : uint32_t {
+#define DEFINE_TAG(x) x,
+    TORCH_FORALL_TAGS(DEFINE_TAG)
+#undef DEFINE_TAG
+  };
+
+#define COUNT_TAG(x) 1 +
+  static constexpr auto kNumTags = TORCH_FORALL_TAGS(COUNT_TAG) 0;
+#undef COUNT_TAG
+
+  template <
+      class T,
+      class NullType = c10::detail::intrusive_target_default_null_type<T>>
+  c10::intrusive_ptr<T, NullType> moveToIntrusivePtr();
+  template <
+      typename T,
+      class NullType = c10::detail::intrusive_target_default_null_type<T>>
+  c10::intrusive_ptr<T, NullType> toIntrusivePtr() const;
+
+  void destroy() {
+    // We carefully construct this call to both 1) avoid UB by using
+    // the "wrong" one of as_tensor and as_intrusive_ptr and 2) enable
+    // the compiler to generate the same code for each case. It is
+    // surprisingly difficult to get this right.
+    if (isTensor() || isIntrusivePtr()) {
+      c10::intrusive_ptr_target* p = isTensor()
+          ? payload.as_tensor.unsafeGetTensorImpl()
+          : payload.u.as_intrusive_ptr;
+      c10::intrusive_ptr<intrusive_ptr_target, c10::UndefinedTensorImpl>::
+          reclaim(p);
+      // No need to make this destructor call!
+      // payload.as_tensor.~Tensor();
+    }
+  }
+
+  C10_ALWAYS_INLINE void moveFrom(IValue&& rhs) noexcept {
+    if (rhs.isTensor()) {
+      new (&payload.as_tensor) at::Tensor(std::move(rhs.payload.as_tensor));
+      // As far as I can tell, omitting the usual explicit destructor call
+      // is not UB in and of itself, and it's a slight perf win. The
+      // destructor is a no-op, because the moved-from Tensor is
+      // effectively an intrusive_ptr in the null state, so we don't need
+      // the behavior for correctness reasons either. Leaving this
+      // explanatory comment, including commented-out destructor call, to
+      // make this abundantly clear.
+      //
+      // rhs.payload.as_tensor.~Tensor();
+    } else {
+      payload.u = rhs.payload.u;
+    }
+    tag = rhs.tag;
+    rhs.clearToNone();
+  }
+
+  void clearToNone() noexcept {
+    payload.u.as_int = 0;
+    tag = Tag::None;
+  }
+
+ private:
+  // This is the source of truth for isIntrusivePtr; edit results here
+  // as needed and isIntrusivePtr will pick them up.
+  // NOLINTBEGIN(bugprone-branch-clone)
+  static constexpr bool isIntrusivePtrConstexpr(Tag tag) {
+    switch (tag) {
+      case Tag::None:
+        return false;
+      case Tag::Tensor:
+        return false;
+      case Tag::Storage:
+        return true;
+      case Tag::Generator:
+        return true;
+      case Tag::Double:
+        return false;
+      case Tag::ComplexDouble:
+        return true;
+      case Tag::Int:
+        return false;
+      case Tag::SymInt:
+        return true;
+      case Tag::SymFloat:
+        return true;
+      case Tag::SymBool:
+        return true;
+      case Tag::Bool:
+        return false;
+      case Tag::Tuple:
+        return true;
+      case Tag::String:
+        return true;
+      case Tag::Blob:
+        return true;
+      case Tag::GenericList:
+        return true;
+      case Tag::GenericDict:
+        return true;
+      case Tag::Future:
+        return true;
+      case Tag::Await:
+        return true;
+      case Tag::Device:
+        return false;
+      case Tag::Stream:
+        return true;
+      case Tag::Object:
+        return true;
+      case Tag::PyObject:
+        return true;
+      case Tag::Uninitialized:
+        return false;
+      case Tag::Capsule:
+        return true;
+      case Tag::RRef:
+        return true;
+      case Tag::Quantizer:
+        return true;
+      case Tag::Enum:
+        return true;
+    }
+    return false;
+  }
+  // NOLINTEND(bugprone-branch-clone)
+
+ public:
+  // Don't edit this just to add results for new tags; edit
+  // isIntrusivePtrConstexpr above.
+  bool isIntrusivePtr() const {
+    // Implementation NOTE: the switch in isIntrusivePtrConstexpr
+    // above is the previous production implementation of this
+    // function. We observed that, at least on x86_64, the generated
+    // instruction sequence was a similar bit vector test to what we
+    // have manually implemented below, except that there was an extra
+    // "bounds check" branch confirming, essentially, that `tag <
+    // kNumTags` and providing a consistent result in that case. We
+    // don't care about the result if tag is out of bounds, so we'd
+    // like to eliminate that comparison and branch; manually
+    // implementing this function as a bit test is the simplest way I
+    // could find to accomplish that elimination.
+    static constexpr uint32_t kTruthTableBitVector =
+#define TRUTH_TABLE_ENTRY(tag) \
+  (uint32_t(isIntrusivePtrConstexpr(Tag::tag)) << uint32_t(Tag::tag)) |
+        TORCH_FORALL_TAGS(TRUTH_TABLE_ENTRY)
+#undef TRUTH_TABLE_ENTRY
+            0;
+
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+        static_cast<uint32_t>(tag) < kNumTags,
+        "unexpected tag ",
+        static_cast<int>(tag));
+    return kTruthTableBitVector & (1 << (uint32_t(tag) % 32));
+  }
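+
+  // [Editorial note, illustrative only -- not part of the upstream header]
+  // Worked example of the bit-vector test above: if, say, Tag::String had the
+  // numeric value 12, the check reduces to
+  //   kTruthTableBitVector & (1u << 12)
+  // i.e. a single AND with no bounds-check branch on the tag value.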
+
+  // Storage and Generator were treated specially when
+  // is_intrusive_ptr was stored as explicit state. This getter
+  // preserves the old behavior for use with WeakIValue for now.
+  bool isIntrusivePtrLegacyBehavior() const {
+    if (tag == Tag::Storage || tag == Tag::Generator) {
+      return payload.u.as_intrusive_ptr !=
+          c10::UndefinedTensorImpl::singleton();
+    } else {
+      return isIntrusivePtr();
+    }
+  }
+
+  union Payload {
+    // [TriviallyCopyablePayload]
+    // We use a nested union here so that we can make the copy easy
+    // and efficient in the non-tensor (i.e., trivially copyable)
+    // case. Specifically, we do not have to do a switch-on-tag to
+    // figure out which union member to assign; we can just use
+    // TriviallyCopyablePayload::operator=.
+    union TriviallyCopyablePayload {
+      TriviallyCopyablePayload() : as_int(0) {}
+      int64_t as_int;
+      double as_double;
+      bool as_bool;
+      // Invariant: never nullptr; null state is represented as
+      // c10::UndefinedTensorImpl::singleton() for consistency of
+      // representation with Tensor.
+      c10::intrusive_ptr_target* as_intrusive_ptr;
+      struct {
+        c10::DeviceType type;
+        DeviceIndex index;
+      } as_device;
+    } u;
+    at::Tensor as_tensor;
+    Payload() : u() {}
+    ~Payload() {}
+  };
+
+  IValue(const Payload& p, Tag t) : tag(t) {
+    if (isTensor()) {
+      new (&payload.as_tensor) at::Tensor(p.as_tensor);
+    } else {
+      payload.u = p.u;
+    }
+  }
+
+  template <typename T>
+  struct TagType {};
+
+  friend MaybeOwnedTraits<IValue>;
+
+  Payload payload;
+  Tag tag{IValue::Tag::None};
+  friend struct WeakIValue;
+};
+
+struct TORCH_API WeakIValue final {
+  WeakIValue() = default;
+
+  WeakIValue(const WeakIValue& rhs)
+      : payload(rhs.payload),
+        tag(rhs.tag),
+        is_intrusive_ptr(rhs.is_intrusive_ptr) {
+    if (is_intrusive_ptr &&
+        payload.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton()) {
+      c10::raw::weak_intrusive_ptr::incref(payload.as_intrusive_ptr);
+    }
+  }
+  WeakIValue(const IValue& rhs)
+      : tag(rhs.tag), is_intrusive_ptr(rhs.isIntrusivePtrLegacyBehavior()) {
+    if (rhs.isTensor()) {
+      payload.as_intrusive_ptr = rhs.unsafeToTensorImpl();
+      is_intrusive_ptr = true;
+    } else {
+      payload = rhs.payload.u;
+    }
+    if (is_intrusive_ptr) {
+      if (payload.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton()) {
+        c10::raw::weak_intrusive_ptr::incref(payload.as_intrusive_ptr);
+      }
+    }
+  }
+  WeakIValue(WeakIValue&& rhs) noexcept : WeakIValue() {
+    swap(rhs);
+  }
+  ~WeakIValue() {
+    if (is_intrusive_ptr &&
+        payload.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton()) {
+      c10::raw::weak_intrusive_ptr::decref(payload.as_intrusive_ptr);
+    }
+  }
+  WeakIValue& operator=(WeakIValue&& rhs) & noexcept {
+    WeakIValue(std::move(rhs)).swap(*this); // this also sets rhs to None
+    return *this;
+  }
+  WeakIValue& operator=(WeakIValue const& rhs) & {
+    WeakIValue(rhs).swap(*this);
+    return *this;
+  }
+  void swap(WeakIValue& rhs) noexcept {
+    std::swap(payload, rhs.payload);
+    std::swap(is_intrusive_ptr, rhs.is_intrusive_ptr);
+    std::swap(tag, rhs.tag);
+  }
+
+  bool isSameIdentity(const WeakIValue& rhs) const {
+    return payload.as_int == rhs.payload.as_int && tag == rhs.tag &&
+        is_intrusive_ptr == rhs.is_intrusive_ptr;
+  }
+
+  IValue lock() const {
+    if (!is_intrusive_ptr) {
+      IValue::Payload newPayload;
+      newPayload.u = payload;
+      return IValue(newPayload, tag);
+    }
+    if (IValue::Tag::Tensor == tag) {
+      auto temp =
+          c10::weak_intrusive_ptr<at::TensorImpl, c10::UndefinedTensorImpl>::
+              reclaim(static_cast<at::TensorImpl*>(payload.as_intrusive_ptr));
+      c10::intrusive_ptr<at::TensorImpl, c10::UndefinedTensorImpl> ip(
+          temp.lock());
+      temp.release();
+      if (!ip) {
+        return IValue();
+      } else {
+        return IValue(at::Tensor(std::move(ip)));
+      }
+    } else {
+      auto temp = c10::weak_intrusive_ptr<c10::intrusive_ptr_target>::reclaim(
+          payload.as_intrusive_ptr == c10::UndefinedTensorImpl::singleton()
+              ? nullptr
+              : payload.as_intrusive_ptr);
+      IValue::Payload pl;
+      pl.u.as_intrusive_ptr = temp.lock().release();
+      temp.release();
+      if (!pl.u.as_intrusive_ptr) {
+        return IValue();
+      } else {
+        return IValue(pl, tag);
+      }
+    }
+  }
+
+  size_t use_count() const noexcept {
+    if (!is_intrusive_ptr) {
+      return 1;
+    }
+    auto temp = c10::weak_intrusive_ptr<
+        c10::intrusive_ptr_target,
+        c10::UndefinedTensorImpl>::reclaim(payload.as_intrusive_ptr);
+    size_t result = temp.use_count();
+    temp.release();
+    return result;
+  }
+
+  size_t weak_use_count() const noexcept {
+    if (!is_intrusive_ptr) {
+      return 1;
+    }
+    auto temp = c10::weak_intrusive_ptr<
+        c10::intrusive_ptr_target,
+        c10::UndefinedTensorImpl>::reclaim(payload.as_intrusive_ptr);
+    size_t result = temp.weak_use_count();
+    temp.release();
+    return result;
+  }
+  size_t hash() const {
+    return payload.as_int;
+  }
+
+ private:
+  using Payload = IValue::Payload::TriviallyCopyablePayload;
+  Payload payload;
+  IValue::Tag tag{IValue::Tag::None};
+  bool is_intrusive_ptr{false};
+};
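+
+// [Editorial note, illustrative only -- not part of the upstream header]
+// Typical WeakIValue usage: take a non-owning handle and try to revive it.
+//   c10::WeakIValue weak(someTensorIValue);
+//   c10::IValue strong = weak.lock();  // None if the object has been freed
+//   if (!strong.isNone()) { /* still alive, safe to use */ }
+// Here `someTensorIValue` is a hypothetical IValue holding a tensor.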
+
+// An owning pointer to a type. When the type is class type, it requires a pair
+// of shared_ptrs to the class type and its owning CU, so that the class type is
+// guaranteed to stay alive as long as we hold this object.
+struct TORCH_API StrongTypePtr {
+  StrongTypePtr(std::shared_ptr<torch::jit::CompilationUnit> cu, TypePtr type);
+
+  std::shared_ptr<torch::jit::CompilationUnit> cu_;
+  TypePtr type_;
+};
+
+// [Constant Object Weak CompilationUnit Reference]
+// A non-owning pointer to a type. When a class gets inserted as a constant
+// into a graph, using a strong pointer would create a circular reference
+// from Object -> CompilationUnit and CompilationUnit -> Graph (which owns the
+// Constant Object).
+struct TORCH_API WeakTypePtr {
+  WeakTypePtr(std::weak_ptr<torch::jit::CompilationUnit> cu, TypePtr type);
+
+  std::weak_ptr<torch::jit::CompilationUnit> cu_;
+  TypePtr type_;
+};
+
+// internal build errors with std::variant :/
+struct WeakOrStrongCompilationUnit {
+  explicit WeakOrStrongCompilationUnit(
+      std::shared_ptr<torch::jit::CompilationUnit> shared_cu)
+      : strong_ptr_(std::move(shared_cu)), weak_ptr_(c10::nullopt) {}
+
+  explicit WeakOrStrongCompilationUnit(
+      std::weak_ptr<torch::jit::CompilationUnit> weak_cu)
+      : strong_ptr_(c10::nullopt), weak_ptr_(std::move(weak_cu)) {}
+
+  std::shared_ptr<torch::jit::CompilationUnit> getStrongRefOrThrow() const {
+    TORCH_INTERNAL_ASSERT(strong_ptr_ != c10::nullopt);
+    return *strong_ptr_;
+  }
+
+  std::weak_ptr<torch::jit::CompilationUnit> getWeakRefOrThrow() const {
+    TORCH_INTERNAL_ASSERT(weak_ptr_ != c10::nullopt);
+    return *weak_ptr_;
+  }
+
+  bool holdingStrongRef() const {
+    return strong_ptr_ != c10::nullopt;
+  }
+
+  bool holdingEmptyStrongRef() const {
+    return holdingStrongRef() && *strong_ptr_ == nullptr;
+  }
+
+  c10::optional<std::shared_ptr<torch::jit::CompilationUnit>> strong_ptr_;
+  c10::optional<std::weak_ptr<torch::jit::CompilationUnit>> weak_ptr_;
+};
+
+// An Object will hold a non-owning CompilationUnit reference if it is a
+// Constant in the graph and an owning reference otherwise.
+struct TORCH_API WeakOrStrongTypePtr {
+  explicit WeakOrStrongTypePtr(WeakTypePtr weak)
+      : cu_(WeakOrStrongCompilationUnit(std::move(weak.cu_))),
+        type_(std::move(weak.type_)) {}
+  explicit WeakOrStrongTypePtr(StrongTypePtr strong)
+      : cu_(WeakOrStrongCompilationUnit(std::move(strong.cu_))),
+        type_(std::move(strong.type_)) {}
+  explicit WeakOrStrongTypePtr(WeakOrStrongCompilationUnit cu, TypePtr type)
+      : cu_(std::move(cu)), type_(std::move(type)) {}
+  WeakTypePtr asWeakTypePtr() const;
+
+  WeakOrStrongCompilationUnit cu_;
+  TypePtr type_;
+
+  bool holds_strong_ref() const {
+    return cu_.holdingStrongRef();
+  }
+
+  bool holds_empty_strong_ref() const {
+    return cu_.holdingEmptyStrongRef();
+  }
+};
+
+} // namespace c10
+
+#include <ATen/core/ivalue_inl.h> // IWYU pragma: keep
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/ivalue_inl.h b/MLPY/Lib/site-packages/torch/include/ATen/core/ivalue_inl.h
new file mode 100644
index 0000000000000000000000000000000000000000..a9b25a0451b3ce3e710b009f52583926ad657350
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/ivalue_inl.h
@@ -0,0 +1,2545 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace torch {
+namespace jit {
+struct Function;
+struct CompilationUnit;
+} // namespace jit
+TORCH_API bool isCustomClass(const c10::IValue& v);
+} // namespace torch
+namespace c10 {
+struct IValue;
+struct ClassType;
+struct TupleType;
+struct EnumType;
+struct InferredType;
+
+// For custom class __init__ registration, we need to pass in a function
+// that looks like this: [](IValue x, args...)
+
+// However, make_boxed_from_unboxed_functor.h automatically sets the input types
+// of the function by introspecting the types of the functor (which is IValue in
+// this case). However, we need the type it binds to be Foo.
+
+// Instead, we pass in a lambda [](ivalue_holder<CurClass> x, args...) from
+// which getTypePtr can recover the original class pointer.
+
+template <class T>
+struct tagged_capsule {
+  IValue ivalue;
+};
+
+template <typename T, typename NullType>
+c10::intrusive_ptr<T, NullType> IValue::moveToIntrusivePtr() {
+  auto t = c10::intrusive_ptr<T, NullType>::reclaim(
+      payload.u.as_intrusive_ptr == c10::UndefinedTensorImpl::singleton()
+      ? NullType::singleton()
+      : static_cast<T*>(payload.u.as_intrusive_ptr));
+  clearToNone();
+  return t;
+}
+template <typename T, typename NullType>
+c10::intrusive_ptr<T, NullType> IValue::toIntrusivePtr() const {
+  if (payload.u.as_intrusive_ptr == c10::UndefinedTensorImpl::singleton()) {
+    return c10::intrusive_ptr<T, NullType>();
+  }
+  c10::raw::intrusive_ptr::incref(payload.u.as_intrusive_ptr);
+  return c10::intrusive_ptr<T, NullType>::reclaim(
+      static_cast<T*>(payload.u.as_intrusive_ptr));
+}
+
+template <class T, class U>
+intrusive_ptr<T> static_intrusive_pointer_cast(intrusive_ptr<U> r) {
+  return intrusive_ptr<T>::reclaim(static_cast<T*>(r.release()));
+}
+
+template <class T, class U>
+intrusive_ptr<T> dynamic_intrusive_pointer_cast(intrusive_ptr<U> r) {
+  return intrusive_ptr<T>::reclaim(dynamic_cast<T*>(r.release()));
+}
+
+inline c10::intrusive_ptr<ivalue::Future> IValue::toFuture() && {
+  AT_ASSERT(isFuture(), "Expected Future but got ", tagKind());
+  return moveToIntrusivePtr<ivalue::Future>();
+}
+inline c10::intrusive_ptr<ivalue::Future> IValue::toFuture() const& {
+  AT_ASSERT(isFuture(), "Expected Future but got ", tagKind());
+  return toIntrusivePtr<ivalue::Future>();
+}
+inline c10::intrusive_ptr<ivalue::Await> IValue::toAwait() && {
+  AT_ASSERT(isAwait(), "Expected Await but got ", tagKind());
+  return moveToIntrusivePtr<ivalue::Await>();
+}
+inline c10::intrusive_ptr<ivalue::Await> IValue::toAwait() const& {
+  AT_ASSERT(isAwait(), "Expected Await but got ", tagKind());
+  return toIntrusivePtr<ivalue::Await>();
+}
+inline c10::intrusive_ptr<c10::RRefInterface> IValue::toRRef() && {
+  AT_ASSERT(isRRef(), "Expected RRef but got ", tagKind());
+  return moveToIntrusivePtr<c10::RRefInterface>();
+}
+inline c10::intrusive_ptr<c10::RRefInterface> IValue::toRRef() const& {
+  AT_ASSERT(isRRef(), "Expected RRef but got ", tagKind());
+  return toIntrusivePtr<c10::RRefInterface>();
+}
+inline c10::intrusive_ptr<at::Quantizer> IValue::toQuantizer() && {
+  AT_ASSERT(isQuantizer(), "Expected Quantizer but got ", tagKind());
+  return moveToIntrusivePtr<at::Quantizer>();
+}
+inline c10::intrusive_ptr<at::Quantizer> IValue::toQuantizer() const& {
+  AT_ASSERT(isQuantizer(), "Expected Quantizer but got ", tagKind());
+  return toIntrusivePtr<at::Quantizer>();
+}
+inline c10::intrusive_ptr<ivalue::ConstantString> IValue::toString() && {
+  AT_ASSERT(isString(), "Expected String but got ", tagKind());
+  return moveToIntrusivePtr<ivalue::ConstantString>();
+}
+inline c10::intrusive_ptr<ivalue::ConstantString> IValue::toString() const& {
+  AT_ASSERT(isString(), "Expected String but got ", tagKind());
+  return toIntrusivePtr<ivalue::ConstantString>();
+}
+inline c10::intrusive_ptr<ivalue::Object> IValue::toObject() && {
+  AT_ASSERT(isObject(), "Expected Object but got ", tagKind());
+  return moveToIntrusivePtr<ivalue::Object>();
+}
+inline c10::intrusive_ptr<ivalue::Object> IValue::toObject() const& {
+  AT_ASSERT(isObject(), "Expected Object but got ", tagKind());
+  return toIntrusivePtr<ivalue::Object>();
+}
+inline c10::intrusive_ptr<ivalue::PyObjectHolder> IValue::
+    toPyObjectHolder() && {
+  TORCH_INTERNAL_ASSERT(isPyObject(), "Expected PyObject but got ", tagKind());
+  return moveToIntrusivePtr<ivalue::PyObjectHolder>();
+}
+inline c10::intrusive_ptr<ivalue::PyObjectHolder> IValue::toPyObjectHolder()
+    const& {
+  TORCH_INTERNAL_ASSERT(isPyObject(), "Expected PyObject but got ", tagKind());
+  return toIntrusivePtr<ivalue::PyObjectHolder>();
+}
+inline c10::intrusive_ptr<ivalue::EnumHolder> IValue::toEnumHolder() && {
+  TORCH_INTERNAL_ASSERT(isEnum(), "Expected Enum but got ", tagKind());
+  return moveToIntrusivePtr<ivalue::EnumHolder>();
+}
+inline c10::intrusive_ptr<ivalue::EnumHolder> IValue::toEnumHolder() const& {
+  TORCH_INTERNAL_ASSERT(isEnum(), "Expected Enum but got ", tagKind());
+  return toIntrusivePtr<ivalue::EnumHolder>();
+}
+inline c10::complex<double> IValue::toComplexDouble() const {
+  TORCH_INTERNAL_ASSERT(isComplexDouble(), "Expected ComplexDouble but got ", tagKind());
+  auto ptr = toIntrusivePtr<ivalue::ComplexHolder>();
+  return (*ptr).val;
+}
+inline at::Tensor IValue::toTensor() && {
+  if (C10_UNLIKELY(!isTensor())) {
+    reportToTensorTypeError();
+  }
+  auto result = std::move(payload.as_tensor);
+  // As far as I can tell, omitting the usual explicit destructor call
+  // is not UB in and of itself, and it's a slight perf win. The
+  // destructor is a no-op, because the moved-from Tensor is
+  // effectively an intrusive_ptr in the null state, so we don't need
+  // the behavior for correctness reasons either. Leaving this
+  // explanatory comment, including commented-out destructor call, to
+  // make this abundantly clear.
+  //
+  // payload.as_tensor.~Tensor();
+  clearToNone();
+  return result;
+}
+inline at::Tensor& IValue::toTensor() & {
+  if (C10_UNLIKELY(!isTensor())) {
+    reportToTensorTypeError();
+  }
+  return payload.as_tensor;
+}
+inline const at::Tensor& IValue::toTensor() const& {
+  if (C10_UNLIKELY(!isTensor())) {
+    reportToTensorTypeError();
+  }
+  return payload.as_tensor;
+}
+inline c10::Storage IValue::toStorage() && {
+  AT_ASSERT(isStorage(), "Expected Storage but got ", tagKind());
+  return c10::Storage(
+      moveToIntrusivePtr<at::StorageImpl>());
+}
+inline c10::Storage IValue::toStorage() const& {
+  AT_ASSERT(isStorage(), "Expected Storage but got ", tagKind());
+  return c10::Storage(toIntrusivePtr<at::StorageImpl>());
+}
+inline c10::Stream IValue::toStream() && {
+  AT_ASSERT(isStream(), "Expected Stream but got ", tagKind());
+  auto ptr = toIntrusivePtr<ivalue::StreamData3Holder>();
+  return c10::Stream::unpack3((*ptr).val.stream_id,
+                              (*ptr).val.device_index,
+                              (*ptr).val.device_type);
+}
+inline c10::Stream IValue::toStream() const& {
+  AT_ASSERT(isStream(), "Expected Stream but got ", tagKind());
+  auto ptr = toIntrusivePtr<ivalue::StreamData3Holder>();
+  return c10::Stream::unpack3((*ptr).val.stream_id,
+                              (*ptr).val.device_index,
+                              (*ptr).val.device_type);
+}
+inline c10::intrusive_ptr<caffe2::Blob> IValue::toBlob() && {
+  AT_ASSERT(isBlob(), "Expected Blob but got ", tagKind());
+  return moveToIntrusivePtr<caffe2::Blob>();
+}
+inline c10::intrusive_ptr<caffe2::Blob> IValue::toBlob() const& {
+  AT_ASSERT(isBlob(), "Expected Blob but got ", tagKind());
+  return toIntrusivePtr<caffe2::Blob>();
+}
+inline c10::intrusive_ptr<torch::CustomClassHolder> IValue::toCapsule() && {
+  TORCH_INTERNAL_ASSERT(isCapsule());
+  return moveToIntrusivePtr<torch::CustomClassHolder>();
+}
+inline c10::intrusive_ptr<torch::CustomClassHolder> IValue::toCapsule() const& {
+  TORCH_INTERNAL_ASSERT(isCapsule());
+  return toIntrusivePtr<torch::CustomClassHolder>();
+}
+inline at::Generator IValue::toGenerator() && {
+  AT_ASSERT(isGenerator(), "Expected Generator but got ", tagKind());
+  return at::Generator(moveToIntrusivePtr<at::GeneratorImpl>());
+}
+inline at::Generator IValue::toGenerator() const& {
+  AT_ASSERT(isGenerator(), "Expected Generator but got ", tagKind());
+  return at::Generator(toIntrusivePtr<at::GeneratorImpl>());
+}
+inline c10::SymInt IValue::toSymInt() && {
+  AT_ASSERT(isSymInt() || isInt(), "Expected SymInt or int but got ", tagKind());
+  if (isSymInt()) {
+    return c10::SymInt(moveToIntrusivePtr<c10::SymNodeImpl>());
+  } else {
+    return c10::SymInt(payload.u.as_int);
+  }
+}
+inline c10::SymInt IValue::toSymInt() const& {
+  AT_ASSERT(isSymInt() || isInt(), "Expected SymInt or int but got ", tagKind());
+  if (isSymInt()) {
+    return c10::SymInt(toIntrusivePtr<c10::SymNodeImpl>());
+  } else {
+    return c10::SymInt(payload.u.as_int);
+  }
+}
+inline c10::SymFloat IValue::toSymFloat() && {
+  AT_ASSERT(isSymFloat() || isDouble(), "Expected SymFloat or double but got ", tagKind());
+  if (isSymFloat()) {
+    return c10::SymFloat(moveToIntrusivePtr<c10::SymNodeImpl>());
+  } else {
+    return c10::SymFloat(payload.u.as_double);
+  }
+}
+inline c10::SymFloat IValue::toSymFloat() const& {
+  AT_ASSERT(isSymFloat() || isDouble(), "Expected SymFloat or double but got ", tagKind());
+  if (isSymFloat()) {
+    return c10::SymFloat(toIntrusivePtr<c10::SymNodeImpl>());
+  } else {
+    return c10::SymFloat(payload.u.as_double);
+  }
+}
+inline c10::SymBool IValue::toSymBool() && {
+  AT_ASSERT(isSymBool() || isBool(), "Expected SymBool or boolean but got ", tagKind());
+  if (isSymBool()) {
+    return c10::SymBool(moveToIntrusivePtr<c10::SymNodeImpl>());
+  } else {
+    return c10::SymBool(payload.u.as_bool);
+  }
+}
+
+inline c10::SymBool IValue::toSymBool() const& {
+  AT_ASSERT(isSymBool() || isBool(), "Expected SymBool or boolean but got ", tagKind());
+  if (isSymBool()) {
+    return c10::SymBool(toIntrusivePtr<c10::SymNodeImpl>());
+  } else {
+    return c10::SymBool(payload.u.as_bool);
+  }
+}
+
+namespace ivalue {
+
+void TORCH_API
+checkCustomClassType(const ClassType* expected_type, const Type* actual_type);
+
+template <typename T>
+using Shared = c10::intrusive_ptr<T>;
+
+// string
+struct TORCH_API ConstantString final : c10::intrusive_ptr_target {
+ private:
+   // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
+  const std::string str_;
+
+ public:
+  ConstantString(std::string str) : str_(std::move(str)) {}
+  ConstantString(c10::string_view str) : str_(std::string(str)) {}
+  static c10::intrusive_ptr create(std::string str_);
+  static c10::intrusive_ptr create(c10::string_view str_);
+  static c10::intrusive_ptr create(const char* str_);
+
+  const std::string& string() const {
+    return str_;
+  }
+  c10::string_view string_view() const {
+    return str_;
+  }
+
+  operator const std::string&() const {
+    return string();
+  }
+  TORCH_API friend std::ostream& operator<<(
+      std::ostream& out,
+      const ConstantString& v);
+};
+
+struct Future;
+
+struct TORCH_API TupleElements {
+ private:
+  size_t inlineSize_;
+  // We represent TupleElements this way to save doing a heap
+  // allocation in the common (at least for unpickling) case where we
+  // have only 3 elements. We have our own union instead of
+  // c10::SmallVector<IValue> because c10::SmallVector<IValue> always
+  // stores the begin/end/capacity pointers, which would be a waste of
+  // space in our use case.
+  union {
+    std::vector<IValue> elementsVector_;
+    // Don't want to declare a std::array<IValue, 3> because the convenient
+    // iteration and size members are a footgun in this case -- the
+    // actual size of the array may be smaller than 3!
+    // NOLINTNEXTLINE(*c-arrays*)
+    IValue elementsInline_[3];
+  };
+
+  void destroyInline() {
+   for (const auto ii : c10::irange(inlineSize_)) {
+     elementsInline_[ii].~IValue();
+   }
+  }
+ public:
+
+  using iterator = IValue*;
+  using const_iterator = const IValue*;
+
+  TupleElements() : inlineSize_(0) {
+    new (&elementsVector_) std::vector<IValue>();
+  }
+
+  explicit TupleElements(std::vector<IValue> elements)
+  : inlineSize_(0), elementsVector_(std::move(elements)) {}
+
+  explicit TupleElements(c10::ArrayRef<IValue> elements)
+  : inlineSize_(elements.size() <= 3 ? elements.size() : 0) {
+    switch (inlineSize_) {
+      case 3:
+        new (&elementsInline_[2]) IValue(elements[2]);
+        [[fallthrough]];
+      case 2:
+        new (&elementsInline_[1]) IValue(elements[1]);
+        [[fallthrough]];
+      case 1:
+        new (&elementsInline_[0]) IValue(elements[0]);
+        break;
+      case 0:
+        new (&elementsVector_) std::vector<IValue>(elements.begin(), elements.end());
+        break;
+    }
+  }
+
+  explicit TupleElements(IValue&& e1)
+  : inlineSize_(1) {
+    new (&elementsInline_[0]) IValue(std::move(e1));
+  }
+
+  explicit TupleElements(IValue&& e1, IValue&& e2)
+  : inlineSize_(2) {
+    new (&elementsInline_[0]) IValue(std::move(e1));
+    new (&elementsInline_[1]) IValue(std::move(e2));
+  }
+
+  explicit TupleElements(IValue&& e1, IValue&& e2, IValue&& e3)
+  : inlineSize_(3) {
+    new (&elementsInline_[0]) IValue(std::move(e1));
+    new (&elementsInline_[1]) IValue(std::move(e2));
+    new (&elementsInline_[2]) IValue(std::move(e3));
+  }
+
+  ~TupleElements() {
+    if (inlineSize_) {
+      destroyInline();
+    } else {
+      elementsVector_.~vector();
+    }
+  }
+
+  // It would be nice to make this noncopyable to prevent people from
+  // writing code like `auto output =
+  // forward(...).toTupleRef().elements()` (which does refcount bumps on
+  // each element, unlike the more efficient but verbose
+  // ```
+  // auto outputIntrusivePtr = forward(...).toTuple();
+  // const auto& output = outputIntrusivePtr->elements();
+  // ```
+  // ), but there is simply an overwhelming amount of code that does
+  // it the inefficient way.
+  // See also operator std::vector<IValue> below.
+  TupleElements(const TupleElements& rhs)
+  : inlineSize_(rhs.inlineSize_) {
+    if (rhs.inlineSize_) {
+      for (const auto  ii : c10::irange(inlineSize_)) {
+        new (&elementsInline_[ii]) IValue(rhs.elementsInline_[ii]);
+      }
+    } else {
+      new (&elementsVector_) std::vector<IValue>(rhs.elementsVector_);
+    }
+  }
+
+  TupleElements& operator=(const TupleElements& rhs) {
+    if (inlineSize_) {
+      if (rhs.inlineSize_) {
+        for (const auto ii : c10::irange(std::min(inlineSize_, rhs.inlineSize_))) {
+          elementsInline_[ii] = rhs.elementsInline_[ii];
+        }
+        if (rhs.inlineSize_ > inlineSize_) {
+          for (const auto ii : c10::irange(inlineSize_, rhs.inlineSize_)) {
+            new (&elementsInline_[ii]) IValue(rhs.elementsInline_[ii]);
+          }
+        } else {
+          for (const auto ii : c10::irange(rhs.inlineSize_, inlineSize_)) {
+            elementsInline_[ii].~IValue();
+          }
+        }
+      } else {
+        destroyInline();
+        new (&elementsVector_) std::vector<IValue>(rhs.elementsVector_);
+      }
+    } else {
+      if (rhs.inlineSize_) {
+        elementsVector_.~vector();
+        for (const auto ii : c10::irange(rhs.inlineSize_)) {
+          new (&elementsInline_[ii]) IValue(rhs.elementsInline_[ii]);
+        }
+      } else {
+        elementsVector_ = rhs.elementsVector_;
+      }
+    }
+    inlineSize_ = rhs.inlineSize_;
+    return *this;
+  }
+
+  TupleElements(TupleElements&& rhs) noexcept
+  : inlineSize_(rhs.inlineSize_) {
+    if (inlineSize_) {
+      for (const auto ii : c10::irange(inlineSize_)) {
+        new (&elementsInline_[ii]) IValue(std::move(rhs.elementsInline_[ii]));
+      }
+    } else {
+      new (&elementsVector_) std::vector<IValue>(std::move(rhs.elementsVector_));
+    }
+  }
+
+  TupleElements& operator=(TupleElements&& rhs) noexcept {
+    if (inlineSize_) {
+      if (rhs.inlineSize_) {
+        for (const auto ii : c10::irange(std::min(inlineSize_, rhs.inlineSize_))) {
+          elementsInline_[ii] = std::move(rhs.elementsInline_[ii]);
+        }
+        if (rhs.inlineSize_ > inlineSize_) {
+          for (const auto ii : c10::irange(inlineSize_, rhs.inlineSize_)) {
+            new (&elementsInline_[ii]) IValue(std::move(rhs.elementsInline_[ii]));
+          }
+        } else {
+          for (const auto ii : c10::irange(rhs.inlineSize_, inlineSize_)) {
+            elementsInline_[ii].~IValue();
+          }
+        }
+      } else {
+        destroyInline();
+        new (&elementsVector_) std::vector<IValue>(std::move(rhs.elementsVector_));
+      }
+    } else {
+      if (rhs.inlineSize_) {
+        elementsVector_.~vector();
+        for (const auto ii : c10::irange(rhs.inlineSize_)) {
+          new (&elementsInline_[ii]) IValue(std::move(rhs.elementsInline_[ii]));
+        }
+      } else {
+        elementsVector_ = std::move(rhs.elementsVector_);
+      }
+    }
+    inlineSize_ = rhs.inlineSize_;
+    return *this;
+  }
+
+  C10_NODISCARD c10::ArrayRef<IValue> asArrayRef() const {
+    if (inlineSize_) {
+      return c10::ArrayRef<IValue>(elementsInline_, inlineSize_);
+    } else {
+      return elementsVector_;
+    }
+  }
+
+  // Mimic implicit conversion from std::vector to ArrayRef.
+  operator c10::ArrayRef<IValue>() const {
+    return asArrayRef();
+  }
+
+  static size_t hash(const TupleElements& v) {
+    return c10::hash<c10::ArrayRef<IValue>>()(v.asArrayRef());
+  }
+
+  void setContents(std::vector<IValue>&& contents) {
+    if (inlineSize_) {
+      destroyInline();
+      new (&elementsVector_) std::vector<IValue>(std::move(contents));
+      inlineSize_ = 0;
+    } else {
+      elementsVector_ = std::move(contents);
+    }
+  }
+
+  C10_NODISCARD bool empty() const {
+    return inlineSize_ ? false : elementsVector_.empty();
+  }
+
+  C10_NODISCARD size_t size() const {
+    return inlineSize_ ? inlineSize_ : elementsVector_.size();
+  }
+
+  C10_NODISCARD IValue& operator[](size_t idx) {
+    if (inlineSize_) {
+      return elementsInline_[idx];
+    } else {
+      return elementsVector_[idx];
+    }
+  }
+
+  C10_NODISCARD const IValue& operator[](size_t idx) const {
+    if (inlineSize_) {
+      return elementsInline_[idx];
+    } else {
+      return elementsVector_[idx];
+    }
+  }
+
+  C10_NODISCARD IValue& at(size_t idx) {
+    if (inlineSize_) {
+      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(inlineSize_ <= 3);
+      TORCH_CHECK(idx < inlineSize_, "TupleElements: invalid index Index = ", idx, "; Length = ", inlineSize_);
+      return elementsInline_[idx];
+    } else {
+      return elementsVector_.at(idx);
+    }
+  }
+
+  C10_NODISCARD const IValue& at(size_t idx) const {
+    if (inlineSize_) {
+      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(inlineSize_ <= 3);
+      TORCH_CHECK(idx < inlineSize_, "TupleElements: invalid index Index = ", idx, "; Length = ", inlineSize_);
+      return elementsInline_[idx];
+    } else {
+      TORCH_CHECK(idx < elementsVector_.size(), "TupleElements: invalid index Index = ", idx, "; Length = ", elementsVector_.size());
+      return elementsVector_.at(idx);
+    }
+  }
+
+  C10_NODISCARD iterator begin() {
+    if (inlineSize_) {
+      return elementsInline_;
+    } else {
+      return elementsVector_.data();
+    }
+  }
+
+  C10_NODISCARD iterator end() {
+    if (inlineSize_) {
+      return elementsInline_ + inlineSize_;
+    } else {
+      return elementsVector_.data() + elementsVector_.size();
+    }
+  }
+
+  C10_NODISCARD const_iterator begin() const {
+    if (inlineSize_) {
+      return elementsInline_;
+    } else {
+      return elementsVector_.data();
+    }
+  }
+
+  C10_NODISCARD const_iterator end() const {
+    if (inlineSize_) {
+      return elementsInline_ + inlineSize_;
+    } else {
+      return elementsVector_.data() + elementsVector_.size();
+    }
+  }
+
+  C10_NODISCARD const_iterator cbegin() const {
+    return begin();
+  }
+
+  C10_NODISCARD const_iterator cend() const {
+    return end();
+  }
+
+  C10_NODISCARD std::vector<IValue> vec() const & {
+    return asArrayRef().vec();
+  }
+
+  C10_NODISCARD IValue& back() {
+    return *(end() - 1);
+  }
+
+  C10_NODISCARD const IValue& back() const {
+    return *(end() - 1);
+  }
+
+  C10_NODISCARD std::vector<IValue> vec() && {
+    std::vector<IValue> result;
+    result.reserve(size());
+    for (auto&& iv : *this) {
+      result.push_back(std::move(iv));
+    }
+    return result;
+  }
+
+  // More compatibility shims for the overwhelming amount of code that
+  // likes to copy tuple elements into a vector; see comment above the
+  // copy constructor.
+  operator std::vector<IValue>() const & {
+    return vec();
+  }
+
+  operator std::vector<IValue>() && {
+    return vec();
+  }
+};
+
+template <typename T>
+struct TupleTypeFactory {};
+
+template <>
+struct TORCH_API TupleTypeFactory<TupleType> {
+  static TupleTypePtr create(std::vector<TypePtr> types) {
+    return TupleType::create(std::move(types));
+  }
+  static TupleTypePtr fallback(const Type& type);
+};
+
+template <>
+struct TORCH_API TupleTypeFactory<c10::DynamicType> {
+  static DynamicTypePtr create(const std::vector<TypePtr>& elemTypes);
+  static DynamicTypePtr fallback(const Type&);
+};
+
+struct TORCH_API Tuple : c10::intrusive_ptr_target {
+ private:
+  TupleElements elements_;
+  mutable c10::TypePtr type_; // lazily computed for unnamed tuples
+
+ public:
+  // named tuples have additional type information, so we
+  // directly create them tagged
+  static c10::intrusive_ptr<Tuple> createNamed(
+      std::vector<IValue> elements_,
+      c10::TypePtr type_) {
+    return c10::make_intrusive<Tuple>(std::move(elements_), std::move(type_));
+  }
+
+  static c10::intrusive_ptr<Tuple> createNamed(
+      TupleElements elements_,
+      std::shared_ptr<TupleType> type_) {
+    return c10::make_intrusive<Tuple>(std::move(elements_), std::move(type_));
+  }
+
+  static c10::intrusive_ptr<Tuple> createNamed(
+      std::initializer_list<IValue> elements_,
+      std::shared_ptr<TupleType> type_) {
+    return createNamed(TupleElements(c10::ArrayRef<IValue>(elements_)), std::move(type_));
+  }
+
+  // MSVC apparently can't disambiguate the other two overloads of
+  // create when passed an initializer_list without this.
+  static c10::intrusive_ptr<Tuple> create(std::initializer_list<IValue> elements_) {
+    return create(c10::ArrayRef<IValue>(elements_));
+  }
+
+  static c10::intrusive_ptr<Tuple> create(std::vector<IValue> elements_) {
+    return c10::make_intrusive<Tuple>(std::move(elements_));
+  }
+
+  static c10::intrusive_ptr<Tuple> create(TupleElements elements_) {
+    return c10::make_intrusive<Tuple>(std::move(elements_));
+  }
+
+  static c10::intrusive_ptr<Tuple> create(c10::ArrayRef<IValue> elements_) {
+    return create(TupleElements(elements_));
+  }
+
+  static c10::intrusive_ptr<Tuple> create(IValue e1) {
+    return c10::make_intrusive<Tuple>(std::move(e1));
+  }
+
+  static c10::intrusive_ptr<Tuple> create(IValue e1, IValue e2) {
+    return c10::make_intrusive<Tuple>(std::move(e1), std::move(e2));
+  }
+
+  static c10::intrusive_ptr<Tuple> create(IValue e1, IValue e2, IValue e3) {
+    return c10::make_intrusive<Tuple>(std::move(e1), std::move(e2), std::move(e3));
+  }
+
+ private:
+  // Workaround inability to use `>` operator in template argument list.
+  template <typename... Args>
+  static constexpr bool hasMoreThanThreeArgs() {
+    return sizeof...(Args) > 3;
+  }
+
+ public:
+  template <typename... Args>
+  static c10::intrusive_ptr<Tuple> create(Args&&... elements_) {
+    switch (sizeof...(Args)) {
+      case 1:
+      case 2:
+      case 3:
+        return create(IValue(std::forward<Args>(elements_))...);
+      default:
+        return create(
+            std::vector<IValue>{IValue(std::forward<Args>(elements_))...});
+    }
+  }
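+
+  // [Editorial note, illustrative only -- not part of the upstream header]
+  // Sketch of the factory functions above:
+  //   auto tup = c10::ivalue::Tuple::create(1, 2.0, true);
+  //   const auto& elems = tup->elements();  // borrow, avoiding element copies
+  //   TORCH_INTERNAL_ASSERT(elems.size() == 3);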
+
+  // Again, it would be nice to make this noncopyable, but there's a
+  // lot of extant code that copies Tuples.
+  // Tuple(const Tuple& rhs) = delete;
+
+  const TupleElements& elements() const& {
+    return elements_;
+  }
+
+  TupleElements elements() && {
+    return std::move(elements_);
+  }
+
+  void setElements(std::vector<IValue>&& elements) {
+    elements_.setContents(std::move(elements));
+  }
+
+  void setElements(TupleElements&& elements) {
+    elements_ = std::move(elements);
+  }
+
+  void unsafeSetElement(size_t idx, const IValue& element) {
+    elements_[idx] = element;
+  }
+
+  void unsafeSetElement(size_t idx, IValue&& element) {
+    elements_[idx] = std::move(element);
+  }
+
+  size_t size() const {
+    return elements_.size();
+  }
+
+  template 
+  std::shared_ptr type() const {
+    if (!type_) {
+      type_ = TupleTypeFactory::create(fmap(elements(), [&](const IValue& v) {
+        return v.type();
+      }));
+    }
+    if (auto t = type_->cast()) {
+      return t;
+    }
+    return TupleTypeFactory::fallback(*type_);
+  }
+
+  static size_t hash(const Tuple& t) {
+    return c10::get_hash(t.elements());
+  }
+
+  TORCH_API friend bool operator==(
+      const ivalue::Tuple& lhs,
+      const ivalue::Tuple& rhs);
+
+ private:
+  // NOTE: If we try to avoid the overloads without
+  // `std::shared_ptr<TupleType> type` by defaulting it to nullptr, we
+  // end up having to call (part of) the shared_ptr destructor for
+  // `type` even though we should know statically it won't do
+  // anything.
+  explicit Tuple(std::vector<IValue> elements)
+    : elements_(std::move(elements)){}
+
+  explicit Tuple(std::vector<IValue> elements, c10::TypePtr type)
+    : elements_(std::move(elements)), type_(std::move(type)) {}
+
+  explicit Tuple(TupleElements&& elements)
+    : elements_(std::move(elements)) {}
+
+  explicit Tuple(TupleElements&& elements, std::shared_ptr<TupleType> type)
+    : elements_(std::move(elements)), type_(std::move(type)) {}
+
+  explicit Tuple(IValue&& e1)
+    : elements_(std::move(e1)) {}
+
+  explicit Tuple(IValue&& e1, std::shared_ptr<TupleType> type)
+    : elements_(std::move(e1)), type_(std::move(type)) {}
+
+  explicit Tuple(IValue&& e1, IValue&& e2)
+    : elements_(std::move(e1), std::move(e2)) {}
+
+  explicit Tuple(IValue&& e1, IValue&& e2, std::shared_ptr<TupleType> type)
+    : elements_(std::move(e1), std::move(e2)), type_(std::move(type)) {}
+
+  explicit Tuple(IValue&& e1, IValue&& e2, IValue&& e3)
+    : elements_(std::move(e1), std::move(e2), std::move(e3)) {}
+
+  explicit Tuple(IValue&& e1, IValue&& e2, IValue&& e3, std::shared_ptr<TupleType> type)
+    : elements_(std::move(e1), std::move(e2), std::move(e3)), type_(std::move(type)) {}
+
+  friend class c10::intrusive_ptr<Tuple>;
+};
+
+struct Object;
+struct PyObjectHolder;
+struct EnumHolder;
+} // namespace ivalue
+
+// Future
+struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target {
+ private:
+  // Keep this private in order to force users to go through make_intrusive and
+  // thus prevent creating a Future that's not held by an intrusive_ptr.
+  explicit Future(TypePtr type, std::vector<c10::Device> devices={})
+      : type_(std::move(type)),
+        impl_(getTypeOfDevices(devices)),
+        devices_(sortAndDeduplicateDevices(impl_, std::move(devices))) {}
+
+  friend c10::intrusive_ptr<Future>;
+
+  struct FutureCallback {
+    std::function<void(Future&)> callback;
+    bool uses_future; // whether the Future& passed in is actually used
+
+    template <typename T>
+    FutureCallback(T callback, bool uses_future)
+        : callback(std::move(callback)), uses_future(uses_future) {}
+  };
+
+ public:
+  Future(const Future&) = delete;
+  Future(Future&&) = delete;
+  Future& operator=(const Future&) = delete;
+  Future& operator=(Future&&) = delete;
+
+  struct TORCH_API FutureError final : public std::exception {
+    explicit FutureError(std::string&& error_msg_)
+        : error_msg(std::move(error_msg_)) {}
+
+    FutureError() = default;
+
+    const char* what() const noexcept override {
+      return error_msg.c_str();
+    }
+
+    std::string error_msg;
+  };
+
+  /**
+   * Wait on the future until it completes.
+   */
+  void wait() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    finished_cv_.wait(lock, [&]() -> bool { return completed_; });
+    synchronizeWithCurrentStreams();
+  }
+
+  /**
+   * Wait on the future until it completes and throw an
+   * exception if an error exists.
+   */
+  void waitAndThrow() {
+    wait();
+
+    if (eptr_) {
+      std::rethrow_exception(eptr_);
+    }
+  }
+
+  /**
+   * Explicitly mark the future as completed with the output value. Optionally,
+   * the storages for all tensors in IValue can be passed as well. The DataPtrs
+   * of these storages are used to synchronize CUDA streams. If storages isn't
+   * given we will attempt to extract it from the value, if we need to (this
+   * happens if a non-empty set of devices was given to the constructor). Thus
+   * one only needs to provide storages when 1) they cannot be extracted through
+   * IValue::getSubValues() or through pickling in case of Python object; or
+   * when 2) customized storage extraction is more efficient.
+   */
+  using WeakStorage = c10::weak_intrusive_ptr<c10::StorageImpl>;
+  void markCompleted(
+      IValue value,
+      c10::optional<std::vector<WeakStorage>> storages = c10::nullopt) {
+    // Start by performing all steps that can throw, before setting any field.
+    // Do this before even acquiring the mutex, because extractStorages might
+    // acquire the GIL, which could lead to a lock inversion with our mutex.
+    // See https://github.com/pytorch/pytorch/issues/58239.
+    std::vector<WeakStorage> actualStorages;
+    std::vector<c10::Device> usedDevices;
+    try {
+      // FIXME We should always extract DataPtrs, in order to catch the case of
+      // users using CUDA values but forgetting to set devices, which currently
+      // leads to a silent synchronization/correctness issue. However, as this
+      // might worsen perf in CPU-only cases, we should only do so after careful
+      // benchmarks.
+      if (impl_.type() != c10::kCPU) {
+        actualStorages =
+            storages.has_value() ? std::move(*storages) : extractStorages(value);
+        usedDevices = getDevicesOfStorages(impl_, actualStorages);
+        ensureIsSubsetOfDevices(usedDevices, devices_);
+      }
+    } catch (const std::exception&) {
+      setError(std::current_exception());
+      return;
+    }
+
+    std::unique_lock<std::mutex> lock(mutex_);
+    TORCH_CHECK(
+        !completed(),
+        "Attempting to mark a completed Future as complete again. Note that "
+        "a Future can only be marked completed once.");
+
+    // Only set value_ and completed_ flag once all checks and preparation steps
+    // have returned successfully to allow for proper error propagation.
+    value_ = std::move(value);
+    completed_ = true;
+
+    currentDevice_ = impl_.getDevice();
+    storages_ = std::move(actualStorages);
+    for (const c10::Device& device : usedDevices) {
+      c10::Event event(impl_.type());
+      event.record(impl_.getStream(device));
+      events_.push_back(std::move(event));
+    }
+
+    std::vector<FutureCallback> cbs;
+    cbs.swap(callbacks_);
+    lock.unlock();
+
+    finished_cv_.notify_all();
+    for (auto& callback : cbs) {
+      invokeCallback(std::move(callback.callback), callback.uses_future);
+    }
+  }
+
+  void markCompleted() {
+    markCompleted(IValue{});
+  }
+
+  void setError(std::exception_ptr eptr) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    setErrorInternal(std::move(eptr), lock);
+  }
+
+  void setErrorIfNeeded(std::exception_ptr eptr) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    if (completed_) {
+      // This should be rare and shouldn't cause log spew. It's important to
+      // log errors, and that's why we have this log here.
+      std::string msg = c10::str(
+          "Skipping setting following error on the Future since "
+          "it is already marked completed (this is not necessarily "
+          "an error):\n",
+          tryRetrieveErrorMessageInternal(std::move(eptr)));
+      if (eptr_) {
+        msg += c10::str(
+            ", \nOriginal exception:\n",
+            tryRetrieveErrorMessageInternal(eptr_));
+      }
+      LOG(INFO) << msg;
+      return;
+    } else {
+      setErrorInternal(std::move(eptr), lock);
+    }
+  }
+
+  // Get the result of the current future.
+  IValue value() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    AT_ASSERT(completed());
+    if (eptr_) {
+      std::rethrow_exception(eptr_);
+    }
+    return value_;
+  }
+
+  // This accessor should only be used if we know that the future is
+  // completed() with no error.
+  const IValue& constValue() const {
+    std::unique_lock<std::mutex> lock(mutex_);
+    AT_ASSERT(completed());
+    TORCH_INTERNAL_ASSERT(
+      !eptr_,
+      "value() accessor should only be used when future is not completed with ",
+      "an error, but future had the following error: ",
+      tryRetrieveErrorMessageInternal(eptr_)
+    );
+    return value_;
+  }
+
+  // This accessor should only be used if we know that the future is
+  // completed() with no error.
+  const std::vector<WeakStorage>& storages() const {
+    std::unique_lock lock(mutex_);
+    AT_ASSERT(completed());
+    AT_ASSERT(!eptr_);
+    return storages_;
+  }
+
+  /**
+   * Add a callback to the future.
+   * The callbacks will be executed once the future completes.
+   * If the future has already completed,
+   * this function will execute the callback immediately.
+   */
+  template <typename T>
+  void addCallback(T callback, bool uses_future = true) {
+#if __cpp_lib_is_invocable >= 201703
+    static_assert(
+        std::is_invocable_r<void, T, Future&>::value,
+        "The callback must have signature void(Future&)");
+#endif
+
+    std::unique_lock lock(mutex_);
+    if (completed()) {
+      lock.unlock();
+      invokeCallback(std::move(callback), uses_future);
+      return;
+    }
+    callbacks_.emplace_back(std::move(callback), uses_future);
+  }
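+
+  // Illustrative sketch (not part of the original header): how a caller might
+  // attach a completion callback. The handle `fut` is hypothetical. If the
+  // future is already completed, addCallback runs the lambda immediately;
+  // otherwise it runs when markCompleted()/setError() fires.
+  //
+  //   c10::intrusive_ptr<ivalue::Future> fut =
+  //       c10::make_intrusive<ivalue::Future>(IntType::get());
+  //   fut->addCallback([](ivalue::Future& f) {
+  //     if (f.hasError()) { /* inspect f.exception_ptr() */ }
+  //     else { int64_t v = f.value().toInt(); (void)v; }
+  //   });
+  //   fut->markCompleted(IValue(42));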
+
+  /**
+   * Add a callback to the future, and return another Future to hold the return
+   * value of the callback. This is necessary when the callback provider needs
+   * to know for sure when the callback has finished.
+   */
+  template <typename T>
+  c10::intrusive_ptr<Future> then(T callback, TypePtr type) {
+    using IValueWithStorages = std::tuple<IValue, std::vector<WeakStorage>>;
+#if __cpp_lib_is_invocable >= 201703
+    static_assert(
+        std::disjunction<
+            std::is_invocable_r<IValue, T, Future&>,
+            std::is_invocable_r<IValueWithStorages, T, Future&>>::value,
+        "The callback must have signature IValue(Future&) or "
+        "std::tuple<IValue, std::vector<Storage>>(Future&)");
+#endif
+    auto childFut = createInstance(::std::move(type));
+    addCallback([childFut,
+                 cb = std::move(callback)](Future& parentFut) mutable {
+      try {
+        if constexpr (::std::is_convertible_v<typename std::invoke_result_t<T, Future&>, IValueWithStorages>) {
+          auto [ivalue, storages] = cb(parentFut);
+          childFut->markCompleted(::std::move(ivalue), ::std::move(storages));
+        } else {
+          childFut->markCompleted(cb(parentFut));
+        }
+      } catch (std::exception&) {
+        childFut->setError(std::current_exception());
+      }
+    });
+    return childFut;
+  }
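+
+  // Illustrative sketch (assumption, not in the original source): chaining
+  // work with then(). `parent` and the callback body are hypothetical; the
+  // returned child future completes with the callback's IValue, or with its
+  // exception if the callback throws.
+  //
+  //   c10::intrusive_ptr<ivalue::Future> parent =
+  //       c10::make_intrusive<ivalue::Future>(IntType::get());
+  //   auto child = parent->then(
+  //       [](ivalue::Future& f) { return IValue(f.value().toInt() + 1); },
+  //       IntType::get());
+  //   parent->markCompleted(IValue(1));  // child now completes with 2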
+
+  template <typename T>
+  c10::intrusive_ptr<Future> thenAsync(T callback, TypePtr type) {
+#if __cpp_lib_is_invocable >= 201703
+    static_assert(
+        std::is_invocable_r<c10::intrusive_ptr<Future>, T, Future&>::value,
+        "The callback must have signature c10::intrusive_ptr<Future>(Future&)");
+#endif
+    auto childFut = createInstance(std::move(type));
+    addCallback(
+        [childFut, cb = std::move(callback)](Future& parentFut) mutable {
+          c10::intrusive_ptr<Future> intermediateFut;
+          try {
+            intermediateFut = cb(parentFut);
+          } catch (std::exception&) {
+            childFut->setError(std::current_exception());
+            return;
+          }
+          intermediateFut->addCallback(
+              [childFut = std::move(childFut)](Future& intermediateFut) {
+                if (intermediateFut.hasError()) {
+                  childFut->setError(intermediateFut.exception_ptr());
+                } else {
+                  childFut->markCompleted(
+                      intermediateFut.value(), intermediateFut.storages());
+                }
+              });
+        });
+    return childFut;
+  }
+
+  // Tries to retrieve the error message from std::exception_ptr.
+  std::string tryRetrieveErrorMessage() const {
+    TORCH_CHECK(hasError(), "No error present on the future.");
+    std::unique_lock lock(mutex_);
+    return tryRetrieveErrorMessageInternal(eptr_);
+  }
+
+  // Check if the current future has completed
+  bool completed() const {
+    return completed_;
+  }
+
+  bool hasValue() const {
+    std::unique_lock lock(mutex_);
+    return completed_ && !eptr_;
+  }
+
+  bool hasError() const {
+    std::unique_lock lock(mutex_);
+    return eptr_ ? true : false;
+  }
+
+  std::exception_ptr exception_ptr() const {
+    std::unique_lock lock(mutex_);
+    return eptr_;
+  }
+
+  TORCH_API friend std::ostream& operator<<(
+      std::ostream& out,
+      const Future& v);
+
+  const TypePtr& elementType() const {
+    return type_;
+  }
+
+  const std::vector<c10::Device>& devices() const {
+    return devices_;
+  }
+
+  // This method should be used when one intends to manually create a child
+  // future, for example when implementing a customized version of then().
+  c10::intrusive_ptr<Future> createInstance(at::TypePtr type) {
+    return c10::make_intrusive<Future>(std::move(type), devices_);
+  }
+
+ private:
+
+  // This method should always be used when invoking a callback (regardless of
+  // how/when that happens) as it will ensure that the proper "environment" is
+  // set up before running the callback, as in, it will set up the CUDA streams,
+  // synchronize them with the value, and so on (if needed).
+  template <typename T>
+  void invokeCallback(T callback, bool uses_future) {
+#if __cpp_lib_is_invocable >= 201703
+    static_assert(
+        std::is_invocable_r<void, T, Future&>::value,
+        "The callback must have signature void(Future&)");
+#endif
+
+    // The synchronization performed below shouldn't be needed when the future
+    // is not used by the callback.
+    if (uses_future) {
+      c10::OptionalDeviceGuard deviceGuard(currentDevice_);
+
+      std::vector<c10::Stream> streams;
+      streams.reserve(devices_.size());
+      for (const c10::Device& device : devices_) {
+        streams.push_back(impl_.getStreamFromGlobalPool(device));
+      }
+      c10::MultiStreamGuard streamGuard(streams);
+      synchronizeWithCurrentStreams();
+      callback(*this);
+    } else {
+      callback(*this);
+    }
+  }
+
+  // This method should be called before this future's value is used, as it
+  // ensures that the CUDA streams that are "current" at the callsite properly
+  // synchronize with the value.
+  void synchronizeWithCurrentStreams() {
+    for (c10::Event& event : events_) {
+      event.block(impl_.getStream(event.device()));
+    }
+
+    for (const WeakStorage& weak_storage : storages_) {
+      c10::intrusive_ptr<c10::StorageImpl> storage = weak_storage.lock();
+      if (!storage) {
+        continue;
+      }
+      if (!storage->device().is_cpu()) {
+        impl_.recordDataPtrOnStream(
+            storage->data_ptr(), impl_.getStream(storage->device()));
+      }
+    }
+  }
+
+  void setErrorInternal(
+      std::exception_ptr eptr,
+      std::unique_lock<std::mutex>& lock) {
+    TORCH_CHECK(
+        !eptr_,
+        "Error already set on this Future: ",
+        tryRetrieveErrorMessageInternal(eptr_),
+        ", trying to set error: ",
+        tryRetrieveErrorMessageInternal(eptr));
+    TORCH_INTERNAL_ASSERT(!completed(), "Future is already marked completed");
+    completed_ = true;
+    eptr_ = std::move(eptr);
+
+    std::vector cbs;
+    cbs.swap(callbacks_);
+    lock.unlock();
+
+    finished_cv_.notify_all();
+    for (auto& callback : cbs) {
+      invokeCallback(std::move(callback.callback), callback.uses_future);
+    }
+  }
+
+  // Tries to retrieve the error message from std::exception_ptr.
+  std::string tryRetrieveErrorMessageInternal(std::exception_ptr eptr) const {
+    try {
+      std::rethrow_exception(std::move(eptr));
+    } catch (const std::exception& e) {
+      return e.what();
+    } catch (...) {
+      return "Unknown Exception Type";
+    }
+  }
+
+  // Defined in ivalue.cpp.
+  static std::vector<WeakStorage> extractStorages(
+      const at::IValue& value);
+
+  static std::vector<c10::Device> getDevicesOfStorages(
+      const c10::impl::VirtualGuardImpl& impl,
+      const std::vector<WeakStorage>& storages) {
+    c10::DeviceIndex deviceCount = impl.deviceCount();
+    std::vector<bool> isDeviceUsed(deviceCount, false);
+    for (const WeakStorage& weak_storage : storages) {
+      c10::intrusive_ptr<c10::StorageImpl> storage = weak_storage.lock();
+      if (!storage) {
+        continue;
+      }
+      c10::Device device = storage->device();
+      if (!device.is_cpu()) {
+        TORCH_CHECK_VALUE(
+            device.type() == impl.type(),
+            "Expected all data ptrs to be on a device of type ",
+            impl.type(),
+            ", got one on device ",
+            device);
+        isDeviceUsed[device.index()] = true;
+      }
+    }
+    std::vector<c10::Device> devices;
+    for (c10::DeviceIndex idx = 0; idx < deviceCount; idx++) {
+      if (isDeviceUsed[idx]) {
+        devices.emplace_back(impl.type(), idx);
+      }
+    }
+    return devices;
+  }
+
+  static std::string formatSetOfDevices(
+      const std::vector<c10::Device>& devices) {
+    if (devices.empty()) {
+      return "(none)";
+    }
+    std::ostringstream oss;
+    oss << devices[0];
+    for (const auto idx : c10::irange(1, devices.size())) {
+      if (idx == devices.size() - 1) {
+        oss << " and ";
+      } else {
+        oss << ", ";
+      }
+      oss << devices[idx];
+    }
+    return oss.str();
+  }
+
+  static c10::DeviceType getTypeOfDevices(
+      const std::vector<c10::Device>& devices) {
+    if (devices.empty()) {
+      return c10::kCPU;
+    }
+    c10::DeviceType deviceType = devices[0].type();
+    for (const auto idx : c10::irange(1, devices.size())) {
+      TORCH_CHECK_VALUE(
+          devices[idx].type() == deviceType,
+          "Expected all devices to be of the same type, but got a mismatch between ",
+          devices[0],
+          " and ",
+          devices[idx]);
+    }
+    return deviceType;
+  }
+
+  // We need devices to be sorted in order to use ensureIsSubsetOfDevices.
+  static std::vector<c10::Device> sortAndDeduplicateDevices(
+      const c10::impl::VirtualGuardImpl& /*impl*/,
+      std::vector<c10::Device> devices) {
+    std::sort(
+      devices.begin(), devices.end(),
+      [](const c10::Device& a, const c10::Device& b) { return a.index() < b.index(); });
+    // Deduplicate by compacting.
+    size_t targetIdx = 0;
+    for (const auto sourceIdx : c10::irange(devices.size())) {
+      TORCH_CHECK_VALUE(
+          devices[sourceIdx].has_index(),
+          "Expected devices to have indices, got ", devices[sourceIdx]);
+      if (targetIdx > 0 && devices[targetIdx - 1].index() == devices[sourceIdx].index()) {
+        // It's a duplicate, skip it.
+        continue;
+      }
+      if (sourceIdx != targetIdx) {
+        devices[targetIdx] = devices[sourceIdx];
+      }
+      targetIdx++;
+    }
+    // If there were duplicates there's now a gap at the end: trim it. Resizing
+    // requires the item type to be default-constructible (which c10::Device is
+    // not) because in principle it could be required to create new items. Since
+    // we know we'll shrink the vector, we provide a custom dummy value instead.
+    devices.resize(targetIdx, c10::Device(c10::kCPU));
+    return devices;
+  }
+
+  static void ensureIsSubsetOfDevices(
+      const std::vector<c10::Device>& subset,
+      const std::vector<c10::Device>& superset) {
+    // We assume the devices in both vectors have the same consistent type, and
+    // their indices are unique and sorted.
+    std::vector<c10::Device> excessDevices;
+    std::set_difference(
+        subset.begin(),
+        subset.end(),
+        superset.begin(),
+        superset.end(),
+        std::back_inserter(excessDevices),
+        [](const c10::Device& a, const c10::Device& b) { return a.index() < b.index(); });
+    TORCH_CHECK_VALUE(
+        excessDevices.empty(),
+        "The result contained tensors residing on device(s) ",
+        formatSetOfDevices(excessDevices),
+        " which are not among the expected device(s) ",
+        formatSetOfDevices(superset));
+  }
+
+  mutable std::mutex mutex_;
+  std::atomic_bool completed_ = {false}; // is this future complete
+  std::condition_variable finished_cv_;
+
+  IValue value_; // when finished the value
+  TypePtr type_;
+  std::vector callbacks_;
+  std::exception_ptr eptr_;
+
+  // An upcast pointer to a virtual class which allows us to manipulate events,
+  // streams, ... in a generic way, without an explicit dependency on CUDA.
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
+  const c10::impl::VirtualGuardImpl impl_;
+
+  // The device that was current when markCompleted was called, which we'll
+  // restore when invoking callbacks. It's optional because we'll only store it
+  // if the future completes successfully.
+  optional<c10::Device> currentDevice_;
+
+  // The events that correspond to the completion of the async I/O kernels. They
+  // are recorded on the appropriate streams when the future is marked completed
+  // and can then be queried/waited/blocked on. There is one event for each
+  // distinct device on which the value's tensors reside.
+  std::vector<c10::Event> events_;
+
+  // A cached version of the storages extracted from the value when the future
+  // is first marked completed.
+  std::vector<WeakStorage> storages_;
+
+  // The bounding set of devices that this future, and any of its children, is
+  // allowed to use. This is a superset of the set of devices used by the events
+  // above. We need this to know what streams (for which devices) to set as
+  // current when invoking a callback, thus allowing the callback to use devices
+  // that the parent future didn't use. This field is set to the value provided
+  // in the constructor and will be "inherited" by all child futures.
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
+  const std::vector<c10::Device> devices_;
+};
+
+struct C10_EXPORT ivalue::Await final : c10::intrusive_ptr_target {
+ private:
+  explicit Await(TypePtr elType, std::function<IValue()> fn)
+      : elType_(std::move(elType)), type_(AwaitType::create(elType_)), fn_(std::move(fn)) {}
+
+  explicit Await(TypePtr elType) : elType_(std::move(elType)), type_(AwaitType::create(elType_)) { }
+
+  friend c10::intrusive_ptr<Await>;
+
+ public:
+  Await(const Await&) = delete;
+  Await(Await&&) = delete;
+  Await& operator=(const Await&) = delete;
+  Await& operator=(Await&&) = delete;
+
+  IValue wait() {
+    if (!completed_) {
+      TORCH_CHECK(fn_, "Incompleted Await: fn can't be None");
+      value_ = fn_();
+      completed_ = true;
+      args_ = {};
+    }
+    return value_;
+  }
+
+  IValue value() {
+    TORCH_CHECK(completed_, "Await must be completed");
+    return value_;
+  }
+
+  void setFn(std::function<IValue()> fn) {
+    fn_ = std::move(fn);
+  }
+
+  bool completed() {
+    return completed_;
+  }
+
+  void markCompleted(IValue value) {
+    value_ = std::move(value);
+    completed_ = true;
+  }
+
+  TORCH_API friend std::ostream& operator<<(
+      std::ostream& out,
+      const Await& v);
+
+  const TypePtr& elementType() const {
+    return elType_;
+  }
+
+  const TypePtr& type() const {
+    return type_;
+  }
+
+  void setArgs(std::vector<IValue> args) {
+    args_ = std::move(args);
+  }
+
+  std::vector<IValue>& args() {
+    return args_;
+  }
+
+ private:
+  TypePtr elType_;
+  TypePtr type_;
+  std::vector<IValue> args_;
+  std::function<IValue()> fn_;
+  IValue value_;
+  bool completed_{};
+};
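+
+// Illustrative sketch (assumption, not part of the original header): given an
+// existing handle `aw` of type c10::intrusive_ptr<ivalue::Await>, wait() runs
+// the stored fn_ once and caches the result, after which value() is safe.
+//
+//   IValue v = aw->wait();        // computes lazily on first call
+//   if (aw->completed()) {
+//     IValue w = aw->value();     // returns the cached result
+//   }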
+
+// Input is a list of Futures with the same target type.
+// Output is a Future to the List of completed Futures.
+TORCH_API intrusive_ptr<ivalue::Future> collectAll(
+    const c10::List<c10::intrusive_ptr<ivalue::Future>>& srcs);
+// Input is a List of Futures with the same target type.
+// Output is a Future that will be updated with a seen value.
+TORCH_API intrusive_ptr<ivalue::Future> collectAny(
+    const c10::List<c10::intrusive_ptr<ivalue::Future>>& srcs);
+
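+// Illustrative sketch (assumption, not in the original source): waiting on a
+// group of futures. `futs` is a hypothetical, already-populated
+// c10::List<c10::intrusive_ptr<ivalue::Future>>.
+//
+//   auto all = collectAll(futs);  // completes once every future in `futs` completes
+//   auto any = collectAny(futs);  // completes as soon as one of them completes
+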
+// User-defined object.
+struct C10_EXPORT ivalue::Object final : c10::intrusive_ptr_target {
+ public:
+  // In general, class types hold a shared_ptr to their owning CompilationUnit,
+  // so that their type and methods do not get deallocated while the object
+  // exists. However, the CompilationUnit holds ownership of the type's graphs,
+  // so inserting a constant object into a Graph would create a reference cycle
+  // if that constant object held a shared_ptr to its CU. For these objects we
+  // instantiate them with non-owning references to their CU.
+  Object(WeakOrStrongTypePtr type, size_t numSlots) : type_(std::move(type)) {
+    slots_.resize(numSlots);
+  }
+
+  Object(StrongTypePtr type, size_t numSlots)
+      : type_(WeakOrStrongTypePtr(std::move(type))) {
+    slots_.resize(numSlots);
+  }
+
+  static c10::intrusive_ptr<Object> create(
+      WeakOrStrongTypePtr type,
+      size_t numSlots) {
+    return c10::make_intrusive<Object>(std::move(type), numSlots);
+  }
+
+  static c10::intrusive_ptr<Object> create(
+      StrongTypePtr type,
+      size_t numSlots) {
+    return c10::make_intrusive<Object>(std::move(type), numSlots);
+  }
+
+  static c10::intrusive_ptr<Object> create(ClassTypePtr classType, size_t numSlots);
+
+  /**
+   * Slot API.
+   *
+   * Attributes are stored as a simple vector so that lookups are fast at
+   * runtime. A "slot" is just an index into that vector, which can be computed
+   * statically if you have access to the class type. Use this API if you are
+   * writing compiler stuff.
+   */
+  void setSlot(size_t slot, IValue v) {
+    if (slot >= slots_.size()) {
+      // for module types, it is possible that the members of the class have
+      // expanded after the object was created. In this case, we expand
+      // the slots to the right size
+      resizeObject(slot);
+    }
+    slots_[slot] = std::move(v);
+  }
+
+  const IValue& getSlot(size_t slot) const {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(slot < slots_.size());
+    // NOTE: This lookup is fairly hot, so we use unchecked access to the
+    // vector.  Errors should still be detectable with ASan.
+    return slots_[slot];
+  }
+
+  void unsafeRemoveSlot(size_t slot) {
+    TORCH_CHECK(slot < slots_.size());
+    slots_.erase(slots_.begin() + static_cast<std::ptrdiff_t>(slot));
+  }
+
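+  // Illustrative sketch (assumption, not in the original source): slots are
+  // positional. `obj` is a hypothetical c10::intrusive_ptr<ivalue::Object>
+  // created with two slots.
+  //
+  //   obj->setSlot(0, IValue(1.5));
+  //   obj->setSlot(1, IValue(std::string("name")));
+  //   double d = obj->getSlot(0).toDouble();
+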
+  /**
+   * Attribute API.
+   *
+   * Wrappers around the slot stuff so that users can access attributes
+   * directly. Use this API if you are a user.
+   *
+   * Note: Unlike in Python, TorchScript must make a distinction between
+   * attributes (which are IValues) and methods (which are Methods). If you
+   * want a method, use `obj.type()->getMethod()`
+   */
+  IValue getAttr(const std::string& name) const;
+  void setAttr(const std::string& name, IValue v);
+  // Remove an attribute by name. The caller is responsible for the safety of
+  // this operation. We do not remove the attribute from the type because the
+  // type might be shared by multiple objects. After removing an attribute,
+  // the object is therefore in an inconsistent state: its Type declares more
+  // attributes than the object has attribute slots. The user needs to restore
+  // consistency by removing the attribute from the type as well.
+  void unsafeRemoveAttr(const std::string& name);
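+
+  // Illustrative sketch (assumption, not in the original source): name-based
+  // access resolves the slot through the object's ClassType. `obj` is a
+  // hypothetical module-like object with a "weight" attribute.
+  //
+  //   obj->setAttr("weight", IValue(at::ones({2, 2})));
+  //   at::Tensor w = obj->getAttr("weight").toTensor();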
+
+  std::string name() const;
+
+  const std::vector<IValue>& slots() const {
+    return slots_;
+  }
+  std::shared_ptr<ClassType> type() const;
+
+  std::shared_ptr<torch::jit::CompilationUnit> compilation_unit() {
+    if (type_.holds_strong_ref()) {
+      return type_.cu_.getStrongRefOrThrow();
+    } else {
+      auto weak_ptr = type_.cu_.getWeakRefOrThrow();
+      return std::shared_ptr<torch::jit::CompilationUnit>(weak_ptr);
+    }
+  }
+
+  c10::intrusive_ptr<Object> copy_to_weak_compilation_ref() const;
+
+  void unsafe_make_weak_compilation_ref() {
+    type_ = WeakOrStrongTypePtr(type_.asWeakTypePtr());
+  }
+
+  c10::intrusive_ptr<Object> copy() const;
+
+  c10::intrusive_ptr<Object> deepcopy(
+      c10::optional<at::Device> device = c10::nullopt) const;
+
+  c10::intrusive_ptr<Object> deepcopy(
+      IValue::HashAliasedIValueMap& memo,
+      c10::optional<at::Device> device = c10::nullopt) const;
+
+  bool is_weak_compilation_ref() const {
+    return !type_.holds_strong_ref();
+  }
+
+  bool is_empty_strong_compilation_ref() const {
+    return type_.holds_empty_strong_ref();
+  }
+
+ private:
+  void resizeObject(size_t slot);
+  WeakOrStrongTypePtr type_;
+  std::vector<IValue> slots_;
+};
+
+// virtual ivalue PyObjectHolder that hold a py::object, we make this virtual
+// because the py::object and refcounting logic should happen in libtorch_python
+// see concrete implementation in python_ivalue.h
+struct ivalue::PyObjectHolder : c10::intrusive_ptr_target {
+ public:
+  virtual PyObject* getPyObject() = 0;
+  virtual c10::InferredType tryToInferType() = 0;
+  virtual IValue toIValue(const TypePtr& type, c10::optional<int32_t> N = c10::nullopt) = 0;
+  virtual std::string toStr() = 0;
+  virtual std::vector<at::Tensor> extractTensors() = 0;
+
+  ~PyObjectHolder() override = default;
+};
+
+struct ivalue::EnumHolder : c10::intrusive_ptr_target {
+ public:
+  EnumHolder(std::shared_ptr<EnumType> type, std::string name, IValue value)
+      : type_(std::move(type)),
+        name_(std::move(name)),
+        value_(std::move(value)) {}
+
+  bool is(const ivalue::EnumHolder& rhs) {
+    return *this == rhs;
+  }
+
+  friend bool operator==(
+      const ivalue::EnumHolder& lhs,
+      const ivalue::EnumHolder& rhs);
+
+  TORCH_API friend std::ostream& operator<<(
+      std::ostream& out,
+      const ivalue::EnumHolder& v);
+
+  TORCH_API const std::string& qualifiedClassName() const;
+
+  const std::string& unqualifiedClassName() const;
+
+  const std::string& name() const {
+    return name_;
+  }
+
+  const IValue& value() const {
+    return value_;
+  }
+
+  std::shared_ptr<EnumType> type() const {
+    return type_;
+  }
+
+ private:
+  std::shared_ptr<EnumType> type_;
+  std::string name_;
+  IValue value_;
+};
+
+#undef TORCH_FORALL_TAGS
+
+namespace detail {
+
+struct _guarded_unsigned_long_unique_dummy final {
+  _guarded_unsigned_long_unique_dummy(int64_t){};
+};
+using _guarded_unsigned_long = std::conditional_t<
+    std::is_same<unsigned long, uint32_t>::value ||
+        std::is_same<unsigned long, uint64_t>::value,
+    _guarded_unsigned_long_unique_dummy,
+    unsigned long>;
+
+} // namespace detail
+
+inline ivalue::Object& IValue::toObjectRef() const {
+  AT_ASSERT(isObject(), "Expected Object but got ", tagKind());
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), "Attempted to create null reference");
+  return *static_cast<c10::ivalue::Object*>(payload.u.as_intrusive_ptr);
+}
+
+// note: when adding a DEFINE_TO case here you should also add a
+// toX method to IValue. These named methods are much more discoverable
+// than the to templated function.
+
+#define DEFINE_TO(T, method_name)                          \
+  template <>                                              \
+  inline T IValue::to<T>()&& {                             \
+    return static_cast<T>(std::move(*this).method_name()); \
+  }                                                        \
+  template <>                                              \
+  inline c10::detail::ivalue_to_const_ref_overload_return<T>::type IValue::to<T>() const& { \
+    typedef c10::detail::ivalue_to_const_ref_overload_return<T>::type return_type;          \
+    return static_cast<return_type>(this->method_name());                                   \
+  }
+
+DEFINE_TO(at::Tensor, toTensor)
+DEFINE_TO(at::Storage, toStorage)
+DEFINE_TO(c10::Stream, toStream)
+DEFINE_TO(float, toDouble)
+DEFINE_TO(double, toDouble)
+DEFINE_TO(c10::complex<double>, toComplexDouble)
+DEFINE_TO(unsigned char, toInt)
+DEFINE_TO(signed char, toInt)
+DEFINE_TO(unsigned short, toInt)
+DEFINE_TO(short, toInt)
+DEFINE_TO(int, toInt)
+DEFINE_TO(uint32_t, toInt)
+DEFINE_TO(uint64_t, toInt)
+DEFINE_TO(detail::_guarded_unsigned_long, toInt)
+DEFINE_TO(int64_t, toInt)
+DEFINE_TO(bool, toBool)
+DEFINE_TO(c10::intrusive_ptr<caffe2::Blob>, toBlob);
+DEFINE_TO(c10::intrusive_ptr<ivalue::ConstantString>, toString)
+DEFINE_TO(c10::intrusive_ptr<ivalue::Object>, toObject)
+DEFINE_TO(at::Scalar, toScalar)
+DEFINE_TO(c10::List<int64_t>, toIntList)
+DEFINE_TO(c10::List<double>, toDoubleList)
+DEFINE_TO(c10::List<c10::complex<double>>, toComplexDoubleList)
+DEFINE_TO(c10::List<bool>, toBoolList)
+DEFINE_TO(c10::List<at::Tensor>, toTensorList)
+DEFINE_TO(c10::impl::GenericList, toList)
+DEFINE_TO(c10::impl::GenericDict, toGenericDict)
+DEFINE_TO(c10::intrusive_ptr<ivalue::Tuple>, toTuple)
+DEFINE_TO(std::string, toStringRef)
+DEFINE_TO(c10::string_view, toStringView)
+DEFINE_TO(c10::intrusive_ptr<ivalue::Future>, toFuture)
+DEFINE_TO(c10::intrusive_ptr<ivalue::Await>, toAwait)
+DEFINE_TO(c10::intrusive_ptr<c10::RRefInterface>, toRRef)
+DEFINE_TO(c10::intrusive_ptr<at::Quantizer>, toQuantizer)
+DEFINE_TO(IValue, toIValue)
+DEFINE_TO(c10::Device, toDevice)
+DEFINE_TO(at::ScalarType, toScalarType)
+DEFINE_TO(at::Layout, toLayout)
+DEFINE_TO(at::MemoryFormat, toMemoryFormat)
+DEFINE_TO(at::QScheme, toQScheme)
+DEFINE_TO(at::Dimname, toDimname)
+DEFINE_TO(at::Generator, toGenerator)
+DEFINE_TO(c10::SymInt, toSymInt)
+DEFINE_TO(c10::SymFloat, toSymFloat)
+DEFINE_TO(c10::SymBool, toSymBool)
+
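+// Illustrative sketch (assumption, not part of the original header): the
+// DEFINE_TO instantiations above are what make the typed accessors below
+// compile. The rvalue overload moves the payload out; the const& overload
+// returns a reference or a copy per ivalue_to_const_ref_overload_return.
+//
+//   IValue iv(at::ones({2}));
+//   const at::Tensor& ref = iv.to<at::Tensor>();        // const& overload, no copy
+//   at::Tensor owned = std::move(iv).to<at::Tensor>();  // rvalue overload
+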
+template 
+struct _fake_type {};
+
+// generic_to<T> converts an IValue from a generic list or generic dict
+// to a concrete list/dict type like List<T>, Dict<...> or optional<T>.
+// Note that in the case of lists, this only works for IValue-based lists,
+// i.e. not for int64_t, double, ...
+// generic_to is an implementation detail of IValue::to and not
+// supposed to be called directly.
+// The _fake_type parameter allows us to overload
+// based on the return type.
+template <class Elem>
+// TODO this is deprecated but we don't throw a warning because a lot of ops in
+// native_functions.yaml still return std::vector.
+// C10_DEPRECATED_MESSAGE("IValues based on std::vector<T> are potentially slow
+// and deprecated. Please use torch::List<T> instead.")
+std::vector<Elem> generic_to(IValue ivalue, _fake_type<std::vector<Elem>>) {
+  // We need to do a deep copy of the vector because there might be other
+  // references to this same IValue that also use the list. We can't just
+  // move the elements out.
+  auto list = std::move(ivalue).to<List<Elem>>();
+  std::vector<Elem> result;
+  result.reserve(list.size());
+  for (Elem v : list) {
+    result.push_back(std::move(v));
+  }
+  return result;
+}
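+
+// Illustrative sketch (assumption, not in the original source): converting a
+// list-typed IValue into std::vector copies each element, so other references
+// to the same underlying list stay valid.
+//
+//   IValue iv(c10::List<int64_t>({1, 2, 3}));
+//   std::vector<int64_t> v = iv.to<std::vector<int64_t>>();  // deep copy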
+
+template <typename T>
+c10::intrusive_ptr<T> IValue::toCustomClass() && {
+  static_assert(
+      std::is_base_of<torch::CustomClassHolder, T>::value == true,
+      "toCustomClass requires that template parameter T must inherit "
+      "from torch::CustomClassHolder");
+  auto obj = toObject();
+  TORCH_CHECK(
+      obj->slots().size() == 1,
+      "Tried to cast IValue to custom class but it did "
+      "not contain a custom class!");
+  const auto* expected_type = c10::getCustomClassType<c10::intrusive_ptr<T>>().get();
+  ivalue::checkCustomClassType(expected_type, type().get());
+  auto userObj =
+      c10::static_intrusive_pointer_cast<T>(obj->getSlot(0).toCapsule());
+  return userObj;
+}
+
+template <typename T>
+c10::intrusive_ptr<T> IValue::toCustomClass() const& {
+  static_assert(
+      std::is_base_of<torch::CustomClassHolder, T>::value == true,
+      "toCustomClass requires that template parameter T must inherit "
+      "from torch::CustomClassHolder");
+  auto obj = toObject();
+  TORCH_CHECK(
+      obj->slots().size() == 1,
+      "Tried to cast IValue to custom class but it did "
+      "not contain a custom class!");
+  const auto* expected_type = c10::getCustomClassType<c10::intrusive_ptr<T>>().get();
+  ivalue::checkCustomClassType(expected_type, type().get());
+  auto userObj =
+      c10::static_intrusive_pointer_cast<T>(obj->getSlot(0).toCapsule());
+  return userObj;
+}
+
+template <typename T>
+T generic_to(IValue ivalue, _fake_type<T>) {
+  using ElemType = typename std::remove_pointer<T>::type::element_type;
+  return std::move(ivalue).toCustomClass<ElemType>();
+}
+
+template <typename T>
+tagged_capsule<T> generic_to(IValue ivalue, _fake_type<tagged_capsule<T>>) {
+  return tagged_capsule<T>{std::move(ivalue)};
+}
+
+template <typename Elem>
+c10::List<Elem> generic_to(IValue ivalue, _fake_type<c10::List<Elem>>) {
+  return impl::toTypedList<Elem>(std::move(ivalue).toList());
+}
+
+template 
+static T createVectorLikeFromList(const c10::detail::ListImpl* impl) {
+  T result;
+  result.reserve(impl->list.size());
+  for (const auto & i : impl->list) {
+    result.push_back(i.to());
+  }
+  return result;
+}
+
+template 
+static std::vector createVectorFromList(const c10::detail::ListImpl* impl) {
+  return createVectorLikeFromList>(impl);
+}
+
+template 
+std::vector createVectorFromList(const c10::List& impl) {
+  std::vector result;
+  result.reserve(impl.size());
+  for (size_t i = 0, N = impl.size(); i < N; ++i) {
+    result.push_back(impl[i]);
+  }
+  return result;
+}
+
+template 
+OptionalArray generic_to(IValue ivalue, _fake_type>) {
+  if (ivalue.isNone()) {
+    return {};
+  }
+  return createVectorFromList(
+    std::move(ivalue).to>()
+  );
+}
+
+namespace detail {
+template 
+std::array generic_to_array(
+    IValue ivalue,
+    _fake_type>,
+    std::index_sequence) {
+  // We need to do a deep copy of the array because there might be other
+  // references to this same IValue that also use the list. We can't just
+  // move the elements out.
+  auto list = std::move(ivalue).to>();
+  TORCH_CHECK(
+      list.size() == sizeof...(I),
+      "Tried to convert a List with ",
+      list.size(),
+      " elements to a fixed-size array of size ",
+      sizeof...(I));
+  return {list[I]...};
+}
+} // namespace detail
+
+template 
+std::array generic_to(
+    IValue ivalue,
+    _fake_type> ft) {
+  return detail::generic_to_array(ivalue, ft, std::make_index_sequence());
+}
+
+template 
+c10::Dict generic_to(
+    IValue ivalue,
+    _fake_type>) {
+  return impl::toTypedDict(std::move(ivalue).toGenericDict());
+}
+
+template 
+C10_DEPRECATED_MESSAGE(
+    "IValues based on std::unordered_map are slow and deprecated. Please use c10::Dict instead.")
+std::unordered_map generic_to(
+    IValue ivalue,
+    _fake_type>) {
+  std::unordered_map specialized_dict;
+
+  for (const auto& item : std::move(ivalue).toGenericDict()) {
+    specialized_dict[item.key().template to()] = item.value().template to();
+  }
+
+  return specialized_dict;
+}
+
+template 
+c10::optional generic_to(IValue ivalue, _fake_type>) {
+  if (ivalue.isNone()) {
+    return c10::nullopt;
+  }
+  return std::move(ivalue).to();
+}
+
+namespace detail {
+template 
+Tuple generic_to_tuple_impl(
+    const ivalue::TupleElements& t,
+    std::index_sequence) {
+  return std::make_tuple(
+      t[INDEX].to::type>()...);
+}
+} // namespace detail
+
+template <
+    typename... Args,
+    typename Indices = std::make_index_sequence,
+    std::enable_if_t<
+        !std::disjunction<
+            std::is_lvalue_reference...,
+            std::negation>...>::value,
+        std::nullptr_t> = nullptr>
+std::tuple generic_to(const IValue& ivalue, _fake_type>) {
+  const auto& vals = ivalue.toTupleRef().elements();
+  TORCH_CHECK(vals.size() == sizeof...(Args));
+  return detail::generic_to_tuple_impl>(vals, Indices{});
+}
+
+template 
+inline T IValue::to() && {
+  return generic_to(std::move(*this), _fake_type{});
+}
+
+template <>
+inline c10::optional IValue::to() && {
+  // In the default implementation, the IValue is destroyed with std::move.
+  // But if the unboxed type is optional we cannot destroy
+  // the IValue.
+  return generic_to(*this, _fake_type>{});
+}
+
+template 
+inline typename c10::detail::ivalue_to_const_ref_overload_return::type IValue::to() const& {
+  return generic_to(*this, _fake_type{});
+}
+
+inline c10::List IValue::toIntList() && {
+  AT_ASSERT(isIntList(), "Expected IntList but got ", tagKind());
+  return c10::List(moveToIntrusivePtr());
+}
+inline c10::List IValue::toIntList() const& {
+  AT_ASSERT(isIntList(), "Expected IntList but got ", tagKind());
+  return c10::List(toIntrusivePtr());
+}
+inline std::vector IValue::toIntVector() const {
+  AT_ASSERT(isIntList(), "Expected IntList but got ", tagKind());
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(),
+      "called toIntVector on null intrusive_ptr IValue");
+  return createVectorFromList(
+      static_cast(payload.u.as_intrusive_ptr));
+}
+inline std::vector IValue::toSymIntVector() const {
+  AT_ASSERT(isSymIntList() || isIntList(), "Expected SymIntList or IntList but got ", tagKind());
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(),
+      "called toSymIntVector on null intrusive_ptr IValue");
+  return createVectorFromList(
+      static_cast(payload.u.as_intrusive_ptr));
+}
+inline at::DimVector IValue::toDimVector() const {
+  AT_ASSERT(isIntList(), "Expected IntList but got ", tagKind());
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(),
+      "called toDimVector on null intrusive_ptr IValue");
+  return createVectorLikeFromList(
+      static_cast(payload.u.as_intrusive_ptr));
+}
+inline c10::List IValue::toDoubleList() && {
+  AT_ASSERT(isDoubleList(), "Expected DoubleList but got ", tagKind());
+  return c10::List(moveToIntrusivePtr());
+}
+inline c10::List IValue::toDoubleList() const& {
+  AT_ASSERT(isDoubleList(), "Expected DoubleList but got ", tagKind());
+  return c10::List(toIntrusivePtr());
+}
+inline std::vector IValue::toDoubleVector() const {
+  AT_ASSERT(isDoubleList(), "Expected DoubleList but got ", tagKind());
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(),
+      "called toDoubleVector on null intrusive_ptr IValue");
+  return createVectorFromList(
+      static_cast(payload.u.as_intrusive_ptr));
+}
+inline c10::List> IValue::toComplexDoubleList() && {
+  AT_ASSERT(isComplexDoubleList(), "Expected ComplexDoubleList but got ", tagKind());
+  return c10::List>(moveToIntrusivePtr());
+}
+inline c10::List> IValue::toComplexDoubleList() const& {
+  AT_ASSERT(isComplexDoubleList(), "Expected ComplexDoubleList but got ", tagKind());
+  return c10::List>(toIntrusivePtr());
+}
+inline std::vector> IValue::toComplexDoubleVector() const {
+  AT_ASSERT(isComplexDoubleList(), "Expected ComplexDoubleList but got ", tagKind());
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(),
+      "called toComplexDoubleVector on null intrusive_ptr IValue");
+  return createVectorFromList>(
+      static_cast(payload.u.as_intrusive_ptr));
+}
+inline c10::List IValue::toBoolList() && {
+  AT_ASSERT(isBoolList(), "Expected BoolList but got ", tagKind());
+  return c10::List(moveToIntrusivePtr());
+}
+inline c10::List IValue::toBoolList() const& {
+  AT_ASSERT(isBoolList(), "Expected BoolList but got ", tagKind());
+  return c10::List(toIntrusivePtr());
+}
+inline c10::List IValue::toTensorList() && {
+  AT_ASSERT(isTensorList(), "Expected TensorList but got ", tagKind());
+  return c10::List(moveToIntrusivePtr());
+}
+inline c10::List IValue::toTensorList() const& {
+  AT_ASSERT(isTensorList(), "Expected TensorList but got ", tagKind());
+  return c10::List(toIntrusivePtr());
+}
+inline std::vector IValue::toTensorVector() const {
+  AT_ASSERT(isTensorList(), "Expected TensorList but got ", tagKind());
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(),
+      "called toTensorVector on null intrusive_ptr IValue");
+  return createVectorFromList(
+      static_cast(payload.u.as_intrusive_ptr));
+}
+inline c10::List> IValue::toOptionalTensorList() && {
+  AT_ASSERT(isOptionalTensorList(), "Expected OptionalTensorList but got ", tagKind());
+  return c10::List>(moveToIntrusivePtr());
+}
+inline c10::List> IValue::toOptionalTensorList() const& {
+  AT_ASSERT(isOptionalTensorList(), "Expected OptionalTensorList but got ", tagKind());
+  return c10::List>(toIntrusivePtr());
+}
+inline std::vector> IValue::toOptionalTensorVector() const {
+  AT_ASSERT(isOptionalTensorList(), "Expected OptionalTensorList but got ", tagKind());
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(),
+      "called toOptionalTensorVector on null intrusive_ptr IValue");
+  return createVectorFromList>(
+      static_cast(payload.u.as_intrusive_ptr));
+}
+inline c10::List IValue::toList() && {
+  AT_ASSERT(isList(), "Expected GenericList but got ", tagKind());
+  return c10::List(moveToIntrusivePtr());
+}
+inline c10::List IValue::toList() const& {
+  AT_ASSERT(isList(), "Expected GenericList but got ", tagKind());
+  return c10::List(toIntrusivePtr());
+}
+inline c10::ArrayRef IValue::toListRef() const {
+  AT_ASSERT(isList(), "Expected GenericList but got ", tagKind());
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(),
+      "called toListRef on null intrusive_ptr IValue");
+  return static_cast(payload.u.as_intrusive_ptr)
+      ->list;
+}
+inline c10::Dict IValue::toGenericDict() && {
+  AT_ASSERT(isGenericDict(), "Expected GenericDict but got ", tagKind());
+  return c10::Dict(moveToIntrusivePtr());
+}
+inline c10::Dict IValue::toGenericDict() const& {
+  AT_ASSERT(isGenericDict(), "Expected GenericDict but got ", tagKind());
+  return c10::Dict(toIntrusivePtr());
+}
+inline c10::intrusive_ptr IValue::toTuple() && {
+  AT_ASSERT(isTuple(), "Expected Tuple but got ", tagKind());
+  return moveToIntrusivePtr();
+}
+inline c10::intrusive_ptr IValue::toTuple() const& {
+  AT_ASSERT(isTuple(), "Expected Tuple but got ", tagKind());
+  return toIntrusivePtr();
+}
+inline ivalue::Tuple& IValue::toTupleRef() const {
+  AT_ASSERT(isTuple(), "Expected Tuple but got ", tagKind());
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(),
+      "called toTupleRef on null intrusive_ptr IValue");
+  return *static_cast(
+      payload.u.as_intrusive_ptr);
+}
+
+inline IValue::IValue(c10::intrusive_ptr v)
+    : tag(Tag::Tuple) {
+  payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release());
+}
+template <
+    typename... Args,
+    std::enable_if_t<
+        !std::disjunction<
+            std::is_lvalue_reference...,
+            std::negation>...>::value,
+        std::nullptr_t>>
+inline IValue::IValue(const std::tuple& t)
+    : IValue(c10::guts::apply(c10::ivalue::Tuple::create, t)) {
+}
+
+template <
+    typename... Args,
+    std::enable_if_t<
+        !std::disjunction<
+            std::is_lvalue_reference...,
+            std::negation>...>::value,
+        std::nullptr_t>>
+inline IValue::IValue(std::tuple&& t)
+    : IValue(c10::guts::apply(c10::ivalue::Tuple::create, std::move(t))) {
+}
+
+inline IValue::IValue(c10::intrusive_ptr v)
+    : tag(Tag::String) {
+  payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release());
+}
+inline IValue::IValue(std::string v)
+    : IValue(ivalue::ConstantString::create(std::move(v))) {}
+
+inline IValue::IValue(c10::impl::GenericList v)
+    : tag(Tag::GenericList) {
+  payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.impl_.release());
+}
+
+template >
+inline IValue::IValue(c10::List&& v) : IValue(impl::toList(std::move(v))) {}
+template >
+inline IValue::IValue(const c10::List& v) : IValue(impl::toList(v)) {}
+template >
+inline IValue::IValue(at::ArrayRef v) : IValue(c10::List()) {
+  auto list = to>();
+  list.reserve(v.size());
+  for (const auto& e : v) {
+    list.push_back(e);
+  }
+}
+template >
+inline IValue::IValue(at::ArrayRef v) : IValue() {
+  auto vi = c10::asIntArrayRefSlowOpt(v);
+  if (vi.has_value()) {
+    // This list is entirely integers; ensure it is typed as
+    // an IntList so toIntList works
+    *this = IValue(*vi);
+  } else {
+    // This list has SymInts; type it as a SymInt
+    *this = IValue(impl::toList(c10::List()));
+    auto list = to>();
+    list.reserve(v.size());
+    for (const auto& e : v) {
+      list.push_back(e);
+    }
+  }
+}
+template >
+inline IValue::IValue(at::OptionalArrayRef mb_v) : IValue() {
+  if (!mb_v.has_value()) return;
+  *this = IValue(*mb_v);
+}
+template >
+inline IValue::IValue(const std::vector& v) : IValue() {
+  *this = IValue(at::ArrayRef(v));
+}
+template >
+inline IValue::IValue(std::vector&& v) : IValue() {
+  auto vi = c10::asIntArrayRefSlowOpt(v);
+  if (vi.has_value()) {
+    // This list is entirely integers; ensure it is typed as
+    // an IntList so toIntList works
+    *this = IValue(*vi);
+  } else {
+    // This list has SymInts; type it as a SymInt
+    *this = IValue(impl::toList(c10::List()));
+    auto list = to>();
+    list.reserve(v.size());
+    for (auto& e : v) {
+      list.push_back(std::move(e));
+    }
+  }
+}
+template >
+inline IValue::IValue(const std::vector& v) : IValue(c10::List()) {
+  auto list = to>();
+  list.reserve(v.size());
+  for (const auto& e : v) {
+    list.push_back(e);
+  }
+}
+
+template >
+inline IValue::IValue(std::vector&& v) : IValue(c10::List()) {
+  auto list = to>();
+  list.reserve(v.size());
+  if constexpr (std::is_same_v) {
+    for (auto e : v) {
+      list.push_back(e);
+    }
+  } else {
+    for (auto& e : v) {
+      list.push_back(std::move(e));
+    }
+  }
+}
+
+template >
+inline IValue::IValue(c10::OptionalArrayRef v) : IValue() {
+  if (v.has_value()) {
+    *this = IValue(std::move(*v));
+  }
+}
+
+template 
+inline IValue::IValue(std::array v) : IValue(c10::List()) {
+  auto list = to>();
+  list.reserve(v.size());
+  for (auto& e : v) {
+    list.push_back(std::move(e));
+  }
+}
+
+template >
+inline IValue::IValue(c10::IListRef v) : IValue() {
+  constexpr bool boxed_type_constructs_ivalue =
+      std::is_constructible::boxed_type>::value;
+  // First, we try to use the boxed value.
+  // If we fail (either it's not in the boxed state, or its boxed type
+  // can not construct an IValue), we fallback to copying the list.
+  if (boxed_type_constructs_ivalue && v.isBoxed()) {
+    *this = IValue(impl::toList(v.toBoxed()));
+  } else {
+    c10::List list;
+    list.reserve(v.size());
+    for (const auto& t : v) {
+      list.push_back(t);
+    }
+    *this = IValue(impl::toList(std::move(list)));
+  }
+}
+
+inline IValue::IValue(c10::impl::GenericDict v)
+    : tag(Tag::GenericDict) {
+  payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.impl_.release());
+}
+template 
+inline IValue::IValue(c10::Dict v)
+    : IValue(impl::toGenericDict(std::move(v))) {}
+
+template 
+inline IValue::IValue(std::unordered_map v)
+    : IValue(Dict()) {
+  auto dict = to>();
+  dict.reserve(v.size());
+  for (auto& e : v) {
+    dict.insert(std::move(e.first), std::move(e.second));
+  }
+}
+
+template >
+inline IValue::IValue(c10::optional v) : IValue() {
+  if (v.has_value()) {
+    *this = IValue(std::move(*v));
+  }
+}
+
+inline IValue::IValue(c10::nullopt_t) : IValue() {}
+
+inline IValue::IValue(c10::intrusive_ptr v)
+    : tag(Tag::Object) {
+  payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release());
+}
+
+inline IValue::IValue(c10::intrusive_ptr v)
+    : tag(Tag::PyObject) {
+  payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release());
+}
+
+inline IValue::IValue(c10::intrusive_ptr v)
+    : tag(Tag::Enum) {
+  payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release());
+}
+
+inline IValue IValue::make_capsule(
+    intrusive_ptr blob) {
+  IValue iv;
+  iv.tag = Tag::Capsule;
+  iv.payload.u.as_intrusive_ptr = null_to_undefined_tensor(blob.release());
+  return iv;
+}
+
+template <
+    typename T,
+    std::enable_if_t::value, int>>
+IValue::IValue(c10::intrusive_ptr custom_class) : tag(Tag::Object) {
+  auto classType = []() {
+    try {
+      return c10::getCustomClassType>();
+    } catch (const c10::Error&) {
+      throw c10::Error(
+          "Trying to instantiate a class that isn't a registered custom class: " +
+          std::string(c10::util::get_fully_qualified_type_name()),
+          "");
+    }
+  }();
+  auto ivalue_obj = c10::ivalue::Object::create(std::move(classType), /* numSlots */1);
+  ivalue_obj->setSlot(0, IValue::make_capsule(std::move(custom_class)));
+  payload.u.as_intrusive_ptr = null_to_undefined_tensor(ivalue_obj.release());
+
+}
+
+inline IValue::IValue(c10::intrusive_ptr v)
+    : tag(Tag::Future) {
+  payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release());
+}
+
+inline IValue::IValue(c10::intrusive_ptr v)
+    : tag(Tag::Await) {
+  payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release());
+}
+
+inline IValue::IValue(c10::intrusive_ptr v)
+    : tag(Tag::RRef) {
+  payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release());
+}
+
+inline IValue::IValue(c10::intrusive_ptr v)
+    : tag(Tag::Quantizer) {
+  payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release());
+}
+
+template 
+inline IValue::IValue(c10::complex c)
+    : tag(Tag::ComplexDouble) {
+  auto v = c10::make_intrusive(c);
+  payload.u.as_intrusive_ptr = v.release();
+}
+
+inline const std::string& IValue::toStringRef() const {
+  AT_ASSERT(isString(), "Expected String but got ", tagKind());
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(),
+      "called toStringRef on null intrusive_ptr IValue");
+  return static_cast(
+             payload.u.as_intrusive_ptr)
+      ->string();
+}
+inline c10::optional<std::reference_wrapper<const std::string>> IValue::
+    toOptionalStringRef() const {
+  if (isNone()) {
+    return c10::nullopt;
+  }
+  AT_ASSERT(isString(), "Expected optional but got ", tagKind());
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(),
+      "called toOptionalStringRef on null intrusive_ptr IValue");
+  return std::reference_wrapper(
+      static_cast(payload.u.as_intrusive_ptr)
+          ->string());
+}
+
+inline c10::string_view IValue::toStringView() const {
+  AT_ASSERT(isString(), "Expected String but got ", tagKind());
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(),
+      "called toStringView on null intrusive_ptr IValue");
+  return static_cast(
+        payload.u.as_intrusive_ptr)
+    ->string_view();
+}
+
+inline PyObject* IValue::toPyObject() const {
+  return toPyObjectHolder()->getPyObject();
+}
+
+template 
+inline optional IValue::toOptional() {
+  if (this->isNone()) {
+    return nullopt;
+  }
+  return this->to();
+}
+
+template 
+inline optional IValue::toOptional() const {
+  if (this->isNone()) {
+    return nullopt;
+  }
+  return this->to();
+}
+
+inline bool IValue::isCustomClass() const {
+  return torch::isCustomClass(*this);
+}
+
+inline bool IValue::isSameIdentity(const IValue& rhs) const {
+  // We choose to not use memcmp for payload check due to potential random
+  // padding characters on union type
+
+  // Semantics:
+  // 1. Immutable primitive values of the same type (Int, Double, None, Bool,
+  // Str) return value equality
+  // 2. If it is a tensor type, we need to take undefined tensor into account
+  // 3. Undefined_tensor is None and vice versa should be true
+  // 4. If it is a reference type (i.e. isIntrusivePtr()), then is True when
+  // the pointed-to object is the same.
+  // 5. False for all other comparisons.
+  if (this->isNone() && rhs.isNone()) {
+    return true;
+  } else if (this->isBool() && rhs.isBool()) {
+    // for bool type, do equality check
+    return this->toBool() == rhs.toBool();
+  } else if (this->isTensor() && rhs.isTensor()) {
+    return this->payload.as_tensor.is_same(rhs.payload.as_tensor);
+  } else if (this->isTensor() && rhs.isNone()) {
+    // special case: undefined tensor and None are the same identity
+    return !this->payload.as_tensor.defined();
+  } else if (this->isNone() && rhs.isTensor()) {
+    // special case: undefined tensor and None are the same identity
+    return !rhs.payload.as_tensor.defined();
+  } else if (this->isInt() && rhs.isInt()) {
+    return this->toInt() == rhs.toInt();
+  } else if (this->isDouble() && rhs.isDouble()) {
+    return this->toDouble() == rhs.toDouble();
+  } else if (this->isString() && rhs.isString()) {
+    return this->toStringRef() == rhs.toStringRef();
+  } else {
+    // for objects holding in IValue, do shallow compare on pointer address to
+    // testify the identity
+    return this->isIntrusivePtr() && rhs.isIntrusivePtr() &&
+        this->payload.u.as_intrusive_ptr == rhs.payload.u.as_intrusive_ptr;
+  }
+}
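+
+// Illustrative sketch (assumption, not in the original source) of the identity
+// semantics documented above:
+//
+//   IValue none;                    // None
+//   IValue undef((at::Tensor()));   // undefined tensor
+//   none.isSameIdentity(undef);     // true: None and undefined tensor match
+//   IValue a(1.0), b(1.0);
+//   a.isSameIdentity(b);            // true: primitives compare by value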
+
+namespace ivalue {
+namespace detail {
+
+template <typename T>
+IValue from_(T&& x, std::true_type) {
+  return IValue(std::forward<T>(x));
+}
+template <typename T>
+IValue from_(c10::intrusive_ptr<T> x, std::false_type) {
+  return IValue(std::move(x));
+}
+template <typename T>
+IValue from_(T&& /*x*/, std::false_type) {
+  static_assert(
+      guts::false_t<T>::value,
+      "You are calling from with a type that it doesn't support, and isn't a potential custom class (ie: is an intrusive_ptr)");
+  return IValue();
+}
+} // namespace detail
+
+template <typename T>
+IValue from(T&& x) {
+  return detail::from_(
+      std::forward<T>(x), typename std::is_constructible<IValue, T>::type{});
+}
+
+} // namespace ivalue
+
+
+template <>
+struct MaybeOwnedTraits<IValue> {
+  using owned_type = IValue;
+  using borrow_type = IValue;
+
+  static borrow_type createBorrow(const owned_type& from) {
+    if (!from.isPtrType()) {
+      return from;
+    }
+    if (from.isTensor()) {
+      return IValue(MaybeOwnedTraits<at::Tensor>::createBorrow(from.toTensor()));
+    } else {
+      return IValue(from.payload, from.tag);
+    }
+  }
+
+  static void assignBorrow(borrow_type& lhs, const borrow_type& rhs) {
+    lhs.clearToNone();
+    if (!rhs.isPtrType()) {
+      lhs = rhs;
+    } else if (rhs.isTensor()) {
+      lhs = IValue(MaybeOwnedTraits<at::Tensor>::createBorrow(rhs.toTensor()));
+    } else {
+      lhs = IValue(rhs.payload, rhs.tag);
+    }
+  }
+
+  static void destroyBorrow(borrow_type& toDestroy) {
+    toDestroy.clearToNone();
+  }
+
+  static const owned_type& referenceFromBorrow(const borrow_type& borrow) {
+    return borrow;
+  }
+
+  static const owned_type* pointerFromBorrow(const borrow_type& borrow) {
+    return &borrow;
+  }
+
+  static bool debugBorrowIsValid(const borrow_type&) {
+    return true;
+  }
+};
+
+template <>
+struct IValue::TagType<c10::Type> {
+  static TORCH_API c10::TypePtr get(const IValue&);
+};
+
+template <>
+struct IValue::TagType<c10::DynamicType> {
+  static TORCH_API c10::TypePtr get(const IValue&);
+};
+
+template <typename T>
+TypePtr IValue::type() const {
+  return IValue::TagType<T>::get(*this);
+}
+
+} // namespace c10
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/ivalue_to.h b/MLPY/Lib/site-packages/torch/include/ATen/core/ivalue_to.h
new file mode 100644
index 0000000000000000000000000000000000000000..f750de76cfa9dc1ae0b1ef975526b38d70eb8bb0
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/ivalue_to.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include 
+
+namespace at {
+class Tensor;
+} // namespace at
+
+namespace c10 {
+struct IValue;
+namespace detail {
+// Determine the return type of `IValue::to() const &`. It's a const
+// reference when possible and a copy otherwise. It is in this
+// separate header so that List can use it as well.
+template<typename T>
+struct ivalue_to_const_ref_overload_return {
+  using type = T;
+};
+
+template<>
+struct ivalue_to_const_ref_overload_return<at::Tensor> {
+  using type = const at::Tensor&;
+};
+
+template<>
+struct ivalue_to_const_ref_overload_return<std::string> {
+  using type = const std::string&;
+};
+
+template<>
+struct ivalue_to_const_ref_overload_return<IValue> {
+  using type = const IValue&;
+};
+
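+// Illustrative sketch (assumption, not part of the original header): these
+// specializations mean that, for a hypothetical IValue `iv` holding a tensor,
+// iv.to<at::Tensor>() const& yields `const at::Tensor&` (no copy), while a
+// primitive such as int64_t falls through the primary template and is copied.
+//
+//   const at::Tensor& t = iv.to<at::Tensor>();  // reference, borrows from iv
+//   int64_t n = iv2.to<int64_t>();              // plain copy
+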
+} // namespace detail
+} // namespace c10
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/jit_type.h b/MLPY/Lib/site-packages/torch/include/ATen/core/jit_type.h
new file mode 100644
index 0000000000000000000000000000000000000000..4f3a855c1f847f9ea19789cf16a697c87bb77443
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/jit_type.h
@@ -0,0 +1,2425 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace torch {
+namespace jit {
+struct Function;
+} // namespace jit
+} // namespace torch
+
+namespace c10 {
+
+template<class Key, class Value>
+class Dict;
+struct IValue;
+struct FunctionSchema;
+struct NamedType;
+using OptNameList = c10::optional<std::vector<std::string>>;
+
+void standardizeVectorForUnion(std::vector<TypePtr>& reference, std::vector<TypePtr>* to_fill);
+void standardizeVectorForUnion(std::vector<TypePtr>* to_flatten);
+
+inline bool is_contiguous_strides(
+    const IntArrayRef sizes,
+    const IntArrayRef strides) {
+  int n_dim = static_cast<int>(sizes.size());
+  if (n_dim == 0) {
+    return true;
+  }
+
+  if (strides[n_dim - 1] != 1) {
+    return false;
+  }
+
+  for (int i = n_dim - 2; i >= 0; i--) {
+    if (strides[i] != strides[i + 1] * sizes[i + 1]) {
+      return false;
+    }
+  }
+  return true;
+}
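+
+// Illustrative sketch (assumption, not in the original source): for a
+// contiguous 2x3 tensor the strides are {3, 1}, which passes the check;
+// a transposed layout such as {1, 3} does not.
+//
+//   is_contiguous_strides({2, 3}, {3, 1});  // true
+//   is_contiguous_strides({2, 3}, {1, 3});  // false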
+
+struct AnyType;
+using AnyTypePtr = SingletonTypePtr<AnyType>;
+// Any is the top of the type hierarchy, all other types are subtypes
+// T <: Any, forall T
+struct TORCH_API AnyType : public Type {
+  bool equals(const Type& rhs) const override {
+    return rhs.kind() == kind();
+  }
+  std::string str() const override {
+    return "Any";
+  }
+  static const TypeKind Kind = TypeKind::AnyType;
+  // global singleton
+  static AnyTypePtr get();
+
+ private:
+  AnyType() : Type(TypeKind::AnyType) {}
+};
+
+inline std::string toString(const Type& type) {
+  return type.str();
+}
+
+// Shim for compatibility with code that uses TypePtr.
+inline std::string toString(const TypePtr& typePtr) {
+  return toString(*typePtr);
+}
+
+inline bool operator!=(const Type& lhs, const Type& rhs) {
+  return !(lhs == rhs);
+}
+
+// common base for all types that have a single sub element
+// e.g. Future[T], Optional[T], List[T]
+template <TypeKind K, typename T>
+struct SingleElementType : public SharedType {
+  static const TypeKind Kind = K;
+
+  const TypePtr& getElementType() const {
+    return elem;
+  }
+
+  bool hasFreeVariables() const override {
+    return getElementType()->hasFreeVariables();
+  }
+
+  at::ArrayRef<TypePtr> containedTypes() const override {
+    return elem;
+  }
+
+  bool equals(const Type& rhs) const override {
+    if (auto rhs_ = rhs.cast<T>()) {
+      return *getElementType() == *rhs_->getElementType();
+    }
+    return false;
+  }
+
+ protected:
+  SingleElementType(TypePtr elem) : SharedType(Kind), elem(std::move(elem)) {
+    if (!this->elem) {
+      throw std::runtime_error(c10::str(
+            "Can not create ", typeKindToString(Kind), " with None type"));
+    }
+  }
+
+ private:
+  TypePtr elem;
+};
+
+struct UnionType;
+using UnionTypePtr = std::shared_ptr<UnionType>;
+struct TORCH_API UnionType : public SharedType {
+  friend struct Type;
+
+  static const TypeKind Kind = TypeKind::UnionType;
+
+  bool isSubtypeOfExt(const Type& rhs_, std::ostream* why_not) const override;
+
+  std::string str() const override;
+
+  static UnionTypePtr create(std::vector<TypePtr> reference);
+
+  bool equals(const Type& rhs) const override;
+
+  bool isUnionType() const override {
+    return true;
+  }
+
+  at::ArrayRef<TypePtr> containedTypes() const override {
+    return types_;
+  }
+
+  // For testing purposes only
+  at::ArrayRef<TypePtr> getTypes() const {
+    return types_;
+  }
+
+  TypePtr createWithContained(std::vector<TypePtr> contained_types) const override {
+    return create(std::move(contained_types));
+  }
+
+  bool canHoldType(const Type& type) const;
+
+  bool hasFreeVariables() const override {
+    return has_free_variables_;
+  }
+
+  c10::optional<TypePtr> toOptional() const;
+
+  c10::optional<TypePtr> subtractTypeSet(std::vector<TypePtr>& to_subtract) const;
+
+ protected:
+    explicit UnionType(std::vector<TypePtr> types, TypeKind kind=TypeKind::UnionType);
+    std::string annotation_str_impl(TypePrinter printer = nullptr) const override;
+    std::string unionStr(
+        TypePrinter printer = nullptr,
+        bool is_annotation_str = false) const;
+    // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
+    bool has_free_variables_;
+    // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
+    std::vector<TypePtr> types_;
+    // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
+    bool can_hold_none_;
+
+};
+
+struct OptionalType;
+using OptionalTypePtr = std::shared_ptr<OptionalType>;
+// This type represents an optional type. There is one `Optional` for
+// each element type. `Optional[T]` can accept both `T` and
+// `None`(`c10::nullopt` in C++)
+// Subtype hierarchy for Optional:
+//     - Optional[T] <: Optional[R] iff T <: R
+//     - T <: Optional[R] if T <: R
+//     - None <: Optional[T] for all T
+//     - Optional[T] == Union[T, None] for all T
+struct TORCH_API OptionalType : public UnionType {
+  static OptionalTypePtr create(const TypePtr& contained);
+
+  static const TypeKind Kind = TypeKind::OptionalType;
+
+  friend struct Type;
+
+  bool equals(const Type& rhs) const override;
+
+  const TypePtr& getElementType() const {
+    return contained_;
+  }
+
+  at::ArrayRef<TypePtr> containedTypes() const override {
+    return contained_;
+  }
+
+  std::string str() const override {
+    std::stringstream ss;
+    ss << getElementType()->str() << "?";
+    return ss.str();
+  }
+
+  TypePtr createWithContained(
+      std::vector<TypePtr> contained_types) const override {
+    AT_ASSERT(contained_types.size() == 1);
+    return create(contained_types[0]);
+  }
+
+  bool isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const override;
+
+  bool isUnionType() const override {
+    return true;
+  }
+
+  // common cast Optional[Tensor] for undefined tensor type
+  static TypePtr ofTensor();
+  //
+  // global singleton
+  static TypePtr get(TypePtr inner);
+
+ private:
+  explicit OptionalType(const TypePtr& contained);
+
+  TypePtr contained_;
+
+  std::string annotation_str_impl(TypePrinter printer = nullptr) const override {
+    std::stringstream ss;
+    ss << "Optional[" << getElementType()->annotation_str(std::move(printer)) << "]";
+    return ss.str();
+  }
+};
+
+template 
+inline c10::optional merge_primitive(
+    const c10::optional& a,
+    const c10::optional& b) {
+  if (a.has_value() && b.has_value() && a.value() == b.value()) {
+    return a;
+  }
+  return c10::optional{};
+}
+
+// If we see `a + b + c` and know that a, b, and c are the same size and have
+// two dimensions (WxH), then we can generate a fused kernel for them. That
+// fused kernel would likely have indexing math to handle both the W and H
+// dimensions. However, if we knew the WxH dimensions were contiguous, we could
+// pretend we only have a single dimension, simplifying the indexing logic.
+// This can be done even if the dimensions are transposed,
+// as long as a, b, and c are transposed in the same way.
+// We'd like the compiler to be able to do this dimensionality reduction,
+// but simply knowing sizes is not enough.
+// We can extend profiling to also record stride information.
+// Rather than recording specific strides,
+// we simply order the strides from smallest to largest with `stride_indices`.
+// A contiguity marker on the smallest stride (c0) indicates that the stride is
+// precisely 1; otherwise, a contiguity marker means that
+// $stride_n = size_{n-1} * stride_{n-1}$.
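+//
+// Worked example (illustrative only, not part of the original comment): a
+// contiguous 2x3 tensor has sizes [2, 3] and strides [3, 1]. Ordering the
+// strides from smallest to largest gives stride_indices [1, 0]; the smallest
+// stride is exactly 1, so it carries the contiguity marker, and stride_0
+// satisfies stride_0 = size_1 * stride_1 = 3 * 1 = 3, so it is marked
+// contiguous as well.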
+struct TORCH_API Stride {
+  Stride() = default;
+  Stride(
+      const c10::optional& stride_index,
+      c10::optional contiguous,
+      const c10::optional& stride)
+      : stride_index_(stride_index), contiguous_(contiguous), stride_(stride) {}
+
+  bool operator==(const Stride& b) const {
+    return stride_index_ == b.stride_index_ && contiguous_ == b.contiguous_ &&
+        stride_ == b.stride_;
+  }
+
+  bool isComplete() const {
+    return stride_index_ && contiguous_ && stride_;
+  }
+
+  c10::optional stride_index_;
+  c10::optional contiguous_;
+  c10::optional stride_;
+};
+
+template <>
+inline c10::optional merge_primitive(
+    const c10::optional& a,
+    const c10::optional& b) {
+  c10::optional left = a;
+  c10::optional right = b;
+  if (!left.has_value()) {
+    left = {Stride()};
+  }
+  if (!right.has_value()) {
+    right = {Stride()};
+  }
+
+  auto merged_index =
+      merge_primitive(left->stride_index_, right->stride_index_);
+  auto merged_cont = merge_primitive(left->contiguous_, right->contiguous_);
+  auto merged_stride = merge_primitive(left->stride_, right->stride_);
+  auto r = Stride(merged_index, merged_cont, merged_stride);
+  // normalize
+  if (!r.stride_index_.has_value() && !r.contiguous_.has_value() &&
+      !r.stride_.has_value()) {
+    return c10::optional{};
+  }
+
+  return r;
+}
+
+struct TORCH_API ShapeSymbol {
+  // needed for use in `std::map`
+  ShapeSymbol() : value_(-1) {}
+  // is this symbol a fixed/static dimension
+  bool is_static() const {
+    return value_ >= 0;
+  };
+  bool operator==(const ShapeSymbol& b) const {
+    return value_ == b.value_;
+  }
+  bool operator<(const ShapeSymbol& b) const {
+    return value_ < b.value_;
+  }
+
+  static ShapeSymbol fromStaticSize(int64_t val) {
+    return ShapeSymbol(val);
+  }
+  int64_t static_size() const {
+    TORCH_CHECK(is_static());
+    return value_;
+  };
+
+  int64_t value() const {
+    return value_;
+  };
+
+  static ShapeSymbol newSymbol() {
+    return fromStaticSize(-static_cast(++num_symbols));
+  };
+  friend TORCH_API std::ostream& operator<<(
+      std::ostream& os,
+      const ShapeSymbol& s);
+
+ private:
+  ShapeSymbol(int64_t val) : value_(val) {}
+  int64_t value_;
+  static std::atomic num_symbols;
+};
+
+inline ShapeSymbol merge_primitive(
+    const ShapeSymbol& a,
+    const ShapeSymbol& b) {
+  if (a.is_static() && b.is_static() && a == b) {
+    return a;
+  }
+  return ShapeSymbol::newSymbol();
+}
+
+// Shape of a Tensor represented with ShapeSymbol's. Unranked, ranked unknown
+// dims, partially known and fully known shapes are all supported.
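+//
+// Construction sketch (illustrative only, using the constructors declared
+// below; not part of the original header):
+//   SymbolicShape unranked;                               // rank unknown
+//   SymbolicShape ranked(c10::optional<size_t>(3));       // rank 3, dims unknown
+//   std::vector<c10::optional<int64_t>> d = {2, c10::nullopt};
+//   SymbolicShape partial(d);                             // [2, *]
+//   SymbolicShape complete(c10::IntArrayRef({2, 3, 4}));  // [2, 3, 4]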
+struct TORCH_API SymbolicShape {
+  // Unranked shape constructor.
+  SymbolicShape() : dims_(c10::nullopt) {}
+
+  // Known rank but unknown dimensions.
+  SymbolicShape(c10::optional rank) : dims_(c10::nullopt) {
+    if(!rank) {
+      return;
+    }
+
+    std::vector shape_symbols;
+    shape_symbols.reserve(*rank);
+    for(size_t i = 0; i < *rank; ++i) {
+      shape_symbols.push_back(ShapeSymbol::newSymbol());
+    }
+    dims_ = shape_symbols;
+  }
+
+  // Mix of known and unknown dimensions.
+  SymbolicShape(const std::vector>& dims) {
+    std::vector shape_symbols;
+    shape_symbols.reserve(dims.size());
+    for(c10::optional dim: dims) {
+      if(!dim) {
+        shape_symbols.push_back(ShapeSymbol::newSymbol());
+      } else {
+        shape_symbols.push_back(ShapeSymbol::fromStaticSize(*dim));
+      }
+    }
+    dims_ = shape_symbols;
+  }
+
+  void dump() const;
+
+  SymbolicShape(std::vector dims) : dims_(std::move(dims)) {}
+
+  SymbolicShape(c10::IntArrayRef dims) {
+    std::vector shape_symbols;
+    shape_symbols.reserve(dims.size());
+    for(int64_t dim : dims) {
+      shape_symbols.push_back(ShapeSymbol::fromStaticSize(dim));
+    }
+    dims_ = shape_symbols;
+  }
+
+  ShapeSymbol operator[](size_t i) const {
+    if (!dims_) {
+      throw std::runtime_error("Rank isn't fixed");
+    }
+    return (*dims_).at(i);
+  }
+
+  ShapeSymbol at(size_t i) const {
+    if (!dims_) {
+      throw std::runtime_error("Rank isn't fixed");
+    }
+    return (*dims_).at(i);
+  }
+
+  // Returns rank or nullopt in case of unranked shape.
+  c10::optional rank() const {
+    if(!dims_) {
+      return c10::nullopt;
+    }
+    return dims_->size();
+  }
+
+  c10::optional> sizes() const {
+    return dims_;
+  }
+
+  c10::optional> symbolicDims() const {
+    if (!dims_) {
+      return c10::nullopt;
+    }
+    auto symbolic_dims = std::vector();
+    for (const ShapeSymbol& s : *dims_) {
+      symbolic_dims.push_back(!s.is_static());
+    }
+    return symbolic_dims;
+  }
+
+  // Checks whether the shape is fully defined/complete, i.e. the rank and the
+  // size of every dimension are known.
+  bool isComplete() const {
+    if(!dims_) {
+      return false;
+    }
+    for(auto d : *dims_) {
+      if(!d.is_static()) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Create a new SymbolicShape that is the result of merging self and another
+  // SymbolicShape. Only dimensions that are static and equal will be
+  // preserved.
+  // If either of the two shapes is of unknown rank, or their ranks do not
+  // match, the result will be unranked.
+  SymbolicShape merge(const SymbolicShape& other) const;
+
+  friend bool operator==(const SymbolicShape& lhs, const SymbolicShape& rhs) {
+    return lhs.dims_ == rhs.dims_;
+  }
+
+  friend bool operator!=(const SymbolicShape& lhs, const SymbolicShape& rhs) {
+    return !(lhs == rhs);
+  }
+
+  private:
+    c10::optional> dims_;
+};
+
+namespace detail {
+inline bool isComplete(const Stride& s) {
+  return s.isComplete();
+}
+
+template
+inline bool isComplete(const T& /*t*/) {
+  return true;
+}
+}
+
+template 
+struct VaryingShape {
+  using ListOfOptionalElements = std::vector>;
+  VaryingShape(const std::vector& vec)
+      : VaryingShape(ListOfOptionalElements(vec.begin(), vec.end())) {}
+
+  VaryingShape(c10::ArrayRef vec)
+      : VaryingShape(ListOfOptionalElements(vec.begin(), vec.end())) {}
+
+  VaryingShape(c10::optional size = c10::nullopt) : dims_(c10::nullopt) {
+    if (size) {
+      dims_ = ListOfOptionalElements(*size);
+    }
+  }
+
+  VaryingShape(ListOfOptionalElements dims) : dims_(std::move(dims)) {}
+
+  VaryingShape(size_t size) : VaryingShape(c10::optional(size)) {}
+
+  bool operator==(const VaryingShape& other) const {
+    return dims_ == other.dims_;
+  }
+
+  const c10::optional &operator[](size_t i) const {
+    if (!dims_) {
+      throw std::runtime_error("Rank isn't fixed");
+    }
+    return (*dims_).at(i);
+  }
+
+  c10::optional size() const {
+    if (!dims_) {
+      return c10::nullopt;
+    }
+    const auto& dims = dims_.value();
+    return dims.size();
+  }
+
+  const c10::optional& sizes() const {
+    return dims_;
+  }
+
+  TORCH_API VaryingShape merge(const VaryingShape& other) const;
+
+  c10::optional> concrete_sizes() const {
+    if (!dims_) {
+      return c10::nullopt;
+    }
+    std::vector sizes;
+    sizes.reserve(dims_.value().size());
+    for (auto d : *dims_) {
+      if (!d) {
+        return c10::nullopt;
+      }
+      sizes.push_back(d.value());
+    }
+    return sizes;
+  }
+
+  bool isComplete() const {
+    if (!dims_) {
+      return false;
+    }
+    for (auto d : *dims_) {
+      if (!d || !detail::isComplete(*d)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+ private:
+  c10::optional dims_;
+};
+
+struct TensorType;
+// TODO: investigate making this SingletonOrSharedTypePtr
+using TensorTypePtr = std::shared_ptr;
+// This type represents a single Tensor with a specific size
+struct TORCH_API TensorType : public SharedType {
+  static TensorTypePtr create(const at::Tensor& t);
+
+  // used by TensorType::create(size_t dim), which is in turn used by
+  // shape_analysis.cpp
+  static TensorTypePtr create(
+      c10::optional scalar_type,
+      c10::optional device,
+      const VaryingShape& sizes,
+      const VaryingShape& strides,
+      c10::optional requires_grad,
+      c10::optional undefined = false,
+      bool tensor_contiguity = false);
+
+  static TensorTypePtr create(
+      c10::optional scalar_type,
+      c10::optional device,
+      const SymbolicShape& sizes,
+      const VaryingShape& stride_,
+      c10::optional requires_grad,
+      c10::optional undefined = false);
+
+  static TensorTypePtr create(
+      c10::optional scalar_type,
+      c10::optional device,
+      c10::optional dim,
+      c10::optional requires_grad);
+
+  // overloaded create: the variadic template argument could not distinguish
+  // an initializer list
+  static TensorTypePtr createContiguous(
+      at::ScalarType scalar_type,
+      at::Device device,
+      at::IntArrayRef sizes);
+
+  static TypePtr fromNumberType(const Type& typ);
+  static TypePtr fromBoolType();
+
+  c10::optional dim() const {
+    return sizes().size();
+  }
+
+  VaryingShape sizes() const;
+
+  VaryingShape strides() const;
+
+  const VaryingShape& stride_properties() const {
+    return strides_;
+  }
+
+  c10::optional device() const {
+    return device_;
+  }
+  c10::optional scalarType() const {
+    return scalar_type_;
+  }
+  c10::optional requiresGrad() const {
+    return requires_grad_;
+  }
+  bool requires_grad() const override {
+    return requires_grad_ ? *requires_grad_ : true;
+  }
+
+  bool equals(const Type& rhs) const override;
+  bool isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const override;
+
+  std::string str() const override;
+
+  std::string repr_str() const override {
+    if (isInferredType()) {
+      return str() + " (inferred)";
+    } else {
+      return str();
+    }
+  }
+
+  c10::optional numel() const {
+    size_t prod = 1;
+    const auto& shape = sizes();
+
+    for (size_t i = 0; i < shape.size(); i++) {
+      if (!shape[i]) {
+        return c10::optional{};
+      }
+      prod *= shape[i].value();
+    }
+    return prod;
+  }
+
+  TensorTypePtr withRequiresGrad(c10::optional s) {
+    auto copy = clone();
+    copy->requires_grad_ = s;
+    return copy;
+  }
+
+  TensorTypePtr withScalarType(c10::optional st) {
+    auto copy = clone();
+    copy->scalar_type_ = st;
+    return copy;
+  }
+
+  TensorTypePtr withDim(c10::optional d) {
+    auto copy = clone();
+    // withDim is only used by the legacy executor,
+    // which only cares about the rank, so create dummy symbols.
+    copy->sizes_ = SymbolicShape(d);
+    copy->strides_ = VaryingShape(d);
+    return copy;
+  }
+
+  TensorTypePtr withStrides(VaryingShape sstrides) const {
+    auto cloned = clone();
+    cloned->strides_ = std::move(sstrides);
+    return cloned;
+  }
+
+  TensorTypePtr withSizesStrides(
+      at::IntArrayRef sizes,
+      at::IntArrayRef strides) const {
+    auto cloned = clone();
+    auto ssizes = SymbolicShape(sizes);
+    cloned->sizes_ = ssizes;
+    cloned->strides_ = computeStrideProps(sizes, strides);
+    return cloned;
+  }
+
+  TensorTypePtr withSymbolicShapes(SymbolicShape ssizes) const {
+    auto cloned = clone();
+    cloned->sizes_ = std::move(ssizes);
+    return cloned;
+  }
+
+  TensorTypePtr withSizes(at::IntArrayRef sizes) const {
+    return withSizesStrides(
+        sizes, contiguousStridesOf(sizes));
+  }
+
+  TensorTypePtr withDevice(const c10::optional device) const {
+    auto copy = clone();
+    copy->device_ = device;
+    return copy;
+  }
+
+  TensorTypePtr dimensionedOnly() const {
+    auto copy = clone();
+    copy->sizes_ = SymbolicShape(sizes().size());
+    copy->strides_ = VaryingShape(sizes().size());
+    return copy;
+  }
+
+  TensorTypePtr contiguous() const {
+    auto cloned = clone();
+    TORCH_INTERNAL_ASSERT(sizes().concrete_sizes().has_value());
+    auto strides = computeStrideProps(
+        *sizes().concrete_sizes(),
+        contiguousStridesOf(*sizes().concrete_sizes()));
+    cloned->strides_ = strides;
+    return cloned;
+  }
+
+  const SymbolicShape& symbolic_sizes() const;
+
+  TensorTypePtr merge(const TensorType& other, bool merge_sizes = true) const;
+
+  bool matchTensor(const at::Tensor& t);
+
+  // Is all information about the type specified except for autograd?
+  // This replaces the notion of a 'CompleteTensorType' that used to exist
+  // in the type hierarchy. Excluding requires_grad and undefined allows
+  // this to match the old behavior.
+  bool isComplete() const {
+    return scalar_type_ && device_ && sizes_.isComplete() && strides_.isComplete();
+  }
+
+  bool isInferredType() const {
+    return is_inferred_;
+  }
+
+  static TensorTypePtr getInferred() {
+    static auto valueInferred = TensorType::create(
+        /*scalar_type=*/{},
+        /*device=*/{},
+        /*sizes=*/SymbolicShape(),
+        /*stride=*/VaryingShape{},
+        /*requires_grad=*/{},
+        /*undefined=*/false);
+    valueInferred->is_inferred_ = true;
+    return valueInferred;
+  }
+
+  // this property is used by GuardElimination
+  // please see `checkInputs` for more details
+  bool isSummarized() const {
+    return !(isComplete() && requiresGrad().has_value() &&
+             undefined().has_value());
+  }
+
+  TensorTypePtr withUndefined() {
+    auto r = clone();
+    r->undefined_ = true;
+    return r;
+  }
+
+  TensorTypePtr withPossiblyUndefined() {
+    auto r = clone();
+    r->undefined_ = c10::nullopt;
+    return r;
+  }
+
+  c10::optional undefined() const { return undefined_; }
+
+  static const TensorTypePtr& get();
+
+  static const TypeKind Kind = TypeKind::TensorType;
+
+  static std::vector contiguousStridesOf(
+      at::IntArrayRef in_sizes,
+      at::MemoryFormat memory_format = MemoryFormat::Contiguous) {
+    auto contiguous_fn = [](const at::IntArrayRef& sizes,
+                            const std::vector& dim_order) {
+      std::vector strides(sizes.size());
+      if (sizes.empty()) // zero-dim case
+        return strides;
+
+      strides[dim_order[0]] = 1;
+      for (size_t i = 1; i < dim_order.size(); i++) {
+        auto cur_dim = dim_order[i];
+        auto pre_dim = dim_order[i - 1];
+        strides[cur_dim] = strides[pre_dim] * sizes[pre_dim];
+      }
+      return strides;
+    };
+
+    std::vector dim_order(in_sizes.size());
+    if (memory_format == MemoryFormat::ChannelsLast) {
+      dim_order = {1, 3, 2, 0};
+    } else if (memory_format == MemoryFormat::ChannelsLast3d) {
+      dim_order = {1, 4, 3, 2, 0};
+    } else {
+      auto ndims = in_sizes.size();
+      for (size_t i = 0; i < ndims; i++) {
+        dim_order[i] = static_cast(ndims - i - 1); // Reverse
+      }
+    }
+    return contiguous_fn(in_sizes, dim_order);
+  }
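+  // For instance (illustrative): with in_sizes = {1, 3, 4, 4} (NCHW), the
+  // default Contiguous path yields strides {48, 16, 4, 1}, while
+  // MemoryFormat::ChannelsLast walks the dims in the order {1, 3, 2, 0} and
+  // yields strides {48, 1, 12, 3}.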
+
+ private:
+  TensorType(
+      c10::optional scalar_type,
+      c10::optional device,
+      SymbolicShape sizes,
+      VaryingShape strides,
+      c10::optional requires_grad,
+      c10::optional undefined = false);
+
+  TensorTypePtr clone() const {
+    return TensorTypePtr(new TensorType(
+        scalar_type_, device_, sizes_, strides_, requires_grad_, undefined_));
+  }
+
+  static VaryingShape computeStrideProps(
+      at::IntArrayRef sizes,
+      at::IntArrayRef strides,
+      bool tensor_contiguity = false);
+
+  c10::optional scalar_type_;
+  c10::optional device_;
+  SymbolicShape sizes_;
+  VaryingShape strides_;
+  c10::optional requires_grad_;
+  // We exploit the fact that certain tensors must be zero in the autograd to
+  // optimize gradient computation. Such zero tensors are currently implemented
+  // with `UndefinedTensorImpl`. They can be handled only by special operators
+  // (e.g. `AutogradAdd`) and their `Tensor::defined()` property returns false.
+  // Normally, `undefined_` is set to false, unless a type was created
+  // with `withUndefined`.
+  // This also means that `undefined` tensors will fail the
+  // `subtypeOf(TensorType::get())` check.
+  // undefined_ may become `c10::nullopt` if the tensor was observed to be both
+  // defined and undefined. However, no tensor type starts out with
+  // `undefined_` set to `c10::nullopt`.
+  c10::optional undefined_;
+  // Represents whether or not this type was inferred.
+  bool is_inferred_ = false;
+};
+
+struct ListType;
+using ListTypePtr = std::shared_ptr;
+struct TORCH_API ListType
+    : public SingleElementType {
+  // It's not exactly a singleton, but there should be exactly one instance of
+  // List[T] for every T
+  friend struct Type;
+  template 
+  static ListTypePtr create(T&&... all) {
+    return ListTypePtr(
+        new ListType(std::forward(all)...)); // NOLINT(modernize-make-shared)
+  }
+
+  std::string str() const override {
+    std::stringstream ss;
+    ss << getElementType()->str() << "[]";
+    return ss.str();
+  }
+  TypePtr createWithContained(
+      std::vector contained_types) const override {
+    return create(std::move(contained_types.at(0)));
+  }
+
+  bool isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const override;
+
+  // global singleton
+  // Given an inner type T and an identifier,
+  // this function will return the global singleton type pointer
+  // for the type List[T].
+  // The extra "identifier" argument is needed because we have multiple container types
+  // that all re-use this function (List, array, etc.)
+  static TypePtr get(const std::string& identifier, TypePtr inner);
+
+  // common cast List[Tensor]
+  static ListTypePtr ofTensors();
+  static ListTypePtr ofOptionalTensors();
+  static ListTypePtr ofInts();
+  static ListTypePtr ofSymInts();
+  static ListTypePtr ofFloats();
+  static ListTypePtr ofComplexDoubles();
+  static ListTypePtr ofBools();
+  static ListTypePtr ofStrings();
+  static ListTypePtr ofNumbers();
+
+ private:
+  ListType(TypePtr elem) : SingleElementType(std::move(elem)) {}
+
+  std::string annotation_str_impl(TypePrinter printer = nullptr) const override {
+    std::stringstream ss;
+    ss << "List[" << getElementType()->annotation_str(std::move(printer)) << "]";
+    return ss.str();
+  }
+};
+
+struct DictType;
+using DictTypePtr = std::shared_ptr;
+struct TORCH_API DictType : public SharedType {
+  friend struct Type;
+  static const TypeKind Kind = TypeKind::DictType;
+
+  static DictTypePtr create(TypePtr key, TypePtr value) {
+    auto kind = key->kind();
+    if (auto dyn = key->castRaw()) {
+      kind = dyn->dynamicKind();
+    }
+    switch (kind) {
+      case TypeKind::AnyType:
+      case TypeKind::IntType:
+      case TypeKind::BoolType:
+      case TypeKind::FloatType:
+      case TypeKind::ComplexType:
+      case TypeKind::StringType:
+      case TypeKind::TensorType:
+      case TypeKind::DeviceObjType:
+        return DictTypePtr(new DictType(std::move(key), std::move(value)));
+      default:
+        AT_ERROR(
+            "Cannot create dict for key type '",
+            key->str(),
+            "', only int, float, complex, Tensor, device and string keys are supported");
+    }
+  }
+
+  // aligned with the format in FunctionSchema
+  std::string str() const override {
+    std::stringstream ss;
+    ss << "Dict(" << getKeyType()->str() << ", " << getValueType()->str()
+       << ")";
+    return ss.str();
+  }
+
+  TypePtr createWithContained(
+      std::vector contained_types) const override {
+    if (contained_types.size() != 2) {
+      throw std::runtime_error("Expected 2 contained types");
+    }
+    return create(std::move(contained_types.at(0)), std::move(contained_types.at(1)));
+  }
+
+  const TypePtr& getKeyType() const {
+    return types.at(0);
+  }
+
+  const TypePtr& getValueType() const {
+    return types.at(1);
+  }
+
+  bool hasFreeVariables() const override {
+    return has_free_variables;
+  }
+
+  at::ArrayRef containedTypes() const override {
+    return types;
+  }
+
+  bool equals(const Type& rhs) const override {
+    if (auto* dict_rhs = rhs.castRaw()) {
+      return *getKeyType() == *(dict_rhs->getKeyType()) &&
+          *getValueType() == *(dict_rhs->getValueType());
+    }
+    return false;
+  }
+
+  // global singleton
+  // Given key and value types and an identifier,
+  // this function will return the global singleton type pointer
+  // for the type Dict[K, V].
+  // The extra "identifier" argument is needed because we have multiple container types
+  // that all re-use this function (Dict and unordered_map)
+  static TypePtr get(const std::string& identifier, TypePtr key, TypePtr val);
+
+ private:
+  DictType(TypePtr key, TypePtr value)
+      : SharedType(TypeKind::DictType),
+        has_free_variables(
+            key->hasFreeVariables() || value->hasFreeVariables()) {
+    types.reserve(2);
+    types.push_back(std::move(key));
+    types.push_back(std::move(value));
+  }
+
+  std::string annotation_str_impl(TypePrinter printer = nullptr) const override;
+
+  std::vector types;
+  bool has_free_variables;
+};
+
+struct FutureType;
+using FutureTypePtr = std::shared_ptr;
+
+struct TORCH_API FutureType
+    : public SingleElementType {
+  friend struct Type;
+  template 
+  static FutureTypePtr create(TypePtr elem) {
+    return FutureTypePtr(
+        new FutureType(std::move(elem))); // NOLINT(modernize-make-shared)
+  }
+
+  std::string str() const override {
+    std::stringstream ss;
+    ss << "Future(" << getElementType()->str() << ")";
+    return ss.str();
+  }
+  TypePtr createWithContained(
+      std::vector contained_types) const override {
+    return create(std::move(contained_types.at(0)));
+  }
+
+  bool isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const override {
+    if (Type::isSubtypeOfExt(rhs, why_not)) {
+      return true;
+    }
+    if (auto rhs_ = rhs.castRaw()) {
+      return getElementType()->isSubtypeOfExt(*rhs_->getElementType(), why_not);
+    }
+    return false;
+  }
+
+ private:
+  FutureType(TypePtr elem) : SingleElementType(std::move(elem)) {}
+
+  std::string annotation_str_impl(TypePrinter printer = nullptr) const override {
+    std::stringstream ss;
+    ss << "Future[" << getElementType()->annotation_str(std::move(printer)) << "]";
+    return ss.str();
+  }
+};
+
+struct AwaitType;
+using AwaitTypePtr = std::shared_ptr;
+
+struct TORCH_API AwaitType
+    : public SingleElementType {
+  friend struct Type;
+  template 
+  static AwaitTypePtr create(TypePtr elem) {
+    return AwaitTypePtr(
+        new AwaitType(std::move(elem))); // NOLINT(modernize-make-shared)
+  }
+
+  std::string str() const override {
+    std::stringstream ss;
+    ss << "Await(" << getElementType()->str() << ")";
+    return ss.str();
+  }
+  TypePtr createWithContained(
+      std::vector contained_types) const override {
+    return create(std::move(contained_types.at(0)));
+  }
+
+  bool isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const override {
+    if (Type::isSubtypeOfExt(rhs, why_not)) {
+      return true;
+    }
+    if (auto rhs_ = rhs.castRaw()) {
+      return getElementType()->isSubtypeOfExt(*rhs_->getElementType(), why_not);
+    }
+    return false;
+  }
+
+ private:
+  AwaitType(TypePtr elem) : SingleElementType(std::move(elem)) {}
+
+  std::string annotation_str_impl(TypePrinter printer = nullptr) const override {
+    std::stringstream ss;
+    ss << "Await[" << getElementType()->annotation_str(printer) << "]";
+    return ss.str();
+  }
+};
+
+struct RRefType;
+using RRefTypePtr = std::shared_ptr;
+
+struct TORCH_API RRefType
+    : public SingleElementType {
+  friend struct Type;
+  template 
+  static RRefTypePtr create(TypePtr elem) {
+    return RRefTypePtr(
+        new RRefType(std::move(elem))); // NOLINT(modernize-make-shared)
+  }
+
+  std::string str() const override {
+    std::stringstream ss;
+    ss << "RRef(" << getElementType()->str() << ")";
+    return ss.str();
+  }
+  TypePtr createWithContained(
+      std::vector contained_types) const override {
+    return create(std::move(contained_types.at(0)));
+  }
+
+ private:
+  RRefType(TypePtr elem) : SingleElementType(std::move(elem)) {}
+
+  std::string annotation_str_impl(TypePrinter printer = nullptr) const override {
+    std::stringstream ss;
+    ss << "RRef[" << getElementType()->annotation_str(std::move(printer)) << "]";
+    return ss.str();
+  }
+};
+
+// Any should never appear in a named type like a class, namedtuple or
+// interface. If it does, then dynamic type information will be lost in the
+// Pickler, leading to hard-to-track-down bugs that will only occur
+// after saving or loading a model. This is because we rely on the
+// static types in named types to reconstruct type tags of loaded
+// values. Lifting this restriction requires solving the serialization
+// problem first.
+TORCH_API void checkNoAny(
+    const Type& base,
+    const char* what,
+    const std::string& attrname,
+    const TypePtr& attrtype);
+
+struct TupleType;
+using TupleTypePtr = std::shared_ptr;
+using NameList = std::vector;
+// This type represents a Tuple
+struct TORCH_API TupleType : public NamedType {
+
+  static TupleTypePtr createNamed(const c10::optional& name,
+      const std::vector& field_names,
+      const std::vector& field_types,
+      std::vector& field_defaults);
+
+  static TupleTypePtr createNamed(const c10::optional& name,
+      const std::vector& field_names,
+      const std::vector& field_types);
+
+  static TupleTypePtr createNamed(const c10::optional& name,
+      const std::vector& field_names,
+      const std::vector& field_types);
+
+  static TupleTypePtr create(
+      std::vector types) {
+    return TupleTypePtr(new TupleType(
+        std::move(types),
+        c10::nullopt,
+        nullptr)); // NOLINT(modernize-make-shared)
+  }
+  static TupleTypePtr create() {
+    return create({});
+  }
+
+  at::ArrayRef elements() const {
+    return elements_;
+  }
+
+  bool equals(const Type& rhs) const override;
+  bool isSubtypeOfExt(const Type& rhs_, std::ostream* why_not) const override;
+
+  std::string str() const override;
+  bool hasFreeVariables() const override {
+    return has_free_variables_;
+  }
+  at::ArrayRef containedTypes() const override {
+    return elements_;
+  }
+  TypePtr createWithContained(
+      std::vector contained_types) const override {
+    return std::shared_ptr(
+        new TupleType(std::move(contained_types), name(), schema()));
+  }
+  const std::shared_ptr& schema() const {
+    return schema_;
+  }
+  c10::optional> names() const;
+
+  static const TypeKind Kind = TypeKind::TupleType;
+
+ private:
+  template 
+  static TupleTypePtr createWithSpec(
+      const c10::optional& name,
+      const std::vector& field_names,
+      const std::vector& field_types,
+      std::vector& field_defaults);
+
+  TupleType(
+      std::vector elements_,
+      c10::optional name,
+      std::shared_ptr schema);
+
+  bool compare(
+      const Type& rhs,
+      const std::function& fn) const {
+    if (rhs.kind() != kind()) {
+      return false;
+    }
+
+    const auto& l_elements = elements();
+    const auto& r_elements = rhs.castRaw()->elements();
+    if (l_elements.size() != r_elements.size())
+      return false;
+    for (size_t i = 0; i < l_elements.size(); ++i) {
+      if (!fn(*l_elements[i], *r_elements[i]))
+        return false;
+    }
+    return true;
+  }
+
+  std::string annotation_str_impl(TypePrinter printer = nullptr) const override;
+
+  std::vector elements_;
+  bool has_free_variables_;
+  std::shared_ptr schema_;
+};
+
+// The common supertype of all Enums, only used in operator registration.
+// EnumType <: AnyEnumType for all Enums
+struct AnyEnumType;
+using AnyEnumTypePtr = SingletonTypePtr;
+struct TORCH_API AnyEnumType final : public Type {
+  bool equals(const Type& rhs) const override {
+    return rhs.kind() == kind();
+  }
+  std::string str() const override {
+    return "AnyEnumType";
+  }
+  static const TypeKind Kind = TypeKind::AnyEnumType;
+  // global singleton
+  static AnyEnumTypePtr get();
+private:
+  AnyEnumType()
+  : Type(TypeKind::AnyEnumType) {}
+};
+
+struct NumberType;
+using NumberTypePtr = SingletonTypePtr;
+// This type represents a Python number
+// Subtype hierarchy for Number Types (NumberType as the base type):
+// IntType <: NumberType
+// FloatType <: NumberType
+// ComplexType <: NumberType
+//
+// WARNING: if you add a new subtype of NumberType that is not
+// represented by a global singleton, you need to change NumberTypePtr
+// to a SingletonOrSharedTypePtr and deal with NumberType needing to
+// both inherit and not inherit from SharedType!
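+//
+// For example (illustrative): IntType::get()->isSubtypeOf(*NumberType::get())
+// holds, so an `int` value can be passed where a `Scalar` is expected, while
+// the reverse check NumberType::get()->isSubtypeOf(*IntType::get()) does not.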
+struct TORCH_API NumberType : public Type {
+  bool equals(const Type& rhs) const override;
+
+  bool isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const override;
+
+  std::string str() const override {
+    return "Scalar"; // match what PythonArgParser says for clarity
+  }
+  static const TypeKind Kind = TypeKind::NumberType;
+  // global singleton
+  static NumberTypePtr get();
+
+ protected:
+  NumberType(TypeKind kind = TypeKind::NumberType) : Type(kind) {}
+
+  std::string annotation_str_impl(C10_UNUSED TypePrinter printer = nullptr) const override {
+    return "number"; // technically not a valid python type, but
+                     // we need to use it when parsing back in annotations
+                     // for implicit conversions
+  }
+};
+
+struct FloatType;
+using FloatTypePtr = SingletonTypePtr;
+// This type represents a Python float number
+struct TORCH_API FloatType : public NumberType {
+  bool equals(const Type& rhs) const override {
+    return rhs.kind() == kind();
+  }
+  std::string str() const override {
+    return "float";
+  }
+  bool isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const override {
+    // NOLINTNEXTLINE(bugprone-parent-virtual-call)
+    return rhs.kind() == TypeKind::NumberType || Type::isSubtypeOfExt(rhs, why_not);
+  }
+  static const TypeKind Kind = TypeKind::FloatType;
+  // global singleton
+  static FloatTypePtr get();
+
+ private:
+  FloatType() : NumberType(TypeKind::FloatType) {}
+  std::string annotation_str_impl(C10_UNUSED TypePrinter printer = nullptr) const override {
+    return "float";
+  }
+};
+
+struct ComplexType;
+using ComplexTypePtr = SingletonTypePtr;
+// This type represents a Python complex number
+struct TORCH_API ComplexType : public NumberType {
+  bool equals(const Type& rhs) const override {
+    return rhs.kind() == kind();
+  }
+  std::string str() const override {
+    return "complex";
+  }
+  bool isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const override {
+    // NOLINTNEXTLINE(bugprone-parent-virtual-call)
+    return rhs.kind() == TypeKind::NumberType || Type::isSubtypeOfExt(rhs, why_not);
+  }
+  static const TypeKind Kind = TypeKind::ComplexType;
+  // global singleton
+  static ComplexTypePtr get();
+
+ private:
+  ComplexType() : NumberType(TypeKind::ComplexType) {}
+  std::string annotation_str_impl(C10_UNUSED TypePrinter printer = nullptr) const override {
+    return "complex";
+  }
+};
+
+// We need to introduce `SymIntType` to represent the `SymInt` type
+// used in function schemas, e.g. `aten::narrow_copy(... SymInt length)`.
+// `SymInt` will be used to enable tracing arithmetic operations on
+// dimension values. Please see [SymInt.h] for more information.
+struct SymIntType;
+using SymIntTypePtr = SingletonTypePtr;
+struct TORCH_API SymIntType : public Type {
+  bool equals(const Type& rhs) const override {
+    return rhs.kind() == kind();
+  }
+  std::string str() const override {
+    return "SymInt";
+  }
+  std::string annotation_str_impl(TypePrinter printer = nullptr) const override {
+    return "int";
+  }
+  static const TypeKind Kind = TypeKind::SymIntType;
+  // global singleton
+  static SymIntTypePtr get();
+
+ private:
+  SymIntType() : Type(TypeKind::SymIntType) {}
+};
+
+struct SymFloatType;
+using SymFloatTypePtr = SingletonTypePtr;
+struct TORCH_API SymFloatType : public Type {
+  bool equals(const Type& rhs) const override {
+    return rhs.kind() == kind();
+  }
+  std::string str() const override {
+    return "SymFloat";
+  }
+  std::string annotation_str_impl(TypePrinter printer = nullptr) const override {
+    return "float";
+  }
+  static const TypeKind Kind = TypeKind::SymFloatType;
+  // global singleton
+  static SymFloatTypePtr get();
+
+ private:
+  SymFloatType() : Type(TypeKind::SymFloatType) {}
+};
+
+struct SymBoolType;
+using SymBoolTypePtr = SingletonTypePtr;
+struct TORCH_API SymBoolType : public Type {
+  bool equals(const Type& rhs) const override {
+    return rhs.kind() == kind();
+  }
+  std::string str() const override {
+    return "SymBool";
+  }
+  std::string annotation_str_impl(TypePrinter printer = nullptr) const override {
+    return "bool";
+  }
+  static const TypeKind Kind = TypeKind::SymBoolType;
+  // global singleton
+  static SymBoolTypePtr get();
+
+ private:
+  SymBoolType() : Type(TypeKind::SymBoolType) {}
+};
+
+struct IntType;
+using IntTypePtr = SingletonTypePtr;
+// This type represents a Python int number
+struct TORCH_API IntType : public NumberType {
+  bool equals(const Type& rhs) const override {
+    return rhs.kind() == kind();
+  }
+  std::string str() const override {
+    return "int";
+  }
+  bool isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const override {
+    // NOLINTNEXTLINE(bugprone-parent-virtual-call)
+    return rhs.kind() == TypeKind::NumberType || Type::isSubtypeOfExt(rhs, why_not);
+  }
+  static const TypeKind Kind = TypeKind::IntType;
+  // global singleton
+  static IntTypePtr get();
+
+ private:
+  IntType() : NumberType(TypeKind::IntType) {}
+  std::string annotation_str_impl(C10_UNUSED TypePrinter printer = nullptr) const override {
+    return "int";
+  }
+};
+
+struct BoolType;
+using BoolTypePtr = SingletonTypePtr;
+// This type represents a Python bool value
+struct TORCH_API BoolType : public Type {
+  bool equals(const Type& rhs) const override {
+    return rhs.kind() == kind();
+  }
+  std::string str() const override {
+    return "bool";
+  }
+  static const TypeKind Kind = TypeKind::BoolType;
+  // global singleton
+  static BoolTypePtr get();
+
+ private:
+  BoolType() : Type(TypeKind::BoolType) {}
+};
+
+struct StringType;
+using StringTypePtr = SingletonTypePtr;
+// This type represents a Python string
+struct TORCH_API StringType : public Type {
+  bool equals(const Type& rhs) const override {
+    return rhs.kind() == kind();
+  }
+  std::string str() const override {
+    // we only use "str" (not "string") in both FunctionSchema and script
+    return annotation_str();
+  }
+  std::string annotation_str_impl(C10_UNUSED TypePrinter printer = nullptr) const override {
+    return "str";
+  }
+  static const TypeKind Kind = TypeKind::StringType;
+  // global singleton
+  static StringTypePtr get();
+
+ private:
+  StringType() : Type(TypeKind::StringType) {}
+};
+
+struct StorageType;
+using StorageTypePtr = SingletonTypePtr;
+struct TORCH_API StorageType : public Type {
+  bool equals(const Type& rhs) const override {
+    return rhs.kind() == kind();
+  }
+  std::string str() const override {
+    return annotation_str();
+  }
+  std::string annotation_str_impl(C10_UNUSED TypePrinter printer = nullptr) const override {
+    return "Storage";
+  }
+  static const TypeKind Kind = TypeKind::StorageType;
+  // global singleton
+  static StorageTypePtr get();
+
+ private:
+  StorageType() : Type(TypeKind::StorageType) {}
+};
+
+struct FunctionType;
+using FunctionTypePtr = std::shared_ptr;
+struct TORCH_API FunctionType : public NamedType {
+  static FunctionTypePtr create(torch::jit::Function* function) {
+    return FunctionTypePtr(
+        new FunctionType(function)); // NOLINT(modernize-make-shared)
+  }
+  bool equals(const Type& rhs) const override {
+    if (auto func_type = rhs.cast()) {
+      return func_type->function_ == function_;
+    }
+
+    return false;
+  }
+  std::string str() const override {
+    return "Function";
+  }
+  torch::jit::Function* function() const {
+    return function_;
+  }
+  static const TypeKind Kind = TypeKind::FunctionType;
+
+ private:
+  FunctionType(torch::jit::Function* function);
+  std::string annotation_str_impl(C10_UNUSED TypePrinter printer = nullptr) const override {
+    const auto& n = name().value();
+    return n.qualifiedName();
+  }
+  torch::jit::Function* function_;
+};
+
+struct NoneType;
+using NoneTypePtr = SingletonTypePtr;
+// This type represents a Python None
+struct TORCH_API NoneType : public Type {
+  bool equals(const Type& rhs) const override {
+    return rhs.kind() == kind();
+  }
+  std::string str() const override {
+    return "NoneType";
+  }
+  bool isSubtypeOfExt(const Type& rhs, std::ostream *why_not) const override;
+
+  static const TypeKind Kind = TypeKind::NoneType;
+  // global singleton
+  static NoneTypePtr get();
+
+ private:
+  NoneType() : Type(TypeKind::NoneType) {}
+};
+
+struct GeneratorType;
+using GeneratorTypePtr = SingletonTypePtr;
+// This type represents a Generator
+struct TORCH_API GeneratorType : public Type {
+  bool equals(const Type& rhs) const override {
+    return rhs.kind() == kind();
+  }
+  std::string str() const override {
+    return "Generator";
+  }
+  static const TypeKind Kind = TypeKind::GeneratorType;
+  // global singleton
+  static GeneratorTypePtr get();
+
+ private:
+  GeneratorType() : Type(TypeKind::GeneratorType) {}
+};
+
+struct QuantizerType;
+using QuantizerTypePtr = SingletonTypePtr;
+// This type represents a Quantizer
+struct TORCH_API QuantizerType : public Type {
+  bool equals(const Type& rhs) const override {
+    return rhs.kind() == kind();
+  }
+  std::string str() const override {
+    return "Quantizer";
+  }
+  static const TypeKind Kind = TypeKind::QuantizerType;
+  // global singleton
+  static QuantizerTypePtr get();
+
+ private:
+  QuantizerType() : Type(TypeKind::QuantizerType) {}
+};
+
+struct QSchemeType;
+using QSchemeTypePtr = SingletonTypePtr;
+// This type represents a QScheme
+struct TORCH_API QSchemeType : public Type {
+  bool equals(const Type& rhs) const override {
+    return rhs.kind() == kind();
+  }
+  std::string str() const override {
+    return "QScheme";
+  }
+  static const TypeKind Kind = TypeKind::QSchemeType;
+  // global singleton
+  static QSchemeTypePtr get();
+
+ private:
+  QSchemeType() : Type(TypeKind::QSchemeType) {}
+};
+
+struct DeviceObjType;
+using DeviceObjTypePtr = SingletonTypePtr;
+// This type represents a Device
+struct TORCH_API DeviceObjType : public Type {
+  bool equals(const Type& rhs) const override {
+    return rhs.kind() == kind();
+  }
+  std::string str() const override {
+    return "Device";
+  }
+  static const TypeKind Kind = TypeKind::DeviceObjType;
+  // global singleton
+  static DeviceObjTypePtr get();
+
+ private:
+  DeviceObjType() : Type(TypeKind::DeviceObjType) {}
+};
+
+struct StreamObjType;
+using StreamObjTypePtr = SingletonTypePtr;
+// This type represents a Stream
+struct TORCH_API StreamObjType : public Type {
+  bool equals(const Type& rhs) const override {
+    return rhs.kind() == kind();
+  }
+  std::string str() const override {
+    return "Stream";
+  }
+  static const TypeKind Kind = TypeKind::StreamObjType;
+  // global singleton
+  static StreamObjTypePtr get();
+
+private:
+  StreamObjType() : Type(TypeKind::StreamObjType) {}
+};
+
+struct VarType;
+using VarTypePtr = std::shared_ptr;
+// This type represents a type variable, used in FunctionSchema
+struct VarType : public SharedType {
+  static VarTypePtr create(std::string name_) {
+    return VarTypePtr(new VarType(std::move(name_)));
+  }
+  bool equals(const Type& rhs) const override {
+    return rhs.kind() == kind();
+  }
+  std::string str() const override {
+    return name();
+  }
+  const std::string& name() const {
+    return name_;
+  }
+  bool hasFreeVariables() const override {
+    return true;
+  }
+  static const TypeKind Kind = TypeKind::VarType;
+
+ private:
+  VarType(std::string name_)
+      : SharedType(TypeKind::VarType), name_(std::move(name_)) {}
+  std::string name_;
+};
+
+struct CapsuleType;
+using CapsuleTypePtr = SingletonTypePtr;
+// This type represents a Python Capsule.
+// It does not appear in the IR and is only used during runtime
+struct TORCH_API CapsuleType : public Type {
+  bool equals(const Type& rhs) const override {
+    return rhs.kind() == kind();
+  }
+  std::string str() const override {
+    return "Capsule";
+  }
+  static const TypeKind Kind = TypeKind::CapsuleType;
+  // global singleton
+  static CapsuleTypePtr get();
+private:
+  CapsuleType()
+  : Type(TypeKind::CapsuleType) {}
+};
+
+struct PyObjectType;
+using PyObjectTypePtr = SingletonTypePtr;
+// This type represents a PyObject Type
+struct TORCH_API PyObjectType : public Type {
+  bool equals(const Type& rhs) const override {
+    return rhs.kind() == kind();
+  }
+  std::string str() const override {
+    return "PyObject";
+  }
+  static const TypeKind Kind = TypeKind::PyObjectType;
+  // global singleton
+  static PyObjectTypePtr get();
+private:
+  PyObjectType()
+  : Type(TypeKind::PyObjectType) {}
+};
+
+enum class TypeVerbosity {
+  None,
+  Type,
+  TypeAndStride,
+  Full,
+  Symbolic,
+  Default = Full,
+};
+
+TORCH_API TypeVerbosity type_verbosity();
+
+TORCH_API std::ostream& operator<<(std::ostream& out, const Type& t);
+template 
+TORCH_API std::ostream& operator<<(
+    std::ostream& out,
+    const VaryingShape& t);
+TORCH_API std::ostream& operator<<(std::ostream& os, const SymbolicShape& s);
+TORCH_API std::ostream& operator<<(std::ostream& os, const ShapeSymbol& s);
+TORCH_API std::ostream& operator<<(std::ostream& os, const Stride& s);
+// what is the type, ignoring extra size/shape information?
+// e.g. Tensor(2x3) -> Dynamic, and Tuple(Tensor(2x3),...) -> Tuple(Dynamic,...)
+
+// `unshapedType` is used to remove Tensor subtypes. We treat all Tensor
+// subtypes as simply "Tensor"; we also create a new version of any
+// container types in which internal Tensors have undergone the same
+// operation. This is used for type comparisons between two Tensor types
+// (`unshapedType` means that we don't falsely return `false` for e.g.
+// Tensors of different dimensions). It's also used in the alias
+// analysis pass.
+// Be careful with calls because this can be very slow. If calling this
+// on a graph, use `EraseShapeInformation` in shape_analysis.h
+inline TypePtr unshapedType(const TypePtr& type) {
+  if (type->isSubtypeOf(*TensorType::get())) {
+    return TensorType::get();
+  }
+  at::ArrayRef contained = type->containedTypes();
+  if (contained.empty()) {
+    return type;
+  }
+  return type->withContained(fmap(type->containedTypes(), unshapedType));
+}
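+
+// Example (illustrative sketch, not part of the original header):
+//   TypePtr shaped = ListType::create(
+//       TensorType::createContiguous(at::kFloat, at::kCPU, {2, 3}));
+//   TypePtr unshaped = unshapedType(shaped); // List[Tensor], shape erased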
+
+inline TypePtr TensorType::fromNumberType(const Type& typ) {
+  if (typ.isSubtypeOf(*IntType::get())) {
+    return TensorType::createContiguous(at::kLong, at::kCPU, {});
+  } else if (typ.isSubtypeOf(*FloatType::get())) {
+    return TensorType::createContiguous(at::kDouble, at::kCPU, {});
+  } else if (typ.isSubtypeOf(*BoolType::get())) {
+    return TensorType::createContiguous(at::kBool, at::kCPU, {});
+  } else if (typ.kind() == NumberType::Kind) {
+    return TensorType::create(c10::nullopt, at::kCPU, {}, c10::nullopt);
+  }
+  TORCH_CHECK(false, "Unknown number type: ", typ.str());
+}
+inline TypePtr TensorType::fromBoolType() {
+  return TensorType::createContiguous(at::kBool, at::kCPU, {});
+}
+
+inline c10::optional tryScalarTypeFromJitType(const Type& type) {
+  if (type == *FloatType::get()) {
+    return at::typeMetaToScalarType(c10::get_default_dtype());
+  } else if (type == *IntType::get()) {
+    return at::ScalarType::Long;
+  } else if (type == *BoolType::get()) {
+    return at::ScalarType::Bool;
+  }
+  return c10::nullopt;
+}
+
+inline at::ScalarType scalarTypeFromJitType(const Type& type) {
+  auto result = tryScalarTypeFromJitType(type);
+  TORCH_CHECK(
+      result,
+      "Add new condition, expected Float, Complex, Int, or Bool but got",
+      type.str());
+  return *result;
+}
+
+// Attempt to find the correct supertype of the two types `t1` and `t2`.
+// If no supertype is found, then nullopt will be returned if
+// `default_to_union` is false, and `Union[t1, t2]` will be returned
+// if it is true. If `t1 == t2`, or `t1` is a type refinement of `t2`,
+// then `t2` will be returned (and vice versa).
+//
+// Two different tensor types will return Dynamic.
+//
+// Currently we choose not to support returning a NumberType for
+// two types from the set of {FloatType, IntType, ComplexType}, because
+// there is a lack of operator support for NumberType.
+//
+// If `type_hint` is an `InterfaceType`, then we can use that as a
+// potential supertype for `ClassType`s in the list. Otherwise, we have
+// no way to find and use some common interface type.
+TORCH_API c10::optional unifyTypes(
+    const TypePtr& t1,
+    const TypePtr& t2,
+    bool default_to_union = false,
+    const TypePtr& type_hint = nullptr);
+
+TORCH_API c10::optional unifyTypeList(
+    at::ArrayRef elements,
+    std::ostream& why_not,
+    bool default_to_union = false,
+    const TypePtr& type_hint = nullptr);
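+
+// Example (illustrative): unifying `int` with `Optional[int]` yields
+// `Optional[int]`, since `int` is a refinement of `Optional[int]`; when no
+// supertype exists, the result is c10::nullopt, or `Union[t1, t2]` when
+// `default_to_union` is true.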
+
+namespace detail {
+template 
+struct getTypePtr_ final {
+  static decltype(auto) call() {
+    return ([]() {
+      try {
+        return getCustomClassType();
+      } catch(const c10::Error&) {
+        TORCH_CHECK(
+            false,
+            "Type ",
+            c10::util::get_fully_qualified_type_name(),
+            " could not be converted to any of the known types."
+        );
+      }
+    }());
+  }
+};
+
+template 
+struct getMaybeFakeTypePtr_ final {
+  static decltype(auto) call() {
+    return getTypePtr_::call();
+  }
+};
+
+template <>
+struct getTypePtr_ final {
+  static decltype(auto) call() {
+    return AnyType::get();
+  }
+};
+
+template <>
+struct getTypePtr_ final {
+  static decltype(auto) call() {
+    return TensorType::get();
+  }
+};
+template <>
+struct getTypePtr_ final {
+  static decltype(auto) call() {
+    return StorageType::get();
+  }
+};
+template <>
+struct getTypePtr_ final {
+  static decltype(auto) call() {
+    return StreamObjType::get();
+  }
+};
+template <>
+struct getTypePtr_ final {
+  static decltype(auto) call() {
+    return FloatType::get();
+  }
+};
+template <>
+struct getTypePtr_> final {
+  static decltype(auto) call() {
+    return ComplexType::get();
+  }
+};
+template <>
+struct getTypePtr_ final {
+  static decltype(auto) call() {
+    return IntType::get();
+  }
+};
+
+template <>
+struct getTypePtr_ final {
+  static decltype(auto) call() {
+    return IntType::get();
+  }
+};
+
+template <>
+struct getMaybeFakeTypePtr_ final {
+  static decltype(auto) call() {
+    return SymIntType::get();
+  }
+};
+template <>
+struct getMaybeFakeTypePtr_ final {
+  static decltype(auto) call() {
+    return IntType::get();
+  }
+};
+
+template <>
+struct getMaybeFakeTypePtr_ final {
+  static decltype(auto) call() {
+    return SymFloatType::get();
+  }
+};
+template <>
+struct getMaybeFakeTypePtr_ final {
+  static decltype(auto) call() {
+    return FloatType::get();
+  }
+};
+
+template <>
+struct getMaybeFakeTypePtr_ final {
+  static decltype(auto) call() {
+    return SymBoolType::get();
+  }
+};
+template <>
+struct getMaybeFakeTypePtr_ final {
+  static decltype(auto) call() {
+    return BoolType::get();
+  }
+};
+
+template <>
+struct getTypePtr_ final {
+  static decltype(auto) call() {
+    return DeviceObjType::get();
+  }
+};
+template <>
+struct getTypePtr_ final {
+  static decltype(auto) call() {
+    return BoolType::get();
+  }
+};
+template <>
+struct getTypePtr_ final {
+  static decltype(auto) call() {
+    return NumberType::get();
+  }
+};
+template <>
+struct getTypePtr_ final {
+  static decltype(auto) call() {
+    return QSchemeType::get();
+  }
+};
+template <>
+struct getTypePtr_ final {
+  static decltype(auto) call() {
+    return TypeFactory::create(
+        TypeFactory::get());
+  }
+};
+template <>
+struct getTypePtr_ final {
+  static decltype(auto) call() {
+    return StringType::get();
+  }
+};
+template <>
+struct getTypePtr_ final {
+  static decltype(auto) call() {
+    return StringType::get();
+  }
+};
+template <>
+struct getTypePtr_ final {
+  static decltype(auto) call() {
+    return StringType::get();
+  }
+};
+template 
+struct getMaybeFakeTypePtr_, fake> final {
+  static const auto& call() {
+    static auto inner_type = getMaybeFakeTypePtr_::call();
+    // The "per vector" static singleton needs to live in a .cpp file,
+    // otherwise we'll end up with one singleton instance per shared library.
+    static auto type = ListType::get("vector", inner_type);
+    return type;
+  }
+};
+template 
+struct getMaybeFakeTypePtr_, fake> final {
+  static const auto& call() {
+    static auto inner_type = getMaybeFakeTypePtr_::call();
+    // The "per ArrayRef" static singleton needs to live in a .cpp file,
+    // otherwise we'll end up with one singleton instance per shared library.
+    static auto type = ListType::get("ArrayRef", inner_type);
+    return type;
+  }
+};
+template 
+struct getMaybeFakeTypePtr_ final {
+  static const auto& call() {
+    static auto type = ListType::create(getMaybeFakeTypePtr_::call());
+    return type;
+  }
+};
+template 
+struct getMaybeFakeTypePtr_, fake> final {
+  static const auto& call() {
+    static auto inner_type = getMaybeFakeTypePtr_::call();
+    // The "per List" static singleton needs to live in a .cpp file,
+    // otherwise we'll end up with one singleton instance per shared library.
+    static auto type = ListType::get("List", inner_type);
+    return type;
+  }
+};
+template 
+struct getMaybeFakeTypePtr_, fake> final {
+  static const auto& call() {
+    static auto inner_type = getMaybeFakeTypePtr_::call();
+    static auto type = ListType::get("List", inner_type);
+    return type;
+  }
+};
+template 
+struct getMaybeFakeTypePtr_, fake> final {
+  static const auto& call() {
+    static auto inner_type = getMaybeFakeTypePtr_::call();
+    // The "per array" static singleton needs to live in a .cpp file,
+    // otherwise we'll end up with one singleton instance per shared library.
+    // (Concatenating the length onto the end of the string because we want a unique
+    // type_ptr created for every std::array type).
+    static auto type = ListType::get(std::string("array") + std::to_string(N), inner_type);
+    return type;
+  }
+};
+template 
+struct getMaybeFakeTypePtr_, fake> final {
+  static const auto& call() {
+    static auto inner_key_type = getMaybeFakeTypePtr_::call();
+    static auto inner_val_type = getMaybeFakeTypePtr_::call();
+    // The "per unordered_map" static singleton needs to live in a .cpp file,
+    // otherwise we'll end up with one singleton instance per shared library.
+    static auto type = DictType::get("unordered_map", inner_key_type, inner_val_type);
+    return type;
+  }
+};
+template 
+struct getMaybeFakeTypePtr_, fake> final {
+  static const auto& call() {
+    static auto inner_key_type = getMaybeFakeTypePtr_::call();
+    static auto inner_val_type = getMaybeFakeTypePtr_::call();
+    // The "per Dict" static singleton needs to live in a .cpp file,
+    // otherwise we'll end up with one singleton instance per shared library.
+    static auto type = DictType::get("Dict", inner_key_type, inner_val_type);
+    return type;
+  }
+};
+
+template 
+struct getMaybeFakeTypePtr_, fake> final {
+  static const auto& call() {
+    static auto inner_type = getMaybeFakeTypePtr_::call();
+    // The "per optional" static singleton needs to live in a .cpp file,
+    // otherwise we'll end up with one singleton instance per shared library.
+    static auto type = OptionalType::get(inner_type);
+    return type;
+  }
+};
+
+
+template<>
+struct getTypePtr_ final {
+  static const auto& call() {
+    static auto inner_type = getMaybeFakeTypePtr_::call();
+    // The "per optional" static singleton needs to live in a .cpp file,
+    // otherwise we'll end up with one singleton instance per shared library.
+    static auto type = OptionalType::get(inner_type);
+    return type;
+  }
+};
+
+template 
+struct getMaybeFakeTypePtr_ final {
+  static const auto& call() {
+    // The "per optional" static singleton needs to live in a .cpp file,
+    // otherwise we'll end up with one singleton instance per shared library.
+    static auto inner_type = getMaybeFakeTypePtr_::call();
+    static auto type = OptionalType::get(inner_type);
+    return type;
+  }
+};
+
+template 
+struct getMaybeFakeTypePtr_, fake> final {
+  static const auto& call() {
+    static auto type = ([]() {
+      std::vector contained_types = {
+        (getMaybeFakeTypePtr_::call())...
+      };
+      return TupleType::create(std::move(contained_types));
+    })();
+    return type;
+  }
+};
+template <>
+struct getTypePtr_ final {
+  static decltype(auto) call() {
+    return NoneType::get();
+  }
+};
+} // namespace detail
+template 
+inline decltype(auto) getTypePtr() {
+  // TODO: static_assert that a templated function exists, and throw a friendly
+  // error message if not
+  return detail::getMaybeFakeTypePtr_::call();
+}
+
+template 
+inline TypePtr getTypePtrCopy() {
+  // TODO: static_assert that a templated function exists, and throw a friendly
+  // error message if not
+  return getTypePtr();
+}
+
+template 
+inline decltype(auto) getFakeTypePtr() {
+  return detail::getMaybeFakeTypePtr_::call();
+}
+
+template 
+inline TypePtr getFakeTypePtrCopy() {
+  return getFakeTypePtr();
+}
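+
+// Usage sketch (illustrative, assuming the specializations above):
+//   auto t1 = c10::getTypePtr<int64_t>();               // int
+//   auto t2 = c10::getTypePtr<c10::List<at::Tensor>>(); // List[Tensor]
+//   auto t3 = c10::getTypePtr<c10::optional<double>>(); // Optional[float]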
+
+using TypeEnv = std::unordered_map;
+struct MatchTypeReturn {
+  MatchTypeReturn(std::string reason) : reason_(std::move(reason)) {}
+  static MatchTypeReturn Success() {
+    return MatchTypeReturn();
+  }
+  bool success() const {
+    return !reason_.has_value();
+  }
+  const std::string& reason() const {
+    return reason_.value();
+  }
+
+ private:
+  MatchTypeReturn()
+  : reason_(c10::nullopt) {}
+  c10::optional<std::string> reason_; // if there is no match, this contains the reason
+};
+
+// attempt to match the type variables in formal to actual, adding them to type_env.
+// If no match is possible this returns a MatchTypeReturn with r.success() == false
+// and a r.reason() that describes why it could not match.
+// note: It is possible to successfully match a formal, but for type variables
+// in the formal to still not be defined. In particular, None matches Optional[T]
+// but does not define the value of T.
+TORCH_API MatchTypeReturn
+matchTypeVariables(const TypePtr& formal, const TypePtr& actual, TypeEnv& type_env);
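+
+// Example (illustrative): matching the formal `List[t]` against the actual
+// `List[int]` succeeds and binds t -> int in `type_env`; matching `List[t]`
+// against `str` fails, and the returned reason() describes the mismatch.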
+
+// replace type variables appearing in `type` with the values in
+// `type_env`. Returns nullptr if a variable used in `type`
+// does not appear in `type_env`
+TORCH_API TypePtr tryEvalTypeVariables(const TypePtr& type, TypeEnv& type_env);
+
+TORCH_API bool elementTypeCanBeInferredFromMembers(const TypePtr& elem_type);
+
+struct InterfaceType;
+using InterfaceTypePtr = std::shared_ptr;
+
+// Interfaces are a list of abstract methods that a class might meet.
+// If a class provides those methods, it implicitly meets the interface.
+
+// Subtype relations for Interface with ClassType:
+// lhs (ClassType or InterfaceType) is a subtype of rhs if:
+// 1. lhs methods are a superset of rhs methods
+// 2. if rhs is a module interface, lhs must be a module interface or a module itself
+struct TORCH_API InterfaceType : public NamedType {
+  static InterfaceTypePtr create(
+      QualifiedName qualifiedName, bool is_module=false);
+
+  bool equals(const Type& rhs) const override {
+    if (auto user_rhs = rhs.castRaw()) {
+      return isSubTypeImpl(*this, *user_rhs, nullptr) &&
+          isSubTypeImpl(*user_rhs, *this, nullptr);
+    }
+    return false;
+  }
+
+  std::string str() const override {
+    return std::string("InterfaceType<") + name()->name() + ">";
+  }
+
+  bool isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const override;
+
+  // try to find a method of this interface,
+  // returns nullptr if not found.
+  const FunctionSchema* getMethod(const std::string& name) const;
+  void addMethod(FunctionSchema schema);
+  const std::vector<FunctionSchema>& methods() const {
+    return *methods_;
+  }
+
+  bool is_module() const override {
+    return is_module_;
+  }
+  static const TypeKind Kind = TypeKind::InterfaceType;
+  ~InterfaceType() override;
+ private:
+  InterfaceType(QualifiedName name, bool is_module);
+  static bool isSubTypeImpl(
+      const InterfaceType& lhs,
+      const InterfaceType& rhs,
+      std::ostream* why_not);
+
+  std::string annotation_str_impl(C10_UNUSED TypePrinter printer = nullptr) const override {
+    return name()->qualifiedName();
+  }
+
+  // shared_ptr so that this header does not have to depend on
+  // FunctionSchema.h
+  std::shared_ptr<std::vector<FunctionSchema>> methods_;
+  // flag to distinguish if it's an interface type from a module or not
+  bool is_module_;
+};
+
+template 
+struct EnumerationType : public Type {
+static const TypeKind Kind = K;
+
+bool equals(const Type& rhs) const override {
+  return rhs.kind() == kind();
+}
+
+protected:
+EnumerationType() : Type(Kind) {}
+};
+
+// WARNING: These enumeration types below DO NOT actually get parsed out
+// from the logical schema strings; instead they are mapped as ints. To
+// observe these types, use real_type() instead of type() on Argument.
+
+struct ScalarTypeType;
+using ScalarTypeTypePtr = SingletonTypePtr;
+struct TORCH_API ScalarTypeType : public EnumerationType {
+std::string str() const override {
+return "ScalarType";
+}
+static const TypeKind Kind = TypeKind::ScalarTypeType;
+// global singleton
+static ScalarTypeTypePtr get();
+
+private:
+ScalarTypeType() : EnumerationType() {}
+};
+
+struct MemoryFormatType;
+using MemoryFormatTypePtr = SingletonTypePtr;
+struct TORCH_API MemoryFormatType : public EnumerationType {
+std::string str() const override {
+return "MemoryFormat";
+}
+static const TypeKind Kind = TypeKind::MemoryFormatType;
+// global singleton
+static MemoryFormatTypePtr get();
+
+private:
+MemoryFormatType() : EnumerationType() {}
+};
+
+struct LayoutType;
+using LayoutTypePtr = SingletonTypePtr;
+struct TORCH_API LayoutType : public EnumerationType {
+std::string str() const override {
+return "Layout";
+}
+static const TypeKind Kind = TypeKind::LayoutType;
+// global singleton
+static LayoutTypePtr get();
+
+private:
+LayoutType() : EnumerationType() {}
+};
+
+namespace detail {
+template <>
+struct getMaybeFakeTypePtr_ final {
+  static decltype(auto) call() {
+    return ScalarTypeType::get();
+  }
+};
+template <>
+struct getMaybeFakeTypePtr_ final {
+  static decltype(auto) call() {
+    return LayoutType::get();
+  }
+};
+template <>
+struct getMaybeFakeTypePtr_ final {
+  static decltype(auto) call() {
+    return MemoryFormatType::get();
+  }
+};
+template <>
+struct getMaybeFakeTypePtr_ final {
+  static decltype(auto) call() {
+    return IntType::get();
+  }
+};
+template <>
+struct getMaybeFakeTypePtr_ final {
+  static decltype(auto) call() {
+    return IntType::get();
+  }
+};
+template <>
+struct getMaybeFakeTypePtr_ final {
+  static decltype(auto) call() {
+    return IntType::get();
+  }
+};
+} // namespace detail
+
+// the common supertype of all lists,
+// List[T] <: AnyList for all T
+struct AnyListType;
+using AnyListTypePtr = SingletonTypePtr;
+struct TORCH_API AnyListType : public Type {
+  bool equals(const Type& rhs) const override {
+    return rhs.kind() == kind();
+  }
+  std::string str() const override {
+    return "list";
+  }
+  static const TypeKind Kind = TypeKind::AnyListType;
+  // global singleton
+  static AnyListTypePtr get();
+private:
+  AnyListType()
+  : Type(TypeKind::AnyListType) {}
+};
+
+// the common supertype of all tuples,
+// Tuple[T...] <: AnyTuple for all T
+struct AnyTupleType;
+using AnyTupleTypePtr = SingletonTypePtr;
+struct TORCH_API AnyTupleType : public Type {
+  bool equals(const Type& rhs) const override {
+    return rhs.kind() == kind();
+  }
+
+  std::string str() const override {
+    return "tuple";
+  }
+  static const TypeKind Kind = TypeKind::AnyTupleType;
+
+  // global singleton
+  static AnyTupleTypePtr get();
+private:
+  AnyTupleType()
+  : Type(TypeKind::AnyTupleType) {}
+};
+
+// the common supertype of all classes,
+// ClassType <: AnyClassType for all classes
+struct AnyClassType;
+using AnyClassTypePtr = SingletonTypePtr;
+struct TORCH_API AnyClassType : public Type {
+  bool equals(const Type& rhs) const override {
+    return rhs.kind() == kind();
+  }
+  std::string str() const override {
+    return "AnyClassType";
+  }
+  static const TypeKind Kind = TypeKind::AnyClassType;
+  // global singleton
+  static AnyClassTypePtr get();
+private:
+  AnyClassType()
+  : Type(TypeKind::AnyClassType) {}
+};
+
+template<>
+inline typename detail::CastReturnType::type Type::cast() {
+  if (kind() == TypeKind::TupleType || kind() == TypeKind::FunctionType ||
+      kind() == TypeKind::ClassType || kind() == TypeKind::InterfaceType) {
+    return std::static_pointer_cast(static_cast(this)->shared_from_this());
+  }
+  return nullptr;
+}
+
+template<>
+inline typename detail::CastConstReturnType::type Type::cast() const {
+  if (kind() == TypeKind::TupleType || kind() == TypeKind::FunctionType ||
+      kind() == TypeKind::ClassType || kind() == TypeKind::InterfaceType) {
+    return std::static_pointer_cast(static_cast(this)->shared_from_this());
+  }
+  return nullptr;
+}
+
+template<>
+inline const NamedType* Type::castRaw() const {
+  if (kind() == TypeKind::TupleType || kind() == TypeKind::FunctionType ||
+      kind() == TypeKind::ClassType || kind() == TypeKind::InterfaceType) {
+    return static_cast(this);
+  }
+  return nullptr;
+}
+
+// Used as a return type when inferring the IValue type of a Python object.
+struct InferredType {
+  /* implicit */ InferredType(TypePtr type) : type_(std::move(type)) {}
+  /* implicit */ InferredType(std::string reason)
+      : type_(nullptr), reason_(std::move(reason)) {}
+  TypePtr type() const {
+    TORCH_INTERNAL_ASSERT(
+        type_,
+        "Tried to get the type from an InferredType but the type is null. ",
+        "Reason: ",
+        reason_);
+    return type_;
+  }
+  bool success() const {
+    return type_ != nullptr;
+  }
+  const std::string& reason() const {
+    TORCH_INTERNAL_ASSERT(!type_);
+    return reason_;
+  }
+
+private:
+  TypePtr type_;
+  std::string reason_;
+};
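+// Illustrative usage sketch (`inferTypeFrom` is a hypothetical caller-side
+// helper, shown only to demonstrate the success/failure protocol):
+//   InferredType inferred = inferTypeFrom(obj);
+//   if (!inferred.success()) {
+//     TORCH_CHECK(false, "could not infer IValue type: ", inferred.reason());
+//   }
+//   TypePtr t = inferred.type();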
+
+TORCH_API bool containsAnyType(const TypePtr& type);
+
+} // namespace c10
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/jit_type_base.h b/MLPY/Lib/site-packages/torch/include/ATen/core/jit_type_base.h
new file mode 100644
index 0000000000000000000000000000000000000000..73f153ef523e0b54310aac0159e4677971075712
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/jit_type_base.h
@@ -0,0 +1,719 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace c10 {
+
+#define C10_FORALL_TYPES(_) \
+  _(AnyType)                \
+  _(EnumType)               \
+  _(AnyEnumType)            \
+  _(TensorType)             \
+  _(StorageType)            \
+  _(TupleType)              \
+  _(ListType)               \
+  _(DictType)               \
+  _(NumberType)             \
+  _(FloatType)              \
+  _(ComplexType)            \
+  _(FutureType)             \
+  _(AwaitType)              \
+  _(RRefType)               \
+  _(IntType)                \
+  _(NoneType)               \
+  _(StringType)             \
+  _(GeneratorType)          \
+  _(QuantizerType)          \
+  _(BoolType)               \
+  _(OptionalType)           \
+  _(VarType)                \
+  _(DeviceObjType)          \
+  _(StreamObjType)          \
+  _(FunctionType)           \
+  _(ClassType)              \
+  _(PyObjectType)           \
+  _(CapsuleType)            \
+  _(InterfaceType)          \
+  _(QSchemeType)            \
+  _(ScalarTypeType)         \
+  _(LayoutType)             \
+  _(MemoryFormatType)       \
+  _(AnyListType)            \
+  _(AnyTupleType)           \
+  _(AnyClassType)           \
+  _(SymIntType)             \
+  _(SymFloatType)           \
+  _(SymBoolType)            \
+  _(UnionType)              \
+  _(DynamicType)
+
+enum class TypeKind {
+#define DEFINE_TYPE(T) T,
+  C10_FORALL_TYPES(DEFINE_TYPE)
+#undef DEFINE_TYPE
+};
+
+TORCH_API const char* typeKindToString(TypeKind kind);
+
+struct Type;
+struct SharedType;
+
+// Use this to customize how a Type is printed using `annotation_str()`. If
+// c10::nullopt is returned, `annotation_str()` falls through to its default
+// implementation.
+using TypePrinter = std::function(const Type&)>;
+
+namespace detail {
+template 
+struct IsSingletonType : public std::integral_constant {};
+} // namespace detail
+#define TORCH_DECLARE_SINGLETON(Type) \
+  struct Type;                                                          \
+  namespace detail { \
+  template <> struct IsSingletonType : public std::integral_constant {}; \
+  }
+
+TORCH_DECLARE_SINGLETON(AnyType);
+TORCH_DECLARE_SINGLETON(AnyEnumType);
+TORCH_DECLARE_SINGLETON(NumberType);
+TORCH_DECLARE_SINGLETON(FloatType);
+TORCH_DECLARE_SINGLETON(ComplexType);
+TORCH_DECLARE_SINGLETON(IntType);
+TORCH_DECLARE_SINGLETON(BoolType);
+TORCH_DECLARE_SINGLETON(StringType);
+TORCH_DECLARE_SINGLETON(StorageType);
+TORCH_DECLARE_SINGLETON(NoneType);
+TORCH_DECLARE_SINGLETON(GeneratorType);
+TORCH_DECLARE_SINGLETON(QuantizerType);
+TORCH_DECLARE_SINGLETON(QSchemeType);
+TORCH_DECLARE_SINGLETON(DeviceObjType);
+TORCH_DECLARE_SINGLETON(StreamObjType);
+TORCH_DECLARE_SINGLETON(CapsuleType);
+TORCH_DECLARE_SINGLETON(PyObjectType);
+TORCH_DECLARE_SINGLETON(ScalarTypeType);
+TORCH_DECLARE_SINGLETON(LayoutType);
+TORCH_DECLARE_SINGLETON(MemoryFormatType);
+TORCH_DECLARE_SINGLETON(AnyListType);
+TORCH_DECLARE_SINGLETON(AnyTupleType);
+TORCH_DECLARE_SINGLETON(AnyClassType);
+
+namespace detail {
+template 
+struct CastReturnType {
+  using type = std::shared_ptr;
+};
+
+template 
+struct CastReturnType::value>::type> {
+  using type = SingletonTypePtr;
+};
+
+template 
+struct CastConstReturnType {
+  using type = std::shared_ptr;
+};
+
+template 
+struct CastConstReturnType::value>::type> {
+  using type = SingletonTypePtr;
+};
+
+template 
+struct as_shared_type {
+  using type = SharedType*;
+};
+
+template 
+struct as_shared_type {
+  using type = const SharedType *;
+};
+} // namespace detail
+
+struct TORCH_API Type {
+  friend TORCH_API bool operator==(const Type& lhs, const Type& rhs);
+  private:
+  TypeKind kind_;
+
+  protected:
+  Type(TypeKind kind) : kind_(kind) {}
+
+  Type(const Type&) = default;
+  Type& operator=(const Type&) = default;
+  Type(Type&&) noexcept = default;
+  Type& operator=(Type&&) noexcept = default;
+
+  virtual std::string annotation_str_impl(TypePrinter /*printer*/) const {
+    return str();
+  }
+  // a == b
+  virtual bool equals(const Type& rhs) const = 0;
+  // a == b <=> b == a
+  virtual bool symmetric() const {
+    return true;
+  }
+
+ public:
+  template 
+  class SingletonOrSharedTypePtr {
+   public:
+    using element_type = typename std::shared_ptr::element_type;
+
+    SingletonOrSharedTypePtr() = default;
+
+    /* implicit */ SingletonOrSharedTypePtr(std::shared_ptr x)
+        : repr_(std::move(x)) {}
+
+    template ::value, bool> = true>
+    /* implicit */ SingletonOrSharedTypePtr(std::shared_ptr x)
+        : repr_(std::move(x)) {}
+
+    /* implicit */ SingletonOrSharedTypePtr(std::nullptr_t)
+        : repr_(nullptr) {}
+
+    /* implicit */ SingletonOrSharedTypePtr(SingletonTypePtr p)
+        : repr_(p) {}
+
+    template ::value, bool> = true>
+    /* implicit */ SingletonOrSharedTypePtr(SingletonTypePtr p)
+        : repr_(SingletonTypePtr(p.get())) {}
+
+
+    // We need to support construction from T* for pybind. The problem
+    // is that it's not clear if we are supposed to be taking shared
+    // ownership or not.
+    //
+    // Case 1: if T is known statically to derive from SharedType, we should use
+    // shared_from_this() and take shared_ownership.
+    //
+    // Case 2: if T is exactly Type, we need to do a dynamic_cast to
+    // check if it's a SharedType and do the right thing.
+    //
+    // Case 3: Otherwise, T is not a SharedType. (debug-check this
+    // assumption!) Use a singleton pointer.
+
+    template ::value, bool> = true>
+    /* implicit */ SingletonOrSharedTypePtr(T* p) : SingletonOrSharedTypePtr(static_cast::type>(p)->shared_from_this()) {}
+
+    template ::value, bool> = true>
+    /* implicit */ SingletonOrSharedTypePtr(T* p) {
+      if (auto* shared_p = dynamic_cast::type>(p)) {
+        repr_ = Repr(shared_p->shared_from_this());
+      } else {
+        repr_ = Repr(p);
+      }
+    }
+
+    template ::value && !std::is_base_of::value, bool> = true>
+    /* implicit */ SingletonOrSharedTypePtr(T* p)
+        : repr_(p) {
+      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dynamic_cast::type>(p) == nullptr);
+    }
+
+    SingletonOrSharedTypePtr(const SingletonOrSharedTypePtr&) = default;
+    SingletonOrSharedTypePtr(SingletonOrSharedTypePtr&&) noexcept = default;
+    SingletonOrSharedTypePtr& operator=(const SingletonOrSharedTypePtr&) = default;
+    SingletonOrSharedTypePtr& operator=(SingletonOrSharedTypePtr&&) noexcept = default;
+
+    T* get() const {
+      return repr_.isSharedAndNonNull() ? repr_.shared_.repr_.get() : static_cast(repr_.rawRepr().first);
+    }
+
+    operator bool() const {
+      return repr_.isNonNull();
+    }
+
+    bool operator==(std::nullptr_t) const {
+      return !repr_.isNonNull();
+    }
+
+    bool operator!=(std::nullptr_t) const {
+      return repr_.isNonNull();
+    }
+
+    template , void>::value, bool> = true>
+    U& operator*() const {
+      return *get();
+    }
+
+    T* operator->() const {
+      return get();
+    }
+
+  private:
+    // NOTE: SharedPtrWrapper exists to work around a baffling bug in
+    // nvcc; see comment in destroy() below.
+    struct SharedPtrWrapper {
+      SharedPtrWrapper(std::shared_ptr &&x)
+          : repr_(std::move(x)) {}
+      std::shared_ptr repr_;
+    };
+    union Repr {
+      Repr() : Repr(nullptr) {}
+
+      explicit Repr(std::shared_ptr x)
+          : shared_(std::move(x)) {}
+
+      explicit Repr(std::nullptr_t)
+          : singletonRepr_(nullptr) {}
+
+      explicit Repr(SingletonTypePtr p)
+          : singletonRepr_(p.get()) {}
+
+      ~Repr() {
+        destroy();
+      }
+
+      // NOTE: the only non-UB way to access our null state is through
+      // rawRepr(), because our copy operation doesn't preserve which
+      // union member is active for null pointers.
+      Repr(const Repr& rhs) {
+        if (rhs.isSharedAndNonNull()) {
+          new (&shared_) SharedPtrWrapper(rhs.shared_);
+        } else {
+          singletonRepr_.singleton_ = static_cast(rhs.rawRepr().first);
+          TORCH_INTERNAL_ASSERT_DEBUG_ONLY(rhs.singletonRepr_.unused_ == nullptr);
+          singletonRepr_.unused_ = nullptr;
+        }
+      }
+
+      Repr(Repr&& rhs) noexcept {
+        if (rhs.isSharedAndNonNull()) {
+          new (&shared_) SharedPtrWrapper(std::move(rhs.shared_));
+        } else {
+          singletonRepr_.singleton_ = static_cast(rhs.rawRepr().first);
+          TORCH_INTERNAL_ASSERT_DEBUG_ONLY(rhs.singletonRepr_.unused_ == nullptr);
+          singletonRepr_.unused_ = nullptr;
+        }
+      }
+
+      Repr& operator=(const Repr& rhs) {
+        if (&rhs == this) {
+          return *this;
+        }
+        if (rhs.isSharedAndNonNull()) {
+          if (isSharedAndNonNull()) {
+            shared_ = rhs.shared_;
+          } else {
+            new (&shared_) SharedPtrWrapper(rhs.shared_);
+          }
+        } else {
+          if (isSharedAndNonNull()) {
+            destroy();
+          }
+          singletonRepr_.singleton_ = static_cast(rhs.rawRepr().first);
+          TORCH_INTERNAL_ASSERT_DEBUG_ONLY(rhs.rawRepr().nullIfSingleton_ == nullptr);
+          singletonRepr_.unused_ = nullptr;
+        }
+        return *this;
+      }
+
+      Repr& operator=(Repr&& rhs) noexcept {
+        if (&rhs == this) {
+          return *this;
+        }
+        if (rhs.isSharedAndNonNull()) {
+          if (isSharedAndNonNull()) {
+            shared_ = std::move(rhs.shared_);
+          } else {
+            new (&shared_) SharedPtrWrapper(std::move(rhs.shared_));
+          }
+        } else {
+          if (isSharedAndNonNull()) {
+            destroy();
+          }
+          singletonRepr_.singleton_ = static_cast(rhs.rawRepr().first);
+          TORCH_INTERNAL_ASSERT_DEBUG_ONLY(rhs.rawRepr().nullIfSingleton_ == nullptr);
+          singletonRepr_.unused_ = nullptr;
+        }
+        return *this;
+      }
+
+      SharedPtrWrapper shared_;
+
+      struct SingletonRepr {
+        explicit SingletonRepr(T* s) : singleton_(s) {}
+        T* singleton_;
+        void* unused_ = nullptr;
+      } singletonRepr_;
+      struct RawRepr {
+        void* first;
+        void* nullIfSingleton_;
+      };
+
+      // It is UB to read the singleton part of Repr if it was
+      // constructed as a shared_ptr and vice versa, but memcpying out
+      // the representation is always OK, so here's an accessor to obey
+      // the letter of the law.
+      RawRepr rawRepr() const {
+        RawRepr repr{};
+        memcpy(&repr, reinterpret_cast(this), sizeof(RawRepr));
+        return repr;
+      }
+
+      bool isNonNull() const {
+        auto repr = rawRepr();
+        TORCH_INTERNAL_ASSERT_DEBUG_ONLY(repr.nullIfSingleton_ == nullptr || repr.first != nullptr);
+        return repr.first != nullptr;
+      }
+
+      bool isSharedAndNonNull() const {
+        return rawRepr().nullIfSingleton_ != nullptr;
+      }
+
+     private:
+      void destroy() {
+        if (isSharedAndNonNull()) {
+          // Without SharedPtrWrapper, this line would read
+          // `shared_.~shared_ptr()` and nvcc would complain with
+          // "error: expected primary-expression before '>' token"
+          // referring to the "t" in "shared_ptr". SharedPtrWrapper
+          // exists to work around this compiler bug.
+          shared_.~SharedPtrWrapper();
+        }
+      }
+    } repr_;
+  };
+
+  using TypePtr = SingletonOrSharedTypePtr;
+  using Ptr = TypePtr;
+  using ElementType = Type;
+
+  // subtyping relation. By default, we return true for the case
+  // when the type is exactly equal or if this <: T where rhs = Optional[T]
+
+  // if this returns false and the why_not stream is non-null, it contains
+  // additional details that describe why this is not a subtype of 'rhs'.
+  // This additional information should only contain details that are not
+  // obvious from the annotation_str() that describes the type. For instance it
+  // is clear that `int <: str` is false but not clear why `Foo <: InterfaceBar`
+  // might be false.
+  virtual bool isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const;
+  virtual bool is_module() const;
+  bool isSubtypeOf(const Type& rhs) const {
+    return isSubtypeOfExt(rhs, nullptr);
+  }
+  // Compatibility shims to accommodate existing code that passes shared_ptrs
+  // around. Ideally, we would just delete this, but it should be harmless.
+  template 
+  typename std::enable_if::value, bool>::type
+  isSubtypeOf(const std::shared_ptr& rhs) const {
+    return isSubtypeOf(*rhs);
+  }
+
+  template 
+  typename std::enable_if::value, bool>::type
+  isSubtypeOf(const SingletonOrSharedTypePtr& rhs) const {
+    return isSubtypeOf(*rhs);
+  }
+
+  template 
+  typename std::enable_if::value, bool>::type
+  isSubtypeOf(SingletonTypePtr rhs) const {
+    return isSubtypeOf(*rhs);
+  }
+
+  template 
+  typename std::enable_if::value, bool>::type
+  isSubtypeOfExt(const SingletonOrSharedTypePtr& rhs, std::ostream* why_not) const {
+    return isSubtypeOfExt(*rhs, why_not);
+  }
+
+  template 
+  typename std::enable_if::value, bool>::type
+  isSubtypeOfExt(const std::shared_ptr& rhs, std::ostream* why_not) const {
+    return isSubtypeOfExt(*rhs, why_not);
+  }
+
+  template 
+  typename std::enable_if::value, bool>::type
+  isSubtypeOfExt(SingletonTypePtr rhs, std::ostream* why_not) const {
+    return isSubtypeOfExt(*rhs, why_not);
+  }
+
+  // How this type will appear in FunctionSchema declarations
+  virtual std::string str() const = 0;
+
+  // How this type will appear as if it were a type annotation in Python
+  // which is sometimes different from how it appears in declarations (e.g.
+  // int[] vs List[int])
+  //
+  // Takes a custom printer that users can pass in to customize the output of
+  // this method.
+  std::string annotation_str(TypePrinter printer) const {
+    if (printer) {
+      // the printer can return nullopt to fall through to the default impl
+      if (auto renamed = printer(*this)) {
+        return *renamed;
+      }
+    }
+    return annotation_str_impl(std::move(printer));
+  }
+  std::string annotation_str() const {
+    // Overload instead of defining a default value for `printer` to help
+    // debuggers out.
+    return annotation_str(nullptr);
+  }
+
+  // Returns a human readable string that includes additional information like
+  // "type is inferred rather than explicitly defined" to help construct more
+  // user-friendly messages.
+  virtual std::string repr_str() const {
+    return annotation_str();
+  }
+
+  TypeKind kind() const {
+    return kind_;
+  }
+
+  virtual bool isUnionType() const {
+    return false;
+  }
+
+  virtual bool requires_grad() const {
+    for (const auto& ct : containedTypes()) {
+      if (ct->requires_grad()) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  // Dynamically cast this object to the subclass indicated by the
+  // template variable, returning nullptr if the cast is invalid.
+  template ::value, bool> = true>
+  typename detail::CastReturnType::type cast() {
+    if (T::Kind == kind()) {
+      return std::static_pointer_cast(static_cast(this)->shared_from_this());
+    }
+    return nullptr;
+  }
+  template ::value, bool> = true>
+  typename detail::CastReturnType::type cast() {
+    if (T::Kind == kind()) {
+      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(this == T::get().get());
+      return typename detail::CastReturnType::type(static_cast(this));
+    }
+    return nullptr;
+  }
+  template ::value, bool> = true>
+  typename detail::CastConstReturnType::type cast() const {
+    if (T::Kind == kind()) {
+      return std::static_pointer_cast(static_cast(this)->shared_from_this());
+    }
+    return nullptr;
+  }
+  template ::value, bool> = true>
+  typename detail::CastConstReturnType::type cast() const {
+    if (T::Kind == kind()) {
+      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(this == T::get().get());
+      return typename detail::CastConstReturnType::type(static_cast(this));
+    }
+    return nullptr;
+  }
+  template 
+  T* castRaw() {
+    if (T::Kind == kind()) {
+      return static_cast(this);
+    }
+    return nullptr;
+  }
+  template 
+  const T* castRaw() const {
+    if (T::Kind == kind()) {
+      return static_cast(this);
+    }
+    return nullptr;
+  }
+  template 
+  auto expect() {
+    auto r = cast();
+    AT_ASSERT(r);
+    return r;
+  }
+  template 
+  auto expect() const {
+    auto r = cast();
+    AT_ASSERT(r);
+    return r;
+  }
+  template 
+  T& expectRef() {
+    auto* r = castRaw();
+    AT_ASSERT(r);
+    return *r;
+  }
+  template 
+  const T& expectRef() const {
+    auto* r = castRaw();
+    AT_ASSERT(r);
+    return *r;
+  }
+  virtual ~Type() = default;
+  virtual bool hasFreeVariables() const {
+    return false;
+  }
+  // list of types this type contains, e.g. for a List, the element type of
+  // the list; for a Tuple, the types of the tuple elements
+  virtual at::ArrayRef containedTypes() const {
+    return {};
+  }
+  virtual TypePtr containedType(size_t i) const {
+    return containedTypes().at(i);
+  }
+  virtual size_t containedTypeSize() const {
+    return containedTypes().size();
+  }
+  // create a new version of this type, replacing its contained types with
+  // contained_types
+  TypePtr withContained(std::vector contained_types);
+  // per-type constructor; you only need to override this if
+  // containedTypes() is not empty
+  virtual TypePtr createWithContained(
+      std::vector /*contained_types*/) const {
+    AT_ERROR(
+        "type with contained types did not overload createWithContained: ",
+        str());
+  }
+
+};
+
+template 
+using SingletonOrSharedTypePtr = Type::SingletonOrSharedTypePtr;
+
+
+template 
+bool operator==(const SingletonOrSharedTypePtr& x, const SingletonOrSharedTypePtr& y) {
+  return (void*)x.get() == (void*)y.get();
+}
+
+template 
+bool operator==(const SingletonOrSharedTypePtr& x, const std::shared_ptr& y) {
+  return (void*)x.get() == (void*)y.get();
+}
+
+template 
+bool operator==(const std::shared_ptr& x, const SingletonOrSharedTypePtr& y) {
+  return (void*)x.get() == (void*)y.get();
+}
+
+template 
+bool operator==(const SingletonOrSharedTypePtr& x, const SingletonTypePtr& y) {
+  return (void*)x.get() == (void*)y.get();
+}
+
+template 
+bool operator==(const SingletonTypePtr& x, const SingletonOrSharedTypePtr& y) {
+  return (void*)x.get() == (void*)y.get();
+}
+
+template 
+bool operator!=(const SingletonOrSharedTypePtr& x, const SingletonOrSharedTypePtr& y) {
+  return !(x == y);
+}
+
+template 
+bool operator!=(const SingletonOrSharedTypePtr& x, const std::shared_ptr& y) {
+  return !(x == y);
+}
+
+template 
+bool operator!=(const std::shared_ptr& x, const SingletonOrSharedTypePtr& y) {
+  return !(x == y);
+}
+
+template 
+bool operator!=(const SingletonOrSharedTypePtr& x, const SingletonTypePtr& y) {
+  return !(x == y);
+}
+
+template 
+bool operator!=(const SingletonTypePtr& x, const SingletonOrSharedTypePtr& y) {
+  return !(x == y);
+}
+
+using TypePtr = SingletonOrSharedTypePtr<Type>;
+using ConstTypePtr = SingletonOrSharedTypePtr<const Type>;
+
+// Explicitly enable MaybeOwned>, rather than allowing
+// MaybeOwned to be used for any type right away.
+template 
+struct MaybeOwnedTraits>
+    : public MaybeOwnedTraitsGenericImpl> {};
+
+// Base class for Types that are guaranteed to be owned by std::shared_ptr.
+struct TORCH_API SharedType : public Type, public std::enable_shared_from_this {
+  using Type::Type;
+};
+
+inline TypePtr Type::withContained(std::vector contained_types) {
+  auto current_contained = containedTypes();
+  // Types with no contained_types don't need this call. Check before calling!
+  //
+  // (We can't support this efficiently because types without
+  // contained types may be singletons, in which case
+  // shared_from_this will crash; we would have to provide a virtual
+  // typeptr_from_this or isSingleton.)
+  TORCH_INTERNAL_ASSERT(!current_contained.empty() && current_contained.size() == contained_types.size());
+  if (current_contained.equals(contained_types)) {
+    return std::static_pointer_cast(static_cast(this)->shared_from_this());
+  }
+  return createWithContained(std::move(contained_types));
+}
+
+
+TORCH_API inline bool operator==(const Type& lhs, const Type& rhs) {
+  if (C10_UNLIKELY(!rhs.symmetric())) {
+    return rhs.equals(lhs);
+  }
+  return lhs.equals(rhs);
+}
+
+struct NamedType;
+using NamedTypePtr = std::shared_ptr;
+using ConstNamedTypePtr = std::shared_ptr;
+
+struct TORCH_API NamedType : public SharedType {
+  NamedType(TypeKind tk, c10::optional name)
+      : SharedType(tk), name_(std::move(name)) {
+    TORCH_INTERNAL_ASSERT(
+        tk == TypeKind::TupleType || tk == TypeKind::FunctionType ||
+            tk == TypeKind::ClassType || tk == TypeKind::InterfaceType ||
+            tk == TypeKind::EnumType,
+        "If you add a new kind of NamedType, ",
+        "please update the cast specialization and this assert");
+  }
+
+  // Fully qualified name of type
+  // Looks like: "foo.bar.Baz".
+  const c10::optional& name() const {
+    return name_;
+  }
+
+ private:
+  c10::optional name_;
+};
+
+} // namespace c10
+
+namespace std {
+template 
+struct hash> {
+  size_t operator()(const c10::SingletonOrSharedTypePtr& x) const {
+    return std::hash()(x.get());
+  }
+};
+} // namespace std
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/op_registration/adaption.h b/MLPY/Lib/site-packages/torch/include/ATen/core/op_registration/adaption.h
new file mode 100644
index 0000000000000000000000000000000000000000..e6e555f3bb47b61665088f6ee2ca7179f5b120c0
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/op_registration/adaption.h
@@ -0,0 +1,83 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+/*
+ * [Note: hacky wrapper removal for optional tensor]
+ *
+ * The kernel implementation takes an optional tensor marked in the schema as
+ * Tensor? but the C++ function takes Tensor instead of the optional
+ * expected by the dispatcher.
+ *
+ * To remove the hacky wrapper, the C++ function is changed to take
+ * optional and unwrap the Tensor value at the beginning of
+ * the function, e.g.:
+ *   > c10::MaybeOwned weight_maybe_owned =
+ *   >     at::borrow_from_optional_tensor(weight_opt);
+ *   > const Tensor& weight = *weight_maybe_owned;
+ *
+ * We may want to make the kernel handle optional directly without
+ * going through the creation of a default-constructed Tensor in
+ * at::borrow_from_optional_tensor.
+ */
+
+/*
+ * [Note: hacky wrapper removal for TensorOptions]
+ *
+ * The kernel implementation takes a TensorOptions argument but the dispatcher
+ * expects separate arguments for dtype, layout, device, pin_memory.
+ *
+ * To remove the hacky wrapper, the kernel implementation is changed to take
+ * the 4 arguments (dtype, layout, device, pin_memory), and assemble the
+ * TensorOptions value at the beginning of the function, e.g.:
+ *   > TensorOptions options = TensorOptions().dtype(dtype).layout(layout)
+ *   >    .device(device).pinned_memory(pin_memory);
+ *
+ * We may want to make the kernel handle these parameters directly without going
+ * through the creation of a TensorOptions value.
+ */
+
+namespace c10 {
+namespace impl {
+
+TORCH_API void common_device_check_failure(Device common_device, const at::Tensor& tensor, at::CheckedFrom methodName, at::CheckedFrom argName);
+
+inline void check_and_update_common_device(optional& common_device, const at::Tensor& tensor, at::CheckedFrom methodName, at::CheckedFrom argName) {
+  // TODO: Remove this once the following issue is addressed:
+  // https://github.com/pytorch/pytorch/issues/57380
+  if (!tensor.defined()) {
+    return;
+  }
+
+  if (!common_device.has_value()) {
+    common_device = tensor.device();
+    return;
+  }
+
+  if (C10_UNLIKELY(common_device != tensor.device())) {
+    common_device_check_failure(*common_device, tensor, methodName, argName);
+  }
+}
+
+inline void check_and_update_common_device(optional& common_device, const optional& tensor, at::CheckedFrom methodName, at::CheckedFrom argName) {
+  if (tensor.has_value()) {
+    check_and_update_common_device(common_device, tensor.value(), methodName, argName);
+  }
+}
+
+inline void check_and_update_common_device(optional& common_device, at::ITensorListRef tensors, at::CheckedFrom methodName, at::CheckedFrom argName) {
+  for (const auto& tensor : tensors) {
+    check_and_update_common_device(common_device, tensor, methodName, argName);
+  }
+}
+
+inline void check_and_update_common_device(optional& common_device, const List>& tensors, at::CheckedFrom methodName, at::CheckedFrom argName) {
+  for (const auto& tensor : tensors) {
+    check_and_update_common_device(common_device, tensor, methodName, argName);
+  }
+}
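+// Illustrative usage sketch, roughly how a generated wrapper calls these
+// helpers before dispatching (operator and argument names are assumptions):
+//   c10::optional<Device> common_device = c10::nullopt;
+//   check_and_update_common_device(common_device, self,  "my_op", "self");
+//   check_and_update_common_device(common_device, other, "my_op", "other");
+//   // a device mismatch between `self` and `other` is reported through
+//   // common_device_check_failure(...)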
+} // namespace impl
+} // namespace c10
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/op_registration/infer_schema.h b/MLPY/Lib/site-packages/torch/include/ATen/core/op_registration/infer_schema.h
new file mode 100644
index 0000000000000000000000000000000000000000..7e089330f5a8681d0f41e6357fbc63b6f328b79d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/op_registration/infer_schema.h
@@ -0,0 +1,160 @@
+#pragma once
+
+/**
+ * This file contains functionality to take a C++ function and infer its
+ * c10::FunctionSchema.
+ */
+
+#include 
+#include 
+
+namespace c10 {
+namespace detail {
+
+namespace infer_schema {
+
+/// The templated inference code creates `ArgumentDef` instead of `Argument`,
+/// because that can be constructed at compile time and has a much smaller
+/// binary size than having calls to `Argument` constructors in the template.
+/// Creating `Argument` objects from `ArgumentDef` can then be done at
+/// runtime in a non-templated way.
+struct ArgumentDef final {
+  using GetTypeFn = TypePtr();
+  GetTypeFn* getTypeFn;
+  GetTypeFn* getFakeTypeFn;
+  constexpr ArgumentDef(): getTypeFn(nullptr), getFakeTypeFn(nullptr) {}
+  explicit constexpr ArgumentDef(GetTypeFn *getTypeFn, GetTypeFn *getFakeTypeFn): getTypeFn(getTypeFn), getFakeTypeFn(getFakeTypeFn) {}
+};
+
+template
+struct bool_t {};
+template<> struct bool_t : std::true_type {};
+template<> struct bool_t : std::false_type {};
+
+/// Checks the static C++ types `Types` for correctness to catch common error cases.
+template 
+constexpr int checkStaticTypes() {
+ // Give nice error messages for some of the common error cases.
+ // Use a LOUD ERROR MESSAGE SO USERS SEE THE STATIC_ASSERT
+ static_assert(std::conjunction<
+     bool_t::value || std::is_same::value || std::is_same::value || std::is_same::value>...
+   >::value, "INVALID TYPE: Only int8_t, int64_t and bool are supported as an integral argument type");
+ static_assert(std::conjunction<
+     bool_t::value>...
+   >::value, "INVALID TYPE: float is not supported as an argument type, use double instead");
+ return 0;
+}
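+// Illustrative consequences of the checks above (sketch):
+//   checkStaticTypes<int64_t, bool>();  // fine
+//   checkStaticTypes<int32_t>();        // static_assert: use int8_t/int64_t/bool
+//   checkStaticTypes<float>();          // static_assert: use double instead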
+
+template 
+constexpr std::array createArgumentVectorFromTypes(std::index_sequence) {
+  return (
+    // Check types for common errors
+    checkStaticTypes(),
+
+    // Create the return value
+    std::array{
+      ArgumentDef(&getTypePtrCopy>, &getFakeTypePtrCopy>)...}
+  );
+}
+
+/// Creates a vector of `ArgumentDef` from a list of C++ types that are specified
+/// as template arguments.
+template struct createArguments final {};
+template
+struct createArguments> final {
+  static constexpr std::array call() {
+    return createArgumentVectorFromTypes(
+        std::make_index_sequence()
+    );
+  }
+};
+
+/// Creates a vector of `ArgumentDef` from a list of C++ types that are specified
+/// as a tuple (i.e. in the way c10 kernels return values).
+/// It can be a tuple<A, B, C> if there are three output arguments with types A, B, C.
+/// It can be an empty tuple<>, or void for kernels that don't return anything.
+/// It can be a single type A (i.e. no tuple) for the case where a kernel just
+/// returns one value.
+template struct createReturns final {};
+
+template
+struct createReturns, void> final {
+  static constexpr std::array call() {
+    return createArgumentVectorFromTypes(
+        std::make_index_sequence()
+    );
+  }
+};
+
+template
+struct createReturns::value && !guts::is_instantiation_of::value>> final {
+  static constexpr std::array call() {
+    return createReturns>::call();
+  }
+};
+
+template<>
+struct createReturns final {
+  static constexpr std::array call() {
+    return createReturns>::call();
+  }
+};
+
+template 
+struct createSingleReturn {
+  static constexpr std::array call() {
+    return createArgumentVectorFromTypes(std::make_index_sequence<1>());
+  }
+};
+
+TORCH_API FunctionSchema make_function_schema(std::string&& name, std::string&& overload_name, c10::ArrayRef arguments, c10::ArrayRef returns);
+TORCH_API FunctionSchema make_function_schema(c10::ArrayRef arguments, c10::ArrayRef returns);
+
+/// Creates a `FunctionSchema` object from a `FunctionTraits` type for a
+/// function. Flattens std::tuple returns into multiple return types
+template 
+FunctionSchema createFunctionSchemaFromTraitsFlattenedReturns() {
+ using ReturnType = typename FunctionTraits::return_type;
+ using ParameterTypes = typename FunctionTraits::parameter_types;
+
+ // arguments and returns are computed into a std::array at compile time and embedded into the binary.
+ // The only code executed at runtime here is the one that creates a std::vector
+ // of the arguments/returns from the std::array.
+ constexpr auto arguments = createArguments::call();
+ constexpr auto returns = createReturns::call();
+
+ return make_function_schema(arguments, returns);
+}
+
+/// Creates a `FunctionSchema` object from a `FunctionTraits` type for a
+/// function. Preserves std::tuple returns as a Tuple return type
+template 
+FunctionSchema createFunctionSchemaFromTraitsSingleReturn(std::string&& name, std::string&& overload_name) {
+ using ReturnType = typename FunctionTraits::return_type;
+ using ParameterTypes = typename FunctionTraits::parameter_types;
+
+ // arguments and returns are computed into a std::array at compile time and embedded into the binary.
+ // The only code executed at runtime here is the one that creates a std::vector
+ // of the arguments/returns from the std::array.
+ constexpr auto arguments = createArguments::call();
+ constexpr auto returns = createSingleReturn::call();
+
+ return make_function_schema(std::move(name), std::move(overload_name), arguments, returns);
+}
+
+}
+}
+
+template
+FunctionSchema inferFunctionSchemaFlattenedReturns() {
+  return detail::infer_schema::createFunctionSchemaFromTraitsFlattenedReturns>();
+}
+
+template
+FunctionSchema inferFunctionSchemaSingleReturn(std::string&& name, std::string&& overload_name) {
+  return detail::infer_schema::createFunctionSchemaFromTraitsSingleReturn>(std::move(name), std::move(overload_name));
+}
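+// Illustrative sketch (assuming a plain free-function kernel; the exact
+// textual rendering of the schema is up to make_function_schema):
+//   Tensor my_kernel(const Tensor& a, int64_t b);
+//   auto schema = inferFunctionSchemaFlattenedReturns<decltype(my_kernel)>();
+//   // -> arguments (Tensor, int) and a single Tensor return; a std::tuple
+//   //    return type would instead be flattened into multiple returns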
+
+TORCH_API c10::optional findSchemaDifferences(const FunctionSchema& inferred, const FunctionSchema& specified);
+
+}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/op_registration/op_allowlist.h b/MLPY/Lib/site-packages/torch/include/ATen/core/op_registration/op_allowlist.h
new file mode 100644
index 0000000000000000000000000000000000000000..c2703c741fcbef3d2c45c5df2a7210b17aac6925
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/op_registration/op_allowlist.h
@@ -0,0 +1,199 @@
+#pragma once
+
+// TODO: unify to C10_MOBILE. In theory this header could be used in OSS.
+#ifdef TEMPLATE_SELECTIVE_BUILD
+#include 
+#endif
+
+/**
+ * This header implements functionality to build PyTorch with only a certain
+ * set of operators (+ dependencies) included.
+ *
+ * - Build with -DTORCH_OPERATOR_WHITELIST="aten::add;aten::sub" and only these
+ *   two ops will be included in your build.  The allowlist records operators
+ *   only, no overloads; if you include aten::add, all overloads of aten::add
+ *   will be included.
+ *
+ * Internally, this is done by removing the operator registration calls
+ * using compile time programming, and the linker will then prune all
+ * operator functions that weren't registered.
+ * See Note [Selective build] for more details
+ *
+ * WARNING: The allowlist mechanism doesn't work for all ways you could go about
+ * registering an operator.  If the dispatch key / operator name is not
+ * sufficiently obvious at compile time, then the allowlisting mechanism
+ * will fail (and the operator will be included in the binary anyway).
+ */
+
+#include 
+#include 
+#include 
+
+
+#if defined(ENABLE_RECORD_KERNEL_FUNCTION_DTYPE)
+#include 
+#endif
+
+namespace c10 {
+
+namespace impl {
+
+constexpr bool allowlist_contains(string_view allowlist, string_view item);  // Forward Declare
+
+/**
+ * In selective build mode returns true/false depending on whether a build
+ * feature is available or not.
+ *
+ * In instrumenting mode (tracing mode), always returns true, and doesn't
+ * trigger any side effects.
+ */
+constexpr bool is_build_feature_available(const char* name) {
+#if !defined(ENABLE_RECORD_KERNEL_FUNCTION_DTYPE)
+  // Selective Build mode.
+#if !defined(TORCH_BUILD_FEATURE_ALLOWLIST)
+  (void)name;
+  return true;
+#else
+  return allowlist_contains(
+    C10_STRINGIZE(TORCH_BUILD_FEATURE_ALLOWLIST),
+    name);
+#endif
+
+#else
+  // Instrumenting mode.
+  (void)name;
+  return true;
+#endif
+}
+
+[[noreturn]] void build_feature_required_feature_not_available(const char* feature);
+
+/**
+ * Use BUILD_FEATURE_REQUIRED macro in user-code.
+ *
+ * In selective build mode becomes a no-op if the build feature passed
+ * in is available. If not available, throws an exception (c10::Error).
+ * The compiler is able to perform dead code elimination for code
+ * following this method if the build feature is not available.
+ *
+ * In instrumenting mode (tracing mode), registers (as a side effect)
+ * the presence of this specific build feature being triggered.
+ */
+#if !defined(ENABLE_RECORD_KERNEL_FUNCTION_DTYPE)  // selective build mode
+
+#if defined(TORCH_BUILD_FEATURE_ALLOWLIST)
+#define BUILD_FEATURE_REQUIRED(NAME)                                 \
+  if (!c10::impl::is_build_feature_available(NAME)) {                \
+    ::c10::impl::build_feature_required_feature_not_available(NAME); \
+  }
+#else  // Everything trivially selected
+#define BUILD_FEATURE_REQUIRED(NAME)
+
+#endif
+
+#else  // trace mode
+#define BUILD_FEATURE_REQUIRED(NAME)  \
+  RECORD_FUNCTION_WITH_SCOPE(         \
+      at::RecordScope::BUILD_FEATURE, \
+      std::string(NAME),              \
+      {});
+#endif
+
+// Use this macro, and not is_build_feature_available
+#define BUILD_FEATURE_AVAILABLE(NAME) ::c10::impl::is_build_feature_available(NAME)
+
+// returns true iff allowlist contains item
+// allowlist_contains("a;bc;d", "bc") == true
+constexpr bool allowlist_contains(string_view allowlist, string_view item) {
+    // Choose a really big value for next so that if something goes wrong
+    // this code will blow up in a hopefully detectable way.
+    size_t next = std::numeric_limits<size_t>::max();
+    for (size_t cur = 0; cur <= allowlist.size(); cur = next) {
+      next = allowlist.find(';', cur);
+      if (next != string_view::npos) {
+        if (allowlist.substr(cur, next - cur).compare(item) == 0) {
+          return true;
+        }
+        next++;
+      } else {
+        if (allowlist.substr(cur).compare(item) == 0) {
+          return true;
+        }
+        break;
+      }
+    }
+    return false;
+}
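+// A few more illustrative cases for the matcher above (entries must match a
+// full ';'-separated segment, not a prefix):
+//   allowlist_contains("a;bc;d", "b")   == false
+//   allowlist_contains("a;bc;d", "d")   == true
+//   allowlist_contains("a;bc;d", "a;b") == false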
+
+// Returns true iff the given op name is on the allowlist
+// and should be registered
+constexpr bool op_allowlist_check(string_view op_name) {
+  assert(op_name.find("::") != string_view::npos);
+  // Use assert() instead of throw() due to a gcc bug. See:
+  // https://stackoverflow.com/questions/34280729/throw-in-constexpr-function
+  // https://github.com/fmtlib/fmt/issues/682
+  assert(op_name.find("(") == string_view::npos);
+#if !defined(TORCH_OPERATOR_WHITELIST)
+  // If the TORCH_OPERATOR_WHITELIST parameter is not defined,
+  // all ops are to be registered
+  return true;
+#else
+  return allowlist_contains(
+    C10_STRINGIZE(TORCH_OPERATOR_WHITELIST),
+    // This function is mainly used for mobile selective builds with
+    // root operators, where the overload is included in the allowlist.
+    op_name);
+    // // Strip overload name (as allowlist doesn't contain overloads)
+    // // Another function based on this may be added when there's usage
+    // // on op names without overload.
+    // OperatorNameView::parse(op_name).name);
+#endif
+}
+
+// Returns true iff the given schema string is on the allowlist
+// and should be registered
+constexpr bool schema_allowlist_check(string_view schema) {
+#if defined(TORCH_FORCE_SCHEMA_REGISTRATION)
+  return true;
+#else
+  return op_allowlist_check(schema.substr(0, schema.find("(")));
+#endif
+}
+
+// Returns true iff the given custom class name is on the allowlist
+// and should be registered
+constexpr bool custom_class_allowlist_check(string_view custom_class_name) {
+#if !defined(TORCH_CUSTOM_CLASS_ALLOWLIST)
+  // If the TORCH_CUSTOM_CLASS_ALLOWLIST parameter is not defined,
+  // all custom classes are to be registered
+  (void)custom_class_name;
+  return true;
+#else
+  return allowlist_contains(
+    C10_STRINGIZE(TORCH_CUSTOM_CLASS_ALLOWLIST),
+    custom_class_name);
+#endif
+}
+
+// schema_allowlist_check() implicitly depends on a macro, TORCH_OPERATOR_WHITELIST.
+// This function exists so callers can pass an arbitrary allowlist explicitly.
+constexpr bool op_allowlist_contains_name_in_schema(string_view allowlist, string_view schema) {
+  return allowlist_contains(allowlist, schema.substr(0, schema.find("(")));
+}
+
+// Returns true iff the given dispatch key is on the allowlist
+// and should be registered.  When we turn this on, the list of valid
+// mobile dispatch keys is hard coded (but you need to make sure
+// that you have the correct set of dispatch keys for this).
+constexpr bool dispatch_key_allowlist_check(DispatchKey /*k*/) {
+#ifdef C10_MOBILE
+  return true;
+  // Disabled for now: to be enabled later!
+  // return k == DispatchKey::CPU || k == DispatchKey::Vulkan || k == DispatchKey::QuantizedCPU || k == DispatchKey::BackendSelect || k == DispatchKey::CatchAll;
+#else
+  return true;
+#endif
+}
+
+} // namespace impl
+} // namespace c10
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/op_registration/op_registration.h b/MLPY/Lib/site-packages/torch/include/ATen/core/op_registration/op_registration.h
new file mode 100644
index 0000000000000000000000000000000000000000..751c3bfed81c0ab8fee17fa7caec35e1ed2a645d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/op_registration/op_registration.h
@@ -0,0 +1,596 @@
+#pragma once
+
+/**
+ * Include this file if you want to register operators. It includes all
+ * functionality needed to do so for you.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#if defined(EXPOSE_C2_OPS) || !defined(CAFFE2_IS_XPLAT_BUILD)
+#include 
+#endif
+#include 
+
+namespace c10 {
+
+namespace detail {
+// The first argument of the schema might be of type DispatchKeySet, in which case we remove it.
+// We do this because every argument in a function schema is expected to be convertible
+// to an IValue, but DispatchKeySet is not a type we want the jit to be aware of.
+// See Note [Plumbing Keys Through The Dispatcher]
+template
+std::unique_ptr inferFunctionSchemaFromFunctor() {
+  using func_type = typename c10::remove_DispatchKeySet_arg_from_func::func_type;
+  return std::make_unique(inferFunctionSchemaFlattenedReturns());
+}
+}
+
+/**
+ * An instance of this class handles the registration for one or more operators.
+ * Make sure you keep the RegisterOperators instance around since it will
+ * deregister the operator it's responsible for in its destructor.
+ *
+ * Example:
+ *
+ * > namespace {
+ * >   class my_kernel_cpu final : public c10::OperatorKernel {
+ * >   public:
+ * >     Tensor operator()(Tensor a, Tensor b) {...}
+ * >   };
+ * > }
+ * >
+ * > static auto registry = c10::RegisterOperators()
+ * >     .op(c10::RegisterOperators::options()
+ * >         .schema("my_op")
+ * >         .kernel(DispatchKey::CPU));
+ */
+class TORCH_API RegisterOperators final {
+public:
+  RegisterOperators() = default;
+  ~RegisterOperators() = default;
+
+  RegisterOperators(const RegisterOperators&) = delete;
+  RegisterOperators& operator=(const RegisterOperators&) = delete;
+  RegisterOperators(RegisterOperators&&) noexcept = default;
+  RegisterOperators& operator=(RegisterOperators&&) noexcept = default;
+
+  class TORCH_API Options final {
+  public:
+    Options(const Options&) = delete;
+    Options(Options&&) noexcept = delete;
+    Options& operator=(const Options&) = delete;
+    Options& operator=(Options&&) noexcept = delete;
+
+    // internal-only for registering stack based kernels
+    template
+    Options&& kernel(DispatchKey dispatch_key) && {
+      return std::move(*this).kernel(dispatch_key, KernelFunction::makeFromBoxedFunction(), nullopt, nullptr);
+    }
+
+    // internal-only for registering stack based catch-all kernels
+    template
+    Options&& catchAllKernel() && {
+      return std::move(*this).kernel(c10::nullopt, KernelFunction::makeFromBoxedFunction(), nullopt, nullptr);
+    }
+
+    // internal only for registering caffe2 ops
+    Options&& schema(FunctionSchema&& schema) {
+        TORCH_CHECK(!schemaOrName_.has_value(), "You can only specify the schema once per operator registration.");
+        schemaOrName_ = FunctionSchema(std::move(schema));
+        return std::move(*this);
+    }
+
+    /**
+     * Use this to specify the schema for an operator. You can also specify
+     * the operator name only to have the function signature part of the
+     * schema be inferred from the kernel function.
+     *
+     * Example:
+     *
+     * > // Infer function signature from my_kernel_cpu
+     * > static auto registry = c10::RegisterOperators()
+     * >     .op(c10::RegisterOperators::options()
+     * >         .schema("my_op")
+     * >         .kernel(DispatchKey::CPU));
+     * >
+     * >
+     * > // Explicitly specify full schema
+     * > static auto registry = c10::RegisterOperators()
+     * >     .op(c10::RegisterOperators::options()
+     * >         .schema("my_op(Tensor a) -> Tensor")
+     * >         .kernel(DispatchKey::CPU));
+     */
+    Options&& schema(const std::string& schemaOrName) {
+      TORCH_CHECK(!schemaOrName_.has_value(), "Tried to register operator ", schemaOrName," but specified schema multiple times. You can only specify the schema once per operator registration.");
+
+      #if !defined(EXPOSE_C2_OPS) && defined(CAFFE2_IS_XPLAT_BUILD)
+        throw std::logic_error("Tried to register operator " + schemaOrName + ". We don't support registering c10 ops on mobile yet because the function schema parser isn't present in the mobile build.");
+      #else
+        schemaOrName_ = torch::jit::parseSchemaOrName(schemaOrName);
+      #endif
+
+      return std::move(*this);
+    }
+
+    /**
+     * Use this to register an operator whose kernel is implemented as a functor.
+     * The kernel is only called for inputs matching the given dispatch key.
+     * You can register multiple kernels for different dispatch keys.
+     *
+     * Example:
+     *
+     * > namespace {
+     * >   class my_kernel_cpu final : public c10::OperatorKernel {
+     * >   public:
+     * >     Tensor operator()(Tensor a, Tensor b) {...}
+     * >   };
+     * > }
+     * >
+     * > static auto registry = c10::RegisterOperators()
+     * >     .op(c10::RegisterOperators::options()
+     * >         .schema("my_op")
+     * >         .kernel(DispatchKey::CPU));
+     *
+     * The functor constructor can take arguments to configure the kernel.
+     * The arguments are defined in the kernel registration.
+     * Example:
+     *
+     * > namespace {
+     * >   class my_kernel_cpu final : public c10::OperatorKernel {
+     * >   public:
+     * >     explicit my_kernel_cpu(std::string some_configuration, int a, bool b)
+     * >         : ... {...}
+     * >
+     * >     Tensor operator()(Tensor a, Tensor b) {...}
+     * >   };
+     * > }
+     * >
+     * > static auto registry = c10::RegisterOperators()
+     * >     .op(c10::RegisterOperators::options()
+     * >         .schema("my_op")
+     * >         .kernel(DispatchKey::CPU, "some_configuration", 3, true));
+     */
+    template
+    // enable_if: only enable it if KernelFunctor is actually a functor
+    std::enable_if_t::value, Options&&> kernel(DispatchKey dispatch_key, ConstructorParameters&&... constructorParameters) && {
+      static_assert(std::is_base_of::value, "Tried to register a kernel functor using the kernel() API, but it doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it.");
+      static_assert(std::is_constructible::value, "Wrong argument list for constructor of kernel functor. The arguments to kernel(arguments...) must match one of the constructors of Functor.");
+
+      return std::move(*this).kernel(
+        dispatch_key,
+        KernelFunction::makeFromUnboxedFunctor(std::make_unique(std::forward(constructorParameters)...)),
+        impl::CppSignature::make(),
+        detail::inferFunctionSchemaFromFunctor()
+      );
+    }
+
+    /**
+     * Use this to register an operator whose kernel is implemented as a functor.
+     * The kernel is a catch-all kernel, meaning it's called independently of
+     * the input. Dispatch is disabled for this operator.
+     *
+     * Example:
+     *
+     * > namespace {
+     * >   class my_kernel_cpu final : public c10::OperatorKernel {
+     * >   public:
+     * >     Tensor operator()(Tensor a, Tensor b) {...}
+     * >   };
+     * > }
+     * >
+     * > static auto registry = c10::RegisterOperators()
+     * >     .op(c10::RegisterOperators::options()
+     * >         .schema("my_op")
+     * >         .catchAllKernel());
+     *
+     * The functor constructor can take arguments to configure the kernel.
+     * The arguments are defined in the kernel registration.
+     * Example:
+     *
+     * > namespace {
+     * >   class my_kernel_cpu final : public c10::OperatorKernel {
+     * >   public:
+     * >     explicit my_kernel_cpu(std::string some_configuration, int a, bool b)
+     * >         : ... {...}
+     * >
+     * >     Tensor operator()(Tensor a, Tensor b) {...}
+     * >   };
+     * > }
+     * >
+     * > static auto registry = c10::RegisterOperators()
+     * >     .op(c10::RegisterOperators::options()
+     * >         .schema("my_op")
+     * >         .catchAllKernel("some_configuration", 3, true));
+     */
+    template
+    // enable_if: only enable it if KernelFunctor is actually a functor
+    std::enable_if_t::value, Options&&> catchAllKernel(ConstructorParameters&&... constructorParameters) && {
+      static_assert(std::is_base_of::value, "Tried to register a kernel functor using the kernel() API, but it doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it.");
+      static_assert(std::is_constructible::value, "Wrong argument list for constructor of kernel functor. The arguments to kernel(arguments...) must match one of the constructors of Functor.");
+
+      return std::move(*this).kernel(
+        c10::nullopt,
+        KernelFunction::makeFromUnboxedFunctor(std::make_unique(std::forward(constructorParameters)...)),
+        impl::CppSignature::make(),
+        detail::inferFunctionSchemaFromFunctor()
+      );
+    }
+
+    /**
+     * Use this to register an operator whose kernel is implemented by a function.
+     * The kernel is only called for inputs matching the given dispatch key.
+     * You can register multiple kernels for different dispatch keys.
+     *
+     * Example:
+     *
+     * > namespace { Tensor my_kernel_cpu(Tensor a, Tensor b) {...} }
+     * >
+     * > static auto registry = c10::RegisterOperators()
+     * >     .op(c10::RegisterOperators::options()
+     * >         .schema("my_op")
+     * >         .kernel(DispatchKey::CPU));
+     */
+    template
+    // enable_if: only enable it if FuncType is actually a function
+    std::enable_if_t::value, Options&&> kernel(DispatchKey dispatch_key) && {
+      static_assert(!std::is_same::value, "Tried to register a stackbased (i.e. internal) kernel function using the public kernel<...>() API. Please either use the internal kernel(...) API or also implement the kernel function as defined by the public API.");
+      static_assert(kernel_func != nullptr, "Kernel function cannot be nullptr");
+
+      return std::move(*this).kernel(
+        dispatch_key,
+        KernelFunction::makeFromUnboxedFunction(TORCH_FN(kernel_func)),
+        impl::CppSignature::make(),
+        // TODO Do schema inference without relying on WrapFunctionIntoFunctor
+        detail::inferFunctionSchemaFromFunctor>::type>()
+      );
+    }
+
+    /**
+     * Use this to register an operator whose kernel is implemented by a function.
+     * The kernel is a catch-all kernel, meaning it's called independent from
+     * the input. Dispatch is disabled for this operator.
+     *
+     * Example:
+     *
+     * > namespace { Tensor my_kernel_cpu(Tensor a, Tensor b) {...} }
+     * >
+     * > static auto registry = c10::RegisterOperators()
+     * >     .op(c10::RegisterOperators::options()
+     * >         .schema("my_op")
+     * >         .catchAllKernel<decltype(my_kernel_cpu), &my_kernel_cpu>());
+     */
+    template<class FuncType, FuncType* kernel_func>
+    // enable_if: only enable it if FuncType is actually a function
+    std::enable_if_t<guts::is_function_type<FuncType>::value, Options&&> catchAllKernel() && {
+      static_assert(!std::is_same<FuncType, KernelFunction::BoxedKernelFunction>::value, "Tried to register a stackbased (i.e. internal) kernel function using the public kernel<...>() API. Please either use the internal kernel(...) API or also implement the kernel function as defined by the public API.");
+      static_assert(kernel_func != nullptr, "Kernel function cannot be nullptr");
+
+      return std::move(*this).kernel(
+        c10::nullopt,
+        KernelFunction::makeFromUnboxedFunction(TORCH_FN(kernel_func)),
+        impl::CppSignature::make<FuncType>(),
+        // TODO Do schema inference without relying on WrapFunctionIntoFunctor
+        detail::inferFunctionSchemaFromFunctor<typename impl::WrapFunctionIntoFunctor<CompileTimeFunctionPointer<FuncType, kernel_func>>::type>()
+      );
+    }
+
+    template<class FuncType>
+    // enable_if: only enable it if FuncType is actually a function
+    std::enable_if_t<guts::is_function_type<FuncType>::value, Options&&> kernel(DispatchKey dispatch_key, FuncType* kernel_func) && {
+      static_assert(!std::is_same<FuncType, KernelFunction::BoxedKernelFunction>::value, "Tried to register a stackbased (i.e. internal) kernel function using the public kernel<...>() API. Please either use the internal kernel(...) API or also implement the kernel function as defined by the public API.");
+      TORCH_INTERNAL_ASSERT(kernel_func != nullptr, "Kernel function cannot be nullptr");
+
+      return std::move(*this).kernel(
+        dispatch_key,
+        KernelFunction::makeFromUnboxedRuntimeFunction(kernel_func),
+        impl::CppSignature::make<FuncType>(),
+        // TODO Do schema inference without relying on WrapFunctionIntoFunctor
+        detail::inferFunctionSchemaFromFunctor<impl::WrapFunctionIntoRuntimeFunctor<FuncType*>>()
+      );
+    }
+
+    template<class FuncType>
+    // enable_if: only enable it if FuncType is actually a function
+    std::enable_if_t<guts::is_function_type<FuncType>::value, Options&&> catchAllKernel(FuncType* kernel_func) && {
+      static_assert(!std::is_same<FuncType, KernelFunction::BoxedKernelFunction>::value, "Tried to register a stackbased (i.e. internal) kernel function using the public kernel<...>() API. Please either use the internal kernel(...) API or also implement the kernel function as defined by the public API.");
+      TORCH_INTERNAL_ASSERT(kernel_func != nullptr, "Kernel function cannot be nullptr");
+
+      return std::move(*this).kernel(
+        c10::nullopt,
+        KernelFunction::makeFromUnboxedRuntimeFunction(kernel_func),
+        impl::CppSignature::make<FuncType>(),
+        // TODO Do schema inference without relying on WrapFunctionIntoFunctor
+        detail::inferFunctionSchemaFromFunctor<impl::WrapFunctionIntoRuntimeFunctor<FuncType*>>()
+      );
+    }
+
+    /**
+     * Use this to register an operator whose kernel is implemented as a lambda.
+     * The kernel is only called for inputs matching the given dispatch key.
+     * You can register multiple kernels for different dispatch keys.
+     *
+     * The lambda must be stateless, i.e. not have a capture. If your kernel
+     * needs to store some configuration parameters, write the kernel as a
+     * functor instead.
+     *
+     * Example:
+     *
+     * > static auto registry = c10::RegisterOperators()
+     * >     .op(c10::RegisterOperators::options()
+     * >         .schema("my_op")
+     * >         .kernel(DispatchKey::CPU, [] (Tensor a) -> Tensor {...}));
+     */
+    template<class Lambda>
+    // enable_if: only enable it if Lambda is a functor (note: lambdas are functors)
+    std::enable_if_t<
+        guts::is_functor<std::decay_t<Lambda>>::value
+        && !std::is_same<typename guts::infer_function_traits_t<std::decay_t<Lambda>>::func_type, KernelFunction::BoxedKernelFunction>::value,
+        Options&&> kernel(DispatchKey dispatch_key, Lambda&& functor) && {
+      static_assert(!std::is_base_of<OperatorKernel, std::decay_t<Lambda>>::value, "The kernel(x) API for registering a kernel is only meant to be used with lambdas. Your kernel is a functor. Please use the kernel() API instead.");
+
+      // We don't support stateful lambdas (i.e. lambdas with a capture), because their
+      // behavior would be nonobvious. A functor kernel with cache gets a new instance of
+      // its cache each time the kernel is looked up from the dispatch table.
+      // A lambda with a capture would be global and share its capture between all kernel lookups.
+      // So, instead of making users having to think about it (including the thread-safety
+      // issues this causes), let's just forbid stateful lambdas altogether.
+      static_assert(guts::is_stateless_lambda<std::decay_t<Lambda>>::value, "The kernel(x) API for registering a kernel only works for stateless lambdas (i.e. lambdas without captures). If you need a cache, please use the functor based API kernel() instead.");
+
+      return std::move(*this).kernel(
+        dispatch_key,
+        KernelFunction::makeFromUnboxedLambda(std::forward<Lambda>(functor)),
+        impl::CppSignature::make<std::decay_t<Lambda>>(),
+        // TODO Do schema inference without relying on WrapFunctionIntoRuntimeFunctor
+        detail::inferFunctionSchemaFromFunctor<impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<Lambda>>>()
+      );
+    }
+
+    /**
+     * Use this to register an operator whose kernel is implemented as a lambda.
+     * The kernel is a catch-all kernel, meaning it's called independent from
+     * the input. Dispatch is disabled for this operator.
+     *
+     * The lambda must be stateless, i.e. not have a capture. If your kernel
+     * needs to store some configuration parameters, write the kernel as a
+     * functor instead.
+     *
+     * Example:
+     *
+     * > static auto registry = c10::RegisterOperators()
+     * >     .op(c10::RegisterOperators::options()
+     * >         .schema("my_op")
+     * >         .catchAllKernel([] (Tensor a) -> Tensor {...}));
+     */
+    template<class Lambda>
+    // enable_if: only enable it if Lambda is a functor (note: lambdas are functors)
+    std::enable_if_t<
+        guts::is_functor<std::decay_t<Lambda>>::value
+        && !std::is_same<typename guts::infer_function_traits_t<std::decay_t<Lambda>>::func_type, KernelFunction::BoxedKernelFunction>::value,
+        Options&&> catchAllKernel(Lambda&& lambda) && {
+      static_assert(!std::is_base_of<OperatorKernel, std::decay_t<Lambda>>::value, "The kernel(x) API for registering a kernel is only meant to be used with lambdas. Your kernel is a functor. Please use the kernel() API instead.");
+
+      // We don't support stateful lambdas (i.e. lambdas with a capture), because their
+      // behavior would be nonobvious.
+      // A lambda with a capture would be global and share its capture between all kernel lookups.
+      // This would be a likely source for unexpected race conditions, so we forbid it.
+      // If a kernel really needs global state, they can just have regular global state
+      // in their .cpp file next to the kernel lambda.
+      static_assert(guts::is_stateless_lambda<std::decay_t<Lambda>>::value, "The kernel(x) API for registering a kernel only works for stateless lambdas (i.e. lambdas without captures). If you need a cache, please use the functor based API kernel() instead.");
+
+      return std::move(*this).kernel(
+        c10::nullopt,
+        KernelFunction::makeFromUnboxedLambda(std::forward<Lambda>(lambda)),
+        impl::CppSignature::make<std::decay_t<Lambda>>(),
+        // TODO Do schema inference without relying on WrapFunctionIntoRuntimeFunctor
+        detail::inferFunctionSchemaFromFunctor<impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<Lambda>>>()
+      );
+    }
+
+    Options&& aliasAnalysis(AliasAnalysisKind aliasAnalysisKind) && {
+      TORCH_CHECK(!aliasAnalysisKind_.has_value(), "You can only call aliasAnalysis() once per operator registration.");
+      aliasAnalysisKind_ = aliasAnalysisKind;
+      return std::move(*this);
+    }
+
+  private:
+    Options&& kernel(c10::optional<DispatchKey> dispatch_key, KernelFunction&& func, c10::optional<impl::CppSignature> cpp_signature, std::unique_ptr<FunctionSchema>&& inferred_function_schema) && {
+      KernelRegistrationConfig config;
+      config.dispatch_key = dispatch_key;
+      config.func = std::move(func);
+      config.cpp_signature = cpp_signature;
+      config.inferred_function_schema = std::move(inferred_function_schema);
+      kernels.push_back(std::move(config));
+      return std::move(*this);
+    }
+
+    Options()
+    : schemaOrName_(c10::nullopt)
+    , kernels()
+    , aliasAnalysisKind_(c10::nullopt)
+    {}
+
+    // KernelRegistrationConfig accumulates all information from the config
+    // parameters passed to a RegisterOperators::op() call into one object.
+    struct KernelRegistrationConfig final {
+      KernelRegistrationConfig()
+        : dispatch_key(c10::nullopt)
+        , func()
+        , cpp_signature(c10::nullopt)
+        , inferred_function_schema(nullptr)
+      {}
+
+      c10::optional<DispatchKey> dispatch_key;
+      KernelFunction func;
+      c10::optional<impl::CppSignature> cpp_signature;
+      std::unique_ptr<FunctionSchema> inferred_function_schema;
+    };
+
+    c10::optional> schemaOrName_;
+
+    std::vector<KernelRegistrationConfig> kernels;
+    optional<AliasAnalysisKind> aliasAnalysisKind_;
+    friend class RegisterOperators;
+    friend class Library;
+  };
+
+  /**
+   * Call this to get an instance of registration options, which
+   * can be passed to a call to RegisterOperators::op() to specify
+   * these options for the operator registration.
+   * See class doc comment for examples.
+   */
+  static Options options() {
+    return {};
+  }
+
+  /**
+   * Call this to register an operator. See class doc comment for examples.
+   */
+  RegisterOperators&& op(Options&& options) && {
+    checkSchemaAndRegisterOp_(std::move(options));
+    return std::move(*this);
+  }
+
+  // Regular mutator version of the && version above
+  RegisterOperators& op(Options&& options) & {
+    checkSchemaAndRegisterOp_(std::move(options));
+    return *this;
+  }
+
+  /**
+   * This is a shorthand for RegisterOperators::op(Options) where you can
+   * specify the operator schema outside of the options parameter.
+   * See class doc comment for examples.
+   */
+  RegisterOperators&& op(const std::string& schemaOrName, Options&& options = RegisterOperators::options()) && {
+    return std::move(*this).op(std::move(options).schema(schemaOrName));
+  }
+
+  // internal only for registering caffe2 ops
+  RegisterOperators&& op(FunctionSchema schema, Options&& options) && {
+    return std::move(*this).op(std::move(options).schema(std::move(schema)));
+  }
+
+  template<class FuncType>
+  explicit RegisterOperators(const std::string& schemaOrName, FuncType&& func, Options&& options = RegisterOperators::options())
+  : RegisterOperators() {
+    std::move(*this).op(schemaOrName, std::forward<FuncType>(func), std::move(options));
+  }
+
+  /**
+   * This API registers an operator based on a kernel function pointer.
+   *
+   * Given a kernel
+   *
+   * > namespace { Tensor my_kernel_cpu(Tensor a, Tensor b) {...} }
+   *
+   * This API looks like:
+   *
+   * > static auto registry = c10::RegisterOperators()
+   * >     .op("my_op", &my_kernel_cpu);
+   *
+   * If your kernel is small and the overhead of calling it matters,
+   * then this API might be the wrong choice since the following API
+   * has a slightly lower overhead for calling into the kernel:
+   *
+   * > static auto registry = c10::RegisterOperators()
+   * >     .op("my_op", c10::RegisterOperators::options()
+   * >         .kernel());
+   *
+   * Or, alternatively, write your kernel as a functor:
+   *
+   * > namespace {
+   * >   class my_kernel_cpu final : public c10::OperatorKernel {
+   * >   public:
+   * >     Tensor operator()(Tensor a, Tensor b) {...}
+   * >   };
+   * > }
+   * >
+   * > static auto registry = c10::RegisterOperators()
+   * >     .op("my_op", c10::RegisterOperators::options()
+   * >         .kernel());
+   */
+   template<class FuncType>
+   // enable_if: only enable it if FuncType is actually a function, but not a stack based BoxedKernelFunction.
+   std::enable_if_t<guts::is_function_type<FuncType>::value && !std::is_same<FuncType, KernelFunction::BoxedKernelFunction>::value, RegisterOperators&&>
+   op(const std::string& schemaOrName, FuncType* func, Options&& options = RegisterOperators::options()) && {
+     constexpr bool AllowLegacyTypes = true;
+     return std::move(*this).op(std::move(options).schema(schemaOrName).kernel(
+       c10::nullopt,
+       KernelFunction::makeFromUnboxedRuntimeFunction<AllowLegacyTypes>(func),
+       impl::CppSignature::make<FuncType>(),
+       // TODO Do schema inference without relying on WrapFunctionIntoRuntimeFunctor
+       detail::inferFunctionSchemaFromFunctor<impl::WrapFunctionIntoRuntimeFunctor<FuncType*>>()
+     ));
+   }
+
+   /**
+    * This API registers an operator based on a kernel lambda.
+    *
+    * This API looks like:
+    *
+    * > static auto registry = c10::RegisterOperators()
+    * >     .op("my_op", [] (Tensor a, Tensor b) {...});
+    *
+    * This is equivalent to:
+    *
+    * > static auto registry = c10::RegisterOperators()
+    * >     .op("my_op", c10::RegisterOperators::options()
+    * >         .catchAllKernel([] (Tensor a, Tensor b) {...}));
+    *
+    */
+    template<class Lambda>
+    // enable_if: only enable it if Lambda is actually a stateless lambda
+    std::enable_if_t<guts::is_functor<Lambda>::value && guts::is_stateless_lambda<std::decay_t<Lambda>>::value, RegisterOperators&&>
+    op(const std::string& schemaOrName, Lambda&& lambda, Options&& options = RegisterOperators::options()) && {
+      static_assert(!std::is_base_of<OperatorKernel, Lambda>::value, "c10::OperatorKernel is part of the new kernel registration API and shouldn't be used together with the deprecated registration API. Please use the new RegisterOperators::options().kernel() based API instead.");
+
+      constexpr bool AllowLegacyTypes = true;
+      return std::move(*this).op(std::move(options).schema(schemaOrName).kernel(
+        c10::nullopt,
+        KernelFunction::makeFromUnboxedLambda<AllowLegacyTypes>(std::forward<Lambda>(lambda)),
+        impl::CppSignature::make<std::decay_t<Lambda>>(),
+        // TODO Do schema inference without relying on WrapFunctionIntoRuntimeFunctor
+        detail::inferFunctionSchemaFromFunctor<impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<Lambda>>>()
+      ));
+    }
+
+    template<class Lambda>
+    C10_DEPRECATED_MESSAGE("Registering operator kernels with stateful lambdas (i.e. lambdas with a capture) has non-obvious behavior. This is deprecated. Please use a lambda without a capture or a functor class instead.")
+    // enable_if: only enable it if Lambda is actually a functor but not a stateless lambda
+    std::enable_if_t<guts::is_functor<Lambda>::value && !guts::is_stateless_lambda<std::decay_t<Lambda>>::value, RegisterOperators&&>
+    op(const std::string& schemaOrName, Lambda&& lambda, Options&& options = RegisterOperators::options()) && {
+      static_assert(!std::is_base_of<OperatorKernel, Lambda>::value, "c10::OperatorKernel is part of the new kernel registration API and shouldn't be used together with the deprecated registration API. Please use the new RegisterOperators::options().kernel() based API instead.");
+
+      constexpr bool AllowLegacyTypes = true;
+      return std::move(*this).op(std::move(options).schema(schemaOrName).kernel(
+        c10::nullopt,
+        KernelFunction::makeFromUnboxedLambda<AllowLegacyTypes>(std::forward<Lambda>(lambda)),
+        impl::CppSignature::make<std::decay_t<Lambda>>(),
+        // TODO Do schema inference without relying on WrapFunctionIntoRuntimeFunctor
+        detail::inferFunctionSchemaFromFunctor<impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<Lambda>>>()
+      ));
+    }
+
+private:
+  void checkSchemaAndRegisterOp_(Options&& config);
+
+  static c10::FunctionSchema inferSchemaFromKernels_(const OperatorName& opNameStr, const Options& options);
+  void checkNoDuplicateKernels_(const Options& options);
+  void registerOp_(Options&& options);
+
+  std::vector<RegistrationHandleRAII> registrars_;
+};
+
+} // namespace c10
+
+namespace torch {
+  // Old-style API
+  using RegisterOperators = c10::RegisterOperators;
+}
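A minimal usage sketch of the options-based registration API documented in this header, assuming the schema string "my_namespace::my_op" and the kernel body, which are illustrative and not part of the file:

    #include <ATen/core/op_registration/op_registration.h>
    #include <ATen/core/Tensor.h>

    namespace {
    // A stateless kernel implemented as a plain function.
    at::Tensor my_add_cpu(const at::Tensor& a, const at::Tensor& b) {
      return a.add(b);
    }
    } // namespace

    // Register the schema and attach the CPU kernel via the runtime
    // function-pointer overload of Options::kernel().
    static auto registry = c10::RegisterOperators()
        .op(c10::RegisterOperators::options()
            .schema("my_namespace::my_op(Tensor a, Tensor b) -> Tensor")
            .kernel(c10::DispatchKey::CPU, &my_add_cpu));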
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/operator_name.h b/MLPY/Lib/site-packages/torch/include/ATen/core/operator_name.h
new file mode 100644
index 0000000000000000000000000000000000000000..83995e24f9122981968e01fba55deab23d904695
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/operator_name.h
@@ -0,0 +1,92 @@
+#pragma once
+
+#include <c10/macros/Macros.h>
+#include <c10/util/Optional.h>
+#include <c10/util/string_view.h>
+#include <cstring>
+#include <ostream>
+#include <string>
+#include <utility>
+
+namespace c10 {
+
+// TODO: consider storing namespace separately too
+struct OperatorName final {
+  std::string name;
+  std::string overload_name;
+  OperatorName(std::string name, std::string overload_name)
+      : name(std::move(name)), overload_name(std::move(overload_name)) {}
+
+  // TODO: These two functions below are slow!  Fix internal data structures so
+  // I don't have to manually reconstruct the namespaces!
+
+  // Return the namespace of this OperatorName, if it exists.  The
+  // returned string_view is only live as long as the OperatorName
+  // exists and name is not mutated
+  c10::optional<c10::string_view> getNamespace() const {
+    auto pos = name.find("::");
+    if (pos == std::string::npos) {
+      return c10::nullopt;
+    } else {
+      return c10::make_optional(c10::string_view(name.data(), pos));
+    }
+  }
+
+  // Returns true if we successfully set the namespace
+  bool setNamespaceIfNotSet(const char* ns) {
+    if (!getNamespace().has_value()) {
+      const auto ns_len = strlen(ns);
+      const auto old_name_size = name.size();
+      name.resize(ns_len + 2 + old_name_size);
+      // Shift current value of name to the end of the new space.
+      name.replace(name.size() - old_name_size, old_name_size, name, 0, old_name_size);
+      name.replace(0, ns_len, ns, ns_len);
+      name[ns_len] = ':';
+      name[ns_len + 1] = ':';
+      return true;
+    } else {
+      return false;
+    }
+  }
+};
+
+// Non-owning view of an OperatorName.  Unlike OperatorName, most of
+// its functions are constexpr, so it can be used for compile time
+// computations
+struct OperatorNameView final {
+  c10::string_view name;
+  c10::string_view overload_name;
+  constexpr OperatorNameView(c10::string_view name, c10::string_view overload_name)
+    : name(name), overload_name(overload_name) {}
+  // Parses strings like "foo.overload" and also "foo"
+  constexpr static OperatorNameView parse(c10::string_view full_name) {
+    auto i = full_name.find('.');
+    if (i == c10::string_view::npos) {
+      return OperatorNameView(full_name, c10::string_view());
+    } else {
+      return OperatorNameView(full_name.substr(0, i), full_name.substr(i + 1));
+    }
+  }
+};
+
+inline bool operator==(const OperatorName& lhs, const OperatorName& rhs) {
+  return lhs.name == rhs.name && lhs.overload_name == rhs.overload_name;
+}
+
+inline bool operator!=(const OperatorName& lhs, const OperatorName& rhs) {
+  return !operator==(lhs, rhs);
+}
+
+TORCH_API std::string toString(const OperatorName& opName);
+TORCH_API std::ostream& operator<<(std::ostream&, const OperatorName&);
+
+} // namespace c10
+
+namespace std {
+  template <>
+  struct hash<::c10::OperatorName> {
+    size_t operator()(const ::c10::OperatorName& x) const {
+      return std::hash<std::string>()(x.name) ^ (~ std::hash<std::string>()(x.overload_name));
+    }
+  };
+}
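A short sketch of how OperatorName is typically used; the operator and namespace names are illustrative:

    #include <ATen/core/operator_name.h>

    c10::OperatorName op("aten::add", "Tensor");   // name plus overload name
    auto ns = op.getNamespace();                   // optional string_view holding "aten"
    c10::OperatorName bare("my_op", "");
    bare.setNamespaceIfNotSet("my_namespace");     // name becomes "my_namespace::my_op"
    bool same = (op == op);                        // equality compares both fields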
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/qualified_name.h b/MLPY/Lib/site-packages/torch/include/ATen/core/qualified_name.h
new file mode 100644
index 0000000000000000000000000000000000000000..fcc5bdada9b276b2aa7745d87d47c850c95c6894
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/qualified_name.h
@@ -0,0 +1,161 @@
+#pragma once
+
+#include <c10/util/ArrayRef.h>
+#include <c10/util/Exception.h>
+#include <c10/util/irange.h>
+#include <string>
+#include <vector>
+
+namespace c10 {
+
+// Represents a name of the form "foo.bar.baz"
+struct QualifiedName {
+  QualifiedName() = default;
+
+  // `name` can be a dotted string, like "foo.bar.baz", or just a bare name.
+  /* implicit */ QualifiedName(const std::string& name) {
+    TORCH_CHECK(!name.empty());
+    // split the string into its atoms.
+    size_t startSearchFrom = 0;
+    size_t pos = name.find(delimiter_, startSearchFrom);
+
+    while (pos != std::string::npos) {
+      auto atom = name.substr(startSearchFrom, pos - startSearchFrom);
+      TORCH_INTERNAL_ASSERT(
+          !atom.empty(), "Invalid name for qualified name: '", name, "'");
+      atoms_.push_back(std::move(atom));
+      startSearchFrom = pos + 1;
+      pos = name.find(delimiter_, startSearchFrom);
+    }
+
+    auto finalAtom = name.substr(startSearchFrom);
+    TORCH_INTERNAL_ASSERT(
+        !finalAtom.empty(), "Invalid name for qualified name: '", name, "'");
+    atoms_.emplace_back(std::move(finalAtom));
+
+    cacheAccessors();
+  }
+
+  explicit QualifiedName(std::vector<std::string> atoms) : atoms_(std::move(atoms)) {
+    for (const auto& atom : atoms_) {
+      TORCH_CHECK(!atom.empty(), "Atom cannot be empty");
+      TORCH_CHECK(
+          atom.find(delimiter_) == std::string::npos,
+          "Delimiter not allowed in atom");
+    }
+
+    cacheAccessors();
+  }
+  // Unnecessary copy. Ideally we'd use something like std::string_view.
+  /* implicit */ QualifiedName(const char* name)
+      : QualifiedName(std::string(name)) {}
+
+  // `name` must be a bare name (no dots!)
+  explicit QualifiedName(const QualifiedName& prefix, std::string name) {
+    TORCH_INTERNAL_ASSERT(!name.empty());
+    TORCH_INTERNAL_ASSERT(name.find(delimiter_) == std::string::npos);
+    atoms_.insert(atoms_.begin(), prefix.atoms_.begin(), prefix.atoms_.end());
+    atoms_.push_back(std::move(name));
+
+    cacheAccessors();
+  }
+
+  // Is `this` a prefix of `other`?
+  // For example, "foo.bar" is a prefix of "foo.bar.baz"
+  bool isPrefixOf(const QualifiedName& other) const {
+    const auto& thisAtoms = atoms_;
+    const auto& otherAtoms = other.atoms_;
+
+    if (thisAtoms.size() > otherAtoms.size()) {
+      // Can't be a prefix if it's bigger
+      return false;
+    }
+    for (const auto i : c10::irange(thisAtoms.size())) {
+      if (thisAtoms[i] != otherAtoms[i]) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // The fully qualified name, like "foo.bar.baz"
+  const std::string& qualifiedName() const {
+    return qualifiedName_;
+  }
+
+  // The leading qualifier, like "foo.bar"
+  const std::string& prefix() const {
+    return prefix_;
+  }
+
+  // The base name, like "baz"
+  const std::string& name() const {
+    return name_;
+  }
+
+  const std::vector<std::string>& atoms() const {
+    return atoms_;
+  }
+
+  bool operator==(const QualifiedName& other) const {
+    return this->qualifiedName_ == other.qualifiedName_;
+  }
+
+  bool operator!=(const QualifiedName& other) const {
+    return !(*this == other);
+  }
+
+ private:
+  static constexpr char delimiter_ = '.';
+
+  // Helper for cacheAccessors() below.
+  template <typename T>
+  std::string join(char delimiter, const T& v) {
+    std::string out;
+    size_t reserve = 0;
+    for (const auto& e : v) {
+      reserve += e.size() + 1;
+    }
+    out.reserve(reserve);
+    for (const auto i : c10::irange(v.size())) {
+      if (i != 0) {
+        out.push_back(delimiter);
+      }
+      out.append(v[i]);
+    }
+    return out;
+  }
+
+  void cacheAccessors() {
+    qualifiedName_ = join(delimiter_, atoms_);
+    if (atoms_.size() > 1) {
+      ArrayRef<std::string> view(atoms_);
+      const auto prefixView = view.slice(0, view.size() - 1);
+      prefix_ = join(delimiter_, prefixView);
+    }
+
+    if (!atoms_.empty()) {
+      name_ = atoms_.back();
+    }
+  }
+
+  // The actual list of names, like "{foo, bar, baz}"
+  std::vector<std::string> atoms_;
+
+  /*
+   * Cached accessors, derived from `atoms_`.
+   */
+  std::string qualifiedName_;
+  std::string prefix_;
+  std::string name_;
+};
+} // namespace c10
+
+namespace std {
+template <>
+struct hash<c10::QualifiedName> {
+  size_t operator()(const c10::QualifiedName& n) const noexcept {
+    return std::hash<std::string>()(n.qualifiedName());
+  }
+};
+} // namespace std
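A small illustration of the QualifiedName accessors defined above; the dotted names are illustrative:

    #include <ATen/core/qualified_name.h>

    c10::QualifiedName qn("foo.bar.baz");
    // qn.qualifiedName() == "foo.bar.baz"
    // qn.prefix()        == "foo.bar"
    // qn.name()          == "baz"
    c10::QualifiedName parent("foo.bar");
    bool is_prefix = parent.isPrefixOf(qn);    // true
    c10::QualifiedName child(parent, "qux");   // "foo.bar.qux"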
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/rref_interface.h b/MLPY/Lib/site-packages/torch/include/ATen/core/rref_interface.h
new file mode 100644
index 0000000000000000000000000000000000000000..c31ea40902dc8432fcabbbf8401a6d8acd8cc4e5
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/rref_interface.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include 
+#include 
+
+namespace c10 {
+
+struct Type;
+using worker_id_t = int16_t;
+
+// This abstract class contains only user-facing APIs, and will be shared
+// between jit and distributed to implement TorchScript support.
+class C10_EXPORT RRefInterface : public c10::intrusive_ptr_target {
+ public:
+  RRefInterface() = default;
+  // RRef is made NOT copyable NOT movable to prevent messing up reference
+  // counting.
+  RRefInterface(const RRefInterface& other) = delete;
+  RRefInterface(RRefInterface&& other) = delete;
+  RRefInterface& operator=(RRefInterface&& other) = delete;
+
+  ~RRefInterface() override = default;
+
+  // returns the worker id of the owner
+  virtual worker_id_t owner() const = 0;
+
+  // returns the worker name of the owner
+  virtual std::string ownerName() const = 0;
+
+  // Returns true if this is the ``OwnerRRef``
+  virtual bool isOwner() const = 0;
+
+  // Returns true if this is an ``OwnerRRef`` or if this ``UserRRef`` has been
+  // confirmed by its owner.
+  virtual bool confirmedByOwner() const = 0;
+
+  virtual const TypePtr type() const = 0;
+};
+
+}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/stack.h b/MLPY/Lib/site-packages/torch/include/ATen/core/stack.h
new file mode 100644
index 0000000000000000000000000000000000000000..6aac6f102d4ebdfff37980f79c29e688ed3901e0
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/stack.h
@@ -0,0 +1,200 @@
+#pragma once
+
+#include <type_traits>
+
+#include <ATen/core/ivalue.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/irange.h>
+
+// TODO move this to c10 namespace
+
+namespace torch {
+namespace jit {
+
+using c10::IValue;
+using Stack = std::vector<c10::IValue>;
+
+class Operation {
+  template <typename F>
+  using accepts = std::is_constructible<std::function<void(Stack&)>, F&&>;
+
+ public:
+  template <typename F, std::enable_if_t<accepts<F, Stack*>::value, int> = 0>
+  C10_DEPRECATED_MESSAGE("Please use void(Stack&) to register operator instead.")
+  Operation(F&& raw): op_([raw = std::forward<F>(raw)](Stack& stack) {
+    raw(&stack);
+  }) {}
+
+  template <typename F, std::enable_if_t<accepts<F, Stack&>::value &&
+                !std::is_same<std::decay_t<F>, Operation>::value, int> = 0>
+  Operation(F&& op): op_(std::forward<F>(op)) {}
+
+  Operation(std::nullptr_t) noexcept {}
+
+  explicit operator bool() const noexcept {
+    return op_ ? true : false;
+  }
+
+  void operator()(Stack& stack) {
+    op_(stack);
+  }
+
+  template <typename T>
+  T* target() noexcept {
+    return op_.target<T>();
+  }
+
+ private:
+  std::function<void(Stack&)> op_;
+};
+
+// An operation with N inputs and M outputs pops the last N inputs off
+// the stack and pushes its M outputs onto the stack
+// before:  I0, I1, ... IN <- stack.back()
+// after:  O0, O1, ... OM
+// operations are defined this way so that ownership of inputs can be
+// transferred to the operation and it can incrementally drop ownership of
+// tensors when they become unneeded. For large operations, like 'run an entire
+// subgraph', this functionality is very important for minimizing gpu memory
+// usage return value is the relative 'offset' to jump to for the next
+// operation:
+//   pc += 1 + offset
+// so a return value of 0 goes to the next instruction
+
+// treat the last N elements of the stack as a list, looking up
+// element i
+static inline IValue& peek(Stack& stack, size_t i, size_t N) {
+  return *(stack.end() - N + i);
+}
+static inline IValue& peek(Stack* stack, size_t i, size_t N) {
+  return peek(*stack, i, N);
+}
+static inline const IValue& peek(const Stack& stack, size_t i, size_t N) {
+  return *(stack.end() - N + i);
+}
+static inline const IValue& peek(const Stack* stack, size_t i, size_t N) {
+  return peek(*stack, i, N);
+}
+// treat the last N elements of the stack as a list, looking up the
+// slice starting at index i and having length len
+static inline at::ArrayRef<IValue> peekSlice(
+    const Stack& stack,
+    size_t i,
+    size_t len,
+    size_t N) {
+  return at::ArrayRef<IValue>(stack).slice(stack.size() - N + i, len);
+}
+static inline at::ArrayRef<IValue> last(const Stack& stack, size_t N) {
+  return peekSlice(stack, 0, N, N);
+}
+static inline at::ArrayRef<IValue> last(const Stack* stack, size_t N) {
+  return last(*stack, N);
+}
+static inline void drop(Stack& stack, size_t n) {
+  stack.erase(stack.end() - n, stack.end());
+}
+static inline void drop(Stack* stack, size_t n) {
+  drop(*stack, n);
+}
+static inline IValue pop(Stack& stack) {
+  auto r = std::move(stack.back());
+  stack.pop_back();
+  return r;
+}
+static inline IValue pop(Stack* stack) {
+  return pop(*stack);
+}
+static inline std::vector<IValue> pop(Stack& stack, size_t n) {
+  std::vector<IValue> result;
+  result.reserve(n);
+  for (const auto i : c10::irange(n)) {
+    result.push_back(std::move(peek(stack, i, n)));
+  }
+  drop(stack, n);
+  return result;
+}
+
+// variadic pop:
+// int64_t a; at::Tensor b;
+// pop(stack, a, b);
+// equivalent to:
+// b = pop(stack).toTensor();
+// a = pop(stack).toInt();
+template <typename... Types>
+static inline void pop(Stack& stack, Types&... args) {
+  size_t i = 0;
+  constexpr size_t N = sizeof...(args);
+  (void)std::initializer_list<int>{
+      (args = std::move(peek(stack, i++, N)).template to<Types>(), 0)...};
+  drop(stack, N);
+}
+template <typename... Types>
+static inline void pop(Stack* stack, Types&... args) {
+  pop(*stack, args...);
+}
+template <typename Type>
+static inline void push_one(Stack& stack, Type&& arg) {
+  stack.emplace_back(std::forward<Type>(arg));
+}
+
+static inline void push_one(Stack& stack, c10::TensorOptions options) {
+  stack.emplace_back(c10::typeMetaToScalarType(options.dtype()));
+  stack.emplace_back(options.layout());
+  stack.emplace_back(options.device());
+  stack.emplace_back(options.pinned_memory());
+}
+
+template <typename... Types>
+static inline void push(Stack& stack, Types&&... args) {
+  (void)std::initializer_list<int>{(push_one(stack, std::forward<Types>(args)), 0)...};
+}
+template <typename... Types>
+static inline void push(Stack* stack, Types&&... args) {
+  return push(*stack, std::forward<Types>(args)...);
+}
+template <class T>
+static inline void push_list_elements(Stack& stack, const c10::List<T>& elements) {
+  for (T elem : elements) {
+    stack.push_back(std::move(elem));
+  }
+}
+
+// The packer here is carefully written not to make any unnecessary
+// copies.
+
+// pack takes the return values of aten functions pushes them onto the stack
+template <typename T>
+inline void pack(Stack& stack, T&& v) {
+  stack.emplace_back(std::forward<T>(v));
+}
+template <typename T>
+inline void pack(Stack* stack, T&& v) {
+  pack(*stack, std::forward<T>(v));
+}
+
+template <std::size_t remaining, typename... Args>
+struct TuplePacker {
+  // NB: *Not* a universal reference.
+  static void execute(Stack& stack, std::tuple<Args...>&& t) {
+    // NB: The move here does not "destroy" the entire tuple, that is
+    // not what std::move does; only the particular tuple index
+    // processed here gets stolen.
+    pack(stack, std::get<sizeof...(Args) - remaining>(std::move(t)));
+    TuplePacker<remaining - 1, Args...>::execute(stack, std::move(t));
+  }
+};
+
+template <typename... Args>
+struct TuplePacker<0, Args...> {
+  static void execute(Stack& /*stack*/, std::tuple<Args...>&& /*t*/){};
+};
+
+template <typename... Args>
+inline void pack(Stack& stack, std::tuple<Args...>&& t) {
+  TuplePacker<sizeof...(Args), Args...>::execute(stack, std::move(t));
+}
+
+} // namespace jit
+} // namespace torch
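A minimal sketch of how a boxed kernel might use the stack helpers above; the kernel name and tensor arithmetic are illustrative:

    #include <ATen/core/stack.h>

    void boxed_add(torch::jit::Stack& stack) {
      using torch::jit::pop;
      using torch::jit::push;
      // The callee owns the top N inputs; pop them in reverse declaration order.
      at::Tensor b = pop(stack).toTensor();
      at::Tensor a = pop(stack).toTensor();
      push(stack, a.add(b));   // push the single output
    }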
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/symbol.h b/MLPY/Lib/site-packages/torch/include/ATen/core/symbol.h
new file mode 100644
index 0000000000000000000000000000000000000000..3e23098d098259d6e914cab655cc1f3805ef4753
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/symbol.h
@@ -0,0 +1,147 @@
+#pragma once
+#include <c10/macros/Macros.h>
+#include <cstdint>
+#include <functional>  // For std::hash
+#include <string>
+
+
+namespace c10 {
+
+// 'prim' symbols are synthetic operators that occur only in the IR
+// and don't have corresponding implementations in ATen.
+
+// 'onnx' symbols correspond to ONNX operators.  Their semantics
+// are defined in https://github.com/onnx/onnx/blob/master/docs/Operators.md
+// The particular version we are targeting is specified by '_onnx_opset_version'
+// in torch.onnx.symbolic_helper
+//
+// In general, most ONNX operators won't get an entry here, because they
+// are handled from the Python end.  However, you may occasionally need
+// to intern an ONNX symbol here so that you can conveniently write an
+// optimization on ONNX operations.
+
+// 'attr' symbols are attribute keys.  They are shared between both ONNX and ATen
+// operators (you disambiguate their meaning by looking at the operator itself).
+// In general, you only need to define attribute keys that are used by
+// onnx or prim; ATen attributes are automatically generated in FORALL_ATTR_BASE_SYMBOLS.
+
+// Note [Symbol allocation]
+// ~~~~~~~~~~~~~~~~~~~~~~~~
+//
+//  1. Symbol namespace is split up into namespaces.
+//
+//  2. The intended access pattern for built-in symbols is onnx::MatMul
+//  in the c10 namespace (this is a Symbol).
+//
+
+// Built-in constant definition strategy:
+// - Enum is the most convenient way to generate a contiguous sequence
+//   of numbers for an identifier.
+// - However, an enum gives you a fresh type.  We want onnx::MatMul to
+//   be type Symbol, not some random enum type!
+// - Therefore, after using enums to generate the sequence of integers,
+//   we then declare constexpr Symbols to get everything the actual Symbol
+//   type we want.  Symbols must be constexpr to be valid to be "case"ed on.
+
+using unique_t = uint32_t;
+
+const std::string& domain_prefix();
+
+// A Symbol is like an interned string, but with a little extra
+// structure; it is namespaced via SymbolNamespace and the resulting
+// intern pointers support efficient namespace testing.
+struct TORCH_API Symbol {
+  explicit constexpr Symbol() : value(0) {};
+  explicit constexpr Symbol(unique_t uniq)
+  : value(uniq) {}
+
+  // Get a Symbol for a qualified string like "attr::bar"
+  static Symbol fromQualString(const std::string & s);
+
+  // Get a Symbol from a domain and an unqualified string like "org.pytorch.attr" and "bar"
+  static Symbol fromDomainAndUnqualString(const std::string & d, const std::string & s);
+
+  // Constructors for our various namespaced strings.  This will construct
+  // the appropriate namespaced string, e.g., "attr::foo" for the
+  // argument "foo", and then attempt to intern it.  DO NOT USE THIS
+  // with a string literal; attr::foo should be available in that case
+  // (and if it's not, you should add it to the built-ins list above.)
+  static Symbol attr(const std::string & s);
+  static Symbol aten(const std::string & s);
+  static Symbol cuda(const std::string & s);
+  static Symbol onnx(const std::string & s);
+  static Symbol prim(const std::string & s);
+  static Symbol user(const std::string & s);
+  static Symbol caffe2(const std::string & s);
+  static Symbol dimname(const std::string & s);
+  // TODO: eliminate me
+  static Symbol scope(const std::string & s);
+
+  bool is_attr() const;
+  bool is_aten() const;
+  bool is_cuda() const;
+  bool is_prim() const;
+  bool is_prims() const;
+  bool is_nvprims() const;
+  bool is_onnx() const;
+  bool is_user() const;
+  bool is_caffe2() const;
+  bool is_dimname() const;
+
+  // So we can switch on this
+  constexpr operator unique_t() const {
+    return value;
+  }
+
+  Symbol ns() const;
+
+  // Give a string corresponding to the unqualified version of this name, e.g.,
+  // "mm". Use this in a context where the intended namespace of the string is
+  // obvious; this is a *lossy* conversion.
+  const char * toUnqualString() const;
+
+  // Give a string corresponding to the qualified version of this name,
+  // e.g., "aten::mm".  This string format is made available to Python bindings
+  // (so we know how to parse it.)
+  const char * toQualString() const;
+
+  // This describes a symbol in a case where humans read it.  At the moment it's
+  // the same as toQualString.  This has to be a const char* returned because
+  // a lot of printf style macros use it.
+  const char * toDisplayString() const;
+
+  // Give a string corresponding to the domain name for the symbol,
+  // e.g., "org.pytorch.aten".
+  std::string domainString() const;
+
+private:
+
+  explicit Symbol(Symbol ns, const std::string & s);
+  unique_t value;
+};
+
+static inline bool operator==(Symbol lhs, Symbol rhs) {
+  return static_cast<unique_t>(lhs) == static_cast<unique_t>(rhs);
+}
+
+inline Symbol Symbol::attr(const std::string & s) { return Symbol::fromQualString("attr::" + s); }
+inline Symbol Symbol::aten(const std::string & s)  { return Symbol::fromQualString("aten::" + s); }
+inline Symbol Symbol::cuda(const std::string & s)  { return Symbol::fromQualString("cuda::" + s); }
+inline Symbol Symbol::onnx(const std::string & s)  { return Symbol::fromQualString("onnx::" + s); }
+inline Symbol Symbol::prim(const std::string & s)  { return Symbol::fromQualString("prim::" + s); }
+inline Symbol Symbol::scope(const std::string & s) { return Symbol::fromQualString("scope::" + s); }
+inline Symbol Symbol::user(const std::string & s) { return Symbol::fromQualString("user::" + s); }
+inline Symbol Symbol::caffe2(const std::string & s) { return Symbol::fromQualString("_caffe2::" + s); }
+inline Symbol Symbol::dimname(const std::string & s) { return Symbol::fromQualString("dimname::" + s); }
+
+} // namespace c10
+
+// make symbol behave like an integer in hash tables
+namespace std {
+template <>
+struct hash<c10::Symbol> {
+  size_t operator()(c10::Symbol s) const {
+    return std::hash<uint32_t>()(static_cast<uint32_t>(s));
+  }
+};
+}
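A brief sketch of interning and querying a Symbol; the symbol names are illustrative:

    #include <ATen/core/symbol.h>

    c10::Symbol mm = c10::Symbol::fromQualString("aten::mm");
    bool is_aten = mm.is_aten();                    // true
    const char* unqual = mm.toUnqualString();       // "mm"
    const char* qual = mm.toQualString();           // "aten::mm"
    c10::Symbol attr = c10::Symbol::attr("value");  // same as fromQualString("attr::value")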
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/type_factory.h b/MLPY/Lib/site-packages/torch/include/ATen/core/type_factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..771cc65c43b0431c8a43ed8595980fb1d9d819fd
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/type_factory.h
@@ -0,0 +1,108 @@
+#pragma once
+
+#include <type_traits>
+#include <unordered_map>
+
+#include <ATen/core/dynamic_type.h>
+#include <ATen/core/jit_type_base.h>
+#include <c10/macros/Macros.h>
+
+namespace c10 {
+
+template <typename T>
+struct TORCH_API TypeFactoryBase {};
+
+template <>
+struct TORCH_API TypeFactoryBase<c10::DynamicType> {
+  template <typename T, typename... Args>
+  static c10::DynamicTypePtr create(TypePtr ty, Args&&... args) {
+    return std::make_shared<c10::DynamicType>(
+        c10::DynamicTypeTrait<T>::tagValue(),
+        c10::DynamicType::Arguments(c10::ArrayRef<c10::TypePtr>(
+            {std::move(ty), std::forward<Args>(args)...})));
+  }
+  template <typename T>
+  static c10::DynamicTypePtr create(const std::vector<c10::TypePtr>& types) {
+    return std::make_shared<c10::DynamicType>(
+        c10::DynamicTypeTrait<T>::tagValue(),
+        c10::DynamicType::Arguments(types));
+  }
+  static c10::DynamicTypePtr createNamedTuple(
+      const std::string& name,
+      const std::vector& fields,
+      const std::vector& types) {
+    return std::make_shared(
+        c10::DynamicType::Tag::Tuple,
+        name,
+        c10::DynamicType::Arguments(fields, types));
+  }
+  template <typename T>
+  C10_ERASE static c10::DynamicTypePtr createNamed(const std::string& name) {
+    return std::make_shared<c10::DynamicType>(
+        c10::DynamicTypeTrait<T>::tagValue(),
+        name,
+        c10::DynamicType::Arguments{});
+  }
+  template <typename T>
+  C10_ERASE static c10::DynamicTypePtr get() {
+    return DynamicTypeTrait<T>::getBaseType();
+  }
+  static const std::unordered_map<std::string, c10::TypePtr>& basePythonTypes();
+};
+
+using DynamicTypeFactory = TypeFactoryBase<c10::DynamicType>;
+
+// Helper functions for constructing DynamicTypes inline.
+template <
+    typename T,
+    std::enable_if_t<DynamicTypeTrait<T>::isBaseType, int> = 0>
+C10_ERASE DynamicTypePtr dynT() {
+  return DynamicTypeFactory::get<T>();
+}
+
+template <
+    typename T,
+    typename... Args,
+    std::enable_if_t<!DynamicTypeTrait<T>::isBaseType, int> = 0>
+C10_ERASE DynamicTypePtr dynT(Args&&... args) {
+  return DynamicTypeFactory::create<T>(std::forward<Args>(args)...);
+}
+
+template <>
+struct TORCH_API TypeFactoryBase<c10::Type> {
+  template <typename T, typename... Args>
+  static c10::TypePtr create(TypePtr ty, Args&&... args) {
+    return T::create(std::move(ty), std::forward<Args>(args)...);
+  }
+  template <typename T>
+  static c10::TypePtr create(std::vector<c10::TypePtr> types) {
+    return T::create(std::move(types));
+  }
+  static c10::TypePtr createNamedTuple(
+      const std::string& name,
+      const std::vector& fields,
+      const std::vector& types);
+  template <typename T>
+  C10_ERASE static c10::TypePtr createNamed(const std::string& name) {
+    return T::create(name);
+  }
+  static const std::unordered_map<std::string, c10::TypePtr>& basePythonTypes();
+  template <typename T>
+  C10_ERASE static c10::TypePtr get() {
+    return T::get();
+  }
+};
+
+using DefaultTypeFactory = TypeFactoryBase<c10::Type>;
+
+using PlatformType =
+#ifdef C10_MOBILE
+    c10::DynamicType
+#else
+    c10::Type
+#endif
+    ;
+
+using TypeFactory = TypeFactoryBase<PlatformType>;
+
+} // namespace c10
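A hedged sketch of how the factory is meant to be used. Whether TypeFactory resolves to the dynamic or the full Type implementation depends on C10_MOBILE as shown above; the tuple name and field names are illustrative, and availability of c10::IntType via ATen/core/jit_type.h is assumed:

    #include <ATen/core/jit_type.h>
    #include <ATen/core/type_factory.h>

    auto point_type = c10::TypeFactory::createNamedTuple(
        "my.Point",
        {"x", "y"},
        {c10::TypeFactory::get<c10::IntType>(),
         c10::TypeFactory::get<c10::IntType>()});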
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/type_ptr.h b/MLPY/Lib/site-packages/torch/include/ATen/core/type_ptr.h
new file mode 100644
index 0000000000000000000000000000000000000000..7b183d4249201bcb7359bdb0156b9669525280b2
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/type_ptr.h
@@ -0,0 +1,54 @@
+#pragma once
+
+#include 
+#include 
+
+#include 
+#include 
+
+namespace c10 {
+
+// Compatibility wrapper around a raw pointer so that existing code
+// written to deal with a shared_ptr can keep working.
+template <typename T>
+class SingletonTypePtr {
+ public:
+  /* implicit */ SingletonTypePtr(T* p) : repr_(p) {}
+
+  // We need this to satisfy Pybind11, but it shouldn't be hit.
+  explicit SingletonTypePtr(std::shared_ptr<T>) { TORCH_CHECK(false); }
+
+  using element_type = typename std::shared_ptr<T>::element_type;
+
+  template <typename U = T, std::enable_if_t<!std::is_same<std::remove_const_t<U>, void>::value, bool> = true>
+  T& operator*() const {
+    return *repr_;
+  }
+
+  T* get() const {
+    return repr_;
+  }
+
+  T* operator->() const {
+    return repr_;
+  }
+
+  operator bool() const {
+    return repr_ != nullptr;
+  }
+
+ private:
+  T* repr_{nullptr};
+};
+
+template <typename T, typename U>
+bool operator==(SingletonTypePtr<T> lhs, SingletonTypePtr<U> rhs) {
+  return (void*)lhs.get() == (void*)rhs.get();
+}
+
+template <typename T, typename U>
+bool operator!=(SingletonTypePtr<T> lhs, SingletonTypePtr<U> rhs) {
+  return !(lhs == rhs);
+}
+
+} // namespace c10
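A small sketch of the wrapper's intent: code written against shared_ptr-like type pointers keeps working when handed a raw pointer to a singleton. The MyType struct here is illustrative, not a c10 type:

    #include <ATen/core/type_ptr.h>

    struct MyType { int payload() const { return 42; } };
    static MyType my_singleton;

    c10::SingletonTypePtr<MyType> p(&my_singleton);
    int v = p->payload();                 // operator-> forwards to the raw pointer
    bool non_null = static_cast<bool>(p); // true while the pointer is non-null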
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/core/typeid.h b/MLPY/Lib/site-packages/torch/include/ATen/core/typeid.h
new file mode 100644
index 0000000000000000000000000000000000000000..d69eba920abb0059a113405faf0264cd5a9b7bab
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/core/typeid.h
@@ -0,0 +1 @@
+#include 
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpp_custom_type_hack.h b/MLPY/Lib/site-packages/torch/include/ATen/cpp_custom_type_hack.h
new file mode 100644
index 0000000000000000000000000000000000000000..e9e4e3e677d16b3001188f678ef2b985319b8405
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpp_custom_type_hack.h
@@ -0,0 +1,110 @@
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+
+// YOU ARE IN THE WRONG PLACE! TURN BACK NOW!
+
+// This code was a temporary hack to enable embedding arbitrary C++ structures
+// into Tensors. THIS IS UNSAFE AND IS NOT SUPPORTED. IF YOU USE THIS CODE,
+// IT __WILL__ BREAK.
+
+// This code has been superseded by custom classes:
+// https://pytorch.org/tutorials/advanced/torch_script_custom_classes.html
+
+// Please use custom classes and **DO NOT ADD MORE CALLSITES TO THINGS DEFINED
+// IN THIS FILE**.
+
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+
+#include <ATen/TracerMode.h>
+#include <ATen/core/Tensor.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#else
+#include <ATen/ops/empty.h>
+#endif
+
+namespace at::cpp_custom_type_hack {
+
+template <typename T>
+[[deprecated(
+    "Use custom classes instead: "
+    "https://pytorch.org/tutorials/advanced/torch_script_custom_classes.html")]] bool
+isa(const Tensor& packed) {
+  return (packed.scalar_type() == kByte) &&
+      (packed.storage().data_ptr().get_deleter() ==
+       caffe2::TypeMeta::Make<T>().deleteFn());
+}
+
+template <typename T>
+[[deprecated(
+    "Use custom classes instead: "
+    "https://pytorch.org/tutorials/advanced/torch_script_custom_classes.html")]] T&
+cast(const Tensor& packed) {
+  TORCH_CHECK(
+      packed.scalar_type() == kByte, "Expected temporary cpp type wrapper");
+  TORCH_CHECK(
+      packed.storage().data_ptr().get_deleter() ==
+          caffe2::TypeMeta::Make<T>().deleteFn(),
+      "Expected temporary cpp type wrapper of type ",
+      caffe2::TypeMeta::TypeName<T>());
+  return *reinterpret_cast<T*>(packed.storage().data_ptr().get());
+}
+
+template <typename T>
+[[deprecated(
+    "Use custom classes instead: "
+    "https://pytorch.org/tutorials/advanced/torch_script_custom_classes.html")]] Tensor
+create(std::unique_ptr<T> ptr, TensorOptions options) {
+  // None of this should trace, so turn off Tracer dispatching
+  at::AutoDispatchBelowADInplaceOrView guard; // TODO: remove
+  at::tracer::impl::NoTracerDispatchMode tracer_guard;
+
+  // We store this instance away in a Tensor and register a deleter function
+  // so that we do not leak memory. On the other side, we pull out the storage's
+  // data_ptr and get the right typed pointer.
+  void* raw_ptr = ptr.release();
+  at::DataPtr at_ptr(
+      raw_ptr, raw_ptr, caffe2::TypeMeta::Make<T>().deleteFn(), at::kCPU);
+
+  // size doesn't really matter, but we can align it to the actual size
+  // returning variables because one likely want to use this hack from python
+  auto retval = at::empty({sizeof(T)}, options.device(kCPU).dtype(at::kByte));
+  retval.storage().set_data_ptr_noswap(std::move(at_ptr));
+  return retval;
+}
+
+} // namespace at::cpp_custom_type_hack
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/FlushDenormal.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/FlushDenormal.h
new file mode 100644
index 0000000000000000000000000000000000000000..0d7b4b9cc679c93d48f3b1be053a7ff9fb004128
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/FlushDenormal.h
@@ -0,0 +1,14 @@
+/// Flush-To-Zero and Denormals-Are-Zero mode
+///
+/// Flush-To-Zero (FTZ) and Denormals-Are-Zero (DAZ) are modes that bypass
+/// IEEE 754 methods of dealing with denormal floating-point numbers on x86-64
+/// and some x86 CPUs. They result in reduced precision for values near zero,
+/// but increased performance.
+///
+/// See https://software.intel.com/en-us/articles/x87-and-sse-floating-point-assists-in-ia-32-flush-to-zero-ftz-and-denormals-are-zero-daz
+
+namespace at::cpu {
+
+bool set_flush_denormal(bool on);
+
+}  // namespace at::cpu
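A minimal usage sketch of the function declared above; as I understand it, the return value indicates whether the FTZ/DAZ mode could actually be set on this CPU:

    #include <ATen/cpu/FlushDenormal.h>

    bool enabled = at::cpu::set_flush_denormal(true);  // enable FTZ/DAZ if supported
    if (!enabled) {
      // fall back: denormals keep full IEEE 754 behavior on this platform
    }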
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/Utils.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/Utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..ab06ce7d18dab2ed20a64b2105beb451dccbf189
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/Utils.h
@@ -0,0 +1,10 @@
+#pragma once
+
+#include <c10/macros/Export.h>
+
+namespace at::cpu {
+
+// Detect whether the CPU supports Vector Neural Network Instructions (VNNI).
+TORCH_API bool is_cpu_support_vnni();
+
+} // namespace at::cpu
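A minimal usage sketch of the detection helper declared above:

    #include <ATen/cpu/Utils.h>

    if (at::cpu::is_cpu_support_vnni()) {
      // e.g. choose a VNNI-accelerated int8 code path
    }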
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/functional.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/functional.h
new file mode 100644
index 0000000000000000000000000000000000000000..032e9bfa471391b3a38e56dedd04c7a881a241f2
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/functional.h
@@ -0,0 +1,4 @@
+#pragma once
+
+#include <ATen/cpu/vec/functional_base.h>
+#include <ATen/cpu/vec/functional_bfloat16.h>
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/functional_base.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/functional_base.h
new file mode 100644
index 0000000000000000000000000000000000000000..801685e23cfa5eff3a3b22bf7e77af3f033e9311
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/functional_base.h
@@ -0,0 +1,329 @@
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/vec.h>
+#include <c10/util/irange.h>
+
+namespace at::vec {
+
+// slow path
+template <typename scalar_t, typename Op>
+inline scalar_t vec_reduce_all(
+    const Op& vec_fun,
+    vec::Vectorized<scalar_t> acc_vec,
+    int64_t size) {
+  using Vec = vec::Vectorized<scalar_t>;
+  scalar_t acc_arr[Vec::size()];
+  acc_vec.store(acc_arr);
+  for (const auto i : c10::irange(1, size)) {
+    std::array<scalar_t, Vec::size()> acc_arr_next = {0};
+    acc_arr_next[0] = acc_arr[i];
+    Vec acc_vec_next = Vec::loadu(acc_arr_next.data());
+    acc_vec = vec_fun(acc_vec, acc_vec_next);
+  }
+  acc_vec.store(acc_arr);
+  return acc_arr[0];
+}
+
+template <typename scalar_t, typename Op>
+struct VecReduceAllSIMD {
+  static inline scalar_t apply(const Op& vec_fun, const Vectorized<scalar_t>& acc_vec) {
+    return vec_reduce_all(vec_fun, acc_vec, Vectorized<scalar_t>::size());
+  }
+};
+
+#if defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) && !defined(C10_MOBILE)
+#if defined(CPU_CAPABILITY_AVX2)
+template <typename Op>
+struct VecReduceAllSIMD<float, Op> {
+  static inline float apply(const Op& vec_fun, const Vectorized<float>& acc_vec) {
+    using Vec = Vectorized<float>;
+    Vec v = acc_vec;
+    // 128-bit shuffle
+    Vec v1 = _mm256_permute2f128_ps(v, v, 0x1);
+    v = vec_fun(v, v1);
+    // 64-bit shuffle
+    v1 = _mm256_shuffle_ps(v, v, 0x4E);
+    v = vec_fun(v, v1);
+    // 32-bit shuffle
+    v1 = _mm256_shuffle_ps(v, v, 0xB1);
+    v = vec_fun(v, v1);
+    return _mm256_cvtss_f32(v);
+  }
+};
+#endif // defined(CPU_CAPABILITY_AVX2)
+#if defined(CPU_CAPABILITY_AVX512)
+template <typename Op>
+struct VecReduceAllSIMD<float, Op> {
+  static inline float apply(const Op& vec_fun, const Vectorized<float>& acc_vec) {
+    using Vec = Vectorized<float>;
+    Vec v = acc_vec;
+    // 256-bit shuffle
+    Vec v1 = _mm512_shuffle_f32x4(v, v, 0x4E);
+    v = vec_fun(v, v1);
+    // 128-bit shuffle
+    v1 = _mm512_shuffle_f32x4(v, v, 0xB1);
+    v = vec_fun(v, v1);
+    // 64-bit shuffle
+    v1 = _mm512_shuffle_ps(v, v, 0x4E);
+    v = vec_fun(v, v1);
+    // 32-bit shuffle
+    v1 = _mm512_shuffle_ps(v, v, 0xB1);
+    v = vec_fun(v, v1);
+    return _mm512_cvtss_f32(v);
+  }
+};
+#endif // defined(CPU_CAPABILITY_AVX512)
+#endif // defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) && !defined(C10_MOBILE)
+
+template <typename scalar_t, typename Op>
+inline scalar_t vec_reduce_all(const Op& vec_fun, const Vectorized<scalar_t>& acc_vec) {
+  return VecReduceAllSIMD<scalar_t, Op>::apply(vec_fun, acc_vec);
+}
+
+template <typename scalar_t, typename Op, typename std::enable_if_t<!is_reduced_floating_point_v<scalar_t>, int> = 0>
+inline scalar_t reduce_all(const Op& vec_fun, const scalar_t* data, int64_t size) {
+  using Vec = vec::Vectorized<scalar_t>;
+  if (size < Vec::size())
+    return vec_reduce_all(vec_fun, Vec::loadu(data, size), size);
+  int64_t d = Vec::size();
+  Vec acc_vec = Vec::loadu(data);
+  for (; d < size - (size % Vec::size()); d += Vec::size()) {
+    Vec data_vec = Vec::loadu(data + d);
+    acc_vec = vec_fun(acc_vec, data_vec);
+  }
+  if (size - d > 0) {
+    Vec data_vec = Vec::loadu(data + d, size - d);
+    acc_vec = Vec::set(acc_vec, vec_fun(acc_vec, data_vec), size - d);
+  }
+  return vec_reduce_all(vec_fun, acc_vec);
+}
+
+// similar to reduce_all, but reduces into two outputs
+template , int> = 0>
+inline std::pair reduce2_all(const Op1& vec_fun1, const Op2& vec_fun2,
+    const scalar_t* data, int64_t size) {
+  using Vec = vec::Vectorized;
+  if (size < Vec::size()) {
+    auto loaded_data = Vec::loadu(data, size);
+    return std::pair(
+      vec_reduce_all(vec_fun1, loaded_data, size),
+      vec_reduce_all(vec_fun2, loaded_data, size));
+  }
+  int64_t d = Vec::size();
+  Vec acc_vec1 = Vec::loadu(data);
+  Vec acc_vec2 = Vec::loadu(data);
+  for (; d < size - (size % Vec::size()); d += Vec::size()) {
+    Vec data_vec = Vec::loadu(data + d);
+    acc_vec1 = vec_fun1(acc_vec1, data_vec);
+    acc_vec2 = vec_fun2(acc_vec2, data_vec);
+  }
+  if (size - d > 0) {
+    Vec data_vec = Vec::loadu(data + d, size - d);
+    acc_vec1 = Vec::set(acc_vec1, vec_fun1(acc_vec1, data_vec), size - d);
+    acc_vec2 = Vec::set(acc_vec2, vec_fun2(acc_vec2, data_vec), size - d);
+  }
+  return std::pair(
+    vec_reduce_all(vec_fun1, acc_vec1),
+    vec_reduce_all(vec_fun2, acc_vec2));
+}
+
+template , int> = 0>
+inline scalar_t map_reduce_all(
+    const MapOp& map_fun,
+    const ReduceOp& red_fun,
+    const scalar_t* data,
+    int64_t size) {
+  using Vec = vec::Vectorized;
+  if (size < Vec::size())
+    return vec_reduce_all(red_fun, map_fun(Vec::loadu(data, size)), size);
+  int64_t d = Vec::size();
+  Vec acc_vec = map_fun(Vec::loadu(data));
+  for (; d < size - (size % Vec::size()); d += Vec::size()) {
+    Vec data_vec = Vec::loadu(data + d);
+    data_vec = map_fun(data_vec);
+    acc_vec = red_fun(acc_vec, data_vec);
+  }
+  if (size - d > 0) {
+    Vec data_vec = Vec::loadu(data + d, size - d);
+    data_vec = map_fun(data_vec);
+    acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d);
+  }
+  return vec_reduce_all(red_fun, acc_vec);
+}
+
+template , int> = 0>
+inline scalar_t map2_reduce_all(
+    const MapOp& map_fun,
+    const ReduceOp& red_fun,
+    const scalar_t* data,
+    const scalar_t* data2,
+    int64_t size) {
+  using Vec = vec::Vectorized;
+  if (size < Vec::size()) {
+    Vec data_vec = Vec::loadu(data, size);
+    Vec data2_vec = Vec::loadu(data2, size);
+    data_vec = map_fun(data_vec, data2_vec);
+    return vec_reduce_all(red_fun, data_vec, size);
+  }
+  int64_t d = Vec::size();
+  Vec acc_vec = map_fun(Vec::loadu(data), Vec::loadu(data2));
+  for (; d < size - (size % Vec::size()); d += Vec::size()) {
+    Vec data_vec = Vec::loadu(data + d);
+    Vec data2_vec = Vec::loadu(data2 + d);
+    data_vec = map_fun(data_vec, data2_vec);
+    acc_vec = red_fun(acc_vec, data_vec);
+  }
+  if (size - d > 0) {
+    Vec data_vec = Vec::loadu(data + d, size - d);
+    Vec data2_vec = Vec::loadu(data2 + d, size - d);
+    data_vec = map_fun(data_vec, data2_vec);
+    acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d);
+  }
+  return vec_reduce_all(red_fun, acc_vec);
+}
+
+template , int> = 0>
+inline scalar_t map3_reduce_all(
+    const MapOp& map_fun,
+    const ReduceOp& red_fun,
+    const scalar_t* data,
+    const scalar_t* data2,
+    const scalar_t* data3,
+    int64_t size) {
+  using Vec = vec::Vectorized;
+  if (size < Vec::size()) {
+    Vec data_vec = Vec::loadu(data, size);
+    Vec data2_vec = Vec::loadu(data2, size);
+    Vec data3_vec = Vec::loadu(data3, size);
+    data_vec = map_fun(data_vec, data2_vec, data3_vec);
+    return vec_reduce_all(red_fun, data_vec, size);
+  }
+
+  int64_t d = Vec::size();
+  Vec acc_vec = map_fun(Vec::loadu(data), Vec::loadu(data2), Vec::loadu(data3));
+  for (; d < size - (size % Vec::size()); d += Vec::size()) {
+    Vec data_vec = Vec::loadu(data + d);
+    Vec data2_vec = Vec::loadu(data2 + d);
+    Vec data3_vec = Vec::loadu(data3 + d);
+    data_vec = map_fun(data_vec, data2_vec, data3_vec);
+    acc_vec = red_fun(acc_vec, data_vec);
+  }
+  if (size - d > 0) {
+    Vec data_vec = Vec::loadu(data + d, size - d);
+    Vec data2_vec = Vec::loadu(data2 + d, size - d);
+    Vec data3_vec = Vec::loadu(data3 + d, size - d);
+    data_vec = map_fun(data_vec, data2_vec, data3_vec);
+    acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d);
+  }
+  return vec_reduce_all(red_fun, acc_vec);
+}
+
+template <typename scalar_t, typename Op, typename std::enable_if_t<!is_reduced_floating_point_v<scalar_t>, int> = 0>
+inline void map(
+    const Op& vec_fun,
+    scalar_t* output_data,
+    const scalar_t* input_data,
+    int64_t size) {
+  using Vec = vec::Vectorized<scalar_t>;
+  int64_t d = 0;
+  for (; d < size - (size % Vec::size()); d += Vec::size()) {
+    Vec output_vec = vec_fun(Vec::loadu(input_data + d));
+    output_vec.store(output_data + d);
+  }
+  if (size - d > 0) {
+    Vec output_vec = vec_fun(Vec::loadu(input_data + d, size - d));
+    output_vec.store(output_data + d, size - d);
+  }
+}
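+
+// Illustrative usage sketch (editor's note, not part of the upstream header):
+// the map()/reduce() helpers in this file are driven with a lambda over
+// Vectorized values; buffer names x, y and the length N below are hypothetical.
+//
+//   std::vector<float> x(N), y(N);
+//   // y[i] = x[i] * x[i]; the scalar tail is handled by map() itself
+//   at::vec::map([](at::vec::Vectorized<float> v) { return v * v; },
+//                y.data(), x.data(), N);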
+
+template <typename scalar_t, typename Op, typename std::enable_if_t<!is_reduced_floating_point_v<scalar_t>, int> = 0>
+inline void map2(
+    const Op& vec_fun,
+    scalar_t* output_data,
+    const scalar_t* input_data,
+    const scalar_t* input_data2,
+    int64_t size) {
+  using Vec = vec::Vectorized<scalar_t>;
+  int64_t d = 0;
+  for (; d < size - (size % Vec::size()); d += Vec::size()) {
+    Vec data_vec = Vec::loadu(input_data + d);
+    Vec data_vec2 = Vec::loadu(input_data2 + d);
+    Vec output_vec = vec_fun(data_vec, data_vec2);
+    output_vec.store(output_data + d);
+  }
+  if (size - d > 0) {
+    Vec data_vec = Vec::loadu(input_data + d, size - d);
+    Vec data_vec2 = Vec::loadu(input_data2 + d, size - d);
+    Vec output_vec = vec_fun(data_vec, data_vec2);
+    output_vec.store(output_data + d, size - d);
+  }
+}
+
+template <typename scalar_t, typename Op, typename std::enable_if_t<!is_reduced_floating_point_v<scalar_t>, int> = 0>
+inline void map3(
+    const Op& vec_fun,
+    scalar_t* output_data,
+    const scalar_t* input_data1,
+    const scalar_t* input_data2,
+    const scalar_t* input_data3,
+    int64_t size) {
+  using Vec = vec::Vectorized<scalar_t>;
+  int64_t d = 0;
+  for (; d < size - (size % Vec::size()); d += Vec::size()) {
+    Vec data_vec1 = Vec::loadu(input_data1 + d);
+    Vec data_vec2 = Vec::loadu(input_data2 + d);
+    Vec data_vec3 = Vec::loadu(input_data3 + d);
+    Vec output_vec = vec_fun(data_vec1, data_vec2, data_vec3);
+    output_vec.store(output_data + d);
+  }
+  if (size - d > 0) {
+    Vec data_vec1 = Vec::loadu(input_data1 + d, size - d);
+    Vec data_vec2 = Vec::loadu(input_data2 + d, size - d);
+    Vec data_vec3 = Vec::loadu(input_data3 + d, size - d);
+    Vec output_vec = vec_fun(data_vec1, data_vec2, data_vec3);
+    output_vec.store(output_data + d, size - d);
+  }
+}
+
+template <typename scalar_t, typename Op, typename std::enable_if_t<!is_reduced_floating_point_v<scalar_t>, int> = 0>
+inline void map4(
+    const Op& vec_fun,
+    scalar_t* output_data,
+    const scalar_t* input_data1,
+    const scalar_t* input_data2,
+    const scalar_t* input_data3,
+    const scalar_t* input_data4,
+    int64_t size) {
+  using Vec = vec::Vectorized<scalar_t>;
+  int64_t d = 0;
+  for (; d < size - (size % Vec::size()); d += Vec::size()) {
+    Vec data_vec1 = Vec::loadu(input_data1 + d);
+    Vec data_vec2 = Vec::loadu(input_data2 + d);
+    Vec data_vec3 = Vec::loadu(input_data3 + d);
+    Vec data_vec4 = Vec::loadu(input_data4 + d);
+    Vec output_vec = vec_fun(data_vec1, data_vec2, data_vec3, data_vec4);
+    output_vec.store(output_data + d);
+  }
+  if (size - d > 0) {
+    Vec data_vec1 = Vec::loadu(input_data1 + d, size - d);
+    Vec data_vec2 = Vec::loadu(input_data2 + d, size - d);
+    Vec data_vec3 = Vec::loadu(input_data3 + d, size - d);
+    Vec data_vec4 = Vec::loadu(input_data4 + d, size - d);
+    Vec output_vec = vec_fun(data_vec1, data_vec2, data_vec3, data_vec4);
+    output_vec.store(output_data + d, size - d);
+  }
+}
+
+} // namespace at::vec
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/functional_bfloat16.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/functional_bfloat16.h
new file mode 100644
index 0000000000000000000000000000000000000000..e11401955706cc082d1b8a505c1582d156975957
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/functional_bfloat16.h
@@ -0,0 +1,549 @@
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/vec.h>
+
+namespace at::vec {
+
+// BFloat16 specification
+template <typename scalar_t> struct VecScalarType { using type = scalar_t; };
+template <> struct VecScalarType<BFloat16> { using type = float; };
+template <> struct VecScalarType<Half> { using type = float; };
+
+// This is different from at::acc_type since we only need to specialize BFloat16
+template <typename scalar_t>
+using vec_scalar_t = typename VecScalarType<scalar_t>::type;
+
+// Vector conversion between float and bfloat16/half
+template <typename scalar_t, typename std::enable_if_t<is_reduced_floating_point_v<scalar_t>, int> = 0>
+inline std::tuple<Vectorized<float>, Vectorized<float>> convert_to_float(const Vectorized<scalar_t>&);
+
+template <>
+inline std::tuple<Vectorized<float>, Vectorized<float>> convert_to_float<BFloat16> (const Vectorized<BFloat16>& a) {
+  return convert_bfloat16_float(a);
+}
+
+template <>
+inline std::tuple<Vectorized<float>, Vectorized<float>> convert_to_float<Half> (const Vectorized<Half>& a) {
+    return convert_half_float(a);
+}
+
+template <typename scalar_t, typename std::enable_if_t<is_reduced_floating_point_v<scalar_t>, int> = 0>
+inline Vectorized<scalar_t> convert_from_float(const Vectorized<float>&, const Vectorized<float>&);
+
+template <>
+inline Vectorized<BFloat16> convert_from_float<BFloat16>(const Vectorized<float>& a, const Vectorized<float>& b) {
+  return convert_float_bfloat16(a, b);
+}
+
+template <>
+inline Vectorized<Half> convert_from_float<Half>(const Vectorized<float>& a, const Vectorized<float>& b) {
+  return convert_float_half(a, b);
+}
+
+template <typename scalar_t, typename std::enable_if_t<is_reduced_floating_point_v<scalar_t>, int> = 0>
+inline void load_to_float(const scalar_t *data, Vectorized<float> &out1, Vectorized<float> &out2);
+
+template <>
+inline void load_to_float<BFloat16> (const BFloat16 *data, Vectorized<float> &out1, Vectorized<float> &out2) {
+  load_fp32_from_bf16(data, out1, out2);
+}
+
+template <>
+inline void load_to_float<Half> (const Half *data, Vectorized<float> &out1, Vectorized<float> &out2) {
+  load_fp32_from_fp16(data, out1, out2);
+}
+
+template <typename scalar_t, typename std::enable_if_t<is_reduced_floating_point_v<scalar_t>, int> = 0>
+inline void load_to_float(const scalar_t *data, Vectorized<float> &out);
+
+template <>
+inline void load_to_float<BFloat16> (const BFloat16 *data, Vectorized<float> &out) {
+  load_fp32_from_bf16(data, out);
+}
+
+template <>
+inline void load_to_float<Half> (const Half *data, Vectorized<float> &out) {
+  load_fp32_from_fp16(data, out);
+}
+
+// Note that we already have specialized member of Vectorized<scalar_t> for BFloat16
+// so the following functions would run smoothly:
+//   using Vec = Vectorized<BFloat16>;
+//   Vec one = Vec(BFloat16(1));
+//   vec::map([](Vec x) { return one / (one + x.exp()); }, y_ptr, x_ptr, N);
+//
+// Then why we still need to specialize "functional"?
+//   If we do specialization at Vectorized<> level, the above example would need 3 pairs of
+//   conversion of bf16->fp32/fp32->bf16, each for ".exp()", "+" and "/".
+//   If we do specialization at vec::map<>() level, we have only 1 pair of conversion
+//   of bf16->fp32/fp32->bf16, for the input and output BFloat16 vector only.
+//
+// The following BFloat16 functionality will only do data type conversion for input
+// and output vector (reduce functionality will only convert the final scalar back to bf16).
+// Compared to Vectorized<> specialization,
+//   1. better performance since we have less data type conversion;
+//   2. less rounding error since immediate results are kept in fp32;
+//   3. accumulation done on data type of fp32.
+//
+//  If you plan to extend this file, please ensure adding unit tests at
+//    aten/src/ATen/test/vec_test_all_types.cpp
+//
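+// Illustrative usage sketch (editor's note, not part of the upstream header):
+// reducing a BFloat16 buffer while accumulating in fp32, as described above.
+// The buffer name x and length N are hypothetical.
+//
+//   std::vector<c10::BFloat16> x(N);
+//   float max_val = at::vec::reduce_all(
+//       [](auto& a, auto& b) { return at::vec::maximum(a, b); }, x.data(), N);
+//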
+template <typename scalar_t, typename Op, typename std::enable_if_t<is_reduced_floating_point_v<scalar_t>, int> = 0>
+inline float reduce_all(const Op& vec_fun, const scalar_t* data, int64_t size) {
+  using bVec = vec::Vectorized<scalar_t>;
+  using fVec = vec::Vectorized<float>;
+  if (size < bVec::size()) {
+    bVec data_bvec = bVec::loadu(data, size);
+    auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec);
+    if (size > fVec::size()) {
+      data_fvec0 = fVec::set(data_fvec0, vec_fun(data_fvec0, data_fvec1), size - fVec::size());
+      return vec_reduce_all(vec_fun, data_fvec0, fVec::size());
+    } else {
+      return vec_reduce_all(vec_fun, data_fvec0, size);
+    }
+  }
+  int64_t d = bVec::size();
+  bVec acc_bvec = bVec::loadu(data);
+  auto [acc_fvec0, acc_fvec1] = convert_to_float(acc_bvec);
+  for (; d < size - (size % bVec::size()); d += bVec::size()) {
+    bVec data_bvec = bVec::loadu(data + d);
+    auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec);
+    acc_fvec0 = vec_fun(acc_fvec0, data_fvec0);
+    acc_fvec1 = vec_fun(acc_fvec1, data_fvec1);
+  }
+  if (size - d > 0) {
+    bVec data_bvec = bVec::loadu(data + d, size - d);
+    auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec);
+    if (size - d > fVec::size()) {
+      acc_fvec0 = vec_fun(acc_fvec0, data_fvec0);
+      acc_fvec1 = fVec::set(acc_fvec1, vec_fun(acc_fvec1, data_fvec1), size - d - fVec::size());
+    } else {
+      acc_fvec0 = fVec::set(acc_fvec0, vec_fun(acc_fvec0, data_fvec0), size - d);
+    }
+  }
+  acc_fvec0 = vec_fun(acc_fvec0, acc_fvec1);
+  return vec_reduce_all(vec_fun, acc_fvec0);
+}
+
+template <typename scalar_t, typename Op1, typename Op2, typename std::enable_if_t<is_reduced_floating_point_v<scalar_t>, int> = 0>
+inline std::pair<float, float> reduce2_all(const Op1& vec_fun1, const Op2& vec_fun2,
+    const scalar_t* data, int64_t size) {
+  using bVec = vec::Vectorized<scalar_t>;
+  using fVec = vec::Vectorized<float>;
+  if (size < bVec::size()) {
+    bVec data_bvec = bVec::loadu(data, size);
+    auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec);
+    if (size > fVec::size()) {
+      fVec acc1_fvec = fVec::set(data_fvec0, vec_fun1(data_fvec0, data_fvec1), size - fVec::size());
+      fVec acc2_fvec = fVec::set(data_fvec0, vec_fun2(data_fvec0, data_fvec1), size - fVec::size());
+      return std::pair<float, float>(
+          vec_reduce_all(vec_fun1, acc1_fvec, fVec::size()),
+          vec_reduce_all(vec_fun2, acc2_fvec, fVec::size()));
+    } else {
+      return std::pair<float, float>(
+          vec_reduce_all(vec_fun1, data_fvec0, size),
+          vec_reduce_all(vec_fun2, data_fvec0, size));
+    }
+  }
+  int64_t d = bVec::size();
+  bVec acc_bvec = bVec::loadu(data);
+  auto [acc1_fvec0, acc1_fvec1] = convert_to_float(acc_bvec);
+  auto [acc2_fvec0, acc2_fvec1] = convert_to_float(acc_bvec);
+  for (; d < size - (size % bVec::size()); d += bVec::size()) {
+    bVec data_bvec = bVec::loadu(data + d);
+    auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec);
+    acc1_fvec0 = vec_fun1(acc1_fvec0, data_fvec0);
+    acc1_fvec1 = vec_fun1(acc1_fvec1, data_fvec1);
+    acc2_fvec0 = vec_fun2(acc2_fvec0, data_fvec0);
+    acc2_fvec1 = vec_fun2(acc2_fvec1, data_fvec1);
+  }
+  if (size - d > 0) {
+    bVec data_bvec = bVec::loadu(data + d, size - d);
+    auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec);
+    if (size - d > fVec::size()) {
+      acc1_fvec0 = vec_fun1(acc1_fvec0, data_fvec0);
+      acc1_fvec1 = fVec::set(acc1_fvec1, vec_fun1(acc1_fvec1, data_fvec1), size - d - fVec::size());
+      acc2_fvec0 = vec_fun2(acc2_fvec0, data_fvec0);
+      acc2_fvec1 = fVec::set(acc2_fvec1, vec_fun2(acc2_fvec1, data_fvec1), size - d - fVec::size());
+    } else {
+      acc1_fvec0 = fVec::set(acc1_fvec0, vec_fun1(acc1_fvec0, data_fvec0), size - d);
+      acc2_fvec0 = fVec::set(acc2_fvec0, vec_fun2(acc2_fvec0, data_fvec0), size - d);
+    }
+  }
+  acc1_fvec0 = vec_fun1(acc1_fvec0, acc1_fvec1);
+  acc2_fvec0 = vec_fun2(acc2_fvec0, acc2_fvec1);
+  return std::pair<float, float>(
+      vec_reduce_all(vec_fun1, acc1_fvec0),
+      vec_reduce_all(vec_fun2, acc2_fvec0));
+}
+
+template <typename scalar_t, typename MapOp, typename ReduceOp, typename std::enable_if_t<is_reduced_floating_point_v<scalar_t>, int> = 0>
+inline float map_reduce_all(
+    const MapOp& map_fun,
+    const ReduceOp& red_fun,
+    const scalar_t* data,
+    int64_t size) {
+  using bVec = vec::Vectorized<scalar_t>;
+  using fVec = vec::Vectorized<float>;
+  if (size < bVec::size()) {
+    bVec data_bvec = bVec::loadu(data, size);
+    auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec);
+    if (size > fVec::size()) {
+      data_fvec0 = map_fun(data_fvec0);
+      data_fvec1 = map_fun(data_fvec1);
+      data_fvec0 = fVec::set(data_fvec0, red_fun(data_fvec0, data_fvec1), size - fVec::size());
+      return vec_reduce_all(red_fun, data_fvec0, fVec::size());
+    } else {
+      data_fvec0 = map_fun(data_fvec0);
+      return vec_reduce_all(red_fun, data_fvec0, size);
+    }
+  }
+  int64_t d = bVec::size();
+  bVec acc_bvec = bVec::loadu(data);
+  auto [acc_fvec0, acc_fvec1] = convert_to_float(acc_bvec);
+  acc_fvec0 = map_fun(acc_fvec0);
+  acc_fvec1 = map_fun(acc_fvec1);
+  for (; d < size - (size % bVec::size()); d += bVec::size()) {
+    bVec data_bvec = bVec::loadu(data + d);
+    auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec);
+    data_fvec0 = map_fun(data_fvec0);
+    data_fvec1 = map_fun(data_fvec1);
+    acc_fvec0 = red_fun(acc_fvec0, data_fvec0);
+    acc_fvec1 = red_fun(acc_fvec1, data_fvec1);
+  }
+  if (size - d > 0) {
+    bVec data_bvec = bVec::loadu(data + d, size - d);
+    auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec);
+    if (size - d > fVec::size()) {
+      data_fvec0 = map_fun(data_fvec0);
+      data_fvec1 = map_fun(data_fvec1);
+      acc_fvec0 = red_fun(acc_fvec0, data_fvec0);
+      acc_fvec1 = fVec::set(acc_fvec1, red_fun(acc_fvec1, data_fvec1), size - d - fVec::size());
+    } else {
+      data_fvec0 = map_fun(data_fvec0);
+      acc_fvec0 = fVec::set(acc_fvec0, red_fun(acc_fvec0, data_fvec0), size - d);
+    }
+  }
+  acc_fvec0 = red_fun(acc_fvec0, acc_fvec1);
+  return vec_reduce_all(red_fun, acc_fvec0);
+}
+
+template <typename scalar_t, typename MapOp, typename ReduceOp, typename std::enable_if_t<is_reduced_floating_point_v<scalar_t>, int> = 0>
+inline float map2_reduce_all(
+    const MapOp& map_fun,
+    const ReduceOp& red_fun,
+    const scalar_t* data,
+    const scalar_t* data2,
+    int64_t size) {
+  using bVec = vec::Vectorized<scalar_t>;
+  using fVec = vec::Vectorized<float>;
+  if (size < bVec::size()) {
+    bVec data_bvec = bVec::loadu(data, size);
+    auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec);
+    bVec data2_bvec = bVec::loadu(data2, size);
+    auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec);
+    if (size > fVec::size()) {
+      data_fvec0 = map_fun(data_fvec0, data2_fvec0);
+      data_fvec1 = map_fun(data_fvec1, data2_fvec1);
+      data_fvec0 = fVec::set(data_fvec0, red_fun(data_fvec0, data_fvec1), size - fVec::size());
+      return vec_reduce_all(red_fun, data_fvec0, fVec::size());
+    } else {
+      data_fvec0 = map_fun(data_fvec0, data2_fvec0);
+      return vec_reduce_all(red_fun, data_fvec0, size);
+    }
+  }
+  int64_t d = bVec::size();
+  bVec acc_bvec = bVec::loadu(data);
+  auto [acc_fvec0, acc_fvec1] = convert_to_float(acc_bvec);
+  bVec acc2_bvec = bVec::loadu(data2);
+  auto [acc2_fvec0, acc2_fvec1] = convert_to_float(acc2_bvec);
+  acc_fvec0 = map_fun(acc_fvec0, acc2_fvec0);
+  acc_fvec1 = map_fun(acc_fvec1, acc2_fvec1);
+  for (; d < size - (size % bVec::size()); d += bVec::size()) {
+    bVec data_bvec = bVec::loadu(data + d);
+    auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec);
+    bVec data2_bvec = bVec::loadu(data2 + d);
+    auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec);
+    data_fvec0 = map_fun(data_fvec0, data2_fvec0);
+    data_fvec1 = map_fun(data_fvec1, data2_fvec1);
+    acc_fvec0 = red_fun(acc_fvec0, data_fvec0);
+    acc_fvec1 = red_fun(acc_fvec1, data_fvec1);
+  }
+  if (size - d > 0) {
+    bVec data_bvec = bVec::loadu(data + d, size - d);
+    auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec);
+    bVec data2_bvec = bVec::loadu(data2 + d, size - d);
+    auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec);
+    if (size - d > fVec::size()) {
+      data_fvec0 = map_fun(data_fvec0, data2_fvec0);
+      data_fvec1 = map_fun(data_fvec1, data2_fvec1);
+      acc_fvec0 = red_fun(acc_fvec0, data_fvec0);
+      acc_fvec1 = fVec::set(acc_fvec1, red_fun(acc_fvec1, data_fvec1), size - d - fVec::size());
+    } else {
+      data_fvec0 = map_fun(data_fvec0, data2_fvec0);
+      acc_fvec0 = fVec::set(acc_fvec0, red_fun(acc_fvec0, data_fvec0), size - d);
+    }
+  }
+  acc_fvec0 = red_fun(acc_fvec0, acc_fvec1);
+  return vec_reduce_all(red_fun, acc_fvec0);
+}
+
+template <typename scalar_t, typename MapOp, typename ReduceOp, typename std::enable_if_t<is_reduced_floating_point_v<scalar_t>, int> = 0>
+inline float map3_reduce_all(
+    const MapOp& map_fun,
+    const ReduceOp& red_fun,
+    const scalar_t* data,
+    const scalar_t* data2,
+    const scalar_t* data3,
+    int64_t size) {
+  using bVec = vec::Vectorized<scalar_t>;
+  using fVec = vec::Vectorized<float>;
+  if (size < bVec::size()) {
+    bVec data_bvec = bVec::loadu(data, size);
+    auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec);
+    bVec data2_bvec = bVec::loadu(data2, size);
+    auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec);
+    bVec data3_bvec = bVec::loadu(data3, size);
+    auto [data3_fvec0, data3_fvec1] = convert_to_float(data3_bvec);
+    if (size > fVec::size()) {
+      data_fvec0 = map_fun(data_fvec0, data2_fvec0, data3_fvec0);
+      data_fvec1 = map_fun(data_fvec1, data2_fvec1, data3_fvec1);
+      data_fvec0 = fVec::set(data_fvec0, red_fun(data_fvec0, data_fvec1), size - fVec::size());
+      return vec_reduce_all(red_fun, data_fvec0, fVec::size());
+    } else {
+      data_fvec0 = map_fun(data_fvec0, data2_fvec0, data3_fvec0);
+      return vec_reduce_all(red_fun, data_fvec0, size);
+    }
+  }
+  int64_t d = bVec::size();
+  bVec acc_bvec = bVec::loadu(data);
+  auto [acc_fvec0, acc_fvec1] = convert_to_float(acc_bvec);
+  bVec acc2_bvec = bVec::loadu(data2);
+  auto [acc2_fvec0, acc2_fvec1] = convert_to_float(acc2_bvec);
+  bVec acc3_bvec = bVec::loadu(data3);
+  auto [acc3_fvec0, acc3_fvec1] = convert_to_float(acc3_bvec);
+  acc_fvec0 = map_fun(acc_fvec0, acc2_fvec0, acc3_fvec0);
+  acc_fvec1 = map_fun(acc_fvec1, acc2_fvec1, acc3_fvec1);
+  for (; d < size - (size % bVec::size()); d += bVec::size()) {
+    bVec data_bvec = bVec::loadu(data + d);
+    auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec);
+    bVec data2_bvec = bVec::loadu(data2 + d);
+    auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec);
+    bVec data3_bvec = bVec::loadu(data3 + d);
+    auto [data3_fvec0, data3_fvec1] = convert_to_float(data3_bvec);
+    data_fvec0 = map_fun(data_fvec0, data2_fvec0, data3_fvec0);
+    data_fvec1 = map_fun(data_fvec1, data2_fvec1, data3_fvec1);
+    acc_fvec0 = red_fun(acc_fvec0, data_fvec0);
+    acc_fvec1 = red_fun(acc_fvec1, data_fvec1);
+  }
+  if (size - d > 0) {
+    bVec data_bvec = bVec::loadu(data + d, size - d);
+    auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec);
+    bVec data2_bvec = bVec::loadu(data2 + d, size - d);
+    auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec);
+    bVec data3_bvec = bVec::loadu(data3 + d, size - d);
+    auto [data3_fvec0, data3_fvec1] = convert_to_float(data3_bvec);
+    if (size - d > fVec::size()) {
+      data_fvec0 = map_fun(data_fvec0, data2_fvec0, data3_fvec0);
+      data_fvec1 = map_fun(data_fvec1, data2_fvec1, data3_fvec1);
+      acc_fvec0 = red_fun(acc_fvec0, data_fvec0);
+      acc_fvec1 = fVec::set(acc_fvec1, red_fun(acc_fvec1, data_fvec1), size - d - fVec::size());
+    } else {
+      data_fvec0 = map_fun(data_fvec0, data2_fvec0, data3_fvec0);
+      acc_fvec0 = fVec::set(acc_fvec0, red_fun(acc_fvec0, data_fvec0), size - d);
+    }
+  }
+  acc_fvec0 = red_fun(acc_fvec0, acc_fvec1);
+  return vec_reduce_all(red_fun, acc_fvec0);
+}
+
+template <typename scalar_t, typename Op, typename std::enable_if_t<is_reduced_floating_point_v<scalar_t>, int> = 0>
+inline void map(
+    const Op& vec_fun,
+    scalar_t* output_data,
+    const scalar_t* input_data,
+    int64_t size) {
+  using bVec = vec::Vectorized<scalar_t>;
+  using fVec = vec::Vectorized<float>;
+  int64_t d = 0;
+  for (; d < size - (size % bVec::size()); d += bVec::size()) {
+    bVec data_bvec = bVec::loadu(input_data + d);
+    auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec);
+    fVec output_fvec0 = vec_fun(data_fvec0);
+    fVec output_fvec1 = vec_fun(data_fvec1);
+    bVec output_bvec = convert_from_float(output_fvec0, output_fvec1);
+    output_bvec.store(output_data + d);
+  }
+  if (size - d > 0) {
+    bVec data_bvec = bVec::loadu(input_data + d, size - d);
+    auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec);
+    fVec output_fvec0 = vec_fun(data_fvec0);
+    fVec output_fvec1 = vec_fun(data_fvec1);
+    bVec output_bvec = convert_from_float(output_fvec0, output_fvec1);
+    output_bvec.store(output_data + d, size - d);
+  }
+}
+
+template <typename scalar_t, typename Op, typename std::enable_if_t<is_reduced_floating_point_v<scalar_t>, int> = 0>
+inline void map(
+    const Op& vec_fun,
+    scalar_t* output_data,
+    const float* input_data,
+    int64_t size) {
+  using bVec = vec::Vectorized<scalar_t>;
+  using fVec = vec::Vectorized<float>;
+  int64_t d = 0;
+  for (; d < size - (size % bVec::size()); d += bVec::size()) {
+    fVec data_fvec0 = fVec::loadu(input_data + d);
+    fVec data_fvec1 = fVec::loadu(input_data + d + fVec::size());
+    fVec output_fvec0 = vec_fun(data_fvec0);
+    fVec output_fvec1 = vec_fun(data_fvec1);
+    bVec output_bvec = convert_from_float(output_fvec0, output_fvec1);
+    output_bvec.store(output_data + d);
+  }
+  if (size - d > 0) {
+    fVec data_fvec0, data_fvec1;
+    if (size - d > fVec::size()) {
+      data_fvec0 = fVec::loadu(input_data + d);
+      data_fvec1 = fVec::loadu(input_data + d + fVec::size(), size - d - fVec::size());
+    } else {
+      // choose to align with behaviour of bVec::loadu(ptr, size),
+      // which leaves data_fvec1 uninitialized
+      data_fvec0 = fVec::loadu(input_data + d, size - d);
+    }
+    fVec output_fvec0 = vec_fun(data_fvec0);
+    fVec output_fvec1 = vec_fun(data_fvec1);
+    bVec output_bvec = convert_from_float(output_fvec0, output_fvec1);
+    output_bvec.store(output_data + d, size - d);
+  }
+}
+
+template <typename scalar_t, typename Op, typename std::enable_if_t<is_reduced_floating_point_v<scalar_t>, int> = 0>
+inline void map2(
+    const Op& vec_fun,
+    scalar_t* output_data,
+    const scalar_t* input_data,
+    const scalar_t* input_data2,
+    int64_t size) {
+  using bVec = vec::Vectorized<scalar_t>;
+  using fVec = vec::Vectorized<float>;
+  int64_t d = 0;
+  for (; d < size - (size % bVec::size()); d += bVec::size()) {
+    bVec data_bvec = bVec::loadu(input_data + d);
+    auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec);
+    bVec data2_bvec = bVec::loadu(input_data2 + d);
+    auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec);
+    fVec output_fvec0 = vec_fun(data_fvec0, data2_fvec0);
+    fVec output_fvec1 = vec_fun(data_fvec1, data2_fvec1);
+    bVec output_bvec = convert_from_float(output_fvec0, output_fvec1);
+    output_bvec.store(output_data + d);
+  }
+  if (size - d > 0) {
+    bVec data_bvec = bVec::loadu(input_data + d, size - d);
+    auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec);
+    bVec data2_bvec = bVec::loadu(input_data2 + d, size - d);
+    auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec);
+    fVec output_fvec0 = vec_fun(data_fvec0, data2_fvec0);
+    fVec output_fvec1 = vec_fun(data_fvec1, data2_fvec1);
+    bVec output_bvec = convert_from_float(output_fvec0, output_fvec1);
+    output_bvec.store(output_data + d, size - d);
+  }
+}
+
+template <typename scalar_t, typename Op, typename std::enable_if_t<is_reduced_floating_point_v<scalar_t>, int> = 0>
+inline void map3(
+    const Op& vec_fun,
+    scalar_t* output_data,
+    const scalar_t* input_data1,
+    const scalar_t* input_data2,
+    const scalar_t* input_data3,
+    int64_t size) {
+  using bVec = vec::Vectorized<scalar_t>;
+  using fVec = vec::Vectorized<float>;
+  int64_t d = 0;
+  for (; d < size - (size % bVec::size()); d += bVec::size()) {
+    bVec data1_bvec = bVec::loadu(input_data1 + d);
+    auto [data1_fvec0, data1_fvec1] = convert_to_float(data1_bvec);
+    bVec data2_bvec = bVec::loadu(input_data2 + d);
+    auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec);
+    bVec data3_bvec = bVec::loadu(input_data3 + d);
+    auto [data3_fvec0, data3_fvec1] = convert_to_float(data3_bvec);
+    fVec output_fvec0 = vec_fun(data1_fvec0, data2_fvec0, data3_fvec0);
+    fVec output_fvec1 = vec_fun(data1_fvec1, data2_fvec1, data3_fvec1);
+    bVec output_bvec = convert_from_float(output_fvec0, output_fvec1);
+    output_bvec.store(output_data + d);
+  }
+  if (size - d > 0) {
+    bVec data1_bvec = bVec::loadu(input_data1 + d, size - d);
+    auto [data1_fvec0, data1_fvec1] = convert_to_float(data1_bvec);
+    bVec data2_bvec = bVec::loadu(input_data2 + d, size - d);
+    auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec);
+    bVec data3_bvec = bVec::loadu(input_data3 + d, size - d);
+    auto [data3_fvec0, data3_fvec1] = convert_to_float(data3_bvec);
+    fVec output_fvec0 = vec_fun(data1_fvec0, data2_fvec0, data3_fvec0);
+    fVec output_fvec1 = vec_fun(data1_fvec1, data2_fvec1, data3_fvec1);
+    bVec output_bvec = convert_from_float(output_fvec0, output_fvec1);
+    output_bvec.store(output_data + d, size - d);
+  }
+}
+
+template <typename scalar_t, typename Op, typename std::enable_if_t<is_reduced_floating_point_v<scalar_t>, int> = 0>
+inline void map4(
+    const Op& vec_fun,
+    scalar_t* output_data,
+    const scalar_t* input_data1,
+    const scalar_t* input_data2,
+    const scalar_t* input_data3,
+    const scalar_t* input_data4,
+    int64_t size) {
+  using bVec = vec::Vectorized<scalar_t>;
+  using fVec = vec::Vectorized<float>;
+  int64_t d = 0;
+  for (; d < size - (size % bVec::size()); d += bVec::size()) {
+    bVec data1_bvec = bVec::loadu(input_data1 + d);
+    auto [data1_fvec0, data1_fvec1] = convert_to_float(data1_bvec);
+    bVec data2_bvec = bVec::loadu(input_data2 + d);
+    auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec);
+    bVec data3_bvec = bVec::loadu(input_data3 + d);
+    auto [data3_fvec0, data3_fvec1] = convert_to_float(data3_bvec);
+    bVec data4_bvec = bVec::loadu(input_data4 + d);
+    auto [data4_fvec0, data4_fvec1] = convert_to_float(data4_bvec);
+    fVec output_fvec0 = vec_fun(data1_fvec0, data2_fvec0, data3_fvec0, data4_fvec0);
+    fVec output_fvec1 = vec_fun(data1_fvec1, data2_fvec1, data3_fvec1, data4_fvec1);
+    bVec output_bvec = convert_from_float(output_fvec0, output_fvec1);
+    output_bvec.store(output_data + d);
+  }
+  if (size - d > 0) {
+    bVec data1_bvec = bVec::loadu(input_data1 + d, size - d);
+    auto [data1_fvec0, data1_fvec1] = convert_to_float(data1_bvec);
+    bVec data2_bvec = bVec::loadu(input_data2 + d, size - d);
+    auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec);
+    bVec data3_bvec = bVec::loadu(input_data3 + d, size - d);
+    auto [data3_fvec0, data3_fvec1] = convert_to_float(data3_bvec);
+    bVec data4_bvec = bVec::loadu(input_data4 + d, size - d);
+    auto [data4_fvec0, data4_fvec1] = convert_to_float(data4_bvec);
+    fVec output_fvec0 = vec_fun(data1_fvec0, data2_fvec0, data3_fvec0, data4_fvec0);
+    fVec output_fvec1 = vec_fun(data1_fvec1, data2_fvec1, data3_fvec1, data4_fvec1);
+    bVec output_bvec = convert_from_float(output_fvec0, output_fvec1);
+    output_bvec.store(output_data + d, size - d);
+  }
+}
+
+} // namespace at::vec
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/intrinsics.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/intrinsics.h
new file mode 100644
index 0000000000000000000000000000000000000000..054f457b7e006cff43c622982fe9885d17869a50
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/intrinsics.h
@@ -0,0 +1,43 @@
+#pragma once
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+/* GCC or clang-compatible compiler, targeting x86/x86-64 */
+#include <x86intrin.h>
+#elif defined(__clang__) && (defined(__ARM_NEON__) || defined(__aarch64__))
+/* Clang-compatible compiler, targeting arm neon */
+#include <arm_neon.h>
+#elif defined(_MSC_VER)
+/* Microsoft C/C++-compatible compiler */
+#include <intrin.h>
+#if _MSC_VER <= 1900
+#define _mm256_extract_epi64(X, Y) (_mm_extract_epi64(_mm256_extractf128_si256(X, Y >> 1), Y % 2))
+#define _mm256_extract_epi32(X, Y) (_mm_extract_epi32(_mm256_extractf128_si256(X, Y >> 2), Y % 4))
+#define _mm256_extract_epi16(X, Y) (_mm_extract_epi16(_mm256_extractf128_si256(X, Y >> 3), Y % 8))
+#define _mm256_extract_epi8(X, Y) (_mm_extract_epi8(_mm256_extractf128_si256(X, Y >> 4), Y % 16))
+#endif
+#elif defined(__GNUC__) && (defined(__ARM_NEON__) || defined(__aarch64__))
+/* GCC-compatible compiler, targeting ARM with NEON */
+#include <arm_neon.h>
+#if defined (MISSING_ARM_VLD1)
+#include <ATen/cpu/vec/vec256/missing_vld1_neon.h>
+#elif defined (MISSING_ARM_VST1)
+#include <ATen/cpu/vec/vec256/missing_vst1_neon.h>
+#endif
+#elif defined(__GNUC__) && defined(__IWMMXT__)
+/* GCC-compatible compiler, targeting ARM with WMMX */
+#include <mmintrin.h>
+#elif defined(__s390x__)
+// targets Z/architecture
+// we will include vecintrin later
+#elif (defined(__GNUC__) || defined(__xlC__)) &&                               \
+        (defined(__VEC__) || defined(__ALTIVEC__))
+/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */
+#include <altivec.h>
+/* We need to undef those tokens defined by <altivec.h> to avoid conflicts
+   with the C++ types. => Can still use __bool/__vector */
+#undef bool
+#undef vector
+#undef pixel
+#elif defined(__GNUC__) && defined(__SPE__)
+/* GCC-compatible compiler, targeting PowerPC with SPE */
+#include <spe.h>
+#endif
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec.h
new file mode 100644
index 0000000000000000000000000000000000000000..7b6912193ff3627c07a05ed696d519ef00cb0bc7
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec.h
@@ -0,0 +1,47 @@
+#pragma once
+
+#if defined(CPU_CAPABILITY_AVX512)
+#include <ATen/cpu/vec/vec512/vec512.h>
+#else
+#include <ATen/cpu/vec/vec256/vec256.h>
+#endif
+
+namespace at::vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+inline Vectorized<bool> convert_to_bool(Vectorized<int8_t> x) {
+  __at_align__ bool buffer[x.size()];
+  x.ne(Vectorized<int8_t>(0)).store(buffer);
+
+  Vectorized<bool> ret;
+  static_assert(x.size() == ret.size(), "");
+  std::memcpy(ret, buffer, ret.size() * sizeof(bool));
+  return ret;
+}
+
+template <>
+inline Vectorized<bool> Vectorized<bool>::loadu(const void* ptr) {
+  // See NOTE [Loading boolean values]
+  return convert_to_bool(Vectorized<int8_t>::loadu(ptr));
+}
+
+template <>
+inline Vectorized<bool> Vectorized<bool>::loadu(const void* ptr, int64_t count) {
+  // See NOTE [Loading boolean values]
+  return convert_to_bool(Vectorized<int8_t>::loadu(ptr, count));
+}
+
+template <typename VT>
+struct VecHoldType { using hold_type = typename VT::value_type; };
+
+template <>
+struct VecHoldType<Vectorized<BFloat16>> { using hold_type = BFloat16; };
+
+template <>
+struct VecHoldType<Vectorized<Half>> {using hold_type = Half; };
+
+template <typename VT>
+using vechold_type = typename VecHoldType<VT>::hold_type;
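+
+// Illustrative check (editor's note, not part of the upstream header):
+//   static_assert(std::is_same_v<vechold_type<Vectorized<BFloat16>>, BFloat16>);
+//   static_assert(std::is_same_v<vechold_type<Vectorized<float>>, float>);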
+
+}} // namespace at::vec::CPU_CAPABILITY
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/missing_vld1_neon.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/missing_vld1_neon.h
new file mode 100644
index 0000000000000000000000000000000000000000..79cbec407d65c19df52f61308c0ecc1a6c9d6d9e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/missing_vld1_neon.h
@@ -0,0 +1,452 @@
+/* Workaround for missing vld1_*_x2 and vst1_*_x2 intrinsics in gcc-7.  */
+
+__extension__ extern __inline uint8x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u8_x2 (const uint8_t *__a)
+{
+  uint8x8x2_t ret;
+  asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w" (ret) : "Q"(*__a));
+  return ret;
+}
+
+__extension__ extern __inline int8x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s8_x2 (const int8_t *__a)
+{
+  int8x8x2_t ret;
+  asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w" (ret) : "Q"(*__a));
+  return ret;
+}
+
+__extension__ extern __inline uint16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u16_x2 (const uint16_t *__a)
+{
+  uint16x4x2_t ret;
+  asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w" (ret) : "Q"(*__a));
+  return ret;
+}
+
+__extension__ extern __inline int16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s16_x2 (const int16_t *__a)
+{
+  int16x4x2_t ret;
+  asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w" (ret) : "Q"(*__a));
+  return ret;
+}
+
+__extension__ extern __inline uint32x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u32_x2 (const uint32_t *__a)
+{
+  uint32x2x2_t ret;
+  asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w" (ret) : "Q"(*__a));
+  return ret;
+}
+
+__extension__ extern __inline int32x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s32_x2 (const int32_t *__a)
+{
+  int32x2x2_t ret;
+  asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w" (ret) : "Q"(*__a));
+  return ret;
+}
+
+__extension__ extern __inline uint64x1x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_u64_x2 (const uint64_t *__a)
+{
+  uint64x1x2_t ret;
+  asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w" (ret) : "Q"(*__a));
+  return ret;
+}
+
+__extension__ extern __inline int64x1x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_s64_x2 (const int64_t *__a)
+{
+  int64x1x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w" (ret) : "Q"(*__a));
+  return ret;
+}
+
+__extension__ extern __inline float16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_f16_x2 (const float16_t *__a)
+{
+  float16x4x2_t ret;
+  asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w" (ret) : "Q"(*__a));
+  return ret;
+}
+
+__extension__ extern __inline float32x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_f32_x2 (const float32_t *__a)
+{
+  float32x2x2_t ret;
+  asm volatile("ld1 {%S0.2s - %T0.2s}, %1" : "=w" (ret) : "Q"(*__a));
+  return ret;
+}
+
+__extension__ extern __inline float64x1x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_f64_x2 (const float64_t *__a)
+{
+  float64x1x2_t ret;
+  asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w" (ret) : "Q"(*__a));
+  return ret;
+}
+
+__extension__ extern __inline poly8x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_p8_x2 (const poly8_t *__a)
+{
+  poly8x8x2_t ret;
+  asm volatile("ld1 {%S0.8b - %T0.8b}, %1" : "=w" (ret) : "Q"(*__a));
+  return ret;
+}
+
+__extension__ extern __inline poly16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_p16_x2 (const poly16_t *__a)
+{
+  poly16x4x2_t ret;
+  asm volatile("ld1 {%S0.4h - %T0.4h}, %1" : "=w" (ret) : "Q"(*__a));
+  return ret;
+}
+
+__extension__ extern __inline poly64x1x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_p64_x2 (const poly64_t *__a)
+{
+  poly64x1x2_t ret;
+  asm volatile("ld1 {%S0.1d - %T0.1d}, %1" : "=w" (ret) : "Q"(*__a));
+  return ret;
+}
+
+__extension__ extern __inline uint8x16x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u8_x2 (const uint8_t *__a)
+{
+  uint8x16x2_t ret;
+  asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w" (ret) : "Q"(*__a));
+  return ret;
+}
+
+__extension__ extern __inline int8x16x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s8_x2 (const int8_t *__a)
+{
+  int8x16x2_t ret;
+  asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w" (ret) : "Q"(*__a));
+  return ret;
+}
+
+__extension__ extern __inline uint16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u16_x2 (const uint16_t *__a)
+{
+  uint16x8x2_t ret;
+  asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w" (ret) : "Q"(*__a));
+  return ret;
+}
+
+__extension__ extern __inline int16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s16_x2 (const int16_t *__a)
+{
+  int16x8x2_t ret;
+  asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w" (ret) : "Q"(*__a));
+  return ret;
+}
+
+__extension__ extern __inline uint32x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u32_x2 (const uint32_t *__a)
+{
+  uint32x4x2_t ret;
+  asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w" (ret) : "Q"(*__a));
+  return ret;
+}
+
+__extension__ extern __inline int32x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s32_x2 (const int32_t *__a)
+{
+  int32x4x2_t ret;
+  asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w" (ret) : "Q"(*__a));
+  return ret;
+}
+
+__extension__ extern __inline uint64x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_u64_x2 (const uint64_t *__a)
+{
+  uint64x2x2_t ret;
+  asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w" (ret) : "Q"(*__a));
+  return ret;
+}
+
+__extension__ extern __inline int64x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_s64_x2 (const int64_t *__a)
+{
+  int64x2x2_t ret;
+  asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w" (ret) : "Q"(*__a));
+  return ret;
+}
+
+__extension__ extern __inline float16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_f16_x2 (const float16_t *__a)
+{
+  float16x8x2_t ret;
+  asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w" (ret) : "Q"(*__a));
+  return ret;
+}
+
+__extension__ extern __inline float32x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_f32_x2 (const float32_t *__a)
+{
+  float32x4x2_t ret;
+  asm volatile("ld1 {%S0.4s - %T0.4s}, %1" : "=w" (ret) : "Q"(*__a));
+  return ret;
+}
+
+__extension__ extern __inline float64x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_f64_x2 (const float64_t *__a)
+{
+  float64x2x2_t ret;
+  asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w" (ret) : "Q"(*__a));
+  return ret;
+}
+
+__extension__ extern __inline poly8x16x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_p8_x2 (const poly8_t *__a)
+{
+  poly8x16x2_t ret;
+  asm volatile("ld1 {%S0.16b - %T0.16b}, %1" : "=w" (ret) : "Q"(*__a));
+  return ret;
+}
+
+__extension__ extern __inline poly16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_p16_x2 (const poly16_t *__a)
+{
+  poly16x8x2_t ret;
+  asm volatile("ld1 {%S0.8h - %T0.8h}, %1" : "=w" (ret) : "Q"(*__a));
+  return ret;
+}
+
+__extension__ extern __inline poly64x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_p64_x2 (const poly64_t *__a)
+{
+  poly64x2x2_t ret;
+  asm volatile("ld1 {%S0.2d - %T0.2d}, %1" : "=w" (ret) : "Q"(*__a));
+  return ret;
+}
+
+/* vst1x2 */
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s64_x2 (int64_t * __a, int64x1x2_t val)
+{
+  asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q" (*__a) : "w" (val));
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u64_x2 (uint64_t * __a, uint64x1x2_t val)
+{
+  asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q" (*__a) : "w" (val));
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f64_x2 (float64_t * __a, float64x1x2_t val)
+{
+  asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q" (*__a) : "w" (val));
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s8_x2 (int8_t * __a, int8x8x2_t val)
+{
+  asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q" (*__a) : "w" (val));
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p8_x2 (poly8_t * __a, poly8x8x2_t val)
+{
+  asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q" (*__a) : "w" (val));
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s16_x2 (int16_t * __a, int16x4x2_t val)
+{
+  asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q" (*__a) : "w" (val));
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p16_x2 (poly16_t * __a, poly16x4x2_t val)
+{
+  asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q" (*__a) : "w" (val));
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_s32_x2 (int32_t * __a, int32x2x2_t val)
+{
+  asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q" (*__a) : "w" (val));
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u8_x2 (uint8_t * __a, uint8x8x2_t val)
+{
+  asm volatile("st1 {%S1.8b - %T1.8b}, %0" : "=Q" (*__a) : "w" (val));
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u16_x2 (uint16_t * __a, uint16x4x2_t val)
+{
+  asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q" (*__a) : "w" (val));
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_u32_x2 (uint32_t * __a, uint32x2x2_t val)
+{
+  asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q" (*__a) : "w" (val));
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f16_x2 (float16_t * __a, float16x4x2_t val)
+{
+  asm volatile("st1 {%S1.4h - %T1.4h}, %0" : "=Q" (*__a) : "w" (val));
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_f32_x2 (float32_t * __a, float32x2x2_t val)
+{
+  asm volatile("st1 {%S1.2s - %T1.2s}, %0" : "=Q" (*__a) : "w" (val));
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_p64_x2 (poly64_t * __a, poly64x1x2_t val)
+{
+  asm volatile("st1 {%S1.1d - %T1.1d}, %0" : "=Q" (*__a) : "w" (val));
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s8_x2 (int8_t * __a, int8x16x2_t val)
+{
+  asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q" (*__a) : "w" (val));
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p8_x2 (poly8_t * __a, poly8x16x2_t val)
+{
+  asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q" (*__a) : "w" (val));
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s16_x2 (int16_t * __a, int16x8x2_t val)
+{
+  asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q" (*__a) : "w" (val));
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p16_x2 (poly16_t * __a, poly16x8x2_t val)
+{
+  asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q" (*__a) : "w" (val));
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s32_x2 (int32_t * __a, int32x4x2_t val)
+{
+  asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q" (*__a) : "w" (val));
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_s64_x2 (int64_t * __a, int64x2x2_t val)
+{
+  asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q" (*__a) : "w" (val));
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u8_x2 (uint8_t * __a, uint8x16x2_t val)
+{
+  asm volatile("st1 {%S1.16b - %T1.16b}, %0" : "=Q" (*__a) : "w" (val));
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u16_x2 (uint16_t * __a, uint16x8x2_t val)
+{
+  asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q" (*__a) : "w" (val));
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u32_x2 (uint32_t * __a, uint32x4x2_t val)
+{
+  asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q" (*__a) : "w" (val));
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_u64_x2 (uint64_t * __a, uint64x2x2_t val)
+{
+  asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q" (*__a) : "w" (val));
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f16_x2 (float16_t * __a, float16x8x2_t val)
+{
+  asm volatile("st1 {%S1.8h - %T1.8h}, %0" : "=Q" (*__a) : "w" (val));
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f32_x2 (float32_t * __a, float32x4x2_t val)
+{
+  asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q" (*__a) : "w" (val));
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f64_x2 (float64_t * __a, float64x2x2_t val)
+{
+  asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q" (*__a) : "w" (val));
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_p64_x2 (poly64_t * __a, poly64x2x2_t val)
+{
+  asm volatile("st1 {%S1.2d - %T1.2d}, %0" : "=Q" (*__a) : "w" (val));
+}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/missing_vst1_neon.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/missing_vst1_neon.h
new file mode 100644
index 0000000000000000000000000000000000000000..d882a5dbe8753aab083e5a72fde9684d26998196
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/missing_vst1_neon.h
@@ -0,0 +1,8 @@
+/* Workaround for missing vst1q_f32_x2 in gcc-8.  */
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_f32_x2 (float32_t * __a, float32x4x2_t val)
+{
+  asm volatile("st1 {%S1.4s - %T1.4s}, %0" : "=Q" (*__a) : "w" (val));
+}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vec256.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vec256.h
new file mode 100644
index 0000000000000000000000000000000000000000..272c3295fca5d21ed0d7e7d67801f7acf78400d0
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vec256.h
@@ -0,0 +1,307 @@
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/intrinsics.h>
+
+#include <ATen/cpu/vec/vec_base.h>
+#if !(defined(__VSX__)  || defined(CPU_CAPABILITY_VSX) || defined(CPU_CAPABILITY_ZVECTOR))
+#include <ATen/cpu/vec/vec256/vec256_float.h>
+#include <ATen/cpu/vec/vec256/vec256_float_neon.h>
+#include <ATen/cpu/vec/vec256/vec256_bfloat16.h>
+#include <ATen/cpu/vec/vec256/vec256_double.h>
+#include <ATen/cpu/vec/vec256/vec256_int.h>
+#include <ATen/cpu/vec/vec256/vec256_qint.h>
+#include <ATen/cpu/vec/vec256/vec256_complex_float.h>
+#include <ATen/cpu/vec/vec256/vec256_complex_double.h>
+#elif defined(__VSX__)  || defined(CPU_CAPABILITY_VSX)
+#include <ATen/cpu/vec/vec256/vsx/vec256_common_vsx.h>
+#else
+#include <ATen/cpu/vec/vec256/zarch/vec256_zarch.h>
+#include <ATen/cpu/vec/vec256/vec256_bfloat16.h>
+#endif
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <ostream>
+
+namespace at::vec {
+
+// Note [CPU_CAPABILITY namespace]
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// This header, and all of its subheaders, will be compiled with
+// different architecture flags for each supported set of vector
+// intrinsics. So we need to make sure they aren't inadvertently
+// linked together. We do this by declaring objects in an `inline
+// namespace` which changes the name mangling, but can still be
+// accessed as `at::vec`.
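+//
+// Editor's note (illustrative, not from upstream): with CPU_CAPABILITY defined
+// to e.g. AVX2 for one translation unit and DEFAULT for another, the same
+// declarations mangle differently, roughly:
+//   at::vec::AVX2::Vectorized<float>     // TU built with AVX2 flags
+//   at::vec::DEFAULT::Vectorized<float>  // baseline TU
+// while calling code keeps spelling it at::vec::Vectorized<float>.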
+inline namespace CPU_CAPABILITY {
+
+inline std::ostream& operator<<(std::ostream& stream, const c10::qint32& val) {
+  stream << val.val_;
+  return stream;
+}
+inline std::ostream& operator<<(std::ostream& stream, const c10::qint8& val) {
+  stream << static_cast<int>(val.val_);
+  return stream;
+}
+inline std::ostream& operator<<(std::ostream& stream, const c10::quint8& val) {
+  stream << static_cast<unsigned int>(val.val_);
+  return stream;
+}
+
+template <typename T>
+std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
+  T buf[Vectorized<T>::size()];
+  vec.store(buf);
+  stream << "vec[";
+  for (int i = 0; i != Vectorized<T>::size(); i++) {
+    if (i != 0) {
+      stream << ", ";
+    }
+    stream << buf[i];
+  }
+  stream << "]";
+  return stream;
+}
+
+
+#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+template<>
+inline Vectorized<float> cast<float, double>(const Vectorized<double>& src) {
+  return _mm256_castpd_ps(src);
+}
+
+template<>
+inline Vectorized<double> cast<double, float>(const Vectorized<float>& src) {
+  return _mm256_castps_pd(src);
+}
+
+template<>
+inline Vectorized<float> cast<float, int32_t>(const Vectorized<int32_t>& src) {
+  return _mm256_castsi256_ps(src);
+}
+
+template<>
+inline Vectorized<double> cast<double, int64_t>(const Vectorized<int64_t>& src) {
+  return _mm256_castsi256_pd(src);
+}
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+template<int64_t scale = 1>
+std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<double>>
+inline gather(const double* base_addr, const Vectorized<int64_t>& vindex) {
+  return _mm256_i64gather_pd(base_addr, vindex, scale);
+}
+
+template<int64_t scale = 1>
+std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<float>>
+inline gather(const float* base_addr, const Vectorized<int32_t>& vindex) {
+  return _mm256_i32gather_ps(base_addr, vindex, scale);
+}
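+
+// Editor's note (illustrative, not from upstream): with the AVX2 gathers above,
+// element i of the result is read from
+//   *(const float*)((const char*)base_addr + vindex[i] * scale)
+// so scale is a byte stride (e.g. scale == 4 addresses a densely packed float
+// array indexed by vindex).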
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+template<int64_t scale = 1>
+std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<double>>
+inline mask_gather(const Vectorized<double>& src, const double* base_addr,
+                   const Vectorized<int64_t>& vindex, Vectorized<double>& mask) {
+  return _mm256_mask_i64gather_pd(src, base_addr, vindex, mask, scale);
+}
+
+template<int64_t scale = 1>
+std::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vectorized<float>>
+inline mask_gather(const Vectorized<float>& src, const float* base_addr,
+                   const Vectorized<int32_t>& vindex, Vectorized<float>& mask) {
+  return _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale);
+}
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+// Only works for inputs in the range: [-2^51, 2^51]
+// From: https://stackoverflow.com/a/41148578
+template<>
+Vectorized<int64_t>
+inline convert_to_int_of_same_size<double>(const Vectorized<double> &src) {
+  auto x = _mm256_add_pd(src, _mm256_set1_pd(0x0018000000000000));
+  return _mm256_sub_epi64(
+      _mm256_castpd_si256(x),
+      _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000))
+  );
+}
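+
+// Editor's note (illustrative, not from upstream): 0x0018000000000000 passed to
+// _mm256_set1_pd is the integer 2^52 + 2^51 converted to double. Adding it to a
+// double in [-2^51, 2^51] pushes the sum into a binade where the ulp is 1, so
+// the integer part lands directly in the mantissa bits; subtracting the
+// constant's bit pattern as an int64 then recovers the rounded integer without
+// a dedicated convert instruction.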
+
+template<>
+Vectorized<int32_t>
+inline convert_to_int_of_same_size<float>(const Vectorized<float> &src) {
+  return _mm256_cvttps_epi32(src);
+}
+
+// Only works for inputs in the range: [-2^51, 2^51]
+// From: https://stackoverflow.com/a/41148578
+template<>
+Vectorized<double>
+inline convert_to_fp_of_same_size<double>(const Vectorized<int64_t> &src) {
+  auto x = _mm256_add_epi64(src, _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000)));
+  return _mm256_sub_pd(
+    _mm256_castsi256_pd(x),
+    _mm256_set1_pd(0x0018000000000000)
+  );
+}
+
+template<>
+Vectorized<float>
+inline convert_to_fp_of_same_size<float>(const Vectorized<int32_t> &src) {
+  return _mm256_cvtepi32_ps(src);
+}
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+template <>
+std::pair<Vectorized<double>, Vectorized<double>>
+inline interleave2<double>(const Vectorized<double>& a, const Vectorized<double>& b) {
+  // inputs:
+  //   a = {a0, a1, a3, a3}
+  //   b = {b0, b1, b2, b3}
+
+  // swap lanes:
+  //   a_swapped = {a0, a1, b0, b1}
+  //   b_swapped = {a2, a3, b2, b3}
+  auto a_swapped = _mm256_permute2f128_pd(a, b, 0b0100000);  // 0, 2.   4 bits apart
+  auto b_swapped = _mm256_permute2f128_pd(a, b, 0b0110001);  // 1, 3.   4 bits apart
+
+  // group cols crossing lanes:
+  //   return {a0, b0, a1, b1}
+  //          {a2, b2, a3, b3}
+  return std::make_pair(_mm256_permute4x64_pd(a_swapped, 0b11011000),  // 0, 2, 1, 3
+                        _mm256_permute4x64_pd(b_swapped, 0b11011000)); // 0, 2, 1, 3
+}
+
+template <>
+std::pair<Vectorized<float>, Vectorized<float>>
+inline interleave2<float>(const Vectorized<float>& a, const Vectorized<float>& b) {
+  // inputs:
+  //   a = {a0, a1, a2, a3, a4, a5, a6, a7}
+  //   b = {b0, b1, b2, b3, b4, b5, b6, b7}
+
+  // swap lanes:
+  //   a_swapped = {a0, a1, a2, a3, b0, b1, b2, b3}
+  //   b_swapped = {a4, a5, a6, a7, b4, b5, b6, b7}
+  // TODO: can we support caching this?
+  auto a_swapped = _mm256_permute2f128_ps(a, b, 0b0100000);  // 0, 2.   4 bits apart
+  auto b_swapped = _mm256_permute2f128_ps(a, b, 0b0110001);  // 1, 3.   4 bits apart
+
+  // group cols crossing lanes:
+  //   return {a0, b0, a1, b1, a2, b2, a3, b3}
+  //          {a4, b4, a5, b5, a6, b6, a7, b7}
+  const __m256i group_ctrl = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
+  return std::make_pair(_mm256_permutevar8x32_ps(a_swapped, group_ctrl),
+                        _mm256_permutevar8x32_ps(b_swapped, group_ctrl));
+}
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEINTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+template <>
+std::pair<Vectorized<double>, Vectorized<double>>
+inline deinterleave2<double>(const Vectorized<double>& a, const Vectorized<double>& b) {
+  // inputs:
+  //   a = {a0, b0, a1, b1}
+  //   b = {a2, b2, a3, b3}
+
+  // group cols crossing lanes:
+  //   a_grouped = {a0, a1, b0, b1}
+  //   b_grouped = {a2, a3, b2, b3}
+  auto a_grouped = _mm256_permute4x64_pd(a, 0b11011000);  // 0, 2, 1, 3
+  auto b_grouped = _mm256_permute4x64_pd(b, 0b11011000);  // 0, 2, 1, 3
+
+  // swap lanes:
+  //   return {a0, a1, a2, a3}
+  //          {b0, b1, b2, b3}
+  return std::make_pair(_mm256_permute2f128_pd(a_grouped, b_grouped, 0b0100000),  // 0, 2.   4 bits apart
+                        _mm256_permute2f128_pd(a_grouped, b_grouped, 0b0110001)); // 1, 3.   4 bits apart
+}
+
+template <>
+std::pair<Vectorized<float>, Vectorized<float>>
+inline deinterleave2<float>(const Vectorized<float>& a, const Vectorized<float>& b) {
+  // inputs:
+  //   a = {a0, b0, a1, b1, a2, b2, a3, b3}
+  //   b = {a4, b4, a5, b5, a6, b6, a7, b7}
+
+  // group cols crossing lanes:
+  //   a_grouped = {a0, a1, a2, a3, b0, b1, b2, b3}
+  //   b_grouped = {a4, a5, a6, a7, b4, b5, b6, b7}
+  // TODO: can we support caching this?
+  const __m256i group_ctrl = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7);
+  auto a_grouped = _mm256_permutevar8x32_ps(a, group_ctrl);
+  auto b_grouped = _mm256_permutevar8x32_ps(b, group_ctrl);
+
+  // swap lanes:
+  //   return {a0, a1, a2, a3, a4, a5, a6, a7}
+  //          {b0, b1, b2, b3, b4, b5, b6, b7}
+  return std::make_pair(_mm256_permute2f128_ps(a_grouped, b_grouped, 0b0100000),  // 0, 2.   4 bits apart
+                        _mm256_permute2f128_ps(a_grouped, b_grouped, 0b0110001)); // 1, 3.   4 bits apart
+}
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FLIP ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+template<>
+inline Vectorized<float> flip(const Vectorized<float> & v) {
+  const __m256i mask_float = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+  return _mm256_permutevar8x32_ps(v, mask_float);
+}
+
+template<>
+inline Vectorized<double> flip(const Vectorized<double> & v) {
+  return _mm256_permute4x64_pd(v, 27);  // 27 == _MM_SHUFFLE(0, 1, 2, 3)
+}
+
+template<>
+inline Vectorized<int64_t> flip(const Vectorized<int64_t> & v) {
+  return _mm256_permute4x64_epi64(v, 27);  // 27 == _MM_SHUFFLE(0, 1, 2, 3)
+}
+
+template<>
+inline Vectorized<int32_t> flip(const Vectorized<int32_t> & v) {
+  const __m256i mask_int32 = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+  return _mm256_permutevar8x32_epi32(v, mask_int32);
+}
+
+template<>
+inline Vectorized<int16_t> flip(const Vectorized<int16_t> & v) {
+  const __m256i mask = _mm256_set_epi8(
+    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
+    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
+  );
+  auto reversed = _mm256_shuffle_epi8(v, mask);
+  return _mm256_permute2x128_si256(reversed, reversed, 1);
+}
+
+inline __m256i flip8(const __m256i & v) {
+  const __m256i mask_int8 = _mm256_set_epi8(
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  );
+  auto reversed = _mm256_shuffle_epi8(v, mask_int8);
+  return _mm256_permute2x128_si256(reversed, reversed, 1);
+}
+
+template<>
+inline Vectorized<int8_t> flip(const Vectorized<int8_t> & v) {
+  return flip8(v);
+}
+
+template<>
+inline Vectorized<uint8_t> flip(const Vectorized<uint8_t> & v) {
+  return flip8(v);
+}
+
+#endif // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
+
+}} // namespace at::vec::CPU_CAPABILITY
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_bfloat16.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_bfloat16.h
new file mode 100644
index 0000000000000000000000000000000000000000..b53184cbae191137d8400a0be093f2aa5f90d3cc
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_bfloat16.h
@@ -0,0 +1,1096 @@
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/irange.h>
+
+#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
+#include <sleef.h>
+#endif
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wignored-qualifiers"
+
+namespace at::vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
+
+// bfloat16 conversion
+static inline void cvtbf16_fp32(const __m128i& a, __m256& o) {
+  o = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(a), 16));
+}
+
+static inline void cvtbf16_fp32(const __m256i& a, __m256& o1, __m256& o2) {
+  __m128i lo = _mm256_extractf128_si256(a, 0);
+  __m128i hi = _mm256_extractf128_si256(a, 1);
+  cvtbf16_fp32(lo, o1);
+  cvtbf16_fp32(hi, o2);
+}
+static inline __m256i cvtfp32_bf16(const __m256& a, const __m256& b) {
+  __m256i lo = _mm256_castps_si256(a);
+  __m256i hi = _mm256_castps_si256(b);
+  __m256i nan = _mm256_set1_epi32(0xffff);
+  __m256i mask_lo = _mm256_castps_si256(_mm256_cmp_ps(a, a, _CMP_ORD_Q));
+  __m256i mask_hi = _mm256_castps_si256(_mm256_cmp_ps(b, b, _CMP_ORD_Q));
+  __m256i ones = _mm256_set1_epi32(0x1);
+  __m256i vec_bias = _mm256_set1_epi32(0x7fff);
+  // uint32_t lsb = (input >> 16) & 1;
+  auto t_lo = _mm256_and_si256(_mm256_srli_epi32(lo, 16), ones);
+  auto t_hi = _mm256_and_si256(_mm256_srli_epi32(hi, 16), ones);
+  // uint32_t rounding_bias = 0x7fff + lsb;
+  t_lo = _mm256_add_epi32(t_lo, vec_bias);
+  t_hi = _mm256_add_epi32(t_hi, vec_bias);
+  // input += rounding_bias;
+  t_lo = _mm256_add_epi32(t_lo, lo);
+  t_hi = _mm256_add_epi32(t_hi, hi);
+  // input = input >> 16;
+  t_lo = _mm256_srli_epi32(t_lo, 16);
+  t_hi = _mm256_srli_epi32(t_hi, 16);
+  // Check NaN before converting back to bf16
+  t_lo = _mm256_blendv_epi8(nan, t_lo, mask_lo);
+  t_hi = _mm256_blendv_epi8(nan, t_hi, mask_hi);
+
+  t_lo = _mm256_packus_epi32(t_lo, t_hi);      // t_hi[4-7] t_lo[4-7] t_hi[0-4] t_lo[0-4]
+  return _mm256_permute4x64_epi64(t_lo, 0xd8); // 11        01        10        00
+}
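+// Note on cvtfp32_bf16 above: this is round-to-nearest-even truncation to the
+// high 16 bits. Illustrative scalar sketch (not part of this header; names are
+// for exposition only):
+//   uint32_t bits = bit_cast<uint32_t>(x);
+//   uint32_t lsb  = (bits >> 16) & 1;   // lowest bit that survives truncation
+//   bits += 0x7fff + lsb;               // ties round toward the even result
+//   uint16_t bf16 = std::isnan(x) ? 0xffff : uint16_t(bits >> 16);
+// e.g. x = 1.0f (0x3f800000): lsb = 0, bits + 0x7fff = 0x3f807fff, bf16 = 0x3f80.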
+
+static inline __m256i merge_compare_result(const __m256& a, const __m256& b) {
+  __m256i lo = _mm256_castps_si256(a);
+  __m256i hi = _mm256_castps_si256(b);
+  lo = _mm256_srli_epi32(lo, 16);
+  hi = _mm256_srli_epi32(hi, 16);
+  auto out = _mm256_packus_epi32(lo, hi);
+  return _mm256_permute4x64_epi64(out, 0xd8);
+}
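+// Note on merge_compare_result above: its inputs are AVX compare results, so
+// every 32-bit lane is either all zeros or all ones; keeping only the high 16
+// bits of each lane preserves the mask exactly, hence no rounding is needed.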
+
+// float16 conversion
+static inline void cvtfp16_fp32(const __m128i& a, __m256& o) {
+  o = _mm256_cvtph_ps(a);
+}
+
+static inline void cvtfp16_fp32(const __m256i& a, __m256& o1, __m256& o2) {
+  __m128i lo = _mm256_extractf128_si256(a, 0);
+  __m128i hi = _mm256_extractf128_si256(a, 1);
+  cvtfp16_fp32(lo, o1);
+  cvtfp16_fp32(hi, o2);
+}
+
+static inline __m256i cvtfp32_fp16(const __m256& a, const __m256& b) {
+  __m128i lo = _mm256_cvtps_ph(
+      a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+  __m128i hi = _mm256_cvtps_ph(
+      b, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+}
+
+// dtype conversion between float16/bfloat16 and float32
+template <typename T, typename std::enable_if_t<is_reduced_floating_point_v<T>, int> = 0>
+inline void cvt_to_fp32(const __m128i& a, __m256& o);
+template <> inline void cvt_to_fp32<BFloat16>(const __m128i& a, __m256& o) {
+  cvtbf16_fp32(a, o);
+}
+template <> inline void cvt_to_fp32<Half>(const __m128i& a, __m256& o) {
+  cvtfp16_fp32(a, o);
+}
+
+template <typename T, typename std::enable_if_t<is_reduced_floating_point_v<T>, int> = 0>
+inline void cvt_to_fp32(const __m256i& a, __m256& o1, __m256& o2);
+template <> inline void cvt_to_fp32<BFloat16>(const __m256i& a, __m256& o1, __m256& o2) {
+  cvtbf16_fp32(a, o1, o2);
+}
+template <> inline void cvt_to_fp32<Half>(const __m256i& a, __m256& o1, __m256& o2) {
+  cvtfp16_fp32(a, o1, o2);
+}
+
+template <typename T, bool is_compare_op = false, typename std::enable_if_t<is_reduced_floating_point_v<T>, int> = 0>
+inline __m256i cvt_from_fp32(const __m256& a, const __m256& b);
+template <> inline __m256i cvt_from_fp32<BFloat16, false>(const __m256& a, const __m256& b) {
+  return cvtfp32_bf16(a, b);
+}
+template <> inline __m256i cvt_from_fp32<BFloat16, true>(const __m256& a, const __m256& b) {
+  return merge_compare_result(a, b);
+}
+template <> inline __m256i cvt_from_fp32<Half, false>(const __m256& a, const __m256& b) {
+  return cvtfp32_fp16(a, b);
+}
+template <> inline __m256i cvt_from_fp32<Half, true>(const __m256& a, const __m256& b) {
+  return cvtfp32_fp16(a, b);
+}
+
+template <typename T>
+class Vectorized16 {
+static_assert(
+  is_reduced_floating_point_v<T>,
+  "Support only float16 and bfloat16.");
+protected:
+  __m256i values;
+public:
+  using value_type = uint16_t;
+  using size_type = int;
+  static constexpr size_type size() {
+    return 16;
+  }
+  Vectorized16() {}
+  Vectorized16(__m256i v) : values(v) {}
+  Vectorized16(T val) {
+    value_type uw = val.x;
+    values = _mm256_set1_epi16(uw);
+  }
+  Vectorized16(T val1, T val2, T val3, T val4,
+         T val5, T val6, T val7, T val8,
+         T val9, T val10, T val11, T val12,
+         T val13, T val14, T val15, T val16) {
+    values = _mm256_setr_epi16(
+        val1.x, val2.x, val3.x, val4.x, val5.x, val6.x, val7.x, val8.x,
+        val9.x, val10.x, val11.x, val12.x, val13.x, val14.x, val15.x, val16.x);
+  }
+  operator __m256i() const {
+    return values;
+  }
+  T& operator[](int idx) = delete;
+  const T& operator[](int idx) const  = delete;
+  int zero_mask() const {
+    // returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit
+    __m256i cmp = _mm256_cmpeq_epi16(values, _mm256_set1_epi16(0));
+    return _mm256_movemask_epi8(cmp);
+  }
+  static Vectorized<T> loadu(const void* ptr, int16_t count = size()) {
+    if (count == size())
+      return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+
+    __at_align__ int16_t tmp_values[size()];
+    std::memcpy(tmp_values, ptr, count * sizeof(int16_t));
+    return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(tmp_values));
+  }
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
+    } else if (count > 0) {
+      __at_align__ int16_t tmp_values[size()];
+      _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
+      std::memcpy(ptr, tmp_values, count * sizeof(int16_t));
+    }
+  }
+  template <int64_t mask>
+  static Vectorized<T> blend(const Vectorized<T>& a, const Vectorized<T>& b) {
+    __at_align__ int16_t tmp_values[size()];
+    a.store(tmp_values);
+    if (mask & 0x01)
+      tmp_values[0] = _mm256_extract_epi16(b.values, 0);
+    if (mask & 0x02)
+      tmp_values[1] = _mm256_extract_epi16(b.values, 1);
+    if (mask & 0x04)
+      tmp_values[2] = _mm256_extract_epi16(b.values, 2);
+    if (mask & 0x08)
+      tmp_values[3] = _mm256_extract_epi16(b.values, 3);
+    if (mask & 0x10)
+      tmp_values[4] = _mm256_extract_epi16(b.values, 4);
+    if (mask & 0x20)
+      tmp_values[5] = _mm256_extract_epi16(b.values, 5);
+    if (mask & 0x40)
+      tmp_values[6] = _mm256_extract_epi16(b.values, 6);
+    if (mask & 0x80)
+      tmp_values[7] = _mm256_extract_epi16(b.values, 7);
+    if (mask & 0x100)
+      tmp_values[8] = _mm256_extract_epi16(b.values, 8);
+    if (mask & 0x200)
+      tmp_values[9] = _mm256_extract_epi16(b.values, 9);
+    if (mask & 0x400)
+      tmp_values[10] = _mm256_extract_epi16(b.values, 10);
+    if (mask & 0x800)
+      tmp_values[11] = _mm256_extract_epi16(b.values, 11);
+    if (mask & 0x1000)
+      tmp_values[12] = _mm256_extract_epi16(b.values, 12);
+    if (mask & 0x2000)
+      tmp_values[13] = _mm256_extract_epi16(b.values, 13);
+    if (mask & 0x4000)
+      tmp_values[14] = _mm256_extract_epi16(b.values, 14);
+    if (mask & 0x8000)
+      tmp_values[15] = _mm256_extract_epi16(b.values, 15);
+    return loadu(tmp_values);
+  }
+  static Vectorized<T> blendv(const Vectorized<T>& a,
+      const Vectorized<T>& b, const Vectorized<T>& mask) {
+    return _mm256_blendv_epi8(a.values, b.values, mask.values);
+  }
+  template<typename step_t>
+  static Vectorized<T> arange(T base = 0.f, step_t step = static_cast<step_t>(1)) {
+    return Vectorized(
+      base,             base +      step, base +  2 * step, base +  3 * step,
+      base +  4 * step, base +  5 * step, base +  6 * step, base +  7 * step,
+      base +  8 * step, base +  9 * step, base + 10 * step, base + 11 * step,
+      base + 12 * step, base + 13 * step, base + 14 * step, base + 15 * step);
+  }
+  static Vectorized<T> set(const Vectorized<T>& a,
+      const Vectorized<T>& b, int64_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+      case 4:
+        return blend<15>(a, b);
+      case 5:
+        return blend<31>(a, b);
+      case 6:
+        return blend<63>(a, b);
+      case 7:
+        return blend<127>(a, b);
+      case 8:
+        return blend<255>(a, b);
+      case 9:
+        return blend<511>(a, b);
+      case 10:
+        return blend<1023>(a, b);
+      case 11:
+        return blend<2047>(a, b);
+      case 12:
+        return blend<4095>(a, b);
+      case 13:
+        return blend<8191>(a, b);
+      case 14:
+        return blend<16383>(a, b);
+      case 15:
+        return blend<32767>(a, b);
+    }
+    return b;
+  }
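+  // Note on set(a, b, count) above: lanes [0, count) come from b and the
+  // remaining lanes from a, since each set mask bit in the blend cascade
+  // selects the corresponding lane from b.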
+  Vectorized map(const __m256 (*const vop)(__m256)) const {
+    __m256 lo, hi;
+    cvt_to_fp32(values, lo, hi);
+    const auto o1 = vop(lo);
+    const auto o2 = vop(hi);
+    return cvt_from_fp32(o1, o2);
+  }
+  Vectorized isnan() const {
+    __m256 lo, hi;
+    cvt_to_fp32(values, lo, hi);
+    lo = _mm256_cmp_ps(lo, _mm256_set1_ps(0.0f), _CMP_UNORD_Q);
+    hi = _mm256_cmp_ps(hi, _mm256_set1_ps(0.0f), _CMP_UNORD_Q);
+    return merge_compare_result(lo, hi);
+  }
+  Vectorized abs() const {
+    return _mm256_andnot_si256(_mm256_set1_epi16(0x8000), values);
+  }
+  Vectorized angle() const {
+    __m256 lo, hi;
+    cvt_to_fp32(values, lo, hi);
+    auto angle_lambda = [](__m256 values_2) {
+      const auto zero_vec = _mm256_set1_ps(0.f);
+      const auto nan_vec = _mm256_set1_ps(NAN);
+      const auto not_nan_mask = _mm256_cmp_ps(values_2, values_2, _CMP_EQ_OQ);
+      const auto nan_mask = _mm256_cmp_ps(not_nan_mask, zero_vec, _CMP_EQ_OQ);
+      const auto pi = _mm256_set1_ps(c10::pi);
+
+      const auto neg_mask = _mm256_cmp_ps(values_2, zero_vec, _CMP_LT_OQ);
+      auto angle = _mm256_blendv_ps(zero_vec, pi, neg_mask);
+      angle = _mm256_blendv_ps(angle, nan_vec, nan_mask);
+      return angle;
+    };
+    auto o1 = angle_lambda(lo);
+    auto o2 = angle_lambda(hi);
+    return cvt_from_fp32(o1, o2);
+  }
+  Vectorized real() const {
+    return *this;
+  }
+  Vectorized imag() const {
+    return _mm256_set1_epi16(0);
+  }
+  Vectorized conj() const {
+    return *this;
+  }
+  Vectorized acos() const {
+    return map(Sleef_acosf8_u10);
+  }
+  Vectorized acosh() const {
+    return map(Sleef_acoshf8_u10);
+  }
+  Vectorized asin() const {
+    return map(Sleef_asinf8_u10);
+  }
+  Vectorized atan() const {
+    return map(Sleef_atanf8_u10);
+  }
+  Vectorized atanh() const {
+    return map(Sleef_atanhf8_u10);
+  }
+  Vectorized atan2(const Vectorized &b) const {
+    __m256 lo, hi;
+    __m256 b1, b2;
+    cvt_to_fp32(values, lo, hi);
+    cvt_to_fp32(b.values, b1, b2);
+    auto o1 = Sleef_atan2f8_u10(lo, b1);
+    auto o2 = Sleef_atan2f8_u10(hi, b2);
+    return cvt_from_fp32(o1, o2);
+  }
+  Vectorized copysign(const Vectorized &sign) const {
+    // copy sign bit (0x8000) from sign and remaining bits from values
+    __m256i mask_value = _mm256_set1_epi32(~0x80008000);
+    __m256i mask_signbit = _mm256_set1_epi32(0x80008000);
+    return Vectorized(
+      _mm256_or_si256(
+        _mm256_and_si256(values, mask_value),
+        _mm256_and_si256(sign, mask_signbit)));
+  }
+  Vectorized erf() const {
+    return map(Sleef_erff8_u10);
+  }
+  Vectorized erfc() const {
+    return map(Sleef_erfcf8_u15);
+  }
+  Vectorized erfinv() const {
+    __m256 lo, hi;
+    cvt_to_fp32(values, lo, hi);
+    __at_align__ float tmp1[size() / 2], tmp2[size() / 2];
+    _mm256_storeu_ps(reinterpret_cast(tmp1), lo);
+    _mm256_storeu_ps(reinterpret_cast(tmp2), hi);
+    for (int64_t i = 0; i < size() / 2; i++) {
+      tmp1[i] = calc_erfinv(tmp1[i]);
+      tmp2[i] = calc_erfinv(tmp2[i]);
+    }
+    auto o1 = _mm256_loadu_ps(tmp1);
+    auto o2 = _mm256_loadu_ps(tmp2);
+    return cvt_from_fp32(o1, o2);
+  }
+  Vectorized exp() const {
+    return map(Sleef_expf8_u10);
+  }
+  Vectorized exp2() const {
+    return map(Sleef_exp2f8_u10);
+  }
+  Vectorized expm1() const {
+    return map(Sleef_expm1f8_u10);
+  }
+  Vectorized exp_u20() const {
+    return exp();
+  }
+  Vectorized fmod(const Vectorized & q) const {
+    __m256 x_lo, x_hi;
+    cvt_to_fp32(values, x_lo, x_hi);
+    __m256 q_lo, q_hi;
+    cvt_to_fp32(q.values, q_lo, q_hi);
+    auto o1 = Sleef_fmodf8(x_lo, q_lo);
+    auto o2 = Sleef_fmodf8(x_hi, q_hi);
+    return cvt_from_fp32(o1, o2);
+  }
+  Vectorized hypot(const Vectorized &b) const {
+    __m256 lo, hi;
+    __m256 b1, b2;
+    cvt_to_fp32(values, lo, hi);
+    cvt_to_fp32(b.values, b1, b2);
+    auto o1 = Sleef_hypotf8_u05(lo, b1);
+    auto o2 = Sleef_hypotf8_u05(hi, b2);
+    return cvt_from_fp32(o1, o2);
+  }
+  Vectorized i0() const {
+    __m256 lo, hi;
+    cvt_to_fp32(values, lo, hi);
+    __at_align__ float tmp1[size() / 2], tmp2[size() / 2];
+    _mm256_storeu_ps(reinterpret_cast(tmp1), lo);
+    _mm256_storeu_ps(reinterpret_cast(tmp2), hi);
+    for (int64_t i = 0; i < size() / 2; i++) {
+      tmp1[i] = calc_i0(tmp1[i]);
+      tmp2[i] = calc_i0(tmp2[i]);
+    }
+    auto o1 = _mm256_loadu_ps(tmp1);
+    auto o2 = _mm256_loadu_ps(tmp2);
+    return cvt_from_fp32(o1, o2);
+  }
+  Vectorized i0e() const {
+    __m256 lo, hi;
+    cvt_to_fp32(values, lo, hi);
+    constexpr auto sz = size();
+    __at_align__ float tmp1[sz / 2], tmp2[sz / 2];
+    _mm256_storeu_ps(reinterpret_cast(tmp1), lo);
+    _mm256_storeu_ps(reinterpret_cast(tmp2), hi);
+
+    for (auto i = decltype(sz){0}; i < sz / 2; i++) {
+      tmp1[i] = calc_i0e(tmp1[i]);
+      tmp2[i] = calc_i0e(tmp2[i]);
+    }
+    const auto o1 = _mm256_loadu_ps(tmp1);
+    const auto o2 = _mm256_loadu_ps(tmp2);
+    return cvt_from_fp32(o1, o2);
+  }
+  Vectorized digamma() const {
+    __m256 lo, hi;
+    cvt_to_fp32(values, lo, hi);
+    constexpr auto sz = size();
+    __at_align__ float tmp1[sz / 2], tmp2[sz / 2];
+    _mm256_storeu_ps(reinterpret_cast(tmp1), lo);
+    _mm256_storeu_ps(reinterpret_cast(tmp2), hi);
+
+    for (auto i = decltype(sz){0}; i < sz / 2; i++) {
+      tmp1[i] = calc_digamma(tmp1[i]);
+      tmp2[i] = calc_digamma(tmp2[i]);
+    }
+    const auto o1 = _mm256_loadu_ps(tmp1);
+    const auto o2 = _mm256_loadu_ps(tmp2);
+    return cvt_from_fp32(o1, o2);
+  }
+  Vectorized igamma(const Vectorized &x) const {
+    __m256 lo, hi;
+    __m256 xlo, xhi;
+    cvt_to_fp32(values, lo, hi);
+    cvt_to_fp32(x.values, xlo, xhi);
+    __at_align__ float tmp1[size() / 2], tmp2[size() / 2];
+    _mm256_storeu_ps(reinterpret_cast(tmp1), lo);
+    _mm256_storeu_ps(reinterpret_cast(tmp2), hi);
+    __at_align__ float tmpx1[size() / 2], tmpx2[size() / 2];
+    _mm256_storeu_ps(reinterpret_cast(tmpx1), xlo);
+    _mm256_storeu_ps(reinterpret_cast(tmpx2), xhi);
+    for (int64_t i = 0; i < size() / 2; ++i) {
+      tmp1[i] = calc_igamma(tmp1[i], tmpx1[i]);
+      tmp2[i] = calc_igamma(tmp2[i], tmpx2[i]);
+    }
+    auto o1 = _mm256_loadu_ps(tmp1);
+    auto o2 = _mm256_loadu_ps(tmp2);
+    return cvt_from_fp32(o1, o2);
+  }
+
+  Vectorized igammac(const Vectorized &x) const {
+    __m256 lo, hi;
+    __m256 xlo, xhi;
+    cvt_to_fp32(values, lo, hi);
+    cvt_to_fp32(x.values, xlo, xhi);
+    __at_align__ float tmp1[size() / 2], tmp2[size() / 2];
+    _mm256_storeu_ps(reinterpret_cast(tmp1), lo);
+    _mm256_storeu_ps(reinterpret_cast(tmp2), hi);
+    __at_align__ float tmpx1[size() / 2], tmpx2[size() / 2];
+    _mm256_storeu_ps(reinterpret_cast(tmpx1), xlo);
+    _mm256_storeu_ps(reinterpret_cast(tmpx2), xhi);
+    for (int64_t i = 0; i < size() / 2; ++i) {
+      tmp1[i] = calc_igammac(tmp1[i], tmpx1[i]);
+      tmp2[i] = calc_igammac(tmp2[i], tmpx2[i]);
+    }
+    auto o1 = _mm256_loadu_ps(tmp1);
+    auto o2 = _mm256_loadu_ps(tmp2);
+    return cvt_from_fp32(o1, o2);
+  }
+  Vectorized log() const {
+    return map(Sleef_logf8_u10);
+  }
+  Vectorized log2() const {
+    return map(Sleef_log2f8_u10);
+  }
+  Vectorized log10() const {
+    return map(Sleef_log10f8_u10);
+  }
+  Vectorized log1p() const {
+    return map(Sleef_log1pf8_u10);
+  }
+  Vectorized sin() const {
+    return map(Sleef_sinf8_u10);
+  }
+  Vectorized sinh() const {
+    return map(Sleef_sinhf8_u10);
+  }
+  Vectorized cos() const {
+    return map(Sleef_cosf8_u10);
+  }
+  Vectorized cosh() const {
+    return map(Sleef_coshf8_u10);
+  }
+  Vectorized ceil() const {
+    __m256 lo, hi;
+    cvt_to_fp32(values, lo, hi);
+    auto o1 = _mm256_ceil_ps(lo);
+    auto o2 = _mm256_ceil_ps(hi);
+    return cvt_from_fp32(o1, o2);
+  }
+  Vectorized floor() const {
+    __m256 lo, hi;
+    cvt_to_fp32(values, lo, hi);
+    auto o1 = _mm256_floor_ps(lo);
+    auto o2 = _mm256_floor_ps(hi);
+    return cvt_from_fp32(o1, o2);
+  }
+  Vectorized neg() const {
+    return _mm256_xor_si256(values, _mm256_set1_epi16(0x8000));
+  }
+  Vectorized round() const {
+    __m256 lo, hi;
+    cvt_to_fp32(values, lo, hi);
+    auto o1 = _mm256_round_ps(lo, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+    auto o2 = _mm256_round_ps(hi, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+    return cvt_from_fp32(o1, o2);
+  }
+  Vectorized tan() const {
+    return map(Sleef_tanf8_u10);
+  }
+  Vectorized tanh() const {
+    return map(Sleef_tanhf8_u10);
+  }
+  Vectorized trunc() const {
+    __m256 lo, hi;
+    cvt_to_fp32(values, lo, hi);
+    auto o1 = _mm256_round_ps(lo, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
+    auto o2 = _mm256_round_ps(hi, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
+    return cvt_from_fp32(o1, o2);
+  }
+  Vectorized lgamma() const {
+    return map(Sleef_lgammaf8_u10);
+  }
+  Vectorized sqrt() const {
+    __m256 lo, hi;
+    cvt_to_fp32(values, lo, hi);
+    auto o1 = _mm256_sqrt_ps(lo);
+    auto o2 = _mm256_sqrt_ps(hi);
+    return cvt_from_fp32(o1, o2);
+  }
+  Vectorized reciprocal() const {
+    __m256 lo, hi;
+    cvt_to_fp32(values, lo, hi);
+    auto ones = _mm256_set1_ps(1);
+    auto o1 = _mm256_div_ps(ones, lo);
+    auto o2 = _mm256_div_ps(ones, hi);
+    return cvt_from_fp32(o1, o2);
+  }
+  Vectorized rsqrt() const {
+    __m256 lo, hi;
+    cvt_to_fp32(values, lo, hi);
+    auto ones = _mm256_set1_ps(1);
+    auto o1 = _mm256_div_ps(ones, _mm256_sqrt_ps(lo));
+    auto o2 = _mm256_div_ps(ones, _mm256_sqrt_ps(hi));
+    return cvt_from_fp32(o1, o2);
+  }
+  Vectorized pow(const Vectorized &b) const {
+    __m256 lo, hi;
+    __m256 b1, b2;
+    cvt_to_fp32(values, lo, hi);
+    cvt_to_fp32(b.values, b1, b2);
+    auto o1 = Sleef_powf8_u10(lo, b1);
+    auto o2 = Sleef_powf8_u10(hi, b2);
+    return cvt_from_fp32(o1, o2);
+  }
+private:
+  template
+  Vectorized inline binary_compare(const Vectorized& b, Op op) const {
+    __m256 a_lo, a_hi;
+    __m256 b_lo, b_hi;
+    cvt_to_fp32(values, a_lo, a_hi);
+    cvt_to_fp32(b.values, b_lo, b_hi);
+    auto o1 = op(a_lo, b_lo);
+    auto o2 = op(a_hi, b_hi);
+    return cvt_from_fp32(o1, o2);
+  }
+
+public:
+  Vectorized inline operator>(const Vectorized& other) const {
+    return binary_compare(other, [](__m256 x, __m256 y) { return _mm256_cmp_ps(x, y, _CMP_GT_OQ); });
+  }
+  Vectorized inline operator<(const Vectorized& other) const {
+    return binary_compare(other, [](__m256 x, __m256 y) { return _mm256_cmp_ps(x, y, _CMP_LT_OQ); });
+  }
+  Vectorized inline operator>=(const Vectorized& other) const {
+    return binary_compare(other, [](__m256 x, __m256 y) { return _mm256_cmp_ps(x, y, _CMP_GE_OQ); });
+  }
+  Vectorized inline operator<=(const Vectorized& other) const {
+    return binary_compare(other, [](__m256 x, __m256 y) { return _mm256_cmp_ps(x, y, _CMP_LE_OQ); });
+  }
+  Vectorized inline operator==(const Vectorized& other) const {
+    return binary_compare(other, [](__m256 x, __m256 y) { return _mm256_cmp_ps(x, y, _CMP_EQ_OQ); });
+  }
+  Vectorized inline operator!=(const Vectorized& other) const {
+    return binary_compare(other, [](__m256 x, __m256 y) { return _mm256_cmp_ps(x, y, _CMP_NEQ_UQ); });
+  }
+};
+
+template <typename T, typename Op>
+static inline Vectorized<T> binary_op_as_fp32(const Vectorized<T>& a, const Vectorized<T>& b, Op op) {
+  __m256 a_lo, a_hi;
+  __m256 b_lo, b_hi;
+  cvt_to_fp32<T>(__m256i(a), a_lo, a_hi);
+  cvt_to_fp32<T>(__m256i(b), b_lo, b_hi);
+  auto o1 = op(a_lo, b_lo);
+  auto o2 = op(a_hi, b_hi);
+  return cvt_from_fp32<T>(o1, o2);
+}
+
+template <>
+class Vectorized<BFloat16>: public Vectorized16<BFloat16> {
+public:
+  using Vectorized16::Vectorized16;
+
+  Vectorized<BFloat16> frac() const;
+
+  Vectorized<BFloat16> eq(const Vectorized<BFloat16>& other) const;
+  Vectorized<BFloat16> ne(const Vectorized<BFloat16>& other) const;
+  Vectorized<BFloat16> gt(const Vectorized<BFloat16>& other) const;
+  Vectorized<BFloat16> ge(const Vectorized<BFloat16>& other) const;
+  Vectorized<BFloat16> lt(const Vectorized<BFloat16>& other) const;
+  Vectorized<BFloat16> le(const Vectorized<BFloat16>& other) const;
+};
+
+Vectorized inline operator+(const Vectorized& a, const Vectorized& b) {
+  return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { return _mm256_add_ps(x, y); });
+}
+Vectorized inline operator-(const Vectorized& a, const Vectorized& b) {
+  return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { return _mm256_sub_ps(x, y); });
+}
+Vectorized inline operator*(const Vectorized& a, const Vectorized& b) {
+  return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { return _mm256_mul_ps(x, y); });
+}
+Vectorized inline operator/(const Vectorized& a, const Vectorized& b) {
+  return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { return _mm256_div_ps(x, y); });
+}
+Vectorized inline operator&(const Vectorized& a, const Vectorized& b) {
+  return _mm256_and_si256(a, b);
+}
+Vectorized inline operator|(const Vectorized& a, const Vectorized& b) {
+  return _mm256_or_si256(a, b);
+}
+Vectorized inline operator^(const Vectorized& a, const Vectorized& b) {
+  return _mm256_xor_si256(a, b);
+}
+
+inline Vectorized Vectorized::eq(const Vectorized& other) const {
+  return (*this == other) & Vectorized(1.0f);
+}
+inline Vectorized Vectorized::ne(const Vectorized& other) const {
+  return (*this != other) & Vectorized(1.0f);
+}
+inline Vectorized Vectorized::gt(const Vectorized& other) const {
+  return (*this > other) & Vectorized(1.0f);
+}
+inline Vectorized Vectorized::ge(const Vectorized& other) const {
+  return (*this >= other) & Vectorized(1.0f);
+}
+inline Vectorized Vectorized::lt(const Vectorized& other) const {
+  return (*this < other) & Vectorized(1.0f);
+}
+inline Vectorized Vectorized::le(const Vectorized& other) const {
+  return (*this <= other) & Vectorized(1.0f);
+}
+
+// frac. Implement this here so we can use subtraction
+inline Vectorized Vectorized::frac() const {
+  return *this - this->trunc();
+}
+
+// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if
+// either input is a NaN.
+template <>
+Vectorized inline maximum(const Vectorized& a, const Vectorized& b) {
+  __m256 a_lo, a_hi;
+  __m256 b_lo, b_hi;
+  cvtbf16_fp32(__m256i(a), a_lo, a_hi);
+  cvtbf16_fp32(__m256i(b), b_lo, b_hi);
+  auto max_lo = _mm256_max_ps(a_lo, b_lo);
+  auto max_hi = _mm256_max_ps(a_hi, b_hi);
+  auto nan_lo = _mm256_cmp_ps(a_lo, b_lo, _CMP_UNORD_Q);
+  auto nan_hi = _mm256_cmp_ps(a_hi, b_hi, _CMP_UNORD_Q);
+  // Exploit the fact that all-ones is a NaN.
+  auto o1 = _mm256_or_ps(max_lo, nan_lo);
+  auto o2 = _mm256_or_ps(max_hi, nan_hi);
+  return cvtfp32_bf16(o1, o2);
+}
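+// Note on maximum above: where either input is NaN, the _CMP_UNORD_Q mask is
+// all ones in that lane; OR-ing the mask into the result forces an all-ones
+// (NaN) bit pattern there, which is how the NaN propagation is achieved.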
+
+// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if
+// either input is a NaN.
+template <>
+Vectorized inline minimum(const Vectorized& a, const Vectorized& b) {
+  __m256 a_lo, a_hi;
+  __m256 b_lo, b_hi;
+  cvtbf16_fp32(__m256i(a), a_lo, a_hi);
+  cvtbf16_fp32(__m256i(b), b_lo, b_hi);
+  auto min_lo = _mm256_min_ps(a_lo, b_lo);
+  auto min_hi = _mm256_min_ps(a_hi, b_hi);
+  auto nan_lo = _mm256_cmp_ps(a_lo, b_lo, _CMP_UNORD_Q);
+  auto nan_hi = _mm256_cmp_ps(a_hi, b_hi, _CMP_UNORD_Q);
+  // Exploit the fact that all-ones is a NaN.
+  auto o1 = _mm256_or_ps(min_lo, nan_lo);
+  auto o2 = _mm256_or_ps(min_hi, nan_hi);
+  return cvtfp32_bf16(o1, o2);
+}
+
+template <>
+Vectorized inline clamp(const Vectorized& a,
+    const Vectorized& min, const Vectorized& max) {
+  __m256 a_lo, a_hi;
+  __m256 min_lo, min_hi;
+  __m256 max_lo, max_hi;
+  cvtbf16_fp32(__m256i(a), a_lo, a_hi);
+  cvtbf16_fp32(__m256i(min), min_lo, min_hi);
+  cvtbf16_fp32(__m256i(max), max_lo, max_hi);
+  auto o1 = _mm256_min_ps(max_lo, _mm256_max_ps(min_lo, a_lo));
+  auto o2 = _mm256_min_ps(max_hi, _mm256_max_ps(min_hi, a_hi));
+  return cvtfp32_bf16(o1, o2);
+}
+
+template <>
+Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max) {
+  __m256 a_lo, a_hi;
+  __m256 max_lo, max_hi;
+  cvtbf16_fp32(__m256i(a), a_lo, a_hi);
+  cvtbf16_fp32(__m256i(max), max_lo, max_hi);
+  auto o1 = _mm256_min_ps(max_lo, a_lo);
+  auto o2 = _mm256_min_ps(max_hi, a_hi);
+  return cvtfp32_bf16(o1, o2);
+}
+
+template <>
+Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min) {
+  __m256 a_lo, a_hi;
+  __m256 min_lo, min_hi;
+  cvtbf16_fp32(__m256i(a), a_lo, a_hi);
+  cvtbf16_fp32(__m256i(min), min_lo, min_hi);
+  auto o1 = _mm256_max_ps(min_lo, a_lo);
+  auto o2 = _mm256_max_ps(min_hi, a_hi);
+  return cvtfp32_bf16(o1, o2);
+}
+
+template <>
+inline void convert(const BFloat16* src, BFloat16* dst, int64_t n) {
+  int64_t i;
+#pragma unroll
+  for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) {
+    auto vsrc = _mm256_loadu_si256(reinterpret_cast<__m256i*>((void*)(src + i)));
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>((void*)(dst + i)), vsrc);
+  }
+#pragma unroll
+  for (; i < n; i++) {
+    dst[i] = src[i];
+  }
+}
+
+template <>
+inline void convert(const float* src, BFloat16* dst, int64_t n) {
+  int64_t i;
+  for (i = 0; i + Vectorized::size() <= n; i += Vectorized::size()) {
+    __m256 a = _mm256_loadu_ps(&src[i]);
+    __m256 b = _mm256_loadu_ps(&src[i + 8]);
+
+    __m256i bf = cvtfp32_bf16(a, b);
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dst[i]), bf);
+  }
+  for (; i < n; i++) {
+    dst[i] = c10::convert(src[i]);
+  }
+}
+
+template <>
+inline void convert(const double* src, BFloat16* dst, int64_t n) {
+  auto load_float = [](const double *src) -> __m256 {
+    // Load one float vector from an array of doubles
+    __m128 a = _mm256_cvtpd_ps(_mm256_loadu_pd(src));
+    __m128 b = _mm256_cvtpd_ps(_mm256_loadu_pd(src + 4));
+    return _mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1);
+  };
+
+  int64_t i;
+  for (i = 0; i + Vectorized::size() <= n; i += Vectorized::size()) {
+    __m256 a = load_float(&src[i]);
+    __m256 b = load_float(&src[i + 8]);
+
+    __m256i bf = cvtfp32_bf16(a, b);
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dst[i]), bf);
+  }
+  for (; i < n; i++) {
+    dst[i] = c10::convert(src[i]);
+  }
+}
+
+template <>
+Vectorized inline fmadd(const Vectorized& a,
+    const Vectorized& b, const Vectorized& c) {
+  __m256 a_lo, a_hi;
+  __m256 b_lo, b_hi;
+  __m256 c_lo, c_hi;
+  cvtbf16_fp32(__m256i(a), a_lo, a_hi);
+  cvtbf16_fp32(__m256i(b), b_lo, b_hi);
+  cvtbf16_fp32(__m256i(c), c_lo, c_hi);
+  auto o1 = _mm256_fmadd_ps(a_lo, b_lo, c_lo);
+  auto o2 = _mm256_fmadd_ps(a_hi, b_hi, c_hi);
+  return cvtfp32_bf16(o1, o2);
+}
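+// Note on fmadd above: the operands are widened to fp32 and combined with a
+// fused multiply-add, so the only roundings are the single fp32 FMA rounding
+// and the final conversion back to bf16.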
+
+template <>
+class Vectorized<Half>: public Vectorized16<Half> {
+public:
+  using Vectorized16::Vectorized16;
+
+  Vectorized<Half> frac() const;
+
+  Vectorized<Half> eq(const Vectorized<Half>& other) const;
+  Vectorized<Half> ne(const Vectorized<Half>& other) const;
+  Vectorized<Half> gt(const Vectorized<Half>& other) const;
+  Vectorized<Half> ge(const Vectorized<Half>& other) const;
+  Vectorized<Half> lt(const Vectorized<Half>& other) const;
+  Vectorized<Half> le(const Vectorized<Half>& other) const;
+};
+
+Vectorized inline operator+(const Vectorized& a, const Vectorized& b) {
+  return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { return _mm256_add_ps(x, y); });
+}
+Vectorized inline operator-(const Vectorized& a, const Vectorized& b) {
+  return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { return _mm256_sub_ps(x, y); });
+}
+Vectorized inline operator*(const Vectorized& a, const Vectorized& b) {
+  return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { return _mm256_mul_ps(x, y); });
+}
+Vectorized inline operator/(const Vectorized& a, const Vectorized& b) {
+  return binary_op_as_fp32(a, b, [](const __m256& x, const __m256& y) { return _mm256_div_ps(x, y); });
+}
+Vectorized inline operator&(const Vectorized& a, const Vectorized& b) {
+  return _mm256_and_si256(a, b);
+}
+Vectorized inline operator|(const Vectorized& a, const Vectorized& b) {
+  return _mm256_or_si256(a, b);
+}
+Vectorized inline operator^(const Vectorized& a, const Vectorized& b) {
+  return _mm256_xor_si256(a, b);
+}
+
+inline Vectorized Vectorized::eq(const Vectorized& other) const {
+  return (*this == other) & Vectorized(1.0f);
+}
+inline Vectorized Vectorized::ne(const Vectorized& other) const {
+  return (*this != other) & Vectorized(1.0f);
+}
+inline Vectorized Vectorized::gt(const Vectorized& other) const {
+  return (*this > other) & Vectorized(1.0f);
+}
+inline Vectorized Vectorized::ge(const Vectorized& other) const {
+  return (*this >= other) & Vectorized(1.0f);
+}
+inline Vectorized Vectorized::lt(const Vectorized& other) const {
+  return (*this < other) & Vectorized(1.0f);
+}
+inline Vectorized Vectorized::le(const Vectorized& other) const {
+  return (*this <= other) & Vectorized(1.0f);
+}
+
+// frac. Implement this here so we can use subtraction
+inline Vectorized Vectorized::frac() const {
+  return *this - this->trunc();
+}
+
+// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if
+// either input is a NaN.
+template <>
+Vectorized inline maximum(const Vectorized& a, const Vectorized& b) {
+  __m256 a_lo, a_hi;
+  __m256 b_lo, b_hi;
+  cvtfp16_fp32(__m256i(a), a_lo, a_hi);
+  cvtfp16_fp32(__m256i(b), b_lo, b_hi);
+  auto max_lo = _mm256_max_ps(a_lo, b_lo);
+  auto max_hi = _mm256_max_ps(a_hi, b_hi);
+  auto nan_lo = _mm256_cmp_ps(a_lo, b_lo, _CMP_UNORD_Q);
+  auto nan_hi = _mm256_cmp_ps(a_hi, b_hi, _CMP_UNORD_Q);
+  // Exploit the fact that all-ones is a NaN.
+  auto o1 = _mm256_or_ps(max_lo, nan_lo);
+  auto o2 = _mm256_or_ps(max_hi, nan_hi);
+  return cvtfp32_fp16(o1, o2);
+}
+
+// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if
+// either input is a NaN.
+template <>
+Vectorized inline minimum(const Vectorized& a, const Vectorized& b) {
+  __m256 a_lo, a_hi;
+  __m256 b_lo, b_hi;
+  cvtfp16_fp32(__m256i(a), a_lo, a_hi);
+  cvtfp16_fp32(__m256i(b), b_lo, b_hi);
+  auto min_lo = _mm256_min_ps(a_lo, b_lo);
+  auto min_hi = _mm256_min_ps(a_hi, b_hi);
+  auto nan_lo = _mm256_cmp_ps(a_lo, b_lo, _CMP_UNORD_Q);
+  auto nan_hi = _mm256_cmp_ps(a_hi, b_hi, _CMP_UNORD_Q);
+  // Exploit the fact that all-ones is a NaN.
+  auto o1 = _mm256_or_ps(min_lo, nan_lo);
+  auto o2 = _mm256_or_ps(min_hi, nan_hi);
+  return cvtfp32_fp16(o1, o2);
+}
+
+template <>
+Vectorized inline clamp(const Vectorized& a,
+    const Vectorized& min, const Vectorized& max) {
+  __m256 a_lo, a_hi;
+  __m256 min_lo, min_hi;
+  __m256 max_lo, max_hi;
+  cvtfp16_fp32(__m256i(a), a_lo, a_hi);
+  cvtfp16_fp32(__m256i(min), min_lo, min_hi);
+  cvtfp16_fp32(__m256i(max), max_lo, max_hi);
+  auto o1 = _mm256_min_ps(max_lo, _mm256_max_ps(min_lo, a_lo));
+  auto o2 = _mm256_min_ps(max_hi, _mm256_max_ps(min_hi, a_hi));
+  return cvtfp32_fp16(o1, o2);
+}
+
+template <>
+Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max) {
+  __m256 a_lo, a_hi;
+  __m256 max_lo, max_hi;
+  cvtfp16_fp32(__m256i(a), a_lo, a_hi);
+  cvtfp16_fp32(__m256i(max), max_lo, max_hi);
+  auto o1 = _mm256_min_ps(max_lo, a_lo);
+  auto o2 = _mm256_min_ps(max_hi, a_hi);
+  return cvtfp32_fp16(o1, o2);
+}
+
+template <>
+Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min) {
+  __m256 a_lo, a_hi;
+  __m256 min_lo, min_hi;
+  cvtfp16_fp32(__m256i(a), a_lo, a_hi);
+  cvtfp16_fp32(__m256i(min), min_lo, min_hi);
+  auto o1 = _mm256_max_ps(min_lo, a_lo);
+  auto o2 = _mm256_max_ps(min_hi, a_hi);
+  return cvtfp32_fp16(o1, o2);
+}
+
+template <>
+inline void convert(const Half* src, Half* dst, int64_t n) {
+  int64_t i;
+#pragma unroll
+  for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) {
+    auto vsrc = _mm256_loadu_si256(reinterpret_cast<__m256i*>((void*)(src + i)));
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>((void*)(dst + i)), vsrc);
+  }
+#pragma unroll
+  for (; i < n; i++) {
+    dst[i] = src[i];
+  }
+}
+
+template <>
+inline void convert(const float* src, Half* dst, int64_t n) {
+  int64_t i;
+  for (i = 0; i + Vectorized::size() <= n; i += Vectorized::size()) {
+    __m256 a = _mm256_loadu_ps(&src[i]);
+    __m256 b = _mm256_loadu_ps(&src[i + 8]);
+
+    __m256i c = cvtfp32_fp16(a, b);
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dst[i]), c);
+  }
+  for (; i < n; i++) {
+    dst[i] = c10::convert(src[i]);
+  }
+}
+
+template <>
+inline void convert(const double* src, Half* dst, int64_t n) {
+  auto load_float = [](const double *src) -> __m256 {
+    // Load one float vector from an array of doubles
+    __m128 a = _mm256_cvtpd_ps(_mm256_loadu_pd(src));
+    __m128 b = _mm256_cvtpd_ps(_mm256_loadu_pd(src + 4));
+    return _mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1);
+  };
+
+  int64_t i;
+  for (i = 0; i + Vectorized::size() <= n; i += Vectorized::size()) {
+    __m256 a = load_float(&src[i]);
+    __m256 b = load_float(&src[i + 8]);
+
+    __m256i c = cvtfp32_fp16(a, b);
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dst[i]), c);
+  }
+  for (; i < n; i++) {
+    dst[i] = c10::convert(src[i]);
+  }
+}
+
+template <>
+Vectorized inline fmadd(const Vectorized& a,
+    const Vectorized& b, const Vectorized& c) {
+  __m256 a_lo, a_hi;
+  __m256 b_lo, b_hi;
+  __m256 c_lo, c_hi;
+  cvtfp16_fp32(__m256i(a), a_lo, a_hi);
+  cvtfp16_fp32(__m256i(b), b_lo, b_hi);
+  cvtfp16_fp32(__m256i(c), c_lo, c_hi);
+  auto o1 = _mm256_fmadd_ps(a_lo, b_lo, c_lo);
+  auto o2 = _mm256_fmadd_ps(a_hi, b_hi, c_hi);
+  return cvtfp32_fp16(o1, o2);
+}
+
+#define CONVERT_VECTORIZED_INIT(type, name) \
+inline std::tuple<Vectorized<float>, Vectorized<float>> convert_##name##_float(const Vectorized<type>& a) { \
+  __m256 o1, o2; \
+  cvt_to_fp32<type>(__m256i(a), o1, o2); \
+  return std::make_tuple(o1, o2); \
+} \
+inline Vectorized<type> convert_float_##name(const Vectorized<float>& a, const Vectorized<float>& b) { \
+  return cvt_from_fp32<type>(__m256(a), __m256(b)); \
+}
+CONVERT_VECTORIZED_INIT(BFloat16, bfloat16);
+CONVERT_VECTORIZED_INIT(Half, half);
+
+#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
+
+#define CONVERT_NON_VECTORIZED_INIT(type, name) \
+inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \
+  constexpr int64_t K = Vectorized::size(); \
+  __at_align__ float arr[K]; \
+  __at_align__ type arr2[K]; \
+  a.store(arr2); \
+  convert(arr2, arr, K); \
+  return std::make_tuple( \
+      Vectorized::loadu(arr), \
+      Vectorized::loadu(arr + Vectorized::size())); \
+} \
+inline Vectorized convert_float_##name(const Vectorized& a, const Vectorized& b) { \
+  constexpr int64_t K = Vectorized::size(); \
+  __at_align__ float arr[K]; \
+  __at_align__ type arr2[K]; \
+  a.store(arr); \
+  b.store(arr + Vectorized::size()); \
+  convert(arr, arr2, K); \
+  return Vectorized::loadu(arr2); \
+}
+CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16);
+CONVERT_NON_VECTORIZED_INIT(Half, half);
+
+#endif // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
+
+#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
+#define LOAD_FP32_VECTORIZED_INIT(type, name) \
+inline void load_fp32_from_##name(const type *data, Vectorized<float>& out) { \
+  auto values = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data)); \
+  __m256 out_values; \
+  cvt_to_fp32<type>(values, out_values); \
+  out = out_values; \
+} \
+\
+inline void load_fp32_from_##name(const type *data, Vectorized<float>& out1, Vectorized<float>& out2) { \
+  auto vec = Vectorized<type>::loadu(data); \
+  __m256 out1_values, out2_values; \
+  cvt_to_fp32<type>(vec, out1_values, out2_values); \
+  out1 = out1_values; \
+  out2 = out2_values; \
+}
+LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16);
+LOAD_FP32_VECTORIZED_INIT(Half, fp16);
+
+#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
+#define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \
+inline void load_fp32_from_##name(const type *data, Vectorized& out) { \
+  __at_align__ float values[Vectorized::size()]; \
+  for (const auto k : c10::irange(Vectorized::size())) { \
+    values[k] = data[k]; \
+  } \
+  out = Vectorized::loadu(values); \
+} \
+\
+inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vectorized& out2) { \
+  load_fp32_from_##name(data, out1); \
+  data += Vectorized::size(); \
+  load_fp32_from_##name(data, out2); \
+}
+LOAD_FP32_NON_VECTORIZED_INIT(BFloat16, bf16);
+LOAD_FP32_NON_VECTORIZED_INIT(Half, fp16);
+
+#endif
+}} // namespace at::vec::CPU_CAPABILITY
+
+#pragma GCC diagnostic pop
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_complex_double.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_complex_double.h
new file mode 100644
index 0000000000000000000000000000000000000000..a095d00637a245a64c7cfb51485dc552d5b65c60
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_complex_double.h
@@ -0,0 +1,431 @@
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <c10/util/complex.h>
+#include <c10/util/irange.h>
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+
+#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
+#include <sleef.h>
+#endif
+
+namespace at::vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
+
+template <> class Vectorized<c10::complex<double>> {
+private:
+  __m256d values;
+public:
+  using value_type = c10::complex<double>;
+  using size_type = int;
+  static constexpr size_type size() {
+    return 2;
+  }
+  Vectorized() {}
+  Vectorized(__m256d v) : values(v) {}
+  Vectorized(c10::complex<double> val) {
+    double real_value = val.real();
+    double imag_value = val.imag();
+    values = _mm256_setr_pd(real_value, imag_value,
+                            real_value, imag_value);
+  }
+  Vectorized(c10::complex<double> val1, c10::complex<double> val2) {
+    values = _mm256_setr_pd(val1.real(), val1.imag(),
+                            val2.real(), val2.imag());
+  }
+  operator __m256d() const {
+    return values;
+  }
+  template 
+  static Vectorized> blend(const Vectorized>& a, const Vectorized>& b) {
+     // convert c10::complex index mask to V index mask: xy -> xxyy
+    static_assert (mask > -1 && mask < 4, "Unexpected mask value");
+    switch (mask) {
+      case 0:
+        return a;
+      case 1:
+        return _mm256_blend_pd(a.values, b.values, 0x03);
+      case 2:
+        return _mm256_blend_pd(a.values, b.values, 0x0c);
+      case 3: break;
+    }
+    return b;
+  }
+  static Vectorized> blendv(const Vectorized>& a, const Vectorized>& b,
+                               const Vectorized>& mask) {
+    // convert c10::complex index mask to V index mask: xy -> xxyy
+    auto mask_ = _mm256_unpacklo_pd(mask.values, mask.values);
+    return _mm256_blendv_pd(a.values, b.values, mask_);
+
+  }
+  template
+  static Vectorized> arange(c10::complex base = 0., step_t step = static_cast(1)) {
+    return Vectorized>(base,
+                                        base + step);
+  }
+  static Vectorized> set(const Vectorized>& a, const Vectorized>& b,
+                            int64_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+    }
+    return b;
+  }
+  static Vectorized> loadu(const void* ptr, int64_t count = size()) {
+    if (count == size())
+      return _mm256_loadu_pd(reinterpret_cast(ptr));
+
+    __at_align__ double tmp_values[2*size()];
+    // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
+    // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
+    // instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(2*size())) {
+      tmp_values[i] = 0.0;
+    }
+    std::memcpy(
+        tmp_values,
+        reinterpret_cast(ptr),
+        count * sizeof(c10::complex));
+    return _mm256_load_pd(tmp_values);
+  }
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      _mm256_storeu_pd(reinterpret_cast(ptr), values);
+    } else if (count > 0) {
+      double tmp_values[2*size()];
+      _mm256_storeu_pd(reinterpret_cast(tmp_values), values);
+      std::memcpy(ptr, tmp_values, count * sizeof(c10::complex));
+    }
+  }
+  const c10::complex& operator[](int idx) const  = delete;
+  c10::complex& operator[](int idx) = delete;
+  Vectorized> map(c10::complex (*const f)(const c10::complex &)) const {
+    __at_align__ c10::complex tmp[size()];
+    store(tmp);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = f(tmp[i]);
+    }
+    return loadu(tmp);
+  }
+  __m256d abs_2_() const {
+    auto val_2 = _mm256_mul_pd(values, values);     // a*a     b*b
+    return _mm256_hadd_pd(val_2, val_2);            // a*a+b*b a*a+b*b
+  }
+  __m256d abs_() const {
+    auto real = _mm256_movedup_pd(values);       // real real
+    // movehdup_pd does not exist...
+    auto imag = _mm256_permute_pd(values, 0xf);  // imag imag
+    return Sleef_hypotd4_u05(real, imag);        // abs  abs
+  }
+  Vectorized> abs() const {
+    const __m256d real_mask = _mm256_castsi256_pd(_mm256_setr_epi64x(0xFFFFFFFFFFFFFFFF, 0x0000000000000000,
+                                                                     0xFFFFFFFFFFFFFFFF, 0x0000000000000000));
+    return _mm256_and_pd(abs_(), real_mask);        // abs     0
+  }
+  __m256d angle_() const {
+    //angle = atan2(b/a)
+    auto b_a = _mm256_permute_pd(values, 0x05);     // b        a
+    return Sleef_atan2d4_u10(values, b_a);          // 90-angle angle
+  }
+  Vectorized> angle() const {
+    const __m256d real_mask = _mm256_castsi256_pd(_mm256_setr_epi64x(0xFFFFFFFFFFFFFFFF, 0x0000000000000000,
+                                                                     0xFFFFFFFFFFFFFFFF, 0x0000000000000000));
+    auto angle = _mm256_permute_pd(angle_(), 0x05); // angle    90-angle
+    return _mm256_and_pd(angle, real_mask);         // angle    0
+  }
+  Vectorized> sgn() const {
+    auto abs = abs_();
+    auto zero = _mm256_setzero_pd();
+    auto mask = _mm256_cmp_pd(abs, zero, _CMP_EQ_OQ);
+    auto div = values / abs;
+    return _mm256_blendv_pd(div, zero, mask);
+  }
+  __m256d real_() const {
+    const __m256d real_mask = _mm256_castsi256_pd(_mm256_setr_epi64x(0xFFFFFFFFFFFFFFFF, 0x0000000000000000,
+                                                                     0xFFFFFFFFFFFFFFFF, 0x0000000000000000));
+    return _mm256_and_pd(values, real_mask);
+  }
+  Vectorized> real() const {
+    return real_();
+  }
+  __m256d imag_() const {
+    const __m256d imag_mask = _mm256_castsi256_pd(_mm256_setr_epi64x(0x0000000000000000, 0xFFFFFFFFFFFFFFFF,
+                                                                     0x0000000000000000, 0xFFFFFFFFFFFFFFFF));
+    return _mm256_and_pd(values, imag_mask);
+  }
+  Vectorized> imag() const {
+    return _mm256_permute_pd(imag_(), 0x05);           //b        a
+  }
+  __m256d conj_() const {
+    const __m256d sign_mask = _mm256_setr_pd(0.0, -0.0, 0.0, -0.0);
+    return _mm256_xor_pd(values, sign_mask);           // a       -b
+  }
+  Vectorized> conj() const {
+    return conj_();
+  }
+  Vectorized> log() const {
+    // Most trigonomic ops use the log() op to improve complex number performance.
+    return map(std::log);
+  }
+  Vectorized> log2() const {
+    const __m256d log2_ = _mm256_set1_pd(std::log(2));
+    return _mm256_div_pd(log(), log2_);
+  }
+  Vectorized> log10() const {
+    const __m256d log10_ = _mm256_set1_pd(std::log(10));
+    return _mm256_div_pd(log(), log10_);
+  }
+  Vectorized> log1p() const {
+    return map(std::log1p);
+  }
+  Vectorized> asin() const {
+    // asin(x)
+    // = -i*ln(iz + sqrt(1 -z^2))
+    // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi)))
+    // = -i*ln((-b + ai) + sqrt(1 - (a**2 - b**2) - 2*abi))
+    const __m256d one = _mm256_set1_pd(1);
+
+    auto conj = conj_();
+    auto b_a = _mm256_permute_pd(conj, 0x05);                         //-b        a
+    auto ab = _mm256_mul_pd(conj, b_a);                               //-ab       -ab
+    auto im = _mm256_add_pd(ab, ab);                                  //-2ab      -2ab
+
+    auto val_2 = _mm256_mul_pd(values, values);                       // a*a      b*b
+    auto re = _mm256_hsub_pd(val_2, _mm256_permute_pd(val_2, 0x05));  // a*a-b*b  b*b-a*a
+    re = _mm256_sub_pd(one, re);
+
+    auto root = Vectorized(_mm256_blend_pd(re, im, 0x0A)).sqrt();         //sqrt(re + i*im)
+    auto ln = Vectorized(_mm256_add_pd(b_a, root)).log();                 //ln(iz + sqrt())
+    return Vectorized(_mm256_permute_pd(ln.values, 0x05)).conj();         //-i*ln()
+  }
+  Vectorized> acos() const {
+    // acos(x) = pi/2 - asin(x)
+    constexpr auto pi_2d = c10::pi / 2;
+    const __m256d pi_2 = _mm256_setr_pd(pi_2d, 0.0, pi_2d, 0.0);
+    return _mm256_sub_pd(pi_2, asin());
+  }
+  Vectorized> atan() const;
+  Vectorized> atanh() const {
+    return map(std::atanh);
+  }
+  Vectorized> exp() const {
+    //exp(a + bi)
+    // = exp(a)*(cos(b) + sin(b)i)
+    auto exp = Sleef_expd4_u10(values);                               //exp(a)           exp(b)
+    exp = _mm256_blend_pd(exp, _mm256_permute_pd(exp, 0x05), 0x0A);   //exp(a)           exp(a)
+
+    auto sin_cos = Sleef_sincosd4_u10(values);                        //[sin(a), cos(a)] [sin(b), cos(b)]
+    auto cos_sin = _mm256_blend_pd(_mm256_permute_pd(sin_cos.y, 0x05),
+                                   sin_cos.x, 0x0A);                  //cos(b)           sin(b)
+    return _mm256_mul_pd(exp, cos_sin);
+  }
+  Vectorized> exp2() const {
+    // Use identity 2**x = exp(log(2) * x)
+    const __m256d ln_2 = _mm256_set1_pd(c10::ln_2);
+    Vectorized> scaled_values = _mm256_mul_pd(values, ln_2);
+    return scaled_values.exp();
+  }
+  Vectorized> expm1() const {
+    return map(std::expm1);
+  }
+  Vectorized> sin() const {
+    return map(std::sin);
+  }
+  Vectorized> sinh() const {
+    return map(std::sinh);
+  }
+  Vectorized> cos() const {
+    return map(std::cos);
+  }
+  Vectorized> cosh() const {
+    return map(std::cosh);
+  }
+  Vectorized> ceil() const {
+    return _mm256_ceil_pd(values);
+  }
+  Vectorized> floor() const {
+    return _mm256_floor_pd(values);
+  }
+  Vectorized> neg() const {
+    auto zero = _mm256_setzero_pd();
+    return _mm256_sub_pd(zero, values);
+  }
+  Vectorized> round() const {
+    return _mm256_round_pd(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+  }
+  Vectorized> tan() const {
+    return map(std::tan);
+  }
+  Vectorized> tanh() const {
+    return map(std::tanh);
+  }
+  Vectorized> trunc() const {
+    return _mm256_round_pd(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
+  }
+  Vectorized> sqrt() const {
+    return map(std::sqrt);
+  }
+  Vectorized> reciprocal() const;
+  Vectorized> rsqrt() const {
+    return sqrt().reciprocal();
+  }
+  Vectorized> pow(const Vectorized> &exp) const {
+    __at_align__ c10::complex x_tmp[size()];
+    __at_align__ c10::complex y_tmp[size()];
+    store(x_tmp);
+    exp.store(y_tmp);
+    for (const auto i : c10::irange(size())) {
+      x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]);
+    }
+    return loadu(x_tmp);
+  }
+  // Comparison using the _CMP_**_OQ predicate.
+  //   `O`: get false if an operand is NaN
+  //   `Q`: do not raise if an operand is NaN
+  Vectorized> operator==(const Vectorized>& other) const {
+    return _mm256_cmp_pd(values, other.values, _CMP_EQ_OQ);
+  }
+  Vectorized> operator!=(const Vectorized>& other) const {
+    return _mm256_cmp_pd(values, other.values, _CMP_NEQ_UQ);
+  }
+  Vectorized> operator<(const Vectorized>&) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+  Vectorized> operator<=(const Vectorized>&) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+  Vectorized> operator>(const Vectorized>&) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+  Vectorized> operator>=(const Vectorized>&) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+
+  Vectorized> eq(const Vectorized>& other) const;
+  Vectorized> ne(const Vectorized>& other) const;
+};
+
+template <> Vectorized> inline operator+(const Vectorized> &a, const Vectorized> &b) {
+  return _mm256_add_pd(a, b);
+}
+
+template <> Vectorized> inline operator-(const Vectorized> &a, const Vectorized> &b) {
+  return _mm256_sub_pd(a, b);
+}
+
+template <> Vectorized> inline operator*(const Vectorized> &a, const Vectorized> &b) {
+  //(a + bi)  * (c + di) = (ac - bd) + (ad + bc)i
+  const __m256d sign_mask = _mm256_setr_pd(0.0, -0.0, 0.0, -0.0);
+  auto ac_bd = _mm256_mul_pd(a, b);         //ac       bd
+
+  auto d_c = _mm256_permute_pd(b, 0x05);    //d        c
+  d_c = _mm256_xor_pd(sign_mask, d_c);      //d       -c
+  auto ad_bc = _mm256_mul_pd(a, d_c);       //ad      -bc
+
+  auto ret = _mm256_hsub_pd(ac_bd, ad_bc);  //ac - bd  ad + bc
+  return ret;
+}
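+// Worked example for the multiply above (scalar sketch): (1+2i) * (3+4i)
+//   ac_bd = (1*3, 2*4)  = (3,  8)  -> hsub: 3 - 8    = -5  (real part)
+//   ad_bc = (1*4, 2*-3) = (4, -6)  -> hsub: 4 - (-6) = 10  (imag part)
+// which matches (1+2i)(3+4i) = -5 + 10i.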
+
+template <> Vectorized> inline operator/(const Vectorized> &a, const Vectorized> &b) {
+  //re + im*i = (a + bi)  / (c + di)
+  auto mask = _mm256_set1_pd(-0.f);
+  auto fabs_cd = _mm256_andnot_pd(mask, b);     // |c|    |d|
+  auto fabs_dc = _mm256_permute_pd(fabs_cd, 0x05);   // |d|    |c|
+  auto scale = _mm256_div_pd(_mm256_set1_pd(1.0f), _mm256_max_pd(fabs_cd, fabs_dc));  // 1/sc     1/sc
+  auto a2 = _mm256_mul_pd(a, scale);         // a/sc     b/sc
+  auto b2 = _mm256_mul_pd(b, scale);         // c/sc     d/sc
+  auto acbd2 = _mm256_mul_pd(a2, b2);
+
+  const __m256d sign_mask = _mm256_setr_pd(-0.0, 0.0, -0.0, 0.0);
+  auto dc2 = _mm256_permute_pd(b2, 0x05);    // d/sc         c/sc
+  dc2 = _mm256_xor_pd(sign_mask, dc2);       // -d/|c,d|        c/sc
+  auto adbc2 = _mm256_mul_pd(a2, dc2);       //-ad/sc^2      bc/sc^2
+  auto res2 = _mm256_hadd_pd(acbd2, adbc2);  //(ac+bd)/sc^2  (bc-ad)/sc^2
+
+  // get the denominator
+  auto denom2 = Vectorized>(b2).abs_2_();  // (c^2+d^2)/sc^2   (c^2+d^2)/sc^2
+  res2 = _mm256_div_pd(res2, denom2);
+  return res2;
+}
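+// Note on the division above: both operands are pre-scaled by 1/max(|c|, |d|)
+// so the intermediate products and the squared denominator stay representable
+// (avoiding spurious overflow/underflow); the quotient itself is unchanged
+// because numerator and denominator are scaled by the same factor.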
+
+// reciprocal. Implement this here so we can use multiplication.
+inline Vectorized> Vectorized>::reciprocal() const{
+  //re + im*i = (a + bi)  / (c + di)
+  //re = (ac + bd)/abs_2() = c/abs_2()
+  //im = (bc - ad)/abs_2() = -d/abs_2()
+  const __m256d sign_mask = _mm256_setr_pd(0.0, -0.0, 0.0, -0.0);
+  auto c_d = _mm256_xor_pd(sign_mask, values);    //c       -d
+  return _mm256_div_pd(c_d, abs_2_());
+}
+
+inline Vectorized> Vectorized>::atan() const {
+  // atan(x) = i/2 * ln((i + z)/(i - z))
+  const __m256d i = _mm256_setr_pd(0.0, 1.0, 0.0, 1.0);
+  const Vectorized i_half = _mm256_setr_pd(0.0, 0.5, 0.0, 0.5);
+
+  auto sum = Vectorized(_mm256_add_pd(i, values));                      // a        1+b
+  auto sub = Vectorized(_mm256_sub_pd(i, values));                      // -a       1-b
+  auto ln = (sum/sub).log();                                        // ln((i + z)/(i - z))
+  return i_half*ln;                                                 // i/2*ln()
+}
+
+template <>
+Vectorized> inline maximum(const Vectorized>& a, const Vectorized>& b) {
+  auto abs_a = a.abs_2_();
+  auto abs_b = b.abs_2_();
+  auto mask = _mm256_cmp_pd(abs_a, abs_b, _CMP_LT_OQ);
+  auto max = _mm256_blendv_pd(a, b, mask);
+  // Exploit the fact that all-ones is a NaN.
+  auto isnan = _mm256_cmp_pd(abs_a, abs_b, _CMP_UNORD_Q);
+  return _mm256_or_pd(max, isnan);
+}
+
+template <>
+Vectorized> inline minimum(const Vectorized>& a, const Vectorized>& b) {
+  auto abs_a = a.abs_2_();
+  auto abs_b = b.abs_2_();
+  auto mask = _mm256_cmp_pd(abs_a, abs_b, _CMP_GT_OQ);
+  auto min = _mm256_blendv_pd(a, b, mask);
+  // Exploit the fact that all-ones is a NaN.
+  auto isnan = _mm256_cmp_pd(abs_a, abs_b, _CMP_UNORD_Q);
+  return _mm256_or_pd(min, isnan);
+}
+
+template <>
+Vectorized> inline operator&(const Vectorized>& a, const Vectorized>& b) {
+  return _mm256_and_pd(a, b);
+}
+
+template <>
+Vectorized> inline operator|(const Vectorized>& a, const Vectorized>& b) {
+  return _mm256_or_pd(a, b);
+}
+
+template <>
+Vectorized> inline operator^(const Vectorized>& a, const Vectorized>& b) {
+  return _mm256_xor_pd(a, b);
+}
+
+inline Vectorized> Vectorized>::eq(const Vectorized>& other) const {
+  auto eq = (*this == other);  // compares real and imag individually
+  // If both real numbers and imag numbers are equal, then the complex numbers are equal
+  return (eq.real() & eq.imag()) & Vectorized>(_mm256_set1_pd(1.0));
+}
+
+inline Vectorized> Vectorized>::ne(const Vectorized>& other) const {
+  auto ne = (*this != other);  // compares real and imag individually
+  // If either real numbers or imag numbers are not equal, then the complex numbers are not equal
+  return (ne.real() | ne.imag()) & Vectorized>(_mm256_set1_pd(1.0));
+}
+
+#endif
+
+}} // namespace at::vec::CPU_CAPABILITY
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_complex_float.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_complex_float.h
new file mode 100644
index 0000000000000000000000000000000000000000..be44f3e94ad6c74e7f645346ac8bfc72d0441673
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_complex_float.h
@@ -0,0 +1,468 @@
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <c10/util/complex.h>
+#include <c10/util/irange.h>
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
+#include <sleef.h>
+#endif
+
+namespace at::vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
+
+template <> class Vectorized<c10::complex<float>> {
+private:
+  __m256 values;
+public:
+  using value_type = c10::complex<float>;
+  using size_type = int;
+  static constexpr size_type size() {
+    return 4;
+  }
+  Vectorized() {}
+  Vectorized(__m256 v) : values(v) {}
+  Vectorized(c10::complex<float> val) {
+    float real_value = val.real();
+    float imag_value = val.imag();
+    values = _mm256_setr_ps(real_value, imag_value,
+                            real_value, imag_value,
+                            real_value, imag_value,
+                            real_value, imag_value
+                            );
+  }
+  Vectorized(c10::complex<float> val1, c10::complex<float> val2, c10::complex<float> val3, c10::complex<float> val4) {
+    values = _mm256_setr_ps(val1.real(), val1.imag(),
+                            val2.real(), val2.imag(),
+                            val3.real(), val3.imag(),
+                            val4.real(), val4.imag()
+                            );
+  }
+  operator __m256() const {
+    return values;
+  }
+  template 
+  static Vectorized> blend(const Vectorized>& a, const Vectorized>& b) {
+     // convert c10::complex index mask to V index mask: xy -> xxyy
+    static_assert(mask > -1 && mask < 16, "Unexpected mask range");
+    switch (mask) {
+      case 0:
+        return a;
+      case 1:
+        return _mm256_blend_ps(a.values, b.values, 0x03); //b0000 0001 = b0000 0011
+      case 2:
+        return _mm256_blend_ps(a.values, b.values, 0x0C); //b0000 0010 = b0000 1100
+      case 3:
+        return _mm256_blend_ps(a.values, b.values, 0x0F); //b0000 0011 = b0000 1111
+      case 4:
+        return _mm256_blend_ps(a.values, b.values, 0x30); //b0000 0100 = b0011 0000
+      case 5:
+        return _mm256_blend_ps(a.values, b.values, 0x33); //b0000 0101 = b0011 0011
+      case 6:
+        return _mm256_blend_ps(a.values, b.values, 0x3C); //b0000 0110 = b0011 1100
+      case 7:
+        return _mm256_blend_ps(a.values, b.values, 0x3F); //b0000 0111 = b0011 1111
+      case 8:
+        return _mm256_blend_ps(a.values, b.values, 0xC0); //b0000 1000 = b1100 0000
+      case 9:
+        return _mm256_blend_ps(a.values, b.values, 0xC3); //b0000 1001 = b1100 0011
+      case 10:
+        return _mm256_blend_ps(a.values, b.values, 0xCC); //b0000 1010 = b1100 1100
+      case 11:
+        return _mm256_blend_ps(a.values, b.values, 0xCF); //b0000 1011 = b1100 1111
+      case 12:
+        return _mm256_blend_ps(a.values, b.values, 0xF0); //b0000 1100 = b1111 0000
+      case 13:
+        return _mm256_blend_ps(a.values, b.values, 0xF3); //b0000 1101 = b1111 0011
+      case 14:
+        return _mm256_blend_ps(a.values, b.values, 0xFC); //b0000 1110 = b1111 1100
+      default: break;
+    }
+    return b;
+  }
+  static Vectorized> blendv(const Vectorized>& a, const Vectorized>& b,
+                               const Vectorized>& mask) {
+    // convert c10::complex index mask to V index mask: xy -> xxyy
+    auto mask_ = _mm256_unpacklo_ps(mask.values, mask.values);
+    return _mm256_blendv_ps(a.values, b.values, mask_);
+
+  }
+  template
+  static Vectorized> arange(c10::complex base = 0., step_t step = static_cast(1)) {
+    return Vectorized>(base,
+                                        base + step,
+                                        base + c10::complex(2)*step,
+                                        base + c10::complex(3)*step);
+  }
+  static Vectorized> set(const Vectorized>& a, const Vectorized>& b,
+                            int64_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+    }
+    return b;
+  }
+  static Vectorized> loadu(const void* ptr, int64_t count = size()) {
+    if (count == size())
+      return _mm256_loadu_ps(reinterpret_cast(ptr));
+
+    __at_align__ float tmp_values[2*size()];
+    // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
+    // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
+    // instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(2*size())) {
+      tmp_values[i] = 0.0;
+    }
+    std::memcpy(
+        tmp_values,
+        reinterpret_cast<const float*>(ptr),
+        count * sizeof(c10::complex<float>));
+    return _mm256_load_ps(tmp_values);
+  }
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      _mm256_storeu_ps(reinterpret_cast<float*>(ptr), values);
+    } else if (count > 0) {
+      float tmp_values[2*size()];
+      _mm256_storeu_ps(reinterpret_cast<float*>(tmp_values), values);
+      std::memcpy(ptr, tmp_values, count * sizeof(c10::complex<float>));
+    }
+  }
+  const c10::complex<float>& operator[](int idx) const  = delete;
+  c10::complex<float>& operator[](int idx) = delete;
+  Vectorized<c10::complex<float>> map(c10::complex<float> (*const f)(const c10::complex<float> &)) const {
+    __at_align__ c10::complex<float> tmp[size()];
+    store(tmp);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = f(tmp[i]);
+    }
+    return loadu(tmp);
+  }
+  __m256 abs_2_() const {
+    auto val_2 = _mm256_mul_ps(values, values);     // a*a     b*b
+    auto ret = _mm256_hadd_ps(val_2, val_2);        // a*a+b*b a*a+b*b
+    return _mm256_permute_ps(ret, 0xD8);
+  }
+  __m256 abs_() const {
+    auto real = _mm256_moveldup_ps(values);   // real real
+    auto imag = _mm256_movehdup_ps(values);   // imag imag
+    return Sleef_hypotf8_u05(real, imag);     // abs  abs
+  }
+  Vectorized> abs() const {
+    const __m256 real_mask = _mm256_castsi256_ps(_mm256_setr_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000,
+                                                                   0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000));
+    return _mm256_and_ps(abs_(), real_mask);        // abs     0
+  }
+  __m256 angle_() const {
+    //angle = atan2(b/a)
+    auto b_a = _mm256_permute_ps(values, 0xB1);     // b        a
+    return Sleef_atan2f8_u10(values, b_a);          // 90-angle angle
+  }
+  Vectorized> angle() const {
+    const __m256 real_mask = _mm256_castsi256_ps(_mm256_setr_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000,
+                                                                   0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000));
+    auto angle = _mm256_permute_ps(angle_(), 0xB1); // angle    90-angle
+    return _mm256_and_ps(angle, real_mask);         // angle    0
+  }
+  Vectorized> sgn() const {
+    auto abs = abs_();
+    auto zero = _mm256_setzero_ps();
+    auto mask = _mm256_cmp_ps(abs, zero, _CMP_EQ_OQ);
+    auto div = values / abs;
+    return _mm256_blendv_ps(div, zero, mask);
+  }
+  __m256 real_() const {
+    const __m256 real_mask = _mm256_castsi256_ps(_mm256_setr_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000,
+                                                                   0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000));
+    return _mm256_and_ps(values, real_mask);
+  }
+  Vectorized> real() const {
+    return real_();
+  }
+  __m256 imag_() const {
+    const __m256 imag_mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF,
+                                                                   0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF));
+    return _mm256_and_ps(values, imag_mask);
+  }
+  Vectorized> imag() const {
+    return _mm256_permute_ps(imag_(), 0xB1);        //b        a
+  }
+  __m256 conj_() const {
+    const __m256 sign_mask = _mm256_setr_ps(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0);
+    return _mm256_xor_ps(values, sign_mask);        // a       -b
+  }
+  Vectorized> conj() const {
+    return conj_();
+  }
+  Vectorized<c10::complex<float>> log() const {
+    // Most trigonometric ops use the log() op to improve complex number performance.
+    return map(std::log);
+  }
+  Vectorized> log2() const {
+    const __m256 log2_ = _mm256_set1_ps(std::log(2));
+    return _mm256_div_ps(log(), log2_);
+  }
+  Vectorized> log10() const {
+    const __m256 log10_ = _mm256_set1_ps(std::log(10));
+    return _mm256_div_ps(log(), log10_);
+  }
+  Vectorized> log1p() const {
+    return map(std::log1p);
+  }
+  Vectorized> asin() const {
+    // asin(x)
+    // = -i*ln(iz + sqrt(1 -z^2))
+    // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi)))
+    // = -i*ln((-b + ai) + sqrt(1 - (a**2 - b**2) - 2*abi))
+    const __m256 one = _mm256_set1_ps(1);
+
+    auto conj = conj_();
+    auto b_a = _mm256_permute_ps(conj, 0xB1);                         //-b        a
+    auto ab = _mm256_mul_ps(conj, b_a);                               //-ab       -ab
+    auto im = _mm256_add_ps(ab, ab);                                  //-2ab      -2ab
+
+    auto val_2 = _mm256_mul_ps(values, values);                       // a*a      b*b
+    auto re = _mm256_hsub_ps(val_2, _mm256_permute_ps(val_2, 0xB1));  // a*a-b*b  b*b-a*a
+    re = _mm256_permute_ps(re, 0xD8);
+    re = _mm256_sub_ps(one, re);
+
+    auto root = Vectorized(_mm256_blend_ps(re, im, 0xAA)).sqrt();         //sqrt(re + i*im)
+    auto ln = Vectorized(_mm256_add_ps(b_a, root)).log();                 //ln(iz + sqrt())
+    return Vectorized(_mm256_permute_ps(ln.values, 0xB1)).conj();         //-i*ln()
+  }
+  Vectorized> acos() const {
+    return map(std::acos);
+  }
+  Vectorized> atan() const;
+  Vectorized> atanh() const {
+    return map(std::atanh);
+  }
+  Vectorized> exp() const {
+    //exp(a + bi)
+    // = exp(a)*(cos(b) + sin(b)i)
+    auto exp = Sleef_expf8_u10(values);                               //exp(a)           exp(b)
+    exp = _mm256_blend_ps(exp, _mm256_permute_ps(exp, 0xB1), 0xAA);   //exp(a)           exp(a)
+
+    auto sin_cos = Sleef_sincosf8_u10(values);                        //[sin(a), cos(a)] [sin(b), cos(b)]
+    auto cos_sin = _mm256_blend_ps(_mm256_permute_ps(sin_cos.y, 0xB1),
+                                   sin_cos.x, 0xAA);                  //cos(b)           sin(b)
+    return _mm256_mul_ps(exp, cos_sin);
+  }
+  Vectorized<c10::complex<float>> exp2() const {
+    // Use identity 2**x = exp(log(2) * x)
+    const __m256 ln_2 = _mm256_set1_ps(c10::ln_2<float>);
+    Vectorized<c10::complex<float>> scaled_values = _mm256_mul_ps(values, ln_2);
+    return scaled_values.exp();
+  }
+  Vectorized> expm1() const {
+    return map(std::expm1);
+  }
+  Vectorized> sin() const {
+    return map(std::sin);
+  }
+  Vectorized> sinh() const {
+    return map(std::sinh);
+  }
+  Vectorized> cos() const {
+    return map(std::cos);
+  }
+  Vectorized> cosh() const {
+    return map(std::cosh);
+  }
+  Vectorized> ceil() const {
+    return _mm256_ceil_ps(values);
+  }
+  Vectorized> floor() const {
+    return _mm256_floor_ps(values);
+  }
+  Vectorized> neg() const {
+    auto zero = _mm256_setzero_ps();
+    return _mm256_sub_ps(zero, values);
+  }
+  Vectorized> round() const {
+    return _mm256_round_ps(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+  }
+  Vectorized> tan() const {
+    return map(std::tan);
+  }
+  Vectorized> tanh() const {
+    return map(std::tanh);
+  }
+  Vectorized> trunc() const {
+    return _mm256_round_ps(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
+  }
+  Vectorized> sqrt() const {
+    return map(std::sqrt);
+  }
+  Vectorized> reciprocal() const;
+  Vectorized> rsqrt() const {
+    return sqrt().reciprocal();
+  }
+  Vectorized<c10::complex<float>> pow(const Vectorized<c10::complex<float>> &exp) const {
+    __at_align__ c10::complex<float> x_tmp[size()];
+    __at_align__ c10::complex<float> y_tmp[size()];
+    store(x_tmp);
+    exp.store(y_tmp);
+    for (const auto i : c10::irange(size())) {
+      x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]);
+    }
+    return loadu(x_tmp);
+  }
+  // Comparison using the _CMP_**_OQ predicate.
+  //   `O`: get false if an operand is NaN
+  //   `Q`: do not raise if an operand is NaN
+  Vectorized<c10::complex<float>> operator==(const Vectorized<c10::complex<float>>& other) const {
+    return _mm256_cmp_ps(values, other.values, _CMP_EQ_OQ);
+  }
+  Vectorized<c10::complex<float>> operator!=(const Vectorized<c10::complex<float>>& other) const {
+    return _mm256_cmp_ps(values, other.values, _CMP_NEQ_UQ);
+  }
+  Vectorized<c10::complex<float>> operator<(const Vectorized<c10::complex<float>>& /*other*/) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+  Vectorized<c10::complex<float>> operator<=(const Vectorized<c10::complex<float>>& /*other*/) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+  Vectorized<c10::complex<float>> operator>(const Vectorized<c10::complex<float>>& /*other*/) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+  Vectorized<c10::complex<float>> operator>=(const Vectorized<c10::complex<float>>& /*other*/) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+
+  Vectorized<c10::complex<float>> eq(const Vectorized<c10::complex<float>>& other) const;
+  Vectorized<c10::complex<float>> ne(const Vectorized<c10::complex<float>>& other) const;
+};
+
+template <> Vectorized<c10::complex<float>> inline operator+(const Vectorized<c10::complex<float>> &a, const Vectorized<c10::complex<float>> &b) {
+  return _mm256_add_ps(a, b);
+}
+
+template <> Vectorized<c10::complex<float>> inline operator-(const Vectorized<c10::complex<float>> &a, const Vectorized<c10::complex<float>> &b) {
+  return _mm256_sub_ps(a, b);
+}
+
+template <> Vectorized<c10::complex<float>> inline operator*(const Vectorized<c10::complex<float>> &a, const Vectorized<c10::complex<float>> &b) {
+  //(a + bi)  * (c + di) = (ac - bd) + (ad + bc)i
+  const __m256 sign_mask = _mm256_setr_ps(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0);
+  auto ac_bd = _mm256_mul_ps(a, b);         //ac       bd
+
+  auto d_c = _mm256_permute_ps(b, 0xB1);    //d        c
+  d_c = _mm256_xor_ps(sign_mask, d_c);      //d       -c
+  auto ad_bc = _mm256_mul_ps(a, d_c);       //ad      -bc
+
+  auto ret = _mm256_hsub_ps(ac_bd, ad_bc);  //ac - bd  ad + bc
+  ret = _mm256_permute_ps(ret, 0xD8);
+  return ret;
+}
+
+template <> Vectorized<c10::complex<float>> inline operator/(const Vectorized<c10::complex<float>> &a, const Vectorized<c10::complex<float>> &b) {
+  //re + im*i = (a + bi)  / (c + di)
+  auto mask = _mm256_set1_ps(-0.f);
+  auto fabs_cd = _mm256_andnot_ps(mask, b);     // |c|    |d|
+  auto fabs_dc = _mm256_permute_ps(fabs_cd, 0xB1);   // |d|    |c|
+  auto scale = _mm256_rcp_ps(_mm256_max_ps(fabs_cd, fabs_dc));  // 1/sc     1/sc
+  auto a2 = _mm256_mul_ps(a, scale);         // a/sc     b/sc
+  auto b2 = _mm256_mul_ps(b, scale);         // c/sc     d/sc
+  auto acbd2 = _mm256_mul_ps(a2, b2);
+
+  const __m256 sign_mask = _mm256_setr_ps(-0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0);
+  auto dc2 = _mm256_permute_ps(b2, 0xB1);    // d/sc         c/sc
+  dc2 = _mm256_xor_ps(sign_mask, dc2);       // -d/|c,d|        c/sc
+  auto adbc2 = _mm256_mul_ps(a2, dc2);       //-ad/sc^2      bc/sc^2
+  auto res2 = _mm256_hadd_ps(acbd2, adbc2);  //(ac+bd)/sc^2  (bc-ad)/sc^2
+  res2 = _mm256_permute_ps(res2, 0xD8);
+
+  // get the denominator
+  auto denom2 = Vectorized<c10::complex<float>>(b2).abs_2_();  // (c^2+d^2)/sc^2   (c^2+d^2)/sc^2
+  res2 = _mm256_div_ps(res2, denom2);
+  return res2;
+}
+
+// reciprocal. Implement this here so we can use multiplication.
+inline Vectorized<c10::complex<float>> Vectorized<c10::complex<float>>::reciprocal() const {
+  //re + im*i = (a + bi)  / (c + di)
+  //re = (ac + bd)/abs_2() = c/abs_2()
+  //im = (bc - ad)/abs_2() = d/abs_2()
+  const __m256 sign_mask = _mm256_setr_ps(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0);
+  auto c_d = _mm256_xor_ps(sign_mask, values);    //c       -d
+  return _mm256_div_ps(c_d, abs_2_());
+}
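+
+// Illustrative check (hypothetical helper, not part of the upstream header):
+// the vectorized reciprocal above should agree, lane by lane, with the scalar
+// identity 1/(c + di) = (c - di) / (c*c + d*d).
+inline float vec256_complex_reciprocal_max_error_example(const Vectorized<c10::complex<float>>& x) {
+  constexpr int N = Vectorized<c10::complex<float>>::size();
+  __at_align__ c10::complex<float> in[N];
+  __at_align__ c10::complex<float> out[N];
+  x.store(in);
+  x.reciprocal().store(out);
+  float max_err = 0.f;
+  for (const auto i : c10::irange(N)) {
+    const float c = in[i].real();
+    const float d = in[i].imag();
+    const float denom = c * c + d * d;
+    const c10::complex<float> expected(c / denom, -d / denom);
+    float err_re = out[i].real() - expected.real();
+    float err_im = out[i].imag() - expected.imag();
+    if (err_re < 0) err_re = -err_re;
+    if (err_im < 0) err_im = -err_im;
+    if (err_re > max_err) max_err = err_re;
+    if (err_im > max_err) max_err = err_im;
+  }
+  return max_err;
+}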
+
+inline Vectorized<c10::complex<float>> Vectorized<c10::complex<float>>::atan() const {
+  // atan(x) = i/2 * ln((i + z)/(i - z))
+  const __m256 i = _mm256_setr_ps(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
+  const Vectorized i_half = _mm256_setr_ps(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5);
+
+  auto sum = Vectorized(_mm256_add_ps(i, values));                      // a        1+b
+  auto sub = Vectorized(_mm256_sub_ps(i, values));                      // -a       1-b
+  auto ln = (sum/sub).log();                                        // ln((i + z)/(i - z))
+  return i_half*ln;                                                 // i/2*ln()
+}
+
+template <>
+Vectorized<c10::complex<float>> inline maximum(const Vectorized<c10::complex<float>>& a, const Vectorized<c10::complex<float>>& b) {
+  auto abs_a = a.abs_2_();
+  auto abs_b = b.abs_2_();
+  auto mask = _mm256_cmp_ps(abs_a, abs_b, _CMP_LT_OQ);
+  auto max = _mm256_blendv_ps(a, b, mask);
+  // Exploit the fact that all-ones is a NaN.
+  auto isnan = _mm256_cmp_ps(abs_a, abs_b, _CMP_UNORD_Q);
+  return _mm256_or_ps(max, isnan);
+}
+
+template <>
+Vectorized<c10::complex<float>> inline minimum(const Vectorized<c10::complex<float>>& a, const Vectorized<c10::complex<float>>& b) {
+  auto abs_a = a.abs_2_();
+  auto abs_b = b.abs_2_();
+  auto mask = _mm256_cmp_ps(abs_a, abs_b, _CMP_GT_OQ);
+  auto min = _mm256_blendv_ps(a, b, mask);
+  // Exploit the fact that all-ones is a NaN.
+  auto isnan = _mm256_cmp_ps(abs_a, abs_b, _CMP_UNORD_Q);
+  return _mm256_or_ps(min, isnan);
+}
+
+template <>
+Vectorized<c10::complex<float>> inline operator&(const Vectorized<c10::complex<float>>& a, const Vectorized<c10::complex<float>>& b) {
+  return _mm256_and_ps(a, b);
+}
+
+template <>
+Vectorized<c10::complex<float>> inline operator|(const Vectorized<c10::complex<float>>& a, const Vectorized<c10::complex<float>>& b) {
+  return _mm256_or_ps(a, b);
+}
+
+template <>
+Vectorized<c10::complex<float>> inline operator^(const Vectorized<c10::complex<float>>& a, const Vectorized<c10::complex<float>>& b) {
+  return _mm256_xor_ps(a, b);
+}
+
+inline Vectorized<c10::complex<float>> Vectorized<c10::complex<float>>::eq(
+    const Vectorized<c10::complex<float>>& other) const {
+  auto eq = (*this == other);  // compares real and imag individually
+  // If both real numbers and imag numbers are equal, then the complex numbers are equal
+  return (eq.real() & eq.imag()) & Vectorized<c10::complex<float>>(_mm256_set1_ps(1.0f));
+}
+
+inline Vectorized<c10::complex<float>> Vectorized<c10::complex<float>>::ne(
+    const Vectorized<c10::complex<float>>& other) const {
+  auto ne = (*this != other);  // compares real and imag individually
+  // If either real numbers or imag numbers are not equal, then the complex numbers are not equal
+  return (ne.real() | ne.imag()) & Vectorized<c10::complex<float>>(_mm256_set1_ps(1.0f));
+}
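+
+// Illustrative usage sketch (hypothetical helper, not part of the upstream
+// header): eq() above returns 1.0f + 0.0i in lanes where both the real and
+// imaginary parts match, and 0 elsewhere, so matching lanes can be counted
+// by summing the real parts.
+inline int vec256_complex_count_equal_lanes_example(
+    const Vectorized<c10::complex<float>>& a,
+    const Vectorized<c10::complex<float>>& b) {
+  __at_align__ c10::complex<float> lanes[Vectorized<c10::complex<float>>::size()];
+  a.eq(b).store(lanes);
+  int count = 0;
+  for (const auto i : c10::irange(Vectorized<c10::complex<float>>::size())) {
+    count += static_cast<int>(lanes[i].real());
+  }
+  return count;
+}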
+
+#endif
+
+}} // namespace at::vec::CPU_CAPABILITY
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_double.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_double.h
new file mode 100644
index 0000000000000000000000000000000000000000..328e33a79a4e2a89d1044144280c9625321fab9a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_double.h
@@ -0,0 +1,442 @@
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/irange.h>
+#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
+#include <sleef.h>
+#endif
+
+namespace at::vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+
+#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
+
+template <> class Vectorized<double> {
+private:
+  __m256d values;
+public:
+  using value_type = double;
+  using size_type = int;
+  static constexpr size_type size() {
+    return 4;
+  }
+  Vectorized() {}
+  Vectorized(__m256d v) : values(v) {}
+  Vectorized(double val) {
+    values = _mm256_set1_pd(val);
+  }
+  Vectorized(double val1, double val2, double val3, double val4) {
+    values = _mm256_setr_pd(val1, val2, val3, val4);
+  }
+  operator __m256d() const {
+    return values;
+  }
+  template <int64_t mask>
+  static Vectorized<double> blend(const Vectorized<double>& a, const Vectorized<double>& b) {
+    return _mm256_blend_pd(a.values, b.values, mask);
+  }
+  static Vectorized blendv(const Vectorized& a, const Vectorized& b,
+                               const Vectorized& mask) {
+    return _mm256_blendv_pd(a.values, b.values, mask.values);
+  }
+  template <typename step_t>
+  static Vectorized<double> arange(double base = 0., step_t step = static_cast<step_t>(1)) {
+    return Vectorized(base, base + step, base + 2 * step, base + 3 * step);
+  }
+  static Vectorized set(const Vectorized& a, const Vectorized& b,
+                            int64_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+    }
+    return b;
+  }
+  static Vectorized loadu(const void* ptr, int64_t count = size()) {
+    if (count == size())
+      return _mm256_loadu_pd(reinterpret_cast<const double*>(ptr));
+
+
+    __at_align__ double tmp_values[size()];
+    // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
+    // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
+    // instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(size())) {
+      tmp_values[i] = 0.0;
+    }
+    std::memcpy(
+        tmp_values,
+        reinterpret_cast<const double*>(ptr),
+        count * sizeof(double));
+    return _mm256_load_pd(tmp_values);
+  }
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      _mm256_storeu_pd(reinterpret_cast<double*>(ptr), values);
+    } else if (count > 0) {
+      double tmp_values[size()];
+      _mm256_storeu_pd(reinterpret_cast<double*>(tmp_values), values);
+      std::memcpy(ptr, tmp_values, count * sizeof(double));
+    }
+  }
+  const double& operator[](int idx) const  = delete;
+  double& operator[](int idx) = delete;
+  int zero_mask() const {
+    // returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit
+    __m256d cmp = _mm256_cmp_pd(values, _mm256_set1_pd(0.0), _CMP_EQ_OQ);
+    return _mm256_movemask_pd(cmp);
+  }
+  Vectorized isnan() const {
+    return _mm256_cmp_pd(values, _mm256_set1_pd(0.0), _CMP_UNORD_Q);
+  }
+  bool has_inf_nan() const {
+    __m256d self_sub  = _mm256_sub_pd(values, values);
+    return (_mm256_movemask_epi8(_mm256_castpd_si256(self_sub)) & 0x77777777) != 0;
+  }
+  Vectorized map(double (*const f)(double)) const {
+    __at_align__ double tmp[size()];
+    store(tmp);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = f(tmp[i]);
+    }
+    return loadu(tmp);
+  }
+  Vectorized abs() const {
+    auto mask = _mm256_set1_pd(-0.f);
+    return _mm256_andnot_pd(mask, values);
+  }
+  Vectorized angle() const {
+    const auto zero_vec = _mm256_set1_pd(0.f);
+    const auto nan_vec = _mm256_set1_pd(NAN);
+    const auto not_nan_mask = _mm256_cmp_pd(values, values, _CMP_EQ_OQ);
+    const auto nan_mask = _mm256_cmp_pd(not_nan_mask, zero_vec, _CMP_EQ_OQ);
+    const auto pi = _mm256_set1_pd(c10::pi<double>);
+
+    const auto neg_mask = _mm256_cmp_pd(values, zero_vec, _CMP_LT_OQ);
+    auto angle = _mm256_blendv_pd(zero_vec, pi, neg_mask);
+    angle = _mm256_blendv_pd(angle, nan_vec, nan_mask);
+    return angle;
+  }
+  Vectorized real() const {
+    return *this;
+  }
+  Vectorized imag() const {
+    return _mm256_set1_pd(0);
+  }
+  Vectorized conj() const {
+    return *this;
+  }
+  Vectorized acos() const {
+    return Vectorized(Sleef_acosd4_u10(values));
+  }
+  Vectorized acosh() const {
+    return Vectorized(Sleef_acoshd4_u10(values));
+  }
+  Vectorized asin() const {
+    return Vectorized(Sleef_asind4_u10(values));
+  }
+  Vectorized atan() const {
+    return Vectorized(Sleef_atand4_u10(values));
+  }
+  Vectorized atanh() const {
+    return Vectorized(Sleef_atanhd4_u10(values));
+  }
+  Vectorized atan2(const Vectorized &b) const {
+    return Vectorized(Sleef_atan2d4_u10(values, b));
+  }
+  Vectorized copysign(const Vectorized &sign) const {
+    return Vectorized(Sleef_copysignd4(values, sign));
+  }
+  Vectorized erf() const {
+    return Vectorized(Sleef_erfd4_u10(values));
+  }
+  Vectorized erfc() const {
+    return Vectorized(Sleef_erfcd4_u15(values));
+  }
+  Vectorized erfinv() const {
+    return map(calc_erfinv);
+  }
+  Vectorized exp() const {
+    return Vectorized(Sleef_expd4_u10(values));
+  }
+  Vectorized exp2() const {
+    return Vectorized(Sleef_exp2d4_u10(values));
+  }
+  Vectorized expm1() const {
+    return Vectorized(Sleef_expm1d4_u10(values));
+  }
+  Vectorized exp_u20() const {
+    return exp();
+  }
+  Vectorized fmod(const Vectorized& q) const {
+    return Vectorized(Sleef_fmodd4(values, q));
+  }
+  Vectorized hypot(const Vectorized &b) const {
+    return Vectorized(Sleef_hypotd4_u05(values, b));
+  }
+  Vectorized i0() const {
+    return map(calc_i0);
+  }
+  Vectorized i0e() const {
+    return map(calc_i0e);
+  }
+  Vectorized digamma() const {
+    return map(calc_digamma);
+  }
+  Vectorized igamma(const Vectorized &x) const {
+    __at_align__ double tmp[size()];
+    __at_align__ double tmp_x[size()];
+    store(tmp);
+    x.store(tmp_x);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
+    }
+    return loadu(tmp);
+  }
+  Vectorized igammac(const Vectorized &x) const {
+    __at_align__ double tmp[size()];
+    __at_align__ double tmp_x[size()];
+    store(tmp);
+    x.store(tmp_x);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
+    }
+    return loadu(tmp);
+  }
+  Vectorized log() const {
+    return Vectorized(Sleef_logd4_u10(values));
+  }
+  Vectorized log2() const {
+    return Vectorized(Sleef_log2d4_u10(values));
+  }
+  Vectorized log10() const {
+    return Vectorized(Sleef_log10d4_u10(values));
+  }
+  Vectorized log1p() const {
+    return Vectorized(Sleef_log1pd4_u10(values));
+  }
+  Vectorized sin() const {
+    return Vectorized(Sleef_sind4_u10(values));
+  }
+  Vectorized sinh() const {
+    return Vectorized(Sleef_sinhd4_u10(values));
+  }
+  Vectorized cos() const {
+    return Vectorized(Sleef_cosd4_u10(values));
+  }
+  Vectorized cosh() const {
+    return Vectorized(Sleef_coshd4_u10(values));
+  }
+  Vectorized ceil() const {
+    return _mm256_ceil_pd(values);
+  }
+  Vectorized floor() const {
+    return _mm256_floor_pd(values);
+  }
+  Vectorized frac() const;
+  Vectorized neg() const {
+    return _mm256_xor_pd(_mm256_set1_pd(-0.), values);
+  }
+  Vectorized nextafter(const Vectorized &b) const {
+    return Vectorized(Sleef_nextafterd4(values, b));
+  }
+  Vectorized round() const {
+    return _mm256_round_pd(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+  }
+  Vectorized tan() const {
+    return Vectorized(Sleef_tand4_u10(values));
+  }
+  Vectorized tanh() const {
+    return Vectorized(Sleef_tanhd4_u10(values));
+  }
+  Vectorized trunc() const {
+    return _mm256_round_pd(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
+  }
+  Vectorized lgamma() const {
+    return Vectorized(Sleef_lgammad4_u10(values));
+  }
+  Vectorized sqrt() const {
+    return _mm256_sqrt_pd(values);
+  }
+  Vectorized reciprocal() const {
+    return _mm256_div_pd(_mm256_set1_pd(1), values);
+  }
+  Vectorized rsqrt() const {
+    return _mm256_div_pd(_mm256_set1_pd(1), _mm256_sqrt_pd(values));
+  }
+  Vectorized pow(const Vectorized &b) const {
+    return Vectorized(Sleef_powd4_u10(values, b));
+  }
+  // Comparison using the _CMP_**_OQ predicate.
+  //   `O`: get false if an operand is NaN
+  //   `Q`: do not raise if an operand is NaN
+  Vectorized operator==(const Vectorized& other) const {
+    return _mm256_cmp_pd(values, other.values, _CMP_EQ_OQ);
+  }
+
+  Vectorized operator!=(const Vectorized& other) const {
+    return _mm256_cmp_pd(values, other.values, _CMP_NEQ_UQ);
+  }
+
+  Vectorized operator<(const Vectorized& other) const {
+    return _mm256_cmp_pd(values, other.values, _CMP_LT_OQ);
+  }
+
+  Vectorized operator<=(const Vectorized& other) const {
+    return _mm256_cmp_pd(values, other.values, _CMP_LE_OQ);
+  }
+
+  Vectorized operator>(const Vectorized& other) const {
+    return _mm256_cmp_pd(values, other.values, _CMP_GT_OQ);
+  }
+
+  Vectorized operator>=(const Vectorized& other) const {
+    return _mm256_cmp_pd(values, other.values, _CMP_GE_OQ);
+  }
+
+  Vectorized eq(const Vectorized& other) const;
+  Vectorized ne(const Vectorized& other) const;
+  Vectorized lt(const Vectorized& other) const;
+  Vectorized le(const Vectorized& other) const;
+  Vectorized gt(const Vectorized& other) const;
+  Vectorized ge(const Vectorized& other) const;
+};
+
+template <>
+Vectorized<double> inline operator+(const Vectorized<double>& a, const Vectorized<double>& b) {
+  return _mm256_add_pd(a, b);
+}
+
+template <>
+Vectorized<double> inline operator-(const Vectorized<double>& a, const Vectorized<double>& b) {
+  return _mm256_sub_pd(a, b);
+}
+
+template <>
+Vectorized<double> inline operator*(const Vectorized<double>& a, const Vectorized<double>& b) {
+  return _mm256_mul_pd(a, b);
+}
+
+template <>
+Vectorized<double> inline operator/(const Vectorized<double>& a, const Vectorized<double>& b) {
+  return _mm256_div_pd(a, b);
+}
+
+// frac. Implement this here so we can use subtraction.
+inline Vectorized<double> Vectorized<double>::frac() const {
+  return *this - this->trunc();
+}
+
+// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if
+// either input is a NaN.
+template <>
+Vectorized<double> inline maximum(const Vectorized<double>& a, const Vectorized<double>& b) {
+  Vectorized<double> max = _mm256_max_pd(a, b);
+  Vectorized<double> isnan = _mm256_cmp_pd(a, b, _CMP_UNORD_Q);
+  // Exploit the fact that all-ones is a NaN.
+  return _mm256_or_pd(max, isnan);
+}
+
+// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if
+// either input is a NaN.
+template <>
+Vectorized<double> inline minimum(const Vectorized<double>& a, const Vectorized<double>& b) {
+  Vectorized<double> min = _mm256_min_pd(a, b);
+  Vectorized<double> isnan = _mm256_cmp_pd(a, b, _CMP_UNORD_Q);
+  // Exploit the fact that all-ones is a NaN.
+  return _mm256_or_pd(min, isnan);
+}
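+
+// Illustrative sketch (hypothetical helper, not part of the upstream header):
+// unlike a bare _mm256_max_pd, maximum() above propagates NaN, so a NaN in
+// either input shows up in the corresponding result lane. The NAN macro is
+// already relied upon by angle() above.
+inline bool vec256_double_maximum_propagates_nan_example() {
+  Vectorized<double> a(1.0, NAN, 3.0, 4.0);
+  Vectorized<double> b(0.0);
+  // isnan() yields an all-ones lane exactly where the result is NaN, and
+  // zero_mask() then reports the lanes that are *not* NaN, so bit 1 must be
+  // missing from the mask.
+  return (maximum(a, b).isnan().zero_mask() & 0x2) == 0;
+}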
+
+template <>
+Vectorized inline clamp(const Vectorized& a, const Vectorized& min, const Vectorized& max) {
+  return _mm256_min_pd(max, _mm256_max_pd(min, a));
+}
+
+template <>
+Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min) {
+  return _mm256_max_pd(min, a);
+}
+
+template <>
+Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max) {
+  return _mm256_min_pd(max, a);
+}
+
+template <>
+Vectorized<double> inline operator&(const Vectorized<double>& a, const Vectorized<double>& b) {
+  return _mm256_and_pd(a, b);
+}
+
+template <>
+Vectorized<double> inline operator|(const Vectorized<double>& a, const Vectorized<double>& b) {
+  return _mm256_or_pd(a, b);
+}
+
+template <>
+Vectorized<double> inline operator^(const Vectorized<double>& a, const Vectorized<double>& b) {
+  return _mm256_xor_pd(a, b);
+}
+
+inline Vectorized<double> Vectorized<double>::eq(const Vectorized<double>& other) const {
+  return (*this == other) & Vectorized(1.0);
+}
+
+inline Vectorized<double> Vectorized<double>::ne(const Vectorized<double>& other) const {
+  return (*this != other) & Vectorized(1.0);
+}
+
+inline Vectorized<double> Vectorized<double>::gt(const Vectorized<double>& other) const {
+  return (*this > other) & Vectorized(1.0);
+}
+
+inline Vectorized<double> Vectorized<double>::ge(const Vectorized<double>& other) const {
+  return (*this >= other) & Vectorized(1.0);
+}
+
+inline Vectorized<double> Vectorized<double>::lt(const Vectorized<double>& other) const {
+  return (*this < other) & Vectorized(1.0);
+}
+
+inline Vectorized<double> Vectorized<double>::le(const Vectorized<double>& other) const {
+  return (*this <= other) & Vectorized(1.0);
+}
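+
+// Illustrative usage sketch (hypothetical helper, not part of the upstream
+// header): in contrast to operator==, which produces all-ones bit masks,
+// eq() above produces 1.0/0.0 lanes, so equal lanes can be counted with a
+// plain horizontal sum.
+inline double vec256_double_count_equal_lanes_example(const Vectorized<double>& a,
+                                                      const Vectorized<double>& b) {
+  __at_align__ double lanes[Vectorized<double>::size()];
+  a.eq(b).store(lanes);
+  double count = 0.0;
+  for (const auto i : c10::irange(Vectorized<double>::size())) {
+    count += lanes[i];
+  }
+  return count;
+}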
+
+template <>
+inline void convert(const double* src, double* dst, int64_t n) {
+  int64_t i;
+#pragma unroll
+  for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) {
+    _mm256_storeu_pd(dst + i, _mm256_loadu_pd(src + i));
+  }
+#pragma unroll
+  for (; i < n; i++) {
+    dst[i] = src[i];
+  }
+}
+
+#ifdef CPU_CAPABILITY_AVX2
+template <>
+Vectorized<double> inline fmadd(const Vectorized<double>& a, const Vectorized<double>& b, const Vectorized<double>& c) {
+  return _mm256_fmadd_pd(a, b, c);
+}
+
+template <>
+Vectorized<double> inline fmsub(const Vectorized<double>& a, const Vectorized<double>& b, const Vectorized<double>& c) {
+  return _mm256_fmsub_pd(a, b, c);
+}
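+
+// Illustrative sketch (hypothetical helper, not part of the upstream header):
+// a simple "y += a * x" loop showing how fmadd() above is combined with
+// loadu()/store() and their explicit element counts to handle the tail.
+inline void vec256_double_axpy_example(double a, const double* x, double* y, int64_t n) {
+  const Vectorized<double> va(a);
+  int64_t i = 0;
+  for (; i + Vectorized<double>::size() <= n; i += Vectorized<double>::size()) {
+    fmadd(va, Vectorized<double>::loadu(x + i), Vectorized<double>::loadu(y + i)).store(y + i);
+  }
+  if (i < n) {
+    auto vx = Vectorized<double>::loadu(x + i, n - i);
+    auto vy = Vectorized<double>::loadu(y + i, n - i);
+    fmadd(va, vx, vy).store(y + i, n - i);
+  }
+}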
+#endif
+
+#endif
+
+}} // namespace at::vec::CPU_CAPABILITY
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_float.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_float.h
new file mode 100644
index 0000000000000000000000000000000000000000..2b372f294f9036e1a7c5a1915cb37f24f7b645fd
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_float.h
@@ -0,0 +1,636 @@
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/irange.h>
+#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
+#include <sleef.h>
+#endif
+
+namespace at::vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
+
+template <> class Vectorized<float> {
+private:
+  __m256 values;
+public:
+  using value_type = float;
+  using size_type = int;
+  static constexpr size_type size() {
+    return 8;
+  }
+  Vectorized() {}
+  Vectorized(__m256 v) : values(v) {}
+  Vectorized(float val) {
+    values = _mm256_set1_ps(val);
+  }
+  Vectorized(float val1, float val2, float val3, float val4,
+         float val5, float val6, float val7, float val8) {
+    values = _mm256_setr_ps(val1, val2, val3, val4, val5, val6, val7, val8);
+  }
+  operator __m256() const {
+    return values;
+  }
+  template <int64_t mask>
+  static Vectorized<float> blend(const Vectorized<float>& a, const Vectorized<float>& b) {
+    return _mm256_blend_ps(a.values, b.values, mask);
+  }
+  static Vectorized blendv(const Vectorized& a, const Vectorized& b,
+                              const Vectorized& mask) {
+    return _mm256_blendv_ps(a.values, b.values, mask.values);
+  }
+  template <typename step_t>
+  static Vectorized<float> arange(float base = 0.f, step_t step = static_cast<step_t>(1)) {
+    return Vectorized(
+      base,            base +     step, base + 2 * step, base + 3 * step,
+      base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step);
+  }
+  static Vectorized set(const Vectorized& a, const Vectorized& b,
+                           int64_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+      case 4:
+        return blend<15>(a, b);
+      case 5:
+        return blend<31>(a, b);
+      case 6:
+        return blend<63>(a, b);
+      case 7:
+        return blend<127>(a, b);
+    }
+    return b;
+  }
+  static Vectorized loadu(const void* ptr, int64_t count = size()) {
+    if (count == size())
+      return _mm256_loadu_ps(reinterpret_cast<const float*>(ptr));
+    __at_align__ float tmp_values[size()];
+    // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
+    // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
+    // instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(size())) {
+      tmp_values[i] = 0.0;
+    }
+    std::memcpy(
+        tmp_values, reinterpret_cast<const float*>(ptr), count * sizeof(float));
+    return _mm256_loadu_ps(tmp_values);
+  }
+  void store(void* ptr, int64_t count = size()) const {
+    if (count == size()) {
+      _mm256_storeu_ps(reinterpret_cast<float*>(ptr), values);
+    } else if (count > 0) {
+      float tmp_values[size()];
+      _mm256_storeu_ps(reinterpret_cast<float*>(tmp_values), values);
+      std::memcpy(ptr, tmp_values, count * sizeof(float));
+    }
+  }
+  const float& operator[](int idx) const  = delete;
+  float& operator[](int idx) = delete;
+  int zero_mask() const {
+    // returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit
+    __m256 cmp = _mm256_cmp_ps(values, _mm256_set1_ps(0.0f), _CMP_EQ_OQ);
+    return _mm256_movemask_ps(cmp);
+  }
+  Vectorized isnan() const {
+    return _mm256_cmp_ps(values, _mm256_set1_ps(0.0f), _CMP_UNORD_Q);
+  }
+
+  bool has_inf_nan() const {
+    __m256 self_sub  = _mm256_sub_ps(values, values);
+    return (_mm256_movemask_epi8(_mm256_castps_si256(self_sub)) & 0x77777777) != 0;
+  }
+
+  Vectorized map(float (*const f)(float)) const {
+    __at_align__ float tmp[size()];
+    store(tmp);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = f(tmp[i]);
+    }
+    return loadu(tmp);
+  }
+  Vectorized abs() const {
+    auto mask = _mm256_set1_ps(-0.f);
+    return _mm256_andnot_ps(mask, values);
+  }
+  Vectorized angle() const {
+    const auto zero_vec = _mm256_set1_ps(0.f);
+    const auto nan_vec = _mm256_set1_ps(NAN);
+    const auto not_nan_mask = _mm256_cmp_ps(values, values, _CMP_EQ_OQ);
+    const auto nan_mask = _mm256_cmp_ps(not_nan_mask, zero_vec, _CMP_EQ_OQ);
+    const auto pi = _mm256_set1_ps(c10::pi<float>);
+
+    const auto neg_mask = _mm256_cmp_ps(values, zero_vec, _CMP_LT_OQ);
+    auto angle = _mm256_blendv_ps(zero_vec, pi, neg_mask);
+    angle = _mm256_blendv_ps(angle, nan_vec, nan_mask);
+    return angle;
+  }
+  Vectorized real() const {
+    return *this;
+  }
+  Vectorized imag() const {
+    return _mm256_set1_ps(0);
+  }
+  Vectorized conj() const {
+    return *this;
+  }
+  Vectorized acos() const {
+    return Vectorized(Sleef_acosf8_u10(values));
+  }
+  Vectorized acosh() const {
+    return Vectorized(Sleef_acoshf8_u10(values));
+  }
+  Vectorized asin() const {
+    return Vectorized(Sleef_asinf8_u10(values));
+  }
+  Vectorized atan() const {
+    return Vectorized(Sleef_atanf8_u10(values));
+  }
+  Vectorized atanh() const {
+    return Vectorized(Sleef_atanhf8_u10(values));
+  }
+  Vectorized atan2(const Vectorized &b) const {
+    return Vectorized(Sleef_atan2f8_u10(values, b));
+  }
+  Vectorized copysign(const Vectorized &sign) const {
+    return Vectorized(Sleef_copysignf8(values, sign));
+  }
+  Vectorized erf() const {
+    // constants
+    const auto neg_zero_vec = _mm256_set1_ps(-0.f);
+    const auto one_vec = _mm256_set1_ps(1.0f);
+    const auto p = _mm256_set1_ps(0.3275911f);
+    const auto p1 = _mm256_set1_ps(0.254829592f);
+    const auto p2 = _mm256_set1_ps(-0.284496736f);
+    const auto p3 = _mm256_set1_ps(1.421413741f);
+    const auto p4 = _mm256_set1_ps(-1.453152027f);
+    const auto p5 = _mm256_set1_ps(1.061405429f);
+    // sign(x)
+    auto sign_mask = _mm256_and_ps(neg_zero_vec, values);
+    auto abs_vec = _mm256_xor_ps(sign_mask, values);
+    // t = 1 / (p * abs(x) + 1)
+    auto tmp0 = _mm256_fmadd_ps(p, abs_vec, one_vec);
+    auto t = _mm256_div_ps(one_vec, tmp0);
+    // r = p5 * t ^ 4 + p4 * t ^ 3 + p3 * t ^ 2 + p2 * t + p1
+    auto tmp1 = _mm256_fmadd_ps(p5, t, p4);
+    auto tmp2 = _mm256_fmadd_ps(tmp1, t, p3);
+    auto tmp3 = _mm256_fmadd_ps(tmp2, t, p2);
+    auto r = _mm256_fmadd_ps(tmp3, t, p1);
+    // - exp(- x * x)
+    auto pow_2 = _mm256_mul_ps(values, values);
+    auto neg_pow_2 = _mm256_xor_ps(neg_zero_vec, pow_2);
+    // auto tmp4 = exp(neg_pow_2);
+    auto tmp4 = Vectorized(Sleef_expf8_u10(neg_pow_2));
+    auto tmp5 = _mm256_xor_ps(neg_zero_vec, tmp4);
+    // erf(x) = sign(x) * (1 - r * t * exp(- x * x))
+    auto tmp6 = _mm256_mul_ps(tmp5, t);
+    auto tmp7 = _mm256_fmadd_ps(tmp6, r, one_vec);
+    return _mm256_xor_ps(sign_mask, tmp7);
+  }
+  Vectorized erfc() const {
+    return Vectorized(Sleef_erfcf8_u15(values));
+  }
+  Vectorized erfinv() const {
+    return map(calc_erfinv);
+  }
+  Vectorized exp() const {
+    return Vectorized(Sleef_expf8_u10(values));
+  }
+  Vectorized exp2() const {
+    return Vectorized(Sleef_exp2f8_u10(values));
+  }
+  Vectorized expm1() const {
+    return Vectorized(Sleef_expm1f8_u10(values));
+  }
+  Vectorized exp_u20() const {
+    // A faster version of exp with ULP=20
+    static __m256 vec_factorial_1 =
+        _mm256_set1_ps(0.999999701f); // 1/factorial(1)
+    static __m256 vec_factorial_2 =
+        _mm256_set1_ps(0.499991506f); // 1/factorial(2)
+    static __m256 vec_factorial_3 =
+        _mm256_set1_ps(0.166676521f); // 1/factorial(3)
+    static __m256 vec_factorial_4 =
+        _mm256_set1_ps(0.0418978221f); // 1/factorial(4)
+    static __m256 vec_factorial_5 =
+        _mm256_set1_ps(0.00828929059f); // 1/factorial(5)
+    static __m256 vec_exp_log2ef =
+        (__m256)_mm256_set1_epi32(0x3fb8aa3b); // log2(e)
+    static __m256 vec_half = _mm256_set1_ps(0.5f);
+    static __m256 vec_one = _mm256_set1_ps(1.f);
+    static __m256 vec_zero = _mm256_set1_ps(0.f);
+    static __m256 vec_two = _mm256_set1_ps(2.f);
+    static __m256 vec_ln2f = (__m256)_mm256_set1_epi32(0x3f317218); // ln(2)
+    static __m256 vec_ln_flt_min = (__m256)_mm256_set1_epi32(0xc2aeac50);
+    static __m256 vec_ln_flt_max = (__m256)_mm256_set1_epi32(0x42b17218);
+    static __m256i vec_127 = _mm256_set1_epi32(0x0000007f);
+    static int n_mantissa_bits = 23;
+
+    // exp(x) =
+    // = exp(n * ln(2) + r) // divide x by ln(2) and get quot and rem
+    // = 2^n * exp(r) // simplify the exp(n*ln(2)) expression
+
+    auto less_ln_flt_min_mask =
+        _mm256_cmp_ps(values, vec_ln_flt_min, 1 /*_CMP_LT_OS*/);
+    auto vec_src = _mm256_min_ps(values, vec_ln_flt_max);
+    vec_src = _mm256_max_ps(vec_src, vec_ln_flt_min);
+
+    // fx = floorf(x * log2ef + 0.5)
+    auto vec_fx = _mm256_fmadd_ps(vec_src, vec_exp_log2ef, vec_half);
+    vec_fx = _mm256_floor_ps(vec_fx);
+
+    // x = x - fx * ln2
+    auto vec_exp_poly = _mm256_fnmadd_ps(vec_fx, vec_ln2f, vec_src);
+
+    // compute polynomial
+    auto vec_res =
+        _mm256_fmadd_ps(vec_exp_poly, vec_factorial_5, vec_factorial_4);
+    vec_res = _mm256_fmadd_ps(vec_exp_poly, vec_res, vec_factorial_3);
+    vec_res = _mm256_fmadd_ps(vec_exp_poly, vec_res, vec_factorial_2);
+    vec_res = _mm256_fmadd_ps(vec_exp_poly, vec_res, vec_factorial_1);
+    vec_res = _mm256_fmadd_ps(vec_exp_poly, vec_res, vec_one);
+
+    // compute 2^(n-1)
+    auto vec_exp_number = _mm256_sub_ps(vec_fx, vec_one);
+    auto vec_exp_number_i = _mm256_cvtps_epi32(vec_exp_number);
+    auto vec_two_pow_n_i = _mm256_add_epi32(vec_exp_number_i, vec_127);
+    vec_two_pow_n_i = _mm256_slli_epi32(vec_two_pow_n_i, n_mantissa_bits);
+    auto vec_two_pow_n = (__m256)vec_two_pow_n_i;
+    vec_two_pow_n =
+        _mm256_blendv_ps(vec_two_pow_n, vec_zero, less_ln_flt_min_mask);
+
+    // y = y * 2^n
+    vec_res = _mm256_mul_ps(vec_res, vec_two_pow_n);
+    vec_res = _mm256_mul_ps(vec_res, vec_two);
+    return vec_res;
+  }
+  Vectorized fmod(const Vectorized& q) const {
+    return Vectorized(Sleef_fmodf8(values, q));
+  }
+  Vectorized log() const {
+    return Vectorized(Sleef_logf8_u10(values));
+  }
+  Vectorized log2() const {
+    return Vectorized(Sleef_log2f8_u10(values));
+  }
+  Vectorized log10() const {
+    return Vectorized(Sleef_log10f8_u10(values));
+  }
+  Vectorized log1p() const {
+    return Vectorized(Sleef_log1pf8_u10(values));
+  }
+  Vectorized frac() const;
+  Vectorized sin() const {
+    return Vectorized(Sleef_sinf8_u35(values));
+  }
+  Vectorized sinh() const {
+    return Vectorized(Sleef_sinhf8_u10(values));
+  }
+  Vectorized cos() const {
+    return Vectorized(Sleef_cosf8_u35(values));
+  }
+  Vectorized cosh() const {
+    return Vectorized(Sleef_coshf8_u10(values));
+  }
+  Vectorized ceil() const {
+    return _mm256_ceil_ps(values);
+  }
+  Vectorized floor() const {
+    return _mm256_floor_ps(values);
+  }
+  Vectorized hypot(const Vectorized &b) const {
+    return Vectorized(Sleef_hypotf8_u05(values, b));
+  }
+  Vectorized i0() const {
+    return map(calc_i0);
+  }
+  Vectorized i0e() const {
+    return map(calc_i0e);
+  }
+  Vectorized digamma() const {
+    return map(calc_digamma);
+  }
+  Vectorized igamma(const Vectorized &x) const {
+    __at_align__ float tmp[size()];
+    __at_align__ float tmp_x[size()];
+    store(tmp);
+    x.store(tmp_x);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
+    }
+    return loadu(tmp);
+  }
+  Vectorized igammac(const Vectorized &x) const {
+    __at_align__ float tmp[size()];
+    __at_align__ float tmp_x[size()];
+    store(tmp);
+    x.store(tmp_x);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
+    }
+    return loadu(tmp);
+  }
+  Vectorized neg() const {
+    return _mm256_xor_ps(_mm256_set1_ps(-0.f), values);
+  }
+  Vectorized nextafter(const Vectorized &b) const {
+    return Vectorized(Sleef_nextafterf8(values, b));
+  }
+  Vectorized round() const {
+    return _mm256_round_ps(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+  }
+  Vectorized tan() const {
+    return Vectorized(Sleef_tanf8_u10(values));
+  }
+  Vectorized tanh() const {
+    return Vectorized(Sleef_tanhf8_u10(values));
+  }
+  Vectorized trunc() const {
+    return _mm256_round_ps(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
+  }
+  Vectorized lgamma() const {
+    return Vectorized(Sleef_lgammaf8_u10(values));
+  }
+  Vectorized sqrt() const {
+    return _mm256_sqrt_ps(values);
+  }
+  Vectorized reciprocal() const {
+    return _mm256_div_ps(_mm256_set1_ps(1), values);
+  }
+  Vectorized rsqrt() const {
+    return _mm256_div_ps(_mm256_set1_ps(1), _mm256_sqrt_ps(values));
+  }
+  Vectorized pow(const Vectorized &b) const {
+    return Vectorized(Sleef_powf8_u10(values, b));
+  }
+  // Comparison using the _CMP_**_OQ predicate.
+  //   `O`: get false if an operand is NaN
+  //   `Q`: do not raise if an operand is NaN
+  Vectorized operator==(const Vectorized& other) const {
+    return _mm256_cmp_ps(values, other.values, _CMP_EQ_OQ);
+  }
+
+  Vectorized operator!=(const Vectorized& other) const {
+    return _mm256_cmp_ps(values, other.values, _CMP_NEQ_UQ);
+  }
+
+  Vectorized operator<(const Vectorized& other) const {
+    return _mm256_cmp_ps(values, other.values, _CMP_LT_OQ);
+  }
+
+  Vectorized operator<=(const Vectorized& other) const {
+    return _mm256_cmp_ps(values, other.values, _CMP_LE_OQ);
+  }
+
+  Vectorized operator>(const Vectorized& other) const {
+    return _mm256_cmp_ps(values, other.values, _CMP_GT_OQ);
+  }
+
+  Vectorized operator>=(const Vectorized& other) const {
+    return _mm256_cmp_ps(values, other.values, _CMP_GE_OQ);
+  }
+
+  Vectorized eq(const Vectorized& other) const;
+  Vectorized ne(const Vectorized& other) const;
+  Vectorized gt(const Vectorized& other) const;
+  Vectorized ge(const Vectorized& other) const;
+  Vectorized lt(const Vectorized& other) const;
+  Vectorized le(const Vectorized& other) const;
+};
+
+template <>
+Vectorized<float> inline operator+(const Vectorized<float>& a, const Vectorized<float>& b) {
+  return _mm256_add_ps(a, b);
+}
+
+template <>
+Vectorized<float> inline operator-(const Vectorized<float>& a, const Vectorized<float>& b) {
+  return _mm256_sub_ps(a, b);
+}
+
+template <>
+Vectorized<float> inline operator*(const Vectorized<float>& a, const Vectorized<float>& b) {
+  return _mm256_mul_ps(a, b);
+}
+
+template <>
+Vectorized<float> inline operator/(const Vectorized<float>& a, const Vectorized<float>& b) {
+  return _mm256_div_ps(a, b);
+}
+
+// frac. Implement this here so we can use subtraction
+inline Vectorized<float> Vectorized<float>::frac() const {
+  return *this - this->trunc();
+}
+
+// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if
+// either input is a NaN.
+template <>
+Vectorized<float> inline maximum(const Vectorized<float>& a, const Vectorized<float>& b) {
+  Vectorized<float> max = _mm256_max_ps(a, b);
+  Vectorized<float> isnan = _mm256_cmp_ps(a, b, _CMP_UNORD_Q);
+  // Exploit the fact that all-ones is a NaN.
+  return _mm256_or_ps(max, isnan);
+}
+
+// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if
+// either input is a NaN.
+template <>
+Vectorized<float> inline minimum(const Vectorized<float>& a, const Vectorized<float>& b) {
+  Vectorized<float> min = _mm256_min_ps(a, b);
+  Vectorized<float> isnan = _mm256_cmp_ps(a, b, _CMP_UNORD_Q);
+  // Exploit the fact that all-ones is a NaN.
+  return _mm256_or_ps(min, isnan);
+}
+
+template <>
+Vectorized inline clamp(const Vectorized& a, const Vectorized& min, const Vectorized& max) {
+  return _mm256_min_ps(max, _mm256_max_ps(min, a));
+}
+
+template <>
+Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max) {
+  return _mm256_min_ps(max, a);
+}
+
+template <>
+Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min) {
+  return _mm256_max_ps(min, a);
+}
+
+template <>
+Vectorized<float> inline operator&(const Vectorized<float>& a, const Vectorized<float>& b) {
+  return _mm256_and_ps(a, b);
+}
+
+template <>
+Vectorized<float> inline operator|(const Vectorized<float>& a, const Vectorized<float>& b) {
+  return _mm256_or_ps(a, b);
+}
+
+template <>
+Vectorized<float> inline operator^(const Vectorized<float>& a, const Vectorized<float>& b) {
+  return _mm256_xor_ps(a, b);
+}
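+
+// Illustrative sketch (hypothetical helper, not part of the upstream header):
+// the bitwise operators above act on the raw float bit patterns, which is how
+// comparison masks (all-ones / all-zeros lanes) get combined. For example, a
+// branch-free select equivalent to blendv():
+inline Vectorized<float> vec256_float_select_example(const Vectorized<float>& mask,
+                                                     const Vectorized<float>& a,
+                                                     const Vectorized<float>& b) {
+  // all-ones lanes, built locally (no static data; see the note at the top of this file)
+  const Vectorized<float> all_set(_mm256_castsi256_ps(_mm256_set1_epi32(-1)));
+  // lanes where mask is all-ones take b, the remaining lanes take a
+  return (b & mask) | (a & (mask ^ all_set));
+}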
+
+inline Vectorized Vectorized::eq(const Vectorized& other) const {
+  return (*this == other) & Vectorized(1.0f);
+}
+
+inline Vectorized Vectorized::ne(const Vectorized& other) const {
+  return (*this != other) & Vectorized(1.0f);
+}
+
+inline Vectorized Vectorized::gt(const Vectorized& other) const {
+  return (*this > other) & Vectorized(1.0f);
+}
+
+inline Vectorized Vectorized::ge(const Vectorized& other) const {
+  return (*this >= other) & Vectorized(1.0f);
+}
+
+inline Vectorized Vectorized::lt(const Vectorized& other) const {
+  return (*this < other) & Vectorized(1.0f);
+}
+
+inline Vectorized Vectorized::le(const Vectorized& other) const {
+  return (*this <= other) & Vectorized(1.0f);
+}
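+
+// Illustrative usage sketch (hypothetical helpers, not part of the upstream
+// header): ops without a SLEEF/AVX2 kernel (calc_digamma, calc_i0, ...) are
+// lifted through map(), which stores to a stack buffer, applies a scalar
+// function per lane, and reloads. The same pattern works for custom functions:
+inline float vec256_float_cube_scalar_example(float v) {
+  return v * v * v;
+}
+inline Vectorized<float> vec256_float_cube_example(const Vectorized<float>& x) {
+  return x.map(vec256_float_cube_scalar_example);
+}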
+
+template <>
+inline void convert(const float* src, float* dst, int64_t n) {
+  int64_t i;
+#pragma unroll
+  for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) {
+    _mm256_storeu_ps(dst + i, _mm256_loadu_ps(src + i));
+  }
+#pragma unroll
+  for (; i < n; i++) {
+    dst[i] = src[i];
+  }
+}
+
+
+template <>
+Vectorized<float> inline fmadd(const Vectorized<float>& a, const Vectorized<float>& b, const Vectorized<float>& c) {
+  return _mm256_fmadd_ps(a, b, c);
+}
+
+template <>
+Vectorized<float> inline fmsub(const Vectorized<float>& a, const Vectorized<float>& b, const Vectorized<float>& c) {
+  return _mm256_fmsub_ps(a, b, c);
+}
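+
+// Illustrative scalar sketch (not part of the upstream header) of the range
+// reduction behind Vectorized<float>::exp_u20() above:
+//   exp(x) = exp(n*ln2 + r) = 2^n * exp(r),  with n = floor(x*log2(e) + 0.5),
+// where exp(r) is a degree-5 polynomial and 2^n is assembled from the float
+// exponent bits. This assumes <cmath> is reachable through the includes above;
+// the kernel above uses slightly tuned constants instead of exact factorials.
+inline float vec256_float_exp_u20_scalar_reference_example(float x) {
+  const float ln2 = 0.6931471805599453f;
+  const float log2e = 1.4426950408889634f;
+  const float n = std::floor(x * log2e + 0.5f);
+  const float r = x - n * ln2;
+  // 1 + r + r^2/2! + r^3/3! + r^4/4! + r^5/5!
+  const float p =
+      1.f + r * (1.f + r * (0.5f + r * (1.f / 6.f + r * (1.f / 24.f + r * (1.f / 120.f)))));
+  return std::ldexp(p, static_cast<int>(n));
+}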
+
+// Used by Inductor CPP codegen
+template<>
+inline void transpose_mxn<float, 8, 8>(
+    const float* src,
+    int64_t ld_src,
+    float* dst,
+    int64_t ld_dst) {
+  // load from src to registers
+  // a: a0  a1  a2  a3  a4  a5  a6  a7
+  // b: b0  b1  b2  b3  b4  b5  b6  b7
+  // c: c0  c1  c2  c3  c4  c5  c6  c7
+  // d: d0  d1  d2  d3  d4  d5  d6  d7
+  // e: e0  e1  e2  e3  e4  e5  e6  e7
+  // f: f0  f1  f2  f3  f4  f5  f6  f7
+  // g: g0  g1  g2  g3  g4  g5  g6  g7
+  // h: h0  h1  h2  h3  h4  h5  h6  h7
+  __m256 a = _mm256_loadu_ps(&src[0 * ld_src]);
+  __m256 b = _mm256_loadu_ps(&src[1 * ld_src]);
+  __m256 c = _mm256_loadu_ps(&src[2 * ld_src]);
+  __m256 d = _mm256_loadu_ps(&src[3 * ld_src]);
+  __m256 e = _mm256_loadu_ps(&src[4 * ld_src]);
+  __m256 f = _mm256_loadu_ps(&src[5 * ld_src]);
+  __m256 g = _mm256_loadu_ps(&src[6 * ld_src]);
+  __m256 h = _mm256_loadu_ps(&src[7 * ld_src]);
+
+  __m256 ta, tb, tc, td, te, tf, tg, th;
+  // unpacking and interleaving 32-bit elements
+  // a0  b0  a1  b1  a4  b4  a5  b5
+  // a2  b2  a3  b3  a6  b6  a7  b7
+  // c0  d0  c1  d1 ...
+  // c2  d2  c3  d3 ...
+  // e0  f0  e1  f1 ...
+  // e2  f2  e3  f3 ...
+  // g0  h0  g1  h1 ...
+  // g2  h2  g3  h3 ...
+  ta = _mm256_unpacklo_ps(a, b);
+  tb = _mm256_unpackhi_ps(a, b);
+  tc = _mm256_unpacklo_ps(c, d);
+  td = _mm256_unpackhi_ps(c, d);
+  te = _mm256_unpacklo_ps(e, f);
+  tf = _mm256_unpackhi_ps(e, f);
+  tg = _mm256_unpacklo_ps(g, h);
+  th = _mm256_unpackhi_ps(g, h);
+
+  // unpacking and interleaving 64-bit elements
+  //  a0  b0  c0  d0  a4  b4  c4  d4
+  //  a1  b1  c1  d1 ...
+  //  a2  b2  c2  d2 ...
+  //  a3  b3  c3  d3 ...
+  //  e0  f0  g0  h0  e4  f4  g4  h4
+  //  e1  f1  g1  h1 ...
+  //  e2  f2  g2  h2 ...
+  //  e3  f3  g3  h3 ...
+  a = _mm256_castpd_ps(
+      _mm256_unpacklo_pd(_mm256_castps_pd(ta), _mm256_castps_pd(tc)));
+  b = _mm256_castpd_ps(
+      _mm256_unpackhi_pd(_mm256_castps_pd(ta), _mm256_castps_pd(tc)));
+  c = _mm256_castpd_ps(
+      _mm256_unpacklo_pd(_mm256_castps_pd(tb), _mm256_castps_pd(td)));
+  d = _mm256_castpd_ps(
+      _mm256_unpackhi_pd(_mm256_castps_pd(tb), _mm256_castps_pd(td)));
+  e = _mm256_castpd_ps(
+      _mm256_unpacklo_pd(_mm256_castps_pd(te), _mm256_castps_pd(tg)));
+  f = _mm256_castpd_ps(
+      _mm256_unpackhi_pd(_mm256_castps_pd(te), _mm256_castps_pd(tg)));
+  g = _mm256_castpd_ps(
+      _mm256_unpacklo_pd(_mm256_castps_pd(tf), _mm256_castps_pd(th)));
+  h = _mm256_castpd_ps(
+      _mm256_unpackhi_pd(_mm256_castps_pd(tf), _mm256_castps_pd(th)));
+
+  //  shuffle 128-bits (composed of 4 32-bit elements)
+  //  a0  b0  c0  d0  e0  f0  g0  h0
+  //  a1  b1  c1  d1 ...
+  //  a2  b2  c2  d2 ...
+  //  a3  b3  c3  d3 ...
+  //  a4  b4  c4  d4 ...
+  //  a5  b5  c5  d5 ...
+  //  a6  b6  c6  d6 ...
+  //  a7  b7  c7  d7 ...
+  ta = _mm256_permute2f128_ps(a, e, 0x20);
+  tb = _mm256_permute2f128_ps(b, f, 0x20);
+  tc = _mm256_permute2f128_ps(c, g, 0x20);
+  td = _mm256_permute2f128_ps(d, h, 0x20);
+  te = _mm256_permute2f128_ps(a, e, 0x31);
+  tf = _mm256_permute2f128_ps(b, f, 0x31);
+  tg = _mm256_permute2f128_ps(c, g, 0x31);
+  th = _mm256_permute2f128_ps(d, h, 0x31);
+
+  // store from registers to dst
+  _mm256_storeu_ps(&dst[0 * ld_dst], ta);
+  _mm256_storeu_ps(&dst[1 * ld_dst], tb);
+  _mm256_storeu_ps(&dst[2 * ld_dst], tc);
+  _mm256_storeu_ps(&dst[3 * ld_dst], td);
+  _mm256_storeu_ps(&dst[4 * ld_dst], te);
+  _mm256_storeu_ps(&dst[5 * ld_dst], tf);
+  _mm256_storeu_ps(&dst[6 * ld_dst], tg);
+  _mm256_storeu_ps(&dst[7 * ld_dst], th);
+}
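+
+// Illustrative usage sketch (hypothetical helper and buffers, not part of the
+// upstream header): transposing one 8x8 tile of a row-major rows x cols matrix
+// `src` into its transposed counterpart `dst` (cols x rows) with the kernel
+// above, assuming the transpose_mxn<T, M, N> primary template declared in
+// vec_base.h. The leading dimensions are the row strides in elements.
+inline void vec256_float_transpose_tile_example(const float* src, float* dst,
+                                                int64_t rows, int64_t cols,
+                                                int64_t tile_row, int64_t tile_col) {
+  transpose_mxn<float, 8, 8>(
+      src + tile_row * cols + tile_col, /*ld_src=*/cols,
+      dst + tile_col * rows + tile_row, /*ld_dst=*/rows);
+}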
+
+#endif
+
+}} // namespace at::vec::CPU_CAPABILITY
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_float_neon.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_float_neon.h
new file mode 100644
index 0000000000000000000000000000000000000000..a8f9a5e74500be63df8b04539a527013ab99f33f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_float_neon.h
@@ -0,0 +1,892 @@
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/irange.h>
+
+#if defined(__aarch64__) && defined(AT_BUILD_ARM_VEC256_WITH_SLEEF)
+#include <sleef.h>
+#endif
+
+// Sleef offers vectorized versions of some transcendentals
+// such as sin, cos, tan, etc.
+// However, for now we opt for the STL versions, since we are not
+// building with Sleef for mobile yet.
+
+namespace at::vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+// Right now this contains only the aarch64 implementation.
+// aarch32 is not currently supported, for the following two reasons:
+// 1. Due to differences between the aarch32 and aarch64 ISAs, intrinsics
+//    that work for aarch64 don't work for aarch32.
+// 2. Android NDK r21 has problems compiling aarch32:
+//    Clang seg faults.
+//    https://github.com/android/ndk/issues/1248
+//    https://bugs.llvm.org/show_bug.cgi?id=45824
+// Most likely we will do aarch32 support with inline asm.
+#if defined(__aarch64__)
+
+#ifdef __BIG_ENDIAN__
+#error "Big endian is not supported."
+#endif
+
+#if defined(AT_BUILD_ARM_VEC256_WITH_SLEEF)
+#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code
+#else
+#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code
+#endif
+
+template<int index, bool mask_val>
+struct BlendRegs {
+  static float32x4_t impl(
+    const float32x4_t& a, const float32x4_t& b, float32x4_t& res);
+};
+
+template<int index>
+struct BlendRegs<index, true>{
+  static float32x4_t impl(
+      const float32x4_t& a, const float32x4_t& b, float32x4_t& res) {
+    return vsetq_lane_f32(vgetq_lane_f32(b, index), res, index);
+  }
+};
+
+template<int index>
+struct BlendRegs<index, false>{
+  static float32x4_t impl(
+      const float32x4_t& a, const float32x4_t& b, float32x4_t& res) {
+    return vsetq_lane_f32(vgetq_lane_f32(a, index), res, index);
+  }
+};
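+
+// Illustrative sketch (hypothetical helper, not part of the upstream header):
+// BlendRegs<index, mask_val>::impl copies lane `index` of b (when mask_val is
+// true) or of a (when false) into `res`; chaining the four lanes is exactly
+// what Vectorized<float>::blend below does with the bits of its compile-time mask.
+inline float32x4_t vec256_neon_blend_low_half_example(const float32x4_t& a,
+                                                      const float32x4_t& b) {
+  float32x4_t res = a;
+  res = BlendRegs<0, true>::impl(a, b, res);   // lane 0 from b
+  res = BlendRegs<1, true>::impl(a, b, res);   // lane 1 from b
+  res = BlendRegs<2, false>::impl(a, b, res);  // lane 2 from a
+  res = BlendRegs<3, false>::impl(a, b, res);  // lane 3 from a
+  return res;
+}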
+
+template <> class Vectorized<float> {
+private:
+  float32x4x2_t values;
+public:
+  using value_type = float;
+  using size_type = int;
+  static constexpr size_type size() {
+    return 8;
+  }
+  Vectorized() {}
+  Vectorized(float32x4x2_t v) : values(v) {}
+  Vectorized(float val) : values{vdupq_n_f32(val), vdupq_n_f32(val) } {}
+  Vectorized(float val0, float val1, float val2, float val3,
+         float val4, float val5, float val6, float val7) :
+         values{val0, val1, val2, val3, val4, val5, val6, val7} {}
+  Vectorized(float32x4_t val0, float32x4_t val1) : values{val0, val1} {}
+  operator float32x4x2_t() const {
+    return values;
+  }
+  template <int64_t mask>
+  static Vectorized blend(const Vectorized& a, const Vectorized& b) {
+    Vectorized vec;
+    // 0.
+    vec.values.val[0] =
+      BlendRegs<0, (mask & 0x01)!=0>::impl(
+          a.values.val[0], b.values.val[0], vec.values.val[0]);
+    vec.values.val[0] =
+      BlendRegs<1, (mask & 0x02)!=0>::impl(
+          a.values.val[0], b.values.val[0], vec.values.val[0]);
+    vec.values.val[0] =
+      BlendRegs<2, (mask & 0x04)!=0>::impl(
+          a.values.val[0], b.values.val[0], vec.values.val[0]);
+    vec.values.val[0] =
+      BlendRegs<3, (mask & 0x08)!=0>::impl(
+          a.values.val[0], b.values.val[0], vec.values.val[0]);
+    // 1.
+    vec.values.val[1] =
+      BlendRegs<0, (mask & 0x10)!=0>::impl(
+          a.values.val[1], b.values.val[1], vec.values.val[1]);
+    vec.values.val[1] =
+      BlendRegs<1, (mask & 0x20)!=0>::impl(
+          a.values.val[1], b.values.val[1], vec.values.val[1]);
+    vec.values.val[1] =
+      BlendRegs<2, (mask & 0x40)!=0>::impl(
+          a.values.val[1], b.values.val[1], vec.values.val[1]);
+    vec.values.val[1] =
+      BlendRegs<3, (mask & 0x80)!=0>::impl(
+          a.values.val[1], b.values.val[1], vec.values.val[1]);
+    return vec;
+  }
+  static Vectorized blendv(const Vectorized& a, const Vectorized& b,
+                              const Vectorized& mask) {
+    // TODO
+    // NB: This requires that each value, i.e., each uint value,
+    // of the mask either all be zeros or all be 1s.
+    // We perhaps need some kind of an assert?
+    // But that will affect performance.
+    Vectorized vec(mask.values);
+    vec.values.val[0] = vbslq_f32(
+        vreinterpretq_u32_f32(vec.values.val[0]),
+        b.values.val[0],
+        a.values.val[0]);
+    vec.values.val[1] = vbslq_f32(
+        vreinterpretq_u32_f32(vec.values.val[1]),
+        b.values.val[1],
+        a.values.val[1]);
+    return vec;
+  }
+  template <typename step_t>
+  static Vectorized<float> arange(float base = 0.f, step_t step = static_cast<step_t>(1)) {
+    const Vectorized base_vec(base);
+    const Vectorized step_vec(step);
+    const Vectorized step_sizes(0, 1, 2, 3, 4, 5, 6, 7);
+    return fmadd(step_sizes, step_vec, base_vec);
+  }
+  static Vectorized set(const Vectorized& a, const Vectorized& b,
+                           int64_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        {
+          Vectorized vec;
+          static uint32x4_t mask_low = {0xFFFFFFFF, 0x0, 0x0, 0x0};
+          vec.values.val[0] = vreinterpretq_f32_u32(mask_low);
+          vec.values.val[1] = a.values.val[1];
+          vec.values.val[0] = vbslq_f32(
+              vreinterpretq_u32_f32(vec.values.val[0]),
+              b.values.val[0],
+              a.values.val[0]);
+          return vec;
+        }
+      case 2:
+        {
+          Vectorized vec;
+          static uint32x4_t mask_low = {0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0};
+          vec.values.val[0] = vreinterpretq_f32_u32(mask_low);
+          vec.values.val[1] = a.values.val[1];
+          vec.values.val[0] = vbslq_f32(
+              vreinterpretq_u32_f32(vec.values.val[0]),
+              b.values.val[0],
+              a.values.val[0]);
+          return vec;
+        }
+      case 3:
+        {
+          Vectorized vec;
+          static uint32x4_t mask_low = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0};
+          vec.values.val[0] = vreinterpretq_f32_u32(mask_low);
+          vec.values.val[1] = a.values.val[1];
+          vec.values.val[0] = vbslq_f32(
+              vreinterpretq_u32_f32(vec.values.val[0]),
+              b.values.val[0],
+              a.values.val[0]);
+          return vec;
+        }
+      case 4:
+        return Vectorized(b.values.val[0], a.values.val[1]);
+      case 5:
+        {
+          Vectorized vec;
+          static uint32x4_t mask_high = {0xFFFFFFFF, 0x0, 0x0, 0x0};
+          vec.values.val[0] = b.values.val[0];
+          vec.values.val[1] = vreinterpretq_f32_u32(mask_high);
+          vec.values.val[1] = vbslq_f32(
+              vreinterpretq_u32_f32(vec.values.val[1]),
+              b.values.val[1],
+              a.values.val[1]);
+          return vec;
+        }
+      case 6:
+        {
+          Vectorized vec;
+          static uint32x4_t mask_high = {0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0};
+          vec.values.val[0] = b.values.val[0];
+          vec.values.val[1] = vreinterpretq_f32_u32(mask_high);
+          vec.values.val[1] = vbslq_f32(
+              vreinterpretq_u32_f32(vec.values.val[1]),
+              b.values.val[1],
+              a.values.val[1]);
+          return vec;
+        }
+      case 7:
+        {
+          Vectorized vec;
+          static uint32x4_t mask_high = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0};
+          vec.values.val[0] = b.values.val[0];
+          vec.values.val[1] = vreinterpretq_f32_u32(mask_high);
+          vec.values.val[1] = vbslq_f32(
+              vreinterpretq_u32_f32(vec.values.val[1]),
+              b.values.val[1],
+              a.values.val[1]);
+          return vec;
+        }
+    }
+    return b;
+  }
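+  // Note on partial loads below: when count < size(), the tail lanes are
+  // zero-filled, either with vdupq_n_f32(0.f) for the exact half-vector case
+  // or via a zero-initialized stack buffer, so unused lanes hold deterministic
+  // values.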
+  static Vectorized<float> loadu(const void* ptr, int64_t count = size()) {
+    if (count == size()) {
+      return vld1q_f32_x2(reinterpret_cast<const float*>(ptr));
+    }
+    else if (count == (size() >> 1)) {
+      Vectorized<float> res;
+      res.values.val[0] = vld1q_f32(reinterpret_cast<const float*>(ptr));
+      res.values.val[1] = vdupq_n_f32(0.f);
+      return res;
+    }
+    else {
+      __at_align__ float tmp_values[size()];
+      for (const auto i : c10::irange(size())) {
+        tmp_values[i] = 0.0;
+      }
+      std::memcpy(
+          tmp_values,
+          reinterpret_cast<const float*>(ptr),
+          count * sizeof(float));
+      return vld1q_f32_x2(reinterpret_cast<const float*>(tmp_values));
+    }
+  }
+  void store(void* ptr, int64_t count = size()) const {
+    if (count == size()) {
+      vst1q_f32_x2(reinterpret_cast<float*>(ptr), values);
+    }
+    else if (count == (size() >> 1)) {
+      vst1q_f32(reinterpret_cast<float*>(ptr), values.val[0]);
+    }
+    else {
+      float tmp_values[size()];
+      vst1q_f32_x2(reinterpret_cast<float*>(tmp_values), values);
+      std::memcpy(ptr, tmp_values, count * sizeof(float));
+    }
+  }
+  inline const float32x4_t& get_low() const {
+    return values.val[0];
+  }
+  inline float32x4_t& get_low() {
+    return values.val[0];
+  }
+  inline const float32x4_t& get_high() const {
+    return values.val[1];
+  }
+  inline float32x4_t& get_high() {
+    return values.val[1];
+  }
+  // Very slow implementation of indexing.
+  // Only required because vec256_qint refers to this.
+  // Once we specialize that implementation for ARM
+  // this should be removed. TODO (kimishpatel)
+  float operator[](int idx) const {
+    __at_align__ float tmp[size()];
+    store(tmp);
+    return tmp[idx];
+  }
+  float operator[](int idx) {
+    __at_align__ float tmp[size()];
+    store(tmp);
+    return tmp[idx];
+  }
+  // For the boolean version, checks such as "any lane set" / "all lanes zero"
+  // could be done faster in a different way.
+  int zero_mask() const {
+    __at_align__ float tmp[size()];
+    store(tmp);
+    int mask = 0;
+    for (int i = 0; i < size(); ++ i) {
+      if (tmp[i] == 0.f) {
+        mask |= (1 << i);
+      }
+    }
+    return mask;
+  }
+  Vectorized<float> isnan() const {
+    __at_align__ float tmp[size()];
+    __at_align__ float res[size()];
+    store(tmp);
+    for (const auto i : c10::irange(size())) {
+      if (_isnan(tmp[i])) {
+        std::memset(static_cast<void*>(&res[i]), 0xFF, sizeof(float));
+      } else {
+        std::memset(static_cast<void*>(&res[i]), 0, sizeof(float));
+      }
+    }
+    return loadu(res);
+  };
+  bool has_inf_nan() const {
+    __at_align__ float tmp[size()];
+    store(tmp);
+    for (const auto i : c10::irange(size())) {
+      if(_isnan(tmp[i]) || _isinf(tmp[i])) {
+        return true;
+      }
+    }
+    return false;
+  }
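+  // map() is the scalar fallback: it spills to an aligned stack buffer,
+  // applies f lane by lane, and reloads. It backs the non-SLEEF paths of the
+  // transcendental functions below (see USE_SLEEF).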
+  Vectorized<float> map(float (*const f)(float)) const {
+    __at_align__ float tmp[size()];
+    store(tmp);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = f(tmp[i]);
+    }
+    return loadu(tmp);
+  }
+  Vectorized<float> abs() const {
+    return Vectorized<float>(vabsq_f32(values.val[0]), vabsq_f32(values.val[1]));
+  }
+  Vectorized<float> angle() const {
+    auto zero = Vectorized<float>(0);
+    auto pi = Vectorized<float>(c10::pi<float>);
+    auto tmp = blendv(zero, pi, *this < zero);
+    return blendv(tmp, *this, isnan());
+  }
+  Vectorized<float> real() const {
+    return *this;
+  }
+  Vectorized<float> imag() const {
+    return Vectorized<float>(0.f);
+  }
+  Vectorized<float> conj() const {
+    return *this;
+  }
+  Vectorized acos() const {
+    return USE_SLEEF(
+      Vectorized(Sleef_acosf4_u10(values.val[0]), Sleef_acosf4_u10(values.val[1])),
+      map(std::acos)
+    );
+  }
+  Vectorized asin() const {
+    return USE_SLEEF(
+      Vectorized(Sleef_asinf4_u10(values.val[0]), Sleef_asinf4_u10(values.val[1])),
+      map(std::asin)
+    );
+  }
+  Vectorized atan() const {
+    return USE_SLEEF(
+      Vectorized(Sleef_atanf4_u10(values.val[0]), Sleef_atanf4_u10(values.val[1])),
+      map(std::atan)
+    );
+  }
+  Vectorized atanh() const {
+    return USE_SLEEF(
+      Vectorized(Sleef_atanhf4_u10(values.val[0]), Sleef_atanhf4_u10(values.val[1])),
+      map(std::atanh)
+    );
+  }
+  Vectorized atan2(const Vectorized &exp) const {
+    USE_SLEEF(
+      {
+        return Vectorized(Sleef_atan2f4_u10(values.val[0], exp.values.val[0]),
+                                 Sleef_atan2f4_u10(values.val[1], exp.values.val[1]));
+      },
+      {
+        __at_align__ float tmp[size()];
+        __at_align__ float tmp_exp[size()];
+        store(tmp);
+        exp.store(tmp_exp);
+        for (const auto i : c10::irange(size())) {
+          tmp[i] = std::atan2(tmp[i], tmp_exp[i]);
+        }
+        return loadu(tmp);
+      }
+    )
+  }
+  Vectorized copysign(const Vectorized &sign) const {
+    USE_SLEEF(
+      {
+        return Vectorized(Sleef_copysignf4(values.val[0], sign.values.val[0]),
+                                 Sleef_copysignf4(values.val[1], sign.values.val[1]));
+      },
+      {
+        __at_align__ float tmp[size()];
+        __at_align__ float tmp_sign[size()];
+        store(tmp);
+        sign.store(tmp_sign);
+        for (size_type i = 0; i < size(); i++) {
+          tmp[i] = std::copysign(tmp[i], tmp_sign[i]);
+        }
+        return loadu(tmp);
+      }
+    )
+  }
+  Vectorized erf() const;
+  Vectorized erfc() const {
+    return USE_SLEEF(
+      Vectorized(Sleef_erfcf4_u15(values.val[0]), Sleef_erfcf4_u15(values.val[1])),
+      map(std::erfc)
+    );
+  }
+  Vectorized erfinv() const {
+    return map(calc_erfinv);
+  }
+  Vectorized exp() const {
+    return USE_SLEEF(
+      Vectorized(Sleef_expf4_u10(values.val[0]), Sleef_expf4_u10(values.val[1])),
+      map(std::exp)
+    );
+  }
+  Vectorized exp2() const {
+    return USE_SLEEF(
+        Vectorized(Sleef_exp2f4_u10(values.val[0]), Sleef_exp2f4_u10(values.val[1])),
+        map(std::exp2)
+      );
+  }
+  Vectorized expm1() const {
+    return USE_SLEEF(
+      Vectorized(Sleef_expm1f4_u10(values.val[0]), Sleef_expm1f4_u10(values.val[1])),
+      map(std::expm1)
+    );
+  }
+  Vectorized exp_u20() const {
+    return exp();
+  }
+  Vectorized fmod(const Vectorized& q) const {
+    USE_SLEEF(
+      {
+        return Vectorized(Sleef_fmodf4(values.val[0], q.values.val[0]),
+                                 Sleef_fmodf4(values.val[1], q.values.val[1]));
+      },
+      {
+        __at_align__ float tmp[size()];
+        __at_align__ float tmp_q[size()];
+        store(tmp);
+        q.store(tmp_q);
+        for (const auto i : c10::irange(size())) {
+          tmp[i] = std::fmod(tmp[i], tmp_q[i]);
+        }
+        return loadu(tmp);
+      }
+    )
+  }
+  Vectorized hypot(const Vectorized &b) const {
+    USE_SLEEF(
+      {
+        return Vectorized(Sleef_hypotf4_u05(values.val[0], b.values.val[0]),
+                                 Sleef_hypotf4_u05(values.val[1], b.values.val[1]));
+      },
+      {
+        __at_align__ float tmp[size()];
+        __at_align__ float tmp_b[size()];
+        store(tmp);
+        b.store(tmp_b);
+        for (const auto i : c10::irange(size())) {
+          tmp[i] = std::hypot(tmp[i], tmp_b[i]);
+        }
+        return loadu(tmp);
+      }
+    )
+  }
+  Vectorized i0() const {
+    return map(calc_i0);
+  }
+  Vectorized i0e() const {
+    return map(calc_i0e);
+  }
+  Vectorized digamma() const {
+    return map(calc_digamma);
+  }
+  Vectorized igamma(const Vectorized &x) const {
+    __at_align__ float tmp[size()];
+    __at_align__ float tmp_x[size()];
+    store(tmp);
+    x.store(tmp_x);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
+    }
+    return loadu(tmp);
+  }
+  Vectorized igammac(const Vectorized &x) const {
+    __at_align__ float tmp[size()];
+    __at_align__ float tmp_x[size()];
+    store(tmp);
+    x.store(tmp_x);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
+    }
+    return loadu(tmp);
+  }
+  Vectorized log() const {
+    return USE_SLEEF(
+      Vectorized(Sleef_logf4_u10(values.val[0]), Sleef_logf4_u10(values.val[1])),
+      map(std::log)
+    );
+  }
+  Vectorized log10() const {
+    return USE_SLEEF(
+      Vectorized(Sleef_log10f4_u10(values.val[0]), Sleef_log10f4_u10(values.val[1])),
+      map(std::log10)
+    );
+  }
+  Vectorized log1p() const {
+    return USE_SLEEF(
+      Vectorized(Sleef_log1pf4_u10(values.val[0]), Sleef_log1pf4_u10(values.val[1])),
+      map(std::log1p)
+    );
+  }
+  Vectorized log2() const {
+    return USE_SLEEF(
+      Vectorized(Sleef_log2f4_u10(values.val[0]), Sleef_log2f4_u10(values.val[1])),
+      map(std::log2)
+    );
+  }
+  Vectorized nextafter(const Vectorized &b) const {
+    USE_SLEEF(
+      {
+        return Vectorized(Sleef_nextafterf4(values.val[0], b.values.val[0]),
+                                 Sleef_nextafterf4(values.val[1], b.values.val[1]));
+      },
+      {
+        __at_align__ float tmp[size()];
+        __at_align__ float tmp_b[size()];
+        store(tmp);
+        b.store(tmp_b);
+        for (const auto i : c10::irange(size())) {
+          tmp[i] = std::nextafter(tmp[i], tmp_b[i]);
+        }
+        return loadu(tmp);
+      }
+    )
+  }
+  Vectorized frac() const;
+  Vectorized sin() const {
+    return USE_SLEEF(
+      Vectorized(Sleef_sinf4_u10(values.val[0]), Sleef_sinf4_u10(values.val[1])),
+      map(std::sin)
+    );
+  }
+  Vectorized sinh() const {
+    return USE_SLEEF(
+      Vectorized(Sleef_sinhf4_u10(values.val[0]), Sleef_sinhf4_u10(values.val[1])),
+      map(std::sinh)
+    );
+  }
+  Vectorized cos() const {
+    return USE_SLEEF(
+      Vectorized(Sleef_cosf4_u10(values.val[0]), Sleef_cosf4_u10(values.val[1])),
+      map(std::cos)
+    );
+  }
+  Vectorized cosh() const {
+    return USE_SLEEF(
+      Vectorized(Sleef_coshf4_u10(values.val[0]), Sleef_coshf4_u10(values.val[1])),
+      map(std::cosh)
+    );
+  }
+  Vectorized ceil() const {
+    return map(at::native::ceil_impl);
+  }
+  Vectorized floor() const {
+    return map(at::native::floor_impl);
+  }
+  Vectorized neg() const {
+    return Vectorized(
+        vnegq_f32(values.val[0]),
+        vnegq_f32(values.val[1]));
+  }
+  Vectorized round() const {
+    // We do not use std::round because we would like to round midway numbers to the nearest even integer.
+    return map(at::native::round_impl);
+  }
+  Vectorized tan() const {
+    return USE_SLEEF(
+      Vectorized(Sleef_tanf4_u10(values.val[0]), Sleef_tanf4_u10(values.val[1])),
+      map(std::tan)
+    );
+  }
+  Vectorized tanh() const {
+    return USE_SLEEF(
+      Vectorized(Sleef_tanhf4_u10(values.val[0]), Sleef_tanhf4_u10(values.val[1])),
+      map(std::tanh)
+    );
+  }
+  Vectorized trunc() const {
+    float32x4_t r0 = vrndq_f32(values.val[0]);
+    float32x4_t r1 = vrndq_f32(values.val[1]);
+    return Vectorized(r0, r1);
+  }
+  Vectorized lgamma() const {
+    return USE_SLEEF(
+      Vectorized(Sleef_lgammaf4_u10(values.val[0]), Sleef_lgammaf4_u10(values.val[1])),
+      map(std::lgamma)
+    );
+  }
+  Vectorized sqrt() const {
+    return Vectorized(
+        vsqrtq_f32(values.val[0]),
+        vsqrtq_f32(values.val[1]));
+  }
+  Vectorized reciprocal() const {
+    auto r0 = vdivq_f32(vdupq_n_f32(1.0f), values.val[0]);
+    auto r1 = vdivq_f32(vdupq_n_f32(1.0f), values.val[1]);
+    return Vectorized(r0, r1);
+  }
+  Vectorized rsqrt() const {
+    return this->sqrt().reciprocal();
+  }
+  Vectorized pow(const Vectorized &exp) const {
+    USE_SLEEF(
+      {
+        return Vectorized(Sleef_powf4_u10(values.val[0], exp.values.val[0]),
+                                 Sleef_powf4_u10(values.val[1], exp.values.val[1]));
+      },
+      {
+        __at_align__ float tmp[size()];
+        __at_align__ float tmp_exp[size()];
+        store(tmp);
+        exp.store(tmp_exp);
+        for (const auto i : c10::irange(size())) {
+          tmp[i] = std::pow(tmp[i], tmp_exp[i]);
+        }
+        return loadu(tmp);
+      }
+    )
+  }
+  Vectorized operator==(const Vectorized& other) const {
+    float32x4_t r0 =
+      vreinterpretq_f32_u32(vceqq_f32(values.val[0], other.values.val[0]));
+    float32x4_t r1 =
+      vreinterpretq_f32_u32(vceqq_f32(values.val[1], other.values.val[1]));
+    return Vectorized(r0, r1);
+  }
+
+  Vectorized operator!=(const Vectorized& other) const {
+    float32x4_t r0 = vreinterpretq_f32_u32(
+        vmvnq_u32(vceqq_f32(values.val[0], other.values.val[0])));
+    float32x4_t r1 = vreinterpretq_f32_u32(
+        vmvnq_u32(vceqq_f32(values.val[1], other.values.val[1])));
+    return Vectorized(r0, r1);
+  }
+
+  Vectorized operator<(const Vectorized& other) const {
+    float32x4_t r0 =
+      vreinterpretq_f32_u32(vcltq_f32(values.val[0], other.values.val[0]));
+    float32x4_t r1 =
+      vreinterpretq_f32_u32(vcltq_f32(values.val[1], other.values.val[1]));
+    return Vectorized(r0, r1);
+  }
+
+  Vectorized operator<=(const Vectorized& other) const {
+    float32x4_t r0 =
+      vreinterpretq_f32_u32(vcleq_f32(values.val[0], other.values.val[0]));
+    float32x4_t r1 =
+      vreinterpretq_f32_u32(vcleq_f32(values.val[1], other.values.val[1]));
+    return Vectorized(r0, r1);
+  }
+
+  Vectorized operator>(const Vectorized& other) const {
+    float32x4_t r0 =
+      vreinterpretq_f32_u32(vcgtq_f32(values.val[0], other.values.val[0]));
+    float32x4_t r1 =
+      vreinterpretq_f32_u32(vcgtq_f32(values.val[1], other.values.val[1]));
+    return Vectorized(r0, r1);
+  }
+
+  Vectorized operator>=(const Vectorized& other) const {
+    float32x4_t r0 =
+      vreinterpretq_f32_u32(vcgeq_f32(values.val[0], other.values.val[0]));
+    float32x4_t r1 =
+      vreinterpretq_f32_u32(vcgeq_f32(values.val[1], other.values.val[1]));
+    return Vectorized(r0, r1);
+  }
+
+  Vectorized eq(const Vectorized& other) const;
+  Vectorized ne(const Vectorized& other) const;
+  Vectorized gt(const Vectorized& other) const;
+  Vectorized ge(const Vectorized& other) const;
+  Vectorized lt(const Vectorized& other) const;
+  Vectorized le(const Vectorized& other) const;
+};
+
+template <>
+Vectorized inline operator+(const Vectorized& a, const Vectorized& b) {
+  float32x4_t r0 = vaddq_f32(a.get_low(), b.get_low());
+  float32x4_t r1 = vaddq_f32(a.get_high(), b.get_high());
+  return Vectorized(r0, r1);
+}
+
+template <>
+Vectorized inline operator-(const Vectorized& a, const Vectorized& b) {
+  float32x4_t r0 = vsubq_f32(a.get_low(), b.get_low());
+  float32x4_t r1 = vsubq_f32(a.get_high(), b.get_high());
+  return Vectorized(r0, r1);
+}
+
+template <>
+Vectorized inline operator*(const Vectorized& a, const Vectorized& b) {
+  float32x4_t r0 = vmulq_f32(a.get_low(), b.get_low());
+  float32x4_t r1 = vmulq_f32(a.get_high(), b.get_high());
+  return Vectorized(r0, r1);
+}
+
+template <>
+Vectorized inline operator/(const Vectorized& a, const Vectorized& b) {
+  float32x4_t r0 = vdivq_f32(a.get_low(), b.get_low());
+  float32x4_t r1 = vdivq_f32(a.get_high(), b.get_high());
+  return Vectorized(r0, r1);
+}
+
+// frac. Implement this here so we can use subtraction
+inline Vectorized Vectorized::frac() const {
+  return *this - this->trunc();
+}
+
+// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if
+// either input is a NaN.
+template <>
+Vectorized inline maximum(const Vectorized& a, const Vectorized& b) {
+  float32x4_t r0 = vmaxq_f32(a.get_low(), b.get_low());
+  float32x4_t r1 = vmaxq_f32(a.get_high(), b.get_high());
+  return Vectorized(r0, r1);
+}
+
+// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if
+// either input is a NaN.
+template <>
+Vectorized inline minimum(const Vectorized& a, const Vectorized& b) {
+  float32x4_t r0 = vminq_f32(a.get_low(), b.get_low());
+  float32x4_t r1 = vminq_f32(a.get_high(), b.get_high());
+  return Vectorized(r0, r1);
+}
+
+template <>
+Vectorized inline clamp(const Vectorized& a, const Vectorized& min, const Vectorized& max) {
+  return minimum(max, maximum(min, a));
+}
+
+template <>
+Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max) {
+  return minimum(max, a);
+}
+
+template <>
+Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min) {
+  return maximum(min, a);
+}
+
+template <>
+Vectorized inline operator&(const Vectorized& a, const Vectorized& b) {
+  float32x4_t r0 = vreinterpretq_f32_u32(vandq_u32(
+      vreinterpretq_u32_f32(a.get_low()),
+      vreinterpretq_u32_f32(b.get_low())));
+  float32x4_t r1 = vreinterpretq_f32_u32(vandq_u32(
+      vreinterpretq_u32_f32(a.get_high()),
+      vreinterpretq_u32_f32(b.get_high())));
+  return Vectorized(r0, r1);
+}
+
+template <>
+Vectorized inline operator|(const Vectorized& a, const Vectorized& b) {
+  float32x4_t r0 = vreinterpretq_f32_u32(vorrq_u32(
+      vreinterpretq_u32_f32(a.get_low()),
+      vreinterpretq_u32_f32(b.get_low())));
+  float32x4_t r1 = vreinterpretq_f32_u32(vorrq_u32(
+      vreinterpretq_u32_f32(a.get_high()),
+      vreinterpretq_u32_f32(b.get_high())));
+  return Vectorized(r0, r1);
+}
+
+template <>
+Vectorized inline operator^(const Vectorized& a, const Vectorized& b) {
+  float32x4_t r0 = vreinterpretq_f32_u32(veorq_u32(
+      vreinterpretq_u32_f32(a.get_low()),
+      vreinterpretq_u32_f32(b.get_low())));
+  float32x4_t r1 = vreinterpretq_f32_u32(veorq_u32(
+      vreinterpretq_u32_f32(a.get_high()),
+      vreinterpretq_u32_f32(b.get_high())));
+  return Vectorized(r0, r1);
+}
+
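+// The comparison operators above return per-lane all-ones / all-zero bit
+// patterns. The eq/ne/gt/ge/lt/le helpers below turn those masks into numeric
+// 0.0f / 1.0f results by ANDing with 1.0f.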
+inline Vectorized Vectorized::eq(const Vectorized& other) const {
+  return (*this == other) & Vectorized(1.0f);
+}
+
+inline Vectorized Vectorized::ne(const Vectorized& other) const {
+  return (*this != other) & Vectorized(1.0f);
+}
+
+inline Vectorized Vectorized::gt(const Vectorized& other) const {
+  return (*this > other) & Vectorized(1.0f);
+}
+
+inline Vectorized Vectorized::ge(const Vectorized& other) const {
+  return (*this >= other) & Vectorized(1.0f);
+}
+
+inline Vectorized Vectorized::lt(const Vectorized& other) const {
+  return (*this < other) & Vectorized(1.0f);
+}
+
+inline Vectorized Vectorized::le(const Vectorized& other) const {
+  return (*this <= other) & Vectorized(1.0f);
+}
+
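+// The vectorized loops below convert eight elements per iteration with two
+// 4-lane NEON conversions; a scalar tail loop handles the remainder. Note that
+// vcvtq_s32_f32 truncates toward zero, matching static_cast<int32_t>.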
+template <>
+inline void convert(const float* src, int32_t* dst, int64_t n) {
+  int64_t i;
+#pragma unroll
+  for (i = 0; i <= (n - Vectorized<float>::size()); i += Vectorized<float>::size()) {
+    vst1q_s32(dst + i, vcvtq_s32_f32(vld1q_f32(src + i)));
+    vst1q_s32(dst + i + 4, vcvtq_s32_f32(vld1q_f32(src + i + 4)));
+  }
+#pragma unroll
+  for (; i < n; i++) {
+    dst[i] = static_cast<int32_t>(src[i]);
+  }
+}
+
+template <>
+inline void convert(const int32_t* src, float* dst, int64_t n) {
+  int64_t i;
+#pragma unroll
+  for (i = 0; i <= (n - Vectorized<float>::size()); i += Vectorized<float>::size()) {
+    vst1q_f32(dst + i, vcvtq_f32_s32(vld1q_s32(src + i)));
+    vst1q_f32(dst + i + 4, vcvtq_f32_s32(vld1q_s32(src + i + 4)));
+  }
+#pragma unroll
+  for (; i < n; i++) {
+    dst[i] = static_cast<float>(src[i]);
+  }
+}
+
+template <>
+Vectorized inline fmadd(const Vectorized& a, const Vectorized& b, const Vectorized& c) {
+  float32x4_t r0 = vfmaq_f32(c.get_low(), a.get_low(), b.get_low());
+  float32x4_t r1 = vfmaq_f32(c.get_high(), a.get_high(), b.get_high());
+  return Vectorized(r0, r1);
+}
+
+template <>
+Vectorized inline fmsub(const Vectorized& a, const Vectorized& b, const Vectorized& c) {
+  float32x4_t r0 = vfmsq_f32(c.get_low(), a.get_low(), b.get_low());
+  float32x4_t r1 = vfmsq_f32(c.get_high(), a.get_high(), b.get_high());
+  return Vectorized(r0, r1);
+}
+
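+// erf() below uses the Abramowitz & Stegun 7.1.26 polynomial approximation:
+//   erf(x) ~= sign(x) * (1 - r * t * exp(-x * x)),  t = 1 / (1 + p * |x|),
+// where r is a degree-4 polynomial in t with coefficients p1..p5. The absolute
+// error of this approximation is bounded by about 1.5e-7.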
+inline Vectorized Vectorized::erf() const{
+    // constants
+    const Vectorized neg_zero_vec(-0.f);
+    const Vectorized one_vec(1.0f);
+    const Vectorized p(0.3275911f);
+    const Vectorized p1(0.254829592f);
+    const Vectorized p2(-0.284496736f);
+    const Vectorized p3(1.421413741f);
+    const Vectorized p4(-1.453152027f);
+    const Vectorized p5(1.061405429f);
+    // sign(x)
+    auto sign_mask = neg_zero_vec & *this;
+    auto abs_vec = this->abs();
+    // t = 1 / (p * abs(x) + 1)
+    auto tmp0 = fmadd(p, abs_vec, one_vec);
+    auto t = one_vec / tmp0;
+    // r = p5 * t ^ 4 + p4 * t ^ 3 + p3 * t ^ 2 + p2 * t + p1
+    auto tmp1 = fmadd(p5, t, p4);
+    auto tmp2 = fmadd(tmp1, t, p3);
+    auto tmp3 = fmadd(tmp2, t, p2);
+    auto r = fmadd(tmp3, t, p1);
+    // - exp(- x * x)
+    auto pow_2 = (*this) * (*this);
+    auto neg_pow_2 = pow_2 ^ neg_zero_vec;
+    auto tmp4 = neg_pow_2.map(std::exp); // This can be swapped for a faster implementation of exp.
+    auto tmp5 = tmp4 ^ neg_zero_vec;
+    // erf(x) = sign(x) * (1 - r * t * exp(- x * x))
+    auto tmp6 = t * tmp5;
+    auto tmp7 = fmadd(tmp6, r, one_vec);
+    return tmp7 ^ sign_mask;
+}
+#endif /* defined(aarch64) */
+
+}} // namespace at::vec::CPU_CAPABILITY
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_int.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_int.h
new file mode 100644
index 0000000000000000000000000000000000000000..5f337fea3bfdf20a44e69600e3368b408f537623
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_int.h
@@ -0,0 +1,1586 @@
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/irange.h>
+
+namespace at::vec {
+inline namespace CPU_CAPABILITY {
+
+#ifdef CPU_CAPABILITY_AVX2
+
+struct Vectorizedi {
+protected:
+  __m256i values;
+
+  static inline __m256i invert(const __m256i& v) {
+    const auto ones = _mm256_set1_epi64x(-1);
+    return _mm256_xor_si256(ones, v);
+  }
+public:
+  Vectorizedi() {}
+  Vectorizedi(__m256i v) : values(v) {}
+  operator __m256i() const {
+    return values;
+  }
+};
+
+#else
+
+struct Vectorizedi {};  // dummy definition to make Vectorizedi always defined
+
+#endif // CPU_CAPABILITY_AVX2
+
+#ifdef CPU_CAPABILITY_AVX2
+
+template <>
+class Vectorized<int64_t> : public Vectorizedi {
+private:
+  static const Vectorized ones;
+public:
+  using value_type = int64_t;
+  using size_type = int;
+  static constexpr size_type size() {
+    return 4;
+  }
+  using Vectorizedi::Vectorizedi;
+  Vectorized() {}
+  Vectorized(int64_t v) { values = _mm256_set1_epi64x(v); }
+  Vectorized(int64_t val1, int64_t val2, int64_t val3, int64_t val4) {
+    values = _mm256_setr_epi64x(val1, val2, val3, val4);
+  }
+  template <int64_t mask>
+  static Vectorized<int64_t> blend(Vectorized<int64_t> a, Vectorized<int64_t> b) {
+    __at_align__ int64_t tmp_values[size()];
+    a.store(tmp_values);
+    if (mask & 0x01)
+      tmp_values[0] = _mm256_extract_epi64(b.values, 0);
+    if (mask & 0x02)
+      tmp_values[1] = _mm256_extract_epi64(b.values, 1);
+    if (mask & 0x04)
+      tmp_values[2] = _mm256_extract_epi64(b.values, 2);
+    if (mask & 0x08)
+      tmp_values[3] = _mm256_extract_epi64(b.values, 3);
+    return loadu(tmp_values);
+  }
+  static Vectorized blendv(const Vectorized& a, const Vectorized& b,
+                                const Vectorized& mask) {
+    return _mm256_blendv_epi8(a.values, b.values, mask.values);
+  }
+  template <typename step_t>
+  static Vectorized<int64_t> arange(int64_t base = 0, step_t step = static_cast<step_t>(1)) {
+    return Vectorized<int64_t>(base, base + step, base + 2 * step, base + 3 * step);
+  }
+  static Vectorized
+  set(Vectorized a, Vectorized b, int64_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+    }
+    return b;
+  }
+  static Vectorized<int64_t> loadu(const void* ptr) {
+    return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+  }
+  static Vectorized loadu(const void* ptr, int64_t count) {
+    __at_align__ int64_t tmp_values[size()];
+    // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
+    // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
+    // instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(size())) {
+      tmp_values[i] = 0;
+    }
+    std::memcpy(tmp_values, ptr, count * sizeof(int64_t));
+    return loadu(tmp_values);
+  }
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      // ptr need not to be aligned here. See
+      // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html
+      _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
+    } else if (count > 0) {
+      __at_align__ int64_t tmp_values[size()];
+      _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
+      std::memcpy(ptr, tmp_values, count * sizeof(int64_t));
+    }
+  }
+  const int64_t& operator[](int idx) const  = delete;
+  int64_t& operator[](int idx)  = delete;
+  Vectorized abs() const {
+    auto zero = _mm256_set1_epi64x(0);
+    auto is_larger = _mm256_cmpgt_epi64(zero, values);
+    auto inverse = _mm256_xor_si256(values, is_larger);
+    return _mm256_sub_epi64(inverse, is_larger);
+  }
+  Vectorized real() const {
+    return *this;
+  }
+  Vectorized imag() const {
+    return _mm256_set1_epi64x(0);
+  }
+  Vectorized conj() const {
+    return *this;
+  }
+  Vectorized neg() const;
+  Vectorized operator==(const Vectorized& other) const {
+    return _mm256_cmpeq_epi64(values, other.values);
+  }
+  Vectorized operator!=(const Vectorized& other) const {
+    return invert(_mm256_cmpeq_epi64(values, other.values));
+  }
+  Vectorized operator<(const Vectorized& other) const {
+    return _mm256_cmpgt_epi64(other.values, values);
+  }
+  Vectorized operator<=(const Vectorized& other) const {
+    return invert(_mm256_cmpgt_epi64(values, other.values));
+  }
+  Vectorized operator>(const Vectorized& other) const {
+    return _mm256_cmpgt_epi64(values, other.values);
+  }
+  Vectorized operator>=(const Vectorized& other) const {
+    return invert(_mm256_cmpgt_epi64(other.values, values));
+  }
+
+  Vectorized eq(const Vectorized& other) const;
+  Vectorized ne(const Vectorized& other) const;
+  Vectorized gt(const Vectorized& other) const;
+  Vectorized ge(const Vectorized& other) const;
+  Vectorized lt(const Vectorized& other) const;
+  Vectorized le(const Vectorized& other) const;
+};
+
+template <>
+class Vectorized<int32_t> : public Vectorizedi {
+private:
+  static const Vectorized ones;
+public:
+  using value_type = int32_t;
+  static constexpr int size() {
+    return 8;
+  }
+  using Vectorizedi::Vectorizedi;
+  Vectorized() {}
+  Vectorized(int32_t v) { values = _mm256_set1_epi32(v); }
+  Vectorized(int32_t val1, int32_t val2, int32_t val3, int32_t val4,
+         int32_t val5, int32_t val6, int32_t val7, int32_t val8) {
+    values = _mm256_setr_epi32(val1, val2, val3, val4, val5, val6, val7, val8);
+  }
+  template <int64_t mask>
+  static Vectorized<int32_t> blend(Vectorized<int32_t> a, Vectorized<int32_t> b) {
+    return _mm256_blend_epi32(a, b, mask);
+  }
+  static Vectorized blendv(const Vectorized& a, const Vectorized& b,
+                                const Vectorized& mask) {
+    return _mm256_blendv_epi8(a.values, b.values, mask.values);
+  }
+  template <typename step_t>
+  static Vectorized<int32_t> arange(int32_t base = 0, step_t step = static_cast<step_t>(1)) {
+    return Vectorized<int32_t>(
+      base,            base +     step, base + 2 * step, base + 3 * step,
+      base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step);
+  }
+  static Vectorized
+  set(Vectorized a, Vectorized b, int32_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+      case 4:
+        return blend<15>(a, b);
+      case 5:
+        return blend<31>(a, b);
+      case 6:
+        return blend<63>(a, b);
+      case 7:
+        return blend<127>(a, b);
+    }
+    return b;
+  }
+  static Vectorized<int32_t> loadu(const void* ptr) {
+    return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+  }
+  static Vectorized loadu(const void* ptr, int32_t count) {
+    __at_align__ int32_t tmp_values[size()];
+    // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
+    // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
+    // instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(size())) {
+      tmp_values[i] = 0;
+    }
+    std::memcpy(tmp_values, ptr, count * sizeof(int32_t));
+    return loadu(tmp_values);
+  }
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      // ptr need not to be aligned here. See
+      // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html
+      _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
+    } else if (count > 0) {
+      __at_align__ int32_t tmp_values[size()];
+      _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
+      std::memcpy(ptr, tmp_values, count * sizeof(int32_t));
+    }
+  }
+  const int32_t& operator[](int idx) const  = delete;
+  int32_t& operator[](int idx)  = delete;
+  Vectorized abs() const {
+    return _mm256_abs_epi32(values);
+  }
+  Vectorized real() const {
+    return *this;
+  }
+  Vectorized imag() const {
+    return _mm256_set1_epi32(0);
+  }
+  Vectorized conj() const {
+    return *this;
+  }
+  Vectorized neg() const;
+  Vectorized operator==(const Vectorized& other) const {
+    return _mm256_cmpeq_epi32(values, other.values);
+  }
+  Vectorized operator!=(const Vectorized& other) const {
+    return invert(_mm256_cmpeq_epi32(values, other.values));
+  }
+  Vectorized operator<(const Vectorized& other) const {
+    return _mm256_cmpgt_epi32(other.values, values);
+  }
+  Vectorized operator<=(const Vectorized& other) const {
+    return invert(_mm256_cmpgt_epi32(values, other.values));
+  }
+  Vectorized operator>(const Vectorized& other) const {
+    return _mm256_cmpgt_epi32(values, other.values);
+  }
+  Vectorized operator>=(const Vectorized& other) const {
+    return invert(_mm256_cmpgt_epi32(other.values, values));
+  }
+  Vectorized eq(const Vectorized& other) const;
+  Vectorized ne(const Vectorized& other) const;
+  Vectorized gt(const Vectorized& other) const;
+  Vectorized ge(const Vectorized& other) const;
+  Vectorized lt(const Vectorized& other) const;
+  Vectorized le(const Vectorized& other) const;
+};
+
+template <>
+inline void convert(const int32_t *src, float *dst, int64_t n) {
+  int64_t i;
+  // int32_t and float have same size
+#ifndef _MSC_VER
+# pragma unroll
+#endif
+  for (i = 0; i <= (n - Vectorized<int32_t>::size()); i += Vectorized<int32_t>::size()) {
+    auto input_vec = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i));
+    auto output_vec = _mm256_cvtepi32_ps(input_vec);
+    _mm256_storeu_ps(reinterpret_cast<float*>(dst + i), output_vec);
+  }
+#ifndef _MSC_VER
+# pragma unroll
+#endif
+  for (; i < n; i++) {
+    dst[i] = static_cast<float>(src[i]);
+  }
+}
+
+template <>
+inline void convert(const int32_t *src, double *dst, int64_t n) {
+  int64_t i;
+  // int32_t has half the size of double
+#ifndef _MSC_VER
+# pragma unroll
+#endif
+  for (i = 0; i <= (n - Vectorized<double>::size()); i += Vectorized<double>::size()) {
+    auto input_128_vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + i));
+    auto output_vec = _mm256_cvtepi32_pd(input_128_vec);
+    _mm256_storeu_pd(reinterpret_cast<double*>(dst + i), output_vec);
+  }
+#ifndef _MSC_VER
+# pragma unroll
+#endif
+  for (; i < n; i++) {
+    dst[i] = static_cast<double>(src[i]);
+  }
+}
+
+template <>
+class Vectorized<int16_t> : public Vectorizedi {
+private:
+  static const Vectorized ones;
+public:
+  using value_type = int16_t;
+  static constexpr int size() {
+    return 16;
+  }
+  using Vectorizedi::Vectorizedi;
+  Vectorized() {}
+  Vectorized(int16_t v) { values = _mm256_set1_epi16(v); }
+  Vectorized(int16_t val1, int16_t val2, int16_t val3, int16_t val4,
+         int16_t val5, int16_t val6, int16_t val7, int16_t val8,
+         int16_t val9, int16_t val10, int16_t val11, int16_t val12,
+         int16_t val13, int16_t val14, int16_t val15, int16_t val16) {
+    values = _mm256_setr_epi16(val1, val2, val3, val4, val5, val6, val7, val8,
+                               val9, val10, val11, val12, val13, val14, val15, val16);
+  }
+  template <int64_t mask>
+  static Vectorized<int16_t> blend(Vectorized<int16_t> a, Vectorized<int16_t> b) {
+    __at_align__ int16_t tmp_values[size()];
+    a.store(tmp_values);
+    if (mask & 0x01)
+      tmp_values[0] = _mm256_extract_epi16(b.values, 0);
+    if (mask & 0x02)
+      tmp_values[1] = _mm256_extract_epi16(b.values, 1);
+    if (mask & 0x04)
+      tmp_values[2] = _mm256_extract_epi16(b.values, 2);
+    if (mask & 0x08)
+      tmp_values[3] = _mm256_extract_epi16(b.values, 3);
+    if (mask & 0x10)
+      tmp_values[4] = _mm256_extract_epi16(b.values, 4);
+    if (mask & 0x20)
+      tmp_values[5] = _mm256_extract_epi16(b.values, 5);
+    if (mask & 0x40)
+      tmp_values[6] = _mm256_extract_epi16(b.values, 6);
+    if (mask & 0x80)
+      tmp_values[7] = _mm256_extract_epi16(b.values, 7);
+    if (mask & 0x100)
+      tmp_values[8] = _mm256_extract_epi16(b.values, 8);
+    if (mask & 0x200)
+      tmp_values[9] = _mm256_extract_epi16(b.values, 9);
+    if (mask & 0x400)
+      tmp_values[10] = _mm256_extract_epi16(b.values, 10);
+    if (mask & 0x800)
+      tmp_values[11] = _mm256_extract_epi16(b.values, 11);
+    if (mask & 0x1000)
+      tmp_values[12] = _mm256_extract_epi16(b.values, 12);
+    if (mask & 0x2000)
+      tmp_values[13] = _mm256_extract_epi16(b.values, 13);
+    if (mask & 0x4000)
+      tmp_values[14] = _mm256_extract_epi16(b.values, 14);
+    if (mask & 0x8000)
+      tmp_values[15] = _mm256_extract_epi16(b.values, 15);
+    return loadu(tmp_values);
+  }
+  static Vectorized blendv(const Vectorized& a, const Vectorized& b,
+                                const Vectorized& mask) {
+    return _mm256_blendv_epi8(a.values, b.values, mask.values);
+  }
+  template <typename step_t>
+  static Vectorized<int16_t> arange(int16_t base = 0, step_t step = static_cast<step_t>(1)) {
+    return Vectorized<int16_t>(
+      base,             base +      step, base +  2 * step, base +  3 * step,
+      base +  4 * step, base +  5 * step, base +  6 * step, base +  7 * step,
+      base +  8 * step, base +  9 * step, base + 10 * step, base + 11 * step,
+      base + 12 * step, base + 13 * step, base + 14 * step, base + 15 * step);
+  }
+  static Vectorized
+  set(Vectorized a, Vectorized b, int16_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+      case 4:
+        return blend<15>(a, b);
+      case 5:
+        return blend<31>(a, b);
+      case 6:
+        return blend<63>(a, b);
+      case 7:
+        return blend<127>(a, b);
+      case 8:
+        return blend<255>(a, b);
+      case 9:
+        return blend<511>(a, b);
+      case 10:
+        return blend<1023>(a, b);
+      case 11:
+        return blend<2047>(a, b);
+      case 12:
+        return blend<4095>(a, b);
+      case 13:
+        return blend<8191>(a, b);
+      case 14:
+        return blend<16383>(a, b);
+      case 15:
+        return blend<32767>(a, b);
+    }
+    return b;
+  }
+  static Vectorized<int16_t> loadu(const void* ptr) {
+    return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+  }
+  static Vectorized loadu(const void* ptr, int16_t count) {
+    __at_align__ int16_t tmp_values[size()];
+    // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
+    // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
+    // instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(size())) {
+      tmp_values[i] = 0;
+    }
+    std::memcpy(tmp_values, ptr, count * sizeof(int16_t));
+    return loadu(tmp_values);
+  }
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      // ptr need not to be aligned here. See
+      // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html
+      _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
+    } else if (count > 0) {
+      __at_align__ int16_t tmp_values[size()];
+      _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
+      std::memcpy(ptr, tmp_values, count * sizeof(int16_t));
+    }
+  }
+  const int16_t& operator[](int idx) const  = delete;
+  int16_t& operator[](int idx)  = delete;
+  Vectorized abs() const {
+    return _mm256_abs_epi16(values);
+  }
+  Vectorized real() const {
+    return *this;
+  }
+  Vectorized imag() const {
+    return _mm256_set1_epi16(0);
+  }
+  Vectorized conj() const {
+    return *this;
+  }
+  Vectorized neg() const;
+  Vectorized operator==(const Vectorized& other) const {
+    return _mm256_cmpeq_epi16(values, other.values);
+  }
+  Vectorized operator!=(const Vectorized& other) const {
+    return invert(_mm256_cmpeq_epi16(values, other.values));
+  }
+  Vectorized operator<(const Vectorized& other) const {
+    return _mm256_cmpgt_epi16(other.values, values);
+  }
+  Vectorized operator<=(const Vectorized& other) const {
+    return invert(_mm256_cmpgt_epi16(values, other.values));
+  }
+  Vectorized operator>(const Vectorized& other) const {
+    return _mm256_cmpgt_epi16(values, other.values);
+  }
+  Vectorized operator>=(const Vectorized& other) const {
+    return invert(_mm256_cmpgt_epi16(other.values, values));
+  }
+
+  Vectorized eq(const Vectorized& other) const;
+  Vectorized ne(const Vectorized& other) const;
+  Vectorized gt(const Vectorized& other) const;
+  Vectorized ge(const Vectorized& other) const;
+  Vectorized lt(const Vectorized& other) const;
+  Vectorized le(const Vectorized& other) const;
+};
+
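+// Vectorized8 is the shared base for the 8-bit element types: it provides
+// storage, blend/blendv, arange, set, and load/store, while sign-dependent
+// operations (abs, neg, comparisons) live in the Vectorized<int8_t> and
+// Vectorized<uint8_t> specializations below.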
+template <typename T>
+class Vectorized8 : public Vectorizedi {
+  static_assert(
+    std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
+    "Only int8_t/uint8_t are supported");
+protected:
+  static const Vectorized ones;
+public:
+  using value_type = T;
+  static constexpr int size() {
+    return 32;
+  }
+  using Vectorizedi::Vectorizedi;
+  Vectorized8() {}
+  Vectorized8(T v) { values = _mm256_set1_epi8(v); }
+  Vectorized8(T val1, T val2, T val3, T val4,
+         T val5, T val6, T val7, T val8,
+         T val9, T val10, T val11, T val12,
+         T val13, T val14, T val15, T val16,
+         T val17, T val18, T val19, T val20,
+         T val21, T val22, T val23, T val24,
+         T val25, T val26, T val27, T val28,
+         T val29, T val30, T val31, T val32) {
+    values = _mm256_setr_epi8(val1, val2, val3, val4, val5, val6, val7, val8,
+                              val9, val10, val11, val12, val13, val14, val15, val16,
+                              val17, val18, val19, val20, val21, val22, val23, val24,
+                              val25, val26, val27, val28, val29, val30, val31, val32);
+  }
+  template <int64_t mask>
+  static Vectorized<T> blend(Vectorized<T> a, Vectorized<T> b) {
+    __at_align__ T tmp_values[size()];
+    a.store(tmp_values);
+    if (mask & 0x01)
+      tmp_values[0] = _mm256_extract_epi8(b.values, 0);
+    if (mask & 0x02)
+      tmp_values[1] = _mm256_extract_epi8(b.values, 1);
+    if (mask & 0x04)
+      tmp_values[2] = _mm256_extract_epi8(b.values, 2);
+    if (mask & 0x08)
+      tmp_values[3] = _mm256_extract_epi8(b.values, 3);
+    if (mask & 0x10)
+      tmp_values[4] = _mm256_extract_epi8(b.values, 4);
+    if (mask & 0x20)
+      tmp_values[5] = _mm256_extract_epi8(b.values, 5);
+    if (mask & 0x40)
+      tmp_values[6] = _mm256_extract_epi8(b.values, 6);
+    if (mask & 0x80)
+      tmp_values[7] = _mm256_extract_epi8(b.values, 7);
+    if (mask & 0x100)
+      tmp_values[8] = _mm256_extract_epi8(b.values, 8);
+    if (mask & 0x200)
+      tmp_values[9] = _mm256_extract_epi8(b.values, 9);
+    if (mask & 0x400)
+      tmp_values[10] = _mm256_extract_epi8(b.values, 10);
+    if (mask & 0x800)
+      tmp_values[11] = _mm256_extract_epi8(b.values, 11);
+    if (mask & 0x1000)
+      tmp_values[12] = _mm256_extract_epi8(b.values, 12);
+    if (mask & 0x2000)
+      tmp_values[13] = _mm256_extract_epi8(b.values, 13);
+    if (mask & 0x4000)
+      tmp_values[14] = _mm256_extract_epi8(b.values, 14);
+    if (mask & 0x8000)
+      tmp_values[15] = _mm256_extract_epi8(b.values, 15);
+    if (mask & 0x010000)
+      tmp_values[16] = _mm256_extract_epi8(b.values, 16);
+    if (mask & 0x020000)
+      tmp_values[17] = _mm256_extract_epi8(b.values, 17);
+    if (mask & 0x040000)
+      tmp_values[18] = _mm256_extract_epi8(b.values, 18);
+    if (mask & 0x080000)
+      tmp_values[19] = _mm256_extract_epi8(b.values, 19);
+    if (mask & 0x100000)
+      tmp_values[20] = _mm256_extract_epi8(b.values, 20);
+    if (mask & 0x200000)
+      tmp_values[21] = _mm256_extract_epi8(b.values, 21);
+    if (mask & 0x400000)
+      tmp_values[22] = _mm256_extract_epi8(b.values, 22);
+    if (mask & 0x800000)
+      tmp_values[23] = _mm256_extract_epi8(b.values, 23);
+    if (mask & 0x1000000)
+      tmp_values[24] = _mm256_extract_epi8(b.values, 24);
+    if (mask & 0x2000000)
+      tmp_values[25] = _mm256_extract_epi8(b.values, 25);
+    if (mask & 0x4000000)
+      tmp_values[26] = _mm256_extract_epi8(b.values, 26);
+    if (mask & 0x8000000)
+      tmp_values[27] = _mm256_extract_epi8(b.values, 27);
+    if (mask & 0x10000000)
+      tmp_values[28] = _mm256_extract_epi8(b.values, 28);
+    if (mask & 0x20000000)
+      tmp_values[29] = _mm256_extract_epi8(b.values, 29);
+    if (mask & 0x40000000)
+      tmp_values[30] = _mm256_extract_epi8(b.values, 30);
+    if (mask & 0x80000000)
+      tmp_values[31] = _mm256_extract_epi8(b.values, 31);
+    return loadu(tmp_values);
+  }
+  static Vectorized blendv(const Vectorized& a, const Vectorized& b,
+                               const Vectorized& mask) {
+    return _mm256_blendv_epi8(a.values, b.values, mask.values);
+  }
+  template <typename step_t>
+  static Vectorized<T> arange(T base = 0, step_t step = static_cast<step_t>(1)) {
+    return Vectorized<T>(
+      base,             base +      step, base +  2 * step, base +  3 * step,
+      base +  4 * step, base +  5 * step, base +  6 * step, base +  7 * step,
+      base +  8 * step, base +  9 * step, base + 10 * step, base + 11 * step,
+      base + 12 * step, base + 13 * step, base + 14 * step, base + 15 * step,
+      base + 16 * step, base + 17 * step, base + 18 * step, base + 19 * step,
+      base + 20 * step, base + 21 * step, base + 22 * step, base + 23 * step,
+      base + 24 * step, base + 25 * step, base + 26 * step, base + 27 * step,
+      base + 28 * step, base + 29 * step, base + 30 * step, base + 31 * step);
+  }
+  static Vectorized
+  set(Vectorized a, Vectorized b, T count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<0x1>(a, b);
+      case 2:
+        return blend<0x3>(a, b);
+      case 3:
+        return blend<0x7>(a, b);
+      case 4:
+        return blend<0xF>(a, b);
+      case 5:
+        return blend<0x1F>(a, b);
+      case 6:
+        return blend<0x3F>(a, b);
+      case 7:
+        return blend<0x7F>(a, b);
+      case 8:
+        return blend<0xFF>(a, b);
+      case 9:
+        return blend<0x1FF>(a, b);
+      case 10:
+        return blend<0x3FF>(a, b);
+      case 11:
+        return blend<0x7FF>(a, b);
+      case 12:
+        return blend<0xFFF>(a, b);
+      case 13:
+        return blend<0x1FFF>(a, b);
+      case 14:
+        return blend<0x3FFF>(a, b);
+      case 15:
+        return blend<0x7FFF>(a, b);
+      case 16:
+        return blend<0xFFFF>(a, b);
+      case 17:
+        return blend<0x1FFFF>(a, b);
+      case 18:
+        return blend<0x3FFFF>(a, b);
+      case 19:
+        return blend<0x7FFFF>(a, b);
+      case 20:
+        return blend<0xFFFFF>(a, b);
+      case 21:
+        return blend<0x1FFFFF>(a, b);
+      case 22:
+        return blend<0x3FFFFF>(a, b);
+      case 23:
+        return blend<0x7FFFFF>(a, b);
+      case 24:
+        return blend<0xFFFFFF>(a, b);
+      case 25:
+        return blend<0x1FFFFFF>(a, b);
+      case 26:
+        return blend<0x3FFFFFF>(a, b);
+      case 27:
+        return blend<0x7FFFFFF>(a, b);
+      case 28:
+        return blend<0xFFFFFFF>(a, b);
+      case 29:
+        return blend<0x1FFFFFFF>(a, b);
+      case 30:
+        return blend<0x3FFFFFFF>(a, b);
+      case 31:
+        return blend<0x7FFFFFFF>(a, b);
+    }
+    return b;
+  }
+  static Vectorized<T> loadu(const void* ptr) {
+    return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+  }
+  static Vectorized loadu_one_fourth(const void* ptr) {
+      // Fast path if only load element number of 8.
+      // Note: We didn't merge it as fast path of loadu(const void* ptr, T count),
+      // Because loadu(const void* ptr, T count) requires zero initialization for upper 128 bits.
+      // However, by using _mm256_castsi128_si256, the upper 128 bits of the result are undefined.
+      // TODO We can use _mm256_zextsi128_si256 in the furture,
+      // since gcc 9.3 doesn't support it now.
+      __m128i input_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ptr));
+      return _mm256_castsi128_si256(input_128);
+  }
+  static Vectorized loadu(const void* ptr, T count) {
+    __at_align__ T tmp_values[size()];
+    // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
+    // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
+    // instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(size())) {
+      tmp_values[i] = 0;
+    }
+    std::memcpy(tmp_values, ptr, count * sizeof(T));
+    return loadu(tmp_values);
+  }
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      // ptr need not to be aligned here. See
+      // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html
+      _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
+    } else if (count > 0) {
+      if (count == 8) {
+        // Fast path if only store element number of 8
+        _mm_storel_epi64(reinterpret_cast<__m128i*>(ptr), _mm256_castsi256_si128(values));
+      } else {
+        __at_align__ T tmp_values[size()];
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(tmp_values), values);
+        std::memcpy(ptr, tmp_values, count * sizeof(T));
+      }
+    }
+  }
+  const T& operator[](int idx) const  = delete;
+  T& operator[](int idx)  = delete;
+  Vectorized real() const {
+    return *this;
+  }
+  Vectorized imag() const {
+    return _mm256_set1_epi8(0);
+  }
+  Vectorized conj() const {
+    return *this;
+  }
+};
+
+template<>
+class Vectorized<int8_t>: public Vectorized8<int8_t> {
+public:
+  using Vectorized8::Vectorized8;
+
+  Vectorized neg() const;
+
+  Vectorized abs() const {
+   return _mm256_abs_epi8(values);
+  }
+
+  Vectorized operator==(const Vectorized& other) const {
+    return _mm256_cmpeq_epi8(values, other.values);
+  }
+  Vectorized operator!=(const Vectorized& other) const {
+    return invert(_mm256_cmpeq_epi8(values, other.values));
+  }
+  Vectorized operator<(const Vectorized& other) const {
+    return _mm256_cmpgt_epi8(other.values, values);
+  }
+  Vectorized operator<=(const Vectorized& other) const {
+    return invert(_mm256_cmpgt_epi8(values, other.values));
+  }
+  Vectorized operator>(const Vectorized& other) const {
+    return other < *this;
+  }
+  Vectorized operator>=(const Vectorized& other) const {
+    return other <= *this;
+  }
+
+  Vectorized eq(const Vectorized& other) const;
+  Vectorized ne(const Vectorized& other) const;
+  Vectorized gt(const Vectorized& other) const;
+  Vectorized ge(const Vectorized& other) const;
+  Vectorized lt(const Vectorized& other) const;
+  Vectorized le(const Vectorized& other) const;
+};
+
+template<>
+class Vectorized<uint8_t>: public Vectorized8<uint8_t> {
+public:
+  using Vectorized8::Vectorized8;
+
+  Vectorized neg() const;
+
+  Vectorized abs() const {
+    return *this;
+  }
+
+  Vectorized operator==(const Vectorized& other) const {
+    return _mm256_cmpeq_epi8(values, other.values);
+  }
+  Vectorized operator!=(const Vectorized& other) const {
+    return invert(_mm256_cmpeq_epi8(values, other.values));
+  }
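+  // AVX2 has no unsigned 8-bit compare, so < and <= are derived from
+  // _mm256_max_epu8: max(a, b) == a exactly when a >= b, and
+  // max(a, b) == b exactly when a <= b.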
+  Vectorized operator<(const Vectorized& other) const {
+    __m256i max = _mm256_max_epu8(values, other.values);
+    return invert(_mm256_cmpeq_epi8(max, values));
+  }
+  Vectorized operator<=(const Vectorized& other) const {
+    __m256i max = _mm256_max_epu8(values, other.values);
+    return _mm256_cmpeq_epi8(max, other.values);
+  }
+  Vectorized operator>(const Vectorized& other) const {
+    return other < *this;
+  }
+  Vectorized operator>=(const Vectorized& other) const {
+    return other <= *this;
+  }
+
+  Vectorized eq(const Vectorized& other) const;
+  Vectorized ne(const Vectorized& other) const;
+  Vectorized gt(const Vectorized& other) const;
+  Vectorized ge(const Vectorized& other) const;
+  Vectorized lt(const Vectorized& other) const;
+  Vectorized le(const Vectorized& other) const;
+};
+
+template <>
+Vectorized inline operator+(const Vectorized& a, const Vectorized& b) {
+  return _mm256_add_epi64(a, b);
+}
+
+template <>
+Vectorized inline operator+(const Vectorized& a, const Vectorized& b) {
+  return _mm256_add_epi32(a, b);
+}
+
+template <>
+Vectorized inline operator+(const Vectorized& a, const Vectorized& b) {
+  return _mm256_add_epi16(a, b);
+}
+
+template <>
+Vectorized inline operator+(const Vectorized& a, const Vectorized& b) {
+  return _mm256_add_epi8(a, b);
+}
+
+template <>
+Vectorized inline operator+(const Vectorized& a, const Vectorized& b) {
+  return _mm256_add_epi8(a, b);
+}
+
+template <>
+Vectorized inline operator-(const Vectorized& a, const Vectorized& b) {
+  return _mm256_sub_epi64(a, b);
+}
+
+template <>
+Vectorized inline operator-(const Vectorized& a, const Vectorized& b) {
+  return _mm256_sub_epi32(a, b);
+}
+
+template <>
+Vectorized inline operator-(const Vectorized& a, const Vectorized& b) {
+  return _mm256_sub_epi16(a, b);
+}
+
+template <>
+Vectorized inline operator-(const Vectorized& a, const Vectorized& b) {
+  return _mm256_sub_epi8(a, b);
+}
+
+template <>
+Vectorized inline operator-(const Vectorized& a, const Vectorized& b) {
+  return _mm256_sub_epi8(a, b);
+}
+
+// Negation. Defined here so we can utilize operator-
+inline Vectorized Vectorized::neg() const {
+  return Vectorized(0) - *this;
+}
+
+inline Vectorized Vectorized::neg() const {
+  return Vectorized(0) - *this;
+}
+
+inline Vectorized Vectorized::neg() const {
+  return Vectorized(0) - *this;
+}
+
+inline Vectorized Vectorized::neg() const {
+  return Vectorized(0) - *this;
+}
+
+inline Vectorized Vectorized::neg() const {
+  return Vectorized(0) - *this;
+}
+
+// Emulate operations with no native 64-bit support in avx,
+// by extracting each element, performing the operation pointwise,
+// then combining the results into a vector.
+template <typename op_t>
+Vectorized<int64_t> inline emulate(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b, const op_t& op) {
+  int64_t a0 = _mm256_extract_epi64(a, 0);
+  int64_t a1 = _mm256_extract_epi64(a, 1);
+  int64_t a2 = _mm256_extract_epi64(a, 2);
+  int64_t a3 = _mm256_extract_epi64(a, 3);
+
+  int64_t b0 = _mm256_extract_epi64(b, 0);
+  int64_t b1 = _mm256_extract_epi64(b, 1);
+  int64_t b2 = _mm256_extract_epi64(b, 2);
+  int64_t b3 = _mm256_extract_epi64(b, 3);
+
+  int64_t c0 = op(a0, b0);
+  int64_t c1 = op(a1, b1);
+  int64_t c2 = op(a2, b2);
+  int64_t c3 = op(a3, b3);
+
+  return _mm256_set_epi64x(c3, c2, c1, c0);
+}
+
+template <typename op_t>
+Vectorized<int64_t> inline emulate(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b, const Vectorized<int64_t>& c, const op_t& op) {
+  int64_t a0 = _mm256_extract_epi64(a, 0);
+  int64_t a1 = _mm256_extract_epi64(a, 1);
+  int64_t a2 = _mm256_extract_epi64(a, 2);
+  int64_t a3 = _mm256_extract_epi64(a, 3);
+
+  int64_t b0 = _mm256_extract_epi64(b, 0);
+  int64_t b1 = _mm256_extract_epi64(b, 1);
+  int64_t b2 = _mm256_extract_epi64(b, 2);
+  int64_t b3 = _mm256_extract_epi64(b, 3);
+
+  int64_t c0 = _mm256_extract_epi64(c, 0);
+  int64_t c1 = _mm256_extract_epi64(c, 1);
+  int64_t c2 = _mm256_extract_epi64(c, 2);
+  int64_t c3 = _mm256_extract_epi64(c, 3);
+
+  int64_t d0 = op(a0, b0, c0);
+  int64_t d1 = op(a1, b1, c1);
+  int64_t d2 = op(a2, b2, c2);
+  int64_t d3 = op(a3, b3, c3);
+
+  return _mm256_set_epi64x(d3, d2, d1, d0);
+}
+
+// AVX2 has no intrinsic for int64_t multiply so it needs to be emulated
+// This could be implemented more efficiently using epi32 instructions
+// This is also technically avx compatible, but then we'll need AVX
+// code for add as well.
+// Note: intentionally ignores undefined behavior like (-lowest * -1).
+template <>
+Vectorized inline operator*(const Vectorized& a, const Vectorized& b) {
+  return emulate(a, b, [](int64_t a_point, int64_t b_point) __ubsan_ignore_undefined__ {return a_point * b_point;});
+}
+
+template <>
+Vectorized inline operator*(const Vectorized& a, const Vectorized& b) {
+  return _mm256_mullo_epi32(a, b);
+}
+
+template <>
+Vectorized inline operator*(const Vectorized& a, const Vectorized& b) {
+  return _mm256_mullo_epi16(a, b);
+}
+
+template <typename T, typename Op>
+Vectorized<T> inline int_elementwise_binary_256(const Vectorized<T>& a, const Vectorized<T>& b, Op op) {
+  T values_a[Vectorized<T>::size()];
+  T values_b[Vectorized<T>::size()];
+  a.store(values_a);
+  b.store(values_b);
+  for (int i = 0; i != Vectorized<T>::size(); i++) {
+    values_a[i] = op(values_a[i], values_b[i]);
+  }
+  return Vectorized<T>::loadu(values_a);
+}
+
+template <>
+Vectorized inline operator*(const Vectorized& a, const Vectorized& b) {
+  // We don't have an instruction for multiplying int8_t
+#ifndef CPU_CAPABILITY_AVX2
+  return int_elementwise_binary_256(a, b, std::multiplies<int8_t>());
+#else
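+  // Widen to 16-bit lanes: a_lo/b_lo sign-extend the low byte of each 16-bit
+  // lane (shift left then arithmetic shift right by 8), a_hi/b_hi sign-extend
+  // the high byte. Multiply as int16, then recombine the low bytes of both
+  // sets of products into one 256-bit result.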
+  __m256i mask00FF = _mm256_set1_epi16(0x00FF);
+  __m256i a_lo = _mm256_srai_epi16(_mm256_slli_epi16(a, 8), 8);
+  __m256i b_lo = _mm256_srai_epi16(_mm256_slli_epi16(b, 8), 8);
+  __m256i a_hi = _mm256_srai_epi16(a, 8);
+  __m256i b_hi = _mm256_srai_epi16(b, 8);
+  __m256i res_lo = _mm256_and_si256(_mm256_mullo_epi16(a_lo, b_lo), mask00FF);
+  __m256i res_hi = _mm256_slli_epi16(_mm256_mullo_epi16(a_hi, b_hi), 8);
+  __m256i res = _mm256_or_si256(res_hi, res_lo);
+  return res;
+#endif
+}
+
+template <>
+Vectorized inline operator*(const Vectorized& a, const Vectorized& b) {
+  // We don't have an instruction for multiplying uint8_t
+#ifndef CPU_CAPABILITY_AVX2
+  return int_elementwise_binary_256(a, b, std::multiplies<uint8_t>());
+#else
+  __m256i mask00FF = _mm256_set1_epi16(0x00FF);
+  __m256i a_lo = _mm256_and_si256 (a, mask00FF);
+  __m256i b_lo = _mm256_and_si256 (b, mask00FF);
+  __m256i a_hi = _mm256_srli_epi16(a, 8);
+  __m256i b_hi = _mm256_srli_epi16(b, 8);
+  __m256i res_lo = _mm256_and_si256(_mm256_mullo_epi16(a_lo, b_lo), mask00FF);
+  __m256i res_hi = _mm256_slli_epi16(_mm256_mullo_epi16(a_hi, b_hi), 8);
+  __m256i res = _mm256_or_si256(res_hi, res_lo);
+  return res;
+#endif
+}
+
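+// AVX2 has no epi64 min/max instructions (those require AVX-512), so the
+// int64_t variants below use a 64-bit compare followed by a byte blend, or
+// scalar emulation when AVX2 is not available.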
+template <>
+Vectorized inline minimum(const Vectorized& a, const Vectorized& b) {
+#ifndef CPU_CAPABILITY_AVX2
+  return emulate(a, b, [](int64_t a_point, int64_t b_point) {return std::min(a_point, b_point);});
+#else
+  __m256i cmp = _mm256_cmpgt_epi64(a, b);
+  return _mm256_blendv_epi8(a, b, cmp);
+#endif
+}
+
+template <>
+Vectorized inline minimum(const Vectorized& a, const Vectorized& b) {
+  return _mm256_min_epi32(a, b);
+}
+
+template <>
+Vectorized inline minimum(const Vectorized& a, const Vectorized& b) {
+  return _mm256_min_epi16(a, b);
+}
+
+template <>
+Vectorized inline minimum(const Vectorized& a, const Vectorized& b) {
+  return _mm256_min_epi8(a, b);
+}
+
+template <>
+Vectorized inline minimum(const Vectorized& a, const Vectorized& b) {
+  return _mm256_min_epu8(a, b);
+}
+
+template <>
+Vectorized inline maximum(const Vectorized& a, const Vectorized& b) {
+#ifndef CPU_CAPABILITY_AVX2
+  return emulate(a, b, [](int64_t a_point, int64_t b_point) {return std::max(a_point, b_point);});
+#else
+  __m256i cmp = _mm256_cmpgt_epi64(a, b);
+  return _mm256_blendv_epi8(b, a, cmp);
+#endif
+}
+
+template <>
+Vectorized inline maximum(const Vectorized& a, const Vectorized& b) {
+  return _mm256_max_epi32(a, b);
+}
+
+template <>
+Vectorized inline maximum(const Vectorized& a, const Vectorized& b) {
+  return _mm256_max_epi16(a, b);
+}
+
+template <>
+Vectorized inline maximum(const Vectorized& a, const Vectorized& b) {
+  return _mm256_max_epi8(a, b);
+}
+
+template <>
+Vectorized inline maximum(const Vectorized& a, const Vectorized& b) {
+  return _mm256_max_epu8(a, b);
+}
+
+template <>
+Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) {
+#ifndef CPU_CAPABILITY_AVX2
+  return emulate(a, min_val, max_val, [](int64_t a_point, int64_t min_point, int64_t max_point) {return std::min(max_point, std::max(a_point, min_point));});
+#else
+  return minimum(maximum(a, min_val), max_val);
+#endif
+}
+
+template <>
+Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) {
+  return _mm256_min_epi32(max_val, _mm256_max_epi32(a, min_val));
+}
+
+template <>
+Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) {
+  return _mm256_min_epi16(max_val, _mm256_max_epi16(a, min_val));
+}
+
+template <>
+Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) {
+  return _mm256_min_epi8(max_val, _mm256_max_epi8(a, min_val));
+}
+
+template <>
+Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) {
+  return _mm256_min_epu8(max_val, _mm256_max_epu8(a, min_val));
+}
+
+template <>
+Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) {
+#ifndef CPU_CAPABILITY_AVX2
+  return emulate(a, max_val, [](int64_t a_point, int64_t max_point) {return std::min(max_point, a_point);});
+#else
+  return minimum(max_val, a);
+#endif
+}
+
+template <>
+Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) {
+  return _mm256_min_epi32(max_val, a);
+}
+
+template <>
+Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) {
+  return _mm256_min_epi16(max_val, a);
+}
+
+template <>
+Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) {
+  return _mm256_min_epi8(max_val, a);
+}
+
+template <>
+Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) {
+  return _mm256_min_epu8(max_val, a);
+}
+
+template <>
+Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) {
+#ifndef CPU_CAPABILITY_AVX2
+  return emulate(a, min_val, [](int64_t a_point, int64_t min_point) {return std::max(min_point, a_point);});
+#else
+  return maximum(min_val, a);
+#endif
+}
+
+template <>
+Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) {
+  return _mm256_max_epi32(min_val, a);
+}
+
+template <>
+Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) {
+  return _mm256_max_epi16(min_val, a);
+}
+
+template <>
+Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) {
+  return _mm256_max_epi8(min_val, a);
+}
+
+template <>
+Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) {
+  return _mm256_max_epu8(min_val, a);
+}
+
+template
+Vectorized inline convert_to_int32(const T* ptr) {
+  return Vectorized::loadu(ptr);
+}
+
+template<>
+Vectorized inline convert_to_int32(const int8_t* ptr) {
+  return _mm256_cvtepi8_epi32(_mm_loadl_epi64(reinterpret_cast(ptr)));
+}
+
+template<>
+Vectorized inline convert_to_int32(const uint8_t* ptr) {
+  return _mm256_cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast(ptr)));
+}
+
+template <>
+Vectorized inline operator/(const Vectorized& a, const Vectorized& b) {
+  return int_elementwise_binary_256(a, b, std::divides());
+}
+template <>
+Vectorized inline operator/(const Vectorized& a, const Vectorized& b) {
+  return int_elementwise_binary_256(a, b, std::divides());
+}
+template <>
+Vectorized inline operator/(const Vectorized& a, const Vectorized& b) {
+  return int_elementwise_binary_256(a, b, std::divides());
+}
+template <>
+Vectorized inline operator/(const Vectorized& a, const Vectorized& b) {
+  return int_elementwise_binary_256(a, b, std::divides());
+}
+template <>
+Vectorized inline operator/(const Vectorized& a, const Vectorized& b) {
+  return int_elementwise_binary_256(a, b, std::divides());
+}
+
+template>::value, int> = 0>
+inline Vectorized operator&(const Vectorized& a, const Vectorized& b) {
+  return _mm256_and_si256(a, b);
+}
+template>::value, int> = 0>
+inline Vectorized operator|(const Vectorized& a, const Vectorized& b) {
+  return _mm256_or_si256(a, b);
+}
+template>::value, int> = 0>
+inline Vectorized operator^(const Vectorized& a, const Vectorized& b) {
+  return _mm256_xor_si256(a, b);
+}
+template>::value, int> = 0>
+inline Vectorized operator~(const Vectorized& a) {
+  return _mm256_xor_si256(a, _mm256_set1_epi32(-1));
+}
+
+inline Vectorized Vectorized::eq(const Vectorized& other) const {
+  return (*this == other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::ne(const Vectorized& other) const {
+  return (*this != other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::gt(const Vectorized& other) const {
+  return (*this > other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::ge(const Vectorized& other) const {
+  return (*this >= other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::lt(const Vectorized& other) const {
+  return (*this < other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::le(const Vectorized& other) const {
+  return (*this <= other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::eq(const Vectorized& other) const {
+  return (*this == other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::ne(const Vectorized& other) const {
+  return (*this != other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::gt(const Vectorized& other) const {
+  return (*this > other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::ge(const Vectorized& other) const {
+  return (*this >= other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::lt(const Vectorized& other) const {
+  return (*this < other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::le(const Vectorized& other) const {
+  return (*this <= other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::eq(const Vectorized& other) const {
+  return (*this == other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::ne(const Vectorized& other) const {
+  return (*this != other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::gt(const Vectorized& other) const {
+  return (*this > other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::ge(const Vectorized& other) const {
+  return (*this >= other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::lt(const Vectorized& other) const {
+  return (*this < other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::le(const Vectorized& other) const {
+  return (*this <= other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::eq(const Vectorized& other) const {
+  return (*this == other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::ne(const Vectorized& other) const {
+  return (*this != other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::gt(const Vectorized& other) const {
+  return (*this > other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::ge(const Vectorized& other) const {
+  return (*this >= other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::lt(const Vectorized& other) const {
+  return (*this < other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::le(const Vectorized& other) const {
+  return (*this <= other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::eq(const Vectorized& other) const {
+  return (*this == other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::ne(const Vectorized& other) const {
+  return (*this != other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::gt(const Vectorized& other) const {
+  return (*this > other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::ge(const Vectorized& other) const {
+  return (*this >= other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::lt(const Vectorized& other) const {
+  return (*this < other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::le(const Vectorized& other) const {
+  return (*this <= other) & Vectorized(1);
+}
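+
+// Illustrative note (explanatory comment, not upstream documentation): the
+// operators ==, !=, >, >=, <, <= return lane masks of all ones (-1) or all
+// zeros, while the eq/ne/gt/ge/lt/le methods above AND that mask with a
+// vector of ones so each lane holds 0 or 1; e.g. comparing lanes {3, 5} with
+// {4, 1} gives {0, -1} from operator> but {0, 1} from gt().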
+
+template 
+Vectorized inline shift_256_16(const Vectorized& a, const Vectorized& b) {
+  // No vector instruction for shifting int16_t, so emulating it instead.
+
+  // Control masks for shuffle operation, treating 256 bits as an
+  // array of 16-bit elements, and considering pairs of neighboring
+  // elements.  Specifically, a mask named "ctl_M_N" (M,N in [0,1], and
+  // M!=N) is set so that shuffle will move element with index M from
+  // input pair into element with index N in output pair, and element
+  // with index M in output pair will be set to all 0s.
+  __m256i ctl_0_1 = _mm256_set_epi8(29, 28, 0x80, 0x80, 25, 24, 0x80, 0x80,
+                                    21, 20, 0x80, 0x80, 17, 16, 0x80, 0x80,
+                                    13, 12, 0x80, 0x80, 9, 8, 0x80, 0x80,
+                                    5, 4, 0x80, 0x80, 1, 0, 0x80, 0x80);
+  __m256i ctl_1_0 = _mm256_set_epi8(0x80, 0x80, 31, 30, 0x80, 0x80, 27, 26,
+                                    0x80, 0x80, 23, 22, 0x80, 0x80, 19, 18,
+                                    0x80, 0x80, 15, 14, 0x80, 0x80, 11, 10,
+                                    0x80, 0x80, 7, 6, 0x80, 0x80, 3, 2);
+
+  // Masks for bitwise and operation, treating 256 bits as an array of
+  // 16-bit elements, and considering them in pairs of neighboring
+  // elements.  A mask named "keep_M" (M in [0,1]) is set so that
+  // bitwise and will copy element with index M from input pair into
+  // element with the same index in output pair, while the other
+  // element in output pair will be set to all 0s.
+  __m256i keep_0 = _mm256_set1_epi32(0xFFFF);
+  __m256i keep_1 = _mm256_set1_epi32(0xFFFF0000);
+
+  // Take each 16-bit element with idx%2==0 from input array to be
+  // shifted and extend it to 32 bits so that 0s are added to the
+  // right.  Then, perform shifting on this 32-bit number.  Upper 16
+  // bits will be proper result of shifting original 16-bit number, so
+  // write them to result array, into the same position from which
+  // corresponding input element is taken.  Also, make sure that
+  // result array elements with idx%2!=0 are set to all 0s.
+  //
+  // Note that number of bits to shift for is extended to 32 bits by
+  // adding 0s to the left.  That means this number is not properly
+  // sign-extended for negative values.  However, number of bits to
+  // shift is treated as an unsigned integer by respective shift
+  // intrinsics anyway so if negative then either with or without
+  // proper sign extension, it will be interpreted as a number greater
+  // than 32, and the shifting result will be the same.
+  __m256i a0 = _mm256_shuffle_epi8(a, ctl_0_1);
+  __m256i b0 = _mm256_and_si256(b, keep_0);
+  __m256i c0;
+  if (left_shift)
+    c0 = _mm256_sllv_epi32(a0, b0);
+  else
+    c0 = _mm256_srav_epi32(a0, b0);
+  c0 = _mm256_shuffle_epi8(c0, ctl_1_0);
+
+  // Perform shifting the same way for input array elements with
+  // idx%2==1.
+  __m256i a1 = _mm256_and_si256(a, keep_1);
+  __m256i b1 = _mm256_shuffle_epi8(b, ctl_1_0);
+  __m256i c1;
+  if (left_shift)
+    c1 = _mm256_sllv_epi32(a1, b1);
+  else
+    c1 = _mm256_srav_epi32(a1, b1);
+  c1 = _mm256_and_si256(c1, keep_1);
+
+  // Merge partial results into the final result.
+  __m256i c = _mm256_or_si256(c0, c1);
+
+  return c;
+}
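+
+// Worked example for the emulation above (explanatory comment, not upstream
+// documentation): to compute int16_t(-32768) >> 3, the element is moved into
+// the upper half of a 32-bit lane (0x80000000), shifted with
+// _mm256_srav_epi32 to 0xF0000000, and its upper 16 bits (0xF000, i.e. -4096)
+// are shuffled back into the element's original position, matching the scalar
+// result -32768 >> 3 = -4096.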
+
+template ::value || std::is_same::value, int> = 0>
+Vectorized inline shift_256_8(const Vectorized& a, const Vectorized& b) {
+  // No vector instruction for shifting int8_t/uint8_t, so emulating
+  // it instead.
+
+  // Control masks for shuffle operation, treating 256 bits as an
+  // array of 8-bit elements, and considering quadruples of
+  // neighboring elements.  Specifically, a mask named "ctl_M_N" (M,N
+  // in [0,1,2,3], and M!=N) is set so that shuffle will move element
+  // with index M from input quadruple into element with index N in
+  // output quadruple, and other elements in output quadruple will be
+  // set to all 0s.
+  __m256i ctl_0_3 = _mm256_set_epi8(28, 0x80, 0x80, 0x80, 24, 0x80, 0x80, 0x80,
+                                    20, 0x80, 0x80, 0x80, 16, 0x80, 0x80, 0x80,
+                                    12, 0x80, 0x80, 0x80, 8, 0x80, 0x80, 0x80,
+                                    4, 0x80, 0x80, 0x80, 0, 0x80, 0x80, 0x80);
+  __m256i ctl_1_0 = _mm256_set_epi8(0x80, 0x80, 0x80, 29, 0x80, 0x80, 0x80, 25,
+                                    0x80, 0x80, 0x80, 21, 0x80, 0x80, 0x80, 17,
+                                    0x80, 0x80, 0x80, 13, 0x80, 0x80, 0x80, 9,
+                                    0x80, 0x80, 0x80, 5, 0x80, 0x80, 0x80, 1);
+  __m256i ctl_1_3 = _mm256_set_epi8(29, 0x80, 0x80, 0x80, 25, 0x80, 0x80, 0x80,
+                                    21, 0x80, 0x80, 0x80, 17, 0x80, 0x80, 0x80,
+                                    13, 0x80, 0x80, 0x80, 9, 0x80, 0x80, 0x80,
+                                    5, 0x80, 0x80, 0x80, 1, 0x80, 0x80, 0x80);
+  __m256i ctl_2_0 = _mm256_set_epi8(0x80, 0x80, 0x80, 30, 0x80, 0x80, 0x80, 26,
+                                    0x80, 0x80, 0x80, 22, 0x80, 0x80, 0x80, 18,
+                                    0x80, 0x80, 0x80, 14, 0x80, 0x80, 0x80, 10,
+                                    0x80, 0x80, 0x80, 6, 0x80, 0x80, 0x80, 2);
+  __m256i ctl_2_3 = _mm256_set_epi8(30, 0x80, 0x80, 0x80, 26, 0x80, 0x80, 0x80,
+                                    22, 0x80, 0x80, 0x80, 18, 0x80, 0x80, 0x80,
+                                    14, 0x80, 0x80, 0x80, 10, 0x80, 0x80, 0x80,
+                                    6, 0x80, 0x80, 0x80, 2, 0x80, 0x80, 0x80);
+  __m256i ctl_3_0 = _mm256_set_epi8(0x80, 0x80, 0x80, 31, 0x80, 0x80, 0x80, 27,
+                                    0x80, 0x80, 0x80, 23, 0x80, 0x80, 0x80, 19,
+                                    0x80, 0x80, 0x80, 15, 0x80, 0x80, 0x80, 11,
+                                    0x80, 0x80, 0x80, 7, 0x80, 0x80, 0x80, 3);
+  __m256i ctl_3_1 = _mm256_set_epi8(0x80, 0x80, 31, 0x80, 0x80, 0x80, 27, 0x80,
+                                    0x80, 0x80, 23, 0x80, 0x80, 0x80, 19, 0x80,
+                                    0x80, 0x80, 15, 0x80, 0x80, 0x80, 11, 0x80,
+                                    0x80, 0x80, 7, 0x80, 0x80, 0x80, 3, 0x80);
+  __m256i ctl_3_2 = _mm256_set_epi8(0x80, 31, 0x80, 0x80, 0x80, 27, 0x80, 0x80,
+                                    0x80, 23, 0x80, 0x80, 0x80, 19, 0x80, 0x80,
+                                    0x80, 15, 0x80, 0x80, 0x80, 11, 0x80, 0x80,
+                                    0x80, 7, 0x80, 0x80, 0x80, 3, 0x80, 0x80);
+
+  // Masks for bitwise and operation, treating 256 bits as an array of
+  // 8-bit elements, and considering them in quadruples of neighboring
+  // elements.  A mask named "keep_M" (M in [0,1,2,3]) is set so that
+  // bitwise and will copy element with index M from input quadruple
+  // into element with the same index in output quadruple, while the
+  // other elements in output quadruple will be set to all 0s.
+  __m256i keep_0 = _mm256_set1_epi32(0xFF);
+  __m256i keep_3 = _mm256_set1_epi32(0xFF000000);
+
+  // Take each 8-bit element with idx%4==0 from input array to be
+  // shifted and extend it to 32 bits so that 0s are added to the
+  // right.  Then, perform shifting on this 32-bit number.  Upper 8
+  // bits will be proper result of shifting original 8-bit number, so
+  // write them to result array, into the same position from which
+  // corresponding input element is taken.  Also, make sure that
+  // result array elements with idx%4!=0 are set to all 0s.
+  //
+  // Note that number of bits to shift for is extended to 32 bits by
+  // adding 0s to the left.  That means this number is not properly
+  // sign-extended for negative values.  However, number of bits to
+  // shift is treated as an unsigned integer by respective shift
+  // intrinsics anyway so if negative then either with or without
+  // proper sign extension, it will be interpreted as a number greater
+  // than 32, and the shifting result will be the same.
+  __m256i a0 = _mm256_shuffle_epi8(a, ctl_0_3);
+  __m256i b0 = _mm256_and_si256(b, keep_0);
+  __m256i c0;
+  if (left_shift)
+    c0 = _mm256_sllv_epi32(a0, b0);
+  else
+    if constexpr (std::is_same_v)
+      c0 = _mm256_srav_epi32(a0, b0);
+    else
+      c0 = _mm256_srlv_epi32(a0, b0);
+  c0 = _mm256_shuffle_epi8(c0, ctl_3_0);
+
+  // Perform shifting the same way for input array elements with
+  // idx%4==1.
+  __m256i a1 = _mm256_shuffle_epi8(a, ctl_1_3);
+  __m256i b1 = _mm256_shuffle_epi8(b, ctl_1_0);
+  __m256i c1;
+  if (left_shift)
+    c1 = _mm256_sllv_epi32(a1, b1);
+  else
+    if constexpr (std::is_same_v)
+      c1 = _mm256_srav_epi32(a1, b1);
+    else
+      c1 = _mm256_srlv_epi32(a1, b1);
+  c1 = _mm256_shuffle_epi8(c1, ctl_3_1);
+
+  // Perform shifting the same way for input array elements with
+  // idx%4==2.
+  __m256i a2 = _mm256_shuffle_epi8(a, ctl_2_3);
+  __m256i b2 = _mm256_shuffle_epi8(b, ctl_2_0);
+  __m256i c2;
+  if (left_shift)
+    c2 = _mm256_sllv_epi32(a2, b2);
+  else
+    if constexpr (std::is_same_v)
+      c2 = _mm256_srav_epi32(a2, b2);
+    else
+      c2 = _mm256_srlv_epi32(a2, b2);
+  c2 = _mm256_shuffle_epi8(c2, ctl_3_2);
+
+  // Perform shifting the same way for input array elements with
+  // idx%4==3.
+  __m256i a3 =  _mm256_and_si256(a, keep_3);
+  __m256i b3 = _mm256_shuffle_epi8(b, ctl_3_0);
+  __m256i c3;
+  if (left_shift)
+    c3 = _mm256_sllv_epi32(a3, b3);
+  else
+    if constexpr (std::is_same_v)
+      c3 = _mm256_srav_epi32(a3, b3);
+    else
+      c3 = _mm256_srlv_epi32(a3, b3);
+  c3 = _mm256_and_si256(c3, keep_3);
+
+  // Merge partial results into the final result.
+  __m256i c01 = _mm256_or_si256(c0, c1);
+  __m256i c23 = _mm256_or_si256(c2, c3);
+  __m256i c = _mm256_or_si256(c01, c23);
+
+  return c;
+}
+
+template <>
+Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) {
+  return _mm256_sllv_epi64(a, b);
+}
+
+template <>
+Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) {
+  return _mm256_sllv_epi32(a, b);
+}
+
+template <>
+Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) {
+  return shift_256_16(a, b);
+}
+
+template <>
+Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) {
+  return shift_256_8(a, b);
+}
+
+template <>
+Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) {
+  return shift_256_8(a, b);
+}
+
+template <>
+Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) {
+  // No vector instruction for right arithmetic shifting int64_t, so emulating it
+  // instead.
+
+  // Clamp the shift values such that shift values < 0 and > 64 are changed to 64
+  // which results in -1 for negative input and 0 for non-negative input.
+  __m256i zero = _mm256_set1_epi64x(0);
+  __m256i max_shift = _mm256_set1_epi64x(64);
+  __m256i mask = _mm256_or_si256(_mm256_cmpgt_epi64(zero, b), _mm256_cmpgt_epi64(b, max_shift));
+  __m256i shift = _mm256_blendv_epi8(b, max_shift, mask);
+  // Shift the number logically to the right, thus filling the most
+  // significant bits with 0s.  Then, replace these bits with the sign
+  // bit.
+  __m256i sign_bits = _mm256_cmpgt_epi64(zero, a);
+  __m256i sign_shift = _mm256_sub_epi64(max_shift, shift);
+  __m256i sign_ext = _mm256_sllv_epi64(sign_bits, sign_shift);
+  __m256i c = _mm256_srlv_epi64(a, shift);
+  c = _mm256_or_si256(c, sign_ext);
+
+  return c;
+}
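+
+// Worked example for the emulation above (explanatory comment, not upstream
+// documentation): for a = -8 (0xFFFF...FFF8) and b = 1, the logical shift
+// gives 0x7FFF...FFFC, sign_bits is all ones, and shifting those ones left by
+// 64 - 1 = 63 bits gives 0x8000...0000; OR-ing the two parts yields
+// 0xFFFF...FFFC, i.e. -4, which matches the arithmetic shift -8 >> 1.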
+
+template <>
+Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) {
+  return _mm256_srav_epi32(a, b);
+}
+
+template <>
+Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) {
+  return shift_256_16(a, b);
+}
+
+template <>
+Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) {
+  return shift_256_8(a, b);
+}
+
+template <>
+Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) {
+  return shift_256_8(a, b);
+}
+
+#endif
+
+}} // namespace at::vec::CPU_CAPABILITY
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_qint.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_qint.h
new file mode 100644
index 0000000000000000000000000000000000000000..28e0b4e50a4270d784acf41c6b501620281a9db5
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vec256_qint.h
@@ -0,0 +1,1335 @@
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <ATen/native/quantized/AffineQuantizerBase.h>
+
+#include <c10/util/irange.h>
+#include <c10/util/qint32.h>
+#include <c10/util/qint8.h>
+#include <c10/util/quint8.h>
+
+#include <array>
+#include <cmath>
+
+// This file defines Vectorized<> for the quantized types.
+//
+//
+// Currently, we simply use these classes as efficient converters between
+// the quantized types and Vectorized<float>, usually in bandwidth-bound cases
+// where doing the arithmetic in full-precision is acceptable (e.g.
+// elementwise operators).
+//
+//
+// Conversions are as follows:
+//  Vectorized<qint8> -> 4x Vectorized<float>
+//  Vectorized<quint8> -> 4x Vectorized<float>
+//  Vectorized<qint32> -> 1x Vectorized<float>
+//
+// The size of the returned float vector is specified by the special
+// constexpr function float_num_vecs. The type of the value returned
+// from dequantize (and expected as an argument to quantize) is
+// specified by float_vec_return_type.
+//
+// When writing kernels with these vectors, it is expected that floating-
+// point operations will be carried out in a loop over Vectorized<T>::float_num_vecs
+// iterations.
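+//
+// A minimal usage sketch of that pattern (illustrative comment only, not
+// upstream code; qvec, scale_v, zp_v, scale, zp and inv_scale are placeholder
+// names):
+//
+//   auto float_vecs = qvec.dequantize(scale_v, zp_v);   // float_num_vecs() vectors
+//   for (auto& fv : float_vecs) fv = fv * fv;           // arbitrary float math
+//   auto out = Vectorized<c10::qint8>::quantize(float_vecs, scale, zp, inv_scale);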
+
+namespace at::vec {
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
+
+struct Vectorizedqi {
+ protected:
+  __m256i vals __attribute__((aligned(64)));
+
+ public:
+  Vectorizedqi() {}
+  Vectorizedqi(__m256i v) : vals(v) {}
+  operator __m256i() const {
+    return vals;
+  }
+};
+
+template 
+__m256i pack_saturate_and_clamp(
+    __m256i first,
+    __m256i second,
+    T min_val,
+    T max_val);
+
+template <>
+inline __m256i pack_saturate_and_clamp(
+    __m256i /*first*/,
+    __m256i /*second*/,
+    int32_t /*min_val*/,
+    int32_t /*max_val*/) {
+  // This function is for linkage only, will not be used
+  AT_ERROR("pack_saturate_and_clamp is not supported");
+}
+
+template <>
+inline __m256i pack_saturate_and_clamp(
+    __m256i first,
+    __m256i second,
+    int8_t min_val,
+    int8_t max_val) {
+  __m256i packed_and_sat = _mm256_packs_epi16(first, second);
+  return _mm256_max_epi8(
+      _mm256_set1_epi8(min_val),
+      _mm256_min_epi8(packed_and_sat, _mm256_set1_epi8(max_val)));
+}
+
+template <>
+inline __m256i pack_saturate_and_clamp(
+    __m256i first,
+    __m256i second,
+    uint8_t min_val,
+    uint8_t max_val) {
+  __m256i packed_and_sat = _mm256_packus_epi16(first, second);
+  return _mm256_max_epu8(
+      _mm256_set1_epi8(min_val),
+      _mm256_min_epu8(packed_and_sat, _mm256_set1_epi8(max_val)));
+}
+
+template 
+typename std::enable_if::value || std::is_same::value, at::vec::Vectorized>::type
+inline convert_int8_to_float(at::vec::Vectorized src) {
+  // Note: this function only converts a number of input elements equal to at::vec::Vectorized<float>::size()
+  // Only handle first 8*8 bits
+  __m128i input_128 = _mm256_castsi256_si128(src);
+  // Convert from 8*uint8/int8 to 8*int32
+  __m256i input_256_int32;
+  if constexpr (std::is_same_v)
+    input_256_int32 = _mm256_cvtepu8_epi32(input_128);
+  else
+    input_256_int32 = _mm256_cvtepi8_epi32(input_128);
+  // Convert from 8*int32 to 8*float
+  return _mm256_cvtepi32_ps(input_256_int32);
+}
+
+template 
+typename std::enable_if::value || std::is_same::value, at::vec::Vectorized>::type
+inline convert_float_to_int8(at::vec::Vectorized src) {
+  // Convert from float32 to int32 with truncation
+  __m256i x_values_int32 = _mm256_cvttps_epi32(src);
+
+  // Convert from int32 to int16 using signed saturation
+  __m256i xy_packed_v = _mm256_packs_epi32(x_values_int32, x_values_int32);
+
+  constexpr auto min_val = std::numeric_limits::min();
+  constexpr auto max_val = std::numeric_limits::max();
+
+  // Convert from int16 to uint8/int8 using unsigned saturation
+  __m256i xyzw_clamped_v = pack_saturate_and_clamp(
+      xy_packed_v, xy_packed_v, min_val, max_val);
+  __m256i permute_mask_v =
+    _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);
+  return _mm256_permutevar8x32_epi32(xyzw_clamped_v, permute_mask_v);
+}
+
+template 
+inline void __attribute__((always_inline)) QuantizeAvx2(
+    const float* src,
+    T* dst,
+    int len,
+    float inverse_scale,
+    int64_t zero_point) {
+  constexpr int VLEN = 8;
+  constexpr auto min_val = std::numeric_limits::min();
+  constexpr auto max_val = std::numeric_limits::max();
+  const __m256i min_v = _mm256_set1_epi32(min_val);
+  const __m256i max_v = _mm256_set1_epi32(max_val);
+  // This is the largest int32 value < int32_max exactly representable in float
+  constexpr int32_t int32_float_max_val =
+      std::numeric_limits::max() - 127;
+  int i = 0;
+  __m256 inverse_scale_v = _mm256_set1_ps(inverse_scale);
+  // clang-format off
+  static const __m256i shuffle_mask_v = _mm256_set_epi8(
+      0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff,
+      0x0c, 0x08, 0x04, 0x00,
+      0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff,
+      0x0c, 0x08, 0x04, 0x00);
+  // clang-format on
+  __m256i permute_mask_v =
+      _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);
+  __m256i permute_mask_l8_v =
+      _mm256_set_epi32(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00);
+  int len_aligned = len / (VLEN * 4) * (VLEN * 4);
+  for (; i < len_aligned; i += 4 * VLEN) {
+    // x
+    __m256 x_vals = _mm256_load_ps(src + i);
+    __m256 x_transformed_v = _mm256_mul_ps(x_vals, inverse_scale_v);
+    // If the floating-point value is greater than int32_max,
+    // _mm256_cvtps_epi32 converts it to a negative value. Clip at
+    // int32_float_max_val to avoid this.
+    x_transformed_v =
+        _mm256_min_ps(x_transformed_v, _mm256_set1_ps(int32_float_max_val));
+    // y
+    __m256 y_vals = _mm256_load_ps(src + i + VLEN);
+    __m256 y_transformed_v = _mm256_mul_ps(y_vals, inverse_scale_v);
+    y_transformed_v =
+        _mm256_min_ps(y_transformed_v, _mm256_set1_ps(int32_float_max_val));
+    // z
+    __m256 z_vals = _mm256_load_ps(src + i + 2 * VLEN);
+    __m256 z_transformed_v = _mm256_mul_ps(z_vals, inverse_scale_v);
+    z_transformed_v =
+        _mm256_min_ps(z_transformed_v, _mm256_set1_ps(int32_float_max_val));
+    // w
+    __m256 w_vals = _mm256_load_ps(src + i + 3 * VLEN);
+    __m256 w_transformed_v = _mm256_mul_ps(w_vals, inverse_scale_v);
+    w_transformed_v =
+        _mm256_min_ps(w_transformed_v, _mm256_set1_ps(int32_float_max_val));
+
+    __m256i x_rounded_v = _mm256_cvtps_epi32(x_transformed_v);
+    __m256i y_rounded_v = _mm256_cvtps_epi32(y_transformed_v);
+    __m256i z_rounded_v = _mm256_cvtps_epi32(z_transformed_v);
+    __m256i w_rounded_v = _mm256_cvtps_epi32(w_transformed_v);
+
+    // add zero point
+    x_rounded_v = _mm256_add_epi32(x_rounded_v, _mm256_set1_epi32(zero_point));
+    y_rounded_v = _mm256_add_epi32(y_rounded_v, _mm256_set1_epi32(zero_point));
+    z_rounded_v = _mm256_add_epi32(z_rounded_v, _mm256_set1_epi32(zero_point));
+    w_rounded_v = _mm256_add_epi32(w_rounded_v, _mm256_set1_epi32(zero_point));
+
+    __m256i xy_packed_v = _mm256_packs_epi32(x_rounded_v, y_rounded_v);
+    __m256i zw_packed_v = _mm256_packs_epi32(z_rounded_v, w_rounded_v);
+    __m256i xyzw_clamped_v =
+        pack_saturate_and_clamp(xy_packed_v, zw_packed_v, min_val, max_val);
+
+    xyzw_clamped_v =
+        _mm256_permutevar8x32_epi32(xyzw_clamped_v, permute_mask_v);
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst + i), xyzw_clamped_v);
+  }
+
+  // Additional 8-lane AVX2 version to take advantage when len is smaller
+  // based on fbgemm::QuantizeAvx2 (https://github.com/pytorch/FBGEMM)
+  for (; i < len / VLEN * VLEN; i += VLEN) {
+    __m256 x_vals = _mm256_load_ps(src + i);
+    __m256 x_transformed_v = _mm256_mul_ps(x_vals, inverse_scale_v);
+    x_transformed_v =
+        _mm256_min_ps(x_transformed_v, _mm256_set1_ps(int32_float_max_val));
+    __m256i x_rounded_v = _mm256_cvtps_epi32(x_transformed_v);
+    x_rounded_v = _mm256_add_epi32(x_rounded_v, _mm256_set1_epi32(zero_point));
+    __m256i x_clipped_v =
+        _mm256_max_epi32(min_v, _mm256_min_epi32(max_v, x_rounded_v));
+
+    x_clipped_v = _mm256_shuffle_epi8(x_clipped_v, shuffle_mask_v);
+    x_clipped_v = _mm256_permutevar8x32_epi32(x_clipped_v, permute_mask_l8_v);
+    _mm_storel_epi64(
+        reinterpret_cast<__m128i*>(dst + i),
+        _mm256_castsi256_si128(x_clipped_v));
+  }
+
+  for (; i < len; ++i) {
+    float transformed = src[i] * inverse_scale;
+
+    // Not exactly the same behavior as the vectorized code.
+    // The vectorized code above always rounds to even in halfway cases
+    // (https://software.intel.com/en-us/node/523819), but std::nearbyint
+    // does the same only when the current rounding mode is FE_TONEAREST.
+    // However, in practice, this should not be a problem because most cases
+    // use the default rounding mode FE_TONEAREST.
+    // Note that we cannot implement the same behavior as the vectorized code
+    // using std::round because it does rounding away from zero in halfway
+    // cases.
+    transformed = zero_point + std::nearbyint(transformed);
+    float clipped =
+        std::min(std::max(transformed, float(min_val)), float(max_val));
+    dst[i] = clipped;
+  }
+}
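+
+// Worked scalar example of the quantization performed above (explanatory
+// comment, not upstream documentation): with inverse_scale = 2.0f (scale 0.5),
+// zero_point = 10 and a uint8_t destination, src = 3.2f maps to
+// nearbyint(3.2 * 2) + 10 = 6 + 10 = 16, while src = 200.0f maps to
+// nearbyint(400) + 10 = 410, which is clamped to the uint8_t maximum 255.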
+
+template<>
+struct Vectorized<c10::qint32> : public Vectorizedqi {
+    using size_type = int;
+    static constexpr size_type size() {
+        return 8;
+    }
+
+    static constexpr int float_num_vecs() {
+        return 1;
+    }
+
+    static constexpr int int_num_vecs() {
+        return 1;
+    }
+
+    using float_vec_return_type = std::array, 1>;
+    using int_vec_return_type = std::array, 1>;
+    using value_type = c10::qint32::underlying;
+
+ public:
+    using Vectorizedqi::Vectorizedqi;
+    Vectorized() {}
+
+    Vectorized(__m256i vals_) { vals = vals_;}
+
+    // Broadcast constructor
+    Vectorized(const c10::qint32& val) {
+        value_type uw = val.val_;
+        vals = _mm256_set1_epi32(uw);
+    }
+
+    void store(void* ptr, int count = size()) const {
+      if (count != size()) {
+        memcpy(ptr, &vals, count * sizeof(value_type));
+      } else {
+        _mm256_storeu_si256((__m256i*)ptr, vals);
+      }
+    }
+
+    static Vectorized loadu(const void* ptr) {
+        return Vectorized(ptr);
+    }
+
+    static Vectorized loadu(const void* ptr, int64_t count) {
+        __at_align__ value_type tmp_values[size()];
+        // Ensure uninitialized memory does not change the output value. See https://github.com/pytorch/pytorch/issues/32502
+        // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
+        // instructions while a loop would be compiled to one instruction.
+        for (const auto i : c10::irange(size())) {
+          tmp_values[i] = 0;
+        }
+        std::memcpy(
+            tmp_values, reinterpret_cast(ptr), count * sizeof(value_type));
+        return _mm256_loadu_si256((const __m256i*)tmp_values);
+    }
+
+    float_vec_return_type dequantize(
+        Vectorized scale,
+        Vectorized /*zero_point*/,
+        Vectorized scale_zp_premul) const {
+      __m256 float_vals = _mm256_cvtepi32_ps(vals);
+      return {vec::fmadd(scale, Vectorized(float_vals), scale_zp_premul)};
+    }
+
+    float_vec_return_type dequantize(
+        Vectorized scale,
+        Vectorized zero_point) const {
+      __m256 float_vals = _mm256_cvtepi32_ps(vals);
+      return {(Vectorized(float_vals) - zero_point) * scale};
+    }
+
+    static Vectorized quantize(
+        const float_vec_return_type& rhs,
+        float scale,
+        int32_t zero_point,
+        float /*inverse_scale*/) {
+      Vectorized retval;
+      auto rhs_data = (__m256)rhs[0];
+      at::native::quantize_vec(
+          scale, zero_point, (float*)&rhs_data, (c10::qint32*)&retval.vals, 8);
+      return retval;
+    }
+
+    Vectorized maximum(Vectorized b) const {
+      return _mm256_max_epi32(vals, b.vals);
+    }
+
+    Vectorized minimum(Vectorized b) const {
+      return _mm256_min_epi32(vals, b.vals);
+    }
+
+    Vectorized relu(Vectorized zero_point) const {
+        return maximum(zero_point);
+    }
+
+    Vectorized relu6(
+        Vectorized zero_point,
+        Vectorized q_six) {
+      return _mm256_min_epi32(
+          _mm256_max_epi32(vals, zero_point.vals), q_six.vals);
+    }
+
+    int_vec_return_type widening_subtract(Vectorized b) const {
+      return {_mm256_sub_epi32(vals, b)};
+    }
+
+    static Vectorized requantize_from_int(
+        const int_vec_return_type& inp,
+        float multiplier,
+        int32_t zero_point) {
+      __m256 multiplier_v = _mm256_set1_ps(multiplier);
+      __m256i zero_point_v = _mm256_set1_epi32(zero_point);
+
+      __m256 scaled = _mm256_mul_ps(_mm256_cvtepi32_ps(inp[0]), multiplier_v);
+      __m256i rounded = _mm256_cvtps_epi32(scaled);
+      return _mm256_add_epi32(rounded, zero_point_v);
+    }
+
+ private:
+    // Load from memory constructor
+    Vectorized(const void* ptr) {
+      vals = _mm256_loadu_si256((const __m256i*)ptr);
+    }
+};
+
+template <>
+Vectorized inline maximum(const Vectorized& a, const Vectorized& b) {
+  return a.maximum(b);
+}
+
+template <>
+Vectorized inline operator*(
+    const Vectorized& a,
+    const Vectorized& b) {
+  return _mm256_mullo_epi32(a, b);
+}
+
+template <>
+Vectorized inline operator+(
+    const Vectorized& a,
+    const Vectorized& b) {
+  return _mm256_add_epi32(a, b);
+}
+
+/*
+ * Convert values from int32 back to int8/uint8
+ */
+template 
+__m256i RequantizeAvx2(
+    const std::array, 4>& inp,
+    __m256 multiplier,
+    __m256i zp) {
+  static_assert(
+      std::is_same::value || std::is_same::value,
+      "Only int8_t/uint8_t are supported");
+  constexpr auto min_val = std::numeric_limits::min();
+  constexpr auto max_val = std::numeric_limits::max();
+  __m256i permute_mask_v =
+      _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);
+  __m256 x_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(inp[0]), multiplier);
+  __m256 y_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(inp[1]), multiplier);
+  __m256 z_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(inp[2]), multiplier);
+  __m256 w_scaled_v = _mm256_mul_ps(_mm256_cvtepi32_ps(inp[3]), multiplier);
+
+  __m256i x_rounded_v = _mm256_cvtps_epi32(x_scaled_v);
+  __m256i y_rounded_v = _mm256_cvtps_epi32(y_scaled_v);
+  __m256i z_rounded_v = _mm256_cvtps_epi32(z_scaled_v);
+  __m256i w_rounded_v = _mm256_cvtps_epi32(w_scaled_v);
+
+  /* Add zero point */
+  __m256i x_v = _mm256_add_epi32(x_rounded_v, zp);
+  __m256i y_v = _mm256_add_epi32(y_rounded_v, zp);
+  __m256i z_v = _mm256_add_epi32(z_rounded_v, zp);
+  __m256i w_v = _mm256_add_epi32(w_rounded_v, zp);
+
+  /* Pack to int16_t and saturate */
+  __m256i xy_packed_v = _mm256_packs_epi32(x_v, y_v);
+  __m256i zw_packed_v = _mm256_packs_epi32(z_v, w_v);
+
+  __m256i xyzw_clamped_v =
+      pack_saturate_and_clamp(xy_packed_v, zw_packed_v, min_val, max_val);
+
+  /*
+   * xyzw_clamped_v has results in the following layout so we need to
+   * permute: x0-3 y0-3 z0-3 w0-3 x4-7 y4-7 z4-7 w4-7
+   */
+  xyzw_clamped_v = _mm256_permutevar8x32_epi32(xyzw_clamped_v, permute_mask_v);
+  return xyzw_clamped_v;
+}
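+
+// Worked example of the requantization above (explanatory comment, not
+// upstream documentation): for a uint8_t target, an int32 lane value of 37
+// with multiplier 0.1f and zero point 128 becomes round(37 * 0.1) + 128 =
+// 4 + 128 = 132, which lies inside [0, 255] and passes through the saturating
+// pack in pack_saturate_and_clamp unchanged.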
+
+template<>
+struct Vectorized<c10::qint8> : public Vectorizedqi {
+    static constexpr int size() {
+        return 32;
+    }
+
+    static constexpr int float_num_vecs() {
+        return 4;
+    }
+
+    static constexpr int int_num_vecs() {
+        return 4;
+    }
+
+    using float_vec_return_type = std::array, 4>;
+    using int_vec_return_type = std::array, 4>;
+    using value_type = typename c10::qint8::underlying;
+
+ public:
+    using Vectorizedqi::Vectorizedqi;
+
+    Vectorized() {}
+    Vectorized(__m256i vals_) { vals = vals_;}
+
+    // Broadcast constructor
+    Vectorized(const c10::qint8& val) {
+        value_type uw = val.val_;
+        vals = _mm256_set1_epi8(uw);
+    }
+
+    // This is needed because the compiler emits awful code for the default
+    // constructor for moving the enum
+    // NOLINTNEXTLINE(clang-diagnostic-deprecated-copy)
+    C10_CLANG_DIAGNOSTIC_PUSH()
+    #if C10_CLANG_HAS_WARNING("-Wdeprecated-copy")
+    C10_CLANG_DIAGNOSTIC_IGNORE("-Wdeprecated-copy")
+    #endif
+    Vectorized(const Vectorized& other) : Vectorizedqi(other.vals) { }
+    C10_CLANG_DIAGNOSTIC_POP()
+
+    void store(void* ptr, int count = size()) const {
+        if (count != size()) {
+            memcpy(ptr, &vals, count * sizeof(value_type));
+        } else {
+            _mm256_storeu_si256((__m256i*)ptr, vals);
+        }
+    }
+
+    static Vectorized loadu(const void* ptr) {
+        return Vectorized(ptr);
+    }
+
+    static Vectorized loadu(const void* ptr, int64_t count) {
+        __at_align__ value_type tmp_values[size()];
+        // Ensure uninitialized memory does not change the output value. See https://github.com/pytorch/pytorch/issues/32502
+        // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
+        // instructions while a loop would be compiled to one instruction.
+        for (const auto i : c10::irange(size())) {
+          tmp_values[i] = 0;
+        }
+        std::memcpy(
+            tmp_values, reinterpret_cast(ptr), count * sizeof(value_type));
+        return _mm256_loadu_si256((const __m256i*)tmp_values);
+    }
+
+ private:
+    __m256i cvtepi8_epi32(__m128i epi8_vals) const {
+        return _mm256_cvtepi8_epi32(epi8_vals);
+    }
+
+ public:
+  float_vec_return_type dequantize(
+      Vectorized scale,
+      Vectorized /*zero_point*/,
+      Vectorized scale_neg_zp_premul) const {
+    __m128i int_val0 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 0));
+    __m128i int_val1 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 1));
+    __m128i int_val2 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 2));
+    __m128i int_val3 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 3));
+
+    __m256 float_val0 = _mm256_cvtepi32_ps(cvtepi8_epi32(int_val0));
+    __m256 float_val1 = _mm256_cvtepi32_ps(cvtepi8_epi32(int_val1));
+    __m256 float_val2 = _mm256_cvtepi32_ps(cvtepi8_epi32(int_val2));
+    __m256 float_val3 = _mm256_cvtepi32_ps(cvtepi8_epi32(int_val3));
+
+    auto val0 =
+        vec::fmadd(scale, Vectorized(float_val0), scale_neg_zp_premul);
+    auto val1 =
+        vec::fmadd(scale, Vectorized(float_val1), scale_neg_zp_premul);
+    auto val2 =
+        vec::fmadd(scale, Vectorized(float_val2), scale_neg_zp_premul);
+    auto val3 =
+        vec::fmadd(scale, Vectorized(float_val3), scale_neg_zp_premul);
+    return {val0, val1, val2, val3};
+  }
+
+  float_vec_return_type dequantize(
+      Vectorized scale,
+      Vectorized zero_point) const {
+    __m128i int_val0 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 0));
+    __m128i int_val1 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 1));
+    __m128i int_val2 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 2));
+    __m128i int_val3 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 3));
+
+    __m256 float_val0 = _mm256_cvtepi32_ps(cvtepi8_epi32(int_val0));
+    __m256 float_val1 = _mm256_cvtepi32_ps(cvtepi8_epi32(int_val1));
+    __m256 float_val2 = _mm256_cvtepi32_ps(cvtepi8_epi32(int_val2));
+    __m256 float_val3 = _mm256_cvtepi32_ps(cvtepi8_epi32(int_val3));
+
+    auto val0 = (Vectorized(float_val0) - zero_point) * scale;
+    auto val1 = (Vectorized(float_val1) - zero_point) * scale;
+    auto val2 = (Vectorized(float_val2) - zero_point) * scale;
+    auto val3 = (Vectorized(float_val3) - zero_point) * scale;
+    return {val0, val1, val2, val3};
+  }
+
+  static Vectorized quantize(
+      const float_vec_return_type& rhs,
+      float /*scale*/,
+      int32_t zero_point,
+      float inverse_scale) {
+    auto* rhs_data = (float*)rhs.data();
+    int8_t quantized_values[32];
+    QuantizeAvx2(
+        rhs_data, quantized_values, 32, inverse_scale, zero_point);
+    return Vectorized::loadu(quantized_values);
+  }
+
+  Vectorized maximum(Vectorized b) const {
+      return _mm256_max_epi8(vals, b.vals);
+    }
+
+  Vectorized minimum(Vectorized b) const {
+      return _mm256_min_epi8(vals, b.vals);
+    }
+
+    Vectorized relu(Vectorized zero_point) const {
+        return maximum(zero_point);
+    }
+
+    Vectorized relu6(
+        Vectorized zero_point,
+        Vectorized q_six) {
+      return _mm256_min_epi8(
+          _mm256_max_epi8(vals, zero_point.vals), q_six.vals);
+    }
+
+    int_vec_return_type widening_subtract(Vectorized b) const {
+      __m128i int_val0 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 0));
+      __m128i int_val1 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 1));
+      __m128i int_val2 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 2));
+      __m128i int_val3 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 3));
+
+      __m256i int32_val0 = cvtepi8_epi32(int_val0);
+      __m256i int32_val1 = cvtepi8_epi32(int_val1);
+      __m256i int32_val2 = cvtepi8_epi32(int_val2);
+      __m256i int32_val3 = cvtepi8_epi32(int_val3);
+
+      __m128i int_b0 = _mm_set1_epi64x(_mm256_extract_epi64(b, 0));
+      __m128i int_b1 = _mm_set1_epi64x(_mm256_extract_epi64(b, 1));
+      __m128i int_b2 = _mm_set1_epi64x(_mm256_extract_epi64(b, 2));
+      __m128i int_b3 = _mm_set1_epi64x(_mm256_extract_epi64(b, 3));
+
+      __m256i int32_b0 = cvtepi8_epi32(int_b0);
+      __m256i int32_b1 = cvtepi8_epi32(int_b1);
+      __m256i int32_b2 = cvtepi8_epi32(int_b2);
+      __m256i int32_b3 = cvtepi8_epi32(int_b3);
+
+      __m256i res_0 = _mm256_sub_epi32(int32_val0, int32_b0);
+      __m256i res_1 = _mm256_sub_epi32(int32_val1, int32_b1);
+      __m256i res_2 = _mm256_sub_epi32(int32_val2, int32_b2);
+      __m256i res_3 = _mm256_sub_epi32(int32_val3, int32_b3);
+
+      return {Vectorized(res_0),
+              Vectorized(res_1),
+              Vectorized(res_2),
+              Vectorized(res_3)};
+    }
+
+    static Vectorized requantize_from_int(
+        const int_vec_return_type& inp,
+        float multiplier,
+        int32_t zero_point) {
+      __m256 multiplier_v = _mm256_set1_ps(multiplier);
+      __m256i zero_point_v = _mm256_set1_epi32(zero_point);
+      return RequantizeAvx2(inp, multiplier_v, zero_point_v);
+    }
+
+ private:
+    // Load from memory constructor
+    Vectorized(const void* ptr) {
+        vals = _mm256_loadu_si256((const __m256i*)ptr);
+    }
+};
+
+template <>
+Vectorized inline maximum(const Vectorized& a, const Vectorized& b) {
+  return a.maximum(b);
+}
+
+template<>
+struct Vectorized<c10::quint8> : public Vectorizedqi {
+    static constexpr int size() {
+        return 32;
+    }
+
+    static constexpr int float_num_vecs() {
+        return 4;
+    }
+
+    static constexpr int int_num_vecs() {
+        return 4;
+    }
+
+    using float_vec_return_type = std::array, 4>;
+    using int_vec_return_type = std::array, 4>;
+    using value_type = typename c10::quint8::underlying;
+
+ public:
+    using Vectorizedqi::Vectorizedqi;
+    Vectorized() {}
+
+    Vectorized(__m256i vals_) { vals = vals_;}
+
+    // Broadcast constructor
+    Vectorized(const c10::quint8& val) {
+        value_type uw = val.val_;
+        vals = _mm256_set1_epi8(uw);
+    }
+
+    // NOLINTNEXTLINE(clang-diagnostic-deprecated-copy)
+    C10_CLANG_DIAGNOSTIC_PUSH()
+    #if C10_CLANG_HAS_WARNING("-Wdeprecated-copy")
+    C10_CLANG_DIAGNOSTIC_IGNORE("-Wdeprecated-copy")
+    #endif
+    Vectorized(const Vectorized& other) : Vectorizedqi(other.vals) { }
+    C10_CLANG_DIAGNOSTIC_POP()
+
+    void store(void* ptr, int count = size()) const {
+        if (count != size()) {
+            memcpy(ptr, &vals, count * sizeof(value_type));
+        } else {
+            _mm256_storeu_si256((__m256i*)ptr, vals);
+        }
+    }
+
+    static Vectorized loadu(const void* ptr) {
+        return Vectorized(ptr);
+    }
+
+    static Vectorized loadu(const void* ptr, int64_t count) {
+        __at_align__ value_type tmp_values[size()];
+        // Ensure uninitialized memory does not change the output value. See https://github.com/pytorch/pytorch/issues/32502
+        // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
+        // instructions while a loop would be compiled to one instruction.
+        for (const auto i : c10::irange(size())) {
+          tmp_values[i] = 0;
+        }
+        std::memcpy(
+            tmp_values, reinterpret_cast(ptr), count * sizeof(value_type));
+        return _mm256_loadu_si256((const __m256i*)tmp_values);
+    }
+
+ private:
+    __m256i cvtepu8_epi32(__m128i epu8_vals) const {
+        return _mm256_cvtepu8_epi32(epu8_vals);
+    }
+
+ public:
+  float_vec_return_type dequantize(
+      Vectorized scale,
+      Vectorized /*zero_point*/,
+      Vectorized scale_zp_premul) const {
+    __m128i int_val0 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 0));
+    __m128i int_val1 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 1));
+    __m128i int_val2 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 2));
+    __m128i int_val3 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 3));
+
+    __m256 float_val0 = _mm256_cvtepi32_ps(cvtepu8_epi32(int_val0));
+    __m256 float_val1 = _mm256_cvtepi32_ps(cvtepu8_epi32(int_val1));
+    __m256 float_val2 = _mm256_cvtepi32_ps(cvtepu8_epi32(int_val2));
+    __m256 float_val3 = _mm256_cvtepi32_ps(cvtepu8_epi32(int_val3));
+
+    auto val0 =
+        vec::fmadd(scale, Vectorized(float_val0), scale_zp_premul);
+    auto val1 =
+        vec::fmadd(scale, Vectorized(float_val1), scale_zp_premul);
+    auto val2 =
+        vec::fmadd(scale, Vectorized(float_val2), scale_zp_premul);
+    auto val3 =
+        vec::fmadd(scale, Vectorized(float_val3), scale_zp_premul);
+    return {val0, val1, val2, val3};
+  }
+
+  float_vec_return_type dequantize(
+      Vectorized scale,
+      Vectorized zero_point) const {
+    __m128i int_val0 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 0));
+    __m128i int_val1 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 1));
+    __m128i int_val2 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 2));
+    __m128i int_val3 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 3));
+
+    __m256 float_val0 = _mm256_cvtepi32_ps(cvtepu8_epi32(int_val0));
+    __m256 float_val1 = _mm256_cvtepi32_ps(cvtepu8_epi32(int_val1));
+    __m256 float_val2 = _mm256_cvtepi32_ps(cvtepu8_epi32(int_val2));
+    __m256 float_val3 = _mm256_cvtepi32_ps(cvtepu8_epi32(int_val3));
+
+    auto val0 = (Vectorized(float_val0) - zero_point) * scale;
+    auto val1 = (Vectorized(float_val1) - zero_point) * scale;
+    auto val2 = (Vectorized(float_val2) - zero_point) * scale;
+    auto val3 = (Vectorized(float_val3) - zero_point) * scale;
+    return {val0, val1, val2, val3};
+  }
+
+  static Vectorized quantize(
+      const float_vec_return_type& rhs,
+      float /*scale*/,
+      int32_t zero_point,
+      float inverse_scale) {
+    auto* rhs_data = (float*)rhs.data();
+    uint8_t quantized_values[32];
+    QuantizeAvx2(
+        rhs_data, quantized_values, 32, inverse_scale, zero_point);
+    return Vectorized::loadu(quantized_values);
+  }
+
+  Vectorized maximum(Vectorized b) const {
+      return _mm256_max_epu8(vals, b.vals);
+    }
+
+  Vectorized minimum(Vectorized b) const {
+      return _mm256_min_epu8(vals, b.vals);
+    }
+
+    Vectorized relu(Vectorized zero_point) const {
+        return maximum(zero_point);
+    }
+
+    Vectorized relu6(
+        Vectorized zero_point,
+        Vectorized q_six) {
+      return _mm256_min_epu8(
+          _mm256_max_epu8(vals, zero_point.vals), q_six.vals);
+    }
+
+    int_vec_return_type widening_subtract(Vectorized b) const {
+      __m128i int_val0 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 0));
+      __m128i int_val1 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 1));
+      __m128i int_val2 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 2));
+      __m128i int_val3 = _mm_set1_epi64x(_mm256_extract_epi64(vals, 3));
+
+      __m256i int32_val0 = cvtepu8_epi32(int_val0);
+      __m256i int32_val1 = cvtepu8_epi32(int_val1);
+      __m256i int32_val2 = cvtepu8_epi32(int_val2);
+      __m256i int32_val3 = cvtepu8_epi32(int_val3);
+
+      __m128i int_b0 = _mm_set1_epi64x(_mm256_extract_epi64(b, 0));
+      __m128i int_b1 = _mm_set1_epi64x(_mm256_extract_epi64(b, 1));
+      __m128i int_b2 = _mm_set1_epi64x(_mm256_extract_epi64(b, 2));
+      __m128i int_b3 = _mm_set1_epi64x(_mm256_extract_epi64(b, 3));
+
+      __m256i int32_b0 = cvtepu8_epi32(int_b0);
+      __m256i int32_b1 = cvtepu8_epi32(int_b1);
+      __m256i int32_b2 = cvtepu8_epi32(int_b2);
+      __m256i int32_b3 = cvtepu8_epi32(int_b3);
+
+      __m256i res_0 = _mm256_sub_epi32(int32_val0, int32_b0);
+      __m256i res_1 = _mm256_sub_epi32(int32_val1, int32_b1);
+      __m256i res_2 = _mm256_sub_epi32(int32_val2, int32_b2);
+      __m256i res_3 = _mm256_sub_epi32(int32_val3, int32_b3);
+      return {Vectorized(res_0),
+              Vectorized(res_1),
+              Vectorized(res_2),
+              Vectorized(res_3)};
+    }
+
+    static Vectorized requantize_from_int(
+        const int_vec_return_type& inp,
+        float multiplier,
+        int32_t zero_point) {
+      __m256 multiplier_v = _mm256_set1_ps(multiplier);
+      __m256i zero_point_v = _mm256_set1_epi32(zero_point);
+      return RequantizeAvx2(inp, multiplier_v, zero_point_v);
+    }
+
+ private:
+
+    // Load from memory constructor
+    Vectorized(const void* ptr) {
+        vals = _mm256_loadu_si256((const __m256i*)ptr);
+    }
+};
+
+template <>
+Vectorized inline maximum(const Vectorized& a, const Vectorized& b) {
+  return a.maximum(b);
+}
+
+#else
+
+// NOTE: These are low-performance implementations that we fall back on
+// if we are not building with AVX2. This may not be an issue, because
+// currently for quantization we assume the user has at least AVX512
+// installed, so these can simply act as a reference implementation.
+//
+// If in the future we relax this requirement (AVX2+), we should probably
+// revisit these implementations
+
+template <
+    typename T,
+    typename float_vec_return_type_,
+    typename int_vec_return_type_,
+    int size_>
+struct VectorizedQuantizedConverter {
+  static constexpr int size() {
+    return size_;
+  }
+
+  static constexpr int float_num_vecs() {
+    return size() / 8;
+  }
+
+  static constexpr int int_num_vecs() {
+    return size() / 8;
+  }
+
+  using float_vec_return_type = float_vec_return_type_;
+  using int_vec_return_type = int_vec_return_type_;
+
+  using value_type = typename T::underlying;
+  std::array vals;
+
+  VectorizedQuantizedConverter(T val) {
+    for (const auto i : c10::irange(size())) {
+      vals[i] = val.val_;
+    }
+  }
+
+  VectorizedQuantizedConverter(const void* ptr) {
+    memcpy(vals.data(), ptr, sizeof(value_type) * size());
+  }
+
+  void store(void* ptr, int count = size()) const {
+    memcpy(ptr, vals.data(), count * sizeof(value_type));
+  }
+
+  float_vec_return_type dequantize(
+      Vectorized scale,
+      Vectorized zero_point,
+      Vectorized /*scale_zp_premul*/) const {
+    float_vec_return_type rv;
+    for (const auto i : c10::irange(float_num_vecs())) {
+      float tmp_vals[8];
+      for (const auto j : c10::irange(8)) {
+        tmp_vals[j] = at::native::dequantize_val(
+            scale[j], zero_point[j], T(vals[8 * i + j]));
+      }
+      rv[i] = Vectorized(tmp_vals[0],
+          tmp_vals[1],
+          tmp_vals[2],
+          tmp_vals[3],
+          tmp_vals[4],
+          tmp_vals[5],
+          tmp_vals[6],
+          tmp_vals[7]);
+    }
+    return rv;
+  }
+
+  float_vec_return_type dequantize(
+      Vectorized scale,
+      Vectorized zero_point) const {
+    Vectorized scale_zp_premul;
+    return dequantize(scale, zero_point, scale_zp_premul);
+  }
+
+ protected:
+  VectorizedQuantizedConverter() {}
+};
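+
+// Illustrative note (explanatory comment, not upstream documentation): the
+// scalar fallback above dequantizes each element with
+// at::native::dequantize_val, i.e. scale * (q - zero_point); for example
+// scale = 0.25, zero_point = 3 and q = 11 give 0.25f * (11 - 3) = 2.0f.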
+
+template <>
+struct Vectorized : public VectorizedQuantizedConverter<
+                                 c10::qint32,
+                                 std::array, 1>,
+                                 std::array, 1>,
+                                 8> {
+  Vectorized()
+      : VectorizedQuantizedConverter<
+            c10::qint32,
+            std::array, 1>,
+            std::array, 1>,
+            8>() {}
+  Vectorized(c10::qint32 val)
+      : VectorizedQuantizedConverter<
+            c10::qint32,
+            std::array, 1>,
+            std::array, 1>,
+            8>(val) {}
+  Vectorized(const void* ptr)
+      : VectorizedQuantizedConverter<
+            c10::qint32,
+            std::array, 1>,
+            std::array, 1>,
+            8>(ptr) {}
+
+  static Vectorized loadu(const void* ptr) {
+    return Vectorized(ptr);
+  }
+
+  static Vectorized loadu(const void* ptr, int64_t count) {
+    __at_align__ value_type tmp_values[size()];
+    // Ensure uninitialized memory does not change the output value. See https://github.com/pytorch/pytorch/issues/32502
+    // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
+    // instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(size())) {
+      tmp_values[i] = 0;
+    }
+    std::memcpy(
+        tmp_values, reinterpret_cast(ptr), count * sizeof(value_type));
+    return Vectorized(tmp_values);
+  }
+
+  static Vectorized quantize(
+      const float_vec_return_type& rhs,
+      float scale,
+      int32_t zero_point,
+      float /*inverse_scale*/) {
+    std::array qvals;
+    std::array float_vals;
+
+    for (const auto i : c10::irange(float_num_vecs())) {
+      rhs[i].store(&float_vals[i * 8], 8);
+    }
+
+    at::native::quantize_vec(
+        scale,
+        zero_point,
+        float_vals.data(),
+        (c10::qint32*)qvals.data(),
+        8 * float_num_vecs());
+
+    return Vectorized::loadu(qvals.data());
+  }
+
+  Vectorized maximum(Vectorized b) const {
+    Vectorized retval;
+    for (const auto i : c10::irange(size())) {
+      retval.vals[i] = std::max(vals[i], b.vals[i]);
+    }
+    return retval;
+  }
+
+  Vectorized minimum(Vectorized b) const {
+    Vectorized retval;
+    for (const auto i : c10::irange(size())) {
+      retval.vals[i] = std::min(vals[i], b.vals[i]);
+    }
+    return retval;
+  }
+
+  Vectorized relu(Vectorized zero_point) const  {
+    return maximum(zero_point);
+  }
+
+
+  Vectorized relu6(
+      Vectorized zero_point,
+      Vectorized q_six) {
+    Vectorized retval;
+    for (const auto i : c10::irange(size())) {
+      retval.vals[i] = std::min(
+          std::max(vals[i], zero_point.vals[i]), q_six.vals[i]);
+    }
+    return retval;
+  }
+
+  int_vec_return_type widening_subtract(Vectorized b) const {
+    int_vec_return_type retval;
+    for (const auto i : c10::irange(size())) {
+      retval[0].vals[i] = vals[i] - b.vals[i];
+    }
+    return retval;
+  }
+
+  static Vectorized requantize_from_int(
+      const int_vec_return_type& inp,
+      float multiplier,
+      int32_t zero_point) {
+    Vectorized retval;
+    for (const auto i : c10::irange(size())) {
+      retval.vals[i] =
+          std::nearbyint(static_cast(inp[0].vals[i]) * multiplier) +
+          zero_point;
+    }
+    return retval;
+  }
+};
+
+template <>
+Vectorized inline maximum(const Vectorized& a, const Vectorized& b) {
+  return a.maximum(b);
+}
+
+template <>
+Vectorized inline operator*(
+    const Vectorized& a,
+    const Vectorized& b) {
+  Vectorized retval;
+  for (const auto i : c10::irange(std::decay_t::size())) {
+    retval.vals[i] = a.vals[i] * b.vals[i];
+  }
+  return retval;
+}
+
+template <>
+Vectorized<c10::qint32> inline operator+(
+    const Vectorized<c10::qint32>& a,
+    const Vectorized<c10::qint32>& b) {
+  Vectorized<c10::qint32> retval;
+  for (const auto i : c10::irange(std::decay_t<decltype(a)>::size())) {
+    retval.vals[i] = a.vals[i] + b.vals[i];
+  }
+  return retval;
+}
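Because this fallback keeps its lanes in a plain vals[] array, the operators above behave exactly like scalar loops. A minimal usage sketch (hypothetical values; assumes this generic fallback path is the active CPU_CAPABILITY):

    // Elementwise ops on the generic Vectorized<c10::qint32> (8 lanes).
    at::vec::Vectorized<c10::qint32> a(c10::qint32(3));   // broadcast 3
    at::vec::Vectorized<c10::qint32> b(c10::qint32(4));   // broadcast 4
    auto sum  = a + b;                     // per-lane integer add
    auto prod = a * b;                     // per-lane integer multiply
    auto mx   = at::vec::maximum(a, b);    // per-lane max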
+
+template <>
+struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
+                                c10::qint8,
+                                std::array<Vectorized<float>, 4>,
+                                std::array<Vectorized<c10::qint32>, 4>,
+                                32> {
+  Vectorized()
+      : VectorizedQuantizedConverter<
+            c10::qint8,
+            std::array<Vectorized<float>, 4>,
+            std::array<Vectorized<c10::qint32>, 4>,
+            32>() {}
+  Vectorized(c10::qint8 val)
+      : VectorizedQuantizedConverter<
+            c10::qint8,
+            std::array<Vectorized<float>, 4>,
+            std::array<Vectorized<c10::qint32>, 4>,
+            32>(val) {}
+  Vectorized(const void* ptr)
+      : VectorizedQuantizedConverter<
+            c10::qint8,
+            std::array<Vectorized<float>, 4>,
+            std::array<Vectorized<c10::qint32>, 4>,
+            32>(ptr) {}
+
+  static Vectorized loadu(const void* ptr) {
+    return Vectorized(ptr);
+  }
+
+  static Vectorized loadu(const void* ptr, int64_t count) {
+    __at_align__ value_type tmp_values[size()];
+    // Ensure uninitialized memory does not change the output value. See https://github.com/pytorch/pytorch/issues/32502
+    // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
+    // instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(size())) {
+      tmp_values[i] = 0;
+    }
+    std::memcpy(
+        tmp_values, reinterpret_cast<const value_type*>(ptr), count * sizeof(value_type));
+    return Vectorized(tmp_values);
+  }
+
+  static Vectorized quantize(
+      const float_vec_return_type& rhs,
+      float scale,
+      int32_t zero_point,
+      float /*inverse_scale*/) {
+    std::array<value_type, size()> qvals;
+    std::array<float, float_num_vecs() * 8> float_vals;
+
+    for (const auto i : c10::irange(float_num_vecs())) {
+      rhs[i].store(&float_vals[i * 8], 8);
+    }
+
+    at::native::quantize_vec(
+        scale,
+        zero_point,
+        float_vals.data(),
+        (c10::qint8*)qvals.data(),
+        8 * float_num_vecs());
+
+    return Vectorized::loadu(qvals.data());
+  }
+
+  Vectorized maximum(Vectorized b) const {
+    Vectorized retval;
+    for (const auto i : c10::irange(size())) {
+      retval.vals[i] = std::max(vals[i], b.vals[i]);
+    }
+    return retval;
+  }
+
+  Vectorized minimum(Vectorized b) const {
+    Vectorized retval;
+    for (const auto i : c10::irange(size())) {
+      retval.vals[i] = std::min(vals[i], b.vals[i]);
+    }
+    return retval;
+  }
+
+  Vectorized relu(Vectorized zero_point) const {
+    return maximum(zero_point);
+  }
+
+  Vectorized relu6(
+      Vectorized zero_point,
+      Vectorized q_six) {
+    Vectorized retval;
+    for (const auto i : c10::irange(size())) {
+      retval.vals[i] = std::min(
+          std::max(vals[i], zero_point.vals[i]), q_six.vals[i]);
+    }
+    return retval;
+  }
+
+  int_vec_return_type widening_subtract(Vectorized b) const {
+    int_vec_return_type retval;
+    constexpr int elem_per_int_vec = size() / int_num_vecs();
+    for (const auto i : c10::irange(int_num_vecs())) {
+      for (const auto j : c10::irange(elem_per_int_vec)) {
+        retval[i].vals[j] =
+            static_cast<int32_t>(vals[i * elem_per_int_vec + j]) -
+            static_cast<int32_t>(b.vals[i * elem_per_int_vec + j]);
+      }
+    }
+    return retval;
+  }
+  static Vectorized requantize_from_int(
+      const int_vec_return_type& inp,
+      float multiplier,
+      int32_t zero_point) {
+    constexpr int elem_per_int_vec = size() / int_num_vecs();
+    constexpr auto min_val = std::numeric_limits<value_type>::min();
+    constexpr auto max_val = std::numeric_limits<value_type>::max();
+    Vectorized retval;
+    for (const auto i : c10::irange(int_num_vecs())) {
+      for (const auto j : c10::irange(elem_per_int_vec)) {
+        int32_t rounded =
+            std::nearbyint(static_cast<float>(inp[i].vals[j]) * multiplier) +
+            zero_point;
+        retval.vals[i * elem_per_int_vec + j] =
+            std::min<int32_t>(std::max<int32_t>(rounded, min_val), max_val);
+      }
+    }
+    return retval;
+  }
+};
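widening_subtract() and requantize_from_int() above are the two halves of the usual quantized-add recipe: subtract in int32 so nothing overflows, then rescale the int32 intermediate back into the narrow type. A scalar sketch of that flow for one int8 lane (illustrative only; multiplier would typically be input_scale / output_scale):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Hypothetical one-lane model of widening_subtract + requantize_from_int.
    int8_t requantize_diff(int8_t a, int8_t b, float multiplier, int32_t zero_point) {
      int32_t wide = static_cast<int32_t>(a) - static_cast<int32_t>(b);         // widening subtract
      int32_t rounded =
          static_cast<int32_t>(std::nearbyint(wide * multiplier)) + zero_point; // rescale
      rounded = std::min<int32_t>(std::max<int32_t>(rounded, -128), 127);       // clamp to int8 range
      return static_cast<int8_t>(rounded);
    }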
+
+template <>
+Vectorized<c10::qint8> inline maximum(const Vectorized<c10::qint8>& a, const Vectorized<c10::qint8>& b) {
+  return a.maximum(b);
+}
+
+template <>
+struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
+                                 c10::quint8,
+                                 std::array<Vectorized<float>, 4>,
+                                 std::array<Vectorized<c10::qint32>, 4>,
+                                 32> {
+  Vectorized()
+      : VectorizedQuantizedConverter<
+            c10::quint8,
+            std::array<Vectorized<float>, 4>,
+            std::array<Vectorized<c10::qint32>, 4>,
+            32>() {}
+  Vectorized(c10::quint8 val)
+      : VectorizedQuantizedConverter<
+            c10::quint8,
+            std::array<Vectorized<float>, 4>,
+            std::array<Vectorized<c10::qint32>, 4>,
+            32>(val) {}
+  Vectorized(const void* ptr)
+      : VectorizedQuantizedConverter<
+            c10::quint8,
+            std::array<Vectorized<float>, 4>,
+            std::array<Vectorized<c10::qint32>, 4>,
+            32>(ptr) {}
+
+  static Vectorized loadu(const void* ptr) {
+    return Vectorized(ptr);
+  }
+
+  static Vectorized loadu(const void* ptr, int64_t count) {
+    __at_align__ value_type tmp_values[size()];
+    // Ensure uninitialized memory does not change the output value. See https://github.com/pytorch/pytorch/issues/32502
+    // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
+    // instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(size())) {
+      tmp_values[i] = 0;
+    }
+    std::memcpy(
+        tmp_values, reinterpret_cast<const value_type*>(ptr), count * sizeof(value_type));
+    return Vectorized(tmp_values);
+  }
+
+  static Vectorized quantize(
+      const float_vec_return_type& rhs,
+      float scale,
+      int32_t zero_point,
+      float /*inverse_scale*/) {
+    std::array<value_type, size()> qvals;
+    std::array<float, float_num_vecs() * 8> float_vals;
+
+    for (const auto i : c10::irange(float_num_vecs())) {
+      rhs[i].store(&float_vals[i * 8], 8);
+    }
+
+    at::native::quantize_vec(
+        scale,
+        zero_point,
+        float_vals.data(),
+        (c10::quint8*)qvals.data(),
+        8 * float_num_vecs());
+
+    return Vectorized::loadu(qvals.data());
+  }
+
+  Vectorized maximum(Vectorized b) const {
+    Vectorized retval;
+    for (const auto i : c10::irange(size())) {
+      retval.vals[i] = std::max(vals[i], b.vals[i]);
+    }
+    return retval;
+  }
+
+  Vectorized minimum(Vectorized b) const {
+    Vectorized retval;
+    for (const auto i : c10::irange(size())) {
+      retval.vals[i] = std::min(vals[i], b.vals[i]);
+    }
+    return retval;
+  }
+
+  Vectorized relu(Vectorized zero_point) const {
+    return maximum(zero_point);
+  }
+
+
+  Vectorized relu6(
+      Vectorized zero_point,
+      Vectorized q_six) {
+    Vectorized retval;
+    for (const auto i : c10::irange(size())) {
+      retval.vals[i] = std::min(
+          std::max(vals[i], zero_point.vals[i]), q_six.vals[i]);
+    }
+    return retval;
+  }
+
+  int_vec_return_type widening_subtract(Vectorized b) const {
+    int_vec_return_type retval;
+    constexpr int elem_per_int_vec = size() / int_num_vecs();
+    for (const auto i : c10::irange(int_num_vecs())) {
+      for (const auto j : c10::irange(elem_per_int_vec)) {
+        retval[i].vals[j] =
+            static_cast<int32_t>(vals[i * elem_per_int_vec + j]) -
+            static_cast<int32_t>(b.vals[i * elem_per_int_vec + j]);
+      }
+    }
+    return retval;
+  }
+  static Vectorized requantize_from_int(
+      const int_vec_return_type& inp,
+      float multiplier,
+      int32_t zero_point) {
+    constexpr int elem_per_int_vec = size() / int_num_vecs();
+    constexpr auto min_val = std::numeric_limits<value_type>::min();
+    constexpr auto max_val = std::numeric_limits<value_type>::max();
+    Vectorized retval;
+    for (const auto i : c10::irange(int_num_vecs())) {
+      for (const auto j : c10::irange(elem_per_int_vec)) {
+        int32_t rounded =
+            std::nearbyint(static_cast<float>(inp[i].vals[j]) * multiplier) +
+            zero_point;
+        retval.vals[i * elem_per_int_vec + j] =
+            std::min<int32_t>(std::max<int32_t>(rounded, min_val), max_val);
+      }
+    }
+    return retval;
+  }
+};
+
+template <>
+Vectorized<c10::quint8> inline maximum(const Vectorized<c10::quint8>& a, const Vectorized<c10::quint8>& b) {
+  return a.maximum(b);
+}
+
+#endif // if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
+}} // namespace at::vec::CPU_CAPABILITY
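For reference, the three specializations above all share the VectorizedQuantizedConverter layout: the last template argument is the number of quantized lanes, and float_num_vecs()/int_num_vecs() follow from it (8 floats per Vectorized<float>). A compile-time sketch of that geometry (the static_asserts are illustrative, not part of the header):

    // qint32 : 8 lanes  -> 1 x Vectorized<float>, 1 x Vectorized<c10::qint32>
    // qint8  : 32 lanes -> 4 x Vectorized<float>, 4 x Vectorized<c10::qint32>
    // quint8 : 32 lanes -> 4 x Vectorized<float>, 4 x Vectorized<c10::qint32>
    static_assert(at::vec::Vectorized<c10::qint32>::size() == 8, "");
    static_assert(at::vec::Vectorized<c10::qint8>::size() == 32, "");
    static_assert(at::vec::Vectorized<c10::quint8>::size() == 32, "");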
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_bfloat16_vsx.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_bfloat16_vsx.h
new file mode 100644
index 0000000000000000000000000000000000000000..d904c712ed61d39267e0e9a1e580d50a7e943614
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_bfloat16_vsx.h
@@ -0,0 +1,73 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+namespace vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+inline std::tuple<Vectorized<float>, Vectorized<float>> convert_bfloat16_float(
+    const Vectorized<BFloat16>& a) {
+  constexpr int64_t K = Vectorized<BFloat16>::size();
+  __at_align__ float arr[K];
+  __at_align__ BFloat16 arr2[K];
+  a.store(arr2);
+  convert(arr2, arr, K);
+  return std::make_tuple(
+      Vectorized<float>::loadu(arr),
+      Vectorized<float>::loadu(arr + Vectorized<float>::size()));
+}
+
+inline Vectorized<BFloat16> convert_float_bfloat16(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  constexpr int64_t K = Vectorized<BFloat16>::size();
+  __at_align__ float arr[K];
+  __at_align__ BFloat16 arr2[K];
+  a.store(arr);
+  b.store(arr + Vectorized<float>::size());
+  convert(arr, arr2, K);
+  return Vectorized<BFloat16>::loadu(arr2);
+}
+
+inline void load_fp32_from_bf16(const c10::BFloat16* data, Vectorized<float>& out) {
+  __at_align__ float values[Vectorized<float>::size()];
+  for (const auto k : c10::irange(Vectorized<float>::size())) {
+    values[k] = data[k];
+  }
+  out = Vectorized<float>::loadu(values);
+}
+
+inline void load_fp32_from_bf16(
+    const c10::BFloat16* data,
+    Vectorized<float>& out1,
+    Vectorized<float>& out2) {
+  load_fp32_from_bf16(data, out1);
+  data += Vectorized<float>::size();
+  load_fp32_from_bf16(data, out2);
+}
+
+inline void load_fp32_from_fp16(const c10::Half* data, Vectorized<float>& out) {
+  __at_align__ float values[Vectorized<float>::size()];
+  for (const auto k : c10::irange(Vectorized<float>::size())) {
+    values[k] = data[k];
+  }
+  out = Vectorized<float>::loadu(values);
+}
+
+inline void load_fp32_from_fp16(
+    const c10::Half* data,
+    Vectorized<float>& out1,
+    Vectorized<float>& out2) {
+  load_fp32_from_fp16(data, out1);
+  data += Vectorized<float>::size();
+  load_fp32_from_fp16(data, out2);
+}
+
+} // namespace
+} // namespace vec
+} // namespace at
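These VSX helpers fall back to element-wise conversion through aligned scratch buffers. A short usage sketch of the round trip (hypothetical values; assumes this VSX header is the active CPU_CAPABILITY and the vec headers are included):

    using at::vec::Vectorized;

    // 16 bf16 lanes widen to two float vectors and narrow back again.
    Vectorized<c10::BFloat16> x(c10::BFloat16(1.5f));           // broadcast
    auto [lo, hi] = at::vec::convert_bfloat16_float(x);         // 2 x Vectorized<float>
    Vectorized<c10::BFloat16> y = at::vec::convert_float_bfloat16(lo, hi);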
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_common_vsx.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_common_vsx.h
new file mode 100644
index 0000000000000000000000000000000000000000..c3f8ae2fc513430289ae989355b540dc20527123
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_common_vsx.h
@@ -0,0 +1,246 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+// Note: header order is important here
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+#include 
+
+namespace at {
+namespace vec {
+
+inline namespace CPU_CAPABILITY {
+
+DEFINE_CLAMP_FUNCS(c10::quint8)
+DEFINE_CLAMP_FUNCS(c10::qint8)
+DEFINE_CLAMP_FUNCS(c10::qint32)
+DEFINE_CLAMP_FUNCS(int16_t)
+DEFINE_CLAMP_FUNCS(int32_t)
+DEFINE_CLAMP_FUNCS(int64_t)
+DEFINE_CLAMP_FUNCS(float)
+DEFINE_CLAMP_FUNCS(double)
+
+template <>
+Vectorized C10_ALWAYS_INLINE fmadd(
+    const Vectorized& a,
+    const Vectorized& b,
+    const Vectorized& c) {
+  return Vectorized{
+      vec_madd(a.vec0(), b.vec0(), c.vec0()),
+      vec_madd(a.vec1(), b.vec1(), c.vec1())};
+}
+
+template <>
+Vectorized C10_ALWAYS_INLINE fmadd(
+    const Vectorized& a,
+    const Vectorized& b,
+    const Vectorized& c) {
+  return Vectorized{
+      a.vec0() * b.vec0() + c.vec0(), a.vec1() * b.vec1() + c.vec1()};
+}
+template <>
+Vectorized C10_ALWAYS_INLINE fmadd(
+    const Vectorized& a,
+    const Vectorized& b,
+    const Vectorized& c) {
+  return Vectorized{
+      a.vec0() * b.vec0() + c.vec0(), a.vec1() * b.vec1() + c.vec1()};
+}
+template <>
+Vectorized C10_ALWAYS_INLINE fmadd(
+    const Vectorized& a,
+    const Vectorized& b,
+    const Vectorized& c) {
+  return Vectorized{
+      a.vec0() * b.vec0() + c.vec0(), a.vec1() * b.vec1() + c.vec1()};
+}
+
+DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(float)
+DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(double)
+DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(int64_t)
+DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(int32_t)
+DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(int16_t)
+
+template <>
+Vectorized C10_ALWAYS_INLINE
+convert_to_int_of_same_size(const Vectorized& src) {
+  return Vectorized{vec_signed(src.vec0()), vec_signed(src.vec1())};
+}
+
+template <>
+Vectorized C10_ALWAYS_INLINE
+convert_to_int_of_same_size(
+    const Vectorized& src) {
+  return Vectorized{vec_signed(src.vec0()), vec_signed(src.vec1())};
+}
+
+template <>
+inline void convert(const int32_t* src, float* dst, int64_t n) {
+  // int32_t and float have same size
+  int64_t i;
+  for (i = 0; i <= (n - Vectorized<float>::size()); i += Vectorized<float>::size()) {
+    const int32_t* src_a = src + i;
+    float* dst_a = dst + i;
+    vint32 input_vec0 = vec_vsx_ld(offset0, reinterpret_cast<const vint32*>(src_a));
+    vint32 input_vec1 =
+        vec_vsx_ld(offset16, reinterpret_cast<const vint32*>(src_a));
+    vfloat32 c0 = vec_float(input_vec0);
+    vfloat32 c1 = vec_float(input_vec1);
+    vec_vsx_st(c0, offset0, dst_a);
+    vec_vsx_st(c1, offset16, dst_a);
+  }
+
+  for (; i < n; i++) {
+    dst[i] = static_cast<float>(src[i]);
+  }
+}
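The convert() specializations here follow the standard main-loop/tail pattern: process full vector-width blocks, then finish the remainder scalar by scalar. The same shape, written generically (a sketch, not code from this header):

    #include <cstdint>

    // Hypothetical generic form of the loop above: vec_body handles one
    // full block of width W starting at index i, the tail loop handles leftovers.
    template <typename SrcT, typename DstT, int W, typename VecFn>
    void convert_blocked(const SrcT* src, DstT* dst, int64_t n, VecFn vec_body) {
      int64_t i = 0;
      for (; i <= n - W; i += W) {
        vec_body(src + i, dst + i);             // full-width block
      }
      for (; i < n; ++i) {
        dst[i] = static_cast<DstT>(src[i]);     // scalar tail
      }
    }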
+
+template <>
+inline void convert(const int64_t* src, double* dst, int64_t n) {
+  int64_t i;
+  for (i = 0; i <= (n - Vectorized<double>::size()); i += Vectorized<double>::size()) {
+    const int64_t* src_a = src + i;
+    double* dst_a = dst + i;
+    vint64 input_vec0 =
+        vec_vsx_ld(offset0, reinterpret_cast<const vint64*>(src_a));
+    vint64 input_vec1 =
+        vec_vsx_ld(offset16, reinterpret_cast<const vint64*>(src_a));
+    vfloat64 c0 = vec_double(input_vec0);
+    vfloat64 c1 = vec_double(input_vec1);
+    vec_vsx_st(c0, offset0, reinterpret_cast<double*>(dst_a));
+    vec_vsx_st(c1, offset16, reinterpret_cast<double*>(dst_a));
+  }
+  for (; i < n; i++) {
+    dst[i] = static_cast<double>(src[i]);
+  }
+}
+//Generic implementation to fix compiler error
+//TO-DO : Add optimized version for ppc64
+inline std::tuple<Vectorized<float>, Vectorized<float>> convert_half_float(
+    const Vectorized<Half>& a) {
+  constexpr int64_t K = Vectorized<Half>::size();
+  __at_align__ float arr[K];
+  __at_align__ Half arr2[K];
+  a.store(arr2);
+  convert(arr2, arr, K);
+  return std::make_tuple(
+       Vectorized<float>::loadu(arr),
+       Vectorized<float>::loadu(arr + Vectorized<float>::size()));
+}
+
+inline Vectorized<Half> convert_float_half(
+    const Vectorized<float>& a, const Vectorized<float>& b) {
+  constexpr int64_t K = Vectorized<Half>::size();
+  __at_align__ float arr[K];
+  __at_align__ Half arr2[K];
+  a.store(arr);
+  b.store(arr + Vectorized<float>::size());
+  convert(arr, arr2, K);
+  return Vectorized<Half>::loadu(arr2);
+};
+
+template <>
+std::pair<Vectorized<double>, Vectorized<double>> inline interleave2<double>(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  // inputs:
+  //   a      = {a0, a1, a2, a3}
+  //   b      = {b0, b1, b2, b3}
+
+  vfloat64 ab00 = vec_xxpermdi(a.vec0(), b.vec0(), 0);
+  vfloat64 ab11 = vec_xxpermdi(a.vec0(), b.vec0(), 3);
+  vfloat64 ab2_00 = vec_xxpermdi(a.vec1(), b.vec1(), 0);
+  vfloat64 ab2_11 = vec_xxpermdi(a.vec1(), b.vec1(), 3);
+  //   return {a0, b0, a1, b1}
+  //          {a2, b2, a3, b3}
+  return std::make_pair(
+      Vectorized<double>{ab00, ab11}, Vectorized<double>{ab2_00, ab2_11});
+}
+
+template <>
+std::pair<Vectorized<double>, Vectorized<double>> inline deinterleave2<double>(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  // inputs:
+  //   a = {a0, b0, a1, b1}
+  //   b = {a2, b2, a3, b3}
+  vfloat64 aa01 = vec_xxpermdi(a.vec0(), a.vec1(), 0);
+  vfloat64 aa23 = vec_xxpermdi(b.vec0(), b.vec1(), 0);
+
+  vfloat64 bb_01 = vec_xxpermdi(a.vec0(), a.vec1(), 3);
+  vfloat64 bb_23 = vec_xxpermdi(b.vec0(), b.vec1(), 3);
+
+  // swap lanes:
+  //   return {a0, a1, a2, a3}
+  //          {b0, b1, b2, b3}
+  return std::make_pair(
+      Vectorized<double>{aa01, aa23}, Vectorized<double>{bb_01, bb_23});
+}
+
+template <>
+std::pair<Vectorized<float>, Vectorized<float>> inline interleave2<float>(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  // inputs:
+  //   a = {a0, a1, a2, a3,, a4, a5, a6, a7}
+  //   b = {b0, b1, b2, b3,, b4, b5, b6, b7}
+
+  vfloat32 ab0011 = vec_mergeh(a.vec0(), b.vec0());
+  vfloat32 ab2233 = vec_mergel(a.vec0(), b.vec0());
+
+  vfloat32 ab2_0011 = vec_mergeh(a.vec1(), b.vec1());
+  vfloat32 ab2_2233 = vec_mergel(a.vec1(), b.vec1());
+  // group cols crossing lanes:
+  //   return {a0, b0, a1, b1,, a2, b2, a3, b3}
+  //          {a4, b4, a5, b5,, a6, b6, a7, b7}
+
+  return std::make_pair(
+      Vectorized<float>{ab0011, ab2233}, Vectorized<float>{ab2_0011, ab2_2233});
+}
+
+template <>
+std::pair<Vectorized<float>, Vectorized<float>> inline deinterleave2<float>(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b) {
+  // inputs:
+  //   a = {a0, b0, a1, b1,, a2, b2, a3, b3}
+  //   b = {a4, b4, a5, b5,, a6, b6, a7, b7}
+
+  // {a0,a2,b0,b2} {a1,a3,b1,b3}
+  vfloat32 a0a2b0b2 = vec_mergeh(a.vec0(), a.vec1());
+  vfloat32 a1a3b1b3 = vec_mergel(a.vec0(), a.vec1());
+
+  vfloat32 aa0123 = vec_mergeh(a0a2b0b2, a1a3b1b3);
+  vfloat32 bb0123 = vec_mergel(a0a2b0b2, a1a3b1b3);
+
+  vfloat32 a0a2b0b2_2 = vec_mergeh(b.vec0(), b.vec1());
+  vfloat32 a1a3b1b3_2 = vec_mergel(b.vec0(), b.vec1());
+
+  vfloat32 aa0123_2 = vec_mergeh(a0a2b0b2_2, a1a3b1b3_2);
+  vfloat32 bb0123_2 = vec_mergel(a0a2b0b2_2, a1a3b1b3_2);
+
+  // it could be done with vec_perm ,too
+  // swap lanes:
+  //   return {a0, a1, a2, a3,, a4, a5, a6, a7}
+  //          {b0, b1, b2, b3,, b4, b5, b6, b7}
+
+  return std::make_pair(
+      Vectorized<float>{aa0123, aa0123_2}, Vectorized<float>{bb0123, bb0123_2});
+}
+
+} // namespace
+} // namespace vec
+} // namespace at
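interleave2/deinterleave2 above convert between struct-of-arrays and array-of-structs orderings two lanes at a time, and deinterleave2 undoes interleave2. A scalar model of the float case (illustrative only):

    #include <array>
    #include <utility>

    // Hypothetical scalar model of interleave2<float> on 8-lane vectors:
    // ({a0..a7}, {b0..b7}) -> ({a0,b0,a1,b1,a2,b2,a3,b3}, {a4,b4,...,a7,b7}).
    std::pair<std::array<float, 8>, std::array<float, 8>> interleave2_ref(
        const std::array<float, 8>& a, const std::array<float, 8>& b) {
      std::array<float, 8> lo{}, hi{};
      for (int i = 0; i < 4; ++i) {
        lo[2 * i] = a[i];
        lo[2 * i + 1] = b[i];
        hi[2 * i] = a[4 + i];
        hi[2 * i + 1] = b[4 + i];
      }
      return {lo, hi};
    }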
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f7147213550af8d7dd980482ad8ad30b1f458d0
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h
@@ -0,0 +1,560 @@
+#pragma once
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+namespace vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+using ComplexDbl = c10::complex<double>;
+
+template <>
+class Vectorized<ComplexDbl> {
+  union {
+    struct {
+      vfloat64 _vec0;
+      vfloat64 _vec1;
+    };
+    struct {
+      vbool64 _vecb0;
+      vbool64 _vecb1;
+    };
+
+  } __attribute__((__may_alias__));
+
+ public:
+  using value_type = ComplexDbl;
+  using vec_internal_type = vfloat64;
+  using vec_internal_mask_type = vbool64;
+  using size_type = int;
+  static constexpr size_type size() {
+    return 2;
+  }
+  Vectorized() {}
+  C10_ALWAYS_INLINE Vectorized(vfloat64 v) : _vec0{v}, _vec1{v} {}
+  C10_ALWAYS_INLINE Vectorized(vbool64 vmask) : _vecb0{vmask}, _vecb1{vmask} {}
+  C10_ALWAYS_INLINE Vectorized(vfloat64 v1, vfloat64 v2) : _vec0{v1}, _vec1{v2} {}
+  C10_ALWAYS_INLINE Vectorized(vbool64 v1, vbool64 v2) : _vecb0{v1}, _vecb1{v2} {}
+
+  Vectorized(ComplexDbl val) {
+    double real_value = val.real();
+    double imag_value = val.imag();
+    _vec0 = vfloat64{real_value, imag_value};
+    _vec1 = vfloat64{real_value, imag_value};
+  }
+  Vectorized(ComplexDbl val1, ComplexDbl val2) {
+    _vec0 = vfloat64{val1.real(), val1.imag()};
+    _vec1 = vfloat64{val2.real(), val2.imag()};
+  }
+
+  C10_ALWAYS_INLINE const vec_internal_type& vec0() const {
+    return _vec0;
+  }
+  C10_ALWAYS_INLINE const vec_internal_type& vec1() const {
+    return _vec1;
+  }
+
+  template <int64_t mask>
+  static std::enable_if_t<mask == 0, Vectorized<ComplexDbl>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<ComplexDbl>& a, const Vectorized<ComplexDbl>& b) {
+    return a;
+  }
+
+  template <int64_t mask>
+  static std::enable_if_t<mask == 3, Vectorized<ComplexDbl>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<ComplexDbl>& a, const Vectorized<ComplexDbl>& b) {
+    return b;
+  }
+
+  template <int64_t mask>
+  static std::enable_if_t<mask == 1, Vectorized<ComplexDbl>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<ComplexDbl>& a, const Vectorized<ComplexDbl>& b) {
+    return {b._vec0, a._vec1};
+  }
+
+  template <int64_t mask>
+  static std::enable_if_t<mask == 2, Vectorized<ComplexDbl>>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized<ComplexDbl>& a, const Vectorized<ComplexDbl>& b) {
+    return {a._vec0, b._vec1};
+  }
+
+  template <int64_t mask>
+  static Vectorized C10_ALWAYS_INLINE
+  el_blend(const Vectorized& a, const Vectorized& b) {
+    const vbool64 mask_1st = VsxDblMask1(mask);
+    const vbool64 mask_2nd = VsxDblMask2(mask);
+    return {
+        (vfloat64)vec_sel(a._vec0, b._vec0, mask_1st),
+        (vfloat64)vec_sel(a._vec1, b._vec1, mask_2nd)};
+  }
+
+  static Vectorized blendv(
+      const Vectorized& a,
+      const Vectorized& b,
+      const Vectorized& mask) {
+    // convert std::complex index mask to V index mask: xy -> xxyy
+    auto mask_complex =
+        Vectorized(vec_splat(mask._vec0, 0), vec_splat(mask._vec1, 0));
+    return {
+        vec_sel(a._vec0, b._vec0, mask_complex._vecb0),
+        vec_sel(a._vec1, b._vec1, mask_complex._vecb1)};
+  }
+
+  static Vectorized C10_ALWAYS_INLINE elwise_blendv(
+      const Vectorized& a,
+      const Vectorized& b,
+      const Vectorized& mask) {
+    return {
+        vec_sel(a._vec0, b._vec0, mask._vecb0),
+        vec_sel(a._vec1, b._vec1, mask._vecb1)};
+  }
+  template <typename step_t>
+  static Vectorized arange(
+      ComplexDbl base = 0.,
+      step_t step = static_cast<step_t>(1)) {
+    return Vectorized(base, base + step);
+  }
+  static Vectorized set(
+      const Vectorized& a,
+      const Vectorized& b,
+      int64_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+    }
+    return b;
+  }
+
+  static Vectorized C10_ALWAYS_INLINE
+  loadu(const void* ptr, int count = size()) {
+    if (count == size()) {
+      return {
+          vec_vsx_ld(offset0, reinterpret_cast(ptr)),
+          vec_vsx_ld(offset16, reinterpret_cast(ptr))};
+    }
+
+    __at_align__ value_type tmp_values[size()] = {};
+    std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type));
+
+    return {
+        vec_vsx_ld(offset0, reinterpret_cast(tmp_values)),
+        vec_vsx_ld(offset16, reinterpret_cast(tmp_values))};
+  }
+  void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      vec_vsx_st(_vec0, offset0, reinterpret_cast(ptr));
+      vec_vsx_st(_vec1, offset16, reinterpret_cast(ptr));
+    } else if (count > 0) {
+      __at_align__ value_type tmp_values[size()];
+      vec_vsx_st(_vec0, offset0, reinterpret_cast(tmp_values));
+      vec_vsx_st(_vec1, offset16, reinterpret_cast(tmp_values));
+      std::memcpy(
+          ptr, tmp_values, std::min(count, size()) * sizeof(value_type));
+    }
+  }
+
+  const ComplexDbl& operator[](int idx) const = delete;
+  ComplexDbl& operator[](int idx) = delete;
+
+  Vectorized map(ComplexDbl (*const f)(ComplexDbl)) const {
+    __at_align__ ComplexDbl tmp[size()];
+    store(tmp);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = f(tmp[i]);
+    }
+    return loadu(tmp);
+  }
+
+  Vectorized map(ComplexDbl (*const f)(const ComplexDbl&)) const {
+    __at_align__ ComplexDbl tmp[size()];
+    store(tmp);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = f(tmp[i]);
+    }
+    return loadu(tmp);
+  }
+
+  Vectorized el_swapped() const {
+    vfloat64 v0 = vec_xxpermdi(_vec0, _vec0, 2);
+    vfloat64 v1 = vec_xxpermdi(_vec1, _vec1, 2);
+    return {v0, v1};
+  }
+
+  Vectorized el_madd(
+      const Vectorized& multiplier,
+      const Vectorized& val) const {
+    return {
+        vec_madd(_vec0, multiplier._vec0, val._vec0),
+        vec_madd(_vec1, multiplier._vec1, val._vec1)};
+  }
+
+  Vectorized el_mergeo() const {
+    vfloat64 v0 = vec_splat(_vec0, 1);
+    vfloat64 v1 = vec_splat(_vec1, 1);
+    return {v0, v1};
+  }
+
+  Vectorized el_mergee() const {
+    vfloat64 v0 = vec_splat(_vec0, 0);
+    vfloat64 v1 = vec_splat(_vec1, 0);
+    return {v0, v1};
+  }
+
+  static Vectorized el_mergee(
+      Vectorized& first,
+      Vectorized& second) {
+    return {
+        vec_mergeh(first._vec0, second._vec0),
+        vec_mergeh(first._vec1, second._vec1)};
+  }
+
+  static Vectorized el_mergeo(
+      Vectorized& first,
+      Vectorized& second) {
+    return {
+        vec_mergel(first._vec0, second._vec0),
+        vec_mergel(first._vec1, second._vec1)};
+  }
+
+  Vectorized abs_2_() const {
+    auto a = (*this).elwise_mult(*this);
+    auto permuted = a.el_swapped();
+    a = a + permuted;
+    return a;
+  }
+
+  Vectorized abs_() const {
+    auto vi = el_mergeo();
+    auto vr = el_mergee();
+    return {Sleef_hypotd2_u05vsx(vr._vec0, vi._vec0), Sleef_hypotd2_u05vsx(vr._vec1, vi._vec1)};
+  }
+
+  Vectorized abs() const {
+    return abs_() & vd_real_mask;
+  }
+
+  Vectorized angle_() const {
+    // angle = atan2(b/a)
+    // auto b_a = _mm256_permute_pd(values, 0x05);     // b        a
+    // return Sleef_atan2d4_u10(values, b_a);          // 90-angle angle
+    Vectorized ret;
+    ret._vec0[0] = std::atan2(_vec0[1], _vec0[0]);
+    ret._vec1[0] = std::atan2(_vec1[1], _vec1[0]);
+    return ret;
+  }
+
+  Vectorized angle() const {
+    return angle_() & vd_real_mask;
+  }
+
+  Vectorized real_() const {
+    return *this & vd_real_mask;
+  }
+  Vectorized real() const {
+    return *this & vd_real_mask;
+  }
+  Vectorized imag_() const {
+    return *this & vd_imag_mask;
+  }
+  Vectorized imag() const {
+    return imag_().el_swapped();
+  }
+
+  Vectorized conj_() const {
+    return *this ^ vd_isign_mask;
+  }
+  Vectorized conj() const {
+    return *this ^ vd_isign_mask;
+  }
+
+  Vectorized log() const {
+    // Most trigonomic ops use the log() op to improve complex number
+    // performance.
+    return map(std::log);
+  }
+
+  Vectorized log2() const {
+    // log2eB_inv
+    auto ret = log();
+    return ret.elwise_mult(vd_log2e_inv);
+  }
+  Vectorized log10() const {
+    auto ret = log();
+    return ret.elwise_mult(vd_log10e_inv);
+  }
+
+  Vectorized log1p() const {
+    return map(std::log1p);
+  }
+
+  Vectorized asin() const {
+    // asin(x)
+    // = -i*ln(iz + sqrt(1 -z^2))
+    // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi)))
+    // = -i*ln((-b + ai) + sqrt(1 - (a**2 - b**2) - 2*abi))
+    auto conj = conj_();
+    auto b_a = conj.el_swapped();
+    auto ab = conj.elwise_mult(b_a);
+    auto im = ab + ab;
+    auto val_2 = (*this).elwise_mult(*this);
+    auto val_2_swapped = val_2.el_swapped();
+    auto re = horizontal_sub(val_2, val_2_swapped);
+    re = Vectorized(vd_one) - re;
+    auto root = el_blend<0x0A>(re, im).sqrt();
+    auto ln = (b_a + root).log();
+    return ln.el_swapped().conj();
+  }
+
+  Vectorized acos() const {
+    // acos(x) = pi/2 - asin(x)
+    return Vectorized(vd_pi_2) - asin();
+  }
+
+  Vectorized atan() const {
+    // atan(x) = i/2 * ln((i + z)/(i - z))
+    auto ione = Vectorized(vd_imag_one);
+    auto sum = ione + *this;
+    auto sub = ione - *this;
+    auto ln = (sum / sub).log(); // ln((i + z)/(i - z))
+    return ln * vd_imag_half; // i/2*ln()
+  }
+  Vectorized atanh() const {
+    return map(std::atanh);
+  }
+
+  Vectorized sin() const {
+    return map(std::sin);
+  }
+  Vectorized sinh() const {
+    return map(std::sinh);
+  }
+  Vectorized cos() const {
+    return map(std::cos);
+  }
+  Vectorized cosh() const {
+    return map(std::cosh);
+  }
+
+  Vectorized tan() const {
+    return map(std::tan);
+  }
+  Vectorized tanh() const {
+    return map(std::tanh);
+  }
+  Vectorized ceil() const {
+    return {vec_ceil(_vec0), vec_ceil(_vec1)};
+  }
+  Vectorized floor() const {
+    return {vec_floor(_vec0), vec_floor(_vec1)};
+  }
+  Vectorized neg() const {
+    auto z = Vectorized(vd_zero);
+    return z - *this;
+  }
+  Vectorized round() const {
+    return {vec_rint(_vec0), vec_rint(_vec1)};
+  }
+
+  Vectorized trunc() const {
+    return {vec_trunc(_vec0), vec_trunc(_vec1)};
+  }
+
+  Vectorized elwise_sqrt() const {
+    return {vec_sqrt(_vec0), vec_sqrt(_vec1)};
+  }
+
+  Vectorized sqrt() const {
+    return map(std::sqrt);
+  }
+
+  Vectorized reciprocal() const {
+    // re + im*i = (a + bi)  / (c + di)
+    // re = (ac + bd)/abs_2() = c/abs_2()
+    // im = (bc - ad)/abs_2() = d/abs_2()
+    auto c_d = *this ^ vd_isign_mask; // c       -d
+    auto abs = abs_2_();
+    return c_d.elwise_div(abs);
+  }
+
+  Vectorized rsqrt() const {
+    return sqrt().reciprocal();
+  }
+
+  static Vectorized horizontal_add(
+      Vectorized& first,
+      Vectorized& second) {
+    // Operates on individual floats, see _mm_hadd_ps
+    // {f0+f1, s0+s1, f2+f3, s2+s3, ...}
+    // i.e. it sums the re and im of each value and interleaves first and second:
+    // {f_re0 + f_im0, s_re0 + s_im0, f_re1 + f_im1, s_re1 + s_im1, ...}
+    return el_mergee(first, second) + el_mergeo(first, second);
+  }
+
+  static Vectorized horizontal_sub(
+      Vectorized& first,
+      Vectorized& second) {
+    // we will simulate it differently with 6 instructions total
+    // lets permute second so that we can add it getting horizontal sums
+    auto first_perm = first.el_swapped(); // 2perm
+    auto second_perm = second.el_swapped(); // 2perm
+    // summ
+    auto first_ret = first - first_perm; // 2sub
+    auto second_ret = second - second_perm; // 2 sub
+    // now lets choose evens
+    return el_mergee(first_ret, second_ret); // 2 mergee's
+  }
+
+  Vectorized inline operator*(const Vectorized& b) const {
+    //(a + bi)  * (c + di) = (ac - bd) + (ad + bc)i
+#if 1
+    // this is more vsx friendly than simulating horizontal from x86
+    auto vi = b.el_mergeo();
+    auto vr = b.el_mergee();
+    vi = vi ^ vd_rsign_mask;
+    auto ret = elwise_mult(vr);
+    auto vx_swapped = el_swapped();
+    ret = vx_swapped.el_madd(vi, ret);
+#else
+    auto ac_bd = elwise_mult(b);
+    auto d_c = b.el_swapped();
+    d_c = d_c ^ vd_isign_mask;
+    auto ad_bc = elwise_mult(d_c);
+    auto ret = horizontal_sub(ac_bd, ad_bc);
+#endif
+    return ret;
+  }
+
+  Vectorized inline operator/(const Vectorized& b) const {
+    // re + im*i = (a + bi)  / (c + di)
+    // re = (ac + bd)/abs_2()
+    // im = (bc - ad)/abs_2()
+    auto fabs_cd =  Vectorized{
+      vec_andc(b._vec0, vd_sign_mask),
+      vec_andc(b._vec1, vd_sign_mask)};       // |c|            |d|
+    auto fabs_dc =  fabs_cd.el_swapped();     // |d|            |c|
+    auto scale = fabs_cd.elwise_max(fabs_dc); // sc = max(|c|, |d|)
+    auto a2 = elwise_div(scale);              // a/sc           b/sc
+    auto b2 = b.elwise_div(scale);            // c/sc           d/sc
+    auto acbd2 = a2.elwise_mult(b2);          // ac/sc^2        bd/sc^2
+    auto dc2 = b2.el_swapped();               // d/sc           c/sc
+    dc2 = dc2 ^ vd_rsign_mask;                // -d/sc          c/sc
+    auto adbc2 = a2.elwise_mult(dc2);         // -ad/sc^2       bc/sc^2
+    auto ret = horizontal_add(acbd2, adbc2);  // (ac+bd)/sc^2   (bc-ad)/sc^2
+    auto denom2 = b2.abs_2_();                // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2
+    ret = ret.elwise_div(denom2);
+    return ret;
+  }
+
+  Vectorized exp() const {
+    return map(std::exp);
+  }
+  Vectorized exp2() const {
+    return map(exp2_impl);
+  }
+  Vectorized expm1() const {
+    return map(std::expm1);
+  }
+
+  Vectorized pow(const Vectorized& exp) const {
+    __at_align__ ComplexDbl x_tmp[size()];
+    __at_align__ ComplexDbl y_tmp[size()];
+    store(x_tmp);
+    exp.store(y_tmp);
+    for (const auto i : c10::irange(size())) {
+      x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]);
+    }
+    return loadu(x_tmp);
+  }
+
+  Vectorized sgn() const {
+    return map(at::native::sgn_impl);
+  }
+
+  Vectorized operator<(const Vectorized& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+  Vectorized operator<=(const Vectorized& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+  Vectorized operator>(const Vectorized& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+  Vectorized operator>=(const Vectorized& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+
+  Vectorized eq(const Vectorized& other) const {
+    auto eq = (*this == other);  // compares real and imag individually
+    // If both real numbers and imag numbers are equal, then the complex numbers are equal
+    return (eq.real() & eq.imag()) & vd_one;
+  }
+  Vectorized ne(const Vectorized& other) const {
+    auto ne = (*this != other);  // compares real and imag individually
+    // If either real numbers or imag numbers are not equal, then the complex numbers are not equal
+    return (ne.real() | ne.imag()) & vd_one;
+  }
+
+  DEFINE_MEMBER_OP(operator==, ComplexDbl, vec_cmpeq)
+  DEFINE_MEMBER_OP(operator!=, ComplexDbl, vec_cmpne)
+
+  DEFINE_MEMBER_OP(operator+, ComplexDbl, vec_add)
+  DEFINE_MEMBER_OP(operator-, ComplexDbl, vec_sub)
+  DEFINE_MEMBER_OP(operator&, ComplexDbl, vec_and)
+  DEFINE_MEMBER_OP(operator|, ComplexDbl, vec_or)
+  DEFINE_MEMBER_OP(operator^, ComplexDbl, vec_xor)
+  // elementwise helpers
+  DEFINE_MEMBER_OP(elwise_mult, ComplexDbl, vec_mul)
+  DEFINE_MEMBER_OP(elwise_div, ComplexDbl, vec_div)
+  DEFINE_MEMBER_OP(elwise_gt, ComplexDbl, vec_cmpgt)
+  DEFINE_MEMBER_OP(elwise_ge, ComplexDbl, vec_cmpge)
+  DEFINE_MEMBER_OP(elwise_lt, ComplexDbl, vec_cmplt)
+  DEFINE_MEMBER_OP(elwise_le, ComplexDbl, vec_cmple)
+  DEFINE_MEMBER_OP(elwise_max, ComplexDbl, vec_max)
+};
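operator* above avoids x86-style horizontal adds: it splits b into its real parts (el_mergee) and imaginary parts (el_mergeo), flips signs where needed, and finishes with a fused multiply-add. In scalar terms the same computation is just the textbook product (a sketch for one element):

    #include <complex>

    // (a+bi)(c+di) = (ac - bd) + (ad + bc)i; the el_mergee/el_mergeo/el_madd
    // sequence computes exactly this, two complex doubles at a time.
    std::complex<double> mul_ref(std::complex<double> x, std::complex<double> y) {
      double a = x.real(), b = x.imag(), c = y.real(), d = y.imag();
      return {a * c - b * d, a * d + b * c};
    }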
+
+template <>
+Vectorized<ComplexDbl> inline maximum(
+    const Vectorized<ComplexDbl>& a,
+    const Vectorized<ComplexDbl>& b) {
+  auto abs_a = a.abs_2_();
+  auto abs_b = b.abs_2_();
+  // auto mask = _mm256_cmp_ps(abs_a, abs_b, _CMP_LT_OQ);
+  // auto max = _mm256_blendv_ps(a, b, mask);
+  auto mask = abs_a.elwise_lt(abs_b);
+  auto max = Vectorized<ComplexDbl>::elwise_blendv(a, b, mask);
+
+  return max;
+  // Exploit the fact that all-ones is a NaN.
+  // auto isnan = _mm256_cmp_ps(abs_a, abs_b, _CMP_UNORD_Q);
+  // return _mm256_or_ps(max, isnan);
+}
+
+template <>
+Vectorized<ComplexDbl> inline minimum(
+    const Vectorized<ComplexDbl>& a,
+    const Vectorized<ComplexDbl>& b) {
+  auto abs_a = a.abs_2_();
+  auto abs_b = b.abs_2_();
+  // auto mask = _mm256_cmp_ps(abs_a, abs_b, _CMP_GT_OQ);
+  // auto min = _mm256_blendv_ps(a, b, mask);
+  auto mask = abs_a.elwise_gt(abs_b);
+  auto min = Vectorized<ComplexDbl>::elwise_blendv(a, b, mask);
+  return min;
+  // Exploit the fact that all-ones is a NaN.
+  // auto isnan = _mm256_cmp_ps(abs_a, abs_b, _CMP_UNORD_Q);
+  // return _mm256_or_ps(min, isnan);
+}
+
+
+} // namespace
+} // namespace vec
+} // namespace at
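The operator/ above pre-scales both operands by max(|c|, |d|) before forming (ac+bd) and (bc-ad), which keeps the intermediate products from overflowing or underflowing for extreme inputs; the scale cancels in the final ratio. A scalar sketch of that scheme (illustrative, ignoring NaN/inf edge cases):

    #include <cmath>
    #include <complex>

    // Hypothetical scalar version of the scaled division used above.
    std::complex<double> div_scaled(std::complex<double> x, std::complex<double> y) {
      double a = x.real(), b = x.imag(), c = y.real(), d = y.imag();
      double sc = std::max(std::fabs(c), std::fabs(d));   // scale factor
      double a2 = a / sc, b2 = b / sc, c2 = c / sc, d2 = d / sc;
      double denom = c2 * c2 + d2 * d2;                   // |y|^2 / sc^2
      return {(a2 * c2 + b2 * d2) / denom, (b2 * c2 - a2 * d2) / denom};
    }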
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h
new file mode 100644
index 0000000000000000000000000000000000000000..8894381bfc718518788e18f17af0c5fb9d5734f1
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h
@@ -0,0 +1,628 @@
+
+#pragma once
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+namespace vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+using ComplexFlt = c10::complex<float>;
+
+template <>
+class Vectorized<ComplexFlt> {
+ private:
+  union {
+    struct {
+      vfloat32 _vec0;
+      vfloat32 _vec1;
+    };
+    struct {
+      vbool32 _vecb0;
+      vbool32 _vecb1;
+    };
+
+  } __attribute__((__may_alias__));
+
+ public:
+  using value_type = ComplexFlt;
+  using vec_internal_type = vfloat32;
+  using vec_internal_mask_type = vbool32;
+  using size_type = int;
+
+  static constexpr size_type size() {
+    return 4;
+  }
+  Vectorized() {}
+
+  C10_ALWAYS_INLINE Vectorized(vfloat32 v) : _vec0{v}, _vec1{v} {}
+  C10_ALWAYS_INLINE Vectorized(vbool32 vmask) : _vecb0{vmask}, _vecb1{vmask} {}
+  C10_ALWAYS_INLINE Vectorized(vfloat32 v1, vfloat32 v2) : _vec0{v1}, _vec1{v2} {}
+  C10_ALWAYS_INLINE Vectorized(vbool32 v1, vbool32 v2) : _vecb0{v1}, _vecb1{v2} {}
+
+  Vectorized(ComplexFlt val) {
+    float real_value = val.real();
+    float imag_value = val.imag();
+    _vec0 = vfloat32{real_value, imag_value, real_value, imag_value};
+    _vec1 = vfloat32{real_value, imag_value, real_value, imag_value};
+  }
+
+  Vectorized(ComplexFlt val1, ComplexFlt val2, ComplexFlt val3, ComplexFlt val4) {
+    _vec0 = vfloat32{val1.real(), val1.imag(), val2.real(), val2.imag()};
+    _vec1 = vfloat32{val3.real(), val3.imag(), val4.real(), val4.imag()};
+  }
+
+  template 
+  static std::enable_if_t>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized& a, const Vectorized& b) {
+    return a;
+  }
+
+  template 
+  static std::enable_if_t>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized& a, const Vectorized& b) {
+    return b;
+  }
+
+  template 
+  static std::enable_if_t>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized& a, const Vectorized& b) {
+    return {b._vec0, a._vec1};
+  }
+
+  template 
+  static std::enable_if_t>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized& a, const Vectorized& b) {
+    return {a._vec0, b._vec1};
+  }
+
+  template 
+  static std::enable_if_t>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized& a, const Vectorized& b) {
+    const vbool32 mask_1st = VsxComplexMask1(mask);
+    return {(vfloat32)vec_sel(a._vec0, b._vec0, mask_1st), a._vec1};
+  }
+
+  template 
+  static std::enable_if_t>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized& a, const Vectorized& b) {
+    const vbool32 mask_1st = VsxComplexMask1(mask);
+    return {(vfloat32)vec_sel(a._vec0, b._vec0, mask_1st), b._vec1};
+  }
+
+  template 
+  static std::enable_if_t>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized& a, const Vectorized& b) {
+    const vbool32 mask_2nd = VsxComplexMask2(mask);
+    // generated masks
+    return {a._vec0, (vfloat32)vec_sel(a._vec1, b._vec1, mask_2nd)};
+  }
+
+  template 
+  static std::enable_if_t>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized& a, const Vectorized& b) {
+    const vbool32 mask_2nd = VsxComplexMask2(mask);
+    // generated masks
+    return {b._vec0, (vfloat32)vec_sel(a._vec1, b._vec1, mask_2nd)};
+  }
+
+  template 
+  static std::enable_if_t>
+      C10_ALWAYS_INLINE
+      blend(const Vectorized& a, const Vectorized& b) {
+    const vbool32 mask_1st = VsxComplexMask1(mask);
+    const vbool32 mask_2nd = VsxComplexMask2(mask);
+    return {
+        (vfloat32)vec_sel(a._vec0, b._vec0, mask_1st),
+        (vfloat32)vec_sel(a._vec1, b._vec1, mask_2nd)};
+  }
+
+  template 
+  static Vectorized C10_ALWAYS_INLINE
+  el_blend(const Vectorized& a, const Vectorized& b) {
+    const vbool32 mask_1st = VsxMask1(mask);
+    const vbool32 mask_2nd = VsxMask2(mask);
+    return {
+        (vfloat32)vec_sel(a._vec0, b._vec0, mask_1st),
+        (vfloat32)vec_sel(a._vec1, b._vec1, mask_2nd)};
+  }
+
+  static Vectorized blendv(
+      const Vectorized& a,
+      const Vectorized& b,
+      const Vectorized& mask) {
+    // convert std::complex index mask to V index mask: xy -> xxyy
+    auto mask_complex = Vectorized(
+        vec_mergeh(mask._vec0, mask._vec0), vec_mergeh(mask._vec1, mask._vec1));
+    return {
+        vec_sel(a._vec0, b._vec0, reinterpret_cast(mask_complex._vec0)),
+        vec_sel(a._vec1, b._vec1, reinterpret_cast(mask_complex._vec1)),
+    };
+  }
+
+  static Vectorized elwise_blendv(
+      const Vectorized& a,
+      const Vectorized& b,
+      const Vectorized& mask) {
+    return {
+        vec_sel(a._vec0, b._vec0, reinterpret_cast(mask._vec0)),
+        vec_sel(a._vec1, b._vec1, reinterpret_cast(mask._vec1)),
+    };
+  }
+
+  template <typename step_t>
+  static Vectorized arange(
+      ComplexFlt base = 0.,
+      step_t step = static_cast<step_t>(1)) {
+    return Vectorized(
+        base,
+        base + step,
+        base + ComplexFlt(2) * step,
+        base + ComplexFlt(3) * step);
+  }
+  static Vectorized set(
+      const Vectorized& a,
+      const Vectorized& b,
+      int64_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+    }
+    return b;
+  }
+
+  static Vectorized C10_ALWAYS_INLINE
+  loadu(const void* ptr, int count = size()) {
+    if (count == size()) {
+      return {
+          vec_vsx_ld(offset0, reinterpret_cast(ptr)),
+          vec_vsx_ld(offset16, reinterpret_cast(ptr))};
+    }
+
+    __at_align__ value_type tmp_values[size()] = {};
+    std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type));
+
+    return {
+        vec_vsx_ld(offset0, reinterpret_cast(tmp_values)),
+        vec_vsx_ld(offset16, reinterpret_cast(tmp_values))};
+  }
+
+  void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      vec_vsx_st(_vec0, offset0, reinterpret_cast(ptr));
+      vec_vsx_st(_vec1, offset16, reinterpret_cast(ptr));
+    } else if (count > 0) {
+      __at_align__ value_type tmp_values[size()];
+      vec_vsx_st(_vec0, offset0, reinterpret_cast(tmp_values));
+      vec_vsx_st(_vec1, offset16, reinterpret_cast(tmp_values));
+      std::memcpy(
+          ptr, tmp_values, std::min(count, size()) * sizeof(value_type));
+    }
+  }
+
+  const ComplexFlt& operator[](int idx) const = delete;
+  ComplexFlt& operator[](int idx) = delete;
+
+  Vectorized map(ComplexFlt (*const f)(ComplexFlt)) const {
+    __at_align__ ComplexFlt tmp[size()];
+    store(tmp);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = f(tmp[i]);
+    }
+    return loadu(tmp);
+  }
+
+  Vectorized map(ComplexFlt (*const f)(const ComplexFlt&)) const {
+    __at_align__ ComplexFlt tmp[size()];
+    store(tmp);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = f(tmp[i]);
+    }
+    return loadu(tmp);
+  }
+
+  static Vectorized horizontal_add(
+      Vectorized& first,
+      Vectorized& second) {
+    // Operates on individual floats, see _mm_hadd_ps
+    // {f0+f1, s0+s1, f2+f3, s2+s3, ...}
+    // i.e. it sums the re and im of each value and interleaves first and second:
+    // {f_re0 + f_im0, s_re0 + s_im0, f_re1 + f_im1, s_re1 + s_im1, ...}
+    return el_mergee(first, second) + el_mergeo(first, second);
+  }
+
+  static Vectorized horizontal_sub_permD8(
+      Vectorized& first,
+      Vectorized& second) {
+    // we will simulate it differently with 6 instructions total
+    // lets permute second so that we can add it getting horizontal sums
+    auto first_perm = first.el_swapped(); // 2perm
+    auto second_perm = second.el_swapped(); // 2perm
+    // sum
+    auto first_ret = first - first_perm; // 2sub
+    auto second_ret = second - second_perm; // 2 sub
+    // now lets choose evens
+    return el_mergee(first_ret, second_ret); // 2 mergee's
+  }
+
+  Vectorized abs_2_() const {
+    auto a = (*this).elwise_mult(*this);
+    auto permuted = a.el_swapped();
+    a = a + permuted;
+    return a.el_mergee();
+  }
+
+  Vectorized abs_() const {
+    auto vi = el_mergeo();
+    auto vr = el_mergee();
+    return {Sleef_hypotf4_u05vsx(vr._vec0, vi._vec0), Sleef_hypotf4_u05vsx(vr._vec1, vi._vec1)};
+  }
+
+  Vectorized abs() const {
+    return abs_() & real_mask;
+  }
+
+  Vectorized real_() const {
+    return *this & real_mask;
+  }
+  Vectorized real() const {
+    return *this & real_mask;
+  }
+  Vectorized imag_() const {
+    return *this & imag_mask;
+  }
+  Vectorized imag() const {
+    // we can use swap_mask or sldwi
+    auto ret = imag_();
+    return {
+        vec_sldw(ret._vec0, ret._vec0, 3), vec_sldw(ret._vec1, ret._vec1, 3)};
+  }
+
+  Vectorized conj_() const {
+    return *this ^ isign_mask;
+  }
+  Vectorized conj() const {
+    return *this ^ isign_mask;
+  }
+
+  Vectorized log() const {
+    // Most trigonomic ops use the log() op to improve complex number
+    // performance.
+    return map(std::log);
+  }
+
+  Vectorized log2() const {
+    // log2eB_inv
+    auto ret = log();
+    return ret.elwise_mult(log2e_inv);
+  }
+  Vectorized log10() const {
+    auto ret = log();
+    return ret.elwise_mult(log10e_inv);
+  }
+
+  Vectorized log1p() const {
+    return map(std::log1p);
+  }
+
+  Vectorized el_swapped() const {
+    vfloat32 v0 = vec_perm(_vec0, _vec0, swap_mask);
+    vfloat32 v1 = vec_perm(_vec1, _vec1, swap_mask);
+    return {v0, v1};
+  }
+
+  Vectorized el_mergee() const {
+    // as mergee phased in , we can use vec_perm with mask
+    return {vec_mergee(_vecb0, _vecb0), vec_mergee(_vecb1, _vecb1)};
+  }
+
+  Vectorized el_mergeo() const {
+    // as mergeo phased in , we can use vec_perm with mask
+    return {vec_mergeo(_vecb0, _vecb0), vec_mergeo(_vecb1, _vecb1)};
+  }
+
+  Vectorized el_madd(
+      const Vectorized& multiplier,
+      const Vectorized& val) const {
+    return {
+        vec_madd(_vec0, multiplier._vec0, val._vec0),
+        vec_madd(_vec1, multiplier._vec1, val._vec1)};
+  }
+
+  static Vectorized el_mergee(
+      Vectorized& first,
+      Vectorized& second) {
+    return {
+        vec_mergee(first._vecb0, second._vecb0),
+        vec_mergee(first._vecb1, second._vecb1)};
+  }
+
+  static Vectorized el_mergeo(
+      Vectorized& first,
+      Vectorized& second) {
+    return {
+        vec_mergeo(first._vecb0, second._vecb0),
+        vec_mergeo(first._vecb1, second._vecb1)};
+  }
+
+  Vectorized angle_() const {
+    // angle = atan2(b/a)
+    // auto b_a = _mm256_permute_ps(values, 0xB1); // b        a
+    // return Sleef_atan2f8_u10(values, b_a); // 90-angle angle
+    Vectorized ret;
+    for (int i = 0; i < 4; i += 2) {
+      ret._vec0[i] = std::atan2(_vec0[i + 1], _vec0[i]);
+      ret._vec1[i] = std::atan2(_vec1[i + 1], _vec1[i]);
+    }
+    return ret;
+  }
+
+  Vectorized angle() const {
+    return angle_() & real_mask;
+  }
+
+  Vectorized sin() const {
+    return map(std::sin);
+  }
+  Vectorized sinh() const {
+    return map(std::sinh);
+  }
+  Vectorized cos() const {
+    return map(std::cos);
+  }
+  Vectorized cosh() const {
+    return map(std::cosh);
+  }
+  Vectorized ceil() const {
+    return {vec_ceil(_vec0), vec_ceil(_vec1)};
+  }
+  Vectorized floor() const {
+    return {vec_floor(_vec0), vec_floor(_vec1)};
+  }
+  Vectorized neg() const {
+    auto z = Vectorized(zero);
+    return z - *this;
+  }
+  Vectorized round() const {
+    return {vec_round(_vec0), vec_round(_vec1)};
+  }
+  Vectorized tan() const {
+    return map(std::tan);
+  }
+  Vectorized tanh() const {
+    return map(std::tanh);
+  }
+  Vectorized trunc() const {
+    return {vec_trunc(_vec0), vec_trunc(_vec1)};
+  }
+
+  Vectorized elwise_sqrt() const {
+    return {vec_sqrt(_vec0), vec_sqrt(_vec1)};
+  }
+
+  Vectorized sqrt() const {
+    return map(std::sqrt);
+  }
+
+  Vectorized reciprocal() const {
+    // re + im*i = (a + bi)  / (c + di)
+    // re = (ac + bd)/abs_2() = c/abs_2()
+    // im = (bc - ad)/abs_2() = d/abs_2()
+    auto c_d = *this ^ isign_mask; // c       -d
+    auto abs = abs_2_();
+    return c_d.elwise_div(abs);
+  }
+
+  Vectorized rsqrt() const {
+    return sqrt().reciprocal();
+  }
+
+  Vectorized pow(const Vectorized& exp) const {
+    __at_align__ ComplexFlt x_tmp[size()];
+    __at_align__ ComplexFlt y_tmp[size()];
+    store(x_tmp);
+    exp.store(y_tmp);
+    for (const auto i : c10::irange(size())) {
+      x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]);
+    }
+    return loadu(x_tmp);
+  }
+
+  Vectorized atan() const {
+    // atan(x) = i/2 * ln((i + z)/(i - z))
+    auto ione = Vectorized(imag_one);
+    auto sum = ione + *this;
+    auto sub = ione - *this;
+    auto ln = (sum / sub).log(); // ln((i + z)/(i - z))
+    return ln * imag_half; // i/2*ln()
+  }
+  Vectorized atanh() const {
+    return map(std::atanh);
+  }
+
+  Vectorized acos() const {
+    // acos(x) = pi/2 - asin(x)
+    return Vectorized(pi_2) - asin();
+  }
+
+  Vectorized inline operator*(const Vectorized& b) const {
+    //(a + bi)  * (c + di) = (ac - bd) + (ad + bc)i
+
+#if 1
+    // this is more vsx friendly than simulating horizontal from x86
+
+    auto vi = b.el_mergeo();
+    auto vr = b.el_mergee();
+    vi = vi ^ rsign_mask;
+    auto ret = elwise_mult(vr);
+    auto vx_swapped = el_swapped();
+    ret = vx_swapped.el_madd(vi, ret);
+    return ret;
+
+#else
+
+    auto ac_bd = elwise_mult(b);
+    auto d_c = b.el_swapped();
+    d_c = d_c ^ isign_mask;
+    auto ad_bc = elwise_mult(d_c);
+    auto ret = horizontal_sub_permD8(ac_bd, ad_bc);
+    return ret;
+#endif
+  }
+
+  Vectorized inline operator/(const Vectorized& b) const {
+    // re + im*i = (a + bi)  / (c + di)
+    // re = (ac + bd)/abs_2()
+    // im = (bc - ad)/abs_2()
+    auto fabs_cd =  Vectorized{
+      vec_andc(b._vec0, sign_mask),
+      vec_andc(b._vec1, sign_mask)};          // |c|            |d|
+    auto fabs_dc =  fabs_cd.el_swapped();     // |d|            |c|
+    auto scale = fabs_cd.elwise_max(fabs_dc); // sc = max(|c|, |d|)
+    auto a2 = elwise_div(scale);              // a/sc           b/sc
+    auto b2 = b.elwise_div(scale);            // c/sc           d/sc
+    auto acbd2 = a2.elwise_mult(b2);          // ac/sc^2        bd/sc^2
+    auto dc2 = b2.el_swapped();               // d/sc           c/sc
+    dc2 = dc2 ^ rsign_mask;                   // -d/sc          c/sc
+    auto adbc2 = a2.elwise_mult(dc2);         // -ad/sc^2       bc/sc^2
+    auto ret = horizontal_add(acbd2, adbc2);  // (ac+bd)/sc^2   (bc-ad)/sc^2
+    auto denom2 = b2.abs_2_();                // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2
+    ret = ret.elwise_div(denom2);
+    return ret;
+  }
+
+  Vectorized asin() const {
+    // asin(x)
+    // = -i*ln(iz + sqrt(1 -z^2))
+    // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi)))
+    // = -i*ln((-b + ai) + sqrt(1 - (a**2 - b**2) - 2*abi))
+
+#if 1
+    auto conj = conj_();
+    auto b_a = conj.el_swapped();
+    auto ab = conj.elwise_mult(b_a);
+    auto im = ab + ab;
+    auto val_2 = (*this).elwise_mult(*this);
+    auto val_2_swapped = val_2.el_swapped();
+    auto re = horizontal_sub_permD8(val_2, val_2_swapped);
+    re = Vectorized(one) - re;
+    auto root = el_blend<0xAA>(re, im).sqrt();
+    auto ln = (b_a + root).log();
+    return ln.el_swapped().conj();
+#else
+    return map(std::asin);
+#endif
+  }
+
+  Vectorized exp() const {
+    return map(std::exp);
+  }
+  Vectorized exp2() const {
+    return map(exp2_impl);
+  }
+  Vectorized expm1() const {
+    return map(std::expm1);
+  }
+
+  Vectorized eq(const Vectorized& other) const {
+    auto eq = (*this == other);  // compares real and imag individually
+    // If both real numbers and imag numbers are equal, then the complex numbers are equal
+    return (eq.real() & eq.imag()) & one;
+  }
+  Vectorized ne(const Vectorized& other) const {
+    auto ne = (*this != other);  // compares real and imag individually
+    // If either real numbers or imag numbers are not equal, then the complex numbers are not equal
+    return (ne.real() | ne.imag()) & one;
+  }
+
+  Vectorized sgn() const {
+    return map(at::native::sgn_impl);
+  }
+
+  Vectorized operator<(const Vectorized& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+
+  Vectorized operator<=(const Vectorized& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+
+  Vectorized operator>(const Vectorized& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+
+  Vectorized operator>=(const Vectorized& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+
+  DEFINE_MEMBER_OP(operator==, ComplexFlt, vec_cmpeq)
+  DEFINE_MEMBER_OP(operator!=, ComplexFlt, vec_cmpne)
+
+  DEFINE_MEMBER_OP(operator+, ComplexFlt, vec_add)
+  DEFINE_MEMBER_OP(operator-, ComplexFlt, vec_sub)
+  DEFINE_MEMBER_OP(operator&, ComplexFlt, vec_and)
+  DEFINE_MEMBER_OP(operator|, ComplexFlt, vec_or)
+  DEFINE_MEMBER_OP(operator^, ComplexFlt, vec_xor)
+  // elementwise helpers
+  DEFINE_MEMBER_OP(elwise_mult, ComplexFlt, vec_mul)
+  DEFINE_MEMBER_OP(elwise_div, ComplexFlt, vec_div)
+  DEFINE_MEMBER_OP(elwise_gt, ComplexFlt, vec_cmpgt)
+  DEFINE_MEMBER_OP(elwise_ge, ComplexFlt, vec_cmpge)
+  DEFINE_MEMBER_OP(elwise_lt, ComplexFlt, vec_cmplt)
+  DEFINE_MEMBER_OP(elwise_le, ComplexFlt, vec_cmple)
+  DEFINE_MEMBER_OP(elwise_max, ComplexFlt, vec_max)
+};
+
+template <>
+Vectorized<ComplexFlt> inline maximum(
+    const Vectorized<ComplexFlt>& a,
+    const Vectorized<ComplexFlt>& b) {
+  auto abs_a = a.abs_2_();
+  auto abs_b = b.abs_2_();
+  // auto mask = _mm256_cmp_ps(abs_a, abs_b, _CMP_LT_OQ);
+  // auto max = _mm256_blendv_ps(a, b, mask);
+  auto mask = abs_a.elwise_lt(abs_b);
+  auto max = Vectorized<ComplexFlt>::elwise_blendv(a, b, mask);
+
+  return max;
+  // Exploit the fact that all-ones is a NaN.
+  // auto isnan = _mm256_cmp_ps(abs_a, abs_b, _CMP_UNORD_Q);
+  // return _mm256_or_ps(max, isnan);
+}
+
+template <>
+Vectorized<ComplexFlt> inline minimum(
+    const Vectorized<ComplexFlt>& a,
+    const Vectorized<ComplexFlt>& b) {
+  auto abs_a = a.abs_2_();
+  auto abs_b = b.abs_2_();
+  // auto mask = _mm256_cmp_ps(abs_a, abs_b, _CMP_GT_OQ);
+  // auto min = _mm256_blendv_ps(a, b, mask);
+  auto mask = abs_a.elwise_gt(abs_b);
+  auto min = Vectorized<ComplexFlt>::elwise_blendv(a, b, mask);
+  return min;
+  // Exploit the fact that all-ones is a NaN.
+  // auto isnan = _mm256_cmp_ps(abs_a, abs_b, _CMP_UNORD_Q);
+  // return _mm256_or_ps(min, isnan);
+}
+
+} // namespace
+} // namespace vec
+} // namespace at
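As with the double variant, maximum/minimum for complex values compare squared moduli (abs_2_) and then select whole complex lanes with elwise_blendv; unlike the commented-out AVX path, NaN propagation is not folded back in here. A scalar statement of the rule being applied (sketch only):

    #include <complex>

    // Hypothetical scalar rule: pick the operand with the larger |z|^2.
    std::complex<float> maximum_ref(std::complex<float> a, std::complex<float> b) {
      float na = a.real() * a.real() + a.imag() * a.imag();
      float nb = b.real() * b.real() + b.imag() * b.imag();
      return (na < nb) ? b : a;   // mask = |a|^2 < |b|^2 selects b
    }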
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h
new file mode 100644
index 0000000000000000000000000000000000000000..d9aeaa8650cf9deeedd123ef61a3f68804a85b20
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h
@@ -0,0 +1,438 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+namespace at {
+namespace vec {
+
+inline namespace CPU_CAPABILITY {
+
+
+template <>
+class Vectorized<double> {
+ private:
+  union {
+    struct {
+      vfloat64 _vec0;
+      vfloat64 _vec1;
+    };
+    struct {
+      vbool64 _vecb0;
+      vbool64 _vecb1;
+    };
+
+  } __attribute__((__may_alias__));
+
+ public:
+  using value_type = double;
+  using vec_internal_type = vfloat64;
+  using vec_internal_mask_type = vbool64;
+  using size_type = int;
+  static constexpr size_type size() {
+    return 4;
+  }
+  Vectorized() {}
+  C10_ALWAYS_INLINE Vectorized(vfloat64 v) : _vec0{v}, _vec1{v} {}
+  C10_ALWAYS_INLINE Vectorized(vbool64 vmask) : _vecb0{vmask}, _vecb1{vmask} {}
+  C10_ALWAYS_INLINE Vectorized(vfloat64 v1, vfloat64 v2) : _vec0{v1}, _vec1{v2} {}
+  C10_ALWAYS_INLINE Vectorized(vbool64 v1, vbool64 v2) : _vecb0{v1}, _vecb1{v2} {}
+  C10_ALWAYS_INLINE Vectorized(double scalar)
+      : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {}
+  C10_ALWAYS_INLINE Vectorized(
+      double scalar1,
+      double scalar2,
+      double scalar3,
+      double scalar4)
+      : _vec0{vfloat64{scalar1, scalar2}}, _vec1{vfloat64{scalar3, scalar4}} {}
+  C10_ALWAYS_INLINE const vec_internal_type& vec0() const {
+    return _vec0;
+  }
+  C10_ALWAYS_INLINE const vec_internal_type& vec1() const {
+    return _vec1;
+  }
+
+  int zero_mask() const {
+    auto cmp = (*this == vd_zero);
+    return (cmp._vecb0[0] & 1) | (cmp._vecb0[1] & 2) | (cmp._vecb1[0] & 4) |
+        (cmp._vecb1[1] & 8);
+  }
+
+  template 
+  static std::enable_if_t> C10_ALWAYS_INLINE
+      blend(const Vectorized& a, const Vectorized& b) {
+      return a;
+  }
+
+  template 
+  static std::enable_if_t> C10_ALWAYS_INLINE
+      blend(const Vectorized& a, const Vectorized& b) {
+      return b;
+  }
+
+  template 
+  static std::enable_if_t> C10_ALWAYS_INLINE
+      blend(const Vectorized& a, const Vectorized& b) {
+      return { b._vec0, a._vec1 };
+  }
+
+  template 
+  static std::enable_if_t> C10_ALWAYS_INLINE
+      blend(const Vectorized& a, const Vectorized& b) {
+      return { a._vec0, b._vec1 };
+  }
+
+
+  template 
+  static std::enable_if_t> C10_ALWAYS_INLINE
+      blend(const Vectorized& a, const Vectorized& b) {
+      const vbool64 mask_1st = VsxDblMask1(mask);
+      return { (vfloat64)vec_sel(a._vec0, b._vec0, mask_1st), a._vec1 };
+  }
+
+  template 
+  static std::enable_if_t> C10_ALWAYS_INLINE
+      blend(const Vectorized& a, const Vectorized& b) {
+      const vbool64 mask_1st = VsxDblMask1(mask);
+      return { (vfloat64)vec_sel(a._vec0, b._vec0, mask_1st), b._vec1 };
+  }
+
+
+  template 
+  static std::enable_if_t>
+      C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) {
+      const vbool64 mask_2nd = VsxDblMask2(mask);
+      // generated masks
+      return { a._vec0,
+          (vfloat64)vec_sel(a._vec1, b._vec1, mask_2nd) };
+  }
+
+  template 
+  static std::enable_if_t>
+      C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) {
+      const vbool64 mask_2nd = VsxDblMask2(mask);
+      // generated masks
+      return { b._vec0,
+          (vfloat64)vec_sel(a._vec1, b._vec1, mask_2nd) };
+  }
+
+  template 
+  static std::enable_if_t>
+      C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) {
+      const vbool64 mask_1st = VsxDblMask1(mask);
+      const vbool64 mask_2nd = VsxDblMask2(mask);
+      return {
+          (vfloat64)vec_sel(a._vec0, b._vec0, mask_1st),
+          (vfloat64)vec_sel(a._vec1, b._vec1, mask_2nd) };
+  }
+
+
+  static Vectorized C10_ALWAYS_INLINE blendv(
+      const Vectorized& a,
+      const Vectorized& b,
+      const Vectorized& mask) {
+    // the mask used here is the result of a vec256 comparison
+
+    return {
+        vec_sel(a._vec0, b._vec0, mask._vecb0),
+        vec_sel(a._vec1, b._vec1, mask._vecb1)};
+  }
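+  // A minimal usage sketch (the variable names are illustrative): blendv picks
+  // b wherever the mask lane is true, so clamping negative lanes to zero is
+  //   auto x = Vectorized<double>::loadu(src);
+  //   auto y = Vectorized<double>::blendv(x, Vectorized<double>(0.0), x < Vectorized<double>(0.0));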
+  template 
+  static Vectorized arange(double base = 0., step_t step = static_cast(1)) {
+    return Vectorized(base, base + step, base + 2 * step, base + 3 * step);
+  }
+
+  static Vectorized C10_ALWAYS_INLINE
+  set(const Vectorized& a, const Vectorized& b, size_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+    }
+
+    return b;
+  }
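+  // Illustrative intent of set(): take the first `count` lanes from b and the
+  // rest from a, so with the four double lanes here
+  //   set({a0,a1,a2,a3}, {b0,b1,b2,b3}, 2) == {b0, b1, a2, a3}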
+  static Vectorized C10_ALWAYS_INLINE
+  loadu(const void* ptr, int count = size()) {
+    if (count == size()) {
+      return {
+          vec_vsx_ld(offset0, reinterpret_cast(ptr)),
+          vec_vsx_ld(offset16, reinterpret_cast(ptr))};
+    }
+
+    __at_align__ value_type tmp_values[size()] = {};
+    std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type));
+
+    return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)};
+  }
+  void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      vec_vsx_st(_vec0, offset0, reinterpret_cast(ptr));
+      vec_vsx_st(_vec1, offset16, reinterpret_cast(ptr));
+    } else if (count > 0) {
+      __at_align__ value_type tmp_values[size()];
+      vec_vsx_st(_vec0, offset0, tmp_values);
+      vec_vsx_st(_vec1, offset16, tmp_values);
+      std::memcpy(
+          ptr, tmp_values, std::min(count, size()) * sizeof(value_type));
+    }
+  }
+  const double& operator[](int idx) const = delete;
+  double& operator[](int idx) = delete;
+  Vectorized map(double (*const f)(double)) const {
+    Vectorized ret;
+    for (const auto i : c10::irange(size()/2)) {
+        ret._vec0[i] = f(_vec0[i]);
+    }
+    for (const auto i : c10::irange(size()/2)) {
+        ret._vec1[i] = f(_vec1[i]);
+    }
+    return ret;
+  }
+
+  Vectorized mapbi(double (*const f)(double, double), const Vectorized& other)
+      const {
+    Vectorized ret;
+    for (const auto i : c10::irange(size()/2)) {
+        ret._vec0[i] = f(_vec0[i], other._vec0[i]);
+    }
+    for (const auto i : c10::irange(size()/2)) {
+        ret._vec1[i] = f(_vec1[i], other._vec1[i]);
+    }
+    return ret;
+  }
+  Vectorized C10_ALWAYS_INLINE abs() const {
+    return {vec_abs(_vec0), vec_abs(_vec1)};
+  }
+
+  Vectorized C10_ALWAYS_INLINE acos() const {
+     return {Sleef_acosd2_u10(_vec0), Sleef_acosd2_u10(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE asin() const {
+     return {Sleef_asind2_u10(_vec0), Sleef_asind2_u10(_vec1)};
+  }
+  Vectorized atan() const {
+     return {Sleef_atand2_u10(_vec0), Sleef_atand2_u10(_vec1)};
+  }
+  Vectorized atanh() const {
+     return {Sleef_atanhd2_u10(_vec0), Sleef_atanhd2_u10(_vec1)};
+  }
+  Vectorized atan2(const Vectorized& b) const {
+     return {Sleef_atan2d2_u10(_vec0, b._vec0), Sleef_atan2d2_u10(_vec1, b._vec1)};
+  }
+  Vectorized copysign(const Vectorized &sign) const {
+    return {Sleef_copysignd2(_vec0, sign._vec0), Sleef_copysignd2(_vec1, sign._vec1)};
+  }
+  Vectorized erf() const {
+     return {Sleef_erfd2_u10(_vec0), Sleef_erfd2_u10(_vec1)};
+  }
+  Vectorized erfc() const {
+     return {Sleef_erfcd2_u15(_vec0), Sleef_erfcd2_u15(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE exp() const {
+     return {Sleef_expd2_u10(_vec0), Sleef_expd2_u10(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE exp2() const {
+    return {Sleef_exp2d2_u10(_vec0), Sleef_exp2d2_u10(_vec1)};
+  }
+  Vectorized expm1() const {
+     return {Sleef_expm1d2_u10(_vec0), Sleef_expm1d2_u10(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE exp_u20() const {
+     return exp();
+  }
+
+  Vectorized lgamma() const __ubsan_ignore_undefined__ {
+     return {Sleef_lgammad2_u10(_vec0), Sleef_lgammad2_u10(_vec1)};
+  }
+
+  Vectorized erfinv() const {
+    return map(calc_erfinv);
+  }
+
+  Vectorized angle() const {
+    auto tmp = blendv(
+      Vectorized(0), Vectorized(c10::pi), *this < Vectorized(0));
+    return blendv(tmp, *this, isnan());
+  }
+  Vectorized real() const {
+    return *this;
+  }
+  Vectorized imag() const {
+    return Vectorized{0};
+  }
+  Vectorized conj() const {
+    return *this;
+  }
+
+  Vectorized C10_ALWAYS_INLINE log() const {
+     return {Sleef_logd2_u10(_vec0), Sleef_logd2_u10(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE log10() const {
+     return {Sleef_log10d2_u10(_vec0), Sleef_log10d2_u10(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE log1p() const {
+     return {Sleef_log1pd2_u10(_vec0), Sleef_log1pd2_u10(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE log2() const {
+     return {Sleef_log2d2_u10(_vec0), Sleef_log2d2_u10(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE ceil() const {
+    return {vec_ceil(_vec0), vec_ceil(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE cos() const {
+     return {Sleef_cosd2_u10(_vec0), Sleef_cosd2_u10(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE cosh() const {
+     return {Sleef_coshd2_u10(_vec0), Sleef_coshd2_u10(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE floor() const {
+    return {vec_floor(_vec0), vec_floor(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE neg() const {
+    return {vec_neg(_vec0), vec_neg(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE round() const {
+    return {vec_rint(_vec0), vec_rint(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE sin() const {
+     return {Sleef_sind2_u10(_vec0), Sleef_sind2_u10(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE sinh() const {
+     return {Sleef_sinhd2_u10(_vec0), Sleef_sinhd2_u10(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE tan() const {
+     return {Sleef_tand2_u10(_vec0), Sleef_tand2_u10(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE tanh() const {
+     return {Sleef_tanhd2_u10(_vec0), Sleef_tanhd2_u10(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE trunc() const {
+    return {vec_trunc(_vec0), vec_trunc(_vec1)};
+  }
+
+  Vectorized C10_ALWAYS_INLINE frac() const {
+    return *this - trunc();
+  }
+
+  Vectorized C10_ALWAYS_INLINE sqrt() const {
+    return {vec_sqrt(_vec0), vec_sqrt(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE reciprocal() const {
+    return {
+        vec_div(vd_one, _vec0), // vec_re(_vec0) would only give an estimate.
+        vec_div(vd_one, _vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE rsqrt() const {
+    return sqrt().reciprocal();
+  }
+
+  Vectorized C10_ALWAYS_INLINE pow(const Vectorized& b) const {
+     return {Sleef_powd2_u10(_vec0, b._vec0), Sleef_powd2_u10(_vec1, b._vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE fmod(const Vectorized& b) const {
+     return {Sleef_fmodd2(_vec0, b._vec0),Sleef_fmodd2(_vec1, b._vec1)};
+  }
+
+  Vectorized hypot(const Vectorized& b) const {
+     return {Sleef_hypotd2_u05(_vec0, b._vec0), Sleef_hypotd2_u05(_vec1, b._vec1)};
+  }
+
+  Vectorized nextafter(const Vectorized& b) const {
+     return {Sleef_nextafterd2(_vec0, b._vec0), Sleef_nextafterd2(_vec1, b._vec1)};
+  }
+
+  Vectorized igamma(const Vectorized& x) const {
+    return mapbi(calc_igamma, x);
+  }
+
+  Vectorized igammac(const Vectorized& x) const {
+    return mapbi(calc_igammac, x);
+  }
+
+
+  Vectorized i0() const {
+    return map(calc_i0);
+  }
+
+  Vectorized i0e() const {
+    return map(calc_i0e);
+  }
+
+  Vectorized digamma() const {
+    return map(calc_digamma);
+  }
+
+  Vectorized _nor() const {
+    return {vec_nor(_vec0, _vec0), vec_nor(_vec1, _vec1)};
+  }
+
+  Vectorized isnan() const {
+    auto x = *this;
+    auto ret = (x == x);
+    return ret._nor();
+  }
+  bool has_inf_nan() const {
+    for (const auto i : c10::irange(size()/2)) {
+      if(_isnan(_vec0[i]) || _isinf(_vec0[i])) {
+        return true;
+      }
+    }
+    for (const auto i : c10::irange(size()/2)) {
+      if(_isnan(_vec1[i]) || _isinf(_vec1[i])) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  DEFINE_MEMBER_OP(operator==, double, vec_cmpeq)
+  DEFINE_MEMBER_OP(operator!=, double, vec_cmpne)
+  DEFINE_MEMBER_OP(operator<, double, vec_cmplt)
+  DEFINE_MEMBER_OP(operator<=, double, vec_cmple)
+  DEFINE_MEMBER_OP(operator>, double, vec_cmpgt)
+  DEFINE_MEMBER_OP(operator>=, double, vec_cmpge)
+  DEFINE_MEMBER_OP_AND_ONE(eq, double, vec_cmpeq)
+  DEFINE_MEMBER_OP_AND_ONE(ne, double, vec_cmpne)
+  DEFINE_MEMBER_OP_AND_ONE(lt, double, vec_cmplt)
+  DEFINE_MEMBER_OP_AND_ONE(le, double, vec_cmple)
+  DEFINE_MEMBER_OP_AND_ONE(gt, double, vec_cmpgt)
+  DEFINE_MEMBER_OP_AND_ONE(ge, double, vec_cmpge)
+  DEFINE_MEMBER_OP(operator+, double, vec_add)
+  DEFINE_MEMBER_OP(operator-, double, vec_sub)
+  DEFINE_MEMBER_OP(operator*, double, vec_mul)
+  DEFINE_MEMBER_OP(operator/, double, vec_div)
+  DEFINE_MEMBER_OP(maximum, double, vec_max_nan2)
+  DEFINE_MEMBER_OP(minimum, double, vec_min_nan2)
+  DEFINE_MEMBER_OP(operator&, double, vec_and)
+  DEFINE_MEMBER_OP(operator|, double, vec_or)
+  DEFINE_MEMBER_OP(operator^, double, vec_xor)
+  DEFINE_MEMBER_TERNARY_OP(madd, double, vec_madd)
+};
+template <>
+Vectorized<double> inline maximum(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  return a.maximum(b);
+}
+
+template <>
+Vectorized<double> inline minimum(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b) {
+  return a.minimum(b);
+}
+} // namespace
+} // namespace vec
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h
new file mode 100644
index 0000000000000000000000000000000000000000..6c36cf92ed9b49afeffd0c3eef97693e842248dc
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h
@@ -0,0 +1,461 @@
+#pragma once
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>
+#include <sleef.h>
+namespace at {
+namespace vec {
+// See Note [CPU_CAPABILITY namespace]
+
+inline namespace CPU_CAPABILITY {
+
+template <>
+class Vectorized<float> {
+ private:
+  union {
+    struct {
+      vfloat32 _vec0;
+      vfloat32 _vec1;
+    };
+    struct {
+      vbool32 _vecb0;
+      vbool32 _vecb1;
+    };
+
+  } __attribute__((__may_alias__));
+
+ public:
+  using value_type = float;
+  using vec_internal_type = vfloat32;
+  using vec_internal_mask_type = vbool32;
+  using size_type = int;
+
+  static constexpr size_type size() {
+    return 8;
+  }
+  Vectorized() {}
+
+  C10_ALWAYS_INLINE Vectorized(vfloat32 v) : _vec0{v}, _vec1{v} {}
+  C10_ALWAYS_INLINE Vectorized(vbool32 vmask) : _vecb0{vmask}, _vecb1{vmask} {}
+  C10_ALWAYS_INLINE Vectorized(vfloat32 v1, vfloat32 v2) : _vec0{v1}, _vec1{v2} {}
+  C10_ALWAYS_INLINE Vectorized(vbool32 v1, vbool32 v2) : _vecb0{v1}, _vecb1{v2} {}
+  C10_ALWAYS_INLINE Vectorized(float scalar)
+      : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {}
+  C10_ALWAYS_INLINE Vectorized(
+      float scalar1,
+      float scalar2,
+      float scalar3,
+      float scalar4,
+      float scalar5,
+      float scalar6,
+      float scalar7,
+      float scalar8)
+      : _vec0{vfloat32{scalar1, scalar2, scalar3, scalar4}},
+        _vec1{vfloat32{scalar5, scalar6, scalar7, scalar8}} {}
+  C10_ALWAYS_INLINE const vec_internal_type& vec0() const {
+    return _vec0;
+  }
+  C10_ALWAYS_INLINE const vec_internal_type& vec1() const {
+    return _vec1;
+  }
+
+  template 
+  static std::enable_if_t> C10_ALWAYS_INLINE
+  blend(const Vectorized& a, const Vectorized& b) {
+    return a;
+  }
+
+  template 
+  static std::enable_if_t> C10_ALWAYS_INLINE
+  blend(const Vectorized& a, const Vectorized& b) {
+    return b;
+  }
+
+  template 
+  static std::enable_if_t> C10_ALWAYS_INLINE
+  blend(const Vectorized& a, const Vectorized& b) {
+    return {b._vec0, a._vec1};
+  }
+
+  template 
+  static std::enable_if_t> C10_ALWAYS_INLINE
+  blend(const Vectorized& a, const Vectorized& b) {
+    return {a._vec0, b._vec1};
+  }
+
+  template 
+  static std::enable_if_t> C10_ALWAYS_INLINE
+  blend(const Vectorized& a, const Vectorized& b) {
+    const vbool32 mask_1st = VsxMask1(mask);
+    return {(vfloat32)vec_sel(a._vec0, b._vec0, mask_1st), a._vec1};
+  }
+
+  template 
+  static std::enable_if_t> C10_ALWAYS_INLINE
+  blend(const Vectorized& a, const Vectorized& b) {
+    const vbool32 mask_1st = VsxMask1(mask);
+    return {(vfloat32)vec_sel(a._vec0, b._vec0, mask_1st), b._vec1};
+  }
+
+  template 
+  static std::enable_if_t> C10_ALWAYS_INLINE
+  blend(const Vectorized& a, const Vectorized& b) {
+    const vbool32 mask_2nd = VsxMask2(mask);
+    // generated masks
+    return {a._vec0, (vfloat32)vec_sel(a._vec1, b._vec1, mask_2nd)};
+  }
+
+  template 
+  static std::enable_if_t> C10_ALWAYS_INLINE
+  blend(const Vectorized& a, const Vectorized& b) {
+    const vbool32 mask_2nd = VsxMask2(mask);
+    // generated masks
+    return {b._vec0, (vfloat32)vec_sel(a._vec1, b._vec1, mask_2nd)};
+  }
+
+  template 
+  static std::enable_if_t> C10_ALWAYS_INLINE
+  blend(const Vectorized& a, const Vectorized& b) {
+    const vbool32 mask_1st = VsxMask1(mask);
+    const vbool32 mask_2nd = VsxMask2(mask);
+    return {
+        (vfloat32)vec_sel(a._vec0, b._vec0, mask_1st),
+        (vfloat32)vec_sel(a._vec1, b._vec1, mask_2nd)};
+  }
+
+  static Vectorized C10_ALWAYS_INLINE blendv(
+      const Vectorized& a,
+      const Vectorized& b,
+      const Vectorized& mask) {
+    // the mask used here is the result of a vec256 comparison,
+    // so we can use it directly with vec_sel
+    return {
+        vec_sel(a._vec0, b._vec0, mask._vecb0),
+        vec_sel(a._vec1, b._vec1, mask._vecb1)};
+  }
+
+  template 
+  static Vectorized arange(float base = 0.f, step_t step = static_cast(1)) {
+    return Vectorized(
+        base,
+        base + step,
+        base + 2 * step,
+        base + 3 * step,
+        base + 4 * step,
+        base + 5 * step,
+        base + 6 * step,
+        base + 7 * step);
+  }
+  static Vectorized set(
+      const Vectorized& a,
+      const Vectorized& b,
+      size_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+      case 4:
+        return blend<15>(a, b);
+      case 5:
+        return blend<31>(a, b);
+      case 6:
+        return blend<63>(a, b);
+      case 7:
+        return blend<127>(a, b);
+    }
+
+    return b;
+  }
+  static Vectorized C10_ALWAYS_INLINE
+  loadu(const void* ptr, int count = size()) {
+    if (count == size()) {
+      return {
+          vec_vsx_ld(offset0, reinterpret_cast(ptr)),
+          vec_vsx_ld(offset16, reinterpret_cast(ptr))};
+    }
+
+    __at_align__ value_type tmp_values[size()] = {};
+    std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type));
+
+    return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)};
+  }
+  void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      vec_vsx_st(_vec0, offset0, reinterpret_cast(ptr));
+      vec_vsx_st(_vec1, offset16, reinterpret_cast(ptr));
+    } else if (count > 0) {
+      __at_align__ value_type tmp_values[size()];
+      vec_vsx_st(_vec0, offset0, tmp_values);
+      vec_vsx_st(_vec1, offset16, tmp_values);
+      std::memcpy(
+          ptr, tmp_values, std::min(count, size()) * sizeof(value_type));
+    }
+  }
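+  // Sketch of the intended remainder handling (src, dst and n are illustrative
+  // names): loadu/store take an element count, so a tail shorter than size()
+  // can be processed without touching memory past the buffer:
+  //   int rem = n % Vectorized<float>::size();
+  //   auto v = Vectorized<float>::loadu(src, rem);
+  //   v.store(dst, rem);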
+
+  const float& operator[](int idx) const = delete;
+  float& operator[](int idx) = delete;
+
+  Vectorized map(float (*const f)(float)) const {
+    Vectorized ret;
+    for (int i = 0; i < size() / 2; i++) {
+      ret._vec0[i] = f(_vec0[i]);
+    }
+    for (int i = 0; i < size() / 2; i++) {
+      ret._vec1[i] = f(_vec1[i]);
+    }
+    return ret;
+  }
+
+  Vectorized mapbi(float (*const f)(float, float), const Vectorized& other)
+      const {
+    Vectorized ret;
+    for (int i = 0; i < size() / 2; i++) {
+      ret._vec0[i] = f(_vec0[i], other._vec0[i]);
+    }
+    for (int i = 0; i < size() / 2; i++) {
+      ret._vec1[i] = f(_vec1[i], other._vec1[i]);
+    }
+    return ret;
+  }
+
+  Vectorized _nor() const {
+    return {vec_nor(_vec0, _vec0), vec_nor(_vec1, _vec1)};
+  }
+
+  Vectorized isnan() const {
+    auto x = *this;
+    auto ret = (x == x);
+    return ret._nor();
+  }
+
+  bool has_inf_nan() const {
+    for (const auto i : c10::irange(size()/2)) {
+      if(_isnan(_vec0[i]) || _isinf(_vec0[i])) {
+        return true;
+      }
+    }
+    for (const auto i : c10::irange(size()/2)) {
+      if(_isnan(_vec1[i]) || _isinf(_vec1[i])) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  int zero_mask() const {
+    // returns an integer mask in which every zero element contributes a 1 bit
+    // and every non-zero element contributes a 0 bit
+    //__m256 cmp = _mm256_cmp_ps(values, _mm256_set1_ps(0.0f), _CMP_EQ_OQ);
+    auto cmp = (*this == zero);
+    // return _mm256_movemask_ps(cmp);
+    // possible simulation: mask = lvsl(0); vbpermq(vec, mask << 5)
+    vuint64 result0 = vec_vbpermq((vuint8)cmp._vecb0, mask_zero_bits);
+    vuint64 result1 = vec_vbpermq((vuint8)cmp._vecb1, mask_zero_bits);
+    return (result0[1] >> 12 | (result1[1] >> 8));
+  }
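+  // Worked example (illustrative, assuming the same bit order as
+  // _mm256_movemask_ps): for lanes {0.f, 3.f, 0.f, 5.f, 7.f, 0.f, 1.f, 2.f}
+  // the zero lanes are indices 0, 2 and 5, so zero_mask() == 0b00100101.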
+
+  Vectorized C10_ALWAYS_INLINE abs() const {
+    return {vec_abs(_vec0), vec_abs(_vec1)};
+  }
+
+  Vectorized C10_ALWAYS_INLINE acos() const {
+    return {Sleef_acosf4_u10(_vec0), Sleef_acosf4_u10(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE asin() const {
+    return {Sleef_asinf4_u10(_vec0), Sleef_asinf4_u10(_vec1)};
+  }
+  Vectorized atan() const {
+    return {Sleef_atanf4_u10(_vec0), Sleef_atanf4_u10(_vec1)};
+  }
+  Vectorized atanh() const {
+    return {Sleef_atanhf4_u10(_vec0), Sleef_atanhf4_u10(_vec1)};
+  }
+  Vectorized atan2(const Vectorized& b) const {
+    return {Sleef_atan2f4_u10(_vec0, b._vec0), Sleef_atan2f4_u10(_vec1, b._vec1)};
+  }
+  Vectorized copysign(const Vectorized &sign) const {
+    return {Sleef_copysignf4(_vec0, sign._vec0), Sleef_copysignf4(_vec1, sign._vec1)};
+  }
+  Vectorized lgamma() const {
+    return {Sleef_lgammaf4_u10(_vec0), Sleef_lgammaf4_u10(_vec1)};
+  }
+  Vectorized erf() const {
+    return {Sleef_erff4_u10(_vec0), Sleef_erff4_u10(_vec1)};
+  }
+
+  Vectorized erfc() const {
+    return {Sleef_erfcf4_u15(_vec0), Sleef_erfcf4_u15(_vec1)};
+  }
+
+  Vectorized erfinv() const {
+    return map(calc_erfinv);
+  }
+
+  Vectorized angle() const {
+    auto tmp = blendv(
+      Vectorized(0), Vectorized(c10::pi), *this < Vectorized(0));
+    return blendv(tmp, *this, isnan());
+  }
+  Vectorized real() const {
+    return *this;
+  }
+  Vectorized imag() const {
+    return Vectorized{0};
+  }
+  Vectorized conj() const {
+    return *this;
+  }
+
+  Vectorized C10_ALWAYS_INLINE exp() const {
+    return {Sleef_expf4_u10(_vec0), Sleef_expf4_u10(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE exp2() const {
+    return {Sleef_exp2f4_u10(_vec0), Sleef_exp2f4_u10(_vec1)};
+  }
+  Vectorized expm1() const {
+    return {Sleef_expm1f4_u10(_vec0), Sleef_expm1f4_u10(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE exp_u20() const {
+    return exp();
+  }
+
+  Vectorized C10_ALWAYS_INLINE log() const {
+    return {Sleef_logf4_u10(_vec0), Sleef_logf4_u10(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE log10() const {
+    return {Sleef_log10f4_u10(_vec0), Sleef_log10f4_u10(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE log1p() const {
+    return {Sleef_log1pf4_u10(_vec0), Sleef_log1pf4_u10(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE log2() const {
+    return {Sleef_log2f4_u10(_vec0), Sleef_log2f4_u10(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE ceil() const {
+    return {vec_ceil(_vec0), vec_ceil(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE cos() const {
+    return {Sleef_cosf4_u10(_vec0), Sleef_cosf4_u10(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE cosh() const {
+    return {Sleef_coshf4_u10(_vec0), Sleef_coshf4_u10(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE floor() const {
+    return {vec_floor(_vec0), vec_floor(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE neg() const {
+    return {vec_neg(_vec0), vec_neg(_vec1)};
+  }
+
+  Vectorized C10_ALWAYS_INLINE round() const {
+    return {vec_round(_vec0), vec_round(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE sin() const {
+    return {Sleef_sinf4_u10(_vec0), Sleef_sinf4_u10(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE sinh() const {
+    return {Sleef_sinhf4_u10(_vec0), Sleef_sinhf4_u10(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE tan() const {
+    return {Sleef_tanf4_u10(_vec0), Sleef_tanf4_u10(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE tanh() const {
+    return {Sleef_tanhf4_u10(_vec0), Sleef_tanhf4_u10(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE trunc() const {
+    return {vec_trunc(_vec0), vec_trunc(_vec1)};
+  }
+
+  Vectorized C10_ALWAYS_INLINE frac() const {
+    return *this - trunc();
+  }
+
+  Vectorized C10_ALWAYS_INLINE sqrt() const {
+    return {vec_sqrt(_vec0), vec_sqrt(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE reciprocal() const {
+    return Vectorized(one) / (*this);
+  }
+  Vectorized C10_ALWAYS_INLINE rsqrt() const {
+    return sqrt().reciprocal();
+  }
+
+  Vectorized C10_ALWAYS_INLINE pow(const Vectorized& exp) const {
+    return {Sleef_powf4_u10(_vec0, exp._vec0), Sleef_powf4_u10(_vec1, exp._vec1)};
+  }
+
+  Vectorized fmod(const Vectorized& b) const {
+    return {Sleef_fmodf4(_vec0, b._vec0),Sleef_fmodf4(_vec1, b._vec1)};
+  }
+
+  Vectorized hypot(const Vectorized& b) const {
+    return {Sleef_hypotf4_u05(_vec0, b._vec0), Sleef_hypotf4_u05(_vec1, b._vec1)};
+  }
+
+  Vectorized nextafter(const Vectorized& b) const {
+    return {Sleef_nextafterf4(_vec0, b._vec0), Sleef_nextafterf4(_vec1, b._vec1)};
+  }
+
+  Vectorized igamma(const Vectorized& x) const {
+    return mapbi(calc_igamma, x);
+  }
+
+  Vectorized igammac(const Vectorized& x) const {
+    return mapbi(calc_igammac, x);
+  }
+
+  Vectorized i0() const {
+    return map(calc_i0);
+  }
+
+  Vectorized i0e() const {
+    return map(calc_i0e);
+  }
+
+  Vectorized digamma() const {
+    return map(calc_digamma);
+  }
+
+  DEFINE_MEMBER_OP(operator==, float, vec_cmpeq)
+  DEFINE_MEMBER_OP(operator!=, float, vec_cmpne)
+  DEFINE_MEMBER_OP(operator<, float, vec_cmplt)
+  DEFINE_MEMBER_OP(operator<=, float, vec_cmple)
+  DEFINE_MEMBER_OP(operator>, float, vec_cmpgt)
+  DEFINE_MEMBER_OP(operator>=, float, vec_cmpge)
+  DEFINE_MEMBER_OP_AND_ONE(eq, float, vec_cmpeq)
+  DEFINE_MEMBER_OP_AND_ONE(ne, float, vec_cmpne)
+  DEFINE_MEMBER_OP_AND_ONE(lt, float, vec_cmplt)
+  DEFINE_MEMBER_OP_AND_ONE(le, float, vec_cmple)
+  DEFINE_MEMBER_OP_AND_ONE(gt, float, vec_cmpgt)
+  DEFINE_MEMBER_OP_AND_ONE(ge, float, vec_cmpge)
+  DEFINE_MEMBER_OP(operator+, float, vec_add)
+  DEFINE_MEMBER_OP(operator-, float, vec_sub)
+  DEFINE_MEMBER_OP(operator*, float, vec_mul)
+  DEFINE_MEMBER_OP(operator/, float, vec_div)
+  DEFINE_MEMBER_OP(maximum, float, vec_max_nan2)
+  DEFINE_MEMBER_OP(minimum, float, vec_min_nan2)
+  DEFINE_MEMBER_OP(operator&, float, vec_and)
+  DEFINE_MEMBER_OP(operator|, float, vec_or)
+  DEFINE_MEMBER_OP(operator^, float, vec_xor)
+  DEFINE_MEMBER_TERNARY_OP(madd, float, vec_madd)
+};
+
+template <>
+Vectorized<float> inline maximum(const Vectorized<float>& a, const Vectorized<float>& b) {
+  return a.maximum(b);
+}
+
+template <>
+Vectorized<float> inline minimum(const Vectorized<float>& a, const Vectorized<float>& b) {
+  return a.minimum(b);
+}
+
+} // namespace
+} // namespace vec
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h
new file mode 100644
index 0000000000000000000000000000000000000000..05a6b7d007f5e466d72b79602ad1b12f1ebb7dd9
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h
@@ -0,0 +1,368 @@
+#pragma once
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>
+namespace at {
+namespace vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+template <>
+class Vectorized<int16_t> {
+ private:
+  union {
+    struct {
+      vint16 _vec0;
+      vint16 _vec1;
+    };
+    struct {
+      vbool16 _vecb0;
+      vbool16 _vecb1;
+    };
+
+  } __attribute__((__may_alias__));
+
+ public:
+  using value_type = int16_t;
+  using vec_internal_type = vint16;
+  using vec_internal_mask_type = vbool16;
+  using size_type = int;
+  static constexpr size_type size() {
+    return 16;
+  }
+  Vectorized() {}
+  C10_ALWAYS_INLINE Vectorized(vint16 v) : _vec0{v}, _vec1{v} {}
+  C10_ALWAYS_INLINE Vectorized(vbool16 vmask) : _vecb0{vmask}, _vecb1{vmask} {}
+  C10_ALWAYS_INLINE Vectorized(vint16 v1, vint16 v2) : _vec0{v1}, _vec1{v2} {}
+  C10_ALWAYS_INLINE Vectorized(vbool16 v1, vbool16 v2) : _vecb0{v1}, _vecb1{v2} {}
+  C10_ALWAYS_INLINE Vectorized(int16_t scalar)
+      : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {}
+
+  C10_ALWAYS_INLINE Vectorized(
+      int16_t scalar1,
+      int16_t scalar2,
+      int16_t scalar3,
+      int16_t scalar4,
+      int16_t scalar5,
+      int16_t scalar6,
+      int16_t scalar7,
+      int16_t scalar8,
+      int16_t scalar9,
+      int16_t scalar10,
+      int16_t scalar11,
+      int16_t scalar12,
+      int16_t scalar13,
+      int16_t scalar14,
+      int16_t scalar15,
+      int16_t scalar16)
+      : _vec0{vint16{
+            scalar1,
+            scalar2,
+            scalar3,
+            scalar4,
+            scalar5,
+            scalar6,
+            scalar7,
+            scalar8}},
+        _vec1{vint16{
+            scalar9,
+            scalar10,
+            scalar11,
+            scalar12,
+            scalar13,
+            scalar14,
+            scalar15,
+            scalar16}} {}
+  C10_ALWAYS_INLINE const vec_internal_type& vec0() const {
+    return _vec0;
+  }
+  C10_ALWAYS_INLINE const vec_internal_type& vec1() const {
+    return _vec1;
+  }
+
+  template 
+  static std::enable_if_t> C10_ALWAYS_INLINE
+  blend(const Vectorized& a, const Vectorized& b) {
+    return a;
+  }
+
+  template 
+  static std::enable_if_t<(mask & 65535) == 65535, Vectorized>
+      C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) {
+    return b;
+  }
+
+  template 
+  static std::enable_if_t> C10_ALWAYS_INLINE
+  blend(const Vectorized& a, const Vectorized& b) {
+    return {b._vec0, a._vec1};
+  }
+
+  template 
+  static std::enable_if_t<(mask > 0 && mask < 255), Vectorized>
+      C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) {
+    constexpr int16_t g0 = (mask & 1) * 0xffff;
+    constexpr int16_t g1 = ((mask & 2) >> 1) * 0xffff;
+    constexpr int16_t g2 = ((mask & 4) >> 2) * 0xffff;
+    constexpr int16_t g3 = ((mask & 8) >> 3) * 0xffff;
+    constexpr int16_t g4 = ((mask & 16) >> 4) * 0xffff;
+    constexpr int16_t g5 = ((mask & 32) >> 5) * 0xffff;
+    constexpr int16_t g6 = ((mask & 64) >> 6) * 0xffff;
+    constexpr int16_t g7 = ((mask & 128) >> 7) * 0xffff;
+    const vint16 mask_1st = vint16{g0, g1, g2, g3, g4, g5, g6, g7};
+
+    return {(vint16)vec_sel(a._vec0, b._vec0, (vbool16)mask_1st), a._vec1};
+  }
+
+  template 
+  static std::enable_if_t<
+      (mask > 255 && (mask & 65535) != 65535 && ((mask & 255) == 255)),
+      Vectorized>
+      C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) {
+    constexpr int16_t g0_2 = (mask & 1) * 0xffff;
+    constexpr int16_t g1_2 = ((mask & 2) >> 1) * 0xffff;
+    constexpr int16_t g2_2 = ((mask & 4) >> 2) * 0xffff;
+    constexpr int16_t g3_2 = ((mask & 8) >> 3) * 0xffff;
+    constexpr int16_t g4_2 = ((mask & 16) >> 4) * 0xffff;
+    constexpr int16_t g5_2 = ((mask & 32) >> 5) * 0xffff;
+    constexpr int16_t g6_2 = ((mask & 64) >> 6) * 0xffff;
+    constexpr int16_t g7_2 = ((mask & 128) >> 7) * 0xffff;
+
+    const vint16 mask_2nd =
+        vint16{g0_2, g1_2, g2_2, g3_2, g4_2, g5_2, g6_2, g7_2};
+    // generated masks
+    return {b._vec0, (vint16)vec_sel(a._vec1, b._vec1, (vbool16)mask_2nd)};
+  }
+
+  template 
+  static std::enable_if_t<
+      (mask > 255 && ((mask & 65535) != 65535) && ((mask & 255) == 0)),
+      Vectorized>
+      C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) {
+    constexpr int16_t mask2 = (mask & 65535) >> 16;
+    constexpr int16_t g0_2 = (mask & 1) * 0xffff;
+    constexpr int16_t g1_2 = ((mask & 2) >> 1) * 0xffff;
+    constexpr int16_t g2_2 = ((mask & 4) >> 2) * 0xffff;
+    constexpr int16_t g3_2 = ((mask & 8) >> 3) * 0xffff;
+    constexpr int16_t g4_2 = ((mask & 16) >> 4) * 0xffff;
+    constexpr int16_t g5_2 = ((mask & 32) >> 5) * 0xffff;
+    constexpr int16_t g6_2 = ((mask & 64) >> 6) * 0xffff;
+    constexpr int16_t g7_2 = ((mask & 128) >> 7) * 0xffff;
+
+    const vint16 mask_2nd =
+        vint16{g0_2, g1_2, g2_2, g3_2, g4_2, g5_2, g6_2, g7_2};
+    // generated masks
+    return {a, (vint16)vec_sel(a._vec1, b._vec1, (vbool16)mask_2nd)};
+  }
+
+  template 
+  static std::enable_if_t<
+      (mask > 255 && ((mask & 65535) != 65535) && ((mask & 255) != 0) &&
+       ((mask & 255) != 255)),
+      Vectorized>
+      C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) {
+    constexpr int16_t g0 = (mask & 1) * 0xffff;
+    constexpr int16_t g1 = ((mask & 2) >> 1) * 0xffff;
+    constexpr int16_t g2 = ((mask & 4) >> 2) * 0xffff;
+    constexpr int16_t g3 = ((mask & 8) >> 3) * 0xffff;
+    constexpr int16_t g4 = ((mask & 16) >> 4) * 0xffff;
+    constexpr int16_t g5 = ((mask & 32) >> 5) * 0xffff;
+    constexpr int16_t g6 = ((mask & 64) >> 6) * 0xffff;
+    constexpr int16_t g7 = ((mask & 128) >> 7) * 0xffff;
+    constexpr int16_t mask2 = (mask & 65535) >> 16;
+    constexpr int16_t g0_2 = (mask & 1) * 0xffff;
+    constexpr int16_t g1_2 = ((mask & 2) >> 1) * 0xffff;
+    constexpr int16_t g2_2 = ((mask & 4) >> 2) * 0xffff;
+    constexpr int16_t g3_2 = ((mask & 8) >> 3) * 0xffff;
+    constexpr int16_t g4_2 = ((mask & 16) >> 4) * 0xffff;
+    constexpr int16_t g5_2 = ((mask & 32) >> 5) * 0xffff;
+    constexpr int16_t g6_2 = ((mask & 64) >> 6) * 0xffff;
+    constexpr int16_t g7_2 = ((mask & 128) >> 7) * 0xffff;
+
+    const vint16 mask_1st = vint16{g0, g1, g2, g3, g4, g5, g6, g7};
+    const vint16 mask_2nd =
+        vint16{g0_2, g1_2, g2_2, g3_2, g4_2, g5_2, g6_2, g7_2};
+    // generated masks
+    return {
+        (vint16)vec_sel(a._vec0, b._vec0, (vbool16)mask_1st),
+        (vint16)vec_sel(a._vec1, b._vec1, (vbool16)mask_2nd)};
+  }
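+  // How these compile-time masks work (a sketch of the intent): each set bit
+  // of `mask` requests the corresponding lane from b; the g* constants expand
+  // one mask bit into an all-ones or all-zeros 16-bit lane so vec_sel can
+  // consume the result as a per-lane boolean mask.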
+
+  static Vectorized C10_ALWAYS_INLINE blendv(
+      const Vectorized& a,
+      const Vectorized& b,
+      const Vectorized& mask) {
+    // the mask used here is the result of a vec256 comparison,
+    // so we can use it directly with vec_sel
+    // warning: an Intel-style bitmask will not work properly here
+    return {
+        vec_sel(a._vec0, b._vec0, mask._vecb0),
+        vec_sel(a._vec1, b._vec1, mask._vecb1)};
+  }
+
+  template 
+  static Vectorized arange(int16_t base = 0, step_t step = static_cast(1)) {
+    return Vectorized(
+        base,
+        base + step,
+        base + 2 * step,
+        base + 3 * step,
+        base + 4 * step,
+        base + 5 * step,
+        base + 6 * step,
+        base + 7 * step,
+        base + 8 * step,
+        base + 9 * step,
+        base + 10 * step,
+        base + 11 * step,
+        base + 12 * step,
+        base + 13 * step,
+        base + 14 * step,
+        base + 15 * step);
+  }
+  static Vectorized set(
+      const Vectorized& a,
+      const Vectorized& b,
+      size_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+      case 4:
+        return blend<15>(a, b);
+      case 5:
+        return blend<31>(a, b);
+      case 6:
+        return blend<63>(a, b);
+      case 7:
+        return blend<127>(a, b);
+      case 8:
+        return blend<255>(a, b);
+      case 9:
+        return blend<511>(a, b);
+      case 10:
+        return blend<1023>(a, b);
+      case 11:
+        return blend<2047>(a, b);
+      case 12:
+        return blend<4095>(a, b);
+      case 13:
+        return blend<8191>(a, b);
+      case 14:
+        return blend<16383>(a, b);
+      case 15:
+        return blend<32767>(a, b);
+    }
+    return b;
+  }
+  static Vectorized C10_ALWAYS_INLINE
+  loadu(const void* ptr, int count = size()) {
+    if (count == size()) {
+      return {
+          vec_vsx_ld(offset0, reinterpret_cast(ptr)),
+          vec_vsx_ld(offset16, reinterpret_cast(ptr))};
+    }
+
+    __at_align__ value_type tmp_values[size()] = {};
+    std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type));
+
+    return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)};
+  }
+  void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      vec_vsx_st(_vec0, offset0, reinterpret_cast(ptr));
+      vec_vsx_st(_vec1, offset16, reinterpret_cast(ptr));
+    } else if (count > 0) {
+      __at_align__ value_type tmp_values[size()];
+      vec_vsx_st(_vec0, offset0, tmp_values);
+      vec_vsx_st(_vec1, offset16, tmp_values);
+      std::memcpy(ptr, tmp_values, std::min(count, size()) * sizeof(value_type));
+    }
+  }
+  const int16_t& operator[](int idx) const = delete;
+  int16_t& operator[](int idx) = delete;
+
+  Vectorized angle() const {
+    return blendv(
+      Vectorized(0), Vectorized(c10::pi), *this < Vectorized(0));
+  }
+  Vectorized real() const {
+    return *this;
+  }
+  Vectorized imag() const {
+    return Vectorized{0};
+  }
+  Vectorized conj() const {
+    return *this;
+  }
+
+  Vectorized C10_ALWAYS_INLINE abs() const {
+    return {vec_abs(_vec0), vec_abs(_vec1)};
+  }
+
+  Vectorized C10_ALWAYS_INLINE neg() const {
+    return {vec_neg(_vec0), vec_neg(_vec1)};
+  }
+
+  DEFINE_MEMBER_UNARY_OP(operator~, int16_t, vec_not)
+  DEFINE_MEMBER_OP(operator==, int16_t, vec_cmpeq)
+  DEFINE_MEMBER_OP(operator!=, int16_t, vec_cmpne)
+  DEFINE_MEMBER_OP(operator<, int16_t, vec_cmplt)
+  DEFINE_MEMBER_OP(operator<=, int16_t, vec_cmple)
+  DEFINE_MEMBER_OP(operator>, int16_t, vec_cmpgt)
+  DEFINE_MEMBER_OP(operator>=, int16_t, vec_cmpge)
+  DEFINE_MEMBER_OP_AND_ONE(eq, int16_t, vec_cmpeq)
+  DEFINE_MEMBER_OP_AND_ONE(ne, int16_t, vec_cmpne)
+  DEFINE_MEMBER_OP_AND_ONE(lt, int16_t, vec_cmplt)
+  DEFINE_MEMBER_OP_AND_ONE(le, int16_t, vec_cmple)
+  DEFINE_MEMBER_OP_AND_ONE(gt, int16_t, vec_cmpgt)
+  DEFINE_MEMBER_OP_AND_ONE(ge, int16_t, vec_cmpge)
+  DEFINE_MEMBER_OP(operator+, int16_t, vec_add)
+  DEFINE_MEMBER_OP(operator-, int16_t, vec_sub)
+  DEFINE_MEMBER_OP(operator*, int16_t, vec_mul)
+  DEFINE_MEMBER_EMULATE_BINARY_OP(operator/, int16_t, /)
+  DEFINE_MEMBER_OP(maximum, int16_t, vec_max)
+  DEFINE_MEMBER_OP(minimum, int16_t, vec_min)
+  DEFINE_MEMBER_OP(operator&, int16_t, vec_and)
+  DEFINE_MEMBER_OP(operator|, int16_t, vec_or)
+  DEFINE_MEMBER_OP(operator^, int16_t, vec_xor)
+};
+
+template <>
+Vectorized<int16_t> inline operator<<(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
+  vuint16 shift_vec0 = reinterpret_cast<vuint16>(b.vec0());
+  vuint16 shift_vec1 = reinterpret_cast<vuint16>(b.vec1());
+  return Vectorized<int16_t>{vec_sl(a.vec0(), shift_vec0), vec_sl(a.vec1(), shift_vec1)};
+}
+
+template <>
+Vectorized<int16_t> inline operator>>(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
+  vuint16 shift_vec0 = reinterpret_cast<vuint16>(b.vec0());
+  vuint16 shift_vec1 = reinterpret_cast<vuint16>(b.vec1());
+  return Vectorized<int16_t>{vec_sr(a.vec0(), shift_vec0), vec_sr(a.vec1(), shift_vec1)};
+}
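+// Note on the shift specializations above: b is reinterpreted as an unsigned
+// lane vector because vec_sl/vec_sr take unsigned shift counts; every lane of
+// a is shifted by the corresponding lane of b, e.g. (illustrative)
+//   Vectorized<int16_t>(1) << Vectorized<int16_t>(3)   // every lane becomes 8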
+
+template <>
+Vectorized<int16_t> inline maximum(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& b) {
+  return a.maximum(b);
+}
+
+template <>
+Vectorized<int16_t> inline minimum(
+    const Vectorized<int16_t>& a,
+    const Vectorized<int16_t>& b) {
+  return a.minimum(b);
+}
+
+
+} // namespace
+} // namespace vec
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h
new file mode 100644
index 0000000000000000000000000000000000000000..6299b43458b2b0ab6836d2d003a12f2ad8c31e6f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h
@@ -0,0 +1,298 @@
+#pragma once
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>
+namespace at {
+namespace vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+template <>
+class Vectorized<int32_t> {
+ private:
+  union {
+    struct {
+      vint32 _vec0;
+      vint32 _vec1;
+    };
+    struct {
+      vbool32 _vecb0;
+      vbool32 _vecb1;
+    };
+
+  } __attribute__((__may_alias__));
+
+ public:
+  using value_type = int32_t;
+  using vec_internal_type = vint32;
+  using vec_internal_mask_type = vbool32;
+  using size_type = int;
+  static constexpr size_type size() {
+    return 8;
+  }
+  Vectorized() {}
+  C10_ALWAYS_INLINE Vectorized(vint32 v) : _vec0{v}, _vec1{v} {}
+  C10_ALWAYS_INLINE Vectorized(vbool32 vmask) : _vecb0{vmask}, _vecb1{vmask} {}
+  C10_ALWAYS_INLINE Vectorized(vint32 v1, vint32 v2) : _vec0{v1}, _vec1{v2} {}
+  C10_ALWAYS_INLINE Vectorized(vbool32 v1, vbool32 v2) : _vecb0{v1}, _vecb1{v2} {}
+  C10_ALWAYS_INLINE Vectorized(int32_t scalar)
+      : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {}
+  C10_ALWAYS_INLINE Vectorized(
+      int32_t scalar1,
+      int32_t scalar2,
+      int32_t scalar3,
+      int32_t scalar4,
+      int32_t scalar5,
+      int32_t scalar6,
+      int32_t scalar7,
+      int32_t scalar8)
+      : _vec0{vint32{scalar1, scalar2, scalar3, scalar4}},
+        _vec1{vint32{scalar5, scalar6, scalar7, scalar8}} {}
+  C10_ALWAYS_INLINE const vec_internal_type& vec0() const {
+    return _vec0;
+  }
+  C10_ALWAYS_INLINE const vec_internal_type& vec1() const {
+    return _vec1;
+  }
+
+  template 
+  static std::enable_if_t> C10_ALWAYS_INLINE
+  blend(const Vectorized& a, const Vectorized& b) {
+    return a;
+  }
+
+  template 
+  static std::enable_if_t<(mask & 255) == 255, Vectorized> C10_ALWAYS_INLINE
+  blend(const Vectorized& a, const Vectorized& b) {
+    return b;
+  }
+
+  template 
+  static std::enable_if_t> C10_ALWAYS_INLINE
+  blend(const Vectorized& a, const Vectorized& b) {
+    return {b._vec0, a._vec1};
+  }
+
+  template 
+  static std::enable_if_t<(mask > 0 && mask < 15), Vectorized>
+      C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) {
+    constexpr uint32_t g0 = (mask & 1) * 0xffffffff;
+    constexpr uint32_t g1 = ((mask & 2) >> 1) * 0xffffffff;
+    constexpr uint32_t g2 = ((mask & 4) >> 2) * 0xffffffff;
+    constexpr uint32_t g3 = ((mask & 8) >> 3) * 0xffffffff;
+    const vbool32 mask_1st = (vbool32){g0, g1, g2, g3};
+
+    return {(vint32)vec_sel(a._vec0, b._vec0, (vbool32)mask_1st), a._vec1};
+  }
+
+  template 
+  static std::enable_if_t<
+      (mask > 15 && (mask & 255) != 255 && ((mask & 15) == 15)),
+      Vectorized>
+      C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) {
+    constexpr uint32_t mask2 = (mask & 255) >> 4;
+    constexpr uint32_t g0_2 = (mask2 & 1) * 0xffffffff;
+    constexpr uint32_t g1_2 = ((mask2 & 2) >> 1) * 0xffffffff;
+    constexpr uint32_t g2_2 = ((mask2 & 4) >> 2) * 0xffffffff;
+    constexpr uint32_t g3_2 = ((mask2 & 8) >> 3) * 0xffffffff;
+
+    const vbool32 mask_2nd = (vbool32){g0_2, g1_2, g2_2, g3_2};
+    // generated masks
+    return {b._vec0, (vint32)vec_sel(a._vec1, b._vec1, (vbool32)mask_2nd)};
+  }
+
+  template 
+  static std::enable_if_t<
+      (mask > 15 && ((mask & 255) != 255) && ((mask & 15) == 0)),
+      Vectorized>
+      C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) {
+    constexpr uint32_t mask2 = (mask & 255) >> 4;
+    constexpr uint32_t g0_2 = (mask2 & 1) * 0xffffffff;
+    constexpr uint32_t g1_2 = ((mask2 & 2) >> 1) * 0xffffffff;
+    constexpr uint32_t g2_2 = ((mask2 & 4) >> 2) * 0xffffffff;
+    constexpr uint32_t g3_2 = ((mask2 & 8) >> 3) * 0xffffffff;
+
+    const vbool32 mask_2nd = (vbool32){g0_2, g1_2, g2_2, g3_2};
+    // generated masks
+    return {a, (vint32)vec_sel(a._vec1, b._vec1, (vbool32)mask_2nd)};
+  }
+
+  template 
+  static std::enable_if_t<
+      (mask > 15 && ((mask & 255) != 255) && ((mask & 15) != 0) &&
+       ((mask & 15) != 15)),
+      Vectorized>
+      C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) {
+    constexpr uint32_t g0 = (mask & 1) * 0xffffffff;
+    constexpr uint32_t g1 = ((mask & 2) >> 1) * 0xffffffff;
+    constexpr uint32_t g2 = ((mask & 4) >> 2) * 0xffffffff;
+    constexpr uint32_t g3 = ((mask & 8) >> 3) * 0xffffffff;
+    constexpr uint32_t mask2 = (mask & 255) >> 4;
+    constexpr uint32_t g0_2 = (mask2 & 1) * 0xffffffff;
+    constexpr uint32_t g1_2 = ((mask2 & 2) >> 1) * 0xffffffff;
+    constexpr uint32_t g2_2 = ((mask2 & 4) >> 2) * 0xffffffff;
+    constexpr uint32_t g3_2 = ((mask2 & 8) >> 3) * 0xffffffff;
+
+    const vbool32 mask_1st = (vbool32){g0, g1, g2, g3};
+    const vbool32 mask_2nd = (vbool32){g0_2, g1_2, g2_2, g3_2};
+    // generated masks
+    return {
+        (vint32)vec_sel(a._vec0, b._vec0, (vbool32)mask_1st),
+        (vint32)vec_sel(a._vec1, b._vec1, (vbool32)mask_2nd)};
+  }
+
+  static Vectorized C10_ALWAYS_INLINE blendv(
+      const Vectorized& a,
+      const Vectorized& b,
+      const Vectorized& mask) {
+    // the mask used here is the result of a vec256 comparison,
+    // so we can use it directly with vec_sel
+    // warning: an Intel-style bitmask will not work properly here
+    return {
+        vec_sel(a._vec0, b._vec0, mask._vecb0),
+        vec_sel(a._vec1, b._vec1, mask._vecb1)};
+  }
+
+  template 
+  static Vectorized<int32_t> arange(int32_t base = 0, step_t step = static_cast<step_t>(1)) {
+    return Vectorized(
+        base,
+        base + step,
+        base + 2 * step,
+        base + 3 * step,
+        base + 4 * step,
+        base + 5 * step,
+        base + 6 * step,
+        base + 7 * step);
+  }
+  static Vectorized set(
+      const Vectorized& a,
+      const Vectorized& b,
+      size_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+      case 4:
+        return blend<15>(a, b);
+      case 5:
+        return blend<31>(a, b);
+      case 6:
+        return blend<63>(a, b);
+      case 7:
+        return blend<127>(a, b);
+    }
+
+    return b;
+  }
+  static Vectorized C10_ALWAYS_INLINE
+  loadu(const void* ptr, int count = size()) {
+    if (count == size()) {
+      return {
+          vec_vsx_ld(offset0, reinterpret_cast(ptr)),
+          vec_vsx_ld(offset16, reinterpret_cast(ptr))};
+    }
+
+    __at_align__ value_type tmp_values[size()] = {};
+    std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type));
+
+    return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)};
+  }
+  void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      vec_vsx_st(_vec0, offset0, reinterpret_cast(ptr));
+      vec_vsx_st(_vec1, offset16, reinterpret_cast(ptr));
+    } else if (count > 0) {
+      __at_align__ value_type tmp_values[size()];
+      vec_vsx_st(_vec0, offset0, tmp_values);
+      vec_vsx_st(_vec1, offset16, tmp_values);
+      std::memcpy(
+          ptr, tmp_values, std::min(count, size()) * sizeof(value_type));
+    }
+  }
+  const int32_t& operator[](int idx) const = delete;
+  int32_t& operator[](int idx) = delete;
+
+  Vectorized angle() const {
+    return blendv(
+      Vectorized(0), Vectorized(c10::pi), *this < Vectorized(0));
+  }
+  Vectorized real() const {
+    return *this;
+  }
+  Vectorized imag() const {
+    return Vectorized{0};
+  }
+  Vectorized conj() const {
+    return *this;
+  }
+
+  Vectorized C10_ALWAYS_INLINE abs() const {
+    return {vec_abs(_vec0), vec_abs(_vec1)};
+  }
+
+  Vectorized C10_ALWAYS_INLINE neg() const {
+    return {vec_neg(_vec0), vec_neg(_vec1)};
+  }
+
+  DEFINE_MEMBER_UNARY_OP(operator~, int32_t, vec_not)
+  DEFINE_MEMBER_OP(operator==, int32_t, vec_cmpeq)
+  DEFINE_MEMBER_OP(operator!=, int32_t, vec_cmpne)
+  DEFINE_MEMBER_OP(operator<, int32_t, vec_cmplt)
+  DEFINE_MEMBER_OP(operator<=, int32_t, vec_cmple)
+  DEFINE_MEMBER_OP(operator>, int32_t, vec_cmpgt)
+  DEFINE_MEMBER_OP(operator>=, int32_t, vec_cmpge)
+  DEFINE_MEMBER_OP_AND_ONE(eq, int32_t, vec_cmpeq)
+  DEFINE_MEMBER_OP_AND_ONE(ne, int32_t, vec_cmpne)
+  DEFINE_MEMBER_OP_AND_ONE(lt, int32_t, vec_cmplt)
+  DEFINE_MEMBER_OP_AND_ONE(le, int32_t, vec_cmple)
+  DEFINE_MEMBER_OP_AND_ONE(gt, int32_t, vec_cmpgt)
+  DEFINE_MEMBER_OP_AND_ONE(ge, int32_t, vec_cmpge)
+  DEFINE_MEMBER_OP(operator+, int32_t, vec_add)
+  DEFINE_MEMBER_OP(operator-, int32_t, vec_sub)
+  DEFINE_MEMBER_OP(operator*, int32_t, vec_mul)
+  DEFINE_MEMBER_EMULATE_BINARY_OP(operator/, int32_t, /)
+  DEFINE_MEMBER_OP(maximum, int32_t, vec_max)
+  DEFINE_MEMBER_OP(minimum, int32_t, vec_min)
+  DEFINE_MEMBER_OP(operator&, int32_t, vec_and)
+  DEFINE_MEMBER_OP(operator|, int32_t, vec_or)
+  DEFINE_MEMBER_OP(operator^, int32_t, vec_xor)
+};
+
+template <>
+Vectorized<int32_t> inline operator<<(const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
+  vuint32 shift_vec0 = reinterpret_cast<vuint32>(b.vec0());
+  vuint32 shift_vec1 = reinterpret_cast<vuint32>(b.vec1());
+  return Vectorized<int32_t>{vec_sl(a.vec0(), shift_vec0), vec_sl(a.vec1(), shift_vec1)};
+}
+
+template <>
+Vectorized<int32_t> inline operator>>(const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
+  vuint32 shift_vec0 = reinterpret_cast<vuint32>(b.vec0());
+  vuint32 shift_vec1 = reinterpret_cast<vuint32>(b.vec1());
+  return Vectorized<int32_t>{vec_sr(a.vec0(), shift_vec0), vec_sr(a.vec1(), shift_vec1)};
+}
+
+template <>
+Vectorized<int32_t> inline maximum(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& b) {
+  return a.maximum(b);
+}
+
+template <>
+Vectorized<int32_t> inline minimum(
+    const Vectorized<int32_t>& a,
+    const Vectorized<int32_t>& b) {
+  return a.minimum(b);
+}
+
+} // namespace
+} // namespace vec
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h
new file mode 100644
index 0000000000000000000000000000000000000000..0a52f763aa84bfd7c06006e1187f5cb38daee320
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h
@@ -0,0 +1,251 @@
+#pragma once
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>
+namespace at {
+namespace vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+template <>
+class Vectorized<int64_t> {
+ private:
+  union {
+    struct {
+      vint64 _vec0;
+      vint64 _vec1;
+    };
+    struct {
+      vbool64 _vecb0;
+      vbool64 _vecb1;
+    };
+
+  } __attribute__((__may_alias__));
+
+ public:
+  using value_type = int64_t;
+  using vec_internal_type = vint64;
+  using vec_internal_mask_type = vbool64;
+  using size_type = int;
+  using ElementType = signed long long;
+  static constexpr size_type size() {
+    return 4;
+  }
+  Vectorized() {}
+  C10_ALWAYS_INLINE Vectorized(vint64 v) : _vec0{v}, _vec1{v} {}
+  C10_ALWAYS_INLINE Vectorized(vbool64 vmask) : _vecb0{vmask}, _vecb1{vmask} {}
+  C10_ALWAYS_INLINE Vectorized(vint64 v1, vint64 v2) : _vec0{v1}, _vec1{v2} {}
+  C10_ALWAYS_INLINE Vectorized(vbool64 v1, vbool64 v2) : _vecb0{v1}, _vecb1{v2} {}
+  C10_ALWAYS_INLINE Vectorized(int64_t scalar)
+      : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {}
+  C10_ALWAYS_INLINE Vectorized(
+      int64_t scalar1,
+      int64_t scalar2,
+      int64_t scalar3,
+      int64_t scalar4)
+      : _vec0{vint64{scalar1, scalar2}}, _vec1{vint64{scalar3, scalar4}} {}
+
+  C10_ALWAYS_INLINE const vec_internal_type& vec0() const {
+    return _vec0;
+  }
+  C10_ALWAYS_INLINE const vec_internal_type& vec1() const {
+    return _vec1;
+  }
+
+  template 
+  static std::enable_if_t> C10_ALWAYS_INLINE
+  blend(const Vectorized& a, const Vectorized& b) {
+    return a;
+  }
+
+  template 
+  static std::enable_if_t> C10_ALWAYS_INLINE
+  blend(const Vectorized& a, const Vectorized& b) {
+    return {b._vec0, a._vec1};
+  }
+
+  template 
+  static std::enable_if_t<(mask & 15) == 15, Vectorized> C10_ALWAYS_INLINE
+  blend(const Vectorized& a, const Vectorized& b) {
+    return b;
+  }
+
+  template 
+  static std::enable_if_t<(mask > 0 && mask < 3), Vectorized> C10_ALWAYS_INLINE
+  blend(const Vectorized& a, const Vectorized& b) {
+    constexpr uint64_t g0 = (mask & 1) * 0xffffffffffffffff;
+    constexpr uint64_t g1 = ((mask & 2) >> 1) * 0xffffffffffffffff;
+    const vbool64 mask_1st = (vbool64){g0, g1};
+    return {(vint64)vec_sel(a._vec0, b._vec0, (vbool64)mask_1st), a._vec1};
+  }
+
+  template 
+  static std::enable_if_t<(mask > 3) && (mask & 3) == 0, Vectorized>
+      C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) {
+    constexpr uint64_t g0_2 = ((mask & 4) >> 2) * 0xffffffffffffffff;
+    constexpr uint64_t g1_2 = ((mask & 8) >> 3) * 0xffffffffffffffff;
+
+    const vbool64 mask_2nd = (vbool64){g0_2, g1_2};
+    return {a._vec0, (vint64)vec_sel(a._vec1, b._vec1, (vbool64)mask_2nd)};
+  }
+
+  template 
+  static std::enable_if_t<
+      (mask > 3) && (mask & 3) != 0 && (mask & 15) != 15,
+      Vectorized>
+      C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) {
+    constexpr uint64_t g0 = (mask & 1) * 0xffffffffffffffff;
+    constexpr uint64_t g1 = ((mask & 2) >> 1) * 0xffffffffffffffff;
+    constexpr uint64_t g0_2 = ((mask & 4) >> 2) * 0xffffffffffffffff;
+    constexpr uint64_t g1_2 = ((mask & 8) >> 3) * 0xffffffffffffffff;
+
+    const vbool64 mask_1st = (vbool64){g0, g1};
+    const vbool64 mask_2nd = (vbool64){g0_2, g1_2};
+    return {
+        (vint64)vec_sel(a._vec0, b._vec0, (vbool64)mask_1st),
+        (vint64)vec_sel(a._vec1, b._vec1, (vbool64)mask_2nd)};
+  }
+
+  static Vectorized C10_ALWAYS_INLINE blendv(
+      const Vectorized& a,
+      const Vectorized& b,
+      const Vectorized& mask) {
+    // the mask used here is the result of a vec256 comparison
+
+    return {
+        vec_sel(a._vec0, b._vec0, mask._vecb0),
+        vec_sel(a._vec1, b._vec1, mask._vecb1)};
+  }
+  template 
+  static Vectorized<int64_t> arange(int64_t base = 0, step_t step = static_cast<step_t>(1)) {
+    return Vectorized(base, base + step, base + 2 * step, base + 3 * step);
+  }
+
+  static Vectorized C10_ALWAYS_INLINE
+  set(const Vectorized& a,
+      const Vectorized& b,
+      size_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+    }
+
+    return b;
+  }
+  static Vectorized C10_ALWAYS_INLINE
+  loadu(const void* ptr, int count = size()) {
+    if (count == size()) {
+      static_assert(sizeof(double) == sizeof(value_type));
+      const double* dptr = reinterpret_cast(ptr);
+      return {// treat it as double load
+              (vint64)vec_vsx_ld(offset0, dptr),
+              (vint64)vec_vsx_ld(offset16, dptr)};
+    }
+
+    __at_align__ double tmp_values[size()] = {};
+    std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type));
+
+    return {
+        (vint64)vec_vsx_ld(offset0, tmp_values),
+        (vint64)vec_vsx_ld(offset16, tmp_values)};
+  }
+  void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      double* dptr = reinterpret_cast(ptr);
+      vec_vsx_st((vfloat64)_vec0, offset0, dptr);
+      vec_vsx_st((vfloat64)_vec1, offset16, dptr);
+    } else if (count > 0) {
+      __at_align__ double tmp_values[size()];
+      vec_vsx_st((vfloat64)_vec0, offset0, tmp_values);
+      vec_vsx_st((vfloat64)_vec1, offset16, tmp_values);
+      std::memcpy(
+          ptr, tmp_values, std::min(count, size()) * sizeof(value_type));
+    }
+  }
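+  // Note: loadu/store above move the int64_t lanes through 64-bit double
+  // loads/stores (vec_vsx_ld/vec_vsx_st); the widths match (see the
+  // static_assert in loadu) and the bits are only reinterpreted, never
+  // converted, so e.g. (illustrative)
+  //   Vectorized<int64_t>::loadu(src).store(dst);   // bit-exact copy of 4 lanes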
+  const int64_t& operator[](int idx) const = delete;
+  int64_t& operator[](int idx) = delete;
+
+  Vectorized angle() const {
+    return blendv(
+      Vectorized(0), Vectorized(c10::pi), *this < Vectorized(0));
+  }
+  Vectorized real() const {
+    return *this;
+  }
+  Vectorized imag() const {
+    return Vectorized{0};
+  }
+  Vectorized conj() const {
+    return *this;
+  }
+
+  Vectorized C10_ALWAYS_INLINE abs() const {
+    return {vec_abs(_vec0), vec_abs(_vec1)};
+  }
+
+  Vectorized C10_ALWAYS_INLINE neg() const {
+    return {vec_neg(_vec0), vec_neg(_vec1)};
+  }
+
+  DEFINE_MEMBER_UNARY_OP(operator~, int64_t, vec_not)
+  DEFINE_MEMBER_OP(operator==, int64_t, vec_cmpeq)
+  DEFINE_MEMBER_OP(operator!=, int64_t, vec_cmpne)
+  DEFINE_MEMBER_OP(operator<, int64_t, vec_cmplt)
+  DEFINE_MEMBER_OP(operator<=, int64_t, vec_cmple)
+  DEFINE_MEMBER_OP(operator>, int64_t, vec_cmpgt)
+  DEFINE_MEMBER_OP(operator>=, int64_t, vec_cmpge)
+  DEFINE_MEMBER_OP_AND_ONE(eq, int64_t, vec_cmpeq)
+  DEFINE_MEMBER_OP_AND_ONE(ne, int64_t, vec_cmpne)
+  DEFINE_MEMBER_OP_AND_ONE(lt, int64_t, vec_cmplt)
+  DEFINE_MEMBER_OP_AND_ONE(le, int64_t, vec_cmple)
+  DEFINE_MEMBER_OP_AND_ONE(gt, int64_t, vec_cmpgt)
+  DEFINE_MEMBER_OP_AND_ONE(ge, int64_t, vec_cmpge)
+  DEFINE_MEMBER_OP(operator+, int64_t, vec_add)
+  DEFINE_MEMBER_OP(operator-, int64_t, vec_sub)
+  DEFINE_MEMBER_OP(operator*, int64_t, vec_mul)
+  DEFINE_MEMBER_OP(operator/, int64_t, vec_div)
+  DEFINE_MEMBER_OP(maximum, int64_t, vec_max)
+  DEFINE_MEMBER_OP(minimum, int64_t, vec_min)
+  DEFINE_MEMBER_OP(operator&, int64_t, vec_and)
+  DEFINE_MEMBER_OP(operator|, int64_t, vec_or)
+  DEFINE_MEMBER_OP(operator^, int64_t, vec_xor)
+};
+
+template <>
+Vectorized<int64_t> inline operator<<(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
+  vuint64 shift_vec0 = reinterpret_cast<vuint64>(b.vec0());
+  vuint64 shift_vec1 = reinterpret_cast<vuint64>(b.vec1());
+  return Vectorized<int64_t>{vec_sl(a.vec0(), shift_vec0), vec_sl(a.vec1(), shift_vec1)};
+}
+
+template <>
+Vectorized<int64_t> inline operator>>(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
+  vuint64 shift_vec0 = reinterpret_cast<vuint64>(b.vec0());
+  vuint64 shift_vec1 = reinterpret_cast<vuint64>(b.vec1());
+  return Vectorized<int64_t>{vec_sr(a.vec0(), shift_vec0), vec_sr(a.vec1(), shift_vec1)};
+}
+
+template <>
+Vectorized<int64_t> inline maximum(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& b) {
+  return a.maximum(b);
+}
+
+template <>
+Vectorized<int64_t> inline minimum(
+    const Vectorized<int64_t>& a,
+    const Vectorized<int64_t>& b) {
+  return a.minimum(b);
+}
+
+} // namespace
+} // namespace vec
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h
new file mode 100644
index 0000000000000000000000000000000000000000..4687883eaa419a8012427fb19d73bd1eaf71bc89
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_qint32_vsx.h
@@ -0,0 +1,245 @@
+#pragma once
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <ATen/cpu/vec/vec256/vsx/vsx_helpers.h>
+#include <c10/util/qint32.h>
+#include <array>
+
+// This file defines Vectorized<> for the quantized types.
+//
+//
+// Currently, we simply use these classes as efficient converters between
+// the quantized types and Vectorized<float>, usually in bandwidth-bound cases
+// where doing the arithmetic in full-precision is acceptable (e.g.
+// elementwise operators).
+//
+//
+// Conversions are as follows:
+//  Vectorized<qint32> -> 1x Vectorized<float>
+//
+// The size of the returned float vector is specified by the special
+// constexpr function float_num_vecs. The type of the value returned
+// from dequantize (and expected as an argument to quantize) is
+// specified by float_vec_return_type.
+//
+// When writing kernels with these vectors, it is expected that floating-
+// point operations will be carried out in a loop over Vectorized<T>::float_num_vecs
+// iterations.
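+//
+// For example (an illustrative sketch only, not part of this header), an
+// elementwise kernel would typically round-trip through float like this,
+// assuming src/dst point to qint32 data and the quantization parameters
+// (scalar and broadcast vector forms) are already in scope:
+//
+//   for (int64_t i = 0; i < n; i += Vectorized<c10::qint32>::size()) {
+//     auto qv = Vectorized<c10::qint32>::loadu(src + i);
+//     auto fvals = qv.dequantize(scale_v, zero_point_v, scale_zp_premul_v);
+//     for (auto& fv : fvals) {               // float_num_vecs() == 1 iteration
+//       fv = fv * Vectorized<float>(2.0f);   // arbitrary float-side math
+//     }
+//     Vectorized<c10::qint32>::quantize(fvals, scale, zero_point, inverse_scale)
+//         .store(dst + i);
+//   }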
+
+namespace at {
+namespace vec {
+inline namespace CPU_CAPABILITY {
+
+template <>
+struct Vectorized<c10::qint32> {
+ private:
+  union {
+    struct {
+      vint32 _vec0;
+      vint32 _vec1;
+    };
+    struct {
+      vbool32 _vecb0;
+      vbool32 _vecb1;
+    };
+
+  } __attribute__((__may_alias__));
+
+ public:
+  Vectorized() {}
+
+  using size_type = int;
+  static constexpr size_type size() {
+    return 8;
+  }
+
+  static constexpr size_t float_num_vecs() {
+    return 1;
+  }
+  static constexpr int int_num_vecs() {
+    return 1;
+  }
+  using float_vec_return_type = std::array<Vectorized<float>, 1>;
+  using int_vec_return_type = std::array<Vectorized<c10::qint32>, 1>;
+  using value_type = c10::qint32::underlying;
+  using vec_internal_type = vint32;
+  using vec_internal_mask_type = vbool32;
+  C10_ALWAYS_INLINE Vectorized(vint32 v) : _vec0{v}, _vec1{v} {}
+  C10_ALWAYS_INLINE Vectorized(vbool32 vmask) : _vecb0{vmask}, _vecb1{vmask} {}
+  C10_ALWAYS_INLINE Vectorized(vint32 v1, vint32 v2) : _vec0{v1}, _vec1{v2} {}
+  C10_ALWAYS_INLINE Vectorized(vbool32 v1, vbool32 v2) : _vecb0{v1}, _vecb1{v2} {}
+
+  Vectorized(const c10::qint32& val)
+      : _vec0(vec_splats(val.val_)), _vec1(vec_splats(val.val_)) {}
+
+  static Vectorized<c10::qint32> C10_ALWAYS_INLINE
+  loadu(const void* ptr, int count = size()) {
+    if (count == size()) {
+      return {
+          vec_vsx_ld(offset0, reinterpret_cast<const value_type*>(ptr)),
+          vec_vsx_ld(offset16, reinterpret_cast<const value_type*>(ptr))};
+    }
+
+    __at_align__ value_type tmp_values[size()] = {};
+    std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type));
+
+    return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)};
+  }
+  void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      vec_vsx_st(_vec0, offset0, reinterpret_cast<value_type*>(ptr));
+      vec_vsx_st(_vec1, offset16, reinterpret_cast<value_type*>(ptr));
+    } else if (count > 0) {
+      __at_align__ value_type tmp_values[size()];
+      vec_vsx_st(_vec0, offset0, tmp_values);
+      vec_vsx_st(_vec1, offset16, tmp_values);
+      std::memcpy(
+          ptr, tmp_values, std::min(count, size()) * sizeof(value_type));
+    }
+  }
+
+  C10_ALWAYS_INLINE const vec_internal_type& vec0() const {
+    return _vec0;
+  }
+  C10_ALWAYS_INLINE const vec_internal_type& vec1() const {
+    return _vec1;
+  }
+
+  float_vec_return_type dequantize(
+      Vectorized<float> scale,
+      Vectorized<float> zero_point,
+      Vectorized<float> scale_zp_premul) const {
+    vfloat32 float_vals0 = vec_float(_vec0);
+    vfloat32 float_vals1 = vec_float(_vec1);
+    vfloat32 scale_vec0 = scale.vec0();
+    vfloat32 scale_vec1 = scale.vec1();
+    vfloat32 scale_zp_premul0 = scale_zp_premul.vec0();
+    vfloat32 scale_zp_premul1 = scale_zp_premul.vec1();
+    return {Vectorized<float>{
+        vec_madd(scale_vec0, float_vals0, scale_zp_premul0),
+        vec_madd(scale_vec1, float_vals1, scale_zp_premul1)}};
+  }
+
+  float_vec_return_type dequantize(
+      Vectorized<float> scale,
+      Vectorized<float> zero_point) const {
+    vfloat32 float_vals0 = vec_float(_vec0);
+    vfloat32 float_vals1 = vec_float(_vec1);
+    vfloat32 scale_vec0 = scale.vec0();
+    vfloat32 scale_vec1 = scale.vec1();
+    vfloat32 zero_point0 = zero_point.vec0();
+    vfloat32 zero_point1 = zero_point.vec1();
+    return {Vectorized<float>{
+        (float_vals0 - zero_point0) * scale_vec0,
+        (float_vals1 - zero_point1) * scale_vec1}};
+  }
+
+  static Vectorized<c10::qint32> quantize(
+      const float_vec_return_type& rhs,
+      float scale,
+      int32_t zero_point,
+      float inverse_scale) {
+    Vectorized<c10::qint32> retval;
+
+    const vint32 vmin = vec_splats(std::numeric_limits<value_type>::min());
+    const vint32 vmax = vec_splats(std::numeric_limits<value_type>::max());
+    vfloat32 inverse_scale_v = vec_splats(inverse_scale);
+    vfloat32 vec_zero_point = vec_splats((float)(zero_point));
+    Vectorized<float> vf0 = rhs[0];
+
+    vfloat32 vecf0 = vf0.vec0();
+    vfloat32 vecf1 = vf0.vec1();
+    vecf0 = vec_mul(vecf0, inverse_scale_v);
+    vecf1 = vec_mul(vecf1, inverse_scale_v);
+    vecf0 = vec_add(vec_rint(vecf0), vec_zero_point);
+    vecf1 = vec_add(vec_rint(vecf1), vec_zero_point);
+    vint32 veci0  = vec_signed(vecf0);
+    vint32 veci1  = vec_signed(vecf1);
+
+    veci0 = vec_max(veci0, vmin);
+    veci1 = vec_max(veci1, vmin);
+    veci0 = vec_min(veci0, vmax);
+    veci1 = vec_min(veci1, vmax);
+
+    return {veci0, veci1};
+  }
+
+  Vectorized<c10::qint32> relu(Vectorized<c10::qint32> zero_point) const {
+    return {vec_max(_vec0, zero_point._vec0), vec_max(_vec1, zero_point._vec1)};
+  }
+
+  Vectorized<c10::qint32> relu6(
+      Vectorized<c10::qint32> zero_point,
+      Vectorized<c10::qint32> q_six) const {
+    vint32 max0 = vec_max(_vec0, zero_point._vec0);
+    vint32 max1 = vec_max(_vec1, zero_point._vec1);
+    return {vec_min(max0, q_six._vec0), vec_min(max1, q_six._vec1)};
+  }
+
+  int_vec_return_type widening_subtract(Vectorized<c10::qint32> b) const {
+    return {*this - b};
+  }
+  }
+
+  static Vectorized<c10::qint32> requantize_from_int(
+      const int_vec_return_type& inp,
+      float multiplier,
+      int32_t zero_point) {
+    const vint32 vmin = vec_splats(std::numeric_limits<value_type>::min());
+    const vint32 vmax = vec_splats(std::numeric_limits<value_type>::max());
+    vfloat32 vec_mult = vec_splats(multiplier);
+    vint32 vec_zero_point = vec_splats(zero_point);
+    Vectorized<c10::qint32> vi = inp[0];
+    vfloat32 vecf0 = vec_float(vi.vec0());
+    vfloat32 vecf1 = vec_float(vi.vec1());
+
+    vecf0 = vec_mul(vecf0, vec_mult);
+    vecf1 = vec_mul(vecf1, vec_mult);
+
+    vecf0 = vec_rint(vecf0);
+    vecf1 = vec_rint(vecf1);
+
+    vint32 veci0  = vec_add(vec_signed(vecf0),vec_zero_point);
+    vint32 veci1  = vec_add(vec_signed(vecf1),vec_zero_point);
+
+    veci0 = vec_max(veci0, vmin);
+    veci1 = vec_max(veci1, vmin);
+    veci0 = vec_min(veci0, vmax);
+    veci1 = vec_min(veci1, vmax);
+
+    return {veci0, veci1};
+  }
+
+  DEFINE_MEMBER_OP(operator==, c10::qint32, vec_cmpeq)
+  DEFINE_MEMBER_OP(operator!=, c10::qint32, vec_cmpne)
+  DEFINE_MEMBER_OP(operator<, c10::qint32, vec_cmplt)
+  DEFINE_MEMBER_OP(operator<=, c10::qint32, vec_cmple)
+  DEFINE_MEMBER_OP(operator>, c10::qint32, vec_cmpgt)
+  DEFINE_MEMBER_OP(operator>=, c10::qint32, vec_cmpge)
+  DEFINE_MEMBER_OP(operator+, c10::qint32, vec_add)
+  DEFINE_MEMBER_OP(operator-, c10::qint32, vec_sub)
+  DEFINE_MEMBER_OP(operator*, c10::qint32, vec_mul)
+  DEFINE_MEMBER_EMULATE_BINARY_OP(operator/, c10::qint32, /)
+  DEFINE_MEMBER_OP(maximum, c10::qint32, vec_max)
+  DEFINE_MEMBER_OP(minimum, c10::qint32, vec_min)
+  DEFINE_MEMBER_OP(operator&, c10::qint32, vec_and)
+  DEFINE_MEMBER_OP(operator|, c10::qint32, vec_or)
+  DEFINE_MEMBER_OP(operator^, c10::qint32, vec_xor)
+};
+
+template <>
+Vectorized<c10::qint32> inline maximum(
+    const Vectorized<c10::qint32>& a,
+    const Vectorized<c10::qint32>& b) {
+  return a.maximum(b);
+}
+
+template <>
+Vectorized<c10::qint32> inline minimum(
+    const Vectorized<c10::qint32>& a,
+    const Vectorized<c10::qint32>& b) {
+  return a.minimum(b);
+}
+} // namespace
+} // namespace vec
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h
new file mode 100644
index 0000000000000000000000000000000000000000..2ed4242137b41831a3aaa4249f557180dc8de16a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h
@@ -0,0 +1,447 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+// This file defines Vectorized<> for the quantized types.
+//
+//
+// Currently, we simply use these classes as efficient converters between
+// the quantized types and Vectorized, usually in bandwidth-bound cases
+// where doing the arithmetic in full-precision is acceptable (e.g.
+// elementwise operators).
+//
+//
+// Conversions are as follows:
+//  Vectorized<qint8> -> 4x Vectorized<float>
+//
+// The size of the returned float vector is specified by the special
+// constexpr function float_num_vecs. The type of the value returned
+// from dequantize (and expected as an argument to quantize) is
+// specified by float_vec_return_type.
+//
+// When writing kernels with these vectors, it is expected that floating-
+// point operations will be carried out in a loop over Vectorized<T>::float_num_vecs
+// iterations.
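+//
+// For example (an illustrative sketch only, not part of this header), an
+// integer-domain path such as a quantized elementwise subtraction would
+// typically widen to qint32 and then requantize, assuming a_ptr/b_ptr/out_ptr
+// point to qint8 data and multiplier/zero_point are in scope:
+//
+//   auto qa = Vectorized<c10::qint8>::loadu(a_ptr);
+//   auto qb = Vectorized<c10::qint8>::loadu(b_ptr);
+//   auto wide = qa.widening_subtract(qb);    // int_num_vecs() == 4 qint32 vectors
+//   Vectorized<c10::qint8>::requantize_from_int(wide, multiplier, zero_point)
+//       .store(out_ptr);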
+
+namespace at {
+namespace vec {
+inline namespace CPU_CAPABILITY {
+
+template <>
+struct Vectorized<c10::qint8> {
+ private:
+  union {
+    struct {
+      vint8 _vec0;
+      vint8 _vec1;
+    };
+    struct {
+      vbool8 _vecb0;
+      vbool8 _vecb1;
+    };
+
+  } __attribute__((__may_alias__));
+
+ public:
+  Vectorized() {}
+  using size_type = int;
+  static constexpr size_type size() {
+    return 32;
+  }
+
+  static constexpr size_t float_num_vecs() {
+    return 4;
+  }
+  static constexpr int int_num_vecs() {
+    return 4;
+  }
+  using float_vec_return_type = std::array<Vectorized<float>, 4>;
+  using int_vec_return_type = std::array<Vectorized<c10::qint32>, 4>;
+  using value_type = typename c10::qint8::underlying;
+  using vec_internal_type = vint8;
+  using vec_internal_mask_type = vbool8;
+  // Broadcast constructor
+  C10_ALWAYS_INLINE Vectorized(const c10::qint8& val)
+      : _vec0{vec_splats(val.val_)}, _vec1{vec_splats(val.val_)} {}
+
+  C10_ALWAYS_INLINE Vectorized(const Vectorized<c10::qint8>& other)
+      : _vec0{other._vec0}, _vec1(other._vec1) {}
+
+  C10_ALWAYS_INLINE Vectorized(vint8 v) : _vec0{v}, _vec1{v} {}
+  C10_ALWAYS_INLINE Vectorized(vbool8 vmask) : _vecb0{vmask}, _vecb1{vmask} {}
+  C10_ALWAYS_INLINE Vectorized(vint8 v1, vint8 v2) : _vec0{v1}, _vec1{v2} {}
+  C10_ALWAYS_INLINE Vectorized(vbool8 v1, vbool8 v2) : _vecb0{v1}, _vecb1{v2} {}
+
+  C10_ALWAYS_INLINE const vec_internal_type& vec0() const {
+    return _vec0;
+  }
+  C10_ALWAYS_INLINE const vec_internal_type& vec1() const {
+    return _vec1;
+  }
+
+  static C10_ALWAYS_INLINE Vectorized<c10::qint8> loadu(
+      const void* ptr,
+      int count = size()) {
+    if (count == size()) {
+      return {
+          vec_vsx_ld(offset0, reinterpret_cast<const value_type*>(ptr)),
+          vec_vsx_ld(offset16, reinterpret_cast<const value_type*>(ptr))};
+    }
+    __at_align__ value_type tmp_values[size()];
+    std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type));
+    return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)};
+  }
+  void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      vec_vsx_st(_vec0, offset0, reinterpret_cast<value_type*>(ptr));
+      vec_vsx_st(_vec1, offset16, reinterpret_cast<value_type*>(ptr));
+    } else if (count > 0) {
+      __at_align__ value_type tmp_values[size()];
+      vec_vsx_st(_vec0, offset0, tmp_values);
+      vec_vsx_st(_vec1, offset16, tmp_values);
+      std::memcpy(
+          ptr, tmp_values, std::min(count, size()) * sizeof(value_type));
+    }
+  }
+
+ public:
+  float_vec_return_type C10_ALWAYS_INLINE dequantize(
+      Vectorized<float> scale,
+      Vectorized<float> zero_point,
+      Vectorized<float> scale_zp_premul) const {
+    vint16 vecshi0 = vec_unpackh(_vec0);
+    vint16 vecshi1 = vec_unpackl(_vec0);
+
+    vint16 vecshi2 = vec_unpackh(_vec1);
+    vint16 vecshi3 = vec_unpackl(_vec1);
+
+    vint32 veci0 = vec_unpackh(vecshi0);
+    vint32 veci1 = vec_unpackl(vecshi0);
+
+    vint32 veci2 = vec_unpackh(vecshi1);
+    vint32 veci3 = vec_unpackl(vecshi1);
+
+    vint32 veci4 = vec_unpackh(vecshi2);
+    vint32 veci5 = vec_unpackl(vecshi2);
+
+    vint32 veci6 = vec_unpackh(vecshi3);
+    vint32 veci7 = vec_unpackl(vecshi3);
+
+    vfloat32 vecf0_0 = vec_float(veci0);
+    vfloat32 vecf1_0 = vec_float(veci1);
+
+    vfloat32 vecf0_1 = vec_float(veci2);
+    vfloat32 vecf1_1 = vec_float(veci3);
+
+    vfloat32 vecf0_2 = vec_float(veci4);
+    vfloat32 vecf1_2 = vec_float(veci5);
+
+    vfloat32 vecf0_3 = vec_float(veci6);
+    vfloat32 vecf1_3 = vec_float(veci7);
+    vfloat32 scale_vec0 = scale.vec0();
+    vfloat32 scale_vec1 = scale.vec1();
+    vfloat32 scale_zp_premul0 = scale_zp_premul.vec0();
+    vfloat32 scale_zp_premul1 = scale_zp_premul.vec1();
+    return {
+        Vectorized<float>{
+            vec_madd(scale_vec0, vecf0_0, scale_zp_premul0),
+            vec_madd(scale_vec1, vecf1_0, scale_zp_premul1)},
+        Vectorized<float>{
+            vec_madd(scale_vec0, vecf0_1, scale_zp_premul0),
+            vec_madd(scale_vec1, vecf1_1, scale_zp_premul1)},
+        Vectorized<float>{
+            vec_madd(scale_vec0, vecf0_2, scale_zp_premul0),
+            vec_madd(scale_vec1, vecf1_2, scale_zp_premul1)},
+        Vectorized<float>{
+            vec_madd(scale_vec0, vecf0_3, scale_zp_premul0),
+            vec_madd(scale_vec1, vecf1_3, scale_zp_premul1)}};
+  }
+
+  float_vec_return_type C10_ALWAYS_INLINE dequantize(
+      Vectorized<float> scale,
+      Vectorized<float> zero_point) const {
+    vint16 vecshi0 = vec_unpackh(_vec0);
+    vint16 vecshi1 = vec_unpackl(_vec0);
+
+    vint16 vecshi2 = vec_unpackh(_vec1);
+    vint16 vecshi3 = vec_unpackl(_vec1);
+
+    vint32 veci0 = vec_unpackh(vecshi0);
+    vint32 veci1 = vec_unpackl(vecshi0);
+
+    vint32 veci2 = vec_unpackh(vecshi1);
+    vint32 veci3 = vec_unpackl(vecshi1);
+
+    vint32 veci4 = vec_unpackh(vecshi2);
+    vint32 veci5 = vec_unpackl(vecshi2);
+
+    vint32 veci6 = vec_unpackh(vecshi3);
+    vint32 veci7 = vec_unpackl(vecshi3);
+
+    vfloat32 vecf0_0 = vec_float(veci0);
+    vfloat32 vecf1_0 = vec_float(veci1);
+
+    vfloat32 vecf0_1 = vec_float(veci2);
+    vfloat32 vecf1_1 = vec_float(veci3);
+
+    vfloat32 vecf0_2 = vec_float(veci4);
+    vfloat32 vecf1_2 = vec_float(veci5);
+
+    vfloat32 vecf0_3 = vec_float(veci6);
+    vfloat32 vecf1_3 = vec_float(veci7);
+    vfloat32 scale_vec0 = scale.vec0();
+    vfloat32 scale_vec1 = scale.vec1();
+    vfloat32 zero_point0 = zero_point.vec0();
+    vfloat32 zero_point1 = zero_point.vec1();
+    return {
+        Vectorized<float>{
+            (vecf0_0 - zero_point0) * scale_vec0,
+            (vecf1_0 - zero_point1) * scale_vec1},
+        Vectorized<float>{
+            (vecf0_1 - zero_point0) * scale_vec0,
+            (vecf1_1 - zero_point1) * scale_vec1},
+        Vectorized<float>{
+            (vecf0_2 - zero_point0) * scale_vec0,
+            (vecf1_2 - zero_point1) * scale_vec1},
+        Vectorized<float>{
+            (vecf0_3 - zero_point0) * scale_vec0,
+            (vecf1_3 - zero_point1) * scale_vec1}};
+  }
+
+  static Vectorized<c10::qint8> quantize(
+      const float_vec_return_type& rhs,
+      float scale,
+      int32_t zero_point,
+      float inverse_scale) {
+    // constexpr int32_t min_val = std::numeric_limits::min();
+    // constexpr int32_t max_val = std::numeric_limits::max();
+
+    vfloat32 inverse_scale_v = vec_splats(inverse_scale);
+    vfloat32 vec_zero_point = vec_splats((float)zero_point);
+    // vint32 vmin = vec_splats(min_val);
+    // vint32 vmax = vec_splats(max_val);
+
+    Vectorized<float> vf0 = rhs[0];
+    Vectorized<float> vf1 = rhs[1];
+    Vectorized<float> vf2 = rhs[2];
+    Vectorized<float> vf3 = rhs[3];
+    vfloat32 vecf0 = vf0.vec0();
+    vfloat32 vecf1 = vf0.vec1();
+    vfloat32 vecf2 = vf1.vec0();
+    vfloat32 vecf3 = vf1.vec1();
+
+    vfloat32 vecf4 = vf2.vec0();
+    vfloat32 vecf5 = vf2.vec1();
+    vfloat32 vecf6 = vf3.vec0();
+    vfloat32 vecf7 = vf3.vec1();
+
+    vecf0 = vec_mul(vecf0, inverse_scale_v);
+    vecf1 = vec_mul(vecf1, inverse_scale_v);
+    vecf2 = vec_mul(vecf2, inverse_scale_v);
+    vecf3 = vec_mul(vecf3, inverse_scale_v);
+
+    vecf4 = vec_mul(vecf4, inverse_scale_v);
+    vecf5 = vec_mul(vecf5, inverse_scale_v);
+    vecf6 = vec_mul(vecf6, inverse_scale_v);
+    vecf7 = vec_mul(vecf7, inverse_scale_v);
+
+    vecf0 = vec_add(vec_rint(vecf0), vec_zero_point);
+    vecf1 = vec_add(vec_rint(vecf1), vec_zero_point);
+    vecf2 = vec_add(vec_rint(vecf2), vec_zero_point);
+    vecf3 = vec_add(vec_rint(vecf3), vec_zero_point);
+
+    vecf4 = vec_add(vec_rint(vecf4), vec_zero_point);
+    vecf5 = vec_add(vec_rint(vecf5), vec_zero_point);
+    vecf6 = vec_add(vec_rint(vecf6), vec_zero_point);
+    vecf7 = vec_add(vec_rint(vecf7), vec_zero_point);
+
+    vint32 veci0 = vec_signed(vecf0);
+    vint32 veci1 = vec_signed(vecf1);
+    vint32 veci2 = vec_signed(vecf2);
+    vint32 veci3 = vec_signed(vecf3);
+
+    vint32 veci4 = vec_signed(vecf4);
+    vint32 veci5 = vec_signed(vecf5);
+    vint32 veci6 = vec_signed(vecf6);
+    vint32 veci7 = vec_signed(vecf7);
+
+    // veci0 = vec_min(vmax, vec_max( vmin, vecf0)) ;
+    // veci1 = vec_min(vmax, vec_max( vmin, vecf1)) ;
+    // veci2 = vec_min(vmax, vec_max( vmin, vecf2)) ;
+    // veci3 = vec_min(vmax, vec_max( vmin, vecf3)) ;
+
+    // veci4 = vec_min(vmax, vec_max( vmin, vecf4)) ;
+    // veci5 = vec_min(vmax, vec_max( vmin, vecf5)) ;
+    // veci6 = vec_min(vmax, vec_max( vmin, vecf6)) ;
+    // veci7 = vec_min(vmax, vec_max( vmin, vecf7)) ;
+    // vec_packs CLAMP already
+    vint16 vecshi0 = vec_packs(veci0, veci1);
+    vint16 vecshi1 = vec_packs(veci2, veci3);
+    vint16 vecshi2 = vec_packs(veci4, veci5);
+    vint16 vecshi3 = vec_packs(veci6, veci7);
+
+    vint8 vec0 = vec_packs(vecshi0, vecshi1);
+    vint8 vec1 = vec_packs(vecshi2, vecshi3);
+
+    return {vec0, vec1};
+  }
+
+  Vectorized<c10::qint8> C10_ALWAYS_INLINE relu(Vectorized<c10::qint8> zero_point) const {
+    return {vec_max(_vec0, zero_point._vec0), vec_max(_vec1, zero_point._vec1)};
+  }
+
+  Vectorized<c10::qint8> C10_ALWAYS_INLINE
+  relu6(Vectorized<c10::qint8> zero_point, Vectorized<c10::qint8> q_six) const {
+    vint8 max0 = vec_max(_vec0, zero_point._vec0);
+    vint8 max1 = vec_max(_vec1, zero_point._vec1);
+    return {vec_min(max0, q_six._vec0), vec_min(max1, q_six._vec1)};
+  }
+
+  int_vec_return_type widening_subtract(Vectorized<c10::qint8> b) const {
+    vint16 vecshi0 = vec_unpackh(_vec0);
+    vint16 vecBshi0 = vec_unpackh(b._vec0);
+    vint16 vecshi1 = vec_unpackl(_vec0);
+    vint16 vecBshi1 = vec_unpackl(b._vec0);
+
+    vint16 vecshi2 = vec_unpackh(_vec1);
+    vint16 vecBshi2 = vec_unpackh(b._vec1);
+    vint16 vecshi3 = vec_unpackl(_vec1);
+    vint16 vecBshi3 = vec_unpackl(b._vec1);
+
+    vint32 veci0 = vec_unpackh(vecshi0);
+    vint32 vecBi0 = vec_unpackh(vecBshi0);
+    vint32 veci1 = vec_unpackl(vecshi0);
+    vint32 vecBi1 = vec_unpackl(vecBshi0);
+
+    vint32 veci2 = vec_unpackh(vecshi1);
+    vint32 vecBi2 = vec_unpackh(vecBshi1);
+    vint32 veci3 = vec_unpackl(vecshi1);
+    vint32 vecBi3 = vec_unpackl(vecBshi1);
+
+    vint32 veci4 = vec_unpackh(vecshi2);
+    vint32 vecBi4 = vec_unpackh(vecBshi2);
+    vint32 veci5 = vec_unpackl(vecshi2);
+    vint32 vecBi5 = vec_unpackl(vecBshi2);
+
+    vint32 veci6 = vec_unpackh(vecshi3);
+    vint32 vecBi6 = vec_unpackh(vecBshi3);
+    vint32 veci7 = vec_unpackl(vecshi3);
+    vint32 vecBi7 = vec_unpackl(vecBshi3);
+
+    return {
+        Vectorized<c10::qint32>(veci0 - vecBi0, veci1 - vecBi1),
+        Vectorized<c10::qint32>(veci2 - vecBi2, veci3 - vecBi3),
+        Vectorized<c10::qint32>(veci4 - vecBi4, veci5 - vecBi5),
+        Vectorized<c10::qint32>(veci6 - vecBi6, veci7 - vecBi7)};
+  }
+
+  static Vectorized<c10::qint8> requantize_from_int(
+      const int_vec_return_type& inp,
+      float multiplier,
+      int32_t zero_point) {
+    vfloat32 vec_multiplier = vec_splats(multiplier);
+    vint32 vec_zero_point = vec_splats(zero_point);
+
+    Vectorized<c10::qint32> vi0 = inp[0];
+    Vectorized<c10::qint32> vi1 = inp[1];
+    Vectorized<c10::qint32> vi2 = inp[2];
+    Vectorized<c10::qint32> vi3 = inp[3];
+
+    vfloat32 vecf0 = vec_float(vi0.vec0());
+    vfloat32 vecf1 = vec_float(vi0.vec1());
+    vfloat32 vecf2 = vec_float(vi1.vec0());
+    vfloat32 vecf3 = vec_float(vi1.vec1());
+
+    vfloat32 vecf4 = vec_float(vi2.vec0());
+    vfloat32 vecf5 = vec_float(vi2.vec1());
+    vfloat32 vecf6 = vec_float(vi3.vec0());
+    vfloat32 vecf7 = vec_float(vi3.vec1());
+
+    vecf0 = vec_mul(vecf0, vec_multiplier);
+    vecf1 = vec_mul(vecf1, vec_multiplier);
+    vecf2 = vec_mul(vecf2, vec_multiplier);
+    vecf3 = vec_mul(vecf3, vec_multiplier);
+
+    vecf4 = vec_mul(vecf4, vec_multiplier);
+    vecf5 = vec_mul(vecf5, vec_multiplier);
+    vecf6 = vec_mul(vecf6, vec_multiplier);
+    vecf7 = vec_mul(vecf7, vec_multiplier);
+
+    vecf0 = vec_rint(vecf0);
+    vecf1 = vec_rint(vecf1);
+    vecf2 = vec_rint(vecf2);
+    vecf3 = vec_rint(vecf3);
+
+    vecf4 = vec_rint(vecf4);
+    vecf5 = vec_rint(vecf5);
+    vecf6 = vec_rint(vecf6);
+    vecf7 = vec_rint(vecf7);
+
+    vint32 veci0 = vec_signed(vecf0);
+    vint32 veci1 = vec_signed(vecf1);
+    vint32 veci2 = vec_signed(vecf2);
+    vint32 veci3 = vec_signed(vecf3);
+
+    vint32 veci4 = vec_signed(vecf4);
+    vint32 veci5 = vec_signed(vecf5);
+    vint32 veci6 = vec_signed(vecf6);
+    vint32 veci7 = vec_signed(vecf7);
+
+    veci0 = vec_add(veci0, vec_zero_point);
+    veci1 = vec_add(veci1, vec_zero_point);
+    veci2 = vec_add(veci2, vec_zero_point);
+    veci3 = vec_add(veci3, vec_zero_point);
+
+    veci4 = vec_add(veci4, vec_zero_point);
+    veci5 = vec_add(veci5, vec_zero_point);
+    veci6 = vec_add(veci6, vec_zero_point);
+    veci7 = vec_add(veci7, vec_zero_point);
+
+    vint16 vecshi0 = vec_packs(veci0, veci1);
+    vint16 vecshi1 = vec_packs(veci2, veci3);
+    vint16 vecshi2 = vec_packs(veci4, veci5);
+    vint16 vecshi3 = vec_packs(veci6, veci7);
+
+    vint8 vec0 = vec_packs(vecshi0, vecshi1);
+    vint8 vec1 = vec_packs(vecshi2, vecshi3);
+
+    return {vec0, vec1};
+  }
+
+  DEFINE_MEMBER_OP(operator==, c10::qint8, vec_cmpeq)
+  DEFINE_MEMBER_OP(operator!=, c10::qint8, vec_cmpne)
+  DEFINE_MEMBER_OP(operator<, c10::qint8, vec_cmplt)
+  DEFINE_MEMBER_OP(operator<=, c10::qint8, vec_cmple)
+  DEFINE_MEMBER_OP(operator>, c10::qint8, vec_cmpgt)
+  DEFINE_MEMBER_OP(operator>=, c10::qint8, vec_cmpge)
+  DEFINE_MEMBER_OP(operator+, c10::qint8, vec_add)
+  DEFINE_MEMBER_OP(operator-, c10::qint8, vec_sub)
+  DEFINE_MEMBER_OP(operator*, c10::qint8, vec_mul)
+  DEFINE_MEMBER_EMULATE_BINARY_OP(operator/, c10::qint8, /)
+  DEFINE_MEMBER_OP(maximum, c10::qint8, vec_max)
+  DEFINE_MEMBER_OP(minimum, c10::qint8, vec_min)
+  DEFINE_MEMBER_OP(operator&, c10::qint8, vec_and)
+  DEFINE_MEMBER_OP(operator|, c10::qint8, vec_or)
+  DEFINE_MEMBER_OP(operator^, c10::qint8, vec_xor)
+};
+
+template <>
+Vectorized<c10::qint8> inline maximum(
+    const Vectorized<c10::qint8>& a,
+    const Vectorized<c10::qint8>& b) {
+  return a.maximum(b);
+}
+
+template <>
+Vectorized<c10::qint8> inline minimum(
+    const Vectorized<c10::qint8>& a,
+    const Vectorized<c10::qint8>& b) {
+  return a.minimum(b);
+}
+} // namespace
+} // namespace vec
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h
new file mode 100644
index 0000000000000000000000000000000000000000..85a0e79400b833db8cc5cb053f53cb78cdad00a4
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h
@@ -0,0 +1,466 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+
+// This file defines Vectorized<> for the quantized types.
+//
+//
+// Currently, we simply use these classes as efficient converters between
+// the quantized types and Vectorized, usually in bandwidth-bound cases
+// where doing the arithmetic in full-precision is acceptable (e.g.
+// elementwise operators).
+//
+//
+// Conversions are as follows:
+//  Vectorized<quint8> -> 4x Vectorized<float>
+//
+// The size of the returned float vector is specified by the special
+// constexpr function float_num_vecs. The type of the value returned
+// from dequantize (and expected as an argument to quantize) is
+// specified by float_vec_return_type.
+//
+// When writing kernels with these vectors, it is expected that floating-
+// point operations will be carried out in a loop over Vectorized<T>::float_num_vecs
+// iterations.
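+//
+// For example (an illustrative sketch only, not part of this header), a
+// bandwidth-bound unary op would dequantize into four float vectors,
+// transform each, and quantize back, assuming src/dst point to quint8 data
+// and the quantization parameters are in scope:
+//
+//   auto qv = Vectorized<c10::quint8>::loadu(src);
+//   auto fvals = qv.dequantize(scale_v, zero_point_v);  // float_num_vecs() == 4
+//   for (auto& fv : fvals) {
+//     fv = fv * Vectorized<float>(0.5f);                // arbitrary float-side math
+//   }
+//   Vectorized<c10::quint8>::quantize(fvals, scale, zero_point, inverse_scale)
+//       .store(dst);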
+
+namespace at {
+namespace vec {
+inline namespace CPU_CAPABILITY {
+
+const vint16 mask_unsigned = vec_splats((short int)0xFF);
+template <>
+struct Vectorized<c10::quint8> {
+ private:
+  union {
+    struct {
+      vuint8 _vec0;
+      vuint8 _vec1;
+    };
+    struct {
+      vbool8 _vecb0;
+      vbool8 _vecb1;
+    };
+
+  } __attribute__((__may_alias__));
+
+ public:
+  Vectorized() {}
+  using size_type = int;
+  static constexpr size_type size() {
+    return 32;
+  }
+
+  static constexpr size_t float_num_vecs() {
+    return 4;
+  }
+  static constexpr int int_num_vecs() {
+    return 4;
+  }
+  using float_vec_return_type = std::array<Vectorized<float>, 4>;
+  using int_vec_return_type = std::array<Vectorized<c10::qint32>, 4>;
+  using value_type = typename c10::quint8::underlying;
+  using vec_internal_type = vuint8;
+  using vec_internal_mask_type = vbool8;
+  // Broadcast constructor
+  C10_ALWAYS_INLINE Vectorized(const c10::quint8& val)
+      : _vec0(vec_splats(val.val_)), _vec1(vec_splats(val.val_)) {}
+
+  C10_ALWAYS_INLINE Vectorized(const Vectorized<c10::quint8>& other)
+      : _vec0{other._vec0}, _vec1(other._vec1) {}
+
+  C10_ALWAYS_INLINE Vectorized(vuint8 v) : _vec0{v}, _vec1{v} {}
+  C10_ALWAYS_INLINE Vectorized(vbool8 vmask) : _vecb0{vmask}, _vecb1{vmask} {}
+  C10_ALWAYS_INLINE Vectorized(vuint8 v1, vuint8 v2) : _vec0{v1}, _vec1{v2} {}
+  C10_ALWAYS_INLINE Vectorized(vbool8 v1, vbool8 v2) : _vecb0{v1}, _vecb1{v2} {}
+
+  C10_ALWAYS_INLINE const vec_internal_type& vec0() const {
+    return _vec0;
+  }
+  C10_ALWAYS_INLINE const vec_internal_type& vec1() const {
+    return _vec1;
+  }
+
+  static C10_ALWAYS_INLINE Vectorized<c10::quint8> loadu(
+      const void* ptr,
+      int count = size()) {
+    if (count == size()) {
+      return {
+          vec_vsx_ld(offset0, reinterpret_cast<const value_type*>(ptr)),
+          vec_vsx_ld(offset16, reinterpret_cast<const value_type*>(ptr))};
+    }
+    __at_align__ value_type tmp_values[size()];
+    std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type));
+    return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)};
+  }
+  void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      vec_vsx_st(_vec0, offset0, reinterpret_cast<value_type*>(ptr));
+      vec_vsx_st(_vec1, offset16, reinterpret_cast<value_type*>(ptr));
+    } else if (count > 0) {
+      __at_align__ value_type tmp_values[size()];
+      vec_vsx_st(_vec0, offset0, tmp_values);
+      vec_vsx_st(_vec1, offset16, tmp_values);
+      std::memcpy(
+          ptr, tmp_values, std::min(count, size()) * sizeof(value_type));
+    }
+  }
+
+ public:
+  float_vec_return_type C10_ALWAYS_INLINE dequantize(
+      Vectorized<float> scale,
+      Vectorized<float> zero_point,
+      Vectorized<float> scale_zp_premul) const {
+    // unpacking unsigned as signed
+    vint16 vecshi0 = vec_unpackh((vint8)_vec0);
+    vint16 vecshi1 = vec_unpackl((vint8)_vec0);
+
+    vint16 vecshi2 = vec_unpackh((vint8)_vec1);
+    vint16 vecshi3 = vec_unpackl((vint8)_vec1);
+
+    // signed ->  unsigned
+    vecshi0 = vec_and(vecshi0, mask_unsigned);
+    vecshi1 = vec_and(vecshi1, mask_unsigned);
+
+    vecshi2 = vec_and(vecshi2, mask_unsigned);
+    vecshi3 = vec_and(vecshi3, mask_unsigned);
+
+    vint32 veci0 = vec_unpackh(vecshi0);
+    vint32 veci1 = vec_unpackl(vecshi0);
+
+    vint32 veci2 = vec_unpackh(vecshi1);
+    vint32 veci3 = vec_unpackl(vecshi1);
+
+    vint32 veci4 = vec_unpackh(vecshi2);
+    vint32 veci5 = vec_unpackl(vecshi2);
+
+    vint32 veci6 = vec_unpackh(vecshi3);
+    vint32 veci7 = vec_unpackl(vecshi3);
+
+    vfloat32 vecf0_0 = vec_float(veci0);
+    vfloat32 vecf1_0 = vec_float(veci1);
+
+    vfloat32 vecf0_1 = vec_float(veci2);
+    vfloat32 vecf1_1 = vec_float(veci3);
+
+    vfloat32 vecf0_2 = vec_float(veci4);
+    vfloat32 vecf1_2 = vec_float(veci5);
+
+    vfloat32 vecf0_3 = vec_float(veci6);
+    vfloat32 vecf1_3 = vec_float(veci7);
+    vfloat32 scale_vec0 = scale.vec0();
+    vfloat32 scale_vec1 = scale.vec1();
+    vfloat32 scale_zp_premul0 = scale_zp_premul.vec0();
+    vfloat32 scale_zp_premul1 = scale_zp_premul.vec1();
+    return {
+        Vectorized<float>{
+            vec_madd(scale_vec0, vecf0_0, scale_zp_premul0),
+            vec_madd(scale_vec1, vecf1_0, scale_zp_premul1)},
+        Vectorized<float>{
+            vec_madd(scale_vec0, vecf0_1, scale_zp_premul0),
+            vec_madd(scale_vec1, vecf1_1, scale_zp_premul1)},
+        Vectorized<float>{
+            vec_madd(scale_vec0, vecf0_2, scale_zp_premul0),
+            vec_madd(scale_vec1, vecf1_2, scale_zp_premul1)},
+        Vectorized<float>{
+            vec_madd(scale_vec0, vecf0_3, scale_zp_premul0),
+            vec_madd(scale_vec1, vecf1_3, scale_zp_premul1)}};
+  }
+
+  float_vec_return_type C10_ALWAYS_INLINE dequantize(
+      Vectorized<float> scale,
+      Vectorized<float> zero_point) const {
+    // unpacking unsigned as signed
+    vint16 vecshi0 = vec_unpackh((vint8)_vec0);
+    vint16 vecshi1 = vec_unpackl((vint8)_vec0);
+
+    vint16 vecshi2 = vec_unpackh((vint8)_vec1);
+    vint16 vecshi3 = vec_unpackl((vint8)_vec1);
+
+    // signed ->  unsigned
+    vecshi0 = vec_and(vecshi0, mask_unsigned);
+    vecshi1 = vec_and(vecshi1, mask_unsigned);
+
+    vecshi2 = vec_and(vecshi2, mask_unsigned);
+    vecshi3 = vec_and(vecshi3, mask_unsigned);
+
+    vint32 veci0 = vec_unpackh(vecshi0);
+    vint32 veci1 = vec_unpackl(vecshi0);
+
+    vint32 veci2 = vec_unpackh(vecshi1);
+    vint32 veci3 = vec_unpackl(vecshi1);
+
+    vint32 veci4 = vec_unpackh(vecshi2);
+    vint32 veci5 = vec_unpackl(vecshi2);
+
+    vint32 veci6 = vec_unpackh(vecshi3);
+    vint32 veci7 = vec_unpackl(vecshi3);
+
+    vfloat32 vecf0_0 = vec_float(veci0);
+    vfloat32 vecf1_0 = vec_float(veci1);
+
+    vfloat32 vecf0_1 = vec_float(veci2);
+    vfloat32 vecf1_1 = vec_float(veci3);
+
+    vfloat32 vecf0_2 = vec_float(veci4);
+    vfloat32 vecf1_2 = vec_float(veci5);
+
+    vfloat32 vecf0_3 = vec_float(veci6);
+    vfloat32 vecf1_3 = vec_float(veci7);
+    vfloat32 scale_vec0 = scale.vec0();
+    vfloat32 scale_vec1 = scale.vec1();
+    vfloat32 zero_point0 = zero_point.vec0();
+    vfloat32 zero_point1 = zero_point.vec1();
+    return {
+        Vectorized<float>{
+            (vecf0_0 - zero_point0) * scale_vec0,
+            (vecf1_0 - zero_point1) * scale_vec1},
+        Vectorized<float>{
+            (vecf0_1 - zero_point0) * scale_vec0,
+            (vecf1_1 - zero_point1) * scale_vec1},
+        Vectorized<float>{
+            (vecf0_2 - zero_point0) * scale_vec0,
+            (vecf1_2 - zero_point1) * scale_vec1},
+        Vectorized<float>{
+            (vecf0_3 - zero_point0) * scale_vec0,
+            (vecf1_3 - zero_point1) * scale_vec1}};
+  }
+
+  static Vectorized<c10::quint8> quantize(
+      const float_vec_return_type& rhs,
+      float scale,
+      int32_t zero_point,
+      float inverse_scale) {
+    // constexpr int32_t min_val = std::numeric_limits::min();
+    // constexpr int32_t max_val = std::numeric_limits::max();
+
+    vfloat32 vec_inverse = vec_splats(inverse_scale);
+    vfloat32 vec_zero_point = vec_splats((float)zero_point);
+    // vuint32 vmin = vec_splats(min_val);
+    // vuint32 vmax = vec_splats(max_val);
+    Vectorized<float> vf0 = rhs[0];
+    Vectorized<float> vf1 = rhs[1];
+    Vectorized<float> vf2 = rhs[2];
+    Vectorized<float> vf3 = rhs[3];
+    vfloat32 vecf0 = vf0.vec0();
+    vfloat32 vecf1 = vf0.vec1();
+    vfloat32 vecf2 = vf1.vec0();
+    vfloat32 vecf3 = vf1.vec1();
+
+    vfloat32 vecf4 = vf2.vec0();
+    vfloat32 vecf5 = vf2.vec1();
+    vfloat32 vecf6 = vf3.vec0();
+    vfloat32 vecf7 = vf3.vec1();
+
+    vecf0 = vec_mul(vecf0, vec_inverse);
+    vecf1 = vec_mul(vecf1, vec_inverse);
+    vecf2 = vec_mul(vecf2, vec_inverse);
+    vecf3 = vec_mul(vecf3, vec_inverse);
+
+    vecf4 = vec_mul(vecf4, vec_inverse);
+    vecf5 = vec_mul(vecf5, vec_inverse);
+    vecf6 = vec_mul(vecf6, vec_inverse);
+    vecf7 = vec_mul(vecf7, vec_inverse);
+
+    vecf0 = vec_add(vec_rint(vecf0), vec_zero_point);
+    vecf1 = vec_add(vec_rint(vecf1), vec_zero_point);
+    vecf2 = vec_add(vec_rint(vecf2), vec_zero_point);
+    vecf3 = vec_add(vec_rint(vecf3), vec_zero_point);
+
+    vecf4 = vec_add(vec_rint(vecf4), vec_zero_point);
+    vecf5 = vec_add(vec_rint(vecf5), vec_zero_point);
+    vecf6 = vec_add(vec_rint(vecf6), vec_zero_point);
+    vecf7 = vec_add(vec_rint(vecf7), vec_zero_point);
+
+    vint32 veci0 = vec_signed(vecf0);
+    vint32 veci1 = vec_signed(vecf1);
+    vint32 veci2 = vec_signed(vecf2);
+    vint32 veci3 = vec_signed(vecf3);
+
+    vint32 veci4 = vec_signed(vecf4);
+    vint32 veci5 = vec_signed(vecf5);
+    vint32 veci6 = vec_signed(vecf6);
+    vint32 veci7 = vec_signed(vecf7);
+
+    vint16 vecshi0 = vec_packs(veci0, veci1);
+    vint16 vecshi1 = vec_packs(veci2, veci3);
+    vint16 vecshi2 = vec_packs(veci4, veci5);
+    vint16 vecshi3 = vec_packs(veci6, veci7);
+
+    vuint8 vec0 = vec_packsu(vecshi0, vecshi1);
+    vuint8 vec1 = vec_packsu(vecshi2, vecshi3);
+
+    return {vec0, vec1};
+  }
+
+  Vectorized<c10::quint8> C10_ALWAYS_INLINE relu(Vectorized<c10::quint8> zero_point) const {
+    return {vec_max(_vec0, zero_point._vec0), vec_max(_vec1, zero_point._vec1)};
+  }
+
+  Vectorized<c10::quint8> C10_ALWAYS_INLINE
+  relu6(Vectorized<c10::quint8> zero_point, Vectorized<c10::quint8> q_six) const {
+    vuint8 max0 = vec_max(_vec0, zero_point._vec0);
+    vuint8 max1 = vec_max(_vec1, zero_point._vec1);
+    return {vec_min(max0, q_six._vec0), vec_min(max1, q_six._vec1)};
+  }
+
+  int_vec_return_type widening_subtract(Vectorized<c10::quint8> b) const {
+    vint16 vecshi0 = vec_unpackh((vint8)_vec0);
+    vint16 vecBshi0 = vec_unpackh((vint8)b._vec0);
+    vint16 vecshi1 = vec_unpackl((vint8)_vec0);
+    vint16 vecBshi1 = vec_unpackl((vint8)b._vec0);
+
+    vint16 vecshi2 = vec_unpackh((vint8)_vec1);
+    vint16 vecBshi2 = vec_unpackh((vint8)b._vec1);
+    vint16 vecshi3 = vec_unpackl((vint8)_vec1);
+    vint16 vecBshi3 = vec_unpackl((vint8)b._vec1);
+
+    vecshi0 = vec_and(vecshi0, mask_unsigned);
+    vecBshi0 = vec_and(vecBshi0, mask_unsigned);
+    vecshi1 = vec_and(vecshi1, mask_unsigned);
+    vecBshi1 = vec_and(vecBshi1, mask_unsigned);
+
+    vecshi2 = vec_and(vecshi2, mask_unsigned);
+    vecBshi2 = vec_and(vecBshi2, mask_unsigned);
+    vecshi3 = vec_and(vecshi3, mask_unsigned);
+    vecBshi3 = vec_and(vecBshi3, mask_unsigned);
+
+    vint32 veci0 = vec_unpackh(vecshi0);
+    vint32 vecBi0 = vec_unpackh(vecBshi0);
+    vint32 veci1 = vec_unpackl(vecshi0);
+    vint32 vecBi1 = vec_unpackl(vecBshi0);
+
+    vint32 veci2 = vec_unpackh(vecshi1);
+    vint32 vecBi2 = vec_unpackh(vecBshi1);
+    vint32 veci3 = vec_unpackl(vecshi1);
+    vint32 vecBi3 = vec_unpackl(vecBshi1);
+
+    vint32 veci4 = vec_unpackh(vecshi2);
+    vint32 vecBi4 = vec_unpackh(vecBshi2);
+    vint32 veci5 = vec_unpackl(vecshi2);
+    vint32 vecBi5 = vec_unpackl(vecBshi2);
+
+    vint32 veci6 = vec_unpackh(vecshi3);
+    vint32 vecBi6 = vec_unpackh(vecBshi3);
+    vint32 veci7 = vec_unpackl(vecshi3);
+    vint32 vecBi7 = vec_unpackl(vecBshi3);
+
+    return {
+        Vectorized<c10::qint32>(veci0 - vecBi0, veci1 - vecBi1),
+        Vectorized<c10::qint32>(veci2 - vecBi2, veci3 - vecBi3),
+        Vectorized<c10::qint32>(veci4 - vecBi4, veci5 - vecBi5),
+        Vectorized<c10::qint32>(veci6 - vecBi6, veci7 - vecBi7)};
+  }
+
+  static Vectorized<c10::quint8> requantize_from_int(
+      const int_vec_return_type& inp,
+      float multiplier,
+      int32_t zero_point) {
+    vfloat32 vec_multiplier = vec_splats(multiplier);
+    vint32 vec_zero_point = vec_splats(zero_point);
+
+    Vectorized<c10::qint32> vi0 = inp[0];
+    Vectorized<c10::qint32> vi1 = inp[1];
+    Vectorized<c10::qint32> vi2 = inp[2];
+    Vectorized<c10::qint32> vi3 = inp[3];
+
+    vfloat32 vecf0 = vec_float(vi0.vec0());
+    vfloat32 vecf1 = vec_float(vi0.vec1());
+    vfloat32 vecf2 = vec_float(vi1.vec0());
+    vfloat32 vecf3 = vec_float(vi1.vec1());
+
+    vfloat32 vecf4 = vec_float(vi2.vec0());
+    vfloat32 vecf5 = vec_float(vi2.vec1());
+    vfloat32 vecf6 = vec_float(vi3.vec0());
+    vfloat32 vecf7 = vec_float(vi3.vec1());
+
+    vecf0 = vec_mul(vecf0, vec_multiplier);
+    vecf1 = vec_mul(vecf1, vec_multiplier);
+    vecf2 = vec_mul(vecf2, vec_multiplier);
+    vecf3 = vec_mul(vecf3, vec_multiplier);
+
+    vecf4 = vec_mul(vecf4, vec_multiplier);
+    vecf5 = vec_mul(vecf5, vec_multiplier);
+    vecf6 = vec_mul(vecf6, vec_multiplier);
+    vecf7 = vec_mul(vecf7, vec_multiplier);
+
+    vecf0 = vec_rint(vecf0);
+    vecf1 = vec_rint(vecf1);
+    vecf2 = vec_rint(vecf2);
+    vecf3 = vec_rint(vecf3);
+
+    vecf4 = vec_rint(vecf4);
+    vecf5 = vec_rint(vecf5);
+    vecf6 = vec_rint(vecf6);
+    vecf7 = vec_rint(vecf7);
+
+    vint32 veci0 = vec_signed(vecf0);
+    vint32 veci1 = vec_signed(vecf1);
+    vint32 veci2 = vec_signed(vecf2);
+    vint32 veci3 = vec_signed(vecf3);
+
+    vint32 veci4 = vec_signed(vecf4);
+    vint32 veci5 = vec_signed(vecf5);
+    vint32 veci6 = vec_signed(vecf6);
+    vint32 veci7 = vec_signed(vecf7);
+
+    veci0 = vec_add(veci0, vec_zero_point);
+    veci1 = vec_add(veci1, vec_zero_point);
+    veci2 = vec_add(veci2, vec_zero_point);
+    veci3 = vec_add(veci3, vec_zero_point);
+
+    veci4 = vec_add(veci4, vec_zero_point);
+    veci5 = vec_add(veci5, vec_zero_point);
+    veci6 = vec_add(veci6, vec_zero_point);
+    veci7 = vec_add(veci7, vec_zero_point);
+
+    vint16 vecshi0 = vec_packs(veci0, veci1);
+    vint16 vecshi1 = vec_packs(veci2, veci3);
+    vint16 vecshi2 = vec_packs(veci4, veci5);
+    vint16 vecshi3 = vec_packs(veci6, veci7);
+
+    vuint8 vec0 = vec_packsu(vecshi0, vecshi1);
+    vuint8 vec1 = vec_packsu(vecshi2, vecshi3);
+
+    return {vec0, vec1};
+  }
+
+  DEFINE_MEMBER_OP(operator==, c10::quint8, vec_cmpeq)
+  DEFINE_MEMBER_OP(operator!=, c10::quint8, vec_cmpne)
+  DEFINE_MEMBER_OP(operator<, c10::quint8, vec_cmplt)
+  DEFINE_MEMBER_OP(operator<=, c10::quint8, vec_cmple)
+  DEFINE_MEMBER_OP(operator>, c10::quint8, vec_cmpgt)
+  DEFINE_MEMBER_OP(operator>=, c10::quint8, vec_cmpge)
+  DEFINE_MEMBER_OP(operator+, c10::quint8, vec_add)
+  DEFINE_MEMBER_OP(operator-, c10::quint8, vec_sub)
+  DEFINE_MEMBER_OP(operator*, c10::quint8, vec_mul)
+  DEFINE_MEMBER_EMULATE_BINARY_OP(operator/, c10::quint8, /)
+  DEFINE_MEMBER_OP(maximum, c10::quint8, vec_max)
+  DEFINE_MEMBER_OP(minimum, c10::quint8, vec_min)
+  DEFINE_MEMBER_OP(operator&, c10::quint8, vec_and)
+  DEFINE_MEMBER_OP(operator|, c10::quint8, vec_or)
+  DEFINE_MEMBER_OP(operator^, c10::quint8, vec_xor)
+};
+
+template <>
+Vectorized<c10::quint8> inline maximum(
+    const Vectorized<c10::quint8>& a,
+    const Vectorized<c10::quint8>& b) {
+  return a.maximum(b);
+}
+
+template <>
+Vectorized<c10::quint8> inline minimum(
+    const Vectorized<c10::quint8>& a,
+    const Vectorized<c10::quint8>& b) {
+  return a.minimum(b);
+}
+
+} // namespace
+} // namespace vec
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vsx_helpers.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vsx_helpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..c48f9fae148f123506ee82719f7a683c270836bd
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/vsx/vsx_helpers.h
@@ -0,0 +1,474 @@
+#pragma once
+#include 
+#include 
+#include 
+
+#if defined(__clang__)
+typedef __vector __bool char vbool8;
+typedef __vector __bool short vbool16;
+typedef __vector __bool int vbool32;
+typedef __vector __bool long long vbool64;
+using vint8    = __attribute__((vector_size(16))) signed char;
+using vint16   = __attribute__((vector_size(16))) signed short;
+using vint32   = __attribute__((vector_size(16))) signed int;
+using vint64   = __attribute__((vector_size(16))) signed long long;
+using vuint8   = __attribute__((vector_size(16))) unsigned char;
+using vuint16  = __attribute__((vector_size(16))) unsigned short;
+using vuint32  = __attribute__((vector_size(16))) unsigned int;
+using vuint64  = __attribute__((vector_size(16))) unsigned long long;
+using vfloat32 = __attribute__((vector_size(16))) float;
+using vfloat64 = __attribute__((vector_size(16))) double;
+#else
+using vbool8   =  __attribute__((altivec(vector__))) __attribute__((altivec(bool__))) char;
+using vbool16  =  __attribute__((altivec(vector__))) __attribute__((altivec(bool__))) short;
+using vbool32  =  __attribute__((altivec(vector__))) __attribute__((altivec(bool__))) int;
+using vbool64  =  __attribute__((altivec(vector__))) __attribute__((altivec(bool__))) long long;
+using vint8    =  __attribute__((altivec(vector__)))  signed char;
+using vint16   =  __attribute__((altivec(vector__)))  signed short;
+using vint32   =  __attribute__((altivec(vector__)))  signed int;
+using vint64   =  __attribute__((altivec(vector__)))  signed long long;
+using vuint8   =  __attribute__((altivec(vector__)))  unsigned char;
+using vuint16  =  __attribute__((altivec(vector__)))  unsigned short;
+using vuint32  =  __attribute__((altivec(vector__)))  unsigned  int;
+using vuint64  =  __attribute__((altivec(vector__)))  unsigned long long;
+using vfloat32 =  __attribute__((altivec(vector__)))  float;
+using vfloat64 =  __attribute__((altivec(vector__)))  double;
+#endif
+
+#if !defined(vec_float)
+C10_ALWAYS_INLINE vfloat32 vec_float(const vint32& vec_in) {
+  vfloat32 vec_out;
+  __asm__("xvcvsxwsp %x0,%x1" : "=wf"(vec_out) : "wa"(vec_in));
+  return vec_out;
+}
+#endif
+
+#if !defined(vec_signed)
+C10_ALWAYS_INLINE vint32 vec_signed(const vfloat32& vec_in) {
+  vint32 vec_out;
+  __asm__("xvcvspsxws %x0,%x1" : "=wa"(vec_out) : "wf"(vec_in));
+  return vec_out;
+}
+
+C10_ALWAYS_INLINE vint64 vec_signed(const vfloat64& vec_in) {
+  vint64 vec_out;
+  __asm__("xvcvdpsxds %x0,%x1" : "=wa"(vec_out) : "wd"(vec_in));
+  return vec_out;
+}
+#endif
+
+#if !defined(vec_neg)
+C10_ALWAYS_INLINE vfloat32 vec_neg(const vfloat32& vec_in) {
+  vfloat32 vec_out;
+  __asm__("xvnegsp %x0,%x1" : "=wf"(vec_out) : "wf"(vec_in));
+  return vec_out;
+}
+
+C10_ALWAYS_INLINE vfloat64 vec_neg(const vfloat64& vec_in) {
+  vfloat64 vec_out;
+  __asm__("xvnegdp %x0,%x1" : "=wd"(vec_out) : "wd"(vec_in));
+  return vec_out;
+}
+
+C10_ALWAYS_INLINE vint16 vec_neg(const vint16& vec_in) {
+  vint16 vint0 = {0, 0, 0, 0 ,0, 0, 0, 0};
+  return vec_vsubuhm(vint0, vec_in);
+}
+
+C10_ALWAYS_INLINE vint32 vec_neg(const vint32& vec_in) {
+  vint32 vint0 = {0, 0, 0, 0};
+  return vec_vsubuwm(vint0, vec_in);
+}
+
+C10_ALWAYS_INLINE vint64 vec_neg(const vint64& vec_in) {
+  return -vec_in;
+}
+#endif
+
+#if !defined(vec_sldw)
+template <unsigned int C>
+C10_ALWAYS_INLINE vfloat32
+vec_sldw_aux(const vfloat32& vec_in0, const vfloat32& vec_in1) {
+  vfloat32 vec_out;
+  __asm("xxsldwi %x0, %x1, %x2, %3 "
+        : "=wa"(vec_out)
+        : "wa"(vec_in0), "wa"(vec_in1), "I"(C));
+  return vec_out;
+}
+
+#define vec_sldw(a, b, c) vec_sldw_aux<c>(a, b)
+#endif
+
+#define vec_not(a) vec_nor(a, a)
+#if defined(__clang__) && !defined(vec_splats)
+C10_ALWAYS_INLINE vint64 vec_splats(const int64_t& a) {
+  return vec_splats(a);
+}
+#endif
+// Vectorized min/max which return a if any operand is nan
+template <typename T>
+C10_ALWAYS_INLINE T vec_min_nan(const T& a, const T& b) {
+  return vec_min(a, b);
+}
+template <typename T>
+C10_ALWAYS_INLINE T vec_max_nan(const T& a, const T& b) {
+  return vec_max(a, b);
+}
+
+// Specializations for float/double taken from Eigen
+template<>
+C10_ALWAYS_INLINE vfloat32 vec_min_nan<vfloat32>(const vfloat32& a, const vfloat32& b)
+{
+  // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
+  vfloat32 ret;
+  __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+  return ret;
+}
+// Specializations for float/double taken from Eigen
+template<>
+C10_ALWAYS_INLINE vfloat32 vec_max_nan<vfloat32>(const vfloat32& a, const vfloat32& b)
+{
+  // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
+  vfloat32 ret;
+  __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+  return ret;
+}
+
+template<>
+C10_ALWAYS_INLINE vfloat64 vec_min_nan<vfloat64>(const vfloat64& a, const vfloat64& b)
+{
+  // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
+  vfloat64 ret;
+  __asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+  return ret;
+}
+template<>
+C10_ALWAYS_INLINE vfloat64 vec_max_nan<vfloat64>(const vfloat64& a, const vfloat64& b)
+{
+  // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
+  vfloat64 ret;
+  __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+  return ret;
+}
+
+// Vectorizes min/max function which returns nan if any side is nan
+#define C10_VSX_VEC_NAN_PROPAG(name, type, btype, func)       \
+  C10_ALWAYS_INLINE type name(const type& a, const type& b) { \
+    type tmp = func(a, b);                                    \
+    btype nan_a = vec_cmpne(a, a);                            \
+    btype nan_b = vec_cmpne(b, b);                            \
+    tmp = vec_sel(tmp, a, nan_a);                             \
+    return vec_sel(tmp, b, nan_b);                            \
+  }
+
+C10_VSX_VEC_NAN_PROPAG(vec_min_nan2, vfloat32, vbool32, vec_min)
+C10_VSX_VEC_NAN_PROPAG(vec_max_nan2, vfloat32, vbool32, vec_max)
+C10_VSX_VEC_NAN_PROPAG(vec_min_nan2, vfloat64, vbool64, vec_min)
+C10_VSX_VEC_NAN_PROPAG(vec_max_nan2, vfloat64, vbool64, vec_max)
+
+#undef C10_VSX_VEC_NAN_PROPAG
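+
+// For example (illustrative values only): with a = {1.f, NaN, 3.f, 4.f} and
+// b = {NaN, 2.f, 0.f, 5.f},
+//   vec_min_nan(a, b)  -> {1.f, NaN, 0.f, 4.f}   (falls back to a on any NaN)
+//   vec_min_nan2(a, b) -> {NaN, NaN, 0.f, 4.f}   (propagates NaN from either side)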
+
+#define DEFINE_MEMBER_UNARY_OP(op, op_type, func)              \
+  Vectorized<op_type> C10_ALWAYS_INLINE op() const {           \
+    return Vectorized<op_type>{func(_vec0), func(_vec1)};      \
+  }
+
+#define DEFINE_MEMBER_OP(op, op_type, func)                                           \
+  Vectorized<op_type> C10_ALWAYS_INLINE op(const Vectorized<op_type>& other) const {  \
+    return Vectorized<op_type>{                                                       \
+        func(_vec0, other._vec0), func(_vec1, other._vec1)};                          \
+  }
+
+#define DEFINE_MEMBER_BITWISE_OP(op, op_type, func)                                   \
+  Vectorized<op_type> C10_ALWAYS_INLINE op(const Vectorized<op_type>& other) const {  \
+    return Vectorized<op_type>{                                                       \
+        func(_vecb0, other._vecb0), func(_vecb1, other._vecb1)};                      \
+  }
+
+#define DEFINE_MEMBER_TERNARY_OP(op, op_type, func)                        \
+  Vectorized<op_type> C10_ALWAYS_INLINE op(                                \
+      const Vectorized<op_type>& b, const Vectorized<op_type>& c) const {  \
+    return Vectorized<op_type>{                                            \
+        func(_vec0, b._vec0, c._vec0), func(_vec1, b._vec1, c._vec1)};     \
+  }
+
+#define DEFINE_MEMBER_EMULATE_BINARY_OP(op, op_type, binary_op)                    \
+  Vectorized<op_type> C10_ALWAYS_INLINE op(const Vectorized<op_type>& b) const {   \
+    Vectorized<op_type>::vec_internal_type ret_0;                                  \
+    Vectorized<op_type>::vec_internal_type ret_1;                                  \
+    for (int i = 0; i < Vectorized<op_type>::size() / 2; i++) {                    \
+      ret_0[i] = _vec0[i] binary_op b._vec0[i];                                    \
+      ret_1[i] = _vec1[i] binary_op b._vec1[i];                                    \
+    }                                                                              \
+    return Vectorized<op_type>{ret_0, ret_1};                                      \
+  }
+
+
+#define DEFINE_MEMBER_OP_AND_ONE(op, op_type, func)                                   \
+  Vectorized<op_type> C10_ALWAYS_INLINE op(const Vectorized<op_type>& other) const {  \
+    using vvtype = Vectorized<op_type>::vec_internal_type;                            \
+    const vvtype v_one = vec_splats(static_cast<op_type>(1.0));                       \
+    vvtype ret0 = (vvtype)func(_vec0, other._vec0);                                   \
+    vvtype ret1 = (vvtype)func(_vec1, other._vec1);                                   \
+    return Vectorized<op_type>{vec_and(ret0, v_one), vec_and(ret1, v_one)};           \
+  }
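+
+// For example (illustrative expansion only), inside Vectorized<int64_t> the line
+//   DEFINE_MEMBER_OP(operator+, int64_t, vec_add)
+// from the macros above expands to roughly:
+//   Vectorized<int64_t> operator+(const Vectorized<int64_t>& other) const {
+//     return Vectorized<int64_t>{vec_add(_vec0, other._vec0),
+//                                vec_add(_vec1, other._vec1)};
+//   }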
+
+#define DEFINE_CLAMP_FUNCS(operand_type)                                        \
+  template <>                                                                   \
+  Vectorized<operand_type> C10_ALWAYS_INLINE clamp(                             \
+      const Vectorized<operand_type>& a,                                        \
+      const Vectorized<operand_type>& min,                                      \
+      const Vectorized<operand_type>& max) {                                    \
+    return Vectorized<operand_type>{                                            \
+        vec_min_nan(vec_max_nan(a.vec0(), min.vec0()), max.vec0()),             \
+        vec_min_nan(vec_max_nan(a.vec1(), min.vec1()), max.vec1())};            \
+  }                                                                             \
+  template <>                                                                   \
+  Vectorized<operand_type> C10_ALWAYS_INLINE clamp_min(                         \
+      const Vectorized<operand_type>& a, const Vectorized<operand_type>& min) { \
+    return Vectorized<operand_type>{                                            \
+        vec_max_nan(a.vec0(), min.vec0()),                                      \
+        vec_max_nan(a.vec1(), min.vec1())};                                     \
+  }                                                                             \
+  template <>                                                                   \
+  Vectorized<operand_type> C10_ALWAYS_INLINE clamp_max(                         \
+      const Vectorized<operand_type>& a, const Vectorized<operand_type>& max) { \
+    return Vectorized<operand_type>{                                            \
+        vec_min_nan(a.vec0(), max.vec0()),                                      \
+        vec_min_nan(a.vec1(), max.vec1())};                                     \
+  }
+
+#define DEFINE_REINTERPRET_CAST_FUNCS(                                 \
+    first_type, cast_type, cast_inner_vector_type)                     \
+  template <>                                                          \
+  C10_ALWAYS_INLINE Vectorized<cast_type> cast<cast_type, first_type>( \
+      const Vectorized<first_type>& src) {                             \
+    return Vectorized<cast_type>{(cast_inner_vector_type)src.vec0(),   \
+                                 (cast_inner_vector_type)src.vec1()};  \
+  }
+
+#define DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(first_type)     \
+  DEFINE_REINTERPRET_CAST_FUNCS(first_type, double, vfloat64)    \
+  DEFINE_REINTERPRET_CAST_FUNCS(first_type, float, vfloat32)     \
+  DEFINE_REINTERPRET_CAST_FUNCS(first_type, int64_t, vint64) \
+  DEFINE_REINTERPRET_CAST_FUNCS(first_type, int32_t, vint32)   \
+  DEFINE_REINTERPRET_CAST_FUNCS(first_type, int16_t, vint16)
+
+// it can be used to emulate blend faster
+constexpr int blendChoice(uint32_t mask, uint32_t half1 = 0xF, uint32_t half2 = 0xF0) {
+  uint32_t none = 0;
+  uint32_t both = half1 | half2;
+  // clamp it between 0 and both
+  mask = mask & both;
+  // return  (a._vec0, a._vec1)
+  if (mask == none) return 0;
+  // return (b._vec0,b._vec1)
+  else if (mask == both)
+    return 1;
+  // return  (b._vec0,a._vec1)
+  else if (mask == half1)
+    return 2;
+  // return  (a._vec0,b._vec1)
+  else if (mask == half2)
+    return 3;
+  // return  (*_vec0,a._vec1)
+  else if (mask > 0 && mask < half1)
+    return 4;
+  // return  (*_vec0,b._vec1)
+  else if ((mask & half2) == half2)
+    return 5;
+  // return (a._vec0,*_vec1)
+  else if ((mask & half1) == 0 && mask > half1)
+    return 6;
+  // return (b._vec0,*_vec1)
+  else if ((mask & half1) == half1 && mask > half1)
+    return 7;
+  // return (*_vec0,*_vec1)
+  return 8;
+}
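+
+// For example (illustrative values only): blendChoice(0x00) == 0 (keep a
+// entirely), blendChoice(0xFF) == 1 (take b entirely), blendChoice(0x0F) == 2
+// (b for _vec0, a for _vec1), and blendChoice(0x5A) == 8 (per-element blend
+// needed in both halves).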
+
+// it can be used to emulate blend faster
+constexpr int blendChoiceDbl(uint32_t mask) {
+  // clamp it 0 and 0xF
+  return blendChoice(mask, 0x3, 0xC);
+}
+
+constexpr vbool32 VsxMask1(uint32_t mask) {
+  uint32_t g0 = (mask & 1) * 0xffffffff;
+  uint32_t g1 = ((mask & 2) >> 1) * 0xffffffff;
+  uint32_t g2 = ((mask & 4) >> 2) * 0xffffffff;
+  uint32_t g3 = ((mask & 8) >> 3) * 0xffffffff;
+  return (vbool32){g0, g1, g2, g3};
+}
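+
+// For example (illustrative values only): VsxMask1(0b0101) yields
+// {0xffffffff, 0x0, 0xffffffff, 0x0}, i.e. lanes 0 and 2 selected.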
+
+constexpr vbool32 VsxMask2(uint32_t mask) {
+  uint32_t mask2 = (mask & 0xFF) >> 4;
+  return VsxMask1(mask2);
+}
+
+constexpr vbool64 VsxDblMask1(uint32_t mask) {
+  uint64_t g0 = (mask & 1) * 0xffffffffffffffff;
+  uint64_t g1 = ((mask & 2) >> 1) * 0xffffffffffffffff;
+  return (vbool64){g0, g1};
+}
+
+constexpr vbool64 VsxDblMask2(uint32_t mask) {
+  uint32_t mask2 = (mask & 0xF) >> 2;
+  return VsxDblMask1(mask2);
+}
+
+constexpr int maskForComplex(uint32_t mask) {
+  mask = mask & 0xF;
+  int complex_mask = 0;
+  if (mask & 1) complex_mask |= 3;
+  if (mask & 2) complex_mask |= (3 << 2);
+  if (mask & 4) complex_mask |= (3 << 4);
+  if (mask & 8) complex_mask |= (3 << 6);
+  return complex_mask;
+}
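+
+// For example (illustrative values only): maskForComplex(0b01) == 0b11 (select
+// the first complex element, i.e. both of its float lanes) and
+// maskForComplex(0b10) == 0b1100 (select the second complex element).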
+
+constexpr int maskForComplexDbl(uint32_t mask) {
+  mask = mask & 0x3;
+  int complex_mask = 0;
+  if (mask & 1) complex_mask |= 3;
+  if (mask & 2) complex_mask |= (3 << 2);
+  return complex_mask;
+}
+
+constexpr int blendChoiceComplex(uint32_t mask) {
+  return blendChoice(maskForComplex(mask));
+}
+
+constexpr int blendChoiceComplexDbl(uint32_t mask) {
+  return blendChoiceDbl(maskForComplexDbl(mask));
+}
+
+constexpr vbool32 VsxComplexMask1(uint32_t mask) {
+  return VsxMask1(maskForComplex(mask));
+}
+
+constexpr vbool32 VsxComplexMask2(uint32_t mask) {
+  uint32_t mask2 = (mask & 0xF) >> 2;
+  return VsxMask1(maskForComplex(mask2));
+}
+
+constexpr vbool64 VsxComplexDblMask1(uint32_t mask) { return VsxDblMask1(mask); }
+
+constexpr vbool64 VsxComplexDblMask2(uint32_t mask) {
+  uint32_t mask2 = (mask & 0xF) >> 2;
+  return VsxDblMask1(mask2);
+}
+
+// constants
+namespace at {
+namespace vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+//
+constexpr int offset0 = 0;
+constexpr int offset16 = 16;
+
+// #Constants
+const vuint8 mask_zero_bits = vuint8{128, 128, 128, 128, 128, 128, 128, 128,
+                                128, 128, 128, 128, 96,  64,  32,  0};
+
+const vuint8 swap_mask =
+    vuint8{4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11};
+
+const vint32 v0x7f = vec_splats(0x7f);
+const vint32 vi_0 = vec_splats((int)(0));
+const vint32 vi_1 = vec_splats((int)1);
+const vint32 vi_2 = vec_splats((int)2);
+const vint32 vi_4 = vec_splats((int)4);
+const vint32 vi_inv1 = vec_splats((int)~1);
+const vuint32 vu_29 = vec_splats(29u);
+const vuint32 vu_23 = vec_splats(23u);
+
+const vbool32 inv_mant_mask = (vbool32)vec_splats((unsigned int)~0xff800000);
+const vbool32 sign_mask = (vbool32)vec_splats((int)0x80000000);
+const vbool32 real_mask = vbool32{0xFFFFFFFF, 0x0, 0xFFFFFFFF, 0x0};
+const vbool32 imag_mask = vbool32{0x0, 0xFFFFFFFF, 0x0, 0xFFFFFFFF};
+const vbool32 isign_mask = vbool32{0x0, 0x80000000, 0x0, 0x80000000};
+const vbool32 rsign_mask = vbool32{0x80000000, 0x0, 0x80000000, 0x0};
+
+const vbool64 vd_sign_mask  = vbool64{0x8000000000000000, 0x8000000000000000};
+const vbool64 vd_imag_mask  = vbool64{0x0, 0xFFFFFFFFFFFFFFFF};
+const vbool64 vd_real_mask  = vbool64{0xFFFFFFFFFFFFFFFF, 0x0};
+const vbool64 vd_isign_mask = vbool64{0x0, 0x8000000000000000};
+const vbool64 vd_rsign_mask = vbool64{0x8000000000000000, 0x0};
+
+const vfloat32 zero = vec_splats(0.f);
+const vfloat32 half = vec_splats(0.5f);
+const vfloat32 one = vec_splats(1.f);
+const vfloat32 two = vec_splats(2.0f);
+const vfloat32 _4div_pi = vec_splats(1.27323954473516f);
+const vfloat32 v_inf = (vfloat32)vec_splats(0x7f800000u);
+const vfloat32 v_minus_inf = vfloat32{ 0xff800000u, 0xff800000u, 0xff800000u, 0xff800000u };
+const vfloat32 v_nan = (vfloat32)vec_splats(0x7fffffff);
+const vfloat32 log10e_inv = vec_splats(0.43429448190325176f);
+const vfloat32 log2e_inv = vec_splats(1.4426950408889634f);
+const vfloat32 log2eB_inv = vec_splats(1.442695036924675f);
+const vfloat32 cephes_SQRTHF = vec_splats(0.707106781186547524f);
+const vfloat32 coscof_p0 = vec_splats(2.443315711809948E-005f);
+const vfloat32 coscof_p1 = vec_splats(-1.388731625493765E-003f);
+const vfloat32 coscof_p2 = vec_splats(4.166664568298827E-002f);
+const vfloat32 exp_hi = vec_splats(104.f);
+const vfloat32 exp_lo = vec_splats(-104.f);
+const vfloat32 exp_p0 = vec_splats(0.000198527617612853646278381f);
+const vfloat32 exp_p1 = vec_splats((0.00139304355252534151077271f));
+const vfloat32 exp_p2 = vec_splats(0.00833336077630519866943359f);
+const vfloat32 exp_p3 = vec_splats(0.0416664853692054748535156f);
+const vfloat32 exp_p4 = vec_splats(0.166666671633720397949219f);
+const vfloat32 exp_p5 = vec_splats(0.5f);
+const vfloat32 log_p0 = vec_splats(7.0376836292E-2f);
+const vfloat32 log_p1 = vec_splats(-1.1514610310E-1f);
+const vfloat32 log_p2 = vec_splats(1.1676998740E-1f);
+const vfloat32 log_p3 = vec_splats(-1.2420140846E-1f);
+const vfloat32 log_p4 = vec_splats(+1.4249322787E-1f);
+const vfloat32 log_p5 = vec_splats(-1.6668057665E-1f);
+const vfloat32 log_p6 = vec_splats(+2.0000714765E-1f);
+const vfloat32 log_p7 = vec_splats(-2.4999993993E-1f);
+const vfloat32 log_p8 = vec_splats(+3.3333331174E-1f);
+const vfloat32 log_q1 = vec_splats(-2.12194440e-4f);
+const vfloat32 log_q2 = vec_splats(0.693359375f);
+const vfloat32 max_logf = vec_splats(88.02969187150841f);
+const vfloat32 max_numf = vec_splats(1.7014117331926442990585209174225846272e38f);
+const vfloat32 min_inf = (vfloat32)vec_splats(0xff800000u);
+const vfloat32 min_norm_pos = (vfloat32)vec_splats(0x0800000u);
+const vfloat32 minus_cephes_dp1 = vec_splats(-0.78515625f);
+const vfloat32 minus_cephes_dp2 = vec_splats(-2.4187564849853515625e-4f);
+const vfloat32 minus_cephes_dp3 = vec_splats(-3.77489497744594108e-8f);
+const vfloat32 negln2f_hi = vec_splats(-0.693145751953125f);
+const vfloat32 negln2f_lo = vec_splats(-1.428606765330187045e-06f);
+const vfloat32 p0 = vec_splats(2.03721912945E-4f);
+const vfloat32 p1 = vec_splats(8.33028376239E-3f);
+const vfloat32 p2 = vec_splats(1.66667160211E-1f);
+const vfloat32 sincof_p0 = vec_splats(-1.9515295891E-4f);
+const vfloat32 sincof_p1 = vec_splats(8.3321608736E-3f);
+const vfloat32 sincof_p2 = vec_splats(-1.6666654611E-1f);
+const vfloat32 tanh_0p625 = vec_splats(0.625f);
+const vfloat32 tanh_half_max = vec_splats(44.014845935754205f);
+const vfloat32 tanh_p0 = vec_splats(-5.70498872745E-3f);
+const vfloat32 tanh_p1 = vec_splats(2.06390887954E-2f);
+const vfloat32 tanh_p2 = vec_splats(-5.37397155531E-2f);
+const vfloat32 tanh_p3 = vec_splats(1.33314422036E-1f);
+const vfloat32 tanh_p4 = vec_splats(-3.33332819422E-1f);
+const vfloat32 vcheck = vec_splats((float)(1LL << 24));
+const vfloat32 imag_one = vfloat32{0.f, 1.f, 0.f, 1.f};
+const vfloat32 imag_half = vfloat32{0.f, 0.5f, 0.f, 0.5f};
+const vfloat32 sqrt2_2 = vfloat32{0.70710676908493042f, 0.70710676908493042,
+                          0.70710676908493042, 0.70710676908493042};
+const vfloat32 pi_2 = vfloat32{M_PI / 2, 0.0, M_PI / 2, 0.0};
+const vfloat32 vf_89 = vfloat32{89.f, 89.f, 89.f, 89.f};
+const vfloat64 vd_one = vec_splats(1.0);
+const vfloat64 vd_zero = vec_splats(0.0);
+const vfloat64 vd_log10e_inv = vec_splats(0.43429448190325176);
+const vfloat64 vd_log2e_inv = vec_splats(1.4426950408889634);
+const vfloat64 vd_imag_one = vfloat64{0.0, 1.0};
+const vfloat64 vd_imag_half = vfloat64{0.0, 0.5};
+const vfloat64 vd_sqrt2_2 = vfloat64{0.70710678118654757, 0.70710678118654757};
+const vfloat64 vd_pi_2 = vfloat64{M_PI / 2.0, 0.0};
+
+} // namespace
+} // namespace vec
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/zarch/vec256_zarch.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/zarch/vec256_zarch.h
new file mode 100644
index 0000000000000000000000000000000000000000..6284bfa6735f77a0e7d29937e0e5cbe057e69108
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec256/zarch/vec256_zarch.h
@@ -0,0 +1,2818 @@
+#include <cmath>
+#include <cstring>
+#include <limits>
+#include <type_traits>
+#include <utility>
+#if defined(__clang__)
+#include <sleef.h>
+#elif defined(__GNUC__) || defined(__GNUG__)
+#include <sleef.h>
+#include <vecintrin.h>
+#endif
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/complex.h>
+
+#define SLEEF_MEMORY_WORKAROUND
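+// When SLEEF_MEMORY_WORKAROUND is defined, sin/cos/tan below fall back to
+// element-wise std:: implementations (mapOrdinary) instead of the SLEEF
+// vector kernels.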
+
+namespace at {
+namespace vec {
+
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+template <typename T>
+constexpr bool is_zarch_implemented() {
+  return (
+      std::is_same<T, float>::value || std::is_same<T, double>::value ||
+      std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value ||
+      std::is_same<T, uint16_t>::value || std::is_same<T, int16_t>::value ||
+      std::is_same<T, int32_t>::value || std::is_same<T, int64_t>::value);
+}
+
+template <typename T>
+constexpr bool is_zarch_implemented_quant() {
+  return (
+      std::is_same<T, c10::qint32>::value ||
+      std::is_same<T, c10::qint8>::value ||
+      std::is_same<T, c10::quint8>::value);
+}
+
+template <typename T>
+constexpr bool is_zarch_implemented_complex() {
+  return std::is_same<T, c10::complex<float>>::value ||
+      std::is_same<T, c10::complex<double>>::value;
+}
+
+constexpr int offset0 = 0;
+constexpr int offset16 = 16;
+
+template <int N>
+struct VecBinaryType {
+  using type __attribute__((vector_size(16))) = uintmax_t;
+};
+
+template <>
+struct VecBinaryType<8> {
+  using type = __attribute__((vector_size(16))) unsigned long long;
+};
+
+template <>
+struct VecBinaryType<4> {
+  using type = __attribute__((vector_size(16))) unsigned int;
+};
+
+template <>
+struct VecBinaryType<2> {
+  using type = __attribute__((vector_size(16))) unsigned short;
+};
+
+template <>
+struct VecBinaryType<1> {
+  using type = __attribute__((vector_size(16))) unsigned char;
+};
+
+template <typename T>
+struct VecInnerType {
+  using Type __attribute__((vector_size(16))) = T;
+  using BinaryType = typename VecBinaryType<sizeof(T)>::type;
+  using ElementType = T;
+  static constexpr int size = 16 / sizeof(T);
+};
+
+// define for int64_t properly for load
+template <>
+struct VecInnerType<int64_t> {
+  using Type = __attribute__((vector_size(16))) signed long long;
+  using ElementType = signed long long;
+  using BinaryType = typename VecBinaryType<sizeof(signed long long)>::type;
+  static constexpr int size = 16 / sizeof(signed long long);
+};
+
+template <typename T>
+using ZSimdVect = typename VecInnerType<T>::Type;
+template <typename T>
+using ZSimdVectBinary = typename VecInnerType<T>::BinaryType;
+template <typename T>
+using ZSimdVectElement = typename VecInnerType<T>::ElementType;
+
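+// Maps a blend mask onto one of nine strategies: copy a, copy b, take whole
+// 16-byte halves from either operand, or build a per-lane vec_sel mask for
+// one or both halves (see the blend() overloads further down).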
+constexpr int blendChoiceInner(
+    const uint64_t mask,
+    const uint64_t half1 = 0xF,
+    const uint64_t half2 = 0xF0) {
+  uint64_t none = 0;
+  uint64_t both = half1 | half2;
+  // clamp it between 0 and both
+  auto res_mask = mask & both;
+  // return  (a._vec0, a._vec1)
+  if (res_mask == none)
+    return 0;
+  // return (b._vec0,b._vec1)
+  else if (res_mask == both)
+    return 1;
+  // return  (b._vec0, a._vec1)
+  else if (res_mask == half1)
+    return 2;
+  // return  (a._vec0,b._vec1)
+  else if (res_mask == half2)
+    return 3;
+  // return  (*_vec0,a._vec1)
+  else if (res_mask > 0 && res_mask < half1)
+    return 4;
+  // return  (*_vec0,b._vec1)
+  else if ((res_mask & half2) == half2)
+    return 5;
+  // return (a._vec0,*_vec1)
+  else if ((res_mask & half1) == 0 && res_mask > half1)
+    return 6;
+  // return (b._vec0,*_vec1)
+  else if ((res_mask & half1) == half1 && res_mask > half1)
+    return 7;
+  // return (*_vec0,*_vec1)
+  return 8;
+}
+
+// it can be used to emulate blend faster
+template <int Z>
+constexpr int blendChoice(const uint64_t mask) {
+  static_assert(Z < 1 || Z > 8, "not implemented");
+  return blendChoiceInner(mask);
+}
+
+template <>
+constexpr int blendChoice<1>(const uint64_t mask) {
+  return blendChoiceInner(mask, 0x0000FFFF, 0xFFFF0000);
+}
+
+template <>
+constexpr int blendChoice<2>(const uint64_t mask) {
+  return blendChoiceInner(mask, 0x00FF, 0xFF00);
+}
+
+template <>
+constexpr int blendChoice<4>(const uint64_t mask) {
+  return blendChoiceInner(mask, 0xF, 0xF0);
+}
+
+template <>
+constexpr int blendChoice<8>(const uint64_t mask) {
+  // clamp it 0 and 0xF
+  return blendChoiceInner(mask, 0x3, 0xC);
+}
+
+template <int Z>
+constexpr auto GetMask1(const uint64_t mask) {
+  return typename VecBinaryType<Z>::type{};
+}
+
+template <int Z>
+constexpr auto GetMask2(const uint64_t mask) {
+  return typename VecBinaryType<Z>::type{};
+}
+
+template <>
+constexpr auto GetMask1<1>(const uint64_t mask) {
+  constexpr uint8_t t = (int)0xFF;
+  uint8_t g0 = (mask & 1) * t;
+  uint8_t g1 = ((mask & 2) >> 1) * t;
+  uint8_t g2 = ((mask & 4) >> 2) * t;
+  uint8_t g3 = ((mask & 8) >> 3) * t;
+  uint8_t g4 = ((mask & 16) >> 4) * t;
+  uint8_t g5 = ((mask & 32) >> 5) * t;
+  uint8_t g6 = ((mask & 64) >> 6) * t;
+  uint8_t g7 = ((mask & 128) >> 7) * t;
+  uint8_t g8 = ((mask & 256) >> 8) * t;
+  uint8_t g9 = ((mask & 512) >> 9) * t;
+  uint8_t g10 = ((mask & 1024) >> 10) * t;
+  uint8_t g11 = ((mask & 2048) >> 11) * t;
+  uint8_t g12 = ((mask & 4096) >> 12) * t;
+  uint8_t g13 = ((mask & 8192) >> 13) * t;
+  uint8_t g14 = ((mask & 16384) >> 14) * t;
+  uint8_t g15 = ((mask & 32768) >> 15) * t;
+  return (typename VecBinaryType<1>::type){
+      g0, g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15};
+}
+
+template <>
+constexpr auto GetMask2<1>(const uint64_t mask) {
+  uint64_t mask2 = (mask & 0xFFFFFFFF) >> 16;
+  return GetMask1<1>(mask2);
+}
+
+template <>
+constexpr auto GetMask1<2>(const uint64_t mask) {
+  constexpr uint16_t t = (int)0xFFFF;
+  uint16_t g0 = (mask & 1) * t;
+  uint16_t g1 = ((mask & 2) >> 1) * t;
+  uint16_t g2 = ((mask & 4) >> 2) * t;
+  uint16_t g3 = ((mask & 8) >> 3) * t;
+  uint16_t g4 = ((mask & 16) >> 4) * t;
+  uint16_t g5 = ((mask & 32) >> 5) * t;
+  uint16_t g6 = ((mask & 64) >> 6) * t;
+  uint16_t g7 = ((mask & 128) >> 7) * t;
+  return (typename VecBinaryType<2>::type){g0, g1, g2, g3, g4, g5, g6, g7};
+}
+
+template <>
+constexpr auto GetMask2<2>(const uint64_t mask) {
+  uint64_t mask2 = (mask & 0xFFFF) >> 8;
+  return GetMask1<2>(mask2);
+}
+
+template <>
+constexpr auto GetMask1<4>(const uint64_t mask) {
+  uint32_t g0 = (mask & 1) * 0xffffffff;
+  uint32_t g1 = ((mask & 2) >> 1) * 0xffffffff;
+  uint32_t g2 = ((mask & 4) >> 2) * 0xffffffff;
+  uint32_t g3 = ((mask & 8) >> 3) * 0xffffffff;
+  return (typename VecBinaryType<4>::type){g0, g1, g2, g3};
+}
+
+template <>
+constexpr auto GetMask2<4>(const uint64_t mask) {
+  uint64_t mask2 = (mask & 0xFF) >> 4;
+  return GetMask1<4>(mask2);
+}
+
+template <>
+constexpr auto GetMask1<8>(const uint64_t mask) {
+  uint64_t g0 = (mask & 1) * 0xffffffffffffffff;
+  uint64_t g1 = ((mask & 2) >> 1) * 0xffffffffffffffff;
+  return (typename VecBinaryType<8>::type){g0, g1};
+}
+
+template <>
+constexpr auto GetMask2<8>(const uint64_t mask) {
+  uint64_t mask2 = (mask & 0xF) >> 2;
+  return GetMask1<8>(mask2);
+}
+
+template <int Z>
+constexpr int maskForComplex(uint32_t mask) {
+  return 0;
+}
+
+template <>
+constexpr int maskForComplex<8>(uint32_t mask) {
+  mask = mask & 0xF;
+  int complex_mask = 0;
+  if (mask & 1)
+    complex_mask |= 3;
+  if (mask & 2)
+    complex_mask |= (3 << 2);
+  if (mask & 4)
+    complex_mask |= (3 << 4);
+  if (mask & 8)
+    complex_mask |= (3 << 6);
+  return complex_mask;
+}
+
+template <>
+constexpr int maskForComplex<16>(uint32_t mask) {
+  mask = mask & 0x3;
+  int complex_mask = 0;
+  if (mask & 1)
+    complex_mask |= 3;
+  if (mask & 2)
+    complex_mask |= (3 << 2);
+  return complex_mask;
+}
+
+template <typename T = c10::complex<float>>
+constexpr int blend_choice() {
+  return 0xAA;
+}
+
+template <>
+constexpr int blend_choice<c10::complex<double>>() {
+  return 0x0A;
+}
+
+constexpr int64_t allbitset(int16_t x) {
+  int64_t onex = 1;
+  return (onex << x) - onex;
+}
+
+namespace { /* unnamed namespace */
+
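+// Even/odd element interleave helpers: emulated with vec_perm byte masks for
+// 32-bit lanes, and reduced to vec_mergeh/vec_mergel for 64-bit lanes.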
+ZSimdVect<float> vec_mergee(ZSimdVect<float> x, ZSimdVect<float> y) {
+  constexpr ZSimdVectBinary<uint8_t> mergee_mask{
+      0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 24, 25, 26, 27};
+  return vec_perm(x, y, mergee_mask);
+}
+
+ZSimdVect<double> vec_mergee(ZSimdVect<double> x, ZSimdVect<double> y) {
+  return vec_mergeh(x, y);
+}
+
+ZSimdVect<float> vec_mergeo(ZSimdVect<float> x, ZSimdVect<float> y) {
+  constexpr ZSimdVectBinary<uint8_t> mergeo_mask{
+      4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31};
+  return vec_perm(x, y, mergeo_mask);
+}
+
+ZSimdVect<double> vec_mergeo(ZSimdVect<double> x, ZSimdVect<double> y) {
+  return vec_mergel(x, y);
+}
+
+} /* unnamed namespace */
+
+//
+template <typename T>
+constexpr auto GetBpermZeroMask() {
+  return ZSimdVectBinary<uint8_t>{
+      128,
+      128,
+      128,
+      128,
+      128,
+      128,
+      128,
+      128,
+      128,
+      128,
+      128,
+      128,
+      96,
+      64,
+      32,
+      0};
+}
+
+template <>
+constexpr auto GetBpermZeroMask<double>() {
+  return ZSimdVectBinary<uint8_t>{
+      128,
+      128,
+      128,
+      128,
+      128,
+      128,
+      128,
+      128,
+      128,
+      128,
+      128,
+      128,
+      128,
+      128,
+      64,
+      0};
+}
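+// The masks above pick one bit from each comparison lane (offsets 0/32/64/96
+// for 32-bit lanes, 0/64 for 64-bit lanes; index 128 selects a constant zero
+// bit). zero_mask() feeds them to vec_bperm_u128 to compress a lane-wise
+// compare result into a small integer bitmask.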
+
+constexpr auto GetSwapMaskFloat() {
+  return ZSimdVectBinary<uint8_t>{
+      4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11};
+}
+
+template <typename T>
+struct Vectorized<T, std::enable_if_t<is_zarch_implemented<T>()>> {
+ public:
+  using value_type = T;
+  using vtype = ZSimdVect<T>;
+  using vmaskType = ZSimdVectBinary<T>;
+  using size_type = int;
+  // because of gcc inconsistency for int64_t we are obliged to use this, not
+  // value_type
+  using ElementType = ZSimdVectElement<T>;
+  using vinner_data = std::pair<vtype, vtype>;
+
+ private:
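+  // Storage: two native 16-byte z/Architecture vector registers, together
+  // covering the 32-byte width of the vec256 backend.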
+  vtype _vec0;
+  vtype _vec1;
+
+ public:
+  static constexpr size_type size() {
+    return VECTOR_WIDTH / sizeof(ElementType);
+  }
+  Vectorized() {}
+
+  C10_ALWAYS_INLINE Vectorized(vtype v) : _vec0{v}, _vec1{v} {}
+  C10_ALWAYS_INLINE Vectorized(const vinner_data &v) : _vec0{v.first}, _vec1{v.second} {}
+  C10_ALWAYS_INLINE Vectorized(vtype v1, vtype v2) : _vec0{v1}, _vec1{v2} {}
+  C10_ALWAYS_INLINE Vectorized(T s)
+      : _vec0{vec_splats((ElementType)s)}, _vec1{vec_splats((ElementType)s)} {}
+
+  static Vectorized<T> C10_ALWAYS_INLINE
+  loadu(const void* ptr, int count = size()) {
+    if (count == size()) {
+      return {
+          vec_xl(offset0, reinterpret_cast<const ElementType*>(ptr)),
+          vec_xl(offset16, reinterpret_cast<const ElementType*>(ptr))};
+    }
+
+    __at_align__ ElementType tmp_values[size()] = {};
+    std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(ElementType));
+
+    return {
+        vec_xl(offset0, reinterpret_cast<const ElementType*>(tmp_values)),
+        vec_xl(offset16, reinterpret_cast<const ElementType*>(tmp_values))};
+  }
+
+  static Vectorized<T> C10_ALWAYS_INLINE
+  loadu_one_fourth(const void* ptr) {
+    // load only first 8 bytes
+    // only intended to be used with uint8_t
+    return loadu(ptr, 8 / sizeof(ElementType));
+  }
+
+  void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      vec_xst(_vec0, offset0, reinterpret_cast<ElementType*>(ptr));
+      vec_xst(_vec1, offset16, reinterpret_cast<ElementType*>(ptr));
+    } else if (count > 0) {
+      __at_align__ ElementType tmp_values[size()];
+      vec_xst(_vec0, offset0, reinterpret_cast<ElementType*>(tmp_values));
+      vec_xst(_vec1, offset16, reinterpret_cast<ElementType*>(tmp_values));
+      std::memcpy(
+          ptr, tmp_values, std::min(count, size()) * sizeof(ElementType));
+    }
+  }
+
+  C10_ALWAYS_INLINE const vtype& vec0() const {
+    return _vec0;
+  }
+
+  C10_ALWAYS_INLINE const vtype& vec1() const {
+    return _vec1;
+  }
+
+  C10_ALWAYS_INLINE vinner_data data() const {
+    return std::make_pair<>(_vec0, _vec1);
+  }
+
+  C10_ALWAYS_INLINE operator vinner_data() const {
+    return data();
+  }
+
+  C10_ALWAYS_INLINE const vmaskType vecb0() const {
+    return (vmaskType)_vec0;
+  }
+  C10_ALWAYS_INLINE const vmaskType vecb1() const {
+    return (vmaskType)_vec1;
+  }
+
+  static Vectorized C10_ALWAYS_INLINE blendv(
+      const Vectorized& a,
+      const Vectorized& b,
+      const Vectorized& mask) {
+    return {
+        vec_sel(a._vec0, b._vec0, mask.vecb0()),
+        vec_sel(a._vec1, b._vec1, mask.vecb1())};
+  }
+
+  template  = 0>
+  C10_ALWAYS_INLINE Vectorized(T s1, T s2, T s3, T s4)
+      : _vec0{s1, s2}, _vec1{s3, s4} {}
+
+  template  = 0>
+  C10_ALWAYS_INLINE Vectorized(T s1, T s2, T s3, T s4, T s5, T s6, T s7, T s8)
+      : _vec0{s1, s2, s3, s4}, _vec1{s5, s6, s7, s8} {}
+
+  template  = 0>
+  C10_ALWAYS_INLINE Vectorized(
+      T s1,
+      T s2,
+      T s3,
+      T s4,
+      T s5,
+      T s6,
+      T s7,
+      T s8,
+      T s9,
+      T s10,
+      T s11,
+      T s12,
+      T s13,
+      T s14,
+      T s15,
+      T s16)
+      : _vec0{s1, s2, s3, s4, s5, s6, s7, s8},
+        _vec1{s9, s10, s11, s12, s13, s14, s15, s16} {}
+
+  template  = 0>
+  C10_ALWAYS_INLINE Vectorized(
+      T s1,
+      T s2,
+      T s3,
+      T s4,
+      T s5,
+      T s6,
+      T s7,
+      T s8,
+      T s9,
+      T s10,
+      T s11,
+      T s12,
+      T s13,
+      T s14,
+      T s15,
+      T s16,
+      T s17,
+      T s18,
+      T s19,
+      T s20,
+      T s21,
+      T s22,
+      T s23,
+      T s24,
+      T s25,
+      T s26,
+      T s27,
+      T s28,
+      T s29,
+      T s30,
+      T s31,
+      T s32)
+      : _vec0{s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15, s16},
+        _vec1{
+            s17,
+            s18,
+            s19,
+            s20,
+            s21,
+            s22,
+            s23,
+            s24,
+            s25,
+            s26,
+            s27,
+            s28,
+            s29,
+            s30,
+            s31,
+            s32} {}
+
+  template 
+  static std::enable_if_t> arange(
+      T base = 0,
+      step_t step = static_cast(1)) {
+    return Vectorized(base, base + step, base + 2 * step, base + 3 * step);
+  }
+
+  template 
+  static std::enable_if_t> arange(
+      T base = 0,
+      step_t step = static_cast(1)) {
+    return Vectorized(
+        base,
+        base + step,
+        base + 2 * step,
+        base + 3 * step,
+        base + 4 * step,
+        base + 5 * step,
+        base + 6 * step,
+        base + 7 * step);
+  }
+
+  template 
+  static std::enable_if_t> arange(
+      T base = 0,
+      step_t step = static_cast(1)) {
+    return Vectorized(
+        base,
+        base + step,
+        base + 2 * step,
+        base + 3 * step,
+        base + 4 * step,
+        base + 5 * step,
+        base + 6 * step,
+        base + 7 * step,
+        base + 8 * step,
+        base + 9 * step,
+        base + 10 * step,
+        base + 11 * step,
+        base + 12 * step,
+        base + 13 * step,
+        base + 14 * step,
+        base + 15 * step);
+  }
+
+  template 
+  static std::enable_if_t> arange(
+      T base = 0,
+      step_t step = static_cast(1)) {
+    return Vectorized(
+        base,
+        base + step,
+        base + 2 * step,
+        base + 3 * step,
+        base + 4 * step,
+        base + 5 * step,
+        base + 6 * step,
+        base + 7 * step,
+        base + 8 * step,
+        base + 9 * step,
+        base + 10 * step,
+        base + 11 * step,
+        base + 12 * step,
+        base + 13 * step,
+        base + 14 * step,
+        base + 15 * step,
+        base + 16 * step,
+        base + 17 * step,
+        base + 18 * step,
+        base + 19 * step,
+        base + 20 * step,
+        base + 21 * step,
+        base + 22 * step,
+        base + 23 * step,
+        base + 24 * step,
+        base + 25 * step,
+        base + 26 * step,
+        base + 27 * step,
+        base + 28 * step,
+        base + 29 * step,
+        base + 30 * step,
+        base + 31 * step);
+  }
+
+  // blend section
+  template 
+  static std::enable_if_t(mask) == 0, Vectorized>
+      C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) {
+    return a;
+  }
+
+  template 
+  static std::enable_if_t(mask) == 1, Vectorized>
+      C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) {
+    return b;
+  }
+
+  template 
+  static std::enable_if_t(mask) == 2, Vectorized>
+      C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) {
+    return {b._vec0, a._vec1};
+  }
+
+  template 
+  static std::enable_if_t(mask) == 3, Vectorized>
+      C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) {
+    return {a._vec0, b._vec1};
+  }
+
+  template 
+  static std::enable_if_t(mask) == 4, Vectorized>
+      C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) {
+    const vmaskType mask_1st = GetMask1(mask);
+    return {(vtype)vec_sel(a._vec0, b._vec0, mask_1st), a._vec1};
+  }
+
+  template 
+  static std::enable_if_t(mask) == 5, Vectorized>
+      C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) {
+    const vmaskType mask_1st = GetMask1(mask);
+    return {(vtype)vec_sel(a._vec0, b._vec0, mask_1st), b._vec1};
+  }
+
+  template 
+  static std::enable_if_t(mask) == 6, Vectorized>
+      C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) {
+    const vmaskType mask_2nd = GetMask2(mask);
+    // generated masks
+    return {a._vec0, (vtype)vec_sel(a._vec1, b._vec1, mask_2nd)};
+  }
+
+  template 
+  static std::enable_if_t(mask) == 7, Vectorized>
+      C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) {
+    const vmaskType mask_2nd = GetMask2(mask);
+    // generated masks
+    return {b._vec0, (vtype)vec_sel(a._vec1, b._vec1, mask_2nd)};
+  }
+
+  template 
+  static std::enable_if_t(mask) == 8, Vectorized>
+      C10_ALWAYS_INLINE blend(const Vectorized& a, const Vectorized& b) {
+    const vmaskType mask_1st = GetMask1(mask);
+    const vmaskType mask_2nd = GetMask2(mask);
+    return {
+        (vtype)vec_sel(a._vec0, b._vec0, mask_1st),
+        (vtype)vec_sel(a._vec1, b._vec1, mask_2nd)};
+  }
+
+  template 
+  static inline std::enable_if_t<(Z >= C), Vectorized> set_inner(
+      const Vectorized& a,
+      const Vectorized& b,
+      size_t count) {
+    return b;
+  }
+
+  template 
+  static inline std::enable_if_t<(Z < C), Vectorized> set_inner(
+      const Vectorized& a,
+      const Vectorized& b,
+      size_t count) {
+    if (count == Z)
+      return blend(a, b);
+    else
+      return set_inner(a, b, count);
+  }
+
+  static Vectorized set(
+      const Vectorized& a,
+      const Vectorized& b,
+      size_t count = size()) {
+    if (count == 0)
+      return a;
+    return set_inner<1, size()>(a, b, count);
+  }
+
+  const ElementType& operator[](int idx) const = delete;
+  ElementType& operator[](int idx) = delete;
+
+  Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& other) const {
+    return Vectorized{_vec0 + other._vec0, _vec1 + other._vec1};
+  }
+
+  Vectorized C10_ALWAYS_INLINE operator-(const Vectorized& other) const {
+    return Vectorized{_vec0 - other._vec0, _vec1 - other._vec1};
+  }
+
+  Vectorized C10_ALWAYS_INLINE operator*(const Vectorized& other) const {
+    return Vectorized{_vec0 * other._vec0, _vec1 * other._vec1};
+  }
+
+  Vectorized C10_ALWAYS_INLINE operator/(const Vectorized& other) const {
+    return Vectorized{_vec0 / other._vec0, _vec1 / other._vec1};
+  }
+
+  Vectorized C10_ALWAYS_INLINE operator&(const Vectorized& other) const {
+    return Vectorized{
+        (vtype)(vecb0() & other.vecb0()), (vtype)(vecb1() & other.vecb1())};
+  }
+
+  Vectorized C10_ALWAYS_INLINE operator|(const Vectorized& other) const {
+    return Vectorized{
+        (vtype)(vecb0() | other.vecb0()), (vtype)(vecb1() | other.vecb1())};
+  }
+
+  Vectorized C10_ALWAYS_INLINE operator^(const Vectorized& other) const {
+    return Vectorized{
+        (vtype)(vecb0() ^ other.vecb0()), (vtype)(vecb1() ^ other.vecb1())};
+  }
+
+  Vectorized C10_ALWAYS_INLINE operator<<(const Vectorized &other) const {
+    constexpr ElementType max_shift = sizeof(ElementType) * CHAR_BIT;
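+    // Shift amounts outside [0, max_shift) would be undefined behavior for
+    // the scalar << below, so they are handled explicitly and yield 0.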
+
+    ElementType a_array[Vectorized::size()];
+    ElementType b_array[Vectorized::size()];
+    ElementType c_array[Vectorized::size()];
+
+    store(a_array);
+    other.store(b_array);
+
+    for (int i = 0; i != Vectorized<T>::size(); i++) {
+      T shift = b_array[i];
+      if ((static_cast<std::make_signed_t<T>>(shift) < 0) || (shift >= max_shift)) {
+        c_array[i] = 0;
+      } else {
+        c_array[i] = static_cast<std::make_unsigned_t<T>>(a_array[i]) << shift;
+      }
+    }
+
+    return loadu(c_array);
+  }
+
+  Vectorized C10_ALWAYS_INLINE operator>>(const Vectorized &other) const {
+    // right shift value to retain sign bit for signed and no bits for unsigned
+    constexpr ElementType max_shift = sizeof(T) * CHAR_BIT - std::is_signed_v;
+
+    ElementType a_array[Vectorized::size()];
+    ElementType b_array[Vectorized::size()];
+    ElementType c_array[Vectorized::size()];
+
+    store(a_array);
+    other.store(b_array);
+
+    for (int i = 0; i != Vectorized<T>::size(); i++) {
+      T shift = b_array[i];
+      if ((static_cast<std::make_signed_t<T>>(shift) < 0) || (shift >= max_shift)) {
+        c_array[i] = a_array[i] >> max_shift;
+      } else {
+        c_array[i] = a_array[i] >> shift;
+      }
+    }
+
+    return loadu(c_array);
+  }
+
+  Vectorized _not() const {
+    return {(vtype)vec_nor(vecb0(), vecb0()), (vtype)vec_nor(vecb1(), vecb1())};
+  }
+
+  Vectorized C10_ALWAYS_INLINE operator==(const Vectorized& other) const {
+    return Vectorized{
+        vec_cmpeq(_vec0, other._vec0), vec_cmpeq(_vec1, other._vec1)};
+  }
+
+  Vectorized C10_ALWAYS_INLINE operator!=(const Vectorized& other) const {
+    return Vectorized{
+        vec_cmpeq(_vec0, other._vec0), vec_cmpeq(_vec1, other._vec1)}
+        ._not();
+  }
+  Vectorized C10_ALWAYS_INLINE operator>(const Vectorized& other) const {
+    return Vectorized{
+        vec_cmpgt(_vec0, other._vec0), vec_cmpgt(_vec1, other._vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE operator>=(const Vectorized& other) const {
+    return Vectorized{
+        vec_cmpge(_vec0, other._vec0), vec_cmpge(_vec1, other._vec1)};
+  }
+
+  Vectorized C10_ALWAYS_INLINE operator<(const Vectorized& other) const {
+    return Vectorized{
+        vec_cmplt(_vec0, other._vec0), vec_cmplt(_vec1, other._vec1)};
+  }
+
+  Vectorized C10_ALWAYS_INLINE operator<=(const Vectorized& other) const {
+    return Vectorized{
+        vec_cmple(_vec0, other._vec0), vec_cmple(_vec1, other._vec1)};
+  }
+
+  Vectorized C10_ALWAYS_INLINE eq(const Vectorized& other) const {
+    return (*this == other) & Vectorized((T)1.0);
+  }
+  Vectorized C10_ALWAYS_INLINE ne(const Vectorized& other) const {
+    return (*this != other) & Vectorized((T)1.0);
+  }
+  Vectorized C10_ALWAYS_INLINE gt(const Vectorized& other) const {
+    return (*this > other) & Vectorized((T)1.0);
+  }
+  Vectorized C10_ALWAYS_INLINE ge(const Vectorized& other) const {
+    return (*this >= other) & Vectorized((T)1.0);
+  }
+  Vectorized C10_ALWAYS_INLINE lt(const Vectorized& other) const {
+    return (*this < other) & Vectorized((T)1.0);
+  }
+  Vectorized C10_ALWAYS_INLINE le(const Vectorized& other) const {
+    return (*this <= other) & Vectorized((T)1.0);
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t::value, int> = 0>
+  Vectorized C10_ALWAYS_INLINE abs() const {
+    return {vec_abs(_vec0), vec_abs(_vec1)};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t::value, int> = 0>
+  Vectorized C10_ALWAYS_INLINE abs() const {
+    return {_vec0, _vec1};
+  }
+
+  Vectorized C10_ALWAYS_INLINE neg() const {
+    return {-_vec0, -_vec1};
+  }
+
+  Vectorized isnan() const {
+    auto x = *this;
+    auto ret = (x == x);
+    return ret._not();
+  }
+
+  bool has_inf_nan() const {
+    for (const auto i : c10::irange(size()/2)) {
+      if(_isnan(_vec0[i]) || _isinf(_vec0[i])) {
+        return true;
+      }
+    }
+    for (const auto i : c10::irange(size()/2)) {
+      if(_isnan(_vec1[i]) || _isinf(_vec1[i])) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t::value, int> = 0>
+  Vectorized angle() const {
+    auto tmp = blendv(
+        Vectorized(0), Vectorized(c10::pi), *this < Vectorized(0));
+    return blendv(tmp, *this, isnan());
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t::value, int> = 0>
+  Vectorized angle() const {
+    return blendv(
+        Vectorized(0), Vectorized(c10::pi), *this < Vectorized(0));
+  }
+
+  Vectorized real() const {
+    return *this;
+  }
+  Vectorized imag() const {
+    return Vectorized{0};
+  }
+  Vectorized conj() const {
+    return *this;
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t::value, int> = 0>
+  int zero_mask() const {
+    auto cmp = (*this == Vectorized(0));
+    constexpr auto mask_zero_bits = GetBpermZeroMask();
+    ZSimdVectBinary result0 =
+        vec_bperm_u128((ZSimdVectBinary)cmp.vecb0(), mask_zero_bits);
+    ZSimdVectBinary result1 =
+        vec_bperm_u128((ZSimdVectBinary)cmp.vecb1(), mask_zero_bits);
+    return (result0[0] | (result1[0] << (size() / 2)));
+  }
+
+  Vectorized C10_ALWAYS_INLINE floor() const {
+    return {vec_floor(_vec0), vec_floor(_vec1)};
+  }
+
+  Vectorized C10_ALWAYS_INLINE ceil() const {
+    return {vec_ceil(_vec0), vec_ceil(_vec1)};
+  }
+
+  Vectorized C10_ALWAYS_INLINE round() const {
+    return {vec_round(_vec0), vec_round(_vec1)};
+  }
+
+  Vectorized C10_ALWAYS_INLINE rint() const {
+    return {vec_rint(_vec0), vec_rint(_vec1)};
+  }
+
+  Vectorized C10_ALWAYS_INLINE trunc() const {
+    return {vec_trunc(_vec0), vec_trunc(_vec1)};
+  }
+
+  Vectorized C10_ALWAYS_INLINE frac() const {
+    return *this - trunc();
+  }
+
+  Vectorized C10_ALWAYS_INLINE sqrt() const {
+    return {vec_sqrt(_vec0), vec_sqrt(_vec1)};
+  }
+  Vectorized C10_ALWAYS_INLINE reciprocal() const {
+    return Vectorized((T)1) / (*this);
+  }
+  Vectorized C10_ALWAYS_INLINE rsqrt() const {
+    return sqrt().reciprocal();
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t::value, int> = 0>
+  inline Vectorized mapOrdinary(float (*const f)(float)) const {
+    float a00 = f(_vec0[0]);
+    float a01 = f(_vec0[1]);
+    float a02 = f(_vec0[2]);
+    float a03 = f(_vec0[3]);
+    float a10 = f(_vec1[0]);
+    float a11 = f(_vec1[1]);
+    float a12 = f(_vec1[2]);
+    float a13 = f(_vec1[3]);
+    return Vectorized{a00, a01, a02, a03, a10, a11, a12, a13};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t::value, int> = 0>
+  inline Vectorized mapOrdinary(double (*const f)(double)) const {
+    return Vectorized(f(_vec0[0]), f(_vec0[1]), f(_vec1[0]), f(_vec1[1]));
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t::value, int> = 0>
+  inline Vectorized mapOrdinary(
+      float (*const f)(float, float),
+      const Vectorized& b) const {
+    float a00 = f(_vec0[0], b._vec0[0]);
+    float a01 = f(_vec0[1], b._vec0[1]);
+    float a02 = f(_vec0[2], b._vec0[2]);
+    float a03 = f(_vec0[3], b._vec0[3]);
+    float a10 = f(_vec1[0], b._vec1[0]);
+    float a11 = f(_vec1[1], b._vec1[1]);
+    float a12 = f(_vec1[2], b._vec1[2]);
+    float a13 = f(_vec1[3], b._vec1[3]);
+    return Vectorized{a00, a01, a02, a03, a10, a11, a12, a13};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t::value, int> = 0>
+  inline Vectorized mapOrdinary(
+      double (*const f)(double, double),
+      const Vectorized& b) const {
+    return Vectorized(
+        f(_vec0[0], b._vec0[0]),
+        f(_vec0[1], b._vec0[1]),
+        f(_vec1[0], b._vec1[0]),
+        f(_vec1[1], b._vec1[1]));
+  }
+
+  template <
+      typename FloatOp,
+      typename DoubleOp,
+      typename U = T,
+      std::enable_if_t::value, int> = 0>
+  inline Vectorized mapSleef(FloatOp f, DoubleOp d) const {
+    vtype a0 = f(_vec0);
+    vtype a1 = f(_vec1);
+    return Vectorized{a0, a1};
+  }
+
+  template <
+      typename FloatOp,
+      typename DoubleOp,
+      typename U = T,
+      std::enable_if_t::value, int> = 0>
+  inline Vectorized mapSleef(FloatOp f, DoubleOp d) const {
+    return Vectorized(d(_vec0), d(_vec1));
+  }
+
+  template <
+      typename FloatOp,
+      typename DoubleOp,
+      typename U = T,
+      std::enable_if_t::value, int> = 0>
+  inline Vectorized mapSleef(FloatOp f, DoubleOp d, const Vectorized& b)
+      const {
+    vtype a0 = f(_vec0, b._vec0);
+    vtype a1 = f(_vec1, b._vec1);
+    return Vectorized{a0, a1};
+  }
+
+  template <
+      typename FloatOp,
+      typename DoubleOp,
+      typename U = T,
+      std::enable_if_t::value, int> = 0>
+  inline Vectorized mapSleef(FloatOp f, DoubleOp d, const Vectorized& b)
+      const {
+    return Vectorized(d(_vec0, b._vec0), d(_vec1, b._vec1));
+  }
+
+  Vectorized acos() const {
+    return mapSleef(Sleef_acosf4_u10, Sleef_acosd2_u10);
+  }
+  Vectorized asin() const {
+    return mapSleef(Sleef_asinf4_u10, Sleef_asind2_u10);
+  }
+  Vectorized atan() const {
+    return mapSleef(Sleef_atanf4_u10, Sleef_atand2_u10);
+  }
+  Vectorized atanh() const {
+    return mapSleef(Sleef_atanhf4_u10, Sleef_atanhd2_u10);
+  }
+
+  Vectorized erf() const {
+    return mapSleef(Sleef_erff4_u10, Sleef_erfd2_u10);
+  }
+  Vectorized erfc() const {
+    return mapSleef(Sleef_erfcf4_u15, Sleef_erfcd2_u15);
+  }
+
+  Vectorized exp() const {
+    return mapSleef(Sleef_expf4_u10, Sleef_expd2_u10);
+  }
+  Vectorized exp2() const {
+    return mapSleef(Sleef_exp2f4_u10, Sleef_exp2d2_u10);
+  }
+  Vectorized expm1() const {
+    return mapSleef(Sleef_expm1f4_u10, Sleef_expm1d2_u10);
+  }
+  Vectorized exp_u20() const {
+    return exp();
+  }
+
+  Vectorized log() const {
+    return mapSleef(Sleef_logf4_u10, Sleef_logd2_u10);
+  }
+  Vectorized log2() const {
+    return mapSleef(Sleef_log2f4_u10, Sleef_log2d2_u10);
+  }
+  Vectorized log10() const {
+    return mapSleef(Sleef_log10f4_u10, Sleef_log10d2_u10);
+  }
+  Vectorized log1p() const {
+    return mapSleef(Sleef_log1pf4_u10, Sleef_log1pd2_u10);
+  }
+
+  Vectorized sin() const {
+#ifndef SLEEF_MEMORY_WORKAROUND
+    return mapSleef(Sleef_sinf4_u10, Sleef_sind2_u10);
+#else
+    return mapOrdinary(std::sin);
+#endif
+  }
+  Vectorized sinh() const {
+    return mapSleef(Sleef_sinhf4_u10, Sleef_sinhd2_u10);
+  }
+  Vectorized cos() const {
+#ifndef SLEEF_MEMORY_WORKAROUND
+    return mapSleef(Sleef_cosf4_u10, Sleef_cosd2_u10);
+#else
+    return mapOrdinary(std::cos);
+#endif
+  }
+  Vectorized cosh() const {
+    return mapSleef(Sleef_coshf4_u10, Sleef_coshd2_u10);
+  }
+
+  Vectorized tan() const {
+#ifndef SLEEF_MEMORY_WORKAROUND
+    return mapSleef(Sleef_tanf4_u10, Sleef_tand2_u10);
+#else
+    return mapOrdinary(std::tan);
+#endif
+  }
+  Vectorized tanh() const {
+    return mapSleef(Sleef_tanhf4_u10, Sleef_tanhd2_u10);
+  }
+
+  Vectorized lgamma() const {
+    return mapSleef(Sleef_lgammaf4_u10, Sleef_lgammad2_u10);
+  }
+
+  Vectorized atan2(const Vectorized& b) const {
+    return mapSleef(Sleef_atan2f4_u10, Sleef_atan2d2_u10, b);
+  }
+  Vectorized copysign(const Vectorized& sign) const {
+    return mapSleef(Sleef_copysignf4, Sleef_copysignd2, sign);
+  }
+  Vectorized fmod(const Vectorized& q) const {
+    return mapSleef(Sleef_fmodf4, Sleef_fmodd2, q);
+  }
+
+  Vectorized hypot(const Vectorized& b) const {
+    return mapSleef(Sleef_hypotf4_u05, Sleef_hypotd2_u05, b);
+  }
+
+  Vectorized pow(const Vectorized& b) const {
+    return mapSleef(Sleef_powf4_u10, Sleef_powd2_u10, b);
+  }
+
+  Vectorized nextafter(const Vectorized& b) const {
+    return mapSleef(Sleef_nextafterf4, Sleef_nextafterd2, b);
+  }
+
+  Vectorized erfinv() const {
+    return mapOrdinary(calc_erfinv);
+  }
+
+  Vectorized digamma() const {
+    return mapOrdinary(calc_digamma);
+  }
+
+  Vectorized igamma(const Vectorized& x) const {
+    return mapOrdinary(calc_igamma, x);
+  }
+
+  Vectorized igammac(const Vectorized& x) const {
+    return mapOrdinary(calc_igammac, x);
+  }
+
+  Vectorized i0() const {
+    return mapOrdinary(calc_i0);
+  }
+
+  Vectorized i0e() const {
+    return mapOrdinary(calc_i0e);
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t::value, int> = 0>
+  Vectorized minimum(const Vectorized& other) const {
+    return {vec_min(_vec0, other._vec0), vec_min(_vec1, other._vec1)};
+  }
+
+  /* Propagates NaN if either input is a NaN. */
+  template <
+      typename U = T,
+      std::enable_if_t::value, int> = 0>
+  Vectorized minimum(const Vectorized& other) const {
+    Vectorized tmp = {vec_min(_vec0, other._vec0), vec_min(_vec1, other._vec1)};
+    tmp = blendv(tmp, *this, isnan());
+    return blendv(tmp, other, other.isnan());
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t::value, int> = 0>
+  Vectorized maximum(const Vectorized& other) const {
+    return {vec_max(_vec0, other._vec0), vec_max(_vec1, other._vec1)};
+  }
+
+  /* Propagates NaN if either input is a NaN. */
+  template <
+      typename U = T,
+      std::enable_if_t::value, int> = 0>
+  Vectorized maximum(const Vectorized& other) const {
+    Vectorized tmp = {vec_max(_vec0, other._vec0), vec_max(_vec1, other._vec1)};
+    tmp = blendv(tmp, *this, isnan());
+    return blendv(tmp, other, other.isnan());
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t::value, int> = 0>
+  Vectorized clamp_min(const Vectorized& min) const {
+    return {vec_max(_vec0, min._vec0), vec_max(_vec1, min._vec1)};
+  }
+
+  /* Keeps NaN if actual value is NaN */
+  template <
+      typename U = T,
+      std::enable_if_t::value, int> = 0>
+  Vectorized clamp_min(const Vectorized& min) const {
+    Vectorized tmp = {vec_max(_vec0, min._vec0), vec_max(_vec1, min._vec1)};
+    return blendv(tmp, *this, isnan());
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t::value, int> = 0>
+  Vectorized clamp_max(const Vectorized& max) const {
+    return {vec_min(_vec0, max._vec0), vec_min(_vec1, max._vec1)};
+  }
+
+  /* Keeps NaN if actual value is NaN */
+  template <
+      typename U = T,
+      std::enable_if_t::value, int> = 0>
+  Vectorized clamp_max(const Vectorized& max) const {
+    Vectorized tmp = {vec_min(_vec0, max._vec0), vec_min(_vec1, max._vec1)};
+    return blendv(tmp, *this, isnan());
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t::value, int> = 0>
+  Vectorized swapped() const {
+    auto swap_mask = GetSwapMaskFloat();
+    vtype v0 = vec_perm(_vec0, _vec0, swap_mask);
+    vtype v1 = vec_perm(_vec1, _vec1, swap_mask);
+    return {v0, v1};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t::value, int> = 0>
+  Vectorized swapped() const {
+    vtype v0 = vec_permi(_vec0, _vec0, 2);
+    vtype v1 = vec_permi(_vec1, _vec1, 2);
+    return {v0, v1};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t::value, int> = 0>
+  static Vectorized mergee(Vectorized& first, Vectorized& second) {
+    return {
+        vec_mergee(first._vec0, second._vec0),
+        vec_mergee(first._vec1, second._vec1)};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t::value, int> = 0>
+  static Vectorized mergeo(Vectorized& first, Vectorized& second) {
+    return {
+        vec_mergeo(first._vec0, second._vec0),
+        vec_mergeo(first._vec1, second._vec1)};
+  }
+
+  static Vectorized horizontal_add_perm(
+      Vectorized& first,
+      Vectorized& second) {
+    // we will simulate it differently with 6 instructions total
+    // lets permute second so that we can add it getting horizontal sums
+    auto first_perm = first.swapped(); // 2perm
+    auto second_perm = second.swapped(); // 2perm
+    // summ
+    auto first_ret = first + first_perm; // 2add
+    auto second_ret = second + second_perm; // 2 add
+    // now lets choose evens
+    return mergee(first_ret, second_ret); // 2 mergee's
+  }
+
+  static Vectorized horizontal_sub_perm(
+      Vectorized& first,
+      Vectorized& second) {
+    // we will simulate it differently with 6 instructions total
+    // lets permute second so that we can add it getting horizontal sums
+    auto first_perm = first.swapped(); // 2perm
+    auto second_perm = second.swapped(); // 2perm
+    // summ
+    auto first_ret = first - first_perm; // 2sub
+    auto second_ret = second - second_perm; // 2 sub
+    // now lets choose evens
+    return mergee(first_ret, second_ret); // 2 mergee's
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t::value, int> = 0>
+  Vectorized mergee() const {
+    return {vec_mergee(_vec0, _vec0), vec_mergee(_vec1, _vec1)};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t::value, int> = 0>
+  Vectorized mergeo() const {
+    return {vec_mergeo(_vec0, _vec0), vec_mergeo(_vec1, _vec1)};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t::value, int> = 0>
+  Vectorized to_vec_float_helper() const {
+    int32_t values[8] = {
+      _vec0[0],
+      _vec0[1],
+      _vec0[2],
+      _vec0[3],
+      _vec0[4],
+      _vec0[5],
+      _vec0[6],
+      _vec0[7],
+    };
+
+    return Vectorized{
+      values[0], values[1], values[2], values[3],
+      values[4], values[5], values[6], values[7]
+    };
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t::value, int> = 0>
+  Vectorized to_vec_uint8_helper() const {
+    // helper function for float to uint8_t conversion
+    uint8_t values[8] = {
+      static_cast(_vec0[0]),
+      static_cast(_vec0[1]),
+      static_cast(_vec0[2]),
+      static_cast(_vec0[3]),
+      static_cast(_vec1[0]),
+      static_cast(_vec1[1]),
+      static_cast(_vec1[2]),
+      static_cast(_vec1[3]),
+    };
+
+    return Vectorized{
+      values[0], values[1], values[2], values[3],
+      values[4], values[5], values[6], values[7],
+      0, 0, 0, 0,
+      0, 0, 0, 0,
+      0, 0, 0, 0,
+      0, 0, 0, 0,
+      0, 0, 0, 0,
+      0, 0, 0, 0,
+    };
+  }
+};
+
+template <>
+inline Vectorized<int8_t> operator~(const Vectorized<int8_t>& a) {
+  return a._not();
+}
+
+template <>
+inline Vectorized<uint8_t> operator~(const Vectorized<uint8_t>& a) {
+  return a._not();
+}
+
+template <>
+inline Vectorized<int16_t> operator~(const Vectorized<int16_t>& a) {
+  return a._not();
+}
+
+template <>
+inline Vectorized<int32_t> operator~(const Vectorized<int32_t>& a) {
+  return a._not();
+}
+
+template <>
+inline Vectorized<int64_t> operator~(const Vectorized<int64_t>& a) {
+  return a._not();
+}
+
+#define DEFINE_MAXMIN_FUNCS(operand_type)                                     \
+  template <>                                                                 \
+  Vectorized<operand_type> inline maximum(                                    \
+      const Vectorized<operand_type>& a, const Vectorized<operand_type>& b) { \
+    return a.maximum(b);                                                      \
+  }                                                                           \
+  template <>                                                                 \
+  Vectorized<operand_type> inline minimum(                                    \
+      const Vectorized<operand_type>& a, const Vectorized<operand_type>& b) { \
+    return a.minimum(b);                                                      \
+  }
+
+#define DEFINE_CLAMP_MAXMIN_FUNCS(typex)                          \
+  DEFINE_MAXMIN_FUNCS(typex)                                      \
+  template <>                                                     \
+  Vectorized<typex> C10_ALWAYS_INLINE clamp_min(                  \
+      const Vectorized<typex>& a, const Vectorized<typex>& min) { \
+    return a.clamp_min(min);                                      \
+  }                                                               \
+  template <>                                                     \
+  Vectorized<typex> C10_ALWAYS_INLINE clamp_max(                  \
+      const Vectorized<typex>& a, const Vectorized<typex>& max) { \
+    return a.clamp_max(max);                                      \
+  }                                                               \
+  template <>                                                     \
+  Vectorized<typex> C10_ALWAYS_INLINE clamp(                      \
+      const Vectorized<typex>& a,                                 \
+      const Vectorized<typex>& min,                               \
+      const Vectorized<typex>& max) {                             \
+    return clamp_max(clamp_min(a, min), max);                     \
+  }
+
+DEFINE_CLAMP_MAXMIN_FUNCS(int8_t)
+DEFINE_CLAMP_MAXMIN_FUNCS(uint8_t)
+DEFINE_CLAMP_MAXMIN_FUNCS(int16_t)
+DEFINE_CLAMP_MAXMIN_FUNCS(int32_t)
+DEFINE_CLAMP_MAXMIN_FUNCS(int64_t)
+DEFINE_CLAMP_MAXMIN_FUNCS(float)
+DEFINE_CLAMP_MAXMIN_FUNCS(double)
+
+namespace { /* unnamed namespace */
+
+#if !defined(vec_float) || __ARCH__ < 13
+#warning \
+    "float->int and int->float conversion is simulated. compile for z15 for improved performance"
+inline ZSimdVect<float> vec_int_flt(const ZSimdVect<int32_t> x) {
+  return ZSimdVect<float>{float(x[0]), float(x[1]), float(x[2]), float(x[3])};
+}
+inline ZSimdVect<int32_t> vec_flt_int(const ZSimdVect<float> x) {
+  return ZSimdVect<int32_t>{int(x[0]), int(x[1]), int(x[2]), int(x[3])};
+}
+#else
+#define vec_int_flt vec_float
+#define vec_flt_int vec_signed
+#endif
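+// The int32<->float conversions below go through the wrappers above, so they
+// also work (element-wise) on pre-arch13 targets; the 64-bit conversions use
+// vec_double/vec_signed directly.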
+
+Vectorized<float> convert_to_float(const Vectorized<int32_t>& x) {
+  return {vec_int_flt(x.vec0()), vec_int_flt(x.vec1())};
+}
+
+Vectorized<int32_t> convert_to_int(const Vectorized<float>& x) {
+  return {vec_flt_int(x.vec0()), vec_flt_int(x.vec1())};
+}
+
+Vectorized<double> convert_to_float(const Vectorized<int64_t>& x) {
+  return {vec_double(x.vec0()), vec_double(x.vec1())};
+}
+
+Vectorized<int64_t> convert_to_int(const Vectorized<double>& x) {
+  return {vec_signed(x.vec0()), vec_signed(x.vec1())};
+}
+
+} /* unnamed namespace */
+
+template <typename T, typename V>
+Vectorized<T> cast_zvector(const Vectorized<V>& x) {
+  using cast_type = typename Vectorized<T>::vtype;
+  return Vectorized<T>{(cast_type)x.vec0(), (cast_type)x.vec1()};
+}
+
+template <>
+Vectorized<float> C10_ALWAYS_INLINE fmadd(
+    const Vectorized<float>& a,
+    const Vectorized<float>& b,
+    const Vectorized<float>& c) {
+  return Vectorized<float>{
+      __builtin_s390_vfmasb(a.vec0(), b.vec0(), c.vec0()),
+      __builtin_s390_vfmasb(a.vec1(), b.vec1(), c.vec1())};
+}
+template <>
+Vectorized<double> C10_ALWAYS_INLINE fmadd(
+    const Vectorized<double>& a,
+    const Vectorized<double>& b,
+    const Vectorized<double>& c) {
+  return Vectorized<double>{
+      __builtin_s390_vfmadb(a.vec0(), b.vec0(), c.vec0()),
+      __builtin_s390_vfmadb(a.vec1(), b.vec1(), c.vec1())};
+}
+template <>
+Vectorized C10_ALWAYS_INLINE fmadd(
+    const Vectorized& a,
+    const Vectorized& b,
+    const Vectorized& c) {
+  return Vectorized{
+      a.vec0() * b.vec0() + c.vec0(), a.vec1() * b.vec1() + c.vec1()};
+}
+template <>
+Vectorized C10_ALWAYS_INLINE fmadd(
+    const Vectorized& a,
+    const Vectorized& b,
+    const Vectorized& c) {
+  return Vectorized{
+      a.vec0() * b.vec0() + c.vec0(), a.vec1() * b.vec1() + c.vec1()};
+}
+template <>
+Vectorized C10_ALWAYS_INLINE fmadd(
+    const Vectorized& a,
+    const Vectorized& b,
+    const Vectorized& c) {
+  return Vectorized{
+      a.vec0() * b.vec0() + c.vec0(), a.vec1() * b.vec1() + c.vec1()};
+}
+
+template <>
+Vectorized<int32_t> C10_ALWAYS_INLINE
+convert_to_int_of_same_size<float>(const Vectorized<float>& src) {
+  return convert_to_int(src);
+}
+
+template <>
+Vectorized<int64_t> C10_ALWAYS_INLINE
+convert_to_int_of_same_size<double>(const Vectorized<double>& src) {
+  return convert_to_int(src);
+}
+
+template <>
+inline void convert(const int32_t* src, float* dst, int64_t n) {
+  // int32_t and float have same size
+  int64_t i;
+  for (i = 0; i <= (n - Vectorized<float>::size());
+       i += Vectorized<float>::size()) {
+    const int32_t* src_a = src + i;
+    float* dst_a = dst + i;
+    auto input_vec = Vectorized<int32_t>::loadu(src_a);
+    auto output_vec = convert_to_float(input_vec);
+    output_vec.store(dst_a);
+  }
+
+  for (; i < n; i++) {
+    dst[i] = static_cast(src[i]);
+  }
+}
+
+template <>
+inline void convert(const int64_t* src, double* dst, int64_t n) {
+  int64_t i;
+  for (i = 0; i <= (n - Vectorized<double>::size());
+       i += Vectorized<double>::size()) {
+    const int64_t* src_a = src + i;
+    double* dst_a = dst + i;
+    auto input_vec = Vectorized<int64_t>::loadu(src_a);
+    auto output_vec = convert_to_float(input_vec);
+    output_vec.store(dst_a);
+  }
+  for (; i < n; i++) {
+    dst[i] = static_cast(src[i]);
+  }
+}
+
+#define DEFINE_REINTERPRET_CAST_FUNCS(Fst, Cst)     \
+  template <>                                       \
+  C10_ALWAYS_INLINE Vectorized<Cst> cast<Cst, Fst>( \
+      const Vectorized<Fst>& src) {                 \
+    return cast_zvector<Cst, Fst>(src);             \
+  }
+
+#define DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(Fst) \
+  DEFINE_REINTERPRET_CAST_FUNCS(Fst, double)      \
+  DEFINE_REINTERPRET_CAST_FUNCS(Fst, float)       \
+  DEFINE_REINTERPRET_CAST_FUNCS(Fst, int64_t)     \
+  DEFINE_REINTERPRET_CAST_FUNCS(Fst, int32_t)     \
+  DEFINE_REINTERPRET_CAST_FUNCS(Fst, int16_t)
+
+DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(float)
+DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(double)
+DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(int64_t)
+DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(int32_t)
+DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(int16_t)
+
+#undef DEFINE_REINTERPRET_CAST_FUNCS
+
+template <typename T>
+struct unpack_type {
+  using type = T;
+};
+template <>
+struct unpack_type<int8_t> {
+  using type = int16_t;
+};
+template <>
+struct unpack_type<uint8_t> {
+  using type = int16_t;
+};
+template <>
+struct unpack_type<int16_t> {
+  using type = int32_t;
+};
+
+template <typename T>
+struct pack_type {
+  using type = T;
+};
+template <>
+struct pack_type<int16_t> {
+  using type = int8_t;
+};
+template <>
+struct pack_type<int32_t> {
+  using type = int16_t;
+};
+
+namespace { /* unnamed namespace */
+
+template ::type>
+std::pair, Vectorized> unpack(const Vectorized& x) {
+  auto vec0 = vec_unpackh(x.vec0());
+  auto vec1 = vec_unpackl(x.vec0());
+  auto vec2 = vec_unpackh(x.vec1());
+  auto vec3 = vec_unpackl(x.vec1());
+  return {Vectorized{vec0, vec1}, Vectorized{vec2, vec3}};
+}
+
+template <>
+std::pair, Vectorized> unpack(
+    const Vectorized& x) {
+  using typeX = typename Vectorized::vtype;
+  typeX vec0 = vec_unpackh(x.vec0());
+  typeX vec1 = vec_unpackl(x.vec0());
+  typeX vec2 = vec_unpackh(x.vec1());
+  typeX vec3 = vec_unpackl(x.vec1());
+  // auto mask = Vectorized(0xFF);
+  // vec0 = vec0 & mask;
+  // vec1 = vec1 & mask;
+  // vec2 = vec2 & mask;
+  // vec3 = vec3 & mask;
+  return {
+      cast_zvector(Vectorized{vec0, vec1}),
+      cast_zvector(Vectorized{vec2, vec3})};
+}
+
+template ::type>
+Vectorized pack(const Vectorized& first, const Vectorized& second) {
+  auto vec0 = vec_packs(first.vec0(), first.vec1());
+  auto vec1 = vec_packs(second.vec0(), second.vec1());
+  return Vectorized{vec0, vec1};
+}
+
+template <>
+Vectorized pack(
+    const Vectorized& first,
+    const Vectorized& second) {
+  auto vec0 = vec_packsu(first.vec0(), first.vec1());
+  auto vec1 = vec_packsu(second.vec0(), second.vec1());
+  return Vectorized{vec0, vec1};
+}
+
+} /* unnamed namespace */
+
+//////////////////////////////////QUANT///////////////////////////////////////////
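+// Quantized vectors wrap a single integer Vectorized (vinner_type) holding
+// the raw quantized values. dequantize() converts them to float (widening the
+// 8-bit types to int32 via unpack() first), so one quantized vector expands
+// to float_num_vecs() float vectors: 1 for qint32, 4 for the 8-bit types.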
+template <typename T>
+struct Vectorized<T, std::enable_if_t<is_zarch_implemented_quant<T>()>> {
+ public:
+  using value_type = typename T::underlying;
+  using vtype = ZSimdVect<value_type>;
+  using vmaskType = ZSimdVectBinary<value_type>;
+  using vinner_type = Vectorized<value_type>;
+  using size_type = int;
+
+  static constexpr size_type size() {
+    return VECTOR_WIDTH / sizeof(value_type);
+  }
+
+  static constexpr size_t float_num_vecs() {
+    return size() / Vectorized<float>::size();
+  }
+  static constexpr int int_num_vecs() {
+    return float_num_vecs();
+  }
+  using float_vec_return_type = std::array<Vectorized<float>, float_num_vecs()>;
+  using int_vec_return_type =
+      std::array<Vectorized<c10::qint32>, int_num_vecs()>;
+
+ private:
+  vinner_type _vec;
+
+ public:
+  Vectorized() {}
+
+  explicit C10_ALWAYS_INLINE Vectorized(vinner_type v) : _vec{v} {}
+  Vectorized(const T& val) : _vec(val.val_) {}
+
+  C10_ALWAYS_INLINE const vinner_type& vec() const {
+    return _vec;
+  }
+
+  static Vectorized C10_ALWAYS_INLINE
+  loadu(const void* ptr, int count = size()) {
+    return Vectorized{vinner_type::loadu(ptr, count)};
+  }
+
+  void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const {
+    _vec.store(ptr, count);
+  }
+
+  Vectorized relu(Vectorized zero_point) const {
+    return Vectorized{_vec.maximum(zero_point._vec)};
+  }
+
+  Vectorized relu6(Vectorized zero_point, Vectorized q_six) const {
+    auto ret_max = _vec.maximum(zero_point._vec);
+    auto ret_min = ret_max.minimum(q_six._vec);
+    return Vectorized{ret_min};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t::float_num_vecs() == 1, int> = 0>
+  int_vec_return_type widening_subtract(Vectorized b) const {
+    return {*this - b};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t::float_num_vecs() == 1, int> = 0>
+  float_vec_return_type dequantize(
+      Vectorized scale,
+      Vectorized zero_point,
+      Vectorized scale_zp_premul) const {
+    auto float_val = convert_to_float(_vec);
+    return {fmadd(scale, float_val, scale_zp_premul)};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t::float_num_vecs() == 1, int> = 0>
+  float_vec_return_type dequantize(
+      Vectorized scale,
+      Vectorized zero_point) const {
+    auto float_val = convert_to_float(_vec);
+    return {(float_val - zero_point) * scale};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t::float_num_vecs() == 1, int> = 0>
+  static Vectorized quantize(
+      const float_vec_return_type& rhs,
+      float scale,
+      int32_t zero_point,
+      float inverse_scale) {
+    Vectorized vecf = rhs[0];
+    vecf = vecf * Vectorized(inverse_scale);
+    vecf = vecf.rint() + Vectorized((float)(zero_point));
+    auto veci = convert_to_int(vecf);
+
+    return Vectorized{veci};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t::int_num_vecs() == 1, int> = 0>
+  static Vectorized requantize_from_int(
+      const int_vec_return_type& inp,
+      float multiplier,
+      int32_t zero_point) {
+    Vectorized vi = inp[0];
+    auto vecf = convert_to_float(vi.vec());
+    vecf = vecf * Vectorized(multiplier);
+    vecf = vecf.rint();
+    auto veci = convert_to_int(vecf) + Vectorized(zero_point);
+
+    return Vectorized{veci};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t::int_num_vecs() == 4, int> = 0>
+  int_vec_return_type widening_subtract(Vectorized b) const {
+    auto ret16 = unpack(_vec);
+    auto ret16B = unpack(b.vec());
+    auto ret32_0 = unpack(ret16.first);
+    auto ret32_1 = unpack(ret16.second);
+    auto ret32B_0 = unpack(ret16B.first);
+    auto ret32B_1 = unpack(ret16B.second);
+
+    return {
+        Vectorized(ret32_0.first - ret32B_0.first),
+        Vectorized(ret32_0.second - ret32B_0.second),
+        Vectorized(ret32_1.first - ret32B_1.first),
+        Vectorized(ret32_1.second - ret32B_1.second)};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t::float_num_vecs() == 4, int> = 0>
+  float_vec_return_type C10_ALWAYS_INLINE dequantize(
+      Vectorized scale,
+      Vectorized zero_point,
+      Vectorized scale_zp_premul) const {
+    // unpacking unsigned as signed
+    auto ret16 = unpack(_vec);
+    auto ret32_0 = unpack(ret16.first);
+    auto ret32_1 = unpack(ret16.second);
+
+    auto vecf_0 = convert_to_float(ret32_0.first);
+    auto vecf_1 = convert_to_float(ret32_0.second);
+
+    auto vecf_2 = convert_to_float(ret32_1.first);
+    auto vecf_3 = convert_to_float(ret32_1.second);
+    return {
+        fmadd(scale, vecf_0, scale_zp_premul),
+        fmadd(scale, vecf_1, scale_zp_premul),
+        fmadd(scale, vecf_2, scale_zp_premul),
+        fmadd(scale, vecf_3, scale_zp_premul)};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t::float_num_vecs() == 4, int> = 0>
+  float_vec_return_type dequantize(
+      Vectorized scale,
+      Vectorized zero_point) const {
+    // unpacking unsigned as signed
+    auto ret16 = unpack(_vec);
+    auto ret32_0 = unpack(ret16.first);
+    auto ret32_1 = unpack(ret16.second);
+
+    auto vecf_0 = convert_to_float(ret32_0.first);
+    auto vecf_1 = convert_to_float(ret32_0.second);
+
+    auto vecf_2 = convert_to_float(ret32_1.first);
+    auto vecf_3 = convert_to_float(ret32_1.second);
+
+    return {
+        (vecf_0 - zero_point) * scale,
+        (vecf_1 - zero_point) * scale,
+        (vecf_2 - zero_point) * scale,
+        (vecf_3 - zero_point) * scale };
+  }
+
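+  // Quantization of four float vectors back to the 8-bit layout: scale by
+  // inverse_scale, round, add the zero point, convert to int32, then
+  // saturating-pack 32->16->8 bits.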
+  template <
+      typename U = T,
+      std::enable_if_t::float_num_vecs() == 4, int> = 0>
+  static Vectorized quantize(
+      const float_vec_return_type& rhs,
+      float scale,
+      int32_t zero_point,
+      float inverse_scale) {
+    auto vec_inverse = Vectorized(inverse_scale);
+    auto vec_zero_point = Vectorized((float)zero_point);
+
+    auto vecf0 = rhs[0];
+    auto vecf2 = rhs[1];
+    auto vecf4 = rhs[2];
+    auto vecf6 = rhs[3];
+
+    vecf0 = vecf0 * vec_inverse;
+    vecf2 = vecf2 * vec_inverse;
+    vecf4 = vecf4 * vec_inverse;
+    vecf6 = vecf6 * vec_inverse;
+
+    vecf0 = vecf0.rint() + vec_zero_point;
+    vecf2 = vecf2.rint() + vec_zero_point;
+    vecf4 = vecf4.rint() + vec_zero_point;
+    vecf6 = vecf6.rint() + vec_zero_point;
+
+    auto veci0 = convert_to_int(vecf0);
+    auto veci2 = convert_to_int(vecf2);
+    auto veci4 = convert_to_int(vecf4);
+    auto veci6 = convert_to_int(vecf6);
+
+    auto vecshi0 = pack(veci0, veci2);
+    auto vecshi2 = pack(veci4, veci6);
+    auto ret = pack(vecshi0, vecshi2);
+
+    return Vectorized{ret};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t::int_num_vecs() == 4, int> = 0>
+  static Vectorized requantize_from_int(
+      const int_vec_return_type& inp,
+      float multiplier,
+      int32_t zero_point) {
+    Vectorized vec_multiplier = Vectorized(multiplier);
+    Vectorized vec_zero_point = Vectorized(zero_point);
+
+    Vectorized vi0 = inp[0];
+    Vectorized vi1 = inp[1];
+    Vectorized vi2 = inp[2];
+    Vectorized vi3 = inp[3];
+
+    auto vecf0 = convert_to_float(vi0.vec());
+    auto vecf2 = convert_to_float(vi1.vec());
+
+    auto vecf4 = convert_to_float(vi2.vec());
+    auto vecf6 = convert_to_float(vi3.vec());
+
+    vecf0 = vecf0 * vec_multiplier;
+    vecf2 = vecf2 * vec_multiplier;
+
+    vecf4 = vecf4 * vec_multiplier;
+    vecf6 = vecf6 * vec_multiplier;
+
+    vecf0 = vecf0.rint();
+    vecf2 = vecf2.rint();
+    vecf4 = vecf4.rint();
+    vecf6 = vecf6.rint();
+
+    auto veci0 = convert_to_int(vecf0);
+    auto veci2 = convert_to_int(vecf2);
+    auto veci4 = convert_to_int(vecf4);
+    auto veci6 = convert_to_int(vecf6);
+
+    veci0 = veci0 + vec_zero_point;
+    veci2 = veci2 + vec_zero_point;
+
+    veci4 = veci4 + vec_zero_point;
+    veci6 = veci6 + vec_zero_point;
+
+    auto vecshi0 = pack(veci0, veci2);
+    auto vecshi2 = pack(veci4, veci6);
+
+    auto ret = pack(vecshi0, vecshi2);
+
+    return Vectorized{ret};
+  }
+
+  Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& other) const {
+    return Vectorized{_vec + other._vec};
+  }
+
+  Vectorized C10_ALWAYS_INLINE operator-(const Vectorized& other) const {
+    return Vectorized{_vec - other._vec};
+  }
+
+  Vectorized C10_ALWAYS_INLINE operator*(const Vectorized& other) const {
+    return Vectorized{_vec * other._vec};
+  }
+
+  Vectorized C10_ALWAYS_INLINE operator/(const Vectorized& other) const {
+    return Vectorized{_vec / other._vec};
+  }
+
+  Vectorized C10_ALWAYS_INLINE operator&(const Vectorized& other) const {
+    return Vectorized{_vec & other._vec};
+  }
+
+  Vectorized C10_ALWAYS_INLINE operator|(const Vectorized& other) const {
+    return Vectorized{_vec | other._vec};
+  }
+
+  Vectorized C10_ALWAYS_INLINE operator^(const Vectorized& other) const {
+    return Vectorized{_vec ^ other._vec};
+  }
+  Vectorized C10_ALWAYS_INLINE operator==(const Vectorized& other) const {
+    return Vectorized{_vec == other._vec};
+  }
+
+  Vectorized C10_ALWAYS_INLINE operator!=(const Vectorized& other) const {
+    return Vectorized{_vec != other._vec};
+  }
+  Vectorized C10_ALWAYS_INLINE operator>(const Vectorized& other) const {
+    return Vectorized{_vec > other._vec};
+  }
+  Vectorized C10_ALWAYS_INLINE operator>=(const Vectorized& other) const {
+    return Vectorized{_vec >= other._vec};
+  }
+
+  Vectorized C10_ALWAYS_INLINE operator<(const Vectorized& other) const {
+    return Vectorized{_vec < other._vec};
+  }
+
+  Vectorized C10_ALWAYS_INLINE operator<=(const Vectorized& other) const {
+    return Vectorized{_vec <= other._vec};
+  }
+
+  Vectorized C10_ALWAYS_INLINE eq(const Vectorized& other) const {
+    return Vectorized{_vec.eq(other._vec)};
+  }
+  Vectorized C10_ALWAYS_INLINE ne(const Vectorized& other) const {
+    return Vectorized{_vec.ne(other._vec)};
+  }
+  Vectorized C10_ALWAYS_INLINE gt(const Vectorized& other) const {
+    return Vectorized{_vec.gt(other._vec)};
+  }
+  Vectorized C10_ALWAYS_INLINE ge(const Vectorized& other) const {
+    return Vectorized{_vec.ge(other._vec)};
+  }
+  Vectorized C10_ALWAYS_INLINE lt(const Vectorized& other) const {
+    return Vectorized{_vec.lt(other._vec)};
+  }
+  Vectorized C10_ALWAYS_INLINE le(const Vectorized& other) const {
+    return Vectorized{_vec.le(other._vec)};
+  }
+
+  Vectorized clamp_min(const Vectorized& min) const {
+    return Vectorized{_vec.clamp_min(min._vec)};
+  }
+
+  Vectorized clamp_max(const Vectorized& max) const {
+    return Vectorized{_vec.clamp_max(max._vec)};
+  }
+
+  Vectorized minimum(const Vectorized& other) const {
+    return Vectorized{_vec.minimum(other._vec)};
+  }
+
+  Vectorized maximum(const Vectorized& other) const {
+    return Vectorized{_vec.maximum(other._vec)};
+  }
+};
+
+DEFINE_CLAMP_MAXMIN_FUNCS(c10::quint8)
+DEFINE_CLAMP_MAXMIN_FUNCS(c10::qint8)
+DEFINE_CLAMP_MAXMIN_FUNCS(c10::qint32)
+
+template 
+constexpr auto real_mask() {
+  return (ZSimdVect)ZSimdVectBinary{0xFFFFFFFF, 0, 0xFFFFFFFF, 0};
+}
+
+template <>
+constexpr auto real_mask() {
+  return (ZSimdVect)ZSimdVectBinary{0xFFFFFFFFFFFFFFFF, 0};
+}
+
+template 
+constexpr auto image_mask() {
+  return (ZSimdVect)ZSimdVectBinary{0, 0xFFFFFFFF, 0, 0xFFFFFFFF};
+}
+
+template <>
+constexpr auto image_mask() {
+  return (ZSimdVect)ZSimdVectBinary{0, 0xFFFFFFFFFFFFFFFF};
+}
+
+template 
+constexpr auto rsign_mask() {
+  return ZSimdVect{-0.f, 0.f, -0.f, 0.f};
+}
+
+template <>
+constexpr auto rsign_mask() {
+  return ZSimdVect{-0.0, 0.f};
+}
+
+template 
+constexpr auto isign_mask() {
+  return ZSimdVect{0.0, -0.f, 0.0, -0.f};
+}
+
+template <>
+constexpr auto isign_mask() {
+  return ZSimdVect{0.0, -0.0};
+}
+
+template 
+constexpr auto image_one() {
+  return ZSimdVect{0, 1.f, 0, 1.f};
+}
+
+template <>
+constexpr auto image_one() {
+  return ZSimdVect{0.0, 1.0};
+}
+
+template 
+constexpr auto pi_half() {
+  return ZSimdVect{(float)(M_PI / 2.0), 0.f, (float)(M_PI / 2.0), 0.f};
+}
+
+template <>
+constexpr auto pi_half() {
+  return ZSimdVect{M_PI / 2.0, 0.0};
+}
+
+template 
+constexpr auto image_half() {
+  return ZSimdVect{0, 0.5f, 0, 0.5f};
+}
+
+template <>
+constexpr auto image_half() {
+  return ZSimdVect{0.0, 0.5};
+}
+
+template <typename U>
+constexpr U log2e_inv() {
+  return static_cast<U>(1.4426950408889634);
+}
+
+template <typename U>
+constexpr U log10e_inv() {
+  return static_cast<U>(0.43429448190325176);
+}
+
+template <typename T>
+struct Vectorized<T, std::enable_if_t<is_zarch_implemented_complex<T>()>> {
+ public:
+  using underline_type = decltype(std::declval<T>().imag());
+  using value_type = T;
+  using vtype = ZSimdVect<underline_type>;
+  using vmaskType = ZSimdVectBinary<underline_type>;
+  using vinner_type = Vectorized<underline_type>;
+  using size_type = int;
+  using vinner_data = typename Vectorized<underline_type>::vinner_data;
+
+  static constexpr size_type size() {
+    return VECTOR_WIDTH / sizeof(value_type);
+  }
+
+ private:
+  vinner_type _vec;
+
+ public:
+  Vectorized() {}
+
+  C10_ALWAYS_INLINE Vectorized(const vinner_data &v) : _vec{v.first, v.second} {}
+
+  template  = 0>
+  C10_ALWAYS_INLINE Vectorized(T s1, T s2)
+      : _vec{s1.real(), s1.imag(), s2.real(), s2.imag()} {}
+
+  template  = 0>
+  C10_ALWAYS_INLINE Vectorized(T s1, T s2, T s3, T s4)
+      : _vec{
+            s1.real(),
+            s1.imag(),
+            s2.real(),
+            s2.imag(),
+            s3.real(),
+            s3.imag(),
+            s4.real(),
+            s4.imag()} {}
+
+  template  = 0>
+  C10_ALWAYS_INLINE Vectorized(T s) : Vectorized(s, s) {}
+
+  template  = 0>
+  C10_ALWAYS_INLINE Vectorized(T s) : Vectorized(s, s, s, s) {}
+
+  C10_ALWAYS_INLINE operator vinner_type() const {
+    return _vec;
+  }
+
+  C10_ALWAYS_INLINE const vinner_type& vec() const {
+    return _vec;
+  }
+
+  C10_ALWAYS_INLINE operator vinner_data() const {
+    return _vec.data();
+  }
+
+  C10_ALWAYS_INLINE vinner_data data() const {
+    return _vec.data();
+  }
+
+  static Vectorized C10_ALWAYS_INLINE
+  loadu(const void* ptr, int count = size()) {
+    return Vectorized{vinner_type::loadu(ptr, 2 * count)};
+  }
+
+  void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const {
+    return _vec.store(ptr, 2 * count);
+  }
+
+  static Vectorized blendv(
+      const Vectorized& a,
+      const Vectorized& b,
+      const Vectorized& mask) {
+    // convert std::complex index mask to V index mask: xy -> xxyy
+    vinner_type vmask = mask.vec();
+    auto mask_complex = vinner_type(
+        vec_mergeh(vmask.vec0(), vmask.vec0()),
+        vec_mergeh(vmask.vec1(), vmask.vec1()));
+    return Vectorized{vinner_type::blendv(a.vec(), b.vec(), mask_complex)};
+  }
+
+  template 
+  static auto C10_ALWAYS_INLINE
+  blend(const Vectorized& a, const Vectorized& b) {
+    constexpr int mask_complex = maskForComplex(mask);
+    return Vectorized{
+        vinner_type::template blend(a.vec(), b.vec())};
+  }
+
+  template 
+  static std::enable_if_t> arange(
+      T base = 0,
+      step_t step = static_cast(1)) {
+    return Vectorized(base, base + step);
+  }
+
+  template 
+  static std::enable_if_t> arange(
+      T base = 0,
+      step_t step = static_cast(1)) {
+    return Vectorized(
+        base,
+        base + step,
+        base + value_type(2) * step,
+        base + value_type(3) * step);
+  }
+
+  template 
+  static inline std::enable_if_t<(Z >= C), Vectorized> set_inner(
+      const Vectorized& a,
+      const Vectorized& b,
+      size_t count) {
+    return b;
+  }
+
+  template 
+  static inline std::enable_if_t<(Z < C), Vectorized> set_inner(
+      const Vectorized& a,
+      const Vectorized& b,
+      size_t count) {
+    if (count == Z)
+      return blend(a, b);
+    else
+      return set_inner(a, b, count);
+  }
+
+  static Vectorized set(
+      const Vectorized& a,
+      const Vectorized& b,
+      size_t count = size()) {
+    if (count == 0)
+      return a;
+    return set_inner<1, size()>(a, b, count);
+  }
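+  // set_inner is a compile-time unrolled dispatch over the runtime count:
+  // it walks Z = 1..size() through template recursion and, when Z == count,
+  // emits the blend whose mask takes the first Z elements from b and the
+  // rest from a. The effect is the same as a switch over every legal count,
+  // with the branches generated from the two set_inner overloads above.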
+
+  const T& operator[](int idx) const = delete;
+  T& operator[](int idx) = delete;
+
+  template <
+      typename U = T,
+      std::enable_if_t<std::is_same<U, c10::complex<float>>::value, int> = 0>
+  Vectorized mapOrdinary(T (*const f)(const T&)) const {
+    auto v0 = _vec.vec0();
+    auto v1 = _vec.vec1();
+    return Vectorized{
+        f(T(v0[0], v0[1])),
+        f(T(v0[2], v0[3])),
+        f(T(v1[0], v1[1])),
+        f(T(v1[2], v1[3]))};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t<std::is_same<U, c10::complex<double>>::value, int> = 0>
+  Vectorized mapOrdinary(T (*const f)(const T&)) const {
+    auto v0 = _vec.vec0();
+    auto v1 = _vec.vec1();
+    return Vectorized{f(T(v0[0], v0[1])), f(T(v1[0], v1[1]))};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t<std::is_same<U, c10::complex<float>>::value, int> = 0>
+  Vectorized mapOrdinary(T (*const f)(T)) const {
+    auto v0 = _vec.vec0();
+    auto v1 = _vec.vec1();
+    return Vectorized{
+        f(T(v0[0], v0[1])),
+        f(T(v0[2], v0[3])),
+        f(T(v1[0], v1[1])),
+        f(T(v1[2], v1[3]))};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t<std::is_same<U, c10::complex<double>>::value, int> = 0>
+  Vectorized mapOrdinary(T (*const f)(T)) const {
+    auto v0 = _vec.vec0();
+    auto v1 = _vec.vec1();
+    return Vectorized{f(T(v0[0], v0[1])), f(T(v1[0], v1[1]))};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t<std::is_same<U, c10::complex<float>>::value, int> = 0>
+  inline Vectorized mapOrdinary(
+      T (*const f)(const T&, const T&),
+      const Vectorized& b) const {
+    auto v0 = _vec.vec0();
+    auto v1 = _vec.vec1();
+    auto bvec = b.vec();
+    auto b0 = bvec.vec0();
+    auto b1 = bvec.vec1();
+    T a00 = f(T(v0[0], v0[1]), T(b0[0], b0[1]));
+    T a01 = f(T(v0[2], v0[3]), T(b0[2], b0[3]));
+    T a02 = f(T(v1[0], v1[1]), T(b1[0], b1[1]));
+    T a03 = f(T(v1[2], v1[3]), T(b1[2], b1[3]));
+    return Vectorized{a00, a01, a02, a03};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t<std::is_same<U, c10::complex<double>>::value, int> = 0>
+  inline Vectorized mapOrdinary(
+      T (*const f)(const T&, const T&),
+      const Vectorized& b) const {
+    auto v0 = _vec.vec0();
+    auto v1 = _vec.vec1();
+    auto bvec = b.vec();
+    auto b0 = bvec.vec0();
+    auto b1 = bvec.vec1();
+    U a00 = f(U(v0[0], v0[1]), U(b0[0], b0[1]));
+    U a01 = f(U(v1[0], v1[1]), U(b1[0], b1[1]));
+    return Vectorized{a00, a01};
+  }
+
+  Vectorized C10_ALWAYS_INLINE operator+(const Vectorized& other) const {
+    return Vectorized{_vec + other._vec};
+  }
+
+  Vectorized C10_ALWAYS_INLINE operator-(const Vectorized& other) const {
+    return Vectorized{_vec - other._vec};
+  }
+
+  Vectorized inline operator*(const Vectorized& b) const {
+    //(a + bi)  * (c + di) = (ac - bd) + (ad + bc)i
+    vinner_type bv = b.vec();
+#if !defined(ZVECTOR_SIMULATE_X86_MULT)
+    // this is more z arch friendly than simulating horizontal from x86
+    vinner_type vi = bv.mergeo();
+    vinner_type vr = bv.mergee();
+    vi = vi ^ rsign_mask();
+    vinner_type ret = _vec * vr;
+    vinner_type vx_swapped = _vec.swapped();
+    ret = fmadd(vx_swapped, vi, ret);
+#else
+    vinner_type ac_bd = _vec * b;
+    vinner_type d_c = bv.swapped();
+    d_c = d_c ^ isign_mask();
+    vinner_type ad_bc = _vec * d_c;
+    vinner_type ret = vinner_type::horizontal_sub_perm(ac_bd, ad_bc);
+#endif
+    return Vectorized{ret};
+  }
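+  // Lane-by-lane, the non-simulated branch above evaluates the formula in
+  // the comment as follows (a and b are this element, c and d the other):
+  //   vr      = {c, c}                 // mergee duplicates the real part
+  //   vi      = {d, d} ^ {-0.f, 0.f}   // mergeo + rsign_mask -> {-d, d}
+  //   ret     = {a*c, b*c}
+  //   swapped = {b, a}
+  //   fmadd   = {b*(-d) + a*c, a*d + b*c} = {ac - bd, ad + bc}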
+
+  template <
+      typename U = T,
+      std::enable_if_t<std::is_same<U, c10::complex<float>>::value, int> = 0>
+  static typename Vectorized::vinner_type real_neg(const typename Vectorized::vinner_type &a)
+  {
+    const auto swap_mask = ZSimdVectBinary{
+      0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31};
+
+    auto a_neg = a.neg();
+    vtype v0 = vec_perm(a_neg.vec0(), a.vec0(), swap_mask);
+    vtype v1 = vec_perm(a_neg.vec1(), a.vec1(), swap_mask);
+    return {v0, v1};
+  }
+
+  template <
+      typename U = T,
+      std::enable_if_t<std::is_same<U, c10::complex<double>>::value, int> = 0>
+  static typename Vectorized::vinner_type real_neg(const typename Vectorized::vinner_type &a)
+  {
+    auto a_neg = a.neg();
+    auto v0 = vec_permi(a_neg.vec0(), a.vec0(), 1);
+    auto v1 = vec_permi(a_neg.vec1(), a.vec1(), 1);
+    return { v0, v1 };
+  }
+
+  Vectorized inline operator/(const Vectorized& b) const {
+    // Unfortunately, this breaks some tests
+    // Implement it like it's done for avx2
+    auto fabs_cd = b.vec().abs();                               // |c|    |d|
+    auto fabs_dc = fabs_cd.swapped();                           // |d|    |c|
+    auto scale = vinner_type {1.0} / maximum(fabs_cd, fabs_dc); // 1/sc     1/sc
+    auto a2 = vec() * scale;                                    // a/sc     b/sc
+    auto b2 = b.vec() * scale;                                  // c/sc     d/sc
+    auto acbd2 = a2 * b2;                                       // ac/sc^2  bd/sc^2
+
+    auto dc2 = b2.swapped();                                    // d/sc         c/sc
+    dc2 = Vectorized::real_neg(dc2);                         // -d/|c,d|        c/sc
+    auto adbc2 = a2 * dc2;                                      // -ad/sc^2      bc/sc^2
+    auto sum1 = acbd2 + acbd2.swapped();                        // (ac+bd)/sc^2  (ac+bd)/sc^2
+    auto sum2 = adbc2 + adbc2.swapped();                        // (bc-ad)/sc^2  (bc-ad)/sc^2
+    auto res2 = vinner_type::mergee(sum1, sum2);                // (ac+bd)/sc^2  (bc-ad)/sc^2
+
+    // get the denominator
+    auto denom2 = Vectorized{b2}.abs_2_();                   // (c^2+d^2)/sc^2   (c^2+d^2)/sc^2
+    res2 = res2 / denom2;
+    return Vectorized{ res2 };
+  }
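+  // The pre-scaling by 1 / max(|c|, |d|) above is the usual guard against
+  // overflow and underflow in complex division: numerator and denominator
+  // are both scaled before (ac + bd), (bc - ad) and |c|^2 + |d|^2 are formed,
+  // and the common factor cancels in the final division. Per lane this is:
+  //   sc = 1 / max(|c|, |d|);
+  //   re = ((a*sc)*(c*sc) + (b*sc)*(d*sc)) / ((c*sc)*(c*sc) + (d*sc)*(d*sc));
+  //   im = ((b*sc)*(c*sc) - (a*sc)*(d*sc)) / ((c*sc)*(c*sc) + (d*sc)*(d*sc));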
+
+  Vectorized angle2_() const {
+    auto b_a = _vec.swapped(); // b        a
+    return Vectorized{_vec.atan2(b_a).swapped()};
+  }
+
+  Vectorized angle() const {
+    return angle2_().real();
+  }
+
+  Vectorized atan() const {
+    // atan(x) = i/2 * ln((i + z)/(i - z))
+    auto ione = Vectorized{vinner_type(image_one())};
+    auto sum = ione + *this;
+    auto sub = ione - *this;
+    auto ln = (sum / sub).log(); // ln((i + z)/(i - z))
+    return ln *
+        Vectorized{vinner_type(image_half())}; // i/2*ln()
+  }
+
+  Vectorized atanh() const {
+    return mapOrdinary(std::atanh);
+  }
+
+  Vectorized asin() const {
+    // asin(x)
+    // = -i*ln(iz + sqrt(1 -z^2))
+    // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi)))
+    // = -i*ln((-b + ai) + sqrt(1 - (a**2 - b**2) - 2*abi))
+#if 1
+    vinner_type cnj = conj().vec();
+    vinner_type b_a = cnj.swapped();
+    vinner_type ab = cnj * b_a;
+    vinner_type im = ab + ab;
+    vinner_type val_2 = _vec * _vec;
+    vinner_type val_2_swapped = val_2.swapped();
+    vinner_type re = vinner_type::horizontal_sub_perm(val_2, val_2_swapped);
+    re = vinner_type(static_cast(1)) - re;
+    constexpr int blend_mask =
+        blend_choice<T>(); // 0x0A for complex<double> , 0xAA for complex<float>
+    vinner_type blendx = vinner_type::template blend<blend_mask>(re, im);
+    auto root = Vectorized(blendx).sqrt();
+    auto ln = Vectorized(Vectorized(b_a) + root).log();
+    return Vectorized(ln.vec().swapped()).conj();
+#else
+    return mapOrdinary(std::asin);
+#endif
+  }
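+  // Mapping the vector steps above onto the identity in the comment:
+  //   b_a    = i*z                  ({-b, a})
+  //   blendx = 1 - z*z              (re = 1 - (a*a - b*b), im = -2*a*b)
+  //   ln     = ln(i*z + sqrt(1 - z*z))
+  //   the final swap + conj multiplies by -i, giving asin(z).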
+
+  Vectorized acos() const {
+    // acos(x) = pi/2 - asin(x)
+    return Vectorized(vinner_type(pi_half())) - asin();
+  }
+
+  Vectorized sin() const {
+    return mapOrdinary(std::sin);
+  }
+  Vectorized sinh() const {
+    return mapOrdinary(std::sinh);
+  }
+  Vectorized cos() const {
+    return mapOrdinary(std::cos);
+  }
+  Vectorized cosh() const {
+    return mapOrdinary(std::cosh);
+  }
+  Vectorized ceil() const {
+    return Vectorized{_vec.ceil()};
+  }
+  Vectorized floor() const {
+    return Vectorized{_vec.floor()};
+  }
+  Vectorized neg() const {
+    return Vectorized(_vec.neg());
+  }
+  Vectorized round() const {
+    return Vectorized{_vec.round()};
+  }
+  Vectorized tan() const {
+    return mapOrdinary(std::tan);
+  }
+  Vectorized tanh() const {
+    return mapOrdinary(std::tanh);
+  }
+  Vectorized trunc() const {
+    return Vectorized{_vec.trunc()};
+  }
+
+  Vectorized C10_ALWAYS_INLINE operator&(const Vectorized& other) const {
+    return Vectorized{_vec & other._vec};
+  }
+
+  Vectorized C10_ALWAYS_INLINE operator|(const Vectorized& other) const {
+    return Vectorized{_vec | other._vec};
+  }
+
+  Vectorized C10_ALWAYS_INLINE operator^(const Vectorized& other) const {
+    return Vectorized{_vec ^ other._vec};
+  }
+  Vectorized C10_ALWAYS_INLINE operator==(const Vectorized& other) const {
+    return Vectorized{_vec == other._vec};
+  }
+
+  Vectorized C10_ALWAYS_INLINE operator!=(const Vectorized& other) const {
+    return Vectorized{_vec != other._vec};
+  }
+
+  Vectorized C10_ALWAYS_INLINE eq(const Vectorized& other) const {
+    auto eq = _vec.eq(other._vec);  // compares real and imag individually
+    // If both real numbers and imag numbers are equal, then the complex numbers are equal
+    auto real = eq & vinner_type(real_mask());
+    auto imag = (eq & vinner_type(image_mask())).swapped();
+    return Vectorized{real & imag};
+  }
+  Vectorized C10_ALWAYS_INLINE ne(const Vectorized& other) const {
+    auto ne = _vec.ne(other._vec);  // compares real and imag individually
+    // If either real numbers or imag numbers are not equal, then the complex numbers are not equal
+    auto real = ne & vinner_type(real_mask());
+    auto imag = (ne & vinner_type(image_mask())).swapped();
+    return Vectorized{real | imag};
+  }
+
+  Vectorized real() const {
+    return Vectorized(_vec & vinner_type(real_mask()));
+  }
+  Vectorized imag_() const {
+    return Vectorized(_vec & vinner_type(image_mask()));
+  }
+  Vectorized imag() const {
+    return Vectorized{
+        (_vec & vinner_type(image_mask())).swapped()};
+  }
+
+  Vectorized conj() const {
+    return Vectorized(_vec ^ vinner_type(isign_mask()));
+  }
+
+  vinner_data abs_2_() const {
+    auto a = _vec * _vec;
+    a = a + a.swapped();
+    return a.mergee().data();
+  }
+
+  static T abs_helper(const T &value)
+  {
+    return T(std::abs(value));
+  }
+
+  Vectorized abs() const {
+    return mapOrdinary(abs_helper);
+  }
+
+  Vectorized exp() const {
+    return mapOrdinary(std::exp);
+  }
+
+  Vectorized exp2() const {
+    return mapOrdinary(exp2_impl);
+  }
+
+  Vectorized expm1() const {
+    return mapOrdinary(std::expm1);
+  }
+
+  Vectorized log() const {
+    return mapOrdinary(std::log);
+  }
+
+  Vectorized log2() const {
+    // log2eB_inv
+    auto ret = log();
+    return Vectorized{ret._vec * vinner_type(log2e_inv())};
+  }
+
+  Vectorized log10() const {
+    auto ret = log();
+    return Vectorized{ret._vec * vinner_type(log10e_inv())};
+  }
+
+  Vectorized log1p() const {
+    return mapOrdinary(std::log1p);
+  }
+
+  Vectorized sgn() const {
+    return mapOrdinary(at::native::sgn_impl);
+  }
+
+  Vectorized pow(const Vectorized& exp) const {
+    return mapOrdinary(std::pow, exp);
+  }
+
+  Vectorized sqrt() const {
+    return mapOrdinary(std::sqrt);
+  }
+
+  Vectorized reciprocal() const {
+    // re + im*i = (a + bi)  / (c + di)
+    // re = (ac + bd)/abs_2() = c/abs_2()
+    // im = (bc - ad)/abs_2() = d/abs_2()
+    vinner_type c_d = _vec ^ vinner_type(isign_mask());
+    vinner_type abs = abs_2_();
+    return Vectorized{c_d / abs};
+  }
+
+  Vectorized rsqrt() const {
+    return sqrt().reciprocal();
+  }
+
+  Vectorized operator<(const Vectorized& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+
+  Vectorized operator<=(const Vectorized& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+
+  Vectorized operator>(const Vectorized& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+
+  Vectorized operator>=(const Vectorized& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+
+  Vectorized lt(const Vectorized& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+
+  Vectorized le(const Vectorized& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+
+  Vectorized gt(const Vectorized& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+
+  Vectorized ge(const Vectorized& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+};
+
+template  = 0>
+std::pair, Vectorized> inline inner_interleave2(
+    const Vectorized& a,
+    const Vectorized& b) {
+  // inputs:
+  //   a      = {a0, a1, a2, a3}
+  //   b      = {b0, b1, b2, b3}
+  using vtype = typename Vectorized::vtype;
+  vtype ab00 = vec_permi(a.vec0(), b.vec0(), 0);
+  vtype ab11 = vec_permi(a.vec0(), b.vec0(), 3);
+  vtype ab2_00 = vec_permi(a.vec1(), b.vec1(), 0);
+  vtype ab2_11 = vec_permi(a.vec1(), b.vec1(), 3);
+  //   return {a0, b0, a1, b1}
+  //          {a2, b2, a3, b3}
+  return std::make_pair(
+      Vectorized{ab00, ab11}, Vectorized{ab2_00, ab2_11});
+}
+
+template  = 0>
+std::pair, Vectorized> inline inner_deinterleave2(
+    const Vectorized& a,
+    const Vectorized& b) {
+  // inputs:
+  //   a = {a0, b0, a1, b1}
+  //   b = {a2, b2, a3, b3}
+  using vtype = typename Vectorized::vtype;
+  vtype aa01 = vec_permi(a.vec0(), a.vec1(), 0);
+  vtype aa23 = vec_permi(b.vec0(), b.vec1(), 0);
+
+  vtype bb_01 = vec_permi(a.vec0(), a.vec1(), 3);
+  vtype bb_23 = vec_permi(b.vec0(), b.vec1(), 3);
+
+  // swap lanes:
+  //   return {a0, a1, a2, a3}
+  //          {b0, b1, b2, b3}
+  return std::make_pair(Vectorized{aa01, aa23}, Vectorized{bb_01, bb_23});
+}
+
+template  = 0>
+std::pair, Vectorized> inline inner_interleave2(
+    const Vectorized& a,
+    const Vectorized& b) {
+  // inputs:
+  //   a = {a0, a1, a2, a3,, a4, a5, a6, a7}
+  //   b = {b0, b1, b2, b3,, b4, b5, b6, b7}
+  using vtype = typename Vectorized::vtype;
+  vtype ab0011 = vec_mergeh(a.vec0(), b.vec0());
+  vtype ab2233 = vec_mergel(a.vec0(), b.vec0());
+
+  vtype ab2_0011 = vec_mergeh(a.vec1(), b.vec1());
+  vtype ab2_2233 = vec_mergel(a.vec1(), b.vec1());
+  // group cols crossing lanes:
+  //   return {a0, b0, a1, b1,, a2, b2, a3, b3}
+  //          {a4, b4, a5, b5,, a6, b6, a7, b7}
+
+  return std::make_pair(
+      Vectorized{ab0011, ab2233}, Vectorized{ab2_0011, ab2_2233});
+}
+
+template  = 0>
+std::pair, Vectorized> inline inner_deinterleave2(
+    const Vectorized& a,
+    const Vectorized& b) {
+  // inputs:
+  //   a = {a0, b0, a1, b1,, a2, b2, a3, b3}
+  //   b = {a4, b4, a5, b5,, a6, b6, a7, b7}
+  using vtype = typename Vectorized::vtype;
+  // {a0,a2,b0,b2} {a1,a3,b1,b3}
+  vtype a0a2b0b2 = vec_mergeh(a.vec0(), a.vec1());
+  vtype a1a3b1b3 = vec_mergel(a.vec0(), a.vec1());
+
+  vtype aa0123 = vec_mergeh(a0a2b0b2, a1a3b1b3);
+  vtype bb0123 = vec_mergel(a0a2b0b2, a1a3b1b3);
+
+  vtype a0a2b0b2_2 = vec_mergeh(b.vec0(), b.vec1());
+  vtype a1a3b1b3_2 = vec_mergel(b.vec0(), b.vec1());
+
+  vtype aa0123_2 = vec_mergeh(a0a2b0b2_2, a1a3b1b3_2);
+  vtype bb0123_2 = vec_mergel(a0a2b0b2_2, a1a3b1b3_2);
+
+  // it could be done with vec_perm ,too
+  // swap lanes:
+  //   return {a0, a1, a2, a3,, a4, a5, a6, a7}
+  //          {b0, b1, b2, b3,, b4, b5, b6, b7}
+
+  return std::make_pair(
+      Vectorized{aa0123, aa0123_2}, Vectorized{bb0123, bb0123_2});
+}
+
+template <>
+std::pair, Vectorized> inline interleave2(
+    const Vectorized& a,
+    const Vectorized& b) {
+  return inner_interleave2(a, b);
+}
+
+template <>
+std::pair, Vectorized> inline interleave2(
+    const Vectorized& a,
+    const Vectorized& b) {
+  return inner_interleave2(a, b);
+}
+
+template <>
+std::pair, Vectorized> inline interleave2(
+    const Vectorized& a,
+    const Vectorized& b) {
+  return inner_interleave2(a, b);
+}
+
+template <>
+std::pair, Vectorized> inline interleave2(
+    const Vectorized& a,
+    const Vectorized& b) {
+  return inner_interleave2(a, b);
+}
+
+template <>
+std::pair, Vectorized> inline deinterleave2(
+    const Vectorized& a,
+    const Vectorized& b) {
+  return inner_deinterleave2(a, b);
+}
+
+template <>
+std::pair, Vectorized> inline deinterleave2<
+    int32_t>(const Vectorized& a, const Vectorized& b) {
+  return inner_deinterleave2(a, b);
+}
+
+template <>
+std::pair, Vectorized> inline deinterleave2(
+    const Vectorized& a,
+    const Vectorized& b) {
+  return inner_deinterleave2(a, b);
+}
+
+template <>
+std::pair, Vectorized> inline deinterleave2<
+    int64_t>(const Vectorized& a, const Vectorized& b) {
+  return inner_deinterleave2(a, b);
+}
+
+template 
+typename std::enable_if::value, at::vec::Vectorized>::type
+inline convert_int8_to_float(const Vectorized &src) {
+  // Note: this function only convert inputs number of elements equal to at::vec::Vectorized.size()
+  // Only handle first 64 bits
+  auto vec_int = src.to_vec_float_helper();
+
+  return convert_to_float(vec_int);
+}
+
+template 
+typename std::enable_if::value, at::vec::Vectorized>::type
+inline convert_float_to_int8(const Vectorized &src) {
+  constexpr auto min_val = std::numeric_limits::min();
+  constexpr auto max_val = std::numeric_limits::max();
+
+  auto vec_int = clamp(convert_to_int(src), Vectorized(min_val), Vectorized(max_val));
+
+  return vec_int.to_vec_uint8_helper();
+}
+
+#undef DEFINE_CLAMP_MAXMIN_FUNCS
+#undef DEFINE_MAXMIN_FUNCS
+} // namespace
+} // namespace vec
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec512/vec512.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec512/vec512.h
new file mode 100644
index 0000000000000000000000000000000000000000..782f0d3950f5bbb2dfe4387e6aa0d1d48aafc14b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec512/vec512.h
@@ -0,0 +1,275 @@
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+namespace vec {
+
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+inline std::ostream& operator<<(std::ostream& stream, const c10::qint32& val) {
+  stream << val.val_;
+  return stream;
+}
+inline std::ostream& operator<<(std::ostream& stream, const c10::qint8& val) {
+  stream << static_cast<int>(val.val_);
+  return stream;
+}
+inline std::ostream& operator<<(std::ostream& stream, const c10::quint8& val) {
+  stream << static_cast<int>(val.val_);
+  return stream;
+}
+
+template <typename T>
+std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
+  T buf[Vectorized<T>::size()];
+  vec.store(buf);
+  stream << "vec[";
+  for (int i = 0; i != Vectorized<T>::size(); i++) {
+    if (i != 0) {
+      stream << ", ";
+    }
+    stream << buf[i];
+  }
+  stream << "]";
+  return stream;
+}
+
+
+#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+template<>
+inline Vectorized cast(const Vectorized& src) {
+  return _mm512_castpd_ps(src);
+}
+
+template<>
+inline Vectorized cast(const Vectorized& src) {
+  return _mm512_castps_pd(src);
+}
+
+template<>
+inline Vectorized cast(const Vectorized& src) {
+  return _mm512_castsi512_ps(src);
+}
+
+template<>
+inline Vectorized cast(const Vectorized& src) {
+  return _mm512_castsi512_pd(src);
+}
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+template
+std::enable_if_t>
+inline gather(const double* base_addr, const Vectorized& vindex) {
+  return _mm512_i64gather_pd(vindex, base_addr, scale);
+}
+
+template
+std::enable_if_t>
+inline gather(const float* base_addr, const Vectorized& vindex) {
+  return _mm512_i32gather_ps(vindex, base_addr, scale);
+}
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+template
+std::enable_if_t>
+inline mask_gather(const Vectorized& src, const double* base_addr,
+                   const Vectorized& vindex, Vectorized& mask) {
+  auto all_ones = _mm512_castsi512_pd(_mm512_set1_epi64(0xFFFFFFFFFFFFFFFF));
+  auto mask_ = _mm512_cmp_pd_mask(all_ones, mask.values, _CMP_EQ_OQ);
+  return _mm512_mask_i64gather_pd(src, mask_, vindex, base_addr, scale);
+}
+
+template
+std::enable_if_t>
+inline mask_gather(const Vectorized& src, const float* base_addr,
+                   const Vectorized& vindex, Vectorized& mask) {
+  auto all_ones = _mm512_castsi512_ps(_mm512_set1_epi32(0xFFFFFFFF));
+  auto mask_ = _mm512_cmp_ps_mask(all_ones, mask.values, _CMP_EQ_OQ);
+  return _mm512_mask_i32gather_ps(src, mask_, vindex, base_addr, scale);
+}
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+template<>
+Vectorized
+inline convert_to_int_of_same_size(const Vectorized &src) {
+  return _mm512_cvtpd_epi64(src);
+}
+
+template<>
+Vectorized
+inline convert_to_int_of_same_size(const Vectorized &src) {
+  return _mm512_cvttps_epi32(src);
+}
+
+template<>
+Vectorized
+inline convert_to_fp_of_same_size(const Vectorized &src) {
+  return _mm512_cvtepi64_pd(src);
+}
+
+template<>
+Vectorized
+inline convert_to_fp_of_same_size(const Vectorized &src) {
+  return _mm512_cvtepi32_ps(src);
+}
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+template <>
+std::pair, Vectorized>
+inline interleave2(const Vectorized& a, const Vectorized& b) {
+  // inputs:
+  //   a = {a0, a1, a2, a3, a4, a5, a6, a7}
+  //   b = {b0, b1, b2, b3, b4, b5, b6, b7}
+  // group cols crossing lanes:
+  //   return {a0, b0, a1, b1, a2, b2, a3, b3}
+  //          {a4, b4, a5, b5, a6, b6, a7, b7}
+  __m512i idx1 = _mm512_set_epi64(11, 3, 10, 2, 9, 1, 8, 0);
+  __m512i idx2 = _mm512_set_epi64(15, 7, 14, 6, 13, 5, 12, 4);
+  return std::make_pair(_mm512_mask_permutex2var_pd(a, 0xff, idx1, b),
+                        _mm512_mask_permutex2var_pd(a, 0xff, idx2, b));
+}
+
+template <>
+std::pair, Vectorized>
+inline interleave2(const Vectorized& a, const Vectorized& b) {
+  // inputs:
+  //   a = {a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15}
+  //   b = {b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15}
+  //
+  //  return:
+  //    {a0, b0, a1, b1, a2, b2, a3, b3, a4, b4, a5, b5, a6, b6, a7, b7}
+  //    {a8, b8, a9, b9, a10, b10, a11, b11, a12, b12, a13, b13, a14, b14, a15, b15}
+  __m512i idx1 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4,
+                                  19, 3, 18, 2, 17, 1, 16, 0);
+  __m512i idx2 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12,
+                                  27, 11, 26, 10, 25, 9, 24, 8);
+  return std::make_pair(_mm512_mask_permutex2var_ps(a, 0xffff, idx1, b),
+                        _mm512_mask_permutex2var_ps(a, 0xffff, idx2, b));
+}
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEINTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+template <>
+std::pair, Vectorized>
+inline deinterleave2(const Vectorized& a, const Vectorized& b) {
+  // inputs:
+  //   a = {a0, b0, a1, b1, a2, b2, a3, b3}
+  //   b = {a4, b4, a5, b5, a6, b6, a7, b7}
+  // output:
+  //   return {a0, a1, a2, a3, a4, a5, a6, a7}
+  //          {b0, b1, b2, b3, b4, b5, b6, b7}
+  // The members of indices have been written in binary format for better understandability
+  __m512i idx1 = _mm512_set_epi64(14, 12, 10, 8, 6, 4, 2, 0);
+  __m512i idx2 = _mm512_set_epi64(15, 13, 11, 9, 7, 5, 3, 1);
+
+  return std::make_pair(_mm512_mask_permutex2var_pd(a, 0xff, idx1, b),
+                        _mm512_mask_permutex2var_pd(a, 0xff, idx2, b));
+}
+
+template <>
+std::pair, Vectorized>
+inline deinterleave2(const Vectorized& a, const Vectorized& b) {
+  // inputs:
+  //   a = {a0, b0, a1, b1, a2, b2, a3, b3, a4, b4, a5, b5, a6, b6, a7, b7}
+  //   b = {a8, b8, a9, b9, a10, b10, a11, b11, a12, b12, a13, b13, a14, b14, a15, b15}
+  // output:
+  //   return {a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15}
+  //          {b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15}
+  __m512i idx1 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16,
+                                  14, 12, 10, 8, 6, 4, 2, 0);
+  __m512i idx2 = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17,
+                                  15, 13, 11, 9, 7, 5, 3, 1);
+
+  return std::make_pair(_mm512_mask_permutex2var_ps(a, 0xffff, idx1, b),
+                        _mm512_mask_permutex2var_ps(a, 0xffff, idx2, b));
+}
+
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FLIP ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+template<>
+inline Vectorized flip(const Vectorized & v) {
+  const __m512i mask = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7,
+                                        8, 9, 10, 11, 12, 13, 14, 15);
+  return _mm512_permutexvar_ps(mask, v);
+}
+
+template<>
+inline Vectorized flip(const Vectorized & v) {
+  const __m512i mask = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+  return _mm512_permutexvar_pd(mask, v);
+}
+
+template<>
+inline Vectorized flip(const Vectorized & v) {
+  const __m512i mask = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+  return _mm512_permutexvar_epi64(mask, v);
+}
+
+template<>
+inline Vectorized flip(const Vectorized & v) {
+  const __m512i mask = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7,
+                                        8, 9, 10, 11, 12, 13, 14, 15);
+  return _mm512_permutexvar_epi32(mask, v);
+}
+
+template<>
+inline Vectorized flip(const Vectorized & v) {
+  const __m512i mask = _mm512_set_epi16(
+      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+      16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+  );
+  return _mm512_permutexvar_epi16(mask, v);
+}
+
+inline __m512i flip8(const __m512i & v) {
+  const __m512i mask1 = _mm512_set_epi8(
+      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  );
+  const __m512i mask2 = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6);
+  auto reversed_vec = _mm512_shuffle_epi8(v, mask1);
+  return _mm512_permutexvar_epi64(mask2, reversed_vec);
+}
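+// flip8 reverses all 64 byte lanes in two steps, because _mm512_shuffle_epi8
+// only permutes bytes within each 128-bit lane: mask1 reverses the 16 bytes
+// inside every lane, and the following _mm512_permutexvar_epi64 with mask2
+// reverses the order of the four 128-bit lanes themselves.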
+
+template<>
+inline Vectorized<int8_t> flip(const Vectorized<int8_t> & v) {
+  return flip8(v);
+}
+
+template<>
+inline Vectorized<uint8_t> flip(const Vectorized<uint8_t> & v) {
+  return flip8(v);
+}
+
+#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
+
+}}}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_bfloat16.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_bfloat16.h
new file mode 100644
index 0000000000000000000000000000000000000000..96180ed1096da1775374d85144a644a95c6ccf1c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_bfloat16.h
@@ -0,0 +1,1644 @@
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/irange.h>
+
+#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
+#include <sleef.h>
+#endif
+
+namespace at {
+namespace vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
+
+// bfloat16 conversion
+static inline void cvtbf16_fp32(const __m256i& a, __m512& o) {
+  o = _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(a), 16));
+}
+
+static inline void cvtbf16_fp32(const __m512i& a, __m512& o1, __m512& o2) {
+  __m256i lo = _mm512_extracti32x8_epi32(a, 0);
+  __m256i hi = _mm512_extracti32x8_epi32(a, 1);
+  cvtbf16_fp32(lo, o1);
+  cvtbf16_fp32(hi, o2);
+}
+
+static inline __m256i cvtfp32_bf16(const __m512& src) {
+  __m512i value = _mm512_castps_si512(src);
+  __m512i nan = _mm512_set1_epi32(0xffff);
+  auto mask_value = _mm512_cmp_ps_mask(src, src, _CMP_ORD_Q);
+  __m512i ones = _mm512_set1_epi32(0x1);
+  __m512i vec_bias = _mm512_set1_epi32(0x7fff);
+  // uint32_t lsb = (input >> 16) & 1;
+  auto t_value = _mm512_and_si512(_mm512_srli_epi32(value, 16), ones);
+  // uint32_t rounding_bias = 0x7fff + lsb;
+  t_value = _mm512_add_epi32(t_value, vec_bias);
+  // input += rounding_bias;
+  t_value = _mm512_add_epi32(t_value, value);
+  // input = input >> 16;
+  t_value = _mm512_srli_epi32(t_value, 16);
+  // Check NaN before converting back to bf16
+  t_value = _mm512_mask_blend_epi32(mask_value, nan, t_value);
+  return _mm512_cvtusepi32_epi16(t_value);
+}
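+// The bias trick above implements round-to-nearest-even on the bits that are
+// dropped when truncating fp32 to bf16. A scalar sketch of the same idea
+// (ignoring the NaN handling done by the mask blend):
+//   uint32_t bits = bit_cast_to_uint32(f);
+//   uint32_t lsb  = (bits >> 16) & 1;   // lowest bit that survives truncation
+//   bits += 0x7fff + lsb;               // ties round toward the even mantissa
+//   uint16_t bf16 = uint16_t(bits >> 16);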
+
+static inline __m512i cvtfp32_bf16(const __m512& a, const __m512& b) {
+  __m512i lo = _mm512_castps_si512(a);
+  __m512i hi = _mm512_castps_si512(b);
+  __m512i nan = _mm512_set1_epi32(0xffff);
+  auto mask_lo = _mm512_cmp_ps_mask(a, a, _CMP_ORD_Q);
+  auto mask_hi = _mm512_cmp_ps_mask(b, b, _CMP_ORD_Q);
+  __m512i ones = _mm512_set1_epi32(0x1);
+  __m512i vec_bias = _mm512_set1_epi32(0x7fff);
+  // uint32_t lsb = (input >> 16) & 1;
+  auto t_lo = _mm512_and_si512(_mm512_srli_epi32(lo, 16), ones);
+  auto t_hi = _mm512_and_si512(_mm512_srli_epi32(hi, 16), ones);
+  // uint32_t rounding_bias = 0x7fff + lsb;
+  t_lo = _mm512_add_epi32(t_lo, vec_bias);
+  t_hi = _mm512_add_epi32(t_hi, vec_bias);
+  // input += rounding_bias;
+  t_lo = _mm512_add_epi32(t_lo, lo);
+  t_hi = _mm512_add_epi32(t_hi, hi);
+  // input = input >> 16;
+  t_lo = _mm512_srli_epi32(t_lo, 16);
+  t_hi = _mm512_srli_epi32(t_hi, 16);
+  // Check NaN before converting back to bf16
+  t_lo = _mm512_mask_blend_epi32(mask_lo, nan, t_lo);
+  t_hi = _mm512_mask_blend_epi32(mask_hi, nan, t_hi);
+
+  t_lo = _mm512_packus_epi32(t_lo, t_hi); // t_hi[4-7] t_lo[4-7] t_hi[0-4] t_lo[0-4]
+  __m512i idx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
+  return _mm512_permutexvar_epi64(idx, t_lo);
+}
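+// _mm512_packus_epi32 interleaves its two sources per 128-bit lane, so the
+// trailing _mm512_permutexvar_epi64 regroups the packed halves: the 16 values
+// converted from `a` land in the low 256 bits of the result and the 16 values
+// converted from `b` in the high 256 bits.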
+
+static inline __m512i merge_compare_result(const __m512& a, const __m512& b) {
+  __m512i lo = _mm512_castps_si512(a);
+  __m512i hi = _mm512_castps_si512(b);
+  lo = _mm512_srli_epi32(lo, 16);
+  hi = _mm512_srli_epi32(hi, 16);
+  auto out = _mm512_packus_epi32(lo, hi);
+  __m512i idx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
+  return _mm512_permutexvar_epi64(idx, out);
+}
+
+// float16 conversion
+static inline void cvtfp16_fp32(const __m256i& a, __m512& o) {
+  o = _mm512_cvtph_ps(a);
+}
+
+static inline void cvtfp16_fp32(const __m512i& a, __m512& o1, __m512& o2) {
+  __m256i lo = _mm512_extracti32x8_epi32(a, 0);
+  __m256i hi = _mm512_extracti32x8_epi32(a, 1);
+  cvtfp16_fp32(lo, o1);
+  cvtfp16_fp32(hi, o2);
+}
+
+static inline __m512i cvtfp32_fp16(const __m512& a, const __m512& b) {
+  __m256i lo = _mm512_cvtps_ph(
+      a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+  __m256i hi = _mm512_cvtps_ph(
+      b, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+  __m512 t_lo = _mm512_castsi512_ps(_mm512_castsi256_si512(lo));
+  __m256 t_hi = _mm256_castsi256_ps(hi);
+  return _mm512_castps_si512(_mm512_insertf32x8(t_lo, t_hi, 1));
+}
+
+// dtype conversion between float16/bfloat16 and float32
+template , int> = 0>
+inline void cvt_to_fp32(const __m256i& a, __m512& o);
+template <> inline void cvt_to_fp32(const __m256i& a, __m512& o) {
+  cvtbf16_fp32(a, o);
+}
+template <> inline void cvt_to_fp32(const __m256i& a, __m512& o) {
+  cvtfp16_fp32(a, o);
+}
+
+template , int> = 0>
+inline void cvt_to_fp32(const __m512i& a, __m512& o1, __m512& o2);
+template <> inline void cvt_to_fp32(const __m512i& a, __m512& o1, __m512& o2) {
+  cvtbf16_fp32(a, o1, o2);
+}
+template <> inline void cvt_to_fp32(const __m512i& a, __m512& o1, __m512& o2) {
+  cvtfp16_fp32(a, o1, o2);
+}
+
+template , int> = 0>
+inline __m512i cvt_from_fp32(const __m512& a, const __m512& b);
+template <> inline __m512i cvt_from_fp32(const __m512& a, const __m512& b) {
+  return cvtfp32_bf16(a, b);
+}
+template <> inline __m512i cvt_from_fp32(const __m512& a, const __m512& b) {
+  return merge_compare_result(a, b);
+}
+template <> inline __m512i cvt_from_fp32(const __m512& a, const __m512& b) {
+  return cvtfp32_fp16(a, b);
+}
+template <> inline __m512i cvt_from_fp32(const __m512& a, const __m512& b) {
+  return cvtfp32_fp16(a, b);
+}
+
+template <typename T>
+class Vectorized16 {
+static_assert(
+  is_reduced_floating_point_v<T>,
+  "Support only float16 and bfloat16.");
+private:
+  __m512i values;
+public:
+  using value_type = uint16_t;
+  using size_type = int;
+  static constexpr size_type size() {
+    return 32;
+  }
+  Vectorized16() {}
+  Vectorized16(__m512i v) : values(v) {}
+  Vectorized16(T val) {
+    value_type uw = val.x;
+    values = _mm512_set1_epi16(uw);
+  }
+  Vectorized16(T val1, T val2, T val3, T val4,
+         T val5, T val6, T val7, T val8,
+         T val9, T val10, T val11, T val12,
+         T val13, T val14, T val15, T val16,
+         T val17, T val18, T val19, T val20,
+         T val21, T val22, T val23, T val24,
+         T val25, T val26, T val27, T val28,
+         T val29, T val30, T val31, T val32) {
+    values = _mm512_set_epi16(
+        val32.x, val31.x, val30.x, val29.x, val28.x, val27.x, val26.x, val25.x,
+        val24.x, val23.x, val22.x, val21.x, val20.x, val19.x, val18.x, val17.x,
+        val16.x, val15.x, val14.x, val13.x, val12.x, val11.x, val10.x, val9.x,
+        val8.x, val7.x, val6.x, val5.x, val4.x, val3.x, val2.x, val1.x);
+  }
+  operator __m512i() const {
+    return values;
+  }
+  T& operator[](int idx) = delete;
+  const T& operator[](int idx) const  = delete;
+  int zero_mask() const {
+    // returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit
+    return _mm512_cmpeq_epi16_mask(values, _mm512_set1_epi16(0));
+  }
+  static Vectorized loadu(const void* ptr, int16_t count = size()) {
+    if (count == size())
+      return _mm512_loadu_si512(reinterpret_cast<const __m512i*>(ptr));
+
+    __mmask32 mask = (1ULL << count) - 1;
+    return _mm512_maskz_loadu_epi16(mask, ptr);
+  }
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr), values);
+    } else if (count > 0) {
+      __mmask32 mask = (1ULL << count) - 1;
+      _mm512_mask_storeu_epi16(ptr, mask, values);
+    }
+  }
+  template 
+  static Vectorized blend(const Vectorized& a, const Vectorized& b) {
+    __at_align__ int16_t tmp_values[size()];
+    a.store(tmp_values);
+    if (mask & 0x01)
+      tmp_values[0] = b.values[31];
+    if (mask & 0x02)
+      tmp_values[1] = b.values[30];
+    if (mask & 0x04)
+      tmp_values[2] = b.values[29];
+    if (mask & 0x08)
+      tmp_values[3] = b.values[28];
+    if (mask & 0x10)
+      tmp_values[4] = b.values[27];
+    if (mask & 0x20)
+      tmp_values[5] = b.values[26];
+    if (mask & 0x40)
+      tmp_values[6] = b.values[25];
+    if (mask & 0x80)
+      tmp_values[7] = b.values[24];
+    if (mask & 0x100)
+      tmp_values[8] = b.values[23];
+    if (mask & 0x200)
+      tmp_values[9] = b.values[22];
+    if (mask & 0x400)
+      tmp_values[10] = b.values[21];
+    if (mask & 0x800)
+      tmp_values[11] = b.values[20];
+    if (mask & 0x1000)
+      tmp_values[12] = b.values[19];
+    if (mask & 0x2000)
+      tmp_values[13] = b.values[18];
+    if (mask & 0x4000)
+      tmp_values[14] = b.values[17];
+    if (mask & 0x8000)
+      tmp_values[15] = b.values[16];
+    if (mask & 0x10000)
+      tmp_values[16] = b.values[15];
+    if (mask & 0x20000)
+      tmp_values[17] = b.values[14];
+    if (mask & 0x40000)
+      tmp_values[18] = b.values[13];
+    if (mask & 0x80000)
+      tmp_values[19] = b.values[12];
+    if (mask & 0x100000)
+      tmp_values[20] = b.values[11];
+    if (mask & 0x200000)
+      tmp_values[21] = b.values[10];
+    if (mask & 0x400000)
+      tmp_values[22] = b.values[9];
+    if (mask & 0x800000)
+      tmp_values[23] = b.values[8];
+    if (mask & 0x1000000)
+      tmp_values[24] = b.values[7];
+    if (mask & 0x2000000)
+      tmp_values[25] = b.values[6];
+    if (mask & 0x4000000)
+      tmp_values[26] = b.values[5];
+    if (mask & 0x8000000)
+      tmp_values[27] = b.values[4];
+    if (mask & 0x10000000)
+      tmp_values[28] = b.values[3];
+    if (mask & 0x20000000)
+      tmp_values[29] = b.values[2];
+    if (mask & 0x40000000)
+      tmp_values[30] = b.values[1];
+    if (mask & 0x80000000)
+      tmp_values[31] = b.values[0];
+    return loadu(tmp_values);
+  }
+  static Vectorized blendv(const Vectorized& a,
+      const Vectorized& b, const Vectorized& mask) {
+    auto all_ones = _mm512_set1_epi16(0xFFFF);
+    auto mask_ = _mm512_cmp_epi16_mask(mask, all_ones, _MM_CMPINT_EQ);
+    return _mm512_mask_blend_epi16(mask_, a.values, b.values);
+  }
+  template
+  static Vectorized arange(T base = 0.f, step_t step = static_cast(1)) {
+    return Vectorized(
+      base,             base +      step, base +  2 * step, base +  3 * step,
+      base +  4 * step, base +  5 * step, base +  6 * step, base +  7 * step,
+      base +  8 * step, base +  9 * step, base + 10 * step, base + 11 * step,
+      base + 12 * step, base + 13 * step, base + 14 * step, base + 15 * step,
+      base + 16 * step, base + 17 * step, base + 18 * step, base + 19 * step,
+      base + 20 * step, base + 21 * step, base + 22 * step, base + 23 * step,
+      base + 24 * step, base + 25 * step, base + 26 * step, base + 27 * step,
+      base + 28 * step, base + 29 * step, base + 30 * step, base + 31 * step);
+  }
+  static Vectorized set(const Vectorized& a,
+      const Vectorized& b, int64_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+      case 4:
+        return blend<15>(a, b);
+      case 5:
+        return blend<31>(a, b);
+      case 6:
+        return blend<63>(a, b);
+      case 7:
+        return blend<127>(a, b);
+      case 8:
+        return blend<255>(a, b);
+      case 9:
+        return blend<511>(a, b);
+      case 10:
+        return blend<1023>(a, b);
+      case 11:
+        return blend<2047>(a, b);
+      case 12:
+        return blend<4095>(a, b);
+      case 13:
+        return blend<8191>(a, b);
+      case 14:
+        return blend<16383>(a, b);
+      case 15:
+        return blend<32767>(a, b);
+      case 16:
+        return blend<65535>(a, b);
+      case 17:
+        return blend<131071>(a, b);
+      case 18:
+        return blend<262143>(a, b);
+      case 19:
+        return blend<524287>(a, b);
+      case 20:
+        return blend<1048575>(a, b);
+      case 21:
+        return blend<2097151>(a, b);
+      case 22:
+        return blend<4194303>(a, b);
+      case 23:
+        return blend<8388607>(a, b);
+      case 24:
+        return blend<16777215>(a, b);
+      case 25:
+        return blend<33554431>(a, b);
+      case 26:
+        return blend<67108863>(a, b);
+      case 27:
+        return blend<134217727>(a, b);
+      case 28:
+        return blend<268435455>(a, b);
+      case 29:
+        return blend<536870911>(a, b);
+      case 30:
+        return blend<1073741823>(a, b);
+      case 31:
+        return blend<2147483647>(a, b);
+    }
+    return b;
+  }
+  #pragma clang diagnostic push
+  #pragma clang diagnostic ignored "-Wignored-qualifiers"
+  Vectorized map(const __m512 (*const vop)(__m512)) const {
+    __m512 lo, hi;
+    cvt_to_fp32(values, lo, hi);
+    const auto o1 = vop(lo);
+    const auto o2 = vop(hi);
+    return cvt_from_fp32(o1, o2);
+  }
+  Vectorized isnan() const {
+    __m512 lo, hi;
+    cvt_to_fp32(values, lo, hi);
+    __mmask16 lo_mask, hi_mask;
+    __m512 zero = _mm512_set1_ps(0.0);
+    __m512i zeroi = _mm512_castps_si512(zero);
+    lo_mask = _mm512_cmp_ps_mask(lo, zero, _CMP_UNORD_Q);
+    lo = _mm512_castsi512_ps(_mm512_mask_set1_epi32(zeroi, lo_mask, 0xFFFF'FFFF));
+    hi_mask = _mm512_cmp_ps_mask(hi, zero, _CMP_UNORD_Q);
+    hi = _mm512_castsi512_ps(_mm512_mask_set1_epi32(zeroi, hi_mask, 0xFFFF'FFFF));
+    return merge_compare_result(lo, hi);
+  }
+  #pragma clang diagnostic pop
+  Vectorized abs() const {
+    return _mm512_andnot_si512(_mm512_set1_epi16(0x8000), values);
+  }
+  Vectorized angle() const {
+    __m512 lo, hi;
+    cvt_to_fp32(values, lo, hi);
+    auto angle_lambda = [](__m512 values) {
+      const auto zero_vec = _mm512_set1_ps(0.f);
+      const auto nan_vec = _mm512_set1_ps(NAN);
+      const auto not_nan_mask = _mm512_cmp_ps_mask(values, values, _CMP_EQ_OQ);
+      const auto non_nan_mask_vec = _mm512_mask_set1_epi32(_mm512_castps_si512(zero_vec),
+                                                           not_nan_mask, 0xFFFFFFFF);
+      const auto nan_mask = _mm512_cmp_ps_mask(_mm512_castsi512_ps(non_nan_mask_vec),
+                                               zero_vec, _CMP_EQ_OQ);
+      const auto pi = _mm512_set1_ps(c10::pi<float>);
+
+      const auto neg_mask = _mm512_cmp_ps_mask(values, zero_vec, _CMP_LT_OQ);
+      auto angle = _mm512_mask_blend_ps(neg_mask, zero_vec, pi);
+      angle = _mm512_mask_blend_ps(nan_mask, angle, nan_vec);
+      return angle;
+    };
+    auto o1 = angle_lambda(lo);
+    auto o2 = angle_lambda(hi);
+    return cvt_from_fp32(o1, o2);
+  }
+  Vectorized real() const {
+    return *this;
+  }
+  Vectorized imag() const {
+    return _mm512_set1_epi16(0);
+  }
+  Vectorized conj() const {
+    return *this;
+  }
+  Vectorized acos() const {
+    return map(Sleef_acosf16_u10);
+  }
+  Vectorized acosh() const {
+    return map(Sleef_acoshf16_u10);
+  }
+  Vectorized asin() const {
+    return map(Sleef_asinf16_u10);
+  }
+  Vectorized atan() const {
+    return map(Sleef_atanf16_u10);
+  }
+  Vectorized atanh() const {
+    return map(Sleef_atanhf16_u10);
+  }
+  Vectorized atan2(const Vectorized &b) const {
+    __m512 lo, hi;
+    __m512 b1, b2;
+    cvt_to_fp32(values, lo, hi);
+    cvt_to_fp32(b.values, b1, b2);
+    auto o1 = Sleef_atan2f16_u10(lo, b1);
+    auto o2 = Sleef_atan2f16_u10(hi, b2);
+    return cvt_from_fp32(o1, o2);
+  }
+  Vectorized copysign(const Vectorized &sign) const {
+    // copy sign bit (0x8000) from sign and remaining bits from values
+    __m512i mask_value = _mm512_set1_epi32(~0x80008000);
+    __m512i mask_signbit = _mm512_set1_epi32(0x80008000);
+    return Vectorized(
+      _mm512_or_si512(
+        _mm512_and_si512(values, mask_value),
+        _mm512_and_si512(sign, mask_signbit)));
+  }
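+  // 0x80008000 selects the sign bit of both 16-bit halves of each 32-bit
+  // chunk, so the OR of (values & ~signbit) with (sign & signbit) is the
+  // standard scalar copysign bit trick applied to every 16-bit lane at once.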
+  Vectorized erf() const {
+    return map(Sleef_erff16_u10);
+  }
+  Vectorized erfc() const {
+    return map(Sleef_erfcf16_u15);
+  }
+  Vectorized erfinv() const {
+    __m512 lo, hi;
+    cvt_to_fp32(values, lo, hi);
+    __at_align__ float tmp1[size() / 2], tmp2[size() / 2];
+    _mm512_storeu_ps(reinterpret_cast(tmp1), lo);
+    _mm512_storeu_ps(reinterpret_cast(tmp2), hi);
+    for (int64_t i = 0; i < size() / 2; i++) {
+      tmp1[i] = calc_erfinv(tmp1[i]);
+      tmp2[i] = calc_erfinv(tmp2[i]);
+    }
+    auto o1 = _mm512_loadu_ps(tmp1);
+    auto o2 = _mm512_loadu_ps(tmp2);
+    return cvt_from_fp32(o1, o2);
+  }
+  Vectorized exp() const {
+    return map(Sleef_expf16_u10);
+  }
+  Vectorized exp2() const {
+    return map(Sleef_exp2f16_u10);
+  }
+  Vectorized expm1() const {
+    return map(Sleef_expm1f16_u10);
+  }
+  Vectorized exp_u20() const {
+    return exp();
+  }
+  Vectorized fmod(const Vectorized & q) const {
+    __m512 x_lo, x_hi;
+    cvt_to_fp32(values, x_lo, x_hi);
+    __m512 q_lo, q_hi;
+    cvtbf16_fp32(q.values, q_lo, q_hi);
+    auto o1 = Sleef_fmodf16(x_lo, q_lo);
+    auto o2 = Sleef_fmodf16(x_hi, q_hi);
+    return cvt_from_fp32(o1, o2);
+  }
+  Vectorized hypot(const Vectorized &b) const {
+    __m512 lo, hi;
+    __m512 b1, b2;
+    cvt_to_fp32(values, lo, hi);
+    cvt_to_fp32(b.values, b1, b2);
+    auto o1 = Sleef_hypotf16_u05(lo, b1);
+    auto o2 = Sleef_hypotf16_u05(hi, b2);
+    return cvt_from_fp32(o1, o2);
+  }
+  Vectorized i0() const {
+    __m512 lo, hi;
+    cvt_to_fp32(values, lo, hi);
+    __at_align__ float tmp1[size() / 2], tmp2[size() / 2];
+    _mm512_storeu_ps(reinterpret_cast(tmp1), lo);
+    _mm512_storeu_ps(reinterpret_cast(tmp2), hi);
+    for (int64_t i = 0; i < size() / 2; i++) {
+      tmp1[i] = calc_i0(tmp1[i]);
+      tmp2[i] = calc_i0(tmp2[i]);
+    }
+    auto o1 = _mm512_loadu_ps(tmp1);
+    auto o2 = _mm512_loadu_ps(tmp2);
+    return cvt_from_fp32(o1, o2);
+  }
+  Vectorized i0e() const {
+    __m512 lo, hi;
+    cvt_to_fp32(values, lo, hi);
+    constexpr auto sz = size();
+    __at_align__ float tmp1[sz / 2], tmp2[sz / 2];
+    _mm512_storeu_ps(reinterpret_cast(tmp1), lo);
+    _mm512_storeu_ps(reinterpret_cast(tmp2), hi);
+
+    for (auto i = decltype(sz){0}; i < sz / 2; i++) {
+      tmp1[i] = calc_i0e(tmp1[i]);
+      tmp2[i] = calc_i0e(tmp2[i]);
+    }
+    const auto o1 = _mm512_loadu_ps(tmp1);
+    const auto o2 = _mm512_loadu_ps(tmp2);
+    return cvt_from_fp32(o1, o2);
+  }
+  Vectorized digamma() const {
+    __m512 lo, hi;
+    cvt_to_fp32(values, lo, hi);
+    constexpr auto sz = size();
+    __at_align__ float tmp1[sz / 2], tmp2[sz / 2];
+    _mm512_storeu_ps(reinterpret_cast(tmp1), lo);
+    _mm512_storeu_ps(reinterpret_cast(tmp2), hi);
+
+    for (auto i = decltype(sz){0}; i < sz / 2; i++) {
+      tmp1[i] = calc_digamma(tmp1[i]);
+      tmp2[i] = calc_digamma(tmp2[i]);
+    }
+    const auto o1 = _mm512_loadu_ps(tmp1);
+    const auto o2 = _mm512_loadu_ps(tmp2);
+    return cvt_from_fp32(o1, o2);
+  }
+  Vectorized igamma(const Vectorized &x) const {
+    __m512 lo, hi;
+    __m512 xlo, xhi;
+    cvt_to_fp32(values, lo, hi);
+    cvt_to_fp32(x.values, xlo, xhi);
+    __at_align__ float tmp1[size() / 2], tmp2[size() / 2];
+    _mm512_storeu_ps(reinterpret_cast(tmp1), lo);
+    _mm512_storeu_ps(reinterpret_cast(tmp2), hi);
+    __at_align__ float tmpx1[size() / 2], tmpx2[size() / 2];
+    _mm512_storeu_ps(reinterpret_cast(tmpx1), xlo);
+    _mm512_storeu_ps(reinterpret_cast(tmpx2), xhi);
+    for (int64_t i = 0; i < size() / 2; ++i) {
+      tmp1[i] = calc_igamma(tmp1[i], tmpx1[i]);
+      tmp2[i] = calc_igamma(tmp2[i], tmpx2[i]);
+    }
+    auto o1 = _mm512_loadu_ps(tmp1);
+    auto o2 = _mm512_loadu_ps(tmp2);
+    return cvt_from_fp32(o1, o2);
+  }
+
+  Vectorized igammac(const Vectorized &x) const {
+    __m512 lo, hi;
+    __m512 xlo, xhi;
+    cvt_to_fp32(values, lo, hi);
+    cvt_to_fp32(x.values, xlo, xhi);
+    __at_align__ float tmp1[size() / 2], tmp2[size() / 2];
+    _mm512_storeu_ps(reinterpret_cast(tmp1), lo);
+    _mm512_storeu_ps(reinterpret_cast(tmp2), hi);
+    __at_align__ float tmpx1[size() / 2], tmpx2[size() / 2];
+    _mm512_storeu_ps(reinterpret_cast(tmpx1), xlo);
+    _mm512_storeu_ps(reinterpret_cast(tmpx2), xhi);
+    for (int64_t i = 0; i < size() / 2; ++i) {
+      tmp1[i] = calc_igammac(tmp1[i], tmpx1[i]);
+      tmp2[i] = calc_igammac(tmp2[i], tmpx2[i]);
+    }
+    auto o1 = _mm512_loadu_ps(tmp1);
+    auto o2 = _mm512_loadu_ps(tmp2);
+    return cvt_from_fp32(o1, o2);
+  }
+  Vectorized<T> log() const {
+    return map(Sleef_logf16_u10);
+  }
+  Vectorized<T> log2() const {
+    return map(Sleef_log2f16_u10);
+  }
+  Vectorized<T> log10() const {
+    return map(Sleef_log10f16_u10);
+  }
+  Vectorized<T> log1p() const {
+    return map(Sleef_log1pf16_u10);
+  }
+  Vectorized<T> sin() const {
+    return map(Sleef_sinf16_u10);
+  }
+  Vectorized<T> sinh() const {
+    return map(Sleef_sinhf16_u10);
+  }
+  Vectorized<T> cos() const {
+    return map(Sleef_cosf16_u10);
+  }
+  Vectorized<T> cosh() const {
+    return map(Sleef_coshf16_u10);
+  }
+  Vectorized<T> ceil() const {
+    __m512 lo, hi;
+    cvt_to_fp32<T>(values, lo, hi);
+    auto o1 = _mm512_ceil_ps(lo);
+    auto o2 = _mm512_ceil_ps(hi);
+    return cvt_from_fp32<T>(o1, o2);
+  }
+  Vectorized<T> floor() const {
+    __m512 lo, hi;
+    cvt_to_fp32<T>(values, lo, hi);
+    auto o1 = _mm512_floor_ps(lo);
+    auto o2 = _mm512_floor_ps(hi);
+    return cvt_from_fp32<T>(o1, o2);
+  }
+  Vectorized<T> neg() const {
+    return _mm512_xor_si512(values, _mm512_set1_epi16(0x8000));
+  }
+  Vectorized<T> round() const {
+    __m512 lo, hi;
+    cvt_to_fp32<T>(values, lo, hi);
+    auto o1 = _mm512_roundscale_ps(lo, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+    auto o2 = _mm512_roundscale_ps(hi, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+    return cvt_from_fp32<T>(o1, o2);
+  }
+  Vectorized<T> tan() const {
+    return map(Sleef_tanf16_u10);
+  }
+  Vectorized<T> tanh() const {
+    return map(Sleef_tanhf16_u10);
+  }
+  Vectorized<T> trunc() const {
+    __m512 lo, hi;
+    cvt_to_fp32<T>(values, lo, hi);
+    auto o1 = _mm512_roundscale_ps(lo, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
+    auto o2 = _mm512_roundscale_ps(hi, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
+    return cvt_from_fp32<T>(o1, o2);
+  }
+  Vectorized<T> lgamma() const {
+    return map(Sleef_lgammaf16_u10);
+  }
+  Vectorized<T> sqrt() const {
+    __m512 lo, hi;
+    cvt_to_fp32<T>(values, lo, hi);
+    auto o1 = _mm512_sqrt_ps(lo);
+    auto o2 = _mm512_sqrt_ps(hi);
+    return cvt_from_fp32<T>(o1, o2);
+  }
+  Vectorized<T> reciprocal() const {
+    __m512 lo, hi;
+    cvt_to_fp32<T>(values, lo, hi);
+    auto ones = _mm512_set1_ps(1);
+    auto o1 = _mm512_div_ps(ones, lo);
+    auto o2 = _mm512_div_ps(ones, hi);
+    return cvt_from_fp32<T>(o1, o2);
+  }
+  Vectorized<T> rsqrt() const {
+    __m512 lo, hi;
+    cvt_to_fp32<T>(values, lo, hi);
+    auto ones = _mm512_set1_ps(1);
+    auto o1 = _mm512_div_ps(ones, _mm512_sqrt_ps(lo));
+    auto o2 = _mm512_div_ps(ones, _mm512_sqrt_ps(hi));
+    return cvt_from_fp32<T>(o1, o2);
+  }
+  Vectorized<T> pow(const Vectorized<T> &b) const {
+    __m512 lo, hi;
+    __m512 b1, b2;
+    cvt_to_fp32<T>(values, lo, hi);
+    cvt_to_fp32<T>(b.values, b1, b2);
+    auto o1 = Sleef_powf16_u10(lo, b1);
+    auto o2 = Sleef_powf16_u10(hi, b2);
+    return cvt_from_fp32<T>(o1, o2);
+  }
+private:
+  template <typename Op>
+  Vectorized<T> inline binary_compare(const Vectorized<T>& b, Op op) const {
+    __m512 a_lo, a_hi;
+    __m512 b_lo, b_hi;
+    cvt_to_fp32<T>(values, a_lo, a_hi);
+    cvt_to_fp32<T>(b.values, b_lo, b_hi);
+    auto o1 = op(a_lo, b_lo);
+    auto o2 = op(a_hi, b_hi);
+    return cvt_from_fp32<T, /*is_compare_op*/true>(o1, o2);
+  }
+
+public:
+  Vectorized<T> inline operator>(const Vectorized<T>& other) const {
+    return binary_compare(other, [](__m512 x, __m512 y) {
+      auto zero_vec = _mm512_set1_epi32(0);
+      auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_GT_OQ);
+      return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF));
+    });
+  }
+  Vectorized<T> inline operator<(const Vectorized<T>& other) const {
+    return binary_compare(other, [](__m512 x, __m512 y) {
+      auto zero_vec = _mm512_set1_epi32(0);
+      auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_LT_OQ);
+      return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF));
+    });
+  }
+  Vectorized<T> inline operator>=(const Vectorized<T>& other) const {
+    return binary_compare(other, [](__m512 x, __m512 y) {
+      auto zero_vec = _mm512_set1_epi32(0);
+      auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ);
+      return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF));
+    });
+  }
+  Vectorized<T> inline operator<=(const Vectorized<T>& other) const {
+    return binary_compare(other, [](__m512 x, __m512 y) {
+      auto zero_vec = _mm512_set1_epi32(0);
+      auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_LE_OQ);
+      return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF));
+    });
+  }
+  Vectorized<T> inline operator==(const Vectorized<T>& other) const {
+    return binary_compare(other, [](__m512 x, __m512 y) {
+      auto zero_vec = _mm512_set1_epi32(0);
+      auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_EQ_OQ);
+      return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF));
+    });
+  }
+  Vectorized<T> inline operator!=(const Vectorized<T>& other) const {
+    return binary_compare(other, [](__m512 x, __m512 y) {
+      auto zero_vec = _mm512_set1_epi32(0);
+      auto cmp = _mm512_cmp_ps_mask(x, y, _CMP_NEQ_UQ);
+      return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, cmp, 0xFFFFFFFF));
+    });
+  }
+};
+
+template <typename T, typename Op>
+static inline Vectorized<T> binary_op_as_fp32(const Vectorized<T>& a, const Vectorized<T>& b, Op op) {
+  __m512 a_lo, a_hi;
+  __m512 b_lo, b_hi;
+  cvt_to_fp32<T>(__m512i(a), a_lo, a_hi);
+  cvt_to_fp32<T>(__m512i(b), b_lo, b_hi);
+  auto o1 = op(a_lo, b_lo);
+  auto o2 = op(a_hi, b_hi);
+  return cvt_from_fp32<T>(o1, o2);
+}
+
+template <>
+class Vectorized<BFloat16>: public Vectorized16<BFloat16> {
+public:
+  using Vectorized16::Vectorized16;
+
+  Vectorized<BFloat16> frac() const;
+
+  Vectorized<BFloat16> eq(const Vectorized<BFloat16>& other) const;
+  Vectorized<BFloat16> ne(const Vectorized<BFloat16>& other) const;
+  Vectorized<BFloat16> gt(const Vectorized<BFloat16>& other) const;
+  Vectorized<BFloat16> ge(const Vectorized<BFloat16>& other) const;
+  Vectorized<BFloat16> lt(const Vectorized<BFloat16>& other) const;
+  Vectorized<BFloat16> le(const Vectorized<BFloat16>& other) const;
+};
+
+Vectorized<BFloat16> inline operator+(const Vectorized<BFloat16>& a, const Vectorized<BFloat16>& b) {
+  return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { return _mm512_add_ps(x, y); });
+}
+Vectorized<BFloat16> inline operator-(const Vectorized<BFloat16>& a, const Vectorized<BFloat16>& b) {
+  return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { return _mm512_sub_ps(x, y); });
+}
+Vectorized<BFloat16> inline operator*(const Vectorized<BFloat16>& a, const Vectorized<BFloat16>& b) {
+  return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { return _mm512_mul_ps(x, y); });
+}
+Vectorized<BFloat16> inline operator/(const Vectorized<BFloat16>& a, const Vectorized<BFloat16>& b) {
+  return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { return _mm512_div_ps(x, y); });
+}
+Vectorized<BFloat16> inline operator&(const Vectorized<BFloat16>& a, const Vectorized<BFloat16>& b) {
+  return _mm512_and_si512(a, b);
+}
+Vectorized<BFloat16> inline operator|(const Vectorized<BFloat16>& a, const Vectorized<BFloat16>& b) {
+  return _mm512_or_si512(a, b);
+}
+Vectorized<BFloat16> inline operator^(const Vectorized<BFloat16>& a, const Vectorized<BFloat16>& b) {
+  return _mm512_xor_si512(a, b);
+}
+
+inline Vectorized<BFloat16> Vectorized<BFloat16>::eq(const Vectorized<BFloat16>& other) const {
+  return (*this == other) & Vectorized<BFloat16>(1.0f);
+}
+
+inline Vectorized<BFloat16> Vectorized<BFloat16>::ne(const Vectorized<BFloat16>& other) const {
+  return (*this != other) & Vectorized<BFloat16>(1.0f);
+}
+
+inline Vectorized<BFloat16> Vectorized<BFloat16>::gt(const Vectorized<BFloat16>& other) const {
+  return (*this > other) & Vectorized<BFloat16>(1.0f);
+}
+
+inline Vectorized<BFloat16> Vectorized<BFloat16>::ge(const Vectorized<BFloat16>& other) const {
+  return (*this >= other) & Vectorized<BFloat16>(1.0f);
+}
+
+inline Vectorized<BFloat16> Vectorized<BFloat16>::lt(const Vectorized<BFloat16>& other) const {
+  return (*this < other) & Vectorized<BFloat16>(1.0f);
+}
+
+inline Vectorized<BFloat16> Vectorized<BFloat16>::le(const Vectorized<BFloat16>& other) const {
+  return (*this <= other) & Vectorized<BFloat16>(1.0f);
+}
+
+// frac. Implement this here so we can use subtraction
+inline Vectorized<BFloat16> Vectorized<BFloat16>::frac() const {
+  return *this - this->trunc();
+}
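+
+// Illustrative note (added example, not from the upstream source): trunc()
+// rounds toward zero, so frac() keeps the sign of its input, e.g.
+//   Vectorized<BFloat16>(1.75f).frac()  -> 0.75
+//   Vectorized<BFloat16>(-1.75f).frac() -> -0.75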
+
+// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if
+// either input is a NaN.
+template <>
+Vectorized<BFloat16> inline maximum(const Vectorized<BFloat16>& a, const Vectorized<BFloat16>& b) {
+  __m512 a_lo, a_hi;
+  __m512 b_lo, b_hi;
+  cvtbf16_fp32(__m512i(a), a_lo, a_hi);
+  cvtbf16_fp32(__m512i(b), b_lo, b_hi);
+  auto max_lo = _mm512_max_ps(a_lo, b_lo);
+  auto max_hi = _mm512_max_ps(a_hi, b_hi);
+  auto nan_lo_mask = _mm512_cmp_ps_mask(a_lo, b_lo, _CMP_UNORD_Q);
+  auto nan_hi_mask = _mm512_cmp_ps_mask(a_hi, b_hi, _CMP_UNORD_Q);
+  auto nan_lo = _mm512_castsi512_ps(_mm512_set1_epi32(nan_lo_mask));
+  auto nan_hi = _mm512_castsi512_ps(_mm512_set1_epi32(nan_hi_mask));
+  // Exploit the fact that all-ones is a NaN.
+  auto o1 = _mm512_or_ps(max_lo, nan_lo);
+  auto o2 = _mm512_or_ps(max_hi, nan_hi);
+  return cvtfp32_bf16(o1, o2);
+}
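+
+// Rough scalar equivalent of the NaN handling above (added sketch, assuming the
+// fp32 semantics carry over to the converted bf16 lanes):
+//   float vmax = _mm512_max_ps-style max of a and b;
+//   if (std::isnan(a) || std::isnan(b)) vmax = quiet NaN;   // _CMP_UNORD_Q mask
+// OR-ing the all-ones lane pattern into the max produces that NaN payload.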
+
+// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if
+// either input is a NaN.
+template <>
+Vectorized<BFloat16> inline minimum(const Vectorized<BFloat16>& a, const Vectorized<BFloat16>& b) {
+  __m512 a_lo, a_hi;
+  __m512 b_lo, b_hi;
+  __m512i zero_vec = _mm512_set1_epi32(0);
+  cvtbf16_fp32(__m512i(a), a_lo, a_hi);
+  cvtbf16_fp32(__m512i(b), b_lo, b_hi);
+  auto min_lo = _mm512_min_ps(a_lo, b_lo);
+  auto min_hi = _mm512_min_ps(a_hi, b_hi);
+  auto nan_lo_mask = _mm512_cmp_ps_mask(a_lo, b_lo, _CMP_UNORD_Q);
+  auto nan_hi_mask = _mm512_cmp_ps_mask(a_hi, b_hi, _CMP_UNORD_Q);
+  auto nan_lo = _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, nan_lo_mask,
+                                                           0xFFFFFFFF));
+  auto nan_hi = _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, nan_hi_mask,
+                                                           0xFFFFFFFF));
+  // Exploit the fact that all-ones is a NaN.
+  auto o1 = _mm512_or_ps(min_lo, nan_lo);
+  auto o2 = _mm512_or_ps(min_hi, nan_hi);
+  return cvtfp32_bf16(o1, o2);
+}
+
+template <>
+Vectorized<BFloat16> inline clamp(const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& min, const Vectorized<BFloat16>& max) {
+  __m512 a_lo, a_hi;
+  __m512 min_lo, min_hi;
+  __m512 max_lo, max_hi;
+  cvtbf16_fp32(__m512i(a), a_lo, a_hi);
+  cvtbf16_fp32(__m512i(min), min_lo, min_hi);
+  cvtbf16_fp32(__m512i(max), max_lo, max_hi);
+  auto o1 = _mm512_min_ps(max_lo, _mm512_max_ps(min_lo, a_lo));
+  auto o2 = _mm512_min_ps(max_hi, _mm512_max_ps(min_hi, a_hi));
+  return cvtfp32_bf16(o1, o2);
+}
+
+template <>
+Vectorized<BFloat16> inline clamp_max(const Vectorized<BFloat16>& a, const Vectorized<BFloat16>& max) {
+  __m512 a_lo, a_hi;
+  __m512 max_lo, max_hi;
+  cvtbf16_fp32(__m512i(a), a_lo, a_hi);
+  cvtbf16_fp32(__m512i(max), max_lo, max_hi);
+  auto o1 = _mm512_min_ps(max_lo, a_lo);
+  auto o2 = _mm512_min_ps(max_hi, a_hi);
+  return cvtfp32_bf16(o1, o2);
+}
+
+template <>
+Vectorized<BFloat16> inline clamp_min(const Vectorized<BFloat16>& a, const Vectorized<BFloat16>& min) {
+  __m512 a_lo, a_hi;
+  __m512 min_lo, min_hi;
+  cvtbf16_fp32(__m512i(a), a_lo, a_hi);
+  cvtbf16_fp32(__m512i(min), min_lo, min_hi);
+  auto o1 = _mm512_max_ps(min_lo, a_lo);
+  auto o2 = _mm512_max_ps(min_hi, a_hi);
+  return cvtfp32_bf16(o1, o2);
+}
+
+template <>
+inline void convert(const BFloat16* src, BFloat16* dst, int64_t n) {
+  int64_t i;
+#pragma unroll
+  for (i = 0; i <= (n - Vectorized<BFloat16>::size()); i += Vectorized<BFloat16>::size()) {
+    auto vsrc = _mm512_loadu_si512(reinterpret_cast<__m512i*>((void*)(src + i)));
+    _mm512_storeu_si512(reinterpret_cast<__m512i*>((void*)(dst + i)), vsrc);
+  }
+#pragma unroll
+  for (; i < n; i++) {
+    dst[i] = src[i];
+  }
+}
+
+template <>
+inline void convert(const float* src, BFloat16* dst, int64_t n) {
+  int64_t i;
+  for (i = 0; i + Vectorized<BFloat16>::size() <= n; i += Vectorized<BFloat16>::size()) {
+    __m512 a = _mm512_loadu_ps(&src[i]);
+    __m512 b = _mm512_loadu_ps(&src[i + 16]);
+
+    __m512i bf = cvtfp32_bf16(a, b);
+    _mm512_storeu_si512(reinterpret_cast<__m512i*>(&dst[i]), bf);
+  }
+  for (; i < n; i++) {
+    dst[i] = c10::convert<BFloat16>(src[i]);
+  }
+}
+
+template <>
+inline void convert(const double* src, BFloat16* dst, int64_t n) {
+  auto load_float = [](const double *src) -> __m512 {
+    // Load one float vector from an array of doubles
+    __m256 a = _mm512_cvtpd_ps(_mm512_loadu_pd(src));
+    __m256 b = _mm512_cvtpd_ps(_mm512_loadu_pd(src + 8));
+    return _mm512_insertf32x8(_mm512_castps256_ps512(a), b, 1);
+  };
+
+  int64_t i;
+  for (i = 0; i + Vectorized<BFloat16>::size() <= n; i += Vectorized<BFloat16>::size()) {
+    __m512 a = load_float(&src[i]);
+    __m512 b = load_float(&src[i + 16]);
+
+    __m512i bf = cvtfp32_bf16(a, b);
+    _mm512_storeu_si512(reinterpret_cast<__m512i*>(&dst[i]), bf);
+  }
+  for (; i < n; i++) {
+    dst[i] = c10::convert<BFloat16>(src[i]);
+  }
+}
+
+template <>
+Vectorized<BFloat16> inline fmadd(const Vectorized<BFloat16>& a,
+    const Vectorized<BFloat16>& b, const Vectorized<BFloat16>& c) {
+  __m512 a_lo, a_hi;
+  __m512 b_lo, b_hi;
+  __m512 c_lo, c_hi;
+  cvtbf16_fp32(__m512i(a), a_lo, a_hi);
+  cvtbf16_fp32(__m512i(b), b_lo, b_hi);
+  cvtbf16_fp32(__m512i(c), c_lo, c_hi);
+  auto o1 = _mm512_fmadd_ps(a_lo, b_lo, c_lo);
+  auto o2 = _mm512_fmadd_ps(a_hi, b_hi, c_hi);
+  return cvtfp32_bf16(o1, o2);
+}
+
+static inline void _transpose_mxn_half_16_16(__m256i t[], __m512i u[]) {
+  __m512i r[8];
+  // a0a1 a2a3 a4a5 a6a7 a8a9 a10a11 a12a13 a14a15   e0e1 e2e3 e4e5 e6e7 e8e9 e10e11 e12e13 e14e15
+  // b0-b15  f0-f15
+  // c0-c15  g0-g15
+  // d0-d15  h0-h15
+  // i0-i15  m0-m15
+  // j0-j15  n0-n15
+  // k0-k15  o0-o15
+  // l0-l15  p0-p15
+#pragma unroll(4)
+  for (int i = 0; i < 4; i++) {
+    r[i] = _mm512_inserti64x4(_mm512_castsi256_si512(t[i]), t[i + 4], 0x01);
+    r[i + 4] = _mm512_inserti64x4(_mm512_castsi256_si512(t[i + 8]), t[i + 12], 0x01);
+  }
+
+  // u0: a0a1 b0b1 a2a3 b2b3 a8a9 b8b9 a10a11 b10b11   e0e1 f0f1 e2e3 f2f3 e8e9 f8f9 e10e11 f10f11
+  // u1: a4a5 b4b5 a6a7 b6b7 a12a13 b12b13 a14a15 b14b15   e4e5 f4f5 e6e7 f6f7 e12e13 f12f13 e14e15 f14f15
+  // u2: c0c1 d0d1 c2c3 d2d3 c8c9 d8d9 c10c11 d10d11   g0g1 h0h1 g2g3 h2h3 g8g9 h8h9 g10g11 h10h11
+  // u3: c4c5 d4b5 c6c7 d6b7 c12c13 d12d13 c14c15 d14d15   g4g5 h4h5 g6g7 h6h7 g12g13 h12h13 g14g15 h14h15
+  // i j  m n
+  // k l  o p
+#pragma unroll(4)
+  for (int i = 0; i < 8; i += 2) {
+    u[i] = _mm512_unpacklo_epi32(r[i], r[i + 1]);
+    u[i + 1] = _mm512_unpackhi_epi32(r[i], r[i + 1]);
+  }
+
+  // r0: a0a1 b0b1 c0c1 d0d1 a8a9 b8b9 c8c9 d8d9  e0e1 f0f1 g0g1 h0h1 e8e9 f8f9 g8g9 h8h9
+  // r1: a2a3 b2b3 c2c3 d2d3 a10a11 b10b11 c10c11 d10d11  e2e3 f2f3 g2g3 h2h3 e10e11 f10f11 g10g11 h10h11
+  // r2: a4a5 b4b5 c4c5 d4b5 a12a13 b12b13 c12c13 d12d13
+  // r3: a6a7 b6b7 c6c7 d6b7 a14a15 b14b15 c14c15 d14d15
+  // r4: i j k l m n o p
+  r[0] = _mm512_unpacklo_epi64(u[0], u[2]);
+  r[1] = _mm512_unpackhi_epi64(u[0], u[2]);
+  r[2] = _mm512_unpacklo_epi64(u[1], u[3]);
+  r[3] = _mm512_unpackhi_epi64(u[1], u[3]);
+  r[4] = _mm512_unpacklo_epi64(u[4], u[6]);
+  r[5] = _mm512_unpackhi_epi64(u[4], u[6]);
+  r[6] = _mm512_unpacklo_epi64(u[5], u[7]);
+  r[7] = _mm512_unpackhi_epi64(u[5], u[7]);
+
+  __m512i const1 = _mm512_set_epi32(
+      0x00370035,
+      0x00330031,
+      0x00270025,
+      0x00230021,
+      0x00170015,
+      0x00130011,
+      0x00070005,
+      0x00030001,
+      0x00360034,
+      0x00320030,
+      0x00260024,
+      0x00220020,
+      0x00160014,
+      0x00120010,
+      0x00060004,
+      0x00020000);
+  __m512i const2 = _mm512_set_epi32(
+      0x003f003d,
+      0x003b0039,
+      0x002f002d,
+      0x002b0029,
+      0x001f001d,
+      0x001b0019,
+      0x000f000d,
+      0x000b0009,
+      0x003e003c,
+      0x003a0038,
+      0x002e002c,
+      0x002a0028,
+      0x001e001c,
+      0x001a0018,
+      0x000e000c,
+      0x000a0008);
+  // merge values from two regs
+  // 0-- 1--
+  // 8-- 9--
+  // 2-- 3--
+  // 10-- 11--
+  // 4-- 5--
+  // 12-- 13--
+  // 6-- 7--
+  // 14-- 15--
+#pragma unroll(4)
+  for (int i = 0; i < 4; i++) {
+    u[i] = _mm512_permutex2var_epi16(r[i], const1, r[i + 4]);
+    u[i + 4] = _mm512_permutex2var_epi16(r[i], const2, r[i + 4]);
+  }
+}
+
+// TODO(Leslie): Add the AVX2 Version of transpose_mxn for BFloat16 and Float16
+// Code referred to FBGEMM:
+// https://github.com/pytorch/FBGEMM/blob/39a423e4ad1a04b77fea81c7d09c3e6f8984fae9/src/UtilsAvx512.cc#L1483-L1607
+template<>
+inline void transpose_mxn<BFloat16, 16, 16>(
+    const BFloat16* src,
+    int64_t ld_src,
+    BFloat16* dst,
+    int64_t ld_dst) {
+  __m256i t[16];
+  // load from src to registers
+  // a: a0  a1  a2  a3  a4  a5  a6  a7  a8  a9  a10 a11 a12 a13 a14 a15
+  // b: b0  b1  b2  b3  b4  b5  b6  b7  b8  b9  b10 b11 b12 b13 b14 b15
+  // c: c0  c1  c2  c3  c4  c5  c6  c7  c8  c9  c10 c11 c12 c13 c14 c15
+  // d: d0  d1  d2  d3  d4  d5  d6  d7  d8  d9  d10 d11 d12 d13 d14 d15
+  // e: e0  e1  e2  e3  e4  e5  e6  e7  e8  e9  e10 e11 e12 e13 e14 e15
+  // f: f0  f1  f2  f3  f4  f5  f6  f7  f8  f9  f10 f11 f12 f13 f14 f15
+  // g: g0  g1  g2  g3  g4  g5  g6  g7  g8  g9  g10 g11 g12 g13 g14 g15
+  // h: h0  h1  h2  h3  h4  h5  h6  h7  h8  h9  h10 h11 h12 h13 h14 h15
+  // i: i0  i1  i2  i3  i4  i5  i6  i7  i8  i9  i10 i11 i12 i13 i14 i15
+  // j: j0  j1  j2  j3  j4  j5  j6  j7  j8  j9  j10 j11 j12 j13 j14 j15
+  // k: k0  k1  k2  k3  k4  k5  k6  k7  k8  k9  k10 k11 k12 k13 k14 k15
+  // l: l0  l1  l2  l3  l4  l5  l6  l7  l8  l9  l10 l11 l12 l13 l14 l15
+  // m: m0  m1  m2  m3  m4  m5  m6  m7  m8  m9  m10 m11 m12 m13 m14 m15
+  // n: n0  n1  n2  n3  n4  n5  n6  n7  n8  n9  n10 n11 n12 n13 n14 n15
+  // o: o0  o1  o2  o3  o4  o5  o6  o7  o8  o9  o10 o11 o12 o13 o14 o15
+  // p: p0  p1  p2  p3  p4  p5  p6  p7  p8  p9  p10 p11 p12 p13 p14 p15
+#pragma unroll(16)
+  for (int i = 0; i < 16; i++) {
+    t[i] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i * ld_src));
+  }
+
+  __m512i u[8];
+  _transpose_mxn_half_16_16(t, u);
+
+#pragma unroll(8)
+  for (int i = 0; i < 8; i++) {
+    _mm256_storeu_si256(
+      reinterpret_cast<__m256i*>(dst + (i * 2) * ld_dst),
+      _mm512_extracti32x8_epi32(u[i], 0x0));
+    _mm256_storeu_si256(
+        reinterpret_cast<__m256i*>(dst + (i * 2 + 1) * ld_dst),
+        _mm512_extracti32x8_epi32(u[i], 0x01));
+  }
+}
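+
+// Usage sketch (added illustration; the buffer names are hypothetical): transpose
+// a row-major 16x16 bf16 tile into another row-major buffer:
+//   __at_align__ BFloat16 src_tile[16 * 16], dst_tile[16 * 16];
+//   transpose_mxn<BFloat16, 16, 16>(src_tile, /*ld_src=*/16, dst_tile, /*ld_dst=*/16);
+//   // afterwards dst_tile[j * 16 + i] holds src_tile[i * 16 + j]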
+
+// Code referred to FBGEMM:
+// https://github.com/pytorch/FBGEMM/blob/39a423e4ad1a04b77fea81c7d09c3e6f8984fae9/src/UtilsAvx512.cc#L1483-L1607
+template<>
+inline void transpose_mxn<Half, 16, 16>(
+    const Half* src,
+    int64_t ld_src,
+    Half* dst,
+    int64_t ld_dst) {
+  __m256i t[16];
+  // load from src to registers
+  // Same matrix indices as above transpose_mxn
+#pragma unroll(16)
+  for (int i = 0; i < 16; i++) {
+    t[i] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i * ld_src));
+  }
+
+  __m512i u[8];
+  _transpose_mxn_half_16_16(t, u);
+
+#pragma unroll(8)
+  for (int i = 0; i < 8; i++) {
+    _mm256_storeu_si256(
+      reinterpret_cast<__m256i*>(dst + (i * 2) * ld_dst),
+      _mm512_extracti32x8_epi32(u[i], 0x0));
+    _mm256_storeu_si256(
+        reinterpret_cast<__m256i*>(dst + (i * 2 + 1) * ld_dst),
+        _mm512_extracti32x8_epi32(u[i], 0x01));
+  }
+}
+
+static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) {
+  // t[0]: 0 32 1 33 2 34 3 35 8 40 9 41 10 42 11 43 16 ... 59
+  // t[1]: 4 36 5 37 6 38 7 39 12 44 13 45 14 46 15 47 20 ... 63
+  // t[2]: 64 96 65 97 66 98 67 99 72 104 73 105 74 106 75 ... 123
+  // t[3]: 68 100 69 101 70 102 71 103 76 108 77 109 78 110 79 111 84 ... 127
+  // t[4]: 128 160 129 161 130 162 131 163 136 168 137 169 138 170 139 171 144 ... 187
+  // t[5]: 132 164 133 165 134 166 135 167 140 172 141 173 142 174 143 175 148 ... 191
+  // t[6]: 192 224 193 225 194 226 195 227 200 232 201 233 202 234 203 235 208 ... 251
+  // t[7]: 196 228 197 229 198 230 199 231 204 236 205 237 206 238 207 239 212 ... 255
+  // t[8]: 256 288 257 289 258 290 259 291 264 296 265 297 266 298 267 299 272 ... 315
+  // t[9]: 260 292 261 293 262 294 263 295 268 300 269 301 270 302 271 303 276 ... 319
+  // t[10]: 320 352 321 353 322 354 323 355 328 360 329 361 330 362 331 363 336 ... 379
+  // t[11]: 324 356 325 357 326 358 327 359 332 364 333 365 334 366 335 367 340 ... 383
+  // t[12]: 384 416 385 417 386 418 387 419 392 424 393 425 394 426 395 427 400 ... 443
+  // t[13]: 388 420 389 421 390 422 391 423 396 428 397 429 398 430 399 431 404 ... 447
+  // t[14]: 448 480 449 481 450 482 451 483 456 488 457 489 458 490 459 491 464 ... 507
+  // t[15]: 452 484 453 485 454 486 455 487 460 492 461 493 462 494 463 495 468 ... 511
+  // t[16]: 512 544 513 545 514 546 515 547 520 552 521 553 522 554 523 555 528 ... 571
+  // ...
+  // t[31]: 964 996 965 997 966 998 967 999 972 1004 973 1005 974 1006 975 1007 980 ... 1023
+#pragma unroll(16)
+  for (int i = 0; i < 16; ++i) {
+    d[i * 2] = _mm512_unpacklo_epi16(r[i * 2], r[i * 2 + 1]);
+    d[i * 2 + 1] = _mm512_unpackhi_epi16(r[i * 2], r[i * 2 + 1]);
+  }
+
+  // t[0]: 0 32 64 96 1 33 65 97 8 40 72 104 9 41 73 105 16 ... 121
+  // t[1]: 2 34 66 98 3 35 67 99 10 42 74 106 11 43 75 107 18 ... 123
+  // t[2]: 4 36 68 100 5 37 69 101 12 44 76 108 13 45 77 109 20 ... 125
+  // t[3]: 6 38 70 102 7 39 71 103 14 46 78 110 15 47 79 111 22 ... 127
+  // t[4]: 128 160 192 224 129 161 193 225 136 168 200 232 137 169 201 233 144 ... 249
+  // t[5]: 130 162 194 226 131 163 195 227 138 170 202 234 139 171 203 235 146 ... 251
+  // t[6]: 132 164 196 228 133 165 197 229 140 172 204 236 141 173 205 237 148 ... 253
+  // t[7]: 134 166 198 230 135 167 199 231 142 174 206 238 143 175 207 239 150 ... 255
+  // t[8]: 256 288 320 352 257 289 321 353 264 296 328 360 265 297 329 361 272 ... 377
+  // t[9]: 258 290 322 354 259 291 323 355 266 298 330 362 267 299 331 363 274 ... 379
+  // t[10]: 260 292 324 356 261 293 325 357 268 300 332 364 269 301 333 365 276 ... 381
+  // t[11]: 262 294 326 358 263 295 327 359 270 302 334 366 271 303 335 367 278 ... 383
+  // t[12]: 384 416 448 480 385 417 449 481 392 424 456 488 393 425 457 489 400 ... 505
+  // t[13]: 386 418 450 482 387 419 451 483 394 426 458 490 395 427 459 491 402 ... 507
+  // t[14]: 388 420 452 484 389 421 453 485 396 428 460 492 397 429 461 493 404 ... 509
+  // t[15]: 390 422 454 486 391 423 455 487 398 430 462 494 399 431 463 495 406 ... 511
+  // t[16]: 512 544 576 608 513 545 577 609 520 552 584 616 521 553 585 617 528 ... 633
+  // ...
+  // t[31]: 902 934 966 998 903 935 967 999 910 942 974 1006 911 943 975 1007 918 ... 1023
+#pragma unroll(8)
+  for (int i = 0; i < 8; ++i) {
+    r[i * 4] = _mm512_unpacklo_epi32(d[i * 4], d[i * 4 + 2]);
+    r[i * 4 + 1] = _mm512_unpackhi_epi32(d[i * 4], d[i * 4 + 2]);
+    r[i * 4 + 2] = _mm512_unpacklo_epi32(d[i * 4 + 1], d[i * 4 + 3]);
+    r[i * 4 + 3] = _mm512_unpackhi_epi32(d[i * 4 + 1], d[i * 4 + 3]);
+  }
+
+  // t[0]: 0 32 64 96 128 160 192 224 8 40 72 104 136 168 200 232 16 ... 248
+  // t[1]: 1 33 65 97 129 161 193 225 9 41 73 105 137 169 201 233 17 ... 249
+  // t[2]: 2 34 66 98 130 162 194 226 10 42 74 106 138 170 202 234 18 ... 250
+  // t[3]: 3 35 67 99 131 163 195 227 11 43 75 107 139 171 203 235 19 ... 251
+  // t[4]: 4 36 68 100 132 164 196 228 12 44 76 108 140 172 204 236 20 ... 252
+  // t[5]: 5 37 69 101 133 165 197 229 13 45 77 109 141 173 205 237 21 ... 253
+  // t[6]: 6 38 70 102 134 166 198 230 14 46 78 110 142 174 206 238 22 ... 254
+  // t[7]: 7 39 71 103 135 167 199 231 15 47 79 111 143 175 207 239 23 ... 255
+  // t[8]: 256 288 320 352 384 416 448 480 264 296 328 360 392 424 456 488 272 ... 504
+  // t[9]: 257 289 321 353 385 417 449 481 265 297 329 361 393 425 457 489 273 ... 505
+  // t[10]: 258 290 322 354 386 418 450 482 266 298 330 362 394 426 458 490 274 ... 506
+  // t[11]: 259 291 323 355 387 419 451 483 267 299 331 363 395 427 459 491 275 ... 507
+  // t[12]: 260 292 324 356 388 420 452 484 268 300 332 364 396 428 460 492 276 ... 508
+  // t[13]: 261 293 325 357 389 421 453 485 269 301 333 365 397 429 461 493 277 ... 509
+  // t[14]: 262 294 326 358 390 422 454 486 270 302 334 366 398 430 462 494 278 ... 510
+  // t[15]: 263 295 327 359 391 423 455 487 271 303 335 367 399 431 463 495 279 ... 511
+  // t[16]: 512 544 576 608 640 672 704 736 520 552 584 616 648 680 712 744 528 ... 760
+  // ...
+  // t[31]: 775 807 839 871 903 935 967 999 783 815 847 879 911 943 975 1007 791 ... 1023
+#pragma unroll(4)
+  for (int i = 0; i < 4; ++i) {
+    d[i * 8] = _mm512_unpacklo_epi64(r[i * 8], r[i * 8 + 4]);
+    d[i * 8 + 1] = _mm512_unpackhi_epi64(r[i * 8], r[i * 8 + 4]);
+    d[i * 8 + 2] = _mm512_unpacklo_epi64(r[i * 8 + 1], r[i * 8 + 5]);
+    d[i * 8 + 3] = _mm512_unpackhi_epi64(r[i * 8 + 1], r[i * 8 + 5]);
+    d[i * 8 + 4] = _mm512_unpacklo_epi64(r[i * 8 + 2], r[i * 8 + 6]);
+    d[i * 8 + 5] = _mm512_unpackhi_epi64(r[i * 8 + 2], r[i * 8 + 6]);
+    d[i * 8 + 6] = _mm512_unpacklo_epi64(r[i * 8 + 3], r[i * 8 + 7]);
+    d[i * 8 + 7] = _mm512_unpackhi_epi64(r[i * 8 + 3], r[i * 8 + 7]);
+  }
+
+  // t[0]: 0 32 64 96 128 160 192 224 256 288 320 352 384 416 448 480 16 ... 496
+  // t[1]: 1 33 65 97 129 161 193 225 257 289 321 353 385 417 449 481 17 ... 497
+  // t[2]: 2 34 66 98 130 162 194 226 258 290 322 354 386 418 450 482 18 ... 498
+  // t[3]: 3 35 67 99 131 163 195 227 259 291 323 355 387 419 451 483 19 ... 499
+  // t[4]: 4 36 68 100 132 164 196 228 260 292 324 356 388 420 452 484 20 ... 500
+  // t[5]: 5 37 69 101 133 165 197 229 261 293 325 357 389 421 453 485 21 ... 501
+  // t[6]: 6 38 70 102 134 166 198 230 262 294 326 358 390 422 454 486 22 ... 502
+  // t[7]: 7 39 71 103 135 167 199 231 263 295 327 359 391 423 455 487 23 ... 503
+  // t[8]: 8 40 72 104 136 168 200 232 264 296 328 360 392 424 456 488 24 ... 504
+  // t[9]: 9 41 73 105 137 169 201 233 265 297 329 361 393 425 457 489 25 ... 505
+  // t[10]: 10 42 74 106 138 170 202 234 266 298 330 362 394 426 458 490 26 ... 506
+  // t[11]: 11 43 75 107 139 171 203 235 267 299 331 363 395 427 459 491 27 ... 507
+  // t[12]: 12 44 76 108 140 172 204 236 268 300 332 364 396 428 460 492 28 ... 508
+  // t[13]: 13 45 77 109 141 173 205 237 269 301 333 365 397 429 461 493 29 ... 509
+  // t[14]: 14 46 78 110 142 174 206 238 270 302 334 366 398 430 462 494 30 ... 510
+  // t[15]: 15 47 79 111 143 175 207 239 271 303 335 367 399 431 463 495 31 ... 511
+  // t[16]: 512 544 576 608 640 672 704 736 768 800 832 864 896 928 960 992 528 ... 1008
+  // ...
+  // t[31]: 527 559 591 623 655 687 719 751 783 815 847 879 911 943 975 1007 543 ... 1023
+  __m512i const1 = _mm512_set_epi64(
+      0x000000000000000d,
+      0x000000000000000c,
+      0x0000000000000005,
+      0x0000000000000004,
+      0x0000000000000009,
+      0x0000000000000008,
+      0x0000000000000001,
+      0x0000000000000000);
+  __m512i const2 = _mm512_set_epi64(
+      0x000000000000000f,
+      0x000000000000000e,
+      0x0000000000000007,
+      0x0000000000000006,
+      0x000000000000000b,
+      0x000000000000000a,
+      0x0000000000000003,
+      0x0000000000000002);
+#pragma unroll(8)
+  for (int i = 0; i < 8; ++i) {
+    r[i] = _mm512_permutex2var_epi64(d[i], /*idx*/const1, d[i + 8]);
+    r[i + 8] = _mm512_permutex2var_epi64(d[i], /*idx*/const2, d[i + 8]);
+    r[i + 16] = _mm512_permutex2var_epi64(d[i + 16], /*idx*/const1, d[i + 24]);
+    r[i + 24] = _mm512_permutex2var_epi64(d[i + 16], /*idx*/const2, d[i + 24]);
+  }
+
+  // t[0]: 0 32 64 96 128 160 192 224 256 288 320 352 384 416 448 480 512 544 ... 992
+  // t[1]: 1 33 65 97 129 161 193 225 257 289 321 353 385 417 449 481 513 545 ... 993
+  // t[2]: 2 34 66 98 130 162 194 226 258 290 322 354 386 418 450 482 514 546 ... 994
+  // t[3]: 3 35 67 99 131 163 195 227 259 291 323 355 387 419 451 483 515 547 ... 995
+  // t[4]: 4 36 68 100 132 164 196 228 260 292 324 356 388 420 452 484 516 548 ... 996
+  // t[5]: 5 37 69 101 133 165 197 229 261 293 325 357 389 421 453 485 517 549 ... 997
+  // t[6]: 6 38 70 102 134 166 198 230 262 294 326 358 390 422 454 486 518 550 ... 998
+  // t[7]: 7 39 71 103 135 167 199 231 263 295 327 359 391 423 455 487 519 551 ... 999
+  // t[8]: 8 40 72 104 136 168 200 232 264 296 328 360 392 424 456 488 520 552 ... 1000
+  // t[9]: 9 41 73 105 137 169 201 233 265 297 329 361 393 425 457 489 521 553 ... 1001
+  // t[10]: 10 42 74 106 138 170 202 234 266 298 330 362 394 426 458 490 522 554 ... 1002
+  // t[11]: 11 43 75 107 139 171 203 235 267 299 331 363 395 427 459 491 523 555 ... 1003
+  // t[12]: 12 44 76 108 140 172 204 236 268 300 332 364 396 428 460 492 524 556 ... 1004
+  // t[13]: 13 45 77 109 141 173 205 237 269 301 333 365 397 429 461 493 525 557 ... 1005
+  // t[14]: 14 46 78 110 142 174 206 238 270 302 334 366 398 430 462 494 526 558 ... 1006
+  // t[15]: 15 47 79 111 143 175 207 239 271 303 335 367 399 431 463 495 527 559 ... 1007
+  // t[16]: 16 48 80 112 144 176 208 240 272 304 336 368 400 432 464 496 528 560 ... 1008
+  // ...
+  // t[31]: 31 63 95 127 159 191 223 255 287 319 351 383 415 447 479 511 543 575 ... 1023
+  __m512i const3 = _mm512_set_epi64(
+      0x000000000000000b,
+      0x000000000000000a,
+      0x0000000000000009,
+      0x0000000000000008,
+      0x0000000000000003,
+      0x0000000000000002,
+      0x0000000000000001,
+      0x0000000000000000);
+  __m512i const4 = _mm512_set_epi64(
+      0x000000000000000f,
+      0x000000000000000e,
+      0x000000000000000d,
+      0x000000000000000c,
+      0x0000000000000007,
+      0x0000000000000006,
+      0x0000000000000005,
+      0x0000000000000004);
+#pragma unroll(16)
+  for (int i = 0; i < 16; ++i) {
+    d[i] = _mm512_permutex2var_epi64(r[i], /*idx*/const3, r[i + 16]);
+    d[i + 16] = _mm512_permutex2var_epi64(r[i], /*idx*/const4, r[i + 16]);
+  }
+}
+
+// Code referred to FBGEMM:
+// https://github.com/pytorch/FBGEMM/blob/39a423e4ad1a04b77fea81c7d09c3e6f8984fae9/src/UtilsAvx512.cc#LL19C6-L19C6
+template<>
+inline void transpose_mxn<BFloat16, 32, 32>(
+    const BFloat16* src,
+    int64_t ld_src,
+    BFloat16* dst,
+    int64_t ld_dst) {
+  // Load from memory
+  __m512i r[32];
+#pragma unroll(32)
+  for (int i = 0; i < 32; ++i) {
+    r[i] = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(src + i* ld_src));
+  }
+
+  __m512i d[32];
+  _transpose_mxn_half_32_32(r, d);
+
+  // Store to dst
+#pragma unroll(32)
+  for (int i = 0; i < 32; ++i) {
+    _mm512_storeu_si512(dst + i* ld_dst, d[i]);
+  }
+}
+
+template<>
+inline void transpose_mxn<Half, 32, 32>(
+    const Half* src,
+    int64_t ld_src,
+    Half* dst,
+    int64_t ld_dst) {
+  // Load from memory
+  __m512i r[32];
+#pragma unroll(32)
+  for (int i = 0; i < 32; ++i) {
+    r[i] = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(src + i* ld_src));
+  }
+
+  __m512i d[32];
+  _transpose_mxn_half_32_32(r, d);
+
+  // Store to dst
+#pragma unroll(32)
+  for (int i = 0; i < 32; ++i) {
+    _mm512_storeu_si512(dst + i* ld_dst, d[i]);
+  }
+}
+
+template <>
+class Vectorized<Half>: public Vectorized16<Half> {
+public:
+  using Vectorized16::Vectorized16;
+
+  Vectorized<Half> frac() const;
+
+  Vectorized<Half> eq(const Vectorized<Half>& other) const;
+  Vectorized<Half> ne(const Vectorized<Half>& other) const;
+  Vectorized<Half> gt(const Vectorized<Half>& other) const;
+  Vectorized<Half> ge(const Vectorized<Half>& other) const;
+  Vectorized<Half> lt(const Vectorized<Half>& other) const;
+  Vectorized<Half> le(const Vectorized<Half>& other) const;
+};
+
+Vectorized<Half> inline operator+(const Vectorized<Half>& a, const Vectorized<Half>& b) {
+  return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { return _mm512_add_ps(x, y); });
+}
+Vectorized<Half> inline operator-(const Vectorized<Half>& a, const Vectorized<Half>& b) {
+  return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { return _mm512_sub_ps(x, y); });
+}
+Vectorized<Half> inline operator*(const Vectorized<Half>& a, const Vectorized<Half>& b) {
+  return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { return _mm512_mul_ps(x, y); });
+}
+Vectorized<Half> inline operator/(const Vectorized<Half>& a, const Vectorized<Half>& b) {
+  return binary_op_as_fp32(a, b, [](const __m512& x, const __m512& y) { return _mm512_div_ps(x, y); });
+}
+
+Vectorized<Half> inline operator&(const Vectorized<Half>& a, const Vectorized<Half>& b) {
+  return _mm512_and_si512(a, b);
+}
+Vectorized<Half> inline operator|(const Vectorized<Half>& a, const Vectorized<Half>& b) {
+  return _mm512_or_si512(a, b);
+}
+Vectorized<Half> inline operator^(const Vectorized<Half>& a, const Vectorized<Half>& b) {
+  return _mm512_xor_si512(a, b);
+}
+
+inline Vectorized<Half> Vectorized<Half>::eq(const Vectorized<Half>& other) const {
+  return (*this == other) & Vectorized<Half>(1.0f);
+}
+
+inline Vectorized<Half> Vectorized<Half>::ne(const Vectorized<Half>& other) const {
+  return (*this != other) & Vectorized<Half>(1.0f);
+}
+
+inline Vectorized<Half> Vectorized<Half>::gt(const Vectorized<Half>& other) const {
+  return (*this > other) & Vectorized<Half>(1.0f);
+}
+
+inline Vectorized<Half> Vectorized<Half>::ge(const Vectorized<Half>& other) const {
+  return (*this >= other) & Vectorized<Half>(1.0f);
+}
+
+inline Vectorized<Half> Vectorized<Half>::lt(const Vectorized<Half>& other) const {
+  return (*this < other) & Vectorized<Half>(1.0f);
+}
+
+inline Vectorized<Half> Vectorized<Half>::le(const Vectorized<Half>& other) const {
+  return (*this <= other) & Vectorized<Half>(1.0f);
+}
+
+// frac. Implement this here so we can use subtraction
+inline Vectorized<Half> Vectorized<Half>::frac() const {
+  return *this - this->trunc();
+}
+
+// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if
+// either input is a NaN.
+template <>
+Vectorized<Half> inline maximum(const Vectorized<Half>& a, const Vectorized<Half>& b) {
+  __m512 a_lo, a_hi;
+  __m512 b_lo, b_hi;
+  cvtfp16_fp32(__m512i(a), a_lo, a_hi);
+  cvtfp16_fp32(__m512i(b), b_lo, b_hi);
+  auto max_lo = _mm512_max_ps(a_lo, b_lo);
+  auto max_hi = _mm512_max_ps(a_hi, b_hi);
+  auto nan_lo_mask = _mm512_cmp_ps_mask(a_lo, b_lo, _CMP_UNORD_Q);
+  auto nan_hi_mask = _mm512_cmp_ps_mask(a_hi, b_hi, _CMP_UNORD_Q);
+  auto nan_lo = _mm512_castsi512_ps(_mm512_set1_epi32(nan_lo_mask));
+  auto nan_hi = _mm512_castsi512_ps(_mm512_set1_epi32(nan_hi_mask));
+  // Exploit the fact that all-ones is a NaN.
+  auto o1 = _mm512_or_ps(max_lo, nan_lo);
+  auto o2 = _mm512_or_ps(max_hi, nan_hi);
+  return cvtfp32_fp16(o1, o2);
+}
+
+// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if
+// either input is a NaN.
+template <>
+Vectorized<Half> inline minimum(const Vectorized<Half>& a, const Vectorized<Half>& b) {
+  __m512 a_lo, a_hi;
+  __m512 b_lo, b_hi;
+  __m512i zero_vec = _mm512_set1_epi32(0);
+  cvtfp16_fp32(__m512i(a), a_lo, a_hi);
+  cvtfp16_fp32(__m512i(b), b_lo, b_hi);
+  auto min_lo = _mm512_min_ps(a_lo, b_lo);
+  auto min_hi = _mm512_min_ps(a_hi, b_hi);
+  auto nan_lo_mask = _mm512_cmp_ps_mask(a_lo, b_lo, _CMP_UNORD_Q);
+  auto nan_hi_mask = _mm512_cmp_ps_mask(a_hi, b_hi, _CMP_UNORD_Q);
+  auto nan_lo = _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, nan_lo_mask,
+                                                           0xFFFFFFFF));
+  auto nan_hi = _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, nan_hi_mask,
+                                                           0xFFFFFFFF));
+  // Exploit the fact that all-ones is a NaN.
+  auto o1 = _mm512_or_ps(min_lo, nan_lo);
+  auto o2 = _mm512_or_ps(min_hi, nan_hi);
+  return cvtfp32_fp16(o1, o2);
+}
+
+template <>
+Vectorized<Half> inline clamp(const Vectorized<Half>& a,
+    const Vectorized<Half>& min, const Vectorized<Half>& max) {
+  __m512 a_lo, a_hi;
+  __m512 min_lo, min_hi;
+  __m512 max_lo, max_hi;
+  cvtfp16_fp32(__m512i(a), a_lo, a_hi);
+  cvtfp16_fp32(__m512i(min), min_lo, min_hi);
+  cvtfp16_fp32(__m512i(max), max_lo, max_hi);
+  auto o1 = _mm512_min_ps(max_lo, _mm512_max_ps(min_lo, a_lo));
+  auto o2 = _mm512_min_ps(max_hi, _mm512_max_ps(min_hi, a_hi));
+  return cvtfp32_fp16(o1, o2);
+}
+
+template <>
+Vectorized<Half> inline clamp_max(const Vectorized<Half>& a, const Vectorized<Half>& max) {
+  __m512 a_lo, a_hi;
+  __m512 max_lo, max_hi;
+  cvtfp16_fp32(__m512i(a), a_lo, a_hi);
+  cvtfp16_fp32(__m512i(max), max_lo, max_hi);
+  auto o1 = _mm512_min_ps(max_lo, a_lo);
+  auto o2 = _mm512_min_ps(max_hi, a_hi);
+  return cvtfp32_fp16(o1, o2);
+}
+
+template <>
+Vectorized<Half> inline clamp_min(const Vectorized<Half>& a, const Vectorized<Half>& min) {
+  __m512 a_lo, a_hi;
+  __m512 min_lo, min_hi;
+  cvtfp16_fp32(__m512i(a), a_lo, a_hi);
+  cvtfp16_fp32(__m512i(min), min_lo, min_hi);
+  auto o1 = _mm512_max_ps(min_lo, a_lo);
+  auto o2 = _mm512_max_ps(min_hi, a_hi);
+  return cvtfp32_fp16(o1, o2);
+}
+
+template <>
+inline void convert(const Half* src, Half* dst, int64_t n) {
+  int64_t i;
+#pragma unroll
+  for (i = 0; i <= (n - Vectorized<Half>::size()); i += Vectorized<Half>::size()) {
+    auto vsrc = _mm512_loadu_si512(reinterpret_cast<__m512i*>((void*)(src + i)));
+    _mm512_storeu_si512(reinterpret_cast<__m512i*>((void*)(dst + i)), vsrc);
+  }
+#pragma unroll
+  for (; i < n; i++) {
+    dst[i] = src[i];
+  }
+}
+
+template <>
+inline void convert(const float* src, Half* dst, int64_t n) {
+  int64_t i;
+  for (i = 0; i + Vectorized<Half>::size() <= n; i += Vectorized<Half>::size()) {
+    __m512 a = _mm512_loadu_ps(&src[i]);
+    __m512 b = _mm512_loadu_ps(&src[i + 16]);
+
+    __m512i bf = cvtfp32_fp16(a, b);
+    _mm512_storeu_si512(reinterpret_cast<__m512i*>(&dst[i]), bf);
+  }
+  for (; i < n; i++) {
+    dst[i] = c10::convert<Half>(src[i]);
+  }
+}
+
+template <>
+inline void convert(const double* src, Half* dst, int64_t n) {
+  auto load_float = [](const double *src) -> __m512 {
+    // Load one float vector from an array of doubles
+    __m256 a = _mm512_cvtpd_ps(_mm512_loadu_pd(src));
+    __m256 b = _mm512_cvtpd_ps(_mm512_loadu_pd(src + 8));
+    return _mm512_insertf32x8(_mm512_castps256_ps512(a), b, 1);
+  };
+
+  int64_t i;
+  for (i = 0; i + Vectorized<Half>::size() <= n; i += Vectorized<Half>::size()) {
+    __m512 a = load_float(&src[i]);
+    __m512 b = load_float(&src[i + 16]);
+
+    __m512i bf = cvtfp32_fp16(a, b);
+    _mm512_storeu_si512(reinterpret_cast<__m512i*>(&dst[i]), bf);
+  }
+  for (; i < n; i++) {
+    dst[i] = c10::convert<Half>(src[i]);
+  }
+}
+
+template <>
+Vectorized<Half> inline fmadd(const Vectorized<Half>& a,
+    const Vectorized<Half>& b, const Vectorized<Half>& c) {
+  __m512 a_lo, a_hi;
+  __m512 b_lo, b_hi;
+  __m512 c_lo, c_hi;
+  cvtfp16_fp32(__m512i(a), a_lo, a_hi);
+  cvtfp16_fp32(__m512i(b), b_lo, b_hi);
+  cvtfp16_fp32(__m512i(c), c_lo, c_hi);
+  auto o1 = _mm512_fmadd_ps(a_lo, b_lo, c_lo);
+  auto o2 = _mm512_fmadd_ps(a_hi, b_hi, c_hi);
+  return cvtfp32_fp16(o1, o2);
+}
+
+#define CONVERT_VECTORIZED_INIT(type, name) \
+inline std::tuple<Vectorized<float>, Vectorized<float>> convert_##name##_float(const Vectorized<type>& a) { \
+  __m512 o1, o2; \
+  cvt_to_fp32<type>(__m512i(a), o1, o2); \
+  return std::make_tuple(o1, o2); \
+} \
+\
+inline Vectorized<type> convert_float_##name(const Vectorized<float>& a, const Vectorized<float>& b) { \
+ return cvt_from_fp32<type>(__m512(a), __m512(b)); \
+}
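+// For reference (added note): the invocations below generate, e.g. for
+// (BFloat16, bfloat16), the pair
+//   std::tuple<Vectorized<float>, Vectorized<float>> convert_bfloat16_float(const Vectorized<BFloat16>&);
+//   Vectorized<BFloat16> convert_float_bfloat16(const Vectorized<float>&, const Vectorized<float>&);
+// i.e. one 32-lane reduced-precision vector maps to two 16-lane fp32 vectors and back.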
+CONVERT_VECTORIZED_INIT(BFloat16, bfloat16);
+CONVERT_VECTORIZED_INIT(Half, half);
+
+#else //defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
+
+#define CONVERT_NON_VECTORIZED_INIT(type, name) \
+inline std::tuple<Vectorized<float>, Vectorized<float>> convert_##name##_float(const Vectorized<type>& a) { \
+  constexpr int64_t K = Vectorized<type>::size(); \
+  __at_align__ float arr[K]; \
+  __at_align__ type arr2[K]; \
+  a.store(arr2); \
+  for (const auto k : c10::irange(K)) { \
+    arr[k] = c10::convert<float>(arr2[k]); \
+  } \
+  return std::make_tuple( \
+      Vectorized<float>::loadu(arr), \
+      Vectorized<float>::loadu(arr + Vectorized<float>::size())); \
+} \
+\
+inline Vectorized<type> convert_float_##name(const Vectorized<float>& a, const Vectorized<float>& b) { \
+  constexpr int64_t K = Vectorized<type>::size(); \
+  __at_align__ float arr[K]; \
+  __at_align__ type arr2[K]; \
+  a.store(arr); \
+  b.store(arr + Vectorized<float>::size()); \
+  for (const auto k : c10::irange(K)) { \
+    arr2[k] = c10::convert<type>(arr[k]); \
+  } \
+  return Vectorized<type>::loadu(arr2); \
+}
+CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16);
+CONVERT_NON_VECTORIZED_INIT(Half, half);
+
+#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
+
+#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
+#define LOAD_FP32_VECTORIZED_INIT(type, name) \
+inline void load_fp32_from_##name(const type *data, Vectorized<float>& out) { \
+  auto values = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(data)); \
+  __m512 out_values; \
+  cvt_to_fp32<type>(values, out_values); \
+  out = out_values; \
+} \
+\
+inline void load_fp32_from_##name(const type *data, Vectorized<float>& out1, Vectorized<float>& out2) { \
+  auto vec = Vectorized<type>::loadu(data); \
+  __m512 out1_values, out2_values; \
+  cvt_to_fp32<type>(vec, out1_values, out2_values); \
+  out1 = out1_values; \
+  out2 = out2_values; \
+}
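+// Usage sketch (added illustration; variable names are hypothetical):
+//   const BFloat16* p = ...;
+//   Vectorized<float> lo, hi;
+//   load_fp32_from_bf16(p, lo, hi);  // lo holds p[0..15] as fp32, hi holds p[16..31]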
+LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16);
+LOAD_FP32_VECTORIZED_INIT(Half, fp16);
+
+#else // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
+#define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \
+inline void load_fp32_from_##name(const type *data, Vectorized<float>& out) { \
+  __at_align__ float values[Vectorized<float>::size()]; \
+  for (const auto k : c10::irange(Vectorized<float>::size())) { \
+    values[k] = data[k]; \
+  } \
+  out = Vectorized<float>::loadu(values); \
+} \
+\
+inline void load_fp32_from_##name(const type *data, Vectorized<float>& out1, Vectorized<float>& out2) { \
+  load_fp32_from_##name(data, out1); \
+  data += Vectorized<float>::size(); \
+  load_fp32_from_##name(data, out2); \
+}
+LOAD_FP32_NON_VECTORIZED_INIT(BFloat16, bf16);
+LOAD_FP32_NON_VECTORIZED_INIT(Half, fp16);
+
+#endif
+}}}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_complex_double.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_complex_double.h
new file mode 100644
index 0000000000000000000000000000000000000000..4455017576f4cc640ef92a6a1024cf2e31a0746c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_complex_double.h
@@ -0,0 +1,512 @@
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <c10/util/complex.h>
+#include <c10/util/irange.h>
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
+#include <sleef.h>
+#endif
+
+namespace at {
+namespace vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
+
+template <> class Vectorized<c10::complex<double>> {
+private:
+  __m512d values;
+  static constexpr __m512i zero_vector {0, 0, 0, 0, 0, 0, 0, 0};
+public:
+  using value_type = c10::complex<double>;
+  using size_type = int;
+  static constexpr size_type size() {
+    return 4;
+  }
+  Vectorized() {}
+  Vectorized(__m512d v) : values(v) {}
+  Vectorized(c10::complex<double> val) {
+    double real_value = val.real();
+    double imag_value = val.imag();
+    values = _mm512_setr_pd(real_value, imag_value, real_value, imag_value,
+                            real_value, imag_value, real_value, imag_value);
+  }
+  Vectorized(c10::complex<double> val1, c10::complex<double> val2,
+            c10::complex<double> val3, c10::complex<double> val4) {
+    values = _mm512_setr_pd(val1.real(), val1.imag(),
+                            val2.real(), val2.imag(),
+                            val3.real(), val3.imag(),
+                            val4.real(), val4.imag());
+  }
+  operator __m512d() const {
+    return values;
+  }
+  template <int64_t mask>
+  static Vectorized<c10::complex<double>> blend(const Vectorized<c10::complex<double>>& a,
+                                                const Vectorized<c10::complex<double>>& b) {
+     // convert c10::complex index mask to V index mask: xy -> xxyy
+    // NOLINTNEXTLINE(clang-diagnostic-warning)
+    switch (mask) {
+      case 0:
+        return a;
+      case 1:
+        return _mm512_mask_blend_pd(0x03, a.values, b.values); //b0000 0001 = b0000 0011
+      case 2:
+        return _mm512_mask_blend_pd(0x0C, a.values, b.values); //b0000 0010 = b0000 1100
+      case 3:
+        return _mm512_mask_blend_pd(0x0F, a.values, b.values); //b0000 0011 = b0000 1111
+      case 4:
+        return _mm512_mask_blend_pd(0x30, a.values, b.values); //b0000 0100 = b0011 0000
+      case 5:
+        return _mm512_mask_blend_pd(0x33, a.values, b.values); //b0000 0101 = b0011 0011
+      case 6:
+        return _mm512_mask_blend_pd(0x3C, a.values, b.values); //b0000 0110 = b0011 1100
+      case 7:
+        return _mm512_mask_blend_pd(0x3F, a.values, b.values); //b0000 0111 = b0011 1111
+      case 8:
+        return _mm512_mask_blend_pd(0xC0, a.values, b.values); //b0000 1000 = b1100 0000
+      case 9:
+        return _mm512_mask_blend_pd(0xC3, a.values, b.values); //b0000 1001 = b1100 0011
+      case 10:
+        return _mm512_mask_blend_pd(0xCC, a.values, b.values); //b0000 1010 = b1100 1100
+      case 11:
+        return _mm512_mask_blend_pd(0xCF, a.values, b.values); //b0000 1011 = b1100 1111
+      case 12:
+        return _mm512_mask_blend_pd(0xF0, a.values, b.values); //b0000 1100 = b1111 0000
+      case 13:
+        return _mm512_mask_blend_pd(0xF3, a.values, b.values); //b0000 1101 = b1111 0011
+      case 14:
+        return _mm512_mask_blend_pd(0xFC, a.values, b.values); //b0000 1110 = b1111 1100
+      case 15:
+        return _mm512_mask_blend_pd(0xFF, a.values, b.values); //b0000 1111 = b1111 1111
+    }
+    return b;
+  }
+  static Vectorized<c10::complex<double>> blendv(const Vectorized<c10::complex<double>>& a,
+                                                 const Vectorized<c10::complex<double>>& b,
+                                                 const Vectorized<c10::complex<double>>& mask) {
+    // convert c10::complex index mask to V index mask: xy -> xxyy
+    auto mask_ = _mm512_unpacklo_pd(mask.values, mask.values);
+    auto all_ones = _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF);
+    auto mmask = _mm512_cmp_epi64_mask(_mm512_castpd_si512(mask_), all_ones, _MM_CMPINT_EQ);
+    return _mm512_mask_blend_pd(mmask, a.values, b.values);
+  }
+  template <typename step_t>
+  static Vectorized<c10::complex<double>> arange(c10::complex<double> base = 0.,
+                                                 step_t step = static_cast<step_t>(1)) {
+    return Vectorized<c10::complex<double>>(base,
+                                            base + c10::complex<double>(1)*step,
+                                            base + c10::complex<double>(2)*step,
+                                            base + c10::complex<double>(3)*step);
+  }
+  static Vectorized<c10::complex<double>> set(const Vectorized<c10::complex<double>>& a,
+                                              const Vectorized<c10::complex<double>>& b,
+                                              int64_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+    }
+    return b;
+  }
+  static Vectorized<c10::complex<double>> loadu(const void* ptr, int64_t count = size()) {
+    if (count == size())
+      return _mm512_loadu_pd(reinterpret_cast<const double*>(ptr));
+
+    __at_align__ double tmp_values[2*size()];
+    // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
+    // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
+    // instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(2*size())) {
+      tmp_values[i] = 0.0;
+    }
+    std::memcpy(
+        tmp_values,
+        reinterpret_cast<const double*>(ptr),
+        count * sizeof(c10::complex<double>));
+    return _mm512_load_pd(tmp_values);
+  }
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      _mm512_storeu_pd(reinterpret_cast<double*>(ptr), values);
+    } else if (count > 0) {
+      double tmp_values[2*size()];
+      _mm512_storeu_pd(reinterpret_cast<double*>(tmp_values), values);
+      std::memcpy(ptr, tmp_values, count * sizeof(c10::complex<double>));
+    }
+  }
+  const c10::complex<double>& operator[](int idx) const  = delete;
+  c10::complex<double>& operator[](int idx) = delete;
+  Vectorized<c10::complex<double>> map(c10::complex<double> (*const f)(const c10::complex<double> &)) const {
+    __at_align__ c10::complex<double> tmp[size()];
+    store(tmp);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = f(tmp[i]);
+    }
+    return loadu(tmp);
+  }
+  // AVX512 doesn't have horizontal add & horizontal sub instructions.
+  // TODO: hadd_pd() & hsub_pd() may have scope for improvement.
+  static inline __m512d hadd_pd(__m512d a, __m512d b) {
+  __m512i idx1 = _mm512_set_epi64(14, 6, 12, 4, 10, 2, 8, 0);
+  __m512i idx2 = _mm512_set_epi64(15, 7, 13, 5, 11, 3, 9, 1);
+  return _mm512_add_pd(_mm512_mask_permutex2var_pd(a, 0xff, idx1, b),
+                       _mm512_mask_permutex2var_pd(a, 0xff, idx2, b));
+  }
+  static inline __m512d hsub_pd(__m512d a, __m512d b) {
+  __m512i idx1 = _mm512_set_epi64(14, 6, 12, 4, 10, 2, 8, 0);
+  __m512i idx2 = _mm512_set_epi64(15, 7, 13, 5, 11, 3, 9, 1);
+  return _mm512_sub_pd(_mm512_mask_permutex2var_pd(a, 0xff, idx1, b),
+                       _mm512_mask_permutex2var_pd(a, 0xff, idx2, b));
+  }
+  __m512d abs_2_() const {
+    auto val_2 = _mm512_mul_pd(values, values);     // a*a     b*b
+    return hadd_pd(val_2, val_2);            // a*a+b*b a*a+b*b
+  }
+  __m512d abs_() const {
+    auto real = _mm512_movedup_pd(values);        // real real
+    // movehdup_pd does not exist...
+    auto imag = _mm512_permute_pd(values, 0xff);  // imag imag
+    return Sleef_hypotd8_u05(real, imag);         // abs  abs
+  }
+  Vectorized<c10::complex<double>> abs() const {
+    const __m512d real_mask = _mm512_castsi512_pd(_mm512_setr_epi64(0xFFFFFFFFFFFFFFFF, 0x0000000000000000,
+                                                                    0xFFFFFFFFFFFFFFFF, 0x0000000000000000,
+                                                                    0xFFFFFFFFFFFFFFFF, 0x0000000000000000,
+                                                                    0xFFFFFFFFFFFFFFFF, 0x0000000000000000));
+    return _mm512_and_pd(abs_(), real_mask);        // abs     0
+  }
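+  // Added example (illustrative): for a lane holding z = 3 + 4i, abs_() yields
+  // hypot(3, 4) = 5 in both slots of the pair, and abs() masks the imaginary
+  // slot, producing the complex value 5 + 0i.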
+  __m512d angle_() const {
+    //angle = atan2(b/a)
+    auto b_a = _mm512_permute_pd(values, 0x55);     // b        a
+    return Sleef_atan2d8_u10(values, b_a);          // 90-angle angle
+  }
+  Vectorized<c10::complex<double>> angle() const {
+    const __m512d real_mask = _mm512_castsi512_pd(_mm512_setr_epi64(0xFFFFFFFFFFFFFFFF, 0x0000000000000000,
+                                                                    0xFFFFFFFFFFFFFFFF, 0x0000000000000000,
+                                                                    0xFFFFFFFFFFFFFFFF, 0x0000000000000000,
+                                                                    0xFFFFFFFFFFFFFFFF, 0x0000000000000000));
+    auto angle = _mm512_permute_pd(angle_(), 0x55); // angle    90-angle
+    return _mm512_and_pd(angle, real_mask);         // angle    0
+  }
+  Vectorized<c10::complex<double>> sgn() const {
+    auto abs = abs_();
+    auto zero = _mm512_setzero_pd();
+    auto mask = _mm512_cmp_pd_mask(abs, zero, _CMP_EQ_OQ);
+    auto div = values / abs;
+    return _mm512_mask_blend_pd(mask, div, zero);
+  }
+  __m512d real_() const {
+    const __m512d real_mask = _mm512_castsi512_pd(_mm512_setr_epi64(0xFFFFFFFFFFFFFFFF, 0x0000000000000000,
+                                                                    0xFFFFFFFFFFFFFFFF, 0x0000000000000000,
+                                                                    0xFFFFFFFFFFFFFFFF, 0x0000000000000000,
+                                                                    0xFFFFFFFFFFFFFFFF, 0x0000000000000000));
+    return _mm512_and_pd(values, real_mask);
+  }
+  Vectorized<c10::complex<double>> real() const {
+    return real_();
+  }
+  __m512d imag_() const {
+    const __m512d imag_mask = _mm512_castsi512_pd(_mm512_setr_epi64(0x0000000000000000, 0xFFFFFFFFFFFFFFFF,
+                                                                    0x0000000000000000, 0xFFFFFFFFFFFFFFFF,
+                                                                    0x0000000000000000, 0xFFFFFFFFFFFFFFFF,
+                                                                    0x0000000000000000, 0xFFFFFFFFFFFFFFFF));
+    return _mm512_and_pd(values, imag_mask);
+  }
+  Vectorized<c10::complex<double>> imag() const {
+    return _mm512_permute_pd(imag_(), 0x55);           //b        a
+  }
+  __m512d conj_() const {
+    const __m512d sign_mask = _mm512_setr_pd(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0);
+    return _mm512_xor_pd(values, sign_mask);           // a       -b
+  }
+  Vectorized<c10::complex<double>> conj() const {
+    return conj_();
+  }
+  Vectorized<c10::complex<double>> log() const {
+    // Most trigonometric ops use the log() op to improve complex number performance.
+    return map(std::log);
+  }
+  Vectorized<c10::complex<double>> log2() const {
+    const __m512d log2_ = _mm512_set1_pd(std::log(2));
+    return _mm512_div_pd(log(), log2_);
+  }
+  Vectorized<c10::complex<double>> log10() const {
+    const __m512d log10_ = _mm512_set1_pd(std::log(10));
+    return _mm512_div_pd(log(), log10_);
+  }
+  Vectorized<c10::complex<double>> log1p() const {
+    return map(std::log1p);
+  }
+  Vectorized<c10::complex<double>> asin() const {
+    // asin(x)
+    // = -i*ln(iz + sqrt(1 -z^2))
+    // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi)))
+    // = -i*ln((-b + ai) + sqrt(1 - (a**2 - b**2) - 2*abi))
+    const __m512d one = _mm512_set1_pd(1);
+
+    auto conj = conj_();
+    auto b_a = _mm512_permute_pd(conj, 0x55);                         //-b        a
+    auto ab = _mm512_mul_pd(conj, b_a);                               //-ab       -ab
+    auto im = _mm512_add_pd(ab, ab);                                  //-2ab      -2ab
+
+    auto val_2 = _mm512_mul_pd(values, values);                       // a*a      b*b
+    auto re = hsub_pd(val_2, _mm512_permute_pd(val_2, 0x55));  // a*a-b*b  b*b-a*a
+    re = _mm512_sub_pd(one, re);
+
+    auto root = Vectorized(_mm512_mask_blend_pd(0xAA, re, im)).sqrt();         //sqrt(re + i*im)
+    auto ln = Vectorized(_mm512_add_pd(b_a, root)).log();                 //ln(iz + sqrt())
+    return Vectorized(_mm512_permute_pd(ln.values, 0x55)).conj();         //-i*ln()
+  }
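+  // Added sanity check (illustrative): for z = 0 the formula reduces to
+  // -i*ln(0 + sqrt(1 - 0)) = -i*ln(1) = 0, matching std::asin(0).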
+  Vectorized<c10::complex<double>> acos() const {
+    // acos(x) = pi/2 - asin(x)
+    constexpr auto pi_2d = c10::pi<double> / 2;
+    const __m512d pi_2 = _mm512_setr_pd(pi_2d, 0.0, pi_2d, 0.0, pi_2d, 0.0, pi_2d, 0.0);
+    return _mm512_sub_pd(pi_2, asin());
+  }
+  Vectorized<c10::complex<double>> atan() const;
+  Vectorized<c10::complex<double>> atanh() const {
+    return map(std::atanh);
+  }
+  Vectorized<c10::complex<double>> exp() const {
+    //exp(a + bi)
+    // = exp(a)*(cos(b) + sin(b)i)
+    auto exp = Sleef_expd8_u10(values);                               //exp(a)           exp(b)
+    exp = _mm512_mask_blend_pd(0xAA, exp, _mm512_permute_pd(exp, 0x55));   //exp(a)           exp(a)
+
+    auto sin_cos = Sleef_sincosd8_u10(values);                        //[sin(a), cos(a)] [sin(b), cos(b)]
+    auto cos_sin = _mm512_mask_blend_pd(0xAA, _mm512_permute_pd(sin_cos.y, 0x55),
+                                   sin_cos.x);                  //cos(b)           sin(b)
+    return _mm512_mul_pd(exp, cos_sin);
+  }
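+  // Added example (illustrative): exp(0 + pi*i) = exp(0)*(cos(pi) + i*sin(pi)),
+  // so that lane evaluates to approximately -1 + 0i (Euler's formula).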
+  Vectorized<c10::complex<double>> exp2() const {
+    // Use identity 2**x = exp(log(2) * x)
+    const __m512d ln_2 = _mm512_set1_pd(c10::ln_2<double>);
+    Vectorized<c10::complex<double>> scaled_values = _mm512_mul_pd(values, ln_2);
+    return scaled_values.exp();
+  }
+  Vectorized<c10::complex<double>> expm1() const {
+    return map(std::expm1);
+  }
+  Vectorized<c10::complex<double>> sin() const {
+    return map(std::sin);
+  }
+  Vectorized<c10::complex<double>> sinh() const {
+    return map(std::sinh);
+  }
+  Vectorized<c10::complex<double>> cos() const {
+    return map(std::cos);
+  }
+  Vectorized<c10::complex<double>> cosh() const {
+    return map(std::cosh);
+  }
+  Vectorized<c10::complex<double>> ceil() const {
+    return _mm512_ceil_pd(values);
+  }
+  Vectorized<c10::complex<double>> floor() const {
+    return _mm512_floor_pd(values);
+  }
+  Vectorized<c10::complex<double>> neg() const {
+    auto zero = _mm512_setzero_pd();
+    return _mm512_sub_pd(zero, values);
+  }
+  Vectorized<c10::complex<double>> round() const {
+    return _mm512_roundscale_pd(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+  }
+  Vectorized<c10::complex<double>> tan() const {
+    return map(std::tan);
+  }
+  Vectorized<c10::complex<double>> tanh() const {
+    return map(std::tanh);
+  }
+  Vectorized<c10::complex<double>> trunc() const {
+    return _mm512_roundscale_pd(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
+  }
+  Vectorized<c10::complex<double>> sqrt() const {
+    return map(std::sqrt);
+  }
+  Vectorized<c10::complex<double>> reciprocal() const;
+  Vectorized<c10::complex<double>> rsqrt() const {
+    return sqrt().reciprocal();
+  }
+  Vectorized<c10::complex<double>> pow(const Vectorized<c10::complex<double>> &exp) const {
+    __at_align__ c10::complex<double> x_tmp[size()];
+    __at_align__ c10::complex<double> y_tmp[size()];
+    store(x_tmp);
+    exp.store(y_tmp);
+    for (const auto i : c10::irange(size())) {
+      x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]);
+    }
+    return loadu(x_tmp);
+  }
+  // Comparison using the _CMP_**_OQ predicate.
+  //   `O`: get false if an operand is NaN
+  //   `Q`: do not raise if an operand is NaN
+  Vectorized<c10::complex<double>> operator==(const Vectorized<c10::complex<double>>& other) const {
+    auto mask = _mm512_cmp_pd_mask(values, other.values, _CMP_EQ_OQ);
+    return _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vector, mask,
+                                                      0xFFFFFFFFFFFFFFFF));
+  }
+  Vectorized<c10::complex<double>> operator!=(const Vectorized<c10::complex<double>>& other) const {
+    auto mask = _mm512_cmp_pd_mask(values, other.values, _CMP_NEQ_UQ);
+    return _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vector, mask,
+                                                      0xFFFFFFFFFFFFFFFF));
+  }
+  Vectorized<c10::complex<double>> operator<(const Vectorized<c10::complex<double>>& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+  Vectorized<c10::complex<double>> operator<=(const Vectorized<c10::complex<double>>& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+  Vectorized<c10::complex<double>> operator>(const Vectorized<c10::complex<double>>& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+  Vectorized<c10::complex<double>> operator>=(const Vectorized<c10::complex<double>>& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+
+  Vectorized<c10::complex<double>> eq(const Vectorized<c10::complex<double>>& other) const;
+  Vectorized<c10::complex<double>> ne(const Vectorized<c10::complex<double>>& other) const;
+};
+
+template <> Vectorized<c10::complex<double>> inline operator+(const Vectorized<c10::complex<double>> &a,
+                                                              const Vectorized<c10::complex<double>> &b) {
+  return _mm512_add_pd(a, b);
+}
+
+template <> Vectorized<c10::complex<double>> inline operator-(const Vectorized<c10::complex<double>> &a,
+                                                              const Vectorized<c10::complex<double>> &b) {
+  return _mm512_sub_pd(a, b);
+}
+
+template <> Vectorized<c10::complex<double>> inline operator*(const Vectorized<c10::complex<double>> &a,
+                                                              const Vectorized<c10::complex<double>> &b) {
+  //(a + bi)  * (c + di) = (ac - bd) + (ad + bc)i
+  const __m512d sign_mask = _mm512_setr_pd(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0);
+  auto ac_bd = _mm512_mul_pd(a, b);         //ac       bd
+
+  auto d_c = _mm512_permute_pd(b, 0x55);    //d        c
+  d_c = _mm512_xor_pd(sign_mask, d_c);      //d       -c
+  auto ad_bc = _mm512_mul_pd(a, d_c);       //ad      -bc
+
+  auto ret = Vectorized<c10::complex<double>>::hsub_pd(ac_bd, ad_bc);  //ac - bd  ad + bc
+  return ret;
+}
+
+template <> Vectorized<c10::complex<double>> inline operator/(const Vectorized<c10::complex<double>> &a,
+                                                              const Vectorized<c10::complex<double>> &b) {
+  //re + im*i = (a + bi)  / (c + di)
+  auto mask = _mm512_set1_pd(-0.f);
+  auto fabs_cd = _mm512_andnot_pd(mask, b);     // |c|    |d|
+  auto fabs_dc = _mm512_permute_pd(fabs_cd, 0x55);   // |d|    |c|
+  auto scale = _mm512_rcp14_pd(_mm512_max_pd(fabs_cd, fabs_dc));  // 1/sc     1/sc
+  auto a2 = _mm512_mul_pd(a, scale);         // a/sc     b/sc
+  auto b2 = _mm512_mul_pd(b, scale);         // c/sc     d/sc
+  auto acbd2 = _mm512_mul_pd(a2, b2);
+
+  const __m512d sign_mask = _mm512_setr_pd(-0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0);
+  auto dc2 = _mm512_permute_pd(b2, 0x55);    // d/sc         c/sc
+  dc2 = _mm512_xor_pd(sign_mask, dc2);       // -d/|c,d|        c/sc
+  auto adbc2 = _mm512_mul_pd(a2, dc2);       //-ad/sc^2      bc/sc^2
+  auto res2 = Vectorized<c10::complex<double>>::hadd_pd(acbd2, adbc2);  //(ac+bd)/sc^2  (bc-ad)/sc^2
+
+  // get the denominator
+  auto denom2 = Vectorized<c10::complex<double>>(b2).abs_2_();  // (c^2+d^2)/sc^2   (c^2+d^2)/sc^2
+  res2 = _mm512_div_pd(res2, denom2);
+  return res2;
+}
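+// The operands are pre-scaled by ~1/max(|c|, |d|) (via _mm512_rcp14_pd) before the
+// multiplies so that c^2 + d^2 stays in range; the scale factor appears in both the
+// numerator and the abs_2_() denominator, so it cancels in the final division.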
+
+// reciprocal. Implement this here so we can use multiplication.
+inline Vectorized<c10::complex<double>> Vectorized<c10::complex<double>>::reciprocal() const{
+  //re + im*i = (a + bi)  / (c + di)
+  //re = (ac + bd)/abs_2() = c/abs_2()
+  //im = (bc - ad)/abs_2() = d/abs_2()
+  const __m512d sign_mask = _mm512_setr_pd(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0);
+  auto c_d = _mm512_xor_pd(sign_mask, values);    //c       -d
+  return _mm512_div_pd(c_d, abs_2_());
+}
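+// i.e. 1/(c + di) = (c - di) / (c^2 + d^2): the sign mask conjugates the value and
+// abs_2_() supplies the squared magnitude in both lanes of each pair.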
+
+inline Vectorized<c10::complex<double>> Vectorized<c10::complex<double>>::atan() const {
+  // atan(x) = i/2 * ln((i + z)/(i - z))
+  const __m512d i = _mm512_setr_pd(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
+  const Vectorized i_half = _mm512_setr_pd(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5);
+
+  auto sum = Vectorized(_mm512_add_pd(i, values));                      // a        1+b
+  auto sub = Vectorized(_mm512_sub_pd(i, values));                      // -a       1-b
+  auto ln = (sum/sub).log();                                        // ln((i + z)/(i - z))
+  return i_half*ln;                                                 // i/2*ln()
+}
+
+template <>
+Vectorized<c10::complex<double>> inline maximum(const Vectorized<c10::complex<double>>& a,
+                                                const Vectorized<c10::complex<double>>& b) {
+  auto zero_vec = _mm512_set1_epi64(0);
+  auto abs_a = a.abs_2_();
+  auto abs_b = b.abs_2_();
+  auto mask = _mm512_cmp_pd_mask(abs_a, abs_b, _CMP_LT_OQ);
+  auto max = _mm512_mask_blend_pd(mask, a, b);
+  // Exploit the fact that all-ones is a NaN.
+  auto isnan_mask = _mm512_cmp_pd_mask(abs_a, abs_b, _CMP_UNORD_Q);
+  auto isnan = _mm512_mask_set1_epi64(zero_vec, isnan_mask,
+                                      0xFFFFFFFFFFFFFFFF);
+  return _mm512_or_pd(max, _mm512_castsi512_pd(isnan));
+}
+
+template <>
+Vectorized<c10::complex<double>> inline minimum(const Vectorized<c10::complex<double>>& a,
+                                                const Vectorized<c10::complex<double>>& b) {
+  auto zero_vec = _mm512_set1_epi64(0);
+  auto abs_a = a.abs_2_();
+  auto abs_b = b.abs_2_();
+  auto mask = _mm512_cmp_pd_mask(abs_a, abs_b, _CMP_GT_OQ);
+  auto min = _mm512_mask_blend_pd(mask, a, b);
+  // Exploit the fact that all-ones is a NaN.
+  auto isnan_mask = _mm512_cmp_pd_mask(abs_a, abs_b, _CMP_UNORD_Q);
+  auto isnan = _mm512_mask_set1_epi64(zero_vec, isnan_mask,
+                                      0xFFFFFFFFFFFFFFFF);
+  return _mm512_or_pd(min, _mm512_castsi512_pd(isnan));
+}
+
+template <>
+Vectorized<c10::complex<double>> inline operator&(const Vectorized<c10::complex<double>>& a,
+                                                  const Vectorized<c10::complex<double>>& b) {
+  return _mm512_and_pd(a, b);
+}
+
+template <>
+Vectorized<c10::complex<double>> inline operator|(const Vectorized<c10::complex<double>>& a,
+                                                  const Vectorized<c10::complex<double>>& b) {
+  return _mm512_or_pd(a, b);
+}
+
+template <>
+Vectorized<c10::complex<double>> inline operator^(const Vectorized<c10::complex<double>>& a,
+                                                  const Vectorized<c10::complex<double>>& b) {
+  return _mm512_xor_pd(a, b);
+}
+
+inline Vectorized<c10::complex<double>> Vectorized<c10::complex<double>>::eq(const Vectorized<c10::complex<double>>& other) const {
+  auto eq = (*this == other);  // compares real and imag individually
+  // If both real numbers and imag numbers are equal, then the complex numbers are equal
+  return (eq.real() & eq.imag()) & Vectorized<c10::complex<double>>(_mm512_set1_pd(1.0));
+}
+
+inline Vectorized<c10::complex<double>> Vectorized<c10::complex<double>>::ne(const Vectorized<c10::complex<double>>& other) const {
+  auto ne = (*this != other);  // compares real and imag individually
+  // If either real numbers or imag numbers are not equal, then the complex numbers are not equal
+  return (ne.real() | ne.imag()) & Vectorized<c10::complex<double>>(_mm512_set1_pd(1.0));
+}
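+// Note: unlike operator==/operator!= above, which return full-width bit masks per
+// scalar lane, eq()/ne() collapse the per-component comparison and AND it with 1.0,
+// so each element ends up as 1.0 + 0i when the condition holds and 0.0 + 0i otherwise.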
+
+#endif
+
+}}}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_complex_float.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_complex_float.h
new file mode 100644
index 0000000000000000000000000000000000000000..14dfb24e3efeec03417796a6ab15d26d8f2c84b1
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_complex_float.h
@@ -0,0 +1,1018 @@
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <c10/util/complex.h>
+#include <c10/util/irange.h>
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
+#include <sleef.h>
+#endif
+
+namespace at {
+namespace vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
+
+template <> class Vectorized<c10::complex<float>> {
+private:
+  __m512 values;
+  static constexpr __m512i zero_vector {0, 0, 0, 0, 0, 0, 0, 0};
+public:
+  using value_type = c10::complex<float>;
+  using size_type = int;
+  static constexpr size_type size() {
+    return 8;
+  }
+  Vectorized() {}
+  Vectorized(__m512 v) : values(v) {}
+  Vectorized(c10::complex<float> val) {
+    float real_value = val.real();
+    float imag_value = val.imag();
+    values = _mm512_setr_ps(real_value, imag_value,
+                            real_value, imag_value,
+                            real_value, imag_value,
+                            real_value, imag_value,
+                            real_value, imag_value,
+                            real_value, imag_value,
+                            real_value, imag_value,
+                            real_value, imag_value);
+  }
+  Vectorized(c10::complex<float> val1, c10::complex<float> val2,
+            c10::complex<float> val3, c10::complex<float> val4,
+            c10::complex<float> val5, c10::complex<float> val6,
+            c10::complex<float> val7, c10::complex<float> val8) {
+    values = _mm512_setr_ps(val1.real(), val1.imag(),
+                            val2.real(), val2.imag(),
+                            val3.real(), val3.imag(),
+                            val4.real(), val4.imag(),
+                            val5.real(), val5.imag(),
+                            val6.real(), val6.imag(),
+                            val7.real(), val7.imag(),
+                            val8.real(), val8.imag());
+  }
+  operator __m512() const {
+    return values;
+  }
+  template <int64_t mask>
+  static Vectorized<c10::complex<float>> blend(const Vectorized<c10::complex<float>>& a,
+                                               const Vectorized<c10::complex<float>>& b) {
+    // convert c10::complex index mask to V index mask: xy -> xxyy
+    static_assert(mask > -1 && mask < 256, "Unexpected mask value");
+    // The compiler would hopefully convert this switch condition
+    // into a jump table
+    switch (mask) {
+      case 0:
+        return a;
+      case 1:
+        return _mm512_mask_blend_ps(0x03, a.values, b.values);
+      case 2:
+        return _mm512_mask_blend_ps(0x0C, a.values, b.values);
+      case 3:
+        return _mm512_mask_blend_ps(0x0F, a.values, b.values);
+      case 4:
+        return _mm512_mask_blend_ps(0x30, a.values, b.values);
+      case 5:
+        return _mm512_mask_blend_ps(0x33, a.values, b.values);
+      case 6:
+        return _mm512_mask_blend_ps(0x3C, a.values, b.values);
+      case 7:
+        return _mm512_mask_blend_ps(0x3F, a.values, b.values);
+      case 8:
+        return _mm512_mask_blend_ps(0xC0, a.values, b.values);
+      case 9:
+        return _mm512_mask_blend_ps(0xC3, a.values, b.values);
+      case 10:
+        return _mm512_mask_blend_ps(0xCC, a.values, b.values);
+      case 11:
+        return _mm512_mask_blend_ps(0xCF, a.values, b.values);
+      case 12:
+        return _mm512_mask_blend_ps(0xF0, a.values, b.values);
+      case 13:
+        return _mm512_mask_blend_ps(0xF3, a.values, b.values);
+      case 14:
+        return _mm512_mask_blend_ps(0xFC, a.values, b.values);
+      case 15:
+        return _mm512_mask_blend_ps(0xFF, a.values, b.values);
+      case 16:
+        return _mm512_mask_blend_ps(0x300, a.values, b.values);
+      case 17:
+        return _mm512_mask_blend_ps(0x303, a.values, b.values);
+      case 18:
+        return _mm512_mask_blend_ps(0x30C, a.values, b.values);
+      case 19:
+        return _mm512_mask_blend_ps(0x30F, a.values, b.values);
+      case 20:
+        return _mm512_mask_blend_ps(0x330, a.values, b.values);
+      case 21:
+        return _mm512_mask_blend_ps(0x333, a.values, b.values);
+      case 22:
+        return _mm512_mask_blend_ps(0x33C, a.values, b.values);
+      case 23:
+        return _mm512_mask_blend_ps(0x33F, a.values, b.values);
+      case 24:
+        return _mm512_mask_blend_ps(0x3C0, a.values, b.values);
+      case 25:
+        return _mm512_mask_blend_ps(0x3C3, a.values, b.values);
+      case 26:
+        return _mm512_mask_blend_ps(0x3CC, a.values, b.values);
+      case 27:
+        return _mm512_mask_blend_ps(0x3CF, a.values, b.values);
+      case 28:
+        return _mm512_mask_blend_ps(0x3F0, a.values, b.values);
+      case 29:
+        return _mm512_mask_blend_ps(0x3F3, a.values, b.values);
+      case 30:
+        return _mm512_mask_blend_ps(0x3FC, a.values, b.values);
+      case 31:
+        return _mm512_mask_blend_ps(0x3FF, a.values, b.values);
+      case 32:
+        return _mm512_mask_blend_ps(0xC00, a.values, b.values);
+      case 33:
+        return _mm512_mask_blend_ps(0xC03, a.values, b.values);
+      case 34:
+        return _mm512_mask_blend_ps(0xC0C, a.values, b.values);
+      case 35:
+        return _mm512_mask_blend_ps(0xC0F, a.values, b.values);
+      case 36:
+        return _mm512_mask_blend_ps(0xC30, a.values, b.values);
+      case 37:
+        return _mm512_mask_blend_ps(0xC33, a.values, b.values);
+      case 38:
+        return _mm512_mask_blend_ps(0xC3C, a.values, b.values);
+      case 39:
+        return _mm512_mask_blend_ps(0xC3F, a.values, b.values);
+      case 40:
+        return _mm512_mask_blend_ps(0xCC0, a.values, b.values);
+      case 41:
+        return _mm512_mask_blend_ps(0xCC3, a.values, b.values);
+      case 42:
+        return _mm512_mask_blend_ps(0xCCC, a.values, b.values);
+      case 43:
+        return _mm512_mask_blend_ps(0xCCF, a.values, b.values);
+      case 44:
+        return _mm512_mask_blend_ps(0xCF0, a.values, b.values);
+      case 45:
+        return _mm512_mask_blend_ps(0xCF3, a.values, b.values);
+      case 46:
+        return _mm512_mask_blend_ps(0xCFC, a.values, b.values);
+      case 47:
+        return _mm512_mask_blend_ps(0xCFF, a.values, b.values);
+      case 48:
+        return _mm512_mask_blend_ps(0xF00, a.values, b.values);
+      case 49:
+        return _mm512_mask_blend_ps(0xF03, a.values, b.values);
+      case 50:
+        return _mm512_mask_blend_ps(0xF0C, a.values, b.values);
+      case 51:
+        return _mm512_mask_blend_ps(0xF0F, a.values, b.values);
+      case 52:
+        return _mm512_mask_blend_ps(0xF30, a.values, b.values);
+      case 53:
+        return _mm512_mask_blend_ps(0xF33, a.values, b.values);
+      case 54:
+        return _mm512_mask_blend_ps(0xF3C, a.values, b.values);
+      case 55:
+        return _mm512_mask_blend_ps(0xF3F, a.values, b.values);
+      case 56:
+        return _mm512_mask_blend_ps(0xFC0, a.values, b.values);
+      case 57:
+        return _mm512_mask_blend_ps(0xFC3, a.values, b.values);
+      case 58:
+        return _mm512_mask_blend_ps(0xFCC, a.values, b.values);
+      case 59:
+        return _mm512_mask_blend_ps(0xFCF, a.values, b.values);
+      case 60:
+        return _mm512_mask_blend_ps(0xFF0, a.values, b.values);
+      case 61:
+        return _mm512_mask_blend_ps(0xFF3, a.values, b.values);
+      case 62:
+        return _mm512_mask_blend_ps(0xFFC, a.values, b.values);
+      case 63:
+        return _mm512_mask_blend_ps(0xFFF, a.values, b.values);
+      case 64:
+        return _mm512_mask_blend_ps(0x3000, a.values, b.values);
+      case 65:
+        return _mm512_mask_blend_ps(0x3003, a.values, b.values);
+      case 66:
+        return _mm512_mask_blend_ps(0x300C, a.values, b.values);
+      case 67:
+        return _mm512_mask_blend_ps(0x300F, a.values, b.values);
+      case 68:
+        return _mm512_mask_blend_ps(0x3030, a.values, b.values);
+      case 69:
+        return _mm512_mask_blend_ps(0x3033, a.values, b.values);
+      case 70:
+        return _mm512_mask_blend_ps(0x303C, a.values, b.values);
+      case 71:
+        return _mm512_mask_blend_ps(0x303F, a.values, b.values);
+      case 72:
+        return _mm512_mask_blend_ps(0x30C0, a.values, b.values);
+      case 73:
+        return _mm512_mask_blend_ps(0X30C3, a.values, b.values);
+      case 74:
+        return _mm512_mask_blend_ps(0x30CC, a.values, b.values);
+      case 75:
+        return _mm512_mask_blend_ps(0x30CF, a.values, b.values);
+      case 76:
+        return _mm512_mask_blend_ps(0x30F0, a.values, b.values);
+      case 77:
+        return _mm512_mask_blend_ps(0x30F3, a.values, b.values);
+      case 78:
+        return _mm512_mask_blend_ps(0x30FC, a.values, b.values);
+      case 79:
+        return _mm512_mask_blend_ps(0x30FF, a.values, b.values);
+      case 80:
+        return _mm512_mask_blend_ps(0x3300, a.values, b.values);
+      case 81:
+        return _mm512_mask_blend_ps(0X3303, a.values, b.values);
+      case 82:
+        return _mm512_mask_blend_ps(0x330C, a.values, b.values);
+      case 83:
+        return _mm512_mask_blend_ps(0x330F, a.values, b.values);
+      case 84:
+        return _mm512_mask_blend_ps(0x3330, a.values, b.values);
+      case 85:
+        return _mm512_mask_blend_ps(0x3333, a.values, b.values);
+      case 86:
+        return _mm512_mask_blend_ps(0x333C, a.values, b.values);
+      case 87:
+        return _mm512_mask_blend_ps(0X333F, a.values, b.values);
+      case 88:
+        return _mm512_mask_blend_ps(0x33C0, a.values, b.values);
+      case 89:
+        return _mm512_mask_blend_ps(0x33C3, a.values, b.values);
+      case 90:
+        return _mm512_mask_blend_ps(0x33CC, a.values, b.values);
+      case 91:
+        return _mm512_mask_blend_ps(0x33CF, a.values, b.values);
+      case 92:
+        return _mm512_mask_blend_ps(0x33F0, a.values, b.values);
+      case 93:
+        return _mm512_mask_blend_ps(0x33F3, a.values, b.values);
+      case 94:
+        return _mm512_mask_blend_ps(0x33FC, a.values, b.values);
+      case 95:
+        return _mm512_mask_blend_ps(0x33FF, a.values, b.values);
+      case 96:
+        return _mm512_mask_blend_ps(0X3C00, a.values, b.values);
+      case 97:
+        return _mm512_mask_blend_ps(0x3C03, a.values, b.values);
+      case 98:
+        return _mm512_mask_blend_ps(0x3C0C, a.values, b.values);
+      case 99:
+        return _mm512_mask_blend_ps(0x3C0F, a.values, b.values);
+      case 100:
+        return _mm512_mask_blend_ps(0x3C30, a.values, b.values);
+      case 101:
+        return _mm512_mask_blend_ps(0x3C33, a.values, b.values);
+      case 102:
+        return _mm512_mask_blend_ps(0x3C3C, a.values, b.values);
+      case 103:
+        return _mm512_mask_blend_ps(0x3C3F, a.values, b.values);
+      case 104:
+        return _mm512_mask_blend_ps(0x3CC0, a.values, b.values);
+      case 105:
+        return _mm512_mask_blend_ps(0x3CC3, a.values, b.values);
+      case 106:
+        return _mm512_mask_blend_ps(0x3CCC, a.values, b.values);
+      case 107:
+        return _mm512_mask_blend_ps(0x3CCF, a.values, b.values);
+      case 108:
+        return _mm512_mask_blend_ps(0x3CF0, a.values, b.values);
+      case 109:
+        return _mm512_mask_blend_ps(0x3CF3, a.values, b.values);
+      case 110:
+        return _mm512_mask_blend_ps(0x3CFC, a.values, b.values);
+      case 111:
+        return _mm512_mask_blend_ps(0x3CFF, a.values, b.values);
+      case 112:
+        return _mm512_mask_blend_ps(0x3F00, a.values, b.values);
+      case 113:
+        return _mm512_mask_blend_ps(0x3F03, a.values, b.values);
+      case 114:
+        return _mm512_mask_blend_ps(0x3F0C, a.values, b.values);
+      case 115:
+        return _mm512_mask_blend_ps(0x3F0F, a.values, b.values);
+      case 116:
+        return _mm512_mask_blend_ps(0x3F30, a.values, b.values);
+      case 117:
+        return _mm512_mask_blend_ps(0x3F33, a.values, b.values);
+      case 118:
+        return _mm512_mask_blend_ps(0x3F3C, a.values, b.values);
+      case 119:
+        return _mm512_mask_blend_ps(0x3F3F, a.values, b.values);
+      case 120:
+        return _mm512_mask_blend_ps(0x3FC0, a.values, b.values);
+      case 121:
+        return _mm512_mask_blend_ps(0x3FC3, a.values, b.values);
+      case 122:
+        return _mm512_mask_blend_ps(0x3FCC, a.values, b.values);
+      case 123:
+        return _mm512_mask_blend_ps(0x3FCF, a.values, b.values);
+      case 124:
+        return _mm512_mask_blend_ps(0x3FF0, a.values, b.values);
+      case 125:
+        return _mm512_mask_blend_ps(0x3FF3, a.values, b.values);
+      case 126:
+        return _mm512_mask_blend_ps(0x3FFC, a.values, b.values);
+      case 127:
+        return _mm512_mask_blend_ps(0x3FFF, a.values, b.values);
+      case 128:
+        return _mm512_mask_blend_ps(0xC000, a.values, b.values);
+      case 129:
+        return _mm512_mask_blend_ps(0xC003, a.values, b.values);
+      case 130:
+        return _mm512_mask_blend_ps(0xC00C, a.values, b.values);
+      case 131:
+        return _mm512_mask_blend_ps(0xC00F, a.values, b.values);
+      case 132:
+        return _mm512_mask_blend_ps(0xC030, a.values, b.values);
+      case 133:
+        return _mm512_mask_blend_ps(0xC033, a.values, b.values);
+      case 134:
+        return _mm512_mask_blend_ps(0xC03C, a.values, b.values);
+      case 135:
+        return _mm512_mask_blend_ps(0xC03F, a.values, b.values);
+      case 136:
+        return _mm512_mask_blend_ps(0xC0C0, a.values, b.values);
+      case 137:
+        return _mm512_mask_blend_ps(0xC0C3, a.values, b.values);
+      case 138:
+        return _mm512_mask_blend_ps(0xC0CC, a.values, b.values);
+      case 139:
+        return _mm512_mask_blend_ps(0xC0CF, a.values, b.values);
+      case 140:
+        return _mm512_mask_blend_ps(0xC0F0, a.values, b.values);
+      case 141:
+        return _mm512_mask_blend_ps(0xC0F3, a.values, b.values);
+      case 142:
+        return _mm512_mask_blend_ps(0xC0FC, a.values, b.values);
+      case 143:
+        return _mm512_mask_blend_ps(0xC0FF, a.values, b.values);
+      case 144:
+        return _mm512_mask_blend_ps(0xC300, a.values, b.values);
+      case 145:
+        return _mm512_mask_blend_ps(0xC303, a.values, b.values);
+      case 146:
+        return _mm512_mask_blend_ps(0xC30C, a.values, b.values);
+      case 147:
+        return _mm512_mask_blend_ps(0xC30F, a.values, b.values);
+      case 148:
+        return _mm512_mask_blend_ps(0xC330, a.values, b.values);
+      case 149:
+        return _mm512_mask_blend_ps(0xC333, a.values, b.values);
+      case 150:
+        return _mm512_mask_blend_ps(0xC33C, a.values, b.values);
+      case 151:
+        return _mm512_mask_blend_ps(0xC33F, a.values, b.values);
+      case 152:
+        return _mm512_mask_blend_ps(0xC3C0, a.values, b.values);
+      case 153:
+        return _mm512_mask_blend_ps(0xC3C3, a.values, b.values);
+      case 154:
+        return _mm512_mask_blend_ps(0xC3CC, a.values, b.values);
+      case 155:
+        return _mm512_mask_blend_ps(0xC3CF, a.values, b.values);
+      case 156:
+        return _mm512_mask_blend_ps(0xC3F0, a.values, b.values);
+      case 157:
+        return _mm512_mask_blend_ps(0xC3F3, a.values, b.values);
+      case 158:
+        return _mm512_mask_blend_ps(0xC3FC, a.values, b.values);
+      case 159:
+        return _mm512_mask_blend_ps(0xC3FF, a.values, b.values);
+      case 160:
+        return _mm512_mask_blend_ps(0xCC00, a.values, b.values);
+      case 161:
+        return _mm512_mask_blend_ps(0xCC03, a.values, b.values);
+      case 162:
+        return _mm512_mask_blend_ps(0xCC0C, a.values, b.values);
+      case 163:
+        return _mm512_mask_blend_ps(0xCC0F, a.values, b.values);
+      case 164:
+        return _mm512_mask_blend_ps(0xCC30, a.values, b.values);
+      case 165:
+        return _mm512_mask_blend_ps(0xCC33, a.values, b.values);
+      case 166:
+        return _mm512_mask_blend_ps(0xCC3C, a.values, b.values);
+      case 167:
+        return _mm512_mask_blend_ps(0xCC3F, a.values, b.values);
+      case 168:
+        return _mm512_mask_blend_ps(0xCCC0, a.values, b.values);
+      case 169:
+        return _mm512_mask_blend_ps(0xCCC3, a.values, b.values);
+      case 170:
+        return _mm512_mask_blend_ps(0xCCCC, a.values, b.values);
+      case 171:
+        return _mm512_mask_blend_ps(0xCCCF, a.values, b.values);
+      case 172:
+        return _mm512_mask_blend_ps(0xCCF0, a.values, b.values);
+      case 173:
+        return _mm512_mask_blend_ps(0xCCF3, a.values, b.values);
+      case 174:
+        return _mm512_mask_blend_ps(0xCCFC, a.values, b.values);
+      case 175:
+        return _mm512_mask_blend_ps(0xCCFF, a.values, b.values);
+      case 176:
+        return _mm512_mask_blend_ps(0xCF00, a.values, b.values);
+      case 177:
+        return _mm512_mask_blend_ps(0xCF03, a.values, b.values);
+      case 178:
+        return _mm512_mask_blend_ps(0xCF0C, a.values, b.values);
+      case 179:
+        return _mm512_mask_blend_ps(0xCF0F, a.values, b.values);
+      case 180:
+        return _mm512_mask_blend_ps(0xCF30, a.values, b.values);
+      case 181:
+        return _mm512_mask_blend_ps(0xCF33, a.values, b.values);
+      case 182:
+        return _mm512_mask_blend_ps(0xCF3C, a.values, b.values);
+      case 183:
+        return _mm512_mask_blend_ps(0xCF3F, a.values, b.values);
+      case 184:
+        return _mm512_mask_blend_ps(0xCFC0, a.values, b.values);
+      case 185:
+        return _mm512_mask_blend_ps(0xCFC3, a.values, b.values);
+      case 186:
+        return _mm512_mask_blend_ps(0xCFCC, a.values, b.values);
+      case 187:
+        return _mm512_mask_blend_ps(0xCFCF, a.values, b.values);
+      case 188:
+        return _mm512_mask_blend_ps(0xCFF0, a.values, b.values);
+      case 189:
+        return _mm512_mask_blend_ps(0xCFF3, a.values, b.values);
+      case 190:
+        return _mm512_mask_blend_ps(0xCFFC, a.values, b.values);
+      case 191:
+        return _mm512_mask_blend_ps(0xCFFF, a.values, b.values);
+      case 192:
+        return _mm512_mask_blend_ps(0xF000, a.values, b.values);
+      case 193:
+        return _mm512_mask_blend_ps(0xF003, a.values, b.values);
+      case 194:
+        return _mm512_mask_blend_ps(0xF00C, a.values, b.values);
+      case 195:
+        return _mm512_mask_blend_ps(0xF00F, a.values, b.values);
+      case 196:
+        return _mm512_mask_blend_ps(0xF030, a.values, b.values);
+      case 197:
+        return _mm512_mask_blend_ps(0xF033, a.values, b.values);
+      case 198:
+        return _mm512_mask_blend_ps(0xF03C, a.values, b.values);
+      case 199:
+        return _mm512_mask_blend_ps(0xF03F, a.values, b.values);
+      case 200:
+        return _mm512_mask_blend_ps(0XF0C0, a.values, b.values);
+      case 201:
+        return _mm512_mask_blend_ps(0xF0C3, a.values, b.values);
+      case 202:
+        return _mm512_mask_blend_ps(0xF0CC, a.values, b.values);
+      case 203:
+        return _mm512_mask_blend_ps(0xF0CF, a.values, b.values);
+      case 204:
+        return _mm512_mask_blend_ps(0xF0F0, a.values, b.values);
+      case 205:
+        return _mm512_mask_blend_ps(0xF0F3, a.values, b.values);
+      case 206:
+        return _mm512_mask_blend_ps(0xF0FC, a.values, b.values);
+      case 207:
+        return _mm512_mask_blend_ps(0xF0FF, a.values, b.values);
+      case 208:
+        return _mm512_mask_blend_ps(0XF300, a.values, b.values);
+      case 209:
+        return _mm512_mask_blend_ps(0xF303, a.values, b.values);
+      case 210:
+        return _mm512_mask_blend_ps(0xF30C, a.values, b.values);
+      case 211:
+        return _mm512_mask_blend_ps(0xF30F, a.values, b.values);
+      case 212:
+        return _mm512_mask_blend_ps(0xF330, a.values, b.values);
+      case 213:
+        return _mm512_mask_blend_ps(0xF333, a.values, b.values);
+      case 214:
+        return _mm512_mask_blend_ps(0XF33C, a.values, b.values);
+      case 215:
+        return _mm512_mask_blend_ps(0xF33F, a.values, b.values);
+      case 216:
+        return _mm512_mask_blend_ps(0xF3C0, a.values, b.values);
+      case 217:
+        return _mm512_mask_blend_ps(0xF3C3, a.values, b.values);
+      case 218:
+        return _mm512_mask_blend_ps(0xF3CC, a.values, b.values);
+      case 219:
+        return _mm512_mask_blend_ps(0xF3CF, a.values, b.values);
+      case 220:
+        return _mm512_mask_blend_ps(0xF3F0, a.values, b.values);
+      case 221:
+        return _mm512_mask_blend_ps(0xF3F3, a.values, b.values);
+      case 222:
+        return _mm512_mask_blend_ps(0xF3FC, a.values, b.values);
+      case 223:
+        return _mm512_mask_blend_ps(0XF3FF, a.values, b.values);
+      case 224:
+        return _mm512_mask_blend_ps(0xFC00, a.values, b.values);
+      case 225:
+        return _mm512_mask_blend_ps(0xFC03, a.values, b.values);
+      case 226:
+        return _mm512_mask_blend_ps(0xFC0C, a.values, b.values);
+      case 227:
+        return _mm512_mask_blend_ps(0xFC0F, a.values, b.values);
+      case 228:
+        return _mm512_mask_blend_ps(0xFC30, a.values, b.values);
+      case 229:
+        return _mm512_mask_blend_ps(0xFC33, a.values, b.values);
+      case 230:
+        return _mm512_mask_blend_ps(0xFC3C, a.values, b.values);
+      case 231:
+        return _mm512_mask_blend_ps(0xFC3F, a.values, b.values);
+      case 232:
+        return _mm512_mask_blend_ps(0xFCC0, a.values, b.values);
+      case 233:
+        return _mm512_mask_blend_ps(0xFCC3, a.values, b.values);
+      case 234:
+        return _mm512_mask_blend_ps(0xFCCC, a.values, b.values);
+      case 235:
+        return _mm512_mask_blend_ps(0xFCCF, a.values, b.values);
+      case 236:
+        return _mm512_mask_blend_ps(0xFCF0, a.values, b.values);
+      case 237:
+        return _mm512_mask_blend_ps(0xFCF3, a.values, b.values);
+      case 238:
+        return _mm512_mask_blend_ps(0xFCFC, a.values, b.values);
+      case 239:
+        return _mm512_mask_blend_ps(0xFCFF, a.values, b.values);
+      case 240:
+        return _mm512_mask_blend_ps(0xFF00, a.values, b.values);
+      case 241:
+        return _mm512_mask_blend_ps(0xFF03, a.values, b.values);
+      case 242:
+        return _mm512_mask_blend_ps(0xFF0C, a.values, b.values);
+      case 243:
+        return _mm512_mask_blend_ps(0xFF0F, a.values, b.values);
+      case 244:
+        return _mm512_mask_blend_ps(0xFF30, a.values, b.values);
+      case 245:
+        return _mm512_mask_blend_ps(0xFF33, a.values, b.values);
+      case 246:
+        return _mm512_mask_blend_ps(0xFF3C, a.values, b.values);
+      case 247:
+        return _mm512_mask_blend_ps(0xFF3F, a.values, b.values);
+      case 248:
+        return _mm512_mask_blend_ps(0xFFC0, a.values, b.values);
+      case 249:
+        return _mm512_mask_blend_ps(0xFFC3, a.values, b.values);
+      case 250:
+        return _mm512_mask_blend_ps(0xFFCC, a.values, b.values);
+      case 251:
+        return _mm512_mask_blend_ps(0xFFCF, a.values, b.values);
+      case 252:
+        return _mm512_mask_blend_ps(0xFFF0, a.values, b.values);
+      case 253:
+        return _mm512_mask_blend_ps(0xFFF3, a.values, b.values);
+      case 254:
+        return _mm512_mask_blend_ps(0xFFFC, a.values, b.values);
+      default: break;
+    }
+    return b;
+  }
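+  // Each bit i of the 8-bit complex mask above covers one c10::complex<float>, i.e.
+  // two consecutive float lanes, so it expands to bits 2i and 2i+1 of the __mmask16:
+  // for example mask 0b0101 (case 5) selects complex elements 0 and 2 from b and
+  // becomes the float-lane mask 0x33.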
+  static Vectorized<c10::complex<float>> blendv(const Vectorized<c10::complex<float>>& a,
+                                                const Vectorized<c10::complex<float>>& b,
+                                                const Vectorized<c10::complex<float>>& mask) {
+    // convert c10::complex index mask to V index mask: xy -> xxyy
+    auto mask_ = _mm512_unpacklo_ps(mask.values, mask.values);
+    auto all_ones = _mm512_set1_epi32(0xFFFFFFFF);
+    auto mmask = _mm512_cmp_epi32_mask(_mm512_castps_si512(mask_), all_ones, _MM_CMPINT_EQ);
+    return _mm512_mask_blend_ps(mmask, a.values, b.values);
+  }
+  template <typename step_t>
+  static Vectorized<c10::complex<float>> arange(c10::complex<float> base = 0.,
+                                                step_t step = static_cast<step_t>(1)) {
+    return Vectorized<c10::complex<float>>(base,
+                                           base + step,
+                                           base + c10::complex<float>(2)*step,
+                                           base + c10::complex<float>(3)*step,
+                                           base + c10::complex<float>(4)*step,
+                                           base + c10::complex<float>(5)*step,
+                                           base + c10::complex<float>(6)*step,
+                                           base + c10::complex<float>(7)*step);
+  }
+  static Vectorized<c10::complex<float>> set(const Vectorized<c10::complex<float>>& a,
+                                             const Vectorized<c10::complex<float>>& b,
+                                             int64_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+      case 4:
+        return blend<15>(a, b);
+      case 5:
+        return blend<31>(a, b);
+      case 6:
+        return blend<63>(a, b);
+      case 7:
+        return blend<127>(a, b);
+    }
+    return b;
+  }
+  static Vectorized<c10::complex<float>> loadu(const void* ptr, int64_t count = size()) {
+    if (count == size())
+      return _mm512_loadu_ps(reinterpret_cast<const float*>(ptr));
+
+    __at_align__ float tmp_values[2*size()];
+    // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
+    // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
+    // instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(2*size())) {
+      tmp_values[i] = 0.0;
+    }
+    std::memcpy(
+        tmp_values,
+        reinterpret_cast<const float*>(ptr),
+        count * sizeof(c10::complex<float>));
+    return _mm512_load_ps(tmp_values);
+  }
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      _mm512_storeu_ps(reinterpret_cast<float*>(ptr), values);
+    } else if (count > 0) {
+      float tmp_values[2*size()];
+      _mm512_storeu_ps(reinterpret_cast<float*>(tmp_values), values);
+      std::memcpy(ptr, tmp_values, count * sizeof(c10::complex<float>));
+    }
+  }
+  // AVX512 doesn't have horizontal add & horizontal sub instructions.
+  // TODO: hadd_pd() & hsub_pd() may have scope for improvement.
+  static inline __m512 hadd_ps(__m512 a, __m512 b) {
+  __m512i idx1 = _mm512_set_epi32(30, 14, 28, 12, 26, 10, 24, 8, 22, 6, 20, 4, 18, 2, 16, 0);
+  __m512i idx2 = _mm512_set_epi32(31, 15, 29, 13, 27, 11, 25, 9, 23, 7, 21, 5, 19, 3, 17, 1);
+  return _mm512_add_ps(_mm512_mask_permutex2var_ps(a, 0xffff, idx1, b),
+                       _mm512_mask_permutex2var_ps(a, 0xffff, idx2, b));
+  }
+  static inline __m512 hsub_ps(__m512 a, __m512 b) {
+  __m512i idx1 = _mm512_set_epi32(30, 14, 28, 12, 26, 10, 24, 8, 22, 6, 20, 4, 18, 2, 16, 0);
+  __m512i idx2 = _mm512_set_epi32(31, 15, 29, 13, 27, 11, 25, 9, 23, 7, 21, 5, 19, 3, 17, 1);
+  return _mm512_sub_ps(_mm512_mask_permutex2var_ps(a, 0xffff, idx1, b),
+                       _mm512_mask_permutex2var_ps(a, 0xffff, idx2, b));
+  }
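+  // idx1 gathers the even float lanes of a and b interleaved (a0,b0,a2,b2,...) and
+  // idx2 the odd lanes (a1,b1,a3,b3,...), so adding/subtracting the two permutations
+  // yields the pairwise result (a0+a1, b0+b1, a2+a3, b2+b3, ...).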
+  const c10::complex<float>& operator[](int idx) const  = delete;
+  c10::complex<float>& operator[](int idx) = delete;
+  Vectorized<c10::complex<float>> map(c10::complex<float> (*const f)(const c10::complex<float> &)) const {
+    __at_align__ c10::complex<float> tmp[size()];
+    store(tmp);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = f(tmp[i]);
+    }
+    return loadu(tmp);
+  }
+  __m512 abs_2_() const {
+    auto val_2 = _mm512_mul_ps(values, values);     // a*a     b*b
+    auto ret = hadd_ps(val_2, val_2);               // a*a+b*b a*a+b*b
+    return ret;
+  }
+  __m512 abs_() const {
+    auto real = _mm512_moveldup_ps(values);    // real real
+    auto imag = _mm512_movehdup_ps(values);    // imag imag
+    return Sleef_hypotf16_u05(real, imag);     // abs  abs
+  }
+  Vectorized> abs() const {
+    const __m512 real_mask = _mm512_castsi512_ps(_mm512_setr_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000,
+                                                                   0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000,
+                                                                   0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000,
+                                                                   0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000));
+    return _mm512_and_ps(abs_(), real_mask);        // abs     0
+  }
+  __m512 angle_() const {
+    //angle = atan2(b/a)
+    auto b_a = _mm512_permute_ps(values, 0xB1);     // b        a
+    return Sleef_atan2f16_u10(values, b_a);          // 90-angle angle
+  }
+  Vectorized> angle() const {
+    const __m512 real_mask = _mm512_castsi512_ps(_mm512_setr_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000,
+                                                                   0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000,
+                                                                   0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000,
+                                                                   0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000));
+    auto angle = _mm512_permute_ps(angle_(), 0xB1); // angle    90-angle
+    return _mm512_and_ps(angle, real_mask);         // angle    0
+  }
+  Vectorized> sgn() const {
+    auto abs = abs_();
+    auto zero = _mm512_setzero_ps();
+    auto mask = _mm512_cmp_ps_mask(abs, zero, _CMP_EQ_OQ);
+    auto div = values / abs;
+    return _mm512_mask_blend_ps(mask, div, zero);
+  }
+  __m512 real_() const {
+    const __m512 real_mask = _mm512_castsi512_ps(_mm512_setr_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000,
+                                                                   0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000,
+                                                                   0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000,
+                                                                   0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000));
+    return _mm512_and_ps(values, real_mask);
+  }
+  Vectorized> real() const {
+    return real_();
+  }
+  __m512 imag_() const {
+    const __m512 imag_mask = _mm512_castsi512_ps(_mm512_setr_epi32(0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF,
+                                                                   0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF,
+                                                                   0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF,
+                                                                   0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF));
+    return _mm512_and_ps(values, imag_mask);
+  }
+  Vectorized> imag() const {
+    return _mm512_permute_ps(imag_(), 0xB1);        //b        a
+  }
+  __m512 conj_() const {
+    const __m512 sign_mask = _mm512_setr_ps(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0,
+                                            0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0);
+    return _mm512_xor_ps(values, sign_mask);        // a       -b
+  }
+  Vectorized> conj() const {
+    return conj_();
+  }
+  Vectorized> log() const {
+    // Most trigonomic ops use the log() op to improve complex number performance.
+    return map(std::log);
+  }
+  Vectorized> log2() const {
+    const __m512 log2_ = _mm512_set1_ps(std::log(2));
+    return _mm512_div_ps(log(), log2_);
+  }
+  Vectorized> log10() const {
+    const __m512 log10_ = _mm512_set1_ps(std::log(10));
+    return _mm512_div_ps(log(), log10_);
+  }
+  Vectorized> log1p() const {
+    return map(std::log1p);
+  }
+  Vectorized> asin() const {
+    // asin(x)
+    // = -i*ln(iz + sqrt(1 -z^2))
+    // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi)))
+    // = -i*ln((-b + ai) + sqrt(1 - (a**2 - b**2) - 2*abi))
+    const __m512 one = _mm512_set1_ps(1);
+
+    auto conj = conj_();
+    auto b_a = _mm512_permute_ps(conj, 0xB1);                         //-b        a
+    auto ab = _mm512_mul_ps(conj, b_a);                               //-ab       -ab
+    auto im = _mm512_add_ps(ab, ab);                                  //-2ab      -2ab
+
+    auto val_2 = _mm512_mul_ps(values, values);                       // a*a      b*b
+    auto re = hsub_ps(val_2, _mm512_permute_ps(val_2, 0xB1));  // a*a-b*b  b*b-a*a
+    re = _mm512_sub_ps(one, re);
+
+    auto root = Vectorized(_mm512_mask_blend_ps(0xAAAA, re, im)).sqrt();         //sqrt(re + i*im)
+    auto ln = Vectorized(_mm512_add_ps(b_a, root)).log();                 //ln(iz + sqrt())
+    return Vectorized(_mm512_permute_ps(ln.values, 0xB1)).conj();         //-i*ln()
+  }
+  Vectorized> acos() const {
+    return map(std::acos);
+  }
+  Vectorized> atan() const;
+  Vectorized> atanh() const {
+    return map(std::atanh);
+  }
+  Vectorized> exp() const {
+    //exp(a + bi)
+    // = exp(a)*(cos(b) + sin(b)i)
+    auto exp = Sleef_expf16_u10(values);                               //exp(a)           exp(b)
+    exp = _mm512_mask_blend_ps(0xAAAA, exp, _mm512_permute_ps(exp, 0xB1));   //exp(a)           exp(a)
+
+    auto sin_cos = Sleef_sincosf16_u10(values);                        //[sin(a), cos(a)] [sin(b), cos(b)]
+    auto cos_sin = _mm512_mask_blend_ps(0xAAAA, _mm512_permute_ps(sin_cos.y, 0xB1),
+                                   sin_cos.x);                  //cos(b)           sin(b)
+    return _mm512_mul_ps(exp, cos_sin);
+  }
+  Vectorized> exp2() const {
+    // Use identity 2**x = exp(log(2) * x)
+    const __m512 ln_2 = _mm512_set1_ps(c10::ln_2<float>);
+    Vectorized<c10::complex<float>> scaled_values = _mm512_mul_ps(values, ln_2);
+    return scaled_values.exp();
+  }
+  Vectorized> expm1() const {
+    return map(std::expm1);
+  }
+  Vectorized> sin() const {
+    return map(std::sin);
+  }
+  Vectorized> sinh() const {
+    return map(std::sinh);
+  }
+  Vectorized> cos() const {
+    return map(std::cos);
+  }
+  Vectorized> cosh() const {
+    return map(std::cosh);
+  }
+  Vectorized> ceil() const {
+    return _mm512_ceil_ps(values);
+  }
+  Vectorized> floor() const {
+    return _mm512_floor_ps(values);
+  }
+  Vectorized> neg() const {
+    auto zero = _mm512_setzero_ps();
+    return _mm512_sub_ps(zero, values);
+  }
+  Vectorized> round() const {
+    return _mm512_roundscale_ps(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+  }
+  Vectorized> tan() const {
+    return map(std::tan);
+  }
+  Vectorized> tanh() const {
+    return map(std::tanh);
+  }
+  Vectorized> trunc() const {
+    return _mm512_roundscale_ps(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
+  }
+  Vectorized> sqrt() const {
+    return map(std::sqrt);
+  }
+  Vectorized> reciprocal() const;
+  Vectorized> rsqrt() const {
+    return sqrt().reciprocal();
+  }
+  Vectorized> pow(const Vectorized> &exp) const {
+    __at_align__ c10::complex x_tmp[size()];
+    __at_align__ c10::complex y_tmp[size()];
+    store(x_tmp);
+    exp.store(y_tmp);
+    for (const auto i : c10::irange(size())) {
+      x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]);
+    }
+    return loadu(x_tmp);
+  }
+  // Comparison using the _CMP_**_OQ predicate.
+  //   `O`: get false if an operand is NaN
+  //   `Q`: do not raise if an operand is NaN
+  Vectorized> operator==(const Vectorized>& other) const {
+    auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_EQ_OQ);
+    return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vector, mask, 0xFFFFFFFF));
+  }
+  Vectorized> operator!=(const Vectorized>& other) const {
+    auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_NEQ_UQ);
+    return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vector, mask, 0xFFFFFFFF));
+  }
+  Vectorized> operator<(const Vectorized>& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+  Vectorized> operator<=(const Vectorized>& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+  Vectorized> operator>(const Vectorized>& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+  Vectorized> operator>=(const Vectorized>& other) const {
+    TORCH_CHECK(false, "not supported for complex numbers");
+  }
+
+  Vectorized> eq(const Vectorized>& other) const;
+  Vectorized> ne(const Vectorized>& other) const;
+};
+
+template <> Vectorized<c10::complex<float>> inline operator+(const Vectorized<c10::complex<float>> &a,
+                                                             const Vectorized<c10::complex<float>> &b) {
+  return _mm512_add_ps(a, b);
+}
+
+template <> Vectorized<c10::complex<float>> inline operator-(const Vectorized<c10::complex<float>> &a,
+                                                             const Vectorized<c10::complex<float>> &b) {
+  return _mm512_sub_ps(a, b);
+}
+
+template <> Vectorized<c10::complex<float>> inline operator*(const Vectorized<c10::complex<float>> &a,
+                                                             const Vectorized<c10::complex<float>> &b) {
+  //(a + bi)  * (c + di) = (ac - bd) + (ad + bc)i
+  const __m512 sign_mask = _mm512_setr_ps(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0,
+                                          0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0);
+  auto ac_bd = _mm512_mul_ps(a, b);         //ac       bd
+
+  auto d_c = _mm512_permute_ps(b, 0xB1);    //d        c
+  d_c = _mm512_xor_ps(sign_mask, d_c);      //d       -c
+  auto ad_bc = _mm512_mul_ps(a, d_c);       //ad      -bc
+
+  auto ret = Vectorized<c10::complex<float>>::hsub_ps(ac_bd, ad_bc);  //ac - bd  ad + bc
+  return ret;
+}
+
+template <> Vectorized<c10::complex<float>> inline operator/(const Vectorized<c10::complex<float>> &a,
+                                                             const Vectorized<c10::complex<float>> &b) {
+  //re + im*i = (a + bi)  / (c + di)
+  auto mask = _mm512_set1_ps(-0.f);
+  auto fabs_cd = _mm512_andnot_ps(mask, b);     // |c|    |d|
+  auto fabs_dc = _mm512_permute_ps(fabs_cd, 0xB1);   // |d|    |c|
+  auto scale = _mm512_rcp14_ps(_mm512_max_ps(fabs_cd, fabs_dc));  // 1/sc     1/sc
+  auto a2 = _mm512_mul_ps(a, scale);         // a/sc     b/sc
+  auto b2 = _mm512_mul_ps(b, scale);         // c/sc     d/sc
+  auto acbd2 = _mm512_mul_ps(a2, b2);
+
+  const __m512 sign_mask = _mm512_setr_ps(-0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0,
+                                          -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0);
+  auto dc2 = _mm512_permute_ps(b2, 0xB1);    // d/sc         c/sc
+  dc2 = _mm512_xor_ps(sign_mask, dc2);       // -d/|c,d|        c/sc
+  auto adbc2 = _mm512_mul_ps(a2, dc2);       //-ad/sc^2      bc/sc^2
+  auto res2 = Vectorized<c10::complex<float>>::hadd_ps(acbd2, adbc2);  //(ac+bd)/sc^2  (bc-ad)/sc^2
+
+  // get the denominator
+  auto denom2 = Vectorized<c10::complex<float>>(b2).abs_2_();  // (c^2+d^2)/sc^2   (c^2+d^2)/sc^2
+  res2 = _mm512_div_ps(res2, denom2);
+  return res2;
+}
+
+// reciprocal. Implement this here so we can use multiplication.
+inline Vectorized<c10::complex<float>> Vectorized<c10::complex<float>>::reciprocal() const {
+  //re + im*i = (a + bi)  / (c + di)
+  //re = (ac + bd)/abs_2() = c/abs_2()
+  //im = (bc - ad)/abs_2() = d/abs_2()
+  const __m512 sign_mask = _mm512_setr_ps(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0,
+                                          0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0);
+  auto c_d = _mm512_xor_ps(sign_mask, values);    //c       -d
+  return _mm512_div_ps(c_d, abs_2_());
+}
+
+inline Vectorized<c10::complex<float>> Vectorized<c10::complex<float>>::atan() const {
+  // atan(x) = i/2 * ln((i + z)/(i - z))
+  const __m512 i = _mm512_setr_ps(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
+                                  0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
+  const Vectorized i_half = _mm512_setr_ps(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
+                                          0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5);
+
+  auto sum = Vectorized(_mm512_add_ps(i, values));                      // a        1+b
+  auto sub = Vectorized(_mm512_sub_ps(i, values));                      // -a       1-b
+  auto ln = (sum/sub).log();                                        // ln((i + z)/(i - z))
+  return i_half*ln;                                                 // i/2*ln()
+}
+
+template <>
+Vectorized<c10::complex<float>> inline maximum(const Vectorized<c10::complex<float>>& a,
+                                               const Vectorized<c10::complex<float>>& b) {
+  auto zero_vector = _mm512_set1_epi32(0);
+  auto abs_a = a.abs_2_();
+  auto abs_b = b.abs_2_();
+  auto mask = _mm512_cmp_ps_mask(abs_a, abs_b, _CMP_LT_OQ);
+  auto max = _mm512_mask_blend_ps(mask, a, b);
+  // Exploit the fact that all-ones is a NaN.
+  auto isnan_mask = _mm512_cmp_ps_mask(abs_a, abs_b, _CMP_UNORD_Q);
+  auto isnan = _mm512_mask_set1_epi32(zero_vector, isnan_mask, 0xFFFFFFFF);
+  return _mm512_or_ps(max, _mm512_castsi512_ps(isnan));
+}
+
+template <>
+Vectorized<c10::complex<float>> inline minimum(const Vectorized<c10::complex<float>>& a,
+                                               const Vectorized<c10::complex<float>>& b) {
+  auto zero_vector = _mm512_set1_epi32(0);
+  auto abs_a = a.abs_2_();
+  auto abs_b = b.abs_2_();
+  auto mask = _mm512_cmp_ps_mask(abs_a, abs_b, _CMP_GT_OQ);
+  auto min = _mm512_mask_blend_ps(mask, a, b);
+  // Exploit the fact that all-ones is a NaN.
+  auto isnan_mask = _mm512_cmp_ps_mask(abs_a, abs_b, _CMP_UNORD_Q);
+  auto isnan = _mm512_mask_set1_epi32(zero_vector, isnan_mask, 0xFFFFFFFF);
+  return _mm512_or_ps(min, _mm512_castsi512_ps(isnan));
+}
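+// For complex vectors, maximum()/minimum() above order elements by squared magnitude
+// (abs_2_()) rather than elementwise, and the final OR with the unordered-compare
+// mask turns any lane whose magnitude comparison involved a NaN into a NaN payload.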
+
+template <>
+Vectorized<c10::complex<float>> inline operator&(const Vectorized<c10::complex<float>>& a,
+                                                 const Vectorized<c10::complex<float>>& b) {
+  return _mm512_and_ps(a, b);
+}
+
+template <>
+Vectorized<c10::complex<float>> inline operator|(const Vectorized<c10::complex<float>>& a,
+                                                 const Vectorized<c10::complex<float>>& b) {
+  return _mm512_or_ps(a, b);
+}
+
+template <>
+Vectorized<c10::complex<float>> inline operator^(const Vectorized<c10::complex<float>>& a,
+                                                 const Vectorized<c10::complex<float>>& b) {
+  return _mm512_xor_ps(a, b);
+}
+
+inline Vectorized<c10::complex<float>> Vectorized<c10::complex<float>>::eq(
+    const Vectorized<c10::complex<float>>& other) const {
+  auto eq = (*this == other);  // compares real and imag individually
+  // If both real numbers and imag numbers are equal, then the complex numbers are equal
+  return (eq.real() & eq.imag()) & Vectorized<c10::complex<float>>(_mm512_set1_ps(1.0f));
+}
+
+inline Vectorized<c10::complex<float>> Vectorized<c10::complex<float>>::ne(
+    const Vectorized<c10::complex<float>>& other) const {
+  auto ne = (*this != other);  // compares real and imag individually
+  // If either real numbers or imag numbers are not equal, then the complex numbers are not equal
+  return (ne.real() | ne.imag()) & Vectorized<c10::complex<float>>(_mm512_set1_ps(1.0f));
+}
+
+#endif
+
+}}}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_double.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_double.h
new file mode 100644
index 0000000000000000000000000000000000000000..fb12593fbc970dc2ebf69380bfbd35e3f90fb590
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_double.h
@@ -0,0 +1,467 @@
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/irange.h>
+#if (defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER)
+#include <sleef.h>
+#endif
+
+namespace at {
+namespace vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
+
+template <> class Vectorized<double> {
+private:
+  static constexpr __m512i zero_vector {0, 0, 0, 0, 0, 0, 0, 0};
+public:
+  // values needs to be public for compilation with clang
+  // as vec512.h uses it
+  __m512d values;
+  using value_type = double;
+  using size_type = int;
+  static constexpr size_type size() {
+    return 8;
+  }
+  Vectorized() {}
+  Vectorized(__m512d v) : values(v) {}
+  Vectorized(double val) {
+    values = _mm512_set1_pd(val);
+  }
+  Vectorized(double val1, double val2, double val3, double val4,
+         double val5, double val6, double val7, double val8) {
+    values = _mm512_setr_pd(val1, val2, val3, val4, val5, val6, val7, val8);
+  }
+  operator __m512d() const {
+    return values;
+  }
+  template <int64_t mask>
+  static Vectorized<double> blend(const Vectorized<double>& a, const Vectorized<double>& b) {
+    return _mm512_mask_blend_pd(mask, a.values, b.values);
+  }
+  static Vectorized<double> blendv(const Vectorized<double>& a, const Vectorized<double>& b,
+                                   const Vectorized<double>& mask) {
+    auto all_ones = _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF);
+    auto mmask = _mm512_cmp_epi64_mask(_mm512_castpd_si512(mask.values), all_ones, _MM_CMPINT_EQ);
+    return _mm512_mask_blend_pd(mmask, a.values, b.values);
+  }
+  template <typename step_t>
+  static Vectorized<double> arange(double base = 0., step_t step = static_cast<step_t>(1)) {
+    return Vectorized(base, base + step, base + 2 * step, base + 3 * step,
+                          base + 4 * step, base + 5 * step, base + 6 * step,
+                          base + 7 * step);
+  }
+  static Vectorized set(const Vectorized& a, const Vectorized& b,
+                            int64_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+      case 4:
+        return blend<15>(a, b);
+      case 5:
+        return blend<31>(a, b);
+      case 6:
+        return blend<63>(a, b);
+      case 7:
+        return blend<127>(a, b);
+    }
+    return b;
+  }
+  static Vectorized loadu(const void* ptr, int64_t count = size()) {
+    if (count == size())
+      return _mm512_loadu_pd(reinterpret_cast<const double*>(ptr));
+
+    __mmask8 mask = (1ULL << count) - 1;
+    return _mm512_maskz_loadu_pd(mask, ptr);
+  }
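+  // e.g. count == 3 gives mask = 0b00000111: only the first three doubles are read
+  // and the remaining lanes of the result are zeroed by the maskz load.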
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      _mm512_storeu_pd(reinterpret_cast<double*>(ptr), values);
+    } else if (count > 0) {
+      __mmask8 mask = (1ULL << count) - 1;
+      _mm512_mask_storeu_pd(reinterpret_cast<double*>(ptr), mask, values);
+    }
+  }
+  const double& operator[](int idx) const  = delete;
+  double& operator[](int idx) = delete;
+  int zero_mask() const {
+    // returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit
+    __mmask8 cmp = _mm512_cmp_pd_mask(values, _mm512_set1_pd(0.0), _CMP_EQ_OQ);
+    return static_cast<int32_t>(cmp);
+  }
+  Vectorized isnan() const {
+    auto cmp_mask = _mm512_cmp_pd_mask(values, _mm512_set1_pd(0.0), _CMP_UNORD_Q);
+    return _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vector, cmp_mask,
+                                                      0xFFFFFFFFFFFFFFFF));
+  }
+  bool has_inf_nan() const {
+    __m512d self_sub  = _mm512_sub_pd(values, values);
+    return (_mm512_movepi8_mask(_mm512_castpd_si512(self_sub)) & 0x7777777777777777) != 0;
+  }
+  Vectorized map(double (*const f)(double)) const {
+    __at_align__ double tmp[size()];
+    store(tmp);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = f(tmp[i]);
+    }
+    return loadu(tmp);
+  }
+  Vectorized abs() const {
+    auto mask = _mm512_set1_pd(-0.f);
+    return _mm512_andnot_pd(mask, values);
+  }
+  Vectorized angle() const {
+    const auto zero_vec = _mm512_castsi512_pd(zero_vector);
+    const auto nan_vec = _mm512_set1_pd(NAN);
+    const auto not_nan_mask = _mm512_cmp_pd_mask(values, values, _CMP_EQ_OQ);
+    const auto not_nan = _mm512_mask_set1_epi64(zero_vector, not_nan_mask,
+                                                0xFFFFFFFFFFFFFFFF);
+    const auto nan_mask = _mm512_cmp_pd_mask(_mm512_castsi512_pd(not_nan),
+                                             zero_vec, _CMP_EQ_OQ);
+    const auto pi = _mm512_set1_pd(c10::pi<double>);
+
+    const auto neg_mask = _mm512_cmp_pd_mask(values, zero_vec, _CMP_LT_OQ);
+    auto angle = _mm512_mask_blend_pd(neg_mask, zero_vec, pi);
+    angle = _mm512_mask_blend_pd(nan_mask, angle, nan_vec);
+    return angle;
+  }
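+  // i.e. angle(x) = 0 for x >= 0, pi for x < 0, with NaN inputs routed to nan_vec by
+  // the nan_mask blend above.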
+  Vectorized real() const {
+    return *this;
+  }
+  Vectorized imag() const {
+    return _mm512_set1_pd(0);
+  }
+  Vectorized conj() const {
+    return *this;
+  }
+  Vectorized acos() const {
+    return Vectorized(Sleef_acosd8_u10(values));
+  }
+  Vectorized acosh() const {
+    return Vectorized(Sleef_acoshd8_u10(values));
+  }
+  Vectorized asin() const {
+    return Vectorized(Sleef_asind8_u10(values));
+  }
+  Vectorized atan() const {
+    return Vectorized(Sleef_atand8_u10(values));
+  }
+  Vectorized atanh() const {
+    return Vectorized(Sleef_atanhd8_u10(values));
+  }
+  Vectorized atan2(const Vectorized &b) const {
+    return Vectorized(Sleef_atan2d8_u10(values, b));
+  }
+  Vectorized copysign(const Vectorized &sign) const {
+    return Vectorized(Sleef_copysignd8(values, sign));
+  }
+  Vectorized erf() const {
+    return Vectorized(Sleef_erfd8_u10(values));
+  }
+  Vectorized erfc() const {
+    return Vectorized(Sleef_erfcd8_u15(values));
+  }
+  Vectorized erfinv() const {
+    return map(calc_erfinv);
+  }
+  Vectorized exp() const {
+    return Vectorized(Sleef_expd8_u10(values));
+  }
+  Vectorized exp2() const {
+    return Vectorized(Sleef_exp2d8_u10(values));
+  }
+  Vectorized expm1() const {
+    return Vectorized(Sleef_expm1d8_u10(values));
+  }
+  Vectorized exp_u20() const {
+    return exp();
+  }
+  Vectorized fmod(const Vectorized& q) const {
+    return Vectorized(Sleef_fmodd8(values, q));
+  }
+  Vectorized hypot(const Vectorized &b) const {
+    return Vectorized(Sleef_hypotd8_u05(values, b));
+  }
+  Vectorized i0() const {
+    return map(calc_i0);
+  }
+  Vectorized i0e() const {
+    return map(calc_i0e);
+  }
+  Vectorized digamma() const {
+    return map(calc_digamma);
+  }
+  Vectorized igamma(const Vectorized &x) const {
+    __at_align__ double tmp[size()];
+    __at_align__ double tmp_x[size()];
+    store(tmp);
+    x.store(tmp_x);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
+    }
+    return loadu(tmp);
+  }
+  Vectorized igammac(const Vectorized &x) const {
+    __at_align__ double tmp[size()];
+    __at_align__ double tmp_x[size()];
+    store(tmp);
+    x.store(tmp_x);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
+    }
+    return loadu(tmp);
+  }
+  Vectorized log() const {
+    return Vectorized(Sleef_logd8_u10(values));
+  }
+  Vectorized log2() const {
+    return Vectorized(Sleef_log2d8_u10(values));
+  }
+  Vectorized log10() const {
+    return Vectorized(Sleef_log10d8_u10(values));
+  }
+  Vectorized log1p() const {
+    return Vectorized(Sleef_log1pd8_u10(values));
+  }
+  Vectorized sin() const {
+    return Vectorized(Sleef_sind8_u10(values));
+  }
+  Vectorized sinh() const {
+    return Vectorized(Sleef_sinhd8_u10(values));
+  }
+  Vectorized cos() const {
+    return Vectorized(Sleef_cosd8_u10(values));
+  }
+  Vectorized cosh() const {
+    return Vectorized(Sleef_coshd8_u10(values));
+  }
+  Vectorized ceil() const {
+    return _mm512_ceil_pd(values);
+  }
+  Vectorized floor() const {
+    return _mm512_floor_pd(values);
+  }
+  Vectorized frac() const;
+  Vectorized neg() const {
+    return _mm512_xor_pd(_mm512_set1_pd(-0.), values);
+  }
+  Vectorized nextafter(const Vectorized &b) const {
+    return Vectorized(Sleef_nextafterd8(values, b));
+  }
+  Vectorized round() const {
+    return _mm512_roundscale_pd(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+  }
+  Vectorized tan() const {
+    return Vectorized(Sleef_tand8_u10(values));
+  }
+  Vectorized tanh() const {
+    return Vectorized(Sleef_tanhd8_u10(values));
+  }
+  Vectorized trunc() const {
+    return _mm512_roundscale_pd(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
+  }
+  Vectorized lgamma() const {
+    return Vectorized(Sleef_lgammad8_u10(values));
+  }
+  Vectorized sqrt() const {
+    return _mm512_sqrt_pd(values);
+  }
+  Vectorized reciprocal() const {
+    return _mm512_div_pd(_mm512_set1_pd(1), values);
+  }
+  Vectorized rsqrt() const {
+    return _mm512_div_pd(_mm512_set1_pd(1), _mm512_sqrt_pd(values));
+  }
+  Vectorized pow(const Vectorized &b) const {
+    return Vectorized(Sleef_powd8_u10(values, b));
+  }
+  // Comparison using the _CMP_**_OQ predicate.
+  //   `O`: get false if an operand is NaN
+  //   `Q`: do not raise if an operand is NaN
+  Vectorized operator==(const Vectorized& other) const {
+    auto cmp_mask = _mm512_cmp_pd_mask(values, other.values, _CMP_EQ_OQ);
+    return _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vector, cmp_mask,
+                                                      0xFFFFFFFFFFFFFFFF));
+  }
+
+  Vectorized operator!=(const Vectorized& other) const {
+    auto cmp_mask = _mm512_cmp_pd_mask(values, other.values, _CMP_NEQ_UQ);
+    return _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vector, cmp_mask,
+                                                      0xFFFFFFFFFFFFFFFF));
+  }
+
+  Vectorized operator<(const Vectorized& other) const {
+    auto cmp_mask = _mm512_cmp_pd_mask(values, other.values, _CMP_LT_OQ);
+    return _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vector, cmp_mask,
+                                                      0xFFFFFFFFFFFFFFFF));
+  }
+
+  Vectorized operator<=(const Vectorized& other) const {
+    auto cmp_mask = _mm512_cmp_pd_mask(values, other.values, _CMP_LE_OQ);
+    return _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vector, cmp_mask,
+                                                      0xFFFFFFFFFFFFFFFF));
+  }
+
+  Vectorized operator>(const Vectorized& other) const {
+    auto cmp_mask = _mm512_cmp_pd_mask(values, other.values, _CMP_GT_OQ);
+    return _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vector, cmp_mask,
+                                                      0xFFFFFFFFFFFFFFFF));
+  }
+
+  Vectorized operator>=(const Vectorized& other) const {
+    auto cmp_mask = _mm512_cmp_pd_mask(values, other.values, _CMP_GE_OQ);
+    return _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vector, cmp_mask,
+                                                      0xFFFFFFFFFFFFFFFF));
+  }
+
+  Vectorized eq(const Vectorized& other) const;
+  Vectorized ne(const Vectorized& other) const;
+  Vectorized lt(const Vectorized& other) const;
+  Vectorized le(const Vectorized& other) const;
+  Vectorized gt(const Vectorized& other) const;
+  Vectorized ge(const Vectorized& other) const;
+};
+
+template <>
+Vectorized<double> inline operator+(const Vectorized<double>& a, const Vectorized<double>& b) {
+  return _mm512_add_pd(a, b);
+}
+
+template <>
+Vectorized<double> inline operator-(const Vectorized<double>& a, const Vectorized<double>& b) {
+  return _mm512_sub_pd(a, b);
+}
+
+template <>
+Vectorized<double> inline operator*(const Vectorized<double>& a, const Vectorized<double>& b) {
+  return _mm512_mul_pd(a, b);
+}
+
+template <>
+Vectorized<double> inline operator/(const Vectorized<double>& a, const Vectorized<double>& b) {
+  return _mm512_div_pd(a, b);
+}
+
+// frac. Implement this here so we can use subtraction.
+inline Vectorized<double> Vectorized<double>::frac() const {
+  return *this - this->trunc();
+}
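+// A scalar sketch of frac(), for exposition only (not part of this header's API):
+//   double frac_ref(double x) { return x - std::trunc(x); }
+// Truncation (round toward zero) rather than floor keeps the fractional
+// part's sign equal to the sign of the input.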
+
+// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if
+// either input is a NaN.
+template <>
+Vectorized<double> inline maximum(const Vectorized<double>& a, const Vectorized<double>& b) {
+  auto zero_vec = _mm512_set1_epi64(0);
+  Vectorized<double> max = _mm512_max_pd(a, b);
+  auto isnan_mask = _mm512_cmp_pd_mask(a, b, _CMP_UNORD_Q);
+  auto isnan = _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vec, isnan_mask,
+                                                          0xFFFFFFFFFFFFFFFF));
+  // Exploit the fact that all-ones is a NaN.
+  return _mm512_or_pd(max, isnan);
+}
+
+// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if
+// either input is a NaN.
+template <>
+Vectorized<double> inline minimum(const Vectorized<double>& a, const Vectorized<double>& b) {
+  auto zero_vec = _mm512_set1_epi64(0);
+  Vectorized<double> min = _mm512_min_pd(a, b);
+  auto isnan_mask = _mm512_cmp_pd_mask(a, b, _CMP_UNORD_Q);
+  auto isnan = _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vec, isnan_mask,
+                                                          0xFFFFFFFFFFFFFFFF));
+  // Exploit the fact that all-ones is a NaN.
+  return _mm512_or_pd(min, isnan);
+}
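+// How the NaN propagation above works: _CMP_UNORD_Q sets a lane's mask bit
+// when either operand is NaN, that mask is expanded to all-ones in those
+// lanes, and all-ones is itself a (negative, quiet) NaN bit pattern, so
+// OR-ing it into the max/min result forces those lanes to NaN. Roughly the
+// scalar equivalent, for exposition only:
+//   double maximum_ref(double a, double b) {
+//     return (std::isnan(a) || std::isnan(b))
+//         ? std::numeric_limits<double>::quiet_NaN() : std::max(a, b);
+//   }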
+
+template <>
+Vectorized<double> inline clamp(const Vectorized<double>& a, const Vectorized<double>& min, const Vectorized<double>& max) {
+  return _mm512_min_pd(max, _mm512_max_pd(min, a));
+}
+
+template <>
+Vectorized<double> inline clamp_min(const Vectorized<double>& a, const Vectorized<double>& min) {
+  return _mm512_max_pd(min, a);
+}
+
+template <>
+Vectorized<double> inline clamp_max(const Vectorized<double>& a, const Vectorized<double>& max) {
+  return _mm512_min_pd(max, a);
+}
+
+template <>
+Vectorized<double> inline operator&(const Vectorized<double>& a, const Vectorized<double>& b) {
+  return _mm512_and_pd(a, b);
+}
+
+template <>
+Vectorized<double> inline operator|(const Vectorized<double>& a, const Vectorized<double>& b) {
+  return _mm512_or_pd(a, b);
+}
+
+template <>
+Vectorized<double> inline operator^(const Vectorized<double>& a, const Vectorized<double>& b) {
+  return _mm512_xor_pd(a, b);
+}
+
+inline Vectorized<double> Vectorized<double>::eq(const Vectorized<double>& other) const {
+  return (*this == other) & Vectorized<double>(1.0);
+}
+
+inline Vectorized<double> Vectorized<double>::ne(const Vectorized<double>& other) const {
+  return (*this != other) & Vectorized<double>(1.0);
+}
+
+inline Vectorized<double> Vectorized<double>::gt(const Vectorized<double>& other) const {
+  return (*this > other) & Vectorized<double>(1.0);
+}
+
+inline Vectorized<double> Vectorized<double>::ge(const Vectorized<double>& other) const {
+  return (*this >= other) & Vectorized<double>(1.0);
+}
+
+inline Vectorized<double> Vectorized<double>::lt(const Vectorized<double>& other) const {
+  return (*this < other) & Vectorized<double>(1.0);
+}
+
+inline Vectorized<double> Vectorized<double>::le(const Vectorized<double>& other) const {
+  return (*this <= other) & Vectorized<double>(1.0);
+}
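+// Unlike the overloaded comparison operators above, which return all-ones /
+// all-zero bit masks, eq/ne/gt/ge/lt/le AND the mask with 1.0, so each lane
+// holds a numeric 1.0 or 0.0 that can be used directly in arithmetic.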
+
+template <>
+inline void convert(const double* src, double* dst, int64_t n) {
+  int64_t i;
+#pragma unroll
+  for (i = 0; i <= (n - Vectorized<double>::size()); i += Vectorized<double>::size()) {
+    _mm512_storeu_pd(dst + i, _mm512_loadu_pd(src + i));
+  }
+#pragma unroll
+  for (; i < n; i++) {
+    dst[i] = src[i];
+  }
+}
+
+template <>
+Vectorized<double> inline fmadd(const Vectorized<double>& a, const Vectorized<double>& b, const Vectorized<double>& c) {
+  return _mm512_fmadd_pd(a, b, c);
+}
+
+template <>
+Vectorized<double> inline fmsub(const Vectorized<double>& a, const Vectorized<double>& b, const Vectorized<double>& c) {
+  return _mm512_fmsub_pd(a, b, c);
+}
+
+#endif
+
+}}}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_float.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_float.h
new file mode 100644
index 0000000000000000000000000000000000000000..69a429988065b0ba3b27f734561c48445cead041
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_float.h
@@ -0,0 +1,793 @@
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/util/irange.h>
+#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
+#include <sleef.h>
+#endif
+
+namespace at {
+namespace vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
+
+template <> class Vectorized<float> {
+private:
+  static constexpr __m512i zero_vec {0, 0, 0, 0, 0, 0, 0, 0};
+public:
+  __m512 values;
+  using value_type = float;
+  using size_type = int;
+  static constexpr size_type size() {
+    return 16;
+  }
+  Vectorized() {}
+  Vectorized(__m512 v) : values(v) {}
+  Vectorized(float val) {
+    values = _mm512_set1_ps(val);
+  }
+  Vectorized(float val1, float val2, float val3, float val4,
+         float val5, float val6, float val7, float val8,
+         float val9, float val10, float val11, float val12,
+         float val13, float val14, float val15, float val16) {
+    values = _mm512_setr_ps(val1, val2, val3, val4, val5, val6, val7, val8,
+                            val9, val10, val11, val12, val13, val14, val15, val16);
+  }
+  operator __m512() const {
+    return values;
+  }
+  template <int64_t mask>
+  static Vectorized<float> blend(const Vectorized<float>& a, const Vectorized<float>& b) {
+    return _mm512_mask_blend_ps(mask, a.values, b.values);
+  }
+  static Vectorized<float> blendv(const Vectorized<float>& a, const Vectorized<float>& b,
+                              const Vectorized<float>& mask) {
+    auto all_ones = _mm512_set1_epi32(0xFFFFFFFF);
+    auto mmask = _mm512_cmp_epi32_mask(_mm512_castps_si512(mask.values), all_ones, _MM_CMPINT_EQ);
+    return _mm512_mask_blend_ps(mmask, a.values, b.values);
+  }
+  template <typename step_t>
+  static Vectorized<float> arange(float base = 0.f, step_t step = static_cast<step_t>(1)) {
+    return Vectorized<float>(
+      base,            base +     step, base + 2 * step, base + 3 * step,
+      base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step,
+      base + 8 * step, base + 9 * step, base + 10 * step, base + 11 * step,
+      base + 12 * step, base + 13 * step, base + 14 * step, base + 15 * step);
+  }
+  static Vectorized<float> set(const Vectorized<float>& a, const Vectorized<float>& b,
+                           int64_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+      case 4:
+        return blend<15>(a, b);
+      case 5:
+        return blend<31>(a, b);
+      case 6:
+        return blend<63>(a, b);
+      case 7:
+        return blend<127>(a, b);
+      case 8:
+        return blend<255>(a, b);
+      case 9:
+        return blend<511>(a, b);
+      case 10:
+        return blend<1023>(a, b);
+      case 11:
+        return blend<2047>(a, b);
+      case 12:
+        return blend<4095>(a, b);
+      case 13:
+        return blend<8191>(a, b);
+      case 14:
+        return blend<16383>(a, b);
+      case 15:
+        return blend<32767>(a, b);
+    }
+    return b;
+  }
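+  // set() keeps the first `count` lanes of b and the remaining lanes of a;
+  // the runtime `count` is mapped onto the blend() template above through
+  // the switch, with each case supplying the matching lane mask.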
+  static Vectorized<float> loadu(const void* ptr, int64_t count = size()) {
+    if (count == size())
+      return _mm512_loadu_ps(reinterpret_cast<const float*>(ptr));
+
+    __mmask16 mask = (1ULL << count) - 1;
+    return _mm512_maskz_loadu_ps(mask, ptr);
+  }
+  void store(void* ptr, int64_t count = size()) const {
+    if (count == size()) {
+      _mm512_storeu_ps(reinterpret_cast<float*>(ptr), values);
+    } else if (count > 0) {
+      __mmask16 mask = (1ULL << count) - 1;
+      _mm512_mask_storeu_ps(reinterpret_cast<float*>(ptr), mask, values);
+    }
+  }
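+  // For partial loads/stores, (1ULL << count) - 1 builds a __mmask16 with the
+  // low `count` bits set (e.g. count == 3 gives 0b0000000000000111), so only
+  // the first `count` floats are read or written and memory past the end of
+  // the buffer is never touched.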
+  const float& operator[](int idx) const  = delete;
+  float& operator[](int idx) = delete;
+  int zero_mask() const {
+    // returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit
+    __mmask16 cmp = _mm512_cmp_ps_mask(values, _mm512_set1_ps(0.0), _CMP_EQ_OQ);
+    return static_cast(cmp);
+  }
+  Vectorized isnan() const {
+    auto mask =  _mm512_cmp_ps_mask(values, _mm512_set1_ps(0.0), _CMP_UNORD_Q);
+    return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, mask,
+                                                      0xFFFFFFFF));
+  }
+  bool has_inf_nan() const {
+    __m512 self_sub  = _mm512_sub_ps(values, values);
+    return (_mm512_movepi8_mask(_mm512_castps_si512(self_sub)) & 0x7777777777777777) != 0;
+  }
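+  // has_inf_nan(): values - values is +/-0.0 for finite lanes and NaN when a
+  // lane holds an infinity or NaN. A float NaN has an all-ones exponent, so
+  // bit 23 (the top bit of the third byte) is set exactly in the non-finite
+  // lanes; the byte-wise sign-bit mask, with the bit for each lane's top byte
+  // cleared by 0x7777..., is therefore non-zero iff some element is Inf/NaN.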
+  Vectorized<float> map(float (*const f)(float)) const {
+    __at_align__ float tmp[size()];
+    store(tmp);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = f(tmp[i]);
+    }
+    return loadu(tmp);
+  }
+  Vectorized abs() const {
+    auto mask = _mm512_set1_ps(-0.f);
+    return _mm512_andnot_ps(mask, values);
+  }
+  Vectorized angle() const {
+    __m512 zero_vec = _mm512_set1_ps(0.f);
+    const auto nan_vec = _mm512_set1_ps(NAN);
+    const auto not_nan_mask = _mm512_cmp_ps_mask(values, values, _CMP_EQ_OQ);
+    const auto not_nan_vec = _mm512_mask_set1_epi32(_mm512_castps_si512(zero_vec),
+                                                    not_nan_mask, 0xFFFFFFFF);
+    const auto nan_mask = _mm512_cmp_ps_mask(_mm512_castsi512_ps(not_nan_vec),
+                                             zero_vec, _CMP_EQ_OQ);
+    const auto pi = _mm512_set1_ps(c10::pi<float>);
+
+    const auto neg_mask = _mm512_cmp_ps_mask(values, zero_vec, _CMP_LT_OQ);
+    auto angle = _mm512_mask_blend_ps(neg_mask, zero_vec, pi);
+    angle = _mm512_mask_blend_ps(nan_mask, angle, nan_vec);
+    return angle;
+  }
+  Vectorized real() const {
+    return *this;
+  }
+  Vectorized imag() const {
+    return _mm512_set1_ps(0);
+  }
+  Vectorized conj() const {
+    return *this;
+  }
+  Vectorized acos() const {
+    return Vectorized(Sleef_acosf16_u10(values));
+  }
+  Vectorized acosh() const {
+    return Vectorized(Sleef_acoshf16_u10(values));
+  }
+  Vectorized asin() const {
+    return Vectorized(Sleef_asinf16_u10(values));
+  }
+  Vectorized atan() const {
+    return Vectorized(Sleef_atanf16_u10(values));
+  }
+  Vectorized atanh() const {
+    return Vectorized(Sleef_atanhf16_u10(values));
+  }
+  Vectorized atan2(const Vectorized &b) const {
+    return Vectorized(Sleef_atan2f16_u10(values, b));
+  }
+  Vectorized copysign(const Vectorized &sign) const {
+    return Vectorized(Sleef_copysignf16(values, sign));
+  }
+  Vectorized erf() const {
+    // constants
+    const auto neg_zero_vec = _mm512_set1_ps(-0.f);
+    const auto one_vec = _mm512_set1_ps(1.0f);
+    const auto p = _mm512_set1_ps(0.3275911f);
+    const auto p1 = _mm512_set1_ps(0.254829592f);
+    const auto p2 = _mm512_set1_ps(-0.284496736f);
+    const auto p3 = _mm512_set1_ps(1.421413741f);
+    const auto p4 = _mm512_set1_ps(-1.453152027f);
+    const auto p5 = _mm512_set1_ps(1.061405429f);
+    // sign(x)
+    auto sign_mask = _mm512_and_ps(neg_zero_vec, values);
+    auto abs_vec = _mm512_abs_ps(values);
+    // t = 1 / (p * abs(x) + 1)
+    auto tmp0 = _mm512_fmadd_ps(p, abs_vec, one_vec);
+    auto t = _mm512_div_ps(one_vec, tmp0);
+    // r = p5 * t ^ 4 + p4 * t ^ 3 + p3 * t ^ 2 + p2 * t + p1
+    auto tmp1 = _mm512_fmadd_ps(p5, t, p4);
+    auto tmp2 = _mm512_fmadd_ps(tmp1, t, p3);
+    auto tmp3 = _mm512_fmadd_ps(tmp2, t, p2);
+    auto r = _mm512_fmadd_ps(tmp3, t, p1);
+    // - exp(- x * x)
+    auto pow_2 = _mm512_mul_ps(values, values);
+    auto neg_pow_2 = _mm512_xor_ps(neg_zero_vec, pow_2);
+    // auto tmp4 = exp(neg_pow_2);
+    auto tmp4 = Vectorized(Sleef_expf16_u10(neg_pow_2));
+    auto tmp5 = _mm512_xor_ps(neg_zero_vec, tmp4);
+    // erf(x) = sign(x) * (1 - r * t * exp(- x * x))
+    auto tmp6 = _mm512_mul_ps(tmp5, t);
+    auto tmp7 = _mm512_fmadd_ps(tmp6, r, one_vec);
+    return _mm512_xor_ps(sign_mask, tmp7);
+  }
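+  // The erf() above follows the classic Abramowitz & Stegun 7.1.26 style
+  // polynomial approximation, erf(x) ~= sign(x) * (1 - (a1*t + ... + a5*t^5) * exp(-x*x))
+  // with t = 1 / (1 + p*|x|). A scalar sketch, for exposition only:
+  //   float erf_ref(float x) {
+  //     float t = 1.f / (1.f + 0.3275911f * std::fabs(x));
+  //     float poly = t * (0.254829592f + t * (-0.284496736f + t * (1.421413741f
+  //                  + t * (-1.453152027f + t * 1.061405429f))));
+  //     return std::copysign(1.f - poly * std::exp(-x * x), x);
+  //   }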
+  Vectorized erfc() const {
+    return Vectorized(Sleef_erfcf16_u15(values));
+  }
+  Vectorized erfinv() const {
+    return map(calc_erfinv);
+  }
+  Vectorized exp() const {
+    return Vectorized(Sleef_expf16_u10(values));
+  }
+  Vectorized exp2() const {
+    return Vectorized(Sleef_exp2f16_u10(values));
+  }
+  Vectorized expm1() const {
+    return Vectorized(Sleef_expm1f16_u10(values));
+  }
+  Vectorized exp_u20() const {
+    // A faster version of exp with ULP=20
+    static __m512 vec_factorial_1 =
+        _mm512_set1_ps(0.999999701f); // 1/factorial(1)
+    static __m512 vec_factorial_2 =
+        _mm512_set1_ps(0.499991506f); // 1/factorial(2)
+    static __m512 vec_factorial_3 =
+        _mm512_set1_ps(0.166676521f); // 1/factorial(3)
+    static __m512 vec_factorial_4 =
+        _mm512_set1_ps(0.0418978221f); // 1/factorial(4)
+    static __m512 vec_factorial_5 =
+        _mm512_set1_ps(0.00828929059f); // 1/factorial(5)
+    static __m512 vec_exp_log2ef =
+        (__m512)_mm512_set1_epi32(0x3fb8aa3b); // log2(e)
+    static __m512 vec_half = _mm512_set1_ps(0.5f);
+    static __m512 vec_one = _mm512_set1_ps(1.f);
+    static __m512 vec_zero = _mm512_set1_ps(0.f);
+    static __m512 vec_two = _mm512_set1_ps(2.f);
+    static __m512 vec_ln2f = (__m512)_mm512_set1_epi32(0x3f317218); // ln(2)
+    static __m512 vec_ln_flt_min = (__m512)_mm512_set1_epi32(0xc2aeac50);
+    static __m512 vec_ln_flt_max = (__m512)_mm512_set1_epi32(0x42b17218);
+    static __m512i vec_127 = _mm512_set1_epi32(0x0000007f);
+    static int n_mantissa_bits = 23;
+
+    // exp(x) =
+    // = exp(n * ln(2) + r) // divide x by ln(2) and get quot and rem
+    // = 2^n * exp(r) // simplify the exp(n*ln(2)) expression
+
+    auto less_ln_flt_min_mask =
+        _mm512_cmp_ps_mask(values, vec_ln_flt_min, 1 /*_CMP_LT_OS*/);
+    auto vec_src = _mm512_min_ps(values, vec_ln_flt_max);
+    vec_src = _mm512_max_ps(vec_src, vec_ln_flt_min);
+
+    // fx = floorf(x * log2ef + 0.5)
+    auto vec_fx = _mm512_fmadd_ps(vec_src, vec_exp_log2ef, vec_half);
+    auto vec_fx_i = _mm512_cvt_roundps_epi32(
+        vec_fx, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+    vec_fx = _mm512_cvtepi32_ps(vec_fx_i);
+
+    // x = x - fx * ln2
+    auto vec_exp_poly = _mm512_fnmadd_ps(vec_fx, vec_ln2f, vec_src);
+
+    // compute polynomial
+    auto vec_res =
+        _mm512_fmadd_ps(vec_exp_poly, vec_factorial_5, vec_factorial_4);
+    vec_res = _mm512_fmadd_ps(vec_exp_poly, vec_res, vec_factorial_3);
+    vec_res = _mm512_fmadd_ps(vec_exp_poly, vec_res, vec_factorial_2);
+    vec_res = _mm512_fmadd_ps(vec_exp_poly, vec_res, vec_factorial_1);
+    vec_res = _mm512_fmadd_ps(vec_exp_poly, vec_res, vec_one);
+
+    // compute 2^(n-1)
+    auto vec_exp_number = _mm512_sub_ps(vec_fx, vec_one);
+    auto vec_exp_number_i = _mm512_cvtps_epi32(vec_exp_number);
+    auto vec_two_pow_n_i = _mm512_add_epi32(vec_exp_number_i, vec_127);
+    vec_two_pow_n_i = _mm512_slli_epi32(vec_two_pow_n_i, n_mantissa_bits);
+    auto vec_two_pow_n = (__m512)vec_two_pow_n_i;
+    vec_two_pow_n =
+        _mm512_mask_blend_ps(less_ln_flt_min_mask, vec_two_pow_n, vec_zero);
+
+    // y = y * 2^n
+    vec_res = _mm512_mul_ps(vec_res, vec_two_pow_n);
+    vec_res = _mm512_mul_ps(vec_res, vec_two);
+    return vec_res;
+  }
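+  // exp_u20() uses standard range reduction: n = round(x * log2(e)) and
+  // r = x - n * ln(2), so exp(x) = 2^n * exp(r), with exp(r) evaluated by the
+  // degree-5 polynomial above. 2^(n-1) is materialized by writing (n-1) + 127
+  // into the float exponent field and the trailing multiply by 2 restores the
+  // missing factor (presumably to keep the biased exponent in range for
+  // results near FLT_MAX); inputs below ln(FLT_MIN) have their scale forced
+  // to zero so the result underflows to 0.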
+  Vectorized fmod(const Vectorized& q) const {
+    return Vectorized(Sleef_fmodf16(values, q));
+  }
+  Vectorized log() const {
+    return Vectorized(Sleef_logf16_u10(values));
+  }
+  Vectorized log2() const {
+    return Vectorized(Sleef_log2f16_u10(values));
+  }
+  Vectorized log10() const {
+    return Vectorized(Sleef_log10f16_u10(values));
+  }
+  Vectorized log1p() const {
+    return Vectorized(Sleef_log1pf16_u10(values));
+  }
+  Vectorized frac() const;
+  Vectorized sin() const {
+    return Vectorized(Sleef_sinf16_u35(values));
+  }
+  Vectorized sinh() const {
+    return Vectorized(Sleef_sinhf16_u10(values));
+  }
+  Vectorized cos() const {
+    return Vectorized(Sleef_cosf16_u35(values));
+  }
+  Vectorized cosh() const {
+    return Vectorized(Sleef_coshf16_u10(values));
+  }
+  Vectorized ceil() const {
+    return _mm512_ceil_ps(values);
+  }
+  Vectorized floor() const {
+    return _mm512_floor_ps(values);
+  }
+  Vectorized hypot(const Vectorized &b) const {
+    return Vectorized(Sleef_hypotf16_u05(values, b));
+  }
+  Vectorized i0() const {
+    return map(calc_i0);
+  }
+  Vectorized i0e() const {
+    return map(calc_i0e);
+  }
+  Vectorized digamma() const {
+    return map(calc_digamma);
+  }
+  Vectorized igamma(const Vectorized &x) const {
+    __at_align__ float tmp[size()];
+    __at_align__ float tmp_x[size()];
+    store(tmp);
+    x.store(tmp_x);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
+    }
+    return loadu(tmp);
+  }
+  Vectorized igammac(const Vectorized &x) const {
+    __at_align__ float tmp[size()];
+    __at_align__ float tmp_x[size()];
+    store(tmp);
+    x.store(tmp_x);
+    for (const auto i : c10::irange(size())) {
+      tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
+    }
+    return loadu(tmp);
+  }
+  Vectorized neg() const {
+    return _mm512_xor_ps(_mm512_set1_ps(-0.f), values);
+  }
+  Vectorized nextafter(const Vectorized &b) const {
+    return Vectorized(Sleef_nextafterf16(values, b));
+  }
+  Vectorized round() const {
+    return _mm512_roundscale_ps(values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+  }
+  Vectorized tan() const {
+    return Vectorized(Sleef_tanf16_u10(values));
+  }
+  Vectorized tanh() const {
+    return Vectorized(Sleef_tanhf16_u10(values));
+  }
+  Vectorized trunc() const {
+    return _mm512_roundscale_ps(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
+  }
+  Vectorized lgamma() const {
+    return Vectorized(Sleef_lgammaf16_u10(values));
+  }
+  Vectorized sqrt() const {
+    return _mm512_sqrt_ps(values);
+  }
+  Vectorized reciprocal() const {
+    return _mm512_div_ps(_mm512_set1_ps(1), values);
+  }
+  Vectorized rsqrt() const {
+    return _mm512_div_ps(_mm512_set1_ps(1), _mm512_sqrt_ps(values));
+  }
+  Vectorized pow(const Vectorized &b) const {
+    return Vectorized(Sleef_powf16_u10(values, b));
+  }
+  // Comparison using the _CMP_**_OQ predicate.
+  //   `O`: get false if an operand is NaN
+  //   `Q`: do not raise if an operand is NaN
+  Vectorized operator==(const Vectorized& other) const {
+    auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_EQ_OQ);
+    return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, mask,
+                                                      0xFFFFFFFF));
+  }
+
+  Vectorized operator!=(const Vectorized& other) const {
+    auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_NEQ_UQ);
+    return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, mask,
+                                                      0xFFFFFFFF));
+  }
+
+  Vectorized operator<(const Vectorized& other) const {
+    auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_LT_OQ);
+    return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, mask,
+                                                      0xFFFFFFFF));
+  }
+
+  Vectorized operator<=(const Vectorized& other) const {
+    auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_LE_OQ);
+    return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, mask,
+                                                      0xFFFFFFFF));
+  }
+
+  Vectorized operator>(const Vectorized& other) const {
+    auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_GT_OQ);
+    return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, mask,
+                                                      0xFFFFFFFF));
+  }
+
+  Vectorized operator>=(const Vectorized& other) const {
+    auto mask = _mm512_cmp_ps_mask(values, other.values, _CMP_GE_OQ);
+    return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, mask,
+                                                      0xFFFFFFFF));
+  }
+
+  Vectorized eq(const Vectorized& other) const;
+  Vectorized ne(const Vectorized& other) const;
+  Vectorized gt(const Vectorized& other) const;
+  Vectorized ge(const Vectorized& other) const;
+  Vectorized lt(const Vectorized& other) const;
+  Vectorized le(const Vectorized& other) const;
+};
+
+template <>
+Vectorized<float> inline operator+(const Vectorized<float>& a, const Vectorized<float>& b) {
+  return _mm512_add_ps(a, b);
+}
+
+template <>
+Vectorized<float> inline operator-(const Vectorized<float>& a, const Vectorized<float>& b) {
+  return _mm512_sub_ps(a, b);
+}
+
+template <>
+Vectorized<float> inline operator*(const Vectorized<float>& a, const Vectorized<float>& b) {
+  return _mm512_mul_ps(a, b);
+}
+
+template <>
+Vectorized<float> inline operator/(const Vectorized<float>& a, const Vectorized<float>& b) {
+  return _mm512_div_ps(a, b);
+}
+
+// frac. Implement this here so we can use subtraction
+inline Vectorized<float> Vectorized<float>::frac() const {
+  return *this - this->trunc();
+}
+
+// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if
+// either input is a NaN.
+template <>
+Vectorized<float> inline maximum(const Vectorized<float>& a, const Vectorized<float>& b) {
+  auto zero_vec = _mm512_set1_epi32(0);
+  auto max = _mm512_max_ps(a, b);
+  auto isnan_mask = _mm512_cmp_ps_mask(a, b, _CMP_UNORD_Q);
+  auto isnan = _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, isnan_mask,
+                                                          0xFFFFFFFF));
+  // Exploit the fact that all-ones is a NaN.
+  return _mm512_or_ps(max, isnan);
+}
+
+// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if
+// either input is a NaN.
+template <>
+Vectorized<float> inline minimum(const Vectorized<float>& a, const Vectorized<float>& b) {
+  auto zero_vec = _mm512_set1_epi32(0);
+  auto min = _mm512_min_ps(a, b);
+  auto isnan_mask = _mm512_cmp_ps_mask(a, b, _CMP_UNORD_Q);
+  auto isnan = _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, isnan_mask,
+                                                          0xFFFFFFFF));
+  // Exploit the fact that all-ones is a NaN.
+  return _mm512_or_ps(min, isnan);
+}
+
+template <>
+Vectorized<float> inline clamp(const Vectorized<float>& a, const Vectorized<float>& min, const Vectorized<float>& max) {
+  return _mm512_min_ps(max, _mm512_max_ps(min, a));
+}
+
+template <>
+Vectorized<float> inline clamp_max(const Vectorized<float>& a, const Vectorized<float>& max) {
+  return _mm512_min_ps(max, a);
+}
+
+template <>
+Vectorized<float> inline clamp_min(const Vectorized<float>& a, const Vectorized<float>& min) {
+  return _mm512_max_ps(min, a);
+}
+
+template <>
+Vectorized<float> inline operator&(const Vectorized<float>& a, const Vectorized<float>& b) {
+  return _mm512_and_ps(a, b);
+}
+
+template <>
+Vectorized<float> inline operator|(const Vectorized<float>& a, const Vectorized<float>& b) {
+  return _mm512_or_ps(a, b);
+}
+
+template <>
+Vectorized<float> inline operator^(const Vectorized<float>& a, const Vectorized<float>& b) {
+  return _mm512_xor_ps(a, b);
+}
+
+inline Vectorized<float> Vectorized<float>::eq(const Vectorized<float>& other) const {
+  return (*this == other) & Vectorized<float>(1.0f);
+}
+
+inline Vectorized<float> Vectorized<float>::ne(const Vectorized<float>& other) const {
+  return (*this != other) & Vectorized<float>(1.0f);
+}
+
+inline Vectorized<float> Vectorized<float>::gt(const Vectorized<float>& other) const {
+  return (*this > other) & Vectorized<float>(1.0f);
+}
+
+inline Vectorized<float> Vectorized<float>::ge(const Vectorized<float>& other) const {
+  return (*this >= other) & Vectorized<float>(1.0f);
+}
+
+inline Vectorized<float> Vectorized<float>::lt(const Vectorized<float>& other) const {
+  return (*this < other) & Vectorized<float>(1.0f);
+}
+
+inline Vectorized<float> Vectorized<float>::le(const Vectorized<float>& other) const {
+  return (*this <= other) & Vectorized<float>(1.0f);
+}
+
+template <>
+inline void convert(const float* src, float* dst, int64_t n) {
+  int64_t i;
+#pragma unroll
+  for (i = 0; i <= (n - Vectorized<float>::size()); i += Vectorized<float>::size()) {
+    _mm512_storeu_ps(dst + i, _mm512_loadu_ps(src + i));
+  }
+#pragma unroll
+  for (; i < n; i++) {
+    dst[i] = src[i];
+  }
+}
+
+template <>
+Vectorized<float> inline fmadd(const Vectorized<float>& a, const Vectorized<float>& b, const Vectorized<float>& c) {
+  return _mm512_fmadd_ps(a, b, c);
+}
+
+template <>
+Vectorized<float> inline fmsub(const Vectorized<float>& a, const Vectorized<float>& b, const Vectorized<float>& c) {
+  return _mm512_fmsub_ps(a, b, c);
+}
+
+// TODO(jgong5): rewrite with ATEN vectorized (need to add unpack and shuffle)
+// Used by Inductor CPP codegen
+// Code referred to FBGEMM:
+// https://github.com/pytorch/FBGEMM/blob/39a423e4ad1a04b77fea81c7d09c3e6f8984fae9/src/UtilsAvx512.cc#LL19C6-L19C6
+// 16 * 6 = 96 instructions
+template<>
+inline void transpose_mxn<float, 16, 16>(
+    const float* src,
+    int64_t ld_src,
+    float* dst,
+    int64_t ld_dst) {
+  // load from src to registers
+  // a: a0  a1  a2  a3  a4  a5  a6  a7  a8  a9  a10 a11 a12 a13 a14 a15
+  // b: b0  b1  b2  b3  b4  b5  b6  b7  b8  b9  b10 b11 b12 b13 b14 b15
+  // c: c0  c1  c2  c3  c4  c5  c6  c7  c8  c9  c10 c11 c12 c13 c14 c15
+  // d: d0  d1  d2  d3  d4  d5  d6  d7  d8  d9  d10 d11 d12 d13 d14 d15
+  // e: e0  e1  e2  e3  e4  e5  e6  e7  e8  e9  e10 e11 e12 e13 e14 e15
+  // f: f0  f1  f2  f3  f4  f5  f6  f7  f8  f9  f10 f11 f12 f13 f14 f15
+  // g: g0  g1  g2  g3  g4  g5  g6  g7  g8  g9  g10 g11 g12 g13 g14 g15
+  // h: h0  h1  h2  h3  h4  h5  h6  h7  h8  h9  h10 h11 h12 h13 h14 h15
+  // i: i0  i1  i2  i3  i4  i5  i6  i7  i8  i9  i10 i11 i12 i13 i14 i15
+  // j: j0  j1  j2  j3  j4  j5  j6  j7  j8  j9  j10 j11 j12 j13 j14 j15
+  // k: k0  k1  k2  k3  k4  k5  k6  k7  k8  k9  k10 k11 k12 k13 k14 k15
+  // l: l0  l1  l2  l3  l4  l5  l6  l7  l8  l9  l10 l11 l12 l13 l14 l15
+  // m: m0  m1  m2  m3  m4  m5  m6  m7  m8  m9  m10 m11 m12 m13 m14 m15
+  // n: n0  n1  n2  n3  n4  n5  n6  n7  n8  n9  n10 n11 n12 n13 n14 n15
+  // o: o0  o1  o2  o3  o4  o5  o6  o7  o8  o9  o10 o11 o12 o13 o14 o15
+  // p: p0  p1  p2  p3  p4  p5  p6  p7  p8  p9  p10 p11 p12 p13 p14 p15
+  __m512 a = _mm512_loadu_ps(&src[0 * ld_src]);
+  __m512 b = _mm512_loadu_ps(&src[1 * ld_src]);
+  __m512 c = _mm512_loadu_ps(&src[2 * ld_src]);
+  __m512 d = _mm512_loadu_ps(&src[3 * ld_src]);
+  __m512 e = _mm512_loadu_ps(&src[4 * ld_src]);
+  __m512 f = _mm512_loadu_ps(&src[5 * ld_src]);
+  __m512 g = _mm512_loadu_ps(&src[6 * ld_src]);
+  __m512 h = _mm512_loadu_ps(&src[7 * ld_src]);
+  __m512 i = _mm512_loadu_ps(&src[8 * ld_src]);
+  __m512 j = _mm512_loadu_ps(&src[9 * ld_src]);
+  __m512 k = _mm512_loadu_ps(&src[10 * ld_src]);
+  __m512 l = _mm512_loadu_ps(&src[11 * ld_src]);
+  __m512 m = _mm512_loadu_ps(&src[12 * ld_src]);
+  __m512 n = _mm512_loadu_ps(&src[13 * ld_src]);
+  __m512 o = _mm512_loadu_ps(&src[14 * ld_src]);
+  __m512 p = _mm512_loadu_ps(&src[15 * ld_src]);
+
+  __m512 ta, tb, tc, td, te, tf, tg, th, ti, tj, tk, tl, tm, tn, to, tq;
+  // unpacking and interleaving 32-bit elements
+  // a0  b0  a1  b1  a4  b4  a5  b5  a8  b8  a9  b9  a12  b12 a13 b13
+  // a2  b2  a3  b3  a6  b6  a7  b7  a10 b10 a11 b11 a14  b14 a15 b15
+  // c0  d0  c1  d1 ...
+  // c2  d2  c3  d3 ...
+  // e0  f0  e1  f1 ...
+  // e2  f2  e3  f3 ...
+  // g0  h0  g1  h1 ...
+  // g2  h2  g3  h3 ...
+  // i0  ...
+  // i2  ...
+  // k0  ...
+  // k2  ...
+  // m0  ...
+  // m2  ...
+  // o0  ...
+  // o1  ...
+  ta = _mm512_unpacklo_ps(a, b);
+  tb = _mm512_unpackhi_ps(a, b);
+  tc = _mm512_unpacklo_ps(c, d);
+  td = _mm512_unpackhi_ps(c, d);
+  te = _mm512_unpacklo_ps(e, f);
+  tf = _mm512_unpackhi_ps(e, f);
+  tg = _mm512_unpacklo_ps(g, h);
+  th = _mm512_unpackhi_ps(g, h);
+  ti = _mm512_unpacklo_ps(i, j);
+  tj = _mm512_unpackhi_ps(i, j);
+  tk = _mm512_unpacklo_ps(k, l);
+  tl = _mm512_unpackhi_ps(k, l);
+  tm = _mm512_unpacklo_ps(m, n);
+  tn = _mm512_unpackhi_ps(m, n);
+  to = _mm512_unpacklo_ps(o, p);
+  tq = _mm512_unpackhi_ps(o, p);
+
+  // unpacking and interleaving 64-bit elements
+  //  a0  b0  c0  d0  a4  b4  c4  d4  a8  b8  c8  d8  a12 b12 c12 d12
+  //  a1  b1  c1  d1 ...
+  //  a2  b2  c2  d2 ...
+  //  a3  b3  c3  d3 ...
+  //  e0  f0  g0  h0  e4  f4  g4  h4  e8  f8  g8  h8  e12 f12 g12 h12
+  //  e1  f1  g1  h1 ...
+  //  e2  f2  g2  h2 ...
+  //  e3  f3  g3  h3 ...
+  //  i0  j0  k0  l0 ...
+  //  i1  j1  k1  l1 ...
+  //  i2  j2  k2  l2 ...
+  //  i3  j3  k3  l3 ...
+  //  m0  n0  o0  p0 ...
+  //  m1  n1  o1  p1 ...
+  //  m2  n2  o2  p2 ...
+  //  m3  n3  o3  p3 ...
+  a = _mm512_castpd_ps(
+      _mm512_unpacklo_pd(_mm512_castps_pd(ta), _mm512_castps_pd(tc)));
+  b = _mm512_castpd_ps(
+      _mm512_unpackhi_pd(_mm512_castps_pd(ta), _mm512_castps_pd(tc)));
+  c = _mm512_castpd_ps(
+      _mm512_unpacklo_pd(_mm512_castps_pd(tb), _mm512_castps_pd(td)));
+  d = _mm512_castpd_ps(
+      _mm512_unpackhi_pd(_mm512_castps_pd(tb), _mm512_castps_pd(td)));
+  e = _mm512_castpd_ps(
+      _mm512_unpacklo_pd(_mm512_castps_pd(te), _mm512_castps_pd(tg)));
+  f = _mm512_castpd_ps(
+      _mm512_unpackhi_pd(_mm512_castps_pd(te), _mm512_castps_pd(tg)));
+  g = _mm512_castpd_ps(
+      _mm512_unpacklo_pd(_mm512_castps_pd(tf), _mm512_castps_pd(th)));
+  h = _mm512_castpd_ps(
+      _mm512_unpackhi_pd(_mm512_castps_pd(tf), _mm512_castps_pd(th)));
+  i = _mm512_castpd_ps(
+      _mm512_unpacklo_pd(_mm512_castps_pd(ti), _mm512_castps_pd(tk)));
+  j = _mm512_castpd_ps(
+      _mm512_unpackhi_pd(_mm512_castps_pd(ti), _mm512_castps_pd(tk)));
+  k = _mm512_castpd_ps(
+      _mm512_unpacklo_pd(_mm512_castps_pd(tj), _mm512_castps_pd(tl)));
+  l = _mm512_castpd_ps(
+      _mm512_unpackhi_pd(_mm512_castps_pd(tj), _mm512_castps_pd(tl)));
+  m = _mm512_castpd_ps(
+      _mm512_unpacklo_pd(_mm512_castps_pd(tm), _mm512_castps_pd(to)));
+  n = _mm512_castpd_ps(
+      _mm512_unpackhi_pd(_mm512_castps_pd(tm), _mm512_castps_pd(to)));
+  o = _mm512_castpd_ps(
+      _mm512_unpacklo_pd(_mm512_castps_pd(tn), _mm512_castps_pd(tq)));
+  p = _mm512_castpd_ps(
+      _mm512_unpackhi_pd(_mm512_castps_pd(tn), _mm512_castps_pd(tq)));
+
+  //  shuffle 128-bits (composed of 4 32-bit elements)
+  //  a0  b0  c0  d0  a8  b8  c8  d8  e0  f0  g0  h0  e8  f8  g8  h8
+  //  a1  b1  c1  d1 ...
+  //  a2  b2  c2  d2 ...
+  //  a3  b3  c3  d3 ...
+  //  a4  b4  c4  d4 ...
+  //  a5  b5  c5  d5 ...
+  //  a6  b6  c6  d6 ...
+  //  a7  b7  c7  d7 ...
+  //  i0  j0  k0  l0  i8  j8  k8  l8  m0  n0  o0  p0  m8  n8  o8  p8
+  //  i1  j1  k1  l1 ...
+  //  i2  j2  k2  l2 ...
+  //  i3  j3  k3  l3 ...
+  //  i4  j4  k4  l4 ...
+  //  i5  j5  k5  l5 ...
+  //  i6  j6  k6  l6 ...
+  //  i7  j7  k7  l7 ...
+  ta = _mm512_shuffle_f32x4(a, e, 0x88);
+  tb = _mm512_shuffle_f32x4(b, f, 0x88);
+  tc = _mm512_shuffle_f32x4(c, g, 0x88);
+  td = _mm512_shuffle_f32x4(d, h, 0x88);
+  te = _mm512_shuffle_f32x4(a, e, 0xdd);
+  tf = _mm512_shuffle_f32x4(b, f, 0xdd);
+  tg = _mm512_shuffle_f32x4(c, g, 0xdd);
+  th = _mm512_shuffle_f32x4(d, h, 0xdd);
+  ti = _mm512_shuffle_f32x4(i, m, 0x88);
+  tj = _mm512_shuffle_f32x4(j, n, 0x88);
+  tk = _mm512_shuffle_f32x4(k, o, 0x88);
+  tl = _mm512_shuffle_f32x4(l, p, 0x88);
+  tm = _mm512_shuffle_f32x4(i, m, 0xdd);
+  tn = _mm512_shuffle_f32x4(j, n, 0xdd);
+  to = _mm512_shuffle_f32x4(k, o, 0xdd);
+  tq = _mm512_shuffle_f32x4(l, p, 0xdd);
+
+  //  shuffle 128-bits (composed of 4 32-bit elements)
+  //  a0  b0  c0  d0  ...  o0
+  //  a1  b1  c1  d1  ...  o1
+  //  a2  b2  c2  d2  ...  o2
+  //  a3  b3  c3  d3  ...  o3
+  //  a4  ...
+  //  a5  ...
+  //  a6  ...
+  //  a7  ...
+  //  a8  ...
+  //  a9  ...
+  //  a10 ...
+  //  a11 ...
+  //  a12 ...
+  //  a13 ...
+  //  a14 ...
+  //  a15 b15 c15 d15 ...  o15
+  a = _mm512_shuffle_f32x4(ta, ti, 0x88);
+  b = _mm512_shuffle_f32x4(tb, tj, 0x88);
+  c = _mm512_shuffle_f32x4(tc, tk, 0x88);
+  d = _mm512_shuffle_f32x4(td, tl, 0x88);
+  e = _mm512_shuffle_f32x4(te, tm, 0x88);
+  f = _mm512_shuffle_f32x4(tf, tn, 0x88);
+  g = _mm512_shuffle_f32x4(tg, to, 0x88);
+  h = _mm512_shuffle_f32x4(th, tq, 0x88);
+  i = _mm512_shuffle_f32x4(ta, ti, 0xdd);
+  j = _mm512_shuffle_f32x4(tb, tj, 0xdd);
+  k = _mm512_shuffle_f32x4(tc, tk, 0xdd);
+  l = _mm512_shuffle_f32x4(td, tl, 0xdd);
+  m = _mm512_shuffle_f32x4(te, tm, 0xdd);
+  n = _mm512_shuffle_f32x4(tf, tn, 0xdd);
+  o = _mm512_shuffle_f32x4(tg, to, 0xdd);
+  p = _mm512_shuffle_f32x4(th, tq, 0xdd);
+
+  // store from registers to dst
+  _mm512_storeu_ps(&dst[0 * ld_dst], a);
+  _mm512_storeu_ps(&dst[1 * ld_dst], b);
+  _mm512_storeu_ps(&dst[2 * ld_dst], c);
+  _mm512_storeu_ps(&dst[3 * ld_dst], d);
+  _mm512_storeu_ps(&dst[4 * ld_dst], e);
+  _mm512_storeu_ps(&dst[5 * ld_dst], f);
+  _mm512_storeu_ps(&dst[6 * ld_dst], g);
+  _mm512_storeu_ps(&dst[7 * ld_dst], h);
+  _mm512_storeu_ps(&dst[8 * ld_dst], i);
+  _mm512_storeu_ps(&dst[9 * ld_dst], j);
+  _mm512_storeu_ps(&dst[10 * ld_dst], k);
+  _mm512_storeu_ps(&dst[11 * ld_dst], l);
+  _mm512_storeu_ps(&dst[12 * ld_dst], m);
+  _mm512_storeu_ps(&dst[13 * ld_dst], n);
+  _mm512_storeu_ps(&dst[14 * ld_dst], o);
+  _mm512_storeu_ps(&dst[15 * ld_dst], p);
+}
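+// The transpose above proceeds in three stages: _mm512_unpacklo/hi_ps
+// interleaves 32-bit elements from row pairs, the pd-cast unpacklo/hi then
+// interleaves 64-bit pairs so each register holds 4-element column fragments,
+// and the two rounds of _mm512_shuffle_f32x4 (selectors 0x88 / 0xdd pick the
+// even / odd 128-bit lanes) gather those fragments into full 16-element
+// columns before the final stores.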
+
+#endif
+
+}}}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_int.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_int.h
new file mode 100644
index 0000000000000000000000000000000000000000..9f316ab65166420e40a31bc0a4913ce7c682ae89
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_int.h
@@ -0,0 +1,1459 @@
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/irange.h>
+
+namespace at {
+namespace vec {
+inline namespace CPU_CAPABILITY {
+
+#ifdef CPU_CAPABILITY_AVX512
+
+struct Vectorizedi {
+protected:
+  __m512i values;
+  static constexpr __m512i zero_vector {0, 0, 0, 0, 0, 0, 0, 0};
+  static inline __m512i invert(const __m512i& v) {
+    const auto ones = _mm512_set1_epi64(-1);
+    return _mm512_xor_si512(ones, v);
+  }
+public:
+  Vectorizedi() {}
+  Vectorizedi(__m512i v) : values(v) {}
+  operator __m512i() const {
+    return values;
+  }
+};
+
+#else
+
+struct Vectorizedi {};  // dummy definition to make Vectorizedi always defined
+
+#endif // CPU_CAPABILITY_AVX512
+
+#ifdef CPU_CAPABILITY_AVX512
+
+template <>
+class Vectorized<int64_t> : public Vectorizedi {
+private:
+  static const Vectorized ones;
+public:
+  using value_type = int64_t;
+  using size_type = int;
+  static constexpr size_type size() {
+    return 8;
+  }
+  using Vectorizedi::Vectorizedi;
+  Vectorized() {}
+  Vectorized(int64_t v) { values = _mm512_set1_epi64(v); }
+  Vectorized(int64_t val1, int64_t val2, int64_t val3, int64_t val4,
+         int64_t val5, int64_t val6, int64_t val7, int64_t val8) {
+    values = _mm512_setr_epi64(val1, val2, val3, val4,
+                                val5, val6, val7, val8);
+  }
+  template 
+  static Vectorized blend(Vectorized a, Vectorized b) {
+    return _mm512_mask_blend_epi64(mask, a.values, b.values);
+  }
+  static Vectorized blendv(const Vectorized& a, const Vectorized& b,
+                                const Vectorized& mask) {
+    auto msb_one = _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF);
+    auto mask_ = _mm512_cmp_epi64_mask(mask, msb_one, _MM_CMPINT_EQ);
+    return _mm512_mask_blend_epi64(mask_, a.values, b.values);
+  }
+  template 
+  static Vectorized arange(int64_t base = 0, step_t step = static_cast(1)) {
+    return Vectorized(base,            base + step,     base + 2 * step, base + 3 * step,
+                           base + 4 * step, base + 5 * step, base + 6 * step, base + 7 * step);
+  }
+  static Vectorized
+  set(Vectorized a, Vectorized b, int64_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+      case 4:
+        return blend<15>(a, b);
+      case 5:
+        return blend<31>(a, b);
+      case 6:
+        return blend<63>(a, b);
+      case 7:
+        return blend<127>(a, b);
+    }
+    return b;
+  }
+  static Vectorized loadu(const void* ptr) {
+    return _mm512_loadu_si512(reinterpret_cast(ptr));
+  }
+  static Vectorized loadu(const void* ptr, int64_t count) {
+    if (count == size()) {
+      return _mm512_loadu_si512(reinterpret_cast(ptr));
+    } else {
+      __mmask8 mask = (1ULL << count) - 1;
+      return _mm512_maskz_loadu_epi64(mask, ptr);
+    }
+  }
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      // ptr need not be aligned here. See
+      // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm512-storeu-si512.html
+      _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr), values);
+    } else if (count > 0) {
+      __mmask8 mask = (1ULL << count) - 1;
+      _mm512_mask_storeu_epi64(ptr, mask, values);
+    }
+  }
+  const int64_t& operator[](int idx) const  = delete;
+  int64_t& operator[](int idx)  = delete;
+  Vectorized abs() const {
+    auto is_larger_mask = _mm512_cmpgt_epi64_mask(zero_vector, values);
+    auto is_larger = _mm512_mask_set1_epi64(zero_vector, is_larger_mask, 0xFFFFFFFFFFFFFFFF);
+    auto inverse = _mm512_xor_si512(values, is_larger);
+    return _mm512_sub_epi64(inverse, is_larger);
+  }
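+  // Branchless absolute value: is_larger is all-ones exactly in the lanes
+  // where the value is negative; XOR with it flips the bits, and subtracting
+  // it (i.e. subtracting -1) adds one, completing the two's-complement
+  // negation. A scalar sketch, for exposition only:
+  //   int64_t abs_ref(int64_t v) {
+  //     int64_t m = v >> 63;     // 0 for v >= 0, -1 for v < 0
+  //     return (v ^ m) - m;
+  //   }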
+  Vectorized real() const {
+    return *this;
+  }
+  Vectorized imag() const {
+    return _mm512_set1_epi64(0);
+  }
+  Vectorized conj() const {
+    return *this;
+  }
+  Vectorized neg() const;
+  Vectorized operator==(const Vectorized& other) const {
+    auto mask = _mm512_cmpeq_epi64_mask(values, other.values);
+    return _mm512_mask_set1_epi64(zero_vector, mask, 0xFFFFFFFFFFFFFFFF);
+  }
+  Vectorized operator!=(const Vectorized& other) const {
+    auto mask = _mm512_cmpneq_epi64_mask(values, other.values);
+    return _mm512_mask_set1_epi64(zero_vector, mask, 0xFFFFFFFFFFFFFFFF);
+  }
+  Vectorized operator<(const Vectorized& other) const {
+    auto mask = _mm512_cmplt_epi64_mask(values, other.values);
+    return _mm512_mask_set1_epi64(zero_vector, mask, 0xFFFFFFFFFFFFFFFF);
+  }
+  Vectorized operator<=(const Vectorized& other) const {
+    auto mask = _mm512_cmple_epi64_mask(values, other.values);
+    return _mm512_mask_set1_epi64(zero_vector, mask, 0xFFFFFFFFFFFFFFFF);
+  }
+  Vectorized operator>(const Vectorized& other) const {
+    auto mask = _mm512_cmpgt_epi64_mask(values, other.values);
+    return _mm512_mask_set1_epi64(zero_vector, mask, 0xFFFFFFFFFFFFFFFF);
+  }
+  Vectorized operator>=(const Vectorized& other) const {
+    auto mask = _mm512_cmpge_epi64_mask(values, other.values);
+    return _mm512_mask_set1_epi64(zero_vector, mask, 0xFFFFFFFFFFFFFFFF);
+  }
+
+  Vectorized eq(const Vectorized& other) const;
+  Vectorized ne(const Vectorized& other) const;
+  Vectorized gt(const Vectorized& other) const;
+  Vectorized ge(const Vectorized& other) const;
+  Vectorized lt(const Vectorized& other) const;
+  Vectorized le(const Vectorized& other) const;
+};
+
+template <>
+class Vectorized<int32_t> : public Vectorizedi {
+private:
+  static constexpr __m512i zero_vector {0, 0, 0, 0, 0, 0, 0, 0};
+  static const Vectorized ones;
+public:
+  using value_type = int32_t;
+  static constexpr int size() {
+    return 16;
+  }
+  using Vectorizedi::Vectorizedi;
+  Vectorized() {}
+  Vectorized(int32_t v) { values = _mm512_set1_epi32(v); }
+  Vectorized(int32_t val1, int32_t val2, int32_t val3, int32_t val4,
+            int32_t val5, int32_t val6, int32_t val7, int32_t val8,
+            int32_t val9, int32_t val10, int32_t val11, int32_t val12,
+            int32_t val13, int32_t val14, int32_t val15, int32_t val16) {
+    values = _mm512_setr_epi32(val1, val2, val3, val4, val5, val6, val7, val8,
+                               val9, val10, val11, val12, val13, val14, val15, val16);
+  }
+  template 
+  static Vectorized blend(Vectorized a, Vectorized b) {
+    return _mm512_mask_blend_epi32(mask, a.values, b.values);
+  }
+  static Vectorized blendv(const Vectorized& a, const Vectorized& b,
+                                const Vectorized& mask) {
+    auto msb_one = _mm512_set1_epi32(0xFFFFFFFF);
+    auto mask_ = _mm512_cmp_epi32_mask(mask, msb_one, _MM_CMPINT_EQ);
+    return _mm512_mask_blend_epi32(mask_, a.values, b.values);
+  }
+  template 
+  static Vectorized arange(int32_t base = 0, step_t step = static_cast(1)) {
+    return Vectorized(
+      base,             base +      step, base +  2 * step, base +  3 * step,
+      base +  4 * step, base +  5 * step, base +  6 * step, base +  7 * step,
+      base +  8 * step, base +  9 * step, base + 10 * step, base + 11 * step,
+      base + 12 * step, base + 13 * step, base + 14 * step, base + 15 * step);
+  }
+  static Vectorized
+  set(Vectorized a, Vectorized b, int32_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<1>(a, b);
+      case 2:
+        return blend<3>(a, b);
+      case 3:
+        return blend<7>(a, b);
+      case 4:
+        return blend<15>(a, b);
+      case 5:
+        return blend<31>(a, b);
+      case 6:
+        return blend<63>(a, b);
+      case 7:
+        return blend<127>(a, b);
+      case 8:
+        return blend<255>(a, b);
+      case 9:
+        return blend<511>(a, b);
+      case 10:
+        return blend<1023>(a, b);
+      case 11:
+        return blend<2047>(a, b);
+      case 12:
+        return blend<4095>(a, b);
+      case 13:
+        return blend<8191>(a, b);
+      case 14:
+        return blend<16383>(a, b);
+      case 15:
+        return blend<32767>(a, b);
+    }
+    return b;
+  }
+  static Vectorized loadu(const void* ptr) {
+    return _mm512_loadu_si512(reinterpret_cast(ptr));
+  }
+  static Vectorized loadu(const void* ptr, int32_t count) {
+    if (count == size()) {
+      return _mm512_loadu_si512(reinterpret_cast(ptr));
+    } else {
+      __mmask16 mask = (1ULL << count) - 1;
+      return _mm512_maskz_loadu_epi32(mask, ptr);
+    }
+  }
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      // ptr need not be aligned here. See
+      // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm512-storeu-si512.html
+      _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr), values);
+    } else if (count > 0) {
+      __mmask16 mask = (1ULL << count) - 1;
+      _mm512_mask_storeu_epi32(ptr, mask, values);
+    }
+  }
+  const int32_t& operator[](int idx) const  = delete;
+  int32_t& operator[](int idx)  = delete;
+  Vectorized abs() const {
+    return _mm512_abs_epi32(values);
+  }
+  Vectorized real() const {
+    return *this;
+  }
+  Vectorized imag() const {
+    return _mm512_set1_epi32(0);
+  }
+  Vectorized conj() const {
+    return *this;
+  }
+  Vectorized neg() const;
+  Vectorized operator==(const Vectorized& other) const {
+    auto mask = _mm512_cmpeq_epi32_mask(values, other.values);
+    return _mm512_mask_set1_epi32(zero_vector, mask, 0xFFFFFFFF);
+  }
+  Vectorized operator!=(const Vectorized& other) const {
+    auto mask = _mm512_cmpneq_epi32_mask(values, other.values);
+    return _mm512_mask_set1_epi32(zero_vector, mask, 0xFFFFFFFF);
+  }
+  Vectorized operator<(const Vectorized& other) const {
+    auto mask = _mm512_cmplt_epi32_mask(values, other.values);
+    return _mm512_mask_set1_epi32(zero_vector, mask, 0xFFFFFFFF);
+  }
+  Vectorized operator<=(const Vectorized& other) const {
+    auto mask = _mm512_cmple_epi32_mask(values, other.values);
+    return _mm512_mask_set1_epi32(zero_vector, mask, 0xFFFFFFFF);
+  }
+  Vectorized operator>(const Vectorized& other) const {
+    auto mask = _mm512_cmpgt_epi32_mask(values, other.values);
+    return _mm512_mask_set1_epi32(zero_vector, mask, 0xFFFFFFFF);
+  }
+  Vectorized operator>=(const Vectorized& other) const {
+    auto mask = _mm512_cmpge_epi32_mask(values, other.values);
+    return _mm512_mask_set1_epi32(zero_vector, mask, 0xFFFFFFFF);
+  }
+  Vectorized eq(const Vectorized& other) const;
+  Vectorized ne(const Vectorized& other) const;
+  Vectorized gt(const Vectorized& other) const;
+  Vectorized ge(const Vectorized& other) const;
+  Vectorized lt(const Vectorized& other) const;
+  Vectorized le(const Vectorized& other) const;
+};
+
+template <>
+inline void convert(const int32_t *src, float *dst, int64_t n) {
+  int64_t i;
+  // int32_t and float have same size
+#ifndef _MSC_VER
+# pragma unroll
+#endif
+  for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) {
+    auto input_vec = _mm512_loadu_si512(reinterpret_cast(src + i));
+    auto output_vec = _mm512_cvtepi32_ps(input_vec);
+    _mm512_storeu_ps(reinterpret_cast(dst + i), output_vec);
+  }
+#ifndef _MSC_VER
+# pragma unroll
+#endif
+  for (; i < n; i++) {
+    dst[i] = static_cast(src[i]);
+  }
+}
+
+template <>
+inline void convert(const int32_t *src, double *dst, int64_t n) {
+  int64_t i;
+  // int32_t has half the size of double
+#ifndef _MSC_VER
+# pragma unroll
+#endif
+  for (i = 0; i <= (n - Vectorized::size()); i += Vectorized::size()) {
+    auto input_256_vec = _mm256_loadu_si256(reinterpret_cast(src + i));
+    auto output_vec = _mm512_cvtepi32_pd(input_256_vec);
+    _mm512_storeu_pd(reinterpret_cast(dst + i), output_vec);
+  }
+#ifndef _MSC_VER
+# pragma unroll
+#endif
+  for (; i < n; i++) {
+    dst[i] = static_cast(src[i]);
+  }
+}
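+// Note: the int32 -> double conversion widens, so each iteration loads eight
+// int32_t through a 256-bit load and produces eight doubles in one 512-bit
+// register; the scalar loop copies whatever elements remain.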
+
+template <>
+class Vectorized<int16_t> : public Vectorizedi {
+private:
+  static const Vectorized ones;
+  static constexpr __m512i zero_vector {0, 0, 0, 0, 0, 0, 0, 0};
+public:
+  using value_type = int16_t;
+  static constexpr int size() {
+    return 32;
+  }
+  using Vectorizedi::Vectorizedi;
+  Vectorized() {}
+  Vectorized(int16_t v) { values = _mm512_set1_epi16(v); }
+  Vectorized(int16_t val1, int16_t val2, int16_t val3, int16_t val4,
+         int16_t val5, int16_t val6, int16_t val7, int16_t val8,
+         int16_t val9, int16_t val10, int16_t val11, int16_t val12,
+         int16_t val13, int16_t val14, int16_t val15, int16_t val16,
+         int16_t val17, int16_t val18, int16_t val19, int16_t val20,
+         int16_t val21, int16_t val22, int16_t val23, int16_t val24,
+         int16_t val25, int16_t val26, int16_t val27, int16_t val28,
+         int16_t val29, int16_t val30, int16_t val31, int16_t val32) {
+    values = _mm512_set_epi16(val32, val31, val30, val29, val28, val27, val26, val25,
+                              val24, val23, val22, val21, val20, val19, val18, val17,
+                              val16, val15, val14, val13, val12, val11, val10, val9,
+                              val8, val7, val6, val5, val4, val3, val2, val1);
+  }
+  template 
+  static Vectorized blend(Vectorized a, Vectorized b) {
+    return _mm512_mask_blend_epi16(mask, a.values, b.values);
+  }
+  static Vectorized blendv(const Vectorized& a, const Vectorized& b,
+                                const Vectorized& mask) {
+    auto msb_one = _mm512_set1_epi16(0xFFFF);
+    auto mask_ = _mm512_cmp_epi16_mask(mask, msb_one, _MM_CMPINT_EQ);
+    return _mm512_mask_blend_epi16(mask_, a.values, b.values);
+  }
+  template 
+  static Vectorized arange(int16_t base = 0, step_t step = static_cast(1)) {
+    return Vectorized(
+      base,             base +      step, base +  2 * step, base +  3 * step,
+      base +  4 * step, base +  5 * step, base +  6 * step, base +  7 * step,
+      base +  8 * step, base +  9 * step, base + 10 * step, base + 11 * step,
+      base + 12 * step, base + 13 * step, base + 14 * step, base + 15 * step,
+      base + 16 * step, base + 17 * step, base + 18 * step, base + 19 * step,
+      base + 20 * step, base + 21 * step, base + 22 * step, base + 23 * step,
+      base + 24 * step, base + 25 * step, base + 26 * step, base + 27 * step,
+      base + 28 * step, base + 29 * step, base + 30 * step, base + 31 * step
+    );
+  }
+  static Vectorized
+  set(Vectorized a, Vectorized b, int16_t count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<0x1>(a, b);
+      case 2:
+        return blend<0x3>(a, b);
+      case 3:
+        return blend<0x7>(a, b);
+      case 4:
+        return blend<0xF>(a, b);
+      case 5:
+        return blend<0x1F>(a, b);
+      case 6:
+        return blend<0x3F>(a, b);
+      case 7:
+        return blend<0x7F>(a, b);
+      case 8:
+        return blend<0xFF>(a, b);
+      case 9:
+        return blend<0x1FF>(a, b);
+      case 10:
+        return blend<0x3FF>(a, b);
+      case 11:
+        return blend<0x7FF>(a, b);
+      case 12:
+        return blend<0xFFF>(a, b);
+      case 13:
+        return blend<0x1FFF>(a, b);
+      case 14:
+        return blend<0x3FFF>(a, b);
+      case 15:
+        return blend<0x7FFF>(a, b);
+      case 16:
+        return blend<0xFFFF>(a, b);
+      case 17:
+        return blend<0x1FFFF>(a, b);
+      case 18:
+        return blend<0x3FFFF>(a, b);
+      case 19:
+        return blend<0x7FFFF>(a, b);
+      case 20:
+        return blend<0xFFFFF>(a, b);
+      case 21:
+        return blend<0x1FFFFF>(a, b);
+      case 22:
+        return blend<0x3FFFFF>(a, b);
+      case 23:
+        return blend<0x7FFFFF>(a, b);
+      case 24:
+        return blend<0xFFFFFF>(a, b);
+      case 25:
+        return blend<0x1FFFFFF>(a, b);
+      case 26:
+        return blend<0x3FFFFFF>(a, b);
+      case 27:
+        return blend<0x7FFFFFF>(a, b);
+      case 28:
+        return blend<0xFFFFFFF>(a, b);
+      case 29:
+        return blend<0x1FFFFFFF>(a, b);
+      case 30:
+        return blend<0x3FFFFFFF>(a, b);
+      case 31:
+        return blend<0x7FFFFFFF>(a, b);
+    }
+    return b;
+  }
+  static Vectorized loadu(const void* ptr) {
+    return _mm512_loadu_si512(reinterpret_cast(ptr));
+  }
+  static Vectorized loadu(const void* ptr, int16_t count) {
+    if (count == size()) {
+      return _mm512_loadu_si512(reinterpret_cast(ptr));
+    } else {
+      __mmask32 mask = (1ULL << count) - 1;
+      return _mm512_maskz_loadu_epi16(mask, ptr);
+    }
+  }
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      // ptr need not be aligned here. See
+      // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm512-storeu-si512.html
+      _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr), values);
+    } else if (count > 0) {
+      __mmask32 mask = (1ULL << count) - 1;
+      _mm512_mask_storeu_epi16(ptr, mask, values);
+    }
+  }
+  const int16_t& operator[](int idx) const  = delete;
+  int16_t& operator[](int idx)  = delete;
+  Vectorized abs() const {
+    return _mm512_abs_epi16(values);
+  }
+  Vectorized real() const {
+    return *this;
+  }
+  Vectorized imag() const {
+    return _mm512_set1_epi16(0);
+  }
+  Vectorized conj() const {
+    return *this;
+  }
+  Vectorized neg() const;
+  Vectorized operator==(const Vectorized& other) const {
+    auto mask = _mm512_cmpeq_epi16_mask(values, other.values);
+    return _mm512_mask_set1_epi16(zero_vector, mask, 0xFFFF);
+  }
+  Vectorized operator!=(const Vectorized& other) const {
+    auto mask = _mm512_cmpneq_epi16_mask(values, other.values);
+    return _mm512_mask_set1_epi16(zero_vector, mask, 0xFFFF);
+  }
+  Vectorized operator<(const Vectorized& other) const {
+    auto mask = _mm512_cmplt_epi16_mask(values, other.values);
+    return _mm512_mask_set1_epi16(zero_vector, mask, 0xFFFF);
+  }
+  Vectorized operator<=(const Vectorized& other) const {
+    auto mask = _mm512_cmple_epi16_mask(values, other.values);
+    return _mm512_mask_set1_epi16(zero_vector, mask, 0xFFFF);
+  }
+  Vectorized operator>(const Vectorized& other) const {
+    auto mask = _mm512_cmpgt_epi16_mask(values, other.values);
+    return _mm512_mask_set1_epi16(zero_vector, mask, 0xFFFF);
+  }
+  Vectorized operator>=(const Vectorized& other) const {
+    auto mask = _mm512_cmpge_epi16_mask(values, other.values);
+    return _mm512_mask_set1_epi16(zero_vector, mask, 0xFFFF);
+  }
+
+  Vectorized eq(const Vectorized& other) const;
+  Vectorized ne(const Vectorized& other) const;
+  Vectorized gt(const Vectorized& other) const;
+  Vectorized ge(const Vectorized& other) const;
+  Vectorized lt(const Vectorized& other) const;
+  Vectorized le(const Vectorized& other) const;
+};
+
+template <typename T>
+class Vectorized8 : public Vectorizedi {
+  static_assert(
+    std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
+    "Only int8_t/uint8_t are supported");
+protected:
+  static constexpr __m512i zero_vector {0, 0, 0, 0, 0, 0, 0, 0};
+  static const Vectorized ones;
+public:
+  using value_type = T;
+  static constexpr int size() {
+    return 64;
+  }
+  using Vectorizedi::Vectorizedi;
+  Vectorized8() {}
+  Vectorized8(T v) { values = _mm512_set1_epi8(v); }
+  Vectorized8(T val1, T val2, T val3, T val4,
+         T val5, T val6, T val7, T val8,
+         T val9, T val10, T val11, T val12,
+         T val13, T val14, T val15, T val16,
+         T val17, T val18, T val19, T val20,
+         T val21, T val22, T val23, T val24,
+         T val25, T val26, T val27, T val28,
+         T val29, T val30, T val31, T val32,
+         T val33, T val34, T val35, T val36,
+         T val37, T val38, T val39, T val40,
+         T val41, T val42, T val43, T val44,
+         T val45, T val46, T val47, T val48,
+         T val49, T val50, T val51, T val52,
+         T val53, T val54, T val55, T val56,
+         T val57, T val58, T val59, T val60,
+         T val61, T val62, T val63, T val64){
+    values = _mm512_set_epi8(val64, val63, val62, val61, val60, val59, val58, val57,
+                              val56, val55, val54, val53,val52, val51, val50, val49,
+                              val48, val47, val46, val45, val44, val43, val42, val41,
+                              val40, val39, val38, val37, val36, val35, val34, val33,
+                              val32, val31, val30, val29, val28, val27, val26, val25,
+                              val24, val23, val22, val21, val20, val19, val18, val17,
+                              val16, val15, val14, val13, val12, val11, val10, val9,
+                              val8, val7, val6, val5, val4, val3, val2, val1);
+  }
+  template 
+  static Vectorized blend(Vectorized a, Vectorized b) {
+    return _mm512_mask_blend_epi8(mask, a.values, b.values);
+  }
+  template 
+  static Vectorized arange(T base = 0, step_t step = static_cast(1)) {
+    return Vectorized(
+      base,             base +      step, base +  2 * step, base +  3 * step,
+      base +  4 * step, base +  5 * step, base +  6 * step, base +  7 * step,
+      base +  8 * step, base +  9 * step, base + 10 * step, base + 11 * step,
+      base + 12 * step, base + 13 * step, base + 14 * step, base + 15 * step,
+      base + 16 * step, base + 17 * step, base + 18 * step, base + 19 * step,
+      base + 20 * step, base + 21 * step, base + 22 * step, base + 23 * step,
+      base + 24 * step, base + 25 * step, base + 26 * step, base + 27 * step,
+      base + 28 * step, base + 29 * step, base + 30 * step, base + 31 * step,
+      base + 32 * step, base + 33 * step, base + 34 * step, base + 35 * step,
+      base + 36 * step, base + 37 * step, base + 38 * step, base + 39 * step,
+      base + 40 * step, base + 41 * step, base + 42 * step, base + 43 * step,
+      base + 44 * step, base + 45 * step, base + 46 * step, base + 47 * step,
+      base + 48 * step, base + 49 * step, base + 50 * step, base + 51 * step,
+      base + 52 * step, base + 53 * step, base + 54 * step, base + 55 * step,
+      base + 56 * step, base + 57 * step, base + 58 * step, base + 59 * step,
+      base + 60 * step, base + 61 * step, base + 62 * step, base + 63 * step);
+  }
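+  // set(a, b, count) below is equivalent to blend<(1ULL << count) - 1>(a, b):
+  // the first `count` lanes are taken from b and the rest from a (e.g.
+  // set(a, b, 3) yields {b[0], b[1], b[2], a[3], ..., a[63]}). The switch is
+  // only needed because the blend mask must be a compile-time constant.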
+  static Vectorized
+  set(Vectorized a, Vectorized b, T count = size()) {
+    switch (count) {
+      case 0:
+        return a;
+      case 1:
+        return blend<0x1>(a, b);
+      case 2:
+        return blend<0x3>(a, b);
+      case 3:
+        return blend<0x7>(a, b);
+      case 4:
+        return blend<0xF>(a, b);
+      case 5:
+        return blend<0x1F>(a, b);
+      case 6:
+        return blend<0x3F>(a, b);
+      case 7:
+        return blend<0x7F>(a, b);
+      case 8:
+        return blend<0xFF>(a, b);
+      case 9:
+        return blend<0x1FF>(a, b);
+      case 10:
+        return blend<0x3FF>(a, b);
+      case 11:
+        return blend<0x7FF>(a, b);
+      case 12:
+        return blend<0xFFF>(a, b);
+      case 13:
+        return blend<0x1FFF>(a, b);
+      case 14:
+        return blend<0x3FFF>(a, b);
+      case 15:
+        return blend<0x7FFF>(a, b);
+      case 16:
+        return blend<0xFFFF>(a, b);
+      case 17:
+        return blend<0x1FFFF>(a, b);
+      case 18:
+        return blend<0x3FFFF>(a, b);
+      case 19:
+        return blend<0x7FFFF>(a, b);
+      case 20:
+        return blend<0xFFFFF>(a, b);
+      case 21:
+        return blend<0x1FFFFF>(a, b);
+      case 22:
+        return blend<0x3FFFFF>(a, b);
+      case 23:
+        return blend<0x7FFFFF>(a, b);
+      case 24:
+        return blend<0xFFFFFF>(a, b);
+      case 25:
+        return blend<0x1FFFFFF>(a, b);
+      case 26:
+        return blend<0x3FFFFFF>(a, b);
+      case 27:
+        return blend<0x7FFFFFF>(a, b);
+      case 28:
+        return blend<0xFFFFFFF>(a, b);
+      case 29:
+        return blend<0x1FFFFFFF>(a, b);
+      case 30:
+        return blend<0x3FFFFFFF>(a, b);
+      case 31:
+        return blend<0x7FFFFFFF>(a, b);
+      case 32:
+        return blend<0xFFFFFFFF>(a, b);
+      case 33:
+        return blend<0x1FFFFFFFF>(a, b);
+      case 34:
+        return blend<0x3FFFFFFFF>(a, b);
+      case 35:
+        return blend<0x7FFFFFFFF>(a, b);
+      case 36:
+        return blend<0xFFFFFFFFF>(a, b);
+      case 37:
+        return blend<0x1FFFFFFFFF>(a, b);
+      case 38:
+        return blend<0x3FFFFFFFFF>(a, b);
+      case 39:
+        return blend<0x7FFFFFFFFF>(a, b);
+      case 40:
+        return blend<0xFFFFFFFFFF>(a, b);
+      case 41:
+        return blend<0x1FFFFFFFFFF>(a, b);
+      case 42:
+        return blend<0x3FFFFFFFFFF>(a, b);
+      case 43:
+        return blend<0x7FFFFFFFFFF>(a, b);
+      case 44:
+        return blend<0xFFFFFFFFFFF>(a, b);
+      case 45:
+        return blend<0x1FFFFFFFFFFF>(a, b);
+      case 46:
+        return blend<0x3FFFFFFFFFFF>(a, b);
+      case 47:
+        return blend<0x7FFFFFFFFFFF>(a, b);
+      case 48:
+        return blend<0xFFFFFFFFFFFF>(a, b);
+      case 49:
+        return blend<0x1FFFFFFFFFFFF>(a, b);
+      case 50:
+        return blend<0x3FFFFFFFFFFFF>(a, b);
+      case 51:
+        return blend<0x7FFFFFFFFFFFF>(a, b);
+      case 52:
+        return blend<0xFFFFFFFFFFFFF>(a, b);
+      case 53:
+        return blend<0x1FFFFFFFFFFFFF>(a, b);
+      case 54:
+        return blend<0x3FFFFFFFFFFFFF>(a, b);
+      case 55:
+        return blend<0x7FFFFFFFFFFFFF>(a, b);
+      case 56:
+        return blend<0xFFFFFFFFFFFFFF>(a, b);
+      case 57:
+        return blend<0x1FFFFFFFFFFFFFF>(a, b);
+      case 58:
+        return blend<0x3FFFFFFFFFFFFFF>(a, b);
+      case 59:
+        return blend<0x7FFFFFFFFFFFFFF>(a, b);
+      case 60:
+        return blend<0xFFFFFFFFFFFFFFF>(a, b);
+      case 61:
+        return blend<0x1FFFFFFFFFFFFFFF>(a, b);
+      case 62:
+        return blend<0x3FFFFFFFFFFFFFFF>(a, b);
+      case 63:
+        return blend<0x7FFFFFFFFFFFFFFF>(a, b);
+    }
+    return b;
+  }
+  static Vectorized loadu(const void* ptr) {
+    return _mm512_loadu_si512(reinterpret_cast(ptr));
+  }
+  static Vectorized loadu_one_fourth(const void* ptr) {
+      // Fast path for loading only 16 elements.
+      // Note: this is not merged into loadu(const void* ptr, T count) as a fast
+      // path, because that overload requires the upper 384 bits to be
+      // zero-initialized, whereas _mm512_castsi128_si512 leaves the upper 384
+      // bits of the result undefined.
+      // TODO: use _mm512_zextsi128_si512 in the future, once compilers support
+      // it (gcc 9.3 does not).
+      __m128i input_128 = _mm_loadu_si128(reinterpret_cast(ptr));
+      return _mm512_castsi128_si512(input_128);
+  }
+  static Vectorized loadu(const void* ptr, T count) {
+    if (count == size()) {
+      return _mm512_loadu_si512(reinterpret_cast(ptr));
+    } else if (count == 16) {
+      // Fast path for loading only 16 elements
+      return loadu_one_fourth(ptr);
+    } else {
+      __mmask64 mask = (1ULL << count) - 1;
+      return _mm512_maskz_loadu_epi8(mask, ptr);
+    }
+  }
+  void store(void* ptr, int count = size()) const {
+    if (count == size()) {
+      // ptr need not be aligned here. See
+      // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm512-storeu-si512.html
+      _mm512_storeu_si512(reinterpret_cast<__m512i*>(ptr), values);
+    } else if (count > 0) {
+      if (count == 16) {
+        // Fast path for storing only 16 elements
+        _mm_storeu_si128(
+          reinterpret_cast<__m128i*>(ptr),
+          _mm512_castsi512_si128(values));
+      } else {
+        __mmask64 mask = (1ULL << count) - 1;
+        _mm512_mask_storeu_epi8(ptr, mask, values);
+      }
+    }
+  }
+  const T& operator[](int idx) const  = delete;
+  T& operator[](int idx)  = delete;
+  Vectorized real() const {
+    return *this;
+  }
+  Vectorized imag() const {
+    return _mm512_set1_epi8(0);
+  }
+  Vectorized conj() const {
+    return *this;
+  }
+};
+
+template<>
+class Vectorized: public Vectorized8 {
+public:
+  using Vectorized8::Vectorized8;
+
+  static Vectorized blendv(const Vectorized& a, const Vectorized& b,
+                               const Vectorized& mask) {
+    auto msb_one = _mm512_set1_epi8(0xFF);
+    auto mask_ = _mm512_cmp_epi8_mask(mask, msb_one, _MM_CMPINT_EQ);
+    return _mm512_mask_blend_epi8(mask_, a.values, b.values);
+  }
+
+  Vectorized neg() const;
+
+  Vectorized abs() const {
+    return _mm512_abs_epi8(values);
+  }
+
+  Vectorized operator==(const Vectorized& other) const {
+    auto mask = _mm512_cmpeq_epi8_mask(values, other.values);
+    return _mm512_mask_set1_epi8(zero_vector, mask, 0xFF);
+  }
+  Vectorized operator!=(const Vectorized& other) const {
+    auto mask = _mm512_cmpneq_epi8_mask(values, other.values);
+    return _mm512_mask_set1_epi8(zero_vector, mask, 0xFF);
+  }
+  Vectorized operator<(const Vectorized& other) const {
+    auto mask = _mm512_cmplt_epi8_mask(values, other.values);
+    return _mm512_mask_set1_epi8(zero_vector, mask, 0xFF);
+  }
+  Vectorized operator<=(const Vectorized& other) const {
+    auto mask = _mm512_cmple_epi8_mask(values, other.values);
+    return _mm512_mask_set1_epi8(zero_vector, mask, 0xFF);
+  }
+  Vectorized operator>(const Vectorized& other) const {
+    return other < *this;
+  }
+  Vectorized operator>=(const Vectorized& other) const {
+    return other <= *this;
+  }
+
+  Vectorized eq(const Vectorized& other) const;
+  Vectorized ne(const Vectorized& other) const;
+  Vectorized gt(const Vectorized& other) const;
+  Vectorized ge(const Vectorized& other) const;
+  Vectorized lt(const Vectorized& other) const;
+  Vectorized le(const Vectorized& other) const;
+};
+
+template<>
+class Vectorized: public Vectorized8 {
+public:
+  using Vectorized8::Vectorized8;
+
+  static Vectorized blendv(const Vectorized& a, const Vectorized& b,
+                               const Vectorized& mask) {
+    auto msb_one = _mm512_set1_epi8(0xFF);
+    auto mask_ = _mm512_cmp_epu8_mask(mask, msb_one, _MM_CMPINT_EQ);
+    return _mm512_mask_blend_epi8(mask_, a.values, b.values);
+  }
+
+  Vectorized neg() const;
+
+  Vectorized abs() const {
+    return *this;
+  }
+
+  Vectorized operator==(const Vectorized& other) const {
+    auto mask = _mm512_cmpeq_epu8_mask(values, other.values);
+    return _mm512_mask_set1_epi8(zero_vector, mask, 0xFF);
+  }
+  Vectorized operator!=(const Vectorized& other) const {
+    auto mask = _mm512_cmpneq_epu8_mask(values, other.values);
+    return _mm512_mask_set1_epi8(zero_vector, mask, 0xFF);
+  }
+  Vectorized operator<(const Vectorized& other) const {
+    auto mask = _mm512_cmplt_epu8_mask(values, other.values);
+    return _mm512_mask_set1_epi8(zero_vector, mask, 0xFF);
+  }
+  Vectorized operator<=(const Vectorized& other) const {
+    auto mask = _mm512_cmple_epu8_mask(values, other.values);
+    return _mm512_mask_set1_epi8(zero_vector, mask, 0xFF);
+  }
+  Vectorized operator>(const Vectorized& other) const {
+    return other < *this;
+  }
+  Vectorized operator>=(const Vectorized& other) const {
+    return other <= *this;
+  }
+
+  Vectorized eq(const Vectorized& other) const;
+  Vectorized ne(const Vectorized& other) const;
+  Vectorized gt(const Vectorized& other) const;
+  Vectorized ge(const Vectorized& other) const;
+  Vectorized lt(const Vectorized& other) const;
+  Vectorized le(const Vectorized& other) const;
+};
+
+template <>
+Vectorized inline operator+(const Vectorized& a, const Vectorized& b) {
+  return _mm512_add_epi64(a, b);
+}
+
+template <>
+Vectorized inline operator+(const Vectorized& a, const Vectorized& b) {
+  return _mm512_add_epi32(a, b);
+}
+
+template <>
+Vectorized inline operator+(const Vectorized& a, const Vectorized& b) {
+  return _mm512_add_epi16(a, b);
+}
+
+template <>
+Vectorized inline operator+(const Vectorized& a, const Vectorized& b) {
+  return _mm512_add_epi8(a, b);
+}
+
+template <>
+Vectorized inline operator+(const Vectorized& a, const Vectorized& b) {
+  return _mm512_add_epi8(a, b);
+}
+
+template <>
+Vectorized inline operator-(const Vectorized& a, const Vectorized& b) {
+  return _mm512_sub_epi64(a, b);
+}
+
+template <>
+Vectorized inline operator-(const Vectorized& a, const Vectorized& b) {
+  return _mm512_sub_epi32(a, b);
+}
+
+template <>
+Vectorized inline operator-(const Vectorized& a, const Vectorized& b) {
+  return _mm512_sub_epi16(a, b);
+}
+
+template <>
+Vectorized inline operator-(const Vectorized& a, const Vectorized& b) {
+  return _mm512_sub_epi8(a, b);
+}
+
+template <>
+Vectorized inline operator-(const Vectorized& a, const Vectorized& b) {
+  return _mm512_sub_epi8(a, b);
+}
+
+// Negation. Defined here so we can utilize operator-
+inline Vectorized Vectorized::neg() const {
+  return Vectorized(0) - *this;
+}
+
+inline Vectorized Vectorized::neg() const {
+  return Vectorized(0) - *this;
+}
+
+inline Vectorized Vectorized::neg() const {
+  return Vectorized(0) - *this;
+}
+
+inline Vectorized Vectorized::neg() const {
+  return Vectorized(0) - *this;
+}
+
+inline Vectorized Vectorized::neg() const {
+  return Vectorized(0) - *this;
+}
+
+template <>
+Vectorized inline operator*(const Vectorized& a, const Vectorized& b) {
+  return _mm512_mullo_epi64(a, b);
+}
+
+template <>
+Vectorized inline operator*(const Vectorized& a, const Vectorized& b) {
+  return _mm512_mullo_epi32(a, b);
+}
+
+template <>
+Vectorized inline operator*(const Vectorized& a, const Vectorized& b) {
+  return _mm512_mullo_epi16(a, b);
+}
+
+template <class T, typename Op>
+Vectorized<T> inline int_elementwise_binary_512(const Vectorized<T>& a, const Vectorized<T>& b, Op op) {
+  T values_a[Vectorized<T>::size()];
+  T values_b[Vectorized<T>::size()];
+  a.store(values_a);
+  b.store(values_b);
+  for (int i = 0; i != Vectorized<T>::size(); i++) {
+    values_a[i] = op(values_a[i], values_b[i]);
+  }
+  return Vectorized<T>::loadu(values_a);
+}
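+
+// int_elementwise_binary_512 is the lane-by-lane scalar fallback used below for
+// operations without a direct AVX512 instruction (integer division and, in some
+// build configurations, 8-bit multiplication). Conceptually, e.g.
+//   auto c = int_elementwise_binary_512(a, b, std::divides<int32_t>());
+// divides two int32 vectors element by element through temporary arrays.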
+
+template <>
+Vectorized inline operator*(const Vectorized& a, const Vectorized& b) {
+  // We don't have an instruction for multiplying int8_t
+#ifndef CPU_CAPABILITY_AVX512
+  return int_elementwise_binary_512(a, b, std::multiplies());
+#else
+  __m512i mask00FF = _mm512_set1_epi16(0x00FF);
+  __m512i a_lo = _mm512_srai_epi16(_mm512_slli_epi16(a, 8), 8);
+  __m512i b_lo = _mm512_srai_epi16(_mm512_slli_epi16(b, 8), 8);
+  __m512i a_hi = _mm512_srai_epi16(a, 8);
+  __m512i b_hi = _mm512_srai_epi16(b, 8);
+  __m512i res_lo = _mm512_and_si512(_mm512_mullo_epi16(a_lo, b_lo), mask00FF);
+  __m512i res_hi = _mm512_slli_epi16(_mm512_mullo_epi16(a_hi, b_hi), 8);
+  __m512i res = _mm512_or_si512(res_hi, res_lo);
+  return res;
+#endif
+}
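+
+// The emulation above computes, per lane, the truncated product
+// (int8_t)(a[i] * b[i]): the low and high bytes of each 16-bit pair are
+// sign-extended, multiplied with _mm512_mullo_epi16, and only the low 8 bits of
+// each product are kept. For example, (-3) * 50 = -150, which truncates to 106
+// in two's complement.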
+
+template <>
+Vectorized inline operator*(const Vectorized& a, const Vectorized& b) {
+  // We don't have an instruction for multiplying uint8_t
+#ifndef CPU_CAPABILITY_AVX512
+  return int_elementwise_binary_512(a, b, std::multiplies());
+#else
+  __m512i mask00FF = _mm512_set1_epi16(0x00FF);
+  __m512i a_lo = _mm512_and_si512 (a, mask00FF);
+  __m512i b_lo = _mm512_and_si512 (b, mask00FF);
+  __m512i a_hi = _mm512_srli_epi16(a, 8);
+  __m512i b_hi = _mm512_srli_epi16(b, 8);
+  __m512i res_lo = _mm512_and_si512(_mm512_mullo_epi16(a_lo, b_lo), mask00FF);
+  __m512i res_hi = _mm512_slli_epi16(_mm512_mullo_epi16(a_hi, b_hi), 8);
+  __m512i res = _mm512_or_si512(res_hi, res_lo);
+  return res;
+#endif
+}
+
+template <>
+Vectorized inline minimum(const Vectorized& a, const Vectorized& b) {
+  return _mm512_min_epi64(a, b);
+}
+
+template <>
+Vectorized inline minimum(const Vectorized& a, const Vectorized& b) {
+  return _mm512_min_epi32(a, b);
+}
+
+template <>
+Vectorized inline minimum(const Vectorized& a, const Vectorized& b) {
+  return _mm512_min_epi16(a, b);
+}
+
+template <>
+Vectorized inline minimum(const Vectorized& a, const Vectorized& b) {
+  return _mm512_min_epi8(a, b);
+}
+
+template <>
+Vectorized inline minimum(const Vectorized& a, const Vectorized& b) {
+  return _mm512_min_epu8(a, b);
+}
+
+template <>
+Vectorized inline maximum(const Vectorized& a, const Vectorized& b) {
+  return _mm512_max_epi64(a, b);
+}
+
+template <>
+Vectorized inline maximum(const Vectorized& a, const Vectorized& b) {
+  return _mm512_max_epi32(a, b);
+}
+
+template <>
+Vectorized inline maximum(const Vectorized& a, const Vectorized& b) {
+  return _mm512_max_epi16(a, b);
+}
+
+template <>
+Vectorized inline maximum(const Vectorized& a, const Vectorized& b) {
+  return _mm512_max_epi8(a, b);
+}
+
+template <>
+Vectorized inline maximum(const Vectorized& a, const Vectorized& b) {
+  return _mm512_max_epi8(a, b);
+}
+
+template <>
+Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) {
+  return _mm512_min_epi64(max_val, _mm512_max_epi64(a, min_val));
+}
+
+template <>
+Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) {
+  return _mm512_min_epi32(max_val, _mm512_max_epi32(a, min_val));
+}
+
+template <>
+Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) {
+  return _mm512_min_epi16(max_val, _mm512_max_epi16(a, min_val));
+}
+
+template <>
+Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) {
+  return _mm512_min_epi8(max_val, _mm512_max_epi8(a, min_val));
+}
+
+template <>
+Vectorized inline clamp(const Vectorized& a, const Vectorized& min_val, const Vectorized& max_val) {
+  return _mm512_min_epu8(max_val, _mm512_max_epu8(a, min_val));
+}
+
+template <>
+Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) {
+  return _mm512_min_epi64(max_val, a);
+}
+
+template <>
+Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) {
+  return _mm512_min_epi32(max_val, a);
+}
+
+template <>
+Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) {
+  return _mm512_min_epi16(max_val, a);
+}
+
+template <>
+Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) {
+  return _mm512_min_epi8(max_val, a);
+}
+
+template <>
+Vectorized inline clamp_max(const Vectorized& a, const Vectorized& max_val) {
+  return _mm512_min_epu8(max_val, a);
+}
+
+template <>
+Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) {
+  return _mm512_max_epi64(min_val, a);
+}
+
+template <>
+Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) {
+  return _mm512_max_epi32(min_val, a);
+}
+
+template <>
+Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) {
+  return _mm512_max_epi16(min_val, a);
+}
+
+template <>
+Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) {
+  return _mm512_max_epi8(min_val, a);
+}
+
+template <>
+Vectorized inline clamp_min(const Vectorized& a, const Vectorized& min_val) {
+  return _mm512_max_epu8(min_val, a);
+}
+
+template
+Vectorized inline convert_to_int32(const T* ptr) {
+  return Vectorized::loadu(ptr);
+}
+
+template<>
+Vectorized inline convert_to_int32(const int8_t* ptr) {
+  return _mm512_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(ptr)));
+}
+
+template<>
+Vectorized inline convert_to_int32(const uint8_t* ptr) {
+  return _mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast(ptr)));
+}
+
+template <>
+Vectorized inline operator/(const Vectorized& a, const Vectorized& b) {
+  return int_elementwise_binary_512(a, b, std::divides());
+}
+template <>
+Vectorized inline operator/(const Vectorized& a, const Vectorized& b) {
+  return int_elementwise_binary_512(a, b, std::divides());
+}
+template <>
+Vectorized inline operator/(const Vectorized& a, const Vectorized& b) {
+  return int_elementwise_binary_512(a, b, std::divides());
+}
+template <>
+Vectorized inline operator/(const Vectorized& a, const Vectorized& b) {
+  return int_elementwise_binary_512(a, b, std::divides());
+}
+template <>
+Vectorized inline operator/(const Vectorized& a, const Vectorized& b) {
+  return int_elementwise_binary_512(a, b, std::divides());
+}
+
+template>::value, int> = 0>
+inline Vectorized operator&(const Vectorized& a, const Vectorized& b) {
+  return _mm512_and_si512(a, b);
+}
+template>::value, int> = 0>
+inline Vectorized operator|(const Vectorized& a, const Vectorized& b) {
+  return _mm512_or_si512(a, b);
+}
+template>::value, int> = 0>
+inline Vectorized operator^(const Vectorized& a, const Vectorized& b) {
+  return _mm512_xor_si512(a, b);
+}
+template>::value, int> = 0>
+inline Vectorized operator~(const Vectorized& a) {
+  return _mm512_xor_si512(a, _mm512_set1_epi32(-1));
+}
+
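+// The operator==/!=/</<=/>/>= overloads return lane masks (all ones in a lane
+// where the comparison holds, zero otherwise); the eq/ne/gt/ge/lt/le members
+// below AND that mask with Vectorized(1) so each lane instead holds a boolean
+// 0 or 1, mirroring the floating-point Vectorized types.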
+inline Vectorized Vectorized::eq(const Vectorized& other) const {
+  return (*this == other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::ne(const Vectorized& other) const {
+  return (*this != other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::gt(const Vectorized& other) const {
+  return (*this > other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::ge(const Vectorized& other) const {
+  return (*this >= other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::lt(const Vectorized& other) const {
+  return (*this < other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::le(const Vectorized& other) const {
+  return (*this <= other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::eq(const Vectorized& other) const {
+  return (*this == other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::ne(const Vectorized& other) const {
+  return (*this != other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::gt(const Vectorized& other) const {
+  return (*this > other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::ge(const Vectorized& other) const {
+  return (*this >= other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::lt(const Vectorized& other) const {
+  return (*this < other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::le(const Vectorized& other) const {
+  return (*this <= other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::eq(const Vectorized& other) const {
+  return (*this == other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::ne(const Vectorized& other) const {
+  return (*this != other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::gt(const Vectorized& other) const {
+  return (*this > other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::ge(const Vectorized& other) const {
+  return (*this >= other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::lt(const Vectorized& other) const {
+  return (*this < other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::le(const Vectorized& other) const {
+  return (*this <= other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::eq(const Vectorized& other) const {
+  return (*this == other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::ne(const Vectorized& other) const {
+  return (*this != other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::gt(const Vectorized& other) const {
+  return (*this > other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::ge(const Vectorized& other) const {
+  return (*this >= other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::lt(const Vectorized& other) const {
+  return (*this < other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::le(const Vectorized& other) const {
+  return (*this <= other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::eq(const Vectorized& other) const {
+  return (*this == other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::ne(const Vectorized& other) const {
+  return (*this != other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::gt(const Vectorized& other) const {
+  return (*this > other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::ge(const Vectorized& other) const {
+  return (*this >= other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::lt(const Vectorized& other) const {
+  return (*this < other) & Vectorized(1);
+}
+
+inline Vectorized Vectorized::le(const Vectorized& other) const {
+  return (*this <= other) & Vectorized(1);
+}
+
+template ::value || std::is_same::value, int> = 0>
+Vectorized inline shift_512_8(const Vectorized& a, const Vectorized& b) {
+  // There is no vector instruction for shifting int8_t/uint8_t, so the shift
+  // is emulated instead.
+
+  // Control masks for shuffle operation, treating 512 bits as an
+  // array of 8-bit elements, and considering pairs of neighboring
+  // elements.  Specifically, a mask named "ctl_M_N" (M,N in [0,1], and
+  // M!=N) is set so that shuffle will move element with index M from
+  // input pair into element with index N in output pair, and element
+  // with index M in output pair will be set to all 0s.
+  __m512i ctl_0_1 = _mm512_set_epi8(62, 0x80, 60, 0x80, 58, 0x80, 56, 0x80,
+                                    54, 0x80, 52, 0x80, 50, 0x80, 48, 0x80,
+                                    46, 0x80, 44, 0x80, 42, 0x80, 40, 0x80,
+                                    38, 0x80, 36, 0x80, 34, 0x80, 32, 0x80,
+                                    30, 0x80, 28, 0x80, 26, 0x80, 24, 0x80,
+                                    22, 0x80, 20, 0x80, 18, 0x80, 16, 0x80,
+                                    14, 0x80, 12, 0x80, 10, 0x80, 8, 0x80,
+                                    6, 0x80, 4, 0x80, 2, 0x80, 0, 0x80);
+  __m512i ctl_1_0 = _mm512_set_epi8(0x80, 63, 0x80, 61, 0x80, 59, 0x80, 57,
+                                    0x80, 55, 0x80, 53, 0x80, 51, 0x80, 49,
+                                    0x80, 47, 0x80, 45, 0x80, 43, 0x80, 41,
+                                    0x80, 39, 0x80, 37, 0x80, 35, 0x80, 33,
+                                    0x80, 31, 0x80, 29, 0x80, 27, 0x80, 25,
+                                    0x80, 23, 0x80, 21, 0x80, 19, 0x80, 17,
+                                    0x80, 15, 0x80, 13, 0x80, 11, 0x80, 9,
+                                    0x80, 7, 0x80, 5, 0x80, 3, 0x80, 1);
+
+  // Masks for bitwise and operation, treating 512 bits as an array of
+  // 8-bit elements, and considering them in pairs of neighboring
+  // elements.  A mask named "keep_M" (M in [0,1]) is set so that
+  // bitwise and will copy element with index M from input pair into
+  // element with the same index in output pair, while the other
+  // element in output pair will be set to all 0s.
+  __m512i keep_0 = _mm512_set1_epi16(0xFF);
+  __m512i keep_1 = _mm512_set1_epi16(0xFF00);
+
+  // Take each 8-bit element with idx%2==0 from the input array to be
+  // shifted and extend it to 16 bits so that 0s are added to the
+  // right.  Then, perform shifting on this 16-bit number.  The upper 8
+  // bits will be the proper result of shifting the original 8-bit
+  // number, so write them to the result array, into the same position
+  // from which the corresponding input element was taken.  Also, make
+  // sure that result array elements with idx%2!=0 are set to all 0s.
+  //
+  // Note that the number of bits to shift by is extended to 16 bits by
+  // adding 0s to the left.  That means this number is not properly
+  // sign-extended for negative values.  However, the number of bits to
+  // shift by is treated as an unsigned integer by the respective shift
+  // intrinsics anyway, so with or without proper sign extension a
+  // negative count will be interpreted as a number greater than 32, and
+  // the shifting result will be the same.
+  __m512i a0 = _mm512_shuffle_epi8(a, ctl_0_1);
+  __m512i b0 = _mm512_and_si512(b, keep_0);
+  __m512i c0;
+  if (left_shift)
+    c0 = _mm512_sllv_epi16(a0, b0);
+  else
+    if constexpr (std::is_same_v)
+      c0 = _mm512_srav_epi16(a0, b0);
+    else
+      c0 = _mm512_srlv_epi16(a0, b0);
+  c0 = _mm512_shuffle_epi8(c0, ctl_1_0);
+
+  // Perform shifting the same way for input array elements with
+  // idx%2==1.
+  __m512i a1 = _mm512_and_si512(a, keep_1);
+  __m512i b1 = _mm512_shuffle_epi8(b, ctl_1_0);
+  __m512i c1;
+  if (left_shift)
+    c1 = _mm512_sllv_epi16(a1, b1);
+  else
+    if constexpr (std::is_same_v)
+      c1 = _mm512_srav_epi16(a1, b1);
+    else
+      c1 = _mm512_srlv_epi16(a1, b1);
+  c1 = _mm512_and_si512(c1, keep_1);
+
+  // Merge partial results into the final result.
+  __m512i c = _mm512_or_si512(c0, c1);
+
+  return c;
+}
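+
+// Scalar sketch of the emulation above: each 8-bit lane is widened into the high
+// byte of a 16-bit lane, shifted there, and the high byte of the result is
+// written back, so per lane it behaves roughly like
+//   c[i] = left_shift ? T(a[i] << b[i]) : T(a[i] >> b[i]);
+// with an arithmetic right shift (sign fill) for int8_t and a logical right
+// shift (zero fill) for uint8_t.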
+
+template <>
+Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) {
+  return _mm512_sllv_epi64(a, b);
+}
+
+template <>
+Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) {
+  return _mm512_sllv_epi32(a, b);
+}
+
+template <>
+Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) {
+  return _mm512_sllv_epi16(a, b);
+}
+
+template <>
+Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) {
+  return shift_512_8(a, b);
+}
+
+template <>
+Vectorized inline operator<<(const Vectorized& a, const Vectorized& b) {
+  return shift_512_8(a, b);
+}
+
+template <>
+Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) {
+  return _mm512_srav_epi64(a, b);
+}
+
+template <>
+Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) {
+  return _mm512_srav_epi32(a, b);
+}
+
+template <>
+Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) {
+  return _mm512_srav_epi16(a, b);
+}
+
+template <>
+Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) {
+  return shift_512_8(a, b);
+}
+
+template <>
+Vectorized inline operator>>(const Vectorized& a, const Vectorized& b) {
+  return shift_512_8(a, b);
+}
+
+#endif
+
+}}}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_qint.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_qint.h
new file mode 100644
index 0000000000000000000000000000000000000000..ffe9ada5c13f50492d4d69428b164db941da39c9
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec512/vec512_qint.h
@@ -0,0 +1,1346 @@
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>
+#include <ATen/native/quantized/AffineQuantizerBase.h>
+
+#include <c10/util/irange.h>
+#include <c10/util/qint32.h>
+#include <c10/util/qint8.h>
+#include <c10/util/quint8.h>
+
+#include <array>
+#include <cmath>
+
+// This file defines Vectorized<> for the quantized types.
+//
+//
+// Currently, we simply use these classes as efficient converters between
+// the quantized types and Vectorized<float>, usually in bandwidth-bound cases
+// where doing the arithmetic in full-precision is acceptable (e.g.
+// elementwise operators).
+//
+//
+// Conversions are as follows:
+//  Vectorized<qint8> -> 4x Vectorized<float>
+//  Vectorized<quint8> -> 4x Vectorized<float>
+//  Vectorized<qint32> -> 1x Vectorized<float>
+//
+// The size of the returned float vector is specified by the special
+// constexpr function float_num_vecs. The type of the value returned
+// from dequantize (and expected as an argument to quantize) is
+// specified by float_vec_return_type.
+//
+// When writing kernels with these vectors, it is expected that floating-
+// point operations will be carried out in a loop over Vectorized<T>::float_num_vecs
+// iterations.
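+//
+// A typical kernel therefore follows a pattern roughly like the sketch below
+// (the variable names are illustrative, not part of this header):
+//
+//   auto qvec   = Vectorized<c10::quint8>::loadu(src);
+//   auto floats = qvec.dequantize(scale_vec, zero_point_vec);  // float_num_vecs() vectors
+//   for (auto& fv : floats) {
+//     fv = fv * fv;  // any elementwise computation in float
+//   }
+//   Vectorized<c10::quint8>::quantize(floats, scale, zero_point, 1.0f / scale)
+//       .store(dst);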
+
+namespace at {
+namespace vec {
+inline namespace CPU_CAPABILITY {
+
+#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
+
+struct Vectorizedqi {
+ protected:
+  __m512i vals __attribute__((aligned(64)));
+
+ public:
+  Vectorizedqi() {}
+  Vectorizedqi(__m512i v) : vals(v) {}
+  operator __m512i() const {
+    return vals;
+  }
+};
+
+
+template 
+__m512i pack_saturate_and_clamp(
+    __m512i first,
+    __m512i second,
+    T min_val,
+    T max_val);
+
+template <>
+inline __m512i pack_saturate_and_clamp(
+    __m512i first,
+    __m512i second,
+    int32_t min_val,
+    int32_t max_val) {
+  // This function is for linkage only; it will not be used.
+  AT_ERROR("pack_saturate_and_clamp is not supported");
+}
+
+template <>
+inline __m512i pack_saturate_and_clamp(
+    __m512i first,
+    __m512i second,
+    int8_t min_val,
+    int8_t max_val) {
+  __m512i packed_and_sat = _mm512_packs_epi16(first, second);
+  return _mm512_max_epi8(
+      _mm512_set1_epi8(min_val),
+      _mm512_min_epi8(packed_and_sat, _mm512_set1_epi8(max_val)));
+}
+
+template <>
+inline __m512i pack_saturate_and_clamp(
+    __m512i first,
+    __m512i second,
+    uint8_t min_val,
+    uint8_t max_val) {
+  __m512i packed_and_sat = _mm512_packus_epi16(first, second);
+  return _mm512_max_epu8(
+      _mm512_set1_epi8(min_val),
+      _mm512_min_epu8(packed_and_sat, _mm512_set1_epi8(max_val)));
+}
+
+template 
+typename std::enable_if::value || std::is_same::value, at::vec::Vectorized>::type
+inline convert_int8_to_float(at::vec::Vectorized src) {
+  // Note: this function only converts a number of input elements equal to
+  // at::vec::Vectorized<float>::size(); only the first 16*8 bits are handled.
+  __m128i input_128 = _mm512_castsi512_si128(src);
+  // Convert from 16*uint8/int8 to 16*int32
+  __m512i input_512_extended;
+  if constexpr (std::is_same_v)
+    input_512_extended = _mm512_cvtepu8_epi32(input_128);
+  else
+    input_512_extended = _mm512_cvtepi8_epi32(input_128);
+  // Convert from 16*int32 to 16*float32
+  return _mm512_cvtepi32_ps(input_512_extended);
+}
+
+template 
+typename std::enable_if::value || std::is_same::value, at::vec::Vectorized>::type
+inline convert_float_to_int8(at::vec::Vectorized src) {
+  // Convert from float32 to int32 with truncation
+  __m512i x_values_int32 = _mm512_cvttps_epi32(src);
+
+  // Convert from int32 to int16 using signed saturation
+  __m512i xy_packed_v = _mm512_packs_epi32(x_values_int32, x_values_int32);
+
+  constexpr auto min_val = std::numeric_limits::min();
+  constexpr auto max_val = std::numeric_limits::max();
+
+  // Convert from int16 to uint8/int8 using unsigned saturation
+  __m512i xyzw_clamped_v = pack_saturate_and_clamp(
+      xy_packed_v, xy_packed_v, min_val, max_val);
+  __m512i permute_mask_v =
+      _mm512_set_epi32(0x0f, 0x0b, 0x07, 0x03, 0x0e, 0x0a, 0x06, 0x02,
+                      0x0d, 0x09, 0x05, 0x01, 0x0c, 0x08, 0x04, 0x00);
+  return _mm512_permutexvar_epi32(permute_mask_v, xyzw_clamped_v);
+}
+
+template 
+inline void __attribute__((always_inline)) QuantizeAvx512(
+    const float* src,
+    T* dst,
+    int len,
+    float inverse_scale,
+    int64_t zero_point) {
+  constexpr int VLEN = 16;
+  constexpr auto min_val = std::numeric_limits::min();
+  constexpr auto max_val = std::numeric_limits::max();
+  const __m512i min_v = _mm512_set1_epi32(min_val);
+  const __m512i max_v = _mm512_set1_epi32(max_val);
+  // This is the largest int32 value < int32_max exactly representable in float
+  constexpr int32_t int32_float_max_val =
+      std::numeric_limits::max() - 127;
+  int i = 0;
+  __m512 inverse_scale_v = _mm512_set1_ps(inverse_scale);
+  // clang-format off
+  static const __m512i shuffle_mask_v = _mm512_set_epi8(
+      0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff,
+      0x0c, 0x08, 0x04, 0x00,
+      0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff,
+      0x0c, 0x08, 0x04, 0x00,
+      0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff,
+      0x0c, 0x08, 0x04, 0x00,
+      0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff,
+      0xff, 0xff, 0xff, 0xff,
+      0x0c, 0x08, 0x04, 0x00);
+  // clang-format on
+  __m512i permute_mask_v =
+      _mm512_set_epi32(0x0f, 0x0b, 0x07, 0x03, 0x0e, 0x0a, 0x06, 0x02,
+                       0x0d, 0x09, 0x05, 0x01, 0x0c, 0x08, 0x04, 0x00);
+  __m512i permute_mask_l8_v =
+      _mm512_set_epi32(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                       0x00, 0x00, 0x00, 0x00, 0x0c, 0x08, 0x04, 0x00);
+  int len_aligned = len / (VLEN * 4) * (VLEN * 4);
+  for (; i < len_aligned; i += 4 * VLEN) {
+    // x
+    __m512 x_vals = _mm512_load_ps(src + i);
+    __m512 x_transformed_v = _mm512_mul_ps(x_vals, inverse_scale_v);
+    // If the floating point value is greater than int32_max,
+    // _mm512_cvtps_epi32 converts it to a negative value. Clip at
+    // int32_float_max_val to avoid this.
+    x_transformed_v =
+        _mm512_min_ps(x_transformed_v, _mm512_set1_ps(int32_float_max_val));
+    // y
+    __m512 y_vals = _mm512_load_ps(src + i + VLEN);
+    __m512 y_transformed_v = _mm512_mul_ps(y_vals, inverse_scale_v);
+    y_transformed_v =
+        _mm512_min_ps(y_transformed_v, _mm512_set1_ps(int32_float_max_val));
+    // z
+    __m512 z_vals = _mm512_load_ps(src + i + 2 * VLEN);
+    __m512 z_transformed_v = _mm512_mul_ps(z_vals, inverse_scale_v);
+    z_transformed_v =
+        _mm512_min_ps(z_transformed_v, _mm512_set1_ps(int32_float_max_val));
+    // w
+    __m512 w_vals = _mm512_load_ps(src + i + 3 * VLEN);
+    __m512 w_transformed_v = _mm512_mul_ps(w_vals, inverse_scale_v);
+    w_transformed_v =
+        _mm512_min_ps(w_transformed_v, _mm512_set1_ps(int32_float_max_val));
+
+    __m512i x_rounded_v = _mm512_cvtps_epi32(x_transformed_v);
+    __m512i y_rounded_v = _mm512_cvtps_epi32(y_transformed_v);
+    __m512i z_rounded_v = _mm512_cvtps_epi32(z_transformed_v);
+    __m512i w_rounded_v = _mm512_cvtps_epi32(w_transformed_v);
+
+    // add zero point
+    x_rounded_v = _mm512_add_epi32(x_rounded_v, _mm512_set1_epi32(zero_point));
+    y_rounded_v = _mm512_add_epi32(y_rounded_v, _mm512_set1_epi32(zero_point));
+    z_rounded_v = _mm512_add_epi32(z_rounded_v, _mm512_set1_epi32(zero_point));
+    w_rounded_v = _mm512_add_epi32(w_rounded_v, _mm512_set1_epi32(zero_point));
+
+    __m512i xy_packed_v = _mm512_packs_epi32(x_rounded_v, y_rounded_v);
+    __m512i zw_packed_v = _mm512_packs_epi32(z_rounded_v, w_rounded_v);
+    __m512i xyzw_clamped_v =
+        pack_saturate_and_clamp(xy_packed_v, zw_packed_v, min_val, max_val);
+
+    xyzw_clamped_v =
+        _mm512_permutexvar_epi32(permute_mask_v, xyzw_clamped_v);
+    _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + i), xyzw_clamped_v);
+  }
+
+  // Additional single-vector (16-lane) pass to handle the remainder when len is
+  // not a multiple of 4 * VLEN, based on fbgemm::QuantizeAvx2
+  // (https://github.com/pytorch/FBGEMM)
+  for (; i < len / VLEN * VLEN; i += VLEN) {
+    __m512 x_vals = _mm512_load_ps(src + i);
+    __m512 x_transformed_v = _mm512_mul_ps(x_vals, inverse_scale_v);
+    x_transformed_v =
+        _mm512_min_ps(x_transformed_v, _mm512_set1_ps(int32_float_max_val));
+    __m512i x_rounded_v = _mm512_cvtps_epi32(x_transformed_v);
+    x_rounded_v = _mm512_add_epi32(x_rounded_v, _mm512_set1_epi32(zero_point));
+    __m512i x_clipped_v =
+        _mm512_max_epi32(min_v, _mm512_min_epi32(max_v, x_rounded_v));
+
+    x_clipped_v = _mm512_shuffle_epi8(x_clipped_v, shuffle_mask_v);
+    x_clipped_v = _mm512_permutexvar_epi32(permute_mask_l8_v, x_clipped_v);
+    _mm_storeu_si128(
+        reinterpret_cast<__m128i*>(dst + i),
+        _mm512_castsi512_si128(x_clipped_v));
+  }
+
+  for (; i < len; ++i) {
+    float transformed = src[i] * inverse_scale;
+
+    // Not exactly the same behavior as the vectorized code.
+    // The vectorized code above always rounds to even in halfway cases
+    // (https://software.intel.com/en-us/node/523819), but std::nearbyint
+    // does the same only when the current rounding mode is FE_TONEAREST.
+    // However, in practice, this should not be a problem because most cases
+    // use the default rounding mode FE_TONEAREST.
+    // Note that we cannot implement the same behavior as the vectorized code
+    // using std::round because it does rounding away from zero in halfway
+    // cases.
+    transformed = zero_point + std::nearbyint(transformed);
+    float clipped =
+        std::min(std::max(transformed, float(min_val)), float(max_val));
+    dst[i] = clipped;
+  }
+}
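+
+// Worked example for the scalar tail above (the vector path matches it up to
+// rounding mode): with inverse_scale = 10 (i.e. scale = 0.1) and zero_point = 3,
+// src[i] = 0.74 gives transformed = 3 + nearbyint(7.4) = 10, which is then
+// clamped to T's [min_val, max_val] range before being stored.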
+
+template<>
+struct Vectorized : public Vectorizedqi {
+    using size_type = int;
+    static constexpr size_type size() {
+        return 16;
+    }
+
+    static constexpr int float_num_vecs() {
+        return 1;
+    }
+
+    static constexpr int int_num_vecs() {
+        return 1;
+    }
+
+    using float_vec_return_type = std::array, 1>;
+    using int_vec_return_type = std::array, 1>;
+    using value_type = c10::qint32::underlying;
+
+ public:
+    using Vectorizedqi::Vectorizedqi;
+    Vectorized() {}
+
+    Vectorized(__m512i vals_) { vals = vals_;}
+
+    // Broadcast constructor
+    Vectorized(const c10::qint32& val) {
+        value_type uw = val.val_;
+        vals = _mm512_set1_epi32(uw);
+    }
+
+    void store(void* ptr, int count = size()) const {
+      if (count != size()) {
+        memcpy(ptr, &vals, count * sizeof(value_type));
+      } else {
+        _mm512_storeu_si512((__m512i*)ptr, vals);
+      }
+    }
+
+    static Vectorized loadu(const void* ptr) {
+        return Vectorized(ptr);
+    }
+
+    static Vectorized loadu(const void* ptr, int64_t count) {
+        __at_align__ value_type tmp_values[size()];
+        // Ensure uninitialized memory does not change the output value. See https://github.com/pytorch/pytorch/issues/32502
+        // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
+        // instructions while a loop would be compiled to one instruction.
+        for (const auto i : c10::irange(size())) {
+          tmp_values[i] = 0;
+        }
+        std::memcpy(tmp_values, reinterpret_cast(ptr), count * sizeof(value_type));
+        return loadu(tmp_values);
+    }
+
+    float_vec_return_type dequantize(
+        Vectorized scale,
+        Vectorized zero_point,
+        Vectorized scale_zp_premul) const {
+      __m512 float_vals = _mm512_cvtepi32_ps(vals);
+      return {vec::fmadd(scale, Vectorized(float_vals), scale_zp_premul)};
+    }
+
+    float_vec_return_type dequantize(
+        Vectorized scale,
+        Vectorized zero_point) const {
+      __m512 float_vals = _mm512_cvtepi32_ps(vals);
+      return {(Vectorized(float_vals) - zero_point) * scale};
+    }
+
+    static Vectorized quantize(
+        const float_vec_return_type& rhs,
+        float scale,
+        int32_t zero_point,
+        float inverse_scale) {
+      Vectorized retval;
+      auto rhs_data = (__m512)rhs[0];
+      at::native::quantize_vec(
+          scale, zero_point, (float*)&rhs_data, (c10::qint32*)&retval.vals, 16);
+      return retval;
+    }
+
+    Vectorized maximum(Vectorized b) const {
+      return _mm512_max_epi32(vals, b.vals);
+    }
+
+    Vectorized minimum(Vectorized b) const {
+      return _mm512_min_epi32(vals, b.vals);
+    }
+
+    Vectorized relu(Vectorized zero_point) const {
+        return maximum(zero_point);
+    }
+
+    Vectorized relu6(
+        Vectorized zero_point,
+        Vectorized q_six) {
+      return _mm512_min_epi32(
+          _mm512_max_epi32(vals, zero_point.vals), q_six.vals);
+    }
+
+    int_vec_return_type widening_subtract(Vectorized b) const {
+      return {_mm512_sub_epi32(vals, b)};
+    }
+
+    static Vectorized requantize_from_int(
+        const int_vec_return_type& inp,
+        float multiplier,
+        int32_t zero_point) {
+      __m512 multiplier_v = _mm512_set1_ps(multiplier);
+      __m512i zero_point_v = _mm512_set1_epi32(zero_point);
+
+      __m512 scaled = _mm512_mul_ps(_mm512_cvtepi32_ps(inp[0]), multiplier_v);
+      __m512i rounded = _mm512_cvtps_epi32(scaled);
+      return _mm512_add_epi32(rounded, zero_point_v);
+    }
+
+ private:
+    // Load from memory constructor
+    Vectorized(const void* ptr) {
+      vals = _mm512_loadu_si512((const __m512i*)ptr);
+    }
+};
+
+template <>
+Vectorized inline maximum(const Vectorized& a, const Vectorized& b) {
+  return a.maximum(b);
+}
+
+template <>
+Vectorized inline operator*(
+    const Vectorized& a,
+    const Vectorized& b) {
+  return _mm512_mullo_epi32(a, b);
+}
+
+template <>
+Vectorized inline operator+(
+    const Vectorized& a,
+    const Vectorized& b) {
+  return _mm512_add_epi32(a, b);
+}
+
+/*
+ * Convert values from int32 back to int8/uint8
+ */
+template <typename T>
+__m512i RequantizeAvx512(
+    const std::array, 4>& inp,
+    __m512 multiplier,
+    __m512i zp) {
+  static_assert(
+      std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
+      "Only int8_t/uint8_t are supported");
+  constexpr auto min_val = std::numeric_limits::min();
+  constexpr auto max_val = std::numeric_limits::max();
+  __m512i permute_mask_v =
+      _mm512_set_epi32(0x0f, 0x0b, 0x07, 0x03, 0x0e, 0x0a, 0x06, 0x02,
+                       0x0d, 0x09, 0x05, 0x01, 0x0c, 0x08, 0x04, 0x00);
+  __m512 x_scaled_v = _mm512_mul_ps(_mm512_cvtepi32_ps(inp[0]), multiplier);
+  __m512 y_scaled_v = _mm512_mul_ps(_mm512_cvtepi32_ps(inp[1]), multiplier);
+  __m512 z_scaled_v = _mm512_mul_ps(_mm512_cvtepi32_ps(inp[2]), multiplier);
+  __m512 w_scaled_v = _mm512_mul_ps(_mm512_cvtepi32_ps(inp[3]), multiplier);
+
+  __m512i x_rounded_v = _mm512_cvtps_epi32(x_scaled_v);
+  __m512i y_rounded_v = _mm512_cvtps_epi32(y_scaled_v);
+  __m512i z_rounded_v = _mm512_cvtps_epi32(z_scaled_v);
+  __m512i w_rounded_v = _mm512_cvtps_epi32(w_scaled_v);
+
+  /* Add zero point */
+  __m512i x_v = _mm512_add_epi32(x_rounded_v, zp);
+  __m512i y_v = _mm512_add_epi32(y_rounded_v, zp);
+  __m512i z_v = _mm512_add_epi32(z_rounded_v, zp);
+  __m512i w_v = _mm512_add_epi32(w_rounded_v, zp);
+
+  /* Pack to int16_t and saturate */
+  __m512i xy_packed_v = _mm512_packs_epi32(x_v, y_v);
+  __m512i zw_packed_v = _mm512_packs_epi32(z_v, w_v);
+
+  __m512i xyzw_clamped_v =
+      pack_saturate_and_clamp(xy_packed_v, zw_packed_v, min_val, max_val);
+
+  /*
+   * xyzw_clamped_v has results in the following layout so we need to
+   * permute: x0-3 y0-3 z0-3 w0-3 x4-7 y4-7 z4-7 w4-7 x8-11 y8-11 z8-11 w8-11 x12-15 y12-15 z12-15 w12-15
+   */
+  xyzw_clamped_v = _mm512_permutexvar_epi32(permute_mask_v, xyzw_clamped_v);
+  return xyzw_clamped_v;
+}
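+
+// Per element, RequantizeAvx512 computes
+//   out[i] = clamp<T>(zero_point + round(inp[i] * multiplier)),
+// i.e. the usual requantization of int32 accumulators back to int8_t/uint8_t.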
+
+template<>
+struct Vectorized : public Vectorizedqi {
+    static constexpr int size() {
+        return 64;
+    }
+
+    static constexpr int float_num_vecs() {
+        return 4;
+    }
+
+    static constexpr int int_num_vecs() {
+        return 4;
+    }
+
+    using float_vec_return_type = std::array, 4>;
+    using int_vec_return_type = std::array, 4>;
+    using value_type = typename c10::qint8::underlying;
+
+ public:
+    using Vectorizedqi::Vectorizedqi;
+
+    Vectorized() {}
+    Vectorized(__m512i vals_) { vals = vals_;}
+
+    // Broadcast constructor
+    Vectorized(const c10::qint8& val) {
+        value_type uw = val.val_;
+        vals = _mm512_set1_epi8(uw);
+    }
+
+    // This is needed because the compiler emits awful code for the default
+    // constructor for moving the enum
+    Vectorized(const Vectorized& other) : Vectorizedqi(other.vals) { }
+
+    // This is added to avoid error: definition of implicit copy assignment operator
+    // for 'Vectorized' is deprecated because it has a user-declared
+    // copy constructor [-Werror,-Wdeprecated-copy]
+    Vectorized& operator=(const Vectorized&) = default;
+
+    void store(void* ptr, int count = size()) const {
+        if (count != size()) {
+            memcpy(ptr, &vals, count * sizeof(value_type));
+        } else {
+            _mm512_storeu_si512((__m512i*)ptr, vals);
+        }
+    }
+
+    static Vectorized loadu(const void* ptr) {
+        return Vectorized(ptr);
+    }
+
+    static Vectorized loadu(const void* ptr, int64_t count) {
+        __at_align__ value_type tmp_values[size()];
+        // Ensure uninitialized memory does not change the output value. See https://github.com/pytorch/pytorch/issues/32502
+        // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
+        // instructions while a loop would be compiled to one instruction.
+        for (const auto i : c10::irange(size())) {
+          tmp_values[i] = 0;
+        }
+        std::memcpy(tmp_values, reinterpret_cast(ptr), count * sizeof(value_type));
+        return loadu(tmp_values);
+    }
+
+ private:
+    __m512i cvtepi8_epi32(__m128i epi8_vals) const {
+        return _mm512_cvtepi8_epi32(epi8_vals);
+    }
+
+ public:
+  float_vec_return_type dequantize(
+      Vectorized scale,
+      Vectorized zero_point,
+      Vectorized scale_neg_zp_premul) const {
+    __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
+    __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
+    __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
+    __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
+
+    __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0));
+    __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1));
+    __m512 float_val2 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val2));
+    __m512 float_val3 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val3));
+
+    auto val0 =
+        vec::fmadd(scale, Vectorized(float_val0), scale_neg_zp_premul);
+    auto val1 =
+        vec::fmadd(scale, Vectorized(float_val1), scale_neg_zp_premul);
+    auto val2 =
+        vec::fmadd(scale, Vectorized(float_val2), scale_neg_zp_premul);
+    auto val3 =
+        vec::fmadd(scale, Vectorized(float_val3), scale_neg_zp_premul);
+    return {val0, val1, val2, val3};
+  }
+
+  float_vec_return_type dequantize(
+      Vectorized scale,
+      Vectorized zero_point) const {
+    __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
+    __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
+    __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
+    __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
+
+    __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0));
+    __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1));
+    __m512 float_val2 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val2));
+    __m512 float_val3 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val3));
+
+    auto val0 = (Vectorized(float_val0) - zero_point) * scale;
+    auto val1 = (Vectorized(float_val1) - zero_point) * scale;
+    auto val2 = (Vectorized(float_val2) - zero_point) * scale;
+    auto val3 = (Vectorized(float_val3) - zero_point) * scale;
+    return {val0, val1, val2, val3};
+  }
+
+  static Vectorized quantize(
+      const float_vec_return_type& rhs,
+      float scale,
+      int32_t zero_point,
+      float inverse_scale) {
+    auto* rhs_data = (float*)rhs.data();
+    int8_t quantized_values[64];
+    QuantizeAvx512(
+        rhs_data, quantized_values, 64, inverse_scale, zero_point);
+    return Vectorized::loadu(quantized_values);
+  }
+
+  Vectorized maximum(Vectorized b) const {
+      return _mm512_max_epi8(vals, b.vals);
+    }
+
+  Vectorized minimum(Vectorized b) const {
+      return _mm512_min_epi8(vals, b.vals);
+    }
+
+    Vectorized relu(Vectorized zero_point) const {
+        return maximum(zero_point);
+    }
+
+    Vectorized relu6(
+        Vectorized zero_point,
+        Vectorized q_six) {
+      return _mm512_min_epi8(
+          _mm512_max_epi8(vals, zero_point.vals), q_six.vals);
+    }
+
+    int_vec_return_type widening_subtract(Vectorized b) const {
+      __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
+      __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
+      __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
+      __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
+
+      __m512i int32_val0 = cvtepi8_epi32(int_val0);
+      __m512i int32_val1 = cvtepi8_epi32(int_val1);
+      __m512i int32_val2 = cvtepi8_epi32(int_val2);
+      __m512i int32_val3 = cvtepi8_epi32(int_val3);
+
+      __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]);
+      __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]);
+      __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]);
+      __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]);
+
+      __m512i int32_b0 = cvtepi8_epi32(int_b0);
+      __m512i int32_b1 = cvtepi8_epi32(int_b1);
+      __m512i int32_b2 = cvtepi8_epi32(int_b2);
+      __m512i int32_b3 = cvtepi8_epi32(int_b3);
+
+      __m512i res_0 = _mm512_sub_epi32(int32_val0, int32_b0);
+      __m512i res_1 = _mm512_sub_epi32(int32_val1, int32_b1);
+      __m512i res_2 = _mm512_sub_epi32(int32_val2, int32_b2);
+      __m512i res_3 = _mm512_sub_epi32(int32_val3, int32_b3);
+
+      return {Vectorized(res_0),
+              Vectorized(res_1),
+              Vectorized(res_2),
+              Vectorized(res_3)};
+    }
+
+    static Vectorized requantize_from_int(
+        const int_vec_return_type& inp,
+        float multiplier,
+        int32_t zero_point) {
+      __m512 multiplier_v = _mm512_set1_ps(multiplier);
+      __m512i zero_point_v = _mm512_set1_epi32(zero_point);
+      return RequantizeAvx512(inp, multiplier_v, zero_point_v);
+    }
+
+ private:
+    // Load from memory constructor
+    Vectorized(const void* ptr) {
+        vals = _mm512_loadu_si512((const __m512i*)ptr);
+    }
+};
+
+template <>
+Vectorized inline maximum(const Vectorized& a, const Vectorized& b) {
+  return a.maximum(b);
+}
+
+template<>
+struct Vectorized : public Vectorizedqi {
+    static constexpr int size() {
+        return 64;
+    }
+
+    static constexpr int float_num_vecs() {
+        return 4;
+    }
+
+    static constexpr int int_num_vecs() {
+        return 4;
+    }
+
+    using float_vec_return_type = std::array, 4>;
+    using int_vec_return_type = std::array, 4>;
+    using value_type = typename c10::quint8::underlying;
+
+ public:
+    using Vectorizedqi::Vectorizedqi;
+    Vectorized() {}
+
+    Vectorized(__m512i vals_) { vals = vals_;}
+
+    // Broadcast constructor
+    Vectorized(const c10::quint8& val) {
+        value_type uw = val.val_;
+        vals = _mm512_set1_epi8(uw);
+    }
+
+    Vectorized(const Vectorized& other) : Vectorizedqi(other.vals) { }
+
+    // This is added to avoid error: definition of implicit copy assignment operator
+    // for 'Vectorized' is deprecated because it has a user-declared
+    // copy constructor [-Werror,-Wdeprecated-copy]
+    Vectorized& operator=(const Vectorized&) = default;
+
+    void store(void* ptr, int count = size()) const {
+        if (count != size()) {
+            memcpy(ptr, &vals, count * sizeof(value_type));
+        } else {
+            _mm512_storeu_si512((__m512i*)ptr, vals);
+        }
+    }
+
+    static Vectorized loadu(const void* ptr) {
+        return Vectorized(ptr);
+    }
+
+    static Vectorized loadu(const void* ptr, int64_t count) {
+        __at_align__ value_type tmp_values[size()];
+        // Ensure uninitialized memory does not change the output value. See https://github.com/pytorch/pytorch/issues/32502
+        // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
+        // instructions while a loop would be compiled to one instruction.
+        for (const auto i : c10::irange(size())) {
+          tmp_values[i] = 0;
+        }
+        std::memcpy(tmp_values, reinterpret_cast(ptr), count * sizeof(value_type));
+        return loadu(tmp_values);
+    }
+
+ private:
+    __m512i cvtepu8_epi32(__m128i epu8_vals) const {
+        return _mm512_cvtepu8_epi32(epu8_vals);
+    }
+
+ public:
+  float_vec_return_type dequantize(
+      Vectorized scale,
+      Vectorized zero_point,
+      Vectorized scale_zp_premul) const {
+    __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
+    __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
+    __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
+    __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
+
+    __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0));
+    __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1));
+    __m512 float_val2 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val2));
+    __m512 float_val3 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val3));
+
+    auto val0 =
+        vec::fmadd(scale, Vectorized(float_val0), scale_zp_premul);
+    auto val1 =
+        vec::fmadd(scale, Vectorized(float_val1), scale_zp_premul);
+    auto val2 =
+        vec::fmadd(scale, Vectorized(float_val2), scale_zp_premul);
+    auto val3 =
+        vec::fmadd(scale, Vectorized(float_val3), scale_zp_premul);
+
+    return {val0, val1, val2, val3};
+  }
+
+  float_vec_return_type dequantize(
+      Vectorized scale,
+      Vectorized zero_point) const {
+    __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
+    __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
+    __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
+    __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
+
+    __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0));
+    __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1));
+    __m512 float_val2 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val2));
+    __m512 float_val3 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val3));
+
+    auto val0 = (Vectorized(float_val0) - zero_point) * scale;
+    auto val1 = (Vectorized(float_val1) - zero_point) * scale;
+    auto val2 = (Vectorized(float_val2) - zero_point) * scale;
+    auto val3 = (Vectorized(float_val3) - zero_point) * scale;
+
+    return {val0, val1, val2, val3};
+  }
+
+  static Vectorized quantize(
+      const float_vec_return_type& rhs,
+      float scale,
+      int32_t zero_point,
+      float inverse_scale) {
+    auto* rhs_data = (float*)rhs.data();
+    uint8_t quantized_values[64];
+    QuantizeAvx512(
+        rhs_data, quantized_values, 64, inverse_scale, zero_point);
+    return Vectorized::loadu(quantized_values);
+  }
+
+  Vectorized maximum(Vectorized b) const {
+      return _mm512_max_epu8(vals, b.vals);
+    }
+
+  Vectorized minimum(Vectorized b) const {
+      return _mm512_min_epu8(vals, b.vals);
+    }
+
+    Vectorized relu(Vectorized zero_point) const {
+        return maximum(zero_point);
+    }
+
+    Vectorized relu6(
+        Vectorized zero_point,
+        Vectorized q_six) {
+      return _mm512_min_epu8(
+          _mm512_max_epu8(vals, zero_point.vals), q_six.vals);
+    }
+
+    int_vec_return_type widening_subtract(Vectorized b) const {
+      __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]);
+      __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]);
+      __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]);
+      __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]);
+
+      __m512i int32_val0 = cvtepu8_epi32(int_val0);
+      __m512i int32_val1 = cvtepu8_epi32(int_val1);
+      __m512i int32_val2 = cvtepu8_epi32(int_val2);
+      __m512i int32_val3 = cvtepu8_epi32(int_val3);
+
+      __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]);
+      __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]);
+      __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]);
+      __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]);
+
+      __m512i int32_b0 = cvtepu8_epi32(int_b0);
+      __m512i int32_b1 = cvtepu8_epi32(int_b1);
+      __m512i int32_b2 = cvtepu8_epi32(int_b2);
+      __m512i int32_b3 = cvtepu8_epi32(int_b3);
+
+      __m512i res_0 = _mm512_sub_epi32(int32_val0, int32_b0);
+      __m512i res_1 = _mm512_sub_epi32(int32_val1, int32_b1);
+      __m512i res_2 = _mm512_sub_epi32(int32_val2, int32_b2);
+      __m512i res_3 = _mm512_sub_epi32(int32_val3, int32_b3);
+      return {Vectorized<c10::qint32>(res_0),
+              Vectorized<c10::qint32>(res_1),
+              Vectorized<c10::qint32>(res_2),
+              Vectorized<c10::qint32>(res_3)};
+    }
+
+    static Vectorized<c10::quint8> requantize_from_int(
+        const int_vec_return_type& inp,
+        float multiplier,
+        int32_t zero_point) {
+      __m512 multiplier_v = _mm512_set1_ps(multiplier);
+      __m512i zero_point_v = _mm512_set1_epi32(zero_point);
+      return RequantizeAvx512<c10::quint8>(inp, multiplier_v, zero_point_v);
+    }
+
+ private:
+
+    // Load from memory constructor
+    Vectorized(const void* ptr) {
+        vals = _mm512_loadu_si512((const __m512i*)ptr);
+    }
+};
+
+template <>
+Vectorized<c10::quint8> inline maximum(const Vectorized<c10::quint8>& a, const Vectorized<c10::quint8>& b) {
+  return a.maximum(b);
+}
+
+#else
+
+// NOTE: These are low-performance implementations that we fall back on.
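+// As a rough illustration (not part of the upstream header), the per-element
+// affine (de)quantization math these fallbacks rely on is:
+//
+//   float dequantize_one(float scale, int32_t zero_point, int32_t q) {
+//     return scale * static_cast<float>(q - zero_point);
+//   }
+//   int32_t quantize_one(float inv_scale, int32_t zero_point, float x,
+//                        int32_t qmin, int32_t qmax) {
+//     int32_t q = static_cast<int32_t>(std::nearbyint(x * inv_scale)) + zero_point;
+//     return std::min(std::max(q, qmin), qmax);
+//   }
+//
+// at::native::dequantize_val and at::native::quantize_vec, used below, apply
+// this logic with the appropriate rounding and saturation for each quantized type.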
+
+template <
+    typename T,
+    typename float_vec_return_type_,
+    typename int_vec_return_type_,
+    int size_>
+struct VectorizedQuantizedConverter {
+  static constexpr int size() {
+    return size_;
+  }
+
+  static constexpr int float_num_vecs() {
+    return size() / 8;
+  }
+
+  static constexpr int int_num_vecs() {
+    return size() / 8;
+  }
+
+  using float_vec_return_type = float_vec_return_type_;
+  using int_vec_return_type = int_vec_return_type_;
+
+  using value_type = typename T::underlying;
+  std::array<value_type, size_> vals;
+
+  VectorizedQuantizedConverter(T val) {
+    for (const auto i : c10::irange(size())) {
+      vals[i] = val.val_;
+    }
+  }
+
+  VectorizedQuantizedConverter(const void* ptr) {
+    memcpy(vals.data(), ptr, sizeof(value_type) * size());
+  }
+
+  void store(void* ptr, int count = size()) const {
+    memcpy(ptr, vals.data(), count * sizeof(value_type));
+  }
+
+  float_vec_return_type dequantize(
+      Vectorized<float> scale,
+      Vectorized<float> zero_point,
+      Vectorized<float> scale_zp_premul) const {
+    float_vec_return_type rv;
+    for (const auto i : c10::irange(float_num_vecs())) {
+      float tmp_vals[16];
+      for (const auto j : c10::irange(16)) {
+        tmp_vals[j] = at::native::dequantize_val(
+            scale[j], zero_point[j], T(vals[16 * i + j]));
+      }
+      rv[i] = Vectorized<float>(tmp_vals[0],
+          tmp_vals[1],
+          tmp_vals[2],
+          tmp_vals[3],
+          tmp_vals[4],
+          tmp_vals[5],
+          tmp_vals[6],
+          tmp_vals[7],
+          tmp_vals[8],
+          tmp_vals[9],
+          tmp_vals[10],
+          tmp_vals[11],
+          tmp_vals[12],
+          tmp_vals[13],
+          tmp_vals[14],
+          tmp_vals[15]);
+    }
+    return rv;
+  }
+
+  float_vec_return_type dequantize(
+      Vectorized<float> scale,
+      Vectorized<float> zero_point) const {
+    Vectorized<float> scale_zp_premul;
+    return dequantize(scale, zero_point, scale_zp_premul);
+  }
+
+ protected:
+  VectorizedQuantizedConverter() {}
+};
+
+template <>
+struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
+                                 c10::qint32,
+                                 std::array<Vectorized<float>, 1>,
+                                 std::array<Vectorized<c10::qint32>, 1>,
+                                 16> {
+  Vectorized()
+      : VectorizedQuantizedConverter<
+            c10::qint32,
+            std::array<Vectorized<float>, 1>,
+            std::array<Vectorized<c10::qint32>, 1>,
+            16>() {}
+  Vectorized(c10::qint32 val)
+      : VectorizedQuantizedConverter<
+            c10::qint32,
+            std::array<Vectorized<float>, 1>,
+            std::array<Vectorized<c10::qint32>, 1>,
+            16>(val) {}
+  Vectorized(const void* ptr)
+      : VectorizedQuantizedConverter<
+            c10::qint32,
+            std::array<Vectorized<float>, 1>,
+            std::array<Vectorized<c10::qint32>, 1>,
+            16>(ptr) {}
+
+  static Vectorized loadu(const void* ptr) {
+    return Vectorized(ptr);
+  }
+
+  static Vectorized loadu(const void* ptr, int64_t count) {
+    __at_align__ value_type tmp_values[size()];
+    // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
+    // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
+    // instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(size())) {
+      tmp_values[i] = 0;
+    }
+    std::memcpy(tmp_values, reinterpret_cast<const value_type*>(ptr), count * sizeof(value_type));
+    return loadu(tmp_values);
+  }
+
+  static Vectorized quantize(
+      const float_vec_return_type& rhs,
+      float scale,
+      int32_t zero_point,
+      float inverse_scale) {
+    std::array<value_type, size()> qvals;
+    std::array<float, float_num_vecs() * 16> float_vals;
+
+    for (const auto i : c10::irange(float_num_vecs())) {
+      rhs[i].store(&float_vals[i * 16], 16);
+    }
+
+    at::native::quantize_vec<c10::qint32, /*precision=*/32>(
+        scale,
+        zero_point,
+        float_vals.data(),
+        (c10::qint32*)qvals.data(),
+        16 * float_num_vecs());
+
+    return Vectorized::loadu(qvals.data());
+  }
+
+  Vectorized maximum(Vectorized b) const {
+    Vectorized retval;
+    for (const auto i : c10::irange(size())) {
+      retval.vals[i] = std::max(vals[i], b.vals[i]);
+    }
+    return retval;
+  }
+
+  Vectorized minimum(Vectorized b) const {
+    Vectorized retval;
+    for (const auto i : c10::irange(size())) {
+      retval.vals[i] = std::min(vals[i], b.vals[i]);
+    }
+    return retval;
+  }
+
+  Vectorized relu(Vectorized zero_point) const  {
+    return maximum(zero_point);
+  }
+
+
+  Vectorized relu6(
+      Vectorized zero_point,
+      Vectorized q_six) {
+    Vectorized retval;
+    for (const auto i : c10::irange(size())) {
+      retval.vals[i] = std::min(
+          std::max(vals[i], zero_point.vals[i]), q_six.vals[i]);
+    }
+    return retval;
+  }
+
+  int_vec_return_type widening_subtract(Vectorized b) const {
+    int_vec_return_type retval;
+    for (const auto i : c10::irange(size())) {
+      retval[0].vals[i] = vals[i] - b.vals[i];
+    }
+    return retval;
+  }
+
+  static Vectorized requantize_from_int(
+      const int_vec_return_type& inp,
+      float multiplier,
+      int32_t zero_point) {
+    Vectorized retval;
+    for (const auto i : c10::irange(size())) {
+      retval.vals[i] =
+          std::nearbyint(static_cast<float>(inp[0].vals[i]) * multiplier) +
+          zero_point;
+    }
+    return retval;
+  }
+};
+
+template <>
+Vectorized<c10::qint32> inline maximum(const Vectorized<c10::qint32>& a, const Vectorized<c10::qint32>& b) {
+  return a.maximum(b);
+}
+
+template <>
+Vectorized<c10::qint32> inline operator*(
+    const Vectorized<c10::qint32>& a,
+    const Vectorized<c10::qint32>& b) {
+  Vectorized<c10::qint32> retval;
+  for (const auto i : c10::irange(std::decay_t<decltype(a)>::size())) {
+    retval.vals[i] = a.vals[i] * b.vals[i];
+  }
+  return retval;
+}
+
+template <>
+Vectorized<c10::qint32> inline operator+(
+    const Vectorized<c10::qint32>& a,
+    const Vectorized<c10::qint32>& b) {
+  Vectorized<c10::qint32> retval;
+  for (const auto i : c10::irange(std::decay_t<decltype(a)>::size())) {
+    retval.vals[i] = a.vals[i] + b.vals[i];
+  }
+  return retval;
+}
+
+template <>
+struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
+                                c10::qint8,
+                                std::array<Vectorized<float>, 4>,
+                                std::array<Vectorized<c10::qint32>, 4>,
+                                64> {
+  Vectorized()
+      : VectorizedQuantizedConverter<
+            c10::qint8,
+            std::array<Vectorized<float>, 4>,
+            std::array<Vectorized<c10::qint32>, 4>,
+            64>() {}
+  Vectorized(c10::qint8 val)
+      : VectorizedQuantizedConverter<
+            c10::qint8,
+            std::array<Vectorized<float>, 4>,
+            std::array<Vectorized<c10::qint32>, 4>,
+            64>(val) {}
+  Vectorized(const void* ptr)
+      : VectorizedQuantizedConverter<
+            c10::qint8,
+            std::array<Vectorized<float>, 4>,
+            std::array<Vectorized<c10::qint32>, 4>,
+            64>(ptr) {}
+
+  static Vectorized loadu(const void* ptr) {
+    return Vectorized(ptr);
+  }
+
+  static Vectorized loadu(const void* ptr, int64_t count) {
+    __at_align__ value_type tmp_values[size()];
+    // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
+    // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
+    // instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(size())) {
+      tmp_values[i] = 0;
+    }
+    std::memcpy(tmp_values, reinterpret_cast<const value_type*>(ptr), count * sizeof(value_type));
+    return loadu(tmp_values);
+  }
+
+  static Vectorized quantize(
+      const float_vec_return_type& rhs,
+      float scale,
+      int32_t zero_point,
+      float inverse_scale) {
+    std::array<value_type, size()> qvals;
+    std::array<float, float_num_vecs() * 16> float_vals;
+
+    for (const auto i : c10::irange(float_num_vecs())) {
+      rhs[i].store(&float_vals[i * 16], 16);
+    }
+
+    at::native::quantize_vec<c10::qint8>(
+        scale,
+        zero_point,
+        float_vals.data(),
+        (c10::qint8*)qvals.data(),
+        16 * float_num_vecs());
+
+    return Vectorized::loadu(qvals.data());
+  }
+
+  Vectorized maximum(Vectorized b) const {
+    Vectorized retval;
+    for (const auto i : c10::irange(size())) {
+      retval.vals[i] = std::max(vals[i], b.vals[i]);
+    }
+    return retval;
+  }
+
+  Vectorized minimum(Vectorized b) const {
+    Vectorized retval;
+    for (const auto i : c10::irange(size())) {
+      retval.vals[i] = std::min(vals[i], b.vals[i]);
+    }
+    return retval;
+  }
+
+  Vectorized relu(Vectorized zero_point) const {
+    return maximum(zero_point);
+  }
+
+  Vectorized relu6(
+      Vectorized zero_point,
+      Vectorized q_six) {
+    Vectorized retval;
+    for (const auto i : c10::irange(size())) {
+      retval.vals[i] = std::min(
+          std::max(vals[i], zero_point.vals[i]), q_six.vals[i]);
+    }
+    return retval;
+  }
+
+  int_vec_return_type widening_subtract(Vectorized b) const {
+    int_vec_return_type retval;
+    constexpr int elem_per_int_vec = size() / int_num_vecs();
+    for (const auto i : c10::irange(int_num_vecs())) {
+      for (const auto j : c10::irange(elem_per_int_vec)) {
+        retval[i].vals[j] =
+            static_cast<int32_t>(vals[i * elem_per_int_vec + j]) -
+            static_cast<int32_t>(b.vals[i * elem_per_int_vec + j]);
+      }
+    }
+    return retval;
+  }
+  static Vectorized requantize_from_int(
+      const int_vec_return_type& inp,
+      float multiplier,
+      int32_t zero_point) {
+    constexpr int elem_per_int_vec = size() / int_num_vecs();
+    constexpr auto min_val = std::numeric_limits<value_type>::min();
+    constexpr auto max_val = std::numeric_limits<value_type>::max();
+    Vectorized retval;
+    for (const auto i : c10::irange(int_num_vecs())) {
+      for (const auto j : c10::irange(elem_per_int_vec)) {
+        int32_t rounded =
+            std::nearbyint(static_cast<float>(inp[i].vals[j]) * multiplier) +
+            zero_point;
+        retval.vals[i * elem_per_int_vec + j] =
+            std::min(std::max(rounded, min_val), max_val);
+      }
+    }
+    return retval;
+  }
+};
+
+template <>
+Vectorized<c10::qint8> inline maximum(const Vectorized<c10::qint8>& a, const Vectorized<c10::qint8>& b) {
+  return a.maximum(b);
+}
+
+template <>
+struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
+                                 c10::quint8,
+                                 std::array<Vectorized<float>, 4>,
+                                 std::array<Vectorized<c10::qint32>, 4>,
+                                 64> {
+  Vectorized()
+      : VectorizedQuantizedConverter<
+            c10::quint8,
+            std::array<Vectorized<float>, 4>,
+            std::array<Vectorized<c10::qint32>, 4>,
+            64>() {}
+  Vectorized(c10::quint8 val)
+      : VectorizedQuantizedConverter<
+            c10::quint8,
+            std::array<Vectorized<float>, 4>,
+            std::array<Vectorized<c10::qint32>, 4>,
+            64>(val) {}
+  Vectorized(const void* ptr)
+      : VectorizedQuantizedConverter<
+            c10::quint8,
+            std::array<Vectorized<float>, 4>,
+            std::array<Vectorized<c10::qint32>, 4>,
+            64>(ptr) {}
+
+  static Vectorized loadu(const void* ptr) {
+    return Vectorized(ptr);
+  }
+
+  static Vectorized loadu(const void* ptr, int64_t count) {
+    __at_align__ value_type tmp_values[size()];
+    // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
+    // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
+    // instructions while a loop would be compiled to one instruction.
+    for (const auto i : c10::irange(size())) {
+      tmp_values[i] = 0;
+    }
+    std::memcpy(tmp_values, reinterpret_cast<const value_type*>(ptr), count * sizeof(value_type));
+    return loadu(tmp_values);
+  }
+
+  static Vectorized quantize(
+      const float_vec_return_type& rhs,
+      float scale,
+      int32_t zero_point,
+      float inverse_scale) {
+    std::array<value_type, size()> qvals;
+    std::array<float, float_num_vecs() * 16> float_vals;
+
+    for (const auto i : c10::irange(float_num_vecs())) {
+      rhs[i].store(&float_vals[i * 16], 16);
+    }
+
+    at::native::quantize_vec<c10::quint8>(
+        scale,
+        zero_point,
+        float_vals.data(),
+        (c10::quint8*)qvals.data(),
+        16 * float_num_vecs());
+
+    return Vectorized::loadu(qvals.data());
+  }
+
+  Vectorized maximum(Vectorized b) const {
+    Vectorized retval;
+    for (const auto i : c10::irange(size())) {
+      retval.vals[i] = std::max(vals[i], b.vals[i]);
+    }
+    return retval;
+  }
+
+  Vectorized minimum(Vectorized b) const {
+    Vectorized retval;
+    for (const auto i : c10::irange(size())) {
+      retval.vals[i] = std::min(vals[i], b.vals[i]);
+    }
+    return retval;
+  }
+
+  Vectorized relu(Vectorized zero_point) const {
+    return maximum(zero_point);
+  }
+
+
+  Vectorized relu6(
+      Vectorized zero_point,
+      Vectorized q_six) {
+    Vectorized retval;
+    for (const auto i : c10::irange(size())) {
+      retval.vals[i] = std::min(
+          std::max(vals[i], zero_point.vals[i]), q_six.vals[i]);
+    }
+    return retval;
+  }
+
+  int_vec_return_type widening_subtract(Vectorized b) const {
+    int_vec_return_type retval;
+    constexpr int elem_per_int_vec = size() / int_num_vecs();
+    for (const auto i : c10::irange(int_num_vecs())) {
+      for (const auto j : c10::irange(elem_per_int_vec)) {
+        retval[i].vals[j] =
+            static_cast<int32_t>(vals[i * elem_per_int_vec + j]) -
+            static_cast<int32_t>(b.vals[i * elem_per_int_vec + j]);
+      }
+    }
+    return retval;
+  }
+  static Vectorized requantize_from_int(
+      const int_vec_return_type& inp,
+      float multiplier,
+      int32_t zero_point) {
+    constexpr int elem_per_int_vec = size() / int_num_vecs();
+    constexpr auto min_val = std::numeric_limits<value_type>::min();
+    constexpr auto max_val = std::numeric_limits<value_type>::max();
+    Vectorized retval;
+    for (const auto i : c10::irange(int_num_vecs())) {
+      for (const auto j : c10::irange(elem_per_int_vec)) {
+        int32_t rounded =
+            std::nearbyint(static_cast<float>(inp[i].vals[j]) * multiplier) +
+            zero_point;
+        retval.vals[i * elem_per_int_vec + j] =
+            std::min(std::max(rounded, min_val), max_val);
+      }
+    }
+    return retval;
+  }
+};
+
+template <>
+Vectorized<c10::quint8> inline maximum(const Vectorized<c10::quint8>& a, const Vectorized<c10::quint8>& b) {
+  return a.maximum(b);
+}
+
+#endif // defined(CPU_CAPABILITY_AVX512) && !defined(MSVC)
+
+}}}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec_base.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec_base.h
new file mode 100644
index 0000000000000000000000000000000000000000..85dd7207272310c7be95d5a44772c418e7b20558
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec_base.h
@@ -0,0 +1,1108 @@
+#pragma once
+
+// DO NOT DEFINE STATIC DATA IN THIS HEADER!
+// See Note [Do not compile initializers with AVX]
+//
+// Note [Do not compile initializers with AVX]
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// If you define a static initializer in this file, the initialization will use
+// AVX instructions because these object files are compiled with AVX enabled.
+// We need to avoid non-trivial global data in these architecture specific files
+// because there's no way to guard the global initializers with CPU capability
+// detection.
+//
+// See https://github.com/pytorch/pytorch/issues/37577 for an instance
+// of this bug in the past.
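+//
+// Illustrative only (not part of the upstream header): something like
+//
+//   static const Vectorized<float> kOnes(1.f);   // DON'T -- global initializer
+//
+// would be compiled with AVX/AVX512 instructions in this file and executed at
+// program load, before any runtime CPU-capability check can run, so it could
+// fault on machines that lack the instruction set.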
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <climits>
+#include <cmath>
+#include <cstring>
+#include <functional>
+#include <type_traits>
+
+#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/native/Math.h>
+#include <ATen/NumericUtils.h>
+#include <c10/util/Half.h>
+#include <c10/util/BFloat16.h>
+#include <c10/util/BFloat16-math.h>
+#include <c10/util/copysign.h>
+#include <ATen/native/cpu/zmath.h>
+#include <c10/util/TypeCast.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/irange.h>
+#include <c10/util/Load.h>
+
+// These macros helped us unify vec_base.h
+#ifdef CPU_CAPABILITY_AVX512
+#if defined(__GNUC__)
+#define __at_align__ __attribute__((aligned(64)))
+#elif defined(_WIN32)
+#define __at_align__ __declspec(align(64))
+#else
+#define __at_align__
+#endif
+#define VECTOR_WIDTH 64
+#define int_vector __m512i
+#else // CPU_CAPABILITY_AVX512
+#if defined(__GNUC__)
+#define __at_align__ __attribute__((aligned(32)))
+#elif defined(_WIN32)
+#define __at_align__ __declspec(align(32))
+#else
+#define __at_align__
+#endif
+#define VECTOR_WIDTH 32
+#define int_vector __m256i
+#endif // CPU_CAPABILITY_AVX512
+
+namespace at::vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+// at::Half and at::BFloat16 should be treated as floating point
+template <typename T>
+struct is_floating_point:
+    std::integral_constant<bool, std::is_floating_point<T>::value ||
+      std::is_same<T, at::Half>::value ||
+      std::is_same<T, at::BFloat16>::value> {
+};
+
+template<typename T>
+constexpr bool is_floating_point_v = is_floating_point<T>::value;
+
+template <typename T>
+struct is_reduced_floating_point:
+    std::integral_constant<bool, std::is_same<T, at::Half>::value ||
+      std::is_same<T, at::BFloat16>::value> {
+};
+
+template <typename T>
+constexpr bool is_reduced_floating_point_v = is_reduced_floating_point<T>::value;
+
+template <size_t n> struct int_of_size;
+
+#define DEFINE_INT_OF_SIZE(int_t) \
+template<> struct int_of_size<sizeof(int_t)> { using type = int_t; }
+
+DEFINE_INT_OF_SIZE(int64_t);
+DEFINE_INT_OF_SIZE(int32_t);
+DEFINE_INT_OF_SIZE(int16_t);
+DEFINE_INT_OF_SIZE(int8_t);
+
+#undef DEFINE_INT_OF_SIZE
+
+template <typename T>
+using int_same_size_t = typename int_of_size<sizeof(T)>::type;
+
+// NOTE: If you specialize on a type, you must define all operations!
+
+// emulates Vectorized types
+#if defined(__s390x__)
+template <class T, class TEMP=void>
+#else
+template <class T>
+#endif
+struct Vectorized {
+private:
+  __at_align__ T values[VECTOR_WIDTH / sizeof(T)];
+public:
+  using value_type = T;
+  using size_type = int;
+  // Note [constexpr static function to avoid odr-usage compiler bug]
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // Why, you might ask, is size defined to be a static constexpr function,
+  // rather than a more ordinary 'static constexpr int size;' variable?
+  // The problem lies within ODR rules for static constexpr members versus
+  // static constexpr functions.  First, recall that this class (along with all
+  // of its derivations) live in an anonymous namespace: they are intended to be
+  // *completely* inlined at their use-sites, because we need to compile it
+  // multiple times for different instruction sets.
+  //
+  // Because of this constraint, we CANNOT provide a single definition for
+  // any static members in this class; since we want to compile the class
+  // multiple times, there wouldn't actually be any good place to put the
+  // definition.  Now here is the problem: if we ODR-use a static constexpr
+  // member, we are *obligated* to provide a definition.  Without the
+  // definition, you get a compile error like:
+  //
+  //    relocation R_X86_64_PC32 against undefined symbol
+  //    `_ZN2at6vec25612_GLOBAL__N_16VectorizedIdE4sizeE' can not be used when making
+  //    a shared object; recompile with -fPIC
+  //
+  // If this were C++17, we could replace a static constexpr variable with
+  // an inline variable which doesn't require one definition. But we are not
+  // C++17.  So the next best thing is to replace the member with a static
+  // constexpr (and therefore inline) function, which does not require ODR
+  // either.
+  //
+  // Also, technically according to the C++ standard, we don't have to define
+  // a constexpr variable if we never odr-use it.  But it seems that some
+  // versions GCC/Clang have buggy determinations on whether or not an
+  // identifier is odr-used or not, and in any case it's hard to tell if
+  // a variable is odr-used or not.  So best to just cut the problem at the root.
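+  //
+  // Rough sketch (illustrative only, not from the upstream header) of the two
+  // alternatives discussed above:
+  //
+  //   struct VecA { static constexpr int size = 8; };
+  //     // odr-using VecA::size (e.g. binding it to a const reference) requires
+  //     // an out-of-line definition, which cannot be provided here.
+  //   struct VecB { static constexpr int size() { return 8; } };
+  //     // a constexpr member function is implicitly inline; calling it never
+  //     // odr-uses a variable, so no separate definition is needed.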
+  static constexpr size_type size() {
+    return VECTOR_WIDTH / sizeof(T);
+  }
+  Vectorized() : values{static_cast<T>(0)} {}
+  Vectorized(T val) {
+    for (int i = 0; i != size(); i++) {
+      values[i] = val;
+    }
+  }
+  template<typename... Args,
+           typename = std::enable_if_t<(sizeof...(Args) == size())>>
+  Vectorized(Args... vals) : values{vals...}{
+  }
+  // This also implies const T& operator[](int idx) const
+  inline operator const T*() const {
+    return values;
+  }
+  // This also implies T& operator[](int idx)
+  inline operator T*() {
+    return values;
+  }
+  // Return the values as char* for type punning
+  auto as_bytes() const -> const char* {
+    return reinterpret_cast<const char*>(values);
+  }
+  template <int64_t mask_>
+  static Vectorized<T> blend(const Vectorized<T>& a, const Vectorized<T>& b) {
+    int64_t mask = mask_;
+    Vectorized vector;
+    for (const auto i : c10::irange(size())) {
+      if (mask & 0x01) {
+        vector[i] = b[i];
+      } else {
+        vector[i] = a[i];
+      }
+      mask = mask >> 1;
+    }
+    return vector;
+  }
+  static Vectorized<T> blendv(const Vectorized<T>& a, const Vectorized<T>& b,
+                          const Vectorized<T>& mask) {
+    Vectorized vector;
+    int_same_size_t<T> buffer[size()];
+    mask.store(buffer);
+    for (const auto i : c10::irange(size())) {
+      if (buffer[i] & 0x01)
+       {
+        vector[i] = b[i];
+      } else {
+        vector[i] = a[i];
+      }
+    }
+    return vector;
+  }
+  template <typename step_t>  // step sometimes requires a higher precision type (e.g., T=int, step_t=double)
+  static Vectorized<T> arange(T base = static_cast<T>(0), step_t step = static_cast<step_t>(1)) {
+    Vectorized vector;
+    for (const auto i : c10::irange(size())) {
+      vector.values[i] = base + i * step;
+    }
+    return vector;
+  }
+  static Vectorized set(const Vectorized& a, const Vectorized& b, int64_t count = size()) {
+    Vectorized vector;
+    for (const auto i : c10::irange(size())) {
+      if (i < count) {
+        vector[i] = b[i];
+      } else {
+        vector[i] = a[i];
+      }
+    }
+    return vector;
+  }
+  static Vectorized loadu(const void* ptr) {
+    Vectorized vector;
+    std::memcpy(vector.values, ptr, VECTOR_WIDTH);
+    return vector;
+  }
+  static Vectorized loadu(const void* ptr, int64_t count) {
+    Vectorized vector;
+    std::memcpy(vector.values, ptr, count * sizeof(T));
+    return vector;
+  }
+  void store(void* ptr, int count = size()) const {
+    std::memcpy(ptr, values, count * sizeof(T));
+  }
+  int zero_mask() const {
+    // returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit
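+    // Illustration (not from the upstream header): for values {0, 7, 0, 3, ...}
+    // the low bits of the returned mask are 0b...0101, i.e. bit i is set iff
+    // values[i] == 0.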
+    int mask = 0;
+    for (int i = 0; i < size(); ++ i) {
+      if (values[i] == static_cast(0)) {
+        mask |= (1 << i);
+      }
+    }
+    return mask;
+  }
+  Vectorized isnan() const {
+    Vectorized vector;
+    for (int64_t i = 0; i != size(); i++) {
+      if (_isnan(values[i])) {
+        std::memset(static_cast<void*>(vector.values + i), 0xFF, sizeof(T));
+      } else {
+        std::memset(static_cast<void*>(vector.values + i), 0, sizeof(T));
+      }
+    }
+    return vector;
+  }
+  bool has_inf_nan() const {
+    for (int64_t i = 0; i != size(); i++) {
+      if(_isnan(values[i]) || _isinf(values[i])) {
+        return true;
+      }
+    }
+    return false;
+  }
+  Vectorized map(T (*const f)(T)) const {
+    Vectorized ret;
+    for (int64_t i = 0; i != size(); i++) {
+      ret[i] = f(values[i]);
+    }
+    return ret;
+  }
+  Vectorized map(T (*const f)(const T &)) const {
+    Vectorized ret;
+    for (int64_t i = 0; i != size(); i++) {
+      ret[i] = f(values[i]);
+    }
+    return ret;
+  }
+  template  && !c10::is_complex::value, int>::type = 0>
+  Vectorized abs() const {
+    // other_t_abs is for SFINAE and clarity. Make sure it is not changed.
+    static_assert(std::is_same::value, "other_t_abs must be T");
+    return map([](T x) -> T { return x < static_cast(0) ? -x : x; });
+  }
+  template , int>::type = 0>
+  Vectorized abs() const {
+    // float_t_abs is for SFINAE and clarity. Make sure it is not changed.
+    static_assert(std::is_same::value, "float_t_abs must be T");
+    // Specifically deal with floating-point because the generic code above won't handle -0.0 (which should result in
+    // 0.0) properly.
+    return map([](T x) -> T { return std::abs(x); });
+  }
+  template ::value, int>::type = 0>
+  Vectorized abs() const {
+    // complex_t_abs is for SFINAE and clarity. Make sure it is not changed.
+    static_assert(std::is_same::value, "complex_t_abs must be T");
+    // Specifically map() does not perform the type conversion needed by abs.
+    return map([](T x) { return static_cast(std::abs(x)); });
+  }
+
+  template ::value, int>::type = 0>
+  Vectorized sgn() const {
+    return map(at::native::sgn_impl);
+  }
+
+  template ::value, int>::type = 0>
+  Vectorized angle() const {
+    // other_t_angle is for SFINAE and clarity. Make sure it is not changed.
+    static_assert(std::is_same::value, "other_t_angle must be T");
+    return map(at::native::angle_impl);  // compiler is unable to resolve the overload without 
+  }
+  template ::value, int>::type = 0>
+  Vectorized angle() const {
+    // complex_t_angle is for SFINAE and clarity. Make sure it is not changed.
+    static_assert(std::is_same::value, "complex_t_angle must be T");
+    return map([](T x) { return static_cast(std::arg(x)); });
+  }
+  template ::value, int>::type = 0>
+  Vectorized real() const {
+    // other_t_real is for SFINAE and clarity. Make sure it is not changed.
+    static_assert(std::is_same::value, "other_t_real must be T");
+    return *this;
+  }
+  template ::value, int>::type = 0>
+  Vectorized real() const {
+    // complex_t_real is for SFINAE and clarity. Make sure it is not changed.
+    static_assert(std::is_same::value, "complex_t_real must be T");
+    return map([](T x) { return static_cast(x.real()); });
+  }
+  template ::value, int>::type = 0>
+  Vectorized imag() const {
+    // other_t_imag is for SFINAE and clarity. Make sure it is not changed.
+    static_assert(std::is_same::value, "other_t_imag must be T");
+    return Vectorized(0);
+  }
+  template ::value, int>::type = 0>
+  Vectorized imag() const {
+    // complex_t_imag is for SFINAE and clarity. Make sure it is not changed.
+    static_assert(std::is_same::value, "complex_t_imag must be T");
+    return map([](T x) { return static_cast(x.imag()); });
+  }
+  template ::value, int>::type = 0>
+  Vectorized conj() const {
+    // other_t_conj is for SFINAE and clarity. Make sure it is not changed.
+    static_assert(std::is_same::value, "other_t_conj must be T");
+    return *this;
+  }
+  template ::value, int>::type = 0>
+  Vectorized conj() const {
+    // complex_t_conj is for SFINAE and clarity. Make sure it is not changed.
+    static_assert(std::is_same::value, "complex_t_conj must be T");
+    return map([](T x) { return static_cast(std::conj(x)); });
+  }
+  Vectorized acos() const {
+    return map(std::acos);
+  }
+  Vectorized acosh() const {
+    return map(std::acosh);
+  }
+  Vectorized asin() const {
+    return map(std::asin);
+  }
+  Vectorized atan() const {
+    return map(std::atan);
+  }
+  Vectorized atanh() const {
+    return map(std::atanh);
+  }
+  Vectorized atan2(const Vectorized &exp) const {
+    Vectorized ret;
+    for (const auto i : c10::irange(size())) {
+      ret[i] = std::atan2(values[i], exp[i]);
+    }
+    return ret;
+  }
+  template <
+    typename U = T,
+    typename std::enable_if_t, int> = 0>
+  Vectorized copysign(const Vectorized &sign) const {
+    Vectorized ret;
+    for (size_type i = 0; i < size(); i++) {
+      ret[i] = c10::copysign(values[i], sign[i]);
+    }
+    return ret;
+  }
+  Vectorized erf() const {
+    return map(std::erf);
+  }
+  Vectorized erfc() const {
+    return map(std::erfc);
+  }
+  Vectorized erfinv() const {
+    return map(calc_erfinv);
+  }
+  Vectorized exp() const {
+    return map(std::exp);
+  }
+  Vectorized exp2() const {
+    return map(exp2_impl);
+  }
+  Vectorized expm1() const {
+    return map(std::expm1);
+  }
+  Vectorized exp_u20() const {
+    return map(std::exp);
+  }
+  Vectorized frac() const {
+    return *this - this->trunc();
+  }
+  template <
+    typename U = T,
+    typename std::enable_if_t, int> = 0>
+  Vectorized fmod(const Vectorized& q) const {
+    // U is for SFINAE purposes only. Make sure it is not changed.
+    static_assert(std::is_same::value, "U must be T");
+    Vectorized ret;
+    for (const auto i : c10::irange(size())) {
+      ret[i] = std::fmod(values[i], q[i]);
+    }
+    return ret;
+  }
+  Vectorized log() const {
+    return map(std::log);
+  }
+  Vectorized log10() const {
+    return map(std::log10);
+  }
+  Vectorized log1p() const {
+    return map(std::log1p);
+  }
+  template ::value, int>::type = 0>
+  Vectorized log2() const {
+    // other_t_log2 is for SFINAE and clarity. Make sure it is not changed.
+    static_assert(std::is_same::value, "other_t_log2 must be T");
+    return map(std::log2);
+  }
+  template ::value, int>::type = 0>
+  Vectorized log2() const {
+    // complex_t_log2 is for SFINAE and clarity. Make sure it is not changed.
+    static_assert(std::is_same::value, "complex_t_log2 must be T");
+    const T log_2 = T(std::log(2.0));
+    return Vectorized(map(std::log))/Vectorized(log_2);
+  }
+  Vectorized ceil() const {
+    return map(at::native::ceil_impl);
+  }
+  Vectorized cos() const {
+    return map(std::cos);
+  }
+  Vectorized cosh() const {
+    return map(std::cosh);
+  }
+  Vectorized floor() const {
+    return map(at::native::floor_impl);
+  }
+  Vectorized hypot(const Vectorized &b) const {
+    Vectorized ret;
+    for (const auto i : c10::irange(size())) {
+      ret[i] = std::hypot(values[i], b[i]);
+    }
+    return ret;
+  }
+  Vectorized i0() const {
+    return map(calc_i0);
+  }
+  Vectorized i0e() const {
+    return map(calc_i0e);
+  }
+  Vectorized digamma() const {
+    return map(calc_digamma);
+  }
+  Vectorized igamma(const Vectorized &x) const {
+    Vectorized ret;
+    for (const auto i : c10::irange(size())) {
+      ret[i] = calc_igamma(values[i], x[i]);
+    }
+    return ret;
+  }
+  Vectorized igammac(const Vectorized &x) const {
+    Vectorized ret;
+    for (const auto i : c10::irange(size())) {
+      ret[i] = calc_igammac(values[i], x[i]);
+    }
+    return ret;
+  }
+  Vectorized neg() const {
+    // NB: the trailing return type is needed because we need to coerce the
+    // return value back to T in the case of unary operator- incuring a
+    // promotion
+    return map([](T x) -> T { return -x; });
+  }
+  Vectorized nextafter(const Vectorized &b) const {
+    Vectorized ret;
+    for (const auto i : c10::irange(size())) {
+      ret[i] = std::nextafter(values[i], b[i]);
+    }
+    return ret;
+  }
+  Vectorized round() const {
+    // We do not use std::round because we would like to round midway numbers to the nearest even integer.
+    return map(at::native::round_impl);
+  }
+  Vectorized sin() const {
+    return map(std::sin);
+  }
+  Vectorized sinh() const {
+    return map(std::sinh);
+  }
+  Vectorized tan() const {
+    return map(std::tan);
+  }
+  Vectorized tanh() const {
+    return map(std::tanh);
+  }
+  Vectorized trunc() const {
+    return map(at::native::trunc_impl);
+  }
+  Vectorized lgamma() const {
+    return map(std::lgamma);
+  }
+  Vectorized sqrt() const {
+    return map(std::sqrt);
+  }
+  Vectorized reciprocal() const {
+    return map([](T x) { return (T)(1) / x; });
+  }
+  Vectorized rsqrt() const {
+    return map([](T x) { return (T)1 / std::sqrt(x); });
+  }
+  Vectorized pow(const Vectorized &exp) const {
+    Vectorized ret;
+    for (const auto i : c10::irange(size())) {
+      ret[i] = std::pow(values[i], exp[i]);
+    }
+    return ret;
+  }
+private:
+  template <typename Op>
+  inline Vectorized<T> binary_pred(const Vectorized<T>& other, Op op) const {
+    // All bits are set to 1 if the pred is true, otherwise 0.
+    Vectorized<T> vector;
+    for (int64_t i = 0; i != size(); i++) {
+      if (op(values[i], other.values[i])) {
+        std::memset(static_cast<void*>(vector.values + i), 0xFF, sizeof(T));
+      } else {
+        std::memset(static_cast<void*>(vector.values + i), 0, sizeof(T));
+      }
+    }
+    return vector;
+  }
+
+public:
+  Vectorized<T> operator==(const Vectorized<T>& other) const { return binary_pred(other, std::equal_to<T>()); }
+  Vectorized<T> operator!=(const Vectorized<T>& other) const { return binary_pred(other, std::not_equal_to<T>()); }
+  Vectorized<T> operator>=(const Vectorized<T>& other) const { return binary_pred(other, std::greater_equal<T>()); }
+  Vectorized<T> operator<=(const Vectorized<T>& other) const { return binary_pred(other, std::less_equal<T>()); }
+  Vectorized<T> operator>(const Vectorized<T>& other) const { return binary_pred(other, std::greater<T>()); }
+  Vectorized<T> operator<(const Vectorized<T>& other) const { return binary_pred(other, std::less<T>()); }
+
+private:
+  template <typename Op>
+  inline Vectorized<T> binary_pred_bool(const Vectorized<T>& other, Op op) const {
+    // 1 if the pred is true, otherwise 0.
+    Vectorized<T> vector;
+    for (int i = 0; i != size(); ++ i) {
+      vector[i] = static_cast<T>(op(values[i], other.values[i]));
+    }
+    return vector;
+  }
+
+public:
+  Vectorized<T> eq(const Vectorized<T>& other) const { return binary_pred_bool(other, std::equal_to<T>()); }
+  Vectorized<T> ne(const Vectorized<T>& other) const { return binary_pred_bool(other, std::not_equal_to<T>()); }
+  Vectorized<T> gt(const Vectorized<T>& other) const { return binary_pred_bool(other, std::greater<T>()); }
+  Vectorized<T> ge(const Vectorized<T>& other) const { return binary_pred_bool(other, std::greater_equal<T>()); }
+  Vectorized<T> lt(const Vectorized<T>& other) const { return binary_pred_bool(other, std::less<T>()); }
+  Vectorized<T> le(const Vectorized<T>& other) const { return binary_pred_bool(other, std::less_equal<T>()); }
+};
+
+template <class T> Vectorized<T> inline operator+(const Vectorized<T> &a, const Vectorized<T> &b) {
+  Vectorized<T> c;
+  for (int i = 0; i != Vectorized<T>::size(); i++) {
+    c[i] = a[i] + b[i];
+  }
+  return c;
+}
+
+template <class T> Vectorized<T> inline operator-(const Vectorized<T> &a, const Vectorized<T> &b) {
+  Vectorized<T> c;
+  for (int i = 0; i != Vectorized<T>::size(); i++) {
+    c[i] = a[i] - b[i];
+  }
+  return c;
+}
+
+template <class T> Vectorized<T> inline operator*(const Vectorized<T> &a, const Vectorized<T> &b) {
+  Vectorized<T> c;
+  for (int i = 0; i != Vectorized<T>::size(); i++) {
+    c[i] = a[i] * b[i];
+  }
+  return c;
+}
+
+template <class T> Vectorized<T> inline operator/(const Vectorized<T> &a, const Vectorized<T> &b) __ubsan_ignore_float_divide_by_zero__ {
+  Vectorized<T> c;
+  for (int i = 0; i != Vectorized<T>::size(); i++) {
+    c[i] = a[i] / b[i];
+  }
+  return c;
+}
+
+template <class T, typename std::enable_if<!is_floating_point_v<T>, int>::type = 0>
+Vectorized<T> inline operator%(const Vectorized<T> &a, const Vectorized<T> &b) __ubsan_ignore_float_divide_by_zero__ {
+  return a - a / b * b;
+}
+
+template <class T> Vectorized<T> inline operator||(
+    const Vectorized<T> &a, const Vectorized<T> &b) {
+  Vectorized<T> c;
+  for (int i = 0; i != Vectorized<T>::size(); i++) {
+    c[i] = a[i] || b[i];
+  }
+  return c;
+}
+
+// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if
+// either input is a NaN.
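+// For example (illustration only): maximum({1.0, NaN, 3.0, ...}, {2.0, 0.0, NaN, ...})
+// yields {2.0, NaN, NaN, ...} -- a NaN in either lane propagates to that lane
+// of the result.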
+template <class T,
+          typename std::enable_if<!c10::is_complex<T>::value, int>::type = 0>
+Vectorized<T> inline maximum(const Vectorized<T> &a, const Vectorized<T> &b) {
+  Vectorized<T> c;
+  for (int i = 0; i != Vectorized<T>::size(); i++) {
+    c[i] = (a[i] > b[i]) ? a[i] : b[i];
+    if (_isnan(a[i])) {
+      // If either input is NaN, propagate a NaN.
+      // NOTE: The case where b[i] was NaN is handled correctly by the naive
+      // ternary operator above.
+      c[i] = a[i];
+    }
+  }
+  return c;
+}
+
+template <class T,
+          typename std::enable_if<c10::is_complex<T>::value, int>::type = 0>
+Vectorized<T> inline maximum(const Vectorized<T> &a, const Vectorized<T> &b) {
+  Vectorized<T> c;
+  for (int i = 0; i != Vectorized<T>::size(); i++) {
+    c[i] = (std::abs(a[i]) > std::abs(b[i])) ? a[i] : b[i];
+    if (_isnan(a[i])) {
+      // If either input is NaN, propagate a NaN.
+      // NOTE: The case where b[i] was NaN is handled correctly by the naive
+      // ternary operator above.
+      c[i] = a[i];
+    }
+  }
+  return c;
+}
+
+// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if
+// either input is a NaN.
+template <class T,
+          typename std::enable_if<!c10::is_complex<T>::value, int>::type = 0>
+Vectorized<T> inline minimum(const Vectorized<T> &a, const Vectorized<T> &b) {
+  Vectorized<T> c;
+  for (int i = 0; i != Vectorized<T>::size(); i++) {
+    c[i] = (a[i] < b[i]) ? a[i] : b[i];
+    if (_isnan(a[i])) {
+      // If either input is NaN, propagate a NaN.
+      // NOTE: The case where b[i] was NaN is handled correctly by the naive
+      // ternary operator above.
+      c[i] = a[i];
+    }
+  }
+  return c;
+}
+
+template <class T,
+          typename std::enable_if<c10::is_complex<T>::value, int>::type = 0>
+Vectorized<T> inline minimum(const Vectorized<T> &a, const Vectorized<T> &b) {
+  Vectorized<T> c;
+  for (int i = 0; i != Vectorized<T>::size(); i++) {
+    c[i] = (std::abs(a[i]) < std::abs(b[i])) ? a[i] : b[i];
+    if (_isnan(a[i])) {
+      // If either input is NaN, propagate a NaN.
+      // NOTE: The case where b[i] was NaN is handled correctly by the naive
+      // ternary operator above.
+      c[i] = a[i];
+    }
+  }
+  return c;
+}
+
+template <class T,
+          typename std::enable_if<!c10::is_complex<T>::value, int>::type = 0>
+Vectorized<T> inline clamp(const Vectorized<T> &a, const Vectorized<T> &min_vec, const Vectorized<T> &max_vec) {
+  Vectorized<T> c;
+  for (int i = 0; i != Vectorized<T>::size(); i++) {
+    c[i] = std::min(std::max(a[i], min_vec[i]), max_vec[i]);
+  }
+  return c;
+}
+
+template <class T,
+          typename std::enable_if<!c10::is_complex<T>::value, int>::type = 0>
+Vectorized<T> inline clamp_max(const Vectorized<T> &a, const Vectorized<T> &max_vec) {
+  Vectorized<T> c;
+  for (int i = 0; i != Vectorized<T>::size(); i++) {
+    c[i] = a[i] > max_vec[i] ? max_vec[i] : a[i];
+  }
+  return c;
+}
+
+template <class T,
+          typename std::enable_if<!c10::is_complex<T>::value, int>::type = 0>
+Vectorized<T> inline clamp_min(const Vectorized<T> &a, const Vectorized<T> &min_vec) {
+  Vectorized<T> c;
+  for (int i = 0; i != Vectorized<T>::size(); i++) {
+    c[i] = a[i] < min_vec[i] ? min_vec[i] : a[i];
+  }
+  return c;
+}
+
+struct Vectorizedi;
+
+#if defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)
+template <class T, typename Op>
+static inline Vectorized<T> bitwise_binary_op(const Vectorized<T> &a, const Vectorized<T> &b, Op op) {
+  int_vector buffer;
+#if defined(CPU_CAPABILITY_AVX2)
+  int_vector a_buffer = _mm256_load_si256(reinterpret_cast<const int_vector*>((const T*)a));
+  int_vector b_buffer = _mm256_load_si256(reinterpret_cast<const int_vector*>((const T*)b));
+#elif defined(CPU_CAPABILITY_AVX512)
+  int_vector a_buffer = _mm512_load_si512(reinterpret_cast<const int_vector*>((const T*)a));
+  int_vector b_buffer = _mm512_load_si512(reinterpret_cast<const int_vector*>((const T*)b));
+#endif
+  buffer = op(a_buffer, b_buffer);
+  __at_align__ T results[Vectorized<T>::size()];
+
+#if defined(CPU_CAPABILITY_AVX2)
+  _mm256_store_si256(reinterpret_cast<int_vector*>(results), buffer);
+#elif defined(CPU_CAPABILITY_AVX512)
+  _mm512_store_si512(reinterpret_cast<int_vector*>(results), buffer);
+#endif
+  return Vectorized<T>::loadu(results);
+}
+
+template<class T, typename std::enable_if_t<!std::is_base_of<Vectorizedi, Vectorized<T>>::value, int> = 0>
+inline Vectorized<T> operator&(const Vectorized<T>& a, const Vectorized<T>& b) {
+  // We enclose _mm512_and_si512 or _mm256_and_si256 with lambda because it is always_inline
+#if defined(CPU_CAPABILITY_AVX2)
+  return bitwise_binary_op(a, b, [](int_vector a, int_vector b) { return _mm256_and_si256(a, b); });
+#elif defined(CPU_CAPABILITY_AVX512)
+  return bitwise_binary_op(a, b, [](int_vector a, int_vector b) { return _mm512_and_si512(a, b); });
+#endif
+}
+template<class T, typename std::enable_if_t<!std::is_base_of<Vectorizedi, Vectorized<T>>::value, int> = 0>
+inline Vectorized<T> operator|(const Vectorized<T>& a, const Vectorized<T>& b) {
+  // We enclose _mm512_or_si512 or _mm256_or_si256 with lambda because it is always_inline
+#if defined(CPU_CAPABILITY_AVX2)
+  return bitwise_binary_op(a, b, [](int_vector a, int_vector b) { return _mm256_or_si256(a, b); });
+#elif defined(CPU_CAPABILITY_AVX512)
+  return bitwise_binary_op(a, b, [](int_vector a, int_vector b) { return _mm512_or_si512(a, b); });
+#endif
+}
+template<class T, typename std::enable_if_t<!std::is_base_of<Vectorizedi, Vectorized<T>>::value, int> = 0>
+inline Vectorized<T> operator^(const Vectorized<T>& a, const Vectorized<T>& b) {
+  // We enclose _mm512_xor_si512 or _mm256_xor_si256 with lambda because it is always_inline
+#if defined(CPU_CAPABILITY_AVX2)
+  return bitwise_binary_op(a, b, [](int_vector a, int_vector b) { return _mm256_xor_si256(a, b); });
+#elif defined(CPU_CAPABILITY_AVX512)
+  return bitwise_binary_op(a, b, [](int_vector a, int_vector b) { return _mm512_xor_si512(a, b); });
+#endif
+}
+
+#else
+
+template <typename T>
+auto load(char const* data) -> T {
+  T ret;
+  std::memcpy(&ret, data, sizeof(ret));
+  return ret;
+}
+
+template<typename T, typename Op>
+static inline Vectorized<T> bitwise_binary_op(const Vectorized<T> &a, const Vectorized<T> &b, Op op) {
+  static constexpr uint32_t element_no = VECTOR_WIDTH / sizeof(intmax_t);
+  __at_align__ intmax_t buffer[element_no];
+  static_assert(VECTOR_WIDTH % sizeof(intmax_t) == 0, "VECTOR_WIDTH not a multiple of sizeof(intmax_t)");
+  static_assert(sizeof(buffer) == sizeof(Vectorized), "sizeof(buffer) must match sizeof(Vectorized)");
+  // We should be using memcpy in order to respect the strict aliasing rule
+  // see: https://github.com/pytorch/pytorch/issues/66119
+  // Using char* is defined in the C11 standard 6.5 Expression paragraph 7
+  // (http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1570.pdf)
+  const auto* a_data = a.as_bytes();
+  const auto* b_data = b.as_bytes();
+  // load each intmax_t chunk and process; increase pointers by sizeof(intmax_t)
+  for (auto& out : buffer) {
+    out = op(load(a_data), load(b_data));
+    a_data += sizeof(intmax_t);
+    b_data += sizeof(intmax_t);
+  }
+  assert(a_data == a.as_bytes() + sizeof(a));
+  assert(b_data == b.as_bytes() + sizeof(b));
+  return Vectorized<T>::loadu(buffer);
+}
+
+template<class T, typename std::enable_if_t<!std::is_base_of<Vectorizedi, Vectorized<T>>::value, int> = 0>
+inline Vectorized<T> operator&(const Vectorized<T>& a, const Vectorized<T>& b) {
+  return bitwise_binary_op(a, b, std::bit_and<intmax_t>());
+}
+template<class T, typename std::enable_if_t<!std::is_base_of<Vectorizedi, Vectorized<T>>::value, int> = 0>
+inline Vectorized<T> operator|(const Vectorized<T>& a, const Vectorized<T>& b) {
+  return bitwise_binary_op(a, b, std::bit_or<intmax_t>());
+}
+template<class T, typename std::enable_if_t<!std::is_base_of<Vectorizedi, Vectorized<T>>::value, int> = 0>
+inline Vectorized<T> operator^(const Vectorized<T>& a, const Vectorized<T>& b) {
+  return bitwise_binary_op(a, b, std::bit_xor<intmax_t>());
+}
+
+#endif // defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)
+
+template<class T, typename std::enable_if_t<!std::is_base_of<Vectorizedi, Vectorized<T>>::value, int> = 0>
+inline Vectorized<T> operator~(const Vectorized<T>& a) {
+  Vectorized<T> ones;  // All bits are 1
+  memset((T*) ones, 0xFF, VECTOR_WIDTH);
+  return a ^ ones;
+}
+
+template  Vectorized inline operator<<(const Vectorized &a, const Vectorized &b) {
+  constexpr T max_shift = sizeof(T) * CHAR_BIT;
+  Vectorized c;
+  for (int i = 0; i != Vectorized::size(); i++) {
+    T shift = b[i];
+    if ((static_cast>(shift) < 0) || (shift >= max_shift)) {
+      c[i] = 0;
+    } else {
+      c[i] = static_cast>(a[i]) << shift;
+    }
+  }
+  return c;
+}
+
+template  Vectorized inline operator>>(const Vectorized &a, const Vectorized &b) {
+  // right shift value to retain sign bit for signed and no bits for unsigned
+  constexpr T max_shift = sizeof(T) * CHAR_BIT - std::is_signed_v;
+  Vectorized c;
+  for (int i = 0; i != Vectorized::size(); i++) {
+    T shift = b[i];
+    if ((static_cast>(shift) < 0) || (shift >= max_shift)) {
+      c[i] = a[i] >> max_shift;
+    } else {
+      c[i] = a[i] >> shift;
+    }
+  }
+  return c;
+}
+
+template 
+inline Vectorized& operator += (Vectorized& a, const Vectorized& b) {
+  a = a + b;
+  return a;
+}
+template 
+inline Vectorized& operator -= (Vectorized& a, const Vectorized& b) {
+  a = a - b;
+  return a;
+}
+template 
+inline Vectorized& operator /= (Vectorized& a, const Vectorized& b) {
+  a = a / b;
+  return a;
+}
+template 
+inline Vectorized& operator %= (Vectorized& a, const Vectorized& b) {
+  a = a % b;
+  return a;
+}
+template 
+inline Vectorized& operator *= (Vectorized& a, const Vectorized& b) {
+  a = a * b;
+  return a;
+}
+
+template 
+inline Vectorized& operator <<= (Vectorized& a, const Vectorized& b) {
+  a = a << b;
+  return a;
+}
+
+template 
+inline Vectorized& operator >>= (Vectorized& a, const Vectorized& b) {
+  a = a >> b;
+  return a;
+}
+
+template <typename T>
+inline Vectorized<T> fmadd(const Vectorized<T>& a, const Vectorized<T>& b, const Vectorized<T>& c) {
+  return a * b + c;
+}
+
+template <typename T>
+inline Vectorized<T> fmsub(const Vectorized<T>& a, const Vectorized<T>& b, const Vectorized<T>& c) {
+  return a * b - c;
+}
+
+template 
+std::enable_if_t>
+inline gather(T const* base_addr, const Vectorized>& vindex) {
+  static constexpr int size = Vectorized::size();
+  int_same_size_t index_arr[size];
+  vindex.store(static_cast(index_arr));
+  T buffer[size];
+  for (const auto i : c10::irange(size)) {
+    buffer[i] = base_addr[index_arr[i] * scale / sizeof(T)];
+  }
+  return Vectorized::loadu(static_cast(buffer));
+}
+
+template 
+std::enable_if_t>
+inline mask_gather(const Vectorized& src, T const* base_addr,
+                   const Vectorized>& vindex, Vectorized& mask) {
+  static constexpr int size = Vectorized::size();
+  T src_arr[size];
+  int_same_size_t mask_arr[size];  // use int type so we can logical and
+  int_same_size_t index_arr[size];
+  src.store(static_cast(src_arr));
+  mask.store(static_cast(mask_arr));
+  vindex.store(static_cast(index_arr));
+  T buffer[size];
+  for (const auto i : c10::irange(size)) {
+    if (mask_arr[i] & 0x01) {  // check highest bit
+      buffer[i] = base_addr[index_arr[i] * scale / sizeof(T)];
+    } else {
+      buffer[i] = src_arr[i];
+    }
+  }
+  mask = Vectorized();  // "zero out" mask
+  return Vectorized::loadu(static_cast(buffer));
+}
+
+// Cast a given vector to another type without changing the bits representation.
+// So a Vectorized of 512 bits containing all ones can be cast to a
+// Vectorized of 512 bits containing all ones (i.e., eight negative 1s).
+// A Vec of 256 bits containing all ones can be cast to a
+// Vec of 256 bits containing all ones (i.e., four negative 1s).
+// There is a struct here because we don't have static_if and I can't
+// partially specialize a templated function.
+template
+struct CastImpl {
+  static inline Vectorized apply(const Vectorized& src) {
+    src_t src_arr[Vectorized::size()];
+    src.store(static_cast(src_arr));
+    return Vectorized::loadu(static_cast(src_arr));
+  }
+};
+
+template
+struct CastImpl {
+  static inline Vectorized apply(const Vectorized& src) {
+    return src;
+  }
+};
+
+template
+inline Vectorized cast(const Vectorized& src) {
+  return CastImpl::apply(src);
+}
+
+template >
+inline Vectorized convert_to_int_of_same_size(const Vectorized& src) {
+  static_assert(sizeof(T) == sizeof(IntType));
+  static constexpr int size = Vectorized::size();
+
+  std::array src_arr;
+  src.store(static_cast(src_arr.data()));
+  std::array buffer;
+  std::transform(src_arr.cbegin(), src_arr.cend(), buffer.begin(),
+                 [](const T& x) { return static_cast(x); });
+  return Vectorized::loadu(static_cast(buffer.data()));
+}
+
+template >
+inline Vectorized convert_to_fp_of_same_size(const Vectorized& src) {
+  static_assert(sizeof(T) == sizeof(IntType));
+  static constexpr int size = Vectorized::size();
+
+  std::array src_arr;
+  src.store(static_cast(src_arr.data()));
+  std::array buffer;
+  std::transform(src_arr.cbegin(), src_arr.cend(), buffer.begin(),
+                 [](const IntType& x) { return static_cast(x); });
+  return Vectorized::loadu(static_cast(buffer.data()));
+}
+
+// Example inputs for AVX512:
+// a   Vectorized   = {a0, b0, a1, b1, a2, b2, a3, b3, a4, b4, a5, b5, a6, b6, a7, b7}
+// b   Vectorized   = {a8, b8, a9, b9, a10, b10, a11, b11, a12, b12, a13, b13, a14, b14, a15, b15}
+// returns:
+//           Vectorized   = {a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15}
+//           Vectorized   = {b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15}
+// Example inputs for AVX2: a           Vectorized   = {a0, b0, a1, b1, a2, b2, a3, b3}
+//               b                      Vectorized   = {a4, b4, a5, b5, a6, b6, a7, b7}
+//       returns:                       Vectorized   = {a0, a1, a2, a3, a4, a5, a6, a7}
+//                                      Vectorized   = {b0, b1, b2, b3, b4, b5, b6, b7}
+template 
+inline std::enable_if_t::size() % 2 == 0, std::pair, Vectorized>>
+deinterleave2(const Vectorized& a, const Vectorized& b) {
+  static constexpr int size = Vectorized::size();
+  static constexpr int half_size = size / 2;
+  T a_arr[size];
+  T b_arr[size];
+  T buffer1[size];
+  T buffer2[size];
+  a.store(static_cast(a_arr));
+  b.store(static_cast(b_arr));
+  for (const auto i : c10::irange(half_size)) {
+    buffer1[i] = a_arr[i * 2];
+    buffer1[half_size + i] = b_arr[i * 2];
+    buffer2[i] = a_arr[i * 2 + 1];
+    buffer2[half_size + i] = b_arr[i * 2 + 1];
+  }
+  return std::make_pair(Vectorized::loadu(static_cast(buffer1)),
+                        Vectorized::loadu(static_cast(buffer2)));
+}
+
+// inverse operation of deinterleave2
+// Example inputs for AVX512:
+//  a       Vectorized   = {a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15}
+//  b       Vectorized   = {b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15}
+// returns, for AVX512:
+//          Vectorized   = {a0, b0, a1, b1, a2, b2, a3, b3, a4, b4, a5, b5, a6, b6, a7, b7}
+//          Vectorized   = {a8, b8, a9, b9, a10, b10, a11, b11, a12, b12, a13, b13, a14, b14, a15, b15}
+// Example inputs for AVX2 : a           Vectorized   = {a0, a1, a2, a3, a4, a5, a6, a7}
+//                   b                   Vectorized   = {b0, b1, b2, b3, b4, b5, b6, b7}
+//       returns:            Vectorized   = {a0, b0, a1, b1, a2, b2, a3, b3}
+//                           Vectorized   = {a4, b4, a5, b5, a6, b6, a7, b7}
+template 
+inline std::enable_if_t::size() % 2 == 0, std::pair, Vectorized>>
+interleave2(const Vectorized& a, const Vectorized& b) {
+  static constexpr int size = Vectorized::size();
+  static constexpr int half_size = size / 2;
+  T a_arr[size];
+  T b_arr[size];
+  T buffer1[size];
+  T buffer2[size];
+  a.store(static_cast(a_arr));
+  b.store(static_cast(b_arr));
+  for (const auto i : c10::irange(half_size)) {
+    buffer1[i * 2] = a_arr[i];
+    buffer1[i * 2 + 1] = b_arr[i];
+    buffer2[i * 2] = a_arr[half_size + i];
+    buffer2[i * 2 + 1] = b_arr[half_size + i];
+  }
+  return std::make_pair(Vectorized::loadu(static_cast(buffer1)),
+                        Vectorized::loadu(static_cast(buffer2)));
+}
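+
+// Illustrative usage (not part of the upstream header): interleave2 and
+// deinterleave2 are exact inverses, which is convenient when switching between
+// an interleaved layout such as {re0, im0, re1, im1, ...} and the split pair
+// ({re0, re1, ...}, {im0, im1, ...}). Given two Vectorized<float> re, im:
+//
+//   auto packed = interleave2(re, im);        // pair of interleaved vectors
+//   auto split  = deinterleave2(packed.first, packed.second);
+//   // split.first == re and split.second == im again, element-wise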
+
+template 
+inline void convert(const src_T *src, dst_T *dst, int64_t n) {
+#ifndef _MSC_VER
+# pragma unroll
+#endif
+  for (C10_UNUSED const auto i : c10::irange(n)) {
+    *dst = c10::convert(c10::load(src));
+    src++;
+    dst++;
+  }
+}
+
+template 
+inline Vectorized flip(const Vectorized & data) {
+  static constexpr int size = Vectorized::size();
+  T output[size];
+  T buffer[size];
+  data.store(static_cast(buffer));
+  for (const auto i : c10::irange(size)) {
+    output[i] = buffer[size - i - 1];
+  }
+  return Vectorized::loadu(static_cast(output));
+}
+
+// Transpose the `src` buffer of type `T` and size (M,N) into the `dst` buffer. `ld_src` is the leading
+// dimension of `src` and `ld_dst` is the leading dimension of `dst`.
+template 
+inline void transpose_mxn(const T* src, int64_t ld_src, T* dst, int64_t ld_dst) {
+  for (int i = 0; i < M; i++) {
+    for (int j = 0; j < N; j++) {
+      dst[j*ld_dst + i] = src[i*ld_src + j];
+    }
+  }
+}
+
+}} // namespace at::vec::CPU_CAPABILITY
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec_half.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec_half.h
new file mode 100644
index 0000000000000000000000000000000000000000..7cdc259581da37601221b2929702afc98938619a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec_half.h
@@ -0,0 +1,50 @@
+#pragma once
+
+#include <ATen/cpu/vec/intrinsics.h>
+
+namespace at::vec {
+// See Note [CPU_CAPABILITY namespace]
+inline namespace CPU_CAPABILITY {
+
+#if (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && \
+    !defined(__APPLE__)
+static inline uint16_t float2half_scalar(float val) {
+#if defined(CPU_CAPABILITY_AVX2)
+#if defined(_MSC_VER)
+  __m256 v = _mm256_set1_ps(val);
+  __m128i o =
+      _mm256_cvtps_ph(v, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+  return static_cast(_mm_cvtsi128_si32(o));
+#else
+  return _cvtss_sh(val, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+#endif
+#elif defined(CPU_CAPABILITY_AVX512)
+  __m512 v = _mm512_set1_ps(val);
+  __m256i o =
+      _mm512_cvtps_ph(v, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+  return static_cast(
+      _mm_cvtsi128_si32(_mm256_castsi256_si128(o)));
+#endif
+}
+
+static inline float half2float_scalar(uint16_t val) {
+#if defined(CPU_CAPABILITY_AVX2)
+#if defined(_MSC_VER)
+  __m128i v = _mm_cvtsi32_si128(val);
+  __m256 o = _mm256_cvtph_ps(v);
+  return _mm256_cvtss_f32(o);
+#else
+  return _cvtsh_ss(val);
+#endif
+#elif defined(CPU_CAPABILITY_AVX512)
+  __m256i v =
+      _mm256_setr_epi16(val, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+  __m512 o = _mm512_cvtph_ps(v);
+  return _mm512_cvtss_f32(o);
+#endif
+}
+
+#endif
+
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec_n.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec_n.h
new file mode 100644
index 0000000000000000000000000000000000000000..0ad919432a05eec30bdbff189a122988731083c2
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vec/vec_n.h
@@ -0,0 +1,344 @@
+#include <ATen/cpu/vec/vec_base.h>
+#include <array>
+
+namespace at::vec {
+inline namespace CPU_CAPABILITY {
+
+/**
+ * @brief A class template representing a vectorized type with
+ * `N * Vectorized<T>::size()` elements, aiming to support vectors of
+ * arbitrary size. A specific use case of it is to represent vectors
+ * converted from data types with different sizes but with the same
+ * number of vector elements, e.g., `VectorizedN<int64_t, 2>` can be
+ * a vector converted from two `Vectorized<int32_t>`, `VectorizedN<double, 2>`
+ * can be a vector converted from two `Vectorized<float>` etc.
+ *
+ * It supports most of the operations of `Vectorized<T>`
+ * and the implementation delegates to `Vectorized<T>` with loops over `N`.
+ *
+ * @tparam T The underlying type of the vectorized elements.
+ * @tparam N The number of underlying `Vectorized<T>`.
+ */
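+// Minimal usage sketch (illustrative only, not part of the upstream header):
+//
+//   float buf[2 * Vectorized<float>::size()] = {0};
+//   auto v = VectorizedN<float, 2>::loadu(buf);  // two Vectorized<float> chunks
+//   auto w = v.abs().exp();                      // ops are applied chunk by chunk
+//   w.store(buf);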
+template <typename T, int N>
+class VectorizedN {
+ public:
+  using value_type = T;
+  using size_type = int;
+
+  static constexpr size_type size_T = sizeof(T);
+  static constexpr size_type size() {
+    return Vectorized<T>::size() * N;
+  }
+
+ private:
+  std::array<Vectorized<T>, N> values;
+
+ public:
+  // methods not implemented yet:
+  // variadic constructor, operator T*, as_bytes, zero_mask
+
+#define VECTORIZEDN_DEFINE_UNARY_OP(op)                             \
+  VectorizedN<T, N> op() const {                                    \
+    return unary_op([](const Vectorized<T>& a) { return a.op(); }); \
+  }
+
+#define VECTORIZEDN_DEFINE_BINARY_OP(op)                            \
+  VectorizedN<T, N> op(const VectorizedN<T, N>& other) const {      \
+    return binary_op(                                               \
+        other, [](const Vectorized<T>& a, const Vectorized<T>& b) { \
+          return a.op(b);                                           \
+        });                                                         \
+  }
+
+  template 
+  inline VectorizedN unary_op(Op op) const {
+    VectorizedN result;
+#ifndef _MSC_VER
+#pragma unroll
+#endif
+    for (int i = 0; i < N; ++i) {
+      result.values[i] = op(values[i]);
+    }
+    return result;
+  }
+
+  template 
+  inline VectorizedN binary_op(const VectorizedN& other, Op op)
+      const {
+    VectorizedN result;
+#ifndef _MSC_VER
+#pragma unroll
+#endif
+    for (int i = 0; i < N; ++i) {
+      result.values[i] = op(values[i], other.values[i]);
+    }
+    return result;
+  }
+
+  VectorizedN() = default;
+
+  explicit VectorizedN(T val) {
+    for (int i = 0; i < N; ++i) {
+      values[i] = Vectorized(val);
+    }
+  }
+
+  const Vectorized& operator[](int i) const {
+    return values[i];
+  }
+
+  Vectorized& operator[](int i) {
+    return values[i];
+  }
+
+  template 
+  static VectorizedN blend(
+      const VectorizedN& a,
+      const VectorizedN& b) {
+    VectorizedN result;
+    for (int i = 0; i < N; ++i) {
+      result.values[i] = Vectorized::blend(a.values[i], b.values[i]);
+    }
+    return result;
+  }
+
+  static VectorizedN blendv(
+      const VectorizedN& a,
+      const VectorizedN& b,
+      const VectorizedN& mask) {
+    VectorizedN result;
+    for (int i = 0; i < N; ++i) {
+      result.values[i] =
+          Vectorized::blendv(a.values[i], b.values[i], mask.values[i]);
+    }
+    return result;
+  }
+
+  template 
+  static VectorizedN arange(
+      T base = static_cast(0),
+      step_t step = static_cast(1)) {
+    VectorizedN result;
+    for (int i = 0; i < N; ++i) {
+      result.values[i] = Vectorized::arange(base, step);
+      base += step * Vectorized::size();
+    }
+    return result;
+  }
+
+  static VectorizedN set(
+      const VectorizedN& a,
+      const VectorizedN& b,
+      int64_t count = size()) {
+    VectorizedN result;
+    for (int i = 0; i < N; ++i) {
+      result.values[i] =
+          Vectorized::set(a.values[i], b.values[i], std::min(count, Vectorized::size()));
+      count -= Vectorized::size();
+      if (count <= 0) {
+        break;
+      }
+    }
+    return result;
+  }
+
+  static VectorizedN loadu(const void* ptr) {
+    VectorizedN result;
+    for (int i = 0; i < N; ++i) {
+      result.values[i] = Vectorized::loadu(ptr);
+      ptr = static_cast(ptr) + Vectorized::size();
+    }
+    return result;
+  }
+
+  static VectorizedN loadu(const void* ptr, int64_t count) {
+    VectorizedN result;
+    for (int i = 0; i < N; ++i) {
+      result.values[i] =
+          Vectorized::loadu(ptr, std::min(count, Vectorized::size()));
+      ptr = static_cast(ptr) + Vectorized::size();
+      count -= Vectorized::size();
+      if (count <= 0) {
+        break;
+      }
+    }
+    return result;
+  }
+
+  void store(void* ptr) const {
+    for (int i = 0; i < N; ++i) {
+      values[i].store(ptr);
+      ptr = static_cast(ptr) + Vectorized::size();
+    }
+  }
+
+  void store(void* ptr, int count) const {
+    for (int i = 0; i < N; ++i) {
+      values[i].store(ptr, std::min(count, Vectorized::size()));
+      ptr = static_cast(ptr) + Vectorized::size();
+      count -= Vectorized::size();
+      if (count <= 0) {
+        break;
+      }
+    }
+  }
+
+  bool has_inf_nan() const {
+    for (int i = 0; i < N; ++i) {
+      if (values[i].has_inf_nan()) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  VectorizedN map(T (*const f)(T)) const {
+    VectorizedN result;
+    for (int i = 0; i < N; ++i) {
+      result.values[i] = values[i].map(f);
+    }
+    return result;
+  }
+
+  VectorizedN map(T (*const f)(const T&)) const {
+    VectorizedN result;
+    for (int i = 0; i < N; ++i) {
+      result.values[i] = values[i].map(f);
+    }
+    return result;
+  }
+
+  VECTORIZEDN_DEFINE_UNARY_OP(abs)
+  VECTORIZEDN_DEFINE_UNARY_OP(sgn)
+  VECTORIZEDN_DEFINE_UNARY_OP(angle)
+  VECTORIZEDN_DEFINE_UNARY_OP(real)
+  VECTORIZEDN_DEFINE_UNARY_OP(imag)
+  VECTORIZEDN_DEFINE_UNARY_OP(conj)
+  VECTORIZEDN_DEFINE_UNARY_OP(acos)
+  VECTORIZEDN_DEFINE_UNARY_OP(acosh)
+  VECTORIZEDN_DEFINE_UNARY_OP(asin)
+  VECTORIZEDN_DEFINE_UNARY_OP(atan)
+  VECTORIZEDN_DEFINE_UNARY_OP(atanh)
+  VECTORIZEDN_DEFINE_BINARY_OP(atan2)
+  VECTORIZEDN_DEFINE_BINARY_OP(copysign)
+  VECTORIZEDN_DEFINE_UNARY_OP(erf)
+  VECTORIZEDN_DEFINE_UNARY_OP(erfc)
+  VECTORIZEDN_DEFINE_UNARY_OP(erfinv)
+  VECTORIZEDN_DEFINE_UNARY_OP(exp)
+  VECTORIZEDN_DEFINE_UNARY_OP(exp2)
+  VECTORIZEDN_DEFINE_UNARY_OP(expm1)
+  VECTORIZEDN_DEFINE_UNARY_OP(exp_u20)
+  VECTORIZEDN_DEFINE_UNARY_OP(frac)
+  VECTORIZEDN_DEFINE_BINARY_OP(fmod)
+  VECTORIZEDN_DEFINE_UNARY_OP(log)
+  VECTORIZEDN_DEFINE_UNARY_OP(log10)
+  VECTORIZEDN_DEFINE_UNARY_OP(log1p)
+  VECTORIZEDN_DEFINE_UNARY_OP(log2)
+  VECTORIZEDN_DEFINE_UNARY_OP(ceil)
+  VECTORIZEDN_DEFINE_UNARY_OP(cos)
+  VECTORIZEDN_DEFINE_UNARY_OP(cosh)
+  VECTORIZEDN_DEFINE_UNARY_OP(floor)
+  VECTORIZEDN_DEFINE_BINARY_OP(hypot)
+  VECTORIZEDN_DEFINE_UNARY_OP(i0)
+  VECTORIZEDN_DEFINE_UNARY_OP(i0e)
+  VECTORIZEDN_DEFINE_UNARY_OP(digamma)
+  VECTORIZEDN_DEFINE_BINARY_OP(igamma)
+  VECTORIZEDN_DEFINE_BINARY_OP(igammac)
+  VECTORIZEDN_DEFINE_UNARY_OP(neg)
+  VECTORIZEDN_DEFINE_BINARY_OP(nextafter)
+  VECTORIZEDN_DEFINE_UNARY_OP(round)
+  VECTORIZEDN_DEFINE_UNARY_OP(sin)
+  VECTORIZEDN_DEFINE_UNARY_OP(sinh)
+  VECTORIZEDN_DEFINE_UNARY_OP(tan)
+  VECTORIZEDN_DEFINE_UNARY_OP(tanh)
+  VECTORIZEDN_DEFINE_UNARY_OP(trunc)
+  VECTORIZEDN_DEFINE_UNARY_OP(lgamma)
+  VECTORIZEDN_DEFINE_UNARY_OP(sqrt)
+  VECTORIZEDN_DEFINE_UNARY_OP(reciprocal)
+  VECTORIZEDN_DEFINE_UNARY_OP(rsqrt)
+  VECTORIZEDN_DEFINE_BINARY_OP(pow)
+  VECTORIZEDN_DEFINE_BINARY_OP(operator==)
+  VECTORIZEDN_DEFINE_BINARY_OP(operator!=)
+  VECTORIZEDN_DEFINE_BINARY_OP(operator>=)
+  VECTORIZEDN_DEFINE_BINARY_OP(operator<=)
+  VECTORIZEDN_DEFINE_BINARY_OP(operator>)
+  VECTORIZEDN_DEFINE_BINARY_OP(operator<)
+  VECTORIZEDN_DEFINE_BINARY_OP(eq)
+  VECTORIZEDN_DEFINE_BINARY_OP(ne)
+  VECTORIZEDN_DEFINE_BINARY_OP(gt)
+  VECTORIZEDN_DEFINE_BINARY_OP(ge)
+  VECTORIZEDN_DEFINE_BINARY_OP(lt)
+  VECTORIZEDN_DEFINE_BINARY_OP(le)
+
+#undef VECTORIZEDN_DEFINE_UNARY_OP
+#undef VECTORIZEDN_DEFINE_BINARY_OP
+};
+
+#define VECTORIZEDN_DEFINE_UNARY_OP_GLOBAL(op)                       \
+  template                                        \
+  inline VectorizedN op(const VectorizedN& a) {          \
+    return a.unary_op([](const Vectorized& a) { return op(a); }); \
+  }
+
+#define VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(op)                                \
+  template                                                  \
+  inline VectorizedN op(                                                 \
+      const VectorizedN& a, const VectorizedN& b) {                \
+    return a.binary_op(b, [](const Vectorized& a, const Vectorized& b) { \
+      return op(a, b);                                                         \
+    });                                                                        \
+  }
+
+#define VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(op)                     \
+  template                                               \
+  inline VectorizedN& op(                                             \
+      VectorizedN& a, const VectorizedN& b) {                   \
+    a = a.binary_op(b, [](const Vectorized& a, const Vectorized& b) { \
+      return op(a, b);                                                      \
+    });                                                                     \
+    return a;                                                               \
+  }
+
+VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator+)
+VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator-)
+VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator*)
+VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator/)
+VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator%)
+VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator||)
+VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator<<)
+VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator>>)
+VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(maximum)
+VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(minimum)
+VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(fmadd)
+VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(fmsub)
+VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(clamp)
+VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(clamp_max)
+VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(clamp_min)
+VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator&)
+VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator|)
+VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator^)
+VECTORIZEDN_DEFINE_UNARY_OP_GLOBAL(operator~)
+
+VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(operator+=)
+VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(operator-=)
+VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(operator*=)
+VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(operator/=)
+VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(operator%=)
+VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(operator<<=)
+VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(operator>>=)
+
+#undef VECTORIZEDN_DEFINE_UNARY_OP_GLOBAL
+#undef VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL
+#undef VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL
+
+template 
+inline T vec_reduce_all(const OpVec& vec_fun, VectorizedN acc_vec) {
+  Vectorized vec_result = acc_vec[0];
+  for (int i = 1; i < N; i++) {
+    vec_result = vec_fun(vec_result, acc_vec[i]);
+  }
+  return vec_reduce_all(vec_fun, vec_result);
+}
+
+} // namespace CPU_CAPABILITY
+} // namespace at::vec
\ No newline at end of file
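A minimal usage sketch for the VectorizedN template added above, assuming a translation unit built against ATen with the vec headers on the include path; the buffer contents, the choice of N = 2, and the pairwise maximum at the end are illustrative only and not taken from the header itself.

#include <ATen/cpu/vec/vec.h>
#include <ATen/cpu/vec/vec_n.h>
#include <cstdio>
#include <vector>

int main() {
  using VecN = at::vec::VectorizedN<float, 2>;      // N * Vectorized<float>::size() lanes

  std::vector<float> in(VecN::size(), 1.5f);
  std::vector<float> out(VecN::size(), 0.0f);

  VecN a = VecN::loadu(in.data());                  // fills both underlying Vectorized<float>
  VecN b = a * a + a;                               // the global operators loop over N internally
  b.store(out.data());

  // operator[] exposes the underlying Vectorized<float> pieces, so the usual
  // Vectorized helpers still apply.
  std::vector<float> reduced(at::vec::Vectorized<float>::size());
  at::vec::maximum(b[0], b[1]).store(reduced.data());

  std::printf("out[0] = %f, pairwise max[0] = %f\n", out[0], reduced[0]);
  return 0;
}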
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cpu/vml.h b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vml.h
new file mode 100644
index 0000000000000000000000000000000000000000..45a4b88ae93bac68da49ca2f3f25375b5d6c98e5
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cpu/vml.h
@@ -0,0 +1,171 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+// This header implements various unary operations using an MKL VML style
+// interface.
+
+// It implements various functions with a simple interface
+// For example it enables the user to call vsin(float* out, const float* in,
+// size). This function takes a pointer to a contiguous output array of floats and
+// a constant input array. It will then apply sin to each value in the input
+// array and write the result into the output array. out and in may point to the
+// same memory, i.e. this fully supports in-place operations. These functions
+// also implement their own parallelization, so take precautions when calling
+// these from threaded functions.
+
+// When MKL is available it will call into MKL's VML library similar to NumPy
+// If MKL is not available it will use SLEEF.
+
+// This file might be compiled under AVX or AVX2 when called from e.g.
+// UnaryOpsKernel.cpp
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#if AT_MKL_ENABLED() && !defined(__APPLE__)
+#include 
+#endif
+
+namespace at {
+namespace vml {
+inline namespace CPU_CAPABILITY {
+
+using namespace vec;
+
+template 
+inline void vrsqrt(scalar_t* out, scalar_t* in, int64_t size) {
+  parallel_for(0, size, 2048, [out, in](int64_t begin, int64_t end) {
+    map(
+        [](const Vectorized& x) {
+          return Vectorized((scalar_t)(1)) / x.sqrt();
+        },
+        out + begin,
+        in + begin,
+        end - begin);
+  });
+}
+
+// NB: We ignore numerical errors by convention and leave them to the user
+
+#define IMPLEMENT_VML(op)                                               \
+  template                                           \
+  inline void v##op(scalar_t* out, const scalar_t* in, int64_t size) {  \
+    using vec_t = Vectorized>;                   \
+    vec::map([](vec_t x) { return x.op(); }, out, in, size);            \
+  }                                                                     \
+
+IMPLEMENT_VML(abs)
+IMPLEMENT_VML(acos)
+IMPLEMENT_VML(asin)
+IMPLEMENT_VML(atan)
+IMPLEMENT_VML(atanh)
+IMPLEMENT_VML(ceil)
+IMPLEMENT_VML(cos)
+// IMPLEMENT_VML(cosh)
+IMPLEMENT_VML(erf)
+IMPLEMENT_VML(erfc)
+IMPLEMENT_VML(erfinv)
+IMPLEMENT_VML(exp)
+IMPLEMENT_VML(expm1)
+IMPLEMENT_VML(floor)
+IMPLEMENT_VML(i0)
+IMPLEMENT_VML(i0e)
+IMPLEMENT_VML(digamma)
+IMPLEMENT_VML(reciprocal)
+IMPLEMENT_VML(log)
+IMPLEMENT_VML(log10)
+IMPLEMENT_VML(log1p)
+IMPLEMENT_VML(log2)
+IMPLEMENT_VML(neg)
+IMPLEMENT_VML(sin)
+// IMPLEMENT_VML(sinh)
+IMPLEMENT_VML(sqrt)
+IMPLEMENT_VML(round)
+IMPLEMENT_VML(rsqrt)
+IMPLEMENT_VML(tan)
+IMPLEMENT_VML(tanh)
+IMPLEMENT_VML(trunc)
+IMPLEMENT_VML(lgamma)
+
+
+#if AT_MKL_ENABLED() && !defined(__APPLE__)
+
+// NB: LP64 MKL is the most commonly used and thus we assume it here. That means
+// we need to expect MKL_INT to be of type int, which in practice means int32_t
+// or int64_t.
+static_assert(
+    std::is_same_v || std::is_same_v,
+    "MKL_INT is assumed to be int32_t or int64_t");
+#define IMPLEMENT_VML_MKL_STUB(op, mklop, type, mkltype)                \
+  template <>                                                           \
+  inline void v##op(type * out, const type * in, int64_t size) {        \
+    int64_t max_mkl_ind = std::numeric_limits::max();          \
+    if (size <= static_cast(max_mkl_ind)) {                    \
+      vm##mkltype##mklop(                                               \
+          size, in, out, VML_HA | VML_FTZDAZ_OFF | VML_ERRMODE_IGNORE); \
+    } else {                                                            \
+      MKL_INT ind = 0;                                                  \
+      int64_t chunks = size / max_mkl_ind;                              \
+      int64_t rest = size % max_mkl_ind;                                \
+      for (; ind < chunks; ind++) {                                     \
+        vm##mkltype##mklop(                                             \
+            max_mkl_ind,                                                \
+            in + ind * max_mkl_ind,                                     \
+            out + ind * max_mkl_ind,                                    \
+            VML_HA | VML_FTZDAZ_OFF | VML_ERRMODE_IGNORE);              \
+      }                                                                 \
+      vm##mkltype##mklop(                                               \
+          rest,                                                         \
+          in + ind * max_mkl_ind,                                       \
+          out + ind * max_mkl_ind,                                      \
+          VML_HA | VML_FTZDAZ_OFF | VML_ERRMODE_IGNORE);                \
+    }                                                                   \
+  }
+
+#define IMPLEMENT_VML_MKL(op, mklop)          \
+  IMPLEMENT_VML_MKL_STUB(op, mklop, float, s) \
+  IMPLEMENT_VML_MKL_STUB(op, mklop, double, d)
+
+// NB: abs, cosh and sinh were temporarily disabled due to issues with Apple
+// NB: expm1 is disabled because on some configs it produces expm1(nan)=-1
+IMPLEMENT_VML_MKL(acos, Acos)
+IMPLEMENT_VML_MKL(asin, Asin)
+IMPLEMENT_VML_MKL(atan, Atan)
+IMPLEMENT_VML_MKL(cos, Cos)
+// IMPLEMENT_VML_MKL(cosh, Cosh)
+IMPLEMENT_VML_MKL(erf, Erf)
+IMPLEMENT_VML_MKL(erfc, Erfc)
+IMPLEMENT_VML_MKL(erfinv, ErfInv)
+IMPLEMENT_VML_MKL(exp, Exp)
+// IMPLEMENT_VML_MKL(expm1, Expm1)
+IMPLEMENT_VML_MKL(log, Ln)
+IMPLEMENT_VML_MKL(log10, Log10)
+IMPLEMENT_VML_MKL(sin, Sin)
+// IMPLEMENT_VML_MKL(sinh, Sinh)
+IMPLEMENT_VML_MKL(sqrt, Sqrt)
+IMPLEMENT_VML_MKL(tan, Tan)
+IMPLEMENT_VML_MKL(tanh, Tanh)
+IMPLEMENT_VML_MKL(trunc, Trunc)
+
+// Not vectorized in MKL version tested
+// IMPLEMENT_VML_MKL(abs, Abs)
+// IMPLEMENT_VML_MKL(log1p, Log1p)
+
+#if INTEL_MKL_VERSION >= 20180406
+IMPLEMENT_VML_MKL(log2, Log2)
+#endif
+
+#endif
+
+} // namespace
+} // namespace vml
+} // namespace at
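A small sketch of how the vml.h wrappers above are typically called, assuming the file is compiled as part of a program built against ATen/libtorch (parallel_for needs the library at link time); the array size and values are arbitrary examples.

#include <ATen/cpu/vml.h>
#include <cstdio>
#include <vector>

int main() {
  const int64_t n = 1024;                      // arbitrary example size
  std::vector<float> in(n, 0.5f), out(n, 0.0f);

  at::vml::vsin(out.data(), in.data(), n);     // out[i] = sin(in[i]); MKL or SLEEF underneath
  at::vml::vlog1p(out.data(), out.data(), n);  // in-place use is allowed per the header comment

  std::printf("%f\n", out[0]);
  return 0;
}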
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/ATenCUDAGeneral.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/ATenCUDAGeneral.h
new file mode 100644
index 0000000000000000000000000000000000000000..3938aa341bb3943a9e42a3178d3233868b755101
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/ATenCUDAGeneral.h
@@ -0,0 +1,9 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+#include 
+
+// Use TORCH_CUDA_CPP_API or TORCH_CUDA_CU_API for exports from this folder
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/ApplyGridUtils.cuh b/MLPY/Lib/site-packages/torch/include/ATen/cuda/ApplyGridUtils.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..b67b0905a09fd2a1bb17f7cc69863fd849ded1ff
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/ApplyGridUtils.cuh
@@ -0,0 +1,47 @@
+#include 
+
+#include 
+
+namespace at::cuda {
+
+/**
+   Computes ceil(a / b)
+*/
+template 
+__host__ __device__ __forceinline__ T ATenCeilDiv(T a, T b) {
+  return (a + b - 1) / b;
+}
+
+namespace {
+
+// Threads per block for our apply kernel
+// FIXME: use occupancy calculator instead
+constexpr uint32_t AT_APPLY_THREADS_PER_BLOCK = 512;
+constexpr uint32_t AT_APPLY_BLOCKS_PER_SM = 4;
+
+template 
+inline bool getApplyGrid(uint64_t totalElements, dim3& grid, c10::DeviceIndex curDevice, int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK) {
+  if (curDevice == -1) return false;
+  uint64_t numel_per_thread = static_cast(max_threads_per_block) * static_cast(step);
+  uint64_t numBlocks = ATenCeilDiv(totalElements, numel_per_thread);
+  uint64_t maxGridX = at::cuda::getDeviceProperties(curDevice)->maxGridSize[0];
+  if (numBlocks > maxGridX)
+    numBlocks = maxGridX;
+  grid = dim3(numBlocks);
+  return true;
+}
+
+constexpr int getApplyBlocksPerSM() {
+  return AT_APPLY_BLOCKS_PER_SM;
+}
+
+constexpr int getApplyBlockSize() {
+  return AT_APPLY_THREADS_PER_BLOCK;
+}
+
+inline dim3 getApplyBlock(int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK) {
+  return dim3(max_threads_per_block);
+}
+
+} // anonymous namespace
+} // namespace at::cuda
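A short sketch (for a .cu translation unit built with ATen's CUDA headers) of how the helpers above are meant to be combined when picking a launch configuration; the function name and its arguments are placeholders, not part of the header.

#include <ATen/cuda/ApplyGridUtils.cuh>
#include <ATen/cuda/CUDAContext.h>

// Returns true and fills block/grid if a launch configuration could be chosen.
bool pick_launch_config(uint64_t totalElements, dim3& block, dim3& grid) {
  block = at::cuda::getApplyBlock();                 // 512 threads per block by default
  c10::DeviceIndex dev = at::cuda::current_device();
  // step = 1: one element per thread; grid.x is clamped to the device's maxGridSize[0].
  return at::cuda::getApplyGrid<1>(totalElements, grid, dev);
}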
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/AsmUtils.cuh b/MLPY/Lib/site-packages/torch/include/ATen/cuda/AsmUtils.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..1daf0349042c77bf0627c61ecfa294a5b5c73a3c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/AsmUtils.cuh
@@ -0,0 +1,149 @@
+#pragma once
+#include 
+
+// Collection of direct PTX functions
+
+namespace at::cuda {
+
+template 
+struct Bitfield {};
+
+template <>
+struct Bitfield {
+  static __device__ __host__ __forceinline__
+  unsigned int getBitfield(unsigned int val, int pos, int len) {
+#if !defined(__CUDA_ARCH__)
+    pos &= 0xff;
+    len &= 0xff;
+
+    unsigned int m = (1u << len) - 1u;
+    return (val >> pos) & m;
+#else
+    unsigned int ret;
+    asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len));
+    return ret;
+#endif
+  }
+
+  static __device__ __host__ __forceinline__
+  unsigned int setBitfield(unsigned int val, unsigned int toInsert, int pos, int len) {
+#if !defined(__CUDA_ARCH__)
+    pos &= 0xff;
+    len &= 0xff;
+
+    unsigned int m = (1u << len) - 1u;
+    toInsert &= m;
+    toInsert <<= pos;
+    m <<= pos;
+
+    return (val & ~m) | toInsert;
+#else
+    unsigned int ret;
+    asm("bfi.b32 %0, %1, %2, %3, %4;" :
+        "=r"(ret) : "r"(toInsert), "r"(val), "r"(pos), "r"(len));
+    return ret;
+#endif
+  }
+};
+
+template <>
+struct Bitfield {
+  static __device__ __host__ __forceinline__
+  uint64_t getBitfield(uint64_t val, int pos, int len) {
+#if !defined(__CUDA_ARCH__)
+    pos &= 0xff;
+    len &= 0xff;
+
+    uint64_t m = (1u << len) - 1u;
+    return (val >> pos) & m;
+#else
+    uint64_t ret;
+    asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len));
+    return ret;
+#endif
+  }
+
+  static __device__ __host__ __forceinline__
+  uint64_t setBitfield(uint64_t val, uint64_t toInsert, int pos, int len) {
+#if !defined(__CUDA_ARCH__)
+    pos &= 0xff;
+    len &= 0xff;
+
+    uint64_t m = (1u << len) - 1u;
+    toInsert &= m;
+    toInsert <<= pos;
+    m <<= pos;
+
+    return (val & ~m) | toInsert;
+#else
+    uint64_t ret;
+    asm("bfi.b64 %0, %1, %2, %3, %4;" :
+        "=l"(ret) : "l"(toInsert), "l"(val), "r"(pos), "r"(len));
+    return ret;
+#endif
+  }
+};
+
+__device__ __forceinline__ int getLaneId() {
+#if defined(USE_ROCM)
+  return __lane_id();
+#else
+  int laneId;
+  asm("mov.s32 %0, %%laneid;" : "=r"(laneId) );
+  return laneId;
+#endif
+}
+
+#if defined(USE_ROCM)
+__device__ __forceinline__ unsigned long long int getLaneMaskLt() {
+  const std::uint64_t m = (1ull << getLaneId()) - 1ull;
+  return m;
+}
+#else
+__device__ __forceinline__ unsigned getLaneMaskLt() {
+  unsigned mask;
+  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask));
+  return mask;
+}
+#endif
+
+#if defined (USE_ROCM)
+__device__ __forceinline__ unsigned long long int getLaneMaskLe() {
+  std::uint64_t m = UINT64_MAX >> (sizeof(std::uint64_t) * CHAR_BIT - (getLaneId() + 1));
+  return m;
+}
+#else
+__device__ __forceinline__ unsigned getLaneMaskLe() {
+  unsigned mask;
+  asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask));
+  return mask;
+}
+#endif
+
+#if defined(USE_ROCM)
+__device__ __forceinline__ unsigned long long int getLaneMaskGt() {
+  const std::uint64_t m = getLaneMaskLe();
+  return m ? ~m : m;
+}
+#else
+__device__ __forceinline__ unsigned getLaneMaskGt() {
+  unsigned mask;
+  asm("mov.u32 %0, %%lanemask_gt;" : "=r"(mask));
+  return mask;
+}
+#endif
+
+#if defined(USE_ROCM)
+__device__ __forceinline__ unsigned long long int getLaneMaskGe() {
+  const std::uint64_t m = getLaneMaskLt();
+  return ~m;
+}
+#else
+__device__ __forceinline__ unsigned getLaneMaskGe() {
+  unsigned mask;
+  asm("mov.u32 %0, %%lanemask_ge;" : "=r"(mask));
+  return mask;
+}
+#endif
+
+} // namespace at::cuda
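A device-code sketch (CUDA path only, not the ROCm variants) using the bit-field and lane-mask helpers defined above; the kernel, the bit positions, and the output layout are invented for illustration.

#include <ATen/cuda/AsmUtils.cuh>

__global__ void lane_demo(unsigned int* out) {
  // Extract bits [8, 12) of the thread index (bfe.u32 on device, plain shifts on host).
  unsigned int field =
      at::cuda::Bitfield<unsigned int>::getBitfield(threadIdx.x, 8, 4);

  // %lanemask_lt has one bit per lane below this one, so its popcount equals the lane id.
  unsigned int lanes_below = __popc(at::cuda::getLaneMaskLt());

  unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
  out[idx] = field + lanes_below;
}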
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/Atomic.cuh b/MLPY/Lib/site-packages/torch/include/ATen/cuda/Atomic.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..1e66b2fdce4eb54e425885a9e400490350574c7f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/Atomic.cuh
@@ -0,0 +1,508 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+#include 
+
+#if !(defined(USE_ROCM) || ((defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800))))
+#include 
+#endif
+
+template 
+struct AtomicFPOp;
+
+template <>
+struct AtomicFPOp {
+  template 
+  inline __device__ at::Half operator() (at::Half *address, at::Half val, const func_t& func) {
+    unsigned int * address_as_ui =
+      (unsigned int *) ((char *)address - ((size_t)address & 2));
+    unsigned int old = *address_as_ui;
+    unsigned int assumed;
+
+    at::Half hsum;
+    do {
+      assumed = old;
+      hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
+      hsum = func(hsum, val);
+      old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
+      old = atomicCAS(address_as_ui, assumed, old);
+    } while (assumed != old);
+    hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
+    return hsum;
+  }
+};
+
+template <>
+struct AtomicFPOp {
+  template 
+  inline __device__ at::BFloat16 operator() (at::BFloat16 *address, at::BFloat16 val, const func_t& func) {
+    unsigned int * address_as_ui =
+      (unsigned int *) ((char *)address - ((size_t)address & 2));
+    unsigned int old = *address_as_ui;
+    unsigned int assumed;
+
+    at::BFloat16 bsum;
+    do {
+      assumed = old;
+      bsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
+      bsum = func(bsum, val);
+      old = (size_t)address & 2 ? (old & 0xffff) | (bsum.x << 16) : (old & 0xffff0000) | bsum.x;
+      old = atomicCAS(address_as_ui, assumed, old);
+    } while (assumed != old);
+    bsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
+    return bsum.x;
+  }
+};
+
+template <>
+struct AtomicFPOp {
+  template 
+  inline __device__ double operator() (double * address, double val, const func_t& func) {
+    unsigned long long int* address_as_ull = (unsigned long long int*)address;
+    unsigned long long int old = *address_as_ull;
+    unsigned long long int assumed;
+
+    do {
+      assumed = old;
+      old = atomicCAS(address_as_ull, assumed, func(val, assumed));
+      // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
+    } while (assumed != old);
+
+    return __longlong_as_double(old);
+  }
+};
+
+#define ATOMIC_INTEGER_IMPL(NAME)                                                                                      \
+template                                                                                         \
+struct Atomic##NAME##IntegerImpl;                                                                                      \
+                                                                                                                       \
+template                                                                                                   \
+struct Atomic##NAME##IntegerImpl {                                                                               \
+  template                                                                                            \
+  inline __device__ void operator()(T *address, T val, const func_t& func) {                                           \
+    size_t offset = (size_t)address & 3;                                                                               \
+    uint32_t * address_as_ui = (uint32_t *)((char *)address - offset);                                                 \
+    uint32_t old = *address_as_ui;                                                                                     \
+    uint32_t shift = offset * 8;                                                                                       \
+    uint32_t old_byte;                                                                                                 \
+    uint32_t newval;                                                                                                   \
+    uint32_t assumed;                                                                                                  \
+                                                                                                                       \
+    do {                                                                                                               \
+      assumed = old;                                                                                                   \
+      old_byte = (old >> shift) & 0xff;                                                                                \
+      newval = static_cast(func(val, static_cast(old_byte)));                                              \
+      newval = (old & ~(0x000000ff << shift)) | (newval << shift);                                                     \
+      old = atomicCAS(address_as_ui, assumed, newval);                                                                 \
+    } while (assumed != old);                                                                                          \
+  }                                                                                                                    \
+};                                                                                                                     \
+                                                                                                                       \
+template                                                                                                   \
+struct Atomic##NAME##IntegerImpl {                                                                               \
+  template                                                                                            \
+  inline __device__ void operator()(T *address, T val, const func_t& func) {                                           \
+    size_t offset = (size_t)address & 2;                                                                               \
+    uint32_t * address_as_ui = (uint32_t *)((char *)address - offset);                                                 \
+    bool is_32_align = offset;                                                                                         \
+    uint32_t old = *address_as_ui;                                                                                     \
+    uint32_t old_bytes;                                                                                                \
+    uint32_t newval;                                                                                                   \
+    uint32_t assumed;                                                                                                  \
+                                                                                                                       \
+    do {                                                                                                               \
+      assumed = old;                                                                                                   \
+      old_bytes = is_32_align ? old >> 16 : old & 0xffff;                                                              \
+      newval = static_cast(func(val, static_cast(old_bytes)));                                            \
+      newval = is_32_align ? (old & 0xffff) | (newval << 16) : (old & 0xffff0000) | newval;                            \
+      old = atomicCAS(address_as_ui, assumed, newval);                                                                 \
+    } while (assumed != old);                                                                                          \
+  }                                                                                                                    \
+};                                                                                                                     \
+                                                                                                                       \
+template                                                                                                   \
+struct Atomic##NAME##IntegerImpl {                                                                               \
+  template                                                                                            \
+  inline __device__ void operator()(T *address, T val, const func_t& func) {                                           \
+    uint32_t * address_as_ui = (uint32_t *) (address);                                                                 \
+    uint32_t old = *address_as_ui;                                                                                     \
+    uint32_t newval;                                                                                                   \
+    uint32_t assumed;                                                                                                  \
+                                                                                                                       \
+    do {                                                                                                               \
+      assumed = old;                                                                                                   \
+      newval = static_cast(func(val, static_cast(old)));                                                  \
+      old = atomicCAS(address_as_ui, assumed, newval);                                                                 \
+    } while (assumed != old);                                                                                          \
+  }                                                                                                                    \
+};                                                                                                                     \
+                                                                                                                       \
+template                                                                                                   \
+struct Atomic##NAME##IntegerImpl {                                                                               \
+  template                                                                                            \
+  inline __device__ void operator()(T *address, T val, const func_t& func) {                                           \
+    unsigned long long * address_as_ui = (unsigned long long *) (address);                                             \
+    unsigned long long old = *address_as_ui;                                                                           \
+    unsigned long long newval;                                                                                         \
+    unsigned long long assumed;                                                                                        \
+                                                                                                                       \
+    do {                                                                                                               \
+      assumed = old;                                                                                                   \
+      newval = static_cast(func(val, static_cast(old)));                                                  \
+      old = atomicCAS(address_as_ui, assumed, newval);                                                                 \
+    } while (assumed != old);                                                                                          \
+  }                                                                                                                    \
+};
+
+
+# define GPU_ATOMIC_INTEGER(NAME, OP, DTYPE)                                                                           \
+static inline __device__ void gpuAtomic##NAME(DTYPE *address, DTYPE val) {                                             \
+Atomic##NAME##IntegerImpl()(address,                                                             \
+                                                      val,                                                             \
+                                                      [](DTYPE a, DTYPE b) {                                           \
+                                                          return OP;                                                   \
+                                                      });                                                              \
+}                                                                                                                      \
+
+ATOMIC_INTEGER_IMPL(Add)
+GPU_ATOMIC_INTEGER(Add, a || b, bool)
+
+// Don't instantiate gpuAtomicAdd with the macro as it seems non-standard (see int32, int64)
+static inline __device__ void gpuAtomicAdd(uint8_t *address, uint8_t val) {
+  AtomicAddIntegerImpl()(address,
+                                                   val,
+                                                   [](uint8_t a, uint8_t b) {
+                                                      return a + b;
+                                                   });
+}
+
+static inline  __device__ void gpuAtomicAdd(int8_t *address, int8_t val) {
+  AtomicAddIntegerImpl()(address,
+                                                 val,
+                                                 [](int8_t a, int8_t b) {
+                                                   return a + b;
+                                                 });
+}
+
+static inline  __device__ void gpuAtomicAdd(int16_t *address, int16_t val) {
+  AtomicAddIntegerImpl()(address,
+                                                   val,
+                                                   [](int16_t a, int16_t b) {
+                                                     return a + b;
+                                                   });
+}
+
+static inline __device__ int32_t gpuAtomicAdd(int32_t *address, int32_t val) {
+  return atomicAdd(address, val);
+}
+
+static inline __device__ void gpuAtomicAdd(int64_t *address, int64_t val) {
+#if defined(USE_ROCM)
+  __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
+#else
+  static_assert(sizeof(unsigned long long int) == sizeof(int64_t), "bitwidth change is not allowed");
+  atomicAdd(reinterpret_cast(address), static_cast(val));
+#endif
+}
+
+static inline  __device__ at::Half gpuAtomicAdd(at::Half *address, at::Half val) {
+#if defined(USE_ROCM) || ((defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
+  return AtomicFPOp()(address, val,
+                                [](at::Half hsum, at::Half val) {
+                                  return hsum + val;
+                                });
+#else
+  return atomicAdd(reinterpret_cast<__half*>(address), val);
+#endif
+}
+
+static inline __device__ at::BFloat16 gpuAtomicAdd(at::BFloat16 *address, at::BFloat16 val) {
+#if defined(USE_ROCM) || ((defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)))
+return AtomicFPOp()(address, val,
+                                  [](at::BFloat16 bsum, at::BFloat16 val) {
+                                    return bsum + val;
+                                  });
+#else
+  __nv_bfloat16 r = atomicAdd(reinterpret_cast<__nv_bfloat16*>(address), *reinterpret_cast<__nv_bfloat16*>(&val));
+  return *reinterpret_cast(&r);
+#endif
+}
+
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)
+// from the CUDA C Programming Guide
+static inline __device__ double atomicAdd(double* address, double val)
+#if defined(__clang__) && defined(__CUDA__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wgcc-compat"
+    __attribute__((enable_if(true, "")))
+#pragma GCC diagnostic pop
+#endif
+{
+
+  return AtomicFPOp()(address, val,
+                              [](double val, unsigned long long int assumed) {
+                                return __double_as_longlong(val + __longlong_as_double(assumed));
+                              });
+}
+#elif defined(USE_ROCM) || !(defined(__CUDA_ARCH__))
+
+/* Note [hip-clang differences to hcc]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * The upcoming hip-clang compiler for ROCm differs from hcc in a few details.
+ * It exports the __HIP__ macro, we can hence differentiate between hcc and
+ * hip-clang. In the below, hcc only received support for atomicAdd with double
+ * typing after work week 18312. hip-clang had support from the first version.
+ * In general, the code-visible differences between hip-clang and hcc will be
+ * minimal.
+ */
+
+#if defined(USE_ROCM) && __hcc_workweek__ < 18312 && !__HIP__
+  // This needs to be defined for the host side pass
+  static inline  __device__  double atomicAdd(double *address, double val) { }
+#endif
+#endif
+
+static inline __device__ double gpuAtomicAdd(double *address, double val) {
+  return atomicAdd(address, val);
+}
+
+static inline __device__ float gpuAtomicAdd(float *address, float val) {
+  return atomicAdd(address, val);
+}
+
+template
+static inline __device__ void gpuAtomicAdd(c10::complex *address, c10::complex val) {
+  gpuAtomicAdd(&address->real_, val.real_);
+  gpuAtomicAdd(&address->imag_, val.imag_);
+}
+
+/* Note [gpuAtomicAdd vs atomicAdd]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * Some extensions such as torchvision call atomicAdd()
+ * directly and require non-library provided data type support. Only for these, we
+ * continue to provide atomicAdd overloads.
+ */
+static inline __device__ at::Half atomicAdd(at::Half *address, at::Half val) {
+  return gpuAtomicAdd(address, val);
+}
+
+static inline __device__ at::BFloat16 atomicAdd(at::BFloat16 *address, at::BFloat16 val) {
+  return gpuAtomicAdd(address, val);
+}
+
+static inline __device__ void atomicAdd(uint8_t *address, uint8_t val) {
+  gpuAtomicAdd(address, val);
+}
+
+static inline  __device__ void atomicAdd(int8_t *address, int8_t val) {
+  gpuAtomicAdd(address, val);
+}
+
+static inline  __device__ void atomicAdd(int16_t *address, int16_t val) {
+  gpuAtomicAdd(address, val);
+}
+
+static inline __device__ void atomicAdd(int64_t *address, int64_t val) {
+  gpuAtomicAdd(address, val);
+}
+
+static inline __device__ void atomicAdd(bool *address, bool val) {
+  gpuAtomicAdd(address, val);
+}
+
+/* Note [explicitly non-returning atomics]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * AMD's MI100 (gfx908) provides an optimized fp32 atomicAdd, exposed via atomicAddNoRet().
+ * Due to compiler limitations, callers must opt-in to guarantee the optimized instruction.
+ * This non-returning atomicAddNoRet cannot be used to implement the returning atomicAdd,
+ * therefore we need a new API 'gpuAtomicAddNoReturn'.
+ */
+template
+static inline __device__ void gpuAtomicAddNoReturn(c10::complex *address, c10::complex val) { gpuAtomicAdd(address, val); }
+static inline __device__ void gpuAtomicAddNoReturn(uint8_t *address, uint8_t val) { gpuAtomicAdd(address, val); }
+static inline __device__ void gpuAtomicAddNoReturn(int8_t *address, int8_t val) { gpuAtomicAdd(address, val); }
+static inline __device__ void gpuAtomicAddNoReturn(int16_t *address, int16_t val) { gpuAtomicAdd(address, val); }
+static inline __device__ void gpuAtomicAddNoReturn(int32_t *address, int32_t val) { gpuAtomicAdd(address, val); }
+static inline __device__ void gpuAtomicAddNoReturn(int64_t *address, int64_t val) { gpuAtomicAdd(address, val); }
+static inline __device__ void gpuAtomicAddNoReturn(bool *address, bool val) { gpuAtomicAdd(address, val); }
+static inline __device__ void gpuAtomicAddNoReturn(at::Half *address, at::Half val) { gpuAtomicAdd(address, val); }
+static inline __device__ void gpuAtomicAddNoReturn(at::BFloat16 *address, at::BFloat16 val) { gpuAtomicAdd(address, val); }
+static inline __device__ void gpuAtomicAddNoReturn(double *address, double val) { gpuAtomicAdd(address, val); }
+
+/* Special case fp32 atomic. */
+#if defined(USE_ROCM)
+static inline __device__ void gpuAtomicAddNoReturn(float *address, float val) { atomicAddNoRet(address, val); }
+#else
+static inline __device__ void gpuAtomicAddNoReturn(float *address, float val) { gpuAtomicAdd(address, val); }
+#endif
+
+// Atomic multiplication implementation.
+
+ATOMIC_INTEGER_IMPL(Mul)
+GPU_ATOMIC_INTEGER(Mul, a * b, uint8_t)
+GPU_ATOMIC_INTEGER(Mul, a * b, int8_t)
+GPU_ATOMIC_INTEGER(Mul, a * b, int16_t)
+GPU_ATOMIC_INTEGER(Mul, a * b, int32_t)
+GPU_ATOMIC_INTEGER(Mul, a * b, int64_t)
+
+inline __device__ at::Half gpuAtomicMul(at::Half * address, at::Half val) {
+  return AtomicFPOp()(address, val,
+                                [](at::Half bsum, at::Half val) {
+                                  return bsum * val;
+                                });
+}
+
+inline __device__ at::BFloat16 gpuAtomicMul(at::BFloat16 * address, at::BFloat16 val) {
+  return AtomicFPOp()(address, val,
+                                    [](at::BFloat16 bsum, at::BFloat16 val) {
+                                      return bsum * val;
+                                    });
+}
+
+inline __device__ double gpuAtomicMul(double * address, double val) {
+  return AtomicFPOp()(address, val,
+                              [](double val, unsigned long long int assumed) {
+                                return __double_as_longlong(val * __longlong_as_double(assumed));
+                              });
+}
+
+// Don't use a templated function for this since the addition function defaults to the CUDA built-in.
+inline __device__ float gpuAtomicMul (float * address, float val) {
+  unsigned int* address_as_ull = (unsigned int*)address;
+  unsigned int old = *address_as_ull;
+  unsigned int assumed;
+
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_ull, assumed,
+                    __float_as_int(val *
+                                   __int_as_float(assumed)));
+
+    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
+  } while (assumed != old);
+
+  return __int_as_float(old);
+}
+
+// Atomic maximum implementation.
+
+template 
+__host__ __device__ T safe_max(T a, T b) {
+  #if defined(__HIPCC__)
+  // TODO: remove this special case for HIP when issue is fixed:
+  //       https://github.com/ROCm-Developer-Tools/HIP/issues/2209
+    T max = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::max(a, b));
+  #else
+    T max = at::_isnan(b) ? b : std::max(a, b);
+  #endif
+
+  return max;
+}
+
+ATOMIC_INTEGER_IMPL(Max)
+GPU_ATOMIC_INTEGER(Max, safe_max(a, b), uint8_t)
+GPU_ATOMIC_INTEGER(Max, safe_max(a, b), int8_t)
+GPU_ATOMIC_INTEGER(Max, safe_max(a, b), int16_t)
+GPU_ATOMIC_INTEGER(Max, safe_max(a, b), int32_t)
+GPU_ATOMIC_INTEGER(Max, safe_max(a, b), int64_t)
+
+inline __device__ at::Half gpuAtomicMax(at::Half * address, at::Half val) {
+  return AtomicFPOp()(address, val,
+                                [](at::Half bsum, at::Half val) {
+                                  return safe_max(bsum, val);
+                                });
+}
+
+inline __device__ at::BFloat16 gpuAtomicMax(at::BFloat16 * address, at::BFloat16 val) {
+  return AtomicFPOp()(address, val,
+                                    [](at::BFloat16 bsum, at::BFloat16 val) {
+                                      return safe_max(bsum, val);
+                                    });
+}
+
+inline __device__ double gpuAtomicMax(double * address, double val) {
+  return AtomicFPOp()(address, val,
+                              [](double val, unsigned long long int assumed) {
+                                return __double_as_longlong(safe_max(val, __longlong_as_double(assumed)));
+                              });
+}
+
+// Don't use a templated function for this since the addition function defaults to the CUDA built-in.
+inline __device__ float gpuAtomicMax(float * address, float val) {
+  unsigned int* address_as_ull = (unsigned int*)address;
+  unsigned int old = *address_as_ull;
+  unsigned int assumed;
+
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_ull, assumed,
+                    __float_as_int(safe_max(val, __int_as_float(assumed))));
+
+    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
+  } while (assumed != old);
+
+  return __int_as_float(old);
+}
+
+// Atomic minimum implementation.
+
+template 
+__host__ __device__ T safe_min(T a, T b) {
+  #if defined(__HIPCC__)
+  // TODO: remove this special case for HIP when issue is fixed:
+  //       https://github.com/ROCm-Developer-Tools/HIP/issues/2209
+    T min = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::min(a, b));
+  #else
+    T min = at::_isnan(b) ? b : std::min(a, b);
+  #endif
+
+  return min;
+}
+
+ATOMIC_INTEGER_IMPL(Min)
+GPU_ATOMIC_INTEGER(Min, safe_min(a, b), uint8_t)
+GPU_ATOMIC_INTEGER(Min, safe_min(a, b), int8_t)
+GPU_ATOMIC_INTEGER(Min, safe_min(a, b), int16_t)
+GPU_ATOMIC_INTEGER(Min, safe_min(a, b), int32_t)
+GPU_ATOMIC_INTEGER(Min, safe_min(a, b), int64_t)
+
+inline __device__ at::Half gpuAtomicMin(at::Half * address, at::Half val) {
+  return AtomicFPOp()(address, val,
+                                [](at::Half bsum, at::Half val) {
+                                  return safe_min(bsum, val);
+                                });
+}
+
+inline __device__ at::BFloat16 gpuAtomicMin(at::BFloat16 * address, at::BFloat16 val) {
+  return AtomicFPOp()(address, val,
+                                    [](at::BFloat16 bsum, at::BFloat16 val) {
+                                      return safe_min(bsum, val);
+                                    });
+}
+
+inline __device__ double gpuAtomicMin(double * address, double val) {
+  return AtomicFPOp()(address, val,
+                              [](double val, unsigned long long int assumed) {
+                                return __double_as_longlong(safe_min(val, __longlong_as_double(assumed)));
+                              });
+}
+
+// Don't use a templated function for this since the addition function defaults to the CUDA built-in.
+inline __device__ float gpuAtomicMin(float * address, float val) {
+  unsigned int* address_as_ull = (unsigned int*)address;
+  unsigned int old = *address_as_ull;
+  unsigned int assumed;
+
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_ull, assumed,
+                    __float_as_int(safe_min(val, __int_as_float(assumed))));
+
+    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
+  } while (assumed != old);
+
+  return __int_as_float(old);
+}
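A sketch of how kernels typically consume the helpers above, assuming a .cu file that includes Atomic.cuh; the reduction pattern and buffer names are examples rather than anything the header prescribes.

#include <ATen/cuda/Atomic.cuh>

__global__ void reduce_sum_and_max(const float* in, int n, float* sum, float* max_val) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // Non-returning add: opts into the faster atomicAddNoRet path on ROCm,
    // and falls back to gpuAtomicAdd elsewhere.
    gpuAtomicAddNoReturn(sum, in[i]);
    // CAS-loop float max built on the NaN-aware safe_max helper.
    gpuAtomicMax(max_val, in[i]);
  }
}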
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDAApplyUtils.cuh b/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDAApplyUtils.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..eb26308c52dfc4b1c62b67b22a76d6a6a37c241c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDAApplyUtils.cuh
@@ -0,0 +1,537 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+//
+// This file contains pointwise operation functions and kernels that
+// work on both contiguous and non-contiguous tensor arguments of
+// arbitrary (up to MAX_CUTORCH_DIMS) dimensions without
+// copying or temporary storage.
+//
+
+/*
+  NOTE [ CUDA_tensor_applyN helpers ]
+
+  The following CUDA_tensor_applyN (where N currently can be 1, 2, 3, or 4)
+  functions apply a pointwise operator to N tensor(s).
+
+  The calling convention is
+
+  1. The template arguments should be, sequentially,
+    - First N typename args specify the scalar types of each of the N tensors.
+    - (Optional) `int step` arg specifies the number of elements processed
+      together at the same time.
+      Default is 1.
+    - A usually omitted (i.e., inferred) typename arg specifies the type of the
+      function/functor applied on `N * step` values  in each iteration of each
+      CUDA thread.
+  2. The arguments should be, sequentially,
+    - N tensors
+    - op: a function/functor that processes `N * step` values at the same time.
+      - If `step == 1`, it must have signature
+        `void(*)(scalar1_t&, scalar2_t&, ..., scalarN_t&)`, where
+        `scalar*_t`s are the first N typename template args, and the inputs
+        are the `N` values from the `N` tensors retrieved at a common index.
+      - Otherwise, it must have signature
+          void(*)(int n, scalar1_t&, scalar1_t&, ..., scalar1_t&,  // repeat `step` times
+                         scalar2_t&, scalar2_t&, ..., scalar2_t&,  // repeat `step` times
+                         ...,
+                         scalarN_t&, scalarN_t&, ..., scalarN_t&)  // repeat `step` times
+        Different from `step == 1` case, it processes `N * step` values taken
+        from `step` common indices. Moreover, the first input `n` represents the
+        number of valid indices (it will always have `0 < n <= step`). It will
+        almost always be `step`, but at the boundary we may not have full `step`
+        elements and `n` can be a lesser value.
+
+        E.g., if `step == 4` and `N == 2`, `op` could be
+
+          [](int n, scalar1_t &u1, scalar1_t &u2, scalar1_t &u3, scalar1_t &u4,
+                    scalar2_t &v1, scalar2_t &v2, scalar2_t &v3, scalar2_t &v4) {
+            // Only process u1, ..., un and v1, ..., vn.
+            // So if `n == 3`, `u4` and `v4` need not be considered.
+          }
+
+      In both cases, the references can actually be const, but at least one of
+      them should be non-const in order to write the output.
+    - (Optional, but recommended) N TensorArgType args that specify for each
+      tensor whether `op` reads AND writes (i.e., TensorArgType::ReadWrite),
+      or only reads (i.e., TensorArgType::ReadOnly).
+      Default is TensorArgType::ReadWrite for first Tensor, and
+                 TensorArgType::ReadOnly  for the rest.
+
+  E.g.,
+
+  to compute a = b^2 for a and b of same dtype, we can call
+
+  CUDA_tensor_apply2(
+    a, b,
+    [] __device__ (scalar &a_val, const scalar &b_val) { a_val = b_val * b_val; }
+  );
+
+  to work on 2 values at the same time, we can call
+
+  CUDA_tensor_apply2(
+    a, b,
+    [] __device__ (int n, scalar1 &a_val1, scalar1 &a_val2,
+                          const scalar2 &b_val1, const scalar2 &b_val2) {
+      // call special vectorized op here, or just do elementwise and enjoy unrolling...
+      // if n == 1, only process a_val1 and b_val1
+    }
+  );
+*/
+
+namespace at::cuda {
+
+// TODO: combine with TensorArg?  So far that's been for debugging, and this is functional...
+enum class TensorArgType { ReadWrite, ReadOnly };
+
+namespace {
+
+// Rearrange dimensions for pointwise operations so that strides are in
+// decreasing order as much as possible, so that kernels have better memory
+// access patterns.
+//
+// For example, consider a binary operation on two "transposed" 2-dim tensors:
+//    sizes:          256 512
+//    aInfo->strides:   1 256
+//    bInfo->strides:   1 256
+//
+// Given this, each concurrent memory access inside kernelPointwiseApply2() is
+// exactly 256 elements apart, resulting in poor performance.
+//
+// This function exchanges dimensions so that memory access is contiguous:
+//    sizes:          512 256
+//    aInfo->strides: 256   1
+//    bInfo->strides: 256   1
+//
+// (Actually, it becomes even better because now collapseDims() can turn each
+// input into one contiguous array.)
+//
+// In general, given M (<=4) TensorInfo's with N dimensions, we can view each
+// strides[i] (0 <= i < N) as an M-tuple.  Given each pair i < j, we exchange
+// strides[i] and [j] if
+//    (1) strides[i][k] < strides[j][k] for some k (0 <= k < M)
+//        (exchanging them will benefit input #k), and
+//    (2) strides[i][k] <= strides[j][k] for all k
+//        (exchanging them will not make any input worse).
+template 
+inline void rearrangeDims(detail::TensorInfo* aInfo,
+                          detail::TensorInfo* bInfo = nullptr,
+                          detail::TensorInfo* cInfo = nullptr,
+                          detail::TensorInfo* dInfo = nullptr) {
+  int numInfos = 1;
+  int dims = aInfo->dims;
+  IndexType *sizes[4] = { aInfo->sizes, };
+  IndexType *strides[4] = { aInfo->strides, };
+
+  if (bInfo != nullptr) {
+    ++numInfos;
+    if (bInfo->dims != dims) return;
+    sizes[1] = bInfo->sizes;
+    strides[1] = bInfo->strides;
+  }
+
+  if (cInfo != nullptr) {
+    ++numInfos;
+    if (cInfo->dims != dims) return;
+    sizes[2] = cInfo->sizes;
+    strides[2] = cInfo->strides;
+  }
+
+  if (dInfo != nullptr) {
+    ++numInfos;
+    if (dInfo->dims != dims) return;
+    sizes[3] = dInfo->sizes;
+    strides[3] = dInfo->strides;
+  }
+
+  // Bail out if sizes do not match: we are using "deprecated pointwise
+  // behavior" among tensors of different shapes but same number of elements.
+  for (int i = 1; i < numInfos; ++i) {
+    for (int j = 0; j < dims; ++j) {
+      if (sizes[i][j] != sizes[0][j]) return;
+    }
+  }
+
+  for (int i = 0; i < dims - 1; ++i) {
+    // No need to consider dimensions of size 1.
+    if (sizes[0][i] == 1) continue;
+
+    for (int j = i + 1; j < dims; ++j) {
+      if (sizes[0][j] == 1) continue;
+
+      // Compare the relative sizes of strides between dim #i and dim #j.
+      bool hasIncreasingStrides = false;
+      bool hasDecreasingStrides = false;
+
+      for (int k = 0; k < numInfos; k++) {
+        IndexType stride_i = strides[k][i];
+        IndexType stride_j = strides[k][j];
+        if (stride_i < stride_j) {
+          hasIncreasingStrides = true;
+        } else if (stride_i > stride_j) {
+          hasDecreasingStrides = true;
+        }
+      }
+
+      if (hasIncreasingStrides && !hasDecreasingStrides) {
+        for (int k = 0; k < numInfos; k++) {
+          IndexType size = sizes[k][i];
+          sizes[k][i] = sizes[k][j];
+          sizes[k][j] = size;
+
+          IndexType stride = strides[k][i];
+          strides[k][i] = strides[k][j];
+          strides[k][j] = stride;
+        }
+      }
+    }
+  }
+}
+
+// The `remaining_steps` argument is used to support Op that operates on
+// multiple elements at the same time. Generally, the strategy of ApplyOpN is to
+//  1. Initialize `remaining_steps = step`, where `step` is the template arg of
+//     CUDA_tensor_applyN helpers. The input arg `n` to `apply()` represents the
+//     number of elements in bound for this call. It will almost always equal to
+//     `step` except at boundaries.
+//  2. If `remaining_steps > 0` convert the current linearIndex to offset (if in
+//     bound), and recursively call `ApplyOpN` with `remaining_steps - 1`.
+//  3. At `remaining_steps = 0`,
+//       if `step = 1`, call `op(tensor1_val, tensor2_val, ...)`;
+//       if `step > 1`, call `op(n, tensor1_val1, tensor1_val2, ..., tensor1_valstep,
+//                                  tensor2_val1, tensor2_val2, ..., tensor2_valstep,
+//                                       ...
+//                                  tensorN_val1, tensorN_val2, ..., tensorN_valstep);`
+//
+// See NOTE [ CUDA_tensor_applyN helpers ] above for how Op may look like.
+
+template 
+struct ApplyOp1 {
+__device__ __forceinline__
+static void apply(detail::TensorInfo &a, const Op &op, int n,
+                  IndexType linearIndex, Offsets... aOffsets) {
+  // Convert `linearIndex` into an offset of `a`
+  const IndexType aOffset = sizeof...(Offsets) < n ?
+    detail::IndexToOffset::get(linearIndex, a) : 0;
+
+  ApplyOp1::apply(
+    a, op, n, linearIndex + 1, aOffsets..., aOffset
+  );
+}
+};
+
+// Specialize `step=1` case (i.e., `remaining_steps=0` and `len(Offsets)=1`).
+// We don't need to pass in how many elements need to be processed in this case.
+template 
+struct ApplyOp1 {
+__device__ __forceinline__
+static void apply(detail::TensorInfo &a, const Op &op,
+                  int n, IndexType linearIndex, Offset offset) {
+  op(a.data[offset]);
+}
+};
+
+template 
+struct ApplyOp1 {
+__device__ __forceinline__
+static void apply(detail::TensorInfo &a, const Op &op, int n,
+                 IndexType linearIndex, Offsets... offsets) {
+  op(n, a.data[offsets]...);
+}
+};
+
+template 
+#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM)
+C10_LAUNCH_BOUNDS_2(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM)
+#endif
+__global__ void kernelPointwiseApply1(detail::TensorInfo a,
+                                      IndexType totalElements, const Op op) {
+  for (IndexType linearIndex = (blockIdx.x * blockDim.x + threadIdx.x) * step;
+       linearIndex < totalElements;
+       linearIndex += gridDim.x * blockDim.x * step) {
+    ApplyOp1::apply(
+      a, op, ::min(step, static_cast(totalElements - linearIndex)), linearIndex);
+  }
+}
+
+
+template 
+struct ApplyOp2 {
+__device__ __forceinline__
+static void apply(detail::TensorInfo &a,
+                  detail::TensorInfo &b,
+                  const Op &op, int64_t n, IndexType linearIndex,
+                  Offsets... aOffsets, Offsets... bOffsets) {
+  // Convert `linearIndex` into an offset of `a`
+  const IndexType aOffset = static_cast(sizeof...(Offsets)) < n ?
+    detail::IndexToOffset::get(linearIndex, a) : 0;
+
+  // Convert `linearIndex` into an offset of `b`
+  const IndexType bOffset = static_cast(sizeof...(Offsets)) < n ?
+    detail::IndexToOffset::get(linearIndex, b) : 0;
+
+  ApplyOp2::apply(
+    a, b, op, n, linearIndex + 1, aOffsets..., aOffset, bOffsets..., bOffset
+  );
+}
+};
+
+// Specialize `step=1` case (i.e., `remaining_steps=0` and `len(Offsets)=1`).
+// We don't need to pass in how many elements need to be processed in this case.
+template 
+struct ApplyOp2 {
+__device__ __forceinline__
+static void apply(detail::TensorInfo &a,
+                  detail::TensorInfo &b,
+                  const Op &op, int /*n*/, IndexType /*linearIndex*/,
+                  Offset aOffset, Offset bOffset) {
+  op(a.data[aOffset], b.data[bOffset]);
+}
+};
+
+template 
+struct ApplyOp2 {
+__device__ __forceinline__
+static void apply(detail::TensorInfo &a,
+                  detail::TensorInfo &b,
+                  const Op &op, int n, IndexType linearIndex,
+                  Offsets... aOffsets, Offsets... bOffsets) {
+  op(n, a.data[aOffsets]..., b.data[bOffsets]...);
+}
+};
+
+template 
+#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM)
+C10_LAUNCH_BOUNDS_2(max_threads_per_block, min_blocks_per_sm)
+#endif
+__global__ void
+kernelPointwiseApply2(detail::TensorInfo a,
+                      detail::TensorInfo b,
+                      IndexType totalElements,
+                      const Op op) {
+  for (IndexType linearIndex = (blockIdx.x * blockDim.x + threadIdx.x) * step;
+       linearIndex < totalElements;
+       linearIndex += gridDim.x * blockDim.x * step) {
+    ApplyOp2::apply(
+      a, b, op, ::min(step, static_cast(totalElements - linearIndex)),
+      linearIndex);
+  }
+}
+
+} // anonymous namespace
+
+template 
+inline bool CUDA_tensor_apply2(at::TensorBase a,
+                               at::TensorBase b,
+                               const Op op,
+                               TensorArgType aType = TensorArgType::ReadWrite,
+                               TensorArgType bType = TensorArgType::ReadOnly) {
+  TORCH_CHECK(a.device().is_cuda() && b.device().is_cuda(),
+              "CUDA_tensor_apply2: Expected tensors to have CUDA DeviceType, but got "
+              "tensors with type ", a.device().type(), " and ", b.device().type());
+  int64_t totalElements = a.numel();
+
+  if (totalElements != b.numel()) {
+    return false;
+  }
+
+  if (a.dim() > MAX_TENSORINFO_DIMS ||
+      b.dim() > MAX_TENSORINFO_DIMS) {
+    return false;
+  }
+
+  if (a.numel() == 0) {
+    // Empty tensor; do nothing
+    return true;
+  }
+  const dim3 block = getApplyBlock(max_threads_per_block);
+
+  dim3 grid;
+  auto curDevice = current_device();
+  if (curDevice == -1) return false;
+  if (!getApplyGrid(totalElements, grid, curDevice, max_threads_per_block)) {
+    return false;
+  }
+
+  /*
+  Expands readable/writable tensors whose indices may be "overlapped."
+  This ensures that each element of the tensor is operated on once and only
+  once.
+  */
+  TensorBase oldA;
+  TensorBase oldB;
+
+  if (aType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(a)) {
+    // Must perform in contiguous space
+    oldA = std::exchange(a, a.contiguous());
+  }
+  if (bType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(b)) {
+    // Must perform in contiguous space
+    oldB = std::exchange(b, b.contiguous());
+  }
+
+  // It is possible that the tensor dimensions are able to be collapsed,
+  // and thus we can reduce the actual code complexity of the copy by
+  // exploiting this knowledge statically, since the div/mod is the
+  // most expensive part of the operation, more so than memory accesses.
+  // For instance, when copying a non-contiguous to a contiguous tensor
+  // (or vice versa), the contiguous tensor can be collapsed to one
+  // dimension, and the loop to translate the linear index to the array
+  // index can be similarly collapsed. That is what this unrolling is for.
+
+#define HANDLE_CASE(TYPE, A, B)                                        \
+  kernelPointwiseApply2                             \
+   <<>>(    \
+       aInfo, bInfo, static_cast(totalElements), op);            \
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+#define HANDLE_B_CASE(TYPE, A, B) {         \
+  switch (B) {                              \
+    case 1:                                 \
+      HANDLE_CASE(TYPE, A, 1);              \
+      break;                                \
+    case 2:                                 \
+      HANDLE_CASE(TYPE, A, 2);              \
+      break;                                \
+    default:                                \
+      HANDLE_CASE(TYPE, A, -1);             \
+      break;                                \
+  }                                         \
+}
+
+#define HANDLE_A_CASE(TYPE, A, B) {         \
+  switch (A) {                              \
+    case 1:                                 \
+      HANDLE_B_CASE(TYPE, 1, B);            \
+      break;                                \
+    case 2:                                 \
+      HANDLE_B_CASE(TYPE, 2, B);            \
+      break;                                \
+    default:                                \
+      HANDLE_B_CASE(TYPE, -1, B);           \
+      break;                                \
+  }                                         \
+}
+
+  if (detail::canUse32BitIndexMath(a) &&
+      detail::canUse32BitIndexMath(b)) {
+    detail::TensorInfo aInfo =
+      detail::getTensorInfo(a);
+
+    detail::TensorInfo bInfo =
+      detail::getTensorInfo(b);
+    rearrangeDims(&aInfo, &bInfo);
+    aInfo.collapseDims();
+    bInfo.collapseDims();
+
+    HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims);
+  } else {
+    detail::TensorInfo aInfo =
+      detail::getTensorInfo(a);
+
+    detail::TensorInfo bInfo =
+      detail::getTensorInfo(b);
+    rearrangeDims(&aInfo, &bInfo);
+    aInfo.collapseDims();
+    bInfo.collapseDims();
+
+    /*
+    Only instantiates the all 1D special case and the fallback all nD case for
+    large (64-bit indexed) tensors to reduce compilation time.
+    */
+    if (aInfo.dims == 1 && bInfo.dims == 1) {
+      HANDLE_CASE(uint64_t, 1, 1);
+    } else {
+      HANDLE_CASE(uint64_t, -1, -1);
+    }
+  }
+#undef HANDLE_CASE
+#undef HANDLE_B_CASE
+#undef HANDLE_A_CASE
+
+  if (oldA.defined()) {
+    at::native::copy_ignoring_overlaps(oldA, a);
+  }
+
+  if (oldB.defined()) {
+    at::native::copy_ignoring_overlaps(oldB, b);
+  }
+
+  return true;
+}
+
+/* Provides default step = 1 to CUDA_tensor_apply2. */
+template 
+inline bool CUDA_tensor_apply2(const at::TensorBase &a,
+                               const at::TensorBase &b,
+                               const Op op,
+                               TensorArgType aType = TensorArgType::ReadWrite,
+                               TensorArgType bType = TensorArgType::ReadOnly) {
+  return CUDA_tensor_apply2(a, b, op, aType, bType);
+}
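+
+/*
+  Usage sketch (illustrative only; tensor names and the lambda are hypothetical,
+  and assume the two scalar types are supplied as the leading template arguments):
+
+    at::Tensor a = at::zeros({4, 4}, at::kCUDA);
+    at::Tensor b = at::ones({4, 4}, at::kCUDA);
+    at::cuda::CUDA_tensor_apply2<float, float>(
+        a, b,
+        [] __device__ (float& x, const float& y) { x = 2.f * y; });
+
+  The call returns false (and does nothing) when the element counts differ or the
+  dimensionality exceeds MAX_TENSORINFO_DIMS, per the checks above.
+*/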
+
+} // namespace at::cuda
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDABlas.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDABlas.h
new file mode 100644
index 0000000000000000000000000000000000000000..395a86902a322977291b7ab6370062dd47ac02d9
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDABlas.h
@@ -0,0 +1,375 @@
+#pragma once
+/*
+  Provides a subset of CUDA BLAS functions as templates:
+
+    gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c,
+  ldc)
+
+    gemv(transa, m, n, alpha, a, lda, x, incx, beta, y, incy)
+
+    dot(n, x, incx, y, incy, result)
+
+  where Dtype is double, float, at::Half or at::BFloat16 (ROCm, NOT for dot).
+  The functions are available in at::cuda::blas namespace.
+ */
+
+#include 
+#include 
+
+namespace at::cuda::blas {
+
+// RAII guard that sets the CuBLAS pointer mode and restores it to
+// its previous value when the guard is destroyed
+class PointerModeGuard {
+public:
+  PointerModeGuard(cublasHandle_t handle, cublasPointerMode_t mode) :
+      handle(handle) {
+    TORCH_CUDABLAS_CHECK(cublasGetPointerMode(handle, &previous_mode));
+    TORCH_CUDABLAS_CHECK(cublasSetPointerMode(handle, mode));
+  }
+
+  ~PointerModeGuard() {
+    cublasSetPointerMode(handle, previous_mode);
+  }
+
+private:
+  cublasHandle_t handle;
+  cublasPointerMode_t previous_mode;
+};
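+
+// Usage sketch (illustrative only): temporarily switch cuBLAS to device pointer
+// mode while issuing calls whose alpha/beta scalars live in GPU memory.
+//
+//   {
+//     at::cuda::blas::PointerModeGuard guard(handle, CUBLAS_POINTER_MODE_DEVICE);
+//     // ... cuBLAS calls that read alpha/beta from device pointers ...
+//   }  // previous pointer mode is restored when the guard goes out of scope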
+
+/* LEVEL 3 BLAS FUNCTIONS */
+
+#define CUDABLAS_GEMM_ARGTYPES(Dtype)                                                       \
+  char transa, char transb, int64_t m, int64_t n, int64_t k, at::opmath_type alpha,  \
+      const Dtype *a, int64_t lda, const Dtype *b, int64_t ldb, at::opmath_type beta,\
+      Dtype *c, int64_t ldc
+
+#define CUDABLAS_GEMM_ARGS(Dtype) transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc
+
+template <typename Dtype>
+inline void gemm(CUDABLAS_GEMM_ARGTYPES(Dtype)) {
+  AT_ERROR("at::cuda::blas::gemm: not implemented for ", typeid(Dtype).name());
+}
+
+template <>
+void gemm(CUDABLAS_GEMM_ARGTYPES(double));
+template <>
+void gemm(CUDABLAS_GEMM_ARGTYPES(float));
+template <>
+void gemm>(CUDABLAS_GEMM_ARGTYPES(c10::complex));
+template <>
+void gemm>(CUDABLAS_GEMM_ARGTYPES(c10::complex));
+template <>
+void gemm(CUDABLAS_GEMM_ARGTYPES(at::Half));
+template <>
+void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16));
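+
+// Illustrative call sketch (not part of this header; pointer and size names are
+// hypothetical). Single-precision C = alpha * A * B + beta * C in column-major
+// layout with no transposition:
+//
+//   at::cuda::blas::gemm<float>(
+//       'n', 'n', m, n, k,
+//       1.0f, d_A, /*lda=*/m,
+//             d_B, /*ldb=*/k,
+//       0.0f, d_C, /*ldc=*/m);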
+
+template <typename Dtype>
+inline void gemm_internal(CUDABLAS_GEMM_ARGTYPES(Dtype)) {
+  AT_ERROR("at::cuda::blas::gemm_internal: not implemented for ", typeid(Dtype).name());
+}
+
+template <>
+void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double));
+template <>
+void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float));
+template <>
+void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex));
+template <>
+void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex));
+template <>
+void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half));
+template <>
+void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16));
+
+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
+enum GEMMAndBiasActivationEpilogue {
+  None,
+  RELU,
+  GELU,
+};
+
+// NOTE: GELU activation is not supported prior to CUDA 11.4 and will
+// do nothing if passed in that case.
+template <typename Dtype>
+void gemm_and_bias(
+    bool transpose_mat1,
+    bool transpose_mat2,
+    int64_t m,
+    int64_t n,
+    int64_t k,
+    at::opmath_type alpha_val,
+    const Dtype* mat1_ptr,
+    int64_t mat1_ld,
+    const Dtype* mat2_ptr,
+    int64_t mat2_ld,
+    const Dtype* bias,
+    Dtype* result_ptr,
+    int64_t result_ld,
+    GEMMAndBiasActivationEpilogue activation = GEMMAndBiasActivationEpilogue::None);
+
+void int8_gemm(
+    bool transpose_mat1,
+    bool transpose_mat2,
+    int64_t m,
+    int64_t n,
+    int64_t k,
+    const int8_t* mat1_ptr,
+    int64_t mat1_ld,
+    const int8_t* mat2_ptr,
+    int64_t mat2_ld,
+    int32_t* result_ptr,
+    int64_t result_ld);
+
+void scaled_gemm(
+    char transa,
+    char transb,
+    int64_t m,
+    int64_t n,
+    int64_t k,
+    const void* mat1_ptr,
+    const void* mat1_scale_ptr,
+    int64_t mat1_ld,
+    ScalarType mat1_dtype,
+    const void* mat2_ptr,
+    const void* mat2_scale_ptr,
+    int64_t mat2_ld,
+    ScalarType mat2_dtype,
+    const void* bias_ptr,
+    ScalarType bias_dtype,
+    void* result_ptr,
+    const void* result_scale_ptr,
+    int64_t result_ld,
+    ScalarType result_dtype,
+    void* amax_ptr,
+    bool use_fast_accum);
+#endif
+
+#define CUDABLAS_BGEMM_ARGTYPES(Dtype)                                                        \
+  char transa, char transb, int64_t m, int64_t n, int64_t k, at::opmath_type alpha,    \
+      const Dtype *a, int64_t lda, int64_t stridea,                                           \
+      const Dtype *b, int64_t ldb, int64_t strideb,                                           \
+      at::opmath_type beta, Dtype *c, int64_t ldc, int64_t stridec, int64_t num_batches
+
+#define CUDABLAS_BGEMM_ARGS(Dtype) \
+  transa, transb, m, n, k, alpha, a, lda, stridea, b, ldb, strideb, beta, c, ldc, stridec, num_batches
+
+template <typename Dtype>
+inline void bgemm(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
+  AT_ERROR("at::cuda::blas::bgemm: not implemented for ", typeid(Dtype).name());
+}
+
+template <>
+void bgemm(CUDABLAS_BGEMM_ARGTYPES(double));
+template <>
+void bgemm(CUDABLAS_BGEMM_ARGTYPES(float));
+template <>
+void bgemm>(CUDABLAS_BGEMM_ARGTYPES(c10::complex));
+template <>
+void bgemm>(CUDABLAS_BGEMM_ARGTYPES(c10::complex));
+template <>
+void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::Half));
+template <>
+void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16));
+
+template <typename Dtype>
+inline void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
+  AT_ERROR("at::cuda::blas::bgemm_internal: not implemented for ", typeid(Dtype).name());
+}
+
+template <>
+void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(double));
+template <>
+void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(float));
+template <>
+void bgemm_internal>(CUDABLAS_BGEMM_ARGTYPES(c10::complex));
+template <>
+void bgemm_internal>(CUDABLAS_BGEMM_ARGTYPES(c10::complex));
+template <>
+void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half));
+template <>
+void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16));
+
+#if defined(USE_ROCM) && ROCM_VERSION <= 50500
+// ROCm 5.6 hipblas matches the const Dtype *A API, but prior hipblas does not.
+#define CUDABLAS_TRSM_ARGTYPES(Dtype)                                  \
+  hipblasHandle_t handle, hipblasSideMode_t side, hipblasFillMode_t uplo, \
+      hipblasOperation_t trans, hipblasDiagType_t diag, int m, int n,    \
+      const Dtype *alpha,       Dtype *A, int lda, Dtype *B, int ldb
+#else
+#define CUDABLAS_TRSM_ARGTYPES(Dtype)                                  \
+  cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, \
+      cublasOperation_t trans, cublasDiagType_t diag, int m, int n,    \
+      const Dtype *alpha, const Dtype *A, int lda, Dtype *B, int ldb
+#endif
+
+template <typename Dtype>
+inline void trsm(CUDABLAS_TRSM_ARGTYPES(Dtype)) {
+  TORCH_INTERNAL_ASSERT(false, "at::cuda::blas::trsm: not implemented for ", typeid(Dtype).name());
+}
+
+template <>
+TORCH_CUDA_CU_API void trsm(CUDABLAS_TRSM_ARGTYPES(float));
+template <>
+TORCH_CUDA_CU_API void trsm(CUDABLAS_TRSM_ARGTYPES(double));
+template <>
+TORCH_CUDA_CU_API void trsm>(CUDABLAS_TRSM_ARGTYPES(c10::complex));
+template <>
+TORCH_CUDA_CU_API void trsm>(CUDABLAS_TRSM_ARGTYPES(c10::complex));
+
+#define CUDABLAS_TRSM_BATCHED_ARGTYPES(Dtype)                          \
+  cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, \
+      cublasOperation_t trans, cublasDiagType_t diag, int m, int n,    \
+      const Dtype *alpha, Dtype *A[], int lda, Dtype *B[], int ldb,    \
+      int batchCount
+
+template <typename Dtype>
+inline void trsmBatched(CUDABLAS_TRSM_BATCHED_ARGTYPES(Dtype)) {
+  TORCH_INTERNAL_ASSERT(
+      false,
+      "at::cuda::blas::trsmBatched: not implemented for ",
+      typeid(Dtype).name());
+}
+
+template <>
+TORCH_CUDA_CU_API void trsmBatched(CUDABLAS_TRSM_BATCHED_ARGTYPES(float));
+template <>
+TORCH_CUDA_CU_API void trsmBatched(CUDABLAS_TRSM_BATCHED_ARGTYPES(double));
+template <>
+TORCH_CUDA_CU_API void trsmBatched>(CUDABLAS_TRSM_BATCHED_ARGTYPES(c10::complex));
+template <>
+TORCH_CUDA_CU_API void trsmBatched>(CUDABLAS_TRSM_BATCHED_ARGTYPES(c10::complex));
+
+/* LEVEL 2 BLAS FUNCTIONS */
+
+#define CUDABLAS_GEMV_ARGTYPES(Dtype)                                         \
+  char trans, int64_t m, int64_t n, Dtype alpha, const Dtype *a, int64_t lda, \
+      const Dtype *x, int64_t incx, Dtype beta, Dtype *y, int64_t incy
+
+template <typename Dtype>
+inline void gemv(CUDABLAS_GEMV_ARGTYPES(Dtype)) {
+  AT_ERROR("at::cuda::blas::gemv: not implemented for ", typeid(Dtype).name());
+}
+
+template <>
+void gemv(CUDABLAS_GEMV_ARGTYPES(double));
+template <>
+void gemv(CUDABLAS_GEMV_ARGTYPES(float));
+template <>
+void gemv>(CUDABLAS_GEMV_ARGTYPES(c10::complex));
+template <>
+void gemv>(CUDABLAS_GEMV_ARGTYPES(c10::complex));
+template <>
+void gemv(CUDABLAS_GEMV_ARGTYPES(at::Half));
+template <>
+void gemv(CUDABLAS_GEMV_ARGTYPES(at::BFloat16));
+
+/* LEVEL 1 BLAS FUNCTIONS */
+
+#define CUDABLAS_DOT_ARGTYPES(Dtype)                                      \
+  cublasHandle_t handle, int n, const Dtype *x, int incx, const Dtype *y, \
+      int incy, Dtype *result
+
+template <typename Dtype>
+inline void dot(CUDABLAS_DOT_ARGTYPES(Dtype)) {
+  AT_ERROR("at::cuda::blas::dot: not implemented for ", typeid(Dtype).name());
+}
+
+template <>
+void dot(CUDABLAS_DOT_ARGTYPES(double));
+template <>
+void dot(CUDABLAS_DOT_ARGTYPES(float));
+template <>
+void dot(CUDABLAS_DOT_ARGTYPES(at::Half));
+template <>
+void dot(CUDABLAS_DOT_ARGTYPES(at::BFloat16));
+template <>
+void dot>(CUDABLAS_DOT_ARGTYPES(c10::complex));
+template <>
+void dot>(CUDABLAS_DOT_ARGTYPES(c10::complex));
+
+template <typename Dtype>
+inline void vdot(CUDABLAS_DOT_ARGTYPES(Dtype)) {
+  AT_ERROR("at::cuda::blas::vdot: not implemented for ", typeid(Dtype).name());
+}
+
+template <>
+void vdot>(CUDABLAS_DOT_ARGTYPES(c10::complex));
+template <>
+void vdot>(CUDABLAS_DOT_ARGTYPES(c10::complex));
+
+#define CUDABLAS_GETRS_ARGTYPES(Dtype)  \
+  cublasHandle_t handle, cublasOperation_t trans, \
+  int n, int nrhs, Dtype** dA_array, int lda, int* ipiv_array, \
+  Dtype** dB_array, int ldb, int* info_array, int batchsize
+
+template <typename Dtype>
+void getrsBatched(CUDABLAS_GETRS_ARGTYPES(Dtype)) {
+  TORCH_INTERNAL_ASSERT(false, "at::cuda::blas::getrsBatched: not implemented for ",
+    typeid(Dtype).name());
+}
+template<>
+TORCH_CUDA_CU_API void getrsBatched(CUDABLAS_GETRS_ARGTYPES(float));
+template<>
+TORCH_CUDA_CU_API void getrsBatched(CUDABLAS_GETRS_ARGTYPES(double));
+template<>
+TORCH_CUDA_CU_API void getrsBatched>(CUDABLAS_GETRS_ARGTYPES(c10::complex));
+template<>
+TORCH_CUDA_CU_API void getrsBatched>(CUDABLAS_GETRS_ARGTYPES(c10::complex));
+
+#define CUDABLAS_GEQRF_BATCHED_ARGTYPES(Dtype)                   \
+  cublasHandle_t handle, int m, int n, Dtype **A_array, int lda, \
+      Dtype **tau_array, int *info, int batchsize
+
+template <typename Dtype>
+void geqrfBatched(CUDABLAS_GEQRF_BATCHED_ARGTYPES(Dtype)) {
+  TORCH_INTERNAL_ASSERT(
+      false,
+      "at::cuda::blas::geqrfBatched: not implemented for ",
+      typeid(Dtype).name());
+}
+template <>
+TORCH_CUDA_CU_API void geqrfBatched(CUDABLAS_GEQRF_BATCHED_ARGTYPES(float));
+template <>
+TORCH_CUDA_CU_API void geqrfBatched(CUDABLAS_GEQRF_BATCHED_ARGTYPES(double));
+template <>
+TORCH_CUDA_CU_API void geqrfBatched>(
+    CUDABLAS_GEQRF_BATCHED_ARGTYPES(c10::complex));
+template <>
+TORCH_CUDA_CU_API void geqrfBatched>(
+    CUDABLAS_GEQRF_BATCHED_ARGTYPES(c10::complex));
+
+#define CUDABLAS_GETRF_ARGTYPES(Dtype)  \
+  int n, Dtype** dA_array, int ldda, int* ipiv_array, int* info_array, int batchsize
+
+template <typename Dtype>
+void getrfBatched(CUDABLAS_GETRF_ARGTYPES(Dtype)) {
+  TORCH_CHECK(false, "at::cuda::blas::getrfBatched: not implemented for ", typeid(Dtype).name());
+}
+template<>
+TORCH_CUDA_CU_API void getrfBatched(CUDABLAS_GETRF_ARGTYPES(float));
+template<>
+TORCH_CUDA_CU_API void getrfBatched(CUDABLAS_GETRF_ARGTYPES(double));
+template<>
+TORCH_CUDA_CU_API void getrfBatched>(CUDABLAS_GETRF_ARGTYPES(c10::complex));
+template<>
+TORCH_CUDA_CU_API void getrfBatched>(CUDABLAS_GETRF_ARGTYPES(c10::complex));
+
+#define CUDABLAS_GELS_BATCHED_ARGTYPES(Dtype)  \
+  cublasHandle_t handle, cublasOperation_t trans, int m, int n, int nrhs, Dtype** dA_array, int ldda, Dtype** dC_array, int lddc, int* info, int *devInfoArray, int batchSize
+
+template <typename Dtype>
+void gelsBatched(CUDABLAS_GELS_BATCHED_ARGTYPES(Dtype)) {
+  TORCH_INTERNAL_ASSERT(false, "at::cuda::blas::gelsBatched: not implemented for ", typeid(Dtype).name());
+}
+
+template<>
+TORCH_CUDA_CU_API void gelsBatched(CUDABLAS_GELS_BATCHED_ARGTYPES(double));
+template<>
+TORCH_CUDA_CU_API void gelsBatched(CUDABLAS_GELS_BATCHED_ARGTYPES(float));
+template<>
+TORCH_CUDA_CU_API void gelsBatched>(CUDABLAS_GELS_BATCHED_ARGTYPES(c10::complex));
+template<>
+TORCH_CUDA_CU_API void gelsBatched>(CUDABLAS_GELS_BATCHED_ARGTYPES(c10::complex));
+
+} // namespace at::cuda::blas
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDAContext.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDAContext.h
new file mode 100644
index 0000000000000000000000000000000000000000..b257e3f16b4adb5efde62dff92ed6f8fb9bc1a64
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDAContext.h
@@ -0,0 +1,9 @@
+#pragma once
+
+#include 
+
+// Preserved for BC, as many files depend on these includes
+#include 
+#include 
+#include 
+#include 
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDAContextLight.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDAContextLight.h
new file mode 100644
index 0000000000000000000000000000000000000000..efaf986bc75d611cf6cf637ca7eeebc156de9a53
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDAContextLight.h
@@ -0,0 +1,95 @@
+#pragma once
+// Light-weight version of CUDAContext.h with fewer transitive includes
+
+#include 
+
+#include 
+#include 
+#include 
+
+// cublasLt was introduced in CUDA 10.1, but we enable it only for CUDA 11.1+,
+// which also added bf16 support
+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
+#include 
+#endif
+
+#ifdef CUDART_VERSION
+#include 
+#endif
+
+#if defined(USE_ROCM) && ROCM_VERSION >= 50300
+#include 
+#endif
+
+#include 
+#include 
+
+namespace c10 {
+struct Allocator;
+}
+
+namespace at::cuda {
+
+/*
+A common CUDA interface for ATen.
+
+This interface is distinct from CUDAHooks, which defines an interface that links
+to both CPU-only and CUDA builds. That interface is intended for runtime
+dispatch and should be used from files that are included in both CPU-only and
+CUDA builds.
+
+CUDAContext, on the other hand, should be preferred by files only included in
+CUDA builds. It is intended to expose CUDA functionality in a consistent
+manner.
+
+This means there is some overlap between the CUDAContext and CUDAHooks, but
+the choice of which to use is simple: use CUDAContext when in a CUDA-only file,
+use CUDAHooks otherwise.
+
+Note that CUDAContext simply defines an interface with no associated class.
+It is expected that the modules whose functions compose this interface will
+manage their own state. There is only a single CUDA context/state.
+*/
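+
+/*
+  Usage sketch (illustrative only), using the declarations below:
+
+    if (at::cuda::is_available()) {
+      cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
+      int ws = at::cuda::warp_size();
+      cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
+      // ... issue cuBLAS calls on the current stream using `handle` ...
+    }
+*/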
+
+/**
+ * DEPRECATED: use device_count() instead
+ */
+inline int64_t getNumGPUs() {
+    return c10::cuda::device_count();
+}
+
+/**
+ * CUDA is available if we compiled with CUDA, and there are one or more
+ * devices.  If we compiled with CUDA but there is a driver problem, etc.,
+ * this function will report that CUDA is not available (rather than raise an error).
+ */
+inline bool is_available() {
+    return c10::cuda::device_count() > 0;
+}
+
+TORCH_CUDA_CPP_API cudaDeviceProp* getCurrentDeviceProperties();
+
+TORCH_CUDA_CPP_API int warp_size();
+
+TORCH_CUDA_CPP_API cudaDeviceProp* getDeviceProperties(c10::DeviceIndex device);
+
+TORCH_CUDA_CPP_API bool canDeviceAccessPeer(
+    c10::DeviceIndex device,
+    c10::DeviceIndex peer_device);
+
+TORCH_CUDA_CPP_API c10::Allocator* getCUDADeviceAllocator();
+
+/* Handles */
+TORCH_CUDA_CPP_API cusparseHandle_t getCurrentCUDASparseHandle();
+TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle();
+#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700)
+TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle();
+#endif
+
+TORCH_CUDA_CPP_API void clearCublasWorkspaces();
+
+#if defined(CUDART_VERSION) || defined(USE_ROCM) && ROCM_VERSION >= 50300
+TORCH_CUDA_CPP_API cusolverDnHandle_t getCurrentCUDASolverDnHandle();
+#endif
+
+} // namespace at::cuda
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDADataType.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDADataType.h
new file mode 100644
index 0000000000000000000000000000000000000000..d14a908a54831124b8127fc5df10870fd6e31b3f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDADataType.h
@@ -0,0 +1,115 @@
+#pragma once
+
+#include 
+
+#include 
+#include 
+
+namespace at::cuda {
+
+template <typename scalar_t>
+cudaDataType getCudaDataType() {
+  TORCH_INTERNAL_ASSERT(false, "Cannot convert type ", typeid(scalar_t).name(), " to cudaDataType.")
+}
+
+template<> inline cudaDataType getCudaDataType<at::Half>() {
+  return CUDA_R_16F;
+}
+template<> inline cudaDataType getCudaDataType<float>() {
+  return CUDA_R_32F;
+}
+template<> inline cudaDataType getCudaDataType<double>() {
+  return CUDA_R_64F;
+}
+template<> inline cudaDataType getCudaDataType<c10::complex<c10::Half>>() {
+  return CUDA_C_16F;
+}
+template<> inline cudaDataType getCudaDataType<c10::complex<float>>() {
+  return CUDA_C_32F;
+}
+template<> inline cudaDataType getCudaDataType<c10::complex<double>>() {
+  return CUDA_C_64F;
+}
+
+// HIP doesn't define integral types
+#ifndef USE_ROCM
+template<> inline cudaDataType getCudaDataType<uint8_t>() {
+  return CUDA_R_8U;
+}
+template<> inline cudaDataType getCudaDataType<int8_t>() {
+  return CUDA_R_8I;
+}
+template<> inline cudaDataType getCudaDataType<int>() {
+  return CUDA_R_32I;
+}
+#endif
+
+#if !defined(USE_ROCM)
+template<> inline cudaDataType getCudaDataType<int16_t>() {
+  return CUDA_R_16I;
+}
+template<> inline cudaDataType getCudaDataType<int64_t>() {
+  return CUDA_R_64I;
+}
+template<> inline cudaDataType getCudaDataType<at::BFloat16>() {
+  return CUDA_R_16BF;
+}
+#endif
+
+inline cudaDataType ScalarTypeToCudaDataType(const c10::ScalarType& scalar_type) {
+  switch (scalar_type) {
+// HIP doesn't define integral types
+#ifndef USE_ROCM
+    case c10::ScalarType::Byte:
+      return CUDA_R_8U;
+    case c10::ScalarType::Char:
+      return CUDA_R_8I;
+    case c10::ScalarType::Int:
+      return CUDA_R_32I;
+#endif
+    case c10::ScalarType::Half:
+      return CUDA_R_16F;
+    case c10::ScalarType::Float:
+      return CUDA_R_32F;
+    case c10::ScalarType::Double:
+      return CUDA_R_64F;
+    case c10::ScalarType::ComplexHalf:
+      return CUDA_C_16F;
+    case c10::ScalarType::ComplexFloat:
+      return CUDA_C_32F;
+    case c10::ScalarType::ComplexDouble:
+      return CUDA_C_64F;
+#if !defined(USE_ROCM)
+    case c10::ScalarType::Short:
+      return CUDA_R_16I;
+    case c10::ScalarType::Long:
+      return CUDA_R_64I;
+    case c10::ScalarType::BFloat16:
+      return CUDA_R_16BF;
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 11080
+    case c10::ScalarType::Float8_e4m3fn:
+      return CUDA_R_8F_E4M3;
+    case c10::ScalarType::Float8_e5m2:
+      return CUDA_R_8F_E5M2;
+#endif
+#else // USE_ROCM
+    case c10::ScalarType::BFloat16:
+      return CUDA_R_16BF;
+#if defined(HIP_NEW_TYPE_ENUMS)
+    case c10::ScalarType::Float8_e4m3fnuz:
+      return HIP_R_8F_E4M3_FNUZ;
+    case c10::ScalarType::Float8_e5m2fnuz:
+      return HIP_R_8F_E5M2_FNUZ;
+#else
+    case c10::ScalarType::Float8_e4m3fnuz:
+      return static_cast<cudaDataType>(1000);
+    case c10::ScalarType::Float8_e5m2fnuz:
+      return static_cast<cudaDataType>(1001);
+#endif
+#endif
+    default:
+      TORCH_INTERNAL_ASSERT(false, "Cannot convert ScalarType ", scalar_type, " to cudaDataType.")
+  }
+}
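+
+// For example (illustrative), ScalarTypeToCudaDataType(c10::ScalarType::Float)
+// yields CUDA_R_32F, which can then be handed to cuBLAS/cuSPARSE generic APIs
+// that describe element types via cudaDataType.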
+
+} // namespace at::cuda
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDADevice.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDADevice.h
new file mode 100644
index 0000000000000000000000000000000000000000..5353a06ca6b11f607151a0b7c64762234b617c79
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDADevice.h
@@ -0,0 +1,23 @@
+#pragma once
+
+#include 
+
+#include 
+#include 
+
+namespace at::cuda {
+
+inline Device getDeviceFromPtr(void* ptr) {
+  cudaPointerAttributes attr{};
+
+  AT_CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr));
+
+#if !defined(USE_ROCM)
+  TORCH_CHECK(attr.type != cudaMemoryTypeUnregistered,
+    "The specified pointer resides on host memory and is not registered with any CUDA device.");
+#endif
+
+  return {c10::DeviceType::CUDA, static_cast<c10::DeviceIndex>(attr.device)};
+}
+
+} // namespace at::cuda
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDAEvent.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDAEvent.h
new file mode 100644
index 0000000000000000000000000000000000000000..9a9a15c4a857b71137afc8735ad226b6d91e3e2a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDAEvent.h
@@ -0,0 +1,208 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include 
+#include 
+
+namespace at::cuda {
+
+/*
+* CUDAEvents are movable, but not copyable, wrappers around CUDA's events.
+*
+* A CUDAEvent is constructed lazily when first recorded, unless it is
+* reconstructed from a cudaIpcEventHandle_t. The event has a device, which is
+* acquired from the first recording stream. However, if reconstructed from a
+* handle, the device must be explicitly specified; and if ipc_handle() is
+* called before the event is ever recorded, the current device is used.
+* Later streams that record the event must match this device.
+*/
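+//
+// Usage sketch (illustrative only; requires timing-capable events, hence the
+// explicit cudaEventDefault flags):
+//
+//   at::cuda::CUDAEvent t0(cudaEventDefault), t1(cudaEventDefault);
+//   t0.record(at::cuda::getCurrentCUDAStream());
+//   // ... launch work on the current stream ...
+//   t1.record(at::cuda::getCurrentCUDAStream());
+//   t1.synchronize();
+//   float ms = t0.elapsed_time(t1);
+//
+// block(stream) instead makes `stream` wait on the event without blocking the host.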
+struct TORCH_CUDA_CPP_API CUDAEvent {
+  // Constructors
+  // Default value for `flags` is specified below - it's cudaEventDisableTiming
+  CUDAEvent() noexcept = default;
+  CUDAEvent(unsigned int flags) noexcept : flags_{flags} {}
+
+  CUDAEvent(
+      DeviceIndex device_index, const cudaIpcEventHandle_t* handle) {
+      device_index_ = device_index;
+      CUDAGuard guard(device_index_);
+
+      AT_CUDA_CHECK(cudaIpcOpenEventHandle(&event_, *handle));
+      is_created_ = true;
+  }
+
+  // Note: event destruction is done on the creating device to avoid creating
+  // a CUDA context on other devices.
+  ~CUDAEvent() {
+    try {
+      if (is_created_) {
+        CUDAGuard guard(device_index_);
+        const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
+        if (C10_UNLIKELY(interp)) {
+          (*interp)->trace_gpu_event_deletion(reinterpret_cast(event_));
+        }
+        AT_CUDA_CHECK(cudaEventDestroy(event_));
+      }
+    } catch (...) { /* No throw */ }
+  }
+
+  CUDAEvent(const CUDAEvent&) = delete;
+  CUDAEvent& operator=(const CUDAEvent&) = delete;
+
+  CUDAEvent(CUDAEvent&& other) noexcept { moveHelper(std::move(other)); }
+  CUDAEvent& operator=(CUDAEvent&& other) noexcept {
+    if (this != &other) {
+      moveHelper(std::move(other));
+    }
+    return *this;
+  }
+
+  operator cudaEvent_t() const { return event(); }
+
+  // Less than operator (to allow use in sets)
+  friend bool operator<(const CUDAEvent& left, const CUDAEvent& right) {
+    return left.event_ < right.event_;
+  }
+
+  optional<at::Device> device() const {
+    if (is_created_) {
+      return at::Device(at::kCUDA, device_index_);
+    } else {
+      return {};
+    }
+  }
+
+  bool isCreated() const { return is_created_; }
+  DeviceIndex device_index() const {return device_index_;}
+  cudaEvent_t event() const { return event_; }
+
+  // Note: cudaEventQuery can be safely called from any device
+  bool query() const {
+    if (!is_created_) {
+      return true;
+    }
+
+    cudaError_t err = cudaEventQuery(event_);
+    if (err == cudaSuccess) {
+      return true;
+    } else if (err != cudaErrorNotReady) {
+      C10_CUDA_CHECK(err);
+    } else {
+      // ignore and clear the error if not ready
+      (void)cudaGetLastError();
+    }
+
+    return false;
+  }
+
+  void record() { record(getCurrentCUDAStream()); }
+
+  void recordOnce(const CUDAStream& stream) {
+    if (!was_recorded_) record(stream);
+  }
+
+  // Note: cudaEventRecord must be called on the same device as the event.
+  void record(const CUDAStream& stream) {
+    if (!is_created_) {
+      createEvent(stream.device_index());
+    }
+
+    TORCH_CHECK(device_index_ == stream.device_index(), "Event device ", device_index_,
+      " does not match recording stream's device ", stream.device_index(), ".");
+    CUDAGuard guard(device_index_);
+    AT_CUDA_CHECK(cudaEventRecord(event_, stream));
+    const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
+    if (C10_UNLIKELY(interp)) {
+      (*interp)->trace_gpu_event_record(
+          reinterpret_cast(event_),
+          reinterpret_cast(stream.stream())
+      );
+    }
+    was_recorded_ = true;
+  }
+
+  // Note: cudaStreamWaitEvent must be called on the same device as the stream.
+  // The event has no actual GPU resources associated with it.
+  void block(const CUDAStream& stream) {
+    if (is_created_) {
+      CUDAGuard guard(stream.device_index());
+      AT_CUDA_CHECK(cudaStreamWaitEvent(stream, event_, 0));
+      const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
+      if (C10_UNLIKELY(interp)) {
+        (*interp)->trace_gpu_event_wait(
+            reinterpret_cast(event_),
+            reinterpret_cast(stream.stream())
+        );
+      }
+    }
+  }
+
+  // Note: cudaEventElapsedTime can be safely called from any device
+  float elapsed_time(const CUDAEvent& other) const {
+    TORCH_CHECK(is_created_ && other.isCreated(),
+      "Both events must be recorded before calculating elapsed time.");
+    float time_ms = 0;
+    // raise cudaErrorNotReady if either event is recorded but not yet completed
+    AT_CUDA_CHECK(cudaEventElapsedTime(&time_ms, event_, other.event_));
+    return time_ms;
+  }
+
+  // Note: cudaEventSynchronize can be safely called from any device
+  void synchronize() const {
+    if (is_created_) {
+      const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
+      if (C10_UNLIKELY(interp)) {
+          (*interp)->trace_gpu_event_synchronization(reinterpret_cast(event_));
+      }
+      AT_CUDA_CHECK(cudaEventSynchronize(event_));
+    }
+  }
+
+  // Note: cudaIpcGetEventHandle must be called on the same device as the event
+  void ipc_handle(cudaIpcEventHandle_t * handle) {
+      if (!is_created_) {
+        // this CUDAEvent object was initially constructed from flags but event_
+        // is not created yet.
+        createEvent(getCurrentCUDAStream().device_index());
+      }
+      CUDAGuard guard(device_index_);
+      AT_CUDA_CHECK(cudaIpcGetEventHandle(handle, event_));
+  }
+
+private:
+  unsigned int flags_ = cudaEventDisableTiming;
+  bool is_created_ = false;
+  bool was_recorded_ = false;
+  DeviceIndex device_index_ = -1;
+  cudaEvent_t event_{};
+
+  void createEvent(DeviceIndex device_index) {
+    device_index_ = device_index;
+    CUDAGuard guard(device_index_);
+    AT_CUDA_CHECK(cudaEventCreateWithFlags(&event_, flags_));
+    const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
+    if (C10_UNLIKELY(interp)) {
+      (*interp)->trace_gpu_event_creation(reinterpret_cast(event_));
+    }
+    is_created_ = true;
+  }
+
+  void moveHelper(CUDAEvent&& other) {
+    std::swap(flags_, other.flags_);
+    std::swap(is_created_, other.is_created_);
+    std::swap(was_recorded_, other.was_recorded_);
+    std::swap(device_index_, other.device_index_);
+    std::swap(event_, other.event_);
+  }
+};
+
+} // namespace at::cuda
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDAGeneratorImpl.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDAGeneratorImpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..f237d77f009d32080a1d9445bc42b256939a78ae
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDAGeneratorImpl.h
@@ -0,0 +1,138 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+/**
+ * Note [CUDA Graph-safe RNG states]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * Strategy:
+ * ~~~~~~~~~
+ * (It helps to look at
+ * cuda/detail/PhiloxCudaStateRaw.cuh and
+ * cuda/detail/UnpackRaw.cuh
+ * while you read this.)
+ *
+ * A CUDA graph containing multiple RNG ops behaves like a
+ * single giant kernel from the perspective of ops external
+ * to the graph.  During graph capture, logic in CUDAGeneratorImpl
+ * records the total of all offset increments that occur in the
+ * graphed region, and records the final total as the offset for
+ * the entire graph.
+ *
+ * When the graph reruns, the logic that reruns it
+ * increments this device's CUDA generator's offset
+ * by that total.
+ *
+ * Meanwhile, within the graph, at capture time, instead of
+ * populating PhiloxCudaStates with the uint64_t offset pulled
+ * directly from the global state, PhiloxCudaState uses a pointer
+ * to a one-element stream-local int64_t device tensor
+ * holding an initial offset value, and a uint64_t holding an
+ * intra-graph offset. (The intra-graph offset starts from zero
+ * when capture begins.)  In each consumer kernel,
+ * at::cuda::philox::unpack computes the offset to use for this kernel
+ * as intra-graph offset + *initial offset.
+ *
+ * When the graph reruns, the logic that reruns it first
+ * fill_s the initial offset tensor with this device's
+ * CUDA generator's current offset.
+ *
+ * The control flow above ensures graphed execution is bitwise
+ * identical to eager execution as long as RNG ops are enqueued
+ * from a single thread, even if RNG ops and graphs containing
+ * RNG ops are enqueued and run simultaneously on multiple streams.
+ *
+ * Usage:
+ * ~~~~~~
+ * PhiloxCudaState in this file, and unpack() in
+ * cuda/CUDAGraphsUtils.cuh allow non-divergent use of
+ * CUDAGeneratorImpl whether graph capture is underway or not.
+ *
+ * Each PhiloxCudaState instance should be used for one and only one
+ * consumer kernel.
+ *
+ * Example (see e.g. native/cuda/Dropout.cu):
+ *
+ * #include 
+ * #include 
+ *
+ * __global__ void kernel(..., PhiloxCudaState philox_args) {
+ *   auto seeds = at::cuda::philox::unpack(philox_args);
+ *   IndexType idx = blockIdx.x * blockDim.x + threadIdx.x;
+ *   curandStatePhilox4_32_10_t state;
+ *   curand_init(std::get<0>(seeds), // seed
+ *               idx,                // per-thread subsequence
+ *               std::get<1>(seeds), // offset in subsequence
+ *               &state);
+ *   ...
+ * }
+ *
+ * host_caller(...) {
+ *   PhiloxCudaState rng_engine_inputs;
+ *   {
+ *     // See Note [Acquire lock when using random generators]
+ *     std::lock_guard lock(gen->mutex_);
+ *
+ *     // gen could be HostState or DevState here! No divergent code needed!
+ *     rng_engine_inputs = gen->philox_cuda_state(offset_increment);
+ *   }
+ *   kernel<<<...>>>(..., rng_engine_inputs);
+ * }
+ *
+ */
+
+struct TORCH_CUDA_CPP_API CUDAGeneratorImpl : public c10::GeneratorImpl {
+  // Constructors
+  CUDAGeneratorImpl(DeviceIndex device_index = -1);
+  ~CUDAGeneratorImpl() override = default;
+
+  // CUDAGeneratorImpl methods
+  std::shared_ptr<CUDAGeneratorImpl> clone() const;
+  void set_current_seed(uint64_t seed) override;
+  void set_offset(uint64_t offset) override;
+  uint64_t get_offset() const override;
+  uint64_t current_seed() const override;
+  uint64_t seed() override;
+  void set_state(const c10::TensorImpl& new_state) override;
+  c10::intrusive_ptr<c10::TensorImpl> get_state() const override;
+  void set_philox_offset_per_thread(uint64_t offset);
+  uint64_t philox_offset_per_thread() const;
+  void capture_prologue(int64_t* seed_extragraph, int64_t* offset_extragraph);
+  uint64_t capture_epilogue();
+  PhiloxCudaState philox_cuda_state(uint64_t increment);
+
+  bool reset_rnn_state() {
+    return !no_reset_rnn_state_.test_and_set();
+  }
+
+  // Temporarily accommodates call sites that use philox_engine_inputs.
+  // Allows incremental refactor of call sites to use philox_cuda_state.
+  std::pair<uint64_t, uint64_t> philox_engine_inputs(uint64_t increment);
+
+  static c10::DeviceType device_type();
+
+private:
+  CUDAGeneratorImpl* clone_impl() const override;
+  uint64_t seed_ = default_rng_seed_val;
+  uint64_t philox_offset_per_thread_ = 0;
+  int64_t* seed_extragraph_{};
+  int64_t* offset_extragraph_{};
+  uint32_t offset_intragraph_ = 0;
+  bool graph_expects_this_gen_ = false;
+  std::atomic_flag no_reset_rnn_state_;
+};
+
+namespace cuda::detail {
+
+TORCH_CUDA_CPP_API const Generator& getDefaultCUDAGenerator(
+    DeviceIndex device_index = -1);
+TORCH_CUDA_CPP_API Generator createCUDAGenerator(DeviceIndex device_index = -1);
+
+} // namespace cuda::detail
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDAGraph.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDAGraph.h
new file mode 100644
index 0000000000000000000000000000000000000000..8b3c1a3f27393f13971622f6b432818ece002cb1
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDAGraph.h
@@ -0,0 +1,92 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+namespace at {
+
+struct CUDAGeneratorImpl;
+
+namespace cuda {
+
+// Standalone way to get a unique mempool id usable as a pool=... argument
+// to CUDAGraph::capture_begin
+TORCH_CUDA_CPP_API MempoolId_t graph_pool_handle();
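+
+// Usage sketch (illustrative only; capture-stream setup and the captured work
+// are elided):
+//
+//   auto pool = at::cuda::graph_pool_handle();
+//   at::cuda::CUDAGraph g1, g2;
+//   g1.capture_begin(pool); /* ... captured work ... */ g1.capture_end();
+//   g2.capture_begin(pool); /* ... captured work ... */ g2.capture_end();
+//   g1.replay();
+//   g2.replay();
+//
+// Both captures then draw from the same private mempool, as described for
+// mempool_id_ in the struct below.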
+
+struct TORCH_CUDA_CPP_API CUDAGraph {
+  CUDAGraph();
+  ~CUDAGraph();
+
+  static void inc_pending_event_queries();
+  static void dec_pending_event_queries();
+  static int num_pending_event_queries();
+  void capture_begin(MempoolId_t pool={0, 0}, cudaStreamCaptureMode capture_mode = cudaStreamCaptureModeGlobal);
+  void capture_end();
+  void replay();
+  void reset();
+  MempoolId_t pool();
+  void enable_debug_mode();
+  void debug_dump(const std::string& debug_path);
+
+  protected:
+#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
+  cudaGraph_t graph_ = NULL;
+  cudaGraphExec_t graph_exec_ = NULL;
+#endif
+
+  static std::atomic<int> pending_event_queries;
+
+  // internal states so reset() can do its best cleaning up
+  // Set to true in capture_end if cudaStreamEndCapture succeeded
+  // Set back to false soon after, when graph_ is consumed by cudaGraphInstantiate
+  // to create graph_exec_, then graph_ is deleted
+  bool has_graph_ = false;
+  // Set to true in capture_end if cudaGraphInstantiate succeeded
+  bool has_graph_exec_ = false;
+
+  // uuid of this instance's current capture, used to
+  // specify the pool.
+  CaptureId_t id_;
+
+  // the ID assigned by cuda during graph capture,
+  // used to identify when a stream is participating in capture
+  CaptureId_t capture_id_ = -1;
+
+  // uuid used to request a particular private mempool from CUDACachingAllocator.
+  // By default, this will be set to {id_, 0}.
+  //
+  // If capture_begin is called with "pool=other_graph.pool()", this graph's mempool_id_
+  // will be set to the other graph's mempool_id_, and therefore share a mempool with the
+  // other graph.
+  //
+  // If capture_begin is called with "pool=handle" where "handle" came from graph_pool_handle(),
+  // it will share a mempool with any other captures that used "pool=handle".
+  //
+  // Sharing a mempool across graphs saves memory, and it's safe if you
+  // know you'll replay those graphs in the same order you captured them.
+  MempoolId_t mempool_id_;
+
+  // Stream on which capture began
+  at::cuda::CUDAStream capture_stream_;
+
+  // Default generator on device where capture began
+  at::CUDAGeneratorImpl* capture_gen_;
+
+  // Device where capture occurred. Right now, for simplicity, we require all ops
+  // in a capture to run on the same device, but this is a limitation of CUDAGraph,
+  // not CUDA itself.  We can straightforwardly modify CUDAGraph to support multi-device
+  // captures if needed.
+  int capture_dev_;
+
+  // RNG state trackers
+  at::Tensor seed_extragraph_;
+  at::Tensor offset_extragraph_;
+  uint64_t wholegraph_increment_;
+};
+
+} // namespace cuda
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDAGraphsUtils.cuh b/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDAGraphsUtils.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..8de2adbb7ec9a7c8f47f23bebb855135a7452885
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDAGraphsUtils.cuh
@@ -0,0 +1,57 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+// c10/cuda/CUDAGraphsC10Utils.h has utils used by both c10 and aten.
+// This file adds utils used by aten only.
+
+namespace at::cuda {
+
+using CaptureId_t = c10::cuda::CaptureId_t;
+using CaptureStatus = c10::cuda::CaptureStatus;
+
+// Use this version where you don't want to create a CUDA context if none exists.
+inline CaptureStatus currentStreamCaptureStatus() {
+#if !defined(USE_ROCM) || ROCM_VERSION >= 50300
+  // don't create a context if we don't have to
+  if (c10::cuda::hasPrimaryContext(c10::cuda::current_device())) {
+    return c10::cuda::currentStreamCaptureStatusMayInitCtx();
+  } else {
+    return CaptureStatus::None;
+  }
+#else
+  return CaptureStatus::None;
+#endif
+}
+
+inline void assertNotCapturing(std::string attempt) {
+  auto status = currentStreamCaptureStatus();
+  TORCH_CHECK(status == CaptureStatus::None,
+              attempt,
+              " during CUDA graph capture. If you need this call to be captured, "
+              "please file an issue. "
+              "Current cudaStreamCaptureStatus: ",
+              status);
+}
+
+inline void errorIfCapturingCudnnBenchmark(std::string version_specific) {
+  auto status = currentStreamCaptureStatus();
+  TORCH_CHECK(status == CaptureStatus::None,
+              "Current cudaStreamCaptureStatus: ",
+              status,
+              "\nCapturing ",
+              version_specific,
+              "is prohibited. Possible causes of this error:\n"
+              "1. No warmup iterations occurred before capture.\n"
+              "2. The convolutions you're trying to capture use dynamic shapes, "
+              "in which case capturing them is generally prohibited.");
+}
+
+} // namespace at::cuda
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDASparse.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDASparse.h
new file mode 100644
index 0000000000000000000000000000000000000000..f2b5c0beade3776bc8c98edad6a4d05460dff1c4
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDASparse.h
@@ -0,0 +1,76 @@
+#pragma once
+
+#include 
+#if defined(USE_ROCM)
+#include 
+#define HIPSPARSE_VERSION ((hipsparseVersionMajor*100000) + (hipsparseVersionMinor*100) + hipsparseVersionPatch)
+#endif
+
+// cuSparse Generic API added in CUDA 10.1
+// Windows support added in CUDA 11.0
+#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && ((CUSPARSE_VERSION >= 10300) || (CUSPARSE_VERSION >= 11000 && defined(_WIN32)))
+#define AT_USE_CUSPARSE_GENERIC_API() 1
+#else
+#define AT_USE_CUSPARSE_GENERIC_API() 0
+#endif
+
+// cuSparse Generic API descriptor pointers were changed to const in CUDA 12.0
+#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && \
+    (CUSPARSE_VERSION < 12000)
+#define AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS() 1
+#else
+#define AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS() 0
+#endif
+
+#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && \
+    (CUSPARSE_VERSION >= 12000)
+#define AT_USE_CUSPARSE_CONST_DESCRIPTORS() 1
+#else
+#define AT_USE_CUSPARSE_CONST_DESCRIPTORS() 0
+#endif
+
+#if defined(USE_ROCM)
+// hipSparse const API added in v2.4.0
+#if HIPSPARSE_VERSION >= 200400
+#define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 1
+#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 0
+#define AT_USE_HIPSPARSE_GENERIC_API() 1
+#else
+#define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 0
+#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 1
+#define AT_USE_HIPSPARSE_GENERIC_API() 1
+#endif
+#else // USE_ROCM
+#define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 0
+#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 0
+#define AT_USE_HIPSPARSE_GENERIC_API() 0
+#endif // USE_ROCM
+
+// cuSparse Generic API spsv function was added in CUDA 11.3.0
+#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && (CUSPARSE_VERSION >= 11500)
+#define AT_USE_CUSPARSE_GENERIC_SPSV() 1
+#else
+#define AT_USE_CUSPARSE_GENERIC_SPSV() 0
+#endif
+
+// cuSparse Generic API spsm function was added in CUDA 11.3.1
+#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && (CUSPARSE_VERSION >= 11600)
+#define AT_USE_CUSPARSE_GENERIC_SPSM() 1
+#else
+#define AT_USE_CUSPARSE_GENERIC_SPSM() 0
+#endif
+
+// cuSparse Generic API sddmm function was added in CUDA 11.2.1 (cuSparse version 11400)
+#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && (CUSPARSE_VERSION >= 11400)
+#define AT_USE_CUSPARSE_GENERIC_SDDMM() 1
+#else
+#define AT_USE_CUSPARSE_GENERIC_SDDMM() 0
+#endif
+
+// BSR triangular solve functions were added in hipSPARSE 1.11.2 (ROCm 4.5.0)
+#if defined(CUDART_VERSION) ||                            \
+      (defined(USE_ROCM) && ROCM_VERSION >= 40500 )
+#define AT_USE_HIPSPARSE_TRIANGULAR_SOLVE() 1
+#else
+#define AT_USE_HIPSPARSE_TRIANGULAR_SOLVE() 0
+#endif
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDASparseBlas.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDASparseBlas.h
new file mode 100644
index 0000000000000000000000000000000000000000..9eb0488d2b3dbe7a64dbdd0463f5fbd53b9cde18
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDASparseBlas.h
@@ -0,0 +1,318 @@
+#pragma once
+
+/*
+  Provides a subset of cuSPARSE functions as templates:
+
+    csrgeam2(...)
+
+  where scalar_t is double, float, c10::complex<float> or c10::complex<double>.
+  The functions are available in at::cuda::sparse namespace.
+*/
+
+#include 
+#include 
+
+namespace at::cuda::sparse {
+
+#define CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(scalar_t)             \
+  cusparseHandle_t handle, int m, int n, const scalar_t *alpha,     \
+      const cusparseMatDescr_t descrA, int nnzA,                    \
+      const scalar_t *csrSortedValA, const int *csrSortedRowPtrA,   \
+      const int *csrSortedColIndA, const scalar_t *beta,            \
+      const cusparseMatDescr_t descrB, int nnzB,                    \
+      const scalar_t *csrSortedValB, const int *csrSortedRowPtrB,   \
+      const int *csrSortedColIndB, const cusparseMatDescr_t descrC, \
+      const scalar_t *csrSortedValC, const int *csrSortedRowPtrC,   \
+      const int *csrSortedColIndC, size_t *pBufferSizeInBytes
+
+template <typename scalar_t>
+inline void csrgeam2_bufferSizeExt(
+    CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(scalar_t)) {
+  TORCH_INTERNAL_ASSERT(
+      false,
+      "at::cuda::sparse::csrgeam2_bufferSizeExt: not implemented for ",
+      typeid(scalar_t).name());
+}
+
+template <>
+void csrgeam2_bufferSizeExt(
+    CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(float));
+template <>
+void csrgeam2_bufferSizeExt(
+    CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(double));
+template <>
+void csrgeam2_bufferSizeExt>(
+    CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(c10::complex));
+template <>
+void csrgeam2_bufferSizeExt>(
+    CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(c10::complex));
+
+#define CUSPARSE_CSRGEAM2_NNZ_ARGTYPES()                                      \
+  cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA,     \
+      int nnzA, const int *csrSortedRowPtrA, const int *csrSortedColIndA,     \
+      const cusparseMatDescr_t descrB, int nnzB, const int *csrSortedRowPtrB, \
+      const int *csrSortedColIndB, const cusparseMatDescr_t descrC,           \
+      int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, void *workspace
+
+template <typename scalar_t>
+inline void csrgeam2Nnz(CUSPARSE_CSRGEAM2_NNZ_ARGTYPES()) {
+  TORCH_CUDASPARSE_CHECK(cusparseXcsrgeam2Nnz(
+      handle,
+      m,
+      n,
+      descrA,
+      nnzA,
+      csrSortedRowPtrA,
+      csrSortedColIndA,
+      descrB,
+      nnzB,
+      csrSortedRowPtrB,
+      csrSortedColIndB,
+      descrC,
+      csrSortedRowPtrC,
+      nnzTotalDevHostPtr,
+      workspace));
+}
+
+#define CUSPARSE_CSRGEAM2_ARGTYPES(scalar_t)                                 \
+  cusparseHandle_t handle, int m, int n, const scalar_t *alpha,              \
+      const cusparseMatDescr_t descrA, int nnzA,                             \
+      const scalar_t *csrSortedValA, const int *csrSortedRowPtrA,            \
+      const int *csrSortedColIndA, const scalar_t *beta,                     \
+      const cusparseMatDescr_t descrB, int nnzB,                             \
+      const scalar_t *csrSortedValB, const int *csrSortedRowPtrB,            \
+      const int *csrSortedColIndB, const cusparseMatDescr_t descrC,          \
+      scalar_t *csrSortedValC, int *csrSortedRowPtrC, int *csrSortedColIndC, \
+      void *pBuffer
+
+template <typename scalar_t>
+inline void csrgeam2(CUSPARSE_CSRGEAM2_ARGTYPES(scalar_t)) {
+  TORCH_INTERNAL_ASSERT(
+      false,
+      "at::cuda::sparse::csrgeam2: not implemented for ",
+      typeid(scalar_t).name());
+}
+
+template <>
+void csrgeam2(CUSPARSE_CSRGEAM2_ARGTYPES(float));
+template <>
+void csrgeam2(CUSPARSE_CSRGEAM2_ARGTYPES(double));
+template <>
+void csrgeam2>(
+    CUSPARSE_CSRGEAM2_ARGTYPES(c10::complex));
+template <>
+void csrgeam2>(
+    CUSPARSE_CSRGEAM2_ARGTYPES(c10::complex));
+
+#define CUSPARSE_BSRMM_ARGTYPES(scalar_t)                                    \
+  cusparseHandle_t handle, cusparseDirection_t dirA,                         \
+      cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n, \
+      int kb, int nnzb, const scalar_t *alpha,                               \
+      const cusparseMatDescr_t descrA, const scalar_t *bsrValA,              \
+      const int *bsrRowPtrA, const int *bsrColIndA, int blockDim,            \
+      const scalar_t *B, int ldb, const scalar_t *beta, scalar_t *C, int ldc
+
+template <typename scalar_t>
+inline void bsrmm(CUSPARSE_BSRMM_ARGTYPES(scalar_t)) {
+  TORCH_INTERNAL_ASSERT(
+      false,
+      "at::cuda::sparse::bsrmm: not implemented for ",
+      typeid(scalar_t).name());
+}
+
+template <>
+void bsrmm(CUSPARSE_BSRMM_ARGTYPES(float));
+template <>
+void bsrmm(CUSPARSE_BSRMM_ARGTYPES(double));
+template <>
+void bsrmm>(CUSPARSE_BSRMM_ARGTYPES(c10::complex));
+template <>
+void bsrmm>(CUSPARSE_BSRMM_ARGTYPES(c10::complex));
+
+#define CUSPARSE_BSRMV_ARGTYPES(scalar_t)                                    \
+  cusparseHandle_t handle, cusparseDirection_t dirA,                         \
+      cusparseOperation_t transA, int mb, int nb, int nnzb,                  \
+      const scalar_t *alpha, const cusparseMatDescr_t descrA,                \
+      const scalar_t *bsrValA, const int *bsrRowPtrA, const int *bsrColIndA, \
+      int blockDim, const scalar_t *x, const scalar_t *beta, scalar_t *y
+
+template <typename scalar_t>
+inline void bsrmv(CUSPARSE_BSRMV_ARGTYPES(scalar_t)) {
+  TORCH_INTERNAL_ASSERT(
+      false,
+      "at::cuda::sparse::bsrmv: not implemented for ",
+      typeid(scalar_t).name());
+}
+
+template <>
+void bsrmv(CUSPARSE_BSRMV_ARGTYPES(float));
+template <>
+void bsrmv(CUSPARSE_BSRMV_ARGTYPES(double));
+template <>
+void bsrmv>(CUSPARSE_BSRMV_ARGTYPES(c10::complex));
+template <>
+void bsrmv>(CUSPARSE_BSRMV_ARGTYPES(c10::complex));
+
+#if AT_USE_HIPSPARSE_TRIANGULAR_SOLVE()
+
+#define CUSPARSE_BSRSV2_BUFFER_ARGTYPES(scalar_t)                 \
+  cusparseHandle_t handle, cusparseDirection_t dirA,              \
+      cusparseOperation_t transA, int mb, int nnzb,               \
+      const cusparseMatDescr_t descrA, scalar_t *bsrValA,         \
+      const int *bsrRowPtrA, const int *bsrColIndA, int blockDim, \
+      bsrsv2Info_t info, int *pBufferSizeInBytes
+
+template <typename scalar_t>
+inline void bsrsv2_bufferSize(CUSPARSE_BSRSV2_BUFFER_ARGTYPES(scalar_t)) {
+  TORCH_INTERNAL_ASSERT(
+      false,
+      "at::cuda::sparse::bsrsv2_bufferSize: not implemented for ",
+      typeid(scalar_t).name());
+}
+
+template <>
+void bsrsv2_bufferSize(CUSPARSE_BSRSV2_BUFFER_ARGTYPES(float));
+template <>
+void bsrsv2_bufferSize(CUSPARSE_BSRSV2_BUFFER_ARGTYPES(double));
+template <>
+void bsrsv2_bufferSize>(
+    CUSPARSE_BSRSV2_BUFFER_ARGTYPES(c10::complex));
+template <>
+void bsrsv2_bufferSize>(
+    CUSPARSE_BSRSV2_BUFFER_ARGTYPES(c10::complex));
+
+#define CUSPARSE_BSRSV2_ANALYSIS_ARGTYPES(scalar_t)               \
+  cusparseHandle_t handle, cusparseDirection_t dirA,              \
+      cusparseOperation_t transA, int mb, int nnzb,               \
+      const cusparseMatDescr_t descrA, const scalar_t *bsrValA,   \
+      const int *bsrRowPtrA, const int *bsrColIndA, int blockDim, \
+      bsrsv2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer
+
+template <typename scalar_t>
+inline void bsrsv2_analysis(CUSPARSE_BSRSV2_ANALYSIS_ARGTYPES(scalar_t)) {
+  TORCH_INTERNAL_ASSERT(
+      false,
+      "at::cuda::sparse::bsrsv2_analysis: not implemented for ",
+      typeid(scalar_t).name());
+}
+
+template <>
+void bsrsv2_analysis<float>(CUSPARSE_BSRSV2_ANALYSIS_ARGTYPES(float));
+template <>
+void bsrsv2_analysis<double>(CUSPARSE_BSRSV2_ANALYSIS_ARGTYPES(double));
+template <>
+void bsrsv2_analysis<c10::complex<float>>(
+    CUSPARSE_BSRSV2_ANALYSIS_ARGTYPES(c10::complex<float>));
+template <>
+void bsrsv2_analysis<c10::complex<double>>(
+    CUSPARSE_BSRSV2_ANALYSIS_ARGTYPES(c10::complex<double>));
+
+#define CUSPARSE_BSRSV2_SOLVE_ARGTYPES(scalar_t)                           \
+  cusparseHandle_t handle, cusparseDirection_t dirA,                       \
+      cusparseOperation_t transA, int mb, int nnzb, const scalar_t *alpha, \
+      const cusparseMatDescr_t descrA, const scalar_t *bsrValA,            \
+      const int *bsrRowPtrA, const int *bsrColIndA, int blockDim,          \
+      bsrsv2Info_t info, const scalar_t *x, scalar_t *y,                   \
+      cusparseSolvePolicy_t policy, void *pBuffer
+
+template <typename scalar_t>
+inline void bsrsv2_solve(CUSPARSE_BSRSV2_SOLVE_ARGTYPES(scalar_t)) {
+  TORCH_INTERNAL_ASSERT(
+      false,
+      "at::cuda::sparse::bsrsv2_solve: not implemented for ",
+      typeid(scalar_t).name());
+}
+
+template <>
+void bsrsv2_solve<float>(CUSPARSE_BSRSV2_SOLVE_ARGTYPES(float));
+template <>
+void bsrsv2_solve<double>(CUSPARSE_BSRSV2_SOLVE_ARGTYPES(double));
+template <>
+void bsrsv2_solve<c10::complex<float>>(
+    CUSPARSE_BSRSV2_SOLVE_ARGTYPES(c10::complex<float>));
+template <>
+void bsrsv2_solve<c10::complex<double>>(
+    CUSPARSE_BSRSV2_SOLVE_ARGTYPES(c10::complex<double>));
+
+#define CUSPARSE_BSRSM2_BUFFER_ARGTYPES(scalar_t)                            \
+  cusparseHandle_t handle, cusparseDirection_t dirA,                         \
+      cusparseOperation_t transA, cusparseOperation_t transX, int mb, int n, \
+      int nnzb, const cusparseMatDescr_t descrA, scalar_t *bsrValA,          \
+      const int *bsrRowPtrA, const int *bsrColIndA, int blockDim,            \
+      bsrsm2Info_t info, int *pBufferSizeInBytes
+
+template <typename scalar_t>
+inline void bsrsm2_bufferSize(CUSPARSE_BSRSM2_BUFFER_ARGTYPES(scalar_t)) {
+  TORCH_INTERNAL_ASSERT(
+      false,
+      "at::cuda::sparse::bsrsm2_bufferSize: not implemented for ",
+      typeid(scalar_t).name());
+}
+
+template <>
+void bsrsm2_bufferSize<float>(CUSPARSE_BSRSM2_BUFFER_ARGTYPES(float));
+template <>
+void bsrsm2_bufferSize<double>(CUSPARSE_BSRSM2_BUFFER_ARGTYPES(double));
+template <>
+void bsrsm2_bufferSize<c10::complex<float>>(
+    CUSPARSE_BSRSM2_BUFFER_ARGTYPES(c10::complex<float>));
+template <>
+void bsrsm2_bufferSize<c10::complex<double>>(
+    CUSPARSE_BSRSM2_BUFFER_ARGTYPES(c10::complex<double>));
+
+#define CUSPARSE_BSRSM2_ANALYSIS_ARGTYPES(scalar_t)                          \
+  cusparseHandle_t handle, cusparseDirection_t dirA,                         \
+      cusparseOperation_t transA, cusparseOperation_t transX, int mb, int n, \
+      int nnzb, const cusparseMatDescr_t descrA, const scalar_t *bsrValA,    \
+      const int *bsrRowPtrA, const int *bsrColIndA, int blockDim,            \
+      bsrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer
+
+template <typename scalar_t>
+inline void bsrsm2_analysis(CUSPARSE_BSRSM2_ANALYSIS_ARGTYPES(scalar_t)) {
+  TORCH_INTERNAL_ASSERT(
+      false,
+      "at::cuda::sparse::bsrsm2_analysis: not implemented for ",
+      typeid(scalar_t).name());
+}
+
+template <>
+void bsrsm2_analysis<float>(CUSPARSE_BSRSM2_ANALYSIS_ARGTYPES(float));
+template <>
+void bsrsm2_analysis<double>(CUSPARSE_BSRSM2_ANALYSIS_ARGTYPES(double));
+template <>
+void bsrsm2_analysis<c10::complex<float>>(
+    CUSPARSE_BSRSM2_ANALYSIS_ARGTYPES(c10::complex<float>));
+template <>
+void bsrsm2_analysis<c10::complex<double>>(
+    CUSPARSE_BSRSM2_ANALYSIS_ARGTYPES(c10::complex<double>));
+
+#define CUSPARSE_BSRSM2_SOLVE_ARGTYPES(scalar_t)                             \
+  cusparseHandle_t handle, cusparseDirection_t dirA,                         \
+      cusparseOperation_t transA, cusparseOperation_t transX, int mb, int n, \
+      int nnzb, const scalar_t *alpha, const cusparseMatDescr_t descrA,      \
+      const scalar_t *bsrValA, const int *bsrRowPtrA, const int *bsrColIndA, \
+      int blockDim, bsrsm2Info_t info, const scalar_t *B, int ldb,           \
+      scalar_t *X, int ldx, cusparseSolvePolicy_t policy, void *pBuffer
+
+template <typename scalar_t>
+inline void bsrsm2_solve(CUSPARSE_BSRSM2_SOLVE_ARGTYPES(scalar_t)) {
+  TORCH_INTERNAL_ASSERT(
+      false,
+      "at::cuda::sparse::bsrsm2_solve: not implemented for ",
+      typeid(scalar_t).name());
+}
+
+template <>
+void bsrsm2_solve<float>(CUSPARSE_BSRSM2_SOLVE_ARGTYPES(float));
+template <>
+void bsrsm2_solve<double>(CUSPARSE_BSRSM2_SOLVE_ARGTYPES(double));
+template <>
+void bsrsm2_solve<c10::complex<float>>(
+    CUSPARSE_BSRSM2_SOLVE_ARGTYPES(c10::complex<float>));
+template <>
+void bsrsm2_solve<c10::complex<double>>(
+    CUSPARSE_BSRSM2_SOLVE_ARGTYPES(c10::complex<double>));
+
+#endif // AT_USE_HIPSPARSE_TRIANGULAR_SOLVE
+
+} // namespace at::cuda::sparse
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDASparseDescriptors.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDASparseDescriptors.h
new file mode 100644
index 0000000000000000000000000000000000000000..b5a5391ab3642ef5301b1d87a94a4306bf5f6929
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDASparseDescriptors.h
@@ -0,0 +1,290 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+#include 
+
+#if defined(USE_ROCM)
+#include 
+#endif
+
+namespace at::cuda::sparse {
+
+template <typename T, cusparseStatus_t (*destructor)(T*)>
+struct CuSparseDescriptorDeleter {
+  void operator()(T* x) {
+    if (x != nullptr) {
+      TORCH_CUDASPARSE_CHECK(destructor(x));
+    }
+  }
+};
+
+template <typename T, cusparseStatus_t (*destructor)(T*)>
+class CuSparseDescriptor {
+ public:
+  T* descriptor() const {
+    return descriptor_.get();
+  }
+  T* descriptor() {
+    return descriptor_.get();
+  }
+
+ protected:
+  std::unique_ptr<T, CuSparseDescriptorDeleter<T, destructor>> descriptor_;
+};
+
+#if AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS()
+template <typename T, cusparseStatus_t (*destructor)(const T*)>
+struct ConstCuSparseDescriptorDeleter {
+  void operator()(T* x) {
+    if (x != nullptr) {
+      TORCH_CUDASPARSE_CHECK(destructor(x));
+    }
+  }
+};
+
+template <typename T, cusparseStatus_t (*destructor)(const T*)>
+class ConstCuSparseDescriptor {
+ public:
+  T* descriptor() const {
+    return descriptor_.get();
+  }
+  T* descriptor() {
+    return descriptor_.get();
+  }
+
+ protected:
+  std::unique_ptr<T, ConstCuSparseDescriptorDeleter<T, destructor>> descriptor_;
+};
+#endif // AT_USE_CUSPARSE_CONST_DESCRIPTORS || AT_USE_HIPSPARSE_CONST_DESCRIPTORS
+
+#if defined(USE_ROCM)
+using cusparseMatDescr = std::remove_pointer<hipsparseMatDescr_t>::type;
+using cusparseDnMatDescr = std::remove_pointer<hipsparseDnMatDescr_t>::type;
+using cusparseDnVecDescr = std::remove_pointer<hipsparseDnVecDescr_t>::type;
+using cusparseSpMatDescr = std::remove_pointer<hipsparseSpMatDescr_t>::type;
+using cusparseSpMatDescr = std::remove_pointer<hipsparseSpMatDescr_t>::type;
+using cusparseSpGEMMDescr = std::remove_pointer<hipsparseSpGEMMDescr_t>::type;
+#if AT_USE_HIPSPARSE_TRIANGULAR_SOLVE()
+using bsrsv2Info = std::remove_pointer<bsrsv2Info_t>::type;
+using bsrsm2Info = std::remove_pointer<bsrsm2Info_t>::type;
+#endif
+#endif
+
+// NOTE: This is only needed for CUDA 11 and earlier, since CUDA 12 introduced
+// API for const descriptors
+cusparseStatus_t destroyConstDnMat(const cusparseDnMatDescr* dnMatDescr);
+
+class TORCH_CUDA_CPP_API CuSparseMatDescriptor
+    : public CuSparseDescriptor<cusparseMatDescr, &cusparseDestroyMatDescr> {
+ public:
+  CuSparseMatDescriptor() {
+    cusparseMatDescr_t raw_descriptor;
+    TORCH_CUDASPARSE_CHECK(cusparseCreateMatDescr(&raw_descriptor));
+    descriptor_.reset(raw_descriptor);
+  }
+
+  CuSparseMatDescriptor(bool upper, bool unit) {
+    cusparseFillMode_t fill_mode =
+        upper ? CUSPARSE_FILL_MODE_UPPER : CUSPARSE_FILL_MODE_LOWER;
+    cusparseDiagType_t diag_type =
+        unit ? CUSPARSE_DIAG_TYPE_UNIT : CUSPARSE_DIAG_TYPE_NON_UNIT;
+    cusparseMatDescr_t raw_descriptor;
+    TORCH_CUDASPARSE_CHECK(cusparseCreateMatDescr(&raw_descriptor));
+    TORCH_CUDASPARSE_CHECK(cusparseSetMatFillMode(raw_descriptor, fill_mode));
+    TORCH_CUDASPARSE_CHECK(cusparseSetMatDiagType(raw_descriptor, diag_type));
+    descriptor_.reset(raw_descriptor);
+  }
+};
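+
+// Illustrative usage sketch: the descriptor classes above are thin RAII
+// wrappers, so a typical call site constructs one on the stack and passes
+// descriptor() straight into the cuSPARSE call. The names `triangular` and
+// `descr` below are hypothetical, not part of this header.
+//
+//   at::cuda::sparse::CuSparseMatDescriptor triangular(/*upper=*/true,
+//                                                      /*unit=*/false);
+//   cusparseMatDescr_t descr = triangular.descriptor();
+//   // descr stays valid until `triangular` goes out of scope; the deleter
+//   // then calls cusparseDestroyMatDescr automatically.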
+
+#if AT_USE_HIPSPARSE_TRIANGULAR_SOLVE()
+
+class TORCH_CUDA_CPP_API CuSparseBsrsv2Info
+    : public CuSparseDescriptor<bsrsv2Info, &cusparseDestroyBsrsv2Info> {
+ public:
+  CuSparseBsrsv2Info() {
+    bsrsv2Info_t raw_descriptor;
+    TORCH_CUDASPARSE_CHECK(cusparseCreateBsrsv2Info(&raw_descriptor));
+    descriptor_.reset(raw_descriptor);
+  }
+};
+
+class TORCH_CUDA_CPP_API CuSparseBsrsm2Info
+    : public CuSparseDescriptor<bsrsm2Info, &cusparseDestroyBsrsm2Info> {
+ public:
+  CuSparseBsrsm2Info() {
+    bsrsm2Info_t raw_descriptor;
+    TORCH_CUDASPARSE_CHECK(cusparseCreateBsrsm2Info(&raw_descriptor));
+    descriptor_.reset(raw_descriptor);
+  }
+};
+
+#endif // AT_USE_HIPSPARSE_TRIANGULAR_SOLVE
+
+#if AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API()
+
+cusparseIndexType_t getCuSparseIndexType(const c10::ScalarType& scalar_type);
+
+#if AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS()
+class TORCH_CUDA_CPP_API CuSparseDnMatDescriptor
+    : public CuSparseDescriptor<cusparseDnMatDescr, &cusparseDestroyDnMat> {
+ public:
+  explicit CuSparseDnMatDescriptor(const Tensor& input, int64_t batch_offset = -1);
+};
+
+class TORCH_CUDA_CPP_API CuSparseConstDnMatDescriptor
+    : public CuSparseDescriptor<const cusparseDnMatDescr, &destroyConstDnMat> {
+ public:
+  explicit CuSparseConstDnMatDescriptor(const Tensor& input, int64_t batch_offset = -1);
+  cusparseDnMatDescr* unsafe_mutable_descriptor() const {
+    return const_cast<cusparseDnMatDescr*>(descriptor());
+  }
+  cusparseDnMatDescr* unsafe_mutable_descriptor() {
+    return const_cast<cusparseDnMatDescr*>(descriptor());
+  }
+};
+
+class TORCH_CUDA_CPP_API CuSparseDnVecDescriptor
+    : public CuSparseDescriptor<cusparseDnVecDescr, &cusparseDestroyDnVec> {
+ public:
+  explicit CuSparseDnVecDescriptor(const Tensor& input);
+};
+
+class TORCH_CUDA_CPP_API CuSparseSpMatDescriptor
+    : public CuSparseDescriptor<cusparseSpMatDescr, &cusparseDestroySpMat> {};
+
+#elif AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS()
+  class TORCH_CUDA_CPP_API CuSparseDnMatDescriptor
+      : public ConstCuSparseDescriptor<
+            cusparseDnMatDescr,
+            &cusparseDestroyDnMat> {
+   public:
+    explicit CuSparseDnMatDescriptor(
+        const Tensor& input,
+        int64_t batch_offset = -1);
+  };
+
+  class TORCH_CUDA_CPP_API CuSparseConstDnMatDescriptor
+      : public ConstCuSparseDescriptor<
+            const cusparseDnMatDescr,
+            &destroyConstDnMat> {
+   public:
+    explicit CuSparseConstDnMatDescriptor(
+        const Tensor& input,
+        int64_t batch_offset = -1);
+  cusparseDnMatDescr* unsafe_mutable_descriptor() const {
+    return const_cast<cusparseDnMatDescr*>(descriptor());
+  }
+  cusparseDnMatDescr* unsafe_mutable_descriptor() {
+    return const_cast<cusparseDnMatDescr*>(descriptor());
+  }
+  };
+
+  class TORCH_CUDA_CPP_API CuSparseDnVecDescriptor
+      : public ConstCuSparseDescriptor<
+            cusparseDnVecDescr,
+            &cusparseDestroyDnVec> {
+   public:
+    explicit CuSparseDnVecDescriptor(const Tensor& input);
+  };
+
+  class TORCH_CUDA_CPP_API CuSparseSpMatDescriptor
+      : public ConstCuSparseDescriptor<
+            cusparseSpMatDescr,
+            &cusparseDestroySpMat> {};
+#endif // AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS()
+
+class TORCH_CUDA_CPP_API CuSparseSpMatCsrDescriptor
+    : public CuSparseSpMatDescriptor {
+ public:
+  explicit CuSparseSpMatCsrDescriptor(const Tensor& input, int64_t batch_offset = -1);
+
+  std::tuple<int64_t, int64_t, int64_t> get_size() {
+    int64_t rows, cols, nnz;
+    TORCH_CUDASPARSE_CHECK(cusparseSpMatGetSize(
+        this->descriptor(),
+        &rows,
+        &cols,
+        &nnz));
+    return std::make_tuple(rows, cols, nnz);
+  }
+
+  void set_tensor(const Tensor& input) {
+    auto crow_indices = input.crow_indices();
+    auto col_indices = input.col_indices();
+    auto values = input.values();
+
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(crow_indices.is_contiguous());
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(col_indices.is_contiguous());
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(values.is_contiguous());
+    TORCH_CUDASPARSE_CHECK(cusparseCsrSetPointers(
+        this->descriptor(),
+        crow_indices.data_ptr(),
+        col_indices.data_ptr(),
+        values.data_ptr()));
+  }
+
+#if AT_USE_CUSPARSE_GENERIC_SPSV()
+  void set_mat_fill_mode(bool upper) {
+    cusparseFillMode_t fill_mode =
+        upper ? CUSPARSE_FILL_MODE_UPPER : CUSPARSE_FILL_MODE_LOWER;
+    TORCH_CUDASPARSE_CHECK(cusparseSpMatSetAttribute(
+        this->descriptor(),
+        CUSPARSE_SPMAT_FILL_MODE,
+        &fill_mode,
+        sizeof(fill_mode)));
+  }
+
+  void set_mat_diag_type(bool unit) {
+    cusparseDiagType_t diag_type =
+        unit ? CUSPARSE_DIAG_TYPE_UNIT : CUSPARSE_DIAG_TYPE_NON_UNIT;
+    TORCH_CUDASPARSE_CHECK(cusparseSpMatSetAttribute(
+        this->descriptor(),
+        CUSPARSE_SPMAT_DIAG_TYPE,
+        &diag_type,
+        sizeof(diag_type)));
+  }
+#endif
+};
+
+#if AT_USE_CUSPARSE_GENERIC_SPSV()
+class TORCH_CUDA_CPP_API CuSparseSpSVDescriptor
+    : public CuSparseDescriptor<cusparseSpSVDescr, &cusparseSpSV_destroyDescr> {
+ public:
+  CuSparseSpSVDescriptor() {
+    cusparseSpSVDescr_t raw_descriptor;
+    TORCH_CUDASPARSE_CHECK(cusparseSpSV_createDescr(&raw_descriptor));
+    descriptor_.reset(raw_descriptor);
+  }
+};
+#endif
+
+#if AT_USE_CUSPARSE_GENERIC_SPSM()
+class TORCH_CUDA_CPP_API CuSparseSpSMDescriptor
+    : public CuSparseDescriptor<cusparseSpSMDescr, &cusparseSpSM_destroyDescr> {
+ public:
+  CuSparseSpSMDescriptor() {
+    cusparseSpSMDescr_t raw_descriptor;
+    TORCH_CUDASPARSE_CHECK(cusparseSpSM_createDescr(&raw_descriptor));
+    descriptor_.reset(raw_descriptor);
+  }
+};
+#endif
+
+#if (defined(USE_ROCM) && ROCM_VERSION >= 50200) || !defined(USE_ROCM)
+class TORCH_CUDA_CPP_API CuSparseSpGEMMDescriptor
+    : public CuSparseDescriptor<cusparseSpGEMMDescr, &cusparseSpGEMM_destroyDescr> {
+ public:
+  CuSparseSpGEMMDescriptor() {
+    cusparseSpGEMMDescr_t raw_descriptor;
+    TORCH_CUDASPARSE_CHECK(cusparseSpGEMM_createDescr(&raw_descriptor));
+    descriptor_.reset(raw_descriptor);
+  }
+};
+#endif
+
+#endif // AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API()
+
+} // namespace at::cuda::sparse
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDATensorMethods.cuh b/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDATensorMethods.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..e3c8526a0004cde8198965f3aea34af25ac5c452
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDATensorMethods.cuh
@@ -0,0 +1,15 @@
+#pragma once
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+
+namespace at {
+template <>
+inline __half* Tensor::data() const {
+  return reinterpret_cast<__half*>(data<at::Half>());
+}
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDAUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDAUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..f41fae69ea89d078d61ebb3f698d0e24904761a0
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/CUDAUtils.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include 
+
+namespace at::cuda {
+
+// Check if every tensor in a list of tensors matches the current
+// device.
+inline bool check_device(ArrayRef<Tensor> ts) {
+  if (ts.empty()) {
+    return true;
+  }
+  Device curDevice = Device(kCUDA, current_device());
+  for (const Tensor& t : ts) {
+    if (t.device() != curDevice) return false;
+  }
+  return true;
+}
+
+} // namespace at::cuda
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/CachingHostAllocator.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/CachingHostAllocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..550000c0e580f0a91932c44161979922d5e00227
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/CachingHostAllocator.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include 
+#include 
+
+namespace at::cuda {
+
+//
+// A caching allocator for CUDA host allocations (pinned memory).
+//
+// This provides a drop-in replacement for THCudaHostAllocator, which re-uses
+// freed pinned (page-locked) memory allocations. This avoids device
+// synchronizations due to cudaFreeHost calls.
+//
+// To ensure correct behavior, THCCachingHostAllocator_recordEvent must be
+// called anytime a pointer from this allocator is used in a cudaMemcpyAsync
+// call between host and device, and passed the corresponding context from the
+// allocation. This is currently invoked by at::native::copy_kernel_cuda.
+//
+// Note that this allocator does not split larger allocations into smaller
+// blocks, unlike the caching device allocator.
+//
+TORCH_CUDA_CPP_API c10::Allocator* getCachingHostAllocator();
+
+// Records an event in the specified stream. The allocation corresponding to the
+// input `ptr`/`ctx` will not be re-used until the event has occurred.
+TORCH_CUDA_CPP_API bool
+CachingHostAllocator_recordEvent(void* ptr, void* ctx, c10::cuda::CUDAStream stream);
+
+// Releases cached pinned memory allocations via cudaFreeHost
+TORCH_CUDA_CPP_API void CachingHostAllocator_emptyCache();
+
+inline TORCH_CUDA_CPP_API at::DataPtr HostAlloc(size_t size) {
+  return getCachingHostAllocator()->allocate(size);
+}
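+
+// Illustrative usage sketch of the contract described above: after a
+// cudaMemcpyAsync on pinned memory obtained from this allocator, record an
+// event so the block is not recycled before the copy finishes. The helper
+// name `async_h2d_example` and its arguments are hypothetical.
+//
+//   inline void async_h2d_example(void* device_dst, size_t nbytes) {
+//     at::DataPtr pinned = at::cuda::HostAlloc(nbytes);
+//     auto stream = c10::cuda::getCurrentCUDAStream();
+//     AT_CUDA_CHECK(cudaMemcpyAsync(device_dst, pinned.get(), nbytes,
+//                                   cudaMemcpyHostToDevice, stream));
+//     at::cuda::CachingHostAllocator_recordEvent(
+//         pinned.get(), pinned.get_context(), stream);
+//   }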
+
+} // namespace at::cuda
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/DeviceUtils.cuh b/MLPY/Lib/site-packages/torch/include/ATen/cuda/DeviceUtils.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..3a258954db6306d16caf24906499faa7bc54aa77
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/DeviceUtils.cuh
@@ -0,0 +1,121 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+__device__ __forceinline__ unsigned int ACTIVE_MASK()
+{
+#if !defined(USE_ROCM)
+    return __activemask();
+#else
+// will be ignored anyway
+    return 0xffffffff;
+#endif
+}
+
+__device__ __forceinline__ void WARP_SYNC(unsigned mask = 0xffffffff) {
+#if !defined(USE_ROCM)
+  return __syncwarp(mask);
+#endif
+}
+
+#if defined(USE_ROCM)
+__device__ __forceinline__ unsigned long long int WARP_BALLOT(int predicate)
+{
+return __ballot(predicate);
+}
+#else
+__device__ __forceinline__ unsigned int WARP_BALLOT(int predicate, unsigned int mask = 0xffffffff)
+{
+#if !defined(USE_ROCM)
+    return __ballot_sync(mask, predicate);
+#else
+    return __ballot(predicate);
+#endif
+}
+#endif
+
+template <typename T>
+__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff)
+{
+#if !defined(USE_ROCM)
+    return __shfl_xor_sync(mask, value, laneMask, width);
+#else
+    return __shfl_xor(value, laneMask, width);
+#endif
+}
+
+template <typename T>
+__device__ __forceinline__ T WARP_SHFL(T value, int srcLane, int width = warpSize, unsigned int mask = 0xffffffff)
+{
+#if !defined(USE_ROCM)
+    return __shfl_sync(mask, value, srcLane, width);
+#else
+    return __shfl(value, srcLane, width);
+#endif
+}
+
+template <typename T>
+__device__ __forceinline__ T WARP_SHFL_UP(T value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff)
+{
+#if !defined(USE_ROCM)
+    return __shfl_up_sync(mask, value, delta, width);
+#else
+    return __shfl_up(value, delta, width);
+#endif
+}
+
+template <typename T>
+__device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff)
+{
+#if !defined(USE_ROCM)
+    return __shfl_down_sync(mask, value, delta, width);
+#else
+    return __shfl_down(value, delta, width);
+#endif
+}
+
+#if defined(USE_ROCM)
+template<>
+__device__ __forceinline__ int64_t WARP_SHFL_DOWN(int64_t value, unsigned int delta, int width , unsigned int mask)
+{
+  //(HIP doesn't support int64_t). Trick from https://devblogs.nvidia.com/faster-parallel-reductions-kepler/
+  int2 a = *reinterpret_cast<int2*>(&value);
+  a.x = __shfl_down(a.x, delta);
+  a.y = __shfl_down(a.y, delta);
+  return *reinterpret_cast<int64_t*>(&a);
+}
+#endif
+
+template<>
+__device__ __forceinline__ c10::Half WARP_SHFL_DOWN(c10::Half value, unsigned int delta, int width, unsigned int mask)
+{
+  return c10::Half(WARP_SHFL_DOWN(value.x, delta, width, mask), c10::Half::from_bits_t{});
+}
+
+template <typename T>
+__device__ __forceinline__ c10::complex<T> WARP_SHFL_DOWN(c10::complex<T> value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff)
+{
+#if !defined(USE_ROCM)
+    return c10::complex<T>(
+        __shfl_down_sync(mask, value.real_, delta, width),
+        __shfl_down_sync(mask, value.imag_, delta, width));
+#else
+    return c10::complex<T>(
+        __shfl_down(value.real_, delta, width),
+        __shfl_down(value.imag_, delta, width));
+#endif
+}
+
+/**
+ * For CC 3.5+, perform a load using __ldg
+ */
+template <typename T>
+__device__ __forceinline__ T doLdg(const T* p) {
+#if __CUDA_ARCH__ >= 350 && !defined(USE_ROCM)
+  return __ldg(p);
+#else
+  return *p;
+#endif
+}
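+
+// Illustrative sketch of how the shuffle wrappers above are typically used:
+// a warp-level sum reduction built on WARP_SHFL_DOWN. The helper name
+// `warp_reduce_sum_example` is hypothetical; it relies only on the built-in
+// warpSize variable, so it works for both 32-lane and 64-lane warps.
+template <typename T>
+__device__ __forceinline__ T warp_reduce_sum_example(T val) {
+  // Fold the upper half of the warp onto the lower half, halving the span
+  // each step; lane 0 ends up holding the sum over the whole warp.
+  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
+    val += WARP_SHFL_DOWN(val, offset);
+  }
+  return val;
+}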
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/EmptyTensor.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/EmptyTensor.h
new file mode 100644
index 0000000000000000000000000000000000000000..54942b88f761a277809e0901e931fcc6d18f950e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/EmptyTensor.h
@@ -0,0 +1,44 @@
+#pragma once
+#include 
+
+namespace at::detail {
+
+TORCH_CUDA_CPP_API TensorBase empty_cuda(
+    IntArrayRef size,
+    ScalarType dtype,
+    c10::optional<Device> device_opt,
+    c10::optional<c10::MemoryFormat> memory_format_opt);
+
+TORCH_CUDA_CPP_API TensorBase empty_cuda(
+    IntArrayRef size,
+    c10::optional<ScalarType> dtype_opt,
+    c10::optional<Layout> layout_opt,
+    c10::optional<Device> device_opt,
+    c10::optional<bool> pin_memory_opt,
+    c10::optional<c10::MemoryFormat> memory_format_opt);
+
+TORCH_CUDA_CPP_API TensorBase empty_cuda(
+    IntArrayRef size,
+    const TensorOptions &options);
+
+TORCH_CUDA_CPP_API TensorBase empty_strided_cuda(
+    IntArrayRef size,
+    IntArrayRef stride,
+    ScalarType dtype,
+    c10::optional<Device> device_opt);
+
+TORCH_CUDA_CPP_API TensorBase empty_strided_cuda(
+    IntArrayRef size,
+    IntArrayRef stride,
+    c10::optional<ScalarType> dtype_opt,
+    c10::optional<Layout> layout_opt,
+    c10::optional<Device> device_opt,
+    c10::optional<bool> pin_memory_opt);
+
+TORCH_CUDA_CPP_API TensorBase empty_strided_cuda(
+    IntArrayRef size,
+    IntArrayRef stride,
+    const TensorOptions &options);
+
+
+}  // namespace at::detail
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/Exceptions.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/Exceptions.h
new file mode 100644
index 0000000000000000000000000000000000000000..6f83d217db306ae038cc01023f54017c014af83e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/Exceptions.h
@@ -0,0 +1,174 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+#ifdef CUDART_VERSION
+#include 
+#endif
+
+#include 
+#include 
+#include 
+
+
+namespace c10 {
+
+class CuDNNError : public c10::Error {
+  using Error::Error;
+};
+
+}  // namespace c10
+
+#define AT_CUDNN_FRONTEND_CHECK(EXPR, ...)                                                      \
+  do {                                                                                          \
+    auto error_object = EXPR;                                                                   \
+    if (!error_object.is_good()) {                                                              \
+      TORCH_CHECK_WITH(CuDNNError, false,                                                       \
+            "cuDNN Frontend error: ", error_object.get_message());                              \
+    }                                                                                           \
+  } while (0)                                                                                   \
+
+#define AT_CUDNN_CHECK_WITH_SHAPES(EXPR, ...) AT_CUDNN_CHECK(EXPR, "\n", ##__VA_ARGS__)
+
+// See Note [CHECK macro]
+#define AT_CUDNN_CHECK(EXPR, ...)                                                               \
+  do {                                                                                          \
+    cudnnStatus_t status = EXPR;                                                                \
+    if (status != CUDNN_STATUS_SUCCESS) {                                                       \
+      if (status == CUDNN_STATUS_NOT_SUPPORTED) {                                               \
+        TORCH_CHECK_WITH(CuDNNError, false,                                                     \
+            "cuDNN error: ",                                                                    \
+            cudnnGetErrorString(status),                                                        \
+            ". This error may appear if you passed in a non-contiguous input.", ##__VA_ARGS__); \
+      } else {                                                                                  \
+        TORCH_CHECK_WITH(CuDNNError, false,                                                     \
+            "cuDNN error: ", cudnnGetErrorString(status), ##__VA_ARGS__);                       \
+      }                                                                                         \
+    }                                                                                           \
+  } while (0)
+
+namespace at::cuda::blas {
+C10_EXPORT const char* _cublasGetErrorEnum(cublasStatus_t error);
+} // namespace at::cuda::blas
+
+#define TORCH_CUDABLAS_CHECK(EXPR)                              \
+  do {                                                          \
+    cublasStatus_t __err = EXPR;                                \
+    TORCH_CHECK(__err == CUBLAS_STATUS_SUCCESS,                 \
+                "CUDA error: ",                                 \
+                at::cuda::blas::_cublasGetErrorEnum(__err),     \
+                " when calling `" #EXPR "`");                   \
+  } while (0)
+
+const char *cusparseGetErrorString(cusparseStatus_t status);
+
+#define TORCH_CUDASPARSE_CHECK(EXPR)                            \
+  do {                                                          \
+    cusparseStatus_t __err = EXPR;                              \
+    TORCH_CHECK(__err == CUSPARSE_STATUS_SUCCESS,               \
+                "CUDA error: ",                                 \
+                cusparseGetErrorString(__err),                  \
+                " when calling `" #EXPR "`");                   \
+  } while (0)
+
+// cusolver related headers are only supported on cuda now
+#ifdef CUDART_VERSION
+
+namespace at::cuda::solver {
+C10_EXPORT const char* cusolverGetErrorMessage(cusolverStatus_t status);
+
+constexpr const char* _cusolver_backend_suggestion =            \
+  "If you keep seeing this error, you may use "                 \
+  "`torch.backends.cuda.preferred_linalg_library()` to try "    \
+  "linear algebra operators with other supported backends. "    \
+  "See https://pytorch.org/docs/stable/backends.html#torch.backends.cuda.preferred_linalg_library";
+
+} // namespace at::cuda::solver
+
+// When cuda < 11.5, cusolver raises CUSOLVER_STATUS_EXECUTION_FAILED when input contains nan.
+// When cuda >= 11.5, cusolver normally finishes execution and sets info array indicating convergence issue.
+#define TORCH_CUSOLVER_CHECK(EXPR)                                      \
+  do {                                                                  \
+    cusolverStatus_t __err = EXPR;                                      \
+    if ((CUDA_VERSION < 11500 &&                                        \
+         __err == CUSOLVER_STATUS_EXECUTION_FAILED) ||                  \
+        (CUDA_VERSION >= 11500 &&                                       \
+         __err == CUSOLVER_STATUS_INVALID_VALUE)) {                     \
+      TORCH_CHECK_LINALG(                                               \
+          false,                                                        \
+          "cusolver error: ",                                           \
+          at::cuda::solver::cusolverGetErrorMessage(__err),             \
+          ", when calling `" #EXPR "`",                                 \
+          ". This error may appear if the input matrix contains NaN. ", \
+          at::cuda::solver::_cusolver_backend_suggestion);              \
+    } else {                                                            \
+      TORCH_CHECK(                                                      \
+          __err == CUSOLVER_STATUS_SUCCESS,                             \
+          "cusolver error: ",                                           \
+          at::cuda::solver::cusolverGetErrorMessage(__err),             \
+          ", when calling `" #EXPR "`. ",                               \
+          at::cuda::solver::_cusolver_backend_suggestion);              \
+    }                                                                   \
+  } while (0)
+
+#else
+#define TORCH_CUSOLVER_CHECK(EXPR) EXPR
+#endif
+
+#define AT_CUDA_CHECK(EXPR) C10_CUDA_CHECK(EXPR)
+
+// For CUDA Driver API
+//
+// This is here instead of in c10 because NVRTC is loaded dynamically via a stub
+// in ATen, and we need to use its nvrtcGetErrorString.
+// See NOTE [ USE OF NVRTC AND DRIVER API ].
+#if !defined(USE_ROCM)
+
+#define AT_CUDA_DRIVER_CHECK(EXPR)                                                                               \
+  do {                                                                                                           \
+    CUresult __err = EXPR;                                                                                       \
+    if (__err != CUDA_SUCCESS) {                                                                                 \
+      const char* err_str;                                                                                       \
+      CUresult get_error_str_err C10_UNUSED = at::globalContext().getNVRTC().cuGetErrorString(__err, &err_str);  \
+      if (get_error_str_err != CUDA_SUCCESS) {                                                                   \
+        AT_ERROR("CUDA driver error: unknown error");                                                            \
+      } else {                                                                                                   \
+        AT_ERROR("CUDA driver error: ", err_str);                                                                \
+      }                                                                                                          \
+    }                                                                                                            \
+  } while (0)
+
+#else
+
+#define AT_CUDA_DRIVER_CHECK(EXPR)                                                \
+  do {                                                                            \
+    CUresult __err = EXPR;                                                        \
+    if (__err != CUDA_SUCCESS) {                                                  \
+      AT_ERROR("CUDA driver error: ", static_cast(__err));                   \
+    }                                                                             \
+  } while (0)
+
+#endif
+
+// For CUDA NVRTC
+//
+// Note: As of CUDA 10, nvrtc error code 7, NVRTC_ERROR_BUILTIN_OPERATION_FAILURE,
+// incorrectly produces the error string "NVRTC unknown error."
+// The following maps it correctly.
+//
+// This is here instead of in c10 because NVRTC is loaded dynamically via a stub
+// in ATen, and we need to use its nvrtcGetErrorString.
+// See NOTE [ USE OF NVRTC AND DRIVER API ].
+#define AT_CUDA_NVRTC_CHECK(EXPR)                                                                   \
+  do {                                                                                              \
+    nvrtcResult __err = EXPR;                                                                       \
+    if (__err != NVRTC_SUCCESS) {                                                                   \
+      if (static_cast<int>(__err) != 7) {                                                          \
+        AT_ERROR("CUDA NVRTC error: ", at::globalContext().getNVRTC().nvrtcGetErrorString(__err));  \
+      } else {                                                                                      \
+        AT_ERROR("CUDA NVRTC error: NVRTC_ERROR_BUILTIN_OPERATION_FAILURE");                        \
+      }                                                                                             \
+    }                                                                                               \
+  } while (0)
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/NumericLimits.cuh b/MLPY/Lib/site-packages/torch/include/ATen/cuda/NumericLimits.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..d02b41a8157f30aeb4e91fc865ed654598318351
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/NumericLimits.cuh
@@ -0,0 +1,121 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+// NumericLimits.cuh is a holder for numeric limits definitions of commonly used
+// types. This header is very specific to ROCm HIP and may be removed in the future.
+// This header is derived from the legacy THCNumerics.cuh.
+
+// The lower_bound and upper_bound constants are same as lowest and max for
+// integral types, but are -inf and +inf for floating point types. They are
+// useful in implementing min, max, etc.
+
+namespace at {
+
+template <typename T>
+struct numeric_limits {
+};
+
+// WARNING: the following at::numeric_limits definitions are there only to support
+//          HIP compilation for the moment. Use std::numeric_limits if you are not
+//          compiling for ROCm.
+//          from @colesbury: "The functions on numeric_limits aren't marked with
+//          __device__ which is why they don't work with ROCm. CUDA allows them
+//          because they're constexpr."
+
+namespace {
+  // ROCm doesn't like INFINITY too.
+  constexpr double inf = INFINITY;
+}
+
+template <>
+struct numeric_limits<bool> {
+  static inline __host__ __device__ bool lowest() { return false; }
+  static inline __host__ __device__ bool max() { return true; }
+  static inline __host__ __device__ bool lower_bound() { return false; }
+  static inline __host__ __device__ bool upper_bound() { return true; }
+};
+
+template <>
+struct numeric_limits<uint8_t> {
+  static inline __host__ __device__ uint8_t lowest() { return 0; }
+  static inline __host__ __device__ uint8_t max() { return UINT8_MAX; }
+  static inline __host__ __device__ uint8_t lower_bound() { return 0; }
+  static inline __host__ __device__ uint8_t upper_bound() { return UINT8_MAX; }
+};
+
+template <>
+struct numeric_limits<int8_t> {
+  static inline __host__ __device__ int8_t lowest() { return INT8_MIN; }
+  static inline __host__ __device__ int8_t max() { return INT8_MAX; }
+  static inline __host__ __device__ int8_t lower_bound() { return INT8_MIN; }
+  static inline __host__ __device__ int8_t upper_bound() { return INT8_MAX; }
+};
+
+template <>
+struct numeric_limits<int16_t> {
+  static inline __host__ __device__ int16_t lowest() { return INT16_MIN; }
+  static inline __host__ __device__ int16_t max() { return INT16_MAX; }
+  static inline __host__ __device__ int16_t lower_bound() { return INT16_MIN; }
+  static inline __host__ __device__ int16_t upper_bound() { return INT16_MAX; }
+};
+
+template <>
+struct numeric_limits<int32_t> {
+  static inline __host__ __device__ int32_t lowest() { return INT32_MIN; }
+  static inline __host__ __device__ int32_t max() { return INT32_MAX; }
+  static inline __host__ __device__ int32_t lower_bound() { return INT32_MIN; }
+  static inline __host__ __device__ int32_t upper_bound() { return INT32_MAX; }
+};
+
+template <>
+struct numeric_limits<int64_t> {
+#ifdef _MSC_VER
+  static inline __host__ __device__ int64_t lowest() { return _I64_MIN; }
+  static inline __host__ __device__ int64_t max() { return _I64_MAX; }
+  static inline __host__ __device__ int64_t lower_bound() { return _I64_MIN; }
+  static inline __host__ __device__ int64_t upper_bound() { return _I64_MAX; }
+#else
+  static inline __host__ __device__ int64_t lowest() { return INT64_MIN; }
+  static inline __host__ __device__ int64_t max() { return INT64_MAX; }
+  static inline __host__ __device__ int64_t lower_bound() { return INT64_MIN; }
+  static inline __host__ __device__ int64_t upper_bound() { return INT64_MAX; }
+#endif
+};
+
+template <>
+struct numeric_limits<at::Half> {
+  static inline __host__ __device__ at::Half lowest() { return at::Half(0xFBFF, at::Half::from_bits()); }
+  static inline __host__ __device__ at::Half max() { return at::Half(0x7BFF, at::Half::from_bits()); }
+  static inline __host__ __device__ at::Half lower_bound() { return at::Half(0xFC00, at::Half::from_bits()); }
+  static inline __host__ __device__ at::Half upper_bound() { return at::Half(0x7C00, at::Half::from_bits()); }
+};
+
+template <>
+struct numeric_limits<at::BFloat16> {
+  static inline __host__ __device__ at::BFloat16 lowest() { return at::BFloat16(0xFF7F, at::BFloat16::from_bits()); }
+  static inline __host__ __device__ at::BFloat16 max() { return at::BFloat16(0x7F7F, at::BFloat16::from_bits()); }
+  static inline __host__ __device__ at::BFloat16 lower_bound() { return at::BFloat16(0xFF80, at::BFloat16::from_bits()); }
+  static inline __host__ __device__ at::BFloat16 upper_bound() { return at::BFloat16(0x7F80, at::BFloat16::from_bits()); }
+};
+
+template <>
+struct numeric_limits<float> {
+  static inline __host__ __device__ float lowest() { return -FLT_MAX; }
+  static inline __host__ __device__ float max() { return FLT_MAX; }
+  static inline __host__ __device__ float lower_bound() { return -static_cast(inf); }
+  static inline __host__ __device__ float upper_bound() { return static_cast(inf); }
+};
+
+template <>
+struct numeric_limits<double> {
+  static inline __host__ __device__ double lowest() { return -DBL_MAX; }
+  static inline __host__ __device__ double max() { return DBL_MAX; }
+  static inline __host__ __device__ double lower_bound() { return -inf; }
+  static inline __host__ __device__ double upper_bound() { return inf; }
+};
+
+} // namespace at
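+
+// Illustrative sketch of how the lower_bound/upper_bound pair is typically
+// used: seeding a running minimum with upper_bound() (+inf for floating
+// point, the type's max for integers) guarantees that the first real element
+// replaces the seed. The helper name is hypothetical.
+template <typename T>
+__host__ __device__ inline T running_min_seed_example() {
+  return at::numeric_limits<T>::upper_bound();
+}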
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/PeerToPeerAccess.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/PeerToPeerAccess.h
new file mode 100644
index 0000000000000000000000000000000000000000..bad21b18d83c2e8110607ff83153bc568717524c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/PeerToPeerAccess.h
@@ -0,0 +1,11 @@
+#include 
+#include 
+
+namespace at::cuda {
+namespace detail {
+void init_p2p_access_cache(int64_t num_devices);
+}
+
+TORCH_CUDA_CPP_API bool get_p2p_access(int source_dev, int dest_dev);
+
+}  // namespace at::cuda
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/PhiloxCudaState.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/PhiloxCudaState.h
new file mode 100644
index 0000000000000000000000000000000000000000..257ac6bbb896ab2883e7e85011ddee1426f53d15
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/PhiloxCudaState.h
@@ -0,0 +1,5 @@
+#pragma once
+
+#include 
+
+#include 
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/PhiloxUtils.cuh b/MLPY/Lib/site-packages/torch/include/ATen/cuda/PhiloxUtils.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..091dd5e4402b9987edebed96d6d06c3baffa8272
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/PhiloxUtils.cuh
@@ -0,0 +1,4 @@
+#pragma once
+
+#include 
+#include 
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/PinnedMemoryAllocator.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/PinnedMemoryAllocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..ff65549139607e2ea3d5378e953010f4ef6040fd
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/PinnedMemoryAllocator.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include 
+#include 
+
+namespace at::cuda {
+
+inline TORCH_CUDA_CPP_API at::Allocator* getPinnedMemoryAllocator() {
+  return getCachingHostAllocator();
+}
+} // namespace at::cuda
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/ScanUtils.cuh b/MLPY/Lib/site-packages/torch/include/ATen/cuda/ScanUtils.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..f81f560b4b523f8bc81423183cbbccca9c9d45e2
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/ScanUtils.cuh
@@ -0,0 +1,78 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+// Collection of in-kernel scan / prefix sum utilities
+
+namespace at::cuda {
+
+// Inclusive prefix sum for binary vars using intra-warp voting +
+// shared memory
+template <typename T, bool KillWARDependency, class BinaryFunction>
+__device__ void inclusiveBinaryPrefixScan(T* smem, bool in, T* out, BinaryFunction binop) {
+  // Within-warp, we use warp voting.
+#if defined (USE_ROCM)
+  unsigned long long int vote = WARP_BALLOT(in);
+  T index = __popcll(getLaneMaskLe() & vote);
+  T carry = __popcll(vote);
+#else
+  T vote = WARP_BALLOT(in);
+  T index = __popc(getLaneMaskLe() & vote);
+  T carry = __popc(vote);
+#endif
+
+  int warp = threadIdx.x / C10_WARP_SIZE;
+
+  // Per each warp, write out a value
+  if (getLaneId() == 0) {
+    smem[warp] = carry;
+  }
+
+  __syncthreads();
+
+  // Sum across warps in one thread. This appears to be faster than a
+  // warp shuffle scan for CC 3.0+
+  if (threadIdx.x == 0) {
+    int current = 0;
+    for (int i = 0; i < blockDim.x / C10_WARP_SIZE; ++i) {
+      T v = smem[i];
+      smem[i] = binop(smem[i], current);
+      current = binop(current, v);
+    }
+  }
+
+  __syncthreads();
+
+  // load the carry from the preceding warp
+  if (warp >= 1) {
+    index = binop(index, smem[warp - 1]);
+  }
+
+  *out = index;
+
+  if (KillWARDependency) {
+    __syncthreads();
+  }
+}
+
+// Exclusive prefix sum for binary vars using intra-warp voting +
+// shared memory
+template <typename T, bool KillWARDependency, class BinaryFunction>
+__device__ void exclusiveBinaryPrefixScan(T* smem, bool in, T* out, T* carry, BinaryFunction binop) {
+  inclusiveBinaryPrefixScan<T, false, BinaryFunction>(smem, in, out, binop);
+
+  // Inclusive to exclusive
+  *out -= (T) in;
+
+  // The outgoing carry for all threads is the last warp's sum
+  *carry = smem[at::ceil_div(blockDim.x, C10_WARP_SIZE) - 1];
+
+  if (KillWARDependency) {
+    __syncthreads();
+  }
+}
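+
+// Illustrative usage sketch: block-wide stream compaction, the classic use of
+// the exclusive binary scan above. Each thread with `keep == true` gets its
+// write index from the scan. All names (`smem`, `keep`, `out`, `value`,
+// `block_offset`) are hypothetical.
+//
+//   __shared__ int smem[32];  // one slot per warp in the block
+//   int index, carry;
+//   exclusiveBinaryPrefixScan<int, true>(
+//       smem, keep, &index, &carry,
+//       [] __device__ (int a, int b) { return a + b; });
+//   if (keep) {
+//     out[block_offset + index] = value;
+//   }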
+
+}  // namespace at::cuda
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/Sleep.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/Sleep.h
new file mode 100644
index 0000000000000000000000000000000000000000..f14fbb5a8f9720b5f0da97e3d65f63bf041c0a18
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/Sleep.h
@@ -0,0 +1,10 @@
+#pragma once
+#include 
+#include 
+
+namespace at::cuda {
+
+// enqueues a kernel that spins for the specified number of cycles
+TORCH_CUDA_CU_API void sleep(int64_t cycles);
+
+}  // namespace at::cuda
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/ThrustAllocator.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/ThrustAllocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..e7f56bd455e5a71bef001908cd55f0e40a45f6ad
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/ThrustAllocator.h
@@ -0,0 +1,23 @@
+#pragma once
+
+#include 
+#include 
+
+namespace at::cuda {
+
+/// Allocator for Thrust to re-route its internal device allocations
+/// to the THC allocator
+class ThrustAllocator {
+public:
+  typedef char value_type;
+
+  char* allocate(std::ptrdiff_t size) {
+    return static_cast<char*>(c10::cuda::CUDACachingAllocator::raw_alloc(size));
+  }
+
+  void deallocate(char* p, size_t size) {
+    c10::cuda::CUDACachingAllocator::raw_delete(p);
+  }
+};
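+
+// Illustrative usage sketch: Thrust algorithms accept this allocator through
+// their execution policy, so temporary buffers come from the CUDA caching
+// allocator instead of raw cudaMalloc. Assumes the usual Thrust headers
+// (thrust/execution_policy.h, thrust/sort.h) are included at the call site;
+// `d_keys` and `n` are hypothetical device data.
+//
+//   at::cuda::ThrustAllocator allocator;
+//   auto policy = thrust::cuda::par(allocator).on(
+//       at::cuda::getCurrentCUDAStream());
+//   thrust::sort(policy, d_keys, d_keys + n);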
+
+} // namespace at::cuda
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/cub.cuh b/MLPY/Lib/site-packages/torch/include/ATen/cuda/cub.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..954fac05ca331f5a6fd5b89eada0e8572ba1ec77
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/cub.cuh
@@ -0,0 +1,413 @@
+#pragma once
+#include 
+
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#if USE_GLOBAL_CUB_WRAPPED_NAMESPACE()
+
+#include <cub/cub.cuh>
+
+#else
+
+// include cub in a safe manner, see:
+// https://github.com/pytorch/pytorch/pull/55292
+#undef CUB_NS_POSTFIX //undef to avoid redefinition warnings
+#undef CUB_NS_PREFIX
+#undef CUB_NS_QUALIFIER
+#define CUB_NS_PREFIX namespace at_cuda_detail {
+#define CUB_NS_POSTFIX }
+#define CUB_NS_QUALIFIER ::at_cuda_detail::cub
+#include <cub/cub.cuh>
+#undef CUB_NS_POSTFIX
+#undef CUB_NS_PREFIX
+#undef CUB_NS_QUALIFIER
+
+#endif
+
+#include 
+#include 
+#include 
+
+// handle the temporary storage and 'twice' calls for cub API
+#define CUB_WRAPPER(func, ...) do {                                       \
+  size_t temp_storage_bytes = 0;                                          \
+  func(nullptr, temp_storage_bytes, __VA_ARGS__);                         \
+  auto& caching_allocator = *::c10::cuda::CUDACachingAllocator::get();    \
+  auto temp_storage = caching_allocator.allocate(temp_storage_bytes);     \
+  func(temp_storage.get(), temp_storage_bytes, __VA_ARGS__);              \
+  AT_CUDA_CHECK(cudaGetLastError());                                      \
+} while (false)
+
+#ifdef USE_ROCM
+#define NO_ROCM(x)
+#define ROCM_HIPCUB(x) ::hipcub
+#else
+#define NO_ROCM(x) x
+#define ROCM_HIPCUB(x) x
+#endif
+
+#if (!defined(USE_ROCM) && !CUB_SUPPORTS_NV_BFLOAT16()) || \
+     (defined(USE_ROCM) && ROCM_VERSION >= 40500)
+
+#if !defined(USE_ROCM)
+namespace at_cuda_detail {
+#endif
+
+// backport https://github.com/NVIDIA/cub/pull/306 for c10::BFloat16
+
+template <>
+struct ROCM_HIPCUB(cub)::FpLimits<c10::BFloat16>
+{
+    static __host__ __device__ __forceinline__ c10::BFloat16 Max() {
+        unsigned short max_word = 0x7F7F;
+        return reinterpret_cast<c10::BFloat16&>(max_word);
+    }
+
+    static __host__ __device__ __forceinline__ c10::BFloat16 Lowest() {
+        unsigned short lowest_word = 0xFF7F;
+        return reinterpret_cast<c10::BFloat16&>(lowest_word);
+    }
+};
+
+template <>
+struct ROCM_HIPCUB(cub)::NumericTraits<c10::BFloat16>:
+       ROCM_HIPCUB(cub)::BaseTraits<ROCM_HIPCUB(cub)::FLOATING_POINT, true, false, unsigned short, c10::BFloat16> {};
+
+#if !defined(USE_ROCM)
+} // namespace at_cuda_detail
+#endif
+
+#endif
+
+#if !defined(USE_ROCM)
+namespace at::native {
+namespace cub = ::at_cuda_detail::cub;
+} // namespace at::native
+#endif
+
+namespace at::cuda::cub {
+
+namespace detail {
+
+template<typename T>
+struct cuda_type {
+  using type = T;
+};
+template<>
+struct cuda_type<c10::Half> {
+  using type = __half;
+};
+
+#if !defined(USE_ROCM) && CUB_SUPPORTS_NV_BFLOAT16()
+
+template<>
+struct cuda_type<c10::BFloat16> {
+  using type = __nv_bfloat16;
+};
+
+#elif (defined(USE_ROCM) && ROCM_VERSION >= 40500)
+
+template<>
+struct cuda_type<c10::BFloat16> {
+  using type = hip_bfloat16;
+};
+
+#endif
+
+}  // namespace detail
+
+template<typename key_t, typename value_t, typename OffsetIteratorT>
+inline void segmented_sort_pairs(
+    const key_t *keys_in, key_t *keys_out,
+    const value_t *values_in, value_t *values_out,
+    int64_t num_elements, int64_t num_segments,
+    OffsetIteratorT begin_offsets, OffsetIteratorT end_offsets,
+    bool descending=false, int64_t begin_bit=0, int64_t end_bit=sizeof(key_t)*8
+) {
+  TORCH_CHECK(num_elements <= std::numeric_limits<int>::max(),
+    "cub sort does not support sorting more than INT_MAX elements");
+  TORCH_CHECK(num_segments <= std::numeric_limits<int>::max(),
+    "cub sort does not support sorting more than INT_MAX elements");
+  using key_t_ = typename detail::cuda_type<key_t>::type;
+
+  auto allocator = c10::cuda::CUDACachingAllocator::get();
+  c10::DataPtr keys_out_owner;
+
+  if (keys_out == nullptr) {
+    keys_out_owner = allocator->allocate(num_elements * sizeof(key_t));
+    keys_out = reinterpret_cast<key_t*>(keys_out_owner.get());
+  }
+
+  const key_t_ *keys_in_ = reinterpret_cast<const key_t_*>(keys_in);
+  key_t_ *keys_out_ = reinterpret_cast<key_t_*>(keys_out);
+
+  if (descending) {
+    CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceSegmentedRadixSort::SortPairsDescending,
+      keys_in_, keys_out_, values_in, values_out,
+      num_elements, num_segments, begin_offsets, end_offsets,
+      begin_bit, end_bit, c10::cuda::getCurrentCUDAStream());
+  } else {
+    CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceSegmentedRadixSort::SortPairs,
+      keys_in_, keys_out_, values_in, values_out,
+      num_elements, num_segments, begin_offsets, end_offsets,
+      begin_bit, end_bit, c10::cuda::getCurrentCUDAStream());
+  }
+}
+
+#if CUB_SUPPORTS_UNIQUE_BY_KEY()
+template <typename KeysInputIteratorT, typename ValuesInputIteratorT, typename KeysOutputIteratorT, typename ValuesOutputIteratorT, typename NumSelectedIteratorT>
+inline void unique_by_key(
+  KeysInputIteratorT keys_in, ValuesInputIteratorT values_in,
+  KeysOutputIteratorT keys_out, ValuesOutputIteratorT values_out,
+  NumSelectedIteratorT num_selected, int64_t num_input_items)
+{
+  // TODO: use thrust::discard_iterator to handle null keys_out when https://github.com/NVIDIA/cub/issues/406 is fixed.
+  constexpr bool null_keys_out = std::is_same<KeysOutputIteratorT, std::nullptr_t>::value;
+  using KeyT = typename std::iterator_traits<KeysInputIteratorT>::value_type;
+  using RealKeysOutputIteratorT = typename std::conditional<null_keys_out, KeyT*, KeysOutputIteratorT>::type;
+  RealKeysOutputIteratorT keys_out_;
+  auto allocator = c10::cuda::CUDACachingAllocator::get();
+  c10::DataPtr keys_out_owner;
+  if constexpr (null_keys_out) {
+    keys_out_owner = allocator->allocate(num_input_items * sizeof(KeyT));
+    keys_out_ = static_cast<KeyT*>(keys_out_owner.get());
+  } else {
+    keys_out_ = keys_out;
+  }
+  CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceSelect::UniqueByKey,
+    keys_in, values_in, keys_out_, values_out, num_selected, num_input_items, c10::cuda::getCurrentCUDAStream());
+}
+#endif
+
+namespace impl {
+
+template<typename InputIteratorT1, typename InputIteratorT2, typename OutputIteratorT, typename ScanOpT>
+C10_LAUNCH_BOUNDS_1(1)
+__global__ void transform_vals(InputIteratorT1 a, InputIteratorT2 b, OutputIteratorT out, ScanOpT scan_op){
+  // NOTE: out here not the final scan output, but an intermediate of the accumulation type.
+  using acc_t = typename std::iterator_traits<OutputIteratorT>::value_type;
+  *out = scan_op(static_cast<acc_t>(*a), static_cast<acc_t>(*b));
+}
+
+#if !CUB_SUPPORTS_FUTURE_VALUE()
+template<typename ValueT, typename InputIteratorT>
+struct chained_iterator {
+  using iterator_category = std::random_access_iterator_tag;
+  using difference_type   = std::ptrdiff_t;
+  using value_type        = ValueT;
+  using pointer           = ValueT*;
+  using reference         = ValueT&;
+
+  InputIteratorT iter;
+  ValueT *first;
+  difference_type offset = 0;
+
+  __device__ ValueT operator[](difference_type i) {
+    i +=  offset;
+    if (i == 0) {
+      return *first;
+    } else {
+      return ValueT(iter[i - 1]);
+    }
+  }
+  __device__ chained_iterator operator+(difference_type i) {
+    return chained_iterator{iter, first, i};
+  }
+  __device__ ValueT operator*() {
+    return (*this)[0];
+  }
+};
+#endif
+
+// even though cub is supposed to support tensors with int_max elements, in reality it doesn't,
+// so split at int_max/2
+constexpr int max_cub_size = std::numeric_limits<int>::max() / 2 + 1; // 2**30
+}
+
+// non synchronizing cub call
+// even though cub is supposed to support tensors with int_max elements, in reality it doesn't,
+// so split at int_max/2
+template<typename InputIteratorT, typename OutputIteratorT, typename ScanOpT>
+inline void inclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT scan_op, int64_t num_items) {
+#if defined(USE_ROCM) && (ROCM_VERSION >= 50000)
+  //For ROCm, use hipCUB chained iterators
+  CUB_WRAPPER(NO_ROCM(detail)::hipcub::DeviceScan::InclusiveScan,
+      input,
+      output,
+      scan_op,
+      num_items,
+      at::cuda::getCurrentCUDAStream());
+  C10_HIP_KERNEL_LAUNCH_CHECK();
+#else
+  // non synchronizing cub call
+  // even though cub is supposed to support tensors with int_max elements, in reality it doesn't,
+  // so split at int_max/2
+  int size_cub = std::min<int64_t>(num_items, max_cub_size);
+  CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceScan::InclusiveScan,
+      input,
+      output,
+      scan_op,
+      size_cub,
+      at::cuda::getCurrentCUDAStream());
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+  using input_t = typename std::iterator_traits<InputIteratorT>::value_type;
+  for (int64_t i = max_cub_size; i < num_items; i += max_cub_size) {
+    auto allocator = c10::cuda::CUDACachingAllocator::get();
+    c10::DataPtr first_elem = allocator->allocate(sizeof(input_t));
+    auto first_elem_ptr = reinterpret_cast<input_t*>(first_elem.get());
+
+    size_cub = std::min<int64_t>(num_items - i, max_cub_size);
+    impl::transform_vals<<<1, 1, 0, at::cuda::getCurrentCUDAStream()>>>(
+        output + i - 1,
+        input + i,
+        first_elem_ptr,
+        scan_op);
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+#if !CUB_SUPPORTS_FUTURE_VALUE()
+    using ArgIndexInputIterator = NO_ROCM(at_cuda_detail)::cub::ArgIndexInputIterator<InputIteratorT>;
+    using tuple = typename ArgIndexInputIterator::value_type;
+    auto input_iter_transform = [=] __device__ (const tuple &x)->input_t  {
+      if (x.key == 0) {
+        return *first_elem_ptr;
+      } else {
+        return x.value;
+      }
+    };
+    auto input_ = NO_ROCM(at_cuda_detail)::cub::TransformInputIterator<input_t, decltype(input_iter_transform), ArgIndexInputIterator>(
+      ArgIndexInputIterator(input + i), input_iter_transform);
+    CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceScan::InclusiveScan,
+        input_,
+        output + i,
+        scan_op,
+        size_cub,
+        at::cuda::getCurrentCUDAStream());
+#else
+    CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceScan::ExclusiveScan,
+        input + i + 1,
+        output + i,
+        scan_op,
+        ::at_cuda_detail::cub::FutureValue<input_t>(first_elem_ptr),
+        size_cub,
+        at::cuda::getCurrentCUDAStream());
+#endif
+  }
+#endif
+}
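+
+// Illustrative usage sketch: `d_in`, `d_out`, and `n` are hypothetical device
+// pointers/sizes. The scan op must be device-callable, e.g. cub's Sum functor
+// on CUDA builds; the wrapper takes care of temporary storage and of splitting
+// inputs larger than INT_MAX/2.
+//
+//   at::cuda::cub::inclusive_scan(
+//       d_in, d_out, NO_ROCM(at_cuda_detail)::cub::Sum(), n);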
+
+template<typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT>
+inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT scan_op, InitValueT init_value, int64_t num_items) {
+#if defined(USE_ROCM) && (ROCM_VERSION >= 50000)
+  //For ROCm, use hipCUB chained iterators
+  CUB_WRAPPER(NO_ROCM(detail)::hipcub::DeviceScan::ExclusiveScan,
+      input,
+      output,
+      scan_op,
+      init_value,
+      num_items,
+      at::cuda::getCurrentCUDAStream());
+  C10_HIP_KERNEL_LAUNCH_CHECK();
+#else
+  // non synchronizing cub call
+  // even though cub is supposed to support tensors with int_max elements, in reality it doesn't,
+  // so split at int_max/2
+  int size_cub = std::min<int64_t>(num_items, max_cub_size);
+  CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceScan::ExclusiveScan,
+      input,
+      output,
+      scan_op,
+      init_value,
+      size_cub,
+      at::cuda::getCurrentCUDAStream());
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+  for (int64_t i = max_cub_size; i < num_items; i += max_cub_size) {
+    auto allocator = c10::cuda::CUDACachingAllocator::get();
+    c10::DataPtr first_elem = allocator->allocate(sizeof(InitValueT));
+    auto first_elem_ptr = reinterpret_cast<InitValueT*>(first_elem.get());
+
+    size_cub = std::min<int64_t>(num_items - i, max_cub_size);
+    impl::transform_vals<<<1, 1, 0, at::cuda::getCurrentCUDAStream()>>>(
+        output + i - 1,
+        input + i - 1,
+        first_elem_ptr,
+        scan_op);
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+#if !CUB_SUPPORTS_FUTURE_VALUE()
+    auto input_ = impl::chained_iterator<InitValueT, InputIteratorT>{
+      input + i, first_elem_ptr};
+    CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceScan::InclusiveScan,
+        input_,
+        output + i,
+        scan_op,
+        size_cub,
+        at::cuda::getCurrentCUDAStream());
+#else
+    CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceScan::ExclusiveScan,
+        input + i,
+        output + i,
+        scan_op,
+        ::at_cuda_detail::cub::FutureValue<InitValueT>(first_elem_ptr),
+        size_cub,
+        at::cuda::getCurrentCUDAStream());
+#endif
+  }
+#endif
+}
+
+#if CUB_SUPPORTS_SCAN_BY_KEY()
+
+template <typename KeysInputIteratorT, typename ValuesInputIteratorT, typename ValuesOutputIteratorT>
+inline void inclusive_sum_by_key(KeysInputIteratorT keys, ValuesInputIteratorT input, ValuesOutputIteratorT output, int64_t num_items) {
+  TORCH_CHECK(num_items <= std::numeric_limits<int>::max(),
+    "cub InclusiveSumByKey does not support more than INT_MAX elements");
+  CUB_WRAPPER(at_cuda_detail::cub::DeviceScan::InclusiveSumByKey,
+      keys, input, output, num_items, at_cuda_detail::cub::Equality(), at::cuda::getCurrentCUDAStream());
+}
+
+template <typename KeysInputIteratorT, typename ValuesInputIteratorT, typename ValuesOutputIteratorT, typename ScanOpT>
+inline void inclusive_scan_by_key(KeysInputIteratorT keys, ValuesInputIteratorT input, ValuesOutputIteratorT output, ScanOpT scan_op, int64_t num_items) {
+  TORCH_CHECK(num_items <= std::numeric_limits<int>::max(),
+    "cub InclusiveSumByKey does not support more than INT_MAX elements");
+  CUB_WRAPPER(at_cuda_detail::cub::DeviceScan::InclusiveScanByKey,
+      keys, input, output, scan_op, num_items, at_cuda_detail::cub::Equality(), at::cuda::getCurrentCUDAStream());
+}
+
+#endif
+
+template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT>
+void unique(InputIteratorT input, OutputIteratorT output,
+            NumSelectedIteratorT num_selected_out, int64_t num_items) {
+  TORCH_CHECK(num_items <= std::numeric_limits<int>::max(),
+              "cub unique does not support more than INT_MAX elements");
+  CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceSelect::Unique,
+              input, output, num_selected_out, num_items, at::cuda::getCurrentCUDAStream());
+}
+
+template <typename InputIteratorT, typename OutputIteratorT, typename CountsOutputIteratorT, typename LengthOutputIteratorT>
+void run_length_encode(InputIteratorT input, OutputIteratorT output, CountsOutputIteratorT counts_out,
+                       LengthOutputIteratorT length_out, int64_t num_items) {
+  TORCH_CHECK(num_items <= std::numeric_limits<int>::max(),
+              "cub run_length_encode does not support more than INT_MAX elements");
+  CUB_WRAPPER(
+      NO_ROCM(at_cuda_detail)::cub::DeviceRunLengthEncode::Encode,
+      input, output, counts_out, length_out, num_items,
+      at::cuda::getCurrentCUDAStream());
+}
+
+template <typename InputIteratorT, typename OutputIteratorT, typename ReductionOpT, typename T>
+void reduce(InputIteratorT input, OutputIteratorT output, int64_t num_items, ReductionOpT op, T init) {
+  TORCH_CHECK(num_items <= std::numeric_limits<int>::max(),
+              "cub reduce does not support more than INT_MAX elements");
+  CUB_WRAPPER(
+      NO_ROCM(at_cuda_detail)::cub::DeviceReduce::Reduce,
+      input, output, num_items, op, init,
+      at::cuda::getCurrentCUDAStream());
+
+}
+
+}  // namespace at::cuda::cub
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/cub.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/cub.h
new file mode 100644
index 0000000000000000000000000000000000000000..37e9867f39be900c5d9a0a1e525cb94676dc134b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/cub.h
@@ -0,0 +1,87 @@
+#pragma once
+#include 
+#include 
+#include 
+
+// NOTE: These templates are intentionally not defined in this header,
+// which avoids re-compiling them for each translation unit. If you get
+// a link error, you need to add an explicit instantiation for your
+// types in cub.cu
+
+namespace at::cuda::cub {
+
+inline int get_num_bits(uint64_t max_key) {
+  int num_bits = 1;
+  while (max_key > 1) {
+    max_key >>= 1;
+    num_bits++;
+  }
+  return num_bits;
+}
+
+namespace detail {
+
+// radix_sort_pairs doesn't interact with value_t other than to copy
+// the data, so we can save template instantiations by reinterpreting
+// it as an opaque type.
+template <int N> struct alignas(N) OpaqueType { char data[N]; };
+
+template<typename key_t, int value_size>
+void radix_sort_pairs_impl(
+    const key_t *keys_in, key_t *keys_out,
+    const OpaqueType<value_size> *values_in, OpaqueType<value_size> *values_out,
+    int64_t n, bool descending, int64_t begin_bit, int64_t end_bit);
+
+}  // namespace detail
+
+template<typename key_t, typename value_t>
+void radix_sort_pairs(
+    const key_t *keys_in, key_t *keys_out,
+    const value_t *values_in, value_t *values_out,
+    int64_t n, bool descending=false, int64_t begin_bit=0, int64_t end_bit=sizeof(key_t)*8) {
+  static_assert(std::is_trivially_copyable<value_t>::value ||
+                AT_ROCM_ENABLED(),  // ROCm incorrectly fails this check for vector types
+                "radix_sort_pairs value type must be trivially copyable");
+  // Make value type opaque, so all inputs of a certain size use the same template instantiation
+  using opaque_t = detail::OpaqueType<sizeof(value_t)>;
+  static_assert(sizeof(value_t) <= 8 && (sizeof(value_t) & (sizeof(value_t) - 1)) == 0,
+                "This size of value_t is not instantiated. Please instantiate it in cub.cu"
+                " and modify this check.");
+  static_assert(sizeof(value_t) == alignof(value_t), "Expected value_t to be size-aligned");
+  detail::radix_sort_pairs_impl(
+      keys_in, keys_out,
+      reinterpret_cast<const opaque_t*>(values_in),
+      reinterpret_cast<opaque_t*>(values_out),
+      n, descending, begin_bit, end_bit);
+}
+
+template<typename key_t>
+void radix_sort_keys(
+    const key_t *keys_in, key_t *keys_out,
+    int64_t n, bool descending=false, int64_t begin_bit=0, int64_t end_bit=sizeof(key_t)*8);
+
+// NOTE: Intermediate sums will be truncated to input_t precision
+template <typename input_t, typename output_t>
+void inclusive_sum_truncating(const input_t *input, output_t *output, int64_t n);
+
+template <typename scalar_t>
+void inclusive_sum(const scalar_t *input, scalar_t *output, int64_t n) {
+  return inclusive_sum_truncating(input, output, n);
+}
+
+// NOTE: Sums are done in common_type<input_t, output_t>
+template <typename input_t, typename output_t>
+void exclusive_sum_in_common_type(const input_t *input, output_t *output, int64_t n);
+
+template <typename scalar_t>
+void exclusive_sum(const scalar_t *input, scalar_t *output, int64_t n) {
+  return exclusive_sum_in_common_type(input, output, n);
+}
+
+void mask_exclusive_sum(const uint8_t *mask, int64_t *output_idx, int64_t n);
+inline void mask_exclusive_sum(const bool *mask, int64_t *output_idx, int64_t n) {
+  return mask_exclusive_sum(
+      reinterpret_cast<const uint8_t*>(mask), output_idx, n);
+}
+
+}  // namespace at::cuda::cub
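
// A standalone sketch of the OpaqueType trick radix_sort_pairs uses above: the sort
// only copies values, so every trivially copyable value type of a given size can be
// funneled through one instantiation that moves opaque, size-aligned byte blobs.
// A host std::stable_sort stands in for the device radix sort, and all names here
// are illustrative; the real header dispatches to radix_sort_pairs_impl in cub.cu.
#include <algorithm>
#include <cstdint>
#include <type_traits>
#include <vector>

template <int N> struct alignas(N) Opaque { char data[N]; };

template <typename key_t, int value_size>
void sort_pairs_impl(key_t* keys, Opaque<value_size>* values, int64_t n) {
  // Reorder keys and the opaque payloads together by ascending key.
  std::vector<int64_t> idx(n);
  for (int64_t i = 0; i < n; ++i) idx[i] = i;
  std::stable_sort(idx.begin(), idx.end(),
                   [&](int64_t a, int64_t b) { return keys[a] < keys[b]; });
  std::vector<key_t> k(keys, keys + n);
  std::vector<Opaque<value_size>> v(values, values + n);
  for (int64_t i = 0; i < n; ++i) { keys[i] = k[idx[i]]; values[i] = v[idx[i]]; }
}

template <typename key_t, typename value_t>
void sort_pairs(key_t* keys, value_t* values, int64_t n) {
  static_assert(std::is_trivially_copyable<value_t>::value, "values are only copied");
  using opaque_t = Opaque<sizeof(value_t)>;
  // One sort_pairs_impl instantiation per (key type, value size), not per value type.
  sort_pairs_impl<key_t, sizeof(value_t)>(keys, reinterpret_cast<opaque_t*>(values), n);
}
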
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/cub_definitions.cuh b/MLPY/Lib/site-packages/torch/include/ATen/cuda/cub_definitions.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..a88086ae6d6a9b9e9dcd7a69822ef30d58481925
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/cub_definitions.cuh
@@ -0,0 +1,53 @@
+#pragma once
+
+#if !defined(USE_ROCM)
+#include <cuda.h>  // for CUDA_VERSION
+#endif
+
+#if !defined(USE_ROCM)
+#include <cub/version.cuh>
+#else
+#define CUB_VERSION 0
+#endif
+
+// cub sort support for __nv_bfloat16 is added to cub 1.13 in:
+// https://github.com/NVIDIA/cub/pull/306
+#if CUB_VERSION >= 101300
+#define CUB_SUPPORTS_NV_BFLOAT16() true
+#else
+#define CUB_SUPPORTS_NV_BFLOAT16() false
+#endif
+
+// cub support for CUB_WRAPPED_NAMESPACE is added to cub 1.13.1 in:
+// https://github.com/NVIDIA/cub/pull/326
+// CUB_WRAPPED_NAMESPACE is defined globally in cmake/Dependencies.cmake
+// starting from CUDA 11.5
+#if defined(CUB_WRAPPED_NAMESPACE) || defined(THRUST_CUB_WRAPPED_NAMESPACE)
+#define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() true
+#else
+#define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() false
+#endif
+
+// cub support for UniqueByKey is added to cub 1.16 in:
+// https://github.com/NVIDIA/cub/pull/405
+#if CUB_VERSION >= 101600
+#define CUB_SUPPORTS_UNIQUE_BY_KEY() true
+#else
+#define CUB_SUPPORTS_UNIQUE_BY_KEY() false
+#endif
+
+// cub support for scan by key is added to cub 1.15
+// in https://github.com/NVIDIA/cub/pull/376
+#if CUB_VERSION >= 101500
+#define CUB_SUPPORTS_SCAN_BY_KEY() 1
+#else
+#define CUB_SUPPORTS_SCAN_BY_KEY() 0
+#endif
+
+// cub support for cub::FutureValue is added to cub 1.15 in:
+// https://github.com/NVIDIA/cub/pull/305
+#if CUB_VERSION >= 101500
+#define CUB_SUPPORTS_FUTURE_VALUE() true
+#else
+#define CUB_SUPPORTS_FUTURE_VALUE() false
+#endif
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/detail/CUDAHooks.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/detail/CUDAHooks.h
new file mode 100644
index 0000000000000000000000000000000000000000..904d333f72709afe077c015670f2a932c29c9882
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/detail/CUDAHooks.h
@@ -0,0 +1,54 @@
+#pragma once
+
+#include 
+
+#include 
+#include 
+
+// TODO: No need to have this whole header, we can just put it all in
+// the cpp file
+
+namespace at::cuda::detail {
+
+// Set the callback to initialize Magma, which is set by
+// torch_cuda_cu. This indirection is required so magma_init is called
+// in the same library where Magma will be used.
+TORCH_CUDA_CPP_API void set_magma_init_fn(void (*magma_init_fn)());
+
+
+// The real implementation of CUDAHooksInterface
+struct CUDAHooks : public at::CUDAHooksInterface {
+  CUDAHooks(at::CUDAHooksArgs) {}
+  void initCUDA() const override;
+  Device getDeviceFromPtr(void* data) const override;
+  bool isPinnedPtr(const void* data) const override;
+  const Generator& getDefaultCUDAGenerator(DeviceIndex device_index = -1) const override;
+  bool hasCUDA() const override;
+  bool hasMAGMA() const override;
+  bool hasCuDNN() const override;
+  bool hasCuSOLVER() const override;
+  bool hasROCM() const override;
+  const at::cuda::NVRTC& nvrtc() const override;
+  DeviceIndex current_device() const override;
+  bool hasPrimaryContext(DeviceIndex device_index) const override;
+  Allocator* getCUDADeviceAllocator() const override;
+  Allocator* getPinnedMemoryAllocator() const override;
+  bool compiledWithCuDNN() const override;
+  bool compiledWithMIOpen() const override;
+  bool supportsDilatedConvolutionWithCuDNN() const override;
+  bool supportsDepthwiseConvolutionWithCuDNN() const override;
+  bool supportsBFloat16ConvolutionWithCuDNNv8() const override;
+  bool hasCUDART() const override;
+  long versionCUDART() const override;
+  long versionCuDNN() const override;
+  std::string showConfig() const override;
+  double batchnormMinEpsilonCuDNN() const override;
+  int64_t cuFFTGetPlanCacheMaxSize(DeviceIndex device_index) const override;
+  void cuFFTSetPlanCacheMaxSize(DeviceIndex device_index, int64_t max_size) const override;
+  int64_t cuFFTGetPlanCacheSize(DeviceIndex device_index) const override;
+  void cuFFTClearPlanCache(DeviceIndex device_index) const override;
+  int getNumGPUs() const override;
+  void deviceSynchronize(DeviceIndex device_index) const override;
+};
+
+} // at::cuda::detail
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/detail/DeviceThreadHandles.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/detail/DeviceThreadHandles.h
new file mode 100644
index 0000000000000000000000000000000000000000..e17eed4b63a1c84f7f873e813ab9ed8bdf849472
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/detail/DeviceThreadHandles.h
@@ -0,0 +1,151 @@
+// Some stateful GPU libraries, such as cuDNN and cuBLAS, use handles to store state.
+// These handles are tied to a device, and these libraries require (or recommend) not
+// sharing handles across host threads.
+//
+// These libraries recommend using one handle per host thread. We may not want to do
+// this because threads are relatively light-weight, but creating and destroying
+// handles is expensive (destroying the handle causes synchronizations). DataParallel,
+// for example, creates new threads for each forward pass.
+//
+// This file implements a handle pool mechanism. The handle pool returns handles on
+// demand as threads request them. If all existing handles in the pool are in use,
+// it creates a new one. As threads terminate, they release handles back into the pool.
+// In this way, the handle pool never creates more handles than the high-water mark of
+// active threads, so it's efficient with DataParallel.
+
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+namespace at::cuda { namespace {
+
+template <typename Handle_t, void Create(Handle_t *), void Destroy(Handle_t)>
+struct DeviceThreadHandlePool : public std::enable_shared_from_this<DeviceThreadHandlePool<Handle_t, Create, Destroy>> {
+
+    struct Handle {
+    Handle_t handle;
+    Handle(bool create = false) : handle(nullptr)
+    {
+        if(create) Create(&handle);
+    }
+    // std::vector.emplace() and push_back() may route through temporaries and call
+    // copy/move constructors along the way.  If this is the case, we don't want
+    // the destructors of temporaries to call cudnnDestroy on the handle.
+    // We can achieve safety (for the narrow case of stashing within std::vectors)
+    // by making Handle moveable but not copyable, and transferring handle ownership
+    // to the latest constructed object.  This is not a substitute for full-blown
+    // reference counting, but reference counting may be overkill here.
+    // Another alternative is to wrap the saved Handles in unique_ptrs, i.e.,
+    // unordered_map>> created_handles;
+    Handle(const Handle& rhs) = delete;
+    // Following https://stackoverflow.com/questions/3279543/what-is-the-copy-and-swap-idiom
+    Handle(Handle&& rhs) : Handle() { std::swap(handle, rhs.handle); }
+    // operator= takes argument by value
+    Handle& operator=(Handle rhs) { std::swap(handle, rhs.handle); return *this; }
+    ~Handle() {
+        if(handle) Destroy(handle);
+    }
+    };
+
+    std::mutex mutex;
+
+    // Handles are lazily created as different threads request them,
+    // but are never destroyed until the end of the process.
+    // The maximum number of handles this process will create for each device is equal
+    // to the high-water mark of the number of concurrently active threads that request
+    // handles for that device.
+    // When threads terminate, they release their handles back into the pool for reuse.
+    // Otherwise, new handles would be created every time new threads were spawned,
+    // resulting in poor performance for Python modules that repeatedly or frequently
+    // spawned new sets of threads (like DataParallel, which creates a new set of threads
+    // for each forward pass).
+    //
+    // To prevent potential deadlocks, we explicitly choose not to cap the number
+    // of handles that are created per device.
+    // Example of danger: If we cap the max handles at 4, and 5 threads are sharing a device,
+    // only 4 can make forward progress at any time. The 4 threads holding handles will not
+    // release them until they exit, so the fifth cannot make progress until then.  This is
+    // not a problem...UNLESS all 5 threads attempt some sort of synchronization at an
+    // intermediate point (ie, before any of them have exited).  We have no way to anticipate
+    // or enforce that user threads will not attempt such intermediate synchronization.
+    // The only way to ensure safety is to avoid imposing a cap on the number of handles.
+    std::unordered_map<int, std::vector<Handle>> created_handles;
+    std::unordered_map<int, std::vector<Handle_t>> available_handles;
+
+    // PoolWindow lazily creates and caches the handles that a particular thread is using,
+    // so in the common case handle access doesn't incur either handle creation or a mutex lock.
+    class PoolWindow
+    {
+    public:
+    PoolWindow(std::shared_ptr<DeviceThreadHandlePool> parent): weak_parent(std::move(parent)) {}
+    ~PoolWindow(){ release(); }
+
+    Handle_t reserve(int device)
+    {
+        // If this thread already has a handle for this device, return it
+        if(my_handles.find(device) != my_handles.end())
+        return my_handles[device];
+
+        // otherwise, either grab a handle from the pool if one is available,
+        // or if not, create a new one.
+        auto parent = weak_parent.lock();
+        TORCH_CHECK(parent, "Cannot create handle during program termination");
+        std::lock_guard<std::mutex> guard(parent->mutex);
+
+        if(parent->available_handles[device].size() > 0)
+        {
+        my_handles[device] = parent->available_handles[device].back();
+        parent->available_handles[device].pop_back();
+        }
+        else
+        {
+        // In local testing, I do observe that emplace_back sometimes routes through temporaries
+        // that incur move-constructor and destructor calls.  See comments in Handle above.
+        parent->created_handles[device].emplace_back(true /*create*/);
+        my_handles[device] = parent->created_handles[device].back().handle;
+        }
+
+        return my_handles[device];
+    }
+
+    private:
+    // Stores the per-device handles currently owned by this thread
+    std::unordered_map<int, Handle_t> my_handles;
+
+    std::weak_ptr<DeviceThreadHandlePool> weak_parent;
+
+    // Called by the destructor.  Releases this thread's handles back into the pool.
+    void release() {
+        if(my_handles.size() > 0) {
+            auto parent = weak_parent.lock();
+            if (!parent) {
+                // If this thread exits after atexit handlers have completed, the
+                // cuda context itself may be invalid, so we must leak the handles.
+                return;
+            }
+
+            std::lock_guard<std::mutex> guard(parent->mutex);
+            for(auto d_h : my_handles)
+                parent->available_handles[d_h.first].push_back(d_h.second);
+        }
+    }
+    };
+
+    // Warning:
+    // If you want to change this function, be aware that this function will be called
+    // by multiple threads and there is no mutex guarding the call of this function, so
+    // make sure your implementation is thread-safe.
+    PoolWindow *newPoolWindow() {
+        // The returned pointer will be owned by a thread local variable
+        // so that different threads do not share the same PoolWindow.
+        return new PoolWindow(this->shared_from_this());
+    }
+};
+
+}}  // namespace at::cuda::detail::
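
// A minimal sketch of the client pattern this pool is written for (mirroring how the
// cuBLAS/cuDNN handle pools use it): one process-wide pool behind a shared_ptr and a
// thread_local PoolWindow per thread, so each thread reuses one handle per device.
// DummyHandle and its create/destroy functions are hypothetical stand-ins for a real
// library handle such as cublasHandle_t; assumes this header and c10 are on the path.
#include <ATen/cuda/detail/DeviceThreadHandles.h>
#include <memory>

namespace at::cuda {
namespace {

struct DummyHandleImpl {};
using DummyHandle = DummyHandleImpl*;
void createDummy(DummyHandle* h) { *h = new DummyHandleImpl(); }
void destroyDummy(DummyHandle h) { delete h; }

using DummyPool = DeviceThreadHandlePool<DummyHandle, createDummy, destroyDummy>;

DummyHandle getCurrentDummyHandle(int device) {
  // One pool for the whole process; each thread lazily creates its own window.
  static auto pool = std::make_shared<DummyPool>();
  thread_local std::unique_ptr<DummyPool::PoolWindow> window(pool->newPoolWindow());
  return window->reserve(device);  // cached per (thread, device) after the first call
}

}  // namespace
}  // namespace at::cuda
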
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/detail/IndexUtils.cuh b/MLPY/Lib/site-packages/torch/include/ATen/cuda/detail/IndexUtils.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..a1994fee2ae3f0a0f984f4e4ec60597c1af302ea
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/detail/IndexUtils.cuh
@@ -0,0 +1,36 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+namespace at::cuda::detail {
+
+TORCH_CUDA_CU_API bool maybeOverlappingIndices(const at::TensorBase &t);
+using at::native::canUse32BitIndexMath;
+
+template <typename scalar, typename IndexType>
+TensorInfo<scalar, IndexType>
+getTensorInfo(const at::TensorBase &t) {
+  IndexType sz[MAX_TENSORINFO_DIMS];
+  IndexType st[MAX_TENSORINFO_DIMS];
+
+  int dims = t.dim();
+  for (int i = 0; i < dims; ++i) {
+    sz[i] = t.size(i);
+    st[i] = t.stride(i);
+  }
+
+  scalar* data_ptr = nullptr;
+
+  if constexpr (std::is_const<scalar>::value) {
+    data_ptr = t.const_data_ptr();
+  } else {
+    data_ptr = t.mutable_data_ptr();
+  }
+
+  return TensorInfo<scalar, IndexType>(
+    data_ptr, dims, sz, st);
+}
+
+} // namespace at::cuda::detail
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/detail/IntegerDivider.cuh b/MLPY/Lib/site-packages/torch/include/ATen/cuda/detail/IntegerDivider.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..7532aed5fee08a22c88135169634d206ab3c8982
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/detail/IntegerDivider.cuh
@@ -0,0 +1,124 @@
+#pragma once
+
+#include 
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
+#include 
+#endif
+
+namespace at::cuda::detail {
+
+// A utility class to implement integer division by multiplication, given a fixed
+// divisor.
+//
+// WARNING: The fast divider algorithm is only implemented for unsigned int;
+//          otherwise we default to plain integer division.  For unsigned int,
+//          we further assume that the dividend is at most INT32_MAX.  Thus,
+//          IntDivider must NOT be used for general integer division.
+//
+//          This reduced range is enough for our purpose, and it allows us to
+//          slightly simplify the computation.
+//
+// (NOTE: Below, "2^k" denotes exponentiation, i.e., 1< 0), we can find a "magic number" m (2^N
+// <= m < 2^(N+1)) and shift s such that:
+//
+//    \floor(n / d) = \floor((m * n) / 2^(N+s)).
+//
+// Given such m and s, the integer division can be then implemented as:
+//
+//    let m' = m - 2^N  // 0 <= m' < 2^N
+//
+//    fast_integer_division(n):
+//      // Multiply two N-bit unsigned integers: the result is a 2N-bit unsigned
+//      // integer.  Then take the higher N bits.
+//      t = (m' * n) >> N
+//
+//      // Here we use the fact that n is less than 2^(N-1): otherwise the value
+//      // of (t + n) may not fit in an N-bit integer.
+//      return (t + n) >> s
+//
+// Finding such a magic number is surprisingly easy:
+//
+//    s  = \ceil(\log_2 d)
+//    m' = \floor(2^N * (2^s - d) / d) + 1  // Need 2N-bit integer arithmetic.
+//
+// See also:
+//    - Division by Invariant Integers Using Multiplication,
+//      Torbjörn Granlund and Peter L. Montgomery, 1994.
+//
+//    - http://www.hackersdelight.org/magic.htm
+//
+//    - http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html
+
+// Result of div/mod operation stored together.
+template <typename Value>
+struct DivMod {
+  Value div, mod;
+
+  C10_HOST_DEVICE DivMod(Value div, Value mod) : div(div), mod(mod) { }
+};
+
+// Base case: we only have an implementation for uint32_t for now.  For
+// everything else, we use plain division.
+template <typename Value>
+struct IntDivider {
+  IntDivider() = default;
+  IntDivider(Value d) : divisor(d) { }
+
+  C10_HOST_DEVICE inline Value div(Value n) const { return n / divisor; }
+  C10_HOST_DEVICE inline Value mod(Value n) const { return n % divisor; }
+  C10_HOST_DEVICE inline DivMod<Value> divmod(Value n) const {
+    return DivMod<Value>(n / divisor, n % divisor);
+  }
+
+  Value divisor;
+};
+
+// Implement fast integer division.
+template <>
+struct IntDivider<unsigned int> {
+  static_assert(sizeof(unsigned int) == 4, "Assumes 32-bit unsigned int.");
+
+  IntDivider() = default;
+
+  IntDivider(unsigned int d) : divisor(d) {
+    assert(divisor >= 1 && divisor <= INT32_MAX);
+
+    // TODO: gcc/clang has __builtin_clz() but it's not portable.
+    for (shift = 0; shift < 32; shift++) if ((1U << shift) >= divisor) break;
+
+    uint64_t one = 1;
+    uint64_t magic = ((one << 32) * ((one << shift) - divisor)) / divisor + 1;
+    m1 = magic;
+    assert(m1 > 0 && m1 == magic);  // m1 must fit in 32 bits.
+  }
+
+  C10_HOST_DEVICE inline unsigned int div(unsigned int n) const {
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
+    // 't' is the higher 32-bits of unsigned 32-bit multiplication of 'n' and
+    // 'm1'.
+    unsigned int t = __umulhi(n, m1);
+    return (t + n) >> shift;
+#else
+    // Using uint64_t so that the addition does not overflow.
+    uint64_t t = ((uint64_t) n * m1) >> 32;
+    return (t + n) >> shift;
+#endif
+  }
+
+  C10_HOST_DEVICE inline unsigned int mod(unsigned int n) const {
+    return n - div(n) * divisor;
+  }
+
+  C10_HOST_DEVICE inline DivMod<unsigned int> divmod(unsigned int n) const {
+    unsigned int q = div(n);
+    return DivMod<unsigned int>(q, n - q * divisor);
+  }
+
+  unsigned int divisor;  // d above.
+  unsigned int m1;  // Magic number: m' above.
+  unsigned int shift;  // Shift amount.
+};
+
+}  // namespace at::cuda::detail
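
// A small host-only check of the magic-number construction documented above, using
// the same formulas (s = ceil(log2 d), m' = floor(2^32 * (2^s - d) / d) + 1) and the
// 64-bit fallback path of div(). Purely illustrative; divisors and dividends below
// are arbitrary, with dividends kept <= INT32_MAX as the class requires.
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t d : {1u, 2u, 3u, 7u, 10u, 255u, 1000000u}) {
    unsigned int shift = 0;
    while ((1ULL << shift) < d) ++shift;             // s = ceil(log2 d)
    uint64_t one = 1;
    uint64_t magic = ((one << 32) * ((one << shift) - d)) / d + 1;
    uint32_t m1 = static_cast<uint32_t>(magic);      // m' must fit in 32 bits

    for (uint32_t n : {0u, 1u, d - 1, d, d + 1, 123456u, 0x7fffffffu}) {
      uint64_t t = (static_cast<uint64_t>(n) * m1) >> 32;
      uint32_t q = static_cast<uint32_t>((t + n) >> shift);
      assert(q == n / d);                            // agrees with plain division
    }
  }
  return 0;
}
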
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/detail/KernelUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/detail/KernelUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..ab85887311ebb1215e9ad670209608364cd77aee
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/detail/KernelUtils.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include 
+#include 
+
+namespace at::cuda::detail {
+
+// CUDA: grid stride looping
+//
+// int64_t _i_n_d_e_x specifically prevents overflow in the loop increment.
+// If input.numel() < INT_MAX, _i_n_d_e_x < INT_MAX, except after the final
+// iteration of the loop where _i_n_d_e_x += blockDim.x * gridDim.x can be
+// greater than INT_MAX.  But in that case _i_n_d_e_x >= n, so there are no
+// further iterations and the overflowed value in i=_i_n_d_e_x is not used.
+#define CUDA_KERNEL_LOOP_TYPE(i, n, index_type)                         \
+  int64_t _i_n_d_e_x = blockIdx.x * blockDim.x + threadIdx.x;           \
+  for (index_type i=_i_n_d_e_x; _i_n_d_e_x < (n); _i_n_d_e_x+=blockDim.x * gridDim.x, i=_i_n_d_e_x)
+
+#define CUDA_KERNEL_LOOP(i, n) CUDA_KERNEL_LOOP_TYPE(i, n, int)
+
+
+// Use 1024 threads per block, which requires cuda sm_2x or above
+constexpr int CUDA_NUM_THREADS = 1024;
+
+// CUDA: number of blocks for threads.
+inline int GET_BLOCKS(const int64_t N, const int64_t max_threads_per_block=CUDA_NUM_THREADS) {
+  TORCH_INTERNAL_ASSERT(N > 0, "CUDA kernel launch blocks must be positive, but got N=", N);
+  constexpr int64_t max_int = std::numeric_limits<int>::max();
+
+  // Round up division for positive number that cannot cause integer overflow
+  auto block_num = (N - 1) / max_threads_per_block + 1;
+  TORCH_INTERNAL_ASSERT(block_num <= max_int, "Can't schedule too many blocks on CUDA device");
+
+  return static_cast<int>(block_num);
+}
+
+}  // namespace at::cuda::detail
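
// A host-side sketch of how GET_BLOCKS and CUDA_KERNEL_LOOP fit together: GET_BLOCKS
// is a round-up division of N by the block size, and the grid-stride loop lets that
// fixed grid cover all N elements by stepping each thread forward by
// blockDim.x * gridDim.x. The launch is only emulated with nested loops here; the
// sizes are arbitrary examples.
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const int64_t N = 10000;
  const int64_t threads = 1024;                  // CUDA_NUM_THREADS
  const int64_t blocks = (N - 1) / threads + 1;  // GET_BLOCKS(N)

  std::vector<int> hits(N, 0);
  for (int64_t block = 0; block < blocks; ++block) {
    for (int64_t thread = 0; thread < threads; ++thread) {
      // Body of CUDA_KERNEL_LOOP(i, N) for one (blockIdx.x, threadIdx.x) pair.
      for (int64_t i = block * threads + thread; i < N; i += blocks * threads) {
        hits[i] += 1;
      }
    }
  }
  for (int64_t i = 0; i < N; ++i) assert(hits[i] == 1);  // every element visited once
  return 0;
}
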
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/detail/LazyNVRTC.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/detail/LazyNVRTC.h
new file mode 100644
index 0000000000000000000000000000000000000000..23821c88e964ea499df1479a0c369228ba854738
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/detail/LazyNVRTC.h
@@ -0,0 +1,11 @@
+#pragma once
+#include 
+namespace at::cuda {
+// Forward-declares at::cuda::NVRTC
+struct NVRTC;
+
+namespace detail {
+extern NVRTC lazyNVRTC;
+} // namespace detail
+
+}  // namespace at::cuda
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/detail/OffsetCalculator.cuh b/MLPY/Lib/site-packages/torch/include/ATen/cuda/detail/OffsetCalculator.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..4b11f1fa64be6651e4d208618d3b4a40a1a1c2fb
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/detail/OffsetCalculator.cuh
@@ -0,0 +1,119 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+// OffsetCalculator computes per-operand offsets for operands that share the same
+// shape but may have different strides; it iterates the tensor in column-major order.
+// If element_sizes is nullptr, the strides are in bytes, otherwise they are in
+// # of elements.
+
+#if defined(USE_ROCM)
+constexpr int MAX_DIMS = 16;
+#else
+constexpr int MAX_DIMS = 25;
+#endif
+
+template 
+struct OffsetCalculator {
+  // We allow having negative strides to implement some operations like torch.flip
+  using stride_t = std::conditional_t,
+                                      index_t>;
+  // The offset for each argument. Wrapper around fixed-size array.
+  // On CUDA, zero sized array is not allowed, so when we are handling nullary
+  // operators, we need to create a size 1 offset to avoid compiler failure.
+  // This size 1 offset is just a placeholder, and we will not use it.
+  using offset_type = at::detail::Array(NARGS, 1)>;
+
+  // if element_sizes is nullptr, then the strides will be in bytes, otherwise
+  // the strides will be in # of elements.
+  OffsetCalculator(int dims, const int64_t* sizes, const int64_t* const* strides, const int64_t* element_sizes=nullptr) : dims(dims) {
+    TORCH_CHECK(dims <= MAX_DIMS, "tensor has too many (>", MAX_DIMS, ") dims");
+    for (int i=0; i < dims; i++){
+      sizes_[i] = at::cuda::detail::IntDivider(sizes[i]);
+      for (int arg = 0; arg < NARGS; arg++) {
+        int64_t element_size = (element_sizes == nullptr ? 1LL : element_sizes[arg]);
+        strides_[i][arg] = strides[arg][i] / element_size;
+      }
+    }
+  }
+
+  C10_HOST_DEVICE offset_type get(index_t linear_idx) const {
+    offset_type offsets;
+    #pragma unroll
+    for (int arg = 0; arg < NARGS; arg++) {
+      offsets[arg] = 0;
+    }
+
+    #pragma unroll
+    for (int dim = 0; dim < MAX_DIMS; ++dim) {
+      if (dim == dims) {
+        break;
+      }
+      auto divmod = sizes_[dim].divmod(linear_idx);
+      linear_idx = divmod.div;
+
+      #pragma unroll
+      for (int arg = 0; arg < NARGS; arg++) {
+        offsets[arg] += divmod.mod * strides_[dim][arg];
+      }
+
+    }
+    return offsets;
+  }
+
+  int dims;
+  at::cuda::detail::IntDivider sizes_[MAX_DIMS];
+  stride_t strides_[MAX_DIMS][std::max(NARGS, 1)];
+};
+
+template 
+struct TrivialOffsetCalculator {
+  // The offset for each argument. Wrapper around fixed-size array.
+  // The offsets are in # of elements, not in bytes.
+  // On CUDA, zero sized array is not allowed, so when we are handling nullary
+  // operators, we need to create a size 1 offset to avoid compiler failure.
+  // This size 1 offset is just a placeholder, and we will not use it.
+  using offset_type = at::detail::Array(NARGS, 1)>;
+
+  C10_HOST_DEVICE offset_type get(index_t linear_idx) const {
+    offset_type offsets;
+    #pragma unroll
+    for (int arg = 0; arg < NARGS; arg++) {
+      offsets[arg] = linear_idx;
+    }
+    return offsets;
+  }
+};
+
+// Make an OffsetCalculator with byte offsets
+template
+static OffsetCalculator make_offset_calculator(const at::TensorIteratorBase& iter) {
+  TORCH_INTERNAL_ASSERT(N <= iter.ntensors());
+  std::array strides;
+  for (int i = 0; i < N; i++) {
+    strides[i] = iter.strides(i).data();
+  }
+  return OffsetCalculator(iter.ndim(), iter.shape().data(), strides.data());
+}
+
+// Make an OffsetCalculator with element offsets
+template
+static OffsetCalculator make_element_offset_calculator(
+    const at::TensorIteratorBase& iter) {
+  TORCH_INTERNAL_ASSERT(N <= iter.ntensors());
+  std::array strides;
+  std::array element_sizes;
+  for (int i = 0; i < N; i++) {
+    strides[i] = iter.strides(i).data();
+    element_sizes[i] = iter.element_size(i);
+  }
+  return OffsetCalculator(
+      iter.ndim(), iter.shape().data(), strides.data(), element_sizes.data());
+}
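
// A host-side sketch of the offset computation OffsetCalculator::get performs: the
// linear index is peeled one dimension at a time with div/mod (column-major, i.e.
// innermost dimension first) and each remainder is scaled by that dimension's
// stride. Plain % and / stand in for the IntDivider fast path, and strides here are
// in elements; the shapes below are arbitrary examples.
#include <cassert>
#include <cstdint>
#include <vector>

int64_t offset_of(int64_t linear_idx,
                  const std::vector<int64_t>& sizes,
                  const std::vector<int64_t>& strides) {
  int64_t offset = 0;
  for (size_t dim = 0; dim < sizes.size(); ++dim) {
    int64_t mod = linear_idx % sizes[dim];
    linear_idx /= sizes[dim];
    offset += mod * strides[dim];
  }
  return offset;
}

int main() {
  // A contiguous 2x3 row-major operand as TensorIterator reports it, innermost first:
  // sizes {3, 2}, element strides {1, 3}; a contiguous layout maps linearly.
  std::vector<int64_t> sizes{3, 2}, strides{1, 3};
  for (int64_t i = 0; i < 6; ++i) {
    assert(offset_of(i, sizes, strides) == i);
  }
  return 0;
}
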
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/detail/PhiloxCudaStateRaw.cuh b/MLPY/Lib/site-packages/torch/include/ATen/cuda/detail/PhiloxCudaStateRaw.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..94a69cacc552aaaccbd879497a47e1c8c7cf65c8
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/detail/PhiloxCudaStateRaw.cuh
@@ -0,0 +1,43 @@
+// No "#pragma once" because this is a raw definition that can be copied by jit codegen.
+// Eager mode clients should not include this file directly, instead,
+// they should #include , which has a #pragma once.
+
+// Stores RNG state values. Passed as a kernel argument.
+// See Note [CUDA Graph-safe RNG states].
+//
+// The raw definition lives in its own file so jit codegen can easily copy it.
+namespace at {
+
+struct PhiloxCudaState {
+  PhiloxCudaState() = default;
+  // Called if graph capture is not underway
+  PhiloxCudaState(uint64_t seed,
+                  uint64_t offset) {
+    seed_.val = seed;
+    offset_.val = offset;
+  }
+  // Called if graph capture is underway
+  PhiloxCudaState(int64_t* seed,
+                  int64_t* offset_extragraph,
+                  uint32_t offset_intragraph) {
+    seed_.ptr = seed;
+    offset_.ptr = offset_extragraph;
+    offset_intragraph_ = offset_intragraph;
+    captured_ = true;
+  }
+
+  // Public members, directly accessible by at::cuda::philox::unpack.
+  // If we made them private with getters/setters, the getters/setters
+  // would have to be __device__, and we can't declare __device__ in ATen.
+  union Payload {
+    uint64_t val;
+    int64_t* ptr;
+  };
+
+  Payload seed_;
+  Payload offset_;
+  uint32_t offset_intragraph_ = 0;
+  bool captured_ = false;
+};
+
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/detail/TensorInfo.cuh b/MLPY/Lib/site-packages/torch/include/ATen/cuda/detail/TensorInfo.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..dec8f789c7358c3f487c1104007a0e7318829a0c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/detail/TensorInfo.cuh
@@ -0,0 +1,116 @@
+#pragma once
+
+#include 
+
+namespace at::cuda::detail {
+
+#define MAX_TENSORINFO_DIMS 25
+
+// CUDA kernel argument that defines tensor layout
+template <typename T, typename IndexType>
+struct TensorInfo {
+  TensorInfo();
+  TensorInfo(T* p,
+             int dim,
+             IndexType sz[MAX_TENSORINFO_DIMS],
+             IndexType st[MAX_TENSORINFO_DIMS]);
+
+  // Set the size of the given dimension to 1, as if it were a
+  // reduction dim (allows you to calculate offsets of the reduction
+  // slice)
+  void reduceDim(int dim);
+
+  // See note on [collapse dims].
+  int collapseDims(const int excludeDim = -1);
+
+  // Contiguous tensors of more than one dimension are collapsed down
+  // to one tensor
+  __host__ __device__ inline bool isContiguous() const {
+    return (dims == 1 && strides[0] == 1);
+  }
+
+  T* data;
+  IndexType sizes[MAX_TENSORINFO_DIMS];
+  IndexType strides[MAX_TENSORINFO_DIMS];
+  int dims;
+};
+
+template <typename T, typename IndexType>
+TensorInfo<T, IndexType>::TensorInfo() {
+  data = nullptr;
+  dims = 0;
+}
+
+template <typename T, typename IndexType>
+TensorInfo<T, IndexType>::TensorInfo(T* p,
+                                     int dim,
+                                     IndexType sz[MAX_TENSORINFO_DIMS],
+                                     IndexType st[MAX_TENSORINFO_DIMS]) {
+  data = p;
+  dims = dim;
+  TORCH_CHECK(dims < MAX_TENSORINFO_DIMS, "CUDA Tensors cannot have more than 25 dimensions");
+
+  for (int i = 0; i < dim; ++i) {
+    sizes[i] = sz[i];
+    strides[i] = st[i];
+  }
+}
+
+template <typename T, typename IndexType>
+void
+TensorInfo<T, IndexType>::reduceDim(int dim) {
+  TORCH_CHECK(dim < dims && dim >= 0, "expected dim between 0 and dims - 1");
+  sizes[dim] = 1;
+}
+
+template <typename T, typename IndexType>
+int
+TensorInfo<T, IndexType>::collapseDims(const int excludeDim) {
+  auto result = at::collapse_dims(sizes, strides, dims, excludeDim);
+  dims = std::get<1>(result);
+  return std::get<0>(result);
+}
+
+// Translate a linear index for the apply to a T* offset;
+// specialized on `Dims` to reduce nvcc compilation time
+template <typename T, typename IndexType, int Dims>
+struct IndexToOffset {
+  static __host__ __device__ IndexType get(
+    IndexType linearId,
+    const TensorInfo<T, IndexType>& info) {
+
+    IndexType offset = 0;
+
+    // Uses static dims
+    for (int i = Dims - 1; i > 0; --i) {
+      IndexType curDimIndex = linearId % info.sizes[i];
+      IndexType curDimOffset = curDimIndex * info.strides[i];
+      offset += curDimOffset;
+      linearId /= info.sizes[i];
+    }
+
+    return offset + linearId * info.strides[0];
+  }
+};
+
+// Uses dynamic (runtime) instead of static (compiletime) dims
+template <typename T, typename IndexType>
+struct IndexToOffset<T, IndexType, -1> {
+  static inline __host__ __device__ IndexType get(
+    IndexType linearId,
+    const TensorInfo<T, IndexType>& info) {
+
+      IndexType offset = 0;
+
+      for (int i = info.dims - 1; i > 0; --i) {
+        IndexType curDimIndex = linearId % info.sizes[i];
+        IndexType curDimOffset = curDimIndex * info.strides[i];
+        offset += curDimOffset;
+        linearId /= info.sizes[i];
+      }
+
+      return offset + linearId * info.strides[0];
+  }
+};
+
+} // namespace at::cuda::detail
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/detail/UnpackRaw.cuh b/MLPY/Lib/site-packages/torch/include/ATen/cuda/detail/UnpackRaw.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..274c9add050fe3a33794e39d79b661f7e2fcf8cf
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/detail/UnpackRaw.cuh
@@ -0,0 +1,28 @@
+// No "#pragma once" because this is a raw definition that can be copied by jit codegen.
+// Eager mode clients should not include this file directly, instead,
+// they should #include , which has a #pragma once.
+
+namespace at::cuda::philox {
+
+// In-kernel call to retrieve philox seed and offset from a PhiloxCudaState instance whether
+// that instance was created with graph capture underway or not.
+// See Note [CUDA Graph-safe RNG states].
+//
+// We can't write a __device__ function in CUDAGeneratorImpl.h, because it's in ATen.
+// Also, whatever call unpacks PhiloxCudaState in consumer kernels must be inlineable.
+// Easiest thing that comes to mind is, define a __device__ unpack helper here, in ATen/cuda.
+//
+// The raw definition lives in its own file so jit codegen can easily copy it.
+__host__ __device__ __forceinline__ std::tuple<uint64_t, uint64_t>
+unpack(at::PhiloxCudaState arg) {
+  if (arg.captured_) {
+    // static_cast avoids "warning: invalid narrowing conversion from "long" to "unsigned long".
+    // *(arg.offset_.ptr) is a broadcast load of a single int64_t to the entire kernel.
+    // For most threads' reads it will hit in cache, so it shouldn't hurt performance.
+    return std::make_tuple(static_cast<uint64_t>(*arg.seed_.ptr), static_cast<uint64_t>(*(arg.offset_.ptr) + arg.offset_intragraph_));
+  } else {
+    return std::make_tuple(arg.seed_.val, arg.offset_.val);
+  }
+}
+
+} // namespace at::cuda::philox
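
// A host-only sketch of the two PhiloxCudaState flavors and what unpack() returns for
// each: plain seed/offset values outside graph capture, and pointer-based state plus
// an intragraph offset during capture. The unpack logic is re-implemented inline here
// (the real helper is __device__-qualified); FakePhiloxState mirrors the payload
// layout and is purely illustrative.
#include <cassert>
#include <cstdint>
#include <tuple>

struct FakePhiloxState {
  union Payload { uint64_t val; int64_t* ptr; };
  Payload seed_{}, offset_{};
  uint32_t offset_intragraph_ = 0;
  bool captured_ = false;
};

std::tuple<uint64_t, uint64_t> unpack_like(const FakePhiloxState& arg) {
  if (arg.captured_) {
    return {static_cast<uint64_t>(*arg.seed_.ptr),
            static_cast<uint64_t>(*arg.offset_.ptr + arg.offset_intragraph_)};
  }
  return {arg.seed_.val, arg.offset_.val};
}

int main() {
  FakePhiloxState eager;                 // non-captured: values stored directly
  eager.seed_.val = 42; eager.offset_.val = 100;
  assert(unpack_like(eager) == std::make_tuple(uint64_t{42}, uint64_t{100}));

  int64_t seed = 42, offset_extragraph = 100;
  FakePhiloxState captured;              // captured: values read through pointers at replay
  captured.seed_.ptr = &seed;
  captured.offset_.ptr = &offset_extragraph;
  captured.offset_intragraph_ = 8;
  captured.captured_ = true;
  assert(unpack_like(captured) == std::make_tuple(uint64_t{42}, uint64_t{108}));
  return 0;
}
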
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/jiterator.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/jiterator.h
new file mode 100644
index 0000000000000000000000000000000000000000..5e67b0f83c5d8a52cb1534bdbc7879138b53bdf9
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/jiterator.h
@@ -0,0 +1,40 @@
+#pragma once
+#include 
+
+#if AT_USE_JITERATOR()
+
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+namespace at::cuda {
+
+TORCH_CUDA_CPP_API c10::SmallVector CompileAndLaunchKernel(
+  const std::string& code_string,
+  const std::string& kernel_name,
+  const int num_outputs,
+  const c10::SmallVector& tensors,
+  const c10::SmallVector& extra_args,
+  bool return_by_ref);
+
+} // namespace at::cuda
+
+#else
+
+namespace at::cuda {
+
+TORCH_CUDA_CPP_API c10::SmallVector CompileAndLaunchKernel(
+  const std::string& code_string,
+  const std::string& kernel_name,
+  const int num_outputs,
+  const c10::SmallVector& tensors,
+  const c10::SmallVector& extra_args,
+  bool return_by_ref) {
+    TORCH_CHECK(false, "Jiterator is not supported");
+  }
+} // namespace at::cuda
+
+#endif // AT_USE_JITERATOR()
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/jiterator_impl.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/jiterator_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..db8334c9ba510c2488a1e3e6d26d1f4b357cc9e4
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/jiterator_impl.h
@@ -0,0 +1,249 @@
+#pragma once
+#include 
+
+#if AT_USE_JITERATOR()
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+
+namespace at::native {
+
+
+#define AT_FOR_8_CASES(_)  \
+  _(1)                      \
+  _(2)                      \
+  _(3)                      \
+  _(4)                      \
+  _(5)                      \
+  _(6)                      \
+  _(7)                      \
+  _(8)
+
+#define AT_FOR_8_CASES_WITH_COMMA(_)  \
+  _(1)     ,                           \
+  _(2)     ,                           \
+  _(3)     ,                           \
+  _(4)     ,                           \
+  _(5)     ,                           \
+  _(6)     ,                           \
+  _(7)     ,                           \
+  _(8)
+
+c10::SmallVector get_extra_args_typenames(const c10::SmallVector& extra_args) {
+  c10::SmallVector args_typenames(extra_args.size());
+  for (const auto i : c10::irange(extra_args.size())) {
+    args_typenames[i] = at::cuda::jit::typeName(extra_args[i].type());
+  }
+  return args_typenames;
+}
+
+int can_vectorize_up_to(at::ScalarType type, char* pointer) {
+  switch(type) {
+#define DEFINE_CASE(ctype, scalartype)                                   \
+    case ScalarType::scalartype : return memory::can_vectorize_up_to(pointer);
+
+    AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CASE)
+#undef DEFINE_CASE
+
+    default: TORCH_INTERNAL_ASSERT(false, "Unrecognized ScalarType: ", type);
+  }
+}
+
+// jitted version of the above
+// See Note [Jiterator], this relies on the assumptions enumerated there
+int jitted_can_vectorize_up_to(const TensorIteratorBase& iter) {
+  const at::ScalarType common_dtype = iter.common_dtype();
+  const at::ScalarType result_dtype = common_dtype;
+
+  // Deals with output
+  int result = can_vectorize_up_to(result_dtype, static_cast(iter.data_ptr(0)));
+
+  // Incorporates input(s)
+  for (auto i = 1; i < iter.ntensors(); ++i) {
+    result = std::min(result, can_vectorize_up_to(common_dtype, static_cast(iter.data_ptr(i))));
+  }
+
+  return result;
+}
+
+template
+static std::unique_ptr> make_unique_offset_calculator(
+          const TensorIteratorBase& iter) {
+  // array size can not be 0, this happens when N == 0
+  constexpr int array_size = std::max(N, 1);
+  TORCH_INTERNAL_ASSERT(N == (IS_INPUT ? iter.ninputs() : iter.noutputs()));
+
+  std::array strides;
+  int64_t element_sizes[array_size];
+  for (int i = 0; i < N; i++) {
+    int index = IS_INPUT ? i + iter.noutputs() : i;
+    strides[i] = iter.strides(index).data();
+    element_sizes[i] = iter.element_size(index);
+  }
+  return std::make_unique>(iter.ndim(), iter.shape().data(), strides.data(), element_sizes);
+}
+
+template 
+struct OffsetCalculatorVariant {
+#define DEFINE_CASE(index) std::unique_ptr>
+  using OffsetCalculatorTypes = std::variant<
+    AT_FOR_8_CASES_WITH_COMMA(DEFINE_CASE)
+  >;
+#undef DEFINE_CASE
+
+  OffsetCalculatorVariant(const TensorIteratorBase& iter) {
+    int num = IS_INPUT ? iter.ninputs() : iter.noutputs();
+
+    switch(num) {
+#define DEFINE_CASE(index)        \
+      case index : v = make_unique_offset_calculator(iter); break;
+
+      AT_FOR_8_CASES(DEFINE_CASE)
+#undef DEFINE_CASE
+      default:
+        TORCH_CHECK(false, "OffsetCalculatorVariant is not implemented for num_tensor = ", num);
+    }
+  }
+
+  void* data_ptr() {
+    return std::visit([](auto & v){ return static_cast(v.get()); }, v);
+  }
+
+ private:
+  OffsetCalculatorTypes v;
+};
+
+struct ArrayVariant {
+// works for up to 8 input + 8 outputs
+#define DEFINE_CASE(index) at::detail::Array, at::detail::Array
+  using ArrayTypes = std::variant<
+    AT_FOR_8_CASES_WITH_COMMA(DEFINE_CASE)
+  >;
+#undef DEFINE_CASE
+
+  ArrayVariant(const TensorIteratorBase& iter) {
+    int ntensors = iter.ntensors();
+    switch(ntensors) {
+#define DEFINE_CASE(index)                                            \
+      case index: array = at::detail::Array{}; break;   \
+      case index+8: array = at::detail::Array{}; break;
+
+      AT_FOR_8_CASES(DEFINE_CASE)
+#undef DEFINE_CASE
+
+      default:
+        TORCH_CHECK(false, "ArrayVariant is not implemented for ntensors = ", ntensors);
+    }
+
+    std::visit([&](auto& a) {
+      for (auto i = 0; i < ntensors; ++i) {
+        a[i] = (char*)iter.data_ptr(i);
+      }
+    }, array);
+  }
+
+  void* data_ptr() {
+    return std::visit([](auto & a){ return static_cast(&a); }, array);
+  }
+
+private:
+  ArrayTypes array;
+};
+
+struct TrivialOffsetCalculatorVariant {
+#define DEFINE_CASE(index) TrivialOffsetCalculator
+  using TrivialOffsetCalculatorTypes = std::variant<
+    AT_FOR_8_CASES_WITH_COMMA(DEFINE_CASE)
+  >;
+#undef DEFINE_CASE
+
+  TrivialOffsetCalculatorVariant(int num) {
+    switch(num) {
+#define DEFINE_CASE(index)      \
+      case index: v = TrivialOffsetCalculator(); break;
+
+      AT_FOR_8_CASES(DEFINE_CASE)
+#undef DEFINE_CASE
+
+      default:
+        TORCH_CHECK(false, "TrivialOffsetCalculatorVariant is not implemented for num_tensors = ", num);
+    }
+  }
+
+  void* data_ptr() {
+    return std::visit([](auto & v){ return static_cast(&v); }, v);
+  }
+
+private:
+  TrivialOffsetCalculatorTypes v;
+};
+
+struct LoadWithCastVariant {
+#define DEFINE_CASE(index) std::unique_ptr>
+  using LoadWithCastPtr = std::variant<
+    AT_FOR_8_CASES_WITH_COMMA(DEFINE_CASE)
+  >;
+#undef DEFINE_CASE
+
+  LoadWithCastVariant(const TensorIteratorBase& iter) {
+    int arity = iter.ninputs();
+    switch(arity) {
+#define DEFINE_CASE(index)      \
+      case index: v = std::make_unique>(iter); break;
+
+      AT_FOR_8_CASES(DEFINE_CASE)
+#undef DEFINE_CASE
+
+      default:
+        TORCH_CHECK(false, "LoadWithCastVariant is not implemented for ninputs = ", arity);
+    }
+  }
+
+  void* data_ptr() {
+    return std::visit([](auto & v){ return static_cast(v.get()); }, v);
+  }
+
+private:
+  LoadWithCastPtr v;
+};
+
+struct StoreWithCastVariant {
+#define DEFINE_CASE(index) std::unique_ptr>
+  using StoreWithCastPtr = std::variant<
+    AT_FOR_8_CASES_WITH_COMMA(DEFINE_CASE)
+  >;
+#undef DEFINE_CASE
+
+  StoreWithCastVariant(const TensorIteratorBase& iter) {
+    int num = iter.noutputs();
+    switch(num) {
+#define DEFINE_CASE(index)      \
+      case index: v = std::make_unique>(iter); break;
+
+      AT_FOR_8_CASES(DEFINE_CASE)
+#undef DEFINE_CASE
+
+      default:
+        TORCH_CHECK(false, "StoreWithCastVariant is not implemented for noutputs = ", num);
+    }
+  }
+
+  void* data_ptr() {
+    return std::visit([](auto & v){ return static_cast(v.get()); }, v);
+  }
+
+private:
+  StoreWithCastPtr v;
+};
+
+} // namespace at::native
+
+
+#endif // AT_USE_JITERATOR()
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/llvm_jit_strings.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/llvm_jit_strings.h
new file mode 100644
index 0000000000000000000000000000000000000000..ec2caa7b34b80eec75210988b7d6081e368f65bf
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/llvm_jit_strings.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include 
+#include 
+
+namespace at::cuda {
+
+TORCH_CUDA_CPP_API const std::string &get_traits_string();
+TORCH_CUDA_CPP_API const std::string &get_cmath_string();
+TORCH_CUDA_CPP_API const std::string &get_complex_body_string();
+TORCH_CUDA_CPP_API const std::string &get_complex_half_body_string();
+TORCH_CUDA_CPP_API const std::string &get_complex_math_string();
+
+} // namespace at::cuda
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/tunable/GemmCommon.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/tunable/GemmCommon.h
new file mode 100644
index 0000000000000000000000000000000000000000..592f9fb17cd530cfa195fe073ea3156d67adbffb
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/tunable/GemmCommon.h
@@ -0,0 +1,174 @@
+// Original TunableOp is from onnxruntime.
+// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h
+// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+//
+// Adapting TunableOp into PyTorch
+// Copyright (c) Advanced Micro Devices, Inc.
+//
+#pragma once
+
+#include 
+
+#include 
+#include 
+#include 
+
+namespace at::cuda::tunable {
+
+enum class BlasOp {
+  N = 0,
+  T = 1
+};
+
+inline std::string BlasOpToString(BlasOp op) {
+  switch (op) {
+    case BlasOp::N:
+      return "N";
+    case BlasOp::T:
+      return "T";
+  }
+  TORCH_CHECK(false, "unrecognized BlasOp");
+  return "N";
+}
+
+template <typename T>
+struct GemmParams : OpParams {
+  std::string Signature() const override {
+    return c10::str(transa, transb, "_", m, "_", n, "_", k);
+  }
+
+  GemmParams* DeepCopy() const {
+    GemmParams* copy = new GemmParams;
+    *copy = *this;
+    c10::DeviceIndex device = 0;
+    AT_CUDA_CHECK(c10::cuda::GetDevice(&device));
+    size_t c_size = m * n * sizeof(T);
+    copy->c = static_cast<T*>(c10::cuda::CUDACachingAllocator::raw_alloc(c_size));
+    AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync(
+        copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true));
+    return copy;
+  }
+
+  // only call on object returned by DeepCopy
+  void Delete() {
+    c10::cuda::CUDACachingAllocator::raw_delete(c);
+  }
+
+  TuningStatus NumericalCheck(GemmParams *other) {
+    auto options = at::TensorOptions().dtype(c10::CppTypeToScalarType<T>::value).device(at::kCUDA);
+    // comparison done as 1D tensor
+    at::Tensor ref = at::from_blob(c,        {m*n}, options);
+    at::Tensor oth = at::from_blob(other->c, {m*n}, options);
+    at::Tensor ref_float = ref.to(at::kFloat);
+    at::Tensor oth_float = oth.to(at::kFloat);
+    std::vector<double> atols{1e-1, 1e-2, 1e-3, 1e-4, 1e-5};
+    std::vector<double> rtols{1e-1, 1e-2, 1e-3, 1e-4, 1e-5};
+    double last_succeed_atol = 1;
+    double last_succeed_rtol = 1;
+    for (auto& atol : atols) {
+      for (auto& rtol : rtols) {
+        if (at::allclose(ref_float, oth_float, rtol, atol)) {
+          last_succeed_atol = atol;
+          last_succeed_rtol = rtol;
+        }
+      }
+    }
+    if (last_succeed_atol == 1) {
+      return FAIL;
+    }
+    else {
+      TUNABLE_LOG("├──verify numerics: atol=", last_succeed_atol, ", rtol=", last_succeed_rtol);
+    }
+
+    return OK;
+  }
+
+  char transa;
+  char transb;
+  int64_t m;
+  int64_t n;
+  int64_t k;
+  at::opmath_type<T> alpha;
+  const T* a;
+  int64_t lda;
+  const T* b;
+  int64_t ldb;
+  at::opmath_type<T> beta;
+  T* c;
+  int64_t ldc;
+};
+
+template <typename T>
+struct GemmStridedBatchedParams : OpParams {
+  std::string Signature() const override {
+    return c10::str(transa, transb, "_", m, "_", n, "_", k, "_B_", batch);
+  }
+
+  GemmStridedBatchedParams* DeepCopy() const {
+    GemmStridedBatchedParams* copy = new GemmStridedBatchedParams;
+    *copy = *this;
+    c10::DeviceIndex device = 0;
+    AT_CUDA_CHECK(c10::cuda::GetDevice(&device));
+    size_t c_size = batch * stride_c * sizeof(T);
+    copy->c = static_cast<T*>(c10::cuda::CUDACachingAllocator::raw_alloc(c_size));
+    AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync(
+        copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true));
+    return copy;
+  }
+
+  // only call on object returned by DeepCopy
+  void Delete() {
+    c10::cuda::CUDACachingAllocator::raw_delete(c);
+  }
+
+  TuningStatus NumericalCheck(GemmStridedBatchedParams *other) {
+    auto options = at::TensorOptions().dtype(c10::CppTypeToScalarType<T>::value).device(at::kCUDA);
+    // comparison done as 1D tensor
+    at::Tensor ref = at::from_blob(c,        {batch*stride_c}, options);
+    at::Tensor oth = at::from_blob(other->c, {batch*stride_c}, options);
+    at::Tensor ref_float = ref.to(at::kFloat);
+    at::Tensor oth_float = oth.to(at::kFloat);
+    std::vector<double> atols{1e-1, 1e-2, 1e-3, 1e-4, 1e-5};
+    std::vector<double> rtols{1e-1, 1e-2, 1e-3, 1e-4, 1e-5};
+    double last_succeed_atol = 1;
+    double last_succeed_rtol = 1;
+    for (auto& atol : atols) {
+      for (auto& rtol : rtols) {
+        if (at::allclose(ref_float, oth_float, rtol, atol)) {
+          last_succeed_atol = atol;
+          last_succeed_rtol = rtol;
+        }
+      }
+    }
+    if (last_succeed_atol == 1) {
+      return FAIL;
+    }
+    else {
+      TUNABLE_LOG("├──verify numerics: atol=", last_succeed_atol, ", rtol=", last_succeed_rtol);
+    }
+
+    return OK;
+  }
+
+  char transa;
+  char transb;
+  int64_t m;
+  int64_t n;
+  int64_t k;
+  at::opmath_type<T> alpha;
+  const T* a;
+  int64_t lda;
+  int64_t stride_a;
+  const T* b;
+  int64_t ldb;
+  int64_t stride_b;
+  at::opmath_type<T> beta;
+  T* c;
+  int64_t ldc;
+  int64_t stride_c;
+  int64_t batch;
+};
+
+} // namespace at::cuda::tunable
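
// A small host-side sketch of the tolerance sweep NumericalCheck performs above: a
// candidate result is compared against the reference at a grid of (atol, rtol) pairs
// from loosest to tightest, the last passing pair is logged, and failing even the
// loosest pair rejects the candidate. std::vector<float> stands in for the GPU
// tensors and the allclose formula follows |a - b| <= atol + rtol * |b|.
#include <cmath>
#include <cstdio>
#include <vector>

bool allclose(const std::vector<float>& a, const std::vector<float>& b,
              double rtol, double atol) {
  for (size_t i = 0; i < a.size(); ++i) {
    if (std::fabs(a[i] - b[i]) > atol + rtol * std::fabs(b[i])) return false;
  }
  return true;
}

bool numerical_check(const std::vector<float>& ref, const std::vector<float>& other) {
  std::vector<double> tols{1e-1, 1e-2, 1e-3, 1e-4, 1e-5};
  double ok_atol = 1, ok_rtol = 1;                 // 1 means "never passed"
  for (double atol : tols) {
    for (double rtol : tols) {
      if (allclose(ref, other, rtol, atol)) { ok_atol = atol; ok_rtol = rtol; }
    }
  }
  if (ok_atol == 1) return false;                  // FAIL: not even the loosest pair passed
  std::printf("verify numerics: atol=%g, rtol=%g\n", ok_atol, ok_rtol);
  return true;                                     // OK
}
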
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/tunable/GemmHipblaslt.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/tunable/GemmHipblaslt.h
new file mode 100644
index 0000000000000000000000000000000000000000..91c54b229e5b61a613e711bbf6d2e2fc0d71b2fb
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/tunable/GemmHipblaslt.h
@@ -0,0 +1,379 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+#define TORCH_HIPBLASLT_CHECK(EXPR)               \
+  do {                                            \
+    hipblasStatus_t __err = EXPR;                 \
+    TORCH_CHECK(__err == HIPBLAS_STATUS_SUCCESS,  \
+                "hipblaslt error: ",              \
+                hipblasStatusToString(__err),     \
+                " when calling `" #EXPR "`");     \
+  } while (0)
+
+namespace at::cuda::tunable {
+
+#ifdef HIPBLASLT_HAS_GETINDEXFROMALGO
+#define GETINDEXFROMALGO(algo) hipblaslt_ext::getIndexFromAlgo(algo)
+#else
+static int getIndexFromAlgo(hipblasLtMatmulAlgo_t& algo) {
+    int* algo_ptr = (int*)algo.data;
+    if(*algo_ptr < 0) {
+        return -1;
+    }
+    return *algo_ptr;
+}
+#define GETINDEXFROMALGO(algo) getIndexFromAlgo(algo)
+#endif
+
+#ifdef HIPBLASLT_CUSTOM_COMPUTE_TYPE
+#define COMPUTE_TYPE_32 HIPBLASLT_COMPUTE_F32
+#else
+#define COMPUTE_TYPE_32 HIPBLAS_COMPUTE_32F
+#endif
+
+#ifdef HIPBLASLT_CUSTOM_DATA_TYPE
+
+template 
+constexpr hipblasltDatatype_t HipBlasDataTypeFor();
+
+template <>
+constexpr hipblasltDatatype_t HipBlasDataTypeFor() {
+  return HIPBLASLT_R_32F;
+}
+
+template <>
+constexpr hipblasltDatatype_t HipBlasDataTypeFor() {
+  return HIPBLASLT_R_16F;
+}
+
+template <>
+constexpr hipblasltDatatype_t HipBlasDataTypeFor() {
+  return HIPBLASLT_R_16B;
+}
+
+template <>
+constexpr hipblasltDatatype_t HipBlasDataTypeFor() {
+  return HIPBLASLT_R_64F;
+}
+
+#define DATA_TYPE_R_32 HIPBLASLT_R_32F
+
+#else
+
+template 
+constexpr hipblasDatatype_t HipBlasDataTypeFor();
+
+template <>
+constexpr hipblasDatatype_t HipBlasDataTypeFor() {
+  return HIPBLAS_R_32F;
+}
+
+template <>
+constexpr hipblasDatatype_t HipBlasDataTypeFor() {
+  return HIPBLAS_R_16F;
+}
+
+template <>
+constexpr hipblasDatatype_t HipBlasDataTypeFor() {
+  return HIPBLAS_R_16B;
+}
+
+template <>
+constexpr hipblasDatatype_t HipBlasDataTypeFor() {
+  return HIPBLAS_R_64F;
+}
+
+#ifdef HIPBLAS_V2
+#define DATA_TYPE_R_32 HIP_R_32F
+#else
+#define DATA_TYPE_R_32 HIPBLAS_R_32F
+#endif
+
+#endif
+
+template 
+int GetBatchFromParams(const ParamsT* params) {
+  return 1;
+}
+
+template 
+int GetBatchFromParams(const GemmStridedBatchedParams* params) {
+  return params->batch;
+}
+
+template 
+int GetStrideAFromParams(const ParamsT* params) {
+  return 1;
+}
+
+template 
+int GetStrideAFromParams(const GemmStridedBatchedParams* params) {
+  return params->stride_a;
+}
+
+template 
+int GetStrideBFromParams(const ParamsT* params) {
+  return 1;
+}
+
+template 
+int GetStrideBFromParams(const GemmStridedBatchedParams* params) {
+  return params->stride_b;
+}
+
+template 
+int GetStrideCFromParams(const ParamsT* params) {
+  return 1;
+}
+
+template 
+int GetStrideCFromParams(const GemmStridedBatchedParams* params) {
+  return params->stride_c;
+}
+
+static hipblasOperation_t _hipblasOpFromChar(char op) {
+  switch (op) {
+    case 'n':
+    case 'N':
+      return HIPBLAS_OP_N;
+    case 't':
+    case 'T':
+      return HIPBLAS_OP_T;
+    case 'c':
+    case 'C':
+      return HIPBLAS_OP_C;
+  }
+  AT_ERROR(
+      "_hipblasOpFromChar input should be 't', 'n' or 'c' but got `", op, "`");
+}
+
+static char _charFromhipblasOp(hipblasOperation_t op) {
+  switch (op) {
+    case HIPBLAS_OP_N:
+      return 'N';
+    case HIPBLAS_OP_T:
+      return 'T';
+    case HIPBLAS_OP_C:
+      return 'C';
+  }
+  AT_ERROR(
+      "_charFromhipblasOp input should be HIPBLAS_OP_N/T/C but got `", op, "`");
+}
+
+static hipblasOperation_t MapLayoutToHipBlasLt(BlasOp layout) {
+  if (layout == BlasOp::N) {
+    return HIPBLAS_OP_N;
+  }
+  return HIPBLAS_OP_T;
+}
+
+static size_t GetHipblasltWorkspaceSize() {
+  static const char * env = getenv("HIPBLASLT_WORKSPACE_SIZE");
+  // 256MB is max workspace size allowed for hipblaslt
+  // hipblaslt-bench uses 32MB
+  // recommendation from hipblaslt author was 76MB
+  size_t workspace_size = 2*128*1024*1024; // default 256MB
+  if (env) {
+    try {
+      workspace_size = std::stoi(env);
+    } catch(std::invalid_argument const& e) {
+      TORCH_WARN("invalid HIPBLASLT_WORKSPACE_SIZE,",
+                 " using default workspace size of ", workspace_size, " bytes.");
+    } catch(std::out_of_range const& e) {
+      TORCH_WARN("HIPBLASLT_WORKSPACE_SIZE out of range,",
+                 " using default workspace size of ", workspace_size, " bytes.");
+    }
+  }
+  return workspace_size;
+}
+
+template 
+class HipblasltGemmOp : public Callable {
+  public:
+    HipblasltGemmOp(hipblasLtMatmulAlgo_t algo) : algo_{algo} {}
+
+    TuningStatus Call(const ParamsT* params) override {
+      hipblasOperation_t transa_outer = MapLayoutToHipBlasLt(ALayout);
+      hipblasOperation_t transb_outer = MapLayoutToHipBlasLt(BLayout);
+      auto in_out_datatype = HipBlasDataTypeFor();
+      auto opa = _hipblasOpFromChar(params->transa);
+      auto opb = _hipblasOpFromChar(params->transb);
+
+      TORCH_CHECK(transa_outer == opa && transb_outer == opb, "trans mismatch, shouldn't happen");
+
+      float alpha = static_cast(params->alpha);
+      float beta = static_cast(params->beta);
+
+      hipblasLtMatrixLayout_t mat_a, mat_b, mat_c;
+      hipblasLtMatmulDesc_t matmul;
+      if (opa == HIPBLAS_OP_N) {
+        TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_a, in_out_datatype, params->m, params->k, params->lda));
+      }
+      else {
+        TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_a, in_out_datatype, params->k, params->m, params->lda));
+      }
+      if (opb == HIPBLAS_OP_N) {
+        TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_b, in_out_datatype, params->k, params->n, params->ldb));
+      }
+      else {
+        TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_b, in_out_datatype, params->n, params->k, params->ldb));
+      }
+      TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_c, in_out_datatype, params->m, params->n, params->ldc));
+      TORCH_HIPBLASLT_CHECK(hipblasLtMatmulDescCreate(&matmul, COMPUTE_TYPE_32, DATA_TYPE_R_32));
+
+      int batch = GetBatchFromParams(params);
+      if (batch > 1) {
+        int64_t stride_a = GetStrideAFromParams(params);
+        int64_t stride_b = GetStrideBFromParams(params);
+        int64_t stride_c = GetStrideCFromParams(params);
+        TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute(
+            mat_a, HIPBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch)));
+        TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute(
+            mat_a, HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_a, sizeof(stride_a)));
+        TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute(
+            mat_b, HIPBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch)));
+        TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute(
+            mat_b, HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_b, sizeof(stride_b)));
+        TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute(
+            mat_c, HIPBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch)));
+        TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute(
+            mat_c, HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_c, sizeof(stride_c)));
+      }
+
+      TORCH_HIPBLASLT_CHECK(hipblasLtMatmulDescSetAttribute(
+            matmul, HIPBLASLT_MATMUL_DESC_TRANSA, &opa, sizeof(int32_t)));
+      TORCH_HIPBLASLT_CHECK(hipblasLtMatmulDescSetAttribute(
+            matmul, HIPBLASLT_MATMUL_DESC_TRANSB, &opb, sizeof(int32_t)));
+
+      size_t workspace_size = GetHipblasltWorkspaceSize();
+
+      auto op_handle = at::cuda::getCurrentCUDABlasLtHandle();
+
+      size_t ret_workspace_size = 0;
+      auto status = hipblaslt_ext::matmulIsAlgoSupported(op_handle,
+          matmul,
+          &alpha,
+          mat_a,
+          mat_b,
+          &beta,
+          mat_c,
+          mat_c,
+          algo_,
+          ret_workspace_size);
+
+      if (status == HIPBLAS_STATUS_SUCCESS) {
+        if (ret_workspace_size >= workspace_size) {
+          //TUNABLE_LOG("[hipBLASLt] Solution #", algo_index, " workspace too large");
+          return FAIL;
+        }
+      }
+      else {
+        //TUNABLE_LOG("[hipBLASLt] Solution #", algo_index, " not supported");
+        return FAIL;
+      }
+
+      void* workspace_buffer = nullptr;
+      if (workspace_size > 0) {
+        workspace_buffer = c10::cuda::CUDACachingAllocator::raw_alloc(workspace_size);
+      }
+
+      TORCH_HIPBLASLT_CHECK(hipblasLtMatmul(op_handle,
+            matmul,
+            &alpha,
+            params->a,
+            mat_a,
+            params->b,
+            mat_b,
+            &beta,
+            params->c,
+            mat_c,
+            params->c,
+            mat_c,
+            &algo_,
+            workspace_buffer,
+            workspace_size,
+            at::cuda::getCurrentCUDAStream()));
+
+      TORCH_HIPBLASLT_CHECK(hipblasLtMatmulDescDestroy(matmul));
+      TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_a));
+      TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_b));
+      TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_c));
+      if (workspace_size > 0) {
+        c10::cuda::CUDACachingAllocator::raw_delete(workspace_buffer);
+      }
+      return OK;
+    }
+
+  private:
+    hipblasLtMatmulAlgo_t algo_;
+};
+
+template 
+auto GetHipBlasLtTypeStringAndOps() {
+  hipblasOperation_t transa_outer = MapLayoutToHipBlasLt(ALayout);
+  hipblasOperation_t transb_outer = MapLayoutToHipBlasLt(BLayout);
+  auto in_out_datatype = HipBlasDataTypeFor();
+  std::vector<hipblasLtMatmulHeuristicResult_t> heuristic_result;
+
+  hipblasLtHandle_t handle;
+  TORCH_HIPBLASLT_CHECK(hipblasLtCreate(&handle));
+  TORCH_HIPBLASLT_CHECK(hipblaslt_ext::getAllAlgos(handle,
+        hipblaslt_ext::GemmType::HIPBLASLT_GEMM,
+        transa_outer,
+        transb_outer,
+        in_out_datatype,
+        in_out_datatype,
+        in_out_datatype,
+        in_out_datatype,
+        COMPUTE_TYPE_32,
+        heuristic_result));
+  TORCH_HIPBLASLT_CHECK(hipblasLtDestroy(handle));
+
+  // Sort heuristic_result by algo index to make sure the order of returned algos is deterministic.
+  std::sort(heuristic_result.begin(),
+      heuristic_result.end(),
+      [](hipblasLtMatmulHeuristicResult_t& a, hipblasLtMatmulHeuristicResult_t& b) {
+      return GETINDEXFROMALGO(a.algo) < GETINDEXFROMALGO(b.algo);
+      });
+
+  int returned_algo_count = heuristic_result.size();
+  std::vector>>> ret;
+  for (int i = 0; i < returned_algo_count; i++) {
+    auto algo = heuristic_result[i].algo;
+    int algo_index = GETINDEXFROMALGO(algo);
+    auto callable = std::make_unique>(algo);
+    std::string type_string = c10::str(
+        "Gemm_Hipblaslt_", _charFromhipblasOp(transa_outer), _charFromhipblasOp(transb_outer), "_", algo_index);
+    ret.emplace_back(type_string, std::move(callable));
+  }
+
+  return ret;
+}
+
+template 
+auto GetHipBlasLtGemmTypeStringAndOps() {
+  return GetHipBlasLtTypeStringAndOps>();
+}
+
+template 
+auto GetHipBlasLtGemmStridedBatchedTypeStringAndOps() {
+  return GetHipBlasLtTypeStringAndOps>();
+}
+
+#undef TORCH_HIPBLASLT_CHECK
+#undef GETINDEXFROMALGO
+#undef COMPUTE_TYPE_32
+#undef DATA_TYPE_R_32
+
+}  // namespace at::cuda::tunable
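A note on the workspace handling above: GetHipblasltWorkspaceSize reads HIPBLASLT_WORKSPACE_SIZE, warns and falls back to its built-in default when the value is missing or unparsable, and matmulIsAlgoSupported then rejects any algorithm whose required workspace is at least that budget. A minimal standalone sketch of the same parse-with-fallback pattern (the helper name and the 32 MiB default are illustrative, not taken from the header):

#include <cstdlib>
#include <iostream>
#include <stdexcept>
#include <string>

// Illustrative helper mirroring the header's fallback behaviour.
static size_t ParseWorkspaceSize(const char* var_name, size_t default_size) {
  size_t workspace_size = default_size;
  const char* env = std::getenv(var_name);
  if (env != nullptr) {
    try {
      workspace_size = static_cast<size_t>(std::stoi(env));
    } catch (const std::invalid_argument&) {
      std::cerr << "invalid " << var_name << ", using default\n";
    } catch (const std::out_of_range&) {
      std::cerr << var_name << " out of range, using default\n";
    }
  }
  return workspace_size;
}

int main() {
  // 32 MiB is a placeholder default for this sketch only.
  std::cout << ParseWorkspaceSize("HIPBLASLT_WORKSPACE_SIZE", 32u << 20) << "\n";
}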
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/tunable/GemmRocblas.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/tunable/GemmRocblas.h
new file mode 100644
index 0000000000000000000000000000000000000000..37fcc3bea8e880cc7a2f547e00f1bff89405f49b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/tunable/GemmRocblas.h
@@ -0,0 +1,275 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+#define ROCBLAS_BETA_FEATURES_API
+#include 
+
+#define TORCH_ROCBLAS_CHECK(EXPR)                 \
+  do {                                            \
+    rocblas_status __err = EXPR;                  \
+    TORCH_CHECK(__err == rocblas_status_success,  \
+                "rocblas error: ",                \
+                rocblas_status_to_string(__err),  \
+                " when calling `" #EXPR "`");     \
+  } while (0)
+
+namespace at::cuda::tunable {
+
+template <typename T>
+constexpr rocblas_datatype RocBlasDataTypeFor();
+
+template <>
+constexpr rocblas_datatype RocBlasDataTypeFor<float>() {
+  return rocblas_datatype_f32_r;
+}
+
+template <>
+constexpr rocblas_datatype RocBlasDataTypeFor<double>() {
+  return rocblas_datatype_f64_r;
+}
+
+template <>
+constexpr rocblas_datatype RocBlasDataTypeFor<Half>() {
+  return rocblas_datatype_f16_r;
+}
+
+template <>
+constexpr rocblas_datatype RocBlasDataTypeFor<BFloat16>() {
+  return rocblas_datatype_bf16_r;
+}
+
+template <>
+constexpr rocblas_datatype RocBlasDataTypeFor<c10::complex<float>>() {
+  return rocblas_datatype_f32_c;
+}
+
+template <>
+constexpr rocblas_datatype RocBlasDataTypeFor<c10::complex<double>>() {
+  return rocblas_datatype_f64_c;
+}
+
+template <typename T>
+constexpr rocblas_datatype RocBlasComputeTypeFor();
+
+template <>
+constexpr rocblas_datatype RocBlasComputeTypeFor<float>() {
+  return rocblas_datatype_f32_r;
+}
+
+template <>
+constexpr rocblas_datatype RocBlasComputeTypeFor<double>() {
+  return rocblas_datatype_f64_r;
+}
+
+template <>
+constexpr rocblas_datatype RocBlasComputeTypeFor<Half>() {
+  // Note that we're returning the _compute_ type for a given datatype.
+  // As of 12/2022, using compute type FP16 for 16-bit floats was much
+  // slower than using compute type FP32. So we use FP32 compute even for
+  // FP16 datatypes. This is how GEMM is implemented even in the function
+  // rocblasGemmHelper (see fpgeneric.h)
+  return rocblas_datatype_f32_r;
+}
+
+template <>
+constexpr rocblas_datatype RocBlasComputeTypeFor<BFloat16>() {
+  // Note that we're returning the _compute_ type for a given datatype.
+  // As of 12/2022, using compute type FP16 for 16-bit floats was much
+  // slower than using compute type FP32. So we use FP32 compute even for
+  // BF16 datatypes. This is how GEMM is implemented even in the function
+  // rocblasGemmHelper (see fpgeneric.h)
+  return rocblas_datatype_f32_r;
+}
+
+template <>
+constexpr rocblas_datatype RocBlasComputeTypeFor<c10::complex<float>>() {
+  return rocblas_datatype_f32_c;
+}
+
+template <>
+constexpr rocblas_datatype RocBlasComputeTypeFor<c10::complex<double>>() {
+  return rocblas_datatype_f64_c;
+}
+
+template <typename T>
+auto DoCastForHalfOrBfloat16(const T fp) {
+  return fp;
+}
+
+template <>
+inline auto DoCastForHalfOrBfloat16(const Half fp) {
+  // alpha and beta should have the same type as compute_type; in the Half case that is float.
+  float h = fp;
+  return h;
+}
+
+template <>
+inline auto DoCastForHalfOrBfloat16(const BFloat16 fp) {
+  // alpha and beta should have the same type as compute_type; in the BFloat16 case that is float.
+  float h = fp;
+  return h;
+}
+
+static rocblas_operation _rocblasOpFromChar(char op) {
+  switch (op) {
+    case 'n':
+    case 'N':
+      return rocblas_operation_none;
+    case 't':
+    case 'T':
+      return rocblas_operation_transpose;
+    case 'c':
+    case 'C':
+      return rocblas_operation_conjugate_transpose;
+  }
+  AT_ERROR(
+      "_rocblasOpFromChar input should be 't', 'n' or 'c' but got `", op, "`");
+}
+
+template <typename T>
+class RocblasGemmOp : public Callable<GemmParams<T>> {
+  public:
+    RocblasGemmOp(int solution) : solution_{solution} {}
+
+    TuningStatus Call(const GemmParams<T>* params) override {
+      auto input_output_type = RocBlasDataTypeFor<T>();
+      auto compute_type = RocBlasComputeTypeFor<T>();
+      auto h_a = DoCastForHalfOrBfloat16(params->alpha);
+      auto h_b = DoCastForHalfOrBfloat16(params->beta);
+      auto status = rocblas_gemm_ex(
+          (rocblas_handle)at::cuda::getCurrentCUDABlasHandle(),
+          _rocblasOpFromChar(params->transa),
+          _rocblasOpFromChar(params->transb),
+          params->m, params->n, params->k,
+          &h_a,
+          params->a, input_output_type, params->lda,
+          params->b, input_output_type, params->ldb,
+          &h_b,
+          params->c, input_output_type, params->ldc,
+          params->c, input_output_type, params->ldc,
+          compute_type,
+          rocblas_gemm_algo_solution_index,
+          solution_,
+          rocblas_gemm_flags_none);
+      if (status != rocblas_status_success) {
+        return FAIL;
+      }
+      return OK;
+    }
+
+  private:
+    int solution_;
+};
+
+template 
+auto GetRocBlasGemmTypeStringAndOps() {
+  rocblas_handle handle = (rocblas_handle)at::cuda::getCurrentCUDABlasHandle();
+  int solution_size;
+  auto input_output_type = RocBlasDataTypeFor();
+  auto compute_type = RocBlasComputeTypeFor();
+  // Get the number of available solutions
+  TORCH_ROCBLAS_CHECK(rocblas_gemm_ex_get_solutions_by_type(handle,
+                                                            input_output_type,
+                                                            input_output_type,
+                                                            compute_type,
+                                                            rocblas_gemm_flags_none,
+                                                            nullptr,
+                                                            &solution_size));
+  std::vector solutions(solution_size);
+  // Get the list of available solutions
+  TORCH_ROCBLAS_CHECK(rocblas_gemm_ex_get_solutions_by_type(handle,
+                                                            input_output_type,
+                                                            input_output_type,
+                                                            compute_type,
+                                                            rocblas_gemm_flags_none,
+                                                            solutions.data(),
+                                                            &solution_size));
+  // Sort the solutions in ascending order to make the solution vector deterministic across runs
+  std::sort(solutions.begin(), solutions.end());
+
+  std::vector>>>> ret;
+  for (size_t i = 0; i < solutions.size(); ++i) {
+    auto callable = std::make_unique>(solutions[i]);
+    ret.emplace_back(std::make_pair(c10::str("Gemm_Rocblas_", solutions[i]), std::move(callable)));
+  }
+  return ret;
+}
+
+template <typename T>
+class RocblasGemmStridedBatchedOp : public Callable<GemmStridedBatchedParams<T>> {
+  public:
+    RocblasGemmStridedBatchedOp(int solution) : solution_{solution} {}
+
+    TuningStatus Call(const GemmStridedBatchedParams<T>* params) override {
+      auto input_output_type = RocBlasDataTypeFor<T>();
+      auto compute_type = RocBlasComputeTypeFor<T>();
+      auto h_a = DoCastForHalfOrBfloat16(params->alpha);
+      auto h_b = DoCastForHalfOrBfloat16(params->beta);
+      auto status = rocblas_gemm_strided_batched_ex(
+          (rocblas_handle)at::cuda::getCurrentCUDABlasHandle(),
+          _rocblasOpFromChar(params->transa),
+          _rocblasOpFromChar(params->transb),
+          params->m, params->n, params->k,
+          &h_a,
+          params->a, input_output_type, params->lda, params->stride_a,
+          params->b, input_output_type, params->ldb, params->stride_b,
+          &h_b,
+          params->c, input_output_type, params->ldc, params->stride_c,
+          params->c, input_output_type, params->ldc, params->stride_c,
+          params->batch,
+          compute_type,
+          rocblas_gemm_algo_solution_index,
+          solution_,
+          rocblas_gemm_flags_none);
+      if (status != rocblas_status_success) {
+        return FAIL;
+      }
+      return OK;
+    }
+
+  private:
+    int solution_;
+};
+
+template 
+auto GetRocBlasGemmStridedBatchedTypeStringAndOps() {
+  rocblas_handle handle = (rocblas_handle)at::cuda::getCurrentCUDABlasHandle();
+  int solution_size;
+  auto input_output_type = RocBlasDataTypeFor();
+  auto compute_type = RocBlasComputeTypeFor();
+  // Get the number of available solutions
+  TORCH_ROCBLAS_CHECK(rocblas_gemm_ex_get_solutions_by_type(handle,
+                                                            input_output_type,
+                                                            input_output_type,
+                                                            compute_type,
+                                                            rocblas_gemm_flags_none,
+                                                            nullptr,
+                                                            &solution_size));
+  std::vector solutions(solution_size);
+  // Get the list of available solutions
+  TORCH_ROCBLAS_CHECK(rocblas_gemm_ex_get_solutions_by_type(handle,
+                                                            input_output_type,
+                                                            input_output_type,
+                                                            compute_type,
+                                                            rocblas_gemm_flags_none,
+                                                            solutions.data(),
+                                                            &solution_size));
+  // Sort the solutions in ascending order to make the solution vector deterministic across runs
+  std::sort(solutions.begin(), solutions.end());
+
+  std::vector>>>> ret;
+  for (size_t i = 0; i < solutions.size(); ++i) {
+    auto callable = std::make_unique>(solutions[i]);
+    ret.emplace_back(std::make_pair(c10::str("Gemm_Rocblas_", solutions[i]), std::move(callable)));
+  }
+  return ret;
+}
+
+}  // namespace at::cuda::tunable
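Both enumeration helpers above use the classic two-call C API idiom: the first rocblas_gemm_ex_get_solutions_by_type call passes a null buffer to learn how many solutions exist, a vector is sized accordingly, and the second call fills it before sorting for determinism. A self-contained sketch of that idiom against a mock query function (query_items is invented for illustration and is not a rocBLAS API):

#include <algorithm>
#include <vector>

// Mock of a C-style enumeration call: when `out` is null it only reports the
// count; otherwise it copies up to *count items into `out`.
static int query_items(int* out, int* count) {
  static const int available[] = {7, 3, 11, 5};
  const int n = 4;
  if (out == nullptr) { *count = n; return 0; }
  const int to_copy = std::min(*count, n);
  std::copy(available, available + to_copy, out);
  *count = to_copy;
  return 0;
}

int main() {
  int count = 0;
  query_items(nullptr, &count);                   // first call: how many solutions exist?
  std::vector<int> solutions(count);
  query_items(solutions.data(), &count);          // second call: fill the buffer
  std::sort(solutions.begin(), solutions.end());  // deterministic ordering, as in the header
  return 0;
}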
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/tunable/StreamTimer.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/tunable/StreamTimer.h
new file mode 100644
index 0000000000000000000000000000000000000000..be2b23ca418a821deb10a05224868de164816c12
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/tunable/StreamTimer.h
@@ -0,0 +1,34 @@
+// Original TunableOp is from onnxruntime.
+// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h
+// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+//
+// Adapting TunableOp into PyTorch
+// Copyright (c) Advanced Micro Devices, Inc.
+//
+#pragma once
+
+#include 
+
+#include 
+
+namespace at::cuda::tunable {
+
+class StreamTimer : public ITimer {
+  public:
+    StreamTimer();
+    virtual ~StreamTimer();
+
+    void Start() override;
+
+    void End() override;
+
+    float Duration() override;
+
+  private:
+    cudaEvent_t start_;
+    cudaEvent_t end_;
+};
+
+} // namespace at::cuda::tunable
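StreamTimer only declares its interface here. A typical CUDA-event-based implementation records an event before and after the timed region and asks the runtime for the elapsed time; the sketch below is a plausible standalone version written against the plain CUDA runtime API, not PyTorch's actual StreamTimer.cpp, and it omits error checking for brevity.

#include <cuda_runtime.h>

// Minimal event-based timer: Duration() returns milliseconds between
// Start() and End() on the default stream.
class SimpleStreamTimer {
 public:
  SimpleStreamTimer()  { cudaEventCreate(&start_); cudaEventCreate(&end_); }
  ~SimpleStreamTimer() { cudaEventDestroy(start_); cudaEventDestroy(end_); }

  void Start() { cudaEventRecord(start_, /*stream=*/0); }
  void End()   { cudaEventRecord(end_,   /*stream=*/0); }

  float Duration() {
    cudaEventSynchronize(end_);  // wait until the timed region has finished
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start_, end_);
    return ms;
  }

 private:
  cudaEvent_t start_;
  cudaEvent_t end_;
};

Because the events are recorded into the stream, the measured interval covers the GPU work enqueued between Start() and End() rather than host-side call overhead, which is what a kernel-tuning loop needs.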
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/tunable/Tunable.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/tunable/Tunable.h
new file mode 100644
index 0000000000000000000000000000000000000000..292c453aca1355ac6ded16be849933111ca259c0
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/tunable/Tunable.h
@@ -0,0 +1,205 @@
+// Original TunableOp is from onnxruntime.
+// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h
+// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+//
+// Adapting TunableOp into PyTorch
+// Copyright (c) Advanced Micro Devices, Inc.
+//
+#pragma once
+
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace at::cuda::tunable {
+
+static void TunableLog(const std::string& msg) {
+  static const char *env = getenv("PYTORCH_TUNABLEOP_VERBOSE");
+  if (env != nullptr && strcmp(env, "1") == 0) {
+    std::cerr << msg << std::endl;
+  }
+}
+#define TUNABLE_LOG(...) TunableLog(c10::str(__VA_ARGS__))
+
+enum TuningStatus {
+  OK = 0,
+  FAIL = 1,
+  UNSUPPORTED = 2,
+};
+
+// Mapping from params signature to kernel id
+class ResultEntry {
+  public:
+    explicit ResultEntry(const std::string& key, double time) : key_(key), time_(time) {}
+    bool operator==(const ResultEntry& other) { return key_ == other.key_; }
+    bool operator!=(const ResultEntry& other) { return key_ != other.key_; }
+    operator std::string () { return key_; }
+    friend std::ostream& operator<<(std::ostream& stream, const ResultEntry& entry);
+    static ResultEntry Null() { return ResultEntry("Null", 0.0); }
+    static ResultEntry Default() { return ResultEntry("Default", 0.0); }
+
+  private:
+    std::string key_;
+    double time_;
+};
+
+typedef std::unordered_map<std::string, ResultEntry> KernelMap;
+typedef std::unordered_map<std::string, KernelMap> ResultsMap;
+
+struct TuningResults {
+  // Validates if these results are compatible with the libraries
+  std::unordered_map validators;
+
+  // Mapping from Callable signature to Callable's tuning result
+  ResultsMap results;
+};
+
+class TuningResultsManager {
+  public:
+    TuningResultsManager() = default;
+    ~TuningResultsManager() = default;
+
+    KernelMap Lookup(const std::string& op_signature);
+
+    ResultEntry Lookup(const std::string& op_signature, const std::string& params_signature);
+
+    inline void AddImpl(const std::string& op_signature,
+        const std::string& params_signature,
+        ResultEntry best,
+        KernelMap& kernel_map);
+
+    void Add(const std::string& op_signature,
+        const std::string& params_signature,
+        ResultEntry best);
+
+    void Delete(const std::string& op_signature, const std::string& params_signature);
+
+    inline void DisjointMergeImpl(
+        const std::string& op_signature,
+        const KernelMap& kernel_map,
+        /*out*/ ResultsMap& results);
+
+    void Load(const ResultsMap& results_to_load);
+
+    ResultsMap Dump();
+
+    void DisjointMerge(const std::string& op_signature, const KernelMap& kernel_map);
+
+    size_t GetSize();
+
+  private:
+    std::mutex lock_;
+    ResultsMap results_;
+};
+
+class TuningResultsValidator {
+  public:
+    using GetFunc = std::function;
+    using ValidateFunc = std::function;
+    using GetValidateFuncs = std::unordered_map>;
+
+    TuningResultsValidator();
+    ~TuningResultsValidator() = default;
+
+    std::unordered_map GetAllValidators() const;
+    TuningStatus ValidateAll(const std::unordered_map& to_validate) const;
+    void RegisterValidator(const std::string& key, const GetFunc& gf, const ValidateFunc& vf);
+
+  protected:
+    std::string GetPyTorchVersion() const;
+    TuningStatus ValidatePyTorchVersion(const std::string& value) const;
+
+  public:
+    static constexpr const std::array mandatory_keys{"PT_VERSION"};
+
+  private:
+    GetValidateFuncs validators_;
+};
+
+class TuningContext {
+  public:
+    TuningContext();
+    ~TuningContext();
+    TuningContext(TuningContext &) = delete;
+    TuningContext(TuningContext &&) = delete;
+    TuningContext &operator=(TuningContext &) = delete;
+    TuningContext &operator=(TuningContext &&) = delete;
+
+    void EnableTunableOp();
+    void DisableTunableOp();
+    bool IsTunableOpEnabled() const;
+
+    void EnableTuning();
+    void DisableTuning();
+    bool IsTuningEnabled() const;
+
+    void SetMaxTuningDurationMs(int max_duration_ms);
+    int GetMaxTuningDurationMs() const;
+
+    void SetMaxTuningIterations(int max_iter);
+    int GetMaxTuningIterations() const;
+
+    void SetMaxWarmupDurationMs(int max_duration_ms);
+    int GetMaxWarmupDurationMs() const;
+
+    void SetMaxWarmupIterations(int max_iter);
+    int GetMaxWarmupIterations() const;
+
+    void EnableTunableOpAndTuning();
+    void DisableTunableOpAndTuning();
+
+    TuningResultsManager& GetTuningResultsManager();
+
+    TuningResultsValidator& GetTuningResultsValidator();
+
+    TuningResults GetTuningResults();
+
+    TuningStatus LoadTuningResults(const TuningResults& tr);
+
+    void SetFilename(const std::string& filename);
+    std::string GetFilename() const;
+
+  protected:
+    bool ReadFile(const std::string& filename);
+    bool WriteFile(const std::string& filename);
+
+  private:
+    bool enable_;
+    bool tuning_enable_;
+    bool manager_initialized_;
+    int max_tuning_duration_ms_;
+    int max_tuning_iterations_;
+    int max_warmup_duration_ms_;
+    int max_warmup_iterations_;
+    mutable TuningResultsManager manager_;
+    mutable c10::once_flag manager_init_once_;
+    TuningResultsValidator validator_;
+    std::string filename_;
+    size_t results_count_from_input_file_;
+};
+
+TuningContext* getTuningContext();
+
+class ITimer {
+  public:
+    ITimer() = default;
+    virtual ~ITimer() = default;
+
+    virtual void Start() = 0;
+    virtual void End() = 0;
+
+    /// Computes the elapsed time in milliseconds between Start() and End()
+    virtual float Duration() = 0;
+};
+
+} // namespace at::cuda::tunable
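Putting the declarations above together, host code that wants tuned GEMMs flips the switches on the process-wide TuningContext before launching work. The sequence below is illustrative only; it uses methods declared in this header, but the numeric caps and the filename are placeholders rather than defaults taken from the source, and it assumes the header's own include path.

#include <ATen/cuda/tunable/Tunable.h>

void ConfigureTunableOps() {
  auto* ctx = at::cuda::tunable::getTuningContext();
  ctx->EnableTunableOp();             // route eligible ops through TunableOp
  ctx->EnableTuning();                // allow new timings to be collected
  ctx->SetMaxWarmupIterations(5);     // cap warmup work per candidate
  ctx->SetMaxTuningIterations(30);    // cap timed iterations per candidate
  ctx->SetFilename("my_tunableop_results.csv");  // where results get persisted
}

Setting PYTORCH_TUNABLEOP_VERBOSE=1, as checked by TunableLog above, then makes the tuning decisions visible on stderr.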
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/tunable/TunableGemm.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/tunable/TunableGemm.h
new file mode 100644
index 0000000000000000000000000000000000000000..5e9edc0810eb5991b530afec7f17e85318143858
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/tunable/TunableGemm.h
@@ -0,0 +1,278 @@
+// Original TunableOp is from onnxruntime.
+// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h
+// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+//
+// Adapting TunableOp into PyTorch
+// Copyright (c) Advanced Micro Devices, Inc.
+//
+#pragma once
+
+#include 
+#ifdef USE_ROCM
+#if ROCM_VERSION >= 50700
+#include 
+#endif
+#include 
+#endif
+#include 
+#include 
+#include 
+#include 
+
+#ifdef USE_ROCM
+#include 
+#endif
+
+#define STRINGIFY(s) #s
+#define XSTRINGIFY(s) STRINGIFY(s)
+
+namespace at::cuda::tunable {
+
+template <typename T>
+class DefaultGemmOp : public Callable<GemmParams<T>> {
+  public:
+    TuningStatus Call(const GemmParams<T>* params) override {
+      at::cuda::blas::gemm_internal(
+          params->transa, params->transb,
+          params->m, params->n, params->k,
+          params->alpha,
+          params->a, params->lda,
+          params->b, params->ldb,
+          params->beta,
+          params->c, params->ldc);
+      return OK;
+    }
+};
+
+template <typename T>
+class DefaultGemmStridedBatchedOp : public Callable<GemmStridedBatchedParams<T>> {
+  public:
+    TuningStatus Call(const GemmStridedBatchedParams<T>* params) override {
+      at::cuda::blas::bgemm_internal(
+          params->transa, params->transb,
+          params->m, params->n, params->k,
+          params->alpha,
+          params->a, params->lda, params->stride_a,
+          params->b, params->ldb, params->stride_b,
+          params->beta,
+          params->c, params->ldc, params->stride_c,
+          params->batch);
+      return OK;
+    }
+};
+
+template <typename T>
+bool IsZero(T v) {
+  return v == 0.0f;
+}
+
+template <>
+bool IsZero(BFloat16 v) {
+  return v.x == 0;
+}
+
+template <>
+bool IsZero(Half v) {
+  return float(v) == 0.0f;
+}
+
+template <>
+bool IsZero(c10::complex<double> v) {
+  return v == 0.0;
+}
+
+template <>
+bool IsZero(c10::complex<float> v) {
+  return v == 0.0f;
+}
+
+template <typename T>
+std::string TypeName(T v) {
+  return "unknown";
+}
+
+template <>
+std::string TypeName(float v) {
+  return "float";
+}
+
+template <>
+std::string TypeName(double v) {
+  return "double";
+}
+
+template <>
+std::string TypeName(BFloat16 v) {
+  return "BFloat16";
+}
+
+template <>
+std::string TypeName(Half v) {
+  return "Half";
+}
+
+template <>
+std::string TypeName(c10::complex v) {
+  return "c10::complex";
+}
+
+template <>
+std::string TypeName(c10::complex v) {
+  return "c10::complex";
+}
+
+
+template 
+class GemmTunableOp : public TunableOp, StreamTimer> {
+ public:
+  GemmTunableOp() {
+    this->RegisterOp(std::string("Default"), std::make_unique>());
+
+    auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators();
+
+#ifdef USE_ROCM
+    for (auto&& [name, op] : GetRocBlasGemmTypeStringAndOps()) {
+      this->RegisterOp(std::move(name), std::move(op));
+    }
+
+    if (validators.find("ROCM_VERSION") == validators.end()) {
+      std::string rocm_version = ROCM_BUILD_INFO;
+      getTuningContext()->GetTuningResultsValidator().RegisterValidator(
+          "ROCM_VERSION",
+          [rocm_version]() { return rocm_version; },
+          [rocm_version](auto&& k) { return rocm_version == k ? OK : FAIL; });
+    }
+
+    if (validators.find("GCN_ARCH_NAME") == validators.end()) {
+      std::string gcn_arch_name = at::cuda::getCurrentDeviceProperties()->gcnArchName;
+      getTuningContext()->GetTuningResultsValidator().RegisterValidator(
+          "GCN_ARCH_NAME",
+          [gcn_arch_name]() { return gcn_arch_name; },
+          [gcn_arch_name](auto&& k) { return gcn_arch_name == k ? OK : FAIL; });
+    }
+
+    if (validators.find("ROCBLAS_VERSION") == validators.end()) {
+      std::string rocblas_version = c10::str(
+          XSTRINGIFY(ROCBLAS_VERSION_MAJOR), ".",
+          XSTRINGIFY(ROCBLAS_VERSION_MINOR), ".",
+          XSTRINGIFY(ROCBLAS_VERSION_PATCH), "-",
+          XSTRINGIFY(ROCBLAS_VERSION_TWEAK));
+      getTuningContext()->GetTuningResultsValidator().RegisterValidator(
+          "ROCBLAS_VERSION",
+          [rocblas_version]() { return rocblas_version; },
+          [rocblas_version](auto&& k) { return rocblas_version == k ? OK : FAIL; });
+    }
+#endif
+
+#if defined(USE_ROCM) && ROCM_VERSION >= 50700
+    static const char *env = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED");
+    if (env == nullptr || strcmp(env, "1") == 0) {
+      // disallow tuning of hipblaslt with c10::complex
+      if constexpr (
+          !std::is_same_v> &&
+          !std::is_same_v>) {
+        for (auto&& [name, op] : GetHipBlasLtGemmTypeStringAndOps()) {
+          this->RegisterOp(std::move(name), std::move(op));
+        }
+      }
+
+      if (validators.find("HIPBLASLT_VERSION") == validators.end()) {
+        std::string hipblaslt_version = c10::str(
+            XSTRINGIFY(HIPBLASLT_VERSION_MAJOR), ".",
+            XSTRINGIFY(HIPBLASLT_VERSION_MINOR), ".",
+            XSTRINGIFY(HIPBLASLT_VERSION_PATCH), "-",
+            XSTRINGIFY(HIPBLASLT_VERSION_TWEAK));
+        getTuningContext()->GetTuningResultsValidator().RegisterValidator(
+            "HIPBLASLT_VERSION",
+            [hipblaslt_version]() { return hipblaslt_version; },
+            [hipblaslt_version](auto&& k) { return hipblaslt_version == k ? OK : FAIL; });
+      }
+    }
+#endif
+  }
+
+  std::string Signature() override {
+    return c10::str("GemmTunableOp_", TypeName(T{}), "_", BlasOpToString(ALayout), BlasOpToString(BLayout));
+  }
+};
+
+template 
+class GemmStridedBatchedTunableOp : public TunableOp, StreamTimer> {
+ public:
+  GemmStridedBatchedTunableOp() {
+    this->RegisterOp(std::string("Default"), std::make_unique>());
+
+    auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators();
+
+#ifdef USE_ROCM
+    for (auto&& [name, op] : GetRocBlasGemmStridedBatchedTypeStringAndOps()) {
+      this->RegisterOp(std::move(name), std::move(op));
+    }
+
+    if (validators.find("ROCM_VERSION") == validators.end()) {
+      std::string rocm_version = ROCM_BUILD_INFO;
+      getTuningContext()->GetTuningResultsValidator().RegisterValidator(
+          "ROCM_VERSION",
+          [rocm_version]() { return rocm_version; },
+          [rocm_version](auto&& k) { return rocm_version == k ? OK : FAIL; });
+    }
+
+    if (validators.find("GCN_ARCH_NAME") == validators.end()) {
+      std::string gcn_arch_name = at::cuda::getCurrentDeviceProperties()->gcnArchName;
+      getTuningContext()->GetTuningResultsValidator().RegisterValidator(
+          "GCN_ARCH_NAME",
+          [gcn_arch_name]() { return gcn_arch_name; },
+          [gcn_arch_name](auto&& k) { return gcn_arch_name == k ? OK : FAIL; });
+    }
+
+    if (validators.find("ROCBLAS_VERSION") == validators.end()) {
+      std::string rocblas_version = c10::str(
+          XSTRINGIFY(ROCBLAS_VERSION_MAJOR), ".",
+          XSTRINGIFY(ROCBLAS_VERSION_MINOR), ".",
+          XSTRINGIFY(ROCBLAS_VERSION_PATCH), "-",
+          XSTRINGIFY(ROCBLAS_VERSION_TWEAK));
+      getTuningContext()->GetTuningResultsValidator().RegisterValidator(
+          "ROCBLAS_VERSION",
+          [rocblas_version]() { return rocblas_version; },
+          [rocblas_version](auto&& k) { return rocblas_version == k ? OK : FAIL; });
+    }
+#endif
+
+#if defined(USE_ROCM) && ROCM_VERSION >= 50700
+    static const char *env = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED");
+    if (env == nullptr || strcmp(env, "1") == 0) {
+      // disallow tuning of hipblaslt with c10::complex
+      if constexpr (
+          !std::is_same_v> &&
+          !std::is_same_v>) {
+        for (auto&& [name, op] : GetHipBlasLtGemmStridedBatchedTypeStringAndOps()) {
+          this->RegisterOp(std::move(name), std::move(op));
+        }
+      }
+
+      if (validators.find("HIPBLASLT_VERSION") == validators.end()) {
+        std::string hipblaslt_version = c10::str(
+            XSTRINGIFY(HIPBLASLT_VERSION_MAJOR), ".",
+            XSTRINGIFY(HIPBLASLT_VERSION_MINOR), ".",
+            XSTRINGIFY(HIPBLASLT_VERSION_PATCH), "-",
+            XSTRINGIFY(HIPBLASLT_VERSION_TWEAK));
+        getTuningContext()->GetTuningResultsValidator().RegisterValidator(
+            "HIPBLASLT_VERSION",
+            [hipblaslt_version]() { return hipblaslt_version; },
+            [hipblaslt_version](auto&& k) { return hipblaslt_version == k ? OK : FAIL; });
+      }
+    }
+#endif
+  }
+
+  std::string Signature() override {
+    return c10::str("GemmStridedBatchedTunableOp_", TypeName(T{}), "_", BlasOpToString(ALayout), BlasOpToString(BLayout));
+  }
+};
+
+#undef XSTRINGIFY
+#undef STRINGIFY
+
+} // namespace at::cuda::tunable
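The repeated blocks above all follow one validator pattern: capture the current library version string, register a getter that reproduces it, and register a checker that compares a loaded value against it so stale tuning results are rejected. A schematic of that pattern is below; the key MY_LIB_VERSION is invented for illustration, and the checker's string parameter type is an assumption inferred from the usages above rather than read from the (elided) ValidateFunc signature.

#include <string>
#include <ATen/cuda/tunable/Tunable.h>

// Schematic registration mirroring the ROCBLAS_VERSION blocks above.
void RegisterMyLibValidator(const std::string& my_lib_version) {
  using at::cuda::tunable::OK;
  using at::cuda::tunable::FAIL;
  at::cuda::tunable::getTuningContext()->GetTuningResultsValidator().RegisterValidator(
      "MY_LIB_VERSION",
      [my_lib_version]() { return my_lib_version; },   // value written alongside results
      [my_lib_version](const std::string& loaded) {    // value checked when results are loaded
        return my_lib_version == loaded ? OK : FAIL;
      });
}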
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cuda/tunable/TunableOp.h b/MLPY/Lib/site-packages/torch/include/ATen/cuda/tunable/TunableOp.h
new file mode 100644
index 0000000000000000000000000000000000000000..3047a90efc78d2355e3b8b7c4a74d53fd1a6c644
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cuda/tunable/TunableOp.h
@@ -0,0 +1,242 @@
+// Original TunableOp is from onnxruntime.
+// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h
+// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+//
+// Adapting TunableOp into PyTorch
+// Copyright (c) Advanced Micro Devices, Inc.
+//
+#pragma once
+
+#include 
+#include 
+
+#ifndef _WIN32
+#include 
+#endif
+
+#include 
+#include 
+#include 
+#include 
+
+namespace at::cuda::tunable {
+
+template <typename ParamsT>
+class Callable {
+  public:
+    Callable() = default;
+    Callable(Callable&&) = default;
+    virtual ~Callable() = default;
+    virtual TuningStatus Call(const ParamsT*) {
+      return FAIL;
+    }
+    virtual TuningStatus IsSupported(const ParamsT* params) {
+      return Call(params);
+    }
+};
+
+template <typename ParamsT, typename TimerT>
+class TunableOp {
+  public:
+    TunableOp() = default;
+    TunableOp(TunableOp&&) = default;
+    virtual ~TunableOp() = default;
+
+    TuningStatus operator()(const ParamsT* params) {
+      ResultEntry result = ResultEntry::Null();
+      TuningContext* ctx = getTuningContext();
+      if (ctx->IsTunableOpEnabled()) {
+        auto& mgr = ctx->GetTuningResultsManager();
+        auto op_sig = Signature();
+        auto params_sig = params->Signature();
+        result = mgr.Lookup(op_sig, params_sig);
+        // If no previous tuning result was found, we do the tuning iff tuning is enabled
+        if (result == ResultEntry::Null() && ctx->IsTuningEnabled()) {
+          result = FindFastest(params);
+          mgr.Add(op_sig, params_sig, result);
+        }
+      }
+      else {
+        result = ResultEntry::Default();
+      }
+      if (result == ResultEntry::Null()) {
+        TUNABLE_LOG("no result, using default");
+        result = ResultEntry::Default();
+      }
+      auto iter = ops_.find(result);
+      TORCH_CHECK(iter != ops_.end());
+      return iter->second->Call(params);
+    }
+
+    virtual std::string Signature() {
+      // According to C++17 standard https://wg21.link/n4659 section 15.7.4
+      // > if the operand of typeid refers to the
+      // > object under construction or destruction, typeid yields the std::type_info object representing the constructor
+      // > or destructor’s class.
+      // So delay the op signature generation.
+      c10::call_once(signature_init_once_, [this]() { signature_ = CreateSignature(); });
+      return signature_;
+    }
+
+  protected:
+    void RegisterOp(const std::string& name, std::unique_ptr<Callable<ParamsT>> op) {
+      this->op_names_.emplace_back(name);
+      this->ops_.emplace(name, std::move(op));
+    }
+
+  private:
+    static void WarmUp(Callable<ParamsT> *op, ParamsT* param, size_t num_iter) {
+      for (size_t i = 0; i < num_iter; i++) {
+        TORCH_CHECK(op->Call(param) == OK);
+      }
+    }
+
+    static double Profile(Callable<ParamsT> *op, ParamsT* param, size_t num_iter) {
+      TimerT timer{};
+      timer.Start();
+      for (size_t i = 0; i < num_iter; i++) {
+        TORCH_CHECK(op->Call(param) == OK);
+      }
+      timer.End();
+      return timer.Duration() / num_iter;
+    }
+
+  protected:
+    bool IsNumericsCheckEnabled() {
+      static const char *env = getenv("PYTORCH_TUNABLEOP_NUMERICAL_CHECK");
+      if (env != nullptr && strcmp(env, "0") == 0) {
+        return false;
+      }
+      return true;
+    }
+
+    virtual ResultEntry FindFastest(const ParamsT* params) {
+      TuningContext* ctx = getTuningContext();
+      auto op_sig = Signature();
+      auto params_sig = params->Signature();
+      TUNABLE_LOG("finding fastest for ", op_sig, '(', params_sig, ')', " out of ", op_names_.size(), " candidates");
+      auto min_duration_ms = std::numeric_limits<double>::infinity();
+      std::string id_name = "Default";
+
+      // calculate a reference answer for the numerical check
+      ParamsT* reference_params = params->DeepCopy();
+      TORCH_CHECK(ops_[ResultEntry::Default()]->Call(reference_params) == OK);
+
+      // need a copy of params to reuse
+      ParamsT* reusable_params = params->DeepCopy();
+
+      for (size_t i = 0; i < op_names_.size(); i++) {
+        auto* candidate = ops_[op_names_[i]].get(); // borrow pointer
+        auto status = candidate->Call(reusable_params);
+        if (status != OK) {
+          TUNABLE_LOG("├──unsupported id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]);
+          continue;
+        }
+
+        if (IsNumericsCheckEnabled()) {
+          ParamsT* numerical_params = params->DeepCopy();
+          WarmUp(candidate, numerical_params, 1);
+          status = reference_params->NumericalCheck(numerical_params);
+          numerical_params->Delete();
+          if (status != OK) {
+            TUNABLE_LOG("├──numerics check failed for id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]);
+            continue;
+          }
+        }
+
+        // collect a small profile
+        constexpr const int approx_num_iter = 3;
+        auto approx_duration = Profile(candidate, reusable_params, approx_num_iter);
+        // bail if too slow
+        if (approx_duration > 2 * min_duration_ms) {
+          TUNABLE_LOG("├──skip slow instance id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]);
+          continue;
+        }
+
+        // for warmup, did the user set a max duration, max iterations, or both?
+        double max_warmup_duration = ctx->GetMaxWarmupDurationMs();
+        int max_warmup_iter = ctx->GetMaxWarmupIterations();
+        int warmup_iter = 1; // default
+        if (max_warmup_duration > 0) {
+          int duration_iters = max_warmup_duration / approx_duration;
+          if (max_warmup_iter > 0) {
+            warmup_iter = std::min(max_warmup_iter, duration_iters);
+          }
+          else {
+            warmup_iter = duration_iters;
+          }
+        }
+        else if (max_warmup_iter > 0) {
+          warmup_iter = max_warmup_iter;
+        }
+
+        // for tuning, did the user set a max duration, max iterations, or both?
+        double max_tuning_duration = ctx->GetMaxTuningDurationMs();
+        int max_tuning_iter = ctx->GetMaxTuningIterations();
+        int tuning_iter = 100; // default
+        if (max_tuning_duration > 0) {
+          int duration_iters = max_tuning_duration / approx_duration;
+          if (max_tuning_iter > 0) {
+            tuning_iter = std::min(max_tuning_iter, duration_iters);
+          }
+          else {
+            tuning_iter = duration_iters;
+          }
+        }
+        else if (max_tuning_iter > 0) {
+          tuning_iter = max_tuning_iter;
+        }
+
+        // do the full warmup followed by tuning
+        double warmup_ms = warmup_iter * approx_duration;
+        double tuning_ms = tuning_iter * approx_duration;
+        TUNABLE_LOG("├──tuning using "
+            "warmup iters ", warmup_iter, " [", warmup_ms, " ms] "
+            "and tuning iters ", tuning_iter, " [", tuning_ms, " ms] ",
+            "instance id=", i, ", ", op_sig, "(", params_sig, ") ", op_names_[i]);
+        WarmUp(candidate, reusable_params, warmup_iter);
+        auto duration_ms = Profile(candidate, reusable_params, tuning_iter);
+        if (duration_ms < min_duration_ms) {
+          TUNABLE_LOG("├──found better instance id=", i, ". " , duration_ms, "ms. ", op_names_[i]);
+          min_duration_ms = duration_ms;
+          id_name = op_names_[i];
+        }
+      }
+
+      reusable_params->Delete();
+      reference_params->Delete();
+
+      TUNABLE_LOG("└──found fastest for ", op_sig, '(', params_sig, ") ", id_name);
+      return ResultEntry(id_name, min_duration_ms);
+    }
+
+  private:
+    std::string CreateSignature() {
+#ifndef _WIN32
+      const auto* name = typeid(*this).name();
+      char buf[256];
+      size_t buf_len = 256;
+      abi::__cxa_demangle(name, buf, &buf_len, nullptr);
+      buf[255] = '\0';
+      return buf;
+#else
+      return typeid(*this).name();
+#endif
+    }
+
+    mutable c10::once_flag signature_init_once_;
+    std::string signature_;
+
+    std::unordered_map<std::string, std::unique_ptr<Callable<ParamsT>>> ops_;
+    std::vector<std::string> op_names_;
+};
+
+struct OpParams {
+  OpParams() {}
+  virtual ~OpParams() = default;
+  virtual std::string Signature() const = 0;
+};
+
+} // namespace at::cuda::tunable
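To make the iteration-budget logic in FindFastest concrete: with an approximate per-call duration of 0.5 ms, a 30 ms tuning budget, and a 100-iteration cap, a candidate gets min(100, 30 / 0.5) = 60 timed iterations; with only the iteration cap set it gets the cap, and with neither set it keeps the default of 100. A standalone restatement of that arithmetic (the numbers are hypothetical):

#include <algorithm>
#include <iostream>

// Mirrors the tuning_iter computation in TunableOp::FindFastest.
static int TuningIterations(double approx_duration_ms,
                            double max_duration_ms,
                            int max_iterations) {
  int iters = 100;  // default, as in the header
  if (max_duration_ms > 0) {
    int duration_iters = static_cast<int>(max_duration_ms / approx_duration_ms);
    iters = (max_iterations > 0) ? std::min(max_iterations, duration_iters)
                                 : duration_iters;
  } else if (max_iterations > 0) {
    iters = max_iterations;
  }
  return iters;
}

int main() {
  std::cout << TuningIterations(0.5, 30.0, 100) << "\n";  // prints 60
  std::cout << TuningIterations(0.5, 0.0, 25)  << "\n";   // prints 25
  std::cout << TuningIterations(0.5, 0.0, 0)   << "\n";   // prints 100
}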
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cudnn/Descriptors.h b/MLPY/Lib/site-packages/torch/include/ATen/cudnn/Descriptors.h
new file mode 100644
index 0000000000000000000000000000000000000000..96d457601eb0fe115fb7816e39c43594a47414da
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cudnn/Descriptors.h
@@ -0,0 +1,391 @@
+#pragma once
+
+#include 
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include 
+#else
+#include 
+#endif
+
+#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 8907
+#define USE_CUDNN_RNN_V8_API
+#endif
+
+namespace at { namespace native {
+
+std::string cudnnTypeToString(cudnnDataType_t dtype);
+
+// TODO: Add constructors for all of the descriptors
+
+inline int dataSize(cudnnDataType_t dataType)
+{
+  switch (dataType) {
+#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 8200
+    case CUDNN_DATA_BFLOAT16:
+#endif
+    case CUDNN_DATA_HALF: return 2;
+    case CUDNN_DATA_FLOAT: return 4;
+    default: return 8;
+  }
+}
+
+// The stride for a size-1 dimension is not uniquely determined; in
+// fact, it can be anything you want, because the fact that the
+// tensor is size 1 at this dimension means that you will never actually
+// try advancing your pointer by this stride.
+//
+// However, CuDNN has a much more stringent requirement on strides:
+// if you are passing a contiguous input, it better be the case
+// that the stride for dim i is the product of the sizes of dims
+// i+1 to the end.  This stride is indeed uniquely determined.  This
+// function modifies 'stride' in place so this invariant holds.
+template <typename T>
+static inline void fixSizeOneDimStride(int dim, const T *size, T *stride, bool nhwc) {
+  int64_t z = 1;
+  int index = 0;
+  std::vector permutation(dim);
+
+  if (nhwc) {
+    permutation[index++] = 1;
+  }
+  for (int d = dim-1; d > 1; d--) {
+    permutation[index++] = d;
+  }
+  if (!nhwc) {
+    permutation[index++] = 1;
+  }
+  permutation[index++] = 0;
+  for (int d : permutation) {
+    if (size[d] == 1) {
+      stride[d] = z;
+    } else {
+      z *= size[d];
+    }
+  }
+}
+
+template 
+struct DescriptorDeleter {
+  void operator()(T* x) {
+    if (x != nullptr) {
+      AT_CUDNN_CHECK(dtor(x));
+    }
+  }
+};
+
+// A generic class for wrapping cuDNN descriptor types.  All you need
+// is to give the underlying type the Descriptor_t points to (usually,
+// if it's cudnnTensorDescriptor_t it points to cudnnTensorStruct),
+// the constructor and the destructor.  Subclasses are responsible
+// for defining a set() function to actually set the descriptor.
+//
+// Descriptors default construct to a nullptr, and have a descriptor
+// initialized the first time you call set() or any other initializing
+// function.
+template 
+class TORCH_CUDA_CPP_API Descriptor {
+ public:
+  // TODO: Figure out why const-correctness doesn't work here
+
+  // Use desc() to access the underlying descriptor pointer in
+  // a read-only fashion.  Most client code should use this.
+  // If the descriptor was never initialized, this will return
+  // nullptr.
+  T* desc() const { return desc_.get(); }
+  T* desc() { return desc_.get(); }
+
+  // Use mut_desc() to access the underlying descriptor pointer
+  // if you intend to modify what it points to (e.g., using
+  // cudnnSetFooDescriptor).  This will ensure that the descriptor
+  // is initialized.  Code in this file will use this function.
+  T* mut_desc() { init(); return desc_.get(); }
+protected:
+  void init() {
+    if (desc_ == nullptr) {
+      T* raw_desc;
+      AT_CUDNN_CHECK(ctor(&raw_desc));
+      desc_.reset(raw_desc);
+    }
+  }
+private:
+  std::unique_ptr> desc_;
+};
+
+class TORCH_CUDA_CPP_API RNNDataDescriptor : public Descriptor<
+                                       cudnnRNNDataStruct,
+                                       &cudnnCreateRNNDataDescriptor,
+                                       &cudnnDestroyRNNDataDescriptor> {
+public:
+  void set(const at::Tensor &t, cudnnRNNDataLayout_t layout, int maxSeqLength, int batchSize, int vectorSize, const int* seqLengthArray);
+private:
+  void set(cudnnDataType_t dataType, cudnnRNNDataLayout_t layout, int maxSeqLength, int batchSize, int vectorSize, const int* seqLengthArray) {
+    AT_CUDNN_CHECK(cudnnSetRNNDataDescriptor(mut_desc(), dataType, layout, maxSeqLength, batchSize, vectorSize, seqLengthArray, NULL));
+  }
+};
+
+class TORCH_CUDA_CPP_API TensorDescriptor : public Descriptor<
+                                               cudnnTensorStruct,
+                                               &cudnnCreateTensorDescriptor,
+                                               &cudnnDestroyTensorDescriptor> {
+ public:
+  TensorDescriptor() = default;
+  explicit TensorDescriptor(const at::Tensor &t, size_t pad = 0) {
+    set(t, pad);
+  }
+
+  // Note [CuDNN broadcast padding]
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // pad specifies the minimum dimensionality of the tensor descriptor
+  // we produce (it doesn't have anything to do with, e.g., convolution
+  // padding).  If 't' is lower-dimensional than 'pad', the remaining
+  // dimensions (on the right) are padded with ones.  This doesn't
+  // affect the underlying data layout.  This is particularly useful for
+  // dealing with a peculiarity of the CuDNN API, which is that broadcasting in CuDNN is
+  // done in two steps: first, the client code is expected to pad out
+  // (the dimensions) input tensors to be the same dimension as the
+  // target broadcast, and then second, CuDNN takes care of actually
+  // broadcasting size 1 dimensions.
+
+  void set(const at::Tensor &t, size_t pad = 0);
+  void set(const at::Tensor &t, at::MemoryFormat memory_format, size_t pad = 0);
+  void set(cudnnDataType_t dataType, IntArrayRef sizes, IntArrayRef strides, size_t pad = 0);
+
+  void print();
+
+private:
+  void set(cudnnDataType_t dataType, IntArrayRef sizes, IntArrayRef strides, size_t pad, bool nhwc);
+
+  void set(cudnnDataType_t dataType, int dim, int* size, int* stride, bool nhwc) {
+    fixSizeOneDimStride(dim, size, stride, nhwc);
+    AT_CUDNN_CHECK(cudnnSetTensorNdDescriptor(mut_desc(), dataType, dim, size, stride));
+  }
+};
+
+std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d);
+
+class TORCH_CUDA_CPP_API FilterDescriptor : public Descriptor<
+                                               cudnnFilterStruct,
+                                               &cudnnCreateFilterDescriptor,
+                                               &cudnnDestroyFilterDescriptor> {
+ public:
+  void set(const at::Tensor &t, int64_t pad = 0) {
+    set(t, at::MemoryFormat::Contiguous, pad);
+  }
+
+  void set(const at::Tensor &t, const at::MemoryFormat memory_format, int64_t pad = 0);
+
+  void print();
+private:
+  void set(cudnnDataType_t dataType, int dim, int* size, cudnnTensorFormat_t filter_format) {
+    AT_CUDNN_CHECK(cudnnSetFilterNdDescriptor(mut_desc(), dataType, filter_format, dim, size));
+  }
+};
+
+std::ostream& operator<<(std::ostream & out, const FilterDescriptor& d);
+
+struct TORCH_CUDA_CPP_API ConvolutionDescriptor
+    : public Descriptor<
+          cudnnConvolutionStruct,
+          &cudnnCreateConvolutionDescriptor,
+          &cudnnDestroyConvolutionDescriptor> {
+  void set(cudnnDataType_t dataType, int dim, int* pad, int* stride, int * upscale /* aka dilation */, int groups, bool allow_tf32) {
+    cudnnDataType_t mathType = dataType;
+    if (dataType == CUDNN_DATA_HALF) mathType = CUDNN_DATA_FLOAT;
+    AT_CUDNN_CHECK(cudnnSetConvolutionNdDescriptor(mut_desc(), dim, pad, stride, upscale,
+                                          CUDNN_CROSS_CORRELATION, mathType));
+    AT_CUDNN_CHECK(cudnnSetConvolutionGroupCount(mut_desc(), groups));
+    // See Note [behavior of cudnnFind and cudnnGet]
+    AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_DEFAULT_MATH));
+    if(dataType == CUDNN_DATA_HALF) {
+      AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_TENSOR_OP_MATH));
+    } else if (dataType == CUDNN_DATA_FLOAT && !allow_tf32) {
+      AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_FMA_MATH));
+    }
+  }
+};
+
+struct TORCH_CUDA_CPP_API SpatialTransformerDescriptor
+    : public Descriptor<
+          cudnnSpatialTransformerStruct,
+          &cudnnCreateSpatialTransformerDescriptor,
+          &cudnnDestroySpatialTransformerDescriptor> {
+  void set(cudnnDataType_t dataType, int dim, int* size) {
+    AT_CUDNN_CHECK(cudnnSetSpatialTransformerNdDescriptor(mut_desc(), CUDNN_SAMPLER_BILINEAR, dataType, dim, size));
+  }
+};
+
+struct TORCH_CUDA_CPP_API DropoutDescriptor
+    : public Descriptor<
+          cudnnDropoutStruct,
+          &cudnnCreateDropoutDescriptor,
+          &cudnnDestroyDropoutDescriptor> {
+  at::Tensor state;
+
+  // Initialize a dropout descriptor's RNG state.
+  // WARNING: This function is very expensive, avoid calling this function!
+  void initialize_rng(cudnnHandle_t handle, float dropout, long long int seed, const TensorOptions& options) {
+    TORCH_INTERNAL_ASSERT(dropout > 0, "dropout must be nonzero; otherwise call set_no_dropout");
+    size_t state_size;
+    AT_CUDNN_CHECK(cudnnDropoutGetStatesSize(handle, &state_size));
+    AT_ASSERT(options.device().type() == kCUDA);
+    AT_ASSERT(options.dtype() == kByte);
+    state = at::empty({static_cast(state_size)}, options);
+    AT_CUDNN_CHECK(cudnnSetDropoutDescriptor(mut_desc(), handle, dropout, state.data_ptr(), state_size, seed));
+  }
+
+  // Restore a dropout descriptor given a dropout probability and existing RNG state.
+  void set(cudnnHandle_t handle, float dropout, at::Tensor state_) {
+    TORCH_INTERNAL_ASSERT(dropout > 0, "dropout must be nonzero; otherwise call set_no_dropout");
+    state = state_;
+    void *state_ptr = state.data_ptr();
+    size_t state_size = state.size(0);
+    // NB: The seed doesn't actually matter, so we give a dummy value
+    AT_CUDNN_CHECK(cudnnRestoreDropoutDescriptor(mut_desc(), handle, dropout, state_ptr, state_size, 0 /* seed */));
+  }
+
+  // Restore a dropout descriptor corresponding to no dropout
+  void set_no_dropout(cudnnHandle_t handle) {
+    // NB: seed doesn't matter when dropout = 0, because no random number
+    // initialization actually takes place when there is no dropout.
+    // NB: Empirically, cudnnSetDropoutDescriptor is cheap when
+    // dropout == 0
+    AT_CUDNN_CHECK(cudnnSetDropoutDescriptor(mut_desc(), handle, 0 /* dropout */, nullptr, 0 /* state_size */, 0 /* seed */));
+  }
+};
+
+struct TORCH_CUDA_CPP_API RNNDescriptor : public Descriptor<
+                                             cudnnRNNStruct,
+                                             &cudnnCreateRNNDescriptor,
+                                             &cudnnDestroyRNNDescriptor> {
+  DropoutDescriptor dropout_desc_;
+  void set(cudnnHandle_t handle,
+#ifdef USE_CUDNN_RNN_V8_API
+       int input_size,
+       bool packed,
+#endif
+       int hidden_size, int proj_size, int num_layers, DropoutDescriptor&& dropout_desc,
+           cudnnRNNInputMode_t input_mode, cudnnDirectionMode_t bidirectional,
+           cudnnRNNMode_t mode, cudnnDataType_t datatype, cudnnDataType_t input_type, cudnnRNNAlgo_t algo, bool allow_tf32) {
+    dropout_desc_ = std::move(dropout_desc);
+#ifndef USE_CUDNN_RNN_V8_API
+    AT_CUDNN_CHECK(cudnnSetRNNDescriptor_v6(
+          handle,
+          mut_desc(),
+          hidden_size,
+          num_layers,
+          dropout_desc_.desc(),
+          input_mode,
+          bidirectional,
+          mode,
+          algo,
+          datatype));
+    if (proj_size != 0) {
+      AT_CUDNN_CHECK(cudnnSetRNNProjectionLayers(
+            handle,
+            /*rnnDesc=*/mut_desc(),
+            /*recProjSize=*/proj_size,
+            /*outProjSize=*/0));
+    }
+    cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
+    if (prop->major >= 7) {
+      if (input_type == CUDNN_DATA_HALF) {
+        cudnnSetRNNMatrixMathType(mut_desc(), CUDNN_TENSOR_OP_MATH);
+      }
+      else if (input_type == CUDNN_DATA_FLOAT && !allow_tf32) {
+        cudnnSetRNNMatrixMathType(mut_desc(), CUDNN_FMA_MATH);
+      }
+      else {
+        // Technically, since this is the default, it's not necessary to
+        // set it explicitly.
+        cudnnSetRNNMatrixMathType(mut_desc(), CUDNN_DEFAULT_MATH);
+      }
+    }
+#else
+    cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
+    auto math_type = CUDNN_DEFAULT_MATH;
+    if (prop->major >= 7) {
+      if (input_type == CUDNN_DATA_HALF) {
+        math_type = CUDNN_TENSOR_OP_MATH;
+      } else if (!allow_tf32) {
+        math_type = CUDNN_FMA_MATH;
+      }
+    }
+    AT_CUDNN_CHECK(cudnnSetRNNDescriptor_v8(
+          mut_desc(),
+          algo,
+          mode,
+          CUDNN_RNN_DOUBLE_BIAS,
+          bidirectional,
+          input_mode,
+          input_type,
+          datatype,
+          math_type,
+          input_size,
+          hidden_size,
+          proj_size ? proj_size : hidden_size,
+          num_layers,
+          dropout_desc_.desc(),
+          packed ? CUDNN_RNN_PADDED_IO_DISABLED : CUDNN_RNN_PADDED_IO_ENABLED));
+#endif
+  }
+};
+
+struct TORCH_CUDA_CPP_API CTCLossDescriptor
+    : public Descriptor<
+          cudnnCTCLossStruct,
+          &cudnnCreateCTCLossDescriptor,
+          &cudnnDestroyCTCLossDescriptor> {
+  void set(cudnnDataType_t datatype) {
+    AT_CUDNN_CHECK(cudnnSetCTCLossDescriptor(mut_desc(), datatype));
+  }
+  void setEx(
+      cudnnDataType_t datatype,
+      cudnnLossNormalizationMode_t normMode,
+      cudnnNanPropagation_t gradMode) {
+    AT_CUDNN_CHECK(
+        cudnnSetCTCLossDescriptorEx(mut_desc(), datatype, normMode, gradMode));
+  }
+};
+
+struct TORCH_CUDA_CPP_API ActivationDescriptor
+    : public Descriptor<
+          cudnnActivationStruct,
+          &cudnnCreateActivationDescriptor,
+          &cudnnDestroyActivationDescriptor> {
+  void set(cudnnActivationMode_t mode) {
+    AT_ASSERT(
+        mode == CUDNN_ACTIVATION_RELU,
+        "TODO: support more cuDNN activation modes");
+    AT_CUDNN_CHECK(cudnnSetActivationDescriptor(
+        mut_desc(),
+        mode,
+        cudnnNanPropagation_t::CUDNN_NOT_PROPAGATE_NAN,
+        std::numeric_limits::max()));
+  }
+};
+
+union Constant
+{
+  float f;
+  double d;
+  Constant(cudnnDataType_t dataType, double value) {
+    if (dataType == CUDNN_DATA_HALF || dataType == CUDNN_DATA_FLOAT) {
+      f = static_cast(value);
+    } else {
+      d = value;
+    }
+  }
+};
+
+}}  // namespace
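As a worked example of fixSizeOneDimStride above (NCHW case, nhwc = false): for sizes {2, 1, 3, 4} the dimensions are visited in the order 3, 2, 1, 0; the running product z passes 4 and then 12, so the size-1 dimension at index 1 gets its stride rewritten to 12 = 3 * 4, exactly the product of the sizes to its right, while the other strides are left alone. A tiny self-contained check of that behaviour (a re-implementation of the non-NHWC branch, not the header's template itself):

#include <cassert>
#include <vector>

// Re-implementation of the non-NHWC branch of fixSizeOneDimStride.
static void FixSizeOneDimStride(int dim, const long* size, long* stride) {
  long z = 1;
  std::vector<int> permutation;
  for (int d = dim - 1; d > 1; d--) permutation.push_back(d);
  permutation.push_back(1);
  permutation.push_back(0);
  for (int d : permutation) {
    if (size[d] == 1) stride[d] = z;  // only size-1 dims are rewritten
    else z *= size[d];
  }
}

int main() {
  long size[4]   = {2, 1, 3, 4};
  long stride[4] = {12, 999, 4, 1};  // the size-1 dim's stride is arbitrary on input
  FixSizeOneDimStride(4, size, stride);
  assert(stride[1] == 12);  // rewritten to 3 * 4, the product of the sizes to its right
  return 0;
}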
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cudnn/Exceptions.h b/MLPY/Lib/site-packages/torch/include/ATen/cudnn/Exceptions.h
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cudnn/Handle.h b/MLPY/Lib/site-packages/torch/include/ATen/cudnn/Handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..3415d86dd944d4b0451ea4d3586cbe807c8f32eb
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cudnn/Handle.h
@@ -0,0 +1,9 @@
+#pragma once
+
+#include 
+#include 
+
+namespace at { namespace native {
+
+TORCH_CUDA_CPP_API cudnnHandle_t getCudnnHandle();
+}} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cudnn/Handles.h b/MLPY/Lib/site-packages/torch/include/ATen/cudnn/Handles.h
new file mode 100644
index 0000000000000000000000000000000000000000..65b5d4454879ad165c8e002fc5df4c400da9303a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cudnn/Handles.h
@@ -0,0 +1,2 @@
+#pragma once
+#include 
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cudnn/Types.h b/MLPY/Lib/site-packages/torch/include/ATen/cudnn/Types.h
new file mode 100644
index 0000000000000000000000000000000000000000..31e39404036b946c7db6b8d9f14706905a18cf30
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cudnn/Types.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include 
+#include 
+
+namespace at { namespace native {
+
+TORCH_CUDA_CPP_API cudnnDataType_t
+getCudnnDataTypeFromScalarType(const at::ScalarType dtype);
+cudnnDataType_t getCudnnDataType(const at::Tensor& tensor);
+
+int64_t cudnn_version();
+
+}}  // namespace at::cudnn
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cudnn/Utils.h b/MLPY/Lib/site-packages/torch/include/ATen/cudnn/Utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..c132840385d8e7bf2de5e1b2afc9cf05a8a2b9ed
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cudnn/Utils.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+namespace at { namespace native {
+
+// cuDNN has a buggy check for a tensor being contiguous (that is, it does
+// not ignore the stride for a dimension that is equal to 0).  This function
+// makes tensors that have a zero stride contiguous by calling contiguous(),
+// which produces the layout cuDNN expects.
+inline Tensor contiguousIfZeroInStrides(const Tensor& t) {
+  for (auto s : t.strides()) {
+    if (s == 0) return t.contiguous();
+  }
+  return t;
+}
+
+}}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/cudnn/cudnn-wrapper.h b/MLPY/Lib/site-packages/torch/include/ATen/cudnn/cudnn-wrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..fbea50d26fd01f5ee211ce0128ac220adafdcb5d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/cudnn/cudnn-wrapper.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include 
+
+#define STRINGIFY(x) #x
+#define STRING(x) STRINGIFY(x)
+
+#if CUDNN_MAJOR < 6
+#pragma message ("CuDNN v" STRING(CUDNN_MAJOR) " found, but need at least CuDNN v6. You can get the latest version of CuDNN from https://developer.nvidia.com/cudnn or disable CuDNN with USE_CUDNN=0")
+#pragma message "We strongly encourage you to move to 6.0 and above."
+#pragma message "This message is intended to annoy you enough to update."
+#endif
+
+#undef STRINGIFY
+#undef STRING
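The STRINGIFY/STRING pair is the standard two-step expansion trick for turning a macro's value, rather than its name, into a string literal, which is how CUDNN_MAJOR ends up inside the message text. A minimal illustration with a stand-in macro:

#define STRINGIFY(x) #x
#define STRING(x) STRINGIFY(x)
#define EXAMPLE_MAJOR 8                      // stand-in for CUDNN_MAJOR
static_assert(sizeof(STRING(EXAMPLE_MAJOR)) == 2, "expands to the string \"8\"");
// Without the extra indirection, STRINGIFY(EXAMPLE_MAJOR) would yield "EXAMPLE_MAJOR".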
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/detail/AcceleratorHooksInterface.h b/MLPY/Lib/site-packages/torch/include/ATen/detail/AcceleratorHooksInterface.h
new file mode 100644
index 0000000000000000000000000000000000000000..d013b91e9d50f54e518d29bff44aa34e3d6cf903
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/detail/AcceleratorHooksInterface.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include <c10/core/Device.h>
+
+namespace at {
+
+// AcceleratorHooksInterface is a shared interface provided by all
+// accelerators to allow generic code.
+// This interface is hook-based as it corresponds to all the functions
+// that are going to be called in a generic way from the CPU code.
+
+struct TORCH_API AcceleratorHooksInterface {
+  // This should never actually be implemented, but it is used to
+  // squelch -Werror=non-virtual-dtor
+  virtual ~AcceleratorHooksInterface() = default;
+
+  // Whether the device at device_index is fully initialized or not.
+  virtual bool hasPrimaryContext(DeviceIndex device_index) const = 0;
+};
+
+} // namespace at
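Since the interface above is the minimal contract every accelerator backend exposes to generic code, a tiny implementation looks like the following. The class and its answer are hypothetical; a real backend would consult its runtime rather than hard-code a result:

#include <ATen/detail/AcceleratorHooksInterface.h>

struct MyDeviceHooks final : at::AcceleratorHooksInterface {
  bool hasPrimaryContext(at::DeviceIndex device_index) const override {
    // Pretend only device 0 has been initialized so far.
    return device_index == 0;
  }
};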
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/detail/CUDAHooksInterface.h b/MLPY/Lib/site-packages/torch/include/ATen/detail/CUDAHooksInterface.h
new file mode 100644
index 0000000000000000000000000000000000000000..ce9d84e62d0389c24aac9852aa315e5cb661844d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/detail/CUDAHooksInterface.h
@@ -0,0 +1,201 @@
+#pragma once
+
+#include <c10/core/Allocator.h>
+#include <c10/util/Exception.h>
+#include <c10/util/Registry.h>
+
+#include <ATen/detail/AcceleratorHooksInterface.h>
+
+// Forward-declares at::Generator and at::cuda::NVRTC
+namespace at {
+struct Generator;
+namespace cuda {
+struct NVRTC;
+} // namespace cuda
+} // namespace at
+
+// NB: Class must live in `at` due to limitations of Registry.h.
+namespace at {
+
+#ifdef _MSC_VER
+constexpr const char* CUDA_HELP =
+  "PyTorch splits its backend into two shared libraries: a CPU library "
+  "and a CUDA library; this error has occurred because you are trying "
+  "to use some CUDA functionality, but the CUDA library has not been "
+  "loaded by the dynamic linker for some reason.  The CUDA library MUST "
+  "be loaded, EVEN IF you don't directly use any symbols from the CUDA library! "
+  "One common culprit is a lack of -INCLUDE:?warp_size@cuda@at@@YAHXZ "
+  "in your link arguments; many dynamic linkers will delete dynamic library "
+  "dependencies if you don't depend on any of their symbols.  You can check "
+  "if this has occurred by using link on your binary to see if there is a "
+  "dependency on *_cuda.dll library.";
+#else
+constexpr const char* CUDA_HELP =
+  "PyTorch splits its backend into two shared libraries: a CPU library "
+  "and a CUDA library; this error has occurred because you are trying "
+  "to use some CUDA functionality, but the CUDA library has not been "
+  "loaded by the dynamic linker for some reason.  The CUDA library MUST "
+  "be loaded, EVEN IF you don't directly use any symbols from the CUDA library! "
+  "One common culprit is a lack of -Wl,--no-as-needed in your link arguments; many "
+  "dynamic linkers will delete dynamic library dependencies if you don't "
+  "depend on any of their symbols.  You can check if this has occurred by "
+  "using ldd on your binary to see if there is a dependency on *_cuda.so "
+  "library.";
+#endif
+
+// The CUDAHooksInterface is an omnibus interface for any CUDA functionality
+// which we may want to call into from CPU code (and thus must be dynamically
+// dispatched, to allow for separate compilation of CUDA code).  How do I
+// decide if a function should live in this class?  There are two tests:
+//
+//  1. Does the *implementation* of this function require linking against
+//     CUDA libraries?
+//
+//  2. Is this function *called* from non-CUDA ATen code?
+//
+// (2) should filter out many ostensible use-cases, since many times a CUDA
+// function provided by ATen is only really ever used by actual CUDA code.
+//
+// TODO: Consider putting the stub definitions in another class, so that one
+// never forgets to implement each virtual function in the real implementation
+// in CUDAHooks.  This probably doesn't buy us much though.
+struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
+  // This should never actually be implemented, but it is used to
+  // squelch -Werror=non-virtual-dtor
+  virtual ~CUDAHooksInterface() override = default;
+
+  // Initialize THCState and, transitively, the CUDA state
+  virtual void initCUDA() const {
+    TORCH_CHECK(false, "Cannot initialize CUDA without ATen_cuda library. ", CUDA_HELP);
+  }
+
+  virtual const Generator& getDefaultCUDAGenerator(C10_UNUSED DeviceIndex device_index = -1) const {
+    TORCH_CHECK(false, "Cannot get default CUDA generator without ATen_cuda library. ", CUDA_HELP);
+  }
+
+  virtual Device getDeviceFromPtr(void* /*data*/) const {
+    TORCH_CHECK(false, "Cannot get device of pointer on CUDA without ATen_cuda library. ", CUDA_HELP);
+  }
+
+  virtual bool isPinnedPtr(const void* /*data*/) const {
+    return false;
+  }
+
+  virtual bool hasCUDA() const {
+    return false;
+  }
+
+  virtual bool hasCUDART() const {
+    return false;
+  }
+
+  virtual bool hasMAGMA() const {
+    return false;
+  }
+
+  virtual bool hasCuDNN() const {
+    return false;
+  }
+
+  virtual bool hasCuSOLVER() const {
+    return false;
+  }
+
+  virtual bool hasROCM() const {
+    return false;
+  }
+
+  virtual const at::cuda::NVRTC& nvrtc() const {
+    TORCH_CHECK(false, "NVRTC requires CUDA. ", CUDA_HELP);
+  }
+
+  virtual bool hasPrimaryContext(DeviceIndex device_index) const override {
+    TORCH_CHECK(false, "Cannot call hasPrimaryContext(", device_index, ") without ATen_cuda library. ", CUDA_HELP);
+  }
+
+  virtual DeviceIndex current_device() const {
+    return -1;
+  }
+
+  virtual Allocator* getPinnedMemoryAllocator() const {
+    TORCH_CHECK(false, "Pinned memory requires CUDA. ", CUDA_HELP);
+  }
+
+  virtual Allocator* getCUDADeviceAllocator() const {
+    TORCH_CHECK(false, "CUDADeviceAllocator requires CUDA. ", CUDA_HELP);
+  }
+
+  virtual bool compiledWithCuDNN() const {
+    return false;
+  }
+
+  virtual bool compiledWithMIOpen() const {
+    return false;
+  }
+
+  virtual bool supportsDilatedConvolutionWithCuDNN() const {
+    return false;
+  }
+
+  virtual bool supportsDepthwiseConvolutionWithCuDNN() const {
+    return false;
+  }
+
+  virtual bool supportsBFloat16ConvolutionWithCuDNNv8() const {
+    return false;
+  }
+
+  virtual long versionCuDNN() const {
+    TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP);
+  }
+
+  virtual long versionCUDART() const {
+    TORCH_CHECK(false, "Cannot query CUDART version without ATen_cuda library. ", CUDA_HELP);
+  }
+
+  virtual std::string showConfig() const {
+    TORCH_CHECK(false, "Cannot query detailed CUDA version without ATen_cuda library. ", CUDA_HELP);
+  }
+
+  virtual double batchnormMinEpsilonCuDNN() const {
+    TORCH_CHECK(false,
+        "Cannot query batchnormMinEpsilonCuDNN() without ATen_cuda library. ", CUDA_HELP);
+  }
+
+  virtual int64_t cuFFTGetPlanCacheMaxSize(DeviceIndex /*device_index*/) const {
+    TORCH_CHECK(false, "Cannot access cuFFT plan cache without ATen_cuda library. ", CUDA_HELP);
+  }
+
+  virtual void cuFFTSetPlanCacheMaxSize(DeviceIndex /*device_index*/, int64_t /*max_size*/) const {
+    TORCH_CHECK(false, "Cannot access cuFFT plan cache without ATen_cuda library. ", CUDA_HELP);
+  }
+
+  virtual int64_t cuFFTGetPlanCacheSize(DeviceIndex /*device_index*/) const {
+    TORCH_CHECK(false, "Cannot access cuFFT plan cache without ATen_cuda library. ", CUDA_HELP);
+  }
+
+  virtual void cuFFTClearPlanCache(DeviceIndex /*device_index*/) const {
+    TORCH_CHECK(false, "Cannot access cuFFT plan cache without ATen_cuda library. ", CUDA_HELP);
+  }
+
+  virtual int getNumGPUs() const {
+    return 0;
+  }
+
+  virtual void deviceSynchronize(DeviceIndex /*device_index*/) const {
+    TORCH_CHECK(false, "Cannot synchronize CUDA device without ATen_cuda library. ", CUDA_HELP);
+  }
+};
+
+// NB: dummy argument to suppress "ISO C++11 requires at least one argument
+// for the "..." in a variadic macro"
+struct TORCH_API CUDAHooksArgs {};
+
+TORCH_DECLARE_REGISTRY(CUDAHooksRegistry, CUDAHooksInterface, CUDAHooksArgs);
+#define REGISTER_CUDA_HOOKS(clsname) \
+  C10_REGISTER_CLASS(CUDAHooksRegistry, clsname, clsname)
+
+namespace detail {
+TORCH_API const CUDAHooksInterface& getCUDAHooks();
+} // namespace detail
+} // namespace at
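The registry declared just above is what lets the separately compiled CUDA library plug its implementation in at load time. A stripped-down sketch of that pattern (the class here is hypothetical; the genuine implementation is ATen's CUDAHooks):

#include <ATen/detail/CUDAHooksInterface.h>

namespace my_backend {

// Only the queries we care about are overridden; every other virtual keeps
// the error-raising default from CUDAHooksInterface.
struct MyCUDAHooks : at::CUDAHooksInterface {
  MyCUDAHooks(at::CUDAHooksArgs) {}
  bool hasCUDA() const override { return true; }
  int getNumGPUs() const override { return 1; }
};

// Makes the implementation discoverable through the registry.
REGISTER_CUDA_HOOKS(MyCUDAHooks);

} // namespace my_backend

// CPU-only code then dispatches through the interface without linking CUDA:
//   if (at::detail::getCUDAHooks().hasCUDA()) { /* ... */ }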
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/detail/FunctionTraits.h b/MLPY/Lib/site-packages/torch/include/ATen/detail/FunctionTraits.h
new file mode 100644
index 0000000000000000000000000000000000000000..51fe0b8320a38aad0cb7c253f701674051d3a496
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/detail/FunctionTraits.h
@@ -0,0 +1,102 @@
+#pragma once
+
+#include <tuple>
+
+// Modified from https://stackoverflow.com/questions/7943525/is-it-possible-to-figure-out-the-parameter-type-and-return-type-of-a-lambda
+
+// Fallback, anything with an operator()
+template 
+struct function_traits : public function_traits {
+};
+
+// Pointers to class members that are themselves functors.
+// For example, in the following code:
+// template 
+// struct S {
+//     func_t f;
+// };
+// template 
+// S make_s(func_t f) {
+//     return S { .f = f };
+// }
+//
+// auto s = make_s([] (int, float) -> double { /* ... */ });
+//
+// function_traits traits;
+template 
+struct function_traits : public function_traits {
+};
+
+// Const class member functions
+template 
+struct function_traits : public function_traits {
+};
+
+// Reference types
+template 
+struct function_traits : public function_traits {};
+template 
+struct function_traits : public function_traits {};
+
+// Free functions
+template 
+struct function_traits {
+  // arity is the number of arguments.
+  enum { arity = sizeof...(Args) };
+
+  typedef std::tuple ArgsTuple;
+  typedef ReturnType result_type;
+
+  template 
+  struct arg
+  {
+      typedef typename std::tuple_element>::type type;
+      // the i-th argument is equivalent to the i-th tuple element of a tuple
+      // composed of those arguments.
+  };
+};
+
+template 
+struct nullary_function_traits {
+  using traits = function_traits;
+  using result_type = typename traits::result_type;
+};
+
+template 
+struct unary_function_traits {
+  using traits = function_traits;
+  using result_type = typename traits::result_type;
+  using arg1_t = typename traits::template arg<0>::type;
+};
+
+template 
+struct binary_function_traits {
+  using traits = function_traits;
+  using result_type = typename traits::result_type;
+  using arg1_t = typename traits::template arg<0>::type;
+  using arg2_t = typename traits::template arg<1>::type;
+};
+
+
+// Traits for calling with c10::guts::invoke, where member_functions have a first argument of ClassType
+template 
+struct invoke_traits : public function_traits{
+};
+
+template 
+struct invoke_traits : public invoke_traits{
+};
+
+template 
+struct invoke_traits : public invoke_traits{
+};
+
+template 
+struct invoke_traits :
+  public function_traits {
+};
+
+template 
+struct invoke_traits :
+  public function_traits {
+};
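To see what the traits above provide in practice, here is a short compile-time sketch (the lambda is arbitrary):

#include <ATen/detail/FunctionTraits.h>
#include <type_traits>

void traits_example() {
  auto f = [](int a, float b) -> double { return a + b; };
  using traits = function_traits<decltype(f)>;

  static_assert(traits::arity == 2, "two parameters");
  static_assert(std::is_same_v<traits::result_type, double>, "returns double");
  static_assert(std::is_same_v<traits::arg<0>::type, int>, "first parameter is int");
  static_assert(std::is_same_v<traits::arg<1>::type, float>, "second parameter is float");
}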
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/detail/HIPHooksInterface.h b/MLPY/Lib/site-packages/torch/include/ATen/detail/HIPHooksInterface.h
new file mode 100644
index 0000000000000000000000000000000000000000..f9866a872b67849917903de7399061d2879782c4
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/detail/HIPHooksInterface.h
@@ -0,0 +1,70 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+#include 
+
+#include 
+#include 
+
+namespace at {
+class Context;
+}
+
+// NB: Class must live in `at` due to limitations of Registry.h.
+namespace at {
+
+// The HIPHooksInterface is an omnibus interface for any HIP functionality
+// which we may want to call into from CPU code (and thus must be dynamically
+// dispatched, to allow for separate compilation of HIP code).  See
+// CUDAHooksInterface for more detailed motivation.
+struct TORCH_API HIPHooksInterface {
+  // This should never actually be implemented, but it is used to
+  // squelch -Werror=non-virtual-dtor
+  virtual ~HIPHooksInterface() = default;
+
+  // Initialize the HIP library state
+  virtual void initHIP() const {
+    AT_ERROR("Cannot initialize HIP without ATen_hip library.");
+  }
+
+  virtual std::unique_ptr initHIPGenerator(Context*) const {
+    AT_ERROR("Cannot initialize HIP generator without ATen_hip library.");
+  }
+
+  virtual bool hasHIP() const {
+    return false;
+  }
+
+  virtual c10::DeviceIndex current_device() const {
+    return -1;
+  }
+
+  virtual Allocator* getPinnedMemoryAllocator() const {
+    AT_ERROR("Pinned memory requires HIP.");
+  }
+
+  virtual void registerHIPTypes(Context*) const {
+    AT_ERROR("Cannot registerHIPTypes() without ATen_hip library.");
+  }
+
+  virtual int getNumGPUs() const {
+    return 0;
+  }
+};
+
+// NB: dummy argument to suppress "ISO C++11 requires at least one argument
+// for the "..." in a variadic macro"
+struct TORCH_API HIPHooksArgs {};
+
+TORCH_DECLARE_REGISTRY(HIPHooksRegistry, HIPHooksInterface, HIPHooksArgs);
+#define REGISTER_HIP_HOOKS(clsname) \
+  C10_REGISTER_CLASS(HIPHooksRegistry, clsname, clsname)
+
+namespace detail {
+TORCH_API const HIPHooksInterface& getHIPHooks();
+
+} // namespace detail
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/detail/IPUHooksInterface.h b/MLPY/Lib/site-packages/torch/include/ATen/detail/IPUHooksInterface.h
new file mode 100644
index 0000000000000000000000000000000000000000..daa89c733779d99fd49f384de638df2ce569b728
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/detail/IPUHooksInterface.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+
+struct TORCH_API IPUHooksInterface {
+  virtual ~IPUHooksInterface() = default;
+
+  virtual const Generator& getDefaultIPUGenerator(
+      DeviceIndex device_index = -1) const {
+    AT_ERROR(
+        "Cannot get the default IPU generator: the IPU backend is not "
+        "available.");
+  }
+
+  virtual Generator newIPUGenerator(DeviceIndex device_index = -1) const {
+    AT_ERROR(
+        "Cannot create a new IPU generator: the IPU backend is not available.");
+  }
+};
+
+struct TORCH_API IPUHooksArgs {};
+
+TORCH_DECLARE_REGISTRY(IPUHooksRegistry, IPUHooksInterface, IPUHooksArgs);
+#define REGISTER_IPU_HOOKS(clsname) \
+  C10_REGISTER_CLASS(IPUHooksRegistry, clsname, clsname)
+
+namespace detail {
+TORCH_API const IPUHooksInterface& getIPUHooks();
+} // namespace detail
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/detail/MPSHooksInterface.h b/MLPY/Lib/site-packages/torch/include/ATen/detail/MPSHooksInterface.h
new file mode 100644
index 0000000000000000000000000000000000000000..e81b590476e3b5ed0d99ec99006c2ed8e9d62572
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/detail/MPSHooksInterface.h
@@ -0,0 +1,106 @@
+//  Copyright © 2022 Apple Inc.
+
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+namespace at {
+
+struct TORCH_API MPSHooksInterface : AcceleratorHooksInterface {
+  // This macro raises an error if an MPSHooks function is called but the
+  // MPS backend is not present.
+  #define FAIL_MPSHOOKS_FUNC(func) \
+    TORCH_CHECK(false, "Cannot execute ", func, "() without MPS backend.");
+
+  virtual ~MPSHooksInterface() override = default;
+
+  // Initialize the MPS library state
+  virtual void initMPS() const {
+    FAIL_MPSHOOKS_FUNC(__func__);
+  }
+  virtual bool hasMPS() const {
+    return false;
+  }
+  virtual bool isOnMacOSorNewer(unsigned major = 13, unsigned minor = 0) const {
+    FAIL_MPSHOOKS_FUNC(__func__);
+  }
+  virtual const Generator& getDefaultMPSGenerator() const {
+    FAIL_MPSHOOKS_FUNC(__func__);
+  }
+  virtual Allocator* getMPSDeviceAllocator() const {
+    FAIL_MPSHOOKS_FUNC(__func__);
+  }
+  virtual void deviceSynchronize() const {
+    FAIL_MPSHOOKS_FUNC(__func__);
+  }
+  virtual void commitStream() const {
+    FAIL_MPSHOOKS_FUNC(__func__);
+  }
+  virtual void* getCommandBuffer() const {
+    FAIL_MPSHOOKS_FUNC(__func__);
+  }
+  virtual void* getDispatchQueue() const {
+    FAIL_MPSHOOKS_FUNC(__func__);
+  }
+  virtual void emptyCache() const {
+    FAIL_MPSHOOKS_FUNC(__func__);
+  }
+  virtual size_t getCurrentAllocatedMemory() const {
+    FAIL_MPSHOOKS_FUNC(__func__);
+  }
+  virtual size_t getDriverAllocatedMemory() const {
+    FAIL_MPSHOOKS_FUNC(__func__);
+  }
+  virtual void setMemoryFraction(double /*ratio*/) const {
+    FAIL_MPSHOOKS_FUNC(__func__);
+  }
+  virtual void profilerStartTrace(const std::string& mode, bool waitUntilCompleted) const {
+    FAIL_MPSHOOKS_FUNC(__func__);
+  }
+  virtual void profilerStopTrace() const {
+    FAIL_MPSHOOKS_FUNC(__func__);
+  }
+  virtual uint32_t acquireEvent(bool enable_timing) const {
+    FAIL_MPSHOOKS_FUNC(__func__);
+  }
+  virtual void releaseEvent(uint32_t event_id) const {
+    FAIL_MPSHOOKS_FUNC(__func__);
+  }
+  virtual void recordEvent(uint32_t event_id) const {
+    FAIL_MPSHOOKS_FUNC(__func__);
+  }
+  virtual void waitForEvent(uint32_t event_id) const {
+    FAIL_MPSHOOKS_FUNC(__func__);
+  }
+  virtual void synchronizeEvent(uint32_t event_id) const {
+    FAIL_MPSHOOKS_FUNC(__func__);
+  }
+  virtual bool queryEvent(uint32_t event_id) const {
+    FAIL_MPSHOOKS_FUNC(__func__);
+  }
+  virtual double elapsedTimeOfEvents(uint32_t start_event_id, uint32_t end_event_id) const {
+    FAIL_MPSHOOKS_FUNC(__func__);
+  }
+  virtual bool hasPrimaryContext(DeviceIndex device_index) const override {
+    FAIL_MPSHOOKS_FUNC(__func__);
+  }
+  #undef FAIL_MPSHOOKS_FUNC
+};
+
+struct TORCH_API MPSHooksArgs {};
+
+TORCH_DECLARE_REGISTRY(MPSHooksRegistry, MPSHooksInterface, MPSHooksArgs);
+#define REGISTER_MPS_HOOKS(clsname) \
+  C10_REGISTER_CLASS(MPSHooksRegistry, clsname, clsname)
+
+namespace detail {
+TORCH_API const MPSHooksInterface& getMPSHooks();
+
+} // namespace detail
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/detail/MTIAHooksInterface.h b/MLPY/Lib/site-packages/torch/include/ATen/detail/MTIAHooksInterface.h
new file mode 100644
index 0000000000000000000000000000000000000000..43c110777cd0cded229b28587f3feb0dcf1b584c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/detail/MTIAHooksInterface.h
@@ -0,0 +1,61 @@
+#pragma once
+
+#include 
+
+#include 
+
+#include 
+
+#include 
+
+namespace at {
+class Context;
+}
+
+namespace at {
+
+constexpr const char* MTIA_HELP =
+    "The MTIA backend requires MTIA extension for PyTorch;"
+    "this error has occurred because you are trying "
+    "to use some MTIA's functionality without MTIA extension included.";
+
+struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface {
+  virtual ~MTIAHooksInterface() override = default;
+
+  virtual void initMTIA() const {
+    TORCH_CHECK(
+        false,
+        "Cannot initialize MTIA without MTIA Extension for PyTorch.",
+        MTIA_HELP);
+  }
+
+  virtual bool hasMTIA() const {
+    return false;
+  }
+
+  virtual std::string showConfig() const {
+    TORCH_CHECK(
+        false,
+        "Cannot query detailed MTIA version without MTIA Extension for PyTorch.",
+        MTIA_HELP);
+  }
+
+  virtual bool hasPrimaryContext(DeviceIndex device_index) const override {
+    TORCH_CHECK(
+        false,
+        "Cannot check MTIA primary context without MTIA Extension for PyTorch.",
+        MTIA_HELP);
+  }
+
+};
+
+struct TORCH_API MTIAHooksArgs {};
+
+C10_DECLARE_REGISTRY(MTIAHooksRegistry, MTIAHooksInterface, MTIAHooksArgs);
+#define REGISTER_MTIA_HOOKS(clsname) \
+  C10_REGISTER_CLASS(MTIAHooksRegistry, clsname, clsname)
+
+namespace detail {
+TORCH_API const MTIAHooksInterface& getMTIAHooks();
+} // namespace detail
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/detail/ORTHooksInterface.h b/MLPY/Lib/site-packages/torch/include/ATen/detail/ORTHooksInterface.h
new file mode 100644
index 0000000000000000000000000000000000000000..af22f687c13d6d81365dfbc5def7739165339f8b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/detail/ORTHooksInterface.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include 
+#include 
+
+constexpr const char* ORT_HELP =
+  " You need to 'import torch_ort' to use the 'ort' device in PyTorch. "
+  "The 'torch_ort' module is provided by the ONNX Runtime itself "
+  "(https://onnxruntime.ai).";
+
+// NB: Class must live in `at` due to limitations of Registry.h.
+namespace at {
+
+struct TORCH_API ORTHooksInterface {
+  // This should never actually be implemented, but it is used to
+  // squelch -Werror=non-virtual-dtor
+  virtual ~ORTHooksInterface() = default;
+
+  virtual std::string showConfig() const {
+    TORCH_CHECK(false, "Cannot query detailed ORT version information.", ORT_HELP);
+  }
+};
+
+// NB: dummy argument to suppress "ISO C++11 requires at least one argument
+// for the "..." in a variadic macro"
+struct TORCH_API ORTHooksArgs {};
+
+TORCH_DECLARE_REGISTRY(ORTHooksRegistry, ORTHooksInterface, ORTHooksArgs);
+#define REGISTER_ORT_HOOKS(clsname) \
+  C10_REGISTER_CLASS(ORTHooksRegistry, clsname, clsname)
+
+namespace detail {
+TORCH_API const ORTHooksInterface& getORTHooks();
+} // namespace detail
+
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/detail/PrivateUse1HooksInterface.h b/MLPY/Lib/site-packages/torch/include/ATen/detail/PrivateUse1HooksInterface.h
new file mode 100644
index 0000000000000000000000000000000000000000..330677fe95df38d471e2e106adae0aa22bb64034
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/detail/PrivateUse1HooksInterface.h
@@ -0,0 +1,61 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+namespace at {
+
+struct TORCH_API PrivateUse1HooksInterface : AcceleratorHooksInterface {
+  virtual ~PrivateUse1HooksInterface() override = default;
+  virtual const at::Generator& getDefaultGenerator(
+      c10::DeviceIndex device_index) {
+    TORCH_CHECK_NOT_IMPLEMENTED(
+        false,
+        "You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `getDefaultGenerator`.");
+  }
+
+  virtual at::Device getDeviceFromPtr(void* data) const {
+    TORCH_CHECK_NOT_IMPLEMENTED(
+        false,
+        "You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `getDeviceFromPtr`.");
+  }
+
+  virtual Allocator* getPinnedMemoryAllocator() const {
+    TORCH_CHECK(
+        false,
+        "You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `getPinnedMemoryAllocator`.");
+  }
+
+  virtual bool hasPrimaryContext(DeviceIndex device_index) const override {
+    TORCH_CHECK_NOT_IMPLEMENTED(
+        false,
+        "You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `hasPrimaryContext`.");
+  }
+
+  virtual void initPrivateUse1() const {}
+  virtual void resizePrivateUse1Bytes(const c10::Storage &storage, size_t newsize) const {
+    TORCH_CHECK_NOT_IMPLEMENTED(
+        false,
+        "You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `resizePrivateUse1Bytes`.");
+  }
+};
+
+struct TORCH_API PrivateUse1HooksArgs {};
+
+TORCH_API void RegisterPrivateUse1HooksInterface(
+    at::PrivateUse1HooksInterface* hook_);
+
+TORCH_API at::PrivateUse1HooksInterface* GetPrivateUse1HooksInterface();
+
+TORCH_API bool isPrivateUse1HooksRegistered();
+
+namespace detail {
+
+TORCH_API const at::PrivateUse1HooksInterface& getPrivateUse1Hooks();
+
+} // namespace detail
+
+} // namespace at
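RegisterPrivateUse1HooksInterface above is the entry point an out-of-tree backend calls once at startup; a hedged sketch of that wiring follows (the class and its answers are placeholders):

#include <ATen/detail/PrivateUse1HooksInterface.h>

struct MyPrivateUse1Hooks : at::PrivateUse1HooksInterface {
  bool hasPrimaryContext(at::DeviceIndex device_index) const override {
    return true;  // placeholder: report every device as initialized
  }
};

static MyPrivateUse1Hooks my_hooks;

void register_my_backend() {
  // Must run before any op touches the PrivateUse1 device type.
  at::RegisterPrivateUse1HooksInterface(&my_hooks);
}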
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/detail/XPUHooksInterface.h b/MLPY/Lib/site-packages/torch/include/ATen/detail/XPUHooksInterface.h
new file mode 100644
index 0000000000000000000000000000000000000000..44b31b2348dd96ad3a0f4903cc96d046ea900e2a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/detail/XPUHooksInterface.h
@@ -0,0 +1,80 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+
+namespace at {
+
+constexpr const char* XPU_HELP =
+    "The XPU backend requires Intel Extension for Pytorch;"
+    "this error has occurred because you are trying "
+    "to use some XPU's functionality, but the Intel Extension for Pytorch has not been "
+    "loaded for some reason. The Intel Extension for Pytorch MUST "
+    "be loaded, EVEN IF you don't directly use any symbols from that!";
+
+struct TORCH_API XPUHooksInterface {
+  virtual ~XPUHooksInterface() {}
+
+  virtual void initXPU() const {
+    TORCH_CHECK(
+        false,
+        "Cannot initialize XPU without Intel Extension for Pytorch.",
+        XPU_HELP);
+  }
+
+  virtual bool hasXPU() const {
+    return false;
+  }
+
+  virtual std::string showConfig() const {
+    TORCH_CHECK(
+        false,
+        "Cannot query detailed XPU version without Intel Extension for Pytorch. ",
+        XPU_HELP);
+  }
+
+  virtual int32_t getGlobalIdxFromDevice(const Device& device) const {
+    TORCH_CHECK(false, "Cannot get XPU global device index without ATen_xpu library.");
+  }
+
+  virtual Generator getXPUGenerator(C10_UNUSED DeviceIndex device_index = -1) const {
+    TORCH_CHECK(false, "Cannot get XPU generator without Intel Extension for Pytorch. ", XPU_HELP);
+  }
+
+  virtual const Generator& getDefaultXPUGenerator(C10_UNUSED DeviceIndex device_index = -1) const {
+    TORCH_CHECK(false, "Cannot get default XPU generator without Intel Extension for Pytorch. ", XPU_HELP);
+  }
+
+  virtual DeviceIndex getNumGPUs() const {
+    return 0;
+  }
+
+  virtual DeviceIndex current_device() const {
+    TORCH_CHECK(false, "Cannot get current device on XPU without ATen_xpu library.");
+  }
+
+  virtual Device getDeviceFromPtr(void* /*data*/) const {
+    TORCH_CHECK(false, "Cannot get device of pointer on XPU without ATen_xpu library.");
+  }
+
+  virtual void deviceSynchronize(DeviceIndex /*device_index*/) const {
+    TORCH_CHECK(false, "Cannot synchronize XPU device without ATen_xpu library.");
+  }
+};
+
+struct TORCH_API XPUHooksArgs {};
+
+C10_DECLARE_REGISTRY(XPUHooksRegistry, XPUHooksInterface, XPUHooksArgs);
+#define REGISTER_XPU_HOOKS(clsname) \
+  C10_REGISTER_CLASS(XPUHooksRegistry, clsname, clsname)
+
+namespace detail {
+TORCH_API const XPUHooksInterface& getXPUHooks();
+} // namespace detail
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/div_rtn.h b/MLPY/Lib/site-packages/torch/include/ATen/div_rtn.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a6d088b798c2ac96e58107db224a35ba5c9e8c8
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/div_rtn.h
@@ -0,0 +1,11 @@
+#pragma once
+
+// Integer division rounding to -Infinity
+template <typename T>
+static inline T div_rtn(T x, T y) {
+  T q = x / y;
+  T r = x % y;
+  if ((r != 0) && ((r < 0) != (y < 0)))
+    --q;
+  return q;
+}
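The difference from built-in integer division only shows up when the operands have opposite signs: C++ truncates toward zero, while div_rtn rounds toward negative infinity. A standalone check:

#include <cassert>
#include <ATen/div_rtn.h>

int main() {
  assert(-7 / 2 == -3);               // built-in division truncates toward zero
  assert(div_rtn<int>(-7, 2) == -4);  // div_rtn rounds toward -infinity
  assert(div_rtn<int>(7, 2) == 3);    // identical when the signs agree
  return 0;
}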
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/dlpack.h b/MLPY/Lib/site-packages/torch/include/ATen/dlpack.h
new file mode 100644
index 0000000000000000000000000000000000000000..c5a3a5a0143123038d7c0a3ed43fdaffb0eae359
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/dlpack.h
@@ -0,0 +1,232 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file dlpack.h
+ * \brief The common header of DLPack.
+ */
+#ifndef DLPACK_DLPACK_H_
+#define DLPACK_DLPACK_H_
+
+/**
+ * \brief Compatibility with C++
+ */
+#ifdef __cplusplus
+#define DLPACK_EXTERN_C extern "C"
+#else
+#define DLPACK_EXTERN_C
+#endif
+
+/*! \brief The current version of dlpack */
+#define DLPACK_VERSION 80
+
+/*! \brief The current ABI version of dlpack */
+#define DLPACK_ABI_VERSION 1
+
+/*! \brief DLPACK_DLL prefix for windows */
+#ifdef _WIN32
+#ifdef DLPACK_EXPORTS
+#define DLPACK_DLL __declspec(dllexport)
+#else
+#define DLPACK_DLL __declspec(dllimport)
+#endif
+#else
+#define DLPACK_DLL
+#endif
+
+#include <stdint.h>
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*!
+ * \brief The device type in DLDevice.
+ */
+#ifdef __cplusplus
+typedef enum : int32_t {
+#else
+typedef enum {
+#endif
+  /*! \brief CPU device */
+  kDLCPU = 1,
+  /*! \brief CUDA GPU device */
+  kDLCUDA = 2,
+  /*!
+   * \brief Pinned CUDA CPU memory by cudaMallocHost
+   */
+  kDLCUDAHost = 3,
+  /*! \brief OpenCL devices. */
+  kDLOpenCL = 4,
+  /*! \brief Vulkan buffer for next generation graphics. */
+  kDLVulkan = 7,
+  /*! \brief Metal for Apple GPU. */
+  kDLMetal = 8,
+  /*! \brief Verilog simulator buffer */
+  kDLVPI = 9,
+  /*! \brief ROCm GPUs for AMD GPUs */
+  kDLROCM = 10,
+  /*!
+   * \brief Pinned ROCm CPU memory allocated by hipMallocHost
+   */
+  kDLROCMHost = 11,
+  /*!
+   * \brief Reserved extension device type,
+   * used for quickly testing extension devices.
+   * The semantics can differ depending on the implementation.
+   */
+  kDLExtDev = 12,
+  /*!
+   * \brief CUDA managed/unified memory allocated by cudaMallocManaged
+   */
+  kDLCUDAManaged = 13,
+  /*!
+   * \brief Unified shared memory allocated on a oneAPI non-partitioned
+   * device. Call to oneAPI runtime is required to determine the device
+   * type, the USM allocation type and the sycl context it is bound to.
+   *
+   */
+  kDLOneAPI = 14,
+  /*! \brief GPU support for next generation WebGPU standard. */
+  kDLWebGPU = 15,
+  /*! \brief Qualcomm Hexagon DSP */
+  kDLHexagon = 16,
+} DLDeviceType;
+
+/*!
+ * \brief A Device for Tensor and operator.
+ */
+typedef struct {
+  /*! \brief The device type used in the device. */
+  DLDeviceType device_type;
+  /*!
+   * \brief The device index.
+   * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0.
+   */
+  int32_t device_id;
+} DLDevice;
+
+/*!
+ * \brief The type code options DLDataType.
+ */
+typedef enum {
+  /*! \brief signed integer */
+  kDLInt = 0U,
+  /*! \brief unsigned integer */
+  kDLUInt = 1U,
+  /*! \brief IEEE floating point */
+  kDLFloat = 2U,
+  /*!
+   * \brief Opaque handle type, reserved for testing purposes.
+   * Frameworks need to agree on the handle data type for the exchange to be well-defined.
+   */
+  kDLOpaqueHandle = 3U,
+  /*! \brief bfloat16 */
+  kDLBfloat = 4U,
+  /*!
+   * \brief complex number
+   * (C/C++/Python layout: compact struct per complex number)
+   */
+  kDLComplex = 5U,
+  /*! \brief boolean */
+  kDLBool = 6U,
+} DLDataTypeCode;
+
+/*!
+ * \brief The data type the tensor can hold. The data type is assumed to follow the
+ * native endian-ness. An explicit error message should be raised when attempting to
+ * export an array with non-native endianness
+ *
+ *  Examples
+ *   - float: type_code = 2, bits = 32, lanes = 1
+ *   - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4
+ *   - int8: type_code = 0, bits = 8, lanes = 1
+ *   - std::complex<float>: type_code = 5, bits = 64, lanes = 1
+ *   - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits)
+ */
+typedef struct {
+  /*!
+   * \brief Type code of base types.
+   * We keep it uint8_t instead of DLDataTypeCode for minimal memory
+   * footprint, but the value should be one of DLDataTypeCode enum values.
+   * */
+  uint8_t code;
+  /*!
+   * \brief Number of bits, common choices are 8, 16, 32.
+   */
+  uint8_t bits;
+  /*! \brief Number of lanes in the type, used for vector types. */
+  uint16_t lanes;
+} DLDataType;
+
+/*!
+ * \brief Plain C Tensor object, does not manage memory.
+ */
+typedef struct {
+  /*!
+   * \brief The data pointer points to the allocated data. This will be CUDA
+   * device pointer or cl_mem handle in OpenCL. It may be opaque on some device
+   * types. This pointer is always aligned to 256 bytes as in CUDA. The
+   * `byte_offset` field should be used to point to the beginning of the data.
+   *
+   * Note that as of Nov 2021, multiple libraries (CuPy, PyTorch, TensorFlow,
+   * TVM, perhaps others) do not adhere to this 256 byte alignment requirement
+   * on CPU/CUDA/ROCm, and always use `byte_offset=0`.  This must be fixed
+   * (after which this note will be updated); at the moment it is recommended
+   * to not rely on the data pointer being correctly aligned.
+   *
+   * For given DLTensor, the size of memory required to store the contents of
+   * data is calculated as follows:
+   *
+   * \code{.c}
+   * static inline size_t GetDataSize(const DLTensor* t) {
+   *   size_t size = 1;
+   *   for (tvm_index_t i = 0; i < t->ndim; ++i) {
+   *     size *= t->shape[i];
+   *   }
+   *   size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
+   *   return size;
+   * }
+   * \endcode
+   */
+  void* data;
+  /*! \brief The device of the tensor */
+  DLDevice device;
+  /*! \brief Number of dimensions */
+  int32_t ndim;
+  /*! \brief The data type of the pointer*/
+  DLDataType dtype;
+  /*! \brief The shape of the tensor */
+  const int64_t* shape;
+  /*!
+   * \brief strides of the tensor (in number of elements, not bytes)
+   *  can be NULL, indicating the tensor is compact and row-major.
+   */
+  const int64_t* strides;
+  /*! \brief The offset in bytes to the beginning pointer to data */
+  uint64_t byte_offset;
+} DLTensor;
+
+/*!
+ * \brief C Tensor object, manage memory of DLTensor. This data structure is
+ *  intended to facilitate the borrowing of DLTensor by another framework. It is
+ *  not meant to transfer the tensor. When the borrowing framework doesn't need
+ *  the tensor, it should call the deleter to notify the host that the resource
+ *  is no longer needed.
+ */
+typedef struct DLManagedTensor {
+  /*! \brief DLTensor which is being memory managed */
+  DLTensor dl_tensor;
+  /*! \brief the context of the original host framework in which this
+   *   DLManagedTensor is used. It can also be NULL.
+   */
+  void * manager_ctx;
+  /*! \brief Destructor signature void (*)(void*) - this should be called
+   *   to destruct manager_ctx which holds the DLManagedTensor. It can be NULL
+   *   if there is no way for the caller to provide a reasonable destructor.
+   *   The destructor deletes the argument self as well.
+   */
+  void (*deleter)(struct DLManagedTensor * self);
+} DLManagedTensor;
+#ifdef __cplusplus
+}  // DLPACK_EXTERN_C
+#endif
+#endif  // DLPACK_DLPACK_H_
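The DLManagedTensor contract above boils down to: read through dl_tensor for as long as the data is needed, then call deleter exactly once. A hedged consumer sketch, assuming a CPU-resident float32 tensor:

#include <ATen/dlpack.h>

double read_first_element(DLManagedTensor* managed) {
  const DLTensor& t = managed->dl_tensor;
  double value = 0.0;
  // Only attempt the read for host-accessible float32 data with at least one dim.
  if (t.device.device_type == kDLCPU && t.dtype.code == kDLFloat &&
      t.dtype.bits == 32 && t.ndim >= 1) {
    const char* base = static_cast<const char*>(t.data) + t.byte_offset;
    value = *reinterpret_cast<const float*>(base);
  }
  if (managed->deleter != nullptr) {
    managed->deleter(managed);  // hand the memory back to the producer
  }
  return value;
}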
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/functorch/ADInterpreters.h b/MLPY/Lib/site-packages/torch/include/ATen/functorch/ADInterpreters.h
new file mode 100644
index 0000000000000000000000000000000000000000..fc151cedd1b05936922a94df29d771f39884e749
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/functorch/ADInterpreters.h
@@ -0,0 +1,38 @@
+#pragma once
+#include 
+
+namespace at::functorch {
+
+// These are the interpreters for our AD transforms
+// (grad, vjp and jvp).
+// See NOTE: [functorch interpreter stack] for more details.
+
+struct TORCH_API GradInterpreterPtr {
+  explicit GradInterpreterPtr(const Interpreter* base): base_(base) { TORCH_INTERNAL_ASSERT(base->key() == TransformType::Grad); }
+  TransformType key() const { return base_->key(); }
+  int64_t level() const { return base_->level(); }
+  void processImpl(const c10::OperatorHandle& op, torch::jit::Stack* stack);
+  void sendToNextInterpreterImpl(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool grad_special_case);
+  bool prevGradMode() const {
+    return std::get(base_->meta()).prevGradMode_;
+  }
+  Tensor lift(const Tensor& tensor) const;
+ private:
+  const Interpreter* base_;
+};
+
+struct TORCH_API JvpInterpreterPtr {
+  explicit JvpInterpreterPtr(const Interpreter* base): base_(base) { TORCH_INTERNAL_ASSERT(base->key() == TransformType::Jvp); }
+  TransformType key() const { return base_->key(); }
+  int64_t level() const { return base_->level(); }
+  void processImpl(const c10::OperatorHandle& op, torch::jit::Stack* stack);
+  void sendToNextInterpreterImpl(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool grad_special_case);
+  bool prevFwdGradMode() const {
+    return std::get(base_->meta()).prevFwdGradMode_;
+  }
+  Tensor lift(const Tensor& tensor) const;
+ private:
+  const Interpreter* base_;
+};
+
+} // namespace at::functorch
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/functorch/BatchRulesHelper.h b/MLPY/Lib/site-packages/torch/include/ATen/functorch/BatchRulesHelper.h
new file mode 100644
index 0000000000000000000000000000000000000000..c90dbabbe4c422767637946385700c2794e6d91b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/functorch/BatchRulesHelper.h
@@ -0,0 +1,475 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+#pragma once
+
+#include 
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+// This file contains helper functions for batching rules.
+
+namespace at::functorch {
+
+TORCH_API Tensor reshape_dim_into(int64_t src, int64_t dst, const Tensor& x);
+TORCH_API Tensor reshape_dim_outof(int64_t src, int64_t size1, const Tensor& x);
+
+TORCH_API Tensor reshape_dim_outof_symint(int64_t src, c10::SymInt size1, const Tensor& x);
+
+Tensor moveBatchDimToFront(const Tensor& tensor, optional maybe_batch_dim);
+int64_t rankWithoutBatchDim(const Tensor& tensor, optional maybe_batch_dim);
+int64_t numelWithoutBatchDim(const Tensor& tensor, optional maybe_batch_dim);
+optional valIfNonempty(optional maybe_empty, int64_t new_val);
+int64_t getPhysicalDim(const Tensor& tensor, bool has_batch_dim, int64_t logical_dim);
+VmapDimVector getPhysicalDims(const Tensor& tensor, bool has_batch_dim, IntArrayRef logical_dims);
+
+void vmapIncompatibleInplaceError(const char* schema_name);
+
+Tensor maybePadToLogicalRank(const Tensor& tensor, optional has_bdim, int64_t logical_rank);
+
+void check_randomness(RandomnessType randomness);
+void check_randomness(RandomnessType randomness, bool any_tensor_bdim);
+
+inline Tensor ensure_has_bdim(const Tensor& tensor, bool has_bdim, c10::SymInt batch_size) {
+  if (has_bdim) {
+    return tensor;
+  }
+  const auto sizes = tensor.sym_sizes();
+  SymDimVector expanded_shape;
+  expanded_shape.reserve(sizes.size());
+  expanded_shape.emplace_back(std::move(batch_size));
+  expanded_shape.insert(expanded_shape.end(), sizes.begin(), sizes.end());
+  return tensor.expand_symint(expanded_shape);
+}
+
+#define VMAP_SUPPORT(op, batch_rule) \
+  m.impl(#op, op ## _generated_plumbing);
+
+#define VMAP_SUPPORT2(op, overload, batch_rule) \
+  m.impl(#op "." #overload, op ## _ ## overload ## _generated_plumbing);
+
+#define OP_DECOMPOSE(op)  m.impl(#op, static_cast(native::op));
+#define OP_DECOMPOSE2(op, overload)  m.impl(#op"."#overload, static_cast(native::op));
+
+// DO NOT USE ME DIRECTLY! Use BASIC_UNARY_BATCH_RULE to save yourself some pain
+template 
+struct BasicUnaryBatchRuleHelper;
+
+template 
+struct BasicUnaryBatchRuleHelper> {
+  static std::tuple> apply(
+      const Tensor& tensor,
+      optional batch_dim,
+      T... extra_args) {
+    return std::make_tuple(Func(tensor, std::forward(extra_args)...), batch_dim);
+  }
+};
+
+// USAGE: BASIC_UNARY_BATCH_RULE(at::sin)
+// INCORRECT USAGE: BASIC_UNARY_BATCH_RULE(&at::sin)
+// It is important that this macro is not passed a function pointer!!
+#define BASIC_UNARY_BATCH_RULE(fn) SINGLE_ARG(\
+    BasicUnaryBatchRuleHelper<\
+      decltype(&fn),\
+      &fn,\
+      c10::guts::function_traits::parameter_types>::apply)
+
+#define UNARY_POINTWISE(op) \
+  VMAP_SUPPORT(op, BASIC_UNARY_BATCH_RULE(ATEN_FN(op)));
+
+template 
+struct VariadicBdimsBatchRuleHelper;
+
+template 
+struct VariadicBdimsBatchRuleHelper> {
+  static std::tuple> apply(
+      const Tensor& tensor,
+      optional batch_dim,
+      T... extra_args) {
+    auto tensor_ = moveBatchDimToFront(tensor, batch_dim);
+    return std::make_tuple(Func(tensor_, std::forward(extra_args)...), 0);
+  }
+};
+
+// USAGE: VARIADIC_BDIMS_BATCH_RULE(at::cholesky_inverse)
+// INCORRECT USAGE: VARIADIC_BDIMS_BATCH_RULE(&at::cholesky_inverse)
+// It is important that this macro is not passed a function pointer!!
+#define VARIADIC_BDIMS_BATCH_RULE(fn) SINGLE_ARG(\
+    VariadicBdimsBatchRuleHelper<\
+      decltype(&fn),\
+      &fn,\
+      c10::guts::function_traits::parameter_types>::apply)
+
+#define VARIADIC_BDIMS(op) \
+  VMAP_SUPPORT(op, VARIADIC_BDIMS_BATCH_RULE(ATEN_FN(op)));
+
+#define VARIADIC_BDIMS2(op, overload) \
+  VMAP_SUPPORT2(op, overload, VARIADIC_BDIMS_BATCH_RULE(ATEN_FN2(op, overload)));
+
+template
+void boxed_tensor_inputs_batch_rule(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
+  const auto& schema = op.schema();
+  const auto num_returns = schema.returns().size();
+  const auto num_arguments = schema.arguments().size();
+
+  c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched);
+  auto maybe_layer = maybeCurrentDynamicLayer();
+  vmap_check_escaped(maybe_layer, "boxed_tensor_inputs_batch_rule");
+
+  int64_t cur_level = maybe_layer->layerId();
+
+  auto orig_arguments = torch::jit::last(*stack, num_arguments);
+  if (std::none_of(orig_arguments.begin(), orig_arguments.end(), ivalueParticipatesInCurrentLevel)) {
+    op.callBoxed(stack);
+    return;
+  }
+
+  auto arguments = torch::jit::pop(*stack, num_arguments);
+  std::vector>> tensor_inputs;
+  std::vector tensor_pos;
+  for (const auto idx : c10::irange(0, num_arguments)) {
+    const auto& ivalue = arguments[idx];
+    if (ivalue.isTensor()) {
+      auto [tensor_value, tensor_bdim] = unwrapTensorAtLevel(ivalue.toTensor(), cur_level);
+      tensor_inputs.emplace_back(tensor_value, tensor_bdim);
+      tensor_pos.push_back(idx);
+    }
+  }
+  Func(tensor_inputs);
+
+  size_t tensor_idx = 0;
+  TORCH_INTERNAL_ASSERT(!tensor_pos.empty());
+  for (const auto arg_idx : c10::irange(0, num_arguments)) {
+    if (tensor_idx >= tensor_pos.size() || (int64_t)arg_idx != tensor_pos[tensor_idx]) {
+      torch::jit::push(stack, arguments[arg_idx]);
+    } else {
+      TORCH_INTERNAL_ASSERT(tensor_idx < tensor_inputs.size());
+      torch::jit::push(stack, tensor_inputs[tensor_idx].first);
+      tensor_idx++;
+    }
+  }
+
+  op.callBoxed(stack);
+  const auto returns = torch::jit::pop(*stack, num_returns);
+  for (const auto& ret : returns) {
+    if (ret.isTensor()) {
+      torch::jit::push(stack, makeBatched(ret.toTensor(), 0, cur_level));
+    } else {
+      TORCH_INTERNAL_ASSERT(false, "This boxed batching rule does not currently support ops that return non-tensor values");
+    }
+  }
+}
+
+inline void handle_pointwise_ops(std::vector>> &tensor_inputs) {
+  int64_t out_logical_rank = 0;
+  for (auto& tensor_input : tensor_inputs) {
+    int64_t cur_logical_rank = rankWithoutBatchDim(tensor_input.first, tensor_input.second);
+    out_logical_rank = std::max(out_logical_rank, cur_logical_rank);
+  }
+  for (auto& tensor_input: tensor_inputs) {
+    tensor_input.first = moveBatchDimToFront(tensor_input.first, tensor_input.second);
+    tensor_input.first = maybePadToLogicalRank(tensor_input.first, tensor_input.second, out_logical_rank);
+  }
+}
+
+#define POINTWISE_BOXED(op) \
+  m.impl(#op, torch::CppFunction::makeFromBoxedFunction>());
+
+#define POINTWISE_BOXED2(op, overload) \
+  m.impl(#op "." #overload, torch::CppFunction::makeFromBoxedFunction>());
+
+inline void handle_variadic_bdims(std::vector>> &tensor_inputs) {
+  for (auto & tensor_input : tensor_inputs) {
+    tensor_input.first = moveBatchDimToFront(tensor_input.first, tensor_input.second);
+  }
+}
+
+#define VARIADIC_BDIMS_BOXED(op) \
+  m.impl(#op, torch::CppFunction::makeFromBoxedFunction>());
+
+using UnpackedBatchedTensor = std::tuple>;
+
+inline void find_and_unpack_tensors(
+    const torch::jit::Stack* stack,
+    int64_t num_args,
+    int64_t cur_level,
+    SmallVector* tensors,
+    SmallVector* tensors_pos,
+    int64_t* batch_size) {
+
+  int64_t computed_batch_size = -1;
+  int64_t args_begin = stack->size() - num_args;
+
+  for (const auto idx : c10::irange(0, num_args)) {
+    const auto& ivalue = (*stack)[args_begin + idx];
+    if (!ivalue.isTensor()) {
+      continue;
+    }
+    auto unpacked = unwrapTensorAtLevel(ivalue.toTensor(), cur_level);
+    const auto& tensor_value = std::get<0>(unpacked);
+    const auto tensor_bdim = std::get<1>(unpacked);
+    if (tensor_bdim.has_value()) {
+      auto candidate_batch_size = tensor_value.size(*tensor_bdim);
+      if (computed_batch_size == -1) {
+        computed_batch_size = candidate_batch_size;
+      }
+      TORCH_INTERNAL_ASSERT(candidate_batch_size == computed_batch_size);
+    }
+
+    tensors->push_back(std::move(unpacked));
+    tensors_pos->push_back(idx);
+  }
+  TORCH_INTERNAL_ASSERT(computed_batch_size > -1);
+  *batch_size = computed_batch_size;
+}
+
+inline void boxed_existing_bdim_all_batch_rule(
+    const c10::OperatorHandle& op, torch::jit::Stack* stack) {
+  const auto& schema = op.schema();
+  const auto num_returns = schema.returns().size();
+  const auto num_arguments = schema.arguments().size();
+
+  c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched);
+  auto maybe_layer = maybeCurrentDynamicLayer();
+  vmap_check_escaped(maybe_layer, "boxed_existing_bdim_all_batch_rule");
+  int64_t cur_level = maybe_layer->layerId();
+
+  const auto arguments = torch::jit::last(stack, num_arguments);
+  if (std::none_of(arguments.begin(), arguments.end(), ivalueParticipatesInCurrentLevel)) {
+    op.callBoxed(stack);
+    return;
+  }
+
+  int64_t args_begin = stack->size() - num_arguments;
+  SmallVector tensor_inputs;
+  SmallVector tensor_pos;
+  int64_t batch_size;
+
+  find_and_unpack_tensors(
+      stack, num_arguments, cur_level,
+      &tensor_inputs, &tensor_pos, &batch_size);
+
+  // for each tensor, ensure it has a bdim and reshape it.
+  for (const auto tensor_idx : c10::irange(0, tensor_inputs.size())) {
+    const auto& value = std::get<0>(tensor_inputs[tensor_idx]);
+    auto bdim = std::get<1>(tensor_inputs[tensor_idx]);
+    auto value_ = ensure_has_bdim(value, bdim.has_value(), batch_size);
+    if (!bdim.has_value()) {
+      bdim = 0;
+    }
+    (*stack)[args_begin + tensor_pos[tensor_idx]] = reshape_dim_into(*bdim, 0, value_);
+  }
+
+  op.callBoxed(stack);
+
+  for (const auto idx : c10::irange(args_begin, args_begin + num_returns)) {
+    const auto& ret = (*stack)[idx];
+    TORCH_INTERNAL_ASSERT(ret.isTensor(),
+        "This boxed batching rule does not currently support ops that return non-tensor values");
+    (*stack)[idx] = makeBatched(reshape_dim_outof(0, batch_size, ret.toTensor()), 0, cur_level);
+  }
+}
+
+// Use when all tensors arguments accept one (normal) batch dim.
+// This batching rule expands the batch dim on all Tensors, reshapes it into
+// dim 0, calls the op, and then reshapes the batch dim out of dim 0.
+// This is not the most efficient thing; if there are alternatives, please try
+// to use them. Use this only as a last resort.
+#define EXISTING_BDIM_ALL_BOXED(op) \
+  m.impl(#op, torch::CppFunction::makeFromBoxedFunction());
+
+template 
+inline void boxed_all_tensors_have_optional_bdim(
+    const c10::OperatorHandle& op, torch::jit::Stack* stack) {
+  const auto& schema = op.schema();
+  const auto num_returns = schema.returns().size();
+  const auto num_arguments = schema.arguments().size();
+
+  c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched);
+  auto maybe_layer = maybeCurrentDynamicLayer();
+  vmap_check_escaped(maybe_layer, "boxed_all_tensors_have_optional_bdim");
+  int64_t cur_level = maybe_layer->layerId();
+
+  const auto arguments = torch::jit::last(stack, num_arguments);
+  if (std::none_of(arguments.begin(), arguments.end(), ivalueParticipatesInCurrentLevel)) {
+    op.callBoxed(stack);
+    return;
+  }
+
+  int64_t args_begin = stack->size() - num_arguments;
+  SmallVector tensor_inputs;
+  SmallVector tensor_pos;
+  int64_t batch_size;
+
+  find_and_unpack_tensors(
+      stack, num_arguments, cur_level,
+      &tensor_inputs, &tensor_pos, &batch_size);
+
+  optional is_no_batch_dim_case;
+
+  for (const auto tensor_idx : c10::irange(0, tensor_inputs.size())) {
+    const auto& value = std::get<0>(tensor_inputs[tensor_idx]);
+    auto bdim = std::get<1>(tensor_inputs[tensor_idx]);
+    const auto logical_rank = rankWithoutBatchDim(value, bdim);
+
+    if (!is_no_batch_dim_case.has_value()) {
+      is_no_batch_dim_case = (logical_rank == feature_rank);
+    }
+    auto value_ = ensure_has_bdim(value, bdim.has_value(), batch_size);
+    if (!bdim.has_value()) {
+      bdim = 0;
+    }
+    if (*is_no_batch_dim_case) {
+      TORCH_INTERNAL_ASSERT(logical_rank == feature_rank);
+      value_ = moveBatchDimToFront(value_, bdim);
+      if (tensor_idx == contig_tensor_index) {
+        value_ = value_.contiguous();
+      }
+      (*stack)[args_begin + tensor_pos[tensor_idx]] = std::move(value_);
+      continue;
+    }
+    TORCH_INTERNAL_ASSERT(logical_rank == feature_rank + 1);
+    value_ = reshape_dim_into(*bdim, 0, value_);
+    if (tensor_idx == contig_tensor_index) {
+      value_ = value_.contiguous();
+    }
+    (*stack)[args_begin + tensor_pos[tensor_idx]] = std::move(value_);
+  }
+
+  op.callBoxed(stack);
+
+  for (const auto idx : c10::irange(args_begin, args_begin + num_returns)) {
+    const auto& ret = (*stack)[idx];
+    TORCH_INTERNAL_ASSERT(ret.isTensor(),
+        "This boxed batching rule does not currently support ops that return non-tensor values");
+    if (*is_no_batch_dim_case) {
+      (*stack)[idx] = makeBatched(ret.toTensor(), 0, cur_level);
+    } else {
+      (*stack)[idx] = makeBatched(reshape_dim_outof(0, batch_size, ret.toTensor()), 0, cur_level);
+    }
+  }
+}
+
+// Useful for many NN operators.
+// The operator must satisfy the following:
+// - All arguments must accept an optional batch dim.
+// - All arguments must be the same rank
+#define ALL_TENSORS_HAVE_OPTIONAL_BDIM_BOXED(feature_rank, op) \
+  m.impl(#op, torch::CppFunction::makeFromBoxedFunction>());
+
+#define ALL_TENSORS_HAVE_OPTIONAL_BDIM_BOXED_CONTIG1(feature_rank, op, contig_tensor_index) \
+  m.impl(#op, \
+         torch::CppFunction::makeFromBoxedFunction<\
+             boxed_all_tensors_have_optional_bdim<\
+                 feature_rank, \
+                 contig_tensor_index>\
+             >());
+
+template 
+struct ExistingBdimBatchRuleHelper;
+
+template 
+struct ExistingBdimBatchRuleHelper> {
+  static std::tuple> apply(
+      const Tensor& self,
+      optional self_bdim,
+      T... extra_args) {
+    auto self_ = reshape_dim_into(*self_bdim, 0, self);
+    auto out = Func(self_, std::forward(extra_args)...);
+    return std::make_tuple(reshape_dim_outof_symint(0, self.sym_sizes()[*self_bdim], out), 0);
+  }
+};
+
+// USAGE: EXISTING_BDIM_BATCH_RULE(at::cholesky_inverse)
+// INCORRECT USAGE: EXISTING_BDIM_BATCH_RULE(&at::cholesky_inverse)
+// It is important that this macro is not passed a function pointer!!
+#define EXISTING_BDIM_BATCH_RULE(fn) SINGLE_ARG(\
+    ExistingBdimBatchRuleHelper<\
+      decltype(&fn),\
+      &fn,\
+      c10::guts::function_traits::parameter_types>::apply)
+
+
+#define EXISTING_BDIM(op) \
+  VMAP_SUPPORT(op, EXISTING_BDIM_BATCH_RULE(ATEN_FN(op)));
+
+#define EXISTING_BDIM2(op, overload) \
+  VMAP_SUPPORT2(op, overload, EXISTING_BDIM_BATCH_RULE(ATEN_FN2(op, overload)));
+
+#define INVOKE(object,ptrToMember)  ((object).*(ptrToMember))
+
+
+template 
+Tensor& unary_inplace_batch_rule(Tensor& self, optional, ExtraArgs... extra_args) {
+  INVOKE(self, Method)(std::forward(extra_args)...);
+  return self;
+}
+
+inline int64_t get_bdim_size4(
+    const Tensor& a_value, optional a_bdim,
+    const Tensor& b_value, optional b_bdim,
+    const Tensor& c_value, optional c_bdim,
+    const Tensor& d_value, optional d_bdim) {
+  if (a_bdim)
+    return a_value.size(*a_bdim);
+  if (b_bdim)
+    return b_value.size(*b_bdim);
+  if (c_bdim)
+    return c_value.size(*c_bdim);
+  if (d_bdim)
+    return d_value.size(*d_bdim);
+  TORCH_INTERNAL_ASSERT(false);
+}
+
+inline int64_t get_bdim_size3(
+    const Tensor& a_value, optional a_bdim,
+    const Tensor& b_value, optional b_bdim,
+    const Tensor& c_value, optional c_bdim) {
+  if (a_bdim)
+    return a_value.size(*a_bdim);
+  if (b_bdim)
+    return b_value.size(*b_bdim);
+  if (c_bdim)
+    return c_value.size(*c_bdim);
+  TORCH_INTERNAL_ASSERT(false);
+}
+
+inline int64_t get_bdim_size2(
+    const Tensor& a_value, optional a_bdim,
+    const Tensor& b_value, optional b_bdim) {
+  if (a_bdim)
+    return a_value.size(*a_bdim);
+  if (b_bdim)
+    return b_value.size(*b_bdim);
+  TORCH_INTERNAL_ASSERT(false);
+}
+
+// [start, start + 1, ..., stop - 1]
+inline VmapDimVector range(int64_t start, int64_t stop) {
+  TORCH_INTERNAL_ASSERT(stop >= start);
+  VmapDimVector dims;
+  dims.reserve(stop - start);
+  for (int64_t i = start; i < stop; i++) {
+    dims.emplace_back(i);
+  }
+  return dims;
+}
+std::tuple _binary_pointwise_helper(
+    const Tensor& tensor, optional tensor_batch_dim, const Tensor& other, optional other_batch_dim,
+    bool do_type_promotion=true);
+
+} // namespace at::functorch
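For orientation, a batch rule in the style these helpers support takes each tensor together with its optional batch-dim position and returns the output plus the output's batch-dim position. A hedged, hand-written sketch in the spirit of BASIC_UNARY_BATCH_RULE (the function itself is hypothetical; real rules are wired up through VMAP_SUPPORT):

#include <ATen/functorch/BatchRulesHelper.h>

namespace at::functorch {

// relu is pointwise, so the batch dimension (if any) stays where it was.
static std::tuple<Tensor, optional<int64_t>> relu_batch_rule_sketch(
    const Tensor& self, optional<int64_t> self_bdim) {
  return std::make_tuple(at::relu(self), self_bdim);
}

} // namespace at::functorch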
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/functorch/BatchedFallback.h b/MLPY/Lib/site-packages/torch/include/ATen/functorch/BatchedFallback.h
new file mode 100644
index 0000000000000000000000000000000000000000..ab4fbc662aa3e0f28bc4e15432e56377a471a196
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/functorch/BatchedFallback.h
@@ -0,0 +1,81 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+#include 
+#include 
+#include 
+
+namespace at::functorch {
+
+// This file contains code for the vmap fallback (also known as the
+// BatchedTensor fallback or the Batched fallback). This code runs
+// when an operation doesn't have a batching rule implemented.
+
+// If an operator doesn't have a batching rule implemented then we fallback
+// to this implementation. The fallback doesn't work on out= variants or
+// view operations; that is, it works for out-of-place operations and
+// in-place non-view operations.
+//
+// For out-of-place operations, the fallback effectively takes all of the
+// BatchedTensors in `stack`, slices them, and runs `op` on all of the
+// corresponding slices to produce slices of the outputs. The output slices
+// then get `torch.stack`ed to create the
+// final returns.
+//
+// The performance of the fallback is not very good because it introduces an
+// extra copy from stacking the sliced outputs. Because of this, we prefer to
+// write batching rules for operators whenever possible.
+void batchedTensorForLoopFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack);
+void batchedNestedTensorForLoopFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack);
+
+void vmapErrorFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack);
+
+// The vmap fallback emits a warning by default, but it may be disabled if
+// the user finds it to be too annoying.
+TORCH_API bool isVmapFallbackWarningEnabled();
+TORCH_API void setVmapFallbackWarningEnabled(bool enabled);
+
+// Used for testing. The vmap fallback is enabled by default. When it is disabled,
+// it raises an error.
+TORCH_API bool isVmapFallbackEnabled();
+TORCH_API void setVmapFallbackEnabled(bool enabled);
+
+template <typename A> A vector_to_result(const std::vector<IValue>& buffer) {
+  return buffer[0].to<A>();
+}
+template <typename A, typename B> std::tuple<A, B> vector_to_result(const std::vector<IValue>& buffer) {
+  return std::make_tuple(buffer[0].to<A>(), buffer[1].to<B>());
+}
+template <typename A, typename B, typename C> std::tuple<A, B, C> vector_to_result(const std::vector<IValue>& buffer) {
+  return std::make_tuple(buffer[0].to<A>(), buffer[1].to<B>(), buffer[2].to<C>());
+}
+
+// slow_fallback is a way to call the vmap fallback inside some boxed kernel.
+// There is probably some better way to metaprogram this.
+template <typename Ret>
+Ret slow_fallback(const c10::OperatorHandle& op, ArrayRef<IValue> args) {
+  std::vector<IValue> stack(args.begin(), args.end());
+  batchedTensorForLoopFallback(op, &stack);
+  return vector_to_result<Ret>(stack);
+}
+
+template <typename A, typename B>
+std::tuple<A, B> slow_fallback(const c10::OperatorHandle& op, ArrayRef<IValue> args) {
+  std::vector<IValue> stack(args.begin(), args.end());
+  batchedTensorForLoopFallback(op, &stack);
+  return vector_to_result<A, B>(stack);
+}
+
+template <typename A, typename B, typename C>
+std::tuple<A, B, C> slow_fallback(const c10::OperatorHandle& op, ArrayRef<IValue> args) {
+  std::vector<IValue> stack(args.begin(), args.end());
+  batchedTensorForLoopFallback(op, &stack);
+  return vector_to_result<A, B, C>(stack);
+}
+
+
+} // namespace at::functorch
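
To make the slice / apply / stack strategy described in the comments above concrete, here is a minimal standalone sketch in plain C++. It uses std::vector as a stand-in for batched tensors; Slice, Batched, and forLoopFallback are illustrative names and not part of the ATen implementation.

#include <functional>
#include <vector>

// A stand-in for a batched input: outer index = batch slice, inner = data.
using Slice = std::vector<double>;
using Batched = std::vector<Slice>;

// Toy "fallback": run `op` once per batch slice and re-stack the results.
// The real fallback does the analogous thing with slicing and torch.stack on
// BatchedTensors, which is why it pays an extra copy for the stack step.
Batched forLoopFallback(const Batched& input,
                        const std::function<Slice(const Slice&)>& op) {
  Batched stacked_outputs;
  stacked_outputs.reserve(input.size());
  for (const Slice& slice : input) {
    stacked_outputs.push_back(op(slice));  // one kernel invocation per slice
  }
  return stacked_outputs;                  // the "torch.stack" step
}

The per-slice loop plus the final re-stack is exactly where the extra copy, and therefore the performance cost noted above, comes from.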
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/functorch/BatchedTensorImpl.h b/MLPY/Lib/site-packages/torch/include/ATen/functorch/BatchedTensorImpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..1982b94833e03dff8e15bf1f4ddddffd95260981
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/functorch/BatchedTensorImpl.h
@@ -0,0 +1,170 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+
+namespace at::functorch {
+
+using Tensor = at::Tensor;
+
+// We assume this in a few other places in the codebase,
+// but there isn't a centralized definition.
+constexpr int64_t kVmapMaxTensorDims = 64;
+
+// The valid vmap levels range from [0, 64). This effectively means that we
+// support a maximum of 64 nested vmaps.
+constexpr int64_t kVmapNumLevels = 64;
+
+// Store this number of elements of BatchDims on the stack. Most people will
+// probably use <= 5 nested vmaps, but adjust this number as necessary.
+constexpr int64_t kBatchDimsStackSize = 5;
+
+// A BatchedTensorImpl holds an underlying Tensor and a single batch dim
+// NB: We use the term "BatchedTensor" to mean a Tensor that is backed with a
+// BatchedTensorImpl.
+//
+// The batch dimensions are treated as being "private"; they are not user-visible.
+// For example, in the following Tensor,
+//    bt = BatchedTensorImpl(ones(2, 3, 5, 7), lvl=1, dim=0)
+// dimension 0 is the batch dimension.
+//
+// bt.sizes() returns (5, 7); bt.sum(0) performs a reduction over the (public)
+// dim 0, which is equivalent to dim 3 in the underlying ones(2, 3, 5, 7) tensor.
+struct TORCH_API BatchedTensorImpl : public c10::TensorImpl {
+  explicit BatchedTensorImpl(at::DispatchKeySet key_set, Tensor value, int64_t dim, int64_t level);
+
+  // Returns batch dimension of this tensor
+  int64_t bdim() const { return bdim_; }
+
+  // Returns the vmap level of this tensor
+  int64_t level() const { return level_; }
+
+  // BatchedTensorImpl wraps a Tensor
+  const Tensor& value() const { return value_; }
+
+  // Given a public dimension index, return the dimension index in the underlying
+  // value() tensor.
+  // For example, if we have
+  //    bt = BatchedTensorImpl(ones(2, 3, 5, 7), lvl=1, dim=0)
+  // bt.actualDim(0) -> 1
+  // bt.actualDim(1) -> 2
+  // bt.actualDim(2) -> 3
+  // bt.actualDim(3) -> Error
+  int64_t actualDim(int64_t dim, bool wrap_dim = true) const;
+
+  IntArrayRef sizes_custom() const override;
+  SymIntArrayRef sym_sizes_custom() const override;
+  int64_t size_custom(int64_t d) const override;
+  c10::SymInt sym_size_custom(int64_t d) const override;
+  // We have to override this because we opted into CustomStrides
+  IntArrayRef strides_custom() const override;
+  SymIntArrayRef sym_strides_custom() const override;
+  // Override a bunch of methods inherited from TensorImpl to return error messages.
+  bool is_contiguous_custom(at::MemoryFormat memory_format=at::MemoryFormat::Contiguous) const override;
+  void set_size(int64_t dim, int64_t new_size) override;
+  void set_stride(int64_t dim, int64_t new_stride) override;
+  c10::intrusive_ptr shallow_copy_and_detach(
+    const c10::VariableVersion& version_counter,
+    bool allow_tensor_metadata_change) const override;
+  c10::intrusive_ptr shallow_copy_and_detach(
+      c10::VariableVersion&& version_counter,
+      bool allow_tensor_metadata_change) const override;
+  void shallow_copy_from(const c10::intrusive_ptr& impl) override;
+#ifdef DEBUG
+  bool has_storage() const override;
+#endif
+
+  void refreshTensorMetadata();
+
+  // Used in torchdim. torchdim uses non-lexical BatchedTensor; the way it
+  // accomplishes this is a hack where it is able to modify the levels of
+  // BatchedTensor to match the level of the current vmap transform.
+  void _unsafe_set_level(int64_t level) {
+    level_ = level;
+  }
+
+  // Used in batching rule for in-place view operations that can change
+  // the index of the bdim (think squeeze_, unsqueeze_)
+  void unsafe_set_bdim(int64_t bdim) {
+    // NB: you MUST call refreshTensorMetadata after doing this.
+    bdim_ = bdim;
+  }
+ private:
+  // see NOTE: [BatchedTensorImpl levels invariant]
+  void checkInvariants() const;
+  const char* tensorimpl_type_name() const override;
+
+  Tensor value_;
+
+  int64_t level_;
+  int64_t bdim_;
+};
+
+// NB: We use the term "BatchedTensor" to mean a Tensor that is backed with a
+// BatchedTensorImpl.
+inline bool isBatchedTensor(const Tensor& tensor) {
+  return tensor.unsafeGetTensorImpl()->key_set().has(DispatchKey::FuncTorchBatched) ||
+      tensor.unsafeGetTensorImpl()->key_set().has(DispatchKey::BatchedNestedTensor);
+}
+
+// It is unsafe to call this on a Tensor that is not backed by a
+// BatchedTensorImpl. Please use `maybeGetBatchedImpl` whenever possible.
+inline BatchedTensorImpl* unsafeGetBatchedImpl(Tensor tensor) {
+  return static_cast<BatchedTensorImpl*>(tensor.unsafeGetTensorImpl());
+}
+
+inline BatchedTensorImpl* maybeGetBatchedImpl(Tensor tensor) {
+  if (!isBatchedTensor(tensor)) {
+    return nullptr;
+  }
+  return unsafeGetBatchedImpl(std::move(tensor));
+}
+
+// Returns a bitset. If bit i is set, then that means dim i is a batchdim.
+inline std::bitset<kVmapMaxTensorDims> createBatchDimBitset(int64_t dim) {
+  std::bitset<kVmapMaxTensorDims> is_bdim;
+  is_bdim.set(dim);
+  return is_bdim;
+}
+
+// Creates a bitset for the given level
+inline std::bitset<kVmapNumLevels> createVmapLevelsBitset(int64_t level) {
+  std::bitset<kVmapNumLevels> result;
+  result.set(level);
+  return result;
+}
+
+// Use this to construct a BatchedTensor from a regular Tensor
+TORCH_API Tensor makeBatched(const Tensor& tensor, int64_t dim, int64_t level);
+
+// Adds a batch dim to `tensor`, returning a BatchedTensor
+TORCH_API Tensor addBatchDim(const Tensor& tensor, int64_t dim, int64_t level);
+
+// Certain dispatch keys must be propagated to the BatchedTensor (or, in general,
+// any wrapper Tensor subclasses). This is because there are methods on Tensor
+// that skip dispatch and check for the presence of a dispatch key (e.g. is_cpu()).
+// TODO: should probably contain more (or all?) backend keys
+constexpr DispatchKeySet kKeysToPropagateToWrapper({
+  DispatchKey::Negative,
+  DispatchKey::Conjugate,
+  DispatchKey::XLA,
+  DispatchKey::CUDA,
+  DispatchKey::CPU,
+});
+
+inline DispatchKeySet getKeysToPropagateToWrapper(const Tensor& tensor, DispatchKeySet to_propagate=kKeysToPropagateToWrapper) {
+  auto key_set = tensor.unsafeGetTensorImpl()->key_set();
+  return key_set & to_propagate;
+}
+
+} // namespace at::functorch
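
As a rough illustration of the public-to-physical dimension mapping that actualDim() performs on a tensor with one private batch dimension, here is a hypothetical free function (toPhysicalDim is an assumed name, not the member used by BatchedTensorImpl):

#include <cstdint>
#include <stdexcept>

// Map a user-visible (public) dim to the dim in the underlying tensor,
// assuming exactly one private batch dimension at index `bdim`.
// e.g. bdim=0: public dims 0/1/2 map to physical dims 1/2/3, matching the
// actualDim() example in the header comment.
int64_t toPhysicalDim(int64_t public_dim, int64_t public_ndim, int64_t bdim) {
  if (public_dim < 0) public_dim += public_ndim;          // dim wrapping
  if (public_dim < 0 || public_dim >= public_ndim) {
    throw std::out_of_range("dim out of range");
  }
  return public_dim < bdim ? public_dim : public_dim + 1; // skip over the bdim
}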
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/functorch/BatchingMetaprogramming.h b/MLPY/Lib/site-packages/torch/include/ATen/functorch/BatchingMetaprogramming.h
new file mode 100644
index 0000000000000000000000000000000000000000..608402801abc07565e370bff52475cc7ec7f6871
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/functorch/BatchingMetaprogramming.h
@@ -0,0 +1,126 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+#include 
+#include 
+
+// This file contains template metaprogramming things that are used for our
+// batching rules.
+//
+// See NOTE: [vmap plumbing] for more details on why this is necessary.
+// The plumbing has a bunch of metaprogramming hacks for determining the signature
+// of a batching rule from the signature of the operator, many of which use the
+// helper functions in this file.
+
+namespace at::functorch {
+
+// Metaprogramming things
+template  using typelist = c10::guts::typelist::typelist;
+template  using head_t = c10::guts::typelist::head_t;
+template  using concat_t = c10::guts::typelist::concat_t;
+template  class debug_t;
+
+// tail operation
+template
+struct tail final {
+    static_assert(c10::guts::false_t::value,
+                  "In typelist::tail, the T argument must be typelist<...>.");
+};
+template
+struct tail> final {
+  using type = typelist;
+};
+template using tail_t = typename tail::type;
+
+template 
+struct IfFirstIsTensorAndSecondisBatchDimThenTailElseNext {
+  using type = Next;
+};
+template 
+struct IfFirstIsTensorAndSecondisBatchDimThenTailElseNext, Next, Tail> {
+  using type = Tail;
+};
+template 
+struct IfFirstIsTensorAndSecondisBatchDimThenTailElseNext, Next, Tail> {
+  using type = Tail;
+};
+template 
+struct IfFirstIsTensorAndSecondisBatchDimThenTailElseNext, Next, Tail> {
+  using type = Tail;
+};
+template 
+struct IfFirstIsTensorAndSecondisBatchDimThenTailElseNext, optional, Next, Tail> {
+  using type = Tail;
+};
+template 
+struct IfFirstIsTensorAndSecondisBatchDimThenTailElseNext&, optional, Next, Tail> {
+  using type = Tail;
+};
+template 
+struct IfFirstIsTensorAndSecondisBatchDimThenTailElseNext&, optional, Next, Tail> {
+  using type = Tail;
+};
+template 
+struct IfFirstIsTensorAndSecondisBatchDimThenTailElseNext, optional, Next, Tail> {
+  using type = Tail;
+};
+template  struct RemoveBatchDimAfterTensor {
+  using first = head_t;
+  using next = tail_t;
+  using second = head_t;
+  using tail = tail_t;
+
+  using type = concat_t<
+    typelist,
+    typename RemoveBatchDimAfterTensor<
+      typename IfFirstIsTensorAndSecondisBatchDimThenTailElseNext::type
+    >::type
+  >;
+};
+template  struct RemoveBatchDimAfterTensor> {
+  using type = typelist;
+};
+template <> struct RemoveBatchDimAfterTensor> {
+  using type = typelist<>;
+};
+template using remove_batch_dim_after_tensor_t = typename RemoveBatchDimAfterTensor::type;
+
+template  struct UnpackSingleItemTuple {
+  using type = T;
+};
+template  struct UnpackSingleItemTuple> {
+  using type = T;
+};
+template  using unpack_single_item_tuple_t = typename UnpackSingleItemTuple::type;
+
+template  struct BuildFunctionHelper;
+template  struct BuildFunctionHelper> {
+  using type = Return(Args...);
+};
+template 
+struct BuildFunction {
+  using type = typename BuildFunctionHelper>::type;
+};
+template  using build_function_t = typename BuildFunction::type;
+
+
+template  struct ToOperatorType {
+  using batch_rule_return_type = typename c10::guts::function_traits::return_type;
+  using batch_rule_parameter_types = typename c10::guts::function_traits::parameter_types;
+
+  using operator_parameter_types = remove_batch_dim_after_tensor_t;
+  using operator_return_type =
+    unpack_single_item_tuple_t<
+      c10::guts::typelist::to_tuple_t<
+        remove_batch_dim_after_tensor_t<
+          c10::guts::typelist::from_tuple_t>>>;
+
+  using type = build_function_t;
+};
+template  using to_operator_t = typename ToOperatorType::type;
+
+} // namespace at::functorch
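
The RemoveBatchDimAfterTensor machinery is easier to follow on a toy, self-contained typelist built from std::tuple. The sketch below uses placeholder TensorArg/BatchDimArg types and is not the c10::guts implementation, only the same drop-the-dim-that-follows-a-tensor idea used to recover an operator signature from a batching-rule signature.

#include <tuple>
#include <type_traits>

struct TensorArg {};    // stands in for Tensor
struct BatchDimArg {};  // stands in for optional<int64_t> bdim

// Prepend T to a std::tuple of types.
template <typename T, typename Tuple> struct Cons;
template <typename T, typename... Us> struct Cons<T, std::tuple<Us...>> {
  using type = std::tuple<T, Us...>;
};

// Drop every BatchDimArg that immediately follows a TensorArg.
template <typename... Ts> struct Strip;
template <> struct Strip<> { using type = std::tuple<>; };
template <typename T> struct Strip<T> { using type = std::tuple<T>; };
template <typename A, typename B, typename... Rest>
struct Strip<A, B, Rest...> {
  using tail = std::conditional_t<
      std::is_same_v<A, TensorArg> && std::is_same_v<B, BatchDimArg>,
      typename Strip<Rest...>::type,      // skip B
      typename Strip<B, Rest...>::type>;  // keep scanning from B
  using type = typename Cons<A, tail>::type;
};

// (TensorArg, BatchDimArg, double) -> (TensorArg, double), mirroring how a
// batching-rule parameter list maps back to the operator parameter list.
static_assert(std::is_same_v<
    Strip<TensorArg, BatchDimArg, double>::type,
    std::tuple<TensorArg, double>>);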
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/functorch/DynamicLayer.h b/MLPY/Lib/site-packages/torch/include/ATen/functorch/DynamicLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..cede226d7945bb7dc3c13311cb74e1d7c5869613
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/functorch/DynamicLayer.h
@@ -0,0 +1,124 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+// Forward declared
+namespace c10 { struct AutogradMetaInterface; }
+
+namespace at::functorch  {
+
+// This file contains the implementation of functorch's interpreter stack.
+// See NOTE: [functorch interpreter stack] first before reading on.
+//
+// NB: the functorch interpreter stack is also referred to as:
+// - the "dynamic layer stack" -- an older name for "interpreter" was
+//   "dynamic layer".
+// - the "functorch mode stack". You can think of each functorch transform as a
+//   "mode" (in the same sense as torch_dispatch mode or torch_function mode),
+//   and functorch being an implementation of a "mode stack" where the modes
+//   may be arbitrarily composed.
+
+// DynamicLayer is basically the same thing as an Interpreter.
+// It represents a functorch transform and it holds an Interpreter,
+// which contains metadata related to the transform and instructions on
+// how to perform the transform.
+//
+// TODO: we can excise DynamicLayer in favor of Interpreter,
+// but I am going to leave it for now as a compatibility shim to avoid
+// needing to refactor a lot of callsites...
+struct TORCH_API DynamicLayer {
+  explicit DynamicLayer(
+      TransformType transform_type,
+      int64_t layerId,
+      optional<c10::SymInt> batchSize = nullopt,
+      optional<RandomnessType> randomness = nullopt,
+      optional<bool> prev_grad_mode = nullopt,
+      optional<bool> pre_fwd_grad_mode = nullopt,
+      optional<bool> functionalize_add_back_views = nullopt);
+
+  TransformType key() const;
+  int64_t layerId() const;
+
+  const Interpreter& interpreter() const { return interpreter_; }
+  Interpreter& interpreter() { return interpreter_; }
+
+  // Only valid for vmap
+  c10::SymInt batchSize() const;
+  RandomnessType randomness() const;
+
+ private:
+  Interpreter interpreter_;
+};
+
+TORCH_API int64_t initAndPushDynamicLayer(
+    TransformType transform_type,
+    optional<c10::SymInt> batch_size = nullopt,
+    optional<RandomnessType> randomness = nullopt,
+    optional<bool> prev_grad_mode = nullopt,
+    optional<bool> prev_fwd_grad_mode = nullopt,
+    optional<bool> functionalize_add_back_views = nullopt);
+TORCH_API DynamicLayer popDynamicLayerAndDeleteMetadata();
+TORCH_API c10::optional<DynamicLayer> maybeCurrentDynamicLayer();
+TORCH_API const std::vector<DynamicLayer>& getDynamicLayerStack();
+TORCH_API void setDynamicLayerStack(const std::vector<DynamicLayer>& stack);
+TORCH_API void setDynamicLayerFrontBackKeysIncluded(bool included);
+
+// NOTE: [Life handles and lexically scoped transforms]
+// functorch transforms are lexically scoped.
+// Given a level, we store a "life handle" that is a boolean that tells us if the
+// transform with that level is active or not.
+//
+// functorch's TensorWrapper (for grad transforms) stores a life handle.
+// If a TensorWrapper escapes from the scope of the transform, then somehow
+// it must know it escaped; it can tell by querying the life handle.
+TORCH_API const std::shared_ptr<bool>& getLifeHandleForLevel(int64_t level);
+
+// Returns true if an operator is in-place. An operator is in-place if:
+// 1. The first argument is a Tensor and it is being written to
+// 2. The first argument is being returned
+// 3. No other arguments are aliased
+// Here is an example of an in-place operator:
+// add_(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
+TORCH_API bool isInplaceOp(const c10::FunctionSchema& schema);
+
+// Given the indices of unwrapped inputs and the schema, this returns the indices of any outputs that should remain unwrapped
+TORCH_API c10::optional findAliasedOutput(const FunctionSchema& schema, const int64_t immutable_input);
+
+TORCH_API Tensor unwrapIfDead(const Tensor& tensor);
+TORCH_API bool isDeadTensorWrapper(const Tensor& tensor);
+
+// Pretty printers
+TORCH_API std::ostream& operator<<(std::ostream& os, const DynamicLayer& layer);
+TORCH_API std::ostream& operator<<(std::ostream& os, const std::vector& dynamicLayerStack);
+
+// While a functorch transform is active, torch.autograd.function._SingleLevelFunction
+// is disabled by default. The following two APIs are APIs for enabling
+// it. These are not user-facing APIs. We can delete this in the future, but
+// it is useful for debugging when something goes wrong with the
+// autograd.Function <> functorch interaction, which uses _SingleLevelFunction,
+// because it leads to loud errors if something is incorrect.
+TORCH_API void setSingleLevelAutogradFunctionAllowed(bool allowed);
+TORCH_API bool getSingleLevelAutogradFunctionAllowed();
+
+// While a functorch grad transform is active, Tensor.requires_grad_() gets
+// disabled. These two functions are the mechanism to controlling that.
+TORCH_API void setInplaceRequiresGradAllowed(bool allowed);
+TORCH_API bool getInplaceRequiresGradAllowed();
+
+TORCH_API DynamicLayer popDynamicLayer();
+TORCH_API int64_t pushDynamicLayer(DynamicLayer&& layer);
+
+} // namespace at::functorch
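
The interpreter stack plus the life-handle note above can be summarized with a small self-contained sketch. ToyLayer, pushToyLayer, and popToyLayer are invented names for illustration and do not mirror the real DynamicLayer API; the point is only the push/pop discipline and the shared boolean that outlives the layer.

#include <cstdint>
#include <memory>
#include <vector>

// Toy "dynamic layer": a level plus a life handle.
struct ToyLayer {
  int64_t level;
  std::shared_ptr<bool> alive = std::make_shared<bool>(true);
};

std::vector<ToyLayer>& toyStack() {
  static std::vector<ToyLayer> stack;
  return stack;
}

int64_t pushToyLayer() {
  toyStack().push_back(ToyLayer{static_cast<int64_t>(toyStack().size()) + 1});
  return toyStack().back().level;
}

// Popping marks the layer dead; wrappers that captured the handle can tell
// they escaped the transform's scope by checking *alive afterwards.
std::shared_ptr<bool> popToyLayer() {
  auto handle = toyStack().back().alive;
  *handle = false;
  toyStack().pop_back();
  return handle;
}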
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/functorch/FunctionalizeInterpreter.h b/MLPY/Lib/site-packages/torch/include/ATen/functorch/FunctionalizeInterpreter.h
new file mode 100644
index 0000000000000000000000000000000000000000..5ae0bcdccdf5fc5c061b542d175f677eace9a4c2
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/functorch/FunctionalizeInterpreter.h
@@ -0,0 +1,22 @@
+#pragma once
+#include <ATen/functorch/Interpreter.h>
+
+namespace at::functorch {
+
+// This is the interpreter that handles the functionalize() transform.
+// See NOTE: [functorch interpreter stack] for more details.
+
+struct FunctionalizeInterpreterPtr {
+  explicit FunctionalizeInterpreterPtr(const Interpreter* base): base_(base) { TORCH_INTERNAL_ASSERT(base->key() == TransformType::Functionalize); }
+  TransformType key() const { return base_->key(); }
+  int64_t level() const { return base_->level(); }
+  void processImpl(const c10::OperatorHandle& op, torch::jit::Stack* stack);
+  void sendToNextInterpreterImpl(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool grad_special_case);
+  bool functionalizeAddBackViews() const {
+    return std::get<FunctionalizeInterpreterMeta>(base_->meta()).functionalizeAddBackViews_;
+  }
+ private:
+  const Interpreter* base_;
+};
+
+} // namespace at::functorch
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/functorch/Interpreter.h b/MLPY/Lib/site-packages/torch/include/ATen/functorch/Interpreter.h
new file mode 100644
index 0000000000000000000000000000000000000000..ba44a44676cab2864781d85166f5cc780f4c23a4
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/functorch/Interpreter.h
@@ -0,0 +1,208 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace at::functorch {
+
+// NOTE: [functorch interpreter stack]
+//
+// functorch's dispatching system uses a stack of interpreters.
+// Historically we've referred to this as the "DynamicLayerStack".
+//
+// An interpreter is something that reads in the code it is passed
+// and then executes it. We have a different interpreter per-transform:
+// the "VmapInterpreter" is responsible for reading in operators (like aten::mv)
+// and executing the batched version of it (the batching rule for aten::mv).
+//
+// Concretely, each interpreter is responsible for two things:
+//
+// 1) process(ophandle, stack)
+// Given an operator handle and a stack of arguments, the interpreter is
+// responsible for figuring out how to execute the operation under the semantics
+// of the interpreter. For example, for the VmapInterpreter this means figuring
+// out how to call the batching rule.
+//
+// The batching rules are stored as kernels on the FuncTorchBatched key, so the way
+// VmapInterpreter calls the batching rule is roughly: (A) exclude all
+// dispatch keys aside from the Batched key, (B) redispatch so we get to the
+// Batched key.
+//
+// 2) sendToNextInterpreter(ophandle, stack)
+// The VmapInterpreter, when it sees aten::mv, will process it into a call to
+// aten::mm. It then needs to send the call to aten::mm to the next interpreter
+// in the interpreter stack.
+//
+// The VmapInterpreter just does this via a call to ophandle.callBoxed(stack)
+// and most Interpreters will implement it this way.
+
+enum class RandomnessType {
+    Error,      // always errors when calling a random function
+    Same,       // randomness appears the same across batches
+    Different,  // randomness appears different across batches
+    END
+};
+
+enum class TransformType {
+  Torch,  // Unused
+  Vmap,
+  Grad,  // reverse-mode AD, aka vjp
+  Jvp,  // forward-mode AD
+  Functionalize,
+};
+
+std::ostream& operator<<(std::ostream& os, const TransformType& t);
+
+// NOTE: [Interpreter "subclassing" design]
+//
+// How are various Interpreters for different transforms (vmap, grad, ...)
+// implemented?
+//
+// Accessing interpreters is in the hot-path of functorch so we have a constraint
+// that this code must be as fast as possible.
+//
+// As a result, we stay away from virtual methods and this causes our code
+// to look a little funny.
+//
+// `Interpreter` is the struct for Interpreters. It holds ALL of the
+// relevant information (what type of interpreter it is and the metadata).
+// Metadata for each interpreter is represented as a Union (std::variant)
+// of all possible metadata (VmapInterpreterMeta, GradInterpreterMeta, ...).
+//
+// Given an Interpreter, how do I get a "VmapInterpreter"? You may wish to do this
+// if you want to access the metadata fields (like batchSize and randomness).
+//
+// Each type of interpreter (e.g. Vmap) has a convenience struct
+// (e.g. VmapInterpreterPtr) associated with it.
+//
+// Construct the convenience struct with VmapInterpreterPtr(Interpreter*),
+// and then one can access methods on VmapInterpreterPtr like so:
+// >>> VmapInterpreterPtr(&interpreter).batchSize()
+//
+// Finally, Interpreter::process switches on the type of the interpreter
+// and calls one of {Transform}Interpreter::processImpl under the hood.
+// Same for Interpreter::sendToNextInterpreter :)
+
+struct VmapInterpreterMeta {
+  explicit VmapInterpreterMeta(c10::SymInt batchSize, RandomnessType randomness) :
+    batchSize_(std::move(batchSize)), randomness_(randomness) {}
+  c10::SymInt batchSize_;
+  RandomnessType randomness_;
+};
+
+struct GradInterpreterMeta {
+  explicit GradInterpreterMeta(bool prevGradMode): prevGradMode_(prevGradMode) {}
+  bool prevGradMode_;
+};
+
+struct JvpInterpreterMeta {
+  explicit JvpInterpreterMeta(bool prevFwdGradMode) : prevFwdGradMode_(prevFwdGradMode) {}
+  bool prevFwdGradMode_;
+};
+
+struct FunctionalizeInterpreterMeta {
+  explicit FunctionalizeInterpreterMeta(bool functionalizeAddBackViews) :
+    functionalizeAddBackViews_(functionalizeAddBackViews) {}
+  bool functionalizeAddBackViews_;
+};
+
+typedef std::variant<
+  int64_t,
+  GradInterpreterMeta,
+  JvpInterpreterMeta,
+  VmapInterpreterMeta,
+  FunctionalizeInterpreterMeta
+> InterpreterMeta;
+
+
+struct Interpreter {
+  // factory functions
+  static Interpreter Vmap(int64_t level, c10::SymInt batchSize, RandomnessType randomness) {
+    return Interpreter(TransformType::Vmap, level, VmapInterpreterMeta(std::move(batchSize), randomness));
+  }
+  static Interpreter Grad(int64_t level, bool prevGradMode) {
+    return Interpreter(TransformType::Grad, level, GradInterpreterMeta(prevGradMode));
+  }
+  static Interpreter Jvp(int64_t level, bool prevFwdGradMode) {
+    return Interpreter(TransformType::Jvp, level, JvpInterpreterMeta(prevFwdGradMode));
+  }
+  static Interpreter Functionalize(int64_t level, bool functionalizeAddBackViews) {
+    return Interpreter(TransformType::Functionalize, level, FunctionalizeInterpreterMeta(functionalizeAddBackViews));
+  }
+
+  // methods
+  TransformType key() const { return type_; }
+  int64_t level() const { return level_; }
+  const InterpreterMeta& meta() const { return meta_; }
+
+  void process(const c10::OperatorHandle& op, torch::jit::Stack* stack);
+  void sendToNextInterpreter(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool grad_special_case);
+
+  void saveLocalDispatchKeySet(c10::impl::LocalDispatchKeySet keyset) {
+    TORCH_INTERNAL_ASSERT(!savedLocalDispatchKeySet_.has_value());
+    savedLocalDispatchKeySet_ = std::move(keyset);
+  }
+  void clearSavedLocalDispatchKeySet() {
+    TORCH_INTERNAL_ASSERT(savedLocalDispatchKeySet_.has_value());
+    savedLocalDispatchKeySet_ = c10::nullopt;
+  }
+  c10::impl::LocalDispatchKeySet getSavedLocalDispatchKeySet() const {
+    TORCH_INTERNAL_ASSERT(savedLocalDispatchKeySet_.has_value());
+    return *savedLocalDispatchKeySet_;
+  }
+
+  // An Interpreter is alive if we are currently inside the ongoing transform
+  // for the interpreter. For example, vmap(f)(x); inside of f, the vmap's
+  // corresponding Interpreter is alive, even when it is not on the DynamicLayerStack.
+  bool is_alive() const {
+    return *is_alive_;
+  }
+  const std::shared_ptr<bool>& is_alive_ptr() const {
+    return is_alive_;
+  }
+  void set_is_alive(bool alive) {
+    *is_alive_ = alive;
+  }
+
+  // Please don't use this
+  explicit Interpreter() = default;
+
+ private:
+  explicit Interpreter(TransformType type, int64_t level, InterpreterMeta meta):
+    type_(type), level_(level), is_alive_(std::make_shared<bool>(false)), meta_(meta) {}
+
+  // fields
+  TransformType type_;
+  int64_t level_;
+  optional<c10::impl::LocalDispatchKeySet> savedLocalDispatchKeySet_;
+  std::shared_ptr<bool> is_alive_;
+  InterpreterMeta meta_;
+};
+
+// Applies the following for-loop:
+// for i in range(begin, end):
+//   args[i] = func(args[i])
+void foreachTensorInplace(std::vector& args, int64_t begin, int64_t end,
+    std::function func);
+
+// Applies the following for-loop:
+// for i in range(begin, end):
+//   if use_flag_relative[i] == 1: <-- treats use_flag_relative as a bitset
+//     args[i] = func(args[i], i - begin, true)
+//   args[i] = func(args[i], i - begin)
+void foreachTensorInplaceWithFlag(std::vector& args, int64_t begin, int64_t end,
+    const std::bitset<64> use_flag_relative, std::function func);
+
+std::vector findUnwrappedInputs(std::vector& args, int64_t begin, int64_t end);
+
+DispatchKeySet keysToExcludeWhenEnteringDynamicLayer(TransformType key);
+
+void setup_dispatch_key_tls(TransformType key, DispatchKeySet include);
+
+void sanityCheckStack(const c10::OperatorHandle& op, torch::jit::Stack* stack);
+
+} // namespace at::functorch
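
The "subclassing without virtuals" design described in the note above (one struct, a transform tag, and a variant of per-transform metadata) can be sketched in a few lines of standalone C++. ToyInterpreter, ToyVmapMeta, and ToyGradMeta are illustrative stand-ins, not the real types.

#include <cstdint>
#include <iostream>
#include <variant>

struct ToyVmapMeta { int64_t batch_size; };
struct ToyGradMeta { bool prev_grad_mode; };
using ToyMeta = std::variant<ToyVmapMeta, ToyGradMeta>;

enum class ToyTransform { Vmap, Grad };

struct ToyInterpreter {
  ToyTransform type;
  ToyMeta meta;

  // Mirrors the shape of Interpreter::process: switch on the tag, then read
  // the matching metadata with std::get, as the *InterpreterPtr structs do.
  void process() const {
    switch (type) {
      case ToyTransform::Vmap:
        std::cout << "vmap, batch_size="
                  << std::get<ToyVmapMeta>(meta).batch_size << "\n";
        break;
      case ToyTransform::Grad:
        std::cout << "grad, prev_grad_mode="
                  << std::get<ToyGradMeta>(meta).prev_grad_mode << "\n";
        break;
    }
  }
};

For example, ToyInterpreter{ToyTransform::Vmap, ToyVmapMeta{8}}.process() takes the vmap branch. Keeping the metadata in a variant and switching on a tag avoids a virtual call on the hot path, which is the constraint the note above is describing.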
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/functorch/LegacyVmapTransforms.h b/MLPY/Lib/site-packages/torch/include/ATen/functorch/LegacyVmapTransforms.h
new file mode 100644
index 0000000000000000000000000000000000000000..2ad7ee72b6425dfe3f8a9d26e8d46274bf3788c9
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/functorch/LegacyVmapTransforms.h
@@ -0,0 +1,187 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include 
+#include 
+
+namespace at::functorch {
+
+// This file contains the legacy (now-deprecated) batching rule API.
+// Please try to use the new-style batching rule API (see writing_batch_rules.md)
+
+// This file contains abstractions used for transforming *logical* vmap arguments
+// into *physical* arguments. (Keep reading for definitions of these terms).
+
+// NOTE: [Logical vs physical args]
+// Consider the following vmap.
+//   vmap(vmap(func, in_dims=(2,)), in_dims=(0,))(torch.ones(2, 3, 4))
+// This would produce a BatchedTensor wrapping a Tensor of size [2, 3, 4],
+// with batch dims 0 and 2:
+//   BatchedTensor(ones(2, 3, 4), bdims=[(lvl=1,dim=0),(lvl=2,dim=2)])
+//
+// We say the *logical* view of the tensor has size [3] -- tensors inside
+// `func` appear to have size [3].
+// However, the *physical* underlying tensor (the one passed to vmap) has size
+// [2, 3, 4].
+//
+// This notion of logical vs physical also extends to non-tensor arguments.
+// Consider the previous tensor; let's assume the user called
+// `torch.sum(tensor, dim=0)` inside of `func`. Then the logical
+// dimension they are reducing over is dim 0 but the physical dim is dim 1
+// (the first non-batch dimension)
+
+// Forward declared; see NOTE: [What is a VmapPhysicalView?]
+struct VmapPhysicalView;
+
+// Most PyTorch operators take 4 or fewer inputs.
+constexpr int64_t kVmapTransformStaticInputSize = 4;
+using VmapPhysicalViewVec = SmallVector<VmapPhysicalView, kVmapTransformStaticInputSize>;
+
+// PyTorch generally advertises good performance for <= 5 dims.
+// (see ATen/core/DimVector.h). We add a few extra dims (~3) for vmap
+// dimensions to get 8. Adjust this number as necessary.
+constexpr int64_t kVmapStaticDimVecSize = 8;
+using VmapDimVector = SmallVector<int64_t, kVmapStaticDimVecSize>;
+using VmapSymDimVector = SmallVector<c10::SymInt, kVmapStaticDimVecSize>;
+
+// NOTE: [What is a VmapTransform?]
+// A *VmapTransform* converts logical views of tensors to physical views.
+//
+// Batching rules use VmapTransforms to convert logical arguments to
+// physical arguments, then call one or more at:: operators that handle the
+// physical arguments, and then convert the physical results back to logical
+// arguments.
+
+// VmapTransform for operators that take tensors with multiple batch dims.
+// Given one or more logical views on Tensors, `logicalToPhysical`
+// permutes all of the batch dims to the front of the tensor, aligns
+// and expands the batch dims to match each other (according to their `level`),
+// and returns a VmapPhysicalView on the tensor(s).
+struct TORCH_API MultiBatchVmapTransform {
+  static VmapPhysicalView logicalToPhysical(const Tensor& logical_tensor);
+  static VmapPhysicalViewVec logicalToPhysical(ITensorListRef logical_tensors);
+};
+
+// VmapTransform for operators that broadcast all inputs.
+// Given some logical views on Tensors, `logicalToPhysical`:
+// - permutes all of the batch dims to the front of the tensors
+// - aligns all the batch dims to the collective levels of all of the tensors.
+//   If a tensor does not have a batch dim for a vmap level, then it receives
+//   a size-one dimension for said level.
+// - aligns the non-batch dims to have the same dimensionality, adding extra
+//   size-1 dimensions in between the batch dimensions and the non-batch dimensions
+//   so that the batch dimensions are lined up from the right.
+//
+// For example: given inputs of size (B, 2) and (B, 3, 2) where B is the batch
+// dimension, BroadcastingVmapTransform returns VmapPhysicalViews that wrap tensors
+// of size (B, 1, 2) and (B, 3, 2).
+//
+// Given inputs of size (B, 2) and (2,), BroadcastingVmapTransform returns
+// VmapPhysicalViews wrapping tensors of size (B, 2) and (1, 2). We don't
+// actually *need* to return a tensor of size (1, 2) for the second tensor
+// because the broadcasting operation takes care of that for us, but we do
+// it anyways to keep things simple.
+struct TORCH_API BroadcastingVmapTransform {
+  static VmapPhysicalViewVec logicalToPhysical(TensorList logical_tensors);
+};
+
+// Forward declared, if you're reading this file head to toe, don't worry about
+// it yet.
+struct VmapPhysicalToLogicalMap;
+
+// NOTE: [What is a VmapPhysicalView?]
+// VmapPhysicalView represents a physical view on a Tensor.
+//
+// One can use it to further convert logical dimension indices, logical shapes,
+// and more to their physical variants, or convert a new (physical) tensor into
+// a logical BatchedTensor. (TODO(rzou): some of these are not yet implemented).
+//
+// VmapPhysicalView stores a physical tensor with all of its batch dimensions at
+// the front and some levels that correspond to said batch dimensions.
+//
+// The levels bitset specifies which vmap levels correspond to the batch
+// dimensions at the front of the tensor. In particular, the number of set bits
+// corresponds to the number of batch dimensions on `tensor` and the rightmost
+// bit of `levels` specifies the maximum number of nested vmaps we are in at
+// this point in time.
+// For example, given:
+//   physical_view = VmapPhysicalView(tensor=ones(2, 3, 4, 5, 6), levels={1, 3})
+//
+// Rightmost bit of `levels` is 3 indicating the number of nested vmaps less
+// than or equal to 3.
+//   bitset: 010100
+//              ^
+//              |
+//   levels: 012345
+struct TORCH_API VmapPhysicalView {
+  VmapPhysicalView(Tensor&& tensor, std::bitset levels)
+      : levels_(levels), tensor_(tensor) {
+    // TORCH_INTERNAL_ASSERT(!isBatchedTensor(tensor));
+  }
+
+  Tensor& tensor() { return tensor_; }
+  const Tensor& tensor() const { return tensor_; }
+
+  // Maps logical dim indices to physical dim indices. Also does dim wrapping.
+  //
+  // For example, given:
+  //   physical_view = VmapPhysicalView(tensor=ones(2, 3, 4, 5), levels={1, 3})
+  //
+  // Then physical_view.getPhysicalDims({0, 1}) returns {2, 3}.
+  // This is because the size of levels tells us that the first two dimensions
+  // of `tensor_` are batch dimensions, so a logical dim of `n` is actually
+  // a physical dim of `n + 2`.
+  VmapDimVector getPhysicalDims(IntArrayRef logical_dims) const;
+  int64_t getPhysicalDim(int64_t logical_dim) const;
+
+  // Returns a VmapPhysicalToLogicalMap object. This can be used for
+  // mapping a physical tensor to a new logical tensor (BatchedTensor)
+  VmapPhysicalToLogicalMap getPhysicalToLogicalMap() const;
+
+  // Maps a logical shape to a physical shape by pre-pending the batch
+  // sizes to the logical shape.
+  VmapDimVector getPhysicalShape(IntArrayRef logical_shape) const;
+  SymDimVector getPhysicalShape(c10::SymIntArrayRef logical_shape) const;
+
+  int64_t numBatchDims() const;
+
+ private:
+  int64_t numLogicalDims() const;
+
+  std::bitset levels_;
+  Tensor tensor_;
+};
+
+// Convenience struct used for mapping a physical tensor (a non-BatchedTensor)
+// to a logical one (BatchedTensor). It holds some levels that are used to do the
+// mapping and assumes that the batch dimensions in the physical tensor all
+// occur at the front of the tensor.
+struct TORCH_API VmapPhysicalToLogicalMap {
+  VmapPhysicalToLogicalMap(std::bitset levels): levels_(levels) {}
+
+  // Maps a physical tensor to a new logical tensor (BatchedTensor).
+  // Assumes that all of the "batch dimensions" are at the front
+  // of the physical tensor. For example, given:
+  // - x = rank-4 Tensor with size 2, 3, 5, 7
+  // - levels = (2, 4)
+  // Returns:
+  // - BatchedTensor(x, bdims=[(dim=0,lvl=2), (dim=1, lvl=4)])
+  Tensor apply(const Tensor& physical_tensor) const;
+
+  // Given a vector of physical tensors,
+  // 1. maps each tensor to a new logical tensor. Assumes that all of the
+  //    "batch dimensions" are at the front of the physical tensors.
+  // 2. stores the new logical tensors back into the passed-in vector. This is
+  //    to avoid additional dynamic allocations.
+  void applyInplace(std::vector& physical_tensors) const;
+
+  std::bitset levels_;
+};
+
+
+} // namespace at::functorch
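
The logical-to-physical mapping that VmapPhysicalView performs (batch dims moved to the front, logical indices and shapes shifted behind them) reduces to a couple of lines. The helpers below are illustrative sketches under that assumption, not the real getPhysicalDim/getPhysicalShape members.

#include <cstdint>
#include <vector>

// A logical dim maps to a physical dim by wrapping, then offsetting past the
// batch dims that sit at the front of the physical tensor.
int64_t physicalDim(int64_t logical_dim, int64_t logical_ndim,
                    int64_t num_batch_dims) {
  if (logical_dim < 0) logical_dim += logical_ndim;  // dim wrapping
  return logical_dim + num_batch_dims;
}

// A logical shape maps to a physical shape by pre-pending the batch sizes.
std::vector<int64_t> physicalShape(const std::vector<int64_t>& batch_sizes,
                                   const std::vector<int64_t>& logical_shape) {
  std::vector<int64_t> result(batch_sizes);          // batch sizes first...
  result.insert(result.end(), logical_shape.begin(), logical_shape.end());
  return result;                                     // ...then the logical dims
}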
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/functorch/Macros.h b/MLPY/Lib/site-packages/torch/include/ATen/functorch/Macros.h
new file mode 100644
index 0000000000000000000000000000000000000000..b99be8781c127d5d8c49fdc1b7b80027c9383e48
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/functorch/Macros.h
@@ -0,0 +1,3 @@
+#pragma once
+
+#define SINGLE_ARG(...) __VA_ARGS__
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/functorch/PlumbingHelper.h b/MLPY/Lib/site-packages/torch/include/ATen/functorch/PlumbingHelper.h
new file mode 100644
index 0000000000000000000000000000000000000000..7a3b9e4df77819d155e3f4682f6b0d90d207a3b2
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/functorch/PlumbingHelper.h
@@ -0,0 +1,63 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+#pragma once
+#include 
+#include 
+#include 
+
+// NOTE: [vmap plumbing]
+//
+// Here's how "batching rules" work.
+// - we register kernels to the Batched key
+// - these kernels have the same signatures as the original operators.
+//   For example, at::sin(Tensor self) accepts a Tensor, and the batched kernel
+//   must also accept a Tensor
+// - However, it is more natural for users to write a batching rule like the
+//   following: sin_batch_rule(Tensor self, optional self_bdim)
+// - There is some codegenerated layer (the "plumbing") that wraps the user
+//   defined batching rule (e.g. sin_batch_rule) in a kernel that can be
+//   registered to the Batched key.
+//
+// The plumbing is responsible for wrapping a batching rule into a form that may
+// be registered as the kernel for the batched key.
+
+namespace at::functorch {
+
+void vmap_check_escaped(const optional<DynamicLayer>& layer, const char* what);
+
+// Create a BatchedTensor given a tensor, bdim, and level
+TORCH_API Tensor makeBatched(const Tensor& tensor, optional<int64_t> bdim, int64_t level);
+
+// Given a Tensor that may or may not be a BatchedTensor, unwrap it.
+// If `tensor` is not a BatchedTensor, or is a BatchedTensor but the level
+// doesn't match, then this returns (tensor, nullopt).
+// Otherwise, it returns (unwrap(tensor), bdim).
+TORCH_API std::tuple<Tensor, optional<int64_t>> unwrapTensorAtLevel(const Tensor& tensor, int64_t level);
+
+// Creates a vector of BatchedTensor
+TORCH_API std::vector<Tensor> makeBatchedVector(const std::vector<Tensor>& tensors, optional<int64_t> bdim, int64_t level);
+
+// Returns True if ANY tensor in tensors is batched at level
+TORCH_API bool isBatchedAtLevel(ITensorListRef tensors, int64_t level);
+TORCH_API bool isBatchedAtLevel(const c10::List<c10::optional<Tensor>>& maybe_tensors, int64_t level);
+TORCH_API bool isBatchedAtLevel(const Tensor& tensor, int64_t level);
+TORCH_API bool isBatchedAtLevel(const c10::optional<Tensor>& maybe_tensor, int64_t level);
+
+// Convenience helper. Returns true if any tensor is batched at level
+TORCH_API bool areAnyBatchedAtLevel(ArrayRef<optional<Tensor>> maybe_tensors, int64_t level);
+
+inline bool ivalueParticipatesInCurrentLevel(const IValue& ivalue) {
+  if (ivalue.isTensor()) {
+    auto maybe_level = maybeCurrentDynamicLayer();
+    TORCH_INTERNAL_ASSERT(maybe_level.has_value());
+    auto current_level = maybe_level->layerId();
+    return isBatchedAtLevel(ivalue.toTensor(), current_level);
+  }
+  // TODO: should really check this
+  return false;
+}
+
+} // namespace at::functorch
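
A rough picture of what the generated "plumbing" wrapper does for a single unary op: unwrap the batched input into (value, bdim), call the user-written batch rule, and rewrap the result at the current level. The sketch below is self-contained and hypothetical (ToyTensor, sin_batch_rule, and sin_plumbing are stand-ins, not the generated code).

#include <cstdint>
#include <optional>
#include <utility>

struct ToyTensor { /* stands in for at::Tensor */ };

// A user-written batch rule: takes the unwrapped value and where its batch
// dim is, returns the result and where the result's batch dim ended up.
std::pair<ToyTensor, std::optional<int64_t>> sin_batch_rule(
    const ToyTensor& self, std::optional<int64_t> self_bdim) {
  // A real rule would compute sin on `self` here.
  return {self, self_bdim};
}

// What the plumbing kernel registered to the Batched key roughly does.
ToyTensor sin_plumbing(const ToyTensor& unwrapped_self,
                       std::optional<int64_t> bdim, int64_t level) {
  auto [res, res_bdim] = sin_batch_rule(unwrapped_self, bdim);
  (void)res_bdim;
  (void)level;  // the real plumbing re-wraps via makeBatched(res, res_bdim, level)
  return res;
}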
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/functorch/TensorWrapper.h b/MLPY/Lib/site-packages/torch/include/ATen/functorch/TensorWrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..b99f3f937fa678950e7833c5d617ffc7f1c5dffc
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/functorch/TensorWrapper.h
@@ -0,0 +1,103 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include 
+#include 
+#include 
+
+namespace at::functorch {
+
+// NOTE: [functorch's TensorWrapper]
+//
+// Taking better suggestions for a name. TensorWrapper is the wrapper Tensor
+// Subclass for functorch's grad-based transforms (grad, vjp, jvp). It is
+// analogous to how vmap uses BatchedTensor as the wrapper Tensor subclass.
+//
+// If you're familiar with the Tensor-Variable merge, TensorWrapper is effectively
+// another Variable.
+//
+// Consider grad(grad(torch.sin))(x). This wraps `x` as TensorWrapper(TensorWrapper(x)).
+// The reason why is so that each TensorWrapper can hold its own AutogradMeta and
+// participate in a **separate** autograd graph.
+//
+// There are alternative designs we could have chosen (e.g. each grad transform
+// stores a weak map of Tensor -> AutogradMeta); the benefit of the TensorWrapper
+// design is that we can re-use existing VariableType kernels (i.e. Autograd kernels)
+// without much modification. Since a TensorWrapper looks like a regular Tensor,
+// the VariableType kernel can pull out the AutogradMeta struct from where it
+// expects and extend the autograd graph.
+
+struct TORCH_API TensorWrapper : public c10::TensorImpl {
+  explicit TensorWrapper(
+      c10::DispatchKeySet key_set,
+      Tensor value,
+      int64_t level,
+      std::shared_ptr is_alive,
+      bool is_immutable = false,  // if true, this came from an operation that aliases an immutable tensor
+      bool use_value_sizes_strides = true);
+
+  void refreshMetadata();
+
+  const Tensor& value() const {
+    return value_;
+  }
+  optional level() const {
+    if (is_alive()) {
+      return level_;
+    }
+    return {};
+  }
+  bool is_immutable() const {
+    return is_immutable_;
+  }
+  bool is_alive() const;
+
+  // Overrides necessary for autograd
+  c10::intrusive_ptr shallow_copy_and_detach(
+    const c10::VariableVersion& version_counter,
+    bool allow_tensor_metadata_change) const override;
+  c10::intrusive_ptr shallow_copy_and_detach(
+      c10::VariableVersion&& version_counter,
+      bool allow_tensor_metadata_change) const override;
+  void shallow_copy_from(const c10::intrusive_ptr& impl) override;
+
+ private:
+  const char* tensorimpl_type_name() const override;
+  Tensor value_;
+  int64_t level_;
+  bool is_immutable_;
+
+  // TensorWrapper receives a boolean flag on whether or not the Grad Interpreter
+  // that created it is still alive or not.
+  // If the Grad Interpreter is no longer alive then it attempts to behave like
+  // a regular Tensor.
+  //
+  // When we exit the level, this wrapper may be marked as "not alive".
+  // Wrappers that are not alive:
+  // 1) May still have autograd metadata on them
+  // 2) Forward dispatches to the underlying value()
+  std::shared_ptr is_alive_;
+};
+
+// There are two variants of makeTensorWrapper: one that accepts a level
+// and one that accepts an Interpreter.
+//
+// The one that accepts a level tries to automatically get the life handle from the
+// interpreter on the DynamicLayerStack.
+// It needs to be used with caution: if the interpreter is not on the
+// DynamicLayerStack, then we won't be able to find the life handle.
+//
+// In practice this isn't a problem: when we're constructing TensorWrapper in
+// Python, the corresponding interpreter is on the stack.
+TORCH_API Tensor makeTensorWrapper(const Tensor& tensor, int64_t level, bool is_immutable=false);
+TORCH_API Tensor makeTensorWrapper(const Tensor& tensor, const Interpreter& interpreter, bool is_immutable=false);
+TORCH_API TensorWrapper* maybeGetTensorWrapper(const Tensor& tensor);
+TORCH_API void dumpTensor(std::ostream & ss, const Tensor& tensor);
+TORCH_API void dumpTensorCout(const Tensor& tensor);
+
+} // namespace at::functorch
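
The "alive" behavior described above (report a level while the grad transform is active, act like the plain underlying value once it exits) can be pictured with a tiny stand-in wrapper. ToyWrapper is an illustrative sketch, not the TensorWrapper API.

#include <cstdint>
#include <memory>
#include <optional>

struct ToyWrapper {
  double value;                    // stands in for the wrapped Tensor
  int64_t level;
  std::shared_ptr<bool> is_alive;  // shared with the interpreter that made it

  std::optional<int64_t> current_level() const {
    if (*is_alive) return level;   // still inside the grad transform's scope
    return std::nullopt;           // escaped: behave like a regular value
  }
};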
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/functorch/VmapInterpreter.h b/MLPY/Lib/site-packages/torch/include/ATen/functorch/VmapInterpreter.h
new file mode 100644
index 0000000000000000000000000000000000000000..8a2539e24faeae1308dc8376bfc3a2b15d438179
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/functorch/VmapInterpreter.h
@@ -0,0 +1,25 @@
+#pragma once
+#include <ATen/functorch/Interpreter.h>
+
+namespace at::functorch {
+
+// This is the interpreter that handles the vmap() transform.
+// See NOTE: [functorch interpreter stack] for more details.
+
+struct VmapInterpreterPtr {
+  explicit VmapInterpreterPtr(const Interpreter* base): base_(base) { TORCH_INTERNAL_ASSERT(base->key() == TransformType::Vmap); }
+  TransformType key() const { return base_->key(); }
+  int64_t level() const { return base_->level(); }
+  void processImpl(const c10::OperatorHandle& op, torch::jit::Stack* stack);
+  void sendToNextInterpreterImpl(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool grad_special_case);
+  c10::SymInt batchSize() const {
+    return std::get<VmapInterpreterMeta>(base_->meta()).batchSize_;
+  }
+  RandomnessType randomness() const {
+    return std::get<VmapInterpreterMeta>(base_->meta()).randomness_;
+  }
+ private:
+  const Interpreter* base_;
+};
+
+} // namespace at::functorch
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h b/MLPY/Lib/site-packages/torch/include/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h
new file mode 100644
index 0000000000000000000000000000000000000000..fcd0650a58d6a4d87c3cedf34fccde5c0d5b7e3d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h
@@ -0,0 +1,31 @@
+#pragma once
+
+#include 
+#include 
+
+// Use of c10::hip namespace here makes hipification easier, because
+// I don't have to also fix namespaces.  Sorry!
+namespace c10 { namespace hip {
+
+// Takes a valid HIPAllocator (of any sort) and turns it into
+// an allocator pretending to be a CUDA allocator.  See
+// Note [Masquerading as CUDA]
+class HIPAllocatorMasqueradingAsCUDA final : public Allocator {
+  Allocator* allocator_;
+public:
+  explicit HIPAllocatorMasqueradingAsCUDA(Allocator* allocator)
+    : allocator_(allocator) {}
+  DataPtr allocate(size_t size) override {
+    DataPtr r = allocator_->allocate(size);
+    r.unsafe_set_device(Device(c10::DeviceType::CUDA, r.device().index()));
+    return r;
+  }
+  DeleterFnPtr raw_deleter() const override {
+    return allocator_->raw_deleter();
+  }
+  void copy_data(void* dest, const void* src, std::size_t count) const final {
+    allocator_->copy_data(dest, src, count);
+  }
+};
+
+}} // namespace c10::hip
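
The masquerading trick is a plain decorator: delegate the allocation, then relabel what the result reports. The sketch below shows that shape with invented ToyAllocator/ToyDataPtr types (they are not the c10 Allocator/DataPtr interfaces); memory handling is deliberately simplified.

#include <cstddef>
#include <cstdlib>

enum class ToyDeviceType { CUDA, HIP };

struct ToyDataPtr { void* ptr; ToyDeviceType device; };

struct ToyAllocator {
  virtual ~ToyAllocator() = default;
  virtual ToyDataPtr allocate(std::size_t size) = 0;
};

struct ToyHipAllocator final : ToyAllocator {
  ToyDataPtr allocate(std::size_t size) override {
    return {std::malloc(size), ToyDeviceType::HIP};
  }
};

// Delegates allocation, then relabels the device as CUDA, which is the same
// move HIPAllocatorMasqueradingAsCUDA makes on the DataPtr's device.
struct ToyMasqueradingAllocator final : ToyAllocator {
  ToyAllocator* inner;
  explicit ToyMasqueradingAllocator(ToyAllocator* a) : inner(a) {}
  ToyDataPtr allocate(std::size_t size) override {
    ToyDataPtr r = inner->allocate(size);
    r.device = ToyDeviceType::CUDA;   // masquerade as CUDA
    return r;
  }
};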
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h b/MLPY/Lib/site-packages/torch/include/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h
new file mode 100644
index 0000000000000000000000000000000000000000..4811b0d5e45e984bea140496c9ae10684d16e040
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+namespace c10 {
+// forward declaration
+class DataPtr;
+namespace hip {
+namespace HIPCachingAllocatorMasqueradingAsCUDA {
+
+C10_HIP_API Allocator* get();
+C10_HIP_API void recordStreamMasqueradingAsCUDA(const DataPtr& ptr, HIPStreamMasqueradingAsCUDA stream);
+
+} // namespace HIPCachingAllocatorMasqueradingAsCUDA
+} // namespace hip
+} // namespace c10
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h b/MLPY/Lib/site-packages/torch/include/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h
new file mode 100644
index 0000000000000000000000000000000000000000..0a3992263025cea38f7bd896cfb1efc534ff9ae3
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h
@@ -0,0 +1,353 @@
+#pragma once
+
+#include 
+
+// The includes of HIPGuard.h
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include 
+#include 
+
+// Use of c10::hip namespace here makes hipification easier, because
+// I don't have to also fix namespaces.  Sorry!
+namespace c10 { namespace hip {
+
+// Note [Masquerading as CUDA]
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// c10_hip is very easy to understand: it is HIPified from c10_cuda,
+// and anywhere you said CUDA, the source code now says HIP.  HIPified
+// PyTorch is much harder to understand: it is HIPified from regular
+// PyTorch, yes, but NO source-to-source translation from CUDA to
+// HIP occurs; instead, anywhere we see "CUDA", it actually means "HIP".
+// For example, when you use HIPified PyTorch, you say x.cuda() to
+// move a tensor onto ROCm device.  We call this situation "HIP
+// masquerading as CUDA".
+//
+// This leads to a very awkward situation when we want to call c10_hip
+// code from PyTorch, since c10_hip is expecting things to be called
+// HIP, but PyTorch is calling them CUDA (masquerading as HIP).  To
+// fix this impedance mismatch, we have MasqueradingAsCUDA variants
+// for all c10_hip classes.  These translate between the "HIP" and "CUDA
+// masquerading as HIP" worlds.  For example,
+// HIPGuardImplMasqueradingAsCUDA (this file) provides something like a
+// HIPGuardImpl, but it reports its DeviceType as CUDA (e.g., type()
+// returns CUDA, getDevice() reports the current HIP device as a CUDA
+// device.)
+//
+// We should be able to delete all of these classes entirely once
+// we switch PyTorch to calling a HIP a HIP.
+//
+// When you add a new MasqueradingAsCUDA class/function, you need to
+// also update the rewrite rules in torch/utils/hipify/cuda_to_hip_mappings.py
+//
+//
+//
+// By the way, note that the cpp file associated with this also
+// *overwrites* the entry in the DeviceGuardImpl registry for CUDA with
+// this HIP implementation.
+
+struct HIPGuardImplMasqueradingAsCUDA final : public c10::impl::DeviceGuardImplInterface {
+  static constexpr c10::DeviceType static_type = c10::DeviceType::CUDA;
+  HIPGuardImplMasqueradingAsCUDA() {}
+  HIPGuardImplMasqueradingAsCUDA(c10::DeviceType t) {
+    TORCH_INTERNAL_ASSERT(t == c10::DeviceType::CUDA);
+  }
+  c10::DeviceType type() const override {
+    return c10::DeviceType::CUDA;
+  }
+  Device exchangeDevice(Device d) const override {
+    TORCH_INTERNAL_ASSERT(d.is_cuda());
+    Device old_device = getDevice();
+    if (old_device.index() != d.index()) {
+      C10_HIP_CHECK(hipSetDevice(d.index()));
+    }
+    return old_device;
+  }
+  Device getDevice() const override {
+    int device;
+    C10_HIP_CHECK(hipGetDevice(&device));
+    return Device(c10::DeviceType::CUDA, device);
+  }
+  void setDevice(Device d) const override {
+    TORCH_INTERNAL_ASSERT(d.is_cuda());
+    C10_HIP_CHECK(hipSetDevice(d.index()));
+  }
+  void uncheckedSetDevice(Device d) const noexcept override {
+    C10_HIP_CHECK_WARN(hipSetDevice(d.index()));
+  }
+  Stream getStream(Device d) const noexcept override {
+    return getCurrentHIPStreamMasqueradingAsCUDA(d.index()).unwrap();
+  }
+  Stream getDefaultStream(Device d) const override {
+    return getDefaultHIPStreamMasqueradingAsCUDA(d.index());
+  }
+  Stream getStreamFromGlobalPool(Device d, bool isHighPriority = false) const override {
+    return getStreamFromPoolMasqueradingAsCUDA(isHighPriority, d.index());
+  }
+  Stream exchangeStream(Stream s) const noexcept override {
+    HIPStreamMasqueradingAsCUDA cs(s);
+    auto old_stream = getCurrentHIPStreamMasqueradingAsCUDA(s.device().index());
+    setCurrentHIPStreamMasqueradingAsCUDA(cs);
+    return old_stream.unwrap();
+  }
+  DeviceIndex deviceCount() const noexcept override {
+    int deviceCnt;
+    hipError_t _err;
+    _err = hipGetDeviceCount(&deviceCnt);
+#if defined(USE_ROCM) && (ROCM_VERSION < 50201)
+    if(_err == hipErrorInvalidDevice)
+        return 0;
+#endif
+    if(_err != hipErrorNoDevice && _err != hipSuccess)
+        C10_HIP_CHECK(_err);
+    return deviceCnt;
+  }
+
+  // Event-related functions
+  // Note: hipEventCreateWithFlags should be called on the same device as
+  //  the recording stream's device.
+  void createEvent(
+    hipEvent_t* hip_event,
+    const EventFlag flag) const {
+    // Maps PyTorch's Event::Flag to HIP flag
+    auto hip_flag = hipEventDefault;
+    switch (flag) {
+      case EventFlag::PYTORCH_DEFAULT:
+      case EventFlag::HIP_EVENT_DISABLE_TIMING:
+        hip_flag = hipEventDisableTiming;
+        break;
+      case EventFlag::BACKEND_DEFAULT:
+      case EventFlag::HIP_EVENT_DEFAULT:
+        hip_flag = hipEventDefault;
+        break;
+      default:
+        TORCH_CHECK(false, "HIP event received unknown flag");
+    }
+
+    C10_HIP_CHECK(hipEventCreateWithFlags(hip_event, hip_flag));
+  }
+
+  void destroyEvent(
+    void* event,
+    const DeviceIndex device_index) const noexcept override {
+    if (!event) return;
+    auto hip_event = static_cast<hipEvent_t>(event);
+    int orig_device;
+    C10_HIP_CHECK_WARN(hipGetDevice(&orig_device));
+    C10_HIP_CHECK_WARN(hipSetDevice(device_index));
+    C10_HIP_CHECK_WARN(hipEventDestroy(hip_event));
+    C10_HIP_CHECK_WARN(hipSetDevice(orig_device));
+  }
+
+  void record(void** event,
+    const Stream& stream,
+    const DeviceIndex device_index,
+    const EventFlag flag) const override {
+    TORCH_CHECK(device_index == -1 || device_index == stream.device_index(),
+      "Event device index ",
+      device_index,
+      " does not match recording stream's device index ",
+      stream.device_index(),
+      ".");
+
+    hipEvent_t hip_event = static_cast<hipEvent_t>(*event);
+    HIPStreamMasqueradingAsCUDA hip_stream{stream};
+
+    // Moves to stream's device to record
+    const auto orig_device = getDevice();
+    setDevice(stream.device());
+
+    // Creates the event (lazily)
+    if (!hip_event) createEvent(&hip_event, flag);
+    C10_HIP_CHECK(hipEventRecord(hip_event, hip_stream));
+    // Makes the void* point to the (possibly just allocated) HIP event
+    *event = hip_event;
+
+    // Resets device
+    setDevice(orig_device);
+  }
+
+  void block(
+    void* event,
+    const Stream& stream) const override {
+    if (!event) return;
+    hipEvent_t hip_event = static_cast<hipEvent_t>(event);
+    HIPStreamMasqueradingAsCUDA hip_stream{stream};
+    const auto orig_device = getDevice();
+    setDevice(stream.device());
+    C10_HIP_CHECK(hipStreamWaitEvent(
+      hip_stream,
+      hip_event,
+      /*flags (must be zero)=*/ 0));
+    setDevice(orig_device);
+  }
+
+  bool queryEvent(void* event) const override {
+    if (!event) return true;
+    hipEvent_t hip_event = static_cast<hipEvent_t>(event);
+    const hipError_t err = hipEventQuery(hip_event);
+    if (err != hipErrorNotReady) C10_HIP_CHECK(err);
+    else {
+      // ignore and clear the error if not ready
+      (void)hipGetLastError();
+    }
+    return (err == hipSuccess);
+  }
+
+  // Stream-related functions
+  bool queryStream(const Stream& stream) const override {
+    HIPStreamMasqueradingAsCUDA hip_stream{stream};
+    return hip_stream.query();
+  }
+
+  void synchronizeStream(const Stream& stream) const override {
+    HIPStreamMasqueradingAsCUDA hip_stream{stream};
+    hip_stream.synchronize();
+  }
+
+  void recordDataPtrOnStream(
+    const c10::DataPtr& data_ptr,
+    const Stream& stream) const override {
+    HIPStreamMasqueradingAsCUDA hip_stream{stream};
+    HIPCachingAllocatorMasqueradingAsCUDA::recordStreamMasqueradingAsCUDA(data_ptr, hip_stream);
+  }
+};
+
+// All of the guards which have HIPGuardImpl burned in need to also have
+// variants using HIPGuardImplMasqueradingAsCUDA.
+
+/// This code is all a direct copy from c10/cuda/HIPGuardMasqueradingAsCUDA.h, but with
+/// the correct InlineDeviceGuard burned in.  Sorry about the
+/// copy-pasting.
+
+struct HIPGuardMasqueradingAsCUDA {
+  explicit HIPGuardMasqueradingAsCUDA() = delete;
+  explicit HIPGuardMasqueradingAsCUDA(DeviceIndex device_index) : guard_(device_index) {}
+  explicit HIPGuardMasqueradingAsCUDA(Device device) : guard_(device) {}
+
+  HIPGuardMasqueradingAsCUDA(const HIPGuardMasqueradingAsCUDA&) = delete;
+  HIPGuardMasqueradingAsCUDA& operator=(const HIPGuardMasqueradingAsCUDA&) = delete;
+  HIPGuardMasqueradingAsCUDA(HIPGuardMasqueradingAsCUDA&& other) = delete;
+  HIPGuardMasqueradingAsCUDA& operator=(HIPGuardMasqueradingAsCUDA&& other) = delete;
+
+  void set_device(Device device) { guard_.set_device(device); }
+  void reset_device(Device device) { guard_.reset_device(device); }
+  void set_index(DeviceIndex device_index) { guard_.set_index(device_index); }
+  Device original_device() const { return guard_.original_device(); }
+  Device current_device() const { return guard_.current_device(); }
+
+ private:
+  c10::impl::InlineDeviceGuard guard_;
+};
+
+struct OptionalHIPGuardMasqueradingAsCUDA {
+  explicit OptionalHIPGuardMasqueradingAsCUDA() : guard_() {}
+  explicit OptionalHIPGuardMasqueradingAsCUDA(optional device_opt) : guard_(device_opt) {}
+  explicit OptionalHIPGuardMasqueradingAsCUDA(optional device_index_opt) : guard_(device_index_opt) {}
+
+  OptionalHIPGuardMasqueradingAsCUDA(const OptionalHIPGuardMasqueradingAsCUDA&) = delete;
+  OptionalHIPGuardMasqueradingAsCUDA& operator=(const OptionalHIPGuardMasqueradingAsCUDA&) = delete;
+  OptionalHIPGuardMasqueradingAsCUDA(OptionalHIPGuardMasqueradingAsCUDA&& other) = delete;
+  OptionalHIPGuardMasqueradingAsCUDA& operator=(OptionalHIPGuardMasqueradingAsCUDA&& other) = delete;
+
+  void set_device(Device device) { guard_.set_device(device); }
+  void reset_device(Device device) { guard_.reset_device(device); }
+  void set_index(DeviceIndex device_index) { guard_.set_index(device_index); }
+  optional original_device() const { return guard_.original_device(); }
+  optional current_device() const { return guard_.current_device(); }
+  void reset() { guard_.reset(); }
+
+private:
+  c10::impl::InlineOptionalDeviceGuard guard_;
+};
+
+struct HIPStreamGuardMasqueradingAsCUDA {
+  explicit HIPStreamGuardMasqueradingAsCUDA() = delete;
+  explicit HIPStreamGuardMasqueradingAsCUDA(Stream stream) : guard_(stream) {}
+  HIPStreamGuardMasqueradingAsCUDA(const HIPStreamGuardMasqueradingAsCUDA&) = delete;
+  HIPStreamGuardMasqueradingAsCUDA& operator=(const HIPStreamGuardMasqueradingAsCUDA&) = delete;
+  HIPStreamGuardMasqueradingAsCUDA(HIPStreamGuardMasqueradingAsCUDA&& other) = delete;
+  HIPStreamGuardMasqueradingAsCUDA& operator=(HIPStreamGuardMasqueradingAsCUDA&& other) = delete;
+
+  void reset_stream(Stream stream) { guard_.reset_stream(stream); }
+
+  HIPStreamMasqueradingAsCUDA original_stream() const {
+    return HIPStreamMasqueradingAsCUDA(HIPStreamMasqueradingAsCUDA::UNCHECKED, guard_.original_stream());
+  }
+  HIPStreamMasqueradingAsCUDA current_stream() const {
+    return HIPStreamMasqueradingAsCUDA(HIPStreamMasqueradingAsCUDA::UNCHECKED, guard_.current_stream());
+  }
+
+  Device current_device() const { return guard_.current_device(); }
+  Device original_device() const { return guard_.original_device(); }
+
+private:
+  c10::impl::InlineStreamGuard<HIPGuardImplMasqueradingAsCUDA> guard_;
+};
+
+struct OptionalHIPStreamGuardMasqueradingAsCUDA {
+  explicit OptionalHIPStreamGuardMasqueradingAsCUDA() : guard_() {}
+  explicit OptionalHIPStreamGuardMasqueradingAsCUDA(Stream stream) : guard_(stream) {}
+  explicit OptionalHIPStreamGuardMasqueradingAsCUDA(optional<Stream> stream_opt) : guard_(stream_opt) {}
+
+  OptionalHIPStreamGuardMasqueradingAsCUDA(const OptionalHIPStreamGuardMasqueradingAsCUDA&) = delete;
+  OptionalHIPStreamGuardMasqueradingAsCUDA& operator=(const OptionalHIPStreamGuardMasqueradingAsCUDA&) = delete;
+  OptionalHIPStreamGuardMasqueradingAsCUDA(OptionalHIPStreamGuardMasqueradingAsCUDA&& other) = delete;
+  OptionalHIPStreamGuardMasqueradingAsCUDA& operator=(OptionalHIPStreamGuardMasqueradingAsCUDA&& other) = delete;
+
+  void reset_stream(Stream stream) { guard_.reset_stream(stream); }
+
+  optional<HIPStreamMasqueradingAsCUDA> original_stream() const {
+    auto r = guard_.original_stream();
+    if (r.has_value()) {
+      return make_optional(HIPStreamMasqueradingAsCUDA(HIPStreamMasqueradingAsCUDA::UNCHECKED, r.value()));
+    } else {
+      return nullopt;
+    }
+  }
+
+  optional<HIPStreamMasqueradingAsCUDA> current_stream() const {
+    auto r = guard_.current_stream();
+    if (r.has_value()) {
+      return make_optional(HIPStreamMasqueradingAsCUDA(HIPStreamMasqueradingAsCUDA::UNCHECKED, r.value()));
+    } else {
+      return nullopt;
+    }
+  }
+
+  void reset() { guard_.reset(); }
+
+private:
+  c10::impl::InlineOptionalStreamGuard<HIPGuardImplMasqueradingAsCUDA> guard_;
+};
+
+struct HIPMultiStreamGuardMasqueradingAsCUDA {
+  explicit HIPMultiStreamGuardMasqueradingAsCUDA(ArrayRef<HIPStreamMasqueradingAsCUDA> streams)
+    : guard_(unwrapStreams(streams)) {}
+
+  HIPMultiStreamGuardMasqueradingAsCUDA(const HIPMultiStreamGuardMasqueradingAsCUDA&) = delete;
+  HIPMultiStreamGuardMasqueradingAsCUDA& operator=(const HIPMultiStreamGuardMasqueradingAsCUDA&) = delete;
+  HIPMultiStreamGuardMasqueradingAsCUDA(HIPMultiStreamGuardMasqueradingAsCUDA&& other) = delete;
+  HIPMultiStreamGuardMasqueradingAsCUDA& operator=(HIPMultiStreamGuardMasqueradingAsCUDA&& other) = delete;
+
+private:
+  c10::impl::InlineMultiStreamGuard<HIPGuardImplMasqueradingAsCUDA> guard_;
+
+  static std::vector<Stream> unwrapStreams(ArrayRef<HIPStreamMasqueradingAsCUDA> hipStreams) {
+    std::vector<Stream> streams;
+    streams.reserve(hipStreams.size());
+    for (const HIPStreamMasqueradingAsCUDA& hipStream : hipStreams) {
+      streams.push_back(hipStream);
+    }
+    return streams;
+  }
+};
+
+}} // namespace c10::hip
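The guard variants in this header mirror the CUDA guard API one-for-one, so HIP-backed code can be driven through call sites that believe they are talking to CUDA. Below is a minimal, hedged sketch of how the device and stream guards might be used in a ROCm build of PyTorch; the surrounding helper function and its arguments are hypothetical, only the guard types come from the header above.

    #include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
    #include <functional>

    // Hypothetical helper: run a callback with both the device and the stream
    // pinned for the duration of the call, then restore the previous state.
    void withDeviceAndStream(c10::Device cuda_like_device,  // DeviceType::CUDA, backed by HIP
                             c10::Stream cuda_like_stream,  // a "CUDA" stream wrapping a HIP stream
                             const std::function<void()>& fn) {
      // RAII: switches the current HIP device while pretending to be CUDA.
      c10::hip::HIPGuardMasqueradingAsCUDA device_guard(cuda_like_device);
      // RAII: swaps the current stream for the stream's device.
      c10::hip::HIPStreamGuardMasqueradingAsCUDA stream_guard(cuda_like_stream);
      fn();
      // Both guards restore the original device/stream on scope exit.
    }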
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h b/MLPY/Lib/site-packages/torch/include/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h
new file mode 100644
index 0000000000000000000000000000000000000000..7958146b81edcedff9facecd94d69cdb9011ecbd
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h
@@ -0,0 +1,130 @@
+#pragma once
+
+#include <c10/hip/HIPStream.h>
+
+// Use of c10::hip namespace here makes hipification easier, because
+// I don't have to also fix namespaces.  Sorry!
+namespace c10 { namespace hip {
+
+// See Note [Masquerading as CUDA] for motivation
+
+class HIPStreamMasqueradingAsCUDA {
+public:
+
+  enum Unchecked { UNCHECKED };
+
+  explicit HIPStreamMasqueradingAsCUDA(Stream stream)
+    : HIPStreamMasqueradingAsCUDA(UNCHECKED, stream) {
+    // We did the coercion unchecked; check that it was right.
+    TORCH_CHECK(stream.device().is_cuda() /* !!! */);
+  }
+
+  explicit HIPStreamMasqueradingAsCUDA(Unchecked, Stream stream)
+    // Unsafely coerce the "CUDA" stream into a HIP stream
+    : stream_(
+        HIPStream(
+          Stream(
+            Stream::UNSAFE,
+            Device(c10::DeviceType::HIP, stream.device_index()),
+            stream.id())
+        )
+      ) {}
+
+  // New constructor, just for this.  Does NOT coerce.
+  explicit HIPStreamMasqueradingAsCUDA(HIPStream stream) : stream_(stream) {}
+
+  bool operator==(const HIPStreamMasqueradingAsCUDA& other) const noexcept {
+    return stream_ == other.stream_;
+  }
+
+  bool operator!=(const HIPStreamMasqueradingAsCUDA& other) const noexcept {
+    return stream_ != other.stream_;
+  }
+
+  operator hipStream_t() const { return stream_.stream(); }
+
+  operator Stream() const {
+    // Unsafely coerce HIP stream into a "CUDA" stream
+    return Stream(Stream::UNSAFE, device(), id());
+  }
+
+  DeviceIndex device_index() const { return stream_.device_index(); }
+
+  // Unsafely coerce HIP device into CUDA device
+  c10::DeviceType device_type() const { return c10::DeviceType::CUDA; }
+
+  Device device() const {
+    // Unsafely coerce HIP device into CUDA device
+    return Device(c10::DeviceType::CUDA, stream_.device_index());
+  }
+
+  StreamId id() const        { return stream_.id(); }
+  bool query() const         { return stream_.query(); }
+  void synchronize() const   { stream_.synchronize(); }
+  int priority() const       { return stream_.priority(); }
+  hipStream_t stream() const { return stream_.stream(); }
+
+  Stream unwrap() const {
+    // Unsafely coerce HIP stream into "CUDA" stream
+    return Stream(Stream::UNSAFE, device(), id());
+  }
+
+  c10::StreamData3 pack3() const noexcept {
+    // Unsafely coerce HIP stream into "CUDA" stream before packing
+    return unwrap().pack3();
+  }
+
+  static HIPStreamMasqueradingAsCUDA unpack3(StreamId stream_id,
+                                             DeviceIndex device_index,
+                                             c10::DeviceType device_type) {
+    // NB: constructor manages CUDA->HIP translation for us
+    return HIPStreamMasqueradingAsCUDA(Stream::unpack3(
+        stream_id, device_index, device_type));
+  }
+
+  static std::tuple<int, int> priority_range() { return HIPStream::priority_range(); }
+
+  // New method, gets the underlying HIPStream
+  HIPStream hip_stream() const { return stream_; }
+
+private:
+  HIPStream stream_;
+};
+
+HIPStreamMasqueradingAsCUDA
+inline getStreamFromPoolMasqueradingAsCUDA(const bool isHighPriority = false, DeviceIndex device = -1) {
+  return HIPStreamMasqueradingAsCUDA(getStreamFromPool(isHighPriority, device));
+}
+
+HIPStreamMasqueradingAsCUDA
+inline getStreamFromExternalMasqueradingAsCUDA(hipStream_t ext_stream, DeviceIndex device) {
+  return HIPStreamMasqueradingAsCUDA(getStreamFromExternal(ext_stream, device));
+}
+
+inline HIPStreamMasqueradingAsCUDA getDefaultHIPStreamMasqueradingAsCUDA(DeviceIndex device_index = -1) {
+  return HIPStreamMasqueradingAsCUDA(getDefaultHIPStream(device_index));
+}
+
+inline HIPStreamMasqueradingAsCUDA getCurrentHIPStreamMasqueradingAsCUDA(DeviceIndex device_index = -1) {
+  return HIPStreamMasqueradingAsCUDA(getCurrentHIPStream(device_index));
+}
+
+inline void setCurrentHIPStreamMasqueradingAsCUDA(HIPStreamMasqueradingAsCUDA stream) {
+  setCurrentHIPStream(stream.hip_stream());
+}
+
+inline std::ostream& operator<<(std::ostream& stream, const HIPStreamMasqueradingAsCUDA& s) {
+  stream << s.hip_stream() << " (masquerading as CUDA)";
+  return stream;
+}
+
+}} // namespace c10::hip
+
+namespace std {
+  template <>
+  struct hash<c10::hip::HIPStreamMasqueradingAsCUDA> {
+    size_t operator()(c10::hip::HIPStreamMasqueradingAsCUDA s) const noexcept {
+      return std::hash<c10::Stream>{}(s.unwrap());
+    }
+  };
+} // namespace std
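To make the coercion concrete, here is a small hedged sketch (again assuming a ROCm build where c10::hip and these helpers exist) showing that the wrapper deliberately reports DeviceType::CUDA while still handing back a genuine hipStream_t.

    #include <ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h>
    #include <iostream>

    void inspectCurrentStream() {
      using namespace c10::hip;
      // Current HIP stream for device 0, wrapped so it looks like a CUDA stream.
      HIPStreamMasqueradingAsCUDA s = getCurrentHIPStreamMasqueradingAsCUDA(0);
      // device_type() lies on purpose: it always reports CUDA.
      std::cout << (s.device_type() == c10::DeviceType::CUDA) << "\n";  // prints 1
      // The underlying handle is still a real hipStream_t usable with HIP APIs.
      hipStream_t raw = s.stream();
      (void)raw;
    }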
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/jit_macros.h b/MLPY/Lib/site-packages/torch/include/ATen/jit_macros.h
new file mode 100644
index 0000000000000000000000000000000000000000..ac6d0432425f11f761dcf26de7b0402a8daae5ac
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/jit_macros.h
@@ -0,0 +1,7 @@
+#pragma once
+#include <ATen/cuda/CUDAConfig.h>
+#include <string>
+
+// AT_USE_JITERATOR(), controls whether we jit some elementwise kernels
+#define AT_USE_JITERATOR() true
+#define jiterator_stringify(...) std::string(#__VA_ARGS__);
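For reference, jiterator_stringify simply turns its arguments into a std::string via preprocessor stringification. The standalone sketch below reproduces the macro to show roughly what an invocation expands to; the kernel name is made up.

    #include <string>

    #define jiterator_stringify(...) std::string(#__VA_ARGS__);

    // The macro supplies the trailing semicolon itself, so none is written here.
    // `kernel_src` holds the literal text of the code, which can later be handed
    // to a runtime compiler such as NVRTC/hipRTC.
    const std::string kernel_src = jiterator_stringify(
        template <typename T> T square_ish(T x) { return x * x; }
    )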
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/jiterator_macros.h b/MLPY/Lib/site-packages/torch/include/ATen/jiterator_macros.h
new file mode 100644
index 0000000000000000000000000000000000000000..ccde91c67237707108eb61cc0eea38d0768aa2b5
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/jiterator_macros.h
@@ -0,0 +1,38 @@
+#pragma once
+#include <c10/macros/Macros.h>
+#include <string>
+
+#define JITERATOR_HOST_DEVICE C10_HOST_DEVICE
+#if defined(_MSC_VER) && defined(__CUDACC__)
+// NVRTC on Windows errors if __host__ __device__ attribute is
+// present on kernel.
+// error: attribute "__host__" does not apply here
+// error: attribute "__device__" does not apply here
+#define JITERATOR_HOST_DEVICE
+#endif
+
+// jiterator_also_stringify_as macro is used to define code (for CPU/ROCm)
+// and generate code string for `jiterator` (only when compiling for CUDA).
+// Usage :
+//      jiterator_also_stringify_as(
+//          jiterator_code(template <typename T> T identity(T x) { return x; }),
+//          identity_string);
+// This will define the template `identity` as present in code and
+// also define `std::string identity_string` with the code as the string
+// if this is being compiled for CUDA.
+
+// `jiterator_code` macro is to deal with `,` in the kernel code.
+// These `,`s confuse the preprocessor into thinking we are passing
+// multiple arguments to the macro.
+#define jiterator_code(...) __VA_ARGS__
+#if defined(__CUDACC__) || defined(__HIPCC__)
+// CPU and CUDA and ROCm case
+#define stringify_code(...) #__VA_ARGS__
+#define jiterator_also_stringify_as(code, str_name) \
+  code /* define the function */                    \
+      const std::string str_name = std::string(stringify_code(code));
+#else
+// CPU only or CPU and ROCm case
+// Only needs the function
+#define jiterator_also_stringify_as(code, str_name) code
+#endif
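As a rough illustration of the two branches above (a sketch, not verbatim preprocessor output): under nvcc/hipcc the macro both defines the function and captures its source as a string, while a plain CPU build keeps only the function.

    #include <string>

    // Hypothetical invocation:
    //   jiterator_also_stringify_as(
    //       jiterator_code(template <typename T> T identity(T x) { return x; }),
    //       identity_string);
    //
    // When __CUDACC__ or __HIPCC__ is defined, that is roughly equivalent to:
    template <typename T> T identity(T x) { return x; }
    const std::string identity_string =
        std::string("template <typename T> T identity(T x) { return x; }");
    // In a plain CPU build only the function definition is emitted.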
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/miopen/Descriptors.h b/MLPY/Lib/site-packages/torch/include/ATen/miopen/Descriptors.h
new file mode 100644
index 0000000000000000000000000000000000000000..b66cb9b8720260f7e8faeca08cdfbc4e8f704100
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/miopen/Descriptors.h
@@ -0,0 +1,146 @@
+#pragma once
+
+#include <ATen/miopen/Exceptions.h>
+
+#include <ATen/miopen/miopen-wrapper.h>
+#include <ATen/core/Tensor.h>
+#include <ATen/TensorUtils.h>
+
+namespace at { namespace native {
+
+inline int dataSize(miopenDataType_t dataType)
+{
+  switch (dataType) {
+    case miopenHalf: return 2;
+    case miopenFloat: return 4;
+    case miopenBFloat16: return 2;
+    default: return 8;
+  }
+}
+
+template <typename T, miopenStatus_t (*dtor)(T*)>
+struct DescriptorDeleter {
+  void operator()(T* x) {
+    if (x != nullptr) {
+      MIOPEN_CHECK(dtor(x));
+    }
+  }
+};
+
+// A generic class for wrapping MIOpen descriptor types.  All you need
+// is to give the underlying type the Descriptor_t points to (usually,
+// if it's miopenTensorDescriptor_t it points to miopenTensorStruct),
+// the constructor and the destructor.  Subclasses are responsible
+// for defining a set() function to actually set the descriptor.
+//
+// Descriptors default construct to a nullptr, and have a descriptor
+// initialized the first time you call set() or any other initializing
+// function.
+template <typename T, miopenStatus_t (*ctor)(T**), miopenStatus_t (*dtor)(T*)>
+class Descriptor
+{
+public:
+  // Use desc() to access the underlying descriptor pointer in
+  // a read-only fashion.  Most client code should use this.
+  // If the descriptor was never initialized, this will return
+  // nullptr.
+  T* desc() const { return desc_.get(); }
+  T* desc() { return desc_.get(); }
+
+  // Use mut_desc() to access the underlying descriptor pointer
+  // if you intend to modify what it points to (e.g., using
+  // miopenSetFooDescriptor).  This will ensure that the descriptor
+  // is initialized.  Code in this file will use this function.
+  T* mut_desc() { init(); return desc_.get(); }
+protected:
+  void init() {
+    if (desc_ == nullptr) {
+      T* raw_desc;
+      MIOPEN_CHECK(ctor(&raw_desc));
+      desc_.reset(raw_desc);
+    }
+  }
+private:
+  std::unique_ptr<T, DescriptorDeleter<T, dtor>> desc_;
+};
+
+class TensorDescriptor
+  : public Descriptor<miopenTensorDescriptor, &miopenCreateTensorDescriptor, &miopenDestroyTensorDescriptor>
+{
+public:
+  TensorDescriptor() {}
+  explicit TensorDescriptor(const at::Tensor &t, size_t pad = 0) {
+    set(t, pad);
+  }
+
+  void set(const at::Tensor &t, size_t pad = 0);
+  void set(miopenDataType_t dataType, IntArrayRef sizes, IntArrayRef strides, size_t pad = 0);
+
+  void print();
+
+private:
+  void set(miopenDataType_t dataType, int dim, int* size, int* stride) {
+    MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, stride));
+  }
+};
+
+std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d);
+
+class FilterDescriptor
+  : public Descriptor<miopenTensorDescriptor, &miopenCreateTensorDescriptor, &miopenDestroyTensorDescriptor>
+{
+ public:
+  void set(const at::Tensor &t, int64_t pad = 0) {
+    set(t, at::MemoryFormat::Contiguous, pad);
+  }
+
+  void set(const at::Tensor &t, const at::MemoryFormat memory_format, int64_t pad = 0);
+
+private:
+  void set(miopenDataType_t dataType, int dim, int* size, int* stride) {
+    MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, stride));
+  }
+};
+
+struct ConvolutionDescriptor
+  : public Descriptor<miopenConvolutionDescriptor, &miopenCreateConvolutionDescriptor, &miopenDestroyConvolutionDescriptor>
+{
+  void set(miopenDataType_t dataType, miopenConvolutionMode_t c_mode,  int dim, int* pad, int* stride, int * upscale /* aka dilation */, int groups, bool deterministic) {
+    MIOPEN_CHECK(miopenInitConvolutionNdDescriptor(mut_desc(), dim, pad, stride, upscale, c_mode));
+    MIOPEN_CHECK(miopenSetConvolutionGroupCount(mut_desc(), groups));
+    MIOPEN_CHECK(miopenSetConvolutionAttribute(mut_desc(), MIOPEN_CONVOLUTION_ATTRIB_DETERMINISTIC, deterministic ? 1 : 0));
+  }
+};
+
+
+struct RNNDescriptor
+  : public Descriptor<miopenRNNDescriptor, &miopenCreateRNNDescriptor, &miopenDestroyRNNDescriptor>
+{
+    void set(int64_t hidden_size, int64_t num_layers, miopenRNNInputMode_t input_mode, miopenRNNDirectionMode_t direction, miopenRNNMode_t rnn_mode,
+              miopenRNNBiasMode_t bias_mode, miopenRNNAlgo_t algorithm, miopenDataType_t datatype) {
+      MIOPEN_CHECK(miopenSetRNNDescriptor(mut_desc(), hidden_size, num_layers, input_mode, direction, rnn_mode, bias_mode, algorithm, datatype));
+    }
+};
+
+union Constant
+{
+  float f;
+  double d;
+  Constant(miopenDataType_t dataType, double value) {
+    if (dataType == miopenHalf || dataType == miopenFloat || dataType == miopenBFloat16) {
+      f = static_cast<float>(value);
+    } else {
+      d = value;
+    }
+  }
+};
+
+}}  // namespace
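A hedged sketch of how these wrappers are typically driven (assuming a ROCm build where the MIOpen headers are available); the input tensor is illustrative only.

    #include <ATen/ATen.h>
    #include <ATen/miopen/Descriptors.h>

    void describeInput(const at::Tensor& hip_tensor) {
      // TensorDescriptor::set() reads dtype/sizes/strides from the tensor and
      // forwards them to miopenSetTensorDescriptor through mut_desc().
      at::native::TensorDescriptor desc;
      desc.set(hip_tensor, /*pad=*/4);  // pad the dimension count, cuDNN-descriptor style
      // desc.desc() now returns an initialized descriptor that can be passed to
      // MIOpen convolution/RNN calls; it is destroyed automatically by the deleter.
    }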
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/miopen/Exceptions.h b/MLPY/Lib/site-packages/torch/include/ATen/miopen/Exceptions.h
new file mode 100644
index 0000000000000000000000000000000000000000..044ae3222aa83e512c796fc2b903b2a111285015
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/miopen/Exceptions.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include <ATen/miopen/miopen-wrapper.h>
+#include <string>
+#include <stdexcept>
+#include <sstream>
+
+namespace at { namespace native {
+
+class miopen_exception : public std::runtime_error {
+public:
+  miopenStatus_t status;
+  miopen_exception(miopenStatus_t status, const char* msg)
+      : std::runtime_error(msg)
+      , status(status) {}
+  miopen_exception(miopenStatus_t status, const std::string& msg)
+      : std::runtime_error(msg)
+      , status(status) {}
+};
+
+inline void MIOPEN_CHECK(miopenStatus_t status)
+{
+  if (status != miopenStatusSuccess) {
+    if (status == miopenStatusNotImplemented) {
+        throw miopen_exception(status, std::string(miopenGetErrorString(status)) +
+                ". This error may appear if you passed in a non-contiguous input.");
+    }
+    throw miopen_exception(status, miopenGetErrorString(status));
+  }
+}
+
+inline void HIP_CHECK(hipError_t error)
+{
+  if (error != hipSuccess) {
+    std::string msg("HIP error: ");
+    msg += hipGetErrorString(error);
+    throw std::runtime_error(msg);
+  }
+}
+
+}} // namespace at::native
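A small sketch of the intended call pattern: every raw MIOpen call is wrapped so that a non-success status surfaces as an exception. The surrounding function is hypothetical; the MIOpen descriptor API calls are real.

    #include <ATen/miopen/Exceptions.h>

    void createAndDestroyDescriptor() {
      miopenTensorDescriptor_t raw = nullptr;
      // A failing status becomes a miopen_exception carrying both the status
      // code and the miopenGetErrorString() text.
      at::native::MIOPEN_CHECK(miopenCreateTensorDescriptor(&raw));
      at::native::MIOPEN_CHECK(miopenDestroyTensorDescriptor(raw));
    }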
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/miopen/Handle.h b/MLPY/Lib/site-packages/torch/include/ATen/miopen/Handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..8307827d5bfd33c4173c8e14d6e91031e6f1adf9
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/miopen/Handle.h
@@ -0,0 +1,9 @@
+#pragma once
+
+#include <ATen/miopen/miopen-wrapper.h>
+
+namespace at { namespace native {
+
+miopenHandle_t getMiopenHandle();
+
+}} // namespace
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/miopen/Types.h b/MLPY/Lib/site-packages/torch/include/ATen/miopen/Types.h
new file mode 100644
index 0000000000000000000000000000000000000000..74121cbb9e62f9f974db4fd43575554c89ac8df0
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/miopen/Types.h
@@ -0,0 +1,12 @@
+#pragma once
+
+#include <ATen/miopen/miopen-wrapper.h>
+#include <ATen/core/Tensor.h>
+
+namespace at { namespace native {
+
+miopenDataType_t getMiopenDataType(const at::Tensor& tensor);
+
+int64_t miopen_version();
+
+}}  // namespace at::miopen
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/miopen/Utils.h b/MLPY/Lib/site-packages/torch/include/ATen/miopen/Utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..30f8e228165664c6e358838df3c26d4074ccd173
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/miopen/Utils.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+#include <ATen/miopen/miopen-wrapper.h>
+#include <ATen/miopen/Handle.h>
+
+namespace at { namespace native {
+
+// This function makes tensors which have zero stride contiguous, by
+// setting the strides to 1.
+inline Tensor contiguousIfZeroInStrides(const Tensor& t) {
+  for (auto s : t.strides()) {
+    if (s == 0) return t.contiguous();
+  }
+  return t;
+}
+
+}}
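For clarity, a short usage sketch: a tensor produced by expand() has stride 0 in the broadcast dimension, which MIOpen cannot consume, so the helper materializes a contiguous copy only in that case. The tensor shapes are illustrative.

    #include <ATen/ATen.h>
    #include <ATen/miopen/Utils.h>

    void example() {
      at::Tensor base = at::randn({1, 3});
      at::Tensor broadcast = base.expand({4, 3});  // strides {0, 1}: contains a zero stride
      at::Tensor safe = at::native::contiguousIfZeroInStrides(broadcast);
      // `safe` is a contiguous copy; a tensor with no zero strides is returned as-is.
    }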
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/miopen/miopen-wrapper.h b/MLPY/Lib/site-packages/torch/include/ATen/miopen/miopen-wrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..202e189ef6db3456c3a46e88f9cf753459e2ae0d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/miopen/miopen-wrapper.h
@@ -0,0 +1,3 @@
+#pragma once
+
+#include <miopen/miopen.h>
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/mps/EmptyTensor.h b/MLPY/Lib/site-packages/torch/include/ATen/mps/EmptyTensor.h
new file mode 100644
index 0000000000000000000000000000000000000000..0256d2f0e25a5ee8ceb3187f64c4bc1d58043708
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/mps/EmptyTensor.h
@@ -0,0 +1,29 @@
+//  Copyright © 2022 Apple Inc.
+
+#pragma once
+#include <ATen/core/TensorBase.h>
+
+namespace at::detail {
+
+C10_EXPORT TensorBase empty_mps(
+    IntArrayRef size,
+    c10::optional<ScalarType> dtype_opt,
+    c10::optional<Layout> layout_opt,
+    c10::optional<Device> device_opt,
+    c10::optional<bool> pin_memory_opt,
+    c10::optional<c10::MemoryFormat> memory_format_opt);
+C10_EXPORT TensorBase empty_mps(
+    IntArrayRef size, const TensorOptions &options);
+
+C10_EXPORT TensorBase empty_strided_mps(
+    IntArrayRef size,
+    IntArrayRef stride,
+    ScalarType dtype,
+    c10::optional<Device> device_opt);
+
+C10_EXPORT TensorBase empty_strided_mps(
+    IntArrayRef size,
+    IntArrayRef stride,
+    const TensorOptions &options);
+
+} // namespace at::detail
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/mps/IndexKernels.h b/MLPY/Lib/site-packages/torch/include/ATen/mps/IndexKernels.h
new file mode 100644
index 0000000000000000000000000000000000000000..d52c90e71f8b2ea7febcf8aa79a2a4ea9261c01b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/mps/IndexKernels.h
@@ -0,0 +1,630 @@
+#pragma once
+
+namespace at::mps {
+
+static const char * indexing_metal_shaders = R"INDEX_METAL(
+#include <metal_stdlib>
+#include <metal_atomic>
+
+using namespace metal;
+
+#if __METAL_VERSION__ < 300
+struct IndexAB {
+    // Allow up to 16 indices
+    metal::array<constant void *, 16>  indexArray [[ id(0) ]];
+};
+#else
+struct IndexAB {
+    constant int64_t* indexArray;
+};
+
+#endif
+
+template<typename T, typename OffsetsT>
+kernel void index_select(
+#if __METAL_VERSION__ >= 300
+    constant IndexAB  * indexAB           [[buffer(0)]],
+#else
+    constant IndexAB  & indexAB           [[buffer(0)]],
+#endif
+    constant void     * indexSizes        [[buffer(1)]],
+    constant void     * indexStrides      [[buffer(2)]],
+    constant OffsetsT * offsets           [[buffer(3)]],
+    constant void     * inputData         [[buffer(4)]],
+    device   void     * outputData        [[buffer(5)]],
+    constant uint32_t & num_indices       [[buffer(6)]],
+    uint thread_index [[thread_position_in_grid]]) {
+    constant int64_t * index_sizes   = (constant int64_t *)indexSizes;
+    constant int64_t * index_strides = (constant int64_t *)indexStrides;
+    int64_t offset = 0;
+    for (uint32_t i = 0; i < num_indices; i++) {
+#if __METAL_VERSION__ >= 300
+        constant int64_t* indexArray = indexAB[i].indexArray;
+#else
+        constant int64_t* indexArray = (constant int64_t*)indexAB.indexArray[i];
+#endif
+        int64_t index = indexArray[offsets[thread_index].z / sizeof(int64_t)];
+        if (index < 0) {
+            index += index_sizes[i];
+        }
+        offset += index * index_strides[i];
+     }
+    device T * out = (device T*)((device char*)outputData + offsets[thread_index].x);
+    constant T * in  = (constant T*)((constant char*)inputData  + offsets[thread_index].y + offset);
+    *out = *in;
+}
+
+template<typename T, typename OffsetsT>
+void index_put_impl(
+#if __METAL_VERSION__ >= 300
+    constant IndexAB  * indexAB,
+#else
+    constant IndexAB  & indexAB,
+#endif
+    constant int64_t  * index_sizes,
+    constant int64_t  * index_strides,
+    constant OffsetsT * offsets,
+    constant void     * inputData,
+    device   void     * outputData,
+    constant uint32_t & num_indices,
+    uint thread_index) {
+    int64_t offset = 0;
+    for (uint32_t i = 0; i < num_indices; i++) {
+#if __METAL_VERSION__ >= 300
+        constant int64_t* indexArray = indexAB[i].indexArray;
+#else
+        constant int64_t* indexArray = (constant int64_t*)indexAB.indexArray[i];
+#endif
+        int64_t index = indexArray[offsets[thread_index].z / sizeof(int64_t)];
+
+        if (index < 0) {
+            index += index_sizes[i];
+        }
+        offset += index * index_strides[i];
+    }
+    device T * out = (device T*)((device char*)outputData + offsets[thread_index].x + offset);
+    constant T * in  = (constant T*)((constant char*)inputData  + offsets[thread_index].y);
+    *out = *in;
+}
+
+template<typename T, typename OffsetsT>
+kernel void index_put_serial(
+#if __METAL_VERSION__ >= 300
+    constant IndexAB  * indexAB           [[buffer(0)]],
+#else
+    constant IndexAB  & indexAB           [[buffer(0)]],
+#endif
+    constant void     * indexSizes        [[buffer(1)]],
+    constant void     * indexStrides      [[buffer(2)]],
+    constant OffsetsT * offsets           [[buffer(3)]],
+    constant void     * inputData         [[buffer(4)]],
+    device   void     * outputData        [[buffer(5)]],
+    constant uint32_t & num_indices       [[buffer(6)]],
+    constant uint     * numIters          [[buffer(7)]],
+    uint thread_index [[thread_position_in_grid]]) {
+
+    constant int64_t * index_sizes   = (constant int64_t *)indexSizes;
+    constant int64_t * index_strides = (constant int64_t *)indexStrides;
+
+    for (uint iter_i = 0; iter_i < *numIters; iter_i++) {
+        index_put_impl<T>(indexAB, index_sizes, index_strides, offsets, inputData, outputData, num_indices, iter_i);
+    }
+}
+
+template<typename T, typename OffsetsT>
+kernel void index_put(
+#if __METAL_VERSION__ >= 300
+    constant IndexAB  * indexAB           [[buffer(0)]],
+#else
+    constant IndexAB  & indexAB           [[buffer(0)]],
+#endif
+    constant void     * indexSizes        [[buffer(1)]],
+    constant void     * indexStrides      [[buffer(2)]],
+    constant OffsetsT * offsets           [[buffer(3)]],
+    constant void     * inputData         [[buffer(4)]],
+    device   void     * outputData        [[buffer(5)]],
+    constant uint32_t & num_indices       [[buffer(6)]],
+    uint thread_index [[thread_position_in_grid]]) {
+
+    constant int64_t * index_sizes   = (constant int64_t *)indexSizes;
+    constant int64_t * index_strides = (constant int64_t *)indexStrides;
+    index_put_impl<T>(indexAB, index_sizes, index_strides, offsets, inputData, outputData, num_indices, thread_index);
+}
+
+#if __METAL_VERSION__ < 300
+#define REGISTER_INDEX_OP(DTYPE_SIZE, IDX_SIZE, DTYPE, INDEX_OP_TYPE, IDX_DTYPE)   \
+template                                                                           \
+[[host_name("index_" #INDEX_OP_TYPE "_" #DTYPE_SIZE "_" #IDX_SIZE)]]               \
+kernel void index_ ## INDEX_OP_TYPE<DTYPE, IDX_DTYPE>(                             \
+    constant IndexAB & indexAB           [[buffer(0)]],                            \
+    constant void    * indexSizes        [[buffer(1)]],                            \
+    constant void    * indexStrides      [[buffer(2)]],                            \
+    constant IDX_DTYPE   * offsets           [[buffer(3)]],                        \
+    constant void    * inputData         [[buffer(4)]],                            \
+    device   void    * outputData        [[buffer(5)]],                            \
+    constant uint32_t & num_indices      [[buffer(6)]],                            \
+    uint thread_index [[thread_position_in_grid]]);
+#else
+#define REGISTER_INDEX_OP(DTYPE_SIZE, IDX_SIZE, DTYPE, INDEX_OP_TYPE, IDX_DTYPE)   \
+template                                                                           \
+[[host_name("index_" #INDEX_OP_TYPE "_" #DTYPE_SIZE "_" #IDX_SIZE)]]               \
+kernel void index_ ## INDEX_OP_TYPE<DTYPE, IDX_DTYPE>(                             \
+    constant IndexAB * indexAB           [[buffer(0)]],                            \
+    constant void    * indexSizes        [[buffer(1)]],                            \
+    constant void    * indexStrides      [[buffer(2)]],                            \
+    constant IDX_DTYPE   * offsets           [[buffer(3)]],                        \
+    constant void    * inputData         [[buffer(4)]],                            \
+    device   void    * outputData        [[buffer(5)]],                            \
+    constant uint32_t & num_indices      [[buffer(6)]],                            \
+    uint thread_index [[thread_position_in_grid]]);
+#endif
+
+#define REGISTER_INDEX_OP_ALL_DTYPES(INDEX_OP_TYPE)     \
+    REGISTER_INDEX_OP(8bit,  idx32, char,  INDEX_OP_TYPE, uint3);     \
+    REGISTER_INDEX_OP(8bit,  idx64, char,  INDEX_OP_TYPE, ulong3);    \
+    REGISTER_INDEX_OP(16bit, idx32, short, INDEX_OP_TYPE, uint3);     \
+    REGISTER_INDEX_OP(16bit, idx64, short, INDEX_OP_TYPE, ulong3);    \
+    REGISTER_INDEX_OP(32bit, idx32, int,   INDEX_OP_TYPE, uint3);     \
+    REGISTER_INDEX_OP(32bit, idx64, int,   INDEX_OP_TYPE, ulong3);    \
+    REGISTER_INDEX_OP(64bit, idx32, long,  INDEX_OP_TYPE, uint3);     \
+    REGISTER_INDEX_OP(64bit, idx64, long,  INDEX_OP_TYPE, ulong3);
+
+REGISTER_INDEX_OP_ALL_DTYPES(select);
+REGISTER_INDEX_OP_ALL_DTYPES(put);
+
+#if __METAL_VERSION__ < 300
+#define REGISTER_SINGLE_THREADED_INDEX_OP(DTYPE_SIZE, IDX_SIZE, DTYPE, INDEX_OP_TYPE, IDX_DTYPE)   \
+template                                                                                           \
+[[host_name("index_" #INDEX_OP_TYPE "_" #DTYPE_SIZE "_" #IDX_SIZE)]]                               \
+kernel void index_ ## INDEX_OP_TYPE<DTYPE, IDX_DTYPE>(                                             \
+    constant IndexAB   & indexAB           [[buffer(0)]],                                          \
+    constant void      * indexSizes        [[buffer(1)]],                                          \
+    constant void      * indexStrides      [[buffer(2)]],                                          \
+    constant IDX_DTYPE * offsets           [[buffer(3)]],                                          \
+    constant void      * inputData         [[buffer(4)]],                                          \
+    device   void      * outputData        [[buffer(5)]],                                          \
+    constant uint32_t  & num_indices       [[buffer(6)]],                                          \
+    constant uint      * numIters          [[buffer(7)]],                                          \
+    uint thread_index [[thread_position_in_grid]]);
+#else
+#define REGISTER_SINGLE_THREADED_INDEX_OP(DTYPE_SIZE, IDX_SIZE, DTYPE, INDEX_OP_TYPE, IDX_DTYPE)   \
+template                                                                                           \
+[[host_name("index_" #INDEX_OP_TYPE "_" #DTYPE_SIZE "_" #IDX_SIZE)]]                               \
+kernel void index_ ## INDEX_OP_TYPE<DTYPE, IDX_DTYPE>(                                             \
+    constant IndexAB   * indexAB           [[buffer(0)]],                                          \
+    constant void      * indexSizes        [[buffer(1)]],                                          \
+    constant void      * indexStrides      [[buffer(2)]],                                          \
+    constant IDX_DTYPE * offsets           [[buffer(3)]],                                          \
+    constant void      * inputData         [[buffer(4)]],                                          \
+    device   void      * outputData        [[buffer(5)]],                                          \
+    constant uint32_t  & num_indices       [[buffer(6)]],                                          \
+    constant uint      * numIters          [[buffer(7)]],                                          \
+    uint thread_index [[thread_position_in_grid]]);
+#endif
+
+#define REGISTER_SINGLE_THREADED_INDEX_OP_ALL_DTYPES(INDEX_OP_TYPE)                   \
+    REGISTER_SINGLE_THREADED_INDEX_OP(8bit,  idx32, char,  INDEX_OP_TYPE, uint3);     \
+    REGISTER_SINGLE_THREADED_INDEX_OP(8bit,  idx64, char,  INDEX_OP_TYPE, ulong3);    \
+    REGISTER_SINGLE_THREADED_INDEX_OP(16bit, idx32, short, INDEX_OP_TYPE, uint3);     \
+    REGISTER_SINGLE_THREADED_INDEX_OP(16bit, idx64, short, INDEX_OP_TYPE, ulong3);    \
+    REGISTER_SINGLE_THREADED_INDEX_OP(32bit, idx32, int,   INDEX_OP_TYPE, uint3);     \
+    REGISTER_SINGLE_THREADED_INDEX_OP(32bit, idx64, int,   INDEX_OP_TYPE, ulong3);    \
+    REGISTER_SINGLE_THREADED_INDEX_OP(64bit, idx32, long,  INDEX_OP_TYPE, uint3);     \
+    REGISTER_SINGLE_THREADED_INDEX_OP(64bit, idx64, long,  INDEX_OP_TYPE, ulong3);
+
+REGISTER_SINGLE_THREADED_INDEX_OP_ALL_DTYPES(put_serial);
+
+template <typename StridesT, typename DataT>
+kernel void kernel_index_offsets(constant StridesT * strides         [[buffer(0)]],
+                                device DataT      * data_offsets    [[buffer(1)]],
+                                constant uint     * iter_shape      [[buffer(2)]],
+                                constant uint     & num_dimensions  [[buffer(3)]],
+                                uint thread_index [[thread_position_in_grid]]) {
+    data_offsets[thread_index] = 0;
+    uint32_t idx = thread_index;
+    for (uint32_t dim = 0; dim < num_dimensions; dim++) {
+        uint32_t remainder = idx % iter_shape[dim];
+        idx /= iter_shape[dim];
+
+        data_offsets[thread_index] += remainder * DataT(strides[dim]);
+    }
+}
+
+template
+[[host_name("kernel_index_offsets_32")]]
+kernel void kernel_index_offsets<packed_uint3, uint3>(
+                constant packed_uint3 * strides         [[buffer(0)]],
+                device uint3          * data_offsets    [[buffer(1)]],
+                constant uint         * iter_shape      [[buffer(2)]],
+                constant uint         & num_dimensions  [[buffer(3)]],
+                uint thread_index [[thread_position_in_grid]]);
+
+template
+[[host_name("kernel_index_offsets_64")]]
+kernel void kernel_index_offsets<packed_uint3, ulong3>(
+                constant packed_uint3 * strides         [[buffer(0)]],
+                device ulong3          * data_offsets    [[buffer(1)]],
+                constant uint         * iter_shape      [[buffer(2)]],
+                constant uint         & num_dimensions  [[buffer(3)]],
+                uint thread_index [[thread_position_in_grid]]);
+
+template<typename T, typename E, typename OffsetsT>
+kernel void index_put_accumulate_native_dtypes(
+#if __METAL_VERSION__ >= 300
+    constant IndexAB  * indexAB     [[buffer(0)]],
+#else
+    constant IndexAB  & indexAB     [[buffer(0)]],
+#endif
+    constant void     * indexSizes   [[buffer(1)]],
+    constant void     * indexStrides [[buffer(2)]],
+    constant OffsetsT * offsets      [[buffer(3)]],
+    constant void     * inputData    [[buffer(4)]],
+    device void       * outputData   [[buffer(5)]],
+    constant uint32_t & num_indices  [[buffer(6)]],
+    uint thread_index [[thread_position_in_grid]]) {
+    constant int64_t * index_sizes   = (constant int64_t *)indexSizes;
+    constant int64_t * index_strides = (constant int64_t *)indexStrides;
+    int64_t offset = 0;
+    for (uint32_t i = 0; i < num_indices; i++) {
+#if __METAL_VERSION__ >= 300
+        constant int64_t* indexArray = indexAB[i].indexArray;
+#else
+        constant int64_t* indexArray = (constant int64_t*)indexAB.indexArray[i];
+#endif
+        int64_t index = indexArray[offsets[thread_index].z / sizeof(int64_t)];
+        if (index < 0) {
+            index += index_sizes[i];
+        }
+        offset += index * index_strides[i];
+    }
+    device T * out = (device T*)((device char*)outputData + offsets[thread_index].x + offset);
+    constant E * in  = (constant E*)((constant char*)inputData  + offsets[thread_index].y);
+    atomic_fetch_add_explicit(out, *in, memory_order_relaxed);
+}
+
+template <typename T>
+__attribute__((__always_inline__)) void atomic_fetch_add_relaxed(device void * addr, T value) {
+    device atomic_uint* uintAddr = (device atomic_uint*)addr;
+    uint expected = atomic_load_explicit(uintAddr, memory_order_relaxed);
+    T updated = as_type<T>(expected) + value;
+    while (!atomic_compare_exchange_weak_explicit(uintAddr, &expected, as_type<uint>(updated), memory_order_relaxed, memory_order_relaxed)) {
+        updated = as_type<T>(expected) + value;
+    }
+}
+
+template<typename T, typename OffsetsT>
+kernel void atomic_index_put_accumulate(
+#if __METAL_VERSION__ >= 300
+    constant IndexAB  * indexAB           [[buffer(0)]],
+#else
+    constant IndexAB  & indexAB           [[buffer(0)]],
+#endif
+    constant void     * indexSizes        [[buffer(1)]],
+    constant void     * indexStrides      [[buffer(2)]],
+    constant OffsetsT * offsets           [[buffer(3)]],
+    constant void     * inputData         [[buffer(4)]],
+    device   void     * outputData        [[buffer(5)]],
+    constant uint32_t & num_indices       [[buffer(6)]],
+    uint thread_index [[thread_position_in_grid]]) {
+    constant int64_t * index_sizes   = (constant int64_t *)indexSizes;
+    constant int64_t * index_strides = (constant int64_t *)indexStrides;
+    int64_t offset = 0;
+    for (uint32_t i = 0; i < num_indices; i++) {
+#if __METAL_VERSION__ >= 300
+        constant int64_t* indexArray = indexAB[i].indexArray;
+#else
+        constant int64_t* indexArray = (constant int64_t*)indexAB.indexArray[i];
+#endif
+        int64_t index = indexArray[offsets[thread_index].z / sizeof(int64_t)];
+        if (index < 0) {
+            index += index_sizes[i];
+        }
+        offset += index * index_strides[i];
+    }
+    device void * out = (device void*)((device char*)outputData + offsets[thread_index].x + offset);
+    constant T  * in  = (constant T*)((constant char*)inputData + offsets[thread_index].y);
+    atomic_fetch_add_relaxed(out, *in);
+}
+
+template
+[[host_name("index_put_accumulate_32bit_float_idx32")]]
+kernel void atomic_index_put_accumulate<float, uint3>(
+#if __METAL_VERSION__ >= 300
+    constant IndexAB  * indexAB     [[buffer(0)]],
+#else
+    constant IndexAB  & indexAB     [[buffer(0)]],
+#endif
+    constant void     * indexSizes   [[buffer(1)]],
+    constant void     * indexStrides [[buffer(2)]],
+    constant uint3    * offsets      [[buffer(3)]],
+    constant void     * inputData    [[buffer(4)]],
+    device   void     * outputData   [[buffer(5)]],
+    constant uint32_t & num_indices  [[buffer(6)]],
+    uint thread_index [[thread_position_in_grid]]);
+
+template
+[[host_name("index_put_accumulate_32bit_float_idx64")]]
+kernel void atomic_index_put_accumulate<float, ulong3>(
+#if __METAL_VERSION__ >= 300
+    constant IndexAB  * indexAB     [[buffer(0)]],
+#else
+    constant IndexAB  & indexAB     [[buffer(0)]],
+#endif
+    constant void     * indexSizes   [[buffer(1)]],
+    constant void     * indexStrides [[buffer(2)]],
+    constant ulong3   * offsets      [[buffer(3)]],
+    constant void     * inputData    [[buffer(4)]],
+    device   void     * outputData   [[buffer(5)]],
+    constant uint32_t & num_indices  [[buffer(6)]],
+    uint thread_index [[thread_position_in_grid]]);
+
+template
+[[host_name("index_put_accumulate_32bit_int_idx32")]]
+kernel void index_put_accumulate_native_dtypes<atomic_int, int, uint3>(
+#if __METAL_VERSION__ >= 300
+    constant IndexAB  * indexAB     [[buffer(0)]],
+#else
+    constant IndexAB  & indexAB     [[buffer(0)]],
+#endif
+    constant void     * indexSizes   [[buffer(1)]],
+    constant void     * indexStrides [[buffer(2)]],
+    constant uint3    * offsets      [[buffer(3)]],
+    constant void     * inputData    [[buffer(4)]],
+    device   void     * outputData   [[buffer(5)]],
+    constant uint32_t & num_indices [[buffer(6)]],
+    uint thread_index [[thread_position_in_grid]]);
+
+template
+[[host_name("index_put_accumulate_32bit_int_idx64")]]
+kernel void index_put_accumulate_native_dtypes<atomic_int, int, ulong3>(
+#if __METAL_VERSION__ >= 300
+    constant IndexAB  * indexAB     [[buffer(0)]],
+#else
+    constant IndexAB  & indexAB     [[buffer(0)]],
+#endif
+    constant void     * indexSizes   [[buffer(1)]],
+    constant void     * indexStrides [[buffer(2)]],
+    constant ulong3   * offsets      [[buffer(3)]],
+    constant void     * inputData    [[buffer(4)]],
+    device   void     * outputData   [[buffer(5)]],
+    constant uint32_t & num_indices [[buffer(6)]],
+    uint thread_index [[thread_position_in_grid]]);
+)INDEX_METAL";
+
+static const char *SCATTER_OPS_TEMPLATE = R"METAL_SCATTER(
+struct __attribute__ ((packed)) packed_uint5{{
+  uint32_t x; uint32_t y; uint32_t z; uint32_t w; uint32_t u;
+}};
+
+template<typename Y, typename X>
+Y cast(const X x);
+
+template<>
+{1} cast<{1}, {0}>(const {0} x) {{
+ return {2};
+}}
+
+kernel void scatter_kernel_5(uint linear_index              [[thread_position_in_grid]],
+                             constant void * src_           [[buffer(0)]],
+                             device void * dst_             [[buffer(1)]],
+                             constant packed_uint5 & size   [[buffer(2)]],
+                             constant packed_uint5 & stride [[buffer(3)]],
+                             constant uint32_t & numel      [[buffer(4)]]) {{
+    if (linear_index >= numel) return;
+
+    constant {0} * src = (constant {0} *)src_;
+    device {1} * dst = (device {1} *)dst_;
+
+    packed_uint5 local_index;
+    local_index.x = linear_index / (size.u * size.w * size.z * size.y) % size.x;
+    local_index.y = linear_index / (size.u * size.w * size.z) % size.y;
+    local_index.z = linear_index / (size.u * size.w) % size.z;
+    local_index.w = linear_index / size.u % size.w;
+    local_index.u = linear_index % size.u;
+
+    packed_uint5 strided_index;
+    strided_index.x = local_index.x * stride.x;
+    strided_index.y = local_index.y * stride.y;
+    strided_index.z = local_index.z * stride.z;
+    strided_index.w = local_index.w * stride.w;
+    strided_index.u = local_index.u * stride.u;
+
+    dst[strided_index.x + strided_index.y + strided_index.z + strided_index.w + strided_index.u] = cast<{1}>(src[linear_index]);
+}}
+
+kernel void scatter_kernel_4(uint linear_index              [[thread_position_in_grid]],
+                             constant void * src_           [[buffer(0)]],
+                             device void * dst_             [[buffer(1)]],
+                             constant packed_uint4 & size   [[buffer(2)]],
+                             constant packed_uint4 & stride [[buffer(3)]],
+                             constant uint32_t & numel      [[buffer(4)]]) {{
+    if (linear_index >= numel) return;
+
+    constant {0} * src = (constant {0} *)src_;
+    device {1} * dst = (device {1} *)dst_;
+
+    packed_uint4 local_index;
+    local_index.x = linear_index / (size[3] * size[2] * size[1]) % size[0];
+    local_index.y = linear_index / (size[3] * size[2]) % size[1];
+    local_index.z = linear_index / size[3] % size[2];
+    local_index.w = linear_index % size[3];
+
+    const packed_uint4 strided_index = local_index * stride;
+    dst[strided_index.x + strided_index.y + strided_index.z + strided_index.w] = cast<{1}>(src[linear_index]);
+}}
+
+kernel void scatter_kernel_3(uint linear_index              [[thread_position_in_grid]],
+                             constant void * src_           [[buffer(0)]],
+                             device void * dst_             [[buffer(1)]],
+                             constant packed_uint3 & size   [[buffer(2)]],
+                             constant packed_uint3 & stride [[buffer(3)]],
+                             constant uint32_t & numel      [[buffer(4)]]) {{
+    if (linear_index >= numel) return;
+
+    constant {0} * src = (constant {0} *)src_;
+    device {1} * dst = (device {1} *)dst_;
+
+    packed_uint3 local_index;
+    local_index.x = linear_index / (size[2] * size[1]) % size[0];
+    local_index.y = linear_index / size[2] % size[1];
+    local_index.z = linear_index % size[2];
+
+    const packed_uint3 strided_index = local_index * stride;
+    dst[strided_index.x + strided_index.y + strided_index.z] = cast<{1}>(src[linear_index]);
+}}
+
+kernel void scatter_kernel_2(uint linear_index              [[thread_position_in_grid]],
+                             constant void * src_           [[buffer(0)]],
+                             device void * dst_             [[buffer(1)]],
+                             constant packed_uint2 & size   [[buffer(2)]],
+                             constant packed_uint2 & stride [[buffer(3)]],
+                             constant uint32_t & numel      [[buffer(4)]]) {{
+    if (linear_index >= numel) return;
+
+    constant {0} * src = (constant {0} *)src_;
+    device {1} * dst = (device {1} *)dst_;
+
+    packed_uint2 local_index;
+    local_index.x = linear_index / size[1] % size[0];
+    local_index.y = linear_index % size[1];
+
+    const packed_uint2 strided_index = local_index * stride;
+    dst[strided_index.x + strided_index.y] = cast<{1}>(src[linear_index]);
+}}
+
+kernel void scatter_kernel_1(uint linear_index              [[thread_position_in_grid]],
+                             constant void * src_           [[buffer(0)]],
+                             device void * dst_             [[buffer(1)]],
+                             constant int & size            [[buffer(2)]],
+                             constant int & stride          [[buffer(3)]],
+                             constant uint32_t & numel      [[buffer(4)]]) {{
+    if (linear_index >= numel) return;
+
+    constant {0} * src = (constant {0} *)src_;
+    device {1} * dst = (device {1} *)dst_;
+
+    const int local_index = linear_index % size;
+    const int strided_index = local_index * stride;
+    dst[strided_index] = cast<{1}>(src[linear_index]);
+}}
+)METAL_SCATTER";
+
+static const char *GATHER_OPS_TEMPLATE = R"METAL_GATHER(
+struct __attribute__ ((packed)) packed_uint5{{
+  uint32_t x; uint32_t y; uint32_t z; uint32_t w; uint32_t u;
+}};
+
+template<typename Y, typename X>
+Y cast(const X x);
+
+template<>
+{1} cast<{1}, {0}>(const {0} x) {{
+ return {2};
+}}
+
+kernel void gather_kernel_5(uint linear_index               [[thread_position_in_grid]],
+                            constant void * src_            [[buffer(0)]],
+                            device void * dst_              [[buffer(1)]],
+                            constant packed_uint5 & size    [[buffer(2)]],
+                            constant packed_uint5 & stride  [[buffer(3)]],
+                            constant uint32_t & numel       [[buffer(4)]]) {{
+    if (linear_index >= numel) return;
+
+    constant {0} * src = (constant {0} *)src_;
+    device {1} * dst = (device {1} *)dst_;
+
+
+    packed_uint5 local_index;
+    local_index.x = linear_index / (size.u * size.w * size.z * size.y) % size.x;
+    local_index.y = linear_index / (size.u * size.w * size.z) % size.y;
+    local_index.z = linear_index / (size.u * size.w) % size.z;
+    local_index.w = linear_index / size.u % size.w;
+    local_index.u = linear_index % size.u;
+
+    packed_uint5 strided_index;
+    strided_index.x = local_index.x * stride.x;
+    strided_index.y = local_index.y * stride.y;
+    strided_index.z = local_index.z * stride.z;
+    strided_index.w = local_index.w * stride.w;
+    strided_index.u = local_index.u * stride.u;
+
+    dst[linear_index] = cast<{1}>(src[strided_index.x + strided_index.y + strided_index.z + strided_index.w + strided_index.u]);
+}}
+
+kernel void gather_kernel_4(uint linear_index               [[thread_position_in_grid]],
+                            constant void * src_            [[buffer(0)]],
+                            device void * dst_              [[buffer(1)]],
+                            constant packed_uint4 & size    [[buffer(2)]],
+                            constant packed_uint4 & stride  [[buffer(3)]],
+                            constant uint32_t & numel       [[buffer(4)]]) {{
+    if (linear_index >= numel) return;
+
+    constant {0} * src = (constant {0} *)src_;
+    device {1} * dst = (device {1} *)dst_;
+
+    packed_uint4 local_index;
+    local_index.x = linear_index / (size[3] * size[2] * size[1]) % size[0];
+    local_index.y = linear_index / (size[3] * size[2]) % size[1];
+    local_index.z = linear_index / size[3] % size[2];
+    local_index.w = linear_index % size[3];
+
+    const packed_uint4 strided_index = local_index * stride;
+    dst[linear_index] = cast<{1}>(src[strided_index.x + strided_index.y + strided_index.z + strided_index.w]);
+}}
+
+kernel void gather_kernel_3(uint linear_index               [[thread_position_in_grid]],
+                            constant void * src_            [[buffer(0)]],
+                            device void * dst_              [[buffer(1)]],
+                            constant packed_uint3 & size    [[buffer(2)]],
+                            constant packed_uint3 & stride  [[buffer(3)]],
+                            constant uint32_t & numel       [[buffer(4)]]) {{
+    if (linear_index >= numel) return;
+
+    constant {0} * src = (constant {0} *)src_;
+    device {1} * dst = (device {1} *)dst_;
+
+    packed_uint3 local_index;
+    local_index.x = linear_index / (size[2] * size[1]) % size[0];
+    local_index.y = linear_index / size[2] % size[1];
+    local_index.z = linear_index % size[2];
+
+    const packed_uint3 strided_index = local_index * stride;
+    dst[linear_index] = cast<{1}>(src[strided_index.x + strided_index.y + strided_index.z]);
+}}
+
+kernel void gather_kernel_2(uint linear_index               [[thread_position_in_grid]],
+                            constant void * src_            [[buffer(0)]],
+                            device void * dst_              [[buffer(1)]],
+                            constant packed_uint2 & size    [[buffer(2)]],
+                            constant packed_uint2 & stride  [[buffer(3)]],
+                            constant uint32_t & numel       [[buffer(4)]]) {{
+    if (linear_index >= numel) return;
+
+    constant {0} * src = (constant {0} *)src_;
+    device {1} * dst = (device {1} *)dst_;
+
+    packed_uint2 local_index;
+    local_index.x = linear_index / size[1] % size[0];
+    local_index.y = linear_index % size[1];
+
+    const packed_uint2 strided_index = local_index * stride;
+    dst[linear_index] = cast<{1}>(src[strided_index.x + strided_index.y]);
+}}
+
+kernel void gather_kernel_1(uint linear_index               [[thread_position_in_grid]],
+                            constant void * src_            [[buffer(0)]],
+                            device void * dst_              [[buffer(1)]],
+                            constant int & size             [[buffer(2)]],
+                            constant int & stride           [[buffer(3)]],
+                            constant uint32_t & numel       [[buffer(4)]]) {{
+    if (linear_index >= numel) return;
+
+    constant {0} * src = (constant {0} *)src_;
+    device {1} * dst = (device {1} *)dst_;
+
+    const int local_index = linear_index % size;
+    const int strided_index = local_index * stride;
+    dst[linear_index] = cast<{1}>(src[strided_index]);
+}}
+)METAL_GATHER";
+} // namespace at::mps
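The SCATTER_OPS_TEMPLATE / GATHER_OPS_TEMPLATE strings use {0}, {1}, and {2} as placeholders for the source element type, the destination element type, and the cast expression, with literal Metal braces escaped as {{ }}. The host side fills these in before compiling the Metal library at runtime. A hedged host-side sketch of that substitution follows; the fmt-based formatting call and the example type names are illustrative, not necessarily how the real MPS code performs the substitution.

    #include <ATen/mps/IndexKernels.h>
    #include <fmt/format.h>
    #include <string>

    // Illustrative only: produce a Metal source string for a float -> half scatter.
    std::string buildScatterSource(const char* src_type,   // e.g. "float"                 -> {0}
                                   const char* dst_type,   // e.g. "half"                  -> {1}
                                   const char* cast_expr)  // e.g. "static_cast<half>(x)"  -> {2}
    {
      // Only the numbered placeholders are substituted; {{ }} stay as literal braces.
      return fmt::format(fmt::runtime(at::mps::SCATTER_OPS_TEMPLATE),
                         src_type, dst_type, cast_expr);
    }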
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/mps/MPSAllocator.h b/MLPY/Lib/site-packages/torch/include/ATen/mps/MPSAllocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..62bf958a9b95c63669a99200f7b05b262aa09f03
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/mps/MPSAllocator.h
@@ -0,0 +1,401 @@
+//  Copyright © 2022 Apple Inc.
+
+#pragma once
+
+#include <ATen/mps/MPSAllocatorInterface.h>
+#include <ATen/mps/MPSEvent.h>
+#include <ATen/mps/MPSStream.h>
+
+#include <cstdio>
+#include <mutex>
+#include <set>
+#include <unordered_set>
+#include <mach/vm_page_size.h>
+#include <c10/util/flat_hash_map.h>
+
+// this implementation is based on CUDACachingAllocator.
+// It utilizes Metal Heaps to improve the performance with buffer allocation.
+// Do not include this header. Use MPSAllocatorInterface.h instead.
+// TODO: Unify the logic with CUDACachingAllocator and remove redundant code.
+namespace at::mps::HeapAllocator {
+
+static const size_t kMaxSmallAlloc = MB(1);    // largest "small" allocation is 1 MiB
+static const size_t kMinLargeAlloc = MB(10);   // allocations between 1 and 10 MiB may use kLargeHeap
+static const size_t kRoundLarge    = MB(2);    // round up large allocations to 2 MiB
+static const size_t kSmallHeap     = MB(8);    // "small" allocations are packed in 8 MiB heaps
+static const size_t kLargeHeap     = MB(32);   // "large" allocations may be packed in 32 MiB heaps
+static const size_t kXLargeHeapD   = MB(128);  // "extra large" allocations on Discrete devices may be packed in 128 MiB heaps
+static const size_t kXLargeHeapU   = MB(1024); // "extra large" allocations on Unified devices may be packed in 1 GiB heaps
+static const size_t kMaxScalarAlloc = (sizeof(int64_t)); // largest "scalar" allocation
+
+// buffer pools could be customized with a combination of usage flags
+enum UsageFlags : uint32_t {
+  PRIVATE = 0,
+  SMALL   = (1 << 0), // small heaps have sizes of kSmallHeap, and large ones kLargeHeap
+  SHARED  = (1 << 1), // shared pools allocated on devices with unified memory; otherwise, private between host/device
+  MANAGED = (1 << 2), // managed storage mode
+  HAZARD  = (1 << 3), // enables Automatic Hazard Tracking for the resources allocated on the pool
+  SCALAR  = (1 << 4), // used to import CPU scalar values to GPU and use them in MPS Stream
+};
+// debug verbosity flags
+enum DebugVerbosity : uint32_t {
+  SILENT      = 0,
+  PROFILING   = (1 << 0), // print generic profiling data for total system memory usage
+  ALLOCATIONS = (1 << 1), // print buffer allocations
+  RECYCLES    = (1 << 2), // print buffer recycling
+  RELEASES    = (1 << 3), // print buffer releases
+  LARGE_ONLY  = (1 << 4), // only log large buffer pool transactions
+};
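Since pools and logging are parameterized by bit-or'ed combinations of these flags, here is a small conceptual fragment showing how they compose. The particular combinations are hypothetical, not the allocator's actual defaults, and the header itself is Objective-C++ that is normally only consumed inside the MPS allocator's own .mm files, so this is not meant to compile standalone.

    using namespace at::mps::HeapAllocator;

    // A shared, hazard-tracked pool for small allocations on a unified-memory GPU.
    const uint32_t shared_small_usage =
        UsageFlags::SHARED | UsageFlags::SMALL | UsageFlags::HAZARD;

    // Log buffer allocations and releases, but only for the large-buffer pools.
    const uint32_t verbosity =
        DebugVerbosity::ALLOCATIONS | DebugVerbosity::RELEASES | DebugVerbosity::LARGE_ONLY;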
+
+struct HeapBlock;
+
+struct BufferBlock {
+  id<MTLBuffer> buffer;
+  void* cpu_ptr = nullptr; // stores the pointer to CPU mapping of a Shared MTLBuffer
+  size_t size; // size after alignment
+  size_t requested_size; // requested size (before alignment)
+  // buffer shape is used for retrieving base of views in cached graphs
+  std::vector<int64_t> shape;
+  bool in_use = false;
+  HeapBlock* heap;
+  id_t buf_id;
+  // counter to candidate least recently used buffers for garbage collection
+  uint32_t gc_count = 0;
+  uint32_t use_count = 0;
+  // counter to assign unique ids to buffer blocks
+  static uint64_t buffer_counter;
+  // Metal events used to sync GPU/CPU operations on the shared-storage buffers
+  MPSEventPtr event;
+
+  BufferBlock(size_t Size, size_t RequestedSize = 0, const id<MTLBuffer> Buffer = nullptr,
+              HeapBlock* Heap = nullptr) :
+              buffer(Buffer), size(Size), requested_size(RequestedSize),
+              heap(Heap), buf_id(Buffer ? ++buffer_counter : 0) { }
+
+  static bool Comparator(const BufferBlock* a, const BufferBlock* b) {
+    return (a->size != b->size) ? a->size < b->size : (uintptr_t)a->buffer < (uintptr_t)b->buffer;
+  }
+  static size_t alignUp(size_t Size, size_t Alignment) {
+    assert(((Alignment - 1) & Alignment) == 0);
+    return ((Size + Alignment - 1) & ~(Alignment - 1));
+  }
+  uint32_t retainCount() const { return [buffer retainCount]; }
+};
+typedef bool (*BufferComparison)(const BufferBlock*, const BufferBlock*);
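BufferBlock::alignUp rounds a size up to a power-of-two alignment with the usual mask trick. A short, standalone worked example (the helper is re-stated here with the same arithmetic purely for illustration):

    #include <cassert>
    #include <cstddef>

    // Same arithmetic as BufferBlock::alignUp: Alignment must be a power of two.
    static size_t alignUp(size_t Size, size_t Alignment) {
      assert(((Alignment - 1) & Alignment) == 0);
      return (Size + Alignment - 1) & ~(Alignment - 1);
    }

    int main() {
      assert(alignUp(1, 256)   == 256);  // anything in (0, 256] rounds up to 256
      assert(alignUp(256, 256) == 256);  // already-aligned sizes are unchanged
      assert(alignUp(257, 256) == 512);  // otherwise, next multiple of the alignment
      return 0;
    }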
+
+struct BufferPool;
+struct AllocParams {
+  AllocParams(size_t Alloc_Size, size_t Requested_Size, BufferPool* Pool) :
+              search_key(Alloc_Size), pool(Pool), requested_size(Requested_Size) { }
+  size_t size() const { return search_key.size; }
+
+  BufferBlock search_key;
+  BufferPool* pool;
+  BufferBlock* buffer_block = nullptr;
+  size_t requested_size;
+  // true if we exceed the low watermark limit. In this case
+  // we apply strategies to relieve the pressure before allocation.
+  bool has_memory_pressure = false;
+  // true if we're allocating on a unified memory device
+  bool has_unified_memory = true;
+};
+
+struct HeapBlock {
+  id<MTLHeap> heap;
+  struct { size_t total, available; } size;
+  BufferPool* pool;
+  unsigned int n_buffers = 0;
+  id_t heap_id;
+  // indicates if we split this heap to sub-allocate 'several' buffers (otherwise single buffer)
+  bool is_split;
+  // counter to assign unique ids to heap blocks
+  static uint64_t heap_counter;
+
+  HeapBlock(size_t Size, const id<MTLHeap> Heap = nullptr, BufferPool *Pool = nullptr) :
+            heap(Heap), size({.total = Size, .available = Size}), pool(Pool),
+            heap_id(Heap ? ++heap_counter : 0), is_split(true) { }
+
+  static MTLResourceOptions getOptions(uint32_t usage) {
+    // TODO: check the caching performance of write-combined mode
+    MTLResourceOptions options = MTLResourceCPUCacheModeDefaultCache;
+
+    if (usage & UsageFlags::MANAGED)
+      options |= MTLResourceStorageModeManaged;
+    else if (usage & UsageFlags::SHARED)
+      options |= MTLResourceStorageModeShared;
+    else
+      options |= MTLResourceStorageModePrivate;
+
+    options |= (usage & UsageFlags::HAZARD) ? MTLResourceHazardTrackingModeTracked : MTLResourceHazardTrackingModeUntracked;
+
+    return options;
+  }
+
+  static HeapBlock* createHeapBlock(AllocParams& params, id<MTLDevice> device, uint32_t usage) {
+    HeapBlock *heapBlock = nullptr;
+    bool is_split = true;
+    const size_t size = params.size();
+    MTLHeapDescriptor *d = [MTLHeapDescriptor new];
+    if (d) {
+      const size_t kXLargeHeap = params.has_unified_memory ? kXLargeHeapU : kXLargeHeapD;
+      if (size <= kMaxSmallAlloc) {
+        d.size = kSmallHeap;
+      } else if (size < kMinLargeAlloc) {
+        d.size = kLargeHeap;
+      } else if (size < kXLargeHeap / 2 && !params.has_memory_pressure) {
+        d.size = kXLargeHeap;
+      } else {
+        d.size = kRoundLarge * ((size + kRoundLarge - 1) / kRoundLarge);
+        is_split = false;
+      }
+      d.storageMode = (usage & UsageFlags::SHARED) ? MTLStorageModeShared : MTLStorageModePrivate;
+      d.cpuCacheMode = MTLCPUCacheModeDefaultCache;
+      // this automatically handles Metal buffer access synchronizations at the
+      // cost of slightly lower performance.
+      d.hazardTrackingMode = (usage & UsageFlags::HAZARD) ? MTLHazardTrackingModeTracked : MTLHazardTrackingModeUntracked;
+      d.resourceOptions = getOptions(usage);
+      d.type = MTLHeapTypeAutomatic;
+      id<MTLHeap> heap = [device newHeapWithDescriptor: d];
+      if (heap) {
+        [heap setPurgeableState:MTLPurgeableStateNonVolatile];
+        const size_t heap_size = heapAvailableSize(heap);
+        heapBlock = new HeapBlock(heap_size, heap, params.pool);
+        if (heapBlock) {
+          heapBlock->is_split = is_split;
+        }
+      }
+      [d release];
+    }
+    return heapBlock;
+  }
+  static bool Comparator(const HeapBlock* a, const HeapBlock* b) {
+    return (a->size.available != b->size.available) ? a->size.available < b->size.available :
+                                                      (uintptr_t)a->heap < (uintptr_t)b->heap;
+  }
+  static NSUInteger heapAvailableSize(id<MTLHeap> heap, size_t Alignment = vm_page_size) {
+    return [heap maxAvailableSizeWithAlignment:Alignment];
+  }
+  NSUInteger Size() {
+    return [heap size];
+  }
+  id<MTLBuffer> newMTLBuffer(size_t length, uint32_t usage) {
+    id<MTLBuffer> buf = [heap newBufferWithLength:length options:getOptions(usage)];
+    if (buf) {
+      updateAvailableSize();
+      n_buffers++;
+    }
+    return buf;
+  }
+  // returns the retainCount before releasing the buffer
+  uint32_t releaseMTLBuffer(id<MTLBuffer>& buffer) {
+    const uint32_t retainCount = [buffer retainCount];
+    [buffer release];
+    buffer = nil;
+    updateAvailableSize();
+    n_buffers--;
+    return retainCount;
+  }
+  // returns the retainCount before releasing the heap
+  uint32_t releaseMTLHeap() {
+    const uint32_t retainCount = [heap retainCount];
+    TORCH_INTERNAL_ASSERT(!n_buffers); // assert if heap isn't empty
+    [heap setPurgeableState:MTLPurgeableStateEmpty];
+    [heap release];
+    heap = nil;
+    size.available = 0;
+    return retainCount;
+  }
+  uint32_t retainCount() const { return [heap retainCount]; }
+  void updateAvailableSize() { size.available = heapAvailableSize(heap); }
+};
+typedef bool (*HeapComparison)(const HeapBlock*, const HeapBlock*);
+
+struct BufferPool {
+  enum class Kind {
+    PRIVATE_SMALL,
+    PRIVATE_LARGE,
+    SHARED_SMALL,
+    SHARED_LARGE,
+    SCALAR,
+  };
+
+  BufferPool(const id<MTLDevice> Device, uint32_t Usage) :
+             device(Device), usage(Usage),
+             heaps(HeapBlock::Comparator), available_buffers(BufferBlock::Comparator) { }
+
+  const id<MTLDevice> device;
+  // usage flags to customize the pool for various purposes (see UsageFlags enum)
+  const uint32_t usage;
+  // total number of buffers in the pool
+  uint32_t n_buffers = 0;
+  // total allocations size on this pool
+  size_t allocated_size = 0;
+  // total memory available in the pool
+  size_t available_size = 0;
+  // list of heaps ordered by their "available" (not total) memory size
+  std::set<HeapBlock*, HeapComparison> heaps;
+  // list of only "available" buffers in the pool (i.e., buffers not in-use)
+  std::set<BufferBlock*, BufferComparison> available_buffers;
+  // list of buffers that are in a state of "limbo" where they've already been freed
+  // from PyTorch-side, but were not returned to pool due to still being
+  // in-use by command buffers with retainCount > 1. In this state, the buffer is
+  // neither ready to be recycled, nor could be returned to pool as available.
+  // These buffers will be returned to pool once the command buffer's
+  // completionHandler callbacks are called.
+  std::unordered_set<BufferBlock*> buffers_pending_free;
+  // list of heaps pending size update
+  std::unordered_set<HeapBlock*> heaps_pending_update;
+};
+
+class MPSHeapAllocatorImpl {
+public:
+  explicit MPSHeapAllocatorImpl() :
+    m_device(at::mps::MPSDevice::getInstance()->device()),
+    m_max_buffer_size([m_device maxBufferLength]),
+    m_stream(getDefaultMPSStream()),
+    m_event_pool(getMPSEventPool()) {
+    init_allocator();
+  }
+  ~MPSHeapAllocatorImpl() {
+    emptyCache();
+  }
+  // interface exposed to at::Allocator
+  id<MTLBuffer> malloc(size_t size, uint32_t usage);
+  // frees a buffer and returns it into buffer pool
+  void free(void* ptr);
+  // releases all the cached buffers and their associated heaps
+  void emptyCache();
+  // free inactive buffers that are pending to be freed
+  void freeInactiveBuffers();
+  // returns true if buffer was allocated from the shared pool
+  bool isSharedBuffer(const void* ptr);
+  // get the requested unaligned size of an MTLBuffer
+  ssize_t getUnalignedBufferSize(const void* ptr);
+  // set the shape of a base tensor from a view tensor
+  void setBufferShape(const void* ptr, const IntArrayRef& shape);
+  // retrieve the shape of a base tensor from a view tensor
+  IntArrayRef getBufferShape(const void* ptr);
+  // get the unique ID of the buffer
+  id_t getBufferId(const void* ptr);
+  // allocate a buffer from a specialized pool to import CPU scalars into GPU
+  id<MTLBuffer> allocScalarBufferWithValue(void* value, size_t size);
+  // returns a CPU-mapping of the input buffer and its retainCount,
+  // if only it has Shared storage-mode and allocated on MPSAllocator
+  std::pair<const void*, uint32_t> getSharedBufferPtr(const void* buffer);
+  // records events for a list of MTLBuffers (list is used to lock the mutex once)
+  // returns true if it records any event (i.e., if the passed buffers exist and are shared-storage)
+  bool recordEvents(c10::ArrayRef<const void*> buffers);
+  // waits for the event to signal the completion of GPU execution
+  // on the passed shared buffers (list is used to lock the mutex once)
+  // returns true if actually waited on any event
+  bool waitForEvents(c10::ArrayRef<const void*> buffers);
+  // this indicates how far (in Megabytes) the current total allocations are from the
+  // low watermark limit which is used to detect if we're under memory pressure
+  // This returns zero if we've reached the low watermark limit
+  ssize_t getLowWatermarkValue();
+  // (see m_low_watermark_ratio for description)
+  void setLowWatermarkRatio(double ratio);
+  // (see m_high_watermark_ratio for description)
+  void setHighWatermarkRatio(double ratio);
+  // (see m_low_watermark_limit for description)
+  size_t getLowWatermarkLimit() const { return m_low_watermark_limit; }
+  // (see m_max_total_allowed_size for description)
+  size_t getHighWatermarkLimit() const { return m_max_total_allowed_size; }
+  // (see m_total_allocated_memory for description)
+  size_t getTotalAllocatedMemory() const { return m_total_allocated_memory; }
+  // (see m_current_allocated_memory for description)
+  size_t getCurrentAllocatedMemory() const { return m_current_allocated_memory; }
+  // total GPU memory allocated in the process by Metal driver; including
+  // implicit allocations from MPS/MPSGraph frameworks and MPSHeapAllocatorImpl.
+  size_t getDriverAllocatedMemory() const { return current_allocated_size(); }
+  // (see enum DebugVerbosity for description)
+  uint32_t getDebugVerbosity() const { return m_debug_verbosity; }
+  // returns the device that we allocate from
+  inline id<MTLDevice> Device() const { return m_device; }
+
+  // TODO: make a common function to do size unit conversions in PyTorch.
+  inline std::string format_size(uint64_t size) const;
+
+private:
+  // (see m_high_watermark_ratio for description)
+  constexpr static double default_high_watermark_ratio = 1.7;
+  // we set the allowed upper bound to twice the size of recommendedMaxWorkingSetSize.
+  constexpr static double default_high_watermark_upper_bound = 2.0;
+  // (see m_low_watermark_ratio for description)
+  // on unified memory, we could allocate beyond the recommendedMaxWorkingSetSize
+  constexpr static double default_low_watermark_ratio_unified  = 1.4;
+  constexpr static double default_low_watermark_ratio_discrete = 1.0;
+
+  const id<MTLDevice> m_device;
+  std::recursive_mutex m_mutex;
+  // allocated buffers by device pointer
+  ska::flat_hash_map<void*, BufferBlock*> m_allocated_buffers;
+  // using a container for pools to simplify iterating them
+  ska::flat_hash_map<BufferPool::Kind, std::unique_ptr<BufferPool>> m_pools;
+  // total memory allocated by HeapAllocator (including blocks in pools)
+  size_t m_total_allocated_memory = 0;
+  // currently active memory allocations in use (i.e., blocks not in pools)
+  size_t m_current_allocated_memory = 0;
+  // max buffer size allowed by Metal
+  size_t m_max_buffer_size = 0;
+  // maximum total size allowed to be allocated
+  size_t m_max_total_allowed_size = 0;
+  // high watermark ratio is a hard limit for the total allowed allocations
+  // 0. : disables high watermark limit (may cause system failure if system-wide OOM occurs)
+  // 1. : recommended maximum allocation size (i.e., device.recommendedMaxWorkingSetSize)
+  // >1.: allows limits beyond the device.recommendedMaxWorkingSetSize
+  // e.g., value 0.95 means we allocate up to 95% of recommended maximum
+  // allocation size; beyond that, the allocations would fail with OOM error.
+  double m_high_watermark_ratio;
+  // low watermark ratio is a soft limit to attempt limiting memory allocations up to the lower watermark
+  // level by garbage collection or committing command buffers more frequently (a.k.a, adaptive commit).
+  // Value between 0 to m_high_watermark_ratio (setting 0.0 disables adaptive commit and garbage collection)
+  // e.g., value 0.9 means we 'attempt' to limit allocations up to 90% of recommended maximum
+  // allocation size.
+  double m_low_watermark_ratio;
+  // low watermark size limit (in Bytes) at the time we initialize the allocator
+  size_t m_low_watermark_limit;
+  // use "PYTORCH_DEBUG_MPS_ALLOCATOR" env-var to set debug verbosity
+  uint32_t m_debug_verbosity;
+  // default MPS stream
+  MPSStream* m_stream;
+  // we hold a reference to MPSEventPool so it could get destroyed after MPSAllocator
+  std::shared_ptr<MPSEventPool> m_event_pool;
+
+  void init_allocator();
+  void init_buffer_pools();
+  HeapBlock* get_free_heap(AllocParams& params);
+  bool get_free_buffer(AllocParams& params);
+  BufferBlock* get_allocated_buffer_block(const void* ptr);
+  BufferBlock* alloc_buffer_block(size_t size, uint32_t usage);
+  bool alloc_buffer(AllocParams& params);
+  void free_buffer(BufferBlock* buffer_block);
+  // returns true if the container heap is also released
+  bool release_buffer(BufferBlock* buffer_block, bool remove_empty_heap = true);
+  void release_buffers(BufferPool& pool);
+  bool release_available_cached_buffers(AllocParams& params);
+  bool release_cached_buffers();
+  // free unused cached blocks to reclaim GPU memory if memory pressure is high
+  void garbage_collect_cached_buffers(AllocParams& params);
+  // returns the suitable buffer pool type for the usage or
+  // requested/allocated sizes
+  BufferPool& get_pool(size_t requested_size, size_t aligned_size, uint32_t usage);
+  // returns the aligned allocation size that is optimized
+  // for the buffers to get reused frequently
+  size_t get_allocation_size(size_t size, uint32_t usage) const;
+  // maximum size of device memory available for allocation in current process
+  // Note: the recommendedMaxWorkingSetSize is typically 75% of the total system memory.
+  size_t max_device_size() const { return [m_device recommendedMaxWorkingSetSize]; }
+  // there are implicit allocations from MPS backend, so we need to query the 'device' for
+  // total allocated size instead of manually tracking in MPSAllocator
+  size_t current_allocated_size() const { return [m_device currentAllocatedSize]; }
+
+  bool trigger_memory_callbacks(BufferBlock* buffer_block, IMpsAllocatorCallback::EventType event) const {
+    for (const auto& name : MPSAllocatorCallbacksRegistry()->Keys()) {
+      MPSAllocatorCallbacksRegistry()->Create(name)->executeMPSAllocatorCallback(buffer_block ? buffer_block->buffer : nullptr, event);
+    }
+    return true;
+  }
+};
+
+} // namespace at::mps::HeapAllocator
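For orientation, a minimal sketch (not the upstream implementation; the helper name is hypothetical and the high-watermark upper-bound clamping is ignored) of how the watermark ratios above could translate into the byte limits held in m_max_total_allowed_size and m_low_watermark_limit, based on the device's recommendedMaxWorkingSetSize:

    #include <limits>

    // A ratio of 0 disables the corresponding limit, mirroring the comments above.
    static size_t limit_from_ratio(double ratio, size_t recommended_working_set) {
      if (ratio == 0.0) {
        return std::numeric_limits<size_t>::max();  // limit disabled
      }
      return static_cast<size_t>(ratio * static_cast<double>(recommended_working_set));
    }
    // e.g. m_max_total_allowed_size ~ limit_from_ratio(m_high_watermark_ratio, max_device_size());
    //      m_low_watermark_limit    ~ limit_from_ratio(m_low_watermark_ratio,  max_device_size());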
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/mps/MPSAllocatorInterface.h b/MLPY/Lib/site-packages/torch/include/ATen/mps/MPSAllocatorInterface.h
new file mode 100644
index 0000000000000000000000000000000000000000..88a977fe48bf4db6e082e7cb9d60fe42c0616531
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/mps/MPSAllocatorInterface.h
@@ -0,0 +1,61 @@
+//  Copyright © 2023 Apple Inc.
+
+#pragma once
+
+#include 
+#include 
+#include 
+
+#define MB(x) (x * 1048576UL)
+
+namespace at::mps {
+
+// this is a public interface to access MPSAllocator.
+// Do not declare methods that would depend on MPS or Metal frameworks.
+class IMPSAllocator : public c10::Allocator {
+public:
+  // see the comments in MPSAllocator.h for the description of these methods.
+  virtual void emptyCache() const = 0;
+  virtual void freeInactiveBuffers() const = 0;
+  virtual ssize_t getUnalignedBufferSize(const void* ptr) const = 0;
+  virtual IntArrayRef getBufferShape(const void* ptr) const = 0;
+  virtual id_t getBufferId(const void* ptr) const = 0;
+  virtual void setBufferShape(const void* ptr, const IntArrayRef& shape) const = 0;
+  virtual bool isSharedBuffer(const void* ptr) const = 0;
+  virtual bool isSharedStorageSupported() const = 0;
+  virtual c10::DataPtr allocScalarBufferWithValue(void* value, size_t size) const = 0;
+  virtual std::string formatSize(size_t size) const = 0;
+  virtual void setLowWatermarkRatio(double ratio) const = 0;
+  virtual void setHighWatermarkRatio(double ratio) const = 0;
+  virtual ssize_t getLowWatermarkValue() const = 0;
+  virtual size_t getLowWatermarkLimit() const = 0;
+  virtual size_t getHighWatermarkLimit() const = 0;
+  virtual size_t getTotalAllocatedMemory() const = 0;
+  virtual size_t getCurrentAllocatedMemory() const = 0;
+  virtual size_t getDriverAllocatedMemory() const = 0;
+  virtual std::pair<const void*, uint32_t> getSharedBufferPtr(const void* ptr) const = 0;
+  virtual bool recordEvents(c10::ArrayRef<const void*> buffers) const = 0;
+  virtual bool waitForEvents(c10::ArrayRef<const void*> buffers) const = 0;
+};
+
+class IMpsAllocatorCallback {
+ public:
+  enum class EventType {
+    ALLOCATED, // buffer got allocated to be used immediately
+    RECYCLED,  // buffer pulled from free list to be reused
+    FREED,     // buffer put to free list for future recycling
+    RELEASED,  // buffer memory released
+    ALLOCATION_FAILED // buffer allocation failed
+  };
+  virtual ~IMpsAllocatorCallback() = default;
+  virtual void executeMPSAllocatorCallback(void* ptr, EventType event) = 0;
+};
+
+// MPS allocator will execute every registered callback when a block of memory is freed.
+C10_DECLARE_REGISTRY(MPSAllocatorCallbacksRegistry, IMpsAllocatorCallback);
+#define REGISTER_MPS_ALLOCATOR_CALLBACK(name, ...) \
+  C10_REGISTER_CLASS(MPSAllocatorCallbacksRegistry, name, __VA_ARGS__);
+
+IMPSAllocator* getIMPSAllocator(bool sharedAllocator = false);
+
+} // namespace at::mps
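As a usage sketch of the registry declared above (hypothetical: the callback class and the registry key "release_counter" are invented for illustration), a class registered via REGISTER_MPS_ALLOCATOR_CALLBACK receives every allocator event:

    #include <ATen/mps/MPSAllocatorInterface.h>

    // Hypothetical callback that counts how many buffers get released.
    struct ReleaseCounter : public at::mps::IMpsAllocatorCallback {
      void executeMPSAllocatorCallback(void* ptr, EventType event) override {
        if (event == EventType::RELEASED) {
          ++released_count;
        }
      }
      size_t released_count = 0;
    };
    REGISTER_MPS_ALLOCATOR_CALLBACK(release_counter, ReleaseCounter);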
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/mps/MPSDevice.h b/MLPY/Lib/site-packages/torch/include/ATen/mps/MPSDevice.h
new file mode 100644
index 0000000000000000000000000000000000000000..6d92ac5b7c41bd9905efdcdee77659a41b8a767b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/mps/MPSDevice.h
@@ -0,0 +1,85 @@
+//  Copyright © 2022 Apple Inc.
+
+#pragma once
+#include 
+#include 
+#include 
+
+
+#ifdef __OBJC__
+#include <Foundation/Foundation.h>
+#include <Metal/Metal.h>
+#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
+typedef id<MTLDevice> MTLDevice_t;
+typedef id<MTLLibrary> MTLLibrary_t;
+typedef id<MTLComputePipelineState> MTLComputePipelineState_t;
+typedef id<MTLLibrary> MTLLibrary_t;
+#else
+typedef void* MTLDevice;
+typedef void* MTLDevice_t;
+typedef void* MTLLibrary_t;
+typedef void* MTLComputePipelineState_t;
+typedef void* MTLLibrary_t;
+#endif
+
+using namespace std;
+
+namespace at::mps {
+
+// Helper enum to check if a MPSGraph op is supported in a given macOS version
+enum class MacOSVersion : uint32_t {
+  MACOS_VER_13_0_PLUS = 0,
+  MACOS_VER_13_1_PLUS,
+  MACOS_VER_13_2_PLUS,
+  MACOS_VER_13_3_PLUS,
+  MACOS_VER_14_0_PLUS,
+};
+
+//-----------------------------------------------------------------
+//  MPSDevice
+//
+// MPSDevice is a singleton class that returns the default device
+//-----------------------------------------------------------------
+
+class TORCH_API MPSDevice {
+ public:
+  /**
+   * MPSDevice should not be cloneable.
+   */
+  MPSDevice(MPSDevice& other) = delete;
+  /**
+   * MPSDevice should not be assignable.
+   */
+  void operator=(const MPSDevice&) = delete;
+  /**
+   * Gets single instance of the Device.
+   */
+  static MPSDevice* getInstance();
+  /**
+   * Returns the single device.
+   */
+  MTLDevice_t device() {
+    return _mtl_device;
+  }
+  /**
+   * Returns whether running on Ventura or newer
+   */
+  bool isMacOS13Plus(MacOSVersion version) const;
+
+  MTLComputePipelineState_t metalIndexingPSO(const std::string &kernel);
+  MTLLibrary_t getMetalIndexingLibrary();
+
+  ~MPSDevice();
+
+ private:
+  static MPSDevice* _device;
+  MTLDevice_t _mtl_device;
+  MTLLibrary_t _mtl_indexing_library;
+  MPSDevice();
+};
+
+TORCH_API bool is_available();
+TORCH_API bool is_macos_13_or_newer(MacOSVersion version = MacOSVersion::MACOS_VER_13_0_PLUS);
+TORCH_API at::Allocator* GetMPSAllocator(bool useSharedAllocator = false);
+
+} // namespace at::mps
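A brief usage sketch of the availability helpers declared above (the branch bodies are placeholders):

    if (at::mps::is_available() &&
        at::mps::is_macos_13_or_newer(at::mps::MacOSVersion::MACOS_VER_13_3_PLUS)) {
      // take the code path that relies on the newer MPS/MPSGraph feature
    } else {
      // fall back to an older implementation (or to the CPU)
    }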
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/mps/MPSEvent.h b/MLPY/Lib/site-packages/torch/include/ATen/mps/MPSEvent.h
new file mode 100644
index 0000000000000000000000000000000000000000..ab4ad68412a7c9f431da8032924b51c6c5c33660
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/mps/MPSEvent.h
@@ -0,0 +1,100 @@
+//  Copyright © 2023 Apple Inc.
+
+#pragma once
+
+#include 
+#include 
+#include 
+
+namespace at::mps {
+
+// NOTE: don't create instances of this class directly.
+// Use MPSEventPool to acquire instances of MPSEvent.
+class MPSEvent {
+public:
+  explicit MPSEvent(id_t ID, MPSStream* stream, bool enable_timing);
+  ~MPSEvent();
+
+  // records an event on the stream
+  void record(bool needsLock, bool syncEvent = false);
+  // makes all future work submitted to the stream wait for this event.
+  bool wait(bool needsLock, bool syncEvent = false);
+  // schedules a notifyListener callback for the event.
+  bool notify(bool needsLock, MTLSharedEventNotificationBlock block);
+  // checks if events are already signaled.
+  bool query() const;
+  // blocks the CPU thread until all the GPU work that was scheduled
+  // prior to recording this event has completed.
+  bool synchronize();
+  // resets this event with new parameters in case it gets reused from the event pool
+  void reset(MPSStream* stream, bool enable_timing);
+  // returns the unique ID of the event instance
+  id_t getID() const { return m_id; }
+  // returns the completion timestamp of the event
+  uint64_t getCompletionTime() const { return m_completion_time; }
+  // if already recorded, waits for cpu_sync_cv to be signaled
+  void waitForCpuSync();
+
+private:
+  id_t m_id;
+  // enables measuring the completion time of the notifyListener of this event
+  bool m_enable_timing;
+  uint64_t m_signalCounter = 0;
+  MPSStream* m_stream = nullptr;
+  MTLSharedEvent_t m_event = nullptr;
+  MTLSharedEventListener* m_listener = nullptr;
+  // used to sync the events created on this Stream with CPU
+  std::mutex m_cpu_sync_mutex{};
+  std::condition_variable m_cpu_sync_cv{};
+  // CondVar predicate to sync the events created on this Stream with CPU
+  bool m_cpu_sync_completed = false;
+  // used to compute elapsed time
+  uint64_t m_completion_time = 0;
+
+  void recordLocked(bool syncEvent);
+  bool waitLocked(bool syncEvent);
+  bool notifyLocked(MTLSharedEventNotificationBlock block);
+  void notifyCpuSync();
+  static uint64_t getTime() {
+    return clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW);
+  }
+};
+
+typedef std::unique_ptr<MPSEvent, std::function<void(MPSEvent*)>> MPSEventPtr;
+
+class MPSEventPool {
+public:
+  explicit MPSEventPool(MPSStream* default_stream);
+  ~MPSEventPool();
+
+  MPSEventPtr acquireEvent(bool enable_timing, MPSStream* stream);
+  void emptyCache();
+
+  // these are mainly used for MPSHooks and torch.mps.Event() bindings
+  id_t acquireEvent(bool enable_timing);
+  void releaseEvent(id_t event_id);
+  void recordEvent(id_t event_id, bool syncEvent);
+  void waitForEvent(id_t event_id, bool syncEvent);
+  void synchronizeEvent(id_t event_id);
+  bool queryEvent(id_t event_id);
+  // returns elapsed time between two recorded events in milliseconds
+  double elapsedTime(id_t start_event_id, id_t end_event_id);
+
+private:
+  MPSStream* m_default_stream = nullptr;
+  std::recursive_mutex m_mutex;
+  std::stack<std::unique_ptr<MPSEvent>> m_pool{};
+  // dictionary to associate event IDs with event objects
+  // used to retain in-use events out of the pool
+  // for torch.mps.Event() bindings.
+  std::unordered_map<id_t, std::unique_ptr<MPSEvent>> m_in_use_events{};
+  uint64_t m_event_counter = 0;
+  std::function<void(MPSEvent*)> m_default_deleter;
+
+  MPSEvent* getInUseEvent(id_t event_id, bool locked = true);
+};
+
+// shared_ptr is used to get MPSEventPool destroyed after dependent instances
+std::shared_ptr<MPSEventPool> getMPSEventPool();
+
+} // namespace at::mps
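A rough usage sketch of the event-pool API above; the calls follow the signatures in this header, but the flow is illustrative rather than upstream code:

    std::shared_ptr<at::mps::MPSEventPool> pool = at::mps::getMPSEventPool();
    at::mps::MPSEventPtr ev = pool->acquireEvent(/*enable_timing=*/false,
                                                 at::mps::getDefaultMPSStream());
    ev->record(/*needsLock=*/true);   // record the event on the stream
    ev->wait(/*needsLock=*/true);     // future stream work waits for the event
    ev->synchronize();                // or block the CPU until the recorded work completes
    // When `ev` goes out of scope, its custom deleter presumably hands the
    // event back to the pool (see m_default_deleter above).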
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/mps/MPSGeneratorImpl.h b/MLPY/Lib/site-packages/torch/include/ATen/mps/MPSGeneratorImpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..9ed6acd31cfa079a4bf9b5c1edc8824920f603d8
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/mps/MPSGeneratorImpl.h
@@ -0,0 +1,52 @@
+//  Copyright © 2022 Apple Inc.
+
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+namespace mps::detail {
+
+static const uint32_t PHILOX_STATE_N = 7;
+struct rng_data_pod {
+  std::array<uint32_t, PHILOX_STATE_N> state{1};
+  uint64_t seed = default_rng_seed_val;
+};
+
+TORCH_API const Generator& getDefaultMPSGenerator();
+TORCH_API Generator createMPSGenerator(uint64_t seed_val = default_rng_seed_val);
+
+} // namespace mps::detail
+
+struct TORCH_API MPSGeneratorImpl : public c10::GeneratorImpl {
+  // Constructors
+  MPSGeneratorImpl(uint64_t seed_in = default_rng_seed_val);
+  ~MPSGeneratorImpl() override = default;
+
+  // MPSGeneratorImpl methods
+  std::shared_ptr<MPSGeneratorImpl> clone() const;
+  void set_current_seed(uint64_t seed) override;
+  void set_offset(uint64_t offset) override;
+  uint64_t get_offset() const override;
+  uint64_t current_seed() const override;
+  uint64_t seed() override;
+  void set_state(const c10::TensorImpl& new_state) override;
+  c10::intrusive_ptr<c10::TensorImpl> get_state() const override;
+  void update_philox_counters();
+
+  void set_engine(at::Philox4_32 engine) { engine_ = engine; };
+  at::Philox4_32 engine() { return engine_; };
+  uint32_t* state_data() { return data_.state.data(); }
+  static DeviceType device_type() { return DeviceType::MPS; };
+
+private:
+  mps::detail::rng_data_pod data_;
+  at::Philox4_32 engine_;
+
+  MPSGeneratorImpl* clone_impl() const override;
+};
+
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/mps/MPSGuardImpl.h b/MLPY/Lib/site-packages/torch/include/ATen/mps/MPSGuardImpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..587ebdba6d240e68aa06e64eeccd48328e85c647
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/mps/MPSGuardImpl.h
@@ -0,0 +1,174 @@
+//  Copyright © 2022 Apple Inc.
+
+#pragma once
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#ifdef __OBJC__
+#include <Foundation/Foundation.h>
+#include <Metal/Metal.h>
+#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
+#endif
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+
+namespace at::mps {
+
+typedef MPSEvent* mpsEvent_t;
+
+// TODO: Move the MPSGuardImpl to inherit from NoOpDeviceGuardImpl
+// https://github.com/pytorch/pytorch/issues/77170
+struct TORCH_API MPSGuardImpl final : public c10::impl::DeviceGuardImplInterface {
+  static constexpr c10::DeviceType static_type = c10::DeviceType::MPS;
+
+  // constructor
+  MPSGuardImpl() {}
+  explicit MPSGuardImpl(c10::DeviceType t) {
+    TORCH_INTERNAL_ASSERT(t == c10::DeviceType::MPS);
+  }
+
+  // returns the type
+  c10::DeviceType type() const override {
+    return c10::DeviceType::MPS;
+  }
+
+  Device exchangeDevice(Device d) const override {
+    return Device(c10::DeviceType::MPS, 0);
+  }
+
+  Device getDevice() const override {
+    return Device(c10::DeviceType::MPS, 0);
+  }
+
+  c10::optional<Device> uncheckedGetDevice() const noexcept {
+    return Device(c10::DeviceType::MPS, 0);
+  }
+
+  void setDevice(Device d) const override {
+    TORCH_INTERNAL_ASSERT(d.is_mps());
+  }
+
+  void uncheckedSetDevice(Device d) const noexcept override {
+    // TODO: Currently setting only device 0
+  }
+
+  Stream getStream(Device d) const noexcept override {
+    return Stream(Stream::DEFAULT, Device(c10::DeviceType::MPS, 0));
+  }
+
+  Stream getDefaultStream(Device d) const override {
+    return Stream(Stream::DEFAULT, Device(c10::DeviceType::MPS, 0));
+  }
+
+  // NB: These do NOT set the current device
+  Stream exchangeStream(Stream s) const noexcept override {
+    return Stream(Stream::DEFAULT, Device(c10::DeviceType::MPS, 0));
+  }
+  DeviceIndex deviceCount() const noexcept override {
+    if (at::hasMPS()) {
+      //TODO: extend it for multi-device case
+      return 1;
+    } else {
+      return 0;
+    }
+  }
+
+  // Event-related functions
+  void createEvent(
+    mpsEvent_t* event,
+    const EventFlag flag) const;
+
+  void destroyEvent(
+    void* event,
+    const DeviceIndex device_index) const noexcept override;
+
+  void record(
+    void** event,
+    const Stream& stream,
+    const DeviceIndex device_index,
+    const EventFlag flag) const override;
+
+  void block(
+    void* event,
+    const Stream& stream) const override;
+
+  bool queryEvent(void* event) const override;
+
+};
+
+/// A variant of OptionalDeviceGuard that is specialized for MPS.
+struct OptionalMPSGuard {
+  explicit OptionalMPSGuard() : guard_() {}
+
+  explicit OptionalMPSGuard(c10::optional<Device> device_opt)
+      : guard_(device_opt) {}
+
+  /// Set the current MPS device to the passed device index, if it is not
+  /// nullopt
+  explicit OptionalMPSGuard(c10::optional<DeviceIndex> device_index_opt)
+      : guard_(device_index_opt) {}
+
+  // Copy is not allowed
+  OptionalMPSGuard(const OptionalMPSGuard&) = delete;
+  OptionalMPSGuard& operator=(const OptionalMPSGuard&) = delete;
+  OptionalMPSGuard(OptionalMPSGuard&& other) = delete;
+  OptionalMPSGuard& operator=(OptionalMPSGuard&& other) = delete;
+
+  /// Sets the MPS device to the given device, initializing the guard if it
+  /// is not already initialized.  Errors if the given device is not a MPS
+  /// device.
+  void set_device(Device device) {
+    guard_.set_device(device);
+  }
+
+  /// Sets the MPS device to the given device, initializing the guard if it is
+  /// not already initialized.  Errors if the given device is not a MPS device.
+  void reset_device(Device device) {
+    guard_.reset_device(device);
+  }
+
+  /// Sets the MPS device to the given device index, initializing the guard if
+  /// it is not already initialized.
+  void set_index(DeviceIndex device_index) {
+    guard_.set_index(device_index);
+  }
+
+  /// Returns the device that was set immediately prior to initialization of the
+  /// guard, or nullopt if the guard is uninitialized.
+  c10::optional<Device> original_device() const {
+    return guard_.original_device();
+  }
+
+  /// Returns the most recent device that was set using this device guard,
+  /// either from construction, or via set_device, if the guard is initialized,
+  /// or nullopt if the guard is uninitialized.
+  c10::optional<Device> current_device() const {
+    return guard_.current_device();
+  }
+
+  /// Restore the original MPS device, resetting this guard to uninitialized
+  /// state.
+  void reset() {
+    guard_.reset();
+  }
+
+ private:
+  c10::impl::InlineOptionalDeviceGuard<MPSGuardImpl> guard_;
+};
+
+
+C10_REGISTER_GUARD_IMPL(MPS, MPSGuardImpl);
+
+} // namespace at::mps
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/mps/MPSHooks.h b/MLPY/Lib/site-packages/torch/include/ATen/mps/MPSHooks.h
new file mode 100644
index 0000000000000000000000000000000000000000..b49d620527dc1bb63833644a74d7683ac635b98b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/mps/MPSHooks.h
@@ -0,0 +1,57 @@
+//  Copyright © 2022 Apple Inc.
+
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+namespace at::mps {
+
+// The real implementation of MPSHooksInterface
+struct MPSHooks : public at::MPSHooksInterface {
+  MPSHooks(at::MPSHooksArgs) {}
+  void initMPS() const override;
+
+  // MPSDevice interface
+  bool hasMPS() const override;
+  bool isOnMacOSorNewer(unsigned major, unsigned minor) const override;
+
+  // MPSGeneratorImpl interface
+  const Generator& getDefaultMPSGenerator() const override;
+
+  // MPSStream interface
+  void deviceSynchronize() const override;
+  void commitStream() const override;
+  void* getCommandBuffer() const override;
+  void* getDispatchQueue() const override;
+
+  // MPSAllocator interface
+  Allocator* getMPSDeviceAllocator() const override;
+  void emptyCache() const override;
+  size_t getCurrentAllocatedMemory() const override;
+  size_t getDriverAllocatedMemory() const override;
+  void setMemoryFraction(double ratio) const override;
+
+  // MPSProfiler interface
+  void profilerStartTrace(const std::string& mode, bool waitUntilCompleted) const override;
+  void profilerStopTrace() const override;
+
+  // MPSEvent interface
+  uint32_t acquireEvent(bool enable_timing) const override;
+  void releaseEvent(uint32_t event_id) const override;
+  void recordEvent(uint32_t event_id) const override;
+  void waitForEvent(uint32_t event_id) const override;
+  void synchronizeEvent(uint32_t event_id) const override;
+  bool queryEvent(uint32_t event_id) const override;
+  double elapsedTimeOfEvents(uint32_t start_event_id, uint32_t end_event_id) const override;
+
+  // Compatibility with Accelerator API
+  bool hasPrimaryContext(DeviceIndex device_index) const override {
+    // When MPS is available, it is always in use for the one device.
+    return true;
+  }
+};
+
+} // namespace at::mps
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/mps/MPSProfiler.h b/MLPY/Lib/site-packages/torch/include/ATen/mps/MPSProfiler.h
new file mode 100644
index 0000000000000000000000000000000000000000..0c6ce179943c2a8f790329eddf14e1e89b2a04dd
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/mps/MPSProfiler.h
@@ -0,0 +1,393 @@
+//  Copyright © 2022 Apple Inc.
+
+#pragma once
+
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace at::mps {
+
+namespace Profiler {
+
+struct BaseInfo {
+  // profiling info types
+  enum class Type {
+    GRAPH,
+    KERNEL,
+    COPY,
+    CPU_FALLBACK,
+  };
+
+  BaseInfo(Type infoType, uint64_t Id, const uintptr_t Handle) :
+      type(infoType), profileId(Id), handle(Handle) { }
+  virtual ~BaseInfo() = default;
+
+  // type of profiling info
+  Type type;
+  // unique profile ID for execution instances of operations or copies
+  uint64_t profileId;
+  // ID generated by os_signpost
+  // since it's possible to use event and interval-based signposts at the
+  // same time, we need separate IDs for each.
+  os_signpost_id_t eventSignpostId = 0, intervalSignpostId = 0;
+  // accumulated GPU time in ms (obtained from CompletionHandler's "GPUEndTime - GPUStartTime")
+  std::atomic<double> totalGpuTime{0.0};
+  // accumulated Scheduling time in ms (obtained from CompletionHandler's "KernelEndTime - KernelStartTime")
+  std::atomic<double> totalSchedulingTime{0.0};
+  // indicates if the operation or copy execution has completed
+  std::atomic_bool completed{false};
+  // handle used to identify the profile info's instance (usually the pointer)
+  const uintptr_t handle;
+
+  virtual const std::string toString(double gpuTime = 0, double schedulingTime = 0) const;
+  // builds a string for a tensor (format: Device:ScalarType[tensor.sizes()])
+  static std::string buildTensorString(const Tensor& tensor, bool includeBufferId = false) {
+    if (tensor.defined()) {
+      std::stringstream tensorStr;
+      auto deviceType = tensor.device().type();
+      tensorStr << c10::DeviceTypeName(deviceType);
+      // see comments for INCLUDE_BUFFER_ID
+      if (includeBufferId && deviceType == at::kMPS) {
+        id<MTLBuffer> buffer = __builtin_bit_cast(id<MTLBuffer>, tensor.storage().data());
+        tensorStr << "(buf#" << (getIMPSAllocator()->getBufferId(buffer))
+                  << ":" << buffer.retainCount << ")";
+      }
+      tensorStr << ":"
+                << tensor.scalar_type() << tensor.sizes();
+      return tensorStr.str();
+    } else {
+      return "undefined";
+    }
+  }
+  static uint64_t getTime() {
+    return clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW);
+  }
+};
+
+struct OperationInfo : BaseInfo {
+  OperationInfo(const void* Handle, bool IsGraph, uint64_t Id, const std::string& StrKey) :
+      BaseInfo(IsGraph ? Type::GRAPH : Type::KERNEL, Id, uintptr_t(Handle)), strKey(StrKey) { }
+
+  uint64_t runCount = 0;
+  std::string strKey;
+
+  const std::string toString(double gpuTime = 0, double schedulingTime = 0) const override;
+
+  // builds a string for a kernel
+  static std::string buildKernelString(const std::string& kernelName,
+                                       const TensorList& tensors,
+                                       bool includeBufferId = false) {
+    std::stringstream kernelStr;
+    kernelStr << kernelName;
+    for (const Tensor& tensor: tensors) {
+      kernelStr << ":" << BaseInfo::buildTensorString(tensor, includeBufferId);
+    }
+    return kernelStr.str();
+  }
+};
+
+struct CpuFbInfo : BaseInfo {
+  CpuFbInfo(uint64_t Id, const std::string& OpName) :
+      BaseInfo(Type::CPU_FALLBACK, Id, 0), opName(OpName) { }
+
+  uint64_t runCount = 0;
+  // the current and total overhead of copies in bytes required to convert the Op's
+  // input tensors from MPS to CPU and then output from CPU back to MPS
+  size_t currentCopyOverhead = 0;
+  size_t totalCopyOverhead = 0;
+  std::string opName;
+  std::string strKey;
+  uint64_t startTime = 0;
+
+  const std::string toString(double gpuTime = 0, double schedulingTime = 0) const override;
+
+  void updateCopyOverhead(const TensorList& tensors) {
+    currentCopyOverhead = 0;
+    for (const Tensor& tensor: tensors) {
+      if (tensor.defined()) {
+        currentCopyOverhead += tensor.nbytes();
+      }
+    }
+    totalCopyOverhead += currentCopyOverhead;
+  }
+};
+
+struct CopyInfo : BaseInfo {
+  enum class Kind {
+    MPS_TO_MPS,
+    MPS_TO_CPU,
+    CPU_TO_MPS,
+  };
+
+  CopyInfo(const void* Handle, size_t Length, uint64_t Id, bool IsNonBlocking, bool UsesBlitter) :
+           BaseInfo(Type::COPY, Id, uintptr_t(Handle)), kind(Kind::MPS_TO_MPS),
+           length(Length), isNonBlocking(IsNonBlocking), usesBlitter(UsesBlitter) { }
+
+  Kind kind;
+  size_t length;
+  bool isNonBlocking;
+  bool usesBlitter;
+  std::string srcStrKey;
+  std::string dstStrKey;
+  // for copies that don't use blitters, we measure CPU time
+  uint64_t startTime = 0;
+
+  const std::string toString(double gpuTime = 0, double schedulingTime = 0) const override;
+
+  static std::string buildTensorString(const void* buffer, const OptionalTensorRef tensor, bool includeBufferId = false);
+
+  static bool isStorageOnMPS(const void* buffer, const OptionalTensorRef tensor) {
+    if (tensor.has_value()) {
+      return tensor->device().type() == at::kMPS;
+    }
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(buffer);
+    // getUnalignedBufferSize() returns -1 if input buffer is not on MPS device
+    return getIMPSAllocator()->getUnalignedBufferSize(buffer) >= 0;
+  }
+
+  static Kind getCopyKind(const void* srcBuffer, const void* dstBuffer,
+                          const OptionalTensorRef srcTensor, const OptionalTensorRef dstTensor) {
+    const bool isSrcOnMPS = isStorageOnMPS(srcBuffer, srcTensor);
+    const bool isDstOnMPS = isStorageOnMPS(dstBuffer, dstTensor);
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(isSrcOnMPS || isDstOnMPS);
+    if (isSrcOnMPS && !isDstOnMPS) {
+      return Kind::MPS_TO_CPU;
+    } else if (!isSrcOnMPS && isDstOnMPS) {
+      return Kind::CPU_TO_MPS;
+    }
+    return Kind::MPS_TO_MPS;
+  }
+};
+
+struct CopyStat : CopyInfo {
+  explicit CopyStat(std::string CopyKindStr) :
+          CopyInfo(nullptr, 0, 0, false, false), kindStr(std::move(CopyKindStr)) {}
+  // total number of copies
+  size_t totalCount = 0;
+  // number of Scalar copies (i.e., less than sizeof(int64))
+  size_t scalarsCount = 0;
+  // number of blocking copies (i.e., require syncing to GPU)
+  size_t blockingCount = 0;
+  // number of copies that used memcpy(), instead of Metal Blit Encoder
+  size_t memcpyCount = 0;
+  // accumulated GPU time in ms for the scalar copies
+  std::atomic<double> scalarsGpuTime{0.0};
+  // copy kind in string type
+  std::string kindStr;
+};
+
+class MPSProfiler {
+public:
+  // lower 16 bits used for profiler options
+  enum ProfileOptions : uint32_t {
+    OPTIONS_NONE = 0,
+    // ALL_* means, all signpost types (RUN_OPERATION|BLIT_COPY|CPU_FALLBACK, etc.)
+    // (used for convenience to not compute bit flags by OR-ing manually)
+    // trace all signpost types using events
+    ALL_SIGNPOST_EVENTS    = (1 << 0),
+    // trace all signpost types using intervals
+    ALL_SIGNPOST_INTERVALS = (1 << 1),
+    // always wait for command buffer to finish executing after each commit
+    WAIT_UNTIL_COMPLETED   = (1 << 2),
+    // for interval-based signposts, include the scheduling portion of
+    // Graph/Kernel/Copy executions as well.
+    // if this flag is disabled, only the "GPU run time" is included in the
+    // interval, and not the scheduling time.
+    INCLUDE_SCHEDULE_INTERVAL = (1 << 3),
+
+    // use these if you need to trace signposts types individually (rarely required)
+    // trace signpost using intervals
+    USE_INTERVALS = (1 << 4),
+    // trace signpost by emitting events
+    USE_EVENTS    = (1 << 5),
+    // used for sanity check (Change this when new option added)
+    OPTIONS_COUNT = (USE_EVENTS << 1) - 1,
+  };
+
+  // when adding new types, #define the type string in MPSProfiler.mm as well.
+  // upper 16 bits used for event types
+  enum SignpostTypes : uint32_t {
+    SIGNPOST_NONE = 0,
+    // trace signposts for PyTorch operation executions
+    RUN_OPERATION = (1 << 16),
+    // trace signposts for blitter copies
+    BLIT_COPY     = (1 << 17),
+    // trace signposts for ops that fall back on CPU
+    CPU_FALLBACK  = (1 << 18),
+    // used for sanity check (Change this when new type added)
+    SIGNPOST_COUNT = (CPU_FALLBACK << 1) - 1,
+  };
+
+  enum LogOptions : uint32_t {
+    LOG_NONE = 0,
+
+    // Info logging options during execution
+    // -------------------------------------
+    // prints operation info (id/key/run_count) during execution
+    OPERATION_INFO      = (1 << 0),
+    // prints copy info (src/dst tensors/buffers, size, etc.) during execution
+    COPY_INFO           = (1 << 1),
+    // prints CPU Fallback info (id/runCount/opName/copyOverhead) during execution
+    CPU_FALLBACK_INFO   = (1 << 2),
+
+    // Profiling Statistics logging options when process terminates
+    // ------------------------------------------------------------
+    // prints all stats (OPERATION_STATS, COPY_STATS, CPU_FALLBACK_STATS) before process terminates
+    // this is convenient to not combine following stats bit flags manually
+    ALL_STATS           = (1 << 3),
+    // prints operation stats (GPU times, run count, etc.) before process terminates
+    OPERATION_STATS     = (1 << 4),
+    // prints copies stats (GPU times, copy kinds, sizes, etc.) before process terminates
+    COPY_STATS          = (1 << 5),
+    // prints CPU Fallback stats (CPU times, run times, size of MPS<->CPU copies
+    // for tensors, etc.) before process terminates
+    CPU_FALLBACK_STATS  = (1 << 6),
+
+    // Metadata format options when logging the info
+    // ---------------------------------------------
+    // if enabled, includes GPU run time in metadata (i.e., GPUEndTime-GPUStartTime
+    // from Metal Command Buffers) (e.g., [GPU=0.324 ms])
+    INCLUDE_GPU_TIME    = (1 << 7),
+    // if enabled, includes GPU scheduling time in metadata separately
+    // (i.e., KernelEndTime-KernelStartTime from Metal Command Buffers)
+    // e.g., [GPU=0.324 ms, KRNL=0.036 ms]
+    INCLUDE_KERNEL_TIME = (1 << 8),
+    // if enabled, includes the unique buffer ID in metadata for the storage
+    // of a tensor that was allocated on MPSAllocator. This is useful (along with
+    // the EV "PYTORCH_DEBUG_MPS_ALLOCATOR") to identify buffers that are involved
+    // with various operations.
+    INCLUDE_BUFFER_ID   = (1 << 9),
+
+    // used for sanity check (Change this when new option added)
+    LOG_COUNT = (INCLUDE_BUFFER_ID << 1) - 1,
+  };
+
+  explicit MPSProfiler();
+  ~MPSProfiler();
+
+  // the handle is either "MPSGraph*" or "id<MTLComputePipelineState>" for Metal Kernels
+  // the beginProfile*() functions return a profileId which is unique per graph/kernel/copy
+  uint64_t beginProfileKernel(const void* handle, const std::string& strKey, bool isGraph);
+  uint64_t beginProfileKernel(const void* handle, const std::string& kernelName, const TensorList& tensors);
+  uint64_t beginProfileCopy(const void* srcBuffer, const void* dstBuffer,
+                            const OptionalTensorRef srcTensor,
+                            const OptionalTensorRef dstTensor,
+                            size_t length, bool isNonBlocking, bool usesBlitter = true);
+  uint64_t beginProfileCPUFallback(const std::string& opName, const TensorList& tensors);
+  void beginProfileGPUInterval(const void* handle);
+
+  void endProfileCopy(uint64_t profileId, SyncType syncType);
+  void endProfileKernel(const void* handle, SyncType syncType = SyncType::NONE);
+  void endProfileCPUFallback(const std::string& opName);
+
+  // these are used to hook into Python bindings for torch.mps.profiler module.
+  // this enables generating OS Signpost traces from MPSProfiler on-demand
+  // during runtime (instead of environment variables).
+  // The "mode" could be either "interval", "event", or both "interval,event"
+  // for interval-based and/or event-based signpost tracing.
+  void StartTrace(const string& mode, bool waitUntilCompleted);
+  void StopTrace();
+
+  // convenience functions to indicate whether signpost tracing or
+  // logging are enabled for the SignpostTypes
+  bool isOperationProfilingEnabled() const {
+    return (m_signpost_types & SignpostTypes::RUN_OPERATION) ||
+           (m_log_options & (LogOptions::OPERATION_INFO | LogOptions::OPERATION_STATS));
+  }
+  bool isCopyProfilingEnabled() const {
+    return (m_signpost_types & SignpostTypes::BLIT_COPY) ||
+           (m_log_options & (LogOptions::COPY_INFO | LogOptions::COPY_STATS));
+  }
+  bool isCPUFallbackProfilingEnabled() const {
+    return (m_signpost_types & SignpostTypes::CPU_FALLBACK) ||
+           (m_log_options & (LogOptions::CPU_FALLBACK_INFO | LogOptions::CPU_FALLBACK_STATS));
+  }
+  bool isSignpostTracingEnabled() const {
+    return (m_signpost_types != SignpostTypes::SIGNPOST_NONE);
+  }
+
+ private:
+  // indicates what type of signpost types are enabled and traced by MPS profiler.
+  uint32_t m_signpost_types = 0;
+  uint32_t m_profile_options = 0;
+  uint32_t m_log_options = 0;
+  uint64_t m_kernel_counter = 0;
+  uint64_t m_graph_counter = 0;
+  uint64_t m_cpu_fb_counter = 0;
+  uint64_t m_copy_counter = 0;
+  // technically, it's possible to trace both events and intervals at the same time
+  // so we use separate os_log categories for them
+  os_log_t m_os_log_events;
+  os_log_t m_os_log_intervals;
+  // stats logging could run either from destructor or signal handler
+  // so this is used to check if logging has already started.
+  std::atomic_bool hasLoggedStats{false};
+  // indicates there are pending completionHandler callbacks that haven't been called yet.
+  std::atomic_bool hasPendingCompletionHandlers{false};
+  // used to capture sigint signal to log profiling stats
+  static struct sigaction currentSigint, previousSigint;
+
+  // We use the following lists for two reasons:
+  // 1- for interval-based signposts the "begin" point won't be in same function
+  // as the "end" point where we need to be able to retrieve signpost's info
+  // 2- if Operations info need to be logged when process ends using LogOptions::OPERATION_INFO.
+
+  // the pointer key for this map is either "MPSGraph*" or "id<MTLComputePipelineState>" for Metal Kernels
+  // this list is retained and could be logged along with aggregate profiling numbers when the process ends.
+  std::unordered_map<uintptr_t, std::unique_ptr<OperationInfo>> m_op_info_list{};
+  // the string key for this map is the op name that we fall back to execute on CPU
+  // this list is retained and could be logged along with aggregate profiling numbers when the process ends.
+  std::unordered_map<std::string, std::unique_ptr<CpuFbInfo>> m_cpu_fb_info_list{};
+  // this list contains the info for copies, and its key is the unique profileId
+  // which is generated from m_copy_counter
+  // The copyInfo list is not retained.
+  std::unordered_map<uint64_t, std::unique_ptr<CopyInfo>> m_copy_info_list{};
+  // a short list that contains copy stats
+  std::unordered_map<CopyInfo::Kind, std::unique_ptr<CopyStat>> m_copy_stat_list{};
+
+  void initialize();
+  void beginProfileExecution(BaseInfo& info, bool cpuExecution = false);
+  void endProfileExecution(BaseInfo& info, os_signpost_id_t event_signpost_id,
+                           os_signpost_id_t interval_signpost_id,
+                           double gpuTime, double schedulingTime);
+  void addProfilerScheduledHandler(BaseInfo& info);
+  void addProfilerCompletedHandler(BaseInfo& info, SyncType syncType);
+  void emitSignpostEvent(SignpostTypes signpost_type, os_signpost_id_t signpost_id,
+                         const std::string& msg) const;
+  void beginSignpostInterval(SignpostTypes signpost_type, os_signpost_id_t signpost_id,
+                             const std::string& msg) const;
+  void endSignpostInterval(SignpostTypes signpost_type, os_signpost_id_t signpost_id) const;
+
+  void updateCopyStats(const CopyInfo& copyInfo, double gpuTime, double schedulingTime);
+  // returns true if logging the profiling info "during the execution" is enabled
+  bool isProfileInfoLoggingEnabled(BaseInfo::Type infoType, bool isExecutionEnded);
+  // logs all the profiling stats that are enabled
+  void logProfilingStats();
+  // logs kernel profiling stats when the process ends.
+  void logOperationsProfilingStats(std::FILE* f) const;
+  // logs CPU Fallback profiling stats when the process ends.
+  void logCPUFallbackProfilingStats(std::FILE* f) const;
+  // logs copy profiling stats when the process ends.
+  void logCopyProfilingStats(std::FILE* f) const;
+
+  os_signpost_id_t generateSignpostId(os_signpost_type_t signpostType, const void* ptr = nullptr);
+  static SignpostTypes getSignpostType(BaseInfo::Type infoType);
+  static void handleIntSignal(int signal);
+};
+
+} // namespace Profiler
+
+Profiler::MPSProfiler& getMPSProfiler();
+
+} // namespace at::mps
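Since ProfileOptions and LogOptions above are bit flags packed into 32-bit values, a configuration is composed by OR-ing them; a small illustrative composition (not upstream code):

    using at::mps::Profiler::MPSProfiler;

    // Trace every signpost type with intervals and wait for each commit,
    // while logging copy statistics with GPU times when the process exits.
    uint32_t profile_options = MPSProfiler::ProfileOptions::ALL_SIGNPOST_INTERVALS |
                               MPSProfiler::ProfileOptions::WAIT_UNTIL_COMPLETED;
    uint32_t log_options     = MPSProfiler::LogOptions::COPY_STATS |
                               MPSProfiler::LogOptions::INCLUDE_GPU_TIME;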
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/mps/MPSStream.h b/MLPY/Lib/site-packages/torch/include/ATen/mps/MPSStream.h
new file mode 100644
index 0000000000000000000000000000000000000000..cc838de8e69bccebf8a385c7f9fc7eeb945302e7
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/mps/MPSStream.h
@@ -0,0 +1,133 @@
+//  Copyright © 2022 Apple Inc.
+
+#pragma once
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+
+#ifdef __OBJC__
+#include <Foundation/Foundation.h>
+#include <Metal/Metal.h>
+#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
+#include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>
+typedef id<MTLCommandQueue> MTLCommandQueue_t;
+typedef id<MTLCommandBuffer> MTLCommandBuffer_t;
+typedef id<MTLComputeCommandEncoder> MTLComputeCommandEncoder_t;
+typedef id<MTLSharedEvent> MTLSharedEvent_t;
+typedef id<MTLDevice> MTLDevice_t;
+#else
+typedef void* MTLCommandQueue_t;
+typedef void* MTLCommandQueue;
+typedef void* MTLCommandBuffer_t;
+typedef void* MTLCommandBuffer;
+typedef void* MTLComputeCommandEncoder_t;
+typedef void* MTLSharedEvent_t;
+typedef void* dispatch_queue_t;
+typedef void* MTLDevice_t;
+#define nil NULL;
+#endif
+
+
+namespace at::mps {
+
+//-----------------------------------------------------------------
+//  MPSStream
+//-----------------------------------------------------------------
+
+enum class SyncType {
+  NONE,               // no commit to command buffer
+  COMMIT,             // commit and flush the command buffer
+  COMMIT_AND_WAIT,    // flush and wait for command buffer execution to finish
+  COMMIT_AND_CONTINUE,// commit and continue with a new underlying command buffer
+  COMMIT_ADAPTIVE,    // commit adaptively based on available memory
+};
+
+class TORCH_API MPSStream
+{
+public:
+  enum Unchecked { UNCHECKED };
+
+  /// Construct a MPSStream from a Stream.  This construction is checked,
+  /// and will raise an error if the Stream is not, in fact, a MPS stream.
+  explicit MPSStream(Stream stream);
+
+  ~MPSStream();
+  MTLCommandQueue_t commandQueue() const { return _commandQueue; };
+  dispatch_queue_t queue() const { return _serialQueue; }
+
+  MPSCommandBuffer* commandBuffer();
+  MTLComputeCommandEncoder_t commandEncoder();
+  void endKernelCoalescing();
+  void synchronize(SyncType syncType);
+  void fill(id<MTLBuffer> buffer, uint8_t value, size_t length, size_t offset, SyncType syncType = SyncType::NONE);
+  void copy(id<MTLBuffer> srcBuffer, id<MTLBuffer> dstBuffer,
+            size_t length, size_t srcOffset, size_t dstOffset,
+            uint64_t profileId, SyncType syncType = SyncType::NONE);
+  void copy_and_sync(id<MTLBuffer> srcBuffer, id<MTLBuffer> dstBuffer,
+                     size_t length, size_t srcOffset, size_t dstOffset,
+                     bool non_blocking, uint64_t profileId);
+  void executeMPSGraph(MPSGraph* mpsGraph, NSDictionary* feeds, NSDictionary* results, SyncType syncType = SyncType::NONE);
+  void addCompletedHandler(MTLCommandBufferHandler block);
+
+  /// Get the MPS device index that this stream is associated with.
+  c10::DeviceIndex device_index() const { return _stream.device_index(); }
+
+  MTLCommandQueue_t stream() const { return _commandQueue; };
+
+  MTLDevice_t device() const { return [_commandQueue device];}
+
+  /// Explicit conversion to Stream.
+  Stream unwrap() const { return _stream; }
+
+private:
+  Stream _stream;
+  MTLCommandQueue_t _commandQueue = nil;
+  MPSCommandBuffer* _commandBuffer = nil;
+  MPSCommandBuffer* _prevCommandBuffer = nil;
+  MTLComputeCommandEncoder_t _commandEncoder = nil;
+  MPSGraphExecutionDescriptor *_executionDescriptor = nil;
+  MPSGraphCompilationDescriptor *_compilationDescriptor = nil;
+  dispatch_queue_t _serialQueue = nullptr;
+  // CommitAndContinue is enabled by default
+  bool _enableCommitAndContinue = true;
+
+  // use synchronize() to access any of these commit functions outside MPSStream
+  void commit();
+  void commitAndWait();
+  void commitAndContinue();
+  void flush();
+};
+
+/**
+ * Get the current MPS stream
+ */
+TORCH_API MPSStream* getCurrentMPSStream();
+
+/**
+ * Get the default MPS stream
+ */
+TORCH_API MPSStream* getDefaultMPSStream();
+
+//-----------------------------------------------------------------
+//  MPSStreamImpl
+//-----------------------------------------------------------------
+
+class TORCH_API MPSStreamImpl
+{
+ public:
+  /**
+   * Gets single instance of the MPSStream.
+   */
+  static MPSStream* getInstance();
+
+ private:
+  static MPSStream* _stream;
+  MPSStreamImpl();
+};
+
+} // namespace at::mps
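An illustrative sketch of driving a stream with the SyncType variants enumerated above (a usage sketch only, not upstream code):

    at::mps::MPSStream* stream = at::mps::getCurrentMPSStream();
    stream->synchronize(at::mps::SyncType::COMMIT);           // flush the current command buffer
    stream->synchronize(at::mps::SyncType::COMMIT_AND_WAIT);  // flush and block until the GPU finishes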
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/Activation.h b/MLPY/Lib/site-packages/torch/include/ATen/native/Activation.h
new file mode 100644
index 0000000000000000000000000000000000000000..d9eb8081fe06aa8daf4d3b4a3ae5a7ebbbf37ed9
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/Activation.h
@@ -0,0 +1,98 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+namespace c10 {
+class Scalar;
+}
+
+namespace at {
+struct TensorIterator;
+struct TensorIteratorBase;
+class TensorBase;
+}
+
+namespace at::native {
+
+// These constants control the approximation behavior of gelu function.
+enum class GeluType {
+  None,             // Baseline Gelu
+  Tanh,             // Tanh Gelu Approximation
+  END
+};
+
+static GeluType get_gelutype_enum(const c10::string_view approximate) {
+  if (approximate == "none") {
+    return GeluType::None;
+  } else if (approximate == "tanh") {
+    return GeluType::Tanh;
+  } else {
+    TORCH_CHECK(false, "approximate argument must be either none or tanh.");
+  }
+}
+
+static std::string gelutype_to_string(const GeluType type) {
+  switch(type) {
+    case GeluType::None: return "none";
+    case GeluType::Tanh: return "tanh";
+    default: TORCH_CHECK(false, "unknown GELU type: ", static_cast<int>(type));
+  }
+}
+
+using structured_activation_fn = void (*)(TensorIteratorBase&);
+using structured_activation_backward_fn = void (*)(TensorIteratorBase&);
+
+using activation_fn = void (*)(TensorIterator&);
+using activation_backward_fn = void (*)(TensorIterator&);
+using softplus_fn = void (*)(TensorIteratorBase&, const c10::Scalar&, const c10::Scalar&);
+using softplus_backward_fn = void (*)(TensorIteratorBase&, const c10::Scalar&, const c10::Scalar&);
+using threshold_fn = void (*)(TensorIteratorBase&, const c10::Scalar&, const c10::Scalar&);
+using hardtanh_backward_fn = void (*)(TensorIterator&, const c10::Scalar&, const c10::Scalar&);
+using hardsigmoid_fn = void(*)(TensorIteratorBase&);
+using hardsigmoid_backward_fn = void(*)(TensorIteratorBase&);
+using hardswish_fn = void(*)(TensorIterator&);
+using hardswish_backward_fn = void(*)(TensorIterator&);
+using shrink_fn = void (*)(TensorIteratorBase&, const c10::Scalar&);
+using softshrink_fn = void (*)(TensorIteratorBase&, const c10::Scalar&);
+using shrink_backward_fn = void (*)(TensorIteratorBase&, const c10::Scalar&);
+using elu_fn = void (*)(TensorIteratorBase&, const c10::Scalar&, const c10::Scalar&, const c10::Scalar&);
+using elu_backward_fn = void (*)(TensorIteratorBase&, const c10::Scalar&, const c10::Scalar&, const c10::Scalar&, bool);
+using leaky_relu_fn = void (*)(TensorIteratorBase&, const c10::Scalar&);
+using leaky_relu_backward_fn = void (*)(TensorIteratorBase&, const c10::Scalar&);
+using log_sigmoid_cpu_fn = void (*)(TensorBase&, TensorBase&, const TensorBase&);
+using gelu_fn = void (*)(TensorIteratorBase&, GeluType);
+using gelu_backward_fn = void (*)(TensorIteratorBase&, GeluType);
+using glu_jvp_fn = void (*)(TensorIteratorBase&);
+
+DECLARE_DISPATCH(elu_fn, elu_stub);
+DECLARE_DISPATCH(elu_backward_fn, elu_backward_stub);
+DECLARE_DISPATCH(softplus_fn, softplus_stub);
+DECLARE_DISPATCH(softplus_backward_fn, softplus_backward_stub);
+DECLARE_DISPATCH(log_sigmoid_cpu_fn, log_sigmoid_cpu_stub);
+DECLARE_DISPATCH(activation_backward_fn, log_sigmoid_backward_stub);
+DECLARE_DISPATCH(threshold_fn, threshold_stub);
+DECLARE_DISPATCH(gelu_fn, GeluKernel);
+DECLARE_DISPATCH(gelu_backward_fn, GeluBackwardKernel);
+DECLARE_DISPATCH(hardtanh_backward_fn, hardtanh_backward_stub);
+DECLARE_DISPATCH(hardsigmoid_fn, hardsigmoid_stub);
+DECLARE_DISPATCH(hardsigmoid_backward_fn, hardsigmoid_backward_stub);
+DECLARE_DISPATCH(hardswish_fn, hardswish_stub);
+DECLARE_DISPATCH(hardswish_backward_fn, hardswish_backward_stub);
+DECLARE_DISPATCH(shrink_fn, hardshrink_stub);
+DECLARE_DISPATCH(softshrink_fn, softshrink_stub);
+DECLARE_DISPATCH(shrink_backward_fn, shrink_backward_stub);
+DECLARE_DISPATCH(leaky_relu_fn, leaky_relu_stub);
+DECLARE_DISPATCH(leaky_relu_backward_fn, leaky_relu_backward_stub);
+DECLARE_DISPATCH(structured_activation_fn, glu_stub);
+DECLARE_DISPATCH(activation_backward_fn, glu_backward_stub);
+DECLARE_DISPATCH(glu_jvp_fn, glu_jvp_stub);
+DECLARE_DISPATCH(structured_activation_fn, silu_stub);
+DECLARE_DISPATCH(structured_activation_backward_fn, silu_backward_stub);
+DECLARE_DISPATCH(structured_activation_fn, mish_stub);
+DECLARE_DISPATCH(activation_backward_fn, mish_backward_stub);
+DECLARE_DISPATCH(activation_fn, prelu_stub);
+DECLARE_DISPATCH(activation_backward_fn, prelu_backward_stub);
+
+} // namespace at::native
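A quick illustration of the string-to-enum helpers above:

    auto gelu_type = at::native::get_gelutype_enum("tanh");     // GeluType::Tanh
    auto name      = at::native::gelutype_to_string(gelu_type); // "tanh"
    // Any other string (e.g. "fast") fails the TORCH_CHECK with
    // "approximate argument must be either none or tanh."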
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/AdaptivePooling.h b/MLPY/Lib/site-packages/torch/include/ATen/native/AdaptivePooling.h
new file mode 100644
index 0000000000000000000000000000000000000000..539a08ffee79c87028f10d7d71076a91006ddff2
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/AdaptivePooling.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace at::native {
+
+using adaptive_avg_pooling_fn = void(*)(Tensor& output, const Tensor& input, IntArrayRef output_size);
+using adaptive_avg_pooling_backward_fn = void(*)(Tensor& grad_input, const Tensor& grad_output);
+DECLARE_DISPATCH(adaptive_avg_pooling_fn, adaptive_avg_pool2d_kernel);
+DECLARE_DISPATCH(adaptive_avg_pooling_backward_fn, adaptive_avg_pool2d_backward_kernel);
+
+using adaptive_max_pooling_fn = void(*)(const Tensor& output, const Tensor& indices, const Tensor& input, IntArrayRef output_size);
+using adaptive_max_pooling_backward_fn = void(*)(const Tensor& grad_input, const Tensor& grad_output, const Tensor& indices);
+DECLARE_DISPATCH(adaptive_max_pooling_fn, adaptive_max_pool2d_kernel);
+DECLARE_DISPATCH(adaptive_max_pooling_backward_fn, adaptive_max_pool2d_backward_kernel);
+
+static inline int64_t start_index(int64_t a, int64_t b, int64_t c) {
+  return (a / b) * c + ((a % b) * c) / b;
+}
+
+static inline int64_t end_index(int64_t a, int64_t b, int64_t c) {
+  return 1 + ((a + 1) * c - 1) / b;
+}
+
+static inline void adaptive_pool_empty_output_check(const Tensor& gradOutput_, const char* arg_name) {
+  int64_t ndim = gradOutput_.ndimension();
+  for (const auto i : c10::irange(1, ndim)) {
+    TORCH_CHECK(gradOutput_.size(i) > 0,
+      arg_name, "(): Expected grad_output to have non-zero size for non-batch dimensions, "
+      "but grad_output has sizes ", gradOutput_.sizes(), " with dimension ", i,
+      " being empty");
+  }
+}
+
+} // namespace at::native
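To make the bin arithmetic of start_index/end_index concrete, a small worked example: adaptively pooling an input of length 5 down to 3 outputs yields overlapping half-open bins.

    // Same integer arithmetic as the helpers above, evaluated for b = 3 (output size), c = 5 (input size):
    //   a = 0: start = (0/3)*5 + ((0%3)*5)/3 = 0,  end = 1 + (1*5 - 1)/3 = 2  -> [0, 2)
    //   a = 1: start = (1/3)*5 + ((1%3)*5)/3 = 1,  end = 1 + (2*5 - 1)/3 = 4  -> [1, 4)
    //   a = 2: start = (2/3)*5 + ((2%3)*5)/3 = 3,  end = 1 + (3*5 - 1)/3 = 5  -> [3, 5)
    // Bins overlap whenever the input size is not an integer multiple of the output size.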
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/AmpKernels.h b/MLPY/Lib/site-packages/torch/include/ATen/native/AmpKernels.h
new file mode 100644
index 0000000000000000000000000000000000000000..6bfd41885067671998aabe0bd178816c05325b1e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/AmpKernels.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#include <ATen/core/ATen_fwd.h>
+#include <ATen/native/DispatchStub.h>
+
+namespace at {
+class Tensor;
+
+namespace native {
+
+using _amp_foreach_non_finite_check_and_unscale_cpu__fn = void (*)(
+    TensorList,
+    Tensor&,
+    const Tensor&);
+
+using _amp_update_scale_cpu__fn = Tensor& (*)(
+    Tensor&,
+    Tensor&,
+    const Tensor&,
+    double,
+    double,
+    int64_t);
+
+DECLARE_DISPATCH(_amp_foreach_non_finite_check_and_unscale_cpu__fn, _amp_foreach_non_finite_check_and_unscale_cpu_stub);
+DECLARE_DISPATCH(_amp_update_scale_cpu__fn, _amp_update_scale_cpu_stub);
+
+} // namespace native
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/BatchLinearAlgebra.h b/MLPY/Lib/site-packages/torch/include/ATen/native/BatchLinearAlgebra.h
new file mode 100644
index 0000000000000000000000000000000000000000..67b1c18d24e8b3e1f371020ac0fadf239fe37044
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/BatchLinearAlgebra.h
@@ -0,0 +1,321 @@
+#pragma once
+
+#include <c10/util/Optional.h>
+#include <c10/util/string_view.h>
+#include <ATen/Config.h>
+#include <ATen/native/DispatchStub.h>
+
+// Forward declare TI
+namespace at {
+class Tensor;
+struct TensorIterator;
+
+namespace native {
+enum class TransposeType;
+}
+
+}
+
+namespace at::native {
+
+enum class LapackLstsqDriverType : int64_t { Gels, Gelsd, Gelsy, Gelss};
+
+#if AT_BUILD_WITH_LAPACK()
+// Define per-batch functions to be used in the implementation of batched
+// linear algebra operations
+
+template <class scalar_t>
+void lapackCholesky(char uplo, int n, scalar_t *a, int lda, int *info);
+
+template <class scalar_t>
+void lapackCholeskyInverse(char uplo, int n, scalar_t *a, int lda, int *info);
+
+template <class scalar_t, class value_t = scalar_t>
+void lapackEig(char jobvl, char jobvr, int n, scalar_t *a, int lda, scalar_t *w, scalar_t* vl, int ldvl, scalar_t *vr, int ldvr, scalar_t *work, int lwork, value_t *rwork, int *info);
+
+template <class scalar_t>
+void lapackGeqrf(int m, int n, scalar_t *a, int lda, scalar_t *tau, scalar_t *work, int lwork, int *info);
+
+template <class scalar_t>
+void lapackOrgqr(int m, int n, int k, scalar_t *a, int lda, scalar_t *tau, scalar_t *work, int lwork, int *info);
+
+template <class scalar_t>
+void lapackOrmqr(char side, char trans, int m, int n, int k, scalar_t *a, int lda, scalar_t *tau, scalar_t *c, int ldc, scalar_t *work, int lwork, int *info);
+
+template <class scalar_t, class value_t = scalar_t>
+void lapackSyevd(char jobz, char uplo, int n, scalar_t* a, int lda, value_t* w, scalar_t* work, int lwork, value_t* rwork, int lrwork, int* iwork, int liwork, int* info);
+
+template <class scalar_t>
+void lapackGels(char trans, int m, int n, int nrhs,
+    scalar_t *a, int lda, scalar_t *b, int ldb,
+    scalar_t *work, int lwork, int *info);
+
+template <class scalar_t, class value_t = scalar_t>
+void lapackGelsd(int m, int n, int nrhs,
+    scalar_t *a, int lda, scalar_t *b, int ldb,
+    value_t *s, value_t rcond, int *rank,
+    scalar_t* work, int lwork,
+    value_t *rwork, int* iwork, int *info);
+
+template <class scalar_t, class value_t = scalar_t>
+void lapackGelsy(int m, int n, int nrhs,
+    scalar_t *a, int lda, scalar_t *b, int ldb,
+    int *jpvt, value_t rcond, int *rank,
+    scalar_t *work, int lwork, value_t* rwork, int *info);
+
+template <class scalar_t, class value_t = scalar_t>
+void lapackGelss(int m, int n, int nrhs,
+    scalar_t *a, int lda, scalar_t *b, int ldb,
+    value_t *s, value_t rcond, int *rank,
+    scalar_t *work, int lwork,
+    value_t *rwork, int *info);
+
+template <LapackLstsqDriverType, class scalar_t, class value_t = scalar_t>
+struct lapackLstsq_impl;
+
+template <class scalar_t, class value_t>
+struct lapackLstsq_impl<LapackLstsqDriverType::Gels, scalar_t, value_t> {
+  static void call(
+      char trans, int m, int n, int nrhs,
+      scalar_t *a, int lda, scalar_t *b, int ldb,
+      scalar_t *work, int lwork, int *info, // Gels flavor
+      int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor
+      value_t *s, // Gelss flavor
+      int *iwork // Gelsd flavor
+      ) {
+    lapackGels(
+        trans, m, n, nrhs,
+        a, lda, b, ldb,
+        work, lwork, info);
+  }
+};
+
+template <class scalar_t, class value_t>
+struct lapackLstsq_impl<LapackLstsqDriverType::Gelsy, scalar_t, value_t> {
+  static void call(
+      char trans, int m, int n, int nrhs,
+      scalar_t *a, int lda, scalar_t *b, int ldb,
+      scalar_t *work, int lwork, int *info, // Gels flavor
+      int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor
+      value_t *s, // Gelss flavor
+      int *iwork // Gelsd flavor
+      ) {
+    lapackGelsy(
+        m, n, nrhs,
+        a, lda, b, ldb,
+        jpvt, rcond, rank,
+        work, lwork, rwork, info);
+  }
+};
+
+template <class scalar_t, class value_t>
+struct lapackLstsq_impl<LapackLstsqDriverType::Gelsd, scalar_t, value_t> {
+  static void call(
+      char trans, int m, int n, int nrhs,
+      scalar_t *a, int lda, scalar_t *b, int ldb,
+      scalar_t *work, int lwork, int *info, // Gels flavor
+      int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor
+      value_t *s, // Gelss flavor
+      int *iwork // Gelsd flavor
+      ) {
+    lapackGelsd(
+        m, n, nrhs,
+        a, lda, b, ldb,
+        s, rcond, rank,
+        work, lwork,
+        rwork, iwork, info);
+  }
+};
+
+template <class scalar_t, class value_t>
+struct lapackLstsq_impl<LapackLstsqDriverType::Gelss, scalar_t, value_t> {
+  static void call(
+      char trans, int m, int n, int nrhs,
+      scalar_t *a, int lda, scalar_t *b, int ldb,
+      scalar_t *work, int lwork, int *info, // Gels flavor
+      int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor
+      value_t *s, // Gelss flavor
+      int *iwork // Gelsd flavor
+      ) {
+    lapackGelss(
+        m, n, nrhs,
+        a, lda, b, ldb,
+        s, rcond, rank,
+        work, lwork,
+        rwork, info);
+  }
+};
+
+template <LapackLstsqDriverType driver_type, class scalar_t, class value_t = scalar_t>
+void lapackLstsq(
+    char trans, int m, int n, int nrhs,
+    scalar_t *a, int lda, scalar_t *b, int ldb,
+    scalar_t *work, int lwork, int *info, // Gels flavor
+    int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor
+    value_t *s, // Gelss flavor
+    int *iwork // Gelsd flavor
+    ) {
+  lapackLstsq_impl<driver_type, scalar_t, value_t>::call(
+      trans, m, n, nrhs,
+      a, lda, b, ldb,
+      work, lwork, info,
+      jpvt, rcond, rank, rwork,
+      s,
+      iwork);
+}
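+
+// Usage sketch (illustrative only): the driver tag selects the LAPACK routine at
+// compile time, e.g. lapackLstsq<LapackLstsqDriverType::Gelsd, double> forwards
+// to lapackGelsd, while each specialization simply ignores the arguments that
+// only matter for the other drivers (jpvt/rwork for Gelsy, s for Gelss, iwork
+// for Gelsd, and so on).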
+
+template <class scalar_t>
+void lapackLuSolve(char trans, int n, int nrhs, scalar_t *a, int lda, int *ipiv, scalar_t *b, int ldb, int *info);
+
+template <class scalar_t>
+void lapackLu(int m, int n, scalar_t *a, int lda, int *ipiv, int *info);
+
+template <class scalar_t>
+void lapackLdlHermitian(
+    char uplo,
+    int n,
+    scalar_t* a,
+    int lda,
+    int* ipiv,
+    scalar_t* work,
+    int lwork,
+    int* info);
+
+template <class scalar_t>
+void lapackLdlSymmetric(
+    char uplo,
+    int n,
+    scalar_t* a,
+    int lda,
+    int* ipiv,
+    scalar_t* work,
+    int lwork,
+    int* info);
+
+template <class scalar_t>
+void lapackLdlSolveHermitian(
+    char uplo,
+    int n,
+    int nrhs,
+    scalar_t* a,
+    int lda,
+    int* ipiv,
+    scalar_t* b,
+    int ldb,
+    int* info);
+
+template <class scalar_t>
+void lapackLdlSolveSymmetric(
+    char uplo,
+    int n,
+    int nrhs,
+    scalar_t* a,
+    int lda,
+    int* ipiv,
+    scalar_t* b,
+    int ldb,
+    int* info);
+
+template <class scalar_t, class value_t = scalar_t>
+void lapackSvd(char jobz, int m, int n, scalar_t *a, int lda, value_t *s, scalar_t *u, int ldu, scalar_t *vt, int ldvt, scalar_t *work, int lwork, value_t *rwork, int *iwork, int *info);
+#endif
+
+#if AT_BUILD_WITH_BLAS()
+template <class scalar_t>
+void blasTriangularSolve(char side, char uplo, char trans, char diag, int n, int nrhs, scalar_t* a, int lda, scalar_t* b, int ldb);
+#endif
+
+using cholesky_fn = void (*)(const Tensor& /*input*/, const Tensor& /*info*/, bool /*upper*/);
+DECLARE_DISPATCH(cholesky_fn, cholesky_stub);
+
+using cholesky_inverse_fn = Tensor& (*)(Tensor& /*result*/, Tensor& /*infos*/, bool /*upper*/);
+
+DECLARE_DISPATCH(cholesky_inverse_fn, cholesky_inverse_stub);
+
+using linalg_eig_fn = void (*)(Tensor& /*eigenvalues*/, Tensor& /*eigenvectors*/, Tensor& /*infos*/, const Tensor& /*input*/, bool /*compute_eigenvectors*/);
+
+DECLARE_DISPATCH(linalg_eig_fn, linalg_eig_stub);
+
+using geqrf_fn = void (*)(const Tensor& /*input*/, const Tensor& /*tau*/);
+DECLARE_DISPATCH(geqrf_fn, geqrf_stub);
+
+using orgqr_fn = Tensor& (*)(Tensor& /*result*/, const Tensor& /*tau*/);
+DECLARE_DISPATCH(orgqr_fn, orgqr_stub);
+
+using ormqr_fn = void (*)(const Tensor& /*input*/, const Tensor& /*tau*/, const Tensor& /*other*/, bool /*left*/, bool /*transpose*/);
+DECLARE_DISPATCH(ormqr_fn, ormqr_stub);
+
+using linalg_eigh_fn = void (*)(
+    const Tensor& /*eigenvalues*/,
+    const Tensor& /*eigenvectors*/,
+    const Tensor& /*infos*/,
+    bool /*upper*/,
+    bool /*compute_eigenvectors*/);
+DECLARE_DISPATCH(linalg_eigh_fn, linalg_eigh_stub);
+
+using lstsq_fn = void (*)(
+    const Tensor& /*a*/,
+    Tensor& /*b*/,
+    Tensor& /*rank*/,
+    Tensor& /*singular_values*/,
+    Tensor& /*infos*/,
+    double /*rcond*/,
+    std::string /*driver_name*/);
+DECLARE_DISPATCH(lstsq_fn, lstsq_stub);
+
+using triangular_solve_fn = void (*)(
+    const Tensor& /*A*/,
+    const Tensor& /*B*/,
+    bool /*left*/,
+    bool /*upper*/,
+    TransposeType /*transpose*/,
+    bool /*unitriangular*/);
+DECLARE_DISPATCH(triangular_solve_fn, triangular_solve_stub);
+
+using lu_factor_fn = void (*)(
+    const Tensor& /*input*/,
+    const Tensor& /*pivots*/,
+    const Tensor& /*infos*/,
+    bool /*compute_pivots*/);
+DECLARE_DISPATCH(lu_factor_fn, lu_factor_stub);
+
+using unpack_pivots_fn = void(*)(
+  TensorIterator& iter,
+  const int64_t dim_size,
+  const int64_t max_pivot);
+DECLARE_DISPATCH(unpack_pivots_fn, unpack_pivots_stub);
+
+using lu_solve_fn = void (*)(
+    const Tensor& /*LU*/,
+    const Tensor& /*pivots*/,
+    const Tensor& /*B*/,
+    TransposeType /*trans*/);
+DECLARE_DISPATCH(lu_solve_fn, lu_solve_stub);
+
+using ldl_factor_fn = void (*)(
+    const Tensor& /*LD*/,
+    const Tensor& /*pivots*/,
+    const Tensor& /*info*/,
+    bool /*upper*/,
+    bool /*hermitian*/);
+DECLARE_DISPATCH(ldl_factor_fn, ldl_factor_stub);
+
+using svd_fn = void (*)(
+    const Tensor& /*A*/,
+    const bool /*full_matrices*/,
+    const bool /*compute_uv*/,
+    const c10::optional<c10::string_view>& /*driver*/,
+    const Tensor& /*U*/,
+    const Tensor& /*S*/,
+    const Tensor& /*Vh*/,
+    const Tensor& /*info*/);
+DECLARE_DISPATCH(svd_fn, svd_stub);
+
+using ldl_solve_fn = void (*)(
+    const Tensor& /*LD*/,
+    const Tensor& /*pivots*/,
+    const Tensor& /*result*/,
+    bool /*upper*/,
+    bool /*hermitian*/);
+DECLARE_DISPATCH(ldl_solve_fn, ldl_solve_stub);
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/BinaryOps.h b/MLPY/Lib/site-packages/torch/include/ATen/native/BinaryOps.h
new file mode 100644
index 0000000000000000000000000000000000000000..ae39cea88579168b97712d6ffc9b57f92d612543
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/BinaryOps.h
@@ -0,0 +1,119 @@
+#pragma once
+
+#include <ATen/core/TensorBase.h>
+#include <ATen/native/DispatchStub.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/ScalarType.h>
+
+
+namespace at {
+struct TensorIterator;
+struct TensorIteratorBase;
+}
+
+namespace at::native {
+
+inline void alpha_check(const ScalarType dtype, const Scalar& alpha) {
+  TORCH_CHECK(! alpha.isBoolean() || dtype == ScalarType::Bool,
+              "Boolean alpha only supported for Boolean results.");
+  TORCH_CHECK(isFloatingType(dtype) || isComplexType(dtype)
+              || alpha.isIntegral(true),
+              "For integral input tensors, argument alpha must not be a floating point number.");
+  TORCH_CHECK(isComplexType(dtype) || !alpha.isComplex(),
+              "For non-complex input tensors, argument alpha must not be a complex number.")
+}
+
+// Basic checking for all sub functions.
+inline void sub_check(const TensorBase& self, const TensorBase& other) {
+  TORCH_CHECK(self.scalar_type() != kBool || other.scalar_type() != kBool,
+              "Subtraction, the `-` operator, with two bool tensors is not supported. "
+              "Use the `^` or `logical_xor()` operator instead.")
+  TORCH_CHECK(self.scalar_type() != kBool && other.scalar_type() != kBool,
+              "Subtraction, the `-` operator, with a bool tensor is not supported. "
+              "If you are trying to invert a mask, use the `~` or `logical_not()` operator instead.");
+}
+
+inline void sub_check(const TensorBase& self, const Scalar& scalar) {
+  TORCH_CHECK(self.scalar_type() != kBool || !scalar.isBoolean(),
+              "Subtraction, the `-` operator, with two bool tensors is not supported. "
+              "Use the `^` or `logical_xor()` operator instead.")
+  TORCH_CHECK(self.scalar_type() != kBool && !scalar.isBoolean(),
+              "Subtraction, the `-` operator, with a bool tensor is not supported. "
+              "If you are trying to invert a mask, use the `~` or `logical_not()` operator instead.");
+}
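+
+// Behaviour of the checks above, illustrated (assumed example, C++ frontend
+// syntax): for a kBool tensor `mask`,
+//
+//   // mask - mask  -> error, pointing at `^` / logical_xor()
+//   // mask - 1     -> error, pointing at `~` / logical_not() for mask inversion
+//   auto flipped = ~mask;   // the supported way to invert a boolean mask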
+
+using structured_binary_fn_alpha = void(*)(TensorIteratorBase&, const Scalar& alpha);
+using structured_binary_fn_double = void(*)(TensorIteratorBase&, double);
+using structured_binary_fn = void(*)(TensorIteratorBase&);
+
+using binary_fn_alpha = void(*)(TensorIteratorBase&, const Scalar& alpha);
+using binary_fn_double = void(*)(TensorIterator&, double);
+using binary_fn = void(*)(TensorIterator&);
+using binary_clamp_fn_alpha =
+    void(*)(TensorIterator&, const Scalar& alpha, const Scalar& min_val, const Scalar& max_val);
+
+// NB: codegenned
+DECLARE_DISPATCH(structured_binary_fn_alpha, add_stub);
+
+DECLARE_DISPATCH(binary_clamp_fn_alpha, add_clamp_stub);
+DECLARE_DISPATCH(structured_binary_fn_alpha, sub_stub);
+DECLARE_DISPATCH(structured_binary_fn, mul_stub);
+DECLARE_DISPATCH(structured_binary_fn, div_true_stub);
+DECLARE_DISPATCH(structured_binary_fn, div_floor_stub);
+DECLARE_DISPATCH(structured_binary_fn, div_trunc_stub);
+DECLARE_DISPATCH(structured_binary_fn, atan2_stub);
+DECLARE_DISPATCH(structured_binary_fn, remainder_stub);
+DECLARE_DISPATCH(structured_binary_fn, bitwise_and_stub);
+DECLARE_DISPATCH(structured_binary_fn, bitwise_or_stub);
+DECLARE_DISPATCH(structured_binary_fn, bitwise_xor_stub);
+DECLARE_DISPATCH(structured_binary_fn, lshift_stub);
+DECLARE_DISPATCH(structured_binary_fn, rshift_stub);
+DECLARE_DISPATCH(binary_fn, logical_xor_stub);
+DECLARE_DISPATCH(binary_fn, logical_and_stub);
+DECLARE_DISPATCH(binary_fn, logical_or_stub);
+DECLARE_DISPATCH(structured_binary_fn, lt_stub);
+DECLARE_DISPATCH(structured_binary_fn, le_stub);
+DECLARE_DISPATCH(structured_binary_fn, gt_stub);
+DECLARE_DISPATCH(structured_binary_fn, ge_stub);
+DECLARE_DISPATCH(structured_binary_fn, eq_stub);
+DECLARE_DISPATCH(structured_binary_fn, ne_stub);
+DECLARE_DISPATCH(binary_fn, max_elementwise_stub);
+DECLARE_DISPATCH(binary_fn, min_elementwise_stub);
+DECLARE_DISPATCH(structured_binary_fn, maximum_stub);
+DECLARE_DISPATCH(structured_binary_fn, minimum_stub);
+DECLARE_DISPATCH(structured_binary_fn, fmax_stub);
+DECLARE_DISPATCH(structured_binary_fn, fmin_stub);
+DECLARE_DISPATCH(structured_binary_fn_double, smooth_l1_stub);
+DECLARE_DISPATCH(binary_fn_double, huber_stub);
+DECLARE_DISPATCH(structured_binary_fn, sigmoid_backward_stub);
+DECLARE_DISPATCH(binary_fn_alpha, logit_backward_stub);
+DECLARE_DISPATCH(structured_binary_fn, tanh_backward_stub);
+DECLARE_DISPATCH(structured_binary_fn, mse_stub);
+DECLARE_DISPATCH(structured_binary_fn, fmod_stub);
+DECLARE_DISPATCH(structured_binary_fn, logaddexp_stub);
+DECLARE_DISPATCH(structured_binary_fn, logaddexp2_stub);
+DECLARE_DISPATCH(structured_binary_fn, gcd_stub);
+DECLARE_DISPATCH(structured_binary_fn, lcm_stub);
+DECLARE_DISPATCH(structured_binary_fn, hypot_stub);
+DECLARE_DISPATCH(structured_binary_fn, igamma_stub);
+DECLARE_DISPATCH(structured_binary_fn, igammac_stub);
+DECLARE_DISPATCH(structured_binary_fn, nextafter_stub);
+DECLARE_DISPATCH(structured_binary_fn, heaviside_stub);
+DECLARE_DISPATCH(structured_binary_fn, copysign_stub);
+DECLARE_DISPATCH(structured_binary_fn, xlogy_stub);
+DECLARE_DISPATCH(structured_binary_fn, xlog1py_stub);
+DECLARE_DISPATCH(structured_binary_fn, zeta_stub);
+DECLARE_DISPATCH(structured_binary_fn, chebyshev_polynomial_t_stub);
+DECLARE_DISPATCH(structured_binary_fn, chebyshev_polynomial_u_stub);
+DECLARE_DISPATCH(structured_binary_fn, chebyshev_polynomial_v_stub);
+DECLARE_DISPATCH(structured_binary_fn, chebyshev_polynomial_w_stub);
+DECLARE_DISPATCH(structured_binary_fn, hermite_polynomial_h_stub);
+DECLARE_DISPATCH(structured_binary_fn, hermite_polynomial_he_stub);
+DECLARE_DISPATCH(structured_binary_fn, laguerre_polynomial_l_stub);
+DECLARE_DISPATCH(structured_binary_fn, legendre_polynomial_p_stub);
+DECLARE_DISPATCH(structured_binary_fn, shifted_chebyshev_polynomial_t_stub);
+DECLARE_DISPATCH(structured_binary_fn, shifted_chebyshev_polynomial_u_stub);
+DECLARE_DISPATCH(structured_binary_fn, shifted_chebyshev_polynomial_v_stub);
+DECLARE_DISPATCH(structured_binary_fn, shifted_chebyshev_polynomial_w_stub);
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/BucketizationUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/BucketizationUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..7c6f0599b03234ce674fff05531ba4f97450b3a4
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/BucketizationUtils.h
@@ -0,0 +1,173 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+#include <ATen/native/TypeProperties.h>
+#include <ATen/ScalarOps.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#else
+#include <ATen/ops/result_type_native.h>
+#endif
+
+namespace at::native {
+
+// original values given by raw_*. If an original value is not contiguous, will make a contiguous copy to
+// the corresponding trimmed_* value. Additionally, if the dtypes of the boundary and input tensor do not
+// match, will change them to be a common super type so comparisons are done between the same types.
+// For any trimmed_* tensor, if its outgoing value matches what it was incoming (typically null), then the
+// corresponding raw_* version should be used since it was already contiguous of the right type.
+inline void searchsorted_maybe_trim_input_tensors(
+    Tensor& trimmed_input,
+    Tensor& trimmed_boundaries,
+    Tensor& trimmed_sorter,
+    const Tensor& raw_input,
+    const Tensor& raw_boundaries,
+    const Tensor& raw_sorter) {
+  bool in_is_contiguous = raw_input.is_contiguous();
+  bool bd_is_contiguous = raw_boundaries.is_contiguous();
+  bool sort_is_contiguous = raw_sorter.is_contiguous();
+
+  if (!in_is_contiguous) {
+    TORCH_WARN_ONCE("torch.searchsorted(): input value tensor is non-contiguous, this will lower the performance due "
+      "to extra data copy when converting non-contiguous tensor to contiguous, please use contiguous input value "
+      "tensor if possible. This message will only appear once per program.");
+    trimmed_input = raw_input.contiguous();
+  }
+  if (!bd_is_contiguous) {
+    TORCH_WARN_ONCE("torch.searchsorted(): boundary tensor is non-contiguous, this will lower the performance due "
+      "to extra data copy when converting non-contiguous tensor to contiguous, please use contiguous boundary "
+      "tensor if possible. This message will only appear once per program.");
+    trimmed_boundaries = raw_boundaries.contiguous();
+  }
+  if (!sort_is_contiguous) {
+    TORCH_WARN_ONCE("torch.searchsorted(): sorter tensor is non-contiguous, this will lower the performance due "
+      "to extra data copy when converting non-contiguous tensor to contiguous, please use contiguous sorter "
+      "tensor if possible. This message will only appear once per program.");
+    trimmed_sorter = raw_sorter.contiguous();
+  }
+  if (raw_input.dtype() != raw_boundaries.dtype()) {
+    at::native::ResultTypeState state = {};
+    state = at::native::update_result_type_state(raw_boundaries, state);
+    state = at::native::update_result_type_state(raw_input, state);
+    ScalarType common_stype = at::native::result_type(state);
+
+    TORCH_INTERNAL_ASSERT(common_stype != ScalarType::Undefined);
+    if (common_stype != raw_input.scalar_type()) {
+      trimmed_input = in_is_contiguous ? raw_input.to(common_stype) : trimmed_input.to(common_stype);
+    }
+    if (common_stype != raw_boundaries.scalar_type()) {
+      trimmed_boundaries = bd_is_contiguous ? raw_boundaries.to(common_stype) : trimmed_boundaries.to(common_stype);
+    }
+  }
+}
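+
+// Typical caller pattern (illustrative sketch, not part of this header): declare
+// empty "trimmed" tensors, let the helper populate them only when a copy or cast
+// is actually required, and otherwise keep using the raw inputs:
+//
+//   Tensor trimmed_in, trimmed_bd, trimmed_sort;
+//   searchsorted_maybe_trim_input_tensors(
+//       trimmed_in, trimmed_bd, trimmed_sort, raw_in, raw_bd, raw_sort);
+//   const Tensor& in = trimmed_in.defined() ? trimmed_in : raw_in;
+//   const Tensor& bd = trimmed_bd.defined() ? trimmed_bd : raw_bd;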
+
+/* unused but needed for internal jagged tensor class */
+inline void searchsorted_maybe_trim_input_tensors(
+    Tensor& trimmed_input,
+    Tensor& trimmed_boundaries,
+    const Tensor& raw_input,
+    const Tensor& raw_boundaries) {
+  Tensor trimmed_sorter;
+  Tensor raw_sorter;
+  return searchsorted_maybe_trim_input_tensors(
+      trimmed_input,
+      trimmed_boundaries,
+      trimmed_sorter,
+      raw_input,
+      raw_boundaries,
+      raw_sorter);
+}
+
+inline bool searchsorted_dims_matched_before_last_dim(const Tensor& boundaries, const Tensor& input) {
+  if (boundaries.dim() != input.dim()) {
+    return false;
+  }
+  const auto& dims_bd = boundaries.sizes();
+  const auto& dims_in = input.sizes();
+  for (int64_t dim = 0; dim + 1 < boundaries.dim(); ++dim) {
+    if (dims_bd[dim] != dims_in[dim]) {
+      return false;
+    }
+  }
+  return true;
+}
+
+inline Tensor searchsorted_scalar_tensor(const Scalar& scalar, const c10::Device& device) {
+  auto tensor = c10::scalar_to_tensor(scalar, device);
+  // This is to adopt the scalar promotion rules defined in native/TypeProperties.h
+  // So we have the same type promotion rules as binary operations.
+  tensor.unsafeGetTensorImpl()->set_wrapped_number(true);
+  return tensor;
+}
+
+inline void searchsorted_pre_check(
+    const Tensor& boundaries,
+    const Tensor& input,
+    const Tensor& output,
+    const bool out_int32,
+    const bool right,
+    const c10::optional<c10::string_view> side_opt,
+    const Tensor& sorter) {
+  if (side_opt) {
+    const c10::string_view side = *side_opt;
+    TORCH_CHECK(side == "left" || side == "right", "torch.searchsorted(): side can only be 'left' or 'right' but ",
+      "got ", side);
+
+    // assume the user has not explicitly set (right=False, side="right")
+    TORCH_CHECK(!right || side == "right", "torch.searchsorted(): side and right can't be set to opposites, got side "
+    "of ", side, " while right was True");
+  }
+
+  TORCH_CHECK(boundaries.device() == input.device(), "torch.searchsorted(): boundaries and input value tensors ",
+    "should have same device type, but got boundaries tensor device type ", boundaries.device(), " and input value ",
+    "tensor device type ", input.device());
+
+  if (sorter.defined()) {
+    TORCH_CHECK(sorter.device() == boundaries.device(), "torch.searchsorted(): sorter and boundary tensors should ",
+      "have same device type, but got sorter tensor device type ", sorter.device(), " and input value tensor ",
+      "device type ", boundaries.device());
+
+    TORCH_CHECK(sorter.sizes() == boundaries.sizes(), "torch.searchsorted(): boundary and sorter must have the same "
+      "size, but got boundary tensor ", boundaries.sizes(), "and got sorter tensor ", sorter.sizes());
+
+    TORCH_CHECK(sorter.scalar_type() == ScalarType::Long, "torch.searchsorted(): sorter must be a tensor of long ",
+      "dtype but got dtype ", sorter.scalar_type());
+
+    if (sorter.numel() > 0) {
+      auto minmax = sorter.aminmax();
+      int64_t vmin = std::get<0>(minmax).item().toLong();
+      int64_t vmax = std::get<1>(minmax).item().toLong();
+      TORCH_CHECK(vmin >= 0 && vmax < sorter.sizes().back(), "torch.searchsorted(): sorter index out of range");
+    }
+  }
+
+  TORCH_CHECK(input.dim() > 0 || (input.dim() == 0 && input.numel() == 1 && boundaries.dim() == 1),
+    "torch.searchsorted(): input value can be a scalar only when boundaries tensor dimension is 1, but we got ",
+    "boundaries tensor dim(", boundaries.dim(), ") and input value's dim(", input.dim(), ") numel(",
+    input.numel(), ")");
+
+  TORCH_CHECK(boundaries.dim() != 0, "torch.searchsorted(): boundaries tensor should have positive dimension, but ",
+    "got 0 dimension");
+
+  TORCH_CHECK(boundaries.dim() == 1 || searchsorted_dims_matched_before_last_dim(boundaries, input),
+    "torch.searchsorted(): boundaries tensor should be 1 dimension or the first N-1 dimensions of boundaries tensor ",
+    "and input value tensor must match, but we got boundaries tensor ", boundaries.sizes(), " and input value tensor ",
+    input.sizes());
+
+  ScalarType output_dtype = output.scalar_type();
+  TORCH_CHECK(
+      (output_dtype == ScalarType::Long && !out_int32) ||
+          (output_dtype == ScalarType::Int && out_int32),
+      "torch.searchsorted(): output tensor's dtype is wrong, it can only be Int(int32) or Long(int64) depending on ",
+      "whether out_int32 flag is True, but we got output tensor's dtype ", output_dtype,
+      " and out_int32 flag is ", (out_int32 ? "True" : "False"));
+
+  if (out_int32) {
+    TORCH_CHECK(boundaries.sizes().back() < INT_MAX,
+      "torch.searchsorted(): the size of boundaries' last dimension should be less than ", INT_MAX, ", but we got ",
+      boundaries.sizes().back());
+  }
+}
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/CPUBlas.h b/MLPY/Lib/site-packages/torch/include/ATen/native/CPUBlas.h
new file mode 100644
index 0000000000000000000000000000000000000000..90a3fd28fb85b17cc75ca2cc04f9bdc947dfb2a2
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/CPUBlas.h
@@ -0,0 +1,189 @@
+#pragma once
+
+#include <ATen/OpMathType.h>
+#include <ATen/native/DispatchStub.h>
+#include <ATen/native/TransposeType.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/ScalarType.h>
+#include <c10/util/complex.h>
+
+namespace at::native::cpublas {
+
+namespace internal {
+void normalize_last_dims(
+  TransposeType transa, TransposeType transb,
+  int64_t m, int64_t n, int64_t k,
+  int64_t *lda, int64_t *ldb, int64_t *ldc);
+}  // namespace internal
+
+using gemm_fn = void(*)(
+    at::ScalarType type,
+    TransposeType transa, TransposeType transb,
+    int64_t m, int64_t n, int64_t k,
+    const Scalar& alpha,
+    const void *a, int64_t lda,
+    const void *b, int64_t ldb,
+    const Scalar& beta,
+    void *c, int64_t ldc);
+
+DECLARE_DISPATCH(gemm_fn, gemm_stub);
+
+template <typename scalar_t>
+void gemm(
+    TransposeType transa, TransposeType transb,
+    int64_t m, int64_t n, int64_t k,
+    at::opmath_type<scalar_t> alpha,
+    const scalar_t *a, int64_t lda,
+    const scalar_t *b, int64_t ldb,
+    at::opmath_type<scalar_t> beta,
+    scalar_t *c, int64_t ldc) {
+  internal::normalize_last_dims(transa, transb, m, n, k, &lda, &ldb, &ldc);
+  gemm_stub(
+    kCPU, c10::CppTypeToScalarType<scalar_t>::value,
+    transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+}
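+
+// Minimal usage sketch (illustrative; assumes the usual BLAS column-major
+// convention, so lda/ldb/ldc are the number of rows of A/B/C):
+//
+//   double A[6] = {1, 2, 3, 4, 5, 6};   // 2x3, column-major
+//   double B[6] = {1, 0, 0, 1, 0, 0};   // 3x2, column-major
+//   double C[4] = {0, 0, 0, 0};         // 2x2 result
+//   at::native::cpublas::gemm(
+//       TransposeType::NoTranspose, TransposeType::NoTranspose,
+//       /*m=*/2, /*n=*/2, /*k=*/3,
+//       /*alpha=*/1.0, A, /*lda=*/2, B, /*ldb=*/3,
+//       /*beta=*/0.0, C, /*ldc=*/2);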
+
+void gemm(
+    TransposeType transa, TransposeType transb,
+    int64_t m, int64_t n, int64_t k,
+    double alpha,
+    const double *a, int64_t lda,
+    const double *b, int64_t ldb,
+    double beta,
+    double *c, int64_t ldc);
+
+void gemm(
+    TransposeType transa, TransposeType transb,
+    int64_t m, int64_t n, int64_t k,
+    float alpha,
+    const float *a, int64_t lda,
+    const float *b, int64_t ldb,
+    float beta,
+    float *c, int64_t ldc);
+
+void gemm(
+    TransposeType transa, TransposeType transb,
+    int64_t m, int64_t n, int64_t k,
+    float alpha,
+    const at::BFloat16 *a, int64_t lda,
+    const at::BFloat16 *b, int64_t ldb,
+    float beta,
+    at::BFloat16 *c, int64_t ldc);
+
+void gemm(
+    TransposeType transa, TransposeType transb,
+    int64_t m, int64_t n, int64_t k,
+    const float alpha,
+    const at::BFloat16 *a, int64_t lda,
+    const at::BFloat16 *b, int64_t ldb,
+    const float beta,
+    float *c, int64_t ldc);
+
+void gemm(
+    TransposeType transa, TransposeType transb,
+    int64_t m, int64_t n, int64_t k,
+    float alpha,
+    const at::Half *a, int64_t lda,
+    const at::Half *b, int64_t ldb,
+    float beta,
+    at::Half *c, int64_t ldc);
+
+void gemm(
+    TransposeType transa, TransposeType transb,
+    int64_t m, int64_t n, int64_t k,
+    const float alpha,
+    const at::Half *a, int64_t lda,
+    const at::Half *b, int64_t ldb,
+    const float beta,
+    float *c, int64_t ldc);
+
+void gemm(
+    TransposeType transa, TransposeType transb,
+    int64_t m, int64_t n, int64_t k,
+    c10::complex<double> alpha,
+    const c10::complex<double> *a, int64_t lda,
+    const c10::complex<double> *b, int64_t ldb,
+    c10::complex<double> beta,
+    c10::complex<double> *c, int64_t ldc);
+
+void gemm(
+    TransposeType transa, TransposeType transb,
+    int64_t m, int64_t n, int64_t k,
+    c10::complex<float> alpha,
+    const c10::complex<float> *a, int64_t lda,
+    const c10::complex<float> *b, int64_t ldb,
+    c10::complex<float> beta,
+    c10::complex<float> *c, int64_t ldc);
+
+void gemm(
+    TransposeType transa, TransposeType transb,
+    int64_t m, int64_t n, int64_t k,
+    int64_t alpha,
+    const int64_t *a, int64_t lda,
+    const int64_t *b, int64_t ldb,
+    int64_t beta,
+    int64_t *c, int64_t ldc);
+
+template <typename scalar_t>
+void gemm_batched(
+    TransposeType transa, TransposeType transb,
+    int64_t batch_size, int64_t m, int64_t n, int64_t k,
+    scalar_t alpha,
+    const scalar_t * const *a, int64_t lda,
+    const scalar_t * const *b, int64_t ldb,
+    const scalar_t beta,
+    scalar_t * const *c, int64_t ldc);
+
+template <typename scalar_t>
+void gemm_batched_with_stride(
+    TransposeType transa, TransposeType transb,
+    int64_t batch_size, int64_t m, int64_t n, int64_t k,
+    scalar_t alpha,
+    const scalar_t *a, int64_t lda, int64_t batch_stride_a,
+    const scalar_t *b, int64_t ldb, int64_t batch_stride_b,
+    scalar_t beta,
+    scalar_t *c, int64_t ldc, int64_t batch_stride_c);
+
+using axpy_fn = void(*)(at::ScalarType type, int64_t n, const Scalar& a, const void *x, int64_t incx, void *y, int64_t incy);
+
+DECLARE_DISPATCH(axpy_fn, axpy_stub);
+
+template <typename scalar_t>
+void axpy(int64_t n, scalar_t a, const scalar_t *x, int64_t incx, scalar_t *y, int64_t incy){
+  if(n == 1)
+  {
+    incx = 1;
+    incy = 1;
+  }
+  axpy_stub(
+      kCPU, c10::CppTypeToScalarType<scalar_t>::value,
+      n, a, x, incx, y, incy);
+}
+
+void axpy(int64_t n, double a, const double *x, int64_t incx, double *y, int64_t incy);
+void axpy(int64_t n, float a, const float *x, int64_t incx, float *y, int64_t incy);
+void axpy(int64_t n, c10::complex<double> a, const c10::complex<double> *x, int64_t incx, c10::complex<double> *y, int64_t incy);
+void axpy(int64_t n, c10::complex<float> a, const c10::complex<float> *x, int64_t incx, c10::complex<float> *y, int64_t incy);
+
+using copy_fn = void(*)(at::ScalarType type, int64_t n, const void *x, int64_t incx, void *y, int64_t incy);
+
+DECLARE_DISPATCH(copy_fn, copy_stub);
+
+template <typename scalar_t>
+void copy(int64_t n, const scalar_t *x, int64_t incx, scalar_t *y, int64_t incy) {
+  if(n == 1)
+  {
+    incx = 1;
+    incy = 1;
+  }
+  copy_stub(
+      kCPU, c10::CppTypeToScalarType<scalar_t>::value,
+      n, x, incx, y, incy);
+}
+
+void copy(int64_t n, const double *x, int64_t incx, double *y, int64_t incy);
+void copy(int64_t n, const float *x, int64_t incx, float *y, int64_t incy);
+void copy(int64_t n, const c10::complex<double> *x, int64_t incx, c10::complex<double> *y, int64_t incy);
+void copy(int64_t n, const c10::complex<float> *x, int64_t incx, c10::complex<float> *y, int64_t incy);
+
+}  // namespace at::native::cpublas
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/CPUFallback.h b/MLPY/Lib/site-packages/torch/include/ATen/native/CPUFallback.h
new file mode 100644
index 0000000000000000000000000000000000000000..7554956f5ce889a08a30cf140fc2c53924868b17
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/CPUFallback.h
@@ -0,0 +1,45 @@
+#pragma once
+
+#include <ATen/core/ivalue.h>
+#include <ATen/core/stack.h>
+#include <ATen/core/boxing/KernelFunction.h>
+#include <ATen/core/dispatch/Dispatcher.h>
+#include <torch/library.h>
+#include <c10/util/Metaprogramming.h>
+
+namespace at::native {
+
+// This function implements a boxed fallback to CPU.
+// External backends can add their own custom logging on top of it to customize their own CPU fallbacks.
+TORCH_API void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool error_on_views = false);
+
+// This is a helper function that backends can use to directly call their boxed CPU fallback
+// TODO: update and add a usage example after https://github.com/pytorch/pytorch/pull/58092 lands.
+template<c10::KernelFunction::BoxedKernelFunction* fallback_fn, class Op, bool symint, class FuncType>
+struct _call_fallback_fn final {};
+
+template<c10::KernelFunction::BoxedKernelFunction* fallback_fn, class Op, bool symint, class ReturnType, class... ParameterTypes>
+struct _call_fallback_fn<fallback_fn, Op, symint, ReturnType(ParameterTypes...)> final {
+    static ReturnType call(typename c10::maybe_keep_symint<symint, ParameterTypes>::type... args) {
+        auto op = c10::Dispatcher::singleton()
+            // TODO: figure out how to make compiler happy without dynamic casts
+            .findSchemaOrThrow((const char*) Op::name, (const char*) Op::overload_name)
+            //.findSchemaOrThrow("a", "b")
+            .typed<ReturnType (typename c10::maybe_keep_symint<symint, ParameterTypes>::type...)>();
+        return c10::impl::BoxedKernelWrapper<ReturnType (typename c10::maybe_keep_symint<symint, ParameterTypes>::type...)>::call(
+            c10::BoxedKernel::makeFromFunction<fallback_fn>(),
+            op,
+            c10::DispatchKeySet(), // we know that the cpu_fallback doesn't use the dispatch keyset.
+            // TODO: get std::forward<> to work
+            args...
+            );
+    }
+};
+
+template<c10::KernelFunction::BoxedKernelFunction* fallback_fn, class Op>
+using call_fallback_fn_symint = _call_fallback_fn<fallback_fn, Op, true, typename Op::schema>;
+
+template<c10::KernelFunction::BoxedKernelFunction* fallback_fn, class Op>
+using call_fallback_fn = _call_fallback_fn<fallback_fn, Op, false, typename Op::schema>;
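+
+// Registration sketch (illustrative; the wrapper name and the PrivateUse1 key are
+// examples, not something this header provides):
+//
+//   void my_cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
+//     at::native::cpu_fallback(op, stack);
+//   }
+//   TORCH_LIBRARY_IMPL(_, PrivateUse1, m) {
+//     m.fallback(torch::CppFunction::makeFromBoxedFunction<&my_cpu_fallback>());
+//   }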
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/CanUse32BitIndexMath.h b/MLPY/Lib/site-packages/torch/include/ATen/native/CanUse32BitIndexMath.h
new file mode 100644
index 0000000000000000000000000000000000000000..983ff7fe26e332a979ece32d42889081e6c56fcf
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/CanUse32BitIndexMath.h
@@ -0,0 +1,13 @@
+#pragma once
+#include <c10/macros/Export.h>
+#include <limits>
+
+namespace at {
+class TensorBase;
+}
+
+namespace at::native {
+
+TORCH_API bool canUse32BitIndexMath(const at::TensorBase &t, int64_t max_elem=std::numeric_limits<int64_t>::max());
+
+}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/ComplexHelper.h b/MLPY/Lib/site-packages/torch/include/ATen/native/ComplexHelper.h
new file mode 100644
index 0000000000000000000000000000000000000000..f5e468e2be88bfdb00818a191d489bf06dab0bb9
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/ComplexHelper.h
@@ -0,0 +1,97 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+#include <c10/util/irange.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#else
+#include <ATen/ops/view_as_real_native.h>
+#include <ATen/ops/view_as_complex_native.h>
+
+#include <utility>
+#endif
+
+// WARNING: this header contains non-inline functions and should be only
+// included from ONE cpp file
+
+namespace at::native {
+
+// View tensor with new dtype, storage offset, sizes and strides
+inline Tensor view_tensor(
+    const Tensor &tensor, ScalarType dtype,
+    c10::SymInt offset, SymIntArrayRef sizes, SymIntArrayRef strides) {
+  Storage storage = tensor.storage();
+  auto key_set = tensor.key_set().remove(DispatchKey::Conjugate);
+  auto new_tensor = detail::make_tensor<TensorImpl>(
+      c10::TensorImpl::VIEW, std::move(storage), key_set, scalarTypeToTypeMeta(dtype));
+  auto * impl = new_tensor.unsafeGetTensorImpl();
+  impl->set_sizes_and_strides(sizes, strides, offset);
+  return new_tensor;
+}
+
+inline SymDimVector computeStrideForViewAsReal(SymIntArrayRef oldstride) {
+  SymDimVector res(oldstride.size() + 1);
+  for (const auto i : c10::irange(oldstride.size())) {
+    res[i] = oldstride[i] * 2;
+  }
+  res.back() = 1;
+  return res;
+}
+
+inline Tensor _view_as_real_physical(const Tensor& self) {
+  TORCH_CHECK(self.is_complex(), "view_as_real is only supported for complex tensors");
+  auto old_sizes = self.sym_sizes();
+  SymDimVector new_sizes(old_sizes.size() + 1);
+  std::copy(old_sizes.begin(), old_sizes.end(), new_sizes.begin());
+  // last dimension will always have two elements containing the real and imag vals
+  new_sizes.back() = 2;
+  auto new_strides = computeStrideForViewAsReal(self.sym_strides());
+  auto new_storage_offset = self.sym_storage_offset() * 2;
+  const auto float_type = c10::toRealValueType(self.scalar_type());
+  auto real_tensor = view_tensor(self, float_type, std::move(new_storage_offset), new_sizes, new_strides);
+  return real_tensor;
+}
+
+// expects as input a complex tensor and returns back a tensor
+// with corresponding real dtype containing the complex values
+// in the last two dimensions
+Tensor view_as_real(const Tensor& self) {
+  TORCH_CHECK(!self.is_conj(), "view_as_real doesn't work on unresolved conjugated tensors.  To resolve the conjugate tensor so you can view it as real, use self.resolve_conj(); however, be warned that the resulting tensor will NOT alias the original.");
+  return _view_as_real_physical(self);
+}
+
+inline SymDimVector computeStrideForViewAsComplex(SymIntArrayRef oldstride) {
+  const int64_t dim = oldstride.size();
+  TORCH_CHECK(oldstride[dim-1] == 1, "Tensor must have a last dimension with stride 1");
+
+  SymDimVector res(dim - 1);
+  for (const auto i : c10::irange(res.size())) {
+    TORCH_CHECK(oldstride[i] % 2 == 0, "Tensor must have a stride divisible by 2 for all but last dimension");
+    res[i] = oldstride[i] / 2;
+  }
+  return res;
+}
+
+// expects as input a float or double tensor with last dimension of size 2
+// and returns back a tensor with corresponding complex dtype
+Tensor view_as_complex(const Tensor& self) {
+  TORCH_CHECK(
+    self.scalar_type() == kFloat || self.scalar_type() == kDouble || self.scalar_type() == kHalf,
+    "view_as_complex is only supported for half, float and double tensors, but got a tensor of scalar type: ", self.scalar_type());
+
+  auto old_sizes = self.sym_sizes();
+  TORCH_CHECK(!old_sizes.empty(), "Input tensor must have one or more dimensions");
+  TORCH_CHECK(old_sizes[old_sizes.size()-1] == 2, "Tensor must have a last dimension of size 2");
+  SymDimVector new_sizes(old_sizes.begin(), old_sizes.end() - 1);
+
+  const auto new_strides = computeStrideForViewAsComplex(self.sym_strides());
+  const auto complex_type = c10::toComplexType(self.scalar_type());
+
+  TORCH_CHECK(self.sym_storage_offset() % 2 == 0, "Tensor must have a storage_offset divisible by 2");
+  const auto new_storage_offset = self.sym_storage_offset() / 2;
+
+  return view_tensor(self, complex_type, new_storage_offset, new_sizes, new_strides);
+}
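+
+// Example (illustrative): a float tensor of shape (N, 2) with strides (2, 1) and
+// an even storage offset is viewed as a complex64 tensor of shape (N,) with
+// stride (1,). view_as_real performs the inverse mapping, appending a size-2
+// trailing dimension holding the (real, imag) pairs.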
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/CompositeRandomAccessor.h b/MLPY/Lib/site-packages/torch/include/ATen/native/CompositeRandomAccessor.h
new file mode 100644
index 0000000000000000000000000000000000000000..27aefd57376f4468da7f628cec608c4e4837c4b5
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/CompositeRandomAccessor.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include <ATen/native/CompositeRandomAccessorCommon.h>
+
+namespace at::native {
+
+struct TupleInfoCPU {
+  template <typename... Types>
+  using tuple = std::tuple<Types...>;
+
+  template <typename... Types>
+  static constexpr auto tie(Types&... args) noexcept {
+    return std::tie(args...);
+  }
+};
+
+template <typename KeyAccessor, typename ValueAccessor>
+using CompositeRandomAccessorCPU =
+  CompositeRandomAccessor<KeyAccessor, ValueAccessor, TupleInfoCPU>;
+
+template <typename Values, typename References>
+void swap(
+  references_holder<Values, References> rh1,
+  references_holder<Values, References> rh2
+) {
+  return std::swap(rh1.data(), rh2.data());
+}
+
+template <int N, typename Values, typename References>
+auto get(references_holder<Values, References> rh) -> decltype(std::get<N>(rh.data())) {
+  return std::get<N>(rh.data());
+}
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/CompositeRandomAccessorCommon.h b/MLPY/Lib/site-packages/torch/include/ATen/native/CompositeRandomAccessorCommon.h
new file mode 100644
index 0000000000000000000000000000000000000000..5db76a15575c4542004ef6fbedbdc3fb73d1f3fb
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/CompositeRandomAccessorCommon.h
@@ -0,0 +1,263 @@
+#include <utility>
+
+#pragma once
+
+namespace at::native {
+
+namespace {
+
+// operator_brackets_proxy is used in
+// CompositeRandomAccessor in place of operator[].
+// For some iterators, references returned by operator[]
+// could become invalid, operator_brackets_proxy tries to
+// resolve that by making accessor[n] to be equivalent to
+// *(accessor + n).
+template <typename Accessor>
+class operator_brackets_proxy {
+  using reference = typename std::iterator_traits<Accessor>::reference;
+  using value_type = typename std::iterator_traits<Accessor>::value_type;
+
+public:
+  C10_HOST_DEVICE
+  operator_brackets_proxy(Accessor const& accessor)
+    : accessor(accessor)
+  {}
+
+  C10_HOST_DEVICE
+  operator reference() {
+    return *accessor;
+  }
+
+  C10_HOST_DEVICE
+  reference operator*() {
+    return *accessor;
+  }
+
+  C10_HOST_DEVICE
+  operator_brackets_proxy& operator=(value_type const& val) {
+    *accessor = val;
+    return *this;
+  }
+
+private:
+  Accessor accessor;
+};
+
+}
+
+// references_holder is used as a surrogate for the
+// references type from std::iterator_traits in CompositeRandomAccessor.
+// It is assumed in CompositeRandomAccessor that
+// References = tuple<Types&...>,
+// Values = tuple<Types...> by default,
+// but they could be anything as long as References could be
+// cast to Values.
+// If you plan to use it with STL, for example, you will need to
+// define 'swap` and `get`(aka std::get) methods.
+template <typename Values, typename References>
+class references_holder {
+public:
+  using values = Values;
+  using references = References;
+
+  C10_HOST_DEVICE
+  references_holder(references refs)
+    : refs{std::move(refs)}
+  {}
+
+  C10_HOST_DEVICE
+  operator references() {
+    return refs;
+  }
+
+  C10_HOST_DEVICE
+  operator values() {
+    return refs;
+  }
+
+  C10_HOST_DEVICE
+  references_holder& operator=(values vals) {
+    refs = vals;
+    return *this;
+  }
+
+  C10_HOST_DEVICE
+  references& data() {
+    return refs;
+  }
+
+protected:
+  references refs;
+};
+
+// CompositeRandomAccessor is essentially a simplified version of
+// a random access iterator over two random access iterators.
+// TupleInfo should contain a variadic type `tuple`, and a method `tie`,
+// which constructs a tuple of references from a variadic list of arguments.
+template <typename KeyAccessor, typename ValueAccessor, typename TupleInfo>
+class CompositeRandomAccessor {
+  using self_type = CompositeRandomAccessor;
+
+  using key_accessor_value_type =
+    typename std::iterator_traits<KeyAccessor>::value_type;
+  using value_accessor_value_type =
+    typename std::iterator_traits<ValueAccessor>::value_type;
+  using key_accessor_reference_type =
+    typename std::iterator_traits<KeyAccessor>::reference;
+  using value_accessor_reference_type =
+    typename std::iterator_traits<ValueAccessor>::reference;
+
+  using composite_value_type = typename TupleInfo::template tuple<
+    key_accessor_value_type,
+    value_accessor_value_type>;
+  using composite_reference = typename TupleInfo::template tuple<
+    key_accessor_reference_type,
+    value_accessor_reference_type>;
+
+public:
+  using value_type = composite_value_type;
+  using reference = references_holder<composite_value_type, composite_reference>;
+  // Note that CompositeRandomAccessor does not hold key and values
+  // in a specific datastructure, which means that a pointer to a (key, value)
+  // is not defined. Hence we just use a pointer type of the KeyAccessor.
+  using pointer = typename std::iterator_traits<KeyAccessor>::pointer;
+  using difference_type = typename std::iterator_traits<KeyAccessor>::difference_type;
+  using iterator_category = std::random_access_iterator_tag;
+
+  C10_HOST_DEVICE
+  CompositeRandomAccessor() = default;
+
+  C10_HOST_DEVICE
+  CompositeRandomAccessor(KeyAccessor keys, ValueAccessor values)
+    : keys(keys), values(values)
+  {}
+
+  // Pointer-like operations {
+  C10_HOST_DEVICE
+  reference operator*() const {
+    return TupleInfo::tie(*keys, *values);
+  }
+
+  // operator->() is supposed to return a pointer type.
+  // Since CompositeRandomAccessor does not hold pointers to pairs,
+  // we just return a pointer to a key.
+  C10_HOST_DEVICE
+  auto* operator->() const {
+    return keys.operator->();
+  }
+
+  C10_HOST_DEVICE
+  reference operator[](difference_type idx) {
+    return operator_brackets_proxy<self_type>(
+      CompositeRandomAccessor(keys + idx, values + idx)
+    );
+  }
+  // }
+
+  // Prefix/postfix increment/decrement {
+  C10_HOST_DEVICE
+  CompositeRandomAccessor& operator++() {
+    ++keys;
+    ++values;
+    return *this;
+  }
+
+  C10_HOST_DEVICE
+  CompositeRandomAccessor operator++(int) {
+    CompositeRandomAccessor copy(*this);
+    ++*this;
+    return copy;
+  }
+
+  C10_HOST_DEVICE
+  CompositeRandomAccessor& operator--() {
+    --keys;
+    --values;
+    return *this;
+  }
+
+  C10_HOST_DEVICE
+  CompositeRandomAccessor operator--(int) {
+    CompositeRandomAccessor copy(*this);
+    --*this;
+    return copy;
+  }
+  // }
+
+  // Arithmetic operations {
+  C10_HOST_DEVICE
+  CompositeRandomAccessor& operator+=(difference_type offset) {
+    keys += offset;
+    values += offset;
+    return *this;
+  }
+
+  C10_HOST_DEVICE
+  CompositeRandomAccessor operator+(difference_type offset) const {
+    return CompositeRandomAccessor(keys + offset, values + offset);
+  }
+
+  C10_HOST_DEVICE
+  friend CompositeRandomAccessor operator+(
+    difference_type offset,
+    const CompositeRandomAccessor& accessor
+  ) {
+    return accessor + offset;
+  }
+
+  C10_HOST_DEVICE
+  CompositeRandomAccessor& operator-=(difference_type offset) {
+    keys -= offset;
+    values -= offset;
+    return *this;
+  }
+
+  C10_HOST_DEVICE
+  CompositeRandomAccessor operator-(difference_type offset) const {
+    return CompositeRandomAccessor(keys - offset, values - offset);
+  }
+
+  C10_HOST_DEVICE
+  difference_type operator-(const CompositeRandomAccessor& other) const {
+    return keys - other.keys;
+  }
+  // }
+
+  // Comparison operators {
+  C10_HOST_DEVICE
+  bool operator==(const CompositeRandomAccessor& other) const {
+    return keys == other.keys;
+  }
+
+  C10_HOST_DEVICE
+  bool operator!=(const CompositeRandomAccessor& other) const {
+    return keys != other.keys;
+  }
+
+  C10_HOST_DEVICE
+  bool operator<(const CompositeRandomAccessor& other) const {
+    return keys < other.keys;
+  }
+
+  C10_HOST_DEVICE
+  bool operator<=(const CompositeRandomAccessor& other) const {
+    return keys <= other.keys;
+  }
+
+  C10_HOST_DEVICE
+  bool operator>(const CompositeRandomAccessor& other) const {
+    return keys > other.keys;
+  }
+
+  C10_HOST_DEVICE
+  bool operator>=(const CompositeRandomAccessor& other) const {
+    return keys >= other.keys;
+  }
+  // }
+
+protected:
+  KeyAccessor keys;
+  ValueAccessor values;
+};
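+
+// Usage sketch (illustrative; CompositeRandomAccessorCPU and the swap/get helpers
+// it relies on live in CompositeRandomAccessor.h): sorting two parallel arrays in
+// lockstep by key.
+//
+//   int64_t keys[3]   = {3, 1, 2};
+//   double  values[3] = {30.0, 10.0, 20.0};
+//   auto it = CompositeRandomAccessorCPU<int64_t*, double*>(keys, values);
+//   std::sort(it, it + 3,
+//             [](std::tuple<int64_t, double> a, std::tuple<int64_t, double> b) {
+//               return std::get<0>(a) < std::get<0>(b);
+//             });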
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/ConvUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/ConvUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..7784492c004f022390770d494a85f156f97ff69e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/ConvUtils.h
@@ -0,0 +1,446 @@
+#pragma once
+#include <ATen/core/Tensor.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/detail/CUDAHooksInterface.h>
+#include <ATen/native/DispatchStub.h>
+#include <c10/util/env.h>
+#include <c10/util/irange.h>
+
+namespace at::native {
+
+using conv_depthwise2d_backward_fn = std::tuple<at::Tensor, at::Tensor>(*)(
+    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
+    at::IntArrayRef, at::IntArrayRef, std::array<bool, 2>);
+DECLARE_DISPATCH(conv_depthwise2d_backward_fn, conv_depthwise2d_backward_stub);
+using conv_depthwise3d_backward_fn = std::tuple<at::Tensor, at::Tensor, at::Tensor>(*)(
+    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
+    at::IntArrayRef, at::IntArrayRef, std::array<bool, 3>);
+DECLARE_DISPATCH(conv_depthwise3d_backward_fn, conv_depthwise3d_backward_stub);
+using cudnn_convolution_backward_fn = std::tuple<at::Tensor, at::Tensor>(*)(
+    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
+    at::IntArrayRef, int64_t, bool, bool, bool, std::array<bool, 2>);
+DECLARE_DISPATCH(cudnn_convolution_backward_fn, cudnn_convolution_backward_stub);
+using mps_convolution_backward_fn = std::tuple<at::Tensor, at::Tensor, at::Tensor>(*)(
+    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
+    at::IntArrayRef, int64_t, std::array<bool, 3>);
+DECLARE_DISPATCH(mps_convolution_backward_fn, mps_convolution_backward_stub);
+using cudnn_convolution_transpose_backward_fn = std::tuple<at::Tensor, at::Tensor>(*)(
+    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
+    at::IntArrayRef, at::IntArrayRef, int64_t, bool, bool, bool, std::array<bool, 2>);
+DECLARE_DISPATCH(cudnn_convolution_transpose_backward_fn, cudnn_convolution_transpose_backward_stub);
+using miopen_convolution_backward_fn = std::tuple<at::Tensor, at::Tensor, at::Tensor>(*)(
+    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
+    at::IntArrayRef, int64_t, bool, bool, std::array<bool, 3>);
+DECLARE_DISPATCH(miopen_convolution_backward_fn, miopen_convolution_backward_stub);
+using miopen_convolution_transpose_backward_fn = std::tuple<at::Tensor, at::Tensor, at::Tensor>(*)(
+    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
+    at::IntArrayRef, at::IntArrayRef, int64_t, bool, bool, std::array<bool, 3>);
+DECLARE_DISPATCH(miopen_convolution_transpose_backward_fn, miopen_convolution_transpose_backward_stub);
+using miopen_depthwise_convolution_backward_fn = std::tuple<at::Tensor, at::Tensor, at::Tensor>(*)(
+    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
+    at::IntArrayRef, int64_t, bool, bool, std::array<bool, 3>);
+DECLARE_DISPATCH(miopen_depthwise_convolution_backward_fn, miopen_depthwise_convolution_backward_stub);
+using mkldnn_convolution_backward_fn = std::tuple<at::Tensor, at::Tensor, at::Tensor>(*)(
+    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
+    at::IntArrayRef, int64_t, std::array<bool, 3>);
+DECLARE_DISPATCH(mkldnn_convolution_backward_fn, mkldnn_convolution_backward_stub);
+using mkldnn_convolution_transpose_fn = Tensor(*)(const Tensor&, const Tensor&, const c10::optional<Tensor>&,
+    IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef, int64_t);
+DECLARE_DISPATCH(mkldnn_convolution_transpose_fn, mkldnn_convolution_transpose_stub);
+using mkldnn_convolution_transpose_backward_fn = std::tuple<at::Tensor, at::Tensor, at::Tensor>(*)(
+    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
+    at::IntArrayRef, at::IntArrayRef, int64_t, std::array<bool, 3>);
+DECLARE_DISPATCH(mkldnn_convolution_transpose_backward_fn, mkldnn_convolution_transpose_backward_stub);
+using slow_conv_dilated2d_backward_fn = std::tuple<at::Tensor, at::Tensor, at::Tensor>(*)(
+    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
+    at::IntArrayRef, at::IntArrayRef, std::array<bool, 3>);
+DECLARE_DISPATCH(slow_conv_dilated2d_backward_fn, slow_conv_dilated2d_backward_stub);
+using slow_conv_dilated3d_backward_fn = std::tuple<at::Tensor, at::Tensor, at::Tensor>(*)(
+    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
+    at::IntArrayRef, at::IntArrayRef, std::array<bool, 3>);
+DECLARE_DISPATCH(slow_conv_dilated3d_backward_fn, slow_conv_dilated3d_backward_stub);
+using slow_conv_transpose2d_backward_fn = std::tuple<at::Tensor, at::Tensor, at::Tensor>(*)(
+    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
+    at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, std::array<bool, 3>);
+DECLARE_DISPATCH(slow_conv_transpose2d_backward_fn, slow_conv_transpose2d_backward_stub);
+using slow_conv_transpose3d_backward_fn = std::tuple<at::Tensor, at::Tensor, at::Tensor>(*)(
+    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
+    at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, std::array<bool, 3>);
+DECLARE_DISPATCH(slow_conv_transpose3d_backward_fn, slow_conv_transpose3d_backward_stub);
+
+namespace {
+  static bool cudnnv8_heuristic_mode_b = c10::utils::check_env("TORCH_CUDNN_USE_HEURISTIC_MODE_B") == true;
+}
+
+static inline bool cudnnv8_enabled_check_debug() {
+  static bool cudnnv8_flag = c10::utils::check_env("TORCH_CUDNN_V8_API_DISABLED") != true;
+  static bool cudnnv8_debug = c10::utils::check_env("TORCH_CUDNN_V8_API_DEBUG") == true;
+  static uint8_t cudnnv8_debugcount = 0;
+  if (cudnnv8_debug == 1 && cudnnv8_debugcount < 10) {
+    TORCH_WARN("TORCH_CUDNN_V8_DEBUG ON, V8 ON: ", cudnnv8_flag, " TORCH_CUDNN_USE_HEURISTIC_MODE B: ", cudnnv8_heuristic_mode_b);
+    cudnnv8_debugcount++;
+  }
+  return cudnnv8_flag == 1;
+}
+
+static inline bool cudnnv8_use_heur_mode_b() {
+  return cudnnv8_heuristic_mode_b;
+}
+
+// Keep in sync with py::enum_<ConvBackend> in Module.cpp
+enum class ConvBackend {
+  CudaDepthwise2d,
+  CudaDepthwise3d,
+  Cudnn,
+  CudnnTranspose,
+  Empty,
+  Miopen,
+  MiopenDepthwise,
+  MiopenTranspose,
+  Mkldnn,
+  MkldnnTranspose,
+  MkldnnEmpty,
+  NnpackSpatial,
+  Overrideable,
+  Slow2d,
+  Slow3d,
+  SlowDilated2d,
+  SlowDilated3d,
+  SlowTranspose2d,
+  SlowTranspose3d,
+  Winograd3x3Depthwise,
+  Xnnpack2d,
+  Mps,
+  MpsTranspose,
+};
+
+// Overload for selecting the convolution backend from the full set of convolution inputs.
+// This overload is exposed to python for testing, etc.
+TORCH_API ConvBackend select_conv_backend(
+    const Tensor& input, const Tensor& weight, const c10::optional<Tensor>& bias_opt,
+    SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef dilation,
+    bool transposed, SymIntArrayRef output_padding, c10::SymInt groups, const at::OptionalSymIntArrayRef bias_sizes_opt);
+
+TORCH_API at::MemoryFormat _determine_backend_memory_format(const Tensor& input,
+    const Tensor& weight,
+    const ConvBackend backend);
+
+// ---------------------------------------------------------------------
+//
+// Math
+//
+// ---------------------------------------------------------------------
+
+constexpr int input_batch_size_dim = 0;  // also grad_input
+constexpr int input_channels_dim = 1;
+constexpr int output_batch_size_dim = 0;  // also grad_output
+constexpr int output_channels_dim = 1;
+constexpr int weight_output_channels_dim = 0;
+constexpr int weight_input_channels_dim = 1;
+
+// Often written as 2 + max_dim (extra dims for batch size and channels)
+constexpr int max_dim = 3;
+
+// ---------------------------------------------------------------------
+//
+// Checking
+//
+// ---------------------------------------------------------------------
+
+// Used on pad, stride and dilation
+static void check_args(CheckedFrom c, IntArrayRef args, size_t expected_size, const char* arg_name)
+{
+  TORCH_CHECK(args.size() <= expected_size,
+           "Too many ", arg_name, " values (", args.size(), ") supplied, expecting ",
+           expected_size, " (while checking arguments for ", c, ")");
+  TORCH_CHECK(args.size() >= expected_size,
+           "Not enough ", arg_name, " values (", args.size(), ") supplied, expecting ",
+           expected_size, " (while checking arguments for ", c, ")");
+
+  auto num_negative_values = std::count_if(args.begin(), args.end(), [](int x){return x < 0;});
+  if (num_negative_values > 0){
+    std::stringstream ss;
+    ss << arg_name << " should be greater than zero but got (";
+    std::copy(args.begin(), args.end() - 1, std::ostream_iterator<int>(ss,", "));
+    ss << args.back() <<  ")" << " (while checking arguments for " << c << ")";
+    AT_ERROR(ss.str());
+  }
+}
+
+
+// NOTE [ Convolution checks ]
+//
+// NB: For many call sites, it is not strictly necessary to check all of
+// these relationships (for example, for forward convolution, we compute
+// the size of output ourselves, so we don't actually need to check
+// output.  However, writing a single function that does everything
+// means we get to reuse it for both forwards and all backwards
+// variants, even when the set of "real" inputs varies.  The magic of
+// relational computing!
+//
+// (There is one downside, which is that it is slightly harder to write
+// error messages which are able to distinguish between real inputs
+// (which the user can change) and computed inputs (which the user can
+// only indirectly affect).  It would be an interesting exercise to
+// come up with a general framework to handle such situations.)
+static void convolution_shape_check(
+    CheckedFrom c,
+    const TensorGeometryArg& input, const TensorGeometryArg& weight, const TensorGeometryArg& output,
+    IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups)
+{
+  check_args(c, padding, input->dim() - 2, "padding");
+  check_args(c, stride, padding.size(), "stride");
+  check_args(c, dilation, padding.size(), "dilation");
+
+  // Input
+  checkDimRange(c, input, 3, 6 /* exclusive */);
+  checkSize_symint(c, input, input_channels_dim, weight->size(1) * groups);
+
+  // Weight
+  checkSameDim(c, input, weight);
+
+  // TODO: check that output->size() matches output_sizes
+  // TODO: check that weight matches output->sizes()
+  checkSameDim(c, input, output);
+}
+
+// NB: conv_output_size and conv_input_size are not bijections,
+// as conv_output_size loses information; this is why conv_input_size
+// takes an extra output_padding argument to resolve the ambiguity.
+
+template <typename T>
+static inline std::vector<T> _conv_output_size(
+    ArrayRef<T> input_size, ArrayRef<T> weight_size,
+    ArrayRef<T> padding, ArrayRef<T> stride, ArrayRef<T> dilation = ArrayRef<T>()
+) {
+  // ASSERT(input_size.size() > 2)
+  // ASSERT(input_size.size() == weight_size.size())
+  bool has_dilation = !dilation.empty();
+  auto dim = input_size.size();
+  std::vector<T> output_size(dim);
+  output_size[0] = input_size[input_batch_size_dim];
+  output_size[1] = weight_size[weight_output_channels_dim];
+  for (const auto d : c10::irange(2, dim)) {
+    auto dilation_ = has_dilation ? dilation[d - 2] : 1;
+    auto kernel = dilation_ * (weight_size[d] - 1) + 1;
+    output_size[d] = (input_size[d] + (2 * padding[d - 2]) - kernel) / stride[d - 2] + 1;
+  }
+  return output_size;
+}
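+
+// Worked example (illustrative): a 224x224 input, 3x3 weight, padding 1,
+// stride 2, dilation 1 gives, per spatial dim,
+//   kernel = 1 * (3 - 1) + 1 = 3
+//   output = (224 + 2*1 - 3) / 2 + 1 = 112,
+// i.e. the familiar 224 -> 112 halving produced by a strided 3x3 convolution.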
+
+static inline std::vector<int64_t> conv_output_size(
+    IntArrayRef input_size, IntArrayRef weight_size,
+    IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation = IntArrayRef()
+) {
+  return _conv_output_size(input_size, weight_size, padding, stride, dilation);
+}
+
+static inline std::vector<c10::SymInt> conv_output_size(
+    SymIntArrayRef input_size, SymIntArrayRef weight_size,
+    SymIntArrayRef padding, SymIntArrayRef stride, SymIntArrayRef dilation = SymIntArrayRef()
+) {
+  return _conv_output_size(input_size, weight_size, padding, stride, dilation);
+}
+
+template <typename T>
+std::vector<T> _conv_input_size(
+    ArrayRef<T> output_size, ArrayRef<T> weight_size,
+    ArrayRef<T> padding, ArrayRef<T> output_padding, ArrayRef<T> stride, ArrayRef<T> dilation, T groups
+) {
+  // ASSERT(output_size.size() > 2)
+  // ASSERT(output_size.size() == weight_size.size())
+  auto dim = output_size.size();
+  std::vector<T> input_size(dim);
+  input_size[0] = output_size[output_batch_size_dim];
+  input_size[1] = weight_size[weight_input_channels_dim] * groups;
+  for (const auto d : c10::irange(2, dim)) {
+    auto kernel = (weight_size[d] - 1) * dilation[d - 2] + 1;
+    input_size[d] = (output_size[d] - 1) * stride[d - 2] - (padding[d - 2] * 2) +
+                     kernel + output_padding[d - 2];
+  }
+  return input_size;
+}
+
+static inline std::vector<c10::SymInt> conv_input_size(
+    SymIntArrayRef output_size, SymIntArrayRef weight_size,
+    SymIntArrayRef padding, SymIntArrayRef output_padding, SymIntArrayRef stride, SymIntArrayRef dilation, c10::SymInt groups
+) {
+  return _conv_input_size(output_size, weight_size, padding, output_padding, stride, dilation, groups);
+}
+
+static inline std::vector<int64_t> conv_input_size(
+    IntArrayRef output_size, IntArrayRef weight_size,
+    IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
+) {
+  return _conv_input_size(output_size, weight_size, padding, output_padding, stride, dilation, groups);
+}
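+
+// Illustrative note (not part of the original ATen header) on the ambiguity that
+// output_padding resolves: with kernel = 3, padding = 0, stride = 2, dilation = 1,
+// both input_size[d] = 7 and input_size[d] = 8 give conv_output_size[d] = 3, since
+//   (7 - 3) / 2 + 1 == (8 - 3) / 2 + 1 == 3   (integer division).
+// conv_input_size with output_size[d] = 3 then reconstructs
+//   (3 - 1) * 2 - 0 + 3 + output_padding[d]  ->  7 when output_padding[d] = 0,
+//                                                8 when output_padding[d] = 1.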
+
+template <typename T>
+std::vector<T> _conv_weight_size(
+    ArrayRef<T> input_size, ArrayRef<T> output_size,
+    ArrayRef<T> padding, ArrayRef<T> output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
+) {
+  auto dim = input_size.size();
+  std::vector<T> weight_size(dim);
+  weight_size[0] = output_size[1];
+  weight_size[1] = input_size[1] / groups;
+  for (const auto d : c10::irange(2, dim)) {
+    auto kernel = input_size[d] - (output_size[d] - 1) * stride[d - 2]
+               + padding[d - 2] * 2 - output_padding[d - 2];
+    weight_size[d] = (kernel - 1) / dilation[d - 2] + 1;
+  }
+  return weight_size;
+}
+
+static inline std::vector<c10::SymInt> conv_weight_size(
+    SymIntArrayRef input_size, SymIntArrayRef output_size,
+    SymIntArrayRef padding, SymIntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
+) {
+  return _conv_weight_size(input_size, output_size, padding, output_padding, stride, dilation, groups);
+}
+
+static inline std::vector<int64_t> conv_weight_size(
+    IntArrayRef input_size, IntArrayRef output_size,
+    IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
+) {
+  return _conv_weight_size(input_size, output_size, padding, output_padding, stride, dilation, groups);
+}
+
+static inline Tensor reshape_bias(int64_t dim, const Tensor& bias) {
+  std::vector<int64_t> shape(dim, 1);
+  shape[1] = -1;
+  return bias.reshape(shape);
+}
+
+static inline at::MemoryFormat cudnn_conv_suggest_memory_format(const at::Tensor& input, const at::Tensor& weight) {
+  // disable NHWC for float64 input.
+  if (!at::detail::getCUDAHooks().compiledWithCuDNN() ||
+      input.scalar_type() == at::kDouble ||
+      weight.scalar_type() == at::kDouble) {
+    return at::MemoryFormat::Contiguous;
+  }
+  long cudnn_version = at::detail::getCUDAHooks().versionCuDNN();
+  auto input_memory_format = input.suggest_memory_format();
+  auto weight_memory_format = weight.suggest_memory_format();
+  auto weight_ndim = weight.ndimension();
+
+  bool can_use_cudnn_channels_last_2d = (cudnn_version >= 7603) && (weight_ndim == 4) && (
+    (input_memory_format  == at::MemoryFormat::ChannelsLast) ||
+    (weight_memory_format == at::MemoryFormat::ChannelsLast)
+  );
+  if (can_use_cudnn_channels_last_2d) {
+    return at::MemoryFormat::ChannelsLast;
+  }
+
+  bool can_use_cudnn_channels_last_3d = (cudnn_version >= 8005) && (weight_ndim == 5) && (
+    (input_memory_format  == at::MemoryFormat::ChannelsLast3d) ||
+    (weight_memory_format == at::MemoryFormat::ChannelsLast3d)
+  );
+  if (can_use_cudnn_channels_last_3d) {
+    return at::MemoryFormat::ChannelsLast3d;
+  }
+
+  return at::MemoryFormat::Contiguous;
+}
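+
+// Illustrative usage sketch (an assumption, not part of the original ATen header):
+// callers typically materialize both operands in the suggested layout, e.g.
+//
+//   auto fmt      = cudnn_conv_suggest_memory_format(input, weight);
+//   auto input_c  = input.contiguous(fmt);
+//   auto weight_c = weight.contiguous(fmt);
+//
+// so a float32 4-d channels-last input on cuDNN >= 7.6.3 keeps ChannelsLast,
+// while any float64 input falls back to MemoryFormat::Contiguous.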
+
+// controls whether emptyCache will be called following cudnn conv benchmarking
+TORCH_API void _cudnn_set_conv_benchmark_empty_cache(bool enable);
+TORCH_API bool _cudnn_get_conv_benchmark_empty_cache();
+
+
+static inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) {
+
+  // disable NHWC for float64 input.
+  if (!at::detail::getCUDAHooks().compiledWithMIOpen() ||
+      input.scalar_type() == at::kDouble ||
+      weight.scalar_type() == at::kDouble) {
+    return false;
+  }
+
+  bool can_use_miopen_channels_last_2d = false;
+#if defined(USE_ROCM) && (ROCM_VERSION >= 40300)
+  // TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC once ROCm officially supports NHWC in MIOpen
+  // See #64427
+  static c10::optional<bool> PYTORCH_MIOPEN_SUGGEST_NHWC = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC");
+
+  auto input_memory_format = input.suggest_memory_format();
+  auto weight_memory_format = weight.suggest_memory_format();
+
+  can_use_miopen_channels_last_2d = PYTORCH_MIOPEN_SUGGEST_NHWC &&  *PYTORCH_MIOPEN_SUGGEST_NHWC && (
+            ( (input_memory_format  == at::MemoryFormat::ChannelsLast) ||
+            (weight_memory_format == at::MemoryFormat::ChannelsLast) )
+        );
+#endif
+
+  bool can_use_miopen_channels_last_3d = false;
+
+  return can_use_miopen_channels_last_2d || can_use_miopen_channels_last_3d;
+}
+
+static inline bool mkldnn_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) {
+
+  // disable NHWC for float64 input.
+  if (input.scalar_type() == at::kDouble ||
+      weight.scalar_type() == at::kDouble) {
+    return false;
+  }
+
+  // disable NHWC for MkldnnCPU tensor.
+  if (input.is_mkldnn() || weight.is_mkldnn()) {
+    return false;
+  }
+
+  auto input_memory_format = input.suggest_memory_format();
+  auto weight_memory_format = weight.suggest_memory_format();
+
+  bool can_use_mkldnn_channels_last_2d =
+      (input_memory_format  == at::MemoryFormat::ChannelsLast) ||
+      (weight_memory_format == at::MemoryFormat::ChannelsLast);
+
+  bool can_use_mkldnn_channels_last_3d =
+      (input_memory_format  == at::MemoryFormat::ChannelsLast3d) ||
+      (weight_memory_format == at::MemoryFormat::ChannelsLast3d);
+
+  return can_use_mkldnn_channels_last_2d || can_use_mkldnn_channels_last_3d;
+}
+
+static inline bool thnn_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) {
+
+  auto input_memory_format = input.suggest_memory_format();
+  auto weight_memory_format = weight.suggest_memory_format();
+
+  bool can_use_thnn_channels_last_2d = input.device().is_cpu() && (
+      (input_memory_format  == at::MemoryFormat::ChannelsLast) || (
+       weight_memory_format == at::MemoryFormat::ChannelsLast));
+
+  return can_use_thnn_channels_last_2d;
+}
+
+static inline bool xpu_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) {
+
+  // check layout only for xpu tensor.
+  if (!input.is_xpu() || !weight.is_xpu()) {
+    return false;
+  }
+
+  // disable NHWC for float64 input.
+  if (input.scalar_type() == at::kDouble ||
+      weight.scalar_type() == at::kDouble) {
+    return false;
+  }
+
+  auto input_memory_format = input.suggest_memory_format();
+  auto weight_memory_format = weight.suggest_memory_format();
+
+  bool can_use_xpu_channels_last_2d =
+      (input_memory_format  == at::MemoryFormat::ChannelsLast) ||
+      (weight_memory_format == at::MemoryFormat::ChannelsLast);
+
+  bool can_use_xpu_channels_last_3d =
+      (input_memory_format  == at::MemoryFormat::ChannelsLast3d) ||
+      (weight_memory_format == at::MemoryFormat::ChannelsLast3d);
+
+  return can_use_xpu_channels_last_2d || can_use_xpu_channels_last_3d;
+}
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/ConvolutionMM3d.h b/MLPY/Lib/site-packages/torch/include/ATen/native/ConvolutionMM3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..6db6f69d96a67c04ef0e689d88c1cf40392d9e18
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/ConvolutionMM3d.h
@@ -0,0 +1,14 @@
+#include <ATen/core/Tensor.h>
+
+namespace at::native {
+
+std::tuple<Tensor, Tensor, Tensor> slow_conv3d_backward_cpu(
+    const Tensor& grad_output,
+    const Tensor& self,
+    const Tensor& weight,
+    IntArrayRef kernel_size,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    std::array<bool, 3> output_mask);
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/Copy.h b/MLPY/Lib/site-packages/torch/include/ATen/native/Copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..200ea0e1d96cc02e7de2e29ea7217f7f1cd0aea0
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/Copy.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include <ATen/native/DispatchStub.h>
+
+namespace at {
+
+class Tensor;
+struct TensorIterator;
+class TensorBase;
+
+namespace native {
+
+using copy_fn = void (*)(TensorIterator&, bool non_blocking);
+
+DECLARE_DISPATCH(copy_fn, copy_stub);
+
+TORCH_API void copy_ignoring_overlaps(const TensorBase &dst, const TensorBase &src);
+
+} // namespace native
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/Cross.h b/MLPY/Lib/site-packages/torch/include/ATen/native/Cross.h
new file mode 100644
index 0000000000000000000000000000000000000000..a2bf16e6cd3ad275ee224910480263418c3b91d0
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/Cross.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <ATen/native/DispatchStub.h>
+
+namespace at {
+class Tensor;
+
+namespace native {
+
+using cross_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const int64_t d);
+
+DECLARE_DISPATCH(cross_fn, cross_stub);
+
+}} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/DilatedConvolutionUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/DilatedConvolutionUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..687700b8d2a133d4355a65495a44e2bf534ae0bd
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/DilatedConvolutionUtils.h
@@ -0,0 +1,229 @@
+#pragma once
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+
+#define TORCH_CHECK_DIM_SIZE(T, DIM, DIM_SIZE, SIZE) \
+  TORCH_CHECK(                                       \
+      T.dim() == DIM && T.size(DIM_SIZE) == SIZE,    \
+      "Need " #T " of dimension ",                   \
+      DIM,                                           \
+      " and " #T ".size[",                           \
+      DIM_SIZE,                                      \
+      "] == ",                                       \
+      SIZE,                                          \
+      " but got input to be of shape ",              \
+      T.sizes())
+
+namespace at::native::internal {
+namespace {
+inline bool all_positive(IntArrayRef& arr) {
+  return std::all_of(
+      arr.begin(), arr.end(), [](int64_t item) { return item > 0; });
+}
+
+inline bool all_nonnegative(std::vector<int64_t>& arr) {
+  return std::all_of(
+      arr.begin(), arr.end(), [](int64_t item) { return item >= 0; });
+}
+
+} // namespace
+
+// calculate the rear part of output tensor sizes
+template <int64_t dim>
+std::vector<int64_t> get_output_size(
+    const Tensor& input,
+    IntArrayRef kernel_size,
+    IntArrayRef stride_size,
+    IntArrayRef pad_size,
+    IntArrayRef dilation_size) {
+  std::vector<int64_t> sizes;
+  for (const auto index : c10::irange(dim)) {
+    sizes.push_back(
+        div_rtn<int64_t>(
+            input.size(index + input.dim() - dim) + 2 * pad_size[index] -
+                (dilation_size[index] * (kernel_size[index] - 1) + 1),
+            stride_size[index]) +
+        1);
+  }
+  return sizes;
+}
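+
+// Illustrative example (not part of the original ATen header) of the formula above,
+// assuming one spatial dim of extent 10 with kernel_size = {3}, stride_size = {2},
+// pad_size = {1} and dilation_size = {2}:
+//   effective kernel = 2 * (3 - 1) + 1 = 5
+//   sizes[0] = div_rtn<int64_t>(10 + 2*1 - 5, 2) + 1 = 3 + 1 = 4
+// div_rtn rounds toward negative infinity, which only matters once the numerator
+// can go negative (i.e. when the padded input is smaller than the dilated kernel).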
+
+// calculate the sizes of output tensor
+template <int64_t dim>
+std::vector<int64_t> get_output_size(
+    const Tensor& input,
+    const Tensor& weight,
+    IntArrayRef kernel_size,
+    IntArrayRef stride_size,
+    IntArrayRef pad_size,
+    IntArrayRef dilation_size) {
+  auto output_size = get_output_size<dim>(
+      input, kernel_size, stride_size, pad_size, dilation_size);
+  output_size.insert(output_size.begin(), weight.size(0));
+  if (input.dim() == dim + 2) {
+    output_size.insert(output_size.begin(), input.size(0));
+  }
+  return output_size;
+}
+/*
+  slow_conv_dilated_shape_check - check user-input to dilated convolution
+  forward and backward functions.
+*/
+template <int64_t dim>
+void slow_conv_dilated_shape_check(
+    const Tensor& input,
+    const Tensor& weight,
+    const Tensor& bias,
+    const Tensor& grad_output,
+    IntArrayRef kernel_size,
+    IntArrayRef stride_size,
+    IntArrayRef pad_size,
+    IntArrayRef dilation_size) {
+  /*
+    When the following tensors are defined:
+
+    bias, grad_weight, grad_output
+
+    then these are assumed to be contiguous without checking
+    because these tensors are made contiguous by calling
+    .contiguous() method or by resizing of zero-sized tensors in
+    forward/backward functions.
+
+    When grad_weight is defined then it is assumed without
+    checking to have the same shape as weight, see backward
+    functions.
+   */
+  // Check size arguments
+  TORCH_CHECK(
+      kernel_size.size() == dim,
+      "kernel sizes length should be ",
+      dim,
+      ", but got ",
+      kernel_size.size());
+  TORCH_CHECK(
+      stride_size.size() == dim,
+      "strides length should be ",
+      dim,
+      ", but got ",
+      stride_size.size());
+  TORCH_CHECK(
+      dilation_size.size() == dim,
+      "dilations length should be ",
+      dim,
+      ", but got ",
+      dilation_size.size());
+  TORCH_CHECK(
+      pad_size.size() == dim,
+      "pads length should be ",
+      dim,
+      ", but got ",
+      pad_size.size());
+
+  TORCH_CHECK(
+      all_positive(kernel_size),
+      "kernel size should be greater than zero, but got ",
+      kernel_size);
+  TORCH_CHECK(
+      all_positive(stride_size),
+      "stride should be greater than zero, but got ",
+      stride_size);
+  TORCH_CHECK(
+      all_positive(dilation_size),
+      "dilation should be greater than zero, but got ",
+      dilation_size);
+
+  // check input
+  TORCH_CHECK(input.defined(), "input must be defined");
+  bool is_batch = input.dim() == dim + 2;
+  int64_t n = (is_batch ? 2 : 1);
+  int64_t ndim = n + dim;
+  if (!is_batch) {
+    // input dim has to be dim + 1 if not batched
+    TORCH_CHECK(
+        input.dim() == dim + 1,
+        "input must be 4D or 5D tensor but got ",
+        input.dim(),
+        "D tensor");
+  }
+
+  // check output sizes
+  auto output_size = get_output_size<dim>(
+      input, kernel_size, stride_size, pad_size, dilation_size);
+
+  TORCH_CHECK(
+      all_nonnegative(output_size),
+      "calculated output size ",
+      output_size,
+      " is too small (all sizes must be non-negative)");
+
+  // check weight
+  TORCH_CHECK(weight.defined(), "weight must be defined");
+  TORCH_CHECK(
+      weight.dim() == dim + 2,
+      "weight must be ",
+      dim + 2,
+      "D tensor but got ",
+      weight.dim(),
+      "D tensor dim=",
+      dim);
+  TORCH_CHECK(
+      weight.sizes().slice(2) == kernel_size,
+      "weight[2:] shape ",
+      weight.sizes().slice(2),
+      " must be equal to kernel_size ",
+      kernel_size);
+
+  TORCH_CHECK_DIM_SIZE(input, input.dim(), (is_batch ? 1 : 0), weight.size(1));
+
+  // check bias when present
+  if (bias.defined()) {
+    TORCH_CHECK(
+        bias.dim() == 1,
+        "bias must be 1D tensor but got ",
+        bias.dim(),
+        "D tensor");
+    TORCH_CHECK_DIM_SIZE(bias, 1, 0, weight.size(0));
+  }
+
+  // check grad_output when present
+  if (grad_output.defined()) {
+    TORCH_CHECK(
+        grad_output.dim() == ndim,
+        "grad_output must be ",
+        ndim,
+        "D tensor but got ",
+        grad_output.dim(),
+        "D tensor");
+    if (is_batch) {
+      TORCH_CHECK(
+          grad_output.size(0) == input.size(0),
+          "grad_output.size(0)=",
+          grad_output.size(0),
+          " must be input.size(0)=",
+          input.size(0));
+    }
+    TORCH_CHECK(
+        grad_output.size(n - 1) == weight.size(0),
+        "grad_output.size(",
+        n - 1,
+        ")=",
+        grad_output.size(n - 1),
+        " must be weight.size(0)=",
+        weight.size(0));
+    TORCH_CHECK(
+        grad_output.sizes().slice(n) == output_size,
+        "grad_output[",
+        n,
+        ":] shape",
+        grad_output.sizes().slice(n),
+        " must be equal to output size ",
+        output_size);
+  }
+}
+
+} // namespace at::native::internal
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/DispatchStub.h b/MLPY/Lib/site-packages/torch/include/ATen/native/DispatchStub.h
new file mode 100644
index 0000000000000000000000000000000000000000..d474f2ce342c9399f52d9b654033132c0330ecdc
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/DispatchStub.h
@@ -0,0 +1,315 @@
+#pragma once
+
+#include <c10/core/DeviceType.h>
+#include <c10/macros/Macros.h>
+
+#include <atomic>
+#include <utility>
+
+// Implements instruction set specific function dispatch.
+//
+// Kernels that may make use of specialized instruction sets (e.g. AVX2) are
+// compiled multiple times with different compiler flags (e.g. -mavx2). A
+// DispatchStub contains a table of function pointers for a kernel. At runtime,
+// the fastest available kernel is chosen based on the features reported by
+// cpuinfo.
+//
+// Example:
+//
+// In native/MyKernel.h:
+//   using fn_type = void(*)(const Tensor& x);
+//   DECLARE_DISPATCH(fn_type, stub);
+//
+// In native/MyKernel.cpp
+//   DEFINE_DISPATCH(stub);
+//
+// In native/cpu/MyKernel.cpp:
+//   namespace {
+//     // use anonymous namespace so that different cpu versions won't conflict
+//     void kernel(const Tensor& x) { ... }
+//   }
+//   REGISTER_DISPATCH(stub, &kernel);
+//
+// To call:
+//   stub(kCPU, tensor);
+//
+// TODO: CPU instruction set selection should be folded into whatever
+// the main dispatch mechanism is.
+
+// ignore warnings about DispatchStub::DEFAULT, AVX, AVX2 defined elsewhere
+C10_CLANG_DIAGNOSTIC_PUSH()
+C10_CLANG_DIAGNOSTIC_IGNORE("-Wundefined-var-template")
+
+namespace at::native {
+
+enum class CPUCapability {
+  DEFAULT = 0,
+#if defined(HAVE_VSX_CPU_DEFINITION)
+  VSX = 1,
+#elif defined(HAVE_ZVECTOR_CPU_DEFINITION)
+  ZVECTOR = 1,
+#else
+  AVX2 = 1,
+  AVX512 = 2,
+#endif
+  NUM_OPTIONS
+};
+
+CPUCapability get_cpu_capability();
+
+template <typename FnPtr, typename T>
+struct DispatchStub;
+
+/**
+ * The sole purpose of this class is to outline methods that don't need to be
+ * specialized or otherwise inlined and duplicated (by the compiler due to
+ * template expansion), since it causes size bloat if there are a significant
+ * number of specialization of the DispatchStub<> class.
+ */
+struct TORCH_API DispatchStubImpl {
+  void* get_call_ptr(
+    c10::DeviceType device_type
+    , void *DEFAULT
+#ifdef HAVE_AVX512_CPU_DEFINITION
+      , void *AVX512
+#endif
+#ifdef HAVE_AVX2_CPU_DEFINITION
+      , void *AVX2
+#endif
+#ifdef HAVE_VSX_CPU_DEFINITION
+      , void *VSX
+#endif
+#ifdef HAVE_ZVECTOR_CPU_DEFINITION
+      , void *ZVECTOR
+#endif
+  );
+
+  /**
+   * The CPU Dispatch actual method is chosen in decreasing order of preference by
+   * DispatchStubImpl::choose_cpu_impl() in case none is found by
+   * DispatchStubImpl::get_call_ptr() in cpu_dispatch_ptr.
+   */
+  void* choose_cpu_impl(
+    void *DEFAULT
+#ifdef HAVE_AVX512_CPU_DEFINITION
+    , void *AVX512
+#endif
+#ifdef HAVE_AVX2_CPU_DEFINITION
+    , void *AVX2
+#endif
+#ifdef HAVE_VSX_CPU_DEFINITION
+    , void *VSX
+#endif
+#ifdef HAVE_ZVECTOR_CPU_DEFINITION
+    , void *ZVECTOR
+#endif
+  );
+
+  // Fixing dispatch error in Windows debug builds.
+  // See https://github.com/pytorch/pytorch/issues/22681 for more details.
+  #if defined(_MSC_VER) && defined(_DEBUG)
+    std::atomic<void*> cpu_dispatch_ptr;
+    void* cuda_dispatch_ptr;
+    void* hip_dispatch_ptr;
+    void* mps_dispatch_ptr;
+    void* privateuse1_dispatch_ptr;
+  #else
+    std::atomic<void*> cpu_dispatch_ptr{nullptr};
+    void* cuda_dispatch_ptr = nullptr;
+    void* hip_dispatch_ptr = nullptr;
+    void* mps_dispatch_ptr = nullptr;
+    void* privateuse1_dispatch_ptr = nullptr;
+  #endif
+};
+
+template <typename rT, typename T, typename... Args>
+struct DispatchStub<rT (*) (Args...), T> {
+  using FnPtr = rT (*) (Args...);
+
+  DispatchStub() = default;
+  DispatchStub(const DispatchStub&) = delete;
+  DispatchStub& operator=(const DispatchStub&) = delete;
+
+private:
+  FnPtr get_call_ptr(c10::DeviceType device_type) {
+    return reinterpret_cast<FnPtr>(
+      impl.get_call_ptr(device_type
+      , reinterpret_cast<void*>(DEFAULT)
+#ifdef HAVE_AVX512_CPU_DEFINITION
+      , reinterpret_cast<void*>(AVX512)
+#endif
+#ifdef HAVE_AVX2_CPU_DEFINITION
+      , reinterpret_cast<void*>(AVX2)
+#endif
+#ifdef HAVE_VSX_CPU_DEFINITION
+      , reinterpret_cast<void*>(VSX)
+#endif
+#ifdef HAVE_ZVECTOR_CPU_DEFINITION
+      , reinterpret_cast<void*>(ZVECTOR)
+#endif
+      )
+    );
+  }
+
+public:
+  template <typename... ArgTypes>
+  rT operator()(c10::DeviceType device_type, ArgTypes&&... args) {
+    FnPtr call_ptr = get_call_ptr(device_type);
+    return (*call_ptr)(std::forward<ArgTypes>(args)...);
+  }
+
+  void set_cuda_dispatch_ptr(FnPtr fn_ptr) {
+    impl.cuda_dispatch_ptr = reinterpret_cast<void*>(fn_ptr);
+  }
+
+  void set_hip_dispatch_ptr(FnPtr fn_ptr) {
+    impl.hip_dispatch_ptr = reinterpret_cast<void*>(fn_ptr);
+  }
+
+  void set_mps_dispatch_ptr(FnPtr fn_ptr) {
+    impl.mps_dispatch_ptr = reinterpret_cast<void*>(fn_ptr);
+  }
+
+  void set_privateuse1_dispatch_ptr(FnPtr fn_ptr) {
+    impl.privateuse1_dispatch_ptr = reinterpret_cast<void*>(fn_ptr);
+  }
+
+  static TORCH_API FnPtr DEFAULT;
+#ifdef HAVE_AVX512_CPU_DEFINITION
+  static TORCH_API FnPtr AVX512;
+#endif
+#ifdef HAVE_AVX2_CPU_DEFINITION
+  static TORCH_API FnPtr AVX2;
+#endif
+#ifdef HAVE_VSX_CPU_DEFINITION
+  static TORCH_API FnPtr VSX;
+#endif
+#ifdef HAVE_ZVECTOR_CPU_DEFINITION
+  static TORCH_API FnPtr ZVECTOR;
+#endif
+private:
+  DispatchStubImpl impl;
+};
+
+namespace {
+template <typename DispatchStub>
+struct RegisterCUDADispatch {
+  RegisterCUDADispatch(DispatchStub &stub, typename DispatchStub::FnPtr value) {
+    stub.set_cuda_dispatch_ptr(value);
+  }
+};
+
+template <typename DispatchStub>
+struct RegisterMPSDispatch {
+  RegisterMPSDispatch(DispatchStub &stub, typename DispatchStub::FnPtr value) {
+    stub.set_mps_dispatch_ptr(value);
+  }
+};
+
+template <typename DispatchStub>
+struct RegisterHIPDispatch {
+  RegisterHIPDispatch(DispatchStub &stub, typename DispatchStub::FnPtr value) {
+    // TODO: make this point at hip_dispatch_ptr
+    stub.set_cuda_dispatch_ptr(value);
+  }
+};
+
+template <typename DispatchStub>
+struct RegisterPRIVATEUSE1Dispatch {
+  RegisterPRIVATEUSE1Dispatch(DispatchStub &stub, typename DispatchStub::FnPtr value) {
+    stub.set_privateuse1_dispatch_ptr(value);
+  }
+};
+
+} // anonymous namespace
+// Compiler will complain if you put things like std::tuple<Tensor, Tensor> in
+// the `fn` argument of DECLARE_DISPATCH. Some possible workarounds, e.g.,
+// adding parentheses and using helper struct to get rid of the parentheses, do
+// not work with MSVC. So do a `using`-declaration if you need to pass in such
+// `fn`, e.g., grid_sampler_2d_backward_cpu_kernel in GridSampleKernel.h.
+#define DECLARE_DISPATCH(fn, name)         \
+  struct name : DispatchStub<fn, name> {   \
+    name() = default;                      \
+    name(const name&) = delete;            \
+    name& operator=(const name&) = delete; \
+  };                                       \
+  extern TORCH_API struct name name
+
+#define DEFINE_DISPATCH(name) struct name name
+
+#define REGISTER_ARCH_DISPATCH(name, arch, fn) \
+  template <> name::FnPtr TORCH_API DispatchStub<name::FnPtr, struct name>::arch = fn;
+
+#ifdef HAVE_AVX512_CPU_DEFINITION
+#define REGISTER_AVX512_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, AVX512, fn)
+#else
+#define REGISTER_AVX512_DISPATCH(name, fn)
+#endif
+
+#ifdef HAVE_AVX2_CPU_DEFINITION
+#define REGISTER_AVX2_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, AVX2, fn)
+#else
+#define REGISTER_AVX2_DISPATCH(name, fn)
+#endif
+
+#ifdef HAVE_VSX_CPU_DEFINITION
+#define REGISTER_VSX_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, VSX, fn)
+#else
+#define REGISTER_VSX_DISPATCH(name, fn)
+#endif
+
+#ifdef HAVE_ZVECTOR_CPU_DEFINITION
+#define REGISTER_ZVECTOR_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, ZVECTOR, fn)
+#else
+#define REGISTER_ZVECTOR_DISPATCH(name, fn)
+#endif
+
+// Macro to register the same kernel for all CPU arch types. This is useful
+// if a kernel does not benefit from being recompiled across different arch types.
+#define REGISTER_ALL_CPU_DISPATCH(name, fn)                                    \
+  REGISTER_ARCH_DISPATCH(name, DEFAULT, fn)                                    \
+  REGISTER_AVX512_DISPATCH(name, fn)                                           \
+  REGISTER_AVX2_DISPATCH(name, fn)                                             \
+  REGISTER_VSX_DISPATCH(name, fn)                                              \
+  REGISTER_ZVECTOR_DISPATCH(name, fn)
+
+#define REGISTER_NO_CPU_DISPATCH(name)                                         \
+  REGISTER_ALL_CPU_DISPATCH(name, nullptr)
+
+#define REGISTER_CUDA_DISPATCH(name, fn) \
+  static RegisterCUDADispatch<struct name> name ## __register(name, fn);
+
+#define REGISTER_HIP_DISPATCH(name, fn) \
+  static RegisterHIPDispatch<struct name> name ## __register(name, fn);
+
+#define REGISTER_MPS_DISPATCH(name, fn) \
+  static RegisterMPSDispatch<struct name> name ## __register(name, fn);
+
+#define REGISTER_PRIVATEUSE1_DISPATCH(name, fn) \
+  static RegisterPRIVATEUSE1Dispatch<struct name> name ## __register(name, fn);
+
+// NB: This macro must be used in an actual 'cu' file; if you try using
+// it from a 'cpp' file it will not work!
+#if defined(__CUDACC__)
+#define REGISTER_DISPATCH(name, fn) REGISTER_CUDA_DISPATCH(name, fn)
+#elif defined(__HIPCC__)
+// TODO: cut this over to HIP dispatch once we stop pretending that CUDA
+// is HIP in the PyTorch HIPify build.
+#define REGISTER_DISPATCH(name, fn) REGISTER_CUDA_DISPATCH(name, fn)
+// #define REGISTER_DISPATCH(name, fn) REGISTER_HIP_DISPATCH(name, fn)
+#elif defined(__OBJC__) && defined(USE_MPS)
+// NB: this macro must be used from a 'mm' file in order to dispatch a MPS kernel
+#define REGISTER_DISPATCH(name, fn) REGISTER_MPS_DISPATCH(name, fn)
+#elif defined(CPU_CAPABILITY)
+// REGISTER_DISPATCH now dispatches an AVX512 kernel to nullptr but registers other dispatches.
+// ALSO_REGISTER_AVX512_DISPATCH should be used for ensuring AVX512 dispatch, among others.
+#ifdef CPU_CAPABILITY_AVX512
+#define REGISTER_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, nullptr)
+#else
+#define REGISTER_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn)
+#endif
+#define ALSO_REGISTER_AVX512_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn)
+#endif
+} // namespace at::native
+
+C10_CLANG_DIAGNOSTIC_POP()
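+
+// Illustrative end-to-end sketch (an assumption, not part of the original header),
+// using hypothetical names my_fn / my_stub / my_kernel to show how the pieces above
+// fit together:
+//
+//   // MyOp.h
+//   using my_fn = void (*)(at::TensorIterator&);
+//   DECLARE_DISPATCH(my_fn, my_stub);
+//
+//   // MyOp.cpp
+//   DEFINE_DISPATCH(my_stub);
+//
+//   // cpu/MyOpKernel.cpp, compiled once per CPU_CAPABILITY flavour
+//   namespace { void my_kernel(at::TensorIterator& iter) { /* ... */ } }
+//   REGISTER_DISPATCH(my_stub, &my_kernel);
+//
+//   // call site: resolves to the fastest registered kernel for the device
+//   my_stub(at::kCPU, iter);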
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/Distance.h b/MLPY/Lib/site-packages/torch/include/ATen/native/Distance.h
new file mode 100644
index 0000000000000000000000000000000000000000..f8f02379d29e911bddbb671a6de5a238aad0b9d9
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/Distance.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include <ATen/native/DispatchStub.h>
+
+namespace at {
+class Tensor;
+
+namespace native {
+
+using pdist_forward_fn = void(*)(Tensor&, const Tensor&, const double p);
+using pdist_backward_fn = void(*)(Tensor&, const Tensor&, const Tensor&, const double p, const Tensor&);
+using cdist_fn = void(*)(Tensor&, const Tensor&, const Tensor&, const double p);
+using cdist_backward_fn = void(*)(Tensor&, const Tensor&, const Tensor&, const Tensor&, const double p, const Tensor&);
+
+DECLARE_DISPATCH(pdist_forward_fn, pdist_forward_stub);
+DECLARE_DISPATCH(pdist_backward_fn, pdist_backward_stub);
+DECLARE_DISPATCH(cdist_fn, cdist_stub);
+DECLARE_DISPATCH(cdist_backward_fn, cdist_backward_stub);
+
+}} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/DistributionTemplates.h b/MLPY/Lib/site-packages/torch/include/ATen/native/DistributionTemplates.h
new file mode 100644
index 0000000000000000000000000000000000000000..2f194014c496354ee4d324d5a01561717837e9fd
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/DistributionTemplates.h
@@ -0,0 +1,394 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include 
+#else
+#include 
+#include 
+#include 
+#include 
+#endif
+
+namespace at::native::templates {
+
+// ==================================================== Random ========================================================
+
+// The purpose of `update_from` and `update_to` is to find the closest valid int64_t number that can be used as actual `from`.
+// The current implementation of `random_` uses uint64_t arithmetics and casts the result to the target dtype(scalar_t).
+// This casting can result in generating numbers that happen to be greater or equal to `to` value. For instance:
+//
+//    auto actual = torch::empty({3, 3}, torch::half);
+//    actual.random_(0, 65504);
+//
+// If random's uint64_t arithmetics produces 65503 as a random value after casting to torch::half it becomes 65504
+// and violates the requirement that random value must be less than `to`. To resolve this issue `update_from` and `update_to`
+// moves `from` to the right and `to` to the left to the next closest value that won't go outside [from, to) after casting to
+// the target dtype. For `to` = 65504 it moves left for (1 << (log2(to) - 11 + 1)) = 32 and becomes 65472, which is previous
+// available number for torch::half dtype.
+template<typename scalar_t>
+int64_t update_from(int64_t from) {
+  static_assert(
+    std::is_floating_point<scalar_t>::value ||
+    std::is_same<scalar_t, at::Half>::value ||
+    std::is_same<scalar_t, at::BFloat16>::value, "scalar_t must be floating-point type");
+  const auto from_plus_1 = static_cast<int64_t>(static_cast<scalar_t>(from + 1));
+  if (from_plus_1 < from) {
+    int64_t from_ = std::abs(from + 1);
+    int n = 0;
+    while (from_ >>= 1) ++n;
+    // NOLINTNEXTLINE(clang-analyzer-core.UndefinedBinaryOperatorResult)
+    from = from_plus_1 + (1LL << (n - std::numeric_limits<scalar_t>::digits + 1));
+  }
+  return from;
+}
+
+template<typename scalar_t>
+int64_t update_to(int64_t to) {
+  static_assert(
+    std::is_floating_point<scalar_t>::value ||
+    std::is_same<scalar_t, at::Half>::value ||
+    std::is_same<scalar_t, at::BFloat16>::value, "scalar_t must be floating-point type");
+  const auto to_minus_1 = static_cast<int64_t>(static_cast<scalar_t>(to - 1));
+  if (to_minus_1 >= to) {
+    int64_t to_ = std::abs(to - 1);
+    int n = 0;
+    while (to_ >>= 1) ++n;
+    // NOLINTNEXTLINE(clang-analyzer-core.UndefinedBinaryOperatorResult)
+    to = to_minus_1 - (1LL << (n - std::numeric_limits<scalar_t>::digits + 1));
+  }
+  return to;
+}
+
+// Return earlier for not invoking kernel.
+// See https://github.com/pytorch/pytorch/issues/103418 for more details
+#define CHECK_EMPTY_AND_RETURN(tensor) \
+  if (tensor.numel() == 0) {  \
+    return tensor;  \
+  }
+
+template<template<typename> class random_kernel, typename RNG>
+at::Tensor& random_impl(at::Tensor& self, c10::optional<Generator> generator) {
+  CHECK_EMPTY_AND_RETURN(self);
+  auto iter = at::TensorIterator::borrowing_nullary_op(self);
+  random_kernel<RNG>()(iter, generator);
+  return self;
+}
+
+#define CHECK_OUT_OF_BOUNDS(var, name, min, max, dtype) \
+  TORCH_CHECK(var >= min && var <= max, name , " is out of bounds for ", dtype); \
+
+#define WARN_OUT_OF_BOUNDS(var, name, digits, dtype) \
+  if (var < -(1LL << digits) || var > (1LL << digits)) { \
+    TORCH_WARN(name , " is out of bounds [-(2^", digits, "), 2^", digits, "]. ", \
+      "Due to precision limitations ", dtype, " can support discrete uniform distribution only within this range. ", \
+      "This warning will become an error in version 1.7 release, please fix the code in advance"); \
+  }
+
+static void check_from_to_in_range(int64_t from, int64_t to_inc, caffe2::TypeMeta dtype) {
+  const auto scalar_type = typeMetaToScalarType(dtype);
+  if (isFloatingType(scalar_type)) {
+    AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, scalar_type, "check_random_fp_bounds", [&] {
+      const auto min = static_cast<double>(std::numeric_limits<scalar_t>::lowest());
+      const auto max = static_cast<double>(std::numeric_limits<scalar_t>::max());
+      CHECK_OUT_OF_BOUNDS(from, "from", min, max, dtype);
+      CHECK_OUT_OF_BOUNDS(to_inc, "to - 1", min, max, dtype);
+
+      constexpr auto digits = std::numeric_limits<scalar_t>::digits;
+      WARN_OUT_OF_BOUNDS(from, "from", digits, dtype);
+      WARN_OUT_OF_BOUNDS(to_inc, "to - 1", digits, dtype);
+    });
+  } else if (scalar_type == kUInt64) {
+    // When you do a comparison between int64_t and uint64_t, the usual
+    // arithmetic conversions say that the int64_t value is promoted to
+    // unsigned. But this conversion wraps around: if I had -1 as my int64_t,
+    // then it will promote to 0xFFFFFFFFFFFFFFFF in uint64_t. This is never
+    // the right thing to do.
+    CHECK_OUT_OF_BOUNDS(from, "from", 0, INT64_MAX, dtype);
+    CHECK_OUT_OF_BOUNDS(to_inc, "to - 1", 0, INT64_MAX, dtype);
+  } else if (isIntegralType(scalar_type, /*includeBool=*/true)) {
+    AT_DISPATCH_V2(scalar_type, "check_random_integral_bounds", AT_WRAP([&]() {
+      const auto min = static_cast<int64_t>(std::numeric_limits<scalar_t>::lowest());
+      const auto max = static_cast<int64_t>(std::numeric_limits<scalar_t>::max());
+      CHECK_OUT_OF_BOUNDS(from, "from", min, max, dtype);
+      CHECK_OUT_OF_BOUNDS(to_inc, "to - 1", min, max, dtype);
+    }), AT_EXPAND(AT_INTEGRAL_TYPES), kUInt16, kUInt32, kBool);
+  } else {
+    TORCH_CHECK(false, "check_random_bounds handles only integral, floating-point and boolean types");
+  }
+}
+
+template<template<typename> class random_from_to_kernel, typename RNG>
+at::Tensor& random_from_to_impl(at::Tensor& self, int64_t from, c10::optional<int64_t> to_opt, c10::optional<Generator> generator) {
+  uint64_t range = 0;
+  auto iter = at::TensorIterator::borrowing_nullary_op(self);
+  if (to_opt.has_value()) {
+    // [from, to)
+    int64_t to = *to_opt;
+    TORCH_CHECK(from < to, "random_ expects 'from' to be less than 'to', but got from=", from, " >= to=", to);
+    if (isFloatingType(iter.dtype())) {
+      AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "random_update_from_to", [&] {
+        from = update_from<scalar_t>(from);
+        to = update_to<scalar_t>(to);
+        TORCH_CHECK(from < to, "random_ expects 'from' casted to dtype to be less than 'to' casted to dtype, but got from=", from, " >= to=", to);
+      });
+    }
+    check_from_to_in_range(from, to - 1, self.dtype());
+    CHECK_EMPTY_AND_RETURN(self);
+    range = static_cast<uint64_t>(to) - static_cast<uint64_t>(from);
+    random_from_to_kernel<RNG>()(iter, range, from, generator);
+  } else if (from != std::numeric_limits<int64_t>::lowest()) {
+    // [from, std::numeric_limits<int64_t>::max()]
+    int64_t to_inc = 0;
+    if (isFloatingType(iter.dtype())) {
+      AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "random_from_to_range_calc", [&] {
+        constexpr int64_t scalar_t_max = static_cast<int64_t>(1) << std::numeric_limits<scalar_t>::digits;
+        to_inc = scalar_t_max > std::numeric_limits<int64_t>::max() ? std::numeric_limits<int64_t>::max() : static_cast<int64_t>(scalar_t_max);
+        from = update_from<scalar_t>(from);
+        TORCH_CHECK(from < to_inc, "random_ expects 'from' casted to dtype to be less than or equal to 'to_inc' casted to dtype, but got from=", from, " > to_inc=", to_inc);
+      });
+    } else if (isIntegralType(iter.dtype(), /*includeBool=*/true)) {
+      AT_DISPATCH_V2(self.scalar_type(), "random_from_to_range_calc", AT_WRAP([&] {
+        if constexpr (std::is_same_v<scalar_t, bool>) {
+          to_inc = static_cast<int64_t>(true);
+        } else {
+          to_inc = static_cast<int64_t>(std::numeric_limits<scalar_t>::max());
+        }
+      }), AT_EXPAND(AT_INTEGRAL_TYPES_V2), kBool);
+    } else {
+      TORCH_CHECK(false, "random_from_to_impl handles only integral, floating-point and boolean types");
+    }
+    check_from_to_in_range(from, to_inc, self.dtype());
+    CHECK_EMPTY_AND_RETURN(self);
+    range = static_cast<uint64_t>(to_inc) - static_cast<uint64_t>(from) + 1;
+    random_from_to_kernel<RNG>()(iter, range, from, generator);
+  } else {
+    // [std::numeric_limits<int64_t>::lowest(), std::numeric_limits<int64_t>::max()]
+    // range = 2^64
+    CHECK_EMPTY_AND_RETURN(self);
+    random_from_to_kernel<RNG>()(iter, generator);
+  }
+  return self;
+}
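+
+// Illustrative sketch (an assumption, not part of the original header) of which of
+// the three branches above a few call patterns reach via the usual random_ bindings:
+//   self.random_(0, 10)              -> branch 1: range = 10, samples in [0, 10)
+//   self.random_(3, c10::nullopt)    -> branch 2: samples in [3, dtype max]
+//   int64 self, from = INT64_MIN,
+//   to = c10::nullopt                -> branch 3: the full 2^64-wide range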
+
+// ==================================================== Normal ========================================================
+
+#define CHECK_NORMAL_TENSOR_STD(std) \
+  do { \
+    TORCH_CHECK( \
+      !std.is_complex(), \
+      "normal expects standard deviation to be non-complex"); \
+    TORCH_CHECK( \
+      std.numel() == 0 || std.is_meta() || std.min().ge(0).item<bool>(), \
+      "normal expects all elements of std >= 0.0"); \
+  } while (0)
+
+#define CHECK_NORMAL_STD(std) \
+  TORCH_CHECK(std >= 0.0, "normal expects std >= 0.0, but found std ", std);
+
+template<template<typename> class normal_kernel, typename RNG>
+Tensor& normal_impl_(Tensor& self, double mean, double std, c10::optional<Generator> gen) {
+  CHECK_NORMAL_STD(std);
+  CHECK_EMPTY_AND_RETURN(self);
+
+  if (self.is_complex()) {
+    auto float_tensor = at::view_as_real(self);
+    // variance for normal distribution of the real and imaginary values
+    // is half of the input variance
+    normal_kernel<RNG>()(float_tensor, mean, std/(std::sqrt(2)), gen);
+  } else {
+    normal_kernel<RNG>()(self, mean, std, gen);
+  }
+  return self;
+}
+
+template<template<typename> class normal_kernel, typename RNG>
+Tensor& normal_out_impl(Tensor& output, const Tensor& mean, double std, c10::optional<Generator> gen) {
+  CHECK_NORMAL_STD(std);
+  auto std_tensor = at::empty_like(output, MemoryFormat::Contiguous);
+  auto shape = at::infer_size(mean.sizes(), std_tensor.sizes());
+  at::native::resize_output(output, shape);
+  normal_impl_<normal_kernel, RNG>(output, 0, std, gen);
+  output.add_(mean);
+  return output;
+}
+
+template<template<typename> class normal_kernel, typename RNG>
+Tensor& normal_out_impl(Tensor& output, double mean, const Tensor& std, c10::optional<Generator> gen) {
+  CHECK_NORMAL_TENSOR_STD(std);
+  auto mean_tensor = at::full({}, mean, output.options());
+  auto shape = at::infer_size(mean_tensor.sizes(), std.sizes());
+  at::native::resize_output(output, shape);
+  normal_impl_<normal_kernel, RNG>(output, 0, 1, gen);
+  // CUDA NB: addcmul_out copies the tensor to be added into the output.
+  // The previous function here was addcmul_out(output, mean_tensor, output, std, 1);
+  // The third argument is not a constant reference and hence the samples in output are overwritten.
+  // Consequently, the computation performed is mean_tensor + mean_tensor * std instead of mean_tensor + output * std
+  output.mul_(std).add_(mean_tensor);
+  return output;
+}
+
+template<template<typename> class normal_kernel, typename RNG>
+Tensor& normal_out_impl(Tensor& output, const Tensor& mean, const Tensor& std, c10::optional<Generator> gen) {
+  CHECK_NORMAL_TENSOR_STD(std);
+  auto shape = at::infer_size(mean.sizes(), std.sizes());
+  at::native::resize_output(output, shape);
+  normal_impl_<normal_kernel, RNG>(output, 0, 1, gen);
+  // CUDA NB: addcmul_out copies the tensor to be added into the output.
+  // The previous function here was addcmul_out(output, mean, output, std, 1);
+  // The third argument is not a constant reference and hence the samples in output are overwritten.
+  // Consequently, the computation performed is mean + mean * std instead of mean + output * std
+  output.mul_(std).add_(mean);
+  return output;
+}
+
+template<template<typename> class normal_kernel, typename RNG>
+Tensor normal_impl(const Tensor& mean, double std, c10::optional<Generator> gen) {
+  CHECK_NORMAL_STD(std);
+  Tensor ret = at::empty_like(mean, MemoryFormat::Contiguous);
+  normal_out_impl<normal_kernel, RNG>(ret, mean, std, gen);
+  return ret;
+}
+
+template<template<typename> class normal_kernel, typename RNG>
+Tensor normal_impl(double mean, const Tensor& std, c10::optional<Generator> gen) {
+  CHECK_NORMAL_TENSOR_STD(std);
+  Tensor ret = at::empty_like(std, MemoryFormat::Contiguous);
+  normal_out_impl<normal_kernel, RNG>(ret, mean, std, gen);
+  return ret;
+}
+
+template<template<typename> class normal_kernel, typename RNG>
+Tensor normal_impl(const Tensor& mean, const Tensor& std, c10::optional<Generator> gen) {
+  CHECK_NORMAL_TENSOR_STD(std);
+  auto shape = at::infer_size(mean.sizes(), std.sizes());
+  Tensor ret = at::empty(shape, mean.options(), MemoryFormat::Contiguous);
+  normal_out_impl<normal_kernel, RNG>(ret, mean, std, gen);
+  return ret;
+}
+
+// ==================================================== Uniform =======================================================
+
+template<template<typename> class uniform_kernel, typename RNG>
+at::Tensor& uniform_impl_(at::Tensor& self, double from, double to, c10::optional<Generator> generator) {
+  if (self.is_complex()) {
+    CHECK_EMPTY_AND_RETURN(self);
+    auto float_tensor = at::view_as_real(self);
+    uniform_impl_<uniform_kernel, RNG>(float_tensor, from, to, generator);
+  } else {
+    AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "check_uniform_bounds", [&] {
+      const auto dtype = self.dtype();
+      const auto min = static_cast<double>(std::numeric_limits<scalar_t>::lowest());
+      const auto max = static_cast<double>(std::numeric_limits<scalar_t>::max());
+      CHECK_OUT_OF_BOUNDS(from, "from", min, max, dtype);
+      CHECK_OUT_OF_BOUNDS(to, "to", min, max, dtype);
+      TORCH_CHECK(from <= to, "uniform_ expects to return a [from, to) range, but found from=", from, " > to=", to);
+      TORCH_CHECK((to - from) <= std::numeric_limits<scalar_t>::max(),
+            "uniform_ expects to-from <= std::numeric_limits<", toString(self.scalar_type()),
+            ">::max(), but found to=", to, " and from=", from,
+            " which result in to-from to exceed the limit");
+      from = std::min(std::max(from, min), max);
+      to = std::max(std::min(to, max), min);
+    });
+    CHECK_EMPTY_AND_RETURN(self);
+    auto iter = at::TensorIterator::borrowing_nullary_op(self);
+    uniform_kernel<RNG>()(iter, from, to, generator);
+  }
+  return self;
+}
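+
+// Illustrative example (not part of the original header): for a Half tensor the
+// bounds computed above are +/-65504, so
+//   t.uniform_(0.0, 1.0);            // accepted
+//   t.uniform_(-40000.0, 40000.0);   // rejected: to - from = 80000 > 65504
+//   t.uniform_(0.0, 1e9);            // rejected: 'to' is out of bounds for Half
+// The final clamp only guards against rounding at the very ends of a valid range.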
+
+// ================================================== LogNormal =======================================================
+
+template<template<typename> class log_normal_kernel, typename RNG>
+at::Tensor& log_normal_impl_(at::Tensor& self, double mean, double std, c10::optional<Generator> gen) {
+  TORCH_CHECK(std > 0.0, "log_normal_ expects std > 0.0, but found std=", std);
+  CHECK_EMPTY_AND_RETURN(self);
+  auto iter = TensorIterator::borrowing_nullary_op(self);
+  log_normal_kernel<RNG>()(iter, mean, std, gen);
+  return self;
+}
+
+// =================================================== Geometric ======================================================
+
+template<template<typename> class geometric_kernel, typename RNG>
+Tensor& geometric_impl_(Tensor& self, double p, c10::optional<Generator> gen) {
+  TORCH_CHECK(0 < p && p < 1, "geometric_ expects p to be in (0, 1), but got p=", p);
+  CHECK_EMPTY_AND_RETURN(self);
+  auto iter = TensorIterator::borrowing_nullary_op(self);
+  geometric_kernel<RNG>()(iter, p, gen);
+  return self;
+}
+
+// ================================================== Exponential =====================================================
+
+template<template<typename> class exponential_kernel, typename RNG>
+Tensor& exponential_impl_(Tensor& self, double lambda, c10::optional<Generator> gen) {
+  TORCH_CHECK(lambda > 0.0, "exponential_ expects lambda > 0.0, but found lambda=", lambda);
+  CHECK_EMPTY_AND_RETURN(self);
+  auto iter = TensorIterator::borrowing_nullary_op(self);
+  exponential_kernel<RNG>()(iter, lambda, gen);
+  return self;
+}
+
+// ==================================================== Cauchy ========================================================
+
+template<template<typename> class cauchy_kernel, typename RNG>
+Tensor& cauchy_impl_(Tensor& self, double median, double sigma, c10::optional<Generator> gen) {
+  // TODO: instead of variable name 'sigma', use 'gamma' or 'scale'
+  // the variance, squared sigma, is undefined for cauchy distribution
+  TORCH_CHECK(sigma > 0.0, "cauchy_ expects sigma > 0.0, but found sigma=", sigma);
+  TORCH_CHECK(at::isFloatingType(self.scalar_type()), "Cauchy distribution is a continuous probability distribution. dtype must be a floating point but you specified ", self.dtype());
+  CHECK_EMPTY_AND_RETURN(self);
+  auto iter = TensorIterator::borrowing_nullary_op(self);
+  cauchy_kernel<RNG>()(iter, median, sigma, gen);
+  return self;
+}
+
+// ==================================================== Bernoulli =====================================================
+
+template<template<typename> class bernoulli_tensor_kernel, typename RNG>
+Tensor& bernoulli_impl_(Tensor& self, const Tensor& p_, c10::optional<Generator> gen) {
+  CHECK_EMPTY_AND_RETURN(self);
+  NoNamesGuard guard;
+  at::assert_no_internal_overlap(self);
+  bernoulli_tensor_kernel<RNG>()(self, p_, gen);
+  return self;
+}
+
+template<template<typename> class bernoulli_scalar_kernel, typename RNG>
+Tensor& bernoulli_impl_(Tensor& self, double p, c10::optional<Generator> gen) {
+  TORCH_CHECK(0 <= p && p <= 1, "bernoulli_ expects p to be in [0, 1], but got p=", p);
+  CHECK_EMPTY_AND_RETURN(self);
+  at::assert_no_internal_overlap(self);
+  bernoulli_scalar_kernel<RNG>()(self, p, gen);
+  return self;
+}
+
+template<template<typename> class bernoulli_tensor_kernel, typename RNG>
+Tensor& bernoulli_out_impl(Tensor& result, const Tensor& self, c10::optional<Generator> gen) {
+  // result.resize_as_(self) requires self to have same dtype as result, so we
+  // use resize_ instead.
+  // TODO: Fix resize_as_. See pytorch/pytorch#11665.
+  result.resize_(self.sizes());
+  bernoulli_impl_<bernoulli_tensor_kernel, RNG>(result, self, gen);
+  namedinference::propagate_names(result, self);
+  return result;
+}
+
+#undef CHECK_OUT_OF_BOUNDS
+#undef WARN_OUT_OF_BOUNDS
+
+} // namespace at::native::templates
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/Distributions.h b/MLPY/Lib/site-packages/torch/include/ATen/native/Distributions.h
new file mode 100644
index 0000000000000000000000000000000000000000..637dd73b6ba62835de12ec20cad521fc60dfcabc
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/Distributions.h
@@ -0,0 +1,518 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+// ROCM hcc doesn't work well with using std:: in kernel functions
+#if defined(__CUDA_ARCH__)
+#include 
+#define compat_exp c10::cuda::compat::exp
+#define compat_ceil c10::cuda::compat::ceil
+#define compat_floor c10::cuda::compat::floor
+#define compat_log c10::cuda::compat::log
+#define compat_pow c10::cuda::compat::pow
+#define compat_sqrt c10::cuda::compat::sqrt
+#define compat_tan c10::cuda::compat::tan
+#define compat_abs c10::cuda::compat::abs
+#define compat_log1p c10::cuda::compat::log1p
+#elif defined(__HIPCC__)
+#include 
+#define compat_exp c10::hip::compat::exp
+#define compat_ceil c10::hip::compat::ceil
+#define compat_floor c10::hip::compat::floor
+#define compat_log c10::hip::compat::log
+#define compat_pow c10::hip::compat::pow
+#define compat_sqrt c10::hip::compat::sqrt
+#define compat_tan c10::hip::compat::tan
+#define compat_abs c10::hip::compat::abs
+#define compat_log1p c10::hip::compat::log1p
+#else
+#define compat_exp std::exp
+#define compat_ceil std::ceil
+#define compat_floor std::floor
+#define compat_log std::log
+#define compat_pow std::pow
+#define compat_sqrt std::sqrt
+#define compat_tan std::tan
+#define compat_abs std::abs
+#define compat_log1p std::log1p
+#endif
+
+namespace {
+
+#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__)
+// we cannot use std::isnan directly due to some incompatibility of
+// gcc constexpr'ing and nvcc
+using std::isnan;
+#endif
+
+// Here sampler_t should be function type scalar_t(void). For gpu
+// "sampler" is a device function, but since ROCM doesn't have
+// equivalent to nvstd::function, we use a template type parameter to
+// capture it.
+template<typename scalar_t, typename sampler_t>
+struct BaseSampler {
+  sampler_t sampler;
+  C10_DEVICE BaseSampler(const sampler_t& sampler): sampler(sampler) {}
+  C10_DEVICE scalar_t sample() {
+    return sampler();
+  }
+};
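+
+// Illustrative sketch (an assumption, not part of the original header): on the CPU
+// path sampler_t is typically a lambda that captures the generator, e.g.
+//
+//   auto uniform_lambda = [generator] () {
+//     at::uniform_real_distribution<double> uniform(0, 1);
+//     return uniform(generator);
+//   };
+//   BaseSampler<double, decltype(uniform_lambda)> standard_uniform(uniform_lambda);
+//
+// which can then be passed to sample_gamma / sample_binomial below.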
+
+// The function `sample_gamma` is
+// is adapted from Numpy's distributions.c implementation.
+// It is MIT licensed, so here is the copyright:
+
+/* Copyright 2005 Robert Kern (robert.kern@gmail.com)
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+template<typename scalar_t, typename accscalar_t, typename uniform_sampler_t, typename normal_sampler_t>
+C10_DEVICE scalar_t sample_gamma(scalar_t alpha, BaseSampler<accscalar_t, uniform_sampler_t>& standard_uniform, BaseSampler<accscalar_t, normal_sampler_t>& standard_normal) {
+  accscalar_t scale = 1.0f;
+
+  // Boost alpha for higher acceptance probability.
+  if (alpha < 1.0f) {
+    if (alpha == 0.f) return 0.f;
+    scale *= compat_pow(1 - standard_uniform.sample(), 1.0f / alpha);
+    alpha += 1.0f;
+  }
+
+  // This implements the acceptance-rejection method of Marsaglia and Tsang (2000)
+  // doi:10.1145/358407.358414
+  const accscalar_t d = alpha - 1.0f / 3.0f;
+  const accscalar_t c = 1.0f / compat_sqrt(9.0f * d);
+  for (;;) {
+    accscalar_t x, y;
+    do {
+      x = standard_normal.sample();
+      y = 1.0f + c * x;
+    } while (y <= 0);
+    const accscalar_t v = y * y * y;
+    const accscalar_t u = 1 - standard_uniform.sample();
+    const accscalar_t xx = x * x;
+    if (u < 1.0f - 0.0331f * xx * xx)
+      return static_cast(scale * d * v);
+    if (compat_log(u) < 0.5f * xx + d * (1.0f - v + compat_log(v)))
+      return static_cast(scale * d * v);
+  }
+}
+
+/* the functions stirling_approx_tail, binomial_inversion, and btrs are adapted
+ * from TensorFlow's random_binomial_op.cc implementation. That code is under
+ * copyright: 2019 The TensorFlow Authors.
+ *
+ * It was released under the Apache License, Version 2.0 (the "License"), available at:
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ */
+
+template<typename scalar_t>
+C10_DEVICE scalar_t stirling_approx_tail(scalar_t k) {
+  const static scalar_t kTailValues[] = {
+    0.0810614667953272,
+    0.0413406959554092,
+    0.0276779256849983,
+    0.02079067210376509,
+    0.0166446911898211,
+    0.0138761288230707,
+    0.0118967099458917,
+    0.0104112652619720,
+    0.00925546218271273,
+    0.00833056343336287
+  };
+  if (k <= 9) {
+    return kTailValues[static_cast(k)];
+  }
+  scalar_t kp1sq = (k + 1) * (k + 1);
+  return (1.0 / 12 - (1.0 / 360 - 1.0 / 1260 / kp1sq) / kp1sq) / (k + 1);
+}
+
+
+template<typename scalar_t, typename accscalar_t, typename uniform_sampler_t>
+C10_DEVICE scalar_t binomial_inversion(scalar_t count, scalar_t prob, BaseSampler<accscalar_t, uniform_sampler_t>& standard_uniform) {
+  accscalar_t U;
+  accscalar_t geom_sum = 0;
+  scalar_t num_geom = 0;
+
+  accscalar_t logprob = compat_log1p(-prob);
+
+  while (1) {
+    U = standard_uniform.sample();
+    accscalar_t geom = compat_ceil(compat_log(U) / logprob);
+    geom_sum += geom;
+    if (geom_sum > count) {
+      break;
+    }
+    num_geom = num_geom + 1;
+  }
+  return num_geom;
+}
+
+template<typename scalar_t, typename accscalar_t, typename uniform_sampler_t>
+C10_DEVICE scalar_t btrs(scalar_t count, scalar_t prob, BaseSampler<accscalar_t, uniform_sampler_t>& standard_uniform) {
+  scalar_t k;
+  accscalar_t U, V, us;
+
+  // This is spq in the paper.
+  const accscalar_t stddev = compat_sqrt(count * prob * (1 - prob));
+
+  // Other coefficients for Transformed Rejection sampling.
+  const accscalar_t b = 1.15 + 2.53 * stddev;
+  const accscalar_t a = -0.0873 + 0.0248 * b + 0.01 * prob;
+  const accscalar_t c = count * prob + 0.5;
+  const accscalar_t v_r = 0.92 - 4.2 / b;
+  const accscalar_t r = prob / (1 - prob);
+
+  const accscalar_t alpha = (2.83 + 5.1 / b) * stddev;
+  const accscalar_t m = compat_floor((count + 1) * prob);
+
+  while (1) {
+    U = standard_uniform.sample() - 0.5;
+    V = standard_uniform.sample();
+
+    us = 0.5 - compat_abs(U);
+    k = static_cast(compat_floor((2 * a / us + b) * U + c));
+
+    // Reject non-sensical answers.
+    if (k < 0 || k > count) {
+      continue;
+    }
+    // Region for which the box is tight, and we can return our calculated value.
+    // This should happen 0.86 * v_r times. In the limit as n * p is large,
+    // the acceptance rate converges to ~79% (and in the lower regime it is ~24%).
+    if (us >= 0.07 && V <= v_r) {
+      return k;
+    }
+
+    // This deviates from Hormann's BTRS algorithm, as there is a log missing.
+    // For all (u, v) pairs outside of the bounding box, this calculates the
+    // transformed-reject ratio.
+    V = compat_log(V * alpha / (a / (us * us) + b));
+    accscalar_t upperbound =
+        ((m + 0.5) * compat_log((m + 1) / (r * (count - m + 1))) +
+         (count + 1) * compat_log((count - m + 1) / (count - k + 1)) +
+         (k + 0.5) * compat_log(r * (count - k + 1) / (k + 1)) +
+         stirling_approx_tail(m) + stirling_approx_tail(count - m) -
+         stirling_approx_tail(k) - stirling_approx_tail(count - k));
+
+    if (V <= upperbound) {
+      return k;
+    }
+  }
+}
+
+template<typename scalar_t, typename accscalar_t, typename uniform_sampler_t>
+C10_DEVICE scalar_t sample_binomial(scalar_t count, scalar_t prob, BaseSampler<accscalar_t, uniform_sampler_t>& standard_uniform) {
+  if (count <= 0.0 || prob <= 0.0) {
+    return 0;
+  } else if (prob >= 1.0) {
+    return count;
+  } else if (prob <= 0.5) {
+    if (count * prob >= 10.0) {
+      // btrs
+      return btrs<scalar_t, accscalar_t, uniform_sampler_t>(count, prob, standard_uniform);
+    } else {
+      // binomial inversion
+      return binomial_inversion<scalar_t, accscalar_t, uniform_sampler_t>(count, prob, standard_uniform);
+    }
+  } else if (prob > 0.5) {
+    scalar_t qprob = 1.0 - prob;
+    if (count * qprob >= 10.0) {
+      // btrs
+      return count - btrs<scalar_t, accscalar_t, uniform_sampler_t>(count, qprob, standard_uniform);
+    } else {
+      // count - binomial inversion
+      return count - binomial_inversion<scalar_t, accscalar_t, uniform_sampler_t>(count, qprob, standard_uniform);
+    }
+  } else {
+    // prob is nan?
+    return static_cast(NAN);
+  }
+}
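+
+// Illustrative example (not part of the original header) of the dispatch above:
+//   sample_binomial(100, 0.3,  u)  -> 100 * 0.3 = 30 >= 10   -> btrs
+//   sample_binomial(100, 0.02, u)  -> 100 * 0.02 = 2 < 10    -> inversion
+//   sample_binomial(100, 0.9,  u)  -> qprob = 0.1, 100 * 0.1 >= 10
+//                                     -> 100 - btrs(100, 0.1, u)
+// so the BTRS rejection sampler is only used when the mean n*p is large enough
+// for its acceptance bound to be tight.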
+
+/*
+ * This function is derived from the implementation of the digamma function in the Cephes Math Library.
+ * See note [3-Clause BSD License for the Cephes Math Library] in ATen/native/Math.h.
+ */
+template <typename scalar_t, typename accscalar_t>
+C10_DEVICE static inline scalar_t digamma_one(scalar_t x) {
+  constexpr accscalar_t PSI_10 = 2.25175258906672110764;
+  if (x == 0) {
+    return INFINITY;
+  }
+  accscalar_t additional_summand = 0;
+  int x_is_integer = x == compat_floor(x);
+  if (x < 0) {
+    if (x_is_integer) {
+      return INFINITY;
+    }
+    // it is more standard to write this as recursion, but
+    // nvcc does not like that
+    additional_summand = -c10::pi /
+        compat_tan(c10::pi * x);
+    x = 1 - x;
+  }
+
+  // Push x to be >= 10
+  accscalar_t result = 0;
+  while (x < 10) {
+    result -= 1 / x;
+    x += 1;
+  }
+  if (x == 10) {
+    return result + PSI_10 + additional_summand;
+  }
+
+  // Compute asymptotic digamma
+  static const accscalar_t A[] = {
+     8.33333333333333333333E-2,
+    -2.10927960927960927961E-2,
+     7.57575757575757575758E-3,
+    -4.16666666666666666667E-3,
+     3.96825396825396825397E-3,
+    -8.33333333333333333333E-3,
+     8.33333333333333333333E-2,
+  };
+
+  accscalar_t y = 0;
+  if (x < 1.0e17f) {
+    accscalar_t z = 1.0 / (x * x);
+    y = z * polevl(z, A, 6);
+  }
+  return static_cast(
+      result + compat_log(x) - (0.5f / x) - y + additional_summand);
+}
+
+// Computes the reparameterized gradient -(d/dalpha cdf(x;alpha)) / pdf(x;alpha)
+// for random number x drawn from a standard Gamma distribution Gamma(alpha).
+template <typename scalar_t, typename accscalar_t>
+C10_HOST_DEVICE scalar_t standard_gamma_grad_one(scalar_t alpha_, scalar_t x_) {
+  // Use a Taylor series expansion for small x.
+  accscalar_t x = static_cast(x_);
+  accscalar_t alpha = static_cast(alpha_);
+  if (x < 0.8f) {
+    accscalar_t numer = 1;
+    accscalar_t denom = alpha;
+    auto series1 = numer / denom;
+    auto series2 = numer / (denom * denom);
+    for (int i = 1; i <= 5; ++i) {
+      numer *= -x / static_cast(i);
+      denom += 1;
+      series1 += numer / denom;
+      series2 += numer / (denom * denom);
+    }
+    const auto pow_x_alpha = compat_pow(x, alpha);
+    const auto gamma_pdf = compat_pow(x, alpha - 1) * compat_exp(-x);
+    const auto gamma_cdf = pow_x_alpha * series1;
+    const auto gamma_cdf_alpha =
+        (compat_log(x) - digamma_one<accscalar_t, accscalar_t>(alpha)) *
+            gamma_cdf -
+        pow_x_alpha * series2;
+    const auto result = -gamma_cdf_alpha / gamma_pdf;
+    return isnan(result) ? static_cast<scalar_t>( 0.f ) : static_cast<scalar_t>(result);
+  }
+
+  // Use a Rice saddle point expansion for large alpha.
+  if (alpha > 8.0f) {
+    if (0.9f * alpha <= x && x <= 1.1f * alpha) {
+      const auto numer_1 = 1 + 24 * alpha * (1 + 12 * alpha);
+      const auto numer_2 = 1440 * (alpha * alpha) + 6 * x * (53 - 120 * x)
+          - 65 * x * x / alpha + alpha * (107 + 3600 * x);
+      const auto denom = 1244160 * (alpha * alpha) * (alpha * alpha);
+      return static_cast<scalar_t>(numer_1 * numer_2 / denom);
+    }
+    const auto denom = compat_sqrt(8 * alpha);
+    const auto term2 = denom / (alpha - x);
+    const auto term3 = compat_pow(
+        x - alpha - alpha * compat_log(x / alpha),
+        static_cast<accscalar_t>(-1.5));
+    const auto term23 = (x < alpha) ? term2 - term3 : term2 + term3;
+    const auto term1 = compat_log(x / alpha) * term23 -
+        compat_sqrt(2 / alpha) * (alpha + x) / ((alpha - x) * (alpha - x));
+    const auto stirling = 1 + 1 / (12 * alpha) * (1 + 1 / (24 * alpha));
+    const auto numer = x * term1;
+    return static_cast<scalar_t>(-stirling * numer / denom);
+  }
+
+  // Use a bivariate rational approximation to the reparameterized gradient.
+  const auto u = compat_log(x / alpha);
+  const auto v = compat_log(alpha);
+  static const accscalar_t coef_uv[3][8] = {
+    {0.16009398, -0.094634809, 0.025146376, -0.0030648343,
+     1, 0.32668115, 0.10406089, 0.0014179084},
+    {0.53487893, 0.1298071, 0.065735949, -0.0015649758,
+     0.16639465, 0.020070113, -0.0035938915, -0.00058392623},
+    {0.040121004, -0.0065914022, -0.0026286047, -0.0013441777,
+     0.017050642, -0.0021309326, 0.00085092367, -1.5247877e-07},
+  };
+  accscalar_t coef_v[8];
+  for (int i = 0; i < 8; ++ i) {
+    coef_v[i] = coef_uv[0][i] + u * (coef_uv[1][i] + u * coef_uv[2][i]);
+  }
+  const auto p = coef_v[0] + v * (coef_v[1] + v * (coef_v[2] + v * coef_v[3]));
+  const auto q = coef_v[4] + v * (coef_v[5] + v * (coef_v[6] + v * coef_v[7]));
+  return static_cast<scalar_t>(compat_exp(p / q));
+}
+
+// Approximate reparameterized gradient of Beta(x,alpha,beta) wrt alpha.
+// Assumes x is close to zero and uses a Taylor expansion.
+template <typename scalar_t, typename accscalar_t>
+C10_DEVICE static inline scalar_t _beta_grad_alpha_small(scalar_t x, scalar_t alpha, scalar_t beta) {
+  const scalar_t factor = digamma_one<scalar_t, accscalar_t>(alpha)
+                        - digamma_one<scalar_t, accscalar_t>(alpha + beta) - compat_log(x);
+  scalar_t numer = 1;
+  scalar_t series = numer / alpha * (factor + 1 / alpha);
+  for (int i = 1; i <= 10; ++i) {
+    scalar_t casted_i = static_cast<scalar_t>(i);
+    numer *= (casted_i - beta) * x / casted_i;
+    const scalar_t denom = alpha + casted_i;
+    series += numer / denom * (factor + 1 / denom);
+  }
+  const scalar_t result = x * compat_pow(1 - x, -beta) * series;
+  return isnan(result) ? static_cast<scalar_t>( 0.f ) : result;
+}
+
+// Approximate reparameterized gradient of Beta(x,alpha,beta) wrt beta.
+// Assumes x is close to zero and uses a Taylor expansion.
+template <typename scalar_t, typename accscalar_t>
+C10_DEVICE static inline scalar_t _beta_grad_beta_small(scalar_t x, scalar_t alpha, scalar_t beta) {
+  const scalar_t factor = digamma_one<scalar_t, accscalar_t>(alpha + beta) - digamma_one<scalar_t, accscalar_t>(beta);
+  scalar_t numer = 1, betas = 1, dbetas = 0, series = factor / alpha;
+  for (int i = 1; i <= 8; ++i) {
+    scalar_t casted_i = static_cast<scalar_t>(i);
+    numer *= -x / casted_i;
+    dbetas = dbetas * (beta - casted_i) + betas;
+    betas = betas * (beta - casted_i);
+    series += numer / (alpha + casted_i) * (dbetas + factor * betas);
+  }
+  const scalar_t result = -compat_pow(1 - x, 1 - beta) * series;
+  return isnan(result) ? static_cast<scalar_t>( 0.f ) : result;
+}
+
+// Approximate reparameterized gradient of Beta(x,alpha,beta) wrt alpha.
+// Assumes alpha and beta are both large and uses a Rice saddle point expansion.
+// To ensure numerical stability, this computation is performed at higher precision.
+template <typename scalar_t, typename accscalar_t>
+C10_DEVICE static inline scalar_t _beta_grad_alpha_mid(accscalar_t x, accscalar_t alpha, accscalar_t beta) {
+  const accscalar_t total = alpha + beta;
+  const accscalar_t mean = alpha / total;
+  const accscalar_t std = compat_sqrt(alpha * beta / (total + 1)) / total;
+  if (mean - 0.1 * std <= x && x <= mean + 0.1 * std) {
+    // Avoid the singularity at x = mean.
+    const accscalar_t poly = 47 * x * (beta * beta) * (beta * beta) + alpha * (
+                           (43 + 20 * (16 + 27 * beta) * x) * (beta * beta) * beta + alpha * (
+                           3 * (59 + 180 * beta - 90 * x) * (beta * beta) + alpha * (
+                           (453 + 1620 * beta * (1 - x) - 455 * x) * beta + alpha * (
+                           8 * (1 - x) * (135 * beta - 11)))));
+    const accscalar_t prefactor_num = (1 + 12 * alpha) * (1 + 12 * beta) / (total * total);
+    const accscalar_t prefactor_den = 12960 * alpha * alpha * alpha * beta * beta * (1 + 12 * total);
+    return prefactor_num / (1 - x) * poly / prefactor_den;
+  }
+  const accscalar_t prefactor = -x / compat_sqrt(2 * alpha * beta / total);
+  const accscalar_t stirling = (1 + 1 / (12 * alpha) + 1 / (288 * alpha * alpha))
+                             * (1 + 1 / (12 * beta) + 1 / (288 * beta * beta))
+                             / (1 + 1 / (12 * total) + 1 / (288 * total * total));
+  const accscalar_t term1_num = 2 * (alpha * alpha) * (x - 1) + alpha * beta * (x - 1) - x * (beta * beta);
+  const accscalar_t axbx = alpha * (x - 1) + beta * x;
+  const accscalar_t term1_den = compat_sqrt(2 * alpha / beta) * compat_pow(total, static_cast(1.5f)) * axbx * axbx;
+  const accscalar_t term1 = term1_num / term1_den;
+  const accscalar_t term2 = 0.5f * compat_log(alpha / (total * x));
+  const accscalar_t term3_num = compat_sqrt(8 * alpha * beta / total);
+  const accscalar_t term3_den = beta * x + alpha * (x - 1);
+  const accscalar_t term3 = term3_num / term3_den;
+  const accscalar_t term4_base = beta * compat_log(beta / (total * (1 - x))) +
+                               alpha * compat_log(alpha / (total * x));
+  const accscalar_t term4 = compat_pow(term4_base, static_cast<accscalar_t>(-1.5f));
+  const accscalar_t term1234 = term1 + term2 * (term3 + (x < mean ? term4 : -term4));
+  return static_cast<scalar_t>(stirling * prefactor * term1234);
+}
+
+// Computes a scaled reparameterized gradient
+//   -(d/dalpha cdf(x;alpha,beta)) / pdf(x;alpha,beta) / (1-x)
+// for random number x drawn from a Beta distribution Beta(alpha,beta).
+// This function inputs total=alpha+beta to make it easy to implement
+// Dirichlet reparameterized gradients in terms of Betas.
+template <typename scalar_t, typename accscalar_t>
+C10_HOST_DEVICE static inline scalar_t dirichlet_grad_one(scalar_t x, scalar_t alpha, scalar_t total) {
+  accscalar_t x_ = static_cast<accscalar_t>(x);
+  accscalar_t alpha_ = static_cast<accscalar_t>(alpha);
+  accscalar_t total_ = static_cast<accscalar_t>(total);
+
+  const scalar_t beta = total - alpha;
+  const accscalar_t beta_ = total_ - alpha_;
+  const scalar_t boundary = total * x * (1 - x);
+
+  // Use an asymptotic approximation for x close to 0.
+  if (x <= 0.5f && boundary < 2.5f) {
+    return _beta_grad_alpha_small<scalar_t, accscalar_t>(x, alpha, beta);
+  }
+
+  // Use an asymptotic approximation for x close to 1.
+  if (x >= 0.5f && boundary < 0.75f) {
+    return -_beta_grad_beta_small<scalar_t, accscalar_t>(1 - x, beta, alpha);
+  }
+
+  // Use an asymptotic approximation when alpha and (total - alpha) are both large.
+  if (alpha > 6 && beta > 6) {
+    return _beta_grad_alpha_mid<scalar_t, accscalar_t>(x_, alpha_, beta_);
+  }
+
+  // Use a rational correction to an analytic approximation.
+  static const accscalar_t c[2][3][3][4] = {
+    {{{1.003668233, -0.01061107488, -0.0657888334, 0.01201642863},
+      {0.6336835991, -0.3557432599, 0.05486251648, -0.001465281033},
+      {-0.03276231906, 0.004474107445, 0.002429354597, -0.0001557569013}},
+     {{0.221950385, -0.3187676331, 0.01799915743, 0.01074823814},
+      {-0.2951249643, 0.06219954479, 0.01535556598, 0.001550077057},
+      {0.02155310298, 0.004170831599, 0.001292462449, 6.976601077e-05}},
+     {{-0.05980841433, 0.008441916499, 0.01085618172, 0.002319392565},
+      {0.02911413504, 0.01400243777, -0.002721828457, 0.000751041181},
+      {0.005900514878, -0.001936558688, -9.495446725e-06, 5.385558597e-05}}},
+    {{{1, -0.02924021934, -0.04438342661, 0.007285809825},
+      {0.6357567472, -0.3473456711, 0.05454656494, -0.002407477521},
+      {-0.03301322327, 0.004845219414, 0.00231480583, -0.0002307248149}},
+     {{0.5925320577, -0.1757678135, 0.01505928619, 0.000564515273},
+      {0.1014815858, -0.06589186703, 0.01272886114, -0.0007316646956},
+      {-0.007258481865, 0.001096195486, 0.0003934994223, -4.12701925e-05}},
+     {{0.06469649321, -0.0236701437, 0.002902096474, -5.896963079e-05},
+      {0.001925008108, -0.002869809258, 0.0008000589141, -6.063713228e-05},
+      {-0.0003477407336, 6.959756487e-05, 1.097287507e-05, -1.650964693e-06}}},
+  };
+  const accscalar_t u = compat_log(x_);
+  const accscalar_t a = compat_log(alpha_) - u;
+  const accscalar_t b = compat_log(total_) - a;
+  const accscalar_t pow_u[3] = {1, u, u * u};
+  const accscalar_t pow_a[3] = {1, a, a * a};
+  accscalar_t p = 0.0;
+  accscalar_t q = 0.0;
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      const accscalar_t ua = pow_u[i] * pow_a[j];
+      p += ua * (c[0][i][j][0] + b * (c[0][i][j][1] + b * (c[0][i][j][2] + b * c[0][i][j][3])));
+      q += ua * (c[1][i][j][0] + b * (c[1][i][j][1] + b * (c[1][i][j][2] + b * c[1][i][j][3])));
+    }
+  }
+  const accscalar_t approx = x_ * (digamma_one<accscalar_t, accscalar_t>(total_) - digamma_one<accscalar_t, accscalar_t>(alpha_)) / beta_;
+  return static_cast<scalar_t>(p / q * approx);
+}
+
+} // namespace
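The sampler above switches between binomial inversion for a small mean and the BTRS transformed-rejection scheme for a large one, and handles prob > 0.5 through the symmetry Binomial(n, p) = n - Binomial(n, 1 - p). A minimal host-side sketch of just that dispatch policy, using the standard library's binomial sampler in place of the device kernels (the helper name and the printed strategy strings are illustrative, only the thresholds mirror the header):

#include <cstdio>
#include <random>

// Illustrative host-side dispatch mirroring sample_binomial's policy:
// small mean -> inversion-style sampling, large mean -> rejection (BTRS),
// p > 0.5 handled through the n - Binomial(n, 1 - p) symmetry.
static const char* pick_binomial_strategy(double count, double prob) {
  if (count <= 0.0 || prob <= 0.0) return "degenerate: always 0";
  if (prob >= 1.0)                 return "degenerate: always count";
  const double p = (prob <= 0.5) ? prob : 1.0 - prob;   // symmetry trick
  return (count * p >= 10.0) ? "btrs (rejection)" : "binomial inversion";
}

int main() {
  std::mt19937 gen(0);
  for (double n : {5.0, 50.0, 500.0}) {
    for (double p : {0.02, 0.3, 0.9}) {
      std::binomial_distribution<long> dist(static_cast<long>(n), p);
      std::printf("n=%5.0f p=%.2f -> %-22s sample=%ld\n",
                  n, p, pick_binomial_strategy(n, p), dist(gen));
    }
  }
}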
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/EmbeddingBag.h b/MLPY/Lib/site-packages/torch/include/ATen/native/EmbeddingBag.h
new file mode 100644
index 0000000000000000000000000000000000000000..aa927d7831af53976c46babe7b7b1c45f392e90a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/EmbeddingBag.h
@@ -0,0 +1,139 @@
+#include 
+#include 
+#include 
+
+#ifdef USE_FBGEMM
+#include 
+#endif
+
+namespace at::native {
+
+void check_arguments(
+    const Tensor& weight,
+    const Tensor& indices,
+    const Tensor& offsets,
+    const int64_t mode,
+    const c10::optional<Tensor>& per_sample_weights,
+    bool include_last_offset);
+
+void make_bag_size_out(
+    Tensor& bag_size_out,
+    const Tensor& offsets,
+    const Tensor& indices,
+    const int64_t mode,
+    const bool include_last_offset,
+    const bool requires_grad);
+
+void make_max_indices_out(
+    Tensor& max_indices_out,
+    const Tensor& weight,
+    const Tensor& indices,
+    const Tensor& offsets,
+    const Tensor& bag_size,
+    const int64_t mode,
+    bool include_last_offset);
+
+void make_offset2bag_out(
+    Tensor& offset2bag,
+    Tensor& output,
+    const Tensor& weight,
+    const Tensor& indices,
+    const Tensor& offsets,
+    const int64_t mode,
+    const c10::optional<Tensor>& per_sample_weights,
+    const int64_t padding_idx = -1);
+
+#ifdef USE_FBGEMM
+
+template
+struct _CallbackAndBlockSize {
+    using TCallback = typename fbgemm::EmbeddingSpMDMKernelSignature::Type;
+
+    int64_t blockSize = -1;
+    TCallback callback = nullptr;
+
+    static TCallback generateCallback(int64_t block_size) {
+        return fbgemm::GenerateEmbeddingSpMDM(
+                block_size,
+                has_weight,
+                /* normalize_by_lengths */false,
+                /* prefetch */16,
+                /* is_weight_positional */false,
+                /* use_offsets */true);
+    }
+
+    _CallbackAndBlockSize() = default;
+
+    explicit _CallbackAndBlockSize(c10::optional<int64_t> maybe_block_size)
+      : blockSize(maybe_block_size.value_or(-1))
+      , callback(maybe_block_size.has_value() ? generateCallback(maybe_block_size.value()) : nullptr)
+    {}
+};
+
+template<typename... StorageMixins>
+struct _EmbeddingBagKernelCacheImpl : private StorageMixins... {
+
+    _EmbeddingBagKernelCacheImpl() = default;
+    // use each of the mixins to store corresponding kernel and block size
+    explicit _EmbeddingBagKernelCacheImpl(c10::optional<int64_t> maybe_block_size)
+      : StorageMixins(maybe_block_size)...
+    {}
+
+    // this method is thread safe (call sites may call from different threads)
+    template
+    typename _CallbackAndBlockSize::TCallback
+    getCallback(int64_t block_size) const {
+        // if the cache doesn't store the kernel for the incoming block size
+        // (so it is different from the one stored in corresponding mixin)
+        // regenerate the kernel (not writing it into the cache so we avoid locks)
+        if (block_size != _CallbackAndBlockSize::blockSize) {
+            return _CallbackAndBlockSize::generateCallback(block_size);
+        }
+        // else retrieve the cached kernel from the corresponding mixin
+        return _CallbackAndBlockSize::callback;
+    }
+};
+
+// instantiate the cache with the list of storage mixins
+// for each of the 8 _EmbeddingBagKernelCache* usages in the EmbeddingBag.cpp impl file
+using _EmbeddingBagKernelCache = _EmbeddingBagKernelCacheImpl<
+    _CallbackAndBlockSize,
+    _CallbackAndBlockSize,
+    _CallbackAndBlockSize,
+    _CallbackAndBlockSize,
+    _CallbackAndBlockSize,
+    _CallbackAndBlockSize,
+    _CallbackAndBlockSize,
+    _CallbackAndBlockSize>;
+#else
+struct _EmbeddingBagKernelCache {
+    explicit _EmbeddingBagKernelCache(c10::optional<int64_t> /* maybe_block_size */) {}
+};
+#endif
+
+void _embedding_bag_cpu_impl_out(Tensor& output, Tensor& offset2bag,
+    Tensor& bag_size, Tensor* max_indices,
+    const Tensor &weight, const Tensor &indices,
+    const Tensor &offsets, const int64_t mode = 0,
+    const c10::optional<Tensor>& per_sample_weights = c10::nullopt,
+    bool include_last_offset = false,
+    int64_t padding_idx = -1,
+    _EmbeddingBagKernelCache* fbgemm_kernel_cache = nullptr);
+
+void _embedding_bag_cpu_out(
+    at::Tensor& output,
+    at::Tensor& offset2bag,
+    at::Tensor& bag_size,
+    at::Tensor* p_max_indices,
+    const at::Tensor& weight,
+    const at::Tensor& indices,
+    const at::Tensor& offsets,
+    const bool scale_grad_by_freq,
+    const int64_t mode,
+    const bool sparse,
+    const c10::optional<Tensor>& per_sample_weights,
+    const bool include_last_offset,
+    const c10::optional<int64_t>& padding_idx,
+    _EmbeddingBagKernelCache* fbgemm_kernel_cache = nullptr);
+
+} // namespace at::native
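The mixin-based cache above keeps one pre-generated FBGEMM kernel per configuration and, when a caller asks for a block size different from the cached one, regenerates a kernel on the fly without writing it back, which is what keeps `getCallback` lock-free. A stripped-down sketch of that pattern with a stand-in callback type; `Kernel`, `make_kernel`, and the `Tag` parameter are illustrative, not FBGEMM's API:

#include <cstdio>
#include <functional>
#include <optional>

// Stand-in for a generated kernel: a callable tagged with its block size.
using Kernel = std::function<int(int)>;

static Kernel make_kernel(int block_size) {            // illustrative factory
  return [block_size](int x) { return block_size * x; };
}

// One mixin per cached configuration, mirroring _CallbackAndBlockSize.
template <int Tag>
struct CachedKernel {
  int block_size = -1;
  Kernel callback;
  CachedKernel() = default;
  explicit CachedKernel(std::optional<int> maybe_bs)
      : block_size(maybe_bs.value_or(-1)),
        callback(maybe_bs ? make_kernel(*maybe_bs) : Kernel{}) {}
};

template <typename... Mixins>
struct KernelCache : private Mixins... {
  KernelCache() = default;
  explicit KernelCache(std::optional<int> bs) : Mixins(bs)... {}

  // If the cached block size matches, reuse the kernel; otherwise regenerate
  // without touching the cache, so concurrent readers stay safe.
  template <typename Mixin>
  Kernel get(int block_size) const {
    const Mixin& slot = static_cast<const Mixin&>(*this);
    if (block_size != slot.block_size) return make_kernel(block_size);
    return slot.callback;
  }
};

int main() {
  KernelCache<CachedKernel<0>, CachedKernel<1>> cache(std::optional<int>(64));
  std::printf("cached hit : %d\n", cache.get<CachedKernel<0>>(64)(2));   // 128
  std::printf("regenerated: %d\n", cache.get<CachedKernel<1>>(128)(2));  // 256
}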
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/Fill.h b/MLPY/Lib/site-packages/torch/include/ATen/native/Fill.h
new file mode 100644
index 0000000000000000000000000000000000000000..66ae4b0a14f0f73a7ac4e1ecca5e28f45867f9ea
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/Fill.h
@@ -0,0 +1,21 @@
+// Functions that fill Tensors with constants. Implementations are in Fill.cpp.
+
+#pragma once
+
+#include <ATen/native/DispatchStub.h>
+
+namespace c10 {
+class Scalar;
+}
+
+namespace at {
+class Tensor;
+struct TensorIterator;
+
+namespace native {
+
+DECLARE_DISPATCH(void(*)(TensorIterator&, const c10::Scalar&), fill_stub);
+
+Tensor& fill_out(Tensor& self, const Scalar& value);
+
+}} // namespace at::native
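Fill.h follows ATen's dispatch-stub pattern: the header declares a typed function-pointer stub (`fill_stub`), and the CPU/CUDA translation units register their kernels into it. A minimal stand-in for that mechanism, not ATen's actual macros; the `FillStub` type, `Device` enum, and `fill_cpu` kernel are all illustrative:

#include <cstdio>

// Minimal stand-in for the DECLARE_DISPATCH / REGISTER_DISPATCH pattern:
// a header declares a typed function-pointer "stub", backend sources fill
// it in, and callers invoke through the stub at runtime.
enum class Device { CPU, CUDA };

struct FillStub {
  using fn_t = void (*)(double* data, int n, double value);
  fn_t table[2] = {nullptr, nullptr};            // one slot per backend
  void register_kernel(Device d, fn_t fn) { table[static_cast<int>(d)] = fn; }
  void operator()(Device d, double* data, int n, double value) const {
    table[static_cast<int>(d)](data, n, value);  // dispatch at call time
  }
};

static void fill_cpu(double* data, int n, double value) {
  for (int i = 0; i < n; ++i) data[i] = value;
}

int main() {
  FillStub fill_stub;                                  // declare the stub
  fill_stub.register_kernel(Device::CPU, &fill_cpu);   // register a backend
  double buf[4];
  fill_stub(Device::CPU, buf, 4, 3.5);
  std::printf("%g %g %g %g\n", buf[0], buf[1], buf[2], buf[3]);
}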
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/ForeachUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/ForeachUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..18df3a2a0e78be9909231832956f066e94aaf2c4
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/ForeachUtils.h
@@ -0,0 +1,371 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/result_type_native.h>
+#endif
+
+#include <unordered_map>
+#include <vector>
+
+namespace at::native {
+namespace {
+// Check if tensor list has either a boolean tensor or a integer tensor
+inline bool has_integral_tensor(TensorList tensors, const bool includeBool) {
+  return std::any_of(
+      tensors.begin(), tensors.end(), [&includeBool](const auto& t) {
+        return at::isIntegralType(t.scalar_type(), includeBool);
+      });
+}
+// check if tensor list has bool tensors
+inline bool has_bool_tensor(TensorList tensors) {
+  return std::any_of(tensors.begin(), tensors.end(), [](const auto& t) -> bool {
+    return t.scalar_type() == ScalarType::Bool;
+  });
+}
+
+// Check foreach API restrictions
+// - Tensor lists must be non-empty.
+// - All TensorLists and ScalarLists must have the same number of elements.
+// - Corresponding tensors must have the same size.
+inline void check_foreach_api_restrictions(TensorList tensors) {
+  TORCH_CHECK(!tensors.empty(), "Tensor list must have at least one tensor.");
+}
+
+inline void check_foreach_api_restrictions(
+    TensorList tensors,
+    ArrayRef<Scalar> scalars) {
+  check_foreach_api_restrictions(tensors);
+  TORCH_CHECK(
+      tensors.size() == scalars.size(),
+      "Tensor list must have same number of elements as scalar list.");
+}
+
+inline void check_foreach_api_restrictions(
+    TensorList tensors1,
+    TensorList tensors2) {
+  TORCH_CHECK(!tensors1.empty(), "Tensor list must have at least one tensor.");
+  TORCH_CHECK(!tensors2.empty(), "Tensor list must have at least one tensor.");
+  TORCH_CHECK(
+      tensors1.size() == tensors2.size(),
+      "Tensor lists must have the same number of tensors, got ",
+      tensors1.size(),
+      " and ",
+      tensors2.size());
+}
+
+inline void check_foreach_api_restrictions(
+    TensorList tensors1,
+    TensorList tensors2,
+    TensorList tensors3) {
+  TORCH_CHECK(!tensors1.empty(), "Tensor list must have at least one tensor.");
+  TORCH_CHECK(!tensors2.empty(), "Tensor list must have at least one tensor.");
+  TORCH_CHECK(!tensors3.empty(), "Tensor list must have at least one tensor.");
+  TORCH_CHECK(
+      tensors1.size() == tensors2.size(),
+      "Tensor lists must have the same number of tensors, got ",
+      tensors1.size(),
+      " and ",
+      tensors2.size());
+  TORCH_CHECK(
+      tensors1.size() == tensors3.size(),
+      "Tensor lists must have the same number of tensors, got ",
+      tensors1.size(),
+      " and ",
+      tensors3.size());
+}
+
+inline void check_foreach_api_restrictions(
+    TensorList tensors1,
+    TensorList tensors2,
+    TensorList tensors3,
+    ArrayRef<Scalar> scalars) {
+  check_foreach_api_restrictions(tensors1, tensors2, tensors3);
+  TORCH_CHECK(
+      tensors1.size() == scalars.size(),
+      "Tensor list must have same number of elements as scalar list, got ",
+      tensors1.size(),
+      " and ",
+      scalars.size());
+}
+
+// Helper function called in check_fast_path_restrictions to check whether all
+// corresponding tensors (aligning in index across the tensorLists) share the
+// same device and dtype.
+inline bool _check_tensors_share_device_and_dtype(
+    ArrayRef<TensorList> tensorLists) {
+  const auto expected_dtype = tensorLists[0][0].dtype();
+  const auto expected_device = tensorLists[0][0].device();
+
+  auto is_tensor_okay = [&](const Tensor& tensor) {
+    return tensor.dtype() == expected_dtype &&
+        tensor.device() == expected_device && tensor.layout() == at::kStrided &&
+        tensor.is_non_overlapping_and_dense();
+  };
+
+  for (const auto& tensorList : tensorLists) {
+    for (const auto& tensor : tensorList) {
+      if (!is_tensor_okay(tensor)) {
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+// Helper function called in check_fast_path_restrictions to check if
+// corresponding tensors in tensor lists have the same sizes and strides.
+inline bool _check_tensors_share_sizes_and_strides(
+    ArrayRef<TensorList> tensorLists) {
+  for (const auto i : c10::irange(1, tensorLists.size())) {
+    for (const auto j : c10::irange(tensorLists[0].size())) {
+      if (tensorLists[0][j].sizes() != tensorLists[i][j].sizes() ||
+          tensorLists[0][j].strides() != tensorLists[i][j].strides()) {
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+// Helper function called in check_fast_path_restrictions to check whether
+// all tensors type promote properly with the scalars in scalarList. This
+// function assumes that _check_tensors_share_device_and_dtype has already been
+// called so that all corresponding tensors in tensorLists have the same dtype.
+// Then, it is sufficient to check the type promotion with just one tensorList.
+inline bool _check_tensors_do_type_promotion_with_scalars(
+    TensorList tensorList,
+    ArrayRef<Scalar> scalarList = {},
+    bool does_op_promote_integer_inputs_to_float = false) {
+  for (const auto i : c10::irange(tensorList.size())) {
+    // For division, integer inputs will result in float.
+    if (does_op_promote_integer_inputs_to_float) {
+      if (at::isIntegralType(
+              tensorList[i].scalar_type(), /*includeBool*/ true)) {
+        return false;
+      }
+    }
+    if (!scalarList.empty()) {
+      const auto& scalar =
+          scalarList.size() == 1 ? scalarList[0] : scalarList[i];
+      const auto& tensor = tensorList[i];
+      // note(mkozuki): This check might be responsible for
+      // `_foreach_add(bool_tensors, bool_tensors)` being pushed to slow path.
+      if (tensor.scalar_type() != at::native::result_type(scalar, tensor)) {
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+// To go via 'fast' path, several conditions must be satisfied
+// - All tensors in all lists must have the same dtype.
+// - All tensors must be on the same device
+// - All tensors must have strided layout
+// - All tensors must be non-overlapping and dense
+// - Resulting tensor must have the same dtype as the input one
+
+// Please, make sure to call check_foreach_api_restrictions before calling this
+// method. There is a set of preconditions that have to be satisfied.
+inline bool check_fast_path_restrictions(
+    ArrayRef<TensorList> tensorLists,
+    ArrayRef<Scalar> scalarList = {},
+    bool does_op_promote_integer_inputs_to_float = false) {
+  return _check_tensors_share_device_and_dtype(tensorLists) &&
+      _check_tensors_share_sizes_and_strides(tensorLists) &&
+      _check_tensors_do_type_promotion_with_scalars(
+             tensorLists[0],
+             scalarList,
+             does_op_promote_integer_inputs_to_float);
+}
+
+inline std::vector<Scalar> convert_tensor_to_scalar_list(
+    const Tensor& scalarList_,
+    int64_t expect_length) {
+  std::vector<Scalar> scalarList;
+  TORCH_CHECK(
+      scalarList_.device() == c10::kCPU,
+      "Expected scalars to be on CPU, got ",
+      scalarList_.device(),
+      " instead.");
+  TORCH_CHECK(
+      scalarList_.is_contiguous(), "Expected scalars to be contiguous.");
+  TORCH_CHECK(
+      scalarList_.dim() == 1,
+      "Expected packed scalar Tensor to be of dimension 1. Got ",
+      scalarList_.dim(),
+      " instead.");
+  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(
+      kComplexHalf,
+      kHalf,
+      kBool,
+      kBFloat16,
+      scalarList_.scalar_type(),
+      "convert_tensor_to_scalar_list",
+      [&]() {
+        const scalar_t* scalar_data = scalarList_.data_ptr<scalar_t>();
+        TORCH_CHECK(
+            (expect_length == scalarList_.size(0)),
+            "Expected length of scalars to match input of length ",
+            expect_length,
+            " but got ",
+            scalarList_.size(0),
+            " instead.");
+        for (int64_t i = 0; i < scalarList_.size(0); i++) {
+          scalarList.emplace_back(scalar_data[i]);
+        }
+      });
+  return scalarList;
+}
+
+inline bool can_use_fast_route(
+    ArrayRef<TensorList> tensorLists,
+    ArrayRef<Scalar> scalarList = {},
+    bool does_op_promote_integer_inputs_to_float = false) {
+  return check_fast_path_restrictions(
+      tensorLists, scalarList, does_op_promote_integer_inputs_to_float);
+}
+
+inline bool can_use_fast_route(
+    TensorList tensors1,
+    TensorList tensors2,
+    bool does_op_promote_integer_inputs_to_float = false) {
+  return can_use_fast_route(
+      {tensors1, tensors2}, {}, does_op_promote_integer_inputs_to_float);
+}
+
+using DeviceDtypeKey = std::pair<at::Device, at::ScalarType>;
+using IndicesT = std::vector<size_t>;
+using nested_optional_tensorvec_t =
+    std::vector<std::vector<c10::optional<at::Tensor>>>;
+using TensorsAndIndicesT = std::pair<nested_optional_tensorvec_t, IndicesT>;
+using FlatMap = std::unordered_map<
+    DeviceDtypeKey,
+    TensorsAndIndicesT,
+    ParamsHash<DeviceDtypeKey>>;
+
+inline FlatMap _group_tensors_by_first_tensors_device_and_dtype(
+    const nested_optional_tensorvec_t& nested_tensorlist,
+    const bool with_indices) {
+  FlatMap grouped_tensors_with_indices;
+
+  TORCH_CHECK(!nested_tensorlist.empty());
+  TORCH_CHECK(!nested_tensorlist[0].empty());
+  const auto num_lists = nested_tensorlist.size();
+  const auto num_tensors = nested_tensorlist[0].size();
+
+  TORCH_CHECK(std::all_of(
+      nested_tensorlist.cbegin(),
+      nested_tensorlist.cend(),
+      [&](const auto& tensorlist) -> bool {
+        // note(crcrpar): Allow empty tensorlists following
+        // ref:
+        // https://github.com/pytorch/pytorch/blob/85885301fd3c6adb8b9dc3cf7afadf6945566684/torch/utils/_foreach_utils.py#L21-L24
+        return tensorlist.size() == num_tensors || tensorlist.size() == 0;
+      }));
+
+  for (const auto& tensor_index : c10::irange(num_tensors)) {
+    const auto key = [&]() -> DeviceDtypeKey {
+      const auto t = nested_tensorlist[0][tensor_index];
+      TORCH_CHECK(
+          t.has_value(),
+          "Tensors of the first list of nested Tensor lists are supposed to be defined but ",
+          "the ",
+          tensor_index,
+          "-th Tensor is not.");
+      return {t->device(), t->scalar_type()};
+    }();
+    TORCH_CHECK(
+        std::all_of(
+            nested_tensorlist.cbegin(),
+            nested_tensorlist.cend(),
+            [&](const auto& tensorlist) -> bool {
+              if (tensorlist.size() == 0) {
+                return true;
+              }
+              const auto& tensor = tensorlist[tensor_index];
+              // note(crcrpar): Currently the scope of this function is
+              // optimizers so there could be `state_steps` and other scalars
+              // whose elements are float tensors no matter what the parameter's
+              // dtype is.
+              if (!tensor.has_value()) {
+                return true;
+              } else {
+                const auto s = tensor->scalar_type();
+                const auto d = tensor->device();
+                // Note: `step` or `state_step` is float32 by default.
+                if (key.first == d) {
+                  return key.second == s || s == at::ScalarType::Float ||
+                      s == at::ScalarType::Double;
+                } else if (d.is_cpu()) {
+                  // note(crcrpar): There are some test cases (e.g.
+                  // TestOptim::test_adam) where state_steps are on CPU and the
+                  // others are on CUDA. Currently a state_step Tensor has the
+                  // dtype of float.
+                  return s == at::ScalarType::Float ||
+                      s == at::ScalarType::Double;
+                } else {
+                  return false;
+                }
+              }
+            }),
+        "Tensors of the same index must be on the same device and the same dtype except `step` tensors that can be CPU and float32/64 notwithstanding");
+    if (!grouped_tensors_with_indices.count(key)) {
+      grouped_tensors_with_indices.insert(
+          {key,
+           TensorsAndIndicesT{
+               [&]() -> nested_optional_tensorvec_t {
+                 nested_optional_tensorvec_t nested_tensorvec;
+                 nested_tensorvec.reserve(num_lists);
+                 for (const auto& i : c10::irange(num_lists)) {
+                   std::vector<c10::optional<at::Tensor>> tensors;
+                   if (!nested_tensorlist[i].empty()) {
+                     // NB: num_tensors is the max possible length for any of
+                     // the inner lists of tensor references. Reserving the max
+                     // trades memory for perf. This should not have significant
+                     // impact.
+                     tensors.reserve(num_tensors);
+                   }
+                   nested_tensorvec.emplace_back(tensors);
+                 }
+                 return nested_tensorvec;
+               }(),
+               [&]() -> IndicesT {
+                 if (!with_indices) {
+                   return {};
+                 } else {
+                   IndicesT indices;
+                   indices.reserve(num_tensors);
+                   return indices;
+                 }
+               }()}});
+    }
+    for (const auto& list_index : c10::irange(num_lists)) {
+      if (!nested_tensorlist[list_index].empty()) {
+        grouped_tensors_with_indices[key].first[list_index].emplace_back(
+            nested_tensorlist[list_index][tensor_index]);
+      }
+    }
+    if (with_indices) {
+      grouped_tensors_with_indices[key].second.emplace_back(tensor_index);
+    }
+  }
+
+  return grouped_tensors_with_indices;
+}
+
+} // namespace
+} // namespace at::native
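`_group_tensors_by_first_tensors_device_and_dtype` buckets the i-th tensors of every list under a (device, dtype) key so the foreach kernels can be launched once per bucket. A simplified sketch of that grouping with plain strings standing in for Device/ScalarType and a pair hash playing the role of ParamsHash (the names `Key`, `PairHash`, and the sample data are illustrative):

#include <cstdio>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

// Key = (device, dtype); PairHash stands in for ParamsHash in the header.
using Key = std::pair<std::string, std::string>;
struct PairHash {
  size_t operator()(const Key& k) const {
    return std::hash<std::string>()(k.first) ^ (std::hash<std::string>()(k.second) << 1);
  }
};

int main() {
  // One "tensor list": each entry carries its (device, dtype) and an index.
  std::vector<std::pair<Key, int>> tensors = {
      {{"cuda:0", "float"}, 0}, {{"cpu", "float"}, 1},
      {{"cuda:0", "float"}, 2}, {{"cuda:0", "half"}, 3}};

  // Group indices by key, mirroring the FlatMap built in ForeachUtils.h.
  std::unordered_map<Key, std::vector<int>, PairHash> grouped;
  for (const auto& [key, index] : tensors) grouped[key].push_back(index);

  for (const auto& [key, indices] : grouped) {
    std::printf("(%s, %s):", key.first.c_str(), key.second.c_str());
    for (int i : indices) std::printf(" %d", i);
    std::printf("\n");
  }
}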
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/FractionalMaxPooling.h b/MLPY/Lib/site-packages/torch/include/ATen/native/FractionalMaxPooling.h
new file mode 100644
index 0000000000000000000000000000000000000000..af87ba8b8962904ef5dbc3d7290d90c27f52c20d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/FractionalMaxPooling.h
@@ -0,0 +1,80 @@
+#pragma once
+#include 
+#include 
+#include 
+
+namespace at::native {
+
+template <typename scalar_t>
+static inline std::vector<int> generate_intervals(
+    scalar_t sample,
+    int64_t inputSize,
+    int64_t outputSize,
+    int64_t poolSize) {
+  std::vector<int> sequence(outputSize);
+  if (outputSize > 1) {
+    scalar_t alpha = static_cast<scalar_t>(inputSize - poolSize) /
+      static_cast<scalar_t>(outputSize - 1);
+
+    for (const auto i : c10::irange(outputSize - 1)) {
+      sequence[i] =
+        static_cast<int>((i + sample) * alpha) - static_cast<int>(sample * alpha);
+    }
+  }
+  if (outputSize > 0) {
+    sequence[outputSize - 1] = inputSize - poolSize;
+  }
+  return sequence;
+}
+
+template <int64_t ndim>
+static inline void fractional_max_pool_check_shape(
+    const Tensor& input,
+    const Tensor& randomSamples) {
+
+  TORCH_CHECK(
+      input.scalar_type() == randomSamples.scalar_type(),
+      "Expect _random_samples to have the same dtype as input");
+
+  int64_t ndimension = randomSamples.ndimension();
+  TORCH_CHECK(
+      ndimension == 3,
+      "Expect _random_samples to have 3 dimensions, got ", ndimension);
+
+  int64_t N = randomSamples.size(0);
+  int64_t C = randomSamples.size(1);
+  int64_t D = randomSamples.size(2);
+
+  int64_t input_batch, input_channel;
+  if (ndim == 2) {
+    // fractional_max_pool2d
+    if (input.ndimension() == 3) {
+      input_batch = 1;
+      input_channel = input.size(0);
+    } else {
+      input_batch = input.size(0);
+      input_channel = input.size(1);
+    }
+  } else {
+    // fractional_max_pool3d
+    if (input.ndimension() == 4) {
+      input_batch = 1;
+      input_channel = input.size(0);
+    } else {
+      input_batch = input.size(0);
+      input_channel = input.size(1);
+    }
+  }
+
+  TORCH_CHECK(
+      N >= input_batch,
+      "Expect _random_samples.size(0) no less then input batch size.");
+  TORCH_CHECK(
+      C == input_channel,
+      "Expect _random_samples.size(1) equals to input channel size.");
+  TORCH_CHECK(
+      D == ndim,
+      "Expect _random_samples.size(2) equals to ", ndim, "; got ", D, ".");
+}
+
+} // namespace at::native
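generate_intervals turns one random sample in [0, 1) into a monotone sequence of pooling offsets: alpha = (inputSize - poolSize) / (outputSize - 1), interval i starts at trunc((i + sample) * alpha) - trunc(sample * alpha), and the last interval is pinned to inputSize - poolSize. A standalone sketch of the same arithmetic (a plain-double rewrite, not the header's template):

#include <cstdio>
#include <vector>

// Same interval construction as generate_intervals, for plain doubles/ints.
std::vector<int> generate_intervals(double sample, long input_size,
                                    long output_size, long pool_size) {
  std::vector<int> seq(output_size);
  if (output_size > 1) {
    const double alpha =
        static_cast<double>(input_size - pool_size) / (output_size - 1);
    for (long i = 0; i < output_size - 1; ++i) {
      seq[i] = static_cast<int>((i + sample) * alpha) -
               static_cast<int>(sample * alpha);
    }
  }
  if (output_size > 0) seq[output_size - 1] = static_cast<int>(input_size - pool_size);
  return seq;
}

int main() {
  // 5 output cells over a length-11 input with pool size 2.
  for (int start : generate_intervals(/*sample=*/0.37, 11, 5, 2))
    std::printf("%d ", start);  // non-decreasing starts, last one is 11 - 2 = 9
  std::printf("\n");
}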
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/FunctionOfAMatrixUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/FunctionOfAMatrixUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..e760b3bfaa7eea53046b8ec7ea00605b6f209503
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/FunctionOfAMatrixUtils.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include 
+#include 
+
+namespace at {
+struct TensorIterator;
+
+namespace native {
+
+using _compute_linear_combination_fn = void(*)(
+  TensorIterator& iter,
+  int64_t in_stride,
+  int64_t coeff_stride,
+  int64_t num_summations
+);
+
+DECLARE_DISPATCH(_compute_linear_combination_fn, _compute_linear_combination_stub);
+
+}} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/GridSampler.h b/MLPY/Lib/site-packages/torch/include/ATen/native/GridSampler.h
new file mode 100644
index 0000000000000000000000000000000000000000..cad7bd8205bf08c83f1e322c6eca70df03c27935
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/GridSampler.h
@@ -0,0 +1,298 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+namespace at::native {
+
+using detail::GridSamplerInterpolation;
+using detail::GridSamplerPadding;
+
+// Unnormalizes a coordinate from the -1 to +1 scale to its pixel index value,
+// where we view each pixel as an area between (idx - 0.5) and (idx + 0.5).
+// if align_corners: -1 and +1 get sent to the centers of the corner pixels
+//     -1 --> 0
+//     +1 --> (size - 1)
+//     scale_factor = (size - 1) / 2
+// if not align_corners: -1 and +1 get sent to the image edges
+//     -1 --> -0.5
+//     +1 --> (size - 1) + 0.5 == size - 0.5
+//     scale_factor = size / 2
+template <typename scalar_t>
+static inline scalar_t grid_sampler_unnormalize(scalar_t coord, int64_t size,
+                                                bool align_corners) {
+  if (align_corners) {
+    // unnormalize coord from [-1, 1] to [0, size - 1]
+    return ((coord + 1) / 2) * (size - 1);
+  } else {
+    // unnormalize coord from [-1, 1] to [-0.5, size - 0.5]
+    return ((coord + 1) * size - 1) / 2;
+  }
+}
+
+// grid_sampler_unnormalize_set_grad works the same as grid_sampler_unnormalize
+// except that it also returns the `d output / d input` via pointer argument
+// `grad_in`.
+// This is useful in the backward pass of grid_sampler.
+template <typename scalar_t>
+static inline scalar_t grid_sampler_unnormalize_set_grad(scalar_t coord, int64_t size,
+                                                         bool align_corners, scalar_t *grad_in) {
+  if (align_corners) {
+    // unnormalize coord from [-1, 1] to [0, size - 1]
+    *grad_in = static_cast<scalar_t>(size - 1) / 2;
+    return ((coord + 1) / 2) * (size - 1);
+  } else {
+    // unnormalize coord from [-1, 1] to [-0.5, size - 0.5]
+    *grad_in = static_cast<scalar_t>(size) / 2;
+    return ((coord + 1) * size - 1) / 2;
+  }
+}
+
+// Clips coordinates to between 0 and clip_limit - 1
+template<typename scalar_t>
+static inline scalar_t clip_coordinates(scalar_t in, int64_t clip_limit) {
+  return std::min(static_cast<scalar_t>(clip_limit - 1), std::max(in, static_cast<scalar_t>(0)));
+}
+
+// clip_coordinates_set_grad works similarly to clip_coordinates except that
+// it also returns the `d output / d input` via pointer argument `grad_in`.
+// This is useful in the backward pass of grid_sampler.
+template<typename scalar_t>
+static inline scalar_t clip_coordinates_set_grad(scalar_t in, int64_t clip_limit,
+                                                 scalar_t *grad_in) {
+  // Note that it is important for the gradient calculation that borders
+  // are considered out of bounds.
+  if (in <= static_cast<scalar_t>(0)) {
+    *grad_in = static_cast<scalar_t>(0);
+    return static_cast<scalar_t>(0);
+  } else {
+    scalar_t max = static_cast<scalar_t>(clip_limit - 1);
+    if (in >= max) {
+      *grad_in = static_cast<scalar_t>(0);
+      return max;
+    } else {
+      *grad_in = static_cast<scalar_t>(1);
+      return in;
+    }
+  }
+}
+
+// Reflects coordinates until they fall between low and high (inclusive).
+// The bounds are passed as twice their value so that half-integer values
+// can be represented as ints.
+template<typename scalar_t>
+static inline scalar_t reflect_coordinates(scalar_t in, int64_t twice_low,
+                                           int64_t twice_high) {
+  if (twice_low == twice_high) {
+    return static_cast<scalar_t>(0);
+  }
+  scalar_t min = static_cast<scalar_t>(twice_low) / 2;
+  scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2;
+  in = std::fabs(in - min);
+  // `fmod` returns same sign as `in`, which is positive after the `fabs` above.
+  scalar_t extra = std::fmod(in, span);
+  int flips = static_cast<int>(std::floor(in / span));
+  if (flips % 2 == 0) {
+    return extra + min;
+  } else {
+    return span - extra + min;
+  }
+}
+
+// reflect_coordinates_set_grad works similarly to reflect_coordinates except
+// that it also returns the `d output / d input` via pointer argument
+// `grad_in`.
+// This is useful in the backward pass of grid_sampler.
+template<typename scalar_t>
+static inline scalar_t reflect_coordinates_set_grad(scalar_t in, int64_t twice_low,
+                                                    int64_t twice_high, scalar_t *grad_in) {
+  if (twice_low == twice_high) {
+    *grad_in = static_cast<scalar_t>(0);
+    return static_cast<scalar_t>(0);
+  }
+  int grad_in_mult_;
+  scalar_t min = static_cast<scalar_t>(twice_low) / 2;
+  scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2;
+  in = in - min;
+  if (in < static_cast<scalar_t>(0)) {
+    grad_in_mult_ = -1;
+    in = -in;
+  } else {
+    grad_in_mult_ = 1;
+  }
+  // `fmod` returns same sign as `in`, which is positive after the `if` above.
+  scalar_t extra = std::fmod(in, span);
+  int flips = static_cast<int>(std::floor(in / span));
+  if (flips % 2 == 0) {
+    *grad_in = static_cast<scalar_t>(grad_in_mult_);
+    return extra + min;
+  } else {
+    *grad_in = static_cast<scalar_t>(-grad_in_mult_);
+    return span - extra + min;
+  }
+}
+
+// Mapping the out-of-boundary points back into boundary
+// This would only affect padding_mode=border or reflection
+template<typename scalar_t>
+static inline scalar_t compute_coordinates(scalar_t coord, int64_t size,
+                                           GridSamplerPadding padding_mode,
+                                           bool align_corners) {
+  if (padding_mode == GridSamplerPadding::Border) {
+    // clip coordinates to image borders
+    coord = clip_coordinates(coord, size);
+  } else if (padding_mode == GridSamplerPadding::Reflection) {
+    // reflect coordinates by image borders
+    if (align_corners) {
+      coord = reflect_coordinates(coord, 0, 2*(size - 1));
+    } else {
+      coord = reflect_coordinates(coord, -1, 2*size - 1);
+    }
+    // clip coordinates to image borders
+    coord = clip_coordinates(coord, size);
+  }
+  return coord;
+}
+
+// Computes the pixel source index value for a grid coordinate
+template <typename scalar_t>
+static inline scalar_t grid_sampler_compute_source_index(
+    scalar_t coord,
+    int64_t size,
+    GridSamplerPadding padding_mode,
+    bool align_corners) {
+  coord = grid_sampler_unnormalize(coord, size, align_corners);
+  coord = compute_coordinates(coord, size, padding_mode, align_corners);
+  return coord;
+}
+
+// grid_sampler_compute_source_index_set_grad works similarly to
+// grid_sampler_compute_source_index except that it also returns the
+// `d output / d input` via pointer argument `grad_in`.
+// This is useful in the backward pass of grid_sampler.
+template <typename scalar_t>
+static inline scalar_t grid_sampler_compute_source_index_set_grad(
+    scalar_t coord,
+    int64_t size,
+    GridSamplerPadding padding_mode,
+    bool align_corners,
+    scalar_t *grad_in) {
+  scalar_t grad_clip, grad_refl;
+  coord = grid_sampler_unnormalize_set_grad(coord, size, align_corners, grad_in);
+  if (padding_mode == GridSamplerPadding::Border) {
+    // clip coordinates to image borders
+    coord = clip_coordinates_set_grad(coord, size, &grad_clip);
+    *grad_in = (*grad_in) * grad_clip;
+  } else if (padding_mode == GridSamplerPadding::Reflection) {
+    // reflect coordinates by image borders
+    if (align_corners) {
+      coord = reflect_coordinates_set_grad(coord, 0, 2*(size - 1), &grad_refl);
+    } else {
+      coord = reflect_coordinates_set_grad(coord, -1, 2*size - 1, &grad_refl);
+    }
+    // clip coordinates to image borders
+    coord = clip_coordinates_set_grad(coord, size, &grad_clip);
+    *grad_in = (*grad_in) * grad_refl * grad_clip;
+  }
+  return coord;
+}
+
+static inline bool within_bounds_2d(int64_t h, int64_t w, int64_t H, int64_t W) {
+  return h >= 0 && h < H && w >= 0 && w < W;
+}
+
+static inline bool within_bounds_3d(int64_t d, int64_t h, int64_t w, int64_t D, int64_t H, int64_t W) {
+  return d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W;
+}
+
+template<typename scalar_t>
+static inline scalar_t get_value_bounded(
+    scalar_t* data,
+    scalar_t x,
+    scalar_t y,
+    int64_t W,
+    int64_t H,
+    int64_t sW,
+    int64_t sH,
+    GridSamplerPadding padding_mode,
+    bool align_corners) {
+
+  x = compute_coordinates(x, W, padding_mode, align_corners);
+  y = compute_coordinates(y, H, padding_mode, align_corners);
+
+  int64_t ix = static_cast<int64_t>(x);
+  int64_t iy = static_cast<int64_t>(y);
+
+  if (within_bounds_2d(iy, ix, H, W)) {
+    return data[iy * sH + ix * sW];
+  }
+  return static_cast<scalar_t>(0);
+}
+
+template<typename scalar_t>
+static inline void safe_add_2d(scalar_t *data, int64_t h, int64_t w,
+                               int64_t sH, int64_t sW, int64_t H, int64_t W,
+                               scalar_t delta) {
+  if (within_bounds_2d(h, w, H, W)) {
+    data[h * sH + w * sW] += delta;
+  }
+}
+
+template<typename scalar_t>
+static inline void safe_add_3d(scalar_t *data, int64_t d, int64_t h, int64_t w,
+                               int64_t sD, int64_t sH, int64_t sW,
+                               int64_t D, int64_t H, int64_t W,
+                               scalar_t delta) {
+  if (within_bounds_3d(d, h, w, D, H, W)) {
+    data[d * sD + h * sH + w * sW] += delta;
+  }
+}
+
+template<typename scalar_t>
+static inline void add_value_bounded(
+    scalar_t* data,
+    scalar_t x,
+    scalar_t y,
+    int64_t W,
+    int64_t H,
+    int64_t sW,
+    int64_t sH,
+    scalar_t delta,
+    GridSamplerPadding padding_mode,
+    bool align_corners) {
+
+  x = compute_coordinates(x, W, padding_mode, align_corners);
+  y = compute_coordinates(y, H, padding_mode, align_corners);
+
+  int64_t ix = static_cast<int64_t>(x);
+  int64_t iy = static_cast<int64_t>(y);
+
+  safe_add_2d(data, iy, ix, sH, sW, H, W, delta);
+}
+
+// Calculate the differential of the cubic convolution, i.e. `d coeff / d x`
+template<typename scalar_t>
+static inline void get_cubic_coefficients_grad(
+    scalar_t coeffs[4],
+    scalar_t t) {
+
+  // Must be the same as forward calculation in
+  // aten/src/ATen/native/UpSample.h:get_cubic_upsample_coefficients
+  scalar_t A = -0.75;
+
+  scalar_t x;
+  x = -1 - t; // 1 < x = |-1 - tx| < 2
+  coeffs[0] = (-3 * A * x - 10 * A ) * x - 8 * A;
+  x = -t;     // x = |0 - tx| <= 1
+  coeffs[1] = (-3 * (A + 2) * x - 2 * (A + 3)) * x;
+  x = 1 - t;  // x = |1 - tx| <= 1
+  coeffs[2] = (3 * (A + 2) * x - 2 * (A + 3)) * x;
+  x = 2 - t;  // 1 < x = |2 - tx| < 2
+  coeffs[3] = (3 * A * x - 10 * A) * x + 8 * A;
+}
+
+}  // namespace at::native
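grid_sampler_unnormalize maps a normalized coordinate in [-1, 1] onto pixel space: with align_corners the endpoints land on the centers of the corner pixels (0 and size - 1), without it they land on the image edges (-0.5 and size - 0.5). A small double-precision sketch of both mappings (the standalone `unnormalize` helper is illustrative):

#include <cstdio>

// Same formulas as grid_sampler_unnormalize, written for double.
double unnormalize(double coord, long size, bool align_corners) {
  return align_corners ? ((coord + 1) / 2) * (size - 1)
                       : ((coord + 1) * size - 1) / 2;
}

int main() {
  const long size = 5;
  for (double c : {-1.0, 0.0, 1.0}) {
    std::printf("coord %+4.1f -> align_corners=true: %4.1f  false: %4.1f\n",
                c, unnormalize(c, size, true), unnormalize(c, size, false));
  }
  // align_corners=true : -1 -> 0.0, +1 -> 4.0 (= size - 1)
  // align_corners=false: -1 -> -0.5, +1 -> 4.5 (= size - 0.5)
}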
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/GridSamplerUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/GridSamplerUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..2d1084366a4daf2fc4c783a3a5c0e0cd2533bb45
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/GridSamplerUtils.h
@@ -0,0 +1,109 @@
+#pragma once
+
+// See NOTE: [Tensor vs. TensorBase]
+// https://github.com/pytorch/pytorch/pull/66979
+#include 
+#include 
+#include 
+
+namespace at::native {
+
+namespace detail {
+
+enum class GridSamplerInterpolation {Bilinear, Nearest, Bicubic};
+enum class GridSamplerPadding {Zeros, Border, Reflection};
+
+} // namespace detail
+
+using detail::GridSamplerInterpolation;
+using detail::GridSamplerPadding;
+
+namespace {
+
+// See NOTE [ grid_sampler Native Functions ].
+void check_grid_sampler_common(
+  const TensorBase& input,
+  const TensorBase& grid
+) {
+  auto input_opt = input.options();
+  auto grid_opt = grid.options();
+
+  TORCH_CHECK(
+    input.defined(),
+    "grid_sampler(): expected input to not be undefined");
+  TORCH_CHECK(
+    grid.defined(),
+    "grid_sampler(): expected grid to not be undefined");
+  TORCH_CHECK(
+    input_opt.device() == grid_opt.device(),
+    "grid_sampler(): expected input and grid to be on same device, but input "
+    "is on ", input_opt.device(), " and grid is on ", grid_opt.device());
+  TORCH_CHECK(
+    input_opt.layout() == kStrided && grid_opt.layout() == kStrided,
+    "grid_sampler(): expected input and grid to have torch.strided layout, but "
+    "input has ", input_opt.layout(), " and grid has ", grid_opt.layout());
+  TORCH_CHECK(
+    input.size(0) == grid.size(0),
+    "grid_sampler(): expected grid and input to have same batch size, but got "
+    "input with sizes ", input.sizes(), " and grid with sizes ", grid.sizes());
+  TORCH_CHECK(
+    grid.size(-1) == input.dim() - 2,
+    "grid_sampler(): expected grid to have size ", input.dim() - 2, " in last "
+    "dimension, but got grid with sizes ", grid.sizes());
+
+  for (const auto i : c10::irange(2, input.dim())) {
+    TORCH_CHECK(input.size(i) > 0,
+      "grid_sampler(): expected input to have non-empty spatial dimensions, "
+      "but input has sizes ", input.sizes(), " with dimension ", i, " being "
+      "empty");
+  }
+}
+
+// See NOTE [ grid_sampler Native Functions ].
+void check_grid_sampler_2d(
+  const TensorBase& input,
+  const TensorBase& grid
+) {
+  TORCH_CHECK(
+    input.dim() == 4 && input.dim() == grid.dim(),
+    "grid_sampler(): expected 4D input and grid with same number of "
+    "dimensions, but got input with sizes ", input.sizes(),
+    " and grid with sizes ", grid.sizes());
+}
+
+// See NOTE [ grid_sampler Native Functions ].
+void check_grid_sampler_3d(
+  const TensorBase& input,
+  const TensorBase& grid,
+  int64_t interpolation_mode
+) {
+  TORCH_CHECK(
+    input.dim() == 5 && input.dim() == grid.dim(),
+    "grid_sampler(): expected 5D input and grid with same number of "
+    "dimensions, but got input with sizes ", input.sizes(),
+    " and grid with sizes ", grid.sizes());
+  TORCH_CHECK(
+    !(input.dim() == 5 &&
+      static_cast<GridSamplerInterpolation>(interpolation_mode) ==
+        GridSamplerInterpolation::Bicubic),
+    "grid_sampler(): bicubic interpolation only supports 4D input");
+}
+
+// See NOTE [ grid_sampler Native Functions ].
+// cudnn does not support inputs larger than 1024.
+bool cond_cudnn_grid_sampler(
+  const TensorBase& input,
+  const TensorBase& grid
+) {
+  return (
+    at::native::cudnn_is_acceptable(input) &&
+    at::native::cudnn_is_acceptable(grid) &&
+    at::native::canUse32BitIndexMath(input) &&
+    at::native::canUse32BitIndexMath(grid) &&
+    input.dim() == 4 &&
+    input.sym_size(1) <= 1024);
+}
+
+} // anonymous namespace
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/Histogram.h b/MLPY/Lib/site-packages/torch/include/ATen/native/Histogram.h
new file mode 100644
index 0000000000000000000000000000000000000000..27265bdc7d89e16db6af4a315b7b1f2368c6c50c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/Histogram.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include 
+#include 
+
+namespace at::native {
+
+using histogramdd_fn = void(*)(const Tensor&, const c10::optional<Tensor>&, bool, Tensor&, const TensorList&);
+using histogramdd_linear_fn = void(*)(const Tensor&, const c10::optional<Tensor>&, bool, Tensor&, const TensorList&, bool);
+using histogram_select_outer_bin_edges_fn = void(*)(const Tensor& input, const int64_t N, std::vector<double> &leftmost_edges, std::vector<double> &rightmost_edges);
+
+DECLARE_DISPATCH(histogramdd_fn, histogramdd_stub);
+DECLARE_DISPATCH(histogramdd_linear_fn, histogramdd_linear_stub);
+DECLARE_DISPATCH(histogram_select_outer_bin_edges_fn, histogram_select_outer_bin_edges_stub);
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/IndexKernel.h b/MLPY/Lib/site-packages/torch/include/ATen/native/IndexKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..e89d349cebbbbe5501ea587aac6ea18e7a86a83e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/IndexKernel.h
@@ -0,0 +1,41 @@
+#pragma once
+#include 
+#include 
+
+namespace at {
+class Tensor;
+class TensorBase;
+struct TensorIterator;
+struct TensorIteratorBase;
+}
+
+namespace c10 {
+class Scalar;
+}
+
+namespace at::native {
+
+using index_fn = void(*)(TensorIteratorBase &, IntArrayRef indexed_sizes, IntArrayRef indexed_strides);
+using index_fill_fn = void(*)(TensorIterator & iter, int64_t dim, int64_t self_dim_size, int64_t self_dim_stride, const Scalar& source);
+using index_copy_fn = void(*)(TensorIterator & iter, int64_t dim, int64_t self_dim_size, int64_t self_dim_stride);
+using index_put_fn = void(*)(TensorIterator &, IntArrayRef indexed_sizes, IntArrayRef indexed_strides, bool accumulate);
+using put_fn = void(*)(TensorIterator & iter, const TensorBase& self, const bool accumulate);
+using take_fn = void(*)(TensorIterator & iter, const TensorBase& input);
+using flip_fn = void(*)(TensorIterator &, const bool);
+using masked_fill_fn = void(*)(TensorIterator &, const Scalar& scalar);
+using masked_select_fn = void(*)(TensorIterator &, int64_t orig_stride);
+using masked_scatter_fn = void(*)(TensorIterator &, const TensorBase &);
+
+DECLARE_DISPATCH(index_fn, index_stub);
+DECLARE_DISPATCH(index_fill_fn, index_fill_stub);
+DECLARE_DISPATCH(index_copy_fn, index_copy_stub);
+DECLARE_DISPATCH(index_put_fn, index_put_stub);
+DECLARE_DISPATCH(put_fn, put_stub);
+DECLARE_DISPATCH(take_fn, take_stub);
+DECLARE_DISPATCH(flip_fn, flip_stub);
+DECLARE_DISPATCH(masked_fill_fn, masked_fill_stub);
+DECLARE_DISPATCH(masked_select_fn, masked_select_serial_stub);
+DECLARE_DISPATCH(masked_select_fn, masked_select_stub);
+DECLARE_DISPATCH(masked_scatter_fn, masked_scatter_stub);
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/IndexingUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/IndexingUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..bf1edc6de186211932cc1e76ce410833e43c1b06
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/IndexingUtils.h
@@ -0,0 +1,160 @@
+#pragma once
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace at::native {
+
+[[noreturn]]
+static void invalid_mask(const Tensor & self, int64_t idx, const Tensor & mask, int64_t maskIdx) {
+  TORCH_CHECK_INDEX(false, "The shape of the mask ", mask.sizes(), " at index ", maskIdx,
+  " does not match the shape of the indexed tensor ", self.sizes(), " at index ", idx);
+}
+
+
+static C10_UNUSED std::vector<Tensor> expandTensors(const Tensor & self, IOptTensorListRef indices) {
+  // If indices come in as ByteTensor or BoolTensor (masks), expand them into the equivalent indexing by LongTensors
+  std::vector<Tensor> result;
+  for (const auto& index_opt : indices) {
+    if (!index_opt.has_value()) {
+      result.emplace_back();
+    } else {
+      const auto& index = *index_opt;
+      if (index.scalar_type() == kByte || index.scalar_type() == kBool) {
+        if (index.scalar_type() == kByte) {
+          TORCH_WARN("indexing with dtype torch.uint8 is now deprecated," \
+          " please use a dtype torch.bool instead.");
+        }
+        // The sizes of the ByteTensor mask or bool tensor must match the sizes of the
+        // corresponding dimensions in self
+        for (const auto j : c10::irange(index.dim())) {
+          int64_t srcIdx = static_cast<int64_t>(result.size() + j);
+          if (index.size(j) != self.size(srcIdx)) {
+            invalid_mask(self, srcIdx, index, j);
+          }
+        }
+        // Replace with nonzeros
+        auto nonzero = index.nonzero();
+        for (const auto j : c10::irange(index.dim())) {
+          result.emplace_back(nonzero.select(1, j));
+        }
+      } else {
+        result.emplace_back(index);
+      }
+    }
+  }
+  return result;
+}
+
+static C10_UNUSED void checkIndexTensorTypes(IOptTensorListRef indices, bool allow_int=false) {
+  for (const auto& tensor : indices) {
+    if (tensor.has_value() && tensor->defined()) {
+      auto scalarType = tensor->scalar_type();
+      if (allow_int) {
+        if (scalarType != kLong && scalarType != kByte && scalarType != kBool && scalarType != kInt) {
+            TORCH_CHECK_INDEX(false, "tensors used as indices must be long, int, byte or bool tensors");
+        }
+      } else {
+        if (scalarType != kLong && scalarType != kByte && scalarType != kBool) {
+            TORCH_CHECK_INDEX(false, "tensors used as indices must be long, byte or bool tensors");
+        }
+      }
+    }
+  }
+}
+
+inline torch::List<c10::optional<Tensor>> toListOfOptionalTensors(ArrayRef<Tensor> list) {
+  torch::List<c10::optional<Tensor>> result;
+  result.reserve(list.size());
+  for (const Tensor& a : list) {
+    result.push_back(a);
+  }
+  return result;
+}
+
+inline torch::List<c10::optional<Tensor>> toListOfOptionalTensors(ArrayRef<IValue> list) {
+  torch::List<c10::optional<Tensor>> result;
+  result.reserve(list.size());
+  for (const IValue& a : list) {
+    result.push_back(a.isTensor() ? c10::optional<Tensor>(a.toTensor()) : c10::optional<Tensor>());
+  }
+  return result;
+}
+
+static C10_UNUSED bool hasContiguousSubspace(TensorList tl) {
+  // true if all the non-null tensors are adjacent
+  auto isDefined = [](const Tensor & tensor){ return tensor.defined(); };
+  auto isNull = [](const Tensor & tensor){ return !tensor.defined(); };
+  auto start = std::find_if(tl.begin(), tl.end(), isDefined);
+  auto stop = std::find_if(tl.rbegin(), tl.rend(), isDefined);
+  auto it = std::find_if(start, stop.base(), isNull);
+  return it == stop.base();
+}
+
+
+// Transposes the tensor and indices together so that all the non-null indices
+// index the first k dimensions of the tensor. Returns the transposed tensor
+// and the reordered indices. For example:
+// transposeToFront(tensor, {nullptr, a, nullptr, b})
+// returns
+// tensor.permute([1, 3, 0, 2]), {a, b, nullptr, nullptr}
+static C10_UNUSED std::tuple<Tensor, std::vector<Tensor>>
+transposeToFront(const Tensor& self, TensorList indices) {
+  std::vector<int64_t> dims;
+  std::vector<Tensor> transposedIndices;
+  dims.reserve(self.dim());
+  for (const auto i : c10::irange(self.dim())) {
+    if (indices[i].defined()) {
+      dims.push_back(i);
+      transposedIndices.emplace_back(indices[i]);
+    }
+  }
+  for (const auto i : c10::irange(self.dim())) {
+    if (!indices[i].defined()) {
+      dims.push_back(i);
+      transposedIndices.emplace_back();
+    }
+  }
+  return std::make_tuple(self.permute(dims), std::move(transposedIndices));
+}
+
+inline std::tuple<Tensor, std::vector<Tensor>, std::vector<int64_t>>
+transposeToFrontAndInvPerm(const Tensor& self, TensorList indices) {
+  std::vector<int64_t> dims;
+  std::vector<int64_t> invPerm;
+  std::vector<Tensor> transposedIndices;
+  dims.reserve(self.dim());
+  invPerm.resize(self.dim());
+  for (const auto i : c10::irange(self.dim())) {
+    if (indices[i].defined()) {
+      dims.push_back(i);
+      transposedIndices.emplace_back(indices[i]);
+    }
+  }
+  for (const auto i : c10::irange(self.dim())) {
+    if (!indices[i].defined()) {
+      dims.push_back(i);
+      transposedIndices.emplace_back();
+    }
+  }
+  for (const auto i : c10::irange(self.dim())) {
+    invPerm[dims[i]] = i;
+  }
+  return std::make_tuple(self.permute(dims), std::move(transposedIndices), std::move(invPerm));
+}
+
+struct AdvancedIndex {
+  AdvancedIndex(const Tensor& src, TensorList indices);
+
+  Tensor src;
+  std::vector<Tensor> indices;
+  DimVector indexed_sizes;
+  DimVector indexed_strides;
+  int64_t dims_before;
+  int64_t dims_after;
+};
+
+
+} //namespace at::native
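transposeToFront computes a permutation that moves every dimension carrying a defined index tensor to the front, so advanced indexing can treat the indexed dimensions as a contiguous prefix; for indices {null, a, null, b} it produces the permutation [1, 3, 0, 2], as the comment above notes. A sketch of just the permutation logic over a defined/undefined mask (the free function name is illustrative):

#include <cstdio>
#include <vector>

// Mirrors the dims-building loops in transposeToFront: defined dims first,
// undefined dims after, original order preserved within each group.
std::vector<int> permutation_to_front(const std::vector<bool>& defined) {
  std::vector<int> dims;
  dims.reserve(defined.size());
  for (int i = 0; i < static_cast<int>(defined.size()); ++i)
    if (defined[i]) dims.push_back(i);
  for (int i = 0; i < static_cast<int>(defined.size()); ++i)
    if (!defined[i]) dims.push_back(i);
  return dims;
}

int main() {
  // indices = {nullptr, a, nullptr, b}  ->  permute([1, 3, 0, 2])
  for (int d : permutation_to_front({false, true, false, true}))
    std::printf("%d ", d);
  std::printf("\n");
}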
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/Lerp.h b/MLPY/Lib/site-packages/torch/include/ATen/native/Lerp.h
new file mode 100644
index 0000000000000000000000000000000000000000..5fd66810125129ec9cf3fa7c3e9d218b084b687f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/Lerp.h
@@ -0,0 +1,46 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+namespace at::native {
+
+template <typename scalar_t>
+C10_HOST_DEVICE C10_ALWAYS_INLINE bool is_lerp_weight_small(scalar_t weight) {
+  return std::abs(weight) < scalar_t(0.5);
+}
+template <typename scalar_t>
+C10_HOST_DEVICE C10_ALWAYS_INLINE bool is_lerp_weight_small(c10::complex<scalar_t> weight) {
+  // Avoid the sqrt in abs(weight)
+  return (weight.real() * weight.real() + weight.imag() * weight.imag()) < scalar_t(0.25);
+}
+
+template <typename scalar_t, typename weight_t>
+C10_HOST_DEVICE C10_ALWAYS_INLINE scalar_t lerp(scalar_t self_, scalar_t end_, weight_t weight_) {
+  using opmath_t = at::opmath_type<scalar_t>;
+  using opmath_weight_t = at::opmath_type<weight_t>;
+
+  opmath_t self = self_;
+  opmath_t end = end_;
+  opmath_weight_t weight = weight_;
+
+  // Conditional for better numeric. This has been discussed in
+  // https://github.com/pytorch/pytorch/pull/18871
+  return is_lerp_weight_small(weight)
+      ? self + weight * (end - self)
+      : end - (end - self) * (opmath_t(1) - weight);
+}
+
+using lerp_fn_scalar = void (*)(
+    at::TensorIteratorBase& iter,
+    const Scalar& weight);
+
+using lerp_fn_tensor = void (*)(
+    at::TensorIteratorBase& iter);
+
+DECLARE_DISPATCH(lerp_fn_scalar, lerp_kernel_scalar_weight);
+DECLARE_DISPATCH(lerp_fn_tensor, lerp_kernel_tensor_weight);
+
+} // namespace at::native
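
A minimal standalone sketch of the two-branch formula above (plain double arithmetic outside ATen; `lerp_ref` and the `main` wrapper are illustrative names): anchoring at `self` for small weights and at `end` otherwise keeps the update term small and well conditioned.

#include <cmath>
#include <cstdio>

// Same branch structure as the templated lerp above, specialized to double.
static double lerp_ref(double self, double end, double weight) {
  return std::abs(weight) < 0.5
      ? self + weight * (end - self)          // anchored at self for |weight| < 0.5
      : end - (end - self) * (1.0 - weight);  // anchored at end otherwise
}

int main() {
  // The two branches agree on the mathematical result: 1.5 and 2.5 here.
  std::printf("%f %f\n", lerp_ref(1.0, 3.0, 0.25), lerp_ref(1.0, 3.0, 0.75));
  return 0;
}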
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/LinearAlgebra.h b/MLPY/Lib/site-packages/torch/include/ATen/native/LinearAlgebra.h
new file mode 100644
index 0000000000000000000000000000000000000000..507a5f7b9c43ec2aced978c0f83100e555477b0c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/LinearAlgebra.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include 
+#include 
+
+namespace c10 {
+class Scalar;
+}
+
+namespace at {
+struct TensorIterator;
+}
+
+namespace at::native {
+
+using addr_fn = void (*)(TensorIterator &, const Scalar& beta, const Scalar& alpha);
+DECLARE_DISPATCH(addr_fn, addr_stub);
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/LinearAlgebraUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/LinearAlgebraUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..3fd7c014226cc9147bfc204d4a2775c401661731
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/LinearAlgebraUtils.h
@@ -0,0 +1,623 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include 
+#else
+#include 
+#include 
+#include 
+#include 
+#include 
+#endif
+
+namespace at::native {
+
+static inline c10::MaybeOwned<Tensor> expect_resolved_conj(const Tensor& tensor) {
+  if (tensor.is_conj()) {
+    return c10::MaybeOwned<Tensor>::owned(tensor.resolve_conj());
+  } else {
+    return c10::MaybeOwned<Tensor>::borrowed(tensor);
+  }
+}
+
+static inline DimVector batched_matrix_contiguous_strides(
+    const IntArrayRef sizes,
+    const bool f_contig = false) {
+  // f_contig chooses between the strides of a batch of Fortran (F-contiguous)
+  // and C-contiguous matrices
+  auto strides = c10::contiguous_strides(sizes);
+  auto dim = strides.size();
+
+  if (f_contig && dim >= 2) {
+    // Fix the strides of the last two dimensions, so that we return
+    // C-contiguous batches of F-contiguous matrices.
+    strides[dim - 1] = std::max(sizes[dim - 2], static_cast<int64_t>(1));
+    strides[dim - 2] = 1;
+  }
+  return strides;
+}
+
+/*
+ * Clones a Tensor so that the following conditions hold:
+ * If we think of a Tensor of having size (B, M, N), where B is any number
+ * of batch dimensions, then:
+ * - Each (M, N) matrix is in column major form
+ * - Let Tensor P have size (B, M, N) and Q have size (B, M', N').
+ *   Then when laid out in memory, the M by N matrix starting at
+ *   P.data_ptr()[B * M * N] is of the same corresponding batch as the M' by N'
+ *   matrix starting at Q.data_ptr()[B * M' * N'].
+ */
+static inline Tensor cloneBatchedColumnMajor(const Tensor& src) {
+  // If src is already in batched column major format, then
+  // this will be efficient (no reordering of the data will occur)
+  // because the first transpose will make the tensor contiguous,
+  // and cloning a contiguous tensor is fast.
+  auto result = src.mT().clone(at::MemoryFormat::Contiguous);
+  result.transpose_(-2, -1);
+  return result;
+}
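
A small usage sketch, assuming the ATen include tree is on the include path (the `main` wrapper is illustrative): the clone keeps the logical shape, but each trailing matrix becomes Fortran-contiguous, which is the layout LAPACK-style backends expect.

#include <ATen/ATen.h>
#include <ATen/native/LinearAlgebraUtils.h>
#include <iostream>

int main() {
  auto x = at::rand({2, 3, 4});                     // C-contiguous: strides (12, 4, 1)
  auto y = at::native::cloneBatchedColumnMajor(x);  // same shape, each 3x4 matrix column major
  std::cout << y.strides() << "\n";                 // prints [12, 1, 3]
  std::cout << y.equal(x) << "\n";                  // prints 1: only the memory layout changed
  return 0;
}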
+
+/*
+ * contig chooses between C-contig (true) and F-contig (false)
+ */
+static inline c10::MaybeOwned<Tensor> borrow_else_clone(const bool cond, const Tensor& borrow, const Tensor& clone, const bool contig) {
+  return cond ? c10::MaybeOwned<Tensor>::borrowed(borrow)
+              : c10::MaybeOwned<Tensor>::owned(contig ? clone.clone(MemoryFormat::Contiguous)
+                                                      : cloneBatchedColumnMajor(clone));
+}
+
+/*
+ * This method is designed to be a faster alternative to
+ * `cloneBatchedColumnMajor` with some additional features,
+ * namely:
+ * 1. It uses `copy` instead of `clone` which could be much faster.
+ * 2. `nrows` parameter used to create inputs with the number of rows larger
+ *  than the original input, which is required for some LAPACK/MAGMA methods.
+ * 3. `desired_batch_size` is used to create copies with the batch size
+ *  which is either the original batch size of the input, or its larger
+ *  broadcasted shape.
+ */
+static inline Tensor copyBatchedColumnMajor(const Tensor& src, int64_t nrows = -1,
+    at::OptionalIntArrayRef desired_batch_sizes = c10::nullopt) {
+  nrows = (nrows == -1) ? src.size(-2) : nrows;
+  auto copy_sizes = desired_batch_sizes.has_value()
+    ? desired_batch_sizes.value().vec()
+    : IntArrayRef(src.sizes().data(), src.dim() - 2).vec();
+  copy_sizes.insert(copy_sizes.end(), {nrows, src.size(-1)});
+  const auto copy_strides = batched_matrix_contiguous_strides(copy_sizes, /*f-contig*/true);
+  auto copy = at::empty_strided(copy_sizes, copy_strides, src.options());
+  copy.narrow(-2, 0, src.size(-2)).copy_(src);
+  return copy;
+}
+
+/*
+ * Given batches of matrices with arbitrary batch dim,
+ * computes the number of batches.
+ */
+static inline int64_t batchCount(const Tensor& batched_matrices) {
+  int64_t result = 1;
+  for (int64_t i = 0; i < batched_matrices.ndimension() - 2; i++) {
+    result *= batched_matrices.size(i);
+  }
+  return result;
+}
+
+// Computes the number of elements of a matrix in a batched matrix tensor
+static inline int64_t matrixStride(const Tensor& batched_matrices) {
+  return batched_matrices.size(-1) * batched_matrices.size(-2);
+}
+
+// Validates input shapes for operations on batches of square matrices (inverse, cholesky, symeig, eig)
+static inline void checkIsMatrix(const Tensor& A, const char* const f_name, const char* const arg_name = "A") {
+  TORCH_CHECK(A.dim() >= 2, f_name, ": The input tensor ", arg_name, " must have at least 2 dimensions.");
+}
+static inline void squareCheckInputs(const Tensor& self, const char* const f_name, const char* const arg_name = "A") {
+  checkIsMatrix(self, f_name, arg_name);
+  TORCH_CHECK(self.sym_size(-1) == self.sym_size(-2),
+              f_name,
+              ": ", arg_name, " must be batches of square matrices, "
+              "but they are ", self.sym_size(-2), " by ", self.sym_size(-1), " matrices");
+}
+
+static inline void checkInputsSolver(const Tensor& A,
+                                     const Tensor& B,
+                                     const bool left,
+                                     const char* const f_name) {
+  squareCheckInputs(A, f_name, "A");
+  checkIsMatrix(B, f_name, "B");
+  TORCH_CHECK(left ? A.size(-2) == B.size(-2) : A.size(-1) == B.size(-1),
+              f_name, ": Incompatible shapes of A and B for the equation ",
+              left ? "AX = B" : "XA = B",
+              " (", A.size(-2), "x", A.size(-1), " and ", B.size(-2), "x", B.size(-1), ")");
+}
+
+static inline bool is_row_or_column_contiguous(const Tensor& t) {
+  // This could be made more general, similar to how it's checked in matmul, which would allow to
+  // elide the copy with strides such as (6, 12, 1, 3) or (3, 1, 9), but this is quite tricky.
+  // We choose to be conservative for simplicity
+  return t.is_contiguous() || t.transpose(-2, -1).is_contiguous();
+}
+
+static inline TransposeType to_transpose_type(const bool contig, const bool conj) {
+  if (conj) {
+    if (contig) { TORCH_INTERNAL_ASSERT(false, "Invalid transpose type"); }
+    else {        return TransposeType::ConjTranspose; }
+  } else {
+    if (contig) { return TransposeType::NoTranspose; }
+    else {        return TransposeType::Transpose; }
+  }
+}
+
+
+// This function is designed to be used with linear algebra methods that minimize
+// L(ax - b) = 0, where L is generally the identity map (`solve`, for example)
+// or the L2 norm (`lstsq`).
+// It is expected that `a` and `b` are contiguous tensors of column-major matrices
+// (so that a.view({-1, a.size(-2), a.size(-1)}) succeeds, same for `b`),
+// with the following additional properties:
+//
+// 1. a.dim() == b.dim()
+// 2. a.shape[:-2] broadcasts over b.shape[:-2]
+// 3. a.size(i) <= b.size(i) for i=0,..., a.dim() - 3 (only for batch dimensions)
+//
+// MAGMA/LAPACK modify tensor `a` in-place, and the main goal of this method
+// is to be memory efficient, which means that if there exists an index i such that
+// a.shape[i] < b.shape[i], 0 <= i <= a.dim() - 3,
+// then instead of materializing copies of `a` in the broadcasted shape, we keep
+// a buffer copy of `a` along with flags that check whether specific batch dimension
+// indices for `a` were already accessed. If they were, we copy the data from the buffer
+// into `a`. The number of copies does not exceed
+// prod(max(a.shape[:-2], b.shape[:-2]) - a.shape[:-2] + 1)
+// and this value is attained by tensors with non-empty batch dimensions.
+//
+// func_t `f` is a callable that is being supplied with
+// scalar_t* a_working_ptr, scalar_t* b_working_ptr, int64_t a_linear_batch_idx.
+// a_working_ptr and b_working_ptr can directly be passed to LAPACK/MAGMA routines,
+// and a_linear_batch_idx is an index in the 3d representation which corresponds to
+// the memory a_working_ptr points to, in other words:
+// a_working_ptr == a.view({-1, a.size(-2), a.size(-1)}.select(0, a_linear_batch_idx).data_ptr();
+// a_linear_batch_idx is useful to store metadata related to `a`, such as, for example,
+// its rank or singular values (see linalg_lstsq).
+template <typename scalar_t, typename func_t>
+void batch_iterator_with_broadcasting(const Tensor& a, const Tensor& b, const func_t& f) {
+  IntArrayRef a_batch_sizes(a.sizes().data(), a.dim() - 2);
+  IntArrayRef b_batch_sizes(b.sizes().data(), b.dim() - 2);
+
+  auto a_linear_batch_idx = at::arange(batchCount(a)).view(a_batch_sizes);
+  auto b_linear_batch_idx = at::arange(batchCount(b)).view(b_batch_sizes);
+
+  TensorIterator iter = TensorIteratorConfig()
+    .set_check_mem_overlap(false)
+    .check_all_same_dtype(false)
+    .resize_outputs(false)
+    .add_output(b_linear_batch_idx)
+    .add_input(a_linear_batch_idx)
+    .build();
+
+  auto m = a.size(-2);
+  auto n = a.size(-1);
+  auto a_3d = a.view({batchCount(a), m, n});
+  auto b_3d = b.view({batchCount(b), b.size(-2), b.size(-1)});
+
+  auto a_broadcasts_over_b = (a_batch_sizes != b_batch_sizes);
+  Tensor a_buffer, a_was_accessed, a_buffer_3d;
+  std::function<void(int64_t)> check_if_copy_needed_for_a
+    = [](int64_t /*a_curr_linear_batch_idx*/){};
+  if (a_broadcasts_over_b) {
+    a_buffer = at::empty_strided(a.sizes(), a.strides(), a.options())
+      .copy_(a);
+    a_was_accessed = at::zeros(batchCount(a), at::kBool);
+    a_buffer_3d = a_buffer.view({batchCount(a), m, n});
+    check_if_copy_needed_for_a = [&](int64_t a_curr_linear_batch_idx) {
+      auto* a_was_accessed_flag = a_was_accessed
+        .select(0, a_curr_linear_batch_idx)
+        .data_ptr<bool>();
+      if (!(*a_was_accessed_flag)) {
+        *a_was_accessed_flag = true;
+      }
+      else {
+        a_3d.select(0, a_curr_linear_batch_idx)
+          .copy_(a_buffer_3d.select(0, a_curr_linear_batch_idx));
+      }
+    };
+  }
+
+  auto loop = [&](char** data, const int64_t* strides, int64_t nelems) {
+    auto* b_batch_idx_ptr = data[0];
+    auto* a_batch_idx_ptr = data[1];
+
+    for (const auto elem C10_UNUSED : c10::irange(nelems)) {
+      auto b_curr_linear_batch_idx = *reinterpret_cast<int64_t*>(b_batch_idx_ptr);
+      auto a_curr_linear_batch_idx = *reinterpret_cast<int64_t*>(a_batch_idx_ptr);
+
+      check_if_copy_needed_for_a(a_curr_linear_batch_idx);
+
+      auto* a_working_ptr = a_3d.select(0, a_curr_linear_batch_idx)
+        .data_ptr<scalar_t>();
+      auto* b_working_ptr = b_3d.select(0, b_curr_linear_batch_idx)
+        .data_ptr<scalar_t>();
+      f(a_working_ptr, b_working_ptr, a_curr_linear_batch_idx);
+
+      b_batch_idx_ptr += strides[0];
+      a_batch_idx_ptr += strides[1];
+    }
+  };
+  iter.serial_for_each(loop, {0, batchCount(b)});
+}
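
A hypothetical usage sketch (the shapes and the no-op lambda are made up): `a` holds a single batch entry that is broadcast over the four batches of `b`, and the callable receives raw column-major data pointers plus a's linear batch index, exactly as it would pass them on to a LAPACK/MAGMA routine.

#include <ATen/ATen.h>
#include <ATen/native/LinearAlgebraUtils.h>

void broadcast_iter_example() {
  // Column-major, view-compatible batches, as the comment above requires.
  auto a = at::native::cloneBatchedColumnMajor(at::rand({1, 3, 3}, at::kDouble));
  auto b = at::native::cloneBatchedColumnMajor(at::rand({4, 3, 2}, at::kDouble));
  at::native::batch_iterator_with_broadcasting<double>(a, b,
      [](double* a_working_ptr, double* b_working_ptr, int64_t a_linear_batch_idx) {
        // A real caller would hand a_working_ptr/b_working_ptr to a solver here;
        // a_linear_batch_idx is 0 for every call because `a` has a single batch.
        (void)a_working_ptr; (void)b_working_ptr; (void)a_linear_batch_idx;
      });
}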
+
+// Returns the epsilon value for floating types except half
+static inline double _get_epsilon(const ScalarType& sc_type) {
+  switch (sc_type) {
+    case at::ScalarType::Float:
+      return static_cast<double>(std::numeric_limits<float>::epsilon());
+    case at::ScalarType::Double:
+      return std::numeric_limits<double>::epsilon();
+    default:
+      AT_ERROR("This function doesn't handle types other than float and double");
+  }
+}
+
+// Validates input shapes and devices
+// for linear solve methods (solve, cholesky_solve, lu_solve, triangular_solve)
+static inline void linearSolveCheckInputs(const Tensor& self, const Tensor& A, const char* name) {
+  TORCH_CHECK(self.device() == A.device(),
+              "Expected b and A to be on the same device, but found b on ",
+              self.device(), " and A on ", A.device(), " instead.");
+
+  TORCH_CHECK(self.scalar_type() == A.scalar_type(),
+              "Expected b and A to have the same dtype, but found b of type ",
+              self.scalar_type(), " and A of type ", A.scalar_type(), " instead.");
+
+  TORCH_CHECK(A.size(-1) == A.size(-2),
+              "A must be batches of square matrices, "
+              "but they are ", A.size(-2), " by ", A.size(-1), " matrices");
+
+  TORCH_CHECK(A.size(-1) == self.size(-2),
+              "Incompatible matrix sizes for ", name, ": each A "
+              "matrix is ", A.size(-1), " by ", A.size(-1),
+              " but each b matrix is ", self.size(-2), " by ", self.size(-1));
+}
+
+static inline void checkFloatingOrComplex(const Tensor& t, const char* const f_name, const bool allow_low_precision_dtypes=true) {
+  auto dtype = t.scalar_type();
+  TORCH_CHECK((at::isFloatingType(dtype) || at::isComplexType(dtype)),
+              f_name, ": Expected a floating point or complex tensor as input. Got ", dtype);
+  if (!allow_low_precision_dtypes) {
+    TORCH_CHECK(dtype == kFloat || dtype == kDouble || dtype == kComplexFloat || dtype == kComplexDouble,
+                f_name, ": Low precision dtypes not supported. Got ", dtype);
+  }
+}
+
+
+// Checks if all the Tensors in a TensorList are of the same dimensions
+static inline void checkAllSameDim(TensorList tensors, int64_t dim) {
+  for (auto &t : tensors) {
+    TORCH_CHECK(t.dim() == dim, "Tensor dimension is ", t.dim(), ", expected ", dim, " instead.");
+  }
+}
+
+static inline std::tuple<std::vector<int64_t>, std::vector<int64_t>> _linalg_broadcast_batch_dims(const Tensor& arg1, const Tensor& arg2) {
+  // broadcast the batch dimensions of arg1 and arg2.
+  IntArrayRef arg1_batch_sizes(arg1.sizes().data(), arg1.ndimension() - 2);
+  IntArrayRef arg2_batch_sizes(arg2.sizes().data(), arg2.ndimension() - 2);
+  std::vector<int64_t> expand_batch_portion = infer_size(arg1_batch_sizes, arg2_batch_sizes);
+
+  std::vector<int64_t> arg1_expand_size({expand_batch_portion});
+  arg1_expand_size.insert(arg1_expand_size.end(), { arg1.size(-2), arg1.size(-1) });
+
+  std::vector<int64_t> arg2_expand_size({expand_batch_portion});
+  arg2_expand_size.insert(arg2_expand_size.end(), { arg2.size(-2), arg2.size(-1) });
+  return std::make_tuple(std::move(arg1_expand_size), std::move(arg2_expand_size));
+}
+
+static inline std::tuple<Tensor, Tensor> _linalg_broadcast_batch_dims(const Tensor& arg1, const Tensor& arg2, const char* name) {
+  // If there's no name we assume we don't want to check the errors
+  if (name != nullptr) {
+    linearSolveCheckInputs(arg1, arg2, name);
+  }
+
+  auto [arg1_expand_size, arg2_expand_size] = at::native::_linalg_broadcast_batch_dims(arg1, arg2);
+
+  auto arg1_broadcasted  = arg1_expand_size == arg1.sizes() ? arg1 : arg1.expand(arg1_expand_size);
+  auto arg2_broadcasted  = arg2_expand_size == arg2.sizes() ? arg2 : arg2.expand(arg2_expand_size);
+  return std::make_tuple(arg1_broadcasted, arg2_broadcasted);
+}
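
An illustrative sketch (shapes chosen arbitrarily, wrapper name made up): only the batch dimensions are broadcast, the trailing matrix dimensions are kept as-is, and the results are expand() views rather than copies.

#include <ATen/ATen.h>
#include <ATen/native/LinearAlgebraUtils.h>

void broadcast_batch_dims_example() {
  auto A = at::rand({2, 1, 3, 3});  // batches of square systems
  auto B = at::rand({5, 3, 2});     // batches of right-hand sides
  // arg1 is the right-hand side and arg2 the square matrix, matching linearSolveCheckInputs.
  auto [Bb, Ab] = at::native::_linalg_broadcast_batch_dims(B, A, "example");
  // Bb.sizes() == [2, 5, 3, 2] and Ab.sizes() == [2, 5, 3, 3]; no data is copied yet.
  (void)Bb; (void)Ab;
}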
+
+static inline std::vector<int64_t> broadcast_batch_size(const Tensor& t1, const Tensor& t2, int64_t n_batch_dims) {
+  IntArrayRef t1_batch_sizes(t1.sizes().data(), n_batch_dims);
+  IntArrayRef t2_batch_sizes(t2.sizes().data(), n_batch_dims);
+  auto broadcasted_batch_sizes = infer_size(t1_batch_sizes, t2_batch_sizes);
+  return broadcasted_batch_sizes;
+}
+
+// Return a permutation with the given axes moved to the end.
+static inline Tensor _move_to_end(const Tensor& self, IntArrayRef axes) {
+  const std::vector<int64_t> a = axes.vec();
+  const int64_t ndim = self.ndimension();
+  std::vector<int64_t> perm;
+
+  for (const auto i : c10::irange(ndim)) {
+    auto it = std::find(a.begin(), a.end(), i);
+    if (it == a.end()) {
+       perm.push_back(i);
+    }
+  }
+  for (auto i : a) {
+    perm.push_back(i);
+  }
+
+  TORCH_CHECK((int64_t)perm.size() == ndim,
+    "duplicate or invalid axis in 'dim' argument for tensor with ndim==", ndim);
+
+  return self.permute(perm);
+}
+
+// parse the "mode" param in linalg_qr: return a tuple of bools (compute_q, reduced)
+static inline std::tuple<bool, bool> _parse_qr_mode(c10::string_view mode) {
+  bool compute_q;
+  bool reduced;
+  if (mode == "reduced") {
+    compute_q = true;
+    reduced = true;
+  } else if (mode == "complete") {
+    compute_q = true;
+    reduced = false;
+  } else if (mode == "r") {
+    compute_q = false;
+    reduced = true; // this is actually irrelevant in this mode
+  } else {
+      TORCH_CHECK(false, "qr received unrecognized mode '", mode,
+                  "' but expected one of 'reduced' (default), 'r', or 'complete'");
+  }
+  return std::make_tuple(compute_q, reduced);
+}
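
A tiny usage sketch (the wrapper function is illustrative): each mode string maps directly onto the (compute_q, reduced) pair.

#include <ATen/native/LinearAlgebraUtils.h>

void qr_mode_example() {
  // "reduced" -> (true, true), "complete" -> (true, false), "r" -> (false, true).
  auto [compute_q, reduced] = at::native::_parse_qr_mode("complete");
  (void)compute_q; (void)reduced;
}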
+
+// Function to compute sizes, strides and the extra columns for the Q matrix in the QR Decomposition
+static inline std::tuple<DimVector, DimVector, int64_t> _compute_geometry_for_Q(
+    const Tensor& input,
+    bool reduced) {
+  int64_t m = input.size(-2), n = input.size(-1);
+  int64_t n_columns_q;
+
+  // We need to compute the required size of Q based on the `reduced` option
+  DimVector q_sizes(input.sizes());
+  if (!reduced && m > n) {
+    q_sizes[input.dim() - 1] = m;
+    n_columns_q = m;
+  } else {
+    q_sizes[input.dim() - 1] = n;
+    n_columns_q = std::min(m, n);
+  }
+  auto q_strides = batched_matrix_contiguous_strides(q_sizes, /*f-contig*/true);
+  return std::make_tuple(q_sizes, q_strides, n_columns_q);
+}
+
+static inline bool svd_uses_cusolver(const Tensor& A) {
+  // if cusolver is available, it is used unconditionally
+  return A.is_cuda()
+         && at::globalContext().hasCuSOLVER()
+         && at::globalContext().linalgPreferredBackend() != at::LinalgBackend::Magma;
+}
+
+
+// Function used instead of .to so that the original strides are retained
+// .to doesn't retain strides and makes the output tensor contiguous
+static inline Tensor same_stride_to(const Tensor& original_tensor, const at::TensorOptions& options) {
+  auto strided_to = at::empty_strided(original_tensor.sizes(),
+                                      original_tensor.strides(),
+                                      options);
+  strided_to.copy_(original_tensor);
+  return strided_to;
+}
+
+// Creates a dimension permutation array that can be given to `at::permute()`, which will shift
+// the two specified dimensions to the end of a tensor, without changing the order of
+// the other dimensions. `dim1` will be placed at the very end, and `dim0` will be
+// placed just to the left of it.
+//
+// For instance, given a 4-D tensor, dimensions 1 and 3 can be shifted to the end by
+// calling `create_dim_backshift_permutation(1, 3, 4)`. The resulting vector will
+// be `vec(0, 2, 1, 3)`.
+static inline std::vector<int64_t> create_dim_backshift_permutation(int64_t dim0, int64_t dim1, int64_t ndim) {
+  TORCH_CHECK(
+    (dim0 != dim1) && (dim0 < ndim) && (dim0 >= 0) && (dim1 < ndim) && (dim1 >= 0),
+    "duplicate or invalid dimensions");
+  std::vector<int64_t> permutation(ndim);
+  int64_t cur_permuted_dim = 0;
+  for (const auto dim_ind : c10::irange(ndim)) {
+    if ((dim_ind != dim0) && (dim_ind != dim1)) {
+      permutation[cur_permuted_dim++] = dim_ind;
+    }
+  }
+  permutation[cur_permuted_dim++] = dim0;
+  permutation[cur_permuted_dim] = dim1;
+  return permutation;
+}
+
+// Creates a dimension permutation array that can be given to `at::permute()`, which
+// will reverse a given permutation.
+// The reverse permutation array is created by swapping the indices and their
+// associated values from the given permutation array.
+static inline std::vector<int64_t> create_reverse_permutation(std::vector<int64_t> permutation) {
+  int64_t ndim = permutation.size();
+  std::vector<int64_t> reverse_permutation(ndim);
+  for (const auto dim_ind : c10::irange(ndim)) {
+    reverse_permutation[permutation[dim_ind]] = dim_ind;
+  }
+  return reverse_permutation;
+}
+
+// Compute R-work array size for MAGMA/LAPACK cgesdd/zgesdd
+// See https://github.com/Reference-LAPACK/lapack/blob/122506cd8b6ce050a200920c3d4c0b153b150fd8/SRC/cgesdd.f#L186
+static inline int64_t computeLRWorkDim(const char jobz, int64_t m, int64_t n) {
+  auto mn = std::min(m, n);
+  auto mx = std::max(m, n);
+  if (jobz == 'N') {
+#ifdef __APPLE__
+    // According to `vecLib.framework/Headers/clapack.h` Accelerate.framework is based on LAPACK 3.2.1
+    return 7 * mn;
+#else
+    // This setting is valid for LAPACK 3.6+
+    return 5 * mn;
+#endif
+  }
+  if (mx > 10 * mn) {
+    return 5 * mn * mn + 5 * mn;
+  }
+  return std::max(5 * mn * mn + 5 * mn, 2 * mx * mn + 2 * mn * mn + mn);
+}
+
+// This function checks whether the uplo argument input is valid
+// Allowed strings are "u", "U", "l", "L"
+static inline void checkUplo(const c10::string_view uplo) {
+  // To use std::toupper safely with plain chars (or signed chars), the argument should first be converted to unsigned char
+  char uplo_uppercase = static_cast<char>(std::toupper(static_cast<unsigned char>(uplo[0])));
+  TORCH_CHECK(uplo.size() == 1 && (uplo_uppercase == 'U' || uplo_uppercase == 'L'),
+    "Expected UPLO argument to be 'L' or 'U', but got ", uplo);
+}
+
+static inline void checkSameDevice(const std::string& fn_name, Tensor result, Tensor input, const std::string& result_name = "result") {
+  TORCH_CHECK(
+      result.device() == input.device(),
+      fn_name,
+      ": Expected ", result_name, " and input tensors to be on the same device, but got ",
+      result_name, " on ", result.device(), " and input on ", input.device());
+}
+
+// Check the dtype of result and input tensors (for _out variants).
+// Most linear algebra functions have the same dtype for input and output
+// (either floating or complex type input), so we can check whether input's dtype can be casted to result's dtype.
+// According to https://github.com/pytorch/pytorch/wiki/Developer-FAQ#how-does-out-work-in-pytorch
+// c10::canCast is used for checking the "safe copy" dtype requirements.
+static inline void checkLinalgCompatibleDtype(const std::string& fn_name, Tensor result, Tensor input, const std::string& result_name = "result") {
+  bool can_cast = c10::canCast(input.scalar_type(), result.scalar_type());
+  TORCH_CHECK(
+      can_cast,
+      fn_name,
+      ": Expected ", result_name, " to be safely castable from ", input.scalar_type(), " dtype, but got ",
+      result_name, " with dtype ", result.scalar_type());
+}
+
+// Alternatively, we can check whether the specific expected output type (result_type) can be safely casted to out tensor dtype (out_type)
+static inline void checkLinalgCompatibleDtype(const std::string& fn_name, ScalarType out_type, ScalarType result_type, const std::string& out_name = "result") {
+  bool can_cast = c10::canCast(result_type, out_type);
+  TORCH_CHECK(
+      can_cast,
+      fn_name,
+      ": Expected ", out_name, " to be safely castable from ", result_type, " dtype, but got ",
+      out_name, " with dtype ", out_type);
+}
+
+static inline void checkNotComplexTolerance(const Tensor& tol, const c10::string_view f_name, const c10::string_view tol_name) {
+  TORCH_CHECK(!at::isComplexType(tol.scalar_type()),
+              f_name, ": ", tol_name, " tensor of complex type is not supported. Got ", tol.scalar_type());
+}
+
+/*
+  Two types of 'other' tensors are supported when solving
+  a system of linear equations matmul(input, x) = other:
+  * 1-dimensional (1D) tensor or batch of 1D tensors (vector case)
+  * 2-dimensional (2D) tensor or batch of 2D tensors (matrix case).
+  The original torch.solve supported only the matrix case, while NumPy works for both cases.
+  For the batched input we need to be able to distinguish them.
+  Let input.shape = (batch_dimensions, m, n), then 'other' is of vector type if other.shape == (batch_dimensions, m).
+  This rule is compatible with NumPy, see https://github.com/numpy/numpy/blob/v1.20.0/numpy/linalg/linalg.py#L384-L389
+*/
+static inline bool linalg_solve_is_vector_rhs(const Tensor& input, const Tensor& other) {
+  auto expected_batched_rhs_shape = SymIntArrayRef(input.sym_sizes().data(), input.dim() - 1); // input.shape[:-1]
+  bool vector_case = other.dim() == 1 || (input.dim() - 1 == other.dim() && other.sym_sizes().equals(expected_batched_rhs_shape));
+  return vector_case;
+}
+
+/*
+  Computes linear indices for a tensor with original_shape to access its elements like it was a materialized broadcast tensor.
+*/
+static inline Tensor get_linear_indices(int64_t numel, IntArrayRef original_shape, IntArrayRef broadcast_shape) {
+  TensorOptions options = at::TensorOptions().dtype(at::kLong).device(at::kCPU);
+  return at::arange(numel, options).view(original_shape).broadcast_to(broadcast_shape).contiguous();
+}
+
+class BroadcastLinearIndices {
+ private:
+  Tensor linear_indices_;
+  bool is_broadcasting_;
+
+ public:
+  BroadcastLinearIndices(
+      int64_t numel,
+      IntArrayRef original_shape,
+      IntArrayRef broadcast_shape) : is_broadcasting_(!original_shape.equals(broadcast_shape)) {
+    // The assumption is that the broadcast_shape is a materialized broadcast
+    // shape of the original_shape. We need to compute the linear indices
+    // compatible with the original_shape to access the elements in the original
+    // tensor corresponding to the broadcast tensor.
+    if (is_broadcasting_) {
+      linear_indices_ =
+          get_linear_indices(numel, original_shape, broadcast_shape);
+    }
+  }
+  int64_t operator()(int64_t broadcast_linear_index) {
+    return is_broadcasting_
+        ? linear_indices_.data_ptr<int64_t>()[broadcast_linear_index]
+        : broadcast_linear_index;
+  }
+};
+
+static inline bool is_blas_compatible_column_major_order(const Tensor& input) {
+  IntArrayRef input_strides = input.strides();
+  IntArrayRef input_sizes = input.sizes();
+  auto ndim = input.dim();
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ndim >= 2);
+  if (ndim > 3) {
+    return input.transpose(-2, -1).is_contiguous();
+  }
+  auto leading_dimension = input_strides[ndim - 1];
+  auto rows = input_sizes[ndim - 2];
+  bool batch_stride_compatible = true;
+  if (ndim == 3) {
+    auto cols = input_sizes[ndim - 1];
+    batch_stride_compatible =
+        input_strides[ndim - 3] >= leading_dimension * cols;
+  }
+  return (input_strides[ndim - 2] == 1) &&
+      (leading_dimension >= std::max<int64_t>(1, rows)) &&
+      batch_stride_compatible;
+}
+
+static inline bool is_blas_compatible_row_major_order(const Tensor& input) {
+  IntArrayRef input_strides = input.strides();
+  IntArrayRef input_sizes = input.sizes();
+  auto ndim = input.dim();
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ndim >= 2);
+  if (ndim > 3) {
+    return input.is_contiguous();
+  }
+  auto leading_dimension = input_strides[ndim - 2];
+  auto cols = input_sizes[ndim - 1];
+  bool batch_stride_compatible = true;
+  if (ndim == 3) {
+    auto rows = input_sizes[ndim - 2];
+    batch_stride_compatible =
+        input_strides[ndim - 3] >= leading_dimension * rows;
+  }
+  return (input_strides[ndim - 1] == 1) &&
+      (leading_dimension >= std::max<int64_t>(1, cols)) &&
+      batch_stride_compatible;
+}
+
+}  // namespace at::native
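
A standalone sketch, assuming ATen is available (variable names are illustrative), showing which single-matrix layouts the two compatibility checks above accept.

#include <ATen/ATen.h>
#include <ATen/native/LinearAlgebraUtils.h>

void blas_layout_example() {
  auto row_major = at::rand({3, 4});                // strides (4, 1)
  auto col_major = row_major.t().contiguous().t();  // strides (1, 3)
  bool r = at::native::is_blas_compatible_row_major_order(row_major);     // true
  bool c = at::native::is_blas_compatible_column_major_order(col_major);  // true
  bool m = at::native::is_blas_compatible_column_major_order(row_major);  // false
  (void)r; (void)c; (void)m;
}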
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/LossMulti.h b/MLPY/Lib/site-packages/torch/include/ATen/native/LossMulti.h
new file mode 100644
index 0000000000000000000000000000000000000000..d0a338234427cbf278ee70a921b769c42ad590b2
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/LossMulti.h
@@ -0,0 +1,72 @@
+#pragma once
+#include 
+#include 
+#include 
+#include 
+
+namespace at::native {
+namespace {
+  static C10_UNUSED void multilabel_margin_loss_shape_check(
+    int64_t& nframe,
+    int64_t& dim,
+    const int64_t& ndims,
+    const Tensor& input,
+    const Tensor& target) {
+    TORCH_CHECK(
+        (ndims == 2 && input.size(1) != 0) || (ndims == 1 && input.size(0) != 0) || ndims == 0,
+        "Expected non-empty vector or matrix with optional 0-dim batch size, but got: ",
+        input.sizes());
+
+    if (ndims <= 1) {
+      nframe = 1;
+      dim = ndims == 0 ? 1 : input.size(0);
+      TORCH_CHECK(
+          target.dim() <= 1 && target.numel() == dim,
+          "inconsistent target size: ", target.sizes(), " for input of size: ",
+          input.sizes());
+    } else {
+      nframe = input.size(0);
+      dim = input.size(1);
+      TORCH_CHECK(
+          target.dim() == 2 && target.size(0) == nframe &&
+          target.size(1) == dim,
+          "inconsistent target size: ", target.sizes(), " for input of size: ",
+          input.sizes());
+    }
+  }
+
+  static C10_UNUSED void multi_margin_loss_shape_check(
+    int64_t& nframe,
+    int64_t& dim,
+    const int64_t& ndims,
+    const Tensor& input,
+    const Tensor& target,
+    const c10::optional<Tensor>& weight) {
+    TORCH_CHECK(
+        (ndims == 2 && input.size(1) != 0) || (ndims == 1 && input.size(0) != 0) || ndims == 0,
+        "Expected non-empty vector or matrix with optional 0-dim batch size, but got: ",
+        input.sizes());
+
+    if (ndims <= 1) {
+      nframe = 1;
+      dim = ndims == 0 ? 1 : input.size(0);
+    } else {
+      nframe = input.size(0);
+      dim = input.size(1);
+    }
+
+    TORCH_CHECK(
+        target.dim() <= 1 && target.numel() == nframe,
+        "inconsistent target size, expected ", nframe, " but got ",
+        target.sizes());
+    if (weight && weight->defined()) {
+      TORCH_CHECK(
+          weight->dim() <= 1 && weight->numel() == dim,
+          "inconsistent weight size, expected ", dim, " but got ",
+          weight->sizes());
+    }
+}
+
+
+}  // anonymous namespace
+} // namespace at::native
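
A usage sketch with made-up shapes: for an (N, C) input the check fills in nframe = N and dim = C and validates the target (and optional weight) against them.

#include <ATen/ATen.h>
#include <ATen/native/LossMulti.h>

void loss_shape_check_example() {
  auto input  = at::rand({8, 5});           // N = 8 samples, C = 5 classes
  auto target = at::zeros({8}, at::kLong);  // one class index per sample
  int64_t nframe = 0, dim = 0;
  at::native::multi_margin_loss_shape_check(
      nframe, dim, input.dim(), input, target, /*weight=*/c10::nullopt);
  // nframe == 8 and dim == 5 after the call.
}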
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/Math.h b/MLPY/Lib/site-packages/torch/include/ATen/native/Math.h
new file mode 100644
index 0000000000000000000000000000000000000000..3dc1f678c4546bf55cd6a2f48160e800c98e3fdc
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/Math.h
@@ -0,0 +1,3901 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+C10_CLANG_DIAGNOSTIC_PUSH()
+#if C10_CLANG_HAS_WARNING("-Wimplicit-float-conversion")
+C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-float-conversion")
+#endif
+
+/* The next function is taken from  https://github.com/antelopeusersgroup/antelope_contrib/blob/master/lib/location/libgenloc/erfinv.c.
+Below is the copyright.
+Output was modified to be inf or -inf when input is 1 or -1. */
+
+
+/*
+    Copyright (c) 2014 Indiana University
+    All rights reserved.
+
+    Written by Prof. Gary L. Pavlis, Dept. of Geol. Sci.,
+            Indiana University, Bloomington, IN
+
+    This software is licensed under the New BSD license:
+
+    Redistribution and use in source and binary forms,
+    with or without modification, are permitted provided
+    that the following conditions are met:
+
+    Redistributions of source code must retain the above
+    copyright notice, this list of conditions and the
+    following disclaimer.
+
+    Redistributions in binary form must reproduce the
+    above copyright notice, this list of conditions and
+    the following disclaimer in the documentation and/or
+    other materials provided with the distribution.
+
+    Neither the name of Indiana University nor
+    the names of its contributors may be used to endorse
+    or promote products derived from this software without
+    specific prior written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+    CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+    WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+    PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+    THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY
+    DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+    USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+    HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+    IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+    USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+*/
+
+namespace {
+/*
+ * This function is derived from the implementation of the i0e function in the
+ * Cephes Math Library. See note [3-Clause BSD License for the Cephes Math
+ * Library].
+ *
+ * Computes an approximation of the exponentially scaled zeroth order modified
+ * Bessel function of the first kind. The approximation is actually two
+ * (sub)approximations, both using a Chebyshev polynomial expansion. One
+ * approximates the function over [0, 8], and the other over (8, infinity). This
+ * function takes the absolute value of all inputs to convert them into the
+ * domain of the approximation.
+ */
+jiterator_also_stringify_as(jiterator_code(
+  template <typename T>
+  JITERATOR_HOST_DEVICE T chbevl(T x, const T array[], const int len) {
+    T b0, b1, b2;
+
+    b0 = array[0];
+    b1 = 0;
+
+    for (int i = 1; i < len; ++i) {
+      b2 = b1;
+      b1 = b0;
+      b0 = x * b1 - b2 + array[i];
+    }
+
+    return T{0.5} * (b0 - b2);
+  }
+
+  template <typename T>
+  JITERATOR_HOST_DEVICE T calc_i0e(T _x) {
+    T x = std::fabs(_x);
+
+    if (x <= T{8.0}) {
+      static const T coefficients[] = {
+          -4.41534164647933937950E-18, 3.33079451882223809783E-17,
+          -2.43127984654795469359E-16, 1.71539128555513303061E-15,
+          -1.16853328779934516808E-14, 7.67618549860493561688E-14,
+          -4.85644678311192946090E-13, 2.95505266312963983461E-12,
+          -1.72682629144155570723E-11, 9.67580903537323691224E-11,
+          -5.18979560163526290666E-10, 2.65982372468238665035E-9,
+          -1.30002500998624804212E-8,  6.04699502254191894932E-8,
+          -2.67079385394061173391E-7,  1.11738753912010371815E-6,
+          -4.41673835845875056359E-6,  1.64484480707288970893E-5,
+          -5.75419501008210370398E-5,  1.88502885095841655729E-4,
+          -5.76375574538582365885E-4,  1.63947561694133579842E-3,
+          -4.32430999505057594430E-3,  1.05464603945949983183E-2,
+          -2.37374148058994688156E-2,  4.93052842396707084878E-2,
+          -9.49010970480476444210E-2,  1.71620901522208775349E-1,
+          -3.04682672343198398683E-1,  6.76795274409476084995E-1};
+
+      T y = (x / T{2.0}) - T{2.0};
+      return chbevl(y, coefficients, int{30});
+    }
+
+    // x > 8
+    static const T coefficients[] = {
+        -7.23318048787475395456E-18, -4.83050448594418207126E-18,
+        4.46562142029675999901E-17,  3.46122286769746109310E-17,
+        -2.82762398051658348494E-16, -3.42548561967721913462E-16,
+        1.77256013305652638360E-15,  3.81168066935262242075E-15,
+        -9.55484669882830764870E-15, -4.15056934728722208663E-14,
+        1.54008621752140982691E-14,  3.85277838274214270114E-13,
+        7.18012445138366623367E-13,  -1.79417853150680611778E-12,
+        -1.32158118404477131188E-11, -3.14991652796324136454E-11,
+        1.18891471078464383424E-11,  4.94060238822496958910E-10,
+        3.39623202570838634515E-9,   2.26666899049817806459E-8,
+        2.04891858946906374183E-7,   2.89137052083475648297E-6,
+        6.88975834691682398426E-5,   3.36911647825569408990E-3,
+        8.04490411014108831608E-1};
+
+    return chbevl(T{32.0} / x - T{2.0}, coefficients, int{25}) / std::sqrt(x);
+  }),
+  i0e_string); // i0e_string
+}
+
+#define CENTRAL_RANGE 0.7
+
+template <typename T>
+static inline typename std::enable_if<std::is_floating_point<T>::value, T>::type
+calc_erfinv(T y) {
+/* Function to calculate inverse error function.  Rational approximation
+is used to generate an initial approximation, which is then improved to
+full accuracy by two steps of Newton's method.  Code is a direct
+translation of the erfinv m file in matlab version 2.0.
+Author:  Gary L. Pavlis, Indiana University
+Date:  February 1996
+*/
+  T x, z, num, dem; /*working variables */
+  /* coefficients in rational expansion */
+  T a[4] = {  T(0.886226899), T(-1.645349621),  T(0.914624893), T(-0.140543331) };
+  T b[4] = { T(-2.118377725),  T(1.442710462), T(-0.329097515),  T(0.012229801) };
+  T c[4] = { T(-1.970840454), T(-1.624906493),  T(3.429567803),  T(1.641345311) };
+  T d[2] = {  T(3.543889200),  T(1.637067800) };
+  T y_abs = std::abs(y);
+  if(y_abs > 1.0) return std::numeric_limits<T>::quiet_NaN();
+#ifdef _WIN32
+  // error C2039: '_copysign': is not a member of 'std'
+  if(y_abs == 1.0) return copysign(std::numeric_limits<T>::infinity(), y);
+#else
+  if(y_abs == 1.0) return std::copysign(std::numeric_limits<T>::infinity(), y);
+#endif
+  if(y_abs <= static_cast<T>(CENTRAL_RANGE)) {
+    z = y * y;
+    num = (((a[3]*z + a[2])*z + a[1])*z + a[0]);
+    dem = ((((b[3]*z + b[2])*z + b[1])*z +b[0]) * z + static_cast<T>(1.0));
+    x = y * num / dem;
+  }
+  else{
+    z = std::sqrt(-std::log((static_cast<T>(1.0)-y_abs)/static_cast<T>(2.0)));
+    num = ((c[3]*z + c[2])*z + c[1]) * z + c[0];
+    dem = (d[1]*z + d[0])*z + static_cast<T>(1.0);
+#ifdef _WIN32
+    // error C2039: '_copysign': is not a member of 'std'
+    x = copysign(num, y) / dem;
+#else
+    x = std::copysign(num, y) / dem;
+#endif
+  }
+  /* Two steps of Newton-Raphson correction */
+  x = x - (std::erf(x) - y) / ((static_cast<T>(2.0)/static_cast<T>(std::sqrt(c10::pi<double>)))*std::exp(-x*x));
+  x = x - (std::erf(x) - y) / ((static_cast<T>(2.0)/static_cast<T>(std::sqrt(c10::pi<double>)))*std::exp(-x*x));
+
+  return(x);
+}
+
+#undef CENTRAL_RANGE
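
A self-contained sketch of the Newton-Raphson correction used above (plain <cmath>; the initial guess stands in for the rational approximation): one step is x <- x - (erf(x) - y) / erf'(x), with erf'(x) = (2/sqrt(pi)) * exp(-x*x).

#include <cmath>
#include <cstdio>

int main() {
  const double pi = std::acos(-1.0);
  const double y = 0.3;   // target: erfinv(0.3) ~ 0.2724627
  double x = 0.27;        // rough initial guess
  for (int i = 0; i < 2; ++i) {
    x -= (std::erf(x) - y) / ((2.0 / std::sqrt(pi)) * std::exp(-x * x));
  }
  std::printf("x = %.7f, erf(x) = %.7f\n", x, std::erf(x));  // erf(x) ~ 0.3000000
  return 0;
}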
+
+/*
+ * Note [3-Clause BSD License for the Cephes Math Library]
+ * Code derived from implementations in the Cephes Math Library should mention its derivation and reference
+ * this note (ex. 'This function is derived from the implementation of X in the Cephes Math Library. See note
+ * [3-Clause BSD License for the Cephes Math Library]. The license is:
+ * Copyright (c) 2018, Steven Moshier
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Steven Moshier BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This function is derived from the implementation of the zeta function in the Cephes Math Library.
+ * See note [3-Clause BSD License for the Cephes Math Library].
+ */
+template <typename scalar_t, bool is_cuda=false>
+C10_HOST_DEVICE static inline scalar_t zeta(scalar_t x, scalar_t q) __ubsan_ignore_float_divide_by_zero__ {
+  using acc_t = at::acc_type<scalar_t, is_cuda>;
+  const acc_t MACHEP = acc_t{1.11022302462515654042E-16};
+  constexpr acc_t zero = acc_t{0.0};
+  constexpr acc_t half = acc_t{0.5};
+  constexpr acc_t one = acc_t{1.0};
+  static const acc_t A[] = {
+      12.0,
+      -720.0,
+      30240.0,
+      -1209600.0,
+      47900160.0,
+      -1.8924375803183791606e9, /*1.307674368e12/691*/
+      7.47242496e10,
+      -2.950130727918164224e12, /*1.067062284288e16/3617*/
+      1.1646782814350067249e14, /*5.109094217170944e18/43867*/
+      -4.5979787224074726105e15, /*8.028576626982912e20/174611*/
+      1.8152105401943546773e17, /*1.5511210043330985984e23/854513*/
+      -7.1661652561756670113e18 /*1.6938241367317436694528e27/236364091*/
+  };
+
+  int i = 0;
+  acc_t a, b, k, s, t, w;
+  if (x == one) {
+    return std::numeric_limits<scalar_t>::infinity();
+  }
+
+  if (x < one) {
+    return std::numeric_limits<scalar_t>::quiet_NaN();
+  }
+
+  if (q <= zero) {
+    if (q == std::floor(q)) {
+      return std::numeric_limits<scalar_t>::infinity();
+    }
+    if (x != std::floor(x)) {
+      return std::numeric_limits<scalar_t>::quiet_NaN();
+    }
+  }
+
+  s = std::pow(q, -x);
+  a = q;
+  i = 0;
+  b = zero;
+  while ((i < 9) || (a <= acc_t{9.0})) {
+    i += 1;
+    a += one;
+    b = ::pow(a, -x);
+    s += b;
+    if ((-MACHEP * s < b) && (b < MACHEP * s)) {
+      return static_cast<scalar_t>(s);
+    }
+  };
+
+  w = a;
+  s += b * w / (x - one);
+  s -= half * b;
+  a = one;
+  k = zero;
+  for (int i = 0; i < 12; i++) {
+    a *= x + k;
+    b /= w;
+    t = a * b / A[i];
+    s = s + t;
+    t = ::fabs(t / s);
+    if (t < MACHEP) {
+      return static_cast<scalar_t>(s);
+    }
+    k += one;
+    a *= x + k;
+    b /= w;
+    k += one;
+  }
+  return static_cast<scalar_t>(s);
+}
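
A quick numerical sanity sketch, assuming the ATen include tree is on the include path: the Hurwitz zeta above reduces to the Riemann zeta function for q = 1, so zeta(2, 1) should equal pi^2 / 6.

#include <ATen/native/Math.h>
#include <cstdio>

int main() {
  std::printf("%.9f\n", zeta(2.0, 1.0));  // ~1.644934067 == pi^2 / 6
  return 0;
}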
+
+/*
+ * This function is derived from the implementation of the digamma function in the Cephes Math Library.
+ * See note [3-Clause BSD License for the Cephes Math Library].
+ *
+ * Evaluates polynomial of degree N:
+ *
+ *                     2          N
+ * y  =  C  + C x + C x  +...+ C x
+ *        0    1     2          N
+ *
+ * Coefficients are stored in reverse order:
+ *
+ * coef[0] = C  , ..., coef[N] = C  .
+ *            N                   0
+ */
+template <typename T>
+C10_HOST_DEVICE static inline T polevl(const T x, const T A[], size_t len) {
+  T result = 0;
+  for (size_t i = 0; i <= len; i++) {
+    result = result * x + A[i];
+  }
+  return result;
+}
+
+static inline double trigamma(double x) __ubsan_ignore_float_divide_by_zero__ {
+  double sign = +1;
+  double result = 0;
+  if (x < 0.5) {
+    sign = -1;
+    const double sin_pi_x = sin(c10::pi<double> * x);
+    result -= (c10::pi<double> * c10::pi<double>) / (sin_pi_x * sin_pi_x);
+    x = 1 - x;
+  }
+  for (int i = 0; i < 6; ++i) {
+    result += 1 / (x * x);
+    x += 1;
+  }
+  const double ixx = 1 / (x*x);
+  result += (1 + 1 / (2*x) + ixx * (1./6 - ixx * (1./30 - ixx * (1./42)))) / x;
+  return sign * result;
+}
+
+static inline float trigamma(float x) __ubsan_ignore_float_divide_by_zero__ {
+  float sign = +1;
+  float result = 0;
+  if (x < 0.5f) {
+    sign = -1;
+    const float sin_pi_x = sinf(c10::pi<float> * x);
+    result -= (c10::pi<float> * c10::pi<float>) / (sin_pi_x * sin_pi_x);
+    x = 1 - x;
+  }
+  for (int i = 0; i < 6; ++i) {
+    result += 1 / (x * x);
+    x += 1;
+  }
+  const float ixx = 1 / (x*x);
+  result += (1 + 1 / (2*x) + ixx * (1.f/6 - ixx * (1.f/30 - ixx * (1.f/42)))) / x;
+  return sign * result;
+}
+
+/*
+ * This function is derived from the implementation of the digamma function in the Cephes Math Library.
+ * See note [3-Clause BSD License for the Cephes Math Library].
+ */
+static inline double calc_digamma(double x) {
+  // [C++ Standard Reference: Gamma Function] https://en.cppreference.com/w/cpp/numeric/math/tgamma
+  static double PSI_10 = 2.25175258906672110764;
+  if (x == 0) {
+    // As per C++ standard for gamma related functions and SciPy,
+    // If the argument is ±0, ±∞ is returned
+    return std::copysign(INFINITY, -x);
+  }
+
+  bool x_is_integer = x == trunc(x);
+  if (x < 0) {
+    if (x_is_integer) {
+      // As per C++ standard for gamma related functions and SciPy,
+      // If the argument is a negative integer, NaN is returned
+      return std::numeric_limits<double>::quiet_NaN();
+    }
+    // Extracts the fractional part of x as r, since tan(pi * r) is more numerically
+    // accurate than tan(pi * x). While these operations are mathematically equivalent
+    // since both x and r are in radians and tan() has a periodicity of pi, in practice
+    // the computation of pi * x is a source of error (when |x| > 1).
+    double q, r;
+    r = std::modf(x, &q);
+    return calc_digamma(1 - x) - c10::pi<double> / tan(c10::pi<double> * r);
+  }
+
+  // Push x to be >= 10
+  double result = 0;
+  while (x < 10) {
+    result -= 1 / x;
+    x += 1;
+  }
+  if (x == 10) {
+    return result + PSI_10;
+  }
+
+  // Compute asymptotic digamma
+  static const double A[] = {
+      8.33333333333333333333E-2,
+      -2.10927960927960927961E-2,
+      7.57575757575757575758E-3,
+      -4.16666666666666666667E-3,
+      3.96825396825396825397E-3,
+      -8.33333333333333333333E-3,
+      8.33333333333333333333E-2,
+  };
+
+  double y = 0;
+  if (x < 1.0e17) {
+    double z = 1.0 / (x * x);
+    y = z * polevl(z, A, 6);
+  }
+  return result + log(x) - (0.5 / x) - y;
+}
+
+/*
+ * This function is derived from the implementation of the digamma function in the Cephes Math Library.
+ * See note [3-Clause BSD License for the Cephes Math Library].
+ */
+static inline float calc_digamma(float x) {
+  // See [C++ Standard Reference: Gamma Function]
+  static float PSI_10 = 2.25175258906672110764f;
+  if (x == 0) {
+    // As per C++ standard for gamma related functions and SciPy,
+    // If the argument is ±0, ±∞ is returned
+    return std::copysign(INFINITY, -x);
+  }
+
+  bool x_is_integer = x == truncf(x);
+  if (x < 0) {
+    if (x_is_integer) {
+    // As per C++ standard for gamma related functions and SciPy,
+    // If the argument is a negative integer, NaN is returned
+      return std::numeric_limits<float>::quiet_NaN();
+    }
+    // Extracts the fractional part of x as r, since tan(pi * r) is more numerically
+    // accurate than tan(pi * x). While these operations are mathematically equivalent
+    // since both x and r are in radians and tan() has a periodicity of pi, in practice
+    // the computation of pi * x is a source of error (when |x| > 1).
+    double q, r;
+    r = std::modf(x, &q);
+    float pi_over_tan_pi_x = (float)(c10::pi<double> / tan(c10::pi<double> * r));
+    return calc_digamma(1 - x) - pi_over_tan_pi_x;
+  }
+
+  // Push x to be >= 10
+  float result = 0;
+  while (x < 10) {
+    result -= 1 / x;
+    x += 1;
+  }
+  if (x == 10) {
+    return result + PSI_10;
+  }
+
+  // Compute asymptotic digamma
+  static const float A[] = {
+      8.33333333333333333333E-2f,
+      -2.10927960927960927961E-2f,
+      7.57575757575757575758E-3f,
+      -4.16666666666666666667E-3f,
+      3.96825396825396825397E-3f,
+      -8.33333333333333333333E-3f,
+      8.33333333333333333333E-2f,
+  };
+
+  float y = 0;
+  if (x < 1.0e17f) {
+    float z = 1 / (x * x);
+    y = z * polevl(z, A, 6);
+  }
+  return result + logf(x) - (0.5f / x) - y;
+}
+
+static inline c10::BFloat16 calc_digamma(c10::BFloat16 a) {
+  return calc_digamma(static_cast<float>(a));
+}
+
+static inline c10::Half calc_digamma(c10::Half a) {
+  return calc_digamma(static_cast<float>(a));
+}
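
Two reference values for the double overload, assuming this header is included: they follow from digamma(1) = -EulerGamma and digamma(1/2) = -EulerGamma - 2 ln 2.

#include <ATen/native/Math.h>
#include <cstdio>

int main() {
  std::printf("%.7f\n", calc_digamma(1.0));  // ~ -0.5772157 (negative Euler-Mascheroni constant)
  std::printf("%.7f\n", calc_digamma(0.5));  // ~ -1.9635100 (= -EulerGamma - 2*ln 2)
  return 0;
}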
+
+template <typename scalar_t, bool is_cuda=false>
+static inline C10_HOST_DEVICE scalar_t calc_polygamma(scalar_t x, int n) {
+  // already blocked if n <= 1
+  const auto one = scalar_t{1};
+  return ((n % 2) ? one : -one) *
+      std::exp(std::lgamma(static_cast<scalar_t>(n) + one)) *
+      zeta<scalar_t, is_cuda>(static_cast<scalar_t>(n + 1), x);
+}
+
+// regularized lower incomplete gamma
+// the regularized lower, upper incomplete gamma, as well as their
+// helper functions follow SciPy's implementation
+
+/* References
+ * [igam1] "The Digital Library of Mathematical Functions", dlmf.nist.gov
+ * [igam2] Maddock et. al., "Incomplete Gamma Functions",
+ *     https://www.boost.org/doc/libs/1_61_0/libs/math/doc/html/math_toolkit/sf_gamma/igamma.html
+ */
+
+/*
+ * This implementation of the regularized incomplete gamma functions and
+ * their helper functions are derived from the implementation of SciPy's
+ * gammainc, Cephes's igam and igamc, and Boost's Lanczos approximations.
+ * See NOTICE for the licenses.
+ */
+template <typename scalar_t>
+static scalar_t ratevl(scalar_t x, const scalar_t num[], int64_t M,
+    const scalar_t denom[], int64_t N) {
+  // evaluating rational function, i.e., the ratio of two polynomials
+  // the coefficients for numerator are given by `num` while coeffs for
+  // denumerator are given by `denom`
+
+  int64_t i, dir;
+  scalar_t y, num_ans, denom_ans;
+  scalar_t absx = std::fabs(x);
+  const scalar_t *p;
+
+  if (absx > 1) {
+    /* Evaluate as a polynomial in 1/x. */
+    dir = -1;
+    p = num + M;
+    y = 1 / x;
+  }
+  else {
+    dir = 1;
+    p = num;
+    y = x;
+  }
+
+  /* Evaluate the numerator */
+  num_ans = *p;
+  p += dir;
+  for (i = 1; i <= M; i++) {
+    num_ans = num_ans * y + *p;
+    p += dir;
+  }
+  /* Evaluate the denominator */
+  if (absx > 1) {
+    p = denom + N;
+  }
+  else {
+    p = denom;
+  }
+
+  denom_ans = *p;
+  p += dir;
+  for (i = 1; i <= N; i++) {
+    denom_ans = denom_ans * y + *p;
+    p += dir;
+  }
+  if (absx > 1) {
+    i = N - M;
+    return std::pow(x, i) * num_ans / denom_ans;
+  }
+  else {
+    return num_ans / denom_ans;
+  }
+}
+
+// SciPy's lanczos implementation is taken from Boost
+/* (C) Copyright John Maddock 2006.
+ * Use, modification and distribution are subject to the
+ * Boost Software License, Version 1.0. See
+ * https://www.boost.org/LICENSE_1_0.txt or see NOTICE.
+ */
+template <typename scalar_t>
+static scalar_t lanczos_sum_expg_scaled(scalar_t x) {
+  // lanczos approximation
+  static const scalar_t lanczos_sum_expg_scaled_num[13] = {
+    0.006061842346248906525783753964555936883222,
+    0.5098416655656676188125178644804694509993,
+    19.51992788247617482847860966235652136208,
+    449.9445569063168119446858607650988409623,
+    6955.999602515376140356310115515198987526,
+    75999.29304014542649875303443598909137092,
+    601859.6171681098786670226533699352302507,
+    3481712.15498064590882071018964774556468,
+    14605578.08768506808414169982791359218571,
+    43338889.32467613834773723740590533316085,
+    86363131.28813859145546927288977868422342,
+    103794043.1163445451906271053616070238554,
+    56906521.91347156388090791033559122686859
+  };
+  static const scalar_t lanczos_sum_expg_scaled_denom[13] = {
+    1.,
+    66.,
+    1925.,
+    32670.,
+    357423.,
+    2637558.,
+    13339535.,
+    45995730.,
+    105258076.,
+    150917976.,
+    120543840.,
+    39916800.,
+    0.
+  };
+  return ratevl(x, lanczos_sum_expg_scaled_num,
+      sizeof(lanczos_sum_expg_scaled_num) / sizeof(lanczos_sum_expg_scaled_num[0]) - 1,
+      lanczos_sum_expg_scaled_denom,
+      sizeof(lanczos_sum_expg_scaled_denom) / sizeof(lanczos_sum_expg_scaled_denom[0]) - 1);
+}
+
+template <typename scalar_t>
+static scalar_t _igam_helper_fac(scalar_t a, scalar_t x) {
+  // compute x^a * exp(-a) / gamma(a)
+  // corrected from (15) and (16) in [igam2] by replacing exp(x - a) with
+  // exp(a - x).
+
+  scalar_t ax, fac, res, num, numfac;
+  static scalar_t MAXLOG = std::is_same<scalar_t, double>::value ?
+    7.09782712893383996843E2 : 88.72283905206835;
+  static scalar_t EXP1 = 2.718281828459045;
+  static scalar_t lanczos_g = 6.024680040776729583740234375;
+
+  if (std::fabs(a - x) > 0.4 * std::fabs(a)) {
+    ax = a * std::log(x) - x - std::lgamma(a);
+    if (ax < -MAXLOG) {
+      return 0.0;
+    }
+    return std::exp(ax);
+  }
+
+  fac = a + lanczos_g - 0.5;
+  res = std::sqrt(fac / EXP1) / lanczos_sum_expg_scaled(a);
+
+  if ((a < 200) && (x < 200)) {
+    res *= std::exp(a - x) * std::pow(x / fac, a);
+  }
+  else {
+    num = x - a - lanczos_g + 0.5;
+    numfac = num / fac;
+    res *= std::exp(a * (std::log1p(numfac) - numfac) + x * (0.5 - lanczos_g) / fac);
+  }
+  return res;
+}
+
+template <typename scalar_t>
+static scalar_t _igam_helper_series(scalar_t a, scalar_t x) {
+  // Compute igam using DLMF 8.11.4. [igam1]
+  static scalar_t MACHEP = std::is_same<scalar_t, double>::value ?
+    1.11022302462515654042E-16 : 5.9604644775390625E-8;
+  static int MAXITER = 2000;
+
+  int i;
+  scalar_t ans, ax, c, r;
+
+  ax = _igam_helper_fac(a, x);
+  if (ax == 0.0) {
+    return 0.0;
+  }
+
+  /* power series */
+  r = a;
+  c = 1.0;
+  ans = 1.0;
+
+  for (i = 0; i < MAXITER; i++) {
+    r += 1.0;
+    c *= x / r;
+    ans += c;
+    if (c <= MACHEP * ans) {
+      break;
+    }
+  }
+  return (ans * ax / a);
+}
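
A small check of the series helper, assuming this header is included: for a = 1 the regularized lower incomplete gamma is P(1, x) = 1 - exp(-x), which the power series should reproduce.

#include <ATen/native/Math.h>
#include <cmath>
#include <cstdio>

int main() {
  double series = _igam_helper_series(1.0, 0.5);
  double closed_form = 1.0 - std::exp(-0.5);        // P(1, 0.5)
  std::printf("%.7f %.7f\n", series, closed_form);  // both ~0.3934693
  return 0;
}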
+
+template <typename scalar_t>
+static scalar_t _igamc_helper_series(scalar_t a, scalar_t x) {
+  // Compute igamc using DLMF 8.7.3 [igam1]. This is related to the series in
+  // _igam_helper_series but extra care is taken to avoid cancellation.
+
+  int n;
+  scalar_t fac = 1;
+  scalar_t sum = 0;
+  scalar_t term, logx;
+  static scalar_t MAXITER = 2000;
+  static scalar_t MACHEP = std::is_same<scalar_t, double>::value ?
+    1.11022302462515654042E-16 : 5.9604644775390625E-8;
+
+  for (n = 1; n < MAXITER; n++) {
+    fac *= -x / n;
+    term = fac / (a + n);
+    sum += term;
+    if (std::fabs(term) <= MACHEP * std::fabs(sum)) {
+        break;
+    }
+  }
+
+  logx = std::log(x);
+  term = -std::expm1(a * logx - std::lgamma(1+a));
+  return term - std::exp(a * logx - std::lgamma(a)) * sum;
+}
+
+template <typename scalar_t>
+static scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t x, bool igam) {
+  // Compute igam/igamc using DLMF 8.12.3/8.12.4 [igam1]
+  static const scalar_t d[25][25] =
+    {{-3.3333333333333333e-1, 8.3333333333333333e-2, -1.4814814814814815e-2,
+      1.1574074074074074e-3, 3.527336860670194e-4, -1.7875514403292181e-4,
+      3.9192631785224378e-5, -2.1854485106799922e-6, -1.85406221071516e-6,
+      8.296711340953086e-7, -1.7665952736826079e-7, 6.7078535434014986e-9,
+      1.0261809784240308e-8, -4.3820360184533532e-9, 9.1476995822367902e-10,
+      -2.551419399494625e-11, -5.8307721325504251e-11, 2.4361948020667416e-11,
+      -5.0276692801141756e-12, 1.1004392031956135e-13, 3.3717632624009854e-13,
+      -1.3923887224181621e-13, 2.8534893807047443e-14, -5.1391118342425726e-16,
+      -1.9752288294349443e-15},
+    {-1.8518518518518519e-3, -3.4722222222222222e-3, 2.6455026455026455e-3,
+      -9.9022633744855967e-4, 2.0576131687242798e-4, -4.0187757201646091e-7,
+      -1.8098550334489978e-5, 7.6491609160811101e-6, -1.6120900894563446e-6,
+      4.6471278028074343e-9, 1.378633446915721e-7, -5.752545603517705e-8,
+      1.1951628599778147e-8, -1.7543241719747648e-11, -1.0091543710600413e-9,
+      4.1627929918425826e-10, -8.5639070264929806e-11, 6.0672151016047586e-14,
+      7.1624989648114854e-12, -2.9331866437714371e-12, 5.9966963656836887e-13,
+      -2.1671786527323314e-16, -4.9783399723692616e-14, 2.0291628823713425e-14,
+      -4.13125571381061e-15},
+    {4.1335978835978836e-3, -2.6813271604938272e-3, 7.7160493827160494e-4,
+      2.0093878600823045e-6, -1.0736653226365161e-4, 5.2923448829120125e-5,
+      -1.2760635188618728e-5, 3.4235787340961381e-8, 1.3721957309062933e-6,
+      -6.298992138380055e-7, 1.4280614206064242e-7, -2.0477098421990866e-10,
+      -1.4092529910867521e-8, 6.228974084922022e-9, -1.3670488396617113e-9,
+      9.4283561590146782e-13, 1.2872252400089318e-10, -5.5645956134363321e-11,
+      1.1975935546366981e-11, -4.1689782251838635e-15, -1.0940640427884594e-12,
+      4.6622399463901357e-13, -9.905105763906906e-14, 1.8931876768373515e-17,
+      8.8592218725911273e-15},
+    {6.4943415637860082e-4, 2.2947209362139918e-4, -4.6918949439525571e-4,
+      2.6772063206283885e-4, -7.5618016718839764e-5, -2.3965051138672967e-7,
+      1.1082654115347302e-5, -5.6749528269915966e-6, 1.4230900732435884e-6,
+      -2.7861080291528142e-11, -1.6958404091930277e-7, 8.0994649053880824e-8,
+      -1.9111168485973654e-8, 2.3928620439808118e-12, 2.0620131815488798e-9,
+      -9.4604966618551322e-10, 2.1541049775774908e-10, -1.388823336813903e-14,
+      -2.1894761681963939e-11, 9.7909989511716851e-12, -2.1782191880180962e-12,
+      6.2088195734079014e-17, 2.126978363279737e-13, -9.3446887915174333e-14,
+      2.0453671226782849e-14},
+    {-8.618882909167117e-4, 7.8403922172006663e-4, -2.9907248030319018e-4,
+      -1.4638452578843418e-6, 6.6414982154651222e-5, -3.9683650471794347e-5,
+      1.1375726970678419e-5, 2.5074972262375328e-10, -1.6954149536558306e-6,
+      8.9075075322053097e-7, -2.2929348340008049e-7, 2.956794137544049e-11,
+      2.8865829742708784e-8, -1.4189739437803219e-8, 3.4463580499464897e-9,
+      -2.3024517174528067e-13, -3.9409233028046405e-10, 1.8602338968504502e-10,
+      -4.356323005056618e-11, 1.2786001016296231e-15, 4.6792750266579195e-12,
+      -2.1492464706134829e-12, 4.9088156148096522e-13, -6.3385914848915603e-18,
+      -5.0453320690800944e-14},
+    {-3.3679855336635815e-4, -6.9728137583658578e-5, 2.7727532449593921e-4,
+      -1.9932570516188848e-4, 6.7977804779372078e-5, 1.419062920643967e-7,
+      -1.3594048189768693e-5, 8.0184702563342015e-6, -2.2914811765080952e-6,
+      -3.252473551298454e-10, 3.4652846491085265e-7, -1.8447187191171343e-7,
+      4.8240967037894181e-8, -1.7989466721743515e-14, -6.3061945000135234e-9,
+      3.1624176287745679e-9, -7.8409242536974293e-10, 5.1926791652540407e-15,
+      9.3589442423067836e-11, -4.5134262161632782e-11, 1.0799129993116827e-11,
+      -3.661886712685252e-17, -1.210902069055155e-12, 5.6807435849905643e-13,
+      -1.3249659916340829e-13},
+    {5.3130793646399222e-4, -5.9216643735369388e-4, 2.7087820967180448e-4,
+      7.9023532326603279e-7, -8.1539693675619688e-5, 5.6116827531062497e-5,
+      -1.8329116582843376e-5, -3.0796134506033048e-9, 3.4651553688036091e-6,
+      -2.0291327396058604e-6, 5.7887928631490037e-7, 2.338630673826657e-13,
+      -8.8286007463304835e-8, 4.7435958880408128e-8, -1.2545415020710382e-8,
+      8.6496488580102925e-14, 1.6846058979264063e-9, -8.5754928235775947e-10,
+      2.1598224929232125e-10, -7.6132305204761539e-16, -2.6639822008536144e-11,
+      1.3065700536611057e-11, -3.1799163902367977e-12, 4.7109761213674315e-18,
+      3.6902800842763467e-13},
+    {3.4436760689237767e-4, 5.1717909082605922e-5, -3.3493161081142236e-4,
+      2.812695154763237e-4, -1.0976582244684731e-4, -1.2741009095484485e-7,
+      2.7744451511563644e-5, -1.8263488805711333e-5, 5.7876949497350524e-6,
+      4.9387589339362704e-10, -1.0595367014026043e-6, 6.1667143761104075e-7,
+      -1.7562973359060462e-7, -1.2974473287015439e-12, 2.695423606288966e-8,
+      -1.4578352908731271e-8, 3.887645959386175e-9, -3.8810022510194121e-17,
+      -5.3279941738772867e-10, 2.7437977643314845e-10, -6.9957960920705679e-11,
+      2.5899863874868481e-17, 8.8566890996696381e-12, -4.403168815871311e-12,
+      1.0865561947091654e-12},
+    {-6.5262391859530942e-4, 8.3949872067208728e-4, -4.3829709854172101e-4,
+      -6.969091458420552e-7, 1.6644846642067548e-4, -1.2783517679769219e-4,
+      4.6299532636913043e-5, 4.5579098679227077e-9, -1.0595271125805195e-5,
+      6.7833429048651666e-6, -2.1075476666258804e-6, -1.7213731432817145e-11,
+      3.7735877416110979e-7, -2.1867506700122867e-7, 6.2202288040189269e-8,
+      6.5977038267330006e-16, -9.5903864974256858e-9, 5.2132144922808078e-9,
+      -1.3991589583935709e-9, 5.382058999060575e-16, 1.9484714275467745e-10,
+      -1.0127287556389682e-10, 2.6077347197254926e-11, -5.0904186999932993e-18,
+      -3.3721464474854592e-12},
+    {-5.9676129019274625e-4, -7.2048954160200106e-5, 6.7823088376673284e-4,
+      -6.4014752602627585e-4, 2.7750107634328704e-4, 1.8197008380465151e-7,
+      -8.4795071170685032e-5, 6.105192082501531e-5, -2.1073920183404862e-5,
+      -8.8585890141255994e-10, 4.5284535953805377e-6, -2.8427815022504408e-6,
+      8.7082341778646412e-7, 3.6886101871706965e-12, -1.5344695190702061e-7,
+      8.862466778790695e-8, -2.5184812301826817e-8, -1.0225912098215092e-14,
+      3.8969470758154777e-9, -2.1267304792235635e-9, 5.7370135528051385e-10,
+      -1.887749850169741e-19, -8.0931538694657866e-11, 4.2382723283449199e-11,
+      -1.1002224534207726e-11},
+    {1.3324454494800656e-3, -1.9144384985654775e-3, 1.1089369134596637e-3,
+      9.932404122642299e-7, -5.0874501293093199e-4, 4.2735056665392884e-4,
+      -1.6858853767910799e-4, -8.1301893922784998e-9, 4.5284402370562147e-5,
+      -3.127053674781734e-5, 1.044986828530338e-5, 4.8435226265680926e-11,
+      -2.1482565873456258e-6, 1.329369701097492e-6, -4.0295693092101029e-7,
+      -1.7567877666323291e-13, 7.0145043163668257e-8, -4.040787734999483e-8,
+      1.1474026743371963e-8, 3.9642746853563325e-18, -1.7804938269892714e-9,
+      9.7480262548731646e-10, -2.6405338676507616e-10, 5.794875163403742e-18,
+      3.7647749553543836e-11},
+    {1.579727660730835e-3, 1.6251626278391582e-4, -2.0633421035543276e-3,
+      2.1389686185689098e-3, -1.0108559391263003e-3, -3.9912705529919201e-7,
+      3.6235025084764691e-4, -2.8143901463712154e-4, 1.0449513336495887e-4,
+      2.1211418491830297e-9, -2.5779417251947842e-5, 1.7281818956040463e-5,
+      -5.6413773872904282e-6, -1.1024320105776174e-11, 1.1223224418895175e-6,
+      -6.8693396379526735e-7, 2.0653236975414887e-7, 4.6714772409838506e-14,
+      -3.5609886164949055e-8, 2.0470855345905963e-8, -5.8091738633283358e-9,
+      -1.332821287582869e-16, 9.0354604391335133e-10, -4.9598782517330834e-10,
+      1.3481607129399749e-10},
+    {-4.0725121195140166e-3, 6.4033628338080698e-3, -4.0410161081676618e-3,
+      -2.183732802866233e-6, 2.1740441801254639e-3, -1.9700440518418892e-3,
+      8.3595469747962458e-4, 1.9445447567109655e-8, -2.5779387120421696e-4,
+      1.9009987368139304e-4, -6.7696499937438965e-5, -1.4440629666426572e-10,
+      1.5712512518742269e-5, -1.0304008744776893e-5, 3.304517767401387e-6,
+      7.9829760242325709e-13, -6.4097794149313004e-7, 3.8894624761300056e-7,
+      -1.1618347644948869e-7, -2.816808630596451e-15, 1.9878012911297093e-8,
+      -1.1407719956357511e-8, 3.2355857064185555e-9, 4.1759468293455945e-20,
+      -5.0423112718105824e-10},
+    {-5.9475779383993003e-3, -5.4016476789260452e-4, 8.7910413550767898e-3,
+      -9.8576315587856125e-3, 5.0134695031021538e-3, 1.2807521786221875e-6,
+      -2.0626019342754683e-3, 1.7109128573523058e-3, -6.7695312714133799e-4,
+      -6.9011545676562133e-9, 1.8855128143995902e-4, -1.3395215663491969e-4,
+      4.6263183033528039e-5, 4.0034230613321351e-11, -1.0255652921494033e-5,
+      6.612086372797651e-6, -2.0913022027253008e-6, -2.0951775649603837e-13,
+      3.9756029041993247e-7, -2.3956211978815887e-7, 7.1182883382145864e-8,
+      8.925574873053455e-16, -1.2101547235064676e-8, 6.9350618248334386e-9,
+      -1.9661464453856102e-9},
+    {1.7402027787522711e-2, -2.9527880945699121e-2, 2.0045875571402799e-2,
+      7.0289515966903407e-6, -1.2375421071343148e-2, 1.1976293444235254e-2,
+      -5.4156038466518525e-3, -6.3290893396418616e-8, 1.8855118129005065e-3,
+      -1.473473274825001e-3, 5.5515810097708387e-4, 5.2406834412550662e-10,
+      -1.4357913535784836e-4, 9.9181293224943297e-5, -3.3460834749478311e-5,
+      -3.5755837291098993e-12, 7.1560851960630076e-6, -4.5516802628155526e-6,
+      1.4236576649271475e-6, 1.8803149082089664e-14, -2.6623403898929211e-7,
+      1.5950642189595716e-7, -4.7187514673841102e-8, -6.5107872958755177e-17,
+      7.9795091026746235e-9},
+    {3.0249124160905891e-2, 2.4817436002649977e-3, -4.9939134373457022e-2,
+      5.9915643009307869e-2, -3.2483207601623391e-2, -5.7212968652103441e-6,
+      1.5085251778569354e-2, -1.3261324005088445e-2, 5.5515262632426148e-3,
+      3.0263182257030016e-8, -1.7229548406756723e-3, 1.2893570099929637e-3,
+      -4.6845138348319876e-4, -1.830259937893045e-10, 1.1449739014822654e-4,
+      -7.7378565221244477e-5, 2.5625836246985201e-5, 1.0766165333192814e-12,
+      -5.3246809282422621e-6, 3.349634863064464e-6, -1.0381253128684018e-6,
+      -5.608909920621128e-15, 1.9150821930676591e-7, -1.1418365800203486e-7,
+      3.3654425209171788e-8},
+    {-9.9051020880159045e-2, 1.7954011706123486e-1, -1.2989606383463778e-1,
+      -3.1478872752284357e-5, 9.0510635276848131e-2, -9.2828824411184397e-2,
+      4.4412112839877808e-2, 2.7779236316835888e-7, -1.7229543805449697e-2,
+      1.4182925050891573e-2, -5.6214161633747336e-3, -2.39598509186381e-9,
+      1.6029634366079908e-3, -1.1606784674435773e-3, 4.1001337768153873e-4,
+      1.8365800754090661e-11, -9.5844256563655903e-5, 6.3643062337764708e-5,
+      -2.076250624489065e-5, -1.1806020912804483e-13, 4.2131808239120649e-6,
+      -2.6262241337012467e-6, 8.0770620494930662e-7, 6.0125912123632725e-16,
+      -1.4729737374018841e-7},
+    {-1.9994542198219728e-1, -1.5056113040026424e-2, 3.6470239469348489e-1,
+      -4.6435192311733545e-1, 2.6640934719197893e-1, 3.4038266027147191e-5,
+      -1.3784338709329624e-1, 1.276467178337056e-1, -5.6213828755200985e-2,
+      -1.753150885483011e-7, 1.9235592956768113e-2, -1.5088821281095315e-2,
+      5.7401854451350123e-3, 1.0622382710310225e-9, -1.5335082692563998e-3,
+      1.0819320643228214e-3, -3.7372510193945659e-4, -6.6170909729031985e-12,
+      8.4263617380909628e-5, -5.5150706827483479e-5, 1.7769536448348069e-5,
+      3.8827923210205533e-14, -3.53513697488768e-6, 2.1865832130045269e-6,
+      -6.6812849447625594e-7},
+    {7.2438608504029431e-1, -1.3918010932653375, 1.0654143352413968,
+      1.876173868950258e-4, -8.2705501176152696e-1, 8.9352433347828414e-1,
+      -4.4971003995291339e-1, -1.6107401567546652e-6, 1.9235590165271091e-1,
+      -1.6597702160042609e-1, 6.8882222681814333e-2, 1.3910091724608687e-8,
+      -2.146911561508663e-2, 1.6228980898865892e-2, -5.9796016172584256e-3,
+      -1.1287469112826745e-10, 1.5167451119784857e-3, -1.0478634293553899e-3,
+      3.5539072889126421e-4, 8.1704322111801517e-13, -7.7773013442452395e-5,
+      5.0291413897007722e-5, -1.6035083867000518e-5, 1.2469354315487605e-14,
+      3.1369106244517615e-6},
+    {1.6668949727276811, 1.165462765994632e-1, -3.3288393225018906,
+      4.4692325482864037, -2.6977693045875807, -2.600667859891061e-4,
+      1.5389017615694539, -1.4937962361134612, 6.8881964633233148e-1,
+      1.3077482004552385e-6, -2.5762963325596288e-1, 2.1097676102125449e-1,
+      -8.3714408359219882e-2, -7.7920428881354753e-9, 2.4267923064833599e-2,
+      -1.7813678334552311e-2, 6.3970330388900056e-3, 4.9430807090480523e-11,
+      -1.5554602758465635e-3, 1.0561196919903214e-3, -3.5277184460472902e-4,
+      9.3002334645022459e-14, 7.5285855026557172e-5, -4.8186515569156351e-5,
+      1.5227271505597605e-5},
+    {-6.6188298861372935, 1.3397985455142589e+1, -1.0789350606845146e+1,
+      -1.4352254537875018e-3, 9.2333694596189809, -1.0456552819547769e+1,
+      5.5105526029033471, 1.2024439690716742e-5, -2.5762961164755816,
+      2.3207442745387179, -1.0045728797216284, -1.0207833290021914e-7,
+      3.3975092171169466e-1, -2.6720517450757468e-1, 1.0235252851562706e-1,
+      8.4329730484871625e-10, -2.7998284958442595e-2, 2.0066274144976813e-2,
+      -7.0554368915086242e-3, 1.9402238183698188e-12, 1.6562888105449611e-3,
+      -1.1082898580743683e-3, 3.654545161310169e-4, -5.1290032026971794e-11,
+      -7.6340103696869031e-5},
+    {-1.7112706061976095e+1, -1.1208044642899116, 3.7131966511885444e+1,
+      -5.2298271025348962e+1, 3.3058589696624618e+1, 2.4791298976200222e-3,
+      -2.061089403411526e+1, 2.088672775145582e+1, -1.0045703956517752e+1,
+      -1.2238783449063012e-5, 4.0770134274221141, -3.473667358470195,
+      1.4329352617312006, 7.1359914411879712e-8, -4.4797257159115612e-1,
+      3.4112666080644461e-1, -1.2699786326594923e-1, -2.8953677269081528e-10,
+      3.3125776278259863e-2, -2.3274087021036101e-2, 8.0399993503648882e-3,
+      -1.177805216235265e-9, -1.8321624891071668e-3, 1.2108282933588665e-3,
+      -3.9479941246822517e-4},
+    {7.389033153567425e+1, -1.5680141270402273e+2, 1.322177542759164e+2,
+      1.3692876877324546e-2, -1.2366496885920151e+2, 1.4620689391062729e+2,
+      -8.0365587724865346e+1, -1.1259851148881298e-4, 4.0770132196179938e+1,
+      -3.8210340013273034e+1, 1.719522294277362e+1, 9.3519707955168356e-7,
+      -6.2716159907747034, 5.1168999071852637, -2.0319658112299095,
+      -4.9507215582761543e-9, 5.9626397294332597e-1, -4.4220765337238094e-1,
+      1.6079998700166273e-1, -2.4733786203223402e-8, -4.0307574759979762e-2,
+      2.7849050747097869e-2, -9.4751858992054221e-3, 6.419922235909132e-6,
+      2.1250180774699461e-3},
+    {2.1216837098382522e+2, 1.3107863022633868e+1, -4.9698285932871748e+2,
+      7.3121595266969204e+2, -4.8213821720890847e+2, -2.8817248692894889e-2,
+      3.2616720302947102e+2, -3.4389340280087117e+2, 1.7195193870816232e+2,
+      1.4038077378096158e-4, -7.52594195897599e+1, 6.651969984520934e+1,
+      -2.8447519748152462e+1, -7.613702615875391e-7, 9.5402237105304373,
+      -7.5175301113311376, 2.8943997568871961, -4.6612194999538201e-7,
+      -8.0615149598794088e-1, 5.8483006570631029e-1, -2.0845408972964956e-1,
+      1.4765818959305817e-4, 5.1000433863753019e-2, -3.3066252141883665e-2,
+      1.5109265210467774e-2},
+    {-9.8959643098322368e+2, 2.1925555360905233e+3, -1.9283586782723356e+3,
+      -1.5925738122215253e-1, 1.9569985945919857e+3, -2.4072514765081556e+3,
+      1.3756149959336496e+3, 1.2920735237496668e-3, -7.525941715948055e+2,
+      7.3171668742208716e+2, -3.4137023466220065e+2, -9.9857390260608043e-6,
+      1.3356313181291573e+2, -1.1276295161252794e+2, 4.6310396098204458e+1,
+      -7.9237387133614756e-6, -1.4510726927018646e+1, 1.1111771248100563e+1,
+      -4.1690817945270892, 3.1008219800117808e-3, 1.1220095449981468,
+      -7.6052379926149916e-1, 3.6262236505085254e-1, 2.216867741940747e-1,
+      4.8683443692930507e-1}};
+
+  int k, n, sgn;
+  int maxpow = 0;
+  static scalar_t MACHEP = std::is_same<scalar_t, double>::value ?
+    1.11022302462515654042E-16 : 5.9604644775390625E-8;
+  scalar_t lambda = x / a;
+  scalar_t sigma = (x - a) / a;
+  scalar_t eta, res, ck, ckterm, term, absterm;
+  scalar_t absoldterm = INFINITY;
+  scalar_t etapow[25] = {1};
+  scalar_t sum = 0;
+  scalar_t afac = 1;
+
+  if (igam) {
+    sgn = -1;
+  }
+  else {
+    sgn = 1;
+  }
+
+  if (lambda > 1) {
+    eta = std::sqrt(-2 * (std::log1p(sigma) - sigma));
+  }
+  else if (lambda < 1) {
+    eta = -std::sqrt(-2 * (std::log1p(sigma) - sigma));
+  }
+  else {
+    eta = 0;
+  }
+  res = 0.5 * std::erfc(sgn * eta * std::sqrt(a / 2));
+
+  for (k = 0; k < 25; k++) {
+    ck = d[k][0];
+    for (n = 1; n < 25; n++) {
+      if (n > maxpow) {
+        etapow[n] = eta * etapow[n-1];
+        maxpow += 1;
+      }
+      ckterm = d[k][n]*etapow[n];
+      ck += ckterm;
+      if (std::fabs(ckterm) < MACHEP * std::fabs(ck)) {
+        break;
+      }
+    }
+    term = ck * afac;
+    absterm = std::fabs(term);
+    if (absterm > absoldterm) {
+      break;
+    }
+    sum += term;
+    if (absterm < MACHEP * std::fabs(sum)) {
+      break;
+    }
+    absoldterm = absterm;
+    afac /= a;
+  }
+  res += sgn * std::exp(-0.5 * a * eta * eta) * sum / std::sqrt(2 * c10::pi<double> * a);
+
+  return res;
+}
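+// Editorial note: the `igam` flag selects the tail being approximated.
+// igam == true yields the regularized lower incomplete gamma P(a, x),
+// igam == false the upper tail Q(a, x); the flag only flips the sign `sgn`
+// used in the leading erfc term and in the correction sum above.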
+
+template <typename scalar_t>
+static scalar_t _igamc_helper_continued_fraction(scalar_t a, scalar_t x) {
+  // Compute igamc using DLMF 8.9.2. [igam1]
+  int i;
+  scalar_t ans, ax, c, yc, r, t, y, z;
+  scalar_t pk, pkm1, pkm2, qk, qkm1, qkm2;
+  int MAXITER = 2000;
+  static scalar_t MACHEP = std::is_same<scalar_t, double>::value ?
+    1.11022302462515654042E-16 : 5.9604644775390625E-8;
+  static scalar_t BIG = std::is_same<scalar_t, double>::value ?
+    4.503599627370496e15 : 16777216.;
+  static scalar_t BIGINV = std::is_same<scalar_t, double>::value ?
+    2.22044604925031308085e-16 : 5.9604644775390625E-8;
+
+  ax = _igam_helper_fac(a, x);
+  if (ax == 0.0) {
+    return 0.0;
+  }
+
+  /* continued fraction */
+  y = 1.0 - a;
+  z = x + y + 1.0;
+  c = 0.0;
+  pkm2 = 1.0;
+  qkm2 = x;
+  pkm1 = x + 1.0;
+  qkm1 = z * x;
+  ans = pkm1 / qkm1;
+
+  for (i = 0; i < MAXITER; i++) {
+    c += 1.0;
+    y += 1.0;
+    z += 2.0;
+    yc = y * c;
+    pk = pkm1 * z - pkm2 * yc;
+    qk = qkm1 * z - qkm2 * yc;
+    if (qk != 0) {
+      r = pk / qk;
+      t = std::fabs((ans - r) / r);
+      ans = r;
+    }
+    else {
+      t = 1.0;
+    }
+    pkm2 = pkm1;
+    pkm1 = pk;
+    qkm2 = qkm1;
+    qkm1 = qk;
+    if (std::fabs(pk) > BIG) {
+      pkm2 *= BIGINV;
+      pkm1 *= BIGINV;
+      qkm2 *= BIGINV;
+      qkm1 *= BIGINV;
+    }
+    if (t <= MACHEP) {
+      break;
+    }
+  }
+  return ans * ax;
+}
+
+template <typename scalar_t>
+static inline scalar_t calc_igammac(scalar_t a, scalar_t x) {
+  /* the calculation of the regularized upper incomplete gamma function
+   * is done differently based on the values of a and x:
+   * - if x and/or a is at the boundary of defined region, then assign the
+   *   result at the boundary
+   * - if a is large and a ~ x, then using Uniform Asymptotic Expansions for
+   *   Large Parameter (see DLMF 8.12.4 [igam1])
+   * - if x > 1.1 and x < a, using the subtraction from the regularized lower
+   *   incomplete gamma
+   * - otherwise, calculate the series from [igam2] eq (5)
+   */
+  scalar_t absxma_a;
+
+  static scalar_t SMALL = 20.0;
+  static scalar_t LARGE = 200.0;
+  static scalar_t SMALLRATIO = 0.3;
+  static scalar_t LARGERATIO = 4.5;
+
+  // note that in SciPy, a and x are non-negative, with exclusive 0s (i.e.,
+  // at most 1 of them can be 0), where igammac(0, x) = 0.0 iff x > 0.
+  if ((x < 0) || (a < 0)) {
+    // out of defined-region of the function
+    return std::numeric_limits<scalar_t>::quiet_NaN();
+  }
+  else if (a == 0) {
+    if (x > 0) {
+      return 0.0;
+    }
+    else {
+      return std::numeric_limits<scalar_t>::quiet_NaN();
+    }
+  }
+  else if (x == 0) {
+    return 1.0;
+  }
+  else if (std::isinf(a)) {
+    if (std::isinf(x)) {
+      return std::numeric_limits<scalar_t>::quiet_NaN();
+    }
+    return 1.0;
+  }
+  else if (std::isinf(x)) {
+    return 0.0;
+  }
+
+  absxma_a = std::fabs(x - a) / a;
+  if ((a > SMALL) && (a < LARGE) && (absxma_a < SMALLRATIO)) {
+     return _igam_helper_asymptotic_series(a, x, 0);
+  }
+  else if ((a > LARGE) && (absxma_a < LARGERATIO / std::sqrt(a))) {
+     return _igam_helper_asymptotic_series(a, x, 0);
+  }
+
+  if (x > 1.1) {
+    if (x < a) {
+      return 1.0 - _igam_helper_series(a, x);
+    }
+    else {
+      return _igamc_helper_continued_fraction(a, x);
+    }
+  }
+  else if (x <= 0.5) {
+    if (-0.4 / std::log(x) < a) {
+      return 1.0 - _igam_helper_series(a, x);
+    }
+    else {
+      return _igamc_helper_series(a, x);
+    }
+  }
+  else {
+    if (x * 1.1 < a) {
+      return 1.0 - _igam_helper_series(a, x);
+    }
+    else {
+      return _igamc_helper_series(a, x);
+    }
+  }
+}
+
+template <typename scalar_t>
+static inline scalar_t calc_igamma(scalar_t a, scalar_t x) {
+  /* the calculation of the regularized lower incomplete gamma function
+   * is done differently based on the values of a and x:
+   * - if x and/or a is at the boundary of defined region, then assign the
+   *   result at the boundary
+   * - if a is large and a ~ x, then using Uniform Asymptotic Expansions for
+   *   Large Parameter (see DLMF 8.12.3 [igam1])
+   * - if x > 1 and x > a, using the subtraction from the regularized upper
+   *   incomplete gamma
+   * - otherwise, calculate the series from [igam2] eq (4)
+   */
+  scalar_t absxma_a;
+  static scalar_t SMALL = 20.0;
+  static scalar_t LARGE = 200.0;
+  static scalar_t SMALLRATIO = 0.3;
+  static scalar_t LARGERATIO = 4.5;
+
+  // boundary values following SciPy
+  // note that in SciPy, a and x are non-negative, with exclusive 0s (i.e.,
+  // at most 1 of them can be 0), where igamma(0, x) = 1.0 iff x > 0.
+  if ((x < 0) || (a < 0)) {
+    // out of defined-region of the function
+    return std::numeric_limits<scalar_t>::quiet_NaN();
+  }
+  else if (a == 0) {
+    if (x > 0) {
+      return 1.0;
+    }
+    else {
+      return std::numeric_limits<scalar_t>::quiet_NaN();
+    }
+  }
+  else if (x == 0) {
+    return 0.0; // zero integration limit
+  }
+  else if (std::isinf(a)) {
+    if (std::isinf(x)) {
+      return std::numeric_limits<scalar_t>::quiet_NaN();
+    }
+    return 0.0;
+  }
+  else if (std::isinf(x)) {
+    return 1.0;
+  }
+
+  /* Asymptotic regime where a ~ x. See [igam2] */
+  absxma_a = std::fabs(x - a) / a;
+  if ((a > SMALL) && (a < LARGE) && (absxma_a < SMALLRATIO)) {
+    return _igam_helper_asymptotic_series(a, x, 1);
+  }
+  else if ((a > LARGE) && (absxma_a < LARGERATIO / std::sqrt(a))) {
+    return _igam_helper_asymptotic_series(a, x, 1);
+  }
+
+  if ((x > 1.0) && (x > a)) {
+    return 1.0 - calc_igammac(a, x);
+  }
+
+  return _igam_helper_series(a, x);
+}
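+// Illustrative usage (editorial note, not part of the upstream sources): for
+// finite a > 0 and x >= 0 the two regularized functions are complementary,
+// i.e. calc_igamma(a, x) + calc_igammac(a, x) == 1 up to rounding. For
+// example, P(1, x) = 1 - exp(-x), so calc_igamma(1.0, 1.0) is roughly 0.6321.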
+
+template <>
+C10_UNUSED c10::BFloat16 calc_igamma<c10::BFloat16>(c10::BFloat16 a, c10::BFloat16 x) {
+  return calc_igamma<float>(float(a), float(x));
+}
+
+template <>
+C10_UNUSED c10::Half calc_igamma<c10::Half>(c10::Half a, c10::Half x) {
+  return calc_igamma<float>(float(a), float(x));
+}
+
+template <>
+C10_UNUSED c10::BFloat16 calc_igammac<c10::BFloat16>(c10::BFloat16 a, c10::BFloat16 x) {
+  return calc_igammac<float>(float(a), float(x));
+}
+
+template <>
+C10_UNUSED c10::Half calc_igammac<c10::Half>(c10::Half a, c10::Half x) {
+  return calc_igammac<float>(float(a), float(x));
+}
+
+inline c10::BFloat16 calc_erfinv(c10::BFloat16 a) { return calc_erfinv(float(a)); }
+
+template <typename T>
+static T abs_impl(T v) {
+  return std::abs(v);
+}
+
+template <>
+C10_UNUSED uint8_t abs_impl(uint8_t v) {
+  return v;
+}
+
+template <typename T>
+static inline typename std::enable_if<std::is_integral<T>::value, T>::type
+calc_gcd(T a, T b) {
+  a = abs_impl(a);
+  b = abs_impl(b);
+  while (a != 0) {
+    T c = a;
+    a = b % a;
+    b = c;
+  }
+  return b;
+}
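+// calc_gcd is the classic Euclidean algorithm by repeated remainder.
+// Illustrative values (editorial note, not from the upstream header):
+// calc_gcd<int64_t>(12, 18) == 6 and calc_gcd<int64_t>(0, b) == b for
+// non-negative b.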
+
+template <typename T>
+C10_HOST_DEVICE T exp2_impl(T x) {
+  return std::exp2(x);
+}
+
+template <typename T>
+C10_HOST_DEVICE c10::complex<T> exp2_impl(c10::complex<T> x) {
+  // There is no std::exp2 overload for complex, so instead
+  // use the identity 2^x = e^(ln(2) * x)
+  constexpr auto ln2 = c10::ln_2<T>;
+  return std::exp(ln2 * x);
+}
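+// Sanity check of the identity above (illustrative only): for a purely real
+// argument, exp2_impl(c10::complex<double>(3.0, 0.0)) equals exp(3 * ln(2)) = 8
+// up to rounding, matching std::exp2(3.0) from the real overload.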
+
+/*
+ * This function is derived from the implementation of the chbevl function in the Cephes Math Library.
+ * See note [3-Clause BSD License for the Cephes Math Library].
+ *
+ * Evaluates the series
+ *
+ *       len-1
+ *         - '
+ *  y  =   >   array[i] T (x/2)
+ *         -             i
+ *        i=0
+ *
+ * of Chebyshev polynomials Ti at argument x/2.
+ *
+ * Coefficients are stored in reverse order, i.e. the zero order term is last in the array.  Note len is the number of
+ * coefficients, not the order.
+ *
+ * If coefficients are for the interval a to b, x must have been transformed to x -> 2(2x - b - a)/(b-a) before
+ * entering the routine.  This maps x from (a, b) to (-1, 1), over which the Chebyshev polynomials are defined.
+ *
+ * If the coefficients are for the inverted interval, in which (a, b) is mapped to (1/b, 1/a), the transformation
+ * required is x -> 2(2ab/x - b - a)/(b-a).  If b is infinity, this becomes x -> 4a/x - 1.
+ */
+template <typename T>
+static inline typename std::enable_if<std::is_floating_point<T>::value, T>::type
+chbevl(const T x, const T array[], size_t len) {
+  T b0, b1, b2;
+
+  b0 = array[0];
+  b1 = static_cast<T>(0.0);
+
+  for (size_t i = 1; i < len; ++i) {
+    b2 = b1;
+    b1 = b0;
+    b0 = x * b1 - b2 + array[i];
+  }
+
+  return (static_cast<T>(0.5) * (b0 - b2));
+}
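+// chbevl is the Clenshaw-style evaluator shared by the Bessel approximations
+// below: e.g. calc_i0 calls chbevl(x/2 - 2, A, 30) on [0, 8] and
+// chbevl(32/x - 2, B, 25) on (8, inf), matching the interval mappings
+// described in the comment above.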
+
+/*
+ * This function is derived from the implementation of the i0 function in the Cephes Math Library.
+ * See note [3-Clause BSD License for the Cephes Math Library].
+ *
+ * Computes an approximation of the zeroth order modified Bessel function of the first kind.
+ * The approximation is actually two (sub)approximations, both using a Chebyshev polynomial expansion.
+ * One approximates the function over [0, 8], and the other over (8, infinity). This function takes the absolute value
+ * of all inputs to convert them into the domain of the approximation.
+ */
+template <typename T>
+static inline std::tuple<const T*, size_t> chebyshev_coefficients_i0e_A() {
+  /* Chebyshev coefficients for exp(-x) I0(x)
+   * in the interval [0,8].
+   *
+   * lim(x->0){ exp(-x) I0(x) } = 1.
+   */
+  static const T coeff[] = {
+      -4.41534164647933937950E-18, 3.33079451882223809783E-17,
+      -2.43127984654795469359E-16, 1.71539128555513303061E-15,
+      -1.16853328779934516808E-14, 7.67618549860493561688E-14,
+      -4.85644678311192946090E-13, 2.95505266312963983461E-12,
+      -1.72682629144155570723E-11, 9.67580903537323691224E-11,
+      -5.18979560163526290666E-10, 2.65982372468238665035E-9,
+      -1.30002500998624804212E-8,  6.04699502254191894932E-8,
+      -2.67079385394061173391E-7,  1.11738753912010371815E-6,
+      -4.41673835845875056359E-6,  1.64484480707288970893E-5,
+      -5.75419501008210370398E-5,  1.88502885095841655729E-4,
+      -5.76375574538582365885E-4,  1.63947561694133579842E-3,
+      -4.32430999505057594430E-3,  1.05464603945949983183E-2,
+      -2.37374148058994688156E-2,  4.93052842396707084878E-2,
+      -9.49010970480476444210E-2,  1.71620901522208775349E-1,
+      -3.04682672343198398683E-1,  6.76795274409476084995E-1};
+  return std::make_tuple(coeff, 30);
+};
+
+template <typename T>
+static inline std::tuple<const T*, size_t> chebyshev_coefficients_i0e_B() {
+  /* Chebyshev coefficients for exp(-x) sqrt(x) I0(x)
+   * in the inverted interval [8,infinity].
+   *
+   * lim(x->inf){ exp(-x) sqrt(x) I0(x) } = 1/sqrt(2pi).
+   */
+  static const T coeff[] = {
+      -7.23318048787475395456E-18, -4.83050448594418207126E-18,
+      4.46562142029675999901E-17,  3.46122286769746109310E-17,
+      -2.82762398051658348494E-16, -3.42548561967721913462E-16,
+      1.77256013305652638360E-15,  3.81168066935262242075E-15,
+      -9.55484669882830764870E-15, -4.15056934728722208663E-14,
+      1.54008621752140982691E-14,  3.85277838274214270114E-13,
+      7.18012445138366623367E-13,  -1.79417853150680611778E-12,
+      -1.32158118404477131188E-11, -3.14991652796324136454E-11,
+      1.18891471078464383424E-11,  4.94060238822496958910E-10,
+      3.39623202570838634515E-9,   2.26666899049817806459E-8,
+      2.04891858946906374183E-7,   2.89137052083475648297E-6,
+      6.88975834691682398426E-5,   3.36911647825569408990E-3,
+      8.04490411014108831608E-1};
+
+  return std::make_tuple(coeff, 25);
+};
+
+template <typename T>
+static inline typename std::enable_if<std::is_same<double, T>::value, std::tuple<const T*, size_t>>::type
+chebyshev_coefficients_i1e_A() {
+  /* Chebyshev coefficients for exp(-x) I1(x)
+   * in the interval [0,8].
+   *
+   * lim(x->0){ exp(-x) I1(x) / x } = 1/2.
+   */
+  static const T coeff[] = {
+      2.77791411276104639959E-18, -2.11142121435816608115E-17,
+      1.55363195773620046921E-16, -1.10559694773538630805E-15,
+      7.60068429473540693410E-15, -5.04218550472791168711E-14,
+      3.22379336594557470981E-13, -1.98397439776494371520E-12,
+      1.17361862988909016308E-11, -6.66348972350202774223E-11,
+      3.62559028155211703701E-10, -1.88724975172282928790E-9,
+      9.38153738649577178388E-9,  -4.44505912879632808065E-8,
+      2.00329475355213526229E-7,  -8.56872026469545474066E-7,
+      3.47025130813767847674E-6,  -1.32731636560394358279E-5,
+      4.78156510755005422638E-5,  -1.61760815825896745588E-4,
+      5.12285956168575772895E-4,  -1.51357245063125314899E-3,
+      4.15642294431288815669E-3,  -1.05640848946261981558E-2,
+      2.47264490306265168283E-2,  -5.29459812080949914269E-2,
+      1.02643658689847095384E-1,  -1.76416518357834055153E-1,
+      2.52587186443633654823E-1};
+  return std::make_tuple(coeff, 29);
+};
+
+template <typename T>
+static inline typename std::enable_if<std::is_same<float, T>::value, std::tuple<const T*, size_t>>::type
+chebyshev_coefficients_i1e_A() {
+  /* Chebyshev coefficients for exp(-x) I1(x)
+   * in the interval [0,8].
+   *
+   * lim(x->0){ exp(-x) I1(x) / x } = 1/2.
+   */
+  static const T coeff[] = {
+      9.38153738649577178388E-9f,
+      -4.44505912879632808065E-8f,
+      2.00329475355213526229E-7f,
+      -8.56872026469545474066E-7f,
+      3.47025130813767847674E-6f,
+      -1.32731636560394358279E-5f,
+      4.78156510755005422638E-5f,
+      -1.61760815825896745588E-4f,
+      5.12285956168575772895E-4f,
+      -1.51357245063125314899E-3f,
+      4.15642294431288815669E-3f,
+      -1.05640848946261981558E-2f,
+      2.47264490306265168283E-2f,
+      -5.29459812080949914269E-2f,
+      1.02643658689847095384E-1f,
+      -1.76416518357834055153E-1f,
+      2.52587186443633654823E-1f};
+  return std::make_tuple(coeff, 17);
+};
+
+template <typename T>
+static inline typename std::enable_if<std::is_same<double, T>::value, std::tuple<const T*, size_t>>::type
+chebyshev_coefficients_i1e_B() {
+  /* Chebyshev coefficients for exp(-x) sqrt(x) I1(x)
+   * in the inverted interval [8,infinity].
+   *
+   * lim(x->inf){ exp(-x) sqrt(x) I1(x) } = 1/sqrt(2pi).
+   */
+  static const T coeff[] = {
+      7.51729631084210481353E-18,  4.41434832307170791151E-18,
+      -4.65030536848935832153E-17, -3.20952592199342395980E-17,
+      2.96262899764595013876E-16,  3.30820231092092828324E-16,
+      -1.88035477551078244854E-15, -3.81440307243700780478E-15,
+      1.04202769841288027642E-14,  4.27244001671195135429E-14,
+      -2.10154184277266431302E-14, -4.08355111109219731823E-13,
+      -7.19855177624590851209E-13, 2.03562854414708950722E-12,
+      1.41258074366137813316E-11,  3.25260358301548823856E-11,
+      -1.89749581235054123450E-11, -5.58974346219658380687E-10,
+      -3.83538038596423702205E-9,  -2.63146884688951950684E-8,
+      -2.51223623787020892529E-7,  -3.88256480887769039346E-6,
+      -1.10588938762623716291E-4,  -9.76109749136146840777E-3,
+      7.78576235018280120474E-1};
+
+  return std::make_tuple(coeff, 25);
+};
+
+template <typename T>
+static inline typename std::enable_if<std::is_same<float, T>::value, std::tuple<const T*, size_t>>::type
+chebyshev_coefficients_i1e_B() {
+  /* Chebyshev coefficients for exp(-x) sqrt(x) I1(x)
+   * in the inverted interval [8,infinity].
+   *
+   * lim(x->inf){ exp(-x) sqrt(x) I1(x) } = 1/sqrt(2pi).
+   */
+  static const T coeff[] = {
+      -3.83538038596423702205E-9f,
+      -2.63146884688951950684E-8f,
+      -2.51223623787020892529E-7f,
+      -3.88256480887769039346E-6f,
+      -1.10588938762623716291E-4f,
+      -9.76109749136146840777E-3f,
+      7.78576235018280120474E-1f};
+
+  return std::make_tuple(coeff, 7);
+};
+
+template <typename T>
+static inline typename std::enable_if<std::is_floating_point<T>::value, T>::type
+calc_i0(T _x) {
+  T x = std::abs(_x);
+
+  if (x <= T{8.0}) {
+    auto coeff_pair = chebyshev_coefficients_i0e_A<T>();
+    auto A = std::get<0>(coeff_pair);
+    auto len = std::get<1>(coeff_pair);
+    T y = (x / T{2.0}) - T{2.0};
+    return static_cast<T>(std::exp(x) * chbevl(y, A, len));
+  }
+  auto coeff_pair = chebyshev_coefficients_i0e_B<T>();
+  auto B = std::get<0>(coeff_pair);
+  auto len = std::get<1>(coeff_pair);
+  return std::exp(x) * chbevl(T{32.0} / x - T{2.0}, B, len) / std::sqrt(x);
+}
+
+// Upcast bfloat16 input to float for numerical accuracy purposes
+static inline c10::BFloat16 calc_i0(c10::BFloat16 a) { return calc_i0(static_cast<float>(a)); }
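+// Illustrative check (editorial note): I0(0) = 1, so calc_i0(0.0) returns
+// approximately 1.0, and calc_i0 is even in its argument because only
+// std::abs(_x) enters the approximation.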
+
+/*
+ * This function is derived from the implementation of the i1 function in the Cephes Math Library.
+ * See note [3-Clause BSD License for the Cephes Math Library].
+ *
+ * Computes an approximation of the first order modified Bessel function of the first kind.
+ * The approximation is actually two (sub)approximations, both using a Chebyshev polynomial expansion.
+ * One approximates the function over [0, 8], and the other over (8, infinity). This function takes the absolute value
+ * of all inputs to convert them into the domain of the approximation.
+ */
+template <typename T>
+static inline typename std::enable_if<std::is_floating_point<T>::value, T>::type
+calc_i1(T _x) {
+  T x = std::abs(_x);
+
+  if (x <= T{8.0}) {
+    auto coeff_pair = chebyshev_coefficients_i1e_A<T>();
+    auto A = std::get<0>(coeff_pair);
+    auto len = std::get<1>(coeff_pair);
+    T y = (x / T{2.0}) - T{2.0};
+    const T out = std::exp(x) * x * chbevl(y, A, len);
+    return (_x < T{0.0}) ? -out : out;
+  }
+  auto coeff_pair = chebyshev_coefficients_i1e_B<T>();
+  auto B = std::get<0>(coeff_pair);
+  auto len = std::get<1>(coeff_pair);
+  const T out = (std::exp(x) * chbevl(T{32.0} / x - T{2.0}, B, len)) / std::sqrt(x);
+  return (_x < T{0.0}) ? -out : out;
+}
+
+/*
+ * This function is derived from the implementation of the i1e function in the Cephes Math Library.
+ * See note [3-Clause BSD License for the Cephes Math Library].
+ *
+ * Computes an approximation of the exponentially scaled first order modified Bessel function of the first kind.
+ * The approximation is actually two (sub)approximations, both using a Chebyshev polynomial expansion.
+ * One approximates the function over [0, 8], and the other over (8, infinity). This function takes the absolute value
+ * of all inputs to convert them into the domain of the approximation.
+ */
+template <typename T>
+static inline typename std::enable_if<std::is_floating_point<T>::value, T>::type
+calc_i1e(T _x) {
+  T x = std::abs(_x);
+
+  if (x <= T{8.0}) {
+    auto coeff_pair = chebyshev_coefficients_i1e_A<T>();
+    auto A = std::get<0>(coeff_pair);
+    auto len = std::get<1>(coeff_pair);
+    T y = (x / T{2.0}) - T{2.0};
+    const T out = chbevl(y, A, len) * x;
+    return (_x < T{0.0}) ? -out : out;
+  }
+  auto coeff_pair = chebyshev_coefficients_i1e_B<T>();
+  auto B = std::get<0>(coeff_pair);
+  auto len = std::get<1>(coeff_pair);
+  const auto out = chbevl(T{32.0} / x - T{2.0}, B, len) / std::sqrt(x);
+  return (_x < T{0.0}) ? -out : out;
+}
+
+/*
+ * This function is derived from the implementation of the ndtri function in the Cephes Math Library.
+ * See note [3-Clause BSD License for the Cephes Math Library].
+ *
+ * Computes the argument, x, for which the area under the Gaussian probability density function
+ * (integrated from minus infinity to x) is equal to y.
+ */
+template <typename T>
+static inline C10_HOST_DEVICE T calc_ndtri(T y0) {
+
+  /* sqrt(2pi) */
+  constexpr T s2pi = 2.50662827463100050242E0;
+  constexpr T one = 1;
+  constexpr T zero = 0;
+
+  /* approximation for 0 <= |y - 0.5| <= 3/8 */
+  static const T P0[5] = {
+      -5.99633501014107895267E1,
+      9.80010754185999661536E1,
+      -5.66762857469070293439E1,
+      1.39312609387279679503E1,
+      -1.23916583867381258016E0,
+  };
+
+  static const T Q0[9] = {
+      1.00000000000000000000E0,
+      1.95448858338141759834E0,
+      4.67627912898881538453E0,
+      8.63602421390890590575E1,
+      -2.25462687854119370527E2,
+      2.00260212380060660359E2,
+      -8.20372256168333339912E1,
+      1.59056225126211695515E1,
+      -1.18331621121330003142E0,
+  };
+
+  /* Approximation for interval z = sqrt(-2 log y ) between 2 and 8
+  * i.e., y between exp(-2) = .135 and exp(-32) = 1.27e-14.
+  */
+  static const T P1[9] = {
+      4.05544892305962419923E0,
+      3.15251094599893866154E1,
+      5.71628192246421288162E1,
+      4.40805073893200834700E1,
+      1.46849561928858024014E1,
+      2.18663306850790267539E0,
+      -1.40256079171354495875E-1,
+      -3.50424626827848203418E-2,
+      -8.57456785154685413611E-4,
+  };
+
+  static const T Q1[9] = {
+      1.00000000000000000000E0,
+      1.57799883256466749731E1,
+      4.53907635128879210584E1,
+      4.13172038254672030440E1,
+      1.50425385692907503408E1,
+      2.50464946208309415979E0,
+      -1.42182922854787788574E-1,
+      -3.80806407691578277194E-2,
+      -9.33259480895457427372E-4,
+  };
+
+  /* Approximation for interval z = sqrt(-2 log y ) between 8 and 64
+  * i.e., y between exp(-32) = 1.27e-14 and exp(-2048) = 3.67e-890.
+  */
+
+  static const T P2[9] = {
+      3.23774891776946035970E0,
+      6.91522889068984211695E0,
+      3.93881025292474443415E0,
+      1.33303460815807542389E0,
+      2.01485389549179081538E-1,
+      1.23716634817820021358E-2,
+      3.01581553508235416007E-4,
+      2.65806974686737550832E-6,
+      6.23974539184983293730E-9,
+  };
+
+  static const T Q2[9] = {
+      1.00000000000000000000E0,
+      6.02427039364742014255E0,
+      3.67983563856160859403E0,
+      1.37702099489081330271E0,
+      2.16236993594496635890E-1,
+      1.34204006088543189037E-2,
+      3.28014464682127739104E-4,
+      2.89247864745380683936E-6,
+      6.79019408009981274425E-9,
+  };
+
+  if (y0 == zero) {
+    return -std::numeric_limits<T>::infinity();
+  }
+  if (y0 == one) {
+    return std::numeric_limits<T>::infinity();
+  }
+  if (y0 < zero || y0 > one) {
+    return std::numeric_limits<T>::quiet_NaN();
+  }
+  bool code = true;
+  T y = y0;
+  if (y > one - T{0.13533528323661269189}) { /* 0.135... = exp(-2) */
+    y = one - y;
+    code = false;
+  }
+
+  if (y > T{0.13533528323661269189}) {
+    y = y - T{0.5};
+    const T y2 = y * y;
+    T x = y + y * (y2 * polevl(y2, P0, 4) / polevl(y2, Q0, 8));
+    return (x * s2pi);
+  }
+
+  T x = ::sqrt(T{-2.0} * ::log(y));
+  const T x0 = x - ::log(x) / x;
+
+  const T z = one / x;
+  T x1;
+  if (x < T{8.0}) /* y > exp(-32) = 1.2664165549e-14 */
+  {
+    x1 = z * polevl(z, P1, 8) / polevl(z, Q1, 8);
+  } else {
+    x1 = z * polevl(z, P2, 8) / polevl(z, Q2, 8);
+  }
+  x = x0 - x1;
+  if (code) {
+    x = -x;
+  }
+  return x;
+}
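+// calc_ndtri is the inverse of the standard normal CDF (the probit function).
+// Illustrative values (editorial note): calc_ndtri(0.5) == 0 and
+// calc_ndtri(0.975) is approximately 1.96.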
+
+/* The next function is taken from http://ab-initio.mit.edu/Faddeev */
+
+/* Copyright (c) 2012 Massachusetts Institute of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/* erfcx(x) = exp(x^2) erfc(x) function, for real x, written by
+   Steven G. Johnson, October 2012.
+
+   This function combines a few different ideas.
+
+   First, for x > 50, it uses a continued-fraction expansion (same as
+   for the Faddeeva function, but with algebraic simplifications for z=i*x).
+
+   Second, for 0 <= x <= 50, it uses Chebyshev polynomial approximations,
+   but with two twists:
+
+      a) It maps x to y = 4 / (4+x) in [0,1].  This simple transformation,
+         inspired by a similar transformation in the octave-forge/specfun
+         erfcx by Soren Hauberg, results in much faster Chebyshev convergence
+         than other simple transformations I have examined.
+
+      b) Instead of using a single Chebyshev polynomial for the entire
+         [0,1] y interval, we break the interval up into 100 equal
+         subintervals, with a switch/lookup table, and use much lower
+         degree Chebyshev polynomials in each subinterval. This greatly
+         improves performance in my tests.
+
+   For x < 0, we use the relationship erfcx(-x) = 2 exp(x^2) - erfc(x),
+   with the usual checks for overflow etcetera.
+
+   Performance-wise, it seems to be substantially faster than either
+   the SLATEC DERFC function [or an erfcx function derived therefrom]
+   or Cody's CALERF function (from netlib.org/specfun), while
+   retaining near machine precision in accuracy.  */
+
+/* Given y100=100*y, where y = 4/(4+x) for x >= 0, compute erfcx(x).
+
+   Uses a look-up table of 100 different Chebyshev polynomials
+   for y intervals [0,0.01], [0.01,0.02], ...., [0.99,1], generated
+   with the help of Maple and a little shell script.   This allows
+   the Chebyshev polynomials to be of significantly lower degree (about 1/4)
+   compared to fitting the whole [0,1] interval with a single polynomial. */
+
+
+template <typename T>
+C10_HOST_DEVICE  static inline typename std::enable_if<std::is_floating_point<T>::value, T>::type
+erfcx_y100(T y100)
+{
+  switch (static_cast<int>(y100)) {
+case 0: {
+T t = 2*y100 - 1;
+return 0.70878032454106438663e-3 + (0.71234091047026302958e-3 + (0.35779077297597742384e-5 + (0.17403143962587937815e-7 + (0.81710660047307788845e-10 + (0.36885022360434957634e-12 + 0.15917038551111111111e-14 * t) * t) * t) * t) * t) * t;
+}
+case 1: {
+T t = 2*y100 - 3;
+return 0.21479143208285144230e-2 + (0.72686402367379996033e-3 + (0.36843175430938995552e-5 + (0.18071841272149201685e-7 + (0.85496449296040325555e-10 + (0.38852037518534291510e-12 + 0.16868473576888888889e-14 * t) * t) * t) * t) * t) * t;
+}
+case 2: {
+T t = 2*y100 - 5;
+return 0.36165255935630175090e-2 + (0.74182092323555510862e-3 + (0.37948319957528242260e-5 + (0.18771627021793087350e-7 + (0.89484715122415089123e-10 + (0.40935858517772440862e-12 + 0.17872061464888888889e-14 * t) * t) * t) * t) * t) * t;
+}
+case 3: {
+T t = 2*y100 - 7;
+return 0.51154983860031979264e-2 + (0.75722840734791660540e-3 + (0.39096425726735703941e-5 + (0.19504168704300468210e-7 + (0.93687503063178993915e-10 + (0.43143925959079664747e-12 + 0.18939926435555555556e-14 * t) * t) * t) * t) * t) * t;
+}
+case 4: {
+T t = 2*y100 - 9;
+return 0.66457513172673049824e-2 + (0.77310406054447454920e-3 + (0.40289510589399439385e-5 + (0.20271233238288381092e-7 + (0.98117631321709100264e-10 + (0.45484207406017752971e-12 + 0.20076352213333333333e-14 * t) * t) * t) * t) * t) * t;
+}
+case 5: {
+T t = 2*y100 - 11;
+return 0.82082389970241207883e-2 + (0.78946629611881710721e-3 + (0.41529701552622656574e-5 + (0.21074693344544655714e-7 + (0.10278874108587317989e-9 + (0.47965201390613339638e-12 + 0.21285907413333333333e-14 * t) * t) * t) * t) * t) * t;
+}
+case 6: {
+T t = 2*y100 - 13;
+return 0.98039537275352193165e-2 + (0.80633440108342840956e-3 + (0.42819241329736982942e-5 + (0.21916534346907168612e-7 + (0.10771535136565470914e-9 + (0.50595972623692822410e-12 + 0.22573462684444444444e-14 * t) * t) * t) * t) * t) * t;
+}
+case 7: {
+T t = 2*y100 - 15;
+return 0.11433927298290302370e-1 + (0.82372858383196561209e-3 + (0.44160495311765438816e-5 + (0.22798861426211986056e-7 + (0.11291291745879239736e-9 + (0.53386189365816880454e-12 + 0.23944209546666666667e-14 * t) * t) * t) * t) * t) * t;
+}
+case 8: {
+T t = 2*y100 - 17;
+return 0.13099232878814653979e-1 + (0.84167002467906968214e-3 + (0.45555958988457506002e-5 + (0.23723907357214175198e-7 + (0.11839789326602695603e-9 + (0.56346163067550237877e-12 + 0.25403679644444444444e-14 * t) * t) * t) * t) * t) * t;
+}
+case 9: {
+T t = 2*y100 - 19;
+return 0.14800987015587535621e-1 + (0.86018092946345943214e-3 + (0.47008265848816866105e-5 + (0.24694040760197315333e-7 + (0.12418779768752299093e-9 + (0.59486890370320261949e-12 + 0.26957764568888888889e-14 * t) * t) * t) * t) * t) * t;
+}
+case 10: {
+T t = 2*y100 - 21;
+return 0.16540351739394069380e-1 + (0.87928458641241463952e-3 + (0.48520195793001753903e-5 + (0.25711774900881709176e-7 + (0.13030128534230822419e-9 + (0.62820097586874779402e-12 + 0.28612737351111111111e-14 * t) * t) * t) * t) * t) * t;
+}
+case 11: {
+T t = 2*y100 - 23;
+return 0.18318536789842392647e-1 + (0.89900542647891721692e-3 + (0.50094684089553365810e-5 + (0.26779777074218070482e-7 + (0.13675822186304615566e-9 + (0.66358287745352705725e-12 + 0.30375273884444444444e-14 * t) * t) * t) * t) * t) * t;
+}
+case 12: {
+T t = 2*y100 - 25;
+return 0.20136801964214276775e-1 + (0.91936908737673676012e-3 + (0.51734830914104276820e-5 + (0.27900878609710432673e-7 + (0.14357976402809042257e-9 + (0.70114790311043728387e-12 + 0.32252476000000000000e-14 * t) * t) * t) * t) * t) * t;
+}
+case 13: {
+T t = 2*y100 - 27;
+return 0.21996459598282740954e-1 + (0.94040248155366777784e-3 + (0.53443911508041164739e-5 + (0.29078085538049374673e-7 + (0.15078844500329731137e-9 + (0.74103813647499204269e-12 + 0.34251892320000000000e-14 * t) * t) * t) * t) * t) * t;
+}
+case 14: {
+T t = 2*y100 - 29;
+return 0.23898877187226319502e-1 + (0.96213386835900177540e-3 + (0.55225386998049012752e-5 + (0.30314589961047687059e-7 + (0.15840826497296335264e-9 + (0.78340500472414454395e-12 + 0.36381553564444444445e-14 * t) * t) * t) * t) * t) * t;
+}
+case 15: {
+T t = 2*y100 - 31;
+return 0.25845480155298518485e-1 + (0.98459293067820123389e-3 + (0.57082915920051843672e-5 + (0.31613782169164830118e-7 + (0.16646478745529630813e-9 + (0.82840985928785407942e-12 + 0.38649975768888888890e-14 * t) * t) * t) * t) * t) * t;
+}
+case 16: {
+T t = 2*y100 - 33;
+return 0.27837754783474696598e-1 + (0.10078108563256892757e-2 + (0.59020366493792212221e-5 + (0.32979263553246520417e-7 + (0.17498524159268458073e-9 + (0.87622459124842525110e-12 + 0.41066206488888888890e-14 * t) * t) * t) * t) * t) * t;
+}
+case 17: {
+T t = 2*y100 - 35;
+return 0.29877251304899307550e-1 + (0.10318204245057349310e-2 + (0.61041829697162055093e-5 + (0.34414860359542720579e-7 + (0.18399863072934089607e-9 + (0.92703227366365046533e-12 + 0.43639844053333333334e-14 * t) * t) * t) * t) * t) * t;
+}
+case 18: {
+T t = 2*y100 - 37;
+return 0.31965587178596443475e-1 + (0.10566560976716574401e-2 + (0.63151633192414586770e-5 + (0.35924638339521924242e-7 + (0.19353584758781174038e-9 + (0.98102783859889264382e-12 + 0.46381060817777777779e-14 * t) * t) * t) * t) * t) * t;
+}
+case 19: {
+T t = 2*y100 - 39;
+return 0.34104450552588334840e-1 + (0.10823541191350532574e-2 + (0.65354356159553934436e-5 + (0.37512918348533521149e-7 + (0.20362979635817883229e-9 + (0.10384187833037282363e-11 + 0.49300625262222222221e-14 * t) * t) * t) * t) * t) * t;
+}
+case 20: {
+T t = 2*y100 - 41;
+return 0.36295603928292425716e-1 + (0.11089526167995268200e-2 + (0.67654845095518363577e-5 + (0.39184292949913591646e-7 + (0.21431552202133775150e-9 + (0.10994259106646731797e-11 + 0.52409949102222222221e-14 * t) * t) * t) * t) * t) * t;
+}
+case 21: {
+T t = 2*y100 - 43;
+return 0.38540888038840509795e-1 + (0.11364917134175420009e-2 + (0.70058230641246312003e-5 + (0.40943644083718586939e-7 + (0.22563034723692881631e-9 + (0.11642841011361992885e-11 + 0.55721092871111111110e-14 * t) * t) * t) * t) * t) * t;
+}
+case 22: {
+T t = 2*y100 - 45;
+return 0.40842225954785960651e-1 + (0.11650136437945673891e-2 + (0.72569945502343006619e-5 + (0.42796161861855042273e-7 + (0.23761401711005024162e-9 + (0.12332431172381557035e-11 + 0.59246802364444444445e-14 * t) * t) * t) * t) * t) * t;
+}
+case 23: {
+T t = 2*y100 - 47;
+return 0.43201627431540222422e-1 + (0.11945628793917272199e-2 + (0.75195743532849206263e-5 + (0.44747364553960993492e-7 + (0.25030885216472953674e-9 + (0.13065684400300476484e-11 + 0.63000532853333333334e-14 * t) * t) * t) * t) * t) * t;
+}
+case 24: {
+T t = 2*y100 - 49;
+return 0.45621193513810471438e-1 + (0.12251862608067529503e-2 + (0.77941720055551920319e-5 + (0.46803119830954460212e-7 + (0.26375990983978426273e-9 + (0.13845421370977119765e-11 + 0.66996477404444444445e-14 * t) * t) * t) * t) * t) * t;
+}
+case 25: {
+T t = 2*y100 - 51;
+return 0.48103121413299865517e-1 + (0.12569331386432195113e-2 + (0.80814333496367673980e-5 + (0.48969667335682018324e-7 + (0.27801515481905748484e-9 + (0.14674637611609884208e-11 + 0.71249589351111111110e-14 * t) * t) * t) * t) * t) * t;
+}
+case 26: {
+T t = 2*y100 - 53;
+return 0.50649709676983338501e-1 + (0.12898555233099055810e-2 + (0.83820428414568799654e-5 + (0.51253642652551838659e-7 + (0.29312563849675507232e-9 + (0.15556512782814827846e-11 + 0.75775607822222222221e-14 * t) * t) * t) * t) * t) * t;
+}
+case 27: {
+T t = 2*y100 - 55;
+return 0.53263363664388864181e-1 + (0.13240082443256975769e-2 + (0.86967260015007658418e-5 + (0.53662102750396795566e-7 + (0.30914568786634796807e-9 + (0.16494420240828493176e-11 + 0.80591079644444444445e-14 * t) * t) * t) * t) * t) * t;
+}
+case 28: {
+T t = 2*y100 - 57;
+return 0.55946601353500013794e-1 + (0.13594491197408190706e-2 + (0.90262520233016380987e-5 + (0.56202552975056695376e-7 + (0.32613310410503135996e-9 + (0.17491936862246367398e-11 + 0.85713381688888888890e-14 * t) * t) * t) * t) * t) * t;
+}
+case 29: {
+T t = 2*y100 - 59;
+return 0.58702059496154081813e-1 + (0.13962391363223647892e-2 + (0.93714365487312784270e-5 + (0.58882975670265286526e-7 + (0.34414937110591753387e-9 + (0.18552853109751857859e-11 + 0.91160736711111111110e-14 * t) * t) * t) * t) * t) * t;
+}
+case 30: {
+T t = 2*y100 - 61;
+return 0.61532500145144778048e-1 + (0.14344426411912015247e-2 + (0.97331446201016809696e-5 + (0.61711860507347175097e-7 + (0.36325987418295300221e-9 + (0.19681183310134518232e-11 + 0.96952238400000000000e-14 * t) * t) * t) * t) * t) * t;
+}
+case 31: {
+T t = 2*y100 - 63;
+return 0.64440817576653297993e-1 + (0.14741275456383131151e-2 + (0.10112293819576437838e-4 + (0.64698236605933246196e-7 + (0.38353412915303665586e-9 + (0.20881176114385120186e-11 + 0.10310784480000000000e-13 * t) * t) * t) * t) * t) * t;
+}
+case 32: {
+T t = 2*y100 - 65;
+return 0.67430045633130393282e-1 + (0.15153655418916540370e-2 + (0.10509857606888328667e-4 + (0.67851706529363332855e-7 + (0.40504602194811140006e-9 + (0.22157325110542534469e-11 + 0.10964842115555555556e-13 * t) * t) * t) * t) * t) * t;
+}
+case 33: {
+T t = 2*y100 - 67;
+return 0.70503365513338850709e-1 + (0.15582323336495709827e-2 + (0.10926868866865231089e-4 + (0.71182482239613507542e-7 + (0.42787405890153386710e-9 + (0.23514379522274416437e-11 + 0.11659571751111111111e-13 * t) * t) * t) * t) * t) * t;
+}
+case 34: {
+T t = 2*y100 - 69;
+return 0.73664114037944596353e-1 + (0.16028078812438820413e-2 + (0.11364423678778207991e-4 + (0.74701423097423182009e-7 + (0.45210162777476488324e-9 + (0.24957355004088569134e-11 + 0.12397238257777777778e-13 * t) * t) * t) * t) * t) * t;
+}
+case 35: {
+T t = 2*y100 - 71;
+return 0.76915792420819562379e-1 + (0.16491766623447889354e-2 + (0.11823685320041302169e-4 + (0.78420075993781544386e-7 + (0.47781726956916478925e-9 + (0.26491544403815724749e-11 + 0.13180196462222222222e-13 * t) * t) * t) * t) * t) * t;
+}
+case 36: {
+T t = 2*y100 - 73;
+return 0.80262075578094612819e-1 + (0.16974279491709504117e-2 + (0.12305888517309891674e-4 + (0.82350717698979042290e-7 + (0.50511496109857113929e-9 + (0.28122528497626897696e-11 + 0.14010889635555555556e-13 * t) * t) * t) * t) * t) * t;
+}
+case 37: {
+T t = 2*y100 - 75;
+return 0.83706822008980357446e-1 + (0.17476561032212656962e-2 + (0.12812343958540763368e-4 + (0.86506399515036435592e-7 + (0.53409440823869467453e-9 + (0.29856186620887555043e-11 + 0.14891851591111111111e-13 * t) * t) * t) * t) * t) * t;
+}
+case 38: {
+T t = 2*y100 - 77;
+return 0.87254084284461718231e-1 + (0.17999608886001962327e-2 + (0.13344443080089492218e-4 + (0.90900994316429008631e-7 + (0.56486134972616465316e-9 + (0.31698707080033956934e-11 + 0.15825697795555555556e-13 * t) * t) * t) * t) * t) * t;
+}
+case 39: {
+T t = 2*y100 - 79;
+return 0.90908120182172748487e-1 + (0.18544478050657699758e-2 + (0.13903663143426120077e-4 + (0.95549246062549906177e-7 + (0.59752787125242054315e-9 + (0.33656597366099099413e-11 + 0.16815130613333333333e-13 * t) * t) * t) * t) * t) * t;
+}
+case 40: {
+T t = 2*y100 - 81;
+return 0.94673404508075481121e-1 + (0.19112284419887303347e-2 + (0.14491572616545004930e-4 + (0.10046682186333613697e-6 + (0.63221272959791000515e-9 + (0.35736693975589130818e-11 + 0.17862931591111111111e-13 * t) * t) * t) * t) * t) * t;
+}
+case 41: {
+T t = 2*y100 - 83;
+return 0.98554641648004456555e-1 + (0.19704208544725622126e-2 + (0.15109836875625443935e-4 + (0.10567036667675984067e-6 + (0.66904168640019354565e-9 + (0.37946171850824333014e-11 + 0.18971959040000000000e-13 * t) * t) * t) * t) * t) * t;
+}
+case 42: {
+T t = 2*y100 - 85;
+return 0.10255677889470089531e0 + (0.20321499629472857418e-2 + (0.15760224242962179564e-4 + (0.11117756071353507391e-6 + (0.70814785110097658502e-9 + (0.40292553276632563925e-11 + 0.20145143075555555556e-13 * t) * t) * t) * t) * t) * t;
+}
+case 43: {
+T t = 2*y100 - 87;
+return 0.10668502059865093318e0 + (0.20965479776148731610e-2 + (0.16444612377624983565e-4 + (0.11700717962026152749e-6 + (0.74967203250938418991e-9 + (0.42783716186085922176e-11 + 0.21385479360000000000e-13 * t) * t) * t) * t) * t) * t;
+}
+case 44: {
+T t = 2*y100 - 89;
+return 0.11094484319386444474e0 + (0.21637548491908170841e-2 + (0.17164995035719657111e-4 + (0.12317915750735938089e-6 + (0.79376309831499633734e-9 + (0.45427901763106353914e-11 + 0.22696025653333333333e-13 * t) * t) * t) * t) * t) * t;
+}
+case 45: {
+T t = 2*y100 - 91;
+return 0.11534201115268804714e0 + (0.22339187474546420375e-2 + (0.17923489217504226813e-4 + (0.12971465288245997681e-6 + (0.84057834180389073587e-9 + (0.48233721206418027227e-11 + 0.24079890062222222222e-13 * t) * t) * t) * t) * t) * t;
+}
+case 46: {
+T t = 2*y100 - 93;
+return 0.11988259392684094740e0 + (0.23071965691918689601e-2 + (0.18722342718958935446e-4 + (0.13663611754337957520e-6 + (0.89028385488493287005e-9 + (0.51210161569225846701e-11 + 0.25540227111111111111e-13 * t) * t) * t) * t) * t) * t;
+}
+case 47: {
+T t = 2*y100 - 95;
+return 0.12457298393509812907e0 + (0.23837544771809575380e-2 + (0.19563942105711612475e-4 + (0.14396736847739470782e-6 + (0.94305490646459247016e-9 + (0.54366590583134218096e-11 + 0.27080225920000000000e-13 * t) * t) * t) * t) * t) * t;
+}
+case 48: {
+T t = 2*y100 - 97;
+return 0.12941991566142438816e0 + (0.24637684719508859484e-2 + (0.20450821127475879816e-4 + (0.15173366280523906622e-6 + (0.99907632506389027739e-9 + (0.57712760311351625221e-11 + 0.28703099555555555556e-13 * t) * t) * t) * t) * t) * t;
+}
+case 49: {
+T t = 2*y100 - 99;
+return 0.13443048593088696613e0 + (0.25474249981080823877e-2 + (0.21385669591362915223e-4 + (0.15996177579900443030e-6 + (0.10585428844575134013e-8 + (0.61258809536787882989e-11 + 0.30412080142222222222e-13 * t) * t) * t) * t) * t) * t;
+}
+case 50: {
+T t = 2*y100 - 101;
+return 0.13961217543434561353e0 + (0.26349215871051761416e-2 + (0.22371342712572567744e-4 + (0.16868008199296822247e-6 + (0.11216596910444996246e-8 + (0.65015264753090890662e-11 + 0.32210394506666666666e-13 * t) * t) * t) * t) * t) * t;
+}
+case 51: {
+T t = 2*y100 - 103;
+return 0.14497287157673800690e0 + (0.27264675383982439814e-2 + (0.23410870961050950197e-4 + (0.17791863939526376477e-6 + (0.11886425714330958106e-8 + (0.68993039665054288034e-11 + 0.34101266222222222221e-13 * t) * t) * t) * t) * t) * t;
+}
+case 52: {
+T t = 2*y100 - 105;
+return 0.15052089272774618151e0 + (0.28222846410136238008e-2 + (0.24507470422713397006e-4 + (0.18770927679626136909e-6 + (0.12597184587583370712e-8 + (0.73203433049229821618e-11 + 0.36087889048888888890e-13 * t) * t) * t) * t) * t) * t;
+}
+case 53: {
+T t = 2*y100 - 107;
+return 0.15626501395774612325e0 + (0.29226079376196624949e-2 + (0.25664553693768450545e-4 + (0.19808568415654461964e-6 + (0.13351257759815557897e-8 + (0.77658124891046760667e-11 + 0.38173420035555555555e-13 * t) * t) * t) * t) * t) * t;
+}
+case 54: {
+T t = 2*y100 - 109;
+return 0.16221449434620737567e0 + (0.30276865332726475672e-2 + (0.26885741326534564336e-4 + (0.20908350604346384143e-6 + (0.14151148144240728728e-8 + (0.82369170665974313027e-11 + 0.40360957457777777779e-13 * t) * t) * t) * t) * t) * t;
+}
+case 55: {
+T t = 2*y100 - 111;
+return 0.16837910595412130659e0 + (0.31377844510793082301e-2 + (0.28174873844911175026e-4 + (0.22074043807045782387e-6 + (0.14999481055996090039e-8 + (0.87348993661930809254e-11 + 0.42653528977777777779e-13 * t) * t) * t) * t) * t) * t;
+}
+case 56: {
+T t = 2*y100 - 113;
+return 0.17476916455659369953e0 + (0.32531815370903068316e-2 + (0.29536024347344364074e-4 + (0.23309632627767074202e-6 + (0.15899007843582444846e-8 + (0.92610375235427359475e-11 + 0.45054073102222222221e-13 * t) * t) * t) * t) * t) * t;
+}
+case 57: {
+T t = 2*y100 - 115;
+return 0.18139556223643701364e0 + (0.33741744168096996041e-2 + (0.30973511714709500836e-4 + (0.24619326937592290996e-6 + (0.16852609412267750744e-8 + (0.98166442942854895573e-11 + 0.47565418097777777779e-13 * t) * t) * t) * t) * t) * t;
+}
+case 58: {
+T t = 2*y100 - 117;
+return 0.18826980194443664549e0 + (0.35010775057740317997e-2 + (0.32491914440014267480e-4 + (0.26007572375886319028e-6 + (0.17863299617388376116e-8 + (0.10403065638343878679e-10 + 0.50190265831111111110e-13 * t) * t) * t) * t) * t) * t;
+}
+case 59: {
+T t = 2*y100 - 119;
+return 0.19540403413693967350e0 + (0.36342240767211326315e-2 + (0.34096085096200907289e-4 + (0.27479061117017637474e-6 + (0.18934228504790032826e-8 + (0.11021679075323598664e-10 + 0.52931171733333333334e-13 * t) * t) * t) * t) * t) * t;
+}
+case 60: {
+T t = 2*y100 - 121;
+return 0.20281109560651886959e0 + (0.37739673859323597060e-2 + (0.35791165457592409054e-4 + (0.29038742889416172404e-6 + (0.20068685374849001770e-8 + (0.11673891799578381999e-10 + 0.55790523093333333334e-13 * t) * t) * t) * t) * t) * t;
+}
+case 61: {
+T t = 2*y100 - 123;
+return 0.21050455062669334978e0 + (0.39206818613925652425e-2 + (0.37582602289680101704e-4 + (0.30691836231886877385e-6 + (0.21270101645763677824e-8 + (0.12361138551062899455e-10 + 0.58770520160000000000e-13 * t) * t) * t) * t) * t) * t;
+}
+case 62: {
+T t = 2*y100 - 125;
+return 0.21849873453703332479e0 + (0.40747643554689586041e-2 + (0.39476163820986711501e-4 + (0.32443839970139918836e-6 + (0.22542053491518680200e-8 + (0.13084879235290858490e-10 + 0.61873153262222222221e-13 * t) * t) * t) * t) * t) * t;
+}
+case 63: {
+T t = 2*y100 - 127;
+return 0.22680879990043229327e0 + (0.42366354648628516935e-2 + (0.41477956909656896779e-4 + (0.34300544894502810002e-6 + (0.23888264229264067658e-8 + (0.13846596292818514601e-10 + 0.65100183751111111110e-13 * t) * t) * t) * t) * t) * t;
+}
+case 64: {
+T t = 2*y100 - 129;
+return 0.23545076536988703937e0 + (0.44067409206365170888e-2 + (0.43594444916224700881e-4 + (0.36268045617760415178e-6 + (0.25312606430853202748e-8 + (0.14647791812837903061e-10 + 0.68453122631111111110e-13 * t) * t) * t) * t) * t) * t;
+}
+case 65: {
+T t = 2*y100 - 131;
+return 0.24444156740777432838e0 + (0.45855530511605787178e-2 + (0.45832466292683085475e-4 + (0.38352752590033030472e-6 + (0.26819103733055603460e-8 + (0.15489984390884756993e-10 + 0.71933206364444444445e-13 * t) * t) * t) * t) * t) * t;
+}
+case 66: {
+T t = 2*y100 - 133;
+return 0.25379911500634264643e0 + (0.47735723208650032167e-2 + (0.48199253896534185372e-4 + (0.40561404245564732314e-6 + (0.28411932320871165585e-8 + (0.16374705736458320149e-10 + 0.75541379822222222221e-13 * t) * t) * t) * t) * t) * t;
+}
+case 67: {
+T t = 2*y100 - 135;
+return 0.26354234756393613032e0 + (0.49713289477083781266e-2 + (0.50702455036930367504e-4 + (0.42901079254268185722e-6 + (0.30095422058900481753e-8 + (0.17303497025347342498e-10 + 0.79278273368888888890e-13 * t) * t) * t) * t) * t) * t;
+}
+case 68: {
+T t = 2*y100 - 137;
+return 0.27369129607732343398e0 + (0.51793846023052643767e-2 + (0.53350152258326602629e-4 + (0.45379208848865015485e-6 + (0.31874057245814381257e-8 + (0.18277905010245111046e-10 + 0.83144182364444444445e-13 * t) * t) * t) * t) * t) * t;
+}
+case 69: {
+T t = 2*y100 - 139;
+return 0.28426714781640316172e0 + (0.53983341916695141966e-2 + (0.56150884865255810638e-4 + (0.48003589196494734238e-6 + (0.33752476967570796349e-8 + (0.19299477888083469086e-10 + 0.87139049137777777779e-13 * t) * t) * t) * t) * t) * t;
+}
+case 70: {
+T t = 2*y100 - 141;
+return 0.29529231465348519920e0 + (0.56288077305420795663e-2 + (0.59113671189913307427e-4 + (0.50782393781744840482e-6 + (0.35735475025851713168e-8 + (0.20369760937017070382e-10 + 0.91262442613333333334e-13 * t) * t) * t) * t) * t) * t;
+}
+case 71: {
+T t = 2*y100 - 143;
+return 0.30679050522528838613e0 + (0.58714723032745403331e-2 + (0.62248031602197686791e-4 + (0.53724185766200945789e-6 + (0.37827999418960232678e-8 + (0.21490291930444538307e-10 + 0.95513539182222222221e-13 * t) * t) * t) * t) * t) * t;
+}
+case 72: {
+T t = 2*y100 - 145;
+return 0.31878680111173319425e0 + (0.61270341192339103514e-2 + (0.65564012259707640976e-4 + (0.56837930287837738996e-6 + (0.40035151353392378882e-8 + (0.22662596341239294792e-10 + 0.99891109760000000000e-13 * t) * t) * t) * t) * t) * t;
+}
+case 73: {
+T t = 2*y100 - 147;
+return 0.33130773722152622027e0 + (0.63962406646798080903e-2 + (0.69072209592942396666e-4 + (0.60133006661885941812e-6 + (0.42362183765883466691e-8 + (0.23888182347073698382e-10 + 0.10439349811555555556e-12 * t) * t) * t) * t) * t) * t;
+}
+case 74: {
+T t = 2*y100 - 149;
+return 0.34438138658041336523e0 + (0.66798829540414007258e-2 + (0.72783795518603561144e-4 + (0.63619220443228800680e-6 + (0.44814499336514453364e-8 + (0.25168535651285475274e-10 + 0.10901861383111111111e-12 * t) * t) * t) * t) * t) * t;
+}
+case 75: {
+T t = 2*y100 - 151;
+return 0.35803744972380175583e0 + (0.69787978834882685031e-2 + (0.76710543371454822497e-4 + (0.67306815308917386747e-6 + (0.47397647975845228205e-8 + (0.26505114141143050509e-10 + 0.11376390933333333333e-12 * t) * t) * t) * t) * t) * t;
+}
+case 76: {
+T t = 2*y100 - 153;
+return 0.37230734890119724188e0 + (0.72938706896461381003e-2 + (0.80864854542670714092e-4 + (0.71206484718062688779e-6 + (0.50117323769745883805e-8 + (0.27899342394100074165e-10 + 0.11862637614222222222e-12 * t) * t) * t) * t) * t) * t;
+}
+case 77: {
+T t = 2*y100 - 155;
+return 0.38722432730555448223e0 + (0.76260375162549802745e-2 + (0.85259785810004603848e-4 + (0.75329383305171327677e-6 + (0.52979361368388119355e-8 + (0.29352606054164086709e-10 + 0.12360253370666666667e-12 * t) * t) * t) * t) * t) * t;
+}
+case 78: {
+T t = 2*y100 - 157;
+return 0.40282355354616940667e0 + (0.79762880915029728079e-2 + (0.89909077342438246452e-4 + (0.79687137961956194579e-6 + (0.55989731807360403195e-8 + (0.30866246101464869050e-10 + 0.12868841946666666667e-12 * t) * t) * t) * t) * t) * t;
+}
+case 79: {
+T t = 2*y100 - 159;
+return 0.41914223158913787649e0 + (0.83456685186950463538e-2 + (0.94827181359250161335e-4 + (0.84291858561783141014e-6 + (0.59154537751083485684e-8 + (0.32441553034347469291e-10 + 0.13387957943111111111e-12 * t) * t) * t) * t) * t) * t;
+}
+case 80: {
+T t = 2*y100 - 161;
+return 0.43621971639463786896e0 + (0.87352841828289495773e-2 + (0.10002929142066799966e-3 + (0.89156148280219880024e-6 + (0.62480008150788597147e-8 + (0.34079760983458878910e-10 + 0.13917107176888888889e-12 * t) * t) * t) * t) * t) * t;
+}
+case 81: {
+T t = 2*y100 - 163;
+return 0.45409763548534330981e0 + (0.91463027755548240654e-2 + (0.10553137232446167258e-3 + (0.94293113464638623798e-6 + (0.65972492312219959885e-8 + (0.35782041795476563662e-10 + 0.14455745872000000000e-12 * t) * t) * t) * t) * t) * t;
+}
+case 82: {
+T t = 2*y100 - 165;
+return 0.47282001668512331468e0 + (0.95799574408860463394e-2 + (0.11135019058000067469e-3 + (0.99716373005509038080e-6 + (0.69638453369956970347e-8 + (0.37549499088161345850e-10 + 0.15003280712888888889e-12 * t) * t) * t) * t) * t) * t;
+}
+case 83: {
+T t = 2*y100 - 167;
+return 0.49243342227179841649e0 + (0.10037550043909497071e-1 + (0.11750334542845234952e-3 + (0.10544006716188967172e-5 + (0.73484461168242224872e-8 + (0.39383162326435752965e-10 + 0.15559069118222222222e-12 * t) * t) * t) * t) * t) * t;
+}
+case 84: {
+T t = 2*y100 - 169;
+return 0.51298708979209258326e0 + (0.10520454564612427224e-1 + (0.12400930037494996655e-3 + (0.11147886579371265246e-5 + (0.77517184550568711454e-8 + (0.41283980931872622611e-10 + 0.16122419680000000000e-12 * t) * t) * t) * t) * t) * t;
+}
+case 85: {
+T t = 2*y100 - 171;
+return 0.53453307979101369843e0 + (0.11030120618800726938e-1 + (0.13088741519572269581e-3 + (0.11784797595374515432e-5 + (0.81743383063044825400e-8 + (0.43252818449517081051e-10 + 0.16692592640000000000e-12 * t) * t) * t) * t) * t) * t;
+}
+case 86: {
+T t = 2*y100 - 173;
+return 0.55712643071169299478e0 + (0.11568077107929735233e-1 + (0.13815797838036651289e-3 + (0.12456314879260904558e-5 + (0.86169898078969313597e-8 + (0.45290446811539652525e-10 + 0.17268801084444444444e-12 * t) * t) * t) * t) * t) * t;
+}
+case 87: {
+T t = 2*y100 - 175;
+return 0.58082532122519320968e0 + (0.12135935999503877077e-1 + (0.14584223996665838559e-3 + (0.13164068573095710742e-5 + (0.90803643355106020163e-8 + (0.47397540713124619155e-10 + 0.17850211608888888889e-12 * t) * t) * t) * t) * t) * t;
+}
+case 88: {
+T t = 2*y100 - 177;
+return 0.60569124025293375554e0 + (0.12735396239525550361e-1 + (0.15396244472258863344e-3 + (0.13909744385382818253e-5 + (0.95651595032306228245e-8 + (0.49574672127669041550e-10 + 0.18435945564444444444e-12 * t) * t) * t) * t) * t) * t;
+}
+case 89: {
+T t = 2*y100 - 179;
+return 0.63178916494715716894e0 + (0.13368247798287030927e-1 + (0.16254186562762076141e-3 + (0.14695084048334056083e-5 + (0.10072078109604152350e-7 + (0.51822304995680707483e-10 + 0.19025081422222222222e-12 * t) * t) * t) * t) * t) * t;
+}
+case 90: {
+T t = 2*y100 - 181;
+return 0.65918774689725319200e0 + (0.14036375850601992063e-1 + (0.17160483760259706354e-3 + (0.15521885688723188371e-5 + (0.10601827031535280590e-7 + (0.54140790105837520499e-10 + 0.19616655146666666667e-12 * t) * t) * t) * t) * t) * t;
+}
+case 91: {
+T t = 2*y100 - 183;
+return 0.68795950683174433822e0 + (0.14741765091365869084e-1 + (0.18117679143520433835e-3 + (0.16392004108230585213e-5 + (0.11155116068018043001e-7 + (0.56530360194925690374e-10 + 0.20209663662222222222e-12 * t) * t) * t) * t) * t) * t;
+}
+case 92: {
+T t = 2*y100 - 185;
+return 0.71818103808729967036e0 + (0.15486504187117112279e-1 + (0.19128428784550923217e-3 + (0.17307350969359975848e-5 + (0.11732656736113607751e-7 + (0.58991125287563833603e-10 + 0.20803065333333333333e-12 * t) * t) * t) * t) * t) * t;
+}
+case 93: {
+T t = 2*y100 - 187;
+return 0.74993321911726254661e0 + (0.16272790364044783382e-1 + (0.20195505163377912645e-3 + (0.18269894883203346953e-5 + (0.12335161021630225535e-7 + (0.61523068312169087227e-10 + 0.21395783431111111111e-12 * t) * t) * t) * t) * t) * t;
+}
+case 94: {
+T t = 2*y100 - 189;
+return 0.78330143531283492729e0 + (0.17102934132652429240e-1 + (0.21321800585063327041e-3 + (0.19281661395543913713e-5 + (0.12963340087354341574e-7 + (0.64126040998066348872e-10 + 0.21986708942222222222e-12 * t) * t) * t) * t) * t) * t;
+}
+case 95: {
+T t = 2*y100 - 191;
+return 0.81837581041023811832e0 + (0.17979364149044223802e-1 + (0.22510330592753129006e-3 + (0.20344732868018175389e-5 + (0.13617902941839949718e-7 + (0.66799760083972474642e-10 + 0.22574701262222222222e-12 * t) * t) * t) * t) * t) * t;
+}
+case 96: {
+T t = 2*y100 - 193;
+return 0.85525144775685126237e0 + (0.18904632212547561026e-1 + (0.23764237370371255638e-3 + (0.21461248251306387979e-5 + (0.14299555071870523786e-7 + (0.69543803864694171934e-10 + 0.23158593688888888889e-12 * t) * t) * t) * t) * t) * t;
+}
+case 97: {
+T t = 2*y100 - 195;
+return 0.89402868170849933734e0 + (0.19881418399127202569e-1 + (0.25086793128395995798e-3 + (0.22633402747585233180e-5 + (0.15008997042116532283e-7 + (0.72357609075043941261e-10 + 0.23737194737777777778e-12 * t) * t) * t) * t) * t) * t;
+}
+case 98: {
+T t = 2*y100 - 197;
+return 0.93481333942870796363e0 + (0.20912536329780368893e-1 + (0.26481403465998477969e-3 + (0.23863447359754921676e-5 + (0.15746923065472184451e-7 + (0.75240468141720143653e-10 + 0.24309291271111111111e-12 * t) * t) * t) * t) * t) * t;
+}
+case 99: {
+T t = 2*y100 - 199;
+return 0.97771701335885035464e0 + (0.22000938572830479551e-1 + (0.27951610702682383001e-3 + (0.25153688325245314530e-5 + (0.16514019547822821453e-7 + (0.78191526829368231251e-10 + 0.24873652355555555556e-12 * t) * t) * t) * t) * t) * t;
+}
+  }
+  // we only get here if y = 1, i.e. |x| < 4*eps, in which case
+  // erfcx is within 1e-15 of 1.
+  return 1.0;
+}
+
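+// calc_erfcx computes the scaled complementary error function
+// erfcx(x) = exp(x^2) * erfc(x): a continued-fraction expansion for large
+// positive x, the erfcx_y100 table above for moderate x, and the identity
+// erfcx(-x) = 2*exp(x^2) - erfcx(x) for negative arguments.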
+template <typename T>
+C10_HOST_DEVICE static inline typename std::enable_if<std::is_floating_point<T>::value, T>::type
+calc_erfcx(T x)
+{
+  if (at::_isnan(x)) {
+    return x;
+  }
+
+  if (x >= 0) {
+    if (x > 50) { // continued-fraction expansion is faster
+      const T ispi = 0.56418958354775628694807945156; // 1 / sqrt(pi)
+      if (x > 5e7) { // 1-term expansion, important to avoid overflow
+        return ispi / x;
+      }
+      /* 5-term expansion (rely on compiler for CSE), simplified from:
+                ispi / (x+0.5/(x+1/(x+1.5/(x+2/x))))  */
+      return ispi*((x*x) * (x*x+4.5) + 2) / (x * ((x*x) * (x*x+5) + 3.75));
+    }
+    return erfcx_y100(400/(4+x));
+  }
+  else {
+    if (x < -26.7) {
+      return std::numeric_limits<T>::infinity();
+    }
+    else if (x < -6.1) {
+      return 2*exp(x*x);
+    }
+    else {
+      return 2*exp(x*x) - erfcx_y100(400/(4-x));
+    }
+  }
+}
+
+/*
+ * Logarithm of Gaussian cumulative distribution function.
+
+ * This implementation of log_ndtr and its helper functions
+ * follow SciPy's implementation
+ * See NOTICE for the licenses.
+ */
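+// For x < -1 the left tail is evaluated as log(erfcx(-x/sqrt(2))/2) - x*x/2,
+// which avoids the catastrophic cancellation log(erfc(.)) would suffer there;
+// closer to zero, log1p(-erfc(x/sqrt(2))/2) is accurate directly.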
+template <typename T>
+static inline C10_HOST_DEVICE T calc_log_ndtr(T x) {
+  T t = x * c10::frac_sqrt_2<T>;
+  if (x < T{-1.0}) {
+    return std::log(calc_erfcx(-t) / 2) - t * t;
+  } else {
+    return std::log1p(-std::erfc(t) / 2);
+  }
+}
+
+template<typename T>
+static inline C10_HOST_DEVICE T airy_ai_forward(T x) {
+    static const T AN[] = {
+            +3.46538101525629032477e-01,
+            +1.20075952739645805542e+01,
+            +7.62796053615234516538e+01,
+            +1.68089224934630576269e+02,
+            +1.59756391350164413639e+02,
+            +7.05360906840444183113e+01,
+            +1.40264691163389668864e+01,
+            +9.99999999999999995305e-01,
+    };
+
+    static const T AD[] = {
+            +5.67594532638770212846e-01,
+            +1.47562562584847203173e+01,
+            +8.45138970141474626562e+01,
+            +1.77318088145400459522e+02,
+            +1.64234692871529701831e+02,
+            +7.14778400825575695274e+01,
+            +1.40959135607834029598e+01,
+            +1.00000000000000000470e+00,
+    };
+
+    static const T AFN[] = {
+            -1.31696323418331795333e-01,
+            -6.26456544431912369773e-01,
+            -6.93158036036933542233e-01,
+            -2.79779981545119124951e-01,
+            -4.91900132609500318020e-02,
+            -4.06265923594885404393e-03,
+            -1.59276496239262096340e-04,
+            -2.77649108155232920844e-06,
+            -1.67787698489114633780e-08,
+    };
+
+    static const T AFD[] = {
+            +1.33560420706553243746e+01,
+            +3.26825032795224613948e+01,
+            +2.67367040941499554804e+01,
+            +9.18707402907259625840e+00,
+            +1.47529146771666414581e+00,
+            +1.15687173795188044134e-01,
+            +4.40291641615211203805e-03,
+            +7.54720348287414296618e-05,
+            +4.51850092970580378464e-07,
+    };
+
+    static const T AGN[] = {
+            +1.97339932091685679179e-02,
+            +3.91103029615688277255e-01,
+            +1.06579897599595591108e+00,
+            +9.39169229816650230044e-01,
+            +3.51465656105547619242e-01,
+            +6.33888919628925490927e-02,
+            +5.85804113048388458567e-03,
+            +2.82851600836737019778e-04,
+            +6.98793669997260967291e-06,
+            +8.11789239554389293311e-08,
+            +3.41551784765923618484e-10,
+    };
+
+    static const T AGD[] = {
+            +9.30892908077441974853e+00,
+            +1.98352928718312140417e+01,
+            +1.55646628932864612953e+01,
+            +5.47686069422975497931e+00,
+            +9.54293611618961883998e-01,
+            +8.64580826352392193095e-02,
+            +4.12656523824222607191e-03,
+            +1.01259085116509135510e-04,
+            +1.17166733214413521882e-06,
+            +4.91834570062930015649e-09,
+    };
+
+    int domain_flag = 0;
+
+    T ai;
+
+    if (std::isinf(x)) {
+        return std::numeric_limits<T>::quiet_NaN();
+    }
+
+    if (x > T(103.892)) {
+        return T(0.0);
+    }
+
+    T f;
+    T g;
+    T k;
+
+    if (x < T(-2.09)) {
+        T z = T(1.0) / (T(-2.0) * x * std::sqrt(-x) / T(3.0));
+
+        T afn = 0.0;
+
+        for (uint8_t index = 0; index <= 8; index++) {
+            afn = afn * (z * z) + AFN[index];
+        }
+
+        T afd = 0.0;
+
+        for (uint8_t index = 0; index <= 8; index++) {
+            afd = afd * (z * z) + AFD[index];
+        }
+
+        T agn = 0.0;
+
+        for (uint8_t index = 0; index <= 10 + 0; index++) {
+            agn = agn * (z * z) + AGN[index];
+        }
+
+        T agd = 0.0;
+
+        for (uint8_t index = 0; index <= 10 - 1; index++) {
+            agd = agd * (z * z) + AGD[index];
+        }
+
+        T t = T(-2.0) * x * std::sqrt(-x) / T(3.0) + T(0.25) * c10::pi<T>;
+
+        return T(5.64189583547756286948e-01) / std::sqrt(std::sqrt(-x)) * (std::sin(t) * (T(1.0) + z * z * afn / afd) - std::cos(t) * (z * agn / agd));
+    }
+
+    if (x >= T(2.09)) {
+        domain_flag = 5;
+
+        T zeta = T(2.0) * x * std::sqrt(x) / T(3.0);
+
+        T an = 0.0;
+
+        for (uint8_t index = 0; index <= 7; index++) {
+            an = an * (T(1.0) / zeta) + AN[index];
+        }
+
+        T ad = 0.0;
+
+        for (uint8_t index = 0; index <= 7; index++) {
+            ad = ad * (T(1.0) / zeta) + AD[index];
+        }
+
+        ai = T(5.64189583547756286948e-01) * (an / ad) / (T(2.0) * std::sqrt(std::sqrt(x)) * std::exp(zeta));
+
+        if (x > T(8.3203353)) {
+            return ai;
+        }
+    }
+
+    f = 1.0;
+    g = x;
+    k = 1.0;
+
+    T m = 1.0;
+    T n = x;
+    T t = 1.0;
+    T z = x * x * x;
+
+    while (t > std::numeric_limits<T>::epsilon()) {
+        m *= z;
+        k += T(1.0);
+        m /= k;
+        n *= z;
+        k += T(1.0);
+        n /= k;
+        m /= k;
+        f += m;
+        k += T(1.0);
+        n /= k;
+        g += n;
+
+        t = std::abs(m / f);
+    }
+
+    if ((domain_flag & 1) == 0) {
+        return T(0.355028053887817239260) * f - T(0.258819403792806798405) * g;
+    }
+
+    return ai;
+} // T airy_ai(T x)
+
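+// The Bessel routines below use the classic two-regime scheme: a rational
+// approximation in x*x for |x| <= 5, and for larger |x| an asymptotic form
+// P*cos(x - phase) - (5/x)*Q*sin(x - phase) scaled by sqrt(2/(pi*x));
+// 0.7978845608... is sqrt(2/pi) and 0.7853981633... is pi/4.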
+template<typename T>
+static inline C10_HOST_DEVICE T bessel_j0_forward(T x) {
+    static const T PP[] = {
+            +7.96936729297347051624e-04,
+            +8.28352392107440799803e-02,
+            +1.23953371646414299388e+00,
+            +5.44725003058768775090e+00,
+            +8.74716500199817011941e+00,
+            +5.30324038235394892183e+00,
+            +9.99999999999999997821e-01,
+    };
+
+    static const T PQ[] = {
+            +9.24408810558863637013e-04,
+            +8.56288474354474431428e-02,
+            +1.25352743901058953537e+00,
+            +5.47097740330417105182e+00,
+            +8.76190883237069594232e+00,
+            +5.30605288235394617618e+00,
+            +1.00000000000000000218e+00,
+    };
+
+    static const T QP[] = {
+            -1.13663838898469149931e-02,
+            -1.28252718670509318512e+00,
+            -1.95539544257735972385e+01,
+            -9.32060152123768231369e+01,
+            -1.77681167980488050595e+02,
+            -1.47077505154951170175e+02,
+            -5.14105326766599330220e+01,
+            -6.05014350600728481186e+00,
+    };
+
+    static const T QQ[] = {
+            +6.43178256118178023184e+01,
+            +8.56430025976980587198e+02,
+            +3.88240183605401609683e+03,
+            +7.24046774195652478189e+03,
+            +5.93072701187316984827e+03,
+            +2.06209331660327847417e+03,
+            +2.42005740240291393179e+02,
+    };
+
+    static const T RP[] = {
+            -4.79443220978201773821e+09,
+            +1.95617491946556577543e+12,
+            -2.49248344360967716204e+14,
+            +9.70862251047306323952e+15,
+    };
+
+    static const T RQ[] = {
+            +4.99563147152651017219e+02,
+            +1.73785401676374683123e+05,
+            +4.84409658339962045305e+07,
+            +1.11855537045356834862e+10,
+            +2.11277520115489217587e+12,
+            +3.10518229857422583814e+14,
+            +3.18121955943204943306e+16,
+            +1.71086294081043136091e+18,
+    };
+
+    if (x < T(0)) {
+        x = -x;
+    }
+
+    if (x <= T(5.0)) {
+        if (x < T(0.00001)) {
+            return T(1.0) - x * x / T(4.0);
+        }
+
+        T rp = 0.0;
+
+        for (uint8_t index = 0; index <= 3; index++) {
+            rp = rp * (x * x) + RP[index];
+        }
+
+        T rq = 0.0;
+
+        for (uint8_t index = 0; index <= 7; index++) {
+            rq = rq * (x * x) + RQ[index];
+        }
+
+        return (x * x - T(5.78318596294678452118e+00)) * (x * x - T(3.04712623436620863991e+01)) * rp / rq;
+    }
+
+    T pp = 0.0;
+
+    for (uint8_t index = 0; index <= 6; index++) {
+        pp = pp * (T(25.0) / (x * x)) + PP[index];
+    }
+
+    T pq = 0.0;
+
+    for (uint8_t index = 0; index <= 6; index++) {
+        pq = pq * (T(25.0) / (x * x)) + PQ[index];
+    }
+
+    T qp = 0.0;
+
+    for (uint8_t index = 0; index <= 7; index++) {
+        qp = qp * (T(25.0) / (x * x)) + QP[index];
+    }
+
+    T qq = 0.0;
+
+    for (uint8_t index = 0; index <= 6; index++) {
+        qq = qq * (T(25.0) / (x * x)) + QQ[index];
+    }
+
+    return (pp / pq * std::cos(x - T(0.785398163397448309615660845819875721)) - T(5.0) / x * (qp / qq) * std::sin(x - T(0.785398163397448309615660845819875721))) * T(0.797884560802865355879892119868763737) / std::sqrt(x);
+} // bessel_j0_forward(T x)
+
+template<typename T>
+static inline C10_HOST_DEVICE T bessel_j1_forward(T x) {
+    static const T PP[] = {
+            +7.62125616208173112003e-04,
+            +7.31397056940917570436e-02,
+            +1.12719608129684925192e+00,
+            +5.11207951146807644818e+00,
+            +8.42404590141772420927e+00,
+            +5.21451598682361504063e+00,
+            +1.00000000000000000254e+00,
+    };
+
+    static const T PQ[] = {
+            +5.71323128072548699714e-04,
+            +6.88455908754495404082e-02,
+            +1.10514232634061696926e+00,
+            +5.07386386128601488557e+00,
+            +8.39985554327604159757e+00,
+            +5.20982848682361821619e+00,
+            +9.99999999999999997461e-01,
+    };
+
+    static const T QP[] = {
+            +5.10862594750176621635e-02,
+            +4.98213872951233449420e+00,
+            +7.58238284132545283818e+01,
+            +3.66779609360150777800e+02,
+            +7.10856304998926107277e+02,
+            +5.97489612400613639965e+02,
+            +2.11688757100572135698e+02,
+            +2.52070205858023719784e+01,
+    };
+
+    static const T QQ[] = {
+            +7.42373277035675149943e+01,
+            +1.05644886038262816351e+03,
+            +4.98641058337653607651e+03,
+            +9.56231892404756170795e+03,
+            +7.99704160447350683650e+03,
+            +2.82619278517639096600e+03,
+            +3.36093607810698293419e+02,
+    };
+
+    static const T RP[] = {
+            -8.99971225705559398224e+08,
+            +4.52228297998194034323e+11,
+            -7.27494245221818276015e+13,
+            +3.68295732863852883286e+15,
+    };
+
+    static const T RQ[] = {
+            +6.20836478118054335476e+02,
+            +2.56987256757748830383e+05,
+            +8.35146791431949253037e+07,
+            +2.21511595479792499675e+10,
+            +4.74914122079991414898e+12,
+            +7.84369607876235854894e+14,
+            +8.95222336184627338078e+16,
+            +5.32278620332680085395e+18,
+    };
+
+    if (x < T(0.0)) {
+        return -bessel_j1_forward(-x);
+    }
+
+    if (x <= T(5.0)) {
+        T rp = 0.0;
+
+        for (uint8_t index = 0; index <= 3; index++) {
+            rp = rp * (x * x) + RP[index];
+        }
+
+        T rq = 0.0;
+
+        for (uint8_t index = 0; index <= 7; index++) {
+            rq = rq * (x * x) + RQ[index];
+        }
+
+        return rp / rq * x * (x * x - T(1.46819706421238932572e+01)) * (x * x - T(4.92184563216946036703e+01));
+    }
+
+    T pp = 0.0;
+
+    for (uint8_t index = 0; index <= 6; index++) {
+        pp = pp * (T(5.0) / x * (T(5.0) / x)) + PP[index];
+    }
+
+    T pq = 0.0;
+
+    for (uint8_t index = 0; index <= 6; index++) {
+        pq = pq * (T(5.0) / x * (T(5.0) / x)) + PQ[index];
+    }
+
+    T qp = 0.0;
+
+    for (uint8_t index = 0; index <= 7; index++) {
+        qp = qp * (T(5.0) / x * (T(5.0) / x)) + QP[index];
+    }
+
+    T qq = 0.0;
+
+    for (uint8_t index = 0; index <= 6; index++) {
+        qq = qq * (T(5.0) / x * (T(5.0) / x)) + QQ[index];
+    }
+
+    return (pp / pq * std::cos(x - T(2.356194490192344928846982537459627163)) - T(5.0) / x * (qp / qq) * std::sin(x - T(2.356194490192344928846982537459627163))) * T(0.797884560802865355879892119868763737) / std::sqrt(x);
+} // bessel_j1_forward(T x)
+
+template<typename T>
+static inline C10_HOST_DEVICE T bessel_y0_forward(T x) {
+    static const T PP[] = {
+            +7.96936729297347051624e-04,
+            +8.28352392107440799803e-02,
+            +1.23953371646414299388e+00,
+            +5.44725003058768775090e+00,
+            +8.74716500199817011941e+00,
+            +5.30324038235394892183e+00,
+            +9.99999999999999997821e-01,
+    };
+
+    static const T PQ[] = {
+            +9.24408810558863637013e-04,
+            +8.56288474354474431428e-02,
+            +1.25352743901058953537e+00,
+            +5.47097740330417105182e+00,
+            +8.76190883237069594232e+00,
+            +5.30605288235394617618e+00,
+            +1.00000000000000000218e+00,
+    };
+
+    static const T QP[] = {
+            -1.13663838898469149931e-02,
+            -1.28252718670509318512e+00,
+            -1.95539544257735972385e+01,
+            -9.32060152123768231369e+01,
+            -1.77681167980488050595e+02,
+            -1.47077505154951170175e+02,
+            -5.14105326766599330220e+01,
+            -6.05014350600728481186e+00,
+    };
+
+    static const T QQ[] = {
+            +6.43178256118178023184e+01,
+            +8.56430025976980587198e+02,
+            +3.88240183605401609683e+03,
+            +7.24046774195652478189e+03,
+            +5.93072701187316984827e+03,
+            +2.06209331660327847417e+03,
+            +2.42005740240291393179e+02,
+    };
+
+    static const T YP[] = {
+            +1.55924367855235737965e+04,
+            -1.46639295903971606143e+07,
+            +5.43526477051876500413e+09,
+            -9.82136065717911466409e+11,
+            +8.75906394395366999549e+13,
+            -3.46628303384729719441e+15,
+            +4.42733268572569800351e+16,
+            -1.84950800436986690637e+16,
+    };
+
+    static const T YQ[] = {
+            +1.04128353664259848412e+03,
+            +6.26107330137134956842e+05,
+            +2.68919633393814121987e+08,
+            +8.64002487103935000337e+10,
+            +2.02979612750105546709e+13,
+            +3.17157752842975028269e+15,
+            +2.50596256172653059228e+17,
+    };
+
+    if (x <= T(5.0)) {
+        if (x == T(0.0)) {
+            return -std::numeric_limits<T>::infinity();
+        }
+
+        if (x < T(0.0)) {
+            return std::numeric_limits<T>::quiet_NaN();
+        }
+
+        T yp = 0.0;
+
+        for (uint8_t index = 0; index <= 7; index++) {
+            yp = yp * (x * x) + YP[index];
+        }
+
+        T yq = 0.0;
+
+        for (uint8_t index = 0; index <= 6; index++) {
+            yq = yq * (x * x) + YQ[index];
+        }
+
+        return yp / yq + (T(0.636619772367581343075535053490057448) * std::log(x) * bessel_j0_forward(x));
+    }
+
+    T pp = 0.0;
+
+    for (uint8_t index = 0; index <= 6; index++) {
+        pp = pp * (T(25.0) / (x * x)) + PP[index];
+    }
+
+    T pq = 0.0;
+
+    for (uint8_t index = 0; index <= 6; index++) {
+        pq = pq * (T(25.0) / (x * x)) + PQ[index];
+    }
+
+    T qp = 0.0;
+
+    for (uint8_t index = 0; index <= 7; index++) {
+        qp = qp * (T(25.0) / (x * x)) + QP[index];
+    }
+
+    T qq = 0.0;
+
+    for (uint8_t index = 0; index <= 6; index++) {
+        qq = qq * (T(25.0) / (x * x)) + QQ[index];
+    }
+
+    return (pp / pq * std::sin(x - T(0.785398163397448309615660845819875721)) + T(5.0) / x * (qp / qq) * std::cos(x - T(0.785398163397448309615660845819875721))) * T(0.797884560802865355879892119868763737) / std::sqrt(x);
+} // bessel_y0_forward(T x)
+
+template<typename T>
+static inline C10_HOST_DEVICE T bessel_y1_forward(T x) {
+    static const T PP[] = {
+            +7.62125616208173112003e-04,
+            +7.31397056940917570436e-02,
+            +1.12719608129684925192e+00,
+            +5.11207951146807644818e+00,
+            +8.42404590141772420927e+00,
+            +5.21451598682361504063e+00,
+            +1.00000000000000000254e+00,
+    };
+
+    static const T PQ[] = {
+            +5.71323128072548699714e-04,
+            +6.88455908754495404082e-02,
+            +1.10514232634061696926e+00,
+            +5.07386386128601488557e+00,
+            +8.39985554327604159757e+00,
+            +5.20982848682361821619e+00,
+            +9.99999999999999997461e-01,
+    };
+
+    static const T QP[] = {
+            +5.10862594750176621635e-02,
+            +4.98213872951233449420e+00,
+            +7.58238284132545283818e+01,
+            +3.66779609360150777800e+02,
+            +7.10856304998926107277e+02,
+            +5.97489612400613639965e+02,
+            +2.11688757100572135698e+02,
+            +2.52070205858023719784e+01,
+    };
+
+    static const T QQ[] = {
+            +7.42373277035675149943e+01,
+            +1.05644886038262816351e+03,
+            +4.98641058337653607651e+03,
+            +9.56231892404756170795e+03,
+            +7.99704160447350683650e+03,
+            +2.82619278517639096600e+03,
+            +3.36093607810698293419e+02,
+    };
+
+    static const T YP[] = {
+            +1.26320474790178026440e+09,
+            -6.47355876379160291031e+11,
+            +1.14509511541823727583e+14,
+            -8.12770255501325109621e+15,
+            +2.02439475713594898196e+17,
+            -7.78877196265950026825e+17,
+    };
+
+    static const T YQ[] = {
+            +5.94301592346128195359e+02,
+            +2.35564092943068577943e+05,
+            +7.34811944459721705660e+07,
+            +1.87601316108706159478e+10,
+            +3.88231277496238566008e+12,
+            +6.20557727146953693363e+14,
+            +6.87141087355300489866e+16,
+            +3.97270608116560655612e+18,
+    };
+
+    if (x <= T(5.0)) {
+        if (x == T(0.0)) {
+            return -std::numeric_limits<T>::infinity();
+        }
+
+        if (x <= T(0.0)) {
+            return std::numeric_limits<T>::quiet_NaN();
+        }
+
+        T yp = 0.0;
+
+        for (uint8_t index = 0; index <= 5; index++) {
+            yp = yp * (x * x) + YP[index];
+        }
+
+        T yq = 0.0;
+
+        for (uint8_t index = 0; index <= 7; index++) {
+            yq = yq * (x * x) + YQ[index];
+        }
+
+        return x * (yp / yq) + (T(0.636619772367581343075535053490057448) * (bessel_j1_forward(x) * std::log(x) - T(1.0) / x));
+    }
+
+    T pp = 0.0;
+
+    for (uint8_t index = 0; index <= 6; index++) {
+        pp = pp * (T(5.0) / x * (T(5.0) / x)) + PP[index];
+    }
+
+    T pq = 0.0;
+
+    for (uint8_t index = 0; index <= 6; index++) {
+        pq = pq * (T(5.0) / x * (T(5.0) / x)) + PQ[index];
+    }
+
+    T qp = 0.0;
+
+    for (uint8_t index = 0; index <= 7; index++) {
+        qp = qp * (T(5.0) / x * (T(5.0) / x)) + QP[index];
+    }
+
+    T qq = 0.0;
+
+    for (uint8_t index = 0; index <= 6; index++) {
+        qq = qq * (T(5.0) / x * (T(5.0) / x)) + QQ[index];
+    }
+
+    return (pp / pq * std::sin(x - T(2.356194490192344928846982537459627163)) + T(5.0) / x * (qp / qq) * std::cos(x - T(2.356194490192344928846982537459627163))) * T(0.797884560802865355879892119868763737) / std::sqrt(x);
+} // bessel_y1_forward(T x)
+
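+// The Chebyshev family below (T, U, V, W) shares one structure: handle the
+// endpoints |x| = 1 exactly, fall back to the trigonometric definition
+// (e.g. T_n(x) = cos(n*acos(x))) for larger n inside (-1, 1), and otherwise
+// run the three-term recurrence r = 2*x*q - p from the first two members
+// of the family.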
+template<typename T>
+static inline C10_HOST_DEVICE T chebyshev_polynomial_t_forward(T x, int64_t n) {
+    if (n < 0) {
+        return T(0.0);
+    }
+
+    if (std::abs(x) == T(1.0)) {
+        if (x > T(0.0) || n % 2 == 0) {
+            return T(1.0);
+        }
+
+        return T(-1.0);
+    }
+
+    if ((n > 6) && (std::abs(x) < T(1.0))) {
+        return std::cos(n * std::acos(x));
+    }
+
+    if (n == 0) {
+        return T(1.0);
+    }
+
+    if (n == 1) {
+        return x;
+    }
+
+    T p = T(1.0);
+    T q = x;
+    T r;
+
+    for (int64_t k = 2; k <= n; k++) {
+        r = (x + x) * q - p;
+        p = q;
+        q = r;
+    }
+
+    return r;
+} // chebyshev_polynomial_t_forward(T x, int64_t n)
+
+template<typename T>
+static inline C10_HOST_DEVICE T chebyshev_polynomial_t_forward(T x, T n) {
+    return chebyshev_polynomial_t_forward(x, static_cast<int64_t>(n));
+} // chebyshev_polynomial_t_forward(T x, T n)
+
+template<typename T>
+static inline C10_HOST_DEVICE T chebyshev_polynomial_u_forward(T x, int64_t n) {
+    if (n < 0) {
+        return T(0.0);
+    }
+
+    if (std::abs(x) == T(1.0)) {
+        if (x > T(0.0) || n % 2 == 0) {
+            return n + 1;
+        }
+
+        return -(n + 1);
+    }
+
+    if ((n > 8) && (std::abs(x) < T(1.0))) {
+        if (std::sin(std::acos(x)) != T(0.0)) {
+            return std::sin((n + 1) * std::acos(x)) / std::sin(std::acos(x));
+        }
+
+        return (n + 1) * std::cos((n + 1) * std::acos(x)) / x;
+    }
+
+    if (n == 0) {
+        return T(1.0);
+    }
+
+    if (n == 1) {
+        return x + x;
+    }
+
+    T p = T(1.0);
+    T q = x + x;
+    T r;
+
+    for (int64_t k = 2; k <= n; k++) {
+        r = (x + x) * q - p;
+        p = q;
+        q = r;
+    }
+
+    return r;
+} // chebyshev_polynomial_u_forward(T x, int64_t n)
+
+template<typename T>
+static inline C10_HOST_DEVICE T chebyshev_polynomial_u_forward(T x, T n) {
+    return chebyshev_polynomial_u_forward(x, static_cast<int64_t>(n));
+} // chebyshev_polynomial_u_forward(T x, T n)
+
+template<typename T>
+static inline C10_HOST_DEVICE T chebyshev_polynomial_v_forward(T x, int64_t n) {
+    if (n < 0) {
+        return T(0.0);
+    }
+
+    if (std::abs(x) == T(1.0)) {
+        if (x > T(0.0)) {
+            return T(1.0);
+        }
+
+        if (n % 2 == 0) {
+            return n + n + 1;
+        }
+
+        return -(n + n + 1);
+    }
+
+    if ((n > 8) && (std::abs(x) < T(1.0))) {
+        if (std::sin(std::acos(x) / T(2.0)) != T(1.0)) {
+            return std::cos((n + T(0.5)) * std::acos(x)) / std::cos(std::acos(x) / T(2.0));
+        }
+
+        if (n % 2 == 0) {
+            return n + n + 1;
+        }
+
+        return -(n + n + 1);
+    }
+
+    if (n == 0) {
+        return T(1.0);
+    }
+
+    if (n == 1) {
+        return x + x - T(1.0);
+    }
+
+    T p = T(1.0);
+    T q = x + x - T(1.0);
+    T r;
+
+    for (int64_t k = 2; k <= n; k++) {
+        r = (x + x) * q - p;
+        p = q;
+        q = r;
+    }
+
+    return r;
+} // chebyshev_polynomial_v_forward(T x, int64_t n)
+
+template<typename T>
+static inline C10_HOST_DEVICE T chebyshev_polynomial_v_forward(T x, T n) {
+    return chebyshev_polynomial_v_forward(x, static_cast<int64_t>(n));
+} // chebyshev_polynomial_v_forward(T x, T n)
+
+template<typename T>
+static inline C10_HOST_DEVICE T chebyshev_polynomial_w_forward(T x, int64_t n) {
+    if (n < 0) {
+        return T(0.0);
+    }
+
+    if (std::abs(x) == T(1.0)) {
+        if (x > T(0.0)) {
+            return n + n + 1;
+        }
+
+        if (n % 2 == 0) {
+            return T(1.0);
+        }
+
+        return T(-1.0);
+    }
+
+    if ((n > 8) && (std::abs(x) < T(1.0))) {
+        if (std::cos(std::acos(x) / T(2.0)) != T(1.0)) {
+            return std::sin((n + T(0.5)) * std::acos(x)) / std::sin(std::acos(x) / T(2.0));
+        }
+
+        if (x > T(0.0)) {
+            return n + n + 1;
+        }
+
+        if (n % 2 == 0) {
+            return T(1.0);
+        }
+
+        return T(-1.0);
+    }
+
+    if (n == 0) {
+        return T(1.0);
+    }
+
+    if (n == 1) {
+        return x + x + T(1.0);
+    }
+
+    T p = T(1.0);
+    T q = x + x + T(1.0);
+    T r;
+
+    for (int64_t k = 2; k <= n; k++) {
+        r = (x + x) * q - p;
+        p = q;
+        q = r;
+    }
+
+    return r;
+} // chebyshev_polynomial_w_forward(T x, int64_t n)
+
+template<typename T>
+static inline C10_HOST_DEVICE T chebyshev_polynomial_w_forward(T x, T n) {
+    return chebyshev_polynomial_w_forward(x, static_cast<int64_t>(n));
+} // chebyshev_polynomial_w_forward(T x, T n)
+
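+// Physicists' Hermite polynomials via the recurrence
+// H_{k+1}(x) = 2*x*H_k(x) - 2*k*H_{k-1}(x); the loop below steps its counter
+// by 2 so that the 2*k factor is already folded into the loop variable.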
+template<typename T>
+static inline C10_HOST_DEVICE T hermite_polynomial_h_forward(T x, int64_t n) {
+    if (n < 0) {
+        return T(0.0);
+    }
+
+    if (n == 0) {
+        return T(1.0);
+    }
+
+    if (n == 1) {
+        return x + x;
+    }
+
+    T p = T(1.0);
+    T q = x + x;
+    T r = T(0.0);
+
+    for (int64_t k = 2; k < n + n; k += 2) {
+        r = (x + x) * q - k * p;
+        p = q;
+        q = r;
+    }
+
+    return r;
+} // hermite_polynomial_h_forward(T x, int64_t n)
+
+template<typename T, std::enable_if_t<!std::is_floating_point<T>::value, int> = 0>
+static inline C10_HOST_DEVICE T hermite_polynomial_h_forward(T x, T n) {
+    return hermite_polynomial_h_forward(x, static_cast<int64_t>(n));
+} // hermite_polynomial_h_forward(T x, T n)
+
+template<typename T, std::enable_if_t<std::is_floating_point<T>::value, int> = 0>
+static inline C10_HOST_DEVICE T hermite_polynomial_h_forward(T x, T n) {
+    return hermite_polynomial_h_forward(x, ((!std::isinf(n)) && (!std::isnan(n))) ? static_cast<int64_t>(n) : static_cast<int64_t>(-1));
+} // hermite_polynomial_h_forward(T x, T n)
+
+template<typename T>
+static inline C10_HOST_DEVICE T hermite_polynomial_he_forward(T x, int64_t n) {
+    if (n < 0) {
+        return T(0.0);
+    }
+
+    if (n == 0) {
+        return T(1.0);
+    }
+
+    if (n == 1) {
+        return x;
+    }
+
+    T p = T(1.0);
+    T q = x;
+    T r;
+
+    for (int64_t k = 1; k < n; k++) {
+        r = x * q - k * p;
+        p = q;
+        q = r;
+    }
+
+    return r;
+} // hermite_polynomial_he_forward(T x, int64_t n)
+
+template<typename T>
+static inline C10_HOST_DEVICE T hermite_polynomial_he_forward(T x, T n) {
+    return hermite_polynomial_he_forward(x, static_cast<int64_t>(n));
+} // hermite_polynomial_he_forward(T x, T n)
+
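+// Laguerre polynomials via the recurrence
+// (k + 1) * L_{k+1}(x) = (2*k + 1 - x) * L_k(x) - k * L_{k-1}(x).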
+template<typename T>
+static inline C10_HOST_DEVICE T laguerre_polynomial_l_forward(T x, int64_t n) {
+    if (n < 0) {
+        return T(0.0);
+    }
+
+    if (std::abs(x) == T(0.0)) {
+        return T(1.0);
+    }
+
+    if (n == 0) {
+        return T(1.0);
+    }
+
+    if (n == 1) {
+        return T(1.0) - x;
+    }
+
+    T p = T(1.0);
+    T q = T(1.0) - x;
+    T r;
+
+    for (int64_t k = 1; k < n; k++) {
+        r = (((k + k) + (T(1.0) - x)) * q - k * p) / (k + 1);
+        p = q;
+        q = r;
+    }
+
+    return r;
+} // laguerre_polynomial_l_forward(T x, int64_t n)
+
+template<typename T>
+static inline C10_HOST_DEVICE T laguerre_polynomial_l_forward(T x, T n) {
+    return laguerre_polynomial_l_forward(x, static_cast<int64_t>(n));
+} // laguerre_polynomial_l_forward(T x, T n)
+
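+// Legendre polynomials via the Bonnet recurrence
+// (k + 1) * P_{k+1}(x) = (2*k + 1) * x * P_k(x) - k * P_{k-1}(x).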
+template<typename T>
+static inline C10_HOST_DEVICE T legendre_polynomial_p_forward(T x, int64_t n) {
+    if (n < 0) {
+        return T(0.0);
+    }
+
+    if (std::abs(x) == T(1.0)) {
+        if (x > T(0.0) || n % 2 == 0) {
+            return T(1.0);
+        }
+
+        return T(-1.0);
+    }
+
+    if (n == 0) {
+        return T(1.0);
+    }
+
+    if (n == 1) {
+        return x;
+    }
+
+    T p = T(1.0);
+    T q = x;
+    T r;
+
+    for (int64_t k = 1; k < n; k++) {
+        r = ((k + k + 1) * x * q - k * p) / (k + 1);
+        p = q;
+        q = r;
+    }
+
+    return r;
+} // legendre_polynomial_p_forward(T x, int64_t n)
+
+template<typename T>
+static inline C10_HOST_DEVICE T legendre_polynomial_p_forward(T x, T n) {
+    return legendre_polynomial_p_forward(x, static_cast<int64_t>(n));
+} // legendre_polynomial_p_forward(T x, T n)
+
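+// The modified Bessel routines below sum Chebyshev series (coefficient tables
+// A and B) with a Clenshaw-style recurrence a = z*q - p + c[i]; the small- and
+// large-argument series are stitched together at |x| = 8 for the I functions
+// and at x = 2 for the K functions.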
+template<typename T>
+static inline C10_HOST_DEVICE T modified_bessel_i0_forward(T x) {
+    static const T A[] = {
+            -4.41534164647933937950e-18,
+            +3.33079451882223809783e-17,
+            -2.43127984654795469359e-16,
+            +1.71539128555513303061e-15,
+            -1.16853328779934516808e-14,
+            +7.67618549860493561688e-14,
+            -4.85644678311192946090e-13,
+            +2.95505266312963983461e-12,
+            -1.72682629144155570723e-11,
+            +9.67580903537323691224e-11,
+            -5.18979560163526290666e-10,
+            +2.65982372468238665035e-09,
+            -1.30002500998624804212e-08,
+            +6.04699502254191894932e-08,
+            -2.67079385394061173391e-07,
+            +1.11738753912010371815e-06,
+            -4.41673835845875056359e-06,
+            +1.64484480707288970893e-05,
+            -5.75419501008210370398e-05,
+            +1.88502885095841655729e-04,
+            -5.76375574538582365885e-04,
+            +1.63947561694133579842e-03,
+            -4.32430999505057594430e-03,
+            +1.05464603945949983183e-02,
+            -2.37374148058994688156e-02,
+            +4.93052842396707084878e-02,
+            -9.49010970480476444210e-02,
+            +1.71620901522208775349e-01,
+            -3.04682672343198398683e-01,
+            +6.76795274409476084995e-01,
+    };
+
+    static const T B[] = {
+            -7.23318048787475395456e-18,
+            -4.83050448594418207126e-18,
+            +4.46562142029675999901e-17,
+            +3.46122286769746109310e-17,
+            -2.82762398051658348494e-16,
+            -3.42548561967721913462e-16,
+            +1.77256013305652638360e-15,
+            +3.81168066935262242075e-15,
+            -9.55484669882830764870e-15,
+            -4.15056934728722208663e-14,
+            +1.54008621752140982691e-14,
+            +3.85277838274214270114e-13,
+            +7.18012445138366623367e-13,
+            -1.79417853150680611778e-12,
+            -1.32158118404477131188e-11,
+            -3.14991652796324136454e-11,
+            +1.18891471078464383424e-11,
+            +4.94060238822496958910e-10,
+            +3.39623202570838634515e-09,
+            +2.26666899049817806459e-08,
+            +2.04891858946906374183e-07,
+            +2.89137052083475648297e-06,
+            +6.88975834691682398426e-05,
+            +3.36911647825569408990e-03,
+            +8.04490411014108831608e-01,
+    };
+
+    T p;
+    T q = 0.0;
+
+    if (std::abs(x) <= T(8.0)) {
+        T a = A[0];
+
+        for (uint8_t index = 1; index < 30; index++) {
+            p = q;
+            q = a;
+            a = ((std::abs(x) / T(2.0)) - T(2.0)) * q - p + A[index];
+        }
+
+        return std::exp(std::abs(x)) * (T(0.5) * (a - p));
+    }
+
+    T b = B[0];
+
+    for (uint8_t index = 1; index < 25; index++) {
+        p = q;
+        q = b;
+        b = (T(32.0) / std::abs(x) - T(2.0)) * q - p + B[index];
+    }
+
+    return std::exp(std::abs(x)) * (T(0.5) * (b - p)) / std::sqrt(std::abs(x));
+} // modified_bessel_i0_forward(T x)
+
+template<typename T>
+static inline C10_HOST_DEVICE T modified_bessel_i1_forward(T x) {
+    static const T A[] = {
+            +2.77791411276104639959e-18,
+            -2.11142121435816608115e-17,
+            +1.55363195773620046921e-16,
+            -1.10559694773538630805e-15,
+            +7.60068429473540693410e-15,
+            -5.04218550472791168711e-14,
+            +3.22379336594557470981e-13,
+            -1.98397439776494371520e-12,
+            +1.17361862988909016308e-11,
+            -6.66348972350202774223e-11,
+            +3.62559028155211703701e-10,
+            -1.88724975172282928790e-09,
+            +9.38153738649577178388e-09,
+            -4.44505912879632808065e-08,
+            +2.00329475355213526229e-07,
+            -8.56872026469545474066e-07,
+            +3.47025130813767847674e-06,
+            -1.32731636560394358279e-05,
+            +4.78156510755005422638e-05,
+            -1.61760815825896745588e-04,
+            +5.12285956168575772895e-04,
+            -1.51357245063125314899e-03,
+            +4.15642294431288815669e-03,
+            -1.05640848946261981558e-02,
+            +2.47264490306265168283e-02,
+            -5.29459812080949914269e-02,
+            +1.02643658689847095384e-01,
+            -1.76416518357834055153e-01,
+            +2.52587186443633654823e-01,
+    };
+
+    static const T B[] = {
+            +7.51729631084210481353e-18,
+            +4.41434832307170791151e-18,
+            -4.65030536848935832153e-17,
+            -3.20952592199342395980e-17,
+            +2.96262899764595013876e-16,
+            +3.30820231092092828324e-16,
+            -1.88035477551078244854e-15,
+            -3.81440307243700780478e-15,
+            +1.04202769841288027642e-14,
+            +4.27244001671195135429e-14,
+            -2.10154184277266431302e-14,
+            -4.08355111109219731823e-13,
+            -7.19855177624590851209e-13,
+            +2.03562854414708950722e-12,
+            +1.41258074366137813316e-11,
+            +3.25260358301548823856e-11,
+            -1.89749581235054123450e-11,
+            -5.58974346219658380687e-10,
+            -3.83538038596423702205e-09,
+            -2.63146884688951950684e-08,
+            -2.51223623787020892529e-07,
+            -3.88256480887769039346e-06,
+            -1.10588938762623716291e-04,
+            -9.76109749136146840777e-03,
+            +7.78576235018280120474e-01,
+    };
+
+    T p;
+    T q = 0.0;
+
+    if (std::abs(x) <= T(8.0)) {
+        T a = A[0];
+
+        for (uint8_t index = 1; index < 29; index++) {
+            p = q;
+            q = a;
+            a = ((std::abs(x) / T(2.0)) - T(2.0)) * q - p + A[index];
+        }
+
+        if (x < T(0.0)) {
+            return -(T(0.5) * (a - p) * std::abs(x) * std::exp(std::abs(x)));
+        }
+
+        return T(0.5) * (a - p) * std::abs(x) * std::exp(std::abs(x));
+    }
+
+    T b = B[0];
+
+    for (uint8_t index = 1; index < 25; index++) {
+        p = q;
+        q = b;
+        b = (T(32.0) / std::abs(x) - T(2.0)) * q - p + B[index];
+    }
+
+    if (x < T(0.0)) {
+        return -(std::exp(std::abs(x)) * (T(0.5) * (b - p)) / std::sqrt(std::abs(x)));
+    }
+
+    return std::exp(std::abs(x)) * (T(0.5) * (b - p)) / std::sqrt(std::abs(x));
+} // modified_bessel_i1_forward(T x)
+
+template<typename T>
+static inline C10_HOST_DEVICE T modified_bessel_k0_forward(T x) {
+    static const T A[] = {
+            +1.37446543561352307156e-16,
+            +4.25981614279661018399e-14,
+            +1.03496952576338420167e-11,
+            +1.90451637722020886025e-09,
+            +2.53479107902614945675e-07,
+            +2.28621210311945178607e-05,
+            +1.26461541144692592338e-03,
+            +3.59799365153615016266e-02,
+            +3.44289899924628486886e-01,
+            -5.35327393233902768720e-01,
+    };
+
+    static const T B[] = {
+            +5.30043377268626276149e-18,
+            -1.64758043015242134646e-17,
+            +5.21039150503902756861e-17,
+            -1.67823109680541210385e-16,
+            +5.51205597852431940784e-16,
+            -1.84859337734377901440e-15,
+            +6.34007647740507060557e-15,
+            -2.22751332699166985548e-14,
+            +8.03289077536357521100e-14,
+            -2.98009692317273043925e-13,
+            +1.14034058820847496303e-12,
+            -4.51459788337394416547e-12,
+            +1.85594911495471785253e-11,
+            -7.95748924447710747776e-11,
+            +3.57739728140030116597e-10,
+            -1.69753450938905987466e-09,
+            +8.57403401741422608519e-09,
+            -4.66048989768794782956e-08,
+            +2.76681363944501510342e-07,
+            -1.83175552271911948767e-06,
+            +1.39498137188764993662e-05,
+            -1.28495495816278026384e-04,
+            +1.56988388573005337491e-03,
+            -3.14481013119645005427e-02,
+            +2.44030308206595545468e+00,
+    };
+
+    if (x == T(0.0)) {
+        return std::numeric_limits<T>::infinity();
+    }
+
+    if (x < T(0.0)) {
+        return std::numeric_limits<T>::quiet_NaN();
+    }
+
+    T p;
+    T q = 0.0;
+
+    if (x <= T(2.0)) {
+        T a = A[0];
+
+        for (uint8_t index = 1; index < 10; index++) {
+            p = q;
+            q = a;
+            a = (x * x - T(2.0)) * q - p + A[index];
+        }
+
+        return T(0.5) * (a - p) - std::log(0.5 * x) * modified_bessel_i0_forward(x);
+    }
+
+    T b = B[0];
+
+    for (uint8_t index = 1; index < 25; index++) {
+        p = q;
+        q = b;
+        b = (T(8.0) / x - T(2.0)) * q - p + B[index];
+    }
+
+    return std::exp(-x) * (T(0.5) * (b - p)) / std::sqrt(x);
+} // modified_bessel_k0_forward(T x)
+
+template<typename T>
+static inline C10_HOST_DEVICE T modified_bessel_k1_forward(T x) {
+    static const T A[] = {
+            -7.02386347938628759343e-18,
+            -2.42744985051936593393e-15,
+            -6.66690169419932900609e-13,
+            -1.41148839263352776110e-10,
+            -2.21338763073472585583e-08,
+            -2.43340614156596823496e-06,
+            -1.73028895751305206302e-04,
+            -6.97572385963986435018e-03,
+            -1.22611180822657148235e-01,
+            -3.53155960776544875667e-01,
+            +1.52530022733894777053e+00,
+    };
+
+    static const T B[] = {
+            -5.75674448366501715755e-18,
+            +1.79405087314755922667e-17,
+            -5.68946255844285935196e-17,
+            +1.83809354436663880070e-16,
+            -6.05704724837331885336e-16,
+            +2.03870316562433424052e-15,
+            -7.01983709041831346144e-15,
+            +2.47715442448130437068e-14,
+            -8.97670518232499435011e-14,
+            +3.34841966607842919884e-13,
+            -1.28917396095102890680e-12,
+            +5.13963967348173025100e-12,
+            -2.12996783842756842877e-11,
+            +9.21831518760500529508e-11,
+            -4.19035475934189648750e-10,
+            +2.01504975519703286596e-09,
+            -1.03457624656780970260e-08,
+            +5.74108412545004946722e-08,
+            -3.50196060308781257119e-07,
+            +2.40648494783721712015e-06,
+            -1.93619797416608296024e-05,
+            +1.95215518471351631108e-04,
+            -2.85781685962277938680e-03,
+            +1.03923736576817238437e-01,
+            +2.72062619048444266945e+00,
+    };
+
+    if (x == T(0.0)) {
+        return std::numeric_limits<T>::infinity();
+    }
+
+    if (x < T(0.0)) {
+        return std::numeric_limits<T>::quiet_NaN();
+    }
+
+    T p;
+    T q = 0.0;
+
+    if (x <= T(2.0)) {
+        T a = A[0];
+
+        for (uint8_t index = 1; index < 11; index++) {
+            p = q;
+            q = a;
+            a = (x * x - T(2.0)) * q - p + A[index];
+        }
+
+        return std::log(T(0.5) * x) * modified_bessel_i1_forward(x) + T(0.5) * (a - p) / x;
+    }
+
+    T b = B[0];
+
+    for (uint8_t index = 1; index < 25; index++) {
+        p = q;
+        q = b;
+        b = (T(8.0) / x - T(2.0)) * q - p + B[index];
+    }
+
+    return std::exp(-x) * (T(0.5) * (b - p)) / std::sqrt(x);
+} // modified_bessel_k1_forward(T x)
+
+template<typename T>
+static inline C10_HOST_DEVICE T scaled_modified_bessel_k0_forward(T x) {
+    static const T A[] = {
+            +1.37446543561352307156e-16,
+            +4.25981614279661018399e-14,
+            +1.03496952576338420167e-11,
+            +1.90451637722020886025e-09,
+            +2.53479107902614945675e-07,
+            +2.28621210311945178607e-05,
+            +1.26461541144692592338e-03,
+            +3.59799365153615016266e-02,
+            +3.44289899924628486886e-01,
+            -5.35327393233902768720e-01,
+    };
+
+    static const T B[] = {
+            +5.30043377268626276149e-18,
+            -1.64758043015242134646e-17,
+            +5.21039150503902756861e-17,
+            -1.67823109680541210385e-16,
+            +5.51205597852431940784e-16,
+            -1.84859337734377901440e-15,
+            +6.34007647740507060557e-15,
+            -2.22751332699166985548e-14,
+            +8.03289077536357521100e-14,
+            -2.98009692317273043925e-13,
+            +1.14034058820847496303e-12,
+            -4.51459788337394416547e-12,
+            +1.85594911495471785253e-11,
+            -7.95748924447710747776e-11,
+            +3.57739728140030116597e-10,
+            -1.69753450938905987466e-09,
+            +8.57403401741422608519e-09,
+            -4.66048989768794782956e-08,
+            +2.76681363944501510342e-07,
+            -1.83175552271911948767e-06,
+            +1.39498137188764993662e-05,
+            -1.28495495816278026384e-04,
+            +1.56988388573005337491e-03,
+            -3.14481013119645005427e-02,
+            +2.44030308206595545468e+00,
+    };
+
+    if (x == T(0.0)) {
+        return std::numeric_limits<T>::infinity();
+    }
+
+    if (x < T(0.0)) {
+        return std::numeric_limits<T>::quiet_NaN();
+    }
+
+    T p;
+    T q = 0.0;
+
+    if (x <= T(2.0)) {
+        T a = A[0];
+
+        for (uint64_t index = 1; index < 10; index++) {
+            p = q;
+            q = a;
+            a = (x * x - T(2.0)) * q - p + A[index];
+        }
+
+        return (T(0.5) * (a - p) - std::log(T(0.5) * x) * modified_bessel_i0_forward(x)) * std::exp(x);
+    }
+
+    T b = B[0];
+
+    for (uint64_t index = 1; index < 25; index++) {
+        p = q;
+        q = b;
+        b = (T(8.0) / x - T(2.0)) * q - p + B[index];
+    }
+
+    return T(0.5) * (b - p) / std::sqrt(x);
+} // T scaled_modified_bessel_k0_forward(T x)
+
+template<typename T>
+static inline C10_HOST_DEVICE T scaled_modified_bessel_k1_forward(T x) {
+    static const T A[] = {
+            -7.02386347938628759343e-18,
+            -2.42744985051936593393e-15,
+            -6.66690169419932900609e-13,
+            -1.41148839263352776110e-10,
+            -2.21338763073472585583e-08,
+            -2.43340614156596823496e-06,
+            -1.73028895751305206302e-04,
+            -6.97572385963986435018e-03,
+            -1.22611180822657148235e-01,
+            -3.53155960776544875667e-01,
+            +1.52530022733894777053e+00,
+    };
+
+    static const T B[] = {
+            -5.75674448366501715755e-18,
+            +1.79405087314755922667e-17,
+            -5.68946255844285935196e-17,
+            +1.83809354436663880070e-16,
+            -6.05704724837331885336e-16,
+            +2.03870316562433424052e-15,
+            -7.01983709041831346144e-15,
+            +2.47715442448130437068e-14,
+            -8.97670518232499435011e-14,
+            +3.34841966607842919884e-13,
+            -1.28917396095102890680e-12,
+            +5.13963967348173025100e-12,
+            -2.12996783842756842877e-11,
+            +9.21831518760500529508e-11,
+            -4.19035475934189648750e-10,
+            +2.01504975519703286596e-09,
+            -1.03457624656780970260e-08,
+            +5.74108412545004946722e-08,
+            -3.50196060308781257119e-07,
+            +2.40648494783721712015e-06,
+            -1.93619797416608296024e-05,
+            +1.95215518471351631108e-04,
+            -2.85781685962277938680e-03,
+            +1.03923736576817238437e-01,
+            +2.72062619048444266945e+00,
+    };
+
+    if (x == T(0.0)) {
+        return std::numeric_limits<T>::infinity();
+    }
+
+    if (x < T(0.0)) {
+        return std::numeric_limits<T>::quiet_NaN();
+    }
+
+    T p;
+    T q = 0.0;
+
+    if (x <= T(2.0)) {
+        T a = A[0];
+
+        for (uint64_t index = 1; index < 11; index++) {
+            p = q;
+            q = a;
+            a = (x * x - T(2.0)) * q - p + A[index];
+        }
+
+        return (std::log(T(0.5) * x) * modified_bessel_i1_forward(x) + T(0.5) * (a - p) / x) * std::exp(x);
+    }
+
+    T b = B[0];
+
+    for (uint64_t index = 1; index < 25; index++) {
+        p = q;
+        q = b;
+        b = (T(8.0) / x - T(2.0)) * q - p + B[index];
+    }
+
+    return (T(0.5) * (b - p) / std::sqrt(x));
+} // T scaled_modified_bessel_k1_forward(T x)
+
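+// The shifted Chebyshev polynomials are the ordinary ones composed with the
+// map [0, 1] -> [-1, 1], i.e. T*_n(x) = T_n(2*x - 1); hence the repeated
+// (x + x - 1) argument in the routines below.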
+template<typename T>
+static inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_t_forward(T x, int64_t n) {
+    if (n < 0) {
+        return T(0.0);
+    }
+
+    if (x == T(1.0)) {
+        return T(1.0);
+    }
+
+    if (x == T(0.0)) {
+        if (n % 2 == 0) {
+            return T(1.0);
+        }
+
+        return T(-1.0);
+    }
+
+    if ((n > 6) && (std::abs(x + x - T(1.0)) < T(1.0))) {
+        return std::cos(n * std::acos(x + x - T(1.0)));
+    }
+
+    if (n == 0) {
+        return T(1.0);
+    }
+
+    if (n == 1) {
+        return x + x - T(1.0);
+    }
+
+    T p = T(1.0);
+    T q = x + x - T(1.0);
+    T r;
+
+    for (int64_t k = 2; k <= n; k++) {
+        r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p;
+        p = q;
+        q = r;
+    }
+
+    return r;
+} // shifted_chebyshev_polynomial_t_forward(T x, int64_t n)
+
+template<typename T>
+static inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_t_forward(T x, T n) {
+    return shifted_chebyshev_polynomial_t_forward(x, static_cast<int64_t>(n));
+} // shifted_chebyshev_polynomial_t_forward(T x, T n)
+
+template<typename T>
+static inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_u_forward(T x, int64_t n) {
+    if (n < 0) {
+        return T(0.0);
+    }
+
+    if (x == T(1.0)) {
+        return n + 1;
+    }
+
+    if (x == T(0.0)) {
+        if (n % 2 == 0) {
+            return n + 1;
+        }
+
+        return -(n + 1);
+    }
+
+    if ((n > 6) && (std::abs(x + x - T(1.0)) < T(1.0))) {
+        if (std::sin(std::acos(x + x - T(1.0))) != T(0.0)) {
+            return std::sin((n + 1) * std::acos(x + x - T(1.0))) / std::sin(std::acos(x + x - T(1.0)));
+        }
+
+        return (n + 1) * std::cos((n + 1) * std::acos(x + x - T(1.0))) / (x + x - T(1.0));
+    }
+
+    if (n == 0) {
+        return T(1.0);
+    }
+
+    if (n == 1) {
+        return x + x - T(1.0) + (x + x - T(1.0));
+    }
+
+    T p = T(1.0);
+    T q = x + x - T(1.0) + (x + x - T(1.0));
+    T r;
+
+    for (int64_t k = 2; k <= n; k++) {
+        r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p;
+        p = q;
+        q = r;
+    }
+
+    return r;
+} // shifted_chebyshev_polynomial_u_forward(T x, int64_t n)
+
+template<typename T>
+static inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_u_forward(T x, T n) {
+    return shifted_chebyshev_polynomial_u_forward(x, static_cast<int64_t>(n));
+} // shifted_chebyshev_polynomial_u_forward(T x, T n)
+
+template<typename T>
+static inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_v_forward(T x, int64_t n) {
+    if (n < 0) {
+        return T(0.0);
+    }
+
+    if (x == T(1.0)) {
+        return T(1.0);
+    }
+
+    if (x == T(0.0)) {
+        if (n % 2 == 0) {
+            return (n + n + 1);
+        }
+
+        return -(n + n + 1);
+    }
+
+    if ((n > 6) && (std::abs(x + x - T(1.0)) < T(1.0))) {
+        if (std::sin(std::acos(x + x - T(1.0)) / T(2.0)) != T(1.0)) {
+            return std::cos(((n) + T(0.5)) * std::acos(x + x - T(1.0))) / std::cos(std::acos(x + x - T(1.0)) / T(2.0));
+        }
+
+        if (n % 2 == 0) {
+            return n + n + 1;
+        }
+
+        return -(n + n + 1);
+    }
+
+    if (n == 0) {
+        return T(1.0);
+    }
+
+    if (n == 1) {
+        return x + x - T(1.0) + (x + x - T(1.0)) - T(1.0);
+    }
+
+    T p = T(1.0);
+    T q = x + x - T(1.0) + (x + x - T(1.0)) - T(1.0);
+    T r;
+
+    for (int64_t k = 2; k <= n; k++) {
+        r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p;
+        p = q;
+        q = r;
+    }
+
+    return r;
+} // shifted_chebyshev_polynomial_v_forward(T x, int64_t n)
+
+template<typename T>
+static inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_v_forward(T x, T n) {
+    return shifted_chebyshev_polynomial_v_forward(x, static_cast<int64_t>(n));
+} // shifted_chebyshev_polynomial_v_forward(T x, T n)
+
+template<typename T>
+static inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_w_forward(T x, int64_t n) {
+    if (n < 0) {
+        return T(0.0);
+    }
+
+    if (x == T(1.0)) {
+        return n + n + 1;
+    }
+
+    if (x == T(0.0)) {
+        if (n % 2 == 0) {
+            return T(1.0);
+        }
+
+        return T(-1.0);
+    }
+
+    if ((n > 4) && (std::abs(x + x - T(1.0)) < T(1.0))) {
+        if (std::cos(std::acos(x + x - T(1.0)) / T(2.0)) != T(1.0)) {
+            return std::sin((n + T(0.5)) * std::acos(x + x - T(1.0))) / std::sin(std::acos(x + x - T(1.0)) / T(2.0));
+        }
+
+        if (n % 2 == 0) {
+            return T(1.0);
+        }
+
+        return T(-1.0);
+    }
+
+    if (n == 0) {
+        return T(1.0);
+    }
+
+    if (n == 1) {
+        return x + x - T(1.0) + (x + x - T(1.0)) + T(1.0);
+    }
+
+    T p = T(1.0);
+    T q = x + x - T(1.0) + (x + x - T(1.0)) + T(1.0);
+    T r;
+
+    for (int64_t k = 2; k <= n; k++) {
+        r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p;
+        p = q;
+        q = r;
+    }
+
+    return r;
+} // shifted_chebyshev_polynomial_w_forward(T x, int64_t n)
+
+template<typename T>
+static inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_w_forward(T x, T n) {
+    return shifted_chebyshev_polynomial_w_forward(x, static_cast<int64_t>(n));
+} // shifted_chebyshev_polynomial_w_forward(T x, T n)
+
+template<typename T>
+static inline C10_HOST_DEVICE T spherical_bessel_j0_forward(T x) {
+    if (std::isinf(x)) {
+        return T(0.0);
+    }
+
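+    // For small |x|, evaluate the Maclaurin series of sin(x)/x,
+    // 1 - x^2/3! + x^4/5! - x^6/7! + ..., which avoids the 0/0 form at x = 0.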
+    if (std::abs(x) < T(0.5)) {
+        return T(1.0) + x * x * (T(-1.0) / T(6.0) + x * x * (T(1.0) / T(120.0) + x * x * (T(-1.0) / T(5040.0) + x * x * (T(1.0) / T(362880.0) + x * x * (T(-1.0) / T(39916800.0) + x * x * (T(1.0) / T(6227020800.0)))))));
+    }
+
+    return std::sin(x) / x;
+} // T spherical_bessel_j0_forward(T x)
+
+C10_CLANG_DIAGNOSTIC_POP()
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/MathBitFallThroughLists.h b/MLPY/Lib/site-packages/torch/include/ATen/native/MathBitFallThroughLists.h
new file mode 100644
index 0000000000000000000000000000000000000000..a1e84f029202bdb27e825a062a63adbcb5151d76
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/MathBitFallThroughLists.h
@@ -0,0 +1,71 @@
+#pragma once
+
+namespace at {
+// views and their in-place version ops
+#define TORCH_VIEW_FNS(m) \
+  m.impl("as_strided_", torch::CppFunction::makeFallthrough()); \
+  m.impl("detach", torch::CppFunction::makeFallthrough()); \
+  m.impl("detach_", torch::CppFunction::makeFallthrough()); \
+  m.impl("diagonal", torch::CppFunction::makeFallthrough()); \
+  m.impl("expand", torch::CppFunction::makeFallthrough()); \
+  m.impl("expand_as", torch::CppFunction::makeFallthrough()); \
+  m.impl("movedim.int", torch::CppFunction::makeFallthrough()); \
+  m.impl("movedim.intlist", torch::CppFunction::makeFallthrough()); \
+  m.impl("narrow", torch::CppFunction::makeFallthrough()); \
+  m.impl("permute", torch::CppFunction::makeFallthrough()); \
+  m.impl("select.Dimname", torch::CppFunction::makeFallthrough()); \
+  m.impl("select.int", torch::CppFunction::makeFallthrough()); \
+  m.impl("squeeze", torch::CppFunction::makeFallthrough()); \
+  m.impl("squeeze_", torch::CppFunction::makeFallthrough()); \
+  m.impl("transpose.int", torch::CppFunction::makeFallthrough()); \
+  m.impl("transpose.Dimname", torch::CppFunction::makeFallthrough()); \
+  m.impl("transpose_", torch::CppFunction::makeFallthrough()); \
+  m.impl("t", torch::CppFunction::makeFallthrough()); \
+  m.impl("t_", torch::CppFunction::makeFallthrough()); \
+  m.impl("real", torch::CppFunction::makeFallthrough()); \
+  m.impl("imag", torch::CppFunction::makeFallthrough()); \
+  m.impl("view_as_real", torch::CppFunction::makeFallthrough()); \
+  m.impl("unflatten.int", torch::CppFunction::makeFallthrough()); \
+  m.impl("unflatten.Dimname", torch::CppFunction::makeFallthrough()); \
+  m.impl("unfold", torch::CppFunction::makeFallthrough()); \
+  m.impl("unsqueeze", torch::CppFunction::makeFallthrough()); \
+  m.impl("unsqueeze_", torch::CppFunction::makeFallthrough()); \
+  m.impl("view_as", torch::CppFunction::makeFallthrough()); \
+  m.impl("unbind.int", torch::CppFunction::makeFallthrough()); \
+  m.impl("unbind.Dimname", torch::CppFunction::makeFallthrough()); \
+  m.impl("split.Tensor", torch::CppFunction::makeFallthrough()); \
+  m.impl("split_with_sizes", torch::CppFunction::makeFallthrough()); \
+  m.impl("swapaxes", torch::CppFunction::makeFallthrough()); \
+  m.impl("swapdims", torch::CppFunction::makeFallthrough()); \
+  m.impl("chunk", torch::CppFunction::makeFallthrough()); \
+  m.impl("reshape", torch::CppFunction::makeFallthrough()); \
+  m.impl("alias", torch::CppFunction::makeFallthrough()); \
+  m.impl("hsplit.int", torch::CppFunction::makeFallthrough()); \
+  m.impl("hsplit.array", torch::CppFunction::makeFallthrough()); \
+  m.impl("dsplit.int", torch::CppFunction::makeFallthrough()); \
+  m.impl("dsplit.array", torch::CppFunction::makeFallthrough()); \
+  m.impl("vsplit.int", torch::CppFunction::makeFallthrough()); \
+  m.impl("vsplit.array", torch::CppFunction::makeFallthrough()); \
+  m.impl("conj", torch::CppFunction::makeFallthrough()); \
+  m.impl("_conj", torch::CppFunction::makeFallthrough()); \
+  m.impl("_unsafe_view", torch::CppFunction::makeFallthrough()); \
+  m.impl("resize_", torch::CppFunction::makeFallthrough());
+
+#define TENSOR_UTILITIES_AND_CONSTRUCTORS(m) \
+  m.impl("empty_like", torch::CppFunction::makeFallthrough()); \
+  m.impl("empty.memory_format", torch::CppFunction::makeFallthrough()); \
+  m.impl("empty.out", torch::CppFunction::makeFallthrough()); \
+  m.impl("empty_strided", torch::CppFunction::makeFallthrough()); \
+  m.impl("full_like", torch::CppFunction::makeFallthrough()); \
+  m.impl("stride.int", torch::CppFunction::makeFallthrough()); \
+  m.impl("stride.Dimname", torch::CppFunction::makeFallthrough()); \
+  m.impl("size.int", torch::CppFunction::makeFallthrough()); \
+  m.impl("size.Dimname", torch::CppFunction::makeFallthrough()); \
+  m.impl("is_complex", torch::CppFunction::makeFallthrough()); \
+  m.impl("is_floating_point", torch::CppFunction::makeFallthrough()); \
+  m.impl("requires_grad_", torch::CppFunction::makeFallthrough());
+}
+
+#define TORCH_VIEW_FNS_NATIVE_FN_REGISTRATION(m) \
+  m.impl("as_strided", torch::CppFunction::makeFallthrough()); \
+  m.impl("view", torch::CppFunction::makeFallthrough());
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/MathBitsFallback.h b/MLPY/Lib/site-packages/torch/include/ATen/native/MathBitsFallback.h
new file mode 100644
index 0000000000000000000000000000000000000000..add960c5b687695fd089c63bfe8ec21a0510cf2d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/MathBitsFallback.h
@@ -0,0 +1,157 @@
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include 
+#else
+#include 
+
+#include 
+#endif
+
+namespace at::native {
+// This fallback should only be used for operations that are self inverse and have a corresponding tensor
+// bit (internally implemented using DispatchKey) to maintain the state on tensor using tensor bit.
+// Currently there are two tensor bits that trigger this fallback: conjugate bit and negative bit.
+// Conjugate bit is set on a tensor when `.conj()` is called and neg bit is set on a tensor when `.conj().imag` is called.
+
+// NOTE: To use this fallback, `clone` and `copy_` should fully understand and be able to correctly handle the semantic of your math bit.
+struct MathOpFallback {
+  MathOpFallback(DispatchKey key_, string op_name_) : key(key_), op_name(std::move(op_name_)) {}
+  virtual bool is_bit_set(const Tensor&) = 0;
+  void fallback_impl(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, torch::jit::Stack* stack) {
+    /*
+      Situations to handle:
+        1. Out-of-place operation.  Easy: materialize all inputs and
+          call it a day.
+        2. Inplace operation.  Desugar x.add_(2) into x.conj_().add_(2).conj_().
+          Materialize other inputs as in (1).
+        3. out= operation.  Desugar add(x, 2, out=y) into y.copy_(add(x, 2))
+        Materialize other inputs as in (1).
+
+        It is important to be able to tell if we READ from an argument and if we
+        WRITE to an argument.  Conservative approach is to assume that we always
+        READ from an argument, but in out= operations you can skip
+        conjugating inputs on entry that never get used. In the current schema we
+        can't easily tell if the operation is an in-place or out= operation.
+
+        Note:
+        1. Mutable tensorlists containing tensors whose math bit is set to true are disallowed.
+        2. Mutable tensors with math bit set to true are unconditionally cloned to ensure
+           correct behavior in the case when the mutable tensor shares memory with non mutable arguments.
+
+           If we were to in-place resolve the math bit for mutable inputs, then the non-mutable inputs sharing partial or full memory
+           with these mutable inputs would read into wrong values in the following cases:
+           1. Non mutable inputs have their math bit set to false.
+           2. Math bit for mutable input(s) is resolved before the non mutable inputs (with bit set to true and sharing memory
+              with one or more mutable arg(s)) are cloned.
+           At the end, the final value of the mutable arguments from the stack are copied into the original input mutable tensor inputs.
+    */
+    const auto& arguments = op.schema().arguments();
+    const auto num_arguments = arguments.size();
+    const auto stack_start = stack->size() - num_arguments;
+
+    c10::optional<bool> is_write;
+    for (const auto i : c10::irange(num_arguments)) {
+      // Three possible states:
+      // 1. alias_info has no value --> out-of-place operation
+      // 2. alias_info does have a value, alias_info->is_write=True --> in-place or out= operation
+      // 3. alias_info does have a value, alias_info->is_write=False --> view operation
+      const AliasInfo* alias_info = arguments[i].alias_info();
+      if (alias_info != nullptr) {
+        if (is_write.has_value()) {
+          TORCH_CHECK(*is_write == alias_info->isWrite(),
+            "Unsupported operator for ", op_name, " fallback: ", op.schema().name(),
+            op_name, " fallback doesn't work for operators with a mix "
+            "mutable and non-mutable inputs that alias with outputs, "
+            "this must be implemented manually.  "
+            "If you got this error on a core op, please report a bug to PyTorch.");
+        } else {
+          is_write = alias_info->isWrite();
+        }
+      }
+    }
+
+    if (is_write.has_value() && !*is_write) {
+      // We assume that view operators automatically handle the math bit
+      // correctly by propagating the dispatch key in key_set.
+      // This is not necessarily always right, so you should test these cases.
+      op.redispatchBoxed(dispatch_keys & c10::DispatchKeySet(DispatchKeySet::FULL_AFTER, key), stack);
+      return;
+    }
+
+    // Mutable inputs with math bit set to True and their clones
+    std::vector<std::pair<Tensor, Tensor>> mutable_inputs_with_their_clones;
+    for (const auto i : c10::irange(num_arguments)) {
+      auto& ivalue = (*stack)[stack_start + i];
+      if (!(ivalue.isTensor() || ivalue.isTensorList())) {
+        continue;
+      }
+      const auto& argument = arguments[i];
+      bool mut_arg = false;
+      if (argument.alias_info()) {
+        // Was already tested by is_write loop above
+        TORCH_INTERNAL_ASSERT_DEBUG_ONLY(argument.alias_info()->isWrite());
+        mut_arg = true;
+      }
+      if (ivalue.isTensor()) {
+        if (!is_bit_set(ivalue.toTensor())) {
+          continue;
+        }
+        auto tensor = std::move(ivalue).toTensor();
+        auto resolved_tensor = at::clone(tensor);
+        if (mut_arg) {
+          TORCH_CHECK(mutable_inputs_with_their_clones.empty(), op_name, " fallback does not support operators with more than one mutable tensor with ",
+            op_name, " bit set to true.");
+          mutable_inputs_with_their_clones.emplace_back(std::move(tensor), resolved_tensor);
+        }
+        (*stack)[stack_start + i] = std::move(resolved_tensor);
+      } else if (ivalue.isTensorList()) {
+        auto tensors = std::move(ivalue).toTensorList();
+        for(const auto j : c10::irange(tensors.size())) {
+          const auto& tensor = tensors[j];
+          if (!is_bit_set(tensor)) {
+            continue;
+          }
+          TORCH_CHECK(!mut_arg, " fallback doesn't currently support mutable TensorLists with ",
+              op_name, " inputs. Please materialize all the ", op_name, " input tensor(s) in the mutable TensorList inputs before calling ",
+              op.schema().name());
+          tensors[j] = at::clone(tensor);
+        }
+        (*stack)[stack_start + i] = std::move(tensors);
+      }
+    }
+
+    op.redispatchBoxed(dispatch_keys & c10::DispatchKeySet(DispatchKeySet::FULL_AFTER, key), stack);
+
+    TORCH_INTERNAL_ASSERT(mutable_inputs_with_their_clones.size() <= 1);
+
+    for (std::pair<Tensor, Tensor> mut_tensors : mutable_inputs_with_their_clones) {
+      auto& mutable_input =  mut_tensors.first;
+      auto& cloned_mutable_input =  mut_tensors.second;
+      auto& ivalue = (*stack)[stack_start];
+      auto returned_output = std::move(ivalue).toTensor();
+
+      // sanity check to ensure that the tensor in stack aliases the cloned_mutable_input
+      TORCH_INTERNAL_ASSERT(cloned_mutable_input.is_same(returned_output));
+
+      // necessary for out= arg
+      at::native::resize_output(mutable_input, returned_output.sizes());
+
+      mutable_input.copy_(returned_output);
+      (*stack)[stack_start] = std::move(mutable_input);
+    }
+  }
+
+  virtual ~MathOpFallback() = default;
+
+  DispatchKey key;
+  string op_name;
+};
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/MaxPooling.h b/MLPY/Lib/site-packages/torch/include/ATen/native/MaxPooling.h
new file mode 100644
index 0000000000000000000000000000000000000000..89a1ff7080deb3d91e2c1784af0942cd423beae6
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/MaxPooling.h
@@ -0,0 +1,97 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+namespace at::native {
+
+static void check_max_pool1d(
+    const Tensor& self,
+    IntArrayRef kernel_size,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    IntArrayRef dilation,
+    bool ceil_mode) {
+
+  TORCH_CHECK(
+      self.dim() == 2 || self.dim() == 3,
+      "max_pool1d() Expected 2D or 3D input tensor, but got ", self.sym_sizes());
+  TORCH_CHECK(
+      kernel_size.size() == 1,
+      "max_pool1d() kernel_size must be an int, list of ints or tuple of ints of size 1 but got size ",
+      kernel_size.size());
+  TORCH_CHECK(
+      stride.empty() || stride.size() == 1,
+      "max_pool1d() stride must be None, an int, list of ints, or tuple of ints of size 1 but got size ",
+      stride.size());
+  TORCH_CHECK(
+      padding.size() == 1,
+      "max_pool1d() padding must be an int, list of ints, or tuple of ints of size 1 but got size ",
+      padding.size());
+  TORCH_CHECK(
+      dilation.size() == 1,
+      "max_pool1d() dilation must be an int, list of ints or tuple of ints of size 1 but got size ",
+      dilation.size());
+
+  // If stride=None then set it to kernel_size
+  if (stride.empty()) {
+    stride = kernel_size;
+  }
+
+  TORCH_CHECK(
+      kernel_size[0] > 0,
+      "max_pool1d() kernel_size must be greater than zero, but got ",
+      kernel_size[0]);
+  TORCH_CHECK(
+      stride[0] > 0, "max_pool1d() stride must be greater than zero, but got ", stride[0]);
+  TORCH_CHECK(
+      padding[0] >= 0, "max_pool1d() padding must be non-negative, but got ", padding[0]);
+  TORCH_CHECK(
+      padding[0] <= kernel_size[0] / 2,
+      "max_pool1d() padding should be at most half of kernel size, but got padding=",
+      padding[0],
+      " and kernel_size=",
+      kernel_size[0]);
+  TORCH_CHECK(
+      dilation[0] > 0, "max_pool1d() dilation must be greater than zero, but got ", dilation[0]);
+
+  const int64_t OW = pooling_output_shape(self.sym_size(-1).guard_int(__FILE__, __LINE__), kernel_size[0], padding[0], stride[0], dilation[0], ceil_mode);
+  TORCH_CHECK(OW > 0, "max_pool1d() Invalid computed output size: ", OW);
+}
+
+// TODO(Heitor) Template by dimension
+struct PoolingParams1D {
+  int64_t NB; // Number of batches
+  int64_t NC; // Number of channels
+  int64_t IW; // Input width
+  int64_t OW; // Output width
+  int64_t KW; // Kernel width
+  int64_t SJ; // Column stride
+  int64_t PJ; // Column padding
+  int64_t DJ; // Column dilation
+
+  // Return index of input element for the given kernel and output index
+  inline int64_t index(int64_t kj, int64_t oj) const {
+    return oj * SJ + kj * DJ - PJ;
+  }
+
+  // Return index of first output within bounds for this kernel index
+  inline int64_t valid_output_start(int64_t kj) const {
+    int64_t ij = index(kj, 0);
+    return ij < 0 ? at::divup(-ij, SJ) : 0;
+  }
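+  // Example: with SJ = 2, PJ = 1, DJ = 1 and kj = 0, index(0, 0) == -1 falls to
+  // the left of the input, so valid_output_start(0) == divup(1, 2) == 1 skips
+  // that first output column for this kernel tap.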
+
+  // Return index one past last output within bounds for this kernel index
+  inline int64_t valid_output_end(int64_t kj) const {
+    int64_t ij = index(kj, OW - 1);
+    return ij >= IW ? OW - at::divup(ij - (IW - 1), SJ) : OW;
+  }
+};
+
+using pooling_fn = void (*)(Tensor&, const Tensor&, const PoolingParams1D&);
+
+DECLARE_DISPATCH(pooling_fn, max_pool1d_stub);
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/NonEmptyUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/NonEmptyUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..26cb65d844b4f0e1d88a45712159d18c0312ab73
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/NonEmptyUtils.h
@@ -0,0 +1,27 @@
+#include 
+#include 
+#include 
+
+namespace at::native {
+
+inline int64_t ensure_nonempty_dim(int64_t dim) {
+  return std::max(dim, 1);
+}
+
+inline int64_t ensure_nonempty_size(const TensorBase &t, int64_t dim) {
+  return t.dim() == 0 ? 1 : t.size(dim);
+}
+
+inline int64_t ensure_nonempty_stride(const TensorBase &t, int64_t dim) {
+  return t.dim() == 0 ? 1 : t.stride(dim);
+}
+
+using IdxVec = std::vector<int64_t>;
+inline IdxVec ensure_nonempty_vec(IdxVec vec) {
+  if (vec.empty()) {
+    vec.push_back(1);
+  }
+  return vec;
+}
+
+}  // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/NonSymbolicBC.h b/MLPY/Lib/site-packages/torch/include/ATen/native/NonSymbolicBC.h
new file mode 100644
index 0000000000000000000000000000000000000000..bbb4b0f7babdcf6ae263b40bc9f387bf9f7a6361
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/NonSymbolicBC.h
@@ -0,0 +1,26 @@
+#pragma once
+#include 
+#include 
+#include 
+
+namespace at::native {
+// This file contains non-symbolic signatures for ops that we have sym-intified the signature of.
+// However, in certain cases (such as static runtime), we call the native versions of the ops directly.
+// In those cases, we will duplicate the signature here with non-symbolic ints, and also duplicate the C++ implementation.
+TORCH_API at::Tensor reshape(const at::Tensor& self, at::IntArrayRef proposed_shape);
+TORCH_API at::Tensor narrow(const at::Tensor& self, int64_t dim, int64_t start, int64_t length);
+TORCH_API at::Tensor _sparse_coo_tensor_unsafe(const at::Tensor & indices, const at::Tensor & values, at::IntArrayRef size, c10::optional<at::ScalarType> dtype=c10::nullopt, c10::optional<at::Layout> layout=c10::nullopt, c10::optional<at::Device> device=c10::nullopt, c10::optional<bool> pin_memory=c10::nullopt, c10::optional<bool> is_coalesced=c10::nullopt);
+TORCH_API at::Tensor nll_loss(const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor>& weight_opt, int64_t reduction, int64_t ignore_index);
+TORCH_API at::Tensor nll_loss2d(const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor>& weight_opt, int64_t reduction, int64_t ignore_index);
+// The below ops don't get a duplicated C++ implementation.
+// They are backward ops, which make them very unlikely to be called directly
+// by external code (at::native::trace_backward).
+// They get their own declaration for BC purposes however.
+TORCH_API at::Tensor _embedding_bag_backward(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, bool sparse, const c10::optional<at::Tensor> & per_sample_weights, int64_t padding_idx=-1);
+TORCH_API at::Tensor _embedding_bag_sparse_backward(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, const at::Tensor & bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const c10::optional<at::Tensor> & per_sample_weights, int64_t padding_idx=-1);
+TORCH_API at::Tensor value_selecting_reduction_backward(const at::Tensor & grad, int64_t dim, const at::Tensor & indices, at::IntArrayRef sizes, bool keepdim);
+TORCH_API at::Tensor trace_backward(const at::Tensor & grad, at::IntArrayRef sizes);
+TORCH_API at::Tensor index_select_backward(const at::Tensor & grad, at::IntArrayRef self_sizes, int64_t dim, const at::Tensor & index);
+TORCH_API at::Tensor select(const at::Tensor& self, int64_t dim, int64_t index);
+TORCH_API std::vector<Tensor> tensor_split(const Tensor& self, IntArrayRef indices, int64_t dim);
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/Normalization.h b/MLPY/Lib/site-packages/torch/include/ATen/native/Normalization.h
new file mode 100644
index 0000000000000000000000000000000000000000..2a983e9ab6961f764c2eb1661e5f1c2cc7c6ed61
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/Normalization.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include 
+#include 
+
+namespace at::native {
+
+using renorm_scale_factor_fn = void (*) (TensorIteratorBase& iter, double maxnorm);
+DECLARE_DISPATCH(renorm_scale_factor_fn, renorm_scale_factor_stub);
+
+}  // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/Padding.h b/MLPY/Lib/site-packages/torch/include/ATen/native/Padding.h
new file mode 100644
index 0000000000000000000000000000000000000000..ee31048f559ee1797baa62f57cade7918220b115
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/Padding.h
@@ -0,0 +1,62 @@
+#pragma once
+
+#include 
+#include 
+
+namespace at::native {
+
+using padding_fn = void (*)(const Tensor&, const Tensor&, IntArrayRef);
+
+// reflection padding
+DECLARE_DISPATCH(padding_fn, reflection_pad1d_kernel);
+DECLARE_DISPATCH(padding_fn, reflection_pad1d_backward_kernel);
+DECLARE_DISPATCH(padding_fn, reflection_pad2d_kernel);
+DECLARE_DISPATCH(padding_fn, reflection_pad2d_backward_kernel);
+DECLARE_DISPATCH(padding_fn, reflection_pad3d_kernel);
+DECLARE_DISPATCH(padding_fn, reflection_pad3d_backward_kernel);
+
+// replication padding
+DECLARE_DISPATCH(padding_fn, replication_pad1d_kernel);
+DECLARE_DISPATCH(padding_fn, replication_pad1d_backward_kernel);
+DECLARE_DISPATCH(padding_fn, replication_pad2d_kernel);
+DECLARE_DISPATCH(padding_fn, replication_pad2d_backward_kernel);
+DECLARE_DISPATCH(padding_fn, replication_pad3d_kernel);
+DECLARE_DISPATCH(padding_fn, replication_pad3d_backward_kernel);
+
+namespace padding {
+
+template <int dim>
+static inline void check_valid_input(const Tensor& input, IntArrayRef padding) {
+
+  TORCH_CHECK(padding.size() == 2 * dim,
+      "padding size is expected to be ", 2 * dim,
+      ", but got: ", padding.size());
+
+  int input_dim = input.dim();
+
+  bool is_batch_mode = input_dim == (dim + 2);
+
+  bool valid_batch_mode = is_batch_mode;
+  bool valid_non_batch_mode = !is_batch_mode;
+
+  if (is_batch_mode) {
+    // allow batch size of 0-dim.
+    for (const auto d : c10::irange(1, input_dim)) {
+      valid_batch_mode = valid_batch_mode && input.size(d) != 0;
+    }
+  } else {
+    for (const auto d : c10::irange(0, input_dim)) {
+      valid_non_batch_mode = valid_non_batch_mode && input.size(d) != 0;
+    }
+  }
+
+  // allow empty batch size but not other dimensions.
+  TORCH_CHECK(valid_batch_mode || valid_non_batch_mode,
+      "Expected ", dim + 1, "D or ", dim + 2,
+      "D (batch mode) tensor with possibly 0 batch size and other non-zero dimensions for input, but got: ",
+      input.sizes());
+}
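+// Example: for 2-D padding (dim == 2), check_valid_input expects a 4-element
+// padding list and a 3-D (C, H, W) or 4-D (N, C, H, W) input, where only the
+// batch dimension may have size zero.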
+
+} // namespace padding
+
+} // at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/PixelShuffle.h b/MLPY/Lib/site-packages/torch/include/ATen/native/PixelShuffle.h
new file mode 100644
index 0000000000000000000000000000000000000000..f2b83f4baa45859e219576775fcc59aa9aac7f53
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/PixelShuffle.h
@@ -0,0 +1,47 @@
+#include 
+#include 
+
+namespace at {
+namespace native {
+
+inline void check_pixel_shuffle_shapes(const Tensor& self, int64_t upscale_factor) {
+  TORCH_CHECK(self.dim() >= 3,
+              "pixel_shuffle expects input to have at least 3 dimensions, but got input with ",
+              self.dim(), " dimension(s)");
+  TORCH_CHECK(upscale_factor > 0,
+              "pixel_shuffle expects a positive upscale_factor, but got ",
+              upscale_factor);
+  int64_t c = self.size(-3);
+  int64_t upscale_factor_squared = upscale_factor * upscale_factor;
+  TORCH_CHECK(c % upscale_factor_squared == 0,
+              "pixel_shuffle expects its input's 'channel' dimension to be divisible by the square of "
+              "upscale_factor, but input.size(-3)=", c, " is not divisible by ", upscale_factor_squared);
+}
+
+inline void check_pixel_unshuffle_shapes(const Tensor& self, int64_t downscale_factor) {
+  TORCH_CHECK(
+      self.dim() >= 3,
+      "pixel_unshuffle expects input to have at least 3 dimensions, but got input with ",
+      self.dim(),
+      " dimension(s)");
+  TORCH_CHECK(
+      downscale_factor > 0,
+      "pixel_unshuffle expects a positive downscale_factor, but got ",
+      downscale_factor);
+  int64_t h = self.size(-2);
+  int64_t w = self.size(-1);
+  TORCH_CHECK(
+      h % downscale_factor == 0,
+      "pixel_unshuffle expects height to be divisible by downscale_factor, but input.size(-2)=",
+      h,
+      " is not divisible by ",
+      downscale_factor);
+  TORCH_CHECK(
+      w % downscale_factor == 0,
+      "pixel_unshuffle expects width to be divisible by downscale_factor, but input.size(-1)=",
+      w,
+      " is not divisible by ",
+      downscale_factor);
+}
+
+}} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/PointwiseOps.h b/MLPY/Lib/site-packages/torch/include/ATen/native/PointwiseOps.h
new file mode 100644
index 0000000000000000000000000000000000000000..785d62f8d2735f1baa3c31860b6473aa386aca43
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/PointwiseOps.h
@@ -0,0 +1,28 @@
+// Ternary and higher-order pointwise operations
+#pragma once
+
+#include 
+
+namespace c10 {
+class Scalar;
+}
+
+namespace at {
+
+struct TensorIterator;
+struct TensorIteratorBase;
+
+namespace native {
+
+using pointwise_fn = void (*)(TensorIterator&, const Scalar& scalar);
+using structured_pointwise_fn = void (*)(TensorIteratorBase&, const Scalar& scalar);
+using pointwise_fn_double = void (*)(TensorIterator&, const Scalar&, double);
+
+DECLARE_DISPATCH(structured_pointwise_fn, addcmul_stub);
+DECLARE_DISPATCH(structured_pointwise_fn, addcdiv_stub);
+DECLARE_DISPATCH(pointwise_fn_double, smooth_l1_backward_stub);
+DECLARE_DISPATCH(pointwise_fn_double, huber_backward_stub);
+DECLARE_DISPATCH(pointwise_fn, mse_backward_stub);
+
+} // namespace native
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/Pool.h b/MLPY/Lib/site-packages/torch/include/ATen/native/Pool.h
new file mode 100644
index 0000000000000000000000000000000000000000..8daa5f56b8388e9090e8e45a1b6abdcefb7e0254
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/Pool.h
@@ -0,0 +1,340 @@
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#pragma once
+
+namespace at::native {
+
+using max_pool2d_fn = void(*)(const Tensor& output, const Tensor& indices, const Tensor& input,
+    int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH);
+using max_pool2d_backward_fn = void(*)(const Tensor& grad_input, const Tensor& grad_output, const Tensor& indices);
+
+DECLARE_DISPATCH(max_pool2d_fn, max_pool2d_kernel);
+DECLARE_DISPATCH(max_pool2d_backward_fn, max_pool2d_backward_kernel);
+
+// average pooling has the same signature for forward and backward
+using avg_pool2d_fn = void(*)(const Tensor& output, const Tensor& input, int64_t kW, int64_t kH,
+    int64_t dW, int64_t dH, int64_t padW, int64_t padH, bool count_include_pad, c10::optional<int64_t> divisor_override);
+using avg_pool2d_backward_fn = void(*)(const Tensor& output, const Tensor& input, int kW, int kH,
+    int dW, int dH, int padW, int padH, bool count_include_pad, c10::optional<int64_t> divisor_override);
+
+DECLARE_DISPATCH(avg_pool2d_fn, avg_pool2d_kernel);
+DECLARE_DISPATCH(avg_pool2d_backward_fn, avg_pool2d_backward_kernel);
+
+using max_pool3d_fn = void(*)(Tensor& output, Tensor& indices, const Tensor& input,
+    int kW, int kH, int kD, int dW, int dH, int dD, int pW, int pH, int pD, int dilationW, int dilationH, int dilationD);
+using max_pool3d_backward_fn = void(*)(Tensor& grad_input, const Tensor& grad_output, const Tensor& indices);
+
+DECLARE_DISPATCH(max_pool3d_fn, max_pool3d_kernel);
+DECLARE_DISPATCH(max_pool3d_backward_fn, max_pool3d_backward_kernel);
+namespace {
+
+template <typename dest_t, typename src_t>
+static inline dest_t
+safe_downcast(src_t v)
+{
+  TORCH_CHECK(std::numeric_limits<dest_t>::min() <= v && v <= std::numeric_limits<dest_t>::max(),
+              "integer out of range");
+
+  return static_cast<dest_t>(v);
+}
+
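+// Output length for pooling with (possibly asymmetric) padding:
+//   floor((inputSize + pad_l + pad_r - dilation * (kernelSize - 1) - 1) / stride) + 1
+// with ceiling division instead when ceil_mode is set, corrected so the last
+// window still starts inside the input region.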
+template<typename T>
+static inline T pooling_output_shape_pad_lr(
+        T inputSize, T kernelSize, T pad_l, T pad_r, T stride, T dilation,
+        bool ceil_mode) {
+    T outputSize = div_rtn(
+        inputSize + pad_l + pad_r - dilation * (kernelSize - 1) - 1 +
+        (ceil_mode ? stride - 1 : 0), stride) + 1;
+    if (ceil_mode) {
+        // ensure that the last pooling starts inside the image
+        // needed to avoid problems in ceil mode
+        if ((outputSize - 1) * stride >= inputSize + pad_l) {
+          --outputSize;
+        }
+    }
+    return outputSize;
+}
+
+template<typename T>
+static inline T pooling_output_shape(
+      T inputSize, T kernelSize, T pad, T stride, T dilation, bool ceil_mode) {
+    TORCH_CHECK(stride != 0, "stride should not be zero");
+    TORCH_CHECK(pad >= 0,
+                "pad must be non-negative, but got pad: ", pad);
+    TORCH_CHECK(pad <= ((kernelSize - 1) * dilation + 1) / 2,
+                "pad should be at most half of effective kernel size, but got pad=",
+                pad, ", kernel_size=", kernelSize, " and dilation=", dilation)
+    return pooling_output_shape_pad_lr(
+        inputSize, kernelSize, pad, pad, stride, dilation, ceil_mode);
+}
+
+template <typename T>
+std::pair<T, T> _pooling_same_mode_padding_lr(
+    T inputSize, T kernelSize, T stride, T dilation) {
+  // NOTE: with strides, the output shape is ceil(inputSize/stride)
+  auto total_padding = T(dilation) * (kernelSize - 1);
+
+  // Prefer symmetric padding if possible
+  if (stride > 2 && (total_padding % 2 == 1)) {
+    // The floor in the output size calculation gives us a little wiggle room
+    auto wiggle_room = inputSize % stride - 1;
+    if (wiggle_room > 0) {
+      total_padding = total_padding - 1;
+    }
+  }
+
+  auto left = total_padding / 2;
+  return {left, total_padding - left};
+}
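+// Example: inputSize = 5, kernelSize = 3, stride = 2, dilation = 1 gives
+// total_padding = 2, split symmetrically as {1, 1}, which produces the "same"
+// output length ceil(5 / 2) = 3.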
+
+inline std::pair<int64_t, int64_t> pooling_same_mode_padding_lr(
+    int64_t inputSize, int64_t kernelSize, int64_t stride, int64_t dilation) {
+  return _pooling_same_mode_padding_lr(inputSize, kernelSize, stride, dilation);
+}
+
+inline std::pair<c10::SymInt, c10::SymInt> pooling_same_mode_padding_lr(
+    c10::SymInt inputSize, c10::SymInt kernelSize, c10::SymInt stride, c10::SymInt dilation) {
+  return _pooling_same_mode_padding_lr(std::move(inputSize), std::move(kernelSize), std::move(stride), std::move(dilation));
+}
+
+// AveragePool2d/DilatedMaxPool2d (forward)
+static inline void
+pool2d_shape_check(
+  const Tensor& input,
+  int kH, int kW, int dH, int dW, int padH, int padW, int dilationH, int dilationW,
+  int64_t nInputPlane,
+  int64_t inputHeight, int64_t inputWidth,
+  int64_t outputHeight, int64_t outputWidth, MemoryFormat memory_format)
+{
+  const int64_t ndim = input.ndimension();
+  const int64_t nOutputPlane = nInputPlane;
+
+  TORCH_CHECK(kW > 0 && kH > 0,
+              "kernel size should be greater than zero, but got ",
+              "kH: ", kH, " kW: ", kW);
+  TORCH_CHECK(dW > 0 && dH > 0,
+              "stride should be greater than zero, but got "
+              "dH: ", dH, " dW: ", dW);
+  TORCH_CHECK(dilationH > 0 && dilationW > 0,
+              "dilation should be greater than zero, but got ",
+              "dilationH: ", dilationH, " dilationW: ", dilationW);
+
+  bool valid_dims = input.size(1) != 0 && input.size(2) != 0;
+  if (memory_format == at::MemoryFormat::ChannelsLast){
+    // Expect tensor in NHWC format and allow 0-dim only for N.
+    TORCH_CHECK((ndim == 4 && valid_dims && input.size(3) != 0),
+      "Expected 4D (batch mode) tensor expected for input with channels_last layout"
+      " with optional 0 dim batch size for input, but got: ", input.sizes());
+  } else {
+    TORCH_CHECK((ndim == 3 && input.size(0) != 0 && valid_dims) ||
+      (ndim == 4 && valid_dims && input.size(3) != 0),
+      "Expected 3D or 4D (batch mode) tensor with optional 0 dim batch size for input, but got:",
+      input.sizes());
+  }
+
+  TORCH_CHECK(kW/2 >= padW && kH/2 >= padH,
+              "pad should be smaller than or equal to half of kernel size, but got ",
+              "padW = ", padW, ", padH = ", padH, ", kW = ", kW, ", kH = ", kH);
+
+  TORCH_CHECK(outputWidth >= 1 && outputHeight >= 1,
+              "Given input size: (",
+              nInputPlane, "x", inputHeight, "x", inputWidth, "). ",
+              "Calculated output size: (",
+              nOutputPlane, "x", outputHeight, "x", outputWidth, "). ",
+              "Output size is too small");
+}
+
+// DilatedMaxPool2d (backward)
+static inline void
+max_pool2d_backward_shape_check(
+  const Tensor& input,
+  const Tensor& gradOutput,
+  const Tensor& indices,
+  int kH, int kW, int dH, int dW, int padH, int padW, int dilationH, int dilationW,
+  int64_t nInputPlane,
+  int64_t inputHeight, int64_t inputWidth,
+  int64_t outputHeight, int64_t outputWidth, MemoryFormat memory_format)
+{
+  pool2d_shape_check(
+    input,
+    kH, kW, dH, dW, padH, padW, dilationH, dilationW,
+    nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, memory_format);
+
+  const int64_t ndim = input.ndimension();
+  const int64_t nOutputPlane = nInputPlane;
+
+  check_dim_size(gradOutput, ndim, ndim-3, nOutputPlane);
+  check_dim_size(gradOutput, ndim, ndim-2, outputHeight);
+  check_dim_size(gradOutput, ndim, ndim-1, outputWidth);
+
+  check_dim_size(indices, ndim, ndim-3, nOutputPlane);
+  check_dim_size(indices, ndim, ndim-2, outputHeight);
+  check_dim_size(indices, ndim, ndim-1, outputWidth);
+}
+
+// AveragePool2d (backward)
+static inline void
+avg_pool2d_backward_shape_check(
+  const Tensor& input,
+  const Tensor& gradOutput,
+  int64_t /*nbatch*/,
+  int kH, int kW, int dH, int dW, int padH, int padW,
+  int64_t nInputPlane,
+  int64_t inputHeight, int64_t inputWidth,
+  int64_t outputHeight, int64_t outputWidth,
+  MemoryFormat memory_format)
+{
+  pool2d_shape_check(
+    input,
+    kH, kW, dH, dW, padH, padW, 1, 1,
+    nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth,
+    memory_format);
+
+  const int64_t ndim = input.ndimension();
+  const int64_t nOutputPlane = nInputPlane;
+
+  check_dim_size(gradOutput, ndim, ndim-3, nOutputPlane);
+  check_dim_size(gradOutput, ndim, ndim-2, outputHeight);
+  check_dim_size(gradOutput, ndim, ndim-1, outputWidth);
+}
+
+// AveragePool3d/DilatedMaxPool3d (forward)
+static inline void
+pool3d_shape_check(
+  const Tensor& input,
+  int64_t nslices,
+  int kT, int kH, int kW,
+  int dT, int dH, int dW,
+  int pT, int pH, int pW,
+  int dilationT, int dilationH, int dilationW,
+  int64_t itime, int64_t iheight, int64_t iwidth,
+  int64_t otime, int64_t oheight, int64_t owidth,
+  const char *fn_name,
+  bool check_input_size=false)
+{
+  const int64_t ndim = input.ndimension();
+
+  TORCH_CHECK(kT > 0 && kW > 0 && kH > 0,
+              "kernel size should be greater than zero, but got ",
+              "kT: ", kT, " kH: ", kH, " kW: ", kW);
+  TORCH_CHECK(dT > 0 && dW > 0 && dH > 0,
+              "stride should be greater than zero, but got ",
+              "dT: ", dT, " dH: ", dH, " dW: ", dW);
+  TORCH_CHECK(dilationT > 0 && dilationW > 0 && dilationH > 0,
+              "dilation should be greater than zero, but got ",
+              "dilationT: ", dilationT, " dilationH: ", dilationH, " dilationW: ", dilationW);
+
+  TORCH_CHECK(ndim == 4 || ndim == 5,
+              fn_name, ": Expected 4D or 5D tensor for input, but got: ", input.sizes());
+
+  for (const auto i : c10::irange(ndim)) {
+    if (ndim == 5 && i == 0) {
+      // size of batch-dim can be 0.
+      continue;
+    }
+    TORCH_CHECK(
+        input.size(i) > 0,
+        fn_name,
+        ": Expected input's non-batch dimensions to have positive length,"
+        " but input has a shape of ",
+        input.sizes(),
+        " and non-batch dimension ",
+        input.size(i),
+        " has length zero!")
+  }
+
+  if (check_input_size) { // AveragePool3d
+    TORCH_CHECK(itime >= kT && iheight >= kH && iwidth >= kW,
+                "input image ", "(T: ", itime, " H: ", iheight, " W: ", iwidth, ") smaller than ",
+                "kernel size ", "(kT: ", kT, " kH: ", kH, " kW: ", kW, ")");
+  }
+
+  TORCH_CHECK(kT/2 >= pT && kW/2 >= pW && kH/2 >= pH,
+              "pad should be smaller than or equal to half of kernel size, but got "
+              "kT: ", kT, " kW: ", kW, " kH: ", kH, " padT: ", pT, " padW: ", pW, " padH: ", pH);
+
+  TORCH_CHECK(otime >= 1 && owidth >= 1 && oheight >= 1,
+              "Given input size: (",
+              nslices,"x", itime, "x", iheight, "x", iwidth, "). ",
+              "Calculated output size: (",
+              nslices, "x", otime, "x", oheight, "x", owidth, "). ",
+              "Output size is too small");
+}
+
+static inline void
+max_pool3d_backward_shape_check(
+  const Tensor& input,
+  const Tensor& gradOutput,
+  const Tensor& indices,
+  int64_t nslices,
+  int kT, int kH, int kW,
+  int dT, int dH, int dW,
+  int pT, int pH, int pW,
+  int dilationT, int dilationH, int dilationW,
+  int64_t itime, int64_t iheight, int64_t iwidth,
+  int64_t otime, int64_t oheight, int64_t owidth,
+  const char* fn_name)
+{
+  const int64_t ndim = input.ndimension();
+
+  pool3d_shape_check(
+    input,
+    nslices,
+    kT, kH, kW,
+    dT, dH, dW,
+    pT, pH, pW,
+    dilationT, dilationH, dilationW,
+    itime, iheight, iwidth,
+    otime, oheight, owidth, fn_name);
+
+  check_dim_size(gradOutput, ndim, ndim-4, nslices);
+  check_dim_size(gradOutput, ndim, ndim-3, otime);
+  check_dim_size(gradOutput, ndim, ndim-2, oheight);
+  check_dim_size(gradOutput, ndim, ndim-1, owidth);
+
+  check_dim_size(indices, ndim, ndim-4, nslices);
+  check_dim_size(indices, ndim, ndim-3, otime);
+  check_dim_size(indices, ndim, ndim-2, oheight);
+  check_dim_size(indices, ndim, ndim-1, owidth);
+}
+
+static inline void
+avg_pool3d_backward_shape_check(
+  const Tensor& input,
+  const Tensor& gradOutput,
+  int64_t nslices,
+  int kT, int kH, int kW,
+  int dT, int dH, int dW,
+  int pT, int pH, int pW,
+  int64_t itime, int64_t iheight, int64_t iwidth,
+  int64_t otime, int64_t oheight, int64_t owidth,
+  const char *fn_name)
+{
+  const int64_t ndim = input.ndimension();
+
+  pool3d_shape_check(
+    input,
+    nslices,
+    kT, kH, kW,
+    dT, dH, dW,
+    pT, pH, pW,
+    1, 1, 1,
+    itime, iheight, iwidth,
+    otime, oheight, owidth,
+    fn_name, true);
+
+  check_dim_size(gradOutput, ndim, ndim-4, nslices);
+  check_dim_size(gradOutput, ndim, ndim-3, otime);
+  check_dim_size(gradOutput, ndim, ndim-2, oheight);
+  check_dim_size(gradOutput, ndim, ndim-1, owidth);
+}
+
+} // anonymous namespace
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/Pow.h b/MLPY/Lib/site-packages/torch/include/ATen/native/Pow.h
new file mode 100644
index 0000000000000000000000000000000000000000..5845442f0de9102ad5c4793f23561970bc16f1fa
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/Pow.h
@@ -0,0 +1,69 @@
+#pragma once
+
+#include 
+
+namespace c10 {
+class Scalar;
+}
+
+namespace at {
+
+struct TensorIterator;
+struct TensorIteratorBase;
+
+namespace native {
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+#define HOST_DEVICE __host__ __device__
+#else
+#define HOST_DEVICE
+#endif
+
+// integral power in pytorch allows for negative exponents, giving truncated integral results.
+// e.g. since 2**-1==0.5, the truncated integral result is zero. 1**negative_exponent is the
+// only non-zero result.
+template <class T, typename std::enable_if<std::is_integral<T>::value, T>::type* = nullptr>
+static inline HOST_DEVICE __ubsan_ignore_signed_int_overflow__ T powi_impl(T a, T b) {
+  T result = 1;
+  while (b) {
+    if (b & 1) {
+       result *= a;
+    }
+    b /= 2;
+    a *= a;
+  }
+  return result;
+}
+
+template <class T, typename std::enable_if<std::is_integral<T>::value && !std::is_signed<T>::value, T>::type* = nullptr>
+static inline HOST_DEVICE T powi(T a, T b) {
+  return powi_impl(a, b);
+}
+
+template <class T, typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, T>::type* = nullptr>
+static inline HOST_DEVICE T powi(T a, T b) {
+  if ( b < 0 ) {
+      if ( a == 1 ) {
+          return 1;
+      } else if ( a == -1 ) {
+          auto negative = (-b) % static_cast<T>(2);
+          return negative ? -1 : 1;
+      } else {
+          return 0;
+      }
+  }
+  return powi_impl(a, b);
+}
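+// Examples: powi(2, 10) == 1024; powi(2, -1) == 0 (2**-1 truncates to zero);
+// powi(-1, -3) == -1, since only a base of +/-1 is non-zero under a negative exponent.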
+
+using pow_tensor_tensor_fn = void (*)(TensorIteratorBase&);
+using pow_tensor_scalar_fn = void (*)(TensorIteratorBase&, const c10::Scalar&);
+
+DECLARE_DISPATCH(pow_tensor_tensor_fn, pow_tensor_tensor_stub);
+DECLARE_DISPATCH(pow_tensor_scalar_fn, pow_tensor_scalar_stub);
+
+} // namespace native
+
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/RNN.h b/MLPY/Lib/site-packages/torch/include/ATen/native/RNN.h
new file mode 100644
index 0000000000000000000000000000000000000000..176897b560d3a71eb05a6db91b399a27d2c6f634
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/RNN.h
@@ -0,0 +1,53 @@
+#pragma once
+
+#include 
+#include 
+
+namespace at::native {
+
+using lstm_fn = void(*)(Tensor&, Tensor&, Tensor&, const Tensor&, TensorList, TensorList, bool, int64_t, double, bool, bool, bool);
+using rnn_fn = void(*)(Tensor&, Tensor&, const Tensor&, const Tensor&, TensorList, bool, int64_t, double, bool, bool, bool);
+using lstm_packed_fn = void(*)(Tensor&, Tensor&, Tensor&, const Tensor&, const Tensor&, TensorList, TensorList, bool, int64_t, double, bool, bool);
+using rnn_packed_fn = void(*)(Tensor&, Tensor&, const Tensor&, const Tensor&, const Tensor&, TensorList, bool, int64_t, double, bool, bool);
+
+DECLARE_DISPATCH(lstm_fn, lstm_cudnn_stub);
+DECLARE_DISPATCH(lstm_fn, lstm_miopen_stub);
+DECLARE_DISPATCH(lstm_fn, lstm_mkldnn_stub);
+DECLARE_DISPATCH(rnn_fn, gru_cudnn_stub);
+DECLARE_DISPATCH(rnn_fn, gru_miopen_stub);
+DECLARE_DISPATCH(rnn_fn, rnn_tanh_cudnn_stub);
+DECLARE_DISPATCH(rnn_fn, rnn_tanh_miopen_stub);
+DECLARE_DISPATCH(rnn_fn, rnn_relu_cudnn_stub);
+DECLARE_DISPATCH(rnn_fn, rnn_relu_miopen_stub);
+DECLARE_DISPATCH(lstm_packed_fn, lstm_packed_cudnn_stub);
+DECLARE_DISPATCH(lstm_packed_fn, lstm_packed_miopen_stub);
+DECLARE_DISPATCH(rnn_packed_fn, gru_packed_cudnn_stub);
+DECLARE_DISPATCH(rnn_packed_fn, gru_packed_miopen_stub);
+DECLARE_DISPATCH(rnn_packed_fn, rnn_tanh_packed_cudnn_stub);
+DECLARE_DISPATCH(rnn_packed_fn, rnn_tanh_packed_miopen_stub);
+DECLARE_DISPATCH(rnn_packed_fn, rnn_relu_packed_cudnn_stub);
+DECLARE_DISPATCH(rnn_packed_fn, rnn_relu_packed_miopen_stub);
+
+inline void check_attributes(const Tensor& input, const TensorList& params, const TensorList& hiddens, bool check_dtype=false) {
+  auto input_device = input.device();
+  auto input_dtype = input.scalar_type();
+
+  auto check_tensors = [&](const std::string& name, const Tensor& t) {
+    if (!t.defined()) return;
+    auto t_device = t.device();
+    TORCH_CHECK(input_device == t_device,
+             "Input and ", name, " tensors are not at the same device, found input tensor at ",
+             input_device, " and ", name, " tensor at ", t_device);
+    if (check_dtype) {
+      auto t_dtype = t.scalar_type();
+      TORCH_CHECK(input_dtype == t_dtype,
+               "Input and ", name, " tensors are not the same dtype, found input tensor with ",
+               input_dtype, " and ", name, " tensor with ", t_dtype);
+    }
+  };
+
+  for (const auto& h : hiddens) check_tensors("hidden", h);
+  for (const auto& p : params) check_tensors("parameter", p);
+}
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/RangeFactories.h b/MLPY/Lib/site-packages/torch/include/ATen/native/RangeFactories.h
new file mode 100644
index 0000000000000000000000000000000000000000..f39e9678f76308b3f510897b66edcf525e367b19
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/RangeFactories.h
@@ -0,0 +1,12 @@
+#include 
+#include 
+
+namespace at {
+struct TensorIterator;
+
+namespace native {
+
+DECLARE_DISPATCH(void(*)(TensorIterator&, const Scalar&, const Scalar&, const Scalar&), arange_stub);
+DECLARE_DISPATCH(void(*)(TensorIterator&, const Scalar&, const Scalar&, int64_t), linspace_stub);
+
+}}  // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/ReduceAllOps.h b/MLPY/Lib/site-packages/torch/include/ATen/native/ReduceAllOps.h
new file mode 100644
index 0000000000000000000000000000000000000000..1f0bae6179a8968fdab7aef95a1a2b33bfb0dbed
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/ReduceAllOps.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include 
+
+namespace at {
+class Tensor;
+}
+
+namespace at::native {
+
+using reduce_all_fn = void (*)(Tensor & result, const Tensor & self);
+using reduce_min_max_fn = void (*)(Tensor & max_result, Tensor & min_result, const Tensor & self);
+DECLARE_DISPATCH(reduce_all_fn, min_all_stub);
+DECLARE_DISPATCH(reduce_all_fn, max_all_stub);
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/ReduceOps.h b/MLPY/Lib/site-packages/torch/include/ATen/native/ReduceOps.h
new file mode 100644
index 0000000000000000000000000000000000000000..2bef746b4ca89d15e162e517658c8b9544c1b81d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/ReduceOps.h
@@ -0,0 +1,56 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+namespace c10 {
+class Scalar;
+}
+
+namespace at {
+struct TensorIterator;
+class Tensor;
+}
+
+namespace at::native {
+
+using reduce_fn = void(*)(TensorIterator &);
+
+DECLARE_DISPATCH(reduce_fn, sum_stub);
+DECLARE_DISPATCH(reduce_fn, nansum_stub);
+DECLARE_DISPATCH(reduce_fn, prod_stub);
+DECLARE_DISPATCH(reduce_fn, mean_stub);
+DECLARE_DISPATCH(reduce_fn, and_stub);
+DECLARE_DISPATCH(reduce_fn, or_stub);
+DECLARE_DISPATCH(reduce_fn, min_values_stub);
+DECLARE_DISPATCH(reduce_fn, max_values_stub);
+DECLARE_DISPATCH(reduce_fn, argmax_stub);
+DECLARE_DISPATCH(reduce_fn, argmin_stub);
+
+using reduce_std_var_function =
+    void (*)(TensorIterator&, double correction, bool take_sqrt);
+DECLARE_DISPATCH(reduce_std_var_function, std_var_stub);
+
+using reduce_norm_fn =
+    void (*)(Tensor&, const Tensor&, const c10::Scalar&, c10::optional<int64_t>);
+DECLARE_DISPATCH(reduce_norm_fn, norm_kernel);
+
+using reduce_fn_flag = void(*)(TensorIterator &, const c10::Scalar&);
+DECLARE_DISPATCH(reduce_fn_flag, norm_stub);
+
+using structured_cum_fn = void (*)(const Tensor&, const Tensor&, int64_t);
+using cum_fn = void (*)(Tensor&, const Tensor&, int64_t);
+DECLARE_DISPATCH(structured_cum_fn, cumsum_stub);
+DECLARE_DISPATCH(structured_cum_fn, cumprod_stub);
+DECLARE_DISPATCH(cum_fn, logcumsumexp_stub);
+
+DECLARE_DISPATCH(void (*)(const Tensor&, int64_t, bool, Tensor&, Tensor&), aminmax_stub);
+DECLARE_DISPATCH(void (*)(const Tensor&, Tensor&, Tensor&), aminmax_allreduce_stub);
+
+// Used in cuda/Normalization.cu
+TORCH_API std::tuple<Tensor&, Tensor&> var_mean_out(
+    Tensor &result1, Tensor &result2, const Tensor &self, IntArrayRef dim,
+    int64_t correction, bool keepdim);
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/ReduceOpsUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/ReduceOpsUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..01dc822cf23a3e55f836bbe8b85ab346e6c5dfe8
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/ReduceOpsUtils.h
@@ -0,0 +1,449 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include 
+#else
+#include 
+#include 
+#endif
+
+namespace at::native {
+
+// Maximum and minimum possible scalar values, including infinities
+template <typename scalar_t>
+constexpr scalar_t upper_bound() {
+  using lim = std::numeric_limits<scalar_t>;
+  return lim::has_infinity ? lim::infinity() : lim::max();
+}
+
+template <typename scalar_t>
+constexpr scalar_t lower_bound() {
+  using lim = std::numeric_limits<scalar_t>;
+  return lim::has_infinity ? -lim::infinity() : lim::lowest();
+}
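+// For example, upper_bound<float>() is +infinity while upper_bound<int64_t>()
+// is the largest representable int64_t; lower_bound() mirrors this with
+// -infinity or the lowest finite value.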
+
+static inline Tensor restride_dim(
+  const Tensor& src, int64_t dim,
+  IntArrayRef replacement_shape
+) {
+  auto strides = ensure_nonempty_vec(src.strides().vec());
+  strides[dim] = 0;
+  return src.as_strided(replacement_shape, strides);
+}
+
+inline void _dimreduce_setup(const Tensor &result, const Tensor &self,
+                                int64_t dim) {
+  IntArrayRef self_sizes = self.sizes();
+  std::vector<int64_t> result_sizes;
+  result_sizes.insert(result_sizes.end(), self_sizes.begin(), self_sizes.end());
+  result_sizes[dim] = 1;
+  result.resize_(result_sizes);
+}
+
+inline bool _dimreduce_return_trivial(const Tensor &result, const Tensor &self,
+                                      const Scalar& ident, int64_t dim, bool keepdim) {
+  if (self.numel() == 1 && self.ndimension() == 0) {
+    result.resize_({});
+    result.fill_(self);
+    return true;
+  }
+  // Return identity
+  if (self.numel() == 0) {
+    _dimreduce_setup(result, self, dim);
+    result.fill_(ident);
+    if (!keepdim) result.squeeze_(dim);
+    return true;
+  }
+  return false;
+}
+
+inline bool _dimreduce_return_trivial_no_ident(Tensor &result, const Tensor &self,
+                                               int64_t /*dim*/, bool /*keepdim*/, const char* /*fn_name*/) {
+  if (self.numel() == 1 && self.ndimension() == 0) {
+    result.resize_({});
+    result.fill_(self);
+    return true;
+  }
+
+  return false;
+}
+
+inline c10::optional<Tensor> _allreduce_return_trivial(
+    const Tensor& self,
+    const Scalar& ident) {
+  // Return identity
+  if (self.numel() == 0) {
+    return at::scalar_tensor(ident, self.options());
+  }
+  return c10::nullopt;
+}
+
+#define OPTION_TYPE_EQUALITY_CHECK(option, out, self) \
+{ \
+  TORCH_CHECK(\
+    out.option() == self.option(),\
+    "expected ", #option, " ",\
+    self.option(),\
+    " but found ", out.option())\
+}
+
+static inline void check_scalar_type_device_layout_equal(const Tensor& out, const Tensor& self) {
+  OPTION_TYPE_EQUALITY_CHECK(scalar_type, out, self);
+  OPTION_TYPE_EQUALITY_CHECK(device, out.options(), self.options());
+  OPTION_TYPE_EQUALITY_CHECK(layout, out.options(), self.options());
+}
+
+static inline Tensor integer_upcast(const Tensor& self, c10::optional<ScalarType> dtype) {
+  ScalarType scalarType = self.scalar_type();
+  TORCH_CHECK(!isBarebonesUnsignedType(scalarType), "integer upcasting for uint16, uint32 and uint64 is not currently implemented");
+  ScalarType upcast_scalarType = dtype.value_or(at::isIntegralType(scalarType, /*includeBool=*/true) ? ScalarType::Long : scalarType);
+  return self.toType(upcast_scalarType);
+}
+
+using DimMask = TensorIterator::DimMask;
+
+static DimVector make_dim_vector(OptionalIntArrayRef opt_dims, int64_t ndim) {
+  if (opt_dims.has_value()) {
+    return DimVector(opt_dims.value());
+  } else {
+    std::vector<int64_t> all_dims(ndim);
+    std::iota(all_dims.begin(), all_dims.end(), 0);
+    return DimVector(all_dims);
+  }
+}
+
+static DimMask make_dim_mask(OptionalIntArrayRef opt_dims, int64_t ndim, bool allow_empty_dims=false) {
+  DimMask mask;
+  if (opt_dims.has_value()) {
+    auto dims = opt_dims.value();
+    if (dims.empty() && !allow_empty_dims) {
+      mask = DimMask().flip();
+    } else {
+      mask = at::dim_list_to_bitset(dims, ndim);
+    }
+  } else {
+    mask = DimMask().flip();
+  }
+  return mask;
+}
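+// Example: ndim = 3 with dims = {0, 2} sets mask bits 0 and 2; an empty or
+// missing dim list selects every dimension (an all-reduce), unless
+// allow_empty_dims permits an empty mask.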
+
+inline DimVector shape_from_dim_mask(const Tensor& self, DimMask mask, bool keepdim) {
+  auto shape = DimVector(self.sizes());
+  for (int dim = shape.size() - 1; dim >= 0; dim--) {
+    if (mask[dim]) {
+      if (keepdim) {
+        shape[dim] = 1;
+      } else {
+        shape.erase(shape.begin() + dim);
+      }
+    }
+  }
+  return shape;
+}
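+// Example: self.sizes() = {2, 3, 4} reduced over dims {0, 2} yields {3} when
+// keepdim is false and {1, 3, 1} when keepdim is true.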
+
+static void resize_reduction_result(
+    Tensor& result, const Tensor& self, DimMask mask, bool keepdim,
+    ScalarType /*dtype*/)
+{
+  auto shape = shape_from_dim_mask(self, mask, keepdim);
+  TORCH_CHECK(result.defined(), "Cannot create a new tensor inside a reduction op. You likely tried to call an operator with an out argument but the out argument was an undefined tensor.");
+  at::native::resize_output(result, shape);
+}
+
+inline Tensor create_reduction_result(
+  const Tensor& self, at::OptionalIntArrayRef dim, bool keepdim, ScalarType dtype
+) {
+  DimMask mask = make_dim_mask(dim, self.dim());
+  auto shape = shape_from_dim_mask(self, mask, keepdim);
+  return at::empty(shape, self.options().dtype(dtype));
+}
+
+static Tensor review_reduce_result(const Tensor& result, int ndim, DimMask mask, bool keepdim) {
+  if (keepdim) {
+    return result;
+  }
+  auto shape = DimVector(result.sizes());
+  auto stride = DimVector(result.strides());
+  for (const auto dim : c10::irange(ndim)) {
+    if (mask[dim]) {
+      shape.insert(shape.begin() + dim, 1);
+      stride.insert(stride.begin() + dim, 0);
+    }
+  }
+  return result.as_strided(shape, stride);
+}
+
+static TensorIterator make_reduction(
+    const char* name, Tensor& result, const Tensor& self,
+    at::OptionalIntArrayRef dim_opt,
+    bool keepdim, ScalarType in_dtype, ScalarType out_dtype) {
+  // check that result type and dtype match if provided
+  TORCH_CHECK(
+      !result.defined() || result.scalar_type() == out_dtype,
+      name, ": provided dtype must match dtype of result. Got ",
+      toString(result.scalar_type()),
+      " and ",
+      toString(out_dtype),
+      ".");
+  // dim={} performs an all-reduce, same as dim=None
+  IntArrayRef dim = dim_opt.value_or(IntArrayRef{});
+  int64_t ndim = self.dim();
+  auto mask = make_dim_mask(dim, ndim);
+  resize_reduction_result(result, self, mask, keepdim, out_dtype);
+  auto viewed_result = review_reduce_result(result, ndim, mask, keepdim);
+  namedinference::propagate_names_for_reduction(result, self, dim, keepdim);
+  if (self.scalar_type() == in_dtype) {
+    return TensorIterator::reduce_op(viewed_result, self);
+  }
+  return TensorIterator::reduce_op(viewed_result, self.to(in_dtype));
+}
+
+static C10_UNUSED TensorIterator make_reduction(
+    const char* name, Tensor& result, const Tensor& self,
+    at::OptionalIntArrayRef dim, bool keepdim, ScalarType out_dtype) {
+  // special case for type promotion in mixed precision, improves computational
+  // efficiency.
+  // We don't generalize this to common mismatched input/output types to avoid cross
+  // product of templated kernel launches.
+  const bool gpu_lowp_to_f32 = (
+    self.is_cuda() && (self.scalar_type() == kHalf || self.scalar_type() == kBFloat16) && out_dtype == kFloat);
+  auto in_dtype = gpu_lowp_to_f32 ? self.scalar_type()
+                   : self.is_complex() ? c10::toComplexType(out_dtype)
+                                       : out_dtype;
+  return make_reduction(name, result, self, dim, keepdim, in_dtype, out_dtype);
+}
+
+static TensorIterator make_reduction(
+    const char* name, Tensor& result1, Tensor& result2, const Tensor& self,
+    at::OptionalIntArrayRef dim_opt, bool keepdim, ScalarType dtype1,
+    ScalarType dtype2) {
+  // check that result type and dtype match if provided
+  TORCH_CHECK(
+    (!result1.defined() || result1.scalar_type() == dtype1) && (!result2.defined() || result2.scalar_type() == dtype2),
+    name, ": provided dtype must match dtype of result. Got ",
+    toString(result1.scalar_type()), toString(result2.scalar_type()),
+    " and ",
+    toString(dtype1), toString(dtype2),
+    ".");
+
+  // dim={} performs an all-reduce, same as dim=None
+  auto dim = dim_opt.value_or(IntArrayRef{});
+  int64_t ndim = self.dim();
+  DimMask mask = make_dim_mask(dim, ndim);
+  resize_reduction_result(result1, self, mask, keepdim, dtype1);
+  auto viewed_result1 = review_reduce_result(result1, ndim, mask, keepdim);
+
+  resize_reduction_result(result2, self, mask, keepdim, dtype2);
+  auto viewed_result2 = review_reduce_result(result2, ndim, mask, keepdim);
+
+  namedinference::propagate_names_for_reduction(result1, self, dim, keepdim);
+  namedinference::propagate_names_for_reduction(result2, self, dim, keepdim);
+
+  // special case for type promotion in mixed precision, improves computational
+  // efficiency.
+  // We don't generalize this to common mismatched input/output types to avoid cross
+  // product of templated kernel launches.
+  if (self.scalar_type() == dtype1 ||
+      (self.is_cuda() && self.scalar_type() == kHalf && dtype1 == kFloat)) {
+    return TensorIterator::reduce_op(viewed_result1, viewed_result2, self);
+  }
+  return TensorIterator::reduce_op(viewed_result1, viewed_result2, self.to(dtype1));
+}
+
+static C10_UNUSED TensorIterator make_reduction(
+    const char* name, Tensor& result1, Tensor& result2, const Tensor& self,
+    at::OptionalIntArrayRef dim, bool keepdim, ScalarType dtype) {
+  return make_reduction(name, result1, result2, self, dim, keepdim, dtype, dtype);
+}
+
+static void zero_numel_check_dims(const Tensor& self, const int64_t dim, const char *fn_name) {
+  if (self.ndimension() == 0) {
+    TORCH_CHECK_INDEX(dim == 0 || dim == -1, fn_name,
+      ": Expected reduction dim -1 or 0 for scalar but got ", dim);
+  }
+  else {
+    TORCH_CHECK_INDEX(self.size(dim) != 0, fn_name,
+      ": Expected reduction dim ", dim, " to have non-zero size.");
+  }
+}
+
+static void zero_numel_check_dims(const Tensor& self, const IntArrayRef dim, const char *fn_name) {
+  TORCH_CHECK(
+    !dim.empty(),
+      fn_name, ": Expected reduction dim to be specified for input.numel() == 0. ",
+        "Specify the reduction dim with the 'dim' argument.");
+  for (const int64_t d : dim) {
+    zero_numel_check_dims(self, d, fn_name);
+  }
+}
+
+static std::vector<int64_t> get_zero_numel_tensor_size(
+    const Tensor& self,
+    const int64_t dim,
+    const bool keepdim,
+    const char* fn_name) {
+  TORCH_INTERNAL_ASSERT(self.numel() == 0,  fn_name, ": Expected self.numel() == 0.");
+  zero_numel_check_dims(self, dim, fn_name);
+  std::vector<int64_t> sizes;
+  if (keepdim) {
+    sizes = self.sizes().vec();
+    sizes[dim] = 1;
+  }
+  else {
+    for (const auto d : c10::irange(self.dim())) {
+      if (d != dim) {
+        sizes.push_back(self.sizes()[d]);
+      }
+    }
+  }
+  return sizes;
+}
+
+// Resize the result tensor and indices when result.numel() == 0 depending on values of
+// dim and keepdim for returning tensors containing reduction results.
+// This function should be called when you are reducing a zero-numel tensor and want to
+// resize the output and return it. This function exists for resizing zero-numel
+// tensors when the size of the reduction dimension is non-zero.
+static C10_UNUSED void zero_numel_tensor_resize(Tensor& result, Tensor& result_indices,
+                                     const Tensor& self, const int64_t dim,
+                                     const bool keepdim, const char *fn_name) {
+  auto sizes = get_zero_numel_tensor_size(self, dim, keepdim, fn_name);
+  at::native::resize_output(result, sizes);
+  at::native::resize_output(result_indices, sizes);
+}
+
+inline ScalarType get_dtype_from_self(
+    const Tensor& self,
+    const c10::optional<ScalarType>& dtype,
+    bool promote_integers) {
+  if (dtype.has_value()) {
+    return dtype.value();
+  }
+  ScalarType src_type = self.scalar_type();
+  if (promote_integers && at::isIntegralType(src_type, /*includeBool=*/true)) {
+    return kLong;
+  }
+  return src_type;
+}
+
+inline ScalarType get_dtype_from_result(Tensor& result, c10::optional<ScalarType> dtype) {
+  TORCH_CHECK(result.defined(), "Cannot create a new tensor inside a reduction op. You likely tried to call an operator with an out argument but the out argument was an undefined tensor.");
+  if (dtype.has_value()) {
+    return dtype.value();
+  } else {
+    return result.scalar_type();
+  }
+}
+
+
+} // namespace at::native
+
+namespace at::meta {
+
+static C10_UNUSED DimVector get_reduction_shape(
+    const Tensor& self,
+    IntArrayRef dims,
+    bool keepdim,
+    bool allow_empty_dims=false) {
+  auto mask = native::make_dim_mask(dims, self.dim(), allow_empty_dims);
+  return native::shape_from_dim_mask(self, mask, keepdim);
+}
+
+static void resize_reduction(
+    impl::MetaBase& meta,
+    const Tensor& self,
+    OptionalIntArrayRef opt_dims,
+    bool keepdim,
+    ScalarType out_dtype,
+    bool allow_empty_dims=false) {
+  DimVector dims_ = at::native::make_dim_vector(opt_dims, self.dim());
+  maybe_wrap_dims(dims_, self.dim());
+  auto shape = get_reduction_shape(self, dims_, keepdim, allow_empty_dims);
+  meta.set_output_raw_strided(0, shape, {}, self.options().dtype(out_dtype));
+  namedinference::propagate_names_for_reduction(
+      meta.maybe_get_output(), self, dims_, keepdim);
+}
+
+static void resize_reduction_with_indices(
+    impl::MetaBase& meta,
+    const Tensor& self,
+    IntArrayRef dims,
+    bool keepdim,
+    ScalarType out_dtype) {
+  DimVector dims_(dims);
+  maybe_wrap_dims(dims_, self.dim());
+  auto shape = get_reduction_shape(self, dims_, keepdim);
+  meta.set_output_raw_strided(0, shape, {}, self.options().dtype(out_dtype));
+  meta.set_output_raw_strided(1, shape, {}, self.options().dtype(kLong));
+  namedinference::propagate_names_for_reduction(
+      meta.maybe_get_output(0), self, dims_, keepdim);
+  namedinference::propagate_names_for_reduction(
+      meta.maybe_get_output(1), self, dims_, keepdim);
+}
+
+static TensorIterator make_reduction(
+    const Tensor& self,
+    const Tensor& result,
+    OptionalIntArrayRef opt_dims,
+    bool keepdim,
+    ScalarType in_dtype) {
+  int64_t ndim = self.dim();
+  auto mask = at::native::make_dim_mask(opt_dims, ndim);
+  auto viewed_result =
+      at::native::review_reduce_result(result, ndim, mask, keepdim);
+  if (self.scalar_type() == in_dtype) {
+    return TensorIterator::reduce_op(viewed_result, self);
+  }
+  return TensorIterator::reduce_op(viewed_result, self.to(in_dtype));
+}
+
+static TensorIterator make_reduction(
+    const Tensor& self,
+    const Tensor& result1,
+    const Tensor& result2,
+    IntArrayRef dims,
+    bool keepdim,
+    ScalarType dtype1,
+    ScalarType /*dtype2*/) {
+  int64_t ndim = self.dim();
+  auto mask = at::native::make_dim_mask(dims, ndim);
+  auto viewed_result1 = at::native::review_reduce_result(result1, ndim, mask, keepdim);
+  auto viewed_result2 = at::native::review_reduce_result(result2, ndim, mask, keepdim);
+  // special case for type promotion in mixed precision, improves computational efficiency.
+  // We don't generalize this to common mismatched input/output types to avoid cross product
+  // of templated kernel launches.
+  if (self.scalar_type() == dtype1 ||
+      (self.is_cuda() && self.scalar_type() == kHalf && dtype1 == kFloat)) {
+    return TensorIterator::reduce_op(viewed_result1, viewed_result2, self);
+  }
+  return TensorIterator::reduce_op(viewed_result1, viewed_result2, self.to(dtype1));
+}
+
+static C10_UNUSED TensorIterator make_reduction_from_out_ty(
+    const Tensor& self,
+    const Tensor& result,
+    OptionalIntArrayRef opt_dims,
+    bool keepdim,
+    ScalarType out_dtype) {
+  // special case for type promotion in mixed precision, improves computational
+  // efficiency.
+  // We don't generalize this to common mismatched input/output types to avoid a
+  // cross product of templated kernel launches.
+  const bool gpu_lowp_to_f32 =
+      (self.is_cuda() &&
+       (self.scalar_type() == kHalf || self.scalar_type() == kBFloat16) &&
+       out_dtype == kFloat);
+  auto in_dtype = gpu_lowp_to_f32 ? self.scalar_type() : out_dtype;
+  return make_reduction(self, result, opt_dims, keepdim, in_dtype);
+}
+
+} // namespace at::meta
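The reduction helpers above all funnel through one shape rule: build a bitmask of the reduced dims (an empty or missing dim list means an all-reduce), then either collapse each masked dim to size 1 (keepdim) or erase it. A minimal standalone sketch of that rule, written against the standard library only; the function name and types are illustrative and not part of the vendored header:

// Mirrors the shape logic of make_dim_mask / shape_from_dim_mask for plain std:: containers.
#include <bitset>
#include <cstdint>
#include <iostream>
#include <vector>

static std::vector<int64_t> reduced_shape(std::vector<int64_t> shape,
                                          const std::vector<int64_t>& dims,
                                          bool keepdim) {
  std::bitset<64> mask;
  if (dims.empty()) {
    mask.set();                               // dim=None / dim={} -> all-reduce
  } else {
    for (int64_t d : dims) mask.set(d);
  }
  for (int64_t d = static_cast<int64_t>(shape.size()) - 1; d >= 0; --d) {
    if (!mask[d]) continue;
    if (keepdim) shape[d] = 1;                // keep the dim with size 1
    else shape.erase(shape.begin() + d);      // drop the dim entirely
  }
  return shape;
}

int main() {
  for (int64_t s : reduced_shape({2, 3, 4}, {1}, /*keepdim=*/true)) std::cout << s << ' ';
  std::cout << '\n';                          // prints: 2 1 4
  for (int64_t s : reduced_shape({2, 3, 4}, {1}, /*keepdim=*/false)) std::cout << s << ' ';
  std::cout << '\n';                          // prints: 2 4
}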
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/ReductionType.h b/MLPY/Lib/site-packages/torch/include/ATen/native/ReductionType.h
new file mode 100644
index 0000000000000000000000000000000000000000..97328c227ad6cb7556acc3f9efeca4bf3a66eaf6
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/ReductionType.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include 
+
+namespace at::native {
+
+enum class ReductionType {MAX, MEAN, MIN, SUM, PROD};
+
+static inline ReductionType get_reduction_enum(const c10::string_view& reduce) {
+  if (reduce == "max" || reduce == "amax") {
+    return ReductionType::MAX;
+  } else if (reduce == "mean") {
+    return ReductionType::MEAN;
+  } else if (reduce == "min" || reduce == "amin") {
+    return ReductionType::MIN;
+  } else if (reduce == "sum") {
+    return ReductionType::SUM;
+  } else if (reduce == "prod") {
+    return ReductionType::PROD;
+  } else {
+    TORCH_CHECK(false, "reduce argument must be either sum, prod, mean, amax or amin, got ", reduce);
+  }
+}
+
+// used for `scatter_reduce`, old options for BC.
+static inline ReductionType get_operator_enum(const c10::string_view reduce, bool use_new_options) {
+  if (use_new_options) {
+    return get_reduction_enum(reduce);
+  } else {
+    if (reduce == "add") {
+      return ReductionType::SUM;
+    } else if (reduce == "multiply") {
+      return ReductionType::PROD;
+    } else {
+      TORCH_CHECK(false, "reduce argument must be either add or multiply.")
+    }
+  }
+}
+
+} // at::native
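Once the reduce string has been mapped to ReductionType, kernels typically switch on the enum to pick a combine step (MEAN is usually accumulated like SUM and divided by the count afterwards). A hedged standalone sketch of that kind of dispatch; the combine() helper is illustrative and not part of the header:

#include <algorithm>
#include <cassert>

enum class ReductionType { MAX, MEAN, MIN, SUM, PROD };

// Combine an accumulator with one more value under the chosen reduction.
double combine(ReductionType op, double acc, double x) {
  switch (op) {
    case ReductionType::MAX:  return std::max(acc, x);
    case ReductionType::MIN:  return std::min(acc, x);
    case ReductionType::SUM:
    case ReductionType::MEAN: return acc + x;   // MEAN divides by the count at the end
    case ReductionType::PROD: return acc * x;
  }
  return acc;  // unreachable
}

int main() {
  assert(combine(ReductionType::MAX, 2.0, 5.0) == 5.0);
  assert(combine(ReductionType::PROD, 2.0, 5.0) == 10.0);
}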
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/Repeat.h b/MLPY/Lib/site-packages/torch/include/ATen/native/Repeat.h
new file mode 100644
index 0000000000000000000000000000000000000000..c3a81f0fba67747235a68fa13e8a2cc6d539b09f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/Repeat.h
@@ -0,0 +1,48 @@
+#pragma once
+
+#include 
+#include 
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include 
+#else
+#include 
+#include 
+#endif
+
+namespace at::native {
+
+template <
+    typename index_t,
+    void compute(index_t*, int64_t*, index_t*, int64_t, int64_t)>
+static inline Tensor repeat_interleave_common(
+    const Tensor& repeats,
+    c10::optional<int64_t> output_size) {
+  TORCH_CHECK(
+      repeats.dim() == 1, "repeat_interleave only accept 1D vector as repeat");
+  TORCH_CHECK(
+      repeats.scalar_type() == at::kLong || repeats.scalar_type() == at::kInt,
+      "repeats has to be Long or Int tensor");
+  if (repeats.size(0) == 0) {
+    return at::empty_like(repeats, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+  }
+  Tensor repeats_ = repeats.contiguous();
+  Tensor cumsum = repeats.cumsum(0);
+  int64_t total;
+  if (output_size.has_value()) {
+    total = output_size.value();
+  } else {
+    total = cumsum[-1].item<int64_t>();
+    TORCH_CHECK(
+        (repeats >= 0).all().item<bool>(), "repeats can not be negative");
+  }
+
+  Tensor result = at::empty({total}, repeats.options());
+  index_t* repeat_ptr = repeats_.data_ptr<index_t>();
+  int64_t* cumsum_ptr = cumsum.data_ptr<int64_t>();
+  index_t* result_ptr = result.data_ptr<index_t>();
+  compute(repeat_ptr, cumsum_ptr, result_ptr, repeats.size(0), total);
+  return result;
+}
+
+} // namespace at::native
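repeat_interleave_common drives the expansion off the cumulative sum of the repeats: cumsum[i] marks where the run of copies of index i ends in the output. A standalone sketch of that expansion on plain std::vector data (the backend-specific `compute` callback is not shown; names here are illustrative):

#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int64_t> repeat_interleave(const std::vector<int64_t>& repeats) {
  // cumsum[i] is the exclusive end of the run of copies of index i.
  std::vector<int64_t> cumsum(repeats.size());
  int64_t running = 0;
  for (size_t i = 0; i < repeats.size(); ++i) {
    running += repeats[i];
    cumsum[i] = running;
  }
  std::vector<int64_t> result(running);
  int64_t start = 0;
  for (size_t i = 0; i < repeats.size(); ++i) {
    for (int64_t j = start; j < cumsum[i]; ++j) result[j] = static_cast<int64_t>(i);
    start = cumsum[i];
  }
  return result;
}

int main() {
  // repeats = [2, 0, 3] -> output indices [0, 0, 2, 2, 2]
  assert((repeat_interleave({2, 0, 3}) == std::vector<int64_t>{0, 0, 2, 2, 2}));
}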
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/Resize.h b/MLPY/Lib/site-packages/torch/include/ATen/native/Resize.h
new file mode 100644
index 0000000000000000000000000000000000000000..11aba9b4087f20b8596bae9e718829502d900867
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/Resize.h
@@ -0,0 +1,173 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include 
+
+
+namespace at::native {
+
+// TODO: make all operations that resize given outputs use this function
+//   for consistency and maintainability.
+//   Some operations like `cat` might not be able to make the use of
+//   resize_output directly. For more details to understand how it works in `cat`,
+//   see https://github.com/pytorch/pytorch/pull/62560#discussion_r687363362
+// Resizes outputs
+// Functions accepting output tensors, like with the "out" kwarg, should
+//   call this function to handle resizing their output tensor.
+// Issues a warning if the output tensor has one or more elements and
+//   needs resizing
+// NOTE: In the future the warning will become an error
+// Returns a bool saying whether or not the resize actually happened or not
+TORCH_API bool resize_output(const Tensor& output, IntArrayRef shape);
+// WARNING: Do NOT call this directly. If you are resizing an output and want
+// to support dynamic shapes call at::resize__symint and resize_output_check_symint.
+// For more details, see: https://github.com/pytorch/pytorch/pull/111530/files#r1365845272
+TORCH_API bool resize_output_symint(const Tensor& output, SymIntArrayRef shape);
+
+// Utility for resize_output
+//  Returns a bool saying resize should happen or not and
+//  raises a warning if resizing for one or more elements
+TORCH_API bool resize_output_check(const Tensor& output, IntArrayRef shape);
+TORCH_API bool resize_output_check_symint(const Tensor& output, SymIntArrayRef shape);
+
+TORCH_API void resize_bytes_cpu(StorageImpl* storage, size_t size_bytes);
+TORCH_API void resize_bytes_meta(StorageImpl* storage, c10::SymInt size_bytes);
+TORCH_API void resize_bytes_nocuda(const Storage& storage, c10::SymInt size_bytes);
+
+static inline void maybe_resize_storage_cpu(TensorImpl* self, size_t new_size_bytes) {
+  // It does not make sense to try to resize a storage
+  // to hold 0 elements, and this can break
+  // if storage_offset is positive but
+  // new_size is 0, so just bail in that case
+  // (same comment is in cuda/Resize.h)
+  if (self->numel() == 0) {
+    return;
+  }
+
+  const Storage& storage = self->unsafe_storage();
+  if (!storage) {
+    auto new_storage = c10::make_intrusive<StorageImpl>(
+        StorageImpl::use_byte_size_t(),
+        new_size_bytes,
+        c10::GetCPUAllocator(),
+        true);
+    self->set_storage_keep_dtype(std::move(new_storage));
+  } else if (new_size_bytes > storage.nbytes()) {
+    resize_bytes_cpu(storage.unsafeGetStorageImpl(), new_size_bytes);
+  }
+}
+
+TORCH_API TensorImpl* resize_impl_cpu_(
+    TensorImpl* self,
+    IntArrayRef size,
+    at::OptionalIntArrayRef stride,
+    bool resize_storage = true);
+
+template <typename T>
+T maybe_convert_symint(c10::SymInt) = delete;
+
+template <>
+inline c10::SymInt maybe_convert_symint(c10::SymInt x) { return x; }
+
+template <>
+inline int64_t maybe_convert_symint(c10::SymInt x) { return x.guard_int(__FILE__, __LINE__); }
+
+template <typename T>
+static inline void checkInBoundsForStorage(
+    ArrayRef<T> size,
+    ArrayRef<T> stride,
+    T storage_offset,
+    const caffe2::TypeMeta& data_type,
+    const Storage& new_storage) {
+  T storage_size_bytes =
+      at::detail::computeStorageNbytes(size, stride, data_type.itemsize());
+  T storage_offset_bytes = storage_offset * data_type.itemsize();
+  if (storage_size_bytes == 0) {
+    // NB: (a tensor with arbitrary 0 dims)'s storage can have any numel.
+    return;
+  }
+  T new_storage_size_bytes = maybe_convert_symint(new_storage.sym_nbytes());
+  TORCH_CHECK(
+      storage_size_bytes + storage_offset_bytes <= new_storage_size_bytes,
+      "setStorage: sizes ",
+      size,
+      ", strides ",
+      stride,
+      ","
+      " storage offset ",
+      storage_offset,
+      ", and itemsize ",
+      data_type.itemsize(),
+      " requiring a storage size of ",
+      storage_size_bytes + storage_offset_bytes,
+      " are out of bounds for storage of size ",
+      new_storage_size_bytes);
+}
+
+template <typename T>
+static inline void checkSetStorage(Tensor& result, Storage storage, T storage_offset,
+                                   ArrayRef<T> size, ArrayRef<T> stride) {
+  // FIXME: stride should be optional
+  if (stride.data()) {
+    TORCH_CHECK(size.size() == stride.size(), "unequal size length (", size.size(),
+                                              ") and stride length (", stride.size(), ")");
+  }
+
+#ifdef DEBUG
+  TORCH_CHECK(size.size() <= INT_MAX, "size length (", size.size(), ") greater than INT_MAX");
+#endif
+
+  // storage: note this can't be replaced with result.set_(storage) as the semantics of that
+  // function is to set the tensor size to be equal to the size of the storage.
+  if (!result.storage().is_alias_of(storage)) {
+    // Caffe2 might have tensors whose storages are null, but we
+    // don't allow it in PyTorch.
+    TORCH_INTERNAL_ASSERT(storage);
+    TORCH_INTERNAL_ASSERT(result.storage());
+
+    // We used to allow this, but this breaks device caching.
+    // Let's put an actual error message for this one.
+    TORCH_CHECK(result.storage().device() == storage.device(),
+                "Attempted to set the storage of a tensor on device \"", result.storage().device(),
+                "\" to a storage on different device \"", storage.device(),
+                "\".  This is no longer allowed; the devices must match.");
+    result.unsafeGetTensorImpl()->set_storage_keep_dtype(std::move(storage));
+  }
+
+  // storageOffset
+  TORCH_CHECK(storage_offset >= 0, "Tensor: invalid storage offset ", storage_offset);
+}
+
+/**
+ * Set self's sizes, strides, and storage_offset.
+ * (size, stride, storage_offset) must be in bounds for self's storage.
+ */
+template <typename T>
+inline void setStrided(
+    const Tensor& self,
+    ArrayRef<T> size,
+    ArrayRef<T> stride,
+    T storage_offset) {
+  TORCH_CHECK(size.size() == stride.size(), "mismatch in length of strides and shape");
+  for (const auto& val : stride) {
+    TORCH_CHECK(val >= 0,
+                "as_strided: Negative strides are not supported at the moment, "
+                "got strides: ", stride);
+  }
+
+  auto* self_ = self.unsafeGetTensorImpl();
+  checkInBoundsForStorage(
+      size, stride, storage_offset, self_->dtype(), self_->storage());
+
+  /* storage offset */
+  TORCH_CHECK(storage_offset >= 0, "Tensor: invalid storage offset ", storage_offset);
+  self_->set_sizes_and_strides(size, stride, c10::make_optional(storage_offset));
+}
+
+} // namespace at::native
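checkInBoundsForStorage boils down to simple arithmetic: a view needs (1 + sum over dims of (size-1)*stride) elements of storage past its offset, scaled by itemsize, and a view with any zero-sized dim needs nothing at all. A standalone sketch of that bound check on plain vectors; function names are illustrative, not part of the header:

#include <cassert>
#include <cstdint>
#include <vector>

int64_t storage_nbytes(const std::vector<int64_t>& sizes,
                       const std::vector<int64_t>& strides,
                       int64_t itemsize) {
  int64_t max_index = 0;             // linear index of the farthest reachable element
  for (size_t d = 0; d < sizes.size(); ++d) {
    if (sizes[d] == 0) return 0;     // any zero dim -> no elements needed
    max_index += (sizes[d] - 1) * strides[d];
  }
  return (max_index + 1) * itemsize;
}

bool fits_in_storage(const std::vector<int64_t>& sizes,
                     const std::vector<int64_t>& strides,
                     int64_t storage_offset, int64_t itemsize,
                     int64_t new_storage_nbytes) {
  int64_t needed = storage_nbytes(sizes, strides, itemsize);
  if (needed == 0) return true;      // empty views fit any storage
  return needed + storage_offset * itemsize <= new_storage_nbytes;
}

int main() {
  // A 2x3 float tensor with strides {3, 1} and offset 0 needs 6 * 4 = 24 bytes.
  assert(fits_in_storage({2, 3}, {3, 1}, 0, 4, 24));
  assert(!fits_in_storage({2, 3}, {3, 1}, 1, 4, 24));  // the offset pushes it out of bounds
}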
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/ResizeCommon.h b/MLPY/Lib/site-packages/torch/include/ATen/native/ResizeCommon.h
new file mode 100644
index 0000000000000000000000000000000000000000..aa836fac7b06db176d1712f6a5040d1598651a7b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/ResizeCommon.h
@@ -0,0 +1,75 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include 
+#else
+#include 
+#endif
+
+namespace at::native {
+
+template <typename T>
+inline T storage_size_for(ArrayRef<T> size, ArrayRef<T> stride) {
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(size.size() == stride.size(),
+      "storage_size_for(size, stride) requires that size and stride ",
+      "have the same size as a precondition.");
+  T storage_size = 1;
+  for (const auto dim : c10::irange(size.size())) {
+    if (size[dim] == 0) {
+      storage_size = 0;
+      break;
+    }
+    storage_size += (size[dim] - 1) * stride[dim];
+  }
+  return storage_size;
+}
+
+inline const Tensor& resize_named_tensor_(
+    const Tensor& self,
+    IntArrayRef size,
+    c10::optional<MemoryFormat> optional_memory_format) {
+  TORCH_INTERNAL_ASSERT(self.has_names());
+  TORCH_CHECK(
+      self.sizes() == size,
+      "Cannot resize named tensor with resize_ or resize_as_ (tried to resize "
+      "Tensor",
+      self.names(),
+      " with size ",
+      self.sizes(),
+      " to ",
+      size,
+      "). This may be caused by passing a named tensor ",
+      "as an `out=` argument; please ensure that the sizes are the same. ");
+  TORCH_CHECK(
+      !optional_memory_format.has_value(),
+      "Unsupported memory format for named tensor resize ",
+      optional_memory_format.value());
+  return self;
+}
+
+// For deterministic output, fill new elements that were added after a storage
+// resize with NaN or MAX_INT. `old_storage_nbytes` is the size of the storage
+// before the resize happened.
+inline const Tensor& fill_resize_deterministic_(const Tensor& tensor, int64_t old_storage_nbytes) {
+  const at::Storage& storage = tensor.unsafeGetTensorImpl()->unsafe_storage();
+  int64_t new_storage_nbytes = storage.nbytes();
+  int64_t old_storage_numel = old_storage_nbytes / tensor.itemsize();
+  int64_t new_storage_numel = new_storage_nbytes / tensor.itemsize();
+  if (new_storage_numel > old_storage_numel) {
+    at::Tensor tensor_view = at::empty({}, at::TensorOptions().dtype(tensor.scalar_type()).device(tensor.device()));
+    tensor_view.set_(
+      storage,
+      /*storage_offset=*/old_storage_numel,
+      /*size=*/{new_storage_numel - old_storage_numel},
+      /*stride=*/{1});
+    at::native::fill_empty_deterministic_(tensor_view);
+  }
+  return tensor;
+}
+
+} // namespace at::native
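fill_resize_deterministic_ exists so that, in deterministic mode, memory newly exposed by a storage resize is overwritten instead of leaking whatever bytes happened to be there. A standalone sketch of the same idea on a plain buffer, assuming a float payload filled with NaN (the real helper delegates the fill value choice to fill_empty_deterministic_):

#include <cassert>
#include <cmath>
#include <limits>
#include <vector>

void grow_and_fill_nan(std::vector<float>& buf, size_t new_numel) {
  size_t old_numel = buf.size();
  buf.resize(new_numel);
  for (size_t i = old_numel; i < new_numel; ++i)
    buf[i] = std::numeric_limits<float>::quiet_NaN();  // only the newly added tail
}

int main() {
  std::vector<float> buf = {1.f, 2.f};
  grow_and_fill_nan(buf, 4);
  assert(buf[0] == 1.f && buf[1] == 2.f);              // existing data untouched
  assert(std::isnan(buf[2]) && std::isnan(buf[3]));
}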
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/ScatterGatherChecks.h b/MLPY/Lib/site-packages/torch/include/ATen/native/ScatterGatherChecks.h
new file mode 100644
index 0000000000000000000000000000000000000000..0b8a3a81abe3eca3cded51cb855366d388469441
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/ScatterGatherChecks.h
@@ -0,0 +1,128 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+namespace at::native {
+
+namespace {
+
+// checks whether index.dtype == int64
+// and self.dtype == src.dtype if src is a Tensor
+static void scatter_gather_dtype_check(
+  const std::string& method_name,
+  const Tensor& self,
+  const Tensor& index,
+  const c10::optional<Tensor>& src_opt = c10::nullopt
+) {
+  if (index.numel() != 0) {
+    TORCH_CHECK(
+      index.scalar_type() == at::ScalarType::Long,
+      method_name, "(): Expected dtype int64 for index"
+    );
+  }
+
+  if (src_opt.has_value()) {
+    const auto& src = src_opt.value();
+    TORCH_CHECK(
+      self.scalar_type() == src.scalar_type(),
+      method_name, "(): Expected self.dtype to be equal to src.dtype"
+    );
+  }
+}
+
+// Used for `gather`-like methods
+// Note: self means the input tensor here
+// Test:
+// 1. index.size(d) <= self.size(d) for all d != dim
+// 2. index.dim() == self.dim()
+static C10_UNUSED void gather_shape_check(const Tensor& self, int64_t dim,
+  const Tensor& index
+) {
+  auto self_dims = ensure_nonempty_dim(self.dim());
+  TORCH_CHECK(self_dims == ensure_nonempty_dim(index.dim()),
+    "Index tensor must have the same number of dimensions as input tensor"
+  );
+
+  for (const auto i : c10::irange(self_dims)) {
+    if (i != dim) {
+      TORCH_CHECK(
+        ensure_nonempty_size(index, i) <= ensure_nonempty_size(self, i),
+        "Size does not match at dimension ", i,
+        " expected index ", index.sizes(),
+        " to be smaller than self ", self.sizes(),
+        " apart from dimension ", dim
+      );
+    }
+  }
+}
+
+// Used for `scatter` and `scatter_add`
+// Tests:
+//  1. index.size(d) <= self.size(d) for all d != dim
+//  2. index.size(d) <= src.size(d) for all d if src is a Tensor
+//  3. index.dim() == self.dim() == src.dim()
+static C10_UNUSED void scatter_shape_check(
+  const Tensor& self, int64_t dim, const Tensor& index,
+  const c10::optional<Tensor>& src_opt = c10::nullopt
+) {
+  if (index.numel() == 0) return;
+  TORCH_CHECK(
+    ensure_nonempty_dim(self.dim()) == ensure_nonempty_dim(index.dim()),
+    "Index tensor must have the same number of dimensions as self tensor"
+  );
+
+  bool is_wrong_shape = false;
+  int64_t self_dims = ensure_nonempty_dim(self.dim());
+
+  //  Check: index.size(d) <= self.size(d) for all d != dim
+  for (const auto d : c10::irange(self_dims)) {
+    int64_t index_d_size = ensure_nonempty_size(index, d);
+    if (d == dim) continue;
+    if (index_d_size > ensure_nonempty_size(self, d)) {
+      is_wrong_shape = true;
+      break;
+    }
+  }
+
+  //  Check: index.size(d) <= src.size(d) for all d if src is Tensor
+  if (!is_wrong_shape && src_opt.has_value()) {
+    const auto& src = src_opt.value();
+    for (const auto d : c10::irange(self_dims)) {
+      int64_t index_d_size = ensure_nonempty_size(index, d);
+      if (index_d_size > ensure_nonempty_size(src, d)) {
+        is_wrong_shape = true;
+        break;
+      }
+    }
+  }
+
+  if (src_opt.has_value()) {
+    const auto& src = src_opt.value();
+
+    TORCH_CHECK(
+      ensure_nonempty_dim(src.dim()) == ensure_nonempty_dim(index.dim()),
+      "Index tensor must have the same number of dimensions as src tensor"
+    );
+
+    TORCH_CHECK(!is_wrong_shape,
+      "Expected index ", index.sizes(),
+      " to be smaller than self ", self.sizes(),
+      " apart from dimension ", dim,
+      " and to be smaller size than src ", src.sizes()
+    );
+  }
+  else {
+    TORCH_CHECK(!is_wrong_shape,
+      "Expected index ", index.sizes(),
+      " to be smaller than self ", self.sizes(),
+      " apart from dimension ", dim
+    );
+  }
+}
+
+} // anonymous namespace
+
+} // namespace at::native
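gather_shape_check reduces to two conditions: index must have the same rank as self, and may not exceed self's size on any dimension other than dim. A standalone sketch of that rule on plain shape vectors; names are illustrative, not part of the header:

#include <cassert>
#include <cstdint>
#include <vector>

bool gather_shapes_ok(const std::vector<int64_t>& self,
                      const std::vector<int64_t>& index, int64_t dim) {
  if (self.size() != index.size()) return false;   // ranks must match
  for (size_t d = 0; d < self.size(); ++d) {
    if (static_cast<int64_t>(d) == dim) continue;  // the gathered dim may be any size
    if (index[d] > self[d]) return false;
  }
  return true;
}

int main() {
  assert(gather_shapes_ok({4, 5}, {4, 3}, /*dim=*/1));   // ok: 4 <= 4 on dim 0
  assert(!gather_shapes_ok({4, 5}, {6, 3}, /*dim=*/1));  // 6 > 4 on dim 0
}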
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/SegmentReduce.h b/MLPY/Lib/site-packages/torch/include/ATen/native/SegmentReduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..20c251cfd7581f5861e969880a54a6459dfca06b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/SegmentReduce.h
@@ -0,0 +1,50 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+class Tensor;
+
+namespace native {
+
+using segment_reduce_lengths_fn = Tensor (*)(
+    ReductionType,
+    const Tensor&,
+    const Tensor&,
+    int64_t,
+    const c10::optional<Scalar>&);
+DECLARE_DISPATCH(segment_reduce_lengths_fn, _segment_reduce_lengths_stub);
+
+using segment_reduce_offsets_fn = Tensor (*)(
+    ReductionType,
+    const Tensor&,
+    const Tensor&,
+    int64_t,
+    const c10::optional<Scalar>&);
+DECLARE_DISPATCH(segment_reduce_offsets_fn, _segment_reduce_offsets_stub);
+
+using segment_reduce_lengths_backward_fn = Tensor (*)(
+    const Tensor&,
+    const Tensor&,
+    const Tensor&,
+    ReductionType,
+    const Tensor&,
+    int64_t,
+    const c10::optional<Scalar>&);
+DECLARE_DISPATCH(segment_reduce_lengths_backward_fn, _segment_reduce_lengths_backward_stub);
+
+using segment_reduce_offsets_backward_fn = Tensor (*)(
+    const Tensor&,
+    const Tensor&,
+    const Tensor&,
+    ReductionType,
+    const Tensor&,
+    int64_t,
+    const c10::optional<Scalar>&);
+DECLARE_DISPATCH(segment_reduce_offsets_backward_fn, _segment_reduce_offsets_backward_stub);
+
+} // namespace native
+} // namespace at
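The stubs above dispatch to kernels that reduce contiguous runs of the input, with the runs described either by per-segment lengths or by offsets. A standalone sketch of the lengths-based case for a 1-D input and a sum reduction, just to pin down the semantics (names are illustrative):

#include <cassert>
#include <cstdint>
#include <vector>

std::vector<double> segment_sum(const std::vector<double>& data,
                                const std::vector<int64_t>& lengths) {
  std::vector<double> out;
  size_t pos = 0;
  for (int64_t len : lengths) {
    double acc = 0.0;
    for (int64_t i = 0; i < len; ++i) acc += data[pos++];  // one contiguous segment
    out.push_back(acc);
  }
  return out;
}

int main() {
  // lengths [2, 3] split [1, 2, 3, 4, 5] into {1, 2} and {3, 4, 5}.
  assert((segment_sum({1, 2, 3, 4, 5}, {2, 3}) == std::vector<double>{3, 12}));
}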
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/SharedReduceOps.h b/MLPY/Lib/site-packages/torch/include/ATen/native/SharedReduceOps.h
new file mode 100644
index 0000000000000000000000000000000000000000..d619b05237acac6634544297b43fcf24d807ab6c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/SharedReduceOps.h
@@ -0,0 +1,544 @@
+#pragma once
+// Please note that this file is
+// used across both CPU and GPU.
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#if defined(__CUDACC__)
+#include 
+#include 
+#elif defined(__HIPCC__)
+#include 
+#include 
+#endif
+#if defined(__CUDACC__) || defined(__HIPCC__)
+#include 
+#else
+#include 
+#define device_sqrt std::sqrt
+#endif
+#if defined(__CUDACC__) || defined(__HIPCC__)
+template <typename scalar_t>
+inline C10_DEVICE scalar_t max_propagate_nan(scalar_t a, scalar_t b) {
+#if defined(__HIPCC__)
+  // TODO: remove this special case for HIP when issue is fixed:
+  //       https://github.com/ROCm-Developer-Tools/HIP/issues/2209
+  scalar_t max = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::max(a, b));
+#else
+  scalar_t max = at::_isnan(b) ? b : std::max(a, b);
+#endif
+  return max;
+}
+template <typename scalar_t>
+inline C10_DEVICE scalar_t min_propagate_nan(scalar_t a, scalar_t b) {
+#if defined(__HIPCC__)
+  // TODO: remove this special case for HIP when issue is fixed:
+  //       https://github.com/ROCm-Developer-Tools/HIP/issues/2209
+  scalar_t min = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::min(a, b));
+#else
+  scalar_t min = at::_isnan(b) ? b : std::min(a, b);
+#endif
+  return min;
+}
+#define MAX(X, Y) max_propagate_nan(X,Y)
+#define MIN(X, Y) min_propagate_nan(X,Y)
+#else
+#include 
+#define MAX(X, Y) max_impl(X,Y)
+#define MIN(X, Y) min_impl(X,Y)
+#endif
+
+// ROCM hcc doesn't work well with using std:: in kernel functions
+#if defined(__CUDA_ARCH__)
+#include 
+#define compat_pow c10::cuda::compat::pow
+#elif defined(__HIPCC__)
+#include 
+#define compat_pow c10::hip::compat::pow
+#else
+#define compat_pow std::pow
+#endif
+
+namespace at { namespace native {
+
+namespace detail {
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+template <typename T1, typename T2> using pair = thrust::pair<T1, T2>;
+#else
+template <typename T1, typename T2> using pair = std::pair<T1, T2>;
+#endif
+
+} // namespace detail
+
+template <typename scalar_t, typename index_t>
+struct WelfordData {
+  scalar_t mean;
+  scalar_t m2;
+  index_t n;
+  scalar_t nf;
+
+  C10_HOST_DEVICE WelfordData() : mean(0), m2(0), n(0), nf(0) {}
+
+  C10_HOST_DEVICE WelfordData(
+      scalar_t mean,
+      scalar_t m2,
+      index_t n,
+      scalar_t nf)
+      : mean(mean), m2(m2), n(n), nf(nf) {}
+};
+
+
+template <typename scalar_t, typename acc_scalar_t, typename index_t, typename res_t>
+struct WelfordOps {
+  acc_scalar_t correction;
+  bool take_sqrt;
+ public:
+  using acc_t = WelfordData<acc_scalar_t, index_t>;
+  inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, index_t /*idx*/) const {
+    // We accumulate n in index_t to avoid cumulative rounding error, but still
+    // need nf for use in combine where int32 may overflow.
+    index_t new_n = acc.n + 1;
+    acc_scalar_t new_nf = static_cast<acc_scalar_t>(new_n);
+    acc_scalar_t delta = data - acc.mean;
+    acc_scalar_t new_mean = acc.mean + delta / new_nf;
+    acc_scalar_t new_delta = data - new_mean;
+    return {
+      new_mean,
+      acc.m2 + delta * new_delta,
+      new_n,
+      new_nf,
+    };
+  }
+  inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
+    if (a.nf == 0) {
+      return b;
+    }
+    if (b.nf == 0) {
+      return a;
+    }
+    acc_scalar_t delta = b.mean - a.mean;
+    acc_scalar_t new_count = a.nf + b.nf;
+    acc_scalar_t nb_over_n = b.nf / new_count;
+    return {
+      a.mean + delta * nb_over_n,
+      a.m2 + b.m2 + delta * delta * a.nf * nb_over_n,
+      // setting acc.n as -1 since acc.n might not be able to represent the count
+      // correctly within its range, setting it to -1 to avoid confusion
+      -1,
+      new_count
+    };
+  }
+  inline C10_DEVICE res_t project(acc_t acc) const __ubsan_ignore_float_divide_by_zero__ {
+    const auto mean = static_cast(acc.mean);
+    const auto divisor = acc.nf > correction ? acc.nf - correction : 0;
+    const auto var = acc.m2 / divisor;
+    res_t results(take_sqrt ? device_sqrt(var) : var, mean);
+    return results;
+  }
+
+  static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
+    return acc;
+  }
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  inline __device__ acc_t warp_shfl_down(acc_t acc, int offset) const {
+    return {
+      WARP_SHFL_DOWN(acc.mean, offset)
+      , WARP_SHFL_DOWN(acc.m2, offset)
+      , WARP_SHFL_DOWN(acc.n, offset)
+      , WARP_SHFL_DOWN(acc.nf, offset)
+    };
+  }
+#endif
+  C10_HOST_DEVICE WelfordOps(acc_scalar_t correction, bool take_sqrt)
+      : correction(correction), take_sqrt(take_sqrt) {}
+};
+
+template <typename scalar_t, typename acc_t, typename factor_t, typename out_t = acc_t>
+struct MeanOps {
+  factor_t factor;
+
+  inline C10_DEVICE acc_t reduce(acc_t a, scalar_t b, int64_t /*idx*/) const {
+    return combine(a, static_cast<acc_t>(b));
+  }
+
+  inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
+    return a + b;
+  }
+
+  inline C10_DEVICE out_t project(acc_t a) const {
+    return a * factor;
+  }
+
+  static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
+    return acc;
+  }
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  inline C10_DEVICE acc_t warp_shfl_down(acc_t data, int offset) const {
+    return WARP_SHFL_DOWN(data, offset);
+  }
+#endif
+
+  MeanOps(factor_t factor): factor(factor) {
+  }
+};
+
+// This accumulator template is used to calculate the minimum absolute value of
+// a set of numbers.
+// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated
+// value. These types differ for complex number input support.
+template <typename scalar_t, typename acc_t = scalar_t, typename out_t = acc_t>
+struct AbsMinOps {
+
+  inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const {
+    return MIN(acc, static_cast<acc_t>(std::abs(data)));
+  }
+
+  inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
+    return MIN(a, b);
+  }
+
+  inline C10_DEVICE out_t project(acc_t a) const {
+    return a;
+  }
+
+  static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
+    return acc;
+  }
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const {
+    return WARP_SHFL_DOWN(acc, offset);
+  }
+#endif
+};
+
+// This accumulator template is used to calculate the maximum absolute value of
+// a set of numbers.
+// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated
+// value. These types differ for complex number input support.
+template <typename scalar_t, typename acc_t = scalar_t, typename out_t = acc_t>
+struct AbsMaxOps {
+  inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const {
+    return MAX(acc, static_cast<acc_t>(std::abs(data)));
+  }
+
+  inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
+    return MAX(a, b);
+  }
+
+  inline C10_DEVICE out_t project(acc_t a) const {
+    return a;
+  }
+
+  static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
+    return acc;
+  }
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const {
+    return WARP_SHFL_DOWN(acc, offset);
+  }
+#endif
+};
+
+// This accumulator template is used to calculate the norm of the absolute value
+// of a set of numbers.
+// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated
+// value. These types differ for complex number input support.
+template <typename scalar_t, typename acc_t = scalar_t, typename out_t = acc_t>
+struct NormOps {
+  acc_t norm_;
+
+  inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const {
+    return acc + compat_pow(static_cast<acc_t>(std::abs(data)), norm_);
+  }
+
+  inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
+    return a + b;
+  }
+
+  inline C10_DEVICE out_t project(acc_t a) const {
+    return compat_pow(a, static_cast<acc_t>(1.0) / norm_);
+  }
+
+  static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
+    return acc;
+  }
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const {
+    return WARP_SHFL_DOWN(acc, offset);
+  }
+#endif
+
+  NormOps(acc_t norm_): norm_(norm_) {
+  }
+};
+
+// This accumulator template is used to calculate the order zero norm of the
+// absolute value of a set of numbers.
+// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated
+// value. These types differ for complex number input support.
+template <typename scalar_t, typename acc_t = scalar_t, typename out_t = acc_t>
+struct NormZeroOps {
+  inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const {
+    return acc + (data == static_cast<scalar_t>(0) ? static_cast<acc_t>(0) : static_cast<acc_t>(1));
+  }
+
+  inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
+    return a + b;
+  }
+
+  inline C10_DEVICE out_t project(acc_t a) const {
+    return a;
+  }
+
+  static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
+    return acc;
+  }
+
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const {
+    return WARP_SHFL_DOWN(acc, offset);
+  }
+#endif
+};
+
+// This accumulator template is used to calculate the order one norm of the
+// absolute value of a set of numbers.
+// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated
+// value. These types differ for complex number input support.
+template <typename scalar_t, typename acc_t = scalar_t, typename out_t = acc_t>
+struct NormOneOps {
+  inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const {
+    return acc + static_cast<acc_t>(std::abs(data));
+  }
+
+  inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
+    return a + b;
+  }
+
+  inline C10_DEVICE out_t project(acc_t a) const {
+    return a;
+  }
+
+  static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
+    return acc;
+  }
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const {
+    return WARP_SHFL_DOWN(acc, offset);
+  }
+#endif
+};
+
+
+template <typename acc_t>
+struct AbsSwitch {};
+
+template <typename scalar_t, typename acc_t>
+inline C10_DEVICE acc_t abs_if_complex(scalar_t data, AbsSwitch<acc_t>) {
+  return static_cast<acc_t>(data);
+}
+
+template <typename scalar_t, typename acc_t>
+inline C10_DEVICE acc_t abs_if_complex(std::complex<scalar_t> data, AbsSwitch<acc_t>) {
+  return static_cast<acc_t>(std::abs(data));
+}
+
+template <typename scalar_t, typename acc_t>
+inline C10_DEVICE acc_t abs_if_complex(c10::complex<scalar_t> data, AbsSwitch<acc_t>) {
+  return static_cast<acc_t>(std::abs(data));
+}
+
+// This accumulator template is used to calculate the order two norm of the
+// absolute value of a set of numbers.
+// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated
+// value. These types differ for complex number input support.
+template <typename scalar_t, typename acc_t = scalar_t, typename out_t = acc_t>
+struct NormTwoOps {
+  inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const {
+    acc_t data_ = abs_if_complex(data, AbsSwitch<acc_t>());
+    return acc + data_ * data_;
+  }
+
+  inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
+    return a + b;
+  }
+
+  inline C10_DEVICE out_t project(acc_t a) const {
+    return device_sqrt(a);
+  }
+
+  static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
+    return acc;
+  }
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const {
+    return WARP_SHFL_DOWN(acc, offset);
+  }
+#endif
+};
+
+template <typename acc_t, typename data_t>
+struct NanSumOps {
+  inline C10_DEVICE acc_t reduce(acc_t a, data_t b, int64_t /*idx*/) const {
+    return a + (at::_isnan(b) ? acc_t{0.} : acc_t{b});
+  }
+
+  inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
+    return  a + b;
+  }
+
+  inline C10_DEVICE data_t project(acc_t a) const {
+    return data_t{a};
+  }
+
+  static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
+    return acc;
+  }
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  inline C10_DEVICE acc_t warp_shfl_down(acc_t data, int offset) const {
+    return WARP_SHFL_DOWN(data, offset);
+  }
+#endif
+};
+
+namespace detail {
+
+template <typename scalar_t>
+struct LessOrNan {
+  C10_DEVICE bool operator () (scalar_t a, scalar_t b, int64_t idx_a, int64_t idx_b) const {
+    // If (a == b), then choose the one with lower idx, else min(a, b)
+    if (at::_isnan(a)) {
+      if (at::_isnan(b)) {
+        return idx_a < idx_b;
+      }
+      return true;
+    }
+    return (a == b) ? idx_a < idx_b : (a < b);
+  }
+};
+
+template <typename scalar_t>
+struct GreaterOrNan {
+  C10_DEVICE bool operator () (scalar_t a, scalar_t b, int64_t idx_a, int64_t idx_b) const {
+    // If (a == b), then choose the one with lower idx, else max(a, b)
+    if (at::_isnan(a)) {
+      if (at::_isnan(b)) {
+        return idx_a < idx_b;
+      }
+      return true;
+    }
+    return (a == b) ? idx_a < idx_b : (a > b);
+  }
+};
+
+template <typename comp_t>
+struct MinMaxReductionOps {
+  using scalar_t = typename binary_function_traits<comp_t>::arg1_t;
+  using index_t = int64_t;
+  using arg_t = detail::pair<scalar_t, index_t>;
+
+  static C10_DEVICE arg_t project(arg_t arg) {
+    return arg;
+  }
+
+  static C10_DEVICE arg_t reduce(arg_t arg, scalar_t val, int64_t idx) {
+    return comp_t{}(arg.first, val, arg.second, idx) ? arg : arg_t(val, idx);
+  }
+
+  static C10_DEVICE arg_t combine(arg_t a, arg_t b) {
+    return comp_t{}(a.first, b.first, a.second, b.second) ? a : b;
+  }
+
+  static C10_DEVICE arg_t translate_idx(arg_t a, int64_t base_idx) {
+    return {a.first, a.second + base_idx};
+  }
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  static C10_DEVICE arg_t warp_shfl_down(arg_t arg, int offset) {
+    return arg_t(WARP_SHFL_DOWN(arg.first, offset),
+                 WARP_SHFL_DOWN(arg.second, offset));
+  }
+#endif
+};
+
+template <typename comp_t>
+struct ArgReductionOps : public MinMaxReductionOps<comp_t> {
+  using typename MinMaxReductionOps<comp_t>::scalar_t;
+  using typename MinMaxReductionOps<comp_t>::index_t;
+  using typename MinMaxReductionOps<comp_t>::arg_t;
+
+  static C10_DEVICE index_t project(arg_t arg) {
+    return arg.second;
+  }
+};
+
+} // namespace detail
+
+template <typename scalar_t>
+struct ArgMaxOps :
+  public detail::ArgReductionOps<detail::GreaterOrNan<scalar_t>> {
+};
+
+template <typename scalar_t>
+struct ArgMinOps :
+  public detail::ArgReductionOps<detail::LessOrNan<scalar_t>> {
+};
+
+template <typename scalar_t>
+struct MinOps :
+  public detail::MinMaxReductionOps<detail::LessOrNan<scalar_t>> {
+};
+
+template <typename scalar_t>
+struct MaxOps :
+  public detail::MinMaxReductionOps<detail::GreaterOrNan<scalar_t>> {
+};
+
+template <typename scalar_t, typename acc_scalar_t, typename index_t>
+struct MinMaxOps {
+  using acc_t = detail::pair<acc_scalar_t, acc_scalar_t>;
+  inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, index_t /*idx*/) const {
+    return combine(acc, {data, data});
+  }
+
+  inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
+    auto min_val = (at::_isnan(a.first) || a.first < b.first) ? a.first : b.first;
+    auto max_val = (at::_isnan(a.second) || a.second > b.second) ? a.second : b.second;
+
+    return {min_val, max_val};
+  }
+
+  inline C10_DEVICE acc_t project(acc_t acc) const {
+    return acc;
+  }
+
+  static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
+    return acc;
+  }
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const {
+    return {
+      WARP_SHFL_DOWN(acc.first, offset), WARP_SHFL_DOWN(acc.second, offset)
+    };
+  }
+#endif
+};
+
+}} // namespace at::native
+
+#undef MAX
+#undef MIN
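WelfordOps above implements Welford's streaming mean/variance update plus a pairwise combine, which is what lets the reduction be split across threads or warps and merged afterwards. A standalone sketch of the same recurrences on doubles, checked against the closed-form result; this is an illustration, not the vendored implementation:

#include <cassert>
#include <cmath>

struct Welford { double mean = 0, m2 = 0, n = 0; };

// Fold one sample into a running (mean, m2, n) triple.
Welford reduce(Welford a, double x) {
  double n = a.n + 1;
  double delta = x - a.mean;
  double mean = a.mean + delta / n;
  return {mean, a.m2 + delta * (x - mean), n};
}

// Merge two partial results (the parallel/warp-shuffle path).
Welford combine(Welford a, Welford b) {
  if (a.n == 0) return b;
  if (b.n == 0) return a;
  double n = a.n + b.n;
  double delta = b.mean - a.mean;
  return {a.mean + delta * b.n / n,
          a.m2 + b.m2 + delta * delta * a.n * b.n / n, n};
}

int main() {
  Welford left, right;
  for (double x : {1.0, 2.0}) left = reduce(left, x);
  for (double x : {3.0, 4.0}) right = reduce(right, x);
  Welford all = combine(left, right);
  assert(std::abs(all.mean - 2.5) < 1e-12);
  assert(std::abs(all.m2 / all.n - 1.25) < 1e-12);  // population variance of {1,2,3,4}
}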
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/SobolEngineOpsUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/SobolEngineOpsUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..3fbc4e311dea531b7d0a2501dad0685671f8a1b2
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/SobolEngineOpsUtils.h
@@ -0,0 +1,55 @@
+/// This file contains some tensor-agnostic operations to be used in the
+/// core functions of the `SobolEngine`
+#include 
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include 
+#else
+#include 
+#include 
+#include 
+#endif
+
+namespace at::native::sobol_utils {
+
+/// Function to return the minimum of number of bits to represent the integer `n`
+inline int64_t bit_length(const int64_t n) {
+  int64_t nbits, nloc;
+  for (nloc = n, nbits = 0; nloc > 0; nloc /= 2, nbits++);
+  return nbits;
+}
+
+/// Function to get the position of the rightmost zero in the bit representation of an integer
+/// This value is the zero-indexed position
+inline int64_t rightmost_zero(const int64_t n) {
+  int64_t z, i;
+  for (z = n, i = 0; z % 2 == 1; z /= 2, i++);
+  return i;
+}
+
+/// Function to get a subsequence of bits in the representation of an integer starting from
+/// `pos` and of length `length`
+inline int64_t bitsubseq(const int64_t n, const int64_t pos, const int64_t length) {
+  return (n >> pos) & ((1 << length) - 1);
+}
+
+/// Function to perform the inner product between a batched square matrix and a power of 2 vector
+inline at::Tensor cdot_pow2(const at::Tensor& bmat) {
+  at::Tensor inter = at::arange(bmat.size(-1) - 1, -1, -1, bmat.options());
+  inter = at::pow(2, inter).expand_as(bmat);
+  return at::mul(inter, bmat).sum(-1);
+}
+
+/// All definitions below this point are data. These are constant, and should not be modified
+/// without notice
+
+constexpr int64_t MAXDIM = 21201;
+constexpr int64_t MAXDEG = 18;
+constexpr int64_t MAXBIT = 30;
+constexpr int64_t LARGEST_NUMBER = 1 << MAXBIT;
+constexpr float RECIPD = 1.0 / LARGEST_NUMBER;
+
+extern const int64_t poly[MAXDIM];
+extern const int64_t initsobolstate[MAXDIM][MAXDEG];
+
+} // namespace at::native::sobol_utils
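The three bit helpers above follow the usual Sobol conventions: bit_length counts significant bits, rightmost_zero returns the zero-indexed position of the lowest zero bit (i.e. it counts trailing ones), and bitsubseq extracts `length` bits starting at `pos`. A standalone sketch with concrete values, mirroring the header's loops:

#include <cassert>
#include <cstdint>

int64_t bit_length(int64_t n) {
  int64_t bits = 0;
  for (; n > 0; n /= 2) ++bits;
  return bits;
}

int64_t rightmost_zero(int64_t n) {
  int64_t i = 0;
  for (; n % 2 == 1; n /= 2) ++i;     // count trailing ones
  return i;
}

int64_t bitsubseq(int64_t n, int64_t pos, int64_t length) {
  return (n >> pos) & ((1 << length) - 1);
}

int main() {
  assert(bit_length(6) == 3);                  // 0b110 needs 3 bits
  assert(rightmost_zero(7) == 3);              // 0b111 -> first zero at position 3
  assert(bitsubseq(0b110110, 1, 3) == 0b011);  // bits 1..3 of 0b110110
}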
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/Sorting.h b/MLPY/Lib/site-packages/torch/include/ATen/native/Sorting.h
new file mode 100644
index 0000000000000000000000000000000000000000..f2126bd083d7ae7496ef06190557f17421708cdb
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/Sorting.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#include 
+#include 
+
+namespace at {
+class TensorBase;
+}
+
+namespace at::native {
+
+enum class QUANTILE_INTERPOLATION_MODE : uint8_t {
+  LINEAR,
+  LOWER,
+  HIGHER,
+  MIDPOINT,
+  NEAREST
+};
+
+using sort_fn = void(*)(const TensorBase&, const TensorBase&, const TensorBase&, int64_t, bool, bool);
+using topk_fn = void(*)(const TensorBase&, const TensorBase&, const TensorBase&, int64_t, int64_t, bool, bool);
+
+DECLARE_DISPATCH(sort_fn, sort_stub);
+DECLARE_DISPATCH(topk_fn, topk_stub);
+
+void _fill_indices(const TensorBase &indices, int64_t dim);
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/SortingUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/SortingUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..cb9e3d37c6768e08cc091d2ce8c7efed04a8a2cc
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/SortingUtils.h
@@ -0,0 +1,88 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include 
+#else
+#include 
+#endif
+
+namespace at::native {
+
+// ensure we get good values and indices for kthvalue, mode
+// this will always be with the reducing dim as 1-d
+inline void _reduction_with_indices_allocate_or_resize_output(
+    Tensor& values,
+    Tensor& indices,
+    const Tensor& self,
+    int64_t dim_,
+    bool keepdim) {
+  int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true);
+  auto result_sizes = self.sizes().vec();
+  if (!result_sizes.empty()) {
+    result_sizes[dim] = 1;
+  }
+  if (values.defined()) {
+    TORCH_CHECK(
+        self.options().type_equal(values.options()),
+        "output values must be of same type as input");
+    if (!keepdim && values.dim() == self.dim() - 1) {
+      // unsqueeze to preserve passed in noncontiguous tensor in resize
+      values.unsqueeze_(dim);
+    }
+    resize_output(values, result_sizes);
+  } else {
+    values = at::empty(result_sizes, self.options());
+  }
+  if (indices.defined()) {
+    TORCH_CHECK(
+        indices.dtype() == kLong, "output indices must be of scalar type Long");
+    TORCH_CHECK(
+        indices.device() == self.device(),
+        "output indices must be on same device as input");
+    if (!keepdim && indices.dim() == self.dim() - 1) {
+      // unsqueeze to preserve passed in noncontiguous tensor in resize
+      indices.unsqueeze_(dim);
+    }
+    resize_output(indices, result_sizes);
+  } else {
+    indices = at::empty(result_sizes, self.options().dtype(kLong));
+  }
+}
+
+// ensure we get good values and indices for topk
+inline void _allocate_or_resize_output_with_indices(
+    Tensor& values,
+    Tensor& indices,
+    const Tensor& self,
+    int64_t dim_,
+    int64_t k) {
+  int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true);
+  auto result_sizes = self.sizes().vec();
+  if (!result_sizes.empty()) {
+    result_sizes[dim] = k;
+  }
+  if (values.defined()) {
+    TORCH_CHECK(
+        self.options().type_equal(values.options()),
+        "output values must be of same type as input");
+    values.resize_(result_sizes);
+  } else {
+    values = at::empty(result_sizes, self.options());
+  }
+  if (indices.defined()) {
+    TORCH_CHECK(
+        indices.dtype() == kLong, "output indices must be of scalar type Long");
+    TORCH_CHECK(
+        indices.device() == self.device(),
+        "output indices must be on same device as input");
+    indices.resize_(result_sizes);
+  } else {
+    indices = at::empty(result_sizes, self.options().dtype(kLong));
+  }
+}
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/SparseTensorUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/SparseTensorUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..f44d51b352eebed86c0743eca4446842a1b65ca3
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/SparseTensorUtils.h
@@ -0,0 +1,190 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include 
+#else
+#include 
+#include 
+#endif
+
+namespace at::sparse {
+
+// Just for documentary purposes
+using SparseTensor = Tensor;
+using SparseType = Type;
+
+// This is an internal utility function for getting at the SparseTensorImpl,
+// so that we can write sparse tensor specific accessors for special fields
+// in SparseTensor.  You should only use this for writing low level
+// setters/getters for SparseTensorImpl fields; otherwise, you should use
+// the low level setters/getters that were implemented using this.
+//
+// This may be called repeatedly, so make sure it's pretty cheap.
+inline SparseTensorImpl* get_sparse_impl(const SparseTensor& self) {
+  TORCH_INTERNAL_ASSERT(
+      self.is_sparse(), "_internal_get_SparseTensorImpl: not a sparse tensor");
+  return static_cast(self.unsafeGetTensorImpl());
+}
+
+// Takes indices and values and directly puts them into the sparse tensor, no
+// copy.  This used to be called THSTensor_(_move)
+inline void alias_into_sparse(
+    const SparseTensor& self,
+    const Tensor& indices,
+    const Tensor& values) {
+  get_sparse_impl(self)->set_indices_and_values_unsafe(indices, values);
+}
+
+// Take indices and values and makes a (data) copy of them to put into the
+// sparse indices/values.  This used to be called THSTensor_(_set)
+inline void copy_into_sparse(
+    const SparseTensor& self,
+    const Tensor& indices,
+    const Tensor& values,
+    bool non_blocking) {
+  alias_into_sparse(
+      self,
+      indices.to(self._indices().options(), non_blocking, /*copy=*/true),
+      values.to(self._values().options(), non_blocking, /*copy=*/true));
+}
+
+// TODO: put this into the public API
+inline bool is_same_tensor(const Tensor& lhs, const Tensor& rhs) {
+  return lhs.unsafeGetTensorImpl() == rhs.unsafeGetTensorImpl();
+}
+
+inline bool is_same_density(const SparseTensor& self, const SparseTensor& src) {
+  return self.sparse_dim() == src.sparse_dim() &&
+      self.dense_dim() == src.dense_dim();
+}
+
+// Give us a new values tensor, with the same dimensionality
+// as 'values' but with a new number of non-zero elements.
+// TODO: Expose this for real in ATen, some day?
+// NB: Doesn't preserve data.
+inline Tensor new_values_with_size_of(const Tensor& values, int64_t nnz) {
+  std::vector size = values.sizes().vec();
+  size[0] = nnz;
+  return at::empty(size, values.options());
+}
+
+// NOTE [ Flatten Sparse Indices ]
+// This helper function flattens a sparse indices tensor (a Tensor) into a 1D
+// indices tensor. E.g.,
+//   input = [[2, 4, 0],
+//            [3, 1, 10]]
+//   full_size = [2, 12]
+//   output = [ 2 * 12 + 3, 4 * 12 + 1, 0 * 12 + 10 ] = [27, 49, 10]
+//
+// In other words, assuming that each `indices[i, :]` is a valid index into a
+// tensor `t` of shape `full_size`, this returns the corresponding indices into
+// the flattened tensor `t.reshape( prod(full_size[:indices.size(0)]), -1 )`.
+// If force_clone is true, the result will be forced to be a clone of self.
+TORCH_API Tensor flatten_indices(
+    const Tensor& indices,
+    IntArrayRef full_size,
+    bool force_clone = false);
+
+// Flatten sparse tensor's indices from nD to 1D, similar to NOTE [ Flatten
+// Sparse Indices ], except this one allows partial flatten: only flatten on
+// specified dims. Note that the flatten indices might be uncoalesced if
+// dims_to_flatten.size() < sparse_dim. Also if input indices is already
+// coalesced, the flattened indices will also be sorted.
+//
+// args:
+//    indices: sparse tensor indices
+//    sizes: sparse tensor sizes
+//    dims_to_flatten: a list of dim index to flatten
+//
+// Ex1:
+//   indices = [[2, 4, 0],
+//             [3, 1, 3]]
+//   sizes = [2, 12]
+//   dims_to_flatten = [0, 1]
+//   new_indices = [ 2 * 12 + 3, 4 * 12 + 1, 0 * 12 + 3 ] = [27, 49, 3]
+//
+// Ex2:
+//   dims_to_flatten = [1]
+//   new_indices = [ 3, 1, 3 ]  # uncoalesced
+TORCH_API Tensor flatten_indices_by_dims(
+    const Tensor& indices,
+    const IntArrayRef& sizes,
+    const IntArrayRef& dims_to_flatten);
+
+// Find the CSR representation for a row `indices` from the COO format
+TORCH_API Tensor coo_to_csr(const int64_t* indices, int64_t dim, int64_t nnz);
+
+TORCH_API Tensor zeros_like_with_indices(const Tensor& t);
+
+template <size_t static_shape_max_len>
+class TensorGeometryHolder {
+  using geometry_holder_t = std::array<int64_t, static_shape_max_len>;
+
+ public:
+  explicit TensorGeometryHolder(
+      IntArrayRef sizes,
+      IntArrayRef strides,
+      TensorOptions options = {}) {
+    std::copy(sizes.begin(), sizes.end(), t_sizes.begin());
+    std::copy(strides.begin(), strides.end(), t_strides.begin());
+  }
+
+  explicit TensorGeometryHolder(const Tensor& t)
+      : TensorGeometryHolder(t.sizes(), t.strides()) {}
+
+  auto operator*() const {
+    return std::make_tuple(t_sizes, t_strides);
+  }
+
+ private:
+  geometry_holder_t t_sizes;
+  geometry_holder_t t_strides;
+};
+
+template <>
+class TensorGeometryHolder<0> {
+  using geometry_holder_t = Tensor;
+
+ public:
+  explicit TensorGeometryHolder(
+      IntArrayRef sizes,
+      IntArrayRef strides,
+      TensorOptions options) {
+    const int64_t t_ndims = sizes.size();
+    const auto cpu_options = TensorOptions(options).dtype(kLong).device(kCPU);
+    Tensor t_sizes_and_strides_cpu = at::empty({2, t_ndims}, cpu_options);
+    t_sizes_and_strides_cpu.select(0, 0).copy_(at::tensor(sizes, cpu_options));
+    t_sizes_and_strides_cpu.select(0, 1).copy_(
+        at::tensor(strides, cpu_options));
+    const Tensor t_sizes_and_strides =
+        t_sizes_and_strides_cpu.to(options.device());
+    t_sizes = t_sizes_and_strides.select(0, 0);
+    t_strides = t_sizes_and_strides.select(0, 1);
+  }
+
+  explicit TensorGeometryHolder(const Tensor& t)
+      : TensorGeometryHolder(t.sizes(), t.strides(), t.options()) {}
+
+  auto operator*() const {
+    return std::make_tuple(
+        t_sizes.template data_ptr<int64_t>(),
+        t_strides.template data_ptr<int64_t>());
+  }
+
+ private:
+  geometry_holder_t t_sizes;
+  geometry_holder_t t_strides;
+};
+
+// Return all indices of a tensor with the given shape.
+//
+// full_coo_indices(shape) is equivalent to
+// torch.ones(shape).nonzero().transpose(-2, -1) but much faster.
+TORCH_API Tensor full_coo_indices(IntArrayRef sizes, TensorOptions options);
+
+} // namespace at::sparse
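flatten_indices is plain row-major linearization of the sparse COO indices against full_size, exactly as described in NOTE [ Flatten Sparse Indices ]. A standalone sketch on std::vector data reproducing the numbers from that note (names are illustrative, not the TORCH_API function declared above):

#include <cassert>
#include <cstdint>
#include <vector>

// indices is sparse_dim x nnz (row major here); full_size holds the size of
// each sparse dimension.
std::vector<int64_t> flatten_coo_indices(const std::vector<std::vector<int64_t>>& indices,
                                         const std::vector<int64_t>& full_size) {
  size_t nnz = indices.empty() ? 0 : indices[0].size();
  std::vector<int64_t> flat(nnz, 0);
  for (size_t d = 0; d < indices.size(); ++d) {
    for (size_t i = 0; i < nnz; ++i) {
      flat[i] = flat[i] * full_size[d] + indices[d][i];  // Horner-style row-major fold
    }
  }
  return flat;
}

int main() {
  // Same numbers as the NOTE: indices [[2,4,0],[3,1,10]], full_size [2,12] -> [27, 49, 10].
  std::vector<std::vector<int64_t>> idx = {{2, 4, 0}, {3, 1, 10}};
  assert((flatten_coo_indices(idx, {2, 12}) == std::vector<int64_t>{27, 49, 10}));
}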
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/SpectralOpsUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/SpectralOpsUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..5a35d87e522307c99b9d0b95dfcc46bd0a93fe00
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/SpectralOpsUtils.h
@@ -0,0 +1,84 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace at::native {
+
+// Normalization types used in _fft_with_size
+enum class fft_norm_mode {
+  none,       // No normalization
+  by_root_n,  // Divide by sqrt(signal_size)
+  by_n,       // Divide by signal_size
+};
+
+// NOTE [ Fourier Transform Conjugate Symmetry ]
+//
+// Real-to-complex Fourier transform satisfies the conjugate symmetry. That is,
+// assuming X is the transformed K-dimensional signal, we have
+//
+//     X[i_1, ..., i_K] = X[j_1, ..., j_K]*,
+//
+//       where j_k  = (N_k - i_k)  mod N_k, N_k being the signal size at dim k,
+//             * is the conjugate operator.
+//
+// Therefore, in such cases, FFT libraries return only roughly half of the
+// values to avoid redundancy:
+//
+//     X[:, :, ..., :floor(N / 2) + 1]
+//
+// This is also the assumption in cuFFT and MKL. In ATen SpectralOps, such a
+// halved signal will also be returned by default (flag onesided=True).
+// The following infer_ft_real_to_complex_onesided_size function calculates the
+// onesided size from the twosided size.
+//
+// Note that this loses some information about the size of the signal at the
+// last dimension. E.g., both 11 and 10 map to 6. Hence, the following
+// infer_ft_complex_to_real_onesided_size function takes an optional parameter
+// to infer the twosided size from the given onesided size.
+//
+// cuFFT doc: http://docs.nvidia.com/cuda/cufft/index.html#multi-dimensional
+// MKL doc: https://software.intel.com/en-us/mkl-developer-reference-c-dfti-complex-storage-dfti-real-storage-dfti-conjugate-even-storage#CONJUGATE_EVEN_STORAGE
+
+inline int64_t infer_ft_real_to_complex_onesided_size(int64_t real_size) {
+  return (real_size / 2) + 1;
+}
+
+inline int64_t infer_ft_complex_to_real_onesided_size(int64_t complex_size,
+                                                      int64_t expected_size=-1) {
+  int64_t base = (complex_size - 1) * 2;
+  if (expected_size < 0) {
+    return base + 1;
+  } else if (base == expected_size) {
+    return base;
+  } else if (base + 1 == expected_size) {
+    return base + 1;
+  } else {
+    std::ostringstream ss;
+    ss << "expected real signal size " << expected_size << " is incompatible "
+       << "with onesided complex frequency size " << complex_size;
+    AT_ERROR(ss.str());
+  }
+}
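+
+// Illustration only: both an even (10) and an odd (11) real signal map onto 6
+// one-sided complex values, so recovering the real size needs the optional
+// expected_size hint. The helper below is a hypothetical sanity-check sketch.
+inline void onesided_size_example() {
+  TORCH_INTERNAL_ASSERT(infer_ft_real_to_complex_onesided_size(10) == 6);
+  TORCH_INTERNAL_ASSERT(infer_ft_real_to_complex_onesided_size(11) == 6);
+  // Without a hint, the odd candidate 2 * (6 - 1) + 1 == 11 is returned.
+  TORCH_INTERNAL_ASSERT(infer_ft_complex_to_real_onesided_size(6) == 11);
+  // With a hint, the matching even size is recovered.
+  TORCH_INTERNAL_ASSERT(infer_ft_complex_to_real_onesided_size(6, 10) == 10);
+}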
+
+using fft_fill_with_conjugate_symmetry_fn =
+    void (*)(ScalarType dtype, IntArrayRef mirror_dims, IntArrayRef half_sizes,
+             IntArrayRef in_strides, const void* in_data,
+             IntArrayRef out_strides, void* out_data);
+DECLARE_DISPATCH(fft_fill_with_conjugate_symmetry_fn, fft_fill_with_conjugate_symmetry_stub);
+
+// In real-to-complex transform, cuFFT and MKL only fill half of the values
+// due to conjugate symmetry. This function fills in the other half of the full
+// fft by using the Hermitian symmetry in the signal.
+// self should be the shape of the full signal and dims.back() should be the
+// one-sided dimension.
+// See NOTE [ Fourier Transform Conjugate Symmetry ]
+TORCH_API void _fft_fill_with_conjugate_symmetry_(const Tensor& self, IntArrayRef dims);
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/StridedRandomAccessor.h b/MLPY/Lib/site-packages/torch/include/ATen/native/StridedRandomAccessor.h
new file mode 100644
index 0000000000000000000000000000000000000000..5ee7da926bae6bdf5c8c9e1149152f6c75f263aa
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/StridedRandomAccessor.h
@@ -0,0 +1,301 @@
+#pragma once
+
+namespace at::native {
+
+// (Const)StridedRandomAccessor is a
+// (const) random access iterator defined over
+// a strided array.
+
+// The traits below are to introduce __restrict__
+// modifier on different platforms.
+
+template <typename T>
+struct DefaultPtrTraits {
+  using PtrType = T*;
+};
+
+#if (defined(_WIN32) || defined(_WIN64))
+#define RESTRICT __restrict
+#else
+#define RESTRICT __restrict__
+#endif
+
+template <typename T>
+struct RestrictPtrTraits {
+  using PtrType = T* RESTRICT;
+};
+
+template <
+  typename T,
+  typename index_t = int64_t,
+  template <typename U> class PtrTraits = DefaultPtrTraits
+>
+class ConstStridedRandomAccessor {
+public:
+  using difference_type = index_t;
+  using value_type = const T;
+  using pointer = const typename PtrTraits<T>::PtrType;
+  using reference = const value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  using PtrType = typename PtrTraits<T>::PtrType;
+  using index_type = index_t;
+
+  // Constructors {
+  C10_HOST_DEVICE
+  ConstStridedRandomAccessor(PtrType ptr, index_t stride)
+    : ptr{ptr}, stride{stride}
+  {}
+
+  C10_HOST_DEVICE
+  explicit ConstStridedRandomAccessor(PtrType ptr)
+    : ptr{ptr}, stride{static_cast<index_t>(1)}
+  {}
+
+  C10_HOST_DEVICE
+  ConstStridedRandomAccessor()
+    : ptr{nullptr}, stride{static_cast<index_t>(1)}
+  {}
+  // }
+
+  // Pointer-like operations {
+  C10_HOST_DEVICE
+  reference operator*() const {
+    return *ptr;
+  }
+
+  C10_HOST_DEVICE
+  const value_type* operator->() const {
+    return reinterpret_cast<const value_type*>(ptr);
+  }
+
+  C10_HOST_DEVICE
+  reference operator[](index_t idx) const {
+    return ptr[idx * stride];
+  }
+  // }
+
+  // Prefix/postfix increment/decrement {
+  C10_HOST_DEVICE
+  ConstStridedRandomAccessor& operator++() {
+    ptr += stride;
+    return *this;
+  }
+
+  C10_HOST_DEVICE
+  ConstStridedRandomAccessor operator++(int) {
+    ConstStridedRandomAccessor copy(*this);
+    ++*this;
+    return copy;
+  }
+
+  C10_HOST_DEVICE
+  ConstStridedRandomAccessor& operator--() {
+    ptr -= stride;
+    return *this;
+  }
+
+  C10_HOST_DEVICE
+  ConstStridedRandomAccessor operator--(int) {
+    ConstStridedRandomAccessor copy(*this);
+    --*this;
+    return copy;
+  }
+  // }
+
+  // Arithmetic operations {
+  C10_HOST_DEVICE
+  ConstStridedRandomAccessor& operator+=(index_t offset) {
+    ptr += offset * stride;
+    return *this;
+  }
+
+  C10_HOST_DEVICE
+  ConstStridedRandomAccessor operator+(index_t offset) const {
+    return ConstStridedRandomAccessor(ptr + offset * stride, stride);
+  }
+
+  C10_HOST_DEVICE
+  friend ConstStridedRandomAccessor operator+(
+    index_t offset,
+    const ConstStridedRandomAccessor& accessor
+  ) {
+    return accessor + offset;
+  }
+
+  C10_HOST_DEVICE
+  ConstStridedRandomAccessor& operator-=(index_t offset) {
+    ptr -= offset * stride;
+    return *this;
+  }
+
+  C10_HOST_DEVICE
+  ConstStridedRandomAccessor operator-(index_t offset) const {
+    return ConstStridedRandomAccessor(ptr - offset * stride, stride);
+  }
+
+  // Note that this operator is well-defined when `this` and `other`
+  // represent the same sequences, i.e. when
+  // 1. this.stride == other.stride,
+  // 2. |other - this| / this.stride is an Integer.
+  C10_HOST_DEVICE
+  difference_type operator-(const ConstStridedRandomAccessor& other) const {
+    return (ptr - other.ptr) / stride;
+  }
+  // }
+
+  // Comparison operators {
+  C10_HOST_DEVICE
+  bool operator==(const ConstStridedRandomAccessor& other) const {
+    return (ptr == other.ptr) && (stride == other.stride);
+  }
+
+  C10_HOST_DEVICE
+  bool operator!=(const ConstStridedRandomAccessor& other) const {
+    return !(*this == other);
+  }
+
+  C10_HOST_DEVICE
+  bool operator<(const ConstStridedRandomAccessor& other) const {
+    return ptr < other.ptr;
+  }
+
+  C10_HOST_DEVICE
+  bool operator<=(const ConstStridedRandomAccessor& other) const {
+    return (*this < other) || (*this == other);
+  }
+
+  C10_HOST_DEVICE
+  bool operator>(const ConstStridedRandomAccessor& other) const {
+    return !(*this <= other);
+  }
+
+  C10_HOST_DEVICE
+  bool operator>=(const ConstStridedRandomAccessor& other) const {
+    return !(*this < other);
+  }
+  // }
+
+protected:
+  PtrType ptr;
+  index_t stride;
+};
+
+template <
+  typename T,
+  typename index_t = int64_t,
+  template <typename U> class PtrTraits = DefaultPtrTraits
+>
+class StridedRandomAccessor
+  : public ConstStridedRandomAccessor<T, index_t, PtrTraits> {
+public:
+  using difference_type = index_t;
+  using value_type = T;
+  using pointer = typename PtrTraits<T>::PtrType;
+  using reference = value_type&;
+
+  using BaseType = ConstStridedRandomAccessor<T, index_t, PtrTraits>;
+  using PtrType = typename PtrTraits<T>::PtrType;
+
+  // Constructors {
+  C10_HOST_DEVICE
+  StridedRandomAccessor(PtrType ptr, index_t stride)
+    : BaseType(ptr, stride)
+  {}
+
+  C10_HOST_DEVICE
+  explicit StridedRandomAccessor(PtrType ptr)
+    : BaseType(ptr)
+  {}
+
+  C10_HOST_DEVICE
+  StridedRandomAccessor()
+    : BaseType()
+  {}
+  // }
+
+  // Pointer-like operations {
+  C10_HOST_DEVICE
+  reference operator*() const {
+    return *this->ptr;
+  }
+
+  C10_HOST_DEVICE
+  value_type* operator->() const {
+    return reinterpret_cast<value_type*>(this->ptr);
+  }
+
+  C10_HOST_DEVICE
+  reference operator[](index_t idx) const {
+    return this->ptr[idx * this->stride];
+  }
+  // }
+
+  // Prefix/postfix increment/decrement {
+  C10_HOST_DEVICE
+  StridedRandomAccessor& operator++() {
+    this->ptr += this->stride;
+    return *this;
+  }
+
+  C10_HOST_DEVICE
+  StridedRandomAccessor operator++(int) {
+    StridedRandomAccessor copy(*this);
+    ++*this;
+    return copy;
+  }
+
+  C10_HOST_DEVICE
+  StridedRandomAccessor& operator--() {
+    this->ptr -= this->stride;
+    return *this;
+  }
+
+  C10_HOST_DEVICE
+  StridedRandomAccessor operator--(int) {
+    StridedRandomAccessor copy(*this);
+    --*this;
+    return copy;
+  }
+  // }
+
+  // Arithmetic operations {
+  C10_HOST_DEVICE
+  StridedRandomAccessor& operator+=(index_t offset) {
+    this->ptr += offset * this->stride;
+    return *this;
+  }
+
+  C10_HOST_DEVICE
+  StridedRandomAccessor operator+(index_t offset) const {
+    return StridedRandomAccessor(this->ptr + offset * this->stride, this->stride);
+  }
+
+  C10_HOST_DEVICE
+  friend StridedRandomAccessor operator+(
+    index_t offset,
+    const StridedRandomAccessor& accessor
+  ) {
+    return accessor + offset;
+  }
+
+  C10_HOST_DEVICE
+  StridedRandomAccessor& operator-=(index_t offset) {
+    this->ptr -= offset * this->stride;
+    return *this;
+  }
+
+  C10_HOST_DEVICE
+  StridedRandomAccessor operator-(index_t offset) const {
+    return StridedRandomAccessor(this->ptr - offset * this->stride, this->stride);
+  }
+
+  // Note that here we call BaseType::operator- version
+  C10_HOST_DEVICE
+  difference_type operator-(const BaseType& other) const {
+    return (static_cast<const BaseType&>(*this) - other);
+  }
+  // }
+};
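+
+// Illustration only: with stride == cols, the accessor walks down one column
+// of a row-major rows x cols buffer, so column[i] reads data[col + i * cols].
+// The helper below is a hypothetical usage sketch.
+inline float sum_column_sketch(float* data, int64_t rows, int64_t cols, int64_t col) {
+  ConstStridedRandomAccessor<float> column(data + col, cols);
+  float acc = 0.f;
+  for (int64_t i = 0; i < rows; ++i) {
+    acc += column[i];
+  }
+  return acc;
+}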
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/TensorAdvancedIndexing.h b/MLPY/Lib/site-packages/torch/include/ATen/native/TensorAdvancedIndexing.h
new file mode 100644
index 0000000000000000000000000000000000000000..7dea4a1a279d78e430a6aaf4258bdabdab2a5e71
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/TensorAdvancedIndexing.h
@@ -0,0 +1,49 @@
+#pragma once
+
+// Indexing tensors by tensors
+
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+struct TensorIterator;
+}
+
+namespace at::native {
+
+using index_put_with_sort_fn = void(*)(Tensor &, const c10::List> &, const Tensor &, bool accumulate, bool unsafe);
+using index_put_with_sort_quantized_fn = void(*)(Tensor& self, const c10::List>& indices, const Tensor& value, double scale, int zero_point, bool unsafe);
+using gather_fn = void (*)(const Tensor & result, const Tensor & self, int64_t dim, const Tensor & index);
+using scatter_fn = void(*)(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src);
+using scatter_fill_fn = void(*)(const Tensor& self, int64_t dim, const Tensor& index, const Scalar& src);
+using scatter_add_fn = void(*)(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src);
+using scatter_reduce_fn = void(*)(const Tensor& self, const int64_t dim, const Tensor& index,
+                                  const Tensor& src, const ReductionType& reduce);
+using scatter_scalar_reduce_fn = void(*)(const Tensor& self, const int64_t dim, const Tensor& index,
+                                         const Scalar& value, const ReductionType& reduce);
+using scatter_reduce_two_fn = void(*)(const Tensor& self, const int64_t dim, const Tensor& index,
+                                      const Tensor& src, const ReductionType& reduce);
+
+DECLARE_DISPATCH(index_put_with_sort_fn, index_put_with_sort_stub);
+DECLARE_DISPATCH(index_put_with_sort_quantized_fn, index_put_with_sort_quantized_stub);
+DECLARE_DISPATCH(gather_fn, gather_stub);
+DECLARE_DISPATCH(scatter_fn, scatter_stub);
+DECLARE_DISPATCH(scatter_fill_fn, scatter_fill_stub);
+DECLARE_DISPATCH(scatter_add_fn, scatter_add_stub);
+DECLARE_DISPATCH(scatter_reduce_fn, scatter_reduce_stub);
+DECLARE_DISPATCH(scatter_scalar_reduce_fn, scatter_scalar_reduce_stub);
+DECLARE_DISPATCH(scatter_reduce_two_fn, scatter_reduce_two_stub);
+
+TORCH_API Tensor& index_out(Tensor& result, const Tensor & self, const c10::List>& indices);
+
+using scatter_add_expanded_index_fn = void(*)(const Tensor&, const Tensor&, const Tensor&);
+using scatter_reduce_expanded_index_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const ReductionType& reduce, bool);
+using gather_expanded_index_fn = void (*)(const Tensor&, const Tensor&, const Tensor&);
+
+DECLARE_DISPATCH(scatter_add_expanded_index_fn, scatter_add_expanded_index_stub);
+DECLARE_DISPATCH(scatter_reduce_expanded_index_fn, scatter_reduce_expanded_index_stub);
+DECLARE_DISPATCH(gather_expanded_index_fn, gather_expanded_index_stub);
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/TensorAdvancedIndexingUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/TensorAdvancedIndexingUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..a8934989512f22ab96df8d348050673a6782e2ee
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/TensorAdvancedIndexingUtils.h
@@ -0,0 +1,92 @@
+#pragma once
+#include 
+#include 
+#include 
+
+namespace at::native {
+namespace {
+static std::string shapes_as_str(TensorList tensors) {
+  std::ostringstream os;
+  bool first = true;
+  for (auto& tensor : tensors) {
+    if (tensor.defined()) {
+      if (!first) {
+        os << ", ";
+      }
+      os << tensor.sizes();
+      first = false;
+    }
+  }
+  return os.str();
+}
+} // anonymous namespace
+
+static std::tuple canDispatchToMaskedFill(const Tensor& self, const torch::List>& indices,
+const Tensor& value){
+  if (!(value.numel() ==1 && value.device().is_cpu())){
+    return std::make_tuple(false,Tensor());
+  }
+  int64_t num_ind = 0;
+  Tensor mask;
+  auto self_device = self.device();
+  for (const c10::optional& i: indices) {
+    if (!i.has_value() || !(*i).defined()){
+      num_ind++;
+    } else {
+      const Tensor &index = *i;
+      if ((index.scalar_type() != kByte && index.scalar_type() != kBool) ||
+          index.device() != self_device || mask.defined()){
+        return std::make_tuple(false, Tensor());
+      } else {
+        mask = index;
+        for (const auto j : c10::irange(index.dim())) {
+          int64_t srcIdx = num_ind + j;
+          TORCH_CHECK_INDEX(index.size(j) == self.size(srcIdx), "The shape of the mask ", index.sizes(), " at index ", j,
+  " does not match the shape of the indexed tensor ", self.sizes(), " at index ", srcIdx);
+        }
+        num_ind += mask.ndimension();
+      }
+    }
+  }
+  for (C10_UNUSED const auto i : c10::irange(num_ind, self.ndimension())) {
+    mask = mask.unsqueeze(-1);
+  }
+  return std::make_tuple(true, mask);
+}
+
+static AdvancedIndex make_info(Tensor self, IOptTensorListRef orig) {
+  checkIndexTensorTypes(orig, /*allow_int*/ true);
+  // first expand BoolTensor (masks) or ByteTensor (masks) into 1 or more LongTensors
+  auto indices = expandTensors(self, orig);
+  // next broadcast all index tensors together
+  try {
+    indices = expand_outplace(indices);
+  } catch (std::exception& e) {
+    TORCH_CHECK_INDEX(false, "shape mismatch: indexing tensors could not be broadcast together"
+                   " with shapes ", shapes_as_str(indices));
+  }
+  // add missing null Tensors so that it matches self.dim()
+  while (indices.size() < (size_t)self.dim()) {
+    indices.emplace_back();
+  }
+  // if the non-null indices are not all adjacent, transpose self and indices
+  // together so that they're adjacent at the front
+  if (!hasContiguousSubspace(indices)) {
+    std::tie(self, indices) = transposeToFront(self, indices);
+  }
+  // Ensure indices are on the same device as self
+  for (auto & indice : indices) {
+    if (indice.defined() && indice.device() != self.device()) {
+      indice = indice.to(self.device());
+    }
+  }
+  for (auto & indice : indices) {
+    if (indice.defined() && indice.dtype() == at::kInt) {
+      indice = indice.to(at::kLong);
+    }
+  }
+
+  return AdvancedIndex(self, indices);
+}
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/TensorCompare.h b/MLPY/Lib/site-packages/torch/include/ATen/native/TensorCompare.h
new file mode 100644
index 0000000000000000000000000000000000000000..f61d336c5671fc1c0c35356479213192b04501ce
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/TensorCompare.h
@@ -0,0 +1,49 @@
+#pragma once
+
+#include 
+
+namespace c10 {
+class Scalar;
+}
+
+namespace at {
+class Tensor;
+struct TensorIterator;
+struct TensorIteratorBase;
+}
+
+namespace at::native {
+
+using reduce_minmax_fn =
+    void (*)(Tensor&, Tensor&, const Tensor&, int64_t, bool);
+using structured_reduce_minmax_fn =
+    void (*)(const Tensor&, const Tensor&, const Tensor&, int64_t, bool);
+
+DECLARE_DISPATCH(structured_reduce_minmax_fn, max_stub);
+DECLARE_DISPATCH(structured_reduce_minmax_fn, min_stub);
+
+using where_fn = void (*)(TensorIterator &);
+DECLARE_DISPATCH(where_fn, where_kernel);
+
+using is_infinity_op_fn = void (*)(TensorIteratorBase &);
+DECLARE_DISPATCH(is_infinity_op_fn, isposinf_stub);
+DECLARE_DISPATCH(is_infinity_op_fn, isneginf_stub);
+
+using mode_fn = void (*)(Tensor&, Tensor&, const Tensor&, int64_t, bool);
+DECLARE_DISPATCH(mode_fn, mode_stub);
+
+using clamp_tensor_fn = void (*)(TensorIteratorBase &);
+DECLARE_DISPATCH(clamp_tensor_fn, clamp_stub);
+
+namespace detail {
+    enum class ClampLimits {Min, Max, MinMax};
+}
+
+DECLARE_DISPATCH(void (*)(TensorIteratorBase &, const c10::Scalar&, const c10::Scalar&), clamp_scalar_stub);
+DECLARE_DISPATCH(void (*)(TensorIteratorBase &, c10::Scalar), clamp_min_scalar_stub);
+DECLARE_DISPATCH(void (*)(TensorIteratorBase &, c10::Scalar), clamp_max_scalar_stub);
+
+using isin_default_fn = void (*)(const Tensor&, const Tensor&, bool, const Tensor&);
+DECLARE_DISPATCH(isin_default_fn, isin_default_stub);
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/TensorConversions.h b/MLPY/Lib/site-packages/torch/include/ATen/native/TensorConversions.h
new file mode 100644
index 0000000000000000000000000000000000000000..cf0ae4482d5eebc7a530283c9558ac3d0dde4408
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/TensorConversions.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+  class Tensor;
+namespace native {
+bool to_will_alias(
+    const Tensor& self,
+    c10::optional dtype,
+    c10::optional layout,
+    c10::optional device,
+    bool copy,
+    c10::optional optional_memory_format);
+
+Tensor to_meta(const Tensor& tensor);
+c10::optional to_meta(const c10::optional& tensor);
+std::vector to_meta(at::ITensorListRef t_list);
+Tensor dense_to_sparse_with_mask(const Tensor& self, const Tensor& mask, c10::optional layout, OptionalIntArrayRef blocksize, c10::optional dense_dim_opt);
+
+} // namespace native
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/TensorDimApply.h b/MLPY/Lib/site-packages/torch/include/ATen/native/TensorDimApply.h
new file mode 100644
index 0000000000000000000000000000000000000000..6cbd8c432f9885185022ceb4dc8257e2d934d78d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/TensorDimApply.h
@@ -0,0 +1,55 @@
+#pragma once
+#include 
+#include 
+
+namespace at::native {
+// input tensors are non-zero dim and non-empty
+template <typename T1, typename T2, typename Function>
+void tensor_dim_apply3(const Tensor& self, Tensor& values, Tensor& indices, int64_t dim, Function func) {
+  int ndims = self.dim();
+  int tensor_dim_apply_has_finished = 0;
+  std::vector counter(ndims, 0);
+  const T1* self_data = self.const_data_ptr<T1>();
+  T1* values_data = values.data_ptr<T1>();
+  T2* indices_data = indices.data_ptr<T2>();
+  int64_t self_stride = self.stride(dim);
+  int64_t values_stride = values.stride(dim);
+  int64_t indices_stride = indices.stride(dim);
+  int self_dim_size = self.size(dim);
+
+  while (!tensor_dim_apply_has_finished) {
+    func(self_data, values_data, indices_data, self_dim_size, self_stride, values_stride, indices_stride);
+    if (ndims == 1) {
+       break;
+    }
+    for (const auto dim_i : c10::irange(ndims)) {
+      if (dim_i == dim) {
+        if (dim_i == (ndims - 1)) {
+          tensor_dim_apply_has_finished = 1;
+          break;
+        }
+        continue;
+      }
+      counter[dim_i]++;
+      self_data += self.stride(dim_i);
+      values_data += values.stride(dim_i);
+      indices_data += indices.stride(dim_i);
+
+      if (counter[dim_i] == self.size(dim_i)) {
+        if (dim_i == ndims-1) {
+          tensor_dim_apply_has_finished = 1;
+          break;
+        } else {
+          self_data -= counter[dim_i]*self.stride(dim_i);
+          values_data -= counter[dim_i]*values.stride(dim_i);
+          indices_data -= counter[dim_i]*indices.stride(dim_i);
+          counter[dim_i] = 0;
+        }
+      } else {
+        break;
+     }
+    }
+  }
+}
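+
+// Illustration only: tensor_dim_apply3 invokes `func` once per 1-D slice along
+// `dim`, handing it raw slice pointers plus the slice length and per-tensor
+// strides (the pattern used by the cummax/cummin CPU kernels). The helper
+// below is a hypothetical sketch assuming float values and int64_t indices.
+inline void running_max_along_dim_sketch(
+    const Tensor& self, Tensor& values, Tensor& indices, int64_t dim) {
+  tensor_dim_apply3<float, int64_t>(self, values, indices, dim,
+    [](const float* self_p, float* val_p, int64_t* idx_p,
+       int64_t size, int64_t self_st, int64_t val_st, int64_t idx_st) {
+      float best = self_p[0];
+      int64_t best_i = 0;
+      for (int64_t i = 0; i < size; ++i) {
+        if (self_p[i * self_st] > best) {
+          best = self_p[i * self_st];
+          best_i = i;
+        }
+        val_p[i * val_st] = best;
+        idx_p[i * idx_st] = best_i;
+      }
+    });
+}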
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/TensorFactories.h b/MLPY/Lib/site-packages/torch/include/ATen/native/TensorFactories.h
new file mode 100644
index 0000000000000000000000000000000000000000..7eb70f18fb15e4926ed5b24edc9118569a1bebb7
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/TensorFactories.h
@@ -0,0 +1,142 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include 
+#else
+#include 
+#endif
+
+namespace at::native {
+// Different combinations of row, col, and offset can lead to two cases:
+//
+// Case 1 - Trapezoid (Triangle as a special case): row + offset <= col
+//    Example A: offset > 0
+//      1 1 0 0 0
+//      1 1 1 0 0
+//      1 1 1 1 0
+//    Example B: offset <= 0
+//      0 0 0
+//      1 0 0
+//      1 1 0
+//    In this case, we calculate the number of elements in the first row and
+//    last row of the tril respectively, and then compute the tril size.
+//
+// Case 2 - Trapezoid + Rectangle: row + offset > col
+//    Example:
+//      1 1 0
+//      1 1 1
+//      1 1 1
+//    In this case, we first calculate the size of top trapezoid, and then
+//    calculate the size of the bottom rectangle.
+inline int64_t get_tril_size(int64_t row, int64_t col, int64_t offset) {
+  // If either dimension is 0 then there is no tril
+  if (row == 0 || col == 0) {
+    return 0;
+  }
+  // number of elements in the first row of the tril
+  auto m_first_row = offset > 0 ?
+    std::min<int64_t>(col, 1 + offset) : // upper bounded by col
+    row + offset > 0; // either 0 or 1
+  // number of elements in the last row of the tril, bounded by [0, col]
+  auto m_last_row = std::max<int64_t>(0, std::min<int64_t>(col, row + offset));
+  // number of rows, bounded by [0, row]
+  auto n_row_all = std::max<int64_t>(0, std::min<int64_t>(row, row + offset));
+  auto n_row_trapezoid = (m_last_row - m_first_row + 1);
+
+  // calculate # of elements in the top trapezoid
+  auto tril_size = (m_first_row + m_last_row) * n_row_trapezoid >> 1;
+
+  // calculate # of elements in the bottom rectangle if there is any
+  auto diff_row = n_row_all - n_row_trapezoid;
+  if (diff_row > 0) {
+    tril_size += diff_row * col;
+  }
+
+  return tril_size;
+}
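+
+// Illustration only: a worked check of the two cases above. Example A
+// (3 x 5, offset = 1) is a pure trapezoid with 2 + 3 + 4 = 9 elements, while
+// the Case 2 example (3 x 3, offset = 1) is a trapezoid of 5 plus a 1 x 3
+// rectangle, i.e. 8 elements. The helper is a hypothetical sanity check.
+inline void get_tril_size_example() {
+  TORCH_INTERNAL_ASSERT(get_tril_size(3, 5, 1) == 9);
+  TORCH_INTERNAL_ASSERT(get_tril_size(3, 3, 1) == 8);
+}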
+
+inline void check_args(
+    int64_t row, int64_t col, c10::optional layout_opt) {
+  TORCH_CHECK(row >= 0, "row must be non-negative, got", row);
+  TORCH_CHECK(col >= 0, "col must be non-negative, got", col);
+  if (layout_opt.has_value()) {
+    TORCH_CHECK(
+      *layout_opt == at::kStrided,
+      "only support layout=torch.strided, got",
+      *layout_opt)
+  }
+}
+
+using at::check_size_nonnegative;
+
+// assumes maximum value in created tensor is n-1 (e.g., torch.randperm(n))
+inline void check_supported_max_int_with_precision(int64_t n, const Tensor& tensor) {
+  // match defined() to behavior of checks below
+  TORCH_CHECK(at::scalar_tensor(n>0?n-1:n, tensor.options()).defined(),
+              "n is too large for result tensor type: '", tensor.toString(), "'");
+
+  // Ensure sufficient precision for floating point representation.
+  switch (tensor.scalar_type()) {
+    case at::ScalarType::Half:
+      TORCH_CHECK(n <= (int64_t(1) << 11) + 1, "n cannot be greater than 2049 for Half type.");
+      break;
+    case at::ScalarType::Float:
+      TORCH_CHECK(n <= (int64_t(1) << 24) + 1, "n cannot be greater than 2^24+1 for Float type.");
+      break;
+    case at::ScalarType::Double:  // Unlikely to happen, but doesn't hurt to check
+      TORCH_CHECK(n <= (int64_t(1) << 53) + 1, "n cannot be greater than 2^53+1 for Double type.");
+      break;
+    default:
+      break;
+  }
+}
+
+// Called by `empty*` functions when deterministic algorithms are enabled to
+// fill the tensor with NaN if it is floating point or complex type, or fill
+// with max value if it is integer type
+inline Tensor& fill_empty_deterministic_(Tensor& tensor) {
+  if (tensor.is_floating_point() || tensor.is_complex()) {
+    AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
+      kBFloat16, kHalf, tensor.scalar_type(), "fill_empty_deterministic_", [&]() {
+        tensor.fill_(std::numeric_limits<scalar_t>::quiet_NaN());
+    });
+  } else {
+    AT_DISPATCH_V2(
+      tensor.scalar_type(), "fill_empty_deterministic_", AT_WRAP([&]() {
+        tensor.fill_(std::numeric_limits<scalar_t>::max());
+    }), kBool, AT_EXPAND(AT_INTEGRAL_TYPES_V2));
+  }
+  return tensor;
+}
+
+// The ZeroTensor allocator ignores whatever allocation is requested and always
+// gives you nullptr
+struct ZeroTensorAllocator final : public at::Allocator {
+  ZeroTensorAllocator(at::Device device) : device_(device) {};
+  ~ZeroTensorAllocator() override = default;
+  static void deleter(void* const pointer) {
+    TORCH_INTERNAL_ASSERT(!pointer);
+  }
+  DataPtr allocate(const size_t /*nbytes*/) override {
+    return {nullptr, nullptr, &deleter, device_};
+  }
+  DeleterFnPtr raw_deleter() const override {
+    return deleter;
+  }
+  void copy_data(void* dest, const void* src, std::size_t count) const final {}
+  at::Device device_;
+};
+
+using binary_fn = void (*)(TensorIterator&);
+
+DECLARE_DISPATCH(binary_fn, complex_stub);
+DECLARE_DISPATCH(binary_fn, polar_stub);
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/TensorIterator.h b/MLPY/Lib/site-packages/torch/include/ATen/native/TensorIterator.h
new file mode 100644
index 0000000000000000000000000000000000000000..4fb52e967ad7da6e58fca440b588f20767c0bf15
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/TensorIterator.h
@@ -0,0 +1,2 @@
+#pragma once
+#include 
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/TensorIteratorDynamicCasting.h b/MLPY/Lib/site-packages/torch/include/ATen/native/TensorIteratorDynamicCasting.h
new file mode 100644
index 0000000000000000000000000000000000000000..790e71dd5a5565da4c27869383a2f2436b774b7d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/TensorIteratorDynamicCasting.h
@@ -0,0 +1,52 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+
+// This file includes utilities for dynamic_casting done by TensorIterator, see CUDALoops.cuh and Loops.h.
+
+// dynamic_casting handles when the types expected by the iterator do not match the types of the arguments
+// to the function that is being called.
+// On CUDA, the cast is currently pushed down into the kernel (for performance reasons).
+// On CPU, there is currently an internal assert that a dynamic_cast is not needed.
+
+namespace at::native {
+
+// `needs_dynamic_casting` compares the types expected by iterator
+// (i.e. dtypes of the operands) with the actual type of the arguments
+// (and returns) of func_t
+template<typename func_t, int nargs=function_traits<func_t>::arity>
+struct needs_dynamic_casting {
+  static bool check(TensorIteratorBase& iter) {
+    using traits = function_traits<func_t>;
+    using cpp_type = typename traits::template arg<nargs - 1>::type;
+    using cpp_map = c10::CppTypeToScalarType<cpp_type>;
+
+    if (iter.input_dtype(nargs-1) != cpp_map::value) {
+      return true;
+    }
+    return needs_dynamic_casting<func_t, nargs - 1>::check(iter);
+  }
+};
+
+template<typename func_t>
+struct needs_dynamic_casting<func_t, 0> {
+  static bool check(TensorIteratorBase& iter) {
+    using traits = function_traits<func_t>;
+    using cpp_type = typename traits::result_type;
+
+    // we could assert output numbers are correct here, but checks
+    // (including arity) are currently pushed outside of this struct.
+    if constexpr (std::is_void_v) {
+      return false;
+    } else {
+      return iter.dtype(0) != c10::CppTypeToScalarType<cpp_type>::value;
+    }
+  }
+};
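+
+// Illustration only: for a binary float functor the check compares the
+// iterator's operand dtypes against `float` (the argument types) and the
+// output dtype against `float` (the result type), so a double-typed iterator
+// would need a dynamic cast. The helper below is a hypothetical sketch that
+// assumes a fully built iterator.
+inline bool float_add_needs_cast_sketch(TensorIteratorBase& iter) {
+  auto add = [](float a, float b) -> float { return a + b; };
+  return needs_dynamic_casting<decltype(add)>::check(iter);
+}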
+
+} //namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/TensorProperties.h b/MLPY/Lib/site-packages/torch/include/ATen/native/TensorProperties.h
new file mode 100644
index 0000000000000000000000000000000000000000..8654b3dae577b192c75c9cb8f74ea417bcd3b961
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/TensorProperties.h
@@ -0,0 +1,12 @@
+#pragma once
+
+// See NOTE: [Tensor vs. TensorBase]
+namespace at {
+class TensorBase;
+}
+
+namespace at::native {
+
+TORCH_API bool cudnn_is_acceptable(const TensorBase& self);
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/TensorShape.h b/MLPY/Lib/site-packages/torch/include/ATen/native/TensorShape.h
new file mode 100644
index 0000000000000000000000000000000000000000..623c81c0b16066fe0766405e158621097427ad59
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/TensorShape.h
@@ -0,0 +1,105 @@
+#pragma once
+#include 
+#include 
+#include 
+
+namespace at::native {
+
+TORCH_API at::Tensor clone_preserve_strides(const at::Tensor& self);
+
+inline bool cat_should_skip_tensor(const Tensor& t) {
+  return t.numel() == 0 && t.dim() == 1;
+}
+
+ // Check to see if the shape of tensors is compatible
+ // for being concatenated along a given dimension.
+inline void check_cat_shape_except_dim(const Tensor & first, const Tensor & second, int64_t dimension, int64_t index) {
+   int64_t first_dims = first.dim();
+   int64_t second_dims = second.dim();
+   TORCH_CHECK(first_dims == second_dims, "Tensors must have same number of dimensions: got ",
+               first_dims, " and ", second_dims);
+   for (const auto dim : c10::irange(first_dims)) {
+     if (dim == dimension) {
+       continue;
+     }
+     int64_t first_dim_size = first.sizes()[dim];
+     int64_t second_dim_size = second.sizes()[dim];
+     TORCH_CHECK(first_dim_size == second_dim_size, "Sizes of tensors must match except in dimension ",
+                 dimension, ". Expected size ", static_cast(first_dim_size), " but got size ", static_cast(second_dim_size), " for tensor number ", index, " in the list.");
+   }
+ }
+
+inline void check_cat_no_zero_dim(const MaterializedITensorListRef& tensors) {
+  int64_t i = 0;
+  for(const Tensor& t : tensors) {
+    TORCH_CHECK(t.dim() > 0,
+             "zero-dimensional tensor (at position ", i, ") cannot be concatenated");
+    i++;
+  }
+}
+
+inline int64_t get_num_splits(const Tensor& self, int64_t split_size, int64_t dim) {
+  TORCH_CHECK(self.dim() != 0, "split expects at least a 1-dimensional tensor");
+  TORCH_CHECK(split_size >= 0,  "split expects split_size be non-negative, but got split_size=", split_size);
+  int64_t dim_size = self.size(dim);
+  TORCH_CHECK(split_size > 0 || dim_size == 0,
+           "split_size can only be 0 if dimension size is 0, "
+           "but got dimension size of ", dim_size);
+  // if split_size is 0 and dimension size is 0, there is 1 split.
+  int64_t num_splits = 1;
+  if (split_size != 0) {
+    // Ensuring num_splits is at least 1 keeps the case where split_size > dim_size
+    // consistent (it returns a single split). We might want to error here, but keep it for BC.
+    num_splits = std::max<int64_t>((dim_size + split_size - 1) / split_size, 1);
+  }
+  return num_splits;
+}
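+
+// Illustration only: the split count is ceil(dim_size / split_size), clamped
+// to at least 1. A tensor with self.size(0) == 10 yields 4 splits of sizes
+// (3, 3, 3, 1) for split_size == 3 and a single split for split_size == 12.
+// The helper below is a hypothetical sanity check.
+inline void get_num_splits_example() {
+  const auto t = at::zeros({10});
+  TORCH_INTERNAL_ASSERT(get_num_splits(t, /*split_size=*/3, /*dim=*/0) == 4);
+  TORCH_INTERNAL_ASSERT(get_num_splits(t, /*split_size=*/12, /*dim=*/0) == 1);
+}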
+
+inline bool have_same_ndims(TensorList tensors) {
+  auto ndim = tensors[0].dim();
+  for (const auto tensor_idx : c10::irange(tensors.size())) {
+    if(tensors[tensor_idx].dim() != ndim) {
+      return false;
+    }
+  }
+  return true;
+}
+
+inline void leading_dimension_matches(TensorList tensors, int64_t dim) {
+  auto tensor_zero_size = tensors[0].sizes();
+  std::vector leading_dim_sizes(tensor_zero_size.begin(), tensor_zero_size.begin() + dim);
+  for (const auto i : c10::irange(tensors.size())) {
+    at::Tensor tensor = tensors[i];
+    for(const auto j : c10::irange(dim)) {
+      TORCH_CHECK(
+        tensor.size(j) == leading_dim_sizes[j],
+        "_chunk_cat expects same sizes of 0,...,dim-1 dimensions for all tensors"
+      );
+    }
+  }
+}
+
+inline int64_t preprocess_chunk_cat_inputs(TensorList tensors, int64_t dim, int64_t num_chunks) {
+  TORCH_CHECK(num_chunks >= 1, "_chunk_cat expects positive num_chunks");
+  TORCH_CHECK(!tensors.empty(),
+           "_chunk_cat expects a non-empty input tensor list");
+  auto expected_dtype = tensors[0].dtype();
+  auto expected_device = tensors[0].device();
+  for(const auto i : c10::irange(tensors.size())) {
+    TORCH_CHECK(tensors[i].numel() > 0, "_chunk_cat expects non-empty tensor");
+    TORCH_CHECK(tensors[i].dtype() == expected_dtype, "_chunk_cat expects all input tensors with the same dtype");
+    TORCH_CHECK(tensors[i].device() == expected_device, "_chunk_cat expects all inputs tensors on the same device");
+  }
+  if (have_same_ndims(tensors)) {
+    dim = maybe_wrap_dim(dim, tensors[0].dim());
+  } else {
+    TORCH_CHECK(dim >= 0, "_chunk_cat expects non-negative dim when input tensors have different ndims")
+    for(const auto i : c10::irange(tensors.size())) {
+      TORCH_CHECK(dim < tensors[i].ndimension(), "_chunk_cat expects dim < ndim for all input tensors");
+    }
+  }
+  leading_dimension_matches(tensors, dim);
+  return dim;
+}
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/TensorTransformations.h b/MLPY/Lib/site-packages/torch/include/ATen/native/TensorTransformations.h
new file mode 100644
index 0000000000000000000000000000000000000000..74e1e4232ce42bf827de488f3ee1ff9d50db235e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/TensorTransformations.h
@@ -0,0 +1,30 @@
+#include 
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include 
+#else
+#include 
+#endif
+
+#include 
+
+namespace at::native {
+
+static inline Tensor roll_common(const Tensor& self, IntArrayRef shifts, IntArrayRef dims) {
+  TORCH_CHECK(!shifts.empty(), "`shifts` required");
+  if (dims.empty() && shifts.size() == 1) {
+    auto flattened = self.contiguous().view(self.numel());
+    return roll(flattened, shifts[0], 0).view(self.sizes());
+  }
+  TORCH_CHECK(
+    shifts.size() == dims.size(),
+    "shifts and dimensions must align. shifts: ", shifts.size(), ", dims:", dims.size()
+  );
+  AT_ASSERT(dims.size() > 1);
+  auto tail_shifts = shifts.slice(1);
+  auto tail_dims = dims.slice(1);
+  auto first_dim_rolled = roll(self, shifts[0], dims[0]);
+  return at::roll(first_dim_rolled, tail_shifts, tail_dims);
+}
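+
+// Illustration only: roll_common peels off one (shift, dim) pair per step, so
+// for shifts = {1, 2} and dims = {0, 1} it is equivalent to the nested call
+// below. The helper is a hypothetical sketch.
+static inline Tensor roll_two_dims_sketch(const Tensor& self) {
+  return at::roll(at::roll(self, {1}, {0}), {2}, {1});
+}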
+
+}  // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/TopKImpl.h b/MLPY/Lib/site-packages/torch/include/ATen/native/TopKImpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..a8ffaf61295398c9e7a28bdcbc77d4c81e9b3846
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/TopKImpl.h
@@ -0,0 +1,98 @@
+#pragma once
+#include 
+#include 
+
+namespace at::native {
+
+#ifdef CPU_CAPABILITY
+inline namespace CPU_CAPABILITY {
+#else
+inline namespace DEFAULT {
+#endif
+
+// Core topk loop, shared between CPU and QuantizedCPU
+template 
+void topk_impl_loop(
+    const int64_t mode_values_stride,
+    const int64_t mode_indices_stride,
+    const int64_t tmp_values_stride,
+    const int64_t k,
+    const int64_t dim_size,
+    const bool largest,
+    const bool sorted,
+    char** data, const int64_t* strides, const int64_t n) {
+
+  // If k is zero, then output values and indices are empty tensors
+  // So iterating over other dims is pointless
+  if (k == 0) {
+    return;
+  }
+  using elem_t = std::pair;
+  std::vector queue(dim_size);
+  for (const auto i : c10::irange(n)) {
+    TensorAccessor mode_values(
+        reinterpret_cast(data[0] + i * strides[0]),
+        &k, &mode_values_stride);
+    TensorAccessor mode_indices(
+        reinterpret_cast(data[1] + i * strides[1]),
+        &k, &mode_indices_stride);
+    TensorAccessor tmp_values(
+        reinterpret_cast(data[2] + i * strides[2]),
+        &dim_size, &tmp_values_stride);
+
+    auto n_2 = dim_size;
+    auto use_partial_sort = k * 64 <= n_2;
+
+    for (const auto j : c10::irange(n_2)) {
+      queue[j].first = tmp_values[j];
+      queue[j].second = j;
+    }
+
+    // we want nan to be sorted as top for numpy compatibility
+    if (use_partial_sort) {
+      if (largest) {
+        std::partial_sort(queue.begin(), queue.begin() + k, queue.end(),
+          [](const elem_t& x, const elem_t& y) -> bool {
+            return ((_isnan(x.first) && !_isnan(y.first)) || (x.first > y.first));
+          });
+      } else {
+        std::partial_sort(queue.begin(), queue.begin() + k, queue.end(),
+          [](const elem_t& x, const elem_t& y) -> bool {
+            return ((!_isnan(x.first) && _isnan(y.first)) || (x.first < y.first));
+          });
+      }
+    } else {
+      if (largest) {
+        std::nth_element(queue.begin(), queue.begin() + k - 1, queue.end(),
+          [](const elem_t& x, const elem_t& y) -> bool {
+            return ((_isnan(x.first) && !_isnan(y.first)) || (x.first > y.first));
+          });
+        if (sorted) {
+          std::sort(queue.begin(), queue.begin() + k - 1,
+            [](const elem_t& x, const elem_t& y) -> bool {
+              return ((_isnan(x.first) && !_isnan(y.first)) || (x.first > y.first));
+            });
+        }
+      } else {
+        std::nth_element(queue.begin(), queue.begin() + k -1, queue.end(),
+          [](const elem_t& x, const elem_t& y) -> bool {
+            return ((!_isnan(x.first) && _isnan(y.first)) || (x.first < y.first));
+          });
+        if (sorted) {
+          std::sort(queue.begin(), queue.begin() + k -1,
+            [](const elem_t& x, const elem_t& y) -> bool {
+              return ((!_isnan(x.first) && _isnan(y.first)) || (x.first < y.first));
+            });
+        }
+      }
+    }
+
+    for (const auto j : c10::irange(k)) {
+      mode_values[j] = queue[j].first;
+      mode_indices[j] = queue[j].second;
+    }
+  }
+}
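+
+// Illustration only: the comparators above sort NaN as the largest value for
+// NumPy compatibility, and the strategy switch is a heuristic: partial_sort is
+// O(n log k) and wins when k is small relative to the slice (k * 64 <= n);
+// otherwise an O(n) nth_element selection (plus an optional sort of the k
+// winners) is used. The helper below is a hypothetical sketch of the
+// largest=true path for a plain float slice.
+inline std::vector<std::pair<float, int64_t>> topk_largest_sketch(
+    const std::vector<float>& slice, int64_t k) {
+  std::vector<std::pair<float, int64_t>> queue(slice.size());
+  for (const auto j : c10::irange(slice.size())) {
+    queue[j] = {slice[j], static_cast<int64_t>(j)};
+  }
+  const bool use_partial_sort = k * 64 <= static_cast<int64_t>(slice.size());
+  auto greater = [](const auto& x, const auto& y) { return x.first > y.first; };
+  if (use_partial_sort) {
+    std::partial_sort(queue.begin(), queue.begin() + k, queue.end(), greater);
+  } else {
+    std::nth_element(queue.begin(), queue.begin() + k - 1, queue.end(), greater);
+    std::sort(queue.begin(), queue.begin() + k - 1, greater);
+  }
+  queue.resize(k);
+  return queue;
+}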
+
+} // namespace CPU_CAPABILITY
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/TransposeType.h b/MLPY/Lib/site-packages/torch/include/ATen/native/TransposeType.h
new file mode 100644
index 0000000000000000000000000000000000000000..2ebdce31873a4ff7e6269551d374952a35f49fdc
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/TransposeType.h
@@ -0,0 +1,23 @@
+#pragma once
+#include 
+
+namespace at::native {
+
+// Used as an interface between the different BLAS-like libraries
+enum class TransposeType {
+  NoTranspose,
+  Transpose,
+  ConjTranspose,
+};
+
+// Transforms TransposeType into the BLAS / LAPACK format
+static inline char to_blas(TransposeType trans) {
+  switch (trans) {
+    case TransposeType::Transpose: return 'T';
+    case TransposeType::NoTranspose: return 'N';
+    case TransposeType::ConjTranspose: return 'C';
+  }
+  TORCH_INTERNAL_ASSERT(false, "Invalid transpose type");
+}
+
+}  // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/TriangularOpsUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/TriangularOpsUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..27fe2e18cb685b5fe32214b0fe10466d2b5d0189
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/TriangularOpsUtils.h
@@ -0,0 +1,57 @@
+#include 
+#include 
+
+namespace at::native {
+
+/*
+ * Given batches of matrices with arbitrary batch dim,
+ * computes the number of batches for Triu and Tril. This ignores stride 0 dimension
+ */
+static inline int64_t batchCountTrilTriu(const Tensor& batched_matrices) {
+  int64_t result = 1;
+  for (int64_t i = 0; i < batched_matrices.ndimension() - 2; i++) {
+    if (batched_matrices.stride(i) != 0) {
+      result *= batched_matrices.size(i);
+    }
+  }
+  return result;
+}
+
+/* Checks a necessary property for the triu and tril implementations, hence the name.
+ * Here batch contiguity is checked for tensors with greater than 4 dimensions.
+ * Contiguous tensors and tensors with less than 3 dimensions pass this check
+ */
+static inline std::tuple checkTrilTriuBatchContiguous(const Tensor& tensor, bool allow_zero_stride) {
+  // Complete contiguity is the most desired property, which is why
+  // we return true if the tensor is contiguous
+  if (tensor.is_contiguous()) {
+    auto default_strides_for_size = batched_matrix_contiguous_strides(tensor.sizes());
+    if (tensor.strides() == default_strides_for_size) {
+      return std::make_tuple(true, tensor);
+    } else {
+      return std::make_tuple(false, tensor.as_strided(tensor.sizes(), default_strides_for_size));
+    }
+  }
+
+  int64_t dims = tensor.dim();
+
+  // Tensors with dimension less than 4 are handled by default
+  if (allow_zero_stride && dims <= 3) {
+    return std::make_tuple(true, tensor);
+  }
+
+  int64_t expected_stride = tensor.size(-1) * tensor.size(-2);
+  for (int64_t i = dims - 3; i >= 0; i--) {
+    // Skip trivial dimension;
+    if (allow_zero_stride && i == 0 && (tensor.stride(i) == 0 || tensor.size(i) == 1)) {
+      continue;
+    }
+    if (expected_stride != tensor.stride(i)) {
+      return std::make_tuple(false, tensor.contiguous());
+    }
+    expected_stride *= tensor.size(i);
+  }
+  return std::make_tuple(true, tensor);
+}
+
+}  // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/TypeProperties.h b/MLPY/Lib/site-packages/torch/include/ATen/native/TypeProperties.h
new file mode 100644
index 0000000000000000000000000000000000000000..07f0028655e58f6c1305251782ad6a5e51ad7a74
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/TypeProperties.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include 
+#include 
+
+namespace at::native {
+
+struct ResultTypeState {
+  c10::ScalarType dimResult = ScalarType::Undefined;
+  c10::ScalarType wrappedResult = ScalarType::Undefined;
+  c10::ScalarType zeroResult = ScalarType::Undefined;
+};
+
+TORCH_API ResultTypeState update_result_type_state(const Tensor& tensor, const ResultTypeState& in_state);
+TORCH_API ResultTypeState update_result_type_state(const Scalar& scalar, const ResultTypeState& in_state);
+TORCH_API ScalarType result_type(const ResultTypeState& state);
+
+TORCH_API ScalarType result_type(ITensorListRef tensors);
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/UnaryOps.h b/MLPY/Lib/site-packages/torch/include/ATen/native/UnaryOps.h
new file mode 100644
index 0000000000000000000000000000000000000000..7953186237fd071e0b03fc1acee9077507b2869d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/UnaryOps.h
@@ -0,0 +1,130 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+class Tensor;
+class TensorBase;
+struct TensorIteratorBase;
+}
+
+namespace at::native {
+
+using unary_fn = void(*)(TensorIteratorBase&);
+using unary_fn_with_scalar = void(*)(TensorIteratorBase&, const Scalar& a);
+
+inline namespace CPU_CAPABILITY {
+void conj_kernel(TensorIteratorBase &iter);
+void neg_kernel(TensorIteratorBase &iter);
+void reciprocal_kernel(TensorIteratorBase &iter);
+void rsqrt_kernel(TensorIteratorBase& iter);
+void sqrt_kernel(TensorIteratorBase& iter);
+} // namespace CPU_CAPABILITY
+
+DECLARE_DISPATCH(unary_fn, abs_stub);
+DECLARE_DISPATCH(unary_fn, angle_stub);
+DECLARE_DISPATCH(unary_fn, conj_physical_stub);
+DECLARE_DISPATCH(unary_fn, acos_stub);
+DECLARE_DISPATCH(unary_fn, acosh_stub);
+DECLARE_DISPATCH(unary_fn, asinh_stub);
+DECLARE_DISPATCH(unary_fn, atanh_stub);
+DECLARE_DISPATCH(unary_fn, asin_stub);
+DECLARE_DISPATCH(unary_fn, atan_stub);
+DECLARE_DISPATCH(unary_fn, bitwise_not_stub);
+DECLARE_DISPATCH(unary_fn, logical_not_stub);
+DECLARE_DISPATCH(unary_fn, ceil_stub);
+DECLARE_DISPATCH(unary_fn, cos_stub);
+DECLARE_DISPATCH(unary_fn, cosh_stub);
+DECLARE_DISPATCH(unary_fn, digamma_stub);
+DECLARE_DISPATCH(unary_fn, special_entr_stub);
+DECLARE_DISPATCH(unary_fn, special_erfcx_stub);
+DECLARE_DISPATCH(unary_fn, erf_stub);
+DECLARE_DISPATCH(unary_fn, erfc_stub);
+DECLARE_DISPATCH(unary_fn, erfinv_stub);
+DECLARE_DISPATCH(unary_fn, exp_stub);
+DECLARE_DISPATCH(unary_fn, exp2_stub);
+DECLARE_DISPATCH(unary_fn, expm1_stub);
+DECLARE_DISPATCH(unary_fn, floor_stub);
+DECLARE_DISPATCH(unary_fn, frac_stub);
+DECLARE_DISPATCH(unary_fn, frexp_stub);
+DECLARE_DISPATCH(unary_fn, i0_stub);
+DECLARE_DISPATCH(unary_fn, special_i0e_stub);
+DECLARE_DISPATCH(unary_fn, special_i1_stub);
+DECLARE_DISPATCH(unary_fn, special_i1e_stub);
+DECLARE_DISPATCH(unary_fn, log_stub);
+DECLARE_DISPATCH(unary_fn, log10_stub);
+DECLARE_DISPATCH(unary_fn, log1p_stub);
+DECLARE_DISPATCH(unary_fn, log2_stub);
+DECLARE_DISPATCH(unary_fn, special_ndtri_stub);
+DECLARE_DISPATCH(unary_fn, special_log_ndtr_stub);
+DECLARE_DISPATCH(unary_fn, neg_stub);
+
+DECLARE_DISPATCH(unary_fn, reciprocal_stub);
+DECLARE_DISPATCH(unary_fn, round_stub);
+DECLARE_DISPATCH(unary_fn, rsqrt_stub);
+DECLARE_DISPATCH(unary_fn, sigmoid_stub);
+DECLARE_DISPATCH(unary_fn_with_scalar, logit_stub);
+DECLARE_DISPATCH(unary_fn, sign_stub);
+DECLARE_DISPATCH(unary_fn, signbit_stub);
+DECLARE_DISPATCH(unary_fn, sgn_stub);
+DECLARE_DISPATCH(unary_fn, sin_stub);
+DECLARE_DISPATCH(unary_fn, sinc_stub);
+DECLARE_DISPATCH(unary_fn, sinh_stub);
+DECLARE_DISPATCH(unary_fn, sqrt_stub);
+DECLARE_DISPATCH(unary_fn, tan_stub);
+DECLARE_DISPATCH(unary_fn, tanh_stub);
+DECLARE_DISPATCH(unary_fn, trigamma_stub);
+DECLARE_DISPATCH(unary_fn, trunc_stub);
+DECLARE_DISPATCH(unary_fn, lgamma_stub);
+DECLARE_DISPATCH(unary_fn, special_airy_ai_stub);
+DECLARE_DISPATCH(unary_fn, special_bessel_j0_stub);
+DECLARE_DISPATCH(unary_fn, special_bessel_j1_stub);
+DECLARE_DISPATCH(unary_fn, special_bessel_y0_stub);
+DECLARE_DISPATCH(unary_fn, special_bessel_y1_stub);
+DECLARE_DISPATCH(unary_fn, special_modified_bessel_i0_stub);
+DECLARE_DISPATCH(unary_fn, special_modified_bessel_i1_stub);
+DECLARE_DISPATCH(unary_fn, special_modified_bessel_k0_stub);
+DECLARE_DISPATCH(unary_fn, special_modified_bessel_k1_stub);
+DECLARE_DISPATCH(unary_fn, special_scaled_modified_bessel_k0_stub);
+DECLARE_DISPATCH(unary_fn, special_scaled_modified_bessel_k1_stub);
+DECLARE_DISPATCH(unary_fn, special_spherical_bessel_j0_stub);
+
+// NB: these are actually defined in Distribution
+DECLARE_DISPATCH(void(*)(const TensorBase&, const TensorBase&, c10::optional), bernoulli_tensor_stub);
+DECLARE_DISPATCH(void(*)(const TensorBase&, const double, c10::optional), bernoulli_scalar_stub);
+DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional), cauchy_stub);
+DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, c10::optional), exponential_stub);
+DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, c10::optional), geometric_stub);
+DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional), log_normal_stub);
+DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional), uniform_stub);
+DECLARE_DISPATCH(void(*)(const TensorBase&, const double, const double, c10::optional), normal_stub);
+DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const uint64_t, const int64_t, c10::optional), random_from_to_stub);
+DECLARE_DISPATCH(void(*)(TensorIteratorBase&, c10::optional), random_full_64_bits_range_stub);
+DECLARE_DISPATCH(void(*)(TensorIteratorBase&, c10::optional), random_stub);
+
+DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const int64_t, const double), kaiser_window_stub);
+DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const int64_t), polygamma_stub);
+DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const Scalar& a, const Scalar& b), clamp_stub);
+DECLARE_DISPATCH(
+    void (*)(Tensor&, const Tensor&, int64_t, c10::optional),
+    multinomial_with_replacement_stub);
+DECLARE_DISPATCH(
+    void (*)(
+        TensorIteratorBase&,
+        c10::optional,
+        c10::optional,
+        c10::optional),
+    nan_to_num_stub);
+DECLARE_DISPATCH(void (*)(TensorIteratorBase&, int64_t), round_decimals_stub);
+
+// Missing unary functions
+// digamma
+// lgamma
+// erfinv
+// clone
+// contiguous
+// zero
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/Unfold2d.h b/MLPY/Lib/site-packages/torch/include/ATen/native/Unfold2d.h
new file mode 100644
index 0000000000000000000000000000000000000000..f966d9d7c9776ab76c26da5a3be2ad98e13cf5f8
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/Unfold2d.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+namespace at::native {
+
+using unfold2d_fn = void (*)(
+    ScalarType dtype,
+    void *finput,
+    void *input,
+    int64_t kH,
+    int64_t kW,
+    int64_t dH,
+    int64_t dW,
+    int64_t padH,
+    int64_t padW,
+    int64_t n_input_plane,
+    int64_t input_height,
+    int64_t input_width,
+    int64_t output_height,
+    int64_t output_width,
+    bool is_channels_last
+);
+
+DECLARE_DISPATCH(unfold2d_fn, unfolded2d_copy_stub);
+DECLARE_DISPATCH(unfold2d_fn, unfolded2d_acc_stub);
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/Unfold3d.h b/MLPY/Lib/site-packages/torch/include/ATen/native/Unfold3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..eae526b7ec33a2ec1b34aeee808f78fc47931c82
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/Unfold3d.h
@@ -0,0 +1,49 @@
+#pragma once
+
+#include 
+
+namespace at::native {
+
+void Unfold3dCopyCPU(
+    ScalarType dtype,
+    const void *src,
+    int64_t C,
+    int64_t X_D,
+    int64_t X_H,
+    int64_t X_W,
+    int64_t Y_D,
+    int64_t Y_H,
+    int64_t Y_W,
+    int64_t kernel_d,
+    int64_t kernel_h,
+    int64_t kernel_w,
+    int64_t stride_d,
+    int64_t stride_h,
+    int64_t stride_w,
+    int64_t pad_d,
+    int64_t pad_h,
+    int64_t pad_w,
+    void* dst);
+
+void Unfold3dAccCPU(
+    ScalarType dtype,
+    const void *src,
+    int64_t C,
+    int64_t X_D,
+    int64_t X_H,
+    int64_t X_W,
+    int64_t Y_D,
+    int64_t Y_H,
+    int64_t Y_W,
+    int64_t kernel_d,
+    int64_t kernel_h,
+    int64_t kernel_w,
+    int64_t stride_d,
+    int64_t stride_h,
+    int64_t stride_w,
+    int64_t pad_d,
+    int64_t pad_h,
+    int64_t pad_w,
+    void *dst);
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/UnfoldBackward.h b/MLPY/Lib/site-packages/torch/include/ATen/native/UnfoldBackward.h
new file mode 100644
index 0000000000000000000000000000000000000000..f715d4dbf9db2bb23715b7c87e62ccdaf989015a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/UnfoldBackward.h
@@ -0,0 +1,112 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include 
+#else
+#include 
+#endif
+
+namespace at::native {
+
+using unfold_backward_fn = void (*)(
+  Tensor& grad_in,
+  const Tensor& grad,
+  int64_t dim,
+  int64_t size,
+  int64_t step
+);
+
+DECLARE_DISPATCH(unfold_backward_fn, unfold_backward_stub);
+
+namespace {
+
+// Note on naming: it is unconventional.
+// grad_in does not mean that it is a gradient w.r.t. the input;
+// grad_in/grad_out are just the input/output of the unfold_backward kernel.
+
+static C10_UNUSED TensorIterator _make_unfold_backward_iter_over_grad_out(
+  Tensor& grad_out,
+  const Tensor& grad_in,
+  int64_t dim,
+  int64_t size,
+  int64_t step
+) {
+  dim = maybe_wrap_dim(dim, grad_out.dim());
+  // last dim stores the folds
+
+  auto grad_out_dim_size = ensure_nonempty_size(grad_out, dim);
+  auto grad_in_dim_size = ensure_nonempty_size(grad_in, dim);
+  // dictates the number of elements to iterate over
+  // in dimension `dim`
+  auto iter_dim_size = std::min(
+    grad_out_dim_size,
+    (grad_in_dim_size - 1) * step + size
+  );
+
+  /* prepare grad_out for TensorIterator { */
+  auto grad_out_strides = ensure_nonempty_vec(grad_out.strides().vec());
+  auto grad_out_sizes = ensure_nonempty_vec(grad_out.sizes().vec());
+  grad_out_sizes[dim] = iter_dim_size;
+  auto grad_out_restrided = grad_out.as_strided(
+    grad_out_sizes, grad_out_strides
+  );
+  /* } */
+
+  /* prepare grad_in for TensorIterator { */
+  auto grad_in_strides = ensure_nonempty_vec(grad_in.strides().vec());
+  auto grad_in_sizes = ensure_nonempty_vec(grad_in.sizes().vec());
+
+  // set strides for dim to 0
+  // and size to 1 because
+  // this dimension is indexed inside the kernel
+  grad_in_strides[dim] = 0;
+  grad_in_sizes[dim] = 1;
+
+  grad_in_strides.pop_back();
+  grad_in_sizes.pop_back();
+
+  auto grad_in_restrided = grad_in.squeeze(-1).as_strided(
+    grad_in_sizes, grad_in_strides
+  );
+  /* } */
+
+  // During the TensorIterator iteration we have to know
+  // i_dim in grad_out[i_1,...,i_dim,...i_n],
+  // idx_dim stores this information
+  /* prepare idx_dim for TensorIterator { */
+  auto idx_dim = at::arange(
+    0, iter_dim_size, grad_in.options().dtype(at::kLong)
+  );
+
+  auto grad_out_dim = ensure_nonempty_dim(grad_out.dim());
+
+  auto idx_dim_strides = std::vector(grad_out_dim, 0);
+  auto idx_dim_sizes = std::vector(grad_out_dim, 1);
+
+  idx_dim_strides[dim] = 1;
+  idx_dim_sizes[dim] = iter_dim_size;
+
+  // idx_dim will be broadcast over the sizes determined by grad_out in the TensorIterator
+  auto idx_dim_restrided = idx_dim.as_strided(idx_dim_sizes, idx_dim_strides);
+  /* } */
+
+  auto iter = TensorIteratorConfig()
+    .set_check_mem_overlap(false)
+    .check_all_same_dtype(false)
+    .resize_outputs(false)
+    .add_owned_output(grad_out_restrided)
+    .add_owned_input(grad_in_restrided)
+    .add_owned_input(idx_dim_restrided)
+    .build();
+
+  return iter;
+}
+
+}
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/UpSample.h b/MLPY/Lib/site-packages/torch/include/ATen/native/UpSample.h
new file mode 100644
index 0000000000000000000000000000000000000000..72c4f1d72cdb847db5c7da5f11d3219317b4b187
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/UpSample.h
@@ -0,0 +1,506 @@
+#pragma once
+
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/**
+ * Note [compute_scales_value]
+ * Note [area_pixel_compute_scale]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * Interpolate with scale_factor can have different behaviors
+ * depending on the value of recompute_scale_factor:
+ *
+ * - With recompute_scale_factor = True (current default behavior):
+ * the scale_factor, when provided by the user, is used to calculate
+ * the output size. The input size and the computed output_size
+ * are then used to infer new values for the scales, which are
+ * used in the interpolation. Because floating-point math is not exact,
+ * these may differ from the user-supplied scales.
+ *
+ * - With recompute_scale_factor = False (which will be the default
+ * behavior starting 1.5.0):
+ * the behavior follows opencv logic, and the scales provided by
+ * the user are the ones used in the interpolation calculations.
+ *
+ * If the scales are not provided or if they are provided but
+ * recompute_scale_factor is set to True (default behavior), the scales
+ * are computed from the input and the output size;
+ *
+ *
+ * When the scales are inferred from the input and output sizes,
+ * we view each pixel as an area, idx + 0.5 as its center index.
+ * Here is an example formula in 1D case.
+ * if align_corners: the centers of the two corner pixel areas are preserved:
+ *     0.5 -> 0.5,
+ *     (input_size - 0.5) -> (output_size - 0.5)
+ *     scale = (input_size - 0.5 - 0.5) / (output_size - 0.5 - 0.5)
+ *     src_index + 0.5 - 0.5 = scale * (dst_index + 0.5 - 0.5)
+ * if not align_corners: the whole range is scaled accordingly
+ *     scale = input_size / output_size
+ *     src_idx + 0.5 = scale * (dst_index + 0.5)
+ */
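+
+// Illustration only: a worked 1D check of the two conventions above.
+//   align_corners, input_size = 5, output_size = 9:
+//     scale = (5 - 1) / (9 - 1) = 0.5, so the last output index 8 maps back to
+//     the last input index 0.5 * 8 == 4.
+//   !align_corners, input_size = 4, output_size = 8:
+//     scale = 4 / 8 = 0.5, so output index 0 maps to
+//     src = 0.5 * (0 + 0.5) - 0.5 == -0.25 (negative sources are clamped by
+//     the kernels).
+static_assert((5.0 - 1.0) / (9.0 - 1.0) * 8.0 == 4.0, "align_corners example");
+static_assert(4.0 / 8.0 * (0 + 0.5) - 0.5 == -0.25, "half-pixel example");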
+
+namespace at::native {
+
+namespace upsample {
+
+TORCH_API c10::SmallVector compute_output_size(
+    c10::IntArrayRef input_size,  // Full input tensor size.
+    at::OptionalIntArrayRef output_size,
+    c10::optional> scale_factors);
+
+inline c10::optional get_scale_value(c10::optional> scales, int idx) {
+  if (!scales) {
+    return c10::nullopt;
+  }
+  return scales->at(idx);
+}
+
+} // namespace upsample
+
+using scale_t = c10::optional;
+using upsampling_nearest1d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_w);
+using _upsampling_nearest_exact1d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_w);
+using upsampling_nearest2d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_h, scale_t scales_w);
+using _upsampling_nearest_exact2d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_h, scale_t scales_w);
+using upsampling_nearest3d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_d, scale_t scales_h, scale_t scales_w);
+using _upsampling_nearest_exact3d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_d, scale_t scales_h, scale_t scales_w);
+using upsampling_linear1d = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_w);
+using upsampling_bilinear2d = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_h, scale_t scales_w);
+using _upsampling_bilinear2d_aa = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_h, scale_t scales_w);
+using upsampling_trilinear3d = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_d, scale_t scales_h, scale_t scales_w);
+using upsampling_bicubic2d = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_h, scale_t scales_w);
+using _upsampling_bicubic2d_aa = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_h, scale_t scales_w);
+DECLARE_DISPATCH(upsampling_nearest1d, upsample_nearest1d_kernel);
+DECLARE_DISPATCH(_upsampling_nearest_exact1d, _upsample_nearest_exact1d_kernel);
+DECLARE_DISPATCH(upsampling_nearest2d, upsample_nearest2d_kernel);
+DECLARE_DISPATCH(_upsampling_nearest_exact2d, _upsample_nearest_exact2d_kernel);
+DECLARE_DISPATCH(upsampling_nearest3d, upsample_nearest3d_kernel);
+DECLARE_DISPATCH(_upsampling_nearest_exact3d, _upsample_nearest_exact3d_kernel);
+DECLARE_DISPATCH(upsampling_nearest1d, upsample_nearest1d_backward_kernel);
+DECLARE_DISPATCH(_upsampling_nearest_exact1d, _upsample_nearest_exact1d_backward_kernel);
+DECLARE_DISPATCH(upsampling_nearest2d, upsample_nearest2d_backward_kernel);
+DECLARE_DISPATCH(_upsampling_nearest_exact2d, _upsample_nearest_exact2d_backward_kernel);
+DECLARE_DISPATCH(upsampling_nearest3d, upsample_nearest3d_backward_kernel);
+DECLARE_DISPATCH(_upsampling_nearest_exact3d, _upsample_nearest_exact3d_backward_kernel);
+DECLARE_DISPATCH(upsampling_linear1d, upsample_linear1d_kernel);
+DECLARE_DISPATCH(upsampling_bilinear2d, upsample_bilinear2d_kernel);
+DECLARE_DISPATCH(_upsampling_bilinear2d_aa, _upsample_bilinear2d_aa_kernel);
+DECLARE_DISPATCH(upsampling_trilinear3d, upsample_trilinear3d_kernel);
+DECLARE_DISPATCH(upsampling_linear1d, upsample_linear1d_backward_kernel);
+DECLARE_DISPATCH(upsampling_bilinear2d, upsample_bilinear2d_backward_kernel);
+DECLARE_DISPATCH(_upsampling_bilinear2d_aa, _upsample_bilinear2d_aa_backward_kernel);
+DECLARE_DISPATCH(upsampling_trilinear3d, upsample_trilinear3d_backward_kernel);
+DECLARE_DISPATCH(upsampling_bicubic2d, upsample_bicubic2d_kernel);
+DECLARE_DISPATCH(_upsampling_bicubic2d_aa, _upsample_bicubic2d_aa_kernel);
+DECLARE_DISPATCH(_upsampling_bicubic2d_aa, _upsample_bicubic2d_aa_backward_kernel);
+
+static C10_UNUSED std::array upsample_1d_common_check(IntArrayRef input_size, IntArrayRef output_size) {
+  TORCH_CHECK(
+      output_size.size() == 1,
+      "It is expected output_size equals to 1, but got size ",
+      output_size.size());
+
+  TORCH_CHECK(
+      input_size.size() == 3,
+      "It is expected input_size equals to 3, but got size ",
+      input_size.size());
+
+  int64_t output_width = output_size[0];
+
+  int64_t nbatch = input_size[0];
+  int64_t channels = input_size[1];
+  int64_t input_width = input_size[2];
+
+  TORCH_CHECK(
+      input_width > 0 && output_width > 0,
+      "Input and output sizes should be greater than 0, but got input (W: ",
+      input_width,
+      ") and output (W: ",
+      output_width,
+      ")");
+
+  return {nbatch, channels, output_width};
+}
+
+static C10_UNUSED std::array upsample_2d_common_check(IntArrayRef input_size, IntArrayRef output_size) {
+  TORCH_CHECK(
+      output_size.size() == 2,
+      "It is expected output_size equals to 2, but got size ",
+      output_size.size());
+
+  TORCH_CHECK(
+      input_size.size() == 4,
+      "It is expected input_size equals to 4, but got size ",
+      input_size.size());
+
+  int64_t output_height = output_size[0];
+  int64_t output_width = output_size[1];
+
+  int64_t nbatch = input_size[0];
+  int64_t channels = input_size[1];
+  int64_t input_height = input_size[2];
+  int64_t input_width = input_size[3];
+
+  TORCH_CHECK(
+      input_height > 0 && input_width > 0 && output_height > 0 &&
+          output_width > 0,
+      "Input and output sizes should be greater than 0,"
+      " but got input (H: ",
+      input_height,
+      ", W: ",
+      input_width,
+      ") output (H: ",
+      output_height,
+      ", W: ",
+      output_width,
+      ")");
+
+  return {nbatch, channels, output_height, output_width};
+}
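+
+// Example (illustrative): for a 4-D input of size {2, 3, 8, 8} (N, C, H, W) and
+// output_size = {16, 16}, upsample_2d_common_check returns {2, 3, 16, 16};
+// mismatched ranks or non-positive spatial sizes trip the TORCH_CHECKs above.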
+
+static C10_UNUSED
+std::array upsample_3d_common_check(IntArrayRef input_size, IntArrayRef output_size) {
+  TORCH_CHECK(
+      output_size.size() == 3,
+      "It is expected output_size equals to 3, but got size ",
+      output_size.size());
+
+  TORCH_CHECK(
+      input_size.size() == 5,
+      "It is expected input_size equals to 5, but got size ",
+      input_size.size());
+
+  int64_t output_depth = output_size[0];
+  int64_t output_height = output_size[1];
+  int64_t output_width = output_size[2];
+
+  int64_t nbatch = input_size[0];
+  int64_t channels = input_size[1];
+  int64_t input_depth = input_size[2];
+  int64_t input_height = input_size[3];
+  int64_t input_width = input_size[4];
+
+  TORCH_CHECK(
+      input_depth > 0 && input_height > 0 && input_width > 0 &&
+          output_depth > 0 && output_height > 0 && output_width > 0,
+      "Input and output sizes should be greater than 0, but got input (D: ",
+      input_depth,
+      ", H: ",
+      input_height,
+      ", W: ",
+      input_width,
+      ") output (D: ",
+      output_depth,
+      ", H: ",
+      output_height,
+      ", W: ",
+      output_width,
+      ")");
+
+
+  return {nbatch, channels, output_depth, output_height, output_width};
+}
+
+static inline void upsample_2d_shape_check(
+    const Tensor& input,
+    const Tensor& grad_output,
+    int64_t nbatch,
+    int64_t nchannels,
+    int64_t input_height,
+    int64_t input_width,
+    int64_t output_height,
+    int64_t output_width) {
+  TORCH_CHECK(
+      input_height > 0 && input_width > 0 && output_height > 0 &&
+          output_width > 0,
+      "Input and output sizes should be greater than 0,"
+      " but got input (H: ",
+      input_height,
+      ", W: ",
+      input_width,
+      ") output (H: ",
+      output_height,
+      ", W: ",
+      output_width,
+      ")");
+
+  if (input.defined()) {
+    // Allow for empty batch size but not other dimensions
+    TORCH_CHECK(
+                (input.numel() != 0 ||
+                 (input.size(1) != 0 && input.size(2) != 0 && input.size(3) != 0)
+                 ) &&
+                input.dim() == 4,
+                "Non-empty 4D data tensor expected but got a tensor with sizes ",
+                input.sizes());
+  } else if (grad_output.defined()) {
+    check_dim_size(grad_output, 4, 0, nbatch);
+    check_dim_size(grad_output, 4, 1, nchannels);
+    check_dim_size(grad_output, 4, 2, output_height);
+    check_dim_size(grad_output, 4, 3, output_width);
+  }
+}
+
+template 
+static inline scalar_t compute_scales_value(
+    const c10::optional scale,
+    int64_t input_size,
+    int64_t output_size) {
+      // see Note [compute_scales_value]
+      // FIXME: remove magic > 0 after we ensure no models were serialized with -1 defaults.
+      return (scale.has_value() && scale.value() > 0.)
+          ? static_cast(1.0 / scale.value())
+          : (static_cast(input_size) / output_size);
+}
+
+template 
+static inline scalar_t area_pixel_compute_scale(
+    int64_t input_size,
+    int64_t output_size,
+    bool align_corners,
+    const c10::optional scale) {
+  // see Note [area_pixel_compute_scale]
+  if(align_corners) {
+    if(output_size > 1) {
+      return static_cast(input_size - 1) / (output_size - 1);
+    } else {
+      return static_cast(0);
+    }
+  } else {
+    return compute_scales_value(scale, input_size, output_size);
+  }
+}
+
+template 
+static inline scalar_t area_pixel_compute_source_index(
+    scalar_t scale,
+    int64_t dst_index,
+    bool align_corners,
+    bool cubic) {
+  if (align_corners) {
+    return scale * dst_index;
+  } else {
+    scalar_t src_idx = scale * (dst_index + static_cast(0.5)) -
+        static_cast(0.5);
+    // [Note] Follow OpenCV resize logic:
+    // We allow negative src_idx here and later will use
+    //   dx = src_idx - floorf(src_idx)
+    // to compute the "distance"(which affects weights).
+    // For linear modes, weight distribution doesn't matter
+    // for negative indices as they use 2 pixels to interpolate.
+    // For example, [-1, 0], they both use pixel 0 value so it
+    // doesn't affect if we bound the src_idx to 0 or not.
+    // TODO: our linear mode impls should also be updated to use unbound
+    // indices; once they are, this cubic flag can be removed.
+    // This matters in cubic mode, as we might need [-1, 0, 1, 2]
+    // to interpolate and the weights can be affected.
+    return (!cubic && src_idx < static_cast(0)) ? scalar_t(0)
+                                                          : src_idx;
+  }
+}
+
+static inline int64_t nearest_neighbor_compute_source_index(
+    const float scale,
+    int64_t dst_index,
+    int64_t input_size) {
+  // Index computation matching OpenCV INTER_NEAREST
+  // which is buggy and kept for BC
+  const int64_t src_index =
+      std::min(static_cast(floorf(dst_index * scale)), input_size - 1);
+  return src_index;
+}
+
+static inline int64_t nearest_neighbor_exact_compute_source_index(
+    const float scale,
+    int64_t dst_index,
+    int64_t input_size) {
+  // index_f32 = (output_index + 0.5) * scale - 0.5
+  // input_index = round(index_f32)
+  // Same as Pillow and Scikit-Image/Scipy ndi.zoom
+  const int64_t src_index =
+      std::min(static_cast(floorf((dst_index + 0.5) * scale)), input_size - 1);
+  return src_index;
+}
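+
+// Example (illustrative): downscaling input_size = 4 to output_size = 2 gives
+// scale = 4 / 2 = 2.0 (see compute_scales_value). The legacy OpenCV-style index
+// floorf(dst_index * scale) maps output {0, 1} to input {0, 2}, while the
+// "exact" variant floorf((dst_index + 0.5) * scale) maps output {0, 1} to
+// input {1, 3}, matching Pillow and scipy.ndimage.zoom.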
+
+static inline int64_t nearest_idx(
+    int64_t output_index,
+    int64_t input_size,
+    int64_t output_size,
+    c10::optional scales) {
+  // This method specifically handles the cases output_size == input_size and
+  // output_size == 2 * input_size, special cases we would like to get rid of.
+  // We keep this method for BC and consider it deprecated;
+  // see nearest_exact_idx as the replacement.
+  if (output_size == input_size) {
+    // scale_factor = 1, simply copy
+    return output_index;
+  } else if (output_size == 2 * input_size) {
+    // scale_factor = 2, shift input index
+    return output_index >> 1;
+  } else {
+    float scale = compute_scales_value(scales, input_size, output_size);
+    return nearest_neighbor_compute_source_index(scale, output_index, input_size);
+  }
+}
+
+static inline int64_t nearest_exact_idx(
+    int64_t output_index,
+    int64_t input_size,
+    int64_t output_size,
+    c10::optional scales) {
+  float scale = compute_scales_value(scales, input_size, output_size);
+  return nearest_neighbor_exact_compute_source_index(scale, output_index, input_size);
+}
+
+// Define a typedef to dispatch to nearest_idx or nearest_exact_idx
+typedef int64_t (*nearest_idx_fn_t)(int64_t, int64_t, int64_t, c10::optional);
+
+template 
+static scalar_t upsample_get_value_bounded(
+    scalar_t* data,
+    int64_t width,
+    int64_t height,
+    int64_t x,
+    int64_t y) {
+  int64_t access_x = std::max(std::min(x, width - 1), static_cast(0));
+  int64_t access_y = std::max(std::min(y, height - 1), static_cast(0));
+  return data[access_y * width + access_x];
+}
+
+template 
+static void upsample_increment_value_bounded(
+    scalar_t* data,
+    int64_t width,
+    int64_t height,
+    int64_t x,
+    int64_t y,
+    scalar_t value) {
+  int64_t access_x = std::max(std::min(x, width - 1), static_cast(0));
+  int64_t access_y = std::max(std::min(y, height - 1), static_cast(0));
+  data[access_y * width + access_x] += value;
+}
+
+// Based on
+// https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
+template 
+static inline scalar_t cubic_convolution1(scalar_t x, scalar_t A) {
+  return ((A + 2) * x - (A + 3)) * x * x + 1;
+}
+
+template 
+static inline scalar_t cubic_convolution2(scalar_t x, scalar_t A) {
+  return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
+}
+
+template 
+static inline void get_cubic_upsample_coefficients(
+    scalar_t coeffs[4],
+    scalar_t t) {
+  scalar_t A = -0.75;
+
+  scalar_t x1 = t;
+  coeffs[0] = cubic_convolution2(x1 + 1.0, A);
+  coeffs[1] = cubic_convolution1(x1, A);
+
+  // opposite coefficients
+  scalar_t x2 = 1.0 - t;
+  coeffs[2] = cubic_convolution1(x2, A);
+  coeffs[3] = cubic_convolution2(x2 + 1.0, A);
+}
+
+template 
+static inline scalar_t cubic_interp1d(
+    scalar_t x0,
+    scalar_t x1,
+    scalar_t x2,
+    scalar_t x3,
+    scalar_t t) {
+  scalar_t coeffs[4];
+  get_cubic_upsample_coefficients(coeffs, t);
+
+  return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3];
+}
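+
+// Example (illustrative): with A = -0.75 and t = 0.5 the four weights computed by
+// get_cubic_upsample_coefficients are {-0.09375, 0.59375, 0.59375, -0.09375};
+// they are symmetric and sum to 1, so cubic_interp1d(x0, x1, x2, x3, 0.5)
+// interpolates halfway between x1 and x2 with a small negative contribution
+// from the outer neighbors x0 and x3.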
+
+// when `real_input_index` becomes larger than the range the floating point
+// type can accurately represent, the type casting to `int64_t` might exceed
+// `input_size`, causing overflow. So we guard it with `std::min` below.
+template
+static inline void guard_index_and_lambda(const opmath_t& real_input_index, const int64_t& input_size, int64_t& input_index, scalar_t& lambda) {
+  input_index = std::min(static_cast(floorf(real_input_index)), input_size - 1);
+  lambda = std::min(
+      std::max(real_input_index - input_index, static_cast(0)),
+      static_cast(1)
+    );
+}
+
+template
+static inline void compute_source_index_and_lambda(
+    int64_t& input_index0,
+    int64_t& input_index1,
+    scalar_t& lambda0,
+    scalar_t& lambda1,
+    opmath_t ratio,
+    int64_t output_index,
+    int64_t input_size,
+    int64_t output_size,
+    bool align_corners) {
+  if (output_size == input_size) {
+    // scale_factor = 1, simply copy
+    input_index0 = output_index;
+    input_index1 = output_index;
+    lambda0 = static_cast(1);
+    lambda1 = static_cast(0);
+  } else {
+    const auto real_input_index =
+        area_pixel_compute_source_index(
+            ratio, output_index, align_corners, /*cubic=*/false);
+    guard_index_and_lambda(real_input_index, input_size, input_index0, lambda1);
+    int64_t offset = (input_index0 < input_size - 1) ? 1 : 0;
+    input_index1 = input_index0 + offset;
+    lambda0 = static_cast(1.) - lambda1;
+  }
+}
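+
+// Example (illustrative): upsampling input_size = 3 to output_size = 5 with
+// align_corners = false gives ratio = 3 / 5 = 0.6. For output_index = 1,
+// real_input_index = 0.6 * 1.5 - 0.5 = 0.4, so input_index0 = 0, input_index1 = 1,
+// lambda1 = 0.4 and lambda0 = 0.6, i.e. output[1] = 0.6 * input[0] + 0.4 * input[1].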
+
+// It will not be used by data types other than BFloat16 and Half.
+template  || !std::is_same::value, int> = 0>
+void inline apply_grad_input(scalar_in* buffer_ptr, scalar_out* gin, int64_t size) {
+  TORCH_CHECK((is_reduced_floating_point_v),
+              "Upsample backward only support BFloat16 and Half in the lower precision data types on CPU.")
+  TORCH_CHECK((std::is_same::value),
+              "Upsample backward should use float as acc buffer for BFloat16 and Half grad input on CPU.")
+  return;
+}
+
+template  && std::is_same::value, int> = 0>
+void inline apply_grad_input(scalar_in* buffer_ptr, scalar_out* gin, int64_t size) {
+  using bVec = Vectorized;
+  using fVec = Vectorized;
+  int64_t d = 0;
+  for (; d < size - (size % bVec::size()); d += bVec::size()) {
+    bVec gin_bvec = bVec::loadu(gin + d);
+    fVec gin_fvec0, gin_fvec1;
+    std::tie(gin_fvec0, gin_fvec1) = convert_to_float(gin_bvec);
+    gin_fvec0 += fVec::loadu(buffer_ptr + d);
+    gin_fvec1 += fVec::loadu(buffer_ptr + d + fVec::size());
+    fVec(0).store(buffer_ptr + d);
+    fVec(0).store(buffer_ptr + d + fVec::size());
+    convert_from_float(gin_fvec0, gin_fvec1).store(gin + d);
+  }
+  for (; d < size; d++) {
+    gin[d] += buffer_ptr[d];
+    buffer_ptr[d] = 0;
+  }
+}
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/batch_norm.h b/MLPY/Lib/site-packages/torch/include/ATen/native/batch_norm.h
new file mode 100644
index 0000000000000000000000000000000000000000..d38158cfe4b6e2c027ba4bd2daa44f2501881522
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/batch_norm.h
@@ -0,0 +1,33 @@
+#pragma once
+
+#include 
+#include 
+
+namespace at::native {
+
+using batch_norm_fn = void (*)(Tensor&, const Tensor&, const Tensor&,
+    const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, bool, double);
+using batch_norm_collect_stats_fn = void (*)(Tensor&, Tensor&, const Tensor&);
+using batch_norm_backward_fn = void(*)(Tensor&, Tensor&, Tensor&, const Tensor&,
+        const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, bool, double);
+
+DECLARE_DISPATCH(batch_norm_fn, batch_norm_cpu_stub);
+DECLARE_DISPATCH(batch_norm_collect_stats_fn, batch_norm_cpu_collect_stats_stub);
+DECLARE_DISPATCH(batch_norm_backward_fn, batch_norm_cpu_backward_stub);
+
+// TensorAccessor when it is defined to work around undefined...
+template 
+static TensorAccessor conditional_accessor_1d(const Tensor& t) {
+  if (! t.defined()) {
+    return TensorAccessor(nullptr, nullptr, nullptr);
+  }
+  return t.accessor();
+}
+
+template 
+static scalar_t* conditional_data_ptr(const Tensor& t) {
+  return t.defined() ? t.contiguous().data_ptr()
+                     : nullptr;
+}
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/AtomicAddFloat.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/AtomicAddFloat.h
new file mode 100644
index 0000000000000000000000000000000000000000..5f2fe7f1a32f50cd35585d4f1060cd76386beec5
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/AtomicAddFloat.h
@@ -0,0 +1,37 @@
+#ifndef ATOMIC_ADD_FLOAT
+#define ATOMIC_ADD_FLOAT
+
+#if (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))
+#include 
+#else
+#define _mm_pause()
+#endif
+
+#include 
+
+static inline void cpu_atomic_add_float(float* dst, float fvalue)
+{
+  typedef union {
+    unsigned intV;
+    float floatV;
+  } uf32_t;
+
+  uf32_t new_value, old_value;
+  std::atomic* dst_intV = (std::atomic*)(dst);
+
+  old_value.floatV = *dst;
+  new_value.floatV = old_value.floatV + fvalue;
+
+  unsigned* old_intV = (unsigned*)(&old_value.intV);
+  while (!std::atomic_compare_exchange_strong(dst_intV, old_intV, new_value.intV)) {
+#ifdef __aarch64__
+    __asm__ __volatile__("yield;" : : : "memory");
+#else
+    _mm_pause();
+#endif
+    old_value.floatV = *dst;
+    new_value.floatV = old_value.floatV + fvalue;
+  }
+}
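+
+// Usage sketch (illustrative, assuming at::parallel_for and a `values` array are
+// available at the call site): the CAS loop above makes concurrent accumulation
+// into a single float safe, e.g.
+//
+//   float acc = 0.f;
+//   at::parallel_for(0, n, 1, [&](int64_t begin, int64_t end) {
+//     for (int64_t i = begin; i < end; ++i) {
+//       cpu_atomic_add_float(&acc, values[i]);
+//     }
+//   });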
+
+#endif
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/CatKernel.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/CatKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..79bf7c06af6991d656114fd0bd8678c544e96a00
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/CatKernel.h
@@ -0,0 +1,12 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+namespace at { namespace native {
+
+using cat_serial_fn = void(*)(const Tensor &, const MaterializedITensorListRef&, int64_t);
+DECLARE_DISPATCH(cat_serial_fn, cat_serial_stub);
+
+}}  // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/ChannelShuffleKernel.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/ChannelShuffleKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..a0b62ef3226e0a129990f9b107a15e7e240489ea
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/ChannelShuffleKernel.h
@@ -0,0 +1,14 @@
+#pragma once
+#include 
+#include 
+
+namespace at {
+class TensorBase;
+}
+
+namespace at { namespace native {
+
+using channel_shuffle_fn = void(*)(TensorBase&, const TensorBase&, int64_t);
+DECLARE_DISPATCH(channel_shuffle_fn, channel_shuffle_kernel);
+
+}} // at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/CopyKernel.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/CopyKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..da04349fe44724e4e24d1e690160f2c5a3cf2fa5
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/CopyKernel.h
@@ -0,0 +1,12 @@
+#pragma once
+
+namespace at {
+struct TensorIteratorBase;
+
+namespace native {
+inline namespace CPU_CAPABILITY {
+
+void direct_copy_kernel(TensorIteratorBase &iter);
+void copy_kernel(TensorIterator& iter, bool /*non_blocking*/);
+
+}}}  // namespace at::native::CPU_CAPABILITY
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/DepthwiseConvKernel.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/DepthwiseConvKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..fbaff919a620b2c3c78603f03744016ccfdd4f10
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/DepthwiseConvKernel.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include 
+#include 
+
+/*
+  Depthwise 3x3 Winograd convolution operator
+*/
+
+namespace at {
+class Tensor;
+
+namespace native {
+
+using convolution_depthwise3x3_winograd_fn =
+    Tensor (*)(const Tensor &, const Tensor &, const Tensor &, IntArrayRef, IntArrayRef, int64_t);
+
+DECLARE_DISPATCH(convolution_depthwise3x3_winograd_fn, convolution_depthwise3x3_winograd_stub);
+
+}  // namespace native
+}  // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/DistributionTemplates.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/DistributionTemplates.h
new file mode 100644
index 0000000000000000000000000000000000000000..5bc026ae278a95436698bf600bb3e8bb61327dd2
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/DistributionTemplates.h
@@ -0,0 +1,369 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#ifdef CPU_CAPABILITY_AVX2
+#include 
+#include 
+#endif
+
+
+namespace at {
+namespace native {
+namespace templates {
+namespace cpu {
+namespace {
+
+// ==================================================== Random ========================================================
+
+template
+void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, RNG generator) {
+  AT_DISPATCH_V2(iter.dtype(), "random_from_to_kernel_cpu", AT_WRAP([&] {
+    std::lock_guard lock(generator->mutex_);
+    cpu_serial_kernel(iter, [range, base, generator]() -> scalar_t {
+      uniform_int_from_to_distribution random(range, base);
+      return random(generator);
+    });
+  }), kBool, kHalf, kBFloat16, AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES));
+}
+
+// This is a special kernel that handles a single specific case:
+// from(inclusive) = std::numeric_limits::lowest()
+// to(exclusive) = None (= std::numeric_limits::max() + 1)
+template
+void random_full_64_bits_range_kernel(TensorIteratorBase& iter, RNG generator) {
+  AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::BFloat16, iter.dtype(), "random_full_64_bits_range_kernel_cpu", [&] {
+    if constexpr (std::is_same::value ||
+        std::is_same::value ||
+        std::is_same::value ||
+        std::is_same::value) {
+      std::lock_guard lock(generator->mutex_);
+      cpu_serial_kernel(iter, [generator]() -> scalar_t {
+        uniform_int_full_range_distribution random;
+        return random(generator);
+      });
+    } else {
+      TORCH_CHECK(false, "random_full_64_bits_range_kernel_cpu handles only int64, double, float and bfloat16");
+    }
+  });
+}
+
+template
+struct RandomFromToKernel {
+  void operator()(TensorIteratorBase& iter, uint64_t range, int64_t base, c10::optional gen) {
+    random_from_to_kernel(iter, range, base, check_generator(gen));
+  }
+  void operator()(TensorIteratorBase& iter, c10::optional gen) {
+    random_full_64_bits_range_kernel(iter, check_generator(gen));
+  }
+};
+
+template
+void random_kernel(TensorIteratorBase& iter, RNG generator) {
+  std::lock_guard lock(generator->mutex_);
+  AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, iter.dtype(), "random_kernel_cpu", [&] {
+    cpu_serial_kernel(iter, [generator]() -> scalar_t {
+      uniform_int_distribution random;
+      return random(generator);
+    });
+  });
+}
+
+template
+struct RandomKernel {
+  void operator()(TensorIteratorBase& iter, c10::optional gen) {
+    random_kernel(iter, check_generator(gen));
+  }
+};
+
+// ==================================================== Normal ========================================================
+
+#ifdef CPU_CAPABILITY_AVX2
+static void normal_fill_16_AVX2(float *data,
+                         const __m256* two_pi,
+                         const __m256* one,
+                         const __m256* minus_two,
+                         const __m256* mean,
+                         const __m256* std_v) {
+  const __m256 u1 = _mm256_sub_ps(*one, _mm256_loadu_ps(data));
+  const __m256 u2 = _mm256_loadu_ps(data + 8);
+  // sincos256_ps and log256_ps are from avx_mathfun.h
+  const __m256 radius = _mm256_sqrt_ps(_mm256_mul_ps(*minus_two, log256_ps(u1)));
+  const __m256 theta = _mm256_mul_ps(*two_pi, u2);
+  __m256 sintheta, costheta;
+  sincos256_ps(theta, &sintheta, &costheta);
+  const __m256 n1 = _mm256_mul_ps(radius, costheta);
+  const __m256 n2 = _mm256_mul_ps(radius, sintheta);
+  _mm256_storeu_ps(data, _mm256_fmadd_ps(n1, *std_v, *mean));
+  _mm256_storeu_ps(data + 8, _mm256_fmadd_ps(n2, *std_v, *mean));
+}
+
+template
+void normal_fill_AVX2(const TensorBase &self, const float mean, const float std, RNG generator) {
+  float *data = self.data_ptr();
+  auto size = self.numel();
+  std::lock_guard lock(generator->mutex_);
+  for (const auto i : c10::irange(size)) {
+    at::uniform_real_distribution uniform(0, 1);
+    data[i] = uniform(generator);
+  }
+  const __m256 two_pi = _mm256_set1_ps(2.0f * c10::pi);
+  const __m256 one = _mm256_set1_ps(1.0f);
+  const __m256 minus_two = _mm256_set1_ps(-2.0f);
+  const __m256 mean_v = _mm256_set1_ps(mean);
+  const __m256 std_v = _mm256_set1_ps(std);
+
+  for (int64_t i = 0; i < size - 15; i += 16) {
+    normal_fill_16_AVX2(data + i, &two_pi, &one, &minus_two, &mean_v, &std_v);
+  }
+
+  if (size % 16 != 0) {
+    // Recompute the last 16 values.
+    data = data + size - 16;
+    for (const auto i : c10::irange(16)) {
+      at::uniform_real_distribution uniform(0, 1);
+      data[i] = uniform(generator);
+    }
+    normal_fill_16_AVX2(data, &two_pi, &one, &minus_two, &mean_v, &std_v);
+  }
+}
+#endif
+
+template 
+static void normal_fill_16(scalar_t *data, const scalar_t mean, const scalar_t std) {
+  for (const auto j : c10::irange(8)) {
+    const scalar_t u1 = 1 - data[j]; // [0, 1) -> (0, 1] for log.
+    const scalar_t u2 = data[j + 8];
+    const scalar_t radius = std::sqrt(-2 * std::log(u1));
+    const scalar_t theta = 2.0f * c10::pi * u2;
+    data[j] = radius * std::cos(theta) * std + mean;
+    data[j + 8] = radius * std::sin(theta) * std + mean;
+  }
+}
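+
+// The loop above is a Box-Muller transform: for a pair of uniforms (u1, u2) in
+// (0, 1], radius = sqrt(-2 * ln(u1)) and theta = 2 * pi * u2 yield two independent
+// standard normal samples radius * cos(theta) and radius * sin(theta), which are
+// then affinely mapped to N(mean, std^2).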
+
+template 
+void normal_fill(const TensorBase &self, const scalar_t mean, const scalar_t std, RNG generator) {
+  scalar_t *data = self.data_ptr();
+  auto size = self.numel();
+  std::lock_guard lock(generator->mutex_);
+  for (const auto i : c10::irange(size)) {
+    at::uniform_real_distribution uniform(0, 1);
+    data[i] = uniform(generator);
+  }
+
+  for (int64_t i = 0; i < size - 15; i += 16) {
+    normal_fill_16(data + i, mean, std);
+  }
+  if (size % 16 != 0) {
+    // Recompute the last 16 values.
+    data = data + size - 16;
+    for (const auto i : c10::irange(16)) {
+      at::uniform_real_distribution uniform(0, 1);
+      data[i] = uniform(generator);
+    }
+    normal_fill_16(data, mean, std);
+  }
+}
+
+template
+void normal_kernel(const TensorBase &self, double mean, double std, RNG generator) {
+  auto size = self.numel();
+  if (self.scalar_type() == ScalarType::Float && size >= 16 && self.is_contiguous()) {
+#ifdef CPU_CAPABILITY_AVX2
+    normal_fill_AVX2(self, static_cast(mean), static_cast(std), generator);
+#else
+    normal_fill(self, static_cast(mean), static_cast(std), generator);
+#endif
+  } else {
+    AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, self.scalar_type(), "normal_kernel_cpu", [&] {
+      if (size >= 16 && self.is_contiguous()) {
+        normal_fill(self, static_cast(mean), static_cast(std), generator);
+      } else {
+        auto iter = TensorIterator::borrowing_nullary_op(self);
+        std::lock_guard lock(generator->mutex_);
+        cpu_serial_kernel(iter, [mean, std, generator]() -> scalar_t {
+          at::normal_distribution normal(mean, std);
+          return static_cast(normal(generator));
+        });
+      }
+    });
+  }
+}
+
+template
+struct NormalKernel {
+  void operator()(Tensor& self, double mean, double std, c10::optional gen) {
+    normal_kernel(self, mean, std, check_generator(gen));
+  }
+};
+
+// ==================================================== Uniform =======================================================
+
+template
+void uniform_kernel(TensorIteratorBase& iter, double from_, double to_, RNG generator) {
+  AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "uniform_kernel_cpu", [&]() {
+    std::lock_guard lock(generator->mutex_);
+    auto from = static_cast(from_);
+    auto to = static_cast(to_);
+    at::uniform_real_distribution uniform(from, to);
+    cpu_serial_kernel(iter, [&uniform, generator]() -> scalar_t {
+      return static_cast(uniform(generator));
+    });
+  });
+}
+
+template
+struct UniformKernel {
+  void operator()(TensorIteratorBase& iter, double from, double to, c10::optional gen) {
+    uniform_kernel(iter, from, to, check_generator(gen));
+  }
+};
+
+// ==================================================== Cauchy ========================================================
+
+template
+void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, RNG generator) {
+  AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "cauchy_cpu", [&]() {
+    std::lock_guard lock(generator->mutex_);
+    at::cauchy_distribution cauchy(median, sigma);
+    cpu_serial_kernel(iter, [&cauchy, generator]() -> scalar_t {
+      return static_cast(cauchy(generator));
+    });
+  });
+}
+
+template
+struct CauchyKernel {
+  void operator()(TensorIteratorBase& iter, double median, double sigma, c10::optional gen) {
+    cauchy_kernel(iter, median, sigma, check_generator(gen));
+  }
+};
+
+// ================================================== LogNormal =======================================================
+
+template
+void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, RNG generator) {
+  AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "log_normal_cpu", [&]() {
+    std::lock_guard lock(generator->mutex_);
+    at::lognormal_distribution logNormal(mean, std);
+    cpu_serial_kernel(iter, [&logNormal, generator]() -> scalar_t {
+      return static_cast(logNormal(generator));
+    });
+  });
+}
+
+template
+struct LogNormalKernel {
+  void operator()(TensorIteratorBase& iter, double mean, double std, c10::optional gen) {
+    log_normal_kernel(iter, mean, std, check_generator(gen));
+  }
+};
+
+// =================================================== Geometric ======================================================
+
+template
+void geometric_kernel(TensorIteratorBase& iter, double p, RNG generator) {
+  AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "geometric_cpu", [&]() {
+    std::lock_guard lock(generator->mutex_);
+    at::geometric_distribution geometric(p);
+    cpu_serial_kernel(iter, [&geometric, generator]() -> scalar_t {
+      return static_cast(geometric(generator));
+    });
+  });
+}
+
+template
+struct GeometricKernel {
+  void operator()(TensorIteratorBase& iter, double p, c10::optional gen) {
+    geometric_kernel(iter, p, check_generator(gen));
+  }
+};
+
+// ================================================== Exponential =====================================================
+
+template
+void exponential_kernel(TensorIteratorBase& iter, double lambda, RNG generator) {
+  TORCH_CHECK(isFloatingType(iter.dtype()), "Exponential distribution is a continuous probability distribution. dtype must be a floating point but you specified ", iter.dtype());
+  AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "exponential_cpu", [&]() {
+    std::lock_guard lock(generator->mutex_);
+    at::exponential_distribution exponential(lambda);
+    cpu_serial_kernel(iter, [&exponential, generator]() -> scalar_t {
+      return static_cast(exponential(generator));
+    });
+  });
+}
+
+template
+struct ExponentialKernel {
+  void operator()(TensorIteratorBase& iter, double lambda, c10::optional gen) {
+    exponential_kernel(iter, lambda, check_generator(gen));
+  }
+};
+
+// ================================================== Bernoulli =======================================================
+
+template
+void bernoulli_kernel(const TensorBase &self, const TensorBase &p_, RNG generator) {
+  AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Bool, at::ScalarType::BFloat16, at::ScalarType::Half,
+  self.scalar_type(), "bernoulli_tensor_cpu_self_", [&] {
+    // See Note [Acquire lock when using random generators]
+    std::lock_guard lock(generator->mutex_);
+    using self_t = scalar_t;
+    auto p_cpu = p_.to(kCPU);
+    auto p = expand_inplace(self, p_cpu);
+    auto iter = TensorIteratorConfig()
+        .add_output(self)
+        .add_input(*p)
+        .check_all_same_dtype(false)
+        .build();
+    if (p->scalar_type() == kDouble) {
+      cpu_serial_kernel(iter, [&](const double p_val) -> self_t {
+        at::bernoulli_distribution bernoulli(p_val);
+        return static_cast(bernoulli(generator));
+      });
+    } else {
+      AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::BFloat16, at::ScalarType::Half,
+      p->scalar_type(), "bernoulli_tensor_cpu_p_", [&] {
+        using p_t = scalar_t;
+        cpu_serial_kernel(iter, [&](const p_t p_val) -> self_t {
+          at::bernoulli_distribution bernoulli(p_val);
+          return static_cast(bernoulli(generator));
+        });
+      });
+    }
+  });
+}
+
+template
+void bernoulli_kernel(const TensorBase &self, double p, RNG generator) {
+  AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Bool, at::ScalarType::BFloat16, at::ScalarType::Half,
+  self.scalar_type(), "bernoulli_scalar_cpu_", [&] {
+    // See Note [Acquire lock when using random generators]
+    std::lock_guard lock(generator->mutex_);
+    auto iter = TensorIterator::borrowing_nullary_op(self);
+    cpu_serial_kernel(iter, [p, generator]() -> scalar_t {
+      at::bernoulli_distribution bernoulli(p);
+      return static_cast(bernoulli(generator));
+    });
+  });
+}
+
+template
+struct BernoulliKernel {
+  void operator()(const TensorBase &self, double p, c10::optional gen) {
+    bernoulli_kernel(self, p, check_generator(gen));
+  }
+  void operator()(const TensorBase &self, const TensorBase &p_, c10::optional gen) {
+    bernoulli_kernel(self, p_, check_generator(gen));
+  }
+};
+
+}}}}}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/GridSamplerKernel.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/GridSamplerKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..137a578b3f74edc67511f0e9900fc7c320318916
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/GridSamplerKernel.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include 
+
+#include 
+#include 
+
+namespace at {
+class TensorBase;
+}
+
+namespace at { namespace native {
+
+using forward_2d_fn = void (*) (
+    const TensorBase &output,
+    const TensorBase &input,
+    const TensorBase &grid,
+    int64_t interpolation_mode,
+    int64_t padding_mode,
+    bool align_corners);
+using backward_2d_fn = void (*) (
+    const TensorBase &grad_input,
+    const TensorBase &grad_grid,
+    const TensorBase &grad_output,
+    const TensorBase &input,
+    const TensorBase &grid,
+    int64_t interpolation_mode,
+    int64_t padding_mode,
+    bool align_corners,
+    std::array output_mask);
+DECLARE_DISPATCH(forward_2d_fn, grid_sampler_2d_cpu_kernel);
+DECLARE_DISPATCH(backward_2d_fn, grid_sampler_2d_backward_cpu_kernel);
+
+}}  // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/IndexKernelUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/IndexKernelUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..de667f3fe287b8434743258dec2e40d29b99a9d9
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/IndexKernelUtils.h
@@ -0,0 +1,88 @@
+#pragma once
+#include 
+#include 
+
+namespace at {
+namespace native {
+
+namespace {
+static bool is_constant_index(int ntensor, const int64_t* strides) {
+  AT_ASSERT(ntensor >= 3);
+  for (const auto arg : c10::irange(2, ntensor)) {
+    if (strides[arg] != 0) {
+      return false;
+    }
+  }
+  return true;
+}
+
+
+struct Indexer {
+  Indexer(int64_t num_indexers, char** indexers, const int64_t* indexer_strides,
+          IntArrayRef original_sizes, IntArrayRef original_strides)
+    : num_indexers(num_indexers)
+    , indexers(indexers)
+    , indexer_strides(indexer_strides)
+    , original_strides(original_strides.data())
+    , original_sizes(original_sizes.data()) {
+    AT_ASSERT(static_cast(original_strides.size()) == num_indexers);
+    AT_ASSERT(static_cast(original_sizes.size()) == num_indexers);
+  }
+
+  int64_t num_indexers;
+  char** indexers;
+  const int64_t* indexer_strides;
+  const int64_t* original_strides;
+  const int64_t* original_sizes;
+
+  int64_t get(int64_t idx) {
+    int64_t offset = 0;
+    for (const auto j : c10::irange(num_indexers)) {
+      int64_t value = *(int64_t*)&indexers[j][idx * indexer_strides[j]];
+      int64_t size = original_sizes[j];
+      TORCH_CHECK_INDEX(value >= -size && value < size,
+                        "index ", value, " is out of bounds for dimension ", j, " with size ", size);
+      if (value < 0) {
+        value += size;
+      }
+      offset += value * original_strides[j];
+    }
+    return offset;
+  }
+};
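+
+// Example (illustrative): with a single index tensor over a dimension of size 5
+// whose byte stride in the source tensor is 4, Indexer::get wraps an index value
+// of -2 to 3 and returns the byte offset 3 * 4 = 12; out-of-range values such as
+// 5 or -6 trip the TORCH_CHECK_INDEX above.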
+} // anonymous namespace
+
+template 
+void cpu_index_kernel(TensorIteratorBase& iter, IntArrayRef index_size, IntArrayRef index_stride,
+                      const func_t& f, bool serial_execution=false)
+{
+  int ntensor = iter.ntensors();
+  // When launching the parallel index version, set a relatively small grain size,
+  // less than INTERNAL::GRAIN_SIZE, so that the available threads get a more
+  // balanced workload and better cache locality.
+  // The grain size here was chosen via op benchmarks to overcome the thread launch overhead.
+  const int index_parallel_grain_size = 3000;
+  auto loop = [&](char** data, const int64_t* strides, int64_t n) {
+    auto indexer = Indexer(ntensor - 2, &data[2], &strides[2], index_size, index_stride);
+    char* dst = data[0];
+    char* src = data[1];
+    if (is_constant_index(ntensor, strides)) {
+      // specialization for when every element uses the same index
+      int64_t offset = indexer.get(0);
+      for (const auto i : c10::irange(n)) {
+        f(dst + strides[0] * i, src + strides[1] * i, offset);
+      }
+    } else {
+      for (const auto i : c10::irange(n)) {
+        int64_t offset = indexer.get(i);
+        f(dst + strides[0] * i, src + strides[1] * i, offset);
+      }
+    }
+  };
+  if (serial_execution) {
+    iter.serial_for_each(loop, {0, iter.numel()});
+  } else {
+    iter.for_each(loop, index_parallel_grain_size);
+  }
+}
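+
+// Usage sketch (illustrative): a gather-style kernel can be expressed as
+//
+//   cpu_index_kernel<scalar_t>(iter, index_size, index_stride,
+//     [](char* dst, char* src, int64_t offset) {
+//       *(scalar_t*)dst = *(scalar_t*)(src + offset);
+//     });
+//
+// where scalar_t is the element type and `offset` is the byte offset computed by
+// Indexer from the index tensors.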
+} // at
+} // native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/Intrinsics.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/Intrinsics.h
new file mode 100644
index 0000000000000000000000000000000000000000..c85239e5a7067907af8c7e903208f2d4338c8213
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/Intrinsics.h
@@ -0,0 +1,33 @@
+#pragma once
+
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+/* Clang-compatible compiler, targeting x86/x86-64 */
+#include 
+#elif defined(_MSC_VER)
+/* Microsoft C/C++-compatible compiler */
+#include 
+#if _MSC_VER <= 1900
+#define _mm256_extract_epi64(X, Y) (((uint64_t*)&X)[Y])
+#endif
+#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+/* GCC-compatible compiler, targeting x86/x86-64 */
+#include 
+#elif defined(__GNUC__) && defined(__ARM_NEON__)
+/* GCC-compatible compiler, targeting ARM with NEON */
+#include 
+#elif defined(__GNUC__) && defined(__IWMMXT__)
+/* GCC-compatible compiler, targeting ARM with WMMX */
+#include 
+#elif (defined(__GNUC__) || defined(__xlC__)) && \
+    (defined(__VEC__) || defined(__ALTIVEC__))
+/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */
+#include 
+/* We need to undef those tokens defined by  to avoid conflicts
+   with the C++ types. => Can still use __bool/__vector */
+#undef bool
+#undef vector
+#undef pixel
+#elif defined(__GNUC__) && defined(__SPE__)
+/* GCC-compatible compiler, targeting PowerPC with SPE */
+#include 
+#endif
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/IsContiguous.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/IsContiguous.h
new file mode 100644
index 0000000000000000000000000000000000000000..d521bd122114b9abb6f052cff689a3ca120e0942
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/IsContiguous.h
@@ -0,0 +1,62 @@
+#pragma once
+
+namespace at { namespace native { inline namespace CPU_CAPABILITY {
+
+// n: number of function arguments (arity)
+// traits: function_traits (see FunctionTraits.h)
+// s: index of scalar argument or -1
+template 
+struct IsContiguous {
+  static bool eval(const int64_t* strides) {
+    using type = typename traits::template arg::type;
+    return strides[stride_index] == (s == n ? 0 : sizeof(type)) &&
+           IsContiguous::eval(strides);
+  }
+};
+
+// will be called when there is an output exists
+template 
+struct IsContiguous<0, 0, traits, s> {
+  static bool eval(const int64_t* strides) {
+    return strides[0] == sizeof(typename traits::result_type);
+  }
+};
+
+// will be called when there is no output
+template 
+struct IsContiguous<0, -1, traits, s> {
+  static bool eval(const int64_t* /*strides*/) {
+    return true;
+  }
+};
+
+// output and all inputs are contiguous
+template ::value>::type* = nullptr>
+static inline bool is_contiguous(const int64_t* strides) {
+  return IsContiguous::eval(strides);
+}
+
+template ::value>::type* = nullptr>
+static inline bool is_contiguous(const int64_t* strides) {
+  return IsContiguous::eval(strides);
+}
+
+// input at `s` is scalar (stride 0); output and other inputs are contiguous
+// NB: output is typically at strides[0] so first input corresponds to s=1
+template ::value>::type* = nullptr>
+static inline bool is_contiguous_scalar(const int64_t* strides) {
+  static_assert(s > 0 && s <= traits::arity, "scalar argument index out of bounds");
+  return IsContiguous::eval(strides);
+}
+
+template ::value>::type* = nullptr>
+static inline bool is_contiguous_scalar(const int64_t* strides) {
+  static_assert(s > 0 && s <= traits::arity, "scalar argument index out of bounds");
+  return IsContiguous::eval(strides);
+}
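+
+// Example (illustrative): for an op with signature float(float, float) the fully
+// contiguous stride pattern is {4, 4, 4} (output first, then inputs, each
+// sizeof(float)), so is_contiguous returns true; if the first input is a scalar,
+// the pattern {4, 0, 4} satisfies is_contiguous_scalar with s = 1 instead.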
+
+}}}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/LogAddExp.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/LogAddExp.h
new file mode 100644
index 0000000000000000000000000000000000000000..1bf461849ac82c56431eb23a5e651e71c09df7aa
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/LogAddExp.h
@@ -0,0 +1,61 @@
+#pragma once
+
+#include 
+#include 
+
+namespace at { namespace native {
+inline namespace CPU_CAPABILITY {
+
+// custom min and max to be used in logcumsumexp for complex arguments
+template 
+std::pair, c10::complex> _logcumsumexp_minmax(c10::complex x, c10::complex y) {
+  if (at::_isnan(y)) {  // either real is nan or imag is nan
+    return std::make_pair(y, y);
+  } else if (at::_isnan(x)) {  // either real is nan or imag is nan
+    return std::make_pair(x, x);
+  } else {
+    return (x.real() < y.real()) ? std::make_pair(x, y) : std::make_pair(y, x);
+  }
+}
+
+template 
+scalar_t _log_add_exp_helper(scalar_t x, scalar_t y) {
+  // Reference : https://www.tensorflow.org/api_docs/python/tf/math/cumulative_logsumexp
+  scalar_t min = at::_isnan(y) ? y : std::min(x, y); // std::min returns first arg if one of the args is nan
+  scalar_t max = at::_isnan(y) ? y : std::max(x, y); // std::max returns first arg if one of the args is nan
+  if (min != max || std::isfinite(min)) {
+    // nan will be propagated here
+    return std::log1p(std::exp(min - max)) + max;
+  } else {
+    // special case to correctly handle infinite cases
+    return x;
+  }
+}
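+
+// Example (illustrative): _log_add_exp_helper(0.0, 0.0) returns
+// std::log1p(std::exp(0.0)) + 0.0 = log(2) ~= 0.6931, i.e. log(e^0 + e^0).
+// The special case matters when x = y = -infinity: the naive formula would give
+// log1p(exp(-inf - -inf)) = log1p(exp(nan)) = nan, while the helper correctly
+// returns -infinity.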
+
+template 
+c10::complex _log_add_exp_helper(const c10::complex& x, const c10::complex& y) {
+  auto [min, max] = _logcumsumexp_minmax(x, y);
+  auto min_real = std::real(min);
+  auto max_real = std::real(max);
+
+  if (at::_isnan(min)) {  // either real is nan or imag is nan
+    // handling the "infectious" NaNs
+    return {std::numeric_limits::quiet_NaN(), std::numeric_limits::quiet_NaN()};
+  } else if (!std::isfinite(min_real) && (min_real == max_real)) {
+    if (min_real < 0) {
+      // handle the -inf case, the imaginary part here does not really matter as the exp(value)
+      // will be around 0.0 and the angle (i.e. the imaginary part) cannot be determined.
+      // It does not matter if we're taking the exp of this value
+      return min;
+    } else {
+      // handle the +inf case, we don't need the special precision for log1p for small values
+      // and to avoid producing nan in case of real(max) == real(min) == +inf
+      return std::log(std::exp(min) + std::exp(max));
+    }
+  } else {
+    return std::log1p(std::exp(min - max)) + max;
+  }
+}
+
+} // end namespace
+}} //end at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/Loops.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/Loops.h
new file mode 100644
index 0000000000000000000000000000000000000000..016d4ded00f574af8068882ec339220d5f324cd2
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/Loops.h
@@ -0,0 +1,394 @@
+#pragma once
+
+// This file provides two functions to help write elementwise kernels:
+//
+//   cpu_kernel(TensorIterator iter, )
+//   cpu_kernel_vec(TensorIterator iter, , )
+//
+// Both functions may generate vectorized code. The cpu_kernel implementation
+// relies on the compiler's auto-vectorization. The cpu_kernel_vec
+// implementation uses x86 SIMD intrinsics when available. These functions
+// are only intended to be used in the ATen/native/cpu subdirectory, since files
+// in other directories are not compiled with AVX/AVX2 enabled. See README.md
+// for more details.
+//
+// For example, to write a multiplication kernel for float:
+//
+//   cpu_kernel(iter, [](float a, float b) { return a * b; });
+//
+// Or you may write:
+//
+//   cpu_kernel_vec(iter,
+//     [](float a, float b) { return a * b; },
+//     [](Vectorized a, Vectorized b) { return a * b; });
+//
+// See BinaryOpsKernel.cpp for the complete implementation
+//
+//
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+namespace at { namespace native { inline namespace CPU_CAPABILITY {
+
+using namespace vec;
+
+template 
+typename traits::ArgsTuple
+dereference_impl(char* C10_RESTRICT data[], const int64_t* strides, int64_t i,
+                 std::index_sequence) {
+  return std::make_tuple(
+      c10::load::type>(
+          data[INDEX] + i * strides[INDEX])...);
+}
+
+template 
+typename traits::ArgsTuple
+dereference(char* C10_RESTRICT data[], const int64_t* strides, int64_t i) {
+  using Indices = std::make_index_sequence;
+  return dereference_impl(data, strides, i, Indices{});
+}
+
+template 
+typename traits::ArgsTuple
+dereference_vec_impl(char* C10_RESTRICT data[],
+                     const typename traits::result_type& opt_scalar,
+                     size_t S,
+                     int64_t i,
+                     std::index_sequence) {
+  using Vec = typename traits::result_type;
+  using scalar_t = typename Vec::value_type;
+  return std::make_tuple(
+      S == INDEX + 1 ?
+      opt_scalar :
+      Vec::loadu(data[INDEX] + i * sizeof(scalar_t))...);
+}
+
+template 
+typename traits::ArgsTuple
+dereference_vec(char* C10_RESTRICT data[], const typename traits::result_type& opt_scalar, size_t S, int64_t i) {
+  using Indices = std::make_index_sequence;
+  return dereference_vec_impl(data, opt_scalar, S, i, Indices{});
+}
+
+template ::result_type>::value>::type* = nullptr>
+static inline void
+execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t&& op) {
+  using traits = function_traits;
+  using result_type = typename traits::result_type;
+  for (; i < n; i++) {
+    result_type* out_ptr = (result_type*)(data[0] + i * strides[0]);
+    *out_ptr = c10::guts::apply(std::forward(op), dereference(
+        &data[1],
+        &strides[1],
+        i));
+  }
+}
+
+template ::result_type>::value>::type* = nullptr>
+static inline void
+execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t&& op) {
+  using traits = function_traits;
+  for (; i < n; i++) {
+    c10::guts::apply(std::forward(op), dereference(
+        &data[0],
+        &strides[0],
+        i));
+  }
+}
+
+// Basic loop operation (one output, N inputs). May be auto-vectorized
+// by the compiler. Supports inputs and outputs of different types.
+template 
+static inline void
+basic_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_t i, int64_t n, func_t&& op) {
+  using traits = function_traits;
+  constexpr int ntensors = traits::arity + 1;
+
+  // Copying strides to temporary array helps auto vectorization in older GCC
+  // versions.
+  int64_t strides[ntensors];
+  for (const auto arg : c10::irange(ntensors)) {
+    strides[arg] = strides_[arg];
+  }
+
+  execute_op(data, strides, i, n, std::forward(op));
+}
+
+// the recursive variadic template for iterating over the returned tuple
+template
+struct TupleOutput {
+  static void handle(char *C10_RESTRICT data[], const int64_t *strides, int64_t i,
+                     const T &tuple) {
+    TupleOutput::handle(data, strides, i, tuple);
+
+    auto output = std::get(tuple);
+    using output_type = decltype(output);
+    output_type * out_ptr = (output_type *)(data[N - 1] + i * strides[N - 1]);
+    *out_ptr = output;
+  }
+};
+
+// Base case for the above recursive template
+template
+struct TupleOutput {
+  static void handle(char *C10_RESTRICT data[], const int64_t *strides, int64_t i,
+                     const T &tuple) {
+    auto output = std::get<0>(tuple);
+    using output_type = decltype(output);
+    output_type* out_ptr = (output_type *)(data[0] + i * strides[0]);
+    *out_ptr = output;
+  }
+};
+
+template
+void handle_tuple_outputs(char* C10_RESTRICT data[],
+                          const int64_t* strides,
+                          int64_t i,
+                          const std::tuple &tuple) {
+  TupleOutput::handle(data, strides, i, tuple);
+}
+
+// Loop operation for `cpu_kernel_multiple_outputs`.
+// 1. Use `c10::guts::apply` to make dynamic method invocation
+//    for the lambda passed in `cpu_kernel_multiple_outputs`.
+// 2. Iterate over the members of the returned tuple, set the corresponding
+//    output tensor by the tuple member in `handle_tuple_outputs` function.
+template 
+static inline void
+multiple_outputs_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_t i, int64_t n, func_t&& op) {
+  using traits = function_traits;
+
+  using result_type = typename traits::result_type;
+  constexpr int num_outputs = std::tuple_size::value;
+  constexpr int ntensors = traits::arity + num_outputs;
+
+  // Copying strides to temporary array helps auto vectorization in older GCC
+  // versions.
+  int64_t strides[ntensors];
+  for (const auto arg : c10::irange(ntensors)) {
+    strides[arg] = strides_[arg];
+  }
+
+  for (; i < n; i++) {
+    auto output = c10::guts::apply(op, dereference(
+      &data[num_outputs],
+      &strides[num_outputs],
+      i));
+    handle_tuple_outputs(data, strides, i, output);
+  }
+}
+
+// Explicitly vectorized loop implementation. All inputs and outputs must be
+// the same type and contiguous with one exception: a single input may be
+// a scalar (stride 0). Its position is indicated by the argument `S`. If `S`
+// is 0, then there are no scalar inputs.
+template 
+static inline void
+vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, vec_func_t&& vop) {
+  using traits = function_traits;
+  using scalar_t = typename function_traits::result_type;
+  using Vec = Vectorized;
+  constexpr int ntensors = traits::arity + 1;
+
+  char* C10_RESTRICT data[ntensors];
+  for (const auto arg : c10::irange(ntensors)) {
+    data[arg] = data_[arg];
+  }
+
+  Vec opt_scalar = Vec(S > 0 ? *(scalar_t*)data[S] : scalar_t(0));
+  int64_t i = 0;
+  for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) {
+    auto args1 = dereference_vec(&data[1], opt_scalar, S, i);
+    auto args2 = dereference_vec(&data[1], opt_scalar, S, i + Vec::size());
+    auto out1 = c10::guts::apply(std::forward(vop), std::move(args1));
+    auto out2 = c10::guts::apply(std::forward(vop), std::move(args2));
+    out1.store(data[0] + i * sizeof(scalar_t));
+    out2.store(data[0] + (i + Vec::size()) * sizeof(scalar_t));
+  }
+  if (i < n) {
+    int64_t strides[ntensors];
+    for (const auto arg : c10::irange(ntensors)) {
+      strides[arg] = (S > 0 && arg == S) ? 0 : sizeof(scalar_t);
+    }
+    basic_loop(data, strides, i, n, std::forward(op));
+  }
+}
+
+
+template 
+static inline void unroll_contiguous_scalar_checks(
+    const int64_t* /*strides*/,
+    std::index_sequence<>,
+    cb_t&& cb) {
+  cb(0);
+}
+
+template 
+static inline void unroll_contiguous_scalar_checks(
+    const int64_t* strides,
+    std::index_sequence,
+    cb_t&& cb) {
+  if (is_contiguous_scalar(strides)) {
+    cb(INDEX0 + 1);
+  } else {
+    unroll_contiguous_scalar_checks(strides, std::index_sequence{}, std::forward(cb));
+  }
+}
+
+template 
+struct VectorizedLoop2d {
+  op_t op;
+  vop_t vop;
+
+  using traits = function_traits;
+  static constexpr int ntensors = traits::arity + 1;
+  using data_t = std::array;
+
+  VectorizedLoop2d(const op_t &op, vop_t vop):
+    op(op), vop(std::move(vop)) {}
+
+  static void advance(data_t &data, const int64_t *outer_strides) {
+    for (const auto arg : c10::irange(data.size())) {
+      data[arg] += outer_strides[arg];
+    }
+  }
+
+  void operator()(char** base, const int64_t *strides, int64_t size0, int64_t size1) {
+    data_t data;
+    std::copy_n(base, ntensors, data.data());
+    const int64_t *outer_strides = &strides[ntensors];
+
+    if (is_contiguous(strides)) {
+      for (const auto i C10_UNUSED : c10::irange(size1)) {
+        vectorized_loop(data.data(), size0, 0, op, vop);
+        advance(data, outer_strides);
+      }
+    } else {
+      using Indices = std::make_index_sequence;
+      unroll_contiguous_scalar_checks(strides, Indices{}, [&](size_t idx) {
+        if (idx) {
+          for (const auto i C10_UNUSED : c10::irange(size1)) {
+            vectorized_loop(data.data(), size0, idx, op, vop);
+            advance(data, outer_strides);
+          }
+        } else {
+          for (const auto i C10_UNUSED : c10::irange(size1)) {
+            basic_loop(data.data(), strides, 0, size0, op);
+            advance(data, outer_strides);
+          }
+        }
+      });
+    }
+  }
+};
+
+template <typename op_t, typename vop_t>
+VectorizedLoop2d<op_t, vop_t> make_vectorized_loop2d(
+    const op_t &op, const vop_t &vop) {
+  return VectorizedLoop2d<op_t, vop_t>(op, vop);
+}
+
+template <typename func_t>
+void cpu_kernel(TensorIteratorBase& iter, func_t&& op, int64_t grain_size = at::internal::GRAIN_SIZE) {
+  using traits = function_traits<func_t>;
+  // this could be extended to work with void return types
+  TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);
+  TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);
+  // dynamic casting not currently supported on CPU
+  TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));
+
+  iter.for_each([&](char** data, const int64_t* strides, int64_t n) {
+    // basic_loop can handle 1d slices with arbitrary strides, and 1d slices are all that
+    // iter.for_each ever sends to the loop lambda
+    basic_loop(data, strides, 0, n, std::forward<func_t>(op));
+  }, grain_size);
+  iter.cast_outputs();
+}
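+
+// Usage sketch (illustrative, not part of the upstream header): wiring a plain
+// float addition through cpu_kernel. Real ATen kernels wrap this in a dtype
+// dispatch macro; the function name here is made up for demonstration.
+inline void example_add_kernel_cpu(TensorIteratorBase& iter) {
+  cpu_kernel(iter, [](float a, float b) -> float { return a + b; });
+}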
+
+// This function helps write elementwise kernels that require multiple outputs.
+// It follows a structure similar to cpu_kernel.
+// Instead of the `basic_loop` function, a `multiple_outputs_loop` function is
+// used to handle the multiple return values.
+// For now the `needs_dynamic_casting` check is not added, as the lambda (`func_t`)
+// passed to `multiple_outputs_loop` returns a `std::tuple` instead of `scalar_t`.
+// `gpu_kernel_multiple_outputs` is also implemented without this check;
+// we could extend `needs_dynamic_casting` to support both `std::tuple` and
+// `thrust::tuple` in the future.
+template <typename func_t>
+void cpu_kernel_multiple_outputs(TensorIteratorBase& iter, func_t&& op, int64_t grain_size = at::internal::GRAIN_SIZE) {
+  using traits = function_traits<func_t>;
+  TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);
+
+  iter.for_each([&](char** data, const int64_t* strides, int64_t n) {
+    multiple_outputs_loop(data, strides, 0, n, std::forward<func_t>(op));
+  }, grain_size);
+  iter.cast_outputs();
+}
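+
+// Usage sketch (illustrative): the functor returns a std::tuple with one
+// element per output tensor of the iterator (two outputs here), which is
+// unpacked by multiple_outputs_loop. Names are made up for demonstration.
+inline void example_divmod_kernel_cpu(TensorIteratorBase& iter) {
+  cpu_kernel_multiple_outputs(iter, [](int64_t a, int64_t b) -> std::tuple<int64_t, int64_t> {
+    return std::make_tuple(a / b, a % b);
+  });
+}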
+
+template <bool check_dynamic_cast=true, typename func_t, typename vec_func_t>
+void cpu_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop, int64_t grain_size = at::internal::GRAIN_SIZE) {
+  using traits = function_traits<func_t>;
+  // this could be extended to work with void return types
+  TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);
+  TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);
+  // dynamic casting not currently supported on CPU, but some kernels (like Fill)
+  // explicitly dynamic_cast, so we provide an opt-out for the check.
+  if constexpr (check_dynamic_cast) {
+    TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));
+  }
+
+  iter.for_each(make_vectorized_loop2d(op, vop), grain_size);
+  iter.cast_outputs();
+}
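+
+// Usage sketch (illustrative): cpu_kernel_vec takes a scalar functor plus a
+// matching Vectorized functor that operates on whole SIMD lanes; e.g. a float
+// multiply. The function name is made up for demonstration.
+inline void example_mul_kernel_cpu(TensorIteratorBase& iter) {
+  cpu_kernel_vec(iter,
+      [](float a, float b) -> float { return a * b; },
+      [](Vectorized<float> a, Vectorized<float> b) { return a * b; });
+}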
+
+template <typename func_t>
+void cpu_serial_kernel(TensorIteratorBase& iter, func_t&& op, const Range& range) {
+  using traits = function_traits<func_t>;
+  constexpr bool result_void = std::is_void<typename traits::result_type>::value;
+  TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity &&
+                        ((result_void && iter.noutputs() == 0) || (!result_void && iter.noutputs() == 1)));
+  // dynamic casting not currently supported on CPU
+  TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));
+
+  iter.serial_for_each([&](char** data, const int64_t* strides, int64_t n) {
+    basic_loop(data, strides, 0, n, std::forward<func_t>(op));
+  }, range);
+  iter.cast_outputs();
+}
+
+template <typename func_t>
+void cpu_serial_kernel(TensorIteratorBase& iter, func_t&& op) {
+  cpu_serial_kernel(iter, op, {0, iter.numel()});
+}
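+
+// Usage sketch (illustrative): cpu_serial_kernel also accepts a void-returning
+// functor, in which case the iterator is expected to have zero outputs (a pure
+// "for each element" pass). The lambda capture and name are for demonstration.
+inline void example_count_positive(TensorIteratorBase& iter, int64_t& counter) {
+  cpu_serial_kernel(iter, [&](float a) -> void {
+    if (a > 0.f) {
+      ++counter;
+    }
+  });
+}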
+
+template <typename func_t, typename vec_func_t>
+void cpu_serial_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop, const Range& range) {
+  using traits = function_traits<func_t>;
+  // this could be extended to work with void return types
+  TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);
+  TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);
+  // dynamic casting not currently supported on CPU
+  TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));
+
+  iter.serial_for_each(make_vectorized_loop2d(op, vop), range);
+  iter.cast_outputs();
+}
+
+template <typename func_t, typename vec_func_t>
+void cpu_serial_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop) {
+  cpu_serial_kernel_vec(iter, op, vop, {0, iter.numel()});
+}
+
+}}}  // namespace at::native::CPU_CAPABILITY
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/MaxUnpoolKernel.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/MaxUnpoolKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..0ea0bed4b1eae1e12dec1163835607310532f495
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/MaxUnpoolKernel.h
@@ -0,0 +1,14 @@
+#pragma once
+#include <ATen/native/DispatchStub.h>
+
+namespace at {
+class Tensor;
+
+namespace native {
+
+using max_unpooling_fn = void(*)(Tensor&, const Tensor&, const Tensor&);
+
+DECLARE_DISPATCH(max_unpooling_fn, max_unpool2d_kernel);
+DECLARE_DISPATCH(max_unpooling_fn, max_unpool3d_kernel);
+
+}} // at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/PixelShuffleKernel.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/PixelShuffleKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..1cb98b008bf531ad5e987176f5b8dbc2acf73872
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/PixelShuffleKernel.h
@@ -0,0 +1,14 @@
+#pragma once
+#include <ATen/native/DispatchStub.h>
+
+namespace at {
+class TensorBase;
+}
+
+namespace at { namespace native {
+
+using pixel_shuffle_fn = void(*)(TensorBase&, const TensorBase&, int64_t);
+DECLARE_DISPATCH(pixel_shuffle_fn, pixel_shuffle_kernel);
+DECLARE_DISPATCH(pixel_shuffle_fn, pixel_unshuffle_kernel);
+
+}} // at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/Reduce.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/Reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..d02a1bcc5171f1738909aa28b7cf0390522697f1
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/Reduce.h
@@ -0,0 +1,314 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+namespace at { namespace native { inline namespace CPU_CAPABILITY {
+
+using namespace vec;
+
+#define VEC_LOOP_HEADER(func_t, data) \
+  using scalar_t = typename function_traits<func_t>::result_type; \
+  using Vec = Vectorized<scalar_t>; \
+  char* out_ptr = data[0]; \
+  (void) out_ptr;
+
+// reduction that is contiguous over the input in dim 0
+template 
+static inline bool is_contiguous_reduction(const int64_t* strides) {
+  return strides[0] == 0 &&
+         strides[1] == sizeof(typename traits::arg2_t);
+}
+
+// reduction that is contiguous over the input in dim 1
+template 
+static inline bool is_outer_reduction(const int64_t* strides) {
+  return strides[0] == 0 &&
+         strides[2] == sizeof(typename traits::result_type) &&
+         strides[3] == sizeof(typename traits::arg2_t);
+}
+
+template 
+static inline void vectorized_reduction(char** data, int64_t n, int64_t stride,
+                                        func_t op, vec_func_t vop, bool reduce) {
+  VEC_LOOP_HEADER(func_t, data)
+  const char* in1_ptr = data[1];
+  Vec acc[4];
+  for (const auto j : c10::irange(4)) {
+    acc[j] = Vec::loadu(in1_ptr + j * Vec::size() * sizeof(scalar_t));
+  }
+  for (const auto i : c10::irange(1, n)) {
+    const char* ptr = in1_ptr + stride * i;
+    acc[0] = vop(acc[0], Vec::loadu(ptr + (0 * Vec::size() * sizeof(scalar_t))));
+    acc[1] = vop(acc[1], Vec::loadu(ptr + (1 * Vec::size() * sizeof(scalar_t))));
+    acc[2] = vop(acc[2], Vec::loadu(ptr + (2 * Vec::size() * sizeof(scalar_t))));
+    acc[3] = vop(acc[3], Vec::loadu(ptr + (3 * Vec::size() * sizeof(scalar_t))));
+  }
+  if (reduce) {
+    scalar_t buffer[Vec::size()];
+    acc[0] = vop(vop(acc[0], acc[1]), vop(acc[2], acc[3]));
+    acc[0].store(buffer);
+    for (const auto j : c10::irange(1, Vec::size())) {
+      buffer[0] = op(buffer[0], buffer[j]);
+    }
+    auto dst = (scalar_t*)out_ptr;
+    *dst = op(*dst, buffer[0]);
+  } else {
+    for (const auto j : c10::irange(4)) {
+      auto dst = out_ptr + j * Vec::size() * sizeof(scalar_t);
+      acc[j] = vop(acc[j], Vec::loadu(dst));
+      acc[j].store(dst);
+    }
+  }
+}
+
+template 
+static inline void UNARY_OUTER_LOOP(char* data[2], const int64_t strides[2], int64_t n, F f) {
+  for (const auto j C10_UNUSED : c10::irange(n)) {
+    f();
+    data[0] += strides[0];
+    data[1] += strides[1];
+  }
+}
+
+// computes the reduction out = op(out, in)
+template 
+static inline void vectorized_inner_reduction(char** data, int64_t n, func_t op, vec_func_t vop) {
+  VEC_LOOP_HEADER(func_t, data)
+  int64_t vector_stride = 4 * Vec::size() * sizeof(scalar_t);
+  int64_t count = n / (4 * Vec::size());
+  if (count > 0) {
+    vectorized_reduction(data, count, vector_stride, op, vop, /*reduce=*/true);
+  }
+  char* ptrs[3] = { data[0], data[0], data[1] };
+  int64_t strides[] = { 0, 0, sizeof(scalar_t) };
+  basic_loop(ptrs, strides, count * 4 * Vec::size(), n, op);
+}
+
+// computes the reduction out = op(out, in)
+template 
+static inline void vectorized_outer_reduction(char** data, int64_t inner_stride, int64_t size0, int64_t size1, func_t op, vec_func_t vop) {
+  VEC_LOOP_HEADER(func_t, data)
+
+  // reduce down each column of 4 * Vec::size() elements (128 or 256 bytes)
+#if defined(CPU_CAPABILITY_AVX512)
+  int64_t outer_stride[2] = { 256, 256 };
+#else
+  int64_t outer_stride[2] = { 128, 128 };
+#endif
+  UNARY_OUTER_LOOP(data, outer_stride, size1 / (4 * Vec::size()), [&] {
+    vectorized_reduction(data, size0, inner_stride, op, vop, /*reduce=*/false);
+  });
+
+  // reduce down the remaining columns
+  int64_t step[] = { sizeof(scalar_t), sizeof(scalar_t) };
+  int64_t remaining = size1 % (4 * Vec::size());
+  UNARY_OUTER_LOOP(data, step, remaining, [&] {
+    char* ptrs[3] = { data[0], data[0], data[1] };
+    int64_t strides[] = { 0, 0, inner_stride };
+    basic_loop(ptrs, strides, 0, size0, op);
+  });
+}
+
+template
+static void set_result(const int index, const res_t result, const TensorIteratorBase &iter, const int num_outputs) {
+  // static_assert(std::is_same::value, "data types must match");
+  if (index < num_outputs) {
+    char *out = (char *) iter.data_ptr(index);
+    *(res_t *) out = result;
+  }
+}
+
+template
+static void set_results(const res_t result, const TensorIteratorBase &iter, const int num_outputs) {
+  AT_ASSERT(num_outputs == 1);
+  set_result(0, result, iter, num_outputs);
+}
+
+template
+static inline typename std::enable_if::type
+for_each_in_tuple(const std::tuple& /*t*/, const TensorIteratorBase& /*iter*/, const int /*num_outputs*/) {
+  return i;
+}
+
+template
+static inline typename std::enable_if::type
+for_each_in_tuple(const std::tuple& t, const TensorIteratorBase &iter, const int num_outputs) {
+  if (i < (size_t)num_outputs) {
+    set_result(i, std::get(t), iter, num_outputs);
+    return for_each_in_tuple(t, iter, num_outputs);
+  }
+  return i;
+}
+
+template
+static void set_results(const std::tuple& result, const TensorIteratorBase &iter, const int num_outputs) {
+  AT_ASSERT(num_outputs >= 1);
+  std::size_t result_size = for_each_in_tuple(result, iter, num_outputs);
+  AT_ASSERT((size_t)num_outputs == result_size);
+}
+
+template 
+struct all_same : std::conjunction<
+  std::is_same...
+> {};
+
+// data_t is the input/output data type.
+// acc_t is a type that contains all the necessary data
+// to continue reducing.
+// index_t is a one-dimensional index
+//
+// ops_t is such that &ops_t::reduce, &ops_t::combine, and &ops_t::project exist and satisfy
+// the following.
+// reduce: (acc_t, data_t, index_t) -> acc_t adds one data point to the accumulated value.
+// combine: (acc_t, acc_t) -> acc_t combines two accumulated values into one.
+// project: acc_t -> out_t finishes the reduction, getting the required output.
+//
+// Additionally, acc_t must be default-constructible:
+// acc_t {} is an identity for combine,
+// and project(acc_t {}) is the value of the operation on zero elements.
+//
+// The point of `combine` is to support parallelization -
+// the idea is to run one sequence of `reduce` calls per thread of execution,
+// and then to combine them at the end with `combine`.
+//
+// If there is more than one output element,
+// our parallelization strategy is to use one thread for each of them,
+// which means that `combine` will never be called.
+//
+// If, on the other hand, there is only one, then we split the input
+// into several pieces, reduce each separately, and then combine them.
+// (A minimal example ops struct is sketched after binary_kernel_reduce below.)
+
+template 
+void binary_kernel_reduce(TensorIteratorBase& iter, ops_t ops, init_t init) {
+  using rf_t = decltype(&ops_t::reduce);
+  using cf_t = decltype(&ops_t::combine);
+  using pf_t = decltype(&ops_t::project);
+  using r_traits = binary_function_traits;
+  using c_traits = binary_function_traits;
+  using p_traits = unary_function_traits;
+  using acc_t = typename p_traits::arg1_t;
+  using data_t = typename r_traits::arg2_t;
+  static_assert(
+    all_same<
+      acc_t,
+      init_t,
+      typename r_traits::arg1_t,
+      typename r_traits::result_type,
+      typename c_traits::arg1_t,
+      typename c_traits::arg2_t,
+      typename c_traits::result_type>::value,
+    "all accumulate types must match");
+  static_assert(
+    std::is_default_constructible::value,
+    "the accumulate type must be default-constructible"
+  );
+  const int num_outputs = iter.noutputs();
+  iter.foreach_reduced_elt([&ops, &init, num_outputs](TensorIteratorBase &sub_iter) {
+    auto reduction_body = [&ops, &sub_iter, num_outputs](acc_t acc, int64_t begin, int64_t end) -> acc_t {
+      int ntensors = sub_iter.ntensors();
+      sub_iter.serial_for_each([&acc, &ops, num_outputs, ntensors, begin](char** data, const int64_t* strides, int64_t size) {
+        AT_ASSERT(ntensors - num_outputs == 1);
+        char *in = data[ntensors - 1];
+        int64_t stride = strides[ntensors - 1];
+        for (const auto i : c10::irange(size)) {
+          acc = ops.reduce(acc, c10::load(in), begin + i);
+          in += stride;
+        }
+      }, {begin, end});
+      return ops.translate_idx(acc, sub_iter.view_offsets()[0]);
+    };
+    acc_t total_acc = init;
+    auto numel = sub_iter.numel();
+    if (numel < at::internal::GRAIN_SIZE || at::get_num_threads() == 1 ||
+        at::in_parallel_region()) {
+      total_acc = reduction_body(total_acc, 0, numel);
+    } else {
+      int max_threads = at::get_num_threads();
+      AT_ASSERT(max_threads > 0);
+      static_assert(
+        !std::is_same::value,
+        "Concurrently modifying different references into std::vector is UB."
+      );
+      std::vector buffer((unsigned)max_threads, init);
+      at::parallel_for(0, numel, internal::GRAIN_SIZE,
+        [&](int64_t begin, int64_t end) {
+          auto& acc = buffer[at::get_thread_num()];
+          acc = reduction_body(acc, begin, end);
+        }
+      );
+      for (const auto i : c10::irange(max_threads)) {
+        total_acc = ops.combine(total_acc, buffer[i]);
+      }
+    }
+    set_results(ops.project(total_acc), sub_iter, num_outputs);
+  });
+}
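+
+// Illustrative ops_t sketch (not upstream code) satisfying the
+// reduce/combine/project contract described above, for a plain float sum
+// accumulated in double. binary_kernel_reduce also calls ops.translate_idx,
+// so an identity implementation is included. A typical call would be:
+//   binary_kernel_reduce(iter, ExampleSumOps{}, /*init=*/0.0);
+struct ExampleSumOps {
+  using acc_t = double;
+  acc_t reduce(acc_t acc, float data, int64_t /*idx*/) const { return acc + data; }
+  acc_t combine(acc_t a, acc_t b) const { return a + b; }
+  float project(acc_t acc) const { return static_cast<float>(acc); }
+  acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) const { return acc; }
+};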
+
+template 
+void binary_kernel_reduce_vec(TensorIteratorBase& iter, func_t op, vec_func_t vop, double ident = 0) {
+  using traits = binary_function_traits;
+  static_assert(
+    all_same<
+      typename traits::result_type,
+      typename traits::arg1_t,
+      typename traits::arg2_t>::value,
+    "all types must match");
+
+  iter.output_base().fill_(ident);
+  iter.parallel_reduce([&](char** data, const int64_t* strides, int64_t size0, int64_t size1) {
+    int64_t outer_strides[] = { strides[2], strides[3] };
+    if (is_contiguous_reduction(strides)) {
+      // input is contiguous in dim 0, output is reduced in dim 0
+      UNARY_OUTER_LOOP(data, outer_strides, size1, [&] {
+        vectorized_inner_reduction(data, size0, op, vop);
+      });
+    } else if (is_outer_reduction(strides)) {
+      // input and output are contiguous in dim 1
+      int64_t inner_stride = strides[1]; // stride of input in dim 0
+      vectorized_outer_reduction(data, inner_stride, size0, size1, op, vop);
+    } else {
+      UNARY_OUTER_LOOP(data, outer_strides, size1, [&] {
+        char* ptrs[3] = { data[0], data[0], data[1] };
+        int64_t inner_strides[3] = { strides[0], strides[0], strides[1] };
+        basic_loop(ptrs, inner_strides, 0, size0, op);
+      });
+    }
+  });
+}
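+
+// Usage sketch (illustrative): binary_kernel_reduce_vec takes matching scalar
+// and Vectorized functors plus the reduction identity; e.g. a float sum with
+// identity 0. The function name is made up for demonstration.
+inline void example_sum_reduce_vec(TensorIteratorBase& iter) {
+  binary_kernel_reduce_vec(
+      iter,
+      [](float a, float b) -> float { return a + b; },
+      [](Vectorized<float> a, Vectorized<float> b) { return a + b; },
+      /*ident=*/0.0);
+}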
+
+// When the reduction is over the innermost dimension (dim 0 in TensorIterator)
+// and the input is contiguous in that dimension, `binary_kernel_reduce_lastdim`
+// can be used.
+static inline bool is_reduce_lastdim(TensorIteratorBase& iter) {
+  return iter.num_reduce_dims() == 1 && iter.is_dim_reduced(0)
+      && iter.ninputs() == 1 && iter.strides(1)[0] == iter.element_size(1);
+}
+
+template 
+void binary_kernel_reduce_lastdim(TensorIteratorBase& iter, reduce_func_t reduce_op) {
+  auto shape = iter.shape();
+  int64_t dim_size = shape[0];
+  int64_t grain_size = std::max((int64_t) 1, at::internal::GRAIN_SIZE / dim_size);
+  TensorIterator sub_iter(iter);
+  // create sub iterator to parallel on all non-reduce-dims
+  sub_iter.narrow(0, 0, 1);
+  auto loop = [&](char** data, const int64_t* strides, int64_t size) {
+    char* out = data[0];
+    char* in = data[1];
+    for (int64_t i = 0; i < size; ++i) {
+      reduce_op(out, in, dim_size);
+      out += strides[0];
+      in += strides[1];
+    }
+  };
+  sub_iter.for_each(loop, grain_size);
+}
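+
+// Illustrative reduce_op sketch for binary_kernel_reduce_lastdim: the functor
+// receives raw output/input pointers for one outer element plus the size of
+// the reduced (last) dimension. This sketch assumes contiguous float input,
+// which is what is_reduce_lastdim checks for; the name is made up.
+inline void example_sum_lastdim(TensorIteratorBase& iter) {
+  binary_kernel_reduce_lastdim(iter, [](char* out, char* in, int64_t size) {
+    float acc = 0.f;
+    for (int64_t k = 0; k < size; ++k) {
+      acc += reinterpret_cast<const float*>(in)[k];
+    }
+    *reinterpret_cast<float*>(out) = acc;
+  });
+}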
+
+}}}  // namespace at::native::CPU_CAPABILITY
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/ReduceUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/ReduceUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..8f8c30e6fcebee4252f5272787599c1fabf67861
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/ReduceUtils.h
@@ -0,0 +1,238 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace at::native {
+inline namespace CPU_CAPABILITY {
+
+using namespace vec;
+
+#define AT_DISPATCH_REDUCTION_TYPES(op, ...)                                   \
+  [&] {                                                                        \
+    switch (op) {                                                              \
+      case ReductionType::SUM: {                                               \
+        static constexpr auto reduce = ReductionType::SUM;                     \
+        return __VA_ARGS__();                                                  \
+      }                                                                        \
+      case ReductionType::MEAN: {                                              \
+        static constexpr auto reduce = ReductionType::MEAN;                    \
+        return __VA_ARGS__();                                                  \
+      }                                                                        \
+      case ReductionType::MIN: {                                               \
+        static constexpr auto reduce = ReductionType::MIN;                     \
+        return __VA_ARGS__();                                                  \
+      }                                                                        \
+      case ReductionType::MAX: {                                               \
+        static constexpr auto reduce = ReductionType::MAX;                     \
+        return __VA_ARGS__();                                                  \
+      }                                                                        \
+      case ReductionType::PROD: {                                              \
+        static constexpr auto reduce = ReductionType::PROD;                    \
+        return __VA_ARGS__();                                                  \
+      }                                                                        \
+    }                                                                          \
+  }()
+
+template 
+inline vec_scalar_t init_value() {
+  using acc_t = vec_scalar_t;
+  acc_t val;
+  if (reduce == ReductionType::SUM ||
+      reduce == ReductionType::MEAN) {
+    val = static_cast(0);
+  } else if (reduce == ReductionType::PROD) {
+    val = static_cast(1);
+  } else if (reduce == ReductionType::MAX) {
+    val = -std::numeric_limits::infinity();
+  } else {
+    TORCH_INTERNAL_ASSERT(reduce == ReductionType::MIN);
+    val = std::numeric_limits::infinity();
+  }
+  return val;
+}
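+
+// Usage sketch (illustrative) for the AT_DISPATCH_REDUCTION_TYPES macro above:
+// it turns a runtime ReductionType into the compile-time constant `reduce`
+// inside the lambda, so templated helpers such as init_value can be
+// instantiated on it. The function name is made up for demonstration.
+inline float example_identity_for(ReductionType op) {
+  return AT_DISPATCH_REDUCTION_TYPES(op, [&]() {
+    return init_value<float, reduce>();
+  });
+}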
+
+template 
+inline vec_scalar_t init_value(const c10::optional& initial) {
+  using acc_t = vec_scalar_t;
+  if (initial.has_value()) {
+    return initial.value().to();
+  } else {
+    return init_value();
+  }
+}
+
+template 
+inline void init(scalar_t* out, int64_t size, const vec_scalar_t& val) {
+  using Vec = Vectorized>;
+  map(
+      [val](Vec x) { return Vec(val); },
+      out,
+      out,
+      size);
+}
+
+template 
+inline void init(scalar_t* out, int64_t size, const c10::optional& initial) {
+  using acc_t = vec_scalar_t;
+  acc_t val = init_value(initial);
+  init(out, size, val);
+}
+
+// overload with `include_self`, used by scatter_reduce
+template 
+inline void init(scalar_t* out, int64_t size, bool include_self = false) {
+  using acc_t = vec_scalar_t;
+  if (!include_self) {
+    acc_t val = init_value();
+    init(out, size, val);
+  }
+}
+
+template 
+inline void _init(scalar_t* self_ptr, at::opmath_type* buffer_ptr, int64_t size, bool include_self) {
+  if (!include_self) {
+    init, reduce>(buffer_ptr, size, include_self);
+  } else {
+    vec::convert(self_ptr, buffer_ptr, size);
+  }
+}
+
+template 
+inline typename std::enable_if::value, scalar_t>::type
+_max(const scalar_t& x, const scalar_t& y) {
+  return at::_isnan(y) ? y : std::max(x, y);
+}
+
+template 
+inline Vectorized _max(const Vectorized& x, const Vectorized& y) {
+  // vec::maximum propagates NaN
+  return vec::maximum(x, y);
+}
+
+template 
+inline typename std::enable_if::value, Vec2>::type
+_max(const vec_t& x, const vec_t& y) {
+  // vec::maximum propagates NaN
+  return maximum(x, y);
+}
+
+template 
+inline typename std::enable_if::value, scalar_t>::type
+_min(const scalar_t& x, const scalar_t& y) {
+  return at::_isnan(y) ? y : std::min(x, y);
+}
+
+template 
+inline Vectorized _min(const Vectorized& x, const Vectorized& y) {
+  // vec::minimum propagates NaN
+  return vec::minimum(x, y);
+}
+
+template 
+inline typename std::enable_if::value, Vec2>::type
+_min(const vec_t& x, const vec_t& y) {
+  // vec::minimum propagates NaN
+  return minimum(x, y);
+}
+
+template , int> = 0>
+inline void map_acc(
+    const Op& vec_fun,
+    accumut* output_data,
+    const accumut* input_data,
+    const scalar_t* input_data2,
+    int64_t size) {
+  using Vec = vec::Vectorized;
+  using aVec = vec::Vectorized;
+  int64_t d = 0;
+  constexpr int64_t kVecSize = Vec::size();
+  constexpr int64_t kaVecSize = aVec::size();
+  for (d = 0; d < size - (size % kVecSize); d += kVecSize) {
+    Vec data2_vec = Vec::loadu(input_data2 + d);
+    auto [data2_avec0, data2_avec1] = convert_to_float(data2_vec);
+    aVec input_vec0 = aVec::loadu(input_data + d);
+    aVec input_vec1 = aVec::loadu(input_data + d + kaVecSize);
+    vec_fun(input_vec0, data2_avec0).store(output_data + d);
+    vec_fun(input_vec1, data2_avec1).store(output_data + d + kaVecSize);
+  }
+  if (size - d > 0) {
+    int64_t tail_size = size - d;
+    Vec data2_vec = Vec::loadu(input_data2 + d, tail_size);
+    auto [data2_avec0, data2_avec1] = convert_to_float(data2_vec);
+    if (tail_size > kaVecSize) {
+      aVec input_vec0 = aVec::loadu(input_data + d);
+      aVec input_vec1 = aVec::loadu(input_data + d + kaVecSize, tail_size - kaVecSize);
+      vec_fun(input_vec0, data2_avec0).store(output_data + d);
+      vec_fun(input_vec1, data2_avec1).store(output_data + d + kaVecSize, tail_size - kaVecSize);
+    } else {
+      aVec input_vec0 = aVec::loadu(input_data + d, tail_size);
+      vec_fun(input_vec0, data2_avec0).store(output_data + d, tail_size);
+    }
+  }
+}
+
+// for Max and Min, propagate NaN:
+template 
+inline T update(const T& x, const T& y) {
+  if (reduce == ReductionType::SUM ||
+      reduce == ReductionType::MEAN) {
+    return x + y;
+  } else if (reduce == ReductionType::PROD) {
+    return x * y;
+  } else if (reduce == ReductionType::MAX) {
+    return _max(x, y);
+  } else {
+    TORCH_INTERNAL_ASSERT(reduce == ReductionType::MIN);
+    return _min(x, y);
+  }
+}
+
+template 
+inline void update(scalar_t* out, const scalar_t* data, int64_t K) {
+  using Vec = vec::Vectorized>;
+  map2(
+      [](Vec x, Vec y) { return update(x, y); },
+      out,
+      out,
+      data,
+      K);
+}
+
+template , int> = 0>
+inline void update(at::opmath_type* out, const scalar_t* data, int64_t K) {
+  using opmath_t = at::opmath_type;
+  using Vec = vec::Vectorized;
+  map_acc(
+      [](Vec x, Vec y) { return update(x, y); },
+      out,
+      out,
+      data,
+      K);
+}
+
+template 
+inline void write(scalar_t* out, int64_t count, int64_t K) {
+  using Vec = vec::Vectorized>;
+  if (reduce == ReductionType::MEAN) {
+    if (count > 0) {
+      vec::map(
+          [count](Vec x) { return x / Vec(count); },
+          out,
+          out,
+          K);
+    }
+  }
+}
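+
+// Illustrative accumulate-then-finalize sketch (not upstream code) showing how
+// the init/update/write helpers above fit together for a MEAN reduction over
+// `rows` rows of length K (caller-managed, contiguous float buffers assumed):
+inline void example_mean_rows(const float* data, float* out, int64_t rows, int64_t K) {
+  init<float, ReductionType::MEAN>(out, K);                    // fill out with the identity (0)
+  for (int64_t r = 0; r < rows; ++r) {
+    update<float, ReductionType::MEAN>(out, data + r * K, K);  // out += row r (same as SUM)
+  }
+  write<float, ReductionType::MEAN>(out, /*count=*/rows, K);   // finalize: out /= rows
+}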
+
+} // namespace CPU_CAPABILITY
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/SampledAddmmKernel.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/SampledAddmmKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..ebaf769f148a7887c5473aaa54b0f05fc55715ef
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/SampledAddmmKernel.h
@@ -0,0 +1,12 @@
+#pragma once
+
+#include 
+#include 
+
+namespace at { namespace native {
+
+using sampled_addmm_sparse_csr_fn = void(*)(const Tensor&, const Tensor&, const Scalar&, const Scalar&, const Tensor&);
+
+DECLARE_DISPATCH(sampled_addmm_sparse_csr_fn, sampled_addmm_sparse_csr_stub);
+
+}} // at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/SerialStackImpl.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/SerialStackImpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..aa6785e41706f2b3eea51c7821c8a388ab866e4f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/SerialStackImpl.h
@@ -0,0 +1,144 @@
+// Copyright 2004-present Facebook. All Rights Reserved.
+#pragma once
+
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace at { namespace native { namespace detail {
+
+struct InputMeta {
+  void* data_ptr;
+  int64_t inner_size;
+
+  InputMeta(const Tensor& t, int64_t dim, int64_t inner)
+      : data_ptr(t.data_ptr()), inner_size(t.sizes()[dim] * inner) {}
+};
+
+// This kernel is used by two TensorList types:
+// 1. stack_serial_kernel uses at::ArrayRef
+// 2. Static runtime calls this kernel directly (csrc/jit/runtime/static/ops.cpp) with
+//    ProcessedNodeInputWrapper.
+// When making changes, make sure that they are compatible with both types!
+template 
+void stack_serial_kernel_impl(Tensor& result, TensorListType tensors, int64_t dim) {
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      dim >= 0 && dim <= result.dim(),
+      "dim out of range in stack_serial_kernel_impl");
+  int64_t outer =
+      result.numel() / (result.sizes()[dim] * result.strides()[dim]);
+  scalar_t* result_data = result.data_ptr();
+  int64_t ninputs = tensors.size();
+  std::vector inputs;
+  inputs.reserve(ninputs);
+  for (const auto& tensor : tensors) {
+    inputs.emplace_back(tensor, dim, tensor.strides()[dim]);
+  }
+
+  using Vec = vec::Vectorized;
+  scalar_t* result_ptr = result_data;
+  for (const auto i : c10::irange(outer)) {
+    for (const auto j : c10::irange(ninputs)) {
+      int64_t local_inner = inputs[j].inner_size;
+      scalar_t* input_ptr = (scalar_t*)(inputs[j].data_ptr) + i * local_inner;
+
+      if (local_inner < Vec::size()) {
+        for (const auto k : c10::irange(local_inner)) {
+          result_ptr[k] = input_ptr[k];
+        }
+      } else {
+        vec::map(
+            [](Vec x) { return x; }, result_ptr, input_ptr, local_inner);
+      }
+      result_ptr += local_inner;
+    }
+  }
+}
+
+// Checks to see whether native stack can be invoked under these conditions:
+// - result and input tensors are contiguous
+// - only one thread is used
+// - no type promotion has to occur
+// - tensors dtype is Double or Float
+template 
+bool can_use_native_serial_stack_impl(Tensor& result, TensorListType tensors, int64_t dim) {
+  TORCH_CHECK(tensors.size() > 0, "expected a non-empty list of Tensors");
+  const Tensor& first_tensor = tensors[0];
+  // stack dimension should be in range [0,firstTensor.dim())
+  // dim == firstTensor.dim() is a valid input, but it is handled by the default code path
+  // that uses unsqueeze
+  if (dim >= first_tensor.dim()) return false;
+  // Native stack doesn't apply when any tensor would be skipped (an empty 1-d tensor).
+  if (first_tensor.numel() == 0 && first_tensor.dim() == 1) return false;
+  // there should be no type promotion
+  if (result.dtype() != first_tensor.dtype()) return false;
+
+  auto first_tensor_mem_format = first_tensor.suggest_memory_format();
+  ScalarType dtype = first_tensor.scalar_type();
+
+  if (!result.is_contiguous(first_tensor_mem_format)) {
+    return false;
+  }
+
+  // fast path only works for Double and Float
+  if (dtype != ScalarType::Double && dtype != ScalarType::Float) {
+    return false;
+  }
+
+  // check remainder of inputs
+  auto const &first_tensor_shape = first_tensor.sizes();
+  for (const auto i : c10::irange(1, tensors.size())) {
+    auto const &tensor = tensors[i];
+    TORCH_CHECK(tensors[i].sizes() == first_tensor.sizes(),
+      "stack expects each tensor to be equal size, but got ", first_tensor_shape,
+      " at entry 0 and ", tensor.sizes(), " at entry ", i);
+
+    // every tensor must be contiguous
+    // tensor sizes and strides must be the same
+    // there should be no type promotion
+    if (!tensor.is_contiguous(first_tensor_mem_format) ||
+      tensor.strides() != first_tensor.strides() ||
+      tensor.dtype() != dtype) {
+      return false;
+    }
+  }
+
+  // fast native stack should only be used when it is not worth using multiple threads
+  // or there is only one thread. Note that we aren't checking result.numel() here because
+  // it may not have been resized and we want to defer that cost till later.
+  int64_t numel_in_stack = first_tensor.numel() * tensors.size();
+  return numel_in_stack < at::internal::GRAIN_SIZE || at::get_num_threads() == 1;
+}
+
+template 
+struct CanUseNativeSerialStack;
+
+template 
+struct CanUseNativeSerialStack {
+  static bool call(Tensor& result, TensorListType tensors, int64_t dim) {
+    // Inputs cannot alias the output tensor
+    for (const auto i : c10::irange(tensors.size())) {
+      auto lap = at::get_overlap_status(result, tensors[i]);
+      TORCH_CHECK(lap != at::MemOverlapStatus::Partial &&
+          lap != at::MemOverlapStatus::Full, 0,
+          "unsupported operation: the input tensors cannot refer to any of the "
+          "output memory locations. Found overlap in input tensor ", i);
+    }
+
+    return can_use_native_serial_stack_impl(result, tensors, dim);
+  }
+};
+
+template 
+struct CanUseNativeSerialStack {
+  static bool call(Tensor& result, TensorListType tensors, int64_t dim) {
+    return can_use_native_serial_stack_impl(result, tensors, dim);
+  }
+};
+
+}}}  // namespace at::native::detail
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/SoftmaxKernel.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/SoftmaxKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..a5e1124e54f2aa299ce2cc45e370c599610d3bb4
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/SoftmaxKernel.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#include 
+#include 
+
+namespace at {
+class Tensor;
+
+namespace native {
+
+using forward_fn = void (*)(const Tensor&, const Tensor&);
+using backward_fn = void(*)(const Tensor &, const Tensor &, const Tensor&);
+
+DECLARE_DISPATCH(forward_fn, softmax_lastdim_kernel);
+DECLARE_DISPATCH(forward_fn, log_softmax_lastdim_kernel);
+DECLARE_DISPATCH(backward_fn, softmax_backward_lastdim_kernel);
+DECLARE_DISPATCH(backward_fn, log_softmax_backward_lastdim_kernel);
+
+using forward_fn_with_dim = void(*)(const Tensor &, const Tensor &, const int64_t);
+using backward_fn_with_dim =
+    void (*)(const Tensor&, const Tensor&, const Tensor&, const int64_t);
+
+DECLARE_DISPATCH(forward_fn_with_dim, softmax_kernel);
+DECLARE_DISPATCH(forward_fn_with_dim, log_softmax_kernel);
+DECLARE_DISPATCH(backward_fn_with_dim, softmax_backward_kernel);
+DECLARE_DISPATCH(backward_fn_with_dim, log_softmax_backward_kernel);
+}
+}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/SpmmReduceKernel.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/SpmmReduceKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..7ae6de525c371d14093ebfe8c19f4198f9eef921
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/SpmmReduceKernel.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+namespace at::native {
+
+using spmm_reduce_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, ReductionType op);
+using spmm_reduce_arg_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, ReductionType op);
+using spmm_reduce_backward_input_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, ReductionType op);
+using spmm_reduce_backward_input_arg_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, ReductionType op);
+using spmm_reduce_backward_other_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, ReductionType op);
+
+DECLARE_DISPATCH(spmm_reduce_fn, spmm_reduce_stub);
+DECLARE_DISPATCH(spmm_reduce_arg_fn, spmm_reduce_arg_stub);
+DECLARE_DISPATCH(spmm_reduce_backward_input_fn, spmm_reduce_backward_input_stub);
+DECLARE_DISPATCH(spmm_reduce_backward_input_arg_fn, spmm_reduce_backward_input_arg_stub);
+DECLARE_DISPATCH(spmm_reduce_backward_other_fn, spmm_reduce_backward_other_stub);
+DECLARE_DISPATCH(spmm_reduce_backward_input_arg_fn, spmm_reduce_backward_other_arg_stub);
+
+} // at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/StackKernel.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/StackKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..f8259c1f546bf6804407b1357618ef564a076609
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/StackKernel.h
@@ -0,0 +1,12 @@
+// Copyright 2004-present Facebook. All Rights Reserved.
+#pragma once
+
+#include 
+#include 
+
+namespace at { namespace native {
+
+using stack_serial_fn = void(*)(Tensor &, TensorList, int64_t);
+DECLARE_DISPATCH(stack_serial_fn, stack_serial_stub);
+
+}}  // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/UpSampleKernelAVXAntialias.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/UpSampleKernelAVXAntialias.h
new file mode 100644
index 0000000000000000000000000000000000000000..c5ee2344bc178f9736da0f353d02ecb543bfd598
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/UpSampleKernelAVXAntialias.h
@@ -0,0 +1,1376 @@
+/*
+The Python Imaging Library (PIL) is
+
+    Copyright © 1997-2011 by Secret Labs AB
+    Copyright © 1995-2011 by Fredrik Lundh
+
+Pillow is the friendly PIL fork. It is
+
+    Copyright © 2010-2022 by Alex Clark and contributors
+
+Like PIL, Pillow is licensed under the open source HPND License
+*/
+
+// This code is heavily inspired by PILLOW-SIMD's implementation:
+// https://github.com/uploadcare/pillow-simd/blob/simd/master/src/libImaging/Resample.c
+
+#pragma once
+#ifdef CPU_CAPABILITY_AVX2
+// TODO: This file only supports AVX2. We could split the AVX kernels into
+// smaller logical blocks in order to port them into the Vec.h logic. This would
+// allow us to support other vectorization architectures and perhaps also support
+// the non-vectorized fallback (we'd need to make sure it's not slower than the
+// current fallback).
+
+#include 
+#include 
+#include 
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include 
+#else
+#include 
+#endif
+
+
+namespace {
+
+static inline __m128i mm_cvtsi32_si128(const uint8_t* C10_RESTRICT ptr, bool i32_aligned) {
+  int32_t v;
+  if (i32_aligned) {
+    v = *(const int32_t*)ptr;
+  } else {
+    std::memcpy(&v, ptr, 4);
+  }
+  return _mm_cvtsi32_si128(v);
+}
+
+static inline __m128i mm_cvtepu8_epi32(const uint8_t* C10_RESTRICT ptr, bool i32_aligned) {
+  return _mm_cvtepu8_epi32(mm_cvtsi32_si128(ptr, i32_aligned));
+}
+
+static inline void _write_endline_rgb_as_uint32(
+    uint8_t* C10_RESTRICT output,
+    uint32_t data
+) {
+  // data is (R G B X), output is (X1 X2 X3 | R1 B1 G1 R2 ...)
+  // Here we explicitly set X as R1
+  uint8_t* data_ptr = reinterpret_cast(&data);
+  data_ptr[3] = output[3];
+  std::memcpy(output, data_ptr, 4);
+}
+
+at::Tensor unpack_rgb(const at::Tensor& packed_tensor) {
+  // Convert a "packed" tensor (typically RGBRGBRGB if channels_last) into
+  // RGBARGBARGBA format where A is hard-coded to 0. Each pixel is encoded
+  // as 32 bits. This generalizes to num_channels <= 4 and also works for
+  // non-channels_last tensors.
+
+  const uint8_t* packed = (const uint8_t*)packed_tensor.data_ptr();
+  auto num_pixels = packed_tensor.size(1) * packed_tensor.size(2);
+  auto num_channels = packed_tensor.size(0);
+
+  constexpr int rgba_size = 4;
+  auto unpacked_tensor = at::empty({rgba_size, packed_tensor.size(1), packed_tensor.size(2)}, at::CPU(at::kByte));
+  uint8_t* unpacked = (uint8_t*) unpacked_tensor.data_ptr();
+
+  auto stride_i = packed_tensor.stride(2);
+  auto stride_j = packed_tensor.stride(0);
+
+  for (const auto i : c10::irange(num_pixels)) {
+    for (const auto j : c10::irange(rgba_size)) {
+      unpacked[rgba_size * i + j] = (j < num_channels) ? packed[stride_i * i + stride_j * j] : 0;
+    }
+  }
+  return unpacked_tensor;
+}
+
+void pack_rgb(
+    const at::Tensor& unpacked_tensor, // IN
+    const at::Tensor& packed_tensor // OUT
+) {
+  // Convert an unpacked channels-last 3-channel or 4-channel tensor back into the original data layout.
+
+  uint8_t* unpacked = (uint8_t*)unpacked_tensor.data_ptr();
+  uint8_t* packed = (uint8_t*)packed_tensor.data_ptr();
+  auto num_pixels = packed_tensor.size(1) * packed_tensor.size(2);
+  auto num_channels = packed_tensor.size(0);
+
+  auto unpacked_increment = unpacked_tensor.size(0);
+  auto packed_increment = packed_tensor.stride(2);
+  auto packed_stride = packed_tensor.stride(0);
+
+  TORCH_INTERNAL_ASSERT(unpacked_increment == 3 || unpacked_increment == 4);
+
+  for (const auto i C10_UNUSED : c10::irange(num_pixels)) {
+    for (const auto j : c10::irange(num_channels)) {
+      packed[j * packed_stride] = unpacked[j];
+    }
+    unpacked += unpacked_increment;
+    packed += packed_increment;
+  }
+}
+
+void ImagingResampleHorizontalConvolution8u4x(
+    uint8_t* C10_RESTRICT lineOut0,
+    uint8_t* C10_RESTRICT lineOut1,
+    uint8_t* C10_RESTRICT lineOut2,
+    uint8_t* C10_RESTRICT lineOut3,
+    int64_t out_xsize,
+    const uint8_t* C10_RESTRICT lineIn0,
+    const uint8_t* C10_RESTRICT lineIn1,
+    const uint8_t* C10_RESTRICT lineIn2,
+    const uint8_t* C10_RESTRICT lineIn3,
+    int64_t in_xsize,
+    const int64_t* idx_ptr_xmin,
+    const int64_t* idx_ptr_size,
+    const int16_t* kk,
+    int kmax,
+    unsigned int coefs_precision,
+    int64_t num_channels,
+    bool is_last_line);
+
+void ImagingResampleHorizontalConvolution8u(
+    uint8_t* C10_RESTRICT lineOut,
+    int64_t out_xsize,
+    const uint8_t* C10_RESTRICT lineIn,
+    int64_t in_xsize,
+    const int64_t* idx_ptr_xmin,
+    const int64_t* idx_ptr_size,
+    const int16_t* kk,
+    int kmax,
+    unsigned int coefs_precision,
+    int64_t num_channels,
+    bool is_last_line);
+
+void ImagingResampleVerticalConvolution8u(
+    uint8_t* C10_RESTRICT lineOut,
+    const uint8_t* C10_RESTRICT lineIn,
+    int64_t xsize,
+    int64_t ids_min,
+    int64_t ids_size,
+    const int16_t* k,
+    unsigned int coefs_precision,
+    int64_t num_channels);
+
+template
+void ImagingResampleHorizontal(
+    const at::Tensor & unpacked_output,
+    const at::Tensor & unpacked_input,
+    int ksize,
+    const std::vector& horiz_indices_weights,
+    unsigned int horiz_weights_precision) {
+
+  // Interpolation horizontal pass: we compute x-axis (image width) interpolation outputs.
+
+  // Input data is stored as
+  //   input = [r[0], g[0], b[0], a[0], r[1], g[1], b[1], a[1], r[2], g[2], b[2], a[2], ...]
+  // Weights are float values computed for each output pixel and rescaled to uint16:
+  //   weights[i] = [w[i, 0], w[i, 1], ..., w[i, K-1]]
+  // We want to compute the output as following:
+  //   output = [oR[0], oG[0], oB[0], oA[0], oR[1], oG[1], oB[1], oA[1], ...]
+  // where
+  //   oR[yoffset + i] = r[yoffset + xmin[i]] * w[i, 0] + ... + r[yoffset + xmin[i] + K-1] * w[i, K-1]
+  //   oG[yoffset + i] = g[yoffset + xmin[i]] * w[i, 0] + ... + g[yoffset + xmin[i] + K-1] * w[i, K-1]
+  //   oB[yoffset + i] = b[yoffset + xmin[i]] * w[i, 0] + ... + b[yoffset + xmin[i] + K-1] * w[i, K-1]
+  //
+
+  // TODO: we may want to merge this into the fallback code (currently called
+  // basic_loop_aa_horizontal), although this may not be needed if / when we
+  // port all this code to use Vec.h, since that would potentially give us
+  // another fallback implementation.
+
+  const int16_t* kk = (int16_t*)(horiz_indices_weights[3].data_ptr());
+
+  auto xout = unpacked_output.size(2);
+  auto yout = unpacked_output.size(1);
+  auto xin = unpacked_input.size(2);
+  TORCH_INTERNAL_ASSERT(num_channels == unpacked_input.size(0));
+
+  const int64_t* idx_ptr_xmin = horiz_indices_weights[0].data_ptr();
+  const int64_t* idx_ptr_size = horiz_indices_weights[1].data_ptr();
+
+  uint8_t* unpacked_output_p = unpacked_output.data_ptr();
+  const uint8_t* unpacked_input_p = unpacked_input.data_ptr();
+
+  int64_t yy = 0;
+  auto xout_stride = xout * num_channels;
+  auto xin_stride = xin * num_channels;
+  for (; yy < yout - 3; yy += 4) {
+    ImagingResampleHorizontalConvolution8u4x(
+        unpacked_output_p + yy * xout_stride,
+        unpacked_output_p + (yy + 1) * xout_stride,
+        unpacked_output_p + (yy + 2) * xout_stride,
+        unpacked_output_p + (yy + 3) * xout_stride,
+        xout,
+        unpacked_input_p + yy * xin_stride,
+        unpacked_input_p + (yy + 1) * xin_stride,
+        unpacked_input_p + (yy + 2) * xin_stride,
+        unpacked_input_p + (yy + 3) * xin_stride,
+        xin,
+        idx_ptr_xmin,
+        idx_ptr_size,
+        kk,
+        ksize,
+        horiz_weights_precision,
+        num_channels,
+        yy + 3 == yout - 1);
+  }
+  for (; yy < yout; yy++) {
+    ImagingResampleHorizontalConvolution8u(
+        unpacked_output_p + yy * xout_stride,
+        xout,
+        unpacked_input_p + yy * xin_stride,
+        xin,
+        idx_ptr_xmin,
+        idx_ptr_size,
+        kk,
+        ksize,
+        horiz_weights_precision,
+        num_channels,
+        yy == yout - 1);
+  }
+}
+
+void ImagingResampleVertical(
+    const at::Tensor & unpacked_output,
+    const at::Tensor & unpacked_input,
+    int ksize,
+    const std::vector& vert_indices_weights,
+    unsigned int vert_weights_precision) {
+
+  // Interpolation vertical pass: we compute y-axis interpolation outputs.
+  // Input data is stored as
+  //   input = [r[0], g[0], b[0], a[0], r[1], g[1], b[1], a[1], r[2], g[2], b[2], a[2], ...]
+  // Weights are float values computed for each output pixel and rescaled to uint16:
+  //   weights[i] = [w[i, 0], w[i, 1], ..., w[i, K-1]]
+  // We want to compute the output as following:
+  //   output = [oR[0], oG[0], oB[0], oA[0], oR[1], oG[1], oB[1], oA[1], ...]
+  // where
+  //   oR[xoffset + i] = r[xoffset + ymin[i]] * w[i, 0] + ... + r[xoffset + ymin[i] + (K-1) * xsize] * w[i, K-1]
+  //   oG[xoffset + i] = g[xoffset + ymin[i]] * w[i, 0] + ... + g[xoffset + ymin[i] + (K-1) * xsize] * w[i, K-1]
+  //   oB[xoffset + i] = b[xoffset + ymin[i]] * w[i, 0] + ... + b[xoffset + ymin[i] + (K-1) * xsize] * w[i, K-1]
+
+  // TODO: we may want to merge this into the fallback code (currently called
+  // basic_loop_aa_vertical), although this may not be needed if / when we
+  // port all this code to use Vec.h, since that would potentially give us
+  // another fallback implementation.
+  const int16_t* kk = (int16_t*)(vert_indices_weights[3].data_ptr());
+
+  const int64_t* idx_ptr_xmin = vert_indices_weights[0].data_ptr();
+  const int64_t* idx_ptr_size = vert_indices_weights[1].data_ptr();
+
+  uint8_t* unpacked_output_p = unpacked_output.data_ptr();
+  const uint8_t* unpacked_input_p = unpacked_input.data_ptr();
+
+  auto xout = unpacked_output.size(2);
+  auto yout = unpacked_output.size(1);
+  const auto num_channels = unpacked_input.size(0);
+  TORCH_INTERNAL_ASSERT(num_channels == unpacked_output.size(0));
+
+  auto xout_stride = xout * num_channels;
+  for (const auto yy : c10::irange(yout)) {
+    const auto* k = &kk[yy * ksize];
+    auto ids_min = idx_ptr_xmin[yy];
+    auto ids_size = idx_ptr_size[yy];
+    ImagingResampleVerticalConvolution8u(
+        unpacked_output_p + yy * xout_stride,
+        unpacked_input_p,
+        xout,
+        ids_min,
+        ids_size,
+        k,
+        vert_weights_precision,
+        num_channels);
+  }
+}
+
+// This is the only public entry point in this file. It supports bilinear or bicubic
+// mode for uint8 dtype when C <= 4, with or without antialias. The
+// implementation is based on PIL-SIMD.
+// Its equivalent (fallback) implementation for when AVX isn't supported or when
+// C > 4 is separable_upsample_generic_Nd_kernel_impl(). There are a bunch of
+// future improvements that can be done: look for the TODOs in this file.
+// For details on how the weights are computed and how the multiplications are
+// run on int (instead of float weights), see
+// [ Weights computation for uint8_t and multiplication trick ]
+// For details on how the AVX kernels are implemented, see
+// https://gist.github.com/NicolasHug/47c97d731f05eaad5694c173849b86f5
+// See also [ Support for antialias=False as a subcase of antialias=True ] to
+// learn more about how the antialias=False case is computed. The same holds
+// here: all these kernels are general enough to handle an arbitrary number of
+// weights, but when aa=False they could be optimized further.
+template 
+void upsample_avx_bilinear_bicubic_uint8(
+    const at::Tensor& input_,
+    const at::Tensor& output,
+    bool align_corners,
+    const scale_type& scales,
+    bool antialias) {
+  auto batch_size = input_.size(0);
+  auto num_channels = input_.size(1);
+  auto xin = input_.size(3);
+  auto yin = input_.size(2);
+  auto xout = output.size(3);
+  auto yout = output.size(2);
+
+  if (xin == xout && yin == yout) {
+    output.copy_(input_);
+    return;
+  }
+
+  at::Tensor input = input_;
+  if (!(input.is_contiguous() || input.is_contiguous(at::MemoryFormat::ChannelsLast))) {
+    // If input is not contiguous with memory format channels first or channels last,
+    // we explicitly convert the input to contiguous channels last memory format.
+    // This simplifies the rest of the code and lets us assume that the format is either contiguous channels first or channels last.
+    // Most tensors going through this `if` block won't need to go through unpacking, but those having C < 3 may
+    // have to (this means 2 copies are made). We could avoid the extra copy by handling non-contiguous input
+    // directly within unpack_rgb() and pack_rgb(), but initial attempts showed that this is fairly complex.
+    input = input.contiguous(at::MemoryFormat::ChannelsLast);
+  }
+
+  auto need_horizontal = xout != xin;
+  auto need_vertical = yout != yin;
+
+  int ksize_horiz, ksize_vert;
+  std::vector horiz_indices_weights, vert_indices_weights;
+  unsigned int horiz_weights_precision, vert_weights_precision;
+
+  bool skip_unpacking = (num_channels == 3 || num_channels == 4) && input.is_contiguous(at::MemoryFormat::ChannelsLast);
+  bool skip_packing = (num_channels == 3 || num_channels == 4) && output.is_contiguous(at::MemoryFormat::ChannelsLast);
+
+  if (need_horizontal) {
+    int interp_dim = 3;
+    auto stride = (skip_unpacking) ? num_channels : 4;
+    std::tie(horiz_indices_weights, ksize_horiz, horiz_weights_precision) =
+        F::compute_index_ranges_int16_weights(
+            /*input_size=*/xin,
+            /*output_size=*/xout,
+            /*stride=*/stride,
+            /*ndims=*/4,
+            /*reshape_dim=*/interp_dim,
+            /*align_corners=*/align_corners,
+            /*opt_scale=*/scales[interp_dim - 2],
+            /*antialias=*/antialias,
+            /*align_i32=*/true);
+  }
+
+  if (need_vertical) {
+    int interp_dim = 2;
+    auto stride = (skip_unpacking) ? num_channels * xout : 4 * xout;
+    std::tie(vert_indices_weights, ksize_vert, vert_weights_precision) =
+        F::compute_index_ranges_int16_weights(
+            /*input_size=*/yin,
+            /*output_size=*/yout,
+            /*stride=*/stride,
+            /*ndims=*/4,
+            /*reshape_dim=*/interp_dim,
+            /*align_corners=*/align_corners,
+            /*opt_scale=*/scales[interp_dim - 2],
+            /*antialias=*/antialias,
+            /*align_i32=*/true);
+  }
+
+  at::Tensor buffer_horiz, buffer_vert;
+  // Minor optimization: we can avoid allocating an extra buffer if we're performing
+  // horizontal-only or vertical-only interpolation, and if the tensor doesn't
+  // need repacking
+  if (need_horizontal && (need_vertical || !skip_packing)) {
+    auto c = (skip_unpacking) ? num_channels : 4;
+    buffer_horiz = at::empty({c, yin, xout}, input.options());
+  }
+  if (need_vertical && !skip_packing) {
+    auto c = (skip_unpacking) ? num_channels : 4;
+    buffer_vert = at::empty({c, yout, xout}, input.options());
+  }
+
+  for (const auto i : c10::irange(batch_size)) {
+
+    at::Tensor unpacked_input = (skip_unpacking) ? input[i] : unpack_rgb(input[i]);
+    at::Tensor unpacked_output;
+
+    if (need_horizontal) {
+      at::Tensor unpacked_output_temp = (need_vertical || !skip_packing) ? buffer_horiz : output[i];
+
+      if (skip_unpacking && num_channels == 3) {
+        ImagingResampleHorizontal<3>(
+          unpacked_output_temp,
+          unpacked_input,
+          ksize_horiz,
+          horiz_indices_weights,
+          horiz_weights_precision);
+      } else {
+        ImagingResampleHorizontal<4>(
+            unpacked_output_temp,
+            unpacked_input,
+            ksize_horiz,
+            horiz_indices_weights,
+            horiz_weights_precision);
+      }
+      unpacked_output = unpacked_input = unpacked_output_temp;
+    }
+    if (need_vertical) {
+      unpacked_output = (skip_packing) ? output[i] : buffer_vert;
+
+      ImagingResampleVertical(
+          unpacked_output,
+          unpacked_input,
+          ksize_vert,
+          vert_indices_weights,
+          vert_weights_precision
+      );
+    }
+
+    TORCH_INTERNAL_ASSERT(unpacked_output.defined());
+
+    if (!skip_packing) {
+      pack_rgb(unpacked_output, output[i]);
+    }
+  }
+}
+
+void ImagingResampleHorizontalConvolution8u4x(
+    uint8_t* C10_RESTRICT lineOut0,
+    uint8_t* C10_RESTRICT lineOut1,
+    uint8_t* C10_RESTRICT lineOut2,
+    uint8_t* C10_RESTRICT lineOut3,
+    int64_t out_xsize,
+    const uint8_t* C10_RESTRICT lineIn0,
+    const uint8_t* C10_RESTRICT lineIn1,
+    const uint8_t* C10_RESTRICT lineIn2,
+    const uint8_t* C10_RESTRICT lineIn3,
+    int64_t in_xsize,
+    const int64_t* idx_ptr_xmin,
+    const int64_t* idx_ptr_size,
+    const int16_t* kk,
+    int kmax,
+    unsigned int coefs_precision,
+    int64_t num_channels,
+    bool is_last_line) {
+
+  // Interpolation horizontal pass processing 4 vertical lines together.
+  // - Input data format is RGBA or RGB with R,G,B,A being uint8. In case of RGBA
+  //   we can encode 4 values as a single uint32 value.
+  // - We split the size of weight vector for a given output index as a sum:
+  //   ids_size = num_blocks_4 * 4 + num_blocks_2 * 2 + num_blocks_1.
+  // - We load and process 4 weight values in a loop ("block 4"), then we process 2 weight values
+  // in another loop ("block 2"), and finally we process 1 weight value in the final loop ("block 1").
+
+  // Define shuffling masks (low/high) for num_channels 4 and 3
+  // Mask low casts lower half of each lane to epi16 and reorder RGBARGBA -> RRGGBBAA:
+  //   [r1 g1 b1 a1  r2 g2 b2 a2  ... | R1 G1 B1 A1  R2 G2 B2 A2 ... ] ->
+  //   [r1 0 r2 0  g1 0 g2 0  b1 0 b2 0  a1 0 a2 0 | R1 0 R2 0  G1 0 G2 0  B1 0 B2 0  A1 0 A2 0]
+  // Mask high casts upper half of each lane to epi16 and reorder RGBARGBA -> RRGGBBAA:
+  //   [ ... r3 g3 b3 a3  r4 g4 b4 a4 | ... R3 G3 B3 A3  R4 G4 B4 A4 ] ->
+  //   [r3 0 r4 0  g3 0 g4 0  b3 0 b4 0  a3 0 a4 0 | R3 0 R4 0  G3 0 G4 0  B3 0 B4 0  A3 0 A4 0]
+
+  const auto mask_low_c4 = _mm256_set_epi8(
+      -1, 7, -1, 3, -1, 6, -1, 2, -1, 5, -1, 1, -1, 4, -1, 0,
+      -1, 7, -1, 3, -1, 6, -1, 2, -1, 5, -1, 1, -1, 4, -1, 0);
+  const auto mask_high_c4 = _mm256_set_epi8(
+      -1, 15, -1, 11, -1, 14, -1, 10, -1, 13, -1, 9, -1, 12, -1, 8,
+      -1, 15, -1, 11, -1, 14, -1, 10, -1, 13, -1, 9, -1, 12, -1, 8);
+  const auto mask_low_c3 = _mm256_set_epi8(
+      -1, -1, -1, -1, -1, 5, -1, 2, -1, 4, -1, 1, -1, 3, -1, 0,
+      -1, -1, -1, -1, -1, 5, -1, 2, -1, 4, -1, 1, -1, 3, -1, 0);
+  const auto mask_high_c3 = _mm256_set_epi8(
+      -1, -1, -1, -1, -1, 11, -1, 8, -1, 10, -1, 7, -1, 9, -1, 6,
+      -1, -1, -1, -1, -1, 11, -1, 8, -1, 10, -1, 7, -1, 9, -1, 6);
+
+  const auto mask_low = (num_channels == 3) ? mask_low_c3 : mask_low_c4;
+  const auto mask_high = (num_channels == 3) ? mask_high_c3 : mask_high_c4;
+
+  const auto stride = num_channels * sizeof(uint8_t);
+
+  TORCH_INTERNAL_ASSERT(stride == 3 || stride == 4);
+
+  // out_xsize = output width, out_x = output x index
+  // ids_min is the input offset index corresponding to out_x
+  // ids_size is the interpolation size for out_x
+
+  // Let's precompute ids_size limits for block 4 and block 2.
+  //
+  // In block 4 (4 means we process 4 weight values together), we read input data
+  // with _mm_loadu_si128, i.e. 16 bytes, per one line:
+  // lineIn0 + stride * (i + ids_min) + 16 <= lineIn0 + stride * (ids_size + ids_min)
+  // --> i <= ids_size - 16.0 / stride
+  // Strict boundary:
+  // --> i < ids_size + 1 - int(ceil(16.0 / stride)) = ids_size - b4_delta
+  // Soft boundary for reading inside the buffer except its boundaries:
+  // --> i < ids_size + 1 - int(16.0 / stride) = ids_size - b4_delta_soft
+  // RGBA: b4_delta = b4_delta_soft = 3
+  // RGB : b4_delta = 5
+  // RGB : b4_delta_soft = 4
+  const auto b4_delta = (stride == 4) ? 3 : ((is_last_line) ? 5 : 4);
+
+  // In block 2 (2 means we process 2 weights values together), we read input data
+  // with _mm_loadl_epi64, i.e. 8 bytes, per one line:
+  // lineIn0 + stride * (i + ids_min) + 8 <= lineIn0 + stride * (ids_size + ids_min)
+  // --> i <= ids_size - 8.0 / stride
+  // Strict boundary:
+  // --> i < ids_size + 1 - int(ceil(8.0 / stride)) = ids_size - b2_delta
+  // Soft boundary for reading inside the buffer except its boundaries:
+  // --> i < ids_size + 1 - int(8.0 / stride) = ids_size - b2_delta_soft
+  // RGBA: b2_delta = b2_delta_soft = 1
+  // RGB : b2_delta = 2
+  // RGB : b2_delta_soft = 1
+  const auto b2_delta = (stride == 4) ? 1 : ((is_last_line) ? 2 : 1);
+
+  const auto max_out_x_strided = out_xsize * stride;
+  const auto max_in_x_strided = in_xsize * stride;
+
+  const auto zero = _mm256_setzero_si256();
+  const auto initial = _mm256_set1_epi32(1 << (coefs_precision - 1));
+
+  for (const auto out_x : c10::irange(out_xsize)) {
+    const auto ids_min = idx_ptr_xmin[out_x];
+    const auto ids_size = idx_ptr_size[out_x];
+    const auto * k = &kk[out_x * kmax];
+    int64_t i = 0;
+
+    auto sss0 = initial;
+    auto sss1 = initial;
+
+    const auto * lineIn0_min = lineIn0 + ids_min;
+    const auto * lineIn1_min = lineIn1 + ids_min;
+    const auto * lineIn2_min = lineIn2 + ids_min;
+    const auto * lineIn3_min = lineIn3 + ids_min;
+
+    // block 4
+    for (; i < ids_size - b4_delta; i += 4) {
+      // Load 4 values from weight vector
+      // mmk0 = [wl_0 wh_0 wl_1 wh_1  wl_0 wh_0 wl_1 wh_1  ...]
+      // mmk1 = [wl_2 wh_2 wl_3 wh_3  wl_2 wh_2 wl_3 wh_3  ...]
+      const auto mmk0 = _mm256_set1_epi32(*(int32_t*)&k[i]);
+      const auto mmk1 = _mm256_set1_epi32(*(int32_t*)&k[i + 2]);
+
+      // RGBA: Load 8 pixels (4 per line) from input lines 0 and 1:
+      // source = [
+      //   r0 g0 b0 a0  r1 g1 b1 a1  r2 g2 b2 a2  r3 g3 b3 a3
+      //   R0 G0 B0 A0  R1 G1 B1 A1  R2 G2 B2 A2  R3 G3 B3 A3
+      // ]
+      // RGB: Load 10 pixels (5 per line)
+      // source = [
+      //   r0 g0 b0 r1  g1 b1 r2 g2  b2 r3 g3 b3  r4 g4 b4 r5
+      //   R0 G0 B0 R1  G1 B1 R2 G2  B2 R3 G3 B3  R4 G4 B4 R5
+      // ]
+      auto source = _mm256_inserti128_si256(_mm256_castsi128_si256(
+          _mm_loadu_si128((__m128i *) (lineIn0_min + stride * i))),
+          _mm_loadu_si128((__m128i *) (lineIn1_min + stride * i)), 1);
+
+      // Apply mask_low:
+      // RGBA:
+      //   [r0 0 r1 0  g0 0 g1 0  b0 0 b1 0  a0 0 a1 0 | R0 0 R1 0  G0 0 G1 0  B0 0 B1 0  A0 0 A1 0]
+      // RGB:
+      //   [r0 0 r1 0  g0 0 g1 0  b0 0 b1 0  0 0 0 0 | R0 0 R1 0  G0 0 G1 0  B0 0 B1 0  0 0 0 0]
+      auto pix1 = _mm256_shuffle_epi8(source, mask_low);
+      // Compute output value as C += w0 * C0 + w1 * C1 for each channel in 32-bit precision
+      sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix1, mmk0));
+
+      // Apply mask_high:
+      // RGBA:
+      //   [r2 0 r3 0  g2 0 g3 0  b2 0 b3 0  a2 0 a3 0 | R2 0 R3 0  G2 0 G3 0  B2 0 B3 0  A2 0 A3 0]
+      // RGB:
+      //   [r2 0 r3 0  g2 0 g3 0  b2 0 b3 0  0 0 0 0 | R2 0 R3 0  G2 0 G3 0  B2 0 B3 0  0 0 0 0]
+      auto pix2 = _mm256_shuffle_epi8(source, mask_high);
+      // Compute output value as C += w2 * C2 + w3 * C3 for each channel in 32-bit precision
+      sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix2, mmk1));
+
+      // Same as above to next lines 2 and 3:
+      auto source2 = _mm256_inserti128_si256(_mm256_castsi128_si256(
+          _mm_loadu_si128((__m128i *) (lineIn2_min + stride * i))),
+          _mm_loadu_si128((__m128i *) (lineIn3_min + stride * i)), 1);
+      auto pix3 = _mm256_shuffle_epi8(source2, mask_low);
+      sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix3, mmk0));
+      auto pix4 = _mm256_shuffle_epi8(source2, mask_high);
+      sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix4, mmk1));
+    }
+
+    // block 2
+    for (; i < ids_size - b2_delta; i += 2) {
+      // Load 2 values from weight vector
+      // mmk = [wl_0 wh_0 wl_1 wh_1  wl_0 wh_0 wl_1 wh_1  ...]
+      const auto mmk = _mm256_set1_epi32(*(int32_t*)&k[i]);
+
+      // Load 4 pixels (2 per line) from input lines 0 and 1:
+      // RGBA: source1 = [
+      //   r0 g0 b0 a0  r1 g1 b1 a1  0 0 0 0  0 0 0 0
+      //   R0 G0 B0 A0  R1 G1 B1 A1  0 0 0 0  0 0 0 0
+      // ]
+      // RGB: source1 = [
+      //   r0 g0 b0 r1  g1 b1 r2  0 0 0 0  0 0 0 0
+      //   R0 G0 B0 R1  G1 B1 R2  0 0 0 0  0 0 0 0
+      // ]
+      auto source1 = _mm256_inserti128_si256(_mm256_castsi128_si256(
+          _mm_loadl_epi64((__m128i *) (lineIn0_min + stride * i))),
+          _mm_loadl_epi64((__m128i *) (lineIn1_min + stride * i)), 1);
+      // Apply mask_low:
+      // RGBA:
+      //   [r0 0 r1 0  g0 0 g1 0  b0 0 b1 0  a0 0 a1 0 | R0 0 R1 0  G0 0 G1 0  B0 0 B1 0  A0 0 A1 0]
+      // RGB:
+      //   [r0 0 r1 0  g0 0 g1 0  b0 0 b1 0  0 0 0 0 | R0 0 R1 0  G0 0 G1 0  B0 0 B1 0  0 0 0 0]
+      auto pix1 = _mm256_shuffle_epi8(source1, mask_low);
+      // Compute output value as C += w0 * C0 + w1 * C1 for each channel in 32-bit precision
+      sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix1, mmk));
+
+      // Same as above for lines 2 and 3:
+      auto source2 = _mm256_inserti128_si256(_mm256_castsi128_si256(
+          _mm_loadl_epi64((__m128i *) (lineIn2_min + stride * i))),
+          _mm_loadl_epi64((__m128i *) (lineIn3_min + stride * i)), 1);
+      auto pix2 = _mm256_shuffle_epi8(source2, mask_low);
+      sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix2, mmk));
+    }
+
+    // block 1
+    const auto i32_aligned = num_channels == 4;
+    for (; i < ids_size - 1; i++) {
+      // Load 1 value from weight vector
+      // mmk = [wl_0 wh_0 0 0  wl_0 wh_0 0 0  ...]
+      const auto mmk = _mm256_set1_epi32(k[i]);
+
+      // Load 2 pixels (one per line) from input lines 0 and 1:
+      // RGBA: pix1 = [
+      //   r0 0 0 0  g0 0 0 0  b0 0 0 0  a0 0 0 0
+      //   R0 0 0 0  G0 0 0 0  B0 0 0 0  A0 0 0 0
+      // ]
+      // RGB: pix1 = [
+      //   r0 0 0 0  g0 0 0 0  b0 0 0 0  r1 0 0 0
+      //   R0 0 0 0  G0 0 0 0  B0 0 0 0  R1 0 0 0
+      // ]
+      auto pix1 = _mm256_inserti128_si256(_mm256_castsi128_si256(
+          mm_cvtepu8_epi32(lineIn0_min + stride * i, i32_aligned)),
+          mm_cvtepu8_epi32(lineIn1_min + stride * i, i32_aligned), 1);
+      // Compute output value as C += w0 * C0 for each channel in 32-bit precision
+      sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix1, mmk));
+
+      // Same as above for lines 2 and 3
+      auto pix2 = _mm256_inserti128_si256(_mm256_castsi128_si256(
+          mm_cvtepu8_epi32(lineIn2_min + stride * i, i32_aligned)),
+          mm_cvtepu8_epi32(lineIn3_min + stride * i, i32_aligned), 1);
+      sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix2, mmk));
+    }
+
+    if (i == ids_size - 1) {
+      // last element
+      auto mmk = _mm256_set1_epi32(k[i]);
+      // For num_channels == 3 (3 bytes = one pixel) we tolerate reading 4 bytes;
+      // lines 0, 1 and 2 won't go out of the allocated memory bounds
+      auto pix = _mm256_inserti128_si256(_mm256_castsi128_si256(
+          mm_cvtepu8_epi32(lineIn0_min + stride * i, i32_aligned)),
+          mm_cvtepu8_epi32(lineIn1_min + stride * i, i32_aligned), 1);
+      sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk));
+
+      auto p0 = mm_cvtepu8_epi32(lineIn2_min + stride * i, i32_aligned);
+      __m128i p1;
+      if (num_channels == 3 && C10_UNLIKELY(is_last_line && ids_min + stride * i + 4 >= max_in_x_strided)) {
+        uint8_t input[4];
+        std::memcpy(input, lineIn3_min + stride * i, 3);
+        p1 = mm_cvtepu8_epi32(input, true);
+      } else {
+        p1 = mm_cvtepu8_epi32(lineIn3_min + stride * i, i32_aligned);
+      }
+      auto pix2 = _mm256_inserti128_si256(_mm256_castsi128_si256(p0), p1, 1);
+      sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix2, mmk));
+    }
+
+    // Convert fixed point values back to integers (truncating)
+    sss0 = _mm256_srai_epi32(sss0, coefs_precision);
+    sss1 = _mm256_srai_epi32(sss1, coefs_precision);
+    // Convert packed signed 32-bit integers to packed 16-bit integers using signed saturation
+    // (a a a a b b b b c c c c d d d d) -> (a a b b c c d d 0 0 0 0 0 0 0 0)
+    sss0 = _mm256_packs_epi32(sss0, zero);
+    sss1 = _mm256_packs_epi32(sss1, zero);
+    // Convert packed signed 16-bit integers to packed 8-bit integers using unsigned saturation
+    // (a a b b c c d d) -> (a b c d 0 0 0 0)
+    sss0 = _mm256_packus_epi16(sss0, zero);
+    sss1 = _mm256_packus_epi16(sss1, zero);
+
+    // Write the output into single uint32
+    // (a b c d) -> x_uint32
+    auto o0 = _mm_cvtsi128_si32(_mm256_castsi256_si128(sss0));
+    auto o1 = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss0, 1));
+    auto o2 = _mm_cvtsi128_si32(_mm256_castsi256_si128(sss1));
+    auto o3 = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss1, 1));
+
+    const auto out_x_strided = stride * out_x;
+
+    if (num_channels == 3 && C10_UNLIKELY(out_x_strided + 4 >= max_out_x_strided)) {
+      // Copying 4 bytes is faster than copying 3, and this is a boundary case where we want to write
+      // 4 bytes (R G B | X) to the output buffer (X1 X2 X3 | R1).
+      // The 4th byte in the register (X) holds a garbage value, while the 4th byte in the output buffer (R1)
+      // holds a correct value that was previously computed for another line. This means we cannot simply
+      // write 4 bytes from the register to the output. Instead, we do the following:
+      //               v----------|
+      // Output = [... X1 X2 X3 | R1 G1 B1 R2 ...]
+      // First, we write the R1 value into the 4th byte of the register: (R G B | X) -> (R G B | R1)
+      // Second, we write 4 bytes from the register to the output: (X1 X2 X3 | R1) -> (R G B | R1)
+      // Output = [... R G B | R1 G1 B1 R2 ...]
+
+      _write_endline_rgb_as_uint32(lineOut0 + out_x_strided, o0);
+      _write_endline_rgb_as_uint32(lineOut1 + out_x_strided, o1);
+      _write_endline_rgb_as_uint32(lineOut2 + out_x_strided, o2);
+
+      if (C10_UNLIKELY(is_last_line)) {
+        // When handling the last line, we cannot access the next 4 bytes
+        // as they are out of the memory bounds.
+        std::memcpy(lineOut3 + out_x_strided, (uint8_t *) &o3, num_channels);
+      } else {
+        _write_endline_rgb_as_uint32(lineOut3 + out_x_strided, o3);
+      }
+    } else if (num_channels == 3) {
+      // Copying 4 bytes is faster than copying 3, so here
+      // we simply write 4 bytes (... R G B X 0 0 0 0 0 ...) where X is a garbage value
+      // that we will overwrite on the next iteration: (... R G B R G B X 0 0 ...)
+      std::memcpy(lineOut0 + out_x_strided, (uint8_t *) &o0, 4);
+      std::memcpy(lineOut1 + out_x_strided, (uint8_t *) &o1, 4);
+      std::memcpy(lineOut2 + out_x_strided, (uint8_t *) &o2, 4);
+      std::memcpy(lineOut3 + out_x_strided, (uint8_t *) &o3, 4);
+    } else {
+      // num_channels = 4 -> lineOutX + out_x_strided should be uint32 aligned
+      *(uint32_t *)(lineOut0 + out_x_strided) = o0;
+      *(uint32_t *)(lineOut1 + out_x_strided) = o1;
+      *(uint32_t *)(lineOut2 + out_x_strided) = o2;
+      *(uint32_t *)(lineOut3 + out_x_strided) = o3;
+    }
+  }
+}
+
+void ImagingResampleHorizontalConvolution8u(
+    uint8_t* C10_RESTRICT lineOut,
+    int64_t out_xsize,
+    const uint8_t* C10_RESTRICT lineIn,
+    int64_t in_xsize,
+    const int64_t* idx_ptr_xmin,
+    const int64_t* idx_ptr_size,
+    const int16_t* kk,
+    int kmax,
+    unsigned int coefs_precision,
+    int64_t num_channels,
+    bool is_last_line) {
+
+  // Interpolation horizontal pass processing only one vertical line.
+  // - Input data format is RGBA or RGB with R,G,B,A being uint8. In case of RGBA
+  //   we can encode 4 values as a single uint32 value.
+  // - We split the size of weight vector for a given output index as a sum:
+  //   ids_size = num_blocks_8 * 8 + num_blocks_4 * 4 + num_blocks_2 * 2 + num_blocks_1
+  // - We load and process 8 weight values in a loop ("block 8"), then 4 and 2 weight values in
+  //   further loops ("block 4" and "block 2"), and finally we process 1 weight value in the last loop ("block 1").
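+  //
+  // Scalar sketch of what this routine computes per output pixel (illustrative only; the
+  // vectorized blocks below are equivalent, including the fixed-point rounding and saturation):
+  //   for (out_x = 0; out_x < out_xsize; ++out_x)
+  //     for (c = 0; c < num_channels; ++c) {
+  //       int acc = 1 << (coefs_precision - 1);
+  //       for (i = 0; i < ids_size; ++i)
+  //         acc += kk[out_x * kmax + i] * lineIn[idx_ptr_xmin[out_x] + i * num_channels + c];
+  //       lineOut[out_x * num_channels + c] = clamp(acc >> coefs_precision, 0, 255);
+  //     }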
+
+  // Define various shuffling masks
+  const auto kmask_low = _mm256_set_epi8(
+      11, 10, 9, 8, 11, 10, 9, 8, 11, 10, 9, 8, 11, 10, 9, 8,
+      3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0);
+  const auto kmask_high = _mm256_set_epi8(
+      15, 14, 13, 12, 15, 14, 13, 12, 15, 14, 13, 12, 15, 14, 13, 12,
+      7, 6, 5, 4, 7, 6, 5, 4, 7, 6, 5, 4, 7, 6, 5, 4);
+  const auto kmask_hl = _mm256_set_epi8(
+      7, 6, 5, 4, 7, 6, 5, 4, 7, 6, 5, 4, 7, 6, 5, 4,
+      3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0);
+
+  const auto mask_low_c4 = _mm256_set_epi8(
+      -1, 7, -1, 3, -1, 6, -1, 2, -1, 5, -1, 1, -1, 4, -1, 0,
+      -1, 7, -1, 3, -1, 6, -1, 2, -1, 5, -1, 1, -1, 4, -1, 0);
+  const auto mask_high_c4 = _mm256_set_epi8(
+      -1, 15, -1, 11, -1, 14, -1, 10, -1, 13, -1, 9, -1, 12, -1, 8,
+      -1, 15, -1, 11, -1, 14, -1, 10, -1, 13, -1, 9, -1, 12, -1, 8);
+  const auto mask_low_c3 = _mm256_set_epi8(
+      -1, -1, -1, -1, -1, 5, -1, 2, -1, 4, -1, 1, -1, 3, -1, 0,
+      -1, -1, -1, -1, -1, 5, -1, 2, -1, 4, -1, 1, -1, 3, -1, 0);
+  const auto mask_high_c3 = _mm256_set_epi8(
+      -1, -1, -1, -1, -1, 11, -1, 8, -1, 10, -1, 7, -1, 9, -1, 6,
+      -1, -1, -1, -1, -1, 11, -1, 8, -1, 10, -1, 7, -1, 9, -1, 6);
+  const auto mask_hl_c3 = _mm256_set_epi8(
+      -1, -1, -1, -1, -1, 11, -1, 8, -1, 10, -1, 7, -1, 9, -1, 6,
+      -1, -1, -1, -1, -1, 5, -1, 2, -1, 4, -1, 1, -1, 3, -1, 0);
+  const auto mask_hl_c4 = _mm256_set_epi8(
+      -1, 15, -1, 11, -1, 14, -1, 10, -1, 13, -1, 9, -1, 12, -1, 8,
+      -1, 7, -1, 3, -1, 6, -1, 2, -1, 5, -1, 1, -1, 4, -1, 0);
+
+  const auto mask_low128_c3 = _mm_set_epi8(
+      -1, -1, -1, -1, -1, 5, -1, 2, -1, 4, -1, 1, -1, 3, -1, 0);
+  const auto mask_low128_c4 = _mm_set_epi8(
+      -1, 7, -1, 3, -1, 6, -1, 2, -1, 5, -1, 1, -1, 4, -1, 0);
+
+  const auto mask_low = (num_channels == 3) ? mask_low_c3 : mask_low_c4;
+  const auto mask_high = (num_channels == 3) ? mask_high_c3 : mask_high_c4;
+  const auto mask_hl = (num_channels == 3) ? mask_hl_c3 : mask_hl_c4;
+  const auto mask_low128 = (num_channels == 3) ? mask_low128_c3 : mask_low128_c4;
+
+  // out_xsize = output width, out_x = output x index
+  // ids_min is the input offset index corresponding to out_x
+  // ids_size is the interpolation size for out_x
+
+  const auto stride = num_channels * sizeof(uint8_t);
+  const auto zero = _mm_setzero_si128();
+
+  TORCH_INTERNAL_ASSERT(stride == 3 || stride == 4);
+
+  // Let's precompute ids_size limits for block 8, block 4 and block 2
+  //
+  // In block 8 (8 means we process 8 weight values together), we read at
+  // most 32 bytes input data (16 + 16 bytes for RGBA and 12 + 16 bytes for RGB)
+  // lineIn + stride * (i + ids_min) + 32 <= lineIn + stride * (ids_size + ids_min)
+  // --> i <= ids_size - 32.0 / stride
+  // Strict boundary:
+  // --> i < ids_size + 1 - int(ceil(32.0 / stride)) = ids_size - b8_delta
+  // Soft boundary for reading inside the buffer except its boundaries:
+  // --> i < ids_size + 1 - int(32.0 / stride) = ids_size - b8_delta_soft
+  // RGBA: b8_delta = b8_delta_soft = 7
+  // RGB : b8_delta = 10
+  // RGB : b8_delta_soft = 9
+  const auto b8_delta = (stride == 4) ? 7 : ((is_last_line) ? 10 : 9);
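+  // Worked example (illustrative note, not in the original source): for RGB, stride = 3 and a
+  // block-8 iteration spans 32 bytes, so ceil(32 / 3) = 11 gives the strict bound
+  // i < ids_size + 1 - 11 = ids_size - 10 (b8_delta = 10), while int(32 / 3) = 10 gives the
+  // soft bound i < ids_size - 9 (b8_delta_soft = 9). For RGBA the two bounds coincide at 7.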
+
+  // In block 4 (4 means we process 4 weight values together), we read
+  // 16 bytes of input data.
+  // lineIn + stride * (i + ids_min) + 16 <= lineIn + stride * (ids_size + ids_min)
+  // --> i <= ids_size - 16.0 / stride
+  // Strict boundary:
+  // --> i < ids_size + 1 - int(ceil(16.0 / stride)) = ids_size - b4_delta
+  // Soft boundary for reading inside the buffer except its boundaries:
+  // --> i < ids_size + 1 - int(16.0 / stride) = ids_size - b4_delta_soft
+  // RGBA: b4_delta = b4_delta_soft = 3
+  // RGB : b4_delta = 5
+  // RGB : b4_delta_soft = 4
+  const auto b4_delta = (stride == 4) ? 3 : ((is_last_line) ? 5 : 4);
+
+  // In block 2 (2 means we process 2 weight values together), we read
+  // 8 bytes of input data.
+  // lineIn + stride * (i + ids_min) + 8 <= lineIn + stride * (ids_size + ids_min)
+  // --> i <= ids_size - 8.0 / stride
+  // Strict boundary:
+  // --> i < ids_size + 1 - int(ceil(8.0 / stride)) = ids_size - b2_delta
+  // Soft boundary for reading inside the buffer except its boundaries:
+  // --> i < ids_size + 1 - int(8.0 / stride) = ids_size - b2_delta_soft
+  // RGBA: b2_delta = b2_delta_soft = 1
+  // RGB : b2_delta = 2
+  // RGB : b2_delta_soft = 1
+  const auto b2_delta = (stride == 4) ? 1 : ((is_last_line) ? 2 : 1);
+
+  const auto max_out_x_strided = out_xsize * stride;
+  const auto max_in_x_strided = in_xsize * stride;
+
+  for (const auto out_x : c10::irange(out_xsize)) {
+    __m128i sss;
+    const auto ids_min = idx_ptr_xmin[out_x];
+    const auto ids_size = idx_ptr_size[out_x];
+    const auto * k = &kk[out_x * kmax];
+    int64_t i = 0;
+
+    const auto * lineIn_min = lineIn + ids_min;
+
+    if (ids_size < 8) {
+      sss = _mm_set1_epi32(1 << (coefs_precision - 1));
+    } else {
+      // The two 128-bit lanes are summed at the end, so seed each lane with half of the rounding bias
+      auto sss256 = _mm256_set1_epi32(1 << (coefs_precision - 2));
+
+      // block 8
+      for (; i < ids_size - b8_delta; i += 8) {
+        // Load 8 values from weight vector
+        auto tmp = _mm_loadu_si128((__m128i*)&k[i]);
+        // ksource = [
+        //    wl_0 wh_0 wl_1 wh_1  wl_2 wh_2 wl_3 wh_3  wl_4 wh_4 wl_5 wh_5  wl_6 wh_6 wl_7 wh_7
+        //    wl_0 wh_0 wl_1 wh_1  wl_2 wh_2 wl_3 wh_3  wl_4 wh_4 wl_5 wh_5  wl_6 wh_6 wl_7 wh_7
+        // ]
+        auto ksource = _mm256_insertf128_si256(_mm256_castsi128_si256(tmp), tmp, 1);
+
+        // RGBA: Load 8 pixels from input:
+        // source = [
+        //    r0 g0 b0 a0  r1 g1 b1 a1  r2 g2 b2 a2  r3 g3 b3 a3
+        //    r4 g4 b4 a4  r5 g5 b5 a5  r6 g6 b6 a6  r7 g7 b7 a7
+        // ]
+        // RGB: Load 10 pixels from input (however we can process only 8 pixels):
+        // source = [
+        //    r0 g0 b0 r1  g1 b1 r2 g2  b2 r3 g3 b3  r4 g4 b4 r5
+        //    r4 g4 b4 r5  g5 b5 r6 g6  b6 r7 g7 b7  r8 g8 b8 r9
+        // ]
+        auto source = _mm256_inserti128_si256(_mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *) (lineIn_min + stride * i))),
+            _mm_loadu_si128((__m128i *) (lineIn_min + stride * (i + 4))), 1);
+
+        // Extract lower part of each lane, cast to epi16 and reorder RGBARGBA -> RRGGBBAA
+        // RGBA: pix1 = [
+        //   r0 0 r1 0  g0 0 g1 0  b0 0 b1 0  a0 0 a1 0
+        //   r4 0 r5 0  g4 0 g5 0  b4 0 b5 0  a4 0 a5 0
+        // ]
+        // RGB: pix1 = [
+        //   r0 0 r1 0  g0 0 g1 0  b0 0 b1 0  0 0 0 0
+        //   r4 0 r5 0  g4 0 g5 0  b4 0 b5 0  0 0 0 0
+        // ]
+        auto pix1 = _mm256_shuffle_epi8(source, mask_low);
+        // mmk1 = [
+        //   wl_0 wh_0 wl_1 wh_1  wl_0 wh_0 wl_1 wh_1  ...  ...
+        //   wl_4 wh_4 wl_5 wh_5  wl_4 wh_4 wl_5 wh_5  ...  ...
+        // ]
+        auto mmk1 = _mm256_shuffle_epi8(ksource, kmask_low);
+        // Compute output value as
+        //   C += w0 * C0 + w1 * C1
+        //   C += w4 * C4 + w5 * C5 for each channel in 32-bit precision
+        sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix1, mmk1));
+
+        // Same as above for higher part of each lane
+        auto pix2 = _mm256_shuffle_epi8(source, mask_high);
+        auto mmk2 = _mm256_shuffle_epi8(ksource, kmask_high);
+        // Compute output value as
+        //    C += w2 * C2 + w3 * C3
+        //    C += w6 * C6 + w7 * C7 for each channel in 32-bit precision
+        sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix2, mmk2));
+      }
+
+      // block 4
+      for (; i < ids_size - b4_delta; i += 4) {
+        // Load 4 values from weight vector
+        auto tmp = _mm_loadl_epi64((__m128i *) &k[i]);
+        // ksource = [
+        //    wl_0 wh_0 wl_1 wh_1  wl_2 wh_2 wl_3 wh_3  0 0 0 0  0 0 0 0
+        //    wl_0 wh_0 wl_1 wh_1  wl_2 wh_2 wl_3 wh_3  0 0 0 0  0 0 0 0
+        // ]
+        auto ksource = _mm256_insertf128_si256(_mm256_castsi128_si256(tmp), tmp, 1);
+
+        // Load pixels from input line
+        tmp = _mm_loadu_si128((__m128i *) (lineIn_min + stride * i));
+        // RGBA: source = [
+        //   r0 g0 b0 a0  r1 g1 b1 a1  r2 g2 b2 a2  r3 g3 b3 a3
+        //   r0 g0 b0 a0  r1 g1 b1 a1  r2 g2 b2 a2  r3 g3 b3 a3
+        // ]
+        // RGB: source = [
+        //   r0 g0 b0 r1  g1 b1 r2 g2  b2 r3 g3 b3  r4 g4 b4 r5
+        //   r0 g0 b0 r1  g1 b1 r2 g2  b2 r3 g3 b3  r4 g4 b4 r5
+        // ]
+        auto source = _mm256_insertf128_si256(_mm256_castsi128_si256(tmp), tmp, 1);
+
+        // Cast source to epi16 and reorder RGBARGBA -> RRGGBBAA
+        // RGBA: pix = [
+        //   r0 0 r1 0  g0 0 g1 0  b0 0 b1 0  a0 0 a1 0
+        //   r2 0 r3 0  g2 0 g3 0  b2 0 b3 0  a2 0 a3 0
+        // ]
+        // RGB: pix = [
+        //   r0 0 r1 0  g0 0 g1 0  b0 0 b1 0  0 0 0 0
+        //   r2 0 r3 0  g2 0 g3 0  b2 0 b3 0  0 0 0 0
+        // ]
+        auto pix = _mm256_shuffle_epi8(source, mask_hl);
+        // mmk = [
+        //   wl_0 wh_0 wl_1 wh_1  wl_0 wh_0 wl_1 wh_1  ... ...
+        //   wl_2 wh_2 wl_3 wh_3  wl_2 wh_2 wl_3 wh_3  ... ...
+        // ]
+        auto mmk = _mm256_shuffle_epi8(ksource, kmask_hl);
+        // Compute output value as
+        //   C += w0 * C0 + w1 * C1
+        //   C += w2 * C2 + w3 * C3 for each channel in 32-bit precision
+        sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix, mmk));
+      }
+
+      // Sum results between the lanes
+      sss = _mm_add_epi32(
+          _mm256_extracti128_si256(sss256, 0),
+          _mm256_extracti128_si256(sss256, 1));
+    }
+
+    // block 2
+    for (; i < ids_size - b2_delta; i += 2) {
+      // Load 2 values from weight vector
+      // mmk = [wl_0 wh_0 wl_1 wh_1  wl_0 wh_0 wl_1 wh_1  ...]
+      auto mmk = _mm_set1_epi32(*(int32_t*)&k[i]);
+      // Load pixels from input line
+      // RGBA: source = [
+      //   r0 g0 b0 a0  r1 g1 b1 a1  0 0 0 0  0 0 0 0
+      // ]
+      // RGB: source = [
+      //   r0 g0 b0 r1  g1 b1 r2 g2  0 0 0 0  0 0 0 0
+      // ]
+      auto source = _mm_loadl_epi64((__m128i *) (lineIn_min + stride * i));
+      // Cast source to epi16 and reorder RGBARGBA -> RRGGBBAA
+      auto pix = _mm_shuffle_epi8(source, mask_low128);
+      // Compute output value as C += w0 * C0 + w1 * C1 for each channel in 32-bit precision
+      sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
+    }
+
+    // block 1
+    const auto i32_aligned = num_channels == 4;
+    for (; i < ids_size - 1; i++) {
+      // Load 1 value from weight vector
+      // mmk = [wl_0 wh_0 0 0  wl_0 wh_0 0 0  ...]
+      auto mmk = _mm_set1_epi32(k[i]);
+      // Load one pixel from input line
+      // RGBA: pix = [
+      //   r0 0 0 0  g0 0 0 0  b0 0 0 0  a0 0 0 0
+      // ]
+      // RGB: pix = [
+      //   r0 0 0 0  g0 0 0 0  b0 0 0 0  r1 0 0 0
+      // ]
+      auto pix = mm_cvtepu8_epi32(lineIn_min + stride * i, i32_aligned);
+      // Compute output value as C += w0 * C0 for each channel in 32-bit precision
+      sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
+    }
+
+    if (i == ids_size - 1) {
+      // last element
+      auto mmk = _mm_set1_epi32(k[i]);
+      __m128i pix;
+      auto p = lineIn_min + stride * i;
+      if (num_channels == 3 && C10_UNLIKELY(is_last_line && ids_min + stride * i + 4 >= max_in_x_strided)) {
+        uint8_t input[4];
+        std::memcpy(input, p, 3);
+        pix = mm_cvtepu8_epi32(input, true);
+      } else {
+        pix = mm_cvtepu8_epi32(p, i32_aligned);
+      }
+      sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
+    }
+
+    // Convert fixed point values back to integers (truncating)
+    sss = _mm_srai_epi32(sss, coefs_precision);
+    // Convert packed signed 32-bit integers to packed 16-bit integers using signed saturation
+    // (a a a a b b b b c c c c d d d d) -> (a a b b c c d d 0 0 0 0 0 0 0 0)
+    sss = _mm_packs_epi32(sss, zero);
+    // Convert packed signed 16-bit integers to packed 8-bit integers using unsigned saturation
+    // (a a b b c c d d) -> (a b c d 0 0 0 0)
+    sss = _mm_packus_epi16(sss, zero);
+    // Write the output into single uint32
+    // (a b c d) -> x_uint32
+    auto o = _mm_cvtsi128_si32(sss);
+    const auto out_x_strided = stride * out_x;
+    if (num_channels == 3 && C10_UNLIKELY(out_x_strided + 4 >= max_out_x_strided)) {
+      if (C10_UNLIKELY(is_last_line)) {
+        // When handling the last line, we cannot access the next 4 bytes
+        // as they are out of the memory bounds.
+        std::memcpy(lineOut + out_x_strided, (uint8_t *) &o, 3);
+      } else {
+        // Copying 4 bytes is faster than copying 3, and this is a boundary case where we want to write
+        // 4 bytes (R G B | X) to the output buffer (X1 X2 X3 | R1).
+        // The 4th byte in the register (X) holds a garbage value, while the 4th byte in the output buffer (R1)
+        // holds a correct value that was previously computed for another line. This means we cannot simply
+        // write 4 bytes from the register to the output. Instead, we do the following:
+        //               v----------|
+        // Output = [... X1 X2 X3 | R1 G1 B1 R2 ...]
+        // First, we write the R1 value into the 4th byte of the register: (R G B | X) -> (R G B | R1)
+        // Second, we write 4 bytes from the register to the output: (X1 X2 X3 | R1) -> (R G B | R1)
+        // Output = [... R G B | R1 G1 B1 R2 ...]
+        _write_endline_rgb_as_uint32(lineOut + out_x_strided, o);
+      }
+    } else if (num_channels == 3) {
+      // Copying 4 bytes is faster than copying 3, so here
+      // we simply write 4 bytes (... R G B X 0 0 0 0 0 ...) where X is a garbage value
+      // that we will overwrite on the next iteration: (... R G B R G B X 0 0 ...)
+      std::memcpy(lineOut + out_x_strided, (uint8_t *) &o, 4);
+    } else {
+      // num_channels = 4 -> lineOut + out_x_strided should be uint32 aligned
+      *(uint32_t *)(lineOut + out_x_strided) = o;
+    }
+  }
+}
+
+void ImagingResampleVerticalConvolution8u(
+    uint8_t* C10_RESTRICT lineOut,
+    const uint8_t* C10_RESTRICT lineIn,
+    int64_t xsize,
+    int64_t ids_min,
+    int64_t ids_size,
+    const int16_t* k,
+    unsigned int coefs_precision,
+    int64_t num_channels) {
+
+  // Interpolation vertical pass processing one output line.
+  // - We process the x-axis data in SIMD blocks of 32, 8 and 4 bytes at a time.
+  // - We split the weight vector size for a given output index as a sum: ids_size = n * 2 + m.
+
+  // xsize = output width, which also equals the input width
+  // ids_size = interpolation size
+  // ids_min = input y start index
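+  //
+  // Scalar sketch of the vertical pass (illustrative only): every output byte j of this line is
+  //   lineOut[j] = clamp(((1 << (coefs_precision - 1))
+  //                       + sum_i k[i] * lineIn[ids_min + j + i * xsize * num_channels]) >> coefs_precision,
+  //                      0, 255);
+  // the SIMD blocks below just evaluate this for many values of j at once (32, 8 or 4 bytes per iteration).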
+  const auto stride = num_channels * sizeof(uint8_t);
+
+  TORCH_INTERNAL_ASSERT(stride == 3 || stride == 4);
+
+  const int64_t data_size = xsize * stride;
+  const int64_t data_stride = stride;
+  constexpr auto vec_size = 256 / 8;
+
+  const auto initial = _mm_set1_epi32(1 << (coefs_precision - 1));
+  const auto initial_256 = _mm256_set1_epi32(1 << (coefs_precision - 1));
+  const auto zero = _mm_setzero_si128();
+  const auto zero_256 = _mm256_setzero_si256();
+
+  int64_t j = 0;
+  // block 8
+  const auto b8_usable_vec_stride = (vec_size / data_stride) * data_stride;
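+  // Illustrative note (not in the original source): for RGB, data_stride = 3 and vec_size = 32,
+  // so b8_usable_vec_stride = (32 / 3) * 3 = 30; advancing j by 30 bytes keeps every 32-byte
+  // load and store starting on a pixel boundary. For RGBA it is simply 32.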
+  for (; j < data_size - vec_size; j += b8_usable_vec_stride) {
+    auto sss0 = initial_256;
+    auto sss1 = initial_256;
+    auto sss2 = initial_256;
+    auto sss3 = initial_256;
+    int64_t i = 0;
+    const auto * lineIn_min = lineIn + j + ids_min;
+
+    for (; i < ids_size - 1; i += 2) {
+      // Load 2 values from weight vector
+      auto mmk = _mm256_set1_epi32(*(int32_t*)&k[i]);
+
+      // RGBA: Load 8 pixels per line
+      // source1 = [
+      //    r0 g0 b0 a0  r1 g1 b1 a1  r2 g2 b2 a2  r3 g3 b3 a3
+      //    r4 g4 b4 a4  r5 g5 b5 a5  r6 g6 b6 a6  r7 g7 b7 a7
+      // ]
+      // RGB: Load 10 pixels per line (however we can process only 8 pixels):
+      // source1 = [
+      //    r0 g0 b0 r1  g1 b1 r2 g2  b2 r3 g3 b3  r4 g4 b4 r5
+      //    r4 g4 b4 r5  g5 b5 r6 g6  b6 r7 g7 b7  r8 g8 b8 r9
+      // ]
+      auto source1 =
+          _mm256_loadu_si256((__m256i*)(lineIn_min + data_size * i));
+      auto source2 =
+          _mm256_loadu_si256((__m256i*)(lineIn_min + data_size * (i + 1)));
+
+      // Interleave source1 and source2 from the low half of each 128-bit lane
+      // and cast the result to epi16
+      // RGBA: pix1 = [
+      //    r0 0 R0 0  g0 0 G0 0  b0 0 B0 0  a0 0 A0 0
+      //    r1 0 R1 0  g1 0 G1 0  b1 0 B1 0  a1 0 A1 0
+      // ]
+      // RGB: pix1 = [
+      //    r0 0 R0 0  g0 0 G0 0  b0 0 B0 0  0 0 0 0
+      //    r1 0 R1 0  g1 0 G1 0  b1 0 B1 0  0 0 0 0
+      // ]
+      auto source_lo = _mm256_unpacklo_epi8(source1, source2);
+      auto pix1 = _mm256_unpacklo_epi8(source_lo, zero_256);
+      // Compute output value as
+      //   C += w0 * c0 + w1 * C0
+      //   C += w0 * c1 + w1 * C1 for each channel in 32-bit precision
+      sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix1, mmk));
+
+      // RGBA: pix2 = [
+      //    r2 0 R2 0  g2 0 G2 0  b2 0 B2 0  a2 0 A2 0
+      //    r3 0 R3 0  g3 0 G3 0  b3 0 B3 0  a3 0 A3 0
+      // ]
+      // RGB: pix2 = [
+      //    r2 0 R2 0  g2 0 G2 0  b2 0 B2 0  0 0 0 0
+      //    r3 0 R3 0  g3 0 G3 0  b3 0 B3 0  0 0 0 0
+      // ]
+      auto pix2 = _mm256_unpackhi_epi8(source_lo, zero_256);
+      // Compute output value as
+      //   C += w0 * c2 + w1 * C2
+      //   C += w0 * c3 + w1 * C3 for each channel in 32-bit precision
+      sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix2, mmk));
+
+      // Same as above for the high half of each 128-bit lane
+      auto source_hi = _mm256_unpackhi_epi8(source1, source2);
+      auto pix3 = _mm256_unpacklo_epi8(source_hi, zero_256);
+      sss2 = _mm256_add_epi32(sss2, _mm256_madd_epi16(pix3, mmk));
+      auto pix4 = _mm256_unpackhi_epi8(source_hi, zero_256);
+      sss3 = _mm256_add_epi32(sss3, _mm256_madd_epi16(pix4, mmk));
+    }
+    // Same processing as above but with a single weight value
+    for (; i < ids_size; i += 1) {
+      auto mmk = _mm256_set1_epi32(k[i]);
+
+      auto source1 = _mm256_loadu_si256((__m256i*)(lineIn_min + i * data_size));
+
+      auto source_lo = _mm256_unpacklo_epi8(source1, zero_256);
+      auto pix1 = _mm256_unpacklo_epi8(source_lo, zero_256);
+      sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix1, mmk));
+      auto pix2 = _mm256_unpackhi_epi8(source_lo, zero_256);
+      sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix2, mmk));
+
+      auto source_hi = _mm256_unpackhi_epi8(source1, zero_256);
+      auto pix3 = _mm256_unpacklo_epi8(source_hi, _mm256_setzero_si256());
+      sss2 = _mm256_add_epi32(sss2, _mm256_madd_epi16(pix3, mmk));
+      auto pix4 = _mm256_unpackhi_epi8(source_hi, _mm256_setzero_si256());
+      sss3 = _mm256_add_epi32(sss3, _mm256_madd_epi16(pix4, mmk));
+    }
+    // Convert fixed point values back to integers (truncating)
+    sss0 = _mm256_srai_epi32(sss0, coefs_precision);
+    sss1 = _mm256_srai_epi32(sss1, coefs_precision);
+    sss2 = _mm256_srai_epi32(sss2, coefs_precision);
+    sss3 = _mm256_srai_epi32(sss3, coefs_precision);
+    // Convert packed signed 32-bit integers to packed 16-bit integers using signed saturation
+    // (a a a a b b b b c c c c d d d d) -> (a a b b c c d d)
+    sss0 = _mm256_packs_epi32(sss0, sss1);
+    sss2 = _mm256_packs_epi32(sss2, sss3);
+    // Convert packed signed 16-bit integers to packed 8-bit integers using unsigned saturation
+    // (a a b b c c d d) -> (a b c d)
+    sss0 = _mm256_packus_epi16(sss0, sss2);
+
+    // Stores 32 bytes
+    _mm256_storeu_si256((__m256i*)(lineOut + j), sss0);
+  }
+
+  // TODO: Do we also need block 4 ???
+  // block 2
+  const auto b2_usable_vec_stride = (8 / data_stride) * data_stride;
+  for (; j < data_size - vec_size / 4; j += b2_usable_vec_stride) {
+    auto sss0 = initial;
+    auto sss1 = initial;
+    int64_t i = 0;
+    const auto * lineIn_min = lineIn + j + ids_min;
+
+    for (; i < ids_size - 1; i += 2) {
+      // Load 2 values from weight vector
+      // mmk = [wl_0 wh_0 wl_1 wh_1  wl_0 wh_0 wl_1 wh_1  ... ]
+      auto mmk = _mm_set1_epi32(*(int32_t*)&k[i]);
+
+      // Load 2 pixels per line
+      // RGBA: source1 = [
+      //    r0 g0 b0 a0  r1 g1 b1 a1  0 0 0 0  0 0 0 0
+      // ]
+      // RGB: source1 = [
+      //    r0 g0 b0 r1  g1 b1 r2 g2  0 0 0 0  0 0 0 0
+      // ]
+      auto source1 = _mm_loadl_epi64((__m128i *) (lineIn_min + i * data_size));
+      auto source2 = _mm_loadl_epi64((__m128i *) (lineIn_min + (i + 1) * data_size));
+      // Interleave source1 and source2 and cast the result to epi16
+      // RGBA: pix = [
+      //    r0 0 R0 0  g0 0 G0 0  b0 0 B0 0  a0 0 A0 0
+      // ]
+      // RGB: pix = [
+      //    r0 0 R0 0  g0 0 G0 0  b0 0 B0 0  0 0 0 0
+      // ]
+      auto source = _mm_unpacklo_epi8(source1, source2);
+      auto pix = _mm_unpacklo_epi8(source, zero);
+      // Compute output value as C += w0 * c0 + w1 * C0 for each channel in 32-bit precision
+      sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk));
+      // RGBA: pix = [
+      //    r1 0 R1 0  g1 0 G1 0  b1 0 B1 0  a1 0 A1 0
+      // ]
+      // RGB: pix = [
+      //    r1 0 R1 0  g1 0 G1 0  b1 0 B1 0  0 0 0 0
+      // ]
+      pix = _mm_unpackhi_epi8(source, zero);
+      // Compute output value as C += w0 * c1 + w1 * C1 for each channel in 32-bit precision
+      sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk));
+    }
+    // Same processing as above but with a single weight value
+    for (; i < ids_size; i += 1) {
+      auto mmk = _mm_set1_epi32(k[i]);
+
+      auto source1 = _mm_loadl_epi64((__m128i*) (lineIn_min + i * data_size));
+
+      auto source = _mm_unpacklo_epi8(source1, zero);
+      auto pix1 = _mm_unpacklo_epi8(source, zero);
+      sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix1, mmk));
+      auto pix2 = _mm_unpackhi_epi8(source, zero);
+      sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix2, mmk));
+    }
+    // Convert fixed point values back to integers (truncating)
+    sss0 = _mm_srai_epi32(sss0, coefs_precision);
+    sss1 = _mm_srai_epi32(sss1, coefs_precision);
+    // Convert packed signed 32-bit integers to packed 16-bit integers using signed saturation
+    // (a a a a b b b b c c c c d d d d) -> (a a b b c c d d)
+    sss0 = _mm_packs_epi32(sss0, sss1);
+    // Convert packed signed 16-bit integers to packed 8-bit integers using unsigned saturation
+    // (a a b b c c d d) -> (a b c d)
+    sss0 = _mm_packus_epi16(sss0, sss0);
+    // Store 2 pixels to the output
+    _mm_storel_epi64((__m128i*)(lineOut + j), sss0);
+  }
+
+  // block 1
+  const auto b1_usable_vec_stride = (4 / data_stride) * data_stride;
+  const auto i32_aligned = num_channels == 4;
+  for (; j < data_size - 4; j += b1_usable_vec_stride) {
+    auto sss = initial;
+    int64_t i = 0;
+    const auto * lineIn_min = lineIn + j + ids_min;
+
+    for (; i < ids_size - 1; i += 2) {
+      // Load 2 values from weight vector
+      // mmk = [wl_0 wh_0 wl_1 wh_1  wl_0 wh_0 wl_1 wh_1  ... ]
+      auto mmk = _mm_set1_epi32(*(int32_t*)&k[i]);
+
+      // Load one pixel per line
+      // RGBA: source1 = [
+      //    r0 g0 b0 a0  0 0 0 0  0 0 0 0  0 0 0 0
+      // ]
+      // RGB: source1 = [
+      //    r0 g0 b0 r1  0 0 0 0  0 0 0 0  0 0 0 0
+      // ]
+      auto source1 = mm_cvtsi32_si128(lineIn_min + i * data_size, i32_aligned);
+      auto source2 = mm_cvtsi32_si128(lineIn_min + (i + 1) * data_size, i32_aligned);
+
+      // Interleave source1 and source2 and cast the result to epi16
+      // RGBA: pix = [
+      //    r0 0 R0 0  g0 0 G0 0  b0 0 B0 0  a0 0 A0 0
+      // ]
+      // RGB: pix = [
+      //    r0 0 R0 0  g0 0 G0 0  b0 0 B0 0  0 0 0 0
+      // ]
+      auto source = _mm_unpacklo_epi8(source1, source2);
+      auto pix = _mm_unpacklo_epi8(source, zero);
+      // Compute output value as C += w0 * c0 + w1 * C0 for each channel in 32-bit precision
+      sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
+    }
+
+    for (; i < ids_size; i++) {
+      auto mmk = _mm_set1_epi32(k[i]);
+      auto pix = mm_cvtepu8_epi32(lineIn_min + i * data_size, i32_aligned);
+      sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
+    }
+    sss = _mm_srai_epi32(sss, coefs_precision);
+    sss = _mm_packs_epi32(sss, zero);
+    sss = _mm_packus_epi16(sss, zero);
+
+    auto o = _mm_cvtsi128_si32(sss);
+
+    // Here we write 4 bytes to the output even if num_channels < 4, e.g. o = {r,g,b,X} for num_channels=3.
+    // It is OK to write the 4th byte (X) as it will be overwritten with new data on the next step.
+    // We also won't go out of bounds of the lineOut memory allocation.
+    std::memcpy(lineOut + j, (uint8_t *) &o, 4);
+  }
+
+  for (; j < data_size; j += data_stride) {
+    auto sss = initial;
+    int64_t i = 0;
+    const auto * lineIn_min = lineIn + j + ids_min;
+    // For RGBA we could use (ids_size - 1) as a tighter limit, but for RGB we could then read outside
+    // the memory boundary on the last remaining line
+    for (; i < ids_size - 2; i += 2) {
+      // Load two coefficients at once
+      auto mmk = _mm_set1_epi32(*(int32_t*)&k[i]);
+
+      // Load 2 lines
+      auto source1 = mm_cvtsi32_si128(lineIn_min + i * data_size, i32_aligned);
+      auto source2 = mm_cvtsi32_si128(lineIn_min + (i + 1) * data_size, i32_aligned);
+
+      auto source = _mm_unpacklo_epi8(source1, source2);
+      auto pix = _mm_unpacklo_epi8(source, zero);
+      sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
+    }
+
+    // Same processing as above but with a single weight value
+    for (; i < ids_size; i++) {
+      auto mmk = _mm_set1_epi32(k[i]);
+
+      const uint8_t * p = lineIn_min + i * data_size;
+      __m128i pix;
+      // There is not much performance gain in using a more detailed condition like
+      //   num_channels == 3 && ids_min + j + data_size * i + 4 >= in_max_size
+      // where const int64_t in_max_size = data_size * in_ysize;
+      if (num_channels == 3) {
+        uint8_t input[4];
+        std::memcpy(input, p, 3);
+        pix = mm_cvtepu8_epi32(input, true);
+      } else {
+        pix = mm_cvtepu8_epi32(p, true);
+      }
+      sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk));
+    }
+
+    // Convert fixed point values back to integers (truncating)
+    sss = _mm_srai_epi32(sss, coefs_precision);
+    // Convert packed signed 32-bit integers to packed 16-bit integers using signed saturation
+    // (a a a a b b b b c c c c d d d d) -> (a a b b c c d d)
+    sss = _mm_packs_epi32(sss, zero);
+    // Convert packed signed 16-bit integers to packed 8-bit integers using unsigned saturation
+    // (a a b b c c d d) -> (a b c d)
+    sss = _mm_packus_epi16(sss, zero);
+    // Store one pixel to the output
+    auto o = _mm_cvtsi128_si32(sss);
+    if (num_channels == 3 && C10_UNLIKELY(j + 4 >= data_size)) {
+      std::memcpy(lineOut + j, (uint8_t *) &o, 3);
+    } else {
+      std::memcpy(lineOut + j, (uint8_t *) &o, 4);
+    }
+  }
+}
+
+} // anonymous namespace
+#endif // CPU_CAPABILITY_AVX2
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/WeightNormKernel.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/WeightNormKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..106b068d198989f5a4d71a9fe20c79f1b5a5d915
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/WeightNormKernel.h
@@ -0,0 +1,20 @@
+#pragma once
+#include <ATen/native/DispatchStub.h>
+#include <cstdint>
+
+namespace at {
+class TensorBase;
+}
+
+namespace at { namespace native {
+
+using weight_norm_fn = void(*)(
+    TensorBase&, TensorBase&, const TensorBase&, const TensorBase&, int64_t);
+using weight_norm_backward_fn = void(*)(
+    TensorBase&, TensorBase&, const TensorBase&, const TensorBase&,
+    const TensorBase&, const TensorBase&, int64_t);
+
+DECLARE_DISPATCH(weight_norm_fn, weight_norm_stub);
+DECLARE_DISPATCH(weight_norm_backward_fn, weight_norm_backward_stub);
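+
+// Usage sketch (illustrative, not part of this header): a CPU kernel implementation typically
+// provides a function matching weight_norm_fn and registers it in its .cpp file via
+//   REGISTER_DISPATCH(weight_norm_stub, &weight_norm_kernel);
+// after which callers invoke the stub as weight_norm_stub(kCPU, w, norm, v, g, dim);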
+
+}}  // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/avx_mathfun.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/avx_mathfun.h
new file mode 100644
index 0000000000000000000000000000000000000000..ce37f0aecb8cb88758703b3783c720d840a3d926
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/avx_mathfun.h
@@ -0,0 +1,522 @@
+#pragma once
+/*
+   AVX implementation of sin, cos, sincos, exp and log
+
+   Based on "sse_mathfun.h", by Julien Pommier
+   http://gruntthepeon.free.fr/ssemath/
+
+   Copyright (C) 2012 Giovanni Garberoglio
+   Interdisciplinary Laboratory for Computational Science (LISC)
+   Fondazione Bruno Kessler and University of Trento
+   via Sommarive, 18
+   I-38123 Trento (Italy)
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  (this is the zlib license)
+*/
+
+#include <ATen/native/cpu/Intrinsics.h>
+
+/* The original source of this file has been modified. */
+#if defined(CPU_CAPABILITY_AVX2)
+
+#if defined(__GNUC__)
+# define ALIGN32_BEG __attribute__((aligned(32)))
+#elif defined(_WIN32)
+# define ALIGN32_BEG __declspec(align(32))
+#endif
+
+typedef __m256  v8sf; // vector of 8 float (avx2)
+typedef __m256i v8si; // vector of 8 int   (avx2)
+
+/* declare some AVX constants -- why can't I figure a better way to do that? */
+#define _PS256_CONST(Name, Val)                                            \
+  static const ALIGN32_BEG float _ps256_##Name[8] = { Val, Val, Val, Val, Val, Val, Val, Val }
+#define _PI32_CONST256(Name, Val)                                            \
+  static const ALIGN32_BEG int _pi32_256_##Name[8] = { Val, Val, Val, Val, Val, Val, Val, Val }
+#define _PS256_CONST_TYPE(Name, Type, Val)                                 \
+  static const ALIGN32_BEG Type _ps256_##Name[8] = { Val, Val, Val, Val, Val, Val, Val, Val }
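+
+// Illustrative note (not in the original source): for example, _PS256_CONST(1, 1.0f) expands to
+//   static const ALIGN32_BEG float _ps256_1[8] = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f };
+// i.e. a 32-byte-aligned array that can be reloaded as a full __m256 via *(v8sf*)_ps256_1.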
+
+_PS256_CONST(1  , 1.0f);
+_PS256_CONST(0p5, 0.5f);
+/* the smallest non denormalized float number */
+_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000);
+_PS256_CONST_TYPE(mant_mask, int, 0x7f800000);
+_PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);
+
+_PS256_CONST_TYPE(sign_mask, int, (int)0x80000000);
+_PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
+
+_PI32_CONST256(0, 0);
+_PI32_CONST256(1, 1);
+_PI32_CONST256(inv1, ~1);
+_PI32_CONST256(2, 2);
+_PI32_CONST256(4, 4);
+_PI32_CONST256(0x7f, 0x7f);
+
+_PS256_CONST(cephes_SQRTHF, 0.707106781186547524);
+_PS256_CONST(cephes_log_p0, 7.0376836292E-2);
+_PS256_CONST(cephes_log_p1, - 1.1514610310E-1);
+_PS256_CONST(cephes_log_p2, 1.1676998740E-1);
+_PS256_CONST(cephes_log_p3, - 1.2420140846E-1);
+_PS256_CONST(cephes_log_p4, + 1.4249322787E-1);
+_PS256_CONST(cephes_log_p5, - 1.6668057665E-1);
+_PS256_CONST(cephes_log_p6, + 2.0000714765E-1);
+_PS256_CONST(cephes_log_p7, - 2.4999993993E-1);
+_PS256_CONST(cephes_log_p8, + 3.3333331174E-1);
+_PS256_CONST(cephes_log_q1, -2.12194440e-4);
+_PS256_CONST(cephes_log_q2, 0.693359375);
+
+
+/* natural logarithm computed for 8 simultaneous floats;
+   returns NaN for x <= 0
+*/
+inline v8sf log256_ps(v8sf x) {
+  v8si imm0;
+  v8sf one = *(v8sf*)_ps256_1;
+
+  //v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
+  v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);
+
+  x = _mm256_max_ps(x, *(v8sf*)_ps256_min_norm_pos);  /* cut off denormalized stuff */
+
+  // can be done with AVX2
+  imm0 = _mm256_srli_epi32(_mm256_castps_si256(x), 23);
+
+  /* keep only the fractional part */
+  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_mant_mask);
+  x = _mm256_or_ps(x, *(v8sf*)_ps256_0p5);
+
+  // this is again another AVX2 instruction
+  imm0 = _mm256_sub_epi32(imm0, *(v8si*)_pi32_256_0x7f);
+  v8sf e = _mm256_cvtepi32_ps(imm0);
+
+  e = _mm256_add_ps(e, one);
+
+  /* part2:
+     if( x < SQRTHF ) {
+       e -= 1;
+       x = x + x - 1.0;
+     } else { x = x - 1.0; }
+  */
+  //v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
+  v8sf mask = _mm256_cmp_ps(x, *(v8sf*)_ps256_cephes_SQRTHF, _CMP_LT_OS);
+  v8sf tmp = _mm256_and_ps(x, mask);
+  x = _mm256_sub_ps(x, one);
+  e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
+  x = _mm256_add_ps(x, tmp);
+
+  v8sf z = _mm256_mul_ps(x,x);
+
+  v8sf y = *(v8sf*)_ps256_cephes_log_p0;
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p1);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p2);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p3);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p4);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p5);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p6);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p7);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p8);
+  y = _mm256_mul_ps(y, x);
+
+  y = _mm256_mul_ps(y, z);
+
+  tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q1);
+  y = _mm256_add_ps(y, tmp);
+
+
+  tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
+  y = _mm256_sub_ps(y, tmp);
+
+  tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q2);
+  x = _mm256_add_ps(x, y);
+  x = _mm256_add_ps(x, tmp);
+  x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN
+  return x;
+}
+
+_PS256_CONST(exp_hi,        88.3762626647949f);
+_PS256_CONST(exp_lo,        -88.3762626647949f);
+
+_PS256_CONST(cephes_LOG2EF, 1.44269504088896341);
+_PS256_CONST(cephes_exp_C1, 0.693359375);
+_PS256_CONST(cephes_exp_C2, -2.12194440e-4);
+
+_PS256_CONST(cephes_exp_p0, 1.9875691500E-4);
+_PS256_CONST(cephes_exp_p1, 1.3981999507E-3);
+_PS256_CONST(cephes_exp_p2, 8.3334519073E-3);
+_PS256_CONST(cephes_exp_p3, 4.1665795894E-2);
+_PS256_CONST(cephes_exp_p4, 1.6666665459E-1);
+_PS256_CONST(cephes_exp_p5, 5.0000001201E-1);
+
+inline v8sf exp256_ps(v8sf x) {
+  v8sf tmp = _mm256_setzero_ps(), fx;
+  v8si imm0;
+  v8sf one = *(v8sf*)_ps256_1;
+
+  x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi);
+  x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo);
+
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF);
+  fx = _mm256_add_ps(fx, *(v8sf*)_ps256_0p5);
+
+  /* how to perform a floorf with SSE: just below */
+  //imm0 = _mm256_cvttps_epi32(fx);
+  //tmp  = _mm256_cvtepi32_ps(imm0);
+
+  tmp = _mm256_floor_ps(fx);
+
+  /* if greater, subtract 1 */
+  //v8sf mask = _mm256_cmpgt_ps(tmp, fx);
+  v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
+  mask = _mm256_and_ps(mask, one);
+  fx = _mm256_sub_ps(tmp, mask);
+
+  tmp = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C1);
+  v8sf z = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C2);
+  x = _mm256_sub_ps(x, tmp);
+  x = _mm256_sub_ps(x, z);
+
+  z = _mm256_mul_ps(x,x);
+
+  v8sf y = *(v8sf*)_ps256_cephes_exp_p0;
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p1);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p2);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p3);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p4);
+  y = _mm256_mul_ps(y, x);
+  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p5);
+  y = _mm256_mul_ps(y, z);
+  y = _mm256_add_ps(y, x);
+  y = _mm256_add_ps(y, one);
+
+  /* build 2^n */
+  imm0 = _mm256_cvttps_epi32(fx);
+  // another two AVX2 instructions
+  imm0 = _mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f);
+  imm0 = _mm256_slli_epi32(imm0, 23);
+  v8sf pow2n = _mm256_castsi256_ps(imm0);
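+  // Illustrative note (not in the original source): adding the bias 127 and shifting into the
+  // exponent field builds 2^n directly from its IEEE-754 bit pattern, e.g. n = 3 gives
+  // (3 + 127) << 23 = 0x41000000, which reinterprets as the float 8.0f.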
+  y = _mm256_mul_ps(y, pow2n);
+  return y;
+}
+
+_PS256_CONST(minus_cephes_DP1, -0.78515625);
+_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
+_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
+_PS256_CONST(sincof_p0, -1.9515295891E-4);
+_PS256_CONST(sincof_p1,  8.3321608736E-3);
+_PS256_CONST(sincof_p2, -1.6666654611E-1);
+_PS256_CONST(coscof_p0,  2.443315711809948E-005);
+_PS256_CONST(coscof_p1, -1.388731625493765E-003);
+_PS256_CONST(coscof_p2,  4.166664568298827E-002);
+_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
+
+
+/* evaluation of 8 sines at once using AVX intrinsics
+
+   The code is the exact rewriting of the cephes sinf function.
+   Precision is excellent as long as x < 8192 (I did not bother to
+   take into account the special handling they have for greater values
+   -- it does not return garbage for arguments over 8192, though, but
+   the extra precision is missing).
+
+   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
+   surprising but correct result.
+
+*/
+inline v8sf sin256_ps(v8sf x) { // any x
+  v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y;
+  v8si imm0, imm2;
+
+  sign_bit = x;
+  /* take the absolute value */
+  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
+  /* extract the sign bit (upper one) */
+  sign_bit = _mm256_and_ps(sign_bit, *(v8sf*)_ps256_sign_mask);
+
+  /* scale by 4/Pi */
+  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);
+
+  /*
+    Here we start a series of integer operations, which are in the
+    realm of AVX2.
+    If we don't have AVX, let's perform them using SSE2 directives
+  */
+
+  /* store the integer part of y in mm0 */
+  imm2 = _mm256_cvttps_epi32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  // another two AVX2 instructions
+  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
+  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);
+  y = _mm256_cvtepi32_ps(imm2);
+
+  /* get the swap sign flag */
+  imm0 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_4);
+  imm0 = _mm256_slli_epi32(imm0, 29);
+  /* get the polynom selection mask
+     there is one polynom for 0 <= x <= Pi/4
+     and another one for Pi/4
+#include 
+
+namespace at::native {
+
+using weight_to_int4pack_fn = void(*)(const Tensor&, const Tensor&, int, int);
+using int4pack_mm_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, int, const Tensor&, int, int);
+using int8pack_mm_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const Tensor&);
+
+DECLARE_DISPATCH(weight_to_int4pack_fn, weight_to_int4pack_stub);
+DECLARE_DISPATCH(int4pack_mm_fn, int4pack_mm_stub);
+DECLARE_DISPATCH(int8pack_mm_fn, int8pack_mm_stub);
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/mixed_data_type.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/mixed_data_type.h
new file mode 100644
index 0000000000000000000000000000000000000000..4b84a452747be996c5064b962ff418f524d2f999
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/mixed_data_type.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+
+namespace at { namespace native {
+
+inline ScalarType first_type() {
+  return ScalarType::Undefined;
+}
+
+template <typename... Args>
+inline ScalarType first_type(const Tensor& arg, const Args&... parameters) {
+  return arg.defined() ? arg.scalar_type() : first_type(parameters...);
+}
+
+template <typename... Args>
+inline bool is_mixed_type(const Tensor& input, const Args&... parameters) {
+  const auto parameter_type = first_type(parameters...);
+  return ((parameter_type != ScalarType::Undefined) &&
+          (parameter_type != input.scalar_type()));
+}
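+
+// Usage sketch (illustrative, not part of the original header): for a normalization op on CPU,
+//   bool mixed = is_mixed_type(input, weight, bias);
+// returns true when input is BFloat16/Half while the defined parameters are Float tensors,
+// and check_mixed_data_type(input, weight, bias) then validates that combination.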
+
+// currently on CPU, mixed data type is only supported
+// when input is 'BFloat16' or 'Half' and parameters are 'Float'
+inline void check_mixed_data_type(const Tensor& input) {
+  TORCH_CHECK(at::isReducedFloatingType(input.scalar_type()),
+      "mixed dtype (CPU): all inputs must share same datatype.");
+}
+
+template <typename... Args>
+inline void check_mixed_data_type(const Tensor& input, const Tensor& parameter, const Args&... parameters) {
+  TORCH_CHECK(!parameter.defined() || parameter.scalar_type() == ScalarType::Float,
+      "mixed dtype (CPU): expect parameter to have scalar type of Float");
+  check_mixed_data_type(input, parameters...);
+}
+
+inline ScalarType param_scalar_type(const Tensor& t, bool is_mixed_type) {
+  return is_mixed_type ? ScalarType::Float : t.scalar_type();
+}
+
+}}  // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/moments_utils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/moments_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..194e53a8e1fea892351619d2d26768095d815984
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/moments_utils.h
@@ -0,0 +1,206 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+namespace native {
+inline namespace CPU_CAPABILITY {
+
+template <typename T> using opmath_t = at::opmath_type<T>;
+
+constexpr int64_t kChunkSize = 16;
+
+template <typename T>
+void AddMoments(
+    int64_t m0_add,
+    const T& m1_add,
+    const T& m2_add,
+    int64_t& m0,
+    T& m1,
+    T& m2) {
+  const int64_t n = m0 + m0_add;
+  const T c = n == 0 ? static_cast<T>(0) : static_cast<T>(m0_add) / static_cast<T>(n);
+  const T delta = m1_add - m1;
+  m1 += c * delta;
+  m2 += m2_add + delta * delta * c * static_cast<T>(m0);
+  m0 = n;
+}
+
+template <typename T>
+C10_ALWAYS_INLINE void AddMomentsVec(
+    int64_t m0_add,
+    const vec::Vectorized<T>& m1_add,
+    const vec::Vectorized<T>& m2_add,
+    int64_t& m0,
+    vec::Vectorized<T>& m1,
+    vec::Vectorized<T>& m2) {
+  using Vec = vec::Vectorized<T>;
+  const int64_t n = m0 + m0_add;
+  const T c = n == 0 ? static_cast<T>(0) : static_cast<T>(m0_add) / static_cast<T>(n);
+  const Vec c_vec(c);
+  const Vec delta = m1_add - m1;
+  m1 += c_vec * delta;
+  m2 += m2_add + delta * delta * c_vec * Vec(static_cast<T>(m0));
+  m0 = n;
+}
+
+template <typename T>
+inline typename std::enable_if<std::is_same<T, opmath_t<T>>::value, void>::type
+UpdateMomentsVec(
+    int64_t m0,
+    const T* X_ptr,
+    const std::array<vec::Vectorized<opmath_t<T>>, kChunkSize>& c_vecs,
+    int64_t& m0_stk0,
+    vec::Vectorized<opmath_t<T>>& m1_stk0,
+    vec::Vectorized<opmath_t<T>>& m2_stk0) {
+  using Vec = vec::Vectorized<opmath_t<T>>;
+  Vec m1_vec(0);
+  Vec m2_vec(0);
+  for (const auto j : c10::irange(m0)) {
+    const Vec x_vec = Vec::loadu(X_ptr + j * Vec::size());
+    const Vec delta_vec = x_vec - m1_vec;
+    m1_vec += delta_vec * c_vecs[j];
+    m2_vec += delta_vec * (x_vec - m1_vec);
+  }
+  AddMomentsVec(m0, m1_vec, m2_vec, m0_stk0, m1_stk0, m2_stk0);
+}
+
+// each bfloat16/half vector will be converted to two float vectors,
+// and accumulated successively on m1_stk0/m2_stk0.
+template <typename T>
+inline typename std::enable_if<!std::is_same<T, opmath_t<T>>::value, void>::type
+UpdateMomentsVec(
+    int64_t m0,
+    const T* X_ptr,
+    const std::array<vec::Vectorized<opmath_t<T>>, kChunkSize>& c_vecs,
+    int64_t& m0_stk0,
+    vec::Vectorized<opmath_t<T>>& m1_stk0,
+    vec::Vectorized<opmath_t<T>>& m2_stk0) {
+  using Vec = vec::Vectorized<T>;
+  using fVec = vec::Vectorized<opmath_t<T>>;
+  fVec m1_fvec0(0), m1_fvec1(0);
+  fVec m2_fvec0(0), m2_fvec1(0);
+  for (const auto j : c10::irange(m0)) {
+    const Vec x_bvec = Vec::loadu(X_ptr + j * Vec::size());
+    auto [x_fvec0, x_fvec1] = convert_to_float(x_bvec);
+    const fVec delta_fvec0 = x_fvec0 - m1_fvec0;
+    const fVec delta_fvec1 = x_fvec1 - m1_fvec1;
+    m1_fvec0 += delta_fvec0 * c_vecs[j];
+    m1_fvec1 += delta_fvec1 * c_vecs[j];
+    m2_fvec0 += delta_fvec0 * (x_fvec0 - m1_fvec0);
+    m2_fvec1 += delta_fvec1 * (x_fvec1 - m1_fvec1);
+  }
+  AddMomentsVec(m0, m1_fvec0, m2_fvec0, m0_stk0, m1_stk0, m2_stk0);
+  AddMomentsVec(m0, m1_fvec1, m2_fvec1, m0_stk0, m1_stk0, m2_stk0);
+}
+
+// Compute rowwise moments by Welford algorithm and cascade sum to improve
+// numerical stability.
+// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+// https://en.wikipedia.org/wiki/Pairwise_summation
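+//
+// Merge step used below (illustrative summary of the standard parallel-variance update):
+// combining partial moments (n_a, mean_a, M2_a) and (n_b, mean_b, M2_b) with n = n_a + n_b gives
+//   delta = mean_b - mean_a
+//   mean  = mean_a + delta * n_b / n
+//   M2    = M2_a + M2_b + delta^2 * n_a * n_b / n
+// which is exactly what AddMoments/AddMomentsVec compute with c = n_b / n.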
+template <typename T, int64_t kMaxDepth>
+std::pair<opmath_t<T>, opmath_t<T>> RowwiseMomentsImpl(const T* X, int64_t N, int64_t ddof = 0) {
+  using math_t = opmath_t<T>;
+
+  constexpr int64_t kVecSize = vec::Vectorized<T>::size();
+  constexpr int64_t kAccVecSize = vec::Vectorized<math_t>::size();
+  const int64_t n = N / kVecSize;
+  const int64_t m = divup(n, kChunkSize);
+  const int64_t depth = utils::CeilLog2(m);
+
+  using Vec = vec::Vectorized<math_t>;
+  const Vec kZeroVec(math_t(0));
+  c10::SmallVector<int64_t, kMaxDepth> m0_stk(depth, 0);
+  c10::SmallVector<Vec, kMaxDepth> m1_stk(depth, kZeroVec);
+  c10::SmallVector<Vec, kMaxDepth> m2_stk(depth, kZeroVec);
+
+  for (const auto i : c10::irange(m)) {
+    const T* X_ptr = X + i * kChunkSize * kVecSize;
+    const int64_t m0 = std::min(kChunkSize, n - i * kChunkSize);
+    static std::array<Vec, kChunkSize> c_vecs = ([]() {
+      std::array<Vec, kChunkSize> result;
+      for (const auto i : c10::irange(kChunkSize)) {
+        result[i] = Vec(math_t(1) / static_cast<math_t>(i + 1));
+      }
+      return result;
+    })();
+    UpdateMomentsVec(m0, X_ptr, c_vecs, m0_stk[0], m1_stk[0], m2_stk[0]);
+
+    int64_t mask = i + 1;
+    for (int64_t j = 1; j < depth && (mask & 1) == 0; ++j) {
+      AddMomentsVec(
+          m0_stk[j - 1],
+          m1_stk[j - 1],
+          m2_stk[j - 1],
+          m0_stk[j],
+          m1_stk[j],
+          m2_stk[j]);
+      m0_stk[j - 1] = 0;
+      m1_stk[j - 1] = kZeroVec;
+      m2_stk[j - 1] = kZeroVec;
+      mask >>= 1;
+    }
+  }
+  for (const auto i : c10::irange(1, depth)) {
+    AddMomentsVec(
+        m0_stk[i], m1_stk[i], m2_stk[i], m0_stk[0], m1_stk[0], m2_stk[0]);
+  }
+
+  std::array<math_t, kAccVecSize> m1_arr{};
+  std::array<math_t, kAccVecSize> m2_arr{};
+  m1_stk[0].store(m1_arr.data());
+  m2_stk[0].store(m2_arr.data());
+
+  int64_t m0 = 0;
+  math_t m1 = 0;
+  math_t m2 = 0;
+  for (int64_t i = n * kVecSize; i < N; ++i) {
+    math_t x = static_cast(X[i]);
+    const math_t delta = x - m1;
+    ++m0;
+    m1 += delta / static_cast(m0);
+    m2 += delta * (x - m1);
+  }
+  // For BFloat16, each lane in m1_arr/m2_arr holds 2*n accumulated results.
+  int64_t m0_add = n * kVecSize / kAccVecSize;
+  for (const auto i : c10::irange(kAccVecSize)) {
+    AddMoments(m0_add, m1_arr[i], m2_arr[i], m0, m1, m2);
+  }
+
+  return std::make_pair(m1, m2 / static_cast(N - ddof));
+}
+
+template 
+std::pair, opmath_t> RowwiseMoments(const T* X, int64_t N, int64_t ddof = 0) {
+  using Vec = vec::Vectorized;
+  constexpr int64_t kVecSize = Vec::size();
+  const int64_t n = N / kVecSize;
+  const int64_t m = divup(n, kChunkSize);
+  const int64_t depth = utils::CeilLog2(m);
+  if (depth <= 4) {
+    return RowwiseMomentsImpl(X, N, ddof);
+  } else if (depth <= 8) {
+    return RowwiseMomentsImpl(X, N, ddof);
+  } else if (depth <= 16) {
+    return RowwiseMomentsImpl(X, N, ddof);
+  } else if (depth <= 32) {
+    return RowwiseMomentsImpl(X, N, ddof);
+  } else {
+    return RowwiseMomentsImpl(X, N, ddof);
+  }
+}
+
+} // namespace CPU_CAPABILITY
+} // namespace native
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/utils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..62bb4f20891116ac2e4aeff5e7698013f123a84f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/utils.h
@@ -0,0 +1,198 @@
+#pragma once
+
+#include 
+#include 
+#include 
+
+#ifdef USE_FBGEMM
+#include 
+#endif
+
+namespace at {
+namespace native {
+
+template 
+inline void _store(T* dst, at::vec::Vectorized src) {
+  src.store(dst);
+}
+
+inline void _store(at::BFloat16* dst, at::vec::Vectorized src) {
+  auto res = at::vec::convert_float_bfloat16(src, src);
+  res.store(dst, at::vec::Vectorized::size());
+}
+
+inline void _store(at::Half* dst, at::vec::Vectorized src) {
+  auto res = at::vec::convert_float_half(src, src);
+  res.store(dst, at::vec::Vectorized::size());
+}
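+
+// Note: the BFloat16/Half overloads above narrow a float vector and store only
+// the first Vectorized<float>::size() lanes, matching the width of `src`.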
+
+inline namespace CPU_CAPABILITY {
+
+template 
+inline T data_index_init(T offset) {
+  return offset;
+}
+
+template 
+inline T data_index_init(T offset, T& x, const T& X, Args&&... args) {
+  offset = data_index_init(offset, std::forward(args)...);
+  x = offset % X;
+  return offset / X;
+}
+
+inline bool data_index_step() {
+  return true;
+}
+
+template 
+inline bool data_index_step(T& x, const T& X, Args&&... args) {
+  if (data_index_step(std::forward(args)...)) {
+    x = ((x + 1) == X) ? 0 : (x + 1);
+    return x == 0;
+  }
+  return false;
+}
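+
+// Example usage (illustrative sketch; the dimension names are hypothetical):
+// iterate a flattened [N, C, H, W] range handed out by at::parallel_for:
+//
+//   int64_t n = 0, c = 0, h = 0, w = 0;
+//   data_index_init(begin, n, N, c, C, h, H, w, W);
+//   for (const auto i : c10::irange(begin, end)) {
+//     // ... process element (n, c, h, w) at linear index i ...
+//     data_index_step(n, N, c, C, h, H, w, W);
+//   }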
+
+// Helper struct for bfloat16 vectorization.
+// Useful when you need float as the intermediate dtype or the accumulation dtype.
+using namespace vec;
+struct Vec2 {
+  Vectorized val0, val1;
+  Vec2(Vectorized v0, Vectorized v1) : val0(v0), val1(v1) {}
+  Vec2(float v) : val0(v), val1(v) {}
+  static Vec2 loadu(const BFloat16* ptr) {
+    auto [v0, v1] = convert_bfloat16_float(Vectorized::loadu(ptr));
+    return {v0, v1};
+  }
+  static Vec2 loadu(const float* ptr) {
+    return {Vectorized::loadu(ptr), Vectorized::loadu(ptr + Vectorized::size())};
+  }
+  void store(BFloat16* ptr) const {
+    Vectorized val = convert_float_bfloat16(val0, val1);
+    val.store(ptr);
+  }
+  void store(float* ptr) const {
+    val0.store(ptr);
+    val1.store(ptr + Vectorized::size());
+  }
+};
+inline Vec2 operator+(const Vec2& a, const Vec2& b) { return {a.val0 + b.val0, a.val1 + b.val1}; }
+inline Vec2 operator*(const Vec2& a, const Vec2& b) { return {a.val0 * b.val0, a.val1 * b.val1}; }
+inline Vec2 operator-(const Vec2& a, const Vec2& b) { return {a.val0 - b.val0, a.val1 - b.val1}; }
+inline Vec2 operator/(const Vec2& a, const Vec2& b) { return {a.val0 / b.val0, a.val1 / b.val1}; }
+inline Vec2 maximum(const Vec2& a, const Vec2& b) { return {vec::maximum(a.val0, b.val0), vec::maximum(a.val1, b.val1)}; }
+inline Vec2 minimum(const Vec2& a, const Vec2& b) { return {vec::minimum(a.val0, b.val0), vec::minimum(a.val1, b.val1)}; }
+
+template  struct VectorizedType { using type = Vectorized; };
+template <> struct VectorizedType { using type = Vec2; };
+template  using VecType = typename VectorizedType::type;
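+
+// VecType<T> resolves to Vec2 for BFloat16 and to Vectorized<T> otherwise, so a
+// kernel can accumulate in float regardless of the storage dtype.
+// Illustrative sketch (the `in`/`out` pointers are hypothetical):
+//
+//   VecType<scalar_t> acc(0.0f);
+//   acc = acc + VecType<scalar_t>::loadu(in);
+//   acc.store(out);  // for bf16, loads widen to float and stores narrow back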
+
+// Helpers for loading mixed-data-type parameters as a pair of float vectors.
+inline std::tuple, Vectorized> load2f(const BFloat16* ptr) {
+  return convert_bfloat16_float(Vectorized::loadu(ptr));
+}
+
+inline std::tuple, Vectorized> load2f(const Half* ptr) {
+  return convert_half_float(Vectorized::loadu(ptr));
+}
+
+inline std::tuple, Vectorized> load2f(const float* ptr) {
+  using Vec = Vectorized;
+  return std::make_tuple(Vec::loadu(ptr), Vec::loadu(ptr + Vec::size()));
+}
+
+inline std::tuple, Vectorized> load2f(const BFloat16* ptr, int64_t count) {
+  return convert_bfloat16_float(Vectorized::loadu(ptr, count));
+}
+
+inline std::tuple, Vectorized> load2f(const Half* ptr, int64_t count) {
+  return convert_half_float(Vectorized::loadu(ptr, count));
+}
+
+inline std::tuple, Vectorized> load2f(const float* ptr, int64_t count) {
+  using Vec = Vectorized;
+  if (count > Vec::size()) {
+    return std::make_tuple(Vec::loadu(ptr), Vec::loadu(ptr + Vec::size(), count - Vec::size()));
+  } else {
+    return std::make_tuple(Vec::loadu(ptr, count), Vec(0));
+  }
+}
+
+} // namespace
+
+namespace utils {
+
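+// CeilLog2(x) returns ceil(log2(x)) for x > 2 and 1 otherwise; for example,
+// CeilLog2(5) == 3 (2^3 = 8 >= 5) and CeilLog2(8) == 3 (exact power of two).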
+template 
+T CeilLog2(const T& x) {
+  if (x <= 2) {
+    return 1;
+  }
+  // The last set bit is floor(log2(x)), and floor + 1 is ceil,
+  // except when x is an exact power of 2, so subtract 1 first.
+  return static_cast(llvm::findLastSet(static_cast(x) - 1)) + 1;
+}
+
+// matrix transpose:
+//   src has shape of M by N, with leading dimension of ld_src
+//   dst has shape of N by M, with leading dimension of ld_dst
+template 
+inline void transpose(int64_t M, int64_t N, const T* src, int64_t ld_src, T* dst, int64_t ld_dst) {
+  for (int64_t j = 0; j < N; j++) {
+    for (int64_t i = 0; i < M; i++) {
+      dst[j * ld_dst + i] = src[i * ld_src + j];
+    }
+  }
+}
+
+#ifdef USE_FBGEMM
+template <>
+inline void transpose(int64_t M, int64_t N, const float* src, int64_t ld_src, float* dst, int64_t ld_dst) {
+  TORCH_CHECK(fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM.");
+  fbgemm::transpose_simd(M, N, src, ld_src, dst, ld_dst);
+}
+#endif
+
+template 
+inline void parallel_sparse_csr(
+    const TensorAccessor& crow_acc,
+    const int64_t M,
+    const int64_t nnz,
+    const F& f) {
+  TORCH_CHECK(crow_acc.size(0) == M + 1);
+
+  // Parallelizing directly over `M` may lead to load imbalance;
+  // statically determine the thread partition here to even out
+  // the payload per thread.
+  int num_threads = at::get_num_threads();
+  std::vector thread_splits(num_threads + 1, M);
+
+  int64_t thread_averge_payload = std::max((int64_t)1, divup(nnz, num_threads));
+
+  thread_splits[0] = 0;
+  int64_t sum = 0;
+  int64_t t = 1;
+  for (const auto m : c10::irange(M)) {
+    int64_t row_start = crow_acc[m];
+    int64_t row_end = crow_acc[m + 1];
+    sum += row_end - row_start;
+    if (sum > t * thread_averge_payload) {
+      thread_splits[t] = m;
+      t++;
+    }
+  }
+  // Restore the last index explicitly; rounding error when calculating
+  // `thread_averge_payload` may leave it unset.
+  thread_splits[num_threads] = M;
+
+  at::parallel_for(0, num_threads, 1, [&](int64_t cbegin, int64_t cend) {
+    int tid = at::get_thread_num();
+    int64_t begin = thread_splits[tid];
+    int64_t end = thread_splits[tid + 1];
+    f(begin, end);
+  });
+}
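+
+// Illustrative usage sketch (the accessor/lambda names are hypothetical): each
+// thread receives a contiguous row range whose total nnz is roughly balanced:
+//
+//   parallel_sparse_csr(crow_acc, M, nnz, [&](int64_t row_begin, int64_t row_end) {
+//     for (const auto m : c10::irange(row_begin, row_end)) {
+//       for (int64_t e = crow_acc[m]; e < crow_acc[m + 1]; ++e) {
+//         // ... process the e-th non-zero of row m ...
+//       }
+//     }
+//   });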
+
+} // namespace utils
+
+} // namespace native
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/zmath.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/zmath.h
new file mode 100644
index 0000000000000000000000000000000000000000..d978e89b1e562d294a49e890474ff9ccceb1cece
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cpu/zmath.h
@@ -0,0 +1,250 @@
+#pragma once
+
+// Complex number math operations that act as no-ops for other dtypes.
+#include 
+#include 
+#include
+
+namespace at { namespace native {
+inline namespace CPU_CAPABILITY {
+
+template 
+inline VALUE_TYPE zabs (SCALAR_TYPE z) {
+  return z;
+}
+
+template<>
+inline c10::complex zabs > (c10::complex z) {
+  return c10::complex(std::abs(z));
+}
+
+template<>
+inline float zabs , float> (c10::complex z) {
+  return std::abs(z);
+}
+
+template<>
+inline c10::complex zabs > (c10::complex z) {
+  return c10::complex(std::abs(z));
+}
+
+template<>
+inline double zabs , double> (c10::complex z) {
+  return std::abs(z);
+}
+
+// This overload corresponds to non-complex dtypes.
+// The function is consistent with its NumPy equivalent
+// for non-complex dtypes where `pi` is returned for
+// negative real numbers and `0` is returned for 0 or positive
+// real numbers.
+// Note: `nan` is propagated.
+template 
+inline VALUE_TYPE angle_impl (SCALAR_TYPE z) {
+  if (at::_isnan(z)) {
+    return z;
+  }
+  return z < 0 ? c10::pi : 0;
+}
+
+template<>
+inline c10::complex angle_impl > (c10::complex z) {
+  return c10::complex(std::arg(z), 0.0);
+}
+
+template<>
+inline float angle_impl , float> (c10::complex z) {
+  return std::arg(z);
+}
+
+template<>
+inline c10::complex angle_impl > (c10::complex z) {
+  return c10::complex(std::arg(z), 0.0);
+}
+
+template<>
+inline double angle_impl , double> (c10::complex z) {
+  return std::arg(z);
+}
+
+template 
+constexpr VALUE_TYPE real_impl (SCALAR_TYPE z) {
+  return z; //No-Op
+}
+
+template<>
+constexpr c10::complex real_impl > (c10::complex z) {
+  return c10::complex(z.real(), 0.0);
+}
+
+template<>
+constexpr float real_impl , float> (c10::complex z) {
+  return z.real();
+}
+
+template<>
+constexpr c10::complex real_impl > (c10::complex z) {
+  return c10::complex(z.real(), 0.0);
+}
+
+template<>
+constexpr double real_impl , double> (c10::complex z) {
+  return z.real();
+}
+
+template 
+constexpr VALUE_TYPE imag_impl (SCALAR_TYPE /*z*/) {
+  return 0;
+}
+
+template<>
+constexpr c10::complex imag_impl > (c10::complex z) {
+  return c10::complex(z.imag(), 0.0);
+}
+
+template<>
+constexpr float imag_impl , float> (c10::complex z) {
+  return z.imag();
+}
+
+template<>
+constexpr c10::complex imag_impl > (c10::complex z) {
+  return c10::complex(z.imag(), 0.0);
+}
+
+template<>
+constexpr double imag_impl , double> (c10::complex z) {
+  return z.imag();
+}
+
+template 
+inline TYPE conj_impl (TYPE z) {
+  return z; //No-Op
+}
+
+template<>
+inline c10::complex conj_impl > (c10::complex z) {
+  return c10::complex{z.real(), -z.imag()};
+}
+
+template<>
+inline c10::complex conj_impl > (c10::complex z) {
+  return c10::complex(z.real(), -z.imag());
+}
+
+template<>
+inline c10::complex conj_impl > (c10::complex z) {
+  return c10::complex(z.real(), -z.imag());
+}
+
+template 
+inline TYPE ceil_impl (TYPE z) {
+  return std::ceil(z);
+}
+
+template <>
+inline c10::complex ceil_impl (c10::complex z) {
+  return c10::complex(std::ceil(z.real()), std::ceil(z.imag()));
+}
+
+template <>
+inline c10::complex ceil_impl (c10::complex z) {
+  return c10::complex(std::ceil(z.real()), std::ceil(z.imag()));
+}
+
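+// sgn(z) = z / |z| for non-zero complex z, and 0 when z == 0.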
+template
+inline c10::complex sgn_impl (c10::complex z) {
+  if (z == c10::complex(0, 0)) {
+    return c10::complex(0, 0);
+  } else {
+    return z / zabs(z);
+  }
+}
+
+template 
+inline TYPE floor_impl (TYPE z) {
+  return std::floor(z);
+}
+
+template <>
+inline c10::complex floor_impl (c10::complex z) {
+  return c10::complex(std::floor(z.real()), std::floor(z.imag()));
+}
+
+template <>
+inline c10::complex floor_impl (c10::complex z) {
+  return c10::complex(std::floor(z.real()), std::floor(z.imag()));
+}
+
+template 
+inline TYPE round_impl (TYPE z) {
+  return std::nearbyint(z);
+}
+
+template <>
+inline c10::complex round_impl (c10::complex z) {
+  return c10::complex(std::nearbyint(z.real()), std::nearbyint(z.imag()));
+}
+
+template <>
+inline c10::complex round_impl (c10::complex z) {
+  return c10::complex(std::nearbyint(z.real()), std::nearbyint(z.imag()));
+}
+
+template 
+inline TYPE trunc_impl (TYPE z) {
+  return std::trunc(z);
+}
+
+template <>
+inline c10::complex trunc_impl (c10::complex z) {
+  return c10::complex(std::trunc(z.real()), std::trunc(z.imag()));
+}
+
+template <>
+inline c10::complex trunc_impl (c10::complex z) {
+  return c10::complex(std::trunc(z.real()), std::trunc(z.imag()));
+}
+
+template ::value, int> = 0>
+inline TYPE max_impl (TYPE a, TYPE b) {
+  if (_isnan(a) || _isnan(b)) {
+    return std::numeric_limits::quiet_NaN();
+  } else {
+    return std::max(a, b);
+  }
+}
+
+template ::value, int> = 0>
+inline TYPE max_impl (TYPE a, TYPE b) {
+  if (_isnan(a)) {
+    return a;
+  } else if (_isnan(b)) {
+    return b;
+  } else {
+    return std::abs(a) > std::abs(b) ? a : b;
+  }
+}
+
+template ::value, int> = 0>
+inline TYPE min_impl (TYPE a, TYPE b) {
+  if (_isnan(a) || _isnan(b)) {
+    return std::numeric_limits::quiet_NaN();
+  } else {
+    return std::min(a, b);
+  }
+}
+
+template ::value, int> = 0>
+inline TYPE min_impl (TYPE a, TYPE b) {
+  if (_isnan(a)) {
+    return a;
+  } else if (_isnan(b)) {
+    return b;
+  } else {
+    return std::abs(a) < std::abs(b) ? a : b;
+  }
+}
+
+} // end namespace
+}} // end at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Activation.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Activation.h
new file mode 100644
index 0000000000000000000000000000000000000000..bf13717c177eb4ca9973ff043248c1eafff87174
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Activation.h
@@ -0,0 +1,20 @@
+#pragma once
+#include 
+#include 
+
+namespace at {
+struct TensorIteratorBase;
+class TensorBase;
+}
+
+namespace at { namespace native {
+
+void launch_glu_backward_kernel(const TensorIteratorBase& iter,
+                                int64_t gI_stride, int64_t I_stride);
+
+void launch_log_sigmoid_forward_kernel(TensorIteratorBase& iter);
+
+void GeluCUDAKernelImpl(TensorIteratorBase& it, GeluType approximate);
+void GeluBackwardCUDAKernelImpl(TensorIteratorBase& it, GeluType approximate);
+
+}}  // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/BinaryInternal.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/BinaryInternal.h
new file mode 100644
index 0000000000000000000000000000000000000000..1bfa8060f2e345945751db4330318ea43878e487
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/BinaryInternal.h
@@ -0,0 +1,48 @@
+// DON'T include this except from Binary*.cu files. It should not leak into
+// headers.
+#pragma once
+#define TORCH_ASSERT_NO_OPERATORS
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+namespace at {
+namespace native {
+namespace binary_internal {
+
+template 
+struct DivFunctor {
+  __device__ scalar_t operator()(scalar_t a, scalar_t b) const {
+    return a / b;
+  }
+};
+
+template 
+struct MulFunctor {
+  __device__ T operator()(T a, T b) const {
+    return a * b;
+  }
+};
+
+// Workaround for the error: '*' in boolean context, suggest '&&' instead
+// [-Werror=int-in-bool-context]
+template <>
+struct MulFunctor {
+  __device__ bool operator()(bool a, bool b) const {
+    return a && b;
+  }
+};
+void div_true_kernel_cuda(TensorIteratorBase& iter);
+void div_trunc_kernel_cuda(TensorIteratorBase& iter);
+} // namespace binary_internal
+} // namespace native
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/CUDAJitLoops.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/CUDAJitLoops.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..1c0f669f69913d521d3e12317fb152e25d6e48a6
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/CUDAJitLoops.cuh
@@ -0,0 +1,296 @@
+#pragma once
+#include 
+
+// Jiterator functions are guarded behind this macro
+#if AT_USE_JITERATOR()
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+namespace native {
+
+template 
+constexpr auto tuple_to_array_helper(Tuple& t, std::index_sequence seq) {
+    constexpr auto size = seq.size();
+    (void)t; // suppress unused-parameter warning when the tuple is empty
+    return std::array{static_cast(&std::get(t))...};
+}
+
+// Helper function to convert a tuple to std::array
+// for passing the arguments to a CUDA kernel.
+// NOTE: we capture the tuple by reference, so the pointers in the
+// returned array are only valid while the tuple is alive.
+template 
+constexpr auto tuple_to_array(std::tuple& extra_args) {
+    constexpr auto tuple_size = sizeof...(Args);
+    return tuple_to_array_helper(extra_args, std::make_index_sequence{});
+}
+
+struct JittedVecKernelCache {
+  // Different kernels are compiled depending on what we're vectorizing up to (1, 2 or 4 elements)
+  at::cuda::jit::NvrtcFunction vec1;
+  at::cuda::jit::NvrtcFunction vec2;
+  at::cuda::jit::NvrtcFunction vec4;
+};
+
+struct JittedKernelVariantCache {
+  JittedVecKernelCache vec;
+  at::cuda::jit::NvrtcFunction noncontiguous;
+  at::cuda::jit::NvrtcFunction dynamic_contiguous;
+  at::cuda::jit::NvrtcFunction dynamic_noncontiguous;
+};
+
+inline c10::SmallBuffer pack_kernel_args(
+    std::initializer_list args,
+    c10::ArrayRef extra_args) {
+  c10::SmallBuffer ret(args.size() + extra_args.size());
+  std::copy(args.begin(), args.end(), ret.data());
+  std::copy(extra_args.begin(), extra_args.end(), ret.data() + args.size());
+  return ret;
+}
+
+template
+void launch_jitted_unrolled_kernel(
+    std::mutex &jiterator_mutex,
+    at::cuda::jit::NvrtcFunction &fn_cache,
+    const at::cuda::jit::KernelDescriptor &desc,
+    int64_t N,
+    array_t data,
+    inp_calc_t ic,
+    out_calc_t oc,
+    loader_t l,
+    storer_t s,
+    bool contiguous,
+    at::cuda::jit::BinaryFuncVariant scalar_pos,
+    void* scalar_val,
+    c10::ArrayRef extra_args) {
+
+  TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max());
+  // Casting the result to int is always safe: the intermediate is int64 and won't overflow.
+  const uint32_t grid = (N + block_work_size() - 1) / block_work_size();
+
+  if (!fn_cache.function) {
+    const std::lock_guard lock{jiterator_mutex};
+    if (!fn_cache.function) {
+      constexpr bool dynamic_casting = !std::is_same() ||
+                                       !std::is_same();
+      auto code = at::cuda::jit::generate_code(
+          desc, contiguous, dynamic_casting, scalar_pos);
+      fn_cache = at::cuda::jit::jit_pwise_function(code, desc.name);
+    }
+  }
+
+  auto args = pack_kernel_args({&N, &data, &ic, &oc, &l, &s, scalar_val}, extra_args);
+  at::cuda::jit::launch_jitted_pwise_function(fn_cache, args.data(), {grid, 1u, 1u},
+  {num_threads(), 1u, 1u});
+}
+
+template
+void launch_jitted_vectorized_kernel(
+    std::mutex &jiterator_mutex, JittedVecKernelCache &fn_cache,
+    const at::cuda::jit::KernelDescriptor &desc, int64_t N, array_t data,
+    at::cuda::jit::BinaryFuncVariant scalar_pos,
+    void *scalar_val, c10::ArrayRef extra_args) {
+  TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max());
+  // N is still int64_t for the computation, but it's always safe to cast result to int
+  const uint32_t grid = (N + block_work_size() - 1) / block_work_size();
+  const int vec_size = at::cuda::jit::can_vectorize_up_to(
+      desc, c10::ArrayRef(data.data, data.size()));
+
+  // Different kernels are compiled depending on what we're vectorizing up to (1, 2 or 4 elements)
+  //   fn_ptr is set to the appropriate function based on the vec size and GPU used
+  at::cuda::jit::NvrtcFunction* fn_ptr;
+  if (vec_size == 4) {
+    fn_ptr = &fn_cache.vec4;
+  } else if (vec_size == 2) {
+    fn_ptr = &fn_cache.vec2;
+  } else if (vec_size == 1) {
+    fn_ptr = &fn_cache.vec1;
+  } else {
+    TORCH_INTERNAL_ASSERT(false, "unexpected vec_size for jitter vectorized kernel");
+  }
+
+  bool vectorized = vec_size > 1;
+
+  if (!fn_ptr->function) {
+    const std::lock_guard lock{jiterator_mutex};
+    if (!fn_ptr->function) { // cache miss!
+
+      // Generates program
+      auto code = at::cuda::jit::generate_code(
+          desc, /*contiguous=*/true, /*dynamic_casting=*/false,
+          scalar_pos, vectorized, vec_size);
+      std::string kernel_name = vectorized ? desc.name + "_vectorized" + std::to_string(vec_size) : desc.name;
+
+      // Acquires the program
+      *fn_ptr = at::cuda::jit::jit_pwise_function(code, kernel_name);
+    }
+  }
+
+  if (vectorized) {
+    auto args = pack_kernel_args({&N, &data, scalar_val}, extra_args);
+    at::cuda::jit::launch_jitted_pwise_function(
+        *fn_ptr, args.data(), {grid, 1u, 1u}, {num_threads(), 1u, 1u});
+  } else {
+// NVCC complains about unused variables l and s.
+// It should be a false positive in most cases, so we suppress the warnings.
+#pragma nv_diagnostic push
+#pragma nv_diag_suppress 177
+    auto ic = TrivialOffsetCalculator();
+    auto oc = TrivialOffsetCalculator<1>();
+    auto l = memory::LoadWithoutCast();
+    auto s = memory::StoreWithoutCast();
+
+    auto args = pack_kernel_args(
+        {&N, &data, &ic, &oc, &l, &s, scalar_val}, extra_args);
+    at::cuda::jit::launch_jitted_pwise_function(
+        *fn_ptr, args.data(), {grid, 1u, 1u}, {num_threads(), 1u, 1u});
+#pragma nv_diagnostic pop
+  }
+}
+
+template 
+void jitted_gpu_kernel_generic(
+    std::mutex &jiterator_mutex,
+    JittedKernelVariantCache &cache,
+    const at::cuda::jit::KernelDescriptor &desc,
+    at::cuda::jit::BinaryFuncVariant scalar_pos,
+    c10::ArrayRef extra_args,
+    TensorIteratorBase& iter,
+    const bool dynamic_casting,
+    void *scalar_val) {
+  TORCH_INTERNAL_ASSERT(iter.can_use_32bit_indexing());
+  TORCH_INTERNAL_ASSERT(iter.ninputs() == arity);
+  TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);
+
+  constexpr int ntensors = arity + 1;
+  at::detail::Array data;
+  for (auto i : c10::irange(ntensors)) {
+    data[i] = (char*)iter.data_ptr(i);
+  }
+
+  int64_t numel = iter.numel();
+  bool contiguous = iter.is_contiguous();
+
+  // Decides which of 4 kernel types to launch
+  // Variations are:
+  //   - Case 1: no dynamic casting and contiguous
+  //   - Case 2: no dynamic casting and noncontiguous
+  //   - Case 3: dynamic casting and contiguous
+  //   - Case 4: dynamic casting and noncontiguous
+  // These cases align with the non-jitted CUDALoops.cuh cases in gpu_kernel_impl
+
+  if (!dynamic_casting) {
+    if (contiguous) {
+      // Case 1: no dynamic casting and contiguous
+      launch_jitted_vectorized_kernel(
+          jiterator_mutex, cache.vec, desc,
+          numel, data, scalar_pos, scalar_val, extra_args);
+      return;
+    }
+
+    // Case 2: no dynamic casting and noncontiguous
+    auto input_offset_calculator = make_input_offset_calculator(iter);
+    auto output_offset_calculator = make_output_offset_calculator(iter);
+    auto loader = memory::LoadWithoutCast();
+    auto storer = memory::StoreWithoutCast();
+    launch_jitted_unrolled_kernel(
+        jiterator_mutex, cache.noncontiguous, desc, numel, data,
+        input_offset_calculator, output_offset_calculator, loader,
+        storer, contiguous, scalar_pos, scalar_val, extra_args);
+    return;
+  }
+
+  // Cases 3 and 4 are handled below
+  // Both require construction of a storer (this asserts 1 output) and one or more loaders
+
+  // Creates store cast to output (the zeroth tensor in TensorIterator)
+  auto storer = memory::StoreWithCast<1>(iter);
+
+  // Creates load casts from inputs (note offset indexing into the iterator's 1...n tensors)
+  auto loader = memory::LoadWithCast(iter);
+
+  if (contiguous) {
+    // Case 3: dynamic casting and contiguous
+    auto input_offset_calculator = TrivialOffsetCalculator();
+    auto output_offset_calculator = TrivialOffsetCalculator<1>();
+    launch_jitted_unrolled_kernel(
+        jiterator_mutex, cache.dynamic_contiguous, desc, numel, data, input_offset_calculator,
+        output_offset_calculator, loader, storer, contiguous, scalar_pos, scalar_val, extra_args);
+    return;
+  }
+
+  // Case 4: dynamic casting and noncontiguous
+  auto input_offset_calculator = make_input_offset_calculator(iter);
+  auto output_offset_calculator = make_output_offset_calculator(iter);
+  launch_jitted_unrolled_kernel(
+      jiterator_mutex, cache.dynamic_noncontiguous, desc, numel, data, input_offset_calculator,
+      output_offset_calculator, loader, storer, contiguous, scalar_pos, scalar_val, extra_args);
+}
+
+// NOTE: static to reduce chances of name collision.
+template <
+    char const* name,
+    typename result_type,
+    typename f_inputs_type,
+    int arity,
+    at::cuda::jit::BinaryFuncVariant scalar_pos =
+        at::cuda::jit::BinaryFuncVariant::NoScalar,
+    typename... ExtraArgs>
+static void jitted_gpu_kernel_impl(
+    TensorIteratorBase& iter,
+    const std::string &f,
+    const bool dynamic_casting,
+    at::opmath_type scalar_val,
+    std::tuple extra_args) {
+
+  // TODO: Memory use can probably be optimized by re-using kernels across GPUs with
+  //   the same compute capability
+  static std::mutex jiterator_mutex;
+  static std::vector device_caches(c10::cuda::device_count());
+
+  constexpr int nInputs = arity;
+  constexpr int nOutputs = 1;  // TODO: Support more than 1 output
+  static const auto desc = at::cuda::jit::make_kernel_descriptor<
+    result_type, f_inputs_type, ExtraArgs...>(name, f, nInputs, nOutputs);
+
+  auto &cache = device_caches[iter.device().index()];
+  auto extra_args_array = tuple_to_array(extra_args);
+  return jitted_gpu_kernel_generic(
+      jiterator_mutex,
+      cache,
+      desc,
+      scalar_pos,
+      extra_args_array,
+      iter,
+      dynamic_casting,
+      &scalar_val
+    );
+}
+
+}}  // at::native
+
+#endif // AT_USE_JITERATOR()
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/CUDALoops.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/CUDALoops.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..ed31a997c18fb82d033e5810b9c657d5b125831e
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/CUDALoops.cuh
@@ -0,0 +1,348 @@
+#pragma once
+
+// This file provides two functions to help write GPU elementwise kernels:
+//
+//   gpu_kernel(TensorIterator iter, <lambda>)
+//   gpu_kernel_with_scalars(TensorIterator iter, <lambda>)
+//
+// The gpu_kernel_with_scalars generates specializations that support a
+// single scalar CPU argument, such as from `cuda_tensor + 5`. The CPU scalar
+// is lifted to a kernel parameter instead of copying to device memory.
+// This should be used in conjunction with TensorIterator::allow_cpu_scalars_,
+// which is the default for TensorIterator::binary_op. Otherwise, all inputs
+// and the output must be on the GPU.
+//
+// For example, to write a reciprocal kernel for GPU float Tensors:
+//
+//   gpu_kernel(iter, []GPU_LAMBDA(float a) {
+//    return 1.0f / a;
+//   });
+//
+// To write a multiplication kernel for GPU float Tensors where one argument
+// may be a CPU scalar:
+//
+//   gpu_kernel_with_scalars(iter, []GPU_LAMBDA(float a, float b) {
+//     return a * b;
+//   });
+//
+// See BinaryOpsKernel.cu for the complete implementation
+//
+
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#ifdef __NVCC__
+#define ASSERT_HOST_DEVICE_LAMBDA(type)                       \
+  static_assert(                                              \
+      __nv_is_extended_host_device_lambda_closure_type(type), \
+      #type " must be a __host__ __device__ lambda")
+#else
+#define ASSERT_HOST_DEVICE_LAMBDA(type)
+#endif
+
+namespace at {
+namespace native {
+
+template 
+C10_LAUNCH_BOUNDS_1(num_threads())
+__global__ void vectorized_elementwise_kernel(int N, func_t f, array_t data) {
+  using traits = function_traits;
+  int remaining = N - block_work_size() * blockIdx.x;
+
+  if (remaining < block_work_size()) { // if this block handles the remainder,
+                                       // just do a naive unrolled loop
+    auto input_calc = TrivialOffsetCalculator();
+    auto output_calc = TrivialOffsetCalculator<1>();
+    auto loader = memory::LoadWithoutCast();
+    auto storer = memory::StoreWithoutCast();
+    auto policy = memory::policies::unroll<
+        array_t,
+        decltype(input_calc),
+        decltype(output_calc),
+        memory::LoadWithoutCast,
+        memory::StoreWithoutCast>(
+        data, remaining, input_calc, output_calc, loader, storer);
+    elementwise_kernel_helper(f, policy);
+  } else { // if this block has a full `block_work_size` data to handle, use
+           // vectorized memory access
+    elementwise_kernel_helper(
+        f, memory::policies::vectorized(data));
+  }
+}
+
+template <
+    typename func_t,
+    typename array_t,
+    typename inp_calc_t,
+    typename out_calc_t,
+    typename loader_t,
+    typename storer_t>
+C10_LAUNCH_BOUNDS_1(num_threads())
+__global__ void unrolled_elementwise_kernel(
+    int N,
+    func_t f,
+    array_t data,
+    inp_calc_t ic,
+    out_calc_t oc,
+    loader_t l,
+    storer_t s) {
+  int remaining = N - block_work_size() * blockIdx.x;
+  auto policy = memory::policies::
+      unroll(
+          data, remaining, ic, oc, l, s);
+  elementwise_kernel_helper(f, policy);
+}
+
+// This function assumes a trivial 1-d layout and no dynamic casting.
+template 
+static inline void launch_vectorized_kernel(
+    int64_t N,
+    const func_t& f,
+    array_t data) {
+  TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max());
+  using traits = function_traits;
+  int64_t grid = (N + block_work_size() - 1) / block_work_size();
+  auto stream = at::cuda::getCurrentCUDAStream();
+  int vec_size = memory::can_vectorize_up_to(data);
+
+  switch (vec_size) {
+    case 4:
+      vectorized_elementwise_kernel<4, func_t, array_t>
+          <<>>(N, f, data);
+      C10_CUDA_KERNEL_LAUNCH_CHECK();
+      break;
+    case 2:
+      vectorized_elementwise_kernel<2, func_t, array_t>
+          <<>>(N, f, data);
+      C10_CUDA_KERNEL_LAUNCH_CHECK();
+      break;
+    case 1: {
+      auto input_calc = TrivialOffsetCalculator();
+      auto output_calc = TrivialOffsetCalculator<1>();
+      auto loader = memory::LoadWithoutCast();
+      auto storer = memory::StoreWithoutCast();
+      unrolled_elementwise_kernel
+          <<>>(
+              N, f, data, input_calc, output_calc, loader, storer);
+      C10_CUDA_KERNEL_LAUNCH_CHECK();
+      break;
+    }
+    default:
+      TORCH_INTERNAL_ASSERT(false, "Unexpected vectorization size");
+  }
+}
+
+template <
+    typename func_t,
+    typename array_t,
+    typename inp_calc_t,
+    typename out_calc_t,
+    typename loader_t,
+    typename storer_t>
+static inline void launch_unrolled_kernel(
+    int64_t N,
+    const func_t& f,
+    array_t data,
+    inp_calc_t ic,
+    out_calc_t oc,
+    loader_t l,
+    storer_t s) {
+  TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max());
+  int64_t grid = (N + block_work_size() - 1) / block_work_size();
+  auto stream = at::cuda::getCurrentCUDAStream();
+  unrolled_elementwise_kernel
+      <<>>(N, f, data, ic, oc, l, s);
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+}
+
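+// Legacy (non-vectorized) elementwise kernel: each block covers nt * vt
+// elements; thread `tid` handles indices nv * blockIdx.x + tid, advancing by nt
+// on each of its vt iterations.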
+template 
+C10_LAUNCH_BOUNDS_2(nt, 4)
+__global__ void elementwise_kernel(int N, func_t f) {
+  int tid = threadIdx.x;
+  int nv = nt * vt;
+  int idx = nv * blockIdx.x + tid;
+#pragma unroll
+  for (int i = 0; i < vt; i++) {
+    if (idx < N) {
+      f(idx);
+      idx += nt;
+    }
+  }
+}
+
+template 
+static void launch_legacy_kernel(int64_t N, const func_t& f) {
+  TORCH_INTERNAL_ASSERT(N >= 0 && N <= std::numeric_limits::max());
+  if (N == 0) {
+    return;
+  }
+  dim3 block(nt);
+  dim3 grid((N + block.x * vt - 1) / (block.x * vt));
+  auto stream = at::cuda::getCurrentCUDAStream();
+  elementwise_kernel<<>>(N, f);
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+}
+
+template 
+C10_HOST_DEVICE typename traits::result_type invoke_impl(
+    const func_t& f,
+    char* const C10_RESTRICT data[],
+    const index_t strides[],
+    int i,
+    std::index_sequence) {
+  (void)strides;
+  (void)i;
+  return f(c10::load::type>(
+      data[INDEX] + i * strides[INDEX])...);
+}
+
+template <
+    typename func_t,
+    typename index_t,
+    typename traits = function_traits>
+C10_HOST_DEVICE typename traits::result_type invoke(
+    const func_t& f,
+    char* const C10_RESTRICT data[],
+    const index_t strides[],
+    int i) {
+  using Indices = std::make_index_sequence;
+  return invoke_impl(f, data, strides, i, Indices{});
+}
+
+template 
+C10_HOST_DEVICE typename traits::result_type invoke_impl(
+    const func_t& f,
+    char* const C10_RESTRICT data[],
+    const index_t strides[],
+    const ScalarType dtypes[],
+    int i,
+    std::index_sequence) {
+  (void)strides;
+  (void)i;
+  return f(c10::fetch_and_cast::type>(
+      dtypes[I], data[I] + i * strides[I])...);
+}
+
+template <
+    typename func_t,
+    typename index_t,
+    typename traits = function_traits>
+C10_HOST_DEVICE typename traits::result_type invoke(
+    const func_t& f,
+    char* const C10_RESTRICT data[],
+    const index_t strides[],
+    const ScalarType dtypes[],
+    int i) {
+  using Indices = std::make_index_sequence;
+  return invoke_impl(f, data, strides, dtypes, i, Indices{});
+}
+
+template 
+void gpu_kernel_impl_nocast(TensorIteratorBase& iter, const func_t& f) {
+  using traits = function_traits;
+  using arg0_t = typename traits::result_type;
+  constexpr int ntensors = traits::arity + 1;
+
+  TORCH_INTERNAL_ASSERT(iter.can_use_32bit_indexing());
+  TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);
+  TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);
+  TORCH_INTERNAL_ASSERT(!needs_dynamic_casting::check(iter));
+
+  at::detail::Array data;
+  for (int i = 0; i < ntensors; i++) {
+    data[i] = (char*)iter.data_ptr(i);
+  }
+
+  int64_t numel = iter.numel();
+
+  bool contiguous = iter.is_contiguous();
+
+  if (contiguous) {
+    return launch_vectorized_kernel(numel, f, data);
+  }
+  auto offset_calc = ::make_offset_calculator(iter);
+  constexpr int unroll_factor = sizeof(arg0_t) >= 4 ? 2 : 4;
+  launch_legacy_kernel<128, unroll_factor>(numel, [=] GPU_LAMBDA(int idx) {
+    auto offsets = offset_calc.get(idx);
+    arg0_t* out = (arg0_t*)(data[0] + offsets[0]);
+    *out = invoke(f, &data.data[1], &offsets.data[1], 1);
+  });
+}
+
+template 
+void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
+  if (!needs_dynamic_casting::check(iter)) {
+    return gpu_kernel_impl_nocast(iter, f);
+  }
+  using traits = function_traits;
+  using arg0_t = typename traits::result_type;
+  constexpr int ntensors = traits::arity + 1;
+
+  TORCH_INTERNAL_ASSERT(iter.can_use_32bit_indexing());
+  TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);
+  TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);
+
+  at::detail::Array data;
+  for (int i = 0; i < ntensors; i++) {
+    data[i] = (char*)iter.data_ptr(i);
+  }
+
+  int64_t numel = iter.numel();
+
+  bool contiguous = iter.is_contiguous();
+
+  if (contiguous) {
+#ifdef USE_ROCM
+    at::detail::Array dtypes;
+    auto inner_strides = iter.get_inner_strides();
+    at::detail::Array strides;
+    for (int i = 0; i < ntensors; i++) {
+      dtypes[i] = iter.dtype(i);
+      strides[i] = inner_strides[i];
+    }
+    launch_legacy_kernel<512, 1>(numel, [=]GPU_LAMBDA(int idx) {
+      void* out = data[0] + strides[0] * idx;
+      arg0_t result = invoke(f, &data.data[1], &strides.data[1], &dtypes.data[1], idx);
+      c10::cast_and_store(dtypes[0], out, result);
+    });
+#else
+    auto loader = memory::LoadWithCast(iter);
+    auto storer = memory::StoreWithCast<1>(iter);
+    auto input_offset_calculator = TrivialOffsetCalculator();
+    auto output_offset_calculator = TrivialOffsetCalculator<1>();
+    launch_unrolled_kernel(
+        numel,
+        f,
+        data,
+        input_offset_calculator,
+        output_offset_calculator,
+        loader,
+        storer);
+#endif
+  } else {
+    at::detail::Array dtypes;
+    for (int i = 0; i < ntensors; i++) {
+      dtypes[i] = iter.dtype(i);
+    }
+    auto offset_calc = ::make_offset_calculator(iter);
+    launch_legacy_kernel<128, 4>(numel, [=] GPU_LAMBDA(int idx) {
+      auto offsets = offset_calc.get(idx);
+      void* out = data[0] + offsets[0];
+      arg0_t result = invoke(f, &data.data[1], &offsets.data[1], &dtypes.data[1], 1);
+      c10::cast_and_store(dtypes[0], out, result);
+    });
+  }
+}
+
+} // namespace native
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/CompositeRandomAccessor.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/CompositeRandomAccessor.h
new file mode 100644
index 0000000000000000000000000000000000000000..f0dc24872e6157de677146db592fe0fed86d51b9
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/CompositeRandomAccessor.h
@@ -0,0 +1,35 @@
+#pragma once
+
+#include 
+#include 
+
+namespace at { namespace native {
+
+struct TupleInfoCPU {
+  template 
+  using tuple = thrust::tuple;
+
+  template 
+  static constexpr auto tie(Types&... args) noexcept {
+    return thrust::tie(args...);
+  }
+};
+
+template 
+using CompositeRandomAccessorCPU =
+  CompositeRandomAccessor;
+
+template 
+void swap(
+  references_holder rh1,
+  references_holder rh2
+) {
+  return thrust::swap(rh1.data(), rh2.data());
+}
+
+template 
+auto get(references_holder rh) -> decltype(thrust::get(rh.data())) {
+  return thrust::get(rh.data());
+}
+
+}} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Copy.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..a9e23ad7fe8d56f7aa18833c371fd3969304e6ed
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Copy.h
@@ -0,0 +1,10 @@
+#pragma once
+
+namespace at {
+struct TensorIteratorBase;
+
+namespace native {
+
+void direct_copy_kernel_cuda(TensorIteratorBase &iter);
+
+}}  // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/CuFFTPlanCache.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/CuFFTPlanCache.h
new file mode 100644
index 0000000000000000000000000000000000000000..116ed029e9e32e7ba27b4c9b5a013cd794c3362d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/CuFFTPlanCache.h
@@ -0,0 +1,494 @@
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace at { namespace native { namespace detail {
+
+// Enum representing the FFT type
+enum class CuFFTTransformType : int8_t {
+  C2C,  // Complex-to-complex
+  R2C,  // Real-to-complex
+  C2R,  // Complex-to-real
+};
+
+// This struct is used to let us easily compute hashes of the
+// parameters.
+// It will be the **key** to the plan cache.
+struct CuFFTParams
+{
+  int64_t signal_ndim_; // between 1 and max_rank, i.e., 1 <= signal_ndim <= 3
+  // These include additional batch dimension as well.
+  int64_t sizes_[max_rank + 1];
+  int64_t input_strides_[max_rank + 1];
+  int64_t output_strides_[max_rank + 1];
+  CuFFTTransformType fft_type_;
+  ScalarType value_type_;
+
+  CuFFTParams() = default;
+
+  CuFFTParams(IntArrayRef in_strides, IntArrayRef out_strides,
+      IntArrayRef signal_sizes, CuFFTTransformType fft_type, ScalarType value_type) {
+    // Padding bits must be zeroed for hashing
+    memset(this, 0, sizeof(*this));
+    signal_ndim_ = signal_sizes.size() - 1;
+    fft_type_ = fft_type;
+    value_type_ = value_type;
+
+    TORCH_INTERNAL_ASSERT(in_strides.size() == signal_sizes.size());
+    TORCH_INTERNAL_ASSERT(out_strides.size() == signal_sizes.size());
+    TORCH_INTERNAL_ASSERT(1 <= signal_ndim_ && signal_ndim_ <= max_rank);
+
+    std::copy(signal_sizes.cbegin(), signal_sizes.cend(), sizes_);
+    std::copy(in_strides.cbegin(), in_strides.cend(), input_strides_);
+    std::copy(out_strides.cbegin(), out_strides.cend(), output_strides_);
+  }
+};
+
+static_assert(std::is_trivial<CuFFTParams>::value, "");
+
+// Returns true if the transform type has complex input
+inline bool cufft_complex_input(CuFFTTransformType type) {
+  switch (type) {
+    case CuFFTTransformType::C2C:
+    case CuFFTTransformType::C2R:
+      return true;
+
+    case CuFFTTransformType::R2C:
+      return false;
+  }
+  TORCH_INTERNAL_ASSERT(false);
+}
+
+// Returns true if the transform type has complex output
+inline bool cufft_complex_output(CuFFTTransformType type) {
+  switch (type) {
+    case CuFFTTransformType::C2C:
+    case CuFFTTransformType::R2C:
+      return true;
+
+    case CuFFTTransformType::C2R:
+      return false;
+  }
+  TORCH_INTERNAL_ASSERT(false);
+}
+
+// Create transform type enum from bools representing if input and output are complex
+inline CuFFTTransformType GetCuFFTTransformType(bool complex_input, bool complex_output) {
+  if (complex_input && complex_output) {
+    return CuFFTTransformType::C2C;
+  } else if (complex_input && !complex_output) {
+    return CuFFTTransformType::C2R;
+  } else if (!complex_input && complex_output) {
+    return CuFFTTransformType::R2C;
+  }
+  TORCH_INTERNAL_ASSERT(false, "Real to real FFTs are not supported");
+}
+
+
+class CuFFTHandle {
+  ::cufftHandle handle_;
+public:
+
+  CuFFTHandle() {
+    CUFFT_CHECK(cufftCreate(&handle_));
+  }
+
+  ::cufftHandle & get() { return handle_; }
+  const ::cufftHandle & get() const { return handle_; }
+
+  ~CuFFTHandle() {
+// Not using fftDestroy() for rocFFT to work around double freeing of handles
+#if !defined(USE_ROCM)
+    cufftDestroy(handle_);
+#endif
+  }
+};
+
+__forceinline__
+static bool is_pow_of_two(int64_t x) {
+  return (x & (x - 1)) == 0;
+}
+
+using cufft_size_type = long long int;
+
+using CuFFTDimVector = c10::SmallVector;
+
+// Struct representing a tensor in CuFFT's data layout for planning transforms
+// See NOTE [ cuFFT Embedded Strides ].
+struct CuFFTDataLayout {
+  CuFFTDimVector embed;
+  cufft_size_type stride, dist;
+  bool must_clone, simple;
+};
+
+// Returns a cufft embedding for a contiguous signal of the given size.
+// e.g. if the input is cloned, this will be the resulting data layout
+// See NOTE [ cuFFT Embedded Strides ].
+inline CuFFTDataLayout cufft_simple_embed(IntArrayRef sizes, bool onesided) {
+  CuFFTDataLayout layout;
+  layout.simple = true;
+  layout.must_clone = false;
+  layout.embed.assign(sizes.cbegin() + 1, sizes.cend());
+  if (onesided) {
+    layout.embed.back() = sizes.back() / 2 + 1;
+  }
+  layout.stride = 1;
+  layout.dist = 1;
+  for (const auto& len : layout.embed) {
+    layout.dist *= len;
+  }
+  return layout;
+}
+
+// Convert strides to a CuFFT embedded representation.
+// If strides cannot be embedded, returns a simple layout and sets must_clone flag
+// See NOTE [ cuFFT Embedded Strides ].
+inline CuFFTDataLayout as_cufft_embed(IntArrayRef strides, IntArrayRef sizes, bool onesided) {
+  const auto signal_ndim = strides.size() - 1;
+  CuFFTDataLayout layout;
+  auto last_stride = strides[signal_ndim];
+  layout.must_clone = (last_stride <= 0);
+
+  const auto last_dim_size = onesided ?
+      sizes[signal_ndim] / 2 + 1 : sizes[signal_ndim];
+  const auto signal_numel = c10::multiply_integers(sizes.slice(1, sizes.size() - 2)) * last_dim_size;
+
+  // Zero strides are not allowed, even if the batch size is one.
+  // If that happens, just set a dummy value.
+  if (sizes[0] == 1) {
+    layout.dist = signal_numel;
+  } else if (strides[0] == 0) {
+    layout.must_clone = true;
+  } else {
+    layout.dist = strides[0];
+  }
+
+  // Calculate the embedding shape, or set must_clone if the strides cannot be embedded
+  layout.embed.resize(signal_ndim);
+  for (auto i = signal_ndim - 1; !layout.must_clone && i > 0; i--) {
+    auto stride = strides[i];
+    if (sizes[i] == 1) {
+      layout.embed[i] = 1;
+    } else if (stride > 0 && stride % last_stride == 0) {
+      layout.embed[i] = stride / last_stride;
+      last_stride = stride;
+    } else {
+      layout.must_clone = true;
+    }
+  }
+
+  if (layout.must_clone) {
+    // If the input needs to be cloned, assume it will be contiguous
+    layout = cufft_simple_embed(sizes, onesided);
+    layout.must_clone = true;
+  } else {
+    layout.embed[0] = sizes[1];
+    layout.stride = strides[signal_ndim];
+    // Determine if layout represents a simple embedding (contiguous data)
+    layout.simple = [&] {
+      for (const auto i : c10::irange(1, signal_ndim - 1)) {
+        if (layout.embed[i] != sizes[i + 1]) {
+          return false;
+        }
+      }
+
+      return (layout.stride == 1 && layout.dist == signal_numel &&
+          layout.embed.back() == last_dim_size);
+    }();
+  }
+  return layout;
+}
+
+// This class contains all the information needed to execute a cuFFT plan:
+//   1. the plan
+//   2. whether to clone input before executing the plan
+//   3. the workspace size needed
+//
+// This class will be the **value** in the plan cache.
+// It **owns** the raw plan via a unique_ptr.
+class CuFFTConfig {
+public:
+
+  // Only move semantics is enough for this class. Although we already use
+  // unique_ptr for the plan, we still remove the copy constructor and assignment
+  // op so we don't accidentally copy and take a perf hit.
+  CuFFTConfig(const CuFFTConfig&) = delete;
+  CuFFTConfig& operator=(CuFFTConfig const&) = delete;
+
+  explicit CuFFTConfig(const CuFFTParams& params):
+      CuFFTConfig(
+          IntArrayRef(params.input_strides_, params.signal_ndim_ + 1),
+          IntArrayRef(params.output_strides_, params.signal_ndim_ + 1),
+          IntArrayRef(params.sizes_, params.signal_ndim_ + 1),
+          params.fft_type_,
+          params.value_type_) {}
+
+  // For complex types, strides are in units of 2 * element_size(dtype);
+  // sizes are for the full signal, including batch size, and are always two-sided.
+  CuFFTConfig(IntArrayRef in_strides, IntArrayRef out_strides,
+      IntArrayRef sizes, CuFFTTransformType fft_type, ScalarType dtype):
+        fft_type_(fft_type), value_type_(dtype) {
+
+    // signal sizes (excluding batch dim)
+    CuFFTDimVector signal_sizes(sizes.begin() + 1, sizes.end());
+
+    // input batch size
+    const int64_t batch = sizes[0];
+    const int64_t signal_ndim = sizes.size() - 1;
+
+    // Since cuFFT has limited non-unit stride support and various constraints, we
+    // use a flag throughout this function to keep track of whether we need to do
+    // `input = input.clone();`
+
+#if defined(USE_ROCM)
+    // clone input to avoid issues with hipfft clobbering the input and failing tests
+    clone_input = true;
+#else
+    clone_input = false;
+#endif
+
+    // For half, non-unit base strides on the real side of real-to-complex and
+    // complex-to-real transforms are not supported. Since our output is always
+    // contiguous, we only need to check the real-to-complex case.
+    if (dtype == ScalarType::Half) {
+      // cuFFT on half requires compute capability of at least SM_53
+      auto dev_prop = at::cuda::getCurrentDeviceProperties();
+      TORCH_CHECK(dev_prop->major >= 5 && !(dev_prop->major == 5 && dev_prop->minor < 3),
+               "cuFFT doesn't support signals of half type with compute "
+               "capability less than SM_53, but the device containing input half "
+               "tensor only has SM_", dev_prop->major, dev_prop->minor);
+      for (const auto i : c10::irange(signal_ndim)) {
+        TORCH_CHECK(is_pow_of_two(sizes[i + 1]),
+            "cuFFT only supports dimensions whose sizes are powers of two when"
+            " computing in half precision, but got a signal size of",
+            sizes.slice(1));
+      }
+      clone_input |= in_strides.back() != 1;
+    }
+
+    CuFFTDataLayout in_layout;
+    if (clone_input) {
+      in_layout = cufft_simple_embed(sizes, fft_type == CuFFTTransformType::C2R);
+    } else {
+      in_layout = as_cufft_embed(in_strides, sizes, fft_type == CuFFTTransformType::C2R);
+    }
+    auto out_layout = as_cufft_embed(out_strides, sizes, fft_type == CuFFTTransformType::R2C);
+    TORCH_INTERNAL_ASSERT(!out_layout.must_clone, "Out strides cannot be represented as CuFFT embedding");
+    clone_input |= in_layout.must_clone;
+
+    // Check if we can take advantage of simple data layout.
+    //
+    // See NOTE [ cuFFT Embedded Strides ] in native/cuda/SpectralOps.cu.
+
+    const bool simple_layout = in_layout.simple && out_layout.simple;
+    cudaDataType itype, otype, exec_type;
+    const auto complex_input = cufft_complex_input(fft_type);
+    const auto complex_output = cufft_complex_output(fft_type);
+    if (dtype == ScalarType::Float) {
+      itype = complex_input ? CUDA_C_32F : CUDA_R_32F;
+      otype = complex_output ? CUDA_C_32F : CUDA_R_32F;
+      exec_type = CUDA_C_32F;
+    } else if (dtype == ScalarType::Double) {
+      itype = complex_input ? CUDA_C_64F : CUDA_R_64F;
+      otype = complex_output ? CUDA_C_64F : CUDA_R_64F;
+      exec_type = CUDA_C_64F;
+    } else if (dtype == ScalarType::Half) {
+      itype = complex_input ? CUDA_C_16F : CUDA_R_16F;
+      otype = complex_output ? CUDA_C_16F : CUDA_R_16F;
+      exec_type = CUDA_C_16F;
+    } else {
+      TORCH_CHECK(false, "cuFFT doesn't support tensor of type: ", dtype);
+    }
+
+    // disable auto allocation of workspace to use THC allocator
+    CUFFT_CHECK(cufftSetAutoAllocation(plan(), /* autoAllocate */ 0));
+
+    size_t ws_size_t;
+
+    // make plan
+    if (simple_layout) {
+      // For unit-stride layouts, we tell cuFFT by setting inembed == onembed == NULL.
+      // In that case, cuFFT ignores istride, ostride, idist, and odist
+      // by assuming istride = ostride = 1.
+      //
+      // See NOTE [ cuFFT Embedded Strides ] in native/cuda/SpectralOps.cu.
+      CUFFT_CHECK(cufftXtMakePlanMany(plan(), signal_ndim, signal_sizes.data(),
+        /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype,
+        /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype,
+        batch, &ws_size_t, exec_type));
+    } else {
+      CUFFT_CHECK(cufftXtMakePlanMany(plan(), signal_ndim, signal_sizes.data(),
+            in_layout.embed.data(), in_layout.stride, in_layout.dist, itype,
+            out_layout.embed.data(), out_layout.stride, out_layout.dist, otype,
+            batch, &ws_size_t, exec_type));
+    }
+    ws_size = static_cast(ws_size_t);
+  }
+
+  const cufftHandle &plan() const { return plan_ptr.get(); }
+
+  CuFFTTransformType transform_type() const { return fft_type_; }
+  ScalarType data_type() const { return value_type_; }
+  bool should_clone_input() const { return clone_input; }
+  int64_t workspace_size() const { return ws_size; }
+
+private:
+  CuFFTHandle plan_ptr;
+  bool clone_input;
+  int64_t ws_size;
+  CuFFTTransformType fft_type_;
+  ScalarType value_type_;
+};
+
+#if defined(USE_ROCM)
+  // Note that the max plan number for CUDA version < 10 has to be 1023
+  // due to a bug that fails on the 1024th plan
+  constexpr int64_t CUFFT_MAX_PLAN_NUM = 1023;
+  constexpr int64_t CUFFT_DEFAULT_CACHE_SIZE = CUFFT_MAX_PLAN_NUM;
+#else
+  constexpr int64_t CUFFT_MAX_PLAN_NUM = std::numeric_limits::max();
+  // The default max cache size chosen for CUDA version > 10 is arbitrary.
+  // This number puts a limit on how big of a plan cache we maintain by
+  // default. Users can always configure it via cufft_set_plan_cache_max_size.
+  constexpr int64_t CUFFT_DEFAULT_CACHE_SIZE = 4096;
+#endif
+static_assert(0 <= CUFFT_MAX_PLAN_NUM && CUFFT_MAX_PLAN_NUM <= std::numeric_limits::max(),
+              "CUFFT_MAX_PLAN_NUM not in size_t range");
+static_assert(CUFFT_DEFAULT_CACHE_SIZE >= 0 && CUFFT_DEFAULT_CACHE_SIZE <= CUFFT_MAX_PLAN_NUM,
+              "CUFFT_DEFAULT_CACHE_SIZE not in [0, CUFFT_MAX_PLAN_NUM] range");
+
+// This cache assumes that the mapping from key to value never changes.
+// This is **NOT** thread-safe. Please use a mutex when using it **AND** the
+// value returned from lookup().
+// The contract of using this cache is that lookup() should only be
+// used when the max_size is positive.
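+//
+// Illustrative usage sketch (assuming the caller holds a cache instance):
+//
+//   std::lock_guard<std::mutex> guard(plan_cache.mutex);
+//   const CuFFTConfig& config = plan_cache.lookup(params);
+//   // keep holding the lock for as long as `config` is used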
+class CuFFTParamsLRUCache {
+public:
+  using kv_t = typename std::pair;
+  using map_t = typename std::unordered_map,
+                                            typename std::list::iterator,
+                                            ParamsHash,
+                                            ParamsEqual>;
+  using map_kkv_iter_t = typename map_t::iterator;
+
+
+  CuFFTParamsLRUCache() : CuFFTParamsLRUCache(CUFFT_DEFAULT_CACHE_SIZE) {}
+
+  CuFFTParamsLRUCache(int64_t max_size) {
+    _set_max_size(max_size);
+  }
+
+  CuFFTParamsLRUCache(CuFFTParamsLRUCache&& other) noexcept :
+    _usage_list(std::move(other._usage_list)),
+    _cache_map(std::move(other._cache_map)),
+    _max_size(other._max_size) {}
+
+  CuFFTParamsLRUCache& operator=(CuFFTParamsLRUCache&& other) noexcept {
+    _usage_list = std::move(other._usage_list);
+    _cache_map = std::move(other._cache_map);
+    _max_size = other._max_size;
+    return *this;
+  }
+
+  // If key is in this cache, return the cached config. Otherwise, emplace the
+  // config in this cache and return it.
+  // Return const reference because CuFFTConfig shouldn't be tampered with once
+  // created.
+  const CuFFTConfig &lookup(CuFFTParams params) {
+    AT_ASSERT(_max_size > 0);
+
+    map_kkv_iter_t map_it = _cache_map.find(params);
+    // Hit, put to list front
+    if (map_it != _cache_map.end()) {
+      _usage_list.splice(_usage_list.begin(), _usage_list, map_it->second);
+      return map_it->second->second;
+    }
+
+    // Miss
+    // remove if needed
+    if (_usage_list.size() >= _max_size) {
+      auto last = _usage_list.end();
+      last--;
+      _cache_map.erase(last->first);
+      _usage_list.pop_back();
+    }
+
+    // construct new plan at list front, then insert into _cache_map
+    _usage_list.emplace_front(std::piecewise_construct,
+                       std::forward_as_tuple(params),
+                       std::forward_as_tuple(params));
+    auto kv_it = _usage_list.begin();
+    _cache_map.emplace(std::piecewise_construct,
+                std::forward_as_tuple(kv_it->first),
+                std::forward_as_tuple(kv_it));
+    return kv_it->second;
+  }
+
+  void clear() {
+    _cache_map.clear();
+    _usage_list.clear();
+  }
+
+  void resize(int64_t new_size) {
+    _set_max_size(new_size);
+    auto cur_size = _usage_list.size();
+    if (cur_size > _max_size) {
+      auto delete_it = _usage_list.end();
+      for (size_t i = 0; i < cur_size - _max_size; i++) {
+        delete_it--;
+        _cache_map.erase(delete_it->first);
+      }
+      _usage_list.erase(delete_it, _usage_list.end());
+    }
+  }
+
+  size_t size() const { return _cache_map.size(); }
+
+  size_t max_size() const noexcept { return _max_size; }
+
+  std::mutex mutex;
+
+private:
+  // Only sets size and does value check. Does not resize the data structures.
+  void _set_max_size(int64_t new_size) {
+    // We check that 0 <= new_size <= CUFFT_MAX_PLAN_NUM here. Since
+    // CUFFT_MAX_PLAN_NUM is of type size_t, we need to do non-negativity check
+    // first.
+    TORCH_CHECK(new_size >= 0,
+             "cuFFT plan cache size must be non-negative, but got ", new_size);
+    TORCH_CHECK(new_size <= CUFFT_MAX_PLAN_NUM,
+             "cuFFT plan cache size can not be larger than ", CUFFT_MAX_PLAN_NUM, ", but got ", new_size);
+    _max_size = static_cast(new_size);
+  }
+
+  std::list _usage_list;
+  map_t _cache_map;
+  size_t _max_size;
+};
+
+// Since ATen is separated into CPU build and CUDA build, we need a way to call
+// these functions only when CUDA is loaded. We use CUDA hooks for this purpose
+// (at cuda/detail/CUDAHooks.cpp), and call the hooked functions from the actual
+// native function counterparts (at native/SpectralOps.cpp), i.e.,
+// _cufft_get_plan_cache_max_size, _cufft_set_plan_cache_max_size
+// _cufft_get_plan_cache_size, and _cufft_clear_plan_cache.
+int64_t cufft_get_plan_cache_max_size_impl(DeviceIndex device_index);
+void cufft_set_plan_cache_max_size_impl(DeviceIndex device_index, int64_t max_size);
+int64_t cufft_get_plan_cache_size_impl(DeviceIndex device_index);
+void cufft_clear_plan_cache_impl(DeviceIndex device_index);
+
+}}} // namespace at::native::detail
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/CuFFTUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/CuFFTUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..da5f79d8249621cde70647293e6d841eec23610f
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/CuFFTUtils.h
@@ -0,0 +1,73 @@
+#pragma once
+
+#include <ATen/Config.h>
+
+#include <string>
+#include <stdexcept>
+#include <sstream>
+#include <cufft.h>
+#include <cufftXt.h>
+
+namespace at { namespace native {
+
+// This means that max dim is 3 + 2 = 5 with batch dimension and possible
+// complex dimension
+constexpr int max_rank = 3;
+
+static inline std::string _cudaGetErrorEnum(cufftResult error)
+{
+  switch (error)
+  {
+    case CUFFT_SUCCESS:
+      return "CUFFT_SUCCESS";
+    case CUFFT_INVALID_PLAN:
+      return "CUFFT_INVALID_PLAN";
+    case CUFFT_ALLOC_FAILED:
+      return "CUFFT_ALLOC_FAILED";
+    case CUFFT_INVALID_TYPE:
+      return "CUFFT_INVALID_TYPE";
+    case CUFFT_INVALID_VALUE:
+      return "CUFFT_INVALID_VALUE";
+    case CUFFT_INTERNAL_ERROR:
+      return "CUFFT_INTERNAL_ERROR";
+    case CUFFT_EXEC_FAILED:
+      return "CUFFT_EXEC_FAILED";
+    case CUFFT_SETUP_FAILED:
+      return "CUFFT_SETUP_FAILED";
+    case CUFFT_INVALID_SIZE:
+      return "CUFFT_INVALID_SIZE";
+    case CUFFT_UNALIGNED_DATA:
+      return "CUFFT_UNALIGNED_DATA";
+    case CUFFT_INCOMPLETE_PARAMETER_LIST:
+      return "CUFFT_INCOMPLETE_PARAMETER_LIST";
+    case CUFFT_INVALID_DEVICE:
+      return "CUFFT_INVALID_DEVICE";
+    case CUFFT_PARSE_ERROR:
+      return "CUFFT_PARSE_ERROR";
+    case CUFFT_NO_WORKSPACE:
+      return "CUFFT_NO_WORKSPACE";
+    case CUFFT_NOT_IMPLEMENTED:
+      return "CUFFT_NOT_IMPLEMENTED";
+#if !defined(USE_ROCM)
+    case CUFFT_LICENSE_ERROR:
+      return "CUFFT_LICENSE_ERROR";
+#endif
+    case CUFFT_NOT_SUPPORTED:
+      return "CUFFT_NOT_SUPPORTED";
+    default:
+      std::ostringstream ss;
+      ss << "unknown error " << error;
+      return ss.str();
+  }
+}
+
+static inline void CUFFT_CHECK(cufftResult error)
+{
+  if (error != CUFFT_SUCCESS) {
+    std::ostringstream ss;
+    ss << "cuFFT error: " << _cudaGetErrorEnum(error);
+    AT_ERROR(ss.str());
+  }
+}
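+
+// Illustrative use only (assumes the standard cuFFT C API): wrapping a plan
+// creation call surfaces the readable error name on failure, e.g.
+//   cufftHandle plan;
+//   CUFFT_CHECK(cufftPlan1d(&plan, 1024, CUFFT_C2C, /*batch=*/1));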
+
+}} // at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/DeviceSqrt.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/DeviceSqrt.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..b7f10c697d44436a55e88a836896729d6c5cea29
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/DeviceSqrt.cuh
@@ -0,0 +1,25 @@
+#pragma once
+
+namespace at { namespace native {
+#if defined(USE_ROCM)
+// take these out when ROCm implements std:: math functions
+#include <math.h>
+template <typename scalar_t>
+static __forceinline__ __device__ scalar_t device_sqrt(scalar_t val);
+
+template <>
+__forceinline__ __device__ float device_sqrt(float val) {
+  return ::sqrtf(val);
+}
+
+template <>
+__forceinline__ __device__ double device_sqrt(double val) {
+  return ::sqrt(val);
+}
+#else
+template<typename scalar_t>
+__forceinline__ __device__ double device_sqrt(scalar_t val) {
+  return std::sqrt(val);
+}
+#endif
+}}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/DistributionTemplates.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/DistributionTemplates.h
new file mode 100644
index 0000000000000000000000000000000000000000..3f04779f737ba159c0ec3cbcfbae6874b05452ea
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/DistributionTemplates.h
@@ -0,0 +1,672 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+namespace native {
+namespace {
+
+// launch bounds used for kernels utilizing TensorIterator
+const uint32_t block_size_bound = 256;
+const uint32_t grid_size_bound = 4;
+// number of randoms given by distributions like curand_uniform4, curand_uniform2_double
+// used in calculating philox offset.
+const uint32_t curand4_engine_calls = 4;
+
+// utility function that calculates proper philox_offset
+// for distributions utilizing TensorIterator. For distributions using
+// TensorIterator, we are using a grid-stride loop with each
+// thread yielding one element per thread. For the edge of the grid-stride
+// loop, if the tensor size is large, the unroll loop will kick in and the float4
+// from curand4 will start getting utilized (for common tensor sizes, we end up
+// using rand.x from each thread). Hence, the philox_offset is
+// (number of elements per thread * number of engine calls), which makes
+// sure that philox offset increment is not less than the number of randoms used
+// in each thread.
+std::tuple<uint64_t, dim3, dim3> calc_execution_policy(int64_t total_elements) {
+  const uint64_t numel = static_cast<uint64_t>(total_elements);
+  const uint32_t block_size = block_size_bound;
+  const uint32_t unroll = curand4_engine_calls;
+  dim3 dim_block(block_size);
+  dim3 grid((numel + block_size - 1) / block_size);
+  uint32_t blocks_per_sm = at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor / block_size;
+  grid.x = std::min(
+      static_cast<uint32_t>(at::cuda::getCurrentDeviceProperties()->multiProcessorCount) * blocks_per_sm,
+      grid.x);
+  //number of times random will be generated per thread, to offset philox counter in thc random state
+  uint64_t counter_offset = ((numel - 1) / (block_size * grid.x * unroll) + 1)
+                                * curand4_engine_calls;
+  return std::make_tuple(counter_offset, grid, dim_block);
+}
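+
+// Worked example (numbers are illustrative, not measured): for numel = 2^20,
+// block_size = 256 and unroll = 4 on a hypothetical GPU with 54 SMs and 2048
+// threads per SM, blocks_per_sm = 8 and grid.x is capped at 54 * 8 = 432, so
+// counter_offset = ((2^20 - 1) / (256 * 432 * 4) + 1) * 4 = (2 + 1) * 4 = 12.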
+
+// grid stride loop kernel for distributions
+template<typename accscalar_t, int unroll_factor, typename dist_t, typename transform_t>
+C10_LAUNCH_BOUNDS_2(block_size_bound, grid_size_bound)
+__global__ void distribution_elementwise_grid_stride_kernel(int numel,
+                                                            PhiloxCudaState philox_args,
+                                                            const dist_t dist_func,
+                                                            const transform_t transform_func) {
+  auto seeds = at::cuda::philox::unpack(philox_args);
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  curandStatePhilox4_32_10_t state;
+  curand_init(std::get<0>(seeds),
+              idx,
+              std::get<1>(seeds),
+              &state);
+
+  int rounded_size = ((numel - 1)/(blockDim.x * gridDim.x * unroll_factor)+1) *
+      blockDim.x * gridDim.x * unroll_factor;
+  for(int linear_index = idx; linear_index < rounded_size; linear_index += blockDim.x * gridDim.x * unroll_factor) {
+    auto rand = dist_func(&state);
+    #pragma unroll
+    for (int ii = 0; ii < unroll_factor; ii++) {
+      int li = linear_index + blockDim.x * gridDim.x * ii;
+      if (li < numel) {
+        transform_func(li, static_cast<accscalar_t>((&rand.x)[ii]));
+      }
+    }
+    __syncthreads();
+  }
+}
+
+/**
+ * distribution_nullary_kernel is analogous to gpu_kernel in
+ * ATen/native/cuda/Loops.cuh. Like gpu_kernel, it uses
+ * TensorIterator to launch a kernel. However, the differences are
+ *   - it launches a grid-stride loop based kernel. The kernel is not
+ *     generic like elementwise_kernel in Loops.cuh and is specialized
+ *     for the distribution kernels here.
+ *   - For big size tensors, we can launch multiple kernels recursively
+ *     (i.e. if (!iter.can_use_32bit_indexing())) and hence, the philox
+ *     offset calculation is done in this function.
+ *
+ * FIXME: Can we specialize elementwise_kernel and launch_kernel in Loops.cuh
+ * to have grid-stride loop kernel and then use that to launch our distribution
+ * kernels? Note that we need a grid-stride loop kernel because, we found by testing
+ * that it achieves peak effective bandwidth.
+ */
+template<typename scalar_t, typename accscalar_t, int unroll_factor, typename RNG, typename dist_t, typename transform_t>
+void distribution_nullary_kernel(at::TensorIteratorBase& iter,
+                                 RNG gen,
+                                 const dist_t& dist_func,
+                                 const transform_t transform_func) {
+  static_assert(unroll_factor >= 1, "unroll_factor must be >= 1.");
+  int64_t numel = iter.numel();
+  if (numel == 0) {
+    return;
+  }
+
+  auto execution_policy = calc_execution_policy(numel);
+  auto counter_offset = std::get<0>(execution_policy);
+  auto grid = std::get<1>(execution_policy);
+  auto block = std::get<2>(execution_policy);
+  PhiloxCudaState rng_engine_inputs;
+  {
+    // See Note [Acquire lock when using random generators]
+    std::lock_guard<std::mutex> lock(gen->mutex_);
+    rng_engine_inputs = gen->philox_cuda_state(counter_offset);
+  }
+
+  if (!iter.can_use_32bit_indexing()) {
+    for (auto& sub_iter : iter.with_32bit_indexing()) {
+      distribution_nullary_kernel<scalar_t, accscalar_t, unroll_factor>(sub_iter,
+        gen, dist_func, transform_func);
+    }
+    return;
+  }
+
+  char* out_data = (char*)iter.data_ptr(0);
+
+  auto stream = at::cuda::getCurrentCUDAStream();
+  if (iter.is_trivial_1d()) {
+    auto strides = iter.get_inner_strides();
+    int stride0 = strides[0];
+    distribution_elementwise_grid_stride_kernel<accscalar_t, unroll_factor><<<grid, block, 0, stream>>>(
+      numel,
+      rng_engine_inputs,
+      dist_func,
+      [=]__device__(int idx, accscalar_t rand) {
+        scalar_t* out = (scalar_t*)&out_data[stride0 * idx];
+        *out = transform_func(rand);
+      }
+    );
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+  } else {
+    auto offset_calc = make_offset_calculator<1>(iter);
+    distribution_elementwise_grid_stride_kernel<accscalar_t, unroll_factor><<<grid, block, 0, stream>>>(
+      numel,
+      rng_engine_inputs,
+      dist_func,
+      [=]__device__(int idx, accscalar_t rand) {
+        auto offsets = offset_calc.get(idx);
+        scalar_t* out = (scalar_t*)&out_data[offsets[0]];
+        *out = transform_func(rand);
+      }
+    );
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+  }
+}
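+
+// Call sketch (illustrative only; the template argument order assumed here is
+// <scalar_t, accscalar_t, unroll_factor>, matching the declaration above): a
+// plain uniform fill of a float iterator could look like
+//   distribution_nullary_kernel<float, float, curand4_engine_calls>(
+//       iter, gen,
+//       [] __device__ (curandStatePhilox4_32_10_t* s) { return curand_uniform4(s); },
+//       [] __device__ (float rand) { return rand; });
+// The concrete kernels further below follow exactly this pattern.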
+
+// Binary kernel
+template <typename func_t, typename inp_offset_calc_t, typename out_offset_calc_t>
+__global__ void distribution_binary_elementwise_kernel(
+    int numel,
+    func_t f,
+    PhiloxCudaState philox_args,
+    typename function_traits<func_t>::result_type *output_data,
+    const typename function_traits<func_t>::template arg<1>::type *input_data_1,
+    const typename function_traits<func_t>::template arg<2>::type *input_data_2,
+    inp_offset_calc_t inp_calc,
+    out_offset_calc_t out_calc) {
+  auto seeds = at::cuda::philox::unpack(philox_args);
+
+  using input_t_1 = typename function_traits<func_t>::template arg<1>::type;
+  using input_t_2 = typename function_traits<func_t>::template arg<2>::type;
+
+  input_t_1 inputs_1[thread_work_size()];
+  input_t_2 inputs_2[thread_work_size()];
+
+  int base_index = block_work_size() * blockIdx.x;
+  int remaining = std::min(numel - base_index, block_work_size());
+
+  curandStatePhilox4_32_10_t state;
+  curand_init(std::get<0>(seeds),
+              blockIdx.x * blockDim.x + threadIdx.x,
+              std::get<1>(seeds),
+              &state);
+
+  // load data into registers
+  int thread_idx = threadIdx.x;
+  #pragma unroll
+  for (int i = 0; i < thread_work_size(); i++) {
+    if (thread_idx >= remaining) {
+      break;
+    }
+    int input_idx = thread_idx + base_index;
+    auto offsets = inp_calc.get(input_idx);
+    inputs_1[i] = input_data_1[offsets[0]];
+    inputs_2[i] = input_data_2[offsets[1]];
+
+    thread_idx += num_threads();
+  }
+
+  // compute and store
+  thread_idx = threadIdx.x;
+  #pragma unroll
+  for (int i = 0; i < thread_work_size(); i++) {
+    if (thread_idx >= remaining) {
+      break;
+    }
+    int input_idx = thread_idx + base_index;
+    auto offsets = out_calc.get(input_idx);
+    output_data[offsets[0]] = f(state, inputs_1[i], inputs_2[i]);
+    thread_idx += num_threads();
+  }
+}
+
+template <typename func_t>
+void distribution_binary_kernel(TensorIteratorBase &iter, PhiloxCudaState philox_args, const func_t &f) {
+  static_assert(std::is_same<typename function_traits<func_t>::template arg<0>::type, curandStatePhilox4_32_10_t&>::value, "the first argument of functor must be curandStatePhilox4_32_10_t");
+  using input_t_1 = typename function_traits<func_t>::template arg<1>::type;
+  using input_t_2 = typename function_traits<func_t>::template arg<2>::type;
+  using output_t = typename function_traits<func_t>::result_type;
+
+  if (!iter.can_use_32bit_indexing()) {
+    for (auto& sub_iter : iter.with_32bit_indexing()) {
+      distribution_binary_kernel(sub_iter, philox_args, f);
+    }
+    return;
+  }
+
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(iter.can_use_32bit_indexing());
+
+  int64_t numel = iter.numel();
+  if (numel == 0) {
+    return;
+  }
+
+  output_t *output_data = static_cast<output_t *>(iter.data_ptr(0));
+  const input_t_1 *input_data_1 = static_cast<const input_t_1 *>(iter.data_ptr(1));
+  const input_t_2 *input_data_2 = static_cast<const input_t_2 *>(iter.data_ptr(2));
+
+  int64_t grid = (numel + block_work_size() - 1) / block_work_size();
+  auto stream = at::cuda::getCurrentCUDAStream();
+
+  if (iter.is_contiguous()) {
+    distribution_binary_elementwise_kernel<<<grid, num_threads(), 0, stream>>>(
+        numel, f, philox_args, output_data, input_data_1, input_data_2,
+        TrivialOffsetCalculator<2>(), TrivialOffsetCalculator<1>());
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+  } else {
+    distribution_binary_elementwise_kernel<<<grid, num_threads(), 0, stream>>>(
+        numel, f, philox_args, output_data, input_data_1, input_data_2,
+        make_input_offset_calculator<2>(iter), make_output_offset_calculator(iter));
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+  }
+}
+
+} // namespace
+}} // namespace at::native
+
+
+namespace at {
+namespace native {
+namespace templates {
+namespace cuda {
+
+// ==================================================== Random ========================================================
+
+template
+void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, RNG gen) {
+  AT_DISPATCH_V2(iter.dtype(), "random_from_to_kernel_cuda", AT_WRAP([&] {
+    if ((
+      std::is_same<scalar_t, int64_t>::value ||
+      std::is_same<scalar_t, double>::value ||
+      std::is_same<scalar_t, float>::value ||
+      std::is_same<scalar_t, at::BFloat16>::value) && range >= 1ULL << 32)
+    {
+      // define lambda to mod with range and add base
+      auto random_func = [range, base] __device__ (uint64_t rand) {
+        return transformation::uniform_int_from_to(rand, range, base);
+      };
+      distribution_nullary_kernel(iter,
+        gen,
+        [] __device__ (curandStatePhilox4_32_10_t* state) -> ulonglong2 {
+          ulonglong2 ret;
+          uint4 rand_val = curand4(state);
+          ret.x = (static_cast<uint64_t>(rand_val.x) << 32) | rand_val.y;
+          ret.y = (static_cast<uint64_t>(rand_val.z) << 32) | rand_val.w;
+          return ret;
+        },
+        random_func);
+    } else {
+      auto random_func = [range, base] __device__ (uint32_t rand) {
+        return transformation::uniform_int_from_to(rand, range, base);
+      };
+      distribution_nullary_kernel(iter,
+        gen,
+        [] __device__ (curandStatePhilox4_32_10_t* state) {
+          return curand4(state);
+        },
+        random_func);
+    }
+   }), AT_EXPAND(AT_ALL_TYPES), kBool, kHalf, kBFloat16, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES));
+}
+
+// This is the special kernel to handle single specific case:
+// from(inclusive) = std::numeric_limits::lowest()
+// to(exclusive) = None (= std::numeric_limits::max() + 1)
+template
+void random_full_64_bits_range_kernel(TensorIteratorBase& iter, RNG gen) {
+  AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::BFloat16, iter.dtype(), "random_full_64_bits_range_kernel_cuda", [&] {
+    if (std::is_same<scalar_t, int64_t>::value ||
+        std::is_same<scalar_t, double>::value ||
+        std::is_same<scalar_t, float>::value ||
+        std::is_same<scalar_t, at::BFloat16>::value) {
+      auto random_func = [] __device__ (uint64_t rand) {
+        return transformation::uniform_int_full_range(rand);
+      };
+      distribution_nullary_kernel(iter,
+        gen,
+        [] __device__ (curandStatePhilox4_32_10_t* state) -> ulonglong2 {
+          ulonglong2 ret;
+          uint4 rand_val = curand4(state);
+          ret.x = (static_cast<uint64_t>(rand_val.x) << 32) | rand_val.y;
+          ret.y = (static_cast<uint64_t>(rand_val.z) << 32) | rand_val.w;
+          return ret;
+        },
+        random_func);
+    } else {
+      TORCH_CHECK(false, "random_full_64_bits_range_kernel_cuda handles only int64, double, float and bfloat16");
+    }
+  });
+}
+
+template
+struct RandomFromToKernel {
+  void operator()(TensorIteratorBase& iter, uint64_t range, int64_t base, c10::optional gen) {
+    random_from_to_kernel(iter, range, base, check_generator(gen));
+  }
+  void operator()(TensorIteratorBase& iter, c10::optional gen) {
+    random_full_64_bits_range_kernel(iter, check_generator(gen));
+  }
+};
+
+template
+void random_kernel(TensorIteratorBase& iter, RNG gen) {
+  AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, iter.dtype(), "random_kernel_cuda", [&] {
+    if (std::is_same<scalar_t, double>::value || std::is_same<scalar_t, int64_t>::value) {
+      auto random_func = [] __device__ (uint64_t rand) {
+        return transformation::uniform_int(rand);
+      };
+      distribution_nullary_kernel(iter, gen,
+        [] __device__ (curandStatePhilox4_32_10_t* state) -> ulonglong2 {
+          ulonglong2 ret;
+          uint4 rand_val = curand4(state);
+          ret.x = (static_cast<uint64_t>(rand_val.x) << 32) | rand_val.y;
+          ret.y = (static_cast<uint64_t>(rand_val.z) << 32) | rand_val.w;
+          return ret;
+        },
+        random_func);
+    } else {
+      auto random_func = [] __device__ (uint32_t rand) {
+        return transformation::uniform_int(rand);
+      };
+      distribution_nullary_kernel(iter,
+        gen,
+        [] __device__ (curandStatePhilox4_32_10_t* state) {
+          return curand4(state);
+        },
+        random_func);
+    }
+  });
+}
+
+template
+struct RandomKernel {
+  void operator()(TensorIteratorBase& iter, RNG gen) {
+    random_kernel(iter, gen);
+  }
+};
+
+// ====================================================================================================================
+
+template
+void uniform_and_transform(TensorIteratorBase& iter, RNG gen, transform_t transform) {
+  if (std::is_same<scalar_t, double>::value) {
+    distribution_nullary_kernel(iter,
+      gen,
+      [] __device__ (curandStatePhilox4_32_10_t* state) { return curand_uniform2_double(state); },
+      transform);
+  } else {
+    distribution_nullary_kernel(iter,
+      gen,
+      [] __device__ (curandStatePhilox4_32_10_t* state) { return curand_uniform4(state); },
+      transform);
+  }
+}
+
+template
+void normal_and_transform(TensorIteratorBase& iter, RNG gen, transform_t transform) {
+  if (std::is_same<scalar_t, double>::value) {
+    distribution_nullary_kernel(iter,
+      gen,
+      [] __device__ (curandStatePhilox4_32_10_t* state) { return curand_normal2_double(state); },
+      transform);
+  } else {
+    distribution_nullary_kernel(iter,
+      gen,
+      [] __device__ (curandStatePhilox4_32_10_t* state) { return curand_normal4(state); },
+      transform);
+  }
+}
+
+// ==================================================== Normal ========================================================
+
+template
+void normal_kernel(const TensorBase &self, double mean_, double std_, RNG gen) {
+  auto iter = TensorIterator::borrowing_nullary_op(self);
+  AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "normal_kernel_cuda", [&] {
+    using accscalar_t = at::acc_type<scalar_t, true>;
+    auto mean = static_cast<accscalar_t>(mean_);
+    auto std = static_cast<accscalar_t>(std_);
+    // define lambda to multiply std and add mean
+    auto normal_func = [mean, std] __device__ (accscalar_t rand) {
+      return static_cast<scalar_t>(transformation::normal<accscalar_t>(rand, mean, std));
+    };
+    normal_and_transform(iter, gen, normal_func);
+   });
+}
+
+template
+struct NormalKernel {
+  void operator()(const TensorBase &self, double mean, double std, c10::optional gen) {
+    normal_kernel(self, mean, std, check_generator(gen));
+  }
+};
+
+// ==================================================== Uniform ========================================================
+
+template
+void uniform_kernel(TensorIteratorBase& iter, double from_, double to_, RNG gen) {
+  AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "uniform_kernel_cuda", [&] {
+    auto from = static_cast<scalar_t>(from_);
+    auto to = static_cast<scalar_t>(to_);
+    using opmath_t = at::opmath_type<scalar_t>;
+    auto range = static_cast<opmath_t>(to-from);
+    // define lambda to reverse bounds, multiply 'range' and add 'from_'
+    auto uniform_func = [range, from, to] __device__ (opmath_t rand) {
+      // Compute output value before reversing the bounds
+      // BEFORE TOUCHING THIS CODE READ: https://github.com/pytorch/pytorch/issues/96947
+      auto value = static_cast<scalar_t>(rand * range + from);
+      // reverse the bounds of curand4 from (0, 1] to [0, 1)
+      // Note that this method is from legacy THCTensorRandom and is likely to give
+      // you more 0-s, since, the probability of gettings 1-s is higher than 0-s and
+      // by reversing the bounds, we are flipping the probabilities of 1-s and 0-s.
+      // BEFORE TOUCHING THIS CODE READ: https://github.com/pytorch/pytorch/issues/16706
+      auto reverse_bound_value = value == to ? from : value;
+      return reverse_bound_value;
+    };
+    uniform_and_transform(iter, gen, uniform_func);
+   });
+}
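+
+// Worked example of the bound reversal above (illustrative): curand_uniform4
+// draws from (0, 1], so with from = 0.0 and to = 1.0 a draw of exactly 1.0
+// maps to value == to, which is then returned as `from`; every other draw is
+// returned unchanged, giving outputs in the half-open interval [from, to).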
+
+template
+struct UniformKernel {
+  void operator()(TensorIteratorBase& iter, double from, double to, c10::optional gen) {
+    uniform_kernel(iter, from, to, check_generator(gen));
+  }
+};
+
+// ================================================== LogNormal =======================================================
+
+template
+void log_normal_kernel(TensorIteratorBase& iter, double mean_, double std_, RNG gen) {
+  AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "log_normal_cuda", [&] {
+    using accscalar_t = at::acc_type<scalar_t, true>;
+    auto mean = static_cast<accscalar_t>(mean_);
+    auto std = static_cast<accscalar_t>(std_);
+    // define lambda for log_normal transformation
+    auto log_normal_func = [mean, std] __device__ (accscalar_t rand) {
+      return static_cast<scalar_t>(transformation::log_normal<accscalar_t>(transformation::normal<accscalar_t>(rand, mean, std)));
+    };
+    normal_and_transform(iter, gen, log_normal_func);
+   });
+}
+
+template
+struct LogNormalKernel {
+  void operator()(TensorIteratorBase& iter, double mean, double std, c10::optional gen) {
+    log_normal_kernel(iter, mean, std, check_generator(gen));
+  }
+};
+
+// =================================================== Geometric ======================================================
+
+template
+void geometric_kernel(TensorIteratorBase& iter, double p, RNG gen) {
+  AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "geometric_cuda", [&] {
+    using accscalar_t = at::DiscreteDistributionType<scalar_t>::type;
+    // define lambda for geometric transformation
+    auto geometric_func = [p] __device__ (accscalar_t rand) {
+      return static_cast<scalar_t>(transformation::geometric<accscalar_t>(rand, p));
+    };
+    uniform_and_transform(iter, gen, geometric_func);
+  });
+}
+
+template
+struct GeometricKernel {
+  void operator()(TensorIteratorBase& iter, double p, c10::optional gen) {
+    geometric_kernel(iter, p, check_generator(gen));
+  }
+};
+
+// ================================================== Exponential =====================================================
+
+template
+void exponential_kernel(TensorIteratorBase& iter, double lambda_, RNG gen) {
+  TORCH_CHECK(isFloatingType(iter.dtype()), "Exponential distribution is a continuous probability distribution. dtype must be a floating point but you specified ", iter.dtype());
+  AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "exponential_cuda", [&] {
+    using accscalar_t = at::acc_type<scalar_t, true>;
+    auto lambda = static_cast<accscalar_t>(lambda_);
+    // define lambda for exponential transformation
+    auto exponential_func = [lambda] __device__ (accscalar_t rand) {
+      return static_cast<scalar_t>(transformation::exponential<accscalar_t>(rand, lambda));
+    };
+    uniform_and_transform(iter, gen, exponential_func);
+   });
+}
+
+template
+struct ExponentialKernel {
+  void operator()(TensorIteratorBase& iter, double lambda, c10::optional gen) {
+    exponential_kernel(iter, lambda, check_generator(gen));
+  }
+};
+
+// ==================================================== Cauchy ========================================================
+
+template
+void cauchy_kernel(TensorIteratorBase& iter, double median_, double sigma_, RNG gen) {
+  AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "cauchy_cuda", [&] {
+    using accscalar_t = at::acc_type<scalar_t, true>;
+    auto median = static_cast<accscalar_t>(median_);
+    auto sigma = static_cast<accscalar_t>(sigma_);
+    // define lambda for cauchy transformation
+    auto cauchy_func = [median, sigma] __device__ (accscalar_t rand) {
+      return static_cast<scalar_t>(transformation::cauchy<accscalar_t>(rand, median, sigma));
+    };
+    uniform_and_transform(iter, gen, cauchy_func);
+   });
+}
+
+template
+struct CauchyKernel {
+  void operator()(TensorIteratorBase& iter, double median, double sigma, c10::optional gen) {
+    cauchy_kernel(iter, median, sigma, check_generator(gen));
+  }
+};
+
+// ==================================================== Bernoulli =====================================================
+
+template<typename scalar_t, typename prob_t>
+void bernoulli_tensor_cuda_kernel(
+    const TensorBase &ret, const at::TensorBase &p,
+    PhiloxCudaState philox_args) {
+  auto functor = [philox_args] __device__(
+          int n, scalar_t& v1, scalar_t& v2, scalar_t& v3, scalar_t& v4,
+          const prob_t& p1, const prob_t& p2, const prob_t& p3, const prob_t& p4) {
+        auto seeds = at::cuda::philox::unpack(philox_args);
+        curandStatePhilox4_32_10_t state;
+        curand_init(std::get<0>(seeds),
+                    blockIdx.x * blockDim.x + threadIdx.x,
+                    std::get<1>(seeds),
+                    &state);
+
+        // See Note [Register spilling in curand call for CUDA < 10]
+        float4 rand = curand_uniform4(&state);
+        switch (n) {
+          case 4: {
+            CUDA_KERNEL_ASSERT(0 <= p4 && p4 <= 1);
+            v4 = static_cast<scalar_t>(rand.w <= p4);
+            // fallthrough
+          }
+          case 3: {
+            CUDA_KERNEL_ASSERT(0 <= p3 && p3 <= 1);
+            v3 = static_cast<scalar_t>(rand.z <= p3);
+            // fallthrough
+          }
+          case 2: {
+            CUDA_KERNEL_ASSERT(0 <= p2 && p2 <= 1);
+            v2 = static_cast<scalar_t>(rand.y <= p2);
+            // fallthrough
+          }
+          case 1: {
+            CUDA_KERNEL_ASSERT(0 <= p1 && p1 <= 1);
+            v1 = static_cast<scalar_t>(rand.x <= p1);
+          }
+        }
+      };
+  // The template argument `4` below indicates that we want to operate on four
+  // element at each time. See NOTE [ CUDA_tensor_applyN helpers ] for details.
+  at::cuda::CUDA_tensor_apply2(ret, p, functor);
+}
+
+template
+void bernoulli_kernel(const TensorBase &self, const TensorBase &p_, RNG gen) {
+  PhiloxCudaState rng_engine_inputs;
+  {
+    // See Note [Acquire lock when using random generators]
+    std::lock_guard<std::mutex> lock(gen->mutex_);
+    rng_engine_inputs = gen->philox_cuda_state(10);
+  }
+  TORCH_CHECK(at::isFloatingType(p_.scalar_type()), "expected probabilities tensor to have floating type, got ", p_.scalar_type());
+  // cast probabilities tensor to double for double `self` tensor, and to `float` for everything else
+  const auto p_type = self.dtype() == at::kDouble ? at::kDouble : at::kFloat;
+  auto p_cuda = p_.to(TensorOptions().device(self.device()).dtype(p_type));
+  auto p = expand_inplace(self, p_cuda);
+  AT_DISPATCH_ALL_TYPES_AND3(
+    at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, self.scalar_type(), "bernoulli_tensor_cuda_self_", [&] {
+      if (std::is_same<scalar_t, double>::value) {
+        return bernoulli_tensor_cuda_kernel<double>(self, *p, rng_engine_inputs);
+      } else {
+        return bernoulli_tensor_cuda_kernel<float>(self, *p, rng_engine_inputs);
+      }
+   });
+}
+
+template
+void bernoulli_kernel(TensorIteratorBase& iter, double p, RNG gen) {
+  AT_DISPATCH_ALL_TYPES_AND3(
+    at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, iter.dtype(), "bernoulli_scalar_cuda_", [&] {
+      using accscalar_t = at::DiscreteDistributionType<scalar_t>::type;
+      // define lambda for bernoulli transformation
+      auto bernoulli_func = [p] __device__ (accscalar_t rand) {
+        return static_cast<scalar_t>(transformation::bernoulli<accscalar_t>(rand, p));
+      };
+      uniform_and_transform(iter, gen, bernoulli_func);
+   });
+}
+
+template
+struct BernoulliKernel {
+  void operator()(TensorIteratorBase& iter, double p, c10::optional gen) {
+    bernoulli_kernel(iter, p, check_generator(gen));
+  }
+  void operator()(const TensorBase &self, const TensorBase &p_, c10::optional gen) {
+    bernoulli_kernel(self, p_, check_generator(gen));
+  }
+};
+
+}}}}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Distributions.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Distributions.h
new file mode 100644
index 0000000000000000000000000000000000000000..053eff0c7d7a5a84db1601bf17fd19dc2cc35382
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Distributions.h
@@ -0,0 +1,25 @@
+#pragma once
+
+namespace at {
+struct CUDAGeneratorImpl;
+struct TensorIteratorBase;
+class TensorBase;
+
+namespace native {
+
+void launch_poisson_cuda_kernel(
+    const TensorBase &ret, const TensorBase &lambda, CUDAGeneratorImpl *gen);
+
+void launch_gamma_kernel(
+    const TensorBase &ret, const TensorBase &alpha, CUDAGeneratorImpl *gen);
+
+void launch_binomial_cuda_kernel(
+    TensorIteratorBase &iter, CUDAGeneratorImpl *gen);
+
+void launch_dirichlet_kernel(TensorIteratorBase &iter);
+
+void launch_standard_gamma_grad_kernel(TensorIteratorBase &iter);
+
+void launch_dirichlet_grad_kernel(TensorIteratorBase &iter);
+
+}}  // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/EmbeddingBackwardKernel.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/EmbeddingBackwardKernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..ceed1577ab60b1b84a8522498e2ca438d7fb3ef4
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/EmbeddingBackwardKernel.cuh
@@ -0,0 +1,22 @@
+#pragma once
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+namespace native {
+
+Tensor embedding_backward_cuda_kernel(
+    const Tensor &grad,
+    const Tensor &orig_indices,
+    const Tensor &sorted_indices,
+    const Tensor &count,
+    int64_t num_weights,
+    int padding_idx = -1,
+    bool mode_mean = false,
+    const Tensor &offset2bag = Tensor(),
+    const Tensor &bag_size = Tensor(),
+    const Tensor &per_sample_weights = Tensor());
+
+}}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/ForeachFunctors.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/ForeachFunctors.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..df0b078ba24a1bbb14f998cd569403244d4a18f2
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/ForeachFunctors.cuh
@@ -0,0 +1,681 @@
+#pragma once
+#include 
+#include 
+#include 
+#include 
+
+namespace at::native {
+
+namespace {
+
+// TODO(crcrpar): Handle version bump in codegen.
+// rel:
+// https://github.com/pytorch/pytorch/blob/9cf84347767c8abb8feba18a9a1baba321eeb8b9/tools/autograd/gen_inplace_or_view_type.py#L481-L482
+inline void increment_version(TensorList tensors) {
+  for (const auto& t : tensors) {
+    t.unsafeGetTensorImpl()->bump_version();
+  }
+}
+
+// Initializes args and checks if all args are aligned
+template 
+__device__ bool init_args(
+    T** args,
+    TensorListMetadata& tl,
+    const int64_t chunk_idx,
+    const int64_t chunk_size,
+    const int64_t tensor_loc) {
+  bool all_aligned = true;
+  for (int i = 0; i < depth; i++) {
+    args[i] = (T*)tl.addresses[i][tensor_loc];
+    args[i] += chunk_idx * chunk_size;
+
+    if (!is_aligned(args[i])) {
+      all_aligned = false;
+    }
+  }
+  return all_aligned;
+}
+
+// Initializes args and checks if all args are aligned
+template 
+__device__ bool init_args(
+    T** args,
+    TensorListScalarListMetadata& tl,
+    const int64_t chunk_idx,
+    const int64_t chunk_size,
+    const int64_t tensor_loc) {
+  bool all_aligned = true;
+  for (int i = 0; i < depth; i++) {
+    args[i] = (T*)tl.addresses[i][tensor_loc];
+    args[i] += chunk_idx * chunk_size;
+
+    if (!is_aligned(args[i])) {
+      all_aligned = false;
+    }
+  }
+  return all_aligned;
+}
+
+template 
+__device__ bool init_args(
+    T** args,
+    FusedOptimizerTensorListMetadata& tl,
+    const int64_t chunk_idx,
+    const int64_t chunk_size,
+    const int64_t tensor_loc) {
+  bool all_aligned = true;
+  for (int i = 0; i < depth; i++) {
+    args[i] = (T*)tl.addresses[i][tensor_loc];
+    args[i] += chunk_idx * chunk_size;
+
+    if (!is_aligned(args[i])) {
+      all_aligned = false;
+    }
+  }
+  return all_aligned;
+}
+
+template 
+__device__ void load_args(
+    T r_args[][kILP],
+    T** args,
+    const int64_t i_start,
+    const int64_t chunk_size,
+    const int64_t n) {
+#pragma unroll
+  for (int ii = 0; ii < kILP; ii++) {
+    const auto i = i_start + threadIdx.x + ii * blockDim.x;
+    for (int r_index = 0; r_index < depth; r_index++) {
+      r_args[r_index][ii] = 0;
+      if (i < n && i < chunk_size) {
+        r_args[r_index][ii] = args[r_index][i];
+      }
+    }
+  }
+}
+
+template 
+__device__ void store_args(
+    T* dst,
+    T* src,
+    const int64_t i_start,
+    const int64_t chunk_size,
+    const int64_t n) {
+#pragma unroll
+  for (int ii = 0; ii < kILP; ii++) {
+    const int64_t i = i_start + threadIdx.x + ii * blockDim.x;
+    if (i < n && i < chunk_size)
+      dst[i] = src[ii];
+  }
+}
+
+template 
+__device__ __forceinline__ void binary_op_scalar(
+    T r_args[][kILP],
+    T** args,
+    opmath_t scalar,
+    const int64_t n,
+    const int64_t chunk_size,
+    const bool all_aligned,
+    Op op) {
+  // to make things simple, we put aligned case in a different code path
+  if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) {
+    for (int64_t i_start = threadIdx.x;
+         i_start * kILP < n && i_start * kILP < chunk_size;
+         i_start += blockDim.x) {
+      // load
+      load_store(r_args[0], args[0], 0, i_start);
+#pragma unroll
+      for (int ii = 0; ii < kILP; ii++) {
+        r_args[0][ii] = static_cast(
+            op(static_cast(r_args[0][ii]),
+               static_cast(scalar)));
+      }
+      // store
+      load_store(args[res_arg_index], r_args[0], i_start, 0);
+    }
+  } else {
+    for (int64_t i_start = 0; i_start < n && i_start < chunk_size;
+         i_start += blockDim.x * kILP) {
+      // Regardless if depth is 1 (for inplace) or 2 (for out of place), r_args
+      // has depth 1
+      load_args<1>(r_args, args, i_start, chunk_size, n);
+#pragma unroll
+      for (int ii = 0; ii < kILP; ii++) {
+        r_args[0][ii] = static_cast(
+            op(static_cast(r_args[0][ii]),
+               static_cast(scalar)));
+      }
+      store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n);
+    }
+  }
+}
+
+template 
+__device__ __forceinline__ void pointwise_op_scalar(
+    T r_args[][kILP],
+    T** args,
+    opmath_t scalar,
+    const int64_t n,
+    const int64_t chunk_size,
+    const bool all_aligned,
+    Op op) {
+  // to make things simple, we put aligned case in a different code path
+  if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) {
+    for (int64_t i_start = threadIdx.x;
+         i_start * kILP < n && i_start * kILP < chunk_size;
+         i_start += blockDim.x) {
+      // load
+      load_store(r_args[0], args[0], 0, i_start);
+      load_store(r_args[1], args[1], 0, i_start);
+      load_store(r_args[2], args[2], 0, i_start);
+#pragma unroll
+      for (int ii = 0; ii < kILP; ii++) {
+        r_args[0][ii] = static_cast(
+            static_cast(r_args[0][ii]) +
+            scalar *
+                op(static_cast(r_args[1][ii]),
+                   static_cast(r_args[2][ii])));
+      }
+      // store
+      load_store(args[res_arg_index], r_args[0], i_start, 0);
+    }
+  } else {
+    for (int64_t i_start = 0; i_start < n && i_start < chunk_size;
+         i_start += blockDim.x * kILP) {
+      // Regardless if depth is 3 (for inplace) or 4 (for out of place), r_args
+      // has depth 3
+      load_args<3>(r_args, args, i_start, chunk_size, n);
+#pragma unroll
+      for (int ii = 0; ii < kILP; ii++) {
+        r_args[0][ii] = static_cast(
+            static_cast(r_args[0][ii]) +
+            scalar *
+                op(static_cast(r_args[1][ii]),
+                   static_cast(r_args[2][ii])));
+      }
+      store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n);
+    }
+  }
+}
+
+//
+// Binary Functors
+//
+template 
+struct BinaryOpScalarFunctor {
+  using opmath_t = at::opmath_type;
+  template 
+  __device__ __forceinline__ void operator()(
+      int chunk_size,
+      TensorListMetadata& tl,
+      Op op,
+      opmath_t scalar) {
+    const int tensor_loc = tl.block_to_tensor[blockIdx.x];
+    const int chunk_idx = tl.block_to_chunk[blockIdx.x];
+    auto n = tl.numel_for_tensor[tensor_loc];
+
+    T* args[depth];
+    const bool all_aligned =
+        init_args(args, tl, chunk_idx, chunk_size, tensor_loc);
+    n -= chunk_idx * chunk_size;
+    T r_args[r_args_depth][kILP];
+
+    binary_op_scalar(
+        r_args, args, scalar, n, chunk_size, all_aligned, op);
+  }
+};
+
+template 
+struct BinaryOpScalarListFunctor {
+  using opmath_t = at::opmath_type;
+  template 
+  __device__ __forceinline__ void operator()(
+      int chunk_size,
+      TensorListScalarListMetadata& tl,
+      Op op) {
+    const auto tensor_loc = tl.block_to_tensor[blockIdx.x];
+    const auto chunk_idx = tl.block_to_chunk[blockIdx.x];
+    auto n = tl.numel_for_tensor[tensor_loc];
+
+    T* args[depth];
+    const bool all_aligned =
+        init_args(args, tl, chunk_idx, chunk_size, tensor_loc);
+    opmath_t scalar = tl.scalar_vals[tensor_loc];
+    n -= chunk_idx * chunk_size;
+    T r_args[r_args_depth][kILP];
+
+    binary_op_scalar(
+        r_args, args, scalar, n, chunk_size, all_aligned, op);
+  }
+};
+
+template 
+struct BinaryOpListAlphaFunctor {
+  using opmath_t = at::opmath_type;
+  template 
+  __device__ __forceinline__ void operator()(
+      int chunk_size,
+      TensorListMetadata& tl,
+      Op op,
+      opmath_t alpha) {
+    const auto tensor_loc = tl.block_to_tensor[blockIdx.x];
+    const auto chunk_idx = tl.block_to_chunk[blockIdx.x];
+    auto n = tl.numel_for_tensor[tensor_loc];
+
+    T* args[depth];
+    const bool all_aligned =
+        init_args(args, tl, chunk_idx, chunk_size, tensor_loc);
+    n -= chunk_idx * chunk_size;
+    T r_args[r_args_depth][kILP];
+
+    // to make things simple, we put aligned case in a different code path
+    if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) {
+      for (int64_t i_start = threadIdx.x;
+           i_start * kILP < n && i_start * kILP < chunk_size;
+           i_start += blockDim.x) {
+        // load
+        load_store(r_args[0], args[0], 0, i_start);
+        load_store(r_args[1], args[1], 0, i_start);
+#pragma unroll
+        for (int ii = 0; ii < kILP; ii++) {
+          r_args[0][ii] = static_cast(
+              op(static_cast(r_args[0][ii]),
+                 alpha * static_cast(r_args[1][ii])));
+        }
+        // store
+        load_store(args[res_arg_index], r_args[0], i_start, 0);
+      }
+    } else {
+      for (int64_t i_start = 0; i_start < n && i_start < chunk_size;
+           i_start += blockDim.x * kILP) {
+        load_args(r_args, args, i_start, chunk_size, n);
+#pragma unroll
+        for (int ii = 0; ii < kILP; ii++) {
+          r_args[0][ii] = static_cast(
+              op(static_cast(r_args[0][ii]),
+                 alpha * static_cast(r_args[1][ii])));
+        }
+        store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n);
+      }
+    }
+  }
+};
+
+template 
+struct BinaryOpScalarTensorFunctor {
+  using opmath_t = at::opmath_type;
+  template 
+  __device__ __forceinline__ void operator()(
+      int chunk_size,
+      TensorListMetadata& tl,
+      Op op,
+      T* scalar,
+      opmath_t alpha) {
+    const int tensor_loc = tl.block_to_tensor[blockIdx.x];
+    const int chunk_idx = tl.block_to_chunk[blockIdx.x];
+    auto n = tl.numel_for_tensor[tensor_loc];
+
+    T* args[depth];
+    const bool all_aligned =
+        init_args(args, tl, chunk_idx, chunk_size, tensor_loc);
+    n -= chunk_idx * chunk_size;
+    T r_args[r_args_depth][kILP];
+
+    // to make things simple, we put aligned case in a different code path
+    if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) {
+      for (int64_t i_start = threadIdx.x;
+           i_start * kILP < n && i_start * kILP < chunk_size;
+           i_start += blockDim.x) {
+        // load
+        load_store(r_args[0], args[0], 0, i_start);
+#pragma unroll
+        for (int ii = 0; ii < kILP; ii++) {
+          r_args[0][ii] = static_cast(op(
+              static_cast(r_args[0][ii]),
+              static_cast(alpha) * static_cast(*scalar)));
+        }
+        // store
+        load_store(args[res_arg_index], r_args[0], i_start, 0);
+      }
+    } else {
+      for (int64_t i_start = 0; i_start < n && i_start < chunk_size;
+           i_start += blockDim.x * kILP) {
+        // Regardless if depth is 1 (for inplace) or 2 (for out of place),
+        // r_args has depth 1
+        load_args<1>(r_args, args, i_start, chunk_size, n);
+#pragma unroll
+        for (int ii = 0; ii < kILP; ii++) {
+          r_args[0][ii] = static_cast(op(
+              static_cast(r_args[0][ii]),
+              static_cast(alpha) * static_cast(*scalar)));
+        }
+        store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n);
+      }
+    }
+  }
+};
+
+//
+// Unary Functors
+//
+
+template 
+struct ZeroFunctor {
+  __device__ __forceinline__ void operator()(
+      int chunk_size,
+      TensorListMetadata<1>& tl) {
+    const auto tensor_loc = tl.block_to_tensor[blockIdx.x];
+    const auto chunk_idx = tl.block_to_chunk[blockIdx.x];
+    auto n = tl.numel_for_tensor[tensor_loc];
+
+    T* args[depth];
+    const auto all_aligned =
+        init_args(args, tl, chunk_idx, chunk_size, tensor_loc);
+    n -= chunk_idx * chunk_size;
+    T r_args[r_args_depth][kILP];
+
+    // to make things simple, we put aligned case in a different code path
+    if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) {
+      for (int64_t i_start = threadIdx.x;
+           i_start * kILP < n && i_start * kILP < chunk_size;
+           i_start += blockDim.x) {
+#pragma unroll
+        for (int ii = 0; ii < kILP; ii++) {
+          r_args[0][ii] = 0;
+        }
+        // store
+        load_store(args[0], r_args[0], i_start, 0);
+      }
+    } else {
+      for (int64_t i_start = 0; i_start < n && i_start < chunk_size;
+           i_start += blockDim.x * kILP) {
+#pragma unroll
+        for (int ii = 0; ii < kILP; ii++) {
+          r_args[0][ii] = 0;
+        }
+        store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n);
+      }
+    }
+  }
+};
+
+template 
+struct UnaryOpFunctor {
+  using opmath_t = at::opmath_type;
+  template 
+  __device__ __forceinline__ void operator()(
+      int chunk_size,
+      TensorListMetadata& tl,
+      Op op) {
+    const auto tensor_loc = tl.block_to_tensor[blockIdx.x];
+    const auto chunk_idx = tl.block_to_chunk[blockIdx.x];
+    auto n = tl.numel_for_tensor[tensor_loc];
+
+    T* args[depth];
+    bool all_aligned =
+        init_args(args, tl, chunk_idx, chunk_size, tensor_loc);
+    n -= chunk_idx * chunk_size;
+    T r_args[r_args_depth][kILP];
+
+    // to make things simple, we put aligned case in a different code path
+    if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) {
+      for (int64_t i_start = threadIdx.x;
+           i_start * kILP < n && i_start * kILP < chunk_size;
+           i_start += blockDim.x) {
+        // load
+        load_store(r_args[0], args[0], 0, i_start);
+#pragma unroll
+        for (int ii = 0; ii < kILP; ii++) {
+          r_args[0][ii] =
+              static_cast(op(static_cast(r_args[0][ii])));
+        }
+        // store
+        load_store(args[res_arg_index], r_args[0], i_start, 0);
+      }
+    } else {
+      for (int64_t i_start = 0; i_start < n && i_start < chunk_size;
+           i_start += blockDim.x * kILP) {
+        load_args(r_args, args, i_start, chunk_size, n);
+#pragma unroll
+        for (int ii = 0; ii < kILP; ii++) {
+          r_args[0][ii] =
+              static_cast(op(static_cast(r_args[0][ii])));
+        }
+        store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n);
+      }
+    }
+  }
+};
+
+//
+// Pointwise Functors
+//
+
+template 
+struct PointwiseOpScalarFunctor {
+  using opmath_t = at::opmath_type;
+  template 
+  __device__ __forceinline__ void operator()(
+      int chunk_size,
+      TensorListMetadata& tl,
+      Op op,
+      opmath_t scalar) {
+    const auto tensor_loc = tl.block_to_tensor[blockIdx.x];
+    const auto chunk_idx = tl.block_to_chunk[blockIdx.x];
+    auto n = tl.numel_for_tensor[tensor_loc];
+
+    T* args[depth];
+    const bool all_aligned =
+        init_args(args, tl, chunk_idx, chunk_size, tensor_loc);
+    n -= chunk_idx * chunk_size;
+    T r_args[r_args_depth][kILP];
+
+    pointwise_op_scalar(
+        r_args, args, scalar, n, chunk_size, all_aligned, op);
+  }
+};
+
+template 
+struct PointwiseOpScalarListFunctor {
+  using opmath_t = at::opmath_type;
+  template 
+  __device__ __forceinline__ void operator()(
+      int chunk_size,
+      TensorListScalarListMetadata& tl,
+      Op op) {
+    const auto tensor_loc = tl.block_to_tensor[blockIdx.x];
+    const auto chunk_idx = tl.block_to_chunk[blockIdx.x];
+    auto n = tl.numel_for_tensor[tensor_loc];
+
+    T* args[depth];
+    const bool all_aligned =
+        init_args(args, tl, chunk_idx, chunk_size, tensor_loc);
+    opmath_t scalar = tl.scalar_vals[tensor_loc];
+    n -= chunk_idx * chunk_size;
+    T r_args[r_args_depth][kILP];
+
+    pointwise_op_scalar(
+        r_args, args, scalar, n, chunk_size, all_aligned, op);
+  }
+};
+
+template 
+struct PointwiseOpListFunctor {
+  using opmath_t = at::opmath_type;
+  template 
+  __device__ __forceinline__ void operator()(
+      int chunk_size,
+      TensorListMetadata& tl,
+      Op op) {
+    const auto tensor_loc = tl.block_to_tensor[blockIdx.x];
+    const auto chunk_idx = tl.block_to_chunk[blockIdx.x];
+    auto n = tl.numel_for_tensor[tensor_loc];
+
+    T* args[depth];
+    const bool all_aligned =
+        init_args(args, tl, chunk_idx, chunk_size, tensor_loc);
+    n -= chunk_idx * chunk_size;
+    T r_args[depth - 1][kILP];
+
+    // to make things simple, we put aligned case in a different code path
+    if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) {
+      for (int64_t i_start = threadIdx.x;
+           i_start * kILP < n && i_start * kILP < chunk_size;
+           i_start += blockDim.x) {
+        // load
+        load_store(r_args[0], args[0], 0, i_start);
+        load_store(r_args[1], args[1], 0, i_start);
+#pragma unroll
+        for (int ii = 0; ii < kILP; ii++) {
+          r_args[0][ii] = static_cast(
+              op(static_cast(r_args[0][ii]),
+                 static_cast(r_args[1][ii])));
+        }
+        // store
+        load_store(args[2], r_args[0], i_start, 0);
+      }
+    } else {
+      for (int64_t i_start = 0; i_start < n && i_start < chunk_size;
+           i_start += blockDim.x * kILP) {
+        load_args(r_args, args, i_start, chunk_size, n);
+#pragma unroll
+        for (int ii = 0; ii < kILP; ii++) {
+          r_args[0][ii] = static_cast(
+              op(static_cast(r_args[0][ii]),
+                 static_cast(r_args[1][ii])));
+        }
+        store_args(args[2], r_args[0], i_start, chunk_size, n);
+      }
+    }
+  }
+};
+
+template 
+struct TernaryOpListFunctor {
+  using opmath_t = at::opmath_type;
+  template 
+  __device__ __forceinline__ void operator()(
+      int chunk_size,
+      TensorListMetadata& tl,
+      Op op) {
+    static_assert(depth == 3 || depth == 4, "");
+    static_assert(depth >= r_args_depth, "");
+    static_assert(res_arg_index == depth - 1 || res_arg_index == 0, "");
+    const auto tensor_loc = tl.block_to_tensor[blockIdx.x];
+    const auto chunk_idx = tl.block_to_chunk[blockIdx.x];
+    auto n = tl.numel_for_tensor[tensor_loc];
+
+    T* args[depth];
+    const bool all_aligned =
+        init_args(args, tl, chunk_idx, chunk_size, tensor_loc);
+    n -= chunk_idx * chunk_size;
+    T r_args[r_args_depth][kILP];
+
+    if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) {
+      for (int64_t i_start = threadIdx.x;
+           i_start * kILP < n && i_start * kILP < chunk_size;
+           i_start += blockDim.x) {
+        load_store(r_args[0], args[0], 0, i_start);
+        load_store(r_args[1], args[1], 0, i_start);
+        load_store(r_args[2], args[2], 0, i_start);
+#pragma unroll
+        for (int ii = 0; ii < kILP; ii++) {
+          r_args[0][ii] =
+              op(static_cast(r_args[0][ii]),
+                 static_cast(r_args[1][ii]),
+                 static_cast(r_args[2][ii]));
+        }
+        load_store(args[res_arg_index], r_args[0], i_start, 0);
+      }
+    } else {
+      for (int64_t i_start = 0; i_start < n && i_start < chunk_size;
+           i_start += blockDim.x * kILP) {
+        load_args(r_args, args, i_start, chunk_size, n);
+#pragma unroll
+        for (int ii = 0; ii < kILP; ii++) {
+          r_args[0][ii] =
+              op(static_cast(r_args[0][ii]),
+                 static_cast(r_args[1][ii]),
+                 static_cast(r_args[2][ii]));
+        }
+        store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n);
+      }
+    }
+  }
+};
+
+template 
+struct TernaryOpScalarFunctor {
+  using opmath_t = at::opmath_type;
+  template 
+  __device__ __forceinline__ void operator()(
+      int chunk_size,
+      TensorListMetadata& tl,
+      Op op,
+      opmath_t alpha) {
+    static_assert(depth == 2 || depth == 3, "");
+    static_assert(depth >= r_args_depth, "");
+    static_assert(res_arg_index == depth - 1 || res_arg_index == 0, "");
+    const auto tensor_loc = tl.block_to_tensor[blockIdx.x];
+    const auto chunk_idx = tl.block_to_chunk[blockIdx.x];
+    auto n = tl.numel_for_tensor[tensor_loc];
+
+    T* args[depth];
+    const bool all_aligned =
+        init_args(args, tl, chunk_idx, chunk_size, tensor_loc);
+    n -= chunk_idx * chunk_size;
+    T r_args[r_args_depth][kILP];
+
+    // to make things simple, we put aligned case in a different code path
+    if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) {
+      for (int64_t i_start = threadIdx.x;
+           i_start * kILP < n && i_start * kILP < chunk_size;
+           i_start += blockDim.x) {
+        // load
+        load_store(r_args[0], args[0], 0, i_start);
+        load_store(r_args[1], args[1], 0, i_start);
+#pragma unroll
+        for (int ii = 0; ii < kILP; ii++) {
+          r_args[0][ii] =
+              op(static_cast(r_args[0][ii]),
+                 static_cast(r_args[1][ii]),
+                 alpha);
+        }
+        // store
+        load_store(args[res_arg_index], r_args[0], i_start, 0);
+      }
+    } else {
+      for (int64_t i_start = 0; i_start < n && i_start < chunk_size;
+           i_start += blockDim.x * kILP) {
+        load_args(r_args, args, i_start, chunk_size, n);
+#pragma unroll
+        for (int ii = 0; ii < kILP; ii++) {
+          r_args[0][ii] =
+              op(static_cast(r_args[0][ii]),
+                 static_cast(r_args[1][ii]),
+                 alpha);
+        }
+        store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n);
+      }
+    }
+  }
+};
+
+template 
+struct power_functor {
+  C10_DEVICE T operator()(const T& a, const T& b) const {
+    return at::native::pow_(a, b);
+  }
+};
+
+template 
+struct reverse_power_functor {
+  C10_DEVICE T operator()(const T& a, const T& b) const {
+    return at::native::pow_(b, a);
+  }
+};
+
+} // namespace
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/ForeachMinMaxFunctors.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/ForeachMinMaxFunctors.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..32421ef305a9905cd6d54805429fa58bc78b0825
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/ForeachMinMaxFunctors.cuh
@@ -0,0 +1,22 @@
+#pragma once
+
+#include 
+
+namespace at::native {
+
+// std:: does not have clamp functors
+template 
+struct minimum {
+  __device__ T operator()(const T& a, const T& b) const {
+    return (_isnan(a) || a < b) ? a : b;
+  }
+};
+
+template 
+struct maximum {
+  __device__ T operator()(const T& a, const T& b) const {
+    return (_isnan(a) || a > b) ? a : b;
+  }
+};
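+
+// Note the NaN handling encoded above (a consequence of the explicit _isnan
+// check, not an extra guarantee): minimum<float>{}(NAN, 1.f) and
+// maximum<float>{}(NAN, 1.f) both return NaN, whereas plain a < b / a > b
+// comparisons would have returned 1.f.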
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/GridSampler.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/GridSampler.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..d23bc89fa64e55017e69f1352e86f9e36dcc36a5
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/GridSampler.cuh
@@ -0,0 +1,321 @@
+#pragma once
+#include 
+#include 
+
+namespace at { namespace native {
+
+using detail::GridSamplerInterpolation;
+using detail::GridSamplerPadding;
+
+// Unnormalizes a coordinate from the -1 to +1 scale to its pixel index value,
+// where we view each pixel as an area between (idx - 0.5) and (idx + 0.5).
+// if align_corners: -1 and +1 get sent to the centers of the corner pixels
+//     -1 --> 0
+//     +1 --> (size - 1)
+//     scale_factor = (size - 1) / 2
+// if not align_corners: -1 and +1 get sent to the image edges
+//     -1 --> -0.5
+//     +1 --> (size - 1) + 0.5 == size - 0.5
+//     scale_factor = size / 2
+template 
+static __forceinline__ __device__
+scalar_t grid_sampler_unnormalize(scalar_t coord, int size, bool align_corners) {
+  if (align_corners) {
+    // unnormalize coord from [-1, 1] to [0, size - 1]
+    return ((coord + 1.f) / 2) * (size - 1);
+  } else {
+    // unnormalize coord from [-1, 1] to [-0.5, size - 0.5]
+    return ((coord + 1.f) * size - 1) / 2;
+  }
+}
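+
+// Worked example (illustrative): with size = 5, align_corners = true maps
+// coord -1 -> 0, 0 -> 2, +1 -> 4 (corner pixel centers), while
+// align_corners = false maps -1 -> -0.5, 0 -> 2, +1 -> 4.5 (image edges).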
+
+// grid_sampler_unnormalize_set_grad works the same as grid_sampler_unnormalize
+// except that it also returns the `d output / d input` via pointer argument
+// `grad_in`.
+// This is useful in the backward pass of grid_sampler.
+template 
+static __forceinline__ __device__
+scalar_t grid_sampler_unnormalize_set_grad(scalar_t coord, int size,
+                                           bool align_corners, scalar_t *grad_in) {
+  if (align_corners) {
+    // unnormalize coord from [-1, 1] to [0, size - 1]
+    *grad_in = static_cast<scalar_t>(size - 1) / 2;
+    return ((coord + 1.f) / 2) * (size - 1);
+  } else {
+    // unnormalize coord from [-1, 1] to [-0.5, size - 0.5]
+    *grad_in = static_cast<scalar_t>(size) / 2;
+    return ((coord + 1.f) * size - 1) / 2;
+  }
+}
+
+// Clips coordinates to between 0 and clip_limit - 1
+template 
+static __forceinline__ __device__
+scalar_t clip_coordinates(scalar_t in, int clip_limit) {
+  return ::min(static_cast<scalar_t>(clip_limit - 1), ::max(in, static_cast<scalar_t>(0)));
+}
+
+// clip_coordinates_set_grad works similarly to clip_coordinates except that
+// it also returns the `d output / d input` via pointer argument `grad_in`.
+// This is useful in the backward pass of grid_sampler.
+template 
+static __forceinline__ __device__
+scalar_t clip_coordinates_set_grad(scalar_t in, int clip_limit, scalar_t *grad_in) {
+  // Note that it is important for the gradient calculation that borders
+  // are considered out of bounds.
+  if (in <= static_cast<scalar_t>(0)) {
+    *grad_in = static_cast<scalar_t>(0);
+    return static_cast<scalar_t>(0);
+  } else {
+    scalar_t max = static_cast<scalar_t>(clip_limit - 1);
+    if (in >= max) {
+      *grad_in = static_cast<scalar_t>(0);
+      return max;
+    } else {
+      *grad_in = static_cast<scalar_t>(1);
+      return in;
+    }
+  }
+}
+
+// Reflects coordinates until they fall between low and high (inclusive).
+// The bounds are passed as twice their value so that half-integer values
+// can be represented as ints.
+template 
+static __forceinline__ __device__
+scalar_t reflect_coordinates(scalar_t in, int twice_low, int twice_high) {
+  if (twice_low == twice_high) {
+    return static_cast<scalar_t>(0);
+  }
+  scalar_t min = static_cast<scalar_t>(twice_low) / 2;
+  scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2;
+  in = ::fabs(in - min);
+  // `fmod` returns same sign as `in`, which is positive after the `fabs` above.
+  scalar_t extra = ::fmod(in, span);
+  int flips = static_cast<int>(::floor(in / span));
+  if (flips % 2 == 0) {
+    return extra + min;
+  } else {
+    return span - extra + min;
+  }
+}
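+// Worked example (illustrative sketch): for align_corners=true and size = 5 the caller
+// passes twice_low = 0, twice_high = 8, i.e. coordinates are folded into [0, 4]:
+//   in = -1.0 -> 1.0   (flips = 0, only the fabs reflection applies)
+//   in =  5.5 -> 2.5   (flips = 1 is odd, so the remainder folds back from the far edge)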
+
+// reflect_coordinates_set_grad works similarly to reflect_coordinates except
+// that it also returns the `d output / d input` via pointer argument
+// `grad_in`.
+// This is useful in the backward pass of grid_sampler.
+template <typename scalar_t>
+static __forceinline__ __device__
+scalar_t reflect_coordinates_set_grad(scalar_t in, int twice_low, int twice_high,
+                                      scalar_t *grad_in) {
+  if (twice_low == twice_high) {
+    *grad_in = static_cast<scalar_t>(0);
+    return static_cast<scalar_t>(0);
+  }
+  int grad_in_mult_;
+  scalar_t min = static_cast<scalar_t>(twice_low) / 2;
+  scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2;
+  in = in - min;
+  if (in < static_cast<scalar_t>(0)) {
+    grad_in_mult_ = -1;
+    in = -in;
+  } else {
+    grad_in_mult_ = 1;
+  }
+  // `fmod` returns same sign as `in`, which is positive after the `if` above.
+  scalar_t extra = ::fmod(in, span);
+  int flips = static_cast<int>(::floor(in / span));
+  if (flips % 2 == 0) {
+    *grad_in = static_cast<scalar_t>(grad_in_mult_);
+    return extra + min;
+  } else {
+    *grad_in = static_cast<scalar_t>(-grad_in_mult_);
+    return span - extra + min;
+  }
+}
+
+template<typename scalar_t>
+static __forceinline__ __device__
+scalar_t safe_downgrade_to_int_range(scalar_t x){
+  // -100.0 does not have special meaning. This is just to make sure
+  // it's not within_bounds_2d or within_bounds_3d, and does not cause
+  // undefined behavior. See #35506.
+  if (x > INT_MAX-1 || x < INT_MIN || !::isfinite(static_cast<double>(x)))
+    return static_cast<scalar_t>(-100.0);
+  return x;
+}
+
+template<typename scalar_t>
+static __forceinline__ __device__
+scalar_t compute_coordinates(scalar_t coord, int size,
+                             GridSamplerPadding padding_mode,
+                             bool align_corners) {
+  if (padding_mode == GridSamplerPadding::Border) {
+    // clip coordinates to image borders
+    coord = clip_coordinates(coord, size);
+  } else if (padding_mode == GridSamplerPadding::Reflection) {
+    // reflect coordinates by image borders
+    if (align_corners) {
+      coord = reflect_coordinates(coord, 0, 2*(size - 1));
+    } else {
+      coord = reflect_coordinates(coord, -1, 2*size - 1);
+    }
+    // clip coordinates to image borders
+    coord = clip_coordinates(coord, size);
+  }
+
+  coord = safe_downgrade_to_int_range(coord);
+  return coord;
+}
+
+// Computes the pixel source index value for a grid coordinate
+template <typename scalar_t>
+static __forceinline__ __device__
+scalar_t grid_sampler_compute_source_index(
+    scalar_t coord,
+    int size,
+    GridSamplerPadding padding_mode,
+    bool align_corners) {
+  coord = grid_sampler_unnormalize(coord, size, align_corners);
+  coord = compute_coordinates(coord, size, padding_mode, align_corners);
+  return coord;
+}
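+// Worked example (illustrative sketch): size = 4, align_corners=false, padding_mode=Border:
+//   coord = 0.5  -> unnormalize -> 2.5 -> already inside [0, 3] -> 2.5
+//   coord = 1.25 -> unnormalize -> 4.0 -> clip_coordinates      -> 3.0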
+
+// grid_sampler_compute_source_index_set_grad works similarly to
+// grid_sampler_compute_source_index except that it also returns the
+// `d output / d input` via pointer argument `grad_in`.
+// This is useful in the backward pass of grid_sampler.
+template <typename scalar_t>
+static __forceinline__ __device__
+scalar_t grid_sampler_compute_source_index_set_grad(
+    scalar_t coord,
+    int size,
+    GridSamplerPadding padding_mode,
+    bool align_corners,
+    scalar_t *grad_in) {
+  scalar_t grad_clip, grad_refl;
+  coord = grid_sampler_unnormalize_set_grad(coord, size, align_corners, grad_in);
+  if (padding_mode == GridSamplerPadding::Border) {
+    // clip coordinates to image borders
+    coord = clip_coordinates_set_grad(coord, size, &grad_clip);
+    *grad_in = (*grad_in) * grad_clip;
+  } else if (padding_mode == GridSamplerPadding::Reflection) {
+    // reflect coordinates by image borders
+    if (align_corners) {
+      coord = reflect_coordinates_set_grad(coord, 0, 2*(size - 1), &grad_refl);
+    } else {
+      coord = reflect_coordinates_set_grad(coord, -1, 2*size - 1, &grad_refl);
+    }
+    // clip coordinates to image borders
+    coord = clip_coordinates_set_grad(coord, size, &grad_clip);
+    *grad_in = (*grad_in) * grad_refl * grad_clip;
+  }
+
+  coord = safe_downgrade_to_int_range(coord);
+  return coord;
+}
+
+static __forceinline__ __device__
+bool within_bounds_2d(int h, int w, int H, int W) {
+  return h >= 0 && h < H && w >= 0 && w < W;
+}
+
+static __forceinline__ __device__
+bool within_bounds_3d(int d, int h, int w, int D, int H, int W) {
+  return d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W;
+}
+
+template<typename scalar_t>
+static __forceinline__ __device__
+scalar_t get_value_bounded(
+    scalar_t *data, scalar_t x, scalar_t y, int W, int H, int sW, int sH,
+    GridSamplerPadding padding_mode,
+    bool align_corners) {
+
+  x = compute_coordinates(x, W, padding_mode, align_corners);
+  y = compute_coordinates(y, H, padding_mode, align_corners);
+
+  int ix = static_cast<int>(x);
+  int iy = static_cast<int>(y);
+
+  if (within_bounds_2d(iy, ix, H, W)) {
+    return data[iy * sH + ix * sW];
+  }
+  return static_cast<scalar_t>(0);
+}
+
+template<typename scalar_t, typename index_t>
+static __forceinline__ __device__
+void safe_add_2d(scalar_t *data, int h, int w,
+                 int sH, int sW, int H, int W,
+                 scalar_t delta,
+                 const index_t NC_offset,
+                 const index_t memory_span) {
+  if (within_bounds_2d(h, w, H, W)) {
+    fastAtomicAdd(data,
+                  NC_offset + h * sH + w * sW,
+                  memory_span,
+                  delta,
+                  true);
+  }
+}
+
+template<typename scalar_t, typename index_t>
+static __forceinline__ __device__
+void safe_add_3d(scalar_t *data, int d, int h, int w,
+                 int sD, int sH, int sW, int D, int H, int W,
+                 scalar_t delta,
+                 const index_t NC_offset,
+                 const index_t memory_span) {
+  if (within_bounds_3d(d, h, w, D, H, W)) {
+    fastAtomicAdd(data,
+                  NC_offset + d * sD + h * sH + w * sW,
+                  memory_span,
+                  delta,
+                  true);
+  }
+}
+
+template<typename scalar_t, typename index_t>
+static __forceinline__ __device__
+void add_value_bounded(
+    scalar_t* data, scalar_t x, scalar_t y, int W, int H, int sW, int sH,
+    scalar_t delta,
+    GridSamplerPadding padding_mode,
+    bool align_corners,
+    const index_t NC_offset,
+    const index_t memory_span) {
+
+  x = compute_coordinates(x, W, padding_mode, align_corners);
+  y = compute_coordinates(y, H, padding_mode, align_corners);
+
+  int ix = static_cast<int>(x);
+  int iy = static_cast<int>(y);
+
+  safe_add_2d(data, iy, ix, sH, sW, H, W, delta, NC_offset, memory_span);
+}
+
+// Calculate the differential of the cubic convolution, i.e. `d coeff / d x`
+template<typename scalar_t>
+static __forceinline__ __device__
+void get_cubic_coefficients_grad(
+    scalar_t coeffs[4],
+    scalar_t t) {
+
+  // Must be the same as forward calculation in
+  // aten/src/ATen/native/cuda/UpSample.cuh:get_cubic_upsample_coefficients
+  scalar_t A = -0.75;
+
+  scalar_t x;
+  x = -1 - t;  // 1 < x = |-1 - tx| < 2
+  coeffs[0] = (-3 * A * x - 10 * A ) * x - 8 * A;
+  x = -t;     // x = |0 - tx| <= 1
+  coeffs[1] = (-3 * (A + 2) * x - 2 * (A + 3)) * x;
+  x = 1 - t;  // x = |1 - tx| <= 1
+  coeffs[2] = (3 * (A + 2) * x - 2 * (A + 3)) * x;
+  x = 2 - t;  // 1 < x = |2 - tx| < 2
+  coeffs[3] = (3 * A * x - 10 * A) * x + 8 * A;
+}
+
+
+}}  // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/GridSampler.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/GridSampler.h
new file mode 100644
index 0000000000000000000000000000000000000000..507662c13c8af7199e2620fd96f4f5309fa67884
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/GridSampler.h
@@ -0,0 +1,32 @@
+#pragma once
+#include <array>
+#include <cstdint>
+
+namespace at {
+class TensorBase;
+}
+
+namespace at {
+namespace native {
+
+void launch_grid_sampler_2d_forward_kernel(
+    const TensorBase &output, const TensorBase &input, const TensorBase &grid,
+    int64_t interpolation_mode, int64_t padding_mode, bool align_corners);
+
+void launch_grid_sampler_3d_forward_kernel(
+    const TensorBase &output, const TensorBase &input, const TensorBase &grid,
+    int64_t interpolation_mode, int64_t padding_mode, bool align_corners);
+
+void launch_grid_sampler_2d_backward_kernel(
+    const TensorBase &grad_input, const TensorBase &grad_grid,
+    const TensorBase &grad_output, const TensorBase &input,
+    const TensorBase &grid, int64_t interpolation_mode, int64_t padding_mode,
+    bool align_corners, std::array<bool, 2> output_mask);
+
+void launch_grid_sampler_3d_backward_kernel(
+    const TensorBase &grad_input, const TensorBase &grad_grid,
+    const TensorBase &grad_output, const TensorBase &input,
+    const TensorBase &grid, int64_t interpolation_mode, int64_t padding_mode,
+    bool align_corners, std::array<bool, 2> output_mask);
+
+}}  // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/IndexKernel.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/IndexKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..de2ab4de3416634213bf8299ab8e06d26ed41e2b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/IndexKernel.h
@@ -0,0 +1,16 @@
+#pragma once
+#include 
+#include 
+
+namespace at {
+struct TensorIteratorBase;
+class TensorBase;
+}
+
+namespace at {
+namespace native {
+/// @param maskPrefixSum[in,out]
+void launch_masked_scatter_kernel(
+    const TensorBase &self, const TensorBase &mask,
+    const TensorBase &maskPrefixSum, const TensorBase &source);
+}}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/JitLoops.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/JitLoops.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..38e1f571968eb19ebc3b71a595cd09bb85a8202d
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/JitLoops.cuh
@@ -0,0 +1,187 @@
+#pragma once
+
+#include 
+
+#if AT_USE_JITERATOR()
+
+#include 
+
+#include 
+#include 
+#include 
+
+#include 
+
+#include 
+
+namespace at {
+namespace native {
+
+/* Note [Jiterator]
+The "jiterator" simply just-in-time compiles the same kernels that
+Loops.cuh (and CUDALoops.cuh) usually build. This reduces build time,
+build size, and initial CUDA context size.
+
+By default on non-Windows systems, it also caches compiled kernels in ~/.cache/torch/kernels.
+This behavior is controlled with two environment variables:
+  - USE_PYTORCH_KERNEL_CACHE, if set to zero then this will disable all cache use
+  - PYTORCH_KERNEL_CACHE_PATH, if set specifies the folder to use for cached kernels
+
+The jiterator currently has some limitations, however. It cannot:
+  - handle math on complex datatypes
+  - handle kernels with scalar parameters
+
+These improvements will likely come soon.
+
+For examples of how to use the jiterator see the i1 and gcd kernel
+implementations, which pass jittable strings implementing their
+operations instead of the typical CUDA functors.
+
+To pass a runtime argument (similar to lambda captures in non-JIT kernels),
+we need to pass additional arguments to `jitted_gpu_kernel` by value.
+Currently only primitive C++ types used for computation are valid.
+The order of these extra arguments should be the same as the order in which
+they appear in the kernel's function signature (see polygamma for an example).
+
+NOTE: One big restriction is that these arguments must come after the
+arguments provided by TensorIterator. E.g., while capturing `n`, where
+`scalar_t x` and `scalar_t y` are provided by TensorIterator,
+* foo(scalar_t x, scalar_t y, int n) works!
+* foo(int n, scalar_t x, scalar_t y) doesn't work
+* foo(scalar_t x, int n, scalar_t y) doesn't work
+
+*/
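+// Illustrative call-site sketch (an assumption modeled on the gcd/i1 kernels, not a
+// verbatim copy of any one of them; `gcd_name` and `gcd_string` are assumed to be the
+// kernel's name literal and its jittable source string, defined elsewhere):
+//
+//   AT_DISPATCH_INTEGRAL_TYPES(iter.common_dtype(), "gcd_cuda", [&]() {
+//     jitted_gpu_kernel</*name=*/gcd_name,
+//                       /*return_dtype=*/scalar_t,
+//                       /*common_dtype=*/scalar_t,
+//                       /*arity=*/2>(iter, gcd_string);
+//   });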
+
+// Entrypoint for jitted GPU kernels.
+// Only handles elementwise unary and binary kernels with a
+//   common dtype and a single output.
+// NOTE: this assumes the op's iterator has a common_dtype.
+// NOTE: We use std::tuple instead of a parameter pack
+//   for `extra_args` due to the following bug in
+//   older versions of clang:
+// https://bugs.llvm.org/show_bug.cgi?id=23029
+template <
+    char const* name,
+    typename return_type,
+    typename f_inputs_type,
+    int arity,
+    typename... Args>
+void jitted_gpu_kernel(
+    TensorIteratorBase& iter,
+    const std::string& f,
+    at::cuda::jit::BinaryFuncVariant scalar_pos =
+        at::cuda::jit::BinaryFuncVariant::NoScalar,
+    at::opmath_type<f_inputs_type> scalar_val = 0,
+    std::tuple<Args...> extra_args = std::make_tuple()) {
+  // TODO: much of preamble is common to both jitted_gpu_kernel and gpu_kernel
+  //   Maybe it could be refactored?
+  for (int arg = 0; arg < iter.ntensors(); arg++) {
+    TORCH_INTERNAL_ASSERT(
+      iter.device(arg).is_cuda(),
+      "argument ", arg, ": expected a CUDA device but found ", iter.device(arg));
+  }
+
+  if (iter.numel() == 0) {
+    return;
+  }
+
+  if (!iter.can_use_32bit_indexing()) {
+    for (auto& sub_iter : iter.with_32bit_indexing()) {
+      jitted_gpu_kernel<name, return_type, f_inputs_type, arity>(
+          sub_iter, f, scalar_pos, scalar_val, extra_args);
+    }
+
+    return;
+  }
+
+  // Computes if dynamic casting is needed
+  // Dynamic casting is needed if an input's dtype differs from the common dtype
+  //   or if the result dtype differs from the output's dtype
+  // Note: this is intentionally divergent from calling needs_dynamic_casting,
+  //   which is more general and inspects a lambda to determine if dynamic
+  //   casting is needed.
+  bool needs_dynamic_casting = false;
+
+  // Checks output
+  const ScalarType return_scalar_type = c10::CppTypeToScalarType<return_type>::value;
+  const auto dtype0 = iter.dtype(0);
+  if (dtype0 != return_scalar_type) {
+    needs_dynamic_casting = true;
+  }
+
+  // Checks input(s)
+  const ScalarType inputs_scalar_type = c10::CppTypeToScalarType<f_inputs_type>::value;
+  for (auto i = decltype(arity){1}; i < (arity + 1); ++i) {
+    const auto dtypei = iter.dtype(i);
+    if (dtypei != inputs_scalar_type) {
+      needs_dynamic_casting = true;
+      break;
+    }
+  }
+  if (scalar_pos == at::cuda::jit::BinaryFuncVariant::NoScalar) {
+    // NOTE: With `scalar_pos=NoScalar`,`scalar_val` is not used
+    // for computation in the generated code and hence we pass a dummy
+    // value of `0`.
+    jitted_gpu_kernel_impl<
+        /*name*/ name,
+        /*return_type=*/return_type,
+        /*f_inputs_type=*/f_inputs_type,
+        arity,
+        at::cuda::jit::BinaryFuncVariant::NoScalar>(
+        iter, f, needs_dynamic_casting, /*scalar_val=*/scalar_val, extra_args);
+  } else if (scalar_pos == at::cuda::jit::BinaryFuncVariant::RhsScalar) {
+    jitted_gpu_kernel_impl<
+        /*name*/ name,
+        /*return_type=*/return_type,
+        /*f_inputs_type=*/f_inputs_type,
+        arity,
+        at::cuda::jit::BinaryFuncVariant::RhsScalar>(
+        iter,
+        f,
+        needs_dynamic_casting,
+        scalar_val,
+        extra_args);
+
+  } else {
+    jitted_gpu_kernel_impl<
+        /*name*/ name,
+        /*return_type=*/return_type,
+        /*f_inputs_type=*/f_inputs_type,
+        arity,
+        at::cuda::jit::BinaryFuncVariant::LhsScalar>(
+        iter,
+        f,
+        needs_dynamic_casting,
+        scalar_val,
+        extra_args);
+  }
+}
+
+// TODO: support runtime state capture similar to `jitted_gpu_kernel`.
+template <char const *name, typename return_type, typename f_inputs_type>
+void opmath_jitted_gpu_kernel_with_scalars(TensorIteratorBase& iter, const std::string& f) {
+  TORCH_INTERNAL_ASSERT(iter.ntensors() == 3);
+  //currently jiterator only handles binary functions where both inputs are of the same type (f_inputs_type)
+  using opmath_t = at::opmath_type<f_inputs_type>;
+  if (iter.is_cpu_scalar(1)) {
+    auto scalar_val = iter.scalar_value<opmath_t>(1);
+    iter.remove_operand(1);
+    // TODO: When all kernels that use gpu_kernel_with_scalars are
+    // ported to structured, this device guard can be deleted.  This
+    // works around incorrect device guard generation for pre-structured
+    // kernels device guards, but structured kernels do it right and
+    // we can assume the device is already set correctly
+    const OptionalDeviceGuard device_guard(iter.device(1));
+    jitted_gpu_kernel<name, return_type, f_inputs_type, 1>(iter, f, at::cuda::jit::BinaryFuncVariant::LhsScalar, scalar_val);
+  } else if (iter.is_cpu_scalar(2)) {
+    auto scalar_val = iter.scalar_value<opmath_t>(2);
+    iter.remove_operand(2);
+    jitted_gpu_kernel<name, return_type, f_inputs_type, 1>(iter, f, at::cuda::jit::BinaryFuncVariant::RhsScalar, scalar_val);
+  } else {
+    jitted_gpu_kernel<name, return_type, f_inputs_type, 2>(iter, f);
+  }
+}
+
+}}  // at::native
+
+#endif // AT_USE_JITERATOR()
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/KernelUtils.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/KernelUtils.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..1c12691ac9307243b8b00a7ac30930980e6456e2
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/KernelUtils.cuh
@@ -0,0 +1,149 @@
+#pragma once
+#include <ATen/cuda/Atomic.cuh>
+
+#if !(defined(USE_ROCM) || ((defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800))))
+#include <cuda_bf16.h>
+#endif
+
+namespace at {
+namespace native {
+
+__device__ __forceinline__ size_t
+idx(const size_t nc,
+    const size_t height,
+    const size_t width,
+    const size_t h,
+    const size_t w) {
+  return (nc * height + h) * width + w;
+}
+
+// for channels-last
+__device__ __forceinline__ size_t
+idx_cl(
+  const size_t n, const size_t h, const size_t w, const size_t c,
+  const size_t height, const size_t width, const size_t channel
+) {
+  return ((n * height + h) * width + w) * channel + c;
+}
+
+// fastSpecializedAtomicAdd (and fastAtomicAdd) are an optimization
+// that speed up half-precision atomics.  The situation with half
+// precision atomics is that we have a slow __half atomic, and
+// a fast vectored __half2 atomic (this can be worth up to a 6x
+// speedup, see https://github.com/pytorch/pytorch/pull/21879).
+// We can convert a __half atomic into a __half2 atomic by simply
+// pairing the __half with a zero entry on the left/right depending
+// on alignment... but only if this wouldn't cause an out of bounds
+// access!  Thus, you must specify tensor and numel so we can check
+// if you would be out-of-bounds and use a plain __half atomic if
+// you would be.
+template <
+    typename scalar_t,
+    typename index_t,
+    typename std::enable_if<std::is_same<c10::Half, scalar_t>::value>::type* =
+        nullptr>
+__device__ __forceinline__ void fastSpecializedAtomicAdd(
+    scalar_t* tensor,
+    index_t index,
+    const index_t numel,
+    scalar_t value) {
+#if (                      \
+    (defined(USE_ROCM)) || \
+    (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
+  gpuAtomicAddNoReturn(
+      reinterpret_cast<at::Half*>(tensor) + index,
+      static_cast<at::Half>(value));
+#else
+  // Accounts for the chance tensor falls on an odd 16 bit alignment (ie, not 32 bit aligned)
+  __half* target_addr = reinterpret_cast<__half*>(tensor + index);
+  bool low_byte = (reinterpret_cast<std::uintptr_t>(target_addr) % sizeof(__half2) == 0);
+
+  if (low_byte && index < (numel - 1)) {
+    __half2 value2;
+    value2.x = static_cast<__half>(value);
+    value2.y = __int2half_rz(0);
+    atomicAdd(reinterpret_cast<__half2*>(target_addr), value2);
+
+  } else if (!low_byte && index > 0) {
+    __half2 value2;
+    value2.x = __int2half_rz(0);
+    value2.y = static_cast<__half>(value);
+    atomicAdd(reinterpret_cast<__half2*>(target_addr - 1), value2);
+
+  } else {
+    atomicAdd(
+        reinterpret_cast<__half*>(tensor) + index, static_cast<__half>(value));
+  }
+#endif
+}
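+// Illustrative sketch of the pairing above (assuming the tensor's base address is
+// __half2-aligned): an even `index` hits an aligned address, so we add {value, 0} at
+// target_addr; an odd `index` adds {0, value} at target_addr - 1. Either way the
+// neighbouring lane is incremented by zero, and the index > 0 / index < numel - 1
+// guards fall back to the plain __half atomicAdd at the tensor boundaries.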
+
+template <
+    typename scalar_t,
+    typename index_t,
+    typename std::enable_if<std::is_same<c10::BFloat16, scalar_t>::value>::type* =
+        nullptr>
+__device__ __forceinline__ void fastSpecializedAtomicAdd(
+    scalar_t* tensor,
+    index_t index,
+    const index_t numel,
+    scalar_t value) {
+#if (                      \
+    (defined(USE_ROCM)) || \
+    (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)))
+  gpuAtomicAddNoReturn(
+      reinterpret_cast<at::BFloat16*>(tensor) + index,
+      static_cast<at::BFloat16>(value));
+#else
+  // Accounts for the chance tensor falls on an odd 16 bit alignment (ie, not 32 bit aligned)
+  __nv_bfloat16* target_addr = reinterpret_cast<__nv_bfloat16*>(tensor + index);
+  bool low_byte = (reinterpret_cast<std::uintptr_t>(target_addr) % sizeof(__nv_bfloat162) == 0);
+
+  if (low_byte && index < (numel - 1)) {
+    __nv_bfloat162 value2;
+    value2.x = *reinterpret_cast<__nv_bfloat16*>(&value);
+    value2.y = __int2bfloat16_rz(0);
+    atomicAdd(reinterpret_cast<__nv_bfloat162*>(target_addr), value2);
+
+  } else if (!low_byte && index > 0) {
+    __nv_bfloat162 value2;
+    value2.x = __int2bfloat16_rz(0);
+    value2.y = *reinterpret_cast<__nv_bfloat16*>(&value);
+    atomicAdd(reinterpret_cast<__nv_bfloat162*>(target_addr - 1), value2);
+
+  } else {
+    atomicAdd(
+        reinterpret_cast<__nv_bfloat16*>(tensor) + index, *reinterpret_cast<__nv_bfloat16*>(&value));
+  }
+#endif
+}
+
+
+template <
+    typename scalar_t,
+    typename index_t,
+    typename std::enable_if<!std::is_same<c10::Half, scalar_t>::value && !std::is_same<c10::BFloat16, scalar_t>::value >::type* =
+        nullptr>
+__device__ __forceinline__ void fastSpecializedAtomicAdd(
+    scalar_t* tensor,
+    index_t index,
+    const index_t numel,
+    scalar_t value) {
+  gpuAtomicAddNoReturn(tensor + index, value);
+}
+
+template <class scalar_t, class index_t>
+__device__ __forceinline__ void fastAtomicAdd(
+    scalar_t* tensor,
+    index_t index,
+    const index_t numel,
+    scalar_t value,
+    bool fast_atomics) {
+  if (fast_atomics) {
+    fastSpecializedAtomicAdd(tensor, index, numel, value);
+  } else {
+    gpuAtomicAddNoReturn(tensor + index, value);
+  }
+}
+
+} // namespace native
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/LaunchUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/LaunchUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..746fa2c34ecf7232fead66384142f9454efb67b7
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/LaunchUtils.h
@@ -0,0 +1,18 @@
+#pragma once
+#include <algorithm>
+
+namespace at {
+namespace native {
+
+// returns 2**floor(log2(n))
+static int lastPow2(unsigned int n) {
+  n |= (n >> 1);
+  n |= (n >> 2);
+  n |= (n >> 4);
+  n |= (n >> 8);
+  n |= (n >> 16);
+  return std::max<int>(1, n - (n >> 1));
+}
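+// Worked example (illustrative): lastPow2(20) smears 10100b into 11111b (31) and
+// returns std::max<int>(1, 31 - 15) = 16 = 2**floor(log2(20)); lastPow2(0) returns 1.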
+
+} // namespace native
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Loops.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Loops.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..741d31711e90669c12a7e163c76ea8eb1ba78027
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Loops.cuh
@@ -0,0 +1,326 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include 
+
+
+namespace at { namespace native {
+
+template<int N>
+static OffsetCalculator<N> make_input_offset_calculator(const TensorIteratorBase& iter) {
+  // array size can not be 0, this happens when N == 0
+  constexpr int array_size = std::max(N, 1);
+  TORCH_INTERNAL_ASSERT(N == iter.ntensors() - iter.noutputs());
+  std::array<const int64_t*, array_size> strides;
+  int64_t element_sizes[array_size];
+  for (int i = 0; i < N; i++) {
+    strides[i] = iter.strides(i + iter.noutputs()).data();
+    element_sizes[i] = iter.element_size(i + iter.noutputs());
+  }
+  return OffsetCalculator<N>(iter.ndim(), iter.shape().data(), strides.data(), element_sizes);
+}
+
+template <int num_outputs = 1>
+static OffsetCalculator<num_outputs> make_output_offset_calculator(const TensorIteratorBase& iter) {
+  TORCH_INTERNAL_ASSERT(num_outputs == iter.noutputs());
+  std::array<const int64_t*, num_outputs> strides;
+  int64_t element_sizes[num_outputs];
+  for (int i = 0; i < num_outputs; i++) {
+    strides[i] = iter.strides(i).data();
+    element_sizes[i] = iter.element_size(i);
+  }
+  return OffsetCalculator<num_outputs>(iter.ndim(), iter.shape().data(), strides.data(), element_sizes);
+}
+
+template<typename func_t, typename policy_t>
+__device__ inline void elementwise_kernel_helper(func_t f, policy_t policy) {
+  using traits = function_traits<func_t>;
+  using return_t = typename traits::result_type;
+  using args_t = typename traits::ArgsTuple;
+
+  int idx = blockIdx.x;
+
+  return_t results[thread_work_size()];
+  args_t args[thread_work_size()];
+
+  // load
+  policy.load(args, idx);
+
+  // compute
+  #pragma unroll
+  for (int i = 0; i < thread_work_size(); i++) {
+    if (policy.check_inbounds(i)) {
+      results[i] = c10::guts::apply(f, args[i]);
+    }
+  }
+
+  // store
+  policy.store(results, idx);
+}
+
+}}  // namespace at::native
+
+#include 
+
+namespace at::native {
+
+template <typename func_t>
+void gpu_kernel_nocast(TensorIteratorBase& iter, const func_t& f) {
+
+  for (int arg = 0; arg < iter.ntensors(); arg++) {
+    TORCH_INTERNAL_ASSERT(
+      iter.device(arg).is_cuda(),
+      "argument ", arg, ": expected a CUDA device but found ", iter.device(arg));
+  }
+
+  if (iter.numel() == 0) {
+    return;
+  }
+
+  if (!iter.can_use_32bit_indexing()) {
+    for (auto& sub_iter : iter.with_32bit_indexing()) {
+      gpu_kernel_nocast(sub_iter, f);
+    }
+    return;
+  }
+
+  gpu_kernel_impl_nocast(iter, f);
+}
+
+template <typename func_t>
+void gpu_kernel(TensorIteratorBase& iter, const func_t& f) {
+
+  for (int arg = 0; arg < iter.ntensors(); arg++) {
+    TORCH_INTERNAL_ASSERT(
+      iter.device(arg).is_cuda(),
+      "argument ", arg, ": expected a CUDA device but found ", iter.device(arg));
+  }
+
+  if (iter.numel() == 0) {
+    return;
+  }
+
+  if (!iter.can_use_32bit_indexing()) {
+    for (auto& sub_iter : iter.with_32bit_indexing()) {
+      gpu_kernel(sub_iter, f);
+    }
+    return;
+  }
+
+  gpu_kernel_impl(iter, f);
+}
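+// Illustrative call-site sketch (an assumption, not part of this header): a typical
+// elementwise op builds a TensorIterator and hands gpu_kernel a device-callable lambda,
+//
+//   at::TensorIterator iter = at::TensorIterator::binary_op(out, a, b);
+//   gpu_kernel(iter, [] GPU_LAMBDA (float x, float y) -> float { return x + y; });
+//
+// where GPU_LAMBDA marks the lambda __host__ __device__ so func_t is usable on device.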
+
+template<typename arg1_t, typename arg2_t, typename return_t, typename func_t>
+struct AUnaryFunctor {
+  using traits = function_traits<func_t>;
+  using opmath_arg1_t = typename traits::template arg<0>::type;
+  __device__ return_t operator()(arg2_t b) const {
+    return f(a, b);
+  }
+  // NB: scalar is stored in higher precision!
+  AUnaryFunctor(func_t f_, opmath_arg1_t a_): f(f_), a(a_) {}
+  private:
+    func_t f;
+    opmath_arg1_t a;
+};
+
+template<typename arg1_t, typename arg2_t, typename return_t, typename func_t>
+struct BUnaryFunctor {
+  using traits = function_traits<func_t>;
+  using opmath_arg2_t = typename traits::template arg<1>::type;
+  __device__ return_t operator()(arg1_t a) const {
+    return f(a, b);
+  }
+  // NB: scalar is stored in higher precision!
+  BUnaryFunctor(func_t f_, opmath_arg2_t b_): f(f_), b(b_) {}
+  private:
+    func_t f;
+    opmath_arg2_t b;
+};
+
+// Though seemingly noop, this inserts casts from arg1_t to func_t's type
+// (which may be higher precision), as well as casts to return_t
+template <typename arg1_t, typename arg2_t, typename return_t, typename func_t>
+struct BinaryFunctor {
+  __device__ return_t operator()(arg1_t a, arg2_t b) const {
+    return f(a, b);
+  }
+  BinaryFunctor(func_t f_): f(f_) {}
+  private:
+    func_t f;
+};
+
+// Unlike gpu_kernel_with_scalars, this allows you to pass a func_t which
+// accepts inputs at higher precision (typically opmath_t), but then
+// ensure that we load from memory at the correct precision (scalar_t)
+// to avoid expensive loads.  For the whole sordid story see
+// https://dev-discuss.pytorch.org/t/cuda-loops-case-study-code-generation-vs-templates/302
+template <typename arg1_t, typename arg2_t = arg1_t, typename return_t = arg1_t, typename func_t>
+void opmath_gpu_kernel_with_scalars(TensorIteratorBase& iter, const func_t& f) {
+  TORCH_INTERNAL_ASSERT(iter.ntensors() == 3);
+
+  using traits = function_traits<func_t>;
+  using opmath_arg1_t = typename traits::template arg<0>::type;
+  using opmath_arg2_t = typename traits::template arg<1>::type;
+  static_assert(
+      traits::arity == 2,
+      "gpu_kernel_with_scalars only supports two input arguments");
+
+  if (iter.is_cpu_scalar(1)) {
+    AUnaryFunctor<arg1_t, arg2_t, return_t, func_t> af(f, iter.scalar_value<opmath_arg1_t>(1));
+    iter.remove_operand(1);
+    // TODO: When all kernels that use gpu_kernel_with_scalars are
+    // ported to structured, this device guard can be deleted.  This
+    // works around incorrect device guard generation for pre-structured
+    // kernels device guards, but structured kernels do it right and
+    // we can assume the device is already set correctly
+    const OptionalDeviceGuard device_guard(iter.device(1));
+    gpu_kernel(iter, af);
+  } else if (iter.is_cpu_scalar(2)) {
+    BUnaryFunctor<arg1_t, arg2_t, return_t, func_t> bf(f, iter.scalar_value<opmath_arg2_t>(2));
+    iter.remove_operand(2);
+    gpu_kernel(iter, bf);
+  } else {
+    gpu_kernel(iter, BinaryFunctor<arg1_t, arg2_t, return_t, func_t>(f));
+  }
+}
+
+template <typename scalar_t, typename return_t = scalar_t, typename func_t>
+void opmath_symmetric_gpu_kernel_with_scalars(TensorIteratorBase& iter, const func_t& f) {
+  // Use symmetric property of the functor to reduce number of kernels,
+  // requires f(a, b) == f(b, a)
+  TORCH_INTERNAL_ASSERT(iter.ntensors() == 3);
+
+  using traits = function_traits<func_t>;
+  using opmath_arg_t = typename traits::template arg<0>::type;
+  static_assert(
+      traits::arity == 2,
+      "gpu_kernel_with_scalars only supports two input arguments");
+  static_assert(std::is_same<opmath_arg_t, typename traits::template arg<1>::type>::value,
+                "f is not symmetric");
+
+  OptionalDeviceGuard device_guard;
+  opmath_arg_t scalar_val{};
+
+  if (iter.is_cpu_scalar(1)) {
+    scalar_val = iter.scalar_value(1);
+    iter.remove_operand(1);
+
+    // TODO: When all kernels that use gpu_kernel_with_scalars are
+    // ported to structured, this device guard can be deleted.  This
+    // works around incorrect device guard generation for pre-structured
+    // kernels device guards, but structured kernels do it right and
+    // we can assume the device is already set correctly
+    device_guard.reset_device(iter.device(1));
+  } else if (iter.is_cpu_scalar(2)) {
+    scalar_val = iter.scalar_value(2);
+    iter.remove_operand(2);
+  }
+
+  if (iter.ninputs() == 2) {
+    gpu_kernel(iter, BinaryFunctor<scalar_t, scalar_t, return_t, func_t>(f));
+  } else {
+    AUnaryFunctor<scalar_t, scalar_t, return_t, func_t> unary_f(f, scalar_val);
+    gpu_kernel(iter, unary_f);
+  }
+}
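+// Illustrative call-site sketch (an assumption modeled on the binary arithmetic
+// kernels; MulFunctor stands in for any functor with f(a, b) == f(b, a)):
+//
+//   using opmath_t = at::opmath_type<scalar_t>;
+//   opmath_symmetric_gpu_kernel_with_scalars<scalar_t>(iter, MulFunctor<opmath_t>());
+//
+// The functor computes in opmath_t while loads and stores stay in scalar_t.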
+
+// Legacy variant that assumes that func_t has the correct types
+// that we expect to load from memory
+template <typename func_t>
+void gpu_kernel_with_scalars(TensorIteratorBase& iter, const func_t& f) {
+  using traits = function_traits<func_t>;
+  static_assert(
+      traits::arity == 2,
+      "gpu_kernel_with_scalars only supports two input arguments");
+  using arg1_t = typename traits::template arg<0>::type;
+  using arg2_t = typename traits::template arg<1>::type;
+  using return_t = typename traits::result_type;
+  opmath_gpu_kernel_with_scalars<arg1_t, arg2_t, return_t>(iter, f);
+}
+
+namespace { // functions for `gpu_kernel_multiple_outputs`.
+
+// check the return type is `thrust::tuple`, not `std::tuple`.
+template <typename T> struct is_tuple: std::false_type {};
+
+template <typename ...T> struct is_tuple<thrust::tuple<T...>>: std::true_type {};
+
+template <int num_outputs, typename func_t, typename array_t, typename inp_calc_t, typename out_calc_t>
+C10_LAUNCH_BOUNDS_1(num_threads())
+__global__ void unrolled_elementwise_kernel_for_multi_outputs(int N, func_t f, array_t data, inp_calc_t ic, out_calc_t oc) {
+  int remaining = N - block_work_size() * blockIdx.x;
+  elementwise_kernel_helper(f, memory::policies::multi_outputs_unroll<array_t, inp_calc_t, out_calc_t, num_outputs>(data, remaining, ic, oc));
+}
+
+template <int num_outputs, typename func_t, typename array_t, typename inp_calc_t, typename out_calc_t>
+static inline void launch_unrolled_kernel_for_multi_outputs(int64_t N, const func_t& f, array_t data, inp_calc_t ic, out_calc_t oc) {
+  TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits<int32_t>::max());
+  int64_t grid = (N + block_work_size() - 1) / block_work_size();
+  auto stream = at::cuda::getCurrentCUDAStream();
+  unrolled_elementwise_kernel_for_multi_outputs<num_outputs, func_t, array_t><<<grid, num_threads(), 0, stream>>>(N, f, data, ic, oc);
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+}
+
+template <typename func_t>
+void gpu_kernel_multiple_outputs_impl(TensorIteratorBase& iter, const func_t& f) {
+  using traits = function_traits<func_t>;
+  using output_t = typename traits::result_type;
+  static_assert(is_tuple<output_t>::value, "f's return type must be `thrust::tuple`");
+  constexpr int num_outputs = thrust::tuple_size<output_t>::value;
+  constexpr int num_inputs = traits::arity;
+  constexpr int ntensors = num_outputs + num_inputs;
+
+  TORCH_INTERNAL_ASSERT(iter.can_use_32bit_indexing());
+  TORCH_INTERNAL_ASSERT(iter.ntensors() == ntensors);
+
+  at::detail::Array<char*, ntensors> data;
+  for (int i = 0; i < ntensors; i++) {
+    data[i] = (char*)iter.data_ptr(i);
+  }
+
+  int64_t numel = iter.numel();
+
+  if (iter.is_contiguous()) {
+    auto input_calc = TrivialOffsetCalculator<traits::arity>();
+    auto output_calc = TrivialOffsetCalculator<num_outputs>();
+    launch_unrolled_kernel_for_multi_outputs<num_outputs>(numel, f, data, input_calc, output_calc);
+  } else {
+    auto input_calc = make_input_offset_calculator<traits::arity>(iter);
+    auto output_calc = make_output_offset_calculator<num_outputs>(iter);
+    launch_unrolled_kernel_for_multi_outputs<num_outputs>(numel, f, data, input_calc, output_calc);
+  }
+}
+} // namespace
+
+template <typename func_t>
+void gpu_kernel_multiple_outputs(TensorIteratorBase& iter, const func_t& f) {
+  ASSERT_HOST_DEVICE_LAMBDA(func_t);
+
+  for (int arg = 0; arg < iter.ntensors(); arg++) {
+    TORCH_INTERNAL_ASSERT(iter.device(arg).is_cuda());
+  }
+
+  if (iter.numel() == 0) {
+    return;
+  }
+
+  if (!iter.can_use_32bit_indexing()) {
+    for (auto& sub_iter : iter.with_32bit_indexing()) {
+      gpu_kernel_multiple_outputs(sub_iter, f);
+    }
+    return;
+  }
+
+  gpu_kernel_multiple_outputs_impl(iter, f);
+}
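+// Illustrative call-site sketch (an assumption): with `iter` configured for two outputs
+// and one input, the functor returns one thrust::tuple element per output, e.g.
+//
+//   gpu_kernel_multiple_outputs(iter,
+//       [] GPU_LAMBDA (float x) -> thrust::tuple<float, float> {
+//         return thrust::make_tuple(::sinf(x), ::cosf(x));
+//       });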
+
+} //namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Math.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Math.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..4e6effa0247e25d55d37238d7718ba47c2362713
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Math.cuh
@@ -0,0 +1,3375 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+namespace native {
+// See note [Jiterator]
+// TODO: elaborate in this comment on the structure of math.cuh
+#if AT_USE_JITERATOR()
+
+const auto ndtri_string = jiterator_stringify(
+  /*
+  * This function is derived from the implementation of the digamma function in the Cephes Math Library.
+  * See note [3-Clause BSD License for the Cephes Math Library].
+  *
+  * Evaluates polynomial of degree N:
+  *
+  *                     2          N
+  * y  =  C  + C x + C x  +...+ C x
+  *        0    1     2          N
+  *
+  * Coefficients are stored in reverse order:
+  *
+  * coef[0] = C  , ..., coef[N] = C  .
+  *            N                   0
+  */
+  template <typename T>
+  T polevl(const T x, const T A[], const int len) {
+    // NOTE: This `polevl` is different from other `polevl`
+    // implementation (in PyTorch) which expect the `len` to be
+    // `len(A) - 1` instead of `len(A)`.
+    T result = 0;
+    for (int i = 0; i < len; ++i) {
+      result = result * x + A[i];
+    }
+    return result;
+  }
+
+  /*
+  * This function is derived from the implementation of the i1e function in the Cephes Math Library.
+  * See note [3-Clause BSD License for the Cephes Math Library].
+  *
+  * Computes the argument, x, for which the area under the Gaussian probability density function
+  * (integrated from minus infinity to x) is equal to y.
+  */
+  template <typename T>
+  T ndtri(T y0) {
+
+    constexpr T zero = 0;
+    constexpr T one = 1;
+
+    // Handles special cases
+    if (y0 == zero) {
+      return NEG_INFINITY;
+    }
+    if (y0 == one) {
+      return POS_INFINITY;
+    }
+    if (y0 < zero || y0 > one) {
+      return NAN;
+    }
+
+    bool code = true;
+    T y = y0;
+    // Note: the constant 0.135... is equal to exp(-2)
+    if (y > one - T{0.13533528323661269189}) {
+      y = one - y;
+      code = false;
+    }
+
+    if (y > T{0.13533528323661269189}) {
+      /* approximation for 0 <= |y - 0.5| <= 3/8 */
+      static const T P0[5] = {
+          -5.99633501014107895267E1,
+          9.80010754185999661536E1,
+          -5.66762857469070293439E1,
+          1.39312609387279679503E1,
+          -1.23916583867381258016E0,
+      };
+
+      static const T Q0[9] = {
+        1.00000000000000000000E0,
+        1.95448858338141759834E0,
+        4.67627912898881538453E0,
+        8.63602421390890590575E1,
+        -2.25462687854119370527E2,
+        2.00260212380060660359E2,
+        -8.20372256168333339912E1,
+        1.59056225126211695515E1,
+        -1.18331621121330003142E0,
+      };
+
+      /* sqrt(2pi) */
+      constexpr T s2pi = 2.50662827463100050242E0;
+
+      y = y - T{0.5};
+      const T y2 = y * y;
+      T x = y + y * (y2 * polevl(y2, P0, int{5}) / polevl(y2, Q0, int{9}));
+      return x * s2pi;
+    }
+
+    T x = sqrt(T{-2.} * log(y));
+    const T x0 = x - (log(x) / x);
+
+    const T z = one / x;
+    T x1;
+
+    /* y > exp(-32) = 1.2664165549e-14 */
+    if (x < T{8.0}) {
+      /* Approximation for interval z = sqrt(-2 log y ) between 2 and 8
+      * i.e., y between exp(-2) = .135 and exp(-32) = 1.27e-14.
+      */
+      static const T P1[9] = {
+        4.05544892305962419923E0,
+        3.15251094599893866154E1,
+        5.71628192246421288162E1,
+        4.40805073893200834700E1,
+        1.46849561928858024014E1,
+        2.18663306850790267539E0,
+        -1.40256079171354495875E-1,
+        -3.50424626827848203418E-2,
+        -8.57456785154685413611E-4,
+      };
+
+      static const T Q1[9] = {
+        1.00000000000000000000E0,
+        1.57799883256466749731E1,
+        4.53907635128879210584E1,
+        4.13172038254672030440E1,
+        1.50425385692907503408E1,
+        2.50464946208309415979E0,
+        -1.42182922854787788574E-1,
+        -3.80806407691578277194E-2,
+        -9.33259480895457427372E-4,
+      };
+
+      x1 = z * polevl(z, P1, int{9}) / polevl(z, Q1, int{9});
+    } else {
+      /* Approximation for interval z = sqrt(-2 log y ) between 8 and 64
+      * i.e., y between exp(-32) = 1.27e-14 and exp(-2048) = 3.67e-890.
+      */
+      static const T P2[9] = {
+        3.23774891776946035970E0,
+        6.91522889068984211695E0,
+        3.93881025292474443415E0,
+        1.33303460815807542389E0,
+        2.01485389549179081538E-1,
+        1.23716634817820021358E-2,
+        3.01581553508235416007E-4,
+        2.65806974686737550832E-6,
+        6.23974539184983293730E-9,
+      };
+
+      static const T Q2[9] = {
+        1.00000000000000000000E0,
+        6.02427039364742014255E0,
+        3.67983563856160859403E0,
+        1.37702099489081330271E0,
+        2.16236993594496635890E-1,
+        1.34204006088543189037E-2,
+        3.28014464682127739104E-4,
+        2.89247864745380683936E-6,
+        6.79019408009981274425E-9,
+      };
+
+      x1 = z * polevl(z, P2, int{9}) / polevl(z, Q2, int{9});
+    }
+
+    x = x0 - x1;
+    return (!code) ? x : -x;
+  }
+); // ndtri_string
+
+const auto log_ndtr_string = jiterator_stringify(
+  template <typename T>
+  T log_ndtr(T x) {
+    constexpr T SQRT1_2{0.707106781186547524400844362104849039};   // 1/sqrt(2)
+    T t = x * SQRT1_2;
+    if (x < T{-1.0}) {
+      return log(erfcx(-t) / 2) - t * t;
+    } else {
+      return log1p(-erfc(t) / 2);
+    }
+  }
+); // log_ndtr_string
+
+const auto gcd_string = jiterator_stringify(
+  template <typename T>
+  T gcd(const T a_in, const T b_in) {
+    T a = abs(a_in);
+    T b = abs(b_in);
+
+    while (a != T{0}) {
+      T c = a;
+      a = b % a;
+      b = c;
+    }
+
+    return b;
+  }
+); // gcd_string
+
+const auto lcm_string = jiterator_stringify(
+  template <typename T>
+  T gcd(const T a_in, const T b_in) {
+    T a = abs(a_in);
+    T b = abs(b_in);
+
+    while (a != T{0}) {
+      T c = a;
+      a = b % a;
+      b = c;
+    }
+
+    return b;
+  }
+
+  template <typename T>
+  T lcm(const T a, const T b) {
+    T g = gcd(a, b);
+    return (g == T{0}) ? T{0} : abs(a / g * b);
+  }
+); // lcm_string
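+// Worked example (illustrative): gcd(12, 18) iterates (a, b): (12, 18) -> (6, 12) -> (0, 6)
+// and returns 6, so lcm(12, 18) = |12 / 6 * 18| = 36; lcm returns 0 if either input is 0.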
+
+/*
+ * For licensing information, please refer to the cpu implementation located in "ATen/native/Math.h".
+ */
+// [C++ Standard Reference: Gamma Function] https://en.cppreference.com/w/cpp/numeric/math/tgamma
+const auto digamma_string = jiterator_stringify(
+  template <typename T>
+  T digamma(T x) {
+    static const double PI_f64 = 3.14159265358979323846;
+
+    // Short-circuits if x is +/- 0 and returns -/+ ∞ per the C++ standard
+    if (x == 0) {
+      return copysign(POS_INFINITY, -x);
+    }
+
+    T result = 0;
+    if (x < 0) {
+      // Short-circuits if x is a negative integer and returns NaN
+      //   per the C++ standard
+      const bool x_is_integer = (x == trunc(x));
+      if (x_is_integer) {
+        return NAN;
+      }
+
+      // Extracts the fractional part of x as r, since tan(pi * r) is more numerically
+      // accurate than tan(pi * x). While these operations are mathematically equivalent
+      // since both x and r are in radians and tan() has a periodicity of pi, in practice
+      // the computation of pi * x is a source of error (when |x| > 1).
+      double q, r;
+      r = modf(static_cast<double>(x), &q);
+      result = - PI_f64 / tan(PI_f64 * r);
+      x = 1 - x;
+    }
+
+    while (x < T{10}) {
+      result -= T{1} / x;
+      x += T{1};
+    }
+
+    if (x == T{10}) {
+      return result + T{2.25175258906672110764};
+    }
+
+    T y = 0;
+    if (x < T{1.0e17}) {
+      const T A[] = {
+        8.33333333333333333333E-2,
+        -2.10927960927960927961E-2,
+        7.57575757575757575758E-3,
+        -4.16666666666666666667E-3,
+        3.96825396825396825397E-3,
+        -8.33333333333333333333E-3,
+        8.33333333333333333333E-2,
+      };
+
+
+      T z = T{1} / (x * x);
+
+      T polevl_result = 0;
+      for (int i = 0; i <= 6; i++) {
+        polevl_result = polevl_result * z + A[i];
+      }
+      y = z * polevl_result;
+    }
+
+    return log(x) - (T{0.5} / x) - y + result;
+  }
+); // digamma_string
+
+/*
+ * This function is derived from the implementation of the zeta function in the Cephes Math Library.
+ * See note [3-Clause BSD License for the Cephes Math Library].
+ */
+const auto zeta_string = jiterator_stringify(
+  template <typename T>
+  T zeta(T x, T q) {
+    const T MACHEP{1.11022302462515654042E-16};
+    constexpr T zero{0};
+    constexpr T half{0.5};
+    constexpr T one{1};
+    static const T A[] = {
+        12.0,
+        -720.0,
+        30240.0,
+        -1209600.0,
+        47900160.0,
+        -1.8924375803183791606e9, /*1.307674368e12/691*/
+        7.47242496e10,
+        -2.950130727918164224e12, /*1.067062284288e16/3617*/
+        1.1646782814350067249e14, /*5.109094217170944e18/43867*/
+        -4.5979787224074726105e15, /*8.028576626982912e20/174611*/
+        1.8152105401943546773e17, /*1.5511210043330985984e23/854513*/
+        -7.1661652561756670113e18 /*1.6938241367317436694528e27/236364091*/
+    };
+
+    int i = 0;
+    T a, b, k, s, t, w;
+
+    // Short-circuits x -> +infty
+    if (x == one) {
+      return POS_INFINITY;
+    }
+
+    // Short-circuits x < 1 -> NaN
+    if (x < one) {
+      return NAN;
+    }
+
+    // Short-circuits negative q integers map to +infty,
+    //   negative q non-integers map to NaN
+    if (q <= zero) {
+      if (q == floor(q)) {
+        return POS_INFINITY;
+      }
+      if (x != floor(x)) {
+        return NAN;
+      }
+    }
+
+    s = pow(q, -x);
+    a = q;
+    i = 0;
+    b = zero;
+    while ((i < 9) || (a <= T{9.0})) {
+      i += 1;
+      a += one;
+      b = pow(a, -x);
+      s += b;
+      if ((-MACHEP * s < b) && (b < MACHEP * s)) {
+        return s;
+      }
+    };
+
+    w = a;
+    s += b * w / (x - one);
+    s -= half * b;
+    a = one;
+    k = zero;
+    for (int i = 0; i < 12; i++) {
+      a *= x + k;
+      b /= w;
+      t = a * b / A[i];
+      s = s + t;
+      t = fabs(t / s);
+
+      if (t < MACHEP) {
+        return s;
+      }
+
+      k += one;
+      a *= x + k;
+      b /= w;
+      k += one;
+    }
+
+    return s;
+  }
+); // zeta_string
+
+const auto trigamma_string = jiterator_stringify(
+  template <typename T>
+  T trigamma(T x) {
+    const T PI{3.14159265358979323846};
+    T sign = 1;
+    T result = 0;
+
+    if (x < T{0.5}) {
+      sign = -1;
+      T sin_pi_x = sin(PI * x);
+      result -= (PI * PI) / (sin_pi_x * sin_pi_x);
+      x = 1 - x;
+    }
+
+    for (int i = 0; i < 6; ++i) {
+      result += T{1} / (x * x);
+      x += 1;
+    }
+
+    const T one{1};
+    const T ixx = one / (x*x);
+    result += (one + one / (T{2}*x) + ixx * (one/T{6} - ixx * (one/T{30} - ixx * (one/T{42})))) / x;
+    return sign * result;
+}
+); // trigamma_string
+
+const auto lgamma_string = jiterator_stringify(
+  template <typename T>
+  T lgamma_kernel(T a) {
+    return lgamma(a);
+  }
+); // lgamma_string
+
+const auto polygamma_string = zeta_string + jiterator_stringify(
+  template <typename T>
+  T polygamma(T x, int n) {
+    // already blocked if n <= 1
+    const auto one = T{1};
+    return ((n % 2) ? one : -one) * exp(lgamma(static_cast<T>(n) + one)) *
+        zeta(static_cast<T>(n + 1), x);
+  }
+); // polygamma_string
+
+const auto exp2_string = jiterator_stringify(
+  template <typename T>
+  T exp2_impl(T a) {
+    return exp2(a);
+  }
+
+  namespace std { template <typename T> class complex; }
+  template <typename T>
+  std::complex<T> exp2_impl(std::complex<T> x) {
+    // There is no std::exp2 overload for complex, so instead
+    // use the identity 2^x = e^(ln(2) * x)
+    const auto ln_2 = static_cast<T>(0.693147180559945309417232121458176);
+    return exp(ln_2 * x);
+  }
+
+  template <typename T>
+  T exp2_kernel(T a) {
+    return exp2_impl(a);
+  }
+); // exp2_string
+
+const auto erfc_string = jiterator_stringify(
+  template <typename T>
+  T erfc_kernel(T a) {
+    return erfc(a);
+  }
+); // erfc_string
+
+const auto erfinv_string = jiterator_stringify(
+  template <typename T>
+  T erfinv_kernel(T a) {
+    return erfinv(a);
+  }
+); // erfinv_string
+
+const auto entr_string = jiterator_stringify(
+  template <typename T>
+  T entr(T a) {
+    if (a != a) {
+      return a;
+    }
+
+    if (a > 0) {
+      return -a * log(a);
+    }
+
+    if (a == 0) {
+      return 0;
+    }
+
+    return NEG_INFINITY;
+  }
+); // entr_string
+
+// NOTE: `kaiser_window_string` depends on `i0_string`
+//       for its implementation.
+const auto i0_string = jiterator_stringify(
+  template<typename T>
+  T chbevl(T x, const T array[], const int len) {
+
+      T b0, b1, b2;
+
+      b0 = array[0];
+      b1 = 0;
+
+      for (int i = 1; i < len; ++i)  {
+          b2 = b1;
+          b1 = b0;
+          b0 = x * b1 - b2 + array[i];
+      }
+
+      return T{0.5} * (b0 - b2);
+  }
+
+  template<typename T>
+  T i0(T _x) {
+      T x = fabs(_x);
+
+      if (x <= T{8.0}) {
+          /* Chebyshev coefficients for exp(-x) I0(x)
+          *   in the interval [0,8].
+          *
+          * lim(x->0){ exp(-x) I0(x) } = 1.
+          */
+          static const T A[] = {
+              -4.41534164647933937950E-18, 3.33079451882223809783E-17,
+              -2.43127984654795469359E-16, 1.71539128555513303061E-15,
+              -1.16853328779934516808E-14, 7.67618549860493561688E-14,
+              -4.85644678311192946090E-13, 2.95505266312963983461E-12,
+              -1.72682629144155570723E-11, 9.67580903537323691224E-11,
+              -5.18979560163526290666E-10, 2.65982372468238665035E-9,
+              -1.30002500998624804212E-8,  6.04699502254191894932E-8,
+              -2.67079385394061173391E-7,  1.11738753912010371815E-6,
+              -4.41673835845875056359E-6,  1.64484480707288970893E-5,
+              -5.75419501008210370398E-5,  1.88502885095841655729E-4,
+              -5.76375574538582365885E-4,  1.63947561694133579842E-3,
+              -4.32430999505057594430E-3,  1.05464603945949983183E-2,
+              -2.37374148058994688156E-2,  4.93052842396707084878E-2,
+              -9.49010970480476444210E-2,  1.71620901522208775349E-1,
+              -3.04682672343198398683E-1,  6.76795274409476084995E-1};
+
+          T y = (x / T{2.0}) - T{2.0};
+          return exp(x) * chbevl(y, A, int{30});
+      }
+
+      // Handles x > 8 case
+      /* Chebyshev coefficients for exp(-x) sqrt(x) I0(x)
+      * in the inverted interval [8,infinity].
+      *
+      * lim(x->inf){ exp(-x) sqrt(x) I0(x) } = 1/sqrt(2pi).
+      */
+      const T B[] = {
+          -7.23318048787475395456E-18, -4.83050448594418207126E-18,
+          4.46562142029675999901E-17,  3.46122286769746109310E-17,
+          -2.82762398051658348494E-16, -3.42548561967721913462E-16,
+          1.77256013305652638360E-15,  3.81168066935262242075E-15,
+          -9.55484669882830764870E-15, -4.15056934728722208663E-14,
+          1.54008621752140982691E-14,  3.85277838274214270114E-13,
+          7.18012445138366623367E-13,  -1.79417853150680611778E-12,
+          -1.32158118404477131188E-11, -3.14991652796324136454E-11,
+          1.18891471078464383424E-11,  4.94060238822496958910E-10,
+          3.39623202570838634515E-9,   2.26666899049817806459E-8,
+          2.04891858946906374183E-7,   2.89137052083475648297E-6,
+          6.88975834691682398426E-5,   3.36911647825569408990E-3,
+          8.04490411014108831608E-1};
+
+      return (exp(x) * chbevl(T{32.0} / x - T{2.0}, B, int{25})) / sqrt(x);
+  }
+); // i0_string
+
+const auto i1_string = jiterator_stringify(
+  template<typename T>
+  T chbevl(const T x, const T array[], const int len) {
+      T b0, b1, b2;
+
+      b0 = array[0];
+      b1 = 0;
+
+      for (int i = 1; i < len; ++i)  {
+          b2 = b1;
+          b1 = b0;
+          b0 = x * b1 - b2 + array[i];
+      }
+
+      return T{0.5} * (b0 - b2);
+  }
+
+  template <typename T>
+  T i1(T _x) {
+    const T x = fabs(_x);
+
+    if (x <= T{8.0}) {
+      // Chebyshev coefficients for exp(-x) i1(x) in the interval [0, 8]
+      //   lim(x->0){ exp(-x) i1(x) / x } = 1/2
+      static const T coefficients[] = {
+          2.77791411276104639959E-18, -2.11142121435816608115E-17,
+          1.55363195773620046921E-16, -1.10559694773538630805E-15,
+          7.60068429473540693410E-15, -5.04218550472791168711E-14,
+          3.22379336594557470981E-13, -1.98397439776494371520E-12,
+          1.17361862988909016308E-11, -6.66348972350202774223E-11,
+          3.62559028155211703701E-10, -1.88724975172282928790E-9,
+          9.38153738649577178388E-9,  -4.44505912879632808065E-8,
+          2.00329475355213526229E-7,  -8.56872026469545474066E-7,
+          3.47025130813767847674E-6,  -1.32731636560394358279E-5,
+          4.78156510755005422638E-5,  -1.61760815825896745588E-4,
+          5.12285956168575772895E-4,  -1.51357245063125314899E-3,
+          4.15642294431288815669E-3,  -1.05640848946261981558E-2,
+          2.47264490306265168283E-2,  -5.29459812080949914269E-2,
+          1.02643658689847095384E-1,  -1.76416518357834055153E-1,
+          2.52587186443633654823E-1};
+      const T y = x / T{2.0} - T{2.0};
+      const T out = exp(x) * x * chbevl(y, coefficients, int{29});
+      return (_x < T{0.0}) ? -out : out;
+    }
+
+    // Chebyshev coefficients for exp(-x) sqrt(x) i1(x)
+    //   in the inverted interval [8, infinity]
+    //   lim(x->inf){ exp(-x) sqrt(x) i1(x) } = 1/sqrt(2pi)
+    static const T coefficients[] = {
+      7.51729631084210481353E-18,  4.41434832307170791151E-18,
+      -4.65030536848935832153E-17, -3.20952592199342395980E-17,
+      2.96262899764595013876E-16,  3.30820231092092828324E-16,
+      -1.88035477551078244854E-15, -3.81440307243700780478E-15,
+      1.04202769841288027642E-14,  4.27244001671195135429E-14,
+      -2.10154184277266431302E-14, -4.08355111109219731823E-13,
+      -7.19855177624590851209E-13, 2.03562854414708950722E-12,
+      1.41258074366137813316E-11,  3.25260358301548823856E-11,
+      -1.89749581235054123450E-11, -5.58974346219658380687E-10,
+      -3.83538038596423702205E-9,  -2.63146884688951950684E-8,
+      -2.51223623787020892529E-7,  -3.88256480887769039346E-6,
+      -1.10588938762623716291E-4,  -9.76109749136146840777E-3,
+      7.78576235018280120474E-1};
+    const T out = (exp(x) * chbevl(T{32.} / x - T{2.}, coefficients, int{25})) / sqrt(x);
+    return (_x < T{0.}) ? -out : out;
+  }
+); // i1_string
+
+const auto i1e_string = jiterator_stringify(
+  template<typename T>
+  T chbevl(const T x, const T array[], const int len) {
+      T b0, b1, b2;
+
+      b0 = array[0];
+      b1 = 0;
+
+      for (int i = 1; i < len; ++i)  {
+          b2 = b1;
+          b1 = b0;
+          b0 = x * b1 - b2 + array[i];
+      }
+
+      return T{0.5} * (b0 - b2);
+  }
+
+  // See double and float instantiations below
+  template <typename T>
+  T i1e(T _x) { }
+
+  // Double specialization (uses different coefficients than the float version)
+  template<>
+  double i1e(double _x) {
+    const double x = fabs(_x);
+    if (x <= double{8.}) {
+      // Chebyshev double coefficients for exp(-x) i1(x) in the interval [0,8].
+      // Note: lim(x->0){ exp(-x) i1(x) / x } = 1/2.
+      static const double coefficients[] = {
+        2.77791411276104639959E-18, -2.11142121435816608115E-17,
+        1.55363195773620046921E-16, -1.10559694773538630805E-15,
+        7.60068429473540693410E-15, -5.04218550472791168711E-14,
+        3.22379336594557470981E-13, -1.98397439776494371520E-12,
+        1.17361862988909016308E-11, -6.66348972350202774223E-11,
+        3.62559028155211703701E-10, -1.88724975172282928790E-9,
+        9.38153738649577178388E-9,  -4.44505912879632808065E-8,
+        2.00329475355213526229E-7,  -8.56872026469545474066E-7,
+        3.47025130813767847674E-6,  -1.32731636560394358279E-5,
+        4.78156510755005422638E-5,  -1.61760815825896745588E-4,
+        5.12285956168575772895E-4,  -1.51357245063125314899E-3,
+        4.15642294431288815669E-3,  -1.05640848946261981558E-2,
+        2.47264490306265168283E-2,  -5.29459812080949914269E-2,
+        1.02643658689847095384E-1,  -1.76416518357834055153E-1,
+        2.52587186443633654823E-1};
+      const double y = x / double{2.} - double{2.};
+      const double out = chbevl(y, coefficients, int{29}) * x;
+      return (_x < 0.) ? -out : out;
+    }
+
+    // Chebyshev coefficients for exp(-x) sqrt(x) i1(x)
+    //   in the inverted interval (8, infinity].
+    // Note: lim(x->inf){ exp(-x) sqrt(x) i1(x) } = 1/sqrt(2pi).
+    // TODO: what's an "inverted interval"? Open on the left
+    //   and closed on the right?
+  static const double coefficients[] = {
+      7.51729631084210481353E-18,  4.41434832307170791151E-18,
+      -4.65030536848935832153E-17, -3.20952592199342395980E-17,
+      2.96262899764595013876E-16,  3.30820231092092828324E-16,
+      -1.88035477551078244854E-15, -3.81440307243700780478E-15,
+      1.04202769841288027642E-14,  4.27244001671195135429E-14,
+      -2.10154184277266431302E-14, -4.08355111109219731823E-13,
+      -7.19855177624590851209E-13, 2.03562854414708950722E-12,
+      1.41258074366137813316E-11,  3.25260358301548823856E-11,
+      -1.89749581235054123450E-11, -5.58974346219658380687E-10,
+      -3.83538038596423702205E-9,  -2.63146884688951950684E-8,
+      -2.51223623787020892529E-7,  -3.88256480887769039346E-6,
+      -1.10588938762623716291E-4,  -9.76109749136146840777E-3,
+      7.78576235018280120474E-1};
+
+    const double out = chbevl(double{32.} / x - double{2.}, coefficients, int{25}) / sqrt(x);
+    return (_x < double{0.}) ? -out : out;
+  }
+
+  // Float specialization (uses different coefficients than the double version)
+  template<>
+  float i1e(float _x) {
+    const float x = fabsf(_x);
+    if (x <= float{8.}) {
+      // Chebyshev double coefficients for exp(-x) i1(x) in the interval [0,8].
+      // Note: lim(x->0){ exp(-x) i1(x) / x } = 1/2.
+      static const float coefficients[] = {
+        9.38153738649577178388E-9f,
+        -4.44505912879632808065E-8f,
+        2.00329475355213526229E-7f,
+        -8.56872026469545474066E-7f,
+        3.47025130813767847674E-6f,
+        -1.32731636560394358279E-5f,
+        4.78156510755005422638E-5f,
+        -1.61760815825896745588E-4f,
+        5.12285956168575772895E-4f,
+        -1.51357245063125314899E-3f,
+        4.15642294431288815669E-3f,
+        -1.05640848946261981558E-2f,
+        2.47264490306265168283E-2f,
+        -5.29459812080949914269E-2f,
+        1.02643658689847095384E-1f,
+        -1.76416518357834055153E-1f,
+        2.52587186443633654823E-1f};
+      const float y = x / float{2.} - float{2.};
+      const float out = chbevl(y, coefficients, int{17}) * x;
+      return (_x < 0.) ? -out : out;
+    }
+
+    // Chebyshev coefficients for exp(-x) sqrt(x) i1(x)
+    //   in the inverted interval (8, infinity].
+    // Note: lim(x->inf){ exp(-x) sqrt(x) i1(x) } = 1/sqrt(2pi).
+    // TODO: what's an "inverted interval"? Open on the left
+    //   and closed on the right?
+  static const float coefficients[] = {
+      -3.83538038596423702205E-9f,
+      -2.63146884688951950684E-8f,
+      -2.51223623787020892529E-7f,
+      -3.88256480887769039346E-6f,
+      -1.10588938762623716291E-4f,
+      -9.76109749136146840777E-3f,
+      7.78576235018280120474E-1f};
+
+    const float out = chbevl(float{32.} / x - float{2.}, coefficients, int{7}) / sqrt(x);
+    return (_x < float{0.}) ? -out : out;
+  }
+); // i1e_string
+
+const auto kaiser_window_string = i0_string + jiterator_stringify(
+  template <typename T>
+  T kaiser_window(T a, T inv_alpha, T beta, T inv_i0_beta) {
+    T x = a * inv_alpha - T{1};
+    T y = max(T{0}, T{1} - x * x);
+    return i0(beta * sqrt(y)) * inv_i0_beta;
+  }
+); // kaiser_window_string
+
+const auto sinc_string = jiterator_stringify(
+  template <typename T>
+  T sinc(T a) {
+    if (a == T(0)) {
+      return T(1);
+    } else {
+      constexpr T pi = T(3.14159265358979323846L);
+      T product = pi * a;
+      return std::sin(product) / product;
+    }
+  }
+); // sinc_string
+
+const auto erfcx_string = jiterator_stringify(
+  /* The next function is taken from http://ab-initio.mit.edu/Faddeev */
+
+  /* Copyright (c) 2012 Massachusetts Institute of Technology
+  *
+  * Permission is hereby granted, free of charge, to any person obtaining
+  * a copy of this software and associated documentation files (the
+  * "Software"), to deal in the Software without restriction, including
+  * without limitation the rights to use, copy, modify, merge, publish,
+  * distribute, sublicense, and/or sell copies of the Software, and to
+  * permit persons to whom the Software is furnished to do so, subject to
+  * the following conditions:
+  *
+  * The above copyright notice and this permission notice shall be
+  * included in all copies or substantial portions of the Software.
+  *
+  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+  * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+  * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+  * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+  */
+
+  /* erfcx(x) = exp(x^2) erfc(x) function, for real x, written by
+    Steven G. Johnson, October 2012.
+
+    This function combines a few different ideas.
+
+    First, for x > 50, it uses a continued-fraction expansion (same as
+    for the Faddeeva function, but with algebraic simplifications for z=i*x).
+
+    Second, for 0 <= x <= 50, it uses Chebyshev polynomial approximations,
+    but with two twists:
+
+        a) It maps x to y = 4 / (4+x) in [0,1].  This simple transformation,
+          inspired by a similar transformation in the octave-forge/specfun
+          erfcx by Soren Hauberg, results in much faster Chebyshev convergence
+          than other simple transformations I have examined.
+
+        b) Instead of using a single Chebyshev polynomial for the entire
+          [0,1] y interval, we break the interval up into 100 equal
+          subintervals, with a switch/lookup table, and use much lower
+          degree Chebyshev polynomials in each subinterval. This greatly
+          improves performance in my tests.
+
+    For x < 0, we use the relationship erfcx(-x) = 2 exp(x^2) - erfcx(x),
+    with the usual checks for overflow etcetera.
+
+    Performance-wise, it seems to be substantially faster than either
+    the SLATEC DERFC function [or an erfcx function derived therefrom]
+    or Cody's CALERF function (from netlib.org/specfun), while
+    retaining near machine precision in accuracy.
+  */
+
+  /* Given y100 = 100 * y, where y = 4 / (4 + x) for x >= 0, compute erfc(x).
+
+    Uses a look-up table of 100 different Chebyshev polynomials
+    for y intervals [0,0.01], [0.01,0.02], ...., [0.99,1], generated
+    with the help of Maple and a little shell script.   This allows
+    the Chebyshev polynomials to be of significantly lower degree (about 1/4)
+    compared to fitting the whole [0,1] interval with a single polynomial.
+  */
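+
+  /* Editorial worked example of the mapping described above:
+       y    = 4 / (4 + x)            for x >= 0, so y lies in (0, 1]
+       y100 = 100 * y = 400 / (4 + x)
+       j    = floor(y100)            picks the subinterval [j/100, (j+1)/100)
+       t    = 2*y100 - (2*j + 1)     rescales y100 onto [-1, 1] inside that subinterval,
+     which is exactly the "T t = 2*y100 - (2*j + 1)" statement in each switch case of
+     erfcx_y100 below, and T{400} / (T{4} + x) is how erfcx passes y100 in.
+  */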
+
+  // TODO: review if this is computing in double when given a float input
+  template <typename T>
+  T erfcx_y100(T y100) {
+    switch (static_cast<int>(y100)) {
+      case 0: {
+      T t = 2*y100 - 1;
+      return 0.70878032454106438663e-3 + (0.71234091047026302958e-3 + (0.35779077297597742384e-5 + (0.17403143962587937815e-7 + (0.81710660047307788845e-10 + (0.36885022360434957634e-12 + 0.15917038551111111111e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 1: {
+      T t = 2*y100 - 3;
+      return 0.21479143208285144230e-2 + (0.72686402367379996033e-3 + (0.36843175430938995552e-5 + (0.18071841272149201685e-7 + (0.85496449296040325555e-10 + (0.38852037518534291510e-12 + 0.16868473576888888889e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 2: {
+      T t = 2*y100 - 5;
+      return 0.36165255935630175090e-2 + (0.74182092323555510862e-3 + (0.37948319957528242260e-5 + (0.18771627021793087350e-7 + (0.89484715122415089123e-10 + (0.40935858517772440862e-12 + 0.17872061464888888889e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 3: {
+      T t = 2*y100 - 7;
+      return 0.51154983860031979264e-2 + (0.75722840734791660540e-3 + (0.39096425726735703941e-5 + (0.19504168704300468210e-7 + (0.93687503063178993915e-10 + (0.43143925959079664747e-12 + 0.18939926435555555556e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 4: {
+      T t = 2*y100 - 9;
+      return 0.66457513172673049824e-2 + (0.77310406054447454920e-3 + (0.40289510589399439385e-5 + (0.20271233238288381092e-7 + (0.98117631321709100264e-10 + (0.45484207406017752971e-12 + 0.20076352213333333333e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 5: {
+      T t = 2*y100 - 11;
+      return 0.82082389970241207883e-2 + (0.78946629611881710721e-3 + (0.41529701552622656574e-5 + (0.21074693344544655714e-7 + (0.10278874108587317989e-9 + (0.47965201390613339638e-12 + 0.21285907413333333333e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 6: {
+      T t = 2*y100 - 13;
+      return 0.98039537275352193165e-2 + (0.80633440108342840956e-3 + (0.42819241329736982942e-5 + (0.21916534346907168612e-7 + (0.10771535136565470914e-9 + (0.50595972623692822410e-12 + 0.22573462684444444444e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 7: {
+      T t = 2*y100 - 15;
+      return 0.11433927298290302370e-1 + (0.82372858383196561209e-3 + (0.44160495311765438816e-5 + (0.22798861426211986056e-7 + (0.11291291745879239736e-9 + (0.53386189365816880454e-12 + 0.23944209546666666667e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 8: {
+      T t = 2*y100 - 17;
+      return 0.13099232878814653979e-1 + (0.84167002467906968214e-3 + (0.45555958988457506002e-5 + (0.23723907357214175198e-7 + (0.11839789326602695603e-9 + (0.56346163067550237877e-12 + 0.25403679644444444444e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 9: {
+      T t = 2*y100 - 19;
+      return 0.14800987015587535621e-1 + (0.86018092946345943214e-3 + (0.47008265848816866105e-5 + (0.24694040760197315333e-7 + (0.12418779768752299093e-9 + (0.59486890370320261949e-12 + 0.26957764568888888889e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 10: {
+      T t = 2*y100 - 21;
+      return 0.16540351739394069380e-1 + (0.87928458641241463952e-3 + (0.48520195793001753903e-5 + (0.25711774900881709176e-7 + (0.13030128534230822419e-9 + (0.62820097586874779402e-12 + 0.28612737351111111111e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 11: {
+      T t = 2*y100 - 23;
+      return 0.18318536789842392647e-1 + (0.89900542647891721692e-3 + (0.50094684089553365810e-5 + (0.26779777074218070482e-7 + (0.13675822186304615566e-9 + (0.66358287745352705725e-12 + 0.30375273884444444444e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 12: {
+      T t = 2*y100 - 25;
+      return 0.20136801964214276775e-1 + (0.91936908737673676012e-3 + (0.51734830914104276820e-5 + (0.27900878609710432673e-7 + (0.14357976402809042257e-9 + (0.70114790311043728387e-12 + 0.32252476000000000000e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 13: {
+      T t = 2*y100 - 27;
+      return 0.21996459598282740954e-1 + (0.94040248155366777784e-3 + (0.53443911508041164739e-5 + (0.29078085538049374673e-7 + (0.15078844500329731137e-9 + (0.74103813647499204269e-12 + 0.34251892320000000000e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 14: {
+      T t = 2*y100 - 29;
+      return 0.23898877187226319502e-1 + (0.96213386835900177540e-3 + (0.55225386998049012752e-5 + (0.30314589961047687059e-7 + (0.15840826497296335264e-9 + (0.78340500472414454395e-12 + 0.36381553564444444445e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 15: {
+      T t = 2*y100 - 31;
+      return 0.25845480155298518485e-1 + (0.98459293067820123389e-3 + (0.57082915920051843672e-5 + (0.31613782169164830118e-7 + (0.16646478745529630813e-9 + (0.82840985928785407942e-12 + 0.38649975768888888890e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 16: {
+      T t = 2*y100 - 33;
+      return 0.27837754783474696598e-1 + (0.10078108563256892757e-2 + (0.59020366493792212221e-5 + (0.32979263553246520417e-7 + (0.17498524159268458073e-9 + (0.87622459124842525110e-12 + 0.41066206488888888890e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 17: {
+      T t = 2*y100 - 35;
+      return 0.29877251304899307550e-1 + (0.10318204245057349310e-2 + (0.61041829697162055093e-5 + (0.34414860359542720579e-7 + (0.18399863072934089607e-9 + (0.92703227366365046533e-12 + 0.43639844053333333334e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 18: {
+      T t = 2*y100 - 37;
+      return 0.31965587178596443475e-1 + (0.10566560976716574401e-2 + (0.63151633192414586770e-5 + (0.35924638339521924242e-7 + (0.19353584758781174038e-9 + (0.98102783859889264382e-12 + 0.46381060817777777779e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 19: {
+      T t = 2*y100 - 39;
+      return 0.34104450552588334840e-1 + (0.10823541191350532574e-2 + (0.65354356159553934436e-5 + (0.37512918348533521149e-7 + (0.20362979635817883229e-9 + (0.10384187833037282363e-11 + 0.49300625262222222221e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 20: {
+      T t = 2*y100 - 41;
+      return 0.36295603928292425716e-1 + (0.11089526167995268200e-2 + (0.67654845095518363577e-5 + (0.39184292949913591646e-7 + (0.21431552202133775150e-9 + (0.10994259106646731797e-11 + 0.52409949102222222221e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 21: {
+      T t = 2*y100 - 43;
+      return 0.38540888038840509795e-1 + (0.11364917134175420009e-2 + (0.70058230641246312003e-5 + (0.40943644083718586939e-7 + (0.22563034723692881631e-9 + (0.11642841011361992885e-11 + 0.55721092871111111110e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 22: {
+      T t = 2*y100 - 45;
+      return 0.40842225954785960651e-1 + (0.11650136437945673891e-2 + (0.72569945502343006619e-5 + (0.42796161861855042273e-7 + (0.23761401711005024162e-9 + (0.12332431172381557035e-11 + 0.59246802364444444445e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 23: {
+      T t = 2*y100 - 47;
+      return 0.43201627431540222422e-1 + (0.11945628793917272199e-2 + (0.75195743532849206263e-5 + (0.44747364553960993492e-7 + (0.25030885216472953674e-9 + (0.13065684400300476484e-11 + 0.63000532853333333334e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 24: {
+      T t = 2*y100 - 49;
+      return 0.45621193513810471438e-1 + (0.12251862608067529503e-2 + (0.77941720055551920319e-5 + (0.46803119830954460212e-7 + (0.26375990983978426273e-9 + (0.13845421370977119765e-11 + 0.66996477404444444445e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 25: {
+      T t = 2*y100 - 51;
+      return 0.48103121413299865517e-1 + (0.12569331386432195113e-2 + (0.80814333496367673980e-5 + (0.48969667335682018324e-7 + (0.27801515481905748484e-9 + (0.14674637611609884208e-11 + 0.71249589351111111110e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 26: {
+      T t = 2*y100 - 53;
+      return 0.50649709676983338501e-1 + (0.12898555233099055810e-2 + (0.83820428414568799654e-5 + (0.51253642652551838659e-7 + (0.29312563849675507232e-9 + (0.15556512782814827846e-11 + 0.75775607822222222221e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 27: {
+      T t = 2*y100 - 55;
+      return 0.53263363664388864181e-1 + (0.13240082443256975769e-2 + (0.86967260015007658418e-5 + (0.53662102750396795566e-7 + (0.30914568786634796807e-9 + (0.16494420240828493176e-11 + 0.80591079644444444445e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 28: {
+      T t = 2*y100 - 57;
+      return 0.55946601353500013794e-1 + (0.13594491197408190706e-2 + (0.90262520233016380987e-5 + (0.56202552975056695376e-7 + (0.32613310410503135996e-9 + (0.17491936862246367398e-11 + 0.85713381688888888890e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 29: {
+      T t = 2*y100 - 59;
+      return 0.58702059496154081813e-1 + (0.13962391363223647892e-2 + (0.93714365487312784270e-5 + (0.58882975670265286526e-7 + (0.34414937110591753387e-9 + (0.18552853109751857859e-11 + 0.91160736711111111110e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 30: {
+      T t = 2*y100 - 61;
+      return 0.61532500145144778048e-1 + (0.14344426411912015247e-2 + (0.97331446201016809696e-5 + (0.61711860507347175097e-7 + (0.36325987418295300221e-9 + (0.19681183310134518232e-11 + 0.96952238400000000000e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 31: {
+      T t = 2*y100 - 63;
+      return 0.64440817576653297993e-1 + (0.14741275456383131151e-2 + (0.10112293819576437838e-4 + (0.64698236605933246196e-7 + (0.38353412915303665586e-9 + (0.20881176114385120186e-11 + 0.10310784480000000000e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 32: {
+      T t = 2*y100 - 65;
+      return 0.67430045633130393282e-1 + (0.15153655418916540370e-2 + (0.10509857606888328667e-4 + (0.67851706529363332855e-7 + (0.40504602194811140006e-9 + (0.22157325110542534469e-11 + 0.10964842115555555556e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 33: {
+      T t = 2*y100 - 67;
+      return 0.70503365513338850709e-1 + (0.15582323336495709827e-2 + (0.10926868866865231089e-4 + (0.71182482239613507542e-7 + (0.42787405890153386710e-9 + (0.23514379522274416437e-11 + 0.11659571751111111111e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 34: {
+      T t = 2*y100 - 69;
+      return 0.73664114037944596353e-1 + (0.16028078812438820413e-2 + (0.11364423678778207991e-4 + (0.74701423097423182009e-7 + (0.45210162777476488324e-9 + (0.24957355004088569134e-11 + 0.12397238257777777778e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 35: {
+      T t = 2*y100 - 71;
+      return 0.76915792420819562379e-1 + (0.16491766623447889354e-2 + (0.11823685320041302169e-4 + (0.78420075993781544386e-7 + (0.47781726956916478925e-9 + (0.26491544403815724749e-11 + 0.13180196462222222222e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 36: {
+      T t = 2*y100 - 73;
+      return 0.80262075578094612819e-1 + (0.16974279491709504117e-2 + (0.12305888517309891674e-4 + (0.82350717698979042290e-7 + (0.50511496109857113929e-9 + (0.28122528497626897696e-11 + 0.14010889635555555556e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 37: {
+      T t = 2*y100 - 75;
+      return 0.83706822008980357446e-1 + (0.17476561032212656962e-2 + (0.12812343958540763368e-4 + (0.86506399515036435592e-7 + (0.53409440823869467453e-9 + (0.29856186620887555043e-11 + 0.14891851591111111111e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 38: {
+      T t = 2*y100 - 77;
+      return 0.87254084284461718231e-1 + (0.17999608886001962327e-2 + (0.13344443080089492218e-4 + (0.90900994316429008631e-7 + (0.56486134972616465316e-9 + (0.31698707080033956934e-11 + 0.15825697795555555556e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 39: {
+      T t = 2*y100 - 79;
+      return 0.90908120182172748487e-1 + (0.18544478050657699758e-2 + (0.13903663143426120077e-4 + (0.95549246062549906177e-7 + (0.59752787125242054315e-9 + (0.33656597366099099413e-11 + 0.16815130613333333333e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 40: {
+      T t = 2*y100 - 81;
+      return 0.94673404508075481121e-1 + (0.19112284419887303347e-2 + (0.14491572616545004930e-4 + (0.10046682186333613697e-6 + (0.63221272959791000515e-9 + (0.35736693975589130818e-11 + 0.17862931591111111111e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 41: {
+      T t = 2*y100 - 83;
+      return 0.98554641648004456555e-1 + (0.19704208544725622126e-2 + (0.15109836875625443935e-4 + (0.10567036667675984067e-6 + (0.66904168640019354565e-9 + (0.37946171850824333014e-11 + 0.18971959040000000000e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 42: {
+      T t = 2*y100 - 85;
+      return 0.10255677889470089531e0 + (0.20321499629472857418e-2 + (0.15760224242962179564e-4 + (0.11117756071353507391e-6 + (0.70814785110097658502e-9 + (0.40292553276632563925e-11 + 0.20145143075555555556e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 43: {
+      T t = 2*y100 - 87;
+      return 0.10668502059865093318e0 + (0.20965479776148731610e-2 + (0.16444612377624983565e-4 + (0.11700717962026152749e-6 + (0.74967203250938418991e-9 + (0.42783716186085922176e-11 + 0.21385479360000000000e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 44: {
+      T t = 2*y100 - 89;
+      return 0.11094484319386444474e0 + (0.21637548491908170841e-2 + (0.17164995035719657111e-4 + (0.12317915750735938089e-6 + (0.79376309831499633734e-9 + (0.45427901763106353914e-11 + 0.22696025653333333333e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 45: {
+      T t = 2*y100 - 91;
+      return 0.11534201115268804714e0 + (0.22339187474546420375e-2 + (0.17923489217504226813e-4 + (0.12971465288245997681e-6 + (0.84057834180389073587e-9 + (0.48233721206418027227e-11 + 0.24079890062222222222e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 46: {
+      T t = 2*y100 - 93;
+      return 0.11988259392684094740e0 + (0.23071965691918689601e-2 + (0.18722342718958935446e-4 + (0.13663611754337957520e-6 + (0.89028385488493287005e-9 + (0.51210161569225846701e-11 + 0.25540227111111111111e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 47: {
+      T t = 2*y100 - 95;
+      return 0.12457298393509812907e0 + (0.23837544771809575380e-2 + (0.19563942105711612475e-4 + (0.14396736847739470782e-6 + (0.94305490646459247016e-9 + (0.54366590583134218096e-11 + 0.27080225920000000000e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 48: {
+      T t = 2*y100 - 97;
+      return 0.12941991566142438816e0 + (0.24637684719508859484e-2 + (0.20450821127475879816e-4 + (0.15173366280523906622e-6 + (0.99907632506389027739e-9 + (0.57712760311351625221e-11 + 0.28703099555555555556e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 49: {
+      T t = 2*y100 - 99;
+      return 0.13443048593088696613e0 + (0.25474249981080823877e-2 + (0.21385669591362915223e-4 + (0.15996177579900443030e-6 + (0.10585428844575134013e-8 + (0.61258809536787882989e-11 + 0.30412080142222222222e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 50: {
+      T t = 2*y100 - 101;
+      return 0.13961217543434561353e0 + (0.26349215871051761416e-2 + (0.22371342712572567744e-4 + (0.16868008199296822247e-6 + (0.11216596910444996246e-8 + (0.65015264753090890662e-11 + 0.32210394506666666666e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 51: {
+      T t = 2*y100 - 103;
+      return 0.14497287157673800690e0 + (0.27264675383982439814e-2 + (0.23410870961050950197e-4 + (0.17791863939526376477e-6 + (0.11886425714330958106e-8 + (0.68993039665054288034e-11 + 0.34101266222222222221e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 52: {
+      T t = 2*y100 - 105;
+      return 0.15052089272774618151e0 + (0.28222846410136238008e-2 + (0.24507470422713397006e-4 + (0.18770927679626136909e-6 + (0.12597184587583370712e-8 + (0.73203433049229821618e-11 + 0.36087889048888888890e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 53: {
+      T t = 2*y100 - 107;
+      return 0.15626501395774612325e0 + (0.29226079376196624949e-2 + (0.25664553693768450545e-4 + (0.19808568415654461964e-6 + (0.13351257759815557897e-8 + (0.77658124891046760667e-11 + 0.38173420035555555555e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 54: {
+      T t = 2*y100 - 109;
+      return 0.16221449434620737567e0 + (0.30276865332726475672e-2 + (0.26885741326534564336e-4 + (0.20908350604346384143e-6 + (0.14151148144240728728e-8 + (0.82369170665974313027e-11 + 0.40360957457777777779e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 55: {
+      T t = 2*y100 - 111;
+      return 0.16837910595412130659e0 + (0.31377844510793082301e-2 + (0.28174873844911175026e-4 + (0.22074043807045782387e-6 + (0.14999481055996090039e-8 + (0.87348993661930809254e-11 + 0.42653528977777777779e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 56: {
+      T t = 2*y100 - 113;
+      return 0.17476916455659369953e0 + (0.32531815370903068316e-2 + (0.29536024347344364074e-4 + (0.23309632627767074202e-6 + (0.15899007843582444846e-8 + (0.92610375235427359475e-11 + 0.45054073102222222221e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 57: {
+      T t = 2*y100 - 115;
+      return 0.18139556223643701364e0 + (0.33741744168096996041e-2 + (0.30973511714709500836e-4 + (0.24619326937592290996e-6 + (0.16852609412267750744e-8 + (0.98166442942854895573e-11 + 0.47565418097777777779e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 58: {
+      T t = 2*y100 - 117;
+      return 0.18826980194443664549e0 + (0.35010775057740317997e-2 + (0.32491914440014267480e-4 + (0.26007572375886319028e-6 + (0.17863299617388376116e-8 + (0.10403065638343878679e-10 + 0.50190265831111111110e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 59: {
+      T t = 2*y100 - 119;
+      return 0.19540403413693967350e0 + (0.36342240767211326315e-2 + (0.34096085096200907289e-4 + (0.27479061117017637474e-6 + (0.18934228504790032826e-8 + (0.11021679075323598664e-10 + 0.52931171733333333334e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 60: {
+      T t = 2*y100 - 121;
+      return 0.20281109560651886959e0 + (0.37739673859323597060e-2 + (0.35791165457592409054e-4 + (0.29038742889416172404e-6 + (0.20068685374849001770e-8 + (0.11673891799578381999e-10 + 0.55790523093333333334e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 61: {
+      T t = 2*y100 - 123;
+      return 0.21050455062669334978e0 + (0.39206818613925652425e-2 + (0.37582602289680101704e-4 + (0.30691836231886877385e-6 + (0.21270101645763677824e-8 + (0.12361138551062899455e-10 + 0.58770520160000000000e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 62: {
+      T t = 2*y100 - 125;
+      return 0.21849873453703332479e0 + (0.40747643554689586041e-2 + (0.39476163820986711501e-4 + (0.32443839970139918836e-6 + (0.22542053491518680200e-8 + (0.13084879235290858490e-10 + 0.61873153262222222221e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 63: {
+      T t = 2*y100 - 127;
+      return 0.22680879990043229327e0 + (0.42366354648628516935e-2 + (0.41477956909656896779e-4 + (0.34300544894502810002e-6 + (0.23888264229264067658e-8 + (0.13846596292818514601e-10 + 0.65100183751111111110e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 64: {
+      T t = 2*y100 - 129;
+      return 0.23545076536988703937e0 + (0.44067409206365170888e-2 + (0.43594444916224700881e-4 + (0.36268045617760415178e-6 + (0.25312606430853202748e-8 + (0.14647791812837903061e-10 + 0.68453122631111111110e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 65: {
+      T t = 2*y100 - 131;
+      return 0.24444156740777432838e0 + (0.45855530511605787178e-2 + (0.45832466292683085475e-4 + (0.38352752590033030472e-6 + (0.26819103733055603460e-8 + (0.15489984390884756993e-10 + 0.71933206364444444445e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 66: {
+      T t = 2*y100 - 133;
+      return 0.25379911500634264643e0 + (0.47735723208650032167e-2 + (0.48199253896534185372e-4 + (0.40561404245564732314e-6 + (0.28411932320871165585e-8 + (0.16374705736458320149e-10 + 0.75541379822222222221e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 67: {
+      T t = 2*y100 - 135;
+      return 0.26354234756393613032e0 + (0.49713289477083781266e-2 + (0.50702455036930367504e-4 + (0.42901079254268185722e-6 + (0.30095422058900481753e-8 + (0.17303497025347342498e-10 + 0.79278273368888888890e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 68: {
+      T t = 2*y100 - 137;
+      return 0.27369129607732343398e0 + (0.51793846023052643767e-2 + (0.53350152258326602629e-4 + (0.45379208848865015485e-6 + (0.31874057245814381257e-8 + (0.18277905010245111046e-10 + 0.83144182364444444445e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 69: {
+      T t = 2*y100 - 139;
+      return 0.28426714781640316172e0 + (0.53983341916695141966e-2 + (0.56150884865255810638e-4 + (0.48003589196494734238e-6 + (0.33752476967570796349e-8 + (0.19299477888083469086e-10 + 0.87139049137777777779e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 70: {
+      T t = 2*y100 - 141;
+      return 0.29529231465348519920e0 + (0.56288077305420795663e-2 + (0.59113671189913307427e-4 + (0.50782393781744840482e-6 + (0.35735475025851713168e-8 + (0.20369760937017070382e-10 + 0.91262442613333333334e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 71: {
+      T t = 2*y100 - 143;
+      return 0.30679050522528838613e0 + (0.58714723032745403331e-2 + (0.62248031602197686791e-4 + (0.53724185766200945789e-6 + (0.37827999418960232678e-8 + (0.21490291930444538307e-10 + 0.95513539182222222221e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 72: {
+      T t = 2*y100 - 145;
+      return 0.31878680111173319425e0 + (0.61270341192339103514e-2 + (0.65564012259707640976e-4 + (0.56837930287837738996e-6 + (0.40035151353392378882e-8 + (0.22662596341239294792e-10 + 0.99891109760000000000e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 73: {
+      T t = 2*y100 - 147;
+      return 0.33130773722152622027e0 + (0.63962406646798080903e-2 + (0.69072209592942396666e-4 + (0.60133006661885941812e-6 + (0.42362183765883466691e-8 + (0.23888182347073698382e-10 + 0.10439349811555555556e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 74: {
+      T t = 2*y100 - 149;
+      return 0.34438138658041336523e0 + (0.66798829540414007258e-2 + (0.72783795518603561144e-4 + (0.63619220443228800680e-6 + (0.44814499336514453364e-8 + (0.25168535651285475274e-10 + 0.10901861383111111111e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 75: {
+      T t = 2*y100 - 151;
+      return 0.35803744972380175583e0 + (0.69787978834882685031e-2 + (0.76710543371454822497e-4 + (0.67306815308917386747e-6 + (0.47397647975845228205e-8 + (0.26505114141143050509e-10 + 0.11376390933333333333e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 76: {
+      T t = 2*y100 - 153;
+      return 0.37230734890119724188e0 + (0.72938706896461381003e-2 + (0.80864854542670714092e-4 + (0.71206484718062688779e-6 + (0.50117323769745883805e-8 + (0.27899342394100074165e-10 + 0.11862637614222222222e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 77: {
+      T t = 2*y100 - 155;
+      return 0.38722432730555448223e0 + (0.76260375162549802745e-2 + (0.85259785810004603848e-4 + (0.75329383305171327677e-6 + (0.52979361368388119355e-8 + (0.29352606054164086709e-10 + 0.12360253370666666667e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 78: {
+      T t = 2*y100 - 157;
+      return 0.40282355354616940667e0 + (0.79762880915029728079e-2 + (0.89909077342438246452e-4 + (0.79687137961956194579e-6 + (0.55989731807360403195e-8 + (0.30866246101464869050e-10 + 0.12868841946666666667e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 79: {
+      T t = 2*y100 - 159;
+      return 0.41914223158913787649e0 + (0.83456685186950463538e-2 + (0.94827181359250161335e-4 + (0.84291858561783141014e-6 + (0.59154537751083485684e-8 + (0.32441553034347469291e-10 + 0.13387957943111111111e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 80: {
+      T t = 2*y100 - 161;
+      return 0.43621971639463786896e0 + (0.87352841828289495773e-2 + (0.10002929142066799966e-3 + (0.89156148280219880024e-6 + (0.62480008150788597147e-8 + (0.34079760983458878910e-10 + 0.13917107176888888889e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 81: {
+      T t = 2*y100 - 163;
+      return 0.45409763548534330981e0 + (0.91463027755548240654e-2 + (0.10553137232446167258e-3 + (0.94293113464638623798e-6 + (0.65972492312219959885e-8 + (0.35782041795476563662e-10 + 0.14455745872000000000e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 82: {
+      T t = 2*y100 - 165;
+      return 0.47282001668512331468e0 + (0.95799574408860463394e-2 + (0.11135019058000067469e-3 + (0.99716373005509038080e-6 + (0.69638453369956970347e-8 + (0.37549499088161345850e-10 + 0.15003280712888888889e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 83: {
+      T t = 2*y100 - 167;
+      return 0.49243342227179841649e0 + (0.10037550043909497071e-1 + (0.11750334542845234952e-3 + (0.10544006716188967172e-5 + (0.73484461168242224872e-8 + (0.39383162326435752965e-10 + 0.15559069118222222222e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 84: {
+      T t = 2*y100 - 169;
+      return 0.51298708979209258326e0 + (0.10520454564612427224e-1 + (0.12400930037494996655e-3 + (0.11147886579371265246e-5 + (0.77517184550568711454e-8 + (0.41283980931872622611e-10 + 0.16122419680000000000e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 85: {
+      T t = 2*y100 - 171;
+      return 0.53453307979101369843e0 + (0.11030120618800726938e-1 + (0.13088741519572269581e-3 + (0.11784797595374515432e-5 + (0.81743383063044825400e-8 + (0.43252818449517081051e-10 + 0.16692592640000000000e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 86: {
+      T t = 2*y100 - 173;
+      return 0.55712643071169299478e0 + (0.11568077107929735233e-1 + (0.13815797838036651289e-3 + (0.12456314879260904558e-5 + (0.86169898078969313597e-8 + (0.45290446811539652525e-10 + 0.17268801084444444444e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 87: {
+      T t = 2*y100 - 175;
+      return 0.58082532122519320968e0 + (0.12135935999503877077e-1 + (0.14584223996665838559e-3 + (0.13164068573095710742e-5 + (0.90803643355106020163e-8 + (0.47397540713124619155e-10 + 0.17850211608888888889e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 88: {
+      T t = 2*y100 - 177;
+      return 0.60569124025293375554e0 + (0.12735396239525550361e-1 + (0.15396244472258863344e-3 + (0.13909744385382818253e-5 + (0.95651595032306228245e-8 + (0.49574672127669041550e-10 + 0.18435945564444444444e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 89: {
+      T t = 2*y100 - 179;
+      return 0.63178916494715716894e0 + (0.13368247798287030927e-1 + (0.16254186562762076141e-3 + (0.14695084048334056083e-5 + (0.10072078109604152350e-7 + (0.51822304995680707483e-10 + 0.19025081422222222222e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 90: {
+      T t = 2*y100 - 181;
+      return 0.65918774689725319200e0 + (0.14036375850601992063e-1 + (0.17160483760259706354e-3 + (0.15521885688723188371e-5 + (0.10601827031535280590e-7 + (0.54140790105837520499e-10 + 0.19616655146666666667e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 91: {
+      T t = 2*y100 - 183;
+      return 0.68795950683174433822e0 + (0.14741765091365869084e-1 + (0.18117679143520433835e-3 + (0.16392004108230585213e-5 + (0.11155116068018043001e-7 + (0.56530360194925690374e-10 + 0.20209663662222222222e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 92: {
+      T t = 2*y100 - 185;
+      return 0.71818103808729967036e0 + (0.15486504187117112279e-1 + (0.19128428784550923217e-3 + (0.17307350969359975848e-5 + (0.11732656736113607751e-7 + (0.58991125287563833603e-10 + 0.20803065333333333333e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 93: {
+      T t = 2*y100 - 187;
+      return 0.74993321911726254661e0 + (0.16272790364044783382e-1 + (0.20195505163377912645e-3 + (0.18269894883203346953e-5 + (0.12335161021630225535e-7 + (0.61523068312169087227e-10 + 0.21395783431111111111e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 94: {
+      T t = 2*y100 - 189;
+      return 0.78330143531283492729e0 + (0.17102934132652429240e-1 + (0.21321800585063327041e-3 + (0.19281661395543913713e-5 + (0.12963340087354341574e-7 + (0.64126040998066348872e-10 + 0.21986708942222222222e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 95: {
+      T t = 2*y100 - 191;
+      return 0.81837581041023811832e0 + (0.17979364149044223802e-1 + (0.22510330592753129006e-3 + (0.20344732868018175389e-5 + (0.13617902941839949718e-7 + (0.66799760083972474642e-10 + 0.22574701262222222222e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 96: {
+      T t = 2*y100 - 193;
+      return 0.85525144775685126237e0 + (0.18904632212547561026e-1 + (0.23764237370371255638e-3 + (0.21461248251306387979e-5 + (0.14299555071870523786e-7 + (0.69543803864694171934e-10 + 0.23158593688888888889e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 97: {
+      T t = 2*y100 - 195;
+      return 0.89402868170849933734e0 + (0.19881418399127202569e-1 + (0.25086793128395995798e-3 + (0.22633402747585233180e-5 + (0.15008997042116532283e-7 + (0.72357609075043941261e-10 + 0.23737194737777777778e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 98: {
+      T t = 2*y100 - 197;
+      return 0.93481333942870796363e0 + (0.20912536329780368893e-1 + (0.26481403465998477969e-3 + (0.23863447359754921676e-5 + (0.15746923065472184451e-7 + (0.75240468141720143653e-10 + 0.24309291271111111111e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 99: {
+      T t = 2*y100 - 199;
+      return 0.97771701335885035464e0 + (0.22000938572830479551e-1 + (0.27951610702682383001e-3 + (0.25153688325245314530e-5 + (0.16514019547822821453e-7 + (0.78191526829368231251e-10 + 0.24873652355555555556e-12 * t) * t) * t) * t) * t) * t;
+      }
+    }
+
+    // we only get here if y = 1, i.e. |x| < 4*eps, in which case
+    // erfcx is within 1e-15 of 1.
+    return 1.;
+  }
+
+  template <typename T>
+  T erfcx(T x) {
+    // Short-circuits on NaN (returning NaN)
+    if (x != x) {
+      return x;
+    }
+
+    if (x >= 0) {
+      if (x > T{50}) { // continued-fraction expansion is faster
+        const T ispi = 0.56418958354775628694807945156; // 1 / sqrt(pi)
+
+        if (x > T{5e7}) { // 1-term expansion, important to avoid overflow
+          return ispi / x;
+        }
+
+        /* 5-term expansion (rely on compiler for CSE), simplified from:
+                  ispi / (x+0.5/(x+1/(x+1.5/(x+2/x))))  */
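+        /* Editorial check of that simplification, working from the inside out:
+             x + 2/x           = (x^2 + 2) / x
+             x + 1.5/(...)     = x (x^2 + 3.5) / (x^2 + 2)
+             x + 1/(...)       = (x^4 + 4.5 x^2 + 2) / (x (x^2 + 3.5))
+             x + 0.5/(...)     = x (x^4 + 5 x^2 + 3.75) / (x^4 + 4.5 x^2 + 2)
+           so ispi / (...) = ispi (x^4 + 4.5 x^2 + 2) / (x (x^4 + 5 x^2 + 3.75)),
+           which is the expression returned on the next line. */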
+        return ispi * ((x*x) * (x*x+T{4.5}) + T{2}) / (x * ((x*x) * (x*x+T{5}) + T{3.75}));
+      }
+
+      // 0 <= x <= 50
+      return erfcx_y100(T{400} / (T{4} + x));
+    }
+
+    // x < 0
+    if (x < T{-26.7}) {
+      return POS_INFINITY;
+    } else if (x < T{-6.1}) {
+      return T{2} * exp(x * x);
+    }
+
+    // x < 0 and x >= -6.1
+    return T{2} * exp(x * x) - erfcx_y100(T{400} / (T{4} - x));
+  }
+); // erfcx_string
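+
+// Illustrative host-side reference (editorial addition, not part of the kernel string):
+// by definition erfcx(x) = exp(x^2) * erfc(x), so for moderate arguments the routine
+// above can be cross-checked against the C++ standard library. The function name is
+// hypothetical; assumes <cmath> is available.
+static double erfcx_reference(double x) {
+  // Safe only for moderate |x| (roughly |x| < 25), where exp(x*x) does not overflow and
+  // erfc(x) does not underflow; the scaled kernel above stays finite well beyond that.
+  return std::exp(x * x) * std::erfc(x);
+}
+// Example: erfcx_reference(0.0) is exactly 1, and erfcx_reference(1.0) is about 0.4276.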
+
+const auto airy_ai_string = jiterator_stringify(
+    template<typename T>
+    T airy_ai_forward(T x) {
+        static const T AN[] = {
+                +3.46538101525629032477e-01,
+                +1.20075952739645805542e+01,
+                +7.62796053615234516538e+01,
+                +1.68089224934630576269e+02,
+                +1.59756391350164413639e+02,
+                +7.05360906840444183113e+01,
+                +1.40264691163389668864e+01,
+                +9.99999999999999995305e-01,
+        };
+
+        static const T AD[] = {
+                +5.67594532638770212846e-01,
+                +1.47562562584847203173e+01,
+                +8.45138970141474626562e+01,
+                +1.77318088145400459522e+02,
+                +1.64234692871529701831e+02,
+                +7.14778400825575695274e+01,
+                +1.40959135607834029598e+01,
+                +1.00000000000000000470e+00,
+        };
+
+        static const T AFN[] = {
+                -1.31696323418331795333e-01,
+                -6.26456544431912369773e-01,
+                -6.93158036036933542233e-01,
+                -2.79779981545119124951e-01,
+                -4.91900132609500318020e-02,
+                -4.06265923594885404393e-03,
+                -1.59276496239262096340e-04,
+                -2.77649108155232920844e-06,
+                -1.67787698489114633780e-08,
+        };
+
+        static const T AFD[] = {
+                +1.33560420706553243746e+01,
+                +3.26825032795224613948e+01,
+                +2.67367040941499554804e+01,
+                +9.18707402907259625840e+00,
+                +1.47529146771666414581e+00,
+                +1.15687173795188044134e-01,
+                +4.40291641615211203805e-03,
+                +7.54720348287414296618e-05,
+                +4.51850092970580378464e-07,
+        };
+
+        static const T AGN[] = {
+                +1.97339932091685679179e-02,
+                +3.91103029615688277255e-01,
+                +1.06579897599595591108e+00,
+                +9.39169229816650230044e-01,
+                +3.51465656105547619242e-01,
+                +6.33888919628925490927e-02,
+                +5.85804113048388458567e-03,
+                +2.82851600836737019778e-04,
+                +6.98793669997260967291e-06,
+                +8.11789239554389293311e-08,
+                +3.41551784765923618484e-10,
+        };
+
+        static const T AGD[] = {
+                +9.30892908077441974853e+00,
+                +1.98352928718312140417e+01,
+                +1.55646628932864612953e+01,
+                +5.47686069422975497931e+00,
+                +9.54293611618961883998e-01,
+                +8.64580826352392193095e-02,
+                +4.12656523824222607191e-03,
+                +1.01259085116509135510e-04,
+                +1.17166733214413521882e-06,
+                +4.91834570062930015649e-09,
+        };
+
+        int domain_flag = 0;
+
+        T ai;
+
+        if (isinf(x)) {
+            return NAN;
+        }
+
+        if (x > T(103.892)) {
+            return T(0.0);
+        }
+
+        T f;
+        T g;
+        T k;
+
+        if (x < T(-2.09)) {
+            T z = T(1.0) / (T(-2.0) * x * sqrt(-x) / T(3.0));
+
+            T afn = 0.0;
+
+            for (uint8_t index = 0; index <= 8; index++) {
+                afn = afn * (z * z) + AFN[index];
+            }
+
+            T afd = 0.0;
+
+            for (uint8_t index = 0; index <= 8; index++) {
+                afd = afd * (z * z) + AFD[index];
+            }
+
+            T agn = 0.0;
+
+            for (uint8_t index = 0; index <= 10; index++) {
+                agn = agn * (z * z) + AGN[index];
+            }
+
+            T agd = 0.0;
+
+            for (uint8_t index = 0; index <= 9; index++) {
+                agd = agd * (z * z) + AGD[index];
+            }
+
+            T t = T(-2.0) * x * sqrt(-x) / T(3.0) + T(0.25) * T(3.14159265358979323846);
+
+            return T(5.64189583547756286948e-01) / sqrt(sqrt(-x)) * (sin(t) * (T(1.0) + z * z * afn / afd) - cos(t) * (z * agn / agd));
+        }
+
+        if (x >= T(2.09)) {
+            domain_flag = 5;
+
+            T zeta = T(2.0) * x * sqrt(x) / T(3.0);
+
+            T an = 0.0;
+
+            for (uint8_t index = 0; index <= 7; index++) {
+                an = an * (T(1.0) / zeta) + AN[index];
+            }
+
+            T ad = 0.0;
+
+            for (uint8_t index = 0; index <= 7; index++) {
+                ad = ad * (T(1.0) / zeta) + AD[index];
+            }
+
+            ai = T(5.64189583547756286948e-01) * (an / ad) / (T(2.0) * sqrt(sqrt(x)) * exp(zeta));
+
+            if (x > T(8.3203353)) {
+                return ai;
+            }
+        }
+
+        f = 1.0;
+        g = x;
+        k = 1.0;
+
+        T m = 1.0;
+        T n = x;
+        T t = 1.0;
+        T z = x * x * x;
+
+        while (t > T(1.11022302462515654042e-16)) {
+            m *= z;
+            k += T(1.0);
+            m /= k;
+            n *= z;
+            k += T(1.0);
+            n /= k;
+            m /= k;
+            f += m;
+            k += T(1.0);
+            n /= k;
+            g += n;
+
+            t = abs(m / f);
+        }
+
+        if ((domain_flag & 1) == 0) {
+            return T(0.355028053887817239260) * f - T(0.258819403792806798405) * g;
+        }
+
+        return ai;
+    } // T airy_ai(T x)
+); // airy_ai_string
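+
+// Editorial note (not part of the kernel string): in the Maclaurin branch above the
+// result is assembled as c1 * f - c2 * g, where f and g are the two standard Airy power
+// series and the literals are
+//   c1 =  Ai(0)  = 1 / (3^(2/3) * Gamma(2/3)) ~= 0.355028053887817239
+//   c2 = -Ai'(0) = 1 / (3^(1/3) * Gamma(1/3)) ~= 0.258819403792806798.
+// A host-side check of those constants (hypothetical helper, assumes <cmath>):
+static void airy_ai_constants_check() {
+  const double c1 = 1.0 / (std::pow(3.0, 2.0 / 3.0) * std::tgamma(2.0 / 3.0));
+  const double c2 = 1.0 / (std::pow(3.0, 1.0 / 3.0) * std::tgamma(1.0 / 3.0));
+  (void)c1;  // ~0.3550280539, the coefficient of f in the return statement above
+  (void)c2;  // ~0.2588194038, the coefficient of g in the return statement above
+}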
+
+const auto bessel_j0_string = jiterator_stringify(
+    template<typename T>
+    T bessel_j0_forward(T x) {
+        static const T PP[] = {
+                +7.96936729297347051624e-04,
+                +8.28352392107440799803e-02,
+                +1.23953371646414299388e+00,
+                +5.44725003058768775090e+00,
+                +8.74716500199817011941e+00,
+                +5.30324038235394892183e+00,
+                +9.99999999999999997821e-01,
+        };
+
+        static const T PQ[] = {
+                +9.24408810558863637013e-04,
+                +8.56288474354474431428e-02,
+                +1.25352743901058953537e+00,
+                +5.47097740330417105182e+00,
+                +8.76190883237069594232e+00,
+                +5.30605288235394617618e+00,
+                +1.00000000000000000218e+00,
+        };
+
+        static const T QP[] = {
+                -1.13663838898469149931e-02,
+                -1.28252718670509318512e+00,
+                -1.95539544257735972385e+01,
+                -9.32060152123768231369e+01,
+                -1.77681167980488050595e+02,
+                -1.47077505154951170175e+02,
+                -5.14105326766599330220e+01,
+                -6.05014350600728481186e+00,
+        };
+
+        static const T QQ[] = {
+                +6.43178256118178023184e+01,
+                +8.56430025976980587198e+02,
+                +3.88240183605401609683e+03,
+                +7.24046774195652478189e+03,
+                +5.93072701187316984827e+03,
+                +2.06209331660327847417e+03,
+                +2.42005740240291393179e+02,
+        };
+
+        static const T RP[] = {
+                -4.79443220978201773821e+09,
+                +1.95617491946556577543e+12,
+                -2.49248344360967716204e+14,
+                +9.70862251047306323952e+15,
+        };
+
+        static const T RQ[] = {
+                +4.99563147152651017219e+02,
+                +1.73785401676374683123e+05,
+                +4.84409658339962045305e+07,
+                +1.11855537045356834862e+10,
+                +2.11277520115489217587e+12,
+                +3.10518229857422583814e+14,
+                +3.18121955943204943306e+16,
+                +1.71086294081043136091e+18,
+        };
+
+        if (x < T(0)) {
+            x = -x;
+        }
+
+        if (x <= T(5.0)) {
+            if (x < T(0.00001)) {
+                return T(1.0) - x * x / T(4.0);
+            }
+
+            T rp = 0.0;
+
+            for (uint8_t index = 0; index <= 3; index++) {
+                rp = rp * (x * x) + RP[index];
+            }
+
+            T rq = 0.0;
+
+            for (uint8_t index = 0; index <= 7; index++) {
+                rq = rq * (x * x) + RQ[index];
+            }
+
+            return (x * x - T(5.78318596294678452118e+00)) * (x * x - T(3.04712623436620863991e+01)) * rp / rq;
+        }
+
+        T pp = 0.0;
+
+        for (uint8_t index = 0; index <= 6; index++) {
+            pp = pp * (T(25.0) / (x * x)) + PP[index];
+        }
+
+        T pq = 0.0;
+
+        for (uint8_t index = 0; index <= 6; index++) {
+            pq = pq * (T(25.0) / (x * x)) + PQ[index];
+        }
+
+        T qp = 0.0;
+
+        for (uint8_t index = 0; index <= 7; index++) {
+            qp = qp * (T(25.0) / (x * x)) + QP[index];
+        }
+
+        T qq = 0.0;
+
+        for (uint8_t index = 0; index <= 6; index++) {
+            qq = qq * (T(25.0) / (x * x)) + QQ[index];
+        }
+
+        return (pp / pq * cos(x - T(0.785398163397448309615660845819875721)) - T(5.0) / x * (qp / qq) * sin(x - T(0.785398163397448309615660845819875721))) * T(0.797884560802865355879892119868763737) / sqrt(x);
+    } // bessel_j0_forward(T x)
+); // bessel_j0_string
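+
+// Editorial note (not part of the kernel string): the large-argument branch above is the
+// classic Hankel asymptotic form
+//   J0(x) ~= sqrt(2/(pi*x)) * [ (pp/pq) * cos(x - pi/4) - (5/x) * (qp/qq) * sin(x - pi/4) ],
+// where pp/pq and qp/qq are rational approximations evaluated at 25/x^2. That is why the
+// literals 0.797884560802865... (= sqrt(2/pi)) and 0.785398163397448... (= pi/4) appear in
+// the return statement; bessel_y0_forward below reuses the same polynomials with the sin
+// and cos terms swapped. A host-side check of the two constants (hypothetical helper,
+// assumes <cmath>):
+static void bessel_j0_asymptotic_constants() {
+  const double pi = std::acos(-1.0);
+  const double sqrt_two_over_pi = std::sqrt(2.0 / pi);  // ~0.79788456080286536
+  const double quarter_pi = pi / 4.0;                    // ~0.78539816339744831
+  (void)sqrt_two_over_pi;
+  (void)quarter_pi;
+}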
+
+const auto bessel_y0_string = bessel_j0_string + jiterator_stringify(
+    template<typename T>
+    T bessel_y0_forward(T x) {
+        static const T PP[] = {
+                +7.96936729297347051624e-04,
+                +8.28352392107440799803e-02,
+                +1.23953371646414299388e+00,
+                +5.44725003058768775090e+00,
+                +8.74716500199817011941e+00,
+                +5.30324038235394892183e+00,
+                +9.99999999999999997821e-01,
+        };
+
+        static const T PQ[] = {
+                +9.24408810558863637013e-04,
+                +8.56288474354474431428e-02,
+                +1.25352743901058953537e+00,
+                +5.47097740330417105182e+00,
+                +8.76190883237069594232e+00,
+                +5.30605288235394617618e+00,
+                +1.00000000000000000218e+00,
+        };
+
+        static const T QP[] = {
+                -1.13663838898469149931e-02,
+                -1.28252718670509318512e+00,
+                -1.95539544257735972385e+01,
+                -9.32060152123768231369e+01,
+                -1.77681167980488050595e+02,
+                -1.47077505154951170175e+02,
+                -5.14105326766599330220e+01,
+                -6.05014350600728481186e+00,
+        };
+
+        static const T QQ[] = {
+                +6.43178256118178023184e+01,
+                +8.56430025976980587198e+02,
+                +3.88240183605401609683e+03,
+                +7.24046774195652478189e+03,
+                +5.93072701187316984827e+03,
+                +2.06209331660327847417e+03,
+                +2.42005740240291393179e+02,
+        };
+
+        static const T YP[] = {
+                +1.55924367855235737965e+04,
+                -1.46639295903971606143e+07,
+                +5.43526477051876500413e+09,
+                -9.82136065717911466409e+11,
+                +8.75906394395366999549e+13,
+                -3.46628303384729719441e+15,
+                +4.42733268572569800351e+16,
+                -1.84950800436986690637e+16,
+        };
+
+        static const T YQ[] = {
+                +1.04128353664259848412e+03,
+                +6.26107330137134956842e+05,
+                +2.68919633393814121987e+08,
+                +8.64002487103935000337e+10,
+                +2.02979612750105546709e+13,
+                +3.17157752842975028269e+15,
+                +2.50596256172653059228e+17,
+        };
+
+        if (x <= T(5.0)) {
+            if (x == T(0.0)) {
+                return NEG_INFINITY;
+            }
+
+            if (x < T(0.0)) {
+                return NAN;
+            }
+
+            T yp = 0.0;
+
+            for (uint8_t index = 0; index <= 7; index++) {
+                yp = yp * (x * x) + YP[index];
+            }
+
+            T yq = 0.0;
+
+            for (uint8_t index = 0; index <= 6; index++) {
+                yq = yq * (x * x) + YQ[index];
+            }
+
+            return yp / yq + (T(0.636619772367581343075535053490057448) * log(x) * bessel_j0_forward(x));
+        }
+
+        T pp = 0.0;
+
+        for (uint8_t index = 0; index <= 6; index++) {
+            pp = pp * (T(25.0) / (x * x)) + PP[index];
+        }
+
+        T pq = 0.0;
+
+        for (uint8_t index = 0; index <= 6; index++) {
+            pq = pq * (T(25.0) / (x * x)) + PQ[index];
+        }
+
+        T qp = 0.0;
+
+        for (uint8_t index = 0; index <= 7; index++) {
+            qp = qp * (T(25.0) / (x * x)) + QP[index];
+        }
+
+        T qq = 0.0;
+
+        for (uint8_t index = 0; index <= 6; index++) {
+            qq = qq * (T(25.0) / (x * x)) + QQ[index];
+        }
+
+        return (pp / pq * sin(x - T(0.785398163397448309615660845819875721)) + T(5.0) / x * (qp / qq) * cos(x - T(0.785398163397448309615660845819875721))) * T(0.797884560802865355879892119868763737) / sqrt(x);
+    } // bessel_y0_forward(T x)
+); // bessel_y0_string
+
+const auto bessel_j1_string = jiterator_stringify(
+    template<typename T>
+    T bessel_j1_forward(T x) {
+        static const T PP[] = {
+                +7.62125616208173112003e-04,
+                +7.31397056940917570436e-02,
+                +1.12719608129684925192e+00,
+                +5.11207951146807644818e+00,
+                +8.42404590141772420927e+00,
+                +5.21451598682361504063e+00,
+                +1.00000000000000000254e+00,
+        };
+
+        static const T PQ[] = {
+                +5.71323128072548699714e-04,
+                +6.88455908754495404082e-02,
+                +1.10514232634061696926e+00,
+                +5.07386386128601488557e+00,
+                +8.39985554327604159757e+00,
+                +5.20982848682361821619e+00,
+                +9.99999999999999997461e-01,
+        };
+
+        static const T QP[] = {
+                +5.10862594750176621635e-02,
+                +4.98213872951233449420e+00,
+                +7.58238284132545283818e+01,
+                +3.66779609360150777800e+02,
+                +7.10856304998926107277e+02,
+                +5.97489612400613639965e+02,
+                +2.11688757100572135698e+02,
+                +2.52070205858023719784e+01,
+        };
+
+        static const T QQ[] = {
+                +7.42373277035675149943e+01,
+                +1.05644886038262816351e+03,
+                +4.98641058337653607651e+03,
+                +9.56231892404756170795e+03,
+                +7.99704160447350683650e+03,
+                +2.82619278517639096600e+03,
+                +3.36093607810698293419e+02,
+        };
+
+        static const T RP[] = {
+                -8.99971225705559398224e+08,
+                +4.52228297998194034323e+11,
+                -7.27494245221818276015e+13,
+                +3.68295732863852883286e+15,
+        };
+
+        static const T RQ[] = {
+                +6.20836478118054335476e+02,
+                +2.56987256757748830383e+05,
+                +8.35146791431949253037e+07,
+                +2.21511595479792499675e+10,
+                +4.74914122079991414898e+12,
+                +7.84369607876235854894e+14,
+                +8.95222336184627338078e+16,
+                +5.32278620332680085395e+18,
+        };
+
+        if (x < T(0.0)) {
+            return -bessel_j1_forward(-x);
+        }
+
+        if (x <= T(5.0)) {
+            T rp = 0.0;
+
+            for (uint8_t index = 0; index <= 3; index++) {
+                rp = rp * (x * x) + RP[index];
+            }
+
+            T rq = 0.0;
+
+            for (uint8_t index = 0; index <= 7; index++) {
+                rq = rq * (x * x) + RQ[index];
+            }
+
+            return rp / rq * x * (x * x - T(1.46819706421238932572e+01)) * (x * x - T(4.92184563216946036703e+01));
+        }
+
+        T pp = 0.0;
+
+        for (uint8_t index = 0; index <= 6; index++) {
+            pp = pp * (T(5.0) / x * (T(5.0) / x)) + PP[index];
+        }
+
+        T pq = 0.0;
+
+        for (uint8_t index = 0; index <= 6; index++) {
+            pq = pq * (T(5.0) / x * (T(5.0) / x)) + PQ[index];
+        }
+
+        T qp = 0.0;
+
+        for (uint8_t index = 0; index <= 7; index++) {
+            qp = qp * (T(5.0) / x * (T(5.0) / x)) + QP[index];
+        }
+
+        T qq = 0.0;
+
+        for (uint8_t index = 0; index <= 6; index++) {
+            qq = qq * (T(5.0) / x * (T(5.0) / x)) + QQ[index];
+        }
+
+        return (pp / pq * cos(x - T(2.356194490192344928846982537459627163)) - T(5.0) / x * (qp / qq) * sin(x - T(2.356194490192344928846982537459627163))) * T(0.797884560802865355879892119868763737) / sqrt(x);
+    } // bessel_j1_forward(T x)
+); // bessel_j1_string
+
+const auto bessel_y1_string = bessel_j1_string + jiterator_stringify(
+    template<typename T>
+    T bessel_y1_forward(T x) {
+        static const T PP[] = {
+                +7.62125616208173112003e-04,
+                +7.31397056940917570436e-02,
+                +1.12719608129684925192e+00,
+                +5.11207951146807644818e+00,
+                +8.42404590141772420927e+00,
+                +5.21451598682361504063e+00,
+                +1.00000000000000000254e+00,
+        };
+
+        static const T PQ[] = {
+                +5.71323128072548699714e-04,
+                +6.88455908754495404082e-02,
+                +1.10514232634061696926e+00,
+                +5.07386386128601488557e+00,
+                +8.39985554327604159757e+00,
+                +5.20982848682361821619e+00,
+                +9.99999999999999997461e-01,
+        };
+
+        static const T QP[] = {
+                +5.10862594750176621635e-02,
+                +4.98213872951233449420e+00,
+                +7.58238284132545283818e+01,
+                +3.66779609360150777800e+02,
+                +7.10856304998926107277e+02,
+                +5.97489612400613639965e+02,
+                +2.11688757100572135698e+02,
+                +2.52070205858023719784e+01,
+        };
+
+        static const T QQ[] = {
+                +7.42373277035675149943e+01,
+                +1.05644886038262816351e+03,
+                +4.98641058337653607651e+03,
+                +9.56231892404756170795e+03,
+                +7.99704160447350683650e+03,
+                +2.82619278517639096600e+03,
+                +3.36093607810698293419e+02,
+        };
+
+        static const T YP[] = {
+                +1.26320474790178026440e+09,
+                -6.47355876379160291031e+11,
+                +1.14509511541823727583e+14,
+                -8.12770255501325109621e+15,
+                +2.02439475713594898196e+17,
+                -7.78877196265950026825e+17,
+        };
+
+        static const T YQ[] = {
+                +5.94301592346128195359e+02,
+                +2.35564092943068577943e+05,
+                +7.34811944459721705660e+07,
+                +1.87601316108706159478e+10,
+                +3.88231277496238566008e+12,
+                +6.20557727146953693363e+14,
+                +6.87141087355300489866e+16,
+                +3.97270608116560655612e+18,
+        };
+
+        if (x <= T(5.0)) {
+            if (x == T(0.0)) {
+                return NEG_INFINITY;
+            }
+
+            if (x <= T(0.0)) {
+                return NAN;
+            }
+
+            T yp = 0.0;
+
+            for (uint8_t index = 0; index <= 5; index++) {
+                yp = yp * (x * x) + YP[index];
+            }
+
+            T yq = 0.0;
+
+            for (uint8_t index = 0; index <= 7; index++) {
+                yq = yq * (x * x) + YQ[index];
+            }
+
+            return x * (yp / yq) + (T(0.636619772367581343075535053490057448) * (bessel_j1_forward(x) * log(x) - T(1.0) / x));
+        }
+
+        T pp = 0.0;
+
+        for (uint8_t index = 0; index <= 6; index++) {
+            pp = pp * (T(5.0) / x * (T(5.0) / x)) + PP[index];
+        }
+
+        T pq = 0.0;
+
+        for (uint8_t index = 0; index <= 6; index++) {
+            pq = pq * (T(5.0) / x * (T(5.0) / x)) + PQ[index];
+        }
+
+        T qp = 0.0;
+
+        for (uint8_t index = 0; index <= 7; index++) {
+            qp = qp * (T(5.0) / x * (T(5.0) / x)) + QP[index];
+        }
+
+        T qq = 0.0;
+
+        for (uint8_t index = 0; index <= 6; index++) {
+            qq = qq * (T(5.0) / x * (T(5.0) / x)) + QQ[index];
+        }
+
+        return (pp / pq * sin(x - T(2.356194490192344928846982537459627163)) + T(5.0) / x * (qp / qq) * cos(x - T(2.356194490192344928846982537459627163))) * T(0.797884560802865355879892119868763737) / sqrt(x);
+    } // bessel_y1_forward(T x)
+); // bessel_y1_string
+
+const auto chebyshev_polynomial_t_string = jiterator_stringify(
+    template<typename T>
+    T chebyshev_polynomial_t_forward(T x, int64_t n) {
+        if (n < 0) {
+            return T(0.0);
+        }
+
+        if (abs(x) == T(1.0)) {
+            if (x > T(0.0) || n % 2 == 0) {
+                return T(1.0);
+            }
+
+            return T(-1.0);
+        }
+
+        if ((n > 6) && (abs(x) < T(1.0))) {
+            return cos(n * acos(x));
+        }
+
+        if (n == 0) {
+            return T(1.0);
+        }
+
+        if (n == 1) {
+            return x;
+        }
+
+        T p = T(1.0);
+        T q = x;
+        T r;
+
+        for (int64_t k = 2; k <= n; k++) {
+            r = (x + x) * q - p;
+            p = q;
+            q = r;
+        }
+
+        return r;
+    } // chebyshev_polynomial_t_forward(T x, int64_t n)
+
+    template<typename T>
+    T chebyshev_polynomial_t_forward(T x, T n) {
+        return chebyshev_polynomial_t_forward(x, static_cast<int64_t>(n));
+    } // chebyshev_polynomial_t_forward(T x, T n)
+); // chebyshev_polynomial_t_string
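+
+// Illustrative host-side counterpart (editorial addition, not part of the kernel string):
+// the loop above is the standard three-term recurrence T_{k+1}(x) = 2*x*T_k(x) - T_{k-1}(x),
+// and for |x| < 1 it agrees with the closed form T_n(x) = cos(n * acos(x)) used on the
+// n > 6 fast path. The U/V/W variants that follow reuse the same recurrence with different
+// seed values q (and different closed forms on their fast paths). The helper name is
+// hypothetical; assumes <cmath> and <cstdint> are available.
+static double chebyshev_t_host(double x, int64_t n) {
+  if (n < 0) {
+    return 0.0;  // mirrors the n < 0 convention of the device helper above
+  }
+  if (n == 0) {
+    return 1.0;
+  }
+  if (n == 1) {
+    return x;
+  }
+  double p = 1.0;  // T_{k-2}
+  double q = x;    // T_{k-1}
+  double r = 0.0;
+  for (int64_t k = 2; k <= n; ++k) {
+    r = 2.0 * x * q - p;  // T_k = 2*x*T_{k-1} - T_{k-2}
+    p = q;
+    q = r;
+  }
+  return r;  // e.g. chebyshev_t_host(0.5, 3) == cos(3 * acos(0.5)) == -1.0
+}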
+
+const auto chebyshev_polynomial_u_string = jiterator_stringify(
+    template<typename T>
+    T chebyshev_polynomial_u_forward(T x, int64_t n) {
+        if (n < 0) {
+            return T(0.0);
+        }
+
+        if (abs(x) == T(1.0)) {
+            if (x > T(0.0) || n % 2 == 0) {
+                return n + 1;
+            }
+
+            return -(n + 1);
+        }
+
+        if ((n > 8) && (abs(x) < T(1.0))) {
+            if (sin(acos(x)) != T(0.0)) {
+                return sin((n + 1) * acos(x)) / sin(acos(x));
+            }
+
+            return (n + 1) * cos((n + 1) * acos(x)) / x;
+        }
+
+        if (n == 0) {
+            return T(1.0);
+        }
+
+        if (n == 1) {
+            return x + x;
+        }
+
+        T p = T(1.0);
+        T q = x + x;
+        T r;
+
+        for (int64_t k = 2; k <= n; k++) {
+            r = (x + x) * q - p;
+            p = q;
+            q = r;
+        }
+
+        return r;
+    } // chebyshev_polynomial_u_forward(T x, int64_t n)
+
+    template<typename T>
+    T chebyshev_polynomial_u_forward(T x, T n) {
+        return chebyshev_polynomial_u_forward(x, static_cast<int64_t>(n));
+    } // chebyshev_polynomial_u_forward(T x, T n)
+); // chebyshev_polynomial_u_string
+
+const auto chebyshev_polynomial_v_string = jiterator_stringify(
+    template<typename T>
+    T chebyshev_polynomial_v_forward(T x, int64_t n) {
+        if (n < 0) {
+            return T(0.0);
+        }
+
+        if (abs(x) == T(1.0)) {
+            if (x > T(0.0)) {
+                return T(1.0);
+            }
+
+            if (n % 2 == 0) {
+                return n + n + 1;
+            }
+
+            return -(n + n + 1);
+        }
+
+        if ((n > 8) && (abs(x) < T(1.0))) {
+            if (sin(acos(x) / T(2.0)) != T(1.0)) {
+                return cos((n + T(0.5)) * acos(x)) / cos(acos(x) / T(2.0));
+            }
+
+            if (n % 2 == 0) {
+                return n + n + 1;
+            }
+
+            return -(n + n + 1);
+        }
+
+        if (n == 0) {
+            return T(1.0);
+        }
+
+        if (n == 1) {
+            return x + x - T(1.0);
+        }
+
+        T p = T(1.0);
+        T q = x + x - T(1.0);
+        T r;
+
+        for (int64_t k = 2; k <= n; k++) {
+            r = (x + x) * q - p;
+            p = q;
+            q = r;
+        }
+
+        return r;
+    } // chebyshev_polynomial_v_forward(T x, int64_t n)
+
+    template<typename T>
+    T chebyshev_polynomial_v_forward(T x, T n) {
+        return chebyshev_polynomial_v_forward(x, static_cast<int64_t>(n));
+    } // chebyshev_polynomial_v_forward(T x, T n)
+); // chebyshev_polynomial_v_string
+
+const auto chebyshev_polynomial_w_string = jiterator_stringify(
+    template<typename T>
+    T chebyshev_polynomial_w_forward(T x, int64_t n) {
+        if (n < 0) {
+            return T(0.0);
+        }
+
+        if (abs(x) == T(1.0)) {
+            if (x > T(0.0)) {
+                return n + n + 1;
+            }
+
+            if (n % 2 == 0) {
+                return T(1.0);
+            }
+
+            return T(-1.0);
+        }
+
+        if ((n > 8) && (abs(x) < T(1.0))) {
+            if (cos(acos(x) / T(2.0)) != T(1.0)) {
+                return sin((n + T(0.5)) * acos(x)) / sin(acos(x) / T(2.0));
+            }
+
+            if (x > T(0.0)) {
+                return n + n + 1;
+            }
+
+            if (n % 2 == 0) {
+                return T(1.0);
+            }
+
+            return T(-1.0);
+        }
+
+        if (n == 0) {
+            return T(1.0);
+        }
+
+        if (n == 1) {
+            return x + x + T(1.0);
+        }
+
+        T p = T(1.0);
+        T q = x + x + T(1.0);
+        T r;
+
+        for (int64_t k = 2; k <= n; k++) {
+            r = (x + x) * q - p;
+            p = q;
+            q = r;
+        }
+
+        return r;
+    } // chebyshev_polynomial_w_forward(T x, int64_t n)
+
+    template<typename T>
+    T chebyshev_polynomial_w_forward(T x, T n) {
+        return chebyshev_polynomial_w_forward(x, static_cast<int64_t>(n));
+    } // chebyshev_polynomial_w_forward(T x, T n)
+); // chebyshev_polynomial_w_string
+
+const auto hermite_polynomial_h_string = jiterator_stringify(
+    template<typename T>
+    T hermite_polynomial_h_forward(T x, int64_t n) {
+        if (n < 0) {
+            return T(0.0);
+        }
+
+        if (n == 0) {
+            return T(1.0);
+        }
+
+        if (n == 1) {
+            return x + x;
+        }
+
+        T p = T(1.0);
+        T q = x + x;
+        T r = T(0.0);
+
+        for (int64_t k = 2; k < n + n; k += 2) {
+            r = (x + x) * q - k * p;
+            p = q;
+            q = r;
+        }
+
+        return r;
+    } // hermite_polynomial_h_forward(T x, int64_t n)
+
+    template<typename T>
+    T hermite_polynomial_h_forward(T x, T n) {
+        return hermite_polynomial_h_forward(x, static_cast<int64_t>(n));
+    } // hermite_polynomial_h_forward(T x, T n)
+); // hermite_polynomial_h_string
+
+const auto hermite_polynomial_he_string = jiterator_stringify(
+    template<typename T>
+    T hermite_polynomial_he_forward(T x, int64_t n) {
+        if (n < 0) {
+            return T(0.0);
+        }
+
+        if (n == 0) {
+            return T(1.0);
+        }
+
+        if (n == 1) {
+            return x;
+        }
+
+        T p = T(1.0);
+        T q = x;
+        T r;
+
+        for (int64_t k = 1; k < n; k++) {
+            r = x * q - k * p;
+            p = q;
+            q = r;
+        }
+
+        return r;
+    } // hermite_polynomial_he_forward(T x, int64_t n)
+
+    template<typename T>
+    T hermite_polynomial_he_forward(T x, T n) {
+        return hermite_polynomial_he_forward(x, static_cast<int64_t>(n));
+    } // hermite_polynomial_he_forward(T x, T n)
+); // hermite_polynomial_he_string
+
+const auto laguerre_polynomial_l_string = jiterator_stringify(
+    template<typename T>
+    T laguerre_polynomial_l_forward(T x, int64_t n) {
+        if (n < 0) {
+            return T(0.0);
+        }
+
+        if (abs(x) == T(0.0)) {
+            return T(1.0);
+        }
+
+        if (n == 0) {
+            return T(1.0);
+        }
+
+        if (n == 1) {
+            return T(1.0) - x;
+        }
+
+        T p = T(1.0);
+        T q = T(1.0) - x;
+        T r;
+
+        for (int64_t k = 1; k < n; k++) {
+            r = (((k + k) + (T(1.0) - x)) * q - k * p) / (k + 1);
+            p = q;
+            q = r;
+        }
+
+        return r;
+    } // laguerre_polynomial_l_forward(T x, int64_t n)
+
+    template<typename T>
+    T laguerre_polynomial_l_forward(T x, T n) {
+        return laguerre_polynomial_l_forward(x, static_cast<int64_t>(n));
+    } // laguerre_polynomial_l_forward(T x, T n)
+); // laguerre_polynomial_l_string
+
+const auto legendre_polynomial_p_string = jiterator_stringify(
+    template<typename T>
+    T legendre_polynomial_p_forward(T x, int64_t n) {
+        if (n < 0) {
+            return T(0.0);
+        }
+
+        if (abs(x) == T(1.0)) {
+            if (x > T(0.0) || n % 2 == 0) {
+                return T(1.0);
+            }
+
+            return T(-1.0);
+        }
+
+        if (n == 0) {
+            return T(1.0);
+        }
+
+        if (n == 1) {
+            return x;
+        }
+
+        T p = T(1.0);
+        T q = x;
+        T r;
+
+        for (int64_t k = 1; k < n; k++) {
+            r = ((k + k + 1) * x * q - k * p) / (k + 1);
+            p = q;
+            q = r;
+        }
+
+        return r;
+    } // legendre_polynomial_p_forward(T x, int64_t n)
+
+    template<typename T>
+    T legendre_polynomial_p_forward(T x, T n) {
+        return legendre_polynomial_p_forward(x, static_cast<int64_t>(n));
+    } // legendre_polynomial_p_forward(T x, T n)
+); // legendre_polynomial_p_string
+
+const auto modified_bessel_i0_string = jiterator_stringify(
+    template<typename T>
+    T modified_bessel_i0_forward(T x) {
+        static const T A[] = {
+                -4.41534164647933937950e-18,
+                +3.33079451882223809783e-17,
+                -2.43127984654795469359e-16,
+                +1.71539128555513303061e-15,
+                -1.16853328779934516808e-14,
+                +7.67618549860493561688e-14,
+                -4.85644678311192946090e-13,
+                +2.95505266312963983461e-12,
+                -1.72682629144155570723e-11,
+                +9.67580903537323691224e-11,
+                -5.18979560163526290666e-10,
+                +2.65982372468238665035e-09,
+                -1.30002500998624804212e-08,
+                +6.04699502254191894932e-08,
+                -2.67079385394061173391e-07,
+                +1.11738753912010371815e-06,
+                -4.41673835845875056359e-06,
+                +1.64484480707288970893e-05,
+                -5.75419501008210370398e-05,
+                +1.88502885095841655729e-04,
+                -5.76375574538582365885e-04,
+                +1.63947561694133579842e-03,
+                -4.32430999505057594430e-03,
+                +1.05464603945949983183e-02,
+                -2.37374148058994688156e-02,
+                +4.93052842396707084878e-02,
+                -9.49010970480476444210e-02,
+                +1.71620901522208775349e-01,
+                -3.04682672343198398683e-01,
+                +6.76795274409476084995e-01,
+        };
+
+        static const T B[] = {
+                -7.23318048787475395456e-18,
+                -4.83050448594418207126e-18,
+                +4.46562142029675999901e-17,
+                +3.46122286769746109310e-17,
+                -2.82762398051658348494e-16,
+                -3.42548561967721913462e-16,
+                +1.77256013305652638360e-15,
+                +3.81168066935262242075e-15,
+                -9.55484669882830764870e-15,
+                -4.15056934728722208663e-14,
+                +1.54008621752140982691e-14,
+                +3.85277838274214270114e-13,
+                +7.18012445138366623367e-13,
+                -1.79417853150680611778e-12,
+                -1.32158118404477131188e-11,
+                -3.14991652796324136454e-11,
+                +1.18891471078464383424e-11,
+                +4.94060238822496958910e-10,
+                +3.39623202570838634515e-09,
+                +2.26666899049817806459e-08,
+                +2.04891858946906374183e-07,
+                +2.89137052083475648297e-06,
+                +6.88975834691682398426e-05,
+                +3.36911647825569408990e-03,
+                +8.04490411014108831608e-01,
+        };
+
+        T p;
+        T q = 0.0;
+
+        if (abs(x) <= T(8.0)) {
+            T a = A[0];
+
+            for (uint8_t index = 1; index < 30; index++) {
+                p = q;
+                q = a;
+                a = ((abs(x) / T(2.0)) - T(2.0)) * q - p + A[index];
+            }
+
+            return exp(abs(x)) * (T(0.5) * (a - p));
+        }
+
+        T b = B[0];
+
+        for (uint8_t index = 1; index < 25; index++) {
+            p = q;
+            q = b;
+            b = (T(32.0) / abs(x) - T(2.0)) * q - p + B[index];
+        }
+
+        return exp(abs(x)) * (T(0.5) * (b - p)) / sqrt(abs(x));
+    } // modified_bessel_i0_forward(T x)
+); // modified_bessel_i0_string
+
+const auto modified_bessel_i1_string = jiterator_stringify(
+    template<typename T>
+    T modified_bessel_i1_forward(T x) {
+        static const T A[] = {
+                +2.77791411276104639959e-18,
+                -2.11142121435816608115e-17,
+                +1.55363195773620046921e-16,
+                -1.10559694773538630805e-15,
+                +7.60068429473540693410e-15,
+                -5.04218550472791168711e-14,
+                +3.22379336594557470981e-13,
+                -1.98397439776494371520e-12,
+                +1.17361862988909016308e-11,
+                -6.66348972350202774223e-11,
+                +3.62559028155211703701e-10,
+                -1.88724975172282928790e-09,
+                +9.38153738649577178388e-09,
+                -4.44505912879632808065e-08,
+                +2.00329475355213526229e-07,
+                -8.56872026469545474066e-07,
+                +3.47025130813767847674e-06,
+                -1.32731636560394358279e-05,
+                +4.78156510755005422638e-05,
+                -1.61760815825896745588e-04,
+                +5.12285956168575772895e-04,
+                -1.51357245063125314899e-03,
+                +4.15642294431288815669e-03,
+                -1.05640848946261981558e-02,
+                +2.47264490306265168283e-02,
+                -5.29459812080949914269e-02,
+                +1.02643658689847095384e-01,
+                -1.76416518357834055153e-01,
+                +2.52587186443633654823e-01,
+        };
+
+        static const T B[] = {
+                +7.51729631084210481353e-18,
+                +4.41434832307170791151e-18,
+                -4.65030536848935832153e-17,
+                -3.20952592199342395980e-17,
+                +2.96262899764595013876e-16,
+                +3.30820231092092828324e-16,
+                -1.88035477551078244854e-15,
+                -3.81440307243700780478e-15,
+                +1.04202769841288027642e-14,
+                +4.27244001671195135429e-14,
+                -2.10154184277266431302e-14,
+                -4.08355111109219731823e-13,
+                -7.19855177624590851209e-13,
+                +2.03562854414708950722e-12,
+                +1.41258074366137813316e-11,
+                +3.25260358301548823856e-11,
+                -1.89749581235054123450e-11,
+                -5.58974346219658380687e-10,
+                -3.83538038596423702205e-09,
+                -2.63146884688951950684e-08,
+                -2.51223623787020892529e-07,
+                -3.88256480887769039346e-06,
+                -1.10588938762623716291e-04,
+                -9.76109749136146840777e-03,
+                +7.78576235018280120474e-01,
+        };
+
+        T p;
+        T q = 0.0;
+
+        if (abs(x) <= T(8.0)) {
+            T a = A[0];
+
+            for (uint8_t index = 1; index < 29; index++) {
+                p = q;
+                q = a;
+                a = ((abs(x) / T(2.0)) - T(2.0)) * q - p + A[index];
+            }
+
+            if (x < T(0.0)) {
+                return -(T(0.5) * (a - p) * abs(x) * exp(abs(x)));
+            }
+
+            return T(0.5) * (a - p) * abs(x) * exp(abs(x));
+        }
+
+        T b = B[0];
+
+        for (uint8_t index = 1; index < 25; index++) {
+            p = q;
+            q = b;
+            b = (T(32.0) / abs(x) - T(2.0)) * q - p + B[index];
+        }
+
+        if (x < T(0.0)) {
+            return -(exp(abs(x)) * (T(0.5) * (b - p)) / sqrt(abs(x)));
+        }
+
+        return exp(abs(x)) * (T(0.5) * (b - p)) / sqrt(abs(x));
+    } // modified_bessel_i1_forward(T x)
+); // modified_bessel_i1_string
+
+const auto modified_bessel_k0_string = modified_bessel_i0_string + jiterator_stringify(
+    template<typename T>
+    T modified_bessel_k0_forward(T x) {
+        static const T A[] = {
+                +1.37446543561352307156e-16,
+                +4.25981614279661018399e-14,
+                +1.03496952576338420167e-11,
+                +1.90451637722020886025e-09,
+                +2.53479107902614945675e-07,
+                +2.28621210311945178607e-05,
+                +1.26461541144692592338e-03,
+                +3.59799365153615016266e-02,
+                +3.44289899924628486886e-01,
+                -5.35327393233902768720e-01,
+        };
+
+        static const T B[] = {
+                +5.30043377268626276149e-18,
+                -1.64758043015242134646e-17,
+                +5.21039150503902756861e-17,
+                -1.67823109680541210385e-16,
+                +5.51205597852431940784e-16,
+                -1.84859337734377901440e-15,
+                +6.34007647740507060557e-15,
+                -2.22751332699166985548e-14,
+                +8.03289077536357521100e-14,
+                -2.98009692317273043925e-13,
+                +1.14034058820847496303e-12,
+                -4.51459788337394416547e-12,
+                +1.85594911495471785253e-11,
+                -7.95748924447710747776e-11,
+                +3.57739728140030116597e-10,
+                -1.69753450938905987466e-09,
+                +8.57403401741422608519e-09,
+                -4.66048989768794782956e-08,
+                +2.76681363944501510342e-07,
+                -1.83175552271911948767e-06,
+                +1.39498137188764993662e-05,
+                -1.28495495816278026384e-04,
+                +1.56988388573005337491e-03,
+                -3.14481013119645005427e-02,
+                +2.44030308206595545468e+00,
+        };
+
+        if (x == T(0.0)) {
+            return INFINITY;
+        }
+
+        if (x < T(0.0)) {
+            return NAN;
+        }
+
+        T p;
+        T q = 0.0;
+
+        if (x <= T(2.0)) {
+            T a = A[0];
+
+            for (uint8_t index = 1; index < 10; index++) {
+                p = q;
+                q = a;
+                a = (x * x - T(2.0)) * q - p + A[index];
+            }
+
+            return T(0.5) * (a - p) - log(0.5 * x) * modified_bessel_i0_forward(x);
+        }
+
+        T b = B[0];
+
+        for (uint8_t index = 1; index < 25; index++) {
+            p = q;
+            q = b;
+            b = (T(8.0) / x - T(2.0)) * q - p + B[index];
+        }
+
+        return exp(-x) * (T(0.5) * (b - p)) / sqrt(x);
+    } // modified_bessel_k0_forward(T x)
+); // modified_bessel_k0_string
+
+const auto scaled_modified_bessel_k0_string = modified_bessel_i0_string + jiterator_stringify(
+    template<typename T>
+    T scaled_modified_bessel_k0_forward(T x) {
+        static const T A[] = {
+                +1.37446543561352307156e-16,
+                +4.25981614279661018399e-14,
+                +1.03496952576338420167e-11,
+                +1.90451637722020886025e-09,
+                +2.53479107902614945675e-07,
+                +2.28621210311945178607e-05,
+                +1.26461541144692592338e-03,
+                +3.59799365153615016266e-02,
+                +3.44289899924628486886e-01,
+                -5.35327393233902768720e-01,
+        };
+
+        static const T B[] = {
+                +5.30043377268626276149e-18,
+                -1.64758043015242134646e-17,
+                +5.21039150503902756861e-17,
+                -1.67823109680541210385e-16,
+                +5.51205597852431940784e-16,
+                -1.84859337734377901440e-15,
+                +6.34007647740507060557e-15,
+                -2.22751332699166985548e-14,
+                +8.03289077536357521100e-14,
+                -2.98009692317273043925e-13,
+                +1.14034058820847496303e-12,
+                -4.51459788337394416547e-12,
+                +1.85594911495471785253e-11,
+                -7.95748924447710747776e-11,
+                +3.57739728140030116597e-10,
+                -1.69753450938905987466e-09,
+                +8.57403401741422608519e-09,
+                -4.66048989768794782956e-08,
+                +2.76681363944501510342e-07,
+                -1.83175552271911948767e-06,
+                +1.39498137188764993662e-05,
+                -1.28495495816278026384e-04,
+                +1.56988388573005337491e-03,
+                -3.14481013119645005427e-02,
+                +2.44030308206595545468e+00,
+        };
+
+        if (x == T(0.0)) {
+            return INFINITY;
+        }
+
+        if (x < T(0.0)) {
+            return NAN;
+        }
+
+        T p;
+        T q = 0.0;
+
+        if (x <= T(2.0)) {
+            T a = A[0];
+
+            for (uint8_t index = 1; index < 10; index++) {
+                p = q;
+                q = a;
+                a = (x * x - T(2.0)) * q - p + A[index];
+            }
+
+            return (T(0.5) * (a - p) - log(T(0.5) * x) * modified_bessel_i0_forward(x)) * exp(x);
+        }
+
+        T b = B[0];
+
+        for (uint8_t index = 1; index < 25; index++) {
+            p = q;
+            q = b;
+            b = (T(8.0) / x - T(2.0)) * q - p + B[index];
+        }
+
+        return T(0.5) * (b - p) / sqrt(x);
+    } // T scaled_modified_bessel_k0_forward(T x)
+); // scaled_modified_bessel_k0_string
+
+const auto modified_bessel_k1_string = modified_bessel_i1_string + jiterator_stringify(
+    template<typename T>
+    T modified_bessel_k1_forward(T x) {
+        static const T A[] = {
+                -7.02386347938628759343e-18,
+                -2.42744985051936593393e-15,
+                -6.66690169419932900609e-13,
+                -1.41148839263352776110e-10,
+                -2.21338763073472585583e-08,
+                -2.43340614156596823496e-06,
+                -1.73028895751305206302e-04,
+                -6.97572385963986435018e-03,
+                -1.22611180822657148235e-01,
+                -3.53155960776544875667e-01,
+                +1.52530022733894777053e+00,
+        };
+
+        static const T B[] = {
+                -5.75674448366501715755e-18,
+                +1.79405087314755922667e-17,
+                -5.68946255844285935196e-17,
+                +1.83809354436663880070e-16,
+                -6.05704724837331885336e-16,
+                +2.03870316562433424052e-15,
+                -7.01983709041831346144e-15,
+                +2.47715442448130437068e-14,
+                -8.97670518232499435011e-14,
+                +3.34841966607842919884e-13,
+                -1.28917396095102890680e-12,
+                +5.13963967348173025100e-12,
+                -2.12996783842756842877e-11,
+                +9.21831518760500529508e-11,
+                -4.19035475934189648750e-10,
+                +2.01504975519703286596e-09,
+                -1.03457624656780970260e-08,
+                +5.74108412545004946722e-08,
+                -3.50196060308781257119e-07,
+                +2.40648494783721712015e-06,
+                -1.93619797416608296024e-05,
+                +1.95215518471351631108e-04,
+                -2.85781685962277938680e-03,
+                +1.03923736576817238437e-01,
+                +2.72062619048444266945e+00,
+        };
+
+        if (x == T(0.0)) {
+            return INFINITY;
+        }
+
+        if (x < T(0.0)) {
+            return NAN;
+        }
+
+        T p;
+        T q = 0.0;
+
+        if (x <= T(2.0)) {
+            T a = A[0];
+
+            for (uint8_t index = 1; index < 11; index++) {
+                p = q;
+                q = a;
+                a = (x * x - T(2.0)) * q - p + A[index];
+            }
+
+            return log(T(0.5) * x) * modified_bessel_i1_forward(x) + T(0.5) * (a - p) / x;
+        }
+
+        T b = B[0];
+
+        for (uint8_t index = 1; index < 25; index++) {
+            p = q;
+            q = b;
+            b = (T(8.0) / x - T(2.0)) * q - p + B[index];
+        }
+
+        return exp(-x) * (T(0.5) * (b - p)) / sqrt(x);
+    } // modified_bessel_k1_forward(T x)
+); // modified_bessel_k1_string
+
+const auto scaled_modified_bessel_k1_string = modified_bessel_i1_string + jiterator_stringify(
+    template<typename T>
+    T scaled_modified_bessel_k1_forward(T x) {
+        static const T A[] = {
+                -7.02386347938628759343e-18,
+                -2.42744985051936593393e-15,
+                -6.66690169419932900609e-13,
+                -1.41148839263352776110e-10,
+                -2.21338763073472585583e-08,
+                -2.43340614156596823496e-06,
+                -1.73028895751305206302e-04,
+                -6.97572385963986435018e-03,
+                -1.22611180822657148235e-01,
+                -3.53155960776544875667e-01,
+                +1.52530022733894777053e+00,
+        };
+
+        static const T B[] = {
+                -5.75674448366501715755e-18,
+                +1.79405087314755922667e-17,
+                -5.68946255844285935196e-17,
+                +1.83809354436663880070e-16,
+                -6.05704724837331885336e-16,
+                +2.03870316562433424052e-15,
+                -7.01983709041831346144e-15,
+                +2.47715442448130437068e-14,
+                -8.97670518232499435011e-14,
+                +3.34841966607842919884e-13,
+                -1.28917396095102890680e-12,
+                +5.13963967348173025100e-12,
+                -2.12996783842756842877e-11,
+                +9.21831518760500529508e-11,
+                -4.19035475934189648750e-10,
+                +2.01504975519703286596e-09,
+                -1.03457624656780970260e-08,
+                +5.74108412545004946722e-08,
+                -3.50196060308781257119e-07,
+                +2.40648494783721712015e-06,
+                -1.93619797416608296024e-05,
+                +1.95215518471351631108e-04,
+                -2.85781685962277938680e-03,
+                +1.03923736576817238437e-01,
+                +2.72062619048444266945e+00,
+        };
+
+        if (x == T(0.0)) {
+            return INFINITY;
+        }
+
+        if (x < T(0.0)) {
+            return NAN;
+        }
+
+        T p;
+        T q = 0.0;
+
+        if (x <= T(2.0)) {
+            T a = A[0];
+
+            for (uint8_t index = 1; index < 11; index++) {
+                p = q;
+                q = a;
+                a = (x * x - T(2.0)) * q - p + A[index];
+            }
+
+            return (log(T(0.5) * x) * modified_bessel_i1_forward(x) + T(0.5) * (a - p) / x) * exp(x);
+        }
+
+        T b = B[0];
+
+        for (uint8_t index = 1; index < 25; index++) {
+            p = q;
+            q = b;
+            b = (T(8.0) / x - T(2.0)) * q - p + B[index];
+        }
+
+        return (T(0.5) * (b - p) / sqrt(x));
+    } // T scaled_modified_bessel_k1_forward(T x)
+); // scaled_modified_bessel_k1_string
+
+const auto shifted_chebyshev_polynomial_t_string = jiterator_stringify(
+    template<typename T>
+    T shifted_chebyshev_polynomial_t_forward(T x, int64_t n) {
+        if (n < 0) {
+            return T(0.0);
+        }
+
+        if (x == T(1.0)) {
+            return T(1.0);
+        }
+
+        if (x == T(0.0)) {
+            if (n % 2 == 0) {
+                return T(1.0);
+            }
+
+            return T(-1.0);
+        }
+
+        if ((n > 6) && (abs(x + x - T(1.0)) < T(1.0))) {
+            return cos(n * acos(x + x - T(1.0)));
+        }
+
+        if (n == 0) {
+            return T(1.0);
+        }
+
+        if (n == 1) {
+            return x + x - T(1.0);
+        }
+
+        T p = T(1.0);
+        T q = x + x - T(1.0);
+        T r;
+
+        for (int64_t k = 2; k <= n; k++) {
+            r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p;
+            p = q;
+            q = r;
+        }
+
+        return r;
+    } // shifted_chebyshev_polynomial_t_forward(T x, int64_t n)
+
+    template<typename T>
+    T shifted_chebyshev_polynomial_t_forward(T x, T n) {
+        return shifted_chebyshev_polynomial_t_forward(x, static_cast<int64_t>(n));
+    } // shifted_chebyshev_polynomial_t_forward(T x, T n)
+); // shifted_chebyshev_polynomial_t_string
+
+const auto shifted_chebyshev_polynomial_u_string = jiterator_stringify(
+    template<typename T>
+    T shifted_chebyshev_polynomial_u_forward(T x, int64_t n) {
+        if (n < 0) {
+            return T(0.0);
+        }
+
+        if (x == T(1.0)) {
+            return n + 1;
+        }
+
+        if (x == T(0.0)) {
+            if (n % 2 == 0) {
+                return n + 1;
+            }
+
+            return -(n + 1);
+        }
+
+        if ((n > 6) && (abs(x + x - T(1.0)) < T(1.0))) {
+            if (sin(acos(x + x - T(1.0))) != T(0.0)) {
+                return sin((n + 1) * acos(x + x - T(1.0))) / sin(acos(x + x - T(1.0)));
+            }
+
+            return (n + 1) * cos((n + 1) * acos(x + x - T(1.0))) / (x + x - T(1.0));
+        }
+
+        if (n == 0) {
+            return T(1.0);
+        }
+
+        if (n == 1) {
+            return x + x - T(1.0) + (x + x - T(1.0));
+        }
+
+        T p = T(1.0);
+        T q = x + x - T(1.0) + (x + x - T(1.0));
+        T r;
+
+        for (int64_t k = 2; k <= n; k++) {
+            r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p;
+            p = q;
+            q = r;
+        }
+
+        return r;
+    } // shifted_chebyshev_polynomial_u_forward(T x, int64_t n)
+
+    template<typename T>
+    T shifted_chebyshev_polynomial_u_forward(T x, T n) {
+        return shifted_chebyshev_polynomial_u_forward(x, static_cast<int64_t>(n));
+    } // shifted_chebyshev_polynomial_u_forward(T x, T n)
+); // shifted_chebyshev_polynomial_u_string
+
+const auto shifted_chebyshev_polynomial_v_string = jiterator_stringify(
+    template<typename T>
+    T shifted_chebyshev_polynomial_v_forward(T x, int64_t n) {
+        if (n < 0) {
+            return T(0.0);
+        }
+
+        if (x == T(1.0)) {
+            return T(1.0);
+        }
+
+        if (x == T(0.0)) {
+            if (n % 2 == 0) {
+                return (n + n + 1);
+            }
+
+            return -(n + n + 1);
+        }
+
+        if ((n > 6) && (abs(x + x - T(1.0)) < T(1.0))) {
+            if (sin(acos(x + x - T(1.0)) / T(2.0)) != T(1.0)) {
+                return cos(((n) + T(0.5)) * acos(x + x - T(1.0))) / cos(acos(x + x - T(1.0)) / T(2.0));
+            }
+
+            if (n % 2 == 0) {
+                return n + n + 1;
+            }
+
+            return -(n + n + 1);
+        }
+
+        if (n == 0) {
+            return T(1.0);
+        }
+
+        if (n == 1) {
+            return x + x - T(1.0) + (x + x - T(1.0)) - T(1.0);
+        }
+
+        T p = T(1.0);
+        T q = x + x - T(1.0) + (x + x - T(1.0)) - T(1.0);
+        T r;
+
+        for (int64_t k = 2; k <= n; k++) {
+            r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p;
+            p = q;
+            q = r;
+        }
+
+        return r;
+    } // shifted_chebyshev_polynomial_v_forward(T x, int64_t n)
+
+    template<typename T>
+    T shifted_chebyshev_polynomial_v_forward(T x, T n) {
+        return shifted_chebyshev_polynomial_v_forward(x, static_cast<int64_t>(n));
+    } // shifted_chebyshev_polynomial_v_forward(T x, T n)
+); // shifted_chebyshev_polynomial_v_string
+
+const auto shifted_chebyshev_polynomial_w_string = jiterator_stringify(
+    template<typename T>
+    T shifted_chebyshev_polynomial_w_forward(T x, int64_t n) {
+        if (n < 0) {
+            return T(0.0);
+        }
+
+        if (x == T(1.0)) {
+            return n + n + 1;
+        }
+
+        if (x == T(0.0)) {
+            if (n % 2 == 0) {
+                return T(1.0);
+            }
+
+            return T(-1.0);
+        }
+
+        if ((n > 4) && (abs(x + x - T(1.0)) < T(1.0))) {
+            if (cos(acos(x + x - T(1.0)) / T(2.0)) != T(1.0)) {
+                return sin((n + T(0.5)) * acos(x + x - T(1.0))) / sin(acos(x + x - T(1.0)) / T(2.0));
+            }
+
+            if (n % 2 == 0) {
+                return T(1.0);
+            }
+
+            return T(-1.0);
+        }
+
+        if (n == 0) {
+            return T(1.0);
+        }
+
+        if (n == 1) {
+            return x + x - T(1.0) + (x + x - T(1.0)) + T(1.0);
+        }
+
+        T p = T(1.0);
+        T q = x + x - T(1.0) + (x + x - T(1.0)) + T(1.0);
+        T r;
+
+        for (int64_t k = 2; k <= n; k++) {
+            r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p;
+            p = q;
+            q = r;
+        }
+
+        return r;
+    } // shifted_chebyshev_polynomial_w_forward(T x, int64_t n)
+
+    template<typename T>
+    T shifted_chebyshev_polynomial_w_forward(T x, T n) {
+        return shifted_chebyshev_polynomial_w_forward(x, static_cast<int64_t>(n));
+    } // shifted_chebyshev_polynomial_w_forward(T x, T n)
+); // shifted_chebyshev_polynomial_w_string
+
+const auto spherical_bessel_j0_string = jiterator_stringify(
+    template<typename T>
+    T spherical_bessel_j0_forward(T x) {
+        if (isinf(x)) {
+            return T(0.0);
+        }
+
+        if (abs(x) < T(0.5)) {
+            return T(1.0) + x * x * (T(-1.0) / T(6.0) + x * x * (T(1.0) / T(120.0) + x * x * (T(-1.0) / T(5040.0) + x * x * (T(1.0) / T(362880.0) + x * x * (T(-1.0) / T(39916800.0) + x * x * (T(1.0) / T(6227020800.0)))))));
+        }
+
+        return sin(x) / x;
+    } // T spherical_bessel_j0_forward(T x)
+); // spherical_bessel_j0_string
+
+#else // !AT_USE_JITERATOR() -- kernels must be precompiled
+
+template <typename scalar_t>
+static inline C10_HOST_DEVICE scalar_t calc_gcd(scalar_t a_in, scalar_t b_in) {
+  scalar_t a = ::abs(a_in);
+  scalar_t b = ::abs(b_in);
+  while (a != 0) {
+    scalar_t c = a;
+    a = b % a;
+    b = c;
+  }
+  return b;
+}
+
+/*
+ * For licensing information, please refer to the cpu implementation located in "ATen/native/Math.h".
+ */
+template <typename scalar_t>
+static inline C10_HOST_DEVICE scalar_t calc_digamma(scalar_t in) {
+  // [C++ Standard Reference: Gamma Function] https://en.cppreference.com/w/cpp/numeric/math/tgamma
+  using accscalar_t = at::acc_type<scalar_t, /*is_cuda=*/true>;
+  static const double PI_f64 = 3.14159265358979323846;
+  const accscalar_t PSI_10 = 2.25175258906672110764;
+  const accscalar_t A[] = {
+      8.33333333333333333333E-2,
+      -2.10927960927960927961E-2,
+      7.57575757575757575758E-3,
+      -4.16666666666666666667E-3,
+      3.96825396825396825397E-3,
+      -8.33333333333333333333E-3,
+      8.33333333333333333333E-2,
+  };
+
+  accscalar_t x = static_cast<accscalar_t>(in);
+  if (x == 0) {
+    // As per C++ standard for gamma related functions and SciPy,
+    // If the argument is ±0, ±∞ is returned
+    return std::copysign(static_cast<scalar_t>(INFINITY), -x);
+  }
+
+  bool x_is_integer = x == ::trunc(x);
+  accscalar_t result = 0;
+  if (x < 0) {
+    if (x_is_integer) {
+      // As per C++ standard for gamma related functions and SciPy,
+      // If the argument is a negative integer, NaN is returned
+      return static_cast<scalar_t>(NAN);
+    }
+    // Extracts the fractional part of x as r, since tan(pi * r) is more numerically
+    // accurate than tan(pi * x). While these operations are mathematically equivalent
+    // since both x and r are in radians and tan() has a periodicity of pi, in practice
+    // the computation of pi * x is a source of error (when |x| > 1).
+    double q, r;
+    r = ::modf(static_cast<double>(x), &q);
+    result = static_cast<accscalar_t>(- PI_f64 / ::tan(PI_f64 * r));
+    x = 1 - x;
+  }
+
+  while (x < 10) {
+    result -= 1 / x;
+    x += 1;
+  }
+  if (x == 10) {
+    return static_cast<scalar_t>(result + PSI_10);
+  }
+
+  accscalar_t y = 0;
+  if (x < 1.0e17) {
+    accscalar_t z = 1 / (x * x);
+
+    accscalar_t polevl_result = 0;
+    for (int i = 0; i <= 6; i++) {
+      polevl_result = polevl_result * z + A[i];
+    }
+    y = z * polevl_result;
+  }
+
+  return static_cast<scalar_t>(::log(x) - (static_cast<accscalar_t>(0.5) / x) - y + result);
+}
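+
+// Illustrative sketch (not part of the upstream header): the negative-argument branch above
+// applies the reflection formula digamma(x) = digamma(1 - x) - pi / tan(pi * r), with r the
+// fractional part of x, before the recurrence and asymptotic series are used. As a hedged
+// host-side sanity check (the helper name below is made up): digamma(1) equals minus the
+// Euler-Mascheroni constant, so calc_digamma(1.0) should return roughly -0.5772156649.
+static inline double example_calc_digamma_check() {
+  return calc_digamma(1.0);  // expected: approximately -0.57721566
+}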
+
+template <typename scalar_t>
+static inline C10_HOST_DEVICE scalar_t calc_trigamma(scalar_t in) {
+  using accscalar_t = at::acc_type<scalar_t, /*is_cuda=*/true>;
+  const accscalar_t PI = 3.14159265358979323846;
+  accscalar_t x = static_cast<accscalar_t>(in);
+  accscalar_t sign = +1;
+  accscalar_t result = 0;
+  if (x < 0.5f) {
+    sign = -1;
+    accscalar_t sin_pi_x = ::sin(PI * x);
+    result -= (PI * PI) / (sin_pi_x * sin_pi_x);
+    x = 1 - x;
+  }
+  for (int i = 0; i < 6; ++i) {
+    result += 1 / (x * x);
+    x += 1;
+  }
+  const accscalar_t one = static_cast<accscalar_t>(1);
+  const accscalar_t ixx = 1 / (x*x);
+  result += (1 + 1 / (2*x) + ixx * (one/6 - ixx * (one/30 - ixx * (one/42)))) / x;
+  return static_cast<scalar_t>(sign * result);
+}
+
+/*
+ * For licensing information and documentation, please refer to the cpu implementation located in "ATen/native/Math.h".
+ */
+template <typename scalar_t>
+static inline C10_HOST_DEVICE scalar_t
+chbevl(scalar_t _x, const scalar_t array[], size_t len) {
+  static_assert(!std::is_same<scalar_t, Half>() && !std::is_same<scalar_t, BFloat16>(), "don't instantiate with low precision type");
+
+  scalar_t b0, b1, b2;
+
+  b0 = array[0];
+  b1 = 0;
+
+  for (size_t i = 1; i < len; ++i)  {
+    b2 = b1;
+    b1 = b0;
+    b0 = _x * b1 - b2 + array[i];
+  }
+
+  return (0.5 * (b0 - b2));
+}
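+
+// Illustrative note (not part of the upstream header): this is Clenshaw's recurrence for
+// evaluating a Chebyshev series; b0, b1 and b2 carry the last three partial results and the
+// value of the series is (b0 - b2) / 2, which is how the coefficient tables below are consumed.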
+
+/*
+ * For licensing information and documentation, please refer to the cpu implementation located in "ATen/native/Math.h".
+ */
+template <typename T>
+C10_HOST_DEVICE inline std::tuple<const T*, size_t> chebyshev_coefficients_i0e_A() {
+  /* Chebyshev coefficients for exp(-x) I0(x)
+   * in the interval [0,8].
+   *
+   * lim(x->0){ exp(-x) I0(x) } = 1.
+   */
+  static const T coefficients[] = {
+      -4.41534164647933937950E-18, 3.33079451882223809783E-17,
+      -2.43127984654795469359E-16, 1.71539128555513303061E-15,
+      -1.16853328779934516808E-14, 7.67618549860493561688E-14,
+      -4.85644678311192946090E-13, 2.95505266312963983461E-12,
+      -1.72682629144155570723E-11, 9.67580903537323691224E-11,
+      -5.18979560163526290666E-10, 2.65982372468238665035E-9,
+      -1.30002500998624804212E-8,  6.04699502254191894932E-8,
+      -2.67079385394061173391E-7,  1.11738753912010371815E-6,
+      -4.41673835845875056359E-6,  1.64484480707288970893E-5,
+      -5.75419501008210370398E-5,  1.88502885095841655729E-4,
+      -5.76375574538582365885E-4,  1.63947561694133579842E-3,
+      -4.32430999505057594430E-3,  1.05464603945949983183E-2,
+      -2.37374148058994688156E-2,  4.93052842396707084878E-2,
+      -9.49010970480476444210E-2,  1.71620901522208775349E-1,
+      -3.04682672343198398683E-1,  6.76795274409476084995E-1};
+
+  return std::make_tuple(coefficients, 30);
+}
+
+template <typename T>
+C10_HOST_DEVICE inline std::tuple<const T*, size_t> chebyshev_coefficients_i0e_B() {
+  /* Chebyshev coefficients for exp(-x) sqrt(x) I0(x)
+   * in the inverted interval [8,infinity].
+   *
+   * lim(x->inf){ exp(-x) sqrt(x) I0(x) } = 1/sqrt(2pi).
+   */
+  static const T coefficients[] = {
+      -7.23318048787475395456E-18, -4.83050448594418207126E-18,
+      4.46562142029675999901E-17,  3.46122286769746109310E-17,
+      -2.82762398051658348494E-16, -3.42548561967721913462E-16,
+      1.77256013305652638360E-15,  3.81168066935262242075E-15,
+      -9.55484669882830764870E-15, -4.15056934728722208663E-14,
+      1.54008621752140982691E-14,  3.85277838274214270114E-13,
+      7.18012445138366623367E-13,  -1.79417853150680611778E-12,
+      -1.32158118404477131188E-11, -3.14991652796324136454E-11,
+      1.18891471078464383424E-11,  4.94060238822496958910E-10,
+      3.39623202570838634515E-9,   2.26666899049817806459E-8,
+      2.04891858946906374183E-7,   2.89137052083475648297E-6,
+      6.88975834691682398426E-5,   3.36911647825569408990E-3,
+      8.04490411014108831608E-1};
+
+  return std::make_tuple(coefficients, 25);
+}
+
+template <typename scalar_t>
+static inline C10_HOST_DEVICE scalar_t calc_i0(scalar_t _x) {
+  static_assert(!std::is_same<scalar_t, Half>() && !std::is_same<scalar_t, BFloat16>(), "don't instantiate with low precision type");
+  // Upcast input for numerical accuracy purposes
+  // Needed for accurate results if input is bfloat16 or float16
+  scalar_t x = ::abs(_x);
+
+  if (x <= scalar_t{8.0}) {
+    auto coeff_pair = chebyshev_coefficients_i0e_A<scalar_t>();
+    auto A = std::get<0>(coeff_pair);
+    auto len = std::get<1>(coeff_pair);
+    scalar_t y = (x / scalar_t{2.0}) - scalar_t{2.0};
+    return (::exp(x) * chbevl(y, A, len));
+  }
+
+  auto coeff_pair = chebyshev_coefficients_i0e_B<scalar_t>();
+  auto B = std::get<0>(coeff_pair);
+  auto len = std::get<1>(coeff_pair);
+  return (::exp(x) * chbevl(scalar_t{32.0} / x - scalar_t{2.0}, B, len) / ::sqrt(x));
+}
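+
+// Illustrative sketch (not part of the upstream header): calc_i0 composes chbevl with the A
+// table for |x| <= 8 and the B table otherwise, rescaling by exp(|x|) (and 1/sqrt(|x|) in the
+// large-|x| branch). A hedged host-side check (the helper name below is made up): I0(0) = 1,
+// so calc_i0(0.0) should return approximately 1.0.
+static inline double example_calc_i0_check() {
+  return calc_i0(0.0);  // expected: approximately 1.0
+}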
+
+template <typename T>
+C10_HOST_DEVICE inline
+    typename std::enable_if<std::is_same<T, double>::value, std::tuple<const T*, size_t>>::type
+    chebyshev_coefficients_i1e_A() {
+  /* Chebyshev coefficients for exp(-x) I1(x)
+   * in the interval [0,8].
+   *
+   * lim(x->0){ exp(-x) I1(x) / x } = 1/2.
+   */
+  static const T coefficients[] = {
+      2.77791411276104639959E-18, -2.11142121435816608115E-17,
+      1.55363195773620046921E-16, -1.10559694773538630805E-15,
+      7.60068429473540693410E-15, -5.04218550472791168711E-14,
+      3.22379336594557470981E-13, -1.98397439776494371520E-12,
+      1.17361862988909016308E-11, -6.66348972350202774223E-11,
+      3.62559028155211703701E-10, -1.88724975172282928790E-9,
+      9.38153738649577178388E-9,  -4.44505912879632808065E-8,
+      2.00329475355213526229E-7,  -8.56872026469545474066E-7,
+      3.47025130813767847674E-6,  -1.32731636560394358279E-5,
+      4.78156510755005422638E-5,  -1.61760815825896745588E-4,
+      5.12285956168575772895E-4,  -1.51357245063125314899E-3,
+      4.15642294431288815669E-3,  -1.05640848946261981558E-2,
+      2.47264490306265168283E-2,  -5.29459812080949914269E-2,
+      1.02643658689847095384E-1,  -1.76416518357834055153E-1,
+      2.52587186443633654823E-1};
+
+  return std::make_tuple(coefficients, 29);
+}
+
+template <typename T>
+C10_HOST_DEVICE inline
+    typename std::enable_if<std::is_same<T, float>::value, std::tuple<const T*, size_t>>::type
+    chebyshev_coefficients_i1e_A() {
+  /* Chebyshev coefficients for exp(-x) I1(x)
+   * in the interval [0,8].
+   *
+   * lim(x->0){ exp(-x) I1(x) / x } = 1/2.
+   */
+  static const T coeff[] = {
+      9.38153738649577178388E-9f,
+      -4.44505912879632808065E-8f,
+      2.00329475355213526229E-7f,
+      -8.56872026469545474066E-7f,
+      3.47025130813767847674E-6f,
+      -1.32731636560394358279E-5f,
+      4.78156510755005422638E-5f,
+      -1.61760815825896745588E-4f,
+      5.12285956168575772895E-4f,
+      -1.51357245063125314899E-3f,
+      4.15642294431288815669E-3f,
+      -1.05640848946261981558E-2f,
+      2.47264490306265168283E-2f,
+      -5.29459812080949914269E-2f,
+      1.02643658689847095384E-1f,
+      -1.76416518357834055153E-1f,
+      2.52587186443633654823E-1f};
+  return std::make_tuple(coeff, 17);
+};
+
+template <typename T>
+C10_HOST_DEVICE inline
+    typename std::enable_if<std::is_same<T, double>::value, std::tuple<const T*, size_t>>::type
+    chebyshev_coefficients_i1e_B() {
+  /* Chebyshev coefficients for exp(-x) sqrt(x) I1(x)
+   * in the inverted interval [8,infinity].
+   *
+   * lim(x->inf){ exp(-x) sqrt(x) I1(x) } = 1/sqrt(2pi).
+   */
+  static const T coefficients[] = {
+      7.51729631084210481353E-18,  4.41434832307170791151E-18,
+      -4.65030536848935832153E-17, -3.20952592199342395980E-17,
+      2.96262899764595013876E-16,  3.30820231092092828324E-16,
+      -1.88035477551078244854E-15, -3.81440307243700780478E-15,
+      1.04202769841288027642E-14,  4.27244001671195135429E-14,
+      -2.10154184277266431302E-14, -4.08355111109219731823E-13,
+      -7.19855177624590851209E-13, 2.03562854414708950722E-12,
+      1.41258074366137813316E-11,  3.25260358301548823856E-11,
+      -1.89749581235054123450E-11, -5.58974346219658380687E-10,
+      -3.83538038596423702205E-9,  -2.63146884688951950684E-8,
+      -2.51223623787020892529E-7,  -3.88256480887769039346E-6,
+      -1.10588938762623716291E-4,  -9.76109749136146840777E-3,
+      7.78576235018280120474E-1};
+
+  return std::make_tuple(coefficients, 25);
+}
+
+template <typename T>
+C10_HOST_DEVICE inline
+    typename std::enable_if<std::is_same<T, float>::value, std::tuple<const T*, size_t>>::type
+    chebyshev_coefficients_i1e_B() {
+  /* Chebyshev coefficients for exp(-x) sqrt(x) I1(x)
+   * in the inverted interval [8,infinity].
+   *
+   * lim(x->inf){ exp(-x) sqrt(x) I1(x) } = 1/sqrt(2pi).
+   */
+  static const T coeff[] = {
+      -3.83538038596423702205E-9f,
+      -2.63146884688951950684E-8f,
+      -2.51223623787020892529E-7f,
+      -3.88256480887769039346E-6f,
+      -1.10588938762623716291E-4f,
+      -9.76109749136146840777E-3f,
+      7.78576235018280120474E-1f};
+
+  return std::make_tuple(coeff, 7);
+};
+
+template <typename scalar_t>
+static inline C10_HOST_DEVICE scalar_t calc_i1(scalar_t _x) {
+  const auto x = ::abs(_x);
+  if (x <= scalar_t{8.0}) {
+    auto coeff_pair = chebyshev_coefficients_i1e_A<scalar_t>();
+    auto A = std::get<0>(coeff_pair);
+    auto len = std::get<1>(coeff_pair);
+    scalar_t y = x / scalar_t{2.0} - scalar_t{2.0};
+    const scalar_t out = ::exp(x) * x * chbevl(y, A, len);
+    return (_x < scalar_t{0.0}) ? -out : out;
+  }
+
+  auto coeff_pair = chebyshev_coefficients_i1e_B<scalar_t>();
+  auto B = std::get<0>(coeff_pair);
+  auto len = std::get<1>(coeff_pair);
+  const scalar_t out = (::exp(x) * chbevl(scalar_t{32.0} / x - scalar_t{2.0}, B, len)) / ::sqrt(x);
+  return (_x < scalar_t{0.0}) ? -out : out;
+}
+
+template <typename scalar_t>
+static inline C10_HOST_DEVICE scalar_t calc_i1e(scalar_t _x) {
+  const auto x = ::abs(_x);
+  if (x <= scalar_t{8.0}) {
+    auto coeff_pair = chebyshev_coefficients_i1e_A<scalar_t>();
+    auto A = std::get<0>(coeff_pair);
+    auto len = std::get<1>(coeff_pair);
+    const scalar_t y = x / scalar_t{2.0} - scalar_t{2.0};
+    const scalar_t out = chbevl(y, A, len) * x;
+    return (_x < scalar_t{0.0}) ? -out : out;
+  }
+
+  auto coeff_pair = chebyshev_coefficients_i1e_B<scalar_t>();
+  auto B = std::get<0>(coeff_pair);
+  auto len = std::get<1>(coeff_pair);
+  const scalar_t out = chbevl(scalar_t{32.0} / x - scalar_t{2.0}, B, len) / ::sqrt(x);
+  return (_x < scalar_t{0.0}) ? -out : out;
+}
+
+#endif // AT_USE_JITERATOR() (this closes the "else" branch of an if/else preprocessor directive)
+
+} // namespace native
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/MemoryAccess.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/MemoryAccess.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..fabc47acb137252f5b138c59fafab23d874c2c8a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/MemoryAccess.cuh
@@ -0,0 +1,384 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+// References:
+// https://devblogs.nvidia.com/cuda-pro-tip-increase-performance-with-vectorized-memory-access/
+
+namespace at { namespace native { namespace memory {
+
+namespace detail {
+
+// What does the `static_unroll` do?
+//
+// We want to do something like:
+//
+//    using args_t = typename traits::ArgsTuple;
+//    args_t args;
+//    #pragma unroll
+//    for (int i = 0; i < traits::arity; i++) {
+//      std::get(args) = ....
+//    }
+//
+// but unfortunately the above code does not work because
+// the template argument has to be a compile time constant
+// so `static_unroll` is created to simulate `#pragma unroll`
+// using template metaprogramming.
+
+template<template<int i> typename func, int end, int current=0>
+struct static_unroll {
+  template<typename... Args>
+  static inline C10_HOST_DEVICE void with_args(Args&&... args) {
+    func<current>::apply(std::forward<Args>(args)...);
+    static_unroll<func, end, current+1>::with_args(args...);
+  }
+};
+
+template<template<int i> typename func, int end>
+struct static_unroll<func, end, end> {
+  template<typename... Args>
+  static inline C10_HOST_DEVICE void with_args(Args... args) {}
+};
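+
+// Illustrative sketch (not part of the upstream header): a hypothetical functor showing how
+// static_unroll drives a compile-time loop over tuple indices; the load/store helpers below
+// are invoked in exactly this way.
+template<int i>
+struct example_zero_tuple_element {
+  template <typename tuple_t>
+  static inline C10_HOST_DEVICE void apply(tuple_t &args) {
+    std::get<i>(args) = {};  // i is a compile-time constant on every "iteration"
+  }
+};
+// Usage, for some std::tuple type args_t:
+//   args_t args;
+//   static_unroll<example_zero_tuple_element, std::tuple_size<args_t>::value>::with_args(args);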
+
+// helper structs to be used with static_unroll to load arguments
+// one by one
+
+template<int arg_index>
+struct vectorized_load_helper {
+  template <typename args_t, typename policy_t>
+  static __device__ void apply(policy_t &self, args_t *args, int idx) {
+    using arg_t = std::tuple_element_t<arg_index, args_t>;
+    // `data` hold the data_ptr for tensors [output, input0, input1, ...], so we
+    // need a +1 offset to get the input
+    auto ptr = reinterpret_cast<arg_t *>(self.data[arg_index + 1]) + block_work_size() * idx;
+    auto args_accessor = [&args] __device__ (int thread_unroll_idx) -> arg_t & { return std::get<arg_index>(args[thread_unroll_idx]); };
+    self.load_single_arg(args_accessor, ptr);
+  }
+};
+
+template<int arg_index>
+struct unroll_load_helper {
+  template <typename args_t, typename policy_t, typename offset_t, typename loader_t>
+  static __device__ void apply(policy_t &self, args_t *args, offset_t offset, loader_t loader, int j, int num_outputs) {
+    using arg_t = std::tuple_element_t<arg_index, args_t>;
+    // `data` hold the data_ptr for tensors [output, input0, input1, ...], so we
+    // need a +1 offset to get the input
+    std::get<arg_index>(args[j]) = loader.template load<arg_t>(self.data[arg_index + num_outputs], offset[arg_index], arg_index);
+  }
+};
+
+template <int current>
+struct multi_outputs_store_helper {
+  template<int ntensors, int num_outputs, typename ...Args>
+  C10_HOST_DEVICE static void apply(
+      at::detail::Array<char*, ntensors> data,
+      at::detail::Array<uint32_t, num_outputs> offsets,
+      thrust::tuple<Args...> ret) {
+    using T = typename thrust::tuple_element<current, thrust::tuple<Args...>>::type;
+    T *to = reinterpret_cast<T *>(data[current]) + offsets[current];
+    *to = thrust::get<current>(ret);
+  }
+};
+
+}  // namespace detail
+
+struct LoadWithoutCast {
+  template<typename scalar_t>
+  __device__ scalar_t load(char *base_ptr, uint32_t offset, int arg) {
+    return c10::load(reinterpret_cast<scalar_t*>(base_ptr) + offset);
+  }
+};
+
+template <int N>
+struct LoadWithCast {
+  using array_t = at::detail::Array<at::ScalarType, std::max<int>(N, 1)>;
+  using size_array_t = at::detail::Array<uint32_t, std::max<int>(N, 1)>;
+
+  array_t dtypes;
+  size_array_t element_sizes;
+
+  LoadWithCast(const TensorIteratorBase& iter) {
+    CUDA_KERNEL_ASSERT(iter.ninputs() == N);
+    #pragma unroll
+    for (auto i = 0; i < N; ++i) {
+      this->dtypes[i] = iter.dtype(i + iter.noutputs());
+      element_sizes[i] = c10::elementSize(iter.dtype(i + iter.noutputs()));
+    }
+  }
+
+  template<typename scalar_t>
+  __device__ scalar_t load(char *base_ptr, uint32_t offset, int arg) {
+    void *ptr = base_ptr + element_sizes[arg] * offset;
+    return c10::fetch_and_cast<scalar_t>(dtypes[arg], ptr);
+  }
+};
+
+struct StoreWithoutCast {
+  template<typename scalar_t>
+  __device__ void store(scalar_t value, char *base_ptr, uint32_t offset, int arg = 0) {
+    *(reinterpret_cast<scalar_t *>(base_ptr) + offset) = value;
+  }
+};
+
+template <int N = 1>
+struct StoreWithCast {
+  using array_t = at::detail::Array<at::ScalarType, std::max<int>(N, 1)>;
+  using size_array_t = at::detail::Array<uint32_t, std::max<int>(N, 1)>;
+
+  array_t dtypes;
+  size_array_t element_sizes;
+
+  StoreWithCast(const TensorIteratorBase& iter) {
+    CUDA_KERNEL_ASSERT(iter.noutputs() == N);
+    #pragma unroll
+    for (auto i = 0; i < N; ++i) {
+      this->dtypes[i] = iter.dtype(i);
+      element_sizes[i] = c10::elementSize(iter.dtype(i));
+    }
+  }
+
+  template<typename scalar_t>
+  __device__ void store(scalar_t value, char *base_ptr, uint32_t offset, int arg = 0) {
+    void *ptr = base_ptr + element_sizes[arg] * offset;
+    c10::cast_and_store<scalar_t>(dtypes[arg], ptr, value);
+  }
+};
+
+// aligned vector generates vectorized load/store on CUDA
+template<typename scalar_t, int vec_size>
+struct alignas(sizeof(scalar_t) * vec_size) aligned_vector {
+  scalar_t val[vec_size];
+};
+
+template <int vec_size, typename scalar_t>
+__device__ aligned_vector<scalar_t, vec_size> load_vector(const scalar_t *base_ptr, uint32_t offset) {
+  using vec_t = aligned_vector<scalar_t, vec_size>;
+  auto *from = reinterpret_cast<const vec_t *>(base_ptr);
+  return from[offset];
+}
+
+template <int vec_size>
+__device__ aligned_vector<bool, vec_size> load_vector(const bool *base_ptr, uint32_t offset) {
+  // See NOTE [Loading boolean values]
+  auto tmp = load_vector<vec_size>(reinterpret_cast<const uint8_t*>(base_ptr), offset);
+  aligned_vector<bool, vec_size> ret;
+  for (int i = 0; i < vec_size; ++i) {
+    ret.val[i] = bool(tmp.val[i]);
+  }
+  return ret;
+}
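+
+// Illustrative sketch (not part of the upstream header): aligned_vector<scalar_t, 4> is
+// sizeof(scalar_t) * 4 bytes and aligned to that size, so one load_vector<4> call can compile
+// to a single vectorized load when the base pointer is suitably aligned. The kernel below is
+// hypothetical and only demonstrates the intended access pattern (n_vec = number of vectors).
+//
+//   template <typename scalar_t>
+//   __global__ void example_vectorized_copy(const scalar_t *in, scalar_t *out, int n_vec) {
+//     constexpr int vec_size = 4;
+//     using vec_t = aligned_vector<scalar_t, vec_size>;
+//     int i = blockIdx.x * blockDim.x + threadIdx.x;
+//     if (i < n_vec) {
+//       vec_t v = load_vector<vec_size>(in, i);   // one 16-byte load for float
+//       reinterpret_cast<vec_t *>(out)[i] = v;    // one 16-byte store
+//     }
+//   }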
+
+namespace policies {
+
+// Assumption:
+// all tensors are contiguous, that is: stride == sizeof(type) for all tensors
+template<typename data_t, typename inp_calc_t, typename out_calc_t, typename loader_t, typename storer_t, int num_outputs = 1>
+struct unroll {
+
+  data_t data;
+  int remaining;
+  inp_calc_t input_offset_calculator;
+  out_calc_t output_offset_calculator;
+  loader_t loader;
+  storer_t storer;
+
+  __device__ unroll(data_t data, int remaining, inp_calc_t ic, out_calc_t oc, loader_t l, storer_t s):
+    data(data), remaining(remaining), input_offset_calculator(ic), output_offset_calculator(oc), loader(l), storer(s) {}
+
+  __device__ inline bool check_inbounds(int thread_work_elem) {
+    return ((int)(threadIdx.x  + thread_work_elem*num_threads()) < remaining);
+  }
+
+  template<typename args_t>
+  __device__ inline void load(args_t *args, int idx) {
+    constexpr int arity = std::tuple_size<args_t>::value;
+    int thread_idx = threadIdx.x;
+    #pragma unroll
+    for (int i = 0; i < thread_work_size(); i++) {
+      if (thread_idx >= remaining) {
+        return;
+      }
+      int linear_idx = thread_idx + block_work_size() * idx;
+      auto offset = input_offset_calculator.get(linear_idx);
+      detail::static_unroll<detail::unroll_load_helper, arity>::with_args(*this, args, offset, loader, i, num_outputs);
+      thread_idx += num_threads();
+    }
+  }
+
+  template<typename scalar_t>
+  __device__ inline void store(scalar_t *from, int idx) {
+    int thread_idx = threadIdx.x;
+    #pragma unroll
+    for (int i = 0; i < thread_work_size(); i++) {
+      if (thread_idx >= remaining) {
+        return;
+      }
+      int linear_idx = thread_idx + block_work_size() * idx;
+      int offset = output_offset_calculator.get(linear_idx)[0];
+      storer.store(from[i], data[0], offset);
+      thread_idx += num_threads();
+    }
+  }
+};
+
+// Assumption:
+// all tensors are contiguous, that is: stride == sizeof(type) for all tensors
+// Note:
+// Functions in the vectorized policy do not do boundary checks. They assume the whole block
+// has work to do, so the remainder must be handled by the caller manually.
+template <int vec_size, typename data_t>  // vec_size: number of scalars, can be 1, 2, or 4.
+struct vectorized {
+
+  static_assert(thread_work_size() % vec_size == 0, "The workload per thread must be a multiple of vec_size");
+  static constexpr int loop_size = thread_work_size() / vec_size;
+
+  data_t data;
+
+  __device__ vectorized(data_t data) : data(data) {}
+
+  __device__ inline constexpr bool check_inbounds(int thread_work_elem) {
+    return true;
+  }
+
+  template<typename accessor_t, typename scalar_t>
+  __device__ inline void load_single_arg(accessor_t to, scalar_t *from) {
+    int thread_idx = threadIdx.x;
+    #pragma unroll
+    for (int i = 0; i < loop_size; i++) {
+      int index = thread_idx + i * num_threads();
+      auto v = load_vector<vec_size>(from, index);
+      #pragma unroll
+      for (int j = 0; j < vec_size; j++) {
+        to(vec_size * i + j) = v.val[j];
+      }
+    }
+  }
+
+  template<typename args_t>
+  __device__ inline void load(args_t *args, int idx) {
+    constexpr int arity = std::tuple_size<args_t>::value;
+    detail::static_unroll<detail::vectorized_load_helper, arity>::with_args(*this, args, idx);
+  }
+
+  template<typename scalar_t>
+  __device__ inline void store(scalar_t *from, int idx) {
+    using vec_t = aligned_vector<scalar_t, vec_size>;
+    scalar_t *to = reinterpret_cast<scalar_t *>(data[0]) + block_work_size() * idx;
+    vec_t *to_ = reinterpret_cast<vec_t *>(to);
+    int thread_idx = threadIdx.x;
+    #pragma unroll
+    for (int i = 0; i < loop_size; i++) {
+      int index = thread_idx + i * num_threads();
+      vec_t v;
+      for (int j = 0; j < vec_size; j++) {
+        v.val[j] = from[vec_size * i + j];
+      }
+      to_[index] = v;
+    }
+  }
+};
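+
+// Illustrative sketch (not part of the upstream header): because this policy skips bounds
+// checks, a launcher typically routes only full blocks through `vectorized` and sends the
+// final partial block through the bounds-checked `unroll` policy, along the lines of:
+//
+//   int remaining = numel - block_work_size() * blockIdx.x;
+//   if (remaining < block_work_size()) {
+//     // tail block: use policies::unroll, which tests `remaining` per element
+//   } else {
+//     // full block: safe to use policies::vectorized and its unchecked vector loads/stores
+//   }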
+
+template <typename data_t, typename inp_calc_t, typename out_calc_t, int num_outputs>
+struct multi_outputs_unroll {
+  // multi_outputs_unroll struct members and the check_inbounds and load methods are copied from the unroll struct;
+  // we don't use inheritance because of a compiler bug in CUDA 10.2+.
+  data_t data;
+  int remaining;
+  inp_calc_t input_offset_calculator;
+  out_calc_t output_offset_calculator;
+  LoadWithoutCast loader;
+  StoreWithoutCast storer;
+
+  __device__ multi_outputs_unroll(data_t data, int remaining, inp_calc_t ic, out_calc_t oc):
+  data(data), remaining(remaining), input_offset_calculator(ic), output_offset_calculator(oc) {}
+
+  __device__ inline bool check_inbounds(int thread_work_elem) {
+    return ((int)(threadIdx.x  + thread_work_elem*num_threads()) < remaining);
+  }
+
+  template<typename args_t>
+  __device__ inline void load(args_t *args, int idx) {
+    constexpr int arity = std::tuple_size<args_t>::value;
+    int thread_idx = threadIdx.x;
+    #pragma unroll
+    for (int i = 0; i < thread_work_size(); i++) {
+      if (thread_idx >= remaining) {
+        return;
+      }
+      int linear_idx = thread_idx + block_work_size() * idx;
+      auto offset = input_offset_calculator.get(linear_idx);
+      detail::static_unroll<detail::unroll_load_helper, arity>::with_args(*this, args, offset, loader, i, num_outputs);
+      thread_idx += num_threads();
+    }
+  }
+
+
+  template <typename return_t>
+  __device__ inline void store(return_t *from, int idx) {
+    int thread_idx = threadIdx.x;
+    #pragma unroll
+    for (int i = 0; i < thread_work_size(); i++) {
+      if (thread_idx >= this->remaining) {
+        return;
+      }
+      int linear_idx = thread_idx + block_work_size() * idx;
+      auto offsets = this->output_offset_calculator.get(linear_idx);
+      memory::detail::static_unroll<detail::multi_outputs_store_helper, num_outputs>::with_args(this->data, offsets, from[i]);
+      thread_idx += num_threads();
+    }
+  }
+};
+
+}  // namespace policies
+
+// This is only used in host, but we will wrap this into some templates
+// which is C10_HOST_DEVICE, so we have to make this C10_HOST_DEVICE
+// in order to compile
+template<typename scalar_t>
+inline C10_HOST_DEVICE int can_vectorize_up_to(char *pointer) {
+  uint64_t address = reinterpret_cast<uint64_t>(pointer);
+  constexpr int vec2_alignment = std::alignment_of<aligned_vector<scalar_t, 2>>::value;
+  constexpr int vec4_alignment = std::alignment_of<aligned_vector<scalar_t, 4>>::value;
+  if (address % vec4_alignment == 0) {
+    return 4;
+  } else if (address % vec2_alignment == 0) {
+    return 2;
+  }
+  return 1;
+}
+
+template<int i>
+struct can_vectorize_up_to_helper {
+  template <typename array_t, typename traits>
+  static C10_HOST_DEVICE void apply(int &result, array_t pointers, traits _) {
+    using arg_t = typename traits::template arg<i>::type;
+    // `pointers` hold the data_ptr for tensors [output, input0, input1, ...], so we
+    // need a +1 offset to get the input
+    result = std::min<int>(result, can_vectorize_up_to<arg_t>(pointers[i + 1]));
+  }
+};
+
+template<typename func_t, typename array_t>
+inline int can_vectorize_up_to(array_t pointers) {
+  using traits = function_traits<func_t>;
+  using return_t = typename traits::result_type;
+  constexpr int arity = traits::arity;
+  int result = can_vectorize_up_to<return_t>(pointers[0]);
+  // We need to get the type for each argument of `func_t`, this can only
+  // be done at compile time.
+  detail::static_unroll<can_vectorize_up_to_helper, arity>::with_args(result, pointers, traits());
+  return result;
+}
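+
+// Illustrative sketch (not part of the upstream header): for a hypothetical unary functor the
+// call below inspects the output pointer and each input pointer and returns the widest vector
+// size (1, 2 or 4) that every pointer's alignment allows.
+//
+//   auto add_one = [] C10_HOST_DEVICE (float a) -> float { return a + 1.0f; };
+//   at::detail::Array<char*, 2> ptrs;                     // [output, input0]
+//   ptrs[0] = reinterpret_cast<char*>(out_ptr);
+//   ptrs[1] = reinterpret_cast<char*>(in_ptr);
+//   int vec_size = can_vectorize_up_to<decltype(add_one)>(ptrs);  // 4 when both are 16-byte aligned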
+
+}}} // namespace at::native::memory
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/MiscUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/MiscUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..936b4d80a179a77afec1c6df4741161b14541934
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/MiscUtils.h
@@ -0,0 +1,32 @@
+#pragma once
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+namespace native {
+
+static inline int cuda_int_cast(int64_t value, const char* varname) {
+  auto result = static_cast<int>(value);
+  TORCH_CHECK(static_cast<int64_t>(result) == value,
+              "cuda_int_cast: The value of ", varname, "(", (long long)value,
+              ") is too large to fit into a int (", sizeof(int), " bytes)");
+  return result;
+}
+
+// Creates an array of size elements of type T, backed by pinned memory
+// wrapped in a Storage
+template<typename T>
+static inline Storage pin_memory(int64_t size) {
+  auto* allocator = cuda::getPinnedMemoryAllocator();
+  int64_t adjusted_size = size * sizeof(T);
+  return Storage(
+      Storage::use_byte_size_t(),
+      adjusted_size,
+      allocator,
+      /*resizable=*/false);
+}
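+// Usage sketch (illustrative only): `auto buf = pin_memory<int>(n);` allocates
+// n * sizeof(int) bytes of page-locked host memory, which makes subsequent
+// host<->device copies from that buffer eligible for asynchronous transfers.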
+
+} // namespace native
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/MultiTensorApply.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/MultiTensorApply.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..1402f3bd038847e5491686c33669e2ac6ad59cb6
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/MultiTensorApply.cuh
@@ -0,0 +1,379 @@
+#pragma once
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace at::native {
+
+namespace {
+
+static constexpr int64_t kILP = 4;
+static constexpr int64_t kChunkSize = 65536;
+static constexpr int64_t kBlockSize = 512;
+
+// TODO(crcrpar): Add `n>5` for `low prec params & their higher prec copy`
+// TensorListMetadata has to be < 4KB - the limit for kernel launch argument
+static constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30};
+static constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320};
+static constexpr int depth_to_max_tensors_scalarlist[5] = {96, 64, 48, 36, 30};
+static constexpr int depth_to_max_tensors_scalarlist_of_complex_double[2] = {
+    72,
+    60};
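+// Back-of-envelope check of the 4KB limit (illustrative, assuming 8-byte
+// pointers): for depth n = 1, TensorListMetadata holds 110 addresses (880 B)
+// + 110 int64_t numels (880 B) + 320 bytes of block_to_tensor + 320 ints of
+// block_to_chunk (1280 B) + one int, i.e. about 3.3 KB, safely under 4 KB.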
+
+template <typename T>
+__device__ __forceinline__ bool is_aligned(T* p) {
+  return ((uint64_t)p) % (kILP * sizeof(T)) == 0;
+}
+
+template <typename T>
+__device__ __forceinline__ void load_store(
+    T* dst,
+    T* src,
+    int64_t dst_offset,
+    int64_t src_offset) {
+  using LT = at::native::memory::aligned_vector<T, kILP>;
+  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
+}
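+// e.g. with T = float and kILP = 4, one load_store call moves a 16-byte
+// aligned_vector<float, 4>, i.e. four elements per thread in a single
+// vectorized load and store (valid only when is_aligned() holds for both ends).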
+
+template <int n>
+struct TensorListMetadata {
+  const void* addresses[n][depth_to_max_tensors[n - 1]];
+  int64_t numel_for_tensor[depth_to_max_tensors[n - 1]];
+  unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
+  int block_to_chunk[depth_to_max_blocks[n - 1]];
+  int start_tensor_this_launch;
+};
+
+template <typename scalar_vals_t, int n>
+struct TensorListScalarListMetadata {
+  const void* addresses[n][depth_to_max_tensors_scalarlist[n - 1]];
+  int64_t numel_for_tensor[depth_to_max_tensors_scalarlist[n - 1]];
+  scalar_vals_t scalar_vals[depth_to_max_tensors_scalarlist[n - 1]];
+  unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
+  int block_to_chunk[depth_to_max_blocks[n - 1]];
+};
+
+// note(mkozuki): `n` of 1 and 2 would violate the 4KB limit on CUDA kernel
+// argument size with `c10::complex<double>`, hence the smaller tables used below.
+template <>
+struct TensorListScalarListMetadata<c10::complex<double>, 1> {
+  const void* addresses[1]
+                       [depth_to_max_tensors_scalarlist_of_complex_double[0]];
+  int64_t
+      numel_for_tensor[depth_to_max_tensors_scalarlist_of_complex_double[0]];
+  c10::complex<double>
+      scalar_vals[depth_to_max_tensors_scalarlist_of_complex_double[0]];
+  unsigned char block_to_tensor[depth_to_max_blocks[1 - 1]];
+  int block_to_chunk[depth_to_max_blocks[1 - 1]];
+};
+
+template <>
+struct TensorListScalarListMetadata<c10::complex<double>, 2> {
+  const void* addresses[2]
+                       [depth_to_max_tensors_scalarlist_of_complex_double[1]];
+  int64_t
+      numel_for_tensor[depth_to_max_tensors_scalarlist_of_complex_double[1]];
+  c10::complex<double>
+      scalar_vals[depth_to_max_tensors_scalarlist_of_complex_double[1]];
+  unsigned char block_to_tensor[depth_to_max_blocks[2 - 1]];
+  int block_to_chunk[depth_to_max_blocks[2 - 1]];
+};
+
+// NOTE(crcrpar): This is a conservative resolution to handle `state_steps`,
+// each element of which is a single-element `at::Tensor` tracking the number
+// of `step`s taken so far.
+template <int n>
+struct FusedOptimizerTensorListMetadata {
+  const void* addresses[n][depth_to_max_tensors[n - 1]];
+  int64_t numel_for_tensor[depth_to_max_tensors[n - 1]];
+  const void* state_steps_addresses[depth_to_max_tensors_scalarlist[n - 1]];
+  unsigned char block_to_tensor[depth_to_max_blocks[n - 1]];
+  int block_to_chunk[depth_to_max_blocks[n - 1]];
+  int start_tensor_this_launch;
+};
+
+template <typename T, typename U, typename... ArgTypes>
+C10_LAUNCH_BOUNDS_1(kBlockSize)
+__global__ void multi_tensor_apply_kernel(
+    T tensorListMeta,
+    U callable,
+    ArgTypes... args) {
+  // Hand the chunk information to the user-supplied functor to process however
+  // it likes.
+  callable(kChunkSize, tensorListMeta, args...);
+}
+
+} // namespace
+
+// multi_tensor_apply enables horizontal fusion across lists of tensors.
+// For example, whereas you once had a for-loop of a + b = c, where a, b,
+// and c are individual tensors in lists as, bs, and cs, you can now with
+// fewer kernel launches compute as + bs = cs.
+//
+// You can also imagine bs to be a scalar list vs a tensor list.
+//
+// The function below takes in tensor lists, scalars, and a callable and
+// chunks up the computation to launch as few kernels as possible by iterating
+// through every "chunk" in every tensor (thus the nested for loops). In the
+// simplest case, everything gets bundled into just one kernel launch, but
+// due to blocksize constraints, we may need to launch multiple kernels.
+// Each kernel launch is defined by one tensorListMeta construct, which we
+// use to track and reset the necessary metadata for each launch.
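+//
+// Usage sketch (illustrative only; `AddFunctor` is a hypothetical callable
+// exposing the chunk-processing operator() that multi_tensor_apply_kernel
+// invokes):
+//
+//   std::vector<std::vector<at::Tensor>> lists{as, bs, cs};  // depth == 3
+//   multi_tensor_apply<3>(lists, AddFunctor());              // as + bs -> cs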
+template <int depth, typename scalar_T, typename T, typename... ArgTypes>
+void multi_tensor_apply(
+    std::vector<std::vector<at::Tensor>>& tensor_lists,
+    at::ArrayRef<Scalar> scalars,
+    T callable,
+    ArgTypes... args) {
+  TORCH_CHECK(
+      tensor_lists.size() == depth,
+      "Number of tensor lists has to match the depth.");
+  const size_t n_tensors = tensor_lists[0].size();
+  using scalar_vals_t = typename T::opmath_t;
+  TensorListScalarListMetadata<scalar_vals_t, depth> tensorListMeta;
+
+  int loc_block_info = 0;
+  int loc_tensor_info = 0;
+  for (size_t t = 0; t < n_tensors; t++) {
+    // short-circuit to avoid adding empty tensors to tensorListMeta
+    if (tensor_lists[0][t].numel() == 0) {
+      continue;
+    }
+    tensorListMeta.scalar_vals[loc_tensor_info] = scalars[t].to<scalar_vals_t>();
+    tensorListMeta.numel_for_tensor[loc_tensor_info] =
+        tensor_lists[0][t].numel();
+    for (int d = 0; d < depth; d++) {
+      tensorListMeta.addresses[d][loc_tensor_info] =
+          tensor_lists[d][t].const_data_ptr();
+    }
+    loc_tensor_info++;
+
+    // now we enter [chunking territory].
+    // we will launch a kernel when EITHER the blocks get filled up OR
+    // the tensors get filled up. There will always be at least one block
+    // per tensor since the zero-sized ones will not enter the loop, so
+    // the nested for loop within represents iterating through the chunks
+    // of a single tensor.
+    const auto numel = tensor_lists[0][t].numel();
+    const auto chunks = numel / kChunkSize + (numel % kChunkSize != 0);
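+    // e.g. numel = 100000 with kChunkSize = 65536 gives chunks = 1 + 1 = 2;
+    // this is just ceil(numel / kChunkSize) written without floating point.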
+    for (auto chunk = 0; chunk < chunks; chunk++) {
+      tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
+      tensorListMeta.block_to_chunk[loc_block_info] = chunk;
+      loc_block_info++;
+
+      // a tensor is not considered full unless all its chunks have been
+      // processed
+      const bool tensors_full =
+          (loc_tensor_info == depth_to_max_tensors_scalarlist[depth - 1] &&
+           chunk == chunks - 1);
+      const bool blocks_full =
+          (loc_block_info == depth_to_max_blocks[depth - 1]);
+
+      if (tensors_full || blocks_full) {
+        multi_tensor_apply_kernel<<<
+            loc_block_info,
+            kBlockSize,
+            0,
+            at::cuda::getCurrentCUDAStream()>>>(
+            tensorListMeta, callable, args...);
+        C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+        // Reset.
+        loc_block_info = 0;
+        // all chunks have already been handled in the kernel
+        if (chunk == chunks - 1) {
+          loc_tensor_info = 0;
+        } else { // blocks were full and tensor chunks remain
+          tensorListMeta.numel_for_tensor[0] =
+              tensorListMeta.numel_for_tensor[loc_tensor_info - 1];
+          tensorListMeta.scalar_vals[0] =
+              tensorListMeta.scalar_vals[loc_tensor_info - 1];
+          for (int d = 0; d < depth; d++) {
+            tensorListMeta.addresses[d][0] =
+                tensorListMeta.addresses[d][loc_tensor_info - 1];
+          }
+          loc_tensor_info = 1;
+        }
+      }
+    }
+  }
+
+  // note: [finishing what we started]
+  // if there's remaining work to be done but the tensors/blocks aren't full
+  // yet and we are at the end, submit the kernel to do the work!
+  if (loc_block_info != 0) {
+    multi_tensor_apply_kernel<<<
+        loc_block_info,
+        kBlockSize,
+        0,
+        at::cuda::getCurrentCUDAStream()>>>(tensorListMeta, callable, args...);
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+  }
+}
+
+template <int depth, typename T, typename... ArgTypes>
+void multi_tensor_apply(
+    std::vector<std::vector<at::Tensor>>& tensor_lists,
+    T callable,
+    ArgTypes... args) {
+  TORCH_CHECK(
+      tensor_lists.size() == depth,
+      "Number of tensor lists has to match the depth.");
+  const size_t n_tensors = tensor_lists[0].size();
+  TensorListMetadata<depth> tensorListMeta;
+  tensorListMeta.start_tensor_this_launch = 0;
+
+  int loc_block_info = 0;
+  int loc_tensor_info = 0;
+  for (size_t t = 0; t < n_tensors; t++) {
+    // short-circuit to avoid adding empty tensors to tensorListMeta
+    if (tensor_lists[0][t].numel() == 0) {
+      continue;
+    }
+    tensorListMeta.numel_for_tensor[loc_tensor_info] =
+        tensor_lists[0][t].numel();
+    for (int d = 0; d < depth; d++) {
+      tensorListMeta.addresses[d][loc_tensor_info] =
+          tensor_lists[d][t].const_data_ptr();
+    }
+    loc_tensor_info++;
+
+    // see note: [chunking territory].
+    const auto numel = tensor_lists[0][t].numel();
+    const auto chunks = numel / kChunkSize + (numel % kChunkSize != 0);
+    for (auto chunk = 0; chunk < chunks; chunk++) {
+      tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
+      tensorListMeta.block_to_chunk[loc_block_info] = chunk;
+      loc_block_info++;
+
+      const bool tensors_full =
+          (loc_tensor_info == depth_to_max_tensors[depth - 1] &&
+           chunk == chunks - 1);
+      const bool blocks_full =
+          (loc_block_info == depth_to_max_blocks[depth - 1]);
+
+      if (tensors_full || blocks_full) {
+        multi_tensor_apply_kernel<<<
+            loc_block_info,
+            kBlockSize,
+            0,
+            at::cuda::getCurrentCUDAStream()>>>(
+            tensorListMeta, callable, args...);
+        C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+        // Reset.
+        loc_block_info = 0;
+        if (chunk == chunks - 1) {
+          loc_tensor_info = 0;
+          tensorListMeta.start_tensor_this_launch = t + 1;
+        } else {
+          tensorListMeta.numel_for_tensor[0] =
+              tensorListMeta.numel_for_tensor[loc_tensor_info - 1];
+          for (int d = 0; d < depth; d++) {
+            tensorListMeta.addresses[d][0] =
+                tensorListMeta.addresses[d][loc_tensor_info - 1];
+          }
+          loc_tensor_info = 1;
+          tensorListMeta.start_tensor_this_launch = t;
+        }
+      }
+    }
+  }
+
+  // see note: [finishing what we started]
+  if (loc_block_info != 0) {
+    multi_tensor_apply_kernel<<<
+        loc_block_info,
+        kBlockSize,
+        0,
+        at::cuda::getCurrentCUDAStream()>>>(tensorListMeta, callable, args...);
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+  }
+}
+
+template <int depth, typename T, typename... ArgTypes>
+void multi_tensor_apply_for_fused_optimizer(
+    std::vector<std::vector<at::Tensor>>& tensor_lists,
+    at::TensorList state_steps,
+    T callable,
+    ArgTypes... args) {
+  TORCH_CHECK(
+      tensor_lists.size() == depth,
+      "Number of tensor lists has to match the depth");
+  const auto num_tensors = tensor_lists[0].size();
+  FusedOptimizerTensorListMetadata<depth> tensorListMeta;
+
+  int loc_block_info = 0;
+  int loc_tensor_info = 0;
+  for (const auto& tensor_index : c10::irange(num_tensors)) {
+    // short-circuit to avoid adding empty tensors to tensorListMeta
+    if (tensor_lists[0][tensor_index].numel() == 0) {
+      continue;
+    }
+    tensorListMeta.state_steps_addresses[loc_tensor_info] =
+        state_steps[tensor_index].const_data_ptr();
+    tensorListMeta.numel_for_tensor[loc_tensor_info] =
+        tensor_lists[0][tensor_index].numel();
+    for (const auto& d : c10::irange(depth)) {
+      tensorListMeta.addresses[d][loc_tensor_info] =
+          tensor_lists[d][tensor_index].const_data_ptr();
+    }
+    loc_tensor_info++;
+
+    // see above note: [chunking territory]
+    const auto numel = tensor_lists[0][tensor_index].numel();
+    const auto chunks = numel / kChunkSize + (numel % kChunkSize != 0);
+    TORCH_CHECK(chunks > -1);
+    for (const auto& chunk : c10::irange(chunks)) {
+      tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
+      tensorListMeta.block_to_chunk[loc_block_info] = chunk;
+      loc_block_info++;
+
+      const auto tensor_full =
+          (loc_tensor_info == depth_to_max_tensors[depth - 1] &&
+           chunk == chunks - 1);
+      const auto blocks_full = loc_block_info == depth_to_max_blocks[depth - 1];
+
+      if (tensor_full || blocks_full) {
+        multi_tensor_apply_kernel<<<
+            loc_block_info,
+            kBlockSize,
+            0,
+            at::cuda::getCurrentCUDAStream()>>>(
+            tensorListMeta, callable, args...);
+        C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+        // Reset.
+        loc_block_info = 0;
+        if (chunk == chunks - 1) {
+          loc_tensor_info = 0;
+        } else {
+          tensorListMeta.numel_for_tensor[0] =
+              tensorListMeta.numel_for_tensor[loc_tensor_info - 1];
+          tensorListMeta.state_steps_addresses[0] =
+              tensorListMeta.state_steps_addresses[loc_tensor_info - 1];
+          for (const auto& d : c10::irange(depth)) {
+            tensorListMeta.addresses[d][0] =
+                tensorListMeta.addresses[d][loc_tensor_info - 1];
+          }
+          loc_tensor_info = 1;
+        }
+      }
+    }
+  }
+
+  // see above note: [finishing what we started]
+  if (loc_block_info != 0) {
+    multi_tensor_apply_kernel<<<
+        loc_block_info,
+        kBlockSize,
+        0,
+        at::cuda::getCurrentCUDAStream()>>>(tensorListMeta, callable, args...);
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+  }
+}
+
+} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Normalization.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Normalization.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..9a609e956aa37a13249dc81e0982a34404837816
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Normalization.cuh
@@ -0,0 +1,1742 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include 
+#else
+#include 
+#include 
+#include 
+#endif
+
+namespace at { namespace native {
+
+// The maximum number of threads in a block
+#if defined(USE_ROCM)
+constexpr int MAX_BLOCK_SIZE = 256;
+#else
+constexpr int MAX_BLOCK_SIZE = 512;
+#endif
+
+constexpr unsigned MAX_GRID_SIZE = 65535u;
+
+// Number of threads in a block given an input size up to MAX_BLOCK_SIZE
+static int getNumThreads(int nElem) {
+#if defined(USE_ROCM)
+  int threadSizes[5] = { 16, 32, 64, 128, MAX_BLOCK_SIZE };
+#else
+  int threadSizes[5] = { 32, 64, 128, 256, MAX_BLOCK_SIZE };
+#endif
+  for (int i = 0; i != 5; ++i) {
+    if (nElem <= threadSizes[i]) {
+      return threadSizes[i];
+    }
+  }
+  return MAX_BLOCK_SIZE;
+}
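+// e.g. getNumThreads(100) returns 128 with either table, and any nElem larger
+// than the last table entry saturates at MAX_BLOCK_SIZE.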
+
+// Returns the index of the most significant 1 bit in `val`.
+__device__ __forceinline__ int getMSB(int val) {
+  return 31 - __clz(val);
+}
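+// e.g. getMSB(1) == 0, getMSB(32) == 5, getMSB(33) == 5.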
+
+template <typename scalar_t, typename accscalar_t>
+struct Float2 {
+  accscalar_t v1, v2;
+  __device__ Float2() {}
+  __device__ Float2(scalar_t v1, scalar_t v2) : v1(static_cast<accscalar_t>(v1)), v2(static_cast<accscalar_t>(v2)) {}
+  __device__ Float2(int v) : v1(static_cast<accscalar_t>(v)), v2(static_cast<accscalar_t>(v)) {}
+  __device__ Float2& operator+=(const Float2& a) {
+    v1 += a.v1;
+    v2 += a.v2;
+    return *this;
+  }
+  __device__ friend Float2 operator+(Float2 a, const Float2& b) {
+    a += b;
+    return a;
+  }
+};
+
+template <typename scalar_t, typename accscalar_t, typename PTA>
+struct GradOp {
+  __device__ GradOp(accscalar_t m, const PTA& i, const PTA& g)
+    : mean(m), input(i), grad_output(g) {}
+  __device__ __forceinline__ Float2<scalar_t, accscalar_t> operator()(int batch, int plane, int n) {
+    accscalar_t g = grad_output[batch][plane][n];
+    accscalar_t c = static_cast<accscalar_t>(input[batch][plane][n]) - mean;
+    return Float2<scalar_t, accscalar_t>(g, g * c);
+  }
+  const accscalar_t mean;
+  const PTA& input;
+  const PTA& grad_output;
+};
+
+template <typename acc_t>
+struct SumReduceOp {
+    __device__ __forceinline__ acc_t combine(acc_t a, acc_t b) const { return a + b; }
+
+    __device__ __forceinline__ acc_t warp_shfl_down(acc_t data, int offset) const {
+        return WARP_SHFL_DOWN(data, offset);
+    }
+};
+
+template <typename scalar_t, typename accscalar_t>
+struct SumReduceOp<Float2<scalar_t, accscalar_t>> {
+    using acc_t = Float2<scalar_t, accscalar_t>;
+
+    __device__ __forceinline__ acc_t combine(acc_t a, acc_t b) const { return a + b; }
+
+    __device__ __forceinline__ acc_t warp_shfl_down(acc_t data, int offset) const {
+        return {WARP_SHFL_DOWN(data.v1, offset), WARP_SHFL_DOWN(data.v2, offset)};
+    }
+};
+
+// Sum across (batch, x/y/z) applying Op() pointwise
+// this works by first having each thread sum its part
+// of the data. Then there is a double-shuffling reduction.
+// First each warp (of C10_WARP_SIZE threads) uses warpSum to reduce its
+// data to the "warp leader", who writes its value into shared memory.
+// Then a single warp reads the remaining (at most C10_WARP_SIZE) items
+// and reduces them using another warpSum.
+// The implicit assumption is that there are no more
+// than C10_WARP_SIZE**2 threads.
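+// Shape sketch (illustrative numbers): with a 16x16 block, the 256 threads
+// first accumulate their strided subsets, the 8 warps leave 8 partial sums in
+// `shared`, warp 0 combines them, and thread (0,0) stores the final value in
+// shared[0] for every thread to read back after the __syncthreads().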
+template<typename scalar_t, typename Op, typename PTA>
+__device__ scalar_t reduce(Op op, PTA tensor, int plane) {
+  // first the reductions each thread does separately
+  scalar_t sum = static_cast<scalar_t>(0);
+  for (int batch = threadIdx.y; batch < tensor.size(0); batch += blockDim.y) {
+    for (int x = threadIdx.x; x < tensor.size(2); x += blockDim.x) {
+      sum += op(batch, plane, x);
+    }
+  }
+  __shared__ scalar_t shared[C10_WARP_SIZE];
+  SumReduceOp<scalar_t> reduce_op;
+  sum = cuda_utils::BlockReduce<scalar_t, SumReduceOp<scalar_t>, cuda_utils::Block2D>(sum, reduce_op, 0, shared);
+  if (threadIdx.x == 0 && threadIdx.y == 0) {
+      shared[0] = sum;
+  }
+  __syncthreads();
+  // Everyone picks it up, should be broadcast into the whole grad_input
+  return shared[0];
+}
+
+constexpr int ELEMENTS_PER_ITER = 4; // enables concurrency within each thread to hide latency
+constexpr int ELEMENTS_PER_THREAD = 16;
+constexpr int OPTIMAL_TILE_W = 32;
+constexpr int MAX_H_BLOCK = 128;
+
+__host__ void flexible_launch_configs(
+      const int reduction,
+      const int stride,
+      dim3 &block,
+      dim3 &grid,
+      const bool coop_flag = false) {
+  int block_x = std::min(lastPow2(stride), OPTIMAL_TILE_W);
+  int block_y = std::min(lastPow2(at::ceil_div(reduction , ELEMENTS_PER_THREAD)),
+                         MAX_BLOCK_SIZE / block_x);
+  if (block_x * block_y != MAX_BLOCK_SIZE) {
+    block_x = std::min(lastPow2(stride), MAX_BLOCK_SIZE / block_y);
+  }
+
+  int grid_x = at::ceil_div(stride, block_x);
+  int grid_y = std::min(at::ceil_div(reduction, block_y * ELEMENTS_PER_THREAD), MAX_H_BLOCK);
+  if (coop_flag) {
+    // it's not worth having a grid reduction if the reduction dimension is not big enough
+    grid_y = grid_y < 8 ? 1 : grid_y;
+  }
+
+  block.x = block_x;
+  block.y = block_y;
+  block.z = 1;
+  grid.x = grid_x;
+  grid.y = grid_y;
+  grid.z = 1;
+}
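+// Worked example (illustrative, assuming the non-ROCm MAX_BLOCK_SIZE of 512):
+// flexible_launch_configs(4096, 64, block, grid) yields block = (32, 16) and
+// grid = (2, 16): 32 = min(lastPow2(64), OPTIMAL_TILE_W), 16 = 512 / 32,
+// grid.x = ceil(64 / 32), grid.y = min(ceil(4096 / (16 * 16)), MAX_H_BLOCK).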
+
+template<typename T, typename C>
+__device__ __forceinline__ void welford_merge_element(C& count,
+                                                      T& mean,
+                                                      T& m2n,
+                                                      const C& count_new,
+                                                      const T& mean_new,
+                                                      const T& m2n_new) {
+      T factor = T(1.0) / ::max(1, (count + count_new));
+      T delta0 = mean - mean_new;
+      mean = (mean_new * count_new + mean * count) * factor;
+      m2n += m2n_new + delta0 * delta0 * count_new * count * factor;
+      count += count_new;
+}
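+// Worked example: merging (count=2, mean=1, m2n=2) with (count=2, mean=3, m2n=2)
+// gives count=4, mean=(3*2+1*2)/4=2 and m2n=2+2+(1-3)^2*2*2/4=8, which matches
+// the merged samples {0,2} and {2,4} (Chan et al. parallel variance update).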
+
+// merge mean/m2n among threadIdx.y within block
+template<typename T, typename C>
+__device__ __forceinline__ void welford_merge_block_vertical(C& count,
+                                                             T& mean,
+                                                             T& m2n,
+                                                             C* shmem_count,
+                                                             T* shmem_mean,
+                                                             T* shmem_m2n) {
+  // write to shared memory
+  auto address_base = threadIdx.x + threadIdx.y * blockDim.x;
+
+#pragma unroll
+  for (int offset = blockDim.y/2; offset > 0; offset >>= 1) {
+    if (threadIdx.y < offset*2) {
+      shmem_mean[address_base] = mean;
+      shmem_m2n[address_base] = m2n;
+      shmem_count[address_base] = count;
+    }
+    __syncthreads();
+    if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) {
+      auto address = address_base + offset * blockDim.x;
+      // read shared memory back to register for reduction
+      auto count_new = shmem_count[address];
+      auto mean_new = shmem_mean[address];
+      auto m2n_new = shmem_m2n[address];
+
+      welford_merge_element(count, mean, m2n, count_new, mean_new, m2n_new);
+    }
+  }
+}
+
+template 
+__global__ void batch_norm_transform_input_kernel(
+    const GenericPackedTensorAccessor input,
+    GenericPackedTensorAccessor output,
+    const GenericPackedTensorAccessor::type, 1, RestrictPtrTraits, index_t> mean_,
+    const GenericPackedTensorAccessor::type, 1, RestrictPtrTraits, index_t> var_or_invstd,
+    const GenericPackedTensorAccessor weight,
+    const GenericPackedTensorAccessor bias,
+    stat_accscalar_t epsilon) {
+
+  index_t plane = blockIdx.x;
+
+  if (plane >= input.size(1)) {
+    return;
+  }
+
+  stat_accscalar_t gamma = weight.size(0) > 0 ? static_cast(weight[plane]) : static_cast(1);
+  stat_accscalar_t beta = bias.size(0) > 0 ? static_cast(bias[plane]) : static_cast(0);
+  stat_accscalar_t mean = static_cast(mean_[plane]);
+  stat_accscalar_t invstd;
+  if (train) {
+    invstd = var_or_invstd[plane];
+  } else {
+    invstd = static_cast(1) / device_sqrt(static_cast(var_or_invstd[plane]) + epsilon);
+  }
+
+  index_t bs = input.size(0);
+  index_t fs = input.size(2);
+
+  index_t bstep  = blockDim.y * gridDim.y;
+  for (index_t batch = threadIdx.y + blockIdx.y * blockDim.y; batch < bs; batch += bstep) {
+    auto o = output[batch][plane];
+    auto i = input[batch][plane];
+    for (index_t feature = threadIdx.x; feature < fs; feature += blockDim.x) {
+      o[feature] = static_cast(gamma * (i[feature] - mean) * invstd + beta);
+    }
+  }
+}
+
+struct InvStd {
+  template 
+  __device__ __forceinline__ T operator()(T var, double epsilon) const {
+    T invstd = 0;
+    if (var != static_cast(0) || epsilon != static_cast(0)) {
+      invstd = static_cast(1) / device_sqrt(var + epsilon);
+    }
+    return invstd;
+  }
+};
+
+struct Var {
+  template 
+  __device__ __forceinline__ T operator()(T var, double epsilon) const {
+    return var;
+  }
+};
+
+template 
+__global__ void batch_norm_collect_statistics_kernel(
+    const GenericPackedTensorAccessor input,
+    const stat_accscalar_t epsilon,
+    const stat_accscalar_t momentum,
+    GenericPackedTensorAccessor save_mean,
+    GenericPackedTensorAccessor save_transformed_var) {
+
+  __shared__ int shared_n[2 * 2 * C10_WARP_SIZE + C10_WARP_SIZE];
+
+  int plane = blockIdx.x;
+  int N = input.size(0) * input.size(2);
+  int tid = threadIdx.x + threadIdx.y * blockDim.x;
+
+  // Compute the mean and variance across (batch, x/y/z)
+  // this uses the Welford (in the for loop)/parallel algorithm (to sum across the block)
+  // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_Online_algorithm
+  // and the parallel algorithm on the same page.
+  // We use two shuffles to reduce across the entire block.
+  // https://devblogs.nvidia.com/faster-parallel-reductions-kepler/ has a description.
+  stat_accscalar_t* shared_avg_var = (stat_accscalar_t*) &shared_n[C10_WARP_SIZE];
+
+  // first the reductions each thread does separately
+  stat_accscalar_t avg = 0;
+  stat_accscalar_t var_n = 0;
+  int n = 0;
+  for (int batch = threadIdx.y; batch < input.size(0); batch += blockDim.y) {
+    for (int x = threadIdx.x; x < input.size(2); x += blockDim.x) {
+      stat_accscalar_t v = input[batch][plane][x];
+      stat_accscalar_t d1 = v - avg;
+      n++;
+      avg += d1 / n;
+      var_n += d1 * (v - avg);
+    }
+  }
+
+  // first warpSum to get one value per thread to
+  // one value per warp
+  for (int i = 0; i < getMSB(C10_WARP_SIZE); ++i) {
+    stat_accscalar_t o_avg = WARP_SHFL_XOR(avg, 1 << i, C10_WARP_SIZE);
+    int o_n = WARP_SHFL_XOR(n, 1 << i, C10_WARP_SIZE);
+    stat_accscalar_t factor = 1.0 / fmaxf(1.0, n+o_n);
+    var_n += WARP_SHFL_XOR(var_n, 1 << i, C10_WARP_SIZE) + (avg - o_avg) * (avg - o_avg) * n * o_n * factor;
+    avg = (n * avg + o_n * o_avg) * factor;
+    n += o_n;
+  }
+
+  // this writes each warp's item into shared memory
+  // there are at most C10_WARP_SIZE items left because
+  // there are at most C10_WARP_SIZE**2 threads at the beginning
+  __syncthreads();
+  if (tid % C10_WARP_SIZE == 0) {
+    shared_n[tid / C10_WARP_SIZE] = n;
+    shared_avg_var[tid / C10_WARP_SIZE * 2] = avg;
+    shared_avg_var[tid / C10_WARP_SIZE * 2 + 1] = var_n;
+  }
+  __syncthreads();
+  // now have a second warpSum to reduce the intermediate values
+  // from shared memory to a single number. The very first
+  // thread writes it to shared memory.
+
+  if (tid < C10_WARP_SIZE) {
+    n = (tid < blockDim.x * blockDim.y / C10_WARP_SIZE ? shared_n[tid] : 0);
+    avg = (tid < blockDim.x * blockDim.y  / C10_WARP_SIZE ? shared_avg_var[2 * tid] : stat_accscalar_t(0));
+    var_n = (tid < blockDim.x * blockDim.y  / C10_WARP_SIZE ? shared_avg_var[2 * tid + 1] : stat_accscalar_t(0));
+  }
+  for (int i = 0; i < getMSB(C10_WARP_SIZE); ++i) {
+    stat_accscalar_t o_avg = WARP_SHFL_XOR(avg, 1 << i, C10_WARP_SIZE);
+    int o_n = WARP_SHFL_XOR(n, 1 << i, C10_WARP_SIZE);
+    stat_accscalar_t factor = 1.0 / fmaxf(1.0, n+o_n);
+    var_n += WARP_SHFL_XOR(var_n, 1 << i, C10_WARP_SIZE) + (avg - o_avg) * (avg - o_avg) * n * o_n * factor;
+    avg = (n * avg + o_n * o_avg) * factor;
+    n += o_n;
+  }
+
+  // Save the mean, variance, and moving averages
+  if (tid == 0) {
+    if (save_mean.data() != NULL) {
+      save_mean[plane] = avg;
+    }
+    if (save_transformed_var.data() != NULL) {
+      save_transformed_var[plane] = VarTransform{}(var_n / N, epsilon);
+    }
+  }
+
+}
+
+template 
+__global__ void batch_norm_backward_kernel(
+    const GenericPackedTensorAccessor input,
+    const GenericPackedTensorAccessor grad_output,
+    GenericPackedTensorAccessor grad_input,
+    GenericPackedTensorAccessor grad_weight,
+    GenericPackedTensorAccessor grad_bias,
+    const GenericPackedTensorAccessor weight,
+    const GenericPackedTensorAccessor running_mean,
+    const GenericPackedTensorAccessor running_var,
+    const GenericPackedTensorAccessor save_mean,
+    const GenericPackedTensorAccessor save_invstd,
+    bool train,
+    stat_accscalar_t epsilon) {
+
+  index_t plane = blockIdx.x;
+  index_t N = grad_output.size(0) * grad_output.size(2);
+
+  stat_accscalar_t mean, invstd;
+  if (train) {
+    mean = save_mean[plane];
+    invstd = save_invstd[plane];
+  } else {
+    mean = static_cast(running_mean[plane]);
+    invstd = static_cast(1) / device_sqrt(static_cast(running_var[plane]) + epsilon);
+  }
+
+  stat_accscalar_t weight_val = weight.size(0) > 0 ? static_cast(weight[plane]) : stat_accscalar_t(1);
+  stat_accscalar_t norm = stat_accscalar_t(1) / N;
+
+  // Compute two values across (batch, x/y/z) in one pass:
+  // 1. Sum(grad_output)
+  // 2. DotProduct(input - mean, grad_output)
+  GradOp> g(mean, input, grad_output);
+  auto res = reduce>(g, grad_output, plane);
+
+  stat_accscalar_t grad_output_sum = res.v1;
+  stat_accscalar_t dot_p = res.v2;
+
+  stat_accscalar_t grad_mean = grad_output_sum * norm;
+  stat_accscalar_t proj_scale = dot_p * norm * invstd * invstd;
+  stat_accscalar_t grad_scale = invstd * weight_val;
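+  // In training mode the loop below evaluates the standard batch-norm input
+  // gradient dx = (dy - mean(dy) - x_hat * mean(dy * x_hat)) * invstd * weight,
+  // with x_hat = (x - mean) * invstd; proj_scale and grad_scale above fold the
+  // per-plane statistics into two constants so the inner loop stays cheap.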
+
+  if (grad_input.data() != NULL) {
+    for (int batch = threadIdx.y; batch < grad_output.size(0); batch += blockDim.y) {
+      for (int x = threadIdx.x; x < grad_output.size(2); x += blockDim.x) {
+        input_scalar_t go = grad_output[batch][plane][x];
+        if (train) {
+          stat_accscalar_t inp = input[batch][plane][x];
+          stat_accscalar_t proj = (inp - mean) * proj_scale;
+          grad_input[batch][plane][x] = static_cast((go - proj - grad_mean) * grad_scale);
+        } else {
+          grad_input[batch][plane][x] = static_cast(go * grad_scale);
+        }
+      }
+    }
+  }
+
+  if (grad_weight.size(0) > 0) {
+    if (threadIdx.x == 0) {
+      grad_weight[plane] = static_cast(dot_p * invstd);
+    }
+  }
+
+  if (grad_bias.size(0) > 0) {
+    if (threadIdx.x == 0) {
+      grad_bias[plane] = static_cast(grad_output_sum);
+    }
+  }
+}
+
+template 
+__global__ void batch_norm_reduce_statistics_kernel(
+    const GenericPackedTensorAccessor vec_mean,
+    const GenericPackedTensorAccessor vec_invstd,
+    GenericPackedTensorAccessor mean,
+    GenericPackedTensorAccessor invstd,
+    GenericPackedTensorAccessor running_mean,
+    GenericPackedTensorAccessor running_var,
+    const accscalar_t epsilon,
+    const accscalar_t momentum,
+    const GenericPackedTensorAccessor counts) {
+
+  int feature_size = vec_mean.size(1);
+  int world_size = vec_mean.size(0);
+
+  int bid = blockIdx.x;
+  int tid = threadIdx.x;
+
+  // first the reductions each thread does separately
+  for (int i = bid*blockDim.x+tid; i < feature_size; i += gridDim.x*blockDim.x) {
+    accscalar_t avg = 0;
+    accscalar_t var_n = 0;
+    index_t n = 0;
+    for (int j = 0; j < world_size; j++) {
+      scalar_t count = counts[j];
+      accscalar_t m = vec_mean[j][i];
+      accscalar_t v = accscalar_t(1.0) / (vec_invstd[j][i]);
+      v = (v * v - epsilon) * count;
+      accscalar_t factor = 1.0 / (n + count);
+      var_n += v + (avg - m) * (avg - m) * n * count * factor;
+      avg = n * factor * avg + count * factor * m;
+      n += count;
+    }
+    mean[i] = avg;
+    invstd[i] = static_cast(1) / device_sqrt(var_n / n + epsilon);
+    if (running_mean.data() != NULL) {
+      running_mean[i] = static_cast((1 - momentum) * running_mean[i] + momentum * avg);
+    }
+    accscalar_t unbiasedVar = var_n / (n - 1);
+    if (running_var.data() != NULL) {
+      running_var[i] = static_cast((1 - momentum) * running_var[i] + momentum * unbiasedVar);
+    }
+  }
+
+}
+
+template 
+__global__ void batch_norm_backward_reduce_kernel(
+    const GenericPackedTensorAccessor input,
+    const GenericPackedTensorAccessor grad_output,
+    GenericPackedTensorAccessor mean,
+    GenericPackedTensorAccessor invstd,
+    GenericPackedTensorAccessor sum_dy,
+    GenericPackedTensorAccessor sum_dy_xmu,
+    GenericPackedTensorAccessor grad_weight,
+    GenericPackedTensorAccessor grad_bias) {
+
+  index_t plane = blockIdx.x;
+
+  stat_accscalar_t r_mean = mean[plane];
+  stat_accscalar_t factor = invstd[plane];
+
+  GradOp> g(r_mean, input, grad_output);
+  auto res = reduce>(g, grad_output, plane);
+
+  if (threadIdx.x == 0) {
+    if (grad_weight.size(0) > 0) {
+      grad_weight[plane] = static_cast(res.v2 * factor);
+    }
+    if (grad_bias.size(0) > 0) {
+      grad_bias[plane] = static_cast(res.v1);
+    }
+    if (sum_dy.size(0) > 0) {
+      sum_dy[plane] = static_cast(res.v1);
+    }
+    if (sum_dy_xmu.size(0) > 0) {
+      sum_dy_xmu[plane] = static_cast(res.v2);
+    }
+  }
+}
+
+template 
+__device__ __forceinline__ void batch_norm_backward_elemt_kernel_impl(
+    const GenericPackedTensorAccessor input,
+    const GenericPackedTensorAccessor grad_output,
+    const GenericPackedTensorAccessor mean,
+    const GenericPackedTensorAccessor invstd,
+    const GenericPackedTensorAccessor weight,
+    const GenericPackedTensorAccessor sum_dy,
+    const GenericPackedTensorAccessor sum_dy_xmu,
+    GenericPackedTensorAccessor grad_input,
+    const stat_accscalar_t norm_fct) {
+  index_t plane = blockIdx.x;
+
+  if (plane >= input.size(1)) {
+    return;
+  }
+
+  stat_accscalar_t m_c = mean[plane];
+  stat_accscalar_t m_dy_c = sum_dy[plane] * norm_fct;
+  stat_accscalar_t factor_1_c = invstd[plane];
+  stat_accscalar_t factor_2_c = weight.size(0) > 0 ? static_cast(weight[plane]) : stat_accscalar_t(1);
+  factor_2_c *= factor_1_c;
+  factor_1_c = factor_1_c * factor_1_c * sum_dy_xmu[plane] * norm_fct;
+
+  index_t bs = input.size(0);
+  index_t fs = input.size(2);
+
+  index_t bstep  = blockDim.y * gridDim.y;
+  for (index_t batch = threadIdx.y + blockIdx.y * blockDim.y; batch < bs; batch += bstep) {
+    auto g_i = grad_input[batch][plane];
+    auto g_o = grad_output[batch][plane];
+    auto i = input[batch][plane];
+    for (index_t feature = threadIdx.x; feature < fs; feature += blockDim.x) {
+      g_i[feature] = static_cast((g_o[feature] - m_dy_c - (i[feature] - m_c) * factor_1_c) * factor_2_c);
+    }
+  }
+}
+
+template 
+__global__ void batch_norm_backward_elemt_kernel(
+    const GenericPackedTensorAccessor input,
+    const GenericPackedTensorAccessor grad_output,
+    const GenericPackedTensorAccessor mean,
+    const GenericPackedTensorAccessor invstd,
+    const GenericPackedTensorAccessor weight,
+    const GenericPackedTensorAccessor sum_dy,
+    const GenericPackedTensorAccessor sum_dy_xmu,
+    GenericPackedTensorAccessor grad_input,
+    const int* __restrict__ numel, const int world_size) {
+  int64_t total_numel = 0;
+  for (int i = 0; i < world_size; i ++) {
+    total_numel += numel[i];
+  }
+
+  const stat_accscalar_t norm_fct =
+      static_cast(1) / static_cast(total_numel);
+  batch_norm_backward_elemt_kernel_impl(
+      input, grad_output, mean, invstd, weight, sum_dy, sum_dy_xmu, grad_input, norm_fct);
+}
+
+template 
+__global__ void batch_norm_backward_elemt_kernel(
+    const GenericPackedTensorAccessor input,
+    const GenericPackedTensorAccessor grad_output,
+    const GenericPackedTensorAccessor mean,
+    const GenericPackedTensorAccessor invstd,
+    const GenericPackedTensorAccessor weight,
+    const GenericPackedTensorAccessor sum_dy,
+    const GenericPackedTensorAccessor sum_dy_xmu,
+    GenericPackedTensorAccessor grad_input,
+    const stat_accscalar_t norm_fct) {
+  batch_norm_backward_elemt_kernel_impl(
+      input, grad_output, mean, invstd, weight, sum_dy, sum_dy_xmu, grad_input, norm_fct);
+}
+
+template <typename scalar_t, int64_t dim, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+static GenericPackedTensorAccessor<scalar_t, dim, PtrTraits, index_t> get_packed_accessor(
+    const Tensor& t, c10::string_view var_name) {
+  constexpr auto expect_type = c10::CppTypeToScalarType<typename std::remove_const<scalar_t>::type>::value;
+  const auto actual_type = t.scalar_type();
+  TORCH_CHECK(actual_type == expect_type, "Expected ", var_name,
+              " to have type ", expect_type, " but got ", actual_type);
+  return t.generic_packed_accessor<scalar_t, dim, PtrTraits, index_t>();
+}
+
+template <typename scalar_t, int64_t dim, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+static GenericPackedTensorAccessor<scalar_t, dim, PtrTraits, index_t> packed_accessor_or_dummy(
+    const Tensor& t, c10::string_view var_name) {
+  if (!t.defined()) {
+    const std::array<index_t, dim> zeros{{0}};
+    return GenericPackedTensorAccessor<scalar_t, dim, PtrTraits, index_t>(nullptr, zeros.data(), zeros.data());
+  }
+  return get_packed_accessor<scalar_t, dim, PtrTraits, index_t>(t, var_name);
+}
+
+template
+std::tuple batch_norm_backward_cuda_template(const Tensor& grad_out_, const Tensor& input_, const Tensor& weight_,
+                                                                     const Tensor& running_mean_, const Tensor& running_var_, const Tensor& save_mean_, const Tensor& save_invstd_,
+                                                                     bool train, double epsilon, std::array grad_input_mask) {
+
+  using accscalar_t = at::acc_type;
+  Tensor grad_input_;
+  Tensor grad_input_reshaped;
+  Tensor grad_weight_;
+  Tensor grad_bias_;
+  auto input_reshaped = input_.reshape({input_.size(0), input_.size(1), -1});
+  auto grad_output_reshaped = grad_out_.reshape(input_reshaped.sizes());
+
+  if (grad_input_mask[0]) {
+    grad_input_ = at::empty_like(input_, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+    grad_input_reshaped = grad_input_.view(input_reshaped.sizes());
+  }
+  if (grad_input_mask[1]) {
+    grad_weight_ = at::empty_like(weight_, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+  }
+  if (grad_input_mask[2]) {
+    grad_bias_ = at::empty_like(weight_, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+  }
+
+  auto input = get_packed_accessor<
+      input_scalar_t, 3, DefaultPtrTraits, index_t>(input_reshaped, "input");
+  auto grad_output = get_packed_accessor<
+      input_scalar_t, 3, DefaultPtrTraits, index_t>(grad_output_reshaped, "grad_output");
+  auto grad_input = packed_accessor_or_dummy<
+      input_scalar_t, 3, DefaultPtrTraits, index_t>(grad_input_reshaped, "grad_input");
+  auto weight = packed_accessor_or_dummy<
+      stat_scalar_t, 1, DefaultPtrTraits, index_t>(weight_, "weight");
+  auto grad_weight = packed_accessor_or_dummy<
+      stat_scalar_t, 1, DefaultPtrTraits, index_t>(grad_weight_, "grad_weight");
+  auto grad_bias = packed_accessor_or_dummy<
+      stat_scalar_t, 1, DefaultPtrTraits, index_t>(grad_bias_, "grad_bias");
+  auto running_mean = packed_accessor_or_dummy<
+      stat_scalar_t, 1, DefaultPtrTraits, index_t>(running_mean_, "running_mean");
+  auto running_var = packed_accessor_or_dummy<
+      stat_scalar_t, 1, DefaultPtrTraits, index_t>(running_var_, "running_var");
+  auto save_mean = packed_accessor_or_dummy<
+      accscalar_t, 1, DefaultPtrTraits, index_t>(save_mean_, "save_mean");
+  auto save_invstd = packed_accessor_or_dummy<
+      accscalar_t, 1, DefaultPtrTraits, index_t>(save_invstd_, "save_invstd");
+
+  auto stream = at::cuda::getCurrentCUDAStream();
+  dim3 blocks(input.size(1));
+  int tf = getNumThreads(input.size(2));
+  dim3 threads(tf, std::max(1, MAX_BLOCK_SIZE/tf));
+
+  batch_norm_backward_kernel <<>>
+    (input, grad_output, grad_input, grad_weight, grad_bias, weight, running_mean, running_var,
+     save_mean, save_invstd, train, epsilon);
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+  return std::make_tuple(grad_input_, grad_weight_, grad_bias_);
+}
+
+template
+void batch_norm_stats_cuda_template(
+    const Tensor& out_mean, const Tensor& out_invstd, const Tensor& input_, double epsilon) {
+
+  using accscalar_t = at::acc_type;
+  int64_t n_input = input_.size(1);
+  Tensor dummy_mean_;
+  Tensor dummy_var_;
+  auto input_reshaped = input_.reshape({input_.size(0), input_.size(1), -1}); // internally we merge the feature dimensions
+
+  resize_output(out_mean, {n_input});
+  resize_output(out_invstd, {n_input});
+  auto input = get_packed_accessor<
+      scalar_t, 3, RestrictPtrTraits, index_t>(input_reshaped, "input");
+  TORCH_INTERNAL_ASSERT(out_invstd.dim() == 1 && out_invstd.is_contiguous() &&
+                        out_invstd.sizes()[0]);
+  TORCH_INTERNAL_ASSERT(out_mean.dim() == 1 && out_mean.is_contiguous() &&
+                        out_mean.sizes()[0]);
+
+  auto mean = packed_accessor_or_dummy<
+      accscalar_t, 1, RestrictPtrTraits, index_t>(out_mean, "out_mean");
+  auto invstd = packed_accessor_or_dummy<
+      accscalar_t, 1, RestrictPtrTraits, index_t>(out_invstd, "out_invstd");
+  auto stream = at::cuda::getCurrentCUDAStream();
+
+  dim3 blocks(input.size(1));
+  int tf = getNumThreads(input.size(2));
+  dim3 threads(tf, std::max(1, MAX_BLOCK_SIZE/tf));
+  batch_norm_collect_statistics_kernel <<>>
+    (input, epsilon, 0.0, mean, invstd);
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+}
+
+template
+void batch_norm_elemt_cuda_template(const Tensor& output_, const Tensor& input_, const Tensor& weight_,
+                                    const Tensor& bias_, const Tensor& mean_, const Tensor& invstd_) {
+
+  using stat_accscalar_t = at::acc_type;
+  int64_t n_input = input_.size(1);
+  auto input_reshaped = input_.reshape({input_.size(0), input_.size(1), -1}); // internally we merge the feature dimensions
+  auto output_reshaped = output_.view({input_.size(0), input_.size(1), -1});
+
+  auto input = get_packed_accessor<
+      input_scalar_t, 3, RestrictPtrTraits, index_t>(input_reshaped, "input");
+  auto output = get_packed_accessor<
+      input_scalar_t, 3, RestrictPtrTraits, index_t>(output_reshaped, "output");
+  auto weight = packed_accessor_or_dummy<
+    stat_scalar_t, 1, RestrictPtrTraits, index_t>(weight_, "weight");
+  auto bias = packed_accessor_or_dummy<
+      stat_scalar_t, 1, RestrictPtrTraits, index_t>(bias_, "bias");
+  auto mean = packed_accessor_or_dummy<
+      stat_accscalar_t, 1, RestrictPtrTraits, index_t>(mean_, "mean");
+  auto invstd = packed_accessor_or_dummy<
+      stat_accscalar_t, 1, RestrictPtrTraits, index_t>(invstd_, "invstd");
+  auto stream = at::cuda::getCurrentCUDAStream();
+
+  // NOTE: We use transform_input_kernel in training mode, which ignores epsilon
+  const double dummy_epsilon = 1e-5;
+
+  // The input_transform kernel is pointwise, but we need to balance reading parameters (save_var/mean,
+  // weight/bias) - which we only do once and have a for loop afterwards - with having many threads and blocks
+  // and good occupancy. Quite likely, we could go with even more blocks than 1024.
+  // The various planes are independent, so we use blocks for them.
+  int tf = std::max(getNumThreads(input.size(2)/4),
+                         std::min(getNumThreads(input.size(2)), 64));
+  int tb = std::max(64/tf, 1);
+  dim3 blocks_trans(input.size(1), std::max(1, std::min((256*1024)/input.size(1),
+                                                                  (input.size(0)+tb-1)/tb)));
+  blocks_trans.y = std::min(blocks_trans.y, MAX_GRID_SIZE);
+  dim3 threads_trans(tf, tb);
+  batch_norm_transform_input_kernel <<>>
+    (input, output, mean, invstd, weight, bias, dummy_epsilon);
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+}
+
+template
+std::tuple batch_norm_gather_stats_cuda_template(const Tensor& mean_, const Tensor& invstd_,
+                                                                 const Tensor& running_mean_, const Tensor& running_var_,
+                                                                 double momentum, double epsilon, const Tensor& counts_) {
+
+  Tensor save_mean_;
+  Tensor save_invstd_;
+
+  auto features = mean_.size(1);
+  auto input_options = mean_.options();
+  if (mean_.scalar_type() == at::ScalarType::Half || mean_.scalar_type() == at::ScalarType::BFloat16) {
+    input_options = input_options.dtype(ScalarType::Float);
+  }
+  save_mean_ = at::empty({features}, input_options);
+  save_invstd_ = at::empty({features}, input_options);
+
+  auto mean = packed_accessor_or_dummy<
+      accscalar_t, 2, RestrictPtrTraits, index_t>(mean_, "mean");
+  auto invstd = packed_accessor_or_dummy<
+      accscalar_t, 2, RestrictPtrTraits, index_t>(invstd_, "invstd");
+  auto running_mean = packed_accessor_or_dummy<
+      scalar_t, 1, RestrictPtrTraits, index_t>(running_mean_, "running_mean");
+  auto running_var = packed_accessor_or_dummy<
+      scalar_t, 1, RestrictPtrTraits, index_t>(running_var_, "running_var");
+  auto counts = packed_accessor_or_dummy<
+      scalar_t, 1, RestrictPtrTraits, index_t>(counts_, "counts");
+
+  auto save_mean = get_packed_accessor<
+      accscalar_t, 1, RestrictPtrTraits, index_t>(save_mean_, "save_mean");
+  auto save_invstd = get_packed_accessor<
+      accscalar_t, 1, RestrictPtrTraits, index_t>(save_invstd_, "save_invstd");
+  auto stream = at::cuda::getCurrentCUDAStream();
+
+  int block = getNumThreads(features);
+  int grid = std::max(1, features/block);
+  batch_norm_reduce_statistics_kernel <<>>
+      (mean, invstd, save_mean, save_invstd, running_mean, running_var, epsilon, momentum, counts);
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+  return std::make_tuple(save_mean_, save_invstd_);
+}
+
+template
+std::tuple batch_norm_backward_reduce_cuda_template(const Tensor& grad_out_, const Tensor& input_,
+                                                                                    const Tensor& mean_, const Tensor& invstd_, const Tensor& weight_,
+                                                                                    const bool input_g, const bool weight_g, const bool bias_g) {
+
+  using stat_accscalar_t = at::acc_type;
+  int64_t n_input = input_.size(1);
+  Tensor sum_dy_;
+  Tensor sum_dy_xmu_;
+  Tensor grad_weight_;
+  Tensor grad_bias_;
+  auto input_reshaped = input_.reshape({input_.size(0), input_.size(1), -1}); // internally we merge the feature dimensions
+  auto grad_output_reshaped = grad_out_.reshape(input_reshaped.sizes());
+
+  if (input_g) {
+    sum_dy_ = at::empty_like(mean_, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+    sum_dy_xmu_ = at::empty_like(mean_, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+  }
+  if (weight_g) {
+    grad_weight_ = at::empty({n_input}, weight_.options());
+  }
+  if (bias_g) {
+    grad_bias_ = at::empty({n_input}, weight_.options());
+  }
+
+  auto input = get_packed_accessor<
+      input_scalar_t, 3, DefaultPtrTraits, index_t>(input_reshaped, "input");
+  auto grad_output = get_packed_accessor<
+      input_scalar_t, 3, DefaultPtrTraits, index_t>(grad_output_reshaped, "grad_output");
+  auto grad_weight = packed_accessor_or_dummy<
+      stat_scalar_t, 1, DefaultPtrTraits, index_t>(grad_weight_, "grad_weight");
+  auto grad_bias = packed_accessor_or_dummy<
+      stat_scalar_t, 1, DefaultPtrTraits, index_t>(grad_bias_, "grad_bias");
+  auto mean = packed_accessor_or_dummy<
+      stat_accscalar_t, 1, DefaultPtrTraits, index_t>(mean_, "mean");
+  auto invstd = packed_accessor_or_dummy<
+      stat_accscalar_t, 1, DefaultPtrTraits, index_t>(invstd_, "invstd");
+  auto sum_dy = packed_accessor_or_dummy<
+      stat_accscalar_t, 1, DefaultPtrTraits, index_t>(sum_dy_, "sum_dy");
+  auto sum_dy_xmu = packed_accessor_or_dummy<
+      stat_accscalar_t, 1, DefaultPtrTraits, index_t>(sum_dy_xmu_, "sum_dy_xmu");
+
+  auto batch_size = input_reshaped.size(0);
+  auto feature_size = input_reshaped.size(2);
+  auto stream = at::cuda::getCurrentCUDAStream();
+
+  int warp_size = at::cuda::warp_size();
+  int block_y = std::min(lastPow2(batch_size), MAX_BLOCK_SIZE/warp_size);
+  // We want block_x to be at least a warp width
+  int block_x = std::min(std::max(getNumThreads(feature_size), warp_size), MAX_BLOCK_SIZE/block_y);
+  const dim3 block(block_x, block_y);
+  const dim3 grid(n_input);
+
+  batch_norm_backward_reduce_kernel <<>>
+    (input, grad_output, mean, invstd, sum_dy, sum_dy_xmu, grad_weight, grad_bias);
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+  return std::make_tuple(sum_dy_, sum_dy_xmu_, grad_weight_, grad_bias_);
+}
+
+template
+Tensor batch_norm_backward_elemt_cuda_template(const Tensor& grad_out_, const Tensor& input_,
+                                               const Tensor& mean_, const Tensor& invstd_,
+                                               const Tensor& weight_, const Tensor& sum_dy_, const Tensor& sum_dy_xmu_) {
+
+  using stat_accscalar_t = at::acc_type;
+  int64_t n_input = input_.size(1);
+  auto input_reshaped = input_.reshape({input_.size(0), input_.size(1), -1}); // internally we merge the feature dimensions
+  auto grad_output_reshaped = grad_out_.reshape(input_reshaped.sizes());
+  auto grad_input_reshaped = at::empty_like(input_reshaped, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+
+  auto input = get_packed_accessor<
+      input_scalar_t, 3, DefaultPtrTraits, index_t>(input_reshaped, "input");
+  auto grad_input = get_packed_accessor<
+      input_scalar_t, 3, DefaultPtrTraits, index_t>(grad_input_reshaped, "grad_input");
+  auto grad_output = get_packed_accessor<
+      input_scalar_t, 3, DefaultPtrTraits, index_t>(grad_output_reshaped, "grad_output");
+  auto mean = packed_accessor_or_dummy<
+      stat_accscalar_t, 1, DefaultPtrTraits, index_t>(mean_, "mean");
+  auto invstd = packed_accessor_or_dummy<
+      stat_accscalar_t, 1, DefaultPtrTraits, index_t>(invstd_, "invstd");
+  auto weight = packed_accessor_or_dummy<
+      stat_scalar_t, 1, DefaultPtrTraits, index_t>(weight_, "weight");
+  auto sum_dy = packed_accessor_or_dummy<
+      stat_accscalar_t, 1, DefaultPtrTraits, index_t>(sum_dy_, "sum_dy");
+  auto sum_dy_xmu = packed_accessor_or_dummy<
+      stat_accscalar_t, 1, DefaultPtrTraits, index_t>(sum_dy_xmu_, "sum_dy_xmu");
+
+  auto stream = at::cuda::getCurrentCUDAStream();
+
+  // The kernel is pointwise, but we need to balance reading parameters (save_var/mean,
+  // weight/bias) - which we only do once and have a for loop afterwards - with having many threads and blocks
+  // and good occupancy. Quite likely, we could go with even more blocks than 1024.
+  // The various planes are independent, so we use blocks for them.
+  int tf = std::max(getNumThreads(input.size(2)/4),
+                         std::min(getNumThreads(input.size(2)), 64));
+  int tb = std::max(64/tf, 1);
+  dim3 blocks_trans(input.size(1), std::max(1, std::min((256*1024)/input.size(1),
+                                                                  (input.size(0)+tb-1)/tb)));
+  blocks_trans.y = std::min(blocks_trans.y, MAX_GRID_SIZE);
+  dim3 threads_trans(tf, tb);
+  auto reduction_size = input_.numel() / n_input;
+  auto norm_fct = static_cast(1.0 / reduction_size);
+  batch_norm_backward_elemt_kernel
+      <<>>
+      (input, grad_output, mean, invstd, weight, sum_dy, sum_dy_xmu, grad_input, norm_fct);
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+  return grad_input_reshaped.view(input_.sizes());
+}
+
+template
+Tensor batch_norm_backward_elemt_cuda_template(const Tensor& grad_out_, const Tensor& input_,
+                                               const Tensor& mean_, const Tensor& invstd_,
+                                               const Tensor& weight_, const Tensor& sum_dy_, const Tensor& sum_dy_xmu_, const Tensor& count) {
+
+  using stat_accscalar_t = at::acc_type;
+  int64_t n_input = input_.size(1);
+  auto input_reshaped = input_.reshape({input_.size(0), input_.size(1), -1}); // internally we merge the feature dimensions
+  auto grad_output_reshaped = grad_out_.reshape(input_reshaped.sizes());
+  auto grad_input_reshaped = at::empty_like(input_reshaped, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+
+  auto input = get_packed_accessor<
+      input_scalar_t, 3, DefaultPtrTraits, index_t>(input_reshaped, "input");
+  auto grad_input = get_packed_accessor<
+      input_scalar_t, 3, DefaultPtrTraits, index_t>(grad_input_reshaped, "grad_input");
+  auto grad_output = get_packed_accessor<
+      input_scalar_t, 3, DefaultPtrTraits, index_t>(grad_output_reshaped, "grad_output");
+  auto mean = packed_accessor_or_dummy<
+      stat_accscalar_t, 1, DefaultPtrTraits, index_t>(mean_, "mean");
+  auto invstd = packed_accessor_or_dummy<
+      stat_accscalar_t, 1, DefaultPtrTraits, index_t>(invstd_, "invstd");
+  auto weight = packed_accessor_or_dummy<
+      stat_scalar_t, 1, DefaultPtrTraits, index_t>(weight_, "weight");
+  auto sum_dy = packed_accessor_or_dummy<
+      stat_accscalar_t, 1, DefaultPtrTraits, index_t>(sum_dy_, "sum_dy");
+  auto sum_dy_xmu = packed_accessor_or_dummy<
+      stat_accscalar_t, 1, DefaultPtrTraits, index_t>(sum_dy_xmu_, "sum_dy_xmu");
+
+  auto stream = at::cuda::getCurrentCUDAStream();
+
+  // The kernel is pointwise, but we need to balance reading parameters (save_var/mean,
+  // weight/bias) - which we only do once and have a for loop afterwards - with having many threads and blocks
+  // and good occupancy. Quite likely, we could go with even more blocks than 1024.
+  // The various planes are independent, so we use blocks for them.
+  int tf = std::max(getNumThreads(input.size(2)/4),
+                         std::min(getNumThreads(input.size(2)), 64));
+  int tb = std::max(64/tf, 1);
+  dim3 blocks_trans(input.size(1), std::max(1, std::min((256*1024)/input.size(1),
+                                                                  (input.size(0)+tb-1)/tb)));
+  blocks_trans.y = std::min(blocks_trans.y, MAX_GRID_SIZE);
+  dim3 threads_trans(tf, tb);
+  batch_norm_backward_elemt_kernel <<>>
+    (input, grad_output, mean, invstd, weight, sum_dy, sum_dy_xmu, grad_input, count.const_data_ptr(), count.numel());
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+  return grad_input_reshaped.view(input_.sizes());
+}
+
+// welford kernel for c last tensor calculating mean/biased_variance/unbiased_variance
+// original apex name: welford_kernel_c_last
+template
+   <typename VarTransform,
+    typename scalar_t,
+    typename accscalar_t,
+    int PARALLEL_LOADS>
+__global__ void
+batch_norm_collect_statistics_channels_last_kernel(
+      const scalar_t* __restrict__ input,
+      accscalar_t* __restrict__ out_mean,
+      accscalar_t* __restrict__ out_invstd,
+      volatile accscalar_t* staging_data,
+      int* semaphores,
+      const int reduction_size,
+      const int stride,
+      accscalar_t epsilon) {
+  // hide latency with concurrency
+  accscalar_t x_mean[PARALLEL_LOADS];
+  accscalar_t m_2_n[PARALLEL_LOADS];
+  int count[PARALLEL_LOADS];
+
+#pragma unroll
+  for (int i = 0; i < PARALLEL_LOADS; i++) {
+    x_mean[i] = accscalar_t(0);
+    m_2_n[i] = accscalar_t(0);
+    count[i] = accscalar_t(0);
+  }
+  // tensor dimension (m,c)
+
+  // loop along m dimension
+  int inner_loop_stride = blockDim.y * gridDim.y;
+
+  // offset along m dimension
+  int m_offset = blockIdx.y * blockDim.y + threadIdx.y;
+  int c_offset = blockIdx.x * blockDim.x + threadIdx.x;
+
+  int loop_count = 1 + (reduction_size - 1) / (inner_loop_stride * PARALLEL_LOADS);
+  int address_base = m_offset * stride + c_offset;
+  int address_increment = inner_loop_stride * stride;
+
+  for (int i = 0; i < loop_count; i++) {
+    accscalar_t x_math[PARALLEL_LOADS];
+    accscalar_t x_count_inv[PARALLEL_LOADS];
+    accscalar_t is_valid[PARALLEL_LOADS];
+
+    // load multiple data in
+#pragma unroll
+    for (int j = 0; j < PARALLEL_LOADS; j++) {
+      if (c_offset < stride && m_offset < reduction_size) {
+        x_math[j] = input[address_base];
+        count[j]++;
+        x_count_inv[j] = accscalar_t(1) / count[j];
+        is_valid[j] = accscalar_t(1);
+      } else {
+        x_math[j] = accscalar_t(0);
+        x_count_inv[j] = accscalar_t(0);
+        is_valid[j] = accscalar_t(0);
+      }
+      m_offset += inner_loop_stride;
+      address_base += address_increment;
+    }
+
+    // calculate mean/m2n with welford
+#pragma unroll
+    for (int j = 0; j < PARALLEL_LOADS; j++) {
+      accscalar_t delta0 = x_math[j] - x_mean[j];
+      x_mean[j] += delta0 * x_count_inv[j];
+      accscalar_t delta1 = x_math[j] - x_mean[j];
+      m_2_n[j] += delta0 * delta1 * is_valid[j];
+    }
+  }
+
+  // thread reduction to accumulate mean/m_2_n/count between PARALLEL_LOADS
+#pragma unroll
+  for (int j = 1; j < PARALLEL_LOADS; j++) {
+    welford_merge_element(count[0], x_mean[0], m_2_n[0], count[j], x_mean[j], m_2_n[j]);
+  }
+
+  // release x_mean / m_2_n
+  auto mean_th = x_mean[0];
+  auto m2_th = m_2_n[0];
+  auto count_th = count[0];
+
+  // block-wise reduction with shared memory (since reduction cannot be done within a warp)
+  static __shared__ accscalar_t shmem_mean[MAX_BLOCK_SIZE];
+  static __shared__ accscalar_t shmem_m2n[MAX_BLOCK_SIZE];
+  static __shared__ int shmem_count[MAX_BLOCK_SIZE];
+
+  welford_merge_block_vertical(count_th, mean_th, m2_th, shmem_count, shmem_mean, shmem_m2n);
+
+  if (gridDim.y > 1) {
+    volatile accscalar_t* staging_mean = staging_data;
+    volatile accscalar_t* staging_m2n = &staging_data[stride*gridDim.y];
+    volatile int* staging_count = reinterpret_cast(&staging_m2n[stride*gridDim.y]);
+
+    address_base = c_offset + blockIdx.y * stride;
+    // write data to staging_data;
+    if (threadIdx.y == 0 && c_offset < stride) {
+      staging_mean[address_base] = mean_th;
+      staging_m2n[address_base] = m2_th;
+      staging_count[address_base] = count_th;
+    }
+
+    __threadfence();
+    __syncthreads(); // ensuring writes to staging_ are visible to all blocks
+
+    __shared__ bool is_last_block_done;
+    // mark block done
+    if (threadIdx.x == 0 && threadIdx.y == 0) {
+      int old = atomicAdd(&semaphores[blockIdx.x], 1);
+      is_last_block_done = (old == (gridDim.y-1));
+    }
+
+    __syncthreads();
+
+    // check that all data is now available in global memory
+    if (is_last_block_done) {
+      count_th = 0;
+      mean_th = accscalar_t(0.0);
+      m2_th = accscalar_t(0.0);
+
+      for (int y = threadIdx.y; y < gridDim.y; y += blockDim.y) {
+        address_base = c_offset + y * stride;
+        int count_new = c_offset < stride ? staging_count[address_base] : 0;
+        accscalar_t mean_new = c_offset < stride ? staging_mean[address_base] : accscalar_t(0.0);
+        accscalar_t m2n_new = c_offset < stride ? staging_m2n[address_base] : accscalar_t(0.0);
+
+        welford_merge_element(count_th, mean_th, m2_th, count_new, mean_new, m2n_new);
+      }
+
+      welford_merge_block_vertical(count_th, mean_th, m2_th, shmem_count, shmem_mean, shmem_m2n);
+      if (threadIdx.y == 0 && c_offset < stride) {
+        out_mean[c_offset] = static_cast<accscalar_t>(mean_th);
+        out_invstd[c_offset] = VarTransform{}(m2_th/count_th, epsilon);
+      }
+    }
+  } else {
+    if (blockIdx.y == 0 && threadIdx.y == 0 && c_offset < stride) {
+      out_mean[c_offset] = static_cast<accscalar_t>(mean_th);
+      out_invstd[c_offset] = VarTransform{}(m2_th/count_th, epsilon);
+    }
+  }
+}
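+
+// [Illustrative sketch, not part of the upstream kernel] welford_merge_element /
+// welford_merge_block_vertical above combine partial (count, mean, M2) triples.
+// A minimal host-side analog of that merge, with hypothetical names, in double precision:
+namespace welford_illustration {
+struct Partial { long long count; double mean; double m2; };
+// Chan et al. parallel update: merge two partial Welford accumulators.
+inline Partial merge(const Partial& a, const Partial& b) {
+  if (b.count == 0) return a;
+  if (a.count == 0) return b;
+  long long n = a.count + b.count;
+  double delta = b.mean - a.mean;
+  double nb_over_n = static_cast<double>(b.count) / static_cast<double>(n);
+  return {n,
+          a.mean + delta * nb_over_n,
+          a.m2 + b.m2 + delta * delta * static_cast<double>(a.count) * nb_over_n};
+}
+} // namespace welford_illustration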
+
+// elementwise BN kernel
+// original apex name: batchnorm_forward_c_last_kernel
+template <
+    typename scalar_t,
+    typename accscalar_t,
+    typename layerscalar_t,
+    int PARALLEL_LOADS>
+__global__ void batch_norm_transform_input_channels_last_kernel(
+      const scalar_t* __restrict__ input,
+      const scalar_t* __restrict__ z,
+      const accscalar_t* __restrict__ mean,
+      const accscalar_t* __restrict__ inv_std,
+      const layerscalar_t* __restrict__ weight,
+      const layerscalar_t* __restrict__ shift,
+      scalar_t* __restrict__ out,
+      const int reduction_size,
+      const int stride,
+      const bool fuse_relu) {
+  // tensor dimension (m,c)
+  // loop along m dimension
+  int inner_loop_stride = blockDim.y * gridDim.y;
+
+  // offset along m dimension
+  int m_offset = blockIdx.y * blockDim.y + threadIdx.y;
+  int c_offset = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (c_offset >= stride || m_offset >= reduction_size) {
+    return;
+  }
+
+  auto m_c = mean[c_offset];
+  auto inv_std_c = static_cast<accscalar_t>(inv_std[c_offset]);
+  auto w_c = weight == nullptr ? accscalar_t(1.0) : static_cast<accscalar_t>(weight[c_offset]);
+  auto s_c = shift == nullptr ? accscalar_t(0.0) : static_cast<accscalar_t>(shift[c_offset]);
+
+  int loop_count = 1 + (reduction_size - 1) / (inner_loop_stride * PARALLEL_LOADS);
+  int address_base = m_offset * stride + c_offset;
+  int address_increment = inner_loop_stride * stride;
+
+  for (int i = 0; i < loop_count; i++) {
+#pragma unroll
+    for (int j = 0; j < PARALLEL_LOADS; j++) {
+      if (c_offset < stride && m_offset < reduction_size) {
+        auto tmp = w_c * (static_cast<accscalar_t>(input[address_base]) - m_c ) * inv_std_c + s_c;
+        if (z != nullptr) {
+          tmp += z[address_base];
+        }
+        out[address_base] = (fuse_relu && tmp <= accscalar_t(0.0) ? scalar_t(0.0) : static_cast<scalar_t>(tmp));
+      }
+      m_offset += inner_loop_stride;
+      address_base += address_increment;
+    }
+  }
+}
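+
+// [Illustrative sketch, not part of upstream] Scalar reference for the per-element
+// transform applied by the kernel above (hypothetical helper, float-only):
+// y = w * (x - mean) * inv_std + b (+ z), optionally clamped at zero when fused with ReLU.
+namespace bn_transform_illustration {
+inline float transform_one(float x, float mean, float inv_std, float w, float b,
+                           float z, bool fuse_relu) {
+  float y = w * (x - mean) * inv_std + b + z;
+  return (fuse_relu && y <= 0.f) ? 0.f : y;
+}
+} // namespace bn_transform_illustration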
+
+template <typename T>
+__device__ __forceinline__ void merge_block_vertical_backward(T& sum_dy,
+    T& sum_dy_xmu,
+    T* shmem_sum_dy,
+    T* shmem_sum_dy_xmu) {
+  // write to shared memory
+  auto address_base = threadIdx.x + threadIdx.y * blockDim.x;
+
+#pragma unroll
+  for (int offset = blockDim.y/2; offset > 0; offset >>= 1) {
+    if (threadIdx.y < offset*2) {
+      shmem_sum_dy[address_base] = sum_dy;
+      shmem_sum_dy_xmu[address_base] = sum_dy_xmu;
+    }
+    __syncthreads();
+    if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) {
+      auto address = address_base + offset * blockDim.x;
+
+      sum_dy += shmem_sum_dy[address];
+      sum_dy_xmu += shmem_sum_dy_xmu[address];
+    }
+  }
+}
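+
+// [Illustrative sketch, not part of upstream] The loop above is a shared-memory
+// tree reduction over blockDim.y: the number of active rows halves each step.
+// Host analog (hypothetical helper), assuming a power-of-two length:
+namespace tree_reduce_illustration {
+inline double tree_sum(double* vals, int n) {
+  for (int offset = n / 2; offset > 0; offset /= 2) {
+    for (int i = 0; i < offset; ++i) {
+      vals[i] += vals[i + offset];  // fold the upper half onto the lower half
+    }
+  }
+  return vals[0];
+}
+} // namespace tree_reduce_illustration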
+
+// batchnorm backward kernel for c last tensor
+// original apex name: reduce_bn_c_last_kernel
+template <
+    int PARALLEL_LOADS,
+    typename scalar_t,
+    typename accscalar_t,
+    typename layerscalar_t>
+__global__ void batch_norm_backward_reduce_channels_last_kernel(
+      const scalar_t* __restrict__ input,
+      const scalar_t* __restrict__ grad_output,
+      const accscalar_t* __restrict__ mean,
+      const accscalar_t* __restrict__ inv_std,
+      accscalar_t* __restrict__ sum_dy_o,
+      accscalar_t* __restrict__ sum_dy_xmu_o,
+      layerscalar_t* __restrict__ grad_weight,
+      layerscalar_t* __restrict__ grad_bias,
+      volatile accscalar_t* staging_data,
+      int* semaphores,
+      const int reduction_size,
+      const int stride) {
+
+  // hide latency with concurrency
+  accscalar_t sum_dy[PARALLEL_LOADS];
+  accscalar_t sum_dy_xmu[PARALLEL_LOADS];
+
+#pragma unroll
+  for (int i = 0; i < PARALLEL_LOADS; i++) {
+    sum_dy[i] = accscalar_t(0);
+    sum_dy_xmu[i] = accscalar_t(0);
+  }
+  // tensor dimension (m,c)
+
+  // loop along m dimension
+  int inner_loop_stride = blockDim.y * gridDim.y;
+
+  // offset along m dimension
+  int m_offset = blockIdx.y * blockDim.y + threadIdx.y;
+  int c_offset = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (c_offset >= stride || m_offset >= reduction_size) {
+    return;
+  }
+
+  int loop_count = 1 + (reduction_size - 1) / (inner_loop_stride * PARALLEL_LOADS);
+  int address_base = m_offset * stride + c_offset;
+  int address_increment = inner_loop_stride * stride;
+
+  auto r_mean = mean[c_offset];
+  auto factor = inv_std[c_offset];
+
+  for (int i = 0; i < loop_count; i++) {
+    accscalar_t x_input[PARALLEL_LOADS];
+    accscalar_t x_grad_output[PARALLEL_LOADS];
+
+    // load multiple data in
+#pragma unroll
+    for (int j = 0; j < PARALLEL_LOADS; j++) {
+      if (c_offset < stride && m_offset < reduction_size) {
+        x_input[j] = input[address_base];
+        x_grad_output[j] = grad_output[address_base];
+      } else {
+        x_input[j] = accscalar_t(0);
+        x_grad_output[j] = accscalar_t(0);
+      }
+      m_offset += inner_loop_stride;
+      address_base += address_increment;
+    }
+
+    // calculate sum_dy / sum_dy_xmu
+#pragma unroll
+    for (int j = 0; j < PARALLEL_LOADS; j++) {
+      sum_dy[j] += x_grad_output[j];
+      sum_dy_xmu[j] += x_grad_output[j] * (x_input[j] - r_mean);
+    }
+  }
+
+  // thread reduction to accumulate sum_dy / sum_dy_xmu between PARALLEL_LOADS
+#pragma unroll
+  for (int j = 1; j < PARALLEL_LOADS; j++) {
+    sum_dy[0] += sum_dy[j];
+    sum_dy_xmu[0] += sum_dy_xmu[j];
+  }
+
+  // release array of registers
+  auto sum_dy_th = sum_dy[0];
+  auto sum_dy_xmu_th = sum_dy_xmu[0];
+
+  // block-wise reduction with shared memory (since reduction cannot be done within a warp)
+  static __shared__ accscalar_t shmem_sum_dy[MAX_BLOCK_SIZE];
+  static __shared__ accscalar_t shmem_sum_dy_xmu[MAX_BLOCK_SIZE];
+
+  merge_block_vertical_backward(sum_dy_th, sum_dy_xmu_th, shmem_sum_dy, shmem_sum_dy_xmu);
+
+  if (gridDim.y > 1) {
+    volatile accscalar_t* staging_sum_dy = staging_data;
+    volatile accscalar_t* staging_sum_dy_xmu = &staging_data[stride*gridDim.y];
+
+    address_base = c_offset + blockIdx.y * stride;
+    // write data to staging_data;
+    if (threadIdx.y == 0 && c_offset < stride) {
+      staging_sum_dy[address_base] = sum_dy_th;
+      staging_sum_dy_xmu[address_base] = sum_dy_xmu_th;
+    }
+
+    __threadfence();
+    __syncthreads(); // ensure writes to staging_data are visible to all blocks
+
+    __shared__ bool is_last_block_done;
+    // mark block done
+    if (threadIdx.x == 0 && threadIdx.y == 0) {
+      int old = atomicAdd(&semaphores[blockIdx.x], 1);
+      is_last_block_done = (old == (gridDim.y-1));
+    }
+
+    __syncthreads();
+
+    // check that all data is now available in global memory
+    if (is_last_block_done) {
+      sum_dy_th = accscalar_t(0.0);
+      sum_dy_xmu_th = accscalar_t(0.0);
+
+      for (int y = threadIdx.y; y < gridDim.y; y += blockDim.y) {
+        address_base = c_offset + y * stride;
+        sum_dy_th += (c_offset < stride ? staging_sum_dy[address_base] : accscalar_t(0.0));
+        sum_dy_xmu_th += (c_offset < stride ? staging_sum_dy_xmu[address_base] : accscalar_t(0.0));
+      }
+
+      merge_block_vertical_backward(sum_dy_th, sum_dy_xmu_th, shmem_sum_dy, shmem_sum_dy_xmu);
+      if (threadIdx.y == 0 && c_offset < stride) {
+        if (grad_bias != nullptr) {
+          grad_bias[c_offset] = static_cast<layerscalar_t>(sum_dy_th);
+        }
+        if (grad_weight != nullptr) {
+          grad_weight[c_offset] = static_cast<layerscalar_t>(sum_dy_xmu_th * factor);
+        }
+        //mean_dy[c_offset] = sum_dy_th / reduction_size;
+        //mean_dy_xmu[c_offset] = sum_dy_xmu_th / reduction_size;
+        sum_dy_o[c_offset] = sum_dy_th;
+        sum_dy_xmu_o[c_offset] = sum_dy_xmu_th;
+      }
+    }
+  } else {
+    if (blockIdx.y == 0 && threadIdx.y == 0 && c_offset < stride) {
+      if (grad_bias != nullptr) {
+        grad_bias[c_offset] = static_cast<layerscalar_t>(sum_dy_th);
+      }
+      if (grad_weight != nullptr) {
+        grad_weight[c_offset] = static_cast<layerscalar_t>(sum_dy_xmu_th * factor);
+      }
+      //mean_dy[c_offset] = sum_dy_th / reduction_size;
+      //mean_dy_xmu[c_offset] = sum_dy_xmu_th / reduction_size;
+      sum_dy_o[c_offset] = sum_dy_th;
+      sum_dy_xmu_o[c_offset] = sum_dy_xmu_th;
+    }
+  }
+}
+
+// elementwise BN kernel
+// original apex name: batchnorm_backward_c_last_kernel
+template <
+    int PARALLEL_LOADS,
+    typename scalar_t,
+    typename accscalar_t,
+    typename layerscalar_t>
+__device__ __forceinline__ void batch_norm_backward_elemt_channels_last_kernel_impl(
+      const scalar_t* __restrict__ grad_output,
+      const scalar_t* __restrict__ input,
+      const accscalar_t* __restrict__ mean,
+      const accscalar_t* __restrict__ inv_std,
+      const layerscalar_t* __restrict__ weight,
+      const accscalar_t* __restrict__ sum_dy,
+      const accscalar_t* __restrict__ sum_dy_xmu,
+      scalar_t* __restrict__ grad_input,
+      const accscalar_t norm_fct,
+      const int reduction_size,
+      const int stride) {
+  // tensor dimension (m,c)
+  // loop along m dimension
+  int inner_loop_stride = blockDim.y * gridDim.y;
+
+  // offset along m dimension
+  int m_offset = blockIdx.y * blockDim.y + threadIdx.y;
+  int c_offset = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (c_offset >= stride || m_offset >= reduction_size) {
+    return;
+  }
+
+  auto m_c = mean[c_offset];
+  auto m_dy_c = sum_dy[c_offset] * norm_fct;
+  auto factor_1_c = inv_std[c_offset];
+  auto factor_2_c = (weight == nullptr? accscalar_t(1.0) : static_cast<accscalar_t>(weight[c_offset])) * factor_1_c;
+  factor_1_c = factor_1_c * factor_1_c * sum_dy_xmu[c_offset] * norm_fct;
+
+  int loop_count = 1 + (reduction_size - 1) / (inner_loop_stride * PARALLEL_LOADS);
+  int address_base = m_offset * stride + c_offset;
+  int address_increment = inner_loop_stride * stride;
+
+  for (int i = 0; i < loop_count; i++) {
+#pragma unroll
+    for (int j = 0; j < PARALLEL_LOADS; j++) {
+      if (c_offset < stride && m_offset < reduction_size) {
+        grad_input[address_base] = static_cast<scalar_t>(
+            (static_cast<accscalar_t>(grad_output[address_base]) - m_dy_c -
+            (static_cast<accscalar_t>(input[address_base]) - m_c) * factor_1_c)
+            * factor_2_c);
+      }
+      m_offset += inner_loop_stride;
+      address_base += address_increment;
+    }
+  }
+}
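+
+// [Illustrative sketch, not part of upstream] Scalar reference for the gradient
+// computed above (hypothetical helper): with norm_fct = 1/N this reproduces the
+// factor_1_c / factor_2_c algebra,
+//   dx = (dy - sum_dy/N - (x - mean) * inv_std^2 * sum_dy_xmu/N) * w * inv_std.
+namespace bn_backward_illustration {
+inline float grad_input_one(float dy, float x, float mean, float inv_std, float w,
+                            float sum_dy, float sum_dy_xmu, float norm_fct) {
+  float mean_dy = sum_dy * norm_fct;
+  float factor_1 = inv_std * inv_std * sum_dy_xmu * norm_fct;
+  float factor_2 = w * inv_std;
+  return (dy - mean_dy - (x - mean) * factor_1) * factor_2;
+}
+} // namespace bn_backward_illustration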
+
+template <
+    int PARALLEL_LOADS,
+    typename scalar_t,
+    typename accscalar_t,
+    typename layerscalar_t>
+__global__ void batch_norm_backward_elemt_channels_last_kernel(
+      const scalar_t* __restrict__ grad_output,
+      const scalar_t* __restrict__ input,
+      const accscalar_t* __restrict__ mean,
+      const accscalar_t* __restrict__ inv_std,
+      const layerscalar_t* __restrict__ weight,
+      const accscalar_t* __restrict__ sum_dy,
+      const accscalar_t* __restrict__ sum_dy_xmu,
+      const int* __restrict__ numel,
+      scalar_t* __restrict__ grad_input,
+      const int64_t world_size,
+      const int reduction_size,
+      const int stride) {
+
+  int64_t total_numel = 0;
+  for (int i = 0; i < world_size; i++) {
+    total_numel += numel[i];
+  }
+
+  auto norm_fct = static_cast<accscalar_t>(1) / static_cast<accscalar_t>(total_numel);
+  batch_norm_backward_elemt_channels_last_kernel_impl<PARALLEL_LOADS>(
+      grad_output, input, mean, inv_std, weight, sum_dy, sum_dy_xmu,
+      grad_input, norm_fct, reduction_size, stride);
+}
+
+template <
+    int PARALLEL_LOADS,
+    typename scalar_t,
+    typename accscalar_t,
+    typename layerscalar_t>
+__global__ void batch_norm_backward_elemt_channels_last_kernel(
+      const scalar_t* __restrict__ grad_output,
+      const scalar_t* __restrict__ input,
+      const accscalar_t* __restrict__ mean,
+      const accscalar_t* __restrict__ inv_std,
+      const layerscalar_t* __restrict__ weight,
+      const accscalar_t* __restrict__ sum_dy,
+      const accscalar_t* __restrict__ sum_dy_xmu,
+      scalar_t* __restrict__ grad_input,
+      const accscalar_t norm_fct,
+      const int reduction_size,
+      const int stride) {
+  batch_norm_backward_elemt_channels_last_kernel_impl<PARALLEL_LOADS>(
+      grad_output, input, mean, inv_std, weight, sum_dy, sum_dy_xmu,
+      grad_input, norm_fct, reduction_size, stride);
+}
+
+template<typename scalar_t, typename VarTransform>
+void batch_norm_stats_channels_last_cuda_template(
+    const Tensor& out_mean, const Tensor& out_invstd, const Tensor& input, double epsilon) {
+  using accscalar_t = at::acc_type<scalar_t, true>;
+
+  const auto stride = input.sizes()[1];
+  const auto reduction_size = input.numel() / stride;
+
+  resize_output(out_mean, {stride});
+  resize_output(out_invstd, {stride});
+  TORCH_INTERNAL_ASSERT(out_invstd.dim() == 1 && out_invstd.is_contiguous() &&
+                        out_invstd.sizes()[0]);
+  TORCH_INTERNAL_ASSERT(out_mean.dim() == 1 && out_mean.is_contiguous() &&
+                        out_mean.sizes()[0]);
+
+  dim3 block;
+  dim3 grid;
+  flexible_launch_configs(reduction_size, stride, block, grid, true);
+
+  at::Tensor staging_data;
+  at::Tensor semaphores;
+  if (grid.y > 1) {
+    staging_data = at::empty({4*stride*grid.y}, out_mean.options());
+    semaphores = at::zeros({grid.x}, input.options().dtype(at::kInt));
+  }
+
+  accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.mutable_data_ptr<accscalar_t>() : nullptr;
+  int* semaphores_ptr = grid.y > 1 ? semaphores.mutable_data_ptr<int>() : nullptr;
+  batch_norm_collect_statistics_channels_last_kernel<VarTransform, scalar_t, accscalar_t, ELEMENTS_PER_ITER>
+      <<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
+      input.const_data_ptr<scalar_t>(),
+      out_mean.mutable_data_ptr<accscalar_t>(),
+      out_invstd.mutable_data_ptr<accscalar_t>(),
+      staging_data_ptr,
+      semaphores_ptr,
+      reduction_size,
+      stride,
+      epsilon);
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+}
+
+void batch_norm_elemt_channels_last_cuda_template(
+    const at::Tensor& output,
+    const at::Tensor& input,
+    const at::Tensor& weight,
+    const at::Tensor& shift,  // bias of BN
+    const at::Tensor& mean,
+    const at::Tensor& inv_std,
+    const at::optional<at::Tensor>& z = c10::nullopt,  // bias after BN
+    const bool fuse_relu = false) {
+  const auto stride = input.sizes()[1];
+  const auto reduction_size = input.numel() / stride;
+
+  dim3 block;
+  dim3 grid;
+  flexible_launch_configs(reduction_size, stride, block, grid);
+
+  auto stream = at::cuda::getCurrentCUDAStream();
+  const auto second_dtype = weight.defined() ? weight.scalar_type() :
+      (shift.defined() ? shift.scalar_type() : input.scalar_type());
+
+  if (input.scalar_type() != second_dtype) {
+    AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "batchnorm_forward", [&] {
+      using accscalar_t = at::acc_type<scalar_t, true>;
+      batch_norm_transform_input_channels_last_kernel<scalar_t, accscalar_t, accscalar_t, ELEMENTS_PER_ITER>
+          <<<grid, block, 0, stream>>>(
+          input.const_data_ptr<scalar_t>(),
+          z.has_value() ? z.value().const_data_ptr<scalar_t>() : nullptr,
+          mean.const_data_ptr<accscalar_t>(),
+          inv_std.const_data_ptr<accscalar_t>(),
+          weight.defined() ? weight.const_data_ptr<accscalar_t>() : nullptr,
+          shift.defined() ? shift.const_data_ptr<accscalar_t>() : nullptr,
+          output.mutable_data_ptr<scalar_t>(),
+          reduction_size,
+          stride,
+          fuse_relu);
+      C10_CUDA_KERNEL_LAUNCH_CHECK();
+    });
+  } else {
+    if (weight.defined()){
+      TORCH_CHECK(input.scalar_type() == weight.scalar_type(), "batchnorm_forward: input.scalar_type() ", input.scalar_type(),
+        " is not supported with weight.scalar_type() ", weight.scalar_type());
+    }
+    AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "batchnorm_forward", [&] {
+      using accscalar_t = at::acc_type<scalar_t, true>;
+      batch_norm_transform_input_channels_last_kernel<scalar_t, accscalar_t, scalar_t, ELEMENTS_PER_ITER>
+          <<<grid, block, 0, stream>>>(
+          input.const_data_ptr<scalar_t>(),
+          z.has_value() ? z.value().const_data_ptr<scalar_t>() : nullptr,
+          mean.const_data_ptr<accscalar_t>(),
+          inv_std.const_data_ptr<accscalar_t>(),
+          weight.defined() ? weight.const_data_ptr<scalar_t>() : nullptr,
+          shift.defined() ? shift.const_data_ptr<scalar_t>(): nullptr,
+          output.mutable_data_ptr(),
+          reduction_size,
+          stride,
+          fuse_relu);
+      C10_CUDA_KERNEL_LAUNCH_CHECK();
+    });
+  }
+}
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
+batch_norm_backward_reduce_cuda_channels_last_template(const at::Tensor& grad_output,
+    const at::Tensor& input,
+    const at::Tensor& mean,
+    const at::Tensor& inv_std,
+    const at::Tensor& weight,
+    const bool input_g, const bool weight_g, const bool bias_g) {
+  const auto stride = input.sizes()[1];
+  const auto reduction_size = input.numel() / stride;
+
+  at::Tensor sumn_dy = at::empty({stride}, mean.options());
+  at::Tensor sum_dy_xmu = at::empty({stride}, mean.options());
+
+  at::Tensor grad_weight;
+  at::Tensor grad_bias;
+  if (weight.defined()) {
+    grad_weight = at::empty({stride}, weight.options());
+    grad_bias = at::empty({stride}, weight.options());
+  } else {
+    // because I cannot return an uninitialized at::Tensor
+    grad_weight = at::empty({0}, mean.options());
+    grad_bias = at::empty({0}, mean.options());
+  }
+
+  dim3 block;
+  dim3 grid;
+  flexible_launch_configs(reduction_size, stride, block, grid, true);
+
+  at::Tensor staging_data;
+  at::Tensor semaphores;
+  if (grid.y > 1) {
+    staging_data = at::empty({2*stride*grid.y}, mean.options());
+    semaphores = at::zeros({grid.x}, input.options().dtype(at::kInt));
+  }
+  auto stream = at::cuda::getCurrentCUDAStream();
+
+  if (weight.defined() && input.scalar_type() != weight.scalar_type()) {
+    AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "batchnorm_backward_reduce", [&] {
+      using accscalar_t = at::acc_type<scalar_t, true>;
+      accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.mutable_data_ptr<accscalar_t>() : nullptr;
+      int* semaphores_ptr = grid.y > 1 ? semaphores.mutable_data_ptr<int>() : nullptr;
+      batch_norm_backward_reduce_channels_last_kernel<ELEMENTS_PER_ITER>
+          <<<grid, block, 0, stream>>>(
+          input.const_data_ptr<scalar_t>(),
+          grad_output.const_data_ptr<scalar_t>(),
+          mean.const_data_ptr<accscalar_t>(),
+          inv_std.const_data_ptr<accscalar_t>(),
+          sumn_dy.mutable_data_ptr<accscalar_t>(),
+          sum_dy_xmu.mutable_data_ptr<accscalar_t>(),
+          grad_weight.mutable_data_ptr<accscalar_t>(),
+          grad_bias.mutable_data_ptr<accscalar_t>(),
+          staging_data_ptr,
+          semaphores_ptr,
+          reduction_size,
+          stride);
+      C10_CUDA_KERNEL_LAUNCH_CHECK();
+    });
+  } else {
+    if (weight.defined()) {
+      TORCH_CHECK(input.scalar_type() == weight.scalar_type(), "batchnorm_backward_reduce: input.scalar_type() ", input.scalar_type(),
+        " is not supported with weight.scalar_type() ", weight.scalar_type());
+    }
+    AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "batchnorm_backward_reduce", [&] {
+      using accscalar_t = at::acc_type<scalar_t, true>;
+      accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.mutable_data_ptr<accscalar_t>() : nullptr;
+      int* semaphores_ptr = grid.y > 1 ? semaphores.mutable_data_ptr<int>() : nullptr;
+      batch_norm_backward_reduce_channels_last_kernel<ELEMENTS_PER_ITER>
+          <<<grid, block, 0, stream>>>(
+          input.const_data_ptr<scalar_t>(),
+          grad_output.const_data_ptr<scalar_t>(),
+          mean.const_data_ptr<accscalar_t>(),
+          inv_std.const_data_ptr<accscalar_t>(),
+          sumn_dy.mutable_data_ptr<accscalar_t>(),
+          sum_dy_xmu.mutable_data_ptr<accscalar_t>(),
+          weight.defined() ? grad_weight.mutable_data_ptr<scalar_t>() : nullptr,
+          weight.defined() ? grad_bias.mutable_data_ptr<scalar_t>() : nullptr,
+          staging_data_ptr,
+          semaphores_ptr,
+          reduction_size,
+          stride);
+      C10_CUDA_KERNEL_LAUNCH_CHECK();
+    });
+  }
+
+  return std::make_tuple(sumn_dy, sum_dy_xmu, grad_weight, grad_bias);
+}
+
+at::Tensor batch_norm_backward_elemt_channels_last_cuda_template(
+    const at::Tensor& grad_output,
+    const at::Tensor& input,
+    const at::Tensor& mean,
+    const at::Tensor& inv_std,
+    const at::Tensor& weight,
+    const at::Tensor& sum_dy,
+    const at::Tensor& sum_dy_xmu,
+    const at::Tensor& count) {
+  const auto stride = input.sizes()[1];
+  const auto reduction_size = input.numel() / stride;
+
+  // Input is guaranteed to be channels-last compatible
+  at::Tensor grad_input = at::empty_like(input);
+
+  dim3 block;
+  dim3 grid;
+  flexible_launch_configs(reduction_size, stride, block, grid);
+
+  auto stream = at::cuda::getCurrentCUDAStream();
+
+  if (weight.defined() && weight.scalar_type() != input.scalar_type()) {
+    AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "batchnorm_backward_element", [&] {
+      using accscalar_t = at::acc_type<scalar_t, true>;
+      batch_norm_backward_elemt_channels_last_kernel<ELEMENTS_PER_ITER>
+          <<<grid, block, 0, stream>>>(
+          grad_output.const_data_ptr<scalar_t>(),
+          input.const_data_ptr<scalar_t>(),
+          mean.const_data_ptr<accscalar_t>(),
+          inv_std.const_data_ptr<accscalar_t>(),
+          weight.const_data_ptr<accscalar_t>(),
+          sum_dy.const_data_ptr<accscalar_t>(),
+          sum_dy_xmu.const_data_ptr<accscalar_t>(),
+          count.const_data_ptr<int>(),
+          grad_input.mutable_data_ptr<scalar_t>(),
+          count.numel(),
+          reduction_size,
+          stride);
+      C10_CUDA_KERNEL_LAUNCH_CHECK();
+    });
+  } else {
+    if (weight.defined()) {
+      TORCH_CHECK(input.scalar_type() == weight.scalar_type(), "batchnorm_backward_element: input.scalar_type() ", input.scalar_type(),
+        " is not supported with weight.scalar_type() ", weight.scalar_type());
+    }
+    AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, input.scalar_type(), "batchnorm_backward_element", [&] {
+      using accscalar_t = at::acc_type<scalar_t, true>;
+      batch_norm_backward_elemt_channels_last_kernel<ELEMENTS_PER_ITER>
+          <<<grid, block, 0, stream>>>(
+          grad_output.const_data_ptr<scalar_t>(),
+          input.const_data_ptr<scalar_t>(),
+          mean.const_data_ptr<accscalar_t>(),
+          inv_std.const_data_ptr<accscalar_t>(),
+          weight.defined() ? weight.const_data_ptr<scalar_t>() : nullptr,
+          sum_dy.const_data_ptr<accscalar_t>(),
+          sum_dy_xmu.const_data_ptr<accscalar_t>(),
+          count.const_data_ptr<int>(),
+          grad_input.mutable_data_ptr<scalar_t>(),
+          count.numel(),
+          reduction_size,
+          stride);
+      C10_CUDA_KERNEL_LAUNCH_CHECK();
+    });
+  }
+
+  return grad_input;
+}
+
+at::Tensor batch_norm_backward_elemt_channels_last_cuda_template(
+    const at::Tensor& grad_output,
+    const at::Tensor& input,
+    const at::Tensor& mean,
+    const at::Tensor& inv_std,
+    const at::Tensor& weight,
+    const at::Tensor& sum_dy,
+    const at::Tensor& sum_dy_xmu) {
+  const auto stride = input.sizes()[1];
+  const auto reduction_size = input.numel() / stride;
+  auto norm_fct = 1.0 / reduction_size;
+
+  // Input is guaranteed to be channels-last compatible
+  at::Tensor grad_input = at::empty_like(input);
+
+  dim3 block;
+  dim3 grid;
+  flexible_launch_configs(reduction_size, stride, block, grid);
+
+  auto stream = at::cuda::getCurrentCUDAStream();
+
+  AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "batchnorm_backward_element", [&] {
+    using accscalar_t = at::acc_type<scalar_t, true>;
+
+    if (weight.defined() && weight.scalar_type() != input.scalar_type()) {
+      batch_norm_backward_elemt_channels_last_kernel<ELEMENTS_PER_ITER>
+          <<<grid, block, 0, stream>>>(
+          grad_output.const_data_ptr<scalar_t>(),
+          input.const_data_ptr<scalar_t>(),
+          mean.const_data_ptr<accscalar_t>(),
+          inv_std.const_data_ptr<accscalar_t>(),
+          weight.const_data_ptr<accscalar_t>(),
+          sum_dy.const_data_ptr<accscalar_t>(),
+          sum_dy_xmu.const_data_ptr<accscalar_t>(),
+          grad_input.mutable_data_ptr<scalar_t>(),
+          static_cast<accscalar_t>(norm_fct),
+          reduction_size,
+          stride);
+          C10_CUDA_KERNEL_LAUNCH_CHECK();
+    } else {
+      batch_norm_backward_elemt_channels_last_kernel<ELEMENTS_PER_ITER>
+          <<<grid, block, 0, stream>>>(
+          grad_output.const_data_ptr<scalar_t>(),
+          input.const_data_ptr<scalar_t>(),
+          mean.const_data_ptr<accscalar_t>(),
+          inv_std.const_data_ptr<accscalar_t>(),
+          weight.defined() ? weight.const_data_ptr<scalar_t>() : nullptr,
+          sum_dy.const_data_ptr<accscalar_t>(),
+          sum_dy_xmu.const_data_ptr<accscalar_t>(),
+          grad_input.mutable_data_ptr<scalar_t>(),
+          static_cast<accscalar_t>(norm_fct),
+          reduction_size,
+          stride);
+          C10_CUDA_KERNEL_LAUNCH_CHECK();
+    }
+  });
+
+  return grad_input;
+}
+
+} } // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/PersistentSoftmax.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/PersistentSoftmax.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..79ecec1981f376e528af2674a5822fa01cc6fd00
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/PersistentSoftmax.cuh
@@ -0,0 +1,401 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+namespace {
+
+int log2_ceil(int value) {
+    int log2_value = 0;
+    while ((1 << log2_value) < value) ++log2_value;
+    return log2_value;
+}
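+
+// [Illustrative sketch, not part of upstream] A constexpr twin of log2_ceil
+// (hypothetical) so the rounding behaviour can be checked at compile time:
+// e.g. 1 -> 0, 33 -> 6, 1024 -> 10.
+constexpr int log2_ceil_illustration(int value) {
+    int log2_value = 0;
+    while ((1 << log2_value) < value) ++log2_value;
+    return log2_value;
+}
+static_assert(log2_ceil_illustration(1) == 0, "log2_ceil rounds up to the next power of two");
+static_assert(log2_ceil_illustration(33) == 6, "33 rounds up to 64 = 2^6");
+static_assert(log2_ceil_illustration(1024) == 10, "exact powers of two are unchanged");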
+
+template <typename T>
+struct Add {
+  __device__ __forceinline__ T operator()(T a, T b) const {
+    return a + b;
+  }
+};
+
+template <typename T>
+struct Max {
+  __device__ __forceinline__ T operator()(T a, T b) const {
+    return a < b ? b : a;
+  }
+};
+
+template <typename acc_t, int WARP_BATCH, int WARP_SIZE, template<typename> class ReduceOp>
+__device__ __forceinline__ void warp_reduce(acc_t* sum) {
+    ReduceOp r;
+    #pragma unroll
+    for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
+        #pragma unroll
+        for (int i = 0;  i < WARP_BATCH;  ++i) {
+            acc_t b = WARP_SHFL_XOR(sum[i], offset, WARP_SIZE);
+            sum[i] = r(sum[i], b);
+        }
+    }
+}
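+
+// [Illustrative sketch, not part of upstream] Host analog of the XOR-butterfly
+// pattern used by warp_reduce (hypothetical helper, fixed 8 "lanes"): after
+// log2(8) rounds every slot holds the reduction over all 8 inputs, which is why
+// the kernels below can read the result from any lane without a broadcast.
+inline void butterfly_allreduce_illustration(float (&lanes)[8]) {
+    for (int offset = 8 / 2; offset > 0; offset /= 2) {
+        float next[8];
+        for (int lane = 0; lane < 8; ++lane) {
+            next[lane] = lanes[lane] + lanes[lane ^ offset];  // partner = lane XOR offset
+        }
+        for (int lane = 0; lane < 8; ++lane) {
+            lanes[lane] = next[lane];
+        }
+    }
+}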
+
+// The softmax_warp_* methods perform softmax forward and backward propagation on samples spanning the fast dimension.
+// Each sample contains element_count scalar elements. element_count can be any integer value <= 1024.
+// The template arguments have the following meaning:
+// One "WARP" works on one "BATCH". One "BATCH" contains "WARP_BATCH" samples.
+// WARP_BATCH is equal to 1 when element_count is large, and > 1 when element_count is small.
+// A "WARP" contains "C10_WARPS_SIZE" threads, these treads are guaranteed to belong to the same warp.
+// This is important because it means only __shfl_ instructions are required for reductions.
+// Note that this means WARP_SIZE must be a power of two and <= architecture warp size.
+// CUDA warp size is 32 for all existing GPU architectures, but there is no guarantee this will not change for future arch.
+// ROCm warp size is 64 for all currently ROCm-supported GPU architectures, but this may change for future archs.
+// is_log_softmax is a flag indicating whether SoftMax or LogSoftMax should be computed.
+// is_masked is a flag indicating whether SoftMax or MaskedSoftMax should be computed.
+// The template can be instantiated with any floating point type for the type arguments input_t, output_t and acc_t.
+// This allows SoftMax to be fused with a cast immediately following the SoftMax.
+// The mask should have the same shape as input, with a boolean indicate if the value is masked.
+// The head_chunk_size is only used for transformer mask softmax, equals to H * D * D.
+// For instance:
+// input_t=half,  acc_t=float, output_t=half  => read half tensor, float accumulators, write half tensor.
+// input_t=half,  acc_t=float, output_t=float => read half tensor, float accumulators, write float tensor.
+// input_t=float, acc_t=float, output_t=half  => read float tensor, float accumulators, write half tensor.
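+// Worked example (illustrative): element_count = 200 gives log2_elements = 8 and
+// next_power_of_two = 256, so WARP_SIZE = min(256, C10_WARP_SIZE) = 32 on CUDA,
+// WARP_ITERATIONS = 256 / 32 = 8 and WARP_BATCH = 1 (256 > 128): each warp owns
+// one sample and each lane strides over up to 8 of its elements.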
+
+template <typename input_t, typename output_t, typename acc_t, int log2_elements, bool is_log_softmax, bool is_masked>
+__global__ void softmax_warp_forward(output_t *dst, const input_t *src, int batch_size, int stride, int element_count, const bool *mask = nullptr, const int head_chunk_size = -1, bool is_transformer_mask = false)
+{
+    // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and warp_size of method warp_softmax_forward_kernel.
+    constexpr int next_power_of_two = 1 << log2_elements;
+    constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
+    constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE;
+    constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1;
+
+    int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH;
+
+    // batch_size might not be a multiple of WARP_BATCH. Check how
+    // many batches have to be computed within this WARP.
+    int local_batches = batch_size - first_batch;
+    if (local_batches > WARP_BATCH)
+        local_batches = WARP_BATCH;
+
+    // there might be multiple batches per warp. compute the index within the batch
+    int local_idx = threadIdx.x;
+    int idx_offset = first_batch * stride + local_idx;
+
+    src += idx_offset;
+    dst += idx_offset;
+
+    if (is_transformer_mask) {
+        mask += ((first_batch * stride) / head_chunk_size) * stride + local_idx;
+    } else {
+        mask += idx_offset;
+    }
+    // The nested loops over WARP_BATCH and then WARP_ITERATIONS can be simplified to one loop,
+    // but I think doing so would obfuscate the logic of the algorithm, thus I chose to keep
+    // the nested loops.
+    // This should have no impact on performance because the loops are unrolled anyway.
+
+    // load data from global memory
+    acc_t elements[WARP_BATCH][WARP_ITERATIONS];
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        int batch_element_count = (i >= local_batches) ? 0 : element_count;
+        for (int it = 0;  it < WARP_ITERATIONS;  ++it) {
+            int element_index = local_idx + it * WARP_SIZE;
+            if (element_index < batch_element_count) {
+                elements[i][it] = src[i*element_count+it*WARP_SIZE];
+            } else {
+                elements[i][it] = -std::numeric_limits<acc_t>::infinity();
+            }
+        }
+    }
+
+    // compute max_value
+    acc_t max_value[WARP_BATCH];
+    #pragma unroll
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        int batch_element_count = (i >= local_batches) ? 0 : element_count;
+        bool is_meaningful_max = false;
+        max_value[i] = elements[i][0];
+        #pragma unroll
+        for (int it = 0;  it < WARP_ITERATIONS;  ++it) {
+            if (is_masked) {
+                int idx = it*WARP_SIZE;
+                if ((idx + local_idx) < batch_element_count) {
+                    if (!is_transformer_mask) {
+                        idx += i*element_count;
+                    }
+                    if (!mask[idx]) {
+                        max_value[i] = (is_meaningful_max && max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it];
+                        is_meaningful_max = true;
+                    }
+                }
+            } else {
+                max_value[i] = max_value[i] > elements[i][it] ? max_value[i] : elements[i][it];
+            }
+        }
+        if (is_masked) {
+            if (!is_meaningful_max) {
+                max_value[i] = -std::numeric_limits<acc_t>::infinity();
+            }
+        }
+    }
+    warp_reduce<acc_t, WARP_BATCH, WARP_SIZE, Max>(max_value);
+
+    acc_t sum[WARP_BATCH] { 0.0f };
+    #pragma unroll
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        int batch_element_count = (i >= local_batches) ? 0 : element_count;
+        #pragma unroll
+        for (int it = 0;  it < WARP_ITERATIONS;  ++it) {
+            if (!is_masked) {
+                if (is_log_softmax) {
+                    sum[i] += std::exp(elements[i][it] - max_value[i]);
+                } else {
+                    elements[i][it] = std::exp(elements[i][it] - max_value[i]);
+                    sum[i] += elements[i][it];
+                }
+            } else {
+                int idx = it*WARP_SIZE;
+                bool valid = (idx + local_idx) < batch_element_count;
+                if (!is_transformer_mask) {
+                    idx += i*element_count;
+                }
+                if (valid) {
+                    if (!mask[idx]) {
+                        if (is_log_softmax) {
+                            sum[i] += std::exp(elements[i][it] - max_value[i]);
+                        } else {
+                            elements[i][it] = std::exp(elements[i][it] - max_value[i]);
+                            sum[i] += elements[i][it];
+                        }
+                    } else {
+                        if (!is_log_softmax) {
+                            // Masked values are treated as -infinity, and std::exp(-infinity) is 0.
+                            elements[i][it] = 0;
+                        }
+                    }
+                } else {
+                    if (!is_log_softmax) {
+                        elements[i][it] = 0.;
+                    }
+                }
+            }
+        }
+    }
+    warp_reduce<acc_t, WARP_BATCH, WARP_SIZE, Add>(sum);
+
+    // store result
+    #pragma unroll
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        if (i >= local_batches)
+            break;
+        if (is_log_softmax) sum[i] = std::log(sum[i]);
+        #pragma unroll
+        for (int it = 0;  it < WARP_ITERATIONS;  ++it) {
+            int element_index = local_idx + it * WARP_SIZE;
+            if (element_index < element_count) {
+                if (is_log_softmax) {
+                    dst[i*element_count+it*WARP_SIZE] = elements[i][it] - max_value[i] - sum[i];
+                } else if (sum[i] == 0) {
+                    dst[i*element_count+it*WARP_SIZE] = std::numeric_limits<acc_t>::quiet_NaN();
+                } else {
+                    dst[i*element_count+it*WARP_SIZE] = elements[i][it] / sum[i];
+                }
+            } else {
+                break;
+            }
+        }
+    }
+}
+
+template <typename input_t, typename output_t, typename acc_t, int log2_elements, bool is_log_softmax, bool is_masked>
+__global__ void softmax_warp_backward(output_t *gradInput, const input_t *grad, const input_t *output, int batch_size, int stride, int element_count, const bool *mask = nullptr)
+{
+    // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and warp_size of method warp_softmax_backward_kernel.
+    constexpr int next_power_of_two = 1 << log2_elements;
+    constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
+    constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE;
+    constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1;
+
+    int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH;
+
+    // batch_size might not be a multiple of WARP_BATCH. Check how
+    // many batches have to be computed within this WARP.
+    int local_batches = batch_size - first_batch;
+    if (local_batches > WARP_BATCH)
+        local_batches = WARP_BATCH;
+
+    // there might be multiple batches per warp. compute the index within the batch
+    int local_idx = threadIdx.x % WARP_SIZE;
+
+    // the first element to process by the current thread
+    int thread_offset = first_batch * stride + local_idx;
+    grad += thread_offset;
+    output += thread_offset;
+    gradInput += thread_offset;
+    if (is_masked) {
+        mask += thread_offset;
+    }
+
+    // The nested loops over WARP_BATCH and then WARP_ITERATIONS can be simplified to one loop,
+    // but I think doing so would obfuscate the logic of the algorithm, thus I chose to keep
+    // the nested loops.
+    // This should have no impact on performance because the loops are unrolled anyway.
+
+    // load data from global memory
+    acc_t grad_reg[WARP_BATCH][WARP_ITERATIONS];
+    acc_t output_reg[WARP_BATCH][WARP_ITERATIONS];
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        int batch_element_count = (i >= local_batches) ? 0 : element_count;
+        for (int it = 0;  it < WARP_ITERATIONS;  ++it) {
+            int element_index = local_idx + it * WARP_SIZE;
+            if (element_index < batch_element_count) {
+                grad_reg[i][it] = grad[i*element_count+it*WARP_SIZE];
+                output_reg[i][it] = output[i*element_count+it*WARP_SIZE];
+            } else {
+                grad_reg[i][it] = acc_t(0);
+                output_reg[i][it] = acc_t(0);
+            }
+        }
+    }
+
+    acc_t sum[WARP_BATCH] { 0.0f };
+    #pragma unroll
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        #pragma unroll
+        for (int it = 0;  it < WARP_ITERATIONS;  ++it) {
+            if (!is_masked || !mask[i*element_count+it*WARP_SIZE]) {
+                sum[i] += grad_reg[i][it];
+            }
+        }
+    }
+    warp_reduce<acc_t, WARP_BATCH, WARP_SIZE, Add>(sum);
+
+    // store result
+    #pragma unroll
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        if (i >= local_batches)
+            break;
+        #pragma unroll
+        for (int it = 0;  it < WARP_ITERATIONS;  ++it) {
+            int element_index = local_idx + it * WARP_SIZE;
+            if (element_index < element_count) {
+                if (is_masked && mask[i*element_count+it*WARP_SIZE]) {
+                    gradInput[i*element_count+it*WARP_SIZE] = 0;
+                }
+                // compute gradients
+                else if (is_log_softmax) {
+                    gradInput[i*element_count+it*WARP_SIZE] = (grad_reg[i][it] - std::exp(output_reg[i][it]) * sum[i]);
+                } else {
+                    gradInput[i*element_count+it*WARP_SIZE] = (grad_reg[i][it] - output_reg[i][it] * sum[i]);
+                }
+            }
+        }
+    }
+}
+
+} // end of anonymous namespace
+
+template<typename input_t, typename output_t, typename acc_t, bool is_log_softmax, bool is_masked>
+void dispatch_softmax_forward(output_t *dst, const input_t *src, int softmax_elements, int softmax_elements_stride, int batch_count, const bool *mask = nullptr, int chunk_size = -1, bool is_transformer_mask = false)
+{
+    TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 1024 );
+    if (softmax_elements == 0) {
+        return;
+    } else {
+        int log2_elements = log2_ceil(softmax_elements);
+        const int next_power_of_two = 1 << log2_elements;
+
+        // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_forward.
+        int warp_size = at::cuda::warp_size();
+        warp_size = (next_power_of_two < warp_size) ? next_power_of_two : warp_size;
+
+        // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_forward.
+        int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;
+
+        // use 128 threads per block to maximize gpu utilization
+        constexpr int threads_per_block = 128;
+
+        int warps_per_block = (threads_per_block / warp_size);
+        int batches_per_block = warps_per_block * batches_per_warp;
+        int blocks = (batch_count + batches_per_block - 1) / batches_per_block;
+        dim3 threads(warp_size, warps_per_block, 1);
+        // Launch code would be more elegant if C++ supported FOR CONSTEXPR
+        switch (log2_elements) {
+            #define LAUNCH_SOFTMAX_WARP_FORWARD(L2E) case L2E:                    \
+            softmax_warp_forward<input_t, output_t, acc_t, L2E, is_log_softmax, is_masked>   \
+                <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst,   \
+                    src, batch_count, softmax_elements_stride, softmax_elements, mask, chunk_size, is_transformer_mask); \
+            C10_CUDA_KERNEL_LAUNCH_CHECK();                                       \
+            break;
+
+            LAUNCH_SOFTMAX_WARP_FORWARD(0);  // 1
+            LAUNCH_SOFTMAX_WARP_FORWARD(1);  // 2
+            LAUNCH_SOFTMAX_WARP_FORWARD(2);  // 4
+            LAUNCH_SOFTMAX_WARP_FORWARD(3);  // 8
+            LAUNCH_SOFTMAX_WARP_FORWARD(4);  // 16
+            LAUNCH_SOFTMAX_WARP_FORWARD(5);  // 32
+            LAUNCH_SOFTMAX_WARP_FORWARD(6);  // 64
+            LAUNCH_SOFTMAX_WARP_FORWARD(7);  // 128
+            LAUNCH_SOFTMAX_WARP_FORWARD(8);  // 256
+            LAUNCH_SOFTMAX_WARP_FORWARD(9);  // 512
+            LAUNCH_SOFTMAX_WARP_FORWARD(10); // 1024
+            default:
+                break;
+        }
+    }
+}
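+
+// [Illustrative sketch, not part of upstream] The launch geometry above can be
+// summarised by a small helper (hypothetical): for softmax_elements = 200 and a
+// hardware warp size of 32 it yields threads = dim3(32, 4, 1) and 4 rows per block.
+inline int dispatch_softmax_rows_per_block_illustration(int softmax_elements, int hw_warp_size) {
+    int log2_elements = log2_ceil(softmax_elements);
+    int next_power_of_two = 1 << log2_elements;
+    // mirrors the WARP_SIZE / WARP_BATCH choices made inside softmax_warp_forward
+    int warp_size = (next_power_of_two < hw_warp_size) ? next_power_of_two : hw_warp_size;
+    int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;
+    constexpr int threads_per_block = 128;
+    int warps_per_block = threads_per_block / warp_size;
+    return warps_per_block * batches_per_warp;
+}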
+
+template<typename input_t, typename output_t, typename acc_t, bool is_log_softmax, bool is_masked>
+void dispatch_softmax_backward(output_t *grad_input, const input_t *grad, const input_t *output, int softmax_elements, int softmax_elements_stride, int batch_count, const bool *mask = nullptr)
+{
+    TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 1024 );
+    if (softmax_elements == 0) {
+       return;
+    } else {
+        int log2_elements = log2_ceil(softmax_elements);
+        const int next_power_of_two = 1 << log2_elements;
+
+        // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_backward.
+        int warp_size = at::cuda::warp_size();
+        warp_size = (next_power_of_two < warp_size) ? next_power_of_two : warp_size;
+
+        // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_backward.
+        int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;
+
+        // use 128 threads per block to maximize gpu utilization
+        constexpr int threads_per_block = 128;
+
+        int warps_per_block = (threads_per_block / warp_size);
+        int batches_per_block = warps_per_block * batches_per_warp;
+        int blocks = (batch_count + batches_per_block - 1) / batches_per_block;
+        dim3 threads(warp_size, warps_per_block, 1);
+        // Launch code would be more elegant if C++ supported FOR CONSTEXPR
+        switch (log2_elements) {
+            #define LAUNCH_SOFTMAX_WARP_BACKWARD(L2E) case L2E:                      \
+            softmax_warp_backward<input_t, output_t, acc_t, L2E, is_log_softmax, is_masked> \
+                <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>       \
+                (grad_input, grad, output, batch_count, softmax_elements_stride, \
+                softmax_elements, mask);                                              \
+            C10_CUDA_KERNEL_LAUNCH_CHECK();                                      \
+            break;
+
+            LAUNCH_SOFTMAX_WARP_BACKWARD(0); // 1
+            LAUNCH_SOFTMAX_WARP_BACKWARD(1); // 2
+            LAUNCH_SOFTMAX_WARP_BACKWARD(2); // 4
+            LAUNCH_SOFTMAX_WARP_BACKWARD(3); // 8
+            LAUNCH_SOFTMAX_WARP_BACKWARD(4); // 16
+            LAUNCH_SOFTMAX_WARP_BACKWARD(5); // 32
+            LAUNCH_SOFTMAX_WARP_BACKWARD(6); // 64
+            LAUNCH_SOFTMAX_WARP_BACKWARD(7); // 128
+            LAUNCH_SOFTMAX_WARP_BACKWARD(8); // 256
+            LAUNCH_SOFTMAX_WARP_BACKWARD(9); // 512
+            LAUNCH_SOFTMAX_WARP_BACKWARD(10); // 1024
+            default:
+                break;
+        }
+    }
+}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Pow.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Pow.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..f75054e47a6cf4b401f85fb53213178b08e33a17
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Pow.cuh
@@ -0,0 +1,58 @@
+#pragma once
+#include 
+#include 
+
+namespace at { namespace native {
+
+namespace {
+
+
+// SFINAE doesn't work well with NVCC under Windows for math functions like pow and sqrt.
+// So we need to define the functions with the explicit function signatures.
+// As for pow, the following signatures are defined as the device function:
+//   pow(float, int)
+//   pow(double, int)
+//   pow(float, float)
+//   pow(double, double)
+#ifdef _MSC_VER
+// Functions for pow
+// pow for at::Half
+static inline __host__ __device__ at::Half pow_(at::Half base, at::Half exp) {
+  return static_cast<at::Half>(std::pow(static_cast<float>(base), static_cast<float>(exp)));
+}
+// pow for at::BFloat16
+static inline __host__ __device__ at::BFloat16 pow_(at::BFloat16 base, at::BFloat16 exp) {
+  return static_cast<at::BFloat16>(std::pow(static_cast<float>(base), static_cast<float>(exp)));
+}
+// pow (floating, floating/int)
+template <typename Base_type, typename Exp_type>
+static inline __host__ __device__ typename std::enable_if<std::is_floating_point<Base_type>::value && (std::is_same<Base_type, Exp_type>::value || std::is_same<Exp_type, int>::value), Base_type>::type
+  pow_(Base_type base, Exp_type exp) {
+  return std::pow(base, exp);
+}
+// pow (Otherwise)
+template <typename Base_type, typename Exp_type>
+static inline __host__ __device__ typename std::enable_if<!std::is_same<Base_type, Exp_type>::value && !std::is_same<Exp_type, int>::value, Base_type>::type
+  pow_(Base_type base, Exp_type exp) {
+  return static_cast<Base_type>(std::pow(static_cast<double>(base), static_cast<double>(exp)));
+}
+#else
+template <typename Base_type, typename Exp_type>
+static inline __host__ __device__ Base_type pow_(Base_type base, Exp_type exp) {
+  return ::pow(base, exp);
+}
+#endif
+
+template <typename T>
+static inline __host__ __device__ std::enable_if_t<std::is_integral<T>::value, T> pow_(
+    T base, T exp) {
+  return at::native::powi(base, exp);
+}
+
+template <typename T>
+static inline __host__ __device__ c10::complex<T> pow_(c10::complex<T> base, c10::complex<T> exp) {
+  return c10_complex_math::pow(base, exp);
+}
+
+} // namespace
+}} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Randperm.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Randperm.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..3f908031f1e938b65d87c77d6a0d4182bc5747de
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Randperm.cuh
@@ -0,0 +1,58 @@
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+
+namespace {
+
+// See note [Algorithm of randperm]
+template <typename T, typename scalar_t>
+__global__ void randperm_handle_duplicate_keys_kernel(T *keys, scalar_t *data, T mask, int n, at::PhiloxCudaState philox_args) {
+  int tid = threadIdx.x + blockDim.x * blockIdx.x;
+
+  // find the beginning of islands
+  if (tid >= n - 1) return;  // out of range
+  if ((keys[tid] & mask) != (keys[tid + 1] & mask)) return;  // not in an island
+  if (tid != 0 && (keys[tid] & mask) == (keys[tid - 1] & mask)) return;  // not the beginning of an island
+
+  // find the size of islands
+  int island_size = 0;
+  do { island_size++; }
+  while ((tid + island_size < n) && (keys[tid + island_size] & mask) == (keys[tid] & mask));
+
+  // do random permutation inside each island.
+  data += tid;
+  auto seeds = at::cuda::philox::unpack(philox_args);
+  curandStatePhilox4_32_10_t state;
+  curand_init(std::get<0>(seeds), tid, std::get<1>(seeds), &state);
+  for (int i = island_size - 1; i > 0; i--) {
+    unsigned int r = curand(&state) % (i + 1);
+    if (i != r) {
+      scalar_t tmp = data[i];
+      data[i] = data[r];
+      data[r] = tmp;
+    }
+  }
+}
+
+// See note [Algorithm of randperm]
+template <typename T, typename scalar_t>
+void randperm_handle_duplicate_keys(T *keys, scalar_t *data, int bits, int64_t n, c10::optional<at::Generator> &gen_) {
+  auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(gen_, at::cuda::detail::getDefaultCUDAGenerator());
+  int64_t counter_offset = n;
+  at::PhiloxCudaState rng_engine_inputs;
+  {
+    // See Note [Acquire lock when using random generators]
+    std::lock_guard<std::mutex> lock(gen->mutex_);
+    rng_engine_inputs = gen->philox_cuda_state(counter_offset);
+  }
+  T mask = static_cast<T>((1UL << bits) - 1);
+  randperm_handle_duplicate_keys_kernel<<<(n + 511) / 512, 512, 0, at::cuda::getCurrentCUDAStream()>>>(
+    keys, data, mask, n, rng_engine_inputs);
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+}
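+
+// [Illustrative sketch, not part of upstream] The per-island loop inside
+// randperm_handle_duplicate_keys_kernel is a backwards Fisher-Yates shuffle.
+// Host analog (hypothetical helper; rand_fn(k) must return a uniform value in [0, k)):
+template <typename scalar_t, typename RandFn>
+void fisher_yates_illustration(scalar_t *data, int n, RandFn rand_fn) {
+  for (int i = n - 1; i > 0; i--) {
+    int r = rand_fn(i + 1);
+    if (i != r) {
+      scalar_t tmp = data[i];
+      data[i] = data[r];
+      data[r] = tmp;
+    }
+  }
+}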
+
+}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Reduce.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Reduce.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..765a2d4724e036820b33590d34304060cab7d690
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Reduce.cuh
@@ -0,0 +1,1354 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+namespace at { namespace native {
+
+using at::detail::Array;
+
+static inline int64_t div_up(int64_t a, int64_t b) {
+  return (a + b - 1) / b;
+}
+
+// returns floor(log2(n))
+static inline int last_pow2(int n) {
+  n |= (n >>  1);
+  n |= (n >>  2);
+  n |= (n >>  4);
+  n |= (n >>  8);
+  n |= (n >> 16);
+  return std::max(1, n - (n >> 1));
+}
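+// e.g. last_pow2(1) == 1, last_pow2(300) == 256, last_pow2(1024) == 1024.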
+
+// returns reduced fraction numerator & denominator
+C10_HOST_DEVICE static void reduce_fraction(size_t &numerator, size_t &denominator) {
+  // get GCD of num and denom using Euclid's algorithm.
+  // Can replace this with std::gcd if we ever support c++17.
+  size_t a = denominator;
+  size_t b = numerator;
+  while (b != 0) {
+      a %= b;
+      // swap(a,b)
+      size_t tmp = a;
+      a = b;
+      b = tmp;
+  }
+
+  // a is now the GCD
+  numerator /= a;
+  denominator /= a;
+}
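+// For example, (numerator, denominator) = (sizeof(double), sizeof(float)) = (8, 4)
+// reduces to (2, 1): the accumulation buffer needs 2 bytes for every output byte.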
+
+//template for changing MAX_NUM_THREADS based on op dtype
+template 
+struct mnt_wrapper {
+  static constexpr int MAX_NUM_THREADS = 512;
+};
+
+template <>
+struct mnt_wrapper <c10::complex<double>>{
+  static constexpr int MAX_NUM_THREADS = 256;
+};
+
+constexpr int max_reduce_threads(c10::ScalarType type) {
+  return type == kComplexDouble ? 256 : 512;
+}
+
+struct ReduceConfig {
+  static constexpr int BLOCK_X = 0;
+  static constexpr int BLOCK_Y = 1;
+  static constexpr int CTA = 2;
+
+  static constexpr int input_vec_size = 4;
+
+  ReduceConfig(int element_size_bytes, int num_outputs, int num_inputs)
+    : element_size_bytes(element_size_bytes)
+    , num_inputs(num_inputs)
+    , num_outputs(num_outputs) {}
+  int element_size_bytes;
+  int num_inputs;
+  int num_outputs;
+  int step_input = 1;
+  int step_output = 1;
+  int ctas_per_output = 1;
+  int input_mult[3] = {0, 0, 0};
+  int output_mult[2] = {0, 0};
+
+  int block_width;
+  int block_height;
+  int num_threads;
+
+  bool vectorize_input = false;
+  int output_vec_size = 1;
+
+  template <typename T>
+  void set_block_dimension(int64_t dim0, int64_t dim1) {
+    const int max_num_threads = mnt_wrapper<T>::MAX_NUM_THREADS / output_vec_size;
+    int dim0_pow2 = dim0 < max_num_threads ? static_cast<int>(last_pow2(dim0)) : max_num_threads;
+    int dim1_pow2 = dim1 < max_num_threads ? static_cast<int>(last_pow2(dim1)) : max_num_threads;
+    block_width = std::min(dim0_pow2, int(at::cuda::warp_size()));
+    block_height = std::min(dim1_pow2, int(max_num_threads / block_width));
+    block_width = std::min(dim0_pow2, int(max_num_threads / block_height));
+    num_threads = block_width * block_height;
+  }
+
+  int split_input(int parallelism) {
+    int step = step_input;
+    step_input *= parallelism;
+    return step;
+  }
+
+  int split_output(int parallelism) {
+    int step = step_output;
+    step_output *= parallelism;
+    return step;
+  }
+
+  dim3 block() const {
+    return dim3(block_width, block_height);
+  }
+
+  dim3 grid() const {
+    return dim3(div_up(num_outputs / output_vec_size, step_output), ctas_per_output);
+  }
+
+  C10_HOST_DEVICE bool should_block_x_reduce() const {
+    return input_mult[BLOCK_X] != 0;
+  }
+
+  C10_HOST_DEVICE bool should_block_y_reduce() const {
+    return input_mult[BLOCK_Y] != 0;
+  }
+
+  C10_HOST_DEVICE bool should_global_reduce() const {
+    return input_mult[CTA] != 0;
+  }
+
+  C10_DEVICE bool should_store(int output_idx) const {
+    return output_idx < num_outputs &&
+      (!should_block_x_reduce() || threadIdx.x == 0) &&
+      (!should_block_y_reduce() || threadIdx.y == 0);
+  }
+
+  C10_DEVICE bool should_reduce_tail() const {
+    return (!should_block_y_reduce() || threadIdx.y == 0) &&
+      (!should_global_reduce() || blockIdx.y == 0);
+  }
+
+  C10_HOST_DEVICE int input_idx() const {
+    int lane = threadIdx.x;
+    int warp = threadIdx.y;
+    int cta2 = blockIdx.y;
+    return (lane * input_mult[BLOCK_X] +
+            warp * input_mult[BLOCK_Y] +
+            cta2 * input_mult[CTA]);
+  }
+
+  template <int output_vec_size>
+  C10_HOST_DEVICE int output_idx() const {
+    int lane = threadIdx.x;
+    int warp = threadIdx.y;
+    int cta1 = blockIdx.x;
+    return (lane * output_mult[BLOCK_X] +
+            warp * output_mult[BLOCK_Y] +
+            cta1 * step_output) * output_vec_size;
+  }
+
+  C10_DEVICE int shared_memory_offset(int offset) const {
+    return threadIdx.x + (threadIdx.y + offset) * blockDim.x;
+  }
+
+  C10_DEVICE int staging_memory_offset(int cta2) const {
+    int offset = cta2 + blockIdx.x * gridDim.y;
+    if (!should_block_x_reduce()) {
+      offset = threadIdx.x + offset * blockDim.x;
+    }
+    return offset;
+  }
+
+  int shared_memory_size() const {
+    if (!should_block_y_reduce() &&
+        (!should_block_x_reduce() ||
+         block_width <= at::cuda::warp_size())) {
+      return 0;
+    }
+    return element_size_bytes * num_threads * output_vec_size;
+  }
+
+  int64_t global_memory_size() const {
+    if (!should_global_reduce()) {
+      return 0;
+    }
+    auto size = (int64_t)element_size_bytes * num_outputs * ctas_per_output;
+    if (!should_block_x_reduce()) {
+      size *= block().x * output_vec_size;
+    }
+    return size;
+  }
+
+  int semaphore_size() const {
+    if (!should_global_reduce()) {
+      return 0;
+    }
+    return sizeof(int) * grid().x;
+  }
+
+  int values_per_thread() const {
+    return div_up(num_inputs, step_input);
+  }
+};
+
+std::ostream& operator<<(std::ostream& out, const ReduceConfig& config);
+
+template<int nt, int output_vec_size, typename R>
+C10_LAUNCH_BOUNDS_2(nt, 4)
+__global__ void reduce_kernel(R reduction) {
+  reduction.template run<output_vec_size>();
+}
+
+template <typename index_t>
+static OffsetCalculator<2, index_t> make_output_calculator(const TensorIterator& iter) {
+  int num_reduce_dims = iter.num_reduce_dims();
+  int num_output_dims = iter.ndim() - num_reduce_dims;
+  int input_index = iter.ntensors() - 1;
+  int output_index = 0;
+  std::array<const int64_t*, 2> strides = {
+    iter.strides(output_index).data() + num_reduce_dims,
+    iter.strides(input_index).data() + num_reduce_dims,
+  };
+  auto shape = iter.shape().data() + num_reduce_dims;
+  return OffsetCalculator<2, index_t>(num_output_dims, shape, strides.data());
+}
+
+template <typename index_t>
+static OffsetCalculator<1, index_t> make_input_calculator(const TensorIterator& iter) {
+  int num_reduce_dims = iter.num_reduce_dims();
+  int input_index = iter.ntensors() - 1;
+  std::array<const int64_t*, 1> strides = {
+    iter.strides(input_index).data(),
+  };
+  return OffsetCalculator<1, index_t>(num_reduce_dims, iter.shape().data(), strides.data());
+}
+
+template <typename out_scalar_t, typename func_t>
+struct func_wrapper_t {
+  using arg_t = typename binary_function_traits<func_t>::arg1_t;
+  using scalar_t = typename binary_function_traits<func_t>::arg2_t;
+
+  func_t combine;
+  static inline __device__ out_scalar_t project(arg_t arg) {
+    return (out_scalar_t) arg;
+  }
+  static inline __device__ arg_t warp_shfl_down(arg_t arg, int offset) {
+    return WARP_SHFL_DOWN(arg, offset);
+  }
+
+  static __device__ arg_t translate_idx(arg_t acc, int64_t /*idx*/) {
+    return acc;
+  }
+
+  func_wrapper_t(const func_t& op) : combine(op) {
+  }
+
+  // wrap a normal reduction that ignores the index
+  __device__ arg_t reduce(arg_t acc, scalar_t val, int64_t idx) const {
+    return combine(acc, val);
+  }
+};
+
+template <typename scalar_t, typename func_t>
+func_wrapper_t<scalar_t, func_t> func_wrapper(const func_t& op) {
+  return func_wrapper_t<scalar_t, func_t> { op };
+}
+
+template 
+struct ReduceJitOp {
+//ReduceJitOp is almost like ReduceOp, but it doesn't have ops functor that specifies reduction operations
+//Maybe we can find a way to unify ReduceOp and ReduceJitOp
+  using InputCalculator = OffsetCalculator<1, uint32_t>;
+  using OutputCalculator = OffsetCalculator<2, uint32_t>;
+  //TODO for now arg_t is always opmath_t of the input, later we'll need to change it
+  using arg_t = at::opmath_type;
+
+  static constexpr int input_vec_size = ReduceConfig::input_vec_size;
+  //TODO - ReduceJitOp will probably need to be changed for reductions that need full functor,
+  //not just wrapper
+  arg_t ident;
+  ReduceConfig config;
+  InputCalculator input_calc;
+  OutputCalculator output_calc;
+  const void* src;
+  const char* dst[2]; //it accepts at most two destinations
+  // acc_buf used for accumulation among sub Tensor Iterator when accumulation on
+  // output is not permissible
+  void* acc_buf;
+  // cta_buf used for accumulation between blocks during global reduction
+  void* cta_buf;
+  int* semaphores;
+  int64_t base_idx;
+  bool accumulate;
+  bool final_output;
+  int noutputs;
+
+  ReduceJitOp(
+      ReduceConfig config,
+      InputCalculator input_calc,
+      OutputCalculator output_calc,
+      const void* src,
+      char* dst0,
+      optional<char*> dst1,
+      void* acc_buf,
+      void* cta_buf,
+      int* semaphores,
+      arg_t ident,
+      int noutputs,
+      int64_t base_idx)
+      : ident(ident),
+        config(config),
+        input_calc(input_calc),
+        output_calc(output_calc),
+        src(src),
+        acc_buf(acc_buf),
+        cta_buf(cta_buf),
+        semaphores(semaphores),
+        base_idx(base_idx),
+        noutputs(noutputs) {
+    dst[0] = dst0;
+    if (dst1.has_value()) {
+      dst[1] = dst1.value();
+    }
+  }
+};
+
+template <typename scalar_t, typename ops_t, typename index_t, typename out_scalar_t=scalar_t, int vt0=4>
+struct ReduceOp {
+  using traits = function_traits<decltype(&ops_t::reduce)>;
+  using arg_t = typename std::decay<typename traits::template arg<0>::type>::type;
+
+  using InputCalculator = OffsetCalculator<1, index_t>;
+  using OutputCalculator = OffsetCalculator<2, index_t>;
+
+  static constexpr bool can_accumulate_in_output =
+    std::is_convertible<arg_t, out_scalar_t>::value
+    && std::is_convertible<out_scalar_t, arg_t>::value;
+
+  static constexpr int input_vec_size = ReduceConfig::input_vec_size;
+
+  ops_t ops;
+  arg_t ident;
+  ReduceConfig config;
+  InputCalculator input_calc;
+  OutputCalculator output_calc;
+  const void* src;
+  const char* dst[2]; //it accepts at most two destinations
+  // acc_buf used for accumulation among sub Tensor Iterator when accumulation on
+  // output is not permissible
+  void* acc_buf;
+  // cta_buf used for accumulation between blocks during global reduction
+  void* cta_buf;
+  int* semaphores;
+  int64_t base_idx;
+  bool accumulate;
+  bool final_output;
+  int noutputs;
+
+  ReduceOp(
+      ops_t ops,
+      ReduceConfig config,
+      InputCalculator input_calc,
+      OutputCalculator output_calc,
+      const void* src,
+      char* dst0,
+      optional<char*> dst1,
+      void* acc_buf,
+      void* cta_buf,
+      int* semaphores,
+      arg_t ident,
+      int noutputs,
+      int64_t base_idx)
+      : ops(ops),
+        ident(ident),
+        config(config),
+        input_calc(input_calc),
+        output_calc(output_calc),
+        src(src),
+        acc_buf(acc_buf),
+        cta_buf(cta_buf),
+        semaphores(semaphores),
+        base_idx(base_idx),
+        noutputs(noutputs) {
+    dst[0] = dst0;
+    if (dst1.has_value()) {
+      dst[1] = dst1.value();
+    }
+  }
+
+  template <int output_vec_size>
+  C10_DEVICE void run() const {
+    extern __shared__ char shared_memory[];
+    index_t output_idx = config.output_idx();
+    index_t input_idx = config.input_idx();
+    auto base_offsets1 = output_calc.get(output_idx)[1];
+
+    using arg_vec_t = at::detail::Array<arg_t, output_vec_size>;
+    arg_vec_t value;
+
+    if (output_idx < config.num_outputs && input_idx < config.num_inputs) {
+      const scalar_t* input_slice = (const scalar_t*)((const char*)src + base_offsets1);
+      value = thread_reduce<output_vec_size>(input_slice);
+    }
+
+    if (config.should_block_y_reduce()) {
+      value = block_y_reduce<output_vec_size>(value, shared_memory);
+    }
+    if (config.should_block_x_reduce()) {
+      value = block_x_reduce<output_vec_size>(value, shared_memory);
+    }
+
+    using out_ptr_vec_t = at::detail::Array<out_scalar_t*, output_vec_size>;
+    using offset_vec_t = at::detail::Array<index_t, output_vec_size>;
+    offset_vec_t base_offsets;
+    out_ptr_vec_t out;
+
+    #pragma unroll
+    for (int i = 0; i < output_vec_size; i++) {
+      base_offsets[i] = output_calc.get(output_idx + i)[0];
+      out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]);
+    }
+
+    arg_vec_t* acc = nullptr;
+    if (acc_buf != nullptr) {
+      size_t numerator = sizeof(arg_t);
+      size_t denominator = sizeof(out_scalar_t);
+      reduce_fraction(numerator, denominator);
+      acc = (arg_vec_t*)((char*)acc_buf + (base_offsets[0] * numerator / denominator));
+    }
+
+    if (config.should_global_reduce()) {
+      value = global_reduce<output_vec_size>(value, acc, shared_memory);
+    } else if (config.should_store(output_idx)) {
+      if (accumulate) {
+        #pragma unroll
+        for (int i = 0; i < output_vec_size; i++) {
+          value[i] = ops.translate_idx(value[i], base_idx);
+        }
+      }
+
+      if (acc == nullptr) {
+        if (accumulate) {
+          value = accumulate_in_output<output_vec_size, can_accumulate_in_output>(out, value);
+        }
+        if (final_output) {
+          set_results_to_output(value, base_offsets);
+        } else {
+          #pragma unroll
+          for (int i = 0; i < output_vec_size; i++) {
+            *(out[i]) = get_accumulated_output<can_accumulate_in_output>(out[i], value[i]);
+          }
+        }
+      } else {
+        if (accumulate) {
+          #pragma unroll
+          for (int i = 0; i < output_vec_size; i++) {
+            value[i] = ops.combine((*acc)[i], value[i]);
+          }
+        }
+        if (final_output) {
+          set_results_to_output(value, base_offsets);
+        } else {
+          *acc = value;
+        }
+      }
+    }
+  }
+
+  template <int output_vec_size>
+  C10_DEVICE at::detail::Array<arg_t, output_vec_size> thread_reduce(const scalar_t* data) const {
+    if (config.vectorize_input) {
+      CUDA_KERNEL_ASSERT(output_vec_size == 1);
+      // reduce at the header of input_slice where memory is not aligned,
+      // so that thread_reduce will have an aligned memory to work on.
+      return {input_vectorized_thread_reduce_impl(data)};
+    } else {
+      index_t element_stride = input_calc.strides_[0][0] / sizeof(scalar_t);
+      bool is_contiguous = (input_calc.dims == 1 && element_stride == 1);
+      if (is_contiguous) {
+        return thread_reduce_impl<output_vec_size>(data, [](index_t idx) { return idx; });
+      } else if (input_calc.dims == 1) {
+        return thread_reduce_impl<output_vec_size>(data, [&](index_t idx) { return idx * element_stride; });
+      } else {
+        return thread_reduce_impl<output_vec_size>(data, [&](index_t idx) { return input_calc.get(idx)[0] / sizeof(scalar_t); });
+      }
+    }
+  }
+
+  C10_DEVICE arg_t input_vectorized_thread_reduce_impl(const scalar_t* data) const {
+    index_t end = config.num_inputs;
+
+    // Handle the head of input slice where data is not aligned
+    arg_t value = ident;
+    constexpr int align_bytes = alignof(at::native::memory::aligned_vector<scalar_t, input_vec_size>);
+    constexpr int align_elements = align_bytes / sizeof(scalar_t);
+    int shift = ((uint64_t)data) % align_bytes / sizeof(scalar_t);
+    if (shift > 0) {
+      data -= shift;
+      end += shift;
+      if(threadIdx.x >= shift && threadIdx.x < align_elements && config.should_reduce_tail()){
+        value = ops.reduce(value, c10::load(data + threadIdx.x), threadIdx.x - shift);
+      }
+      end -= align_elements;
+      data += align_elements;
+      shift = align_elements - shift;
+    }
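+    // Illustrative example (hypothetical values): for scalar_t = float and input_vec_size = 4,
+    // align_bytes = 16 and align_elements = 4. If `data` starts 8 bytes past a 16-byte boundary,
+    // shift = 2: the threads with threadIdx.x == 2 and 3 (when the tail should be reduced) pick up
+    // the two misaligned leading elements, `data` is advanced to the aligned boundary, and `end`
+    // and `shift` are adjusted so the indices handed to ops.reduce still refer to positions in the
+    // original slice.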
+
+    // Do the vectorized reduction
+    using load_t = at::native::memory::aligned_vector<scalar_t, input_vec_size>;
+
+    index_t idx = config.input_idx();
+    const index_t stride = config.step_input;
+
+    // Multiple accumulators to remove dependency between unrolled loops.
+    arg_t value_list[input_vec_size];
+    value_list[0] = value;
+
+    #pragma unroll
+    for (int i = 1; i < input_vec_size; i++) {
+      value_list[i] = ident;
+    }
+
+    while (idx * input_vec_size + input_vec_size - 1 < end) {
+      const auto values_vec = memory::load_vector<input_vec_size>(data, idx);
+      #pragma unroll
+      for (index_t i = 0; i < input_vec_size; i++) {
+        value_list[i] = ops.reduce(value_list[i], values_vec.val[i], shift + idx * input_vec_size + i);
+      }
+      idx += stride;
+    }
+
+    // tail
+    index_t tail_start = end - end % input_vec_size;
+    if (config.should_reduce_tail()) {
+      int idx = tail_start + threadIdx.x;
+      if (idx < end) {
+        const auto value = c10::load(data + idx);
+        value_list[0] = ops.reduce(value_list[0], value, idx + shift);
+      }
+    }
+
+    // combine accumulators
+    #pragma unroll
+    for (int i = 1; i < input_vec_size; i++) {
+      value_list[0] = ops.combine(value_list[0], value_list[i]);
+    }
+    return value_list[0];
+  }
+
+  template <int output_vec_size, typename offset_calc_t>
+  C10_DEVICE at::detail::Array<arg_t, output_vec_size> thread_reduce_impl(const scalar_t* data_, offset_calc_t calc) const {
+    index_t idx = config.input_idx();
+    const index_t end = config.num_inputs;
+    const index_t stride = config.step_input;
+
+    using arg_vec_t = at::detail::Array<arg_t, output_vec_size>;
+    using load_t = at::native::memory::aligned_vector<scalar_t, output_vec_size>;
+
+    // Multiple accumulators to remove dependency between unrolled loops.
+    arg_vec_t value_list[vt0];
+
+    #pragma unroll
+    for (int i = 0; i < vt0; i++) {
+      #pragma unroll
+      for (int j = 0; j < output_vec_size; j++) {
+        value_list[i][j] = ident;
+      }
+    }
+
+    load_t values[vt0];
+
+    while (idx + (vt0 - 1) * stride < end) {
+      #pragma unroll
+      for (index_t i = 0; i < vt0; i++) {
+        const auto offset = calc(idx + i * stride) / output_vec_size;
+        values[i] = memory::load_vector<output_vec_size>(data_, offset);
+      }
+      #pragma unroll
+      for (index_t i = 0; i < vt0; i++) {
+        #pragma unroll
+        for (index_t j = 0; j < output_vec_size; j++) {
+          value_list[i][j] = ops.reduce(value_list[i][j], values[i].val[j], idx + i * stride);
+        }
+      }
+      idx += stride * vt0;
+    }
+
+    // tail
+    int idx_ = idx;
+    #pragma unroll
+    for (index_t i = 0; i < vt0; i++) {
+      if (idx >= end) {
+        break;
+      }
+      const auto offset = calc(idx) / output_vec_size;
+      values[i] = memory::load_vector<output_vec_size>(data_, offset);
+      idx += stride;
+    }
+    idx = idx_;
+    #pragma unroll
+    for (index_t i = 0; i < vt0; i++) {
+      if (idx >= end) {
+        break;
+      }
+      #pragma unroll
+      for (index_t j = 0; j < output_vec_size; j++) {
+        value_list[i][j] = ops.reduce(value_list[i][j], values[i].val[j], idx);
+      }
+      idx += stride;
+    }
+
+    // combine accumulators
+    #pragma unroll
+    for (int i = 1; i < vt0; i++) {
+      #pragma unroll
+      for (index_t j = 0; j < output_vec_size; j++) {
+        value_list[0][j] = ops.combine(value_list[0][j], value_list[i][j]);
+      }
+    }
+    return value_list[0];
+  }
+
+  template <int output_vec_size>
+  C10_DEVICE at::detail::Array<arg_t, output_vec_size> block_x_reduce(at::detail::Array<arg_t, output_vec_size> value, char* shared_memory) const {
+    using args_vec_t = at::detail::Array<arg_t, output_vec_size>;
+    int dim_x = blockDim.x;
+    args_vec_t* shared = (args_vec_t*)shared_memory;
+    if (dim_x > warpSize) {
+      int address_base = threadIdx.x + threadIdx.y*blockDim.x;
+      shared[address_base] = value;
+      for (int offset = dim_x/2; offset >= warpSize; offset >>= 1) {
+        __syncthreads();
+        if (threadIdx.x < offset && threadIdx.x + offset < blockDim.x) {
+          args_vec_t other = shared[address_base + offset];
+          #pragma unroll
+          for (int i = 0; i < output_vec_size; i++) {
+            value[i] = ops.combine(value[i], other[i]);
+          }
+          shared[address_base] = value;
+        }
+      }
+      dim_x = warpSize;
+    }
+
+    __syncthreads();
+
+    for (int offset = 1; offset < dim_x; offset <<= 1) {
+      #pragma unroll
+      for (int i = 0; i < output_vec_size; i++) {
+        arg_t other = ops.warp_shfl_down(value[i], offset);
+        value[i] = ops.combine(value[i], other);
+      }
+    }
+    return value;
+  }
+
+  template <int output_vec_size>
+  C10_DEVICE at::detail::Array<arg_t, output_vec_size> block_y_reduce(at::detail::Array<arg_t, output_vec_size> value, char* shared_memory) const {
+    using args_vec_t = at::detail::Array<arg_t, output_vec_size>;
+    args_vec_t* shared = (args_vec_t*)shared_memory;
+    shared[config.shared_memory_offset(0)] = value;
+    for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) {
+      __syncthreads();
+      if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) {
+        args_vec_t other = shared[config.shared_memory_offset(offset)];
+        #pragma unroll
+        for (int i = 0; i < output_vec_size; i++) {
+          value[i] = ops.combine(value[i], other[i]);
+        }
+        shared[config.shared_memory_offset(0)] = value;
+      }
+    }
+    return value;
+  }
+
+  C10_DEVICE bool mark_block_finished() const {
+    __shared__ bool is_last_block_done_shared;
+
+    __syncthreads();
+    if (threadIdx.x == 0 && threadIdx.y == 0) {
+      int prev_blocks_finished = atomicAdd(&semaphores[blockIdx.x], 1);
+      is_last_block_done_shared = (prev_blocks_finished == gridDim.y - 1);
+    }
+
+    __syncthreads();
+
+    return is_last_block_done_shared;
+  }
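+  // The counter in semaphores[blockIdx.x] is bumped once per CTA along blockIdx.y; the CTA that
+  // observes gridDim.y - 1 previous arrivals knows every other CTA working on this output column
+  // has already published its partial result to the staging buffer and can safely do the final merge.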
+
+  template <int output_vec_size, bool can_acc>
+  C10_DEVICE at::detail::Array<arg_t, output_vec_size> accumulate_in_output(
+    at::detail::Array<out_scalar_t*, output_vec_size> out,
+    at::detail::Array<arg_t, output_vec_size> value,
+    typename std::enable_if<can_acc>::type* = nullptr
+  ) const {
+    at::detail::Array<arg_t, output_vec_size> ret;
+    #pragma unroll
+    for (int i = 0; i < output_vec_size; i++) {
+      ret[i] = ops.combine(*(out[i]), value[i]);
+    }
+    return ret;
+  }
+
+  template <bool can_acc>
+  C10_DEVICE out_scalar_t get_accumulated_output(
+    out_scalar_t* out, arg_t value,
+    typename std::enable_if<can_acc>::type* = nullptr
+  ) const {
+    CUDA_KERNEL_ASSERT(!final_output);
+    return (out_scalar_t)value;
+  }
+
+  // This function should never be called --
+  // it's the version of `accumulate_in_output`
+  // when accumulation in the output is not possible.
+  template <int output_vec_size, bool can_acc>
+  C10_DEVICE at::detail::Array<arg_t, output_vec_size> accumulate_in_output(
+    at::detail::Array<out_scalar_t*, output_vec_size>,
+    at::detail::Array<arg_t, output_vec_size>,
+    typename std::enable_if<!can_acc>::type* = nullptr
+  ) const {
+    CUDA_KERNEL_ASSERT(false);
+    return arg_t {};
+  }
+
+  // This function should never be called --
+  // it's the version of `get_accumulated_output`
+  // when accumulation in the output is not possible.
+  template <bool can_acc>
+  C10_DEVICE out_scalar_t get_accumulated_output(
+    out_scalar_t* out, arg_t value,
+    typename std::enable_if<!can_acc>::type* = nullptr
+  ) const {
+    CUDA_KERNEL_ASSERT(false);
+    return *out;
+  }
+
+  template <class T>
+  C10_DEVICE void set_results(const T x, const index_t base_offset) const {
+    CUDA_KERNEL_ASSERT(noutputs == 1);
+    auto res = (out_scalar_t*)((char*)dst[0] + base_offset);
+    *res = x;
+  }
+
+  //Currently implemented for max of two outputs
+  template <class T1, class T2>
+  C10_DEVICE void set_results(const thrust::pair<T1, T2> x, const index_t base_offset) const {
+    if (noutputs >= 1) {
+      auto res0 = (T1*)((char*)dst[0] + base_offset);
+      *res0 = x.first;
+    }
+    if (noutputs >= 2) {
+      // the base offset is computed assuming the element size is sizeof(T1), so we need to
+      // correct it to obtain the right base offset for the second (T2) output
+      auto res1 = (T2*) ((char *) dst[1] + base_offset / sizeof(T1) * sizeof(T2));
+      *res1 = x.second;
+    }
+  }
+
+  template <int output_vec_size>
+  C10_DEVICE void set_results_to_output(at::detail::Array<arg_t, output_vec_size> value, at::detail::Array<index_t, output_vec_size> base_offset) const {
+    CUDA_KERNEL_ASSERT(final_output);
+    #pragma unroll
+    for (int i = 0; i < output_vec_size; i++) {
+      set_results(ops.project(value[i]), base_offset[i]);
+    }
+  }
+
+  template <int output_vec_size>
+  C10_DEVICE at::detail::Array<arg_t, output_vec_size> global_reduce(at::detail::Array<arg_t, output_vec_size> value, at::detail::Array<arg_t, output_vec_size> *acc, char* shared_memory) const {
+    using arg_vec_t = at::detail::Array<arg_t, output_vec_size>;
+    using out_ptr_vec_t = at::detail::Array<out_scalar_t*, output_vec_size>;
+    using offset_vec_t = at::detail::Array<index_t, output_vec_size>;
+
+    arg_vec_t* reduce_buffer = (arg_vec_t*)cta_buf;
+    index_t output_idx = config.output_idx();
+    offset_vec_t base_offsets;
+    out_ptr_vec_t out;
+
+    #pragma unroll
+    for (int i = 0; i < output_vec_size; i++) {
+      base_offsets[i] = output_calc.get(output_idx + i)[0];
+      out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]);
+    }
+
+    bool should_store = config.should_store(output_idx);
+    if (should_store) {
+      index_t offset = config.staging_memory_offset(blockIdx.y);
+      reduce_buffer[offset] = value;
+    }
+
+    __threadfence(); // make sure writes are globally visible
+    __syncthreads(); // if multiple warps in this block wrote to staging, make sure they're all done
+    bool is_last_block_done = mark_block_finished();
+
+    if (is_last_block_done) {
+      value = ident;
+      if (config.should_block_x_reduce()) {
+        index_t input_offset = threadIdx.x + threadIdx.y * blockDim.x;
+        index_t step = blockDim.x * blockDim.y;
+        for (; input_offset < config.ctas_per_output; input_offset += step) {
+          index_t idx = config.staging_memory_offset(input_offset);
+          arg_vec_t next = reduce_buffer[idx];
+          #pragma unroll
+          for (int i = 0; i < output_vec_size; i++) {
+            value[i] = ops.combine(value[i], next[i]);
+          }
+        }
+      } else {
+        index_t input_offset = threadIdx.y;
+        index_t step = blockDim.y;
+        for (; input_offset < config.ctas_per_output; input_offset += step) {
+          index_t idx = config.staging_memory_offset(input_offset);
+          arg_vec_t next = reduce_buffer[idx];
+          #pragma unroll
+          for (int i = 0; i < output_vec_size; i++) {
+            value[i] = ops.combine(value[i], next[i]);
+          }
+        }
+      }
+      value = block_y_reduce<output_vec_size>(value, shared_memory);
+      if (config.should_block_x_reduce()) {
+        value = block_x_reduce<output_vec_size>(value, shared_memory);
+      }
+      if (should_store) {
+        if (accumulate) {
+          #pragma unroll
+          for (int i = 0; i < output_vec_size; i++) {
+            value[i] = ops.translate_idx(value[i], base_idx);
+          }
+        }
+
+        if (acc == nullptr) {
+          if (accumulate) {
+            value = accumulate_in_output<output_vec_size, can_accumulate_in_output>(out, value);
+          }
+          if (final_output) {
+            set_results_to_output(value, base_offsets);
+          } else {
+            #pragma unroll
+            for (int i = 0; i < output_vec_size; i++) {
+              *(out[i]) = get_accumulated_output<can_accumulate_in_output>(out[i], value[i]);
+            }
+          }
+        } else {
+          if (accumulate) {
+            #pragma unroll
+            for (int i = 0; i < output_vec_size; i++) {
+              value[i] = ops.combine((*acc)[i], value[i]);
+            }
+          }
+          if (final_output) {
+            set_results_to_output(value, base_offsets);
+          } else {
+            *acc = value;
+          }
+        }
+      }
+    }
+
+    return value;
+  }
+};
+
+template<int max_threads, typename R>
+static void launch_reduce_kernel(const ReduceConfig& config, const R& reduction) {
+  dim3 block = config.block();
+  dim3 grid = config.grid();
+
+  auto stream = at::cuda::getCurrentCUDAStream();
+  int shared_memory = config.shared_memory_size();
+
+  switch(config.output_vec_size) {
+  case 4:
+    reduce_kernel<max_threads / 4, 4, R><<<grid, block, shared_memory, stream>>>(reduction);
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+    break;
+  case 2:
+    reduce_kernel<max_threads / 2, 2, R><<<grid, block, shared_memory, stream>>>(reduction);
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+    break;
+  default:
+    reduce_kernel<max_threads / 1, 1, R><<<grid, block, shared_memory, stream>>>(reduction);
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+  }
+}
+
+inline void launch_jitted_reduce_kernel(
+    std::mutex &jiterator_mutex,
+    std::array<at::cuda::jit::NvrtcFunction, 3> &fn_cache,
+    const at::cuda::jit::KernelDescriptor &desc,
+    int vt0, const ReduceConfig& config, void *reduction) {
+  dim3 block = config.block();
+  dim3 grid = config.grid();
+
+  int shared_memory = config.shared_memory_size();
+  at::cuda::jit::NvrtcFunction* fn_ptr;
+  switch(config.output_vec_size) {
+  case 4:
+    fn_ptr = &fn_cache[0];
+    break;
+  case 2:
+    fn_ptr = &fn_cache[1];
+    break;
+  default:
+    fn_ptr = &fn_cache[2];
+  }
+  if (!fn_ptr->function) {
+    int max_threads_codegen =
+        max_reduce_threads(desc.f_inputs_type) / config.output_vec_size;
+    auto code = at::cuda::jit::generate_reduction_code(
+        desc, vt0, true, false, config.output_vec_size, max_threads_codegen);
+
+    *fn_ptr = at::cuda::jit::jit_pwise_function(code, "reduction_" + desc.name);
+  }
+  constexpr int kernel_args = 1;
+  void* args[kernel_args];
+  args[0] = reduction;
+  at::cuda::jit::launch_jitted_pwise_function(*fn_ptr, args, grid, block, shared_memory);
+}
+
+
+class AccumulationBuffer {
+ public:
+  AccumulationBuffer() {}
+
+  AccumulationBuffer(size_t acc_t_size, size_t out_t_size, char* out_ptr, int64_t size) {
+    out_ptr_ = (char*)out_ptr;
+    if (out_t_size >= acc_t_size) {
+      // reusing output buffer for accumulation.
+      acc_ptr_ = (char*)out_ptr;
+      numerator_ = 1;
+      denominator_ = 1;
+    } else {
+      auto& allocator = *c10::cuda::CUDACachingAllocator::get();
+      buffer_ = allocator.allocate(size);
+      acc_ptr_ = (char*)buffer_.get();
+      numerator_ = acc_t_size;
+      denominator_ = out_t_size;
+      reduce_fraction(numerator_, denominator_);
+    }
+  }
+
+  char* get_acc_slice(char* out_ptr) {
+    if (acc_ptr_ == nullptr) {
+      return nullptr;
+    }
+    return acc_ptr_ + ((out_ptr - out_ptr_) * numerator_ / denominator_);
+  }
+
+ private:
+  char* acc_ptr_ = nullptr;
+  char* out_ptr_ = nullptr;
+  size_t numerator_;
+  size_t denominator_;
+  at::DataPtr buffer_;
+};
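+// Illustrative example (hypothetical sizes): with acc_t = float (4 bytes) and out_t = at::Half
+// (2 bytes), a separate accumulation buffer is allocated and numerator_/denominator_ reduce to
+// 2/1, so an output element that lives `off` bytes past out_ptr_ gets its accumulator slot at
+// acc_ptr_ + off * 2.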
+
+template <typename scalar_t>
+int get_output_vec_size(const TensorIterator &iter) {
+  int vec_size = 4;
+  auto update_vec_size = [&vec_size](uint64_t n) {
+    while(n % vec_size != 0) {
+      vec_size /= 2;
+    }
+  };
+
+  uint64_t base_address = reinterpret_cast<uint64_t>(iter.data_ptr(iter.noutputs())) / sizeof(scalar_t);
+  update_vec_size(base_address);
+
+  const int output_index = iter.num_reduce_dims();
+  update_vec_size(iter.shape()[output_index]);
+
+  int j = 0;
+  for(auto i : iter.strides(iter.noutputs())) {
+    if (j != output_index) {
+      update_vec_size(i / sizeof(scalar_t));
+    }
+    j++;
+  }
+  return vec_size;
+}
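+// Illustrative example (hypothetical layout): for a float output whose base address is 16-byte
+// aligned, whose size along the first non-reduced dimension is divisible by 4, and whose other
+// output strides are multiples of 4 * sizeof(float), the loop above keeps vec_size at 4; if that
+// dimension had size 10 instead, vec_size would drop to 2.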
+
+template<typename arg_t, typename scalar_t, int vt0>
+ReduceConfig setReduceConfig(const TensorIterator& iter){
+  // Start by assuming that each thread handles a single output and all
+  // the inputs for that output.
+  int64_t num_outputs = iter.num_output_elements();
+  int64_t inputs_per_output = iter.numel() / num_outputs;
+  int input_index = iter.ntensors() - 1;
+
+  auto config = ReduceConfig(sizeof(arg_t), num_outputs, inputs_per_output);
+
+  int64_t dim0;
+  int64_t dim1;
+  int64_t fastest_moving_stride;
+  bool reduction_on_fastest_striding_dimension;
+
+  if (iter.ndim() > 0) {
+    // Adjust block size to map block width to fastest changing dimension of input
+    // tensor. This grants the best possible memory accessing pattern, given that
+    // for non-contiguous tensor with space in between, we cannot have perfect
+    // memory coalescing.
+    reduction_on_fastest_striding_dimension =
+        (iter.num_reduce_dims() == iter.ndim()) ||
+        (iter.strides(/*arg=*/input_index)[0] <
+        iter.strides(/*arg=*/input_index)[iter.num_reduce_dims()]);
+    // Notice that dim0 & dim1 does NOT guarantee any launch configuration here!
+    // dim0 & dim1 are more like the upper bound of the block dimension. The
+    // actual launch config and reduction scheme is determined by setting values
+    // to `config.input_mult` and `config.output_mult`.
+    // We try to max out dim1 so that we have enough threads per CTA to deliver
+    // performance for larger problem size.
+    if (reduction_on_fastest_striding_dimension) {
+      // Map block.x to the fastest reducing dimension. It implies:
+      //   1. block_x_reduce is required.
+      //   2. block.y now max out to num_outputs.
+      dim0 = inputs_per_output;
+      dim1 = num_outputs;
+      fastest_moving_stride = iter.strides(/*arg=*/input_index)[0];
+    } else {
+      // Map block.x to the fastest non reducing dimension. It implies:
+      //   1. block_x_reduce is turned off.
+      //   2. block.y now max out to inputs_per_output.
+      dim0 = num_outputs;
+      dim1 = inputs_per_output;
+      fastest_moving_stride = iter.strides(/*arg=*/input_index)[iter.num_reduce_dims()];
+    }
+  } else {
+    reduction_on_fastest_striding_dimension = true;
+    fastest_moving_stride = sizeof(scalar_t);
+    dim0 = 1;
+    dim1 = 1;
+  }
+
+  // We do vectorization to gain better memory access, there are two cases which we call
+  // "vectorize along input" and "vectorize along output". Note that the "input/output"
+  // here does not mean we are vectorizing load/store instructions. We always only vectorize
+  // load instructions.
+  //
+  // Case 1: "vectorize along input"
+  // This case happens when we are reducing along the fastest moving dimension. In this case,
+  // threads with the same threadIdx.y work on the same reduction cooperatively and will produce
+  // results for the same output. Values in each loaded vector therefore always correspond to the same output.
+  //
+  // Case 2: "vectorize along output"
+  // This case happens when the fastest moving dimension is not the dimension of reduction. In this
+  // case, threads with different threadIdx.x are independent and will produce results for different
+  // outputs. Values in each loaded vector therefore always correspond to different outputs.
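+  // Illustrative example (hypothetical shapes): summing a contiguous [4096, 4096] float tensor
+  // over dim=1 reduces along the fastest moving dimension, so the kernel vectorizes along the
+  // input (case 1); summing the same tensor over dim=0 leaves the fastest moving dimension
+  // un-reduced, so it vectorizes along the output instead (case 2).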
+  if (fastest_moving_stride == sizeof(scalar_t)) {
+    if (reduction_on_fastest_striding_dimension && dim0 > 128 && iter.num_reduce_dims() == 1 && vt0 >= ReduceConfig::input_vec_size) {
+      // Case 1: "vectorize along input"
+      // Note that if vt0 < ReduceConfig::input_vec_size, the register pressure could be high;
+      // in that case we should avoid vectorization.
+      config.vectorize_input = true;
+      dim0 /= config.input_vec_size;
+    } else if (!reduction_on_fastest_striding_dimension) {
+      // Case 2: "vectorize along output"
+      config.output_vec_size = get_output_vec_size<scalar_t>(iter);
+      dim0 /= config.output_vec_size;
+    }
+  }
+
+  // Adjust block_width and block_height
+  config.set_block_dimension(dim0, dim1);
+
+  int block_width = config.block_width;
+  int block_height = config.block_height;
+
+  if (iter.ndim() == 0 || reduction_on_fastest_striding_dimension) {
+    // Split the input across lanes if the input is contiguous in the reduced
+    // dimension. This will require reduction between threads using warp
+    // shuffle instructions and shared memory (if block_width > warpSize).
+    config.input_mult[0] = config.split_input(block_width);
+  } else {
+    // Otherwise split the output across lanes in a warp.
+    config.output_mult[0] = config.split_output(block_width);
+  }
+
+  constexpr int min_values_per_thread = 16;
+  constexpr int max_values_per_thread = 256;
+
+  if (config.values_per_thread() >= block_height * 16 || config.values_per_thread() >= max_values_per_thread) {
+    // Divide the input across warps in a thread-block, if that leaves at least
+    // 16 elements to be summed by each thread. This will require inter-warp
+    // reduction using shared memory.
+    config.input_mult[1] = config.split_input(block_height);
+  } else {
+    // Otherwise, each warp handles a separate output.
+    config.output_mult[1] = config.split_output(block_height);
+  }
+
+  const int blocks_per_sm = at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor / config.num_threads;
+  const int num_mp = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
+  const int target_grid_size = num_mp * blocks_per_sm;
+  int grid = config.grid().x;
+  if (config.input_mult[1] != 0 && config.values_per_thread() >= max_values_per_thread && grid <= target_grid_size) {
+    // Divide the input across thread-blocks if the amount of work per-thread
+    // is large enough and the size of the output is small enough. This will
+    // require a reduction using global memory.
+    // If we decide to split input across blocks, as long as we can get enough
+    // number of blocks (`target_grid_size`) to balance SM, we should still
+    // make the number of values per thread large for best performance.
+    int ctas_per_output1 = div_up(target_grid_size, grid);
+    int ctas_per_output2 = div_up(config.values_per_thread(), min_values_per_thread);
+    int ctas_per_output3 = div_up(config.values_per_thread(), max_values_per_thread);
+    // We want the minimum of ctas_per_output1 and ctas_per_output2, so that each thread can have
+    // a large number of values to deal with. But we don't want values_per_thread to be larger than
+    // max_values_per_thread
+    config.ctas_per_output = std::max(std::min(ctas_per_output1, ctas_per_output2), ctas_per_output3);
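+    // Illustrative arithmetic (hypothetical numbers): with values_per_thread() == 1024,
+    // min/max_values_per_thread == 16/256 and div_up(target_grid_size, grid) == 8, the three
+    // candidates are 8, 64 and 4, so ctas_per_output = max(min(8, 64), 4) = 8 and each thread
+    // is left with 1024 / 8 = 128 values, inside the [16, 256] window.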
+    if (config.ctas_per_output > 1) {
+      config.input_mult[2] = config.split_input(config.ctas_per_output);
+    }
+  }
+  return config;
+};
+
+template <typename scalar_t, typename out_scalar_t, int vt0=4, typename ops_t, typename ident_t=double>
+inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t ident=0,
+                              AccumulationBuffer* acc_buf_ptr=nullptr, int64_t base_idx=0) {
+  AT_ASSERT(iter.numel() > 0 && iter.ntensors() - iter.noutputs() == 1 && iter.noutputs() >= 1);
+
+  using traits = function_traits<decltype(&ops_t::reduce)>;
+  using arg_t = typename traits::template arg<0>::type;
+  // at::Half/at::ComplexHalf overflows easily as its range is very small.
+  // So when scalar_t and out_scalar_t are at::Half/at::ComplexHalf, we
+  // set can_accumulate_in_output to False.
+  static constexpr bool is_inp_out_type_half_or_chalf =
+      (std::is_same<at::Half, scalar_t>::value &&
+       std::is_same<at::Half, out_scalar_t>::value) ||
+      (std::is_same<c10::complex<at::Half>, scalar_t>::value &&
+       std::is_same<c10::complex<at::Half>, out_scalar_t>::value);
+  // at::BFloat16 has lower precision and can lead to rounding errors.
+  // So when scalar_t and out_scalar_t are at::BFloat16, we
+  // set can_accumulate_in_output to False.
+  static constexpr bool is_inp_out_type_bfloat16 =
+      (std::is_same<at::BFloat16, scalar_t>::value &&
+       std::is_same<at::BFloat16, out_scalar_t>::value);
+  static constexpr bool can_accumulate_in_output =
+      std::is_convertible<arg_t, out_scalar_t>::value &&
+      !(is_inp_out_type_half_or_chalf || is_inp_out_type_bfloat16);
+
+  bool can_use_32bit_indexing = iter.can_use_32bit_indexing();
+  std::unique_ptr<AccumulationBuffer> owned_buf_ptr;
+  // The acc_buf_ptr is a shared pointer. It is created on the first entry and
+  // reused by all recursive function calls.
+  if (acc_buf_ptr == NULL) {
+    // acc_buf_ptr holds buffer used for accumulation among multiple sub_iter
+    // when accumulation in output is not possible.
+    if (!can_accumulate_in_output && !can_use_32bit_indexing) {
+      int64_t output_memory_size = iter.element_size(0);
+      for (int dim = 0; dim < iter.ndim(); dim++) {
+        output_memory_size = std::max(output_memory_size, iter.shape()[dim] * iter.strides(0)[dim]);
+      }
+      output_memory_size /= iter.element_size(0); //iter.strides is in bytes
+      owned_buf_ptr.reset(new AccumulationBuffer(sizeof(arg_t),
+                                                 sizeof(out_scalar_t),
+                                                 (char*) iter.data_ptr(0),
+                                                 output_memory_size * sizeof(arg_t)));
+    } else {
+      owned_buf_ptr.reset(new AccumulationBuffer());
+    }
+    acc_buf_ptr = owned_buf_ptr.get();
+  }
+
+  if (!can_use_32bit_indexing) {
+    for (auto& sub_iter : iter.with_32bit_indexing()) {
+      int64_t sub_iter_base_idx = sub_iter.view_offsets()[0];
+
+      gpu_reduce_kernel<scalar_t, out_scalar_t, vt0>(sub_iter, ops, ident,
+          acc_buf_ptr, sub_iter_base_idx);
+    }
+    return;
+  }
+
+  const char* in_data = (char*)iter.data_ptr(iter.ntensors() - 1);
+  char* out_data = (char*)iter.data_ptr(0);
+  const auto noutputs = iter.noutputs();
+  optional<char*> out_data_extra;
+  if (noutputs > 1) {
+    out_data_extra = (char*)iter.data_ptr(1);
+  } else {
+    out_data_extra = nullopt;
+  }
+  char* acc_data = acc_buf_ptr->get_acc_slice(out_data);
+
+  ReduceConfig config = setReduceConfig<arg_t, scalar_t, vt0>(iter);
+  at::DataPtr buffer;
+  at::DataPtr semaphores;
+  if (config.should_global_reduce()) {
+    auto& allocator = *c10::cuda::CUDACachingAllocator::get();
+    buffer = allocator.allocate(config.global_memory_size());
+    semaphores = allocator.allocate(config.semaphore_size());
+
+    auto stream = at::cuda::getCurrentCUDAStream();
+    AT_CUDA_CHECK(cudaMemsetAsync(semaphores.get(), 0, config.semaphore_size(), stream));
+  }
+
+  AT_ASSERT(can_use_32bit_indexing);
+  auto output_calc = make_output_calculator<uint32_t>(iter);
+  auto input_calc = make_input_calculator<uint32_t>(iter);
+  auto reduce = ReduceOp<scalar_t, ops_t, uint32_t, out_scalar_t, vt0>(
+      ops,
+      config,
+      input_calc,
+      output_calc,
+      in_data,
+      out_data,
+      out_data_extra,
+      acc_data,
+      buffer.get(),
+      (int*)semaphores.get(),
+      ident,
+      noutputs,
+      base_idx);
+  reduce.accumulate = iter.should_accumulate();
+  reduce.final_output = iter.is_final_output();
+
+  launch_reduce_kernel<mnt_wrapper<scalar_t>::MAX_NUM_THREADS>(config, reduce);
+}
+
+//TODO this is 100 lines of almost-copy-paste, because we have to have different template args for this function
+//try unifying with gpu_reduce_kernel
+template <char const* name, typename scalar_t, typename out_scalar_t, int vt0=4, typename ident_t=double>
+inline void jitted_gpu_reduce_kernel(TensorIterator& iter, const std::string& func, ident_t ident=0,
+                              AccumulationBuffer* acc_buf_ptr=nullptr, int64_t base_idx=0) {
+  AT_ASSERT(iter.numel() > 0 && iter.ntensors() - iter.noutputs() == 1 && iter.noutputs() >= 1);
+
+  //TODO - this will be different for more complicated reductions, but for now reductions using
+  //func_wrapper all have arg_t = opmath
+  using arg_t = at::opmath_type<scalar_t>;
+  // at::Half/at::ComplexHalf overflows easily as its range is very small.
+  // So when scalar_t and out_scalar_t are at::Half/at::ComplexHalf, we
+  // set can_accumulate_in_output to False.
+  static constexpr bool is_inp_out_type_half_or_chalf =
+      (std::is_same<at::Half, scalar_t>::value &&
+       std::is_same<at::Half, out_scalar_t>::value) ||
+      (std::is_same<c10::complex<at::Half>, scalar_t>::value &&
+       std::is_same<c10::complex<at::Half>, out_scalar_t>::value);
+  // at::BFloat16 has lower precision and can lead to rounding errors.
+  // So when scalar_t and out_scalar_t are at::BFloat16, we
+  // set can_accumulate_in_output to False.
+  static constexpr bool is_inp_out_type_bfloat16 =
+      (std::is_same<at::BFloat16, scalar_t>::value &&
+       std::is_same<at::BFloat16, out_scalar_t>::value);
+  static constexpr bool can_accumulate_in_output =
+      std::is_convertible<arg_t, out_scalar_t>::value &&
+      !(is_inp_out_type_half_or_chalf || is_inp_out_type_bfloat16);
+
+  bool can_use_32bit_indexing = iter.can_use_32bit_indexing();
+  std::unique_ptr<AccumulationBuffer> owned_buf_ptr;
+
+  // The acc_buf_ptr is a shared pointer. It is created on the first entry and
+  // reused by all recursive function calls.
+  if (acc_buf_ptr == NULL) {
+    // acc_buf_ptr holds buffer used for accumulation among multiple sub_iter
+    // when accumulation in output is not possible.
+    if (!can_accumulate_in_output && !can_use_32bit_indexing) {
+      int64_t output_memory_size = iter.element_size(0);
+      for (int dim = 0; dim < iter.ndim(); dim++) {
+        output_memory_size = std::max(output_memory_size, iter.shape()[dim] * iter.strides(0)[dim]);
+      }
+      output_memory_size /= iter.element_size(0); //iter.strides is in bytes
+      owned_buf_ptr.reset(new AccumulationBuffer(sizeof(out_scalar_t), //TODO
+                                                 sizeof(out_scalar_t),
+                                                 (char*) iter.data_ptr(0),
+                                                 output_memory_size * sizeof(out_scalar_t))); //TODO
+    } else {
+      owned_buf_ptr.reset(new AccumulationBuffer());
+    }
+    acc_buf_ptr = owned_buf_ptr.get();
+  }
+
+  if (!can_use_32bit_indexing) {
+    for (auto& sub_iter : iter.with_32bit_indexing()) {
+      int64_t sub_iter_base_idx = sub_iter.view_offsets()[0];
+
+      jitted_gpu_reduce_kernel<name, scalar_t, out_scalar_t, vt0>(sub_iter, func, ident,
+          acc_buf_ptr, sub_iter_base_idx);
+    }
+    return;
+  }
+
+  //TODO - for now we support a single input, we may be able to relax this constraint
+  const char* in_data = (char*)iter.data_ptr(iter.ntensors() - 1);
+  char* out_data = (char*)iter.data_ptr(0);
+  const auto noutputs = iter.noutputs();
+  optional<char*> out_data_extra;
+  if (noutputs > 1) {
+    out_data_extra = (char*)iter.data_ptr(1);
+  } else {
+    out_data_extra = nullopt;
+  }
+  char* acc_data = acc_buf_ptr->get_acc_slice(out_data);
+
+  ReduceConfig config = setReduceConfig<arg_t, scalar_t, vt0>(iter);
+
+  at::DataPtr buffer;
+  at::DataPtr semaphores;
+  if (config.should_global_reduce()) {
+    auto& allocator = *c10::cuda::CUDACachingAllocator::get();
+    buffer = allocator.allocate(config.global_memory_size());
+    semaphores = allocator.allocate(config.semaphore_size());
+
+    auto stream = at::cuda::getCurrentCUDAStream();
+    AT_CUDA_CHECK(cudaMemsetAsync(semaphores.get(), 0, config.semaphore_size(), stream));
+  }
+
+  AT_ASSERT(can_use_32bit_indexing);
+  auto output_calc = make_output_calculator<uint32_t>(iter);
+  auto input_calc = make_input_calculator<uint32_t>(iter);
+  auto reduce = ReduceJitOp<scalar_t, out_scalar_t, vt0>(
+      config,
+      input_calc,
+      output_calc,
+      in_data,
+      out_data,
+      out_data_extra,
+      acc_data,
+      buffer.get(),
+      (int*)semaphores.get(),
+      ident,
+      noutputs,
+      base_idx);
+  reduce.accumulate = iter.should_accumulate();
+  reduce.final_output = iter.is_final_output();
+
+  constexpr int nInputs = 1;
+  constexpr int nOutputs = 1;
+  static auto desc = at::cuda::jit::make_kernel_descriptor<
+    out_scalar_t, scalar_t>(name, func, nInputs, nOutputs);
+
+  static std::mutex jiterator_mutex;
+  static std::vector<std::array<at::cuda::jit::NvrtcFunction, 3>> fn_cache(c10::cuda::device_count());
+  auto &cache = fn_cache[iter.device().index()];
+
+  launch_jitted_reduce_kernel(
+      jiterator_mutex, cache, desc, vt0, config, &reduce);
+}
+
+}} // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/ReduceOps.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/ReduceOps.h
new file mode 100644
index 0000000000000000000000000000000000000000..e5ef4c050130397dea644386896e7795a2035033
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/ReduceOps.h
@@ -0,0 +1,20 @@
+
+namespace at {
+struct TensorIterator;
+}
+
+namespace c10 {
+class Scalar;
+}
+
+namespace at { namespace native {
+
+void norm_launch_kernel(TensorIterator &iter, double val);
+void min_launch_kernel(TensorIterator &iter);
+void max_launch_kernel(TensorIterator &iter);
+void aminmax_launch_kernel(TensorIterator &iter);
+void min_all_launch_kernel(TensorIterator &iter);
+void max_all_launch_kernel(TensorIterator &iter);
+void aminmax_allreduce_launch_kernel(TensorIterator &iter);
+
+}}  // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Resize.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Resize.h
new file mode 100644
index 0000000000000000000000000000000000000000..9740ed43ff5288b9ffe3f6666d1ce274500a83b8
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Resize.h
@@ -0,0 +1,61 @@
+#pragma once
+
+#include 
+#include 
+
+#include 
+
+namespace at { namespace native {
+
+TORCH_CUDA_CPP_API void resize_bytes_cuda(StorageImpl* storage, size_t size_bytes);
+
+static inline void maybe_resize_storage_cuda(TensorImpl* self, size_t new_size_bytes) {
+  // It does not make sense to try to resize a storage
+  // to hold 0 elements, and this can break
+  // if storage_offset is positive but
+  // new_size is 0, so just bail in that case
+  // (same comment is in Resize.h)
+  if (self->numel() == 0) {
+    return;
+  }
+
+  const Storage &storage = self->unsafe_storage();
+  TORCH_CHECK(storage, "Tensor: invalid null storage");
+  if (new_size_bytes > storage.nbytes()) {
+    resize_bytes_cuda(storage.unsafeGetStorageImpl(), new_size_bytes);
+  }
+}
+
+inline TensorImpl* resize_impl_cuda_(
+    TensorImpl* self,
+    IntArrayRef size,
+    at::OptionalIntArrayRef stride,
+    bool device_guard = true) {
+  if (self->sizes() == size && (!stride || self->strides() == stride)) {
+    return self;
+  }
+
+  // NB: We don't need to hold the device guard when calling from TH
+  cuda::OptionalCUDAGuard guard;
+  if (device_guard) {
+    guard.set_index(self->storage().device().index());
+  }
+
+  const auto itemsize = self->dtype().itemsize();
+  const auto storage_offset = self->storage_offset();
+  size_t storage_size = 1;
+  if (stride) {
+    self->set_sizes_and_strides(size, *stride);
+    storage_size = at::detail::computeStorageNbytes(
+        size, *stride, itemsize, storage_offset);
+  } else {
+    self->set_sizes_contiguous(size);
+    storage_size = at::detail::computeStorageNbytesContiguous(
+        size, itemsize, storage_offset);
+  }
+  maybe_resize_storage_cuda(self, storage_size);
+
+  return self;
+}
+
+}}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/ScanKernels.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/ScanKernels.h
new file mode 100644
index 0000000000000000000000000000000000000000..fbc3d974cf9684205036fa4d71f4fbebbfd1ed52
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/ScanKernels.h
@@ -0,0 +1,18 @@
+#pragma once
+#include <cstdint>
+
+namespace at {
+class TensorBase;
+
+namespace native {
+
+// NOTE: these functions require output tensors to be contiguous
+void launch_cummax_cuda_kernel(const TensorBase& self, const TensorBase& values,
+                               const TensorBase& indices, int64_t dim);
+void launch_cummin_cuda_kernel(const TensorBase& self, const TensorBase& values,
+                               const TensorBase& indices, int64_t dim);
+void launch_logcumsumexp_cuda_kernel(const TensorBase& result, const TensorBase& self, int64_t dim);
+void launch_cumsum_cuda_kernel(const TensorBase& result, const TensorBase& self, int64_t dim);
+void launch_cumprod_cuda_kernel(const TensorBase& result, const TensorBase& self, int64_t dim);
+
+}}  // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/ScanUtils.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/ScanUtils.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..19683e3d030d3246d8895754a84046d4f6906fae
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/ScanUtils.cuh
@@ -0,0 +1,459 @@
+#pragma once
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+
+namespace at {
+namespace native {
+
+template<typename integer>
+constexpr inline integer ceil_div(integer n, integer m) {
+  return (n + m - 1) / m;
+}
+
+template<typename integer>
+constexpr inline integer get_log_num_threads_x_inner_scan(integer num_rows, integer row_size) {
+  integer log_num_threads_x = 0;
+  integer log_num_threads_y = 0;
+  while (((integer)1 << log_num_threads_x) < row_size) {
+    ++log_num_threads_x;
+  }
+  while (((integer)1 << log_num_threads_y) < num_rows) {
+    ++log_num_threads_y;
+  }
+  // we want to keep the ratio between the x-threads and y-threads about the same as
+  // the ratio between the row_size and num_rows, but the total number of threads in
+  // a block should be about 512
+  integer diff = log_num_threads_x - log_num_threads_y;
+  // 9 is from log2(512)
+  log_num_threads_x = ((integer)9 + diff) / (integer)2;
+  // I found that having a larger log_num_threads_x can give a significant speed-up in some cases,
+  // but be detrimental in other cases, so just keep the lower bound at log2(16) == 4 to keep it
+  // similar to the previous implementation
+  // Keeping the upper bound to be log2(512) == 9 as the maximum number of threads in a block.
+  log_num_threads_x = std::min(std::max((integer)4, log_num_threads_x), (integer)9);
+  return log_num_threads_x;
+}
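+// Illustrative example (hypothetical sizes): for num_rows = 8 and row_size = 1024 the loops give
+// log_num_threads_x = 10 and log_num_threads_y = 3, so diff = 7 and log_num_threads_x becomes
+// (9 + 7) / 2 = 8, which already lies inside [4, 9]: the block then uses 256 threads in x and
+// 512 / 256 = 2 threads in y.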
+
+template<typename scalar_t, typename idx_t, typename BinaryOperation>
+__device__ void binary_op_update(const scalar_t lhs, scalar_t& rhs, const idx_t lhs_idx, idx_t& rhs_idx, BinaryOperation binary_op) {
+  if(!at::_isnan(rhs) && (at::_isnan(lhs) || !binary_op(rhs, lhs))) {
+    rhs = lhs;
+    rhs_idx = lhs_idx;
+  }
+}
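+// In other words, the running result rhs is replaced by the incoming lhs when lhs is NaN (so NaNs
+// propagate through the scan) or when binary_op(rhs, lhs) is false, i.e. rhs does not already beat
+// lhs under the comparison; a NaN already stored in rhs is kept as-is. For a cummax-style scan,
+// binary_op would be a greater-than style comparison (an illustrative assumption, not fixed here).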
+/* Perform an inclusive scan along the innermost dimension of a tensor.
+ *
+ * - num_rows is the size of the flattened outer dimensions;
+ * - row_size is the size of the innermost dimension;
+ *
+ * The outer dimensions of the tensor are considered as a single dimension, i.e. the tensor is
+ * considered as having 'num_rows' rows of size 'row_size'.
+ * Each thread block processes one or more sets of contiguous rows (processing multiple rows
+ * per thread block is quicker than processing a single row, especially for short rows).
+ */
+template<typename scalar_t, class BinaryFunction>
+__global__ void tensor_kernel_scan_innermost_dim_with_indices(const scalar_t *self_, scalar_t *values_, int64_t *indices_,
+                                                int num_rows, int row_size,
+                                                const uint32_t num_threads, const uint32_t log_num_threads_x,
+                                                scalar_t init, BinaryFunction binary_op) {
+  // dynamic memory allocation for vbuf and ibuf
+  alignas(sizeof(double)) extern __shared__ char buf[];
+  scalar_t* vbuf = reinterpret_cast<scalar_t*>(buf); // the size is num_threads * 2
+  int64_t* ibuf = reinterpret_cast<int64_t*>(vbuf + num_threads * 2);
+  const uint32_t num_threads_x = 1 << log_num_threads_x;
+  scalar_t* row_buf = vbuf + 2 * num_threads_x * threadIdx.y;
+  int64_t* row_idx_buf = ibuf + 2 * num_threads_x * threadIdx.y;
+
+  for (int block_row = blockIdx.x * blockDim.y;
+       block_row < num_rows;
+       block_row += blockDim.y * gridDim.x) {
+    int row = block_row + threadIdx.y;
+    const scalar_t *row_self = self_ + row * row_size;
+    scalar_t *row_values = values_ + row * row_size;
+    int64_t *row_indices = indices_ + row * row_size;
+    scalar_t block_total = init;
+    int64_t block_idx_final = 0;
+    const bool row_exists = row < num_rows;
+    // Perform scan on one block at a time, keeping track of the total value of
+    // all blocks processed so far.
+    for (int block_col = 0; block_col < row_size; block_col += 2 * num_threads_x) {
+      // Load data into shared memory (two values per thread).
+      int col1 = block_col + threadIdx.x;
+      int col2 = block_col + num_threads_x + threadIdx.x;
+      if (row_exists) {
+        if (col1 < row_size) {
+          row_buf[threadIdx.x] = c10::load(&row_self[col1]);
+          row_idx_buf[threadIdx.x] = col1;
+        } else {
+          row_buf[threadIdx.x] = init;
+          // No need to set the index here as the value in init will never be selected
+        }
+
+        if (col2 < row_size) {
+          row_buf[num_threads_x + threadIdx.x] = c10::load(&row_self[col2]);
+          row_idx_buf[num_threads_x + threadIdx.x] = col2;
+        } else {
+          row_buf[num_threads_x + threadIdx.x] = init;
+          // No need to set the index here as the value in init will never be selected
+        }
+
+        // Add the total value of all previous blocks to the first value of this block.
+        if (threadIdx.x == 0) {
+          binary_op_update(block_total, row_buf[0], block_idx_final, row_idx_buf[0], binary_op);
+        }
+      }
+      __syncthreads();
+
+      // Parallel reduction with Sklansky method. The diagram can be seen on this paper:
+      // https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back
+      for (uint32_t s = 1; s <= num_threads_x; s <<= 1) {
+        if (row_exists) {
+          uint32_t a = (threadIdx.x / s) * (2 * s) + s;
+          uint32_t ti = a + (threadIdx.x % s);
+          uint32_t si = a - 1;
+          binary_op_update(row_buf[si], row_buf[ti], row_idx_buf[si], row_idx_buf[ti], binary_op);
+        }
+        __syncthreads();
+      }
+
+      // Write back to output.
+      if (row_exists) {
+        if (col1 < row_size){
+          row_values[col1] = row_buf[threadIdx.x];
+          row_indices[col1] = row_idx_buf[threadIdx.x];
+        }
+        if (col2 < row_size) {
+          row_values[col2] = row_buf[num_threads_x + threadIdx.x];
+          row_indices[col2] = row_idx_buf[num_threads_x + threadIdx.x];
+        }
+      }
+      block_total = row_buf[2 * num_threads_x - 1];
+      block_idx_final = row_idx_buf[2 * num_threads_x - 1];
+      __syncthreads();
+    }
+  }
+}
+
+/* Perform an inclusive scan along an outer dimension of a tensor.
+ *
+ * - num_orows is the size of the flattened outer dimensions;
+ * - num_irows is the size of the flattened inner dimensions;
+ * - row_size is the size of the dimension along which to compute the variance;
+ *
+ * The dimensions to the outside and inside of the specified dimension are considered as flattened.
+ * Thread blocks with the same blockIdx.y process an "outer row" (i.e. an element of the flattened
+ * outer dimensions, which contains several "inner rows").
+ * Each thread processes a single inner row at a time.
+ */
+template<typename scalar_t, class BinaryFunction>
+__global__ void tensor_kernel_scan_outer_dim_with_indices(const scalar_t *self_, scalar_t *values_, int64_t *indices_,
+                  const uint32_t num_orows, const uint32_t num_irows, const uint32_t row_size, scalar_t init, BinaryFunction binary_op) {
+  for (uint32_t orow = blockIdx.x; orow < num_orows; orow += gridDim.x) {
+    for (uint32_t irow = blockIdx.y * blockDim.x + threadIdx.x; irow < num_irows; irow += gridDim.y * blockDim.x) {
+      const scalar_t *self = self_ + orow * row_size * num_irows + irow;
+      scalar_t *values = values_ + orow * row_size * num_irows + irow;
+      int64_t *indices = indices_ + orow * row_size * num_irows + irow;
+      scalar_t out = init;
+      int64_t out_idx = 0;
+
+      for (auto col = decltype(row_size){0}; col < row_size; ++col) {
+        const auto val = c10::load(self);
+        if(at::_isnan(val) || (!at::_isnan(out) && binary_op(val, out))) {
+          out = val;
+          out_idx = col;
+        }
+        *values = out;
+        *indices = out_idx;
+        self += num_irows;
+        values += num_irows;
+        indices += num_irows;
+      }
+    }
+  }
+}
+
+inline void check_fits_in_unsigned(int64_t val, const char* name) {
+  constexpr auto umax = std::numeric_limits<uint32_t>::max();
+  TORCH_CHECK(
+      val >= 0 && val <= umax, name, " must fit in a 32-bit uint32_t value");
+}
+
+
+template<typename scalar_t, class BinaryFunction>
+__host__ void scan_outer_dim_with_indices(
+    const TensorBase& self, const TensorBase& values, const TensorBase& indices,
+    int dim, scalar_t init, BinaryFunction binary_op) {
+  int64_t row_size = self.size(dim);
+  auto sizes = self.sizes();
+
+  // Treat all outer dimensions (i.e. dim_ < dim) as one.
+  const int64_t num_orows = c10::multiply_integers(sizes.begin(), sizes.begin() + dim);
+
+  // Treat all inner dimensions (i.e. dim > dimension) as one.
+  const int64_t num_irows = c10::multiply_integers(sizes.begin() + dim + 1, sizes.end());
+  //for performance reasons, cuda kernels use uint32_t for loops over irows, orows and row,
+  //make sure that input is not bigger than supported by uint32_t
+  check_fits_in_unsigned(num_irows, "num_irows");
+  check_fits_in_unsigned(num_orows, "num_orows");
+  check_fits_in_unsigned(row_size, "row_size");
+
+
+  dim3 threads(std::min(512, int(num_irows)));
+  int64_t maxGridDim = at::cuda::getCurrentDeviceProperties()->maxGridSize[1];
+  dim3 grid(std::min(maxGridDim, num_orows), std::min(maxGridDim, ceil_div(num_irows, int64_t{threads.x})));
+  tensor_kernel_scan_outer_dim_with_indices<scalar_t><<<grid, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
+    self.const_data_ptr<scalar_t>(), values.mutable_data_ptr<scalar_t>(), indices.mutable_data_ptr<int64_t>(),
+    num_orows, num_irows, row_size, init, binary_op);
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+}
+
+template <typename scalar_t, class BinaryFunction>
+__host__ void scan_innermost_dim_with_indices(
+    const TensorBase& self, const TensorBase& values, const TensorBase& indices,
+    scalar_t init, BinaryFunction binary_op) {
+  int ndim = self.dim();
+  // Treat all outer dimensions as a single dimension.
+  int row_size = self.size(ndim - 1);
+  int num_rows = self.numel() / row_size;
+
+  // assuming max_num_threads per block is 512
+  const uint32_t num_threads = 512;
+  const uint32_t log_num_threads_x = get_log_num_threads_x_inner_scan(num_rows, row_size);
+  const uint32_t num_threads_x = (1 << log_num_threads_x);
+  const uint32_t num_threads_y = num_threads / num_threads_x;
+  dim3 threads(num_threads_x, num_threads_y);
+  dim3 grid(std::min(at::cuda::getCurrentDeviceProperties()->maxGridSize[0], ceil_div(num_rows, int(threads.y))));
+
+  const uint32_t mem_size = 2 * num_threads * (sizeof(scalar_t) + sizeof(int64_t));
+  tensor_kernel_scan_innermost_dim_with_indices<scalar_t><<<grid, threads, mem_size, at::cuda::getCurrentCUDAStream()>>>(
+    self.const_data_ptr<scalar_t>(), values.mutable_data_ptr<scalar_t>(), indices.mutable_data_ptr<int64_t>(),
+    num_rows, row_size, num_threads, log_num_threads_x, init, binary_op);
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+}
+
+template<typename scalar_t, typename BinaryFunction>
+void scan_dim_with_indices(const TensorBase& self, const TensorBase& values, const TensorBase& indices, //int64_t dim) {
+     int64_t dim, scalar_t init, BinaryFunction binary_op) {
+  int ndim = self.dim();
+  auto self_ = self.expect_contiguous();
+  TORCH_INTERNAL_ASSERT(values.is_contiguous() && indices.is_contiguous());
+  if (dim == ndim - 1) {
+    scan_innermost_dim_with_indices(*self_, values, indices, init, binary_op);
+  } else {
+    scan_outer_dim_with_indices(*self_, values, indices, dim, init, binary_op);
+  }
+}
+
+// TODO: The implementation of `tensor_kernel_scan_outer_dim` and
+// `tensor_kernel_scan_innermost_dim` is similar to
+// `tensor_kernel_scan_outer_dim_with_indices`
+// `tensor_kernel_scan_outer_dim_with_indices` and should be refactored to
+// remove the duplication.
+
+/* Perform an inclusive scan along an outer dimension of a tensor.
+ *
+ * - num_orows is the size of the flattened outer dimensions;
+ * - num_irows is the size of the flattened inner dimensions;
+ * - row_size is the size of the dimension along which to scan;
+ *
+ * The dimensions to the outside and inside of the specified dimension are considered as flattened.
+ * Thread blocks with the same blockIdx.y process an "outer row" (i.e. an element of the flattened
+ * outer dimensions, which contains several "inner rows").
+ * Each thread processes a single inner row at a time.
+ */
+template<typename scalar_t, class BinaryOp>
+__global__ void tensor_kernel_scan_outer_dim(scalar_t *tgt_, const scalar_t *src_,
+                                              const uint32_t num_orows, const uint32_t num_irows, const uint32_t row_size,
+                                              const scalar_t init, BinaryOp binary_op)
+{
+  for (uint32_t orow = blockIdx.x; orow < num_orows; orow += gridDim.x) {
+    for (uint32_t irow = blockIdx.y * blockDim.x + threadIdx.x; irow < num_irows; irow += gridDim.y * blockDim.x) {
+      const scalar_t *src = src_ + orow * row_size * num_irows + irow;
+      scalar_t *tgt = tgt_ + orow * row_size * num_irows + irow;
+      scalar_t acc = init;
+
+      for (uint32_t col = 0; col < row_size; ++col) {
+        acc = binary_op(acc, c10::load(src));
+        *tgt = acc;
+
+        src += num_irows;
+        tgt += num_irows;
+      }
+    }
+  }
+}
+
+/* Perform an inclusive scan along the innermost dimension of a tensor.
+ *
+ * - num_rows is the size of the flattened outer dimensions;
+ * - row_size is the size of the innermost dimension;
+ *
+ * The outer dimensions of the tensor are considered as a single dimension, i.e. the tensor is
+ * considered as having 'num_rows' rows of size 'row_size'.
+ * Each thread block processes one or more sets of contiguous rows (processing multiple rows
+ * per thread block is quicker than processing a single row, especially for short rows).
+ */
+template<typename T, class BinaryFunction>
+__device__ void tensor_kernel_scan_innermost_dim_impl(T* row_buf, T *tgt_, const T *src_,
+                                      const uint32_t num_rows, const uint32_t row_size,
+                                      const uint32_t log_num_threads_x,
+                                      T init, BinaryFunction binary_op){
+  const uint32_t num_threads_x = 1 << log_num_threads_x;
+  for (uint32_t block_row = blockIdx.x * blockDim.y;
+       block_row < num_rows;
+       block_row += blockDim.y * gridDim.x) {
+    uint32_t row = block_row + threadIdx.y;
+    T block_total = init;
+
+    const T *row_src = src_ + row * row_size;
+    T *row_tgt = tgt_ + row * row_size;
+    const bool row_exists = row < num_rows;
+
+    // Perform scan on one block at a time, keeping track of the total value of
+    // all blocks processed so far.
+    for (uint32_t block_col = 0; block_col < row_size; block_col += 2 * num_threads_x) {
+      // Load data into shared memory (two values per thread).
+      uint32_t col1 = block_col + threadIdx.x;
+      uint32_t col2 = block_col + num_threads_x + threadIdx.x;
+      if (row_exists) {
+        if (col1 < row_size) {
+          row_buf[threadIdx.x] = row_src[col1];
+        } else {
+          row_buf[threadIdx.x] = init;
+        }
+
+        if (col2 < row_size) {
+          row_buf[num_threads_x + threadIdx.x] = row_src[col2];
+        } else {
+          row_buf[num_threads_x + threadIdx.x] = init;
+        }
+
+        // Add the total value of all previous blocks to the first value of this block.
+        if (threadIdx.x == 0) {
+          row_buf[0] = binary_op(row_buf[0], block_total);
+        }
+      }
+      __syncthreads();
+
+      // Parallel reduction with Sklansky method. The diagram can be seen on this paper:
+      // https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back
+      for (uint32_t m = 0; m <= log_num_threads_x; ++m) {
+        if (row_exists) {
+          uint32_t s = 1 << m; // s = 2 ^ m
+          uint32_t a = ((threadIdx.x >> m) << (m + 1)) | s; // a = (threadIdx.x / s) * (2 * s) + s
+          uint32_t ti = a + (threadIdx.x % s);
+          uint32_t si = a - 1;
+          row_buf[ti] = binary_op(row_buf[ti], row_buf[si]);
+        }
+        __syncthreads();
+      }
+
+      // Write back to output.
+      if (row_exists) {
+        if (col1 < row_size) row_tgt[col1] = row_buf[threadIdx.x];
+        if (col2 < row_size) row_tgt[col2] = row_buf[num_threads_x + threadIdx.x];
+      }
+      block_total = row_buf[2 * num_threads_x - 1];
+      __syncthreads();
+    }
+  }
+}
+
+template <
+    typename T,
+    class BinaryFunction>
+__global__ void tensor_kernel_scan_innermost_dim(
+    T* tgt_,
+    const T* src_,
+    const uint32_t num_rows,
+    const uint32_t row_size,
+    const uint32_t log_num_threads_x,
+    T init,
+    BinaryFunction binary_op) {
+  alignas(sizeof(double)) extern __shared__ char sbuf[];
+  T* sbuf2 = reinterpret_cast<T*>(sbuf);
+  const uint32_t num_threads_x = 1 << log_num_threads_x;
+  T* row_buf = reinterpret_cast<T*>(sbuf2 + num_threads_x * 2 * threadIdx.y);
+
+  tensor_kernel_scan_innermost_dim_impl<T>(
+      row_buf, tgt_, src_, num_rows, row_size, log_num_threads_x, init, binary_op);
+}
+
+
+template <typename scalar_t, class BinaryFunction>
+__host__ void scan_outer_dim(const TensorBase& self, const TensorBase& result,
+                             int dim, scalar_t init, BinaryFunction binary_op) {
+  const int64_t row_size = self.size(dim);
+  auto sizes = self.sizes();
+
+  // Treat all outer dimensions (i.e. dim_ < dim) as one.
+  const int64_t num_orows = c10::multiply_integers(sizes.begin(), sizes.begin() + dim);
+
+  // Treat all inner dimensions (i.e. dim > dimension) as one.
+  const int64_t num_irows = c10::multiply_integers(sizes.begin() + dim + 1, sizes.end());
+
+  dim3 threads(std::min(512, int(num_irows)));
+  int64_t maxGridDim = at::cuda::getCurrentDeviceProperties()->maxGridSize[1];
+  dim3 grid(std::min(maxGridDim, num_orows), std::min(maxGridDim, ceil_div(num_irows, int64_t{threads.x})));
+
+  check_fits_in_unsigned(num_irows, "num_irows");
+  check_fits_in_unsigned(num_orows, "num_orows");
+  check_fits_in_unsigned(row_size, "row_size");
+
+  tensor_kernel_scan_outer_dim<scalar_t><<<grid, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
+    result.mutable_data_ptr<scalar_t>(), self.const_data_ptr<scalar_t>(),
+    num_orows, num_irows, row_size, init, binary_op);
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+}
+
+template <typename scalar_t, class BinaryFunction>
+void scan_innermost_dim(const TensorBase& self, const TensorBase& result,
+                        scalar_t init, BinaryFunction binary_op) {
+  int64_t ndim = self.dim();
+  // Treat all outer dimensions as a single dimension.
+  int64_t row_size = self.size(ndim - 1);
+  int64_t num_rows = self.numel() / row_size;
+
+  // assuming max_num_threads per block is 512
+  const uint32_t num_threads = 512;
+  const uint32_t log_num_threads_x = get_log_num_threads_x_inner_scan(num_rows, row_size);
+  const uint32_t num_threads_x = (1 << log_num_threads_x);
+  const uint32_t num_threads_y = num_threads / num_threads_x;
+  dim3 threads(num_threads_x, num_threads_y);
+  int64_t maxGridDim = at::cuda::getCurrentDeviceProperties()->maxGridSize[0];
+  dim3 grid(std::min(maxGridDim, ceil_div(num_rows, int64_t{threads.y})));
+
+  check_fits_in_unsigned(num_rows, "Number of rows (self.numel()/self.size(self.dim()-1))");
+  check_fits_in_unsigned(row_size, "row_size");
+
+  tensor_kernel_scan_innermost_dim<scalar_t><<<grid, threads, num_threads * 2 * sizeof(scalar_t),
+                                               at::cuda::getCurrentCUDAStream()>>>(
+    result.mutable_data_ptr<scalar_t>(), self.const_data_ptr<scalar_t>(),
+    num_rows, row_size, log_num_threads_x, init, binary_op);
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+}
+
+template <typename scalar_t, typename BinaryFunction>
+void scan_dim(const TensorBase& self, const TensorBase& result,
+     int64_t dim, scalar_t init, BinaryFunction binary_op) {
+  int ndim = self.dim();
+  auto self_ = self.expect_contiguous();
+  TORCH_INTERNAL_ASSERT(result.is_contiguous());
+
+  if (self.numel() == self.size(dim)) {
+    cuda::cub::inclusive_scan(self_->const_data_ptr<scalar_t>(), result.mutable_data_ptr<scalar_t>(), binary_op, self.numel());
+  } else if (dim == ndim - 1) {
+    scan_innermost_dim(*self_, result, init, binary_op);
+  } else {
+    scan_outer_dim(*self_, result, dim, init, binary_op);
+  }
+}
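+// Dispatch example (illustrative, assuming a cumsum-style caller): for a
+// contiguous [4, 5] tensor,
+//   scan_dim(self, result, /*dim=*/1, ...)  takes the scan_innermost_dim path
+//   scan_dim(self, result, /*dim=*/0, ...)  takes the scan_outer_dim path
+// and when numel() == size(dim) (all other dimensions are size 1) the single
+// cub inclusive_scan path is taken instead.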
+
+}}  // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Sort.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Sort.h
new file mode 100644
index 0000000000000000000000000000000000000000..388401118a9303955a018e1f881688e602742edd
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Sort.h
@@ -0,0 +1,17 @@
+#pragma once
+#include 
+#include 
+#include 
+
+namespace at {
+namespace native {
+
+inline bool should_use_small_sort(const TensorBase &self, int64_t dim) {
+  return self.size(dim) <= 4096;
+}
+
+void sortKeyValueInplace(
+    const TensorBase &key, const TensorBase &value, int dim,
+    bool descending, bool stable=false);
+
+}}  // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/SortStable.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/SortStable.h
new file mode 100644
index 0000000000000000000000000000000000000000..e511e4422163da3ab41e9af206ca33e550d076cd
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/SortStable.h
@@ -0,0 +1,19 @@
+#pragma once
+#include 
+#include 
+
+namespace at {
+namespace native {
+
+// Stable-sort self into values, and set indices to the
+// inverse-permutation from values back to self.
+// Output tensors must be pre-allocated and contiguous.
+void launch_stable_sort_kernel(
+    const TensorBase& self,
+    int64_t dim,
+    bool descending,
+    const TensorBase& values,
+    const TensorBase& indices);
+
+} // namespace native
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/SortUtils.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/SortUtils.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..6848fa9acd3df7279a4eb938c148621d9b9c1704
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/SortUtils.cuh
@@ -0,0 +1,344 @@
+#pragma once
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define HAS_WARP_MERGE_SORT() (CUDA_VERSION >= 110600)
+
+
+namespace at { namespace native {
+
+template <typename T>
+__device__ inline void swapVars(T& t1, T& t2) {
+  T tmp = t1;
+  t1 = t2;
+  t2 = tmp;
+}
+
+template <typename K, typename V, typename Comparator>
+__device__ inline void bitonicSwap(K& kA, V& vA, bool& validA,
+                                   K& kB, V& vB, bool& validB,
+                                   bool dir,
+                                   const Comparator& comp) {
+  // Invalid entries always sort to the end
+  bool swap = (comp(kA, kB) && validA) || !validB;
+  if (swap == dir) {
+    swapVars(kA, kB);
+    swapVars(vA, vB);
+    swapVars(validA, validB);
+  }
+};
+
+template <typename K, typename V, typename Comparator, int Power2SortSize>
+__device__ inline void bitonicSort(K *keys,
+                                   V *values,
+                                   bool *valid,
+                                   const Comparator& comp) {
+#if !defined(USE_ROCM)
+#pragma unroll
+#endif
+  for (unsigned int size = 2; size < Power2SortSize; size *= 2) {
+    bool flag = ((threadIdx.x & (size / 2)) != 0);
+
+#if !defined(USE_ROCM)
+#pragma unroll
+#endif
+    for (unsigned int stride = size / 2; stride > 0; stride /= 2) {
+
+      __syncthreads();
+
+      unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
+      bitonicSwap(
+        keys[pos], values[pos], valid[pos],
+        keys[pos + stride], values[pos + stride], valid[pos + stride],
+        flag, comp);
+    }
+  }
+
+#if !defined(USE_ROCM)
+#pragma unroll
+#endif
+  for (unsigned int stride = Power2SortSize / 2; stride > 0; stride /= 2) {
+
+    __syncthreads();
+
+    unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
+    bitonicSwap(
+      keys[pos], values[pos], valid[pos],
+      keys[pos + stride], values[pos + stride], valid[pos + stride],
+      false, comp);
+  }
+
+  __syncthreads();
+
+}
+
+// at::cuda::detail::TensorInfo version
+// Sorts (key, value) pairs (in different tensors) in-place; i.e.,
+// modifies the input `keys` and `values`
+template <typename K, typename V,
+          int KeyDims, int ValueDims,
+          typename Comparator, typename IndexType,
+          int block_dim_x, int max_block_dim_y>
+C10_LAUNCH_BOUNDS_1(block_dim_x * max_block_dim_y)
+__global__ void
+bitonicSortKVInPlace(at::cuda::detail::TensorInfo<K, IndexType> keys,
+                     IndexType keySlices,
+                     IndexType keySliceSize,
+                     IndexType keySliceStride,
+                     at::cuda::detail::TensorInfo<V, IndexType> values,
+                     IndexType valueSliceStride,
+                     Comparator comp) {
+  // Find the slice of the tensor that we are sorting
+  // NOTE: blockDim.y may be less than max_block_dim_y
+  const IndexType blockIndex = getLinearBlockId<IndexType>();
+  const IndexType linearIndex = blockIndex * blockDim.y + threadIdx.y;
+
+  // If the entire block is out of bounds exit early
+  if (blockIndex * blockDim.y >= keySlices) {
+    return;
+  }
+  // It's also possible for some rows of a block to be out of bounds
+  // but all threads need to run for __syncthreads to work.
+  const bool row_valid = linearIndex < keySlices;
+
+  constexpr int items_per_thread = 2;
+  constexpr int Power2SortSize = block_dim_x * items_per_thread;
+
+  // Storage for max_block_dim_y sorts performed in parallel
+  __shared__ K blockSharedKeys[max_block_dim_y][Power2SortSize];
+  __shared__ V blockSharedValues[max_block_dim_y][Power2SortSize];
+  __shared__ bool blockSharedValid[max_block_dim_y][Power2SortSize];
+
+  auto sharedKeys = blockSharedKeys[threadIdx.y];
+  auto sharedValues = blockSharedValues[threadIdx.y];
+  auto sharedValid = blockSharedValid[threadIdx.y];
+
+  const IndexType keyStartOffset =
+    at::cuda::detail::IndexToOffset<K, IndexType, KeyDims>::get(linearIndex, keys);
+  const IndexType valueStartOffset =
+    at::cuda::detail::IndexToOffset<V, IndexType, ValueDims>::get(linearIndex, values);
+
+  // Load 2 values per thread into the shared workspace
+  #pragma unroll
+  for (int k = 0; k < items_per_thread; ++k) {
+    auto idx = threadIdx.x + k * blockDim.x;
+    bool valid = row_valid && idx < keySliceSize;
+
+    sharedKeys[idx] = valid ?
+        keys.data[idx * keySliceStride + keyStartOffset] : K{};
+    sharedValues[idx] = valid ?
+        values.data[idx * valueSliceStride + valueStartOffset] : V{};
+    sharedValid[idx] = valid;
+  }
+
+  // Sort!
+  bitonicSort<K, V, Comparator, Power2SortSize>(
+      sharedKeys, sharedValues, sharedValid, comp);
+
+  if (!row_valid) {
+    return;
+  }
+
+  // Store outputs
+  #pragma unroll
+  for (int k = 0; k < items_per_thread; ++k) {
+    auto idx = threadIdx.x + k * blockDim.x;
+    if (idx < keySliceSize) {
+      keys.data[idx * keySliceStride + keyStartOffset] = sharedKeys[idx];
+      values.data[idx * valueSliceStride + valueStartOffset] = sharedValues[idx];
+    }
+  }
+}
+
+#if HAS_WARP_MERGE_SORT()
+
+template <typename K, typename V,
+          int KeyDims, int ValueDims,
+          typename Comparator, typename IndexType,
+          int sort_size, int max_block_dim_y>
+C10_LAUNCH_BOUNDS_1(C10_WARP_SIZE * max_block_dim_y)
+__global__ void
+warpMergeSortKVInPlace(
+    at::cuda::detail::TensorInfo<K, IndexType> keys,
+    IndexType keySlices,
+    IndexType keySliceSize,
+    IndexType keySliceStride,
+    at::cuda::detail::TensorInfo<V, IndexType> values,
+    IndexType valueSliceStride,
+    Comparator comp,
+    K invalid_key) {
+  // Find the slice of the tensor that we are sorting
+  // NOTE: blockDim.y may be less than max_block_dim_y
+  const IndexType blockIndex = getLinearBlockId<IndexType>();
+  const IndexType linearIndex = blockIndex * blockDim.y + threadIdx.y;
+
+  // If this row is out of bounds exit early
+  if (linearIndex >= keySlices) {
+    return;
+  }
+
+  const IndexType keyStartOffset =
+    at::cuda::detail::IndexToOffset<K, IndexType, KeyDims>::get(linearIndex, keys);
+  const IndexType valueStartOffset =
+    at::cuda::detail::IndexToOffset<V, IndexType, ValueDims>::get(linearIndex, values);
+
+  K *keys_slice = &keys.data[keyStartOffset];
+  V *values_slice = &values.data[valueStartOffset];
+
+  StridedRandomAccessor<K, IndexType> keys_iter(keys_slice, keySliceStride);
+  StridedRandomAccessor<V, IndexType> values_iter(values_slice, valueSliceStride);
+
+  namespace cub = ROCM_HIPCUB(at_cuda_detail::cub);
+
+  CUDA_KERNEL_ASSERT(blockDim.x == C10_WARP_SIZE);
+  CUDA_KERNEL_ASSERT(blockDim.y <= max_block_dim_y);
+  constexpr int items_per_thread = sort_size / C10_WARP_SIZE;
+  static_assert(
+      items_per_thread * C10_WARP_SIZE == sort_size,
+      "sort_size must be a multiple of C10_WARP_SIZE");
+
+
+  using LoadKeys = cub::WarpLoad<K, items_per_thread, cub::WARP_LOAD_TRANSPOSE>;
+  using LoadValues = cub::WarpLoad<V, items_per_thread, cub::WARP_LOAD_TRANSPOSE>;
+  using Sort = cub::WarpMergeSort<K, items_per_thread, C10_WARP_SIZE, V>;
+  using StoreKeys = cub::WarpStore<K, items_per_thread, cub::WARP_STORE_TRANSPOSE>;
+  using StoreValues = cub::WarpStore<V, items_per_thread, cub::WARP_STORE_TRANSPOSE>;
+
+  __shared__ union {
+    typename LoadKeys::TempStorage load_keys;
+    typename LoadValues::TempStorage load_values;
+    typename Sort::TempStorage sort;
+    typename StoreKeys::TempStorage store_keys;
+    typename StoreValues::TempStorage store_values;
+  } tmp_storage[max_block_dim_y];
+
+  auto& warp_storage = tmp_storage[threadIdx.y];
+
+  // Load inputs
+  K local_keys[items_per_thread];
+  V local_values[items_per_thread];
+
+  const auto invalid_value = V{};
+  LoadKeys(warp_storage.load_keys).Load(keys_iter, local_keys, keySliceSize, invalid_key);
+  WARP_SYNC();
+  LoadValues(warp_storage.load_values).Load(values_iter, local_values, keySliceSize, invalid_value);
+  WARP_SYNC();
+
+  // Sort! We use stable sort to ensure that invalid values are never
+  // sorted before valid values. In testing it performed the same as
+  // .Sort, so there is no down-side.
+  Sort(warp_storage.sort).StableSort(
+      local_keys, local_values, comp, keySliceSize, invalid_key);
+  WARP_SYNC();
+
+  // Store outputs
+  StoreKeys(warp_storage.store_keys).Store(keys_iter, local_keys, keySliceSize);
+  WARP_SYNC();
+  StoreValues(warp_storage.store_values).Store(values_iter, local_values, keySliceSize);
+}
+
+#endif // HAS_WARP_MERGE_SORT()
+
+template <typename K, typename V,
+          int KeyDims, int ValueDims,
+          int block_size, int items_per_thread,
+          typename IndexType>
+C10_LAUNCH_BOUNDS_1(block_size)
+__global__ void
+radixSortKVInPlace(at::cuda::detail::TensorInfo<K, IndexType> keys,
+                   IndexType keySlices,
+                   IndexType keySliceSize,
+                   IndexType keySliceStride,
+                   at::cuda::detail::TensorInfo<V, IndexType> values,
+                   IndexType valueSliceStride,
+                   bool descending) {
+  static_assert(block_size > 0, "");
+
+  // Find the slice of the tensor that we are sorting
+  const IndexType linearIndex = getLinearBlockId<IndexType>();
+  // Tiling the slices could have us be out of bounds, if there are a
+  // lot of slices to sort
+  if (linearIndex >= keySlices) {
+    return;
+  }
+
+  const IndexType keyStartOffset =
+    at::cuda::detail::IndexToOffset<K, IndexType, KeyDims>::get(linearIndex, keys);
+  const IndexType valueStartOffset =
+    at::cuda::detail::IndexToOffset<V, IndexType, ValueDims>::get(linearIndex, values);
+
+  K *keys_slice = &keys.data[keyStartOffset];
+  V *values_slice = &values.data[valueStartOffset];
+
+  StridedRandomAccessor<K, IndexType> keys_iter(keys_slice, keySliceStride);
+  StridedRandomAccessor<V, IndexType> values_iter(values_slice, valueSliceStride);
+
+  namespace cub = ROCM_HIPCUB(at_cuda_detail::cub);
+
+  using key_t = typename at::cuda::cub::detail::cuda_type<K>::type;
+  using LoadKeys = cub::BlockLoad<K, block_size, items_per_thread,
+                                  cub::BLOCK_LOAD_TRANSPOSE>;
+  using LoadValues = cub::BlockLoad<V, block_size, items_per_thread,
+                                    cub::BLOCK_LOAD_TRANSPOSE>;
+  using Sort = cub::BlockRadixSort<key_t, block_size, items_per_thread, V>;
+  using StoreKeys = cub::BlockStore<K, block_size, items_per_thread,
+                                    cub::BLOCK_STORE_TRANSPOSE>;
+  using StoreValues = cub::BlockStore<V, block_size, items_per_thread,
+                                      cub::BLOCK_STORE_TRANSPOSE>;
+
+  __shared__ union {
+    typename LoadKeys::TempStorage load_keys;
+    typename LoadValues::TempStorage load_values;
+    typename Sort::TempStorage sort;
+    typename StoreKeys::TempStorage store_keys;
+    typename StoreValues::TempStorage store_values;
+  } tmp_storage;
+
+  // cub's Block operations operate on a fixed number of items, but the
+  // actual slice we are sorting might be smaller. So, we need to make
+  // up the difference with keys that will always sort higher.
+  const K invalid_key = [descending] {
+    using radix_t = typename cub::Traits<key_t>::UnsignedBits;
+    union {
+      K key;
+      radix_t radix;
+    } tmp;
+    tmp.radix = descending ?
+        cub::Traits<key_t>::LOWEST_KEY :
+        cub::Traits<key_t>::MAX_KEY;
+    return tmp.key;
+  }();
+  const V invalid_value = static_cast<V>(0);
+
+  // Load inputs
+  K local_keys[items_per_thread];
+  V local_values[items_per_thread];
+
+  LoadKeys(tmp_storage.load_keys).Load(keys_iter, local_keys, keySliceSize, invalid_key);
+  __syncthreads();
+  LoadValues(tmp_storage.load_values).Load(values_iter, local_values, keySliceSize, invalid_value);
+  __syncthreads();
+
+  // Sort!
+  if (descending) {
+    Sort(tmp_storage.sort).SortDescending(
+        reinterpret_cast<key_t(&)[items_per_thread]>(local_keys),
+        local_values);
+  } else {
+    Sort(tmp_storage.sort).Sort(
+        reinterpret_cast<key_t(&)[items_per_thread]>(local_keys),
+        local_values);
+  }
+  __syncthreads();
+
+  // Store outputs
+  StoreKeys(tmp_storage.store_keys).Store(keys_iter, local_keys, keySliceSize);
+  __syncthreads();
+  StoreValues(tmp_storage.store_values).Store(values_iter, local_values, keySliceSize);
+}
+
+}} // at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Sorting.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Sorting.h
new file mode 100644
index 0000000000000000000000000000000000000000..8eddefcf1be3316cfead0630dd59a02219c9de94
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/Sorting.h
@@ -0,0 +1,18 @@
+#pragma once
+#include 
+
+namespace at {
+class TensorBase;
+}
+
+namespace at {
+namespace native {
+
+void launch_kthvalue_kernel(
+    const TensorBase &values, const TensorBase &indices,
+    const TensorBase &self, int64_t dim, int64_t k);
+void launch_median_kernel(
+    const TensorBase &vals, const TensorBase &inds,
+    const TensorBase &in, int64_t dim, bool ignore_nan);
+
+}}  // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/SortingCommon.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/SortingCommon.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..58636e5e1d0b059d1379d8f8fed293db95c6445c
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/SortingCommon.cuh
@@ -0,0 +1,193 @@
+#pragma once
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+namespace native {
+
+// Is this questionable namespace pollution?
+#if defined(USE_ROCM)
+constexpr int MAX_BLOCK_SIZE = 256;
+
+#else
+constexpr int MAX_BLOCK_SIZE = 1024;
+#endif
+
+// Maximum size per grid dimension that we assume (compute capability >= 2.0)
+constexpr int64_t MAX_GRID_SIZE = 65535LL;
+
+static bool getGridFromTiles(int64_t gridTiles, dim3& grid) {
+  if (gridTiles > MAX_GRID_SIZE * MAX_GRID_SIZE * MAX_GRID_SIZE) {
+    return false;
+  }
+
+  int64_t gridX = gridTiles > MAX_GRID_SIZE ? MAX_GRID_SIZE : gridTiles;
+  int64_t gridY = 1;
+  int64_t gridZ = 1;
+
+  if (gridTiles > MAX_GRID_SIZE) {
+    gridTiles = ceil_div(gridTiles, MAX_GRID_SIZE);
+    gridY = gridTiles > MAX_GRID_SIZE ? MAX_GRID_SIZE : gridTiles;
+
+    if (gridTiles > MAX_GRID_SIZE) {
+      gridTiles = ceil_div(gridTiles, MAX_GRID_SIZE);
+      gridZ = gridTiles > MAX_GRID_SIZE ? MAX_GRID_SIZE : gridTiles;
+    }
+  }
+
+  grid = dim3(gridX, gridY, gridZ);
+  return true;
+}
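+// Worked example (illustrative): with MAX_GRID_SIZE == 65535, a request for
+// gridTiles == 100000 yields grid = dim3(65535, 2, 1): the x dimension is
+// clamped to 65535 and the remaining ceil_div(100000, 65535) == 2 tiles spill
+// into the y dimension; z stays at 1.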
+
+template <typename scalar_t, bool handleNaN = false>
+struct GTOp {
+  __device__ bool operator()(const scalar_t& lhs, const scalar_t& rhs) const {
+    return (handleNaN && at::_isnan(lhs) && !at::_isnan(rhs)) || (lhs > rhs);
+  }
+};
+
+template <typename scalar_t, bool handleNaN = false>
+struct LTOp {
+  __device__ bool operator()(const scalar_t& lhs, const scalar_t& rhs) const {
+    return (handleNaN && at::_isnan(rhs) && !at::_isnan(lhs)) || (lhs < rhs);
+  }
+};
+
+template <typename index_t>
+__device__ __forceinline__ index_t getLinearBlockId() {
+  return blockIdx.z * gridDim.y * gridDim.x + blockIdx.y * gridDim.x +
+      blockIdx.x;
+}
+
+// For slice sorting in Thrust; extracts a slice index from a linear
+// index and uses that for comparison
+struct SliceComp {
+  SliceComp(int64_t size) : sliceSize(size) {}
+
+  __device__ bool operator()(const int64_t& a, const int64_t& b) const {
+    // Since the slices are guaranteed to be innermost,
+    // the segment is just via int64_t division
+    int64_t segA = a / sliceSize;
+    int64_t segB = b / sliceSize;
+    return segA < segB;
+  }
+
+  const int64_t sliceSize;
+};
+
+// For sorting in Thrust; extracts a within-slice index from a linear index
+struct GlobalIndexToPerSliceIndex {
+  GlobalIndexToPerSliceIndex(int64_t size) : sliceSize(size) {}
+
+  __device__ inline void operator()(int64_t& v) const {
+    v = v % sliceSize;
+  }
+
+  const int64_t sliceSize;
+};
+
+// Returns 2^(ceil(lg(n))) from Stanford bit twiddling hacks
+static uint64_t nextHighestPowerOf2(uint64_t n) {
+  n--;
+  n |= n >> 1;
+  n |= n >> 2;
+  n |= n >> 4;
+  n |= n >> 8;
+  n |= n >> 16;
+#ifndef _MSC_VER
+  n |= n >> 32;
+#endif
+  n++;
+
+  return n;
+}
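+// Example (illustrative): nextHighestPowerOf2(5) == 8, nextHighestPowerOf2(64) == 64,
+// and nextHighestPowerOf2(65) == 128; the bit-smearing turns n - 1 into a mask of
+// all ones below its highest set bit, and the final increment carries into the
+// next power of two.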
+
+
+// WARNING: This function assumes input tensors are contiguous
+template <typename scalar_t, typename index_t, typename Launcher>
+void run_launcher(
+    const TensorBase &values,
+    const TensorBase &indices,
+    const TensorBase &self,
+    int64_t dim,
+    Launcher l) {
+  auto self_info = cuda::detail::getTensorInfo<const scalar_t, index_t>(self);
+  auto values_info = cuda::detail::getTensorInfo<scalar_t, index_t>(values);
+  auto indices_info = cuda::detail::getTensorInfo<int64_t, index_t>(indices);
+
+  int64_t slice_size = self.size(dim);
+  /* We use these structures solely to find the offset to */
+  /* each slice we are operating on */
+  self_info.reduceDim(dim);
+  values_info.reduceDim(dim);
+  indices_info.reduceDim(dim);
+
+  /* Collapse all other dims */
+  int collapse_self_dim = self_info.collapseDims(dim);
+  int collapse_values_dim = values_info.collapseDims(dim);
+  int collapse_indices_dim = indices_info.collapseDims(dim);
+
+  int64_t num_slices = 1;
+  for (int i = 0; i < self_info.dims; ++i) {
+    num_slices *= self_info.sizes[i];
+  }
+
+  /* This is used as a template parameter to calculate indices. */
+  /* We only specialize it if all collapsed dim sizes are the */
+  /* same; otherwise, we use -1 which is the specialization */
+  /* parameter for arbitrary dimensions */
+  int all_dims = self_info.dims;
+  if (values_info.dims != all_dims || indices_info.dims != all_dims) {
+    all_dims = -1;
+  }
+
+  if (all_dims == 1) {
+    l.template launch<scalar_t, index_t, 1>(
+        values_info,
+        collapse_values_dim,
+        indices_info,
+        collapse_indices_dim,
+        self_info,
+        collapse_self_dim,
+        num_slices,
+        slice_size);
+  } else if (all_dims == 2) {
+    l.template launch<scalar_t, index_t, 2>(
+        values_info,
+        collapse_values_dim,
+        indices_info,
+        collapse_indices_dim,
+        self_info,
+        collapse_self_dim,
+        num_slices,
+        slice_size);
+  } else if (all_dims == 3) {
+    l.template launch<scalar_t, index_t, 3>(
+        values_info,
+        collapse_values_dim,
+        indices_info,
+        collapse_indices_dim,
+        self_info,
+        collapse_self_dim,
+        num_slices,
+        slice_size);
+  } else {
+    l.template launch<scalar_t, index_t, -1>(
+        values_info,
+        collapse_values_dim,
+        indices_info,
+        collapse_indices_dim,
+        self_info,
+        collapse_self_dim,
+        num_slices,
+        slice_size);
+  }
+}
+
+} // namespace native
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/SortingRadixSelect.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/SortingRadixSelect.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..446ca5d796a903155bb7987d4b2348750b685465
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/SortingRadixSelect.cuh
@@ -0,0 +1,429 @@
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+namespace native {
+
+template <typename scalar_t>
+struct TopKTypeConfig {};
+
+template <>
+struct TopKTypeConfig<float> {
+  typedef uint32_t RadixType;
+
+  // Converts a float to an integer representation with the same
+  // sorting; i.e., for floats f1, f2:
+  // if f1 < f2 then convert(f1) < convert(f2)
+  // We use this to enable radix selection of floating-point values.
+  // This also gives a relative order for NaNs, but that's ok, as they
+  // will all be adjacent
+  // neg inf: signbit=1 exp=ff fraction=0 --> radix = 0 00 ff..
+  // pos inf: signbit=0 exp=ff fraction=0 --> radix = 1 ff 00..
+  // pos nan: signbit=0 exp=ff fraction>0 --> radix = 1 ff x>0
+  // neg nan: signbit=1 exp=ff fraction>0 --> radix = 0 00 x<ff..
+
+  static inline __device__ RadixType convert(float v) {
+    RadixType x = __float_as_int(v);
+    RadixType mask = (x & 0x80000000) ? 0xffffffff : 0x80000000;
+    return (v == v) ? (x ^ mask) : 0xffffffff;
+  }
+
+  static inline __device__ float deconvert(RadixType v) {
+    RadixType mask = (v & 0x80000000) ? 0x80000000 : 0xffffffff;
+    return __int_as_float(v ^ mask);
+  }
+};
+
+template <>
+struct TopKTypeConfig<uint8_t> {
+  typedef uint32_t RadixType;
+
+  static inline __device__ RadixType convert(uint8_t v) {
+    return v;
+  }
+
+  static inline __device__ uint8_t deconvert(RadixType v) {
+    return v;
+  }
+};
+
+template <>
+struct TopKTypeConfig<int8_t> {
+  typedef uint32_t RadixType;
+
+  static inline __device__ RadixType convert(int8_t v) {
+    return 128u + v;
+  }
+
+  static inline __device__ int8_t deconvert(RadixType v) {
+    return v - 128;
+  }
+};
+
+template <>
+struct TopKTypeConfig<int16_t> {
+  typedef uint32_t RadixType;
+
+  static inline __device__ RadixType convert(int16_t v) {
+    static_assert(sizeof(short) == 2, "");
+    return 32768u + v;
+  }
+
+  static inline __device__ int16_t deconvert(RadixType v) {
+    return v - 32768;
+  }
+};
+
+template <>
+struct TopKTypeConfig<int32_t> {
+  typedef uint32_t RadixType;
+
+  static inline __device__ RadixType convert(int32_t v) {
+    static_assert(sizeof(int) == 4, "");
+    return 2147483648u + v;
+  }
+
+  static inline __device__ int32_t deconvert(RadixType v) {
+    return v - 2147483648u;
+  }
+};
+
+template <>
+struct TopKTypeConfig<int64_t> {
+  typedef uint64_t RadixType;
+
+  static inline __device__ RadixType convert(int64_t v) {
+    static_assert(sizeof(int64_t) == 8, "");
+    return 9223372036854775808ull + v;
+  }
+
+  static inline __device__ int64_t deconvert(RadixType v) {
+    return v - 9223372036854775808ull;
+  }
+};
+
+template <>
+struct TopKTypeConfig<double> {
+  typedef uint64_t RadixType;
+
+  static inline __device__ RadixType convert(double v) {
+    RadixType x = __double_as_longlong(v);
+    RadixType mask = -((x >> 63)) | 0x8000000000000000;
+    return (v == v) ? (x ^ mask) : 0xffffffffffffffff;
+  }
+
+  static inline __device__ double deconvert(RadixType v) {
+    RadixType mask = ((v >> 63) - 1) | 0x8000000000000000;
+    return __longlong_as_double(v ^ mask);
+  }
+};
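+
+// Illustrative property (a sketch of what convert() guarantees, not part of the
+// selection machinery): for any two finite values a < b, the unsigned radix
+// images compare the same way, so digit-by-digit selection over converted keys
+// picks the same element a comparison would. A hypothetical host-side analogue
+// of the double flip trick, for intuition only:
+//
+//   static inline uint64_t to_radix(double v) {
+//     uint64_t x;
+//     std::memcpy(&x, &v, sizeof(x));            // like __double_as_longlong
+//     uint64_t mask = -(x >> 63) | 0x8000000000000000ull;
+//     return x ^ mask;                           // unsigned order == double order
+//   }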
+
+template <>
+struct TopKTypeConfig<at::Half> {
+  typedef uint32_t RadixType;
+
+  static inline __device__ RadixType convert(at::Half v) {
+#if defined(__CUDA_ARCH__) || defined(USE_ROCM)
+    RadixType x = __half_as_ushort(v);
+    RadixType mask = (x & 0x00008000) ? 0x0000ffff : 0x00008000;
+    return (v == v) ? (x ^ mask) : 0xffff;
+#else
+    CUDA_KERNEL_ASSERT(false);
+    return 0u;
+#endif
+  }
+
+  static inline __device__ at::Half deconvert(RadixType v) {
+#if defined(__CUDA_ARCH__) || defined(USE_ROCM)
+    RadixType mask = (v & 0x00008000) ? 0x00008000 : 0x0000ffff;
+    return __ushort_as_half(v ^ mask);
+#else
+    CUDA_KERNEL_ASSERT(false);
+    return static_cast<at::Half>(0);
+#endif
+  }
+};
+
+template <>
+struct TopKTypeConfig<at::BFloat16> {
+  typedef uint32_t RadixType;
+
+  static inline __device__ RadixType convert(at::BFloat16 v) {
+    RadixType x = v.x;
+    RadixType mask = (x & 0x00008000) ? 0x0000ffff : 0x00008000;
+    return (v == v) ? (x ^ mask) : 0xffff;
+  }
+
+  static inline __device__ at::BFloat16 deconvert(RadixType v) {
+    RadixType mask = (v & 0x00008000) ? 0x00008000 : 0x0000ffff;
+    at::BFloat16 r;
+    r.x = (v ^ mask);
+    return r;
+  }
+};
+
+// This function counts the distribution of all input values in a
+// slice we are selecting by radix digit at `radixDigitPos`, but only
+// those that pass the filter `((v & desiredMask) == desired)`.
+// This produces and broadcasts the seen counts for a single block only.
+// `smem` must have at least `RadixSize` elements.
+template <
+    typename scalar_t,
+    typename bitwise_t,
+    typename index_t,
+    typename CountType,
+    int RadixSize,
+    int RadixBits>
+__device__ void countRadixUsingMask(
+    CountType counts[RadixSize],
+    CountType* smem,
+    bitwise_t desired,
+    bitwise_t desiredMask,
+    int radixDigitPos,
+    index_t sliceSize,
+    index_t withinSliceStride,
+    const scalar_t* data) {
+  // Clear out per-thread counts from a previous round
+#pragma unroll
+  for (int i = 0; i < RadixSize; ++i) {
+    counts[i] = 0;
+  }
+
+  if (threadIdx.x < RadixSize) {
+    smem[threadIdx.x] = 0;
+  }
+  __syncthreads();
+
+  // Scan over all the data. Upon a read, the warp will accumulate
+  // counts per each digit in the radix using warp voting.
+#if !defined(USE_ROCM)
+  // Must be called outside of loop to ensure all threads participate
+  unsigned mask = WARP_BALLOT(threadIdx.x < sliceSize);
+#endif
+  for (index_t i = threadIdx.x; i < sliceSize;) {
+    bitwise_t val =
+        TopKTypeConfig<scalar_t>::convert(doLdg(&data[i * withinSliceStride]));
+
+    bool hasVal = ((val & desiredMask) == desired);
+    bitwise_t digitInRadix = at::cuda::Bitfield<bitwise_t>::getBitfield(
+        val, radixDigitPos, RadixBits);
+
+#pragma unroll
+    for (uint32_t j = 0; j < RadixSize; ++j) {
+      bool vote = hasVal && (digitInRadix == j);
+#if defined(USE_ROCM)
+      counts[j] += __popcll(WARP_BALLOT(vote));
+#else
+      counts[j] += __popc(WARP_BALLOT(vote, mask));
+#endif
+    }
+    i += blockDim.x;
+#if !defined(USE_ROCM)
+    mask = WARP_BALLOT(i < sliceSize, mask);
+#endif
+  }
+
+  // Now, for each warp, sum values
+  if (at::cuda::getLaneId() == 0) {
+#pragma unroll
+    for (uint32_t i = 0; i < RadixSize; ++i) {
+      gpuAtomicAddNoReturn(&smem[i], counts[i]);
+    }
+  }
+
+  __syncthreads();
+
+  // For each thread, read in the total counts
+#pragma unroll
+  for (uint32_t i = 0; i < RadixSize; ++i) {
+    counts[i] = smem[i];
+  }
+
+  __syncthreads();
+}
+
+// Over what radix we are selecting values
+constexpr int RADIX_BITS = 2; // digits are base-(2 ^ RADIX_BITS)
+constexpr int RADIX_SIZE = 4; // 2 ^ RADIX_BITS
+constexpr int RADIX_MASK = (RADIX_SIZE - 1);
+
+// This finds the unique value `v` that matches the pattern
+// ((v & desired) == desiredMask) in our sorted int format
+template <typename scalar_t, typename bitwise_t, typename index_t>
+__device__ scalar_t findPattern(
+    scalar_t* smem,
+    const scalar_t* data,
+    index_t sliceSize,
+    index_t withinSliceStride,
+    bitwise_t desired,
+    bitwise_t desiredMask) {
+  if (threadIdx.x < 2) {
+    smem[threadIdx.x] = static_cast<scalar_t>(0);
+  }
+  __syncthreads();
+
+  // All threads participate in the loop, in order to sync on the flag
+  index_t numIterations =
+      round_up(sliceSize, static_cast<index_t>(blockDim.x));
+  for (index_t i = threadIdx.x; i < numIterations; i += blockDim.x) {
+    bool inRange = (i < sliceSize);
+    scalar_t v = inRange ? doLdg(&data[i * withinSliceStride])
+                         : static_cast<scalar_t>(0);
+
+    if (inRange &&
+        ((TopKTypeConfig<scalar_t>::convert(v) & desiredMask) == desired)) {
+      // There should not be conflicts if we are using findPattern,
+      // since the result is unique
+      smem[0] = static_cast<scalar_t>(1);
+      smem[1] = v; // can't use val as the flag, since it could be 0
+    }
+
+    __syncthreads();
+
+    scalar_t found = smem[0];
+    scalar_t val = smem[1];
+
+    __syncthreads();
+
+    // Check to see if a thread found the value
+    if (found != static_cast<scalar_t>(0)) {
+      // all threads return this value
+      return val;
+    }
+  }
+
+  // should not get here
+  CUDA_KERNEL_ASSERT(false);
+  return static_cast<scalar_t>(0);
+}
+
+// Returns the top-Kth element found in the data using radix selection
+template 
+__device__ void radixSelect(
+    const scalar_t* data,
+    index_t k,
+    bool largest,
+    index_t sliceSize,
+    index_t withinSliceStride,
+    int* smem,
+    scalar_t* topK) {
+  // Per-thread buckets into which we accumulate digit counts in our
+  // radix
+  int counts[RADIX_SIZE];
+
+  // We only consider elements x such that (x & desiredMask) == desired
+  // Initially, we consider all elements of the array, so the above
+  // statement is true regardless of input.
+  bitwise_t desired = 0;
+  bitwise_t desiredMask = 0;
+
+  // We are looking for the top kToFind-th element when iterating over
+  // digits; this count gets reduced by elimination when counting
+  // successive digits
+  int kToFind = k;
+
+  // We start at the most significant digit in our radix, scanning
+  // through to the least significant digit
+  for (int digitPos = sizeof(scalar_t) * 8 - RADIX_BITS; digitPos >= 0;
+       digitPos -= RADIX_BITS) {
+    // Count radix distribution for the current position and reduce
+    // across all threads
+    countRadixUsingMask<
+        scalar_t,
+        bitwise_t,
+        index_t,
+        int,
+        RADIX_SIZE,
+        RADIX_BITS>(
+        counts,
+        smem,
+        desired,
+        desiredMask,
+        digitPos,
+        sliceSize,
+        withinSliceStride,
+        data);
+
+    auto found_unique = [&](int i, int count) -> bool {
+      /* All threads have the same value in counts here, so all */
+      /* threads will return from the function. */
+      if (count == 1 && kToFind == 1) {
+        /* There is a unique answer. */
+        desired = at::cuda::Bitfield<bitwise_t>::setBitfield(
+            desired, i, digitPos, RADIX_BITS);
+        desiredMask = at::cuda::Bitfield<bitwise_t>::setBitfield(
+            desiredMask, RADIX_MASK, digitPos, RADIX_BITS);
+
+        /* The answer is now the unique element v such that: */
+        /* (v & desiredMask) == desired */
+        /* However, we do not yet know what the actual element is. We */
+        /* need to perform a search through the data to find the */
+        /* element that matches this pattern. */
+        *topK = findPattern<scalar_t, bitwise_t, index_t>(
+            (scalar_t*)smem,
+            data,
+            sliceSize,
+            withinSliceStride,
+            desired,
+            desiredMask);
+        return true;
+      }
+      return false;
+    };
+    auto found_non_unique = [&](int i, int count) -> bool {
+      if (count >= kToFind) {
+        desired =
+            at::cuda::Bitfield<bitwise_t>::setBitfield(
+                desired, i, digitPos, RADIX_BITS);
+        desiredMask = at::cuda::Bitfield<bitwise_t>::setBitfield(
+            desiredMask, RADIX_MASK, digitPos, RADIX_BITS);
+
+        /* The top-Kth element v must now be one such that: */
+        /* (v & desiredMask == desired) */
+        /* but we haven't narrowed it down; we must check the next */
+        /* least-significant digit */
+        return true;
+      }
+      kToFind -= count;
+      return false; // continue the loop
+    };
+
+    // All threads participate in the comparisons below to know the
+    // final result
+    if (largest) {
+      // Process in descending order
+#pragma unroll
+      for (int i = RADIX_SIZE - 1; i >= 0; --i) {
+        int count = counts[i];
+        if (found_unique(i, count)) {
+          return;
+        }
+        if (found_non_unique(i, count)) {
+          break;
+        }
+      }
+    } else {
+      // Process in ascending order
+#pragma unroll
+      for (int i = 0; i < RADIX_SIZE; ++i) {
+        int count = counts[i];
+        if (found_unique(i, count)) {
+          return;
+        }
+        if (found_non_unique(i, count)) {
+          break;
+        }
+      }
+    }
+  } // end digitPos for
+
+  // There is no unique result, but there is a non-unique result
+  // matching `desired` exactly
+  *topK = TopKTypeConfig<scalar_t>::deconvert(desired);
+}
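+// Semantics sketch (illustrative): ignoring ties and NaN handling, radixSelect
+// with largest == true returns the k-th largest element of the slice, and with
+// largest == false the k-th smallest, i.e. roughly the host-side equivalent of
+//
+//   std::vector<scalar_t> v(data, data + sliceSize);       // stride-1 slice
+//   auto comp = [largest](scalar_t a, scalar_t b) {
+//     return largest ? (a > b) : (a < b);
+//   };
+//   std::nth_element(v.begin(), v.begin() + (k - 1), v.end(), comp);
+//   scalar_t kth = v[k - 1];                                // k is 1-based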
+} // namespace native
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/TensorModeKernel.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/TensorModeKernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..aadc34b67d7dc065aac2454ec0387c7f4e4a78b3
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/TensorModeKernel.cuh
@@ -0,0 +1,435 @@
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+
+namespace at {
+namespace native {
+
+// Used for a segmented reduction
+struct ModeUnsignedBoolPair {
+  unsigned int val;
+  bool flag;
+};
+
+// In the kernel below, we have a common pattern of reducing (unsigned int,
+// unsigned int) pairs of data
+struct ModeUnsignedPair {
+  unsigned int val;
+  unsigned int index;
+};
+
+// Inclusive Scan via an upsweep/downsweep mechanism. Assumes:
+//
+// 1. Power2ScanSize is a power of 2. This code still works for collections that
+// do not exactly contain a power of 2 number of elements, simply round up to
+// the nearest power of 2 and then call.
+//
+// 2. That there are two-elements per thread, i.e. the size of the smem storage
+// is 2 * blockDim.x * sizeof(T).
+//
+// Consider a (+)-Scan on the following elements:
+//
+// Upsweep:
+//
+//    0  1  2  3  4  5  6  7
+//       1     5     9    13
+//             6          22
+//                        28
+//
+// Downsweep:
+//                  15
+//         3     10    21
+template <int Power2ScanSize, typename T, class BinaryOp>
+__device__ void inclusivePrefixScan(T* smem, BinaryOp binop) {
+  // Reduce step ("upsweep")
+#pragma unroll
+  for (int stride = 1; stride < Power2ScanSize; stride <<= 1) {
+    int index = (threadIdx.x + 1) * stride * 2 - 1;
+    if (index < Power2ScanSize) {
+      smem[index] = binop(smem[index], smem[index - stride]);
+    }
+    __syncthreads();
+  }
+
+  // Post-reduce step ("downsweep")
+#pragma unroll
+  for (int stride = Power2ScanSize / 4; stride > 0; stride >>= 1) {
+    int index = (threadIdx.x + 1) * stride * 2 - 1;
+    if ((index + stride) < Power2ScanSize) {
+      smem[index + stride] = binop(smem[index + stride], smem[index]);
+    }
+    __syncthreads();
+  }
+}
+
+// Block-wide reduction where each thread locally reduces N
+// values before letting a single warp take over - assumes
+// threadVals is in registers, not shared memory
+//
+// If smem is not used again, there is no need to __syncthreads before this
+// call. However, if smem will be used, e.g., this function is called in a loop,
+// then __syncthreads is needed either before or afterwards to prevent non-0
+// threads overriding smem in the next loop before thread 0 reads from it.
+template <int N, typename T, typename ReduceOp>
+__device__ T reduceBlockWithNThreadLocalReductions(
+    T* smem,
+    T threadVals[N],
+    const unsigned int numVals,
+    ReduceOp reduceOp,
+    T init) {
+  int offset = threadIdx.x * N;
+  T local = offset < numVals ? threadVals[0] : init;
+
+#pragma unroll
+  for (int i = 1; i < N; ++i) {
+    ++offset;
+    T next = offset < numVals ? threadVals[i] : init;
+    local = reduceOp.combine(local, next);
+  }
+
+  return cuda_utils::BlockReduce(local, reduceOp, init, smem);
+}
+
+template <typename T>
+__device__ inline void swapVars(T& t1, T& t2) {
+  T tmp = t1;
+  t1 = t2;
+  t2 = tmp;
+}
+
+template <typename K, typename V, typename Comparator>
+__device__ inline void bitonicSwap(
+    K& kA,
+    V& vA,
+    bool& validA,
+    K& kB,
+    V& vB,
+    bool& validB,
+    bool dir,
+    const Comparator& comp) {
+  // Invalid entries always sort to the end
+  bool swap = (comp(kA, kB) && validA) || !validB;
+  if (swap == dir) {
+    swapVars(kA, kB);
+    swapVars(vA, vB);
+    swapVars(validA, validB);
+  }
+};
+
+template <typename K, typename Comparator>
+__device__ inline void bitonicSwapKeys(
+    K& kA,
+    bool& validA,
+    K& kB,
+    bool& validB,
+    bool dir,
+    const Comparator& comp) {
+  bool swap = (comp(kA, kB) && validA) || !validB;
+  if (swap == dir) {
+    swapVars(kA, kB);
+    swapVars(validA, validB);
+  }
+}
+
+template <
+    typename K,
+    typename IndexType,
+    int Power2SortSize,
+    typename Comparator>
+__device__ inline void bitonicSortKeys(
+    K keys[Power2SortSize],
+    bool valid[Power2SortSize],
+    const Comparator& comp) {
+#if !defined(USE_ROCM)
+#pragma unroll
+#endif
+  for (unsigned int size = 2; size < Power2SortSize; size *= 2) {
+    bool flag = ((threadIdx.x & (size / 2)) != 0);
+
+#if !defined(USE_ROCM)
+#pragma unroll
+#endif
+    for (unsigned int stride = size / 2; stride > 0; stride /= 2) {
+      __syncthreads();
+
+      unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
+      bitonicSwapKeys(
+          keys[pos],
+          valid[pos],
+          keys[pos + stride],
+          valid[pos + stride],
+          flag,
+          comp);
+    }
+  }
+
+#if !defined(USE_ROCM)
+#pragma unroll
+#endif
+  for (unsigned int stride = Power2SortSize / 2; stride > 0; stride /= 2) {
+    __syncthreads();
+
+    unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
+    bitonicSwapKeys(
+        keys[pos],
+        valid[pos],
+        keys[pos + stride],
+        valid[pos + stride],
+        false,
+        comp);
+  }
+
+  __syncthreads();
+}
+
+// The mode kernel has the following characteristics: It uses internal shared
+// memory buffers of Power2Size, which must be greater than the number of
+// elements. Additionally, there is one block for every slice to calculate the
+// mode for, and in each block there is one thread for every two elements.
+//
+// Both sorted and positions are assumed to be contiguous Tensors with the mode
+// dimension as the innermost dim, such that we can get the particular slice for
+// a Tensor via its linear block dimension * the slice size.
+template <typename T, unsigned int Power2Size>
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 11070
+__launch_bounds__(1024, 1)
+#endif
+__global__ void compute_mode(
+    const T* input,
+    at::cuda::detail::TensorInfo<T, unsigned int> values,
+    at::cuda::detail::TensorInfo<int64_t, unsigned int> indices,
+    int64_t sliceSize,
+    int64_t slices) {
+  int tidx = threadIdx.x;
+  int stidx = blockDim.x + threadIdx.x; // Second index this thread is responsible for
+
+  // First, we need to calculate the offset into the sorted Tensor that
+  // represents the start of the slice for this block to calculate the mode for.
+  // This offset is a combination of the gridIndices, and the number of elements
+  // in the slice.
+  unsigned int blockId = getLinearBlockId<unsigned int>();
+  unsigned int linearOffset = blockId * sliceSize;
+
+  if (blockId >= slices) {
+      return;
+  }
+
+  // shmem is a dynamically sized buffer we will use throughout the kernel to
+  // handle computation efficiently. The size of this shmem must be
+  // sizeof(T) * Power2Size + (2 * sizeof(unsigned int) * Power2Size)
+  //
+  // Initially, the buffer will be organized as follows:
+  //
+  // [smem (slice elements) | bmem (valid indices) | ]
+  extern __shared__ char shmem[];
+
+  // smem represents a proportion of the shared memory buffer that is used to
+  // store the elements from the slice:
+  T* smem = reinterpret_cast<T*>(shmem);
+
+  // Each thread loads up to two elements from the Tensor into shared memory
+  if (tidx < sliceSize) {
+    smem[tidx] = c10::load(&input[linearOffset + tidx]);
+  }
+  if (stidx < sliceSize) {
+    smem[stidx] = c10::load(&input[linearOffset + stidx]);
+  }
+
+  // Next, we initialize a boolean region of the buffer, offset by the loaded
+  // element smem region
+  bool* bmem = reinterpret_cast<bool*>(&smem[Power2Size]);
+
+  // The first use of this region stores bmem[i] = i < sliceSize to mark the
+  // valid components in the smem buffer
+  bmem[tidx] = tidx < sliceSize;
+  bmem[stidx] = stidx < sliceSize;
+  __syncthreads(); // barrier for smem, bmem initialization
+
+  // First, sort the input slice in ascending order. smem contains the input
+  // elements, and bmem marks the valid indices
+  bitonicSortKeys<T, unsigned int, Power2Size>(
+      smem, bmem, [&] GPU_LAMBDA(const auto& a, const auto& b) {
+        return a < b;
+      });
+  __syncthreads(); // make no assumptions that the sort syncs at end
+
+  // The next step of our algorithm is performing a block-wide comparison of
+  // neighboring elements. In particular, given a sorted input slice A, we
+  // produce an output slice B, such that B[i] = 1 if A[i-1] != A[i], otherwise
+  // 0.
+  //
+  // Given the input A = [0, 0, 1, 1, 2, 2, 2, 4, 5, 6, 6, 7, 8]
+  //                 B = [1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1]
+  //
+  // In particular, we can think of B[i] true indicating the start of a sequence
+  // of equal values in the sorted list. Similarly, we will also store the
+  // negation of B, which we'll call C. In particular, we can think of C[i] =
+  // true iff A[i-1] == A[i] in our original sorted slice.
+  //
+  //                 C = [0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0]
+
+  // We overwrite bmem, and treat the rest of shared memory as a buffer of
+  // (index, flag) pairs where the index represents values from C, and the flag
+  // represents values from B.
+  //
+  // [smem (sorted slice) | ubpmem (index, flag pairs)]
+
+  struct ModeUnsignedBoolPair* ubpmem =
+      reinterpret_cast<struct ModeUnsignedBoolPair*>(&smem[Power2Size]);
+
+  if (tidx == 0) {
+    ubpmem[0].flag = true;
+    ubpmem[0].val = 0;
+  }
+
+  // Compares elements (0, 1), (2, 3), ... and sets 1, 3, ...
+  ubpmem[tidx * 2 + 1].flag =
+      smem[tidx * 2] != smem[tidx * 2 + 1]; // (0, 1), (1, 2), etc.
+  ubpmem[tidx * 2 + 1].val = !ubpmem[tidx * 2 + 1].flag;
+
+  // Compares elements (1, 2), (3, 4), ... and sets 2, 4, ...
+  if (((tidx + 1) * 2) < Power2Size) {
+    ubpmem[(tidx + 1) * 2].flag =
+        smem[((tidx + 1) * 2) - 1] != smem[(tidx + 1) * 2];
+    ubpmem[(tidx + 1) * 2].val = !ubpmem[(tidx + 1) * 2].flag;
+  }
+  __syncthreads(); // barrier for ubpmem initialization
+
+  // Next, we perform a segmented prefix sum on the neighboring elements, where
+  // the presence of a one indicates the start of a segment. In this case B acts
+  // as the segment start flags, and C is the buffer to be summed:
+  //
+  // Input  (C)  = [0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0]
+  // Flag   (B)  = [1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1]
+  // Output (C)  = [0, 1, 0, 1, 0, 1, 2, 0, 0, 0, 1, 0, 0]
+  //
+  // Afterwards, the (index) components of the ubpmem buffer contain the lengths
+  // of the segments (minus 1), i.e. the counts of each element in the original
+  // input.
+  inclusivePrefixScan<Power2Size>(
+      ubpmem, [=] GPU_LAMBDA(const auto& a, const auto& b) {
+        ModeUnsignedBoolPair c;
+        c.val = a.flag ? a.val : a.val + b.val;
+        c.flag = a.flag | b.flag;
+        return c;
+      });
+  // assumes scan syncs at the end
+
+  // Next, we reinterpret the ubpmem buffer as pairs of unsigned integers (i.e.
+  // we treat the boolean flag regions as integers). We initialize these to
+  // represent indices, and we'll call this buffer I
+  struct ModeUnsignedPair* uupmem =
+      reinterpret_cast<struct ModeUnsignedPair*>(ubpmem);
+
+  // At this point, we need to find the maximum element in lengths buffer C.
+  // This element will represent the count (-1) of the mode. Because of the
+  // way we have set up the problem, the index where this mode occurs will
+  // also be the location of the mode value in the sorted array, e.g.
+  //
+  // smem = [0, 0, 1, 1, 1, 2]
+  // C    = [0, 1, 0, 1, 2, 0]
+  // I    = [0, 1, 2, 3, 4, 5]
+  //                     ^
+  //                     maximum value, also aligned with mode = 1
+  //
+  // We perform a block wide max-reduction of the C buffer, but we also need the
+  // indices to come along with it, so we utilize the uupmem construction.
+  //
+  // At the end we need to return the ModeUnsignedPair containing index = 4, val
+  // = 2, which represents the max
+
+  // In practice, we will make each thread locally reduce 2 values in its
+  // registers prior to the global block-wide reduction. Note that instead of
+  // tidx/stidx, we utilize tidx * 2, tidx * 2 + 1, so each thread deals with
+  // adjacent elements. This is because the reduce code below relies on thread
+  // elements to be adjacent.
+  struct ModeUnsignedPair uup[2];
+  uup[0].index = tidx * 2;
+  uup[0].val = ubpmem[tidx * 2].val;
+  uup[1].index = tidx * 2 + 1;
+  uup[1].val = ubpmem[tidx * 2 + 1].val;
+  __syncthreads();
+
+  struct ModeUnsignedPair max = {0, 0};
+
+  struct MaxOp {
+    inline __device__ ModeUnsignedPair combine(ModeUnsignedPair a, ModeUnsignedPair b) const {
+      return b.val > a.val ? b : a;
+    }
+
+    inline __device__ ModeUnsignedPair warp_shfl_down(ModeUnsignedPair acc, int offset) const {
+      ModeUnsignedPair ret;
+      ret.index = WARP_SHFL_DOWN(acc.index, offset);
+      ret.val = WARP_SHFL_DOWN(acc.val, offset);
+      return ret;
+    }
+  } max_op;
+
+  max = reduceBlockWithNThreadLocalReductions<2>(
+      uupmem,
+      uup,
+      sliceSize,
+      max_op,
+      max);
+
+  // Store the mode in shared memory for use in finding the mode in the input
+  // slice
+  __shared__ T mode;
+
+  // Given the above constraints, the mode is the value at the reduced index in
+  // the original sorted element buffer
+  if (tidx == 0) {
+    mode = smem[max.index];
+  }
+  __syncthreads(); // broadcast mode
+
+  // Finally, we need to find "an" index of the mode in the input
+  // Tensor. The API does not constrain which index we pick, but here
+  // we always pick the largest index. We store the index if the value
+  // is the mode, or 0 otherwise. Then find the maximum value.
+  //
+  // Again we reduce 2 elements in the thread's registers prior to the
+  // block-wide reduction
+  unsigned mode_index[2] = {0u, 0u};
+  if (tidx * 2 < sliceSize) {
+    const unsigned idx = tidx * 2;
+    mode_index[0] = c10::load(&input[linearOffset + idx]) == mode ? idx : 0u;
+  }
+  if (tidx * 2 + 1 < sliceSize) {
+    const unsigned idx = tidx * 2 + 1;
+    mode_index[1] = c10::load(&input[linearOffset + idx]) == mode ? idx : 0u;
+  }
+
+  struct MaxIndexOp {
+    inline __device__ unsigned combine(unsigned a, unsigned b) const {
+      return b > a ? b : a;
+    }
+
+    inline __device__ unsigned warp_shfl_down(unsigned acc, int offset) const {
+      return WARP_SHFL_DOWN(acc, offset);
+    }
+  } max_index_op;
+
+  int64_t index = reduceBlockWithNThreadLocalReductions<2>(
+      reinterpret_cast<unsigned*>(&shmem[0]),
+      mode_index,
+      sliceSize,
+      max_index_op,
+      0u);
+
+  // Finally, we have the mode, and an index where it occurs. We use a single
+  // thread to place this in the appropriate output position
+  if (tidx == 0) {
+    unsigned int outputOffset =
+        at::cuda::detail::IndexToOffset::get(
+            blockId, values);
+    values.data[outputOffset] = mode;
+    indices.data[outputOffset] = index;
+  }
+}
+
+} // namespace native
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/TensorModeKernel.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/TensorModeKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..4ee6fb5c0fd2a17c4f29e44985fd7c2bab0f8b0a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/TensorModeKernel.h
@@ -0,0 +1,19 @@
+#pragma once
+#include 
+
+namespace at {
+class TensorBase;
+}
+
+namespace at {
+namespace native {
+
+void launch_fused_mode_kernel(
+    const TensorBase &values, const TensorBase &indices,
+    const TensorBase &self, int64_t slice_size, int64_t slices);
+
+void launch_apply_mode_kernel(
+    const TensorBase &values, const TensorBase &indices,
+    const TensorBase &self, int64_t dim, int64_t ndim);
+
+}}  // namespace at::native
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/TensorTopK.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/TensorTopK.h
new file mode 100644
index 0000000000000000000000000000000000000000..2f34706de244852b83865f9d0f45601fe01e0d5a
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/TensorTopK.h
@@ -0,0 +1,14 @@
+#pragma once
+#include 
+
+namespace at {
+class TensorBase;
+}
+
+namespace at {
+namespace native {
+void launch_gather_topk_kernel(
+    const TensorBase& self,
+    int64_t k, int64_t dim, bool largest,
+    const TensorBase& values, const TensorBase& indices);
+}}
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/UniqueCub.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/UniqueCub.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..3b39fbebc8daac24f402a7230b9547706a300b24
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/UniqueCub.cuh
@@ -0,0 +1,16 @@
+#include 
+
+namespace at {
+namespace native {
+namespace internal {
+
+template <typename scalar_t>
+std::tuple<Tensor, Tensor, Tensor> unique_cuda_template(
+    const Tensor& self,
+    const bool consecutive,
+    const bool return_inverse,
+    const bool return_counts);
+
+} // namespace internal
+} // namespace native
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/UpSample.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/UpSample.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..11f644f8d2aaa8fd5cba0bf128cf8e0d600f660b
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/UpSample.cuh
@@ -0,0 +1,370 @@
+#pragma once
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+namespace at {
+namespace native {
+
+namespace upsample {
+// TODO: Remove duplicate declaration.
+TORCH_API c10::SmallVector<int64_t, 3> compute_output_size(
+    c10::IntArrayRef input_size,  // Full input tensor size.
+    at::OptionalIntArrayRef output_size,
+    c10::optional<c10::ArrayRef<double>> scale_factors);
+} // namespace upsample
+
+namespace upsample_cuda {
+
+// TODO: Remove duplication with Upsample.h (CPU).
+inline c10::optional<double> get_scale_value(c10::optional<c10::ArrayRef<double>> scales, int idx) {
+  if (!scales) {
+    return nullopt;
+  }
+  return scales->at(idx);
+}
+
+} // namespace upsample_cuda
+
+
+/* TODO: move this to a common place */
+template <typename scalar_t>
+__device__ inline scalar_t min(scalar_t a, scalar_t b) {
+  return a < b ? a : b;
+}
+
+template <typename scalar_t>
+__device__ inline scalar_t max(scalar_t a, scalar_t b) {
+  return a > b ? a : b;
+}
+
+// NOTE [ Nearest neighbor upsampling kernel implementation ]
+//
+// The nearest neighbor upsampling kernel implementation is symmetrical as
+// expected. We launch kernels with threads mapping to destination tensors where
+// kernels write data to, each thread reads data from the source tensor, this
+// means:
+// 1. In the forward kernel,
+//      src_xxx refers to properties of input tensors;
+//      dst_xxx refers to properties of output tensors;
+//      scale_factor is the ratio of src_size to dst_size;
+// 2. In the backward kernel,
+//      src_xxx refers to properties of grad_output tensors;
+//      dst_xxx refers to properties of grad_input tensors;
+//      scale_factor is the ratio of src_size to dst_size;
+//
+// Because of this, we need to take the reciprocal of the scale defined by
+// upsample layer during forward path. The motivation is to avoid slow
+// division in the kernel code, so we can use faster multiplication instead.
+// This is not necessary during backward path, since the scale_factor is already
+// the reciprocal of corresponding scale_factor used in the forward path due to
+// the swap of source and destination tensor.
+//
+// Similarly, since the mapping from grad_input to grad_output during backward
+// is the reverse of the mapping of output to input, we need to have opposite
+// mapping functions to compute the source index.
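+//
+// Worked example (illustrative): upsampling a width of src_size = 4 to
+// dst_size = 8 with no explicit scale gives scale_factor = 4 / 8 = 0.5 in the
+// forward kernel, so dst index 5 reads from src index
+// nearest_neighbor_compute_source_index(0.5, 5, 4) = min(floor(5 * 0.5), 3) = 2.
+// In the backward kernel the roles swap and the stored factor is already the
+// reciprocal, so no division is needed on the device.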
+
+// see NOTE [ Nearest neighbor upsampling kernel implementation ]
+template <typename accscalar_t>
+__host__ __forceinline__ static accscalar_t compute_scales_value(
+    const c10::optional<double> scale,
+    int64_t src_size,
+    int64_t dst_size) {
+  // FIXME: remove magic > 0 after we ensure no models were serialized with -1 defaults.
+  return (scale.has_value() && scale.value() > 0.) ? (accscalar_t)(1.0 / scale.value())
+                                                   : (accscalar_t)src_size / dst_size;
+}
+
+// see NOTE [ Nearest neighbor upsampling kernel implementation ]
+template <typename accscalar_t>
+__host__ __forceinline__ static accscalar_t compute_scales_value_backwards(
+    const c10::optional<double> scale,
+    int64_t src_size,
+    int64_t dst_size) {
+  // FIXME: remove magic > 0 after we ensure no models were serialized with -1 defaults.
+  return (scale.has_value() && scale.value() > 0.) ? (accscalar_t)scale.value()
+                                                   : (accscalar_t)src_size / dst_size;
+}
+
+template <typename accscalar_t>
+__host__ __forceinline__ static accscalar_t area_pixel_compute_scale(
+    int input_size,
+    int output_size,
+    bool align_corners,
+    const c10::optional<double> scale) {
+  if(align_corners) {
+    if(output_size > 1) {
+      return (accscalar_t)(input_size - 1) / (output_size - 1);
+    }
+    else {
+      return static_cast<accscalar_t>(0);
+    }
+  }
+  else{
+    return compute_scales_value<accscalar_t>(scale, input_size, output_size);
+  }
+}
+
+template <typename accscalar_t>
+__device__ __forceinline__ static accscalar_t area_pixel_compute_source_index(
+    accscalar_t scale,
+    int dst_index,
+    bool align_corners,
+    bool cubic) {
+  if (align_corners) {
+    return scale * dst_index;
+  } else {
+    accscalar_t src_idx = scale * (dst_index + static_cast<accscalar_t>(0.5)) -
+        static_cast<accscalar_t>(0.5);
+    // See Note[Follow Opencv resize logic]
+    return (!cubic && src_idx < static_cast<accscalar_t>(0))
+        ? static_cast<accscalar_t>(0)
+        : src_idx;
+  }
+}
+
+// see NOTE [ Nearest neighbor upsampling kernel implementation ]
+__device__ __forceinline__ static int nearest_neighbor_compute_source_index(
+    const float scale,
+    int dst_index,
+    int input_size) {
+  // index_f32 = (output_index) * scale
+  // input_index = round(index_f32)
+  // Same as a buggy OpenCV INTER_NEAREST
+  // We keep this method for BC and consider as deprecated.
+  // See nearest_neighbor_exact_compute_source_index as replacement
+  const int src_index =
+      min(static_cast<int>(floorf((dst_index) * scale)), input_size - 1);
+  return src_index;
+}
+
+__device__ __forceinline__ static int nearest_neighbor_exact_compute_source_index(
+    const float scale,
+    int dst_index,
+    int input_size) {
+  // index_f32 = (output_index + 0.5) * scale - 0.5
+  // input_index = round(index_f32)
+  // Same as Pillow and Scikit-Image/Scipy ndi.zoom
+  const int src_index =
+      min(static_cast<int>(floorf((dst_index + static_cast<float>(0.5)) * scale)), input_size - 1);
+  return src_index;
+}
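+
+// Example (illustrative): when downscaling a length-5 row to 3 elements the
+// kernel scale is 5/3 ~= 1.667; for dst_index = 1 the legacy mapping above
+// picks floor(1 * 1.667) = 1, while this exact variant picks
+// floor((1 + 0.5) * 1.667) = 2, matching the Pillow/scipy behaviour.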
+
+// see NOTE [ Nearest neighbor upsampling kernel implementation ]
+__device__ __forceinline__ static int nearest_neighbor_bw_compute_source_index(
+    const float scale,
+    int dst_index,
+    int output_size) {
+  // Equivalent to buggy OpenCV INTER_NEAREST
+  // We keep this method for BC and consider it deprecated.
+  // See nearest_neighbor_exact_bw_compute_source_index as replacement
+  const int src_index =
+      min(static_cast<int>(ceilf(dst_index * scale)), output_size);
+  return src_index;
+}
+
+// see NOTE [ Nearest neighbor upsampling kernel implementation ]
+__device__ __forceinline__ static int nearest_neighbor_exact_bw_compute_source_index(
+    const float scale,
+    int dst_index,
+    int output_size) {
+  // Equivalent to Pillow and Scikit-Image/Scipy ndi.zoom
+  const int src_index =
+      min(static_cast<int>(ceilf(dst_index * scale - static_cast<float>(0.5))), output_size);
+  return src_index;
+}
+
+/* Used by UpSampleBicubic2d.cu */
+template <typename scalar_t>
+__device__ __forceinline__ static scalar_t upsample_get_value_bounded(
+    const PackedTensorAccessor64<const scalar_t, 4>& data,
+    int batch,
+    int channel,
+    int height,
+    int width,
+    int y,
+    int x) {
+  int access_y = max(min(y, height - 1), 0);
+  int access_x = max(min(x, width - 1), 0);
+  return data[batch][channel][access_y][access_x];
+}
+
+/* Used by UpSampleBicubic2d.cu */
+template <typename scalar_t, typename accscalar_t>
+__device__ __forceinline__ static void upsample_increment_value_bounded(
+    PackedTensorAccessor64<scalar_t, 4>& data,
+    int batch,
+    int channel,
+    int height,
+    int width,
+    int y,
+    int x,
+    accscalar_t value) {
+  int access_y = max(min(y, height - 1), 0);
+  int access_x = max(min(x, width - 1), 0);
+  /* TODO: result here is truncated to scalar_t,
+     check: https://github.com/pytorch/pytorch/pull/19630#discussion_r281426912
+   */
+  gpuAtomicAddNoReturn(
+      &data[batch][channel][access_y][access_x], static_cast<scalar_t>(value));
+}
+
+// Based on
+// https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
+template <typename accscalar_t>
+__device__ __forceinline__ static accscalar_t cubic_convolution1(
+    accscalar_t x,
+    accscalar_t A) {
+  return ((A + 2) * x - (A + 3)) * x * x + 1;
+}
+
+template <typename accscalar_t>
+__device__ __forceinline__ static accscalar_t cubic_convolution2(
+    accscalar_t x,
+    accscalar_t A) {
+  return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
+}
+
+template <typename accscalar_t>
+__device__ __forceinline__ static void get_cubic_upsampling_coefficients(
+    accscalar_t coeffs[4],
+    accscalar_t t) {
+  accscalar_t A = -0.75;
+
+  accscalar_t x1 = t;
+  coeffs[0] = cubic_convolution2<accscalar_t>(x1 + 1.0, A);
+  coeffs[1] = cubic_convolution1<accscalar_t>(x1, A);
+
+  // opposite coefficients
+  accscalar_t x2 = 1.0 - t;
+  coeffs[2] = cubic_convolution1<accscalar_t>(x2, A);
+  coeffs[3] = cubic_convolution2<accscalar_t>(x2 + 1.0, A);
+}
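+
+// Worked example (illustrative): at t = 0.5 with A = -0.75 the coefficients
+// come out as [-0.09375, 0.59375, 0.59375, -0.09375]; they sum to 1, so
+// cubic_interp1d below is an affine combination of the four taps.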
+
+template <typename scalar_t, typename accscalar_t>
+__device__ __forceinline__ static accscalar_t cubic_interp1d(
+    scalar_t x0,
+    scalar_t x1,
+    scalar_t x2,
+    scalar_t x3,
+    accscalar_t t) {
+  accscalar_t coeffs[4];
+  get_cubic_upsampling_coefficients<accscalar_t>(coeffs, t);
+
+  return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3];
+}
+
+namespace upsample_antialias {
+
+// taken from
+// https://github.com/python-pillow/Pillow/blob/6812205f18ca4ef54372e87e1a13ce4a859434df/
+// src/libImaging/Resample.c#L20-L29
+struct BilinearFilterFunctor {
+
+  template <typename accscalar_t>
+  __device__ accscalar_t operator()(accscalar_t x) const {
+    if (x < 0) {
+      x = -x;
+    }
+    if (x < 1) {
+      return 1 - x;
+    }
+    return 0;
+  }
+
+  static const int size = 2;
+};
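+
+// Illustrative values of the triangle (tent) filter above: f(0) = 1,
+// f(+/-0.5) = 0.5, and f(x) = 0 for |x| >= 1, i.e. a support of `size` = 2
+// source pixels per output pixel before antialias stretching.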
+
+// taken from
+// https://github.com/python-pillow/Pillow/blob/6812205f18ca4ef54372e87e1a13ce4a859434df/
+// src/libImaging/Resample.c#L46-L62
+struct BicubicFilterFunctor {
+
+  template <typename accscalar_t>
+  __device__ accscalar_t operator()(accscalar_t x) const {
+    // https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
+    const accscalar_t a = -0.5;
+    if (x < 0) {
+      x = -x;
+    }
+    if (x < 1) {
+      return ((a + 2) * x - (a + 3)) * x * x + 1;
+    }
+    if (x < 2) {
+      return (((x - 5) * x + 8) * x - 4) * a;
+    }
+    return 0;
+  }
+
+  static const int size = 4;
+};
+
+template <typename accscalar_t>
+__device__ __forceinline__ static void _compute_weights_span(
+    const int i,
+    const int input_size,
+    const accscalar_t scale,
+    const accscalar_t support,
+    int& xmin,
+    int& xsize,
+    accscalar_t& center) {
+  center = scale * (i + static_cast<accscalar_t>(0.5));
+  xmin = max(static_cast<int>(center - support + static_cast<accscalar_t>(0.5)), static_cast<int>(0));
+  xsize = min(static_cast<int>(center + support + static_cast<accscalar_t>(0.5)), input_size) - xmin;
+}
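+
+// Example (illustrative, assuming support ~= scale for the bilinear filter,
+// which is how callers typically size it when downscaling): shrinking a
+// width-10 row to 5 gives scale = 2; for output index i = 1,
+// center = 2 * 1.5 = 3.0, xmin = max(int(3 - 2 + 0.5), 0) = 1 and
+// xsize = min(int(3 + 2 + 0.5), 10) - 1 = 4, so the weighted sum reads
+// source columns 1..4.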
+
+template <typename scalar_t, typename accscalar_t, typename interp_filter_t>
+__device__ __forceinline__ static void _compute_weights(
+    scalar_t* wt_ptr,
+    const accscalar_t scale,
+    int interp_size,
+    const interp_filter_t& interp_filter,
+    accscalar_t xmin_m_center,
+    int xsize) {
+
+  accscalar_t invscale = (scale >= 1.0) ? 1.0 / scale : 1.0;
+  accscalar_t total_w = 0.0;
+  int j = 0;
+  for (j = 0; j < xsize; j++) {
+    accscalar_t w = interp_filter((j + xmin_m_center + static_cast<accscalar_t>(0.5)) * invscale);
+    wt_ptr[j] = static_cast<scalar_t>(w);
+    total_w += w;
+  }
+  for (j = 0; j < xsize; j++) {
+    if (total_w != 0.0) {
+      wt_ptr[j] /= total_w;
+    }
+  }
+  for (; j < interp_size; j++) {
+    wt_ptr[j] = static_cast<scalar_t>(0.0);
+  }
+}
+
+template <typename scalar_t, typename accscalar_t>
+__device__ __forceinline__ static accscalar_t interpolate_aa_single_dim(
+    const scalar_t* src,
+    const scalar_t* weights,
+    int size) {
+  scalar_t t = static_cast<scalar_t>(*src);
+  scalar_t wts = static_cast<scalar_t>(weights[0]);
+  accscalar_t output = t * wts;
+
+  int j = 1;
+  for (; j < size; j++) {
+    wts = static_cast<scalar_t>(weights[j]);
+    t = static_cast<scalar_t>(*(src + j));
+    output += t * wts;
+  }
+  return output;
+}
+
+} // namespace upsample_antialias
+
+} // namespace native
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/block_reduce.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/block_reduce.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..1366dbd3a2adc2fee0f672eaca4df59efdc5c883
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/block_reduce.cuh
@@ -0,0 +1,143 @@
+#pragma once
+
+#include <thrust/tuple.h>
+
+#include <ATen/NumericUtils.h>
+#include <ATen/cuda/DeviceUtils.cuh>
+
+namespace at {
+namespace native {
+namespace cuda_utils {
+
+constexpr int kCUDABlockReduceNumThreads = 512;
+// Algorithmic limitation: BlockReduce does two WarpReduce calls, each
+// of which reduces C10_WARP_SIZE elements. So, at most
+// C10_WARP_SIZE**2 elements can be reduced at a time.
+// NOTE: This is >= the max block size on current hardware anyway (1024).
+constexpr int kCUDABlockReduceMaxThreads = C10_WARP_SIZE * C10_WARP_SIZE;
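+// For example, with C10_WARP_SIZE == 32 (NVIDIA GPUs) this is 32 * 32 = 1024
+// threads; with a wavefront size of 64 (AMD/ROCm) it is 64 * 64 = 4096.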
+
+// Sums `val` across all threads in a warp.
+//
+// Assumptions:
+//   - The size of each block should be a multiple of `C10_WARP_SIZE`
+template <typename T>
+__inline__ __device__ T WarpReduceSum(T val) {
+#pragma unroll
+  for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) {
+    val += WARP_SHFL_DOWN(val, offset);
+  }
+  return val;
+}
+
+// Picks the maximum `val` across all threads in a warp.
+//
+// Assumptions:
+//   - The size of each block should be a multiple of `C10_WARP_SIZE`
+template <typename T>
+__inline__ __device__ T WarpReduceMax(T val) {
+#pragma unroll
+  for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) {
+    val = max_propagate_nan(val, WARP_SHFL_DOWN(val, offset));
+  }
+  return val;
+}
+
+struct Block1D {
+    static __forceinline__ __device__ int Tid() { return threadIdx.x; }
+
+    static __forceinline__ __device__ int Warps() {
+        return blockDim.x / C10_WARP_SIZE;
+    }
+};
+
+struct Block2D {
+    static __forceinline__ __device__ int Tid() {
+        return threadIdx.x + threadIdx.y * blockDim.x;
+    }
+
+    static __forceinline__ __device__ int Warps() {
+        return blockDim.x * blockDim.y / C10_WARP_SIZE;
+    }
+};
+
+// Sums `val` across all threads in a block.
+//
+// Warning: the return value is only valid for thread 0.
+// Assumptions:
+//   - The size of each block should be a multiple of `C10_WARP_SIZE`
+//   - `shared` should be a pointer to shared memory with size of, at least,
+//     `sizeof(T) * number_of_warps`
+template <typename T, typename B = Block1D>
+__inline__ __device__ T BlockReduceSum(T val, T* shared) {
+  const int tid = B::Tid();
+  const int lid = tid % C10_WARP_SIZE;
+  const int wid = tid / C10_WARP_SIZE;
+  val = WarpReduceSum(val);
+  __syncthreads(); // prevent races when BlockReduces are called in a row.
+  if (lid == 0) {
+    shared[wid] = val;
+  }
+  __syncthreads();
+  val = (tid < B::Warps()) ? shared[lid] : T(0);
+  if (wid == 0) {
+    val = WarpReduceSum(val);
+  }
+  return val;
+}
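+
+// Hedged usage sketch (illustrative only, not part of this header): a caller
+// typically provides one shared slot per warp and keeps the block result from
+// thread 0, e.g.
+//
+//   __global__ void row_sum_kernel(const float* in, float* out, int n) {
+//     // one slot per warp when launching 512 threads with a warp size of 32
+//     __shared__ float shared[16];
+//     const int i = blockIdx.x * blockDim.x + threadIdx.x;
+//     float val = (i < n) ? in[i] : 0.f;
+//     val = BlockReduceSum<float>(val, shared);
+//     if (threadIdx.x == 0) {
+//       out[blockIdx.x] = val;
+//     }
+//   }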
+
+// Picks out the maximum `val` across all threads in a block.
+//
+// Warning: the return value is only valid for thread 0.
+// Assumptions:
+//   - The size of each block should be a multiple of `C10_WARP_SIZE`
+//   - `shared` should be a pointer to shared memory with size of, at least,
+//     `sizeof(T) * number_of_warps`
+template <typename T, typename B = Block1D>
+__inline__ __device__ T BlockReduceMax(T val, T* shared) {
+  const int tid = B::Tid();
+  const int lid = tid % C10_WARP_SIZE;
+  const int wid = tid / C10_WARP_SIZE;
+  val = WarpReduceMax(val);
+  __syncthreads(); // prevent races when BlockReduces are called in a row.
+  if (lid == 0) {
+    shared[wid] = val;
+  }
+  __syncthreads();
+  val = (tid < B::Warps()) ? shared[lid] : T(0);
+  if (wid == 0) {
+    val = WarpReduceMax(val);
+  }
+  return val;
+}
+
+template <typename T, class ReduceOp>
+__inline__ __device__ T WarpReduce(T val, const ReduceOp& op) {
+#pragma unroll
+  for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) {
+    val = op.combine(val, op.warp_shfl_down(val, offset));
+  }
+  return val;
+}
+
+template <typename T, class ReduceOp, typename B = Block1D>
+__inline__ __device__ T
+BlockReduce(T val, const ReduceOp& op, const T& identity_element, T* shared) {
+  const int tid = B::Tid();
+  const int lid = tid % C10_WARP_SIZE;
+  const int wid = tid / C10_WARP_SIZE;
+  val = WarpReduce(val, op);
+  __syncthreads(); // prevent races when BlockReduces are called in a row.
+  if (lid == 0) {
+    shared[wid] = val;
+  }
+  __syncthreads();
+  val = (tid < B::Warps()) ? shared[lid] : identity_element;
+  if (wid == 0) {
+    val = WarpReduce(val, op);
+  }
+  return val;
+}
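+
+// Hedged sketch of a minimal ReduceOp (illustrative; any functor exposing
+// combine() and warp_shfl_down() with these shapes should work here):
+//
+//   struct MaxOp {
+//     __device__ float combine(float a, float b) const { return a > b ? a : b; }
+//     __device__ float warp_shfl_down(float v, int offset) const {
+//       return WARP_SHFL_DOWN(v, offset);
+//     }
+//   };
+//   // val = BlockReduce<float, MaxOp, Block1D>(val, MaxOp{},
+//   //                                          -FLT_MAX /*identity*/, shared);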
+
+} // namespace cuda_utils
+} // namespace native
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/fused_adam_amsgrad_impl.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/fused_adam_amsgrad_impl.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..211cb62dcae2bf5feea8bca8dbf5e48f3d81dee9
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/fused_adam_amsgrad_impl.cuh
@@ -0,0 +1,40 @@
+#pragma once
+#include <ATen/core/Tensor.h>
+
+namespace at {
+namespace native {
+
+void _fused_adam_amsgrad_cuda_impl_(
+    at::TensorList params,
+    at::TensorList grads,
+    at::TensorList exp_avgs,
+    at::TensorList exp_avg_sqs,
+    at::TensorList max_exp_avg_sqs,
+    at::TensorList state_steps,
+    const double lr,
+    const double beta1,
+    const double beta2,
+    const double weight_decay,
+    const double eps,
+    const bool maximize,
+    const c10::optional<at::Tensor>& grad_scale,
+    const c10::optional<at::Tensor>& found_inf);
+
+void _fused_adam_amsgrad_cuda_impl_(
+    at::TensorList params,
+    at::TensorList grads,
+    at::TensorList exp_avgs,
+    at::TensorList exp_avg_sqs,
+    at::TensorList max_exp_avg_sqs,
+    at::TensorList state_steps,
+    const at::Tensor& lr,
+    const double beta1,
+    const double beta2,
+    const double weight_decay,
+    const double eps,
+    const bool maximize,
+    const c10::optional<at::Tensor>& grad_scale,
+    const c10::optional<at::Tensor>& found_inf);
+
+} // namespace native
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/fused_adam_impl.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/fused_adam_impl.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..d94d65c9c2ba07f3f3b1b6342abd8ed8da1206b8
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/fused_adam_impl.cuh
@@ -0,0 +1,38 @@
+#pragma once
+#include <ATen/core/Tensor.h>
+
+namespace at {
+namespace native {
+
+void _fused_adam_cuda_impl_(
+    at::TensorList params,
+    at::TensorList grads,
+    at::TensorList exp_avgs,
+    at::TensorList exp_avg_sqs,
+    at::TensorList state_steps,
+    const double lr,
+    const double beta1,
+    const double beta2,
+    const double weight_decay,
+    const double eps,
+    const bool maximize,
+    const c10::optional<at::Tensor>& grad_scale,
+    const c10::optional<at::Tensor>& found_inf);
+
+void _fused_adam_cuda_impl_(
+    at::TensorList params,
+    at::TensorList grads,
+    at::TensorList exp_avgs,
+    at::TensorList exp_avg_sqs,
+    at::TensorList state_steps,
+    const at::Tensor& lr,
+    const double beta1,
+    const double beta2,
+    const double weight_decay,
+    const double eps,
+    const bool maximize,
+    const c10::optional<at::Tensor>& grad_scale,
+    const c10::optional<at::Tensor>& found_inf);
+
+} // namespace native
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/fused_adam_utils.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/fused_adam_utils.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..be681ad162b5951d733ef3efc0c764d5c0d45d20
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/fused_adam_utils.cuh
@@ -0,0 +1,202 @@
+#pragma once
+#include <ATen/OpMathType.h>
+#include <ATen/core/Tensor.h>
+#include <ATen/cuda/detail/IndexUtils.cuh>
+#include <ATen/native/cuda/ForeachFunctors.cuh>
+#include <ATen/native/cuda/MultiTensorApply.cuh>
+
+namespace at {
+namespace native {
+
+enum class ADAM_MODE : uint8_t { ORIGINAL = 0, ADAMW = 1 };
+
+namespace {
+
+constexpr uint8_t kParamIdx = 0;
+constexpr uint8_t kGradIdx = 1;
+constexpr uint8_t kExpAvgIdx = 2;
+constexpr uint8_t kExpAvgSqIdx = 3;
+constexpr uint8_t kMaxExpAvgSqIdx = 4;
+
+template <
+    typename scalar_type,
+    typename opmath_t,
+    int depth,
+    ADAM_MODE adam_mode,
+    bool amsgrad>
+C10_DEVICE inline void adam_math(
+    scalar_type r_args[depth][kILP],
+    const double& lr,
+    const double& beta1,
+    const double& beta2,
+    const double& weight_decay,
+    const double& eps,
+    const bool& maximize,
+    const float* grad_scale_ptr,
+    const float* found_inf_ptr,
+    const opmath_t& bias_correction1,
+    const opmath_t& bias_correction2_sqrt) {
+  static_assert(depth == 4 || depth == 5);
+#pragma unroll
+  for (int ii = 0; ii < kILP; ii++) {
+    // Load values.
+    opmath_t param = static_cast<opmath_t>(r_args[kParamIdx][ii]);
+    opmath_t grad = static_cast<opmath_t>(r_args[kGradIdx][ii]);
+    if (grad_scale_ptr) {
+      grad /= (static_cast<opmath_t>(*grad_scale_ptr));
+    }
+    const opmath_t grad_to_store = grad;
+    if (maximize) {
+      grad = -grad;
+    }
+    opmath_t exp_avg = static_cast<opmath_t>(r_args[kExpAvgIdx][ii]);
+    opmath_t exp_avg_sq = static_cast<opmath_t>(r_args[kExpAvgSqIdx][ii]);
+    opmath_t max_exp_avg_sq;
+    if (amsgrad) {
+      max_exp_avg_sq = static_cast<opmath_t>(r_args[kMaxExpAvgSqIdx][ii]);
+    }
+    // Update param, grad, 1st and 2nd order momentum.
+    if (weight_decay != 0) {
+      if constexpr (adam_mode == ADAM_MODE::ORIGINAL) {
+        grad += param * weight_decay;
+      } else if constexpr (adam_mode == ADAM_MODE::ADAMW) {
+        param -= lr * weight_decay * param;
+      }
+    }
+    // todo(crcrpar): use lerp
+    // ref: https://developer.nvidia.com/blog/lerp-faster-cuda/
+    exp_avg = beta1 * exp_avg + (1 - beta1) * grad;
+    exp_avg_sq = beta2 * exp_avg_sq + (1 - beta2) * grad * grad;
+    const opmath_t step_size = lr / bias_correction1;
+    opmath_t denom;
+    if (amsgrad) {
+      max_exp_avg_sq = std::max(max_exp_avg_sq, exp_avg_sq);
+      denom = (std::sqrt(max_exp_avg_sq) / bias_correction2_sqrt) + eps;
+    } else {
+      denom = (std::sqrt(exp_avg_sq) / bias_correction2_sqrt) + eps;
+    }
+    param -= step_size * exp_avg / denom;
+
+    // Store results.
+    r_args[kParamIdx][ii] = param;
+    if (grad_scale_ptr) {
+      r_args[kGradIdx][ii] = grad_to_store;
+    }
+    r_args[kExpAvgIdx][ii] = exp_avg;
+    r_args[kExpAvgSqIdx][ii] = exp_avg_sq;
+    if (amsgrad) {
+      r_args[kMaxExpAvgSqIdx][ii] = max_exp_avg_sq;
+    }
+  }
+}
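+
+// Worked example (illustrative numbers): with lr = 1e-3, beta1 = 0.9,
+// beta2 = 0.999, eps = 1e-8, weight_decay = 0, step = 1, grad = 0.1 and all
+// running stats starting at zero:
+//   exp_avg    = 0.9 * 0 + 0.1 * 0.1     = 0.01
+//   exp_avg_sq = 0.999 * 0 + 0.001 * 0.01 = 1e-5
+//   bias_correction1 = 1 - 0.9^1 = 0.1, bias_correction2_sqrt = sqrt(1 - 0.999) ~= 0.0316
+//   step_size = 1e-3 / 0.1 = 0.01, denom = sqrt(1e-5) / 0.0316 + 1e-8 ~= 0.1
+//   param update ~= -0.01 * 0.01 / 0.1 = -1e-3, i.e. the first step moves by ~lr.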
+
+// [note: Conditional Gradient Store when `optimizer.step` is called by
+// GradScaler] When a user is training their model(s) with an FP16 AMP recipe,
+// parameter updates are done via `grad_scaler.step(optimizer)` instead of
+// `optimizer.step()`. For most optimizers, GradScaler unscales gradients on
+// behalf of those optimizers. Also, before `.step`, it makes sure that all the
+// gradients involved are finite, which incurs a device sync. On the other hand,
+// fused optimizers set their member variable of `_step_supports_amp_scaling` to
+// `True` in order to remove the device sync above. This means that fused
+// optimizers have to have their CUDA kernels (a) unscale gradients and (b) skip
+// parameter updates accordingly. To be functionally on par with `torch.optim`
+// optimizers and `_multi_tensor` ones, the kernel below writes out gradients
+// only when `grad_scale_ptr != nullptr`.
+template <typename scalar_type, int depth, ADAM_MODE adam_mode, bool amsgrad>
+struct FusedAdamMathFunctor {
+  static_assert(
+      depth == 4 || depth == 5,
+      "depth of 4 for Adam, depth of 5 for Adam with AMSGrad.");
+  using opmath_t = at::opmath_type<scalar_type>;
+  C10_DEVICE __forceinline__ void operator()(
+      int chunk_size,
+      FusedOptimizerTensorListMetadata<depth>& tl,
+      const float* lr_ptr,
+      const double& lr,
+      const double& beta1,
+      const double& beta2,
+      const double& weight_decay,
+      const double& eps,
+      const bool& maximize,
+      const float* grad_scale_ptr,
+      const float* found_inf_ptr) {
+    const auto tensor_loc = tl.block_to_tensor[blockIdx.x];
+    const auto chunk_idx = tl.block_to_chunk[blockIdx.x];
+    const double lr_double = lr_ptr ? *lr_ptr : lr;
+
+    if (found_inf_ptr && *found_inf_ptr == 1) {
+      return;
+    }
+    const auto [bias_correction1, bias_correction2_sqrt] =
+        [&]() -> std::pair<double, double> {
+      auto* step_count =
+          reinterpret_cast<const float*>(tl.state_steps_addresses[tensor_loc]);
+      const auto bias_correction1 = 1 - at::native::pow_(beta1, *step_count);
+      const auto bias_correction2 = 1 - at::native::pow_(beta2, *step_count);
+      const auto bias_correction2_sqrt = std::sqrt(bias_correction2);
+      return {bias_correction1, bias_correction2_sqrt};
+    }();
+
+    scalar_type* args[depth];
+    scalar_type r_args[depth][kILP];
+    const auto n = tl.numel_for_tensor[tensor_loc] - chunk_idx * chunk_size;
+
+    const bool all_aligned{
+        init_args<depth>(args, tl, chunk_idx, chunk_size, tensor_loc)};
+    if ((n % kILP == 0) && (chunk_size % kILP == 0) && all_aligned) {
+      for (int64_t i_start = threadIdx.x;
+           i_start * kILP < n && i_start * kILP < chunk_size;
+           i_start += blockDim.x) {
+#pragma unroll
+        for (int i = 0; i < depth; i++) {
+          load_store(r_args[i], args[i], 0, i_start);
+        }
+        adam_math<scalar_type, opmath_t, depth, adam_mode, amsgrad>(
+            r_args,
+            lr_double,
+            beta1,
+            beta2,
+            weight_decay,
+            eps,
+            maximize,
+            grad_scale_ptr,
+            found_inf_ptr,
+            bias_correction1,
+            bias_correction2_sqrt);
+#pragma unroll
+        for (int i = 0; i < depth; i++) {
+          if (i != kGradIdx || grad_scale_ptr) {
+            load_store(args[i], r_args[i], i_start, 0);
+          }
+        }
+      }
+    } else {
+      for (int64_t i_start = 0; i_start < n && i_start < chunk_size;
+           i_start += blockDim.x * kILP) {
+        load_args<depth>(r_args, args, i_start, chunk_size, n);
+        adam_math<scalar_type, opmath_t, depth, adam_mode, amsgrad>(
+            r_args,
+            lr_double,
+            beta1,
+            beta2,
+            weight_decay,
+            eps,
+            maximize,
+            grad_scale_ptr,
+            found_inf_ptr,
+            bias_correction1,
+            bias_correction2_sqrt);
+#pragma unroll
+        for (int i = 0; i < depth; i++) {
+          if (i != kGradIdx || grad_scale_ptr) {
+            store_args(args[i], r_args[i], i_start, chunk_size, n);
+          }
+        }
+      }
+    }
+  }
+};
+} // namespace
+
+} // namespace native
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/fused_adamw_amsgrad_impl.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/fused_adamw_amsgrad_impl.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..06782055593021b1301f8f670b444db5af9001da
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/fused_adamw_amsgrad_impl.cuh
@@ -0,0 +1,40 @@
+#pragma once
+#include <ATen/core/Tensor.h>
+
+namespace at {
+namespace native {
+
+void _fused_adamw_amsgrad_cuda_impl_(
+    at::TensorList params,
+    at::TensorList grads,
+    at::TensorList exp_avgs,
+    at::TensorList exp_avg_sqs,
+    at::TensorList max_exp_avg_sqs,
+    at::TensorList state_steps,
+    const double lr,
+    const double beta1,
+    const double beta2,
+    const double weight_decay,
+    const double eps,
+    const bool maximize,
+    const c10::optional<at::Tensor>& grad_scale,
+    const c10::optional<at::Tensor>& found_inf);
+
+void _fused_adamw_amsgrad_cuda_impl_(
+    at::TensorList params,
+    at::TensorList grads,
+    at::TensorList exp_avgs,
+    at::TensorList exp_avg_sqs,
+    at::TensorList max_exp_avg_sqs,
+    at::TensorList state_steps,
+    const at::Tensor& lr,
+    const double beta1,
+    const double beta2,
+    const double weight_decay,
+    const double eps,
+    const bool maximize,
+    const c10::optional<at::Tensor>& grad_scale,
+    const c10::optional<at::Tensor>& found_inf);
+
+} // namespace native
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/fused_adamw_impl.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/fused_adamw_impl.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..6d454ed93960ec71fc598ee1f8a2cd96f01301d2
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/fused_adamw_impl.cuh
@@ -0,0 +1,38 @@
+#pragma once
+#include <ATen/core/Tensor.h>
+
+namespace at {
+namespace native {
+
+void _fused_adamw_cuda_impl_(
+    at::TensorList params,
+    at::TensorList grads,
+    at::TensorList exp_avgs,
+    at::TensorList exp_avg_sqs,
+    at::TensorList state_steps,
+    const double lr,
+    const double beta1,
+    const double beta2,
+    const double weight_decay,
+    const double eps,
+    const bool maximize,
+    const c10::optional<at::Tensor>& grad_scale,
+    const c10::optional<at::Tensor>& found_inf);
+
+void _fused_adamw_cuda_impl_(
+    at::TensorList params,
+    at::TensorList grads,
+    at::TensorList exp_avgs,
+    at::TensorList exp_avg_sqs,
+    at::TensorList state_steps,
+    const at::Tensor& lr,
+    const double beta1,
+    const double beta2,
+    const double weight_decay,
+    const double eps,
+    const bool maximize,
+    const c10::optional<at::Tensor>& grad_scale,
+    const c10::optional<at::Tensor>& found_inf);
+
+} // namespace native
+} // namespace at
diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/im2col.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/im2col.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..8829ecb6155cb12b91b35575baa29973fb963ebb
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/im2col.cuh
@@ -0,0 +1,345 @@
+#pragma once
+
+#include <ATen/AccumulateType.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/detail/KernelUtils.h>
+
+#include <c10/macros/Macros.h>
+
+namespace at {
+namespace native {
+
+using namespace at::cuda::detail;
+
+// Kernel for fast unfold+copy
+// (borrowed from Caffe:
+// https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu)
+// CUDA_NUM_THREADS = 1024
+
+template <typename dt>
+C10_LAUNCH_BOUNDS_1(1024)
+__global__ void im2col_kernel(
+    const int64_t n,
+    const dt* data_im,
+    const int64_t height,
+    const int64_t width,
+    const int64_t kernel_height,
+    const int64_t kernel_width,
+    const int64_t pad_height,
+    const int64_t pad_width,
+    const int64_t stride_height,
+    const int64_t stride_width,
+    const int64_t dilation_height,
+    const int64_t dilation_width,
+    const int64_t height_col,
+    const int64_t width_col,
+    dt* data_col) {
+  CUDA_KERNEL_LOOP(index, n) {
+    int64_t w_out = index % width_col;
+
+    int64_t idx = index / width_col;
+
+    int64_t h_out = idx % height_col;
+    int64_t channel_in = idx / height_col;
+    int64_t channel_out = channel_in * kernel_height * kernel_width;
+    int64_t h_in = h_out * stride_height - pad_height;
+    int64_t w_in = w_out * stride_width - pad_width;
+
+    dt* col = data_col + (channel_out * height_col + h_out) * width_col + w_out;
+    const dt* im = data_im + (channel_in * height + h_in) * width + w_in;
+
+    for (int64_t i = 0; i < kernel_height; ++i) {
+      for (int64_t j = 0; j < kernel_width; ++j) {
+        int64_t h = h_in + i * dilation_height;
+        int64_t w = w_in + j * dilation_width;
+        *col = (h >= 0 && w >= 0 && h < height && w < width)
+            ? im[i * dilation_height * width + j * dilation_width]
+            : static_cast<dt>
(0); + col += height_col * width_col; + } + } + } +} + +template +void im2col( + cudaStream_t stream, + const dt* data_im, + const int64_t channels, + const int64_t height, + const int64_t width, + const int64_t height_col, + const int64_t width_col, + const int64_t kernel_height, + const int64_t kernel_width, + const int64_t pad_height, + const int64_t pad_width, + const int64_t stride_height, + const int64_t stride_width, + const int64_t dilation_height, + const int64_t dilation_width, + dt* data_col) { + // We are going to launch channels * height_col * width_col kernels, each + // kernel responsible for copying a single-channel grid. + int64_t num_kernels = channels * height_col * width_col; + // Launch CUDA_NUM_THREADS = 1024 + im2col_kernel<<>>( + num_kernels, + data_im, + height, + width, + kernel_height, + kernel_width, + pad_height, + pad_width, + stride_height, + stride_width, + dilation_height, + dilation_width, + height_col, + width_col, + data_col); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +template +__forceinline__ __device__ void col2im_device( + const int64_t index, + const dt* data_col, + const int64_t height, + const int64_t width, + const int64_t channels, + const int64_t kernel_h, + const int64_t kernel_w, + const int64_t pad_height, + const int64_t pad_width, + const int64_t stride_height, + const int64_t stride_width, + const int64_t dilation_height, + const int64_t dilation_width, + const int64_t height_col, + const int64_t width_col, + dt* data_im) { + accT val = static_cast(0); + const int64_t w_im = index % width + pad_width; + const int64_t h_im = (index / width) % height + pad_height; + const int64_t c_im = index / (width * height); + int64_t kernel_extent_w = (kernel_w - 1) * dilation_width + 1; + int64_t kernel_extent_h = (kernel_h - 1) * dilation_height + 1; + // compute the start and end of the output + const int64_t w_col_start = (w_im < kernel_extent_w) + ? 0 + : (w_im - kernel_extent_w) / stride_width + 1; + const int64_t w_col_end = ::min(w_im / stride_width + 1, width_col); + const int64_t h_col_start = (h_im < kernel_extent_h) + ? 0 + : (h_im - kernel_extent_h) / stride_height + 1; + const int64_t h_col_end = ::min(h_im / stride_height + 1, height_col); + + // TODO: use LCM of stride and dilation to avoid unnecessary loops + for (int64_t h_col = h_col_start; h_col < h_col_end; h_col += 1) { + for (int64_t w_col = w_col_start; w_col < w_col_end; w_col += 1) { + int64_t h_k = (h_im - h_col * stride_height); + int64_t w_k = (w_im - w_col * stride_width); + if (h_k % dilation_height == 0 && w_k % dilation_width == 0) { + h_k /= dilation_height; + w_k /= dilation_width; + int64_t data_col_index = + (((c_im * kernel_h + h_k) * kernel_w + w_k) * height_col + + h_col) * + width_col + + w_col; + val += data_col[data_col_index]; + } + } + } + data_im[index] = static_cast
(val); +} + +template +C10_LAUNCH_BOUNDS_1(512) +__global__ void col2im_kernel( + const int64_t n, + const dt* data_col, + const int64_t height, + const int64_t width, + const int64_t channels, + const int64_t kernel_h, + const int64_t kernel_w, + const int64_t pad_height, + const int64_t pad_width, + const int64_t stride_height, + const int64_t stride_width, + const int64_t dilation_height, + const int64_t dilation_width, + const int64_t height_col, + const int64_t width_col, + dt* data_im) { + CUDA_KERNEL_LOOP(index, n) { + col2im_device( + index, + data_col, + height, + width, + channels, + kernel_h, + kernel_w, + pad_height, + pad_width, + stride_height, + stride_width, + dilation_height, + dilation_width, + height_col, + width_col, + data_im); + } +} + +template +void col2im( + cudaStream_t stream, + const dt* data_col, + const int64_t channels, + const int64_t height, + const int64_t width, + const int64_t height_col, + const int64_t width_col, + const int64_t patch_height, + const int64_t patch_width, + const int64_t pad_height, + const int64_t pad_width, + const int64_t stride_height, + const int64_t stride_width, + const int64_t dilation_height, + const int64_t dilation_width, + dt* data_im) { + int64_t num_kernels = channels * height * width; + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. + // CUDA_NUM_THREADS = 1024 + col2im_kernel + <<>>( + num_kernels, + data_col, + height, + width, + channels, + patch_height, + patch_width, + pad_height, + pad_width, + stride_height, + stride_width, + dilation_height, + dilation_width, + height_col, + width_col, + data_im); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +template +C10_LAUNCH_BOUNDS_1(512) +__global__ void col2im_batched_kernel( + const int64_t n, + const dt* data_col, + const int64_t col_batch_stride, + const int64_t nbatch, + const int64_t height, + const int64_t width, + const int64_t channels, + const int64_t kernel_h, + const int64_t kernel_w, + const int64_t pad_height, + const int64_t pad_width, + const int64_t stride_height, + const int64_t stride_width, + const int64_t dilation_height, + const int64_t dilation_width, + const int64_t height_col, + const int64_t width_col, + dt* data_im, + const int64_t im_batch_stride) { + using accT = at::acc_type; + const auto im_numel = n * nbatch; + + CUDA_KERNEL_LOOP_TYPE(index, im_numel, int64_t) { + const auto ibatch = index / n; + const auto slice_index = index % n; + + col2im_device( + slice_index, + data_col + ibatch * col_batch_stride, + height, + width, + channels, + kernel_h, + kernel_w, + pad_height, + pad_width, + stride_height, + stride_width, + dilation_height, + dilation_width, + height_col, + width_col, + data_im + ibatch * im_batch_stride); + } +} + +template +void col2im_batched( + cudaStream_t stream, + const dt* data_col, + const int64_t col_batch_stride, + const int64_t nbatch, + const int64_t channels, + const int64_t height, + const int64_t width, + const int64_t height_col, + const int64_t width_col, + const int64_t patch_height, + const int64_t patch_width, + const int64_t pad_height, + const int64_t pad_width, + const int64_t stride_height, + const int64_t stride_width, + const int64_t dilation_height, + const int64_t dilation_width, + dt* data_im, + const int64_t im_batch_stride) { + const int64_t num_kernels = channels * height * width; + const int64_t output_numel = nbatch * num_kernels; + if (output_numel == 0) { + return; // No work to do + } + + // To avoid 
involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. + // CUDA_NUM_THREADS = 1024 + col2im_batched_kernel<<>>( + num_kernels, + data_col, + col_batch_stride, + nbatch, + height, + width, + channels, + patch_height, + patch_width, + pad_height, + pad_width, + stride_height, + stride_width, + dilation_height, + dilation_width, + height_col, + width_col, + data_im, + im_batch_stride); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/jit_utils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/jit_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..38af0cc125d5f191ea7d6321853198a3ac79d11f --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/jit_utils.h @@ -0,0 +1,215 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +namespace at { namespace cuda { namespace jit { + +enum class BinaryFuncVariant {NoScalar, RhsScalar, LhsScalar}; + +struct NvrtcFunction { + CUmodule module = CUmodule(); + CUfunction function = nullptr; +}; + +struct KernelDescriptor { + std::string name; + std::string f; + c10::ScalarType f_inputs_type; + c10::ScalarType result_type; + c10::SmallVector extra_args_types; + int nInputs, nOutputs; +}; + +// Helper function to return a vector +// corresponding to the type of the arguments in parameter pack. +template +c10::SmallVector get_extra_args_types() { + return {c10::CppTypeToScalarType::value ...}; +} + +template < + typename result_type, + typename f_inputs_type, + typename... ExtraArgs> +KernelDescriptor make_kernel_descriptor( + std::string name, + std::string f, + int nInputs, + int nOutputs) { + KernelDescriptor ret; + ret.name = std::move(name); + ret.f = std::move(f); + ret.f_inputs_type = c10::CppTypeToScalarType::value; + ret.result_type = c10::CppTypeToScalarType::value; + ret.extra_args_types = get_extra_args_types(); + ret.nInputs = nInputs; + ret.nOutputs = nOutputs; + return ret; +} + +inline int can_vectorize_up_to(size_t default_alignment, void *pointer) { + auto ip = reinterpret_cast(pointer); + if (ip % (4 * default_alignment) == 0) { + return 4; + } + if (ip % (2 * default_alignment) == 0) { + return 2; + } + return 1; +} + +inline int can_vectorize_up_to(const KernelDescriptor &desc, c10::ArrayRef pointers) { + TORCH_INTERNAL_ASSERT(desc.nOutputs == 1); + TORCH_INTERNAL_ASSERT(static_cast(pointers.size()) == 1 + desc.nInputs); + + // Deals with output + auto result_size = c10::scalarTypeToTypeMeta(desc.result_type).itemsize(); + int result = can_vectorize_up_to(result_size, pointers[0]); + + // Incorporates input(s) + auto input_size = c10::scalarTypeToTypeMeta(desc.f_inputs_type).itemsize(); + for (auto i : c10::irange(1, pointers.size())) { + result = std::min(result, can_vectorize_up_to(input_size, pointers[i])); + } + + return result; +} + +std::string generate_code( + int nInputs, + int nOutputs, + const std::string& func, + const std::string& name, + const std::string& f_input_type, + const std::string& compute_type, + const std::string& result_type, + bool contiguous, + bool dynamic_casting, + BinaryFuncVariant scalar_pos, + c10::SmallVector& extra_args_typenames, + bool vectorized=false, + int vec_size=0, + bool return_by_ref=false); + +std::string generate_code( + const KernelDescriptor &desc, + bool contiguous, + bool dynamic_casting, + BinaryFuncVariant scalar_pos, + bool 
vectorized=false, + int vec_size=0, + bool return_by_ref=false); + +std::string generate_reduction_code( + int nOutputs, + const std::string& func, + const std::string& name, + const int vt0, + const std::string& f_inputs_type, + const std::string& reduction_accum_type, + const std::string& result_type, + bool contiguous, + bool vectorized, + int vec_size, + int max_threads_codegen); + +std::string generate_reduction_code( + const KernelDescriptor &desc, + const int vt0, + bool contiguous, + bool vectorized, + int vec_size, + int max_threads_codegen); + +NvrtcFunction jit_pwise_function( + const std::string& code, + const std::string& kernel_name); + +void launch_jitted_pwise_function( + NvrtcFunction function, + void* args[], + const dim3 nBlocks, + const dim3 kBlockSize, + const int smem=0); + +template +struct delayed_false : std::false_type { +}; + +// Defines type names +// NOTE: General case is instantiated only for invalid types. +// All the valid types have specialization using the TYPE_NAME_FN +// macro below. +template +inline std::string typeName() { + // we can't use static_assert(false) directly as the + // program will be not compiled even if the template is not + // instantiated, so we use `delayed_false` + // to make sure compiler doesn't eagerly raise + // fail this assertion. + static_assert(delayed_false::value, "invalid type for jiterator"); + return "void"; +} + +#define TYPE_NAME_FN(ctype, name) \ +template <> inline std::string typeName(){ \ + return std::string(#ctype); \ +} + +AT_FORALL_SCALAR_TYPES(TYPE_NAME_FN) +#undef TYPE_NAME_FN +// JIT uses std::complex directly, because nvRTC compile programs +// with -default-device, so there is no such issue like: +// "std::sin(complex) is __host__ only" +template <> inline std::string typeName(){ + return "bool"; +} +template <> inline std::string typeName>(){ + return "std::complex"; +} +template <> inline std::string typeName>(){ + return "std::complex"; +} +template <> inline std::string typeName>(){ + return "std::complex"; +} +template <> inline std::string typeName(){ + return "at::Half"; +} +template <> inline std::string typeName(){ + return "at::BFloat16"; +} +template <> inline std::string typeName(){ + return "at::Float8_e5m2"; +} +template <> inline std::string typeName(){ + return "at::Float8_e4m3fn"; +} +template <> inline std::string typeName() { + return "at::Float8_e5m2fnuz"; +} +template <> inline std::string typeName() { + return "at::Float8_e4m3fnuz"; +} + +#define TYPE_NAME_CASE(ctype, scalartype) \ + case ScalarType::scalartype: return typeName(); +inline std::string typeName(ScalarType t) { + switch (t) { + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(TYPE_NAME_CASE) + default: + TORCH_CHECK(false, "invalid type for jiterator"); + } +} +#undef TYPE_NAME_CASE + +TORCH_CUDA_CPP_API void initializeCudaContext(); + +}}} // namespace at::cuda::jit diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/reduction_template.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/reduction_template.cuh new file mode 100644 index 0000000000000000000000000000000000000000..6350c44eab91827ac7a7fc1df75ca7f88ad44c7e --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/reduction_template.cuh @@ -0,0 +1,680 @@ +namespace at { +namespace cuda { +//windows doesn't like large string literals, so split in two +const std::string reduction_template_0 = R"ESCAPE( + #define C10_HOST_DEVICE __host__ __device__ + #define C10_DEVICE __device__ + #if defined(__clang__) && defined(__HIP__) + #ifndef 
__forceinline__ + #define __forceinline__ inline __attribute__((always_inline)) + #endif + // until ROCm support for kernel asserts is restored + #define assert(expr) (static_cast(0)) + #endif + + template + __device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff) + { + #if defined(__clang__) && defined(__HIP__) + return __shfl_down(value, delta, width); + #else + return __shfl_down_sync(mask, value, delta, width); + #endif + } + + + #if ${complex} + template + __device__ __forceinline__ std::complex WARP_SHFL_DOWN(std::complex value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff) + { + return std::complex( + #if defined(__clang__) && defined(__HIP__) + __shfl_down(value.real(), delta, width), + __shfl_down(value.imag(), delta, width)); + #else + __shfl_down_sync(mask, value.real(), delta, width), + __shfl_down_sync(mask, value.imag(), delta, width)); + #endif + } + #endif + + // aligned vector generates vectorized load/store on CUDA + template + struct alignas(sizeof(scalar_t) * vec_size) aligned_vector { + scalar_t val[vec_size]; + }; + + + C10_HOST_DEVICE static void reduce_fraction(size_t &numerator, size_t &denominator) { + // get GCD of num and denom using Euclid's algorithm. + // Can replace this with std::gcd if we ever support c++17. + size_t a = denominator; + size_t b = numerator; + while (b != 0) { + a %= b; + // swap(a,b) + size_t tmp = a; + a = b; + b = tmp; + } + + // a is now the GCD + numerator /= a; + denominator /= a; + } + + + + + struct ReduceConfig { + //has to match host-side ReduceConfig in the eager code + static constexpr int BLOCK_X = 0; + static constexpr int BLOCK_Y = 1; + static constexpr int CTA = 2; + + static constexpr int input_vec_size = 4; + int element_size_bytes; + int num_inputs; + int num_outputs; + int step_input = 1; + int step_output = 1; + int ctas_per_output = 1; + int input_mult[3] = {0, 0, 0}; + int output_mult[2] = {0, 0}; + + int block_width; + int block_height; + int num_threads; + + bool vectorize_input = false; + int output_vec_size = 1; + + C10_HOST_DEVICE bool should_block_x_reduce() const { + return input_mult[BLOCK_X] != 0; + } + + C10_HOST_DEVICE bool should_block_y_reduce() const { + return input_mult[BLOCK_Y] != 0; + } + + C10_HOST_DEVICE bool should_global_reduce() const { + return input_mult[CTA] != 0; + } + + C10_DEVICE bool should_store(int output_idx) const { + return output_idx < num_outputs && + (!should_block_x_reduce() || threadIdx.x == 0) && + (!should_block_y_reduce() || threadIdx.y == 0); + } + + C10_DEVICE bool should_reduce_tail() const { + return (!should_block_y_reduce() || threadIdx.y == 0) && + (!should_global_reduce() || blockIdx.y == 0); + } + + C10_HOST_DEVICE int input_idx() const { + int lane = threadIdx.x; + int warp = threadIdx.y; + int cta2 = blockIdx.y; + return (lane * input_mult[BLOCK_X] + + warp * input_mult[BLOCK_Y] + + cta2 * input_mult[CTA]); + } + + template + C10_HOST_DEVICE int output_idx() const { + int lane = threadIdx.x; + int warp = threadIdx.y; + int cta1 = blockIdx.x; + return (lane * output_mult[BLOCK_X] + + warp * output_mult[BLOCK_Y] + + cta1 * step_output) * output_vec_size; + } + + C10_DEVICE int shared_memory_offset(int offset) const { + return threadIdx.x + (threadIdx.y + offset) * blockDim.x; + } + + C10_DEVICE int staging_memory_offset(int cta2) const { + int offset = cta2 + blockIdx.x * gridDim.y; + if (!should_block_x_reduce()) { + offset = threadIdx.x + offset * blockDim.x; + } + 
return offset; + } + + + }; + + +//TODO this will need to be different for more generic reduction functions +namespace reducer { + + using scalar_t = ${scalar_type}; + using arg_t = ${reduction_accum_type}; + using out_scalar_t = ${result_type}; + + + inline __device__ ${functor} + + inline __device__ out_scalar_t project(arg_t arg) { + return (out_scalar_t) arg; + } + + inline __device__ arg_t warp_shfl_down(arg_t arg, int offset) { + return WARP_SHFL_DOWN(arg, offset); + } + + inline __device__ arg_t translate_idx(arg_t acc, int64_t /*idx*/) { + return acc; + } + + // wrap a normal reduction that ignores the index + inline __device__ arg_t reduce(arg_t acc, arg_t val, int64_t idx) { + return combine(acc, val); + } +} + + +struct ReduceJitOp { + using scalar_t = ${scalar_type}; + using arg_t = ${reduction_accum_type}; + using out_scalar_t = ${result_type}; + + using InputCalculator = OffsetCalculator<1>; + using OutputCalculator = OffsetCalculator<2>; + +// static constexpr bool can_accumulate_in_output = +// std::is_convertible::value +// && std::is_convertible::value; + + static constexpr int input_vec_size = ReduceConfig::input_vec_size; + + arg_t ident; + ReduceConfig config; + InputCalculator input_calc; + OutputCalculator output_calc; + const void* src; + const char* dst[2]; //it accepts at most two destinations + // acc_buf used for accumulation among sub Tensor Iterator when accumulation on + // output is not permissible + void* acc_buf; + // cta_buf used for accumulation between blocks during global reduction + void* cta_buf; + int* semaphores; + int64_t base_idx; + bool accumulate; + bool final_output; + int noutputs; + + + C10_DEVICE void run() const { + extern __shared__ char shared_memory[]; + uint32_t output_idx = config.output_idx<${output_vec_size}>(); + uint32_t input_idx = config.input_idx(); + auto base_offsets1 = output_calc.get(output_idx)[1]; + + using arg_vec_t = Array; + arg_vec_t value; + + if (output_idx < config.num_outputs && input_idx < config.num_inputs) { + const scalar_t* input_slice = (const scalar_t*)((const char*)src + base_offsets1); + + value = thread_reduce<${output_vec_size}>(input_slice); + } + + if (config.should_block_y_reduce()) { + value = block_y_reduce<${output_vec_size}>(value, shared_memory); + } + if (config.should_block_x_reduce()) { + value = block_x_reduce<${output_vec_size}>(value, shared_memory); + } + + using out_ptr_vec_t = Array; + using offset_vec_t = Array; + offset_vec_t base_offsets; + out_ptr_vec_t out; + + #pragma unroll + for (int i = 0; i < ${output_vec_size}; i++) { + base_offsets[i] = output_calc.get(output_idx + i)[0]; + out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]); + } + + arg_vec_t* acc = nullptr; + if (acc_buf != nullptr) { + size_t numerator = sizeof(arg_t); + size_t denominator = sizeof(out_scalar_t); + reduce_fraction(numerator, denominator); + acc = (arg_vec_t*)((char*)acc_buf + (base_offsets[0] * numerator / denominator)); + } + + if (config.should_global_reduce()) { + value = global_reduce<${output_vec_size}>(value, acc, shared_memory); + } else if (config.should_store(output_idx)) { + if (accumulate) { + #pragma unroll + for (int i = 0; i < ${output_vec_size}; i++) { + value[i] = reducer::translate_idx(value[i], base_idx); + } + } + + if (acc == nullptr) { + if (accumulate) { + value = accumulate_in_output<${output_vec_size}>(out, value); + } + if (final_output) { + set_results_to_output<${output_vec_size}>(value, base_offsets); + } else { + #pragma unroll + for (int i = 0; i < ${output_vec_size}; 
i++) { + *(out[i]) = get_accumulated_output(out[i], value[i]); + } + } + } else { + if (accumulate) { + #pragma unroll + for (int i = 0; i < ${output_vec_size}; i++) { + value[i] = reducer::combine((*acc)[i], value[i]); + } + } + if (final_output) { + set_results_to_output<${output_vec_size}>(value, base_offsets); + } else { + *acc = value; + } + } + } + } + + template + C10_DEVICE Array thread_reduce(const scalar_t* data) const { + if (config.vectorize_input) { + assert(output_vec_size == 1); + // reduce at the header of input_slice where memory is not aligned, + // so that thread_reduce will have an aligned memory to work on. + return {input_vectorized_thread_reduce_impl(data)}; + } else { + uint32_t element_stride = input_calc.strides_[0][0] / sizeof(scalar_t); + bool is_contiguous = (input_calc.dims == 1 && element_stride == 1); + if (is_contiguous) { + return thread_reduce_impl(data, [](uint32_t idx) { return idx; }); + } else if (input_calc.dims == 1) { + return thread_reduce_impl(data, [&](uint32_t idx) { return idx * element_stride; }); + } else { + return thread_reduce_impl(data, [&](uint32_t idx) { return input_calc.get(idx)[0] / sizeof(scalar_t); }); + } + } + } + + C10_DEVICE arg_t input_vectorized_thread_reduce_impl(const scalar_t* data) const { + uint32_t end = config.num_inputs; + + // Handle the head of input slice where data is not aligned + arg_t value = ident; + constexpr int align_bytes = alignof(aligned_vector); + constexpr int align_elements = align_bytes / sizeof(scalar_t); + int shift = ((int64_t)data) % align_bytes / sizeof(scalar_t); + if (shift > 0) { + data -= shift; + end += shift; + if(threadIdx.x >= shift && threadIdx.x < align_elements && config.should_reduce_tail()){ + value = reducer::reduce(value, data[threadIdx.x], threadIdx.x - shift); + } + end -= align_elements; + data += align_elements; + shift = align_elements - shift; + } + + // Do the vectorized reduction + using load_t = aligned_vector; + + uint32_t idx = config.input_idx(); + const uint32_t stride = config.step_input; + + // Multiple accumulators to remove dependency between unrolled loops. + arg_t value_list[input_vec_size]; + value_list[0] = value; + + #pragma unroll + for (int i = 1; i < input_vec_size; i++) { + value_list[i] = ident; + } + + scalar_t values[input_vec_size]; + + load_t *values_vector = reinterpret_cast(&values[0]); + + while (idx * input_vec_size + input_vec_size - 1 < end) { + *values_vector = reinterpret_cast(data)[idx]; + #pragma unroll + for (uint32_t i = 0; i < input_vec_size; i++) { + value_list[i] = reducer::reduce(value_list[i], values[i], shift + idx * input_vec_size + i); + } + idx += stride; + } + + // tail + uint32_t tail_start = end - end % input_vec_size; + if (config.should_reduce_tail()) { + int idx = tail_start + threadIdx.x; + if (idx < end) { + value_list[0] = reducer::reduce(value_list[0], data[idx], idx + shift); + } + } + + // combine accumulators + #pragma unroll + for (int i = 1; i < input_vec_size; i++) { + value_list[0] = reducer::combine(value_list[0], value_list[i]); + } + return value_list[0]; + } + + template + C10_DEVICE Array thread_reduce_impl(const scalar_t* data_, offset_calc_t calc) const { + uint32_t idx = config.input_idx(); + const uint32_t end = config.num_inputs; + const uint32_t stride = config.step_input; + const int vt0=${vt0}; + + using arg_vec_t = Array; + using load_t = aligned_vector; + const load_t* data = reinterpret_cast(data_); + + // Multiple accumulators to remove dependency between unrolled loops. 
+ arg_vec_t value_list[vt0]; + + #pragma unroll + for (int i = 0; i < vt0; i++) { + #pragma unroll + for (int j = 0; j < output_vec_size; j++) { + value_list[i][j] = ident; + } + } + + load_t values[vt0]; + + while (idx + (vt0 - 1) * stride < end) { + #pragma unroll + for (uint32_t i = 0; i < vt0; i++) { + values[i] = data[calc(idx + i * stride) / output_vec_size]; + } + #pragma unroll + for (uint32_t i = 0; i < vt0; i++) { + #pragma unroll + for (uint32_t j = 0; j < output_vec_size; j++) { + value_list[i][j] = reducer::reduce(value_list[i][j], values[i].val[j], idx + i * stride); + } + } + idx += stride * vt0; + } + + // tail + int idx_ = idx; + #pragma unroll + for (uint32_t i = 0; i < vt0; i++) { + if (idx >= end) { + break; + } + values[i] = data[calc(idx) / output_vec_size]; + idx += stride; + } + idx = idx_; + #pragma unroll + for (uint32_t i = 0; i < vt0; i++) { + if (idx >= end) { + break; + } + #pragma unroll + for (uint32_t j = 0; j < output_vec_size; j++) { + value_list[i][j] = reducer::reduce(value_list[i][j], values[i].val[j], idx); + } + idx += stride; + } + + // combine accumulators + #pragma unroll + for (int i = 1; i < vt0; i++) { + #pragma unroll + for (uint32_t j = 0; j < output_vec_size; j++) { + value_list[0][j] = reducer::combine(value_list[0][j], value_list[i][j]); + } + } + return value_list[0]; + } + template + C10_DEVICE Array block_x_reduce(Array value, char* shared_memory) const { + using args_vec_t = Array; + int dim_x = blockDim.x; + args_vec_t* shared = (args_vec_t*)shared_memory; + if (dim_x > warpSize) { + int address_base = threadIdx.x + threadIdx.y*blockDim.x; + shared[address_base] = value; + for (int offset = dim_x/2; offset >= warpSize; offset >>= 1) { + __syncthreads(); + if (threadIdx.x < offset && threadIdx.x + offset < blockDim.x) { + args_vec_t other = shared[address_base + offset]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine(value[i], other[i]); + } + shared[address_base] = value; + } + } + dim_x = warpSize; + } + + __syncthreads(); + + for (int offset = 1; offset < dim_x; offset <<= 1) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + arg_t other = reducer::warp_shfl_down(value[i], offset); + value[i] = reducer::combine(value[i], other); + } + } + return value; + } + + template + C10_DEVICE Array block_y_reduce(Array value, char* shared_memory) const { + using args_vec_t = Array; + args_vec_t* shared = (args_vec_t*)shared_memory; + shared[config.shared_memory_offset(0)] = value; + for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) { + __syncthreads(); + if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) { + args_vec_t other = shared[config.shared_memory_offset(offset)]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine(value[i], other[i]); + } + shared[config.shared_memory_offset(0)] = value; + } + } + return value; + } + )ESCAPE"; + + const std::string reduction_template_1 = R"ESCAPE( + + C10_DEVICE bool mark_block_finished() const { + __shared__ bool is_last_block_done_shared; + + __syncthreads(); + if (threadIdx.x == 0 && threadIdx.y == 0) { + int prev_blocks_finished = atomicAdd(&semaphores[blockIdx.x], 1); + is_last_block_done_shared = (prev_blocks_finished == gridDim.y - 1); + } + + __syncthreads(); + + return is_last_block_done_shared; + } + + template + C10_DEVICE Array accumulate_in_output( + Array out, + Array value + ) const { + Array ret; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + 
ret[i] = reducer::combine(*(out[i]), value[i]); + } + return ret; + } + + + C10_DEVICE out_scalar_t get_accumulated_output( + out_scalar_t* out, arg_t value + ) const { + assert(!final_output); + return (out_scalar_t)value; + } + + template + C10_DEVICE void set_results(const T x, const uint32_t base_offset) const { + assert(noutputs == 1); + auto res = (out_scalar_t*)((char*)dst[0] + base_offset); + *res = x; + } + +//TODO - multi-output reduction - we won't be able to use thrust::pair +//just explicitly specify typed output reads/writes +//Currently implemented for max of two outputs +// template +// C10_DEVICE void set_results(const thrust::pair x, const index_t base_offset) const { +// if (noutputs >= 1) { +// auto res0 = (T1*)((char*)dst[0] + base_offset); +// *res0 = x.first; +// } +// if (noutputs >= 2) { +// // base offset is computed assuming element size being sizeof(T1), so we need to make a +// // correction to obtain the correct base offset +// auto res1 = (T2*) ((char *) dst[1] + base_offset / sizeof(T1) * sizeof(T2)); +// *res1 = x.second; +// } +// } + + template + C10_DEVICE void set_results_to_output(Array value, Array base_offset) const { + assert(final_output); + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + set_results(reducer::project(value[i]), base_offset[i]); + } + } + + template + C10_DEVICE Array global_reduce(Array value, Array *acc, char* shared_memory) const { + using arg_vec_t = Array; + using out_ptr_vec_t = Array; + using offset_vec_t = Array; + + arg_vec_t* reduce_buffer = (arg_vec_t*)cta_buf; + uint32_t output_idx = config.output_idx(); + offset_vec_t base_offsets; + out_ptr_vec_t out; + + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + base_offsets[i] = output_calc.get(output_idx + i)[0]; + out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]); + } + + bool should_store = config.should_store(output_idx); + if (should_store) { + uint32_t offset = config.staging_memory_offset(blockIdx.y); + reduce_buffer[offset] = value; + } + + __threadfence(); // make sure writes are globally visible + __syncthreads(); // if multiple warps in this block wrote to staging, make sure they're all done + bool is_last_block_done = mark_block_finished(); + + if (is_last_block_done) { + value = ident; + if (config.should_block_x_reduce()) { + uint32_t input_offset = threadIdx.x + threadIdx.y * blockDim.x; + uint32_t step = blockDim.x * blockDim.y; + for (; input_offset < config.ctas_per_output; input_offset += step) { + uint32_t idx = config.staging_memory_offset(input_offset); + arg_vec_t next = reduce_buffer[idx]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine(value[i], next[i]); + } + } + } else { + uint32_t input_offset = threadIdx.y; + uint32_t step = blockDim.y; + for (; input_offset < config.ctas_per_output; input_offset += step) { + uint32_t idx = config.staging_memory_offset(input_offset); + arg_vec_t next = reduce_buffer[idx]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine(value[i], next[i]); + } + } + } + value = block_y_reduce(value, shared_memory); + if (config.should_block_x_reduce()) { + value = block_x_reduce(value, shared_memory); + } + if (should_store) { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::translate_idx(value[i], base_idx); + } + } + + if (acc == nullptr) { + if (accumulate) { + value = accumulate_in_output(out, value); + } + if (final_output) { + 
set_results_to_output(value, base_offsets); + } else { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + *(out[i]) = get_accumulated_output(out[i], value[i]); + } + } + } else { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine((*acc)[i], value[i]); + } + } + if (final_output) { + set_results_to_output(value, base_offsets); + } else { + *acc = value; + } + } + } + } + + return value; + } +}; + +extern "C" +__launch_bounds__(${max_threads_lb}, 4) +__global__ void reduction_${name}_kernel(ReduceJitOp r){ + r.run(); +} +)ESCAPE"; + +const std::string reduction_template = reduction_template_0 + reduction_template_1; + + +const std::string &get_reduction_template() { + return reduction_template; +} + +}} diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/thread_constants.h b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/thread_constants.h new file mode 100644 index 0000000000000000000000000000000000000000..6c8e524a0467ad1034c069d2b69a58dab92d7d68 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/thread_constants.h @@ -0,0 +1,22 @@ +#pragma once +#include + +// Marks a lambda as executable on both the host and device. The __host__ +// attribute is important so that we can access static type information from +// the host, even if the function is typically only executed on the device. +#ifndef GPU_LAMBDA +#define GPU_LAMBDA __host__ __device__ +#endif + +#if defined(USE_ROCM) +constexpr int num_threads() { + return 256; +} +#else +constexpr uint32_t num_threads() { + return C10_WARP_SIZE * 4; +} +#endif + +constexpr int thread_work_size() { return 4; } +constexpr int block_work_size() { return thread_work_size() * num_threads(); } diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/vol2col.cuh b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/vol2col.cuh new file mode 100644 index 0000000000000000000000000000000000000000..285fd470563d81562743808efdd3a39300e4264c --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/cuda/vol2col.cuh @@ -0,0 +1,263 @@ +#pragma once + +#include +#include +#include +#include + +#include + +namespace at { +namespace native { + +using namespace at::cuda::detail; + +// Kernel for fast unfold+copy on volumes +template +__global__ void vol2col_kernel( + const int64_t n, + const T* data_vol, + const int depth, + const int height, + const int width, + const int ksize_t, + const int ksize_h, + const int ksize_w, + const int pad_t, + const int pad_h, + const int pad_w, + const int stride_t, + const int stride_h, + const int stride_w, + const int dilation_t, + const int dilation_h, + const int dilation_w, + const int depth_col, + const int height_col, + const int width_col, + T* data_col) { + CUDA_KERNEL_LOOP(index, n) { + auto w_out = index % width_col; + index /= width_col; + auto h_out = index % height_col; + index /= height_col; + auto t_out = index % depth_col; + auto channel_in = index / depth_col; + auto channel_out = channel_in * ksize_t * ksize_h * ksize_w; + auto t_in = t_out * stride_t - pad_t; + auto h_in = h_out * stride_h - pad_h; + auto w_in = w_out * stride_w - pad_w; + data_col += + ((channel_out * depth_col + t_out) * height_col + h_out) * width_col + + w_out; + data_vol += ((channel_in * depth + t_in) * height + h_in) * width + w_in; + for (int i = 0; i < ksize_t; ++i) { + for (int j = 0; j < ksize_h; ++j) { + for (int k = 0; k < ksize_w; ++k) { + auto t = t_in + i * dilation_t; + auto h = h_in + j * 
dilation_h; + auto w = w_in + k * dilation_w; + *data_col = (t >= 0 && h >= 0 && w >= 0 && t < depth && h < height && + w < width) + ? data_vol + [i * dilation_t * height * width + j * dilation_h * width + + k * dilation_w] + : static_cast(0); + data_col += depth_col * height_col * width_col; + } + } + } + } +} + +template +void vol2col( + cudaStream_t stream, + const T* data_vol, + const int channels, + const int depth, + const int height, + const int width, + const int depth_col, + const int height_col, + const int width_col, + const int ksize_t, + const int ksize_h, + const int ksize_w, + const int pad_t, + const int pad_h, + const int pad_w, + const int stride_t, + const int stride_h, + const int stride_w, + const int dilation_t, + const int dilation_h, + const int dilation_w, + T* data_col) { + // We are going to launch channels * depth_col * height_col * width_col + // kernels, each kernel responsible for copying a single-channel grid. + // We cast an operand to int64 so that the product will not overflow + const auto num_kernels = static_cast(channels) * depth_col * height_col * width_col; + // Launch + vol2col_kernel<<>>( + num_kernels, + data_vol, + depth, + height, + width, + ksize_t, + ksize_h, + ksize_w, + pad_t, + pad_h, + pad_w, + stride_t, + stride_h, + stride_w, + dilation_t, + dilation_h, + dilation_w, + depth_col, + height_col, + width_col, + data_col); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +template +__global__ void vol2im_kernel( + const int64_t n, + const T* data_col, + const unsigned depth, + const unsigned height, + const unsigned width, + const unsigned channels, + const unsigned kernel_t, + const unsigned kernel_h, + const unsigned kernel_w, + const unsigned pad_t, + const unsigned pad_h, + const unsigned pad_w, + const unsigned stride_t, + const unsigned stride_h, + const unsigned stride_w, + const unsigned dilation_t, + const unsigned dilation_h, + const unsigned dilation_w, + const unsigned depth_col, + const unsigned height_col, + const unsigned width_col, + T* data_vol) { + CUDA_KERNEL_LOOP(index, n) { + accT val = static_cast(0); + const auto w_im = index % width + pad_w; + const auto h_im = (index / width) % height + pad_h; + const auto t_im = (index / width / height) % depth + pad_t; + const auto c_im = index / (width * height * depth); + auto kernel_extent_w = (kernel_w - 1) * dilation_w + 1; + auto kernel_extent_h = (kernel_h - 1) * dilation_h + 1; + auto kernel_extent_t = (kernel_t - 1) * dilation_t + 1; + // compute the start and end of the output + const auto w_col_start = + (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1; + const auto w_col_end = std::min(w_im / stride_w + 1, width_col); + const auto h_col_start = + (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1; + const auto h_col_end = std::min(h_im / stride_h + 1, height_col); + const auto t_col_start = + (t_im < kernel_extent_t) ? 
0 : (t_im - kernel_extent_t) / stride_t + 1; + const auto t_col_end = std::min(t_im / stride_t + 1, depth_col); + // TODO: use LCM of stride and dilation to avoid unnecessary loops + for (unsigned t_col = t_col_start; t_col < t_col_end; t_col += 1) { + for (unsigned h_col = h_col_start; h_col < h_col_end; h_col += 1) { + for (unsigned w_col = w_col_start; w_col < w_col_end; w_col += 1) { + uint64_t t_k = (t_im - t_col * stride_t); + uint64_t h_k = (h_im - h_col * stride_h); + uint64_t w_k = (w_im - w_col * stride_w); + if (t_k % dilation_t == 0 && h_k % dilation_h == 0 && + w_k % dilation_w == 0) { + t_k /= dilation_t; + h_k /= dilation_h; + w_k /= dilation_w; + const int64_t idx_k = + ((c_im * kernel_t + t_k) * kernel_h + h_k) * kernel_w + w_k; + const int64_t data_col_index = + ((idx_k * depth_col + t_col) * + height_col + h_col) * + width_col + w_col; + val += data_col[data_col_index]; + } + } + } + } + data_vol[index] = static_cast(val); + } +} + +template +void col2vol( + cudaStream_t stream, + const T* data_col, + const int64_t channels, + const int64_t depth, + const int64_t height, + const int64_t width, + const int64_t output_depth, + const int64_t output_height, + const int64_t output_width, + const int64_t patch_t, + const int64_t patch_h, + const int64_t patch_w, + const int64_t pad_t, + const int64_t pad_h, + const int64_t pad_w, + const int64_t stride_t, + const int64_t stride_h, + const int64_t stride_w, + const int64_t dilation_t, + const int64_t dilation_h, + const int64_t dilation_w, + T* data_vol) { + const auto num_kernels = channels * depth * height * width; + + auto check_fits_in_unsigned = + [](int64_t val, const char * name) { + constexpr auto umax = std::numeric_limits::max(); + TORCH_CHECK(val >= 0 && val <= umax, + name, " must fit in a 32-bit unsigned value"); + }; + check_fits_in_unsigned(num_kernels, "input size"); + check_fits_in_unsigned( + channels * patch_t * patch_h * patch_w, "channels x kernel size"); + + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. 
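+  // Note on the gather formulation used here: vol2im_kernel parallelizes over
+  // output voxels rather than over column entries. Each thread accumulates, in
+  // a local register, every data_col entry whose sliding block covers its
+  // voxel and writes the sum back exactly once, so no atomicAdd is required.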
+ vol2im_kernel + <<>>( + num_kernels, + data_col, + depth, + height, + width, + channels, + patch_t, + patch_h, + patch_w, + pad_t, + pad_h, + pad_w, + stride_t, + stride_h, + stride_w, + dilation_t, + dilation_h, + dilation_w, + output_depth, + output_height, + output_width, + data_vol); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/group_norm.h b/MLPY/Lib/site-packages/torch/include/ATen/native/group_norm.h new file mode 100644 index 0000000000000000000000000000000000000000..2747015d02fb20e2407719885867250cb9b4cfb1 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/group_norm.h @@ -0,0 +1,42 @@ +#pragma once + +#include +#include + +namespace at { +class Tensor; + +namespace native { + +using forward_fn = void (*)( + const Tensor& /* X */, + const Tensor& /* gamma */, + const Tensor& /* beta */, + int64_t /* N */, + int64_t /* C */, + int64_t /* HxW */, + int64_t /* group */, + double /* eps */, + Tensor& /* Y */, + Tensor& /* mean */, + Tensor& /* rstd */); + +using backward_fn = void (*)( + const Tensor& /* dY */, + const Tensor& /* X */, + const Tensor& /* mean */, + const Tensor& /* rstd */, + const Tensor& /* gamma */, + int64_t /* N */, + int64_t /* C */, + int64_t /* HxW */, + int64_t /* group */, + Tensor& /* dX */, + Tensor& /* dgamma */, + Tensor& /* dbeta */); + +DECLARE_DISPATCH(forward_fn, GroupNormKernel); +DECLARE_DISPATCH(backward_fn, GroupNormBackwardKernel); + +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/im2col.h b/MLPY/Lib/site-packages/torch/include/ATen/native/im2col.h new file mode 100644 index 0000000000000000000000000000000000000000..c9093c4ae116d1135af196d943a065942527e684 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/im2col.h @@ -0,0 +1,149 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +namespace at::native { + +template +static void im2col( + const T* data_im, + const int64_t channels, + const int64_t height, + const int64_t width, + const int64_t output_height, + const int64_t output_width, + const int64_t kernel_h, + const int64_t kernel_w, + const int64_t pad_h, + const int64_t pad_w, + const int64_t stride_h, + const int64_t stride_w, + const int64_t dilation_h, + const int64_t dilation_w, + T* data_col, + bool is_channels_last = false) { + const int64_t height_col = output_height; + const int64_t width_col = output_width; + const int64_t channels_col = channels * kernel_h * kernel_w; + + if (is_channels_last) { + at::parallel_for(0, height_col * width_col, 0, [&](int64_t begin, int64_t end) { + int64_t h_col{0}, w_col{0}; + data_index_init(begin, h_col, height_col, w_col, width_col); + + for (const auto i_col : c10::irange(begin, end)) { + for (const auto h_offset : c10::irange(kernel_h)) { + int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + for (const auto w_offset : c10::irange(kernel_w)) { + int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; + + const T* slice_im = data_im + (h_im * width + w_im) * channels; + T* slice_col = data_col + (i_col * kernel_h * kernel_w + h_offset * kernel_w + w_offset) * channels; + + if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + std::copy_n(slice_im, channels, slice_col); + } else { + std::fill_n(slice_col, channels, T(0)); + } + } + } + + // move the next index + data_index_step(h_col, height_col, w_col, width_col); + } + }); + } 
else { + at::parallel_for(0, channels_col, 0, [&](int64_t begin, int64_t end) { + int64_t c_im{0}, h_offset{0}, w_offset{0}; + data_index_init(begin, c_im, channels, h_offset, kernel_h, w_offset, kernel_w); + + for (const auto c_col : c10::irange(begin, end)) { + for (const auto h_col : c10::irange(height_col)) { + int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + for (const auto w_col : c10::irange(width_col)) { + int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; + data_col[(c_col * height_col + h_col) * width_col + w_col] = + (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) + ? data_im[(c_im * height + h_im) * width + w_im] + : static_cast(0); + } + } + + // move to the next index + data_index_step(c_im, channels, h_offset, kernel_h, w_offset, kernel_w); + } + }); + } +} + +template +static void col2im( + const T* data_col, + const int64_t channels, + const int64_t height, + const int64_t width, + const int64_t output_height, + const int64_t output_width, + const int64_t kernel_h, + const int64_t kernel_w, + const int64_t pad_h, + const int64_t pad_w, + const int64_t stride_h, + const int64_t stride_w, + const int64_t dilation_h, + const int64_t dilation_w, + T* data_im, + bool is_channels_last = false) { + std::fill_n(data_im, height * width * channels, T(0)); + + const int64_t height_col = output_height; + const int64_t width_col = output_width; + const int64_t channels_col = channels * kernel_h * kernel_w; + + if (is_channels_last) { + for (const auto h_col : c10::irange(height_col)) { + for (const auto w_col : c10::irange(width_col)) { + for (const auto h_offset : c10::irange(kernel_h)) { + int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + for (const auto w_offset : c10::irange(kernel_w)) { + int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; + + T* slice_im = data_im + (h_im * width + w_im) * channels; + const T* slice_col = data_col + ((h_col * width_col + w_col) * kernel_h * kernel_w + + h_offset * kernel_w + w_offset) * channels; + + if (h_im >= 0 && h_im < height && w_im >= 0 && w_im < width) { + std::transform(slice_col, slice_col + channels, slice_im, slice_im, std::plus()); + } + } + } + } + } + } else { + for (const auto c_col : c10::irange(channels_col)) { + int64_t w_offset = c_col % kernel_w; + int64_t h_offset = (c_col / kernel_w) % kernel_h; + int64_t c_im = c_col / kernel_h / kernel_w; + + for (const auto h_col : c10::irange(height_col)) { + int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + for (const auto w_col : c10::irange(width_col)) { + int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; + + if (h_im >= 0 && h_im < height && w_im >= 0 && w_im < width) + data_im[(c_im * height + h_im) * width + w_im] += + data_col[(c_col * height_col + h_col) * width_col + w_col]; + } + } + } + } +} + +} // namespace at::native diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/im2col_shape_check.h b/MLPY/Lib/site-packages/torch/include/ATen/native/im2col_shape_check.h new file mode 100644 index 0000000000000000000000000000000000000000..9fa2afa27cfd00c809de4852811458fc175d3e33 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/im2col_shape_check.h @@ -0,0 +1,232 @@ +#pragma once +#include +#include +#include + +namespace at::native { + +static inline void col2im_shape_check( + const Tensor& input, + const Tensor& grad_output, + int64_t output_height, + int64_t output_width, + int64_t kernel_height, + int64_t kernel_width, + int64_t dilation_height, + 
int64_t dilation_width, + int64_t pad_height, + int64_t pad_width, + int64_t stride_height, + int64_t stride_width) { + TORCH_CHECK( + kernel_width > 0 && kernel_height > 0, + "kernel size should be greater than zero, but got kernel_height: ", + kernel_height, + " kernel_width: ", + kernel_width); + TORCH_CHECK( + stride_width > 0 && stride_height > 0, + "stride should be greater than zero, but got stride_height: ", + stride_height, + " stride_width: ", + stride_width); + TORCH_CHECK( + dilation_width > 0 && dilation_height > 0, + "dilation should be greater than zero, but got dilation_height: ", + dilation_height, + " dilation_width: ", + dilation_width); + TORCH_CHECK( + pad_width >= 0 && pad_height >= 0, + "padding should be non-negative, but got pad_height: ", + pad_height, + " pad_width: ", + pad_width); + + + int64_t ndim = input.ndimension(); + // allow dim=0 only the batch dimension. + TORCH_CHECK( + (ndim == 2 && input.size(0) != 0 && input.size(1) != 0) || + (ndim == 3 && input.size(1) != 0 && input.size(2) != 0), + "Expected 2D or 3D (batch mode) tensor for input with possibly 0 batch size and non-zero dimensions for input, but got: ", + input.sizes()); + + int64_t batch_dim = (ndim == 3) ? 0 : -1; + int64_t n_input_plane = input.size(batch_dim + 1); + + if (n_input_plane % (kernel_width * kernel_height) != 0) { + AT_ERROR( + "Expected size of input's dimension 1 to be divisible by the " + "product of kernel_size, but got input.size(1)=", + n_input_plane, + " and kernel_size=(", + kernel_height, + ", ", + kernel_width, + ")."); + } + + int64_t input_length = input.size(batch_dim + 2); + int64_t n_blocks_height = + div_rtn( + output_height + 2 * pad_height - + dilation_height * (kernel_height - 1) - 1, + stride_height) + + 1; + int64_t n_blocks_width = div_rtn( + output_width + 2 * pad_width - + dilation_width * (kernel_width - 1) - 1, + stride_width) + + 1; + + if (input_length != (n_blocks_height * n_blocks_width)) { + AT_ERROR( + "Given output_size=(", + output_height, + ", ", + output_width, + "), kernel_size=(", + kernel_height, + ", ", + kernel_width, + "), dilation=(", + dilation_height, + ", ", + dilation_width, + "), padding=(", + pad_height, + ", ", + pad_width, + "), stride=(", + stride_height, + ", ", + stride_width, + "), expected size of input's dimension 2 to match the calculated number of ", + "sliding blocks ", + n_blocks_height, + " * ", + n_blocks_width, + " = ", + (n_blocks_height * n_blocks_width), + ", but got input.size(2)=", + input_length, + "."); + } + + TORCH_CHECK( + n_blocks_height >= 1 && n_blocks_width >= 1, + "Given output_size=(", output_height, ", ", output_width, "), ", + "kernel_size=(", kernel_height, ", ", kernel_width, "), ", + "dilation=(", dilation_height, ", ", dilation_width, "), ", + "padding=(", pad_height, ", ", pad_width, "), ", + "stride=(", stride_height, ", ", stride_width, "), ", + "calculated shape of the array of sliding blocks as ", + "(", n_blocks_height, ", ", n_blocks_width, "), ", + "which is too small (non-positive)"); + + if (output_width < 1 || output_height < 1) { + AT_ERROR( + "Expected output spatial size to be positive, but got: output_size=(", + output_height, + ", ", + output_width, + ")."); + } +} + +static inline void im2col_shape_check( + const Tensor& input, + const Tensor& grad_output, + int64_t kernel_height, + int64_t kernel_width, + int64_t dilation_height, + int64_t dilation_width, + int64_t pad_height, + int64_t pad_width, + int64_t stride_height, + int64_t stride_width) { + TORCH_CHECK( + kernel_width > 
0 && kernel_height > 0, + "kernel size should be greater than zero, but got kernel_height: ", + kernel_height, + " kernel_width: ", + kernel_width); + + TORCH_CHECK( + dilation_width > 0 && dilation_height > 0, + "dilation should be greater than zero, but got dilation_height: ", + dilation_height, + " dilation_width: ", + dilation_width); + + TORCH_CHECK( + pad_width >= 0 && pad_height >= 0, + "padding should be non-negative, but got pad_height: ", + pad_height, + " pad_width: ", + pad_width); + + TORCH_CHECK( + stride_width > 0 && stride_height > 0, + "stride should be greater than zero, but got stride_height: ", + stride_height, + " stride_width: ", + stride_width); + + int64_t ndim = input.ndimension(); + + // allow dim=0 only the batch dimension. + bool valid_dims = input.size(1) != 0 && input.size(2) != 0; + TORCH_CHECK( + (ndim == 3 && input.size(0) && valid_dims) || + (ndim == 4 && valid_dims && input.size(3) != 0), + "Expected 3D or 4D (batch mode) tensor with possibly 0 batch size and other non-zero dimensions for input, but got: ", + input.sizes()); + + int64_t dim_batch = 0; + + if (ndim == 3) { + dim_batch = -1; + } + + int64_t input_height = input.size(dim_batch + 2); + int64_t input_width = input.size(dim_batch + 3); + int64_t output_height = div_rtn( + input_height + 2 * pad_height - + (dilation_height * (kernel_height - 1) + 1), + stride_height) + + 1; + int64_t output_width = div_rtn( + input_width + 2 * pad_width - + (dilation_width * (kernel_width - 1) + 1), + stride_width) + + 1; + + if (output_height < 1 || output_width < 1) { + AT_ERROR( + "Given input with spatial size (", + input_height, + ", ", + input_height, + "), kernel_size=(", + kernel_height, + ", ", + kernel_width, + "), dilation=(", + dilation_height, + ", ", + dilation_width, + "), padding=(", + pad_height, + ", ", + pad_width, + "), calculated shape of the array of sliding blocks as (", + output_height, + ", ", + output_width, + "), but its components must be at least one."); + } +} + +} // namespace at::native diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/layer_norm.h b/MLPY/Lib/site-packages/torch/include/ATen/native/layer_norm.h new file mode 100644 index 0000000000000000000000000000000000000000..b452a0575397cb4e764abc7186c0a77717a9ac28 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/layer_norm.h @@ -0,0 +1,100 @@ +#pragma once + +#include +#include +#include + +namespace at::native { + +namespace { + +C10_ALWAYS_INLINE std::pair _check_layer_norm_inputs( + const Tensor& input, + IntArrayRef normalized_shape, + const Tensor& weight /* optional */, + const Tensor& bias /* optional */) { + + const int normalized_ndim = normalized_shape.size(); + TORCH_CHECK( + normalized_ndim >= 1, + "Expected normalized_shape to be at least 1-dimensional, i.e., ", + "containing at least one element, but got normalized_shape = ", + normalized_shape); + TORCH_CHECK( + !weight.defined() || weight.sizes().equals(normalized_shape), + "Expected weight to be of same shape as normalized_shape, but got ", + "weight of shape ", + weight.sizes(), + " and normalized_shape = ", + normalized_shape); + TORCH_CHECK( + !bias.defined() || bias.sizes().equals(normalized_shape), + "Expected bias to be of same shape as normalized_shape, but got ", + "bias of shape ", + bias.sizes(), + " and normalized_shape = ", + normalized_shape); + + const auto input_shape = input.sizes(); + const auto input_ndim = input.dim(); + + if (input_ndim < normalized_ndim || + !input_shape.slice(input_ndim - normalized_ndim) 
+ .equals(normalized_shape)) { + std::stringstream ss; + ss << "Given normalized_shape=" << normalized_shape + << ", expected input with shape [*"; + for (auto size : normalized_shape) { + ss << ", " << size; + } + ss << "], but got input of size" << input_shape; + AT_ERROR(ss.str()); + } + + const int axis = input_ndim - normalized_ndim; + const int64_t M = + c10::multiply_integers(input_shape.cbegin(), input_shape.cbegin() + axis); + const int64_t N = + c10::multiply_integers(input_shape.cbegin() + axis, input_shape.cend()); + + return std::make_pair(M, N); +} + +} // namespace + +void layer_norm_cpu_out( + at::Tensor& out, + const at::Tensor& input, + const Tensor& gamma, + const Tensor& beta, + double eps, + int64_t M, + int64_t N); + +using forward_fn = void (*)( + const Tensor& /* X */, + const Tensor& /* gamma */, + const Tensor& /* beta */, + int64_t /* M */, + int64_t /* N */, + double /* eps */, + Tensor* /* Y */, + Tensor* /* mean */, + Tensor* /* rstd */); + +using backward_fn = void (*)( + const Tensor& /* dY */, + const Tensor& /* X */, + const Tensor& /* mean */, + const Tensor& /* rstd */, + const Tensor& /* gamma */, + int64_t /* M */, + int64_t /* N */, + Tensor* /* dX */, + Tensor* /* dgamma */, + Tensor* /* dbeta */); + +DECLARE_DISPATCH(forward_fn, LayerNormKernel); +DECLARE_DISPATCH(backward_fn, LayerNormBackwardKernel); + +} // namespace at::native diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/mps/Copy.h b/MLPY/Lib/site-packages/torch/include/ATen/native/mps/Copy.h new file mode 100644 index 0000000000000000000000000000000000000000..03366154489d12267705dc699bde5f9bdf4e3025 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/mps/Copy.h @@ -0,0 +1,15 @@ +// Copyright © 2022 Apple Inc. + +#pragma once +#include + +namespace at { +namespace native { +namespace mps { + +at::Tensor& mps_copy_(at::Tensor& dst, const at::Tensor& src, bool non_blocking); +void copy_blit_mps(void* dst, const void* src, size_t size); + +} // namespace mps +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/mps/MPSGraphSonomaOps.h b/MLPY/Lib/site-packages/torch/include/ATen/native/mps/MPSGraphSonomaOps.h new file mode 100644 index 0000000000000000000000000000000000000000..91fcba78006345dd7ce1a88bb29b7c6edc9adcf8 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/mps/MPSGraphSonomaOps.h @@ -0,0 +1,53 @@ +#pragma once + +#include + +#if !defined(__MAC_14_0) && \ + (!defined(MAC_OS_X_VERSION_14_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_14_0)) + +typedef NS_ENUM(NSUInteger, MPSGraphFFTScalingMode) +{ + MPSGraphFFTScalingModeNone = 0L, + MPSGraphFFTScalingModeSize = 1L, + MPSGraphFFTScalingModeUnitary = 2L, +}; + +@interface FakeMPSGraphFFTDescriptor : NSObject +@property (readwrite, nonatomic) BOOL inverse; +@property (readwrite, nonatomic) MPSGraphFFTScalingMode scalingMode; +@property (readwrite, nonatomic) BOOL roundToOddHermitean; ++(nullable instancetype) descriptor; +@end + +@compatibility_alias MPSGraphFFTDescriptor FakeMPSGraphFFTDescriptor; + +@interface MPSGraph (SonomaOps) +-(MPSGraphTensor * _Nonnull) conjugateWithTensor:(MPSGraphTensor * _Nonnull) tensor + name:(NSString * _Nullable) name; + +-(MPSGraphTensor * _Nonnull) realPartOfTensor:(MPSGraphTensor * _Nonnull) tensor + name:(NSString * _Nullable) name; + + +-(MPSGraphTensor * _Nonnull) fastFourierTransformWithTensor:(MPSGraphTensor * _Nonnull) tensor + axes:(NSArray * _Nonnull) axes + 
descriptor:(MPSGraphFFTDescriptor * _Nonnull) descriptor + name:(NSString * _Nullable) name; + +-(MPSGraphTensor * _Nonnull) realToHermiteanFFTWithTensor:(MPSGraphTensor * _Nonnull) tensor + axes:(NSArray * _Nonnull) axes + descriptor:(MPSGraphFFTDescriptor * _Nonnull) descriptor + name:(NSString * _Nullable) name; + +-(MPSGraphTensor * _Nonnull) HermiteanToRealFFTWithTensor:(MPSGraphTensor * _Nonnull) tensor + axes:(NSArray * _Nonnull) axes + descriptor:(MPSGraphFFTDescriptor * _Nonnull) descriptor + name:(NSString * _Nullable) name; +@end + +// define BFloat16 enums for MacOS13 +#define MPSDataTypeBFloat16 ((MPSDataType) (MPSDataTypeAlternateEncodingBit | MPSDataTypeFloat16)) + +// define Metal version +#define MTLLanguageVersion3_1 ((MTLLanguageVersion) ((3 << 16) + 1)) +#endif diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/mps/MPSGraphVenturaOps.h b/MLPY/Lib/site-packages/torch/include/ATen/native/mps/MPSGraphVenturaOps.h new file mode 100644 index 0000000000000000000000000000000000000000..aa5637bf562bb92349f943b8a5f4272b6d84941a --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/mps/MPSGraphVenturaOps.h @@ -0,0 +1,197 @@ +#pragma once +#include + +// TODO: Remove me when moved to MacOS 13 +#if !defined(__MAC_13_2) && \ + (!defined(MAC_OS_X_VERSION_13_2) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_2)) + +@interface FakeMPSGraphConvolution3DOpDescriptor : NSObject + +@property (readwrite, nonatomic) NSUInteger strideInX; +@property (readwrite, nonatomic) NSUInteger strideInY; +@property (readwrite, nonatomic) NSUInteger strideInZ; +@property (readwrite, nonatomic) NSUInteger dilationRateInX; +@property (readwrite, nonatomic) NSUInteger dilationRateInY; +@property (readwrite, nonatomic) NSUInteger dilationRateInZ; + +@property (readwrite, nonatomic) NSUInteger paddingLeft; +@property (readwrite, nonatomic) NSUInteger paddingRight; +@property (readwrite, nonatomic) NSUInteger paddingTop; +@property (readwrite, nonatomic) NSUInteger paddingBottom; +@property (readwrite, nonatomic) NSUInteger paddingFront; +@property (readwrite, nonatomic) NSUInteger paddingBack; + +@property (readwrite, nonatomic) MPSGraphPaddingStyle paddingStyle; +@property (readwrite, nonatomic) MPSGraphTensorNamedDataLayout dataLayout; +@property (readwrite, nonatomic) MPSGraphTensorNamedDataLayout weightsLayout; + +@property (readwrite, nonatomic) NSUInteger groups; + +@end + +@compatibility_alias MPSGraphConvolution3DOpDescriptor FakeMPSGraphConvolution3DOpDescriptor; + +#endif + +@interface MPSGraph (VenturaOps) + +#if !defined(__MAC_13_0) && \ + (!defined(MAC_OS_X_VERSION_13_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_0)) + +typedef NS_ENUM(NSUInteger, MPSGraphResizeNearestRoundingMode) +{ + MPSGraphResizeNearestRoundingModeRoundPreferCeil = 0L, + MPSGraphResizeNearestRoundingModeRoundPreferFloor = 1L, + MPSGraphResizeNearestRoundingModeCeil = 2L, + MPSGraphResizeNearestRoundingModeFloor = 3L, + MPSGraphResizeNearestRoundingModeRoundToEven = 4L, + MPSGraphResizeNearestRoundingModeRoundToOdd = 5L, +}; + +// Define complex enums for MacOS 12 +#define MPSDataTypeComplexBit 0x01000000 +#define MPSDataTypeComplexFloat32 ((MPSDataType) (MPSDataTypeFloatBit | MPSDataTypeComplexBit | 64)) +#define MPSDataTypeComplexFloat16 ((MPSDataType) (MPSDataTypeFloatBit | MPSDataTypeComplexBit | 32)) +#endif + +- (MPSGraphTensor * _Nonnull) convolution3DWithSourceTensor:(MPSGraphTensor * _Nonnull) source + weightsTensor:(MPSGraphTensor * _Nonnull) weights + 
descriptor:(MPSGraphConvolution3DOpDescriptor * _Nonnull) descriptor + name:(NSString * _Nullable) name; + +- (MPSGraphTensor * _Nonnull) convolution3DDataGradientWithIncomingGradientTensor:(MPSGraphTensor * _Nonnull) incomingGradient + weightsTensor:(MPSGraphTensor * _Nonnull) weights + outputShape:(MPSShape * _Nonnull) outputShape + forwardConvolutionDescriptor:(MPSGraphConvolution3DOpDescriptor * _Nonnull) forwardConvolutionDescriptor + name:(NSString * _Nullable) name; + +- (MPSGraphTensor * _Nonnull) convolution3DWeightsGradientWithIncomingGradientTensor:(MPSGraphTensor * _Nonnull) incomingGradient + sourceTensor:(MPSGraphTensor * _Nonnull) source + outputShape:(MPSShape * _Nonnull) outputShape + forwardConvolutionDescriptor:(MPSGraphConvolution3DOpDescriptor * _Nonnull) forwardConvolutionDescriptor + name:(NSString * _Nullable) name; + +- (MPSGraphTensor * _Nonnull)cumulativeSumWithTensor:(MPSGraphTensor * _Nonnull)tensor + axis:(NSInteger)axis + name:(NSString * _Nullable)name; + +- (MPSGraphTensor * _Nonnull)sortWithTensor:(MPSGraphTensor * _Nonnull)tensor + axis:(NSInteger)axis + name:(NSString * _Nullable)name; + +- (MPSGraphTensor * _Nonnull) sortWithTensor:(MPSGraphTensor * _Nonnull) tensor + axis:(NSInteger) axis + descending:(BOOL) descending + name:(NSString * _Nullable) name; + +- (MPSGraphTensor * _Nonnull) sortWithTensor:(MPSGraphTensor * _Nonnull) tensor + axisTensor:(MPSGraphTensor * _Nonnull) axisTensor + descending:(BOOL) descending + name:(NSString * _Nullable) name; + +- (MPSGraphTensor * _Nonnull) sortWithTensor:(MPSGraphTensor * _Nonnull) tensor + axisTensor:(MPSGraphTensor * _Nonnull) axisTensor + name:(NSString * _Nullable) name; + +- (MPSGraphTensor * _Nonnull)argSortWithTensor:(MPSGraphTensor * _Nonnull)tensor + axis:(NSInteger)axis + name:(NSString * _Nullable)name; + +- (MPSGraphTensor * _Nonnull) argSortWithTensor:(MPSGraphTensor * _Nonnull) tensor + axis:(NSInteger) axis + descending:(BOOL) descending + name:(NSString * _Nullable) name; + +- (MPSGraphTensor * _Nonnull) argSortWithTensor:(MPSGraphTensor * _Nonnull) tensor + axisTensor:(MPSGraphTensor * _Nonnull) axisTensor + descending:(BOOL) descending + name:(NSString * _Nullable) name; + +- (MPSGraphTensor * _Nonnull) argSortWithTensor:(MPSGraphTensor * _Nonnull) tensor + axisTensor:(MPSGraphTensor * _Nonnull) axisTensor + name:(NSString * _Nullable) name; + +- (MPSGraphTensor * _Nonnull)inverseOfTensor:(MPSGraphTensor * _Nonnull) inputTensor + name:(NSString * _Nullable)name; + +- (MPSGraphTensor * _Nonnull) resizeNearestWithTensor:(MPSGraphTensor * _Nonnull) imagesTensor + sizeTensor:(MPSGraphTensor * _Nonnull) size + nearestRoundingMode:(MPSGraphResizeNearestRoundingMode) nearestRoundingMode + centerResult:(BOOL) centerResult + alignCorners:(BOOL) alignCorners + layout:(MPSGraphTensorNamedDataLayout) layout + name:(NSString * _Nullable) name; + +- (MPSGraphTensor * _Nonnull) resizeNearestWithTensor:(MPSGraphTensor * _Nonnull) imagesTensor + sizeTensor:(MPSGraphTensor * _Nonnull) size + scaleOffsetTensor:(MPSGraphTensor * _Nonnull) scaleOffset + nearestRoundingMode:(MPSGraphResizeNearestRoundingMode) nearestRoundingMode + layout:(MPSGraphTensorNamedDataLayout) layout + name:(NSString * _Nullable) name; + +- (MPSGraphTensor * _Nonnull) resizeBilinearWithTensor:(MPSGraphTensor * _Nonnull) imagesTensor + sizeTensor:(MPSGraphTensor * _Nonnull) size + centerResult:(BOOL) centerResult + alignCorners:(BOOL) alignCorners + layout:(MPSGraphTensorNamedDataLayout) layout + name:(NSString * _Nullable) name; + +- 
(MPSGraphTensor * _Nonnull) resizeBilinearWithTensor:(MPSGraphTensor * _Nonnull) imagesTensor + sizeTensor:(MPSGraphTensor * _Nonnull) size + scaleOffsetTensor:(MPSGraphTensor * _Nonnull) scaleOffset + layout:(MPSGraphTensorNamedDataLayout) layout + name:(NSString * _Nullable) name; + +- (MPSGraphTensor * _Nonnull) resizeNearestWithGradientTensor:(MPSGraphTensor * _Nonnull) gradient + input:(MPSGraphTensor * _Nonnull) input + nearestRoundingMode:(MPSGraphResizeNearestRoundingMode) nearestRoundingMode + centerResult:(BOOL) centerResult + alignCorners:(BOOL) alignCorners + layout:(MPSGraphTensorNamedDataLayout) layout + name:(NSString * _Nullable) name; + +- (MPSGraphTensor * _Nonnull) resizeNearestWithGradientTensor:(MPSGraphTensor * _Nonnull) gradient + input:(MPSGraphTensor * _Nonnull) input + scaleOffsetTensor:(MPSGraphTensor * _Nonnull) scaleOffset + nearestRoundingMode:(MPSGraphResizeNearestRoundingMode) nearestRoundingMode + layout:(MPSGraphTensorNamedDataLayout) layout + name:(NSString * _Nullable) name; + +- (MPSGraphTensor * _Nonnull) resizeBilinearWithGradientTensor:(MPSGraphTensor * _Nonnull) gradient + input:(MPSGraphTensor * _Nonnull) input + centerResult:(BOOL) centerResult + alignCorners:(BOOL) alignCorners + layout:(MPSGraphTensorNamedDataLayout) layout + name:(NSString * _Nullable) name; + +- (MPSGraphTensor * _Nonnull) resizeBilinearWithGradientTensor:(MPSGraphTensor * _Nonnull) gradient + input:(MPSGraphTensor * _Nonnull) input + scaleOffsetTensor:(MPSGraphTensor * _Nonnull) scaleOffset + layout:(MPSGraphTensorNamedDataLayout) layout + name:(NSString * _Nullable) name; + +- (MPSGraphTensor * _Nonnull) sampleGridWithSourceTensor:(MPSGraphTensor * _Nonnull) source + coordinateTensor:(MPSGraphTensor * _Nonnull) coordinates + layout:(MPSGraphTensorNamedDataLayout) layout + normalizeCoordinates:(BOOL) normalizeCoordinates + relativeCoordinates:(BOOL) relativeCoordinates + alignCorners:(BOOL) alignCorners + paddingMode:(MPSGraphPaddingMode) paddingMode + samplingMode:(MPSGraphResizeMode) samplingMode + constantValue:(double) constantValue + name:(NSString * _Nullable) name; + +- (MPSGraphTensor * _Nonnull) sampleGridWithSourceTensor:(MPSGraphTensor * _Nonnull) source + coordinateTensor:(MPSGraphTensor * _Nonnull) coordinates + layout:(MPSGraphTensorNamedDataLayout) layout + normalizeCoordinates:(BOOL) normalizeCoordinates + relativeCoordinates:(BOOL) relativeCoordinates + alignCorners:(BOOL) alignCorners + paddingMode:(MPSGraphPaddingMode) paddingMode + nearestRoundingMode:(MPSGraphResizeNearestRoundingMode) nearestRoundingMode + constantValue:(double) constantValue + name:(NSString * _Nullable) name; +- (MPSGraphTensor * _Nonnull) truncateWithTensor:(MPSGraphTensor * _Nonnull) tensor + name:(NSString * _Nullable) name; + +@end diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/mps/OperationUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/mps/OperationUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..46078ea068d406f8a2ff5eca9609c1bc51dd197b --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/mps/OperationUtils.h @@ -0,0 +1,394 @@ +// Copyright © 2022 Apple Inc. 
+ +#pragma once + +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + +#include + +// Fwd declarations +namespace at { + struct TensorIteratorBase; +} +using namespace at::mps; + +namespace at::native::mps { + +void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)()); + +struct MPSScalar { + id getMTLBuffer() const { return __builtin_bit_cast(id, buffer.get()); } + + size_t size = 0; + ScalarType type = ScalarType::Undefined; + c10::DataPtr buffer; // stores MTLBuffer (frees buffer if MPSScalar instance goes out of scope) + union { + float f; // MPS doesn't support 'double' + at::Half h; + int64_t i; + bool b; + c10::complex cf; + c10::complex ch; + at::BFloat16 bf16; + } value {}; +}; + +void runMPSGraph(MPSStream* mpsStream, + MPSGraph* mpsGraph, + NSDictionary* feeds, + NSDictionary* results); + +MPSDataType getMPSDataType(ScalarType scalar_type); +static inline MPSDataType getMPSDataType(const Tensor& t) { + return getMPSDataType(t.scalar_type()); +} +MPSDataType getMPSScalarType(ScalarType scalar_type); +static inline MPSDataType getMPSScalarType(const Tensor& t) { + return getMPSScalarType(t.scalar_type()); +} +MPSScalar getMPSScalar(const Scalar& scalar, ScalarType type); +std::string getMPSTypeString(ScalarType scalar_type, bool short_name = false); +static inline std::string getMPSTypeString(const Tensor& t, bool short_name = false) { + return getMPSTypeString(t.scalar_type(), short_name); +} +std::string scalarToMetalTypeString(const c10::ScalarType& scalar_type); +NSArray* getTensorAxes(const Tensor& t); +NSArray* getTensorAxes(const IntArrayRef& sizes, at::OptionalIntArrayRef dim); +std::string getMPSShapeString(MPSShape* shape); +std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype = true); +std::string getArrayRefString(const IntArrayRef s); +// use has_storage() on the returned tensor to determine if src actually is a view +Tensor gatherViewTensor(const at::Tensor& src, at::Tensor& dst); +Tensor& scatterViewTensor(const at::Tensor& src, at::Tensor& output); +bool canSliceViewTensor(const Tensor& src, MPSShape *mpsShape); +MPSGraphTensorData* getMPSGraphTensorDataForView(const Tensor& src, MPSShape *mpsShape, const MPSDataType mpsDataType); +MPSGraphTensor* castToIHFTypes(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor, const Tensor& input, bool includesInt64 = false); +MPSGraphTensor* castFromIHFTypes(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor, const Tensor& input, bool includesInt64 = false); + +// The MPSShape could vary based on memory format +MPSShape* getMPSShape(const Tensor& t, c10::MemoryFormat memory_format = MemoryFormat::Contiguous); +MPSShape* getMPSShape(IntArrayRef sizes, c10::MemoryFormat memory_format = MemoryFormat::Contiguous); + +static inline id getMTLBufferStorage(const at::Tensor& tensor) { + return __builtin_bit_cast(id, tensor.storage().data()); +} + +class Placeholder { + public: + Placeholder() : _placeholder(nullptr), _value(nullptr), _tensor(Tensor()) {} + Placeholder(MPSGraphTensor* mpsGraphTensor) : _placeholder(mpsGraphTensor), _value(nullptr), _tensor(Tensor()) {} + Placeholder(MPSGraphTensor* mpsGraphTensor, const Tensor& self, MPSShape *mpsShape = nullptr, + bool gatherTensorData = true, MPSDataType dataType = MPSDataTypeInvalid); + MPSGraphTensor* getMPSGraphTensor() { + return _placeholder; + } + 
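+  // Returns the MPSGraphTensorData bound to this placeholder; nullptr when the
+  // placeholder wraps an intermediate graph tensor (see isIntermediate()).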
MPSGraphTensorData* getMPSGraphTensorData() { + return _value; + } + bool isIntermediate() { + return _value == nullptr; + } + + private: + MPSGraphTensor* _placeholder; + MPSGraphTensorData* _value; + Tensor _tensor; +}; + +void resize_tensor(Tensor* output); +Tensor wrapped_scalar_tensor_mps(const Scalar& scalar, const Device device); +MPSGraphTensor* trunc_tensor(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor); +MPSGraphTensor* convertNHWCtoNCHW(MPSGraph *mpsGraph, MPSGraphTensor* tensor); +MPSGraphTensor* castMPSTensor(MPSGraph *mpsGraph, MPSGraphTensor* tensor, ScalarType toType); +MPSGraphTensor* castMPSTensor(MPSGraph *mpsGraph, MPSGraphTensor* tensor, MPSDataType toType); +MPSGraphTensorData *getMPSGraphTensorData(MPSGraph* mpsGraph, MPSStream* mpsStream, const Tensor& tensor); +MPSGraphTensorData* getMPSGraphTensorFromScalar(MPSStream* mpsStream, MPSScalar& scalar); + +MPSGraph* make_mps_graph(); +void printTensorNDArray(const Tensor& t); +MPSNDArray* ndArrayFromTensor(const Tensor& tensor, MPSShape *shape, MPSDataType mpsType); + +MPSGraphTensor* mpsGraphUnrankedPlaceHolder(MPSGraph *mpsGraph, MPSDataType dataType); +MPSGraphTensor* mpsGraphRankedPlaceHolder(MPSGraph *mpsGraph, MPSDataType dataType, MPSShape* mpsShape); +MPSGraphTensor* mpsGraphRankedPlaceHolder(MPSGraph *mpsGraph, const Tensor& tensor); +MPSGraphTensor* mpsGraphScalarPlaceHolder(MPSGraph *mpsGraph, MPSDataType dataType); +MPSGraphTensor* mpsGraphScalarPlaceHolder(MPSGraph *mpsGraph, const Scalar& scalar); + +string get_mem_format_string(c10::MemoryFormat memory_format); + +using MPSCacheKey = uint64_t; + +// derive this class to cache a graph and its inputs/outputs +// can be used to store any NSObject +struct MPSCachedGraph +{ + MPSCachedGraph(NSObject *object) : _object([object retain]) {} + virtual ~MPSCachedGraph() { + [_object release]; + _object = nullptr; + } + + template + inline T* as() { + return static_cast(this); + } + + MPSGraph *graph() const { return (MPSGraph *)_object; } + NSObject *object() const { return _object; } +private: + NSObject *_object = nullptr; +}; + +struct MPSUnaryCachedGraph : public MPSCachedGraph +{ + MPSUnaryCachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; +}; + +struct MPSUnaryGradCachedGraph : public MPSCachedGraph +{ + MPSUnaryGradCachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *gradOutputTensor_ = nil; + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; // some backward input is actually the forward's output + MPSGraphTensor *gradInputTensor_ = nil; +}; + +struct MPSBinaryCachedGraph : public MPSCachedGraph +{ + MPSBinaryCachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *otherTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; +}; + +struct MPSBinaryGradCachedGraph : public MPSCachedGraph +{ + MPSBinaryGradCachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *gradOutputTensor_ = nil; + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *otherTensor_ = nil; + MPSGraphTensor *gradInputTensor_ = nil; +}; + +// TODO: Improve the overall design of MPSGraphCache. 
+// https://github.com/pytorch/pytorch/issues/77176 +// Cache holding various keys mapped to graphs +struct MPSGraphCache +{ + typedef MPSCachedGraph * (^CreateCachedGraphBlock)(); + + struct CacheEntry { + CacheEntry(const std::string& key, MPSCachedGraph *cachedGraph) : cachedGraph_(cachedGraph), key_(key) {} + MPSCachedGraph* cachedGraph_ = nullptr; + std::string key_; + }; + + public: + + static MPSGraphCache* getInstance() { + if(_instance_cache == nullptr) { + _instance_cache = new MPSGraphCache(); + } + return _instance_cache; + } + + ~MPSGraphCache() { + dispatch_release(serialQueue_); + + for (const auto& i : cache_) { + delete i.second.cachedGraph_; + } + } + + // Disallow the copy constructor and operator= functions + MPSGraphCache(const MPSGraphCache&) = delete; + void operator=(const MPSGraphCache&) = delete; + + MPSCachedGraph* CreateCachedGraph(const std::string& key, CreateCachedGraphBlock createCacheBlock) { + + __block MPSCachedGraph* cachedGraph = nil; + + MPSCacheKey hash = std::hash{}(key); + + dispatch_sync_with_rethrow(serialQueue_, ^() { + // verify the cached entry doesn't already exist + if (cache_.count(hash) != 0) { + auto& entry = cache_.at(hash); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(key == entry.key_, "Key collision in the MPS cached graph!\n"); + cachedGraph = entry.cachedGraph_; + } else { + cachedGraph = createCacheBlock(); + CacheEntry entry(key, cachedGraph); + cache_.emplace(hash, entry); + profileCachedGraph(entry); + } + }); + return cachedGraph; + } + + template + inline T* CreateCachedGraphAs(const std::string& key, CreateCachedGraphBlock createCacheBlock) { + return static_cast(CreateCachedGraph(key, createCacheBlock)); + } + + MPSCachedGraph* LookUp(const std::string& key) const { + + __block MPSCachedGraph* cachedGraph = nullptr; + + MPSCacheKey hash = std::hash{}(key); + + dispatch_sync(serialQueue_, ^() { + + if (cache_.count(hash) != 0) { + auto& entry = cache_.at(hash); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(key == entry.key_, "Key collision in the MPS cached graph!\n"); + cachedGraph = entry.cachedGraph_; + profileCachedGraph(entry); + } + }); + return cachedGraph; + } + + template + inline T* LookUpAs(const std::string& key) const { + return static_cast(LookUp(key)); + } + + private: + MPSGraphCache() { + serialQueue_ = dispatch_queue_create("cache queue", DISPATCH_QUEUE_SERIAL); + } + // this is defined in OperationUtils.mm to not include + // MPSProfiler.h in header OperationUtils.h + void profileCachedGraph(const CacheEntry& cacheEntry) const; + + static MPSGraphCache* _instance_cache; + std::unordered_map cache_; + dispatch_queue_t serialQueue_ = nullptr; + +}; + +// Common template for creating graph with a specified cache if missing +template +inline T* LookUpOrCreateCachedGraph(const std::string& key, std::function instantiate) { + auto cache_ = MPSGraphCache::getInstance(); + if (auto rc = cache_->LookUpAs(key)) { + return rc; + } + return cache_->CreateCachedGraphAs(key, ^mps::MPSCachedGraph*() { + T* newCachedGraph = nil; + @autoreleasepool { + // Initialize graph + auto mpsGraph = mps::make_mps_graph(); + newCachedGraph = new T(mpsGraph); + instantiate(mpsGraph, newCachedGraph); + } + return newCachedGraph; + }); +} + +// Common math operations +MPSGraphTensor* log1p(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor); + +#define MPS_CHECK_INT64_OP_SUPPORTED(input_tensor, mac_os_13_3_plus, op_name) \ + if (!mac_os_13_3_plus && input_tensor.scalar_type() == kLong) { \ + TORCH_WARN_ONCE("MPS: no support for int64 for ", op_name, \ + ", 
downcasting to a smaller data type (int32/float32). Native support for int64 has been added in macOS 13.3."); \ + } + +/** + * Returns distance from lowest to highest element offset in given tensor. + */ +size_t compute_storage_numel_distance(const at::Tensor& t); + +/** + * Checks whether tensor is mapped to a contiguous area in the storage. + */ +inline bool is_dense_in_storage(const at::Tensor& t) { + return compute_storage_numel_distance(t) == static_cast(t.numel()); +} + +static inline void mtl_setBuffer(id encoder, const Tensor& t, unsigned idx) { + [encoder setBuffer:getMTLBufferStorage(t) + offset:t.storage_offset() * t.element_size() + atIndex:idx]; +} + +static inline void mtl_dispatch1DJob(id encoder, + id cplState, + uint32_t length) { + const uint32_t maxThreadsPerGroup = [cplState maxTotalThreadsPerThreadgroup]; + auto size = MTLSizeMake(length, 1, 1); + auto threadGroupSize = MTLSizeMake(std::min(maxThreadsPerGroup, length), 1, 1); + [encoder dispatchThreads:size threadsPerThreadgroup:threadGroupSize]; +} + +id generateKernelDataOffsets(id commandEncoder, const TensorIteratorBase& iter, bool use_64bit_index = false); + +inline NSDictionary* dictionaryFromPlaceholders(Placeholder& p1) { + return @{ p1.getMPSGraphTensor(): p1.getMPSGraphTensorData() }; +} + +inline NSDictionary* dictionaryFromPlaceholders(Placeholder& p1, Placeholder& p2) { + return @{ + p1.getMPSGraphTensor(): p1.getMPSGraphTensorData(), + p2.getMPSGraphTensor(): p2.getMPSGraphTensorData(), + }; +} + +inline NSDictionary* dictionaryFromPlaceholders(Placeholder& p1, Placeholder& p2, Placeholder& p3) { + return @{ + p1.getMPSGraphTensor(): p1.getMPSGraphTensorData(), + p2.getMPSGraphTensor(): p2.getMPSGraphTensorData(), + p3.getMPSGraphTensor(): p3.getMPSGraphTensorData(), + }; +} + +inline NSDictionary* dictionaryFromPlaceholders(Placeholder& p1, Placeholder& p2, Placeholder& p3, Placeholder& p4) { + return @{ + p1.getMPSGraphTensor(): p1.getMPSGraphTensorData(), + p2.getMPSGraphTensor(): p2.getMPSGraphTensorData(), + p3.getMPSGraphTensor(): p3.getMPSGraphTensorData(), + p4.getMPSGraphTensor(): p4.getMPSGraphTensorData(), + }; +} + +inline void runMPSGraph(MPSStream* stream, MPSGraph* graph, NSDictionary* feeds, Placeholder& result) { + runMPSGraph(stream, graph, feeds, dictionaryFromPlaceholders(result)); +} + +inline bool supportsComplex() { + return is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS); +} + +// MPS yet to support double types, but starting from MacOS 14, supports bfloat16 +inline bool supportedFloatingType(ScalarType dtype) { + return dtype == kFloat || dtype == kHalf || dtype == kBFloat16; +} + +inline bool supportedFloatingType(const Tensor& t) { + return supportedFloatingType(t.scalar_type()); +} + +} // namespace at::native::mps diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/mps/TensorFactory.h b/MLPY/Lib/site-packages/torch/include/ATen/native/mps/TensorFactory.h new file mode 100644 index 0000000000000000000000000000000000000000..5512899934355d19d78f3fc1f65700aa043fc413 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/mps/TensorFactory.h @@ -0,0 +1,12 @@ +// Copyright © 2022 Apple Inc. + +#define AT_DISPATCH_MPS_TYPES(TYPE, NAME, ...) 
\ + AT_DISPATCH_SWITCH( \ + TYPE, NAME, \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)) diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/mps/UnaryConstants.h b/MLPY/Lib/site-packages/torch/include/ATen/native/mps/UnaryConstants.h new file mode 100644 index 0000000000000000000000000000000000000000..90ac12c0b8f845940108c6585426fcec0632ef0d --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/mps/UnaryConstants.h @@ -0,0 +1,43 @@ +#pragma once + +const char* UNARY_KERNEL_TEMPLATE = R"METAL( +#include +using namespace metal; + +constant float a[4] = {{0.886226899, -1.645349621, 0.914624893, -0.140543331}}; +constant float b[4] = {{-2.118377725, 1.442710462, -0.329097515, 0.012229801}}; +constant float c[4] = {{-1.970840454, -1.624906493, 3.429567803, 1.641345311}}; +constant float d[2] = {{3.543889200, 1.637067800}}; + +kernel void erfinv_mps_kernel( device {0} *output [[buffer(0)]], + device {1} *input [[buffer(1)]], + uint index [[thread_position_in_grid]]) {{ + + float y = input[index]; + float x, z, num, dem; /*working variables */ + /* coefficients in rational expansion */ + + float y_abs = abs(y); + if(y_abs > 1.0f){{ + output[index] = NAN; + return; + }} + if(y_abs == 1.0f){{ + output[index] = copysign(INFINITY, y); + return; + }} + if(y_abs <= 0.7f) {{ + z = y * y; + num = (((a[3]*z + a[2])*z + a[1])*z + a[0]); + dem = ((((b[3]*z + b[2])*z + b[1])*z +b[0]) * z + 1.0f); + x = y * num / dem; + }} + else{{ + z = sqrt(-1.0f*log((1.0-y_abs)/2.0)); + num = ((c[3]*z + c[2])*z + c[1]) * z + c[0]; + dem = (d[1]*z + d[0])*z + 1.0f; + x = copysign(num, y) / dem; + }} + + output[index] = x; +}})METAL"; \ No newline at end of file diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/nested/NestedTensorBinaryOps.h b/MLPY/Lib/site-packages/torch/include/ATen/native/nested/NestedTensorBinaryOps.h new file mode 100644 index 0000000000000000000000000000000000000000..c0155a074db1686f62fe4d0a12b04642f2b1a9f5 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/nested/NestedTensorBinaryOps.h @@ -0,0 +1,16 @@ +#pragma once + +#include +#include + +namespace at { +namespace native { + +enum class NESTED_DENSE_OP: uint8_t {ADD, MUL}; + +using nested_dense_elementwise_fn = void (*)(Tensor& result, const Tensor & self, const Tensor & other, const NESTED_DENSE_OP& op); + +DECLARE_DISPATCH(nested_dense_elementwise_fn, nested_dense_elementwise_stub); + +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/nested/NestedTensorFactories.h b/MLPY/Lib/site-packages/torch/include/ATen/native/nested/NestedTensorFactories.h new file mode 100644 index 0000000000000000000000000000000000000000..2efb0cbfc4fd856ea38472d375c258dd01350f0f --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/nested/NestedTensorFactories.h @@ -0,0 +1,7 @@ +#pragma once + +namespace at { +namespace native { + +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/nested/NestedTensorMath.h b/MLPY/Lib/site-packages/torch/include/ATen/native/nested/NestedTensorMath.h new file mode 100644 index 
0000000000000000000000000000000000000000..9520b517c81d62dae0d24a1ec51a8089fa563eec --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/nested/NestedTensorMath.h @@ -0,0 +1,81 @@ +#pragma once + +#include +#include +#include + +namespace at { +namespace native { + +TORCH_API Tensor NestedTensor_to_padded_tensor_generic( + const Tensor& t, + double padding, + OptionalIntArrayRef output_size); + +template +Tensor map_nt(const Tensor& nt, Func f) { + auto* nt_impl = get_nested_tensor_impl(nt); + const auto& sizes = nt_impl->get_nested_sizes(); + return at::detail::make_tensor(f(nt_impl->get_buffer()), sizes); +} +template +Tensor map_nt_binary(const Tensor& nt_1, const Tensor& nt_2, Func f){ + auto* nt_impl_1 = get_nested_tensor_impl(nt_1); + auto* nt_impl_2 = get_nested_tensor_impl(nt_2); + const auto& sizes = nt_impl_1->get_nested_sizes(); + return at::detail::make_tensor(f(nt_impl_1->get_buffer(), nt_impl_2->get_buffer()), sizes); +} + +C10_ALWAYS_INLINE std::pair _check_nested_layer_norm_inputs( + const NestedTensorImpl& input, + IntArrayRef normalized_shape, + const Tensor& weight /* optional */, + const Tensor& bias /* optional */) { + + const size_t normalized_ndim = normalized_shape.size(); + TORCH_CHECK( + normalized_ndim >= 1, + "Expected normalized_shape to be at least 1-dimensional, i.e., ", + "containing at least one element, but got normalized_shape = ", + normalized_shape); + TORCH_CHECK( + !weight.defined() || weight.sizes().equals(normalized_shape), + "Expected weight to be of same shape as normalized_shape, but got ", + "weight of shape ", + weight.sizes(), + " and normalized_shape = ", + normalized_shape); + TORCH_CHECK( + !bias.defined() || bias.sizes().equals(normalized_shape), + "Expected bias to be of same shape as normalized_shape, but got ", + "bias of shape ", + bias.sizes(), + " and normalized_shape = ", + normalized_shape); + + // Check that the normalized_shape has the exact same sizes as the last dimensions from the NestedTensor input + // Also, compute M and N considering the idiosyncracies of NestedTensors + int64_t N = 1; + for (const auto i: c10::irange(normalized_ndim)) { + TORCH_CHECK( + input.opt_size(-normalized_ndim + i) != c10::nullopt, + "normalized_shape extends into irregular dimensions for the nested tensor" + ); + TORCH_CHECK( + normalized_shape[i] == *input.opt_size(-normalized_ndim + i), + "The shape at dimension ", + i, + "of normalized_shape doesn't match the input" + ); + N *= normalized_shape[i]; + } + + const int64_t M = input.numel() / N; + + return std::make_pair(M, N); +} + +Tensor reshape_nested(const Tensor& self, IntArrayRef proposed_shape); + +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/nested/NestedTensorTransformerFunctions.h b/MLPY/Lib/site-packages/torch/include/ATen/native/nested/NestedTensorTransformerFunctions.h new file mode 100644 index 0000000000000000000000000000000000000000..0aa6fe52ab95d10afde810adbfd9b191416057ce --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/nested/NestedTensorTransformerFunctions.h @@ -0,0 +1,103 @@ +/** + * Transformer-specific NestedTensor utility functions. + * + * Not co-located with NestedTensor core code yet because they only + * support specific cases needed in transformers. 
+ */ +#pragma once + +#include + +#include +#include + +namespace c10 { +class Scalar; +} // namespace c10 + +namespace at { +class Tensor; +namespace native { +struct NestedTensorImpl; + +// Requires that self is a contiguous NestedTensor, other is not a +// NestedTensor, self.dim() == 3, and other.dim() == 2. Also, self +// must have a consistent last dimension across its included Tensors +// and that dimension must match other.size(0). +Tensor NestedTensor_matmul(const Tensor& self, const Tensor& other); + +// Requires that mat1 is a contiguous NestedTensor, self & mat2 are +// not NestedTensors, mat1.dim() == 3, mat2.dim() == 2, and that mat1 +// has a consistent last dimension across its included Tensors that +// matches mat2.size(0). +Tensor NestedTensor_times_Tensor_plus_Tensor_addmm( + const Tensor& self, + const Tensor& mat1, + const Tensor& mat2, + const c10::Scalar& beta, + const c10::Scalar& alpha, + c10::optional use_gelu = c10::nullopt); + +Tensor NestedTensor_add_NestedTensor_in_place( + const Tensor& self, + const Tensor& other); + +TORCH_API Tensor NestedTensor_batch_offsets_from_size_tensor( + const Tensor& sizes, + int64_t extra_elements); + +Tensor NestedTensor_from_padded_tensor_cpu( + const Tensor& padded, + const NestedTensorImpl& nt); + +Tensor NestedTensor_to_mask(const Tensor& nt, c10::optional mask_dim, c10::optional mask_dim_length); + +template +void remove_padding_kernelLauncher( + const T* input, + T* output, + const int* offsets, + const int* input_sizes, + const int* output_sizes, + int output_dim, + const int batch_size); + +template +void remove_padding_transform0213_kernelLauncher( + const T* input, + T* output, + const int* offsets, + const int* input_sizes, + const int* output_sizes, + int output_dim, + const int batch_size); + +template +void add_padding_kernelLauncher( + T* input, + T* output, + T padding_value, + const int* offsets, + const int* input_sizes, + int input_dim, + const std::vector& output_sizes, + const int batch_size, + const int output_batch_size); + +TORCH_API Tensor flash_attention_helper( + const Tensor& query, + const Tensor& key, + const Tensor& value, + double dropout_p, + bool need_attn_weights, + bool is_causal); + +TORCH_API std::tuple mem_efficient_helper_nested_unpacked( + const Tensor& query, + const Tensor& key, + const Tensor& value, + double dropout_p, + bool need_attn_weights, + bool is_causal); +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/nested/NestedTensorTransformerUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/nested/NestedTensorTransformerUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..ca2a4ea6c2c9b89a61eb956c586e2e1d5b365de5 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/nested/NestedTensorTransformerUtils.h @@ -0,0 +1,44 @@ +#include + + +namespace at { +namespace native { +namespace preprocessing { + +/** + * This function will take nested query, key, and value + * and will preprocess it in order to run with either + * the flash-attention or efficient-attention kernels. + * @return A tuple containing all the necessary data for running the fused + * kernels + */ +std::tuple +sdpa_nested_preprocessing( + const Tensor& query, + const Tensor& key, + const Tensor& value); + +/** + * This function will take nested query, key, and value, grad_out, and out + * and will preprocess it in order to run with either + * the flash-attention or efficient-attention kernels backwards. 
+ * We use both functions to avoid having to do the same preprocessing + * for cumulative_sequence_length_q and cumulative_sequence_length_kv + * @return A tuple containing all the necessary data for running the fused + * kernels + */ +std::tuple +sdpa_nested_preprocessing_backward( + const at::Tensor& grad_out_, + const at::Tensor& query, + const at::Tensor& key, + const at::Tensor& value, + const at::Tensor& out, + const Tensor& cumulative_sequence_length_q, + const Tensor& cumulative_sequence_length_kv, + const int64_t max_seqlen_batch_q, + const int64_t max_seqlen_batch_kv); + +} // namespace preprocessing +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/nested/NestedTensorUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/nested/NestedTensorUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..497c7ecc8cb6e88849a4ac1348ca8f4e682b4c03 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/nested/NestedTensorUtils.h @@ -0,0 +1,415 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS + +#include +#include +#else +#include +#include +#include +#include +#include +#include +#endif + +#include +#include + +namespace at { +namespace native { +struct NestedTensorImpl; + +// The following functions are used to construct nested tensors from buffers and +// metadata. + +inline at::Tensor wrap_buffer(at::Tensor buffer, at::Tensor nested_sizes) { + TORCH_CHECK( + buffer.dim() == 1, + "Expected given buffer to be 1dim, but got ", + buffer.dim(), + " instead."); + TORCH_CHECK( + buffer.is_contiguous(), "Expected given buffer to be contiguous."); + return at::detail::make_tensor( + std::move(buffer), std::move(nested_sizes)); +} + +// TODO: Figure out if we need a non-moving wrap_buffer() +inline at::Tensor wrap_buffer( + at::Tensor buffer, + at::Tensor nested_sizes, + at::Tensor nested_strides, + at::Tensor storage_offsets) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + buffer.is_contiguous(), "Given buffer must be contiguous."); + return at::detail::make_tensor( + std::move(buffer), + std::move(nested_sizes), + std::move(nested_strides), + std::move(storage_offsets)); +} + +inline at::Tensor get_buffer(const at::Tensor& tensor) { + return get_nested_tensor_impl(tensor)->get_buffer(); +} + +/** + * Create a new nested tensor that is a view of a base nested tensor + * + * create_view_tensor calls a specialized constructor that copys the + * the keys from base onto the new view tensor being created. + * The storage is shared between the base and the returned view tensor + * + * All callers of this helper must: + * - Only return a view of the input + * - Must be explicit and define a derivative + * + * @param base Base tensor to construct view from. + * @param nested_sizes View tensors' sizes. + * @param nested_strides View tensors' strides. + * @param storage_offsets View tensors' offsets. 
+ * @return A newly constructed view tensor + */ +inline at::Tensor create_nested_view_tensor( + const at::Tensor& base, + at::Tensor nested_sizes, + at::Tensor nested_strides, + at::Tensor storage_offsets) { + TORCH_INTERNAL_ASSERT( + base.is_nested(), + "This function can only be used to create nested tensor views"); + TORCH_INTERNAL_ASSERT( + c10::impl::tls_local_dispatch_key_set().excluded_.has( + c10::DispatchKey::AutogradFunctionality), + "Creating a non differentiable nested tensor view in a CompositeImplicit function is not allowed."); + return at::detail::make_tensor( + c10::TensorImpl::VIEW, + base, + nested_sizes, + nested_strides, + storage_offsets); +} +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +// Helper functions for getting information about a nested tensor's shape. + +int64_t get_consistent_last_dim_of_nested_tensor(const NestedTensorImpl& nt); + +// The sizes of the underlying tensors +inline std::vector NestedTensor_get_sizes( + const NestedTensorImpl* self_ptr) { + int64_t ntensors = self_ptr->size(0); + std::vector sizes(ntensors); + if (ntensors == 0) { + return sizes; + } + const Tensor& sizemat = self_ptr->get_nested_sizes(); + int64_t orig_dim = sizemat.size(1); + // nesting scalars has empty sizes + if (orig_dim == 0) { + return sizes; + } + const int64_t* sizemat_ptr = sizemat.data_ptr(); + + for (const auto i : c10::irange(ntensors)) { + sizes[i] = IntArrayRef(sizemat_ptr, sizemat_ptr + orig_dim); + sizemat_ptr += orig_dim; + } + return sizes; +} + +TORCH_API std::vector NestedTensor_get_max_size( + const NestedTensorImpl& nt); + +std::vector NestedTensor_get_max_size_from_size_tensor( + const Tensor& sizes); + +inline std::vector NestedTensor_get_sizes(const at::Tensor& self) { + const NestedTensorImpl* self_ptr = get_nested_tensor_impl(self); + return NestedTensor_get_sizes(self_ptr); +} +// The strides of the underlying tensors +inline std::vector NestedTensor_get_strides( + const NestedTensorImpl* self_ptr) { + int64_t ntensors = self_ptr->size(0); + std::vector strides(ntensors); + if (ntensors == 0) { + return strides; + } + const Tensor& stridemat = self_ptr->get_nested_strides(); + int64_t orig_dim = stridemat.size(1); + // nesting scalars has empty strides + if (orig_dim == 0) { + return strides; + } + const int64_t* stridemat_ptr = stridemat.data_ptr(); + for (const auto i : c10::irange(ntensors)) { + strides[i] = IntArrayRef(stridemat_ptr, stridemat_ptr + orig_dim); + stridemat_ptr += orig_dim; + } + return strides; +} + +inline std::vector NestedTensor_get_strides( + const at::Tensor& self) { + const NestedTensorImpl* self_ptr = get_nested_tensor_impl(self); + return NestedTensor_get_strides(self_ptr); +} + +inline void check_numel_equals_buffer_size(const at::Tensor& self) { + auto self_impl = get_nested_tensor_impl(self); + TORCH_CHECK( + self.numel() == static_cast(self_impl->get_buffer_size()), + "Number of elements in nested tensor must match number of elements in buffer."); +} + +inline void check_numel_equals_buffer_size(const NestedTensorImpl* self_ptr) { + TORCH_CHECK( + self_ptr->numel() == static_cast(self_ptr->get_buffer_size()), + "Number of elements in nested tensor must match number of elements in buffer."); +} +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Data structures and functions for generically applying a function on a nested +// tensor. 
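A minimal standalone sketch (not from the vendored NestedTensorUtils.h itself) of the metadata layout the helpers above rely on: per-constituent shapes live as rows of a flat 2D size matrix, and `NestedTensor_get_sizes` simply walks that matrix row by row. Plain `std::vector` stands in for the ATen tensor types here; the `sizemat`/`orig_dim` names mirror the header but the values are made up for illustration.

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Flattened "size matrix": one row per constituent tensor, one column per dim.
  // Here: three constituents with shapes (2, 3), (4, 3) and (1, 3).
  const int64_t ntensors = 3, orig_dim = 2;
  std::vector<int64_t> sizemat = {2, 3, 4, 3, 1, 3};

  const int64_t* ptr = sizemat.data();
  for (int64_t i = 0; i < ntensors; ++i) {
    // Each row holds one constituent's sizes, analogous to the per-row views
    // built in NestedTensor_get_sizes (and NestedTensor_get_strides).
    std::cout << "tensor " << i << ": (" << ptr[0] << ", " << ptr[1] << ")\n";
    ptr += orig_dim;
  }
}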
+namespace impl { + +template +struct NestedNode { + NestedNode() = delete; + explicit NestedNode(std::vector&& children) + : _is_leaf(false), _children(children) {} + explicit NestedNode(TensorList children) + : _is_leaf(false), _children(children.vec()) {} + // NestedNode(NestedNode&) = delete; + // NestedNode(const NestedNode&) = delete; + // NestedNode& operator=(NestedNode) = delete; + explicit NestedNode(T payload) : _is_leaf(true), _payload(std::move(payload)) {} + inline bool is_leaf() const { + return _is_leaf; + } + inline size_t degree() const { + return _children.size(); + } + inline const std::vector unbind() const { + return _children; + } + inline T children(size_t i) const { + return _children[i]; + } + inline const T& payload() const { + return _payload; + } + inline T& payload() { + return _payload; + } + + private: + bool _is_leaf; + std::vector _children; + T _payload; +}; + +using TensorNode = NestedNode; + +template +class _map; + +template +class _map> { + public: + static A function_one(F&& fn, const Args&... nested_node) { + return std::forward(fn)(nested_node...); + } + // NOTE: We must move F to avoid copying objects if it is a lambda with + // captures. + static NestedNode function( + F&& fn, + const NestedNode&... nested_node) { + size_t degree = 0; + bool all_leaf = true; + c10::guts::tuple_map( + std::forward_as_tuple(nested_node...), [&all_leaf, °ree](auto n) { + all_leaf = all_leaf && (n.is_leaf()); + if (degree > 1 && n.degree() > 1) { + TORCH_CHECK( + degree == n.degree(), "NestedNodes must match in degree."); + } + if (n.degree() > degree) { + degree = n.degree(); + } + return nullptr; + }); + // All NestedNodes just wrap regular objects. + if (all_leaf) { + return NestedNode(std::forward(fn)(nested_node.payload()...)); + } + // Some NestedNodes wrap regular Tensors, some NestedTensors and some other + // types. + std::vector result; + for (size_t i = 0; i < degree; i++) { + std::tuple children = c10::guts::tuple_map( + std::forward_as_tuple(nested_node...), [&i](auto a) { + static_assert( + c10::guts::is_instantiation_of::value, + "Internal error."); + // Broadcast regular arguments across NestedTensor constituents. + // This could be a Tensor, integer or anything else really. + if (a.is_leaf()) { + return a.payload(); + } + // Broadcast NestedTensors with one constituent. + if (a.degree() == 1 && !a.is_leaf()) { + return a.children(0); + } + TORCH_CHECK(a.degree() > 0, "Internal assert."); + return a.children(i); + }); + c10::guts::apply( + [&result, &fn](Args... filtered) { + result.emplace_back(function_one(std::forward(fn), filtered...)); + }, + std::move(children)); + } + return NestedNode(std::move(result)); + } +}; + +// TODO: Add static assert to verify lambda arguments match nested_node types +template +static inline NestedNode< + typename c10::guts::infer_function_traits::type::return_type> +map(F&& fn, const NestedNode&... 
nested_node) { + return _map< + F, + typename c10::guts::infer_function_traits::type::return_type, + typename c10::guts::infer_function_traits::type::parameter_types>:: + function(std::forward(fn), nested_node...); +} + +inline TensorNode get_nested_tensor_structure(at::Tensor tensor) { + if (get_nested_tensor_impl_or_null(tensor) == nullptr) { + return TensorNode(std::move(tensor)); + } + return TensorNode(tensor.unbind()); +} + +inline Tensor wrap_tensor_node( + TensorNode tensor_node, + c10::optional dtype, + c10::optional layout, + c10::optional device, + c10::optional pin_memory) { + TORCH_CHECK( + !tensor_node.is_leaf(), "Expected TensorNode to wrap a list of Tensors."); + TensorOptions options_ = + TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory( + pin_memory); + if (tensor_node.degree() == 0) { + return wrap_buffer(ones({0}, dtype, layout, device), ones({})); + } + + // Fast path: if all tensors are on CPU, have contiguous memory, and the same + // dtype, copying can be done much faster. + bool all_tensors_cpu = true; + bool all_tensors_contiguous = true; + bool all_tensors_same_dtype = true; + auto first_dtype = tensor_node.children(0).dtype(); + std::vector start_offsets(tensor_node.degree()); + start_offsets[0] = 0; + long total_size = 0; + for (const auto i : c10::irange(tensor_node.degree())) { + all_tensors_cpu = all_tensors_cpu && tensor_node.children(i).is_cpu(); + all_tensors_contiguous = + all_tensors_contiguous && tensor_node.children(i).is_contiguous(); + all_tensors_same_dtype = all_tensors_same_dtype && + (first_dtype == tensor_node.children(i).dtype()); + if (!(all_tensors_cpu && all_tensors_contiguous && + all_tensors_same_dtype)) { + break; + } + if (i > 0) { + start_offsets[i] = + start_offsets[i - 1] + tensor_node.children(i - 1).numel(); + } + total_size += tensor_node.children(i).numel(); + } + + TensorOptions options; + Tensor nt_buffer, nt_sizes; + if (all_tensors_cpu && all_tensors_contiguous && all_tensors_same_dtype) { + nt_buffer = at::empty({total_size}, tensor_node.children(0).options()); + nt_sizes = at::empty( + {static_cast(tensor_node.degree()), + static_cast(tensor_node.children(0).sizes().size())}, + TensorOptions().dtype(kLong)); + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( + at::ScalarType::Half, + at::ScalarType::Bool, + at::ScalarType::BFloat16, + c10::typeMetaToScalarType(first_dtype), + "create_nt_buffer", + [&]() { + at::parallel_for( + 0, tensor_node.degree(), 1, [&](int64_t begin, int64_t end) { + for (int64_t i = begin; i < end; ++i) { + // Only try copying memory if there is more than 0 elements + // for a certain tensor + if (tensor_node.children(i).numel() > 0) { + memcpy( + nt_buffer.mutable_data_ptr() + start_offsets[i], + tensor_node.children(i).const_data_ptr(), + tensor_node.children(i).numel() * sizeof(scalar_t)); + } + } + }); + }); + long sizes_offset = 0; + for (size_t i = 0; i < tensor_node.degree(); ++i) { + auto tensor_sizes = tensor_node.children(i).sizes(); + for (int64_t tensor_size : tensor_sizes) { + nt_sizes.mutable_data_ptr()[sizes_offset++] = tensor_size; + } + } + options = nt_buffer.options().merge_in(options_); + } else { // Slow path + std::vector flat_tensors; + std::vector sizes; + for (const auto i : c10::irange(tensor_node.degree())) { + flat_tensors.push_back(tensor_node.children(i).reshape(-1).contiguous()); + sizes.push_back( + tensor(c10::IntArrayRef(tensor_node.children(i).sizes()))); + } + options = flat_tensors[0].options().merge_in(options_); + nt_buffer = 
at::cat(flat_tensors); + nt_sizes = at::native::stack(sizes); + } + + return wrap_buffer(nt_buffer.to(options), nt_sizes); +} + +} // namespace impl + +// This function is meant to ease rapid operator coverage for +// NestedTensor kernels. It is not meant to be efficient. Use it judiciously. +template +inline at::Tensor map_nested_tensor(F&& fn, A... a) { + return wrap_tensor_node( + impl::map(std::forward(fn), impl::get_nested_tensor_structure(a)...), + c10::nullopt, + c10::nullopt, + c10::nullopt, + c10::nullopt); +} + +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/AffineQuantizer.h b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/AffineQuantizer.h new file mode 100644 index 0000000000000000000000000000000000000000..141bf5c6e10eb992358d5cf474306eba1cb5327c --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/AffineQuantizer.h @@ -0,0 +1,130 @@ +#pragma once + +#include +#include +#include +#include + +namespace at { +namespace native { + +Tensor& quantize_tensor_per_tensor_affine( + const Tensor& rtensor, + Tensor& qtensor, + double scale, + int64_t zero_point); +Tensor& quantize_tensor_per_channel_affine( + const Tensor& rtensor, + Tensor& qtensor, + Tensor scales, + Tensor zero_points, + int64_t axis); + +Tensor& quantize_tensor_per_channel_float_qparams( + const Tensor& rtensor, + Tensor& qtensor, + Tensor scales, + Tensor zero_points, + int64_t axis); + +Tensor& dequantize_tensor_per_tensor_affine( + const Tensor& qtensor, + Tensor& rtensor, + double scale, + int64_t zero_point); +Tensor& dequantize_tensor_per_channel_affine( + const Tensor& qtensor, + Tensor& rtensor, + Tensor scales, + Tensor zero_points, + int64_t axis); +Tensor& dequantize_tensor_per_channel_float_qparams( + const Tensor& qtensor, + Tensor& rtensor, + Tensor scales, + Tensor zero_points, + int64_t axis); + +using quantize_tensor_per_tensor_affine_fn = + void (*)(const Tensor& rtensor, Tensor& qtensor, double scale, int64_t zero_point); + +using quantize_tensor_per_channel_affine_fn = void (*)( + const Tensor& rtensor, + Tensor& qtensor, + const Tensor& scales, + const Tensor& zero_points, + int64_t axis); + +using quantize_tensor_per_channel_float_qparams_fn = void (*)( + const Tensor& rtensor, + Tensor& qtensor, + const Tensor& scales, + const Tensor& zero_points, + int64_t axis); + +using dequantize_tensor_per_tensor_affine_fn = + void (*)(const Tensor& qtensor, Tensor& rtensor, double scale, int64_t zero_point); + +using dequantize_tensor_per_channel_affine_fn = void (*)( + const Tensor& qtensor, + Tensor& rtensor, + const Tensor& scales, + const Tensor& zero_points, + int64_t axis); + +using dequantize_tensor_per_channel_float_qparams_fn = void (*)( + const Tensor& qtensor, + Tensor& rtensor, + const Tensor& scales, + const Tensor& zero_points, + int64_t axis); + +using quantize_tensor_per_tensor_affine_sub_byte_fn = + void (*)(const Tensor& rtensor, Tensor& qtensor, float scale, float zero_point); + +using dequantize_tensor_per_tensor_affine_sub_byte_fn = + void (*)(const Tensor& qtensor, Tensor& rtensor, float scale, float zero_point); + +DECLARE_DISPATCH( + quantize_tensor_per_tensor_affine_fn, + quantize_tensor_per_tensor_affine_stub); +DECLARE_DISPATCH( + quantize_tensor_per_channel_affine_fn, + quantize_tensor_per_channel_affine_stub); +DECLARE_DISPATCH( + quantize_tensor_per_channel_float_qparams_fn, + quantize_tensor_per_channel_float_qparams_stub); + +DECLARE_DISPATCH( + 
dequantize_tensor_per_tensor_affine_fn, + dequantize_tensor_per_tensor_affine_stub); +DECLARE_DISPATCH( + dequantize_tensor_per_channel_affine_fn, + dequantize_tensor_per_channel_affine_stub); +DECLARE_DISPATCH( + dequantize_tensor_per_channel_float_qparams_fn, + dequantize_tensor_per_channel_float_qparams_stub); + +DECLARE_DISPATCH( + quantize_tensor_per_tensor_affine_sub_byte_fn, + quantize_tensor_per_tensor_affine_sub_byte_stub); + +DECLARE_DISPATCH( + dequantize_tensor_per_tensor_affine_sub_byte_fn, + dequantize_tensor_per_tensor_affine_sub_byte_stub); + +template +TORCH_API Tensor quantize_tensor( + Tensor rtensor, + Tensor qtensor, + double scale, + int64_t zero_point); +template +TORCH_API Tensor dequantize_tensor( + Tensor qtensor, + Tensor rtensor, + double scale, + int64_t zero_point); + +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/AffineQuantizerBase.h b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/AffineQuantizerBase.h new file mode 100644 index 0000000000000000000000000000000000000000..67cb7a7c451a587c566f0e5be6949c745e5d8b68 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/AffineQuantizerBase.h @@ -0,0 +1,47 @@ +#pragma once +#include +#include + +namespace at { +namespace native { + +// Quantize a float value into a uint value given scale and zero_point +template +TORCH_API T quantize_val(double scale, int64_t zero_point, float value); +// TODO combine this with quantize_val once the numerics for ARM are aligned +// with it +template +T quantize_val_arm( + const float scale, + const int32_t zero_point, + const float value); +template +void quantize_vec( + double scale, + int64_t zero_point, + const float* src, + T* dst, + size_t count = 8); +template +TORCH_API float dequantize_val(double scale, int64_t zero_point, T value); +template +TORCH_API float dequantize_vec( + double scale, + int64_t zero_point, + const T* src, + float* dst, + size_t count = 8); +template +TORCH_API DST_T requantize_val(double, int64_t, double, int64_t, SRC_T src); + +// Given a multiplier and a zero_point, requantize int32_t computed values back +// to quantized values. 
See comment above +// make_per_tensor_affine_quantizer function for the usage of int64_t +template +TORCH_API DST_T +requantize_from_int(double multiplier, int64_t zero_point, int64_t src); + +int quantize_val_float_qparams(float scale, float zero_point, float value, int qmin, int qmax); + +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/ConvUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/ConvUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..679777f8d65862e79fb8f3deb6aaa3b2d5c9428e --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/ConvUtils.h @@ -0,0 +1,62 @@ +#pragma once +#include +#include + +namespace at::native::quantized { +namespace { +// MakeConvOutputShape used from both CPU and CUDA libraries +// and exporting symbol from torch_cpu would probably take more storage +// than duplicating implementation which likely be inlined away +template +at::SmallVector MakeConvOutputShape( + int N, // mini-batch + int M, // output channels + const std::array& input_image_shape, + const std::vector& kernel, + const torch::List& stride, + const torch::List& padding, + const torch::List& dilation); + +#if defined(USE_CUDA) || defined(USE_PYTORCH_QNNPACK) +template <> +at::SmallVector MakeConvOutputShape<2>( + int N, // mini-batch + int M, // output channels + const std::array& input_image_shape, + const std::vector& kernel, + const at::List& stride, + const at::List& padding, + const at::List& dilation) { + const int H = input_image_shape[0]; + const int W = input_image_shape[1]; + const int64_t Y_H = + (H + 2 * padding[0] - dilation[0] * (kernel[0] - 1) - 1) / stride[0] + 1; + const int64_t Y_W = + (W + 2 * padding[1] - dilation[1] * (kernel[1] - 1) - 1) / stride[1] + 1; + return {N, M, Y_H, Y_W}; +} + +template <> +at::SmallVector MakeConvOutputShape<3>( + int N, // mini-batch + int M, // output channels + const std::array& input_image_shape, + const std::vector& kernel, + const at::List& stride, + const at::List& padding, + const torch::List& dilation) { + const int D = input_image_shape[0]; + const int H = input_image_shape[1]; + const int W = input_image_shape[2]; + const int64_t Y_D = + (D + 2 * padding[0] - dilation[0] * (kernel[0] - 1) - 1) / stride[0] + 1; + const int64_t Y_H = + (H + 2 * padding[1] - dilation[1] * (kernel[1] - 1) - 1) / stride[1] + 1; + const int64_t Y_W = + (W + 2 * padding[2] - dilation[2] * (kernel[2] - 1) - 1) / stride[2] + 1; + return {N, M, Y_D, Y_H, Y_W}; +} + +#endif +} // anonymous namespace +} // namespace at::native::quantized diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/Copy.h b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/Copy.h new file mode 100644 index 0000000000000000000000000000000000000000..9c611251db31ec8ad3c5f9cb6d843a81ba886cbf --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/Copy.h @@ -0,0 +1,10 @@ +#pragma once + +#include + +namespace at { +namespace native { + +Tensor& quantized_copy_from_float_(Tensor& self, const Tensor& src); +} +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/FakeQuantAffine.h b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/FakeQuantAffine.h new file mode 100644 index 0000000000000000000000000000000000000000..2b5c36415346489ff17d9731b4b3eca09212a50e --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/FakeQuantAffine.h @@ -0,0 +1,67 @@ 
+#pragma once + +#include +#include +#include + +namespace at { + +struct TensorIterator; + +namespace native { + +using fake_quant_tensor_cachemask_fn = void (*)( + Tensor& output, + Tensor& mask, + const Tensor& input, + float sc, + int64_t z_point, + int64_t quant_min, + int64_t quant_max); + +using fake_quant_tensor_cachemask_tensor_qparams_fn = void (*)( + Tensor& output, + Tensor& mask, + const Tensor& input, + const Tensor& sc, + const Tensor& z_point, + const Tensor& fake_quant_enabled, + int64_t quant_min, + int64_t quant_max); + +using fake_quant_learnable_grad_tensor_fn = void (*)( + TensorIterator& iter, + float scale, + float inv_scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + float grad_factor); + +DECLARE_DISPATCH(fake_quant_tensor_cachemask_fn, fake_quant_tensor_cachemask_stub); +DECLARE_DISPATCH(fake_quant_tensor_cachemask_tensor_qparams_fn, fake_quant_tensor_cachemask_tensor_qparams_stub); +DECLARE_DISPATCH(fake_quant_learnable_grad_tensor_fn, fake_quant_grad_learnable_tensor_stub); + +using fake_quant_per_channel_fn = void (*)( + TensorIterator &iter, + int64_t quant_min, + int64_t quant_max); + +using fake_quant_per_channel_cachemask_fn = void (*)( + TensorIterator &iter, + TensorIterator &iter_mask, + int64_t quant_min, + int64_t quant_max); + +DECLARE_DISPATCH(fake_quant_per_channel_cachemask_fn, fake_quant_per_channel_cachemask_stub); + +using fake_quant_learnable_per_channel_fn = void (*)( + TensorIterator &iter, + int64_t quant_min, + int64_t quant_max, + float grad_factor); + +DECLARE_DISPATCH(fake_quant_learnable_per_channel_fn, fake_quant_grad_learnable_channel_stub); + +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/IndexKernel.h b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/IndexKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..8338878cc208ddb61fd9b66ef39ec1ba3236762c --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/IndexKernel.h @@ -0,0 +1,14 @@ +#pragma once +#include + +namespace at { +namespace native { +using masked_fill_kernel_quantized_fn = void(*)(TensorIterator& iter, const Scalar& value, double scale, int zero_point); +using index_put_kernel_quantized_fn = void(*)(TensorIterator& iter, IntArrayRef index_size, IntArrayRef index_stride, bool accumulate, double scale, int zero_point); + +DECLARE_DISPATCH(masked_fill_kernel_quantized_fn, masked_fill_kernel_quantized_stub); +DECLARE_DISPATCH(index_put_kernel_quantized_fn, index_put_kernel_quantized_stub); + + +} // native +} // at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/PackedParams.h b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/PackedParams.h new file mode 100644 index 0000000000000000000000000000000000000000..5800a3d8fff18953192da734920cdd837c113b73 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/PackedParams.h @@ -0,0 +1,147 @@ +#pragma once + +#include +#include + +struct LinearPackedParamsBase : public torch::jit::CustomClassHolder { + virtual at::Tensor apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) = 0; + virtual at::Tensor apply_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point) = 0; + + // out variant of LinearPackedParamsBase::apply + virtual at::Tensor& apply_out( + const at::Tensor& /*input*/, + double /*output_scale*/, + int64_t /*output_zero_point*/, + at::Tensor& output) { + throw 
std::runtime_error( + "apply_out is not implemented for this packed " + "parameter type"); + return output; + } + + virtual at::Tensor& apply_relu_out( + const at::Tensor& /*input*/, + double /*output_scale*/, + int64_t /*output_zero_point*/, + at::Tensor& output) { + throw std::runtime_error( + "apply_relu_out is not implemented for this packed " + "parameter type"); + return output; + } + + // Corresponding pattern (the ops with `*` are part of the pattern that + // represents the computation of quantized::linear_with_input_q_dq_qweight_dq_output_fp32): + // input -> q* -> dq* -> linear* -> + // qweight -> dq* / + // + // After fusion: + // input -> quantized::linear_with_input_q_dq_qweight_dq_output_fp32* -> + // qweight / + // + // Additional Note: the weight is packed as well + // Params: + // X: float32 Tensor, will be quantized to quint8 in the op + // W_prepack: packed qint8 quantized weight and bias + // Returns: + // Y: float32 Tensor + virtual at::Tensor apply_with_input_q_dq_qweight_dq_output_fp32( + at::Tensor input, + double input_scale, + int64_t input_zero_point) { + throw std::runtime_error( + "apply_with_input_q_dq_qweight_dq_output_fp32 is not implemented for this packed " + "parameter type"); + return {}; + } + + // Corresponding pattern (the ops with `*` are part of the pattern that + // represents the computation of quantized::linear_with_input_q_dq_qweight_dq_relu_output_fp32): + // input -> q* -> dq* -> linear* -> relu* -> + // qweight -> dq* / + // + // After fusion: + // input -> quantized::linear_with_input_q_dq_qweight_dq_relu_output_fp32* -> + // qweight / + // + // Additional Note: the weight is packed as well + // Params: + // input: float32 Tensor, will be quantized to quint8 in the op + // Returns: + // float32 Tensor + virtual at::Tensor apply_with_input_q_dq_qweight_dq_relu_output_fp32( + at::Tensor input, + double input_scale, + int64_t input_zero_point) { + throw std::runtime_error( + "apply_with_input_q_dq_qweight_dq_relu_output_fp32 is not implemented for this packed " + "parameter type"); + return {}; + } + + virtual at::Tensor apply_dynamic( + at::Tensor input, + bool reduce_range = false) = 0; + virtual at::Tensor apply_dynamic_relu( + at::Tensor input, + bool reduce_range = false) = 0; + + virtual at::Tensor& apply_dynamic_out( + const at::Tensor& /* input */, + at::Tensor& output, + bool /* reduce_range */) { + throw std::runtime_error( + "apply_dynamic_out is not implemented for this packed " + "parameter type"); + return output; + } + virtual at::Tensor& apply_dynamic_relu_out( + const at::Tensor& /* input */, + at::Tensor& output, + bool /* reduce_range */) { + throw std::runtime_error( + "apply_dynamic_relu_out is not implemented for this packed " + "parameter type"); + return output; + } + + virtual std::tuple> unpack() = 0; + + virtual c10::optional bias() = 0; + + virtual void set_bias(c10::optional /*bias*/) { + throw std::runtime_error( + "set_bias is not implemented for this packed " + "parameter type"); + } +}; + +template +struct ConvPackedParamsBase : public torch::jit::CustomClassHolder { + virtual at::Tensor apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) = 0; + virtual at::Tensor apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) = 0; + virtual at::Tensor apply_dynamic( + const at::Tensor& input, + bool reduce_range) = 0; + + virtual std::tuple> unpack() = 0; + + virtual torch::List stride() const = 0; + virtual torch::List padding() const = 0; + virtual 
torch::List output_padding() const = 0; + virtual torch::List dilation() const = 0; + virtual int64_t groups() const = 0; + virtual bool transpose() const = 0; +}; diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/BinaryOps.h b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/BinaryOps.h new file mode 100644 index 0000000000000000000000000000000000000000..8e34516a9ab73887d606c7baf6886d17819fb9d6 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/BinaryOps.h @@ -0,0 +1,8 @@ +#include + +namespace at { +namespace native { +TORCH_API Tensor +quantized_add(Tensor qa, Tensor qb, double scale, int64_t zero_point); +} +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/EmbeddingPackedParams.h b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/EmbeddingPackedParams.h new file mode 100644 index 0000000000000000000000000000000000000000..c24760a459ab7289f950758eea0588915a2707cb --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/EmbeddingPackedParams.h @@ -0,0 +1,29 @@ +#pragma once + +#include +#include + +struct EmbeddingPackedParamsBase : public torch::jit::CustomClassHolder { + virtual at::Tensor embeddingbag_byte( + const at::Tensor& indices, + const c10::optional& offsets, + bool pruned_weights, + const c10::optional& per_sample_weights_, + const c10::optional& compressed_indices_mapping, + bool include_last_offset, + bool is_embedding_op) = 0; + + virtual at::Tensor embeddingbag_4bit( + const at::Tensor& indices, + const c10::optional& offsets, + bool pruned_weights, + const c10::optional& per_sample_weights_, + const c10::optional& compressed_indices_mapping, + bool include_last_offset, + bool is_embedding_op) = 0; + + virtual at::Tensor unpack() = 0; + + virtual int64_t bit_rate() const = 0; + virtual int64_t version() const = 0; +}; diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/OnednnUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/OnednnUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..f726926f7390bd37512d0bd03620235aa7cf883b --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/OnednnUtils.h @@ -0,0 +1,445 @@ +#pragma once + +#include +#if AT_MKLDNN_ENABLED() +#include +#include +#include +#include + +#include + +using PrimitiveCacheKey = std::tuple< + double, // input_scale + int64_t, // input_zero_point + std::vector, // input_shape + double, // output_scale + int64_t, // output_zero_point + int64_t, // OMP_number_of_threads + double, // accum_scale + int64_t>; // accum_zero_point + +enum CacheKeyIndex { + InputScale, + InputZeroPoint, + InputShape, + OutputScale, + OutputZeroPoint, + NumOfThreads, +}; + +// Base class of primitive cache +struct PrimitiveCache { + PrimitiveCacheKey key; + + bool hit(const PrimitiveCacheKey& key) { + return this->key == key; + } +}; + +using LinearParams = ideep::matmul_forward_params; +using Conv = dnnl::convolution_forward; +using ConvDesc = dnnl::convolution_forward::primitive_desc; +using ConvParams = ideep::convolution_forward_params; +using Deconv = dnnl::deconvolution_forward; +using DeconvDesc = dnnl::deconvolution_forward::primitive_desc; +using DeconvParams = ideep::deconv_forward_params; + +struct LinearPrimitiveCache : PrimitiveCache { + LinearPrimitiveCache() {} + + LinearPrimitiveCache( + const PrimitiveCacheKey& key, + const LinearParams& param) { + this->key = key; + 
this->param = param; + } + + LinearParams param; + + // For dynamic qlinear, scale and zero point + // are set at execution time. So we only need to compare + // the rest part of key. + bool hit_dynamic(const PrimitiveCacheKey& new_key) { + auto cached_input_shape = std::get(this->key); + auto new_input_shape = std::get(new_key); + return ( + cached_input_shape == new_input_shape && + std::get(this->key) == std::get(new_key)); + } + + LinearParams& get_param() { + return param; + } +}; + +struct ConvPrimitiveCache : PrimitiveCache { + ConvPrimitiveCache() {} + + ConvPrimitiveCache( + const PrimitiveCacheKey& key, + const ConvParams& params) { + this->key = key; + this->params = params; + } + + ConvParams params; + + ConvParams& get_params() { + return params; + } +}; + +struct DeconvPrimitiveCache : PrimitiveCache { + DeconvPrimitiveCache() {} + + DeconvPrimitiveCache( + const PrimitiveCacheKey& key, + const DeconvParams& params) { + this->key = key; + this->params = params; + } + + DeconvParams params; + + DeconvParams& get_params() { + return params; + } +}; + +enum PostOps { + NoPostOp, + Relu, + LeakyRelu, + Tanh, + Gelu +}; + +static std::unordered_map POST_OP_TABLE = { + {"none", NoPostOp}, + {"relu", Relu}, + {"leaky_relu", LeakyRelu}, + {"tanh", Tanh}, + {"gelu", Gelu} +}; + +struct PackedLinearWeightsOnednn : public LinearPackedParamsBase { + PackedLinearWeightsOnednn( + std::unique_ptr weight, + c10::optional bias, + at::Tensor orig_weight, + c10::optional orig_bias) + : weight_(std::move(weight)), + bias_(std::move(bias)), + orig_weight_(std::move(orig_weight)), + orig_bias_(std::move(orig_bias)) { + cache_initialized_flag = std::make_unique(); + } + std::unique_ptr weight_; + c10::optional bias_; + at::Tensor orig_weight_; + c10::optional orig_bias_; + + at::Tensor apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + at::Tensor apply_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic(at::Tensor input, bool reduce_range=false) override; + at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range=false) override; + + at::Tensor apply_leaky_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point, + double negative_slope); + + at::Tensor apply_tanh( + at::Tensor input, + double output_scale, + int64_t output_zero_point); + + std::tuple> unpack() override; + + c10::optional bias() override { + return orig_bias_; + } + + static c10::intrusive_ptr prepack( + at::Tensor weight, + c10::optional bias); + + private: + LinearPrimitiveCache prim_cache; + std::unique_ptr cache_initialized_flag; + + template + at::Tensor apply_impl( + at::Tensor input, + double output_scale, + int64_t output_zero_point, + torch::List post_op_args = torch::List()); + + template + at::Tensor apply_dynamic_impl(at::Tensor input, bool reduce_range=false); + + LinearPrimitiveCache& get_cache() { + return prim_cache; + } +}; + +template +struct PackedConvWeightsOnednn : public ConvPackedParamsBase { + PackedConvWeightsOnednn( + std::unique_ptr weight, + c10::optional bias, + at::Tensor orig_weight, + c10::optional orig_bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + uint8_t transpose) + : weight_(std::move(weight)), + bias_(std::move(bias)), + orig_weight_(std::move(orig_weight)), + orig_bias_(std::move(orig_bias)), + stride_(std::move(stride)), + padding_(std::move(padding)), + 
output_padding_(std::move(output_padding)), + dilation_(std::move(dilation)), + groups_(groups), + transpose_(transpose) { + cache_initialized_flag = std::make_unique(); + } + + std::unique_ptr weight_; + c10::optional bias_; + at::Tensor orig_weight_; + c10::optional orig_bias_; + torch::List stride_; + torch::List padding_; + torch::List output_padding_; + torch::List dilation_; + int64_t groups_; + uint8_t transpose_; + + at::Tensor apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic( + const at::Tensor& input, + bool reduce_range) override; + + at::Tensor apply_add( + const at::Tensor& input, + const at::Tensor& accum, + double output_scale, + int64_t output_zero_point); + + at::Tensor apply_add_relu( + const at::Tensor& input, + const at::Tensor& accum, + double output_scale, + int64_t output_zero_point); + + std::tuple> unpack() override; + + static c10::intrusive_ptr> prepack( + at::Tensor weight, + c10::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose); + + torch::List stride() const override { + return stride_; + } + + torch::List padding() const override { + return padding_; + } + + torch::List output_padding() const override { + return output_padding_; + } + + torch::List dilation() const override { + return dilation_; + } + + int64_t groups() const override { + return groups_; + } + + bool transpose() const override { + return (bool)transpose_; + } + + private: + ConvPrimitiveCache conv_prim_cache; + DeconvPrimitiveCache deconv_prim_cache; + std::unique_ptr cache_initialized_flag; + + template + at::Tensor apply_impl( + const at::Tensor& input, + const c10::optional& accum, + double output_scale, + int64_t output_zero_point); + + ConvPrimitiveCache& get_conv_cache() { + assert(!transpose()); + return conv_prim_cache; + } + + DeconvPrimitiveCache& get_deconv_cache() { + assert(transpose()); + return deconv_prim_cache; + } +}; + +namespace onednn_utils { + +static ideep::attr_t create_attr_by_post_op( + const std::string& post_op_name, + const torch::List>& post_op_args, + const dnnl::algorithm post_algorithm) { + using ideep::tensor; + PostOps post_op = POST_OP_TABLE[post_op_name]; + if (post_op == Relu) { + return ideep::attr_t::fuse_relu(); + } else if (post_op == LeakyRelu) { + return ideep::attr_t::fuse_relu_v2(/*alpha=*/post_op_args[0].value().to()); + } else if (post_op == Tanh) { + return ideep::attr_t::fuse_tanh(); + } else if (post_op == Gelu) { + return ideep::attr_t::fuse_gelu_v2(0.f, 0.f, post_algorithm); + } + return ideep::attr_t(); +} + +// Try to reorder tensor to expected desc at runtime +// Do it in a `try...catch...` manner to avoid oneDNN's errors +// TODO: Move it to third_party/ideep +static void try_reorder( + ideep::tensor& t, + const ideep::tensor::desc&& desc, + ideep::scale_t scales) { + if (t.get_desc() != desc) { + try { + t = t.reorder_if_differ_in(desc); + } catch (...) { + ideep::tensor&& plain = t.to_public(nullptr, t.get_data_type()); + t = plain.reorder_if_differ_in(desc); + } + t.set_scale(scales); + } +} + +// ONEDNN requires symmetric quantization of weight +// Use this util function to check. 
+static bool is_weight_symmetric_quant( + const at::Tensor& weight, + bool is_transposed_conv) { + bool is_symmetric = true; + const auto qtype = weight.qscheme(); + if (qtype == c10::kPerTensorAffine) { + is_symmetric &= (weight.q_zero_point() == 0); + } else if (qtype == c10::kPerChannelAffine) { + if (is_transposed_conv) { + // This case is currently not supported in PyTorch + // but we do not want to raise an error in this util function. + is_symmetric = false; + } else { + auto output_channels = weight.size(0); + for (int i = 0; i < output_channels; ++i) { + auto zp = weight.q_per_channel_zero_points()[i].item(); + is_symmetric &= (zp == 0); + } + } + } else { + // This case is currently not supported in PyTorch + // but we do not want to raise an error in this util function. + is_symmetric = false; + } + return is_symmetric; +} + +// When qengine is x86, use this util func to check if onednn kernel +// is preferred than fbgemm's to get better performance. +static bool should_use_onednn_quant( + const at::Tensor& weight, + bool is_transposed_conv, + int groups, + torch::List output_padding) { + // Performance of onednn is only validated on Linux right now. + // Also, the heuristics for dispatching are based on perf data on Linux. + // So, for x86 qengine, we always use fbgemm kernels if OS is not Linux. + // TODO Support more OSs. +#if !defined(__linux__) + return false; +#else + bool vnni_available = cpuinfo_has_x86_avx512vnni(); + bool w_sym_quant = + is_weight_symmetric_quant(weight, is_transposed_conv); + bool opad_all_zero = + std::all_of(output_padding.begin(), output_padding.end(), [](int i) { return i==0; }); + return vnni_available && (groups <= 100) && w_sym_quant && opad_all_zero; +#endif +} + +} // onednn_utils + +at::Tensor _qconv_prepack_onednn( + at::Tensor weight, // from CPU backend instead of QuantizedCPU + at::Tensor weight_scales, // Weight zero points must be 0 for onednn + double input_scale, + int64_t input_zero_point, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + c10::optional> input_shape=c10::nullopt); + +static at::Tensor _quantized_convolution_onednn( + at::Tensor act, // contains quantized values but not QTensor + double act_scale, + int64_t act_zero_point, + at::Tensor weight, // MKLDNN tensor with quantized values + at::Tensor weight_scales, + at::Tensor weight_zero_points, + c10::optional bias, // Bias is packed if not None + torch::List stride, + torch::List padding, + torch::List dilation, + bool transposed, + int64_t groups, + double inv_output_scale, + int64_t output_zero_point, + c10::optional accum=c10::nullopt, // accum to fused with conv add + double accum_scale=1.0, + int64_t accum_zero_point=0, + bool fp32_output=false, + c10::optional binary_attr=c10::nullopt, + c10::optional binary_alpha=c10::nullopt, + c10::optional unary_attr=c10::nullopt, + torch::List> unary_scalars=torch::List>(), + c10::optional unary_algorithm=c10::nullopt); + +#endif // #if AT_MKLDNN_ENABLED() diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/QnnpackUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/QnnpackUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..c1429338b6c07c1f4c3877bbc638398415fe0be2 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/QnnpackUtils.h @@ -0,0 +1,527 @@ +#pragma once + +#ifdef USE_PYTORCH_QNNPACK +#include +#include +#include +#include +#include +#include +#include + +#ifndef 
AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +#include +inline int kPaddingChannels = 8; +struct QnnpackOperatorDeleter { + void operator()(pytorch_qnnp_operator_t op) { + pytorch_qnnp_delete_operator(op); + } +}; + +// PackedWeight struct for QNNPACK stores the original Weight and Bias as +// QNNPACK currently does not support an unpack function. +// For PyTorch Mobile, once the model is scripted and serialized we don't need +// to call unpack, so we can save some memory by checking for this case and free +// the original weights after packing. +// Input scale is set to null in pre-pack step. QNNPACK needs bias quantized +// with input scale which is available at runtime in pytorch. During runtime if +// input scale value changes then we requantize bias with the updated scale. For +// inference we expect the graph to be static so the input scale should not +// change across consecutive inference calls. +struct PackedLinearWeightsQnnp : public LinearPackedParamsBase { + PackedLinearWeightsQnnp( + std::unique_ptr w, + at::Tensor orig_weight, + at::Tensor bias, + c10::optional input_scale, + at::Tensor w_scales, + std::vector&& w_zps) + : w(std::move(w)), + orig_weight(std::move(orig_weight)), + bias_(at::native::mobile::allocate_padded_contiguous_if_needed( + bias, bias.suggest_memory_format())), + per_channel_(this->orig_weight.qscheme() == at::kPerChannelAffine), + input_scale(std::move(input_scale)), + w_scales(std::move(w_scales)), + w_zero_points(std::move(w_zps)), + q_scheme(this->orig_weight.qscheme()) { + weight_sizes = this->orig_weight.sizes().vec(); + } + + std::unique_ptr w; + at::Tensor orig_weight; + at::Tensor bias_; + bool per_channel_; + c10::optional input_scale; + at::Tensor w_scales; + std::vector w_zero_points; + std::vector requantization_scales; + std::vector weight_sizes; + c10::QScheme q_scheme; + + at::Tensor apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + at::Tensor apply_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic(at::Tensor input, bool reduce_range=false) override; + at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range=false) override; + + std::tuple> unpack() override; + + c10::optional bias() override { + return bias_; + } + + static c10::intrusive_ptr prepack( + at::Tensor weight, + c10::optional bias); + + bool per_channel() const { + return per_channel_; + } + + private: + std::mutex qnnp_mutex_; + +#ifdef USE_XNNPACK + xnnpack_operator xnnp_linear_op; + + template + at::Tensor apply_impl_xnnp( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); +#endif // USE_XNNPACK + + template + at::Tensor apply_impl( + at::Tensor input, + double output_scale, + int64_t output_zero_point); + + template + at::Tensor apply_dynamic_impl(at::Tensor input, bool reduce_range); +}; + +template +struct PackedConvWeightsQnnp : public ConvPackedParamsBase { + PackedConvWeightsQnnp( + std::unique_ptr w, + at::Tensor orig_weight, + at::Tensor bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose, + c10::optional input_scale, + std::vector kernel, + at::Tensor w_scale, + std::vector&& w_zps, + bool is_per_channel) + : w(std::move(w)), + orig_weight(std::move(orig_weight)), + bias(std::move(bias)), + stride_(std::move(stride)), + padding_(std::move(padding)), + output_padding_(std::move(output_padding)), + 
dilation_(std::move(dilation)), + groups_(groups), + transpose_(transpose), + is_per_channel_(is_per_channel), + input_scale(input_scale), + kernel_(std::move(kernel)), + w_scales(std::move(w_scale)), + w_zero_points(std::move(w_zps)) { + const bool any_padding = std::any_of( + padding_.begin(), padding_.end(), [](const auto& e) { return e != 0; }); + const size_t kernel_size = + std::accumulate(kernel_.begin(), kernel_.end(), 1, std::multiplies<>()); + + const size_t group_input_channels = transpose + ? this->orig_weight.size(0) / groups + : this->orig_weight.size(1); + const size_t group_output_channels = transpose + ? this->orig_weight.size(1) + : this->orig_weight.size(0) / groups; + + const size_t kernel_depth = kSpatialDim == 3 ? kernel_[0] : 1; + const size_t kernel_height = kernel_[kSpatialDim - 2]; + const size_t kernel_width = kernel_[kSpatialDim - 1]; + + pytorch_qnnp_ukernel_type ukernel_type; + if (transpose_) { + ukernel_type = pytorch_qnnp_ukernel_type_conv; + } else { + ukernel_type = pytorch_qnnp_ukernel_type_none; + + const bool has_depthwise_dimensions = + (kSpatialDim == 2 && + ((kernel_height == 3 && kernel_width == 3) || + (kernel_height == 5 && kernel_width == 5))) || + (kSpatialDim == 3 && kernel_height == 3 && kernel_width == 3 && + kernel_depth == 3); + const bool has_depthwise_grouping = + group_input_channels == 1 && group_output_channels == 1 && groups > 1; + + if (has_depthwise_dimensions && has_depthwise_grouping) { + ukernel_type = pytorch_qnnp_ukernel_type_dwconv; + } else if ( + kernel_size == 1 && + std::all_of( + stride_.begin(), + stride_.end(), + [](const auto& e) { return e == 1; }) && + !any_padding) { + ukernel_type = group_input_channels >= SIZE_MAX + ? pytorch_qnnp_ukernel_type_xzp_gemm + : pytorch_qnnp_ukernel_type_gemm; + } else { + ukernel_type = pytorch_qnnp_ukernel_type_conv; + } + } + + if (is_per_channel && ukernel_type == pytorch_qnnp_ukernel_type_xzp_gemm) { + TORCH_INTERNAL_ASSERT( + false, "Per channel quantized weights are not supported for XZP kernels"); + } + + pytorch_qnnp_operator_t convolution{nullptr}; + // Initially all the params are set to zero. + convolution = static_cast( + calloc(1, sizeof(struct pytorch_qnnp_operator))); + if (convolution == nullptr) { + TORCH_INTERNAL_ASSERT( + false, "failed to allocate %zu bytes for pytorch_qnnp_operator structure", + sizeof(struct pytorch_qnnp_operator)); + } + + convolution_op = + std::unique_ptr( + convolution); + + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + convolution->ukernel_type = ukernel_type; + convolution->groups = groups; + convolution->group_input_channels = group_input_channels; + convolution->group_output_channels = group_output_channels; + convolution->kernel_depth = kernel_depth; + convolution->kernel_height = kernel_height; + convolution->kernel_width = kernel_width; + convolution->stride_depth = kSpatialDim == 3 ? stride_[0] : 1; + convolution->stride_height = stride_[kSpatialDim - 2]; + convolution->stride_width = stride_[kSpatialDim - 1]; + convolution->dilation_depth = kSpatialDim == 3 ? dilation_[0] : 1; + convolution->dilation_height = dilation_[kSpatialDim - 2]; + convolution->dilation_width = dilation_[kSpatialDim - 1]; + convolution->input_padding_height = padding_[kSpatialDim - 2]; + convolution->input_padding_width = padding_[kSpatialDim - 1]; + convolution->input_padding_depth = kSpatialDim == 3 ? 
padding_[0] : 0; + convolution->per_channel = is_per_channel_; + convolution->transpose = transpose_; + + const uint32_t kr = pytorch_qnnp_params.q8conv.kr; + const size_t k_stride = (group_input_channels + (kr - 1)) & -kr; + + size_t zero_size = sizeof(uint8_t) * k_stride; + size_t zero_offset = 0; + + if (transpose_) { + convolution->adjustment_width = output_padding_[1]; + convolution->adjustment_height = output_padding_[0]; + if (group_input_channels < 8) { + zero_size += 8; + zero_offset = 8; + } + } else { + zero_buffer_size = 0; + if (any_padding) { + zero_size = 0; + zero_offset = 0; + if (ukernel_type == pytorch_qnnp_ukernel_type_dwconv) { + const uint32_t cr = pytorch_qnnp_params.q8dw9.cr; + const size_t group_stride = (groups + (cr - 1)) & -cr; + if (groups >= 8) { + zero_size = sizeof(uint8_t) * group_stride; + zero_offset = 0; + } else { + zero_size = sizeof(uint8_t) * group_stride + 8; + zero_offset = sizeof(uint8_t) * 8; + } + } else if ( + ukernel_type == pytorch_qnnp_ukernel_type_conv || + ukernel_type == pytorch_qnnp_ukernel_type_gemm) { + if (group_input_channels >= 8) { + zero_size = sizeof(uint8_t) * k_stride; + zero_offset = 0; + } else { + zero_size = sizeof(uint8_t) * k_stride + 8; + zero_offset = 8; + } + } + } + } + + // NOLINTNEXTLINE(clang-analyzer-optin.portability.UnixAPI) + void* zero_buffer = malloc(zero_size); + if (zero_buffer == nullptr) { + pytorch_qnnp_delete_operator(convolution); + TORCH_INTERNAL_ASSERT( + false, "failed to allocate %zu bytes for zero padding", + zero_size); + } + // Need to set to input zero point + // memset(zero_buffer, input_zero_point, zero_size); + zero_buffer_size = zero_size; + convolution->zero_buffer = zero_buffer; + convolution->zero_pointer = (void*)((uintptr_t)zero_buffer + zero_offset); + } + + std::unique_ptr convolution_op; + #ifdef USE_XNNPACK + xnnpack_operator xnnp_convolution_op; + #endif // USE_XNNPACK + std::unique_ptr w; + at::Tensor orig_weight; + at::Tensor bias; + torch::List stride_; + torch::List padding_; + torch::List output_padding_; + torch::List dilation_; + int64_t groups_; + bool transpose_; + bool is_per_channel_; + c10::optional input_scale; + std::vector kernel_; + at::Tensor w_scales; + std::vector w_zero_points; + std::vector requantization_scales; + size_t zero_buffer_size; + + at::Tensor apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic( + const at::Tensor& input, + bool reduce_range=false) override; + + std::tuple> unpack() override; + + static c10::intrusive_ptr> prepack( + at::Tensor weight, + c10::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose); + + torch::List stride() const override { + return stride_; + } + + torch::List padding() const override { + return padding_; + } + + torch::List output_padding() const override { + return output_padding_; + } + + torch::List dilation() const override { + return dilation_; + } + + int64_t groups() const override { + return groups_; + } + + bool transpose() const override { + return transpose_; + } + + bool per_channel() const { + return is_per_channel_; + } + + private: + std::mutex qnnp_mutex_; + template + at::Tensor apply_impl( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); + +#ifdef USE_XNNPACK + template + at::Tensor 
apply_impl_xnnp( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); +#endif // USE_XNNPACK +}; + +enum class Activation : uint8_t { NONE = 0, RELU = 1 }; + +#if defined(__ANDROID__) && !defined(__NDK_MAJOR__) +template +inline float Round(const float x) { + return ::nearbyintf(x); +} +inline double Round(const double x) { + return ::nearbyint(x); +} +#else +template +inline T Round(const T x) { + return std::nearbyint(x); +} +#endif + +template +inline T QuantizeValue(float scale, int32_t zero_point, float value) { + const int32_t qmin = std::numeric_limits::min(); + const int32_t qmax = std::numeric_limits::max(); + auto r = zero_point + static_cast(Round(value / scale)); + r = std::max(r, qmin); + r = std::min(r, qmax); + return static_cast(r); +} + +template +inline std::pair activationLimits( + float scale, + int32_t zero_point, + Activation Ac) { + switch (Ac) { + case Activation::NONE: + return {std::numeric_limits::min(), + std::numeric_limits::max()}; + case Activation::RELU: + return {QuantizeValue(scale, zero_point, 0.0), + std::numeric_limits::max()}; + default: +#ifdef _MSC_VER + __assume(0); +#else + __builtin_unreachable(); +#endif + } +} + +namespace at { +namespace native { +namespace qnnp_avgpool_helper { +Tensor qnnpack_avg_pool2d( + Tensor input, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + bool ceil_mode, + bool count_include_pad, + c10::optional divisor_override); +} // qnnp_avgpool_helper +} // namespace native +} // namespace at + +namespace { +C10_UNUSED std::vector generate_requantization_scales( + const at::Tensor& weight_scales, + const float input_scale, + const float output_scale, + std::vector& requant_scales) { + // Since weight scale is allocated with padding + // weight_scales.numel() gives us padded num elements. + const auto num_output_channels_padded = weight_scales.numel(); + float *const weight_scales_data = weight_scales.data_ptr(); + if (static_cast(requant_scales.size()) < num_output_channels_padded) { + requant_scales.resize(num_output_channels_padded); + } + for (const auto i : c10::irange(num_output_channels_padded)) { + const auto inverse_output_scale = 1.f /output_scale; + requant_scales[i] = (weight_scales_data[i] * input_scale) * inverse_output_scale; + TORCH_CHECK( + (requant_scales[i] > 0.0f && std::isnormal(requant_scales[i])), + "failed to create op with requantization scale: ", + requant_scales[i], + ": requantization scale must be finite and positive"); + } + return requant_scales; +} + +C10_UNUSED std::pair, at::Tensor> make_zero_points_and_scales_tensor( + const at::Tensor& weight_contig, + bool transpose = false, + uint32_t groups = 1 + ) { + const int out_ch_idx = transpose ? 1 : 0; + const auto num_output_channels = weight_contig.size(out_ch_idx) * (transpose ? groups : 1); + // Add 8 to account for bufferring needed by QNNPACK. + const auto num_output_channels_padded = num_output_channels + kPaddingChannels; + const auto qtype = weight_contig.qscheme(); + std::vector weight_zp(num_output_channels_padded, 0); + // Adjust weight zero point, similar to weight data. 
+ if (qtype == at::kPerTensorAffine) { + for (const auto i : c10::irange(num_output_channels)) { + weight_zp[i] = (uint8_t)(weight_contig.q_zero_point() + 128); + } + } else if (qtype == at::kPerChannelAffine) { + TORCH_CHECK( + weight_contig.q_per_channel_zero_points().scalar_type() == at::kLong, + "Per channel zero points dtype must be long int."); + const int64_t* per_channel_zero_points = + weight_contig.q_per_channel_zero_points().data_ptr(); + for (const auto i : c10::irange(num_output_channels)) { + weight_zp[i] = (uint8_t)(per_channel_zero_points[i] + 128); + } + } else { + TORCH_INTERNAL_ASSERT(false, "Unsupported quantization scheme."); + } + at:: Tensor weight_scales = + at::empty( + {num_output_channels_padded}, + at::device(at::kCPU).dtype(at::kFloat)); + float *const weight_scales_data = weight_scales.data_ptr(); + if (qtype == at::kPerTensorAffine) { + for (const auto i : c10::irange(num_output_channels)) { + weight_scales_data[i] = weight_contig.q_scale(); + } + } else if (qtype == at::kPerChannelAffine) { + TORCH_CHECK( + weight_contig.q_per_channel_scales().scalar_type() == at::kDouble, + "Per channel scales dtype must be double."); + const double *const per_channel_scales = + weight_contig.q_per_channel_scales().data_ptr(); + for (const auto i : c10::irange(num_output_channels)) { + weight_scales_data[i] = static_cast(per_channel_scales[i]); + } + } else { + TORCH_INTERNAL_ASSERT(false, "Unsupported quantization scheme."); + } + for (const auto i : c10::irange(num_output_channels, num_output_channels_padded)) { + weight_scales_data[i] = 1.f; + } + return {weight_zp, weight_scales}; +} +} // namespace + +#endif diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/QuantUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/QuantUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..4499bb73694cde5e422d1a3c7dd15cbbd17697d4 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/QuantUtils.h @@ -0,0 +1,239 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#endif + +namespace quant_utils { +namespace { + float RawUint16ToFp16(unsigned short value) { + // Convert raw 16 bits half precision floating point number + // to single precision floating point number. + const unsigned short sign_bits = value >> 15; + const unsigned short exponent_bits = value >> 10 & 0x1f; + const unsigned short significand_bits = value & 0x3ff; + + const float sign = sign_bits ? -1 : 1; + const float significand = + 1 + significand_bits * 0.0009765625f; // 0.0009765625f = 0x1p-10 = 2^-10; + const float exponent = exponent_bits - 0xf; + + return sign * std::ldexp(significand, exponent); +} + +template +bool CheckAndSaturate(T max_val, T* element) { + if (*element > max_val) { + *element = max_val; + return true; + } + if (*element < -max_val) { + *element = -max_val; + return true; + } + return false; +} +} +using namespace std; +// A structure to hold quantization parameters 'scale' and 'zero_point'. +// The meaning of these values is as the constants in the quantization equation +// +// real_value = scale * (quantized_value - zero_point) +// +// In other words, 'zero_point' is the quantized value that corresponds +// to the real value 0, and 'scale' is the difference of real values +// corresponding to consecutive quantized values. 
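// For example (illustrative numbers only): with scale = 0.1 and
// zero_point = 30, the quantized value 42 represents
// real_value = 0.1 * (42 - 30) = 1.2, and the real value 0 is represented
// exactly by the quantized value 30.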
+struct TensorQuantizationParams { + double scale; + std::int32_t zero_point; + int precision; +}; + +// Use fp16_min as the small scale cutoff because we don't want to use scales in +// fp16 subnormal range. This is to be consistent with Glow and FakeLowP +// implementation for NNPI. +constexpr float SMALL_SCALE_THRESHOLD = 6.1e-5f; + +// Following implementation should be identical to fbgemm::ChooseQuantizationParams +inline TensorQuantizationParams ChooseQuantizationParams( + float min, + float max, + int32_t qmin, + int32_t qmax, + bool preserve_sparsity = false, + bool force_scale_power_of_two = false, + bool reduce_range = false) { + TORCH_CHECK( + min <= max, + "In ChooseQuantizationParams, min should be less than or equal to max"); + + if (reduce_range) { + qmin = qmin/2; + qmax = qmax/2; + } + if (min < 0 && max > 0 && preserve_sparsity) { + int symmetric_qmin = -((qmax - qmin) / 2 + 1); + int symmetric_qmax = (qmax - qmin) / 2; + double max_scale = + std::max(fabs(min / symmetric_qmin), fabs(max / symmetric_qmax)); + min = max_scale * symmetric_qmin; + max = max_scale * symmetric_qmax; + } + + // We extend the [min, max] interval to ensure that it contains 0. + // Otherwise, we would not meet the requirement that 0 be an exactly + // representable value. + min = std::min(min, 0.f); + max = std::max(max, 0.f); + + TORCH_CHECK( + qmin < qmax, + "In ChooseQuantizationParams, qmin should be less than qmax"); + + // Use double precision for intermediate computation but use single precision + // in final number to reflect the actual number used during quantization. + double scale = (static_cast(max) - min) / (qmax - qmin); + // If scale is 0 or too small so its reciprocal is infinity, we arbitrary + // adjust the scale to 0.1 . We want to avoid scale's reciprocal being + // infinity because some of fbgemm code pre-computes scale's reciprocal to do + // multiplication instead of division in the time critical part of code. + if (float(scale) == 0.0f || std::isinf(1.0f / float(scale))) { + scale = 0.1; + } + TORCH_CHECK(scale > 0, "quantization scale should be > 0"); + + if (force_scale_power_of_two) { + if (scale < 1) { + scale = 1.0 / (1 << static_cast(floor(log(1.0 / scale) / log(2)))); + } else { + scale = 1 << static_cast(ceil(log(scale) / log(2))); + } + } + + // Cut off small scale + if (scale < SMALL_SCALE_THRESHOLD) { + float org_scale = scale; + scale = SMALL_SCALE_THRESHOLD; + // Adjust the min and max based on the new scale + if (min == 0.0f) { + max = SMALL_SCALE_THRESHOLD * (qmax - qmin); + } else if (max == 0.0f) { + min = -SMALL_SCALE_THRESHOLD * (qmax - qmin); + } else { + float amplifier = SMALL_SCALE_THRESHOLD / org_scale; + min *= amplifier; + max *= amplifier; + } + } + + // Zero-point computation. + // First the initial floating-point computation. The zero-point can be + // determined from solving an affine equation for any known pair + // (real value, corresponding quantized value). + // We know two such pairs: (rmin, qmin) and (rmax, qmax). + // The arithmetic error on the zero point computed from either pair + // will be roughly machine_epsilon * (sum of absolute values of terms) + // so we want to use the variant that adds the smaller terms. 
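// For instance (illustrative numbers): with min = -1.0f, max = 3.0f,
// qmin = 0 and qmax = 255, the scale is 4.0 / 255 (about 0.0157) and both
// candidate zero points evaluate to 63.75; the nudging step below rounds
// this to 64 so that the real value 0 stays exactly representable.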
+ double zero_point_from_min = qmin - min / static_cast(scale); + double zero_point_from_max = qmax - max / static_cast(scale); + double zero_point_from_min_error = + std::abs(qmin) - std::abs(min / static_cast(scale)); + double zero_point_from_max_error = + std::abs(qmax) - std::abs(max / static_cast(scale)); + double initial_zero_point = + zero_point_from_min_error < zero_point_from_max_error + ? zero_point_from_min + : zero_point_from_max; + + // for symmetric quantization (preserve_sparsity == true), we force zero_point + // to be a middle value between qmin and qmax. + // If either min or max is 0, then we just use 0 as zero_point. + if (min < 0 && max > 0 && preserve_sparsity) { + initial_zero_point = static_cast(qmin + qmax) / 2; + } + + // Now we need to nudge the zero point to be an integer + // (our zero points are integer, and this is motivated by the requirement + // to be able to represent the real value "0" exactly as a quantized value, + // which is required in multiple places, for example in Im2col with zero + // padding). + int32_t nudged_zero_point = 0; + if (initial_zero_point < qmin) { + nudged_zero_point = qmin; + } else if (initial_zero_point > qmax) { + nudged_zero_point = qmax; + } else { + nudged_zero_point = nearbyint(initial_zero_point); + } + + TensorQuantizationParams result; + result.scale = scale; + result.zero_point = nudged_zero_point; + return result; +} + +// This function helps to convert the Conv1D dimensions usable by the Conv2d op. +constexpr int64_t kConv1dSqueezeDim = 0; +static C10_UNUSED torch::List MakeArgForConv1d(const torch::List& arg, + int64_t base_value) { + TORCH_CHECK(!arg.empty(), "Argument must have elements."); + torch::List result({arg.get(0), base_value}); + if (arg.size() == 1) { + result[1] = arg.get(0); + } else { + result[1] = arg.get(1); + } + result[kConv1dSqueezeDim] = base_value; + return result; +} + +// The range for using FP16 quantization of weights requires that the elements +// should be in the range of [5.96e-8, 65504]. If it is out of range, then the +// number will be saturated to max or min representable values by FP16. +inline void HandleWeightsSaturation(int64_t N, float* weight) { + const float kFp16Max = RawUint16ToFp16(0x7BFF); + bool found_out_of_range = false; + for (const auto i : c10::irange(N)) { + bool saturate = CheckAndSaturate(kFp16Max, weight + i); + if (saturate) { + found_out_of_range = true; + } + } + if (found_out_of_range) { + TORCH_WARN("FOUND weight out of range "); + } +} + +// Util function for quantizing bias. 
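// The bias is quantized to int32 with zero_point = 0 and
// scale = weight_scale * input_scale (per output channel when the weights
// are per-channel quantized), so it can be added directly to the int32
// accumulator of the quantized kernels. For example (illustrative numbers),
// with weight_scale = 0.02 and input_scale = 0.5 a bias value of 1.0 is
// stored as round(1.0 / (0.02 * 0.5)) = 100.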
+inline at::Tensor QuantizeBias( + bool is_per_channel, + const at::Tensor& bias, + const at::Tensor& weight_contig, + double input_scale) { + at::Tensor qbias; + if (is_per_channel) { + auto bias_quant_scales = + weight_contig.q_per_channel_scales() * input_scale; + auto bias_zp = at::zeros(bias_quant_scales.sizes(), c10::kInt); + qbias = at::native::quantize_per_channel( + bias, bias_quant_scales, bias_zp, 0, c10::kQInt32); + } else { + qbias = at::native::quantize_per_tensor( + bias, weight_contig.q_scale() * input_scale, 0, c10::kQInt32); + } + return qbias; +} + +} // namespace quant_utils diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/QuantizedOps.h b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/QuantizedOps.h new file mode 100644 index 0000000000000000000000000000000000000000..15e442a15b01ed74981ebae504756f9b7a38aad7 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/QuantizedOps.h @@ -0,0 +1,258 @@ +#pragma once +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { + +using qrelu_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/); +using qrelu_leaky_fn = void (*)(Tensor& /*out*/, const Tensor& /*qx*/, + const Scalar& /*negval_*/); +using qgelu_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/, GeluType /* approximate */); +using qsigmoid_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/, double output_scale, int64_t output_zero_point); +using qhardsigmoid_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/); +using qclamp_fn = void (*)( + const at::Tensor& /*qx*/, + const Scalar& min, + const Scalar& max, + at::Tensor& /*qy*/); +using qclamp_minmax_fn = void (*)( + const at::Tensor& /*qx*/, + const Scalar& /*min or max*/, + at::Tensor& /*qy*/); +using qthreshold_fn = void (*)( + const at::Tensor& /*qx*/, + const Scalar& threshold, + const Scalar& value, + at::Tensor& /*qy*/); +using qtanh_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/); +using qelu_fn = void(*)( + const at::Tensor& /*qx*/, + const Scalar& /*alpha*/, + const Scalar& /*scale*/, + const Scalar& /*input_scale*/, + at::Tensor& /*qy*/); +using qbinary_fn = + void (*)(Tensor& /*out*/, const Tensor& /*self*/, const Tensor& /*other*/); +using qadd_scalar_fn = + void (*)(Tensor& /*out*/, const Tensor& /*self*/, const Scalar& other /*other*/); +using qhardswish_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/); +using qdropout_fn = void(*)( + const at::Tensor& /*qx*/, + const Scalar& /*p*/, + bool training /*training*/, + at::Tensor& /*qy*/); +using qmaxpool_2d_fn = void (*)( + const Tensor& qx, + int64_t iC, // input/output channels + int64_t iH, + int64_t iW, // input sizes + int64_t oH, + int64_t oW, // output sizes + int64_t kH, + int64_t kW, // kernel size + int64_t sH, + int64_t sW, // strides + int64_t pH, + int64_t pW, // padding + int64_t dH, + int64_t dW, // dilation + Tensor& qy); +using qmaxpool_3d_fn = void (*)( + const Tensor& qx, + int64_t iC, // input/output channels + int64_t iT, + int64_t iH, + int64_t iW, // input sizes + int64_t oT, + int64_t oH, + int64_t oW, // output sizes + int64_t kT, + int64_t kH, + int64_t kW, // kernel size + int64_t sT, + int64_t sH, + int64_t sW, // strides + int64_t pT, + int64_t pH, + int64_t pW, // padding + int64_t dT, + int64_t dH, + int64_t dW, // dilation + Tensor& qy); +using qadaptive_avg_pool2d_fn = void (*)( + const Tensor& qx, + Tensor& qy, + int64_t sizeB, + int64_t sizeC, + int64_t isizeH, 
+ int64_t isizeW, + int64_t osizeH, + int64_t osizeW, + int64_t istrideB, + int64_t istrideC, + int64_t istrideH, + int64_t istrideW); +using qadaptive_avg_pool3d_fn = void (*)( + const Tensor& qx, + Tensor& qy, + int64_t sizeB, + int64_t sizeC, + int64_t isizeD, + int64_t isizeH, + int64_t isizeW, + int64_t osizeD, + int64_t osizeH, + int64_t osizeW, + int64_t istrideB, + int64_t istrideC, + int64_t istrideD, + int64_t istrideH, + int64_t istrideW); +using qavg_pool2d_fn = void (*)( + const Tensor& qx, + Tensor& qy, + int64_t nBatch, + int64_t nInputPlane, + int64_t inputWidth, + int64_t inputHeight, + int64_t outputWidth, + int64_t outputHeight, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + bool count_include_pad, + c10::optional divisor_override); + +using qavg_pool3d_fn = void (*)( + const Tensor& qx, + Tensor& qy, + int64_t nBatch, + int64_t nInputPlane, + int64_t inputWidth, + int64_t inputHeight, + int64_t inputDepth, + int64_t outputWidth, + int64_t outputHeight, + int64_t outputDepth, + int kW, + int kH, + int kD, + int dW, + int dH, + int dD, + int padW, + int padH, + int padD, + bool count_include_pad, + c10::optional divisor_override); + +using qupsample_bilinear2d_fn = void (*)( + Tensor& output, + const Tensor& input, + int64_t input_height, + int64_t input_width, + int64_t output_height, + int64_t output_width, + int64_t nbatch, + int64_t channels, + bool align_corners, + c10::optional scales_h, + c10::optional scales_w); + +using qcat_nhwc_fn = Tensor (*)( + const MaterializedITensorListRef& qxs, + int64_t dim, + double scale, + int64_t zero_point); +using qtopk_fn = void(*)(Tensor&, Tensor&, const Tensor&, int64_t, int64_t, bool, bool); + +using qbatch_norm_fn = void(*)(int64_t, int64_t, int64_t, int64_t, int64_t, const Tensor&, const Tensor&, const Tensor&, Tensor&); + +using qnormalize_fn = void (*)( + const Tensor& /* X */, + const Tensor& /* gamma */, + const Tensor& /* beta */, + bool /* affine_per_channel */, + int /* num_channels */, + int /* num_groups */, + int64_t /* M */, + int64_t /* N */, + double /* eps */, + Tensor* /* Y */); + +using qmean_inner_dim_fn = void (*)( + const Tensor& /* X */, + OptionalIntArrayRef /* opt_dim */, + bool /* keepdim */, + c10::optional /* opt_dtype */, + Tensor& /* Y */); + +using qstd_inner_dim_fn = void (*)( + const Tensor& /* X */, + OptionalIntArrayRef /* dim */, + const c10::optional& /* correction */, + bool /* keepdim */, + Tensor& /* Y */); + +using qnormalize_nhwc_fn = void (*)( + const Tensor& /* X */, + const Tensor& /* gamma */, + const Tensor& /* beta */, + bool /* affine_per_channel */, + int /* num_channels */, + int /* num_groups */, + int64_t /* M */, + int64_t /* N */, + double /* eps */, + Tensor* /* Y */); + +using qprelu_fn = void (*)(Tensor& /*out*/, const Tensor& /*qx*/, + const Tensor& /*qw*/); + +DECLARE_DISPATCH(qadaptive_avg_pool2d_fn, qadaptive_avg_pool2d_nhwc_stub); +DECLARE_DISPATCH(qadaptive_avg_pool3d_fn, qadaptive_avg_pool3d_ndhwc_stub); +DECLARE_DISPATCH(qadd_scalar_fn, qadd_scalar_relu_stub); +DECLARE_DISPATCH(qadd_scalar_fn, qadd_scalar_stub); +DECLARE_DISPATCH(qavg_pool2d_fn, qavg_pool2d_nhwc_stub); +DECLARE_DISPATCH(qavg_pool3d_fn, qavg_pool3d_nhwc_stub); +DECLARE_DISPATCH(qbatch_norm_fn, qbatch_norm_relu_stub); +DECLARE_DISPATCH(qbatch_norm_fn, qbatch_norm_stub); +DECLARE_DISPATCH(qbinary_fn, qadd_relu_stub); +DECLARE_DISPATCH(qbinary_fn, qadd_stub); +DECLARE_DISPATCH(qbinary_fn, qmul_relu_stub); +DECLARE_DISPATCH(qbinary_fn, qmul_stub); +DECLARE_DISPATCH(qcat_nhwc_fn, 
qcat_nhwc_stub); +DECLARE_DISPATCH(qcat_nhwc_fn, qcat_relu_nhwc_stub); +DECLARE_DISPATCH(qclamp_fn, qclamp_stub); +DECLARE_DISPATCH(qclamp_minmax_fn, qclamp_min_stub); +DECLARE_DISPATCH(qclamp_minmax_fn, qclamp_max_stub); +DECLARE_DISPATCH(qelu_fn, qelu_stub); +DECLARE_DISPATCH(qhardsigmoid_fn, qhardsigmoid_stub); +DECLARE_DISPATCH(qhardswish_fn, qhardswish_stub); +DECLARE_DISPATCH(qdropout_fn, qdropout_stub); +DECLARE_DISPATCH(qmaxpool_2d_fn, qmaxpool_2d_nhwc_stub); +DECLARE_DISPATCH(qmaxpool_3d_fn, qmaxpool_3d_nthwc_stub); +DECLARE_DISPATCH(qnormalize_fn, quantized_normalize_stub); +DECLARE_DISPATCH(qnormalize_nhwc_fn, quantized_groupnorm_nhwc_stub); +DECLARE_DISPATCH(qrelu_fn, qrelu_stub); +DECLARE_DISPATCH(qrelu_leaky_fn, qrelu_leaky_stub); +DECLARE_DISPATCH(qgelu_fn, qgelu_stub); +DECLARE_DISPATCH(qsigmoid_fn, qsigmoid_stub); +DECLARE_DISPATCH(qtanh_fn, qtanh_stub); +DECLARE_DISPATCH(qthreshold_fn, qthreshold_stub); +DECLARE_DISPATCH(qtopk_fn, qtopk_stub); +DECLARE_DISPATCH(qupsample_bilinear2d_fn, qupsample_bilinear2d_nhwc_stub); +DECLARE_DISPATCH(qmean_inner_dim_fn, qmean_inner_dim_stub); +DECLARE_DISPATCH(qstd_inner_dim_fn, qstd_inner_dim_stub); +DECLARE_DISPATCH(qprelu_fn, qprelu_stub); + +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/RuyUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/RuyUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..ecfddcf449d5968f72b2d518914f980615183fe6 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/RuyUtils.h @@ -0,0 +1,21 @@ +#pragma once + +#ifdef USE_RUY_QMATMUL + +#include + +namespace at { +namespace native { +namespace ruy_utils { + +ruy::Context* get_ruy_context(); + +void quantize_multiplier(double scale, + int* multiplier_fixedpoint, + int* multiplier_exponent); + +} // namespace ruy_utils +} // namespace native +} // namespace + +#endif // USE_RUY_QMATMUL diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/XnnpackUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/XnnpackUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..6b0d6ab04ddac6cdf9dbda696e72bed4afd9c7ac --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/XnnpackUtils.h @@ -0,0 +1,335 @@ +#pragma once + +#ifdef USE_XNNPACK +#include + +#include +#include + +using xnnpack_operator = at::native::xnnpack::Operator; + +namespace at { +namespace native { +namespace xnnp_utils { + +/* + * Return shape in the same order as the memory format + * e.g. channels_last will return NHWC instead of NCHW + */ +std::vector get_mem_format_aware_shape(const at::Tensor& in); + +/* + * Input is always int8_t, output can be [int8_t, uint8_t]. + * input + offset = output + * int8_t + 128 = uint8_t + * int8_t + 0 = int8_t + */ +template +void q8_copy_int8_weight_and_add_offset(const at::Tensor& in, at::Tensor& out); + +template +Tensor convert_conv_weights_to_channel_last_tensor( + const at::Tensor& src, + int groups, + bool transpose); + +/* + * Series of create wrapper functions to call xnn_create_[de]conv* functions. 
+ */ +C10_ALWAYS_INLINE +enum xnn_status xnnp_create_convolution2d_nhwc( + uint32_t pad_top, + uint32_t pad_right, + uint32_t pad_bottom, + uint32_t pad_left, + uint32_t kernel_h, + uint32_t kernel_w, + uint32_t stride_h, + uint32_t stride_w, + uint32_t dilation_h, + uint32_t dilation_w, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + size_t ip_chan_stride, + size_t op_chan_stride, + int8_t izp, + float ip_scale, + int8_t kzp, + const float* k_scales, + const int8_t* kernel, + const int32_t* bias, + int8_t ozp, + float op_scale, + int8_t op_min, + int8_t op_max, + uint32_t flags, + xnn_operator_t* op, + bool per_channel, + bool transpose) { + /* Symmetric quantization forces kzp = 0 */ + TORCH_CHECK(!kzp, "XNNPACK Q[SC]8 conv kernels expects kernel zero point to be zero." + "But got: ", kzp); + + if (transpose) { + TORCH_CHECK(!per_channel, "XNNPACK Q[SC]8 does not have a per channel deconvolution!"); + return xnn_create_deconvolution2d_nhwc_qs8( + pad_top, /* uint32_t output_padding_top */ + pad_right, /* uint32_t output_padding_right */ + pad_bottom, /* uint32_t output_padding_bottom */ + pad_left, /* uint32_t output_padding_left */ + kernel_h, /* uint32_t kernel_height */ + kernel_w, /* uint32_t kernel_width */ + stride_h, /* uint32_t stride_height */ + stride_w, /* uint32_t stride_width */ + dilation_h, /* uint32_t dilation_height */ + dilation_w, /* uint32_t dilation_width */ + groups, /* uint32_t groups */ + group_input_channels, /* size_t group_input_channels */ + group_output_channels, /* size_t group_output_channels */ + ip_chan_stride, /* size_t input_pixel_stride */ + op_chan_stride, /* size_t output_pixel_stride */ + izp, /* int8_t input_zero_point */ + ip_scale, /* float input_scale */ + k_scales[0], /* float kernel_scale */ + kernel, /* const int8_t* kernel */ + bias, /* const int32_t* bias */ + ozp, /* int8_t output_zero_point */ + op_scale, /* float output_scale */ + op_min, /* int8_t output_min */ + op_max, /* int8_t output_max */ + flags, /* uint32_t flags */ + nullptr, /* xnn_caches_t caches */ + nullptr, /* xnn_weights_cache_t weights_cache */ + op); /* xnn_operator_t* deconvolution_op_out */ + + } + + if (!per_channel) { + return xnn_create_convolution2d_nhwc_qs8( + pad_top, /* uint32_t input_padding_top */ + pad_right, /* uint32_t input_padding_right */ + pad_bottom, /* uint32_t input_padding_bottom */ + pad_left, /* uint32_t input_padding_left */ + kernel_h, /* uint32_t kernel_height */ + kernel_w, /* uint32_t kernel_width */ + stride_h, /* uint32_t subsampling_height */ + stride_w, /* uint32_t subsampling_width */ + dilation_h, /* uint32_t dilation_height */ + dilation_w, /* uint32_t dilation_width */ + groups, /* uint32_t groups */ + group_input_channels, /* size_t group_input_channels */ + group_output_channels, /* size_t group_output_channels*/ + ip_chan_stride, /* size_t input_channel_stride */ + op_chan_stride, /* size_t output_channel_stride */ + izp, /* int8_t input_zero_point */ + ip_scale, /* float input_scale */ + k_scales[0], /* float kernel_scale */ + kernel, /* const int8_t* kernel */ + bias, /* const int32_t* bias */ + ozp, /* int8_t output_zero_point */ + op_scale, /* float output_scale */ + op_min, /* int8_t output_min */ + op_max, /* int8_t output_max */ + flags, /* uint32_t flags */ + nullptr, /* xnn_caches_t caches */ + nullptr, /* xnn_weights_cache_t weights_cache */ + op); /* xnn_operator_t* convolution_op_out */ + } else { /* per_channel */ + return xnn_create_convolution2d_nhwc_qs8_qc8w( + pad_top, /* 
uint32_t input_padding_top */ + pad_right, /* uint32_t input_padding_right */ + pad_bottom, /* uint32_t input_padding_bottom */ + pad_left, /* uint32_t input_padding_left */ + kernel_h, /* uint32_t kernel_height */ + kernel_w, /* uint32_t kernel_width */ + stride_h, /* uint32_t subsampling_height */ + stride_w, /* uint32_t subsampling_width */ + dilation_h, /* uint32_t dilation_height */ + dilation_w, /* uint32_t dilation_width */ + groups, /* uint32_t groups */ + group_input_channels, /* size_t group_input_channels */ + group_output_channels, /* size_t group_output_channels*/ + ip_chan_stride, /* size_t input_channel_stride */ + op_chan_stride, /* size_t output_channel_stride */ + izp, /* int8_t input_zero_point */ + ip_scale, /* float input_scale */ + k_scales, /* const float* kernel_scale */ + kernel, /* const int8_t* kernel */ + bias, /* const int32_t* bias */ + ozp, /* int8_t output_zero_point */ + op_scale, /* float output_scale */ + op_min, /* int8_t output_min */ + op_max, /* int8_t output_max */ + flags, /* uint32_t flags */ + nullptr, /* xnn_caches_t caches */ + nullptr, /* xnn_weights_cache_t weights_cache */ + op); /* xnn_operator_t* convolution_op_out */ + } +} + +/* + * Series of reshape wrapper functions to call xnn_reshape_[de]conv* functions. + */ +C10_ALWAYS_INLINE +enum xnn_status xnnp_reshape_convolution2d_nhwc( + xnn_operator_t op, + size_t batch, + size_t in_h, + size_t in_w, + pthreadpool_t pt_pool, + bool per_channel = false, + bool transpose = false, + uint32_t adj_h = 0, + uint32_t adj_w = 0) { + if(transpose) { + TORCH_CHECK(!per_channel, "XNNPACK Q[SC]8 does not have a per channel deconvolution!"); + return xnn_reshape_deconvolution2d_nhwc_qs8( + op, /* xnn_operator_t deconvolution_op */ + batch, /* size_t batch_size */ + in_h, /* size_t input_height */ + in_w, /* size_t input_width */ + adj_h, /* uint32_t adjustment_height */ + adj_w, /* uint32_t adjustment_width */ + nullptr, /* size_t* output_height_out */ + nullptr, /* size_t* output_width_out */ + pt_pool); /* pthreadpool_t threadpool */ + } + + size_t workspace_size = SIZE_MAX; + size_t workspace_alignment = SIZE_MAX; + + if (!per_channel) { + return xnn_reshape_convolution2d_nhwc_qs8( + op, /* xnn_operator_t convolution_op */ + batch, /* size_t batch_size */ + in_h, /* size_t input_height */ + in_w, /* size_t input_width */ + &workspace_size, /* size_t* workspace_size */ + &workspace_alignment, /* size_t* workspace_alignment */ + nullptr, /* size_t* output_height_out */ + nullptr, /* size_t* output_width_out */ + pt_pool); /* pthreadpool_t threadpool */ + } else { /* per_channel */ + return xnn_reshape_convolution2d_nhwc_qs8_qc8w( + op, /* xnn_operator_t convolution_op */ + batch, /* size_t batch_size */ + in_h, /* size_t input_height */ + in_w, /* size_t input_width */ + &workspace_size, /* size_t* workspace_size */ + &workspace_alignment, /* size_t* workspace_alignment */ + nullptr, /* size_t* output_height_out */ + nullptr, /* size_t* output_width_out */ + pt_pool); /* pthreadpool_t threadpool */ + } +} + + +/* + * Series of setup wrapper functions to call xnn_setup_[de]conv* functions. 
+ */ +C10_ALWAYS_INLINE +enum xnn_status xnnp_setup_convolution2d_nhwc( + xnn_operator_t op, + const int8_t* inp, + int8_t* outp, + bool per_channel = false, + bool transpose = false) { + if(transpose) { + TORCH_CHECK(!per_channel, "XNNPACK Q[SC]8 does not have a per channel deconvolution!"); + + return xnn_setup_deconvolution2d_nhwc_qs8( + op, /* xnn_operator_t deconvolution_op */ + inp, /* const int8_t* input */ + outp); /* int8_t* output */ + } + + if (!per_channel) { + return xnn_setup_convolution2d_nhwc_qs8( + op, /* xnn_operator_t deconvolution_op */ + nullptr, /* void workspace */ + inp, /* const int8_t* input */ + outp); /* int8_t* output */ + } else { /* per_channel */ + return xnn_setup_convolution2d_nhwc_qs8_qc8w( + op, /* xnn_operator_t deconvolution_op */ + nullptr, /* void workspace */ + inp, /* const int8_t* input */ + outp); /* int8_t* output */ + } +} + + +/* + * Series of wrapper functions to call xnn_create* and xnn_setup* + * functions for linear + */ +C10_ALWAYS_INLINE +enum xnn_status xnnp_create_fully_connected_nc( + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + int8_t input_zero_point, + float input_scale, + int8_t kernel_zero_point, + float kernel_scale, + const int8_t* kernel, + const int32_t* bias, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_operator_t* fully_connected_op_out) { + /* Symmetric quantization forces kzp = 0 */ + TORCH_CHECK(!kernel_zero_point, "XNNPACK QS8 linear kernel expects kernel zero point to be zero." + "But got: ", kernel_zero_point); + return xnn_create_fully_connected_nc_qs8( + input_channels, /* size_t input_channels */ + output_channels, /* size_t output_channels */ + input_stride, /* size_t input_stride */ + output_stride, /* size_t output_stride */ + input_zero_point, /* int8_t input_zero_point */ + input_scale, /* float input_scale */ + kernel_scale, /* float kernel_scale */ + kernel, /* const int8_t* kernel */ + bias, /* const int32_t* bias */ + output_zero_point, /* int8_t output_zero_point */ + output_scale, /* float output_scale */ + output_min, /* int8_t output_min */ + output_max, /* int8_t output_max */ + flags, /* uint32_t flags */ + nullptr, /* xnn_caches_t caches */ + nullptr, /* xnn_weights_cache_t */ + fully_connected_op_out); /* xnn_operator_t* fully_connected_op_out */ +} + +C10_ALWAYS_INLINE +enum xnn_status xnnp_reshape_fully_connected_nc( + xnn_operator_t fully_connected_op, + size_t batch_size, + pthreadpool_t threadpool) { + return xnn_reshape_fully_connected_nc_qs8( + fully_connected_op, /* xnn_operator_t fully_connected_op */ + batch_size, /* size_t batch_size */ + threadpool); /* pthreadpool_t threadpool */ +} + +C10_ALWAYS_INLINE +enum xnn_status xnnp_setup_fully_connected_nc( + xnn_operator_t fully_connected_op, + const int8_t* input, + int8_t* output) { + return xnn_setup_fully_connected_nc_qs8( + fully_connected_op, /* xnn_operator_t fully_connected_op */ + input, /* const int8_t* input */ + output /* int8_t* output */ + ); +} + +} // namespace xnnp_utils +} // namespace native +} // namespace at + +#endif // USE_XNNPACK diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/conv_serialization.h b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/conv_serialization.h new file mode 100644 index 0000000000000000000000000000000000000000..9af6c65af7716108586889bc5b2be0edbd32e138 --- /dev/null +++ 
b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/conv_serialization.h @@ -0,0 +1,414 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#if !defined(__s390x__) && !defined(__powerpc__) +#include +#endif + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + + +#include + +/* Convolution prepacked parameters serialization. + * + * Version 1 + * + * - Fields: + * 1. weight + * 2. bias + * 3. stride x kSpatialDim + * 4. padding x kSpatialDim + * 5. dilation x kSpatialDim + * 6. groups + * + * Version 2 + * + * - Fields: + * 0. version (string) + * 1. list of non-optional tensors + * 0: packed parameters (int16_t) + * - kSpatialDim + * - stride x kSpatialDim + * - padding x kSpatialDim + * - dilation x kSpatialDim + * - output_padding x kSpatialDim + * - groups + * - transpose (0 or 1) + * 1: weight + * 2. list of optional tensors + * 0: bias + * + * Version 3 + * + * - Fields: + * 0. version (int64_t) + * 1. list of int64_t configuration values + * - kSpatialDim + * - stride x kSpatialDim + * - padding x kSpatialDim + * - dilation x kSpatialDim + * - output_padding x kSpatialDim + * - groups + * - flags (bitmask) + * - (1 << 0) transpose (1 = yes) + * 2. list of optional tensors + * 0: None (helps with type inference) + * 1: weight (this must be present) + * 2: bias + */ + +using ConvParamsSerializationTypeV2 = std::tuple< + // version, for versions 2 and up + std::string, + // non-optional tensors + std::vector, + // optional tensors + std::vector>>; + +using ConvParamsSerializationTypeV3 = std::tuple< + // version, int for versions 3 and up + int64_t, + // configuration values + std::vector, + // optional tensors + std::vector>>; + +// Parses any historical conv packed params format into +// the current format. 
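// A minimal standalone sketch of the version-detection rule documented above.
// std::variant stands in for c10::IValue and an empty struct stands in for
// at::Tensor; both are assumptions made purely for illustration. The actual
// parser follows.
#include <cstdint>
#include <string>
#include <variant>

namespace conv_serialization_sketch {

struct TensorStandIn {};  // placeholder for at::Tensor
using FirstElement = std::variant<TensorStandIn, std::string, std::int64_t>;

// Mirrors the dispatch in parse_conv_serialized_state: a tensor as element 0
// means version 1, the string "2" means version 2, the integer 3 means
// version 3, and anything else is treated as unknown (-1).
inline int detect_version(const FirstElement& first) {
  if (std::holds_alternative<TensorStandIn>(first)) {
    return 1;
  }
  if (std::holds_alternative<std::string>(first) &&
      std::get<std::string>(first) == "2") {
    return 2;
  }
  if (std::holds_alternative<std::int64_t>(first) &&
      std::get<std::int64_t>(first) == 3) {
    return 3;
  }
  return -1;
}

} // namespace conv_serialization_sketch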
+template +ConvParamsSerializationTypeV3 parse_conv_serialized_state(c10::IValue v) { + + // determine the version based on IValue contents + int version = -1; + if (v.isTuple()) { + const auto& elements = v.toTupleRef().elements(); + if (!elements.empty()) { + auto firstElement = elements[0]; + if (firstElement.isTensor()) { + version = 1; + } else if (firstElement.isString()) { + const std::string& version_str = firstElement.toStringRef(); + // note: not parsing the string to automatically handle bad + // inputs + if (version_str == "2") { + version = 2; + } + } else if (firstElement.isInt()) { + auto raw_version = firstElement.toInt(); + if (raw_version == 3) { + version = 3; + } + } + } + } + TORCH_INTERNAL_ASSERT(version != -1, "Unable to parse serialization version"); + + if (version == 1) { + // version 1 - convert to version 3 manually + + const auto& elements = v.toTupleRef().elements(); + + at::Tensor weight = elements[0].toTensor(); + c10::optional bias = elements[1].toOptional(); + torch::List stride_x_kSpatialDim = elements[2].toTensorList(); + torch::List padding_x_kSpatialDim = elements[3].toTensorList(); + torch::List dilation_x_kSpatialDim = elements[4].toTensorList(); + at::Tensor groups = elements[5].toTensor(); + + std::vector config_vals; + config_vals.reserve( + stride_x_kSpatialDim.size() + padding_x_kSpatialDim.size() + + dilation_x_kSpatialDim.size() + kSpatialDim + 3); + config_vals.push_back(kSpatialDim); + for (const auto i : c10::irange(stride_x_kSpatialDim.size())) { + auto stride = stride_x_kSpatialDim.get(i); + config_vals.push_back(stride[0].item()); + } + for (const auto i : c10::irange(padding_x_kSpatialDim.size())) { + auto padding = padding_x_kSpatialDim.get(i); + config_vals.push_back(padding[0].item()); + } + for (const auto i : c10::irange(dilation_x_kSpatialDim.size())) { + auto dilation = dilation_x_kSpatialDim.get(i); + config_vals.push_back(dilation[0].item()); + } + // output_padding does not exist in v1, so we fill in a default value + for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) { + config_vals.push_back(0); + } + config_vals.push_back(groups[0].item()); + // transpose does not exist in v1, so we fill in a default value + config_vals.push_back(0); + + std::vector> tensors; + tensors.emplace_back(); + tensors.emplace_back(weight); + tensors.emplace_back(bias); + + int64_t version = 3; + return std::tie(version, config_vals, tensors); + } else if (version == 2) { + // version 2 + const auto& elements = v.toTupleRef().elements(); + std::vector non_optional = elements[1].toTensorList().vec(); + std::vector> optional; + + if (elements[2].isTensorList()) { + for (const auto& elem : elements[2].toTensorList()) { + optional.emplace_back(static_cast(elem)); + } + } else { + for (const auto& elem : elements[2].toList()) { + optional.emplace_back(static_cast(elem).toOptional()); + } + } + // create default optional value for bias + if (optional.empty()) { + optional.emplace_back(); + } + + auto config_a = non_optional[0].accessor(); + std::vector config_vals; + config_vals.reserve(config_a.size(0)); + for (const auto i : c10::irange(config_a.size(0))) { + config_vals.emplace_back(config_a[i]); + } + + auto weight = non_optional[1]; + auto bias = optional[0]; + + std::vector> tensors; + tensors.emplace_back(); + tensors.emplace_back(weight); + tensors.emplace_back(bias); + + int64_t version = 3; + return std::tie(version, config_vals, tensors); + } else if (version == 3) { + return v.to(); + } else { + TORCH_INTERNAL_ASSERT(false, "Unexpected 
serialized qconv version: ", + version); + } +} + +#define QCONV_SERIALIZATION_VERSION 2 + +#if QCONV_SERIALIZATION_VERSION == 2 +using ConvParamsSerializationType = ConvParamsSerializationTypeV2; + +template +ConvParamsSerializationTypeV2 serialize_conv( + const c10::intrusive_ptr>& params) { + + std::string version = "2"; + std::vector non_optional; + std::vector> optional; + + // create a packed int8_t tensor for conv params + std::vector params_vec; + params_vec.push_back(kSpatialDim); + auto stride = params->stride().vec(); + params_vec.insert(params_vec.end(), stride.begin(), stride.end()); + auto padding = params->padding().vec(); + params_vec.insert(params_vec.end(), padding.begin(), padding.end()); + auto dilation = params->dilation().vec(); + params_vec.insert(params_vec.end(), dilation.begin(), dilation.end()); + auto output_padding = params->output_padding().vec(); + params_vec.insert(params_vec.end(), output_padding.begin(), + output_padding.end()); + params_vec.push_back(params->groups()); + params_vec.push_back(params->transpose()); + int64_t vec_size = params_vec.size(); + at::Tensor params_tensor = at::from_blob( + params_vec.data(), {vec_size}, + at::TensorOptions().dtype(at::kShort)) + // clone to retain ownership of the data + .clone(); + + auto [weight, bias] = params->unpack(); + + non_optional.emplace_back(std::move(params_tensor)); + non_optional.emplace_back(std::move(weight)); + optional.emplace_back(std::move(bias)); + + return std::tie(version, non_optional, optional); +} + +#elif QCONV_SERIALIZATION_VERSION == 3 +using ConvParamsSerializationType = ConvParamsSerializationTypeV3; + +template +ConvParamsSerializationTypeV3 serialize_conv( + const c10::intrusive_ptr>& params) { + std::vector config_vals; + config_vals.push_back(kSpatialDim); + auto stride = params->stride().vec(); + config_vals.insert(config_vals.end(), stride.begin(), stride.end()); + auto padding = params->padding().vec(); + config_vals.insert(config_vals.end(), padding.begin(), padding.end()); + auto dilation = params->dilation().vec(); + config_vals.insert(config_vals.end(), dilation.begin(), dilation.end()); + auto output_padding = params->output_padding().vec(); + config_vals.insert(config_vals.end(), output_padding.begin(), + output_padding.end()); + config_vals.push_back(params->groups()); + config_vals.push_back(params->transpose()); + + auto [weight, bias] = params->unpack(); + + std::vector> tensors; + tensors.emplace_back(); + tensors.emplace_back(weight); + tensors.emplace_back(bias); + + int64_t version = 3; + return std::tie(version, config_vals, tensors); +} + +#else +#error "Invalid qconv serialization version." 
+#endif + +template +c10::intrusive_ptr> deserialize_conv( + ConvParamsSerializationTypeV3 state) { + auto [version, config_vals, tensors] = state; + TORCH_INTERNAL_ASSERT(version == 3, "Unexpected serialized qconv version: ", version); + + TORCH_CHECK(tensors.size() == 3, "Wrong number of tensors", tensors.size()); + c10::optional weight = tensors[1]; + c10::optional bias = tensors[2]; + TORCH_INTERNAL_ASSERT(weight, "Weight should always be present in serialized qconv."); + + torch::List stride, padding, output_padding, dilation; + // skip kSpatialDim + int idx = 1; + for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) { + stride.emplace_back(config_vals.at(idx)); + idx++; + } + for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) { + padding.emplace_back(config_vals.at(idx)); + idx++; + } + for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) { + dilation.emplace_back(config_vals.at(idx)); + idx++; + } + for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) { + TORCH_INTERNAL_ASSERT(idx < static_cast(config_vals.size()), + "Unexpected index = ", idx, " for config_vals of size ", + config_vals.size()); + output_padding.emplace_back(config_vals.at(idx)); + idx++; + } + int64_t groups = config_vals.at(idx); + idx++; + int64_t flags = config_vals.at(idx); + idx++; + TORCH_INTERNAL_ASSERT(idx == static_cast(config_vals.size()), + "Unexpected length of config_vals, expected ", + idx, + " got ", + config_vals.size()); + + bool transpose = flags & (1 << 0); + + int64_t other_flags = flags & ~(1 << 0); + TORCH_INTERNAL_ASSERT(other_flags == 0, "Unexpected flags set in ", flags, "."); + + auto& ctx = at::globalContext(); + +#ifdef USE_FBGEMM + if (ctx.qEngine() == at::QEngine::X86) { +#if AT_MKLDNN_ENABLED() + bool use_onednn = onednn_utils::should_use_onednn_quant( + weight.value(), transpose, groups, output_padding); + if (use_onednn) { + return PackedConvWeightsOnednn::prepack( + weight.value(), + bias, + stride, + padding, + output_padding, + dilation, + groups, + transpose + ); + } +#endif + return PackedConvWeight::prepack( + weight.value(), + bias, + stride, + padding, + output_padding, + dilation, + groups, + transpose + ); + } // x86 +#endif + +#ifdef USE_FBGEMM + if (ctx.qEngine() == at::QEngine::FBGEMM) { + return PackedConvWeight::prepack( + weight.value(), + bias, + stride, + padding, + output_padding, + dilation, + groups, + transpose + ); + } +#endif // USE_FBGEMM +#ifdef USE_PYTORCH_QNNPACK + if (ctx.qEngine() == at::QEngine::QNNPACK) { + TORCH_CHECK( + kSpatialDim == 2, + "prepack/__setstate__: QNNPACK only supports Conv2d " + "now."); + return PackedConvWeightsQnnp::prepack( + weight.value(), + bias, + stride, + padding, + output_padding, + dilation, + groups, + transpose + ); + } +#endif // USE_PYTORCH_QNNPACK +#if AT_MKLDNN_ENABLED() + if (ctx.qEngine() == at::QEngine::ONEDNN) { + return PackedConvWeightsOnednn::prepack( + weight.value(), + bias, + stride, + padding, + output_padding, + dilation, + groups, + transpose + ); + } +#endif // AT_MKLDNN_ENABLED() +TORCH_CHECK( + false, + "Didn't find engine for when deserializing ConvPackedParams: ", + toString(ctx.qEngine())); +} diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/fbgemm_utils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/fbgemm_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..d35336fde12173423f61a74735bfd5fb65d76377 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/fbgemm_utils.h @@ -0,0 +1,411 
@@ +#pragma once + +#include +#include +#include +#include +#include + +#ifdef USE_FBGEMM +#include +#include +#include + +// The struct for the packed weight matrix (PackBMatrix) and the corresponding +// column offsets used for the fully connect layer, which are both prepared in +// the prepacking step to save the computations in the inference. Note the +// column offsets include the sum of the B columns as well as the scalar term +// B_zero_point * K, whereas the row offsets created by +// PackAWithQuantRowOffset/PackAWithIm2Col/PackAWithRowOffset are only the sum +// of the A rows. The column offsets are needed for the asymmetric quantization +// (affine quantization) of input matrix. +// Note that in JIT mode we can think of a way to fuse col_offsets with bias. +struct TORCH_API PackedLinearWeight : public LinearPackedParamsBase { + PackedLinearWeight( + std::unique_ptr> w, + c10::optional bias, + std::vector col_offsets, + std::vector w_scale, + std::vector w_zp, + c10::QScheme q_scheme) + : w(std::move(w)), + bias_(std::move(bias)), + col_offsets(std::move(col_offsets)), + w_scale(std::move(w_scale)), + w_zp(std::move(w_zp)), + q_scheme(std::move(q_scheme)) {} + std::unique_ptr> w; + c10::optional bias_; + std::vector col_offsets; + std::vector w_scale; + std::vector w_zp; + c10::QScheme q_scheme; + + at::Tensor apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor& apply_out( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point, + at::Tensor& output) override; + + at::Tensor& apply_relu_out( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point, + at::Tensor& output) override; + + at::Tensor apply_with_input_q_dq_qweight_dq_output_fp32( + at::Tensor input, + double input_scale, + int64_t input_zero_point) override; + + at::Tensor apply_with_input_q_dq_qweight_dq_relu_output_fp32( + at::Tensor input, + double input_scale, + int64_t input_zero_point) override; + + at::Tensor apply_dynamic(at::Tensor input, bool reduce_range = false) + override; + + at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range = false) + override; + + std::tuple> unpack() override; + + c10::optional bias() override { + return bias_; + } + + static c10::intrusive_ptr prepack( + at::Tensor weight, + c10::optional bias); + + private: + template + at::Tensor& apply_impl( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point, + at::Tensor& output); + + template + at::Tensor apply_with_input_q_dq_qweight_dq_output_fp32_impl( + const at::Tensor& input, + double input_scale, + int64_t input_zero_point); + + template + at::Tensor apply_dynamic_impl(at::Tensor input, bool reduce_range = false); +}; + +struct TORCH_API PackedLinearWeightFp16 : public LinearPackedParamsBase { + PackedLinearWeightFp16( + std::unique_ptr w, + c10::optional bias) + : w(std::move(w)), bias_(std::move(bias)) {} + + std::unique_ptr w; + c10::optional bias_; + + at::Tensor apply( + at::Tensor /*input*/, + double /*output_scale*/, + int64_t /*output_zero_point*/) override { + TORCH_INTERNAL_ASSERT(false); + } + at::Tensor apply_relu( + at::Tensor /*input*/, + double /*output_scale*/, + int64_t /*output_zero_point*/) override { + TORCH_INTERNAL_ASSERT(false); + } + + at::Tensor apply_dynamic(at::Tensor input, bool reduce_range = false) + override; + at::Tensor apply_dynamic_relu(at::Tensor input, bool 
reduce_range = false) + override; + + at::Tensor& apply_dynamic_out( + const at::Tensor& input, + at::Tensor& output, + bool reduce_range = false) override; + at::Tensor& apply_dynamic_relu_out( + const at::Tensor& input, + at::Tensor& output, + bool reduce_range = false) override; + + std::tuple> unpack() override; + + c10::optional bias() override { + return bias_; + } + + static c10::intrusive_ptr prepack( + at::Tensor weight, + c10::optional bias); + + void set_bias(c10::optional bias) override; + + private: + template + at::Tensor& apply_dynamic_impl(const at::Tensor& input, at::Tensor& output); +}; + +template +struct TORCH_API PackedConvWeight : public ConvPackedParamsBase { + PackedConvWeight( + std::unique_ptr> w, + c10::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + uint8_t transpose, + std::vector col_offsets, + std::vector kernel, + std::vector w_scale, + std::vector w_zp, + c10::QScheme q_scheme) + : w(std::move(w)), + bias(std::move(bias)), + stride_(std::move(stride)), + padding_(std::move(padding)), + output_padding_(std::move(output_padding)), + dilation_(std::move(dilation)), + groups_(groups), + transpose_(transpose), + col_offsets(std::move(col_offsets)), + kernel(std::move(kernel)), + w_scale(std::move(w_scale)), + w_zp(std::move(w_zp)), + q_scheme(q_scheme) {} + + std::unique_ptr> w; + c10::optional bias; + torch::List stride_; + torch::List padding_; + torch::List output_padding_; + torch::List dilation_; + int64_t groups_; + uint8_t transpose_; + std::vector col_offsets; + std::vector kernel; + std::vector w_scale; + std::vector w_zp; + c10::QScheme q_scheme; + + at::Tensor apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic( + const at::Tensor& input, + bool reduce_range) override; + + std::tuple> unpack() override; + + static c10::intrusive_ptr> prepack( + at::Tensor weight, + c10::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose); + + const float* GetBiasData(at::Tensor* bias); + + void GetQuantizationParams( + float act_scale, + float out_scale, + std::vector* output_multiplier_float, + std::vector* act_times_w_scale); + + torch::List stride() const override { + return stride_; + } + + torch::List padding() const override { + return padding_; + } + + torch::List output_padding() const override { + return output_padding_; + } + + torch::List dilation() const override { + return dilation_; + } + + int64_t groups() const override { + return groups_; + } + + bool transpose() const override { + return (bool)transpose_; + } + + private: + template + at::Tensor apply_impl( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); +}; + +// PackWeight: Convert the weight from uint8 to int8. +inline void convert_uint8_int8( + int len, + const uint8_t* src_uint8, + int8_t* dst_int8) { + for (const auto i : c10::irange(len)) { + dst_int8[i] = static_cast(static_cast(src_uint8[i]) - 128); + } +} + +// UnpackWeight: Convert the weight from int8 to uint8. 
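// The two helpers are exact inverses: packing maps a uint8 value v to the
// int8 value v - 128, and unpacking adds the 128 back, so for example
// 200 -> 72 -> 200 and 0 -> -128 -> 0.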
+inline void convert_int8_uint8( + int len, + const int8_t* src_int8, + uint8_t* dst_uint8) { + for (const auto i : c10::irange(len)) { + dst_uint8[i] = + static_cast(static_cast(src_int8[i]) + 128); + } +} + +namespace at { +namespace native { +namespace fbgemm_utils { + +template +fbgemm::conv_param_t MakeFbgemmConvParam( + int N, + int C, + int M, + const std::vector& image_shape, + int groups, + const std::vector& kernels, + const std::vector& strides, + const std::vector& pads, + const std::vector& dilations, + const std::vector& output_padding = std::vector(kSpatialDim, 0), + bool transposed = false); + +// TODO: Remove functions below when ChannelsLast3d is ready. +Tensor MakeStridedQTensorCPU( + const IntArrayRef& sizes, + const IntArrayRef& strides, + const TensorOptions& options, + QuantizerPtr quantizer); + +Tensor MakeEmptyAffineQuantizedChannelsLast3dTensor( + int64_t N, + int64_t C, + int64_t D, + int64_t H, + int64_t W, + const TensorOptions& options, + double scale, + int64_t zero_point); + +Tensor MakeEmptyPerChannelAffineQuantizedChannelsLast3dTensor( + int64_t N, + int64_t C, + int64_t D, + int64_t H, + int64_t W, + const TensorOptions& options, + const Tensor& scales, + const Tensor& zero_points); + +Tensor ConvertToChannelsLast3dTensor(const Tensor& src); + +template +Tensor TransposeConvTensorUnpackConversion(const Tensor& src, int groups); + +template +Tensor ConvertConvWeightsToChannelLastTensor( + const at::Tensor& src, + int groups, + bool transpose); +} // namespace fbgemm_utils +} // namespace native +} // namespace at + +#endif // USE_FBGEMM + +struct TORCH_API PackedEmbeddingBagWeight : public EmbeddingPackedParamsBase { + PackedEmbeddingBagWeight( + at::Tensor packed_w, + std::vector w_scale, + std::vector w_zp, + int64_t bit_rate, + c10::QScheme q_scheme, + int64_t version) + : packed_w(std::move(packed_w)), + w_scale(std::move(w_scale)), + w_zp(std::move(w_zp)), + bit_rate_(bit_rate), + q_scheme(q_scheme), + version_(version) { + // NOLINTNEXTLINE(clang-analyzer-cplusplus.Move) + if (!packed_w.is_contiguous()) { + packed_w = packed_w.contiguous(); + } + } + + at::Tensor packed_w; + std::vector w_scale; + std::vector w_zp; + int64_t bit_rate_; + c10::QScheme q_scheme; + int64_t version_; + + at::Tensor unpack() override; + static c10::intrusive_ptr prepack( + at::Tensor weight); + + int64_t bit_rate() const override { + return bit_rate_; + } + + int64_t version() const override { + return version_; + } + + at::Tensor embeddingbag_byte( + const at::Tensor& indices, + const c10::optional& offsets, + bool pruned_weights, + const c10::optional& per_sample_weights_, + const c10::optional& compressed_indices_mapping, + bool include_last_offset, + bool is_embedding_op) override; + + at::Tensor embeddingbag_4bit( + const at::Tensor& indices, + const c10::optional& offsets, + bool pruned_weights, + const c10::optional& per_sample_weights_, + const c10::optional& compressed_indices_mapping, + bool include_last_offset, + bool is_embedding_op) override; +}; diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/init_qnnpack.h b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/init_qnnpack.h new file mode 100644 index 0000000000000000000000000000000000000000..e7a1033e9758b19f9f05b05e2854289be44324c7 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/init_qnnpack.h @@ -0,0 +1,13 @@ +#pragma once + +#ifdef USE_PYTORCH_QNNPACK + +namespace at { +namespace native { + +void initQNNPACK(); + +} // namespace 
native +} // namespace at + +#endif diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag.h b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag.h new file mode 100644 index 0000000000000000000000000000000000000000..cd2c04e589c439d55a19fe55fee7f76fc433a5ef --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag.h @@ -0,0 +1,34 @@ +#pragma once +#include +#include + +namespace at { +namespace native { +Tensor& embedding_bag_byte_rowwise_offsets_out( + Tensor& output, + const Tensor& weight, + const Tensor& indices, + const c10::optional& offsets_in, + const bool /* scale_grad_by_freq */, + const int64_t /* mode */, + bool pruned_weights, + const c10::optional& per_sample_weights_, + const c10::optional& compressed_indices_mapping, + bool include_last_offset); + +Tensor& embedding_bag_4bit_rowwise_offsets_out( + Tensor& output, + const Tensor& weight, + const Tensor& indices, + const c10::optional& offsets_in, + const bool /* scale_grad_by_freq */, + const int64_t /* mode */, + bool pruned_weights, + const c10::optional& per_sample_weights_, + const c10::optional& compressed_indices_mapping, + bool include_last_offset); + +Tensor& qembeddingbag_byte_unpack_out(Tensor& output, const Tensor& packed_weight); + +} // native +} // at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag_prepack.h b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag_prepack.h new file mode 100644 index 0000000000000000000000000000000000000000..652e7501c25dd90a0d38ba2d865d09666b5434f2 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag_prepack.h @@ -0,0 +1,13 @@ +#pragma once +#include + +namespace at { namespace native { + +Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight); + +Tensor qembeddingbag_byte_prepack(const Tensor& weight); + +Tensor qembeddingbag_byte_prepack_meta(const Tensor& weight); + +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/utils/Factory.h b/MLPY/Lib/site-packages/torch/include/ATen/native/utils/Factory.h new file mode 100644 index 0000000000000000000000000000000000000000..28444494242ae5ce6e6d728686573491302c3721 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/utils/Factory.h @@ -0,0 +1,24 @@ +#pragma once + +#include + +namespace at { +namespace native { +namespace mobile { + +Tensor allocate_padded_contiguous_if_needed( + const Tensor& input, + c10::MemoryFormat memory_format); + +// TODO: Remove this function when at::native::empty() is modified to accept a +// custom memory allocator. 
+ +at::Tensor empty_with_tail_padding( + IntArrayRef size, + const caffe2::TypeMeta dtype, + c10::MemoryFormat memory_format, + c10::optional maybe_names); + +} // namespace mobile +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/utils/ParamUtils.h b/MLPY/Lib/site-packages/torch/include/ATen/native/utils/ParamUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..3f4cdf5b906b9ecc9b7a5ff130f1a607e7ea4d25 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/utils/ParamUtils.h @@ -0,0 +1,42 @@ +#pragma once + +#include +#include + +namespace at { +namespace native { + +template +inline std::vector _expand_param_if_needed( + ArrayRef list_param, + const char* param_name, + int64_t expected_dim) { + if (list_param.size() == 1) { + return std::vector(expected_dim, list_param[0]); + } else if ((int64_t)list_param.size() != expected_dim) { + std::ostringstream ss; + ss << "expected " << param_name << " to be a single integer value or a " + << "list of " << expected_dim << " values to match the convolution " + << "dimensions, but got " << param_name << "=" << list_param; + AT_ERROR(ss.str()); + } else { + return list_param.vec(); + } +} + +inline std::vector expand_param_if_needed( + IntArrayRef list_param, + const char* param_name, + int64_t expected_dim) { + return _expand_param_if_needed(list_param, param_name, expected_dim); +} + +inline std::vector expand_param_if_needed( + SymIntArrayRef list_param, + const char* param_name, + int64_t expected_dim) { + return _expand_param_if_needed(list_param, param_name, expected_dim); +} + +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/utils/ParamsHash.h b/MLPY/Lib/site-packages/torch/include/ATen/native/utils/ParamsHash.h new file mode 100644 index 0000000000000000000000000000000000000000..24c836f3308d145f8a3e56f0d240404f53df5f0d --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/utils/ParamsHash.h @@ -0,0 +1,104 @@ +#pragma once + +#include +#include +#include + +namespace at::native { + +// Hashing machinery for Params +// Fowler–Noll–Vo hash function +// see +// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function +template +struct ParamsHash { + // Params must be a POD because we read out its memory + // contents as char* when hashing + static_assert(std::is_standard_layout_v, "Params is not POD"); + + size_t operator()(const Params& params) const { + auto ptr = reinterpret_cast(¶ms); + uint32_t value = 0x811C9DC5; + for (const auto i : c10::irange(sizeof(Params))) { + value ^= ptr[i]; + value *= 0x01000193; + } + return (size_t)value; + } +}; + +template +struct ParamsEqual { + // Params must be a POD because we read out its memory + // contents as char* when comparing + static_assert(std::is_standard_layout_v, "Params is not POD"); + + bool operator()(const Params& a, const Params& b) const { + auto ptr1 = reinterpret_cast(&a); + auto ptr2 = reinterpret_cast(&b); + return memcmp(ptr1, ptr2, sizeof(Params)) == 0; + } +}; + +// Provide explicit byte-for-byte constructors to avoid uwittingly leaving +// padding bytes unitialized (e.g., when passing Params by value) +template +struct ParamsWrapper { + T pod; + static_assert( + std::is_standard_layout_v, + "ParamsWrapper cannot wrap non-POD data"); + + ParamsWrapper() { + memset(&(this->pod), 0, sizeof(this->pod)); + } + + ParamsWrapper(const ParamsWrapper& other) { + memcpy(&(this->pod), &(other.pod), 
sizeof(this->pod)); + } + + ParamsWrapper(ParamsWrapper&& other) noexcept { + memcpy(&(this->pod), &(other.pod), sizeof(this->pod)); + } + + ParamsWrapper& operator=(const ParamsWrapper& other) { + memcpy(&(this->pod), &(other.pod), sizeof(this->pod)); + return *this; + } + + ParamsWrapper& operator=(ParamsWrapper&& other) noexcept { + memcpy(&(this->pod), &(other.pod), sizeof(this->pod)); + return *this; + } + + inline friend bool operator==( + const ParamsWrapper& lhs, + const ParamsWrapper& rhs) noexcept { + auto ptr1 = reinterpret_cast(&(lhs.pod)); + auto ptr2 = reinterpret_cast(&(rhs.pod)); + return memcmp(ptr1, ptr2, sizeof(lhs.pod)) == 0; + } +}; + +// Wrapped version: this allows the outer struct to have custom copy and move +// constructors for additional safety +template +struct ParamsWrapperHash { + // Params must be a POD because we read out its memory + // contents as char* when hashing + static_assert( + std::is_standard_layout_v, + "ParamsWrapper cannot wrap non-POD data"); + + size_t operator()(const ParamsWrapper& params_wrapper) const { + auto ptr = reinterpret_cast(&(params_wrapper.pod)); + uint32_t value = 0x811C9DC5; + for (const auto i : c10::irange(sizeof(params_wrapper.pod))) { + value ^= ptr[i]; + value *= 0x01000193; + } + return (size_t)value; + } +}; + +} // namespace at::native diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/verbose_wrapper.h b/MLPY/Lib/site-packages/torch/include/ATen/native/verbose_wrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..fc16ad2c373177cb92d297b4b78da0efa9800225 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/verbose_wrapper.h @@ -0,0 +1,8 @@ +#pragma once + +#include + +namespace torch::verbose { +TORCH_API int _mkl_set_verbose(int enable); +TORCH_API int _mkldnn_set_verbose(int level); +} // namespace torch::verbose diff --git a/MLPY/Lib/site-packages/torch/include/ATen/native/vol2col.h b/MLPY/Lib/site-packages/torch/include/ATen/native/vol2col.h new file mode 100644 index 0000000000000000000000000000000000000000..7067c741cbc6a23a13d33341963294874b5c3716 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/native/vol2col.h @@ -0,0 +1,109 @@ +#pragma once + +#include + +namespace at::native { + +template +static void vol2col( + const T* data_vol, + const int64_t channels, + const int64_t depth, + const int64_t height, + const int64_t width, + const int64_t depth_col, + const int64_t height_col, + const int64_t width_col, + const int64_t kT, + const int64_t kernel_height, + const int64_t kernel_width, + const int64_t pT, + const int64_t pH, + const int64_t pW, + const int64_t dT, + const int64_t dH, + const int64_t dW, + const int64_t dilationT, + const int64_t dilationH, + const int64_t dilationW, + T* data_col) { + int64_t c, t, h, w; + int64_t channels_col = channels * kT * kernel_height * kernel_width; + for (c = 0; c < channels_col; ++c) { + int64_t w_offset = c % kernel_width; + int64_t h_offset = (c / kernel_width) % kernel_height; + int64_t t_offset = (c / kernel_width / kernel_height) % kT; + int64_t c_vol = c / kT / kernel_height / kernel_width; + for (t = 0; t < depth_col; ++t) { + int64_t t_pad = t * dT - pT + t_offset * dilationT; + for (h = 0; h < height_col; ++h) { + int64_t h_pad = h * dH - pH + h_offset * dilationH; + for (w = 0; w < width_col; ++w) { + int64_t w_pad = w * dW - pW + w_offset * dilationW; + if (t_pad >= 0 && t_pad < depth && h_pad >= 0 && h_pad < height && + w_pad >= 0 && w_pad < width) + data_col[((c * depth_col + t) * 
height_col + h) * width_col + w] = + data_vol + [((c_vol * depth + t_pad) * height + h_pad) * width + + w_pad]; + else + data_col[((c * depth_col + t) * height_col + h) * width_col + w] = + 0; + } + } + } + } +} + +template +static void col2vol( + const T* data_col, + const int64_t channels, + const int64_t depth, + const int64_t height, + const int64_t width, + const int64_t out_depth, + const int64_t out_height, + const int64_t out_width, + const int64_t kT, + const int64_t kernel_height, + const int64_t kernel_width, + const int64_t pT, + const int64_t pH, + const int64_t pW, + const int64_t dT, + const int64_t dH, + const int64_t dW, + const int64_t dilationT, + const int64_t dilationH, + const int64_t dilationW, + T* data_vol) { + memset(data_vol, 0, sizeof(T) * depth * height * width * channels); + int64_t depth_col = out_depth; + int64_t height_col = out_height; + int64_t width_col = out_width; + int64_t channels_col = channels * kT * kernel_height * kernel_width; + for (int64_t c = 0; c < channels_col; ++c) { + int64_t w_offset = c % kernel_width; + int64_t h_offset = (c / kernel_width) % kernel_height; + int64_t t_offset = (c / kernel_width / kernel_height) % kT; + int64_t c_vol = c / kT / kernel_height / kernel_width; + for (int64_t t = 0; t < depth_col; ++t) { + int64_t t_pad = t * dT - pT + t_offset * dilationT; + for (int64_t h = 0; h < height_col; ++h) { + int64_t h_pad = h * dH - pH + h_offset * dilationH; + for (int64_t w = 0; w < width_col; ++w) { + int64_t w_pad = w * dW - pW + w_offset * dilationW; + if (t_pad >= 0 && t_pad < depth && h_pad >= 0 && h_pad < height && + w_pad >= 0 && w_pad < width) + data_vol + [((c_vol * depth + t_pad) * height + h_pad) * width + w_pad] += + data_col + [((c * depth_col + t) * height_col + h) * width_col + w]; + } + } + } + } +} + +} // namespace at::native diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/abs.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/abs.h new file mode 100644 index 0000000000000000000000000000000000000000..ec5807aed88f2adbe3743936bc29ac430f22db16 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/abs.h @@ -0,0 +1,44 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::abs(Tensor self) -> Tensor +inline at::Tensor abs(const at::Tensor & self) { + return at::_ops::abs::call(self); +} + +// aten::abs_(Tensor(a!) self) -> Tensor(a!) +inline at::Tensor & abs_(at::Tensor & self) { + return at::_ops::abs_::call(self); +} + +// aten::abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & abs_out(at::Tensor & out, const at::Tensor & self) { + return at::_ops::abs_out::call(self, out); +} +// aten::abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & abs_outf(const at::Tensor & self, at::Tensor & out) { + return at::_ops::abs_out::call(self, out); +} + +} diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/abs_compositeexplicitautograd_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/abs_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..6464bff68321e5e032d3caeab3c5607692887418 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/abs_compositeexplicitautograd_dispatch.h @@ -0,0 +1,24 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor abs(const at::Tensor & self); +TORCH_API at::Tensor & abs_(at::Tensor & self); + +} // namespace compositeexplicitautograd +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/abs_cpu_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/abs_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..3318cfc1b25c809c02e00de2a958fa2889d0038d --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/abs_cpu_dispatch.h @@ -0,0 +1,24 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor & abs_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & abs_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace cpu +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/abs_cuda_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/abs_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..0d19eeae3b4cbb14e13e3ae89b7d264a564fdf15 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/abs_cuda_dispatch.h @@ -0,0 +1,24 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor & abs_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & abs_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace cuda +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/abs_native.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/abs_native.h new file mode 100644 index 0000000000000000000000000000000000000000..4c32dc5290c27dfb642d93211a240fe9f2e56ab0 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/abs_native.h @@ -0,0 +1,31 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor abs(const at::Tensor & self); +TORCH_API at::Tensor & abs_(at::Tensor & self); +TORCH_API at::Tensor & abs_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor NestedTensor_abs(const at::Tensor & self); +TORCH_API at::Tensor & NestedTensor_abs_(at::Tensor & self); +TORCH_API at::Tensor abs_sparse(const at::Tensor & self); +TORCH_API at::Tensor & abs_sparse_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & abs_sparse_(at::Tensor & self); +TORCH_API at::Tensor abs_sparse_csr(const at::Tensor & self); +TORCH_API at::Tensor & abs_sparse_csr_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & abs_sparse_csr_(at::Tensor & self); +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/abs_ops.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/abs_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..22f93d2322e7d71bae4b02158a33d463c2af871c --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/abs_ops.h @@ -0,0 +1,50 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API abs { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::abs") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "abs(Tensor self) -> Tensor") + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API abs_ { + using schema = at::Tensor & (at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::abs_") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "abs_(Tensor(a!) 
self) -> Tensor(a!)") + static at::Tensor & call(at::Tensor & self); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self); +}; + +struct TORCH_API abs_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::abs") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)") + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/absolute.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/absolute.h new file mode 100644 index 0000000000000000000000000000000000000000..553cea7dfad348bf5cf66e9884d20ab518b89a7a --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/absolute.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::absolute(Tensor self) -> Tensor +inline at::Tensor absolute(const at::Tensor & self) { + return at::_ops::absolute::call(self); +} + +// aten::absolute.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & absolute_out(at::Tensor & out, const at::Tensor & self) { + return at::_ops::absolute_out::call(self, out); +} +// aten::absolute.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & absolute_outf(const at::Tensor & self, at::Tensor & out) { + return at::_ops::absolute_out::call(self, out); +} + +} diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/absolute_compositeimplicitautograd_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/absolute_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..1638f19b5f208cf4c3cf1479f2095b8e45497c6c --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/absolute_compositeimplicitautograd_dispatch.h @@ -0,0 +1,26 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor absolute(const at::Tensor & self); +TORCH_API at::Tensor & absolute_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & absolute_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & absolute_(at::Tensor & self); + +} // namespace compositeimplicitautograd +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/absolute_native.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/absolute_native.h new file mode 100644 index 0000000000000000000000000000000000000000..025d6908fafc2af3b41ccf303fee2717660e981f --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/absolute_native.h @@ -0,0 +1,23 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor absolute(const at::Tensor & self); +TORCH_API at::Tensor & absolute_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & absolute_(at::Tensor & self); +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/absolute_ops.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/absolute_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..62bb1e6ddd4451e18927e583dfc0732314fe4aae --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/absolute_ops.h @@ -0,0 +1,50 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API absolute { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::absolute") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "absolute(Tensor self) -> Tensor") + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API absolute_ { + using schema = at::Tensor & (at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::absolute_") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "absolute_(Tensor(a!) self) -> Tensor(a!)") + static at::Tensor & call(at::Tensor & self); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self); +}; + +struct TORCH_API absolute_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::absolute") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "absolute.out(Tensor self, *, Tensor(a!) 
out) -> Tensor(a!)") + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/acos.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/acos.h new file mode 100644 index 0000000000000000000000000000000000000000..446813b7360b2ed29cf88ed2228106323b178286 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/acos.h @@ -0,0 +1,44 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::acos(Tensor self) -> Tensor +inline at::Tensor acos(const at::Tensor & self) { + return at::_ops::acos::call(self); +} + +// aten::acos_(Tensor(a!) self) -> Tensor(a!) +inline at::Tensor & acos_(at::Tensor & self) { + return at::_ops::acos_::call(self); +} + +// aten::acos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & acos_out(at::Tensor & out, const at::Tensor & self) { + return at::_ops::acos_out::call(self, out); +} +// aten::acos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & acos_outf(const at::Tensor & self, at::Tensor & out) { + return at::_ops::acos_out::call(self, out); +} + +} diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/acos_compositeexplicitautogradnonfunctional_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/acos_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..17e29662d852815777e082ad49320532f5d400b7 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/acos_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,24 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor acos(const at::Tensor & self); +TORCH_API at::Tensor & acos_(at::Tensor & self); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/acos_cpu_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/acos_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..2396f7b6f9ddeeee84eb60f25d40d583e154ae14 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/acos_cpu_dispatch.h @@ -0,0 +1,26 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor acos(const at::Tensor & self); +TORCH_API at::Tensor & acos_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & acos_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & acos_(at::Tensor & self); + +} // namespace cpu +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/acos_cuda_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/acos_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..31049f254f7e6138f568bcd22b976218f256c57b --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/acos_cuda_dispatch.h @@ -0,0 +1,26 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor acos(const at::Tensor & self); +TORCH_API at::Tensor & acos_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & acos_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & acos_(at::Tensor & self); + +} // namespace cuda +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/acos_meta.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/acos_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..1c1b110ab944a5e528aeabde83e6cb0b53ecd46a --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/acos_meta.h @@ -0,0 +1,27 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_acos : public TensorIteratorBase { + + + void meta(const at::Tensor & self); +}; + +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/acos_meta_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/acos_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..67c00a184c2419173a7a3078a7409ef37588e2b6 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/acos_meta_dispatch.h @@ -0,0 +1,26 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor acos(const at::Tensor & self); +TORCH_API at::Tensor & acos_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & acos_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & acos_(at::Tensor & self); + +} // namespace meta +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/acos_native.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/acos_native.h new file mode 100644 index 0000000000000000000000000000000000000000..1a0be9eca577fd3d8a631de12de4cf3367bc8700 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/acos_native.h @@ -0,0 +1,23 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_acos_out : public at::meta::structured_acos { +void impl(const at::Tensor & self, const at::Tensor & out); +}; +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/acos_ops.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/acos_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..9aeeb020d5e41fff6adfe89e6d03fc4f68da332d --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/acos_ops.h @@ -0,0 +1,50 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API acos { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::acos") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "acos(Tensor self) -> Tensor") + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API acos_ { + using schema = at::Tensor & (at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::acos_") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "acos_(Tensor(a!) self) -> Tensor(a!)") + static at::Tensor & call(at::Tensor & self); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self); +}; + +struct TORCH_API acos_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::acos") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "acos.out(Tensor self, *, Tensor(a!) 
out) -> Tensor(a!)") + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/acosh.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/acosh.h new file mode 100644 index 0000000000000000000000000000000000000000..ea56387376a1a2df0a2a592d3085dfcb1b3e3229 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/acosh.h @@ -0,0 +1,44 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::acosh(Tensor self) -> Tensor +inline at::Tensor acosh(const at::Tensor & self) { + return at::_ops::acosh::call(self); +} + +// aten::acosh_(Tensor(a!) self) -> Tensor(a!) +inline at::Tensor & acosh_(at::Tensor & self) { + return at::_ops::acosh_::call(self); +} + +// aten::acosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & acosh_out(at::Tensor & out, const at::Tensor & self) { + return at::_ops::acosh_out::call(self, out); +} +// aten::acosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & acosh_outf(const at::Tensor & self, at::Tensor & out) { + return at::_ops::acosh_out::call(self, out); +} + +} diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/acosh_compositeexplicitautogradnonfunctional_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/acosh_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..279bd1957fe1e17321b6d062b343d12459bfb487 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/acosh_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,24 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor acosh(const at::Tensor & self); +TORCH_API at::Tensor & acosh_(at::Tensor & self); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/acosh_cpu_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/acosh_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..026f535fdf09102ea68284aaf189f3de877f1ed9 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/acosh_cpu_dispatch.h @@ -0,0 +1,26 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. 
+// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor acosh(const at::Tensor & self); +TORCH_API at::Tensor & acosh_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & acosh_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & acosh_(at::Tensor & self); + +} // namespace cpu +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/acosh_cuda_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/acosh_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..a31a1c967c1185ee9b87bd763c281a1ee491d141 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/acosh_cuda_dispatch.h @@ -0,0 +1,26 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor acosh(const at::Tensor & self); +TORCH_API at::Tensor & acosh_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & acosh_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & acosh_(at::Tensor & self); + +} // namespace cuda +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/acosh_meta.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/acosh_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..1974e9150ba6ac3c648bf38c90f4bb0b7b0f0ff7 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/acosh_meta.h @@ -0,0 +1,27 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_acosh : public TensorIteratorBase { + + + void meta(const at::Tensor & self); +}; + +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/acosh_meta_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/acosh_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..d6f867a40b3cc2dc93a7b780cf0142f929228d36 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/acosh_meta_dispatch.h @@ -0,0 +1,26 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor acosh(const at::Tensor & self); +TORCH_API at::Tensor & acosh_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & acosh_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & acosh_(at::Tensor & self); + +} // namespace meta +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/acosh_native.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/acosh_native.h new file mode 100644 index 0000000000000000000000000000000000000000..35b66762bd865fb68c256b900bb89d24b7467f11 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/acosh_native.h @@ -0,0 +1,23 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_acosh_out : public at::meta::structured_acosh { +void impl(const at::Tensor & self, const at::Tensor & out); +}; +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/acosh_ops.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/acosh_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..a6dcfda790b3c9d749632e5988d3013b59ca7ad3 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/acosh_ops.h @@ -0,0 +1,50 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API acosh { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::acosh") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "acosh(Tensor self) -> Tensor") + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API acosh_ { + using schema = at::Tensor & (at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::acosh_") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "acosh_(Tensor(a!) self) -> Tensor(a!)") + static at::Tensor & call(at::Tensor & self); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self); +}; + +struct TORCH_API acosh_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::acosh") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "acosh.out(Tensor self, *, Tensor(a!) 
out) -> Tensor(a!)") + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool1d.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool1d.h new file mode 100644 index 0000000000000000000000000000000000000000..6d28c33d8834feff6600f61635b8c2e93832d273 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool1d.h @@ -0,0 +1,30 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::adaptive_avg_pool1d(Tensor self, int[1] output_size) -> Tensor +inline at::Tensor adaptive_avg_pool1d(const at::Tensor & self, at::IntArrayRef output_size) { + return at::_ops::adaptive_avg_pool1d::call(self, output_size); +} + +} diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool1d_compositeimplicitautograd_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool1d_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..45bb93f27cacd0ded73f9227e7be915c3f3de42d --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool1d_compositeimplicitautograd_dispatch.h @@ -0,0 +1,23 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor adaptive_avg_pool1d(const at::Tensor & self, at::IntArrayRef output_size); + +} // namespace compositeimplicitautograd +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool1d_native.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool1d_native.h new file mode 100644 index 0000000000000000000000000000000000000000..ff7f39f6536670c5dbf7dbf9ae37179d1fe2ef12 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool1d_native.h @@ -0,0 +1,21 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor adaptive_avg_pool1d(const at::Tensor & self, at::IntArrayRef output_size); +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool1d_ops.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool1d_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..61a858c40cce2203bb164cdd40ca985e449a1a6f --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool1d_ops.h @@ -0,0 +1,28 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API adaptive_avg_pool1d { + using schema = at::Tensor (const at::Tensor &, at::IntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::adaptive_avg_pool1d") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "adaptive_avg_pool1d(Tensor self, int[1] output_size) -> Tensor") + static at::Tensor call(const at::Tensor & self, at::IntArrayRef output_size); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size); +}; + +}} // namespace at::_ops diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d.h new file mode 100644 index 0000000000000000000000000000000000000000..433acd7aed345d789c1b57ee6086d4b3169f262c --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d.h @@ -0,0 +1,91 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & adaptive_avg_pool2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size) { + return at::_ops::adaptive_avg_pool2d_out::call(self, c10::fromIntArrayRefSlow(output_size), out); +} +namespace symint { + template ::value>> + at::Tensor & adaptive_avg_pool2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size) { + return at::_ops::adaptive_avg_pool2d_out::call(self, c10::fromIntArrayRefSlow(output_size), out); + } +} + +// aten::adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & adaptive_avg_pool2d_outf(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out) { + return at::_ops::adaptive_avg_pool2d_out::call(self, c10::fromIntArrayRefSlow(output_size), out); +} +namespace symint { + template ::value>> + at::Tensor & adaptive_avg_pool2d_outf(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out) { + return at::_ops::adaptive_avg_pool2d_out::call(self, c10::fromIntArrayRefSlow(output_size), out); + } +} + +// aten::adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & adaptive_avg_pool2d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size) { + return at::_ops::adaptive_avg_pool2d_out::call(self, output_size, out); +} +namespace symint { + template ::value>> + at::Tensor & adaptive_avg_pool2d_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size) { + return at::_ops::adaptive_avg_pool2d_out::call(self, output_size, out); + } +} + +// aten::adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & adaptive_avg_pool2d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, at::Tensor & out) { + return at::_ops::adaptive_avg_pool2d_out::call(self, output_size, out); +} +namespace symint { + template ::value>> + at::Tensor & adaptive_avg_pool2d_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, at::Tensor & out) { + return at::_ops::adaptive_avg_pool2d_out::call(self, output_size, out); + } +} + +// aten::adaptive_avg_pool2d(Tensor self, SymInt[2] output_size) -> Tensor +inline at::Tensor adaptive_avg_pool2d(const at::Tensor & self, at::IntArrayRef output_size) { + return at::_ops::adaptive_avg_pool2d::call(self, c10::fromIntArrayRefSlow(output_size)); +} +namespace symint { + template ::value>> + at::Tensor adaptive_avg_pool2d(const at::Tensor & self, at::IntArrayRef output_size) { + return at::_ops::adaptive_avg_pool2d::call(self, c10::fromIntArrayRefSlow(output_size)); + } +} + +// aten::adaptive_avg_pool2d(Tensor self, SymInt[2] output_size) -> Tensor +inline at::Tensor adaptive_avg_pool2d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size) { + return at::_ops::adaptive_avg_pool2d::call(self, output_size); +} +namespace symint { + template ::value>> + at::Tensor adaptive_avg_pool2d(const at::Tensor & self, c10::SymIntArrayRef output_size) { + return at::_ops::adaptive_avg_pool2d::call(self, output_size); + } +} + +} diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d_compositeimplicitautograd_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..b14e16398365dbed76601b8a1fecc31053869166 --- /dev/null +++ 
b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d_compositeimplicitautograd_dispatch.h @@ -0,0 +1,24 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor adaptive_avg_pool2d(const at::Tensor & self, at::IntArrayRef output_size); +TORCH_API at::Tensor adaptive_avg_pool2d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size); + +} // namespace compositeimplicitautograd +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d_cpu_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..168960f65674dc06fb52a3331be4ff949a7ac84e --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d_cpu_dispatch.h @@ -0,0 +1,26 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor & adaptive_avg_pool2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size); +TORCH_API at::Tensor & adaptive_avg_pool2d_outf(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out); +TORCH_API at::Tensor & adaptive_avg_pool2d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size); +TORCH_API at::Tensor & adaptive_avg_pool2d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, at::Tensor & out); + +} // namespace cpu +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d_cuda_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..6363b3a2ce1d7c7e914ad1fd03301b649157d405 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d_cuda_dispatch.h @@ -0,0 +1,26 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor & adaptive_avg_pool2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size); +TORCH_API at::Tensor & adaptive_avg_pool2d_outf(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out); +TORCH_API at::Tensor & adaptive_avg_pool2d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size); +TORCH_API at::Tensor & adaptive_avg_pool2d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, at::Tensor & out); + +} // namespace cuda +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d_native.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d_native.h new file mode 100644 index 0000000000000000000000000000000000000000..6ed978fd6a96f295ee2683e6e6c782d8488c1980 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d_native.h @@ -0,0 +1,24 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor adaptive_avg_pool2d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size); +TORCH_API at::Tensor & adaptive_avg_pool2d_out_cpu(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out); +TORCH_API at::Tensor & adaptive_avg_pool2d_out_cuda(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out); +TORCH_API at::Tensor & mkldnn_adaptive_avg_pool2d_out_stub(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out); +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d_ops.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..2cfeee2c7dd4a077debcfc4bab04b5cbce2d282d --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d_ops.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API adaptive_avg_pool2d_out { + using schema = at::Tensor & (const at::Tensor &, c10::SymIntArrayRef, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::adaptive_avg_pool2d") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) 
out) -> Tensor(a!)") + static at::Tensor & call(const at::Tensor & self, c10::SymIntArrayRef output_size, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, at::Tensor & out); +}; + +struct TORCH_API adaptive_avg_pool2d { + using schema = at::Tensor (const at::Tensor &, c10::SymIntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::adaptive_avg_pool2d") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "adaptive_avg_pool2d(Tensor self, SymInt[2] output_size) -> Tensor") + static at::Tensor call(const at::Tensor & self, c10::SymIntArrayRef output_size); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size); +}; + +}} // namespace at::_ops diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d.h new file mode 100644 index 0000000000000000000000000000000000000000..7c1cac534a1327ae16f8db2231860530f98617d5 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d.h @@ -0,0 +1,91 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::adaptive_avg_pool3d.out(Tensor self, SymInt[3] output_size, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & adaptive_avg_pool3d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size) { + return at::_ops::adaptive_avg_pool3d_out::call(self, c10::fromIntArrayRefSlow(output_size), out); +} +namespace symint { + template ::value>> + at::Tensor & adaptive_avg_pool3d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size) { + return at::_ops::adaptive_avg_pool3d_out::call(self, c10::fromIntArrayRefSlow(output_size), out); + } +} + +// aten::adaptive_avg_pool3d.out(Tensor self, SymInt[3] output_size, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & adaptive_avg_pool3d_outf(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out) { + return at::_ops::adaptive_avg_pool3d_out::call(self, c10::fromIntArrayRefSlow(output_size), out); +} +namespace symint { + template ::value>> + at::Tensor & adaptive_avg_pool3d_outf(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out) { + return at::_ops::adaptive_avg_pool3d_out::call(self, c10::fromIntArrayRefSlow(output_size), out); + } +} + +// aten::adaptive_avg_pool3d.out(Tensor self, SymInt[3] output_size, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & adaptive_avg_pool3d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size) { + return at::_ops::adaptive_avg_pool3d_out::call(self, output_size, out); +} +namespace symint { + template ::value>> + at::Tensor & adaptive_avg_pool3d_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size) { + return at::_ops::adaptive_avg_pool3d_out::call(self, output_size, out); + } +} + +// aten::adaptive_avg_pool3d.out(Tensor self, SymInt[3] output_size, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & adaptive_avg_pool3d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, at::Tensor & out) { + return at::_ops::adaptive_avg_pool3d_out::call(self, output_size, out); +} +namespace symint { + template ::value>> + at::Tensor & adaptive_avg_pool3d_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, at::Tensor & out) { + return at::_ops::adaptive_avg_pool3d_out::call(self, output_size, out); + } +} + +// aten::adaptive_avg_pool3d(Tensor self, SymInt[3] output_size) -> Tensor +inline at::Tensor adaptive_avg_pool3d(const at::Tensor & self, at::IntArrayRef output_size) { + return at::_ops::adaptive_avg_pool3d::call(self, c10::fromIntArrayRefSlow(output_size)); +} +namespace symint { + template ::value>> + at::Tensor adaptive_avg_pool3d(const at::Tensor & self, at::IntArrayRef output_size) { + return at::_ops::adaptive_avg_pool3d::call(self, c10::fromIntArrayRefSlow(output_size)); + } +} + +// aten::adaptive_avg_pool3d(Tensor self, SymInt[3] output_size) -> Tensor +inline at::Tensor adaptive_avg_pool3d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size) { + return at::_ops::adaptive_avg_pool3d::call(self, output_size); +} +namespace symint { + template ::value>> + at::Tensor adaptive_avg_pool3d(const at::Tensor & self, c10::SymIntArrayRef output_size) { + return at::_ops::adaptive_avg_pool3d::call(self, output_size); + } +} + +} diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_backward.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..e1e984135bced3550b4f60ca58ce22e876aa7e50 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_backward.h @@ -0,0 +1,34 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::adaptive_avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & adaptive_avg_pool3d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self) { + return at::_ops::adaptive_avg_pool3d_backward_grad_input::call(grad_output, self, grad_input); +} +// aten::adaptive_avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & adaptive_avg_pool3d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & grad_input) { + return at::_ops::adaptive_avg_pool3d_backward_grad_input::call(grad_output, self, grad_input); +} + +} diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_backward_cpu_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_backward_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..7d6271488475837cc58198cabb052077194eef9f --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_backward_cpu_dispatch.h @@ -0,0 +1,24 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. 
+// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor & adaptive_avg_pool3d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self); +TORCH_API at::Tensor & adaptive_avg_pool3d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & grad_input); + +} // namespace cpu +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_backward_cuda_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_backward_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..e8433c1e6df18a07e0b1e6a229ae5357530ab4dd --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_backward_cuda_dispatch.h @@ -0,0 +1,24 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor & adaptive_avg_pool3d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self); +TORCH_API at::Tensor & adaptive_avg_pool3d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & grad_input); + +} // namespace cuda +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_backward_native.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..a72bd814f72257a81fd997e0435d36bd1054afa1 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_backward_native.h @@ -0,0 +1,22 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & adaptive_avg_pool3d_backward_out_cpu(const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & grad_input); +TORCH_API at::Tensor & adaptive_avg_pool3d_backward_out_cuda(const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & grad_input); +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_backward_ops.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..70eccdbad7ee1b4078119728a7f4dc13d92b0cc2 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_backward_ops.h @@ -0,0 +1,28 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
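The Function.h-style headers added above give each pooling op several C++ spellings: a functional form, `*_out`/`*_outf` out-variants that differ only in where the `out` argument sits, and `*_symint` overloads taking c10::SymIntArrayRef. A minimal usage sketch, assuming a working libtorch build where <ATen/ATen.h> exposes these declarations; the tensor shapes and pool sizes are arbitrary and the snippet is not part of the generated headers:

// Illustrative only -- not part of the generated headers in this diff.
#include <ATen/ATen.h>
#include <c10/core/SymInt.h>
#include <vector>

int main() {
  at::Tensor self = at::randn({2, 3, 8, 8, 8});  // NCDHW input

  // Functional form: concrete sizes go through the IntArrayRef overload,
  // which the header forwards via c10::fromIntArrayRefSlow().
  at::Tensor y = at::adaptive_avg_pool3d(self, {4, 4, 4});

  // Out-variant naming convention: *_out takes `out` first, *_outf takes it
  // last; both forward to the same at::_ops::...::call.
  at::Tensor out = at::empty({0});
  at::adaptive_avg_pool3d_out(out, self, {4, 4, 4});
  at::adaptive_avg_pool3d_outf(self, {4, 4, 4}, out);

  // SymInt entry point, declared alongside the plain overloads.
  std::vector<c10::SymInt> sym_size = {c10::SymInt(4), c10::SymInt(4), c10::SymInt(4)};
  at::Tensor y_sym = at::adaptive_avg_pool3d_symint(self, sym_size);
  return 0;
}
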
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API adaptive_avg_pool3d_backward_grad_input { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::adaptive_avg_pool3d_backward") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "grad_input") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "adaptive_avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!)") + static at::Tensor & call(const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & grad_input); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & grad_input); +}; + +}} // namespace at::_ops diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_compositeimplicitautograd_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..46872d80ad9b344940c662aac8ea906a77a329ef --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_compositeimplicitautograd_dispatch.h @@ -0,0 +1,24 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor adaptive_avg_pool3d(const at::Tensor & self, at::IntArrayRef output_size); +TORCH_API at::Tensor adaptive_avg_pool3d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size); + +} // namespace compositeimplicitautograd +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_cpu_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..c8d2528bcee029c651ee315d8ae8e3f997ce25b6 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_cpu_dispatch.h @@ -0,0 +1,26 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor & adaptive_avg_pool3d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size); +TORCH_API at::Tensor & adaptive_avg_pool3d_outf(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out); +TORCH_API at::Tensor & adaptive_avg_pool3d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size); +TORCH_API at::Tensor & adaptive_avg_pool3d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, at::Tensor & out); + +} // namespace cpu +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_cuda_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..52f8f7a70f29d0d5da89ca7a2a5f03755c70e579 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_cuda_dispatch.h @@ -0,0 +1,26 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor & adaptive_avg_pool3d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size); +TORCH_API at::Tensor & adaptive_avg_pool3d_outf(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out); +TORCH_API at::Tensor & adaptive_avg_pool3d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size); +TORCH_API at::Tensor & adaptive_avg_pool3d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, at::Tensor & out); + +} // namespace cuda +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_native.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_native.h new file mode 100644 index 0000000000000000000000000000000000000000..0949add8cb6083c204f673e05af0a20eedc0a04f --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_native.h @@ -0,0 +1,24 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor adaptive_avg_pool3d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size); +TORCH_API at::Tensor & adaptive_avg_pool3d_out_cpu(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out); +TORCH_API at::Tensor & adaptive_avg_pool3d_out_cuda(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out); +TORCH_API at::Tensor & adaptive_avg_pool3d_out_quantized_cpu(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out); +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_ops.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..f420ebc5b3597b4e185670c500430dab3a259886 --- /dev/null +++ 
b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_ops.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API adaptive_avg_pool3d_out { + using schema = at::Tensor & (const at::Tensor &, c10::SymIntArrayRef, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::adaptive_avg_pool3d") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "adaptive_avg_pool3d.out(Tensor self, SymInt[3] output_size, *, Tensor(a!) out) -> Tensor(a!)") + static at::Tensor & call(const at::Tensor & self, c10::SymIntArrayRef output_size, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, at::Tensor & out); +}; + +struct TORCH_API adaptive_avg_pool3d { + using schema = at::Tensor (const at::Tensor &, c10::SymIntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::adaptive_avg_pool3d") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "adaptive_avg_pool3d(Tensor self, SymInt[3] output_size) -> Tensor") + static at::Tensor call(const at::Tensor & self, c10::SymIntArrayRef output_size); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size); +}; + +}} // namespace at::_ops diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool1d.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool1d.h new file mode 100644 index 0000000000000000000000000000000000000000..22baccb753eebbd559ce06d4d449482d22d2769c --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool1d.h @@ -0,0 +1,30 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::adaptive_max_pool1d(Tensor self, int[1] output_size) -> (Tensor, Tensor) +inline ::std::tuple adaptive_max_pool1d(const at::Tensor & self, at::IntArrayRef output_size) { + return at::_ops::adaptive_max_pool1d::call(self, output_size); +} + +} diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool1d_compositeimplicitautograd_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool1d_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..4546f00b62879ef7a6b3e241c3e7295bf36f61c9 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool1d_compositeimplicitautograd_dispatch.h @@ -0,0 +1,23 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward 
declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API ::std::tuple adaptive_max_pool1d(const at::Tensor & self, at::IntArrayRef output_size); + +} // namespace compositeimplicitautograd +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool1d_native.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool1d_native.h new file mode 100644 index 0000000000000000000000000000000000000000..b2af9d523f42c94746fca4af685436f4cd6eeffe --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool1d_native.h @@ -0,0 +1,21 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple adaptive_max_pool1d(const at::Tensor & self, at::IntArrayRef output_size); +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool1d_ops.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool1d_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..49a1393a3b436cce4ca8920c0d71ad17271ab39f --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool1d_ops.h @@ -0,0 +1,28 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API adaptive_max_pool1d { + using schema = ::std::tuple (const at::Tensor &, at::IntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::adaptive_max_pool1d") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "adaptive_max_pool1d(Tensor self, int[1] output_size) -> (Tensor, Tensor)") + static ::std::tuple call(const at::Tensor & self, at::IntArrayRef output_size); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size); +}; + +}} // namespace at::_ops diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d.h new file mode 100644 index 0000000000000000000000000000000000000000..d1520b27b55ae9ca3cb71e6854982a6277d4289a --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::adaptive_max_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out, Tensor(b!) 
indices) -> (Tensor(a!), Tensor(b!)) +inline ::std::tuple adaptive_max_pool2d_out(at::Tensor & out, at::Tensor & indices, const at::Tensor & self, at::IntArrayRef output_size) { + return at::_ops::adaptive_max_pool2d_out::call(self, output_size, out, indices); +} +// aten::adaptive_max_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!)) +inline ::std::tuple adaptive_max_pool2d_outf(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out, at::Tensor & indices) { + return at::_ops::adaptive_max_pool2d_out::call(self, output_size, out, indices); +} + +// aten::adaptive_max_pool2d(Tensor self, int[2] output_size) -> (Tensor, Tensor) +inline ::std::tuple adaptive_max_pool2d(const at::Tensor & self, at::IntArrayRef output_size) { + return at::_ops::adaptive_max_pool2d::call(self, output_size); +} + +} diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..af85a8dae17ff557ed665c416058a178a42c93a6 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::adaptive_max_pool2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & adaptive_max_pool2d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices) { + return at::_ops::adaptive_max_pool2d_backward_grad_input::call(grad_output, self, indices, grad_input); +} +// aten::adaptive_max_pool2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & adaptive_max_pool2d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices, at::Tensor & grad_input) { + return at::_ops::adaptive_max_pool2d_backward_grad_input::call(grad_output, self, indices, grad_input); +} + +// aten::adaptive_max_pool2d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor +inline at::Tensor adaptive_max_pool2d_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices) { + return at::_ops::adaptive_max_pool2d_backward::call(grad_output, self, indices); +} + +} diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_compositeexplicitautogradnonfunctional_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..fde38f5cc95032d0abcefdc3587cf86990b78226 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,23 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. 
+// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor adaptive_max_pool2d_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_cpu_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..798c421dbac601d569b02fb7bda45d1c5145c3c5 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_cpu_dispatch.h @@ -0,0 +1,25 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor adaptive_max_pool2d_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices); +TORCH_API at::Tensor & adaptive_max_pool2d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices); +TORCH_API at::Tensor & adaptive_max_pool2d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices, at::Tensor & grad_input); + +} // namespace cpu +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_cuda_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..e176d3228926b817dc8ea7b323f94a19a273e739 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_cuda_dispatch.h @@ -0,0 +1,25 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
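The adaptive_max_pool2d backward entry points declared here take grad_output, self, and the indices tensor produced by the forward pass. A hedged sketch of how the forward and backward calls fit together, assuming libtorch; the helper name, the upstream gradient, and all shapes are illustrative:

// Illustrative only. Assumes <ATen/ATen.h> from a libtorch build.
#include <ATen/ATen.h>
#include <tuple>

void max_pool2d_round_trip() {
  at::Tensor self = at::randn({1, 3, 16, 16});

  // Forward returns (output, indices); the indices tensor is what the
  // backward declared above consumes.
  auto [output, indices] = at::adaptive_max_pool2d(self, {4, 4});

  // A hypothetical upstream gradient with the same shape as `output`.
  at::Tensor grad_output = at::ones_like(output);

  // Functional backward and its grad_input out-variant.
  at::Tensor grad_input = at::adaptive_max_pool2d_backward(grad_output, self, indices);
  at::Tensor grad_buf = at::empty({0});
  at::adaptive_max_pool2d_backward_out(grad_buf, grad_output, self, indices);
}
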
+#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor adaptive_max_pool2d_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices); +TORCH_API at::Tensor & adaptive_max_pool2d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices); +TORCH_API at::Tensor & adaptive_max_pool2d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices, at::Tensor & grad_input); + +} // namespace cuda +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_meta.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..9794ab4c66de52e58f47cc0e08763ea039f58573 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_meta.h @@ -0,0 +1,27 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_adaptive_max_pool2d_backward : public at::impl::MetaBase { + + + void meta(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices); +}; + +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_meta_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..89982f55f7f3227ae11c6f416446f4ef25221fa7 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_meta_dispatch.h @@ -0,0 +1,25 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
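The *_meta.h and *_meta_dispatch.h headers carry the shape/dtype half of the structured kernels. One way to exercise that layer in isolation is to run the op on Meta-device tensors, which allocate no storage; this is a sketch under the assumption of a libtorch build with Meta-device support, and the expected sizes below simply follow from the chosen input shape:

// Illustrative sketch of shape-only evaluation via the Meta device,
// which is one consumer of the structured meta() functions above.
#include <ATen/ATen.h>
#include <vector>

void meta_shape_inference() {
  // No real storage is allocated for meta tensors.
  at::Tensor self = at::empty({2, 3, 32, 32},
                              at::TensorOptions().device(at::kMeta));

  // Shapes and dtypes are computed without running the CPU/CUDA kernels.
  auto [out, indices] = at::adaptive_max_pool2d(self, {7, 7});

  TORCH_CHECK(out.sizes().vec() == std::vector<int64_t>({2, 3, 7, 7}));
  TORCH_CHECK(indices.scalar_type() == at::kLong);
}
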
+#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor adaptive_max_pool2d_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices); +TORCH_API at::Tensor & adaptive_max_pool2d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices); +TORCH_API at::Tensor & adaptive_max_pool2d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices, at::Tensor & grad_input); + +} // namespace meta +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_native.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..3e380439c0ab5a40a66abf4c8ab62af387d8b3c3 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_native.h @@ -0,0 +1,26 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_adaptive_max_pool2d_backward_out_cpu : public at::meta::structured_adaptive_max_pool2d_backward { +void impl(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices, const at::Tensor & grad_input); +}; +struct TORCH_API structured_adaptive_max_pool2d_backward_out_cuda : public at::meta::structured_adaptive_max_pool2d_backward { +void impl(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices, const at::Tensor & grad_input); +}; +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_ops.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..70ec6f1f710852683b88f80053564701593806b3 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_ops.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API adaptive_max_pool2d_backward_grad_input { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::adaptive_max_pool2d_backward") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "grad_input") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "adaptive_max_pool2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, *, Tensor(a!) 
grad_input) -> Tensor(a!)") + static at::Tensor & call(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices, at::Tensor & grad_input); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices, at::Tensor & grad_input); +}; + +struct TORCH_API adaptive_max_pool2d_backward { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::adaptive_max_pool2d_backward") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "adaptive_max_pool2d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor") + static at::Tensor call(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices); +}; + +}} // namespace at::_ops diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_compositeexplicitautogradnonfunctional_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f6a6e5560eff20cd763f28e34c6932ddd48ded3a --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,23 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API ::std::tuple adaptive_max_pool2d(const at::Tensor & self, at::IntArrayRef output_size); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_cpu_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..4d3ee55bcba794e2b8c47da8d94deabed98e1a00 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_cpu_dispatch.h @@ -0,0 +1,25 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
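Each Operator.h header pins one dispatcher schema down as a struct with static `call` and `redispatch` members, plus the schema name and overload strings; the public at:: wrappers shown earlier are thin forwards to `call`. A sketch of that equivalence, assuming libtorch and the ATen/ops include path added by this diff; ordinary user code would stay with the at:: wrapper:

// Illustrative: the at:: wrapper and the _ops struct reach the same kernel.
#include <ATen/ATen.h>
#include <ATen/ops/adaptive_max_pool2d_ops.h>  // at::_ops::adaptive_max_pool2d
#include <tuple>

void call_through_ops_struct() {
  at::Tensor self = at::randn({1, 1, 9, 9});

  // Public wrapper...
  auto a = at::adaptive_max_pool2d(self, {3, 3});
  // ...is a one-line forward to the generated struct's call().
  auto b = at::_ops::adaptive_max_pool2d::call(self, {3, 3});

  TORCH_CHECK(std::get<0>(a).equal(std::get<0>(b)));
}
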
+#include + +namespace at { + +namespace cpu { + +TORCH_API ::std::tuple adaptive_max_pool2d(const at::Tensor & self, at::IntArrayRef output_size); +TORCH_API ::std::tuple adaptive_max_pool2d_out(at::Tensor & out, at::Tensor & indices, const at::Tensor & self, at::IntArrayRef output_size); +TORCH_API ::std::tuple adaptive_max_pool2d_outf(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out, at::Tensor & indices); + +} // namespace cpu +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_cuda_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..e30f25019eefa8d5e58c0ba2a75594470ae62c24 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_cuda_dispatch.h @@ -0,0 +1,25 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::tuple adaptive_max_pool2d(const at::Tensor & self, at::IntArrayRef output_size); +TORCH_API ::std::tuple adaptive_max_pool2d_out(at::Tensor & out, at::Tensor & indices, const at::Tensor & self, at::IntArrayRef output_size); +TORCH_API ::std::tuple adaptive_max_pool2d_outf(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out, at::Tensor & indices); + +} // namespace cuda +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_meta.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..3566130ca13f037f479c13f3e2686a4d43122db6 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_meta.h @@ -0,0 +1,27 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_adaptive_max_pool2d : public at::impl::MetaBase { + + + void meta(const at::Tensor & self, at::IntArrayRef output_size); +}; + +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_meta_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..d9668c4ece2710352a58d9951f04dd1e5409951b --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_meta_dispatch.h @@ -0,0 +1,25 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. 
+// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API ::std::tuple adaptive_max_pool2d(const at::Tensor & self, at::IntArrayRef output_size); +TORCH_API ::std::tuple adaptive_max_pool2d_out(at::Tensor & out, at::Tensor & indices, const at::Tensor & self, at::IntArrayRef output_size); +TORCH_API ::std::tuple adaptive_max_pool2d_outf(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out, at::Tensor & indices); + +} // namespace meta +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_native.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_native.h new file mode 100644 index 0000000000000000000000000000000000000000..7e968a36b208c1d4359a43e992f21e48ddd17d34 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_native.h @@ -0,0 +1,26 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_adaptive_max_pool2d_out_cpu : public at::meta::structured_adaptive_max_pool2d { +void impl(const at::Tensor & self, at::IntArrayRef output_size, const at::Tensor & out, const at::Tensor & indices); +}; +struct TORCH_API structured_adaptive_max_pool2d_out_cuda : public at::meta::structured_adaptive_max_pool2d { +void impl(const at::Tensor & self, at::IntArrayRef output_size, const at::Tensor & out, const at::Tensor & indices); +}; +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_ops.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..3179829d6b41fe74bebfc26e73dffa37e5b4456b --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_ops.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API adaptive_max_pool2d_out { + using schema = ::std::tuple (const at::Tensor &, at::IntArrayRef, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::adaptive_max_pool2d") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "adaptive_max_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out, Tensor(b!) 
indices) -> (Tensor(a!), Tensor(b!))") + static ::std::tuple call(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out, at::Tensor & indices); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out, at::Tensor & indices); +}; + +struct TORCH_API adaptive_max_pool2d { + using schema = ::std::tuple (const at::Tensor &, at::IntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::adaptive_max_pool2d") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "adaptive_max_pool2d(Tensor self, int[2] output_size) -> (Tensor, Tensor)") + static ::std::tuple call(const at::Tensor & self, at::IntArrayRef output_size); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size); +}; + +}} // namespace at::_ops diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d.h new file mode 100644 index 0000000000000000000000000000000000000000..46140dc618b2f84eb8735189167a3142364b5e0a --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::adaptive_max_pool3d.out(Tensor self, int[3] output_size, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!)) +inline ::std::tuple adaptive_max_pool3d_out(at::Tensor & out, at::Tensor & indices, const at::Tensor & self, at::IntArrayRef output_size) { + return at::_ops::adaptive_max_pool3d_out::call(self, output_size, out, indices); +} +// aten::adaptive_max_pool3d.out(Tensor self, int[3] output_size, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!)) +inline ::std::tuple adaptive_max_pool3d_outf(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out, at::Tensor & indices) { + return at::_ops::adaptive_max_pool3d_out::call(self, output_size, out, indices); +} + +// aten::adaptive_max_pool3d(Tensor self, int[3] output_size) -> (Tensor, Tensor) +inline ::std::tuple adaptive_max_pool3d(const at::Tensor & self, at::IntArrayRef output_size) { + return at::_ops::adaptive_max_pool3d::call(self, output_size); +} + +} diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..a937d646f57af67af5c70fccfe8cad3b8a6e44b5 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::adaptive_max_pool3d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!) 
+inline at::Tensor & adaptive_max_pool3d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices) { + return at::_ops::adaptive_max_pool3d_backward_grad_input::call(grad_output, self, indices, grad_input); +} +// aten::adaptive_max_pool3d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & adaptive_max_pool3d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices, at::Tensor & grad_input) { + return at::_ops::adaptive_max_pool3d_backward_grad_input::call(grad_output, self, indices, grad_input); +} + +// aten::adaptive_max_pool3d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor +inline at::Tensor adaptive_max_pool3d_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices) { + return at::_ops::adaptive_max_pool3d_backward::call(grad_output, self, indices); +} + +} diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward_compositeexplicitautogradnonfunctional_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..39370d2febf72e354824f2de2da25812378534d6 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,23 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor adaptive_max_pool3d_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward_cpu_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f82cc66807a723d28089c883a1ef9091344b5d8c --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward_cpu_dispatch.h @@ -0,0 +1,25 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
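The per-backend *_cpu_dispatch.h / *_cuda_dispatch.h headers expose at::cpu:: and at::cuda:: entry points that bypass the dispatcher, so they skip autograd and device handling. A sketch under the assumption of a CPU libtorch build; the helper name and shapes are illustrative, and application code would normally use the dispatched at:: form:

// Illustrative: calling the CPU backend entry point directly.
#include <ATen/ATen.h>
#include <ATen/ops/adaptive_max_pool3d_backward_cpu_dispatch.h>  // at::cpu::

void backend_direct_backward() {
  at::Tensor self = at::randn({1, 2, 8, 8, 8});  // must already be a CPU tensor
  auto [out, indices] = at::adaptive_max_pool3d(self, {2, 2, 2});
  at::Tensor grad_out = at::ones_like(out);

  // Dispatched call vs. direct CPU backend call: for a plain CPU tensor
  // with no autograd involved, both end up in the same kernel.
  at::Tensor g1 = at::adaptive_max_pool3d_backward(grad_out, self, indices);
  at::Tensor g2 = at::cpu::adaptive_max_pool3d_backward(grad_out, self, indices);
  TORCH_CHECK(g1.equal(g2));
}
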
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor adaptive_max_pool3d_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices); +TORCH_API at::Tensor & adaptive_max_pool3d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices); +TORCH_API at::Tensor & adaptive_max_pool3d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices, at::Tensor & grad_input); + +} // namespace cpu +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward_cuda_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..c8e4a5c65488f5d479b869a4e49280c4d9428079 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward_cuda_dispatch.h @@ -0,0 +1,25 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor adaptive_max_pool3d_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices); +TORCH_API at::Tensor & adaptive_max_pool3d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices); +TORCH_API at::Tensor & adaptive_max_pool3d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices, at::Tensor & grad_input); + +} // namespace cuda +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward_meta.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..9a36e644418b0ced5d4c1a099778e85a233c693f --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward_meta.h @@ -0,0 +1,27 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_adaptive_max_pool3d_backward : public at::impl::MetaBase { + + + void meta(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices); +}; + +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward_meta_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ef8b885cbb1c5c98254c2eca5f8539f5e96f0310 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward_meta_dispatch.h @@ -0,0 +1,25 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are 
for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor adaptive_max_pool3d_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices); +TORCH_API at::Tensor & adaptive_max_pool3d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices); +TORCH_API at::Tensor & adaptive_max_pool3d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices, at::Tensor & grad_input); + +} // namespace meta +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward_native.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..caf64b5c5552cdbef6eb731aaa732f19d0731144 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward_native.h @@ -0,0 +1,26 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_adaptive_max_pool3d_backward_out_cpu : public at::meta::structured_adaptive_max_pool3d_backward { +void impl(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices, const at::Tensor & grad_input); +}; +struct TORCH_API structured_adaptive_max_pool3d_backward_out_cuda : public at::meta::structured_adaptive_max_pool3d_backward { +void impl(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices, const at::Tensor & grad_input); +}; +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward_ops.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..a01977617a72331ea121d30ee2dca63bb847780e --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward_ops.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API adaptive_max_pool3d_backward_grad_input { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::adaptive_max_pool3d_backward") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "grad_input") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "adaptive_max_pool3d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, *, Tensor(a!) 
grad_input) -> Tensor(a!)") + static at::Tensor & call(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices, at::Tensor & grad_input); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices, at::Tensor & grad_input); +}; + +struct TORCH_API adaptive_max_pool3d_backward { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::adaptive_max_pool3d_backward") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "adaptive_max_pool3d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor") + static at::Tensor call(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices); +}; + +}} // namespace at::_ops diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_compositeexplicitautogradnonfunctional_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..684cfdeec4c31441721d1e75c50e4b7797773db6 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,23 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API ::std::tuple adaptive_max_pool3d(const at::Tensor & self, at::IntArrayRef output_size); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_cpu_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..41dd65be3684d5febb0eb37d655c37c9f71d17ed --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_cpu_dispatch.h @@ -0,0 +1,25 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
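The out-variants write into caller-provided tensors, resizing them as needed; for the max-pool family the second output is a Long indices tensor. A minimal sketch assuming libtorch; the zero-sized starting buffers are just one convenient choice:

// Illustrative: reusing caller-owned buffers with the out-variant.
// The kernel resizes `out` and `indices` as needed; indices must be Long.
#include <ATen/ATen.h>

void pool3d_into_buffers() {
  at::Tensor self = at::randn({1, 4, 8, 8, 8});
  at::Tensor out = at::empty({0});
  at::Tensor indices = at::empty({0}, at::kLong);

  // out-first (*_out) and out-last (*_outf) spellings of the same op.
  at::adaptive_max_pool3d_out(out, indices, self, {2, 2, 2});
  at::adaptive_max_pool3d_outf(self, {2, 2, 2}, out, indices);
}
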
+#include + +namespace at { + +namespace cpu { + +TORCH_API ::std::tuple adaptive_max_pool3d(const at::Tensor & self, at::IntArrayRef output_size); +TORCH_API ::std::tuple adaptive_max_pool3d_out(at::Tensor & out, at::Tensor & indices, const at::Tensor & self, at::IntArrayRef output_size); +TORCH_API ::std::tuple adaptive_max_pool3d_outf(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out, at::Tensor & indices); + +} // namespace cpu +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_cuda_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..55acbfee68654cf1c7578be8c3b1c9c992329fc2 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_cuda_dispatch.h @@ -0,0 +1,25 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::tuple adaptive_max_pool3d(const at::Tensor & self, at::IntArrayRef output_size); +TORCH_API ::std::tuple adaptive_max_pool3d_out(at::Tensor & out, at::Tensor & indices, const at::Tensor & self, at::IntArrayRef output_size); +TORCH_API ::std::tuple adaptive_max_pool3d_outf(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out, at::Tensor & indices); + +} // namespace cuda +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_meta.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..7f923ba0637757dab45adf74824f2dba850bccfd --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_meta.h @@ -0,0 +1,27 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_adaptive_max_pool3d : public at::impl::MetaBase { + + + void meta(const at::Tensor & self, at::IntArrayRef output_size); +}; + +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_meta_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..aaa79aaa2193efbd7f8d7dcde2865de883604108 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_meta_dispatch.h @@ -0,0 +1,25 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. 
+// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API ::std::tuple adaptive_max_pool3d(const at::Tensor & self, at::IntArrayRef output_size); +TORCH_API ::std::tuple adaptive_max_pool3d_out(at::Tensor & out, at::Tensor & indices, const at::Tensor & self, at::IntArrayRef output_size); +TORCH_API ::std::tuple adaptive_max_pool3d_outf(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out, at::Tensor & indices); + +} // namespace meta +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_native.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_native.h new file mode 100644 index 0000000000000000000000000000000000000000..80421c103dbd626a591c9d9b2b9177cefe52869d --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_native.h @@ -0,0 +1,26 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_adaptive_max_pool3d_out_cpu : public at::meta::structured_adaptive_max_pool3d { +void impl(const at::Tensor & self, at::IntArrayRef output_size, const at::Tensor & out, const at::Tensor & indices); +}; +struct TORCH_API structured_adaptive_max_pool3d_out_cuda : public at::meta::structured_adaptive_max_pool3d { +void impl(const at::Tensor & self, at::IntArrayRef output_size, const at::Tensor & out, const at::Tensor & indices); +}; +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_ops.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..214ae900afcaf832a50361120a28cdd160dd254f --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_ops.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API adaptive_max_pool3d_out { + using schema = ::std::tuple (const at::Tensor &, at::IntArrayRef, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::adaptive_max_pool3d") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "adaptive_max_pool3d.out(Tensor self, int[3] output_size, *, Tensor(a!) out, Tensor(b!) 
indices) -> (Tensor(a!), Tensor(b!))") + static ::std::tuple call(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out, at::Tensor & indices); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out, at::Tensor & indices); +}; + +struct TORCH_API adaptive_max_pool3d { + using schema = ::std::tuple (const at::Tensor &, at::IntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::adaptive_max_pool3d") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "adaptive_max_pool3d(Tensor self, int[3] output_size) -> (Tensor, Tensor)") + static ::std::tuple call(const at::Tensor & self, at::IntArrayRef output_size); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size); +}; + +}} // namespace at::_ops diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/add.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/add.h new file mode 100644 index 0000000000000000000000000000000000000000..da1de6d07e5548f142fa2131f28d5c11af45d5a1 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/add.h @@ -0,0 +1,53 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor +inline at::Tensor add(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1) { + return at::_ops::add_Tensor::call(self, other, alpha); +} + +// aten::add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & add_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1) { + return at::_ops::add_out::call(self, other, alpha, out); +} +// aten::add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & add_outf(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha, at::Tensor & out) { + return at::_ops::add_out::call(self, other, alpha, out); +} + +// aten::add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor +inline at::Tensor add(const at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha=1) { + return at::_ops::add_Scalar::call(self, other, alpha); +} + +// aten::add.Scalar_out(Tensor self, Scalar other, Scalar alpha=1, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & add_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha=1) { + return at::_ops::add_Scalar_out::call(self, other, alpha, out); +} +// aten::add.Scalar_out(Tensor self, Scalar other, Scalar alpha=1, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & add_outf(const at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha, at::Tensor & out) { + return at::_ops::add_Scalar_out::call(self, other, alpha, out); +} + +} diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/addbmm.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/addbmm.h new file mode 100644 index 0000000000000000000000000000000000000000..6f74a0f869fb9afb99cd25ebbfb6d248449b32fe --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/addbmm.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::addbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & addbmm_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta=1, const at::Scalar & alpha=1) { + return at::_ops::addbmm_out::call(self, batch1, batch2, beta, alpha, out); +} +// aten::addbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & addbmm_outf(const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out) { + return at::_ops::addbmm_out::call(self, batch1, batch2, beta, alpha, out); +} + +// aten::addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor +inline at::Tensor addbmm(const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta=1, const at::Scalar & alpha=1) { + return at::_ops::addbmm::call(self, batch1, batch2, beta, alpha); +} + +} diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/addbmm_cpu_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/addbmm_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..4b79e23aeeac7d2fb9d957d8e9129715f6fdd1f0 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/addbmm_cpu_dispatch.h @@ -0,0 +1,26 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
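// Usage sketch (not part of the generated Function.h headers above): the add.h
// declarations show the two out-variant conventions used throughout ATen --
// add_out takes the destination first and keeps defaulted trailing arguments,
// while add_outf mirrors the schema exactly, with the out tensor last and no
// defaults. A minimal check, assuming a standard libtorch build:

#include <ATen/ATen.h>

int main() {
  at::Tensor a = at::ones({2, 2});
  at::Tensor b = at::full({2, 2}, 3.0);

  // Functional form: returns a new tensor; alpha defaults to 1.
  at::Tensor c = at::add(a, b, /*alpha=*/2);   // a + 2 * b

  // "out" form: destination first.
  at::Tensor out = at::empty({2, 2});
  at::add_out(out, a, b, /*alpha=*/2);

  // "_outf" form: same overload, out tensor last, no defaulted arguments.
  at::add_outf(a, b, /*alpha=*/2, out);

  return at::allclose(c, out) ? 0 : 1;
}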
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor addbmm(const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta=1, const at::Scalar & alpha=1); +TORCH_API at::Tensor & addbmm_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta=1, const at::Scalar & alpha=1); +TORCH_API at::Tensor & addbmm_outf(const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out); +TORCH_API at::Tensor & addbmm_(at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta=1, const at::Scalar & alpha=1); + +} // namespace cpu +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/addbmm_cuda_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/addbmm_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..6bc7a7c0680d6278569997b0cba0cb28a265e6de --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/addbmm_cuda_dispatch.h @@ -0,0 +1,26 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor addbmm(const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta=1, const at::Scalar & alpha=1); +TORCH_API at::Tensor & addbmm_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta=1, const at::Scalar & alpha=1); +TORCH_API at::Tensor & addbmm_outf(const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out); +TORCH_API at::Tensor & addbmm_(at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta=1, const at::Scalar & alpha=1); + +} // namespace cuda +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/addbmm_meta_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/addbmm_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..6452de7679774bcddb9012c48260bb5e3beea5f7 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/addbmm_meta_dispatch.h @@ -0,0 +1,23 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
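// Usage sketch (not part of the generated dispatch headers above): per the
// aten::addbmm schema, the op computes beta * self + alpha * (the sum over the
// batch of batch1[b] @ batch2[b]). A small equivalence check, assuming a
// standard libtorch build:

#include <ATen/ATen.h>

int main() {
  at::Tensor self   = at::randn({3, 5});
  at::Tensor batch1 = at::randn({4, 3, 2});
  at::Tensor batch2 = at::randn({4, 2, 5});

  at::Tensor fused = at::addbmm(self, batch1, batch2, /*beta=*/1, /*alpha=*/2);

  // Reference built directly from the schema: accumulate alpha * (batch1[b] @ batch2[b]).
  at::Tensor reference = self.clone();
  for (int64_t b = 0; b < batch1.size(0); ++b) {
    reference += 2 * at::mm(batch1[b], batch2[b]);
  }

  return at::allclose(fused, reference) ? 0 : 1;
}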
+#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor & addbmm_(at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta=1, const at::Scalar & alpha=1); + +} // namespace meta +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/addbmm_native.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/addbmm_native.h new file mode 100644 index 0000000000000000000000000000000000000000..6e50590419df3b2f79326e3abfd795b533946bdb --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/addbmm_native.h @@ -0,0 +1,23 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor addbmm(const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta=1, const at::Scalar & alpha=1); +TORCH_API at::Tensor & addbmm_out(const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out); +TORCH_API at::Tensor & addbmm_(at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta=1, const at::Scalar & alpha=1); +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/addbmm_ops.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/addbmm_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..dc14be4b1a3af8207594ae24ca6091d25f824f88 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/addbmm_ops.h @@ -0,0 +1,50 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API addbmm_ { + using schema = at::Tensor & (at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Scalar &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::addbmm_") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "addbmm_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)") + static at::Tensor & call(at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta, const at::Scalar & alpha); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta, const at::Scalar & alpha); +}; + +struct TORCH_API addbmm_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Scalar &, const at::Scalar &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::addbmm") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "addbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) 
out) -> Tensor(a!)") + static at::Tensor & call(const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out); +}; + +struct TORCH_API addbmm { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Scalar &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::addbmm") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor") + static at::Tensor call(const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta, const at::Scalar & alpha); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta, const at::Scalar & alpha); +}; + +}} // namespace at::_ops diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/addcdiv.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/addcdiv.h new file mode 100644 index 0000000000000000000000000000000000000000..f23b44aa2f23959d3f1cb3034a20cbed7ba91ad6 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/addcdiv.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::addcdiv.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & addcdiv_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1) { + return at::_ops::addcdiv_out::call(self, tensor1, tensor2, value, out); +} +// aten::addcdiv.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & addcdiv_outf(const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value, at::Tensor & out) { + return at::_ops::addcdiv_out::call(self, tensor1, tensor2, value, out); +} + +// aten::addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor +inline at::Tensor addcdiv(const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1) { + return at::_ops::addcdiv::call(self, tensor1, tensor2, value); +} + +} diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/addcdiv_compositeexplicitautogradnonfunctional_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/addcdiv_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..5c772d2c64dcd37674988ebfc110fbf642eb9f63 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/addcdiv_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,24 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor addcdiv(const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1); +TORCH_API at::Tensor & addcdiv_(at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/addcdiv_cpu_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/addcdiv_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..67600a3ac708466a3b8551bbc0254ae3ca7a4e6f --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/addcdiv_cpu_dispatch.h @@ -0,0 +1,26 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor addcdiv(const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1); +TORCH_API at::Tensor & addcdiv_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1); +TORCH_API at::Tensor & addcdiv_outf(const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value, at::Tensor & out); +TORCH_API at::Tensor & addcdiv_(at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1); + +} // namespace cpu +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/addcdiv_cuda_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/addcdiv_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..85268d589375de645afc8e2cdb4f83b2180be8a6 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/addcdiv_cuda_dispatch.h @@ -0,0 +1,26 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor addcdiv(const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1); +TORCH_API at::Tensor & addcdiv_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1); +TORCH_API at::Tensor & addcdiv_outf(const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value, at::Tensor & out); +TORCH_API at::Tensor & addcdiv_(at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1); + +} // namespace cuda +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/addcdiv_meta.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/addcdiv_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..76775529f6e3750339f0c032f77312346ed2dc22 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/addcdiv_meta.h @@ -0,0 +1,27 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_addcdiv : public TensorIteratorBase { + + + void meta(const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value); +}; + +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/addcdiv_meta_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/addcdiv_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..1b1dca2d129367c1303657d280f40b69719ac5f7 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/addcdiv_meta_dispatch.h @@ -0,0 +1,26 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ 
file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor addcdiv(const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1); +TORCH_API at::Tensor & addcdiv_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1); +TORCH_API at::Tensor & addcdiv_outf(const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value, at::Tensor & out); +TORCH_API at::Tensor & addcdiv_(at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1); + +} // namespace meta +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/addcdiv_native.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/addcdiv_native.h new file mode 100644 index 0000000000000000000000000000000000000000..f6ef52a6ca95add2c4609395220a645c05ebb468 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/addcdiv_native.h @@ -0,0 +1,23 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_addcdiv_out : public at::meta::structured_addcdiv { +void impl(const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value, const at::Tensor & out); +}; +} // namespace native +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/addcdiv_ops.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/addcdiv_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..fe9b02622035b42513306066fb9e4b901bf4aa37 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/addcdiv_ops.h @@ -0,0 +1,50 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API addcdiv_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Scalar &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::addcdiv") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "addcdiv.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) 
out) -> Tensor(a!)") + static at::Tensor & call(const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value, at::Tensor & out); +}; + +struct TORCH_API addcdiv { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::addcdiv") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor") + static at::Tensor call(const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value); +}; + +struct TORCH_API addcdiv_ { + using schema = at::Tensor & (at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::addcdiv_") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "addcdiv_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!)") + static at::Tensor & call(at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value); +}; + +}} // namespace at::_ops diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/addcmul.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/addcmul.h new file mode 100644 index 0000000000000000000000000000000000000000..9d35481d8b78ee36ff9a60a5780394651f8fc65b --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/addcmul.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::addcmul.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & addcmul_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1) { + return at::_ops::addcmul_out::call(self, tensor1, tensor2, value, out); +} +// aten::addcmul.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & addcmul_outf(const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value, at::Tensor & out) { + return at::_ops::addcmul_out::call(self, tensor1, tensor2, value, out); +} + +// aten::addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor +inline at::Tensor addcmul(const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1) { + return at::_ops::addcmul::call(self, tensor1, tensor2, value); +} + +} diff --git a/MLPY/Lib/site-packages/torch/include/ATen/ops/addcmul_compositeexplicitautogradnonfunctional_dispatch.h b/MLPY/Lib/site-packages/torch/include/ATen/ops/addcmul_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..3c97b1dfdafa251d1f1ea3440bf0d62036b68bbf --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/ops/addcmul_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,24 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor addcmul(const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1); +TORCH_API at::Tensor & addcmul_(at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/quantized/QTensorImpl.h b/MLPY/Lib/site-packages/torch/include/ATen/quantized/QTensorImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..cfe4bea80c14ad8d19a987c887e935f4709e6488 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/quantized/QTensorImpl.h @@ -0,0 +1,125 @@ +#pragma once + +#include +#include +#include + +namespace at { + +/** + * QTensorImpl is a TensorImpl for Quantized Tensors, it stores Quantizer which + * specifies the quantization scheme and parameters, for more information please + * see ATen/quantized/Quantizer.h + * + * We'll use QTensor in code or documentation to refer to a Tensor with QTensorImpl. + */ +struct TORCH_API QTensorImpl : public c10::TensorImpl { + public: + QTensorImpl( + Storage&& storage, + DispatchKeySet key_set, + const caffe2::TypeMeta data_type, + QuantizerPtr quantizer); + + // See Note [Enum ImplType] + QTensorImpl( + ImplType type, + Storage&& storage, + DispatchKeySet key_set, + const caffe2::TypeMeta data_type, + QuantizerPtr quantizer); + + + // TODO: Expose in PyTorch Frontend + QuantizerPtr quantizer() { + return quantizer_; + } + + void set_quantizer_(QuantizerPtr quantizer) { + quantizer_ = quantizer; + } + + /** + * Return a TensorImpl that is a shallow-copy of this TensorImpl. + * + * For usage of `version_counter` and `allow_tensor_metadata_change`, + * see NOTE [ TensorImpl Shallow-Copying ]. 
+ */ + c10::intrusive_ptr shallow_copy_and_detach( + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change) const override { + auto impl = c10::make_intrusive( + Storage(storage()), key_set(), data_type_, quantizer_); + copy_tensor_metadata( + /*src_impl=*/this, + /*dest_impl=*/impl.get(), + /*version_counter=*/version_counter, + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); + impl->refresh_numel(); + impl->refresh_contiguous(); + return impl; + } + + /** + * Return a TensorImpl that is a shallow-copy of this TensorImpl. + * + * For usage of `version_counter` and `allow_tensor_metadata_change`, + * see NOTE [ TensorImpl Shallow-Copying ]. + */ + c10::intrusive_ptr shallow_copy_and_detach( + c10::VariableVersion&& version_counter, + bool allow_tensor_metadata_change) const override { + auto impl = c10::make_intrusive( + Storage(storage()), key_set(), data_type_, quantizer_); + copy_tensor_metadata( + /*src_impl=*/this, + /*dest_impl=*/impl.get(), + /*version_counter=*/std::move(version_counter), + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); + impl->refresh_numel(); + impl->refresh_contiguous(); + return impl; + } + + /** + * Shallow-copies data from another TensorImpl into this TensorImpl. + * + * For why this function doesn't check this TensorImpl's `allow_tensor_metadata_change_`, + * see NOTE [ TensorImpl Shallow-Copying ]. + */ + void shallow_copy_from(const c10::intrusive_ptr& impl) override { + AT_ASSERT(has_compatible_shallow_copy_type(impl->key_set())); + auto q_impl = static_cast(impl.get()); + copy_tensor_metadata( + /*src_impl=*/q_impl, + /*dest_impl=*/this, + /*version_counter=*/version_counter(), + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change()); + refresh_numel(); + refresh_contiguous(); + } + + private: + QuantizerPtr quantizer_; + + const char* tensorimpl_type_name() const override; + + /** + * Copy the tensor metadata fields (e.g. sizes / strides / storage pointer / storage_offset) + * from one TensorImpl to another TensorImpl. + * + * For usage of `version_counter` and `allow_tensor_metadata_change`, see NOTE [ TensorImpl Shallow-Copying ]. + */ + static void copy_tensor_metadata( + const QTensorImpl* src_q_impl, + QTensorImpl* dest_q_impl, + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change) { + TensorImpl::copy_tensor_metadata(src_q_impl, dest_q_impl, version_counter, allow_tensor_metadata_change); + + // OpaqueTensorImpl-specific fields. + dest_q_impl->quantizer_ = src_q_impl->quantizer_; + } +}; + +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/quantized/Quantizer.h b/MLPY/Lib/site-packages/torch/include/ATen/quantized/Quantizer.h new file mode 100644 index 0000000000000000000000000000000000000000..92e7bb6844f5a1e010174ad37c7a9f8928392e6a --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/quantized/Quantizer.h @@ -0,0 +1,279 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include + +namespace at { + +/** + * UnknownQuantizer is a placeholder quantizer for functions that implement + * quantization in a two step process. First a tensor is allocated but with + * unknown quantizer, and then the quantization kernel decides what the final + * quantizer will be. 
+ */ +struct TORCH_API UnknownQuantizer : public Quantizer { + explicit UnknownQuantizer(ScalarType scalar_type) + : Quantizer(scalar_type) {} + + Tensor quantize(const Tensor& tensor) override; + Tensor dequantize(const Tensor& qtensor) override; + Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override; + QScheme qscheme() const override; + bool equalTo(QuantizerPtr other) const override; +}; + +/** + * UniformQuantizer is the parent class for all uniform quantizers. + * These quantization scheme will map float value uniformly to + * the quantized value. For example, affine quantizer is + * the most commonly used scheme in this category. + */ +struct TORCH_API UniformQuantizer : public Quantizer { + explicit UniformQuantizer(ScalarType scalar_type) : Quantizer(scalar_type) {} +}; + +/** + * NonUniformQuantizer is the parent class for all non-uniform quantizers. + * These quantization scheme may map float value non-uniformly to the quantized + * value. K-means quantization is a representative example in this category. + */ +struct TORCH_API NonUniformQuantizer : public Quantizer { + explicit NonUniformQuantizer(ScalarType scalar_type) : Quantizer(scalar_type) {} +}; + +// There is also StochasticQuantizer which is uniform but not affine + +/** + * AffineQuantizer uses affine transformation to do quantization. + * + * For quantize: + * Y = clamp(round(X / scale + zero_point), min, max) + * For dequantize: + * X = (Y - zero_point) * scale + */ +struct TORCH_API AffineQuantizer : public UniformQuantizer { + explicit AffineQuantizer(ScalarType scalar_type) : UniformQuantizer(scalar_type) {} +}; + +// Note that we will not have Symmetric Quantizer in backend to reduce +// complications in quantized kernel implementation. + +/** + * PerTensorAffineQuantizer stores a scale and a zero_point, which is used for + * all the values in the Tensor. + */ +struct TORCH_API PerTensorAffineQuantizer : public AffineQuantizer { + explicit PerTensorAffineQuantizer(ScalarType scalar_type, double scale, int64_t zero_point) + : AffineQuantizer(scalar_type), + scale_(scale), + zero_point_(zero_point) {} + + Tensor quantize(const Tensor& tensor) override; + Tensor dequantize(const Tensor& qtensor) override; + Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override; + + QScheme qscheme() const override { + return kPerTensorAffine; + } + + double scale() const { + return scale_; + } + + int64_t zero_point() const { + return zero_point_; + } + + bool equalTo(QuantizerPtr other) const override { + if (!other.get() || other->qscheme() != kPerTensorAffine) { + return false; + } + auto* other_per_tensor_affine = + static_cast(other.get()); + return scalar_type() == other_per_tensor_affine->scalar_type() && + scale() == other_per_tensor_affine->scale() && + zero_point() == other_per_tensor_affine->zero_point(); + } + + private: + const double scale_; + // We use int64_t for consistency with Python + const int64_t zero_point_; +}; + +/** + * PerChannelAffineQuantizer is the same as PerTensorAffineQuantizer + * except that we have an independent scale and zero_point parameter + * for each channel. + * + * Also note that per channel quantization is mostly applied to output channels + * of weights since per-input channel of weight quantization or per-channel + * quantization for activations can't be efficiently supported in most of + * processors since it requires each multiplication result within a single + * dot-product to have a different scale. 
+ */ +struct TORCH_API PerChannelAffineQuantizer : public AffineQuantizer { + explicit PerChannelAffineQuantizer( + ScalarType scalar_type, + Tensor scales, + Tensor zero_points, + int64_t axis) + : AffineQuantizer(scalar_type), + scales_(std::move(scales)), + zero_points_(std::move(zero_points)), + axis_(axis) {} + + QScheme qscheme() const override { + return kPerChannelAffine; + } + + Tensor scales() const { + return scales_; + } + + Tensor zero_points() const { + return zero_points_; + } + + int64_t axis() const { + return axis_; + } + + Tensor quantize(const Tensor& tensor) override; + Tensor dequantize(const Tensor& qtensor) override; + Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override; + + bool equalTo(QuantizerPtr other) const override { + if (!other.get() || other->qscheme() != kPerChannelAffine) { + return false; + } + auto* other_per_channel_affine = + static_cast(other.get()); + return scalar_type() == other_per_channel_affine->scalar_type() && + scales().equal(other_per_channel_affine->scales()) && + zero_points().equal(other_per_channel_affine->zero_points()) && + axis() == other_per_channel_affine->axis(); + } + + protected: + Tensor scales_; + Tensor zero_points_; + const int64_t axis_; +}; + +/** + * PerChannelAffineFloatQParamsQuantizer is the same as PerChannelAffineQuantizer + * except that it expects both scale and zero point to be floating point values. + * + * This quantizer uses the kPerChannelAffineFloatQParams qscheme which is a variant of + * kPerChannelAffine. + * + * The quantize equation in this case looks like - + * Xq = (Xf - zero_point) * inv_scale, where inv_scale = 1.0/scale + * + * Note: Usage of floating point zero point is useful in cases where 0 doesn't need to + * be exactly represented in the quantized space. We can get additional precision by + * using floating point values for zero point. + */ +struct TORCH_API PerChannelAffineFloatQParamsQuantizer : public PerChannelAffineQuantizer { + explicit PerChannelAffineFloatQParamsQuantizer( + ScalarType scalar_type, + Tensor scales, + Tensor zero_points, + int64_t axis) + : PerChannelAffineQuantizer(scalar_type, + scales, + zero_points, + axis) {} + + QScheme qscheme() const override { + return kPerChannelAffineFloatQParams; + } + + Tensor quantize(const Tensor& tensor) override; + Tensor dequantize(const Tensor& qtensor) override; + Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override; + + bool equalTo(QuantizerPtr other) const override { + if (!other.get() || other->qscheme() != kPerChannelAffineFloatQParams) { + return false; + } + auto* other_per_channel_float_qparams = + static_cast(other.get()); + return scalar_type() == other_per_channel_float_qparams->scalar_type() && + scales().equal(other_per_channel_float_qparams->scales()) && + zero_points().equal(other_per_channel_float_qparams->zero_points()) && + axis() == other_per_channel_float_qparams->axis(); + } +}; + +// This is an internal utility function for getting at the QTensorImpl, +// You should only use this for writing low level +// setters/getters for QTensorImpl fields; otherwise, you should use +// the low level setters/getters that were implemented using this. +// This may be called repeatedly, so make sure it's pretty cheap. 
+TORCH_API QTensorImpl* get_qtensorimpl(const TensorBase& self); + +// double and int64_t are because of the native function API, we only have these +// argument types right now in native functions +TORCH_API QuantizerPtr +make_per_tensor_affine_quantizer( + double scale, int64_t zero_point, ScalarType scalar_type); + +TORCH_API QuantizerPtr make_per_channel_affine_quantizer( + const Tensor& scales, + const Tensor& zero_points, + int64_t axis, + ScalarType scalar_type); + +TORCH_API QuantizerPtr make_unknown_quantizer(ScalarType scalar_type); + +// Create a Quantized Tensor given arguments for normal Tensor and a quantizer +TORCH_API Tensor new_qtensor( + IntArrayRef sizes, + const TensorOptions& options, + QuantizerPtr quantizer); + +TORCH_API void set_quantizer_(const Tensor& self, ConstQuantizerPtr quantizer); + +TORCH_API Tensor from_blob_quantized_per_tensor_affine( + void* data, + IntArrayRef sizes, + IntArrayRef strides, + std::function deleter, + const float scale, + const int64_t zeroPoint, + const TensorOptions& options); + +TORCH_API Tensor from_blob_quantized_per_tensor_affine( + void* data, + IntArrayRef sizes, + std::function deleter, + const float scale, + const int64_t zeroPoint, + const TensorOptions& options); + +TORCH_API Tensor from_blob_quantized_per_channel_affine( + void* data, + IntArrayRef sizes, + std::function deleter, + const Tensor& scales, + const Tensor& zero_points, + const int64_t axis, + const TensorOptions& options); + +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/ATen/record_function.h b/MLPY/Lib/site-packages/torch/include/ATen/record_function.h new file mode 100644 index 0000000000000000000000000000000000000000..6aa5b67766b3c6c3be921bcf69a8c74098d682f2 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/ATen/record_function.h @@ -0,0 +1,740 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace c10 { +class TORCH_API OperatorHandle; +} + +namespace at { + +// Function name to record NCCL metadata +extern TORCH_API const std::string kParamCommsCallName; + +// Kind of record function scope; +enum class C10_API_ENUM RecordScope : uint8_t { + // c10/ATen ops, autograd nodes + FUNCTION = 0, + // Functions/nodes called from the autograd + BACKWARD_FUNCTION, + // TorchScript functions, methods + TORCHSCRIPT_FUNCTION, + // Kernel Function dtype Tag + KERNEL_FUNCTION_DTYPE, + // Torchbind custom class, + CUSTOM_CLASS, + // Generic Build Feature + BUILD_FEATURE, + // Kernel Function dtype Tag + LITE_INTERPRETER, + // User defined scope (e.g. 
with record_function()) + USER_SCOPE, + // Scopes for static runtime, a specialized TorchScript interpreter + STATIC_RUNTIME_OP, + STATIC_RUNTIME_MODEL, + NUM_SCOPES, // must be the last in the list +}; + +} // namespace at + +namespace std { +template <> +struct hash { + size_t operator()(const at::RecordScope& sc) const { + return static_cast(sc); + } +}; +} // namespace std + +namespace at { + +struct TORCH_API StringView { + StringView() : StringView(nullptr) {} + explicit StringView(const char* str_ptr) + : owned_str_ptr_(nullptr), str_ptr_(str_ptr) {} + explicit StringView(std::string str) + : owned_str_ptr_(std::make_shared(std::move(str))), + str_ptr_(owned_str_ptr_->c_str()) {} + + const char* str() const { + return str_ptr_; + } + + friend std::ostream& operator<<(std::ostream& os, const StringView& dt) { + os << dt.str(); + return os; + } + + friend bool operator==(const StringView& lhs, const StringView& rhs) { + return strcmp(lhs.str(), rhs.str()) == 0; + } + + friend bool operator!=(const StringView& lhs, const StringView& rhs) { + return !(lhs == rhs); + } + + private: + std::shared_ptr owned_str_ptr_; + const char* str_ptr_; +}; + +// Soft limit on the number of callbacks to use; +constexpr std::size_t kSoftLimitCallbacks = 4; + +// An abstract base class for various observer contexts that can be attached to +// the RecordFunction. +struct ObserverContext { + virtual ~ObserverContext() = default; + + protected: + ObserverContext() = default; +}; + +typedef c10::SmallVector CallbackHandles; +typedef c10::SmallVector, kSoftLimitCallbacks> + ObserverContextList; +typedef uint64_t RecordFunctionHandle; +struct RecordFunction; + +// +// PyTorch callbacks/observers API: +// + +/** + * RecordFunctionCallback represents a pair of callbacks to be used with + * RecordFunction, members: + * start, end - the callbacks to run when entering and exiting the scope; + * optionally, the start callback may return an ObserverContext which will + * be passed to the end callback, use appropriate constructor accordingly. + * needs_inputs - whether the callbacks need the inputs passed from the + * observed function/range; NOTE: passing the inputs incurs an additional + * overhead; sampling_probability - if not 1.0, then the callback is + * probabilistically sampled to run; NOTE: start and end callbacks always run as + * a pair and are sampled together; scopes - types of scopes to execute the + * callbacks on (see RecordScope); passing empty set means the callbacks will be + * executed for all possible scope types should_run - optional function that + * returns whether this callback should run; overwrites the effect of setting + * sampling_probability + */ +class TORCH_API RecordFunctionCallback { + public: + using StartCallback = + std::unique_ptr (*)(const RecordFunction&); + using EndCallback = void (*)(const RecordFunction&, ObserverContext*); + + // This interface supports observers that require passing an ObserverContext + // between start and end callbacks. 
+ explicit RecordFunctionCallback( + StartCallback start, + EndCallback end = nullptr) + : start_(start), end_(end) { + scopes_.fill(true); + } + + RecordFunctionCallback& needsInputs(bool needs_inputs) { + needs_inputs_ = needs_inputs; + return *this; + } + + RecordFunctionCallback& needsOutputs(bool needs_outputs) { + needs_outputs_ = needs_outputs; + return *this; + } + + RecordFunctionCallback& needsIds(bool needs_ids) { + needs_ids_ = needs_ids; + return *this; + } + + RecordFunctionCallback& samplingProb(double sampling_prob) { + TORCH_CHECK( + sampling_prob >= 0.0 && sampling_prob <= 1.0, + "Invalid sampling probability"); + sampling_prob_ = sampling_prob; + return *this; + } + + RecordFunctionCallback& scopes( + const std::unordered_set>& scopes) { + if (!scopes.empty()) { + scopes_.fill(false); + for (auto sc : scopes) { + scopes_[static_cast(sc)] = true; + } + } else { + scopes_.fill(true); + } + return *this; + } + + bool needsInputs() const { + return needs_inputs_; + } + + bool needsOutputs() const { + return needs_outputs_; + } + + bool needsIds() const { + return needs_ids_; + } + + double samplingProb() const { + return sampling_prob_; + } + + bool checkScope(RecordScope sc) const { + return scopes_[(size_t)sc]; + } + + StartCallback start() const { + return start_; + } + + EndCallback end() const { + return end_; + } + + private: + StartCallback start_; + EndCallback end_; + double sampling_prob_ = 1.0; + std::array(RecordScope::NUM_SCOPES)> scopes_ = {}; + bool needs_inputs_ = false; + bool needs_outputs_ = false; + bool needs_ids_ = false; +}; + +// Notes: +// - two types of callbacks are provided: thread local and global +// - thread local callbacks are added/removed only for the given thread +// and are stored locally for each thread and separately from the list +// of the global callbacks +// - global callbacks are stored in a single per process list and are +// invoked by every RecordFunction, in addition to the thread local +// callbacks specific to the given thread +// - we allow the added callbacks to be sampled, by specifying a sampling +// probability for each callback pair, if the start callback is +// not picked to run, the corresponding end callback won't be called +// - a typical use case for the global callbacks is passive monitoring +// in the background (e.g. fleet-wide monitoring), without focusing on +// the specific piece of code +// - in contrast, thread local callbacks are enabled locally, on demand, +// for the specific piece of code (range) and are not sampled +// - a typical use case for thread local callbacks is profiler and code +// execution tracer +// - note, thread local callbacks are automatically propagated with +// ThreadLocalState across JIT continuations and async tasks (at::launch) + +typedef uint64_t CallbackHandle; + +constexpr CallbackHandle INVALID_CALLBACK_HANDLE{0}; + +// It is unnecessary to use atomic operations for enabling +// thread-local function callbacks. Moreover, it prevents saving to +// ThreadLocalState because std::atomic is non-copyable. +struct RecordFunctionCallbacksEntry { + RecordFunctionCallbacksEntry(RecordFunctionCallback cb, CallbackHandle h) + : callback_(cb), handle_(h) {} + + RecordFunctionCallback callback_; + bool enabled_{true}; + CallbackHandle handle_; +}; + +// Holds pairs (callbacks, unique_id) +using RecordFunctionCallbacks = std::vector; + +// Generated by the callback managers to determine which functions to run. 
+struct StepCallbacks { + StepCallbacks() = default; + StepCallbacks(uint64_t thread_id, RecordScope scope) + : thread_id_{thread_id}, scope_{scope} {} + + bool empty() const { + return callbacks_.empty(); + } + + struct StartEndPair { + RecordFunctionCallback::StartCallback start_; + RecordFunctionCallback::EndCallback end_; + }; + + using StartEndPairs = c10::SmallVector; + + StartEndPairs callbacks_; + uint64_t thread_id_{0}; + RecordScope scope_{RecordScope::FUNCTION}; + bool needs_inputs_{false}; + bool needs_outputs_{false}; + bool needs_ids_{false}; +}; + +struct TORCH_API RecordFunction { + // Default constructor is used with before function called afterwards: + // scope - record scope that this function tracks + // pre_sampled - whether this RecordFunction was already pre-sampled with + // kLowProb probability + explicit RecordFunction(RecordScope scope = RecordScope::FUNCTION); + explicit RecordFunction(StepCallbacks&& step_callbacks); + + template + void before( + F fn, + c10::ArrayRef args, + int64_t current_sequence_nr = -1) { + if (!isActive()) { + return; + } + inputs_ = args; + before(fn, current_sequence_nr); + } + + template + void before( + F fn, + const std::vector* args, + int64_t current_sequence_nr = -1) { + before( + std::move(fn), + c10::ArrayRef(args->data(), args->size()), + current_sequence_nr); + } + + // Destructor calls end callbacks + virtual ~RecordFunction(); + + RecordFunction(const RecordFunction&) = delete; + RecordFunction& operator=(const RecordFunction&) = delete; + + const char* name() const; + + int64_t seqNr() const { + return sequence_nr_; + } + + c10::ArrayRef inputs() const { +#ifndef NDEBUG + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + inputs_valid_, "Called inputs() outside RecordFunction start callback"); +#endif + return inputs_; + } + + const std::vector& outputs() const { + return outputs_; + } + + void setOutputs(std::vector&& outputs) { + outputs_ = std::move(outputs); + } + + void setOutputs(c10::ArrayRef outputs) { + outputs_ = outputs.vec(); + } + + size_t num_inputs() const; + size_t num_outputs() const; + + // Retrieves the thread_id that this RecordFunction ran start callbacks with. + // Useful for writing thread safe end callbacks that may be potentially + // executed in a different thread (async ops) + uint64_t threadId() const { + return step_callbacks_.thread_id_; + } + + // For backward functions - thread id of the corresponding forward function, + // or zero otherwise; + // used alongside with sequence number to correlate backward functions with + // the forward ones + uint64_t forwardThreadId() const { + return fwd_thread_id_; + } + + void setForwardThreadId(uint64_t thread_id) { + fwd_thread_id_ = thread_id; + } + + RecordScope scope() const { + return step_callbacks_.scope_; + } + + // Returns logical thread_id for the current thread + static uint64_t currentThreadId(); + + // Internal functions, do not use directly; + // used in python's context manager + + // before functions initialize RecordFunction members and call + // start callbacks + using schema_ref_t = std::reference_wrapper; + void before(const char* name, int64_t sequence_nr = -1); + void before(std::string name, int64_t sequence_nr = -1); + void before(schema_ref_t schema, int64_t sequence_nr = -1); + + // Sets node ID for distributed profiling + static void setDefaultNodeId(int64_t defaultNodeId); + // Gets node ID for distributed profiling + static int64_t getDefaultNodeId(); + + // Calls end callbacks. 
After end(), accessors will no longer provide useful + // results. + void end(); + + // Internal-only, used only force async event for distributed events + // profiling. + void _setAsync(); + + // Returns whether this RecordFunction corresponds to an async event or not. + bool isAsync() const; + + // Returns whether this RecordFunction corresponds to NCCL metadata collection + // or not. + bool isNcclMeta() const { + return is_nccl_meta_; + } + + // Internal-only, used to denote out variant used for Static Runtime execution + void _setStaticRuntimeOutVariant(); + bool isStaticRuntimeOutVariant() const; + + RecordFunctionHandle handle() const { + return handle_; + } + + c10::optional operator_name() const; + + // This method returns a copy of the FunctionSchema and can be expensive. + c10::optional operator_schema() const; + + void setHandle(RecordFunctionHandle handle) { + handle_ = handle; + } + + // Whether this RecordFunction runs any callbacks. + bool isActive() const { + return !step_callbacks_.empty(); + } + + bool needsInputs() const { + return step_callbacks_.needs_inputs_; + } + + bool needsOutputs() const { + return step_callbacks_.needs_outputs_; + } + + int64_t debugHandle() const { + return debug_handle_; + } + + void setDebugHandle(int64_t debug_handle) { + debug_handle_ = debug_handle; + } + + void invalidateInputs() { +#ifndef NDEBUG + inputs_valid_ = false; +#endif + } + + private: + void runStartCallbacks(); + + StepCallbacks step_callbacks_; + + // In cases when RecordFunction might be active but we chose not to + // use the observers (e.g. operator is not observed), this boolean + // flag is used to check whether the start callbacks were called + bool called_start_callbacks_ = false; + +#ifndef NDEBUG + bool inputs_valid_ = false; +#endif + + // Stores various ObserverContext objects with event metadata for callbacks. + ObserverContextList ctx_; + + std::variant fn_; + + int64_t sequence_nr_ = -1; + c10::ArrayRef inputs_; + std::vector outputs_; + + // For backward functions - thread id of the forward function + uint64_t fwd_thread_id_ = 0; + + // Unique id for this RecordFunction, used in callbacks to track start + // and end of ranges + RecordFunctionHandle handle_{0}; + + // Whether this record_function corresponds to an async event or not. Async + // events can complete in different threads or follow a future-like pattern + // of use. + bool is_async_{false}; + + // Debug handles are used for lazy annotation of module hierarchy + // and callstack. + // This is specifically is useful for mobile runtime, where generated + // debug handles can be lazily symbolicated using debug information + int64_t debug_handle_{-1}; + + // Whether this RecordFunction is used for an out variant run with + // Static Runtime + bool is_static_runtime_out_variant_{false}; + + // Whether this RecordFunction is used for NCCL metadata collection + bool is_nccl_meta_{false}; +}; + +TORCH_API StepCallbacks getStepCallbacks(RecordScope scope); + +TORCH_API c10::optional getStepCallbacksUnlessEmpty( + RecordScope scope); + +namespace detail { +template +void record_function_with_scope( + RecordFunction& guard, + F fn, + const Inputs& inputs, + Args&&... args) { + if (guard.needsInputs()) { + guard.before( + fn, + c10::ArrayRef(inputs.data(), inputs.size()), + std::forward(args)...); + } else { + guard.before(fn, std::forward(args)...); + } +} + +template +void record_function_with_scope_and_debug_handle( + RecordFunction& guard, + F fn, + int64_t debug_handle, + const Inputs& inputs, + Args&&... 
args) { + guard.setDebugHandle(debug_handle); + if (guard.needsInputs()) { + guard.before( + fn, + c10::ArrayRef(inputs.data(), inputs.size()), + std::forward(args)...); + } else { + guard.before(fn, std::forward(args)...); + } +} + +template +void record_function_with_scope( + RecordFunction& guard, + F fn, + c10::ArrayRef inputs, + Args&&... args) { + return record_function_with_scope< + c10::ArrayRef, + F, + Args...>(guard, std::move(fn), inputs, std::forward(args)...); +} + +template +void record_function_with_scope_and_debug_handle( + RecordFunction& guard, + F fn, + int64_t debug_handle, + c10::ArrayRef inputs, + Args&&... args) { + return record_function_with_scope_and_debug_handle< + c10::ArrayRef, + F, + Args...>( + guard, std::move(fn), debug_handle, inputs, std::forward(args)...); +} + +} // namespace detail + +// optional argument - function's seq_no +#define RECORD_FUNCTION_WITH_SCOPE(scope, fn, inputs, ...) \ + at::RecordFunction guard(scope); \ + if (guard.isActive()) { \ + ::at::detail::record_function_with_scope( \ + guard, fn, inputs, ##__VA_ARGS__); \ + } + +#define RECORD_FUNCTION_WITH_SCOPE_INPUTS_OUTPUTS( \ + scope, fn, inputs, outputs, ...) \ + at::RecordFunction guard(scope); \ + if (guard.isActive()) { \ + if (guard.needsInputs()) { \ + guard.before(fn, inputs, ##__VA_ARGS__); \ + } else { \ + guard.before(fn, ##__VA_ARGS__); \ + } \ + if (guard.needsOutputs()) { \ + guard.setOutputs(outputs); \ + } \ + } + +#define RECORD_FUNCTION(fn, inputs, ...) \ + RECORD_FUNCTION_WITH_SCOPE( \ + at::RecordScope::FUNCTION, fn, inputs, ##__VA_ARGS__) + +#define RECORD_TORCHSCRIPT_FUNCTION(mn, inputs) \ + RECORD_FUNCTION_WITH_SCOPE(at::RecordScope::TORCHSCRIPT_FUNCTION, mn, inputs) + +#define RECORD_FUNCTION_WITH_INPUTS_OUTPUTS(fn, inputs, outputs, ...) \ + RECORD_FUNCTION_WITH_SCOPE_INPUTS_OUTPUTS( \ + at::RecordScope::FUNCTION, fn, inputs, outputs, ##__VA_ARGS__) + +// Custom user scopes in C++; similar to Python's 'with record_function("..."):' +#define RECORD_USER_SCOPE(fn) \ + RECORD_FUNCTION_WITH_SCOPE( \ + at::RecordScope::USER_SCOPE, fn, c10::ArrayRef{}) + +// RECORD_USER_SCOPE with inputs +#define RECORD_USER_SCOPE_WITH_INPUTS(fn, inputs) \ + RECORD_FUNCTION_WITH_SCOPE(at::RecordScope::USER_SCOPE, fn, inputs) + +// Helper macro to pass in debug handle that is used to +// post process events +#define RECORD_WITH_SCOPE_DEBUG_HANDLE_AND_INPUTS( \ + scope, fn, debug_handle, inputs, ...) \ + at::RecordFunction guard(scope); \ + if (guard.isActive()) { \ + ::at::detail::record_function_with_scope_and_debug_handle( \ + guard, fn, debug_handle, inputs, ##__VA_ARGS__); \ + } + +// Helper macros to record LITE INTERPETER scope events with debug handles +#define RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS( \ + fn, debug_handle, inputs) \ + RECORD_WITH_SCOPE_DEBUG_HANDLE_AND_INPUTS( \ + at::RecordScope::LITE_INTERPRETER, fn, debug_handle, inputs) + +// Bookend to the RECORD_FUNCTION macros. Use this after the kernel +// launch to let the profiler bind the outputs to the op that produced +// them. 
Note that guard is declared by RECORD_FUNCTION so this macro +// needs to be called from the same scope as RECORD_FUNCTION +#define RECORD_OUTPUTS(outputs) \ + if (guard.needsOutputs()) { \ + guard.setOutputs( \ + std::vector(outputs.begin(), outputs.end())); \ + } + +/** + * addThreadLocalCallback adds a thread local callback to run with + * RecordFunction, returns handle to use with removeThreadLocalCallback + */ +TORCH_API CallbackHandle addThreadLocalCallback(RecordFunctionCallback cb); + +/** + * hasThreadLocalCallbacks returns whether there're callbacks registered + * with addThreadLocalCallback + */ +TORCH_API bool hasThreadLocalCallbacks(); + +/** + * clearThreadLocalCallbacks removes all thread local callbacks + */ +TORCH_API void clearThreadLocalCallbacks(); + +/** + * addGlobalCallback adds a global callback to run with RecordFunction: + * + * only during the program initialization + */ +TORCH_API CallbackHandle addGlobalCallback(RecordFunctionCallback cb); + +/** + * removeCallback removes a callback given the handle returned by + * addThreadLocalCallback or addGlobalCallback; + * + * no other code can run simultaneously + */ +TORCH_API void removeCallback(CallbackHandle handle); + +/** + * Prevent the given callback from executing. If handle is invalid, + * does nothing. + */ +TORCH_API void disableCallback(CallbackHandle handle); + +/** + * Allow the given callback, previously disabled with disableCallback, to + * execute again. If handle is invalid, does nothing. + */ +TORCH_API void reenableCallback(CallbackHandle handle); + +/** + * hasGlobalCallbacks returns whether there're global callbacks + * registered with pushGlobalCallback + */ +TORCH_API bool hasGlobalCallbacks(); + +/** + * clearGlobalCallbacks removes all global callbacks + */ +TORCH_API void clearGlobalCallbacks(); + +// for both thread local and global callbacks +TORCH_API bool hasCallbacks(); +TORCH_API void clearCallbacks(); + +/** + * enableRecordFunction enables RecordFunction thread locally + */ +TORCH_API void enableRecordFunction(bool enable = true); + +/** + * isRecordFunctionEnabled returns whether RecordFunction + * is enabled thread locally + */ +TORCH_API bool isRecordFunctionEnabled(); + +class TORCH_API RecordFunctionGuard { + public: + explicit RecordFunctionGuard(bool is_enabled = true) + : prev_value_(isRecordFunctionEnabled()) { + enableRecordFunction(is_enabled); + } + + virtual ~RecordFunctionGuard() { + enableRecordFunction(prev_value_); + } + + private: + bool prev_value_ = false; +}; + +class TORCH_API DisableRecordFunctionGuard : public RecordFunctionGuard { + public: + DisableRecordFunctionGuard() : RecordFunctionGuard(false) {} + ~DisableRecordFunctionGuard() override = default; +}; + +struct TORCH_API RecordFunctionTLS { + // Thread local vector of callbacks, holds pairs (callbacks, unique_id); + // must be sorted in increasing handles order + RecordFunctionCallbacks sorted_tls_callbacks_; + + bool tls_record_function_enabled_ = true; +}; + +TORCH_API const RecordFunctionTLS& get_record_function_tls_(); + +TORCH_API void set_record_function_tls_(const RecordFunctionTLS& tls); + +TORCH_API void set_record_function_seed_for_testing(uint32_t seed); + +} // namespace at diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/Allocator.h b/MLPY/Lib/site-packages/torch/include/c10/core/Allocator.h new file mode 100644 index 0000000000000000000000000000000000000000..842cae2c1234f78a9d2d2bec2b74f44109ae569b --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/Allocator.h @@ 
-0,0 +1,319 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +// A DataPtr is a unique pointer (with an attached deleter and some +// context for the deleter) to some memory, which also records what +// device is for its data. +// +// nullptr DataPtrs can still have a nontrivial device; this allows +// us to treat zero-size allocations uniformly with non-zero allocations. +// +class C10_API DataPtr { + private: + c10::detail::UniqueVoidPtr ptr_; + Device device_; + + public: + // Choice of CPU here is arbitrary; if there's an "undefined" device + // we could use that too + DataPtr() : ptr_(), device_(DeviceType::CPU) {} + DataPtr(void* data, Device device) : ptr_(data), device_(device) {} + DataPtr(void* data, void* ctx, DeleterFnPtr ctx_deleter, Device device) + : ptr_(data, ctx, ctx_deleter), device_(device) {} + void* operator->() const { + return ptr_.get(); + } + void clear() { + ptr_.clear(); + } + void* get() const { + return ptr_.get(); + } + void* mutable_get() { + return ptr_.get(); + } + void* get_context() const { + return ptr_.get_context(); + } + void* release_context() { + return ptr_.release_context(); + } + std::unique_ptr&& move_context() { + return ptr_.move_context(); + } + operator bool() const { + return static_cast(ptr_); + } + template + T* cast_context(DeleterFnPtr expected_deleter) const { + return ptr_.cast_context(expected_deleter); + } + DeleterFnPtr get_deleter() const { + return ptr_.get_deleter(); + } + /** + * Compare the deleter in a DataPtr to expected_deleter. + * If it matches, replace the deleter with new_deleter + * and return true; otherwise, does nothing and returns + * false. + * + * In general, it is not safe to unconditionally set the + * deleter on a DataPtr, because you don't know what + * the deleter is, and thus will have a hard time properly + * disposing of the deleter without storing the original + * deleter (this is difficult to do, because DeleterFnPtr + * is not a closure, and because the context on DataPtr is + * only a single word, you generally don't have enough + * space to store both the original deleter and its context). + * However, in some cases, you know /exactly/ what the deleter + * is, and you have a new deleter that manually wraps + * the old one. In this case, you can safely swap the deleter + * after asserting that the deleters line up. + * + * What are the requirements on new_deleter? It must still + * properly dispose of the void* pointer passed in as its argument, + * where void* is whatever the context of the original deleter + * is. So in general, you expect the new deleter to look something + * like this: + * + * [](void* ptr) { + * some_new_stuff(ptr); + * get_orig_allocator()->raw_deleter(ptr); + * } + * + * Note that it won't work to close over the original + * allocator; you don't have enough space to do that! Also, + * it's unsafe to assume that the passed in pointer in + * question is the memory pointer in question; it might not + * be; be sure to read the source code of the Allocator + * in question to confirm this. + */ + C10_NODISCARD bool compare_exchange_deleter( + DeleterFnPtr expected_deleter, + DeleterFnPtr new_deleter) { + return ptr_.compare_exchange_deleter(expected_deleter, new_deleter); + } + Device device() const { + return device_; + } + // Unsafely mutates the device on a DataPtr. Under normal use, + // you should never actually need to call this function. 
+ // We need this for the implementation of the hack detailed + // in Note [Masquerading as CUDA] + void unsafe_set_device(Device device) { + device_ = device; + } +}; + +// NB: Device is NOT tested for here; a CUDA nullptr is as much a nullptr as a +// CPU nullptr + +inline bool operator==(const DataPtr& dp, std::nullptr_t) noexcept { + return !dp; +} +inline bool operator==(std::nullptr_t, const DataPtr& dp) noexcept { + return !dp; +} +inline bool operator!=(const DataPtr& dp, std::nullptr_t) noexcept { + return dp; +} +inline bool operator!=(std::nullptr_t, const DataPtr& dp) noexcept { + return dp; +} + +// Note [raw_allocate/raw_deallocate and Thrust] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Thrust's support for custom allocators requires us to write something +// like this: +// +// class ThrustAllocator { +// char* allocate(size_t); +// void deallocate(char*, size_t); +// }; +// +// This is not good for our unique_ptr based allocator interface, as +// there is no way to get to the context when we free. +// +// However, in some cases the context is exactly the same as +// the data pointer. In this case, we can support the "raw" +// allocate and deallocate interface. This is what +// raw_deleter signifies. By default, it returns a nullptr, which means that +// the raw interface is not implemented. Be sure to implement it whenever +// possible, or the raw interface will incorrectly reported as unsupported, +// when it is actually possible. + +struct C10_API Allocator { + virtual ~Allocator() = default; + + virtual DataPtr allocate(size_t n) = 0; + + // Clones an allocation that came from this allocator. + // + // To perform the copy, this function calls `copy_data`, which + // must be implemented by derived classes. + // + // Note that this explicitly ignores any context that may have been + // attached to the input data. + // + // Requires: input data was allocated by the same allocator. + DataPtr clone(const void* data, std::size_t n); + + // Checks if DataPtr has a simple context, not wrapped with any out of the + // ordinary contexts. + virtual bool is_simple_data_ptr(const DataPtr& data_ptr) const; + + // If this returns a non nullptr, it means that allocate() + // is guaranteed to return a unique_ptr with this deleter attached; + // it means the rawAllocate and rawDeallocate APIs are safe to use. + // This function MUST always return the same BoundDeleter. + virtual DeleterFnPtr raw_deleter() const { + return nullptr; + } + void* raw_allocate(size_t n) { + auto dptr = allocate(n); + AT_ASSERT(dptr.get() == dptr.get_context()); + return dptr.release_context(); + } + void raw_deallocate(void* ptr) { + auto d = raw_deleter(); + AT_ASSERT(d); + d(ptr); + } + + // Copies data from one allocation to another. + // Pure virtual, so derived classes must define behavior. + // Derived class implementation can simply call `default_copy_data` + // to use `std::memcpy`. + // + // Requires: src and dest were allocated by this allocator + // Requires: src and dest both have length >= count + virtual void copy_data(void* dest, const void* src, std::size_t count) + const = 0; + + protected: + // Uses `std::memcpy` to copy data. + // Child classes can use this as `copy_data` when an alternative copy + // API is not needed. + void default_copy_data(void* dest, const void* src, std::size_t count) const; +}; + +// This context is used to generate DataPtr which have arbitrary +// std::function deleters associated with them. 
In some user facing +// functions, we give a (user-friendly) interface for constructing +// tensors from external data which take an arbitrary std::function +// deleter. Grep for InefficientStdFunctionContext to find these +// occurrences. +// +// This context is inefficient because we have to do a dynamic +// allocation InefficientStdFunctionContext, on top of the dynamic +// allocation which is implied by std::function itself. +struct C10_API InefficientStdFunctionContext { + void* ptr_; + std::function deleter_; + InefficientStdFunctionContext(void* ptr, std::function deleter) + : ptr_(ptr), deleter_(std::move(deleter)) {} + ~InefficientStdFunctionContext() { + if (deleter_) { + deleter_(ptr_); + } + } + static DataPtr makeDataPtr( + void* ptr, + std::function deleter, + Device device); +}; + +/** Set the allocator for DeviceType `t`. The passed in allocator pointer is + * expected to have static lifetime; this function does NOT take ownership + * of the raw pointer. (The reason for this is to prevent existing pointers + * to an allocator of a particular device from being invalidated when + * SetAllocator is called.) + * + * Also note that this is not thread-safe, and we assume this function will + * only be called during initialization. + * + * The 'priority' flag is introduced when we want to overwrite the default + * allocator, since the allocators are set statically. The default priority + * is 0, which means the lowest. Only higher or equal priority can overwrite + * existing ones. + */ +C10_API void SetAllocator(DeviceType t, Allocator* alloc, uint8_t priority = 0); +C10_API Allocator* GetAllocator(const DeviceType& t); + +template +struct AllocatorRegisterer { + explicit AllocatorRegisterer(Allocator* alloc) { + SetAllocator(t, alloc); + } +}; + +#define REGISTER_ALLOCATOR(t, f) \ + namespace { \ + static c10::AllocatorRegisterer g_allocator_d(f); \ + } + +// An interface for reporting thread local memory usage +// per device +struct C10_API MemoryReportingInfoBase : public c10::DebugInfoBase { + MemoryReportingInfoBase(); + ~MemoryReportingInfoBase() override = default; + + /** + * alloc_size corresponds to the size of the ptr. + * + * total_allocated corresponds to total allocated memory. + * + * total_reserved corresponds to total size of memory pool, both used and + * unused, if applicable. 
+ */ + virtual void reportMemoryUsage( + void* ptr, + int64_t alloc_size, + size_t total_allocated, + size_t total_reserved, + Device device) = 0; + + virtual void reportOutOfMemory( + int64_t alloc_size, + size_t total_allocated, + size_t total_reserved, + Device device); + + virtual bool memoryProfilingEnabled() const = 0; +}; + +C10_API bool memoryProfilingEnabled(); +C10_API void reportMemoryUsageToProfiler( + void* ptr, + int64_t alloc_size, + size_t total_allocated, + size_t total_reserved, + Device device); + +C10_API void reportOutOfMemoryToProfiler( + int64_t alloc_size, + size_t total_allocated, + size_t total_reserved, + Device device); + +// used to hold traceback information in allocators +struct GatheredContext { + virtual ~GatheredContext() = default; +}; + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/AutogradState.h b/MLPY/Lib/site-packages/torch/include/c10/core/AutogradState.h new file mode 100644 index 0000000000000000000000000000000000000000..328ca686a11c441b2d0777f6045dcbcf59708715 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/AutogradState.h @@ -0,0 +1,72 @@ +#pragma once + +#include + +namespace c10 { + +// Structure used to pack all the thread local boolean +// flags used by autograd +struct C10_API AutogradState { + static AutogradState& get_tls_state(); + static void set_tls_state(AutogradState state); + + AutogradState( + bool grad_mode, + bool inference_mode, + bool fw_grad_mode, + bool multithreading_enabled) + : grad_mode_(grad_mode), + inference_mode_(inference_mode), + fw_grad_mode_(fw_grad_mode), + multithreading_enabled_(multithreading_enabled), + view_replay_enabled_(false) {} + + void set_grad_mode(bool enabled) { + grad_mode_ = enabled; + } + + void set_fw_grad_mode(bool enabled) { + fw_grad_mode_ = enabled; + } + + void set_inference_mode(bool enabled) { + inference_mode_ = enabled; + } + + void set_multithreading_enabled(bool multithreading_enabled) { + multithreading_enabled_ = multithreading_enabled; + } + + void set_view_replay_enabled(bool view_replay_enabled) { + view_replay_enabled_ = view_replay_enabled; + } + + bool get_grad_mode() const { + return grad_mode_; + } + + bool get_fw_grad_mode() const { + return fw_grad_mode_; + } + + bool get_inference_mode() const { + return inference_mode_; + } + + bool get_multithreading_enabled() const { + return multithreading_enabled_; + } + + bool get_view_replay_enabled() const { + return view_replay_enabled_; + } + + private: + bool grad_mode_ : 1; + bool inference_mode_ : 1; + bool fw_grad_mode_ : 1; + bool multithreading_enabled_ : 1; + bool view_replay_enabled_ : 1; +}; + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/Backend.h b/MLPY/Lib/site-packages/torch/include/c10/core/Backend.h new file mode 100644 index 0000000000000000000000000000000000000000..352427d9ed99e260a450826edf2a4ceaa77cbb7c --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/Backend.h @@ -0,0 +1,388 @@ +#pragma once + +#include +#include +#include +#include + +#include + +namespace c10 { + +/** + * This legacy enum class defines the set of backends supported by old school, + * code generated Type-based ATen. A "backend" in this sense roughly + * corresponds to the cartesian product of (device type, layout), but restricted + * only to combinations which we actually have kernels for. Backend does NOT + * include dtype. 
+ * + * The reason we are sunsetting this enum class is because it doesn't allow for + * open registration; e.g., if you want to add SparseXLA, you'd have to + * edit this enum; you wouldn't be able to do it out of tree. DispatchKey is + * the replacement for Backend which supports open registration. + * + * NB: The concept of 'Backend' here disagrees with the notion of backend + * exposed to users in torch.backends. Backend here is something like "CPU" + * or "SparseCUDA"; backend in torch.backends is something like "MKL" or + * "CUDNN". + */ +enum class Backend { + CPU, + CUDA, + HIP, + VE, + FPGA, + IPU, + XPU, + SparseCPU, + SparseCUDA, + SparseCsrCPU, + SparseCsrCUDA, + SparseHIP, + SparseVE, + SparseXPU, + SparsePrivateUse1, + SparseCsrHIP, + SparseCsrVE, + SparseCsrXPU, + SparseCsrPrivateUse1, + ORT, + XLA, + Vulkan, + Metal, + Meta, + QuantizedCPU, + QuantizedCUDA, + QuantizedXPU, + QuantizedPrivateUse1, + Undefined, + MkldnnCPU, + MPS, + HPU, + Lazy, + MTIA, + PrivateUse1, + NumOptions +}; + +static inline Backend dispatchKeyToBackend(DispatchKey t) { + if (t == DispatchKey::CPU || t == DispatchKey::AutogradCPU) { + return Backend::CPU; + } else if (t == DispatchKey::CUDA || t == DispatchKey::AutogradCUDA) { + return Backend::CUDA; + } else if (t == DispatchKey::HIP) { + return Backend::HIP; + } else if (t == DispatchKey::VE) { + return Backend::VE; + } else if (t == DispatchKey::FPGA) { + return Backend::FPGA; + } else if (t == DispatchKey::ORT) { + return Backend::ORT; + } else if (t == DispatchKey::XLA || t == DispatchKey::AutogradXLA) { + return Backend::XLA; + } else if (t == DispatchKey::Lazy || t == DispatchKey::AutogradLazy) { + return Backend::Lazy; + } else if (t == DispatchKey::MPS || t == DispatchKey::AutogradMPS) { + return Backend::MPS; + } else if (t == DispatchKey::Vulkan) { + return Backend::Vulkan; + } else if (t == DispatchKey::Metal) { + return Backend::Metal; + } else if (t == DispatchKey::Meta) { + return Backend::Meta; + } else if (t == DispatchKey::SparseCPU) { + return Backend::SparseCPU; + } else if (t == DispatchKey::SparseCUDA) { + return Backend::SparseCUDA; + } else if (t == DispatchKey::SparseHIP) { + return Backend::SparseHIP; + } else if (t == DispatchKey::SparseVE) { + return Backend::SparseVE; + } else if (t == DispatchKey::SparsePrivateUse1) { + return Backend::SparsePrivateUse1; + } else if (t == DispatchKey::SparseCsrCPU) { + return Backend::SparseCsrCPU; + } else if (t == DispatchKey::SparseCsrCUDA) { + return Backend::SparseCsrCUDA; + } else if (t == DispatchKey::SparseCsrHIP) { + return Backend::SparseCsrHIP; + } else if (t == DispatchKey::SparseCsrVE) { + return Backend::SparseCsrVE; + } else if (t == DispatchKey::SparseCsrPrivateUse1) { + return Backend::SparseCsrPrivateUse1; + } else if (t == DispatchKey::MkldnnCPU) { + return Backend::MkldnnCPU; + } else if (t == DispatchKey::QuantizedCPU) { + return Backend::QuantizedCPU; + } else if (t == DispatchKey::QuantizedCUDA) { + return Backend::QuantizedCUDA; + } else if (t == DispatchKey::IPU || t == DispatchKey::AutogradIPU) { + return Backend::IPU; + } else if (t == DispatchKey::XPU || t == DispatchKey::AutogradXPU) { + return Backend::XPU; + } else if (t == DispatchKey::SparseXPU) { + return Backend::SparseXPU; + } else if (t == DispatchKey::SparseCsrXPU) { + return Backend::SparseCsrXPU; + } else if (t == DispatchKey::QuantizedXPU) { + return Backend::QuantizedXPU; + } else if (t == DispatchKey::QuantizedPrivateUse1) { + return Backend::QuantizedPrivateUse1; + } else if (t == 
DispatchKey::HPU || t == DispatchKey::AutogradHPU) { + return Backend::HPU; + } else if (t == DispatchKey::MTIA || t == DispatchKey::AutogradMTIA) { + return Backend::MTIA; + } else if ( + t == DispatchKey::PrivateUse1 || t == DispatchKey::AutogradPrivateUse1) { + return Backend::PrivateUse1; + } else if (t == DispatchKey::Undefined) { + return Backend::Undefined; + } else { + TORCH_CHECK(false, "Unrecognized tensor type ID: ", t); + } +} + +static inline DispatchKey backendToDispatchKey(Backend b) { + switch (b) { + case Backend::CPU: + return DispatchKey::CPU; + case Backend::CUDA: + return DispatchKey::CUDA; + case Backend::HIP: + return DispatchKey::HIP; + case Backend::VE: + return DispatchKey::VE; + case Backend::FPGA: + return DispatchKey::FPGA; + case Backend::ORT: + return DispatchKey::ORT; + case Backend::XLA: + return DispatchKey::XLA; + case Backend::Lazy: + return DispatchKey::Lazy; + case Backend::IPU: + return DispatchKey::IPU; + case Backend::XPU: + return DispatchKey::XPU; + case Backend::SparseXPU: + return DispatchKey::SparseXPU; + case Backend::SparseCsrXPU: + return DispatchKey::SparseCsrXPU; + case Backend::SparseCPU: + return DispatchKey::SparseCPU; + case Backend::SparseCUDA: + return DispatchKey::SparseCUDA; + case Backend::SparseHIP: + return DispatchKey::SparseHIP; + case Backend::SparseVE: + return DispatchKey::SparseVE; + case Backend::SparsePrivateUse1: + return DispatchKey::SparsePrivateUse1; + case Backend::SparseCsrCPU: + return DispatchKey::SparseCsrCPU; + case Backend::SparseCsrCUDA: + return DispatchKey::SparseCsrCUDA; + case Backend::SparseCsrHIP: + return DispatchKey::SparseCsrHIP; + case Backend::SparseCsrVE: + return DispatchKey::SparseCsrVE; + case Backend::SparseCsrPrivateUse1: + return DispatchKey::SparseCsrPrivateUse1; + case Backend::MkldnnCPU: + return DispatchKey::MkldnnCPU; + case Backend::Vulkan: + return DispatchKey::Vulkan; + case Backend::Metal: + return DispatchKey::Metal; + case Backend::Meta: + return DispatchKey::Meta; + case Backend::QuantizedCPU: + return DispatchKey::QuantizedCPU; + case Backend::QuantizedCUDA: + return DispatchKey::QuantizedCUDA; + case Backend::QuantizedPrivateUse1: + return DispatchKey::QuantizedPrivateUse1; + case Backend::Undefined: + return DispatchKey::Undefined; + case Backend::MPS: + return DispatchKey::MPS; + case Backend::HPU: + return DispatchKey::HPU; + case Backend::MTIA: + return DispatchKey::MTIA; + case Backend::PrivateUse1: + return DispatchKey::PrivateUse1; + default: + throw std::runtime_error("Unknown backend"); + } +} + +static inline DeviceType backendToDeviceType(Backend b) { + switch (b) { + case Backend::CPU: + case Backend::MkldnnCPU: + case Backend::SparseCPU: + case Backend::SparseCsrCPU: + case Backend::QuantizedCPU: + return DeviceType::CPU; + case Backend::CUDA: + case Backend::SparseCUDA: + case Backend::QuantizedCUDA: + case Backend::SparseCsrCUDA: + return DeviceType::CUDA; + case Backend::HIP: + return DeviceType::HIP; + case Backend::VE: + return DeviceType::VE; + case Backend::FPGA: + return DeviceType::FPGA; + case Backend::ORT: + return DeviceType::ORT; + case Backend::XLA: + return DeviceType::XLA; + case Backend::Lazy: + return DeviceType::Lazy; + case Backend::SparseHIP: + return DeviceType::HIP; + case Backend::SparseVE: + return DeviceType::VE; + case Backend::SparseCsrHIP: + return DeviceType::HIP; + case Backend::SparseCsrVE: + return DeviceType::VE; + case Backend::IPU: + return DeviceType::IPU; + case Backend::XPU: + case Backend::SparseXPU: + case 
Backend::SparseCsrXPU: + case Backend::QuantizedXPU: + return DeviceType::XPU; + case Backend::Vulkan: + return DeviceType::Vulkan; + case Backend::Metal: + return DeviceType::Metal; + case Backend::Meta: + return DeviceType::Meta; + case Backend::MPS: + return DeviceType::MPS; + case Backend::HPU: + return DeviceType::HPU; + case Backend::MTIA: + return DeviceType::MTIA; + case Backend::PrivateUse1: + case Backend::SparsePrivateUse1: + case Backend::SparseCsrPrivateUse1: + case Backend::QuantizedPrivateUse1: + return DeviceType::PrivateUse1; + case Backend::Undefined: + TORCH_CHECK(false, "Undefined backend is not a valid device type"); + default: + TORCH_CHECK(false, "Unknown backend"); + } +} + +// TODO: This probably shouldn't actually be static inline +static inline const char* toString(Backend b) { + switch (b) { + case Backend::CPU: + return "CPU"; + case Backend::CUDA: + return "CUDA"; + case Backend::HIP: + return "HIP"; + case Backend::VE: + return "VE"; + case Backend::FPGA: + return "FPGA"; + case Backend::XPU: + return "XPU"; + case Backend::IPU: + return "IPU"; + case Backend::ORT: + return "ORT"; + case Backend::XLA: + return "XLA"; + case Backend::Lazy: + return "Lazy"; + case Backend::MPS: + return "MPS"; + case Backend::SparseCPU: + return "SparseCPU"; + case Backend::SparseCUDA: + return "SparseCUDA"; + case Backend::SparseHIP: + return "SparseHIP"; + case Backend::SparseVE: + return "SparseVE"; + case Backend::SparseXPU: + return "SparseXPU"; + case Backend::SparsePrivateUse1: + return "SparsePrivateUse1"; + case Backend::SparseCsrCPU: + return "SparseCsrCPU"; + case Backend::SparseCsrCUDA: + return "SparseCsrCUDA"; + case Backend::SparseCsrHIP: + return "SparseCsrHIP"; + case Backend::SparseCsrVE: + return "SparseCsrVE"; + case Backend::SparseCsrXPU: + return "SparseCsrXPU"; + case Backend::SparseCsrPrivateUse1: + return "SparseCsrPrivateUse1"; + case Backend::MkldnnCPU: + return "MkldnnCPU"; + case Backend::Vulkan: + return "Vulkan"; + case Backend::Metal: + return "Metal"; + case Backend::Meta: + return "Meta"; + case Backend::QuantizedCPU: + return "QuantizedCPU"; + case Backend::QuantizedCUDA: + return "QuantizedCUDA"; + case Backend::QuantizedXPU: + return "QuantizedXPU"; + case Backend::QuantizedPrivateUse1: + return "QuantizedPrivateUse1"; + case Backend::HPU: + return "HPU"; + case Backend::MTIA: + return "MTIA"; + case Backend::PrivateUse1: + return "PrivateUseOne"; + default: + return "UNKNOWN_BACKEND"; + } +} + +static inline bool isSparse(Backend b) { + switch (b) { + case Backend::SparseXPU: + case Backend::SparseCPU: + case Backend::SparseCUDA: + case Backend::SparseHIP: + case Backend::SparseVE: + case Backend::SparsePrivateUse1: + return true; + default: + return false; + } +} + +static inline bool isSparseCsr(Backend b) { + switch (b) { + case Backend::SparseCsrXPU: + case Backend::SparseCsrCPU: + case Backend::SparseCsrCUDA: + case Backend::SparseCsrHIP: + case Backend::SparseCsrVE: + case Backend::SparseCsrPrivateUse1: + return true; + default: + return false; + } +} + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/CPUAllocator.h b/MLPY/Lib/site-packages/torch/include/c10/core/CPUAllocator.h new file mode 100644 index 0000000000000000000000000000000000000000..d84ac28ec9e99d989692e37a1c465689dc7edf9a --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/CPUAllocator.h @@ -0,0 +1,59 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +// TODO: rename to c10 
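+
+// Illustrative sketch (not part of this header): fetching the process-wide CPU
+// allocator declared below and letting the returned DataPtr free the memory
+// via RAII. The 64-byte size is arbitrary and used only for this sketch
+// (std::memset needs <cstring>).
+//
+//   at::Allocator* alloc = c10::GetCPUAllocator();
+//   c10::DataPtr buf = alloc->allocate(64);  // freed when buf goes out of scope
+//   std::memset(buf.get(), 0, 64);
+//   TORCH_INTERNAL_ASSERT(buf.device().type() == c10::DeviceType::CPU);
+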
+C10_DECLARE_bool(caffe2_report_cpu_memory_usage); + +namespace c10 { + +using MemoryDeleter = void (*)(void*); + +// A helper function that is basically doing nothing. +C10_API void NoDelete(void*); + +// A simple struct that is used to report C10's memory allocation, +// deallocation status and out-of-memory events to the profiler +class C10_API ProfiledCPUMemoryReporter { + public: + ProfiledCPUMemoryReporter() = default; + void New(void* ptr, size_t nbytes); + void OutOfMemory(size_t nbytes); + void Delete(void* ptr); + + private: + std::mutex mutex_; + std::unordered_map size_table_; + size_t allocated_ = 0; + size_t log_cnt_ = 0; +}; + +C10_API ProfiledCPUMemoryReporter& profiledCPUMemoryReporter(); + +// Get the CPU Allocator. +C10_API at::Allocator* GetCPUAllocator(); +// Sets the CPU allocator to the given allocator: the caller gives away the +// ownership of the pointer. +C10_API void SetCPUAllocator(at::Allocator* alloc, uint8_t priority = 0); + +// Get the Default CPU Allocator +C10_API at::Allocator* GetDefaultCPUAllocator(); + +// Get the Default Mobile CPU Allocator +C10_API at::Allocator* GetDefaultMobileCPUAllocator(); + +// The CPUCachingAllocator is experimental and might disappear in the future. +// The only place that uses it is in StaticRuntime. +// Set the CPU Caching Allocator +C10_API void SetCPUCachingAllocator(Allocator* alloc, uint8_t priority = 0); +// Get the CPU Caching Allocator +C10_API Allocator* GetCPUCachingAllocator(); + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/CompileTimeFunctionPointer.h b/MLPY/Lib/site-packages/torch/include/c10/core/CompileTimeFunctionPointer.h new file mode 100644 index 0000000000000000000000000000000000000000..698f191056693249c0f0a95f1e671f7f6cf67d12 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/CompileTimeFunctionPointer.h @@ -0,0 +1,57 @@ +#pragma once + +#include +#include + +namespace c10 { + +/** + * Represent a function pointer as a C++ type. + * This allows using the function pointer as a type + * in a template and calling it from inside the template + * allows the compiler to inline the call because it + * knows the function pointer at compile time. 
+ * + * Example 1: + * int add(int a, int b) {return a + b;} + * using Add = TORCH_FN_TYPE(add); + * template struct Executor { + * int execute(int a, int b) { + * return Func::func_ptr()(a, b); + * } + * }; + * Executor executor; + * EXPECT_EQ(3, executor.execute(1, 2)); + * + * Example 2: + * int add(int a, int b) {return a + b;} + * template int execute(Func, int a, int b) { + * return Func::func_ptr()(a, b); + * } + * EXPECT_EQ(3, execute(TORCH_FN(add), 1, 2)); + */ +template +struct CompileTimeFunctionPointer final { + static_assert( + guts::is_function_type::value, + "TORCH_FN can only wrap function types."); + using FuncType = FuncType_; + + static constexpr FuncType* func_ptr() { + return func_ptr_; + } +}; + +template +struct is_compile_time_function_pointer : std::false_type {}; +template +struct is_compile_time_function_pointer< + CompileTimeFunctionPointer> : std::true_type {}; + +} // namespace c10 + +#define TORCH_FN_TYPE(func) \ + ::c10::CompileTimeFunctionPointer< \ + std::remove_pointer_t>, \ + func> +#define TORCH_FN(func) TORCH_FN_TYPE(func)() diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/ConstantSymNodeImpl.h b/MLPY/Lib/site-packages/torch/include/c10/core/ConstantSymNodeImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..863a701ecf7c86c9027f5e46b1a09746d4530bf8 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/ConstantSymNodeImpl.h @@ -0,0 +1,104 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +// Unlike other SymNodeImpl, this cannot be "dispatched" conventionally, +// as it typically needs to defer to another SymNodeImpl +// +// Can either represent a bool, int (don't support float yet) this is useful +// for representing otherwise unrepresentable large negative integer constant. +template +class C10_API ConstantSymNodeImpl : public SymNodeImpl { + static_assert( + ::std::is_same_v || ::std::is_same_v, + "ConstantSymNodeImpl can only accept int64_t or bool types"); + + public: + ConstantSymNodeImpl(T val) : value_(val) {} + + bool is_int() override { + return is_int_(); + } + bool is_bool() override { + return is_bool_(); + } + bool is_float() override { + return false; + } + int64_t guard_int(const char* file, int64_t line) override { + TORCH_CHECK(is_int(), "not an int"); + return int_(); + } + bool guard_bool(const char* file, int64_t line) override { + TORCH_CHECK(is_bool(), "not a bool"); + return bool_(); + } + double guard_float(const char* file, int64_t line) override { + TORCH_CHECK(false, "not a float"); + } + int64_t int_() override { + TORCH_CHECK(is_int(), "not an int"); + return ::std::get(value_); + } + bool bool_() override { + TORCH_CHECK(is_bool(), "not a bool"); + return ::std::get(value_); + } + bool has_hint() override { + return true; + } + c10::SymNode eq(const c10::SymNode& other) override; + c10::SymNode ne(const c10::SymNode& other) override; + c10::SymNode ge(const c10::SymNode& other) override; + c10::SymNode le(const c10::SymNode& other) override; + c10::SymNode lt(const c10::SymNode& other) override; + c10::SymNode gt(const c10::SymNode& other) override; + c10::SymNode mul(const c10::SymNode& other) override; + ::std::string str() override { + if constexpr (is_int_()) { + return ::std::to_string(::std::get(value_)); + } else { + return ::std::get(value_) ? 
"true" : "false"; + } + } + c10::optional constant_int() override { + if constexpr (is_int_()) { + return ::std::get(value_); + } else { + return c10::nullopt; + } + } + c10::optional constant_bool() override { + if constexpr (is_bool_()) { + return ::std::get(value_); + } else { + return c10::nullopt; + } + } + bool is_constant() override { + return true; + } + bool is_symbolic() override { + return false; + } + + private: + ::std::variant value_; + + static constexpr bool is_int_() { + return ::std::is_same_v; + } + static constexpr bool is_bool_() { + return ::std::is_same_v; + } +}; + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/Contiguity.h b/MLPY/Lib/site-packages/torch/include/c10/core/Contiguity.h new file mode 100644 index 0000000000000000000000000000000000000000..9c28aa0d83f09ed477419e92db0ad758f46cbf42 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/Contiguity.h @@ -0,0 +1,129 @@ +#pragma once +#include +#include +#include +#include +#include + +#include +#include + +namespace c10 { + +template +bool _compute_contiguous(ArrayRef sizes, ArrayRef strides, T numel) { + bool is_contiguous = true; + if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_eq(numel, 0))) { + return is_contiguous; + } + T z = 1; + // NB: make sure we do signed arithmetic + for (int64_t d = int64_t(sizes.size()) - 1; d >= 0; d--) { + const auto& size_d = sizes[d]; + if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(size_d, 1))) { + if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_eq(strides[d], z))) { + z *= size_d; + } else { + is_contiguous = false; + break; + } + } + } + return is_contiguous; +} + +template +bool _compute_channels_last_contiguous_2d( + ArrayRef sizes, + ArrayRef strides) { + // Please don't combine these code, constant array is used here to let + // compiler fully unroll the loop to get better performance + switch (sizes.size()) { + case 4: { + T expected = 1; + for (auto& d : {1, 3, 2, 0}) { + const auto& size_d = sizes[d]; + if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(size_d, 1))) { + if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(strides[d], expected))) { + return false; + } + expected *= size_d; + } + } + return true; + } + // NOLINTNEXTLINE(bugprone-branch-clone) + case 3: + // TODO dim == 3 case will be enabled once it is fully tested + return false; + default: + return false; + } +} + +template +bool _compute_channels_last_contiguous_3d( + ArrayRef sizes, + ArrayRef strides) { + // Please don't combine these code, constant array is used here to let + // compiler fully unroll the loop to get better performance + switch (sizes.size()) { + case 5: { + T expected = 1; + for (auto& d : {1, 4, 3, 2, 0}) { + const auto& size_d = sizes[d]; + if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(size_d, 1))) { + if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(strides[d], expected))) { + return false; + } + expected *= size_d; + } + } + return true; + } + // NOLINTNEXTLINE(bugprone-branch-clone) + case 4: + // TODO dim == 4 case will be enabled once it is fully tested + return false; + default: + return false; + } +} + +template +bool _compute_non_overlapping_and_dense( + ArrayRef sizes, + ArrayRef strides) { + auto dim = sizes.size(); + if (dim == 1) { + return sizes[0] < 2 || strides[0] == 1; + } + SmallVector perm; + perm.resize(dim); + for (const auto i : c10::irange(dim)) { + perm[i] = i; + } + // Sort by strides, leaving 0 and 1 sized dims at the end of the array + std::sort(perm.begin(), perm.end(), [&](int64_t a, int64_t b) { + if (sizes[a] < 2) { + return false; + } else if (sizes[b] < 2) { + return true; + } + 
return strides[a] < strides[b]; + }); + T require_stride = 1; + for (const auto i : c10::irange(dim)) { + const auto& size_perm_i = sizes[perm[i]]; + if (size_perm_i < 2) { + return true; + } + if (strides[perm[i]] != require_stride) { + return false; + } + require_stride *= size_perm_i; + } + return true; +} + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/CopyBytes.h b/MLPY/Lib/site-packages/torch/include/c10/core/CopyBytes.h new file mode 100644 index 0000000000000000000000000000000000000000..f5b08d74aa6ba3525d2ab50f73dd217b893b4f9d --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/CopyBytes.h @@ -0,0 +1,48 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace c10 { + +using CopyBytesFunction = void (*)( + size_t nbytes, + const void* src, + Device src_device, + void* dst, + Device dst_device); + +struct C10_API _CopyBytesFunctionRegisterer { + _CopyBytesFunctionRegisterer( + DeviceType from, + DeviceType to, + CopyBytesFunction func_sync, + CopyBytesFunction func_async = nullptr); +}; + +#define REGISTER_COPY_BYTES_FUNCTION(from, to, ...) \ + namespace { \ + static _CopyBytesFunctionRegisterer C10_ANONYMOUS_VARIABLE( \ + g_copy_function)(from, to, __VA_ARGS__); \ + } + +/* + * WARNING: Implementations for this function are currently registered from + * ATen and caffe2, not yet from c10. Don't use this if not either ATen + * or caffe2 is present as well. + * We can't move them yet, because the CUDA implementations aren't unified yet + * between ATen and caffe2. + * We're planning to move the implementations into c10/backend/xxx + * to make c10 self contained again. + */ +C10_API void CopyBytes( + size_t nbytes, + const void* src, + Device src_device, + void* dst, + Device dst_device, + bool async); +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/DefaultDtype.h b/MLPY/Lib/site-packages/torch/include/c10/core/DefaultDtype.h new file mode 100644 index 0000000000000000000000000000000000000000..5c4cd53fa78dc59fc7da6b0fbe4fd2ad9f91cf77 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/DefaultDtype.h @@ -0,0 +1,15 @@ +#pragma once + +#include +#include + +namespace caffe2 { +class TypeMeta; +} // namespace caffe2 + +namespace c10 { +C10_API void set_default_dtype(caffe2::TypeMeta dtype); +C10_API const caffe2::TypeMeta get_default_dtype(); +C10_API ScalarType get_default_dtype_as_scalartype(); +C10_API const caffe2::TypeMeta get_default_complex_dtype(); +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/DefaultTensorOptions.h b/MLPY/Lib/site-packages/torch/include/c10/core/DefaultTensorOptions.h new file mode 100644 index 0000000000000000000000000000000000000000..c00197ead055805164a6f99ff8600b46784f3963 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/DefaultTensorOptions.h @@ -0,0 +1,45 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace c10 { + +struct TensorOptions; + +/// Like TensorOptions, but all fields are guaranteed to be filled. 
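+///
+/// Illustrative reading of that guarantee (the local names below exist only in
+/// this sketch; the accessors are the ones declared on the struct that
+/// follows):
+///
+///   const DefaultTensorOptions& def = getDefaultTensorOptions();
+///   Device dev = def.device();           // at::kCPU for the default object
+///   Layout layout = def.layout();        // at::kStrided for the default object
+///   bool grad = def.requires_grad();     // false for the default object
+///   // def.dtype() likewise always returns a concrete caffe2::TypeMeta.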
+struct DefaultTensorOptions { + DefaultTensorOptions() = default; + + caffe2::TypeMeta dtype() const noexcept { + return dtype_; + } + Device device() const noexcept { + return device_; + } + Layout layout() const noexcept { + return layout_; + } + bool requires_grad() const noexcept { + return requires_grad_; + } + + // Defined in TensorOptions.h + inline DefaultTensorOptions& merge(const TensorOptions& options); + + private: + caffe2::TypeMeta dtype_ = caffe2::TypeMeta::Make(); // 64-bit + Device device_ = at::kCPU; // 32-bit + Layout layout_ = at::kStrided; // 8-bit + bool requires_grad_ = false; // 8-bit +}; + +inline const DefaultTensorOptions& getDefaultTensorOptions() { + static const auto options = DefaultTensorOptions(); + return options; +} + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/Device.h b/MLPY/Lib/site-packages/torch/include/c10/core/Device.h new file mode 100644 index 0000000000000000000000000000000000000000..064b1f8c2d67d1a9d5a306fdf7db7d11d6efe83f --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/Device.h @@ -0,0 +1,216 @@ +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace c10 { + +/// An index representing a specific device; e.g., the 1 in GPU 1. +/// A DeviceIndex is not independently meaningful without knowing +/// the DeviceType it is associated; try to use Device rather than +/// DeviceIndex directly. +using DeviceIndex = int8_t; + +/// Represents a compute device on which a tensor is located. A device is +/// uniquely identified by a type, which specifies the type of machine it is +/// (e.g. CPU or CUDA GPU), and a device index or ordinal, which identifies the +/// specific compute device when there is more than one of a certain type. The +/// device index is optional, and in its defaulted state represents (abstractly) +/// "the current device". Further, there are two constraints on the value of the +/// device index, if one is explicitly stored: +/// 1. A negative index represents the current device, a non-negative index +/// represents a specific, concrete device, +/// 2. When the device type is CPU, the device index must be zero. +struct C10_API Device final { + using Type = DeviceType; + + /// Constructs a new `Device` from a `DeviceType` and an optional device + /// index. + /* implicit */ Device(DeviceType type, DeviceIndex index = -1) + : type_(type), index_(index) { + validate(); + } + + /// Constructs a `Device` from a string description, for convenience. + /// The string supplied must follow the following schema: + /// `(cpu|cuda)[:]` + /// where `cpu` or `cuda` specifies the device type, and + /// `:` optionally specifies a device index. + /* implicit */ Device(const std::string& device_string); + + /// Returns true if the type and index of this `Device` matches that of + /// `other`. + bool operator==(const Device& other) const noexcept { + return this->type_ == other.type_ && this->index_ == other.index_; + } + + /// Returns true if the type or index of this `Device` differs from that of + /// `other`. + bool operator!=(const Device& other) const noexcept { + return !(*this == other); + } + + /// Sets the device index. + void set_index(DeviceIndex index) { + index_ = index; + } + + /// Returns the type of device this is. + DeviceType type() const noexcept { + return type_; + } + + /// Returns the optional index. + DeviceIndex index() const noexcept { + return index_; + } + + /// Returns true if the device has a non-default index. 
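+  ///
+  /// For example (illustrative):
+  ///
+  ///   Device a(DeviceType::CUDA);      // index defaults to -1
+  ///   Device b(DeviceType::CUDA, 0);
+  ///   Device c("cuda:1");              // parsed from a string
+  ///   a.has_index();                   // false
+  ///   b.has_index();                   // true
+  ///   c.has_index();                   // true, and c.index() == 1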
+ bool has_index() const noexcept { + return index_ != -1; + } + + /// Return true if the device is of CUDA type. + bool is_cuda() const noexcept { + return type_ == DeviceType::CUDA; + } + + /// Return true if the device is of PrivateUse1 type. + bool is_privateuseone() const noexcept { + return type_ == DeviceType::PrivateUse1; + } + + /// Return true if the device is of MPS type. + bool is_mps() const noexcept { + return type_ == DeviceType::MPS; + } + + /// Return true if the device is of HIP type. + bool is_hip() const noexcept { + return type_ == DeviceType::HIP; + } + + /// Return true if the device is of VE type. + bool is_ve() const noexcept { + return type_ == DeviceType::VE; + } + + /// Return true if the device is of XPU type. + bool is_xpu() const noexcept { + return type_ == DeviceType::XPU; + } + + /// Return true if the device is of IPU type. + bool is_ipu() const noexcept { + return type_ == DeviceType::IPU; + } + + /// Return true if the device is of XLA type. + bool is_xla() const noexcept { + return type_ == DeviceType::XLA; + } + + /// Return true if the device is of MTIA type. + bool is_mtia() const noexcept { + return type_ == DeviceType::MTIA; + } + + /// Return true if the device is of HPU type. + bool is_hpu() const noexcept { + return type_ == DeviceType::HPU; + } + + /// Return true if the device is of Lazy type. + bool is_lazy() const noexcept { + return type_ == DeviceType::Lazy; + } + + /// Return true if the device is of Vulkan type. + bool is_vulkan() const noexcept { + return type_ == DeviceType::Vulkan; + } + + /// Return true if the device is of Metal type. + bool is_metal() const noexcept { + return type_ == DeviceType::Metal; + } + + /// Return true if the device is of ORT type. + bool is_ort() const noexcept { + return type_ == DeviceType::ORT; + } + + /// Return true if the device is of META type. + bool is_meta() const noexcept { + return type_ == DeviceType::Meta; + } + + /// Return true if the device is of CPU type. + bool is_cpu() const noexcept { + return type_ == DeviceType::CPU; + } + + /// Return true if the device supports arbitrary strides. + bool supports_as_strided() const noexcept { + return type_ != DeviceType::IPU && type_ != DeviceType::XLA && + type_ != DeviceType::Lazy && type_ != DeviceType::MTIA; + } + + /// Same string as returned from operator<<. + std::string str() const; + + private: + DeviceType type_; + DeviceIndex index_ = -1; + void validate() { + // Removing these checks in release builds noticeably improves + // performance in micro-benchmarks. + // This is safe to do, because backends that use the DeviceIndex + // have a later check when we actually try to switch to that device. + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + index_ >= -1, + "Device index must be -1 or non-negative, got ", + static_cast(index_)); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + !is_cpu() || index_ <= 0, + "CPU device index must be -1 or zero, got ", + static_cast(index_)); + } +}; + +C10_API std::ostream& operator<<(std::ostream& stream, const Device& device); + +} // namespace c10 + +namespace std { +template <> +struct hash { + size_t operator()(c10::Device d) const noexcept { + // Are you here because this static assert failed? Make sure you ensure + // that the bitmasking code below is updated accordingly! 
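+    // As a concrete, illustrative instance of the packing below: cuda:1 has
+    // type() == DeviceType::CUDA == 1 and index() == 1, so
+    // bits == (1u << 16) | 1u == 0x00010001 and the result is
+    // std::hash<uint32_t>{}(0x00010001).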
+ static_assert(sizeof(c10::DeviceType) == 1, "DeviceType is not 8-bit"); + static_assert(sizeof(c10::DeviceIndex) == 1, "DeviceIndex is not 8-bit"); + // Note [Hazard when concatenating signed integers] + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // We must first convert to a same-sized unsigned type, before promoting to + // the result type, to prevent sign extension when any of the values is -1. + // If sign extension occurs, you'll clobber all of the values in the MSB + // half of the resulting integer. + // + // Technically, by C/C++ integer promotion rules, we only need one of the + // uint32_t casts to the result type, but we put in both for explicitness's + // sake. + uint32_t bits = static_cast(static_cast(d.type())) + << 16 | + static_cast(static_cast(d.index())); + return std::hash{}(bits); + } +}; +} // namespace std diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/DeviceArray.h b/MLPY/Lib/site-packages/torch/include/c10/core/DeviceArray.h new file mode 100644 index 0000000000000000000000000000000000000000..1a98e7c47bce1153dbc9cde165973dfa580a9977 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/DeviceArray.h @@ -0,0 +1,28 @@ +#include +#include +#include +#include +#include + +namespace c10 { + +template +class DeviceArray { + public: + DeviceArray(c10::Allocator& allocator, size_t size) + : data_ptr_(allocator.allocate(size * sizeof(T))) { + static_assert(std::is_trivial::value, "T must be a trivial type"); + TORCH_INTERNAL_ASSERT( + 0 == (reinterpret_cast(data_ptr_.get()) % alignof(T)), + "c10::DeviceArray: Allocated memory is not aligned for this data type"); + } + + T* get() { + return static_cast(data_ptr_.get()); + } + + private: + c10::DataPtr data_ptr_; +}; + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/DeviceGuard.h b/MLPY/Lib/site-packages/torch/include/c10/core/DeviceGuard.h new file mode 100644 index 0000000000000000000000000000000000000000..2101f1c7b001fcd62000fb729be3693cdf87b95d --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/DeviceGuard.h @@ -0,0 +1,199 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace c10 { + +/// RAII guard that sets a certain default device in its constructor, and +/// changes it back to the device that was originally active upon destruction. +/// +/// The device is always reset to the one that was active at the time of +/// construction of the guard. Even if you `set_device` after construction, the +/// destructor will still reset the device to the one that was active at +/// construction time. +/// +/// This device guard does NOT have an uninitialized state; it is guaranteed +/// to reset a device on exit. If you are in a situation where you *might* +/// want to setup a guard (i.e., are looking for the moral equivalent +/// of optional), see OptionalDeviceGuard. +class DeviceGuard { + public: + /// No default constructor; see Note [Omitted default constructor from RAII] + explicit DeviceGuard() = delete; + + /// Set the current device to the passed Device. + explicit DeviceGuard(Device device) : guard_(device) {} + + /// This constructor is for testing only. 
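+  ///
+  /// Ordinary code does not need it; a guard is normally built from a Device
+  /// alone, e.g. (illustrative):
+  ///
+  ///   {
+  ///     DeviceGuard g(Device(DeviceType::CUDA, 1)); // switch to cuda:1
+  ///     // ... launch work on cuda:1 ...
+  ///   } // the previously current device is restored here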
+ explicit DeviceGuard( + Device device, + const impl::DeviceGuardImplInterface* impl) + : guard_(device, impl) {} + + /// Copy is disallowed + DeviceGuard(const DeviceGuard&) = delete; + DeviceGuard& operator=(const DeviceGuard&) = delete; + + /// Move is disallowed, as DeviceGuard does not have an uninitialized state, + /// which is required for moves on types with nontrivial destructors. + DeviceGuard(DeviceGuard&& other) = delete; + DeviceGuard& operator=(DeviceGuard&& other) = delete; + + /// Sets the device to the given one. The specified device must be consistent + /// with the device type originally specified during guard construction. + /// + /// TODO: The consistency check here is inconsistent with StreamGuard's + /// behavior with set_stream, where a stream on a different device than + /// the original one isn't an error; we just reset the stream and then + /// switch devices. + void reset_device(at::Device device) { + guard_.reset_device(device); + } + + /// This method is for testing only. + void reset_device( + at::Device device, + const impl::DeviceGuardImplInterface* impl) { + guard_.reset_device(device, impl); + } + + /// Sets the device index to the given one. The device type is inferred + /// from the original device type the guard was constructed with. + void set_index(DeviceIndex index) { + guard_.set_index(index); + } + + /// Returns the device that was set at the time the guard was constructed. + Device original_device() const { + return guard_.original_device(); + } + + /// Returns the most recent device that was set using this device guard, + /// either from construction, or via set_device. + Device current_device() const { + return guard_.current_device(); + } + + private: + impl::InlineDeviceGuard guard_; +}; + +/** + * A OptionalDeviceGuard is an RAII class that sets a device to some value on + * initialization, and resets the device to its original value on destruction. + * Morally, a OptionalDeviceGuard is equivalent to optional, but + * with extra constructors and methods as appropriate. + * + * Besides its obvious use (optionally applying a DeviceGuard), + * OptionalDeviceGuard is often also used for the following idiom: + * + * OptionalDeviceGuard g; + * for (const auto& t : tensors) { + * g.set_device(t.device()); + * do_something_with(t); + * } + * + * This usage is marginally more efficient than constructing a DeviceGuard every + * iteration of the for loop, as it avoids an unnecessary device reset. + * + * Unlike DeviceGuard, a OptionalDeviceGuard may be uninitialized. This occurs + * when you use the nullary constructor, or pass a nullopt to the constructor. + * Uninitialized OptionalDeviceGuards do *nothing*; they do not know what the + * original device was and they do not reset on destruction. This is why + * original_device() and current_device() return optional rather than + * Device (as they do in DeviceGuard), and also is why we didn't just + * provide OptionalDeviceGuard by default and hide DeviceGuard from users. + * + * The semantics of an OptionalDeviceGuard are exactly explained by thinking + * of it as an optional. In particular, an initialized + * OptionalDeviceGuard doesn't restore device to its value at construction; it + * restores device to its value *at initialization*. So if you have the + * program: + * + * setDevice(1); + * OptionalDeviceGuard g; + * setDevice(2); + * g.reset_device(Device(DeviceType::CUDA, 3)); // initializes! + * + * On destruction, g will reset device to 2, rather than 1. 
+ * + * An uninitialized OptionalDeviceGuard is distinct from a (initialized) + * DeviceGuard whose original_device_ and current_device_ match, since the + * DeviceGuard will still reset the device to original_device_. + */ +class OptionalDeviceGuard { + public: + /// Create an uninitialized guard. Set the guard later using reset_device. + explicit OptionalDeviceGuard() = default; + + /// Initialize the guard, setting the current device to the passed Device. + explicit OptionalDeviceGuard(Device device) : guard_(device) {} + + /// Initialize the guard if a Device is passed; otherwise leave the + /// guard uninitialized. + explicit OptionalDeviceGuard(optional device) : guard_(device) {} + + /// Constructor for testing only. + explicit OptionalDeviceGuard( + Device device, + const impl::DeviceGuardImplInterface* impl) + : guard_(device, impl) {} + + /// Copy is disallowed + OptionalDeviceGuard(const OptionalDeviceGuard&) = delete; + OptionalDeviceGuard& operator=(const OptionalDeviceGuard&) = delete; + + /// Move is disallowed + /// See Note [Explicit initialization of optional fields] + /// and // Note [Move construction for RAII guards is tricky] + /// for rationale. + OptionalDeviceGuard(OptionalDeviceGuard&& other) = delete; + OptionalDeviceGuard& operator=(OptionalDeviceGuard&& other) = delete; + + /// Sets the device to the given one. The specified device must be consistent + /// with the device type originally specified during guard construction. + void reset_device(at::Device device) { + guard_.reset_device(device); + } + + /// For testing only + void reset_device( + at::Device device, + const impl::DeviceGuardImplInterface* impl) { + guard_.reset_device(device, impl); + } + + /// Returns the device that was set at the time the guard was constructed. + optional original_device() const { + return guard_.original_device(); + } + + /// Returns the most recent device that was set using this device guard, + /// either from construction, or via reset_device. + optional current_device() const { + return guard_.current_device(); + } + + private: + impl::InlineOptionalDeviceGuard guard_{}; +}; + +// Note [Whither the DeviceGuard boilerplate] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Design note: in principle, we could avoid these wrappers using: +// +// using DeviceGuard = impl::InlineDeviceGuard; +// using OptionalDeviceGuard = +// impl::InlineOptionalDeviceGuard; +// +// But the error messages are worse, and our users can't just look at the +// header file to find out what's going on. Furthermore, for specializations +// like CUDAStreamGuard, it can be profitable to replace some interfaces with +// refined types (e.g., return CUDAStream instead of Stream). So, we eat +// the boilerplate and write out the API explicitly. + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/DeviceType.h b/MLPY/Lib/site-packages/torch/include/c10/core/DeviceType.h new file mode 100644 index 0000000000000000000000000000000000000000..64b50f3f5701b4650445c556127055ee3bd1d056 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/DeviceType.h @@ -0,0 +1,123 @@ +#pragma once + +// This is directly synchronized with caffe2/proto/caffe2.proto, but +// doesn't require me to figure out how to get Protobuf headers into +// ATen/core (which would require a lot more build system hacking.) +// If you modify me, keep me synchronized with that file. 
+ +#include + +#include +#include +#include +#include +#include + +namespace c10 { + +// These contains all device types that also have a BackendComponent +// and therefore participate in per-backend functionality dispatch keys. +// This is most backends except PrivateUse2 and PrivateUse3 +#define C10_FORALL_BACKEND_DEVICE_TYPES(_, extra) \ + _(CPU, extra) \ + _(CUDA, extra) \ + _(HIP, extra) \ + _(XLA, extra) \ + _(MPS, extra) \ + _(IPU, extra) \ + _(XPU, extra) \ + _(HPU, extra) \ + _(VE, extra) \ + _(Lazy, extra) \ + _(Meta, extra) \ + _(MTIA, extra) \ + _(PrivateUse1, extra) + +enum class DeviceType : int8_t { + CPU = 0, + CUDA = 1, // CUDA. + MKLDNN = 2, // Reserved for explicit MKLDNN + OPENGL = 3, // OpenGL + OPENCL = 4, // OpenCL + IDEEP = 5, // IDEEP. + HIP = 6, // AMD HIP + FPGA = 7, // FPGA + ORT = 8, // ONNX Runtime / Microsoft + XLA = 9, // XLA / TPU + Vulkan = 10, // Vulkan + Metal = 11, // Metal + XPU = 12, // XPU + MPS = 13, // MPS + Meta = 14, // Meta (tensors with no data) + HPU = 15, // HPU / HABANA + VE = 16, // SX-Aurora / NEC + Lazy = 17, // Lazy Tensors + IPU = 18, // Graphcore IPU + MTIA = 19, // Meta training and inference devices + PrivateUse1 = 20, // PrivateUse1 device + // NB: If you add more devices: + // - Change the implementations of DeviceTypeName and isValidDeviceType + // in DeviceType.cpp + // - Change the number below + COMPILE_TIME_MAX_DEVICE_TYPES = 21, +}; + +constexpr DeviceType kCPU = DeviceType::CPU; +constexpr DeviceType kCUDA = DeviceType::CUDA; +constexpr DeviceType kHIP = DeviceType::HIP; +constexpr DeviceType kFPGA = DeviceType::FPGA; +constexpr DeviceType kORT = DeviceType::ORT; +constexpr DeviceType kXLA = DeviceType::XLA; +constexpr DeviceType kMPS = DeviceType::MPS; +constexpr DeviceType kMeta = DeviceType::Meta; +constexpr DeviceType kVulkan = DeviceType::Vulkan; +constexpr DeviceType kMetal = DeviceType::Metal; +constexpr DeviceType kXPU = DeviceType::XPU; +constexpr DeviceType kHPU = DeviceType::HPU; +constexpr DeviceType kVE = DeviceType::VE; +constexpr DeviceType kLazy = DeviceType::Lazy; +constexpr DeviceType kIPU = DeviceType::IPU; +constexpr DeviceType kMTIA = DeviceType::MTIA; +constexpr DeviceType kPrivateUse1 = DeviceType::PrivateUse1; + +// define explicit int constant +constexpr int COMPILE_TIME_MAX_DEVICE_TYPES = + static_cast(DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES); + +static_assert( + COMPILE_TIME_MAX_DEVICE_TYPES <= 21, + "Hey! You seem to be adding a lot of new DeviceTypes. The intent was " + "for this constant to reflect the actual number of DeviceTypes we support " + "in PyTorch; it's important that this number is not too large as we " + "use this to allocate stack arrays in some places in our code. If you " + "are indeed just adding the 20th device type, feel free to change " + "the check to 32; but if you are adding some sort of extensible device " + "types registration, please be aware that you are affecting code that " + "this number is small. 
Try auditing uses of this constant."); + +C10_API std::string DeviceTypeName(DeviceType d, bool lower_case = false); + +C10_API bool isValidDeviceType(DeviceType d); + +C10_API std::ostream& operator<<(std::ostream& stream, DeviceType type); + +C10_API void register_privateuse1_backend(const std::string& backend_name); +C10_API std::string get_privateuse1_backend(bool lower_case = true); + +C10_API bool is_privateuse1_backend_registered(); + +} // namespace c10 + +namespace std { +template <> +struct hash { + std::size_t operator()(c10::DeviceType k) const { + return std::hash()(static_cast(k)); + } +}; +} // namespace std + +namespace torch { +// NOLINTNEXTLINE(misc-unused-using-decls) +using c10::DeviceType; +} // namespace torch diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/DispatchKey.h b/MLPY/Lib/site-packages/torch/include/c10/core/DispatchKey.h new file mode 100644 index 0000000000000000000000000000000000000000..1eafef5a7ca4c077a98b8f62922ae82f1228d2db --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/DispatchKey.h @@ -0,0 +1,748 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +// Semantically, each value of BackendComponent identifies a "backend" for our +// dispatch. Some functionalities that we may dispatch to are allowed to +// register different handlers for each backend. The BackendComponent is then +// used to figure out which backend implementation to dispatch to. + +// In implementation terms, the backend component identifies a specific "bit" in +// a DispatchKeySet. The bits in the DispatchKeySet are split between the bottom +// ~12 "BackendComponent" bits, while the remaining upper bits are assigned to +// functionalities. When we encounter a functionality bit that is known to be +// customizable per-backend, then we also look at the lower BackendComponent +// bits and take the highest bit to determine which backend's implementation to +// use. + +// WARNING! If you add a new backend component to the end of this list, +// make sure you register it before Meta. +// Meta must be at the end so that meta key in tls triggers meta kernels. +// (But you shouldn't: private use keys should have higher precedence than all +// built-in keys) + +// If you add a new (non-privateuse) backend here, +// make sure to add an Autograd fallthrough kernel +// in aten/src/ATen/core/VariableFallbackKernel.cpp + +#define C10_FORALL_BACKEND_COMPONENTS(_, extra) \ + _(CPU, extra) \ + _(CUDA, extra) \ + _(HIP, extra) \ + _(XLA, extra) \ + _(MPS, extra) \ + _(IPU, extra) \ + _(XPU, extra) \ + _(HPU, extra) \ + _(VE, extra) \ + _(Lazy, extra) \ + _(MTIA, extra) \ + _(PrivateUse1, extra) \ + _(PrivateUse2, extra) \ + _(PrivateUse3, extra) \ + _(Meta, extra) + +// WARNING! If we add a new per-backend functionality key that has higher +// priority than Autograd, then make sure you update EndOfRuntimeBackendKeys + +#define C10_FORALL_FUNCTIONALITY_KEYS(_) \ + _(Dense, ) \ + _(Quantized, Quantized) \ + _(Sparse, Sparse) \ + _(SparseCsr, SparseCsr) \ + _(NestedTensor, NestedTensor) \ + _(AutogradFunctionality, Autograd) + +enum class BackendComponent : uint8_t { + + // A "backend" is colloquially used to refer to handlers for dispatch + // which actually implement the numerics of an operation in question. 
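Before moving on to DispatchKey.h, a brief usage sketch for the DeviceType helpers declared above (illustrative; the printed strings are what the current implementation is expected to produce, and the std::hash specialization is what lets DeviceType serve directly as a hash-map key).

#include <c10/core/DeviceType.h>
#include <iostream>
#include <unordered_map>

int main() {
  // Human-readable names; the second argument asks for lower case.
  std::cout << c10::DeviceTypeName(c10::kCUDA) << "\n";                       // CUDA
  std::cout << c10::DeviceTypeName(c10::kCUDA, /*lower_case=*/true) << "\n";  // cuda

  // The std::hash<c10::DeviceType> specialization above makes DeviceType
  // usable as an unordered_map key without any extra plumbing.
  std::unordered_map<c10::DeviceType, int> per_device_count;
  per_device_count[c10::kCPU] += 1;
  per_device_count[c10::kCUDA] += 2;
  std::cout << per_device_count.size() << "\n";  // 2
  return 0;
}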
+ // + // Due to the nature of the enum, these backends are specified in + // an ordered way, but for most backends this order is not semantically + // meaningful (e.g., it's valid to reorder these backends without changing + // semantics). The only situation when backend ordering is meaningful + // is when the backend participates in multiple dispatch with another + // backend; e.g., CPU and CUDA (cuda must have higher priority). + + // These keys don't correspond to individual kernels. + // Instead, they represent the backends that are allowed to override specific + // pieces of functionality: + // - dense kernels (e.g. DispatchKey::CPU) + // - sparse kernels (e.g. DispatchKey::SparseCPU) + // - quantized kernels (e.g. DispatchKey::QuantizedCPU) + // - autograd kernels (e.g. DispatchKey::AutogradCPU) + // We reserve space in the runtime operator table for this full cross product + // of + // [backends in this enum] x [keys below that are explicitly marked as having + // per-backend functionality] + // + // A meta tensor is a tensor without any data associated with it. (They + // have also colloquially been referred to as tensors on the "null" device). + // A meta tensor can be used to dry run operators without actually doing any + // computation, e.g., add on two meta tensors would give you another meta + // tensor with the output shape and dtype, but wouldn't actually add anything. + + InvalidBit = 0, +#define DEFINE_BACKEND_COMPONENT(n, _) n##Bit, + C10_FORALL_BACKEND_COMPONENTS(DEFINE_BACKEND_COMPONENT, unused) +#undef DEFINE_BACKEND_COMPONENT + + // Define an alias to represent end of backend dispatch keys. + // If you add new backend keys after PrivateUse3, please also update it here. + EndOfBackendKeys = MetaBit, +}; + +// Semantically, a dispatch key identifies a possible "level" in our +// dispatch, for which a handler may be registered. Each handler corresponds +// to a type of functionality. +// +// In implementation terms, the dispatch key identifies a specific "bit" in a +// DispatchKeySet. Higher bit indexes get handled by dispatching first (because +// we "count leading zeros" when we extract the highest priority dispatch +// key.) +// +// Note [DispatchKey Classification] +// This enum actually contains several types of keys, which are explained +// in more detail further down: +// (1) non-customizable backends (e.g. FPGA) +// (2) non-customizable functionalities (e.g. Functionalize) +// (3) functionalized that are customizable per backend (e.g. Dense, Sparse, +// AutogradFunctionality) (4) per-backend instances of customizable +// functionalities (e.g. CPU, SparseCPU, AutogradCPU) (5) alias keys (e.g. +// CompositeImplicitAutograd) +// +// Of the categories above, it's important to note: +// (a) which keys are assigned individual bits in a DispatchKeySet +// (b) which keys are assigned individual slots in the runtime operator table +// ("Runtime keys") +// +// (1), (2) and (3) all get their own dedicated bits in the DispatchKeySet. +// (1), (2) and (4) all get their own dedicated slots in the runtime operator +// table. + +// See Note [DispatchKeySet Internal Representation] for more details. +// +// NOTE: Keep the list in sync with `DispatchKey` in torchgen/model.py +enum class DispatchKey : uint16_t { + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~ UNDEFINED ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // + // This is not a "real" functionality, but it exists to give us a "nullopt" + // element we can return for cases when a DispatchKeySet contains no elements. 
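The backend bits above are generated by the C10_FORALL_BACKEND_COMPONENTS X-macro together with the n##Bit token pasting. A stripped-down, standalone imitation (toy names, and a single macro parameter instead of the real (_, extra) pair) may help make the expansion concrete.

#include <cstdio>

// One list of backends, expanded once into enum bits and once into names.
#define TOY_FORALL_BACKENDS(_) _(CPU) _(CUDA) _(XLA)

#define TOY_DEFINE_BIT(n) n##Bit,
enum class ToyBackend : unsigned char {
  InvalidBit = 0,
  TOY_FORALL_BACKENDS(TOY_DEFINE_BIT)  // CPUBit = 1, CUDABit = 2, XLABit = 3
  EndOfKeys = XLABit,
};
#undef TOY_DEFINE_BIT

#define TOY_DEFINE_NAME(n) #n,
constexpr const char* kToyNames[] = {"Invalid", TOY_FORALL_BACKENDS(TOY_DEFINE_NAME)};
#undef TOY_DEFINE_NAME

int main() {
  int bit = static_cast<int>(ToyBackend::CUDABit);
  std::printf("%s -> bit %d\n", kToyNames[bit], bit);  // CUDA -> bit 2
  return 0;
}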
+ // You can think a more semantically accurate definition of DispatchKey is: + // + // using DispatchKey = optional + // + // and Undefined == nullopt. We didn't actually represent + // it this way because optional would take two + // words, when DispatchKey fits in eight bits. + + Undefined = 0, + + // Define an alias for Undefined to represent CatchAll (long term + // this will get eliminated, but for now it's convenient) + CatchAll = Undefined, + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~ Functionality Keys ~~~~~~~~~~~~~~~~~~~~~~ // + // Every value in the enum (up to EndOfFunctionalityKeys) + // corresponds to an individual "functionality" that can be dispatched to. + // This is represented in the DispatchKeySet by assigning each of these enum + // values + // to each of the remaining (64 - len(BackendComponent)) bits. + // + // Most of these functionalities have a single handler assigned to them, + // making them "runtime keys". + // That map to a single slot in the runtime operator table. + // + // A few functionalities are allowed to be customizable per backend. + // See [Note: Per-Backend Functionality Dispatch Keys] for details. + + // See [Note: Per-Backend Functionality Dispatch Keys] + Dense, + + // Below are non-extensible backends. + // These are backends that currently don't have their own overrides for + // Autograd/Sparse/Quantized kernels, + // and we therefore don't waste space in the runtime operator table allocating + // space for them. + // If any of these backends ever need to customize, e.g., Autograd, then we'll + // need to add a DispatchKey::*Bit for them. + + // TODO: put this in BackendComponents + FPGA, // Xilinx support lives out of tree at + // https://gitlab.com/pytorch-complex/vitis_kernels + + // TODO: put this in BackendComponents + // ONNX Runtime, lives out of tree at https://github.com/pytorch/ort and + // https://github.com/microsoft/onnxruntime, and is also used to test general + // backend/extension machinery in the core. cf: + // - test/cpp_extensions/ort_extension.cpp + // - test/test_torch.py + // - aten/src/ATen/test/extension_backend_test.cpp + ORT, + + Vulkan, // TODO: put this in BackendComponents + Metal, // TODO: put this in BackendComponents + + // See [Note: Per-Backend Functionality Dispatch Keys] + Quantized, + + // This backend is to support custom RNGs; it lets you go + // to a different kernel if you pass in a generator that is not a + // traditional CPUGeneratorImpl/CUDAGeneratorImpl. To make use of this + // key: + // 1) set it as a second parameter of at::Generator constructor call in + // the user-defined PRNG class. + // 2) use it as a dispatch key while registering custom kernels + // (templatized kernels specialized for user-defined PRNG class) + // intended for out of tree use; tested by aten/src/ATen/test/rng_test.cpp + CustomRNGKeyId, + + // TODO: Make Mkldnn a functionality key, so we can give it Meta + // support + // Here are backends which specify more specialized operators + // based on the layout of the tensor. Note that the sparse backends + // are one case where ordering matters: sparse multi-dispatches with + // the corresponding dense tensors, and must be handled before them. 
+ MkldnnCPU, // registered at build/aten/src/ATen/RegisterMkldnnCPU.cpp + // NB: not to be confused with MKLDNN, which is Caffe2 only + + // See [Note: Per-Backend Functionality Dispatch Keys] + Sparse, + + SparseCsr, + + NestedTensor, + + // In some situations, it is not immediately obvious what the correct + // backend for function is, because the function in question doesn't + // have any "tensor" arguments. In this case, a BackendSelect function + // can be registered to implement the custom determination of the + // correct backend. + BackendSelect, + + Python, + + // Out-of-core key for Fake Tensor in torchdistx. + // See https://pytorch.org/torchdistx/latest/fake_tensor.html + // TODO: delete this in favor of Python-implemented fake tensor + Fake, + // See Note [Out-of-tree vmap+grad prototype]. The purpose of this key + // is to insert code after the "autograd subsystem" runs, so this key should + // be directly after ADInplaceOrView and all of the autograd keys. + FuncTorchDynamicLayerBackMode, + + // Alias and mutation removal. + // If some backends want to opt into only alias removal or only mutation + // removal, + // we can consider adding separate keys dedicated to those individual passes. + // See Note [Functionalization Pass In Core] for details. + Functionalize, + + // The named dispatch key is set for any tensors with named dimensions. + // Although we have a dispatch key for named tensors, for historical reasons, + // this dispatch key doesn't do any of the substantive functionality for named + // tensor (though, hypothetically, it could!) At the moment, it's just + // responsible for letting us give good error messages when operations + // don't support named tensors. + // + // NB: If you ever consider moving named tensor functionality into + // this dispatch key, note that it might be necessary add another dispatch + // key that triggers before composite operators, in case a composite operator + // has named dimension propagation that doesn't match that of its + // constituent parts. + // TODO: delete this once torchdim lands in functorch + Named, + + // The Conjugate dispatch key is set for any tensors that need to perform + // conjugation + // This is implemented at a dispatch level right before any backends run + Conjugate, + + // The Negative dispatch key is set for any tensors that need to perform + // negation + // This is implemented at a dispatch level right before any backends run + Negative, + + ZeroTensor, // registered at build/aten/src/ATen/RegisterZeroTensor.cpp + + // Note [ADInplaceOrView key] + // ADInplaceOrView key is used by inplace or view ops to register a kernel + // that does additional setup for future autograd computation. + // + // 1. For inplace ops this kernel does version bump + // 2. For view ops this kernel does `as_view` setup where we properly setup + // DifferentiableViewMeta on the view tensors. + // + // For other ops it's fallthrough kernel since there's no extra + // work to do. + // + // Note [Dream: skip VariableType kernel when requires_grad=false] + // + // In an ideal world where we can skip VariableType kernel for inputs + // with requires_grad=false, instead of a fallthrough kernel, we'll + // register a kernel shown below to all functional ops as well: + // torch::Tensor my_functional_op(...) { + // { + // // Note for every op in VariableType, you need to go through + // // `AutoDispatchBelowADInplaceOrView` guard exactly once to add the + // // key to TLS excluded set. 
If you don't go through it at all, + // // inplace/view ops called through `at::` inside your backend + // // kernel will dispatch to ADInplaceOrView kernels and do a lot + // // of extra work. + // at::AutoDispatchBelowADInplaceOrView guard; + // at::redispatch::my_functional_op(...); + // } + // } + // But this work is currently blocked since it adds an extra dispatch + // for all ops and it's non-trivial overhead at model level(a few percents). + // Thus our current approach takes advantage of the fact every kernel go + // through VariableType kernel first and pulls the + // `at::AutoDispatchBelowADInplaceOrView` guard of functional ops + // up to the `VariableType` kernel. Thus we only add the extra dispatch + // to view/inplace ops to minimize its perf impact to real models. + ADInplaceOrView, + // Note [Alias Dispatch Key : Autograd] + // All backends are oblivious to autograd; autograd is handled as a + // layer which happens on top of all backends. It inspects the autograd + // metadata of all inputs, determines what autograd metadata should be + // constructed by the output, and otherwise defers to the backend to + // actually do the numeric computation. Autograd contains + // the bulk of this logic. + + // Autograd is now an alias dispatch key which by default maps to all + // backend-specific autograd keys. + // Backend-specific allow backends to override the default kernel registered + // to Autograd key as needed. + // For example, XLA wants to define autograd for einsum directly. + // Registering a custom autograd implementation at the XLA key won't work + // because we process Autograd before XLA. This key has higher priority and + // gets processed first. You generally should NOT redispatch after handling + // autograd here (since that would result in execution of the Autograd + // operator, which you're trying to skip). In AutogradXLA implementations, + // you are responsible for handling autograd yourself, or deferring to other + // operators which support autograd. + + // Currently we only have backend-specific autograd keys for CPU/CUDA/XLA and + // reserved user-defined backends. All other in-tree backends share the + // AutogradOther key. We can add specific autograd key for those backends + // upon request. + AutogradOther, + + // See [Note: Per-Backend Functionality Dispatch Keys] + AutogradFunctionality, + + // NestedTensor is an example of something that isn't a "real backend" + // (because it mostly consists of redispatching kernels) + // but it would like to override autograd functionality in C++. + // We can handle cases like this by adding an extra functionality key + // exclusively for handling autograd for NestedTensor. + // lives out of tree at + // https://github.com/pytorch/nestedtensor + AutogradNestedTensor, + + Tracer, + + // TODO: make Autocast a functionality key + // Autocasting precedes VariableTypeId, to ensure casts are autograd-exposed + // and inputs are saved for backward in the post-autocast type. + AutocastCPU, + AutocastXPU, + AutocastIPU, + AutocastHPU, + AutocastXLA, + // AutocastXLA is only being used for TPUs. XLA GPUs continue to use + // AutocastCUDA. + AutocastCUDA, + AutocastPrivateUse1, + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ WRAPPERS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // + // There are a number of alternative modes which may want to handle before + // autograd; for example, error checking, tracing, profiling or vmap. They + // go here. 
+ + FuncTorchBatched, // See Note [Out-of-tree vmap+grad prototype] + + // Dispatch key for BatchedTensorImpl wrapping a nested tensor. + BatchedNestedTensor, + + FuncTorchVmapMode, // See Note [Out-of-tree vmap+grad prototype] + + // This is the dispatch key for BatchedTensorImpl, which is used to implement + // batching rules for vmap. + Batched, + + // When we are inside a vmap, all tensors dispatch on this key. + // See Note: [DispatchKey::VmapMode usage] for more details. + VmapMode, + + FuncTorchGradWrapper, // See Note [Out-of-tree vmap+grad prototype] + + // Out-of-core key for Deferred Module Initialization in torchdistx. + // See https://pytorch.org/torchdistx/latest/deferred_init.html + DeferredInit, + + // Used by Python key logic to know the set of tls on entry to the dispatcher + // This kernel assumes it is the top-most non-functorch-related DispatchKey. + // If you add a key above, make sure to update the fallback implementation for + // this. + PythonTLSSnapshot, + + // This key should be at the very top of the dispatcher + FuncTorchDynamicLayerFrontMode, // See Note [Out-of-tree vmap+grad prototype] + + // TESTING: This is intended to be a generic testing tensor type id. + // Don't use it for anything real; its only acceptable use is within a single + // process test. Use it by creating a TensorImpl with this DispatchKey, and + // then registering operators to operate on this type id. See + // aten/src/ATen/core/dispatch/backend_fallback_test.cpp for a usage example. + TESTING_ONLY_GenericWrapper, + + // TESTING: This is intended to be a generic testing tensor type id. + // Don't use it for anything real; its only acceptable use is within a ingle + // process test. Use it by toggling the mode on and off via + // TESTING_ONLY_tls_generic_mode_set_enabled and then registering operators + // to operate on this type id. See + // aten/src/ATen/core/dispatch/backend_fallback_test.cpp + // for a usage example + TESTING_ONLY_GenericMode, + + // This key is used for pre-dispatch tracing in make_fx. + // It has lower priority than the PythonDispatcher key + // because we use the PythonDispatcher to intercept the key from python, + // and avoid having to implement it in C++. + PreDispatch, + + // This is a bypass that allows you to skip running the C++ dispatcher + // entirely + PythonDispatcher, + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FIN ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // + EndOfFunctionalityKeys, // End of functionality keys. + +// ~~~~~~~~~~~~~~ "Dense" Per-Backend Dispatch keys ~~~~~~~~~~~~~~~~~~~~ // +// Here are backends which you think of as traditionally specifying +// how to implement operations on some device. + +#define DEFINE_PER_BACKEND_KEYS_FOR_BACKEND(n, prefix) prefix##n, + +#define DEFINE_PER_BACKEND_KEYS(fullname, prefix) \ + StartOf##fullname##Backends, \ + C10_FORALL_BACKEND_COMPONENTS( \ + DEFINE_PER_BACKEND_KEYS_FOR_BACKEND, prefix) \ + EndOf##fullname##Backends = prefix##Meta, + + C10_FORALL_FUNCTIONALITY_KEYS(DEFINE_PER_BACKEND_KEYS) + +#undef DEFINE_PER_BACKEND_KEYS +#undef DEFINE_PER_BACKEND_KEYS_FOR_BACKEND + + EndOfRuntimeBackendKeys = EndOfAutogradFunctionalityBackends, + + // ~~~~~~~~~~~~~~~~~~~~~~ Alias Dispatch Keys ~~~~~~~~~~~~~~~~~~~~~~~~~~ // + // Note [Alias Dispatch Keys] + // Alias dispatch keys are synthetic dispatch keys which map to multiple + // runtime dispatch keys. Alisa keys have precedence, but they are always + // lower precedence than runtime keys. 
You can register a kernel to an + // alias key, the kernel might be populated to the mapped runtime keys + // during dispatch table computation. + // If a runtime dispatch key has multiple kernels from alias keys, which + // kernel wins is done based on the precedence of alias keys (but runtime + // keys always have precedence over alias keys). + // Alias keys won't be directly called during runtime. + + // See Note [Alias Dispatch Key : Autograd] + Autograd, + CompositeImplicitAutograd, // registered at + // build/aten/src/ATen/RegisterCompositeImplicitAutograd.cpp + + // Note: The alias keyset for FuncTorchBatchedDecomposition is disjoint from + // all + // other alias keysets + // and so precedence order doesn't matter + FuncTorchBatchedDecomposition, // registered at + // build/aten/src/ATen/RegisterFuncTorchBatchedDecomposition.cpp + // Note: The alias keyset for CompositeImplicitAutogradNestedTensor is + // disjoint from all other alias keysets + CompositeImplicitAutogradNestedTensor, // registered at + // build/aten/src/ATen/RegisterCompositeImplicitAutogradNestedTensor.cpp + CompositeExplicitAutograd, // registered at + // build/aten/src/ATen/RegisterCompositeExplicitAutograd.cpp + // See Note [CompositeExplicitAutogradNonFunctional Key] + CompositeExplicitAutogradNonFunctional, // registered at + // build/aten/src/ATen/RegisterCompositeExplicitAutograd.cpp + + // Define an alias key to represent end of alias dispatch keys. + // If you add new alias keys after Autograd, please also update it here. + StartOfAliasKeys = Autograd, + EndOfAliasKeys = CompositeExplicitAutogradNonFunctional, // + + // ~~~~~~~~~~~~~~~~~~~~~~~~~ BC ALIASES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // + // The aliases exist for backwards compatibility reasons, they shouldn't + // be used + CPUTensorId = CPU, + CUDATensorId = CUDA, + DefaultBackend = CompositeExplicitAutograd, + PrivateUse1_PreAutograd = AutogradPrivateUse1, + PrivateUse2_PreAutograd = AutogradPrivateUse2, + PrivateUse3_PreAutograd = AutogradPrivateUse3, + Autocast = AutocastCUDA, +}; + +// Note [Private use DispatchKey] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Private use tensor IDs are preallocated tensor type IDs for use in user +// applications. Similar to private use fields in HTTP, they can be used +// by end users for experimental or private applications, without needing +// to "standardize" the tensor ID (which would be done by submitting a PR +// to PyTorch to add your type ID). +// +// Private use tensor IDs are appropriate to use if you want to experiment +// with adding a new tensor type (without having to patch PyTorch first) or +// have a private, non-distributed application that needs to make use of a +// new tensor type. Private use tensor IDs are NOT appropriate to use for +// libraries intended to be distributed to further users: please contact +// the PyTorch developers to get a type ID registered in this case. +// +// We provide two classes of private user tensor id: regular DispatchKeys +// and Autograd DispatchKeys. DispatchKeys serve the role of ordinary "backend" +// DispatchKeys; if you were adding support for a new type of accelerator, you +// would use a backend DispatchKey, and ideally automatically reuse +// AutogradOther definitions already defined in PyTorch. AutogradPrivateUse +// DispatchKeys serve as "wrapper" DispatchKeys: they are only necessary for +// tensors that compose multiple internal tensors, and for cases when the +// built-in autograd formulas for operators are not appropriate. 
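As a sketch of how a private-use key is typically consumed (illustrative only; my_add is a hypothetical placeholder, not part of this patch), an out-of-tree backend registers kernels against PrivateUse1 and can optionally rename the backend via the register_privateuse1_backend() hook declared in DeviceType.h above.

#include <ATen/ATen.h>
#include <torch/library.h>

// Hypothetical stand-in for a real backend kernel.
at::Tensor my_add(
    const at::Tensor& self,
    const at::Tensor& other,
    const at::Scalar& alpha) {
  // A real backend would run its own device computation here.
  return at::empty_like(self);
}

TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
  m.impl("add.Tensor", my_add);
}

// Optionally, somewhere at startup:
//   c10::register_privateuse1_backend("my_device");  // user-facing name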
+ +static_assert( + (static_cast(BackendComponent::EndOfBackendKeys) + + static_cast(DispatchKey::EndOfFunctionalityKeys)) <= 64, + "The BackendComponent and DispatchKey enums (below EndOfFunctionalityKeys)" + " both map to backend and functionality bits" + " into a 64-bit bitmask; you must have less than 64 total entries between them"); + +// Check if a DispatchKey is an alias mapping to other runtime keys. +constexpr bool isAliasDispatchKey(DispatchKey k) { + return k >= DispatchKey::StartOfAliasKeys && k <= DispatchKey::EndOfAliasKeys; +} + +// [Note: Per-Backend Functionality Dispatch Keys] +// Check if a DispatchKey is a per-backend functionality key +// Any functionalities that can be customized per-backend should be added here. +// These keys correspond to functionalities that can be customized individually +// per backend. While they only take up one bit in the `DispatchKeySet` bitset, +// they map to (# backends) slots in the operator table. +// Each of these keys also has a separate set of "runtime keys" in the dispatch +// key enum, per backend, which *do* map to the individual operator table slots. +// For example, the "Sparse" key maps to an individual bit in the +// DispatchKeySet, while `SparseCPU`, `SparseCUDA`, etc all map to individual +// slots in the runtime operator table. + +constexpr bool isPerBackendFunctionalityKey(DispatchKey k) { + if (k == DispatchKey::Dense || k == DispatchKey::Quantized || + k == DispatchKey::Sparse || k == DispatchKey::SparseCsr || + k == DispatchKey::AutogradFunctionality || + k == DispatchKey::NestedTensor) { + return true; + } else { + return false; + } +} + +// Note that this includes Undefined in the total count. +// BUT EndOfFunctionalityKeys is its own (placeholder) key. +// e.g. Undefined=0, Dense=1, Sparse=2, EndOfFunctionalityKeys=3. +// In the above example, there are 3 total functionality keys. +constexpr uint8_t num_functionality_keys = + static_cast(DispatchKey::EndOfFunctionalityKeys); + +constexpr uint8_t num_backends = + static_cast(BackendComponent::EndOfBackendKeys); + +// Note [No More Than 16 Backends] +// Search for this note to find places in the code where the "no more than 16 +// backends" invariant is baked in. +static_assert( + static_cast(BackendComponent::EndOfBackendKeys) <= 16, + "BackendComponent currently only supports <= 16 backends. If we really need to extend this, \ +there are a few places where this invariant is baked in"); + +constexpr uint8_t numPerBackendFunctionalityKeys() { + uint8_t count = 0; + for (uint8_t k = 0; k <= num_functionality_keys; ++k) { + if (isPerBackendFunctionalityKey(static_cast(k))) + ++count; + } + return count; +} + +#if defined(C10_MOBILE_TRIM_DISPATCH_KEYS) +// See [Note: Trimmed Mobile Dispatch Keys] +constexpr uint16_t num_runtime_entries = 8; +#else +constexpr uint16_t num_runtime_entries = num_functionality_keys + + (numPerBackendFunctionalityKeys() * (num_backends - 1)); +#endif + +// See Note [No More Than 16 Backends] +constexpr uint16_t full_backend_mask = + (static_cast(1) << num_backends) - 1; + +C10_API const char* toString(DispatchKey); +C10_API const char* toString(BackendComponent); +C10_API std::ostream& operator<<(std::ostream&, DispatchKey); +C10_API std::ostream& operator<<(std::ostream&, BackendComponent); + +C10_API DispatchKey getAutogradKeyFromBackend(BackendComponent k); + +// Parses a string into a dispatch key. +// If the string cannot be correctly parsed, throws an exception. 
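The sizing arithmetic above can be sanity-checked at compile time. A sketch follows; the literal counts 15 and 6 are simply what the backend and per-backend-functionality lists in this snapshot imply, and the third check assumes a non-mobile build.

#include <c10/core/DispatchKey.h>

// 15 BackendComponents are listed in C10_FORALL_BACKEND_COMPONENTS above.
static_assert(c10::num_backends == 15, "backend count drifted");

// Dense, Quantized, Sparse, SparseCsr, NestedTensor, AutogradFunctionality.
static_assert(
    c10::numPerBackendFunctionalityKeys() == 6,
    "per-backend functionality count drifted");

// Each per-backend functionality contributes (num_backends - 1) extra
// runtime slots on top of its own functionality entry.
static_assert(
    c10::num_runtime_entries ==
        c10::num_functionality_keys +
            c10::numPerBackendFunctionalityKeys() * (c10::num_backends - 1),
    "runtime table sizing no longer matches the formula above");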
+C10_API c10::DispatchKey parseDispatchKey(const std::string& k); + +// These are some convenience identifiers for dispatch keys which are +// shorter to type than their long counterparts. Note that some of these +// dispatch keys directly correspond to DeviceType; and most APIs that +// accept DispatchKey also accept DeviceType; e.g., +// torch::dispatch(torch::kCPU, ...) is also valid. +constexpr DispatchKey kAutograd = DispatchKey::Autograd; + +// See Note [The Ordering of Per-Backend Dispatch Keys Matters!] +// This function relies on the invariant that the dispatch keys between +// StartOfDenseBackends and EndOfRuntimeBackendKeys are ordered by backend +// in the same order as `BackendComponent`. +constexpr BackendComponent toBackendComponent(DispatchKey k) { + if (k >= DispatchKey::StartOfDenseBackends && + k <= DispatchKey::EndOfDenseBackends) { + return static_cast( + static_cast(k) - + static_cast(DispatchKey::StartOfDenseBackends)); + } else if ( + k >= DispatchKey::StartOfQuantizedBackends && + k <= DispatchKey::EndOfQuantizedBackends) { + return static_cast( + static_cast(k) - + static_cast(DispatchKey::StartOfQuantizedBackends)); + } else if ( + k >= DispatchKey::StartOfSparseBackends && + k <= DispatchKey::EndOfSparseBackends) { + return static_cast( + static_cast(k) - + static_cast(DispatchKey::StartOfSparseBackends)); + } else if ( + k >= DispatchKey::StartOfSparseCsrBackends && + k <= DispatchKey::EndOfSparseCsrBackends) { + return static_cast( + static_cast(k) - + static_cast(DispatchKey::StartOfSparseCsrBackends)); + } else if ( + k >= DispatchKey::StartOfNestedTensorBackends && + k <= DispatchKey::EndOfNestedTensorBackends) { + return static_cast( + static_cast(k) - + static_cast(DispatchKey::StartOfNestedTensorBackends)); + } else if ( + k >= DispatchKey::StartOfAutogradFunctionalityBackends && + k <= DispatchKey::EndOfAutogradFunctionalityBackends) { + return static_cast( + static_cast(k) - + static_cast( + DispatchKey::StartOfAutogradFunctionalityBackends)); + } else { + return BackendComponent::InvalidBit; + } +} + +constexpr DispatchKey toFunctionalityKey(DispatchKey k) { + if (k <= DispatchKey::EndOfFunctionalityKeys) { + return k; + } else if (k <= DispatchKey::EndOfDenseBackends) { + return DispatchKey::Dense; + } else if (k <= DispatchKey::EndOfQuantizedBackends) { + return DispatchKey::Quantized; + } else if (k <= DispatchKey::EndOfSparseBackends) { + return DispatchKey::Sparse; + } else if (k <= DispatchKey::EndOfSparseCsrBackends) { + return DispatchKey::SparseCsr; + } else if (k <= DispatchKey::EndOfNestedTensorBackends) { + return DispatchKey::NestedTensor; + } else if (k <= DispatchKey::EndOfAutogradFunctionalityBackends) { + return DispatchKey::AutogradFunctionality; + } else { + return DispatchKey::Undefined; + } +} + +BackendComponent toBackendComponent(DeviceType device_type); + +// Given (DispatchKey::Dense, BackendComponent::CUDABit), returns +// DispatchKey::CUDA. +// See Note [The Ordering of Per-Backend Dispatch Keys Matters!] +// This function relies on the invariant that the dispatch keys between +// StartOfDenseBackends and EndOfRuntimeBackendKeys are ordered by backend +// in the same order as `BackendComponent`. 
+constexpr DispatchKey toRuntimePerBackendFunctionalityKey( + DispatchKey functionality_k, + BackendComponent backend_k) { + if (functionality_k == DispatchKey::Dense) { + return static_cast( + static_cast(DispatchKey::StartOfDenseBackends) + + static_cast(backend_k)); + } + if (functionality_k == DispatchKey::Sparse) { + return static_cast( + static_cast(DispatchKey::StartOfSparseBackends) + + static_cast(backend_k)); + } + if (functionality_k == DispatchKey::SparseCsr) { + return static_cast( + static_cast(DispatchKey::StartOfSparseCsrBackends) + + static_cast(backend_k)); + } + if (functionality_k == DispatchKey::Quantized) { + return static_cast( + static_cast(DispatchKey::StartOfQuantizedBackends) + + static_cast(backend_k)); + } + if (functionality_k == DispatchKey::NestedTensor) { + return static_cast( + static_cast(DispatchKey::StartOfNestedTensorBackends) + + static_cast(backend_k)); + } + if (functionality_k == DispatchKey::AutogradFunctionality) { + return static_cast( + static_cast( + DispatchKey::StartOfAutogradFunctionalityBackends) + + static_cast(backend_k)); + } + return DispatchKey::Undefined; +} + +} // namespace c10 + +namespace torch { +// Expose the constant, but not the TYPE (DispatchKey is an implementation +// detail!) +// NOLINTNEXTLINE(misc-unused-using-decls) +using c10::kAutograd; +} // namespace torch + +// NB: You really shouldn't use this instance; this enum is guaranteed +// to be pretty small so a regular array should be acceptable. +namespace std { +template <> +struct hash { + typedef size_t result_type; + typedef c10::DispatchKey argument_type; + + size_t operator()(c10::DispatchKey x) const { + return static_cast(x); + } +}; +} // namespace std diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/DispatchKeySet.h b/MLPY/Lib/site-packages/torch/include/c10/core/DispatchKeySet.h new file mode 100644 index 0000000000000000000000000000000000000000..45fff3879055eae7e504210a1b47e0c4cb1d743b --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/DispatchKeySet.h @@ -0,0 +1,941 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +struct FunctionalityOffsetAndMask { + // empty constructor shouldn't be used; only needed to initialize + // the array before populating it. + FunctionalityOffsetAndMask() = default; + FunctionalityOffsetAndMask(uint16_t offset, uint16_t mask) + : offset(offset), mask(mask) {} + // This needs to big enough to cover the size of the operator table. + uint16_t offset{}; + // See Note [No More Than 16 Backends] + // This mask needs to be big enough to mask all of the backend bits. + // We probably don't ever want to have more than 16 backend bits, so uint16_t + // should be enough. + uint16_t mask{}; +}; +static_assert( + c10::num_runtime_entries < 65536, + "The dispatcher currently only supports up to 2^16 runtime entries"); + +C10_API std::array +initializeFunctionalityOffsetsAndMasks(); + +C10_ALWAYS_INLINE static const std:: + array& + offsetsAndMasks() { + static auto offsets_and_masks_ = initializeFunctionalityOffsetsAndMasks(); + return offsets_and_masks_; +} + +// A representation of a set of DispatchKeys. A DispatchKeySet contains both +// "functionality" bits and "backend bits", and every tensor holds its own +// DispatchKeySet. 
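The (de)composition helpers defined just above are all constexpr, so their round-trip behaviour can be checked at compile time; a small sketch:

#include <c10/core/DispatchKey.h>

// A runtime key splits into a backend "building block" and a functionality...
static_assert(
    c10::toBackendComponent(c10::DispatchKey::SparseCUDA) ==
        c10::BackendComponent::CUDABit,
    "SparseCUDA carries the CUDA backend bit");
static_assert(
    c10::toFunctionalityKey(c10::DispatchKey::SparseCUDA) ==
        c10::DispatchKey::Sparse,
    "SparseCUDA is an instance of the Sparse functionality");

// ...and the two halves recombine into the original runtime key.
static_assert(
    c10::toRuntimePerBackendFunctionalityKey(
        c10::DispatchKey::Sparse, c10::BackendComponent::CUDABit) ==
        c10::DispatchKey::SparseCUDA,
    "functionality + backend recompose to the runtime key");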
The Dispatcher implements multiple dispatch by grabbing the +// keyset on every input tensor, or’ing them together, and dispatching to a +// specific piece of functionality. The functionality bits are *ordered*. When +// multiple functionality bits are set, we use the highest priority +// functionality. Similarly, multiple backend bits can theoretically be set if +// you call an operator with multiple tensors from difference devices (e.g. CPU +// and CUDA), although support for mixed device dispatch is limited (the only +// kernels that gracefully handle mixed device inputs for now are cuda kernels +// that take in a scalar cpu tensor). + +// A representation of a set of DispatchKeys. A tensor may have multiple +// tensor type ids, e.g., a Variable tensor can also be a CPU tensor; the +// DispatchKeySet specifies what type ids apply. The internal representation is +// as a 64-bit bit set (this means only 64 tensor type ids are supported). +// +// As mentioned above, DispatchKeys are ordered; thus, we can ask questions like +// "what is the highest priority DispatchKey in the set"? (The set itself is +// not ordered; two sets with the same ids will always have the ids ordered in +// the same way.) +// +// Note [DispatchKeySet Internal Representation] +// Internally, dispatch keys are packed into 64-bit DispatchKeySet objects +// that get passed around at runtime. +// However, there isn't necessarily a 1-to-1 mapping between bits in the keyset +// and individual dispatch keys. +// +// First: why do we have this distinction, and why not map every dispatch key +// directly to a bit? This is mostly because we have several types of +// functionalities that different backends would like to customize. For example, +// we have: +// - "Dense": CPU, CUDA, XLA, ... (~12 keys) +// - "Sparse": SparseCPU, SparseCUDA, ... +// - "SparseCsr": SparseCsrCPU, SparseCsrCUDA, ... +// - "Quantized": QuantizedCPU, QuantizedCUDA, QuantizedXLA, ... +// - "Autograd": AutogradCPU, AutogradCUDA, Autograd XLA, ... +// The problem is that total number of keys grows quadratically with [# +// backends] x [# functionalities], making it very difficult to map each key +// directly to a bit in a bitset without dramatically increasing the size of the +// bitset over time. +// +// The two enums (BackendComponent and DispatchKey) can be divided roughly into +// 5 categories. +// +// (1) "Building block" keys +// (a) backends: Everything in the BackendComponent enum (e.g. CPUBit, +// CUDABit) (b) functionalities: (per-backend) functionality-bit DispatchKeys +// (e.g. AutogradFunctionality, SparseCsr, Sparse, Dense) +// (2) "Runtime" keys +// (a) "non-customizable backends" (e.g. FPGA) +// (b) "non-customizable functionalities" (e.g. Functionalize) +// (c) "per-backend instances of customizable functionalities" (e.g. CPU, +// SparseCPU, AutogradCPU) +// (3) "Alias" DispatchKeys (see Note [Alias Dispatch Keys]) +// +// (1) Building block keys always correspond to individual bits in a +// DispatchKeySet. They can also be combined in a DispatchKeySet to form actual +// runtime keys. e.g. +// auto dense_cpu_ks = DispatchKeySet({DispatchKey::CPUBit, +// DispatchKey::Dense}); +// // The keyset has the runtime dense-cpu key. +// dense_cpu_ks.has(DispatchKey::CPU); +// // And it contains the building block keys too. +// dense_cpu_ks.has(DispatchKey::CPUBit); +// dense_cpu_ks.has(DispatchKey::Dense); +// +// Not every backend and not every functionality counts as a "building block +// key". 
This is mostly to give us more levers to pull in the design space. +// Backend keys and functionality keys that count as "building blocks" will +// contribute to a full cross product of functionality that can be overriden. +// +// For example, right now we have at least 12 "backend" building +// blocks (CPU, CUDA, XLA, ...) and at least 5 "functionality" +// building blocks (Dense, Sparse, SparseCsr, Quantized, +// AutogradFunctionality, ...). These keys together allow every +// dispatcher operator to be customized in up to 12*4 different +// ways. Each of those requires a slot in the operator table of every +// dispatcher operator. Not every piece of functionality necessarily +// needs to be customizable per-backend, and not every backend +// necessarily needs to be able to customize every type of +// functionality. +// +// +// (2) Every runtime key corresponds directly to a slot in an operator's runtime +// dispatch table, and you can directly register kernels to a runtime dispatch +// key. +// +// For per-backend functionalities like "Dense" or "AutogradFunctionality", +// you can think of the corresponding runtime dispatch keys as "instances" of +// that functionality, per backend. E.g. "CPU", "CUDA", "XLA", etc. are all +// runtime instances of the "Dense" building block key. + +// (2a) and (2b) are represented identically in the DispatchKeySet logic: +// - backend-agnostic functionalities (e.g. FuncTorchBatched) are NOT +// customizable per backend. +// In order to do so, we'd need to promote it to a per-backend functionality +// "building block" key. +// - non-customizable backends (e.g. FPGA) can NOT customize existing +// functionality like Sparse, Autograd, etc. +// In order to do so, we'd need to promote it to a backend "building block" +// key. +// +// In both cases, these keys directly correspond to runtime slots in the +// operator table. +// +// +// (3) "Alias" keys +// See Note [Alias Dispatch Keys] +// +// Final note: for anyone making future changes to the Dispatcher + +// DispatchKeySet internals, there's a closed PR with a basic +// python-implementation of the Dispatcher that might be useful in quickly +// testing out and validating changes. See it at +// https://github.com/pytorch/pytorch/pull/68743 + +// An undefined tensor is one with an empty tensor type set. +class DispatchKeySet final { + public: + enum Full { FULL }; + enum FullAfter { FULL_AFTER }; + enum Raw { RAW }; + + // NB: default constructor representation as zero is MANDATORY as + // use of DispatchKeySet in TLS requires this. + constexpr DispatchKeySet() = default; + + constexpr DispatchKeySet(Full) + : repr_((1ULL << (num_backends + num_functionality_keys - 1)) - 1) {} + + constexpr DispatchKeySet(FullAfter, DispatchKey t) + // LSB after t are OK, but not t itself. + // "functionalities" have a notion of ordering (e.g. Autograd > Sparse > + // Quantized > Dense). But backends don't really have an ordering. + // Therefore, we're enforcing that FullAfter can only be used on + // "functionality" keys. + : repr_( + (1ULL + << (num_backends + static_cast(toFunctionalityKey(t)) - + 1)) - + 1) { + *this = add(DispatchKey::PythonDispatcher); + } + + // Public version of DispatchKeySet(uint64_t) API; external users + // must be explicit when they do this! 
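A runnable variant of the dense_cpu_ks example from the note above, written against the public constructors (illustrative only):

#include <c10/core/DispatchKeySet.h>

void building_block_sketch() {
  using namespace c10;
  // Combine the Dense functionality bit with the CPU backend bit; the result
  // is a keyset that contains the *runtime* key DispatchKey::CPU.
  DispatchKeySet dense_cpu_ks = DispatchKeySet(DispatchKey::Dense) |
                                DispatchKeySet(BackendComponent::CPUBit);
  bool has_runtime_cpu = dense_cpu_ks.has(DispatchKey::CPU);              // true
  bool has_cpu_bit = dense_cpu_ks.has_backend(BackendComponent::CPUBit);  // true
  (void)has_runtime_cpu;
  (void)has_cpu_bit;
}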
+ constexpr DispatchKeySet(Raw, uint64_t x) : repr_(x) {} + + constexpr explicit DispatchKeySet(BackendComponent k) { + if (k == BackendComponent::InvalidBit) { + repr_ = 0; + } else { + repr_ = 1ULL << (static_cast(k) - 1); + } + } + + constexpr explicit DispatchKeySet(DispatchKey k) { + // NOLINTNEXTLINE(bugprone-branch-clone) + if (k == DispatchKey::Undefined) { + // Case 1: handle Undefined specifically + repr_ = 0; + } else if (k <= DispatchKey::EndOfFunctionalityKeys) { + // Case 2: handle "functionality-only" keys + // These keys have a functionality bit set, but no backend bits + // These can technically be either: + // - valid runtime keys (e.g. DispatchKey::AutogradOther, + // DispatchKey::FuncTorchBatched, etc) + // - "building block" keys that aren't actual runtime keys (e.g. + // DispatchKey::Dense or Sparse) + uint64_t functionality_val = 1ULL + << (num_backends + static_cast(k) - 1); + repr_ = functionality_val; + } else if (k <= DispatchKey::EndOfRuntimeBackendKeys) { + // Case 3: "runtime" keys that have a functionality bit AND a backend bit. + // First compute which bit to flip for the functionality. + auto functionality_k = toFunctionalityKey(k); + // The - 1 is because Undefined is technically a "functionality" that + // doesn't show up in the bitset. So e.g. Dense is technically the second + // functionality, but the lowest functionality bit. + uint64_t functionality_val = 1ULL + << (num_backends + static_cast(functionality_k) - 1); + + // then compute which bit to flip for the backend + // Case 4a: handle the runtime instances of "per-backend functionality" + // keys For example, given DispatchKey::CPU, we should set: + // - the Dense functionality bit + // - the CPUBit backend bit + // first compute which bit to flip for the backend + auto backend_k = toBackendComponent(k); + uint64_t backend_val = backend_k == BackendComponent::InvalidBit + ? 0 + : 1ULL << (static_cast(backend_k) - 1); + repr_ = functionality_val + backend_val; + } else { + // At this point, we should have covered every case except for alias keys. + // Technically it would be possible to add alias dispatch keys to a + // DispatchKeySet, but the semantics are a little confusing and this + // currently isn't needed anywhere. + repr_ = 0; + } + } + + constexpr uint64_t keys_to_repr(std::initializer_list ks) { + uint64_t repr = 0; + for (auto k : ks) { + repr |= DispatchKeySet(k).repr_; + } + return repr; + } + + constexpr uint64_t backend_bits_to_repr( + std::initializer_list ks) { + uint64_t repr = 0; + for (auto k : ks) { + repr |= DispatchKeySet(k).repr_; + } + return repr; + } + + explicit constexpr DispatchKeySet(std::initializer_list ks) + : repr_(keys_to_repr(ks)) {} + + explicit constexpr DispatchKeySet(std::initializer_list ks) + // Note: for some reason, putting this logic directly in the constructor + // appears to fail to compile on CUDA 10.1. + // See an example internal failure at + // https://www.internalfb.com/intern/skycastle/run/76561193669136035/artifact/actionlog.76561193742069401.stderr + : repr_(backend_bits_to_repr(ks)) {} + + // Test if a DispatchKey is in the set + inline bool has(DispatchKey t) const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(t != DispatchKey::Undefined); + return has_all(DispatchKeySet(t)); + } + constexpr bool has_backend(BackendComponent t) const { + return has_all(DispatchKeySet(t)); + } + + // Test if a DispatchKey is in the set + // Given a DispatchKeySet of functionality keys and (potentially) backend + // keys, tests if all of them are in the current set. 
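The case analysis in the DispatchKey constructor above is easiest to see in the raw bit pattern. A sketch (the expected values assume the 15 backend bits listed earlier, which puts the Dense functionality bit at bit 15):

#include <c10/core/DispatchKeySet.h>
#include <cstdio>

int main() {
  using namespace c10;
  // Backend-only, functionality-only, and full runtime keysets.
  auto cpu_bit = DispatchKeySet(BackendComponent::CPUBit).raw_repr();  // 0x1
  auto dense = DispatchKeySet(DispatchKey::Dense).raw_repr();          // 0x8000
  auto cpu = DispatchKeySet(DispatchKey::CPU).raw_repr();              // 0x8001

  std::printf("CPUBit: %#llx\nDense:  %#llx\nCPU:    %#llx\n",
              static_cast<unsigned long long>(cpu_bit),
              static_cast<unsigned long long>(dense),
              static_cast<unsigned long long>(cpu));
  return 0;
}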
+ constexpr bool has_all(DispatchKeySet ks) const { + return static_cast((repr_ & ks.repr_) == ks.repr_); + } + + // Given a DispatchKeySet of functionality keys and (potentially) backend + // keys, tests if any of them are in the current set. This could technically + // be pretty easily implemented using has(). It is strictly a perf + // optimization though. There are many places in the code base where we want + // to test for multiple functionality keys together. HOWEVER, runtime + // per-backend functionality keys aren't allowed to be used with this + // function, because you can end up with weird results. e.g. + // DispatchKeySet(DispatchKey::AutogradCPU).has_any(DispatchKeySet(DispatchKey::CPU)) + // would return true. + inline bool has_any(DispatchKeySet ks) const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + // Either there are no backend bits in the input keyset + ((ks.repr_ & full_backend_mask) == 0) || + // or there are no per-backend-functionality bits + // See [Note: Per-Backend Functionality Dispatch Keys] + ((ks & + DispatchKeySet({ + DispatchKey::Dense, + DispatchKey::Quantized, + DispatchKey::Sparse, + DispatchKey::SparseCsr, + DispatchKey::AutogradFunctionality, + }) + .repr_) == 0)); + return static_cast((repr_ & ks.repr_) != 0); + } + // Test if DispatchKeySet is a superset of ks. + bool isSupersetOf(DispatchKeySet ks) const { + return (repr_ & ks.repr_) == ks.repr_; + } + // Perform set union + constexpr DispatchKeySet operator|(DispatchKeySet other) const { + return DispatchKeySet(repr_ | other.repr_); + } + // Perform set intersection + constexpr DispatchKeySet operator&(DispatchKeySet other) const { + return DispatchKeySet(repr_ & other.repr_); + } + // Compute the set difference self - other, + // but ONLY for the functionality keys. + // Any backend bits set on self will remain unchanged. + // See Note [Removing keys from DispatchKeySet Only Affects Functionality + // Keys] + constexpr DispatchKeySet operator-(DispatchKeySet other) const { + return DispatchKeySet(repr_ & (full_backend_mask | ~other.repr_)); + } + + // Compute self ^ other + constexpr DispatchKeySet operator^(DispatchKeySet other) const { + return DispatchKeySet(repr_ ^ other.repr_); + } + bool operator==(DispatchKeySet other) const { + return repr_ == other.repr_; + } + bool operator!=(DispatchKeySet other) const { + return repr_ != other.repr_; + } + // Add a DispatchKey to the DispatchKey set. Does NOT mutate, + // returns the extended DispatchKeySet! + C10_NODISCARD constexpr DispatchKeySet add(DispatchKey t) const { + return *this | DispatchKeySet(t); + } + C10_NODISCARD constexpr DispatchKeySet add(DispatchKeySet ks) const { + return *this | ks; + } + + // Remove a DispatchKey from the DispatchKey set. + // This is generally not an operation you should be doing + // (it's used to implement the printing overload, operator<<) + // + // Note [Removing keys from DispatchKeySet Only Affects Functionality Keys] + // Only functionality bits are allowed to be removed from a keyset. + // For now, we're only allowing removal of "functionality bits" from the + // keyset, which is specifically needed by the fallthrough key calculation + // logic. Why is removing backend bits problematic? Consider this example: + // + // DispatchKeySet([DispatchKey.CPU, DispatchKey.AutogradCUDA, + // DispatchKey.CUDA]).remove(DispatchKey.AutogradCUDA) + // DispatchKeySet([DispatchKey.CPU, + // DispatchKey.AutogradCUDA]).remove(DispatchKey.AutogradCUDA) + // + // What do we want to happen? 
+ // Technically, we'd like it to be true that after removal, + // the first keyset still has the CUDA dispatch key while the second doesn't. + // Unfortunately there's no way to represent that, because the two keysets are + // represented the same way internally: functionality bits: Autograd, Dense + // backend bits: CPU, CUDA + // + // Instead, remove(DispatchKey.AutogradCPU) will only remove the "Autograd" + // bit from the bitset. + C10_NODISCARD constexpr DispatchKeySet remove(DispatchKey t) const { + return DispatchKeySet( + repr_ & ~(DispatchKeySet(t).repr_ & ~full_backend_mask)); + } + // You're allowed to remove a backend bit from a DispatchKeySet, + // but you have to be explicit about it (remove_backend() instead of + // remove()). + constexpr DispatchKeySet remove_backend(BackendComponent b) const { + return DispatchKeySet(repr_ & ~(DispatchKeySet(b).repr_)); + } + // Is the set empty? (AKA undefined tensor) + bool empty() const { + return repr_ == 0; + } + uint64_t raw_repr() { + return repr_; + } + + DispatchKey highestFunctionalityKey() const { + auto functionality_idx = indexOfHighestBit(); + // This means that none of the functionality bits were set. + if (functionality_idx < num_backends) + return DispatchKey::Undefined; + // The first num_backend bits in the keyset don't correspond to real + // dispatch keys. + return static_cast(functionality_idx - num_backends); + } + + // This is similar like toBackendComponent(DispatchKey), but less restrictive. + // toBackendComponent() errors out if the key that it was passed has no + // backend bits, which is useful for error checking. We need a version of that + // here that can also handle "fake" backends like FPGA, because they need to + // map to the AutogradOther key. For those backends, we return + // BackendComponent::InvalidBit. + BackendComponent highestBackendKey() const { + // mask to mask out functionality bits + auto backend_idx = + DispatchKeySet(repr_ & full_backend_mask).indexOfHighestBit(); + // all zeros across the backend bits means that no backend bits are set. + if (backend_idx == 0) + return BackendComponent::InvalidBit; + return static_cast(backend_idx); + } + + // returns the DispatchKey of highest priority in the set. + DispatchKey highestPriorityTypeId() const { + auto functionality_k = highestFunctionalityKey(); + if (isPerBackendFunctionalityKey(functionality_k)) { + return toRuntimePerBackendFunctionalityKey( + functionality_k, highestBackendKey()); + } + return functionality_k; + } + + // Returns the index of the most-significant bit in the keyset. + // This is used to as part of the calculation into the operator table to get: + // - the highest "functionality" bit in the keyset. + // - the highest "backend" bit in the keyset. + uint8_t indexOfHighestBit() const { + return 64 - llvm::countLeadingZeros(repr_); + } + +#if defined(C10_MOBILE_TRIM_DISPATCH_KEYS) + // [Note: Trimmed Mobile Dispatch Keys] + /** + * The method below maps the dispatch key in the enum DispatchKey to an + * integer index in the dispatchTable_ array in OperatorEntry. The array + * is trimmed for mobile to reduce peak memory usage since it's + * unnecessary to reserve additional space for dispatch keys that will + * never be used on mobile. 
+ */ + int getDispatchTableIndexForDispatchKeySet() const { + auto dk = highestPriorityTypeId(); + switch (dk) { + case DispatchKey::Undefined: + return 0; + case DispatchKey::CPU: + return 1; + case DispatchKey::QuantizedCPU: + return 2; + case DispatchKey::SparseCPU: + return 3; + case DispatchKey::BackendSelect: + return 4; + case DispatchKey::ADInplaceOrView: + return 5; + case DispatchKey::AutogradOther: + return 6; + case DispatchKey::AutogradCPU: + return 7; + default: + return -1; + } + } +#else + // returns the index in the operator table of highest priority key in the the + // keyset Note that we could in theory implement this using + // highestPriorityTypeId(), but this code is very hotpath and we can do it + // faster without it. + int getDispatchTableIndexForDispatchKeySet() const { + auto functionality_idx = + DispatchKeySet(repr_ >> num_backends).indexOfHighestBit(); + auto offset_and_mask = offsetsAndMasks()[functionality_idx]; + // Mask the functionality bits out first, then right-shift by 1. + // right-shifting by 1 because everything is zero-indexed. + // E.g. 000001 (CPU) should give us an offset of 0, 000010 (CUDA) should + // give us an offset of 1, etc. + auto backend_idx = + DispatchKeySet((repr_ & offset_and_mask.mask) >> 1).indexOfHighestBit(); + return offset_and_mask.offset + backend_idx; + } +#endif + + // returns the "index" of the highest priority backend in the keyset. + // This is pretty similar to getBackendKey(), but: + // - It's hotpath code (part of the runtime bitset calculation) + // - I's returns an integer index, not an enum value + // - Everything is shifted to the right by 1. + // BackendComponent::InvalidBit is technically the lowest enum value, + // but it isn't included in the runtime table. So CPUBit = 1, CUDABit = 2, + // etc. + uint64_t getBackendIndex() const { + return DispatchKeySet((repr_ & full_backend_mask) >> 1).indexOfHighestBit(); + } + + private: + constexpr DispatchKeySet(uint64_t repr) : repr_(repr) {} + uint64_t repr_ = 0; + + public: + // STL iterator for DispatchKeySet. Iterates through all runtime DispatchKeys + // in the set. The iterator is only invalidated by the destruction of the + // underlying DispatchKeySet as the iterator stores a pointer to the raw + // representation of the DispatchKeySet. Note: When we encounter a per-backend + // functionality (e.g. Dense or Sparse), we will iterate through EVERY backend + // in the keyset, for that functionality. For example, if the next + // functionality key to iterate over is Autograd, and the backend bits in the + // keyset correspond to [BackendComponent::CPUBit, BackendComponent::CUDABit], + // then the next two keys we return will be DispatchKey::AutogradCPU, + // DispatchKey::AutogradCUDA (CPU first because it has lower precedence than + // CUDA in DispatchKey.h). + class iterator { + public: + using self_type = iterator; + using iterator_category = std::input_iterator_tag; + using value_type = DispatchKey; + using difference_type = ptrdiff_t; + using reference = value_type&; + using pointer = value_type*; + // final mask value should mask out the entire keyset + static const uint8_t end_iter_mask_val = + num_backends + num_functionality_keys; + // final key value should be the last DispatchKey + static const uint8_t end_iter_key_val = num_functionality_keys; + + // current_dispatchkey_idx_ will iterate through all functionality bits. + // current_backendcomponent_idx_ will iterate through all backend bits. 
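+    //
+    // Editorial usage sketch (not part of the upstream header): range-based
+    // iteration expands a per-backend functionality bit into one runtime key
+    // per backend bit in the set, e.g. for a keyset built from the runtime
+    // keys CPU and CUDA:
+    //
+    //   DispatchKeySet ks({DispatchKey::CPU, DispatchKey::CUDA});
+    //   for (DispatchKey k : ks) {
+    //     // visits DispatchKey::CPU, then DispatchKey::CUDA
+    //   }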
+ explicit iterator( + const uint64_t* data_ptr, + uint8_t next_functionality = num_backends, + uint8_t next_backend = 0) + : data_ptr_(data_ptr), + next_functionality_(next_functionality), + next_backend_(next_backend), + // These are in an invalid state at construction time, and set by the + // first increment call + current_dispatchkey_idx_(end_iter_key_val), + current_backendcomponent_idx_(end_iter_key_val) { + // Go to the first key in the set + TORCH_INTERNAL_ASSERT( + next_functionality_ >= num_backends, + "num_backends=", + static_cast(num_backends), + "next_functionality_=", + static_cast(next_functionality_)); + ++(*this); + } + + C10_API self_type& operator++(); + + self_type operator++(int) { + self_type previous_iterator = *this; + ++(*this); + return previous_iterator; + } + + bool operator==(const self_type& rhs) const { + return next_functionality_ == rhs.next_functionality_ && + current_dispatchkey_idx_ == rhs.current_dispatchkey_idx_ && + next_backend_ == rhs.next_backend_ && + current_backendcomponent_idx_ == rhs.current_backendcomponent_idx_; + } + bool operator!=(const self_type& rhs) const { + return next_functionality_ != rhs.next_functionality_ || + current_dispatchkey_idx_ != rhs.current_dispatchkey_idx_ || + next_backend_ != rhs.next_backend_ || + current_backendcomponent_idx_ != rhs.current_backendcomponent_idx_; + } + DispatchKey operator*() const { + auto functionality_key = + static_cast(current_dispatchkey_idx_); + if (isPerBackendFunctionalityKey(functionality_key)) { + auto next_key = toRuntimePerBackendFunctionalityKey( + functionality_key, + static_cast(current_backendcomponent_idx_)); + // We expect all of the Dense, Sparse, Quantized, and Autograd keys to + // be ordered the same way with respect to their backends + TORCH_INTERNAL_ASSERT( + toBackendComponent(next_key) == + static_cast(current_backendcomponent_idx_), + "Tried to map functionality key ", + toString(functionality_key), + " and backend bit ", + toString( + static_cast(current_backendcomponent_idx_)), + " to a runtime key, but ended up with ", + toString(next_key), + ". This can happen if the order of the backend dispatch keys in DispatchKey.h isn't consistent.", + " Please double check that enum for inconsistencies."); + return next_key; + } else { + return functionality_key; + } + } + + private: + const uint64_t* data_ptr_; + uint8_t next_functionality_; + uint8_t next_backend_; + uint8_t current_dispatchkey_idx_; + uint8_t current_backendcomponent_idx_; + }; + + public: + // Returns iterator to the first key in the set. If no keys are in the + // set, then will return the end iterator. + iterator begin() const { + return iterator(&repr_); + } + + // We do not need to iterate beyond EndOfFunctionalityKeys so we will treat + // this as the end iterator. + iterator end() const { + return iterator(&repr_, iterator::end_iter_mask_val); + } +}; + +C10_API std::string toString(DispatchKeySet); +C10_API std::ostream& operator<<(std::ostream&, DispatchKeySet); + +C10_API inline int getDispatchTableIndexForDispatchKey(DispatchKey k) { + return DispatchKeySet(k).getDispatchTableIndexForDispatchKeySet(); +} + +// Alias key DispatchKey::Autograd maps to +// (autograd_dispatch_keyset x full_backend_mask) +// NB: keys in this set also get associated with CompositeImplicitAutograd +// +// Note [autograd_dispatch_keyset Does Not Include Backend Bits] +// We don't want to include any backend bits (BackendComponent::CPUBit, etc) +// directly in autograd_dispatch_keyset. +// Why? 
keysets like autograd_dispatch_keyset are commonly used to remove +// autograd keys from a DispatchKeySet throughout the code base. However, you +// are only allowed to remove functionality bits from a keyset, not backend +// bits. See Note [Removing keys from DispatchKeySet Only Affects Functionality +// Keys] for details. To be consistent and avoid confusion, we're explicitly +// setting up autograd_dispatch_keyset to not have any backend bits. +constexpr DispatchKeySet autograd_dispatch_keyset = DispatchKeySet({ + DispatchKey::AutogradFunctionality, + DispatchKey::AutogradOther, + DispatchKey::AutogradNestedTensor, +}); + +constexpr DispatchKeySet autocast_dispatch_keyset = DispatchKeySet({ + DispatchKey::AutocastCPU, + DispatchKey::AutocastCUDA, + DispatchKey::AutocastXPU, + DispatchKey::AutocastIPU, + DispatchKey::AutocastHPU, + DispatchKey::AutocastXLA, + DispatchKey::AutocastPrivateUse1, +}); + +// See Note [TLS Initialization] +constexpr DispatchKeySet default_included_set = DispatchKeySet({ + DispatchKey::BackendSelect, + DispatchKey::ADInplaceOrView, +}); + +constexpr DispatchKeySet default_excluded_set = DispatchKeySet({ + DispatchKey::AutocastCPU, + DispatchKey::AutocastCUDA, + DispatchKey::AutocastXPU, + DispatchKey::AutocastIPU, + DispatchKey::AutocastHPU, + DispatchKey::AutocastXLA, + DispatchKey::AutocastPrivateUse1, +}); + +constexpr DispatchKeySet autograd_dispatch_keyset_with_ADInplaceOrView = + autograd_dispatch_keyset | DispatchKeySet(DispatchKey::ADInplaceOrView); + +constexpr DispatchKeySet python_ks = DispatchKeySet({ + DispatchKey::Python, + DispatchKey::PythonTLSSnapshot, +}); + +constexpr DispatchKeySet sparse_ks = DispatchKeySet(DispatchKey::Sparse); + +constexpr DispatchKeySet sparse_csr_ks = DispatchKeySet(DispatchKey::SparseCsr); + +constexpr DispatchKeySet mkldnn_ks = DispatchKeySet(DispatchKey::MkldnnCPU); + +// backend dispatch keys that map to DispatchKey::AutogradOther +// NB: keys in this set also get associated with CompositeImplicitAutograd +constexpr DispatchKeySet autogradother_backends = + DispatchKeySet( + // HIP and VE aren't in this list: they now have their own backend bits + // which means that they can now have their own Autograd keys. + // Technically, HIP will now redispatch to its own custom AutogradHIP + // slot in the runtime table. + {DispatchKey::FPGA, + DispatchKey::ORT, + DispatchKey::Vulkan, + DispatchKey::Metal, + DispatchKey::CustomRNGKeyId, + DispatchKey::MkldnnCPU, + // Sparse and Quantized backends also live here. + DispatchKey::Sparse, + DispatchKey::SparseCsr, + DispatchKey::Quantized}) + // Including the backend bits because this keyset is used during op + // registration, which requires looping over all runtime autogradother + // backend keys. + | DispatchKeySet(DispatchKeySet::RAW, full_backend_mask); + +// The set of dispatch keys that come after autograd +// n.b. 
this relies on the fact that AutogradOther is currently the lowest +// Autograd key +constexpr DispatchKeySet after_autograd_keyset = + DispatchKeySet(DispatchKeySet::FULL_AFTER, c10::DispatchKey::AutogradOther); + +// The set of dispatch keys that come after ADInplaceOrView +constexpr DispatchKeySet after_ADInplaceOrView_keyset = DispatchKeySet( + DispatchKeySet::FULL_AFTER, + c10::DispatchKey::ADInplaceOrView); + +// The set of dispatch keys that come after Functionalize +constexpr DispatchKeySet after_func_keyset = + DispatchKeySet(DispatchKeySet::FULL_AFTER, c10::DispatchKey::Functionalize) + .remove( + // NOTE: we also need to remove ADInplaceOrView from the keyset when + // redispatching after the func kernels. This is because we're not + // calling the same op; we originally called an inplace op, and now + // we aren't. The original key calculation figured out which keys + // were Fallthrough based on the inplace op. That means that it did + // not include the ADInPlaceOrView kernel as a fallthrough key. + // However, we WANT the ADInPlaceOrView kernel to be ignored now + // that we're calling an out-of-place op. Re-invoking + // Dispatcher::call would re-run the Fallthrough key calculation and + // get us that, But at::redispatch is more performant. We can get + // away with it by explicitly removing the key here. + c10::DispatchKey::ADInplaceOrView); + +constexpr DispatchKeySet backend_bitset_mask = + DispatchKeySet(DispatchKeySet::RAW, (1ULL << num_backends) - 1); + +constexpr auto inplace_or_view_ks = + DispatchKeySet(DispatchKey::ADInplaceOrView); +constexpr auto autograd_cpu_ks = DispatchKeySet(DispatchKey::AutogradCPU); +constexpr auto autograd_ipu_ks = DispatchKeySet(DispatchKey::AutogradIPU); +constexpr auto autograd_xpu_ks = DispatchKeySet(DispatchKey::AutogradXPU); +constexpr auto autograd_cuda_ks = DispatchKeySet(DispatchKey::AutogradCUDA); +constexpr auto autograd_xla_ks = DispatchKeySet(DispatchKey::AutogradXLA); +constexpr auto autograd_lazy_ks = DispatchKeySet(DispatchKey::AutogradLazy); +constexpr auto autograd_meta_ks = DispatchKeySet(DispatchKey::AutogradMeta); +constexpr auto autograd_mps_ks = DispatchKeySet(DispatchKey::AutogradMPS); +constexpr auto autograd_hpu_ks = DispatchKeySet(DispatchKey::AutogradHPU); +constexpr auto autograd_privateuse1_ks = + DispatchKeySet(DispatchKey::AutogradPrivateUse1); +constexpr auto autograd_privateuse2_ks = + DispatchKeySet(DispatchKey::AutogradPrivateUse2); +constexpr auto autograd_privateuse3_ks = + DispatchKeySet(DispatchKey::AutogradPrivateUse3); +constexpr auto autograd_other_ks = DispatchKeySet(DispatchKey::AutogradOther); +constexpr auto autograd_nested = + DispatchKeySet(DispatchKey::AutogradNestedTensor); +// keyset corresponding to functorch keys that have their own dedicated +// TensorImpl subclass. 
+constexpr auto functorch_transforms_ks = DispatchKeySet( + {DispatchKey::FuncTorchBatched, + DispatchKey::FuncTorchVmapMode, + DispatchKey::Batched, + DispatchKey::VmapMode, + DispatchKey::FuncTorchGradWrapper}); + +constexpr auto functorch_batched_ks = + DispatchKeySet({DispatchKey::FuncTorchBatched}); + +// This keyset has: +// (1) the functionality bits corresponding to backends (dense, sparse, +// quantized) (2) all of the backend bits set +constexpr DispatchKeySet backend_functionality_keys = + DispatchKeySet({ + DispatchKey::Dense, + DispatchKey::Quantized, + DispatchKey::Sparse, + DispatchKey::SparseCsr, + }) | + DispatchKeySet(DispatchKeySet::RAW, full_backend_mask); + +struct OpTableOffsetAndMask { + uint16_t offset; + uint16_t backend_mask; +}; + +static_assert( + num_backends <= 16, + "Right now we expect the number of backends not to exceed 16. In the (unlikely) event" + " that this changes, the size of OpTableOffsetAndMask::backend_mask needs to be increased too."); + +// true if t is a backend dispatch key +C10_API bool isBackendDispatchKey(DispatchKey t); + +// Resolve alias dispatch key to DispatchKeySet if applicable +C10_API DispatchKeySet getRuntimeDispatchKeySet(DispatchKey t); + +// Resolve alias dispatch key to DispatchKeySet if applicable, +// and check if k is a part of that set +C10_API bool runtimeDispatchKeySetHas(DispatchKey t, DispatchKey k); + +// Returns a DispatchKeySet of all backend keys mapped to Autograd dispatch key +// t, DispatchKeySet is empty if t is not alias of DispatchKey::Autograd. +C10_API DispatchKeySet getBackendKeySetFromAutograd(DispatchKey t); + +// Returns a DispatchKeySet of autograd related keys mapped to backend. +// for a given backend key, use the associated autograd key. +// for non-backend keys, use AutogradOther as a default. +// Note: it's convenient and fast to return a default here rather than (say) +// returning an optional, or throwing. But it makes callers +// responsible for either a) enforcing the invariant that only backend keys +// be passed as arguments, or b) interpreting our return value carefully. +inline DispatchKeySet getAutogradRelatedKeySetFromBackend(BackendComponent t) { + switch (t) { + case BackendComponent::CPUBit: + return inplace_or_view_ks | autograd_cpu_ks; + case BackendComponent::IPUBit: + return inplace_or_view_ks | autograd_ipu_ks; + case BackendComponent::XPUBit: + return inplace_or_view_ks | autograd_xpu_ks; + case BackendComponent::CUDABit: + return inplace_or_view_ks | autograd_cuda_ks; + case BackendComponent::XLABit: + return inplace_or_view_ks | autograd_xla_ks; + case BackendComponent::LazyBit: + return inplace_or_view_ks | autograd_lazy_ks; + case BackendComponent::MetaBit: + return inplace_or_view_ks | autograd_meta_ks; + case BackendComponent::MPSBit: + return inplace_or_view_ks | autograd_mps_ks; + case BackendComponent::HPUBit: + return inplace_or_view_ks | autograd_hpu_ks; + case BackendComponent::PrivateUse1Bit: + return inplace_or_view_ks | autograd_privateuse1_ks; + case BackendComponent::PrivateUse2Bit: + return inplace_or_view_ks | autograd_privateuse2_ks; + case BackendComponent::PrivateUse3Bit: + return inplace_or_view_ks | autograd_privateuse3_ks; + default: + return inplace_or_view_ks | autograd_other_ks; + } +} + +// Returns a DispatchKeySet of autocast related keys mapped to backend. 
+inline DispatchKeySet getAutocastRelatedKeySetFromBackend(BackendComponent t) { + constexpr auto autocast_cpu_ks = DispatchKeySet(DispatchKey::AutocastCPU); + constexpr auto autocast_xpu_ks = DispatchKeySet(DispatchKey::AutocastXPU); + constexpr auto autocast_ipu_ks = DispatchKeySet(DispatchKey::AutocastIPU); + constexpr auto autocast_hpu_ks = DispatchKeySet(DispatchKey::AutocastHPU); + constexpr auto autocast_cuda_ks = DispatchKeySet(DispatchKey::AutocastCUDA); + constexpr auto autocast_xla_ks = DispatchKeySet(DispatchKey::AutocastXLA); + constexpr auto autocast_privateuse1_ks = + DispatchKeySet(DispatchKey::AutocastPrivateUse1); + switch (t) { + case BackendComponent::CPUBit: + return autocast_cpu_ks; + case BackendComponent::XPUBit: + return autocast_xpu_ks; + case BackendComponent::IPUBit: + return autocast_ipu_ks; + case BackendComponent::HPUBit: + return autocast_hpu_ks; + case BackendComponent::CUDABit: + return autocast_cuda_ks; + case BackendComponent::XLABit: + return autocast_xla_ks; + case BackendComponent::PrivateUse1Bit: + return autocast_privateuse1_ks; + default: + return DispatchKeySet(); + } +} + +// returns the "backend" DispatchKey of highest priority in the set. +// This is basically like highestBackendKey(), except that we have some +// "functionality" bits that correspond to backends (Sparse, Quantized) +inline DispatchKey highestPriorityBackendTypeId(DispatchKeySet ks) { + return (ks & backend_functionality_keys).highestPriorityTypeId(); +} + +// This API exists because we have a use case for checking +// getRuntimeDispatchKeySet(alias).has(DispatchKey::Undefined) +// in OperatorEntry.cpp but we disallow it in has() API. +C10_API bool isIncludedInAlias(DispatchKey k, DispatchKey alias); + +// Historically, every tensor only had a single DispatchKey, and it was always +// something like CPU, and there wasn't any of this business where TLS +// could cause the DispatchKey of a tensor to change. But we still have some +// legacy code that is still using DispatchKey for things like instanceof +// checks; if at all possible, refactor the code to stop using DispatchKey in +// those cases. +static inline DispatchKey legacyExtractDispatchKey(DispatchKeySet s) { + // NB: If you add any extra keys that can be stored in TensorImpl on + // top of existing "backend" keys like CPU/CUDA, you need to add it + // here. At the moment, autograd keys and ADInplaceOrView key need this + // treatment; + return (s - autograd_dispatch_keyset_with_ADInplaceOrView - + autocast_dispatch_keyset - + DispatchKeySet( + {DispatchKey::Functionalize, + DispatchKey::PythonTLSSnapshot, + DispatchKey::Python})) + .highestPriorityTypeId(); +} + +template +using is_not_DispatchKeySet = std::negation>; + +// Given a function type, constructs a function_traits type that drops the first +// parameter type if the first parameter is of type DispatchKeySet. NB: +// DispatchKeySet is currently explicitly hidden from JIT (mainly to avoid +// pushing unnecessary arguments on the stack - see Note [ Plumbing Keys Through +// the Dispatcher] for details). If at any point in the future we need to expose +// this type to JIT, revisit the usage of this type alias. 
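+//
+// Editorial sketch (not part of the upstream header), for a hypothetical
+// kernel signature `Tensor(DispatchKeySet, const Tensor&)`:
+//
+//   using Traits = remove_DispatchKeySet_arg_from_func<Tensor(DispatchKeySet, const Tensor&)>;
+//
+// Traits then describes `Tensor(const Tensor&)`, i.e. the leading
+// DispatchKeySet parameter is dropped before the kernel signature is exposed
+// further up the stack.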
+template +using remove_DispatchKeySet_arg_from_func = guts::make_function_traits_t< + typename guts::infer_function_traits_t::return_type, + typename std::conditional_t< + std::is_same_v< + DispatchKeySet, + typename guts::typelist::head_with_default_t< + void, + typename guts::infer_function_traits_t< + FuncType>::parameter_types>>, + guts::typelist::drop_if_nonempty_t< + typename guts::infer_function_traits_t::parameter_types, + 1>, + typename guts::infer_function_traits_t::parameter_types>>; +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/DynamicCast.h b/MLPY/Lib/site-packages/torch/include/c10/core/DynamicCast.h new file mode 100644 index 0000000000000000000000000000000000000000..65a5e4f3b66ff99829ac8089f07dd6eb5c97b699 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/DynamicCast.h @@ -0,0 +1,125 @@ +#pragma once + +#include +#include +#include +#include + +namespace c10 { + +// Dynamic type casting utils: +// - fetch_and_cast +// - cast_and_store +// +// fetch_and_cast fetch a value with dynamic type specified by a ScalarType +// from a void pointer and cast it to a static type. +// +// cast_and_store casts a static typed value into dynamic type specified +// by a ScalarType, and store it into a void pointer. +// +// NOTE: +// +// Dynamic casting allows us to support type promotion without blowing up +// the combination space: For example, without dynamic cast, in order to +// implement `add_` with type promotion, we would need something like +// +// AT_DISPATCH_ALL_TYPES(output.dtype(), +// AT_DISPATCH_ALL_TYPES(input1.dtype(), +// AT_DISPATCH_ALL_TYPES(input2.dtype(), +// [](arg0_t a, arg1_t b) -> out_t { return a + b; } +// ) +// ) +// ) +// +// If we support N dtypes, the above code would generate the a+b kernel for +// all the N * N * N different supported types, the compilation time and +// binary size would become horrible. +// +// Dynamic casting might sounds like a bad idea in terms of performance. +// Especially if you ever do it in a loop, you are going to do a billion tests. +// But in practice it is not as bad as it might look: +// +// - on CPU, this is a branch that always has the same outcome, therefore +// hopefully the branch predictor could do the job pretty well +// - on GPU, these branches will not diverge, so we could still have the same +// warp executing the same line of code +// - Most kernels, like `add`, are bandwidth bound, adding a few clock cycles to +// check an integer does not hurt the performance much because the ALUs would +// wait for load instructions anyway. +// +// For the discussion and benchmark, refer to: +// - https://github.com/pytorch/pytorch/pull/28343 +// - https://github.com/pytorch/pytorch/pull/28344 +// - https://github.com/pytorch/pytorch/pull/28345 +// + +#ifdef C10_HOST_DEVICE +#define ERROR_UNSUPPORTED_CAST CUDA_KERNEL_ASSERT(false); +#else +#define ERROR_UNSUPPORTED_CAST TORCH_CHECK(false, "Unexpected scalar type"); +#endif + +// Fetch a value with dynamic type src_type from ptr, and cast it to static type +// dest_t. 
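+//
+// Editorial usage sketch (not part of the upstream header): a type-erased
+// elementwise loop can promote an operand to double, compute, and then store
+// the result back in the output's dynamic dtype, e.g.
+//
+//   double a = fetch_and_cast<double>(src_type, src_ptr);
+//   cast_and_store<double>(dest_type, dest_ptr, a + 1.0);
+//
+// where src_type/dest_type are ScalarType values and src_ptr/dest_ptr are
+// assumed to point at correctly typed elements.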
+#define FETCH_AND_CAST_CASE(type, scalartype) \ + case ScalarType::scalartype: \ + return c10::convert(c10::load(ptr)); + +template +C10_HOST_DEVICE inline dest_t fetch_and_cast( + const ScalarType src_type, + const void* ptr) { + switch (src_type) { + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(FETCH_AND_CAST_CASE) + FETCH_AND_CAST_CASE(uint16_t, UInt16) + FETCH_AND_CAST_CASE(uint32_t, UInt32) + FETCH_AND_CAST_CASE(uint64_t, UInt64) + default: + ERROR_UNSUPPORTED_CAST + } + return dest_t(0); // just to avoid compiler warning +} + +// Cast a value with static type src_t into dynamic dest_type, and store it to +// ptr. +#define CAST_AND_STORE_CASE(type, scalartype) \ + case ScalarType::scalartype: \ + *(type*)ptr = c10::convert(value); \ + return; +template +C10_HOST_DEVICE inline void cast_and_store( + const ScalarType dest_type, + void* ptr, + src_t value) { + switch (dest_type) { + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(CAST_AND_STORE_CASE) + CAST_AND_STORE_CASE(uint16_t, UInt16) + CAST_AND_STORE_CASE(uint32_t, UInt32) + CAST_AND_STORE_CASE(uint64_t, UInt64) + default:; + } + ERROR_UNSUPPORTED_CAST +} + +#define DEFINE_UNCASTABLE(T, scalartype_) \ + template <> \ + C10_HOST_DEVICE inline T fetch_and_cast( \ + const ScalarType src_type, const void* ptr) { \ + CUDA_KERNEL_ASSERT(ScalarType::scalartype_ == src_type); \ + return c10::load(ptr); \ + } \ + template <> \ + C10_HOST_DEVICE inline void cast_and_store( \ + const ScalarType dest_type, void* ptr, T value) { \ + CUDA_KERNEL_ASSERT(ScalarType::scalartype_ == dest_type); \ + *(T*)ptr = value; \ + } + +AT_FORALL_QINT_TYPES(DEFINE_UNCASTABLE) + +#undef FETCH_AND_CAST_CASE +#undef CAST_AND_STORE_CASE +#undef DEFINE_UNCASTABLE +#undef ERROR_UNSUPPORTED_CAST + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/Event.h b/MLPY/Lib/site-packages/torch/include/c10/core/Event.h new file mode 100644 index 0000000000000000000000000000000000000000..475aca5fdf252730a88cc40de2ab56c938827335 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/Event.h @@ -0,0 +1,125 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace c10 { + +/** + * A backend-generic movable, not copyable, not thread-safe event. + * + * The design of this event follows that of CUDA and HIP events. These events + * are recorded and waited on by streams and can be rerecorded to, + * each rerecording essentially creating a new version of the event. + * For example, if (in CPU time), stream X is asked to record E, + * stream Y waits on E, and stream X is asked to record E again, then Y will + * wait for X to finish the first call to record and not the second, because + * it's waiting on the first version of event E, not the second. + * Querying an event only returns the status of its most recent version. + * + * Backend-generic events are implemented by this class and + * impl::InlineEvent. In addition to these events there are also + * some backend-specific events, like ATen's CUDAEvent. Each of these + * classes has its own use. + * + * impl::InlineEvent<...> or a backend-specific event should be + * preferred when the backend is known at compile time and known to + * be compiled. Backend-specific events may have additional functionality. + * + * This Event should be used if a particular backend may not be available, + * or the backend required is not known at compile time. + * + * These generic events are built on top of DeviceGuardImpls, analogous + * to DeviceGuard and InlineDeviceGuard. 
The name "DeviceGuardImpls," + * is no longer entirely accurate, as these classes implement the + * backend-specific logic for a generic backend interface. + * + * See DeviceGuardImplInterface.h for a list of all supported flags. + */ + +struct Event final { + // Constructors + Event() = delete; + Event( + const DeviceType _device_type, + const EventFlag _flag = EventFlag::PYTORCH_DEFAULT) + : impl_{_device_type, _flag} {} + + // Copy constructor and copy assignment operator (deleted) + Event(const Event&) = delete; + Event& operator=(const Event&) = delete; + + // Move constructor and move assignment operator + Event(Event&&) noexcept = default; + Event& operator=(Event&&) noexcept = default; + + // Destructor + ~Event() = default; + + // Getters + Device device() const noexcept { + return Device(device_type(), device_index()); + } + DeviceType device_type() const noexcept { + return impl_.device_type(); + } + DeviceIndex device_index() const noexcept { + return impl_.device_index(); + } + EventFlag flag() const noexcept { + return impl_.flag(); + } + bool was_marked_for_recording() const noexcept { + return impl_.was_marked_for_recording(); + } + + /** + * Calls record() if and only if record() has never been called for this + * event. Note: because Event is not thread-safe recordOnce() may call + * record() multiple times if called from multiple threads. + */ + void recordOnce(const Stream& stream) { + impl_.recordOnce(stream); + } + + /** + * Increments the event's version and enqueues a job with this version + * in the stream's work queue. When the stream process that job + * it notifies all streams waiting on / blocked by that version of the + * event to continue and marks that version as recorded. + * */ + void record(const Stream& stream) { + impl_.record(stream); + } + + /** + * Does nothing if the event has not been scheduled to be recorded. + * If the event was previously enqueued to be recorded, a command + * to wait for the version of the event that exists at the time of this call + * is inserted in the stream's work queue. + * When the stream reaches this command it will stop processing + * additional commands until that version of the event is marked as recorded. + */ + void block(const Stream& stream) const { + impl_.block(stream); + } + + /** + * Returns true if (and only if) + * (1) the event has never been scheduled to be recorded + * (2) the current version is marked as recorded. + * Returns false otherwise. + */ + bool query() const { + return impl_.query(); + } + + private: + impl::InlineEvent impl_; +}; + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/GeneratorImpl.h b/MLPY/Lib/site-packages/torch/include/c10/core/GeneratorImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..089dd1dba43611df95e1ed6076e2e1eb9149aa0a --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/GeneratorImpl.h @@ -0,0 +1,107 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include + +/** + * Note [Generator] + * ~~~~~~~~~~~~~~~~ + * A Pseudo Random Number Generator (PRNG) is an engine that uses an algorithm + * to generate a seemingly random sequence of numbers, that may be later be used + * in creating a random distribution. Such an engine almost always maintains a + * state and requires a seed to start off the creation of random numbers. 
Often + * times, users have found it beneficial to be able to explicitly create, + * retain, and destroy PRNG states and also be able to have control over the + * seed value. + * + * A Generator in ATen gives users the ability to read, write and modify a PRNG + * engine. For instance, it does so by letting users seed a PRNG engine, fork + * the state of the engine, etc. + * + * By default, there is one generator per device, and a device's generator is + * lazily created. A user can use the torch.Generator() api to create their own + * generator. Currently torch.Generator() can only create a CPUGeneratorImpl. + */ + +/** + * Note [Acquire lock when using random generators] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * Generator and its derived classes are NOT thread-safe. Please note that most + * of the places where we have inserted locking for generators are historically + * based, and we haven't actually checked that everything is truly thread safe + * (and it probably isn't). Please use the public mutex_ when using any methods + * from these classes, except for the read-only methods. You can learn about the + * usage by looking into the unittests (aten/src/ATen/cpu_generator_test.cpp) + * and other places where we have used lock_guard. + * + * TODO: Look into changing the threading semantics of Generators in ATen (e.g., + * making them non-thread safe and instead making the generator state + * splittable, to accommodate forks into other threads). + */ + +namespace c10 { + +// The default seed is selected to be a large number +// with good distribution of 0s and 1s in bit representation +constexpr uint64_t default_rng_seed_val = 67280421310721; + +struct C10_API GeneratorImpl : public c10::intrusive_ptr_target { + // Constructors + GeneratorImpl(Device device_in, DispatchKeySet key_set); + + // Delete all copy and move assignment in favor of clone() + // method + GeneratorImpl(const GeneratorImpl& other) = delete; + GeneratorImpl(GeneratorImpl&& other) = delete; + GeneratorImpl& operator=(const GeneratorImpl& other) = delete; + + ~GeneratorImpl() override = default; + c10::intrusive_ptr clone() const; + + // Common methods for all generators + virtual void set_current_seed(uint64_t seed) = 0; + virtual void set_offset(uint64_t offset) = 0; + virtual uint64_t get_offset() const = 0; + virtual uint64_t current_seed() const = 0; + virtual uint64_t seed() = 0; + virtual void set_state(const c10::TensorImpl& new_state) = 0; + virtual c10::intrusive_ptr get_state() const = 0; + Device device() const; + + // See Note [Acquire lock when using random generators] + std::mutex mutex_; + + DispatchKeySet key_set() const { + return key_set_; + } + + inline void set_pyobj(PyObject* pyobj) noexcept { + pyobj_ = pyobj; + } + + inline PyObject* pyobj() const noexcept { + return pyobj_; + } + + protected: + Device device_; + DispatchKeySet key_set_; + PyObject* pyobj_ = nullptr; + + virtual GeneratorImpl* clone_impl() const = 0; +}; + +namespace detail { + +C10_API uint64_t getNonDeterministicRandom(bool is_cuda = false); + +} // namespace detail + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/GradMode.h b/MLPY/Lib/site-packages/torch/include/c10/core/GradMode.h new file mode 100644 index 0000000000000000000000000000000000000000..d49c9fdacd38d433b967e6ee0107f83e83e3feee --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/GradMode.h @@ -0,0 +1,44 @@ +#pragma once + +#include +#include + +namespace c10 { + +struct C10_API GradMode { + static bool is_enabled(); + 
static void set_enabled(bool enabled); +}; + +// A RAII, thread local (!) guard that enables or disables grad mode upon +// construction, and sets it back to the original value upon destruction. +struct C10_API AutoGradMode { + AutoGradMode(bool enabled) : prev_mode(GradMode::is_enabled()) { + GradMode::set_enabled(enabled); + } + ~AutoGradMode() { + GradMode::set_enabled(prev_mode); + } + bool prev_mode; +}; + +// A RAII, thread local (!) guard that stops future operations from building +// gradients. +struct C10_API NoGradGuard : public AutoGradMode { + NoGradGuard() : AutoGradMode(/*enabled=*/false) {} +}; + +// A RAII, thread local (!) guard that enables or disables forward grad mode +// upon construction, and sets it back to the original value upon destruction. +struct C10_API AutoFwGradMode { + AutoFwGradMode(bool enabled) + : prev_mode(AutogradState::get_tls_state().get_fw_grad_mode()) { + AutogradState::get_tls_state().set_fw_grad_mode(enabled); + } + ~AutoFwGradMode() { + AutogradState::get_tls_state().set_fw_grad_mode(prev_mode); + } + bool prev_mode; +}; + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/InferenceMode.h b/MLPY/Lib/site-packages/torch/include/c10/core/InferenceMode.h new file mode 100644 index 0000000000000000000000000000000000000000..ecbbdbded7ecd7fc6adf48a67618f075dddbd310 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/InferenceMode.h @@ -0,0 +1,86 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace c10 { + +// A RAII, thread local (!) guard that enables or disables inference mode upon +// construction, and sets it back to the original value upon destruction. +struct C10_API InferenceMode { + // Note [Expected TLS state in InferenceMode]: + // InferenceMode: ADInplaceOrView not in + // raw_local_dispatch_key_set.included(), + // Autograd in raw_local_dispatch_key_set.excluded() + // GradMode is disabled. + // NormalMode: ADInplaceOrView in raw_local_dispatch_key_set.included(), + // Autograd not in raw_local_dispatch_key_set.excluded() + // GradMode is enabled by default unless toggled manually + // through other APIs, e.g. NoGradGuard. + // + // Invariant: + // - ADInplaceOrView is never in the excluded set + // - Autograd is never in the included set + // - Setting InferenceMode will set GradMode accordingly, but not vice versa. + // + // 1. Why do we put ADInplaceOrView in included set outside InferenceMode? + // + // Inplace update to inference tensor outside InferenceMode is not + // allowed. See Note [Inplace update inference tensor] for more details. + // Without going through ADInplaceOrView kernel, we cannot throw error + // for `inference_tensor.add_(1)` case. + // + // 2. Why not put ADInplaceOrView in the excluded set inside InferenceMode? + // + // For example: + // torch::Tensor a = torch::ones({1, 2, 3}).set_requires_grad(true); + // torch::Tensor k = a + 2; + // { + // c10::InferenceMode guard(true); + // k.add_(2); + // } + // `k.add_(2)` still need to go through ADInplaceOrView kernel so that it's + // prepared for future autograd. + // + // 3. Why does setting InferenceMode also set GradMode? + // + // This is required since InferenceMode is a faster and more restrictive + // version of NoGradGuard. All runtime checks using GradMode::is_enabled() + // are applicable to InferenceMode as well, e.g. + // `tensorTypeInCurrentExecutionContext` in interpreter.cpp. 
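+  //
+  // Editorial usage sketch (not part of the upstream header): the guard is
+  // scoped, and nesting a guard constructed with `false` restores normal
+  // mode inside an outer inference region:
+  //
+  //   {
+  //     c10::InferenceMode guard;            // inference mode on, grad off
+  //     // tensors created here are inference tensors
+  //     {
+  //       c10::InferenceMode normal(false);  // back to normal mode
+  //     }
+  //   }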
+ InferenceMode(bool enabled = true) + : prev_mode(AutogradState::get_tls_state()), + prev_keyset(c10::impl::tls_local_dispatch_key_set()) { + // Enabling inference mode means disabling grad modes + // And disabling inference mode means enabling grad modes + AutogradState::set_tls_state(AutogradState( + /* grad_mode */ !enabled, + /* inference_mode */ enabled, + /* fw_grad_mode */ !enabled, + /* multithreading_enabled*/ !enabled)); + DispatchKeySet included = enabled + ? prev_keyset.included_.remove(c10::DispatchKey::ADInplaceOrView) + : prev_keyset.included_.add(c10::DispatchKey::ADInplaceOrView); + DispatchKeySet excluded = enabled + ? (prev_keyset.excluded_ | c10::autograd_dispatch_keyset) + : (prev_keyset.excluded_ - c10::autograd_dispatch_keyset); + c10::impl::PODLocalDispatchKeySet cur_keyset{}; + cur_keyset.set_included(included); + cur_keyset.set_excluded(excluded); + c10::impl::_force_tls_local_dispatch_key_set(cur_keyset); + } + + ~InferenceMode() { + AutogradState::set_tls_state(prev_mode); + c10::impl::_force_tls_local_dispatch_key_set(prev_keyset); + } + static bool is_enabled(); + + private: + AutogradState prev_mode; + c10::impl::LocalDispatchKeySet prev_keyset; +}; +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/Layout.h b/MLPY/Lib/site-packages/torch/include/c10/core/Layout.h new file mode 100644 index 0000000000000000000000000000000000000000..3ec87697d18b902a5e3ec112911a5f4c615de678 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/Layout.h @@ -0,0 +1,78 @@ +#pragma once + +#include +#include + +#include +#include + +namespace c10 { +enum class Layout : int8_t { + Strided, + Sparse, + SparseCsr, + Mkldnn, + SparseCsc, + SparseBsr, + SparseBsc, + Jagged, + NumOptions +}; + +constexpr auto kStrided = Layout::Strided; +constexpr auto kSparse = Layout::Sparse; +constexpr auto kSparseCsr = Layout::SparseCsr; +constexpr auto kMkldnn = Layout::Mkldnn; +constexpr auto kSparseCsc = Layout::SparseCsc; +constexpr auto kSparseBsr = Layout::SparseBsr; +constexpr auto kSparseBsc = Layout::SparseBsc; +constexpr auto kJagged = Layout::Jagged; + +inline Layout layout_from_backend(Backend backend) { + switch (backend) { + case Backend::SparseCPU: + case Backend::SparseCUDA: + case Backend::SparseHIP: + case Backend::SparseVE: + case Backend::SparseXPU: + case Backend::SparsePrivateUse1: + return Layout::Sparse; + case Backend::MkldnnCPU: + return Layout::Mkldnn; + case Backend::SparseCsrCPU: + case Backend::SparseCsrCUDA: + case Backend::SparseCsrHIP: + case Backend::SparseCsrVE: + case Backend::SparseCsrXPU: + TORCH_CHECK( + false, + "Cannot map Backend SparseCsr(CPU|CUDA|HIP|VE|XPU) to a unique layout."); + default: + return Layout::Strided; + } +} + +inline std::ostream& operator<<(std::ostream& stream, at::Layout layout) { + switch (layout) { + case at::kStrided: + return stream << "Strided"; + case at::kSparse: + return stream << "Sparse"; + case at::kSparseCsr: + return stream << "SparseCsr"; + case at::kSparseCsc: + return stream << "SparseCsc"; + case at::kSparseBsr: + return stream << "SparseBsr"; + case at::kSparseBsc: + return stream << "SparseBsc"; + case at::kMkldnn: + return stream << "Mkldnn"; + case at::kJagged: + return stream << "Jagged"; + default: + TORCH_CHECK(false, "Unknown layout"); + } +} + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/MemoryFormat.h b/MLPY/Lib/site-packages/torch/include/c10/core/MemoryFormat.h new file mode 100644 index 
0000000000000000000000000000000000000000..55d25bd24d35b5cc50afce435042bc85d4f7fb76 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/MemoryFormat.h @@ -0,0 +1,290 @@ +#pragma once + +#include +#include + +#include +#include +#include + +// Memory format is not the property of a Tensor. It is the way to tell an +// operator how the result should be organized in memory and nothing more. That +// means memory format should never be used as return value for any tensor state +// interrogation functions (internally and externally). +// +// Possible options are: +// Preserve: +// If any of the input tensors is in channels_last format, operator output +// should be in channels_last format +// +// Contiguous: +// Regardless of input tensors format, the output should be contiguous +// Tensor. +// +// ChannelsLast: +// Regardless of input tensors format, the output should be in channels_last +// format. + +namespace c10 { +enum class MemoryFormat : int8_t { + Contiguous, + Preserve, + ChannelsLast, + ChannelsLast3d, + NumOptions +}; + +// If you are seeing this, it means that this call site was not checked if +// the memory format could be preserved, and it was switched to old default +// behaviour of contiguous +#define LEGACY_CONTIGUOUS_MEMORY_FORMAT c10::get_contiguous_memory_format() + +inline MemoryFormat get_contiguous_memory_format() { + return MemoryFormat::Contiguous; +} + +inline std::ostream& operator<<( + std::ostream& stream, + at::MemoryFormat memory_format) { + switch (memory_format) { + case MemoryFormat::Preserve: + return stream << "Preserve"; + case MemoryFormat::Contiguous: + return stream << "Contiguous"; + case MemoryFormat::ChannelsLast: + return stream << "ChannelsLast"; + case MemoryFormat::ChannelsLast3d: + return stream << "ChannelsLast3d"; + default: + TORCH_CHECK(false, "Unknown memory format ", memory_format); + } +} + +// Note: Hardcoded the channel last stride indices here to get better +// performance +template +inline std::vector get_channels_last_strides_2d(ArrayRef sizes) { + std::vector strides(sizes.size()); + switch (sizes.size()) { + case 4: + strides[1] = 1; + strides[3] = sizes[1]; + strides[2] = strides[3] * sizes[3]; + strides[0] = strides[2] * sizes[2]; + return strides; + case 3: + strides[0] = 1; + strides[2] = sizes[0]; + strides[1] = strides[2] * sizes[2]; + return strides; + default: + TORCH_INTERNAL_ASSERT( + false, "ChannelsLast2d doesn't support size ", sizes.size()); + } +} + +inline std::vector get_channels_last_strides_2d(IntArrayRef sizes) { + return get_channels_last_strides_2d(sizes); +} + +template +std::vector get_channels_last_strides_3d(ArrayRef sizes) { + std::vector strides(sizes.size()); + switch (sizes.size()) { + case 5: + strides[1] = 1; + strides[4] = sizes[1]; + strides[3] = strides[4] * sizes[4]; + strides[2] = strides[3] * sizes[3]; + strides[0] = strides[2] * sizes[2]; + return strides; + case 4: + strides[0] = 1; + strides[3] = sizes[0]; + strides[2] = strides[3] * sizes[3]; + strides[1] = strides[2] * sizes[2]; + return strides; + default: + TORCH_INTERNAL_ASSERT( + false, "ChannelsLast3d doesn't support size ", sizes.size()); + } +} + +inline std::vector get_channels_last_strides_3d(IntArrayRef sizes) { + return get_channels_last_strides_3d(sizes); +} + +// NOTE: +// Below are Helper functions for is_channels_last_strides_xd. +// 1. 
Please do not combine these helper functions, each helper function handles +// exactly one case of sizes + memory_format, by doing this, the strides indices +// will be a constant array and we can access it using constant index number, +// the compiler will fully unroll the loop on strides indices to gain a better +// performance. +// 2. No error check in helper function, caller ensures the correctness of the +// input +// 3. All helper functions have similar comments, only 1st helper function is +// commented here. +template +inline bool is_channels_last_strides_2d_s4( + const ArrayRef sizes, + const ArrayRef strides) { + T min = 0; + // special case for trivial C dimension. default to NCHW + if (strides[1] == 0) { + return false; + } + // loop strides indices + for (auto& d : {1, 3, 2, 0}) { + if (sizes[d] == 0) { + return false; + } + if (strides[d] < min) { + return false; + } + // Fallback to NCHW as default layout for ambiguous cases + // This is the flaw of implicit memory_format from strides. + // N111 tensor with identical strides for size 1 dimension; + // Two cases could lead us here: + // a. N111 contiguous Tensor ([N,1,1,1]@[1,1,1,1]) + // b. N11W contiguous Tensor sliced on the W-dimension. + // ([N,1,1,1]@[W,W,W,W]) + if (d == 0 && min == strides[1]) { + return false; + } + // This is necessary to: + // 1. distinguish the memory_format of N1H1; + // [H, 1, 1, 1] channels_last stride + // [H, H, 1, 1] contiguous stride + // 2. permutation of 1C1W: + // [1, C, 1, H]@[HC, H, H, 1] transpose(1, 3) + // [1, H, 1, C]@[HC, 1, H, H] shouldn't be identified as channels_last + min = strides[d]; + if (sizes[d] > 1) { + min *= sizes[d]; + } + } + return true; +} + +template +inline bool is_channels_last_strides_3d_s5( + const ArrayRef sizes, + const ArrayRef strides) { + T min = 0; + if (strides[1] == 0) { + return false; + } + for (auto& d : {1, 4, 3, 2, 0}) { + if (sizes[d] == 0) { + return false; + } + if (strides[d] < min) { + return false; + } + if (d == 0 && min == strides[1]) { + return false; + } + min = strides[d]; + if (sizes[d] > 1) { + min *= sizes[d]; + } + } + return true; +} + +// Note [Ambiguous is_channels_last_strides_xd] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// The flaw of carrying memory_format implicitly through strides is very hard +// to WAR properly. issue #24090 +// Without the history of permutation, we can't infer the memory_format of a +// tensor from the snapshot of its size & stride +// e.g. +// +// 1. We can NOT specify the memory_format of N111 tensor through strides in a +// meaningful way; +// +// 2. Two path that ended up with identical size/stride +// N11W contiguous tensor sliced at w-dimension becomes [N,1,1,1]@[W,W,W,W] +// NC11 channels_last tensor sliced at c-dimension becomes [N,1,1,1]@[C,C,C,C] +// So if we see a tensor [N,1,1,1]@[X,X,X,X], there's no way for us to infer +// the memory_format of the original tensor. +// +// Due to the limitations, our temporary WAR `is_channels_last_strides` does the +// best effort to infer whether the original memory_format of a tensor is +// at::MemoryFormat::ChannelsLast. The two objectives of this function (ordered +// by their importance): +// 1. Ensure that normal shape manipulation does not accidentally change the +// MemoryFormat of an existing tensor. +// 2. Allows user to mark MemoryFormat::ChannelsLast to tensors; +// +// The function does so via checking strides of the tensor, including strides of +// size-1 dimensions. 
Although conventionally PyTorch implies no restriction on +// trivial stride (stride for size-1 dimension). +// +// Note that this approach is a compromise. We did not solve the problem +// completely. Many cases we will not be able to infer the correct memory +// format. +// The implementation of `is_channels_last_strides` is to serve the objectives: +// MemoryFormat::ChannelsLast has to be explicitly opted-in (no accidental +// conversion); Best effort to maintain the ChannelsLast flag. +// +// Due to the fact that this is not a bulletproof solution, through testing +// (aten/src/ATen/test/memory_format_test.cpp) +// a. we ensure that the common tasks are supported; +// a. we identify corner cases where the implementation compromises on. +// +// By the time accumulated permutation is enabled to replace implicit +// memory_format through strides, we should be updating our tests and fix the +// issues in our tests. +// +// We use Channels Last 2d as an example above. +// This is a general problem for all the is_channels_last_strides_xd +// implementation. Please check the helper functions +// (is_channels_last_strides_*d_s*) for more details. + +template +inline bool is_channels_last_strides_2d( + const ArrayRef sizes, + const ArrayRef strides) { + switch (sizes.size()) { + case 4: + return is_channels_last_strides_2d_s4(sizes, strides); + // NOLINTNEXTLINE(bugprone-branch-clone) + case 3: + // TODO dim == 3 case will be enabled once it is fully tested + return false; + default: + return false; + } +} + +template +inline bool is_channels_last_strides_3d( + const ArrayRef sizes, + const ArrayRef strides) { + switch (sizes.size()) { + case 5: + return is_channels_last_strides_3d_s5(sizes, strides); + // NOLINTNEXTLINE(bugprone-branch-clone) + case 4: + // TODO dim == 4 case will be enabled once it is fully tested + return false; + default: + return false; + } +} + +inline bool is_channels_last_strides_2d( + const IntArrayRef sizes, + const IntArrayRef strides) { + return is_channels_last_strides_2d(sizes, strides); +} + +inline bool is_channels_last_strides_3d( + const IntArrayRef sizes, + const IntArrayRef strides) { + return is_channels_last_strides_3d(sizes, strides); +} + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/OptionalRef.h b/MLPY/Lib/site-packages/torch/include/c10/core/OptionalRef.h new file mode 100644 index 0000000000000000000000000000000000000000..84c843ec68164c1dcf01990a0319b25f8cd674ec --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/OptionalRef.h @@ -0,0 +1,31 @@ +#pragma once + +namespace c10 { + +template +class OptionalRef { + public: + OptionalRef() : data_(nullptr) {} + OptionalRef(const T* data) : data_(data) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(data_); + } + OptionalRef(const T& data) : data_(&data) {} + + bool has_value() const { + return data_ != nullptr; + } + + const T& get() const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(data_); + return *data_; + } + + operator bool() const { + return has_value(); + } + + private: + const T* data_; +}; + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/PyHandleCache.h b/MLPY/Lib/site-packages/torch/include/c10/core/PyHandleCache.h new file mode 100644 index 0000000000000000000000000000000000000000..37245dbed26c4afc2700089a6cf06a34d2a12d8c --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/PyHandleCache.h @@ -0,0 +1,76 @@ +#pragma once + +#include +#include +#include +#include + +#include + +namespace c10 { + +// A PyHandleCache 
represents a cached pointer from a C++ object to +// a Python object that represents that object analogously in Python. +// Upon a cache hit, the relevant object can be retrieved after a test +// and then a memory load. Two conditions must hold to be able to use this +// class: +// +// - This must truly be a cache; e.g., the caller must be able to produce +// the object some other way if the cache hit misses. +// +// - This must truly be a handle; e.g., the Python object referenced by +// this class must have static lifetime. This means we don't have to +// maintain strong ownership or deallocate the object when the C++ object +// dies. Static lifetime is a good idea in conjunction with the cache, +// since if you are producing a fresh object on miss you won't be +// maintaining object identity. If you need bidirectional ownership, +// you will want to factor out the pattern in TensorImpl with +// resurrection. +// +// This cache is expected to not improve perf under torchdeploy, as one +// interpreter will fill up the cache, and all the interpreters will be +// unable to use the slot. A potential improvement is to have multiple +// slots (one per interpreter), which will work in deployment scenarios +// where there a stable, fixed number of interpreters. You can also store +// the relevant state in the Python library, rather than in the non-Python +// library (although in many cases, this is not convenient, as there may +// not be a way to conveniently index based on the object.) +class PyHandleCache { + public: + PyHandleCache() : pyinterpreter_(nullptr) {} + + // Attempt to fetch the pointer from the cache, if the PyInterpreter + // matches. If it doesn't exist, or the cache entry is not valid, + // use slow_accessor to get the real pointer value and return that + // (possibly writing it to the cache, if the cache entry is + // available.) + template + PyObject* ptr_or(impl::PyInterpreter* self_interpreter, F slow_accessor) + const { + // Note [Memory ordering on Python interpreter tag] + impl::PyInterpreter* interpreter = + pyinterpreter_.load(std::memory_order_acquire); + if (C10_LIKELY(interpreter == self_interpreter)) { + return data_; + } else if (interpreter == nullptr) { + auto* r = slow_accessor(); + impl::PyInterpreter* expected = nullptr; + // attempt to claim this cache entry with the specified interpreter tag + if (pyinterpreter_.compare_exchange_strong( + expected, self_interpreter, std::memory_order_acq_rel)) { + data_ = r; + } + // This shouldn't be possible, as you should be GIL protected + TORCH_INTERNAL_ASSERT(expected != self_interpreter); + return r; + } else { + return slow_accessor(); + } + } + + private: + mutable std::atomic pyinterpreter_; + mutable PyObject* data_{nullptr}; +}; + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/QEngine.h b/MLPY/Lib/site-packages/torch/include/c10/core/QEngine.h new file mode 100644 index 0000000000000000000000000000000000000000..b8a0ac9639303e3ce466db30c53e02739ef68224 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/QEngine.h @@ -0,0 +1,46 @@ +#pragma once + +#include +#include +#include + +namespace c10 { + +/** + * QEngine is an enum that is used to select the engine to run quantized ops. 
+ * Keep this enum in sync with get_qengine_id() in + * torch/backends/quantized/__init__.py + */ +enum class QEngine : uint8_t { + NoQEngine = 0, + FBGEMM = 1, + QNNPACK = 2, + ONEDNN = 3, + X86 = 4, +}; + +constexpr auto kNoQEngine = QEngine::NoQEngine; +constexpr auto kFBGEMM = QEngine::FBGEMM; +constexpr auto kQNNPACK = QEngine::QNNPACK; +constexpr auto kONEDNN = QEngine::ONEDNN; +constexpr auto kX86 = QEngine::X86; + +inline std::string toString(QEngine qengine) { + switch (qengine) { + case kNoQEngine: + return "NoQEngine"; + case kFBGEMM: + return "FBGEMM"; + case kQNNPACK: + return "QNNPACK"; + case kONEDNN: + return "ONEDNN"; + case kX86: + return "X86"; + default: + TORCH_CHECK( + false, "Unrecognized Quantized Engine: ", static_cast(qengine)); + } +} + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/QScheme.h b/MLPY/Lib/site-packages/torch/include/c10/core/QScheme.h new file mode 100644 index 0000000000000000000000000000000000000000..158839257eabe869fdb2c04e7754a938fe2eacd8 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/QScheme.h @@ -0,0 +1,50 @@ +#pragma once + +#include +#include +#include + +namespace c10 { + +/** + * QScheme is an enum that specifies the type of quantization. This has a one + * to one correspondence with Quantizer + * Please refer to ATen/quantized/Quantizer.h to see the Quantizers classes. + * Keep this file in sync with torch/nn/_qscheme.py + */ +enum class QScheme : uint8_t { + PER_TENSOR_AFFINE = 0, + PER_CHANNEL_AFFINE = 1, + PER_TENSOR_SYMMETRIC = 2, + PER_CHANNEL_SYMMETRIC = 3, + PER_CHANNEL_AFFINE_FLOAT_QPARAMS = 4, + COMPILE_TIME_NUM_QSCHEMES = 5, +}; + +constexpr auto kPerTensorAffine = QScheme::PER_TENSOR_AFFINE; +constexpr auto kPerChannelAffine = QScheme::PER_CHANNEL_AFFINE; +constexpr auto kPerTensorSymmetric = QScheme::PER_TENSOR_SYMMETRIC; +constexpr auto kPerChannelSymmetric = QScheme::PER_CHANNEL_SYMMETRIC; +constexpr auto kPerChannelAffineFloatQParams = + QScheme::PER_CHANNEL_AFFINE_FLOAT_QPARAMS; +constexpr int COMPILE_TIME_NUM_QSCHEMES = + static_cast(QScheme::COMPILE_TIME_NUM_QSCHEMES); + +inline std::string toString(QScheme qscheme) { + switch (qscheme) { + case kPerTensorAffine: + return "per_tensor_affine"; + case kPerChannelAffine: + return "per_channel_affine"; + case kPerTensorSymmetric: + return "per_tensor_symmetric"; + case kPerChannelSymmetric: + return "per_channel_symmetric"; + case kPerChannelAffineFloatQParams: + return "per_channel_affine_float_qparams"; + default: + TORCH_CHECK(false, "Unrecognized qscheme: ", static_cast(qscheme)); + } +} + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/RefcountedDeleter.h b/MLPY/Lib/site-packages/torch/include/c10/core/RefcountedDeleter.h new file mode 100644 index 0000000000000000000000000000000000000000..2e7125f5858b8cf6b331f6d32cdad25dfa2af8f5 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/RefcountedDeleter.h @@ -0,0 +1,52 @@ +#pragma once + +#include +#include +#include + +#include +#include + +namespace c10 { + +// A RefcountedDeleterContext object is used as the `ctx` argument for DataPtr +// to implement a shared DataPtr. Normally, a DataPtr is unique, but we use +// this custom context and the `refcounted_deleter` function below to make the +// DataPtr act like a non-unique DataPtr. This context object holds onto an +// inner context and deleter function which handle the actual deletion of the +// data when the refcount reaches 0. 
+// +// This shared DataPtr feature is only used when storages are shared between +// multiple Python interpreters in MultiPy. Before storages had PyObject +// preservation, interpreters could just share the same StorageImpl instance. +// But now a StorageImpl can only be associated with one interpreter in order +// to properly manage a zombie PyObject. So we share storages across Python +// interpreters by creating a different StorageImpl instance for each one, but +// they all point to the same data. +struct C10_API RefcountedDeleterContext { + RefcountedDeleterContext(void* other_ctx, c10::DeleterFnPtr other_deleter) + : other_ctx(other_ctx, other_deleter), refcount(1) {} + + std::unique_ptr other_ctx; + std::atomic_int refcount; +}; + +// `refcounted_deleter` is used as the `ctx_deleter` for DataPtr to implement +// a shared DataPtr. +// +// Warning: This should only be called on a pointer to +// a RefcountedDeleterContext that was allocated on the heap with `new`, +// because when the refcount reaches 0, the context is deleted with `delete` +C10_API void refcounted_deleter(void* ctx_); + +// If the storage's DataPtr does not use `refcounted_deleter`, replace it with +// a DataPtr that does, so it can be shared between multiple StorageImpls +C10_API void maybeApplyRefcountedDeleter(const c10::Storage& storage); + +// Create a new StorageImpl that points to the same data. If the original +// StorageImpl's DataPtr does not use `refcounted_deleter`, it will be replaced +// with one that does +C10_API c10::Storage newStorageImplFromRefcountedDataPtr( + const c10::Storage& storage); + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/SafePyObject.h b/MLPY/Lib/site-packages/torch/include/c10/core/SafePyObject.h new file mode 100644 index 0000000000000000000000000000000000000000..1f86dbd83b269c08a8396717b5c3f18fa3967646 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/SafePyObject.h @@ -0,0 +1,83 @@ +#pragma once + +#include +#include +#include +#include + +namespace c10 { + +// This is an safe owning holder for a PyObject, akin to pybind11's +// py::object, with two major differences: +// +// - It is in c10/core; i.e., you can use this type in contexts where +// you do not have a libpython dependency +// +// - It is multi-interpreter safe (ala torchdeploy); when you fetch +// the underlying PyObject* you are required to specify what the current +// interpreter context is and we will check that you match it. +// +// It is INVALID to store a reference to a Tensor object in this way; +// you should just use TensorImpl directly in that case! +struct C10_API SafePyObject { + // Steals a reference to data + SafePyObject(PyObject* data, c10::impl::PyInterpreter* pyinterpreter) + : data_(data), pyinterpreter_(pyinterpreter) {} + SafePyObject(SafePyObject&& other) noexcept + : data_(std::exchange(other.data_, nullptr)), + pyinterpreter_(other.pyinterpreter_) {} + + // In principle this could be copyable if we add an incref to PyInterpreter + // but for now it's easier to just disallow it. 
+ SafePyObject(SafePyObject const&) = delete; + SafePyObject& operator=(SafePyObject const&) = delete; + + ~SafePyObject() { + if (data_ != nullptr) { + (*pyinterpreter_)->decref(data_, /*has_pyobj_slot*/ false); + } + } + + c10::impl::PyInterpreter& pyinterpreter() const { + return *pyinterpreter_; + } + PyObject* ptr(const c10::impl::PyInterpreter*) const; + + // stop tracking the current object, and return it + PyObject* release() { + auto rv = data_; + data_ = nullptr; + return rv; + } + + private: + PyObject* data_; + c10::impl::PyInterpreter* pyinterpreter_; +}; + +// Like SafePyObject, but non-owning. Good for references to global PyObjects +// that will be leaked on interpreter exit. You get a copy constructor/assign +// this way. +struct C10_API SafePyHandle { + SafePyHandle() : data_(nullptr), pyinterpreter_(nullptr) {} + SafePyHandle(PyObject* data, c10::impl::PyInterpreter* pyinterpreter) + : data_(data), pyinterpreter_(pyinterpreter) {} + + c10::impl::PyInterpreter& pyinterpreter() const { + return *pyinterpreter_; + } + PyObject* ptr(const c10::impl::PyInterpreter*) const; + void reset() { + data_ = nullptr; + pyinterpreter_ = nullptr; + } + operator bool() { + return data_; + } + + private: + PyObject* data_; + c10::impl::PyInterpreter* pyinterpreter_; +}; + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/Scalar.h b/MLPY/Lib/site-packages/torch/include/c10/core/Scalar.h new file mode 100644 index 0000000000000000000000000000000000000000..dc63c0738cef12123e66b23e7972217062af2324 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/Scalar.h @@ -0,0 +1,461 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +/** + * Scalar represents a 0-dimensional tensor which contains a single element. + * Unlike a tensor, numeric literals (in C++) are implicitly convertible to + * Scalar (which is why, for example, we provide both add(Tensor) and + * add(Scalar) overloads for many operations). It may also be used in + * circumstances where you statically know a tensor is 0-dim and single size, + * but don't know its type. 
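 *
 * An illustrative usage sketch (assuming only the constructors and accessors
 * declared in this class; not a normative example):
 *
 *   c10::Scalar a = 2;            // implicit ctor from int; type() == Long
 *   c10::Scalar b = 3.5;          // implicit ctor from double; type() == Double
 *   bool fp = b.isFloatingPoint();    // true
 *   int64_t ai = a.toLong();          // 2
 *   double bd = b.to<double>();       // 3.5, via the DEFINE_TO specializations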
+ */ +class C10_API Scalar { + public: + Scalar() : Scalar(int64_t(0)) {} + + void destroy() { + if (Tag::HAS_si == tag || Tag::HAS_sd == tag || Tag::HAS_sb == tag) { + raw::intrusive_ptr::decref(v.p); + v.p = nullptr; + } + } + + ~Scalar() { + destroy(); + } + +#define DEFINE_IMPLICIT_CTOR(type, name) \ + Scalar(type vv) : Scalar(vv, true) {} + + AT_FORALL_SCALAR_TYPES_AND7( + Half, + BFloat16, + Float8_e5m2, + Float8_e4m3fn, + Float8_e5m2fnuz, + Float8_e4m3fnuz, + ComplexHalf, + DEFINE_IMPLICIT_CTOR) + AT_FORALL_COMPLEX_TYPES(DEFINE_IMPLICIT_CTOR) + + // Helper constructors to allow Scalar creation from long and long long types + // As std::is_same_v is false(except Android), one needs to + // provide a constructor from either long or long long in addition to one from + // int64_t +#if defined(__APPLE__) || defined(__MACOSX) + static_assert( + std::is_same_v, + "int64_t is the same as long long on MacOS"); + Scalar(long vv) : Scalar(vv, true) {} +#endif +#if defined(__linux__) && !defined(__ANDROID__) + static_assert( + std::is_same_v, + "int64_t is the same as long on Linux"); + Scalar(long long vv) : Scalar(vv, true) {} +#endif + + Scalar(uint16_t vv) : Scalar(vv, true) {} + Scalar(uint32_t vv) : Scalar(vv, true) {} + Scalar(uint64_t vv) { + if (vv > static_cast(INT64_MAX)) { + tag = Tag::HAS_u; + v.u = vv; + } else { + tag = Tag::HAS_i; + // NB: no need to use convert, we've already tested convertibility + v.i = static_cast(vv); + } + } + +#undef DEFINE_IMPLICIT_CTOR + + // Value* is both implicitly convertible to SymbolicVariable and bool which + // causes ambiguity error. Specialized constructor for bool resolves this + // problem. + template < + typename T, + typename std::enable_if_t, bool>* = nullptr> + Scalar(T vv) : tag(Tag::HAS_b) { + v.i = convert(vv); + } + + template < + typename T, + typename std::enable_if_t, bool>* = + nullptr> + Scalar(T vv) : tag(Tag::HAS_sb) { + v.i = convert(vv); + } + +#define DEFINE_ACCESSOR(type, name) \ + type to##name() const { \ + if (Tag::HAS_d == tag) { \ + return checked_convert(v.d, #type); \ + } else if (Tag::HAS_z == tag) { \ + return checked_convert>(v.z, #type); \ + } \ + if (Tag::HAS_b == tag) { \ + return checked_convert(v.i, #type); \ + } else if (Tag::HAS_i == tag) { \ + return checked_convert(v.i, #type); \ + } else if (Tag::HAS_u == tag) { \ + return checked_convert(v.u, #type); \ + } else if (Tag::HAS_si == tag) { \ + return checked_convert( \ + toSymInt().guard_int(__FILE__, __LINE__), #type); \ + } else if (Tag::HAS_sd == tag) { \ + return checked_convert( \ + toSymFloat().guard_float(__FILE__, __LINE__), #type); \ + } else if (Tag::HAS_sb == tag) { \ + return checked_convert( \ + toSymBool().guard_bool(__FILE__, __LINE__), #type); \ + } \ + TORCH_CHECK(false) \ + } + + // TODO: Support ComplexHalf accessor + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_ACCESSOR) + DEFINE_ACCESSOR(uint16_t, UInt16) + DEFINE_ACCESSOR(uint32_t, UInt32) + DEFINE_ACCESSOR(uint64_t, UInt64) + +#undef DEFINE_ACCESSOR + + SymInt toSymInt() const { + if (Tag::HAS_si == tag) { + return c10::SymInt(intrusive_ptr::reclaim_copy( + static_cast(v.p))); + } else { + return toLong(); + } + } + + SymFloat toSymFloat() const { + if (Tag::HAS_sd == tag) { + return c10::SymFloat(intrusive_ptr::reclaim_copy( + static_cast(v.p))); + } else { + return toDouble(); + } + } + + SymBool toSymBool() const { + if (Tag::HAS_sb == tag) { + return c10::SymBool(intrusive_ptr::reclaim_copy( + static_cast(v.p))); + } else { + return toBool(); + } + } + + // also support 
scalar.to(); + // Deleted for unsupported types, but specialized below for supported types + template + T to() const = delete; + + // audit uses of data_ptr + const void* data_ptr() const { + TORCH_INTERNAL_ASSERT(!isSymbolic()); + return static_cast(&v); + } + + bool isFloatingPoint() const { + return Tag::HAS_d == tag || Tag::HAS_sd == tag; + } + + C10_DEPRECATED_MESSAGE( + "isIntegral is deprecated. Please use the overload with 'includeBool' parameter instead.") + bool isIntegral() const { + return Tag::HAS_i == tag || Tag::HAS_si == tag || Tag::HAS_u == tag; + } + bool isIntegral(bool includeBool) const { + return Tag::HAS_i == tag || Tag::HAS_si == tag || Tag::HAS_u == tag || + (includeBool && isBoolean()); + } + + bool isComplex() const { + return Tag::HAS_z == tag; + } + bool isBoolean() const { + return Tag::HAS_b == tag || Tag::HAS_sb == tag; + } + + // you probably don't actually want these; they're mostly for testing + bool isSymInt() const { + return Tag::HAS_si == tag; + } + bool isSymFloat() const { + return Tag::HAS_sd == tag; + } + bool isSymBool() const { + return Tag::HAS_sb == tag; + } + + bool isSymbolic() const { + return Tag::HAS_si == tag || Tag::HAS_sd == tag || Tag::HAS_sb == tag; + } + + C10_ALWAYS_INLINE Scalar& operator=(Scalar&& other) noexcept { + if (&other == this) { + return *this; + } + + destroy(); + moveFrom(std::move(other)); + return *this; + } + + C10_ALWAYS_INLINE Scalar& operator=(const Scalar& other) { + if (&other == this) { + return *this; + } + + *this = Scalar(other); + return *this; + } + + Scalar operator-() const; + Scalar conj() const; + Scalar log() const; + + template < + typename T, + typename std::enable_if_t::value, int> = 0> + bool equal(T num) const { + if (isComplex()) { + TORCH_INTERNAL_ASSERT(!isSymbolic()); + auto val = v.z; + return (val.real() == num) && (val.imag() == T()); + } else if (isFloatingPoint()) { + TORCH_CHECK(!isSymbolic(), "NYI SymFloat equality"); + return v.d == num; + } else if (tag == Tag::HAS_i) { + if (overflows(v.i, /* strict_unsigned */ true)) { + return false; + } else { + return static_cast(v.i) == num; + } + } else if (tag == Tag::HAS_u) { + if (overflows(v.u, /* strict_unsigned */ true)) { + return false; + } else { + return static_cast(v.u) == num; + } + } else if (tag == Tag::HAS_si) { + TORCH_INTERNAL_ASSERT(false, "NYI SymInt equality"); + } else if (isBoolean()) { + // boolean scalar does not equal to a non boolean value + TORCH_INTERNAL_ASSERT(!isSymbolic()); + return false; + } else { + TORCH_INTERNAL_ASSERT(false); + } + } + + template < + typename T, + typename std::enable_if_t::value, int> = 0> + bool equal(T num) const { + if (isComplex()) { + TORCH_INTERNAL_ASSERT(!isSymbolic()); + return v.z == num; + } else if (isFloatingPoint()) { + TORCH_CHECK(!isSymbolic(), "NYI SymFloat equality"); + return (v.d == num.real()) && (num.imag() == T()); + } else if (tag == Tag::HAS_i) { + if (overflows(v.i, /* strict_unsigned */ true)) { + return false; + } else { + return static_cast(v.i) == num.real() && num.imag() == T(); + } + } else if (tag == Tag::HAS_u) { + if (overflows(v.u, /* strict_unsigned */ true)) { + return false; + } else { + return static_cast(v.u) == num.real() && num.imag() == T(); + } + } else if (tag == Tag::HAS_si) { + TORCH_INTERNAL_ASSERT(false, "NYI SymInt equality"); + } else if (isBoolean()) { + // boolean scalar does not equal to a non boolean value + TORCH_INTERNAL_ASSERT(!isSymbolic()); + return false; + } else { + TORCH_INTERNAL_ASSERT(false); + } + } + + bool equal(bool 
num) const { + if (isBoolean()) { + TORCH_INTERNAL_ASSERT(!isSymbolic()); + return static_cast(v.i) == num; + } else { + return false; + } + } + + ScalarType type() const { + if (isComplex()) { + return ScalarType::ComplexDouble; + } else if (isFloatingPoint()) { + return ScalarType::Double; + } else if (isIntegral(/*includeBool=*/false)) { + // Represent all integers as long, UNLESS it is unsigned and therefore + // unrepresentable as long + if (Tag::HAS_u == tag) { + return ScalarType::UInt64; + } + return ScalarType::Long; + } else if (isBoolean()) { + return ScalarType::Bool; + } else { + throw std::runtime_error("Unknown scalar type."); + } + } + + Scalar(Scalar&& rhs) noexcept : tag(rhs.tag) { + moveFrom(std::move(rhs)); + } + + Scalar(const Scalar& rhs) : tag(rhs.tag), v(rhs.v) { + if (isSymbolic()) { + c10::raw::intrusive_ptr::incref(v.p); + } + } + + Scalar(c10::SymInt si) { + if (auto m = si.maybe_as_int()) { + tag = Tag::HAS_i; + v.i = *m; + } else { + tag = Tag::HAS_si; + v.p = std::move(si).release(); + } + } + + Scalar(c10::SymFloat sd) { + if (sd.is_symbolic()) { + tag = Tag::HAS_sd; + v.p = std::move(sd).release(); + } else { + tag = Tag::HAS_d; + v.d = sd.as_float_unchecked(); + } + } + + Scalar(c10::SymBool sb) { + if (auto m = sb.maybe_as_bool()) { + tag = Tag::HAS_b; + v.i = *m; + } else { + tag = Tag::HAS_sb; + v.p = std::move(sb).release(); + } + } + + // We can't set v in the initializer list using the + // syntax v{ .member = ... } because it doesn't work on MSVC + private: + enum class Tag { HAS_d, HAS_i, HAS_u, HAS_z, HAS_b, HAS_sd, HAS_si, HAS_sb }; + + // Note [Meaning of HAS_u] + // ~~~~~~~~~~~~~~~~~~~~~~~ + // HAS_u is a bit special. On its face, it just means that we + // are holding an unsigned integer. However, we generally don't + // distinguish between different bit sizes in Scalar (e.g., we represent + // float as double), instead, it represents a mathematical notion + // of some quantity (integral versus floating point). So actually, + // HAS_u is used solely to represent unsigned integers that could + // not be represented as a signed integer. That means only uint64_t + // potentially can get this tag; smaller types like uint8_t fits into a + // regular int and so for BC reasons we keep as an int. 
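  // A small worked example of Note [Meaning of HAS_u] above (illustrative
  // only; it follows directly from the uint64_t constructor and type()):
  //
  //   c10::Scalar small(uint64_t(42));     // fits in int64_t -> Tag::HAS_i,
  //                                        //   type() == ScalarType::Long
  //   c10::Scalar big(uint64_t(1) << 63);  // > INT64_MAX     -> Tag::HAS_u,
  //                                        //   type() == ScalarType::UInt64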
+ + // NB: assumes that self has already been cleared + // NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved) + C10_ALWAYS_INLINE void moveFrom(Scalar&& rhs) noexcept { + v = rhs.v; + tag = rhs.tag; + if (rhs.tag == Tag::HAS_si || rhs.tag == Tag::HAS_sd || + rhs.tag == Tag::HAS_sb) { + // Move out of scalar + rhs.tag = Tag::HAS_i; + rhs.v.i = 0; + } + } + + Tag tag; + + union v_t { + double d{}; + int64_t i; + // See Note [Meaning of HAS_u] + uint64_t u; + c10::complex z; + c10::intrusive_ptr_target* p; + // NOLINTNEXTLINE(modernize-use-equals-default) + v_t() {} // default constructor + } v; + + template < + typename T, + typename std::enable_if_t< + std::is_integral_v && !std::is_same_v, + bool>* = nullptr> + Scalar(T vv, bool) : tag(Tag::HAS_i) { + v.i = convert(vv); + } + + template < + typename T, + typename std::enable_if_t< + !std::is_integral_v && !c10::is_complex::value, + bool>* = nullptr> + Scalar(T vv, bool) : tag(Tag::HAS_d) { + v.d = convert(vv); + } + + template < + typename T, + typename std::enable_if_t::value, bool>* = nullptr> + Scalar(T vv, bool) : tag(Tag::HAS_z) { + v.z = convert(vv); + } +}; + +using OptionalScalarRef = c10::OptionalRef; + +// define the scalar.to() specializations +#define DEFINE_TO(T, name) \ + template <> \ + inline T Scalar::to() const { \ + return to##name(); \ + } +AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_TO) +DEFINE_TO(uint16_t, UInt16) +DEFINE_TO(uint32_t, UInt32) +DEFINE_TO(uint64_t, UInt64) +#undef DEFINE_TO + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/ScalarType.h b/MLPY/Lib/site-packages/torch/include/c10/core/ScalarType.h new file mode 100644 index 0000000000000000000000000000000000000000..08f26ac2b30d429ed7a7bc72877531661c9ac11e --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/ScalarType.h @@ -0,0 +1,620 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace c10 { + +// dummy struct for uint1 to uint7, actual functionality +// of these dtypes will be implemented in python with Tensor subclass +template +struct dummy_uint1_7_t {}; + +// For the macros below: +// +// For users: If you want to macro some code for all non-QInt scalar types +// (i.e. types with complete information, you probably want one of the +// AT_FORALL_SCALAR_TYPES / AT_FORALL_SCALAR_TYPES_AND macros below, which are +// designed to behave similarly to the Dispatch macros with the same name. +// +// For adding a new dtype: In the beginning, we had an idea that there was a +// list of all scalar types, and you could use AT_FORALL_SCALAR_TYPES to +// iterate over them. But over the years we added weird types which couldn't +// be handled uniformly everywhere and so in the end we ended up with some +// mish-mosh of some helper macros, but mostly use sites making a call about +// what dtypes they can or can't support. So if you want to add a new dtype, +// the preferred resolution is to find a dtype similar to what you want, +// grep for it and edit all the sites you find this way. If you need to add +// a completely new kind of dtype, you're going to have to laboriously audit +// all of the sites everywhere to figure out how it should work. Consulting +// some old PRs where we added new dtypes (check history of this file) can +// help give you an idea where to start. 
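// As a hedged illustration of the X-macro pattern these AT_FORALL_* lists
// implement: the caller supplies a two-argument macro and the list applies it
// once per (C++ type, ScalarType name) pair, e.g. something like
//
//   #define PRINT_SIZE(ctype, name) \
//     std::cout << #name << ": " << sizeof(ctype) << " bytes\n";
//   AT_FORALL_SCALAR_TYPES(PRINT_SIZE)  // Byte, Char, Short, Int, Long,
//                                       // Float, Double
//   #undef PRINT_SIZE
//
// (PRINT_SIZE is a hypothetical caller-side macro; toString() and
// elementSize() further down in this header use the same pattern.)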
+ +// NB: Order matters for this macro; it is relied upon in +// _promoteTypesLookup and the serialization format. +#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(_) \ + _(uint8_t, Byte) /* 0 */ \ + _(int8_t, Char) /* 1 */ \ + _(int16_t, Short) /* 2 */ \ + _(int, Int) /* 3 */ \ + _(int64_t, Long) /* 4 */ \ + _(at::Half, Half) /* 5 */ \ + _(float, Float) /* 6 */ \ + _(double, Double) /* 7 */ \ + _(c10::complex, ComplexHalf) /* 8 */ \ + _(c10::complex, ComplexFloat) /* 9 */ \ + _(c10::complex, ComplexDouble) /* 10 */ \ + _(bool, Bool) /* 11 */ \ + _(c10::qint8, QInt8) /* 12 */ \ + _(c10::quint8, QUInt8) /* 13 */ \ + _(c10::qint32, QInt32) /* 14 */ \ + _(at::BFloat16, BFloat16) /* 15 */ \ + _(c10::quint4x2, QUInt4x2) /* 16 */ \ + _(c10::quint2x4, QUInt2x4) /* 17 */ \ + _(c10::bits1x8, Bits1x8) /* 18 */ \ + _(c10::bits2x4, Bits2x4) /* 19 */ \ + _(c10::bits4x2, Bits4x2) /* 20 */ \ + _(c10::bits8, Bits8) /* 21 */ \ + _(c10::bits16, Bits16) /* 22 */ \ + _(c10::Float8_e5m2, Float8_e5m2) /* 23 */ \ + _(c10::Float8_e4m3fn, Float8_e4m3fn) /* 24 */ \ + _(c10::Float8_e5m2fnuz, Float8_e5m2fnuz) /* 25 */ \ + _(c10::Float8_e4m3fnuz, Float8_e4m3fnuz) /* 26 */ \ + _(uint16_t, UInt16) /* 27 */ \ + _(uint32_t, UInt32) /* 28 */ \ + _(uint64_t, UInt64) /* 29 */ \ + _(c10::dummy_uint1_7_t<1>, UInt1) /* 30 */ \ + _(c10::dummy_uint1_7_t<2>, UInt2) /* 31 */ \ + _(c10::dummy_uint1_7_t<3>, UInt3) /* 32 */ \ + _(c10::dummy_uint1_7_t<4>, UInt4) /* 33 */ \ + _(c10::dummy_uint1_7_t<5>, UInt5) /* 34 */ \ + _(c10::dummy_uint1_7_t<6>, UInt6) /* 35 */ \ + _(c10::dummy_uint1_7_t<7>, UInt7) /* 36 */ + +// If you want to support ComplexHalf for real, add ComplexHalf +// into this macro (and change the name). But beware: convert() +// doesn't work for all the conversions you need... +// +// TODO: To add unsigned int types here, we must define accumulate type. +// But uint8 currently accumulates into int64, so we would have to make +// an inconsistent choice for the larger types. Difficult. +#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF_F8NZ(_) \ + _(uint8_t, Byte) \ + _(int8_t, Char) \ + _(int16_t, Short) \ + _(int, Int) \ + _(int64_t, Long) \ + _(at::Half, Half) \ + _(float, Float) \ + _(double, Double) \ + _(c10::complex, ComplexFloat) \ + _(c10::complex, ComplexDouble) \ + _(bool, Bool) \ + _(at::BFloat16, BFloat16) \ + _(at::Float8_e5m2, Float8_e5m2) \ + _(at::Float8_e4m3fn, Float8_e4m3fn) + +// This macro controls many of our C++ APIs, including constructors +// for Scalar as well as the data() and item() accessors on Tensor +#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(_) \ + _(uint8_t, Byte) \ + _(int8_t, Char) \ + _(int16_t, Short) \ + _(int, Int) \ + _(int64_t, Long) \ + _(at::Half, Half) \ + _(float, Float) \ + _(double, Double) \ + _(c10::complex, ComplexHalf) \ + _(c10::complex, ComplexFloat) \ + _(c10::complex, ComplexDouble) \ + _(bool, Bool) \ + _(at::BFloat16, BFloat16) \ + _(at::Float8_e5m2, Float8_e5m2) \ + _(at::Float8_e4m3fn, Float8_e4m3fn) \ + _(at::Float8_e5m2fnuz, Float8_e5m2fnuz) \ + _(at::Float8_e4m3fnuz, Float8_e4m3fnuz) + +enum class ScalarType : int8_t { +#define DEFINE_ST_ENUM_VAL_(_1, n) n, + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_ST_ENUM_VAL_) +#undef DEFINE_ENUM_ST_ENUM_VAL_ + Undefined, + NumOptions +}; + +constexpr uint16_t NumScalarTypes = + static_cast(ScalarType::NumOptions); + +namespace impl { + +// These are used to map ScalarTypes to C++ types. 
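// For example (illustrative, in terms of the two templates declared below and
// the scalar type list above):
//
//   static_assert(std::is_same_v<
//       c10::impl::ScalarTypeToCPPTypeT<c10::ScalarType::Float>, float>);
//   static_assert(
//       c10::CppTypeToScalarType<int64_t>::value == c10::ScalarType::Long);
//
// i.e. the two templates are inverse mappings over the scalar type list.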
+ +template +struct ScalarTypeToCPPType; + +#define SPECIALIZE_ScalarTypeToCPPType(cpp_type, scalar_type) \ + template <> \ + struct ScalarTypeToCPPType { \ + using type = cpp_type; \ + \ + /* This is a workaround for the CUDA bug which prevents */ \ + /* ::detail::ScalarTypeToCType::type being used directly due to */ \ + /* ambiguous reference which can't to be resolved. For some reason it */ \ + /* can't pick between at::detail and at::cuda::detail. */ \ + /* For repro example, please see: */ \ + /* https://gist.github.com/izdeby/952ae7cf256ddb740a73776d39a7e7ba */ \ + /* TODO: remove once the bug is fixed. */ \ + static type t; \ + }; + +AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(SPECIALIZE_ScalarTypeToCPPType) + +#undef SPECIALIZE_ScalarTypeToCPPType + +template +using ScalarTypeToCPPTypeT = typename ScalarTypeToCPPType::type; + +} // namespace impl + +template +struct CppTypeToScalarType; + +#define SPECIALIZE_CppTypeToScalarType(cpp_type, scalar_type) \ + template <> \ + struct CppTypeToScalarType \ + : std:: \ + integral_constant { \ + }; + +AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(SPECIALIZE_CppTypeToScalarType) + +#undef SPECIALIZE_CppTypeToScalarType + +// NB: despite its generic sounding name, the macros that don't take _AND +// are mostly only used by tensorexpr +#define AT_FORALL_INT_TYPES(_) \ + _(uint8_t, Byte) \ + _(int8_t, Char) \ + _(int16_t, Short) \ + _(int, Int) \ + _(int64_t, Long) + +#define AT_FORALL_SCALAR_TYPES(_) \ + _(uint8_t, Byte) \ + _(int8_t, Char) \ + _(int16_t, Short) \ + _(int, Int) \ + _(int64_t, Long) \ + _(float, Float) \ + _(double, Double) + +// These macros are often controlling how many template instantiations we +// create for kernels. It is typically inappropriate to add new dtypes here, +// instead, new types should be added to use sites on a case-by-case basis. +// We generally are not accepting new dtypes due to binary size concerns. 
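// A hedged sketch of what the _AND forms below provide: the extra ScalarTypes
// are appended to the seven standard pairs, with their C++ types recovered via
// decltype(::c10::impl::ScalarTypeToCPPType<...>::t), so callers only name the
// enumerators. For instance (COUNT_ONE is a hypothetical caller-side macro):
//
//   #define COUNT_ONE(ctype, name) +1
//   constexpr int n = 0 AT_FORALL_SCALAR_TYPES_AND2(Half, BFloat16, COUNT_ONE);
//   static_assert(n == 9);  // 7 base types + Half + BFloat16
//   #undef COUNT_ONE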
+ +#define AT_FORALL_SCALAR_TYPES_AND(SCALARTYPE, _) \ + _(uint8_t, Byte) \ + _(int8_t, Char) \ + _(int16_t, Short) \ + _(int, Int) \ + _(int64_t, Long) \ + _(float, Float) \ + _(double, Double) \ + _(decltype(::c10::impl::ScalarTypeToCPPType< \ + ::c10::ScalarType::SCALARTYPE>::t), \ + SCALARTYPE) + +#define AT_FORALL_SCALAR_TYPES_AND2(SCALARTYPE1, SCALARTYPE2, _) \ + _(uint8_t, Byte) \ + _(int8_t, Char) \ + _(int16_t, Short) \ + _(int, Int) \ + _(int64_t, Long) \ + _(float, Float) \ + _(double, Double) \ + _(decltype(::c10::impl::ScalarTypeToCPPType< \ + ::c10::ScalarType::SCALARTYPE1>::t), \ + SCALARTYPE1) \ + _(decltype(::c10::impl::ScalarTypeToCPPType< \ + ::c10::ScalarType::SCALARTYPE2>::t), \ + SCALARTYPE2) + +#define AT_FORALL_SCALAR_TYPES_AND3(SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, _) \ + _(uint8_t, Byte) \ + _(int8_t, Char) \ + _(int16_t, Short) \ + _(int, Int) \ + _(int64_t, Long) \ + _(float, Float) \ + _(double, Double) \ + _(decltype(::c10::impl::ScalarTypeToCPPType< \ + ::c10::ScalarType::SCALARTYPE1>::t), \ + SCALARTYPE1) \ + _(decltype(::c10::impl::ScalarTypeToCPPType< \ + ::c10::ScalarType::SCALARTYPE2>::t), \ + SCALARTYPE2) \ + _(decltype(::c10::impl::ScalarTypeToCPPType< \ + ::c10::ScalarType::SCALARTYPE3>::t), \ + SCALARTYPE3) + +#define AT_FORALL_SCALAR_TYPES_AND4( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, _) \ + _(uint8_t, Byte) \ + _(int8_t, Char) \ + _(int16_t, Short) \ + _(int, Int) \ + _(int64_t, Long) \ + _(float, Float) \ + _(double, Double) \ + _(decltype(::c10::impl::ScalarTypeToCPPType< \ + ::c10::ScalarType::SCALARTYPE1>::t), \ + SCALARTYPE1) \ + _(decltype(::c10::impl::ScalarTypeToCPPType< \ + ::c10::ScalarType::SCALARTYPE2>::t), \ + SCALARTYPE2) \ + _(decltype(::c10::impl::ScalarTypeToCPPType< \ + ::c10::ScalarType::SCALARTYPE3>::t), \ + SCALARTYPE3) \ + _(decltype(::c10::impl::ScalarTypeToCPPType< \ + ::c10::ScalarType::SCALARTYPE4>::t), \ + SCALARTYPE4) + +#define AT_FORALL_SCALAR_TYPES_AND5( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, SCALARTYPE5, _) \ + _(uint8_t, Byte) \ + _(int8_t, Char) \ + _(int16_t, Short) \ + _(int, Int) \ + _(int64_t, Long) \ + _(float, Float) \ + _(double, Double) \ + _(decltype(::c10::impl::ScalarTypeToCPPType< \ + ::c10::ScalarType::SCALARTYPE1>::t), \ + SCALARTYPE1) \ + _(decltype(::c10::impl::ScalarTypeToCPPType< \ + ::c10::ScalarType::SCALARTYPE2>::t), \ + SCALARTYPE2) \ + _(decltype(::c10::impl::ScalarTypeToCPPType< \ + ::c10::ScalarType::SCALARTYPE3>::t), \ + SCALARTYPE3) \ + _(decltype(::c10::impl::ScalarTypeToCPPType< \ + ::c10::ScalarType::SCALARTYPE4>::t), \ + SCALARTYPE4) \ + _(decltype(::c10::impl::ScalarTypeToCPPType< \ + ::c10::ScalarType::SCALARTYPE5>::t), \ + SCALARTYPE5) + +#define AT_FORALL_SCALAR_TYPES_AND6( \ + SCALARTYPE1, \ + SCALARTYPE2, \ + SCALARTYPE3, \ + SCALARTYPE4, \ + SCALARTYPE5, \ + SCALARTYPE6, \ + _) \ + _(uint8_t, Byte) \ + _(int8_t, Char) \ + _(int16_t, Short) \ + _(int, Int) \ + _(int64_t, Long) \ + _(float, Float) \ + _(double, Double) \ + _(decltype(::c10::impl::ScalarTypeToCPPType< \ + ::c10::ScalarType::SCALARTYPE1>::t), \ + SCALARTYPE1) \ + _(decltype(::c10::impl::ScalarTypeToCPPType< \ + ::c10::ScalarType::SCALARTYPE2>::t), \ + SCALARTYPE2) \ + _(decltype(::c10::impl::ScalarTypeToCPPType< \ + ::c10::ScalarType::SCALARTYPE3>::t), \ + SCALARTYPE3) \ + _(decltype(::c10::impl::ScalarTypeToCPPType< \ + ::c10::ScalarType::SCALARTYPE4>::t), \ + SCALARTYPE4) \ + _(decltype(::c10::impl::ScalarTypeToCPPType< \ + ::c10::ScalarType::SCALARTYPE5>::t), \ + SCALARTYPE5) \ 
+ _(decltype(::c10::impl::ScalarTypeToCPPType< \ + ::c10::ScalarType::SCALARTYPE6>::t), \ + SCALARTYPE6) + +#define AT_FORALL_SCALAR_TYPES_AND7( \ + SCALARTYPE1, \ + SCALARTYPE2, \ + SCALARTYPE3, \ + SCALARTYPE4, \ + SCALARTYPE5, \ + SCALARTYPE6, \ + SCALARTYPE7, \ + _) \ + _(uint8_t, Byte) \ + _(int8_t, Char) \ + _(int16_t, Short) \ + _(int, Int) \ + _(int64_t, Long) \ + _(float, Float) \ + _(double, Double) \ + _(decltype(::c10::impl::ScalarTypeToCPPType< \ + ::c10::ScalarType::SCALARTYPE1>::t), \ + SCALARTYPE1) \ + _(decltype(::c10::impl::ScalarTypeToCPPType< \ + ::c10::ScalarType::SCALARTYPE2>::t), \ + SCALARTYPE2) \ + _(decltype(::c10::impl::ScalarTypeToCPPType< \ + ::c10::ScalarType::SCALARTYPE3>::t), \ + SCALARTYPE3) \ + _(decltype(::c10::impl::ScalarTypeToCPPType< \ + ::c10::ScalarType::SCALARTYPE4>::t), \ + SCALARTYPE4) \ + _(decltype(::c10::impl::ScalarTypeToCPPType< \ + ::c10::ScalarType::SCALARTYPE5>::t), \ + SCALARTYPE5) \ + _(decltype(::c10::impl::ScalarTypeToCPPType< \ + ::c10::ScalarType::SCALARTYPE6>::t), \ + SCALARTYPE6) \ + _(decltype(::c10::impl::ScalarTypeToCPPType< \ + ::c10::ScalarType::SCALARTYPE7>::t), \ + SCALARTYPE7) + +#define AT_FORALL_QINT_TYPES(_) \ + _(c10::qint8, QInt8) \ + _(c10::quint8, QUInt8) \ + _(c10::qint32, QInt32) \ + _(c10::quint4x2, QUInt4x2) \ + _(c10::quint2x4, QUInt2x4) + +#define AT_FORALL_COMPLEX_TYPES(_) \ + _(c10::complex, ComplexFloat) \ + _(c10::complex, ComplexDouble) + +#define DEFINE_CONSTANT(_, name) \ + constexpr ScalarType k##name = ScalarType::name; + +// NOLINTNEXTLINE(clang-diagnostic-unused-const-variable) +AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_CONSTANT) +#undef DEFINE_CONSTANT + +static inline const char* toString(ScalarType t) { +#define DEFINE_CASE(_, name) \ + case ScalarType::name: \ + return #name; + + switch (t) { + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_CASE) + default: + return "UNKNOWN_SCALAR"; + } +#undef DEFINE_CASE +} + +static inline size_t elementSize(ScalarType t) { +#define CASE_ELEMENTSIZE_CASE(ctype, name) \ + case ScalarType::name: \ + return sizeof(ctype); + + switch (t) { + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(CASE_ELEMENTSIZE_CASE) + default: + TORCH_CHECK(false, "Unknown ScalarType"); + } +#undef CASE_ELEMENTSIZE_CASE +} + +static inline bool isIntegralType(ScalarType t, bool includeBool) { + bool isIntegral = + (t == ScalarType::Byte || t == ScalarType::Char || t == ScalarType::Int || + t == ScalarType::Long || t == ScalarType::Short || + t == ScalarType::UInt16 || t == ScalarType::UInt32 || + t == ScalarType::UInt64); + + return isIntegral || (includeBool && t == ScalarType::Bool); +} + +C10_DEPRECATED_MESSAGE( + "isIntegralType is deprecated. 
Please use the overload with 'includeBool' parameter instead.") +static inline bool isIntegralType(ScalarType t) { + return isIntegralType(t, /*includeBool=*/false); +} + +static inline bool isFloat8Type(ScalarType t) { + return t == ScalarType::Float8_e5m2 || t == ScalarType::Float8_e5m2fnuz || + t == ScalarType::Float8_e4m3fn || t == ScalarType::Float8_e4m3fnuz; +} + +static inline bool isReducedFloatingType(ScalarType t) { + return t == ScalarType::Half || t == ScalarType::BFloat16 || isFloat8Type(t); +} + +static inline bool isFloatingType(ScalarType t) { + return t == ScalarType::Double || t == ScalarType::Float || + isReducedFloatingType(t); +} + +static inline bool isComplexType(ScalarType t) { + return ( + t == ScalarType::ComplexHalf || t == ScalarType::ComplexFloat || + t == ScalarType::ComplexDouble); +} + +static inline bool isQIntType(ScalarType t) { + // Don't forget to extend this when adding new QInt types + return t == ScalarType::QInt8 || t == ScalarType::QUInt8 || + t == ScalarType::QInt32 || t == ScalarType::QUInt4x2 || + t == ScalarType::QUInt2x4; +} + +static inline bool isBitsType(ScalarType t) { + return t == ScalarType::Bits1x8 || t == ScalarType::Bits2x4 || + t == ScalarType::Bits4x2 || t == ScalarType::Bits8 || + t == ScalarType::Bits16; +} + +static inline bool isBarebonesUnsignedType(ScalarType t) { + return t == ScalarType::UInt1 || t == ScalarType::UInt2 || + t == ScalarType::UInt3 || t == ScalarType::UInt4 || + t == ScalarType::UInt5 || t == ScalarType::UInt6 || + t == ScalarType::UInt7 || t == ScalarType::UInt16 || + t == ScalarType::UInt32 || t == ScalarType::UInt64; +} + +static inline ScalarType toQIntType(ScalarType t) { + switch (t) { + case ScalarType::Byte: + return ScalarType::QUInt8; + case ScalarType::Char: + return ScalarType::QInt8; + case ScalarType::Int: + return ScalarType::QInt32; + default: + return t; + } +} + +static inline ScalarType toUnderlying(ScalarType t) { + switch (t) { + case ScalarType::QUInt8: + case ScalarType::QUInt4x2: + [[fallthrough]]; + case ScalarType::QUInt2x4: + return ScalarType::Byte; + case ScalarType::QInt8: + return ScalarType::Char; + case ScalarType::QInt32: + return ScalarType::Int; + default: + return t; + } +} + +static inline bool isSignedType(ScalarType t) { + TORCH_CHECK(!isQIntType(t), "isSignedType not supported for quantized types"); +#define CASE_SIGNED(ctype, name) \ + case ScalarType::name: \ + return std::numeric_limits::is_signed; + + switch (t) { + case ScalarType::Bits1x8: + case ScalarType::Bits2x4: + case ScalarType::Bits4x2: + case ScalarType::Bits8: + case ScalarType::Bits16: + TORCH_CHECK(false, "Bits types are undefined"); + case ScalarType::ComplexHalf: + case ScalarType::ComplexFloat: + case ScalarType::ComplexDouble: + return true; + AT_FORALL_SCALAR_TYPES_AND7( + Half, + Bool, + BFloat16, + Float8_e5m2, + Float8_e4m3fn, + Float8_e5m2fnuz, + Float8_e4m3fnuz, + CASE_SIGNED) + default: + TORCH_CHECK(false, "Unknown ScalarType"); + } +#undef CASE_SIGNED +} + +static inline bool isUnderlying(ScalarType type, ScalarType qtype) { + return type == toUnderlying(qtype); +} + +static inline ScalarType toRealValueType(ScalarType t) { + switch (t) { + case ScalarType::ComplexHalf: + return ScalarType::Half; + case ScalarType::ComplexFloat: + return ScalarType::Float; + case ScalarType::ComplexDouble: + return ScalarType::Double; + default: + return t; + } +} + +static inline ScalarType toComplexType(ScalarType t) { + switch (t) { + case ScalarType::BFloat16: + // BFloat16 has range equivalent to 
Float, + // so we map it to ComplexFloat. + return ScalarType::ComplexFloat; + case ScalarType::Half: + return ScalarType::ComplexHalf; + case ScalarType::Float: + return ScalarType::ComplexFloat; + case ScalarType::Double: + return ScalarType::ComplexDouble; + case ScalarType::ComplexHalf: + return ScalarType::ComplexHalf; + case ScalarType::ComplexFloat: + return ScalarType::ComplexFloat; + case ScalarType::ComplexDouble: + return ScalarType::ComplexDouble; + default: + TORCH_CHECK(false, "Unknown Complex ScalarType for ", t); + } +} + +// see tensor_attributes.rst for detailed explanation and examples +// of casting rules. +static inline bool canCast(const ScalarType from, const ScalarType to) { + // We disallow complex -> non complex, e.g., float_tensor *= complex is + // disallowed. + if (isComplexType(from) && !isComplexType(to)) { + return false; + } + // We disallow float -> integral, e.g., int_tensor *= float is disallowed. + if (isFloatingType(from) && isIntegralType(to, false)) { + return false; + } + + // Treat bool as a distinct "category," to be consistent with type promotion + // rules (e.g. `bool_tensor + 5 -> int64_tensor`). If `5` was in the same + // category as `bool_tensor`, we would not promote. Differing categories + // implies `bool_tensor += 5` is disallowed. + // + // NB: numpy distinguishes "unsigned" as a category to get the desired + // `bool_tensor + 5 -> int64_tensor` behavior. We don't, because: + // * We don't want the performance hit of checking the runtime sign of + // Scalars. + // * `uint8_tensor + 5 -> int64_tensor` would be undesirable. + if (from != ScalarType::Bool && to == ScalarType::Bool) { + return false; + } + return true; +} + +C10_API ScalarType promoteTypes(ScalarType a, ScalarType b); + +inline std::ostream& operator<<( + std::ostream& stream, + at::ScalarType scalar_type) { + return stream << toString(scalar_type); +} + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/ScalarTypeToTypeMeta.h b/MLPY/Lib/site-packages/torch/include/c10/core/ScalarTypeToTypeMeta.h new file mode 100644 index 0000000000000000000000000000000000000000..65f4302325727e7d15f8ccc84e2a71c129d5d387 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/ScalarTypeToTypeMeta.h @@ -0,0 +1,57 @@ +#pragma once + +#include +#include +#include + +// these just expose TypeMeta/ScalarType bridge functions in c10 +// TODO move to typeid.h (or codemod away) when TypeMeta et al +// are moved from caffe2 to c10 (see note at top of typeid.h) + +namespace c10 { + +/** + * convert ScalarType enum values to TypeMeta handles + */ +static inline caffe2::TypeMeta scalarTypeToTypeMeta(ScalarType scalar_type) { + return caffe2::TypeMeta::fromScalarType(scalar_type); +} + +/** + * convert TypeMeta handles to ScalarType enum values + */ +static inline ScalarType typeMetaToScalarType(caffe2::TypeMeta dtype) { + return dtype.toScalarType(); +} + +/** + * typeMetaToScalarType(), lifted to optional + */ +static inline optional optTypeMetaToScalarType( + optional type_meta) { + if (!type_meta.has_value()) { + return c10::nullopt; + } + return type_meta->toScalarType(); +} + +/** + * convenience: equality across TypeMeta/ScalarType conversion + */ +static inline bool operator==(ScalarType t, caffe2::TypeMeta m) { + return m.isScalarType(t); +} + +static inline bool operator==(caffe2::TypeMeta m, ScalarType t) { + return t == m; +} + +static inline bool operator!=(ScalarType t, caffe2::TypeMeta m) { + return !(t == m); +} + +static inline bool 
operator!=(caffe2::TypeMeta m, ScalarType t) { + return !(t == m); +} + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/Storage.h b/MLPY/Lib/site-packages/torch/include/c10/core/Storage.h new file mode 100644 index 0000000000000000000000000000000000000000..09be93941bb9e2a766a0784d96ee2a35dae8d099 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/Storage.h @@ -0,0 +1,272 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +struct Storage; + +C10_API bool isSharedStorageAlias( + const Storage& storage0, + const Storage& storage1); + +struct C10_API Storage { + public: + struct use_byte_size_t {}; + struct unsafe_borrow_t { + explicit unsafe_borrow_t() = default; + }; + + Storage() = default; + Storage(c10::intrusive_ptr ptr) + : storage_impl_(std::move(ptr)) {} + + // Allocates memory buffer using given allocator and creates a storage with it + Storage( + use_byte_size_t /*use_byte_size*/, + const SymInt& size_bytes, + Allocator* allocator = nullptr, + bool resizable = false) + : storage_impl_(c10::make_intrusive( + StorageImpl::use_byte_size_t(), + size_bytes, + allocator, + resizable)) {} + + // Creates storage with pre-allocated memory buffer. Allocator is given for + // potential future reallocations, however it can be nullptr if the storage + // is non-resizable + Storage( + use_byte_size_t /*use_byte_size*/, + size_t size_bytes, + at::DataPtr data_ptr, + at::Allocator* allocator = nullptr, + bool resizable = false) + : storage_impl_(c10::make_intrusive( + StorageImpl::use_byte_size_t(), + size_bytes, + std::move(data_ptr), + allocator, + resizable)) {} + + protected: + explicit Storage(unsafe_borrow_t, const Storage& rhs) + : storage_impl_(c10::intrusive_ptr::reclaim( + rhs.storage_impl_.get())) {} + + friend MaybeOwnedTraits; + + public: + // Legacy constructor for partially initialized (dtype or memory) storages + // that can be temporarily created with Caffe2 APIs. See the note on top of + // TensorImpl.h for details. + static Storage create_legacy(at::Device device) { + auto allocator = GetAllocator(device.type()); + return Storage(c10::make_intrusive( + StorageImpl::use_byte_size_t(), + 0, + allocator->allocate(0), // materialize a non-default Device. + allocator, + true)); + } + + // Mimic create_legacy, but without requiring a newly-created StorageImpl. 
+ void reset_legacy() { + TORCH_CHECK(resizable() && allocator()); + set_nbytes(0); + set_data_ptr_noswap(allocator()->allocate(0)); + } + + // TODO: remove later + void set_nbytes(size_t size_bytes) const { + storage_impl_->set_nbytes(size_bytes); + } + + void set_nbytes(c10::SymInt size_bytes) const { + storage_impl_->set_nbytes(std::move(size_bytes)); + } + + bool resizable() const { + return storage_impl_->resizable(); + } + + size_t nbytes() const { + return storage_impl_->nbytes(); + } + + SymInt sym_nbytes() const { + return storage_impl_->sym_nbytes(); + } + // get() use here is to get const-correctness + + const void* data() const { + return storage_impl_->data(); + } + + void* mutable_data() const { + return storage_impl_->mutable_data(); + } + + at::DataPtr& mutable_data_ptr() const { + return storage_impl_->mutable_data_ptr(); + } + + const at::DataPtr& data_ptr() const { + return storage_impl_->data_ptr(); + } + + // Returns the previous data_ptr + at::DataPtr set_data_ptr(at::DataPtr&& data_ptr) const { + return storage_impl_->set_data_ptr(std::move(data_ptr)); + } + + void set_data_ptr_noswap(at::DataPtr&& data_ptr) const { + return storage_impl_->set_data_ptr_noswap(std::move(data_ptr)); + } + + DeviceType device_type() const { + return storage_impl_->device_type(); + } + + at::Allocator* allocator() const { + return storage_impl_->allocator(); + } + + at::Device device() const { + return storage_impl_->device(); + } + + StorageImpl* unsafeReleaseStorageImpl() { + return storage_impl_.release(); + } + + StorageImpl* unsafeGetStorageImpl() const noexcept { + return storage_impl_.get(); + } + + c10::weak_intrusive_ptr getWeakStorageImpl() const { + return c10::weak_intrusive_ptr(storage_impl_); + } + + operator bool() const { + return storage_impl_; + } + + size_t use_count() const { + return storage_impl_.use_count(); + } + + inline bool unique() const { + return storage_impl_.unique(); + } + + bool is_alias_of(const Storage& other) const { + return ( + storage_impl_ == other.storage_impl_ || + isSharedStorageAlias(*this, other)); + } + + void UniqueStorageShareExternalPointer( + void* src, + size_t capacity, + DeleterFnPtr d = nullptr) { + if (!storage_impl_.unique()) { + TORCH_CHECK( + false, + "UniqueStorageShareExternalPointer can only be called when use_count == 1"); + } + storage_impl_->UniqueStorageShareExternalPointer(src, capacity, d); + } + + void UniqueStorageShareExternalPointer( + at::DataPtr&& data_ptr, + size_t capacity) { + if (!storage_impl_.unique()) { + TORCH_CHECK( + false, + "UniqueStorageShareExternalPointer can only be called when use_count == 1"); + } + storage_impl_->UniqueStorageShareExternalPointer( + std::move(data_ptr), capacity); + } + + protected: + c10::intrusive_ptr storage_impl_; +}; + +template <> +struct MaybeOwnedTraits { + using owned_type = c10::Storage; + using borrow_type = c10::Storage; + + static borrow_type createBorrow(const owned_type& from) { + return borrow_type(borrow_type::unsafe_borrow_t{}, from); + } + + static void assignBorrow(borrow_type& lhs, const borrow_type& rhs) { + lhs.unsafeReleaseStorageImpl(); + lhs = borrow_type(borrow_type::unsafe_borrow_t{}, rhs); + } + + static void destroyBorrow(borrow_type& toDestroy) { + toDestroy.unsafeReleaseStorageImpl(); // "leak" it, but it was already +0. 
+ } + + static const owned_type& referenceFromBorrow(const borrow_type& borrow) { + return borrow; + } + + static const owned_type* pointerFromBorrow(const borrow_type& borrow) { + return &borrow; + } + + static bool debugBorrowIsValid(const borrow_type& /*borrow*/) { + return true; + } +}; + +template <> +struct ExclusivelyOwnedTraits { + using repr_type = c10::Storage; + using pointer_type = c10::Storage*; + using const_pointer_type = const c10::Storage*; + + static repr_type nullRepr() { + return c10::Storage(); + } + + template + static repr_type createInPlace(Args&&... args) { + return c10::Storage(std::forward(args)...); + } + + static repr_type moveToRepr(c10::Storage&& x) { + return std::move(x); + } + + static c10::Storage take(c10::Storage& x) { + return std::move(x); + } + + static pointer_type getImpl(repr_type& x) { + return &x; + } + + static const_pointer_type getImpl(const repr_type& x) { + return &x; + } +}; + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/StorageImpl.h b/MLPY/Lib/site-packages/torch/include/c10/core/StorageImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..714b7d9fbe949285e89cd540e05ca5145549b2db --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/StorageImpl.h @@ -0,0 +1,276 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +// A storage represents the underlying backing data buffer for a +// tensor. This concept was inherited from the original Torch7 +// codebase; we'd kind of like to get rid of the concept +// (see https://github.com/pytorch/pytorch/issues/14797) but +// it's hard work and no one has gotten around to doing it. +// +// NB: storage is supposed to uniquely own a data pointer; e.g., +// two non-null data pointers alias if and only if they are from +// the same storage. Technically you can violate this invariant +// (e.g., you can create a non-owning StorageImpl with at::from_blob) +// but a lot of things won't work correctly, including: +// +// - An ordinary deleter on such a storage is wrong, because normal deleters +// assume unique ownership, but if you have two storages at the same data, +// that implies there is some sort of shared ownership. 
So your deleter would +// have to actually be internally doing some sort of refcount thing +// - Deepcopy in Python side relies on storage equality and not data pointer +// equality; so if there are two separate storages pointing to the same data, +// the data will actually get duplicated in that case (one data ptr before, +// two data ptrs after) +// - Version counts won't work correctly, because we do all VC tracking at the +// level of storages (unless you explicitly disconnect the VC with detach); +// mutation because data pointers are the same are totally untracked +struct C10_API StorageImpl : public c10::intrusive_ptr_target { + public: + struct use_byte_size_t {}; + + StorageImpl( + use_byte_size_t /*use_byte_size*/, + SymInt size_bytes, + at::DataPtr data_ptr, + at::Allocator* allocator, + bool resizable) + : data_ptr_(std::move(data_ptr)), + size_bytes_(std::move(size_bytes)), + size_bytes_is_heap_allocated_(size_bytes_.is_heap_allocated()), + resizable_(resizable), + received_cuda_(false), + allocator_(allocator) { + if (resizable) { + TORCH_INTERNAL_ASSERT( + allocator_, "For resizable storage, allocator must be provided"); + } + } + + StorageImpl( + use_byte_size_t /*use_byte_size*/, + const SymInt& size_bytes, + at::Allocator* allocator, + bool resizable) + : StorageImpl( + use_byte_size_t(), + size_bytes, + size_bytes.is_heap_allocated() + ? allocator->allocate(0) + : allocator->allocate(size_bytes.as_int_unchecked()), + allocator, + resizable) {} + + StorageImpl& operator=(StorageImpl&& other) = delete; + StorageImpl& operator=(const StorageImpl&) = delete; + StorageImpl() = delete; + StorageImpl(StorageImpl&& other) = delete; + StorageImpl(const StorageImpl&) = delete; + ~StorageImpl() override = default; + + void reset() { + data_ptr_.clear(); + size_bytes_ = 0; + size_bytes_is_heap_allocated_ = false; + } + + // Destructor doesn't call release_resources because it's + // unnecessary; don't forget to change that if needed! + void release_resources() override { + data_ptr_.clear(); + } + + size_t nbytes() const { + // OK to do this instead of maybe_as_int as nbytes is guaranteed positive + TORCH_CHECK(!size_bytes_is_heap_allocated_); + return size_bytes_.as_int_unchecked(); + } + + SymInt sym_nbytes() const { + return size_bytes_; + } + + // TODO: remove later + void set_nbytes(size_t size_bytes) { + size_bytes_ = static_cast(size_bytes); + size_bytes_is_heap_allocated_ = false; + } + + void set_nbytes(c10::SymInt size_bytes) { + size_bytes_ = std::move(size_bytes); + } + + bool resizable() const { + return resizable_; + } + + at::DataPtr& mutable_data_ptr() { + maybe_materialize_cow(); + return data_ptr_; + } + + const at::DataPtr& data_ptr() const { + return data_ptr_; + } + + // Returns the previous data_ptr + at::DataPtr set_data_ptr(at::DataPtr&& data_ptr) { + // We need to materialize the old COW DataPtr because it is + // being returned as mutable. 
+ maybe_materialize_cow(); + return set_data_ptr_no_materialize_cow(std::move(data_ptr)); + } + + void set_data_ptr_noswap(at::DataPtr&& data_ptr) { + data_ptr_ = std::move(data_ptr); + } + + const void* data() const { + return data_ptr_.get(); + } + + void* mutable_data() { + maybe_materialize_cow(); + return data_ptr_.mutable_get(); + } + + at::DeviceType device_type() const { + return data_ptr_.device().type(); + } + + at::Allocator* allocator() { + return allocator_; + } + + const at::Allocator* allocator() const { + return allocator_; + } + + // You generally shouldn't use this method, but it is occasionally + // useful if you want to override how a tensor will be reallocated, + // after it was already allocated (and its initial allocator was + // set) + void set_allocator(at::Allocator* allocator) { + allocator_ = allocator; + } + + Device device() const { + return data_ptr_.device(); + } + + void set_resizable(bool resizable) { + if (resizable) { + // We need an allocator to be resizable + AT_ASSERT(allocator_); + } + resizable_ = resizable; + } + + /** + * Can only be called when use_count is 1 + */ + void UniqueStorageShareExternalPointer( + void* src, + size_t size_bytes, + DeleterFnPtr d = nullptr) { + UniqueStorageShareExternalPointer( + at::DataPtr(src, src, d, data_ptr_.device()), size_bytes); + } + + /** + * Can only be called when use_count is 1 + */ + void UniqueStorageShareExternalPointer( + at::DataPtr&& data_ptr, + size_t size_bytes) { + data_ptr_ = std::move(data_ptr); + size_bytes_ = static_cast(size_bytes); + size_bytes_is_heap_allocated_ = false; + allocator_ = nullptr; + resizable_ = false; + } + + // This method can be used only after storage construction and cannot be used + // to modify storage status + void set_received_cuda(bool received_cuda) { + received_cuda_ = received_cuda; + } + + bool received_cuda() { + return received_cuda_; + } + + impl::PyObjectSlot* pyobj_slot() { + return &pyobj_slot_; + } + + const impl::PyObjectSlot* pyobj_slot() const { + return &pyobj_slot_; + } + + protected: + // materialize_cow_storage needs to call set_data_ptr_no_materlize_cow + friend void c10::impl::cow::materialize_cow_storage(StorageImpl& storage); + + // Returns the previous data_ptr. If the old data_ptr was COW, + // this avoids materializing it + at::DataPtr set_data_ptr_no_materialize_cow(at::DataPtr&& data_ptr) { + at::DataPtr old_data_ptr(std::move(data_ptr_)); + data_ptr_ = std::move(data_ptr); + return old_data_ptr; + } + + private: + // Triggers a copy if this is a copy-on-write tensor. + void maybe_materialize_cow() { + if (data_ptr_.get_deleter() == impl::cow::cow_deleter) { + impl::cow::materialize_cow_storage(*this); + } + } + + DataPtr data_ptr_; + SymInt size_bytes_; + bool size_bytes_is_heap_allocated_; + bool resizable_; + // Identifies that Storage was received from another process and doesn't have + // local to process cuda memory allocation + bool received_cuda_; + Allocator* allocator_; + impl::PyObjectSlot pyobj_slot_; +}; + +// Declare StorageImpl create function pointer types. 
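// A hedged sketch of how a backend might use the factory hooks declared just
// below (the creator shown here is hypothetical and simply mirrors the default
// construction path; it is not upstream code):
//
//   c10::intrusive_ptr<c10::StorageImpl> my_storage_impl_create(
//       c10::StorageImpl::use_byte_size_t use_byte_size,
//       c10::SymInt size_bytes,
//       c10::DataPtr data_ptr,
//       c10::Allocator* allocator,
//       bool resizable) {
//     return c10::make_intrusive<c10::StorageImpl>(
//         use_byte_size, std::move(size_bytes), std::move(data_ptr),
//         allocator, resizable);
//   }
//
//   // At backend initialization time:
//   c10::SetStorageImplCreate(c10::DeviceType::PrivateUse1,
//                             &my_storage_impl_create);
//   // make_storage_impl() can then route construction for that device type
//   // through the registered helper.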
+using StorageImplCreateHelper = intrusive_ptr (*)( + StorageImpl::use_byte_size_t, + SymInt size_bytes, + DataPtr data_ptr, + Allocator* allocator, + bool resizable); + +C10_API void SetStorageImplCreate(DeviceType t, StorageImplCreateHelper fptr); + +C10_API StorageImplCreateHelper GetStorageImplCreate(DeviceType t); + +C10_API c10::intrusive_ptr make_storage_impl( + c10::StorageImpl::use_byte_size_t use_byte_size, + c10::SymInt size_bytes, + c10::DataPtr data_ptr, + c10::Allocator* allocator, + bool resizable, + c10::optional device_opt); + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/Stream.h b/MLPY/Lib/site-packages/torch/include/c10/core/Stream.h new file mode 100644 index 0000000000000000000000000000000000000000..8defb338f4cb31e7b4f769722d66aa3b0ec1e46a --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/Stream.h @@ -0,0 +1,176 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +/// An index representing a specific stream. A StreamId is not independently +/// meaningful without knowing the Device it is associated with; try to +/// use Stream rather than StreamId directly. +/// +/// StreamIds are opaque; they are assigned by some DeviceType-specific +/// numbering system which is not visible to the user. HOWEVER, we +/// guarantee that StreamId 0 is always a valid stream, and corresponds +/// to some sort of "default" stream. +using StreamId = int64_t; + +struct C10_API StreamData3 { + StreamId stream_id; + DeviceIndex device_index; + DeviceType device_type; +}; + +// NB: I decided not to call the above StreamIndex to avoid confusion with +// DeviceIndex. This way, you access device index with index(), and stream id +// with id() + +/** + * A stream is a software mechanism used to synchronize launched kernels + * without requiring explicit synchronizations between kernels. The basic + * model is that every kernel launch is associated with a stream: every + * kernel on the same stream is implicitly synchronized so that if I launch + * kernels A and B on the same stream, A is guaranteed to finish before B + * launches. If I want B to run concurrently with A, I must schedule + * it on a different stream. + * + * The Stream class is a backend agnostic value class representing a stream + * which I may schedule a kernel on. Every stream is associated with a device, + * which is recorded in stream, which is used to avoid confusion about which + * device a stream refers to. + * + * Streams are explicitly thread-safe, in the sense that it is OK to pass + * a Stream from one thread to another, and kernels queued from two different + * threads will still get serialized appropriately. (Of course, the + * time when the kernels get queued is undetermined unless you synchronize + * host side ;) + * + * Stream does NOT have a default constructor. Streams are for expert + * users; if you want to use Streams, we're going to assume you know + * how to deal with C++ template error messages if you try to + * resize() a vector of Streams. + * + * Known instances of streams in backends: + * + * - cudaStream_t (CUDA) + * - hipStream_t (HIP) + * - cl_command_queue (OpenCL) (NB: Caffe2's existing OpenCL integration + * does NOT support command queues.) + * + * Because this class is device agnostic, it cannot provide backend-specific + * functionality (e.g., get the cudaStream_t of a CUDA stream.) There are + * wrapper classes which provide this functionality, e.g., CUDAStream. 
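 *
 * An illustrative usage sketch (assuming only the members declared below):
 *
 *   c10::Device dev(c10::DeviceType::CUDA, 0);
 *   c10::Stream s(c10::Stream::DEFAULT, dev);  // default stream; id() == 0
 *   c10::StreamData3 d = s.pack3();            // {stream_id, device_index,
 *                                              //  device_type}
 *   c10::Stream s2 = c10::Stream::unpack3(
 *       d.stream_id, d.device_index, d.device_type);
 *   // s == s2 holds, since equality compares device and stream id.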
+ */ +class C10_API Stream final { + private: + Device device_; + StreamId id_; + + public: + enum Unsafe { UNSAFE }; + enum Default { DEFAULT }; + + /// Unsafely construct a stream from a Device and a StreamId. In + /// general, only specific implementations of streams for a + /// backend should manufacture Stream directly in this way; other users + /// should use the provided APIs to get a stream. In particular, + /// we don't require backends to give any guarantees about non-zero + /// StreamIds; they are welcome to allocate in whatever way they like. + explicit Stream(Unsafe, Device device, StreamId id) + : device_(device), id_(id) {} + + /// Construct the default stream of a Device. The default stream is + /// NOT the same as the current stream; default stream is a fixed stream + /// that never changes, whereas the current stream may be changed by + /// StreamGuard. + explicit Stream(Default, Device device) : device_(device), id_(0) {} + + bool operator==(const Stream& other) const noexcept { + return this->device_ == other.device_ && this->id_ == other.id_; + } + bool operator!=(const Stream& other) const noexcept { + return !(*this == other); + } + + Device device() const noexcept { + return device_; + } + DeviceType device_type() const noexcept { + return device_.type(); + } + DeviceIndex device_index() const noexcept { + return device_.index(); + } + StreamId id() const noexcept { + return id_; + } + + // Enqueues a wait instruction in the stream's work queue. + // This instruction is a no-op unless the event is marked + // for recording. In that case the stream stops processing + // until the event is recorded. + template + void wait(const T& event) const { + event.block(*this); + } + + // Return whether all asynchronous work previously enqueued on this stream + // has completed running on the device. + bool query() const; + + // Wait (by blocking the calling thread) until all asynchronous work enqueued + // on this stream has completed running on the device. + void synchronize() const; + + // The purpose of this function is to more conveniently permit binding + // of Stream to and from Python. Without packing, I have to setup a whole + // class with two fields (device and stream id); with packing I can just + // store a single uint64_t. + // + // The particular way we pack streams into a uint64_t is considered an + // implementation detail and should not be relied upon. + uint64_t hash() const noexcept { + // Concat these together into a 64-bit integer + uint64_t bits = static_cast(device_type()) << 56 | + static_cast(device_index()) << 48 | + // Remove the sign extension part of the 64-bit address because + // the id might be used to hold a pointer. + (static_cast(id()) & ((1ull << 48) - 1)); + return bits; + } + + struct StreamData3 pack3() const { + return {id(), device_index(), device_type()}; + } + + static Stream unpack3( + StreamId stream_id, + DeviceIndex device_index, + DeviceType device_type) { + TORCH_CHECK(isValidDeviceType(device_type)); + return Stream(UNSAFE, Device(device_type, device_index), stream_id); + } + + // I decided NOT to provide setters on this class, because really, + // why would you change the device of a stream? Just construct + // it correctly from the beginning dude. 
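  // Illustrative layout of the value returned by hash() above (bit positions
  // only; the packing is an implementation detail, not an API guarantee):
  //
  //   bits 63..56  device_type()   (8 bits)
  //   bits 55..48  device_index()  (8 bits)
  //   bits 47..0   id()            (low 48 bits; sign extension dropped so a
  //                                 pointer-valued id still fits)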
+}; + +C10_API std::ostream& operator<<(std::ostream& stream, const Stream& s); + +} // namespace c10 + +namespace std { +template <> +struct hash { + size_t operator()(c10::Stream s) const noexcept { + return std::hash{}(s.hash()); + } +}; +} // namespace std diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/StreamGuard.h b/MLPY/Lib/site-packages/torch/include/c10/core/StreamGuard.h new file mode 100644 index 0000000000000000000000000000000000000000..275de06d62d2e4fcb94a8d1ee57f63f2c9529814 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/StreamGuard.h @@ -0,0 +1,170 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace c10 { + +/** + * A StreamGuard is an RAII class that changes the current device + * to the device corresponding to some stream, and changes the + * default stream on that device to be this stream. + * + * Use of StreamGuard is HIGHLY discouraged in operator definitions. In + * a single operator, you probably don't know enough about the global + * state of the world to profitably decide how to set streams. Let + * the caller handle this appropriately, and just use the current stream + * in your operator code. + * + * This StreamGuard does NOT have an uninitialized state; it is guaranteed + * to reset the stream and device on exit. If you are in a situation + * where you *might* want to setup a stream guard, see OptionalStreamGuard. + */ +struct StreamGuard { + /// No default constructor, see Note [Omitted default constructor from RAII] + explicit StreamGuard() = delete; + + /// Set the current device to the device associated with the passed stream, + /// and set the current stream on that device to the passed stream. + explicit StreamGuard(Stream stream) : guard_(stream) {} + + /// Copy is disallowed + StreamGuard(const StreamGuard&) = delete; + StreamGuard& operator=(const StreamGuard&) = delete; + + /// Move is disallowed, as StreamGuard does not have an uninitialized state, + /// which is required for moves on types with nontrivial destructors. + StreamGuard(StreamGuard&& other) = delete; + StreamGuard& operator=(StreamGuard&& other) = delete; + + /// Resets the currently set stream to the original stream and + /// the currently set device to the original device. Then, + /// set the current device to the device associated with the passed stream, + /// and set the current stream on that device to the passed stream. + /// + /// NOTE: this implementation may skip some stream/device setting if + /// it can prove that it is unnecessary. + /// + /// WARNING: reset_stream does NOT preserve previously set streams on + /// different devices. If you need to set streams on multiple devices + /// on , use MultiStreamGuard instead. + void reset_stream(Stream stream) { + guard_.reset_stream(stream); + } + + /// Returns the stream that was set at the time the guard was constructed. + Stream original_stream() const { + return guard_.original_stream(); + } + + /// Returns the most recent stream that was set using this device guard, + /// either from construction, or via set_stream. + Stream current_stream() const { + return guard_.current_stream(); + } + + /// Returns the most recent device that was set using this device guard, + /// either from construction, or via set_device/reset_device/set_index. + Device current_device() const { + return guard_.current_device(); + } + + /// Returns the device that was set at the most recent reset_stream(), + /// or otherwise the device at construction time. 
+ Device original_device() const { + return guard_.original_device(); + } + + private: + c10::impl::InlineStreamGuard guard_; +}; + +/** + * An OptionalStreamGuard is an RAII class that sets a device to some value on + * initialization, and resets the device to its original value on destruction. + * See OptionalDeviceGuard for more guidance on how to use this class. + */ +struct OptionalStreamGuard { + /// Create an uninitialized guard. + explicit OptionalStreamGuard() = default; + + /// Set the current device to the device associated with the passed stream, + /// and set the current stream on that device to the passed stream. + explicit OptionalStreamGuard(Stream stream) : guard_(stream) {} + + /// Set the current device to the device associated with the passed stream, + /// and set the current stream on that device to the passed stream, + /// if the passed stream is not nullopt. + explicit OptionalStreamGuard(optional stream_opt) + : guard_(stream_opt) {} + + /// Copy is disallowed + OptionalStreamGuard(const OptionalStreamGuard&) = delete; + OptionalStreamGuard& operator=(const OptionalStreamGuard&) = delete; + + // See Note [Move construction for RAII guards is tricky] + OptionalStreamGuard(OptionalStreamGuard&& other) = delete; + + // See Note [Move assignment for RAII guards is tricky] + OptionalStreamGuard& operator=(OptionalStreamGuard&& other) = delete; + + /// Resets the currently set stream to the original stream and + /// the currently set device to the original device. Then, + /// set the current device to the device associated with the passed stream, + /// and set the current stream on that device to the passed stream. + /// Initializes the guard if it was not previously initialized. + void reset_stream(Stream stream) { + guard_.reset_stream(stream); + } + + /// Returns the stream that was set at the time the guard was most recently + /// initialized, or nullopt if the guard is uninitialized. + optional original_stream() const { + return guard_.original_stream(); + } + + /// Returns the most recent stream that was set using this stream guard, + /// either from construction, or via reset_stream, if the guard is + /// initialized, or nullopt if the guard is uninitialized. + optional current_stream() const { + return guard_.current_stream(); + } + + /// Restore the original device and stream, resetting this guard to + /// uninitialized state. + void reset() { + guard_.reset(); + } + + private: + c10::impl::InlineOptionalStreamGuard guard_{}; +}; + +/** + * A MultiStreamGuard is an RAII class that sets the current streams of a set of + * devices all at once, and resets them to their original values on destruction. + */ +struct MultiStreamGuard { + /// Set the current streams to the passed streams on each of their respective + /// devices. 
+ explicit MultiStreamGuard(ArrayRef streams) : guard_(streams) {} + + /// Copy is disallowed + MultiStreamGuard(const MultiStreamGuard&) = delete; + MultiStreamGuard& operator=(const MultiStreamGuard&) = delete; + + // See Note [Move construction for RAII guards is tricky] + MultiStreamGuard(MultiStreamGuard&& other) = delete; + + // See Note [Move assignment for RAII guards is tricky] + MultiStreamGuard& operator=(MultiStreamGuard&& other) = delete; + + private: + c10::impl::InlineMultiStreamGuard guard_; +}; + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/SymBool.h b/MLPY/Lib/site-packages/torch/include/c10/core/SymBool.h new file mode 100644 index 0000000000000000000000000000000000000000..31073aa373fc70d896570fa574777e1731cf9b31 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/SymBool.h @@ -0,0 +1,107 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +class C10_API SymBool { + public: + /*implicit*/ SymBool(bool b) : data_(b){}; + SymBool(SymNode ptr) : data_(false), ptr_(std::move(ptr)) { + TORCH_CHECK(ptr_->is_bool()); + }; + SymBool() : data_(false) {} + + SymNodeImpl* toSymNodeImplUnowned() const { + return ptr_.get(); + } + + SymNodeImpl* release() && { + return std::move(ptr_).release(); + } + + // Only valid if is_heap_allocated() + SymNode toSymNodeImpl() const; + + // Guaranteed to return a SymNode, wrapping using base if necessary + SymNode wrap_node(const SymNode& base) const; + + bool expect_bool() const { + c10::optional c = maybe_as_bool(); + TORCH_CHECK(c.has_value()); + return *c; + } + + SymBool sym_and(const SymBool&) const; + SymBool sym_or(const SymBool&) const; + SymBool sym_not() const; + + SymBool operator&(const SymBool& other) const { + return sym_and(other); + } + SymBool operator|(const SymBool& other) const { + return sym_or(other); + } + SymBool operator~() const { + return sym_not(); + } + + // Insert a guard for the bool to be its concrete value, and then return + // that value. Note that C++ comparison operations default to returning + // bool, so it's not so common to have to call this + bool guard_bool(const char* file, int64_t line) const; + bool expect_true(const char* file, int64_t line) const; + bool guard_size_oblivious(const char* file, int64_t line) const; + + bool has_hint() const; + + bool as_bool_unchecked() const { + return data_; + } + + c10::optional maybe_as_bool() const { + if (!is_heap_allocated()) { + return c10::make_optional(data_); + } + return toSymNodeImplUnowned()->constant_bool(); + } + + bool is_heap_allocated() const { + return ptr_; + } + + private: + // TODO: optimize to union + bool data_; + SymNode ptr_; +}; + +C10_API std::ostream& operator<<(std::ostream& os, const SymBool& s); + +#define TORCH_SYM_CHECK(cond, ...) \ + TORCH_CHECK((cond).expect_true(__FILE__, __LINE__), __VA_ARGS__) +#define TORCH_SYM_INTERNAL_ASSERT(cond, ...) 
\ + TORCH_INTERNAL_ASSERT((cond).expect_true(__FILE__, __LINE__), __VA_ARGS__) + +inline bool guard_size_oblivious(bool b, const char* file, int64_t line) { + return b; +} + +inline bool guard_size_oblivious( + const c10::SymBool& b, + const char* file, + int64_t line) { + return b.guard_size_oblivious(file, line); +} + +#define TORCH_GUARD_SIZE_OBLIVIOUS(cond) \ + c10::guard_size_oblivious((cond), __FILE__, __LINE__) + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/SymFloat.h b/MLPY/Lib/site-packages/torch/include/c10/core/SymFloat.h new file mode 100644 index 0000000000000000000000000000000000000000..38dfb025f850af31af8415cacfd143e9b63ff0bb --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/SymFloat.h @@ -0,0 +1,113 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace c10 { + +// NB: this is actually double precision; we're using the Python naming here +class C10_API SymFloat { + public: + /*implicit*/ SymFloat(double d) : data_(d){}; + SymFloat(SymNode ptr) + : data_(std::numeric_limits::quiet_NaN()), ptr_(std::move(ptr)) { + TORCH_CHECK(ptr_->is_float()); + }; + SymFloat() : data_(0.0) {} + + SymNodeImpl* toSymNodeImplUnowned() const { + return ptr_.get(); + } + + SymNodeImpl* release() && { + return std::move(ptr_).release(); + } + + // Only valid if is_symbolic() + SymNode toSymNodeImpl() const; + + // Guaranteed to return a SymNode, wrapping using base if necessary + SymNode wrap_node(const SymNode& base) const; + + double expect_float() const { + TORCH_CHECK(!is_symbolic()); + return data_; + } + + SymFloat operator+(const SymFloat&) const; + SymFloat operator-(const SymFloat&) const; + SymFloat operator*(const SymFloat&) const; + SymFloat operator/(const SymFloat&) const; + + SymBool sym_eq(const SymFloat&) const; + SymBool sym_ne(const SymFloat&) const; + SymBool sym_lt(const SymFloat&) const; + SymBool sym_le(const SymFloat&) const; + SymBool sym_gt(const SymFloat&) const; + SymBool sym_ge(const SymFloat&) const; + + bool operator==(const SymFloat& o) const { + return sym_eq(o).guard_bool(__FILE__, __LINE__); + } + bool operator!=(const SymFloat& o) const { + return sym_ne(o).guard_bool(__FILE__, __LINE__); + } + bool operator<(const SymFloat& o) const { + return sym_lt(o).guard_bool(__FILE__, __LINE__); + } + bool operator<=(const SymFloat& o) const { + return sym_le(o).guard_bool(__FILE__, __LINE__); + } + bool operator>(const SymFloat& o) const { + return sym_gt(o).guard_bool(__FILE__, __LINE__); + } + bool operator>=(const SymFloat& o) const { + return sym_ge(o).guard_bool(__FILE__, __LINE__); + } + + SymFloat min(const SymFloat& sci) const; + SymFloat max(const SymFloat& sci) const; + + // Need guidance on where to put this code + SymFloat sqrt() const; + + // Insert a guard for the float to be its concrete value, and then return + // that value. This operation always works, even if the float is symbolic, + // so long as we know what the underlying value is. Don't blindly put this + // everywhere; you can cause overspecialization of PyTorch programs with + // this method. + // + // It should be called as guard_float(__FILE__, __LINE__). The file and line + // number can be used to diagnose overspecialization. + double guard_float(const char* file, int64_t line) const; + + bool has_hint() const; + + // N.B. 
It's important to keep this definition in the header + // as we expect if checks to be folded for mobile builds + // where `is_symbolic` is always false + C10_ALWAYS_INLINE bool is_symbolic() const { + return ptr_; + } + + double as_float_unchecked() const { + return data_; + } + + private: + // TODO: optimize to union + double data_; + SymNode ptr_; +}; + +C10_API std::ostream& operator<<(std::ostream& os, const SymFloat& s); +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/SymInt.h b/MLPY/Lib/site-packages/torch/include/c10/core/SymInt.h new file mode 100644 index 0000000000000000000000000000000000000000..8c1bba01c0065ec1d80062cb7fef7adddea36fac --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/SymInt.h @@ -0,0 +1,423 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace c10 { + +class SymFloat; + +// SymInt represents either a regular int64_t, or a symbolic integer +// (represented in a type erased way as SymNode). The intention is for SymInt +// to represent symbolic sizes that arise when doing shape computation in +// operator kernels. This allows for tracing through programs without baking in +// concrete sizes into kernel calls. +// +// SymInt has an API equivalent to int64_t. In particular, it is a value type. +// Internally, SymInt is represented in a clever packed way, so that it only +// occupies one word of space; but morally, it is a union between an int64_t +// and an intrusive pointer to SymNodeImpl. +// +// Invariant: the referenced SymNodeImpl is guaranteed to be a SymNode where +// is_int() returns true + +class C10_API SymInt { + public: + enum Unchecked { + UNCHECKED, + }; + + /*implicit*/ SymInt(int64_t d) : data_(d) { + if (is_heap_allocated()) { + // Large negative number, heap allocate it + promote_to_negative(); + } + }; + SymInt() : data_(0) {} + SymInt(SymNode n); + + // unchecked c-tor accepting raw `data_` + // One appropriate use for this is when you are constructing a symint + // in a situation where you know it is non-negative (or, if it is negative, + // the negative value is -1; i.e., not user controlled) + SymInt(Unchecked, int64_t d) : data_(d) {} + + // TODO: these implementations are not optimal because they allocate a + // temporary and then use the move constructor/assignment + SymInt(const SymInt& s) : data_(0) { + if (s.is_heap_allocated()) { + *this = SymInt(s.toSymNode()); + } else { + data_ = s.data_; + } + } + SymInt(SymInt&& s) noexcept : data_(s.data_) { + s.data_ = 0; + } + + SymInt& operator=(const SymInt& s) { + if (this != &s) { + if (s.is_heap_allocated()) { + *this = SymInt(s.toSymNode()); + } else { + data_ = s.data_; + } + } + return *this; + } + SymInt& operator=(SymInt&& s) noexcept { + if (this != &s) { + release_(); // release the current SymNode if any + data_ = s.data_; + if (s.is_heap_allocated()) + s.data_ = 0; + }; + return *this; + } + + SymNodeImpl* toSymNodeImplUnowned() const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(is_heap_allocated()); + uint64_t unextended_bits = static_cast(data_) & ~MASK; + uint64_t sign_bit_mask = 1ULL << (62 - 1); + // https://stackoverflow.com/questions/42534749/signed-extension-from-24-bit-to-32-bit-in-c + uint64_t extended_bits = (unextended_bits ^ sign_bit_mask) - sign_bit_mask; + return static_cast( + // NOLINTNEXTLINE(performance-no-int-to-ptr) + reinterpret_cast(static_cast(extended_bits))); + } + + void release_() { + if (is_heap_allocated()) { + 
SymNode::reclaim(toSymNodeImplUnowned()); // steal + } + } + + SymNodeImpl* release() && { +#ifndef C10_MOBILE + TORCH_INTERNAL_ASSERT(is_heap_allocated()); + auto* r = toSymNodeImplUnowned(); + data_ = 0; // transfer ownership + return r; +#else + TORCH_INTERNAL_ASSERT(false); +#endif + } + + // Only valid if is_heap_allocated() + SymNode toSymNode() const; + + // Guaranteed to return a SymNode, wrapping using base if necessary + SymNode wrap_node(const SymNode& base) const; + + ~SymInt() { + release_(); + } + + // Require the int to be non-symbolic, and if it is symbolic raise an + // error. This is safe to use for C++ code that doesn't work for symbolic + // shapes, and you don't have time to fix it immediately, as if we + // try to trigger the path in C++ you'll appropriately get an error + int64_t expect_int() const { + if (auto r = maybe_as_int()) { + return *r; + } + TORCH_CHECK_ALWAYS_SHOW_CPP_STACKTRACE( + false, "when unpacking SymInt, expected int but got ", *this); + } + + // Test if we have a hint for this int (e.g., guard_int would work). + // Most of the time this is true; it is only false when you have + // an unbacked SymInt. + bool has_hint() const; + + // Insert a guard for the int to be its concrete value, and then return + // that value. This operation always works, even if the int is symbolic, + // so long as we know what the underlying value is (e.g., this won't work + // if you call it on the size of nonzero output). Don't blindly put this + // everywhere; you can cause overspecialization of PyTorch programs with + // this method. + // + // It should be called as guard_int(__FILE__, __LINE__). The file and line + // number can be used to diagnose overspecialization. + int64_t guard_int(const char* file, int64_t line) const; + + // Insert a guard that this SymInt must be size-like, returning true if + // the integer actually is >= 0. Unlike manually performing a >= 0 test, + // if the SymInt in question is an unbacked SymInt (or, potentially in the + // future, if it contains unbacked SymInts), we will also treat the + // unbacked SymInt as statically testing >= 2 (which will prevent us from + // choking on, e.g., contiguity checks.) + bool expect_size(const char* file, int64_t line) const; + + // Distinguish actual symbolic values from constants stored on the heap + bool is_symbolic() const { + return is_heap_allocated() && + !toSymNodeImplUnowned()->constant_int().has_value(); + } + + // N.B. 
It's important to keep this definition in the header + // as we expect if checks to be folded for mobile builds + // where `is_heap_allocated` is always false and optimize dead code paths + C10_ALWAYS_INLINE bool is_heap_allocated() const { +#ifdef C10_MOBILE + return false; +#else + return !check_range(data_); +#endif + } + + SymInt operator+(const SymInt& sci) const; + SymInt operator-(const SymInt& sci) const; + SymInt operator*(const SymInt& sci) const; + SymInt operator/(const SymInt& sci) const; + SymInt operator%(const SymInt& sci) const; + void operator*=(const SymInt& sci); + void operator+=(const SymInt& sci); + void operator/=(const SymInt& sci); + + SymInt clone() const; + + SymBool sym_eq(const SymInt&) const; + SymBool sym_ne(const SymInt&) const; + SymBool sym_lt(const SymInt&) const; + SymBool sym_le(const SymInt&) const; + SymBool sym_gt(const SymInt&) const; + SymBool sym_ge(const SymInt&) const; + + bool operator==(const SymInt& o) const { + return sym_eq(o).guard_bool(__FILE__, __LINE__); + } + bool operator!=(const SymInt& o) const { + return sym_ne(o).guard_bool(__FILE__, __LINE__); + } + bool operator<(const SymInt& o) const { + return sym_lt(o).guard_bool(__FILE__, __LINE__); + } + bool operator<=(const SymInt& o) const { + return sym_le(o).guard_bool(__FILE__, __LINE__); + } + bool operator>(const SymInt& o) const { + return sym_gt(o).guard_bool(__FILE__, __LINE__); + } + bool operator>=(const SymInt& o) const { + return sym_ge(o).guard_bool(__FILE__, __LINE__); + } + + SymInt min(const SymInt& sci) const; + SymInt max(const SymInt& sci) const; + + // If both are symbolic, this checks if + // they share the same node. + // If both are not symbolic this just checks normal equality. + bool is_same(const SymInt& other) const; + + operator SymFloat() const; + + // Don't use this. Prefer maybe_as_int instead + int64_t as_int_unchecked() const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!is_heap_allocated()); + return data_; + } + + c10::optional maybe_as_int() const { + if (!is_heap_allocated()) { + return c10::make_optional(data_); + } + auto* node = toSymNodeImplUnowned(); + if (auto c = node->constant_int()) { + return c; + } + return node->maybe_as_int(); + } + + // Return whether the integer is directly coercible to a SymInt + // without requiring heap allocation. You don't need to use this + // to check if you can pass an integer to SymInt; this is guaranteed + // to work (it just might heap allocate!) + static bool check_range(int64_t i) { + return i > MAX_UNREPRESENTABLE_INT; + } + + // Return the min representable integer as a SymInt without + // heap allocation. For quantities that count bytes (or larger), + // this is still much larger than you need, so you may consider + // using this as a more efficient version of MIN_INT + static constexpr int64_t min_representable_int() { + return MAX_UNREPRESENTABLE_INT + 1; + } + + private: + void promote_to_negative(); + + // Constraints on the internal representation: + // + // - Should represent positive and small negative ints + // - No conversion necessary for operations on ints + // - Must represent valid 64-bit pointers + // - Is symbolic test should be FAST (two arithmetic instructions is too + // much). + // This code being a hotpath is based on Strobelight profiles of + // is_heap_allocated(). FB only: https://fburl.com/strobelight/5l50ncxd + // (you will need to change the time window). + // + // So, the scheme is to reserve large negative numbers (assuming + // two's complement): + // + // - 0b0.... 
means we are a positive int + // - 0b11... means we are a small negative int + // - 0b10... means we are are a pointer. This means that + // [-2^63, -2^62-1] are not representable as ints. + // We don't actually need all of this space as on x86_64 + // as the top 16bits aren't used for anything + static constexpr uint64_t MASK = 1ULL << 63 | 1ULL << 62 | 1ULL << 61; + static constexpr uint64_t IS_SYM = 1ULL << 63 | 1ULL << 61; + // We must manually translate the bit pattern test into a greater + // than test because compiler doesn't figure it out: + // https://godbolt.org/z/356aferaW + static constexpr int64_t MAX_UNREPRESENTABLE_INT = + -1LL & static_cast(~(1ULL << 62)); + int64_t data_; +}; + +/// Sum of a list of SymInt; accumulates into the c10::SymInt expression +template < + typename C, + typename std::enable_if_t< + std::is_same_v, + int> = 0> +inline c10::SymInt multiply_integers(const C& container) { + return std::accumulate( + container.begin(), + container.end(), + c10::SymInt(1), + [](const c10::SymInt& a, const c10::SymInt& b) { return a * b; }); +} + +template < + typename Iter, + typename = std::enable_if_t::value_type, + c10::SymInt>>> +inline c10::SymInt multiply_integers(Iter begin, Iter end) { + return std::accumulate( + begin, + end, + c10::SymInt(1), + [](const c10::SymInt& a, const c10::SymInt& b) { return a * b; }); +} + +#define DECLARE_SYMINT_OP_INTONLY(scalar_t, RetTy) \ + C10_API RetTy operator%(const SymInt& a, scalar_t b); \ + C10_API RetTy operator%(scalar_t a, const SymInt& b); + +#define DECLARE_SYMINT_OP(scalar_t, RetTy) \ + C10_API RetTy operator+(const SymInt& a, scalar_t b); \ + C10_API RetTy operator-(const SymInt& a, scalar_t b); \ + C10_API RetTy operator*(const SymInt& a, scalar_t b); \ + C10_API RetTy operator/(const SymInt& a, scalar_t b); \ + C10_API RetTy operator+(scalar_t a, const SymInt& b); \ + C10_API RetTy operator-(scalar_t a, const SymInt& b); \ + C10_API RetTy operator*(scalar_t a, const SymInt& b); \ + C10_API RetTy operator/(scalar_t a, const SymInt& b); \ + C10_API bool operator==(const SymInt& a, scalar_t b); \ + C10_API bool operator!=(const SymInt& a, scalar_t b); \ + C10_API bool operator<(const SymInt& a, scalar_t b); \ + C10_API bool operator<=(const SymInt& a, scalar_t b); \ + C10_API bool operator>(const SymInt& a, scalar_t b); \ + C10_API bool operator>=(const SymInt& a, scalar_t b); \ + C10_API bool operator==(scalar_t a, const SymInt& b); \ + C10_API bool operator!=(scalar_t a, const SymInt& b); \ + C10_API bool operator<(scalar_t a, const SymInt& b); \ + C10_API bool operator<=(scalar_t a, const SymInt& b); \ + C10_API bool operator>(scalar_t a, const SymInt& b); \ + C10_API bool operator>=(scalar_t a, const SymInt& b); + +DECLARE_SYMINT_OP_INTONLY(int64_t, SymInt) +DECLARE_SYMINT_OP_INTONLY(int32_t, SymInt) +DECLARE_SYMINT_OP_INTONLY(uint64_t, SymInt) +DECLARE_SYMINT_OP_INTONLY(uint32_t, SymInt) +DECLARE_SYMINT_OP(int64_t, SymInt) +DECLARE_SYMINT_OP(int32_t, SymInt) // make sure constants work +DECLARE_SYMINT_OP(uint64_t, SymInt) +DECLARE_SYMINT_OP(uint32_t, SymInt) +DECLARE_SYMINT_OP(double, SymFloat) +DECLARE_SYMINT_OP(float, SymFloat) // just for completeness + +// On OSX size_t is different than uint64_t so we have to +// define it separately +#if defined(__APPLE__) +DECLARE_SYMINT_OP_INTONLY(size_t, SymInt) +DECLARE_SYMINT_OP(size_t, SymInt) +#endif + +#undef DECLARE_SYMINT_OP + +C10_API std::ostream& operator<<(std::ostream& os, const SymInt& s); +C10_API SymInt operator-(const SymInt& s); + +inline bool 
sym_eq(int64_t a, int64_t b) { + return a == b; +} + +inline SymBool sym_eq(const SymInt& a, const SymInt& b) { + return a.sym_eq(b); +} + +inline bool sym_ne(int64_t a, int64_t b) { + return a != b; +} + +inline SymBool sym_ne(const SymInt& a, const SymInt& b) { + return a.sym_ne(b); +} + +inline bool sym_lt(int64_t a, int64_t b) { + return a < b; +} + +inline SymBool sym_lt(const SymInt& a, const SymInt& b) { + return a.sym_lt(b); +} + +inline bool sym_le(int64_t a, int64_t b) { + return a <= b; +} + +inline SymBool sym_le(const SymInt& a, const SymInt& b) { + return a.sym_le(b); +} + +inline bool sym_gt(int64_t a, int64_t b) { + return a > b; +} + +inline SymBool sym_gt(const SymInt& a, const SymInt& b) { + return a.sym_gt(b); +} + +inline bool sym_ge(int64_t a, int64_t b) { + return a >= b; +} + +inline SymBool sym_ge(const SymInt& a, const SymInt& b) { + return a.sym_ge(b); +} + +inline bool definitely_true( + const c10::SymBool& b, + const char* file, + int64_t line) { + return b.has_hint() && b.guard_bool(file, line); +} + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/SymIntArrayRef.h b/MLPY/Lib/site-packages/torch/include/c10/core/SymIntArrayRef.h new file mode 100644 index 0000000000000000000000000000000000000000..91557143e9d8627721379a50dcff92fa60a488c5 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/SymIntArrayRef.h @@ -0,0 +1,72 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace c10 { +using SymIntArrayRef = ArrayRef; + +inline at::IntArrayRef asIntArrayRefUnchecked(c10::SymIntArrayRef ar) { + return IntArrayRef(reinterpret_cast(ar.data()), ar.size()); +} + +// TODO: a SymIntArrayRef containing a heap allocated large negative integer +// can actually technically be converted to an IntArrayRef... but not with +// the non-owning API we have here. We can't reinterpet cast; we have to +// allocate another buffer and write the integers into it. If you need it, +// we can do it. But I don't think you need it. 
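+
+// Illustrative usage sketch (not part of the upstream header) of the
+// conversion helpers declared below. `sym_sizes` stands for any
+// SymIntArrayRef (for example, sizes coming from a symbolically traced
+// tensor), and `use_int_sizes` is a hypothetical consumer of concrete sizes:
+//
+//   if (auto concrete = c10::asIntArrayRefSlowOpt(sym_sizes)) {
+//     use_int_sizes(*concrete);       // no heap-allocated (symbolic) SymInts
+//   } else {
+//     // At least one element is symbolic: handle the symbolic path here, or,
+//     // where symbolic sizes would be a bug, fail loudly with
+//     //   C10_AS_INTARRAYREF_SLOW(sym_sizes)
+//     // which raises an error pointing at the offending file/line.
+//   }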
+ +inline c10::optional asIntArrayRefSlowOpt( + c10::SymIntArrayRef ar) { + for (const c10::SymInt& sci : ar) { + if (sci.is_heap_allocated()) { + return c10::nullopt; + } + } + + return {asIntArrayRefUnchecked(ar)}; +} + +inline at::IntArrayRef asIntArrayRefSlow( + c10::SymIntArrayRef ar, + const char* file, + int64_t line) { + for (const c10::SymInt& sci : ar) { + TORCH_CHECK( + !sci.is_heap_allocated(), + file, + ":", + line, + ": SymIntArrayRef expected to contain only concrete integers"); + } + return asIntArrayRefUnchecked(ar); +} + +#define C10_AS_INTARRAYREF_SLOW(a) c10::asIntArrayRefSlow(a, __FILE__, __LINE__) + +// Prefer using a more semantic constructor, like +// fromIntArrayRefKnownNonNegative +inline SymIntArrayRef fromIntArrayRefUnchecked(IntArrayRef array_ref) { + return SymIntArrayRef( + reinterpret_cast(array_ref.data()), array_ref.size()); +} + +inline SymIntArrayRef fromIntArrayRefKnownNonNegative(IntArrayRef array_ref) { + return fromIntArrayRefUnchecked(array_ref); +} + +inline SymIntArrayRef fromIntArrayRefSlow(IntArrayRef array_ref) { + for (long i : array_ref) { + TORCH_CHECK( + SymInt::check_range(i), + "IntArrayRef contains an int that cannot be represented as a SymInt: ", + i); + } + return SymIntArrayRef( + reinterpret_cast(array_ref.data()), array_ref.size()); +} + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/SymNodeImpl.h b/MLPY/Lib/site-packages/torch/include/c10/core/SymNodeImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..02fba001736a2320c1ed131d91c2f2ac9ed2450e --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/SymNodeImpl.h @@ -0,0 +1,218 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +class SymNodeImpl; +using SymNode = c10::intrusive_ptr; + +// When you add a method, you also need to edit +// torch/csrc/jit/python/init.cpp +// torch/csrc/utils/python_symnode.h +// c10/core/ConstantSymNodeImpl.h +class C10_API SymNodeImpl : public c10::intrusive_ptr_target { + public: + ~SymNodeImpl() override = default; + + template + c10::intrusive_ptr dyn_cast() const { + return c10::intrusive_ptr::reclaim_copy(dynamic_cast(this)); + } + + // these could be pure virtual when we implement LTC versions + virtual bool is_int() { + TORCH_CHECK(false, "NYI"); + }; + virtual bool is_bool() { + TORCH_CHECK(false, "NYI"); + }; + virtual bool is_float() { + TORCH_CHECK(false, "NYI"); + }; + virtual bool is_nested_int() const { + return false; + }; + virtual SymNode add(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + }; + virtual SymNode sub(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + }; + virtual SymNode mul(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + }; + virtual SymNode truediv(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + }; + virtual SymNode pow(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + }; + virtual SymNode floordiv(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + }; + virtual SymNode mod(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + }; + virtual SymNode eq(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + }; + virtual SymNode ne(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + }; + virtual SymNode gt(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + }; + virtual SymNode lt(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + }; + virtual SymNode le(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + }; + virtual SymNode ge(const SymNode& 
other) { + TORCH_CHECK(false, "NYI"); + }; + virtual SymNode ceil() { + TORCH_CHECK(false, "NYI"); + }; + virtual SymNode floor() { + TORCH_CHECK(false, "NYI"); + }; + virtual SymNode neg() { + TORCH_CHECK(false, "NYI"); + }; + virtual SymNode sym_min(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + }; + virtual SymNode sym_max(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + }; + virtual SymNode sym_or(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + }; + virtual SymNode sym_and(const SymNode& other) { + TORCH_CHECK(false, "NYI"); + }; + virtual SymNode sym_not() { + TORCH_CHECK(false, "NYI"); + }; + virtual SymNode sym_ite(const SymNode& then_val, const SymNode& else_val) { + TORCH_CHECK(false, "NYI"); + }; + // NB: self is ignored here, only the arguments are used + virtual SymNode is_contiguous( + ArrayRef sizes, + ArrayRef strides) { + TORCH_CHECK(false, "NYI"); + }; + virtual SymNode is_channels_last_contiguous_2d( + ArrayRef sizes, + ArrayRef strides) { + TORCH_CHECK(false, "NYI"); + }; + virtual SymNode is_channels_last_contiguous_3d( + ArrayRef sizes, + ArrayRef strides) { + TORCH_CHECK(false, "NYI"); + }; + virtual SymNode is_channels_last_strides_2d( + ArrayRef sizes, + ArrayRef strides) { + TORCH_CHECK(false, "NYI"); + }; + virtual SymNode is_channels_last_strides_3d( + ArrayRef sizes, + ArrayRef strides) { + TORCH_CHECK(false, "NYI"); + }; + virtual SymNode is_non_overlapping_and_dense( + ArrayRef sizes, + ArrayRef strides) { + TORCH_CHECK(false, "NYI"); + }; + virtual SymNode clone() { + TORCH_CHECK(false, "NYI"); + }; + virtual SymNode sym_float() { + TORCH_CHECK(false, "NYI"); + } + virtual SymNode wrap_int(int64_t num) { + TORCH_CHECK(false, "NYI"); + }; + virtual SymNode wrap_float(double num) { + TORCH_CHECK(false, "NYI"); + }; + virtual SymNode wrap_bool(bool num) { + TORCH_CHECK(false, "NYI"); + }; + virtual int64_t guard_int(const char* file, int64_t line) { + TORCH_CHECK(false, "NYI"); + }; + virtual bool guard_bool(const char* file, int64_t line) { + TORCH_CHECK(false, "NYI"); + }; + virtual double guard_float(const char* file, int64_t line) { + TORCH_CHECK(false, "NYI"); + }; + virtual bool guard_size_oblivious(const char* file, int64_t line) { + // No improvement for unbacked SymBools by default, replace this + // with a better implementation! + return guard_bool(file, line); + } + virtual bool expect_true(const char* file, int64_t line) { + // No improvement for unbacked SymBools by default, replace this + // with a better implementation! + return guard_bool(file, line); + }; + virtual bool expect_size(const char* file, int64_t line) { + // No improvement for unbacked SymInts by default, replace this + // with a better implementation! 
+ return ge(wrap_int(0))->guard_bool(file, line); + }; + virtual int64_t int_() { + TORCH_CHECK(false, "NYI"); + }; + virtual bool bool_() { + TORCH_CHECK(false, "NYI"); + }; + virtual bool has_hint() { + TORCH_CHECK(false, "NYI"); + }; + virtual std::string str() { + TORCH_CHECK(false, "NYI"); + }; + virtual c10::optional nested_int() { + return c10::nullopt; + } + virtual c10::optional nested_int_coeff() { + return c10::nullopt; + } + virtual c10::optional constant_int() { + return c10::nullopt; + } + virtual c10::optional constant_bool() { + return c10::nullopt; + } + virtual c10::optional maybe_as_int() { + return c10::nullopt; + } + virtual bool is_constant() { + return false; + } + virtual bool is_symbolic() { + return true; + } + std::ostream& operator<<(std::ostream& os) { + os << str(); + return os; + } +}; + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/SymbolicShapeMeta.h b/MLPY/Lib/site-packages/torch/include/c10/core/SymbolicShapeMeta.h new file mode 100644 index 0000000000000000000000000000000000000000..f68f87e9486833aff29924da4afdad18a788b1a9 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/SymbolicShapeMeta.h @@ -0,0 +1,214 @@ +#pragma once +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace c10 { + +class C10_API SymbolicShapeMeta { + public: + // Basic metadata from which other quantities are derived + SymDimVector sizes_ = {0}; + SymDimVector strides_ = {1}; + SymInt storage_offset_ = 0; + + bool strides_valid_ = true; // e.g. for sparse where there are no strides + + SymbolicShapeMeta() = default; + SymbolicShapeMeta(const SymbolicShapeMeta& other); + + void refresh_numel() { + // Non-const, don't need to hold mutables_ lock + available_.fetch_and(~numel_avail); + numel_ = 1; + } + + void refresh_contiguous() { + // Non-const, don't need to hold mutables_ lock + available_.fetch_and(numel_avail); + is_contiguous_ = false; + is_channels_last_contiguous_ = false; + is_channels_last_3d_contiguous_ = false; + is_channels_last_ = false; + is_channels_last_3d_ = false; + is_non_overlapping_and_dense_ = false; + } + + int64_t dim() const { + return static_cast(sizes_.size()); + } + + // Accessors for derived quantities, computed lazily on first access + + bool has_numel() const { + return available_.load() & numel_avail; + } + bool has_is_contiguous() const { + return available_.load() & is_contiguous_avail; + } + bool has_is_channels_last_contiguous() const { + return available_.load() & is_channels_last_contiguous_avail; + } + bool has_is_channels_last_3d_contiguous() const { + return available_.load() & is_channels_last_3d_contiguous_avail; + } + bool has_is_channels_last() const { + return available_.load() & is_channels_last_avail; + } + bool has_is_channels_last_3d() const { + return available_.load() & is_channels_last_3d_avail; + } + bool has_is_non_overlapping_and_dense() const { + return available_.load() & is_non_overlapping_and_dense_avail; + } + + // Accessors to cached derived properties + // DO NOT call with mutables_ lock held + const SymInt& numel() const { + if (C10_UNLIKELY(!has_numel())) { + init_numel(); + } + return numel_; + } + + const SymBool& is_contiguous() const { + if (C10_UNLIKELY(!has_is_contiguous())) { + init_is_contiguous(); + } + return is_contiguous_; + } + + const SymBool& is_channels_last_contiguous() const { + if (C10_UNLIKELY(!has_is_channels_last_contiguous())) { + init_is_channels_last_contiguous(); + } + return 
is_channels_last_contiguous_; + } + + const SymBool& is_channels_last_3d_contiguous() const { + if (C10_UNLIKELY(!has_is_channels_last_3d_contiguous())) { + init_is_channels_last_3d_contiguous(); + } + return is_channels_last_3d_contiguous_; + } + + const SymBool& is_channels_last() const { + if (C10_UNLIKELY(!has_is_channels_last())) { + init_is_channels_last(); + } + return is_channels_last_; + } + + const SymBool& is_channels_last_3d() const { + if (C10_UNLIKELY(!has_is_channels_last_3d())) { + init_is_channels_last_3d(); + } + return is_channels_last_3d_; + } + + const SymBool& is_non_overlapping_and_dense() const { + if (C10_UNLIKELY(!has_is_non_overlapping_and_dense())) { + init_is_non_overlapping_and_dense(); + } + return is_non_overlapping_and_dense_; + } + + // Assumptions so we can short-circuit computation + // NOTE: Don't need to lock mutables_ since these aren't const + void assume_contiguous(SymBool val = true) { + is_contiguous_ = std::move(val); + available_.fetch_or(is_contiguous_avail); + } + void assume_channels_last_contiguous(SymBool val = true) { + is_contiguous_ = std::move(val); + available_.fetch_or(is_channels_last_contiguous_avail); + } + void assume_channels_last_3d_contiguous(SymBool val = true) { + is_channels_last_3d_contiguous_ = std::move(val); + available_.fetch_or(is_channels_last_3d_contiguous_avail); + } + void assume_channels_last(SymBool val = true) { + is_channels_last_ = std::move(val); + available_.fetch_or(is_channels_last_avail); + } + void assume_channels_last_3d(SymBool val = true) { + is_channels_last_3d_ = std::move(val); + available_.fetch_or(is_channels_last_3d_avail); + } + void assume_non_overlapping_and_dense(SymBool val = true) { + is_non_overlapping_and_dense_ = std::move(val); + available_.fetch_or(is_non_overlapping_and_dense_avail); + } + + private: + SymBool compute_contiguous() const; + SymBool compute_channels_last_contiguous_2d() const; + SymBool compute_channels_last_contiguous_3d() const; + SymBool compute_strides_like_channels_last_2d() const; + SymBool compute_strides_like_channels_last_3d() const; + SymBool compute_non_overlapping_and_dense() const; + + // These are little wrappers over the real compute_ functions that + // can make use of other contiguity fields to short circuit. + // They need to be implemented separately for SymBool, as SymBool does + // not short circuit. + // TODO: should the SymBool cases avoid the short circuit? Need to reason + // if its correct, and reason if the simpler expressions are better for + // analysis (maybe not!) 
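+
+  // Illustrative sketch (not part of the upstream header) of how the const
+  // accessors above lazily initialize a derived quantity, simplified to omit
+  // the mutables_ lock taken inside the init_ helpers:
+  //
+  //   const SymInt& numel() const {
+  //     if (!(available_.load() & numel_avail)) {  // i.e. !has_numel()
+  //       init_numel();   // computes the product of sizes_ and, via
+  //                       // set_numel(), marks numel_avail in available_
+  //     }
+  //     return numel_;    // the bit is only published after numel_ is written
+  //   }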
+ + SymBool compute_channels_last_contiguous_3d_dim5() const; + SymBool compute_channels_last_2d_dim5() const; + SymBool compute_channels_last_3d_dim5() const; + SymBool compute_is_non_overlapping_and_dense_dim4() const; + SymBool compute_is_non_overlapping_and_dense_dim5() const; + SymBool compute_is_non_overlapping_and_dense_anydim() const; + + void init_numel() const; + void init_is_contiguous() const; + void init_is_channels_last_contiguous() const; + void init_is_channels_last_3d_contiguous() const; + void init_is_channels_last() const; + void init_is_channels_last_3d() const; + void init_is_non_overlapping_and_dense() const; + + // NOTE: These only set if !has_foo() + void set_numel(SymInt val) const; + void set_is_contiguous(SymBool val) const; + void set_is_channels_last_contiguous(SymBool val) const; + void set_is_channels_last_3d_contiguous(SymBool val) const; + void set_is_channels_last(SymBool val) const; + void set_is_channels_last_3d(SymBool val) const; + void set_is_non_overlapping_and_dense(SymBool val) const; + + // Lazily initialized variables, with the corresponding available_ flag + // indicating whether the value has been initialized + mutable std::atomic available_{0}; + enum avail { + numel_avail = 1 << 0, + is_contiguous_avail = 1 << 1, + is_channels_last_contiguous_avail = 1 << 2, + is_channels_last_3d_contiguous_avail = 1 << 3, + is_channels_last_avail = 1 << 4, + is_channels_last_3d_avail = 1 << 5, + is_non_overlapping_and_dense_avail = 1 << 6, + }; + + // Mutex to prevent races when initializing the variable from const accessors + mutable std::mutex mutables_; + mutable SymInt numel_ = 1; + mutable SymBool is_contiguous_{true}; + mutable SymBool is_channels_last_contiguous_{false}; + mutable SymBool is_channels_last_3d_contiguous_{false}; + mutable SymBool is_channels_last_{false}; + mutable SymBool is_channels_last_3d_{false}; + mutable SymBool is_non_overlapping_and_dense_{true}; +}; + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/TensorImpl.h b/MLPY/Lib/site-packages/torch/include/c10/core/TensorImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..adfaa1adb5419fc692777994db1585ab7fae2923 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/TensorImpl.h @@ -0,0 +1,3249 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// A global boolean variable to control whether we free memory when a Tensor +// is shrunk to a smaller size. As a result, a Tensor is always going to +// keep the memory allocated for its maximum capacity reshaped to so far. +// +// This parameter is respected "upper-case" methods which call Resize() +// (e.g., CopyFrom, ResizeLike); it is NOT respected by Tensor::resize_ +// or ShrinkTo, both of which guarantee to never to free memory. +C10_DECLARE_bool(caffe2_keep_on_shrink); + +// Since we can have high variance in blob memory allocated across different +// inputs in the same run, we will shrink the blob only if the memory gain +// is larger than this flag in bytes. This only applies to functions which +// respect caffe2_keep_on_shrink. 
+C10_DECLARE_int64(caffe2_max_keep_on_shrink_memory); + +namespace at { +class Tensor; +class TensorBase; +} // namespace at + +namespace c10 { + +/** + * A utility function to convert vector to vector. + */ +inline std::vector ToVectorint64_t(const ArrayRef& src) { + return std::vector(src.begin(), src.end()); +} + +/** + * Return product of all dimensions starting from k + */ +inline int64_t size_from_dim_(int k, IntArrayRef dims) { + int64_t r = 1; + for (const auto i : c10::irange(k, dims.size())) { + r *= dims[i]; + } + return r; +} + +// Product of all dims up to k (not including dims[k]) +inline int64_t size_to_dim_(int k, IntArrayRef dims) { + TORCH_CHECK(k >= 0 && static_cast(k) <= dims.size()); + int64_t r = 1; + for (const auto i : c10::irange(k)) { + r *= dims[i]; + } + return r; +} + +// Product of all dims between k and l (not including dims[k] and dims[l]) +inline int64_t size_between_dim_(int k, int l, IntArrayRef dims) { + TORCH_CHECK((unsigned)l < dims.size() && (unsigned)k < dims.size()); + int64_t r = 1; + if (k < l) { + for (int i = k + 1; i < l; ++i) { + r *= dims[i]; + } + } else { + for (int i = l + 1; i < k; ++i) { + r *= dims[i]; + } + } + return r; +} + +// Wrap around axis_index if it is negative, s.t., -1 is the last dim +inline int canonical_axis_index_(int axis_index, int ndims) { + TORCH_CHECK(axis_index >= -ndims); + TORCH_CHECK(axis_index < ndims); + if (axis_index < 0) { + return axis_index + ndims; + } + return axis_index; +} + +using PlacementDtor = void (*)(void*, size_t); + +/* + * A Context that will call extra placement deleter during + * deconstruction. + * + * Accept a already constructed DataPtr and store it as member + * during destruction, we'll call extra deleter on the underlying + * data pointer before the DataPtr is destructed. + * `data_ptr_` owns the memory. + */ +struct C10_API PlacementDeleteContext { + DataPtr data_ptr_; + PlacementDtor placement_dtor_; + size_t size_; + PlacementDeleteContext( + DataPtr&& data_ptr, + PlacementDtor placement_dtor, + size_t size) + : data_ptr_(std::move(data_ptr)), + placement_dtor_(placement_dtor), + size_(size) {} + static DataPtr makeDataPtr( + DataPtr&& data_ptr, + PlacementDtor placement_dtor, + size_t size, + Device device); + ~PlacementDeleteContext() { + placement_dtor_(data_ptr_.get(), size_); + // original memory will be freed when data_ptr_ is destructed + } +}; + +struct C10_API AutogradMetaInterface { + virtual void set_requires_grad( + bool requires_grad, + at::TensorImpl* self_impl) = 0; + virtual bool requires_grad() const = 0; + virtual at::Tensor& mutable_grad() = 0; + virtual const at::Tensor& grad() const = 0; + virtual const at::Tensor& fw_grad(uint64_t level, const at::TensorBase& self) + const = 0; + virtual void set_fw_grad( + const at::TensorBase& new_grad, + const at::TensorBase& self, + uint64_t level, + bool is_inplace_op) = 0; + virtual ~AutogradMetaInterface(); +}; + +namespace impl { + +// Unfortunately, the definition of AutogradMeta lives in a separate +// compilation unit than TensorImpl (libtorch.so versus libc10.so) +// which means that we cannot construct an AutogradMeta from TensorImpl, +// not even from the cpp file. So we have to indirect it through a factory +// function which will be initialized when we load libtorch.so. + +struct C10_API AutogradMetaFactory { + virtual ~AutogradMetaFactory() = default; + virtual std::unique_ptr make() const = 0; + // This method is the dumbest method. 
But I don't have access + // to Tensor (not TensorImpl) which is undefined in this header. + virtual const at::Tensor& undefined_tensor() const = 0; +}; + +C10_API void SetAutogradMetaFactory(AutogradMetaFactory* factory); +C10_API AutogradMetaFactory* GetAutogradMetaFactory(); + +struct C10_API AutogradMetaFactoryRegisterer { + explicit AutogradMetaFactoryRegisterer(AutogradMetaFactory* factory) { + SetAutogradMetaFactory(factory); + } +}; + +} // namespace impl + +struct C10_API NamedTensorMetaInterface { + virtual ~NamedTensorMetaInterface() = default; + virtual std::unique_ptr clone() const { + TORCH_INTERNAL_ASSERT( + false, "Not implemented: NamedTensorMetaInterface::clone"); + }; + virtual int64_t slow_dim() const { + TORCH_INTERNAL_ASSERT( + false, "Not implemented: NamedTensorMetaInterface::slow_dim"); + }; +}; + +// For ease of copy pasting +#if 0 +is_contiguous +is_channels_last_contiguous +is_channels_last_3d_contiguous +is_channels_last +is_channels_last_3d +is_non_overlapping_and_dense +#endif + +/** + * This structure is intended to hold additional metadata of the specific device + * backend. + **/ +struct C10_API BackendMeta : intrusive_ptr_target { + ~BackendMeta() override = default; + virtual intrusive_ptr clone( + const intrusive_ptr& ptr) const { + return ptr; + } +}; + +struct C10_API ExtraMeta { + std::unique_ptr symbolic_shape_meta_ = nullptr; + std::unique_ptr named_tensor_meta_ = nullptr; + intrusive_ptr backend_meta_ = nullptr; + c10::optional custom_data_ptr_error_msg_ = c10::nullopt; + c10::optional custom_storage_error_msg_ = c10::nullopt; + + ExtraMeta() = default; + ExtraMeta(const ExtraMeta& other) { + if (other.symbolic_shape_meta_) { + symbolic_shape_meta_ = + std::make_unique(*other.symbolic_shape_meta_); + } + if (other.named_tensor_meta_) { + named_tensor_meta_ = other.named_tensor_meta_->clone(); + } + if (other.backend_meta_) { + backend_meta_ = other.backend_meta_->clone(other.backend_meta_); + } + if (other.custom_data_ptr_error_msg_) { + custom_data_ptr_error_msg_ = other.custom_data_ptr_error_msg_; + } + if (other.custom_storage_error_msg_) { + custom_storage_error_msg_ = other.custom_storage_error_msg_; + } + } + + ExtraMeta( + std::unique_ptr symbolic_shape_meta, + std::unique_ptr named_tensor_meta, + intrusive_ptr backend_meta, + c10::optional custom_data_ptr_error_msg = c10::nullopt, + c10::optional custom_storage_access_error_msg = c10::nullopt) + : symbolic_shape_meta_(std::move(symbolic_shape_meta)), + named_tensor_meta_(std::move(named_tensor_meta)), + backend_meta_(std::move(backend_meta)), + custom_data_ptr_error_msg_(std::move(custom_data_ptr_error_msg)), + custom_storage_error_msg_(std::move(custom_storage_access_error_msg)) {} + + std::unique_ptr clone() const { + return std::make_unique(*this); + } +}; + +// NOTE [ Version Counter Sharing ] +// +// Every Tensor has a version counter. Version counters are incremented whenever +// the data or size of a tensor changes through in-place Variable operations. +// Version counters are used to detect modifications to saved variables which +// would result in incorrect gradient calculations. Version counters may be +// shared between Variables: +// +// 1. A view shares the version counter of the base Variable, +// 2. `x.detach()` shares the version counter of `x`, +// 3. Unpacked saved variables share the version counter of the source. +// +// Version counters are not shared in these scenarios: +// +// 1. 
When we replace a `Variable`'s underlying `Tensor` by calling +// `set_data(...)`, +// 2. `x.data` does not share the version counter of `x`. (See discussion at +// https://github.com/pytorch/pytorch/issues/5396) +// +// Question: Why do we put the version counter in TensorImpl instead of +// AutogradMeta? +// +// Answer: After the Variable/Tensor merge, a tensor will not have AutogradMeta +// when its `requires_grad_` is false, but when we use this tensor in the +// forward pass of a function that requires saving this tensor for backward, we +// need to keep track of this tensor's version to make sure it's always valid in +// the autograd graph. +// +// To achieve this goal, we put the version counter in TensorImpl instead of +// AutogradMeta, and have it always be available. This allows us to have the +// optimization of not carrying AutogradMeta when a tensor doesn't require +// gradient. +// +// A hypothetical alternative way to achieve this goal is to initialize +// AutogradMeta and create the version counter for the non-requires-grad tensor +// only when it's saved for backward. However, since saving a tensor for +// backward happens in the forward pass, and our invariant is that forward pass +// needs to be thread-safe, lazy-initializing AutogradMeta when saving a tensor +// can introduce race conditions when we are running the forward pass in +// multi-thread scenarios, thus making the forward pass not thread-safe anymore, +// which breaks the invariant. +struct C10_API VariableVersion { + private: + struct VersionCounter : intrusive_ptr_target { + VersionCounter(uint32_t version) : version_(version) {} + std::atomic version_; + }; + c10::intrusive_ptr version_counter_; + + public: + // Note [Disabled VariableVersion] + // VariableVersion struct has an intrusive_ptr pointing VersionCounter struct + // with an atomic variable. Thus `VariableVersion(/*version=*/0)` is not as + // cheap as we expected. In some cases constructing a VariableVersion with + // version 0 is not necessary so we add a cheap constructor which + // doesn't allocate the intrusive_ptr. + // Example use cases are: + // - Inference tensors don't track version counter, so they'll just always + // have disabled VariableVersion. + // - In SavedVariable class we override version_counter_ inside its + // constructor + // so that we can use the cheap constructor there. + enum Disabled { DISABLED }; + // It's okay to return true even for inference tensor which + // doesn't have version counter enabled. + // We want to be permissive here since in many cases (e.g. make_variable) + // we can std::move a TensorImpl if there's no other uses which saves us + // an additional TensorImpl allocation. + bool unique() const { + return version_counter_ ? 1 == version_counter_.use_count() : true; + } + // NOTE: As of C++11 and 14, default-constructing a std::atomic variable + // leaves it in a persistently undefined state. See + // https://cplusplus.github.io/LWG/issue2334. + VariableVersion(uint32_t version) + : version_counter_(c10::make_intrusive(version)) {} + VariableVersion(Disabled = DISABLED) {} + + bool enabled() const { + return version_counter_; + } + + // Note [Inplace update inference tensor] + // 1. Inplace update to inference tensor is forbidden in normal mode. + // For example: + // inference_tensor.copy_(normal_tensor_requires_grad) + // This inplace makes inference_tensor have requires_grad=True and + // have a grad_fn. 
This is bad because views of `inference_tensor` + // created in InferenceMode won't be able to know the grad_fn since + // their ViewMeta were not recorded. To match NoGradMode behavior + // that "inplace update to a view created in NoGradMode raise an error", + // we just ban inplace update to inference tensor since we can't tell + // if an inference tensor is a view created in InferenceMode. + // + // Note that views of normal tensor created in InferenceMode has proper + // ViewMeta so that they're aware of the grad_fn correctly. + // + // 2. Inplace update to inference tensor in inference tensor doesn't bump + // version counter. + // * It either doesn't call bump() by skipping ADInplaceOrView kernel, + // - e.g. inference_tensor.add_(1) + // * or bump() is a no-op for inference tensor. + // - e.g. inference_tensor.add_(normal_tensor) + void bump() { + // TODO: Replace the link to the documentation once it's available. + TORCH_CHECK( + version_counter_ || InferenceMode::is_enabled(), + "Inplace update to inference tensor outside InferenceMode is not allowed." + "You can make a clone to get a normal tensor before doing inplace update." + "See https://github.com/pytorch/rfcs/pull/17 for more details."); + if (version_counter_) { + ++version_counter_->version_; + } + } + + void set_version(int64_t i) { + TORCH_CHECK( + version_counter_, + "Tried to call torch.autograd._unsafe_set_version() on a tensor " + "that does not have a version counter. Was it created in inference mode?"); + TORCH_CHECK(i >= 0, "Cannot set a version_counter to a value below 0: ", i); + version_counter_->version_ = i; + } + + // Inference tensor doesn't have version counter so it shouldn't be + // accessed. + uint32_t current_version() const { + TORCH_CHECK( + version_counter_, "Inference tensors do not track version counter."); + return version_counter_->version_; + } +}; + +// Forward declaration of TensorImpl needed for forward declaration of +// C10_TensorImpl_Size_Check_Dummy_Class +struct C10_API TensorImpl; + +/** + * NOTE: Some TensorImpl methods are small and not overridden in the + * PyTorch codebase itself, but may theoretically need to be + * overridden by third-party TensorImpl subclasses. This macro allows + * users that need maximum performance and don't need these extension + * points to disable them with a build-time flag. (In particular, + * XLA's XLATensorImpl currently overrides these methods, so we can't + * enable this flag by default.) + */ +#ifdef C10_DISABLE_TENSORIMPL_EXTENSIBILITY +#define TENSORIMPL_MAYBE_VIRTUAL +#else +#define TENSORIMPL_MAYBE_VIRTUAL virtual +#endif + +/** + * The low-level representation of a tensor, which contains a pointer + * to a storage (which contains the actual data) and metadata (e.g., sizes and + * strides) describing this particular view of the data as a tensor. + * + * Some basic characteristics about our in-memory representation of + * tensors: + * + * - It contains a pointer to a storage struct (Storage/StorageImpl) + * which contains the pointer to the actual data and records the + * data type and device of the view. This allows multiple tensors + * to alias the same underlying data, which allows to efficiently + * implement differing *views* on a tensor. + * + * - The tensor struct itself records view-specific metadata about + * the tensor, e.g., sizes, strides and offset into storage. + * Each view of a storage can have a different size or offset. + * + * - This class is intrusively refcounted. 
It is refcounted so that + * we can support prompt deallocation of large tensors; it is + * intrusively refcounted so that we can still perform reference + * counted operations on raw pointers, which is often more convenient + * when passing tensors across language boundaries. + * + * - For backwards-compatibility reasons, a tensor may be in an + * uninitialized state. A tensor may be uninitialized in the following + * two ways: + * + * - A tensor may be DTYPE UNINITIALIZED. A tensor of this + * form has an uninitialized dtype. This situation most + * frequently arises when a user writes Tensor x(CPU). The dtype + * is subsequently initialized when mutable_data() is + * invoked for the first time. + * + * - A tensor may be STORAGE UNINITIALIZED. A tensor of this form + * has non-zero size, but has a storage with a null data pointer. + * This situation most frequently arises when a user calls + * Resize() or FreeMemory(). This is because Caffe2 historically + * does lazy allocation: allocation of data doesn't occur until + * mutable_data() is invoked. A tensor with zero size is + * always storage initialized, because no allocation is necessary + * in this case. + * + * All combinations of these two uninitialized states are possible. + * Consider the following transcript in idiomatic Caffe2 API: + * + * Tensor x(CPU); // x is storage-initialized, dtype-UNINITIALIZED + * x.Resize(4); // x is storage-UNINITIALIZED, dtype-UNINITIALIZED + * x.mutable_data(); // x is storage-initialized, dtype-initialized + * x.FreeMemory(); // x is storage-UNINITIALIZED, dtype-initialized. + * + * All other fields on tensor are always initialized. In particular, + * size is always valid. (Historically, a tensor declared as Tensor x(CPU) + * also had uninitialized size, encoded as numel == -1, but we have now + * decided to default to zero size, resulting in numel == 0). + * + * Uninitialized storages MUST be uniquely owned, to keep our model + * simple. Thus, we will reject operations which could cause an + * uninitialized storage to become shared (or a shared storage to + * become uninitialized, e.g., from FreeMemory). + * + * In practice, tensors which are storage-UNINITIALIZED and + * dtype-UNINITIALIZED are *extremely* ephemeral: essentially, + * after you do a Resize(), you basically always call mutable_data() + * immediately afterwards. Most functions are not designed to + * work if given a storage-UNINITIALIZED, dtype-UNINITIALIZED tensor. + * + * We intend to eliminate all uninitialized states, so that every + * tensor is fully initialized in all fields. Please do not write new code + * that depends on these uninitialized states. + */ +struct C10_API TensorImpl : public c10::intrusive_ptr_target { + TensorImpl() = delete; + ~TensorImpl() override; + // Note [Enum ImplType] + // This enum is temporary. In the followup refactor we should + // think about how to specialize TensorImpl creation for view + // tensors. Currently we only special case its key_set_ but + // there's also potential to share version_counter_ directly + // without creating first and then override in as_view. + enum ImplType { VIEW }; + + /** + * Construct a 1-dim 0-size tensor backed by the given storage. + */ + TensorImpl( + Storage&& storage, + DispatchKeySet, + const caffe2::TypeMeta data_type); + + // See Note [Enum ImplType] + TensorImpl( + ImplType, + Storage&& storage, + DispatchKeySet, + const caffe2::TypeMeta data_type); + + /** + * Construct a 1-dim 0 size tensor that doesn't have a storage. 
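+   *
+   * A schematic, hypothetical call, for illustration only; it assumes a
+   * backend that carries no Storage, e.g. a meta-style tensor:
+   *
+   *   auto impl = c10::make_intrusive<TensorImpl>(
+   *       c10::DispatchKeySet(c10::DispatchKey::Meta),
+   *       caffe2::TypeMeta::Make<float>(),
+   *       c10::Device(c10::DeviceType::Meta));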
+ */ + TensorImpl( + DispatchKeySet, + const caffe2::TypeMeta data_type, + c10::optional device_opt); + + // Legacy constructors so I don't have to go update call sites. + // TODO: When Variable is added, delete these constructors + TensorImpl( + Storage&& storage, + DispatchKey dispatch_key, + const caffe2::TypeMeta data_type) + : TensorImpl( + std::move(storage), + DispatchKeySet(dispatch_key), + data_type) {} + TensorImpl( + DispatchKey dispatch_key, + const caffe2::TypeMeta data_type, + c10::optional device_opt) + : TensorImpl(DispatchKeySet(dispatch_key), data_type, device_opt) {} + + private: + // This constructor is private, because the data_type is redundant with + // storage. Still, we pass it in separately because it's easier to write + // the initializer list if we're not worried about storage being moved out + // from under us. + TensorImpl( + Storage&& storage, + DispatchKeySet, + const caffe2::TypeMeta data_type, + c10::optional); + + public: + TensorImpl(const TensorImpl&) = delete; + TensorImpl& operator=(const TensorImpl&) = delete; + TensorImpl(TensorImpl&&) = delete; + TensorImpl& operator=(TensorImpl&&) = delete; + + /** + * Release (decref) storage, and any other external allocations. This + * override is for `intrusive_ptr_target` and is used to implement weak + * tensors. + */ + void release_resources() override; + + public: + /** + * Return the DispatchKeySet corresponding to this Tensor, specifying + * all of the DispatchKeys that this Tensor identifies as. This is the + * information used to dispatch operations on this tensor. + */ + DispatchKeySet key_set() const { + return key_set_; + } + + private: + [[noreturn]] void throw_cannot_call_with_symbolic(const char* meth) const; + + // NOTE: The general recipe for customizable methods is that the fastpath + // function (e.g., sizes()) does an unlikely policy test, and if doesn't + // trigger, it does the fast path implementation with no checks and going + // directly to on-TensorImpl fields. In particular, you never need to + // check ExtraMeta if the policy doesn't trigger, as non-trivial ExtraMeta + // implies the policy will always match. + // + // The default implementations of methods are "safe": they do extra tests + // to make sure the internal state is consistent no matter if you are + // doing symbolic shapes or not. If you don't want the tests, directly + // override the custom method (e.g., custom_sizes()) to do your preferred + // behavior. + + public: + /** + * Return a reference to the sizes of this tensor. This reference remains + * valid as long as the tensor is live and not resized. 
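+   *
+   * Schematic example: for an impl describing a tensor of shape {2, 3},
+   *
+   *   IntArrayRef s = impl->sizes();  // s.size() == 2, s[0] == 2, s[1] == 3
+   *
+   * The returned IntArrayRef is non-owning; copy it (e.g. into a
+   * std::vector<int64_t>) if it must outlive a later resize.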
+ */ + IntArrayRef sizes() const { + if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomSizes))) { + return sizes_custom(); + } + return sizes_and_strides_.sizes_arrayref(); + } + + SymIntArrayRef sym_sizes() const { + if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomSizes))) { + return sym_sizes_custom(); + } + // Sizes guaranteed to be non-negative, so unchecked cast is OK + return c10::fromIntArrayRefKnownNonNegative( + sizes_and_strides_.sizes_arrayref()); + } + + IntArrayRef sizes_default() const { + if (C10_UNLIKELY(has_symbolic_sizes_strides_)) { + throw_cannot_call_with_symbolic("sizes"); + } + return sizes_and_strides_.sizes_arrayref(); + } + + SymIntArrayRef sym_sizes_default() const { + if (has_symbolic_sizes_strides_) { + return symbolic_shape_meta().sizes_; + } else { + // Sizes guaranteed to be non-negative, so unchecked cast is OK + return c10::fromIntArrayRefKnownNonNegative(sizes_default()); + } + } + + // From https://stackoverflow.com/a/3057522/23845 + // TODO: does C++14 have a stdlib template for this? + template + struct identity { + typedef T type; + }; + + template + ArrayRef generic_sizes() { + return _generic_sizes(identity()); + } + + ArrayRef _generic_sizes(identity) { + return sizes(); + } + ArrayRef _generic_sizes(identity) { + return sym_sizes(); + } + + template + ArrayRef generic_strides() { + return _generic_strides(identity()); + } + + ArrayRef _generic_strides(identity) { + return strides(); + } + ArrayRef _generic_strides(identity) { + return sym_strides(); + } + + template + T generic_storage_offset() { + return _generic_storage_offset(identity()); + } + + int64_t _generic_storage_offset(identity) { + return storage_offset(); + } + c10::SymInt _generic_storage_offset(identity) { + return sym_storage_offset(); + } + + /** + * The number of elements in a tensor. + * + * WARNING: Previously, if you were using the Caffe2 API, you could + * test numel() == -1 to see if a tensor was uninitialized. This + * is no longer true; numel always accurately reports the product + * of sizes of a tensor. + */ + int64_t numel() const { + if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomSizes))) { + return numel_custom(); + } + return numel_; + } + + c10::SymInt sym_numel() const { + if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomSizes))) { + return sym_numel_custom(); + } + return c10::SymInt(SymInt::UNCHECKED, numel_); + } + + int64_t numel_default() const { + if (C10_UNLIKELY(has_symbolic_sizes_strides_)) { + throw_cannot_call_with_symbolic("numel"); + } + return numel_; + } + + c10::SymInt sym_numel_default() const { + if (has_symbolic_sizes_strides_) { + return symbolic_shape_meta().numel(); + } else { + return c10::SymInt(SymInt::UNCHECKED, numel_); + } + } + + /** + * Return the number of dimensions of this tensor. Note that 0-dimension + * represents a Tensor that is a Scalar, e.g., one that has a single element. + */ + int64_t dim() const { + if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomSizes))) { + return dim_custom(); + } + return static_cast(sizes_and_strides_.size()); + } + + int64_t dim_default() const { + if (has_symbolic_sizes_strides_) { + return static_cast(symbolic_shape_meta().sizes_.size()); + } else { + return static_cast(sizes_and_strides_.size()); + } + } + + /** + * Return the offset in number of elements into the storage that this + * tensor points to. Most tensors have storage_offset() == 0, but, + * for example, an index into a tensor will have a non-zero storage_offset(). 
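+   *
+   * Schematically, if `t` views the same storage as `base` but starts at
+   * element 4 (e.g. a slice that drops the first four elements), then
+   * t.storage_offset() == 4, counted in elements of the dtype.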
+ * + * WARNING: This is NOT computed in bytes. + */ + int64_t storage_offset() const { + // TODO: maybe this should be toggled by strides + if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomSizes))) { + return storage_offset_custom(); + } + return storage_offset_; + } + + c10::SymInt sym_storage_offset() const { + if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomSizes))) { + return sym_storage_offset_custom(); + } + return c10::SymInt(SymInt::UNCHECKED, storage_offset_); + } + + int64_t storage_offset_default() const { + if (C10_UNLIKELY(has_symbolic_sizes_strides_)) { + throw_cannot_call_with_symbolic("storage_offset"); + } + return storage_offset_; + } + + c10::SymInt sym_storage_offset_default() const { + if (has_symbolic_sizes_strides_) { + return symbolic_shape_meta().storage_offset_; + } else { + return c10::SymInt(SymInt::UNCHECKED, storage_offset_); + } + } + + /** + * Return a reference to the strides of this tensor. This reference remains + * valid as long as the tensor is live and not restrided. + */ + IntArrayRef strides() const { + if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomStrides))) { + return strides_custom(); + } + return sizes_and_strides_.strides_arrayref(); + } + + c10::SymIntArrayRef sym_strides() const { + if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomStrides))) { + return sym_strides_custom(); + } + return c10::fromIntArrayRefKnownNonNegative(strides_default()); + } + + IntArrayRef strides_default() const { + if (C10_UNLIKELY(has_symbolic_sizes_strides_)) { + throw_cannot_call_with_symbolic("strides"); + } + return sizes_and_strides_.strides_arrayref(); + } + + c10::SymIntArrayRef sym_strides_default() const { + if (has_symbolic_sizes_strides_) { + return symbolic_shape_meta().strides_; + } else { + return c10::fromIntArrayRefKnownNonNegative(strides_default()); + } + } + + /** + * Whether or not a tensor is laid out in contiguous memory. + * + * Tensors with non-trivial strides are not contiguous. See + * compute_contiguous() for the exact definition of whether or not + * a tensor is contiguous or not. 
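+   *
+   * Schematically: a freshly allocated {2, 3} tensor with strides {3, 1} is
+   * contiguous, while a transposed view of it, with sizes {3, 2} and strides
+   * {1, 3}, is not, even though both cover exactly the same storage.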
+ */ + bool is_contiguous( + at::MemoryFormat memory_format = at::MemoryFormat::Contiguous) const { + if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomStrides))) { + return is_contiguous_custom(memory_format); + } + return is_contiguous_default(memory_format); + } + + // These are factored into separate functions in case subclasses + // want to use them + bool is_contiguous_default(at::MemoryFormat memory_format) const { + if (has_symbolic_sizes_strides_) { + if (memory_format == at::MemoryFormat::ChannelsLast) { + return symbolic_shape_meta().is_channels_last_contiguous().guard_bool( + __FILE__, __LINE__); + } else if (memory_format == at::MemoryFormat::ChannelsLast3d) { + return symbolic_shape_meta() + .is_channels_last_3d_contiguous() + .guard_bool(__FILE__, __LINE__); + } + return symbolic_shape_meta().is_contiguous().guard_bool( + __FILE__, __LINE__); + } + + if (memory_format == at::MemoryFormat::ChannelsLast) { + return is_channels_last_contiguous_; + } else if (memory_format == at::MemoryFormat::ChannelsLast3d) { + return is_channels_last_3d_contiguous_; + } + return is_contiguous_; + } + + bool is_strides_like_default(at::MemoryFormat memory_format) const { + if (has_symbolic_sizes_strides_) { + if (memory_format == at::MemoryFormat::ChannelsLast) { + return symbolic_shape_meta().is_channels_last().guard_bool( + __FILE__, __LINE__); + } else if (memory_format == at::MemoryFormat::ChannelsLast3d) { + return symbolic_shape_meta().is_channels_last_3d().guard_bool( + __FILE__, __LINE__); + } else { + return false; + } + } + + if (memory_format == at::MemoryFormat::ChannelsLast) { + return is_channels_last_; + } else if (memory_format == at::MemoryFormat::ChannelsLast3d) { + return is_channels_last_3d_; + } else { + return false; + } + } + + bool is_non_overlapping_and_dense_default() const { + if (has_symbolic_sizes_strides_) { + return symbolic_shape_meta().is_non_overlapping_and_dense().guard_bool( + __FILE__, __LINE__); + } else { + return is_non_overlapping_and_dense_; + } + } + + // NB: these dim accessor functions don't have _default(), as you can use + // sizes_default/strides_default + /** + * Return the size of a tensor at some dimension, wrapping the dimension if + * necessary. + * + * NOTE: if you know wrapping is unnecessary, do sizes()[d] instead; it will + * be faster + */ + int64_t size(int64_t d) const { + if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomSizes))) { + return size_custom(d); + } + d = maybe_wrap_dim(d, dim(), /*wrap_scalar=*/false); + return sizes_and_strides_.size_at_unchecked(d); + } + + c10::SymInt sym_size(int64_t d) const { + if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomSizes))) { + return sym_size_custom(d); + } + d = maybe_wrap_dim(d, dim(), /*wrap_scalar=*/false); + const auto sizes = this->sym_sizes(); + return sizes[d]; + } + + /** + * Return the stride of a tensor at some dimension, wrapping the dimension + * if necessary. + * + * NOTE: if you know wrapping is unnecessary, do sizes()[d] instead; it will + * be faster + */ + int64_t stride(int64_t d) const { + d = maybe_wrap_dim(d, dim(), false); + if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomStrides))) { + // TODO: provide stride_custom, symmetrically with size_custom. 
+ // There is presently no user for it; only NestedTensor is using + // size_custom overrideability + return strides_custom()[d]; // unchecked (maybe_wrap_dim enforces bounds) + } + // Intentionally don't call default, which also handles symbolic + return sizes_and_strides_.stride_at_unchecked(d); + } + + enum class SizesStridesPolicy : uint8_t { + // Default behavior, e.g., dense tensor. + // + // Can override: nothing + Default = 0, + // Customizable strides behavior, e.g., sparse tensor, + // mkldnn tensor. + // + // Can override: strides(), is_contiguous() + CustomStrides = 1, + // Customizable sizes behavior, e.g., nested tensor + // + // Can override: strides(), is_contiguous(), sizes(), dim(), numel() + CustomSizes = 2 + }; + + protected: + inline bool matches_policy(SizesStridesPolicy policy) const { + return sizes_strides_policy_ >= static_cast(policy); + } + + inline bool matches_custom(SizesStridesPolicy policy) const { + return custom_sizes_strides_ >= static_cast(policy); + } + + inline bool matches_python_custom(SizesStridesPolicy policy) const { + auto r = python_custom_sizes_strides_ >= static_cast(policy); + if (r) { + TORCH_INTERNAL_ASSERT(is_python_dispatch()) + } + return r; + } + + /** + * Customization points for the functions above. sizes_strides_policy_ + * must be set to enable these. + * + * NB: dim is overrideable separately from sizes because it is possible + * for a tensor to have rank, but not well defined sizes. + */ + // sizes_strides_policy_ >= CustomStrides + virtual bool is_contiguous_custom(at::MemoryFormat memory_format) const; + virtual bool is_strides_like_custom(at::MemoryFormat memory_format) const; + virtual bool is_non_overlapping_and_dense_custom() const; + // sizes_strides_policy_ >= CustomSizes + // Currently this method only exists to be overwritten by subclasses such as + // NestedTensorImpl. + virtual int64_t size_custom(int64_t d) const { + // TODO: We could add support to Python dispatch here. + // TODO: We could call into aten::size.int instead of + // sizes_custom()[d] and enable use of the dispatcher. + d = maybe_wrap_dim(d, dim(), /*wrap_scalar=*/false); + return sizes_custom()[d]; // unchecked (maybe_wrap_dim enforces bounds) + } + + virtual c10::SymInt sym_size_custom(int64_t d) const { + // TODO: We could add support to Python dispatch here. + // TODO: We could call into aten::size.int instead of + // sym_sizes_custom()[d] and enable use of the dispatcher. + d = maybe_wrap_dim(d, dim(), /*wrap_scalar=*/false); + return sym_sizes_custom()[d]; // unchecked (maybe_wrap_dim enforces bounds) + } + + virtual IntArrayRef sizes_custom() const; + virtual IntArrayRef strides_custom() const; + virtual int64_t numel_custom() const; + virtual int64_t storage_offset_custom() const; + virtual int64_t dim_custom() const; + virtual Device device_custom() const; + virtual Layout layout_custom() const; + + virtual c10::SymIntArrayRef sym_sizes_custom() const; + virtual c10::SymIntArrayRef sym_strides_custom() const; + virtual c10::SymInt sym_numel_custom() const; + virtual c10::SymInt sym_storage_offset_custom() const; + + public: + /** + * True if this tensor has storage. See storage() for details. + */ +#ifdef DEBUG + // Allow subclasses to check that their storage_ is never getting set in debug + // builds. + virtual +#else + TENSORIMPL_MAYBE_VIRTUAL +#endif + bool + has_storage() const + // NOTE: we devirtualize this because it arguably shouldn't be an + // error just to ask subclasses if they have storage. 
+ // This used to throw for most subclasses, but OpaqueTensorImpl + // wanted it to successfully return false, so we went ahead and made + // it a non-error. +#ifdef C10_DISABLE_TENSORIMPL_EXTENSIBILITY + { + return storage_; + } +#else + ; +#endif + + /** + * Return the underlying storage of a Tensor. Multiple tensors may share + * a single storage. A Storage is an impoverished, Tensor-like class + * which supports far less operations than Tensor. + * + * Avoid using this method if possible; try to use only Tensor APIs to perform + * operations. + */ + TENSORIMPL_MAYBE_VIRTUAL const Storage& storage() const { + if (C10_UNLIKELY(storage_access_should_throw_)) { + throw_storage_access_error(); + } + return storage_; + } + + /** + * Return the underlying storage, unsafely assuming this is a basic strided + * tensor. In cases where `storage` access would throw, this returns a + * default-constructed Storage. + */ + inline const Storage& unsafe_storage() const { + return storage_; + } + + bool unique_version() const { + return version_counter_.unique(); + } + + protected: + virtual Layout layout_impl() const { + TORCH_CHECK( + false, "layout_impl is only implemented for TensorImpl subclasses."); + } + + public: + // Whether a tensor is sparse COO or not. + bool is_sparse() const { + // NB: This method is not virtual and avoid dispatches for performance + // reasons. + return key_set_.has_all(c10::sparse_ks); + } + + // Whether a tensor is sparse CSR or not. + bool is_sparse_csr() const { + return layout() == kSparseCsr; + } + + // Whether a tensor is sparse CSR/CSC/BSR/BSC or not. + bool is_sparse_compressed() const { + return key_set_.has_all(c10::sparse_csr_ks); + } + + bool is_quantized() const { + // NB: This method is not virtual and avoid dispatches for performance + // reasons. + constexpr auto quantized_ks = DispatchKeySet(DispatchKey::Quantized); + return key_set_.has_all(quantized_ks); + } + + bool is_meta() const { + // NB: This method is not virtual and avoid dispatches for performance + // reasons. + if (C10_UNLIKELY(device_policy_)) { + return device_custom().is_meta(); + } + return device_opt_.has_value() && device_opt_->type() == kMeta; + } + + bool is_cpu() const { + // NB: This method is not virtual and avoid dispatches for performance + // reasons. + if (C10_UNLIKELY(device_policy_)) { + return device_custom().is_cpu(); + } + // Note: we cannot rely on dispatch keys to determine the device type + // of a tensor, because "wrapper" tensors (like FunctionalTensorWrapper) + // don't include backend dispatch keys. + return device_opt_.has_value() && device_opt_->type() == kCPU; + } + + bool is_cuda() const { + // NB: This method is not virtual and avoid dispatches for performance + // reasons. + if (C10_UNLIKELY(device_policy_)) { + return device_custom().is_cuda(); + } + return device_opt_.has_value() && device_opt_->type() == kCUDA; + } + + bool is_xpu() const { + // NB: This method is not virtual and avoid dispatches for performance + // reasons. 
+ if (C10_UNLIKELY(device_policy_)) { + return device_custom().is_xpu(); + } + return device_opt_.has_value() && device_opt_->type() == kXPU; + } + + bool is_ipu() const { + if (C10_UNLIKELY(device_policy_)) { + return device_custom().is_ipu(); + } + return device_opt_.has_value() && device_opt_->type() == kIPU; + } + + bool is_xla() const { + if (C10_UNLIKELY(device_policy_)) { + return device_custom().is_xla(); + } + return device_opt_.has_value() && device_opt_->type() == kXLA; + } + + bool is_mtia() const { + if (C10_UNLIKELY(device_policy_)) { + return device_custom().is_mtia(); + } + return device_opt_.has_value() && device_opt_->type() == kMTIA; + } + + bool is_hpu() const { + if (C10_UNLIKELY(device_policy_)) { + return device_custom().is_hpu(); + } + return device_opt_.has_value() && device_opt_->type() == kHPU; + } + + bool is_lazy() const { + if (C10_UNLIKELY(device_policy_)) { + return device_custom().is_lazy(); + } + return device_opt_.has_value() && device_opt_->type() == kLazy; + } + + bool is_hip() const { + // NB: This method is not virtual and avoid dispatches for performance + // reasons. + if (C10_UNLIKELY(device_policy_)) { + return device_custom().is_hip(); + } + return device_opt_.has_value() && device_opt_->type() == kHIP; + } + + bool is_ve() const { + // NB: This method is not virtual and avoid dispatches for performance + // reasons. + if (C10_UNLIKELY(device_policy_)) { + return device_custom().is_ve(); + } + return device_opt_.has_value() && device_opt_->type() == kVE; + } + + bool is_privateuseone() const { + // NB: This method is not virtual and avoid dispatches for performance + // reasons. + if (C10_UNLIKELY(device_policy_)) { + return device_custom().is_privateuseone(); + } + return device_opt_.has_value() && device_opt_->type() == kPrivateUse1; + } + + bool is_mkldnn() const { + return key_set_.has_all(c10::mkldnn_ks); + } + + bool is_vulkan() const { + if (C10_UNLIKELY(device_policy_)) { + return device_custom().is_vulkan(); + } + return device_opt_.has_value() && device_opt_->type() == kVulkan; + } + + bool is_metal() const { + if (C10_UNLIKELY(device_policy_)) { + return device_custom().is_metal(); + } + return device_opt_.has_value() && device_opt_->type() == kMetal; + } + + bool is_mps() const { + if (C10_UNLIKELY(device_policy_)) { + return device_custom().is_mps(); + } + return device_opt_.has_value() && device_opt_->type() == kMPS; + } + + bool is_ort() const { + if (C10_UNLIKELY(device_policy_)) { + return device_custom().is_ort(); + } + return device_opt_.has_value() && device_opt_->type() == kORT; + } + + bool is_nested() const { + return key_set_.has(DispatchKey::NestedTensor); + } + + // TODO: remove this once we don't automatically enabled Autograd dispatch + // keys + // in TensorImpl constructor. + // DON'T USE THIS API!! It's only created for testing purpose in + // file aten/src/ATen/core/boxing/impl/test_helpers.h + void remove_autograd_key() { + key_set_ = key_set_ - autograd_dispatch_keyset; + } + + // Inference tensor doesn't have autograd or ADInplaceOrView key. 
+ // Invariant: + // Inference tensor has version_counter_.enabled() == false + bool is_inference() { + bool no_ADInplaceOrView = !key_set_.has_any(c10::inplace_or_view_ks); + bool no_Autograd = !key_set_.has_any(c10::autograd_dispatch_keyset); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + no_ADInplaceOrView == no_Autograd, + "ADInplaceOrView and Autograd keys must be on/off at the same time."); + return no_ADInplaceOrView && no_Autograd; + } + + DeviceIndex get_device() const { + if (C10_UNLIKELY(device_policy_)) { + return device_custom().index(); + } + return device_default().index(); + } + + Device device() const { + if (C10_UNLIKELY(device_policy_)) { + return device_custom(); + } + return device_default(); + } + + protected: + c10::Device device_default() const { + TORCH_CHECK(device_opt_.has_value(), "tensor does not have a device"); + // See NOTE [c10::optional operator usage in CUDA] + return *device_opt_; + } + + public: + Layout layout() const { + if (C10_UNLIKELY(layout_policy_)) { + return layout_custom(); + } + + // NB: This method is not virtual and avoid dispatches for perf. + // strided is also the most common layout type, so we check for + // strided case first. + // This keyset must also be kept in sync with the logic in + // is_sparse() / is_sparse_csr() / is_mkldnn() + constexpr auto sparse_and_sparsecsr_and_mkldnn_ks = + c10::sparse_ks | c10::sparse_csr_ks | c10::mkldnn_ks; + if (!key_set_.has_any(sparse_and_sparsecsr_and_mkldnn_ks)) { + return kStrided; + } else if (is_sparse()) { + return kSparse; + } else if (is_sparse_compressed()) { + // Typically, the tensor dispatch keys define the tensor layout + // uniquely. This allows using non-virtual layout method for + // better performance. However, when tensor's layout depends, + // say, on tensor attributes, one must use this execution path + // where the corresponding tensor impl class overwrites virtual + // layout_impl() method. + // + // TODO: implement layout() as native function/method so that + // __torch_dispatch__ users will be able to redefine the + // layout() method. + return layout_impl(); + } else { + TORCH_INTERNAL_ASSERT( + is_mkldnn(), "There is an error in the layout calculation logic."); + return kMkldnn; + } + } + + /** + * True if a tensor was auto-wrapped from a C++ or Python number. + * For example, when you write 't + 2', 2 is auto-wrapped into a Tensor + * with `is_wrapped_number_` set to true. + * + * Wrapped numbers do not participate in the result type computation for + * mixed-type operations if there are any Tensors that are not wrapped + * numbers. This is useful, because we want 't + 2' to work with + * any type of tensor, not just LongTensor (which is what integers + * in Python represent). + * + * Otherwise, they behave like their non-wrapped equivalents. + * See [Result type computation] in TensorIterator.h. + * + * Why did we opt for wrapped numbers, as opposed to just having + * an extra function add(Tensor, Scalar)? This helps greatly reduce + * the amount of code we have to write for add, when actually + * a Tensor-Scalar addition is really just a Tensor-Tensor + * addition when the RHS is 0-dim (except for promotion behavior.) + */ + bool is_wrapped_number() const { + return is_wrapped_number_; + } + + /** + * Set whether or not a tensor was auto-wrapped from a C++ or Python + * number. You probably don't want to call this, unless you are + * writing binding code. 
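+   *
+   * Schematically: in `float_tensor + 2`, the `2` is materialized as a 0-dim
+   * tensor with is_wrapped_number() == true, and the result keeps
+   * float_tensor's dtype because the wrapped number does not drive type
+   * promotion when a non-wrapped tensor participates.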
+ */ + void set_wrapped_number(bool value) { + TORCH_INTERNAL_ASSERT(dim() == 0); + is_wrapped_number_ = value; + } + + /** + * Returns true if Tensor supports as_strided and as_strided_backward. + * This is used in autograd to perform inplace update on view Tensors. + * See Note [View + Inplace update for base tensor] and + * [View + Inplace update for view tensor] for details. + * Note this method only returns true for XLA backend, where it + * simulates strided Tensor to support most view ops, but it cannot + * fully support general `as_strided` case. + * It can be expanded as needed in the future, e.g sparse Tensor. + */ + inline bool support_as_strided() const { + if (is_nested()) { + return false; + } + if (key_set_.has(DispatchKey::Functionalize)) { + return false; + } + return device().supports_as_strided(); + } + + // ~~~~~ Autograd API ~~~~~ + // Some methods below are defined in TensorImpl.cpp because Tensor is an + // incomplete type. + + /** + * Set whether or not a tensor requires gradient. + */ + void set_requires_grad(bool requires_grad); + + /** + * True if a tensor requires gradient. Tensors which require gradient + * have history tracked for any operations performed on them, so that + * we can automatically differentiate back to them. A tensor that + * requires gradient and has no history is a "leaf" tensor, which we + * accumulate gradients into. + */ + bool requires_grad() const; + + /** + * Return a mutable reference to the gradient. This is conventionally + * used as `t.grad() = x` to set a gradient to a completely new tensor. + */ + at::Tensor& mutable_grad(); + + /** + * Return the accumulated gradient of a tensor. This gradient is written + * into when performing backwards, when this tensor is a leaf tensor. + */ + const at::Tensor& grad() const; + + /** + * Whether or not the imaginary part of the tensor should be negated + */ + inline bool is_conj() const { + constexpr auto conjugate_ks = DispatchKeySet(DispatchKey::Conjugate); + return key_set_.has_all(conjugate_ks); + } + + /** + * Set whether or not to take the conjugate of the tensor (flip the imaginary + * bit). + */ + void _set_conj(bool value) { + if (value) { + key_set_ = key_set_.add(DispatchKey::Conjugate); + TORCH_INTERNAL_ASSERT(isComplexType(typeMetaToScalarType(dtype()))); + } else { + key_set_ = key_set_.remove(DispatchKey::Conjugate); + } + } + + /** + * XXX: do not use, private api! + * Update the backend component related keys to the backend component + * corresponding to this device. + */ + void _change_backend_component_keys(c10::Device device); + + /** + * Whether or not the tensor is a zerotensor + */ + inline bool _is_zerotensor() const { + constexpr auto zerotensor_ks = DispatchKeySet(DispatchKey::ZeroTensor); + return key_set_.has_all(zerotensor_ks); + } + + /** + Set whether or not the tensor is a zero tensor + */ + void _set_zero(bool value) { + if (value) { + TORCH_INTERNAL_ASSERT( + false, + "Please call `torch._efficientzerotensor` if you want to create a tensor with no storage."); + } else { + key_set_ = key_set_.remove(DispatchKey::ZeroTensor); + } + } + + /** + * Whether or not the tensor should be negated + */ + inline bool is_neg() const { + constexpr auto negative_ks = DispatchKeySet(DispatchKey::Negative); + return key_set_.has_all(negative_ks); + } + + /** + * Set whether or not to take the conjugate of the tensor (flip the imaginary + * bit). 
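+   *
+   * (Despite mirroring the _set_conj wording above, this flips the *negative*
+   * bit, i.e. DispatchKey::Negative, as the implementation below shows.)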
+ */ + void _set_neg(bool value) { + if (value) { + key_set_ = key_set_.add(DispatchKey::Negative); + } else { + key_set_ = key_set_.remove(DispatchKey::Negative); + } + } + + /** + * Return the accumulated gradient of a tensor. This gradient is computed + * using forward mode AD. + * + * This is an internal API that should never be used by end users. + * + * The API is as follows: + * - "level" allows to specify the level of forward AD nesting for which the + * gradient should be returned. Note that since levels are not fully + * supported yet, this argument should be 0. See documentation for + * torch::autograd::enter_dual_level for more details about forward AD + * nesting. + * - "self" should represent the Tensor whose forward grad is accessed. It + * is required when dealing with view. + */ + const at::Tensor& _fw_grad(uint64_t level, const at::TensorBase& self) const; + + /** + * Sets the forward gradient for this Tensor. + * The given Tensor might not be used directly and its content will be copied. + * + * This is an internal API that should never be used by end users. + * + * The API is as follows: + * - "new_grad" is a Tensor containing the new value of the gradient that + * should be set + * - "self" should represent the Tensor whose forward grad is accessed. It + * is required when dealing with view. + * - "level" allows to specify the level of forward AD nesting for which the + * gradient should be set. Note that since levels are not fully supported + * yet, this argument should be 0. See documentation for + * torch::autograd::enter_dual_level for more details about forward AD + * nesting. + * - "is_inplace_op" is a boolean flag that tells if this gradient was + * generated by an inplace operation or an out of place one. This allows + * better error checking. + */ + void _set_fw_grad( + const at::TensorBase& new_grad, + const at::TensorBase& self, + uint64_t level, + bool is_inplace_op); + + /** + * Return a typed data pointer to the actual data which this tensor refers to. + * This checks that the requested type (from the template parameter) matches + * the internal type of the tensor. + * + * It is invalid to call data() on a dtype-uninitialized tensor, even if + * the size is 0. + * + * WARNING: If a tensor is not contiguous, you MUST use strides when + * performing index calculations to determine the location of elements in + * the tensor. We recommend using 'TensorAccessor' to handle this computation + * for you; this class is available from 'Tensor'. + */ + template + const T* data_dtype_initialized() const { + return data_dtype_initialized_impl( + [this] { return static_cast(storage_.data()); }); + } + + /** + * Return a mutable typed data pointer to the actual data which this + * tensor refers to. This checks that the requested type (from the + * template parameter) matches the internal type of the tensor. + * + * It is invalid to call data() on a dtype-uninitialized tensor, even if + * the size is 0. + * + * WARNING: If a tensor is not contiguous, you MUST use strides when + * performing index calculations to determine the location of elements in + * the tensor. We recommend using 'TensorAccessor' to handle this computation + * for you; this class is available from 'Tensor'. + */ + template + T* mutable_data_dtype_initialized() { + return data_dtype_initialized_impl( + [this] { return static_cast(storage_.mutable_data()); }); + } + + private: + // Shared implementation of data_dtype_initialized() and + // mutable_data_dtype_initialized(). 
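+  // Both the const and the mutable variant funnel through this helper: the
+  // caller supplies a callable that returns a (const or mutable) raw pointer
+  // from storage_, the helper checks that T matches data_type_, and then
+  // defers to data_ptr_impl_impl to apply storage_offset_.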
+ template + T* data_dtype_initialized_impl(const Func& get_data) const { + TORCH_CHECK( + data_type_.Match>(), + "Tensor type mismatch, caller expects elements to be ", + caffe2::TypeMeta::TypeName>(), + ", while tensor contains ", + data_type_.name(), + ". "); + return data_ptr_impl_impl(get_data); + } + + public: + /** + * More efficient helper for Tensor::data_ptr(). Like data(), but + * does not do a type check. Unlike the untemplated data(), does + * check has_storage() and storage_initialized(). + */ + template + inline const T* data_ptr_impl() const { + return data_ptr_impl_impl( + [this] { return static_cast(storage_.data()); }); + } + + /** + * More efficient helper for Tensor::data_ptr(). Like data(), but + * does not do a type check. Unlike the untemplated data(), does + * check has_storage() and storage_initialized(). + */ + template + inline T* mutable_data_ptr_impl() { + return data_ptr_impl_impl( + [this] { return static_cast(storage_.mutable_data()); }); + } + + private: + // Shared implementation of mutable_data_ptr_impl() and the future + // mutable_data_ptr_impl(). + template + __ubsan_ignore_pointer_overflow__ T* data_ptr_impl_impl( + const Func& get_data) const { + if (C10_UNLIKELY(!has_storage())) { + throw_data_ptr_access_error(); + } + TORCH_CHECK( + storage_initialized(), + "The tensor has a non-zero number of elements, but its data is not allocated yet. " + "Caffe2 uses a lazy allocation, so you will need to call " + "mutable_data() or raw_mutable_data() to actually allocate memory."); + // Caller does the type check. + // Note: storage_offset_ can be non-null even for zero-elements tensors + // (for example if created as `torch.empty(5)[10:]`) that triggers + // applying non-zero offset to null pointer in UBSan + return get_data() + storage_offset_; + } + + public: + /** + * Return a const void* data pointer to the actual data which this + * tensor refers to. + * + * It is invalid to call data() on a dtype-uninitialized tensor, even if the + * size is 0. + * + * WARNING: The data pointed to by this tensor may not contiguous; do NOT + * assume that itemsize() * numel() is sufficient to compute the bytes that + * can be validly read from this tensor. + */ + inline const void* data() const { + return data_impl( + [this] { return static_cast(storage_.data()); }); + } + + /** + * Return a void* data pointer to the actual data which this tensor refers to. + * + * It is invalid to call mutable_data() on a dtype-uninitialized + * tensor, even if the size is 0. + * + * WARNING: The data pointed to by this tensor may not contiguous; do NOT + * assume that itemsize() * numel() is sufficient to compute the bytes that + * can be validly read from this tensor. + */ + inline void* mutable_data() { + return data_impl( + [this] { return static_cast(storage_.mutable_data()); }); + } + + private: + /// Shared implementation of data() and mutable_data(). + /// + /// get_data must return a byte-addressed pointer, e.g. char*, + /// std::byte const*, etc. 
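+  /// (The byte-addressed requirement exists because the helper below offsets
+  /// the pointer by data_type_.itemsize() * storage_offset_, i.e. a byte
+  /// count.)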
+ template + Void* data_impl(const Func& get_data) const { + if (C10_UNLIKELY(!has_storage())) { + throw_data_ptr_access_error(); + } + TORCH_CHECK( + dtype_initialized(), + "Cannot access data pointer of Tensor that doesn't have initialized dtype " + "(e.g., caffe2::Tensor x(CPU), prior to calling mutable_data() on x)"); + auto* data = get_data(); + static_assert( + sizeof(*data) == 1, "get_data must return a byte-addressed pointer."); + // Computing an offset into an empty tensor would be UB, since an empty + // tensor's storage will be nullptr, and adding a nonzero offset to nullptr + // is UB. So we skip the offset computation in this case. + if (is_empty()) { + return nullptr; + } + return data + data_type_.itemsize() * storage_offset_; + } + + public: + /** + * Returns the TypeMeta of a tensor, which describes what data type + * it is (e.g., int, float, ...) + */ + const caffe2::TypeMeta dtype() const { + return data_type_; + } + + /** + * Return the size of a single element of this tensor in bytes. + */ + size_t itemsize() const { + TORCH_CHECK( + dtype_initialized(), + "Cannot report itemsize of Tensor that doesn't have initialized dtype " + "(e.g., caffe2::Tensor x(CPU), prior to calling mutable_data() on x)"); + return data_type_.itemsize(); + } + + void set_backend_meta(intrusive_ptr backend_meta) { + get_extra_meta().backend_meta_ = std::move(backend_meta); + } + + c10::BackendMeta* get_backend_meta() { + if (!extra_meta_) { + return nullptr; + } + return extra_meta_->backend_meta_.get(); + } + + intrusive_ptr get_backend_meta_intrusive_ptr() const { + if (!extra_meta_) { + return nullptr; + } + return extra_meta_->backend_meta_; + } + + void release_storage_and_set_meta_custom_data_ptr_error_msg_( + c10::optional s) { + storage_ = {}; + set_storage_access_should_throw(); + get_extra_meta().custom_data_ptr_error_msg_ = s; + get_extra_meta().custom_storage_error_msg_ = std::move(s); + } + + protected: + /** + * Returns the human-readable name of the actual type of this object (e.g., + * TensorImpl, BatchedTensorImpl, etc.). Used for error messages. + */ + virtual const char* tensorimpl_type_name() const { + return "TensorImpl"; + } + + private: + [[noreturn]] void throw_storage_access_error() const; + [[noreturn]] void throw_data_ptr_access_error() const; + + ExtraMeta& get_extra_meta() { + if (!extra_meta_) { + extra_meta_ = std::make_unique(); + } + return *extra_meta_; + } + + c10::SymbolicShapeMeta& symbolic_shape_meta() { + TORCH_INTERNAL_ASSERT(extra_meta_ && extra_meta_->symbolic_shape_meta_); + return *extra_meta_->symbolic_shape_meta_; + } + + const c10::SymbolicShapeMeta& symbolic_shape_meta() const { + TORCH_INTERNAL_ASSERT(extra_meta_ && extra_meta_->symbolic_shape_meta_); + return *extra_meta_->symbolic_shape_meta_; + } + + public: + /** + * True if a tensor has no elements (e.g., numel() == 0). + */ + inline bool is_empty() const { + return numel() == 0; + } + + // if we are going to use sym sizes, we should be setting sym strides at the + // same time, otherwise it's very easy to misuse this API + void set_sizes_and_strides( + c10::SymIntArrayRef sizes, + c10::SymIntArrayRef strides, + c10::optional storage_offset = c10::nullopt); + // This is renamed to avoid breaking overload BC + void generic_set_sizes_contiguous(c10::SymIntArrayRef sizes); + void generic_set_sizes_contiguous(c10::IntArrayRef sizes) { + set_sizes_contiguous(sizes); + } + + /** + * Change the size at some dimension. 
This DOES NOT update strides; + * thus, most changes to size will not preserve contiguity. You probably + * also want to call set_stride() when you call this. + * + * TODO: This should be jettisoned in favor of `set_sizes_and_strides`, + * which is harder to misuse. + */ + virtual void set_size(int64_t dim, int64_t new_size) { + TORCH_CHECK( + allow_tensor_metadata_change(), + "set_size ", + err_msg_tensor_metadata_change_not_allowed); + TORCH_CHECK( + !matches_policy(SizesStridesPolicy::CustomSizes), + "set_size() called on tensor with dynamic shapes or customized size behavior") + sizes_and_strides_.size_at(dim) = new_size; + refresh_numel(); + refresh_contiguous(); + } + + /** + * Change the stride at some dimension. + * + * TODO: This should be jettisoned in favor of `set_sizes_and_strides`, + * which is harder to misuse. + */ + virtual void set_stride(int64_t dim, int64_t new_stride) { + TORCH_CHECK( + allow_tensor_metadata_change(), + "set_stride ", + err_msg_tensor_metadata_change_not_allowed); + TORCH_CHECK( + !has_symbolic_sizes_strides_, + "set_stride() called on tensor with symbolic shape") + sizes_and_strides_.stride_at_unchecked(dim) = new_stride; + refresh_contiguous(); + } + + /** + * Set the offset into the storage of this tensor. + * + * WARNING: This does NOT check if the tensor is in bounds for the new + * location at the storage; the caller is responsible for checking this + * (and resizing if necessary.) + */ + virtual void set_storage_offset(int64_t storage_offset) { + TORCH_CHECK( + allow_tensor_metadata_change(), + "set_storage_offset ", + err_msg_tensor_metadata_change_not_allowed); + // TODO: this should probably consult policy + TORCH_CHECK( + !has_symbolic_sizes_strides_, + "set_storage_offset() called on tensor with symbolic shape") + storage_offset_ = storage_offset; + } + + /** + * Like set_sizes_and_strides but assumes contiguous strides. + * + * WARNING: This function does not check if the requested + * sizes/strides are in bounds for the storage that is allocated; + * this is the responsibility of the caller + */ + void set_sizes_contiguous(IntArrayRef new_size) { + TORCH_CHECK( + allow_tensor_metadata_change(), + "set_sizes_contiguous ", + err_msg_tensor_metadata_change_not_allowed); + TORCH_CHECK( + !matches_policy(SizesStridesPolicy::CustomStrides), + "tried to directly modify sizes for customized tensor"); + sizes_and_strides_.set_sizes(new_size); + + refresh_numel(); + empty_tensor_restride( + MemoryFormat::Contiguous); // calls refresh_contiguous() + } + + /** + * Set the sizes and strides of a tensor. 
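+   *
+   * Schematic use: an expand-like view that broadcasts a length-4 buffer to
+   * shape {3, 4} can be described by giving the broadcast dimension a stride
+   * of 0:
+   *
+   *   impl->set_sizes_and_strides({3, 4}, {0, 1});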
+ * + * WARNING: This function does not check if the requested + * sizes/strides are in bounds for the storage that is allocated; + * this is the responsibility of the caller + */ + void set_sizes_and_strides( + IntArrayRef new_size, + IntArrayRef new_stride, + c10::optional storage_offset = c10::nullopt) { + TORCH_CHECK( + allow_tensor_metadata_change(), + "set_sizes_and_strides ", + err_msg_tensor_metadata_change_not_allowed); + TORCH_CHECK( + !has_symbolic_sizes_strides_, + "set_sizes_and_strides() called on tensor with symbolic shape") + TORCH_CHECK( + new_size.size() == new_stride.size(), + "dimensionality of sizes (", + new_size.size(), + ") must match dimensionality of strides (", + new_stride.size(), + ")"); + const auto new_dim = new_size.size(); + bool overflowed = false; + sizes_and_strides_.set_sizes(new_size); + + if (new_dim > 0) { + for (size_t dim = new_dim - 1;; dim--) { + if (new_stride[dim] >= 0) { + sizes_and_strides_.stride_at_unchecked(dim) = new_stride[dim]; + } else { + // XXX: This behavior is surprising and may need to be removed to + // support negative strides. Some pytorch functions rely on it: + // for example, torch.cat (run TestTorch.test_cat_empty). + if (dim == new_dim - 1) { + sizes_and_strides_.stride_at_unchecked(dim) = 1; + } else { + // Keep stride monotonically increasing to match NumPy. + overflowed |= c10::mul_overflows( + sizes_and_strides_.stride_at_unchecked(dim + 1), + std::max( + sizes_and_strides_.size_at_unchecked(dim + 1), 1), + std::addressof(sizes_and_strides_.stride_at_unchecked(dim))); + } + } + if (dim == 0) + break; + } + TORCH_CHECK(!overflowed, "Stride calculation overflowed"); + } + + refresh_numel(); + refresh_contiguous(); + + if (storage_offset.has_value()) { + storage_offset_ = *storage_offset; + } + } + + /** + * Set whether a tensor allows changes to its metadata (e.g. sizes / strides / + * storage / storage_offset). See NOTE [ Metadata Change for a Detached Tensor + * ] for details. + */ + void set_allow_tensor_metadata_change(bool value) { + // TODO: at some point, we should kill this field completely. + allow_tensor_metadata_change_ = true; + } + + /** + * True if a tensor allows changes to its metadata (e.g. sizes / strides / + * storage / storage_offset). See NOTE [ Metadata Change for a Detached Tensor + * ] for details. + */ + bool allow_tensor_metadata_change() const { + return allow_tensor_metadata_change_; + } + + /** + * Set the pointer to autograd metadata. + */ + void set_autograd_meta( + std::unique_ptr autograd_meta); + + /** + * Return the pointer to autograd metadata. May return nullptr if the + * tensor does not track gradients. + */ + c10::AutogradMetaInterface* autograd_meta() const; + + /** + * Set the pointer to named tensor metadata. + */ + void set_named_tensor_meta( + std::unique_ptr named_tensor_meta) { + TORCH_WARN_ONCE( + "Named tensors and all their associated APIs are an experimental feature ", + "and subject to change. 
Please do not use them for anything important ", + "until they are released as stable."); +#ifdef DEBUG + if (named_tensor_meta) { + TORCH_INTERNAL_ASSERT(named_tensor_meta->slow_dim() == dim()); + } +#endif + if (named_tensor_meta) { + get_extra_meta().named_tensor_meta_ = std::move(named_tensor_meta); + key_set_ = key_set_.add(DispatchKey::Named); + } else { + if (extra_meta_) { + extra_meta_->named_tensor_meta_ = nullptr; + } + key_set_ = key_set_.remove(DispatchKey::Named); + } + } + + void set_python_dispatch(bool k) { + if (k) { + key_set_ = key_set_.add(c10::python_ks); + } else { + key_set_ = key_set_ - c10::python_ks; + } + } + + bool is_python_dispatch() const { + return key_set_.has_all(c10::python_ks); + } + + /** + * Return the pointer to named tensor metadata. + */ + const c10::NamedTensorMetaInterface* named_tensor_meta() const { + if (!extra_meta_) { + return nullptr; + } + return extra_meta_->named_tensor_meta_.get(); + } + + c10::NamedTensorMetaInterface* named_tensor_meta() { + if (!extra_meta_) { + return nullptr; + } + return extra_meta_->named_tensor_meta_.get(); + } + + bool has_named_tensor_meta() const { + if (!extra_meta_) { + return false; + } + return extra_meta_->named_tensor_meta_ != nullptr; + } + + // NOTE [ TensorImpl Shallow-Copying ] + // + // TensorImpl shallow-copying is used when we want to have two Variables share + // the same tensor metadata (e.g. sizes / strides / storage pointer / + // storage_offset), but each with a different autograd history. Example call + // sites: + // + // 1. `var_detached = var.detach()` uses `shallow_copy_and_detach()` to create + // `var_detached` that shares the same tensor metadata with `var`, but with a + // completely new autograd history. + // 2. `var.set_data(tensor)` uses `shallow_copy_from()` to copy tensor + // metadata from `tensor` into `var`, while keeping `var`'s original + // AutogradMeta. + // + // Functions that shallow-copy a TensorImpl (such as + // `shallow_copy_and_detach()` / `shallow_copy_from()` / + // `copy_tensor_metadata()`) copy the tensor metadata fields (e.g. sizes / + // strides / storage pointer / storage_offset) by value. However, the + // following fields are not copied: + // + // 1. the AutogradMeta pointer, because it is unique for each Variable. + // 2. the version counter, because the destination TensorImpl's version + // counter is either set to the passed-in `version_counter` (in + // `shallow_copy_and_detach()` and `copy_tensor_metadata()`), or it is kept + // intact (in `shallow_copy_from()`). See NOTE [ Version Counter Sharing ] for + // details. + // + // In `shallow_copy_and_detach()` and `copy_tensor_metadata()`, the passed-in + // `allow_tensor_metadata_change` determines whether the TensorImpl + // shallow-copy allows changes to its metadata (e.g. sizes / strides / storage + // / storage_offset). See NOTE [ Metadata Change for a Detached Tensor ] for + // details. + // + // In `shallow_copy_from()`, we don't check the destination TensorImpl's + // `allow_tensor_metadata_change_`, because `shallow_copy_from()` is used for + // implementing functions such as `var.set_data(tensor)`, which changes + // `var`'s tensor metadata and expects its `allow_tensor_metadata_change_` to + // be ignored. + + /** + * One TensorImpl can be copied to another TensorImpl if they have the same + * DispatchKeySet. The only two special cases (for legacy reason) are: + * CPU is compatible with CUDA and SparseCPU is + * compatible with SparseCUDA. 
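+   *
+   * Schematically: a dense CPU impl and a dense CUDA impl count as compatible
+   * here (both are "dense"), while a dense impl and a sparse impl do not,
+   * even when they live on the same device.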
+ */ + inline bool has_compatible_shallow_copy_type(DispatchKeySet from) { + auto is_dense = [](DispatchKeySet ts) { + constexpr auto dense_backends = DispatchKeySet( + {BackendComponent::CPUBit, + BackendComponent::CUDABit, + BackendComponent::MPSBit, + BackendComponent::HIPBit, + BackendComponent::XPUBit, + BackendComponent::HPUBit}); + constexpr auto dense_k = DispatchKeySet(DispatchKey::Dense); + return ts.has_any(dense_k) && ts.has_any(dense_backends); + }; + auto is_sparse = [](DispatchKeySet ts) { + constexpr auto sparse_backends = DispatchKeySet( + {BackendComponent::CPUBit, + BackendComponent::CUDABit, + BackendComponent::HIPBit, + BackendComponent::XPUBit}); + constexpr auto sparse_k = DispatchKeySet(DispatchKey::Sparse); + return ts.has_any(sparse_k) && ts.has_any(sparse_backends); + }; + auto is_sparse_compressed = [](DispatchKeySet ts) { + constexpr auto sparse_compressed_k = + DispatchKeySet(DispatchKey::SparseCsr); + return ts.has_any(sparse_compressed_k); + }; + return (key_set_ == from) || (is_dense(key_set_) && is_dense(from)) || + (is_sparse(key_set_) && is_sparse(from)) || + (is_sparse_compressed(key_set_) && is_sparse_compressed(from)); + ; + } + + private: + template + c10::intrusive_ptr shallow_copy_and_detach_core( + VariableVersion&& version_counter, + bool allow_tensor_metadata_change) const; + + public: + /** + * Return a TensorImpl that is a shallow-copy of this TensorImpl. + * + * For usage of `version_counter` and `allow_tensor_metadata_change`, + * see NOTE [ TensorImpl Shallow-Copying ]. + */ + virtual c10::intrusive_ptr shallow_copy_and_detach( + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change) const; + + /** + * Return a TensorImpl that is a shallow-copy of this TensorImpl. + * + * For usage of `version_counter` and `allow_tensor_metadata_change`, + * see NOTE [ TensorImpl Shallow-Copying ]. + */ + virtual c10::intrusive_ptr shallow_copy_and_detach( + c10::VariableVersion&& version_counter, + bool allow_tensor_metadata_change) const; + + /** + * Shallow-copies data from another TensorImpl into this TensorImpl. + * + * For why this function doesn't check this TensorImpl's + * `allow_tensor_metadata_change_`, see NOTE [ TensorImpl Shallow-Copying ]. + */ + virtual void shallow_copy_from(const c10::intrusive_ptr& impl) { + copy_tensor_metadata( + /*src_impl=*/impl.get(), + /*dest_impl=*/this, + /*version_counter=*/version_counter(), + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change()); + } + + // Inference tensor doesn't have version counter, + // set_version_counter is no-op for them. + void set_version_counter(const c10::VariableVersion& version_counter) { + TORCH_CHECK( + !(is_inference() && version_counter.enabled()), + "Cannot set version_counter for inference tensor"); + version_counter_ = version_counter; + } + + void set_version_counter(c10::VariableVersion&& version_counter) { + TORCH_CHECK( + !(is_inference() && version_counter.enabled()), + "Cannot set version_counter for inference tensor"); + version_counter_ = std::move(version_counter); + } + + const c10::VariableVersion& version_counter() const noexcept { + return version_counter_; + } + + void bump_version() { + version_counter_.bump(); + } + + impl::PyObjectSlot* pyobj_slot() { + return &pyobj_slot_; + } + + const impl::PyObjectSlot* pyobj_slot() const { + return &pyobj_slot_; + } + + private: + // See NOTE [c10::optional operator usage in CUDA] + // We probably don't want to expose this publicly until + // the note is addressed. 
+ c10::optional device_opt() const { + return device_opt_; + } + + public: + /** + * The device type of a Tensor, e.g., DeviceType::CPU or DeviceType::CUDA. + */ + DeviceType device_type() const { + // TODO: A useful internal assert would be to show that device_opt_ is null + // only if you are an undefined tensor + TORCH_CHECK( + device_opt_.has_value(), + "device_type cannot be run on undefined Tensor"); + // See NOTE [c10::optional operator usage in CUDA] + return (*device_opt_).type(); + } + + /** + * @brief Extends the outer-most dimension of this tensor by num elements, + * preserving the existing data. + * + * The underlying data may be reallocated in order to accommodate the new + * elements, in which case this tensors' capacity is grown at a factor of + * growthPct. This ensures that Extend runs on an amortized O(1) time + * complexity. + * + * This op is auto-asynchronous if the underlying device (CUDA) supports it. + */ + void Extend(int64_t num, float growthPct); + + /** + * @brief Reserve space for the underlying tensor. + * + * This must be called after Resize(), since we only specify the first + * dimension This does not copy over the old data to the newly allocated space + */ + void ReserveSpace(int64_t outer_dim); + + /** + * @brief Resizes a tensor. + * + * Resize takes in a vector of ints specifying the dimensions of the tensor. + * You can pass in an empty vector to specify that it is a scalar (i.e. + * containing one single item). + * + * The underlying storage may be deleted after calling Resize: if the new + * shape leads to a different number of items in the tensor, the old memory + * is deleted and new memory will be allocated next time you call + * mutable_data(). However, if the shape is different but the total number of + * items is the same, the underlying storage is kept. + * + * This method respects caffe2_keep_on_shrink. Consult the internal logic + * of this method to see exactly under what circumstances this flag matters. + */ + template + void Resize(Ts... dim_source) { + bool size_changed = SetDims(dim_source...); + if (size_changed) { + HandleResize(); + } + } + + template + void Resize(const std::vector& dim_source) { + Resize(ArrayRef(dim_source)); + } + + /** + * Resizes the tensor without touching underlying storage. + * This requires the total size of the tensor to remains constant. + */ + void Reshape(const std::vector& dims); + + /** + * Release whatever memory the tensor was holding but keep size and type + * information. Subsequent call to mutable_data will trigger new memory + * allocation. + */ + void FreeMemory(); + + /** + * @brief Shares the data with another tensor. + * + * To share data between two tensors, the sizes of the two tensors must be + * equal already. The reason we do not implicitly do a Resize to make the two + * tensors have the same shape is that we want to allow tensors of different + * shapes but the same number of items to still be able to share data. This + * allows one to e.g. have a n-dimensional Tensor and a flattened version + * sharing the same underlying storage. + * + * The source tensor should already have its data allocated. + */ + // To be deprecated + void ShareData(const TensorImpl& src); + + void ShareExternalPointer( + DataPtr&& data_ptr, + const caffe2::TypeMeta data_type, + size_t size_bytes); + + /** + * Returns a mutable raw pointer of the underlying storage. Since we will need + * to know the type of the data for allocation, a TypeMeta object is passed in + * to specify the necessary information. 
This is conceptually equivalent of + * calling mutable_data() where the TypeMeta parameter meta is derived from + * the type T. This function differs from mutable_data() in the sense that + * the type T can be specified during runtime via the TypeMeta object. + * + * If the existing data does not match the desired type, it will be deleted + * and a new storage will be created. + */ + inline void* raw_mutable_data(const caffe2::TypeMeta& meta) { + // For 0-size tensors it's fine to return any pointer (including nullptr) + if (data_type_ == meta && storage_initialized()) { + return static_cast( + static_cast(storage_.mutable_data()) + + storage_offset_ * meta.itemsize()); + } else { + bool had_special_dtor = data_type_.placementDelete() != nullptr; + storage_offset_ = 0; + data_type_ = meta; + // NB: device is not changed + + // We can reuse the existing buffer if the current data does not have + // a special destructor and the new data doesn't have a special + // constructor. + if (numel_ == 0 || + (meta.placementNew() == nullptr && !had_special_dtor && + (storage_.nbytes() >= (numel_ * data_type_.itemsize())))) { + TORCH_INTERNAL_ASSERT( + storage_offset_ == 0); // because we just reallocated + return storage_.mutable_data(); + } + Allocator* allocator = storage_.allocator(); + // Storage might have nullptr allocator in rare cases, for example, if + // an external memory segment has been wrapped with Tensor and we don't + // know how to reallocate it. However, in order to preserve legacy C2 + // behavior, we allow reallocating the memory using default allocator. + if (allocator == nullptr) { + allocator = GetAllocator(storage_.device_type()); + } + if (meta.placementNew()) { + // For types that need placement new, we will call it, as well as + // making sure that when the data is freed, it calls the right + // destruction procedure. + auto size = numel_; + auto dtor = data_type_.placementDelete(); + auto data_ptr = allocator->allocate(numel_ * data_type_.itemsize()); + storage_.set_data_ptr_noswap(PlacementDeleteContext::makeDataPtr( + std::move(data_ptr), dtor, size, storage_.device())); + data_type_.placementNew()(storage_.mutable_data(), numel_); + } else { + // For fundamental type, new and delete is easier. + storage_.set_data_ptr_noswap( + allocator->allocate(numel_ * data_type_.itemsize())); + } + storage_.set_nbytes(numel_ * data_type_.itemsize()); + TORCH_INTERNAL_ASSERT( + storage_offset_ == 0); // because we just reallocated + device_opt_ = storage_.device(); + return storage_.mutable_data(); + } + } + + /** + * Returns a typed pointer of the underlying storage. + * + * For fundamental types, we reuse possible existing storage if there + * is sufficient capacity. + */ + template + inline T* mutable_data() { + if (storage_initialized() && data_type_.Match()) { + return static_cast(storage_.mutable_data()) + storage_offset_; + } + // Check it here statically - otherwise TypeMeta would throw the runtime + // error in attempt to invoke TypeMeta::ctor() + static_assert( + std::is_default_constructible::value, + "Tensor can't hold non-default-constructable types"); + return static_cast(raw_mutable_data(caffe2::TypeMeta::Make())); + } + + /** + * True if a tensor is storage initialized. 
A tensor may become + * storage UNINITIALIZED after a Resize() or FreeMemory() + */ + bool storage_initialized() const { + TORCH_CHECK( + has_storage(), + "cannot call storage_initialized on tensor that does not have storage"); + return storage_.data() || numel_ == 0; + } + + /** + * True if a tensor is dtype initialized. A tensor allocated with + * Caffe2-style constructors is dtype uninitialized until the + * first time mutable_data() is called. + */ + bool dtype_initialized() const noexcept { + return data_type_ != caffe2::TypeMeta(); + } + + void set_storage_keep_dtype(at::Storage storage) { + TORCH_CHECK( + allow_tensor_metadata_change(), + "set_storage ", + err_msg_tensor_metadata_change_not_allowed); + storage_ = std::move(storage); + device_opt_ = storage_.device(); + } + + void set_storage_and_dtype( + at::Storage storage, + const caffe2::TypeMeta data_type) { + set_storage_keep_dtype(std::move(storage)); + data_type_ = data_type; + } + + void empty_tensor_restride_symint(MemoryFormat memory_format); + + /** + * Set the strides of the tensor to match memory_format + * + * WARNING: This function doesn't rearrange data and assumes tensor is a + * memory contiguous + */ + void empty_tensor_restride(MemoryFormat memory_format) { + if (has_symbolic_sizes_strides_) { + empty_tensor_restride_symint(memory_format); + return; + } +#ifdef DEBUG + TORCH_INTERNAL_ASSERT( + compute_numel() == numel_, + "If you are seeing this error, that means empty_tensor_restride was " + "called before setting correct numel"); +#endif + switch (memory_format) { + case MemoryFormat::Contiguous: { + // dim_ is a virtual call, don't repeat it + const auto dim_ = dim(); + sizes_and_strides_.resize(dim_); + if (dim_ > 0) { + bool overflowed = false; + const auto last_idx = dim_ - 1; + sizes_and_strides_.stride_at_unchecked(last_idx) = 1; + for (auto i = last_idx - 1; i >= 0; --i) { + overflowed |= c10::mul_overflows( + sizes_and_strides_.stride_at_unchecked(i + 1), + std::max( + sizes_and_strides_.size_at_unchecked(i + 1), 1), + std::addressof(sizes_and_strides_.stride_at_unchecked(i))); + } + TORCH_CHECK(!overflowed, "Stride calculation overflowed"); + } + break; + } + case MemoryFormat::ChannelsLast: { + TORCH_CHECK( + dim() == 4, "required rank 4 tensor to use channels_last format"); + set_sizes_and_strides(sizes(), get_channels_last_strides_2d(sizes())); + break; + } + case MemoryFormat::ChannelsLast3d: { + TORCH_CHECK( + dim() == 5, + "required rank 5 tensor to use channels_last_3d format"); + set_sizes_and_strides(sizes(), get_channels_last_strides_3d(sizes())); + break; + } + case MemoryFormat::Preserve: + TORCH_CHECK(false, "unsupported memory format ", memory_format); + // Cleaning warning messages, no need to break as TORCH_CHECK(false) + // terminates flow. 
+ // break; + case MemoryFormat::NumOptions: + TORCH_INTERNAL_ASSERT(false, "invalid memory format ", memory_format); + } + // recompute contiguous flag, as currently NHWC/NCHW flags are not mutually + // exclusive see #24090 + refresh_contiguous(); + } + + bool is_strides_like(at::MemoryFormat memory_format) const { + if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomStrides))) { + return is_strides_like_custom(memory_format); + } + return is_strides_like_default(memory_format); + } + + bool is_strides_like_channels_last() const { + return is_strides_like(at::MemoryFormat::ChannelsLast); + } + + bool is_strides_like_channels_last_3d() const { + return is_strides_like(at::MemoryFormat::ChannelsLast3d); + } + + bool is_non_overlapping_and_dense() const { + if (C10_UNLIKELY(matches_policy(SizesStridesPolicy::CustomStrides))) { + return is_non_overlapping_and_dense_custom(); + } + return is_non_overlapping_and_dense_default(); + } + + bool has_symbolic_sizes_strides() const { + return has_symbolic_sizes_strides_; + } + + private: + void HandleResize(); + + // The Caffe2 Resize() method supports being called both as Resize({2,2}) as + // well as variadic with Resize(2, 2). These overloads provide all of the + // supported calling configurations, while being overloads (and not templates) + // so that implicit conversions still work. + // + // SetDims on ArrayRef is internally implemented as a template, so we can + // handle both ArrayRefs of different types (there are some uses of + // Resize in Caffe2 which pass in int, not int64_t.) + + template < + typename T, + typename = typename std::enable_if_t>> + bool SetDimsTemplate(ArrayRef src) { + TORCH_CHECK( + !has_symbolic_sizes_strides_, + "SetDims() called on tensor with symbolic shape") + + auto old_numel = numel_; + sizes_and_strides_.resize(src.size()); + int64_t new_numel = 1; + for (const auto i : c10::irange(src.size())) { + new_numel *= src[i]; + sizes_and_strides_.size_at_unchecked(i) = src[i]; + } + numel_ = new_numel; + empty_tensor_restride(MemoryFormat::Contiguous); + return numel_ != old_numel; + } + + bool SetDims(ArrayRef s) { + return SetDimsTemplate(s); + } + + bool SetDims(ArrayRef s) { + return SetDimsTemplate(s); + } + + bool SetDims(ArrayRef s) { + return SetDimsTemplate(s); + } + + bool SetDims() { + return SetDims(IntArrayRef{}); + } + + bool SetDims(const int64_t d0) { + return SetDims(IntArrayRef{d0}); + } + + bool SetDims(const int64_t d0, const int64_t d1) { + return SetDims(IntArrayRef{d0, d1}); + } + + bool SetDims(const int64_t d0, const int64_t d1, const int64_t d2) { + return SetDims(IntArrayRef{d0, d1, d2}); + } + + bool SetDims( + const int64_t d0, + const int64_t d1, + const int64_t d2, + const int64_t d3) { + return SetDims(IntArrayRef{d0, d1, d2, d3}); + } + + /** + * Compute the number of elements based on the sizes of a tensor. + */ + // NB: This is ONLY called when sizes_and_strides_ is used directly; if + // we are virtualizing, then numel calls are virtualized as well, and this + // should never get called + int64_t compute_numel() const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!has_symbolic_sizes_strides_); +#if C10_HAS_BUILTIN_OVERFLOW() && !defined(C10_MOBILE) + // Use overflow checks if supported by the compiler + return safe_compute_numel(); +#else + return c10::multiply_integers(sizes_and_strides_.sizes_arrayref()); +#endif + } + + /** + * Compute the number of elements based on the sizes of a + * tensor. 
Catches integer overflow that may occur when a tensor + * using a sparse layout has multiple dimensions with large sizes. + */ + int64_t safe_compute_numel() const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!has_symbolic_sizes_strides_); + uint64_t n = 1; + bool overflows = + c10::safe_multiplies_u64(sizes_and_strides_.sizes_arrayref(), &n); + constexpr auto numel_max = std::min( + static_cast(std::numeric_limits::max()), + static_cast(std::numeric_limits::max())); + + overflows |= (n > numel_max); + TORCH_CHECK(!overflows, "numel: integer multiplication overflow"); + return static_cast(n); + } + + /** + * Compute whether or not a tensor is contiguous based on the sizes and + * strides of a tensor. + */ + bool compute_contiguous(identity) const; + + bool compute_channels_last_contiguous_2d(identity) const; + + bool compute_channels_last_contiguous_3d(identity) const; + + bool compute_strides_like_channels_last_2d(identity) const; + + bool compute_strides_like_channels_last_3d(identity) const; + + bool compute_non_overlapping_and_dense(identity) const; + + protected: + /** + * Recompute the cached numel of a tensor. Call this if you modify + * sizes. + * + * For tensors with sparse layouts, use safe_refresh_numel() instead + * because it will catch integer overflow that may occur for tensors + * with sparse layouts and large dimensions. + * + * NB: We may uselessly recompute cached numel even in situations where + * it is completely never used (e.g., if CustomSizes for Python). However, + * we still must keep it up to date in case the Python overload + * returns None (in which case we will consult the field here). This also + * implies that sizes/strides will never be complete garbage; in the + * very worst case scenario, it will reflect a 1-dim zero size tensor. + */ + void refresh_numel() { + if (has_symbolic_sizes_strides_) { + symbolic_shape_meta().refresh_numel(); + } else { + numel_ = compute_numel(); + } + } + + /** + * Recompute the cached numel of a tensor. Call this if you modify + * sizes. Use only for tensors with sparse layouts because only + * sparse tensor are likely to have sizes that may lead to integer + * overflow when computing numel. + */ + void safe_refresh_numel() { + if (has_symbolic_sizes_strides_) { + // NB: sym numel is done with symbolic integers, which handle overflow + // checking + symbolic_shape_meta().refresh_numel(); + } else { + numel_ = safe_compute_numel(); + } + } + + private: + // NB: the TypeId argument prevents confusion where you pass a true/false + // literal and pick the wrong overload + + void _set_is_contiguous(identity, bool b) { + is_contiguous_ = b; + } + + void _set_is_channels_last_contiguous(identity, bool b) { + is_channels_last_contiguous_ = b; + } + + void _set_is_channels_last_3d_contiguous(identity, bool b) { + is_channels_last_3d_contiguous_ = b; + } + + void _set_is_channels_last(identity, bool b) { + is_channels_last_ = b; + } + + void _set_is_channels_last_3d(identity, bool b) { + is_channels_last_3d_ = b; + } + + void _set_is_non_overlapping_and_dense(identity, bool b) { + is_non_overlapping_and_dense_ = b; + } + + // These are little wrappers over the real compute_ functions that + // can make use of other contiguity fields to short circuit. 
+ + bool compute_is_non_overlapping_and_dense_dim4(identity type_id) { + return is_contiguous_ || is_channels_last_contiguous_ || + compute_non_overlapping_and_dense(type_id); + } + + bool compute_channels_last_contiguous_3d_dim5(identity type_id) { + return !is_channels_last_contiguous_ && + compute_channels_last_contiguous_3d(type_id); + } + + bool compute_channels_last_2d_dim5(identity type_id) { + return !is_channels_last_3d_contiguous_ && + compute_strides_like_channels_last_2d(type_id); + } + + bool compute_channels_last_3d_dim5(identity type_id) { + return !is_channels_last_ && compute_strides_like_channels_last_3d(type_id); + } + + bool compute_is_non_overlapping_and_dense_dim5(identity type_id) { + return is_contiguous_ || is_channels_last_contiguous_ || + is_channels_last_3d_contiguous_ || + compute_non_overlapping_and_dense(type_id); + } + + bool compute_is_non_overlapping_and_dense_anydim(identity type_id) { + return is_contiguous_ || compute_non_overlapping_and_dense(type_id); + } + + template + void _refresh_contiguous() { + auto type_id = identity(); + // Note: + // Dim 0, 1, 2 will never be a channels last 2d/3d format + // Dim 3+ is possibly be a channels last 2d format (Dim 4 only at this + // point) Dim 4+ is possibly be a channels last 3d format (Dim 5 only at + // this point) + switch (dim()) { + case 4: { + _set_is_contiguous(type_id, compute_contiguous(type_id)); + _set_is_channels_last_contiguous( + type_id, compute_channels_last_contiguous_2d(type_id)); + _set_is_channels_last_3d_contiguous(type_id, false); + _set_is_channels_last( + type_id, compute_strides_like_channels_last_2d(type_id)); + _set_is_channels_last_3d(type_id, false); + _set_is_non_overlapping_and_dense( + type_id, compute_is_non_overlapping_and_dense_dim4(type_id)); + break; + } + case 5: { + _set_is_contiguous(type_id, compute_contiguous(type_id)); + _set_is_channels_last_contiguous( + type_id, compute_channels_last_contiguous_2d(type_id)); + _set_is_channels_last_3d_contiguous( + type_id, compute_channels_last_contiguous_3d_dim5(type_id)); + _set_is_channels_last(type_id, compute_channels_last_2d_dim5(type_id)); + _set_is_channels_last_3d( + type_id, compute_channels_last_3d_dim5(type_id)); + _set_is_non_overlapping_and_dense( + type_id, compute_is_non_overlapping_and_dense_dim5(type_id)); + break; + } + default: + // is_channels_last_ and is_channels_last_3d_ are suggested + // memory_format. Being channels_last_contiguous doesn't necessarily + // mean the tensor is strided like channels_last: for strides on channel + // dimension could suggest desired memory_layout, but it doesn't affect + // memory storage + _set_is_contiguous(type_id, compute_contiguous(type_id)); + _set_is_channels_last_contiguous(type_id, false); + _set_is_channels_last_3d_contiguous(type_id, false); + _set_is_channels_last(type_id, false); + _set_is_channels_last_3d(type_id, false); + _set_is_non_overlapping_and_dense( + type_id, compute_is_non_overlapping_and_dense_anydim(type_id)); + break; + } + } + + protected: + /** + * Recompute the cached contiguity of a tensor. Call this if you modify sizes + * or strides. + */ + void refresh_contiguous() { + if (has_symbolic_sizes_strides_) { + symbolic_shape_meta().refresh_contiguous(); + } else { + _refresh_contiguous(); + } + } + + /** + * Copy the tensor metadata fields (e.g. sizes / strides / storage pointer / + * storage_offset) from one TensorImpl to another TensorImpl. 
+ * + * For usage of `version_counter` and `allow_tensor_metadata_change`, see NOTE + * [ TensorImpl Shallow-Copying ]. + */ + static void copy_tensor_metadata( + const TensorImpl* src_impl, + TensorImpl* dest_impl, + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change); + + /** + * Copy the tensor metadata fields (e.g. sizes / strides / storage pointer / + * storage_offset) from one TensorImpl to another TensorImpl. + * + * For usage of `version_counter` and `allow_tensor_metadata_change`, see NOTE + * [ TensorImpl Shallow-Copying ]. + */ + static void copy_tensor_metadata( + const TensorImpl* src_impl, + TensorImpl* dest_impl, + c10::VariableVersion&& version_counter, + bool allow_tensor_metadata_change); + + private: + static void copy_tensor_metadata_except_version_counter( + const TensorImpl* src_impl, + TensorImpl* dest_impl, + bool allow_tensor_metadata_change); + + protected: + // Error message to show when the user tries to change tensor metadata on + // Tensor created from .data or .detach(). + // + // See NOTE [ Metadata Change for a Detached Tensor ] for details. + static const char* const err_msg_tensor_metadata_change_not_allowed; + + static void copy_generic_tensor_metadata( + const TensorImpl* src_impl, + TensorImpl* dest_impl); + + public: + void set_storage_access_should_throw() { + storage_access_should_throw_ = true; + } + + public: + void set_custom_sizes_strides(SizesStridesPolicy policy) { + custom_sizes_strides_ = static_cast(policy); + refresh_sizes_strides_policy(); + } + + void set_python_custom_sizes_strides(SizesStridesPolicy policy) { + python_custom_sizes_strides_ = static_cast(policy); + refresh_sizes_strides_policy(); + } + + void set_custom_device(bool custom_device) { + custom_device_ = custom_device; + refresh_device_policy(); + } + + void set_custom_layout(bool custom_layout) { + custom_layout_ = custom_layout; + refresh_layout_policy(); + } + + void set_python_custom_device(bool custom_device) { + python_custom_device_ = custom_device; + refresh_device_policy(); + } + + void set_python_custom_layout(bool custom_layout) { + python_custom_layout_ = custom_layout; + refresh_layout_policy(); + } + + protected: + void refresh_sizes_strides_policy() { + if (has_symbolic_sizes_strides_) { + sizes_strides_policy_ = + static_cast(SizesStridesPolicy::CustomSizes); + } else { + sizes_strides_policy_ = + std::max(custom_sizes_strides_, python_custom_sizes_strides_); + } + } + + void refresh_device_policy() { + device_policy_ = custom_device_ || python_custom_device_; + } + + void refresh_layout_policy() { + layout_policy_ = custom_layout_ || python_custom_layout_; + } + + protected: + Storage storage_; + + private: + // This pointer points to an AutogradMeta struct that stores autograd-specific + // fields (such as grad_ / grad_fn_ / grad_accumulator_). This pointer always + // has unique ownership (meaning only one TensorImpl can own it at a time). + // + // autograd_meta_ can be nullptr, as an optimization. When this occurs, it is + // equivalent to having an autograd_meta_ pointing to a default constructed + // AutogradMeta; intuitively, tensors which don't require grad will have this + // field set to null. + // + // This means accessors on autograd_meta_ have to be careful to test if they + // got a nullptr, and handle default behavior appropriately in that case. 
+ // + // Note that we don't enforce the invariant that if the AutogradMeta is + // default constructed, it is nullptr (to do this, we'd have to continuously + // check if an AutogradMeta became, by mutation, equal to the default + // constructed form. (This might be useful, but it seems rare enough that + // a requires_grad=True variable will turn back into the requires_grad=False + // version.) So there are three representable states: + // + // 1. autograd_meta_ == nullptr + // 2. autograd_meta_ is default constructed (semantically, same as (1)) + // 3. autograd_meta_ has nontrivial information content + // + std::unique_ptr autograd_meta_ = nullptr; + + protected: + std::unique_ptr extra_meta_ = nullptr; + + c10::VariableVersion version_counter_; + + impl::PyObjectSlot pyobj_slot_; + + c10::impl::SizesAndStrides sizes_and_strides_; + + int64_t storage_offset_ = 0; + // If sizes and strides are empty, the numel is 1!! However, most of the + // time, we will immediately set sizes to {0} and reset numel to 0. + // (Can't do that in the default initializers, because there's no way to + // spell "allocate a one-element array" for strides_). + int64_t numel_ = 1; + + // INVARIANT: When storage is non-null, this type meta must + // agree with the type meta in storage + caffe2::TypeMeta data_type_; + + // NOTE [c10::optional operator usage in CUDA] + // Our optional definition doesn't compile in .cu file if `value()` or + // `operator->` are used. Instead, we always use `operator*`. + // See https://github.com/pytorch/pytorch/issues/18496 for more info. + // If this is too burdensome to maintain, we can just + // manually implement this with an additional bool. + + // INVARIANT: When storage is non-null, this Device must + // agree with the type meta in storage. + // + // INVARIANT: device_opt_ is only nullopt for undefined tensors + // (which do not have a device.) + c10::optional device_opt_; + + // default member initializers for bit-fields only available with -std=c++2a + // or -std=gnu++2a + inline void init_bitfields() { + is_contiguous_ = true; + is_channels_last_ = false; + is_channels_last_contiguous_ = false; + is_channels_last_3d_ = false; + is_channels_last_3d_contiguous_ = false; + is_non_overlapping_and_dense_ = true; + is_wrapped_number_ = false; + allow_tensor_metadata_change_ = true; + reserved_ = false; + sizes_strides_policy_ = static_cast(SizesStridesPolicy::Default); + custom_sizes_strides_ = static_cast(SizesStridesPolicy::Default); + python_custom_sizes_strides_ = + static_cast(SizesStridesPolicy::Default); + python_custom_device_ = false; + python_custom_layout_ = false; + custom_device_ = false; + custom_layout_ = false; + device_policy_ = false; + layout_policy_ = false; + storage_access_should_throw_ = false; + has_symbolic_sizes_strides_ = false; + } + + // Tensor is contiguous + bool is_contiguous_ : 1; + + // Tensor is a subclass that does not permit storage access. + bool storage_access_should_throw_ : 1; + + // Tensor is stored in the channels last 2d memory format, when dimensions + // order is (N)CHW and C-strides < W-strides < H-strides (< N-strides) + // (If size of any dimension is equal to 1, this dimension strides value + // is not taken into account). + bool is_channels_last_ : 1; + + // Channels last contiguous tensor is channel last tensor which occupies + // contiguous memory block. 
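+  // [Illustrative note, not part of the upstream header] Concretely, a 4-d
+  // tensor with sizes (N, C, H, W) = (1, 3, 4, 4) is channels-last
+  // contiguous when its strides are (48, 1, 12, 3): C has the smallest
+  // stride, then W, then H, then N, and the 48 elements occupy one
+  // contiguous block. The same shape in default contiguous (NCHW) layout
+  // has strides (48, 16, 4, 1).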
+ bool is_channels_last_contiguous_ : 1; + + // Tensor is stored in the channels last 3d memory format, when dimensions + // order is (N)CDHW and C-strides < W-strides < H-strides < D - strides (< + // N-strides) (If size of any dimension is equal to 1, this dimension strides + // value is not taken into account). + bool is_channels_last_3d_ : 1; + + // Channels last 3d contiguous tensor is channel last 3d tensor which occupies + // contiguous memory block. + bool is_channels_last_3d_contiguous_ : 1; + + // Dense tensor is the tensor that store values in a contiguous block of + // memory. Non-overlapping tensor is the tensor in which elements occupy + // individual non-repetitive memory. + bool is_non_overlapping_and_dense_ : 1; + + bool is_wrapped_number_ : 1; + + // NOTE [ Metadata Change for a Detached Tensor ] + // + // Normally, a user is allowed to change the tensor metadata + // (e.g. sizes / strides / storage / storage_offset) of a tensor. + // However, if the tensor is created by `t1_detached = t1.data` in Python + // or `t1_detached = t1.detach()` in Python/C++, those changes to the + // tensor metadata of `t1_detached` will not be propagated back to the + // original tensor `t1`. In order to make such changes explicitly illegal, + // we created the `allow_tensor_metadata_change_` flag, to prevent users + // from changing metadata of the detached tensor and expecting the original + // tensor to also be updated. + // + // NOTE: For a full list of tensor metadata fields, please see + // `copy_tensor_metadata()` in TensorImpl and its subclasses to find + // which fields are copied by value. + bool allow_tensor_metadata_change_ : 1; + + // we decide to keep reserved_ and it will + // live in Tensor after the split + // The logic is that if Extend() or ReserveSpace() were ever called, + // then subsequent Resize()s will not free up Storage. + bool reserved_ : 1; + + // Call _custom() virtual methods for + // strides()/is_contiguous()/sizes()/dim()/numel() + // This is a combination of sizes_strides_custom_dispatch_ + // and has_symbolic_sizes_strides_ + uint8_t sizes_strides_policy_ : 2; + + // Whether or not sizes_and_strides_ contains a symbolic value. + bool has_symbolic_sizes_strides_ : 1; + + // Call _custom() virtual method for + // strides()/is_contiguous()/sizes()/dim()/numel() + uint8_t custom_sizes_strides_ : 2; + + // Combo of custom_ and python_custom_ + bool device_policy_ : 1; + bool layout_policy_ : 1; + + // Call _custom() virtual method for device() + bool custom_device_ : 1; + + // Call _custom() virtual method for layout() + bool custom_layout_ : 1; + + // Call into Python for + // strides()/is_contiguous()/sizes()/dim()/numel() + uint8_t python_custom_sizes_strides_ : 2; + + // Call into Python for device() + bool python_custom_device_ : 1; + + // Call into Python for layout() + bool python_custom_layout_ : 1; + + // The set of DispatchKeys which describe this tensor. NB: this + // does NOT include Autograd (historically, it did, but + // not anymore!) 
+ // + // INVARIANT: extra_meta_->named_tensor_meta_ != nullptr <==> + // key_set_.has(DispatchKey::Named) + DispatchKeySet key_set_; + + private: + // C10_TensorImpl_Size_Check_Dummy_Class needs to be friends with + // TensorImpl so it can inspect the size of private fields + template < + size_t cplusplus, + size_t clang_ver_major, + size_t gcc_ver, + size_t gcc_ver_minor, + size_t nvcc, + size_t cuda_version, + size_t cuda_version_major, + size_t ptr_size> + friend class C10_TensorImpl_Size_Check_Dummy_Class; +}; + +// Note [TensorImpl size constraints] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Changed the size of TensorImpl? If the size went down, good for +// you! Adjust the documentation below and the expected size. +// Did it go up? Read on... +// +// Struct size matters. In some production systems at Facebook, we have +// 400M live tensors during a training run. Do the math: every 64-bit +// word you add to Tensor is an extra 3.2 gigabytes in RAM. +// +// If you are a Facebook employee, you can check if the run in question +// has tipped you over the point using the command here: +// https://fburl.com/q5enpv98 +// +// For reference, we OOMed at 160 bytes (20 words) per TensorImpl. +// This is not counting overhead from strides out-of-line allocation and +// StorageImpl space and this is from before we inlined sizes and strides +// directly into TensorImpl as SmallVectors. +// +// Our memory usage on 32-bit systems is suboptimal, but we're not checking +// for it at the moment (to help avoid rage inducing cycles when the +// 32-bit number is wrong). +// +// Current breakdown: +// +// vtable pointer +// strong refcount TODO: pack these into one word +// weak refcount +// storage pointer +// autograd metadata pointer +// named tensor metadata pointer +// version counter pointer +// PyObjectSlot +// SizesAndStrides size/pointer +// SizesAndStrides sizes (pre-allocated 0) +// SizesAndStrides sizes (pre-allocated 1) +// SizesAndStrides sizes (pre-allocated 2) +// SizesAndStrides sizes (pre-allocated 3) +// SizesAndStrides sizes (pre-allocated 4) +// SizesAndStrides strides (pre-allocated 0) +// SizesAndStrides strides (pre-allocated 1) +// SizesAndStrides strides (pre-allocated 2) +// SizesAndStrides strides (pre-allocated 3) +// SizesAndStrides strides (pre-allocated 4) +// storage offset +// numel +// data type, device, is_contiguous, storage_access_should_throw_, bitfields +// DispatchKeySet +// + +// Various preprocessor macros we use to check that the +// TensorImpl size hasn't changed unexpectedly. We undef +// these later. +#ifndef __NVCC__ +#define C10_NVCC 0 +#else +#define C10_NVCC __NVCC__ +#endif + +#ifndef __CUDA_VER_MAJOR__ +#define C10_CUDA_VERSION_MAJOR 0 +#else +#define C10_CUDA_VERSION_MAJOR __CUDA_VER_MAJOR__ +#endif + +#ifndef CUDA_VERSION +#define C10_CUDA_VERSION 0 +#else +#define C10_CUDA_VERSION CUDA_VERSION +#endif + +#ifndef __clang_major__ +#define C10_CLANG_MAJOR_VERSION 0 +#else +#define C10_CLANG_MAJOR_VERSION __clang_major__ +#endif + +#ifndef __GNUC__ +#define C10_GCC_VERSION 0 +#else +#define C10_GCC_VERSION __GNUC__ +#endif + +#ifndef __GNUC_MINOR__ +#define C10_GCC_VERSION_MINOR 0 +#else +#define C10_GCC_VERSION_MINOR __GNUC_MINOR__ +#endif + +// We use a templatized class to both contain the logic of checking the sizes +// as well as to provide compile-time information that might be useful in +// figuring out why sizes may have changed. 
+// All the compile time information is given by the template fields that are +// always printed by the compiler when the static_assert fails. +template < + size_t cplusplus = __cplusplus, + size_t clang_ver_major = C10_CLANG_MAJOR_VERSION, + size_t gcc_ver = C10_GCC_VERSION, + size_t gcc_ver_minor = C10_GCC_VERSION_MINOR, + size_t nvcc = C10_NVCC, + size_t cuda_version = C10_CUDA_VERSION, + size_t cuda_version_major = C10_CUDA_VERSION_MAJOR, + size_t ptr_size = sizeof(void*)> +class C10_TensorImpl_Size_Check_Dummy_Class : private TensorImpl { + // Names of (non-bitfield) fields in TensorImpl; used to provide + // compile-time info about fields whose size changes unexpectedly. + enum class FieldNameEnum { + storage_, + autograd_meta_, + extra_meta_, + version_counter_, + pyobj_slot_, + sizes_and_strides_, + storage_offset_, + numel_, + data_type_, + device_opt_, + key_set_, + TOTAL_SIZE + }; + + // Provides compile-time equality check that reveals what numbers + // were used and on which quantity + template + constexpr static bool are_equal() { + static_assert( + Actual == Expected, + "Actual and Expected sizes of a field did not match!"); + return true; + } + + // Provides compile-time <= check that reveals what numbers + // were used and on which quantity + template + constexpr static bool is_le() { + static_assert( + Actual <= Expected, + "Actual and Expected sizes of a field did not match!"); + return true; + } + + public: + // Compile-time check that TensorImpl field sizes are as expected + // + // Observed total sizes and associated versions + // If you find a flag that predicts when unique_ptr has 16 bytes + // on 64-bit systems or when sizes_and_strides_ is 84 vs 88 bytes + // on 32-bit systems you get a cookie! + // Length | LLVM | GCC | C++ | CUDA + // 192 | ? | 11.2 | 201703 | 11040 + // 208 | ? | 11.2 | 201703 | 11040 + // 208 | ? | 11.2 | 201402 | 11040 + // 192 | ? | 11.2 | 201402 | 11040 + // 160 | 12 | 4.2 | 201703 | 0 + // + // To keep things clean, we split on systems here. + +#if UINTPTR_MAX == 0xFFFFFFFF + // This is a 32-bit system + static constexpr bool check_sizes() { + constexpr size_t tsize = 20 * sizeof(int64_t); + + // clang-format off + are_equal(); + are_equal(); + are_equal(); + are_equal(); + are_equal(); + is_le(); + are_equal(); + are_equal(); + are_equal(); + are_equal(); + are_equal(); + is_le(); + // clang-format on + + return true; + } +#else + // This is a 64-bit system + static constexpr bool check_sizes() { + constexpr size_t tsize = 26 * sizeof(int64_t); + + // clang-format off + are_equal(); + // On some systems involving NVCC the size of unique_ptr is 16 bytes. We haven't + // figured out how to detect those via macro preprocessors yet, so we use <= + // comparisons for the relevant fields. + is_le(); + is_le(); + are_equal(); + are_equal(); + are_equal(); + are_equal(); + are_equal(); + are_equal(); + are_equal(); + are_equal(); + is_le(); + // clang-format on + + return true; + } +#endif +}; + +// We use a class to encapsulate size-checking logic with +// templates to capture sizes and flags. We call this within +// a static assert to prove there is no run-time behaviour. +// Since the methods we call return either true or fail their +// own static_asserts, we should never see the error messages +// below. We have to provide it though for c++ <17. 
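+// [Illustrative sketch, not part of the upstream header] The same
+// "reveal the inputs in the error" idiom in miniature: putting the values
+// under test into template parameters makes the compiler print them when
+// the static_assert fires. The names fits/Probe are hypothetical:
+//
+//   template <size_t Actual, size_t Budget>
+//   constexpr bool fits() {
+//     static_assert(Actual <= Budget, "struct grew past its budget");
+//     return true;
+//   }
+//   struct Probe { void* p; int64_t a; };
+//   static_assert(fits<sizeof(Probe), 2 * sizeof(int64_t)>(), "");
+//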
+static_assert( + C10_TensorImpl_Size_Check_Dummy_Class<>::check_sizes(), + "You should not see this message."); + +// Clean up after ourselves +#undef C10_NVCC +#undef C10_CUDA_VERSION_MAJOR +#undef C10_CUDA_VERSION +#undef C10_CLANG_MAJOR_VERSION +#undef C10_GCC_VERSION +#undef C10_GCC_VERSION_MINOR + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/TensorOptions.h b/MLPY/Lib/site-packages/torch/include/c10/core/TensorOptions.h new file mode 100644 index 0000000000000000000000000000000000000000..476af8d63ede96b15f013ae177230d5e492a21ee --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/TensorOptions.h @@ -0,0 +1,787 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace c10 { + +DispatchKey computeDispatchKey( + c10::optional dtype, + c10::optional layout, + c10::optional device); + +inline ScalarType dtype_or_default(c10::optional dtype) { + return value_or_else(dtype, [] { return get_default_dtype_as_scalartype(); }); +} + +inline caffe2::TypeMeta dtype_or_default( + c10::optional dtype) { + return value_or_else(dtype, [] { return get_default_dtype(); }); +} + +inline Layout layout_or_default(c10::optional layout) { + return layout.value_or(kStrided); +} + +inline Device device_or_default(c10::optional device) { + return value_or_else(device, [] { return Device(kCPU); }); +} + +inline bool pinned_memory_or_default(c10::optional pinned_memory) { + return pinned_memory.value_or(false); +} + +/// A class to encapsulate construction axes of an Tensor. TensorOptions was +/// designed to support the Python style API for specifying construction options +/// on factory functions, e.g., +/// +/// torch.zeros(2, 3, dtype=torch.int32) +/// +/// Because C++ doesn't natively support keyword arguments, there must be +/// another way of specifying keyword-like arguments. TensorOptions is a +/// builder class which can be used to construct this "dictionary" of keyword +/// arguments: functions which support TensorOptions conventionally take this +/// argument optionally as their last argument. +/// +/// WARNING: In PyTorch, there are `torch::` variants of factory functions, +/// e.g., torch::zeros for at::zeros. These return Variables (while the +/// stock ATen functions return plain Tensors). If you mix these functions +/// up, you WILL BE SAD. +/// +/// Rather than use the constructor of this class directly, you should prefer to +/// use the constructor functions, and then chain setter methods on top of them. +/// +/// at::device(at::kCUDA).dtype(kInt) +/// at::dtype(at::kInt) +/// +/// Additionally, anywhere a TensorOptions is expected, you can directly +/// pass at::kCUDA / at::kInt, and it will implicitly convert to a +/// TensorOptions. +/// +/// Here are some recommended ways to create a 2x2 tensor of zeros +/// with certain properties. 
These all *implicitly* make use of +/// TensorOptions, even if they don't mention the class explicitly: +/// +/// at::zeros({2,2}, at::kCUDA); +/// at::zeros({2,2}, at::kLong); +/// at::zeros({2,2}, at::device(at::kCUDA).dtype(at::kLong())); +/// at::zeros({2,2}, at::device({at::kCUDA, 1})); // place on device 1 +/// at::zeros({2,2}, at::requires_grad()); +/// + +/// NOTE [ TensorOptions Constructors ] +/// +/// TensorOptions is like a dictionary with entries from the set: +/// {requires_grad, device, dtype, layout}, where each entry may be +/// unspecified (i.e., is optional). It is used to specify the properties of +/// tensors in many places both in C++ internal and API, e.g., tensor factory +/// methods like `at::empty({10}, options)`, tensor conversions like +/// `tensor.to(...)`, etc. +/// +/// To provide a simple API that is consistent with Python, where one can do +/// `torch.empty(sizes, X)` with `X` being a `torch.device`, `torch.dtype`, or a +/// `torch.layout`, we want TensorOptions to be implicitly convertible from +/// `ScalarType dtype`, `Layout layout` and `Device device`. Therefore, we have +/// three implicit constructors from each of these three types. +/// +/// This is sufficient for `ScalarType` and `Layout` as they are simple Enum +/// classes. However, `Device` is an ordinary class with implicit constructors +/// `Device(DeviceType, DeviceIndex = -1)` and `Device(std::string)` to be +/// consistent with Python API, where strings are treated as equivalent with a +/// `torch.device` object (e.g., "cuda:1" can be passed to everywhere a +/// `torch.device("cuda:1")` is accepted). To support the syntax +/// `at::empty({10}, {kCUDA, 1})` and `tensor.to(kCUDA)`, we need to make sure +/// that `TensorOptions` is implicitly constructible with any arguments that a +/// `Device` can constructed from. So we have, +/// +/// /* implicit */ TensorOptions(T&& device) : TensorOptions() { +/// this->set_device(device); +/// } +/// +/// template ::value>> +/// /* implicit */ TensorOptions(Args&&... args) +/// : TensorOptions(Device(std::forward(args)...)) {} +/// +/// +/// But this will be problematic. Consider this: `TensorOptions({kCUDA, 1})`. +/// Compiler will complain about ambiguity between the copy constructor and the +/// `Device` constructor because `{kCUDA, 1}` can be converted to both a +/// `TensorOption` and a `Device`. +/// +/// To get around this, we templatize the `Device` constructor. Since overload +/// resolution is done before template resolution, our problem is solved. + +DispatchKey computeDispatchKey( + optional dtype, + optional layout, + optional device); + +struct C10_API TensorOptions { + TensorOptions() + : requires_grad_(false), + pinned_memory_(false), + has_device_(false), + has_dtype_(false), + has_layout_(false), + has_requires_grad_(false), + has_pinned_memory_(false), + has_memory_format_(false) {} + + /// Constructs a `TensorOptions` object with the given layout. + /* implicit */ TensorOptions(Layout layout) : TensorOptions() { + this->set_layout(layout); + } + + /// Constructs a `TensorOptions` object with the given device. + /// See NOTE [ TensorOptions Constructors ] on why this is templatized. + template < + typename T, + typename = std::enable_if_t, Device>>> + /* implicit */ TensorOptions(T&& device) : TensorOptions() { + this->set_device(std::forward(device)); + } + + /// Constructs a `TensorOptions` object from arguments allowed in `Device` + /// constructors. + /// + /// See NOTE [ TensorOptions Constructors ]. 
+ /// + /// NB: Ideally we only allow implicit constructors here. But there is no easy + /// way to detect them. So we have this one that allows explicit + /// constructors too. + template < + typename... Args, + typename = std::enable_if_t>> + /* implicit */ TensorOptions(Args&&... args) + : TensorOptions(Device(std::forward(args)...)) {} + + /// Constructs a `TensorOptions` object with the given dtype. + /* implicit */ TensorOptions(caffe2::TypeMeta dtype) : TensorOptions() { + this->set_dtype(dtype); + } + + /// legacy constructor to support ScalarType + /* implicit */ TensorOptions(ScalarType dtype) : TensorOptions() { + this->set_dtype(dtype); + } + + /// Constructs a `TensorOptions` object with the given memory format. + /* implicit */ TensorOptions(MemoryFormat memory_format) : TensorOptions() { + set_memory_format(memory_format); + } + + /// Return a copy of `TensorOptions` with `device` set to the given one, or + /// cleared if `device` is `nullopt`. + C10_NODISCARD TensorOptions + device(c10::optional device) const noexcept { + TensorOptions r = *this; + r.set_device(device); + return r; + } + + /// Return a copy of `TensorOptions` with `device` set to the given one. + /// (This overload ensures that variadic template c10::optional constructor + /// for Device work correctly.) + template + C10_NODISCARD TensorOptions device(Args&&... args) const noexcept { + return device( + c10::optional(std::in_place, std::forward(args)...)); + } + + /// Return a copy of `TensorOptions`, but with device set to CUDA, and the + /// device index set to the given one. + /// + /// TODO: This function encourages bad behavior (assuming CUDA is + /// the only device that matters). Get rid of it / rename it. + C10_NODISCARD TensorOptions + device_index(c10::DeviceIndex device_index) const noexcept { + return device(Device::Type::CUDA, device_index); + } + + /// Return a copy of `TensorOptions` with `dtype` set to the given one. + C10_NODISCARD TensorOptions + dtype(c10::optional dtype) const noexcept { + TensorOptions r = *this; + r.set_dtype(dtype); + return r; + } + + // legacy function to support ScalarType + C10_NODISCARD TensorOptions + dtype(c10::optional dtype) const noexcept { + TensorOptions r = *this; + r.set_dtype(dtype); + return r; + } + + // Since dtype is taken... + template + TensorOptions& dtype() { + dtype_ = caffe2::TypeMeta::Make(); + has_dtype_ = true; + return *this; + } + + /// Sets the layout of the `TensorOptions`. + C10_NODISCARD TensorOptions + layout(c10::optional layout) const noexcept { + TensorOptions r = *this; + r.set_layout(layout); + return r; + } + + /// Sets the `requires_grad` property of the `TensorOptions`. + C10_NODISCARD TensorOptions + requires_grad(c10::optional requires_grad) const noexcept { + TensorOptions r = *this; + r.set_requires_grad(requires_grad); + return r; + } + + /// Sets the `pinned_memory` property on the `TensorOptions`. + C10_NODISCARD TensorOptions + pinned_memory(c10::optional pinned_memory) const noexcept { + TensorOptions r = *this; + r.set_pinned_memory(pinned_memory); + return r; + } + + /// Sets the `memory_format` property on `TensorOptions`. + C10_NODISCARD TensorOptions + memory_format(c10::optional memory_format) const noexcept { + TensorOptions r = *this; + r.set_memory_format(memory_format); + return r; + } + + /// Returns the device of the `TensorOptions`. + Device device() const noexcept { + return device_or_default(device_opt()); + } + + /// Returns whether the device is specified. 
+ bool has_device() const noexcept { + return has_device_; + } + + /// Returns the device of the `TensorOptions`, or `c10::nullopt` if + /// device is not specified. + c10::optional device_opt() const noexcept { + return has_device_ ? c10::make_optional(device_) : c10::nullopt; + } + + /// Returns the device index of the `TensorOptions`. + c10::DeviceIndex device_index() const noexcept { + return device().index(); + } + + /// Returns the dtype of the `TensorOptions`. + caffe2::TypeMeta dtype() const noexcept { + return dtype_or_default(dtype_opt()); + } + + /// Returns whether the dtype is specified. + bool has_dtype() const noexcept { + return has_dtype_; + } + + /// Returns the dtype of the `TensorOptions`, or `c10::nullopt` if + /// device is not specified. + c10::optional dtype_opt() const noexcept { + return has_dtype_ ? c10::make_optional(dtype_) : c10::nullopt; + } + + /// Returns the layout of the `TensorOptions`. + Layout layout() const noexcept { + return layout_or_default(layout_opt()); + } + + /// Returns whether the layout is specified. + bool has_layout() const noexcept { + return has_layout_; + } + + /// Returns the layout of the `TensorOptions`, or `c10::nullopt` if + /// layout is not specified. + c10::optional layout_opt() const noexcept { + return has_layout_ ? c10::make_optional(layout_) : c10::nullopt; + } + + /// Returns the `requires_grad` property of the `TensorOptions`. + bool requires_grad() const noexcept { + return has_requires_grad_ ? requires_grad_ : false; + } + + /// Returns whether the `requires_grad` is specified. + bool has_requires_grad() const noexcept { + return has_requires_grad_; + } + + /// Returns the `requires_grad` property of the `TensorOptions`, or + /// `c10::nullopt` if `requires_grad` is not specified. + c10::optional requires_grad_opt() const noexcept { + return has_requires_grad_ ? c10::make_optional(requires_grad_) + : c10::nullopt; + } + + /// Returns the `pinned_memory` property of the `TensorOptions`. + bool pinned_memory() const noexcept { + return pinned_memory_or_default(pinned_memory_opt()); + } + + /// Returns whether the `pinned_memory` is specified. + bool has_pinned_memory() const noexcept { + return has_pinned_memory_; + } + + /// Returns if the layout is sparse + bool is_sparse() const { + return layout_ == c10::Layout::Sparse; + } + + /// Returns if the layout is sparse CSR, deprecated, use + /// is_sparse_compressed() instead + bool is_sparse_csr() const { + return layout_ == c10::Layout::SparseCsr; + } + + bool is_sparse_compressed() const { + return layout_ == c10::Layout::SparseCsr || + layout_ == c10::Layout::SparseCsc || + layout_ == c10::Layout::SparseBsr || layout_ == c10::Layout::SparseBsc; + } + + // For compatibility with legacy tensor.type() comparisons + bool type_equal(const TensorOptions& other) const { + return computeDispatchKey() == other.computeDispatchKey() && + typeMetaToScalarType(dtype_) == typeMetaToScalarType(other.dtype()); + } + + /// Returns the `pinned_memory` property of the `TensorOptions`, or + /// `c10::nullopt` if `pinned_memory` is not specified. + c10::optional pinned_memory_opt() const noexcept { + return has_pinned_memory_ ? c10::make_optional(pinned_memory_) + : c10::nullopt; + } + + /// Returns whether the `memory_layout` is specified + bool has_memory_format() const noexcept { + return has_memory_format_; + } + + // NB: memory_format() getter is PURPOSELY not defined, as the default + // behavior of memory_format varies from function to function. 
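+  // [Illustrative sketch, not part of the upstream header] How the builder
+  // methods and the getters above fit together at a call site:
+  //
+  //   TensorOptions opts = TensorOptions()
+  //       .dtype(caffe2::TypeMeta::Make<float>())
+  //       .device(kCPU)
+  //       .requires_grad(true);
+  //   opts.has_dtype();          // true
+  //   opts.has_memory_format();  // false - never set
+  //   opts.layout();             // kStrided, the layout_or_default() fallback
+  //   opts.memory_format_opt();  // c10::nullopt
+  //
+  // Each getter with a *_opt() variant distinguishes "explicitly set" from
+  // "defaulted", which is what merge_in() below relies on.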
+ + /// Returns the `memory_layout` property of `TensorOptions, or + /// `c10::nullopt` if `memory_format` is not specified. + c10::optional memory_format_opt() const noexcept { + return has_memory_format_ ? c10::make_optional(memory_format_) + : c10::nullopt; + } + + // Resolves the ATen backend specified by the current construction axes. + // TODO: Deprecate this + Backend backend() const { + return at::dispatchKeyToBackend(computeDispatchKey()); + } + + /// Return the right-biased merge of two TensorOptions. This has the + /// effect of overwriting settings from self with specified options + /// of options. + /// + /// NB: This merging operation does NOT respect device merges. + /// For example, if you device({kCUDA, 1}).merge_in(kCUDA) + /// you will get kCUDA in the end! Functions like Tensor.new_empty + /// ensure the right device is selected anyway by way of a + /// device guard. + /// + TensorOptions merge_in(TensorOptions options) const noexcept { + TensorOptions merged = *this; + if (options.has_device()) + merged.set_device(options.device_opt()); + if (options.has_dtype()) + merged.set_dtype(options.dtype_opt()); + if (options.has_layout()) + merged.set_layout(options.layout_opt()); + // NB: requires grad is right biased; not a logical AND/OR! + if (options.has_requires_grad()) + merged.set_requires_grad(options.requires_grad_opt()); + if (options.has_pinned_memory()) + merged.set_pinned_memory(options.pinned_memory_opt()); + if (options.has_memory_format()) + merged.set_memory_format(options.memory_format_opt()); + return merged; + } + + // TODO remove after TensorOptions rationalization + TensorOptions merge_memory_format( + c10::optional optional_memory_format) const noexcept { + TensorOptions merged = *this; + if (optional_memory_format.has_value()) { + merged.set_memory_format(*optional_memory_format); + } + return merged; + } + + // INVARIANT: computeDispatchKey returns only the subset of dispatch keys for + // which dispatchKeyToBackend is injective, if it is defined at all (for + // the most part, this just means that this function never returns an + // Autograd key) + DispatchKey computeDispatchKey() const { + return c10::computeDispatchKey( + optTypeMetaToScalarType(dtype_opt()), layout_opt(), device_opt()); + } + + private: + // These methods are currently private because I'm not sure if it's wise + // to actually publish them. They are methods because I need them in + // the constructor and the functional API implementation. + // + // If you really, really need it, you can make these public, but check if you + // couldn't just do what you need with the functional API. Similarly, these + // methods are not chainable, because if you wanted chaining, you probably + // want to use the functional API instead. (It's probably OK to make + // these chainable, because these functions are all explicitly annotated + // with a ref-qualifier, the trailing &, that makes them illegal to call + // on temporaries.) + + /// Mutably set the device of `TensorOptions`. + void set_device(c10::optional device) & noexcept { + if (device) { + device_ = *device; + has_device_ = true; + } else { + has_device_ = false; + } + } + + /// Mutably set the dtype of `TensorOptions`. 
+ void set_dtype(c10::optional dtype) & noexcept { + if (dtype) { + dtype_ = *dtype; + has_dtype_ = true; + } else { + has_dtype_ = false; + } + } + + // legacy function to support ScalarType + void set_dtype(c10::optional dtype) & noexcept { + if (dtype) { + dtype_ = scalarTypeToTypeMeta(*dtype); + has_dtype_ = true; + } else { + has_dtype_ = false; + } + } + + /// Mutably set the layout of `TensorOptions`. + void set_layout(c10::optional layout) & noexcept { + if (layout) { + layout_ = *layout; + has_layout_ = true; + } else { + has_layout_ = false; + } + } + + /// Mutably set the `requires_grad` property of `TensorOptions`. + void set_requires_grad(c10::optional requires_grad) & noexcept { + if (requires_grad) { + requires_grad_ = *requires_grad; + has_requires_grad_ = true; + } else { + has_requires_grad_ = false; + } + } + + /// Mutably set the `pinned_memory` property of `TensorOptions`. + void set_pinned_memory(c10::optional pinned_memory) & noexcept { + if (pinned_memory) { + pinned_memory_ = *pinned_memory; + has_pinned_memory_ = true; + } else { + has_pinned_memory_ = false; + } + } + + /// Mutably set the `memory_Format` property of `TensorOptions`. + void set_memory_format(c10::optional memory_format) & noexcept { + if (memory_format) { + memory_format_ = *memory_format; + has_memory_format_ = true; + } else { + has_memory_format_ = false; + } + } + + // WARNING: If you edit TensorOptions to add more options, you + // may need to adjust the implementation of Tensor::options. + // The criteria for whether or not Tensor::options must be adjusted + // is whether or not the new option you added should preserved + // by functions such as empty_like(); if it should be preserved, + // you must adjust options(). + // + // TODO: MemoryFormat is not implemented in this way + + // NB: We didn't use c10::optional here, because then we can't pack + // the has_***_ boolean fields. + + Device device_ = at::kCPU; // 16-bit + caffe2::TypeMeta dtype_ = caffe2::TypeMeta::Make(); // 16-bit + Layout layout_ = at::kStrided; // 8-bit + MemoryFormat memory_format_ = MemoryFormat::Contiguous; // 8-bit + + // Bitmask required here to get this to fit inside 32 bits (or even 64 bits, + // for that matter) + + bool requires_grad_ : 1; + bool pinned_memory_ : 1; + + bool has_device_ : 1; + bool has_dtype_ : 1; + bool has_layout_ : 1; + bool has_requires_grad_ : 1; + bool has_pinned_memory_ : 1; + bool has_memory_format_ : 1; +}; + +// We should aspire to fit in one machine-size word; but a size greater than two +// words is too much. (We are doing terribly on 32-bit archs, where we require +// three machine size words to store tensor options. Eek!) +static_assert( + sizeof(TensorOptions) <= sizeof(int64_t) * 2, + "TensorOptions must fit in 128-bits"); + +/// Convenience function that returns a `TensorOptions` object with the `dtype` +/// set to the given one. +inline TensorOptions dtype(caffe2::TypeMeta dtype) { + return TensorOptions().dtype(dtype); +} + +// legacy function to support ScalarType +inline TensorOptions dtype(ScalarType dtype) { + return TensorOptions().dtype(scalarTypeToTypeMeta(dtype)); +} + +/// Convenience function that returns a `TensorOptions` object with the `layout` +/// set to the given one. +inline TensorOptions layout(Layout layout) { + return TensorOptions().layout(layout); +} + +/// Convenience function that returns a `TensorOptions` object with the `device` +/// set to the given one. 
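+// [Illustrative sketch, not part of the upstream header] The free-function
+// builders above compose with the chainable member setters; the function
+// name example_cpu_long_options is hypothetical:
+inline TensorOptions example_cpu_long_options() {
+  // int64 ("long"), strided, CPU, no gradient tracking.
+  return dtype(caffe2::TypeMeta::Make<int64_t>())
+      .layout(kStrided)
+      .device(kCPU)
+      .requires_grad(false);
+}
+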
+inline TensorOptions device(Device device) { + return TensorOptions().device(device); +} + +/// Convenience function that returns a `TensorOptions` object with the +/// `device` set to CUDA and the `device_index` set to the given one. +inline TensorOptions device_index(c10::DeviceIndex device_index) { + return TensorOptions().device_index(device_index); +} + +/// Convenience function that returns a `TensorOptions` object with the +/// `requires_grad` set to the given one. +inline TensorOptions requires_grad(bool requires_grad = true) { + return TensorOptions().requires_grad(requires_grad); +} + +/// Convenience function that returns a `TensorOptions` object with the +/// `memory_format` set to the given one. +inline TensorOptions memory_format(MemoryFormat memory_format) { + return TensorOptions().memory_format(memory_format); +} + +C10_API std::ostream& operator<<( + std::ostream& stream, + const TensorOptions& options); + +template +inline TensorOptions dtype() { + return dtype(caffe2::TypeMeta::Make()); +} + +inline std::string toString(const TensorOptions& options) { + std::ostringstream stream; + stream << options; + return stream.str(); +} + +// This is intended to be a centralized location by which we can determine +// what an appropriate DispatchKey for a tensor is. +inline DispatchKey computeDispatchKey( + c10::optional dtype, + c10::optional layout, + c10::optional device) { + const auto layout_ = layout_or_default(layout); + const auto device_ = device_or_default(device); + switch (layout_) { + case Layout::Jagged: + case Layout::Strided: { + const auto dtype_ = dtype_or_default(dtype); + switch (device_.type()) { +#define DO_CASE(device, _) \ + case c10::DeviceType::device: { \ + if (isQIntType(dtype_)) { \ + return DispatchKey::Quantized##device; \ + } \ + return DispatchKey::device; \ + } + C10_FORALL_BACKEND_DEVICE_TYPES(DO_CASE, unused) +#undef DO_CASE + case c10::DeviceType::FPGA: + return DispatchKey::FPGA; + case c10::DeviceType::ORT: + return DispatchKey::ORT; + case c10::DeviceType::Vulkan: + return DispatchKey::Vulkan; + case c10::DeviceType::Metal: + return DispatchKey::Metal; + case c10::DeviceType::MKLDNN: + case c10::DeviceType::OPENGL: + case c10::DeviceType::OPENCL: + case c10::DeviceType::IDEEP: + TORCH_INTERNAL_ASSERT( + 0, + "This is a grandfathered Caffe2 device type ", + device_.type(), + ", it shouldn't ever convert to a DispatchKey. 
File a bug describing what you were doing if you think this is in error."); + default: + TORCH_CHECK_NOT_IMPLEMENTED( + false, + "Unsupported device type for dense layout: ", + device_.type()); + } + } + case Layout::Sparse: + switch (device_.type()) { +#define DO_CASE(device, _) \ + case c10::DeviceType::device: { \ + return DispatchKey::Sparse##device; \ + } + C10_FORALL_BACKEND_DEVICE_TYPES(DO_CASE, unused) +#undef DO_CASE + default: + TORCH_CHECK_NOT_IMPLEMENTED( + false, + "Unsupported device type for sparse layout: ", + device_.type()); + } + case Layout::Mkldnn: + switch (device_.type()) { + case c10::DeviceType::CPU: + return DispatchKey::MkldnnCPU; + default: + TORCH_CHECK_NOT_IMPLEMENTED( + false, + "Unsupported device type for mkldnn layout: ", + device_.type()); + } + case Layout::SparseCsr: + case Layout::SparseCsc: + case Layout::SparseBsr: + case Layout::SparseBsc: + switch (device_.type()) { +#define DO_CASE(device, _) \ + case c10::DeviceType::device: { \ + return DispatchKey::SparseCsr##device; \ + } + C10_FORALL_BACKEND_DEVICE_TYPES(DO_CASE, unused) +#undef DO_CASE + default: + TORCH_CHECK_NOT_IMPLEMENTED( + false, + "Unsupported device type for ", + layout_, + " layout: ", + device_.type()); + } + default: + TORCH_CHECK(false, "Unsupported layout: ", layout_); + } +} + +inline Layout dispatchKeyToLayout(DispatchKey dispatch_key) { + switch (dispatch_key) { +#define DO_CASE(bc, _) case DispatchKey::Sparse##bc: + C10_FORALL_BACKEND_COMPONENTS(DO_CASE, unused) +#undef DO_CASE + return Layout::Sparse; +#define DO_CASE(bc, _) case DispatchKey::SparseCsr##bc: + C10_FORALL_BACKEND_COMPONENTS(DO_CASE, unused) +#undef DO_CASE + TORCH_CHECK( + false, "Cannot map DispatchKey ", dispatch_key, " to a unique layout."); + case DispatchKey::MkldnnCPU: + return Layout::Mkldnn; + default: + return Layout::Strided; + } +} + +inline c10::DeviceType dispatchKeyToDeviceType(DispatchKey dispatch_key) { + switch (dispatch_key) { + // stuff that's real +#define DO_CASE(suffix, prefix) \ + case DispatchKey::prefix##suffix: \ + return c10::DeviceType::suffix; +#define DO_CASES(_, prefix) C10_FORALL_BACKEND_DEVICE_TYPES(DO_CASE, prefix) + C10_FORALL_FUNCTIONALITY_KEYS(DO_CASES) +#undef DO_CASES +#undef DO_CASE + + case DispatchKey::MkldnnCPU: + return c10::DeviceType::CPU; + case DispatchKey::Vulkan: + return c10::DeviceType::Vulkan; + + case DispatchKey::ORT: + return c10::DeviceType::ORT; + default: + TORCH_CHECK( + false, + "DispatchKey ", + dispatch_key, + " doesn't correspond to a device"); + } +} + +inline TensorOptions dispatchKeyToTensorOptions(DispatchKey dispatch_key) { + return TensorOptions() + .layout(dispatchKeyToLayout(dispatch_key)) + .device(dispatchKeyToDeviceType(dispatch_key)); +} + +namespace detail { +inline bool backend_supports_empty_operator(const TensorOptions& options) { + // Quantized backends don't support at::empty(). + // They have separate operators like at::empty_quantized() that take in + // extra information about how to quantize the tensor. 
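+  // [Illustrative note, not part of the upstream header] isQIntType() is
+  // true for the quantized scalar types (e.g. kQInt8, kQUInt8, kQInt32), so
+  // such options report false here and callers fall back to the quantized
+  // factory functions instead.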
+ return !isQIntType(typeMetaToScalarType(options.dtype())); +} + +} // namespace detail + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/UndefinedTensorImpl.h b/MLPY/Lib/site-packages/torch/include/c10/core/UndefinedTensorImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..ed74fd79b8f5c22ab9112b4e5081d78e5a49e15f --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/UndefinedTensorImpl.h @@ -0,0 +1,42 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace c10 { + +struct C10_API UndefinedTensorImpl final : public TensorImpl { + public: + // Without this, we get: + // error: identifier "at::UndefinedTensorImpl::_singleton" is undefined in + // device code + // (ostensibly because the constexpr tricks MSVC into trying to compile this + // function for device as well). +#ifdef _WIN32 + static inline TensorImpl* singleton() { +#else + static constexpr inline TensorImpl* singleton() { +#endif + return &_singleton; + } +#ifdef DEBUG + bool has_storage() const override; +#endif + void set_storage_offset(int64_t offset) override; + + protected: + bool is_contiguous_custom(MemoryFormat format) const override; + IntArrayRef strides_custom() const override; + SymIntArrayRef sym_strides_custom() const override; + + private: + UndefinedTensorImpl(); + static UndefinedTensorImpl _singleton; + const char* tensorimpl_type_name() const override; +}; + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/WrapDimMinimal.h b/MLPY/Lib/site-packages/torch/include/c10/core/WrapDimMinimal.h new file mode 100644 index 0000000000000000000000000000000000000000..cc3b0d3267171a60c5725d8d9185772a2b4601ea --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/WrapDimMinimal.h @@ -0,0 +1,48 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace c10 { + +namespace detail { +// This template can only be specialized at int64_t and c10::SymInt; +// you'll get linker errors otherwise +template +C10_API T maybe_wrap_dim_slow(T dim, T dim_post_expr, bool wrap_scalar); +} // namespace detail + +template +T _maybe_wrap_dim(T dim, T dim_post_expr, bool wrap_scalar = true) { + // Inline the fast paths + if (C10_LIKELY(dim_post_expr * -1 <= dim && dim < dim_post_expr)) { + // For SymInts, we want an explicit control flow to trigger a guard, so we + // may as well branch too. 
+ if (dim < 0) { + return dim + dim_post_expr; + } + return dim; + } + // Check edge-cases out-of-line (wrapping scalars and out-of-bounds errors) + return c10::detail::maybe_wrap_dim_slow( + std::move(dim), std::move(dim_post_expr), wrap_scalar); +} + +inline int64_t maybe_wrap_dim( + int64_t dim, + int64_t dim_post_expr, + bool wrap_scalar = true) { + return _maybe_wrap_dim(dim, dim_post_expr, wrap_scalar); +} + +inline c10::SymInt maybe_wrap_dim( + c10::SymInt dim, + c10::SymInt dim_post_expr, + bool wrap_scalar = true) { + return _maybe_wrap_dim(std::move(dim), std::move(dim_post_expr), wrap_scalar); +} + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/alignment.h b/MLPY/Lib/site-packages/torch/include/c10/core/alignment.h new file mode 100644 index 0000000000000000000000000000000000000000..32cac40eb982d97808989aa6245ac4081c6eb824 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/alignment.h @@ -0,0 +1,21 @@ +#pragma once + +#include + +namespace c10 { + +#ifdef C10_MOBILE +// Use 16-byte alignment on mobile +// - ARM NEON AArch32 and AArch64 +// - x86[-64] < AVX +constexpr size_t gAlignment = 16; +#else +// Use 64-byte alignment should be enough for computation up to AVX512. +constexpr size_t gAlignment = 64; +#endif + +constexpr size_t gPagesize = 4096; +// since the default thp pagesize is 2MB, enable thp only +// for buffers of size 2MB or larger to avoid memory bloating +constexpr size_t gAlloc_threshold_thp = static_cast(2) * 1024 * 1024; +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/impl/COW.h b/MLPY/Lib/site-packages/torch/include/c10/core/impl/COW.h new file mode 100644 index 0000000000000000000000000000000000000000..b3a94d9681de0da82c1a1ddde114eee0376647fc --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/impl/COW.h @@ -0,0 +1,32 @@ +#pragma once + +#include +#include + +namespace c10 { +struct StorageImpl; +class DataPtr; +}; // namespace c10 + +namespace c10::impl::cow { + +// Creates a Copy-on-write (COW) clone of the given storage. This will also +// convert the given storage into a COW storage if it is not COW already. +// +// Converting the storage into a COW storage will not be successful if the +// storage's DataPtr has some context (`DataPtr::get_context()`) which is not +// equal to the data pointer (`DataPtr::get()`). In this case, a nullptr is +// returned. +C10_API c10::intrusive_ptr lazy_clone_storage( + StorageImpl& storage); + +// Check if a storage has a simple DataPtr with no abnormal context +C10_API bool has_simple_data_ptr(const c10::StorageImpl& storage); + +// Check if a DataPtr is COW +C10_API bool is_cow_data_ptr(const c10::DataPtr& data_ptr); + +// Eagerly copies a COW storage's data, turning it into a non-COW storage. +C10_API void materialize_cow_storage(StorageImpl& storage); + +} // namespace c10::impl::cow diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/impl/COWDeleter.h b/MLPY/Lib/site-packages/torch/include/c10/core/impl/COWDeleter.h new file mode 100644 index 0000000000000000000000000000000000000000..58378c4ec2e3b9826cd7cb4cc11ab610662b3b16 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/impl/COWDeleter.h @@ -0,0 +1,66 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +namespace c10::impl::cow { + +// A COWDeleterContext object is used as the `ctx` argument for DataPtr +// to implement a Copy-on-write (COW) DataPtr. 
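+//
+// Rough usage sketch (illustrative only; this is a simplified version of what
+// lazy_clone_storage in COW.h does, and the local names here are made up):
+//
+//   // Move the original data + deleter out of the source DataPtr into a
+//   // heap-allocated context shared by every COW copy of the storage...
+//   auto* ctx = new c10::impl::cow::COWDeleterContext(src.move_context());
+//   // ...then build DataPtrs that point at the same data but whose context
+//   // deleter is cow::cow_deleter, so the data is freed exactly once.
+//   at::DataPtr copy(src.get(), ctx, c10::impl::cow::cow_deleter, src.device());
+//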
+class C10_API COWDeleterContext { + public: + // Creates an instance, holding the pair of data and original + // deleter. + // + // Note that the deleter will only be called in our destructor if + // the last reference to this goes away without getting + // materialized. + explicit COWDeleterContext(std::unique_ptr data); + + // Increments the current refcount. + void increment_refcount(); + + // See README.md in this directory to understand the locking + // strategy. + + // Represents a reference to the context. + // + // This is returned by decrement_refcount to allow the caller to + // copy the data under the shared lock. + using NotLastReference = std::shared_lock; + + // Represents the last reference to the context. + // + // This will be returned by decrement_refcount when it is the last + // reference remaining and after any pending copies have completed. + using LastReference = std::unique_ptr; + + // Decrements the refcount, returning a handle indicating what to + // do with it. + std::variant decrement_refcount(); + + private: + // The destructor is hidden, this should only ever be used within + // UniqueVoidPtr using cow::delete_context as the deleter. + ~COWDeleterContext(); + + std::shared_mutex mutex_; + std::unique_ptr data_; + std::atomic refcount_ = 1; +}; + +// `cow_deleter` is used as the `ctx_deleter` for DataPtr to implement a COW +// DataPtr. +// +// Warning: This should only be called on a pointer to a COWDeleterContext that +// was allocated on the heap with `new`, because when the refcount reaches 0, +// the context is deleted with `delete`. +C10_API void cow_deleter(void* ctx); + +} // namespace c10::impl::cow diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/impl/DeviceGuardImplInterface.h b/MLPY/Lib/site-packages/torch/include/c10/core/impl/DeviceGuardImplInterface.h new file mode 100644 index 0000000000000000000000000000000000000000..19df643064a83fabfa43442590ab570d054ed096 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/impl/DeviceGuardImplInterface.h @@ -0,0 +1,337 @@ +#pragma once + +#include +#include +#include +#include + +// Just for C10_ANONYMOUS_VARIABLE +#include + +#include + +namespace c10 { + +// Forward declaration +class DataPtr; + +/** + * Flags defining the behavior of events. + * + * PYTORCH_DEFAULT and BACKEND_DEFAULT are valid for all backends. The + * BACKEND_DEFAULT is what a particular backend would select if no + * flags were given. PYTORCH_DEFAULT is the PyTorch's framework default + * choice for events on that backend, which may not be the same. For example, + * when PyTorch creates a CUDA event it sets the flag + * CUDA_EVENT_DISABLING_TIMING by default to improve performance. + * + * The mapping of PYTORCH_DEFAULT and BACKEND_DEFAULT is done by each + * backend implementation. Backend-specific flags, like CUDA_EVENT_DEFAULT, + * should map one-to-one with actual event flags for those backends. + */ +enum class EventFlag { + PYTORCH_DEFAULT, + BACKEND_DEFAULT, + // CUDA flags + CUDA_EVENT_DEFAULT, + CUDA_EVENT_DISABLE_TIMING, // PyTorch-default for CUDA + // HIP flags + HIP_EVENT_DEFAULT, + HIP_EVENT_DISABLE_TIMING, // PyTorch-default for HIP + // FOR TESTING ONLY + INVALID +}; + +namespace impl { + +/** + * DeviceGuardImplInterface represents the virtual interface which provides + * functionality to provide an RAII class for device and stream switching, + * via DeviceGuard. Every distinct device type, e.g., CUDA and HIP, is + * expected to implement and register an implementation of this interface. 
+ * All classes which inherit from DeviceGuardImplInterface should be declared + * 'final'. + * + * This class exists because we provide a unified interface for performing + * device guards via DeviceGuard, but we cannot assume that we have actually + * compiled against the, e.g., CUDA library, which actually implements + * this guard functionality. In this case, a dynamic dispatch is required + * to cross the library boundary. + * + * If possible, you should directly use implementations of this interface; + * those uses will be devirtualized. + */ +struct C10_API DeviceGuardImplInterface { + DeviceGuardImplInterface() = default; + DeviceGuardImplInterface(const DeviceGuardImplInterface&) = default; + DeviceGuardImplInterface& operator=(const DeviceGuardImplInterface&) = + default; + DeviceGuardImplInterface(DeviceGuardImplInterface&&) noexcept = default; + DeviceGuardImplInterface& operator=(DeviceGuardImplInterface&&) noexcept = + default; + + /** + * Return the type of device managed by this guard implementation. + */ + virtual DeviceType type() const = 0; + + /** + * Set the current device to Device, and return the previous Device. + */ + virtual Device exchangeDevice(Device) const = 0; + // NB: Implementations of exchangeDevice can be a bit boilerplatey. You might + // consider replacing exchangeDevice with a non-virtual function with a baked + // in implementation; however, note that this will triple the number of + // virtual calls (when you implement exchangeDevice in a final subclass, + // the compiler gets to devirtualize everything; it won't do that if you don't + // define it in the subclass!) A common way to solve this problem is to use + // some sort of CRTP; however, we can template DeviceGuardImplInterface since + // we really *do* need it to be virtual. A little boilerplate seems easiest + // to explain. (Another way around this problem is to provide inline + // functions that provide the default implementations, but this seems a little + // hard to explain. In any case, we're only going to have on order of ten + // implementations of this anyway.) + + /** + * Get the current device. + */ + virtual Device getDevice() const = 0; + + /** + * Set the current device to Device. + */ + virtual void setDevice(Device) const = 0; + + /** + * Set the current device to Device, without checking for errors + * (so, e.g., this can be called from a destructor). + */ + virtual void uncheckedSetDevice(Device) const noexcept = 0; + + /** + * Get the current stream for a given device. + */ + virtual Stream getStream(Device) const noexcept = 0; + + /** + * Get the default stream for a given device. + */ + virtual Stream getDefaultStream(Device) const { + TORCH_CHECK(false, "Backend doesn't support acquiring a default stream.") + } + + /** + * Get a stream from the global pool for a given device. + */ + virtual Stream getStreamFromGlobalPool(Device, bool isHighPriority = false) + const { + (void)isHighPriority; // Suppress unused variable warning + TORCH_CHECK(false, "Backend doesn't support acquiring a stream from pool.") + } + + /** + * Set a stream to be the thread local current stream for its device. + * Return the previous stream for that device. You are NOT required + * to set the current device to match the device of this stream. + */ + virtual Stream exchangeStream(Stream) const noexcept = 0; + + /** + * Destroys the given event. 
+ */ + virtual void destroyEvent(void* /*event*/, const DeviceIndex /*device_index*/) + const noexcept {} + + /** + * Increments the event's version and enqueues a job with this version + * in the stream's work queue. When the stream process that job + * it notifies all streams waiting on / blocked by that version of the + * event to continue and marks that version as recorded. + * */ + virtual void record( + void** /*event*/, + const Stream& /*stream*/, + const DeviceIndex /*device_index*/, + const c10::EventFlag /*flag*/) const { + TORCH_CHECK(false, "Backend doesn't support events."); + } + + /** + * Does nothing if the event has not been scheduled to be recorded. + * If the event was previously enqueued to be recorded, a command + * to wait for the version of the event that exists at the time of this call + * is inserted in the stream's work queue. + * When the stream reaches this command it will stop processing + * additional commands until that version of the event is marked as recorded. + */ + virtual void block(void* /*event*/, const Stream& /*stream*/) const { + TORCH_CHECK(false, "Backend doesn't support events."); + } + + /** + * Returns true if (and only if) + * (1) the event has never been scheduled to be recorded + * (2) the current version is marked as recorded. + * Returns false otherwise. + */ + virtual bool queryEvent(void* /*event*/) const { + TORCH_CHECK(false, "Backend doesn't support events."); + } + + /** + * Get the number of devices. WARNING: This is REQUIRED to not raise + * an exception. If there is some sort of problem, e.g., driver error, + * you should report that there are zero available devices. + */ + virtual DeviceIndex deviceCount() const noexcept = 0; + + /** + * Return true if all the work previously enqueued on the stream for + * asynchronous execution has completed running on the device. + */ + virtual bool queryStream(const Stream& /*stream*/) const { + TORCH_CHECK(false, "Backend doesn't support querying streams."); + } + + /** + * Wait (by blocking the calling thread) until all the work previously + * enqueued on the stream has completed running on the device. + */ + virtual void synchronizeStream(const Stream& /*stream*/) const { + TORCH_CHECK(false, "Backend doesn't support synchronizing streams."); + } + + /** + * Ensure the caching allocator (if any) is aware that the given DataPtr is + * being used on the given stream, and that it should thus avoid recycling the + * DataPtr until all work on that stream is done. + */ + virtual void recordDataPtrOnStream(const c10::DataPtr&, const Stream&) const { + } + + /** + * Intended use of this class is to leak the DeviceGuardImpl at program end. + * So you better not call the destructor, buster! + */ + virtual ~DeviceGuardImplInterface() = default; +}; + +// A no-op device guard impl that doesn't do anything interesting. Useful +// for devices that don't actually have a concept of device index. Prominent +// examples are CPU and Meta. 
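+//
+// A backend without a meaningful device index can typically just instantiate
+// this template directly (sketch; the alias name is arbitrary):
+//
+//   using MetaGuardImpl = c10::impl::NoOpDeviceGuardImpl<DeviceType::Meta>;
+//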
+template +struct NoOpDeviceGuardImpl final : public DeviceGuardImplInterface { + NoOpDeviceGuardImpl() = default; + DeviceType type() const override { + return D; + } + Device exchangeDevice(Device) const override { + return Device(D, -1); // no-op + } + Device getDevice() const override { + return Device(D, -1); + } + void setDevice(Device) const override { + // no-op + } + void uncheckedSetDevice(Device) const noexcept override { + // no-op + } + Stream getStream(Device) const noexcept override { + // no-op + return Stream(Stream::DEFAULT, Device(D, -1)); + } + // NB: These do NOT set the current device + Stream exchangeStream(Stream) const noexcept override { + // no-op + return Stream(Stream::DEFAULT, Device(D, -1)); + } + DeviceIndex deviceCount() const noexcept override { + return 1; + } + + // Event-related functions + void record( + void** /*event*/, + const Stream& /*stream*/, + const DeviceIndex /*device_index*/, + const EventFlag /*flag*/) const override { + TORCH_CHECK(false, D, " backend doesn't support events."); + } + void block(void* /*event*/, const Stream& /*stream*/) const override { + TORCH_CHECK(false, D, " backend doesn't support events.") + } + bool queryEvent(void* /*event*/) const override { + TORCH_CHECK(false, D, " backend doesn't support events.") + } + void destroyEvent(void* /*event*/, const DeviceIndex /*device_index*/) + const noexcept override {} + + // Stream-related functions + bool queryStream(const Stream& /*stream*/) const override { + return true; + } + void synchronizeStream(const Stream& /*stream*/) const override { + // Don't wait for anything. + } +}; + +// The registry is NON-owning. Each stored pointer is std::atomic so +// that under all interleavings of registry calls the structure is +// race-free. This doesn't cost us anything on reads in X86. (An +// unsynchronized implementation probably is OK too, but I didn't want +// to prove that we never read from device_guard_impl_registry at the +// same time some registration is occurring. Shiver.) +// +// I'd like this registry to be valid even at program destruction time +// (in case someone uses a DeviceGuard in a destructor to do some cleanup +// in the CUDA API.) Since there are no direct accesses of the underlying +// owning objects which I can use to enforce initialization order (unlike +// in a Meyer singleton), it implies that you must *leak* objects when +// putting them in the registry. This is done by deleting the destructor +// on DeviceGuardImplInterface. +// NOLINTNEXTLINE(*c-arrays*) +extern C10_API std::atomic + device_guard_impl_registry[static_cast( + DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES)]; + +// I can't conveniently use c10/util/Registry.h for the following reason: +// c10/util/Registry.h gives me a slow way of Create'ing a object of some +// interface from the registry, but no way of quickly accessing an already +// created object. I'll be banging on getDeviceGuardImpl every time we do a +// DeviceGuard, so I really don't want to be doing an unordered_map lookup. +// Better if the registration mechanism directly drops its implementation +// into device_guard_impl_registry. 
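+//
+// Registration is done with the C10_REGISTER_GUARD_IMPL macro defined just
+// below, typically at namespace scope in the backend's own .cpp file
+// (sketch; MyCUDAGuardImpl is a placeholder for a concrete implementation):
+//
+//   C10_REGISTER_GUARD_IMPL(CUDA, MyCUDAGuardImpl);
+//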
+ +class C10_API DeviceGuardImplRegistrar { + public: + DeviceGuardImplRegistrar(DeviceType, const DeviceGuardImplInterface*); +}; + +#define C10_REGISTER_GUARD_IMPL(DevType, DeviceGuardImpl) \ + static ::c10::impl::DeviceGuardImplRegistrar C10_ANONYMOUS_VARIABLE( \ + g_##DeviceType)(::c10::DeviceType::DevType, new DeviceGuardImpl()); + +inline const DeviceGuardImplInterface* getDeviceGuardImpl(DeviceType type) { + // Two adjacent int16_t fields DeviceType and DeviceIndex has field access + // miscompiled on NVCC. To workaround this issue, we apply a mask to the + // DeviceType. First check if the DeviceType is 16-bit. + // FB employees can see + // https://fb.workplace.com/groups/llvm.gcc/permalink/4053565044692080/ + // for more details + static_assert(sizeof(DeviceType) == 1, "DeviceType is not 8-bit"); + auto p = device_guard_impl_registry[static_cast(type) & 0xFF].load(); + + // This seems to be the first place where you make use of a device + // when you pass devices to factory functions. Give a nicer error + // message in this case. + TORCH_CHECK(p, "PyTorch is not linked with support for ", type, " devices"); + return p; +} + +inline bool hasDeviceGuardImpl(DeviceType type) { + return device_guard_impl_registry[static_cast(type)].load(); +} + +} // namespace impl +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/impl/FakeGuardImpl.h b/MLPY/Lib/site-packages/torch/include/c10/core/impl/FakeGuardImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..c1f015eb31cc14d0dd80c540b417151f495fa952 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/impl/FakeGuardImpl.h @@ -0,0 +1,102 @@ +#pragma once + +#include + +#include + +namespace c10::impl { + +// FakeGuardImpl is hardcoded to have eight devices. Not for +// any good reason, just to simplify code. +constexpr DeviceIndex kFakeGuardImplMaxDevices = 8; + +/** + * A fake implementation of DeviceGuardImplInterface suitable for testing. + * The current device is modeled as a mutable field in the guard implementation + * class. See DeviceGuard_test.cpp for an example use. 
+ */ +template +struct FakeGuardImpl final : public DeviceGuardImplInterface { + static constexpr DeviceType static_type = T; + // Runtime device type is not used + FakeGuardImpl(DeviceType) {} + FakeGuardImpl() = default; + DeviceType type() const override { + return T; + } + Device exchangeDevice(Device d) const override { + AT_ASSERT(d.type() == type()); + AT_ASSERT(d.index() < kFakeGuardImplMaxDevices); + Device old_device = getDevice(); + if (old_device.index() != d.index()) { + current_device_ = d.index(); + } + return old_device; + } + Device getDevice() const override { + return Device(type(), current_device_); + } + void setDevice(Device d) const override { + AT_ASSERT(d.type() == type()); + AT_ASSERT(d.index() >= 0); + AT_ASSERT(d.index() < kFakeGuardImplMaxDevices); + current_device_ = d.index(); + } + void uncheckedSetDevice(Device d) const noexcept override { + current_device_ = d.index(); + } + Stream getStream(Device d) const noexcept override { + return Stream(Stream::UNSAFE, d, current_streams_[d.index()]); + } + Stream exchangeStream(Stream s) const noexcept override { + auto old_id = current_streams_[s.device_index()]; + current_streams_[s.device_index()] = s.id(); + return Stream(Stream::UNSAFE, s.device(), old_id); + } + DeviceIndex deviceCount() const noexcept override { + return kFakeGuardImplMaxDevices; + } + + // Event-related functions + void record( + void** event, + const Stream& stream, + const DeviceIndex device_index, + const EventFlag flag) const override {} + void block(void* event, const Stream& stream) const override {} + bool queryEvent(void* event) const override { + return true; + } + void destroyEvent(void* event, const DeviceIndex device_index) + const noexcept override {} + + // Convenience methods for testing + static DeviceIndex getDeviceIndex() { + return current_device_; + } + static void setDeviceIndex(DeviceIndex i) { + AT_ASSERT(i >= 0); + AT_ASSERT(i < kFakeGuardImplMaxDevices); + current_device_ = i; + } + static StreamId getCurrentStreamIdFor(DeviceIndex i) { + return current_streams_.at(i); + } + static void resetStreams() { + current_streams_.fill(0); + } + + private: + thread_local static DeviceIndex current_device_; + thread_local static std::array + current_streams_; +}; + +template +thread_local DeviceIndex FakeGuardImpl::current_device_ = 0; + +template +thread_local std::array + FakeGuardImpl::current_streams_ = {0, 0, 0, 0, 0, 0, 0, 0}; + +} // namespace c10::impl diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/impl/GPUTrace.h b/MLPY/Lib/site-packages/torch/include/c10/core/impl/GPUTrace.h new file mode 100644 index 0000000000000000000000000000000000000000..9101b1b29c34a8b3cf61c4f0b759066a804febf5 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/impl/GPUTrace.h @@ -0,0 +1,28 @@ +#pragma once + +#include + +namespace c10::impl { + +struct C10_API GPUTrace { + // On the x86 architecture the atomic operations are lock-less. + static std::atomic gpuTraceState; + + // When PyTorch migrates to C++20, this should be changed to an atomic flag. + // Currently, the access to this variable is not synchronized, on the basis + // that it will only be flipped once and by the first interpreter that + // accesses it. + static bool haveState; + + // This function will only register the first interpreter that tries to invoke + // it. For all of the next ones it will be a no-op. 
+ static void set_trace(const PyInterpreter*); + + static const PyInterpreter* get_trace() { + if (!haveState) + return nullptr; + return gpuTraceState.load(std::memory_order_acquire); + } +}; + +} // namespace c10::impl diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/impl/HermeticPyObjectTLS.h b/MLPY/Lib/site-packages/torch/include/c10/core/impl/HermeticPyObjectTLS.h new file mode 100644 index 0000000000000000000000000000000000000000..dd22d19adbd0d75a50ca649b360340cfd12dd537 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/impl/HermeticPyObjectTLS.h @@ -0,0 +1,59 @@ +#pragma once + +#include +#include + +namespace c10::impl { + +// This TLS controls whether or not we permanently associate PyObject +// with Tensor the first time it is allocated. When hermetic PyObject +// TLS is enabled (state is true), we DO NOT save PyObjects to Tensor, +// meaning you get a distinct PyObject whenever you execute the code in +// question. +struct C10_API HermeticPyObjectTLS { + static void set_state(bool state); + static bool get_state() { + // Hypothetical fastpath if torchdeploy/multipy isn't used. Per + // https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2055r0.pdf + // this qualifies relaxed access because it is a single-location data + // structure (only the boolean here). + // + // Forgetting about data races for a moment, is there a logical race? + // + // - Boolean only ever transitions from false to true. So the + // critical situation is when one interpreter is already running + // when a second interpreter switches haveState from false to true. + // + // - The first interpreter is indifferent whether or not it sees + // hasState true/false; obviously false works (this is what the + // interpreter was previously using; more directly, the interpreter + // calls into itself as the handler, so being hermetic is not + // required), and true simply means serviced python operator calls will + // be hermetic; in these cases it is expected to be functionally + // equivalent. + // + // - The second interpreter MUST see hasState true (as its requests will + // be forwarded to the first interpreter), but it is assumed that there + // is a synchronization between the interpreter initialization, and + // when we actually perform operations, so it is guaranteed to see + // hasState true. + // + // QED. + // + // This fastpath is currently disabled so that we can more easily test that + // hermetic mode works correctly even on stock build of PyTorch. + if (false && !haveState_.load(std::memory_order_relaxed)) + return false; + return get_tls_state(); + } + // Call this from the multipy/torchdeploy top level + static void init_state(); + + private: + // This only flipped once from false to true during torchdeploy/multipy + // initialization, and never again. + static std::atomic haveState_; + static bool get_tls_state(); +}; + +} // namespace c10::impl diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/impl/InlineDeviceGuard.h b/MLPY/Lib/site-packages/torch/include/c10/core/impl/InlineDeviceGuard.h new file mode 100644 index 0000000000000000000000000000000000000000..b5e647e7205eb9c06c17b5d662f9d8fa742cac47 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/impl/InlineDeviceGuard.h @@ -0,0 +1,428 @@ +#pragma once + +// This file provides implementations of InlineDeviceGuard and +// InlineOptionalDeviceGuard. 
+ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10::impl { + +/** + * A DeviceGuard is an RAII class that sets a device to some value + * on construction, and resets the device to its original value on + * destruction. + * + * InlineDeviceGuard is a helper class for implementing DeviceGuards. + * It is templated over a DeviceGuardImpl (anything that implements + * DeviceGuardImplInterface). There are two primary ways to instantiate + * InlineDeviceGuard: + * + * - With a concrete implementation of DeviceGuardImpl, e.g., CUDAGuardImpl. + * This is the best way to use InlineDeviceGuard, as all calls are + * devirtualized, giving you code as efficient as straight line + * calls to cudaGetDevice/cudaSetDevice. + * + * - With VirtualGuardImpl, which does a virtual dispatch to a DeviceGuardImpl + * retrieved from a DeviceType registry. We have explicitly instantiated + * InlineDeviceGuard this way as c10::DeviceGuard. + * + * If you are in a hurry, you can use InlineDeviceGuard directly: + * + * using CUDAGuard = impl::InlineDeviceGuard; + * + * However, you can provide a better user experience if you explicitly write a + * wrapper class that itself contains the template instantiation: + * + * class CUDAGuard { + * public: + * // ... the API ... + * private: + * impl::InlineDeviceGuard guard_; + * } + * + * The wrapper class provides a good place to write documentation, and helps + * avoid weird template instantiation errors when a user incorrectly uses the + * class. + * + * If you need to test this class, consider instantiating it with FakeGuardImpl. + */ +template +class InlineDeviceGuard { + public: + // Note [Omitted default constructor from RAII] + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // In principle, we could add a default constructor to + // DeviceGuard which reads the current device and promises to + // restore to that device on exit. However, most cases where you + // would have written this, you probably meant to actually just + // use OptionalDeviceGuard (since you don't actually need the + // restore to happen if you don't ever actually set the device). + // We remove the constructor here to encourage you to think about + // what you actually want to happen. + explicit InlineDeviceGuard() = delete; + + /// Set the current device to the passed Device. + explicit InlineDeviceGuard(Device device) + : impl_(device.type()), + original_device_( + device.index() == -1 ? impl_.getDevice() + : impl_.exchangeDevice(device)), + current_device_(device.index() == -1 ? original_device_ : device) {} + + /// Set the current device index to the passed DeviceIndex. (The + /// device type is inferred from the template parameter T). + template < + typename U = T, + typename = + typename std::enable_if_t>> + explicit InlineDeviceGuard(DeviceIndex device_index) + : InlineDeviceGuard(Device(U::static_type, device_index)) {} + + /// Construct an InlineDeviceGuard using VirtualGuardImpl with an explicit + /// DeviceGuardImplInterface pointer. + template < + typename U = T, + typename = typename std::enable_if_t>> + explicit InlineDeviceGuard( + Device device, + const DeviceGuardImplInterface* impl) + : impl_( + VirtualGuardImpl(impl ? impl : getDeviceGuardImpl(device.type()))), + original_device_( + device.index() == -1 ? impl_.getDevice() + : impl_.exchangeDevice(device)), + current_device_(device.index() == -1 ? 
original_device_ : device) {} + + /// Copy is disallowed + InlineDeviceGuard(const InlineDeviceGuard&) = delete; + InlineDeviceGuard& operator=(const InlineDeviceGuard&) = delete; + + /// Move is disallowed, as DeviceGuard does not have an uninitialized state, + /// which is required for moves on types with nontrivial destructors. + InlineDeviceGuard(InlineDeviceGuard&& other) = delete; + InlineDeviceGuard& operator=(InlineDeviceGuard&& other) = delete; + + ~InlineDeviceGuard() { + impl_.uncheckedSetDevice(original_device_); + } + + /// Sets the device to the given one. + template < + typename U = T, + typename std::enable_if_t, int> = 0> + void set_device(at::Device device) { + AT_ASSERT( + (U::static_type == DeviceType::HIP && device.is_cuda()) || + device.type() == U::static_type); + auto index = device.index(); + if (index == -1) + return; + impl_.setDevice(device); + current_device_ = device; + } + + /// Resets the currently set device to its original device, and then sets the + /// current device to the passed device. This is effectively equivalent to + /// set_device when a guard supports only a single device type. + template + typename std::enable_if_t> reset_device( + at::Device device) { + set_device(device); + } + + /// Resets the currently set device to its original device, and then sets the + /// current device to the passed device (for a possibly different device + /// type). + /// + /// This method is named reset_device to highlight the fact that previous + /// device settings from this guard are NOT preserved, even if the device + /// has a different device type. For example: + /// + /// // CUDA device is 0 + /// DeviceGuard g(Device(kCUDA, 1)); + /// g.reset_device(Device(kHIP, 2)); + /// // CUDA device is 0 (!!) + /// + /// NOTE: this implementation may skip some device setting if it can prove + /// that it is unnecessary. + /// + /// Optional argument is for testing only. + template + typename std::enable_if_t> reset_device( + at::Device device, + const impl::DeviceGuardImplInterface* impl = nullptr) { + auto index = device.index(); + if (index == -1) + return; + if (device.type() == original_device_.type()) { + AT_ASSERT(impl == nullptr || impl->type() == device.type()); + impl_.setDevice(device); + current_device_ = device; + } else { + // Destruct and reconstruct the DeviceGuard in place + impl_.setDevice(original_device_); + impl_ = !impl ? VirtualGuardImpl(device.type()) : VirtualGuardImpl(impl); + original_device_ = impl_.exchangeDevice(device); + current_device_ = device; + } + } + + /// Sets the device index to the given one. The device type is inferred + /// from the original device type. + void set_index(DeviceIndex index) { + reset_device(Device(original_device_.type(), index)); + } + + /// Returns the device that was set at the time the most recent + /// reset_device(), or otherwise the device at construction time. + Device original_device() const { + return original_device_; + } + + /// Returns the most recent device that was set using this device guard, + /// either from construction, or via set_device/reset_device/set_index. + Device current_device() const { + return current_device_; + } + + protected: + T impl_; + + private: + Device original_device_; + Device current_device_; +}; + +/** + * A OptionalDeviceGuard is an RAII class that sets a device to some value on + * initialization, and resets the device to its original value on destruction. + * + * InlineOptionalDeviceGuard is a helper class for implementing + * OptionalDeviceGuards. 
See guidance in InlineDeviceGuard on how to + * use this. See OptionalDeviceGuard for user-oriented usage notes. + */ +template +class InlineOptionalDeviceGuard { + public: + // Note [Explicit initialization of optional fields] + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // Explicit initialization of optional fields + // required to workaround an nvcc bug; see + // https://github.com/pytorch/pytorch/issues/12117 + + /// Creates an uninitialized OptionalDeviceGuard. + explicit InlineOptionalDeviceGuard() + : guard_() // See Note [Explicit initialization of optional fields] + {} + + /// Set the current device to the passed Device, if it is not nullopt. + explicit InlineOptionalDeviceGuard(optional device_opt) + : guard_() { // See Note [Explicit initialization of optional fields] + if (device_opt.has_value()) { + guard_.emplace(device_opt.value()); + } + } + + /// Set the current device to the passed DeviceIndex, if it is not nullopt. + template < + typename U = T, + typename = + typename std::enable_if_t>> + explicit InlineOptionalDeviceGuard(optional device_index_opt) + : guard_() { // See Note [Explicit initialization of optional fields] + if (device_index_opt.has_value()) { + guard_.emplace(device_index_opt.value()); + } + } + + /// All constructors of DeviceGuard are valid for OptionalDeviceGuard + /// and result in initialized OptionalDeviceGuard. + template + explicit InlineOptionalDeviceGuard(Args&&... args) + : guard_(std::in_place, std::forward(args)...) {} + + // TODO: Consider reading Tensor and TensorList constructors here, when + // Tensor moves to c10. (These are only valid on OptionalDeviceGuard, + // because a Tensor may be undefined, in which case we need an uninitialized + // tensor guard.) + + // Note [Move construction for RAII guards is tricky] + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // In principle, move construction is useful for terminating + // the lifetime of a `OptionalDeviceGuard` early; for example: + // + // // current device is d0 + // OptionalDeviceGuard g1(d1); + // // current device is d1 + // { + // OptionalDeviceGuard g2(std::move(g1)); + // } + // // current device is d0!! + // + // However, it's difficult to implement the move constructor + // in a way that works in all situations. For example, consider + // the following example: + // + // OptionalDeviceGuard g1(d1); + // { + // OptionalDeviceGuard g2(d2); + // { + // OptionalDeviceGuard g3(std::move(g1)); // !!! + // } + // } + // + // What should the current device be while g3 in scope... and what + // should it be after it goes out of scope? What about g2? + // There don't seem to be satisfactory answers for these questions. + // + // It's in principle possible to raise an error when this occurs + // by doing some extra thread-local bookkeeping. But why bother? + // Just don't provide the constructor. + InlineOptionalDeviceGuard(InlineOptionalDeviceGuard&& other) = delete; + + // Note [Move assignment for RAII guards is tricky] + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // Move assignment is deleted, because you need to know which guard was + // defined "first", as that guard's original_device_ wins--with the current + // representation, we have no way of telling which is the case. (Move + // construction does not have this problem, as one guard is always + // uninitialized.) 
+ // + // We can make this clear by way of a pair of examples: + // + // Example 1: + // + // // initial device is n0 + // { + // CUDAGuard g1(n1); + // { + // CUDAGuard g2(n2); + // // current device should be n2 + // g1 = std::move(g2); + // // current device should still be n2 + // } + // // current device should still be n2 + // } + // // current device should be n0 + // + // Example 2 (flip the order of the two guards): + // + // // initial device is n0 + // { + // CUDAGuard g2(n2); + // { + // CUDAGuard g1(n1); + // // current device should be n1 + // g1 = std::move(g2); + // // current device should be n2 + // } + // // current device should be n0 (since g2 has been vacated) + // } + // + // In both examples, we need g1 to restore to n0 after move assignment. + // However, in example 1, this is determined by the restore value of g1 + // (prior to the move). In example 2, however, it is determined by the the + // restore value of g2(!!). We don't know which one should win, without having + // a way of telling which guard was allocated first. + // + // We could solve this with an extra thread-local variable. But no one is + // actually using move-assignment. So just get rid of it. + InlineOptionalDeviceGuard& operator=(InlineOptionalDeviceGuard&& other) = + delete; + + /// Sets the device to the given one. Initializes OptionalDeviceGuard if it + /// is not already initialized. + template < + typename U = T, + typename = + typename std::enable_if_t>> + void set_device(at::Device device) { + if (!guard_.has_value()) { + guard_.emplace(device); + } else { + guard_->set_device(device); + } + } + + /// Resets the currently set device to its original device, and then sets the + /// current device to the passed device (for a possibly different device + /// type). Initializes OptionalDeviceGuard if it is not already initialized. + /// + /// See notes on why this is called reset_device on InlineDeviceGuard. + /// + /// Optional argument is for testing only. + template < + typename U = T, + typename = typename std::enable_if_t>> + void reset_device( + at::Device device, + const DeviceGuardImplInterface* impl = nullptr) { + if (!guard_.has_value()) { + guard_.emplace(device, impl); + } else { + guard_->reset_device(device, impl); + } + } + + /// Resets the currently set device to its original device, and then sets the + /// current device to the passed device. Initializes the guard if it is + /// not already initialized. This is effectively equivalent to set_device + /// when a guard supports only a single device type. + template < + typename U = T, + typename = + typename std::enable_if_t>> + void reset_device(at::Device device) { + if (!guard_.has_value()) { + guard_.emplace(device); + } else { + guard_->reset_device(device); + } + } + + /// Sets the device index to the given one. The device type is statically + /// known. + template < + typename U = T, + typename = + typename std::enable_if_t>> + void set_index(DeviceIndex index) { + if (!guard_.has_value()) { + guard_.emplace(index); + } else { + guard_->set_index(index); + } + } + + /// Returns the device that was set immediately prior to initialization of + /// the, guard, or nullopt if the guard is uninitialized. + optional original_device() const { + return guard_.has_value() ? make_optional(guard_->original_device()) + : nullopt; + } + + /// Returns the most recent device that was set using this device guard, + /// either from construction, or via set_device, if the guard is initialized, + /// or nullopt if the guard is uninitialized. 
+ optional current_device() const { + return guard_.has_value() ? make_optional(guard_->current_device()) + : nullopt; + } + + /// Restore the original device, resetting this guard to uninitialized state. + void reset() { + guard_.reset(); + } + + private: + optional> guard_; +}; + +} // namespace c10::impl diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/impl/InlineEvent.h b/MLPY/Lib/site-packages/torch/include/c10/core/impl/InlineEvent.h new file mode 100644 index 0000000000000000000000000000000000000000..7ff255440af161e77e1cc8863f52d4c102c45ef1 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/impl/InlineEvent.h @@ -0,0 +1,113 @@ +#pragma once + +#include +#include +#include +#include + +namespace c10::impl { + +template +struct InlineEvent final { + InlineEvent() = delete; + InlineEvent( + const DeviceType _device_type, + const EventFlag _flag = EventFlag::PYTORCH_DEFAULT) + : backend_{_device_type}, device_type_{_device_type}, flag_{_flag} {} + + // Copy constructor and copy assignment operator (deleted) + InlineEvent(const InlineEvent&) = delete; + InlineEvent& operator=(const InlineEvent&) = delete; + + // Move constructor and move assignment operator + InlineEvent(InlineEvent&& other) noexcept + : event_(other.event_), + backend_(std::move(other.backend_)), + device_type_(other.device_type_), + device_index_(other.device_index_), + flag_(other.flag_), + was_marked_for_recording_(other.was_marked_for_recording_) { + other.event_ = nullptr; + } + InlineEvent& operator=(InlineEvent&& other) noexcept { + swap(other); + return *this; + } + + void swap(InlineEvent& other) noexcept { + std::swap(event_, other.event_); + std::swap(backend_, other.backend_); + std::swap(device_type_, other.device_type_); + std::swap(device_index_, other.device_index_); + std::swap(flag_, other.flag_); + std::swap(was_marked_for_recording_, other.was_marked_for_recording_); + } + + ~InlineEvent() noexcept { + if (event_) + backend_.destroyEvent(event_, device_index_); + } + + DeviceType device_type() const noexcept { + return device_type_; + } + DeviceIndex device_index() const noexcept { + return device_index_; + } + EventFlag flag() const noexcept { + return flag_; + } + bool was_marked_for_recording() const noexcept { + return was_marked_for_recording_; + } + + void recordOnce(const Stream& stream) { + if (!was_marked_for_recording_) + record(stream); + } + + void record(const Stream& stream) { + TORCH_CHECK( + stream.device_type() == device_type_, + "Event device type ", + DeviceTypeName(device_type_), + " does not match recording stream's device type ", + DeviceTypeName(stream.device_type()), + "."); + + backend_.record(&event_, stream, device_index_, flag_); + was_marked_for_recording_ = true; + device_index_ = stream.device_index(); + } + + void block(const Stream& stream) const { + if (!was_marked_for_recording_) + return; + + TORCH_CHECK( + stream.device_type() == device_type_, + "Event device type ", + DeviceTypeName(device_type_), + " does not match blocking stream's device type ", + DeviceTypeName(stream.device_type()), + "."); + + backend_.block(event_, stream); + } + + bool query() const { + if (!was_marked_for_recording_) + return true; + return backend_.queryEvent(event_); + } + + private: + void* event_ = nullptr; + T backend_; + DeviceType device_type_; + DeviceIndex device_index_ = -1; + EventFlag flag_ = EventFlag::PYTORCH_DEFAULT; + bool was_marked_for_recording_ = false; +}; + +} // namespace c10::impl diff --git 
a/MLPY/Lib/site-packages/torch/include/c10/core/impl/InlineStreamGuard.h b/MLPY/Lib/site-packages/torch/include/c10/core/impl/InlineStreamGuard.h new file mode 100644 index 0000000000000000000000000000000000000000..52d3a648aced64d62eb99ca8ce47c0069729f922 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/impl/InlineStreamGuard.h @@ -0,0 +1,255 @@ +#pragma once + +#include +#include +#include + +namespace c10::impl { + +/** + * A StreamGuard is an RAII class that changes the current device + * to the device corresponding to some stream, and changes the + * default stream on that device to be this stream. + * + * InlineStreamGuard is a helper class for implementing StreamGuards. + * See InlineDeviceGuard for guidance on how to use this class. + */ +template +class InlineStreamGuard : private InlineDeviceGuard { + public: + /// No default constructor, see Note [Omitted default constructor from RAII] + explicit InlineStreamGuard() = delete; + + /// Set the current device to the device associated with the passed stream, + /// and set the current stream on that device to the passed stream. + explicit InlineStreamGuard(Stream stream) + : InlineDeviceGuard(stream.device()), + original_stream_of_original_device_( + this->impl_.getStream(original_device())), + original_stream_of_current_device_(this->impl_.exchangeStream(stream)), + current_stream_(stream) {} + + /// This constructor exists purely for testing + template < + typename U = T, + typename = typename std::enable_if_t>> + explicit InlineStreamGuard( + Stream stream, + const DeviceGuardImplInterface* impl) + : InlineDeviceGuard( + stream.device(), + impl ? impl : getDeviceGuardImpl(stream.device_type())), + original_stream_of_original_device_( + this->impl_.getStream(original_device())), + original_stream_of_current_device_(this->impl_.exchangeStream(stream)), + current_stream_(stream) {} + + /// Copy is disallowed + InlineStreamGuard(const InlineStreamGuard&) = delete; + InlineStreamGuard& operator=(const InlineStreamGuard&) = delete; + + /// Move is disallowed, as StreamGuard does not have an uninitialized state, + /// which is required for moves on types with nontrivial destructors. + InlineStreamGuard(InlineStreamGuard&& other) = delete; + InlineStreamGuard& operator=(InlineStreamGuard&& other) = delete; + + ~InlineStreamGuard() { + this->impl_.exchangeStream(original_stream_of_current_device_); + } + + /// Resets the currently set stream to the original stream and + /// the currently set device to the original device. Then, + /// set the current device to the device associated with the passed stream, + /// and set the current stream on that device to the passed stream. + /// + /// NOTE: this implementation may skip some stream/device setting if + /// it can prove that it is unnecessary. + /// + /// WARNING: reset_stream does NOT preserve previously set streams on + /// different devices. If you need to set streams on multiple devices + /// use MultiStreamGuard instead. + void reset_stream(Stream stream) { + // TODO: make a version that takes an impl argument. Unfortunately, + // that will require SFINAE because impl is only valid for the + // VirtualGuardImpl specialization. 
+ if (stream.device() == this->current_device()) { + this->impl_.exchangeStream(stream); + current_stream_ = stream; + } else { + // Destruct and reconstruct the StreamGuard in-place + this->impl_.exchangeStream(original_stream_of_current_device_); + this->reset_device(stream.device()); + original_stream_of_current_device_ = this->impl_.exchangeStream(stream); + current_stream_ = stream; + } + } + + // It's not clear if set_device should also reset the current stream + // if the device is unchanged; therefore, we don't provide it. + // The situation is somewhat clearer with reset_device, but it's still + // a pretty weird thing to do, so haven't added this either. + + /// Returns the stream of the original device prior to this guard. Subtly, + /// the stream returned here is the original stream of the *original* + /// device; i.e., it's the stream that your computation *would* have + /// been put on, if it hadn't been for this meddling stream guard. + /// This is usually what you want. + Stream original_stream() const { + return original_stream_of_original_device_; + } + + /// Returns the most recent stream that was set using this device guard, + /// either from construction, or via set_stream. + Stream current_stream() const { + return current_stream_; + } + + /// Returns the most recent device that was set using this device guard, + /// either from construction, or via set_device/reset_device/set_index. + Device current_device() const { + return InlineDeviceGuard::current_device(); + } + + /// Returns the device that was set at the most recent reset_stream(), + /// or otherwise the device at construction time. + Device original_device() const { + return InlineDeviceGuard::original_device(); + } + + private: + Stream + original_stream_of_original_device_; // what the user probably cares about + Stream original_stream_of_current_device_; // what we need to restore + Stream current_stream_; +}; + +/** + * An OptionalStreamGuard is an RAII class that sets a device to some value on + * initialization, and resets the device to its original value on destruction. + * See InlineOptionalDeviceGuard for more guidance on how to use this class. + */ +template +class InlineOptionalStreamGuard { + public: + /// Creates an uninitialized stream guard. + explicit InlineOptionalStreamGuard() + : guard_() // See Note [Explicit initialization of optional fields] + {} + + /// Set the current device to the device associated with the passed stream, + /// and set the current stream on that device to the passed stream, + /// if the passed stream is not nullopt. + explicit InlineOptionalStreamGuard(optional stream_opt) : guard_() { + if (stream_opt.has_value()) { + guard_.emplace(stream_opt.value()); + } + } + + /// All constructors of StreamGuard are valid for OptionalStreamGuard + template + explicit InlineOptionalStreamGuard(Args&&... args) + : guard_(std::in_place, std::forward(args)...) {} + + // See Note [Move construction for RAII guards is tricky] + InlineOptionalStreamGuard(InlineOptionalStreamGuard&& other) = delete; + + // See Note [Move assignment for RAII guards is tricky] + InlineOptionalStreamGuard& operator=(InlineOptionalStreamGuard&& other) = + delete; + + /// Resets the currently set stream to the original stream and + /// the currently set device to the original device. Then, + /// set the current device to the device associated with the passed stream, + /// and set the current stream on that device to the passed stream. 
+ /// Initializes the OptionalStreamGuard if it was not previously initialized. + void reset_stream(Stream stream) { + if (guard_.has_value()) { + guard_->reset_stream(stream); + } else { + guard_.emplace(stream); + } + } + + /// Returns the stream that was set at the time the guard was most recently + /// initialized, or nullopt if the guard is uninitialized. + optional original_stream() const { + return guard_.has_value() ? make_optional(guard_->original_stream()) + : nullopt; + } + + /// Returns the most recent stream that was set using this stream guard, + /// either from construction, or via reset_stream, if the guard is + /// initialized, or nullopt if the guard is uninitialized. + optional current_stream() const { + return guard_.has_value() ? make_optional(guard_->current_stream()) + : nullopt; + } + + /// Restore the original device and stream, resetting this guard to + /// uninitialized state. + void reset() { + guard_.reset(); + } + + private: + optional> guard_; +}; + +template +class InlineMultiStreamGuard { + public: + /// Calls `set_stream` on each of the streams in the list. + /// This may be useful if you need to set different streams + /// for different devices. + explicit InlineMultiStreamGuard(ArrayRef streams) { + if (!streams.empty()) { + impl_.emplace(getDeviceTypeOfStreams(streams)); + original_streams_.reserve(streams.size()); + for (const Stream& s : streams) { + original_streams_.emplace_back(this->impl_->exchangeStream(s)); + } + } + } + + /// Copy is disallowed + InlineMultiStreamGuard(const InlineMultiStreamGuard&) = delete; + InlineMultiStreamGuard& operator=(const InlineMultiStreamGuard&) = delete; + + /// Move is disallowed, as StreamGuard does not have an uninitialized state, + /// which is required for moves on types with nontrivial destructors. + InlineMultiStreamGuard(InlineMultiStreamGuard&& other) = delete; + InlineMultiStreamGuard& operator=(InlineMultiStreamGuard&& other) = delete; + + ~InlineMultiStreamGuard() noexcept { + if (this->impl_.has_value()) { + for (const Stream& s : original_streams_) { + this->impl_->exchangeStream(s); + } + } + } + + protected: + optional impl_; + + private: + /// The original streams that were active on all devices. + std::vector original_streams_; + + static DeviceType getDeviceTypeOfStreams(ArrayRef streams) { + TORCH_INTERNAL_ASSERT(!streams.empty()); + DeviceType type = streams[0].device_type(); + for (const auto idx : c10::irange(1, streams.size())) { + TORCH_CHECK_VALUE( + streams[idx].device_type() == type, + "Streams have a mix of device types: stream 0 is on ", + streams[0].device(), + " while stream ", + idx, + " is on device ", + streams[idx].device()); + } + return type; + } +}; + +} // namespace c10::impl diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/impl/LocalDispatchKeySet.h b/MLPY/Lib/site-packages/torch/include/c10/core/impl/LocalDispatchKeySet.h new file mode 100644 index 0000000000000000000000000000000000000000..acf7fce944b9fc7b5be7e94a7822541ab03a407f --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/impl/LocalDispatchKeySet.h @@ -0,0 +1,162 @@ +#pragma once + +#include +#include + +// TLS management for DispatchKeySet (the "local" DispatchKeySet(s)) +// +// This manages two thread-local DispatchKeySets: +// +// - The included type set, which adds a tensor type for consideration +// in dispatch. (For example, you might add Profiling to +// the included type set to turn on profiling on all tensor operations.) 
+// +// - The excluded type set, which disqualifies a tensor type from dispatch. +// (For example, after redispatching on variable, we disqualify +// Autograd so we don't attempt to handle variable again.) +// (Exclusion wins over inclusion.) +// +// NB: Originally, I implemented the excluded type set as storing the inverted +// set, but TLS is defined to be zero-initialized, so this doesn't actually work +// (if it's inverted, you want the set to be -1 initialized). + +namespace c10::impl { + +// POD version of LocalDispatchKeySet. Declared here just so that +// we can put it in the guards. +// This struct encapsulates special handling for TLS initialization +// in set_included()/included() API so that they reflect the truth. +// If you want to create PODLocalDispatchKeySet with non-zero state, +// use set_included() instead of default constructor. +struct C10_API PODLocalDispatchKeySet { + uint64_t included_; + uint64_t excluded_; + + // See Note [TLS Initialization] + DispatchKeySet included() const { + return DispatchKeySet(DispatchKeySet::RAW, included_) ^ + c10::default_included_set; + } + DispatchKeySet excluded() const { + return DispatchKeySet(DispatchKeySet::RAW, excluded_) ^ + c10::default_excluded_set; + } + + void set_included(DispatchKeySet x) { + included_ = (x ^ c10::default_included_set).raw_repr(); + } + void set_excluded(DispatchKeySet x) { + excluded_ = (x ^ c10::default_excluded_set).raw_repr(); + } +}; +static_assert( + std::is_trivial_v, + "PODLocalDispatchKeySet must be a POD type."); + +struct C10_API LocalDispatchKeySet { + /* implicit */ LocalDispatchKeySet(PODLocalDispatchKeySet x) + : included_(x.included()), excluded_(x.excluded()) {} + DispatchKeySet included_; + DispatchKeySet excluded_; +}; + +// thread_local variables cannot be C10_API on Windows. +// Inlining this seems to break AutoDispatchBelowAutograd on Android. +#if defined(_MSC_VER) || defined(C10_ANDROID) || defined(C10_IPHONE) +C10_API LocalDispatchKeySet tls_local_dispatch_key_set(); +#else // defined(_MSC_VER) || defined(C10_ANDROID) || defined(C10_IPHONE) +extern C10_API thread_local PODLocalDispatchKeySet raw_local_dispatch_key_set; + +inline C10_API LocalDispatchKeySet tls_local_dispatch_key_set() { + // Don't let people fiddle with the thread_local directly just + // because they include this header. + return raw_local_dispatch_key_set; +} +#endif // defined(_MSC_VER) || defined(C10_ANDROID) || defined(C10_IPHONE) + +// Internal, use ThreadLocalStateGuard +C10_API void _force_tls_local_dispatch_key_set(LocalDispatchKeySet key_set); + +// RAII API for manipulating the thread-local dispatch state. 
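+//
+// Typical use is a scoped guard (sketch; the key shown is just an example):
+//
+//   {
+//     c10::impl::ExcludeDispatchKeyGuard no_autograd(
+//         c10::DispatchKey::AutogradCPU);
+//     // dispatch in this scope skips AutogradCPU
+//   }  // previous TLS state is restored here
+//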
+ +class C10_API IncludeDispatchKeyGuard { + public: + IncludeDispatchKeyGuard(DispatchKeySet); + IncludeDispatchKeyGuard(DispatchKey k) + : IncludeDispatchKeyGuard(DispatchKeySet(k)) {} + IncludeDispatchKeyGuard(const IncludeDispatchKeyGuard&) = delete; + IncludeDispatchKeyGuard operator=(const IncludeDispatchKeyGuard&) = delete; + IncludeDispatchKeyGuard(IncludeDispatchKeyGuard&&) = delete; + IncludeDispatchKeyGuard operator=(IncludeDispatchKeyGuard&&) = delete; + ~IncludeDispatchKeyGuard(); + + private: + // A little micro-optimization to save us from tls_get_addr call + // on destruction + PODLocalDispatchKeySet* tls_; + DispatchKeySet include_; +}; + +class C10_API ExcludeDispatchKeyGuard { + public: + ExcludeDispatchKeyGuard(DispatchKeySet); + ExcludeDispatchKeyGuard(DispatchKey k) + : ExcludeDispatchKeyGuard(DispatchKeySet(k)) {} + ExcludeDispatchKeyGuard(const ExcludeDispatchKeyGuard&) = delete; + ExcludeDispatchKeyGuard operator=(const ExcludeDispatchKeyGuard&) = delete; + ExcludeDispatchKeyGuard(ExcludeDispatchKeyGuard&&) = delete; + ExcludeDispatchKeyGuard operator=(ExcludeDispatchKeyGuard&&) = delete; + ~ExcludeDispatchKeyGuard(); + + private: + // A little micro-optimization to save us from tls_get_addr call + // on destruction + PODLocalDispatchKeySet* tls_; + DispatchKeySet exclude_; +}; + +struct C10_API ForceDispatchKeyGuard { + public: + ForceDispatchKeyGuard(c10::impl::LocalDispatchKeySet key_set) + : saved_keyset_(c10::impl::tls_local_dispatch_key_set()) { + c10::impl::_force_tls_local_dispatch_key_set(key_set); + } + ForceDispatchKeyGuard( + c10::DispatchKeySet include, + c10::DispatchKeySet exclude) + : saved_keyset_(c10::impl::tls_local_dispatch_key_set()) { + auto updated_set = saved_keyset_; + updated_set.included_ = include; + updated_set.excluded_ = exclude; + c10::impl::_force_tls_local_dispatch_key_set(updated_set); + } + ~ForceDispatchKeyGuard() { + c10::impl::_force_tls_local_dispatch_key_set(saved_keyset_); + } + + private: + c10::impl::LocalDispatchKeySet saved_keyset_; +}; + +// Non-RAII API for manipulating the thread-local dispatch state. +// Please prefer the RAII API. The non-RAII API may be useful when +// the included/excluded state of a given DispatchKey must span +// many calls from the Python to the C++, so you cannot conveniently +// use an RAII guard. +// +// Example use case: a Python context manager that includes a certain +// DispatchKey, to ensure ops running under the context manager dispatch +// through that DispatchKey's registered overrides. +// +// The non-RAII API is less efficient than the RAII guards because both the +// getter and setter will do a tls_getaddr lookup (the RAII struct only needs +// one!) 
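+//
+// A minimal sketch of the non-RAII pattern described above (the helper
+// functions are hypothetical; only the tls_* calls come from this header):
+//
+//   void enter_python_mode() {
+//     c10::impl::tls_set_dispatch_key_included(c10::DispatchKey::Python, true);
+//   }
+//   void exit_python_mode() {
+//     c10::impl::tls_set_dispatch_key_included(c10::DispatchKey::Python, false);
+//   }
+//
+// The two calls can live in different C++ entry points (e.g. the __enter__
+// and __exit__ of a Python context manager), which is exactly the case the
+// RAII guards cannot cover.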
+ +C10_API bool tls_is_dispatch_key_excluded(DispatchKey x); +C10_API void tls_set_dispatch_key_excluded(DispatchKey x, bool desired_state); +C10_API bool tls_is_dispatch_key_included(DispatchKey x); +C10_API void tls_set_dispatch_key_included(DispatchKey x, bool desired_state); +C10_API bool tls_is_dispatch_keyset_excluded(DispatchKeySet ks); +C10_API bool tls_is_dispatch_keyset_included(DispatchKeySet ks); + +} // namespace c10::impl diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/impl/PyInterpreter.h b/MLPY/Lib/site-packages/torch/include/c10/core/impl/PyInterpreter.h new file mode 100644 index 0000000000000000000000000000000000000000..4f759aac2b7b5a39fe627673b03914be381156fd --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/impl/PyInterpreter.h @@ -0,0 +1,239 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Forward declarations + +namespace c10 { +struct IValue; +class OperatorHandle; +struct TensorImpl; +} // namespace c10 + +namespace torch::jit { +using Stack = std::vector; +} + +// Actual implementation + +namespace c10::impl { + +struct C10_API PyInterpreter; + +// Note [Python interpreter tag] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Traditionally, PyTorch is layered such that our Python library +// (libtorch_python) references our pure C++ library (libtorch) as the +// natural order of things. However, sometimes this natural order is +// subverted: C++ objects refer to Python objects (for example, we +// store a PyObject* pointer on TensorImpl so that converting from a +// C++ Tensor to a Python Tensor is just a memory dereference). +// +// These unusual orderings must be treated with care. To start, you need to +// virtualize the destructor so that the PyObject can be decref'ed on +// destruction (because the C++ object itself doesn't know anything about +// Python--remember, layering!). This process itself is fraught, since +// acquiring the GIL could lead to deadlocks if someone is blocking on you +// while holding the GIL. Furthermore, if the C++ objects outlive the +// interpreter (which can happen if you stash them in a static global +// variable defined in libtorch), you may attempt to decref the object when +// the Python interpreter has already been shutdown. +// +// BUT WAIT, IT GETS WORSE. With torchdeploy, there may be multiple Python +// interpreters in a single process. If a C++ object is accessible from +// multiple interpreters, we must take care not to accidentally pass a +// PyObject from one interpreter with another interpreter. +// +// To prevent these mixups, we introduce a PyInterpreter "tag" (object with +// a vtable), which specifies a specific Python interpreter. +// +// - Any given object can be associated with AT MOST one Python interpreter. +// We represent the interpreter tag as a memory address to an instance of +// a virtual class that is allocated once per interpreter (this is so that +// we can request the interpreter to perform operations for us, if +// necessary). +// +// - It can be recorded with a PyObject (PyInterpreterObject) so that +// we know what interpreter the object is associated with, and we can +// raise an error if you try to use the PyObject from the wrong +// interpreter context. +// +// - It contains a vtable that can be used to perform various Python +// operations from ordinary C++ code that ordinarily wouldn't be accessible +// from libtorch. 
+// +// A simple use case is when a C++ object must be associated with a PyObject. +// However, for TensorImpl, we lazily allocate a PyObject the first time the +// object passes into Python. The invariants for this situation are more +// subtle: +// +// - A given TensorImpl's interpreter tag can only go from uninitialized to +// tagged; once tagged, this is a quiescent state (once tagged to an +// interpreter, ALWAYS tagged to that interpreter) +// +// - A thread may mutate the PyObject field of a TensorImpl if and only if it +// holds the GIL for the interpreter tagged on the TensorImpl. (If the +// TensorImpl is not tagged, it must first atomically claim its tag before it +// can validly write) +// +// WARNING: This class has to be written very carefully, because it may be +// possible for a Tensor to have a reference an interpreter corresponding to +// a shared library that has ALREADY BEEN UNLOADED. This makes blindly calling +// virtual methods very dangerous, because the vtable may be garbage at that +// point (on a good day, you might get "pure virtual method called"). +// +// The idea to solve this problem is we always leak PyInterpreters (so they +// always stay live even after dlclose), and make sure we can disarm their +// virtual methods by indirecting through a separate PyInterpreterVTable +// object. This can be replaced with a no-op vtable from libc10.so, which +// is guaranteed to stick around until the bitter end. +// +// NB: The downside with representing PyInterpreter tags as full objects is that +// it takes an extra word on TensorImpl. If tags were instead just integer +// indices, on 64-bit architectures we could pack the tag and PyObject together +// into a single atomic word. On 32-bit architectures we could simply say that +// only one Python interpreter is supported (erroring if a nontrivial +// interpreter tag is attempted to be set). +// +// The difficulty with this scheme is we need to maintain an out-of-line table +// to get at the PyInterpreters so that we can do virtual method calls on them, +// and registration/deregistration to this table must be done in a thread safe +// manner. This can be easily done if the number of possible PyInterpreters is +// small enough (e.g., 8-bit integer) by simply preallocating an array of +// sufficient size to hold all possible interpreters. Surely 128 threads is +// more than enough for anyone! +// +// I didn't decide to do this technique at the moment, because the extra word +// added by the PyInterpreter tag takes us to 24 words, which means that we +// still fit inside three eight word cache lines. If you need to penny pinch +// another word consider doing this! + +struct C10_API PyInterpreterVTable { + virtual ~PyInterpreterVTable() = default; + + // Report the name of this interpreter + virtual std::string name() const = 0; + + // Run Py_DECREF on a PyObject. 
We DO NOT assume the GIL is held on call + // See NOTE [PyInterpreter::decref takes a `has_pyobj_slot` arg] + virtual void decref(PyObject* pyobj, bool has_pyobj_slot) const = 0; + + // Perform a detach by deferring to the __torch_dispatch__ implementation of + // detach, which will also arrange for the PyObject to get copied in this + // situation + virtual c10::intrusive_ptr detach( + const TensorImpl* self) const = 0; + + // Invoke the Python boxed fallback dispatch to go back into Python + virtual void dispatch(const c10::OperatorHandle& op, torch::jit::Stack* stack) + const = 0; + + virtual void reportErrorCallback(PyObject* callback, DispatchKey key) + const = 0; + + // This is only invoked in the multipy/torchdeploy situation from + // pythonOpRegistrationTrampoline; this lets us get to the Python + // interpreter to actually find the appropriate Python op registration + // entry to call. + virtual void python_op_registration_trampoline( + const c10::OperatorHandle& op, + c10::DispatchKey, + torch::jit::Stack* stack) const = 0; + + virtual void throw_abstract_impl_not_imported_error( + std::string opname, + const char* pymodule, + const char* context) const = 0; + + // Invoke the Python dispatcher to handle this call + virtual void python_dispatcher( + const c10::OperatorHandle& op, + c10::DispatchKeySet, + torch::jit::Stack* stack) const = 0; + + virtual bool is_contiguous(const TensorImpl* self, at::MemoryFormat) + const = 0; + virtual bool is_strides_like(const TensorImpl* self, at::MemoryFormat) + const = 0; + virtual bool is_non_overlapping_and_dense(const TensorImpl* self) const = 0; + virtual c10::Device device(const TensorImpl* self) const = 0; + virtual int64_t dim(const TensorImpl* self) const = 0; + virtual c10::IntArrayRef strides(const TensorImpl* self) const = 0; + virtual c10::IntArrayRef sizes(const TensorImpl* self) const = 0; + virtual c10::SymIntArrayRef sym_sizes(const TensorImpl* self) const = 0; + virtual c10::Layout layout(const TensorImpl* self) const = 0; + virtual int64_t numel(const TensorImpl* self) const = 0; + virtual c10::SymInt sym_numel(const TensorImpl* self) const = 0; + virtual c10::SymIntArrayRef sym_strides(const TensorImpl* self) const = 0; + virtual c10::SymInt sym_storage_offset(const TensorImpl* self) const = 0; + + virtual void trace_gpu_event_creation(uintptr_t event) const = 0; + virtual void trace_gpu_event_deletion(uintptr_t event) const = 0; + virtual void trace_gpu_event_record(uintptr_t event, uintptr_t stream) + const = 0; + virtual void trace_gpu_event_wait(uintptr_t event, uintptr_t stream) + const = 0; + virtual void trace_gpu_memory_allocation(uintptr_t ptr) const = 0; + virtual void trace_gpu_memory_deallocation(uintptr_t ptr) const = 0; + virtual void trace_gpu_stream_creation(uintptr_t stream) const = 0; + virtual void trace_gpu_device_synchronization() const = 0; + virtual void trace_gpu_stream_synchronization(uintptr_t stream) const = 0; + virtual void trace_gpu_event_synchronization(uintptr_t event) const = 0; + + virtual void reset_backward_hooks(const TensorImpl* self) const = 0; +}; + +struct C10_API PyInterpreter { + const PyInterpreterVTable* vtable_; + + PyInterpreter(const PyInterpreterVTable* vtable) : vtable_(vtable){}; + + const PyInterpreterVTable& operator*() const noexcept { + return *vtable_; + } + const PyInterpreterVTable* operator->() const noexcept { + return vtable_; + } + + // Disarm this PyInterpreter, making all of its methods noops. 
+ // The vtable pointer is not an atomic at the moment, which means + // a disarm() invocation that is concurrent with active destructors + // is not thread safe and will trigger TSAN. My hope is that this + // situations doesn't ever actually happen; tensor destruction should + // quiesce when a dlclose happens, and any long lived tensors whose + // destructors would be disarmed here only begin the destruction process + // on process shutdown (long after the dlclose has occurred). + void disarm() noexcept; +}; + +// PyInterpreterStatus describes what the state of its interpreter tag +// is, relative to the thread currently holding the GIL. +enum class PyInterpreterStatus { + // We just allocated the Tensor, it hasn't escaped to other threads, + // we know that it definitely hasn't been tagged to be associated + // with an interpreter. + DEFINITELY_UNINITIALIZED, + // We queried the interpreter field and it looked uninitialized. But + // another thread may have raced with us to tag it with some other + // interpreter id. So we will have to do a CEX to make sure we can + // actually nab it. + MAYBE_UNINITIALIZED, + // We queried the interpreter field and it was tagged to belong to us. + // This means we have sole write access (as we hold the GIL for this + // interpreter) + TAGGED_BY_US, + // Someone else tagged this. We can't use this TensorImpl from Python. + TAGGED_BY_OTHER, +}; + +} // namespace c10::impl diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/impl/PyObjectSlot.h b/MLPY/Lib/site-packages/torch/include/c10/core/impl/PyObjectSlot.h new file mode 100644 index 0000000000000000000000000000000000000000..b850099490bdb336feb51ad1a91c2f191fd874e7 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/impl/PyObjectSlot.h @@ -0,0 +1,190 @@ +#pragma once + +#include +#include +#include +#include + +#include + +namespace c10::impl { + +struct C10_API PyObjectSlot { + public: + PyObjectSlot(); + + ~PyObjectSlot(); + + void maybe_destroy_pyobj(); + + // Associate the TensorImpl with the specified PyObject, and, if necessary, + // also tag the interpreter. + // + // NB: This lives in a header so that we can inline away the switch on status + // + // NB: THIS FUNCTION CAN RAISE AN EXCEPTION. Make sure to clean up after + // PyObject if necessary! + void init_pyobj( + PyInterpreter* self_interpreter, + PyObject* pyobj, + PyInterpreterStatus status) { + impl::PyInterpreter* expected = nullptr; + switch (status) { + case impl::PyInterpreterStatus::DEFINITELY_UNINITIALIZED: + // caller guarantees there is no multithreaded access; if there is + // no data race OK to do a relaxed store + pyobj_interpreter_.store(self_interpreter, std::memory_order_relaxed); + break; + case impl::PyInterpreterStatus::TAGGED_BY_US: + // no tagging is necessary, the tag is already correct + break; + case impl::PyInterpreterStatus::MAYBE_UNINITIALIZED: + // attempt to claim this TensorImpl with the specified interpreter + // tag + if (pyobj_interpreter_.compare_exchange_strong( + expected, self_interpreter, std::memory_order_acq_rel)) { + break; + } + // test if, actually, it was already tagged by us! this situation can't + // be caused by a race, but it could be caused by a situation + // where someone conservatively tagged the tensor as MAYBE_UNINITIALIZED + // (because they didn't pre-check the tag) when actually it was + // owned by the interpreter + if (expected == self_interpreter) { + break; + } + // fallthrough, we lost the race. 
We are guaranteed not to lose the + // race with ourself, as calls to init_pyobj with the same interpreter + // ID must be sequentialized by the GIL + [[fallthrough]]; + case impl::PyInterpreterStatus::TAGGED_BY_OTHER: + TORCH_CHECK( + false, + "cannot allocate PyObject for Tensor on interpreter ", + self_interpreter, + " that has already been used by another torch deploy interpreter ", + pyobj_interpreter_.load()); + } + + // we are the ONLY thread that can have gotten to this point. It is not + // possible to conflict with another zero interpreter as access is protected + // by GIL + // NB: owns_pyobj tag is initially false + pyobj_ = pyobj; + } + + // Query the PyObject interpreter. This may return null if there is no + // interpreter. This is racy! + PyInterpreter* pyobj_interpreter(); + + PyObject* _unchecked_untagged_pyobj() const; + + // Test the interpreter tag. If tagged for the current interpreter, return + // a non-nullopt (but possibly null) PyObject. If (possibly) untagged, + // returns a nullopt. If it is definitely invalid, raises an error. + // + // If `ignore_hermetic_tls` is false and this function is called from a + // hermetic context (ie, `HermeticPyObjectTLS::get_state()` is true), then + // nullopt is returned. If `ignore_hermetic_tls` is true, then the hermetic + // context is ignored, allowing you to check the interpreter tag of a + // nonhermetic PyObject from within a hermetic context. This is necessary + // because there are some cases where the deallocator function of a + // nonhermetic PyObject is called from within a hermetic context, so it must + // be properly treated as a nonhermetic PyObject. + // + // NB: this lives in header so that we can avoid actually creating the + // c10::optional + c10::optional check_pyobj( + PyInterpreter* self_interpreter, + bool ignore_hermetic_tls = false) const { + // Note [Memory ordering on Python interpreter tag] + impl::PyInterpreter* interpreter = + pyobj_interpreter_.load(std::memory_order_acquire); + if (interpreter == nullptr) { + // NB: This never returns DEFINITELY_UNINITIALIZED because there is + // always the possibility that another thread races to initialize + // after we query here. The only time when we can conclude a tensor + // is definitely uninitialized is when we have just allocated it and + // it cannot have escaped to other threads yet + return c10::nullopt; + } else if (interpreter == self_interpreter) { + // NB: pyobj_ could still be null! + if (!ignore_hermetic_tls && c10::impl::HermeticPyObjectTLS::get_state()) { + return c10::nullopt; + } else { + return c10::make_optional(_unchecked_untagged_pyobj()); + } + } else { + TORCH_CHECK( + false, + "cannot access PyObject for Tensor on interpreter ", + (*self_interpreter)->name(), + " that has already been used by another torch deploy interpreter ", + (*pyobj_interpreter_.load())->name()); + } + } + + // Clear the PyObject field for an interpreter, in situations where we + // statically know the tensor is tagged with our interpreter. + void unchecked_clear_pyobj(PyInterpreter* interpreter); + + PyInterpreter& load_pyobj_interpreter() const; + + // Check if the PyObjectSlot's interpreter is the same as the specified + // interpreter + bool check_interpreter(PyInterpreter* interpreter); + + // Check if the PyObjectSlot is holding a PyObject, owned or non-owned + bool has_pyobj_nonhermetic(); + + bool owns_pyobj(); + + void set_owns_pyobj(bool b); + + private: + // This field contains the interpreter tag for this object. 
See + // Note [Python interpreter tag] for general context + // + // Note [Memory ordering on Python interpreter tag] + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // What memory_order do we need when accessing this atomic? We don't + // need a single total modification order (as provided by + // memory_order_seq_cst) as pyobj_interpreter_ is monotonic: it can only + // transition from -1 to some positive integer and never changes afterwards. + // Because there is only one modification, it trivially already has a total + // modification order (e.g., we don't need fences or locked instructions on + // x86) + // + // In fact, one could make a reasonable argument that relaxed reads are OK, + // due to the presence of external locking (GIL) to ensure that interactions + // with other data structures are still correctly synchronized, so that + // we fall in the "Single-Location Data Structures" case as described in + // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2055r0.pdf + // However, on x86, it doesn't matter if I use acquire or relaxed on the load + // as I get the same assembly in both cases. So I just use the more + // conservative acquire (which will impede compiler optimizations but I don't + // care) + std::atomic pyobj_interpreter_; + + // This field contains a reference to a PyObject representing this Tensor. + // If pyobj is nullptr, when we transfer Tensor to Python, we allocate a new + // PyObject for it and set this field. This field does not have to be + // protected by an atomic as it is only allowed to be accessed when you hold + // the GIL, or during destruction of the tensor. + // + // When a PyObject dies, you are obligated to clear this field + // (otherwise, you will try to use-after-free the pyobj); this currently + // occurs in THPVariable_clear in torch/csrc/autograd/python_variable.cpp + // + // NB: Ordinarily, this should not be a strong reference, as if the + // PyObject owns the Tensor, this would create a reference cycle. + // However, sometimes this ownership flips. To track who owns + // who, this has a single pointer tag indicating whether or not the + // C++ object owns the PyObject (the common case, zero, means PyObject + // owns the C++ object); see _unchecked_untagged_pyobj for raw access + // or check_pyobj for checked access. 
See references to PyObject + // resurrection in torch/csrc/autograd/python_variable.cpp + PyObject* pyobj_; +}; + +} // namespace c10::impl diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/impl/PythonDispatcherTLS.h b/MLPY/Lib/site-packages/torch/include/c10/core/impl/PythonDispatcherTLS.h new file mode 100644 index 0000000000000000000000000000000000000000..0c6ed14312275653ee8bbf4c7940a4918d1df2b9 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/impl/PythonDispatcherTLS.h @@ -0,0 +1,24 @@ +#pragma once + +#include +#include + +namespace c10::impl { + +struct C10_API PythonDispatcherTLS { + static void set_state(PyInterpreter* state); + static PyInterpreter* get_state(); + static void reset_state(); +}; + +struct C10_API DisablePythonDispatcher { + DisablePythonDispatcher() : old_(PythonDispatcherTLS::get_state()) { + PythonDispatcherTLS::set_state({}); + } + ~DisablePythonDispatcher() { + PythonDispatcherTLS::set_state(old_); + } + PyInterpreter* old_; +}; + +} // namespace c10::impl diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/impl/SizesAndStrides.h b/MLPY/Lib/site-packages/torch/include/c10/core/impl/SizesAndStrides.h new file mode 100644 index 0000000000000000000000000000000000000000..59f5a9a4d7fccba8d3a046da0dcc82355c580b60 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/impl/SizesAndStrides.h @@ -0,0 +1,315 @@ +#pragma once + +#include +#include + +#include +#include +#include + +#define C10_SIZES_AND_STRIDES_MAX_INLINE_SIZE 5 + +namespace c10::impl { + +// Packed container for TensorImpl sizes and strides. +// This design improves on the previous approach of using a pair of +// c10::SmallVector by specializing for the operations we +// actually use and enforcing that the number of sizes is the same as +// the number of strides. The memory layout is as follows: +// +// 1 size_t for the size +// 5 eightbytes of inline sizes and 5 eightbytes of inline strides, OR pointer +// to out-of-line array +class C10_API SizesAndStrides { + public: + // TODO: different iterator types for sizes & strides to prevent + // mixing the two accidentally. + using sizes_iterator = int64_t*; + using sizes_const_iterator = const int64_t*; + using strides_iterator = int64_t*; + using strides_const_iterator = const int64_t*; + + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) + SizesAndStrides() { + size_at_unchecked(0) = 0; + stride_at_unchecked(0) = 1; + } + + ~SizesAndStrides() { + if (C10_UNLIKELY(!isInline())) { + // NOLINTNEXTLINE(cppcoreguidelines-no-malloc) + free(outOfLineStorage_); + } + } + + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) + SizesAndStrides(const SizesAndStrides& rhs) : size_(rhs.size_) { + if (C10_LIKELY(rhs.isInline())) { + copyDataInline(rhs); + } else { + allocateOutOfLineStorage(size_); + copyDataOutline(rhs); + } + } + + SizesAndStrides& operator=(const SizesAndStrides& rhs) { + if (this == &rhs) { + return *this; + } + if (C10_LIKELY(rhs.isInline())) { + if (C10_UNLIKELY(!isInline())) { + // NOLINTNEXTLINE(cppcoreguidelines-no-malloc) + free(outOfLineStorage_); + } + copyDataInline(rhs); + } else { + if (isInline()) { + allocateOutOfLineStorage(rhs.size_); + } else { + resizeOutOfLineStorage(rhs.size_); + } + copyDataOutline(rhs); + } + size_ = rhs.size_; + return *this; + } + + // Move from rhs. rhs.size() == 0 afterwards. 
+ SizesAndStrides(SizesAndStrides&& rhs) noexcept : size_(rhs.size_) { + if (C10_LIKELY(isInline())) { + memcpy(inlineStorage_, rhs.inlineStorage_, sizeof(inlineStorage_)); + } else { + outOfLineStorage_ = rhs.outOfLineStorage_; + rhs.outOfLineStorage_ = nullptr; + } + + rhs.size_ = 0; + } + + // Move from rhs. rhs.size() == 0 afterwards. + SizesAndStrides& operator=(SizesAndStrides&& rhs) noexcept { + if (this == &rhs) { + return *this; + } + if (C10_LIKELY(rhs.isInline())) { + if (C10_UNLIKELY(!isInline())) { + // NOLINTNEXTLINE(cppcoreguidelines-no-malloc) + free(outOfLineStorage_); + } + copyDataInline(rhs); + } else { + // They're outline. We're going to steal their vector. + if (!isInline()) { + // NOLINTNEXTLINE(cppcoreguidelines-no-malloc) + free(outOfLineStorage_); + } + outOfLineStorage_ = rhs.outOfLineStorage_; + rhs.outOfLineStorage_ = nullptr; + } + size_ = rhs.size_; + rhs.size_ = 0; + + return *this; + } + + size_t size() const noexcept { + return size_; + } + + const int64_t* sizes_data() const noexcept { + if (C10_LIKELY(isInline())) { + return &inlineStorage_[0]; + } else { + return &outOfLineStorage_[0]; + } + } + + int64_t* sizes_data() noexcept { + if (C10_LIKELY(isInline())) { + return &inlineStorage_[0]; + } else { + return &outOfLineStorage_[0]; + } + } + + sizes_const_iterator sizes_begin() const noexcept { + return sizes_data(); + } + + sizes_iterator sizes_begin() noexcept { + return sizes_data(); + } + + sizes_const_iterator sizes_end() const noexcept { + return sizes_begin() + size(); + } + + sizes_iterator sizes_end() noexcept { + return sizes_begin() + size(); + } + + IntArrayRef sizes_arrayref() const noexcept { + return IntArrayRef{sizes_data(), size()}; + } + + void set_sizes(IntArrayRef newSizes) { + resize(newSizes.size()); + std::copy(newSizes.begin(), newSizes.end(), sizes_begin()); + } + + void set_strides(IntArrayRef strides) { + TORCH_INTERNAL_ASSERT(strides.size() == size()); + std::copy(strides.begin(), strides.end(), strides_begin()); + } + + const int64_t* strides_data() const noexcept { + if (C10_LIKELY(isInline())) { + return &inlineStorage_[C10_SIZES_AND_STRIDES_MAX_INLINE_SIZE]; + } else { + return &outOfLineStorage_[size()]; + } + } + + int64_t* strides_data() noexcept { + if (C10_LIKELY(isInline())) { + return &inlineStorage_[C10_SIZES_AND_STRIDES_MAX_INLINE_SIZE]; + } else { + return &outOfLineStorage_[size()]; + } + } + + strides_const_iterator strides_begin() const noexcept { + if (C10_LIKELY(isInline())) { + return &inlineStorage_[C10_SIZES_AND_STRIDES_MAX_INLINE_SIZE]; + } else { + return &outOfLineStorage_[size()]; + } + } + + strides_iterator strides_begin() noexcept { + if (C10_LIKELY(isInline())) { + return &inlineStorage_[C10_SIZES_AND_STRIDES_MAX_INLINE_SIZE]; + } else { + return &outOfLineStorage_[size()]; + } + } + + strides_const_iterator strides_end() const noexcept { + return strides_begin() + size(); + } + + strides_iterator strides_end() noexcept { + return strides_begin() + size(); + } + + IntArrayRef strides_arrayref() const noexcept { + return IntArrayRef{strides_data(), size()}; + } + + // Size accessors. + int64_t size_at(size_t idx) const noexcept { + assert(idx < size()); + return sizes_data()[idx]; + } + + int64_t& size_at(size_t idx) noexcept { + assert(idx < size()); + return sizes_data()[idx]; + } + + int64_t size_at_unchecked(size_t idx) const noexcept { + return sizes_data()[idx]; + } + + int64_t& size_at_unchecked(size_t idx) noexcept { + return sizes_data()[idx]; + } + + // Size accessors. 
+ int64_t stride_at(size_t idx) const noexcept { + assert(idx < size()); + return strides_data()[idx]; + } + + int64_t& stride_at(size_t idx) noexcept { + assert(idx < size()); + return strides_data()[idx]; + } + + int64_t stride_at_unchecked(size_t idx) const noexcept { + return strides_data()[idx]; + } + + int64_t& stride_at_unchecked(size_t idx) noexcept { + return strides_data()[idx]; + } + + void resize(size_t newSize) { + const auto oldSize = size(); + if (newSize == oldSize) { + return; + } + if (C10_LIKELY( + newSize <= C10_SIZES_AND_STRIDES_MAX_INLINE_SIZE && isInline())) { + if (oldSize < newSize) { + const auto bytesToZero = + (newSize - oldSize) * sizeof(inlineStorage_[0]); + memset(&inlineStorage_[oldSize], 0, bytesToZero); + memset( + &inlineStorage_[C10_SIZES_AND_STRIDES_MAX_INLINE_SIZE + oldSize], + 0, + bytesToZero); + } + size_ = newSize; + } else { + resizeSlowPath(newSize, oldSize); + } + } + + void resizeSlowPath(size_t newSize, size_t oldSize); + + private: + bool isInline() const noexcept { + return size_ <= C10_SIZES_AND_STRIDES_MAX_INLINE_SIZE; + } + + void copyDataInline(const SizesAndStrides& rhs) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(rhs.isInline()); + memcpy(inlineStorage_, rhs.inlineStorage_, sizeof(inlineStorage_)); + } + + static size_t storageBytes(size_t size) noexcept { + return size * 2 * sizeof(int64_t); + } + + void allocateOutOfLineStorage(size_t size) { + // NOLINTNEXTLINE(cppcoreguidelines-no-malloc) + outOfLineStorage_ = static_cast(malloc(storageBytes(size))); + TORCH_CHECK( + outOfLineStorage_, + "Could not allocate memory for Tensor SizesAndStrides!"); + } + + void resizeOutOfLineStorage(size_t newSize) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!isInline()); + outOfLineStorage_ = static_cast( + // NOLINTNEXTLINE(cppcoreguidelines-no-malloc) + realloc(outOfLineStorage_, storageBytes(newSize))); + TORCH_CHECK( + outOfLineStorage_, + "Could not allocate memory for Tensor SizesAndStrides!"); + } + + void copyDataOutline(const SizesAndStrides& rhs) noexcept { + memcpy(outOfLineStorage_, rhs.outOfLineStorage_, storageBytes(rhs.size_)); + } + + size_t size_{1}; + union { + int64_t* outOfLineStorage_; + // NOLINTNEXTLINE(*c-array*) + int64_t inlineStorage_[C10_SIZES_AND_STRIDES_MAX_INLINE_SIZE * 2]{}; + }; +}; + +} // namespace c10::impl diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/impl/TorchDispatchModeTLS.h b/MLPY/Lib/site-packages/torch/include/c10/core/impl/TorchDispatchModeTLS.h new file mode 100644 index 0000000000000000000000000000000000000000..12546ff72817e58f8b6c5bdf286977dfc578c7fb --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/impl/TorchDispatchModeTLS.h @@ -0,0 +1,63 @@ +#pragma once + +#include +#include + +namespace c10::impl { + +enum class TorchDispatchModeKey : int8_t { + FAKE, + PROXY, + FUNCTIONAL, + NUM_MODE_KEYS +}; + +struct C10_API TorchDispatchModeTLS { + // This API is NOT invariant safe. + // It must not take in an infra mode that uses TorchDispatchModeKey + // If you're pushing an infra mode onto the stack, we expect + // you to use set_mode + static void push_non_infra_mode_onto_stack( + std::shared_ptr mode); + // Pops the top mode of the stack, + // giving precedence to user modes before attempting to pop + // any infra modes + static const std::shared_ptr pop_stack(); + // Returns the highest-priority infra mode on the stack, + // along with its mode key. 
+ static const std::tuple, TorchDispatchModeKey> + pop_highest_infra_mode(); + + static const std::shared_ptr& get_stack_at(int64_t idx); + static int64_t stack_len(); + + static const c10::optional> get_mode( + TorchDispatchModeKey mode_key); + static const c10::optional> unset_mode( + TorchDispatchModeKey mode_key); + static void set_mode( + const std::shared_ptr& mode, + TorchDispatchModeKey mode_key); + + static const TorchDispatchModeTLS& get_state(); + static void set_state(TorchDispatchModeTLS state); + + static bool any_modes_set(bool skip_infra_modes = false); + + private: + std::vector> stack_; + // Users are allowed to push multiple ProxyTorchDispatchMode objects onto the + // stack + // However, we only allow a single FakeTensorMode onto the stack at a time + // (Pushing additional FakeTensorModes onto the stack is a no-op) + std::array< + c10::optional>, + static_cast(TorchDispatchModeKey::NUM_MODE_KEYS)> + infra_modes_; +}; + +C10_API bool dispatch_mode_enabled(); + +C10_API std::string to_string(TorchDispatchModeKey mode_key); + +} // namespace c10::impl diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/impl/VirtualGuardImpl.h b/MLPY/Lib/site-packages/torch/include/c10/core/impl/VirtualGuardImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..62e430e423855787f6deae36735687c358f649fa --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/impl/VirtualGuardImpl.h @@ -0,0 +1,91 @@ +#pragma once + +#include + +namespace c10::impl { + +/** + * An implementation of DeviceGuardImplInterface which delegates + * to virtual dispatch on the DeviceGuardImpl registry. + */ +class VirtualGuardImpl final : public DeviceGuardImplInterface { + public: + VirtualGuardImpl(DeviceType device_type) + : impl_(getDeviceGuardImpl(device_type)) {} + // This constructor exists purely for testing + VirtualGuardImpl(const DeviceGuardImplInterface* impl) : impl_(impl) {} + + // Copying and moving is OK! 
+ VirtualGuardImpl(const VirtualGuardImpl&) = default; + VirtualGuardImpl& operator=(const VirtualGuardImpl&) = default; + VirtualGuardImpl(VirtualGuardImpl&&) noexcept = default; + VirtualGuardImpl& operator=(VirtualGuardImpl&&) noexcept = default; + + DeviceType type() const override { + return impl_->type(); + } + Device exchangeDevice(Device d) const override { + return impl_->exchangeDevice(d); + } + Device getDevice() const override { + return impl_->getDevice(); + } + void setDevice(Device d) const override { + impl_->setDevice(d); + } + void uncheckedSetDevice(Device d) const noexcept override { + impl_->uncheckedSetDevice(d); + } + Stream getStream(Device d) const noexcept override { + return impl_->getStream(d); + } + Stream getDefaultStream(Device d) const override { + return impl_->getDefaultStream(d); + } + Stream getStreamFromGlobalPool(Device d, bool isHighPriority = false) + const override { + return impl_->getStreamFromGlobalPool(d, isHighPriority); + } + Stream exchangeStream(Stream s) const noexcept override { + return impl_->exchangeStream(s); + } + DeviceIndex deviceCount() const noexcept override { + return impl_->deviceCount(); + } + + // Event functions + void record( + void** event, + const Stream& stream, + const DeviceIndex device_index, + const EventFlag flag) const override { + impl_->record(event, stream, device_index, flag); + } + void block(void* event, const Stream& stream) const override { + impl_->block(event, stream); + } + bool queryEvent(void* event) const override { + return impl_->queryEvent(event); + } + void destroyEvent(void* event, const DeviceIndex device_index) + const noexcept override { + impl_->destroyEvent(event, device_index); + } + + bool queryStream(const Stream& stream) const override { + return impl_->queryStream(stream); + } + void synchronizeStream(const Stream& stream) const override { + impl_->synchronizeStream(stream); + } + + void recordDataPtrOnStream(const c10::DataPtr& data_ptr, const Stream& stream) + const override { + impl_->recordDataPtrOnStream(data_ptr, stream); + } + + private: + const DeviceGuardImplInterface* impl_ = nullptr; +}; + +} // namespace c10::impl diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/impl/alloc_cpu.h b/MLPY/Lib/site-packages/torch/include/c10/core/impl/alloc_cpu.h new file mode 100644 index 0000000000000000000000000000000000000000..2d2b27d572fda80daa6c8705714fdf9689b9ce9a --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/impl/alloc_cpu.h @@ -0,0 +1,12 @@ +#pragma once + +#include + +#include + +namespace c10 { + +C10_API void* alloc_cpu(size_t nbytes); +C10_API void free_cpu(void* data); + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/core/thread_pool.h b/MLPY/Lib/site-packages/torch/include/c10/core/thread_pool.h new file mode 100644 index 0000000000000000000000000000000000000000..0cc15b325dfb9593205253efba320ec428294ea0 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/core/thread_pool.h @@ -0,0 +1,120 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace c10 { + +class C10_API TaskThreadPoolBase { + public: + virtual void run(std::function func) = 0; + + virtual size_t size() const = 0; + + /** + * The number of available (i.e. idle) threads in this thread pool. + */ + virtual size_t numAvailable() const = 0; + + /** + * Check if the current thread is from the thread pool. 
+ */ + virtual bool inThreadPool() const = 0; + + virtual ~TaskThreadPoolBase() noexcept = default; + + static size_t defaultNumThreads(); +}; + +class C10_API ThreadPool : public c10::TaskThreadPoolBase { + protected: + struct task_element_t { + bool run_with_id; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) + const std::function no_id; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) + const std::function with_id; + + explicit task_element_t(std::function f) + : run_with_id(false), no_id(std::move(f)), with_id(nullptr) {} + explicit task_element_t(std::function f) + : run_with_id(true), no_id(nullptr), with_id(std::move(f)) {} + }; + + std::queue tasks_; + std::vector threads_; + mutable std::mutex mutex_; + std::condition_variable condition_; + std::condition_variable completed_; + std::atomic_bool running_; + bool complete_; + std::size_t available_; + std::size_t total_; + int numa_node_id_; + + public: + ThreadPool() = delete; + + explicit ThreadPool( + int pool_size, + int numa_node_id = -1, + const std::function& init_thread = nullptr); + + ~ThreadPool() override; + + size_t size() const override; + + size_t numAvailable() const override; + + bool inThreadPool() const override; + + void run(std::function func) override; + + template + void runTaskWithID(Task task) { + std::unique_lock lock(mutex_); + + // Set task and signal condition variable so that a worker thread will + // wake up and use the task. + tasks_.emplace(static_cast>(task)); + complete_ = false; + condition_.notify_one(); + } + + /// @brief Wait for queue to be empty + void waitWorkComplete(); + + private: + // @brief Entry point for pool threads. + void main_loop(std::size_t index); +}; + +class C10_API TaskThreadPool : public c10::ThreadPool { + public: + explicit TaskThreadPool(int pool_size, int numa_node_id = -1) + : ThreadPool(pool_size, numa_node_id, [numa_node_id]() { + setThreadName("CaffeTaskThread"); + NUMABind(numa_node_id); + }) {} +}; + +C10_DECLARE_SHARED_REGISTRY( + ThreadPoolRegistry, + TaskThreadPoolBase, + int, + int, + bool); + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDAAlgorithm.h b/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDAAlgorithm.h new file mode 100644 index 0000000000000000000000000000000000000000..f771421d4c8c4e4270ee6ff27ef736bbbcec318e --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDAAlgorithm.h @@ -0,0 +1,31 @@ +#ifdef THRUST_DEVICE_LOWER_BOUND_WORKS +#include +#include +#include +#include +#endif +namespace c10::cuda { +#ifdef THRUST_DEVICE_LOWER_BOUND_WORKS +template +__forceinline__ __device__ Iter +lower_bound(Iter start, Iter end, Scalar value) { + return thrust::lower_bound(thrust::device, start, end, value); +} +#else +// thrust::lower_bound is broken on device, see +// https://github.com/NVIDIA/thrust/issues/1734 Implementation inspired by +// https://github.com/pytorch/pytorch/blob/805120ab572efef66425c9f595d9c6c464383336/aten/src/ATen/native/cuda/Bucketization.cu#L28 +template +__device__ Iter lower_bound(Iter start, Iter end, Scalar value) { + while (start < end) { + auto mid = start + ((end - start) >> 1); + if (*mid < value) { + start = mid + 1; + } else { + end = mid; + } + } + return end; +} +#endif // THRUST_DEVICE_LOWER_BOUND_WORKS +} // namespace c10::cuda diff --git a/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDAAllocatorConfig.h b/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDAAllocatorConfig.h new file mode 100644 index 
0000000000000000000000000000000000000000..ad45cd71d4e7858dee81ca82e247a5e33395e305 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDAAllocatorConfig.h @@ -0,0 +1,124 @@ +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace c10::cuda::CUDACachingAllocator { + +// Environment config parser +class C10_CUDA_API CUDAAllocatorConfig { + public: + static size_t max_split_size() { + return instance().m_max_split_size; + } + static double garbage_collection_threshold() { + return instance().m_garbage_collection_threshold; + } + + static bool expandable_segments() { +#ifndef PYTORCH_C10_DRIVER_API_SUPPORTED + if (instance().m_expandable_segments) { + TORCH_WARN_ONCE("expandable_segments not supported on this platform") + } + return false; +#else + return instance().m_expandable_segments; +#endif + } + + static bool release_lock_on_cudamalloc() { + return instance().m_release_lock_on_cudamalloc; + } + + /** Pinned memory allocator settings */ + static bool pinned_use_cuda_host_register() { + return instance().m_pinned_use_cuda_host_register; + } + + static size_t pinned_num_register_threads() { + return instance().m_pinned_num_register_threads; + } + + static size_t pinned_max_register_threads() { + // Based on the benchmark results, we see better allocation performance + // with 8 threads. However on future systems, we may need more threads + // and limiting this to 128 threads. + return 128; + } + + // This is used to round-up allocation size to nearest power of 2 divisions. + // More description below in function roundup_power2_next_division + // As ane example, if we want 4 divisions between 2's power, this can be done + // using env variable: PYTORCH_CUDA_ALLOC_CONF=roundup_power2_divisions:4 + static size_t roundup_power2_divisions(size_t size); + + static std::vector roundup_power2_divisions() { + return instance().m_roundup_power2_divisions; + } + + static std::string last_allocator_settings() { + std::lock_guard lock( + instance().m_last_allocator_settings_mutex); + return instance().m_last_allocator_settings; + } + + static CUDAAllocatorConfig& instance() { + static CUDAAllocatorConfig* s_instance = ([]() { + auto inst = new CUDAAllocatorConfig(); + const char* env = getenv("PYTORCH_CUDA_ALLOC_CONF"); + inst->parseArgs(env); + return inst; + })(); + return *s_instance; + } + + void parseArgs(const char* env); + + private: + CUDAAllocatorConfig(); + + static void lexArgs(const char* env, std::vector& config); + static void consumeToken( + const std::vector& config, + size_t i, + const char c); + size_t parseMaxSplitSize(const std::vector& config, size_t i); + size_t parseGarbageCollectionThreshold( + const std::vector& config, + size_t i); + size_t parseRoundUpPower2Divisions( + const std::vector& config, + size_t i); + size_t parseAllocatorConfig( + const std::vector& config, + size_t i, + bool& used_cudaMallocAsync); + size_t parsePinnedUseCudaHostRegister( + const std::vector& config, + size_t i); + size_t parsePinnedNumRegisterThreads( + const std::vector& config, + size_t i); + + std::atomic m_max_split_size; + std::vector m_roundup_power2_divisions; + std::atomic m_garbage_collection_threshold; + std::atomic m_pinned_num_register_threads; + std::atomic m_expandable_segments; + std::atomic m_release_lock_on_cudamalloc; + std::atomic m_pinned_use_cuda_host_register; + std::string m_last_allocator_settings; + std::mutex m_last_allocator_settings_mutex; +}; + +// General caching allocator utilities +C10_CUDA_API 
void setAllocatorSettings(const std::string& env); + +} // namespace c10::cuda::CUDACachingAllocator diff --git a/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDACachingAllocator.h b/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDACachingAllocator.h new file mode 100644 index 0000000000000000000000000000000000000000..40f6bebe187585135a2d1e5c5863bec8b6585b57 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDACachingAllocator.h @@ -0,0 +1,481 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +// Caching allocator will execute every registered callback if it unable to find +// block inside of already allocated area. +class C10_CUDA_API FreeMemoryCallback { + public: + virtual ~FreeMemoryCallback() = default; + virtual bool Execute() = 0; +}; + +C10_DECLARE_REGISTRY(FreeCudaMemoryCallbacksRegistry, FreeMemoryCallback); +#define REGISTER_FREE_MEMORY_CALLBACK(name, ...) \ + C10_REGISTER_CLASS(FreeCudaMemoryCallbacksRegistry, name, __VA_ARGS__); +} // namespace c10 + // +// TODO: Turn this into an honest to goodness class. I briefly attempted to do +// this, but it was a bit irritating to figure out how to also correctly +// apply pimpl pattern so I didn't have to leak any internal implementation +// details in the header (CUDACachingAllocator could be made a pimpl, but +// you also need to appropriately define a class which is a subclass +// of Allocator. Not impossible, but required a bit more surgery than +// I wanted to do at the time.) +// +// Why is this using a namespace rather than old-style THCCachingAllocator_ +// prefix? Mostly because it made the HIPify rules easier to write; _ is +// not counted as a word boundary, so you would otherwise have to list each +// of these functions. + +namespace c10::cuda::CUDACachingAllocator { + +extern const size_t kLargeBuffer; + +struct Stat { + int64_t current = 0; + int64_t peak = 0; + int64_t allocated = 0; + int64_t freed = 0; +}; + +enum struct StatType : uint64_t { + AGGREGATE = 0, + SMALL_POOL = 1, + LARGE_POOL = 2, + NUM_TYPES = 3 // remember to update this whenever a new stat type is added +}; + +typedef std::array(StatType::NUM_TYPES)> StatArray; + +// Struct containing memory allocator summary statistics for a device. +struct DeviceStats { + // COUNT: allocations requested by client code + StatArray allocation; + // COUNT: number of allocated segments from cudaMalloc(). + StatArray segment; + // COUNT: number of active memory blocks (allocated or used by stream) + StatArray active; + // COUNT: number of inactive, split memory blocks (unallocated but can't be + // released via cudaFree) + StatArray inactive_split; + + // SUM: bytes allocated by this memory alocator + StatArray allocated_bytes; + // SUM: bytes reserved by this memory allocator (both free and used) + StatArray reserved_bytes; + // SUM: bytes within active memory blocks + StatArray active_bytes; + // SUM: bytes within inactive, split memory blocks + StatArray inactive_split_bytes; + // SUM: bytes requested by client code + StatArray requested_bytes; + + // COUNT: total number of failed calls to CUDA malloc necessitating cache + // flushes. + int64_t num_alloc_retries = 0; + + // COUNT: total number of OOMs (i.e. 
failed calls to CUDA after cache flush) + int64_t num_ooms = 0; + + // COUNT: total number of oversize blocks allocated from pool + Stat oversize_allocations; + + // COUNT: total number of oversize blocks requiring malloc + Stat oversize_segments; + + // COUNT: total number of synchronize_and_free_events() calls + int64_t num_sync_all_streams = 0; + + // COUNT: total number of CUDA allocation calls. This includes both cuMemMap + // and cudaMalloc. + int64_t num_device_alloc = 0; + + // COUNT: total number of CUDA free calls. This includes both cuMemUnmap + // and cudaFree. + int64_t num_device_free = 0; + + // SIZE: maximum block size that is allowed to be split. + int64_t max_split_size = 0; +}; + +typedef std::shared_ptr (*CreateContextFn)(); + +// Struct containing info of an allocation block (i.e. a fractional part of a +// cudaMalloc).. +struct BlockInfo { + int64_t size = 0; + int64_t requested_size = 0; + int32_t gc_counter = 0; + bool allocated = false; + bool active = false; + std::shared_ptr + context_when_allocated; // per-watcher context +}; + +// Struct containing info of a memory segment (i.e. one contiguous cudaMalloc). +struct SegmentInfo { + c10::DeviceIndex device = 0; + int64_t address = 0; + int64_t total_size = 0; + int64_t requested_size = 0; // unrounded, actually requested size + int64_t allocated_size = 0; + int64_t active_size = 0; + cudaStream_t stream = nullptr; + bool is_large = false; + bool is_expandable = false; + MempoolId_t owner_private_pool_id = {0, 0}; + std::vector blocks; + std::shared_ptr context_when_allocated; +}; + +struct AllocatorState { + virtual ~AllocatorState() = default; +}; + +union trace_time_ { + time_t t_; + approx_time_t approx_t_; +}; + +struct TraceEntry { + enum Action { + ALLOC, // API made to the caching allocator for new memory + FREE_REQUESTED, // API call made to the caching allocator to free memory + FREE_COMPLETED, // The allocator might have to delay a free because + // it is still in use on another stream via record_stream + // This event is generated when a free actually completes. + SEGMENT_ALLOC, // a call to cudaMalloc to get more memory from the OS + SEGMENT_FREE, // a call to cudaFree to return memory to the OS (e.g. 
to + // defragment or empty_caches) + SEGMENT_MAP, // a call to cuMemMap (used with expandable_segments) + SEGMENT_UNMAP, // unmap part of a segment (used with expandable segments) + SNAPSHOT, // a call to snapshot, used to correlate memory snapshots to trace + // events + OOM // the allocator threw an OutOfMemoryError (addr_ is the amount of free + // bytes reported by cuda) + }; + TraceEntry( + Action action, + c10::DeviceIndex device, + int64_t addr, + size_t size, + cudaStream_t stream, + approx_time_t time, + std::shared_ptr context = nullptr) + : action_(action), + device_(device), + addr_(addr), + context_(std::move(context)), + stream_(stream), + size_(static_cast(size)) { + time_.approx_t_ = time; + } + Action action_; + c10::DeviceIndex device_; + int64_t addr_; // for OOM, this is the amount of free bytes reported by cuda + std::shared_ptr context_; + cudaStream_t stream_{}; + int64_t size_; + trace_time_ time_{}; +}; + +struct AllocatorConfigInfo { + double garbage_collection_threshold; + size_t max_split_size; + size_t pinned_num_register_threads; + bool expandable_segments; + bool release_lock_on_malloc; + bool pinned_use_host_register; + std::string last_allocator_settings; + std::vector roundup_power2_divisions; +}; + +struct SnapshotInfo { + std::vector segments; + std::vector> device_traces; + AllocatorConfigInfo config_metadata; +}; + +// returns the pointers freed in the pool +// and the pointers allocated. Note: a pointer +// may appear in both freed and allocated +struct CheckpointDelta { + std::vector ptrs_freed; + std::vector dataptrs_allocd; +}; + +enum struct RecordContext { + NEVER = 0, + STATE = 1, // only keep stacks for active allocations + ALLOC = 2, // additionally keep stacks for allocations in the trace history + ALL = 3, // additionally record stacks for when something is freed +}; + +// Size pretty-printer +std::string format_size(uint64_t size); + +using OutOfMemoryObserver = std::function; + +using AllocatorTraceTracker = std::function; + +class CUDAAllocator : public Allocator { + public: + virtual void* raw_alloc(size_t nbytes) = 0; + virtual void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) = 0; + virtual void raw_delete(void* ptr) = 0; + virtual void init(int device_count) = 0; + virtual bool initialized() = 0; + virtual void setMemoryFraction(double fraction, c10::DeviceIndex device) = 0; + virtual void emptyCache() = 0; + virtual void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) = 0; + virtual void* getBaseAllocation(void* ptr, size_t* size) = 0; + virtual void recordStream(const DataPtr&, CUDAStream stream) = 0; + virtual DeviceStats getDeviceStats(c10::DeviceIndex device) = 0; + virtual void resetAccumulatedStats(c10::DeviceIndex device) = 0; + virtual void resetPeakStats(c10::DeviceIndex device) = 0; + virtual SnapshotInfo snapshot() = 0; + virtual void beginAllocateToPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + std::function filter) = 0; + virtual void endAllocateToPool( + c10::DeviceIndex device, + MempoolId_t mempool_id) = 0; + virtual void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) = 0; + // returns true if the allocated blocks are equal to expected live allocations + virtual bool checkPoolLiveAllocations( + c10::DeviceIndex device, + MempoolId_t mempool_id, + const std::unordered_set& expected_live_allocations) { + TORCH_CHECK( + false, + name(), + " does not yet support checkPoolLiveAllocations. 
" + "If you need it, please file an issue describing your use case."); + } + virtual std::shared_ptr getIpcDevPtr(std::string handle) = 0; + virtual bool isHistoryEnabled() { + TORCH_CHECK( + false, + name(), + " does not yet support recordHistory. " + "If you need it, please file an issue describing your use case."); + } + virtual void recordHistory( + bool enabled, + CreateContextFn context_recorder, + size_t alloc_trace_max_entries, + RecordContext when) = 0; + virtual void attachOutOfMemoryObserver(OutOfMemoryObserver observer) = 0; + + // Attached AllocatorTraceTracker callbacks will be called while the + // per-device allocator lock is held. Any additional locks taken from within + // the callback must be proven to always have the lock order that never + // triggers a deadlock. In particular, Python's GIL may be held when + // calling the allocator so it is unsafe to try to acquire the GIL in this + // callback. + virtual void attachAllocatorTraceTracker(AllocatorTraceTracker tracker) = 0; + + virtual void enablePeerAccess( + c10::DeviceIndex dev, + c10::DeviceIndex dev_to_access) = 0; + + // memory not allocated from cudaMalloc cannot be copied + // across devices using cudaMemcpyAsync if peer to peer access is disabled. + // instead it requires cudaMemcpyAsyncPeer + // with P2P Enabled, all combinations work + // with P2P Disabled: + // cudaMalloc cudaMallocAsync/cuMemMap + // cudaMemcpyAsyncPeer works works + // cudaMemcpyAsync works error + + // This function performs chooses to use the Peer version of + // memcpy if required based on where the allocated put dst/src. + virtual cudaError_t memcpyAsync( + void* dst, + int dstDevice, + const void* src, + int srcDevice, + size_t count, + cudaStream_t stream, + bool p2p_enabled) = 0; + virtual std::shared_ptr getCheckpointState( + c10::DeviceIndex device, + MempoolId_t id) = 0; + virtual CheckpointDelta setCheckpointPoolState( + c10::DeviceIndex device, + std::shared_ptr pps) = 0; + virtual std::string name() = 0; +}; + +// Allocator object, statically initialized +// See BackendInitializer in CUDACachingAllocator.cpp. +// Atomic loads on x86 are just normal loads, +// (atomic stores are different), so reading this value +// is no different than loading a pointer. +C10_CUDA_API extern std::atomic allocator; + +inline CUDAAllocator* get() { + return allocator.load(); +} + +// Called directly by clients. 
+inline void* raw_alloc(size_t nbytes) { + return get()->raw_alloc(nbytes); +} + +inline void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) { + return get()->raw_alloc_with_stream(nbytes, stream); +} + +inline void raw_delete(void* ptr) { + return get()->raw_delete(ptr); +} + +inline void init(int device_count) { + return get()->init(device_count); +} + +inline void setMemoryFraction(double fraction, c10::DeviceIndex device) { + return get()->setMemoryFraction(fraction, device); +} + +inline void emptyCache() { + return get()->emptyCache(); +} + +inline void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) { + return get()->cacheInfo(device, largestBlock); +} + +inline void* getBaseAllocation(void* ptr, size_t* size) { + return get()->getBaseAllocation(ptr, size); +} + +inline void recordStream(const DataPtr& dataPtr, CUDAStream stream) { + return get()->recordStream(dataPtr, stream); +} + +inline DeviceStats getDeviceStats(c10::DeviceIndex device) { + return get()->getDeviceStats(device); +} + +inline void resetAccumulatedStats(c10::DeviceIndex device) { + return get()->resetAccumulatedStats(device); +} + +inline void resetPeakStats(c10::DeviceIndex device) { + return get()->resetPeakStats(device); +} + +inline SnapshotInfo snapshot() { + return get()->snapshot(); +} + +inline std::shared_ptr getCheckpointState( + c10::DeviceIndex device, + MempoolId_t id) { + return get()->getCheckpointState(device, id); +} + +inline CheckpointDelta setCheckpointPoolState( + c10::DeviceIndex device, + std::shared_ptr pps) { + return get()->setCheckpointPoolState(device, std::move(pps)); +} + +// CUDAGraph interactions +inline void beginAllocateToPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + std::function filter) { + get()->beginAllocateToPool(device, mempool_id, std::move(filter)); +} + +inline void endAllocateToPool(c10::DeviceIndex device, MempoolId_t mempool_id) { + get()->endAllocateToPool(device, mempool_id); +} + +inline void recordHistory( + bool enabled, + CreateContextFn context_recorder, + size_t alloc_trace_max_entries, + RecordContext when) { + return get()->recordHistory( + enabled, context_recorder, alloc_trace_max_entries, when); +} + +inline bool isHistoryEnabled() { + return get()->isHistoryEnabled(); +} + +inline bool checkPoolLiveAllocations( + c10::DeviceIndex device, + MempoolId_t mempool_id, + const std::unordered_set& expected_live_allocations) { + return get()->checkPoolLiveAllocations( + device, mempool_id, expected_live_allocations); +} + +inline void attachOutOfMemoryObserver(OutOfMemoryObserver observer) { + return get()->attachOutOfMemoryObserver(std::move(observer)); +} + +inline void attachAllocatorTraceTracker(AllocatorTraceTracker tracker) { + return get()->attachAllocatorTraceTracker(std::move(tracker)); +} + +inline void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) { + return get()->releasePool(device, mempool_id); +} +// Not part of CUDA_ALLOCATOR_BACKEND_INTERFACE +inline std::shared_ptr getIpcDevPtr(std::string handle) { + return get()->getIpcDevPtr(std::move(handle)); +} + +inline std::string name() { + return get()->name(); +} + +inline cudaError_t memcpyAsync( + void* dst, + int dstDevice, + const void* src, + int srcDevice, + size_t count, + cudaStream_t stream, + bool p2p_enabled) { + return get()->memcpyAsync( + dst, dstDevice, src, srcDevice, count, stream, p2p_enabled); +} + +inline void enablePeerAccess( + c10::DeviceIndex dev, + c10::DeviceIndex dev_to_access) { + return get()->enablePeerAccess(dev, 
dev_to_access); +} + +} // namespace c10::cuda::CUDACachingAllocator diff --git a/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDADeviceAssertion.h b/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDADeviceAssertion.h new file mode 100644 index 0000000000000000000000000000000000000000..258abc302ae187cec646b5431ba1c96b331f0e6b --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDADeviceAssertion.h @@ -0,0 +1,96 @@ +#pragma once + +#include +#include + +namespace c10::cuda { + +#ifdef TORCH_USE_CUDA_DSA +// Copy string from `src` to `dst` +static __device__ void dstrcpy(char* dst, const char* src) { + int i = 0; + // Copy string from source to destination, ensuring that it + // isn't longer than `C10_CUDA_DSA_MAX_STR_LEN-1` + while (*src != '\0' && i++ < C10_CUDA_DSA_MAX_STR_LEN - 1) { + *dst++ = *src++; + } + *dst = '\0'; +} + +static __device__ void dsa_add_new_assertion_failure( + DeviceAssertionsData* assertions_data, + const char* assertion_msg, + const char* filename, + const char* function_name, + const int line_number, + const uint32_t caller, + const dim3 block_id, + const dim3 thread_id) { + // `assertions_data` may be nullptr if device-side assertion checking + // is disabled at run-time. If it is disabled at compile time this + // function will never be called + if (!assertions_data) { + return; + } + + // Atomically increment so other threads can fail at the same time + // Note that incrementing this means that the CPU can observe that + // a failure has happened and can begin to respond before we've + // written information about that failure out to the buffer. + const auto nid = atomicAdd(&(assertions_data->assertion_count), 1); + + if (nid >= C10_CUDA_DSA_ASSERTION_COUNT) { + // At this point we're ran out of assertion buffer space. + // We could print a message about this, but that'd get + // spammy if a lot of threads did it, so we just silently + // ignore any other assertion failures. In most cases the + // failures will all probably be analogous anyway. + return; + } + + // Write information about the assertion failure to memory. + // Note that this occurs only after the `assertion_count` + // increment broadcasts that there's been a problem. + auto& self = assertions_data->assertions[nid]; + dstrcpy(self.assertion_msg, assertion_msg); + dstrcpy(self.filename, filename); + dstrcpy(self.function_name, function_name); + self.line_number = line_number; + self.caller = caller; + self.block_id[0] = block_id.x; + self.block_id[1] = block_id.y; + self.block_id[2] = block_id.z; + self.thread_id[0] = thread_id.x; + self.thread_id[1] = thread_id.y; + self.thread_id[2] = thread_id.z; +} + +// Emulates a kernel assertion. The assertion won't stop the kernel's progress, +// so you should assume everything the kernel produces is garbage if there's an +// assertion failure. +// NOTE: This assumes that `assertions_data` and `assertion_caller_id` are +// arguments of the kernel and therefore accessible. 
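// An illustrative kernel body (hypothetical name and arguments) that satisfies
// that assumption by taking TORCH_DSA_KERNEL_ARGS (declared in
// CUDADeviceAssertionHost.h), which expands to exactly those two parameters;
// such a kernel is then launched through TORCH_DSA_KERNEL_LAUNCH:
//
//   __global__ void clamp_kernel(int* data, int n, TORCH_DSA_KERNEL_ARGS) {
//     const int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < n) {
//       CUDA_KERNEL_ASSERT2(data[i] >= 0);  // records the failure, then returns
//       data[i] = min(data[i], 255);
//     }
//   }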
+#define CUDA_KERNEL_ASSERT2(condition) \ + do { \ + if (C10_UNLIKELY(!(condition))) { \ + /* Has an atomic element so threads can fail at the same time */ \ + c10::cuda::dsa_add_new_assertion_failure( \ + assertions_data, \ + C10_STRINGIZE(condition), \ + __FILE__, \ + __FUNCTION__, \ + __LINE__, \ + assertion_caller_id, \ + blockIdx, \ + threadIdx); \ + /* Now that the kernel has failed we early exit the kernel, but */ \ + /* otherwise keep going and rely on the host to check UVM and */ \ + /* determine we've had a problem */ \ + return; \ + } \ + } while (false) +#else +#define CUDA_KERNEL_ASSERT2(condition) assert(condition) +#endif + +} // namespace c10::cuda diff --git a/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDADeviceAssertionHost.h b/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDADeviceAssertionHost.h new file mode 100644 index 0000000000000000000000000000000000000000..6d3e99198e7de7dd61c1880489c956231fb53337 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDADeviceAssertionHost.h @@ -0,0 +1,164 @@ +#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +#ifdef USE_CUDA +#define TORCH_USE_CUDA_DSA +#endif + +/// Number of assertion failure messages we can store. If this is too small +/// threads will fail silently. +constexpr int C10_CUDA_DSA_ASSERTION_COUNT = 10; +constexpr int C10_CUDA_DSA_MAX_STR_LEN = 512; + +namespace c10::cuda { + +/// Holds information about any device-side assertions that fail. +/// Held in managed memory and access by both the CPU and the GPU. +struct DeviceAssertionData { + /// Stringification of the assertion + // NOLINTNEXTLINE(*-c-arrays) + char assertion_msg[C10_CUDA_DSA_MAX_STR_LEN]{}; + /// File the assertion was in + // NOLINTNEXTLINE(*-c-arrays) + char filename[C10_CUDA_DSA_MAX_STR_LEN]{}; + /// Name of the function the assertion was in + // NOLINTNEXTLINE(*-c-arrays) + char function_name[C10_CUDA_DSA_MAX_STR_LEN]{}; + /// Line number the assertion was at + int line_number{}; + /// Number uniquely identifying the kernel launch that triggered the assertion + uint32_t caller{}; + /// block_id of the thread that failed the assertion + // NOLINTNEXTLINE(*-c-arrays) + int32_t block_id[3]{}; + /// third_id of the thread that failed the assertion + // NOLINTNEXTLINE(*-c-arrays) + int32_t thread_id[3]{}; +}; + +/// Used to hold assertions generated by the device +/// Held in managed memory and access by both the CPU and the GPU. 
+struct DeviceAssertionsData { + /// Total number of assertions found; a subset of thse will be recorded + /// in `assertions` + int32_t assertion_count{}; + /// An array of assertions that will be written to in a race-free manner + // NOLINTNEXTLINE(*-c-arrays) + DeviceAssertionData assertions[C10_CUDA_DSA_ASSERTION_COUNT]{}; +}; + +/// Use to hold info about kernel launches so that we can run kernels +/// asynchronously and still associate launches with device-side +/// assertion failures +struct CUDAKernelLaunchInfo { + /// Filename of the code where the kernel was launched from + const char* launch_filename; + /// Function from which the kernel was launched + const char* launch_function; + /// Line number of where the code was launched from + uint32_t launch_linenum; + /// Backtrace of where the kernel was launched from, only populated if + /// CUDAKernelLaunchRegistry::gather_launch_stacktrace is True + std::string launch_stacktrace; + /// Kernel that was launched + const char* kernel_name; + /// Device the kernel was launched on + int device; + /// Stream the kernel was launched on + int32_t stream; + /// A number that uniquely identifies the kernel launch + uint64_t generation_number; +}; + +/// Circular buffer used to hold information about kernel launches +/// this is later used to reconstruct how a device-side kernel assertion failure +/// occurred CUDAKernelLaunchRegistry is used as a singleton +class C10_CUDA_API CUDAKernelLaunchRegistry { + private: + /// Assume that this is the max number of kernel launches that might ever be + /// enqueued across all streams on a single device + static constexpr int max_kernel_launches = 1024; + /// How many kernel launch infos we've inserted. Used to ensure that circular + /// queue doesn't provide false information by always increasing, but also to + /// mark where we are inserting into the queue +#ifdef TORCH_USE_CUDA_DSA + uint64_t generation_number = 0; +#endif + /// Shared mutex between writer and accessor to ensure multi-threaded safety. + mutable std::mutex read_write_mutex; + /// Used to ensure prevent race conditions in GPU memory allocation + mutable std::mutex gpu_alloc_mutex; + /// Pointer to managed memory keeping track of device-side assertions. There + /// is one entry for each possible device the process might work with. Unused + /// entries are nullptrs. We could also use an unordered_set here, but this + /// vector design will be faster and the wasted memory is small since we + /// expect the number of GPUs per node will always be small + std::vector< + std::unique_ptr> + uvm_assertions; + /// A single circular buffer holds information about every kernel launch the + /// process makes across all devices. + std::vector kernel_launches; + bool check_env_for_enable_launch_stacktracing() const; + bool check_env_for_dsa_enabled() const; + + public: + CUDAKernelLaunchRegistry(); + /// Register a new kernel launch and obtain a generation number back to be + /// passed to the kernel + uint32_t insert( + const char* launch_filename, + const char* launch_function, + const uint32_t launch_linenum, + const char* kernel_name, + const int32_t stream_id); + /// Get copies of the kernel launch registry and each device's assertion + /// failure buffer so they can be inspected without raising race conditions + std:: + pair, std::vector> + snapshot() const; + /// Get a pointer to the current device's assertion failure buffer. If no such + /// buffer exists then one is created. 
This means that the first kernel launch + /// made on each device will be slightly slower because memory allocations are + /// required + DeviceAssertionsData* get_uvm_assertions_ptr_for_current_device(); + /// Gets the global singleton of the registry + static CUDAKernelLaunchRegistry& get_singleton_ref(); + /// If not all devices support DSA, we disable it + const bool do_all_devices_support_managed_memory = false; + /// Whether or not to gather stack traces when launching kernels + bool gather_launch_stacktrace = false; + /// Whether or not host-side DSA is enabled or disabled at run-time + /// Note: Device-side code cannot be enabled/disabled at run-time + bool enabled_at_runtime = false; + /// Whether or not a device has indicated a failure + bool has_failed() const; +#ifdef TORCH_USE_CUDA_DSA + const bool enabled_at_compile_time = true; +#else + const bool enabled_at_compile_time = false; +#endif +}; + +std::string c10_retrieve_device_side_assertion_info(); + +} // namespace c10::cuda + +// Each kernel launched with TORCH_DSA_KERNEL_LAUNCH +// requires the same input arguments. We introduce the following macro to +// standardize these. +#define TORCH_DSA_KERNEL_ARGS \ + [[maybe_unused]] c10::cuda::DeviceAssertionsData *const assertions_data, \ + [[maybe_unused]] uint32_t assertion_caller_id + +// This macro can be used to pass the DSA arguments onward to another +// function +#define TORCH_DSA_KERNEL_ARGS_PASS assertions_data, assertion_caller_id diff --git a/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDAException.h b/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDAException.h new file mode 100644 index 0000000000000000000000000000000000000000..4f49c852aa12f65b0df73fc4ef976595ca440332 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDAException.h @@ -0,0 +1,100 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +// Note [CHECK macro] +// ~~~~~~~~~~~~~~~~~~ +// This is a macro so that AT_ERROR can get accurate __LINE__ +// and __FILE__ information. We could split this into a short +// macro and a function implementation if we pass along __LINE__ +// and __FILE__, but no one has found this worth doing. + +// Used to denote errors from CUDA framework. +// This needs to be declared here instead util/Exception.h for proper conversion +// during hipify. 
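// Typical call sites for the macros defined below (illustrative only; the
// kernel, its launch configuration, and the variable names are hypothetical):
//
//   C10_CUDA_CHECK(cudaMemsetAsync(ptr, 0, nbytes, stream));  // throws on failure
//
//   my_kernel<<<grid, block, 0, stream>>>(args);
//   C10_CUDA_KERNEL_LAUNCH_CHECK();            // surface launch errors near the source
//
//   C10_CUDA_CHECK_WARN(cudaEventDestroy(ev)); // warn rather than throw (e.g. in a dtor)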
+namespace c10 { +class C10_CUDA_API CUDAError : public c10::Error { + using Error::Error; +}; +} // namespace c10 + +#define C10_CUDA_CHECK(EXPR) \ + do { \ + const cudaError_t __err = EXPR; \ + c10::cuda::c10_cuda_check_implementation( \ + static_cast(__err), \ + __FILE__, \ + __func__, /* Line number data type not well-defined between \ + compilers, so we perform an explicit cast */ \ + static_cast(__LINE__), \ + true); \ + } while (0) + +#define C10_CUDA_CHECK_WARN(EXPR) \ + do { \ + const cudaError_t __err = EXPR; \ + if (C10_UNLIKELY(__err != cudaSuccess)) { \ + auto error_unused C10_UNUSED = cudaGetLastError(); \ + (void)error_unused; \ + TORCH_WARN("CUDA warning: ", cudaGetErrorString(__err)); \ + } \ + } while (0) + +// Indicates that a CUDA error is handled in a non-standard way +#define C10_CUDA_ERROR_HANDLED(EXPR) EXPR + +// Intentionally ignore a CUDA error +#define C10_CUDA_IGNORE_ERROR(EXPR) \ + do { \ + const cudaError_t __err = EXPR; \ + if (C10_UNLIKELY(__err != cudaSuccess)) { \ + cudaError_t error_unused C10_UNUSED = cudaGetLastError(); \ + (void)error_unused; \ + } \ + } while (0) + +// Clear the last CUDA error +#define C10_CUDA_CLEAR_ERROR() \ + do { \ + cudaError_t error_unused C10_UNUSED = cudaGetLastError(); \ + (void)error_unused; \ + } while (0) + +// This should be used directly after every kernel launch to ensure +// the launch happened correctly and provide an early, close-to-source +// diagnostic if it didn't. +#define C10_CUDA_KERNEL_LAUNCH_CHECK() C10_CUDA_CHECK(cudaGetLastError()) + +/// Launches a CUDA kernel appending to it all the information need to handle +/// device-side assertion failures. Checks that the launch was successful. +#define TORCH_DSA_KERNEL_LAUNCH( \ + kernel, blocks, threads, shared_mem, stream, ...) \ + do { \ + auto& launch_registry = \ + c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref(); \ + kernel<<>>( \ + __VA_ARGS__, \ + launch_registry.get_uvm_assertions_ptr_for_current_device(), \ + launch_registry.insert( \ + __FILE__, __FUNCTION__, __LINE__, #kernel, stream.id())); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); \ + } while (0) + +namespace c10::cuda { + +/// In the event of a CUDA failure, formats a nice error message about that +/// failure and also checks for device-side assertion failures +C10_CUDA_API void c10_cuda_check_implementation( + const int32_t err, + const char* filename, + const char* function_name, + const int line_number, + const bool include_device_assertions); + +} // namespace c10::cuda diff --git a/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDAFunctions.h b/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDAFunctions.h new file mode 100644 index 0000000000000000000000000000000000000000..98540d9ab7a79a4d86545eb506505c310e584e5e --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDAFunctions.h @@ -0,0 +1,116 @@ +#pragma once + +// This header provides C++ wrappers around commonly used CUDA API functions. +// The benefit of using C++ here is that we can raise an exception in the +// event of an error, rather than explicitly pass around error codes. This +// leads to more natural APIs. +// +// The naming convention used here matches the naming convention of torch.cuda + +#include +#include +#include +#include +#include +namespace c10::cuda { + +// NB: In the past, we were inconsistent about whether or not this reported +// an error if there were driver problems are not. 
Based on experience +// interacting with users, it seems that people basically ~never want this +// function to fail; it should just return zero if things are not working. +// Oblige them. +// It still might log a warning for user first time it's invoked +C10_CUDA_API DeviceIndex device_count() noexcept; + +// Version of device_count that throws is no devices are detected +C10_CUDA_API DeviceIndex device_count_ensure_non_zero(); + +C10_CUDA_API DeviceIndex current_device(); + +C10_CUDA_API void set_device(DeviceIndex device); + +C10_CUDA_API void device_synchronize(); + +C10_CUDA_API void warn_or_error_on_sync(); + +// Raw CUDA device management functions +C10_CUDA_API cudaError_t GetDeviceCount(int* dev_count); + +C10_CUDA_API cudaError_t GetDevice(DeviceIndex* device); + +C10_CUDA_API cudaError_t SetDevice(DeviceIndex device); + +C10_CUDA_API cudaError_t MaybeSetDevice(DeviceIndex device); + +C10_CUDA_API DeviceIndex ExchangeDevice(DeviceIndex device); + +C10_CUDA_API DeviceIndex MaybeExchangeDevice(DeviceIndex device); + +C10_CUDA_API void SetTargetDevice(); + +enum class SyncDebugMode { L_DISABLED = 0, L_WARN, L_ERROR }; + +// this is a holder for c10 global state (similar to at GlobalContext) +// currently it's used to store cuda synchronization warning state, +// but can be expanded to hold other related global state, e.g. to +// record stream usage +class WarningState { + public: + void set_sync_debug_mode(SyncDebugMode l) { + sync_debug_mode = l; + } + + SyncDebugMode get_sync_debug_mode() { + return sync_debug_mode; + } + + private: + SyncDebugMode sync_debug_mode = SyncDebugMode::L_DISABLED; +}; + +C10_CUDA_API __inline__ WarningState& warning_state() { + static WarningState warning_state_; + return warning_state_; +} +// the subsequent functions are defined in the header because for performance +// reasons we want them to be inline +C10_CUDA_API void __inline__ memcpy_and_sync( + void* dst, + const void* src, + int64_t nbytes, + cudaMemcpyKind kind, + cudaStream_t stream) { + if (C10_UNLIKELY( + warning_state().get_sync_debug_mode() != SyncDebugMode::L_DISABLED)) { + warn_or_error_on_sync(); + } + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_stream_synchronization( + reinterpret_cast(stream)); + } +#if defined(TORCH_HIP_VERSION) && (TORCH_HIP_VERSION >= 301) + C10_CUDA_CHECK(hipMemcpyWithStream(dst, src, nbytes, kind, stream)); +#else + C10_CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, kind, stream)); + C10_CUDA_CHECK(cudaStreamSynchronize(stream)); +#endif +} + +C10_CUDA_API void __inline__ stream_synchronize(cudaStream_t stream) { + if (C10_UNLIKELY( + warning_state().get_sync_debug_mode() != SyncDebugMode::L_DISABLED)) { + warn_or_error_on_sync(); + } + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_stream_synchronization( + reinterpret_cast(stream)); + } + C10_CUDA_CHECK(cudaStreamSynchronize(stream)); +} + +C10_CUDA_API bool hasPrimaryContext(DeviceIndex device_index); +C10_CUDA_API c10::optional getDeviceIndexWithPrimaryContext(); + +} // namespace c10::cuda diff --git a/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDAGraphsC10Utils.h b/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDAGraphsC10Utils.h new file mode 100644 index 0000000000000000000000000000000000000000..6792068fb0377c4f4c2edc0ede11e3a3c0efdf2d --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDAGraphsC10Utils.h @@ -0,0 +1,91 
@@ +#pragma once + +#include +#include +#include + +// CUDA Graphs utils used by c10 and aten. +// aten/cuda/CUDAGraphsUtils.cuh adds utils used by aten only. + +namespace c10::cuda { + +using CaptureId_t = unsigned long long; + +// first is set if the instance is created by CUDAGraph::capture_begin. +// second is set if the instance is created by at::cuda::graph_pool_handle. +using MempoolId_t = std::pair; + +// RAII guard for "cudaStreamCaptureMode", a thread-local value +// that controls the error-checking strictness of a capture. +#if !defined(USE_ROCM) || ROCM_VERSION >= 50300 +struct C10_CUDA_API CUDAStreamCaptureModeGuard { + CUDAStreamCaptureModeGuard(cudaStreamCaptureMode desired) + : strictness_(desired) { + C10_CUDA_CHECK(cudaThreadExchangeStreamCaptureMode(&strictness_)); + } + ~CUDAStreamCaptureModeGuard() { + C10_CUDA_CHECK_WARN(cudaThreadExchangeStreamCaptureMode(&strictness_)); + } + + private: + cudaStreamCaptureMode strictness_; +}; +#endif + +#if !defined(USE_ROCM) || ROCM_VERSION >= 50300 +// Protects against enum cudaStreamCaptureStatus implementation changes. +// Some compilers seem not to like static_assert without the messages. +static_assert( + int(cudaStreamCaptureStatus::cudaStreamCaptureStatusNone) == 0, + "unexpected int(cudaStreamCaptureStatusNone) value"); +static_assert( + int(cudaStreamCaptureStatus::cudaStreamCaptureStatusActive) == 1, + "unexpected int(cudaStreamCaptureStatusActive) value"); +static_assert( + int(cudaStreamCaptureStatus::cudaStreamCaptureStatusInvalidated) == 2, + "unexpected int(cudaStreamCaptureStatusInvalidated) value"); +#endif + +enum class CaptureStatus : int { +#if !defined(USE_ROCM) || ROCM_VERSION >= 50300 + None = int(cudaStreamCaptureStatus::cudaStreamCaptureStatusNone), + Active = int(cudaStreamCaptureStatus::cudaStreamCaptureStatusActive), + Invalidated = int(cudaStreamCaptureStatus::cudaStreamCaptureStatusInvalidated) +#else + None = 0 +#endif +}; + +inline std::ostream& operator<<(std::ostream& os, CaptureStatus status) { + switch (status) { + case CaptureStatus::None: + os << "cudaStreamCaptureStatusNone"; + break; +#if !defined(USE_ROCM) || ROCM_VERSION >= 50300 + case CaptureStatus::Active: + os << "cudaStreamCaptureStatusActive"; + break; + case CaptureStatus::Invalidated: + os << "cudaStreamCaptureStatusInvalidated"; + break; +#endif + default: + TORCH_INTERNAL_ASSERT( + false, "Unknown CUDA graph CaptureStatus", int(status)); + } + return os; +} + +// Use this version where you're sure a CUDA context exists already. +inline CaptureStatus currentStreamCaptureStatusMayInitCtx() { +#if !defined(USE_ROCM) || ROCM_VERSION >= 50300 + cudaStreamCaptureStatus is_capturing{cudaStreamCaptureStatusNone}; + C10_CUDA_CHECK( + cudaStreamIsCapturing(c10::cuda::getCurrentCUDAStream(), &is_capturing)); + return CaptureStatus(is_capturing); +#else + return CaptureStatus::None; +#endif +} + +} // namespace c10::cuda diff --git a/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDAGuard.h b/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDAGuard.h new file mode 100644 index 0000000000000000000000000000000000000000..09f1a6f2b6f8be7fbf7966a6b6256ffad1c09da6 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDAGuard.h @@ -0,0 +1,303 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace c10::cuda { + +// This code is kind of boilerplatey. See Note [Whither the DeviceGuard +// boilerplate] + +/// A variant of DeviceGuard that is specialized for CUDA. 
It accepts +/// integer indices (interpreting them as CUDA devices) and is a little +/// more efficient than DeviceGuard (it compiles to straight line +/// cudaSetDevice/cudaGetDevice calls); however, it can only be used +/// from code that links against CUDA directly. +struct CUDAGuard { + /// No default constructor; see Note [Omitted default constructor from RAII] + explicit CUDAGuard() = delete; + + /// Set the current CUDA device to the passed device index. + explicit CUDAGuard(DeviceIndex device_index) : guard_(device_index) {} + + /// Sets the current CUDA device to the passed device. Errors if the passed + /// device is not a CUDA device. + explicit CUDAGuard(Device device) : guard_(device) {} + + // Copy is not allowed + CUDAGuard(const CUDAGuard&) = delete; + CUDAGuard& operator=(const CUDAGuard&) = delete; + + // Move is not allowed (there is no uninitialized state) + CUDAGuard(CUDAGuard&& other) = delete; + CUDAGuard& operator=(CUDAGuard&& other) = delete; + + /// Sets the CUDA device to the given device. Errors if the given device + /// is not a CUDA device. + void set_device(Device device) { + guard_.set_device(device); + } + + /// Sets the CUDA device to the given device. Errors if the given device + /// is not a CUDA device. (This method is provided for uniformity with + /// DeviceGuard). + void reset_device(Device device) { + guard_.reset_device(device); + } + + /// Sets the CUDA device to the given device index. + void set_index(DeviceIndex device_index) { + guard_.set_index(device_index); + } + + /// Returns the device that was set upon construction of the guard + Device original_device() const { + return guard_.original_device(); + } + + /// Returns the last device that was set via `set_device`, if any, otherwise + /// the device passed during construction. + Device current_device() const { + return guard_.current_device(); + } + + private: + /// The guard for the current device. + c10::impl::InlineDeviceGuard guard_; +}; + +/// A variant of OptionalDeviceGuard that is specialized for CUDA. See +/// CUDAGuard for when you can use this. +struct OptionalCUDAGuard { + /// Create an uninitialized OptionalCUDAGuard. + explicit OptionalCUDAGuard() : guard_() {} + + /// Set the current CUDA device to the passed Device, if it is not nullopt. + explicit OptionalCUDAGuard(optional device_opt) + : guard_(device_opt) {} + + /// Set the current CUDA device to the passed device index, if it is not + /// nullopt + explicit OptionalCUDAGuard(optional device_index_opt) + : guard_(device_index_opt) {} + + // Copy is not allowed + OptionalCUDAGuard(const OptionalCUDAGuard&) = delete; + OptionalCUDAGuard& operator=(const OptionalCUDAGuard&) = delete; + + // See Note [Move construction for RAII guards is tricky] + OptionalCUDAGuard(OptionalCUDAGuard&& other) = delete; + + // See Note [Move assignment for RAII guards is tricky] + OptionalCUDAGuard& operator=(OptionalCUDAGuard&& other) = delete; + + /// Sets the CUDA device to the given device, initializing the guard if it + /// is not already initialized. Errors if the given device is not a CUDA + /// device. + void set_device(Device device) { + guard_.set_device(device); + } + + /// Sets the CUDA device to the given device, initializing the guard if it is + /// not already initialized. Errors if the given device is not a CUDA device. + /// (This method is provided for uniformity with OptionalDeviceGuard). 
+ void reset_device(Device device) { + guard_.reset_device(device); + } + + /// Sets the CUDA device to the given device index, initializing the guard if + /// it is not already initialized. + void set_index(DeviceIndex device_index) { + guard_.set_index(device_index); + } + + /// Returns the device that was set immediately prior to initialization of the + /// guard, or nullopt if the guard is uninitialized. + optional original_device() const { + return guard_.original_device(); + } + + /// Returns the most recent device that was set using this device guard, + /// either from construction, or via set_device, if the guard is initialized, + /// or nullopt if the guard is uninitialized. + optional current_device() const { + return guard_.current_device(); + } + + /// Restore the original CUDA device, resetting this guard to uninitialized + /// state. + void reset() { + guard_.reset(); + } + + private: + c10::impl::InlineOptionalDeviceGuard guard_; +}; + +/// A variant of StreamGuard that is specialized for CUDA. See CUDAGuard +/// for when you can use this. +struct CUDAStreamGuard { + /// No default constructor, see Note [Omitted default constructor from RAII] + explicit CUDAStreamGuard() = delete; + + /// Set the current CUDA device to the device associated with the passed + /// stream, and set the current CUDA stream on that device to the passed + /// stream. Errors if the Stream is not a CUDA stream. + explicit CUDAStreamGuard(Stream stream) : guard_(stream) {} + + /// Copy is disallowed + CUDAStreamGuard(const CUDAStreamGuard&) = delete; + CUDAStreamGuard& operator=(const CUDAStreamGuard&) = delete; + + /// Move is disallowed, as CUDAStreamGuard does not have an uninitialized + /// state, which is required for moves on types with nontrivial destructors. + CUDAStreamGuard(CUDAStreamGuard&& other) = delete; + CUDAStreamGuard& operator=(CUDAStreamGuard&& other) = delete; + + /// Resets the currently set stream to the original stream and + /// the currently set device to the original device. Then, + /// set the current device to the device associated with the passed stream, + /// and set the current stream on that device to the passed stream. + /// Errors if the stream passed is not a CUDA stream. + /// + /// NOTE: this implementation may skip some stream/device setting if + /// it can prove that it is unnecessary. + /// + /// WARNING: reset_stream does NOT preserve previously set streams on + /// different devices. If you need to set streams on multiple devices + /// on CUDA, use CUDAMultiStreamGuard instead. + void reset_stream(Stream stream) { + guard_.reset_stream(stream); + } + + /// Returns the CUDA stream that was set at the time the guard was + /// constructed. + CUDAStream original_stream() const { + return CUDAStream(CUDAStream::UNCHECKED, guard_.original_stream()); + } + + /// Returns the most recent CUDA stream that was set using this device guard, + /// either from construction, or via set_stream. + CUDAStream current_stream() const { + return CUDAStream(CUDAStream::UNCHECKED, guard_.current_stream()); + } + + /// Returns the most recent CUDA device that was set using this device guard, + /// either from construction, or via set_device/reset_device/set_index. + Device current_device() const { + return guard_.current_device(); + } + + /// Returns the CUDA device that was set at the most recent reset_stream(), + /// or otherwise the device at construction time. 
+ Device original_device() const { + return guard_.original_device(); + } + + private: + c10::impl::InlineStreamGuard guard_; +}; + +/// A variant of OptionalStreamGuard that is specialized for CUDA. See +/// CUDAGuard for when you can use this. +struct OptionalCUDAStreamGuard { + /// Create an uninitialized guard. + explicit OptionalCUDAStreamGuard() : guard_() {} + + /// Set the current CUDA device to the device associated with the passed + /// stream, and set the current CUDA stream on that device to the passed + /// stream. Errors if the Stream is not a CUDA stream. + explicit OptionalCUDAStreamGuard(Stream stream) : guard_(stream) {} + + /// Set the current device to the device associated with the passed stream, + /// and set the current stream on that device to the passed stream, + /// if the passed stream is not nullopt. + explicit OptionalCUDAStreamGuard(optional stream_opt) + : guard_(stream_opt) {} + + /// Copy is disallowed + OptionalCUDAStreamGuard(const OptionalCUDAStreamGuard&) = delete; + OptionalCUDAStreamGuard& operator=(const OptionalCUDAStreamGuard&) = delete; + + // See Note [Move construction for RAII guards is tricky] + OptionalCUDAStreamGuard(OptionalCUDAStreamGuard&& other) = delete; + + // See Note [Move assignment for RAII guards is tricky] + OptionalCUDAStreamGuard& operator=(OptionalCUDAStreamGuard&& other) = delete; + + /// Resets the currently set CUDA stream to the original stream and + /// the currently set device to the original device. Then, + /// set the current device to the device associated with the passed stream, + /// and set the current stream on that device to the passed stream. + /// Initializes the guard if it was not previously initialized. + void reset_stream(Stream stream) { + guard_.reset_stream(stream); + } + + /// Returns the CUDA stream that was set at the time the guard was most + /// recently initialized, or nullopt if the guard is uninitialized. + optional original_stream() const { + auto r = guard_.original_stream(); + if (r.has_value()) { + return make_optional(CUDAStream(CUDAStream::UNCHECKED, r.value())); + } else { + return nullopt; + } + } + + /// Returns the most recent CUDA stream that was set using this stream guard, + /// either from construction, or via reset_stream, if the guard is + /// initialized, or nullopt if the guard is uninitialized. + optional current_stream() const { + auto r = guard_.current_stream(); + if (r.has_value()) { + return make_optional(CUDAStream(CUDAStream::UNCHECKED, r.value())); + } else { + return nullopt; + } + } + + /// Restore the original CUDA device and stream, resetting this guard to + /// uninitialized state. + void reset() { + guard_.reset(); + } + + private: + c10::impl::InlineOptionalStreamGuard guard_; +}; + +/// A variant of MultiStreamGuard that is specialized for CUDA. 
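// Illustrative sketch (hypothetical function) of the device-only guard defined
// above; the stream guards additionally swap the current stream:
//
//   void launch_on(c10::DeviceIndex dev) {
//     c10::cuda::CUDAGuard guard(dev);  // switches the current CUDA device
//     // ... allocate memory or launch kernels that must run on `dev` ...
//   }                                   // original device restored here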
+struct CUDAMultiStreamGuard { + explicit CUDAMultiStreamGuard(ArrayRef streams) + : guard_(unwrapStreams(streams)) {} + + /// Copy is disallowed + CUDAMultiStreamGuard(const CUDAMultiStreamGuard&) = delete; + CUDAMultiStreamGuard& operator=(const CUDAMultiStreamGuard&) = delete; + + // See Note [Move construction for RAII guards is tricky] + CUDAMultiStreamGuard(CUDAMultiStreamGuard&& other) = delete; + + // See Note [Move assignment for RAII guards is tricky] + CUDAMultiStreamGuard& operator=(CUDAMultiStreamGuard&& other) = delete; + + private: + c10::impl::InlineMultiStreamGuard guard_; + + static std::vector unwrapStreams(ArrayRef cudaStreams) { + std::vector streams; + streams.reserve(cudaStreams.size()); + for (const CUDAStream& cudaStream : cudaStreams) { + streams.push_back(cudaStream); + } + return streams; + } +}; + +} // namespace c10::cuda diff --git a/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDAMacros.h b/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDAMacros.h new file mode 100644 index 0000000000000000000000000000000000000000..3b69035c4e83fd2eca488a3a726eddd49cab4058 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDAMacros.h @@ -0,0 +1,51 @@ +#pragma once + +#ifndef C10_USING_CUSTOM_GENERATED_MACROS + +// We have not yet modified the AMD HIP build to generate this file so +// we add an extra option to specifically ignore it. +#ifndef C10_CUDA_NO_CMAKE_CONFIGURE_FILE +#include +#endif // C10_CUDA_NO_CMAKE_CONFIGURE_FILE + +#endif + +// See c10/macros/Export.h for a detailed explanation of what the function +// of these macros are. We need one set of macros for every separate library +// we build. + +#ifdef _WIN32 +#if defined(C10_CUDA_BUILD_SHARED_LIBS) +#define C10_CUDA_EXPORT __declspec(dllexport) +#define C10_CUDA_IMPORT __declspec(dllimport) +#else +#define C10_CUDA_EXPORT +#define C10_CUDA_IMPORT +#endif +#else // _WIN32 +#if defined(__GNUC__) +#define C10_CUDA_EXPORT __attribute__((__visibility__("default"))) +#else // defined(__GNUC__) +#define C10_CUDA_EXPORT +#endif // defined(__GNUC__) +#define C10_CUDA_IMPORT C10_CUDA_EXPORT +#endif // _WIN32 + +// This one is being used by libc10_cuda.so +#ifdef C10_CUDA_BUILD_MAIN_LIB +#define C10_CUDA_API C10_CUDA_EXPORT +#else +#define C10_CUDA_API C10_CUDA_IMPORT +#endif + +/** + * The maximum number of GPUs that we recognizes. Increasing this beyond the + * initial limit of 16 broke Caffe2 testing, hence the ifdef guards. + * This value cannot be more than 128 because our DeviceIndex is a uint8_t. +o */ +#ifdef FBCODE_CAFFE2 +// fbcode depends on this value being 16 +#define C10_COMPILE_TIME_MAX_GPUS 16 +#else +#define C10_COMPILE_TIME_MAX_GPUS 120 +#endif diff --git a/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDAMathCompat.h b/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDAMathCompat.h new file mode 100644 index 0000000000000000000000000000000000000000..4c41bd5f382b35c7955bcda8035b156e4ad26e18 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDAMathCompat.h @@ -0,0 +1,152 @@ +#pragma once + +/* This file defines math functions compatible across different gpu + * platforms (currently CUDA and HIP). 
+ */ +#if defined(__CUDACC__) || defined(__HIPCC__) + +#include +#include + +#ifdef __HIPCC__ +#define __MATH_FUNCTIONS_DECL__ inline C10_DEVICE +#else /* __HIPCC__ */ +#ifdef __CUDACC_RTC__ +#define __MATH_FUNCTIONS_DECL__ C10_HOST_DEVICE +#else /* __CUDACC_RTC__ */ +#define __MATH_FUNCTIONS_DECL__ static inline C10_HOST_DEVICE +#endif /* __CUDACC_RTC__ */ +#endif /* __HIPCC__ */ + +namespace c10::cuda::compat { + +__MATH_FUNCTIONS_DECL__ float abs(float x) { + return ::fabsf(x); +} +__MATH_FUNCTIONS_DECL__ double abs(double x) { + return ::fabs(x); +} + +__MATH_FUNCTIONS_DECL__ float exp(float x) { + return ::expf(x); +} +__MATH_FUNCTIONS_DECL__ double exp(double x) { + return ::exp(x); +} + +__MATH_FUNCTIONS_DECL__ float ceil(float x) { + return ::ceilf(x); +} +__MATH_FUNCTIONS_DECL__ double ceil(double x) { + return ::ceil(x); +} + +__MATH_FUNCTIONS_DECL__ float copysign(float x, float y) { +#if defined(__CUDA_ARCH__) || defined(__HIPCC__) + return ::copysignf(x, y); +#else + // std::copysign gets ICE/Segfaults with gcc 7.5/8 on arm64 + // (e.g. Jetson), see PyTorch PR #51834 + // This host function needs to be here for the compiler but is never used + TORCH_INTERNAL_ASSERT( + false, "CUDAMathCompat copysign should not run on the CPU"); +#endif +} +__MATH_FUNCTIONS_DECL__ double copysign(double x, double y) { +#if defined(__CUDA_ARCH__) || defined(__HIPCC__) + return ::copysign(x, y); +#else + // see above + TORCH_INTERNAL_ASSERT( + false, "CUDAMathCompat copysign should not run on the CPU"); +#endif +} + +__MATH_FUNCTIONS_DECL__ float floor(float x) { + return ::floorf(x); +} +__MATH_FUNCTIONS_DECL__ double floor(double x) { + return ::floor(x); +} + +__MATH_FUNCTIONS_DECL__ float log(float x) { + return ::logf(x); +} +__MATH_FUNCTIONS_DECL__ double log(double x) { + return ::log(x); +} + +__MATH_FUNCTIONS_DECL__ float log1p(float x) { + return ::log1pf(x); +} + +__MATH_FUNCTIONS_DECL__ double log1p(double x) { + return ::log1p(x); +} + +__MATH_FUNCTIONS_DECL__ float max(float x, float y) { + return ::fmaxf(x, y); +} +__MATH_FUNCTIONS_DECL__ double max(double x, double y) { + return ::fmax(x, y); +} + +__MATH_FUNCTIONS_DECL__ float min(float x, float y) { + return ::fminf(x, y); +} +__MATH_FUNCTIONS_DECL__ double min(double x, double y) { + return ::fmin(x, y); +} + +__MATH_FUNCTIONS_DECL__ float pow(float x, float y) { + return ::powf(x, y); +} +__MATH_FUNCTIONS_DECL__ double pow(double x, double y) { + return ::pow(x, y); +} + +__MATH_FUNCTIONS_DECL__ void sincos(float x, float* sptr, float* cptr) { + return ::sincosf(x, sptr, cptr); +} +__MATH_FUNCTIONS_DECL__ void sincos(double x, double* sptr, double* cptr) { + return ::sincos(x, sptr, cptr); +} + +__MATH_FUNCTIONS_DECL__ float sqrt(float x) { + return ::sqrtf(x); +} +__MATH_FUNCTIONS_DECL__ double sqrt(double x) { + return ::sqrt(x); +} + +__MATH_FUNCTIONS_DECL__ float rsqrt(float x) { + return ::rsqrtf(x); +} +__MATH_FUNCTIONS_DECL__ double rsqrt(double x) { + return ::rsqrt(x); +} + +__MATH_FUNCTIONS_DECL__ float tan(float x) { + return ::tanf(x); +} +__MATH_FUNCTIONS_DECL__ double tan(double x) { + return ::tan(x); +} + +__MATH_FUNCTIONS_DECL__ float tanh(float x) { + return ::tanhf(x); +} +__MATH_FUNCTIONS_DECL__ double tanh(double x) { + return ::tanh(x); +} + +__MATH_FUNCTIONS_DECL__ float normcdf(float x) { + return ::normcdff(x); +} +__MATH_FUNCTIONS_DECL__ double normcdf(double x) { + return ::normcdf(x); +} + +} // namespace c10::cuda::compat + +#endif diff --git 
a/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDAMiscFunctions.h b/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDAMiscFunctions.h new file mode 100644 index 0000000000000000000000000000000000000000..203363028fd72d01065f161832c7e6c5f6b5217d --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDAMiscFunctions.h @@ -0,0 +1,12 @@ +#pragma once +// this file is to avoid circular dependency between CUDAFunctions.h and +// CUDAExceptions.h + +#include + +#include + +namespace c10::cuda { +C10_CUDA_API const char* get_cuda_check_suffix() noexcept; +C10_CUDA_API std::mutex* getFreeMutex(); +} // namespace c10::cuda diff --git a/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDAStream.h b/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDAStream.h new file mode 100644 index 0000000000000000000000000000000000000000..f7b6609f210d4967c191705c31121ddf7520c30a --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/cuda/CUDAStream.h @@ -0,0 +1,271 @@ +#pragma once + +#include +#include + +#include + +#include +#include +#include +#include + +/* + * Stream pool note. + * + * A CUDAStream is an abstraction of an actual cuStream on the GPU. CUDAStreams + * are backed by cuStreams, but they use several pools to minimize the costs + * associated with creating, retaining, and destroying cuStreams. + * + * There are three pools per device, and a device's pools are lazily created. + * + * The first pool contains only the default stream. When the default stream + * is requested it's returned. + * + * The second pool is the "low priority" or "default priority" streams. In + * HIP builds there is no distinction between streams in this pool and streams + * in the third pool (below). There are 32 of these streams per device, and + * when a stream is requested one of these streams is returned round-robin. + * That is, the first stream requested is at index 0, the second at index 1... + * to index 31, then index 0 again. + * + * This means that if 33 low priority streams are requested, the first and + * last streams requested are actually the same stream (under the covers) + * and kernels enqueued on them cannot run concurrently. + * + * The third pool is the "high priority" streams. The third pool acts like + * the second pool except the streams are created with a higher priority. + * + * These pools suggest that stream users should prefer many short-lived streams, + * as the cost of acquiring and releasing streams is effectively zero. If + * many longer-lived streams are required in performance critical scenarios + * then the functionality here may need to be extended to allow, for example, + * "reserving" a subset of the pool so that other streams do not accidentally + * overlap the performance critical streams. + * + * Note: although the notion of "current stream for device" is thread local + * (every OS thread has a separate current stream, as one might expect), + * the stream pool is global across all threads; stream 0 is always stream 0 + * no matter which thread you use it on. Multiple threads can synchronize + * on the same stream. Although the CUDA documentation is not very clear + * on the matter, streams are thread safe; e.g., it is safe to enqueue + * a kernel on the same stream from two different threads. + */ + +namespace c10::cuda { + +static constexpr int max_compile_time_stream_priorities = 4; + +// Value object representing a CUDA stream. 
This is just a wrapper +// around c10::Stream, but it comes with a little extra CUDA-specific +// functionality (conversion to cudaStream_t), and a guarantee that +// the wrapped c10::Stream really is a CUDA stream. +class C10_CUDA_API CUDAStream { + public: + enum Unchecked { UNCHECKED }; + + /// Construct a CUDAStream from a Stream. This construction is checked, + /// and will raise an error if the Stream is not, in fact, a CUDA stream. + explicit CUDAStream(Stream stream) : stream_(stream) { + TORCH_CHECK(stream_.device_type() == DeviceType::CUDA); + } + + /// Construct a CUDAStream from a Stream with no error checking. + /// This constructor uses the "named" constructor idiom, and can + /// be invoked as: CUDAStream(CUDAStream::UNCHECKED, stream) + explicit CUDAStream(Unchecked, Stream stream) : stream_(stream) {} + + bool operator==(const CUDAStream& other) const noexcept { + return unwrap() == other.unwrap(); + } + + bool operator!=(const CUDAStream& other) const noexcept { + return unwrap() != other.unwrap(); + } + + /// Implicit conversion to cudaStream_t. + operator cudaStream_t() const { + return stream(); + } + + /// Implicit conversion to Stream (a.k.a., forget that the stream is a + /// CUDA stream). + operator Stream() const { + return unwrap(); + } + + /// Used to avoid baking in device type explicitly to Python-side API. + DeviceType device_type() const { + return DeviceType::CUDA; + } + + /// Get the CUDA device index that this stream is associated with. + DeviceIndex device_index() const { + return stream_.device_index(); + } + + /// Get the full Device that this stream is associated with. The Device + /// is guaranteed to be a CUDA device. + Device device() const { + return Device(DeviceType::CUDA, device_index()); + } + + /// Return the stream ID corresponding to this particular stream. + StreamId id() const { + return stream_.id(); + } + + bool query() const { + DeviceGuard guard{stream_.device()}; + cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaStreamQuery(stream())); + + if (err == cudaSuccess) { + return true; + } else if (err != cudaErrorNotReady) { + C10_CUDA_CHECK(err); + } else { + // ignore and clear the error if not ready + (void)cudaGetLastError(); + } + + return false; + } + + void synchronize() const { + DeviceGuard guard{stream_.device()}; + c10::cuda::stream_synchronize(stream()); + } + + int priority() const { + DeviceGuard guard{stream_.device()}; + int priority = 0; + C10_CUDA_CHECK(cudaStreamGetPriority(stream(), &priority)); + return priority; + } + + /// Explicit conversion to cudaStream_t. + cudaStream_t stream() const; + + /// Explicit conversion to Stream. + Stream unwrap() const { + return stream_; + } + + /// Reversibly pack a CUDAStream into a struct representation. + /// Previously the stream's data was packed into a single int64_t, + /// as it was assumed the fields would not require more than + /// 64 bits of storage in total. + /// See https://github.com/pytorch/pytorch/issues/75854 + /// for more information regarding newer platforms that may violate + /// this assumption. + /// + /// The CUDAStream can be unpacked using unpack(). + struct c10::StreamData3 pack3() const { + return stream_.pack3(); + } + + // Unpack a CUDAStream from the 3 fields generated by pack(). 
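// For example, given a CUDAStream `s` (illustrative; StreamData3 field names
// as in c10/core/Stream.h):
//
//   c10::StreamData3 d = s.pack3();
//   c10::cuda::CUDAStream s2 =
//       c10::cuda::CUDAStream::unpack3(d.stream_id, d.device_index, d.device_type);
//   // s2 == s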
+ static CUDAStream unpack3( + StreamId stream_id, + DeviceIndex device_index, + DeviceType device_type) { + return CUDAStream(Stream::unpack3(stream_id, device_index, device_type)); + } + + static std::tuple priority_range() { + // Note: this returns the range of priority **supported by PyTorch**, not + // the range of priority **supported by CUDA**. The former is a subset of + // the latter. + int least_priority = 0, greatest_priority = 0; + C10_CUDA_CHECK( + cudaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority)); +#ifdef USE_ROCM + // See Note [HIP stream priorities] + TORCH_INTERNAL_ASSERT( + least_priority == 1, "Unexpected HIP stream priority range"); + least_priority = 0; +#else + TORCH_INTERNAL_ASSERT( + least_priority == 0, "Unexpected CUDA stream priority range"); +#endif + TORCH_INTERNAL_ASSERT( + greatest_priority <= -1, "Unexpected CUDA stream priority range"); + greatest_priority = std::max( + -c10::cuda::max_compile_time_stream_priorities + 1, greatest_priority); + return std::make_tuple(least_priority, greatest_priority); + } + + // Deleted for now; use CUDAEvent::block instead + // void synchronize_with(const CUDAEvent& event) const; + + private: + Stream stream_; +}; + +/** + * Get a new stream from the CUDA stream pool. You can think of this + * as "creating" a new stream, but no such creation actually happens; + * instead, streams are preallocated from the pool and returned in a + * round-robin fashion. + * + * You can request a stream from the high priority pool by setting + * isHighPriority to true, or a stream for a specific device by setting device + * (defaulting to the current CUDA stream.) + */ +C10_API CUDAStream +getStreamFromPool(const bool isHighPriority = false, DeviceIndex device = -1); +// no default priority to disambiguate overloads +C10_API CUDAStream +getStreamFromPool(const int priority, DeviceIndex device = -1); + +/** + * Get a CUDAStream from a externally allocated one. + * + * This is mainly for interoperability with different libraries where we + * want to operate on a non-torch allocated stream for data exchange or similar + * purposes + */ +C10_API CUDAStream +getStreamFromExternal(cudaStream_t ext_stream, DeviceIndex device_index); + +/** + * Get the default CUDA stream, for the passed CUDA device, or for the + * current device if no device index is passed. The default stream is + * where most computation occurs when you aren't explicitly using + * streams. + */ +C10_API CUDAStream getDefaultCUDAStream(DeviceIndex device_index = -1); + +/** + * Get the current CUDA stream, for the passed CUDA device, or for the + * current device if no device index is passed. The current CUDA stream + * will usually be the default CUDA stream for the device, but it may + * be different if someone called 'setCurrentCUDAStream' or used 'StreamGuard' + * or 'CUDAStreamGuard'. + */ +C10_API CUDAStream getCurrentCUDAStream(DeviceIndex device_index = -1); + +/** + * Set the current stream on the device of the passed in stream to be + * the passed in stream. Yes, you read that right: this function + * has *nothing* to do with the current device: it toggles the current + * stream of the device of the passed stream. + * + * Confused? Avoid using this function; prefer using 'CUDAStreamGuard' instead + * (which will switch both your current device and current stream in the way you + * expect, and reset it back to its original state afterwards). 
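 *
 * A hypothetical sketch of that preferred pattern (names are illustrative):
 *
 *   c10::cuda::CUDAStream s =
 *       c10::cuda::getStreamFromPool(true);  // true = high-priority pool
 *   {
 *     c10::cuda::CUDAStreamGuard guard(s);   // makes s current on its device
 *     // ... enqueue work on s ...
 *   }                                        // previous stream and device restored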
+ */ +C10_API void setCurrentCUDAStream(CUDAStream stream); + +C10_API std::ostream& operator<<(std::ostream& stream, const CUDAStream& s); + +} // namespace c10::cuda + +namespace std { +template <> +struct hash { + size_t operator()(c10::cuda::CUDAStream s) const noexcept { + return std::hash{}(s.unwrap()); + } +}; +} // namespace std diff --git a/MLPY/Lib/site-packages/torch/include/c10/cuda/driver_api.h b/MLPY/Lib/site-packages/torch/include/c10/cuda/driver_api.h new file mode 100644 index 0000000000000000000000000000000000000000..80b7bcbec62c8821e597eedf2e51ac64cf48f649 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/cuda/driver_api.h @@ -0,0 +1,49 @@ +#pragma once +#include +#define NVML_NO_UNVERSIONED_FUNC_DEFS +#include + +#define C10_CUDA_DRIVER_CHECK(EXPR) \ + do { \ + CUresult __err = EXPR; \ + if (__err != CUDA_SUCCESS) { \ + const char* err_str; \ + CUresult get_error_str_err C10_UNUSED = \ + c10::cuda::DriverAPI::get()->cuGetErrorString_(__err, &err_str); \ + if (get_error_str_err != CUDA_SUCCESS) { \ + AT_ERROR("CUDA driver error: unknown error"); \ + } else { \ + AT_ERROR("CUDA driver error: ", err_str); \ + } \ + } \ + } while (0) + +#define C10_LIBCUDA_DRIVER_API(_) \ + _(cuMemAddressReserve) \ + _(cuMemRelease) \ + _(cuMemMap) \ + _(cuMemAddressFree) \ + _(cuMemSetAccess) \ + _(cuMemUnmap) \ + _(cuMemCreate) \ + _(cuGetErrorString) + +#define C10_NVML_DRIVER_API(_) \ + _(nvmlInit_v2) \ + _(nvmlDeviceGetHandleByPciBusId_v2) \ + _(nvmlDeviceGetNvLinkRemoteDeviceType) \ + _(nvmlDeviceGetNvLinkRemotePciInfo_v2) \ + _(nvmlDeviceGetComputeRunningProcesses) + +namespace c10::cuda { + +struct DriverAPI { +#define CREATE_MEMBER(name) decltype(&name) name##_; + C10_LIBCUDA_DRIVER_API(CREATE_MEMBER) + C10_NVML_DRIVER_API(CREATE_MEMBER) +#undef CREATE_MEMBER + static DriverAPI* get(); + static void* get_nvml_handle(); +}; + +} // namespace c10::cuda diff --git a/MLPY/Lib/site-packages/torch/include/c10/cuda/impl/CUDAGuardImpl.h b/MLPY/Lib/site-packages/torch/include/c10/cuda/impl/CUDAGuardImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..844c5dd12e340370cd220d1abb52da8bb266280b --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/cuda/impl/CUDAGuardImpl.h @@ -0,0 +1,212 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace c10::cuda::impl { + +struct CUDAGuardImpl final : public c10::impl::DeviceGuardImplInterface { + static constexpr DeviceType static_type = DeviceType::CUDA; + + CUDAGuardImpl() = default; + explicit CUDAGuardImpl(DeviceType t) { + TORCH_INTERNAL_ASSERT(t == DeviceType::CUDA); + } + DeviceType type() const override { + return DeviceType::CUDA; + } + Device exchangeDevice(Device d) const override { + TORCH_INTERNAL_ASSERT(d.is_cuda()); + auto old_device_index = c10::cuda::ExchangeDevice(d.index()); + return Device(DeviceType::CUDA, old_device_index); + } + Device getDevice() const override { + DeviceIndex device = 0; + C10_CUDA_CHECK(c10::cuda::GetDevice(&device)); + return Device(DeviceType::CUDA, device); + } + c10::optional uncheckedGetDevice() const noexcept { + DeviceIndex device{-1}; + const auto err = C10_CUDA_ERROR_HANDLED(c10::cuda::GetDevice(&device)); + C10_CUDA_CHECK_WARN(err); + if (err != cudaSuccess) { + return c10::nullopt; + } + return Device(DeviceType::CUDA, device); + } + void setDevice(Device d) const override { + TORCH_INTERNAL_ASSERT(d.is_cuda()); + 
C10_CUDA_CHECK(c10::cuda::SetDevice(d.index())); + } + void uncheckedSetDevice(Device d) const noexcept override { + C10_CUDA_CHECK_WARN(c10::cuda::MaybeSetDevice(d.index())); + } + Stream getStream(Device d) const noexcept override { + return getCurrentCUDAStream(d.index()).unwrap(); + } + Stream getDefaultStream(Device d) const override { + return getDefaultCUDAStream(d.index()); + } + Stream getStreamFromGlobalPool(Device d, bool isHighPriority = false) + const override { + return getStreamFromPool(isHighPriority, d.index()); + } + // NB: These do NOT set the current device + Stream exchangeStream(Stream s) const noexcept override { + CUDAStream cs(s); + auto old_stream = getCurrentCUDAStream(s.device().index()); + setCurrentCUDAStream(cs); + return old_stream.unwrap(); + } + DeviceIndex deviceCount() const noexcept override { + return device_count(); + } + + // Event-related functions + void createEvent(cudaEvent_t* cuda_event, const EventFlag flag) const { + // Maps PyTorch's Event::Flag to CUDA flag + auto cuda_flag = cudaEventDefault; + switch (flag) { + case EventFlag::PYTORCH_DEFAULT: + case EventFlag::CUDA_EVENT_DISABLE_TIMING: + cuda_flag = cudaEventDisableTiming; + break; + case EventFlag::BACKEND_DEFAULT: + case EventFlag::CUDA_EVENT_DEFAULT: + cuda_flag = cudaEventDefault; + break; + default: + TORCH_CHECK(false, "CUDA event received unknown flag"); + } + + C10_CUDA_CHECK(cudaEventCreateWithFlags(cuda_event, cuda_flag)); + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_creation( + reinterpret_cast(cuda_event)); + } + } + + void destroyEvent(void* event, const DeviceIndex device_index) + const noexcept override { + if (!event) + return; + auto cuda_event = static_cast(event); + DeviceIndex orig_device{-1}; + C10_CUDA_CHECK_WARN(c10::cuda::GetDevice(&orig_device)); + C10_CUDA_CHECK_WARN(c10::cuda::SetDevice(device_index)); + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_deletion( + reinterpret_cast(cuda_event)); + } + C10_CUDA_CHECK_WARN(cudaEventDestroy(cuda_event)); + C10_CUDA_CHECK_WARN(c10::cuda::SetDevice(orig_device)); + } + + void record( + void** event, + const Stream& stream, + const DeviceIndex device_index, + const EventFlag flag) const override { + TORCH_CHECK( + device_index == -1 || device_index == stream.device_index(), + "Event device index ", + device_index, + " does not match recording stream's device index ", + stream.device_index(), + "."); + + cudaEvent_t cuda_event = static_cast(*event); + CUDAStream cuda_stream{stream}; + + // Moves to stream's device to record + const auto orig_device = getDevice(); + setDevice(stream.device()); + + // Creates the event (lazily) + if (!cuda_event) + createEvent(&cuda_event, flag); + C10_CUDA_CHECK(cudaEventRecord(cuda_event, cuda_stream)); + // Makes the void* point to the (possibly just allocated) CUDA event + *event = cuda_event; + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_record( + reinterpret_cast(cuda_event), + reinterpret_cast(cuda_stream.stream())); + } + + // Resets device + setDevice(orig_device); + } + + void block(void* event, const Stream& stream) const override { + if (!event) + return; + cudaEvent_t cuda_event = static_cast(event); + CUDAStream cuda_stream{stream}; + const auto orig_device = getDevice(); + setDevice(stream.device()); + 
C10_CUDA_CHECK(cudaStreamWaitEvent( + cuda_stream, + cuda_event, + /*flags (must be zero)=*/0)); + const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); + if (C10_UNLIKELY(interp)) { + (*interp)->trace_gpu_event_wait( + reinterpret_cast(cuda_event), + reinterpret_cast(cuda_stream.stream())); + } + setDevice(orig_device); + } + + // May be called from any device + bool queryEvent(void* event) const override { + if (!event) + return true; + cudaEvent_t cuda_event = static_cast(event); + const cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaEventQuery(cuda_event)); + if (err != cudaErrorNotReady) { + C10_CUDA_CHECK(err); + } else { + // ignore and clear the error if not ready + (void)cudaGetLastError(); + } + return (err == cudaSuccess); + } + + // Stream-related functions + bool queryStream(const Stream& stream) const override { + CUDAStream cuda_stream{stream}; + return cuda_stream.query(); + } + + void synchronizeStream(const Stream& stream) const override { + CUDAStream cuda_stream{stream}; + cuda_stream.synchronize(); + } + + void recordDataPtrOnStream(const c10::DataPtr& data_ptr, const Stream& stream) + const override { + CUDAStream cuda_stream{stream}; + CUDACachingAllocator::recordStream(data_ptr, cuda_stream); + } +}; + +} // namespace c10::cuda::impl diff --git a/MLPY/Lib/site-packages/torch/include/c10/cuda/impl/CUDATest.h b/MLPY/Lib/site-packages/torch/include/c10/cuda/impl/CUDATest.h new file mode 100644 index 0000000000000000000000000000000000000000..9b288250b5cec90a2540be3b5cae5af2ae965994 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/cuda/impl/CUDATest.h @@ -0,0 +1,9 @@ +#pragma once + +#include + +namespace c10::cuda::impl { + +C10_CUDA_API int c10_cuda_test(); + +} diff --git a/MLPY/Lib/site-packages/torch/include/c10/macros/Export.h b/MLPY/Lib/site-packages/torch/include/c10/macros/Export.h new file mode 100644 index 0000000000000000000000000000000000000000..fce560336543b36dd50afe0ce1732aa8e04494f1 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/macros/Export.h @@ -0,0 +1,160 @@ +#ifndef C10_MACROS_EXPORT_H_ +#define C10_MACROS_EXPORT_H_ + +/* Header file to define the common scaffolding for exported symbols. + * + * Export is by itself a quite tricky situation to deal with, and if you are + * hitting this file, make sure you start with the background here: + * - Linux: https://gcc.gnu.org/wiki/Visibility + * - Windows: + * https://docs.microsoft.com/en-us/cpp/cpp/dllexport-dllimport?view=vs-2017 + * + * Do NOT include this file directly. Instead, use c10/macros/Macros.h + */ + +// You do not need to edit this part of file unless you are changing the core +// pytorch export abstractions. +// +// This part defines the C10 core export and import macros. This is controlled +// by whether we are building shared libraries or not, which is determined +// during build time and codified in c10/core/cmake_macros.h. +// When the library is built as a shared lib, EXPORT and IMPORT will contain +// visibility attributes. If it is being built as a static lib, then EXPORT +// and IMPORT basically have no effect. + +// As a rule of thumb, you should almost NEVER mix static and shared builds for +// libraries that depend on c10. AKA, if c10 is built as a static library, we +// recommend everything dependent on c10 to be built statically. If c10 is built +// as a shared library, everything dependent on it should be built as shared. 
In +// the PyTorch project, all native libraries shall use the macro +// C10_BUILD_SHARED_LIB to check whether pytorch is building shared or static +// libraries. + +// For build systems that do not directly depend on CMake and directly build +// from the source directory (such as Buck), one may not have a cmake_macros.h +// file at all. In this case, the build system is responsible for providing +// correct macro definitions corresponding to the cmake_macros.h.in file. +// +// In such scenarios, one should define the macro +// C10_USING_CUSTOM_GENERATED_MACROS +// to inform this header that it does not need to include the cmake_macros.h +// file. + +#ifndef C10_USING_CUSTOM_GENERATED_MACROS +#include +#endif // C10_USING_CUSTOM_GENERATED_MACROS + +#ifdef _WIN32 +#define C10_HIDDEN +#if defined(C10_BUILD_SHARED_LIBS) +#define C10_EXPORT __declspec(dllexport) +#define C10_IMPORT __declspec(dllimport) +#else +#define C10_EXPORT +#define C10_IMPORT +#endif +#else // _WIN32 +#if defined(__GNUC__) +#define C10_EXPORT __attribute__((__visibility__("default"))) +#define C10_HIDDEN __attribute__((__visibility__("hidden"))) +#else // defined(__GNUC__) +#define C10_EXPORT +#define C10_HIDDEN +#endif // defined(__GNUC__) +#define C10_IMPORT C10_EXPORT +#endif // _WIN32 + +#ifdef NO_EXPORT +#undef C10_EXPORT +#define C10_EXPORT +#endif + +// Definition of an adaptive XX_API macro, that depends on whether you are +// building the library itself or not, routes to XX_EXPORT and XX_IMPORT. +// Basically, you will need to do this for each shared library that you are +// building, and the instruction is as follows: assuming that you are building +// a library called libawesome.so. You should: +// (1) for your cmake target (usually done by "add_library(awesome, ...)"), +// define a macro called AWESOME_BUILD_MAIN_LIB using +// target_compile_options. +// (2) define the AWESOME_API macro similar to the one below. +// And in the source file of your awesome library, use AWESOME_API to +// annotate public symbols. + +// Here, for the C10 library, we will define the macro C10_API for both import +// and export. + +// This one is being used by libc10.so +#ifdef C10_BUILD_MAIN_LIB +#define C10_API C10_EXPORT +#else +#define C10_API C10_IMPORT +#endif + +// This one is being used by libtorch.so +#ifdef CAFFE2_BUILD_MAIN_LIB +#define TORCH_API C10_EXPORT +#else +#define TORCH_API C10_IMPORT +#endif + +// You may be wondering: Whose brilliant idea was it to split torch_cuda into +// two pieces with confusing names? +// Once upon a time, there _was_ only TORCH_CUDA_API. All was happy until we +// tried to compile PyTorch for CUDA 11.1, which ran into relocation marker +// issues when linking big binaries. +// (https://github.com/pytorch/pytorch/issues/39968) We had two choices: +// (1) Stop supporting so many GPU architectures +// (2) Do something else +// We chose #2 and decided to split the behemoth that was torch_cuda into two +// smaller libraries, one with most of the core kernel functions (torch_cuda_cu) +// and the other that had..well..everything else (torch_cuda_cpp). The idea was +// this: instead of linking our static libraries (like the hefty +// libcudnn_static.a) with another huge library, torch_cuda, and run into pesky +// relocation marker issues, we could link our static libraries to a smaller +// part of torch_cuda (torch_cuda_cpp) and avoid the issues. 
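// Editor's note: an illustrative sketch, not part of this header. It spells
// out the two-step recipe described above for the hypothetical libawesome.so:
// the build defines AWESOME_BUILD_MAIN_LIB only while compiling the library
// itself, so its public symbols are exported there and imported everywhere else.
#ifdef AWESOME_BUILD_MAIN_LIB
#define AWESOME_API C10_EXPORT
#else
#define AWESOME_API C10_IMPORT
#endif

// In a public header of the hypothetical library:
// AWESOME_API void do_something_awesome();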
+ +// libtorch_cuda_cu.so +#ifdef TORCH_CUDA_CU_BUILD_MAIN_LIB +#define TORCH_CUDA_CU_API C10_EXPORT +#elif defined(BUILD_SPLIT_CUDA) +#define TORCH_CUDA_CU_API C10_IMPORT +#endif + +// libtorch_cuda_cpp.so +#ifdef TORCH_CUDA_CPP_BUILD_MAIN_LIB +#define TORCH_CUDA_CPP_API C10_EXPORT +#elif defined(BUILD_SPLIT_CUDA) +#define TORCH_CUDA_CPP_API C10_IMPORT +#endif + +// libtorch_cuda.so (where torch_cuda_cu and torch_cuda_cpp are a part of the +// same api) +#ifdef TORCH_CUDA_BUILD_MAIN_LIB +#define TORCH_CUDA_CPP_API C10_EXPORT +#define TORCH_CUDA_CU_API C10_EXPORT +#elif !defined(BUILD_SPLIT_CUDA) +#define TORCH_CUDA_CPP_API C10_IMPORT +#define TORCH_CUDA_CU_API C10_IMPORT +#endif + +#if defined(TORCH_HIP_BUILD_MAIN_LIB) +#define TORCH_HIP_API C10_EXPORT +#else +#define TORCH_HIP_API C10_IMPORT +#endif + +#if defined(TORCH_XPU_BUILD_MAIN_LIB) +#define TORCH_XPU_API C10_EXPORT +#else +#define TORCH_XPU_API C10_IMPORT +#endif + +// Enums only need to be exported on windows for non-CUDA files +#if defined(_WIN32) && defined(__CUDACC__) +#define C10_API_ENUM C10_API +#else +#define C10_API_ENUM +#endif + +#endif // C10_MACROS_MACROS_H_ diff --git a/MLPY/Lib/site-packages/torch/include/c10/macros/Macros.h b/MLPY/Lib/site-packages/torch/include/c10/macros/Macros.h new file mode 100644 index 0000000000000000000000000000000000000000..fae63b7b91ceec59b56bf6cb05111ef280602193 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/macros/Macros.h @@ -0,0 +1,546 @@ +#ifndef C10_MACROS_MACROS_H_ +#define C10_MACROS_MACROS_H_ +#include + +/* Main entry for c10/macros. + * + * In your code, include c10/macros/Macros.h directly, instead of individual + * files in this folder. + */ + +// For build systems that do not directly depend on CMake and directly build +// from the source directory (such as Buck), one may not have a cmake_macros.h +// file at all. In this case, the build system is responsible for providing +// correct macro definitions corresponding to the cmake_macros.h.in file. +// +// In such scenarios, one should define the macro +// C10_USING_CUSTOM_GENERATED_MACROS +// to inform this header that it does not need to include the cmake_macros.h +// file. + +#ifndef C10_USING_CUSTOM_GENERATED_MACROS +#include +#endif // C10_USING_CUSTOM_GENERATED_MACROS + +#include + +#if defined(__clang__) +#define __ubsan_ignore_float_divide_by_zero__ \ + __attribute__((no_sanitize("float-divide-by-zero"))) +#define __ubsan_ignore_undefined__ __attribute__((no_sanitize("undefined"))) +#define __ubsan_ignore_signed_int_overflow__ \ + __attribute__((no_sanitize("signed-integer-overflow"))) +#define __ubsan_ignore_pointer_overflow__ \ + __attribute__((no_sanitize("pointer-overflow"))) +#define __ubsan_ignore_function__ __attribute__((no_sanitize("function"))) +#else +#define __ubsan_ignore_float_divide_by_zero__ +#define __ubsan_ignore_undefined__ +#define __ubsan_ignore_signed_int_overflow__ +#define __ubsan_ignore_pointer_overflow__ +#define __ubsan_ignore_function__ +#endif + +// Detect address sanitizer as some stuff doesn't work with it +#undef C10_ASAN_ENABLED + +// for clang +#if defined(__has_feature) +#if ((__has_feature(address_sanitizer))) +#define C10_ASAN_ENABLED 1 +#endif +#endif + +// for gcc +#if defined(__SANITIZE_ADDRESS__) +#if __SANITIZE_ADDRESS__ +#if !defined(C10_ASAN_ENABLED) +#define C10_ASAN_ENABLED 1 +#endif +#endif +#endif + +#if !defined(C10_ASAN_ENABLED) +#define C10_ASAN_ENABLED 0 +#endif + +// Disable the copy and assignment operator for a class. 
Note that this will +// disable the usage of the class in std containers. +#define C10_DISABLE_COPY_AND_ASSIGN(classname) \ + classname(const classname&) = delete; \ + classname& operator=(const classname&) = delete + +#define C10_CONCATENATE_IMPL(s1, s2) s1##s2 +#define C10_CONCATENATE(s1, s2) C10_CONCATENATE_IMPL(s1, s2) + +#define C10_MACRO_EXPAND(args) args + +#define C10_STRINGIZE_IMPL(x) #x +#define C10_STRINGIZE(x) C10_STRINGIZE_IMPL(x) + +/** + * C10_ANONYMOUS_VARIABLE(str) introduces an identifier starting with + * str and ending with a number that varies with the line. + */ +#ifdef __COUNTER__ +#define C10_UID __COUNTER__ +#define C10_ANONYMOUS_VARIABLE(str) C10_CONCATENATE(str, __COUNTER__) +#else +#define C10_UID __LINE__ +#define C10_ANONYMOUS_VARIABLE(str) C10_CONCATENATE(str, __LINE__) +#endif + +#ifdef __has_cpp_attribute +#define C10_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x) +#else +#define C10_HAS_CPP_ATTRIBUTE(x) (0) +#endif + +/// C10_NODISCARD - Warn if a type or return value is discarded. + +// Technically, we should check if __cplusplus > 201402L here, because +// [[nodiscard]] is only defined in C++17. However, some compilers +// we care about don't advertise being C++17 (e.g., clang), but +// support the attribute anyway. In fact, this is not just a good idea, +// it's the law: clang::warn_unused_result doesn't work on nvcc + clang +// and the best workaround for this case is to use [[nodiscard]] +// instead; see https://github.com/pytorch/pytorch/issues/13118 +// +// Note to future editors: if you have noticed that a compiler is +// misbehaving (e.g., it advertises support, but the support doesn't +// actually work, or it is emitting warnings). Some compilers which +// are strict about the matter include MSVC, which will complain: +// +// error C2429: attribute 'nodiscard' requires compiler flag '/std:c++latest' +// +// Exhibits: +// - MSVC 19.14: https://godbolt.org/z/Dzd7gn (requires /std:c++latest) +// - Clang 8.0.0: https://godbolt.org/z/3PYL4Z (always advertises support) +// - gcc 8.3: https://godbolt.org/z/4tLMQS (always advertises support) +#if C10_HAS_CPP_ATTRIBUTE(nodiscard) +#define C10_NODISCARD [[nodiscard]] +// Workaround for llvm.org/PR23435, since clang 3.6 and below emit a spurious +// error when __has_cpp_attribute is given a scoped attribute in C mode. +#elif __cplusplus && C10_HAS_CPP_ATTRIBUTE(clang::warn_unused_result) +// TODO: It's possible this is still triggering +// https://github.com/pytorch/pytorch/issues/13118 on Windows; if it is, better +// fix it. +#define C10_NODISCARD [[clang::warn_unused_result]] +#else +#define C10_NODISCARD +#endif + +// suppress an unused variable. +#if defined(_MSC_VER) && !defined(__clang__) +#define C10_UNUSED __pragma(warning(suppress : 4100 4101)) +#else +#define C10_UNUSED __attribute__((__unused__)) +#endif //_MSC_VER + +#if !defined(__has_attribute) +#define __has_attribute(x) 0 +#endif + +// Direct port of LLVM_ATTRIBUTE_USED. +#if __has_attribute(used) +#define C10_USED __attribute__((__used__)) +#else +#define C10_USED +#endif + +#define C10_RESTRICT __restrict + +// Simply define the namespace, in case a dependent library want to refer to +// the c10 namespace but not any nontrivial files. +namespace c10 {} +namespace c10::cuda {} +namespace c10::hip {} +namespace c10::xpu {} + +// Since C10 is the core library for caffe2 (and aten), we will simply reroute +// all abstractions defined in c10 to be available in caffe2 as well. +// This is only for backwards compatibility. 
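// Editor's note: an illustrative sketch, not part of this header. It shows
// two of the utility macros defined above in use; `ResourcePool` and the
// registration variable are hypothetical names.
class ResourcePool {
 public:
  ResourcePool() = default;
  // Expands to deleted copy constructor and copy assignment declarations.
  C10_DISABLE_COPY_AND_ASSIGN(ResourcePool);
};

// Expands to a per-line (or per-use, with __COUNTER__) unique identifier such
// as pool_registration42, useful for one-off static registration objects.
static int C10_ANONYMOUS_VARIABLE(pool_registration) = 0;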
Please use the symbols from the +// c10 namespace where possible. +namespace caffe2 { +using namespace c10; +} +namespace at { +using namespace c10; +} +namespace at::cuda { +using namespace c10::cuda; +} // namespace at::cuda + +// WARNING!!! THIS IS A GIANT HACK!!! +// This line means you cannot simultaneously include c10/hip +// and c10/cuda and then use them from the at::cuda namespace. +// This is true in practice, because HIPIFY works inplace on +// files in ATen/cuda, so it assumes that c10::hip is available +// from at::cuda. This namespace makes that happen. When +// HIPIFY is no longer out-of-place, we can switch the cuda +// here to hip and everyone is happy. +namespace at::cuda { +using namespace c10::hip; +} // namespace at::cuda + +namespace at::xpu { +using namespace c10::xpu; +} // namespace at::xpu + +// C10_LIKELY/C10_UNLIKELY +// +// These macros provide parentheses, so you can use these macros as: +// +// if C10_LIKELY(some_expr) { +// ... +// } +// +// NB: static_cast to boolean is mandatory in C++, because __builtin_expect +// takes a long argument, which means you may trigger the wrong conversion +// without it. +// +#if defined(__GNUC__) || defined(__ICL) || defined(__clang__) +#define C10_LIKELY(expr) (__builtin_expect(static_cast(expr), 1)) +#define C10_UNLIKELY(expr) (__builtin_expect(static_cast(expr), 0)) +#else +#define C10_LIKELY(expr) (expr) +#define C10_UNLIKELY(expr) (expr) +#endif + +/// C10_NOINLINE - Functions whose declaration is annotated with this will not +/// be inlined. +#ifdef __GNUC__ +#define C10_NOINLINE __attribute__((noinline)) +#elif _MSC_VER +#define C10_NOINLINE __declspec(noinline) +#else +#define C10_NOINLINE +#endif + +#if defined(_MSC_VER) +#define C10_ALWAYS_INLINE __forceinline +#elif __has_attribute(always_inline) || defined(__GNUC__) +#define C10_ALWAYS_INLINE __attribute__((__always_inline__)) inline +#else +#define C10_ALWAYS_INLINE inline +#endif + +#if defined(_MSC_VER) +#define C10_ATTR_VISIBILITY_HIDDEN +#elif defined(__GNUC__) +#define C10_ATTR_VISIBILITY_HIDDEN __attribute__((__visibility__("hidden"))) +#else +#define C10_ATTR_VISIBILITY_HIDDEN +#endif + +#define C10_ERASE C10_ALWAYS_INLINE C10_ATTR_VISIBILITY_HIDDEN + +#include + +#ifdef __HIPCC__ +// Unlike CUDA, HIP requires a HIP header to be included for __host__ to work. +// We do this #include here so that C10_HOST_DEVICE and friends will Just Work. +// See https://github.com/ROCm-Developer-Tools/HIP/issues/441 +#include +#endif + +#if defined(__CUDACC__) || defined(__HIPCC__) +// Designates functions callable from the host (CPU) and the device (GPU) +#define C10_HOST_DEVICE __host__ __device__ +#define C10_DEVICE __device__ +#define C10_HOST __host__ +// constants from +// (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications) +// The maximum number of threads per multiprocessor is 1024 for Turing +// architecture (7.5), 1536 for Geforce Ampere (8.6)/Jetson Orin (8.7), and +// 2048 for all other architectures. You'll get warnings if you exceed these +// constants. Hence, the following macros adjust the input values from the user +// to resolve potential warnings. 
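// Editor's note: an illustrative sketch, not part of this header. It shows
// the call shape intended for C10_LIKELY/C10_UNLIKELY defined above: the
// macros supply their own parentheses, so they read like a plain condition.
// The function is hypothetical.
int checked_divide_sketch(int a, int b) {
  if (C10_UNLIKELY(b == 0)) {
    return 0;  // cold error path
  }
  return a / b;
}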
+#if __CUDA_ARCH__ == 750 +constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1024; +#elif __CUDA_ARCH__ == 860 || __CUDA_ARCH__ == 870 || __CUDA_ARCH__ == 890 +constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1536; +#else +constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 2048; +#endif +// CUDA_MAX_THREADS_PER_BLOCK is same for all architectures currently +constexpr uint32_t CUDA_MAX_THREADS_PER_BLOCK = 1024; +// CUDA_THREADS_PER_BLOCK_FALLBACK is the "canonical fallback" choice of block +// size. 256 is a good number for this fallback and should give good occupancy +// and versatility across all architectures. +constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256; +// NOTE: if you are thinking of constexpr-ify the inputs to launch bounds, it +// turns out that although __launch_bounds__ can take constexpr, it +// can't take a constexpr that has anything to do with templates. +// Currently we use launch_bounds that depend on template arguments in +// Loops.cuh, Reduce.cuh and LossCTC.cuh. Hence, C10_MAX_THREADS_PER_BLOCK +// and C10_MIN_BLOCKS_PER_SM are kept as macros. +// Suppose you were planning to write __launch_bounds__(a, b), based on your +// performance tuning on a modern GPU. Instead, you should write +// __launch_bounds__(C10_MAX_THREADS_PER_BLOCK(a), C10_MIN_BLOCKS_PER_SM(a, b)), +// which will also properly respect limits on old architectures. +#define C10_MAX_THREADS_PER_BLOCK(val) \ + (((val) <= CUDA_MAX_THREADS_PER_BLOCK) ? (val) \ + : CUDA_THREADS_PER_BLOCK_FALLBACK) +#define C10_MIN_BLOCKS_PER_SM(threads_per_block, blocks_per_sm) \ + ((((threads_per_block) * (blocks_per_sm) <= CUDA_MAX_THREADS_PER_SM) \ + ? (blocks_per_sm) \ + : ((CUDA_MAX_THREADS_PER_SM + (threads_per_block)-1) / \ + (threads_per_block)))) +// C10_LAUNCH_BOUNDS is analogous to __launch_bounds__ +#define C10_LAUNCH_BOUNDS_0 \ + __launch_bounds__( \ + 256, 4) // default launch bounds that should give good occupancy and + // versatility across all architectures. +#define C10_LAUNCH_BOUNDS_1(max_threads_per_block) \ + __launch_bounds__((C10_MAX_THREADS_PER_BLOCK((max_threads_per_block)))) +#define C10_LAUNCH_BOUNDS_2(max_threads_per_block, min_blocks_per_sm) \ + __launch_bounds__( \ + (C10_MAX_THREADS_PER_BLOCK((max_threads_per_block))), \ + (C10_MIN_BLOCKS_PER_SM((max_threads_per_block), (min_blocks_per_sm)))) +#else +#define C10_HOST_DEVICE +#define C10_HOST +#define C10_DEVICE +#endif + +#if defined(USE_ROCM) +#define C10_HIP_HOST_DEVICE __host__ __device__ +#else +#define C10_HIP_HOST_DEVICE +#endif + +#if defined(USE_ROCM) +#define C10_WARP_SIZE warpSize // = 64 or 32 (Defined in hip_runtime.h) +#else +#define C10_WARP_SIZE 32 +#endif + +#if defined(_MSC_VER) && _MSC_VER <= 1900 +#define __func__ __FUNCTION__ +#endif + +// CUDA_KERNEL_ASSERT checks the assertion +// even when NDEBUG is defined. This is useful for important assertions in CUDA +// code that would otherwise be suppressed when building Release. 
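// Editor's note: an illustrative sketch, not part of this header or of the
// PyTorch sources being diffed. It shows the substitution recommended above
// (routing hand-tuned launch bounds through the clamping macros) together
// with CUDA_KERNEL_ASSERT, which stays active even in Release builds. The
// kernel name and its arguments are hypothetical.
C10_LAUNCH_BOUNDS_2(512, 4)
__global__ void fill_kernel_sketch(float* out, float value, int64_t n) {
  // Checked on-device even when NDEBUG is defined.
  CUDA_KERNEL_ASSERT(out != nullptr);
  int64_t i = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = value;
  }
}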
+#if defined(__ANDROID__) || defined(__APPLE__) || defined(__FreeBSD__) || \ + (defined(USE_ROCM) && ROCM_VERSION < 40100) +// Those platforms do not support assert() +#define CUDA_KERNEL_ASSERT(cond) +#define SYCL_KERNEL_ASSERT(cond) +#elif defined(_MSC_VER) +#if defined(NDEBUG) +extern "C" { +C10_IMPORT +#if defined(__SYCL_DEVICE_ONLY__) +extern SYCL_EXTERNAL void _wassert( + const wchar_t* wexpr, + const wchar_t* wfile, + unsigned line); +#else +#if defined(__CUDA_ARCH__) +__host__ __device__ +#endif // __CUDA_ARCH__ + void + _wassert(wchar_t const* _Message, wchar_t const* _File, unsigned _Line); +#endif // __SYCL_DEVICE_ONLY__ +} +#endif // NDEBUG +#define CUDA_KERNEL_ASSERT(cond) \ + if (C10_UNLIKELY(!(cond))) { \ + (void)(_wassert( \ + _CRT_WIDE(#cond), \ + _CRT_WIDE(__FILE__), \ + static_cast(__LINE__)), \ + 0); \ + } +#define SYCL_KERNEL_ASSERT(cond) \ + if (C10_UNLIKELY(!(cond))) { \ + (void)(_wassert( \ + _CRT_WIDE(#cond), \ + _CRT_WIDE(__FILE__), \ + static_cast(__LINE__)), \ + 0); \ + } +#else // __APPLE__, _MSC_VER +#if defined(NDEBUG) +extern "C" { +#if defined(__SYCL_DEVICE_ONLY__) +extern SYCL_EXTERNAL void __assert_fail( + const char* expr, + const char* file, + unsigned int line, + const char* func); +#else // __SYCL_DEVICE_ONLY__ +#if (defined(__CUDA_ARCH__) && !(defined(__clang__) && defined(__CUDA__))) +// CUDA supports __assert_fail function which are common for both device +// and host side code. +__host__ __device__ +#endif + + // This forward declaration matching the declaration of __assert_fail + // exactly how it is in glibc in case parts of the program are compiled with + // different NDEBUG settings. Otherwise we might get 'ambiguous declaration' + // error. Note: On ROCm - this declaration serves for host side compilation. + void + __assert_fail( + const char* assertion, + const char* file, + unsigned int line, + const char* function) noexcept __attribute__((__noreturn__)); + +#endif // __SYCL_DEVICE_ONLY__ +} +#endif // NDEBUG +// ROCm disable kernel assert by default +#if !defined(C10_USE_ROCM_KERNEL_ASSERT) and defined(USE_ROCM) +#define CUDA_KERNEL_ASSERT(cond) +#define SYCL_KERNEL_ASSERT(cond) +#else +#define CUDA_KERNEL_ASSERT(cond) \ + if (C10_UNLIKELY(!(cond))) { \ + __assert_fail( \ + #cond, __FILE__, static_cast(__LINE__), __func__); \ + } +#define SYCL_KERNEL_ASSERT(cond) \ + if (C10_UNLIKELY(!(cond))) { \ + __assert_fail( \ + #cond, __FILE__, static_cast(__LINE__), __func__); \ + } +#endif // C10_USE_ROCM_KERNEL_ASSERT and USE_ROCM +#endif // __APPLE__ + +#ifdef __APPLE__ +#include +#endif + +#if defined(__ANDROID__) +#define C10_ANDROID 1 +#define C10_MOBILE 1 +#elif ( \ + defined(__APPLE__) && \ + (TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE)) +#define C10_IOS 1 +#define C10_MOBILE 1 +#endif // ANDROID / IOS + +#if defined(C10_MOBILE) && C10_MOBILE +#define C10_ALWAYS_INLINE_UNLESS_MOBILE inline +#else +#define C10_ALWAYS_INLINE_UNLESS_MOBILE C10_ALWAYS_INLINE +#endif + +#if defined(__CUDA_ARCH__) +#if defined(_MSC_VER) && defined(__CUDACC__) +#define CONSTEXPR_EXCEPT_WIN_CUDA const +#define C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA __host__ + +// Note [static constexpr char* members for windows NVCC] +// The Windows NVCC compiler doesn't handle static constexpr class members, +// although it's fixed in a later version. 
+// (see +// https://developercommunity.visualstudio.com/t/intellisense-error-c11-static-constexpr-member-ini/245425) +// +// If we want to ensure that our field is static under all builds, then we need +// to work around it specifically for windows NVCC by making it (a) const, (b) +// defined outside of the class definition We need to define it outside of the +// class definition because of the C++ standard; char* is not an integral type +// (see +// https://stackoverflow.com/questions/24278473/intellisense-a-member-of-type-const-char-const-cannot-have-an-in-class-in) +// +// So instead of this: +// struct Foo { +// static constexpr const char* name = "foo"; +// } +// In Windows NVCC, we end up with this: +// struct Foo { +// static const char* name; +// } +// const char* Foo::name = "foo"; +// +// This gives us a small perf hit for any code that wants to access these field +// members, but right now it isn't used in any perf-critical code paths. +#define STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(field, val) \ + static const char* field; +#define STATIC_CONST_STR_OUT_OF_LINE_FOR_WIN_CUDA(cls, field, val) \ + const char* cls::field = val; +#else +#define CONSTEXPR_EXCEPT_WIN_CUDA constexpr +#define C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA __host__ + +#define STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(field, val) \ + static constexpr const char* field = val; +#define STATIC_CONST_STR_OUT_OF_LINE_FOR_WIN_CUDA(cls, field, val) +#endif +#else +#if defined(_MSC_VER) && defined(__CUDACC__) +#define CONSTEXPR_EXCEPT_WIN_CUDA const +#define C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA + +#define STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(field, val) \ + static const char* field; +#define STATIC_CONST_STR_OUT_OF_LINE_FOR_WIN_CUDA(cls, field, val) \ + const char* cls::field = val; +#else +#define CONSTEXPR_EXCEPT_WIN_CUDA constexpr +#define C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA constexpr + +#define STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(field, val) \ + static constexpr const char* field = val; +#define STATIC_CONST_STR_OUT_OF_LINE_FOR_WIN_CUDA(cls, field, val) +#endif +#endif + +#ifndef HAS_DEMANGLE +#if defined(__ANDROID__) || defined(_WIN32) || defined(__EMSCRIPTEN__) +#define HAS_DEMANGLE 0 +#elif defined(__APPLE__) && \ + (TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE) +#define HAS_DEMANGLE 0 +#else +#define HAS_DEMANGLE 1 +#endif +#endif // HAS_DEMANGLE + +#define _C10_PRAGMA__(string) _Pragma(#string) +#define _C10_PRAGMA_(string) _C10_PRAGMA__(string) + +#ifdef __clang__ +#define C10_CLANG_DIAGNOSTIC_PUSH() _Pragma("clang diagnostic push") +#define C10_CLANG_DIAGNOSTIC_POP() _Pragma("clang diagnostic pop") +#define C10_CLANG_DIAGNOSTIC_IGNORE(flag) \ + _C10_PRAGMA_(clang diagnostic ignored flag) +#define C10_CLANG_HAS_WARNING(flag) __has_warning(flag) +#else +#define C10_CLANG_DIAGNOSTIC_PUSH() +#define C10_CLANG_DIAGNOSTIC_POP() +#define C10_CLANG_DIAGNOSTIC_IGNORE(flag) +#define C10_CLANG_HAS_WARNING(flag) 0 +#endif + +#ifdef __clang__ + +#define C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED(warning) \ + _C10_PRAGMA_(clang diagnostic push) \ + _C10_PRAGMA_(clang diagnostic ignored "-Wunknown-warning-option") \ + _C10_PRAGMA_(clang diagnostic ignored warning) + +#define C10_DIAGNOSTIC_POP() _C10_PRAGMA_(clang diagnostic pop) + +#elif __GNUC__ + +#define C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED(warning) \ + _C10_PRAGMA_(GCC diagnostic push) \ + _C10_PRAGMA_(GCC diagnostic ignored "-Wpragmas") \ + _C10_PRAGMA_(GCC diagnostic ignored warning) + +#define C10_DIAGNOSTIC_POP() _C10_PRAGMA_(GCC 
diagnostic pop) + +#else + +#define C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED(warning) +#define C10_DIAGNOSTIC_POP() + +#endif + +#endif // C10_MACROS_MACROS_H_ diff --git a/MLPY/Lib/site-packages/torch/include/c10/macros/cmake_macros.h b/MLPY/Lib/site-packages/torch/include/c10/macros/cmake_macros.h new file mode 100644 index 0000000000000000000000000000000000000000..45bba88997cc9f1645a27c82e63fa51ad6bf3ddd --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/macros/cmake_macros.h @@ -0,0 +1,14 @@ +#ifndef C10_MACROS_CMAKE_MACROS_H_ +#define C10_MACROS_CMAKE_MACROS_H_ + +// Automatically generated header file for the C10 library. +// Do not include this file directly. Instead, include c10/macros/Macros.h. + +#define C10_BUILD_SHARED_LIBS +/* #undef C10_USE_GLOG */ +/* #undef C10_USE_GFLAGS */ +/* #undef C10_USE_NUMA */ +/* #undef C10_USE_MSVC_STATIC_RUNTIME */ +/* #undef C10_USE_ROCM_KERNEL_ASSERT */ + +#endif // C10_MACROS_CMAKE_MACROS_H_ diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/AbortHandler.h b/MLPY/Lib/site-packages/torch/include/c10/util/AbortHandler.h new file mode 100644 index 0000000000000000000000000000000000000000..327f7a93eda1f5d1e8c65e8f0b1aca97a5886c5c --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/AbortHandler.h @@ -0,0 +1,81 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { +class AbortHandlerHelper { + public: + static AbortHandlerHelper& getInstance() { +#ifdef _WIN32 + thread_local +#endif // _WIN32 + static AbortHandlerHelper instance; + return instance; + } + + void set(std::terminate_handler handler) { + std::lock_guard lk(mutex); + if (!inited) { + prev = std::set_terminate(handler); + curr = std::get_terminate(); + inited = true; + } + } + + std::terminate_handler getPrev() const { + return prev; + } + + private: + std::terminate_handler prev = nullptr; + std::terminate_handler curr = nullptr; + bool inited = false; + std::mutex mutex; + AbortHandlerHelper() = default; + ~AbortHandlerHelper() { + // Only restore the handler if we are the current one + if (inited && curr == std::get_terminate()) { + std::set_terminate(prev); + } + } + + public: + AbortHandlerHelper(AbortHandlerHelper const&) = delete; + void operator=(AbortHandlerHelper const&) = delete; +}; + +namespace detail { +C10_ALWAYS_INLINE void terminate_handler() { + std::cout << "Unhandled exception caught in c10/util/AbortHandler.h" << '\n'; + auto backtrace = get_backtrace(); + std::cout << backtrace << '\n' << std::flush; + auto prev_handler = AbortHandlerHelper::getInstance().getPrev(); + if (prev_handler) { + prev_handler(); + } else { + std::abort(); + } +} +} // namespace detail + +C10_ALWAYS_INLINE void set_terminate_handler() { + bool use_custom_terminate = false; + // On Windows it is enabled by default based on + // https://github.com/pytorch/pytorch/pull/50320#issuecomment-763147062 +#ifdef _WIN32 + use_custom_terminate = true; +#endif // _WIN32 + auto result = c10::utils::check_env("TORCH_CUSTOM_TERMINATE"); + if (result != std::nullopt) { + use_custom_terminate = result.value(); + } + if (use_custom_terminate) { + AbortHandlerHelper::getInstance().set(detail::terminate_handler); + } +} +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/AlignOf.h b/MLPY/Lib/site-packages/torch/include/c10/util/AlignOf.h new file mode 100644 index 0000000000000000000000000000000000000000..3fd15693fb7369d94e9ada29509f5240d8fb061d --- /dev/null +++ 
b/MLPY/Lib/site-packages/torch/include/c10/util/AlignOf.h @@ -0,0 +1,176 @@ +//===--- AlignOf.h - Portable calculation of type alignment -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the AlignedCharArray and AlignedCharArrayUnion classes. +// +//===----------------------------------------------------------------------===// + +// ATen: modified from llvm::AlignOf +// replaced LLVM_ALIGNAS with alignas + +#pragma once + +#include + +namespace c10 { + +/// \struct AlignedCharArray +/// \brief Helper for building an aligned character array type. +/// +/// This template is used to explicitly build up a collection of aligned +/// character array types. We have to build these up using a macro and explicit +/// specialization to cope with MSVC (at least till 2015) where only an +/// integer literal can be used to specify an alignment constraint. Once built +/// up here, we can then begin to indirect between these using normal C++ +/// template parameters. + +// MSVC requires special handling here. +#ifndef _MSC_VER + +template +struct AlignedCharArray { + // NOLINTNEXTLINE(*c-arrays) + alignas(Alignment) char buffer[Size]; +}; + +#else // _MSC_VER + +/// \brief Create a type with an aligned char buffer. +template +struct AlignedCharArray; + +// We provide special variations of this template for the most common +// alignments because __declspec(align(...)) doesn't actually work when it is +// a member of a by-value function argument in MSVC, even if the alignment +// request is something reasonably like 8-byte or 16-byte. Note that we can't +// even include the declspec with the union that forces the alignment because +// MSVC warns on the existence of the declspec despite the union member forcing +// proper alignment. + +template +struct AlignedCharArray<1, Size> { + union { + char aligned; + char buffer[Size]; + }; +}; + +template +struct AlignedCharArray<2, Size> { + union { + short aligned; + char buffer[Size]; + }; +}; + +template +struct AlignedCharArray<4, Size> { + union { + int aligned; + char buffer[Size]; + }; +}; + +template +struct AlignedCharArray<8, Size> { + union { + double aligned; + char buffer[Size]; + }; +}; + +// The rest of these are provided with a __declspec(align(...)) and we simply +// can't pass them by-value as function arguments on MSVC. 
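// Editor's note: an illustrative sketch, not part of this header. It shows
// the intended role of AlignedCharArray: raw, suitably aligned storage into
// which an object is placement-new'd. `Payload` is a hypothetical type.
#include <new>

struct Payload {
  double value;
  int tag;
};

c10::AlignedCharArray<alignof(Payload), sizeof(Payload)> payload_storage;

Payload* make_payload_sketch() {
  // The storage never runs Payload's destructor; the caller must do so.
  return new (&payload_storage.buffer) Payload{3.14, 1};
}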
+ +#define AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(x) \ + template \ + struct AlignedCharArray { \ + __declspec(align(x)) char buffer[Size]; \ + }; + +AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(16) +AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(32) +AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(64) +AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(128) + +#undef AT_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT + +#endif // _MSC_VER + +namespace detail { +template < + typename T1, + typename T2 = char, + typename T3 = char, + typename T4 = char, + typename T5 = char, + typename T6 = char, + typename T7 = char, + typename T8 = char, + typename T9 = char, + typename T10 = char> +class AlignerImpl { + T1 t1; + T2 t2; + T3 t3; + T4 t4; + T5 t5; + T6 t6; + T7 t7; + T8 t8; + T9 t9; + T10 t10; + + public: + AlignerImpl() = delete; +}; + +template < + typename T1, + typename T2 = char, + typename T3 = char, + typename T4 = char, + typename T5 = char, + typename T6 = char, + typename T7 = char, + typename T8 = char, + typename T9 = char, + typename T10 = char> +union SizerImpl { + // NOLINTNEXTLINE(*c-arrays) + char arr1[sizeof(T1)], arr2[sizeof(T2)], arr3[sizeof(T3)], arr4[sizeof(T4)], + arr5[sizeof(T5)], arr6[sizeof(T6)], arr7[sizeof(T7)], arr8[sizeof(T8)], + arr9[sizeof(T9)], arr10[sizeof(T10)]; +}; +} // end namespace detail + +/// \brief This union template exposes a suitably aligned and sized character +/// array member which can hold elements of any of up to ten types. +/// +/// These types may be arrays, structs, or any other types. The goal is to +/// expose a char array buffer member which can be used as suitable storage for +/// a placement new of any of these types. Support for more than ten types can +/// be added at the cost of more boilerplate. +template < + typename T1, + typename T2 = char, + typename T3 = char, + typename T4 = char, + typename T5 = char, + typename T6 = char, + typename T7 = char, + typename T8 = char, + typename T9 = char, + typename T10 = char> +struct AlignedCharArrayUnion + : AlignedCharArray< + alignof(detail::AlignerImpl), + sizeof(::c10::detail:: + SizerImpl)> {}; +} // end namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/ApproximateClock.h b/MLPY/Lib/site-packages/torch/include/c10/util/ApproximateClock.h new file mode 100644 index 0000000000000000000000000000000000000000..b0fb3efe5b4f3787226c6b4215f477796716f593 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/ApproximateClock.h @@ -0,0 +1,115 @@ +// Copyright 2023-present Facebook. All Rights Reserved. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(C10_IOS) && defined(C10_MOBILE) +#include // for gettimeofday() +#endif + +#if defined(__i386__) || defined(__x86_64__) || defined(__amd64__) +#define C10_RDTSC +#if defined(_MSC_VER) +#include +#elif defined(__CUDACC__) || defined(__HIPCC__) +#undef C10_RDTSC +#elif defined(__clang__) +// `__rdtsc` is available by default. 
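// Editor's note: an illustrative sketch, not part of this header. It shows
// the intended workflow of the approximate-clock helpers declared below:
// take cheap approximate timestamps (TSC where available) on the hot path and
// convert them to unix-epoch nanoseconds only during post-processing. The
// function name is hypothetical.
void profile_region_sketch() {
  c10::ApproximateClockToUnixTimeConverter converter;
  auto t0 = c10::getApproximateTime();
  // ... work being profiled ...
  auto t1 = c10::getApproximateTime();
  // makeConverter() yields a callable mapping approx_time_t to ns since epoch.
  auto to_ns = converter.makeConverter();
  int64_t elapsed_ns = to_ns(t1) - to_ns(t0);
  (void)elapsed_ns;
}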
+// NB: This has to be first, because Clang will also define `__GNUC__` +#elif defined(__GNUC__) +#include +#else +#undef C10_RDTSC +#endif +#endif + +namespace c10 { + +using time_t = int64_t; +using steady_clock_t = std::conditional_t< + std::chrono::high_resolution_clock::is_steady, + std::chrono::high_resolution_clock, + std::chrono::steady_clock>; + +inline time_t getTimeSinceEpoch() { + auto now = std::chrono::system_clock::now().time_since_epoch(); + return std::chrono::duration_cast(now).count(); +} + +inline time_t getTime(bool allow_monotonic = false) { +#if defined(C10_IOS) && defined(C10_MOBILE) + // clock_gettime is only available on iOS 10.0 or newer. Unlike OS X, iOS + // can't rely on CLOCK_REALTIME, as it is defined no matter if clock_gettime + // is implemented or not + struct timeval now; + gettimeofday(&now, NULL); + return static_cast(now.tv_sec) * 1000000000 + + static_cast(now.tv_usec) * 1000; +#elif defined(_WIN32) || defined(__MACH__) + return std::chrono::duration_cast( + steady_clock_t::now().time_since_epoch()) + .count(); +#else + // clock_gettime is *much* faster than std::chrono implementation on Linux + struct timespec t {}; + auto mode = CLOCK_REALTIME; + if (allow_monotonic) { + mode = CLOCK_MONOTONIC; + } + clock_gettime(mode, &t); + return static_cast(t.tv_sec) * 1000000000 + + static_cast(t.tv_nsec); +#endif +} + +// We often do not need to capture true wall times. If a fast mechanism such +// as TSC is available we can use that instead and convert back to epoch time +// during post processing. This greatly reduce the clock's contribution to +// profiling. +// http://btorpey.github.io/blog/2014/02/18/clock-sources-in-linux/ +// https://quick-bench.com/q/r8opkkGZSJMu9wM_XTbDouq-0Io +// TODO: We should use +// `https://github.com/google/benchmark/blob/main/src/cycleclock.h` +inline auto getApproximateTime() { +#if defined(C10_RDTSC) + return static_cast(__rdtsc()); +#else + return getTime(); +#endif +} + +using approx_time_t = decltype(getApproximateTime()); +static_assert( + std::is_same_v || + std::is_same_v, + "Expected either int64_t (`getTime`) or uint64_t (some TSC reads)."); + +// Convert `getCount` results to Nanoseconds since unix epoch. +class C10_API ApproximateClockToUnixTimeConverter final { + public: + ApproximateClockToUnixTimeConverter(); + std::function makeConverter(); + + struct UnixAndApproximateTimePair { + time_t t_; + approx_time_t approx_t_; + }; + static UnixAndApproximateTimePair measurePair(); + + private: + static constexpr size_t replicates = 1001; + using time_pairs = std::array; + time_pairs measurePairs(); + + time_pairs start_times_; +}; + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/Array.h b/MLPY/Lib/site-packages/torch/include/c10/util/Array.h new file mode 100644 index 0000000000000000000000000000000000000000..ecf91b578137d931301f674a9cb10bd2ec86b5ff --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/Array.h @@ -0,0 +1,16 @@ +#include +#include + +namespace c10 { + +// This helper function creates a constexpr std::array +// From a compile time list of values, without requiring you to explicitly +// write out the length. +// +// See also https://stackoverflow.com/a/26351760/23845 +template +inline constexpr auto array_of(T&&... 
t) -> std::array { + return {{std::forward(t)...}}; +} + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/ArrayRef.h b/MLPY/Lib/site-packages/torch/include/c10/util/ArrayRef.h new file mode 100644 index 0000000000000000000000000000000000000000..c347c5263483be94921086cd0d9c3a3b492aec0e --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/ArrayRef.h @@ -0,0 +1,380 @@ +//===--- ArrayRef.h - Array Reference Wrapper -------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// ATen: modified from llvm::ArrayRef. +// removed llvm-specific functionality +// removed some implicit const -> non-const conversions that rely on +// complicated std::enable_if meta-programming +// removed a bunch of slice variants for simplicity... + +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { +/// ArrayRef - Represent a constant reference to an array (0 or more elements +/// consecutively in memory), i.e. a start pointer and a length. It allows +/// various APIs to take consecutive elements easily and conveniently. +/// +/// This class does not own the underlying data, it is expected to be used in +/// situations where the data resides in some other buffer, whose lifetime +/// extends past that of the ArrayRef. For this reason, it is not in general +/// safe to store an ArrayRef. +/// +/// This is intended to be trivially copyable, so it should be passed by +/// value. +template +class ArrayRef final { + public: + using iterator = const T*; + using const_iterator = const T*; + using size_type = size_t; + using value_type = T; + + using reverse_iterator = std::reverse_iterator; + + private: + /// The start of the array, in an external buffer. + const T* Data; + + /// The number of elements. + size_type Length; + + void debugCheckNullptrInvariant() { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + Data != nullptr || Length == 0, + "created ArrayRef with nullptr and non-zero length! c10::optional relies on this being illegal"); + } + + public: + /// @name Constructors + /// @{ + + /// Construct an empty ArrayRef. + /* implicit */ constexpr ArrayRef() : Data(nullptr), Length(0) {} + + /// Construct an ArrayRef from a single element. + // TODO Make this explicit + constexpr ArrayRef(const T& OneElt) : Data(&OneElt), Length(1) {} + + /// Construct an ArrayRef from a pointer and length. + C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA ArrayRef(const T* data, size_t length) + : Data(data), Length(length) { + debugCheckNullptrInvariant(); + } + + /// Construct an ArrayRef from a range. + C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA ArrayRef(const T* begin, const T* end) + : Data(begin), Length(end - begin) { + debugCheckNullptrInvariant(); + } + + /// Construct an ArrayRef from a SmallVector. This is templated in order to + /// avoid instantiating SmallVectorTemplateCommon whenever we + /// copy-construct an ArrayRef. 
+ template + /* implicit */ ArrayRef(const SmallVectorTemplateCommon& Vec) + : Data(Vec.data()), Length(Vec.size()) { + debugCheckNullptrInvariant(); + } + + template < + typename Container, + typename = std::enable_if_t().data())>, + T*>>> + /* implicit */ ArrayRef(const Container& container) + : Data(container.data()), Length(container.size()) { + debugCheckNullptrInvariant(); + } + + /// Construct an ArrayRef from a std::vector. + // The enable_if stuff here makes sure that this isn't used for + // std::vector, because ArrayRef can't work on a std::vector + // bitfield. + template + /* implicit */ ArrayRef(const std::vector& Vec) + : Data(Vec.data()), Length(Vec.size()) { + static_assert( + !std::is_same::value, + "ArrayRef cannot be constructed from a std::vector bitfield."); + } + + /// Construct an ArrayRef from a std::array + template + /* implicit */ constexpr ArrayRef(const std::array& Arr) + : Data(Arr.data()), Length(N) {} + + /// Construct an ArrayRef from a C array. + template + // NOLINTNEXTLINE(*c-arrays*) + /* implicit */ constexpr ArrayRef(const T (&Arr)[N]) : Data(Arr), Length(N) {} + + /// Construct an ArrayRef from a std::initializer_list. + /* implicit */ constexpr ArrayRef(const std::initializer_list& Vec) + : Data( + std::begin(Vec) == std::end(Vec) ? static_cast(nullptr) + : std::begin(Vec)), + Length(Vec.size()) {} + + /// @} + /// @name Simple Operations + /// @{ + + constexpr iterator begin() const { + return Data; + } + constexpr iterator end() const { + return Data + Length; + } + + // These are actually the same as iterator, since ArrayRef only + // gives you const iterators. + constexpr const_iterator cbegin() const { + return Data; + } + constexpr const_iterator cend() const { + return Data + Length; + } + + constexpr reverse_iterator rbegin() const { + return reverse_iterator(end()); + } + constexpr reverse_iterator rend() const { + return reverse_iterator(begin()); + } + + /// empty - Check if the array is empty. + constexpr bool empty() const { + return Length == 0; + } + + constexpr const T* data() const { + return Data; + } + + /// size - Get the array size. + constexpr size_t size() const { + return Length; + } + + /// front - Get the first element. + C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA const T& front() const { + TORCH_CHECK( + !empty(), "ArrayRef: attempted to access front() of empty list"); + return Data[0]; + } + + /// back - Get the last element. + C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA const T& back() const { + TORCH_CHECK(!empty(), "ArrayRef: attempted to access back() of empty list"); + return Data[Length - 1]; + } + + /// equals - Check for element-wise equality. + constexpr bool equals(ArrayRef RHS) const { + return Length == RHS.Length && std::equal(begin(), end(), RHS.begin()); + } + + /// slice(n, m) - Take M elements of the array starting at element N + C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA ArrayRef slice(size_t N, size_t M) + const { + TORCH_CHECK( + N + M <= size(), + "ArrayRef: invalid slice, N = ", + N, + "; M = ", + M, + "; size = ", + size()); + return ArrayRef(data() + N, M); + } + + /// slice(n) - Chop off the first N elements of the array. 
+ C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA ArrayRef slice(size_t N) const { + TORCH_CHECK( + N <= size(), "ArrayRef: invalid slice, N = ", N, "; size = ", size()); + return slice(N, size() - N); + } + + /// @} + /// @name Operator Overloads + /// @{ + constexpr const T& operator[](size_t Index) const { + return Data[Index]; + } + + /// Vector compatibility + C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA const T& at(size_t Index) const { + TORCH_CHECK( + Index < Length, + "ArrayRef: invalid index Index = ", + Index, + "; Length = ", + Length); + return Data[Index]; + } + + /// Disallow accidental assignment from a temporary. + /// + /// The declaration here is extra complicated so that "arrayRef = {}" + /// continues to select the move assignment operator. + template + std::enable_if_t, ArrayRef>& operator=( + // NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward) + U&& Temporary) = delete; + + /// Disallow accidental assignment from a temporary. + /// + /// The declaration here is extra complicated so that "arrayRef = {}" + /// continues to select the move assignment operator. + template + std::enable_if_t, ArrayRef>& operator=( + std::initializer_list) = delete; + + /// @} + /// @name Expensive Operations + /// @{ + std::vector vec() const { + return std::vector(Data, Data + Length); + } + + /// @} +}; + +template +std::ostream& operator<<(std::ostream& out, ArrayRef list) { + int i = 0; + out << "["; + for (const auto& e : list) { + if (i++ > 0) + out << ", "; + out << e; + } + out << "]"; + return out; +} + +/// @name ArrayRef Convenience constructors +/// @{ + +/// Construct an ArrayRef from a single element. +template +ArrayRef makeArrayRef(const T& OneElt) { + return OneElt; +} + +/// Construct an ArrayRef from a pointer and length. +template +ArrayRef makeArrayRef(const T* data, size_t length) { + return ArrayRef(data, length); +} + +/// Construct an ArrayRef from a range. +template +ArrayRef makeArrayRef(const T* begin, const T* end) { + return ArrayRef(begin, end); +} + +/// Construct an ArrayRef from a SmallVector. +template +ArrayRef makeArrayRef(const SmallVectorImpl& Vec) { + return Vec; +} + +/// Construct an ArrayRef from a SmallVector. +template +ArrayRef makeArrayRef(const SmallVector& Vec) { + return Vec; +} + +/// Construct an ArrayRef from a std::vector. +template +ArrayRef makeArrayRef(const std::vector& Vec) { + return Vec; +} + +/// Construct an ArrayRef from a std::array. +template +ArrayRef makeArrayRef(const std::array& Arr) { + return Arr; +} + +/// Construct an ArrayRef from an ArrayRef (no-op) (const) +template +ArrayRef makeArrayRef(const ArrayRef& Vec) { + return Vec; +} + +/// Construct an ArrayRef from an ArrayRef (no-op) +template +ArrayRef& makeArrayRef(ArrayRef& Vec) { + return Vec; +} + +/// Construct an ArrayRef from a C array. +template +// NOLINTNEXTLINE(*c-arrays*) +ArrayRef makeArrayRef(const T (&Arr)[N]) { + return ArrayRef(Arr); +} + +// WARNING: Template instantiation will NOT be willing to do an implicit +// conversions to get you to an c10::ArrayRef, which is why we need so +// many overloads. 
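// Editor's note: an illustrative sketch, not part of this header. It shows
// how ArrayRef is typically consumed: an API takes c10::ArrayRef<int64_t> by
// value and callers pass vectors, initializer lists, or slices without
// copying. The functions and data are hypothetical.
#include <vector>

int64_t sum_sketch(c10::ArrayRef<int64_t> values) {
  int64_t total = 0;
  for (int64_t v : values) {
    total += v;
  }
  return total;
}

void callers_sketch() {
  std::vector<int64_t> sizes = {2, 3, 4};
  sum_sketch(sizes);                              // implicit from std::vector
  sum_sketch({1, 2, 3});                          // from std::initializer_list
  sum_sketch(c10::makeArrayRef(sizes).slice(1));  // view of the last two elements
}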
+ +template +bool operator==(c10::ArrayRef a1, c10::ArrayRef a2) { + return a1.equals(a2); +} + +template +bool operator!=(c10::ArrayRef a1, c10::ArrayRef a2) { + return !a1.equals(a2); +} + +template +bool operator==(const std::vector& a1, c10::ArrayRef a2) { + return c10::ArrayRef(a1).equals(a2); +} + +template +bool operator!=(const std::vector& a1, c10::ArrayRef a2) { + return !c10::ArrayRef(a1).equals(a2); +} + +template +bool operator==(c10::ArrayRef a1, const std::vector& a2) { + return a1.equals(c10::ArrayRef(a2)); +} + +template +bool operator!=(c10::ArrayRef a1, const std::vector& a2) { + return !a1.equals(c10::ArrayRef(a2)); +} + +using IntArrayRef = ArrayRef; + +// This alias is deprecated because it doesn't make ownership +// semantics obvious. Use IntArrayRef instead! +C10_DEFINE_DEPRECATED_USING(IntList, ArrayRef) + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/BFloat16-inl.h b/MLPY/Lib/site-packages/torch/include/c10/util/BFloat16-inl.h new file mode 100644 index 0000000000000000000000000000000000000000..2b8061d34a911b682e1fc1a002aeed1c9b5f1ba4 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/BFloat16-inl.h @@ -0,0 +1,343 @@ +#pragma once + +#include +#include + +#include + +C10_CLANG_DIAGNOSTIC_PUSH() +#if C10_CLANG_HAS_WARNING("-Wimplicit-int-float-conversion") +C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion") +#endif + +#if defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS) +#if defined(CL_SYCL_LANGUAGE_VERSION) +#include // for SYCL 1.2.1 +#else +#include // for SYCL 2020 +#endif +#include +#endif + +namespace c10 { + +/// Constructors +inline C10_HOST_DEVICE BFloat16::BFloat16(float value) + : +#if defined(__CUDACC__) && !defined(USE_ROCM) && defined(__CUDA_ARCH__) && \ + __CUDA_ARCH__ >= 800 + x(__bfloat16_as_ushort(__float2bfloat16(value))) +#elif defined(__SYCL_DEVICE_ONLY__) && \ + defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS) + x(c10::bit_cast(sycl::ext::oneapi::bfloat16(value))) +#else + // RNE by default + x(detail::round_to_nearest_even(value)) +#endif +{ +} + +/// Implicit conversions +inline C10_HOST_DEVICE BFloat16::operator float() const { +#if defined(__CUDACC__) && !defined(USE_ROCM) + return __bfloat162float(*reinterpret_cast(&x)); +#elif defined(__SYCL_DEVICE_ONLY__) && \ + defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS) + return float(*reinterpret_cast(&x)); +#else + return detail::f32_from_bits(x); +#endif +} + +#if defined(__CUDACC__) && !defined(USE_ROCM) +inline C10_HOST_DEVICE BFloat16::BFloat16(const __nv_bfloat16& value) { + x = *reinterpret_cast(&value); +} +inline C10_HOST_DEVICE BFloat16::operator __nv_bfloat16() const { + return *reinterpret_cast(&x); +} +#endif + +#if defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS) +inline C10_HOST_DEVICE BFloat16::BFloat16( + const sycl::ext::oneapi::bfloat16& value) { + x = *reinterpret_cast(&value); +} +inline C10_HOST_DEVICE BFloat16::operator sycl::ext::oneapi::bfloat16() const { + return *reinterpret_cast(&x); +} +#endif + +// CUDA intrinsics + +#if defined(__CUDACC__) || defined(__HIPCC__) +inline C10_DEVICE BFloat16 __ldg(const BFloat16* ptr) { +#if !defined(USE_ROCM) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + return __ldg(reinterpret_cast(ptr)); +#else + return *ptr; +#endif +} +#endif + +/// Arithmetic + +inline C10_HOST_DEVICE BFloat16 +operator+(const BFloat16& a, const BFloat16& b) { + return static_cast(a) + static_cast(b); +} + +inline C10_HOST_DEVICE BFloat16 +operator-(const BFloat16& a, const BFloat16& b) { + 
return static_cast(a) - static_cast(b); +} + +inline C10_HOST_DEVICE BFloat16 +operator*(const BFloat16& a, const BFloat16& b) { + return static_cast(a) * static_cast(b); +} + +inline C10_HOST_DEVICE BFloat16 operator/(const BFloat16& a, const BFloat16& b) + __ubsan_ignore_float_divide_by_zero__ { + return static_cast(a) / static_cast(b); +} + +inline C10_HOST_DEVICE BFloat16 operator-(const BFloat16& a) { + return -static_cast(a); +} + +inline C10_HOST_DEVICE BFloat16& operator+=(BFloat16& a, const BFloat16& b) { + a = a + b; + return a; +} + +inline C10_HOST_DEVICE BFloat16& operator-=(BFloat16& a, const BFloat16& b) { + a = a - b; + return a; +} + +inline C10_HOST_DEVICE BFloat16& operator*=(BFloat16& a, const BFloat16& b) { + a = a * b; + return a; +} + +inline C10_HOST_DEVICE BFloat16& operator/=(BFloat16& a, const BFloat16& b) { + a = a / b; + return a; +} + +inline C10_HOST_DEVICE BFloat16& operator|(BFloat16& a, const BFloat16& b) { + a.x = a.x | b.x; + return a; +} + +inline C10_HOST_DEVICE BFloat16& operator^(BFloat16& a, const BFloat16& b) { + a.x = a.x ^ b.x; + return a; +} + +inline C10_HOST_DEVICE BFloat16& operator&(BFloat16& a, const BFloat16& b) { + a.x = a.x & b.x; + return a; +} + +/// Arithmetic with floats + +inline C10_HOST_DEVICE float operator+(BFloat16 a, float b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE float operator-(BFloat16 a, float b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE float operator*(BFloat16 a, float b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE float operator/(BFloat16 a, float b) { + return static_cast(a) / b; +} + +inline C10_HOST_DEVICE float operator+(float a, BFloat16 b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE float operator-(float a, BFloat16 b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE float operator*(float a, BFloat16 b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE float operator/(float a, BFloat16 b) { + return a / static_cast(b); +} + +inline C10_HOST_DEVICE float& operator+=(float& a, const BFloat16& b) { + return a += static_cast(b); +} +inline C10_HOST_DEVICE float& operator-=(float& a, const BFloat16& b) { + return a -= static_cast(b); +} +inline C10_HOST_DEVICE float& operator*=(float& a, const BFloat16& b) { + return a *= static_cast(b); +} +inline C10_HOST_DEVICE float& operator/=(float& a, const BFloat16& b) { + return a /= static_cast(b); +} + +/// Arithmetic with doubles + +inline C10_HOST_DEVICE double operator+(BFloat16 a, double b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE double operator-(BFloat16 a, double b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE double operator*(BFloat16 a, double b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE double operator/(BFloat16 a, double b) { + return static_cast(a) / b; +} + +inline C10_HOST_DEVICE double operator+(double a, BFloat16 b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE double operator-(double a, BFloat16 b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE double operator*(double a, BFloat16 b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE double operator/(double a, BFloat16 b) { + return a / static_cast(b); +} + +/// Arithmetic with ints + +inline C10_HOST_DEVICE BFloat16 operator+(BFloat16 a, int b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE BFloat16 operator-(BFloat16 a, int b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE BFloat16 operator*(BFloat16 a, int b) { + return a * 
static_cast(b); +} +inline C10_HOST_DEVICE BFloat16 operator/(BFloat16 a, int b) { + return a / static_cast(b); +} + +inline C10_HOST_DEVICE BFloat16 operator+(int a, BFloat16 b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE BFloat16 operator-(int a, BFloat16 b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE BFloat16 operator*(int a, BFloat16 b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE BFloat16 operator/(int a, BFloat16 b) { + return static_cast(a) / b; +} + +//// Arithmetic with int64_t + +inline C10_HOST_DEVICE BFloat16 operator+(BFloat16 a, int64_t b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE BFloat16 operator-(BFloat16 a, int64_t b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE BFloat16 operator*(BFloat16 a, int64_t b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE BFloat16 operator/(BFloat16 a, int64_t b) { + return a / static_cast(b); +} + +inline C10_HOST_DEVICE BFloat16 operator+(int64_t a, BFloat16 b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE BFloat16 operator-(int64_t a, BFloat16 b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE BFloat16 operator*(int64_t a, BFloat16 b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE BFloat16 operator/(int64_t a, BFloat16 b) { + return static_cast(a) / b; +} + +// Overloading < and > operators, because std::max and std::min use them. + +inline C10_HOST_DEVICE bool operator>(BFloat16& lhs, BFloat16& rhs) { + return float(lhs) > float(rhs); +} + +inline C10_HOST_DEVICE bool operator<(BFloat16& lhs, BFloat16& rhs) { + return float(lhs) < float(rhs); +} + +} // namespace c10 + +namespace std { + +template <> +class numeric_limits { + public: + static constexpr bool is_signed = true; + static constexpr bool is_specialized = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = true; + static constexpr auto has_denorm = numeric_limits::has_denorm; + static constexpr auto has_denorm_loss = + numeric_limits::has_denorm_loss; + static constexpr auto round_style = numeric_limits::round_style; + static constexpr bool is_iec559 = false; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + static constexpr int digits = 8; + static constexpr int digits10 = 2; + static constexpr int max_digits10 = 4; + static constexpr int radix = 2; + static constexpr int min_exponent = -125; + static constexpr int min_exponent10 = -37; + static constexpr int max_exponent = 128; + static constexpr int max_exponent10 = 38; + static constexpr auto traps = numeric_limits::traps; + static constexpr auto tinyness_before = + numeric_limits::tinyness_before; + + static constexpr c10::BFloat16 min() { + return c10::BFloat16(0x0080, c10::BFloat16::from_bits()); + } + static constexpr c10::BFloat16 lowest() { + return c10::BFloat16(0xFF7F, c10::BFloat16::from_bits()); + } + static constexpr c10::BFloat16 max() { + return c10::BFloat16(0x7F7F, c10::BFloat16::from_bits()); + } + static constexpr c10::BFloat16 epsilon() { + return c10::BFloat16(0x3C00, c10::BFloat16::from_bits()); + } + static constexpr c10::BFloat16 round_error() { + return c10::BFloat16(0x3F00, c10::BFloat16::from_bits()); + } + static constexpr c10::BFloat16 infinity() { + return c10::BFloat16(0x7F80, c10::BFloat16::from_bits()); + } + static constexpr c10::BFloat16 quiet_NaN() { + return 
c10::BFloat16(0x7FC0, c10::BFloat16::from_bits()); + } + static constexpr c10::BFloat16 signaling_NaN() { + return c10::BFloat16(0x7F80, c10::BFloat16::from_bits()); + } + static constexpr c10::BFloat16 denorm_min() { + return c10::BFloat16(0x0001, c10::BFloat16::from_bits()); + } +}; + +} // namespace std + +C10_CLANG_DIAGNOSTIC_POP() diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/BFloat16-math.h b/MLPY/Lib/site-packages/torch/include/c10/util/BFloat16-math.h new file mode 100644 index 0000000000000000000000000000000000000000..63c48046cf5df668db477b60db19903a966a9b59 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/BFloat16-math.h @@ -0,0 +1,287 @@ +#pragma once + +#include +#include + +C10_CLANG_DIAGNOSTIC_PUSH() +#if C10_CLANG_HAS_WARNING("-Wimplicit-float-conversion") +C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-float-conversion") +#endif + +namespace std { + +template +struct is_reduced_floating_point + : std::integral_constant< + bool, + std::is_same_v || std::is_same_v> {}; + +template +constexpr bool is_reduced_floating_point_v = + is_reduced_floating_point::value; + +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T acos(T a) { + return std::acos(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T asin(T a) { + return std::asin(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T atan(T a) { + return std::atan(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T atanh(T a) { + return std::atanh(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T erf(T a) { + return std::erf(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T erfc(T a) { + return std::erfc(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T exp(T a) { + return std::exp(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T expm1(T a) { + return std::expm1(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T log(T a) { + return std::log(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T log10(T a) { + return std::log10(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T log1p(T a) { + return std::log1p(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T log2(T a) { + return std::log2(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T ceil(T a) { + return std::ceil(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T cos(T a) { + return std::cos(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T floor(T a) { + return std::floor(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T nearbyint(T a) { + return std::nearbyint(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T sin(T a) { + return std::sin(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T tan(T a) { + return std::tan(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T sinh(T a) { + return std::sinh(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T cosh(T a) { + return std::cosh(float(a)); +} +template < + typename T, + 
typename std::enable_if_t, int> = 0> +inline T tanh(T a) { + return std::tanh(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T trunc(T a) { + return std::trunc(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T lgamma(T a) { + return std::lgamma(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T sqrt(T a) { + return std::sqrt(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T rsqrt(T a) { + return 1.0 / std::sqrt(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T abs(T a) { + return std::abs(float(a)); +} +#if defined(_MSC_VER) && defined(__CUDACC__) +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T pow(T a, double b) { + return std::pow(float(a), float(b)); +} +#else +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T pow(T a, double b) { + return std::pow(float(a), b); +} +#endif +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T pow(T a, T b) { + return std::pow(float(a), float(b)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T fmod(T a, T b) { + return std::fmod(float(a), float(b)); +} + +/* + The following function is inspired from the implementation in `musl` + Link to License: https://git.musl-libc.org/cgit/musl/tree/COPYRIGHT + ---------------------------------------------------------------------- + Copyright © 2005-2020 Rich Felker, et al. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + ---------------------------------------------------------------------- + */ +template < + typename T, + typename std::enable_if_t, int> = 0> +C10_HOST_DEVICE inline T nextafter(T from, T to) { + // Reference: + // https://git.musl-libc.org/cgit/musl/tree/src/math/nextafter.c + using int_repr_t = uint16_t; + using float_t = T; + constexpr uint8_t bits = 16; + union { + float_t f; + int_repr_t i; + } ufrom = {from}, uto = {to}; + + // get a mask to get the sign bit i.e. MSB + int_repr_t sign_mask = int_repr_t{1} << (bits - 1); + + // short-circuit: if either is NaN, return NaN + if (from != from || to != to) { + return from + to; + } + + // short-circuit: if they are exactly the same. + if (ufrom.i == uto.i) { + return from; + } + + // mask the sign-bit to zero i.e. 
positive + // equivalent to abs(x) + int_repr_t abs_from = ufrom.i & ~sign_mask; + int_repr_t abs_to = uto.i & ~sign_mask; + if (abs_from == 0) { + // if both are zero but with different sign, + // preserve the sign of `to`. + if (abs_to == 0) { + return to; + } + // smallest subnormal with sign of `to`. + ufrom.i = (uto.i & sign_mask) | int_repr_t{1}; + return ufrom.f; + } + + // if abs(from) > abs(to) or sign(from) != sign(to) + if (abs_from > abs_to || ((ufrom.i ^ uto.i) & sign_mask)) { + ufrom.i--; + } else { + ufrom.i++; + } + + return ufrom.f; +} + +} // namespace std + +C10_CLANG_DIAGNOSTIC_POP() diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/BFloat16.h b/MLPY/Lib/site-packages/torch/include/c10/util/BFloat16.h new file mode 100644 index 0000000000000000000000000000000000000000..7a4df7d934c680e945df4be7e85da7e4eab4d3ae --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/BFloat16.h @@ -0,0 +1,117 @@ +#pragma once + +// Defines the bloat16 type (brain floating-point). This representation uses +// 1 bit for the sign, 8 bits for the exponent and 7 bits for the mantissa. + +#include +#include +#include +#include + +#if defined(__CUDACC__) && !defined(USE_ROCM) +#include +#endif + +#if defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS) +#if defined(CL_SYCL_LANGUAGE_VERSION) +#include // for SYCL 1.2.1 +#else +#include // for SYCL 2020 +#endif +#include +#endif + +namespace c10 { + +namespace detail { +inline C10_HOST_DEVICE float f32_from_bits(uint16_t src) { + float res = 0; + uint32_t tmp = src; + tmp <<= 16; + +#if defined(USE_ROCM) + float* tempRes; + + // We should be using memcpy in order to respect the strict aliasing rule + // but it fails in the HIP environment. + tempRes = reinterpret_cast(&tmp); + res = *tempRes; +#else + std::memcpy(&res, &tmp, sizeof(tmp)); +#endif + + return res; +} + +inline C10_HOST_DEVICE uint16_t bits_from_f32(float src) { + uint32_t res = 0; + +#if defined(USE_ROCM) + // We should be using memcpy in order to respect the strict aliasing rule + // but it fails in the HIP environment. 
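  // Illustrative aside (not from the upstream header): the scheme these
  // helpers rely on is simply that bfloat16 is the upper 16 bits of an IEEE
  // float32. A minimal sketch of that relationship; the helper name below is
  // invented for illustration only:
  //
  //   #include <cstdint>
  //   #include <cstring>
  //
  //   inline float bf16_bits_to_float_sketch(uint16_t bits) {
  //     // 0x3F80 (bfloat16 1.0) widens to 0x3F800000, which is 1.0f.
  //     uint32_t widened = static_cast<uint32_t>(bits) << 16;
  //     float out;
  //     std::memcpy(&out, &widened, sizeof(out));
  //     return out;
  //   }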
+ uint32_t* tempRes = reinterpret_cast(&src); + res = *tempRes; +#else + std::memcpy(&res, &src, sizeof(res)); +#endif + + return res >> 16; +} + +inline C10_HOST_DEVICE uint16_t round_to_nearest_even(float src) { +#if defined(USE_ROCM) + if (src != src) { +#elif defined(_MSC_VER) + if (isnan(src)) { +#else + if (std::isnan(src)) { +#endif + return UINT16_C(0x7FC0); + } else { + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) + union { + uint32_t U32; + float F32; + }; + + F32 = src; + uint32_t rounding_bias = ((U32 >> 16) & 1) + UINT32_C(0x7FFF); + return static_cast((U32 + rounding_bias) >> 16); + } +} +} // namespace detail + +struct alignas(2) BFloat16 { + uint16_t x; + + // HIP wants __host__ __device__ tag, CUDA does not +#if defined(USE_ROCM) + C10_HOST_DEVICE BFloat16() = default; +#else + BFloat16() = default; +#endif + + struct from_bits_t {}; + static constexpr C10_HOST_DEVICE from_bits_t from_bits() { + return from_bits_t(); + } + + constexpr C10_HOST_DEVICE BFloat16(unsigned short bits, from_bits_t) + : x(bits){}; + inline C10_HOST_DEVICE BFloat16(float value); + inline C10_HOST_DEVICE operator float() const; + +#if defined(__CUDACC__) && !defined(USE_ROCM) + inline C10_HOST_DEVICE BFloat16(const __nv_bfloat16& value); + explicit inline C10_HOST_DEVICE operator __nv_bfloat16() const; +#endif + +#if defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS) + inline C10_HOST_DEVICE BFloat16(const sycl::ext::oneapi::bfloat16& value); + explicit inline C10_HOST_DEVICE operator sycl::ext::oneapi::bfloat16() const; +#endif +}; + +} // namespace c10 + +#include // IWYU pragma: keep diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/Backtrace.h b/MLPY/Lib/site-packages/torch/include/c10/util/Backtrace.h new file mode 100644 index 0000000000000000000000000000000000000000..f2c21db94abd40a3a3d2c1942d14bfdae289070c --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/Backtrace.h @@ -0,0 +1,17 @@ +#ifndef C10_UTIL_BACKTRACE_H_ +#define C10_UTIL_BACKTRACE_H_ + +#include +#include +#include + +#include + +namespace c10 { +C10_API std::string get_backtrace( + size_t frames_to_skip = 0, + size_t maximum_number_of_frames = 64, + bool skip_python_frames = true); +} // namespace c10 + +#endif // C10_UTIL_BACKTRACE_H_ diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/Bitset.h b/MLPY/Lib/site-packages/torch/include/c10/util/Bitset.h new file mode 100644 index 0000000000000000000000000000000000000000..f66282e62e79c1932ff86dcdb75ab929b4e72be9 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/Bitset.h @@ -0,0 +1,116 @@ +#pragma once + +#include +#if defined(_MSC_VER) +#include +#endif + +namespace c10::utils { + +/** + * This is a simple bitset class with sizeof(long long int) bits. + * You can set bits, unset bits, query bits by index, + * and query for the first set bit. + * Before using this class, please also take a look at std::bitset, + * which has more functionality and is more generic. It is probably + * a better fit for your use case. The sole reason for c10::utils::bitset + * to exist is that std::bitset misses a find_first_set() method. 
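 *
 * A minimal usage sketch (illustrative, relying only on the members defined
 * below):
 *
 *   c10::utils::bitset dims;
 *   dims.set(0);
 *   dims.set(3);
 *   bool has_third = dims.get(3);            // true
 *   dims.for_each_set_bit([](size_t index) {
 *     // invoked with index == 0, then index == 3
 *   });
 *   dims.unset(0);
 *   bool empty = dims.is_entirely_unset();   // false, bit 3 is still set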
+ */ +struct bitset final { + private: +#if defined(_MSC_VER) + // MSVCs _BitScanForward64 expects int64_t + using bitset_type = int64_t; +#else + // POSIX ffsll expects long long int + using bitset_type = long long int; +#endif + public: + static constexpr size_t NUM_BITS() { + return 8 * sizeof(bitset_type); + } + + constexpr bitset() noexcept = default; + constexpr bitset(const bitset&) noexcept = default; + constexpr bitset(bitset&&) noexcept = default; + // there is an issure for gcc 5.3.0 when define default function as constexpr + // see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68754. + bitset& operator=(const bitset&) noexcept = default; + bitset& operator=(bitset&&) noexcept = default; + + constexpr void set(size_t index) noexcept { + bitset_ |= (static_cast(1) << index); + } + + constexpr void unset(size_t index) noexcept { + bitset_ &= ~(static_cast(1) << index); + } + + constexpr bool get(size_t index) const noexcept { + return bitset_ & (static_cast(1) << index); + } + + constexpr bool is_entirely_unset() const noexcept { + return 0 == bitset_; + } + + // Call the given functor with the index of each bit that is set + template + void for_each_set_bit(Func&& func) const { + bitset cur = *this; + size_t index = cur.find_first_set(); + while (0 != index) { + // -1 because find_first_set() is not one-indexed. + index -= 1; + func(index); + cur.unset(index); + index = cur.find_first_set(); + } + } + + private: + // Return the index of the first set bit. The returned index is one-indexed + // (i.e. if the very first bit is set, this function returns '1'), and a + // return of '0' means that there was no bit set. + size_t find_first_set() const { +#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_ARM64)) + unsigned long result; + bool has_bits_set = (0 != _BitScanForward64(&result, bitset_)); + if (!has_bits_set) { + return 0; + } + return result + 1; +#elif defined(_MSC_VER) && defined(_M_IX86) + unsigned long result; + if (static_cast(bitset_) != 0) { + bool has_bits_set = + (0 != _BitScanForward(&result, static_cast(bitset_))); + if (!has_bits_set) { + return 0; + } + return result + 1; + } else { + bool has_bits_set = + (0 != _BitScanForward(&result, static_cast(bitset_ >> 32))); + if (!has_bits_set) { + return 32; + } + return result + 33; + } +#else + return __builtin_ffsll(bitset_); +#endif + } + + friend bool operator==(bitset lhs, bitset rhs) noexcept { + return lhs.bitset_ == rhs.bitset_; + } + + bitset_type bitset_{0}; +}; + +inline bool operator!=(bitset lhs, bitset rhs) noexcept { + return !(lhs == rhs); +} + +} // namespace c10::utils diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/C++17.h b/MLPY/Lib/site-packages/torch/include/c10/util/C++17.h new file mode 100644 index 0000000000000000000000000000000000000000..448621b758ca01ce3ab976e7d0a7ab14bdafbc12 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/C++17.h @@ -0,0 +1,166 @@ +#pragma once +#ifndef C10_UTIL_CPP17_H_ +#define C10_UTIL_CPP17_H_ + +#include +#include +#include +#include +#include + +#if !defined(__clang__) && !defined(_MSC_VER) && defined(__GNUC__) && \ + __GNUC__ < 9 +#error \ + "You're trying to build PyTorch with a too old version of GCC. We need GCC 9 or later." +#endif + +#if defined(__clang__) && __clang_major__ < 9 +#error \ + "You're trying to build PyTorch with a too old version of Clang. We need Clang 9 or later." 
+#endif + +#if (defined(_MSC_VER) && (!defined(_MSVC_LANG) || _MSVC_LANG < 201703L)) || \ + (!defined(_MSC_VER) && __cplusplus < 201703L) +#error You need C++17 to compile PyTorch +#endif + +#if defined(_WIN32) && (defined(min) || defined(max)) +#error Macro clash with min and max -- define NOMINMAX when compiling your program on Windows +#endif + +/* + * This header adds some polyfills with C++17 functionality + */ + +namespace c10 { + +// in c++17 std::result_of has been superseded by std::invoke_result. Since +// c++20, std::result_of is removed. +template +#if defined(__cpp_lib_is_invocable) && __cpp_lib_is_invocable >= 201703L +using invoke_result = typename std::invoke_result; +#else +using invoke_result = typename std::result_of; +#endif + +template +using invoke_result_t = typename invoke_result::type; + +// std::is_pod is deprecated in C++20, std::is_standard_layout and +// std::is_trivial are introduced in C++11, std::conjunction has been introduced +// in C++17. +template +using is_pod = std::conjunction, std::is_trivial>; + +template +constexpr bool is_pod_v = is_pod::value; + +namespace guts { + +template +std::enable_if_t< + !std::is_array_v && !std::is_array_v && + std::is_base_of_v, + std::unique_ptr> +make_unique_base(Args&&... args) { + return std::unique_ptr(new Child(std::forward(args)...)); +} + +template +using conjunction = std::conjunction; +template +using disjunction = std::disjunction; +template +using bool_constant = std::bool_constant; +template +using negation = std::negation; + +template +using void_t = std::void_t; + +#if defined(__cpp_lib_apply) && !defined(__CUDA_ARCH__) && !defined(__HIP__) + +template +C10_HOST_DEVICE inline constexpr decltype(auto) apply(F&& f, Tuple&& t) { + return std::apply(std::forward(f), std::forward(t)); +} + +#else + +// Implementation from http://en.cppreference.com/w/cpp/utility/apply (but +// modified) +// TODO This is an incomplete implementation of std::apply, not working for +// member functions. +namespace detail { +template +#if defined(_MSC_VER) +// MSVC has a problem with the decltype() return type, but it also doesn't need +// it +C10_HOST_DEVICE constexpr auto apply_impl( + F&& f, + Tuple&& t, + std::index_sequence) +#else +// GCC/Clang need the decltype() return type +C10_HOST_DEVICE constexpr decltype(auto) apply_impl( + F&& f, + Tuple&& t, + std::index_sequence) +#endif +{ + return std::forward(f)(std::get(std::forward(t))...); +} +} // namespace detail + +template +C10_HOST_DEVICE constexpr decltype(auto) apply(F&& f, Tuple&& t) { + return detail::apply_impl( + std::forward(f), + std::forward(t), + std::make_index_sequence< + std::tuple_size>::value>{}); +} + +#endif + +template +std::enable_if_t< + std::is_member_pointer_v>, + typename c10::invoke_result_t> +invoke(Functor&& f, Args&&... args) { + return std::mem_fn(std::forward(f))(std::forward(args)...); +} + +template +std::enable_if_t< + !std::is_member_pointer_v>, + typename c10::invoke_result_t> +invoke(Functor&& f, Args&&... 
args) { + return std::forward(f)(std::forward(args)...); +} + +namespace detail { +struct _identity final { + template + using type_identity = T; + + template + decltype(auto) operator()(T&& arg) { + return std::forward(arg); + } +}; + +template +struct function_takes_identity_argument : std::false_type {}; + +template +struct function_takes_identity_argument< + Func, + std::void_t()(_identity()))>> : std::true_type { +}; +} // namespace detail + +} // namespace guts +} // namespace c10 + +#endif // C10_UTIL_CPP17_H_ diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/CallOnce.h b/MLPY/Lib/site-packages/torch/include/c10/util/CallOnce.h new file mode 100644 index 0000000000000000000000000000000000000000..5ddac80f7c694dbad19076a6448481f63242580e --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/CallOnce.h @@ -0,0 +1,67 @@ +#pragma once + +#include +#include +#include + +#include +#include + +namespace c10 { + +// custom c10 call_once implementation to avoid the deadlock in std::call_once. +// The implementation here is a simplified version from folly and likely much +// much higher memory footprint. +template +inline void call_once(Flag& flag, F&& f, Args&&... args) { + if (C10_LIKELY(flag.test_once())) { + return; + } + flag.call_once_slow(std::forward(f), std::forward(args)...); +} + +class once_flag { + public: +#ifndef _WIN32 + // running into build error on MSVC. Can't seem to get a repro locally so I'm + // just avoiding constexpr + // + // C:/actions-runner/_work/pytorch/pytorch\c10/util/CallOnce.h(26): error: + // defaulted default constructor cannot be constexpr because the + // corresponding implicitly declared default constructor would not be + // constexpr 1 error detected in the compilation of + // "C:/actions-runner/_work/pytorch/pytorch/aten/src/ATen/cuda/cub.cu". + constexpr +#endif + once_flag() noexcept = default; + once_flag(const once_flag&) = delete; + once_flag& operator=(const once_flag&) = delete; + + private: + template + friend void call_once(Flag& flag, F&& f, Args&&... args); + + template + void call_once_slow(F&& f, Args&&... 
args) { + std::lock_guard guard(mutex_); + if (init_.load(std::memory_order_relaxed)) { + return; + } + c10::guts::invoke(std::forward(f), std::forward(args)...); + init_.store(true, std::memory_order_release); + } + + bool test_once() { + return init_.load(std::memory_order_acquire); + } + + void reset_once() { + init_.store(false, std::memory_order_release); + } + + private: + std::mutex mutex_; + std::atomic init_{false}; +}; + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/ConstexprCrc.h b/MLPY/Lib/site-packages/torch/include/c10/util/ConstexprCrc.h new file mode 100644 index 0000000000000000000000000000000000000000..5fb725370184efdf20ec36685006cd97bcd041da --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/ConstexprCrc.h @@ -0,0 +1,130 @@ +#pragma once + +#include +#include +#include +#include + +namespace c10::util { + +namespace detail { +// NOLINTNEXTLINE(*c-arrays*) +constexpr uint64_t crc64_table[] = { + 0x0000000000000000, 0x7ad870c830358979, 0xf5b0e190606b12f2, + 0x8f689158505e9b8b, 0xc038e5739841b68f, 0xbae095bba8743ff6, + 0x358804e3f82aa47d, 0x4f50742bc81f2d04, 0xab28ecb46814fe75, + 0xd1f09c7c5821770c, 0x5e980d24087fec87, 0x24407dec384a65fe, + 0x6b1009c7f05548fa, 0x11c8790fc060c183, 0x9ea0e857903e5a08, + 0xe478989fa00bd371, 0x7d08ff3b88be6f81, 0x07d08ff3b88be6f8, + 0x88b81eabe8d57d73, 0xf2606e63d8e0f40a, 0xbd301a4810ffd90e, + 0xc7e86a8020ca5077, 0x4880fbd87094cbfc, 0x32588b1040a14285, + 0xd620138fe0aa91f4, 0xacf86347d09f188d, 0x2390f21f80c18306, + 0x594882d7b0f40a7f, 0x1618f6fc78eb277b, 0x6cc0863448deae02, + 0xe3a8176c18803589, 0x997067a428b5bcf0, 0xfa11fe77117cdf02, + 0x80c98ebf2149567b, 0x0fa11fe77117cdf0, 0x75796f2f41224489, + 0x3a291b04893d698d, 0x40f16bccb908e0f4, 0xcf99fa94e9567b7f, + 0xb5418a5cd963f206, 0x513912c379682177, 0x2be1620b495da80e, + 0xa489f35319033385, 0xde51839b2936bafc, 0x9101f7b0e12997f8, + 0xebd98778d11c1e81, 0x64b116208142850a, 0x1e6966e8b1770c73, + 0x8719014c99c2b083, 0xfdc17184a9f739fa, 0x72a9e0dcf9a9a271, + 0x08719014c99c2b08, 0x4721e43f0183060c, 0x3df994f731b68f75, + 0xb29105af61e814fe, 0xc849756751dd9d87, 0x2c31edf8f1d64ef6, + 0x56e99d30c1e3c78f, 0xd9810c6891bd5c04, 0xa3597ca0a188d57d, + 0xec09088b6997f879, 0x96d1784359a27100, 0x19b9e91b09fcea8b, + 0x636199d339c963f2, 0xdf7adabd7a6e2d6f, 0xa5a2aa754a5ba416, + 0x2aca3b2d1a053f9d, 0x50124be52a30b6e4, 0x1f423fcee22f9be0, + 0x659a4f06d21a1299, 0xeaf2de5e82448912, 0x902aae96b271006b, + 0x74523609127ad31a, 0x0e8a46c1224f5a63, 0x81e2d7997211c1e8, + 0xfb3aa75142244891, 0xb46ad37a8a3b6595, 0xceb2a3b2ba0eecec, + 0x41da32eaea507767, 0x3b024222da65fe1e, 0xa2722586f2d042ee, + 0xd8aa554ec2e5cb97, 0x57c2c41692bb501c, 0x2d1ab4dea28ed965, + 0x624ac0f56a91f461, 0x1892b03d5aa47d18, 0x97fa21650afae693, + 0xed2251ad3acf6fea, 0x095ac9329ac4bc9b, 0x7382b9faaaf135e2, + 0xfcea28a2faafae69, 0x8632586aca9a2710, 0xc9622c4102850a14, + 0xb3ba5c8932b0836d, 0x3cd2cdd162ee18e6, 0x460abd1952db919f, + 0x256b24ca6b12f26d, 0x5fb354025b277b14, 0xd0dbc55a0b79e09f, + 0xaa03b5923b4c69e6, 0xe553c1b9f35344e2, 0x9f8bb171c366cd9b, + 0x10e3202993385610, 0x6a3b50e1a30ddf69, 0x8e43c87e03060c18, + 0xf49bb8b633338561, 0x7bf329ee636d1eea, 0x012b592653589793, + 0x4e7b2d0d9b47ba97, 0x34a35dc5ab7233ee, 0xbbcbcc9dfb2ca865, + 0xc113bc55cb19211c, 0x5863dbf1e3ac9dec, 0x22bbab39d3991495, + 0xadd33a6183c78f1e, 0xd70b4aa9b3f20667, 0x985b3e827bed2b63, + 0xe2834e4a4bd8a21a, 0x6debdf121b863991, 0x1733afda2bb3b0e8, + 0xf34b37458bb86399, 0x8993478dbb8deae0, 0x06fbd6d5ebd3716b, + 0x7c23a61ddbe6f812, 
0x3373d23613f9d516, 0x49aba2fe23cc5c6f, + 0xc6c333a67392c7e4, 0xbc1b436e43a74e9d, 0x95ac9329ac4bc9b5, + 0xef74e3e19c7e40cc, 0x601c72b9cc20db47, 0x1ac40271fc15523e, + 0x5594765a340a7f3a, 0x2f4c0692043ff643, 0xa02497ca54616dc8, + 0xdafce7026454e4b1, 0x3e847f9dc45f37c0, 0x445c0f55f46abeb9, + 0xcb349e0da4342532, 0xb1eceec59401ac4b, 0xfebc9aee5c1e814f, + 0x8464ea266c2b0836, 0x0b0c7b7e3c7593bd, 0x71d40bb60c401ac4, + 0xe8a46c1224f5a634, 0x927c1cda14c02f4d, 0x1d148d82449eb4c6, + 0x67ccfd4a74ab3dbf, 0x289c8961bcb410bb, 0x5244f9a98c8199c2, + 0xdd2c68f1dcdf0249, 0xa7f41839ecea8b30, 0x438c80a64ce15841, + 0x3954f06e7cd4d138, 0xb63c61362c8a4ab3, 0xcce411fe1cbfc3ca, + 0x83b465d5d4a0eece, 0xf96c151de49567b7, 0x76048445b4cbfc3c, + 0x0cdcf48d84fe7545, 0x6fbd6d5ebd3716b7, 0x15651d968d029fce, + 0x9a0d8ccedd5c0445, 0xe0d5fc06ed698d3c, 0xaf85882d2576a038, + 0xd55df8e515432941, 0x5a3569bd451db2ca, 0x20ed197575283bb3, + 0xc49581ead523e8c2, 0xbe4df122e51661bb, 0x3125607ab548fa30, + 0x4bfd10b2857d7349, 0x04ad64994d625e4d, 0x7e7514517d57d734, + 0xf11d85092d094cbf, 0x8bc5f5c11d3cc5c6, 0x12b5926535897936, + 0x686de2ad05bcf04f, 0xe70573f555e26bc4, 0x9ddd033d65d7e2bd, + 0xd28d7716adc8cfb9, 0xa85507de9dfd46c0, 0x273d9686cda3dd4b, + 0x5de5e64efd965432, 0xb99d7ed15d9d8743, 0xc3450e196da80e3a, + 0x4c2d9f413df695b1, 0x36f5ef890dc31cc8, 0x79a59ba2c5dc31cc, + 0x037deb6af5e9b8b5, 0x8c157a32a5b7233e, 0xf6cd0afa9582aa47, + 0x4ad64994d625e4da, 0x300e395ce6106da3, 0xbf66a804b64ef628, + 0xc5bed8cc867b7f51, 0x8aeeace74e645255, 0xf036dc2f7e51db2c, + 0x7f5e4d772e0f40a7, 0x05863dbf1e3ac9de, 0xe1fea520be311aaf, + 0x9b26d5e88e0493d6, 0x144e44b0de5a085d, 0x6e963478ee6f8124, + 0x21c640532670ac20, 0x5b1e309b16452559, 0xd476a1c3461bbed2, + 0xaeaed10b762e37ab, 0x37deb6af5e9b8b5b, 0x4d06c6676eae0222, + 0xc26e573f3ef099a9, 0xb8b627f70ec510d0, 0xf7e653dcc6da3dd4, + 0x8d3e2314f6efb4ad, 0x0256b24ca6b12f26, 0x788ec2849684a65f, + 0x9cf65a1b368f752e, 0xe62e2ad306bafc57, 0x6946bb8b56e467dc, + 0x139ecb4366d1eea5, 0x5ccebf68aecec3a1, 0x2616cfa09efb4ad8, + 0xa97e5ef8cea5d153, 0xd3a62e30fe90582a, 0xb0c7b7e3c7593bd8, + 0xca1fc72bf76cb2a1, 0x45775673a732292a, 0x3faf26bb9707a053, + 0x70ff52905f188d57, 0x0a2722586f2d042e, 0x854fb3003f739fa5, + 0xff97c3c80f4616dc, 0x1bef5b57af4dc5ad, 0x61372b9f9f784cd4, + 0xee5fbac7cf26d75f, 0x9487ca0fff135e26, 0xdbd7be24370c7322, + 0xa10fceec0739fa5b, 0x2e675fb4576761d0, 0x54bf2f7c6752e8a9, + 0xcdcf48d84fe75459, 0xb71738107fd2dd20, 0x387fa9482f8c46ab, + 0x42a7d9801fb9cfd2, 0x0df7adabd7a6e2d6, 0x772fdd63e7936baf, + 0xf8474c3bb7cdf024, 0x829f3cf387f8795d, 0x66e7a46c27f3aa2c, + 0x1c3fd4a417c62355, 0x935745fc4798b8de, 0xe98f353477ad31a7, + 0xa6df411fbfb21ca3, 0xdc0731d78f8795da, 0x536fa08fdfd90e51, + 0x29b7d047efec8728, +}; + +inline C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA uint64_t +crc64impl(uint64_t accumulator, const char* data, size_t size) { + for (size_t i = 0; i < size; ++i) { + accumulator = + crc64_table[(accumulator ^ data[i]) & 0xFF] ^ (accumulator >> 8); + } + return accumulator; +} +} // namespace detail + +struct crc64_t final : IdWrapper { + constexpr crc64_t(uint64_t checksum) : IdWrapper(checksum) {} + constexpr uint64_t checksum() const { + return this->underlyingId(); + } +}; + +// CRC64 with Jones coefficients and an init value of 0. 
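// A minimal usage sketch (illustrative): since the routines below are
// constexpr on most toolchains, the checksum of a string literal can be
// computed at compile time.
//
//   constexpr c10::util::crc64_t kTag = c10::util::crc64("conv2d", 6);
//   constexpr uint64_t raw = kTag.checksum();  // stable 64-bit fingerprint
//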
+inline C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA crc64_t +crc64(const char* str, size_t size) { + return crc64_t{detail::crc64impl(0, str, size)}; +} + +inline C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA crc64_t crc64(c10::string_view str) { + return crc64(str.data(), str.size()); +} +} // namespace c10::util + +// Allow usage of crc64_t in std::unordered_set +C10_DEFINE_HASH_FOR_IDWRAPPER(c10::util::crc64_t); diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/DeadlockDetection.h b/MLPY/Lib/site-packages/torch/include/c10/util/DeadlockDetection.h new file mode 100644 index 0000000000000000000000000000000000000000..ee7ce021e69532c3b7f0e067c301391222928ba1 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/DeadlockDetection.h @@ -0,0 +1,48 @@ +#pragma once + +#include +#include + +/// This file provides some simple utilities for detecting common deadlocks in +/// PyTorch. For now, we focus exclusively on detecting Python GIL deadlocks, +/// as the GIL is a wide ranging lock that is taken out in many situations. +/// The basic strategy is before performing an operation that may block, you +/// can use TORCH_ASSERT_NO_GIL_WITHOUT_PYTHON_DEP() to assert that the GIL is +/// not held. This macro is to be used in contexts where no static dependency +/// on Python is available (we will handle indirecting a virtual call for you). +/// +/// If the GIL is held by a torchdeploy interpreter, we always report false. +/// If you are in a context where Python bindings are available, it's better +/// to directly assert on PyGILState_Check (as it avoids a vcall and also +/// works correctly with torchdeploy.) + +#define TORCH_ASSERT_NO_GIL_WITHOUT_PYTHON_DEP() \ + TORCH_INTERNAL_ASSERT( \ + !c10::impl::check_python_gil(), \ + "Holding GIL before a blocking operation! Please release the GIL before blocking, or see https://github.com/pytorch/pytorch/issues/56297 for how to release the GIL for destructors of objects") + +namespace c10::impl { + +C10_API bool check_python_gil(); + +struct C10_API PythonGILHooks { + virtual ~PythonGILHooks() = default; + // Returns true if we hold the GIL. If not linked against Python we + // always return false. + virtual bool check_python_gil() const = 0; +}; + +C10_API void SetPythonGILHooks(PythonGILHooks* factory); + +// DO NOT call this registerer from a torch deploy instance! You will clobber +// other registrations +struct C10_API PythonGILHooksRegisterer { + explicit PythonGILHooksRegisterer(PythonGILHooks* factory) { + SetPythonGILHooks(factory); + } + ~PythonGILHooksRegisterer() { + SetPythonGILHooks(nullptr); + } +}; + +} // namespace c10::impl diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/Deprecated.h b/MLPY/Lib/site-packages/torch/include/c10/util/Deprecated.h new file mode 100644 index 0000000000000000000000000000000000000000..6242b93ea400702ed47c7ddcaa6f25b325359bd3 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/Deprecated.h @@ -0,0 +1,102 @@ +#pragma once + +/** + * This file provides portable macros for marking declarations + * as deprecated. You should generally use C10_DEPRECATED, + * except when marking 'using' declarations as deprecated, + * in which case you should use C10_DEFINE_DEPRECATED_USING + * (due to portability concerns). + */ + +// Sample usage: +// +// C10_DEPRECATED void bad_func(); +// struct C10_DEPRECATED BadStruct { +// ... +// }; + +// NB: __cplusplus doesn't work for MSVC, so for now MSVC always uses +// the "__declspec(deprecated)" implementation and not the C++14 +// "[[deprecated]]" attribute. 
We tried enabling "[[deprecated]]" for C++14 on +// MSVC, but ran into issues with some older MSVC versions. +#if (defined(__cplusplus) && __cplusplus >= 201402L) +#define C10_DEPRECATED [[deprecated]] +#define C10_DEPRECATED_MESSAGE(message) [[deprecated(message)]] +#elif defined(__GNUC__) +#define C10_DEPRECATED __attribute__((deprecated)) +// TODO Is there some way to implement this? +#define C10_DEPRECATED_MESSAGE(message) __attribute__((deprecated)) + +#elif defined(_MSC_VER) +#define C10_DEPRECATED __declspec(deprecated) +#define C10_DEPRECATED_MESSAGE(message) __declspec(deprecated(message)) +#else +#warning "You need to implement C10_DEPRECATED for this compiler" +#define C10_DEPRECATED +#endif + +// Sample usage: +// +// C10_DEFINE_DEPRECATED_USING(BadType, int) +// +// which is the portable version of +// +// using BadType [[deprecated]] = int; + +// technically [[deprecated]] syntax is from c++14 standard, but it works in +// many compilers. +#if defined(__has_cpp_attribute) +#if __has_cpp_attribute(deprecated) && !defined(__CUDACC__) +#define C10_DEFINE_DEPRECATED_USING(TypeName, TypeThingy) \ + using TypeName [[deprecated]] = TypeThingy; +#endif +#endif + +#if defined(_MSC_VER) +#if defined(__CUDACC__) +// neither [[deprecated]] nor __declspec(deprecated) work on nvcc on Windows; +// you get the error: +// +// error: attribute does not apply to any entity +// +// So we just turn the macro off in this case. +#if defined(C10_DEFINE_DEPRECATED_USING) +#undef C10_DEFINE_DEPRECATED_USING +#endif +#define C10_DEFINE_DEPRECATED_USING(TypeName, TypeThingy) \ + using TypeName = TypeThingy; +#else +// [[deprecated]] does work in windows without nvcc, though msc doesn't support +// `__has_cpp_attribute` when c++14 is supported, otherwise +// __declspec(deprecated) is used as the alternative. +#ifndef C10_DEFINE_DEPRECATED_USING +#if defined(_MSVC_LANG) && _MSVC_LANG >= 201402L +#define C10_DEFINE_DEPRECATED_USING(TypeName, TypeThingy) \ + using TypeName [[deprecated]] = TypeThingy; +#else +#define C10_DEFINE_DEPRECATED_USING(TypeName, TypeThingy) \ + using TypeName = __declspec(deprecated) TypeThingy; +#endif +#endif +#endif +#endif + +#if !defined(C10_DEFINE_DEPRECATED_USING) && defined(__GNUC__) +// nvcc has a bug where it doesn't understand __attribute__((deprecated)) +// declarations even when the host compiler supports it. We'll only use this gcc +// attribute when not cuda, and when using a GCC compiler that doesn't support +// the c++14 syntax we checked for above (available in __GNUC__ >= 5) +#if !defined(__CUDACC__) +#define C10_DEFINE_DEPRECATED_USING(TypeName, TypeThingy) \ + using TypeName __attribute__((deprecated)) = TypeThingy; +#else +// using cuda + gcc < 5, neither deprecated syntax is available so turning off. 
+#define C10_DEFINE_DEPRECATED_USING(TypeName, TypeThingy) \ + using TypeName = TypeThingy; +#endif +#endif + +#if !defined(C10_DEFINE_DEPRECATED_USING) +#warning "You need to implement C10_DEFINE_DEPRECATED_USING for this compiler" +#define C10_DEFINE_DEPRECATED_USING +#endif diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/DimVector.h b/MLPY/Lib/site-packages/torch/include/c10/util/DimVector.h new file mode 100644 index 0000000000000000000000000000000000000000..0ae8169e86682bf2d084cba8e8eaa9a186faa9c3 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/DimVector.h @@ -0,0 +1,17 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace c10 { + +constexpr size_t kDimVectorStaticSize = C10_SIZES_AND_STRIDES_MAX_INLINE_SIZE; + +/// A container for sizes or strides +using DimVector = SmallVector; +using SymDimVector = SmallVector; + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/Exception.h b/MLPY/Lib/site-packages/torch/include/c10/util/Exception.h new file mode 100644 index 0000000000000000000000000000000000000000..64cb7351fc47c477f9e12ca31d1ca2e15029d7ef --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/Exception.h @@ -0,0 +1,711 @@ +#ifndef C10_UTIL_EXCEPTION_H_ +#define C10_UTIL_EXCEPTION_H_ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) && _MSC_VER <= 1900 +#define __func__ __FUNCTION__ +#endif + +namespace c10 { + +/// The primary ATen error class. +/// Provides a complete error message with source location information via +/// `what()`, and a more concise message via `what_without_backtrace()`. +/// Don't throw this directly; use TORCH_CHECK/TORCH_INTERNAL_ASSERT instead. +/// +/// NB: c10::Error is handled specially by the default torch to suppress the +/// backtrace, see torch/csrc/Exceptions.h +class C10_API Error : public std::exception { + // The actual error message. + std::string msg_; + + // Context for the message (in order of decreasing specificity). Context will + // be automatically formatted appropriately, so it is not necessary to add + // extra leading/trailing newlines to strings inside this vector + std::vector context_; + + // The C++ backtrace at the point when this exception was raised. This + // may be empty if there is no valid backtrace. (We don't use optional + // here to reduce the dependencies this file has.) + std::string backtrace_; + + // These two are derived fields from msg_stack_ and backtrace_, but we need + // fields for the strings so that we can return a const char* (as the + // signature of std::exception requires). Currently, the invariant + // is that these fields are ALWAYS populated consistently with respect + // to msg_stack_ and backtrace_. + std::string what_; + std::string what_without_backtrace_; + + // This is a little debugging trick: you can stash a relevant pointer + // in caller, and then when you catch the exception, you can compare + // against pointers you have on hand to get more information about + // where the exception came from. In Caffe2, this is used to figure + // out which operator raised an exception. + const void* caller_; + + public: + // PyTorch-style Error constructor. 
NB: the implementation of this + // is actually in Logging.cpp + Error(SourceLocation source_location, std::string msg); + + // Caffe2-style error message + Error( + const char* file, + const uint32_t line, + const char* condition, + const std::string& msg, + const std::string& backtrace, + const void* caller = nullptr); + + // Base constructor + Error(std::string msg, std::string backtrace, const void* caller = nullptr); + + // Add some new context to the message stack. The last added context + // will be formatted at the end of the context list upon printing. + // WARNING: This method is O(n) in the size of the stack, so don't go + // wild adding a ridiculous amount of context to error messages. + void add_context(std::string msg); + + const std::string& msg() const { + return msg_; + } + + const std::vector& context() const { + return context_; + } + + const std::string& backtrace() const { + return backtrace_; + } + + /// Returns the complete error message, including the source location. + /// The returned pointer is invalidated if you call add_context() on + /// this object. + const char* what() const noexcept override { + return what_.c_str(); + } + + const void* caller() const noexcept { + return caller_; + } + + /// Returns only the error message string, without source location. + /// The returned pointer is invalidated if you call add_context() on + /// this object. + virtual const char* what_without_backtrace() const noexcept { + return what_without_backtrace_.c_str(); + } + + private: + void refresh_what(); + std::string compute_what(bool include_backtrace) const; +}; + +class C10_API Warning { + public: + class C10_API UserWarning {}; + class C10_API DeprecationWarning {}; + + using warning_variant_t = std::variant; + + Warning( + warning_variant_t type, + const SourceLocation& source_location, + std::string msg, + bool verbatim); + + Warning( + warning_variant_t type, + SourceLocation source_location, + const char* msg, + bool verbatim); + + Warning( + warning_variant_t type, + SourceLocation source_location, + ::c10::detail::CompileTimeEmptyString msg, + bool verbatim); + + // Getters for members + warning_variant_t type() const; + const SourceLocation& source_location() const; + const std::string& msg() const; + bool verbatim() const; + + private: + // The type of warning + warning_variant_t type_; + + // Where the warning happened. + SourceLocation source_location_; + + // The actual warning message. + std::string msg_; + + // See note: [Verbatim Warnings] + bool verbatim_; +}; + +using UserWarning = Warning::UserWarning; +using DeprecationWarning = Warning::DeprecationWarning; + +// Issue a warning with a given message. Dispatched to the current +// warning handler. +void C10_API warn(const Warning& warning); + +class C10_API WarningHandler { + public: + virtual ~WarningHandler() = default; + /// The default warning handler. Prints the message to stderr. + virtual void process(const Warning& warning); +}; + +namespace WarningUtils { + +// Note: [Verbatim Warnings] +// Warnings originating in C++ code can appear out-of-place to Python users: +// a user runs a line in Python, but the warning references a line in C++. +// Some parts of PyTorch, like the JIT, are cognizant of this mismatch +// and take care to map warnings back to the user's program, but most +// of PyTorch simply throws a context-free warning. To allow warning +// handlers to add context where appropriate, warn takes the +// "verbatim" flag. 
When this is false a warning handler might append +// the C++ warning to a Python warning message that relates the warning +// back to the user's program. Callers who have already accounted for +// context in their warnings should set verbatim to true so their warnings +// appear without modification. + +/// Sets the global warning handler. This is not thread-safe, so it should +/// generally be called once during initialization or while holding the GIL +/// for programs that use python. +/// User is responsible for keeping the WarningHandler alive until +/// it is not needed. +C10_API void set_warning_handler(WarningHandler* handler) noexcept(true); +/// Gets the global warning handler. +C10_API WarningHandler* get_warning_handler() noexcept(true); + +class C10_API WarningHandlerGuard { + WarningHandler* prev_handler_; + + public: + WarningHandlerGuard(WarningHandler* new_handler) + : prev_handler_(c10::WarningUtils::get_warning_handler()) { + c10::WarningUtils::set_warning_handler(new_handler); + } + ~WarningHandlerGuard() { + c10::WarningUtils::set_warning_handler(prev_handler_); + } +}; + +/// The TORCH_WARN_ONCE macro is difficult to test for. Use +/// setWarnAlways(true) to turn it into TORCH_WARN, which can be +/// tested for more easily. +C10_API void set_warnAlways(bool) noexcept(true); +C10_API bool get_warnAlways() noexcept(true); + +// A RAII guard that sets warn_always (not thread-local) on +// construction, and sets it back to the original value upon destruction. +struct C10_API WarnAlways { + public: + explicit WarnAlways(bool setting = true); + ~WarnAlways(); + + private: + bool prev_setting; +}; + +} // namespace WarningUtils + +// Like Error, but we always report the C++ backtrace, instead of only +// reporting when TORCH_SHOW_CPP_STACKTRACES +class C10_API ErrorAlwaysShowCppStacktrace : public Error { + using Error::Error; + const char* what_without_backtrace() const noexcept override { + return what(); + } +}; + +// Used in ATen for out-of-bound indices that can reasonably only be detected +// lazily inside a kernel (See: advanced indexing). These turn into +// IndexError when they cross to Python. +class C10_API IndexError : public Error { + using Error::Error; +}; + +// Used in ATen for invalid values. These turn into +// ValueError when they cross to Python. +class C10_API ValueError : public Error { + using Error::Error; +}; + +// Used in ATen for invalid types. These turn into +// TypeError when they cross to Python. +class C10_API TypeError : public Error { + using Error::Error; +}; + +// Used in ATen for functionality that is not implemented. These turn into +// NotImplementedError when they cross to Python. +class C10_API NotImplementedError : public Error { + using Error::Error; +}; + +// Used in ATen for non finite indices. These turn into +// ExitException when they cross to Python. +class C10_API EnforceFiniteError : public Error { + using Error::Error; +}; + +// Used in Onnxifi backend lowering. These turn into +// ExitException when they cross to Python. +class C10_API OnnxfiBackendSystemError : public Error { + using Error::Error; +}; + +// Used for numerical errors from the linalg module. These +// turn into LinAlgError when they cross into Python. +class C10_API LinAlgError : public Error { + using Error::Error; +}; + +class C10_API OutOfMemoryError : public Error { + using Error::Error; +}; + +// Base error type for all distributed errors. +// These turn into DistError when they cross into Python. 
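// Aside on the exception family above (an illustrative sketch, not upstream
// documentation): these subclasses are normally raised through the typed
// TORCH_CHECK_* macros defined later in this header rather than constructed
// directly. Function and argument names here are invented for the example:
//
//   void narrow_sketch(int64_t start, int64_t length) {
//     TORCH_CHECK_VALUE(length >= 0, "length must be non-negative, got ", length);
//     TORCH_CHECK_INDEX(start >= 0, "start index out of range: ", start);
//     // On failure these throw c10::ValueError / c10::IndexError, which the
//     // Python bindings surface as ValueError / IndexError.
//   }
//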
+class C10_API DistError : public Error { + using Error::Error; +}; + +// Used for collective communication library errors from the distributed module. +// These turn into DistBackendError when they cross into Python. +class C10_API DistBackendError : public DistError { + using DistError::DistError; +}; + +// Used for errors originating from the store. +// These turn into DistStoreError when they cross into Python. +class C10_API DistStoreError : public DistError { + using DistError::DistError; +}; + +// Used for errors originating from the TCP/IP stack and not from collective +// libraries. These turn into DistNetworkError when they cross into Python. +class C10_API DistNetworkError : public DistError { + using DistError::DistError; +}; + +// A utility function to return an exception std::string by prepending its +// exception type before its what() content +C10_API std::string GetExceptionString(const std::exception& e); + +} // namespace c10 + +// Private helper macro for implementing TORCH_INTERNAL_ASSERT and TORCH_CHECK +// +// Note: In the debug build With MSVC, __LINE__ might be of long type (a.k.a +// int32_t), which is different from the definition of `SourceLocation` that +// requires unsigned int (a.k.a uint32_t) and may cause a compile error with the +// message: error C2397: conversion from 'long' to 'uint32_t' requires a +// narrowing conversion Here the static cast is used to pass the build. if this +// is used inside a lambda the __func__ macro expands to operator(), which isn't +// very useful, but hard to fix in a macro so suppressing the warning. +#define C10_THROW_ERROR(err_type, msg) \ + throw ::c10::err_type( \ + {__func__, __FILE__, static_cast(__LINE__)}, msg) + +#define C10_BUILD_ERROR(err_type, msg) \ + ::c10::err_type({__func__, __FILE__, static_cast(__LINE__)}, msg) + +// Private helper macro for workaround MSVC misexpansion of nested macro +// invocations involving __VA_ARGS__. See +// https://stackoverflow.com/questions/5134523/msvc-doesnt-expand-va-args-correctly +#define C10_EXPAND_MSVC_WORKAROUND(x) x + +// On nvcc, C10_UNLIKELY thwarts missing return statement analysis. In cases +// where the unlikely expression may be a constant, use this macro to ensure +// return statement analysis keeps working (at the cost of not getting the +// likely/unlikely annotation on nvcc). +// https://github.com/pytorch/pytorch/issues/21418 +// +// Currently, this is only used in the error reporting macros below. If you +// want to use it more generally, move me to Macros.h +// +// TODO: Brian Vaughan observed that we might be able to get this to work on +// nvcc by writing some sort of C++ overload that distinguishes constexpr inputs +// from non-constexpr. Since there isn't any evidence that losing C10_UNLIKELY +// in nvcc is causing us perf problems, this is not yet implemented, but this +// might be an interesting piece of C++ code for an intrepid bootcamper to +// write. +#if defined(__CUDACC__) +#define C10_UNLIKELY_OR_CONST(e) e +#else +#define C10_UNLIKELY_OR_CONST(e) C10_UNLIKELY(e) +#endif + +// ---------------------------------------------------------------------------- +// Error reporting macros +// ---------------------------------------------------------------------------- + +#ifdef STRIP_ERROR_MESSAGES +#define TORCH_RETHROW(e, ...) throw +#else +#define TORCH_RETHROW(e, ...) 
\ + do { \ + e.add_context(::c10::str(__VA_ARGS__)); \ + throw; \ + } while (false) +#endif + +// A utility macro to provide assert()-like functionality; that is, enforcement +// of internal invariants in code. It supports an arbitrary number of extra +// arguments (evaluated only on failure), which will be printed in the assert +// failure message using operator<< (this is useful to print some variables +// which may be useful for debugging.) +// +// Usage: +// TORCH_INTERNAL_ASSERT(should_be_true); +// TORCH_INTERNAL_ASSERT(x == 0, "x = ", x); +// +// Assuming no bugs in PyTorch, the conditions tested by this macro should +// always be true; e.g., it should be possible to disable all of these +// conditions without changing observable user behavior. If you would like to +// do error reporting for user input, please use TORCH_CHECK instead. +// +// NOTE: It is SAFE to use this macro in production code; on failure, this +// simply raises an exception, it does NOT unceremoniously quit the process +// (unlike assert()). +// +#ifdef STRIP_ERROR_MESSAGES +#define TORCH_INTERNAL_ASSERT(cond, ...) \ + if (C10_UNLIKELY_OR_CONST(!(cond))) { \ + ::c10::detail::torchCheckFail( \ + __func__, \ + __FILE__, \ + static_cast(__LINE__), \ + #cond " INTERNAL ASSERT FAILED at " C10_STRINGIZE(__FILE__)); \ + } +#else +// It would be nice if we could build a combined string literal out of +// the TORCH_INTERNAL_ASSERT prefix and a user-provided string literal +// as the first argument, but there doesn't seem to be any good way to +// do that while still supporting having a first argument that isn't a +// string literal. +#define TORCH_INTERNAL_ASSERT(cond, ...) \ + if (C10_UNLIKELY_OR_CONST(!(cond))) { \ + ::c10::detail::torchInternalAssertFail( \ + __func__, \ + __FILE__, \ + static_cast(__LINE__), \ + #cond \ + " INTERNAL ASSERT FAILED at " C10_STRINGIZE(__FILE__) ":" C10_STRINGIZE( \ + __LINE__) ", please report a bug to PyTorch. ", \ + c10::str(__VA_ARGS__)); \ + } +#endif + +// A utility macro to make it easier to test for error conditions from user +// input. Like TORCH_INTERNAL_ASSERT, it supports an arbitrary number of extra +// arguments (evaluated only on failure), which will be printed in the error +// message using operator<< (e.g., you can pass any object which has +// operator<< defined. Most objects in PyTorch have these definitions!) +// +// Usage: +// TORCH_CHECK(should_be_true); // A default error message will be provided +// // in this case; but we recommend writing an +// // explicit error message, as it is more +// // user friendly. +// TORCH_CHECK(x == 0, "Expected x to be 0, but got ", x); +// +// On failure, this macro will raise an exception. If this exception propagates +// to Python, it will convert into a Python RuntimeError. +// +// NOTE: It is SAFE to use this macro in production code; on failure, this +// simply raises an exception, it does NOT unceremoniously quit the process +// (unlike CHECK() from glog.) +// +#define TORCH_CHECK_WITH(error_t, cond, ...) \ + TORCH_CHECK_WITH_MSG(error_t, cond, "", __VA_ARGS__) + +#ifdef STRIP_ERROR_MESSAGES +#define TORCH_CHECK_MSG(cond, type, ...) \ + (#cond #type " CHECK FAILED at " C10_STRINGIZE(__FILE__)) +#define TORCH_CHECK_WITH_MSG(error_t, cond, type, ...) \ + if (C10_UNLIKELY_OR_CONST(!(cond))) { \ + C10_THROW_ERROR(Error, TORCH_CHECK_MSG(cond, type, __VA_ARGS__)); \ + } +#else + +namespace c10::detail { +template +decltype(auto) torchCheckMsgImpl(const char* /*msg*/, const Args&... 
args) { + return ::c10::str(args...); +} +inline C10_API const char* torchCheckMsgImpl(const char* msg) { + return msg; +} +// If there is just 1 user-provided C-string argument, use it. +inline C10_API const char* torchCheckMsgImpl( + const char* /*msg*/, + const char* args) { + return args; +} +} // namespace c10::detail + +#define TORCH_CHECK_MSG(cond, type, ...) \ + (::c10::detail::torchCheckMsgImpl( \ + "Expected " #cond \ + " to be true, but got false. " \ + "(Could this error message be improved? If so, " \ + "please report an enhancement request to PyTorch.)", \ + ##__VA_ARGS__)) +#define TORCH_CHECK_WITH_MSG(error_t, cond, type, ...) \ + if (C10_UNLIKELY_OR_CONST(!(cond))) { \ + C10_THROW_ERROR(error_t, TORCH_CHECK_MSG(cond, type, __VA_ARGS__)); \ + } +#endif + +namespace c10::detail { + +[[noreturn]] C10_API void torchCheckFail( + const char* func, + const char* file, + uint32_t line, + const std::string& msg); +[[noreturn]] C10_API void torchCheckFail( + const char* func, + const char* file, + uint32_t line, + const char* msg); + +// The c10::str() call that creates userMsg can have 1 of 3 return +// types depending on the number and types of arguments passed to +// TORCH_INTERNAL_ASSERT. 0 arguments will get a +// CompileTimeEmptyString, 1 const char * will be passed straight +// through, and anything else will get converted to std::string. +[[noreturn]] C10_API void torchInternalAssertFail( + const char* func, + const char* file, + uint32_t line, + const char* condMsg, + const char* userMsg); +[[noreturn]] inline C10_API void torchInternalAssertFail( + const char* func, + const char* file, + uint32_t line, + const char* condMsg, + ::c10::detail::CompileTimeEmptyString /*userMsg*/) { + torchCheckFail(func, file, line, condMsg); +} +[[noreturn]] C10_API void torchInternalAssertFail( + const char* func, + const char* file, + uint32_t line, + const char* condMsg, + const std::string& userMsg); + +} // namespace c10::detail + +#ifdef STRIP_ERROR_MESSAGES +#define TORCH_CHECK(cond, ...) \ + if (C10_UNLIKELY_OR_CONST(!(cond))) { \ + ::c10::detail::torchCheckFail( \ + __func__, \ + __FILE__, \ + static_cast(__LINE__), \ + TORCH_CHECK_MSG(cond, "", __VA_ARGS__)); \ + } +#else +#define TORCH_CHECK(cond, ...) \ + if (C10_UNLIKELY_OR_CONST(!(cond))) { \ + ::c10::detail::torchCheckFail( \ + __func__, \ + __FILE__, \ + static_cast(__LINE__), \ + TORCH_CHECK_MSG(cond, "", ##__VA_ARGS__)); \ + } +#endif + +// An utility macro that does what `TORCH_CHECK` does if compiled in the host +// code, otherwise does nothing. Supposed to be used in the code shared between +// host and device code as an alternative for `TORCH_CHECK`. +#if defined(__CUDACC__) || defined(__HIPCC__) +#define TORCH_CHECK_IF_NOT_ON_CUDA(cond, ...) +#else +#define TORCH_CHECK_IF_NOT_ON_CUDA(cond, ...) TORCH_CHECK(cond, ##__VA_ARGS__) +#endif + +// Debug only version of TORCH_INTERNAL_ASSERT. This macro only checks in debug +// build, and does nothing in release build. It is appropriate to use +// in situations where you want to add an assert to a hotpath, but it is +// too expensive to run this assert on production builds. +#ifdef NDEBUG +// Optimized version - generates no code. +#define TORCH_INTERNAL_ASSERT_DEBUG_ONLY(...) \ + while (false) \ + C10_EXPAND_MSVC_WORKAROUND(TORCH_INTERNAL_ASSERT(__VA_ARGS__)) +#else +#define TORCH_INTERNAL_ASSERT_DEBUG_ONLY(...) 
\ + C10_EXPAND_MSVC_WORKAROUND(TORCH_INTERNAL_ASSERT(__VA_ARGS__)) +#endif + +// TODO: We're going to get a lot of similar looking string literals +// this way; check if this actually affects binary size. + +// Like TORCH_CHECK, but raises LinAlgError instead of Error. +#define TORCH_CHECK_LINALG(cond, ...) \ + TORCH_CHECK_WITH_MSG(LinAlgError, cond, "LINALG", __VA_ARGS__) + +// Like TORCH_CHECK, but raises IndexErrors instead of Errors. +#define TORCH_CHECK_INDEX(cond, ...) \ + TORCH_CHECK_WITH_MSG(IndexError, cond, "INDEX", __VA_ARGS__) + +// Like TORCH_CHECK, but raises ValueErrors instead of Errors. +#define TORCH_CHECK_VALUE(cond, ...) \ + TORCH_CHECK_WITH_MSG(ValueError, cond, "VALUE", __VA_ARGS__) + +// Like TORCH_CHECK, but raises TypeErrors instead of Errors. +#define TORCH_CHECK_TYPE(cond, ...) \ + TORCH_CHECK_WITH_MSG(TypeError, cond, "TYPE", __VA_ARGS__) + +// Like TORCH_CHECK, but raises NotImplementedErrors instead of Errors. +#define TORCH_CHECK_NOT_IMPLEMENTED(cond, ...) \ + TORCH_CHECK_WITH_MSG(NotImplementedError, cond, "TYPE", __VA_ARGS__) + +#define TORCH_CHECK_ALWAYS_SHOW_CPP_STACKTRACE(cond, ...) \ + TORCH_CHECK_WITH_MSG( \ + ErrorAlwaysShowCppStacktrace, cond, "TYPE", ##__VA_ARGS__) + +#ifdef STRIP_ERROR_MESSAGES +#define WARNING_MESSAGE_STRING(...) \ + ::c10::detail::CompileTimeEmptyString {} +#else +#define WARNING_MESSAGE_STRING(...) ::c10::str(__VA_ARGS__) +#endif + +// Report a warning to the user. Accepts an arbitrary number of extra +// arguments which are concatenated into the warning message using operator<< +// +#ifdef DISABLE_WARN +#define _TORCH_WARN_WITH(...) ((void)0); +#else +#define _TORCH_WARN_WITH(warning_t, ...) \ + ::c10::warn(::c10::Warning( \ + warning_t(), \ + {__func__, __FILE__, static_cast(__LINE__)}, \ + WARNING_MESSAGE_STRING(__VA_ARGS__), \ + false)); +#endif + +#define TORCH_WARN(...) _TORCH_WARN_WITH(::c10::UserWarning, __VA_ARGS__); + +#define TORCH_WARN_DEPRECATION(...) \ + _TORCH_WARN_WITH(::c10::DeprecationWarning, __VA_ARGS__); + +// Report a warning to the user only once. Accepts an arbitrary number of extra +// arguments which are concatenated into the warning message using operator<< +// +#define _TORCH_WARN_ONCE(...) \ + C10_UNUSED static const auto C10_ANONYMOUS_VARIABLE(torch_warn_once_) = \ + [&] { \ + TORCH_WARN(__VA_ARGS__); \ + return true; \ + }() + +#ifdef DISABLE_WARN +#define TORCH_WARN_ONCE(...) ((void)0); +#else +#define TORCH_WARN_ONCE(...) \ + if (::c10::WarningUtils::get_warnAlways()) { \ + TORCH_WARN(__VA_ARGS__); \ + } else { \ + _TORCH_WARN_ONCE(__VA_ARGS__); \ + } +#endif + +// Report an error with a specific argument +// NOTE: using the argument name in TORCH_CHECK's message is preferred +#define TORCH_CHECK_ARG(cond, argN, ...) \ + TORCH_CHECK(cond, "invalid argument ", argN, ": ", __VA_ARGS__) + +// ---------------------------------------------------------------------------- +// Deprecated macros +// ---------------------------------------------------------------------------- + +namespace c10::detail { + +/* +// Deprecation disabled until we fix sites in our codebase +C10_DEPRECATED_MESSAGE("AT_ERROR(msg) is deprecated, use TORCH_CHECK(false, msg) +instead.") +*/ +inline void deprecated_AT_ERROR() {} + +/* +// Deprecation disabled until we fix sites in our codebase +C10_DEPRECATED_MESSAGE("AT_ASSERT is deprecated, if you mean to indicate an +internal invariant failure, use " \ + "TORCH_INTERNAL_ASSERT instead; if you mean to do user +error checking, use " \ "TORCH_CHECK. 
See +https://github.com/pytorch/pytorch/issues/20287 for more details.") +*/ +inline void deprecated_AT_ASSERT() {} + +/* +// Deprecation disabled until we fix sites in our codebase +C10_DEPRECATED_MESSAGE("AT_ASSERTM is deprecated, if you mean to indicate an +internal invariant failure, use " \ + "TORCH_INTERNAL_ASSERT instead; if you mean to do user +error checking, use " \ "TORCH_CHECK. See +https://github.com/pytorch/pytorch/issues/20287 for more details.") +*/ +inline void deprecated_AT_ASSERTM() {} + +} // namespace c10::detail + +// Deprecated alias; this alias was deprecated because people kept mistakenly +// using it for user error checking. Use TORCH_INTERNAL_ASSERT or TORCH_CHECK +// instead. See https://github.com/pytorch/pytorch/issues/20287 for more +// details. +#define AT_ASSERT(...) \ + do { \ + ::c10::detail::deprecated_AT_ASSERT(); \ + C10_EXPAND_MSVC_WORKAROUND(TORCH_INTERNAL_ASSERT(__VA_ARGS__)); \ + } while (false) + +// Deprecated alias, like AT_ASSERT. The new TORCH_INTERNAL_ASSERT macro +// supports both 0-ary and variadic calls, so having a separate +// message-accepting macro is not necessary. +// +// NB: we MUST include cond explicitly here, as MSVC will miscompile the macro +// expansion, shunting all of __VA_ARGS__ to cond. An alternate workaround +// can be seen at +// https://stackoverflow.com/questions/5134523/msvc-doesnt-expand-va-args-correctly +#define AT_ASSERTM(cond, ...) \ + do { \ + ::c10::detail::deprecated_AT_ASSERTM(); \ + C10_EXPAND_MSVC_WORKAROUND(TORCH_INTERNAL_ASSERT(cond, __VA_ARGS__)); \ + } while (false) + +// Deprecated alias; this alias was deprecated because it represents extra API +// surface that makes it hard for people to understand what macro to use. +// Use TORCH_CHECK(false, ...) or TORCH_INTERNAL_ASSERT(false, ...) to +// unconditionally fail at a line of code. +#define AT_ERROR(...) \ + do { \ + ::c10::detail::deprecated_AT_ERROR(); \ + C10_EXPAND_MSVC_WORKAROUND(TORCH_CHECK(false, ::c10::str(__VA_ARGS__))); \ + } while (false) + +#endif // C10_UTIL_EXCEPTION_H_ diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/ExclusivelyOwned.h b/MLPY/Lib/site-packages/torch/include/c10/util/ExclusivelyOwned.h new file mode 100644 index 0000000000000000000000000000000000000000..62a7cca47da91c98494b0006dea29d153746ecbe --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/ExclusivelyOwned.h @@ -0,0 +1,140 @@ +#pragma once + +#include + +namespace c10 { + +// See example implementation in TensorBase.h and TensorBody.h. +// Synopsis: +// +// repr_type -- type to use to store an owned T in ExclusivelyOwned. +// +// pointer_type -- pointer-esque type to return from +// ExclusivelyOwned's get() and operator*() methods. +// +// const_pointer_type -- similar to pointer_type, used for the const methods. +// +// static repr_type nullRepr() -- return a null instance of repr_type. +// +// template +// static repr_type createInPlace(Args&&... args) -- used by the in-place +// ExclusivelyOwned constructor. +// +// static repr_type moveToRepr(T&& x) -- move the given x into an +// instance of repr_type. used by the ExclusivelyOwned(T&&) +// constructor. +// +// static void destroyOwned(repr_type x) -- free memory for a +// known-exclusively-owned instance of x. Replaces calling repr_type's +// destructor. Being able to implement this more efficiently than +// repr_type's destructor is the main reason to use ExclusivelyOwned +// for a type. +// +// static T take(repr_type&) -- move out of the given repr_type into an owned T. 
+// +// static pointer_type getImpl(const repr_type&) -- return a pointer +// to the given repr_type. May take repr_type by value if that is more +// efficient. +template +struct ExclusivelyOwnedTraits; + +/// ExclusivelyOwned is a smart-pointer-like wrapper around an +/// exclusively-owned instance of some type T that normally has +/// mandatory reference counting (currently just Tensor). If you have +/// an isolated piece of code that knows that it has sole ownership of +/// an object of one of these types (i.e., because you created it +/// directly or using a factory function) and that object will not +/// escape from that isolated piece of code, then moving the object +/// into an ExclusivelyOwned will avoid an atomic reference count +/// decrement at destruction time. +/// +/// If you directly create the Tensor in the first +/// place, you can use the in_place constructor of ExclusivelyOwned to +/// additionally avoid doing any stores to initialize the refcount & +/// weakcount. +template +class ExclusivelyOwned { + using EOT = ExclusivelyOwnedTraits; + typename ExclusivelyOwnedTraits::repr_type repr_; + + public: + ExclusivelyOwned() : repr_(EOT::nullRepr()) {} + + explicit ExclusivelyOwned(T&& t) : repr_(EOT::moveToRepr(std::move(t))) {} + + template + explicit ExclusivelyOwned(std::in_place_t, Args&&... args) + : repr_(EOT::createInPlace(std::forward(args)...)) {} + + ExclusivelyOwned(const ExclusivelyOwned&) = delete; + + ExclusivelyOwned(ExclusivelyOwned&& rhs) noexcept + : repr_(std::move(rhs.repr_)) { + rhs.repr_ = EOT::nullRepr(); + } + + ExclusivelyOwned& operator=(const ExclusivelyOwned&) = delete; + + ExclusivelyOwned& operator=(ExclusivelyOwned&& rhs) noexcept { + EOT::destroyOwned(repr_); + repr_ = std::move(rhs.repr_); + rhs.repr_ = EOT::nullRepr(); + return *this; + } + + ExclusivelyOwned& operator=(T&& rhs) noexcept { + EOT::destroyOwned(repr_); + repr_ = EOT::moveToRepr(std::move(rhs)); + return *this; + } + + ~ExclusivelyOwned() { + EOT::destroyOwned(repr_); + // Don't bother to call the destructor of repr_, since we already + // did specialized destruction for the exclusively-owned case in + // destroyOwned! + } + + // We don't provide this because it would require us to be able to + // differentiate an owned-but-empty T from a lack of T. This is + // particularly problematic for Tensor, which wants to use an + // undefined Tensor as its null state. + explicit operator bool() const noexcept = delete; + + operator T() && { + return take(); + } + + // NOTE: the equivalent operation on MaybeOwned is a moving + // operator*. For ExclusivelyOwned, take() and operator*() may well + // have different return types, so they are different functions. 
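+  //
+  // A minimal usage sketch (illustrative only, not part of this header;
+  // `at::zeros` and the tensor shape are assumptions for the example):
+  //
+  //   at::Tensor t = at::zeros({8});
+  //   c10::ExclusivelyOwned<at::Tensor> owned(std::move(t));
+  //   owned->add_(1);                             // pointer-like access
+  //   at::Tensor back = std::move(owned).take();  // move back out
+  //
+  // Compared with holding a plain Tensor, the ExclusivelyOwned destructor
+  // skips the atomic refcount decrement, as described above.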
+ T take() && { + return EOT::take(repr_); + } + + typename EOT::const_pointer_type operator->() const { + return get(); + } + + typename EOT::const_pointer_type get() const { + return EOT::getImpl(repr_); + } + + typename EOT::pointer_type operator->() { + return get(); + } + + typename EOT::pointer_type get() { + return EOT::getImpl(repr_); + } + + std::remove_pointer_t& operator*() const { + return *get(); + } + + std::remove_pointer_t& operator*() { + return *get(); + } +}; + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/ExclusivelyOwnedTensorTraits.h b/MLPY/Lib/site-packages/torch/include/c10/util/ExclusivelyOwnedTensorTraits.h new file mode 100644 index 0000000000000000000000000000000000000000..4d61440a7b4b1b6f8a38e12de0799822c6d4223b --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/ExclusivelyOwnedTensorTraits.h @@ -0,0 +1,75 @@ +#pragma once + +#include +#include + +#include + +namespace c10 { +// Shared ExclusivelyOwnedTraits implementation between caffe2::Tensor and +// at::TensorBase. +template +struct ExclusivelyOwnedTensorTraits { + using repr_type = TensorType; + using pointer_type = TensorType*; + using const_pointer_type = const TensorType*; + + static repr_type nullRepr() { + return TensorType(); + } + + template + static repr_type createInPlace(Args&&... args) { + return TensorType(std::forward(args)...); + } + + static repr_type moveToRepr(TensorType&& x) { + return std::move(x); + } + + static void destroyOwned(TensorType& x) { + TensorImpl* const toDestroy = x.unsafeReleaseTensorImpl(); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + toDestroy != nullptr, "Tensor somehow got null TensorImpl?"); + // May be 0 because UndefinedTensorImpl doesn't get its refcount + // incremented. + const bool isUndefined = toDestroy == UndefinedTensorImpl::singleton(); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + toDestroy->refcount_ == 1 || (toDestroy->refcount_ == 0 && isUndefined), + "ExclusivelyOwned destroyed with isUndefined ", + isUndefined, + " and refcount ", + toDestroy->refcount_, + ", expected 1 or, if isUndefined, 0!"); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + toDestroy->weakcount_ == 1 || + (toDestroy->weakcount_ == 0 && + toDestroy == UndefinedTensorImpl::singleton()), + "ExclusivelyOwned destroyed with isUndefined ", + isUndefined, + " and weakcount ", + toDestroy->weakcount_, + ", expected 1 or, if isUndefined, 0!"); + if (!isUndefined) { +#ifndef NDEBUG + // Needed to pass the debug assertions in ~intrusive_ptr_target. + toDestroy->refcount_ = 0; + toDestroy->weakcount_ = 0; +#endif + delete toDestroy; + } + } + + static TensorType take(TensorType& x) { + return std::move(x); + } + + static pointer_type getImpl(repr_type& x) { + return &x; + } + + static const_pointer_type getImpl(const repr_type& x) { + return &x; + } +}; +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/FbcodeMaps.h b/MLPY/Lib/site-packages/torch/include/c10/util/FbcodeMaps.h new file mode 100644 index 0000000000000000000000000000000000000000..3b8abdbcfbd99deb2842bd00bd9dddcd6b2713e1 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/FbcodeMaps.h @@ -0,0 +1,29 @@ +#ifndef C10_UTIL_FBCODEMAPS_H_ +#define C10_UTIL_FBCODEMAPS_H_ + +// Map typedefs so that we can use folly's F14 maps in fbcode without +// taking a folly dependency. 
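+//
+// Usage sketch (illustrative): these aliases resolve to either folly's F14
+// containers or the std:: equivalents, and basic usage is the same either
+// way:
+//
+//   c10::FastMap<std::string, int> op_counts;
+//   op_counts["aten::add"] += 1;
+//   c10::FastSet<int64_t> seen_ids;
+//   seen_ids.insert(42);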
+ +#ifdef FBCODE_CAFFE2 +#include +#include +#else +#include +#include +#endif + +namespace c10 { +#ifdef FBCODE_CAFFE2 +template +using FastMap = folly::F14FastMap; +template +using FastSet = folly::F14FastSet; +#else +template +using FastMap = std::unordered_map; +template +using FastSet = std::unordered_set; +#endif +} // namespace c10 + +#endif // C10_UTIL_FBCODEMAPS_H_ diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/Flags.h b/MLPY/Lib/site-packages/torch/include/c10/util/Flags.h new file mode 100644 index 0000000000000000000000000000000000000000..6fef9972125def4a273bfe16a583bc51d4ffd2ce --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/Flags.h @@ -0,0 +1,226 @@ +#ifndef C10_UTIL_FLAGS_H_ +#define C10_UTIL_FLAGS_H_ + +/* Commandline flags support for C10. + * + * This is a portable commandline flags tool for c10, so we can optionally + * choose to use gflags or a lightweight custom implementation if gflags is + * not possible on a certain platform. If you have gflags installed, set the + * macro C10_USE_GFLAGS will seamlessly route everything to gflags. + * + * To define a flag foo of type bool default to true, do the following in the + * *global* namespace: + * C10_DEFINE_bool(foo, true, "An example."); + * + * To use it in another .cc file, you can use C10_DECLARE_* as follows: + * C10_DECLARE_bool(foo); + * + * In both cases, you can then access the flag via FLAGS_foo. + * + * It is recommended that you build with gflags. To learn more about the flags + * usage, refer to the gflags page here: + * + * https://gflags.github.io/gflags/ + * + * Note about Python users / devs: gflags is initiated from a C++ function + * ParseCommandLineFlags, and is usually done in native binaries in the main + * function. As Python does not have a modifiable main function, it is usually + * difficult to change the flags after Python starts. Hence, it is recommended + * that one sets the default value of the flags to one that's acceptable in + * general - that will allow Python to run without wrong flags. + */ + +#include +#include + +#include + +namespace c10 { +/** + * Sets the usage message when a commandline tool is called with "--help". + */ +C10_API void SetUsageMessage(const std::string& str); + +/** + * Returns the usage message for the commandline tool set by SetUsageMessage. + */ +C10_API const char* UsageMessage(); + +/** + * Parses the commandline flags. + * + * This command parses all the commandline arguments passed in via pargc + * and argv. Once it is finished, partc and argv will contain the remaining + * commandline args that c10 does not deal with. Note that following + * convention, argv[0] contains the binary name and is not parsed. + */ +C10_API bool ParseCommandLineFlags(int* pargc, char*** pargv); + +/** + * Checks if the commandline flags has already been passed. + */ +C10_API bool CommandLineFlagsHasBeenParsed(); + +} // namespace c10 + +//////////////////////////////////////////////////////////////////////////////// +// Below are gflags and non-gflags specific implementations. +// In general, they define the following macros for one to declare (use +// C10_DECLARE) or define (use C10_DEFINE) flags: +// C10_{DECLARE,DEFINE}_{int,int64,double,bool,string} +//////////////////////////////////////////////////////////////////////////////// + +#ifdef C10_USE_GFLAGS + +//////////////////////////////////////////////////////////////////////////////// +// Begin gflags section: most functions are basically rerouted to gflags. 
+//////////////////////////////////////////////////////////////////////////////// +#include + +// C10 uses hidden visibility by default. However, in gflags, it only uses +// export on Windows platform (with dllexport) but not on linux/mac (with +// default visibility). As a result, to ensure that we are always exporting +// global variables, we will redefine the GFLAGS_DLL_DEFINE_FLAG macro if we +// are building C10 as a shared library. +// This has to be done after the inclusion of gflags, because some early +// versions of gflags.h (e.g. 2.0 on ubuntu 14.04) directly defines the +// macros, so we need to do definition after gflags is done. +#ifdef GFLAGS_DLL_DEFINE_FLAG +#undef GFLAGS_DLL_DEFINE_FLAG +#endif // GFLAGS_DLL_DEFINE_FLAG +#ifdef GFLAGS_DLL_DECLARE_FLAG +#undef GFLAGS_DLL_DECLARE_FLAG +#endif // GFLAGS_DLL_DECLARE_FLAG +#define GFLAGS_DLL_DEFINE_FLAG C10_EXPORT +#define GFLAGS_DLL_DECLARE_FLAG C10_IMPORT + +// gflags before 2.0 uses namespace google and after 2.1 uses namespace gflags. +// Using GFLAGS_GFLAGS_H_ to capture this change. +#ifndef GFLAGS_GFLAGS_H_ +namespace gflags = google; +#endif // GFLAGS_GFLAGS_H_ + +// Motivation about the gflags wrapper: +// (1) We would need to make sure that the gflags version and the non-gflags +// version of C10 are going to expose the same flags abstraction. One should +// explicitly use FLAGS_flag_name to access the flags. +// (2) For flag names, it is recommended to start with c10_ to distinguish it +// from regular gflags flags. For example, do +// C10_DEFINE_BOOL(c10_my_flag, true, "An example"); +// to allow one to use FLAGS_c10_my_flag. +// (3) Gflags has a design issue that does not properly expose the global flags, +// if one builds the library with -fvisibility=hidden. The current gflags (as of +// Aug 2018) only deals with the Windows case using dllexport, and not the Linux +// counterparts. As a result, we will explicitly use C10_EXPORT to export the +// flags defined in C10. This is done via a global reference, so the flag +// itself is not duplicated - under the hood it is the same global gflags flag. +#define C10_GFLAGS_DEF_WRAPPER(type, real_type, name, default_value, help_str) \ + DEFINE_##type(name, default_value, help_str); + +#define C10_DEFINE_int(name, default_value, help_str) \ + C10_GFLAGS_DEF_WRAPPER(int32, gflags::int32, name, default_value, help_str) +#define C10_DEFINE_int32(name, default_value, help_str) \ + C10_DEFINE_int(name, default_value, help_str) +#define C10_DEFINE_int64(name, default_value, help_str) \ + C10_GFLAGS_DEF_WRAPPER(int64, gflags::int64, name, default_value, help_str) +#define C10_DEFINE_double(name, default_value, help_str) \ + C10_GFLAGS_DEF_WRAPPER(double, double, name, default_value, help_str) +#define C10_DEFINE_bool(name, default_value, help_str) \ + C10_GFLAGS_DEF_WRAPPER(bool, bool, name, default_value, help_str) +#define C10_DEFINE_string(name, default_value, help_str) \ + C10_GFLAGS_DEF_WRAPPER(string, ::fLS::clstring, name, default_value, help_str) + +// DECLARE_typed_var should be used in header files and in the global namespace. 
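+// Usage sketch (illustrative; the flag name `c10_example_verbose` is
+// hypothetical). Define the flag once, at global scope, in a .cc file:
+//
+//   C10_DEFINE_bool(c10_example_verbose, false, "Enable verbose output.");
+//
+// Declare it wherever else it is needed and read it via FLAGS_<name>:
+//
+//   C10_DECLARE_bool(c10_example_verbose);
+//   if (FLAGS_c10_example_verbose) { /* ... */ }
+//
+// In a native binary, parse the command line once near the top of main():
+//
+//   int main(int argc, char** argv) {
+//     c10::SetUsageMessage("example tool");
+//     c10::ParseCommandLineFlags(&argc, &argv);
+//     // ...
+//   }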
+#define C10_GFLAGS_DECLARE_WRAPPER(type, real_type, name) DECLARE_##type(name); + +#define C10_DECLARE_int(name) \ + C10_GFLAGS_DECLARE_WRAPPER(int32, gflags::int32, name) +#define C10_DECLARE_int32(name) C10_DECLARE_int(name) +#define C10_DECLARE_int64(name) \ + C10_GFLAGS_DECLARE_WRAPPER(int64, gflags::int64, name) +#define C10_DECLARE_double(name) \ + C10_GFLAGS_DECLARE_WRAPPER(double, double, name) +#define C10_DECLARE_bool(name) C10_GFLAGS_DECLARE_WRAPPER(bool, bool, name) +#define C10_DECLARE_string(name) \ + C10_GFLAGS_DECLARE_WRAPPER(string, ::fLS::clstring, name) + +//////////////////////////////////////////////////////////////////////////////// +// End gflags section. +//////////////////////////////////////////////////////////////////////////////// + +#else // C10_USE_GFLAGS + +//////////////////////////////////////////////////////////////////////////////// +// Begin non-gflags section: providing equivalent functionality. +//////////////////////////////////////////////////////////////////////////////// + +namespace c10 { + +class C10_API C10FlagParser { + public: + bool success() { + return success_; + } + + protected: + template + bool Parse(const std::string& content, T* value); + bool success_{false}; +}; + +C10_DECLARE_REGISTRY(C10FlagsRegistry, C10FlagParser, const std::string&); + +} // namespace c10 + +// The macros are defined outside the c10 namespace. In your code, you should +// write the C10_DEFINE_* and C10_DECLARE_* macros outside any namespace +// as well. + +#define C10_DEFINE_typed_var(type, name, default_value, help_str) \ + C10_EXPORT type FLAGS_##name = default_value; \ + namespace c10 { \ + namespace { \ + class C10FlagParser_##name : public C10FlagParser { \ + public: \ + explicit C10FlagParser_##name(const std::string& content) { \ + success_ = C10FlagParser::Parse(content, &FLAGS_##name); \ + } \ + }; \ + } \ + RegistererC10FlagsRegistry g_C10FlagsRegistry_##name( \ + #name, \ + C10FlagsRegistry(), \ + RegistererC10FlagsRegistry::DefaultCreator, \ + "(" #type ", default " #default_value ") " help_str); \ + } + +#define C10_DEFINE_int(name, default_value, help_str) \ + C10_DEFINE_typed_var(int, name, default_value, help_str) +#define C10_DEFINE_int32(name, default_value, help_str) \ + C10_DEFINE_int(name, default_value, help_str) +#define C10_DEFINE_int64(name, default_value, help_str) \ + C10_DEFINE_typed_var(int64_t, name, default_value, help_str) +#define C10_DEFINE_double(name, default_value, help_str) \ + C10_DEFINE_typed_var(double, name, default_value, help_str) +#define C10_DEFINE_bool(name, default_value, help_str) \ + C10_DEFINE_typed_var(bool, name, default_value, help_str) +#define C10_DEFINE_string(name, default_value, help_str) \ + C10_DEFINE_typed_var(std::string, name, default_value, help_str) + +// DECLARE_typed_var should be used in header files and in the global namespace. +#define C10_DECLARE_typed_var(type, name) C10_API extern type FLAGS_##name + +#define C10_DECLARE_int(name) C10_DECLARE_typed_var(int, name) +#define C10_DECLARE_int32(name) C10_DECLARE_int(name) +#define C10_DECLARE_int64(name) C10_DECLARE_typed_var(int64_t, name) +#define C10_DECLARE_double(name) C10_DECLARE_typed_var(double, name) +#define C10_DECLARE_bool(name) C10_DECLARE_typed_var(bool, name) +#define C10_DECLARE_string(name) C10_DECLARE_typed_var(std::string, name) + +//////////////////////////////////////////////////////////////////////////////// +// End non-gflags section. 
+//////////////////////////////////////////////////////////////////////////////// + +#endif // C10_USE_GFLAGS + +#endif // C10_UTIL_FLAGS_H_ diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/Float8_e4m3fn-inl.h b/MLPY/Lib/site-packages/torch/include/c10/util/Float8_e4m3fn-inl.h new file mode 100644 index 0000000000000000000000000000000000000000..902a7c0a577ec3799a474bcad26e0755257c3869 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/Float8_e4m3fn-inl.h @@ -0,0 +1,274 @@ +#pragma once + +#include +#include +#include + +C10_CLANG_DIAGNOSTIC_PUSH() +#if C10_CLANG_HAS_WARNING("-Wimplicit-int-float-conversion") +C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion") +#endif + +namespace c10 { + +/// Constructors + +inline C10_HOST_DEVICE Float8_e4m3fn::Float8_e4m3fn(float value) + : x(detail::fp8e4m3fn_from_fp32_value(value)) {} + +/// Implicit conversions + +inline C10_HOST_DEVICE Float8_e4m3fn::operator float() const { + return detail::fp8e4m3fn_to_fp32_value(x); +} + +/// Special values helper + +inline C10_HOST_DEVICE bool Float8_e4m3fn::isnan() const { + return (x & 0b01111111) == 0b01111111; +} + +/// Arithmetic + +inline C10_HOST_DEVICE Float8_e4m3fn +operator+(const Float8_e4m3fn& a, const Float8_e4m3fn& b) { + return static_cast(a) + static_cast(b); +} + +inline C10_HOST_DEVICE Float8_e4m3fn +operator-(const Float8_e4m3fn& a, const Float8_e4m3fn& b) { + return static_cast(a) - static_cast(b); +} + +inline C10_HOST_DEVICE Float8_e4m3fn +operator*(const Float8_e4m3fn& a, const Float8_e4m3fn& b) { + return static_cast(a) * static_cast(b); +} + +inline C10_HOST_DEVICE Float8_e4m3fn operator/( + const Float8_e4m3fn& a, + const Float8_e4m3fn& b) __ubsan_ignore_float_divide_by_zero__ { + return static_cast(a) / static_cast(b); +} + +inline C10_HOST_DEVICE Float8_e4m3fn operator-(const Float8_e4m3fn& a) { + return -static_cast(a); +} + +inline C10_HOST_DEVICE Float8_e4m3fn& operator+=( + Float8_e4m3fn& a, + const Float8_e4m3fn& b) { + a = a + b; + return a; +} + +inline C10_HOST_DEVICE Float8_e4m3fn& operator-=( + Float8_e4m3fn& a, + const Float8_e4m3fn& b) { + a = a - b; + return a; +} + +inline C10_HOST_DEVICE Float8_e4m3fn& operator*=( + Float8_e4m3fn& a, + const Float8_e4m3fn& b) { + a = a * b; + return a; +} + +inline C10_HOST_DEVICE Float8_e4m3fn& operator/=( + Float8_e4m3fn& a, + const Float8_e4m3fn& b) { + a = a / b; + return a; +} + +/// Arithmetic with floats + +inline C10_HOST_DEVICE float operator+(Float8_e4m3fn a, float b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE float operator-(Float8_e4m3fn a, float b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE float operator*(Float8_e4m3fn a, float b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE float operator/(Float8_e4m3fn a, float b) + __ubsan_ignore_float_divide_by_zero__ { + return static_cast(a) / b; +} + +inline C10_HOST_DEVICE float operator+(float a, Float8_e4m3fn b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE float operator-(float a, Float8_e4m3fn b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE float operator*(float a, Float8_e4m3fn b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE float operator/(float a, Float8_e4m3fn b) + __ubsan_ignore_float_divide_by_zero__ { + return a / static_cast(b); +} + +inline C10_HOST_DEVICE float& operator+=(float& a, const Float8_e4m3fn& b) { + return a += static_cast(b); +} +inline C10_HOST_DEVICE float& operator-=(float& a, const Float8_e4m3fn& b) { + return a -= static_cast(b); +} 
+inline C10_HOST_DEVICE float& operator*=(float& a, const Float8_e4m3fn& b) { + return a *= static_cast(b); +} +inline C10_HOST_DEVICE float& operator/=(float& a, const Float8_e4m3fn& b) { + return a /= static_cast(b); +} + +/// Arithmetic with doubles + +inline C10_HOST_DEVICE double operator+(Float8_e4m3fn a, double b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE double operator-(Float8_e4m3fn a, double b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE double operator*(Float8_e4m3fn a, double b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE double operator/(Float8_e4m3fn a, double b) + __ubsan_ignore_float_divide_by_zero__ { + return static_cast(a) / b; +} + +inline C10_HOST_DEVICE double operator+(double a, Float8_e4m3fn b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE double operator-(double a, Float8_e4m3fn b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE double operator*(double a, Float8_e4m3fn b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE double operator/(double a, Float8_e4m3fn b) + __ubsan_ignore_float_divide_by_zero__ { + return a / static_cast(b); +} + +/// Arithmetic with ints + +inline C10_HOST_DEVICE Float8_e4m3fn operator+(Float8_e4m3fn a, int b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE Float8_e4m3fn operator-(Float8_e4m3fn a, int b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE Float8_e4m3fn operator*(Float8_e4m3fn a, int b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE Float8_e4m3fn operator/(Float8_e4m3fn a, int b) { + return a / static_cast(b); +} + +inline C10_HOST_DEVICE Float8_e4m3fn operator+(int a, Float8_e4m3fn b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE Float8_e4m3fn operator-(int a, Float8_e4m3fn b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE Float8_e4m3fn operator*(int a, Float8_e4m3fn b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE Float8_e4m3fn operator/(int a, Float8_e4m3fn b) { + return static_cast(a) / b; +} + +//// Arithmetic with int64_t + +inline C10_HOST_DEVICE Float8_e4m3fn operator+(Float8_e4m3fn a, int64_t b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE Float8_e4m3fn operator-(Float8_e4m3fn a, int64_t b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE Float8_e4m3fn operator*(Float8_e4m3fn a, int64_t b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE Float8_e4m3fn operator/(Float8_e4m3fn a, int64_t b) { + return a / static_cast(b); +} + +inline C10_HOST_DEVICE Float8_e4m3fn operator+(int64_t a, Float8_e4m3fn b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE Float8_e4m3fn operator-(int64_t a, Float8_e4m3fn b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE Float8_e4m3fn operator*(int64_t a, Float8_e4m3fn b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE Float8_e4m3fn operator/(int64_t a, Float8_e4m3fn b) { + return static_cast(a) / b; +} + +/// NOTE: we do not define comparisons directly and instead rely on the implicit +/// conversion from c10::Float8_e4m3fn to float. 
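+///
+/// Illustrative sketch (host-side example, not part of the API surface):
+///
+///   c10::Float8_e4m3fn a(0.5f), b(0.25f);
+///   bool gt = a > b;      // both operands convert to float implicitly
+///   float s = a + 0.1f;   // mixed arithmetic is carried out in float32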
+ +} // namespace c10 + +namespace std { + +template <> +class numeric_limits { + public: + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool has_infinity = false; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = false; + static constexpr auto has_denorm = true; + static constexpr auto has_denorm_loss = true; + static constexpr auto round_style = numeric_limits::round_style; + static constexpr bool is_iec559 = false; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + static constexpr int digits = 4; + static constexpr int digits10 = 0; + static constexpr int max_digits10 = 3; + static constexpr int radix = 2; + static constexpr int min_exponent = -5; + static constexpr int min_exponent10 = -1; + static constexpr int max_exponent = 8; + static constexpr int max_exponent10 = 2; + static constexpr auto traps = numeric_limits::traps; + static constexpr auto tinyness_before = false; + + static constexpr c10::Float8_e4m3fn min() { + return c10::Float8_e4m3fn(0x08, c10::Float8_e4m3fn::from_bits()); + } + static constexpr c10::Float8_e4m3fn lowest() { + return c10::Float8_e4m3fn(0xFE, c10::Float8_e4m3fn::from_bits()); + } + static constexpr c10::Float8_e4m3fn max() { + return c10::Float8_e4m3fn(0x7E, c10::Float8_e4m3fn::from_bits()); + } + static constexpr c10::Float8_e4m3fn epsilon() { + return c10::Float8_e4m3fn(0x20, c10::Float8_e4m3fn::from_bits()); + } + static constexpr c10::Float8_e4m3fn round_error() { + return c10::Float8_e4m3fn(0x30, c10::Float8_e4m3fn::from_bits()); + } + static constexpr c10::Float8_e4m3fn quiet_NaN() { + return c10::Float8_e4m3fn(0x7F, c10::Float8_e4m3fn::from_bits()); + } + static constexpr c10::Float8_e4m3fn denorm_min() { + return c10::Float8_e4m3fn(0x01, c10::Float8_e4m3fn::from_bits()); + } +}; + +} // namespace std + +C10_CLANG_DIAGNOSTIC_POP() diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/Float8_e4m3fn.h b/MLPY/Lib/site-packages/torch/include/c10/util/Float8_e4m3fn.h new file mode 100644 index 0000000000000000000000000000000000000000..86034ccef3f5ebae6d3ab7fb7796326f117160ff --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/Float8_e4m3fn.h @@ -0,0 +1,246 @@ +#pragma once + +/// Defines the Float8_e4m3fn type (8-bit floating-point) including conversions +/// to standard C types and basic arithmetic operations. Note that arithmetic +/// operations are implemented by converting to floating point and +/// performing the operation in float32. +/// Binary configuration: +/// s eeee mmm +/// 1 sign bit +/// 4 exponent bits +/// 3 mantissa bits +/// bias = 7 +/// +/// Implementation based on the paper https://arxiv.org/pdf/2209.05433.pdf +/// and inspired by Half implementation from pytorch/c10/util/Half.h + +#include +#include +#include +#include + +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#include +#include +#elif !defined(__OPENCL_VERSION__) +#include +#include +#endif + +#ifdef _MSC_VER +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include // operator typeid + +namespace c10 { + +namespace detail { + +/* + * Convert a 8-bit floating-point number in fp8 E4M3FN format, in bit + * representation, to a 32-bit floating-point number in IEEE single-precision + * format, in bit representation. 
+ * + * @note The implementation doesn't use any floating-point operations. + */ +inline C10_HOST_DEVICE float fp8e4m3fn_to_fp32_value(uint8_t input) { + /* + * Extend the fp8 E4M3FN number to 32 bits and shift to the + * upper part of the 32-bit word: + * +---+----+---+-----------------------------+ + * | S |EEEE|MMM|0000 0000 0000 0000 0000 0000| + * +---+----+---+-----------------------------+ + * Bits 31 27-30 24-26 0-23 + * + * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 + * - zero bits. + */ + const uint32_t w = (uint32_t)input << 24; + /* + * Extract the sign of the input number into the high bit of the 32-bit word: + * + * +---+----------------------------------+ + * | S |0000000 00000000 00000000 00000000| + * +---+----------------------------------+ + * Bits 31 0-31 + */ + const uint32_t sign = w & UINT32_C(0x80000000); + /* + * Extract mantissa and biased exponent of the input number into the bits 0-30 + * of the 32-bit word: + * + * +---+----+---+-----------------------------+ + * | S |EEEE|MMM|0000 0000 0000 0000 0000 0000| + * +---+----+---+-----------------------------+ + * Bits 31 27-30 24-26 0-23 + */ + const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF); + /* + * Renorm shift is the number of bits to shift mantissa left to make the + * half-precision number normalized. If the initial number is normalized, some + * of its high 5 bits (sign == 0 and 4-bit exponent) equals one. In this case + * renorm_shift == 0. If the number is denormalize, renorm_shift > 0. Note + * that if we shift denormalized nonsign by renorm_shift, the unit bit of + * mantissa will shift into exponent, turning the biased exponent into 1, and + * making mantissa normalized (i.e. without leading 1). + */ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + uint32_t renorm_shift = __clz(nonsign); +#elif defined(__SYCL_DEVICE_ONLY__) + // Note: zero is not a supported input into `__builtin_clz` + uint32_t renorm_shift = + nonsign != 0 ? __builtin_clz(nonsign) : sizeof(uint32_t) * CHAR_BIT; +#elif defined(_MSC_VER) + unsigned long nonsign_bsr; + _BitScanReverse(&nonsign_bsr, (unsigned long)nonsign); + uint32_t renorm_shift = (uint32_t)nonsign_bsr ^ 31; +#else + // Note: zero is not a supported input into `__builtin_clz` + uint32_t renorm_shift = + nonsign != 0 ? __builtin_clz(nonsign) : sizeof(uint32_t) * CHAR_BIT; +#endif + renorm_shift = renorm_shift > 4 ? renorm_shift - 4 : 0; + /* + * Iff fp8e4m3fn number has all exponent and mantissa bits set to 1, + * the addition overflows it into bit 31, and the subsequent shift turns the + * high 9 bits into 1. Thus inf_nan_mask == 0x7F800000 if the fp8e4m3fn number + * is Nan, 0x00000000 otherwise + */ + const int32_t inf_nan_mask = + ((int32_t)(nonsign + 0x01000000) >> 8) & INT32_C(0x7F800000); + /* + * Iff nonsign is 0, it overflows into 0xFFFFFFFF, turning bit 31 + * into 1. Otherwise, bit 31 remains 0. The signed shift right by 31 + * broadcasts bit 31 into all bits of the zero_mask. Thus zero_mask == + * 0xFFFFFFFF if the half-precision number was zero (+0.0h or -0.0h) + * 0x00000000 otherwise + */ + const int32_t zero_mask = (int32_t)(nonsign - 1) >> 31; + /* + * 1. Shift nonsign left by renorm_shift to normalize it (if the input + * was denormal) + * 2. Shift nonsign right by 4 so the exponent (4 bits originally) + * becomes an 8-bit field and 3-bit mantissa shifts into the 3 high + * bits of the 23-bit mantissa of IEEE single-precision number. + * 3. 
Add 0x78 to the exponent (starting at bit 23) to compensate the + * different in exponent bias (0x7F for single-precision number less 0x07 + * for fp8e4m3fn number). + * 4. Subtract renorm_shift from the exponent (starting at bit 23) to + * account for renormalization. As renorm_shift is less than 0x78, this + * can be combined with step 3. + * 5. Binary OR with inf_nan_mask to turn the exponent into 0xFF if the + * input was NaN or infinity. + * 6. Binary ANDNOT with zero_mask to turn the mantissa and exponent + * into zero if the input was zero. + * 7. Combine with the sign of the input number. + */ + uint32_t result = sign | + ((((nonsign << renorm_shift >> 4) + ((0x78 - renorm_shift) << 23)) | + inf_nan_mask) & + ~zero_mask); + return fp32_from_bits(result); +} + +/* + * Convert a 32-bit floating-point number in IEEE single-precision format to a + * 8-bit floating-point number in fp8 E4M3FN format, in bit representation. + */ +inline C10_HOST_DEVICE uint8_t fp8e4m3fn_from_fp32_value(float f) { + /* + * Binary representation of 480.0f, which is the first value + * not representable in fp8e4m3fn range: + * 0 1111 111 - fp8e4m3fn + * 0 10000111 11100000000000000000000 - fp32 + */ + constexpr uint32_t fp8_max = UINT32_C(1087) << 20; + + /* + * A mask for converting fp32 numbers lower than fp8e4m3fn normal range + * into denorm representation + * magic number: ((127 - 7) + (23 - 3) + 1) + */ + constexpr uint32_t denorm_mask = UINT32_C(141) << 23; + + uint32_t f_bits = fp32_to_bits(f); + + uint8_t result = 0u; + + /* + * Extract the sign of the input number into the high bit of the 32-bit word: + * + * +---+----------------------------------+ + * | S |0000000 00000000 00000000 00000000| + * +---+----------------------------------+ + * Bits 31 0-31 + */ + const uint32_t sign = f_bits & UINT32_C(0x80000000); + + /* + * Set sign bit to 0 + */ + f_bits ^= sign; + + if (f_bits >= fp8_max) { + // NaN - all exponent and mantissa bits set to 1 + result = 0x7f; + } else { + if (f_bits < (UINT32_C(121) << 23)) { + // Input number is smaller than 2^(-6), which is the smallest + // fp8e4m3fn normal number + f_bits = + fp32_to_bits(fp32_from_bits(f_bits) + fp32_from_bits(denorm_mask)); + result = static_cast(f_bits - denorm_mask); + } else { + // resulting mantissa is odd + uint8_t mant_odd = (f_bits >> 20) & 1; + + // update exponent, rounding bias part 1 + f_bits += ((uint32_t)(7 - 127) << 23) + 0x7FFFF; + + // rounding bias part 2 + f_bits += mant_odd; + + // take the bits! 
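+      // After the re-bias above, the 4-bit fp8 exponent occupies bits 23-26
+      // and the 3 retained mantissa bits occupy bits 20-22, so shifting right
+      // by 20 and truncating to 8 bits yields the packed E4M3 payload (the
+      // sign bit is OR-ed back in below).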
+ result = static_cast(f_bits >> 20); + } + } + + result |= static_cast(sign >> 24); + return result; +} + +} // namespace detail + +struct alignas(1) Float8_e4m3fn { + uint8_t x; + + struct from_bits_t {}; + C10_HOST_DEVICE static constexpr from_bits_t from_bits() { + return from_bits_t(); + } + + Float8_e4m3fn() = default; + + constexpr C10_HOST_DEVICE Float8_e4m3fn(uint8_t bits, from_bits_t) + : x(bits){}; + inline C10_HOST_DEVICE Float8_e4m3fn(float value); + inline C10_HOST_DEVICE operator float() const; + inline C10_HOST_DEVICE bool isnan() const; +}; + +C10_API std::ostream& operator<<(std::ostream& out, const Float8_e4m3fn& value); + +} // namespace c10 + +#include // IWYU pragma: keep diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/Float8_e4m3fnuz-inl.h b/MLPY/Lib/site-packages/torch/include/c10/util/Float8_e4m3fnuz-inl.h new file mode 100644 index 0000000000000000000000000000000000000000..4c54c3b7d7d04b222a6498f15cb97d076d1bf890 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/Float8_e4m3fnuz-inl.h @@ -0,0 +1,279 @@ +#pragma once + +#include +#include +#include +#include + +C10_CLANG_DIAGNOSTIC_PUSH() +#if C10_CLANG_HAS_WARNING("-Wimplicit-int-float-conversion") +C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion") +#endif + +namespace c10 { + +/// Constructors + +inline C10_HOST_DEVICE Float8_e4m3fnuz::Float8_e4m3fnuz(float value) + : x(detail::fp8e4m3fnuz_from_fp32_value(value)) {} + +/// Implicit conversions + +inline C10_HOST_DEVICE Float8_e4m3fnuz::operator float() const { + return detail::fp8_fnuz_to_fp32_value<4, 3>(x); +} + +/// Special values helper + +inline C10_HOST_DEVICE bool Float8_e4m3fnuz::isnan() const { + return x == 0b10000000; +} + +/// Arithmetic + +inline C10_HOST_DEVICE Float8_e4m3fnuz +operator+(const Float8_e4m3fnuz& a, const Float8_e4m3fnuz& b) { + return static_cast(a) + static_cast(b); +} + +inline C10_HOST_DEVICE Float8_e4m3fnuz +operator-(const Float8_e4m3fnuz& a, const Float8_e4m3fnuz& b) { + return static_cast(a) - static_cast(b); +} + +inline C10_HOST_DEVICE Float8_e4m3fnuz +operator*(const Float8_e4m3fnuz& a, const Float8_e4m3fnuz& b) { + return static_cast(a) * static_cast(b); +} + +inline C10_HOST_DEVICE Float8_e4m3fnuz operator/( + const Float8_e4m3fnuz& a, + const Float8_e4m3fnuz& b) __ubsan_ignore_float_divide_by_zero__ { + return static_cast(a) / static_cast(b); +} + +inline C10_HOST_DEVICE Float8_e4m3fnuz operator-(const Float8_e4m3fnuz& a) { + return -static_cast(a); +} + +inline C10_HOST_DEVICE Float8_e4m3fnuz& operator+=( + Float8_e4m3fnuz& a, + const Float8_e4m3fnuz& b) { + a = a + b; + return a; +} + +inline C10_HOST_DEVICE Float8_e4m3fnuz& operator-=( + Float8_e4m3fnuz& a, + const Float8_e4m3fnuz& b) { + a = a - b; + return a; +} + +inline C10_HOST_DEVICE Float8_e4m3fnuz& operator*=( + Float8_e4m3fnuz& a, + const Float8_e4m3fnuz& b) { + a = a * b; + return a; +} + +inline C10_HOST_DEVICE Float8_e4m3fnuz& operator/=( + Float8_e4m3fnuz& a, + const Float8_e4m3fnuz& b) { + a = a / b; + return a; +} + +/// Arithmetic with floats + +inline C10_HOST_DEVICE float operator+(Float8_e4m3fnuz a, float b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE float operator-(Float8_e4m3fnuz a, float b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE float operator*(Float8_e4m3fnuz a, float b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE float operator/(Float8_e4m3fnuz a, float b) + __ubsan_ignore_float_divide_by_zero__ { + return static_cast(a) / b; +} + +inline C10_HOST_DEVICE float 
operator+(float a, Float8_e4m3fnuz b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE float operator-(float a, Float8_e4m3fnuz b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE float operator*(float a, Float8_e4m3fnuz b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE float operator/(float a, Float8_e4m3fnuz b) + __ubsan_ignore_float_divide_by_zero__ { + return a / static_cast(b); +} + +inline C10_HOST_DEVICE float& operator+=(float& a, const Float8_e4m3fnuz& b) { + return a += static_cast(b); +} +inline C10_HOST_DEVICE float& operator-=(float& a, const Float8_e4m3fnuz& b) { + return a -= static_cast(b); +} +inline C10_HOST_DEVICE float& operator*=(float& a, const Float8_e4m3fnuz& b) { + return a *= static_cast(b); +} +inline C10_HOST_DEVICE float& operator/=(float& a, const Float8_e4m3fnuz& b) { + return a /= static_cast(b); +} + +/// Arithmetic with doubles + +inline C10_HOST_DEVICE double operator+(Float8_e4m3fnuz a, double b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE double operator-(Float8_e4m3fnuz a, double b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE double operator*(Float8_e4m3fnuz a, double b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE double operator/(Float8_e4m3fnuz a, double b) + __ubsan_ignore_float_divide_by_zero__ { + return static_cast(a) / b; +} + +inline C10_HOST_DEVICE double operator+(double a, Float8_e4m3fnuz b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE double operator-(double a, Float8_e4m3fnuz b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE double operator*(double a, Float8_e4m3fnuz b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE double operator/(double a, Float8_e4m3fnuz b) + __ubsan_ignore_float_divide_by_zero__ { + return a / static_cast(b); +} + +/// Arithmetic with ints + +inline C10_HOST_DEVICE Float8_e4m3fnuz operator+(Float8_e4m3fnuz a, int b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE Float8_e4m3fnuz operator-(Float8_e4m3fnuz a, int b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE Float8_e4m3fnuz operator*(Float8_e4m3fnuz a, int b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE Float8_e4m3fnuz operator/(Float8_e4m3fnuz a, int b) { + return a / static_cast(b); +} + +inline C10_HOST_DEVICE Float8_e4m3fnuz operator+(int a, Float8_e4m3fnuz b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE Float8_e4m3fnuz operator-(int a, Float8_e4m3fnuz b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE Float8_e4m3fnuz operator*(int a, Float8_e4m3fnuz b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE Float8_e4m3fnuz operator/(int a, Float8_e4m3fnuz b) { + return static_cast(a) / b; +} + +//// Arithmetic with int64_t + +inline C10_HOST_DEVICE Float8_e4m3fnuz operator+(Float8_e4m3fnuz a, int64_t b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE Float8_e4m3fnuz operator-(Float8_e4m3fnuz a, int64_t b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE Float8_e4m3fnuz operator*(Float8_e4m3fnuz a, int64_t b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE Float8_e4m3fnuz operator/(Float8_e4m3fnuz a, int64_t b) { + return a / static_cast(b); +} + +inline C10_HOST_DEVICE Float8_e4m3fnuz operator+(int64_t a, Float8_e4m3fnuz b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE Float8_e4m3fnuz operator-(int64_t a, Float8_e4m3fnuz b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE Float8_e4m3fnuz operator*(int64_t a, Float8_e4m3fnuz b) { + return 
static_cast(a) * b; +} +inline C10_HOST_DEVICE Float8_e4m3fnuz operator/(int64_t a, Float8_e4m3fnuz b) { + return static_cast(a) / b; +} + +/// NOTE: we do not define comparisons directly and instead rely on the implicit +/// conversion from c10::Float8_e4m3fnuz to float. + +} // namespace c10 + +namespace std { + +template <> +class numeric_limits { + public: + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool has_infinity = false; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = false; + static constexpr auto has_denorm = true; + static constexpr auto has_denorm_loss = true; + static constexpr auto round_style = numeric_limits::round_style; + static constexpr bool is_iec559 = false; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + static constexpr int digits = 4; + static constexpr int digits10 = 0; + static constexpr int max_digits10 = 3; + static constexpr int radix = 2; + static constexpr int min_exponent = -6; + static constexpr int min_exponent10 = -1; + static constexpr int max_exponent = 8; + static constexpr int max_exponent10 = 2; + static constexpr auto traps = numeric_limits::traps; + static constexpr auto tinyness_before = false; + + static constexpr c10::Float8_e4m3fnuz min() { + return c10::Float8_e4m3fnuz(0x08, c10::Float8_e4m3fnuz::from_bits()); + } + static constexpr c10::Float8_e4m3fnuz lowest() { + return c10::Float8_e4m3fnuz(0xFF, c10::Float8_e4m3fnuz::from_bits()); + } + static constexpr c10::Float8_e4m3fnuz max() { + return c10::Float8_e4m3fnuz(0x7F, c10::Float8_e4m3fnuz::from_bits()); + } + static constexpr c10::Float8_e4m3fnuz epsilon() { + return c10::Float8_e4m3fnuz(0x28, c10::Float8_e4m3fnuz::from_bits()); + } + static constexpr c10::Float8_e4m3fnuz round_error() { + return c10::Float8_e4m3fnuz(0x38, c10::Float8_e4m3fnuz::from_bits()); + } + static constexpr c10::Float8_e4m3fnuz infinity() { + // NaN (no infinities) + return c10::Float8_e4m3fnuz(0x80, c10::Float8_e4m3fnuz::from_bits()); + } + static constexpr c10::Float8_e4m3fnuz quiet_NaN() { + return c10::Float8_e4m3fnuz(0x80, c10::Float8_e4m3fnuz::from_bits()); + } + static constexpr c10::Float8_e4m3fnuz denorm_min() { + return c10::Float8_e4m3fnuz(0x01, c10::Float8_e4m3fnuz::from_bits()); + } +}; + +} // namespace std + +C10_CLANG_DIAGNOSTIC_POP() diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/Float8_e4m3fnuz.h b/MLPY/Lib/site-packages/torch/include/c10/util/Float8_e4m3fnuz.h new file mode 100644 index 0000000000000000000000000000000000000000..c329024b81d43f8a6fcc21ca3952514b527d6cac --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/Float8_e4m3fnuz.h @@ -0,0 +1,136 @@ +#pragma once + +/// Defines the Float8_e4m3fnuz type (8-bit floating-point) including +/// conversions to standard C types and basic arithmetic operations. Note that +/// arithmetic operations are implemented by converting to floating point and +/// performing the operation in float32. +/// Binary configuration remains the same as Float8_e4m3fn: +/// s eeee mmm +/// 1 sign bit +/// 4 exponent bits +/// 3 mantissa bits +/// The key differences versus Float8_e4m3fn are: +/// bias = 8 +/// no infinities or negative zero +/// NaN only when sign bit is 1, rest all 0s +/// +/// Implementation based on the paper https://arxiv.org/pdf/2206.02915.pdf and +/// the existing Float8_e4m3fn implementation. 
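+///
+/// Illustrative behaviour sketch (host-side example; the values follow from
+/// the conversion routine below):
+///
+///   c10::Float8_e4m3fnuz z(-0.0f);      // no negative zero: stored as +0
+///   c10::Float8_e4m3fnuz big(1000.0f);  // above the max of 240: NaN (0x80)
+///   bool nan = big.isnan();             // true; the format has no infinities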
+ +#include +#include +#include +#include + +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#include +#elif !defined(__OPENCL_VERSION__) +#include +#include +#endif + +#include +#include + +namespace c10 { + +namespace detail { + +/* + * Convert a 32-bit floating-point number in IEEE single-precision format to a + * 8-bit floating-point number in fp8 E4M3FNUZ format, in bit representation. + */ +inline C10_HOST_DEVICE uint8_t fp8e4m3fnuz_from_fp32_value(float f) { + /* + * Binary representation of 256.0f, which is the first value not representable + * (i.e. the first value which would overflow in to the sign bit, resulting in + * a NaN) in fp8e4m3fnuz range: + * 1 0000 000 - fp8e4m3fnuz + * 0 10000111 00000000000000000000000 - fp32 + */ + constexpr uint32_t fnuz_max = UINT32_C(0x87) << 23; + + /* + * A mask for converting fp32 numbers lower than fp8e4m3fnuz normal range + * into denorm representation + * magic number: ((127 - 8) + (23 - 3) + 1) + */ + constexpr uint32_t denorm_mask = UINT32_C(0x8C) << 23; + + uint32_t f_bits = fp32_to_bits(f); + + uint32_t result = 0u; + + /* + * Extract the sign of the input number into the high bit of the 32-bit word: + * + * +---+----------------------------------+ + * | S |0000000 00000000 00000000 00000000| + * +---+----------------------------------+ + * Bits 31 0-31 + */ + const uint32_t sign = f_bits & UINT32_C(0x80000000); + + /* + * Set sign bit to 0 + */ + f_bits ^= sign; + + if (f_bits >= fnuz_max) { + // NaN -- sign bit set to 1, rest 0s. + return 0x80; + } + + if (f_bits < (UINT32_C(0x78) << 23) /* 2^-7 in float32 */) { + // Input exponent is less than -7, the smallest e4m3fnuz exponent, so the + // number will become subnormal. + f_bits = fp32_to_bits(fp32_from_bits(f_bits) + fp32_from_bits(denorm_mask)); + result = static_cast(f_bits - denorm_mask); + if (result == 0) { + // fnuz types don't have negative zero. + return 0; + } + } else { + // resulting mantissa is odd + uint8_t mant_odd = (f_bits >> 20) & 1; + + // update exponent, rounding bias part 1 + f_bits += ((uint32_t)(8 - 127) << 23) + 0x7FFFF; + + // rounding bias part 2 + f_bits += mant_odd; + + // take the bits! 
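+    // Same packing as the e4m3fn variant, but with the fnuz bias of 8 already
+    // folded into the exponent adjustment above: bits 20-26 now hold the
+    // mantissa and 4-bit exponent, so the payload is the low 8 bits of
+    // (f_bits >> 20); the sign is OR-ed in below.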
+ result = static_cast(f_bits >> 20); + } + + result |= sign >> 24; + return result; +} + +} // namespace detail + +struct alignas(1) Float8_e4m3fnuz { + uint8_t x; + + struct from_bits_t {}; + C10_HOST_DEVICE static constexpr from_bits_t from_bits() { + return from_bits_t(); + } + + Float8_e4m3fnuz() = default; + + constexpr C10_HOST_DEVICE Float8_e4m3fnuz(uint8_t bits, from_bits_t) + : x(bits){}; + inline C10_HOST_DEVICE Float8_e4m3fnuz(float value); + inline C10_HOST_DEVICE operator float() const; + inline C10_HOST_DEVICE bool isnan() const; +}; + +C10_API std::ostream& operator<<( + std::ostream& out, + const Float8_e4m3fnuz& value); + +} // namespace c10 + +#include // IWYU pragma: keep diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/Float8_e5m2-inl.h b/MLPY/Lib/site-packages/torch/include/c10/util/Float8_e5m2-inl.h new file mode 100644 index 0000000000000000000000000000000000000000..4d242247823f43ee4b1a0536cdff8bf599a00143 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/Float8_e5m2-inl.h @@ -0,0 +1,283 @@ +#pragma once + +#include +#include +#include + +C10_CLANG_DIAGNOSTIC_PUSH() +#if C10_CLANG_HAS_WARNING("-Wimplicit-int-float-conversion") +C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion") +#endif + +#define EXP_WIDTH_FP8 5 +#define MAN_WIDTH_FP8 2 +#define EXP_BIAS_FP8 15 + +namespace c10 { + +/// Constructors + +inline C10_HOST_DEVICE Float8_e5m2::Float8_e5m2(float value) + : x(detail::fp8e5m2_from_fp32_value(value)) {} + +/// Implicit conversions + +inline C10_HOST_DEVICE Float8_e5m2::operator float() const { + return detail::fp8e5m2_to_fp32_value(x); +} + +/// Special values helpers + +inline C10_HOST_DEVICE bool Float8_e5m2::isnan() const { + return (x & 0b01111111) > 0b01111100; +} + +inline C10_HOST_DEVICE bool Float8_e5m2::isinf() const { + return (x & 0b01111111) == 0b01111100; +} + +/// Arithmetic + +inline C10_HOST_DEVICE Float8_e5m2 +operator+(const Float8_e5m2& a, const Float8_e5m2& b) { + return static_cast(a) + static_cast(b); +} + +inline C10_HOST_DEVICE Float8_e5m2 +operator-(const Float8_e5m2& a, const Float8_e5m2& b) { + return static_cast(a) - static_cast(b); +} + +inline C10_HOST_DEVICE Float8_e5m2 +operator*(const Float8_e5m2& a, const Float8_e5m2& b) { + return static_cast(a) * static_cast(b); +} + +inline C10_HOST_DEVICE Float8_e5m2 operator/( + const Float8_e5m2& a, + const Float8_e5m2& b) __ubsan_ignore_float_divide_by_zero__ { + return static_cast(a) / static_cast(b); +} + +inline C10_HOST_DEVICE Float8_e5m2 operator-(const Float8_e5m2& a) { + return -static_cast(a); +} + +inline C10_HOST_DEVICE Float8_e5m2& operator+=( + Float8_e5m2& a, + const Float8_e5m2& b) { + a = a + b; + return a; +} + +inline C10_HOST_DEVICE Float8_e5m2& operator-=( + Float8_e5m2& a, + const Float8_e5m2& b) { + a = a - b; + return a; +} + +inline C10_HOST_DEVICE Float8_e5m2& operator*=( + Float8_e5m2& a, + const Float8_e5m2& b) { + a = a * b; + return a; +} + +inline C10_HOST_DEVICE Float8_e5m2& operator/=( + Float8_e5m2& a, + const Float8_e5m2& b) { + a = a / b; + return a; +} + +/// Arithmetic with floats + +inline C10_HOST_DEVICE float operator+(Float8_e5m2 a, float b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE float operator-(Float8_e5m2 a, float b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE float operator*(Float8_e5m2 a, float b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE float operator/(Float8_e5m2 a, float b) + __ubsan_ignore_float_divide_by_zero__ { + return static_cast(a) / b; +} + 
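+// Illustrative note (host-side sketch): unlike Float8_e4m3fn, the e5m2 format
+// reserves encodings for infinities, so finite inputs above the maximum
+// representable value (57344) convert to +/-infinity rather than NaN:
+//
+//   c10::Float8_e5m2 big(1.0e6f);
+//   bool inf = big.isinf();  // true
+//   bool nan = big.isnan();  // false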
+inline C10_HOST_DEVICE float operator+(float a, Float8_e5m2 b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE float operator-(float a, Float8_e5m2 b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE float operator*(float a, Float8_e5m2 b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE float operator/(float a, Float8_e5m2 b) + __ubsan_ignore_float_divide_by_zero__ { + return a / static_cast(b); +} + +inline C10_HOST_DEVICE float& operator+=(float& a, const Float8_e5m2& b) { + return a += static_cast(b); +} +inline C10_HOST_DEVICE float& operator-=(float& a, const Float8_e5m2& b) { + return a -= static_cast(b); +} +inline C10_HOST_DEVICE float& operator*=(float& a, const Float8_e5m2& b) { + return a *= static_cast(b); +} +inline C10_HOST_DEVICE float& operator/=(float& a, const Float8_e5m2& b) { + return a /= static_cast(b); +} + +/// Arithmetic with doubles + +inline C10_HOST_DEVICE double operator+(Float8_e5m2 a, double b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE double operator-(Float8_e5m2 a, double b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE double operator*(Float8_e5m2 a, double b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE double operator/(Float8_e5m2 a, double b) + __ubsan_ignore_float_divide_by_zero__ { + return static_cast(a) / b; +} + +inline C10_HOST_DEVICE double operator+(double a, Float8_e5m2 b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE double operator-(double a, Float8_e5m2 b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE double operator*(double a, Float8_e5m2 b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE double operator/(double a, Float8_e5m2 b) + __ubsan_ignore_float_divide_by_zero__ { + return a / static_cast(b); +} + +/// Arithmetic with ints + +inline C10_HOST_DEVICE Float8_e5m2 operator+(Float8_e5m2 a, int b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE Float8_e5m2 operator-(Float8_e5m2 a, int b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE Float8_e5m2 operator*(Float8_e5m2 a, int b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE Float8_e5m2 operator/(Float8_e5m2 a, int b) { + return a / static_cast(b); +} + +inline C10_HOST_DEVICE Float8_e5m2 operator+(int a, Float8_e5m2 b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE Float8_e5m2 operator-(int a, Float8_e5m2 b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE Float8_e5m2 operator*(int a, Float8_e5m2 b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE Float8_e5m2 operator/(int a, Float8_e5m2 b) { + return static_cast(a) / b; +} + +//// Arithmetic with int64_t + +inline C10_HOST_DEVICE Float8_e5m2 operator+(Float8_e5m2 a, int64_t b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE Float8_e5m2 operator-(Float8_e5m2 a, int64_t b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE Float8_e5m2 operator*(Float8_e5m2 a, int64_t b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE Float8_e5m2 operator/(Float8_e5m2 a, int64_t b) { + return a / static_cast(b); +} + +inline C10_HOST_DEVICE Float8_e5m2 operator+(int64_t a, Float8_e5m2 b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE Float8_e5m2 operator-(int64_t a, Float8_e5m2 b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE Float8_e5m2 operator*(int64_t a, Float8_e5m2 b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE Float8_e5m2 operator/(int64_t a, Float8_e5m2 b) { + return static_cast(a) / b; +} + +/// NOTE: we do not define 
comparisons directly and instead rely on the implicit +/// conversion from c10::Float8_e5m2 to float. + +} // namespace c10 + +namespace std { + +template <> +class numeric_limits { + public: + static constexpr bool is_signed = true; + static constexpr bool is_integer = false; + static constexpr bool is_specialized = true; + static constexpr bool is_exact = false; + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = false; + static constexpr bool has_signaling_NaN = false; + static constexpr auto has_denorm = true; + static constexpr auto has_denorm_loss = true; + static constexpr auto round_style = numeric_limits::round_style; + static constexpr bool is_iec559 = false; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + static constexpr int digits = 3; + static constexpr int digits10 = 0; + static constexpr int max_digits10 = 2; + static constexpr int radix = 2; + static constexpr int min_exponent = -13; + static constexpr int min_exponent10 = -4; + static constexpr int max_exponent = 16; + static constexpr int max_exponent10 = 4; + static constexpr auto traps = numeric_limits::traps; + static constexpr auto tinyness_before = + numeric_limits::tinyness_before; + + static constexpr c10::Float8_e5m2 min() { + return c10::Float8_e5m2(0x4, c10::Float8_e5m2::from_bits()); + } + static constexpr c10::Float8_e5m2 max() { + return c10::Float8_e5m2(0x7B, c10::Float8_e5m2::from_bits()); + } + static constexpr c10::Float8_e5m2 lowest() { + return c10::Float8_e5m2(0xFB, c10::Float8_e5m2::from_bits()); + } + static constexpr c10::Float8_e5m2 epsilon() { + return c10::Float8_e5m2(0x34, c10::Float8_e5m2::from_bits()); + } + static constexpr c10::Float8_e5m2 round_error() { + return c10::Float8_e5m2(0x38, c10::Float8_e5m2::from_bits()); + } + static constexpr c10::Float8_e5m2 infinity() { + return c10::Float8_e5m2(0x7C, c10::Float8_e5m2::from_bits()); + } + static constexpr c10::Float8_e5m2 denorm_min() { + return c10::Float8_e5m2(0x01, c10::Float8_e5m2::from_bits()); + } +}; + +} // namespace std + +C10_CLANG_DIAGNOSTIC_POP() diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/Float8_e5m2.h b/MLPY/Lib/site-packages/torch/include/c10/util/Float8_e5m2.h new file mode 100644 index 0000000000000000000000000000000000000000..da2eec186a535dd51bae3670a73f04fed0e93c47 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/Float8_e5m2.h @@ -0,0 +1,143 @@ +#pragma once + +/// Defines the Float8_e5m2 type (8-bit floating-point) including conversions +/// to standard C types and basic arithmetic operations. Note that arithmetic +/// operations are implemented by converting to floating point and +/// performing the operation in float32. +/// Binary configuration: +/// s eeeee mm +/// 1 sign bit +/// 5 exponent bits +/// 2 mantissa bits +/// bias = 15 +/// +/// Implementation based on the paper https://arxiv.org/pdf/2209.05433.pdf +/// and inspired by Half implementation from pytorch/c10/util/Half.h + +#include + +namespace c10 { + +namespace detail { + +/* + * Convert a 8-bit floating-point number in fp8 E5M2 format, in bit + * representation, to a 32-bit floating-point number in IEEE single-precision + * format, in bit representation. + * + * @note The implementation doesn't use any floating-point operations. 
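+ *
+ * The conversion below exploits the fact that fp8 E5M2 shares its exponent
+ * width (5 bits) and bias (15) with IEEE binary16: shifting the byte into the
+ * upper 8 bits of a 16-bit word produces a valid fp16 encoding of the same
+ * value (including denormals, infinities and NaNs), which is then widened
+ * with the existing fp16 -> fp32 helper.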
+ */ +inline C10_HOST_DEVICE float fp8e5m2_to_fp32_value(uint8_t input) { + /* + * Extend the fp8 E5M2 number to 32 bits and shift to the + * upper part of the 32-bit word: + * +---+----+---+-----------------------------+ + * | S |EEEEE|MM|0000 0000 0000 0000 0000 0000| + * +---+----+---+-----------------------------+ + * Bits 31 26-30 24-25 0-23 + * + * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 + * - zero bits. + */ + uint16_t half_representation = input; + half_representation <<= 8; + return fp16_ieee_to_fp32_value(half_representation); +} + +/* + * Convert a 32-bit floating-point number in IEEE single-precision format to a + * 8-bit floating-point number in fp8 E5M2 format, in bit representation. + */ +inline C10_HOST_DEVICE uint8_t fp8e5m2_from_fp32_value(float f) { + /* + * Binary representation of fp32 infinity + * 0 11111111 00000000000000000000000 + */ + constexpr uint32_t fp32_inf = UINT32_C(255) << 23; + + /* + * Binary representation of 65536.0f, which is the first value + * not representable in fp8e5m2 range: + * 0 11111 00 - fp8e5m2 + * 0 10001111 00000000000000000000000 - fp32 + */ + constexpr uint32_t fp8_max = UINT32_C(143) << 23; + + /* + * A mask for converting fp32 numbers lower than fp8e5m2 normal range + * into denorm representation + * magic number: ((127 - 15) + (23 - 2) + 1) + */ + constexpr uint32_t denorm_mask = UINT32_C(134) << 23; + + uint32_t f_bits = fp32_to_bits(f); + uint8_t result = 0u; + + /* + * Extract the sign of the input number into the high bit of the 32-bit word: + * + * +---+----------------------------------+ + * | S |0000000 00000000 00000000 00000000| + * +---+----------------------------------+ + * Bits 31 0-31 + */ + const uint32_t sign = f_bits & UINT32_C(0x80000000); + + /* + * Set sign bit to 0 + */ + f_bits ^= sign; + + if (f_bits >= fp8_max) { + // NaN - all exponent and mantissa bits set to 1 + result = f_bits > fp32_inf ? UINT8_C(0x7F) : UINT8_C(0x7C); + } else { + if (f_bits < (UINT32_C(113) << 23)) { + // Input number is smaller than 2^(-14), which is the smallest + // fp8e5m2 normal number + f_bits = + fp32_to_bits(fp32_from_bits(f_bits) + fp32_from_bits(denorm_mask)); + result = static_cast(f_bits - denorm_mask); + } else { + // resulting mantissa is odd + uint32_t mant_odd = (f_bits >> 21) & 1; + + // update exponent, rounding bias part 1 + f_bits += ((uint32_t)(15 - 127) << 23) + 0xFFFFF; + + // rounding bias part 2 + f_bits += mant_odd; + + // take the bits! 
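+      // E5M2 keeps only 2 mantissa bits, so the packed payload starts one bit
+      // higher than in the E4M3 case: bits 21-22 hold the mantissa and bits
+      // 23-27 the 5-bit exponent, hence the shift by 21 below.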
+ result = static_cast(f_bits >> 21); + } + } + + result |= static_cast(sign >> 24); + return result; +} + +} // namespace detail + +struct alignas(1) Float8_e5m2 { + uint8_t x; + + struct from_bits_t {}; + C10_HOST_DEVICE static constexpr from_bits_t from_bits() { + return from_bits_t(); + } + + Float8_e5m2() = default; + + constexpr C10_HOST_DEVICE Float8_e5m2(uint8_t bits, from_bits_t) : x(bits) {} + inline C10_HOST_DEVICE Float8_e5m2(float value); + inline C10_HOST_DEVICE operator float() const; + inline C10_HOST_DEVICE bool isnan() const; + inline C10_HOST_DEVICE bool isinf() const; +}; + +C10_API std::ostream& operator<<(std::ostream& out, const Float8_e5m2& value); + +} // namespace c10 + +#include // IWYU pragma: keep diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/Float8_e5m2fnuz-inl.h b/MLPY/Lib/site-packages/torch/include/c10/util/Float8_e5m2fnuz-inl.h new file mode 100644 index 0000000000000000000000000000000000000000..300107f14c05b8ebc30ceba2b0cfea856ed2409d --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/Float8_e5m2fnuz-inl.h @@ -0,0 +1,280 @@ +#pragma once + +#include +#include +#include +#include + +C10_CLANG_DIAGNOSTIC_PUSH() +#if C10_CLANG_HAS_WARNING("-Wimplicit-int-float-conversion") +C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion") +#endif + +namespace c10 { + +/// Constructors + +inline C10_HOST_DEVICE Float8_e5m2fnuz::Float8_e5m2fnuz(float value) + : x(detail::fp8e5m2fnuz_from_fp32_value(value)) {} + +/// Implicit conversions + +inline C10_HOST_DEVICE Float8_e5m2fnuz::operator float() const { + return detail::fp8_fnuz_to_fp32_value<5, 2>(x); +} + +/// Special values helpers + +inline C10_HOST_DEVICE bool Float8_e5m2fnuz::isnan() const { + return x == 0b10000000; +} + +inline C10_HOST_DEVICE bool Float8_e5m2fnuz::isinf() const { + return false; +} + +/// Arithmetic + +inline C10_HOST_DEVICE Float8_e5m2fnuz +operator+(const Float8_e5m2fnuz& a, const Float8_e5m2fnuz& b) { + return static_cast(a) + static_cast(b); +} + +inline C10_HOST_DEVICE Float8_e5m2fnuz +operator-(const Float8_e5m2fnuz& a, const Float8_e5m2fnuz& b) { + return static_cast(a) - static_cast(b); +} + +inline C10_HOST_DEVICE Float8_e5m2fnuz +operator*(const Float8_e5m2fnuz& a, const Float8_e5m2fnuz& b) { + return static_cast(a) * static_cast(b); +} + +inline C10_HOST_DEVICE Float8_e5m2fnuz operator/( + const Float8_e5m2fnuz& a, + const Float8_e5m2fnuz& b) __ubsan_ignore_float_divide_by_zero__ { + return static_cast(a) / static_cast(b); +} + +inline C10_HOST_DEVICE Float8_e5m2fnuz operator-(const Float8_e5m2fnuz& a) { + return -static_cast(a); +} + +inline C10_HOST_DEVICE Float8_e5m2fnuz& operator+=( + Float8_e5m2fnuz& a, + const Float8_e5m2fnuz& b) { + a = a + b; + return a; +} + +inline C10_HOST_DEVICE Float8_e5m2fnuz& operator-=( + Float8_e5m2fnuz& a, + const Float8_e5m2fnuz& b) { + a = a - b; + return a; +} + +inline C10_HOST_DEVICE Float8_e5m2fnuz& operator*=( + Float8_e5m2fnuz& a, + const Float8_e5m2fnuz& b) { + a = a * b; + return a; +} + +inline C10_HOST_DEVICE Float8_e5m2fnuz& operator/=( + Float8_e5m2fnuz& a, + const Float8_e5m2fnuz& b) { + a = a / b; + return a; +} + +/// Arithmetic with floats + +inline C10_HOST_DEVICE float operator+(Float8_e5m2fnuz a, float b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE float operator-(Float8_e5m2fnuz a, float b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE float operator*(Float8_e5m2fnuz a, float b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE float operator/(Float8_e5m2fnuz 
a, float b) + __ubsan_ignore_float_divide_by_zero__ { + return static_cast(a) / b; +} + +inline C10_HOST_DEVICE float operator+(float a, Float8_e5m2fnuz b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE float operator-(float a, Float8_e5m2fnuz b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE float operator*(float a, Float8_e5m2fnuz b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE float operator/(float a, Float8_e5m2fnuz b) + __ubsan_ignore_float_divide_by_zero__ { + return a / static_cast(b); +} + +inline C10_HOST_DEVICE float& operator+=(float& a, const Float8_e5m2fnuz& b) { + return a += static_cast(b); +} +inline C10_HOST_DEVICE float& operator-=(float& a, const Float8_e5m2fnuz& b) { + return a -= static_cast(b); +} +inline C10_HOST_DEVICE float& operator*=(float& a, const Float8_e5m2fnuz& b) { + return a *= static_cast(b); +} +inline C10_HOST_DEVICE float& operator/=(float& a, const Float8_e5m2fnuz& b) { + return a /= static_cast(b); +} + +/// Arithmetic with doubles + +inline C10_HOST_DEVICE double operator+(Float8_e5m2fnuz a, double b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE double operator-(Float8_e5m2fnuz a, double b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE double operator*(Float8_e5m2fnuz a, double b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE double operator/(Float8_e5m2fnuz a, double b) + __ubsan_ignore_float_divide_by_zero__ { + return static_cast(a) / b; +} + +inline C10_HOST_DEVICE double operator+(double a, Float8_e5m2fnuz b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE double operator-(double a, Float8_e5m2fnuz b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE double operator*(double a, Float8_e5m2fnuz b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE double operator/(double a, Float8_e5m2fnuz b) + __ubsan_ignore_float_divide_by_zero__ { + return a / static_cast(b); +} + +/// Arithmetic with ints + +inline C10_HOST_DEVICE Float8_e5m2fnuz operator+(Float8_e5m2fnuz a, int b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE Float8_e5m2fnuz operator-(Float8_e5m2fnuz a, int b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE Float8_e5m2fnuz operator*(Float8_e5m2fnuz a, int b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE Float8_e5m2fnuz operator/(Float8_e5m2fnuz a, int b) { + return a / static_cast(b); +} + +inline C10_HOST_DEVICE Float8_e5m2fnuz operator+(int a, Float8_e5m2fnuz b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE Float8_e5m2fnuz operator-(int a, Float8_e5m2fnuz b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE Float8_e5m2fnuz operator*(int a, Float8_e5m2fnuz b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE Float8_e5m2fnuz operator/(int a, Float8_e5m2fnuz b) { + return static_cast(a) / b; +} + +//// Arithmetic with int64_t + +inline C10_HOST_DEVICE Float8_e5m2fnuz operator+(Float8_e5m2fnuz a, int64_t b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE Float8_e5m2fnuz operator-(Float8_e5m2fnuz a, int64_t b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE Float8_e5m2fnuz operator*(Float8_e5m2fnuz a, int64_t b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE Float8_e5m2fnuz operator/(Float8_e5m2fnuz a, int64_t b) { + return a / static_cast(b); +} + +inline C10_HOST_DEVICE Float8_e5m2fnuz operator+(int64_t a, Float8_e5m2fnuz b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE Float8_e5m2fnuz operator-(int64_t a, Float8_e5m2fnuz b) { + return 
static_cast(a) - b; +} +inline C10_HOST_DEVICE Float8_e5m2fnuz operator*(int64_t a, Float8_e5m2fnuz b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE Float8_e5m2fnuz operator/(int64_t a, Float8_e5m2fnuz b) { + return static_cast(a) / b; +} + +/// NOTE: we do not define comparisons directly and instead rely on the implicit +/// conversion from c10::Float8_e5m2fnuz to float. + +} // namespace c10 + +namespace std { + +template <> +class numeric_limits { + public: + static constexpr bool is_signed = true; + static constexpr bool is_integer = false; + static constexpr bool is_specialized = true; + static constexpr bool is_exact = false; + static constexpr bool has_infinity = false; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = false; + static constexpr auto has_denorm = true; + static constexpr auto has_denorm_loss = true; + static constexpr auto round_style = numeric_limits::round_style; + static constexpr bool is_iec559 = false; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + static constexpr int digits = 3; + static constexpr int digits10 = 0; + static constexpr int max_digits10 = 2; + static constexpr int radix = 2; + static constexpr int min_exponent = -14; + static constexpr int min_exponent10 = -4; + static constexpr int max_exponent = 16; + static constexpr int max_exponent10 = 4; + static constexpr auto traps = numeric_limits::traps; + static constexpr auto tinyness_before = + numeric_limits::tinyness_before; + + static constexpr c10::Float8_e5m2fnuz min() { + return c10::Float8_e5m2fnuz(0x04, c10::Float8_e5m2fnuz::from_bits()); + } + static constexpr c10::Float8_e5m2fnuz max() { + return c10::Float8_e5m2fnuz(0x7F, c10::Float8_e5m2fnuz::from_bits()); + } + static constexpr c10::Float8_e5m2fnuz lowest() { + return c10::Float8_e5m2fnuz(0xFF, c10::Float8_e5m2fnuz::from_bits()); + } + static constexpr c10::Float8_e5m2fnuz epsilon() { + return c10::Float8_e5m2fnuz(0x34, c10::Float8_e5m2fnuz::from_bits()); + } + static constexpr c10::Float8_e5m2fnuz round_error() { + return c10::Float8_e5m2fnuz(0x38, c10::Float8_e5m2fnuz::from_bits()); + } + static constexpr c10::Float8_e5m2fnuz infinity() { + return c10::Float8_e5m2fnuz(0x80, c10::Float8_e5m2fnuz::from_bits()); + } + static constexpr c10::Float8_e5m2fnuz denorm_min() { + return c10::Float8_e5m2fnuz(0x01, c10::Float8_e5m2fnuz::from_bits()); + } +}; + +} // namespace std + +C10_CLANG_DIAGNOSTIC_POP() diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/Float8_e5m2fnuz.h b/MLPY/Lib/site-packages/torch/include/c10/util/Float8_e5m2fnuz.h new file mode 100644 index 0000000000000000000000000000000000000000..b7d8e25ab059e844d4d1c1c724e7c5fe088cca67 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/Float8_e5m2fnuz.h @@ -0,0 +1,135 @@ +#pragma once + +/// Defines the Float8_e5m2fnuz type (8-bit floating-point) including +/// conversions to standard C types and basic arithmetic operations. Note that +/// arithmetic operations are implemented by converting to floating point and +/// performing the operation in float32. +/// Binary configuration remains the same as e5m2: +/// s eeeee mm +/// 1 sign bit +/// 5 exponent bits +/// 2 mantissa bits +/// The key differences that e5m2fnuz brings are: +/// bias = 16 +/// no infinities or negative zero +/// NaN only when sign bit is 1, rest all 0s +/// +/// Implementation based on the paper https://arxiv.org/pdf/2206.02915.pdf and +/// the existing Float8_e4m3fn implementation. 
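+///
+/// Illustrative usage (a sketch only; it assumes nothing beyond the
+/// declarations in this header and the operators in Float8_e5m2fnuz-inl.h):
+///
+///   c10::Float8_e5m2fnuz v(0.5f);   // constructed by rounding a float32
+///   float f = v * 2.0f;             // arithmetic is carried out in float32
+///   auto nan = c10::Float8_e5m2fnuz(0x80, c10::Float8_e5m2fnuz::from_bits());
+///   bool is_nan = nan.isnan();      // 0x80 is the single NaN encoding
+///   bool no_inf = nan.isinf();      // always false: fnuz has no infinities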
+ +#include +#include +#include + +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#include +#elif !defined(__OPENCL_VERSION__) +#include +#include +#endif + +#include +#include + +namespace c10 { + +namespace detail { + +/* + * Convert a 32-bit floating-point number in IEEE single-precision format to a + * 8-bit floating-point number in fp8 E5M2 format, in bit representation. + */ +inline C10_HOST_DEVICE uint8_t fp8e5m2fnuz_from_fp32_value(float f) { + /* + * Binary representation of 65536.0f, which is the first value not + * representable (i.e. the first value which would overflow in to the sign + * bit, resulting in a NaN) in fp8e4m3fnuz range: + * 1 00000 00 - fp8e5m2fnuz + * 0 10001111 00000000000000000000000 - fp32 + */ + constexpr uint32_t fnuz_max = UINT32_C(0x8F) << 23; + + /* + * A mask for converting fp32 numbers lower than fp8e5m2fnuz normal range + * into denormalized representation. + * magic number: ((127 - 16) + (23 - 2) + 1) + */ + constexpr uint32_t denorm_mask = UINT32_C(0x85) << 23; + + uint32_t f_bits = fp32_to_bits(f); + uint32_t result = 0u; + + /* + * Extract the sign of the input number into the high bit of the 32-bit word: + * + * +---+----------------------------------+ + * | S |0000000 00000000 00000000 00000000| + * +---+----------------------------------+ + * Bits 31 0-31 + */ + const uint32_t sign = f_bits & UINT32_C(0x80000000); + + /* + * Set sign bit to 0 + */ + f_bits ^= sign; + + if (f_bits >= fnuz_max) { + // NaN -- sign bit set to 1, rest 0s + return 0x80; + } + + if (f_bits < (UINT32_C(0x70) << 23) /* 2^-15 in float32 */) { + // Input exponent is less than -15, the smallest e5m2fnuz exponent, so the + // number will become subnormal. + f_bits = fp32_to_bits(fp32_from_bits(f_bits) + fp32_from_bits(denorm_mask)); + result = static_cast(f_bits - denorm_mask); + if (result == 0) { + // fnuz types don't have negative zero. + return 0; + } + } else { + // resulting mantissa is odd + uint8_t mant_odd = (f_bits >> 21) & 1; + + // update exponent, rounding bias part 1 + f_bits += ((uint32_t)(16 - 127) << 23) + 0xFFFFF; + + // rounding bias part 2 + f_bits += mant_odd; + + // take the bits! + result = static_cast(f_bits >> 21); + } + + result |= sign >> 24; + return result; +} + +} // namespace detail + +struct alignas(1) Float8_e5m2fnuz { + uint8_t x; + + struct from_bits_t {}; + C10_HOST_DEVICE static constexpr from_bits_t from_bits() { + return from_bits_t(); + } + + Float8_e5m2fnuz() = default; + + constexpr C10_HOST_DEVICE Float8_e5m2fnuz(uint8_t bits, from_bits_t) + : x(bits) {} + inline C10_HOST_DEVICE Float8_e5m2fnuz(float value); + inline C10_HOST_DEVICE operator float() const; + inline C10_HOST_DEVICE bool isnan() const; + inline C10_HOST_DEVICE bool isinf() const; +}; + +C10_API std::ostream& operator<<( + std::ostream& out, + const Float8_e5m2fnuz& value); + +} // namespace c10 + +#include // IWYU pragma: keep diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/Float8_fnuz_cvt.h b/MLPY/Lib/site-packages/torch/include/c10/util/Float8_fnuz_cvt.h new file mode 100644 index 0000000000000000000000000000000000000000..1abf3f1c4122120b29882b0596df529720577546 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/Float8_fnuz_cvt.h @@ -0,0 +1,58 @@ +#pragma once + +#include + +#include + +namespace c10::detail { + +/* + * Convert a 8-bit floating-point number in either f8 E4M3FNUZ or bf8 E5M2FNUZ + * format, in bit representation, to a 32-bit floating-point number. 
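+ *
+ * The template parameters we and wm used below are the exponent and mantissa
+ * widths of the source format (4 and 3 for f8 E4M3FNUZ, 5 and 2 for bf8
+ * E5M2FNUZ), which is what the static_assert at the top of the function
+ * enforces.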
+ */ +template +inline C10_HOST_DEVICE float fp8_fnuz_to_fp32_value(uint8_t x) { + static_assert((we == 4 && wm == 3) || (we == 5 && wm == 2)); + constexpr uint32_t weo = 8; + constexpr uint32_t wmo = 23; + + if (x == 0) { + return 0; + } + + if (x == 0x80) { + constexpr uint32_t ifNaN = 0x7F800001; + return fp32_from_bits(ifNaN); + } + + uint32_t mantissa = x & ((1 << wm) - 1); + uint32_t exponent = (x & 0x7F) >> wm; + + // subnormal input + if (exponent == 0) { + // guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + uint32_t renorm_shift = __clz(mantissa); +#elif defined(_MSC_VER) + unsigned long nonsign_bsr; + _BitScanReverse(&nonsign_bsr, (unsigned long)mantissa); + uint32_t renorm_shift = (uint32_t)nonsign_bsr ^ 31; +#else + uint32_t renorm_shift = __builtin_clz(mantissa); +#endif + uint32_t sh = 1 + renorm_shift - (32 - wm); + mantissa <<= sh; + exponent += 1 - sh; + mantissa &= ((1 << wm) - 1); + } + + const uint32_t exp_low_cutoff = (1 << (weo - 1)) - (1 << (we - 1)); + exponent += exp_low_cutoff - 1; + mantissa <<= wmo - wm; + + uint32_t sign = x >> 7; + uint32_t retval = (sign << 31) | (exponent << 23) | mantissa; + return fp32_from_bits(retval); +} + +} // namespace c10::detail diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/FunctionRef.h b/MLPY/Lib/site-packages/torch/include/c10/util/FunctionRef.h new file mode 100644 index 0000000000000000000000000000000000000000..aac5e11d99e13bd33431e25e68158a08a8de1223 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/FunctionRef.h @@ -0,0 +1,73 @@ +//===- llvm/ADT/STLExtras.h - Useful STL related functions ------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains some templates that are useful if you are working with the +// STL at all. +// +// No library is required when using these functions. +// +//===----------------------------------------------------------------------===// + +// c10: modified from llvm::function_ref +// c10: added more SFINAE to enable use in overloaded functions + +#pragma once + +#include +#include +#include + +namespace c10 { + +/// An efficient, type-erasing, non-owning reference to a callable. This is +/// intended for use as the type of a function parameter that is not used +/// after the function in question returns. +/// +/// This class does not own the callable, so it is not in general safe to store +/// a function_ref. +template +class function_ref; + +template +class function_ref { + Ret (*callback)(intptr_t callable, Params... params) = nullptr; + intptr_t callable{}; + + template + static Ret callback_fn(intptr_t callable, Params... params) { + return (*reinterpret_cast(callable))( + std::forward(params)...); + } + + public: + function_ref() = default; + function_ref(std::nullptr_t) {} + + template + function_ref( + // NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward) + Callable&& callable, + std::enable_if_t< + !std::is_same_v, function_ref>>* = + nullptr, + std::enable_if_t, + Ret>>* = nullptr) + : callback(callback_fn>), + callable(reinterpret_cast(&callable)) {} + + Ret operator()(Params... 
params) const { + return callback(callable, std::forward(params)...); + } + + operator bool() const { + return callback; + } +}; + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/Half-inl.h b/MLPY/Lib/site-packages/torch/include/c10/util/Half-inl.h new file mode 100644 index 0000000000000000000000000000000000000000..14194d035739f05808e5b7071df456639ea05306 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/Half-inl.h @@ -0,0 +1,350 @@ +#pragma once + +#include +#include + +#include +#include + +#ifdef __CUDACC__ +#include +#endif + +#ifdef __HIPCC__ +#include +#endif + +#if defined(CL_SYCL_LANGUAGE_VERSION) +#include // for SYCL 1.2.1 +#elif defined(SYCL_LANGUAGE_VERSION) +#include // for SYCL 2020 +#endif + +#if (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && \ + !defined(__APPLE__) +#include +#endif + +C10_CLANG_DIAGNOSTIC_PUSH() +#if C10_CLANG_HAS_WARNING("-Wimplicit-int-float-conversion") +C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion") +#endif + +namespace c10 { + +#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) +/// Constructors +inline Half::Half(float16_t value) : x(detail::fp16_to_bits(value)) {} +inline Half::operator float16_t() const { + return detail::fp16_from_bits(x); +} +#else + +inline C10_HOST_DEVICE Half::Half(float value) + : +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + x(__half_as_short(__float2half(value))) +#elif defined(__SYCL_DEVICE_ONLY__) + x(c10::bit_cast(sycl::half(value))) +#elif (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && \ + !defined(__APPLE__) + x(at::vec::float2half_scalar(value)) +#else + x(detail::fp16_ieee_from_fp32_value(value)) +#endif +{ +} + +/// Implicit conversions + +inline C10_HOST_DEVICE Half::operator float() const { +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return __half2float(*reinterpret_cast(&x)); +#elif defined(__SYCL_DEVICE_ONLY__) + return float(c10::bit_cast(x)); +#elif (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && \ + !defined(__APPLE__) + return at::vec::half2float_scalar(x); +#elif defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) + return detail::native_fp16_to_fp32_value(x); +#else + return detail::fp16_ieee_to_fp32_value(x); +#endif +} + +#endif /* !defined(__aarch64__) || defined(C10_MOBILE) || defined(__CUDACC__) \ + */ + +#if defined(__CUDACC__) || defined(__HIPCC__) +inline C10_HOST_DEVICE Half::Half(const __half& value) { + x = *reinterpret_cast(&value); +} +inline C10_HOST_DEVICE Half::operator __half() const { + return *reinterpret_cast(&x); +} +#endif + +#ifdef SYCL_LANGUAGE_VERSION +inline C10_HOST_DEVICE Half::Half(const sycl::half& value) { + x = *reinterpret_cast(&value); +} +inline C10_HOST_DEVICE Half::operator sycl::half() const { + return *reinterpret_cast(&x); +} +#endif + +// CUDA intrinsics + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)) || \ + (defined(__clang__) && defined(__CUDA__)) +inline __device__ Half __ldg(const Half* ptr) { + return __ldg(reinterpret_cast(ptr)); +} +#endif + +/// Arithmetic + +inline C10_HOST_DEVICE Half operator+(const Half& a, const Half& b) { + return static_cast(a) + static_cast(b); +} + +inline C10_HOST_DEVICE Half operator-(const Half& a, const Half& b) { + return static_cast(a) - static_cast(b); +} + +inline C10_HOST_DEVICE Half operator*(const Half& a, const Half& b) { + return static_cast(a) * static_cast(b); +} + +inline C10_HOST_DEVICE Half 
operator/(const Half& a, const Half& b) + __ubsan_ignore_float_divide_by_zero__ { + return static_cast(a) / static_cast(b); +} + +inline C10_HOST_DEVICE Half operator-(const Half& a) { +#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + defined(__HIP_DEVICE_COMPILE__) + return __hneg(a); +#elif defined(__SYCL_DEVICE_ONLY__) + return -c10::bit_cast(a); +#else + return -static_cast(a); +#endif +} + +inline C10_HOST_DEVICE Half& operator+=(Half& a, const Half& b) { + a = a + b; + return a; +} + +inline C10_HOST_DEVICE Half& operator-=(Half& a, const Half& b) { + a = a - b; + return a; +} + +inline C10_HOST_DEVICE Half& operator*=(Half& a, const Half& b) { + a = a * b; + return a; +} + +inline C10_HOST_DEVICE Half& operator/=(Half& a, const Half& b) { + a = a / b; + return a; +} + +/// Arithmetic with floats + +inline C10_HOST_DEVICE float operator+(Half a, float b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE float operator-(Half a, float b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE float operator*(Half a, float b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE float operator/(Half a, float b) + __ubsan_ignore_float_divide_by_zero__ { + return static_cast(a) / b; +} + +inline C10_HOST_DEVICE float operator+(float a, Half b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE float operator-(float a, Half b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE float operator*(float a, Half b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE float operator/(float a, Half b) + __ubsan_ignore_float_divide_by_zero__ { + return a / static_cast(b); +} + +inline C10_HOST_DEVICE float& operator+=(float& a, const Half& b) { + return a += static_cast(b); +} +inline C10_HOST_DEVICE float& operator-=(float& a, const Half& b) { + return a -= static_cast(b); +} +inline C10_HOST_DEVICE float& operator*=(float& a, const Half& b) { + return a *= static_cast(b); +} +inline C10_HOST_DEVICE float& operator/=(float& a, const Half& b) { + return a /= static_cast(b); +} + +/// Arithmetic with doubles + +inline C10_HOST_DEVICE double operator+(Half a, double b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE double operator-(Half a, double b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE double operator*(Half a, double b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE double operator/(Half a, double b) + __ubsan_ignore_float_divide_by_zero__ { + return static_cast(a) / b; +} + +inline C10_HOST_DEVICE double operator+(double a, Half b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE double operator-(double a, Half b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE double operator*(double a, Half b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE double operator/(double a, Half b) + __ubsan_ignore_float_divide_by_zero__ { + return a / static_cast(b); +} + +/// Arithmetic with ints + +inline C10_HOST_DEVICE Half operator+(Half a, int b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE Half operator-(Half a, int b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE Half operator*(Half a, int b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE Half operator/(Half a, int b) { + return a / static_cast(b); +} + +inline C10_HOST_DEVICE Half operator+(int a, Half b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE Half operator-(int a, Half b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE Half operator*(int a, Half b) { + return static_cast(a) * b; +} 
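+// (Illustrative: given Half h(1.5f), the expression h + 2 widens the int
+// operand to float, computes the sum in float32, and narrows back to Half,
+// as with the other mixed-type overloads in this file.)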
+inline C10_HOST_DEVICE Half operator/(int a, Half b) { + return static_cast(a) / b; +} + +//// Arithmetic with int64_t + +inline C10_HOST_DEVICE Half operator+(Half a, int64_t b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE Half operator-(Half a, int64_t b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE Half operator*(Half a, int64_t b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE Half operator/(Half a, int64_t b) { + return a / static_cast(b); +} + +inline C10_HOST_DEVICE Half operator+(int64_t a, Half b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE Half operator-(int64_t a, Half b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE Half operator*(int64_t a, Half b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE Half operator/(int64_t a, Half b) { + return static_cast(a) / b; +} + +/// NOTE: we do not define comparisons directly and instead rely on the implicit +/// conversion from c10::Half to float. + +} // namespace c10 + +namespace std { + +template <> +class numeric_limits { + public: + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = true; + static constexpr auto has_denorm = numeric_limits::has_denorm; + static constexpr auto has_denorm_loss = + numeric_limits::has_denorm_loss; + static constexpr auto round_style = numeric_limits::round_style; + static constexpr bool is_iec559 = true; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + static constexpr int digits = 11; + static constexpr int digits10 = 3; + static constexpr int max_digits10 = 5; + static constexpr int radix = 2; + static constexpr int min_exponent = -13; + static constexpr int min_exponent10 = -4; + static constexpr int max_exponent = 16; + static constexpr int max_exponent10 = 4; + static constexpr auto traps = numeric_limits::traps; + static constexpr auto tinyness_before = + numeric_limits::tinyness_before; + static constexpr c10::Half min() { + return c10::Half(0x0400, c10::Half::from_bits()); + } + static constexpr c10::Half lowest() { + return c10::Half(0xFBFF, c10::Half::from_bits()); + } + static constexpr c10::Half max() { + return c10::Half(0x7BFF, c10::Half::from_bits()); + } + static constexpr c10::Half epsilon() { + return c10::Half(0x1400, c10::Half::from_bits()); + } + static constexpr c10::Half round_error() { + return c10::Half(0x3800, c10::Half::from_bits()); + } + static constexpr c10::Half infinity() { + return c10::Half(0x7C00, c10::Half::from_bits()); + } + static constexpr c10::Half quiet_NaN() { + return c10::Half(0x7E00, c10::Half::from_bits()); + } + static constexpr c10::Half signaling_NaN() { + return c10::Half(0x7D00, c10::Half::from_bits()); + } + static constexpr c10::Half denorm_min() { + return c10::Half(0x0001, c10::Half::from_bits()); + } +}; + +} // namespace std + +C10_CLANG_DIAGNOSTIC_POP() diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/Half.h b/MLPY/Lib/site-packages/torch/include/c10/util/Half.h new file mode 100644 index 0000000000000000000000000000000000000000..4fee5505ca1dcea55bb6851a66ceaf06e978b159 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/Half.h @@ -0,0 +1,538 @@ +#pragma once + +/// Defines the Half type (half-precision floating-point) including conversions +/// to standard C 
types and basic arithmetic operations. Note that arithmetic +/// operations are implemented by converting to floating point and +/// performing the operation in float32, instead of using CUDA half intrinsics. +/// Most uses of this type within ATen are memory bound, including the +/// element-wise kernels, and the half intrinsics aren't efficient on all GPUs. +/// If you are writing a compute bound kernel, you can use the CUDA half +/// intrinsics directly on the Half type from device code. + +#include +#include +#include +#include +#include +#include + +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#include +#elif !defined(__OPENCL_VERSION__) +#include +#endif + +#ifdef _MSC_VER +#include +#endif + +#include +#include +#include +#include + +#ifdef __CUDACC__ +#include +#endif + +#ifdef __HIPCC__ +#include +#endif + +#if defined(CL_SYCL_LANGUAGE_VERSION) +#include // for SYCL 1.2.1 +#elif defined(SYCL_LANGUAGE_VERSION) +#include // for SYCL 2020 +#endif + +#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) +#include +#endif + +namespace c10 { + +namespace detail { + +/* + * Convert a 16-bit floating-point number in IEEE half-precision format, in bit + * representation, to a 32-bit floating-point number in IEEE single-precision + * format, in bit representation. + * + * @note The implementation doesn't use any floating-point operations. + */ +inline uint32_t fp16_ieee_to_fp32_bits(uint16_t h) { + /* + * Extend the half-precision floating-point number to 32 bits and shift to the + * upper part of the 32-bit word: + * +---+-----+------------+-------------------+ + * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| + * +---+-----+------------+-------------------+ + * Bits 31 26-30 16-25 0-15 + * + * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 + * - zero bits. + */ + const uint32_t w = (uint32_t)h << 16; + /* + * Extract the sign of the input number into the high bit of the 32-bit word: + * + * +---+----------------------------------+ + * | S |0000000 00000000 00000000 00000000| + * +---+----------------------------------+ + * Bits 31 0-31 + */ + const uint32_t sign = w & UINT32_C(0x80000000); + /* + * Extract mantissa and biased exponent of the input number into the bits 0-30 + * of the 32-bit word: + * + * +---+-----+------------+-------------------+ + * | 0 |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| + * +---+-----+------------+-------------------+ + * Bits 30 27-31 17-26 0-16 + */ + const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF); + /* + * Renorm shift is the number of bits to shift mantissa left to make the + * half-precision number normalized. If the initial number is normalized, some + * of its high 6 bits (sign == 0 and 5-bit exponent) equals one. In this case + * renorm_shift == 0. If the number is denormalize, renorm_shift > 0. Note + * that if we shift denormalized nonsign by renorm_shift, the unit bit of + * mantissa will shift into exponent, turning the biased exponent into 1, and + * making mantissa normalized (i.e. without leading 1). + */ +#ifdef _MSC_VER + unsigned long nonsign_bsr; + _BitScanReverse(&nonsign_bsr, (unsigned long)nonsign); + uint32_t renorm_shift = (uint32_t)nonsign_bsr ^ 31; +#else + uint32_t renorm_shift = __builtin_clz(nonsign); +#endif + renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0; + /* + * Iff half-precision number has exponent of 15, the addition overflows + * it into bit 31, and the subsequent shift turns the high 9 bits + * into 1. 
Thus inf_nan_mask == 0x7F800000 if the half-precision number + * had exponent of 15 (i.e. was NaN or infinity) 0x00000000 otherwise + */ + const int32_t inf_nan_mask = + ((int32_t)(nonsign + 0x04000000) >> 8) & INT32_C(0x7F800000); + /* + * Iff nonsign is 0, it overflows into 0xFFFFFFFF, turning bit 31 + * into 1. Otherwise, bit 31 remains 0. The signed shift right by 31 + * broadcasts bit 31 into all bits of the zero_mask. Thus zero_mask == + * 0xFFFFFFFF if the half-precision number was zero (+0.0h or -0.0h) + * 0x00000000 otherwise + */ + const int32_t zero_mask = (int32_t)(nonsign - 1) >> 31; + /* + * 1. Shift nonsign left by renorm_shift to normalize it (if the input + * was denormal) + * 2. Shift nonsign right by 3 so the exponent (5 bits originally) + * becomes an 8-bit field and 10-bit mantissa shifts into the 10 high + * bits of the 23-bit mantissa of IEEE single-precision number. + * 3. Add 0x70 to the exponent (starting at bit 23) to compensate the + * different in exponent bias (0x7F for single-precision number less 0xF + * for half-precision number). + * 4. Subtract renorm_shift from the exponent (starting at bit 23) to + * account for renormalization. As renorm_shift is less than 0x70, this + * can be combined with step 3. + * 5. Binary OR with inf_nan_mask to turn the exponent into 0xFF if the + * input was NaN or infinity. + * 6. Binary ANDNOT with zero_mask to turn the mantissa and exponent + * into zero if the input was zero. + * 7. Combine with the sign of the input number. + */ + return sign | + ((((nonsign << renorm_shift >> 3) + ((0x70 - renorm_shift) << 23)) | + inf_nan_mask) & + ~zero_mask); +} + +/* + * Convert a 16-bit floating-point number in IEEE half-precision format, in bit + * representation, to a 32-bit floating-point number in IEEE single-precision + * format. + * + * @note The implementation relies on IEEE-like (no assumption about rounding + * mode and no operations on denormals) floating-point operations and bitcasts + * between integer and floating-point variables. + */ +C10_HOST_DEVICE inline float fp16_ieee_to_fp32_value(uint16_t h) { + /* + * Extend the half-precision floating-point number to 32 bits and shift to the + * upper part of the 32-bit word: + * +---+-----+------------+-------------------+ + * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| + * +---+-----+------------+-------------------+ + * Bits 31 26-30 16-25 0-15 + * + * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 + * - zero bits. 
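+   *
+   * Worked example (illustrative): h = 0x3C00 (fp16 1.0) gives w = 0x3C000000
+   * and two_w = 0x78000000; since two_w is above the denormalized cutoff, the
+   * normalized path below computes fp32_from_bits((two_w >> 4) + exp_offset) *
+   * exp_scale = 2^112 * 2^-112 = 1.0f, and the sign bit (0 here) is OR'ed back
+   * into the result.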
+ */ + const uint32_t w = (uint32_t)h << 16; + /* + * Extract the sign of the input number into the high bit of the 32-bit word: + * + * +---+----------------------------------+ + * | S |0000000 00000000 00000000 00000000| + * +---+----------------------------------+ + * Bits 31 0-31 + */ + const uint32_t sign = w & UINT32_C(0x80000000); + /* + * Extract mantissa and biased exponent of the input number into the high bits + * of the 32-bit word: + * + * +-----+------------+---------------------+ + * |EEEEE|MM MMMM MMMM|0 0000 0000 0000 0000| + * +-----+------------+---------------------+ + * Bits 27-31 17-26 0-16 + */ + const uint32_t two_w = w + w; + + /* + * Shift mantissa and exponent into bits 23-28 and bits 13-22 so they become + * mantissa and exponent of a single-precision floating-point number: + * + * S|Exponent | Mantissa + * +-+---+-----+------------+----------------+ + * |0|000|EEEEE|MM MMMM MMMM|0 0000 0000 0000| + * +-+---+-----+------------+----------------+ + * Bits | 23-31 | 0-22 + * + * Next, there are some adjustments to the exponent: + * - The exponent needs to be corrected by the difference in exponent bias + * between single-precision and half-precision formats (0x7F - 0xF = 0x70) + * - Inf and NaN values in the inputs should become Inf and NaN values after + * conversion to the single-precision number. Therefore, if the biased + * exponent of the half-precision input was 0x1F (max possible value), the + * biased exponent of the single-precision output must be 0xFF (max possible + * value). We do this correction in two steps: + * - First, we adjust the exponent by (0xFF - 0x1F) = 0xE0 (see exp_offset + * below) rather than by 0x70 suggested by the difference in the exponent bias + * (see above). + * - Then we multiply the single-precision result of exponent adjustment by + * 2**(-112) to reverse the effect of exponent adjustment by 0xE0 less the + * necessary exponent adjustment by 0x70 due to difference in exponent bias. + * The floating-point multiplication hardware would ensure than Inf and + * NaN would retain their value on at least partially IEEE754-compliant + * implementations. + * + * Note that the above operations do not handle denormal inputs (where biased + * exponent == 0). However, they also do not operate on denormal inputs, and + * do not produce denormal results. + */ + constexpr uint32_t exp_offset = UINT32_C(0xE0) << 23; + // const float exp_scale = 0x1.0p-112f; + constexpr uint32_t scale_bits = (uint32_t)15 << 23; + float exp_scale_val = 0; + std::memcpy(&exp_scale_val, &scale_bits, sizeof(exp_scale_val)); + const float exp_scale = exp_scale_val; + const float normalized_value = + fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; + + /* + * Convert denormalized half-precision inputs into single-precision results + * (always normalized). Zero inputs are also handled here. + * + * In a denormalized number the biased exponent is zero, and mantissa has + * on-zero bits. First, we shift mantissa into bits 0-9 of the 32-bit word. + * + * zeros | mantissa + * +---------------------------+------------+ + * |0000 0000 0000 0000 0000 00|MM MMMM MMMM| + * +---------------------------+------------+ + * Bits 10-31 0-9 + * + * Now, remember that denormalized half-precision numbers are represented as: + * FP16 = mantissa * 2**(-24). + * The trick is to construct a normalized single-precision number with the + * same mantissa and thehalf-precision input and with an exponent which would + * scale the corresponding mantissa bits to 2**(-24). 
A normalized + * single-precision floating-point number is represented as: FP32 = (1 + + * mantissa * 2**(-23)) * 2**(exponent - 127) Therefore, when the biased + * exponent is 126, a unit change in the mantissa of the input denormalized + * half-precision number causes a change of the constructed single-precision + * number by 2**(-24), i.e. the same amount. + * + * The last step is to adjust the bias of the constructed single-precision + * number. When the input half-precision number is zero, the constructed + * single-precision number has the value of FP32 = 1 * 2**(126 - 127) = + * 2**(-1) = 0.5 Therefore, we need to subtract 0.5 from the constructed + * single-precision number to get the numerical equivalent of the input + * half-precision number. + */ + constexpr uint32_t magic_mask = UINT32_C(126) << 23; + constexpr float magic_bias = 0.5f; + const float denormalized_value = + fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; + + /* + * - Choose either results of conversion of input as a normalized number, or + * as a denormalized number, depending on the input exponent. The variable + * two_w contains input exponent in bits 27-31, therefore if its smaller than + * 2**27, the input is either a denormal number, or zero. + * - Combine the result of conversion of exponent and mantissa with the sign + * of the input number. + */ + constexpr uint32_t denormalized_cutoff = UINT32_C(1) << 27; + const uint32_t result = sign | + (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) + : fp32_to_bits(normalized_value)); + return fp32_from_bits(result); +} + +/* + * Convert a 32-bit floating-point number in IEEE single-precision format to a + * 16-bit floating-point number in IEEE half-precision format, in bit + * representation. + * + * @note The implementation relies on IEEE-like (no assumption about rounding + * mode and no operations on denormals) floating-point operations and bitcasts + * between integer and floating-point variables. + */ +inline uint16_t fp16_ieee_from_fp32_value(float f) { + // const float scale_to_inf = 0x1.0p+112f; + // const float scale_to_zero = 0x1.0p-110f; + constexpr uint32_t scale_to_inf_bits = (uint32_t)239 << 23; + constexpr uint32_t scale_to_zero_bits = (uint32_t)17 << 23; + float scale_to_inf_val = 0, scale_to_zero_val = 0; + std::memcpy(&scale_to_inf_val, &scale_to_inf_bits, sizeof(scale_to_inf_val)); + std::memcpy( + &scale_to_zero_val, &scale_to_zero_bits, sizeof(scale_to_zero_val)); + const float scale_to_inf = scale_to_inf_val; + const float scale_to_zero = scale_to_zero_val; + +#if defined(_MSC_VER) && _MSC_VER == 1916 + float base = ((signbit(f) != 0 ? -f : f) * scale_to_inf) * scale_to_zero; +#else + float base = (fabsf(f) * scale_to_inf) * scale_to_zero; +#endif + + const uint32_t w = fp32_to_bits(f); + const uint32_t shl1_w = w + w; + const uint32_t sign = w & UINT32_C(0x80000000); + uint32_t bias = shl1_w & UINT32_C(0xFF000000); + if (bias < UINT32_C(0x71000000)) { + bias = UINT32_C(0x71000000); + } + + base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; + const uint32_t bits = fp32_to_bits(base); + const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); + const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); + const uint32_t nonsign = exp_bits + mantissa_bits; + return static_cast( + (sign >> 16) | + (shl1_w > UINT32_C(0xFF000000) ? 
UINT16_C(0x7E00) : nonsign)); +} + +#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) +constexpr inline float16_t fp16_from_bits(uint16_t h) { + union { + uint16_t as_bits; + float16_t as_value; + } fp16 = {h}; + return fp16.as_value; +} + +constexpr inline uint16_t fp16_to_bits(float16_t f) { + union { + float16_t as_value; + uint16_t as_bits; + } fp16 = {.as_value = f}; + return fp16.as_bits; +} + +// According to https://godbolt.org/z/8s14GvEjo it would translate to single +// fcvt s0, h0 +inline float native_fp16_to_fp32_value(uint16_t h) { + return static_cast(fp16_from_bits(h)); +} + +inline uint16_t native_fp16_from_fp32_value(float f) { + return fp16_to_bits(static_cast(f)); +} +#endif + +} // namespace detail + +struct alignas(2) Half { + unsigned short x; + + struct from_bits_t {}; + C10_HOST_DEVICE static constexpr from_bits_t from_bits() { + return from_bits_t(); + } + + // HIP wants __host__ __device__ tag, CUDA does not +#if defined(USE_ROCM) + C10_HOST_DEVICE Half() = default; +#else + Half() = default; +#endif + + constexpr C10_HOST_DEVICE Half(unsigned short bits, from_bits_t) : x(bits) {} +#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) + inline Half(float16_t value); + inline operator float16_t() const; +#else + inline C10_HOST_DEVICE Half(float value); + inline C10_HOST_DEVICE operator float() const; +#endif + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_HOST_DEVICE Half(const __half& value); + inline C10_HOST_DEVICE operator __half() const; +#endif +#ifdef SYCL_LANGUAGE_VERSION + inline C10_HOST_DEVICE Half(const sycl::half& value); + inline C10_HOST_DEVICE operator sycl::half() const; +#endif +}; + +// TODO : move to complex.h +template <> +struct alignas(4) complex { + Half real_; + Half imag_; + + // Constructors + complex() = default; + // Half constructor is not constexpr so the following constructor can't + // be constexpr + C10_HOST_DEVICE explicit inline complex(const Half& real, const Half& imag) + : real_(real), imag_(imag) {} + C10_HOST_DEVICE inline complex(const c10::complex& value) + : real_(value.real()), imag_(value.imag()) {} + + // Conversion operator + inline C10_HOST_DEVICE operator c10::complex() const { + return {real_, imag_}; + } + + constexpr C10_HOST_DEVICE Half real() const { + return real_; + } + constexpr C10_HOST_DEVICE Half imag() const { + return imag_; + } + + C10_HOST_DEVICE complex& operator+=(const complex& other) { + real_ = static_cast(real_) + static_cast(other.real_); + imag_ = static_cast(imag_) + static_cast(other.imag_); + return *this; + } + + C10_HOST_DEVICE complex& operator-=(const complex& other) { + real_ = static_cast(real_) - static_cast(other.real_); + imag_ = static_cast(imag_) - static_cast(other.imag_); + return *this; + } + + C10_HOST_DEVICE complex& operator*=(const complex& other) { + auto a = static_cast(real_); + auto b = static_cast(imag_); + auto c = static_cast(other.real()); + auto d = static_cast(other.imag()); + real_ = a * c - b * d; + imag_ = a * d + b * c; + return *this; + } +}; + +// In some versions of MSVC, there will be a compiler error when building. +// C4146: unary minus operator applied to unsigned type, result still unsigned +// C4804: unsafe use of type 'bool' in operation +// It can be addressed by disabling the following warning. 
+#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4146) +#pragma warning(disable : 4804) +#pragma warning(disable : 4018) +#endif + +// The overflow checks may involve float to int conversion which may +// trigger precision loss warning. Re-enable the warning once the code +// is fixed. See T58053069. +C10_CLANG_DIAGNOSTIC_PUSH() +#if C10_CLANG_HAS_WARNING("-Wimplicit-float-conversion") +C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-float-conversion") +#endif + +// bool can be converted to any type. +// Without specializing on bool, in pytorch_linux_trusty_py2_7_9_build: +// `error: comparison of constant '255' with boolean expression is always false` +// for `f > limit::max()` below +template +std::enable_if_t, bool> overflows( + From /*f*/, + bool strict_unsigned = false) { + return false; +} + +// skip isnan and isinf check for integral types +template +std::enable_if_t && !std::is_same_v, bool> +overflows(From f, bool strict_unsigned = false) { + using limit = std::numeric_limits::type>; + if constexpr (!limit::is_signed && std::numeric_limits::is_signed) { + // allow for negative numbers to wrap using two's complement arithmetic. + // For example, with uint8, this allows for `a - b` to be treated as + // `a + 255 * b`. + if (!strict_unsigned) { + return greater_than_max(f) || + (c10::is_negative(f) && + -static_cast(f) > static_cast(limit::max())); + } + } + return c10::less_than_lowest(f) || greater_than_max(f); +} + +template +std::enable_if_t, bool> overflows( + From f, + bool strict_unsigned = false) { + using limit = std::numeric_limits::type>; + if (limit::has_infinity && std::isinf(static_cast(f))) { + return false; + } + if (!limit::has_quiet_NaN && (f != f)) { + return true; + } + return f < limit::lowest() || f > limit::max(); +} + +C10_CLANG_DIAGNOSTIC_POP() + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +template +std::enable_if_t::value, bool> overflows( + From f, + bool strict_unsigned = false) { + // casts from complex to real are considered to overflow if the + // imaginary component is non-zero + if (!is_complex::value && f.imag() != 0) { + return true; + } + // Check for overflow componentwise + // (Technically, the imag overflow check is guaranteed to be false + // when !is_complex, but any optimizer worth its salt will be + // able to figure it out.) + return overflows< + typename scalar_value_type::type, + typename From::value_type>(f.real()) || + overflows< + typename scalar_value_type::type, + typename From::value_type>(f.imag()); +} + +C10_API std::ostream& operator<<(std::ostream& out, const Half& value); + +} // namespace c10 + +#include // IWYU pragma: keep diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/IdWrapper.h b/MLPY/Lib/site-packages/torch/include/c10/util/IdWrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..086f456fc27ab1866a5c4ad9f9e651eed91ada68 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/IdWrapper.h @@ -0,0 +1,77 @@ +#pragma once + +#include +#include +#include + +namespace c10 { + +/** + * This template simplifies generation of simple classes that wrap an id + * in a typesafe way. Namely, you can use it to create a very lightweight + * type that only offers equality comparators and hashing. 
Example: + * + * struct MyIdType final : IdWrapper { + * constexpr explicit MyIdType(uint32_t id): IdWrapper(id) {} + * }; + * + * Then in the global top level namespace: + * + * C10_DEFINE_HASH_FOR_IDWRAPPER(MyIdType); + * + * That's it - equality operators and hash functions are automatically defined + * for you, given the underlying type supports it. + */ +template +class IdWrapper { + public: + using underlying_type = UnderlyingType; + using concrete_type = ConcreteType; + + protected: + constexpr explicit IdWrapper(underlying_type id) noexcept( + noexcept(underlying_type(std::declval()))) + : id_(id) {} + + constexpr underlying_type underlyingId() const + noexcept(noexcept(underlying_type(std::declval()))) { + return id_; + } + + private: + friend size_t hash_value(const concrete_type& v) { + return std::hash()(v.id_); + } + + // TODO Making operator== noexcept if underlying type is noexcept equality + // comparable doesn't work with GCC 4.8. + // Fix this once we don't need GCC 4.8 anymore. + friend constexpr bool operator==( + const concrete_type& lhs, + const concrete_type& rhs) noexcept { + return lhs.id_ == rhs.id_; + } + + // TODO Making operator!= noexcept if operator== is noexcept doesn't work with + // GCC 4.8. + // Fix this once we don't need GCC 4.8 anymore. + friend constexpr bool operator!=( + const concrete_type& lhs, + const concrete_type& rhs) noexcept { + return !(lhs == rhs); + } + + underlying_type id_; +}; + +} // namespace c10 + +#define C10_DEFINE_HASH_FOR_IDWRAPPER(ClassName) \ + namespace std { \ + template <> \ + struct hash { \ + size_t operator()(ClassName x) const { \ + return hash_value(x); \ + } \ + }; \ + } diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/LeftRight.h b/MLPY/Lib/site-packages/torch/include/c10/util/LeftRight.h new file mode 100644 index 0000000000000000000000000000000000000000..a6c09ac98d964277dfe414d6d8c04cb2ac0e28d6 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/LeftRight.h @@ -0,0 +1,223 @@ +#include +#include +#include +#include +#include +#include + +namespace c10 { + +namespace detail { + +struct IncrementRAII final { + public: + explicit IncrementRAII(std::atomic* counter) : _counter(counter) { + _counter->fetch_add(1); + } + + ~IncrementRAII() { + _counter->fetch_sub(1); + } + + private: + std::atomic* _counter; + + C10_DISABLE_COPY_AND_ASSIGN(IncrementRAII); +}; + +} // namespace detail + +// LeftRight wait-free readers synchronization primitive +// https://hal.archives-ouvertes.fr/hal-01207881/document +// +// LeftRight is quite easy to use (it can make an arbitrary +// data structure permit wait-free reads), but it has some +// particular performance characteristics you should be aware +// of if you're deciding to use it: +// +// - Reads still incur an atomic write (this is how LeftRight +// keeps track of how long it needs to keep around the old +// data structure) +// +// - Writes get executed twice, to keep both the left and right +// versions up to date. So if your write is expensive or +// nondeterministic, this is also an inappropriate structure +// +// LeftRight is used fairly rarely in PyTorch's codebase. If you +// are still not sure if you need it or not, consult your local +// C++ expert. +// +template +class LeftRight final { + public: + template + explicit LeftRight(const Args&... args) + : _counters{{{0}, {0}}}, + _foregroundCounterIndex(0), + _foregroundDataIndex(0), + _data{{T{args...}, T{args...}}}, + _writeMutex() {} + + // Copying and moving would not be threadsafe. 
+ // Needs more thought and careful design to make that work. + LeftRight(const LeftRight&) = delete; + LeftRight(LeftRight&&) noexcept = delete; + LeftRight& operator=(const LeftRight&) = delete; + LeftRight& operator=(LeftRight&&) noexcept = delete; + + ~LeftRight() { + // wait until any potentially running writers are finished + { std::unique_lock lock(_writeMutex); } + + // wait until any potentially running readers are finished + while (_counters[0].load() != 0 || _counters[1].load() != 0) { + std::this_thread::yield(); + } + } + + template + auto read(F&& readFunc) const { + detail::IncrementRAII _increment_counter( + &_counters[_foregroundCounterIndex.load()]); + + return std::forward(readFunc)(_data[_foregroundDataIndex.load()]); + } + + // Throwing an exception in writeFunc is ok but causes the state to be either + // the old or the new state, depending on if the first or the second call to + // writeFunc threw. + template + auto write(F&& writeFunc) { + std::unique_lock lock(_writeMutex); + + return _write(std::forward(writeFunc)); + } + + private: + template + auto _write(const F& writeFunc) { + /* + * Assume, A is in background and B in foreground. In simplified terms, we + * want to do the following: + * 1. Write to A (old background) + * 2. Switch A/B + * 3. Write to B (new background) + * + * More detailed algorithm (explanations on why this is important are below + * in code): + * 1. Write to A + * 2. Switch A/B data pointers + * 3. Wait until A counter is zero + * 4. Switch A/B counters + * 5. Wait until B counter is zero + * 6. Write to B + */ + + auto localDataIndex = _foregroundDataIndex.load(); + + // 1. Write to A + _callWriteFuncOnBackgroundInstance(writeFunc, localDataIndex); + + // 2. Switch A/B data pointers + localDataIndex = localDataIndex ^ 1; + _foregroundDataIndex = localDataIndex; + + /* + * 3. Wait until A counter is zero + * + * In the previous write run, A was foreground and B was background. + * There was a time after switching _foregroundDataIndex (B to foreground) + * and before switching _foregroundCounterIndex, in which new readers could + * have read B but incremented A's counter. + * + * In this current run, we just switched _foregroundDataIndex (A back to + * foreground), but before writing to the new background B, we have to make + * sure A's counter was zero briefly, so all these old readers are gone. + */ + auto localCounterIndex = _foregroundCounterIndex.load(); + _waitForBackgroundCounterToBeZero(localCounterIndex); + + /* + * 4. Switch A/B counters + * + * Now that we know all readers on B are really gone, we can switch the + * counters and have new readers increment A's counter again, which is the + * correct counter since they're reading A. + */ + localCounterIndex = localCounterIndex ^ 1; + _foregroundCounterIndex = localCounterIndex; + + /* + * 5. Wait until B counter is zero + * + * This waits for all the readers on B that came in while both data and + * counter for B was in foreground, i.e. normal readers that happened + * outside of that brief gap between switching data and counter. + */ + _waitForBackgroundCounterToBeZero(localCounterIndex); + + // 6. Write to B + return _callWriteFuncOnBackgroundInstance(writeFunc, localDataIndex); + } + + template + auto _callWriteFuncOnBackgroundInstance( + const F& writeFunc, + uint8_t localDataIndex) { + try { + return writeFunc(_data[localDataIndex ^ 1]); + } catch (...) 
{ + // recover invariant by copying from the foreground instance + _data[localDataIndex ^ 1] = _data[localDataIndex]; + // rethrow + throw; + } + } + + void _waitForBackgroundCounterToBeZero(uint8_t counterIndex) { + while (_counters[counterIndex ^ 1].load() != 0) { + std::this_thread::yield(); + } + } + + mutable std::array, 2> _counters; + std::atomic _foregroundCounterIndex; + std::atomic _foregroundDataIndex; + std::array _data; + std::mutex _writeMutex; +}; + +// RWSafeLeftRightWrapper is API compatible with LeftRight and uses a +// read-write lock to protect T (data). +template +class RWSafeLeftRightWrapper final { + public: + template + explicit RWSafeLeftRightWrapper(const Args&... args) : data_{args...} {} + + // RWSafeLeftRightWrapper is not copyable or moveable since LeftRight + // is not copyable or moveable. + RWSafeLeftRightWrapper(const RWSafeLeftRightWrapper&) = delete; + RWSafeLeftRightWrapper(RWSafeLeftRightWrapper&&) noexcept = delete; + RWSafeLeftRightWrapper& operator=(const RWSafeLeftRightWrapper&) = delete; + RWSafeLeftRightWrapper& operator=(RWSafeLeftRightWrapper&&) noexcept = delete; + + template + // NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward) + auto read(F&& readFunc) const { + return data_.withLock( + [&readFunc](T const& data) { return std::forward(readFunc)(data); }); + } + + template + // NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward) + auto write(F&& writeFunc) { + return data_.withLock( + [&writeFunc](T& data) { return std::forward(writeFunc)(data); }); + } + + private: + c10::Synchronized data_; +}; + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/Load.h b/MLPY/Lib/site-packages/torch/include/c10/util/Load.h new file mode 100644 index 0000000000000000000000000000000000000000..3aec348dbee66b80e2f89f3cf980dde08af2f773 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/Load.h @@ -0,0 +1,38 @@ +#pragma once +#include +#include + +namespace c10 { +namespace detail { + +template +struct LoadImpl { + C10_HOST_DEVICE static T apply(const void* src) { + return *reinterpret_cast(src); + } +}; + +template <> +struct LoadImpl { + C10_HOST_DEVICE static bool apply(const void* src) { + static_assert(sizeof(bool) == sizeof(char)); + // NOTE: [Loading boolean values] + // Protect against invalid boolean values by loading as a byte + // first, then converting to bool (see gh-54789). + return *reinterpret_cast(src); + } +}; + +} // namespace detail + +template +C10_HOST_DEVICE T load(const void* src) { + return c10::detail::LoadImpl::apply(src); +} + +template +C10_HOST_DEVICE scalar_t load(const scalar_t* src) { + return c10::detail::LoadImpl::apply(src); +} + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/Logging.h b/MLPY/Lib/site-packages/torch/include/c10/util/Logging.h new file mode 100644 index 0000000000000000000000000000000000000000..4c83b26cb451d53ae004876f2244030ce673fed2 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/Logging.h @@ -0,0 +1,340 @@ +#ifndef C10_UTIL_LOGGING_H_ +#define C10_UTIL_LOGGING_H_ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +// CAFFE2_LOG_THRESHOLD is a compile time flag that would allow us to turn off +// logging at compile time so no logging message below that level is produced +// at all. The value should be between INT_MIN and CAFFE_FATAL. +#ifndef CAFFE2_LOG_THRESHOLD +// If we have not defined the compile time log threshold, we keep all the +// log cases. 
+#define CAFFE2_LOG_THRESHOLD INT_MIN +#endif // CAFFE2_LOG_THRESHOLD + +// Below are different implementations for glog and non-glog cases. +#ifdef C10_USE_GLOG +#include +#else // !C10_USE_GLOG +#include +#endif // C10_USE_GLOG + +C10_DECLARE_int(caffe2_log_level); +C10_DECLARE_bool(caffe2_use_fatal_for_enforce); + +// Some versions of GLOG support less-spammy version of LOG_EVERY_MS. If it's +// not available - just short-circuit to the always working one one. +// We define the C10_ name to avoid confusing other files +#ifdef LOG_EVERY_MS +#define C10_LOG_EVERY_MS(severity, ms) LOG_EVERY_MS(severity, ms) +#else +#define C10_LOG_EVERY_MS(severity, ms) LOG(severity) +#endif + +// Same for LOG_FIRST_N +#ifdef LOG_FIRST_N +#define C10_LOG_FIRST_N(severity, n) LOG_FIRST_N(severity, n) +#else +#define C10_LOG_FIRST_N(severity, n) LOG(severity) +#endif + +// Same for LOG_EVERY_N +#ifdef LOG_EVERY_N +#define C10_LOG_EVERY_N(severity, n) LOG_EVERY_N(severity, n) +#else +#define C10_LOG_EVERY_N(severity, n) LOG(severity) +#endif + +namespace c10 { + +using std::string; + +// Functions that we use for initialization. +C10_API bool InitCaffeLogging(int* argc, char** argv); +C10_API void UpdateLoggingLevelsFromFlags(); + +[[noreturn]] C10_API void ThrowEnforceNotMet( + const char* file, + const int line, + const char* condition, + const std::string& msg, + const void* caller = nullptr); + +[[noreturn]] C10_API void ThrowEnforceNotMet( + const char* file, + const int line, + const char* condition, + const char* msg, + const void* caller = nullptr); + +[[noreturn]] C10_API inline void ThrowEnforceNotMet( + const char* file, + const int line, + const char* condition, + detail::CompileTimeEmptyString /*msg*/, + const void* caller = nullptr) { + ThrowEnforceNotMet(file, line, condition, "", caller); +} + +[[noreturn]] C10_API void ThrowEnforceFiniteNotMet( + const char* file, + const int line, + const char* condition, + const std::string& msg, + const void* caller = nullptr); + +[[noreturn]] C10_API void ThrowEnforceFiniteNotMet( + const char* file, + const int line, + const char* condition, + const char* msg, + const void* caller = nullptr); + +[[noreturn]] C10_API inline void ThrowEnforceFiniteNotMet( + const char* file, + const int line, + const char* condition, + detail::CompileTimeEmptyString /*msg*/, + const void* caller = nullptr) { + ThrowEnforceFiniteNotMet(file, line, condition, "", caller); +} + +constexpr bool IsUsingGoogleLogging() { +#ifdef C10_USE_GLOG + return true; +#else + return false; +#endif +} + +/** + * A utility to allow one to show log info to stderr after the program starts. + * + * This is similar to calling GLOG's --logtostderr, or setting caffe2_log_level + * to smaller than INFO. You are recommended to only use this in a few sparse + * cases, such as when you want to write a tutorial or something. Normally, use + * the commandline flags to set the log level. + */ +C10_API void ShowLogInfoToStderr(); + +C10_API void SetStackTraceFetcher(std::function fetcher); + +using EnforceNotMet = ::c10::Error; + +#define CAFFE_ENFORCE(condition, ...) \ + do { \ + if (C10_UNLIKELY(!(condition))) { \ + ::c10::ThrowEnforceNotMet( \ + __FILE__, __LINE__, #condition, ::c10::str(__VA_ARGS__)); \ + } \ + } while (false) + +#define CAFFE_ENFORCE_FINITE(condition, ...) \ + do { \ + if (C10_UNLIKELY(!(condition))) { \ + ::c10::ThrowEnforceFiniteNotMet( \ + __FILE__, __LINE__, #condition, ::c10::str(__VA_ARGS__)); \ + } \ + } while (false) + +#define CAFFE_ENFORCE_WITH_CALLER(condition, ...) 
\ + do { \ + if (C10_UNLIKELY(!(condition))) { \ + ::c10::ThrowEnforceNotMet( \ + __FILE__, __LINE__, #condition, ::c10::str(__VA_ARGS__), this); \ + } \ + } while (false) + +#define CAFFE_THROW(...) \ + ::c10::ThrowEnforceNotMet(__FILE__, __LINE__, "", ::c10::str(__VA_ARGS__)) + +/** + * Rich logging messages + * + * CAFFE_ENFORCE_THAT can be used with one of the "checker functions" that + * capture input argument values and add it to the exception message. E.g. + * `CAFFE_ENFORCE_THAT(Equals(foo(x), bar(y)), "Optional additional message")` + * would evaluate both foo and bar only once and if the results are not equal - + * include them in the exception message. + * + * Some of the basic checker functions like Equals or Greater are already + * defined below. Other header might define customized checkers by adding + * functions to caffe2::enforce_detail namespace. For example: + * + * namespace caffe2 { namespace enforce_detail { + * inline EnforceFailMessage IsVector(const vector& shape) { + * if (shape.size() == 1) { return EnforceOK(); } + * return c10::str("Shape ", shape, " is not a vector"); + * } + * }} + * + * With further usages like `CAFFE_ENFORCE_THAT(IsVector(Input(0).dims()))` + * + * Convenient wrappers for binary operations like CAFFE_ENFORCE_EQ are provided + * too. Please use them instead of TORCH_CHECK_EQ and friends for failures in + * user-provided input. + */ + +namespace enforce_detail { + +template +std::string enforceFailMsgImpl(const T1& x, const T2& y) { + return c10::str(x, " vs ", y); +} + +template +std::string enforceFailMsgImpl(const T1& x, const T2& y, const Args&... args) { + return c10::str(x, " vs ", y, ". ", args...); +} + +template +void enforceThatImpl( + Pred p, + const T1& lhs, + const T2& rhs, + const char* file, + int line, + const char* expr, + const void* caller, + GetFailMsgFunc getFailMsg) { + if (C10_UNLIKELY(!(p(lhs, rhs)))) { + ::c10::ThrowEnforceNotMet(file, line, expr, getFailMsg(lhs, rhs), caller); + } +} + +#define CAFFE_ENFORCE_THAT_IMPL(op, lhs, rhs, expr, ...) \ + ::c10::enforce_detail::enforceThatImpl( \ + op, \ + (lhs), \ + (rhs), \ + __FILE__, \ + __LINE__, \ + expr, \ + nullptr, \ + [&](const auto& arg1, const auto& arg2) { \ + return ::c10::enforce_detail::enforceFailMsgImpl( \ + arg1, arg2, ##__VA_ARGS__); \ + }) + +#define CAFFE_ENFORCE_THAT_IMPL_WITH_CALLER(op, lhs, rhs, expr, ...) \ + ::c10::enforce_detail::enforceThatImpl( \ + op, \ + (lhs), \ + (rhs), \ + __FILE__, \ + __LINE__, \ + expr, \ + this, \ + [&](const auto& arg1, const auto& arg2) { \ + return ::c10::enforce_detail::enforceFailMsgImpl( \ + arg1, arg2, ##__VA_ARGS__); \ + }) + +} // namespace enforce_detail + +#define CAFFE_ENFORCE_THAT(cmp, op, lhs, rhs, ...) \ + CAFFE_ENFORCE_THAT_IMPL(cmp, lhs, rhs, #lhs " " #op " " #rhs, ##__VA_ARGS__) + +#define CAFFE_ENFORCE_BINARY_OP(cmp, op, x, y, ...) \ + CAFFE_ENFORCE_THAT_IMPL(cmp, x, y, #x " " #op " " #y, ##__VA_ARGS__) +#define CAFFE_ENFORCE_EQ(x, y, ...) \ + CAFFE_ENFORCE_BINARY_OP(std::equal_to(), ==, x, y, ##__VA_ARGS__) +#define CAFFE_ENFORCE_NE(x, y, ...) \ + CAFFE_ENFORCE_BINARY_OP(std::not_equal_to(), !=, x, y, ##__VA_ARGS__) +#define CAFFE_ENFORCE_LE(x, y, ...) \ + CAFFE_ENFORCE_BINARY_OP(std::less_equal(), <=, x, y, ##__VA_ARGS__) +#define CAFFE_ENFORCE_LT(x, y, ...) \ + CAFFE_ENFORCE_BINARY_OP(std::less(), <, x, y, ##__VA_ARGS__) +#define CAFFE_ENFORCE_GE(x, y, ...) \ + CAFFE_ENFORCE_BINARY_OP(std::greater_equal(), >=, x, y, ##__VA_ARGS__) +#define CAFFE_ENFORCE_GT(x, y, ...) 
\ + CAFFE_ENFORCE_BINARY_OP(std::greater(), >, x, y, ##__VA_ARGS__) + +#define CAFFE_ENFORCE_BINARY_OP_WITH_CALLER(cmp, op, x, y, ...) \ + CAFFE_ENFORCE_THAT_IMPL_WITH_CALLER( \ + cmp, x, y, #x " " #op " " #y, ##__VA_ARGS__) +#define CAFFE_ENFORCE_EQ_WITH_CALLER(x, y, ...) \ + CAFFE_ENFORCE_BINARY_OP_WITH_CALLER( \ + std::equal_to(), ==, x, y, ##__VA_ARGS__) +#define CAFFE_ENFORCE_NE_WITH_CALLER(x, y, ...) \ + CAFFE_ENFORCE_BINARY_OP_WITH_CALLER( \ + std::not_equal_to(), !=, x, y, ##__VA_ARGS__) +#define CAFFE_ENFORCE_LE_WITH_CALLER(x, y, ...) \ + CAFFE_ENFORCE_BINARY_OP_WITH_CALLER( \ + std::less_equal(), <=, x, y, ##__VA_ARGS__) +#define CAFFE_ENFORCE_LT_WITH_CALLER(x, y, ...) \ + CAFFE_ENFORCE_BINARY_OP_WITH_CALLER(std::less(), <, x, y, ##__VA_ARGS__) +#define CAFFE_ENFORCE_GE_WITH_CALLER(x, y, ...) \ + CAFFE_ENFORCE_BINARY_OP_WITH_CALLER( \ + std::greater_equal(), >=, x, y, ##__VA_ARGS__) +#define CAFFE_ENFORCE_GT_WITH_CALLER(x, y, ...) \ + CAFFE_ENFORCE_BINARY_OP_WITH_CALLER( \ + std::greater(), >, x, y, ##__VA_ARGS__) + +/** + * Very lightweight logging for the first time API usage. It's beneficial for + * tracking of individual functionality usage in larger applications. + * + * In order to ensure light-weightedness of logging, we utilize static variable + * trick - LogAPIUsage will be invoked only once and further invocations will + * just do an atomic check. + * + * Example: + * // Logs caller info with an arbitrary text event, if there is a usage. + * C10_LOG_API_USAGE_ONCE("my_api"); + */ +#define C10_LOG_API_USAGE_ONCE(...) \ + C10_UNUSED static bool C10_ANONYMOUS_VARIABLE(logFlag) = \ + ::c10::detail::LogAPIUsageFakeReturn(__VA_ARGS__); + +// API usage logging capabilities +C10_API void SetAPIUsageLogger(std::function logger); +C10_API void LogAPIUsage(const std::string& context); + +C10_API void SetAPIUsageMetadataLogger( + std::function& metadata_map)> logger); +C10_API void LogAPIUsageMetadata( + const std::string& context, + const std::map& metadata_map); + +// PyTorch ddp usage logging capabilities +// DDPLoggingData holds data that can be logged in applications +// for analysis and debugging. Data structure is defined in +// c10 directory so that it can be easily imported by both c10 +// and torch files. +struct DDPLoggingData { + // logging fields that are string types. + std::map strs_map; + // logging fields that are int64_t types. + std::map ints_map; +}; + +C10_API void SetPyTorchDDPUsageLogger( + std::function logger); +C10_API void LogPyTorchDDPUsage(const DDPLoggingData& ddpData); + +namespace detail { +// Return value is needed to do the static variable initialization trick +C10_API bool LogAPIUsageFakeReturn(const std::string& context); +} // namespace detail + +// Initializes the c10 logger. 
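+// Illustrative usage sketch (not part of the upstream header) for the enforce
+// macros defined above; `check_batch`, `batch_size` and `expected` are
+// hypothetical names used only for illustration:
+//
+//   #include <c10/util/Logging.h>
+//
+//   void check_batch(int64_t batch_size, int64_t expected) {
+//     CAFFE_ENFORCE_GT(batch_size, 0, "batch_size must be positive");
+//     CAFFE_ENFORCE_EQ(batch_size, expected, "unexpected batch size");
+//   }
+//
+//   // Failures throw c10::Error (aka EnforceNotMet) and can be caught:
+//   // try { check_batch(0, 8); } catch (const c10::Error& e) { /* ... */ }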
+C10_API void initLogging(); + +// Sets the rank, which will be included in log messages +C10_API void SetGlobalRank(int64_t rank); + +} // namespace c10 + +#endif // C10_UTIL_LOGGING_H_ diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/MathConstants.h b/MLPY/Lib/site-packages/torch/include/c10/util/MathConstants.h new file mode 100644 index 0000000000000000000000000000000000000000..ecdf9f34a945b6acb73614a35c637e80da4a6fba --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/MathConstants.h @@ -0,0 +1,142 @@ +#pragma once + +#include +#include +#include + +C10_CLANG_DIAGNOSTIC_PUSH() +#if C10_CLANG_HAS_WARNING("-Wimplicit-float-conversion") +C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-float-conversion") +#endif + +namespace c10 { +// TODO: Replace me with inline constexpr variable when C++17 becomes available +namespace detail { +template +C10_HOST_DEVICE inline constexpr T e() { + return static_cast(2.718281828459045235360287471352662); +} + +template +C10_HOST_DEVICE inline constexpr T euler() { + return static_cast(0.577215664901532860606512090082402); +} + +template +C10_HOST_DEVICE inline constexpr T frac_1_pi() { + return static_cast(0.318309886183790671537767526745028); +} + +template +C10_HOST_DEVICE inline constexpr T frac_1_sqrt_pi() { + return static_cast(0.564189583547756286948079451560772); +} + +template +C10_HOST_DEVICE inline constexpr T frac_sqrt_2() { + return static_cast(0.707106781186547524400844362104849); +} + +template +C10_HOST_DEVICE inline constexpr T frac_sqrt_3() { + return static_cast(0.577350269189625764509148780501957); +} + +template +C10_HOST_DEVICE inline constexpr T golden_ratio() { + return static_cast(1.618033988749894848204586834365638); +} + +template +C10_HOST_DEVICE inline constexpr T ln_10() { + return static_cast(2.302585092994045684017991454684364); +} + +template +C10_HOST_DEVICE inline constexpr T ln_2() { + return static_cast(0.693147180559945309417232121458176); +} + +template +C10_HOST_DEVICE inline constexpr T log_10_e() { + return static_cast(0.434294481903251827651128918916605); +} + +template +C10_HOST_DEVICE inline constexpr T log_2_e() { + return static_cast(1.442695040888963407359924681001892); +} + +template +C10_HOST_DEVICE inline constexpr T pi() { + return static_cast(3.141592653589793238462643383279502); +} + +template +C10_HOST_DEVICE inline constexpr T sqrt_2() { + return static_cast(1.414213562373095048801688724209698); +} + +template +C10_HOST_DEVICE inline constexpr T sqrt_3() { + return static_cast(1.732050807568877293527446341505872); +} + +template <> +C10_HOST_DEVICE inline constexpr BFloat16 pi() { + // According to + // https://en.wikipedia.org/wiki/Bfloat16_floating-point_format#Special_values + // pi is encoded as 4049 + return BFloat16(0x4049, BFloat16::from_bits()); +} + +template <> +C10_HOST_DEVICE inline constexpr Half pi() { + return Half(0x4248, Half::from_bits()); +} +} // namespace detail + +template +constexpr T e = c10::detail::e(); + +template +constexpr T euler = c10::detail::euler(); + +template +constexpr T frac_1_pi = c10::detail::frac_1_pi(); + +template +constexpr T frac_1_sqrt_pi = c10::detail::frac_1_sqrt_pi(); + +template +constexpr T frac_sqrt_2 = c10::detail::frac_sqrt_2(); + +template +constexpr T frac_sqrt_3 = c10::detail::frac_sqrt_3(); + +template +constexpr T golden_ratio = c10::detail::golden_ratio(); + +template +constexpr T ln_10 = c10::detail::ln_10(); + +template +constexpr T ln_2 = c10::detail::ln_2(); + +template +constexpr T log_10_e = c10::detail::log_10_e(); + 
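+// Illustrative usage sketch (not part of the upstream header): the variable
+// templates in this file are read at the desired precision, e.g.
+//
+//   #include <c10/util/MathConstants.h>
+//
+//   float circumference(float r) {
+//     return 2.0f * c10::pi<float> * r;
+//   }
+//
+// `circumference` and `r` are hypothetical names used only for illustration.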
+template +constexpr T log_2_e = c10::detail::log_2_e(); + +template +constexpr T pi = c10::detail::pi(); + +template +constexpr T sqrt_2 = c10::detail::sqrt_2(); + +template +constexpr T sqrt_3 = c10::detail::sqrt_3(); +} // namespace c10 + +C10_CLANG_DIAGNOSTIC_POP() diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/MaybeOwned.h b/MLPY/Lib/site-packages/torch/include/c10/util/MaybeOwned.h new file mode 100644 index 0000000000000000000000000000000000000000..074ae1070adb2a5699561bab078c5973e013ac38 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/MaybeOwned.h @@ -0,0 +1,237 @@ +#pragma once + +#include +#include + +#include +#include +#include + +namespace c10 { + +/// MaybeOwnedTraits describes how to borrow from T. Here is how we +/// can implement borrowing from an arbitrary type T using a raw +/// pointer to const: +template +struct MaybeOwnedTraitsGenericImpl { + using owned_type = T; + using borrow_type = const T*; + + static borrow_type createBorrow(const owned_type& from) { + return &from; + } + + static void assignBorrow(borrow_type& lhs, borrow_type rhs) { + lhs = rhs; + } + + static void destroyBorrow(borrow_type& /*toDestroy*/) {} + + static const owned_type& referenceFromBorrow(const borrow_type& borrow) { + return *borrow; + } + + static const owned_type* pointerFromBorrow(const borrow_type& borrow) { + return borrow; + } + + static bool debugBorrowIsValid(const borrow_type& borrow) { + return borrow != nullptr; + } +}; + +/// It is possible to eliminate the extra layer of indirection for +/// borrows for some types that we control. For examples, see +/// intrusive_ptr.h and TensorBody.h. + +template +struct MaybeOwnedTraits; + +// Explicitly enable MaybeOwned>, rather than allowing +// MaybeOwned to be used for any type right away. +template +struct MaybeOwnedTraits> + : public MaybeOwnedTraitsGenericImpl> {}; + +/// A smart pointer around either a borrowed or owned T. When +/// constructed with borrowed(), the caller MUST ensure that the +/// borrowed-from argument outlives this MaybeOwned. Compare to +/// Rust's std::borrow::Cow +/// (https://doc.rust-lang.org/std/borrow/enum.Cow.html), but note +/// that it is probably not suitable for general use because C++ has +/// no borrow checking. Included here to support +/// Tensor::expect_contiguous. +template +class MaybeOwned final { + using borrow_type = typename MaybeOwnedTraits::borrow_type; + using owned_type = typename MaybeOwnedTraits::owned_type; + + bool isBorrowed_; + union { + borrow_type borrow_; + owned_type own_; + }; + + /// Don't use this; use borrowed() instead. + explicit MaybeOwned(const owned_type& t) + : isBorrowed_(true), borrow_(MaybeOwnedTraits::createBorrow(t)) {} + + /// Don't use this; use owned() instead. + explicit MaybeOwned(T&& t) noexcept(std::is_nothrow_move_constructible_v) + : isBorrowed_(false), own_(std::move(t)) {} + + /// Don't use this; use owned() instead. + template + explicit MaybeOwned(std::in_place_t, Args&&... args) + : isBorrowed_(false), own_(std::forward(args)...) {} + + public: + explicit MaybeOwned() : isBorrowed_(true), borrow_() {} + + // Copying a borrow yields another borrow of the original, as with a + // T*. Copying an owned T yields another owned T for safety: no + // chains of borrowing by default! (Note you could get that behavior + // with MaybeOwned::borrowed(*rhs) if you wanted it.) 
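+  // Illustrative usage sketch (not part of the upstream header); `s`,
+  // `borrowed`, `owned` and `n` are names used only for illustration:
+  //
+  //   std::string s = "hello";
+  //   auto borrowed = c10::MaybeOwned<std::string>::borrowed(s);  // `s` must outlive `borrowed`
+  //   auto owned = c10::MaybeOwned<std::string>::owned(std::string("world"));
+  //   size_t n = borrowed->size() + owned->size();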
+ MaybeOwned(const MaybeOwned& rhs) : isBorrowed_(rhs.isBorrowed_) { + if (C10_LIKELY(rhs.isBorrowed_)) { + MaybeOwnedTraits::assignBorrow(borrow_, rhs.borrow_); + } else { + new (&own_) T(rhs.own_); + } + } + + MaybeOwned& operator=(const MaybeOwned& rhs) { + if (this == &rhs) { + return *this; + } + if (C10_UNLIKELY(!isBorrowed_)) { + if (rhs.isBorrowed_) { + own_.~T(); + MaybeOwnedTraits::assignBorrow(borrow_, rhs.borrow_); + isBorrowed_ = true; + } else { + own_ = rhs.own_; + } + } else { + if (C10_LIKELY(rhs.isBorrowed_)) { + MaybeOwnedTraits::assignBorrow(borrow_, rhs.borrow_); + } else { + MaybeOwnedTraits::destroyBorrow(borrow_); + new (&own_) T(rhs.own_); + isBorrowed_ = false; + } + } + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(isBorrowed_ == rhs.isBorrowed_); + return *this; + } + + MaybeOwned(MaybeOwned&& rhs) noexcept( + // NOLINTNEXTLINE(*-noexcept-move-*) + std::is_nothrow_move_constructible_v && + std::is_nothrow_move_assignable_v) + : isBorrowed_(rhs.isBorrowed_) { + if (C10_LIKELY(rhs.isBorrowed_)) { + MaybeOwnedTraits::assignBorrow(borrow_, rhs.borrow_); + } else { + new (&own_) T(std::move(rhs.own_)); + } + } + + MaybeOwned& operator=(MaybeOwned&& rhs) noexcept( + std::is_nothrow_move_assignable_v && + std::is_nothrow_move_assignable_v && + std::is_nothrow_move_constructible_v && + // NOLINTNEXTLINE(*-noexcept-move-*) + std::is_nothrow_destructible_v && + std::is_nothrow_destructible_v) { + if (this == &rhs) { + return *this; + } + if (C10_UNLIKELY(!isBorrowed_)) { + if (rhs.isBorrowed_) { + own_.~T(); + MaybeOwnedTraits::assignBorrow(borrow_, rhs.borrow_); + isBorrowed_ = true; + } else { + own_ = std::move(rhs.own_); + } + } else { + if (C10_LIKELY(rhs.isBorrowed_)) { + MaybeOwnedTraits::assignBorrow(borrow_, rhs.borrow_); + } else { + MaybeOwnedTraits::destroyBorrow(borrow_); + new (&own_) T(std::move(rhs.own_)); + isBorrowed_ = false; + } + } + return *this; + } + + static MaybeOwned borrowed(const T& t) { + return MaybeOwned(t); + } + + static MaybeOwned owned(T&& t) noexcept( + std::is_nothrow_move_constructible_v) { + return MaybeOwned(std::move(t)); + } + + template + static MaybeOwned owned(std::in_place_t, Args&&... args) { + return MaybeOwned(std::in_place, std::forward(args)...); + } + + ~MaybeOwned() noexcept( + // NOLINTNEXTLINE(*-noexcept-destructor) + std::is_nothrow_destructible_v && + std::is_nothrow_destructible_v) { + if (C10_UNLIKELY(!isBorrowed_)) { + own_.~T(); + } else { + MaybeOwnedTraits::destroyBorrow(borrow_); + } + } + + // This is an implementation detail! You should know what you're doing + // if you are testing this. If you just want to guarantee ownership move + // this into a T + bool unsafeIsBorrowed() const { + return isBorrowed_; + } + + const T& operator*() const& { + if (isBorrowed_) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + MaybeOwnedTraits::debugBorrowIsValid(borrow_)); + } + return C10_LIKELY(isBorrowed_) + ? MaybeOwnedTraits::referenceFromBorrow(borrow_) + : own_; + } + + const T* operator->() const { + if (isBorrowed_) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + MaybeOwnedTraits::debugBorrowIsValid(borrow_)); + } + return C10_LIKELY(isBorrowed_) + ? MaybeOwnedTraits::pointerFromBorrow(borrow_) + : &own_; + } + + // If borrowed, copy the underlying T. If owned, move from + // it. borrowed/owned state remains the same, and either we + // reference the same borrow as before or we are an owned moved-from + // T. 
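+  // Illustrative sketch (not part of the upstream header): for an owned
+  // MaybeOwned<std::string> `mo`, `std::string taken = *std::move(mo);` moves
+  // the string out, while for a borrowed one it yields a copy and leaves the
+  // borrowed-from object untouched. `mo` and `taken` are illustrative names.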
+ T operator*() && { + if (isBorrowed_) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + MaybeOwnedTraits::debugBorrowIsValid(borrow_)); + return MaybeOwnedTraits::referenceFromBorrow(borrow_); + } else { + return std::move(own_); + } + } +}; + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/Metaprogramming.h b/MLPY/Lib/site-packages/torch/include/c10/util/Metaprogramming.h new file mode 100644 index 0000000000000000000000000000000000000000..0e47b356cd6cdcc3c7d4bc5811553fa371b2be5e --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/Metaprogramming.h @@ -0,0 +1,224 @@ +#pragma once + +#include +#include + +namespace c10::guts { + +/** + * Access information about result type or arguments from a function type. + * Example: + * using A = function_traits::return_type // A == int + * using A = function_traits::parameter_types::tuple_type + * // A == tuple + */ +template +struct function_traits { + static_assert( + !std::is_same_v, + "In function_traits, Func must be a plain function type."); +}; +template +struct function_traits { + using func_type = Result(Args...); + using return_type = Result; + using parameter_types = typelist::typelist; + static constexpr auto number_of_parameters = sizeof...(Args); +}; + +/** + * infer_function_traits: creates a `function_traits` type for a simple + * function (pointer) or functor (lambda/struct). Currently does not support + * class methods. + */ + +template +struct infer_function_traits { + using type = function_traits< + c10::guts::detail::strip_class_t>; +}; + +template +struct infer_function_traits { + using type = function_traits; +}; + +template +struct infer_function_traits { + using type = function_traits; +}; + +template +using infer_function_traits_t = typename infer_function_traits::type; + +/** + * make_function_traits: creates a `function_traits` type given a Return type + * and a typelist of Argument types + * + * Example: + * bool f(int, int); + * + * infer_function_traits_t == make_function_traits_t> + */ +template +struct make_function_traits { + static_assert( + false_t::value, + "In guts::make_function_traits, the ArgList argument must be typelist<...>."); +}; + +template +struct make_function_traits> { + using type = function_traits; +}; + +template +using make_function_traits_t = + typename make_function_traits::type; + +/** + * make_offset_index_sequence + * Like make_index_sequence, but starting from Start instead of 0. + * + * Example: + * make_offset_index_sequence<10, 3> == std::index_sequence<10, 11, 12> + */ +template +struct make_offset_index_sequence_impl + : make_offset_index_sequence_impl { + static_assert( + static_cast(Start) >= 0, + "make_offset_index_sequence: Start < 0"); + static_assert(static_cast(N) >= 0, "make_offset_index_sequence: N < 0"); +}; + +template +struct make_offset_index_sequence_impl { + typedef std::index_sequence type; +}; + +template +using make_offset_index_sequence = + typename make_offset_index_sequence_impl::type; + +/** + * Use tuple_elements to extract a position-indexed subset of elements + * from the argument tuple into a result tuple. + * + * Example: + * std::tuple t = std::make_tuple(0, "HEY", 2.0); + * std::tuple result = tuple_elements(t, std::index_sequence<0, + * 2>()); + */ +template +constexpr auto tuple_elements(Tuple t, std::index_sequence) { + return std::tuple...>(std::get(t)...); +} + +/** + * Use tuple_take to extract the first or last n elements from the argument + * tuple into a result tuple. 
+ * + * Example: + * std::tuple t = std::make_tuple(0, "HEY", 2.0); + * std::tuple first_two = tuple_take(t); + * std::tuple last_two = tuple_take(t); + */ +template +struct TupleTake {}; + +template +struct TupleTake= 0, void>> { + static auto call(Tuple t) { + constexpr size_t size = std::tuple_size(); + static_assert(N <= size, "tuple_take: N > size"); + return tuple_elements(t, std::make_index_sequence{}); + } +}; + +template + struct TupleTake < Tuple, + N, std::enable_if_t> { + static auto call(Tuple t) { + constexpr size_t size = std::tuple_size(); + static_assert(-N <= size, "tuple_take: -N > size"); + return tuple_elements(t, make_offset_index_sequence{}); + } +}; + +template +auto tuple_take(Tuple t) { + return TupleTake::call(t); +} + +/** + * Use tuple_slice to extract a contiguous subtuple from the argument. + * + * Example: + * std::tuple t = std::make_tuple(0, + * "HEY", 2.0, false); std::tuple middle_two = + * tuple_slice(t); + */ +template +constexpr auto tuple_slice(Tuple t) { + constexpr size_t size = std::tuple_size(); + static_assert(Start + N <= size, "tuple_slice: Start + N > size"); + return tuple_elements(t, make_offset_index_sequence{}); +} + +/** + * Use tuple_map to run a mapping function over a tuple to get a new tuple. + * + * Example 1: + * auto result = tuple_map(std::tuple(3, 4, 5), [] + * (int32_t a) -> int16_t {return a+1;}); + * // result == std::tuple(4, 5, 6) + * + * Example 2: + * struct Mapper { + * std::string operator()(int32_t a) const { + * return std::to_string(a); + * } + * int64_t operator()(const std::string& a) const { + * return atoi(a.c_str()); + * } + * }; + * auto result = tuple_map(std::tuple(3, "4"), + * Mapper()); + * // result == std::tuple("3", 4) + * + * Example 3: + * struct A final { + * int32_t func() { + * return 5; + * } + * }; + * struct B final { + * std::string func() { + * return "5"; + * } + * }; + * auto result = tuple_map(std::make_tuple(A(), B()), [] (auto a) { return + * a.func(); }); + * // result == std::tuple(5, "5"); + */ +namespace detail { +template +auto tuple_map( + // NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved) + std::tuple&& tuple, + const Mapper& mapper, + std::index_sequence) { + return std::tuple(std::get( + tuple))))...>(mapper(std::forward(std::get(tuple)))...); +} +} // namespace detail + +template +auto tuple_map(std::tuple&& tuple, const Mapper& mapper) { + return detail::tuple_map( + std::move(tuple), mapper, std::index_sequence_for()); +} + +} // namespace c10::guts diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/Optional.h b/MLPY/Lib/site-packages/torch/include/c10/util/Optional.h new file mode 100644 index 0000000000000000000000000000000000000000..3229bef1315c505d4b7c6a1c5bcbb13b618cfb63 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/Optional.h @@ -0,0 +1,48 @@ +#ifndef C10_UTIL_OPTIONAL_H_ +#define C10_UTIL_OPTIONAL_H_ + +#include +#include + +// Macros.h is not needed, but it does namespace shenanigans that lots +// of downstream code seems to rely on. Feel free to remove it and fix +// up builds. 
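+// Illustrative usage sketch (not part of the upstream header) for
+// c10::value_or_else, defined below: the fallback callable runs only when the
+// optional is empty. `maybe` and `v` are hypothetical names.
+//
+//   c10::optional<int> maybe = c10::nullopt;
+//   int v = c10::value_or_else(maybe, [] { return 42; });  // v == 42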
+ +namespace c10 { +// NOLINTNEXTLINE(misc-unused-using-decls) +using std::bad_optional_access; +// NOLINTNEXTLINE(misc-unused-using-decls) +using std::make_optional; +// NOLINTNEXTLINE(misc-unused-using-decls) +using std::nullopt; +// NOLINTNEXTLINE(misc-unused-using-decls) +using std::nullopt_t; +// NOLINTNEXTLINE(misc-unused-using-decls) +using std::optional; + +namespace detail_ { +// the call to convert(b) has return type A and converts b to type A iff b +// decltype(b) is implicitly convertible to A +template +constexpr U convert(U v) { + return v; +} +} // namespace detail_ +template +constexpr T value_or_else(const optional& v, F&& func) { + static_assert( + std::is_convertible_v, T>, + "func parameters must be a callable that returns a type convertible to the value stored in the optional"); + return v.has_value() ? *v : detail_::convert(std::forward(func)()); +} + +template +constexpr T value_or_else(optional&& v, F&& func) { + static_assert( + std::is_convertible_v, T>, + "func parameters must be a callable that returns a type convertible to the value stored in the optional"); + return v.has_value() ? constexpr_move(std::move(v).contained_val()) + : detail_::convert(std::forward(func)()); +} +} // namespace c10 +#endif // C10_UTIL_OPTIONAL_H_ diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/OptionalArrayRef.h b/MLPY/Lib/site-packages/torch/include/c10/util/OptionalArrayRef.h new file mode 100644 index 0000000000000000000000000000000000000000..1b4ccf26679b2593f4dfa9d8ac0e679a6f892e87 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/OptionalArrayRef.h @@ -0,0 +1,236 @@ +// This file defines OptionalArrayRef, a class that has almost the same +// exact functionality as c10::optional>, except that its +// converting constructor fixes a dangling pointer issue. +// +// The implicit converting constructor of both c10::optional> and +// std::optional> can cause the underlying ArrayRef to store +// a dangling pointer. OptionalArrayRef prevents this by wrapping +// a c10::optional> and fixing the constructor implementation. +// +// See https://github.com/pytorch/pytorch/issues/63645 for more on this. 
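+//
+// Illustrative usage sketch (not part of the upstream header); `print_shape`,
+// `shape` and `d` are hypothetical names used only for illustration:
+//
+//   void print_shape(c10::OptionalIntArrayRef shape) {
+//     if (!shape.has_value()) {
+//       return;  // caller passed c10::nullopt
+//     }
+//     for (int64_t d : *shape) { /* use d */ }
+//   }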
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace c10 { + +template +class OptionalArrayRef final { + public: + // Constructors + + constexpr OptionalArrayRef() noexcept = default; + + constexpr OptionalArrayRef(nullopt_t) noexcept {} + + OptionalArrayRef(const OptionalArrayRef& other) = default; + + OptionalArrayRef(OptionalArrayRef&& other) noexcept = default; + + constexpr OptionalArrayRef(const optional>& other) noexcept + : wrapped_opt_array_ref(other) {} + + constexpr OptionalArrayRef(optional>&& other) noexcept + : wrapped_opt_array_ref(std::move(other)) {} + + constexpr OptionalArrayRef(const T& value) noexcept + : wrapped_opt_array_ref(value) {} + + template < + typename U = ArrayRef, + std::enable_if_t< + !std::is_same_v, OptionalArrayRef> && + !std::is_same_v, std::in_place_t> && + std::is_constructible_v, U&&> && + std::is_convertible_v> && + !std::is_convertible_v, + bool> = false> + constexpr OptionalArrayRef(U&& value) noexcept( + std::is_nothrow_constructible_v, U&&>) + : wrapped_opt_array_ref(std::forward(value)) {} + + template < + typename U = ArrayRef, + std::enable_if_t< + !std::is_same_v, OptionalArrayRef> && + !std::is_same_v, std::in_place_t> && + std::is_constructible_v, U&&> && + !std::is_convertible_v>, + bool> = false> + constexpr explicit OptionalArrayRef(U&& value) noexcept( + std::is_nothrow_constructible_v, U&&>) + : wrapped_opt_array_ref(std::forward(value)) {} + + template + constexpr explicit OptionalArrayRef( + std::in_place_t ip, + Args&&... args) noexcept + : wrapped_opt_array_ref(ip, std::forward(args)...) {} + + template + constexpr explicit OptionalArrayRef( + std::in_place_t ip, + std::initializer_list il, + Args&&... args) + : wrapped_opt_array_ref(ip, il, std::forward(args)...) 
{} + + constexpr OptionalArrayRef(const std::initializer_list& Vec) + : wrapped_opt_array_ref(ArrayRef(Vec)) {} + + // Destructor + + ~OptionalArrayRef() = default; + + // Assignment + + constexpr OptionalArrayRef& operator=(nullopt_t) noexcept { + wrapped_opt_array_ref = c10::nullopt; + return *this; + } + + OptionalArrayRef& operator=(const OptionalArrayRef& other) = default; + + OptionalArrayRef& operator=(OptionalArrayRef&& other) noexcept = default; + + constexpr OptionalArrayRef& operator=( + const optional>& other) noexcept { + wrapped_opt_array_ref = other; + return *this; + } + + constexpr OptionalArrayRef& operator=( + optional>&& other) noexcept { + wrapped_opt_array_ref = std::move(other); + return *this; + } + + template < + typename U = ArrayRef, + typename = std::enable_if_t< + !std::is_same_v, OptionalArrayRef> && + std::is_constructible_v, U&&> && + std::is_assignable_v&, U&&>>> + constexpr OptionalArrayRef& operator=(U&& value) noexcept( + std::is_nothrow_constructible_v, U&&> && + std::is_nothrow_assignable_v&, U&&>) { + wrapped_opt_array_ref = std::forward(value); + return *this; + } + + // Observers + + constexpr ArrayRef* operator->() noexcept { + return &wrapped_opt_array_ref.value(); + } + + constexpr const ArrayRef* operator->() const noexcept { + return &wrapped_opt_array_ref.value(); + } + + constexpr ArrayRef& operator*() & noexcept { + return wrapped_opt_array_ref.value(); + } + + constexpr const ArrayRef& operator*() const& noexcept { + return wrapped_opt_array_ref.value(); + } + + constexpr ArrayRef&& operator*() && noexcept { + return std::move(wrapped_opt_array_ref.value()); + } + + constexpr const ArrayRef&& operator*() const&& noexcept { + return std::move(wrapped_opt_array_ref.value()); + } + + constexpr explicit operator bool() const noexcept { + return wrapped_opt_array_ref.has_value(); + } + + constexpr bool has_value() const noexcept { + return wrapped_opt_array_ref.has_value(); + } + + constexpr ArrayRef& value() & { + return wrapped_opt_array_ref.value(); + } + + constexpr const ArrayRef& value() const& { + return wrapped_opt_array_ref.value(); + } + + constexpr ArrayRef&& value() && { + return std::move(wrapped_opt_array_ref.value()); + } + + constexpr const ArrayRef&& value() const&& { + return std::move(wrapped_opt_array_ref.value()); + } + + template + constexpr std:: + enable_if_t>, ArrayRef> + value_or(U&& default_value) const& { + return wrapped_opt_array_ref.value_or(std::forward(default_value)); + } + + template + constexpr std:: + enable_if_t>, ArrayRef> + value_or(U&& default_value) && { + return wrapped_opt_array_ref.value_or(std::forward(default_value)); + } + + // Modifiers + + constexpr void swap(OptionalArrayRef& other) noexcept { + std::swap(wrapped_opt_array_ref, other.wrapped_opt_array_ref); + } + + constexpr void reset() noexcept { + wrapped_opt_array_ref.reset(); + } + + template + constexpr std:: + enable_if_t, Args&&...>, ArrayRef&> + emplace(Args&&... args) noexcept( + std::is_nothrow_constructible_v, Args&&...>) { + return wrapped_opt_array_ref.emplace(std::forward(args)...); + } + + template + constexpr ArrayRef& emplace( + std::initializer_list il, + Args&&... 
args) noexcept { + return wrapped_opt_array_ref.emplace(il, std::forward(args)...); + } + + private: + optional> wrapped_opt_array_ref; +}; + +using OptionalIntArrayRef = OptionalArrayRef; + +inline bool operator==( + const OptionalIntArrayRef& a1, + const IntArrayRef& other) { + if (!a1.has_value()) { + return false; + } + return a1.value() == other; +} + +inline bool operator==( + const c10::IntArrayRef& a1, + const c10::OptionalIntArrayRef& a2) { + return a2 == a1; +} + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/ParallelGuard.h b/MLPY/Lib/site-packages/torch/include/c10/util/ParallelGuard.h new file mode 100644 index 0000000000000000000000000000000000000000..cabeafcacbdfd6235040b74c00c2e89183b3bf8a --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/ParallelGuard.h @@ -0,0 +1,20 @@ +#pragma once + +#include + +namespace c10 { + +// RAII thread local guard that tracks whether code is being executed in +// `at::parallel_for` or `at::parallel_reduce` loop function. +class C10_API ParallelGuard { + public: + static bool is_enabled(); + + ParallelGuard(bool state); + ~ParallelGuard(); + + private: + bool previous_state_; +}; + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/Registry.h b/MLPY/Lib/site-packages/torch/include/c10/util/Registry.h new file mode 100644 index 0000000000000000000000000000000000000000..20490019b3cfecd722030d19bedd3c900ea156b0 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/Registry.h @@ -0,0 +1,326 @@ +#ifndef C10_UTIL_REGISTRY_H_ +#define C10_UTIL_REGISTRY_H_ + +/** + * Simple registry implementation that uses static variables to + * register object creators during program initialization time. + */ + +// NB: This Registry works poorly when you have other namespaces. +// Make all macro invocations from inside the at namespace. + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace c10 { + +template +inline std::string KeyStrRepr(const KeyType& /*key*/) { + return "[key type printing not supported]"; +} + +template <> +inline std::string KeyStrRepr(const std::string& key) { + return key; +} + +enum RegistryPriority { + REGISTRY_FALLBACK = 1, + REGISTRY_DEFAULT = 2, + REGISTRY_PREFERRED = 3, +}; + +/** + * @brief A template class that allows one to register classes by keys. + * + * The keys are usually a std::string specifying the name, but can be anything + * that can be used in a std::map. + * + * You should most likely not use the Registry class explicitly, but use the + * helper macros below to declare specific registries as well as registering + * objects. + */ +template +class Registry { + public: + typedef std::function Creator; + + Registry(bool warning = true) : registry_(), priority_(), warning_(warning) {} + + void Register( + const SrcType& key, + Creator creator, + const RegistryPriority priority = REGISTRY_DEFAULT) { + std::lock_guard lock(register_mutex_); + // The if statement below is essentially the same as the following line: + // TORCH_CHECK_EQ(registry_.count(key), 0) << "Key " << key + // << " registered twice."; + // However, TORCH_CHECK_EQ depends on google logging, and since registration + // is carried out at static initialization time, we do not want to have an + // explicit dependency on glog's initialization function. 
+ if (registry_.count(key) != 0) { + auto cur_priority = priority_[key]; + if (priority > cur_priority) { +#ifdef DEBUG + std::string warn_msg = + "Overwriting already registered item for key " + KeyStrRepr(key); + fprintf(stderr, "%s\n", warn_msg.c_str()); +#endif + registry_[key] = creator; + priority_[key] = priority; + } else if (priority == cur_priority) { + std::string err_msg = + "Key already registered with the same priority: " + KeyStrRepr(key); + fprintf(stderr, "%s\n", err_msg.c_str()); + if (terminate_) { + std::exit(1); + } else { + throw std::runtime_error(err_msg); + } + } else if (warning_) { + std::string warn_msg = + "Higher priority item already registered, skipping registration of " + + KeyStrRepr(key); + fprintf(stderr, "%s\n", warn_msg.c_str()); + } + } else { + registry_[key] = creator; + priority_[key] = priority; + } + } + + void Register( + const SrcType& key, + Creator creator, + const std::string& help_msg, + const RegistryPriority priority = REGISTRY_DEFAULT) { + Register(key, creator, priority); + help_message_[key] = help_msg; + } + + inline bool Has(const SrcType& key) { + return (registry_.count(key) != 0); + } + + ObjectPtrType Create(const SrcType& key, Args... args) { + auto it = registry_.find(key); + if (it == registry_.end()) { + // Returns nullptr if the key is not registered. + return nullptr; + } + return it->second(args...); + } + + /** + * Returns the keys currently registered as a std::vector. + */ + std::vector Keys() const { + std::vector keys; + keys.reserve(registry_.size()); + for (const auto& it : registry_) { + keys.push_back(it.first); + } + return keys; + } + + inline const std::unordered_map& HelpMessage() const { + return help_message_; + } + + const char* HelpMessage(const SrcType& key) const { + auto it = help_message_.find(key); + if (it == help_message_.end()) { + return nullptr; + } + return it->second.c_str(); + } + + // Used for testing, if terminate is unset, Registry throws instead of + // calling std::exit + void SetTerminate(bool terminate) { + terminate_ = terminate; + } + + private: + std::unordered_map registry_; + std::unordered_map priority_; + bool terminate_{true}; + const bool warning_; + std::unordered_map help_message_; + std::mutex register_mutex_; + + C10_DISABLE_COPY_AND_ASSIGN(Registry); +}; + +template +class Registerer { + public: + explicit Registerer( + const SrcType& key, + Registry* registry, + typename Registry::Creator creator, + const std::string& help_msg = "") { + registry->Register(key, creator, help_msg); + } + + explicit Registerer( + const SrcType& key, + const RegistryPriority priority, + Registry* registry, + typename Registry::Creator creator, + const std::string& help_msg = "") { + registry->Register(key, creator, help_msg, priority); + } + + template + static ObjectPtrType DefaultCreator(Args... args) { + return ObjectPtrType(new DerivedType(args...)); + } +}; + +/** + * C10_DECLARE_TYPED_REGISTRY is a macro that expands to a function + * declaration, as well as creating a convenient typename for its corresponding + * registerer. + */ +// Note on C10_IMPORT and C10_EXPORT below: we need to explicitly mark DECLARE +// as import and DEFINE as export, because these registry macros will be used +// in downstream shared libraries as well, and one cannot use *_API - the API +// macro will be defined on a per-shared-library basis. 
Semantically, when one +// declares a typed registry it is always going to be IMPORT, and when one +// defines a registry (which should happen ONLY ONCE and ONLY IN SOURCE FILE), +// the instantiation unit is always going to be exported. +// +// The only unique condition is when in the same file one does DECLARE and +// DEFINE - in Windows compilers, this generates a warning that dllimport and +// dllexport are mixed, but the warning is fine and linker will be properly +// exporting the symbol. Same thing happens in the gflags flag declaration and +// definition caes. +#define C10_DECLARE_TYPED_REGISTRY( \ + RegistryName, SrcType, ObjectType, PtrType, ...) \ + C10_API ::c10::Registry, ##__VA_ARGS__>* \ + RegistryName(); \ + typedef ::c10::Registerer, ##__VA_ARGS__> \ + Registerer##RegistryName + +#define TORCH_DECLARE_TYPED_REGISTRY( \ + RegistryName, SrcType, ObjectType, PtrType, ...) \ + TORCH_API ::c10::Registry, ##__VA_ARGS__>* \ + RegistryName(); \ + typedef ::c10::Registerer, ##__VA_ARGS__> \ + Registerer##RegistryName + +#define C10_DEFINE_TYPED_REGISTRY( \ + RegistryName, SrcType, ObjectType, PtrType, ...) \ + C10_EXPORT ::c10::Registry, ##__VA_ARGS__>* \ + RegistryName() { \ + static ::c10::Registry, ##__VA_ARGS__>* \ + registry = new ::c10:: \ + Registry, ##__VA_ARGS__>(); \ + return registry; \ + } + +#define C10_DEFINE_TYPED_REGISTRY_WITHOUT_WARNING( \ + RegistryName, SrcType, ObjectType, PtrType, ...) \ + C10_EXPORT ::c10::Registry, ##__VA_ARGS__>* \ + RegistryName() { \ + static ::c10::Registry, ##__VA_ARGS__>* \ + registry = \ + new ::c10::Registry, ##__VA_ARGS__>( \ + false); \ + return registry; \ + } + +// Note(Yangqing): The __VA_ARGS__ below allows one to specify a templated +// creator with comma in its templated arguments. +#define C10_REGISTER_TYPED_CREATOR(RegistryName, key, ...) \ + static Registerer##RegistryName C10_ANONYMOUS_VARIABLE(g_##RegistryName)( \ + key, RegistryName(), ##__VA_ARGS__); + +#define C10_REGISTER_TYPED_CREATOR_WITH_PRIORITY( \ + RegistryName, key, priority, ...) \ + static Registerer##RegistryName C10_ANONYMOUS_VARIABLE(g_##RegistryName)( \ + key, priority, RegistryName(), ##__VA_ARGS__); + +#define C10_REGISTER_TYPED_CLASS(RegistryName, key, ...) \ + static Registerer##RegistryName C10_ANONYMOUS_VARIABLE(g_##RegistryName)( \ + key, \ + RegistryName(), \ + Registerer##RegistryName::DefaultCreator<__VA_ARGS__>, \ + ::c10::demangle_type<__VA_ARGS__>()); + +#define C10_REGISTER_TYPED_CLASS_WITH_PRIORITY( \ + RegistryName, key, priority, ...) \ + static Registerer##RegistryName C10_ANONYMOUS_VARIABLE(g_##RegistryName)( \ + key, \ + priority, \ + RegistryName(), \ + Registerer##RegistryName::DefaultCreator<__VA_ARGS__>, \ + ::c10::demangle_type<__VA_ARGS__>()); + +// C10_DECLARE_REGISTRY and C10_DEFINE_REGISTRY are hard-wired to use +// std::string as the key type, because that is the most commonly used cases. +#define C10_DECLARE_REGISTRY(RegistryName, ObjectType, ...) \ + C10_DECLARE_TYPED_REGISTRY( \ + RegistryName, std::string, ObjectType, std::unique_ptr, ##__VA_ARGS__) + +#define TORCH_DECLARE_REGISTRY(RegistryName, ObjectType, ...) \ + TORCH_DECLARE_TYPED_REGISTRY( \ + RegistryName, std::string, ObjectType, std::unique_ptr, ##__VA_ARGS__) + +#define C10_DEFINE_REGISTRY(RegistryName, ObjectType, ...) \ + C10_DEFINE_TYPED_REGISTRY( \ + RegistryName, std::string, ObjectType, std::unique_ptr, ##__VA_ARGS__) + +#define C10_DEFINE_REGISTRY_WITHOUT_WARNING(RegistryName, ObjectType, ...) 
\ + C10_DEFINE_TYPED_REGISTRY_WITHOUT_WARNING( \ + RegistryName, std::string, ObjectType, std::unique_ptr, ##__VA_ARGS__) + +#define C10_DECLARE_SHARED_REGISTRY(RegistryName, ObjectType, ...) \ + C10_DECLARE_TYPED_REGISTRY( \ + RegistryName, std::string, ObjectType, std::shared_ptr, ##__VA_ARGS__) + +#define TORCH_DECLARE_SHARED_REGISTRY(RegistryName, ObjectType, ...) \ + TORCH_DECLARE_TYPED_REGISTRY( \ + RegistryName, std::string, ObjectType, std::shared_ptr, ##__VA_ARGS__) + +#define C10_DEFINE_SHARED_REGISTRY(RegistryName, ObjectType, ...) \ + C10_DEFINE_TYPED_REGISTRY( \ + RegistryName, std::string, ObjectType, std::shared_ptr, ##__VA_ARGS__) + +#define C10_DEFINE_SHARED_REGISTRY_WITHOUT_WARNING( \ + RegistryName, ObjectType, ...) \ + C10_DEFINE_TYPED_REGISTRY_WITHOUT_WARNING( \ + RegistryName, std::string, ObjectType, std::shared_ptr, ##__VA_ARGS__) + +// C10_REGISTER_CREATOR and C10_REGISTER_CLASS are hard-wired to use std::string +// as the key +// type, because that is the most commonly used cases. +#define C10_REGISTER_CREATOR(RegistryName, key, ...) \ + C10_REGISTER_TYPED_CREATOR(RegistryName, #key, __VA_ARGS__) + +#define C10_REGISTER_CREATOR_WITH_PRIORITY(RegistryName, key, priority, ...) \ + C10_REGISTER_TYPED_CREATOR_WITH_PRIORITY( \ + RegistryName, #key, priority, __VA_ARGS__) + +#define C10_REGISTER_CLASS(RegistryName, key, ...) \ + C10_REGISTER_TYPED_CLASS(RegistryName, #key, __VA_ARGS__) + +#define C10_REGISTER_CLASS_WITH_PRIORITY(RegistryName, key, priority, ...) \ + C10_REGISTER_TYPED_CLASS_WITH_PRIORITY( \ + RegistryName, #key, priority, __VA_ARGS__) + +} // namespace c10 + +#endif // C10_UTIL_REGISTRY_H_ diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/ScopeExit.h b/MLPY/Lib/site-packages/torch/include/c10/util/ScopeExit.h new file mode 100644 index 0000000000000000000000000000000000000000..8ed2373eea03bb8615d6742f05996e9aa27158d6 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/ScopeExit.h @@ -0,0 +1,50 @@ +#pragma once + +#include +#include + +namespace c10 { + +/** + * Mostly copied from https://llvm.org/doxygen/ScopeExit_8h_source.html + */ +template +class scope_exit { + Callable ExitFunction; + bool Engaged = true; // False once moved-from or release()d. + + public: + template + // NOLINTNEXTLINE(bugprone-forwarding-reference-overload) + explicit scope_exit(Fp&& F) : ExitFunction(std::forward(F)) {} + + scope_exit(scope_exit&& Rhs) noexcept + : ExitFunction(std::move(Rhs.ExitFunction)), Engaged(Rhs.Engaged) { + Rhs.release(); + } + scope_exit(const scope_exit&) = delete; + scope_exit& operator=(scope_exit&&) = delete; + scope_exit& operator=(const scope_exit&) = delete; + + void release() { + Engaged = false; + } + + ~scope_exit() { + if (Engaged) { + ExitFunction(); + } + } +}; + +// Keeps the callable object that is passed in, and execute it at the +// destruction of the returned object (usually at the scope exit where the +// returned object is kept). +// +// Interface is specified by p0052r2. 
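+//
+// Illustrative usage sketch (not part of the upstream header); `f` and `guard`
+// are hypothetical names and "data.bin" a hypothetical file:
+//
+//   FILE* f = std::fopen("data.bin", "rb");
+//   auto guard = c10::make_scope_exit([&] { if (f) std::fclose(f); });
+//   // Early returns and exceptions still run the cleanup; call
+//   // guard.release() to keep the file open past this scope.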
+template +scope_exit> make_scope_exit(Callable&& F) { + return scope_exit>(std::forward(F)); +} + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/SmallBuffer.h b/MLPY/Lib/site-packages/torch/include/c10/util/SmallBuffer.h new file mode 100644 index 0000000000000000000000000000000000000000..1e4317ee03d0913d1a4128c7330631d01b889510 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/SmallBuffer.h @@ -0,0 +1,87 @@ +#pragma once +#include +#include +#include +#include + +/** Helper class for allocating temporary fixed size arrays with SBO. + * + * This is intentionally much simpler than SmallVector, to improve performance + * at the expense of many features: + * - No zero-initialization for numeric types + * - No resizing after construction + * - No copy/move + * - No non-trivial types + */ + +namespace c10 { + +template +class SmallBuffer { + static_assert(std::is_trivial_v, "SmallBuffer is intended for POD types"); + + std::array storage_; + size_t size_{}; + T* data_{}; + + public: + SmallBuffer(size_t size) : size_(size) { + if (size > N) { + data_ = new T[size]; + } else { + data_ = &storage_[0]; + } + } + + SmallBuffer(const SmallBuffer&) = delete; + SmallBuffer& operator=(const SmallBuffer&) = delete; + + // move constructor is needed in function return + SmallBuffer(SmallBuffer&& rhs) noexcept : size_{rhs.size_} { + rhs.size_ = 0; + if (size_ > N) { + data_ = rhs.data_; + rhs.data_ = nullptr; + } else { + storage_ = std::move(rhs.storage_); + data_ = &storage_[0]; + } + } + + SmallBuffer& operator=(SmallBuffer&&) = delete; + + ~SmallBuffer() { + if (size_ > N) { + delete[] data_; + } + } + T& operator[](size_t idx) { + return data()[idx]; + } + const T& operator[](size_t idx) const { + return data()[idx]; + } + T* data() { + return data_; + } + const T* data() const { + return data_; + } + size_t size() const { + return size_; + } + T* begin() { + return data_; + } + const T* begin() const { + return data_; + } + T* end() { + return data_ + size_; + } + const T* end() const { + return data_ + size_; + } +}; + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/SmallVector.h b/MLPY/Lib/site-packages/torch/include/c10/util/SmallVector.h new file mode 100644 index 0000000000000000000000000000000000000000..2a72446179e4493d6033fac59664586a16ad921c --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/SmallVector.h @@ -0,0 +1,1476 @@ +//===- llvm/ADT/SmallVector.h - 'Normally small' vectors --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the SmallVector class. +// +//===----------------------------------------------------------------------===// + +// ATen: modified from llvm::SmallVector. 
+// used std::is_trivially_{copy,move}_constructible +// replaced iterator_range constructor with inline Container&& constructor +// replaced LLVM_NODISCARD, LLVM_LIKELY, and LLVM_UNLIKELY with c10 equivalents +// removed LLVM_GSL_OWNER +// added SmallVector::at +// added operator<< for std::ostream +// added C10_API to export SmallVectorBase + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +C10_CLANG_DIAGNOSTIC_PUSH() +#if C10_CLANG_HAS_WARNING("-Wshorten-64-to-32") +C10_CLANG_DIAGNOSTIC_IGNORE("-Wshorten-64-to-32") +#endif + +namespace c10 { + +/// This is all the stuff common to all SmallVectors. +/// +/// The template parameter specifies the type which should be used to hold the +/// Size and Capacity of the SmallVector, so it can be adjusted. +/// Using 32 bit size is desirable to shrink the size of the SmallVector. +/// Using 64 bit size is desirable for cases like SmallVector, where a +/// 32 bit size would limit the vector to ~4GB. SmallVectors are used for +/// buffering bitcode output - which can exceed 4GB. +template +class C10_API SmallVectorBase { + protected: + void* BeginX; + Size_T Size = 0, Capacity; + + /// The maximum value of the Size_T used. + static constexpr size_t SizeTypeMax() { + return std::numeric_limits::max(); + } + + SmallVectorBase(void* FirstEl, size_t TotalCapacity) + : BeginX(FirstEl), Capacity(TotalCapacity) {} + + /// This is a helper for \a grow() that's out of line to reduce code + /// duplication. This function will report a fatal error if it can't grow at + /// least to \p MinSize. + void* mallocForGrow(size_t MinSize, size_t TSize, size_t& NewCapacity); + + /// This is an implementation of the grow() method which only works + /// on POD-like data types and is out of line to reduce code duplication. + /// This function will report a fatal error if it cannot increase capacity. + void grow_pod(void* FirstEl, size_t MinSize, size_t TSize); + + public: + SmallVectorBase() = delete; + size_t size() const { + return Size; + } + size_t capacity() const { + return Capacity; + } + + C10_NODISCARD bool empty() const { + return !Size; + } + + /// Set the array size to \p N, which the current array must have enough + /// capacity for. + /// + /// This does not construct or destroy any elements in the vector. + /// + /// Clients can use this in conjunction with capacity() to write past the end + /// of the buffer when they know that more elements are available, and only + /// update the size later. This avoids the cost of value initializing elements + /// which will only be overwritten. + void set_size(size_t N) { + assert(N <= capacity()); + Size = N; + } +}; + +template +using SmallVectorSizeType = + std::conditional_t= 8, uint64_t, uint32_t>; + +/// Figure out the offset of the first element. +template +struct SmallVectorAlignmentAndSize { + alignas(SmallVectorBase>) char Base[sizeof( + SmallVectorBase>)]; + alignas(T) char FirstEl[sizeof(T)]; +}; + +/// This is the part of SmallVectorTemplateBase which does not depend on whether +/// the type T is a POD. The extra dummy template argument is used by ArrayRef +/// to avoid unnecessarily requiring T to be complete. +template +class SmallVectorTemplateCommon + : public SmallVectorBase> { + using Base = SmallVectorBase>; + + /// Find the address of the first element. 
For this pointer math to be valid + /// with small-size of 0 for T with lots of alignment, it's important that + /// SmallVectorStorage is properly-aligned even for small-size of 0. + void* getFirstEl() const { + return const_cast(reinterpret_cast( + reinterpret_cast(this) + + offsetof(SmallVectorAlignmentAndSize, FirstEl))); + } + // Space after 'FirstEl' is clobbered, do not add any instance vars after it. + + protected: + SmallVectorTemplateCommon(size_t Size) : Base(getFirstEl(), Size) {} + + void grow_pod(size_t MinSize, size_t TSize) { + Base::grow_pod(getFirstEl(), MinSize, TSize); + } + + /// Return true if this is a smallvector which has not had dynamic + /// memory allocated for it. + bool isSmall() const { + return this->BeginX == getFirstEl(); + } + + /// Put this vector in a state of being small. + void resetToSmall() { + this->BeginX = getFirstEl(); + this->Size = this->Capacity = 0; // FIXME: Setting Capacity to 0 is suspect. + } + + /// Return true if V is an internal reference to the given range. + bool isReferenceToRange(const void* V, const void* First, const void* Last) + const { + // Use std::less to avoid UB. + std::less<> LessThan; + return !LessThan(V, First) && LessThan(V, Last); + } + + /// Return true if V is an internal reference to this vector. + bool isReferenceToStorage(const void* V) const { + return isReferenceToRange(V, this->begin(), this->end()); + } + + /// Return true if First and Last form a valid (possibly empty) range in this + /// vector's storage. + bool isRangeInStorage(const void* First, const void* Last) const { + // Use std::less to avoid UB. + std::less<> LessThan; + return !LessThan(First, this->begin()) && !LessThan(Last, First) && + !LessThan(this->end(), Last); + } + + /// Return true unless Elt will be invalidated by resizing the vector to + /// NewSize. + bool isSafeToReferenceAfterResize(const void* Elt, size_t NewSize) { + // Past the end. + if (C10_LIKELY(!isReferenceToStorage(Elt))) + return true; + + // Return false if Elt will be destroyed by shrinking. + if (NewSize <= this->size()) + return Elt < this->begin() + NewSize; + + // Return false if we need to grow. + return NewSize <= this->capacity(); + } + + /// Check whether Elt will be invalidated by resizing the vector to NewSize. + void assertSafeToReferenceAfterResize(const void* Elt, size_t NewSize) { + (void)Elt; // Suppress unused variable warning + (void)NewSize; // Suppress unused variable warning + assert( + isSafeToReferenceAfterResize(Elt, NewSize) && + "Attempting to reference an element of the vector in an operation " + "that invalidates it"); + } + + /// Check whether Elt will be invalidated by increasing the size of the + /// vector by N. + void assertSafeToAdd(const void* Elt, size_t N = 1) { + this->assertSafeToReferenceAfterResize(Elt, this->size() + N); + } + + /// Check whether any part of the range will be invalidated by clearing. + void assertSafeToReferenceAfterClear(const T* From, const T* To) { + if (From == To) + return; + this->assertSafeToReferenceAfterResize(From, 0); + this->assertSafeToReferenceAfterResize(To - 1, 0); + } + template < + class ItTy, + std::enable_if_t, T*>, bool> = + false> + void assertSafeToReferenceAfterClear(ItTy, ItTy) {} + + /// Check whether any part of the range will be invalidated by growing. 
+ void assertSafeToAddRange(const T* From, const T* To) { + if (From == To) + return; + this->assertSafeToAdd(From, To - From); + this->assertSafeToAdd(To - 1, To - From); + } + template < + class ItTy, + std::enable_if_t, T*>, bool> = + false> + void assertSafeToAddRange(ItTy, ItTy) {} + + /// Reserve enough space to add one element, and return the updated element + /// pointer in case it was a reference to the storage. + template + static const T* reserveForParamAndGetAddressImpl( + U* This, + const T& Elt, + size_t N) { + size_t NewSize = This->size() + N; + if (C10_LIKELY(NewSize <= This->capacity())) + return &Elt; + + bool ReferencesStorage = false; + int64_t Index = -1; + if (!U::TakesParamByValue) { + if (C10_UNLIKELY(This->isReferenceToStorage(&Elt))) { + ReferencesStorage = true; + Index = &Elt - This->begin(); + } + } + This->grow(NewSize); + return ReferencesStorage ? This->begin() + Index : &Elt; + } + + public: + using size_type = size_t; + using difference_type = ptrdiff_t; + using value_type = T; + using iterator = T*; + using const_iterator = const T*; + + using const_reverse_iterator = std::reverse_iterator; + using reverse_iterator = std::reverse_iterator; + + using reference = T&; + using const_reference = const T&; + using pointer = T*; + using const_pointer = const T*; + + using Base::capacity; + using Base::empty; + using Base::size; + + // forward iterator creation methods. + iterator begin() { + return (iterator)this->BeginX; + } + const_iterator begin() const { + return (const_iterator)this->BeginX; + } + iterator end() { + return begin() + size(); + } + const_iterator end() const { + return begin() + size(); + } + + // reverse iterator creation methods. + reverse_iterator rbegin() { + return reverse_iterator(end()); + } + const_reverse_iterator rbegin() const { + return const_reverse_iterator(end()); + } + reverse_iterator rend() { + return reverse_iterator(begin()); + } + const_reverse_iterator rend() const { + return const_reverse_iterator(begin()); + } + + size_type size_in_bytes() const { + return size() * sizeof(T); + } + size_type max_size() const { + return std::min(this->SizeTypeMax(), size_type(-1) / sizeof(T)); + } + + size_t capacity_in_bytes() const { + return capacity() * sizeof(T); + } + + /// Return a pointer to the vector's buffer, even if empty(). + pointer data() { + return pointer(begin()); + } + /// Return a pointer to the vector's buffer, even if empty(). + const_pointer data() const { + return const_pointer(begin()); + } + + // SmallVector::at is NOT from LLVM. + reference at(size_type idx) { + assert(idx < size()); + return begin()[idx]; + } + const_reference at(size_type idx) const { + assert(idx < size()); + return begin()[idx]; + } + reference operator[](size_type idx) { + assert(idx < size()); + return begin()[idx]; + } + const_reference operator[](size_type idx) const { + assert(idx < size()); + return begin()[idx]; + } + + reference front() { + assert(!empty()); + return begin()[0]; + } + const_reference front() const { + assert(!empty()); + return begin()[0]; + } + + reference back() { + assert(!empty()); + return end()[-1]; + } + const_reference back() const { + assert(!empty()); + return end()[-1]; + } +}; + +/// SmallVectorTemplateBase - This is where we put +/// method implementations that are designed to work with non-trivial T's. +/// +/// We approximate is_trivially_copyable with trivial move/copy construction and +/// trivial destruction. 
While the standard doesn't specify that you're allowed +/// copy these types with memcpy, there is no way for the type to observe this. +/// This catches the important case of std::pair, which is not +/// trivially assignable. +/// +/// XXX: if build fails here fall back to C10_IS_TRIVIALLY_COPYABLE and make a +/// note +template < + typename T, + bool = (std::is_trivially_copy_constructible_v)&&( + std::is_trivially_move_constructible_v< + T>)&&std::is_trivially_destructible_v> +class SmallVectorTemplateBase : public SmallVectorTemplateCommon { + friend class SmallVectorTemplateCommon; + + protected: + static constexpr bool TakesParamByValue = false; + using ValueParamT = const T&; + + SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} + + static void destroy_range(T* S, T* E) { + while (S != E) { + --E; + E->~T(); + } + } + + /// Move the range [I, E) into the uninitialized memory starting with "Dest", + /// constructing elements as needed. + template + static void uninitialized_move(It1 I, It1 E, It2 Dest) { + std::uninitialized_copy( + std::make_move_iterator(I), std::make_move_iterator(E), Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory starting with "Dest", + /// constructing elements as needed. + template + static void uninitialized_copy(It1 I, It1 E, It2 Dest) { + std::uninitialized_copy(I, E, Dest); + } + + /// Grow the allocated memory (without initializing new elements), doubling + /// the size of the allocated memory. Guarantees space for at least one more + /// element, or MinSize more elements if specified. + void grow(size_t MinSize = 0); + + /// Create a new allocation big enough for \p MinSize and pass back its size + /// in \p NewCapacity. This is the first section of \a grow(). + T* mallocForGrow(size_t MinSize, size_t& NewCapacity) { + return static_cast( + SmallVectorBase>::mallocForGrow( + MinSize, sizeof(T), NewCapacity)); + } + + /// Move existing elements over to the new allocation \p NewElts, the middle + /// section of \a grow(). + void moveElementsForGrow(T* NewElts); + + /// Transfer ownership of the allocation, finishing up \a grow(). + void takeAllocationForGrow(T* NewElts, size_t NewCapacity); + + /// Reserve enough space to add one element, and return the updated element + /// pointer in case it was a reference to the storage. + const T* reserveForParamAndGetAddress(const T& Elt, size_t N = 1) { + return this->reserveForParamAndGetAddressImpl(this, Elt, N); + } + + /// Reserve enough space to add one element, and return the updated element + /// pointer in case it was a reference to the storage. + T* reserveForParamAndGetAddress(T& Elt, size_t N = 1) { + return const_cast(this->reserveForParamAndGetAddressImpl(this, Elt, N)); + } + + static T&& forward_value_param(T&& V) { + return std::move(V); + } + static const T& forward_value_param(const T& V) { + return V; + } + + void growAndAssign(size_t NumElts, const T& Elt) { + // Grow manually in case Elt is an internal reference. + size_t NewCapacity = 0; + T* NewElts = mallocForGrow(NumElts, NewCapacity); + std::uninitialized_fill_n(NewElts, NumElts, Elt); + this->destroy_range(this->begin(), this->end()); + takeAllocationForGrow(NewElts, NewCapacity); + this->set_size(NumElts); + } + + template + T& growAndEmplaceBack(ArgTypes&&... Args) { + // Grow manually in case one of Args is an internal reference. 
+ size_t NewCapacity = 0; + T* NewElts = mallocForGrow(0, NewCapacity); + ::new ((void*)(NewElts + this->size())) T(std::forward(Args)...); + moveElementsForGrow(NewElts); + takeAllocationForGrow(NewElts, NewCapacity); + this->set_size(this->size() + 1); + return this->back(); + } + + public: + void push_back(const T& Elt) { + const T* EltPtr = reserveForParamAndGetAddress(Elt); + ::new ((void*)this->end()) T(*EltPtr); + this->set_size(this->size() + 1); + } + + void push_back(T&& Elt) { + T* EltPtr = reserveForParamAndGetAddress(Elt); + ::new ((void*)this->end()) T(::std::move(*EltPtr)); + this->set_size(this->size() + 1); + } + + void pop_back() { + this->set_size(this->size() - 1); + this->end()->~T(); + } +}; + +// Define this out-of-line to dissuade the C++ compiler from inlining it. +template +void SmallVectorTemplateBase::grow(size_t MinSize) { + size_t NewCapacity = 0; + T* NewElts = mallocForGrow(MinSize, NewCapacity); + moveElementsForGrow(NewElts); + takeAllocationForGrow(NewElts, NewCapacity); +} + +// Define this out-of-line to dissuade the C++ compiler from inlining it. +template +void SmallVectorTemplateBase::moveElementsForGrow( + T* NewElts) { + // Move the elements over. + this->uninitialized_move(this->begin(), this->end(), NewElts); + + // Destroy the original elements. + destroy_range(this->begin(), this->end()); +} + +// Define this out-of-line to dissuade the C++ compiler from inlining it. +template +void SmallVectorTemplateBase::takeAllocationForGrow( + T* NewElts, + size_t NewCapacity) { + // If this wasn't grown from the inline copy, deallocate the old space. + if (!this->isSmall()) + free(this->begin()); + + this->BeginX = NewElts; + this->Capacity = NewCapacity; +} + +/// SmallVectorTemplateBase - This is where we put +/// method implementations that are designed to work with trivially copyable +/// T's. This allows using memcpy in place of copy/move construction and +/// skipping destruction. +template +class SmallVectorTemplateBase : public SmallVectorTemplateCommon { + friend class SmallVectorTemplateCommon; + + protected: + /// True if it's cheap enough to take parameters by value. Doing so avoids + /// overhead related to mitigations for reference invalidation. + static constexpr bool TakesParamByValue = sizeof(T) <= 2 * sizeof(void*); + + /// Either const T& or T, depending on whether it's cheap enough to take + /// parameters by value. + using ValueParamT = std::conditional_t; + + SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} + + // No need to do a destroy loop for POD's. + static void destroy_range(T*, T*) {} + + /// Move the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. + template + static void uninitialized_move(It1 I, It1 E, It2 Dest) { + // Just do a copy. + uninitialized_copy(I, E, Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. + template + static void uninitialized_copy(It1 I, It1 E, It2 Dest) { + // Arbitrary iterator types; just use the basic implementation. + std::uninitialized_copy(I, E, Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. 
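+  /// For trivially copyable element types this specialization is what keeps
+  /// SmallVector cheap; an illustrative sketch (not part of the upstream
+  /// header):
+  ///
+  ///   c10::SmallVector<int64_t, 5> shape;
+  ///   shape.push_back(7);   // parameter taken by value (TakesParamByValue) and
+  ///                         // stored with a raw memcpy
+  ///   shape.clear();        // destroy_range is a no-op, so no destructor loop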
+ template + static void uninitialized_copy( + T1* I, + T1* E, + T2* Dest, + std::enable_if_t, T2>>* = + nullptr) { + // Use memcpy for PODs iterated by pointers (which includes SmallVector + // iterators): std::uninitialized_copy optimizes to memmove, but we can + // use memcpy here. Note that I and E are iterators and thus might be + // invalid for memcpy if they are equal. + if (I != E) + memcpy(reinterpret_cast(Dest), I, (E - I) * sizeof(T)); + } + + /// Double the size of the allocated memory, guaranteeing space for at + /// least one more element or MinSize if specified. + void grow(size_t MinSize = 0) { + this->grow_pod(MinSize, sizeof(T)); + } + + /// Reserve enough space to add one element, and return the updated element + /// pointer in case it was a reference to the storage. + const T* reserveForParamAndGetAddress(const T& Elt, size_t N = 1) { + return this->reserveForParamAndGetAddressImpl(this, Elt, N); + } + + /// Reserve enough space to add one element, and return the updated element + /// pointer in case it was a reference to the storage. + T* reserveForParamAndGetAddress(T& Elt, size_t N = 1) { + return const_cast(this->reserveForParamAndGetAddressImpl(this, Elt, N)); + } + + /// Copy \p V or return a reference, depending on \a ValueParamT. + static ValueParamT forward_value_param(ValueParamT V) { + return V; + } + + void growAndAssign(size_t NumElts, T Elt) { + // Elt has been copied in case it's an internal reference, side-stepping + // reference invalidation problems without losing the realloc optimization. + this->set_size(0); + this->grow(NumElts); + std::uninitialized_fill_n(this->begin(), NumElts, Elt); + this->set_size(NumElts); + } + + template + T& growAndEmplaceBack(ArgTypes&&... Args) { + // Use push_back with a copy in case Args has an internal reference, + // side-stepping reference invalidation problems without losing the realloc + // optimization. + push_back(T(std::forward(Args)...)); + return this->back(); + } + + public: + void push_back(ValueParamT Elt) { + const T* EltPtr = reserveForParamAndGetAddress(Elt); + memcpy(reinterpret_cast(this->end()), EltPtr, sizeof(T)); + this->set_size(this->size() + 1); + } + + void pop_back() { + this->set_size(this->size() - 1); + } +}; + +/// This class consists of common code factored out of the SmallVector class to +/// reduce code duplication based on the SmallVector 'N' template parameter. +template +class SmallVectorImpl : public SmallVectorTemplateBase { + using SuperClass = SmallVectorTemplateBase; + + public: + using iterator = typename SuperClass::iterator; + using const_iterator = typename SuperClass::const_iterator; + using reference = typename SuperClass::reference; + using size_type = typename SuperClass::size_type; + + protected: + using SmallVectorTemplateBase::TakesParamByValue; + using ValueParamT = typename SuperClass::ValueParamT; + + // Default ctor - Initialize to empty. + explicit SmallVectorImpl(unsigned N) : SmallVectorTemplateBase(N) {} + + public: + SmallVectorImpl(const SmallVectorImpl&) = delete; + + ~SmallVectorImpl() { + // Subclass has already destructed this vector's elements. + // If this wasn't grown from the inline copy, deallocate the old space. 
+ if (!this->isSmall()) + free(this->begin()); + } + + void clear() { + this->destroy_range(this->begin(), this->end()); + this->Size = 0; + } + + private: + template + void resizeImpl(size_type N) { + if (N < this->size()) { + this->pop_back_n(this->size() - N); + } else if (N > this->size()) { + this->reserve(N); + for (auto I = this->end(), E = this->begin() + N; I != E; ++I) + if (ForOverwrite) + new (&*I) T; + else + new (&*I) T(); + this->set_size(N); + } + } + + public: + void resize(size_type N) { + resizeImpl(N); + } + + /// Like resize, but \ref T is POD, the new values won't be initialized. + void resize_for_overwrite(size_type N) { + resizeImpl(N); + } + + void resize(size_type N, ValueParamT NV) { + if (N == this->size()) + return; + + if (N < this->size()) { + this->pop_back_n(this->size() - N); + return; + } + + // N > this->size(). Defer to append. + this->append(N - this->size(), NV); + } + + void reserve(size_type N) { + if (this->capacity() < N) + this->grow(N); + } + + void pop_back_n(size_type NumItems) { + assert(this->size() >= NumItems); + this->destroy_range(this->end() - NumItems, this->end()); + this->set_size(this->size() - NumItems); + } + + C10_NODISCARD T pop_back_val() { + T Result = ::std::move(this->back()); + this->pop_back(); + return Result; + } + + void swap(SmallVectorImpl& RHS) noexcept; + + /// Add the specified range to the end of the SmallVector. + template < + typename in_iter, + typename = std::enable_if_t::iterator_category, + std::input_iterator_tag>>> + void append(in_iter in_start, in_iter in_end) { + this->assertSafeToAddRange(in_start, in_end); + size_type NumInputs = std::distance(in_start, in_end); + this->reserve(this->size() + NumInputs); + this->uninitialized_copy(in_start, in_end, this->end()); + this->set_size(this->size() + NumInputs); + } + + /// Append \p NumInputs copies of \p Elt to the end. + void append(size_type NumInputs, ValueParamT Elt) { + const T* EltPtr = this->reserveForParamAndGetAddress(Elt, NumInputs); + std::uninitialized_fill_n(this->end(), NumInputs, *EltPtr); + this->set_size(this->size() + NumInputs); + } + + void append(std::initializer_list IL) { + append(IL.begin(), IL.end()); + } + + void append(const SmallVectorImpl& RHS) { + append(RHS.begin(), RHS.end()); + } + + void assign(size_type NumElts, ValueParamT Elt) { + // Note that Elt could be an internal reference. + if (NumElts > this->capacity()) { + this->growAndAssign(NumElts, Elt); + return; + } + + // Assign over existing elements. + std::fill_n(this->begin(), std::min(NumElts, this->size()), Elt); + if (NumElts > this->size()) + std::uninitialized_fill_n(this->end(), NumElts - this->size(), Elt); + else if (NumElts < this->size()) + this->destroy_range(this->begin() + NumElts, this->end()); + this->set_size(NumElts); + } + + // FIXME: Consider assigning over existing elements, rather than clearing & + // re-initializing them - for all assign(...) variants. + + template < + typename in_iter, + typename = std::enable_if_t::iterator_category, + std::input_iterator_tag>>> + void assign(in_iter in_start, in_iter in_end) { + this->assertSafeToReferenceAfterClear(in_start, in_end); + clear(); + append(in_start, in_end); + } + + void assign(std::initializer_list IL) { + clear(); + append(IL); + } + + void assign(const SmallVectorImpl& RHS) { + assign(RHS.begin(), RHS.end()); + } + + iterator erase(const_iterator CI) { + // Just cast away constness because this is a non-const member function. 
+ iterator I = const_cast(CI); + + assert( + this->isReferenceToStorage(CI) && + "Iterator to erase is out of bounds."); + + iterator N = I; + // Shift all elts down one. + std::move(I + 1, this->end(), I); + // Drop the last elt. + this->pop_back(); + return (N); + } + + iterator erase(const_iterator CS, const_iterator CE) { + // Just cast away constness because this is a non-const member function. + iterator S = const_cast(CS); + iterator E = const_cast(CE); + + assert(this->isRangeInStorage(S, E) && "Range to erase is out of bounds."); + + iterator N = S; + // Shift all elts down. + iterator I = std::move(E, this->end(), S); + // Drop the last elts. + this->destroy_range(I, this->end()); + this->set_size(I - this->begin()); + return (N); + } + + private: + template + iterator insert_one_impl(iterator I, ArgType&& Elt) { + // Callers ensure that ArgType is derived from T. + static_assert( + std::is_same>, T>:: + value, + "ArgType must be derived from T!"); + + if (I == this->end()) { // Important special case for empty vector. + this->push_back(::std::forward(Elt)); + return this->end() - 1; + } + + assert( + this->isReferenceToStorage(I) && + "Insertion iterator is out of bounds."); + + // Grow if necessary. + size_t Index = I - this->begin(); + std::remove_reference_t* EltPtr = + this->reserveForParamAndGetAddress(Elt); + I = this->begin() + Index; + + ::new ((void*)this->end()) T(::std::move(this->back())); + // Push everything else over. + std::move_backward(I, this->end() - 1, this->end()); + this->set_size(this->size() + 1); + + // If we just moved the element we're inserting, be sure to update + // the reference (never happens if TakesParamByValue). + static_assert( + !TakesParamByValue || std::is_same::value, + "ArgType must be 'T' when taking by value!"); + if (!TakesParamByValue && this->isReferenceToRange(EltPtr, I, this->end())) + ++EltPtr; + + *I = ::std::forward(*EltPtr); + return I; + } + + public: + iterator insert(iterator I, T&& Elt) { + return insert_one_impl(I, this->forward_value_param(std::move(Elt))); + } + + iterator insert(iterator I, const T& Elt) { + return insert_one_impl(I, this->forward_value_param(Elt)); + } + + iterator insert(iterator I, size_type NumToInsert, ValueParamT Elt) { + // Convert iterator to elt# to avoid invalidating iterator when we reserve() + size_t InsertElt = I - this->begin(); + + if (I == this->end()) { // Important special case for empty vector. + append(NumToInsert, Elt); + return this->begin() + InsertElt; + } + + assert( + this->isReferenceToStorage(I) && + "Insertion iterator is out of bounds."); + + // Ensure there is enough space, and get the (maybe updated) address of + // Elt. + const T* EltPtr = this->reserveForParamAndGetAddress(Elt, NumToInsert); + + // Uninvalidate the iterator. + I = this->begin() + InsertElt; + + // If there are more elements between the insertion point and the end of the + // range than there are being inserted, we can use a simple approach to + // insertion. Since we already reserved space, we know that this won't + // reallocate the vector. + if (size_t(this->end() - I) >= NumToInsert) { + T* OldEnd = this->end(); + append( + std::move_iterator(this->end() - NumToInsert), + std::move_iterator(this->end())); + + // Copy the existing elements that get replaced. + std::move_backward(I, OldEnd - NumToInsert, OldEnd); + + // If we just moved the element we're inserting, be sure to update + // the reference (never happens if TakesParamByValue). 
+ if (!TakesParamByValue && I <= EltPtr && EltPtr < this->end()) + EltPtr += NumToInsert; + + std::fill_n(I, NumToInsert, *EltPtr); + return I; + } + + // Otherwise, we're inserting more elements than exist already, and we're + // not inserting at the end. + + // Move over the elements that we're about to overwrite. + T* OldEnd = this->end(); + this->set_size(this->size() + NumToInsert); + size_t NumOverwritten = OldEnd - I; + this->uninitialized_move(I, OldEnd, this->end() - NumOverwritten); + + // If we just moved the element we're inserting, be sure to update + // the reference (never happens if TakesParamByValue). + if (!TakesParamByValue && I <= EltPtr && EltPtr < this->end()) + EltPtr += NumToInsert; + + // Replace the overwritten part. + std::fill_n(I, NumOverwritten, *EltPtr); + + // Insert the non-overwritten middle part. + std::uninitialized_fill_n(OldEnd, NumToInsert - NumOverwritten, *EltPtr); + return I; + } + + template < + typename ItTy, + typename = std::enable_if_t::iterator_category, + std::input_iterator_tag>>> + iterator insert(iterator I, ItTy From, ItTy To) { + // Convert iterator to elt# to avoid invalidating iterator when we reserve() + size_t InsertElt = I - this->begin(); + + if (I == this->end()) { // Important special case for empty vector. + append(From, To); + return this->begin() + InsertElt; + } + + assert( + this->isReferenceToStorage(I) && + "Insertion iterator is out of bounds."); + + // Check that the reserve that follows doesn't invalidate the iterators. + this->assertSafeToAddRange(From, To); + + size_t NumToInsert = std::distance(From, To); + + // Ensure there is enough space. + reserve(this->size() + NumToInsert); + + // Uninvalidate the iterator. + I = this->begin() + InsertElt; + + // If there are more elements between the insertion point and the end of the + // range than there are being inserted, we can use a simple approach to + // insertion. Since we already reserved space, we know that this won't + // reallocate the vector. + if (size_t(this->end() - I) >= NumToInsert) { + T* OldEnd = this->end(); + append( + std::move_iterator(this->end() - NumToInsert), + std::move_iterator(this->end())); + + // Copy the existing elements that get replaced. + std::move_backward(I, OldEnd - NumToInsert, OldEnd); + + std::copy(From, To, I); + return I; + } + + // Otherwise, we're inserting more elements than exist already, and we're + // not inserting at the end. + + // Move over the elements that we're about to overwrite. + T* OldEnd = this->end(); + this->set_size(this->size() + NumToInsert); + size_t NumOverwritten = OldEnd - I; + this->uninitialized_move(I, OldEnd, this->end() - NumOverwritten); + + // Replace the overwritten part. + for (T* J = I; NumOverwritten > 0; --NumOverwritten) { + *J = *From; + ++J; + ++From; + } + + // Insert the non-overwritten middle part. + this->uninitialized_copy(From, To, OldEnd); + return I; + } + + void insert(iterator I, std::initializer_list IL) { + insert(I, IL.begin(), IL.end()); + } + + template + reference emplace_back(ArgTypes&&... 
Args) { + if (C10_UNLIKELY(this->size() >= this->capacity())) + return this->growAndEmplaceBack(std::forward(Args)...); + + ::new ((void*)this->end()) T(std::forward(Args)...); + this->set_size(this->size() + 1); + return this->back(); + } + + SmallVectorImpl& operator=(const SmallVectorImpl& RHS); + + SmallVectorImpl& operator=(SmallVectorImpl&& RHS) noexcept( + std::is_nothrow_move_constructible_v && + std::is_nothrow_destructible_v); + + bool operator==(const SmallVectorImpl& RHS) const { + if (this->size() != RHS.size()) + return false; + return std::equal(this->begin(), this->end(), RHS.begin()); + } + bool operator!=(const SmallVectorImpl& RHS) const { + return !(*this == RHS); + } + + bool operator<(const SmallVectorImpl& RHS) const { + return std::lexicographical_compare( + this->begin(), this->end(), RHS.begin(), RHS.end()); + } +}; + +template +void SmallVectorImpl::swap(SmallVectorImpl& RHS) noexcept { + if (this == &RHS) + return; + + // We can only avoid copying elements if neither vector is small. + if (!this->isSmall() && !RHS.isSmall()) { + std::swap(this->BeginX, RHS.BeginX); + std::swap(this->Size, RHS.Size); + std::swap(this->Capacity, RHS.Capacity); + return; + } + this->reserve(RHS.size()); + RHS.reserve(this->size()); + + // Swap the shared elements. + size_t NumShared = this->size(); + if (NumShared > RHS.size()) + NumShared = RHS.size(); + for (size_type i = 0; i != NumShared; ++i) + std::swap((*this)[i], RHS[i]); + + // Copy over the extra elts. + if (this->size() > RHS.size()) { + size_t EltDiff = this->size() - RHS.size(); + this->uninitialized_copy(this->begin() + NumShared, this->end(), RHS.end()); + RHS.set_size(RHS.size() + EltDiff); + this->destroy_range(this->begin() + NumShared, this->end()); + this->set_size(NumShared); + } else if (RHS.size() > this->size()) { + size_t EltDiff = RHS.size() - this->size(); + this->uninitialized_copy(RHS.begin() + NumShared, RHS.end(), this->end()); + this->set_size(this->size() + EltDiff); + this->destroy_range(RHS.begin() + NumShared, RHS.end()); + RHS.set_size(NumShared); + } +} + +template +SmallVectorImpl& SmallVectorImpl::operator=( + const SmallVectorImpl& RHS) { + // Avoid self-assignment. + if (this == &RHS) + return *this; + + // If we already have sufficient space, assign the common elements, then + // destroy any excess. + size_t RHSSize = RHS.size(); + size_t CurSize = this->size(); + if (CurSize >= RHSSize) { + // Assign common elements. + iterator NewEnd; + if (RHSSize) + NewEnd = std::copy(RHS.begin(), RHS.begin() + RHSSize, this->begin()); + else + NewEnd = this->begin(); + + // Destroy excess elements. + this->destroy_range(NewEnd, this->end()); + + // Trim. + this->set_size(RHSSize); + return *this; + } + + // If we have to grow to have enough elements, destroy the current elements. + // This allows us to avoid copying them during the grow. + // FIXME: don't do this if they're efficiently moveable. + if (this->capacity() < RHSSize) { + // Destroy current elements. + this->clear(); + CurSize = 0; + this->grow(RHSSize); + } else if (CurSize) { + // Otherwise, use assignment for the already-constructed elements. + std::copy(RHS.begin(), RHS.begin() + CurSize, this->begin()); + } + + // Copy construct the new elements in place. + this->uninitialized_copy( + RHS.begin() + CurSize, RHS.end(), this->begin() + CurSize); + + // Set end. 
+ this->set_size(RHSSize); + return *this; +} + +template +SmallVectorImpl& SmallVectorImpl:: +operator=(SmallVectorImpl&& RHS) noexcept( + std::is_nothrow_move_constructible_v && + std::is_nothrow_destructible_v) { + // Avoid self-assignment. + if (this == &RHS) + return *this; + + // If the RHS isn't small, clear this vector and then steal its buffer. + if (!RHS.isSmall()) { + this->destroy_range(this->begin(), this->end()); + if (!this->isSmall()) + free(this->begin()); + this->BeginX = RHS.BeginX; + this->Size = RHS.Size; + this->Capacity = RHS.Capacity; + RHS.resetToSmall(); + return *this; + } + + // If we already have sufficient space, assign the common elements, then + // destroy any excess. + size_t RHSSize = RHS.size(); + size_t CurSize = this->size(); + if (CurSize >= RHSSize) { + // Assign common elements. + iterator NewEnd = this->begin(); + if (RHSSize) + NewEnd = std::move(RHS.begin(), RHS.end(), NewEnd); + + // Destroy excess elements and trim the bounds. + this->destroy_range(NewEnd, this->end()); + this->set_size(RHSSize); + + // Clear the RHS. + RHS.clear(); + + return *this; + } + + // If we have to grow to have enough elements, destroy the current elements. + // This allows us to avoid copying them during the grow. + // FIXME: this may not actually make any sense if we can efficiently move + // elements. + if (this->capacity() < RHSSize) { + // Destroy current elements. + this->clear(); + CurSize = 0; + this->grow(RHSSize); + } else if (CurSize) { + // Otherwise, use assignment for the already-constructed elements. + std::move(RHS.begin(), RHS.begin() + CurSize, this->begin()); + } + + // Move-construct the new elements in place. + this->uninitialized_move( + RHS.begin() + CurSize, RHS.end(), this->begin() + CurSize); + + // Set end. + this->set_size(RHSSize); + + RHS.clear(); + return *this; +} + +/// Storage for the SmallVector elements. This is specialized for the N=0 case +/// to avoid allocating unnecessary storage. +template +struct SmallVectorStorage { + alignas(T) char InlineElts[N * sizeof(T)]; +}; + +/// We need the storage to be properly aligned even for small-size of 0 so that +/// the pointer math in \a SmallVectorTemplateCommon::getFirstEl() is +/// well-defined. +template +struct alignas(T) SmallVectorStorage {}; + +/// Forward declaration of SmallVector so that +/// calculateSmallVectorDefaultInlinedElements can reference +/// `sizeof(SmallVector)`. +template +class /* LLVM_GSL_OWNER */ SmallVector; + +/// Helper class for calculating the default number of inline elements for +/// `SmallVector`. +/// +/// This should be migrated to a constexpr function when our minimum +/// compiler support is enough for multi-statement constexpr functions. +template +struct CalculateSmallVectorDefaultInlinedElements { + // Parameter controlling the default number of inlined elements + // for `SmallVector`. + // + // The default number of inlined elements ensures that + // 1. There is at least one inlined element. + // 2. `sizeof(SmallVector) <= kPreferredSmallVectorSizeof` unless + // it contradicts 1. + static constexpr size_t kPreferredSmallVectorSizeof = 64; + + // static_assert that sizeof(T) is not "too big". + // + // Because our policy guarantees at least one inlined element, it is possible + // for an arbitrarily large inlined element to allocate an arbitrarily large + // amount of inline storage. 
We generally consider it an antipattern for a + // SmallVector to allocate an excessive amount of inline storage, so we want + // to call attention to these cases and make sure that users are making an + // intentional decision if they request a lot of inline storage. + // + // We want this assertion to trigger in pathological cases, but otherwise + // not be too easy to hit. To accomplish that, the cutoff is actually somewhat + // larger than kPreferredSmallVectorSizeof (otherwise, + // `SmallVector>` would be one easy way to trip it, and that + // pattern seems useful in practice). + // + // One wrinkle is that this assertion is in theory non-portable, since + // sizeof(T) is in general platform-dependent. However, we don't expect this + // to be much of an issue, because most LLVM development happens on 64-bit + // hosts, and therefore sizeof(T) is expected to *decrease* when compiled for + // 32-bit hosts, dodging the issue. The reverse situation, where development + // happens on a 32-bit host and then fails due to sizeof(T) *increasing* on a + // 64-bit host, is expected to be very rare. + static_assert( + sizeof(T) <= 256, + "You are trying to use a default number of inlined elements for " + "`SmallVector` but `sizeof(T)` is really big! Please use an " + "explicit number of inlined elements with `SmallVector` to make " + "sure you really want that much inline storage."); + + // Discount the size of the header itself when calculating the maximum inline + // bytes. + static constexpr size_t PreferredInlineBytes = + kPreferredSmallVectorSizeof - sizeof(SmallVector); + static constexpr size_t NumElementsThatFit = PreferredInlineBytes / sizeof(T); + static constexpr size_t value = + NumElementsThatFit == 0 ? 1 : NumElementsThatFit; +}; + +/// This is a 'vector' (really, a variable-sized array), optimized +/// for the case when the array is small. It contains some number of elements +/// in-place, which allows it to avoid heap allocation when the actual number of +/// elements is below that threshold. This allows normal "small" cases to be +/// fast without losing generality for large inputs. +/// +/// \note +/// In the absence of a well-motivated choice for the number of inlined +/// elements \p N, it is recommended to use \c SmallVector (that is, +/// omitting the \p N). This will choose a default number of inlined elements +/// reasonable for allocation on the stack (for example, trying to keep \c +/// sizeof(SmallVector) around 64 bytes). +/// +/// \warning This does not attempt to be exception safe. +/// +/// \see https://llvm.org/docs/ProgrammersManual.html#llvm-adt-smallvector-h +template < + typename T, + unsigned N = CalculateSmallVectorDefaultInlinedElements::value> +class /* LLVM_GSL_OWNER */ SmallVector : public SmallVectorImpl, + SmallVectorStorage { + public: + SmallVector() : SmallVectorImpl(N) {} + + ~SmallVector() { + // Destroy the constructed elements in the vector. + this->destroy_range(this->begin(), this->end()); + } + + explicit SmallVector(size_t Size, const T& Value = T()) + : SmallVectorImpl(N) { + this->assign(Size, Value); + } + + template < + typename ItTy, + typename = std::enable_if_t::iterator_category, + std::input_iterator_tag>>> + SmallVector(ItTy S, ItTy E) : SmallVectorImpl(N) { + this->append(S, E); + } + + // note: The enable_if restricts Container to types that have a .begin() and + // .end() that return valid input iterators. 
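+  // For example (illustrative, not part of the upstream header):
+  //
+  //   std::vector<int64_t> sizes = {2, 3, 4};
+  //   c10::SmallVector<int64_t, 5> shape(sizes);      // this Container&& constructor
+  //   c10::SmallVector<int64_t, 5> literal = {2, 3};  // initializer_list constructor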
+ template < + typename Container, + std::enable_if_t< + std::is_convertible_v< + typename std::iterator_traits< + decltype(std::declval() + .begin())>::iterator_category, + std::input_iterator_tag> && + std::is_convertible_v< + typename std::iterator_traits< + decltype(std::declval() + .end())>::iterator_category, + std::input_iterator_tag>, + int> = 0> + explicit SmallVector(Container&& c) : SmallVectorImpl(N) { + this->append(c.begin(), c.end()); + } + + SmallVector(std::initializer_list IL) : SmallVectorImpl(N) { + this->assign(IL); + } + + SmallVector(const SmallVector& RHS) : SmallVectorImpl(N) { + if (!RHS.empty()) + SmallVectorImpl::operator=(RHS); + } + + SmallVector& operator=(const SmallVector& RHS) { + SmallVectorImpl::operator=(RHS); + return *this; + } + + SmallVector(SmallVector&& RHS) noexcept( + std::is_nothrow_move_assignable_v>) + : SmallVectorImpl(N) { + if (!RHS.empty()) + SmallVectorImpl::operator=(::std::move(RHS)); + } + + // note: The enable_if restricts Container to types that have a .begin() and + // .end() that return valid input iterators. + template < + typename Container, + std::enable_if_t< + std::is_convertible_v< + typename std::iterator_traits< + decltype(std::declval() + .begin())>::iterator_category, + std::input_iterator_tag> && + std::is_convertible_v< + typename std::iterator_traits< + decltype(std::declval() + .end())>::iterator_category, + std::input_iterator_tag>, + int> = 0> + SmallVector& operator=(const Container& RHS) { + this->assign(RHS.begin(), RHS.end()); + return *this; + } + + SmallVector(SmallVectorImpl&& RHS) noexcept( + std::is_nothrow_move_assignable_v>) + : SmallVectorImpl(N) { + if (!RHS.empty()) + SmallVectorImpl::operator=(::std::move(RHS)); + } + + SmallVector& operator=(SmallVector&& RHS) noexcept( + std::is_nothrow_move_assignable_v>) { + SmallVectorImpl::operator=(::std::move(RHS)); + return *this; + } + + SmallVector& operator=(SmallVectorImpl&& RHS) noexcept( + std::is_nothrow_move_constructible_v>) { + SmallVectorImpl::operator=(::std::move(RHS)); + return *this; + } + + // note: The enable_if restricts Container to types that have a .begin() and + // .end() that return valid input iterators. + template < + typename Container, + std::enable_if_t< + std::is_convertible_v< + typename std::iterator_traits< + decltype(std::declval() + .begin())>::iterator_category, + std::input_iterator_tag> && + std::is_convertible_v< + typename std::iterator_traits< + decltype(std::declval() + .end())>::iterator_category, + std::input_iterator_tag>, + int> = 0> + SmallVector& operator=(Container&& C) { + this->assign(C.begin(), C.end()); + return *this; + } + + SmallVector& operator=(std::initializer_list IL) { + this->assign(IL); + return *this; + } +}; + +template +inline size_t capacity_in_bytes(const SmallVector& X) { + return X.capacity_in_bytes(); +} + +template +std::ostream& operator<<(std::ostream& out, const SmallVector& list) { + int i = 0; + out << "["; + for (auto e : list) { + if (i++ > 0) + out << ", "; + out << e; + } + out << "]"; + return out; +} + +template +using ValueTypeFromRangeType = std::remove_const_t< + std::remove_reference_t()))>>; + +/// Given a range of type R, iterate the entire range and return a +/// SmallVector with elements of the vector. This is useful, for example, +/// when you want to iterate a range and then sort the results. 
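+/// An illustrative usage sketch (not part of the upstream header):
+///
+///   SmallVector<int, 4> v;          // inline storage for 4 elements, no heap yet
+///   for (int i = 0; i < 4; ++i)
+///     v.push_back(i);               // still inline
+///   v.push_back(4);                 // exceeds N = 4, so grow() reallocates on the heap
+///   int last = v.pop_back_val();    // returns the last element, then pops it
+///   auto copied = to_vector<4>(v);  // the helpers below copy any range into a
+///                                   // SmallVector with the requested inline size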
+template +SmallVector, Size> to_vector(R&& Range) { + return {std::begin(Range), std::end(Range)}; +} +template +SmallVector< + ValueTypeFromRangeType, + CalculateSmallVectorDefaultInlinedElements< + ValueTypeFromRangeType>::value> +to_vector(R&& Range) { + return {std::begin(Range), std::end(Range)}; +} + +} // end namespace c10 + +namespace std { + +/// Implement std::swap in terms of SmallVector swap. +template +inline void swap( + c10::SmallVectorImpl& LHS, + c10::SmallVectorImpl& RHS) noexcept { + LHS.swap(RHS); +} + +/// Implement std::swap in terms of SmallVector swap. +template +inline void swap( + c10::SmallVector& LHS, + c10::SmallVector& RHS) noexcept { + LHS.swap(RHS); +} + +} // end namespace std + +C10_CLANG_DIAGNOSTIC_POP() diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/StringUtil.h b/MLPY/Lib/site-packages/torch/include/c10/util/StringUtil.h new file mode 100644 index 0000000000000000000000000000000000000000..35e6a30540e1d99521bb9deb8dd793f7f3a7e104 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/StringUtil.h @@ -0,0 +1,211 @@ +#ifndef C10_UTIL_STRINGUTIL_H_ +#define C10_UTIL_STRINGUTIL_H_ + +#include +#include +#include + +#include +#include +#include +#include + +C10_CLANG_DIAGNOSTIC_PUSH() +#if C10_CLANG_HAS_WARNING("-Wshorten-64-to-32") +C10_CLANG_DIAGNOSTIC_IGNORE("-Wshorten-64-to-32") +#endif + +namespace c10 { + +namespace detail { + +// Obtains the base name from a full path. +C10_API std::string StripBasename(const std::string& full_path); + +C10_API std::string ExcludeFileExtension(const std::string& full_path); + +struct CompileTimeEmptyString { + operator const std::string&() const { + static const std::string empty_string_literal; + return empty_string_literal; + } + operator const char*() const { + return ""; + } +}; + +template +struct CanonicalizeStrTypes { + using type = const T&; +}; + +template +// NOLINTNEXTLINE(*c-arrays*) +struct CanonicalizeStrTypes { + using type = const char*; +}; + +inline std::ostream& _str(std::ostream& ss) { + return ss; +} + +template +inline std::ostream& _str(std::ostream& ss, const T& t) { + // NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage) + ss << t; + return ss; +} + +// Overloads of _str for wide types; forces narrowing. +C10_API std::ostream& _str(std::ostream& ss, const wchar_t* wCStr); +C10_API std::ostream& _str(std::ostream& ss, const wchar_t& wChar); +C10_API std::ostream& _str(std::ostream& ss, const std::wstring& wString); + +template <> +inline std::ostream& _str( + std::ostream& ss, + const CompileTimeEmptyString&) { + return ss; +} + +template +inline std::ostream& _str(std::ostream& ss, const T& t, const Args&... args) { + return _str(_str(ss, t), args...); +} + +template +struct _str_wrapper final { + static std::string call(const Args&... args) { + std::ostringstream ss; + _str(ss, args...); + return ss.str(); + } +}; + +// Specializations for already-a-string types. +template <> +struct _str_wrapper final { + // return by reference to avoid the binary size of a string copy + static const std::string& call(const std::string& str) { + return str; + } +}; + +template <> +struct _str_wrapper final { + static const char* call(const char* str) { + return str; + } +}; + +// For c10::str() with an empty argument list (which is common in our assert +// macros), we don't want to pay the binary size for constructing and +// destructing a stringstream or even constructing a string. 
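+// For reference, an illustrative sketch of how the public helpers below are
+// used (not part of the upstream header):
+//
+//   std::string msg = c10::str("expected ", 4, " dims, got ", 3);
+//   std::string csv = c10::Join(", ", std::vector<int>{1, 2, 3});  // "1, 2, 3"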
+template <> +struct _str_wrapper<> final { + static CompileTimeEmptyString call() { + return CompileTimeEmptyString(); + } +}; + +} // namespace detail + +// Convert a list of string-like arguments into a single string. +template +inline decltype(auto) str(const Args&... args) { + return detail::_str_wrapper< + typename detail::CanonicalizeStrTypes::type...>::call(args...); +} + +template +inline std::string Join(const std::string& delimiter, const Container& v) { + std::stringstream s; + int cnt = static_cast(v.size()) - 1; + for (auto i = v.begin(); i != v.end(); ++i, --cnt) { + s << (*i) << (cnt ? delimiter : ""); + } + return s.str(); +} + +// Replace all occurrences of "from" substring to "to" string. +// Returns number of replacements +size_t C10_API +ReplaceAll(std::string& s, c10::string_view from, c10::string_view to); + +/// Represents a location in source code (for debugging). +struct C10_API SourceLocation { + const char* function; + const char* file; + uint32_t line; +}; + +std::ostream& operator<<(std::ostream& out, const SourceLocation& loc); + +// unix isprint but insensitive to locale +inline static bool isPrint(char s) { + return s > 0x1f && s < 0x7f; +} + +inline void printQuotedString(std::ostream& stmt, const string_view str) { + stmt << "\""; + for (auto s : str) { + switch (s) { + case '\\': + stmt << "\\\\"; + break; + case '\'': + stmt << "\\'"; + break; + case '\"': + stmt << "\\\""; + break; + case '\a': + stmt << "\\a"; + break; + case '\b': + stmt << "\\b"; + break; + case '\f': + stmt << "\\f"; + break; + case '\n': + stmt << "\\n"; + break; + case '\r': + stmt << "\\r"; + break; + case '\t': + stmt << "\\t"; + break; + case '\v': + stmt << "\\v"; + break; + default: + if (isPrint(s)) { + stmt << s; + } else { + // C++ io has stateful formatting settings. Messing with + // them is probably worse than doing this manually. + // NOLINTNEXTLINE(*c-arrays*) + char buf[4] = "000"; + // NOLINTNEXTLINE(*narrowing-conversions) + buf[2] += s % 8; + s /= 8; + // NOLINTNEXTLINE(*narrowing-conversions) + buf[1] += s % 8; + s /= 8; + // NOLINTNEXTLINE(*narrowing-conversions) + buf[0] += s; + stmt << "\\" << buf; + } + break; + } + } + stmt << "\""; +} + +} // namespace c10 + +C10_CLANG_DIAGNOSTIC_POP() + +#endif // C10_UTIL_STRINGUTIL_H_ diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/Synchronized.h b/MLPY/Lib/site-packages/torch/include/c10/util/Synchronized.h new file mode 100644 index 0000000000000000000000000000000000000000..da39195c1b2441cb10be89d91eef2c189441a5d7 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/Synchronized.h @@ -0,0 +1,61 @@ +#pragma once + +#include + +namespace c10 { + +/** + * A very simple Synchronization class for error-free use of data + * in a multi-threaded context. See folly/docs/Synchronized.md for + * the inspiration of this class. + * + * Full URL: + * https://github.com/facebook/folly/blob/main/folly/docs/Synchronized.md + * + * This class implements a small subset of the generic functionality + * implemented by folly:Synchronized. Specifically, only withLock + * is implemented here since it's the smallest possible API that is + * able to cover a large surface area of functionality offered by + * folly::Synchronized. 
+ */ +template +class Synchronized final { + mutable std::mutex mutex_; + T data_; + + public: + Synchronized() = default; + Synchronized(T const& data) : data_(data) {} + Synchronized(T&& data) : data_(std::move(data)) {} + + // Don't permit copy construction, move, assignment, or + // move assignment, since the underlying std::mutex + // isn't necessarily copyable/moveable. + Synchronized(Synchronized const&) = delete; + Synchronized(Synchronized&&) = delete; + Synchronized operator=(Synchronized const&) = delete; + Synchronized operator=(Synchronized&&) = delete; + + /** + * To use, call withLock with a callback that accepts T either + * by copy or by reference. Use the protected variable in the + * provided callback safely. + */ + template + auto withLock(CB&& cb) { + std::lock_guard guard(this->mutex_); + return std::forward(cb)(this->data_); + } + + /** + * To use, call withLock with a callback that accepts T either + * by copy or by const reference. Use the protected variable in + * the provided callback safely. + */ + template + auto withLock(CB&& cb) const { + std::lock_guard guard(this->mutex_); + return std::forward(cb)(this->data_); + } +}; +} // end namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/ThreadLocal.h b/MLPY/Lib/site-packages/torch/include/c10/util/ThreadLocal.h new file mode 100644 index 0000000000000000000000000000000000000000..f44f297500d2a4f31fc3598a5c31b012459f1474 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/ThreadLocal.h @@ -0,0 +1,153 @@ +#pragma once + +#include + +/** + * Android versions with libgnustl incorrectly handle thread_local C++ + * qualifier with composite types. NDK up to r17 version is affected. + * + * (A fix landed on Jun 4 2018: + * https://android-review.googlesource.com/c/toolchain/gcc/+/683601) + * + * In such cases, use c10::ThreadLocal wrapper + * which is `pthread_*` based with smart pointer semantics. + * + * In addition, convenient macro C10_DEFINE_TLS_static is available. + * To define static TLS variable of type std::string, do the following + * ``` + * C10_DEFINE_TLS_static(std::string, str_tls_); + * /////// + * { + * *str_tls_ = "abc"; + * assert(str_tls_->length(), 3); + * } + * ``` + * + * (see c10/test/util/ThreadLocal_test.cpp for more examples) + */ +#if !defined(C10_PREFER_CUSTOM_THREAD_LOCAL_STORAGE) + +#if defined(C10_ANDROID) && defined(__GLIBCXX__) && __GLIBCXX__ < 20180604 +#define C10_PREFER_CUSTOM_THREAD_LOCAL_STORAGE +#endif // defined(C10_ANDROID) && defined(__GLIBCXX__) && __GLIBCXX__ < 20180604 + +#endif // !defined(C10_PREFER_CUSTOM_THREAD_LOCAL_STORAGE) + +#if defined(C10_PREFER_CUSTOM_THREAD_LOCAL_STORAGE) +#include +#include +#include +#include +namespace c10 { + +/** + * @brief Temporary thread_local C++ qualifier replacement for Android + * based on `pthread_*`. + * To be used with composite types that provide default ctor. 
+ */ +template +class ThreadLocal { + public: + ThreadLocal() { + pthread_key_create( + &key_, [](void* buf) { delete static_cast(buf); }); + } + + ~ThreadLocal() { + if (void* current = pthread_getspecific(key_)) { + delete static_cast(current); + } + + pthread_key_delete(key_); + } + + ThreadLocal(const ThreadLocal&) = delete; + ThreadLocal& operator=(const ThreadLocal&) = delete; + + Type& get() { + if (void* current = pthread_getspecific(key_)) { + return *static_cast(current); + } + + std::unique_ptr ptr = std::make_unique(); + if (0 == pthread_setspecific(key_, ptr.get())) { + return *ptr.release(); + } + + int err = errno; + TORCH_INTERNAL_ASSERT(false, "pthread_setspecific() failed, errno = ", err); + } + + Type& operator*() { + return get(); + } + + Type* operator->() { + return &get(); + } + + private: + pthread_key_t key_; +}; + +} // namespace c10 + +#define C10_DEFINE_TLS_static(Type, Name) static ::c10::ThreadLocal Name + +#define C10_DECLARE_TLS_class_static(Class, Type, Name) \ + static ::c10::ThreadLocal Name + +#define C10_DEFINE_TLS_class_static(Class, Type, Name) \ + ::c10::ThreadLocal Class::Name + +#else // defined(C10_PREFER_CUSTOM_THREAD_LOCAL_STORAGE) + +namespace c10 { + +/** + * @brief Default thread_local implementation for non-Android cases. + * To be used with composite types that provide default ctor. + */ +template +class ThreadLocal { + public: + using Accessor = Type* (*)(); + explicit ThreadLocal(Accessor accessor) : accessor_(accessor) {} + + ThreadLocal(const ThreadLocal&) = delete; + ThreadLocal& operator=(const ThreadLocal&) = delete; + + Type& get() { + return *accessor_(); + } + + Type& operator*() { + return get(); + } + + Type* operator->() { + return &get(); + } + + private: + Accessor accessor_; +}; + +} // namespace c10 + +#define C10_DEFINE_TLS_static(Type, Name) \ + static ::c10::ThreadLocal Name([]() { \ + static thread_local Type var; \ + return &var; \ + }) + +#define C10_DECLARE_TLS_class_static(Class, Type, Name) \ + static ::c10::ThreadLocal Name + +#define C10_DEFINE_TLS_class_static(Class, Type, Name) \ + ::c10::ThreadLocal Class::Name([]() { \ + static thread_local Type var; \ + return &var; \ + }) + +#endif // defined(C10_PREFER_CUSTOM_THREAD_LOCAL_STORAGE) diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/ThreadLocalDebugInfo.h b/MLPY/Lib/site-packages/torch/include/c10/util/ThreadLocalDebugInfo.h new file mode 100644 index 0000000000000000000000000000000000000000..e9540dcd9783c9a265232837220451010cb6e7fc --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/ThreadLocalDebugInfo.h @@ -0,0 +1,83 @@ +#pragma once + +#include + +#include +#include + +namespace c10 { + +enum class C10_API_ENUM DebugInfoKind : uint8_t { + PRODUCER_INFO = 0, + MOBILE_RUNTIME_INFO, + PROFILER_STATE, + INFERENCE_CONTEXT, // for inference usage + PARAM_COMMS_INFO, + + TEST_INFO, // used only in tests + TEST_INFO_2, // used only in tests +}; + +class C10_API DebugInfoBase { + public: + DebugInfoBase() = default; + virtual ~DebugInfoBase() = default; +}; + +// Thread local debug information is propagated across the forward +// (including async fork tasks) and backward passes and is supposed +// to be utilized by the user's code to pass extra information from +// the higher layers (e.g. model id) down to the lower levels +// (e.g. 
to the operator observers used for debugging, logging, +// profiling, etc) +class C10_API ThreadLocalDebugInfo { + public: + static DebugInfoBase* get(DebugInfoKind kind); + + // Get current ThreadLocalDebugInfo + static std::shared_ptr current(); + + // Internal, use DebugInfoGuard/ThreadLocalStateGuard + static void _forceCurrentDebugInfo( + std::shared_ptr info); + + // Push debug info struct of a given kind + static void _push(DebugInfoKind kind, std::shared_ptr info); + // Pop debug info, throws in case the last pushed + // debug info is not of a given kind + static std::shared_ptr _pop(DebugInfoKind kind); + // Peek debug info, throws in case the last pushed debug info is not of the + // given kind + static std::shared_ptr _peek(DebugInfoKind kind); + + private: + std::shared_ptr info_; + DebugInfoKind kind_; + std::shared_ptr parent_info_; + + friend class DebugInfoGuard; +}; + +// DebugInfoGuard is used to set debug information, +// ThreadLocalDebugInfo is semantically immutable, the values are set +// through the scope-based guard object. +// Nested DebugInfoGuard adds/overrides existing values in the scope, +// restoring the original values after exiting the scope. +// Users can access the values through the ThreadLocalDebugInfo::get() call; +class C10_API DebugInfoGuard { + public: + DebugInfoGuard(DebugInfoKind kind, std::shared_ptr info); + + explicit DebugInfoGuard(std::shared_ptr info); + + ~DebugInfoGuard(); + + DebugInfoGuard(const DebugInfoGuard&) = delete; + DebugInfoGuard(DebugInfoGuard&&) = delete; + + private: + bool active_ = false; + std::shared_ptr prev_info_ = nullptr; +}; + +} // namespace c10 diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/Type.h b/MLPY/Lib/site-packages/torch/include/c10/util/Type.h new file mode 100644 index 0000000000000000000000000000000000000000..f2d73853bc3a6a5a573fc7710b362d75c6496d10 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/Type.h @@ -0,0 +1,30 @@ +#ifndef C10_UTIL_TYPE_H_ +#define C10_UTIL_TYPE_H_ + +#include +#include +#ifdef __GXX_RTTI +#include +#endif // __GXX_RTTI + +#include + +namespace c10 { + +/// Utility to demangle a C++ symbol name. +C10_API std::string demangle(const char* name); + +/// Returns the printable name of the type. 
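+/// For example (illustrative, not part of the upstream header):
+///
+///   c10::demangle(typeid(std::vector<float>).name())
+///       // yields something like "std::vector<float, std::allocator<float> >"
+///   c10::demangle_type<float>()   // "float"; with RTTI disabled it returns a
+///                                 // fixed placeholder string instead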
+template +inline const char* demangle_type() { +#ifdef __GXX_RTTI + static const auto& name = *(new std::string(demangle(typeid(T).name()))); + return name.c_str(); +#else // __GXX_RTTI + return "(RTTI disabled, cannot show name)"; +#endif // __GXX_RTTI +} + +} // namespace c10 + +#endif // C10_UTIL_TYPE_H_ diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/TypeCast.h b/MLPY/Lib/site-packages/torch/include/c10/util/TypeCast.h new file mode 100644 index 0000000000000000000000000000000000000000..beda372caa02f40da8869e5b9c9839bca4f9d0c5 --- /dev/null +++ b/MLPY/Lib/site-packages/torch/include/c10/util/TypeCast.h @@ -0,0 +1,169 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +C10_CLANG_DIAGNOSTIC_PUSH() +#if C10_CLANG_HAS_WARNING("-Wimplicit-float-conversion") +C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-float-conversion") +#endif +#if C10_CLANG_HAS_WARNING("-Wimplicit-int-float-conversion") +C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion") +#endif + +namespace c10 { + +template +struct needs_real { + constexpr static bool value = + (is_complex::value && !is_complex::value); +}; + +template +struct maybe_real { + C10_HOST_DEVICE static inline src_t apply(src_t src) { + return src; + } +}; + +template +struct maybe_real { + C10_HOST_DEVICE static inline decltype(auto) apply(src_t src) { + return src.real(); + } +}; + +// Note: deliberately ignores undefined behavior, consistent with NumPy. +// PyTorch's type conversions can cause a variety of undefined behavior, +// including float to integral overflow and signed to unsigned integer overflow. +// Some of this undefined behavior is addressed below. +template +struct static_cast_with_inter_type { + C10_HOST_DEVICE __ubsan_ignore_undefined__ static inline dest_t apply( + src_t src) { + constexpr bool real = needs_real::value; + auto r = maybe_real::apply(src); + return static_cast(r); + } +}; + +// Partial template instantiation for casting to uint8. +// Note: Converting from negative float values to unsigned integer types is +// undefined behavior in C++, and current CPU and GPU compilers exhibit +// divergent behavior. Casting from negative float values to signed +// integer types and then to unsigned integer types is not undefined, +// however, so this cast improves the consistency of type conversions +// to uint8 across compilers. +// Further note: Type conversions across compilers still have other undefined +// and divergent behavior. 
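+// For example (illustrative, not part of the upstream header):
+//
+//   c10::convert<uint8_t>(-1.0f)   // 255 on every backend: the value is cast to
+//                                  // int64_t first and then to uint8_t, so the
+//                                  // result is defined rather than compiler-dependent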
+template +struct static_cast_with_inter_type { + C10_HOST_DEVICE __ubsan_ignore_undefined__ static inline uint8_t apply( + src_t src) { + constexpr bool real = needs_real::value; + return static_cast( + static_cast(maybe_real::apply(src))); + } +}; + +template <> +struct static_cast_with_inter_type, c10::BFloat16> { + C10_HOST_DEVICE __ubsan_ignore_undefined__ static inline c10::complex< + c10::Half> + apply(c10::BFloat16 src) { + return static_cast>(c10::complex{src}); + } +}; + +template <> +struct static_cast_with_inter_type, c10::Float8_e5m2> { + C10_HOST_DEVICE __ubsan_ignore_undefined__ static inline c10::complex< + c10::Half> + apply(c10::Float8_e5m2 src) { + return static_cast>(c10::complex{src}); + } +}; + +template <> +struct static_cast_with_inter_type< + c10::complex, + c10::Float8_e5m2fnuz> { + C10_HOST_DEVICE __ubsan_ignore_undefined__ static inline c10::complex< + c10::Half> + apply(c10::Float8_e5m2fnuz src) { + return static_cast>(c10::complex{src}); + } +}; + +template <> +struct static_cast_with_inter_type< + c10::complex, + c10::Float8_e4m3fn> { + C10_HOST_DEVICE __ubsan_ignore_undefined__ static inline c10::complex< + c10::Half> + apply(c10::Float8_e4m3fn src) { + return static_cast>(c10::complex{src}); + } +}; + +template <> +struct static_cast_with_inter_type< + c10::complex, + c10::Float8_e4m3fnuz> { + C10_HOST_DEVICE __ubsan_ignore_undefined__ static inline c10::complex< + c10::Half> + apply(c10::Float8_e4m3fnuz src) { + return static_cast>(c10::complex{src}); + } +}; + +template <> +struct static_cast_with_inter_type, c10::Half> { + C10_HOST_DEVICE __ubsan_ignore_undefined__ static inline c10::complex< + c10::Half> + apply(c10::Half src) { + return static_cast>(c10::complex{src}); + } +}; + +template <> +struct static_cast_with_inter_type< + c10::complex, + c10::complex> { + C10_HOST_DEVICE __ubsan_ignore_undefined__ static inline c10::complex< + c10::Half> + apply(c10::complex src) { + return static_cast>( + static_cast>(src)); + } +}; + +template +C10_HOST_DEVICE To convert(From f) { + return static_cast_with_inter_type::apply(f); +} + +// Define separately to avoid being inlined and prevent code-size bloat +C10_API void report_overflow(const char* name); + +template +To checked_convert(From f, const char* name) { + // Converting to bool can't overflow so we exclude this case from checking. + if (!std::is_same_v && overflows(f)) { + report_overflow(name); + } + return convert(f); +} + +} // namespace c10 + +C10_CLANG_DIAGNOSTIC_POP() + +// Trigger tests for D25440771. TODO: Remove this line any time you want. 
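+// Illustrative usage sketch (not part of the upstream header): a hypothetical
+// helper exercising the conversion entry points above; example code only, not
+// an API of this file.
+inline void c10_typecast_usage_sketch() {
+  auto a = c10::convert<int64_t>(3.9f);  // 3: truncates like static_cast
+  auto b = c10::convert<float>(c10::complex<float>(2.0f, 5.0f));  // 2.0f: real part only
+  (void)a;
+  (void)b;
+  // checked_convert reports overflow instead of silently wrapping; the call
+  // below would invoke report_overflow("int8"):
+  // c10::checked_convert<int8_t>(1000, "int8");
+}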
diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/TypeIndex.h b/MLPY/Lib/site-packages/torch/include/c10/util/TypeIndex.h
new file mode 100644
index 0000000000000000000000000000000000000000..a36c7030cb915c4c617df537fd77785074430ec5
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/c10/util/TypeIndex.h
@@ -0,0 +1,196 @@
+#pragma once
+
+#include <c10/macros/Macros.h>
+#include <c10/util/ConstexprCrc.h>
+#include <c10/util/IdWrapper.h>
+#include <c10/util/string_view.h>
+#include <cstdint>
+#include <ostream>
+#include <stdexcept>
+#include <string>
+
+namespace c10::util {
+
+// TODO Make it work for more compilers
+
+// Intel compiler works
+#if defined(__INTEL_COMPILER)
+#define C10_TYPENAME_SUPPORTS_CONSTEXPR 0
+#define C10_TYPENAME_CONSTEXPR
+
+// Clang works
+#elif defined(__clang__)
+
+// except for NVCC
+#if defined(__CUDACC__)
+#define C10_TYPENAME_SUPPORTS_CONSTEXPR 0
+#define C10_TYPENAME_CONSTEXPR
+#else
+#define C10_TYPENAME_SUPPORTS_CONSTEXPR 1
+#define C10_TYPENAME_CONSTEXPR constexpr
+#endif
+
+// Windows works
+#elif defined(_MSC_VER)
+
+// except for NVCC
+#if defined(__CUDACC__)
+#define C10_TYPENAME_SUPPORTS_CONSTEXPR 0
+#define C10_TYPENAME_CONSTEXPR
+#else
+#define C10_TYPENAME_SUPPORTS_CONSTEXPR 1
+#define C10_TYPENAME_CONSTEXPR constexpr
+#endif
+
+// GCC works
+#elif defined(__GNUC__)
+
+// except when gcc < 9
+#if (__GNUC__ < 9) || defined(__CUDACC__)
+#define C10_TYPENAME_SUPPORTS_CONSTEXPR 0
+#define C10_TYPENAME_CONSTEXPR
+#else
+#define C10_TYPENAME_SUPPORTS_CONSTEXPR 1
+#define C10_TYPENAME_CONSTEXPR constexpr
+#endif
+
+// some other compiler we don't know about
+#else
+#define C10_TYPENAME_SUPPORTS_CONSTEXPR 1
+#define C10_TYPENAME_CONSTEXPR constexpr
+#endif
+
+struct type_index final : IdWrapper<type_index, uint64_t> {
+  constexpr explicit type_index(uint64_t checksum) : IdWrapper(checksum) {}
+
+  // Allow usage in std::map / std::set
+  // TODO Disallow this and rather use std::unordered_map/set everywhere
+  friend constexpr bool operator<(type_index lhs, type_index rhs) noexcept {
+    return lhs.underlyingId() < rhs.underlyingId();
+  }
+
+  friend std::ostream& operator<<(std::ostream& stream, type_index typeId) {
+    return stream << typeId.underlyingId();
+  }
+};
+
+namespace detail {
+
+#if !defined(__clang__) && !defined(_MSC_VER) && defined(__GNUC__) && \
+    __GNUC__ < 5
+// Getting __PRETTY_FUNCTION__ at compile time only works with GCC >= 5
+#error "You're running a too old version of GCC. We need GCC 5 or later."
+#endif
+
+#if defined(__clang__) && __clang_major__ < 4
+// Getting __PRETTY_FUNCTION__ at compile time only works with Clang >= 4
+#error "You're running a too old version of Clang. We need Clang 4 or later."
+#endif
+
+inline constexpr string_view extract(
+    string_view prefix,
+    string_view suffix,
+    string_view str) {
+#if !defined(__CUDA_ARCH__) // CUDA doesn't like std::logic_error in device code
+  return (!str.starts_with(prefix) || !str.ends_with(suffix))
+      ? (throw std::logic_error("Invalid pattern"), string_view())
+      : str.substr(prefix.size(), str.size() - prefix.size() - suffix.size());
+#else
+  return str.substr(prefix.size(), str.size() - prefix.size() - suffix.size());
+#endif
+}
+
+template <typename T>
+inline C10_TYPENAME_CONSTEXPR c10::string_view fully_qualified_type_name_impl() {
+#if defined(_MSC_VER) && !defined(__clang__)
+#if defined(__NVCC__)
+  return extract(
+      "c10::basic_string_view<char> c10::util::detail::fully_qualified_type_name_impl<",
+      ">()",
+      __FUNCSIG__);
+#else
+  return extract(
+      "class c10::basic_string_view<char> __cdecl c10::util::detail::fully_qualified_type_name_impl<",
+      ">(void)",
+      __FUNCSIG__);
+#endif
+#elif defined(__clang__)
+  return extract(
+      "c10::string_view c10::util::detail::fully_qualified_type_name_impl() [T = ",
+      "]",
+      __PRETTY_FUNCTION__);
+#elif defined(__GNUC__)
+  return extract(
+#if C10_TYPENAME_SUPPORTS_CONSTEXPR
+      "constexpr c10::string_view c10::util::detail::fully_qualified_type_name_impl() [with T = ",
+#else
+      "c10::string_view c10::util::detail::fully_qualified_type_name_impl() [with T = ",
+#endif
+      "; c10::string_view = c10::basic_string_view<char>]",
+      __PRETTY_FUNCTION__);
+#endif
+}
+
+#if !defined(__CUDA_ARCH__)
+template <typename T>
+inline constexpr uint64_t type_index_impl() {
+// Idea: __PRETTY_FUNCTION__ (or __FUNCSIG__ on msvc) contains a qualified name
+// of this function, including its template parameter, i.e. including the
+// type we want an id for. We use this name and run crc64 on it to get a type
+// id.
+#if defined(_MSC_VER) && !defined(__clang__)
+  return crc64(__FUNCSIG__, sizeof(__FUNCSIG__)).checksum();
+#elif defined(__clang__)
+  return crc64(__PRETTY_FUNCTION__, sizeof(__PRETTY_FUNCTION__)).checksum();
+#elif defined(__GNUC__)
+  return crc64(__PRETTY_FUNCTION__, sizeof(__PRETTY_FUNCTION__)).checksum();
+#endif
+}
+#endif
+
+} // namespace detail
+
+template <typename T>
+inline constexpr type_index get_type_index() {
+#if !defined(__CUDA_ARCH__)
+  // To enforce that this is really computed at compile time, we pass the
+  // type index through std::integral_constant.
+  return type_index{std::integral_constant<
+      uint64_t,
+      detail::type_index_impl<std::decay_t<T>>()>::value};
+#else
+  // There's nothing in theory preventing us from running this on device code
+  // except for nvcc throwing a compiler error if we enable it.
+  return (abort(), type_index(0));
+#endif
+}
+
+#if !defined(TORCH_PEDANTIC)
+// Use precomputed hashsum for std::string
+// Needed to workaround ambiguity in class name resolution
+// into __PRETTY_FUNCTION__ when abovementioned class is defined in inlined
+// namespace.
+// In multi-ABI C++ library, `std::string` is an alias to
+// `std::__cxx11::basic_string<char>` which depending on compiler flags can be
+// resolved to `basic_string<char>` either in `std` namespace or in
+// `std::__cxx11` one (`__cxx11` is an inline namespace)
+template <>
+inline constexpr type_index get_type_index<std::string>() {
+  // hashsum for std::basic_string
+  return type_index{4193213214807308375ULL};
+}
+#endif
+
+template <typename T>
+inline C10_TYPENAME_CONSTEXPR string_view
+get_fully_qualified_type_name() noexcept {
+#if C10_TYPENAME_SUPPORTS_CONSTEXPR
+  constexpr
+#else
+  static
+#endif
+      string_view name = detail::fully_qualified_type_name_impl<T>();
+  return name;
+}
+} // namespace c10::util
+
+C10_DEFINE_HASH_FOR_IDWRAPPER(c10::util::type_index);
diff --git a/MLPY/Lib/site-packages/torch/include/c10/util/TypeList.h b/MLPY/Lib/site-packages/torch/include/c10/util/TypeList.h
new file mode 100644
index 0000000000000000000000000000000000000000..438477a73518f24a5a978fe164a1a8dcdcffd721
--- /dev/null
+++ b/MLPY/Lib/site-packages/torch/include/c10/util/TypeList.h
@@ -0,0 +1,515 @@
+#pragma once
+
+#include <c10/util/TypeTraits.h>
+#include <algorithm>
+#include <cstddef>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+namespace c10::guts {
+
+template <class... T>
+struct false_t : std::false_type {};
+template <template <class> class... T>
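// --- Editorial aside (not part of the vendored headers; TypeList.h continues
// below) --- A hedged sketch of how the TypeIndex.h utilities above are meant
// to be used, assuming a c10 build where <c10/util/TypeIndex.h> is on the
// include path; `MyType`, `example_type_ids`, and the variable names are
// illustrative only.
#include <c10/util/TypeIndex.h>

#include <iostream>
#include <string>

struct MyType {};

void example_type_ids() {
  // get_type_index() hashes __PRETTY_FUNCTION__ / __FUNCSIG__ with crc64 at
  // compile time, so the same type yields the same id in every translation
  // unit without relying on RTTI.
  constexpr auto id_a = c10::util::get_type_index<MyType>();
  constexpr auto id_b = c10::util::get_type_index<MyType>();
  std::cout << (id_a == id_b) << "\n"; // prints 1: same type, same index

  // std::string hits the precomputed-hashsum specialization above, which
  // sidesteps the libstdc++ inline-namespace (__cxx11) naming ambiguity.
  std::cout << c10::util::get_type_index<std::string>() << "\n";

  // The human-readable name is also available (constexpr where supported).
  auto name = c10::util::get_fully_qualified_type_name<MyType>();
  std::cout.write(name.data(), name.size());
  std::cout << "\n";
}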